From 55944e5e40b1be2afc4855d8d2baf4b73d1876b5 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 10 Apr 2024 22:49:52 +0200 Subject: Adding upstream version 255.4. Signed-off-by: Daniel Baumann --- src/ac-power/ac-power.c | 109 + src/ac-power/meson.build | 9 + src/analyze/analyze-blame.c | 69 + src/analyze/analyze-blame.h | 4 + src/analyze/analyze-calendar.c | 140 + src/analyze/analyze-calendar.h | 4 + src/analyze/analyze-capability.c | 56 + src/analyze/analyze-capability.h | 4 + src/analyze/analyze-cat-config.c | 44 + src/analyze/analyze-cat-config.h | 4 + src/analyze/analyze-compare-versions.c | 52 + src/analyze/analyze-compare-versions.h | 3 + src/analyze/analyze-condition.c | 137 + src/analyze/analyze-condition.h | 4 + src/analyze/analyze-critical-chain.c | 230 + src/analyze/analyze-critical-chain.h | 4 + src/analyze/analyze-dot.c | 182 + src/analyze/analyze-dot.h | 4 + src/analyze/analyze-dump.c | 145 + src/analyze/analyze-dump.h | 4 + src/analyze/analyze-exit-status.c | 56 + src/analyze/analyze-exit-status.h | 4 + src/analyze/analyze-fdstore.c | 110 + src/analyze/analyze-fdstore.h | 5 + src/analyze/analyze-filesystems.c | 221 + src/analyze/analyze-filesystems.h | 4 + src/analyze/analyze-image-policy.c | 156 + src/analyze/analyze-image-policy.h | 3 + src/analyze/analyze-inspect-elf.c | 116 + src/analyze/analyze-inspect-elf.h | 4 + src/analyze/analyze-log-control.c | 22 + src/analyze/analyze-log-control.h | 4 + src/analyze/analyze-malloc.c | 63 + src/analyze/analyze-malloc.h | 5 + src/analyze/analyze-pcrs.c | 144 + src/analyze/analyze-pcrs.h | 4 + src/analyze/analyze-plot.c | 493 ++ src/analyze/analyze-plot.h | 4 + src/analyze/analyze-security.c | 2956 ++++++++ src/analyze/analyze-security.h | 10 + src/analyze/analyze-service-watchdogs.c | 41 + src/analyze/analyze-service-watchdogs.h | 4 + src/analyze/analyze-srk.c | 51 + src/analyze/analyze-srk.h | 4 + src/analyze/analyze-syscall-filter.c | 202 + src/analyze/analyze-syscall-filter.h | 4 + 
src/analyze/analyze-time-data.c | 331 + src/analyze/analyze-time-data.h | 59 + src/analyze/analyze-time.c | 22 + src/analyze/analyze-time.h | 4 + src/analyze/analyze-timespan.c | 66 + src/analyze/analyze-timespan.h | 4 + src/analyze/analyze-timestamp.c | 90 + src/analyze/analyze-timestamp.h | 4 + src/analyze/analyze-unit-files.c | 50 + src/analyze/analyze-unit-files.h | 4 + src/analyze/analyze-unit-paths.c | 20 + src/analyze/analyze-unit-paths.h | 4 + src/analyze/analyze-verify-util.c | 376 + src/analyze/analyze-verify-util.h | 23 + src/analyze/analyze-verify.c | 70 + src/analyze/analyze-verify.h | 4 + src/analyze/analyze.c | 691 ++ src/analyze/analyze.h | 51 + src/analyze/meson.build | 56 + src/analyze/test-verify.c | 16 + src/ask-password/ask-password.c | 263 + src/ask-password/meson.build | 9 + src/backlight/backlight.c | 612 ++ src/backlight/meson.build | 9 + src/basic/MurmurHash2.c | 91 + src/basic/MurmurHash2.h | 31 + src/basic/af-list.c | 52 + src/basic/af-list.h | 28 + src/basic/af-to-name.awk | 11 + src/basic/alloc-util.c | 135 + src/basic/alloc-util.h | 276 + src/basic/architecture.c | 178 + src/basic/architecture.h | 246 + src/basic/argv-util.c | 233 + src/basic/argv-util.h | 25 + src/basic/arphrd-to-name.awk | 14 + src/basic/arphrd-util.c | 45 + src/basic/arphrd-util.h | 10 + src/basic/audit-util.c | 146 + src/basic/audit-util.h | 17 + src/basic/bitfield.h | 73 + src/basic/btrfs.c | 98 + src/basic/btrfs.h | 9 + src/basic/build.c | 283 + src/basic/build.h | 6 + src/basic/bus-label.c | 79 + src/basic/bus-label.h | 14 + src/basic/cap-list.c | 187 + src/basic/cap-list.h | 21 + src/basic/cap-to-name.awk | 11 + src/basic/capability-util.c | 642 ++ src/basic/capability-util.h | 88 + src/basic/cgroup-util.c | 2434 ++++++ src/basic/cgroup-util.h | 356 + src/basic/chase.c | 1156 +++ src/basic/chase.h | 64 + src/basic/chattr-util.c | 162 + src/basic/chattr-util.h | 64 + src/basic/check-filesystems.sh | 36 + src/basic/compress.c | 1088 +++ src/basic/compress.h | 
109 + src/basic/conf-files.c | 374 + src/basic/conf-files.h | 31 + src/basic/confidential-virt.c | 228 + src/basic/confidential-virt.h | 25 + src/basic/constants.h | 97 + src/basic/coverage.h | 66 + src/basic/devnum-util.c | 138 + src/basic/devnum-util.h | 56 + src/basic/dirent-util.c | 105 + src/basic/dirent-util.h | 64 + src/basic/dns-def.h | 17 + src/basic/efivars.c | 446 ++ src/basic/efivars.h | 102 + src/basic/env-file.c | 647 ++ src/basic/env-file.h | 24 + src/basic/env-util.c | 1095 +++ src/basic/env-util.h | 81 + src/basic/errno-list.c | 37 + src/basic/errno-list.h | 15 + src/basic/errno-to-name.awk | 11 + src/basic/errno-util.h | 206 + src/basic/escape.c | 576 ++ src/basic/escape.h | 72 + src/basic/ether-addr-util.c | 272 + src/basic/ether-addr-util.h | 115 + src/basic/extract-word.c | 297 + src/basic/extract-word.h | 22 + src/basic/fd-util.c | 992 +++ src/basic/fd-util.h | 152 + src/basic/fileio.c | 1573 ++++ src/basic/fileio.h | 175 + src/basic/filesystems-gperf.gperf | 131 + src/basic/filesystems.c | 175 + src/basic/filesystems.h | 42 + src/basic/format-util.c | 101 + src/basic/format-util.h | 105 + src/basic/fs-util.c | 1238 ++++ src/basic/fs-util.h | 142 + src/basic/gcrypt-util.c | 54 + src/basic/gcrypt-util.h | 39 + src/basic/generate-af-list.sh | 8 + src/basic/generate-arphrd-list.sh | 8 + src/basic/generate-cap-list.sh | 8 + src/basic/generate-errno-list.sh | 11 + src/basic/generate-filesystem-list.py | 15 + src/basic/generate-filesystem-switch-case.py | 53 + src/basic/getopt-defs.h | 75 + src/basic/glob-util.c | 105 + src/basic/glob-util.h | 25 + src/basic/glyph-util.c | 147 + src/basic/glyph-util.h | 70 + src/basic/gunicode.c | 111 + src/basic/gunicode.h | 30 + src/basic/hash-funcs.c | 121 + src/basic/hash-funcs.h | 111 + src/basic/hashmap.c | 2160 ++++++ src/basic/hashmap.h | 468 ++ src/basic/hexdecoct.c | 907 +++ src/basic/hexdecoct.h | 53 + src/basic/hmac.c | 60 + src/basic/hmac.h | 12 + src/basic/hostname-util.c | 209 + 
src/basic/hostname-util.h | 71 + src/basic/in-addr-util.c | 984 +++ src/basic/in-addr-util.h | 213 + src/basic/initrd-util.c | 42 + src/basic/initrd-util.h | 7 + src/basic/inotify-util.c | 78 + src/basic/inotify-util.h | 36 + src/basic/io-util.c | 308 + src/basic/io-util.h | 46 + src/basic/ioprio-util.c | 31 + src/basic/ioprio-util.h | 27 + src/basic/iovec-util.c | 70 + src/basic/iovec-util.h | 44 + src/basic/iovec-wrapper.c | 131 + src/basic/iovec-wrapper.h | 42 + src/basic/label.c | 30 + src/basic/label.h | 14 + src/basic/limits-util.c | 192 + src/basic/limits-util.h | 10 + src/basic/linux/README | 8 + src/basic/linux/batman_adv.h | 704 ++ src/basic/linux/btrfs.h | 1173 +++ src/basic/linux/btrfs_tree.h | 1260 ++++ src/basic/linux/can/netlink.h | 185 + src/basic/linux/can/vxcan.h | 13 + src/basic/linux/cfm_bridge.h | 64 + src/basic/linux/fib_rules.h | 90 + src/basic/linux/fou.h | 48 + src/basic/linux/genetlink.h | 103 + src/basic/linux/hdlc/ioctl.h | 94 + src/basic/linux/if.h | 297 + src/basic/linux/if_addr.h | 79 + src/basic/linux/if_bonding.h | 155 + src/basic/linux/if_bridge.h | 826 +++ src/basic/linux/if_ether.h | 181 + src/basic/linux/if_link.h | 1392 ++++ src/basic/linux/if_macsec.h | 194 + src/basic/linux/if_tun.h | 118 + src/basic/linux/if_tunnel.h | 185 + src/basic/linux/in.h | 331 + src/basic/linux/in6.h | 302 + src/basic/linux/ipv6_route.h | 64 + src/basic/linux/l2tp.h | 203 + src/basic/linux/libc-compat.h | 267 + src/basic/linux/mrp_bridge.h | 74 + src/basic/linux/netdevice.h | 66 + src/basic/linux/netfilter/nf_tables.h | 1963 +++++ src/basic/linux/netfilter/nfnetlink.h | 82 + src/basic/linux/netlink.h | 378 + src/basic/linux/nexthop.h | 104 + src/basic/linux/nl80211.h | 7726 +++++++++++++++++++ src/basic/linux/pkt_sched.h | 1281 ++++ src/basic/linux/rtnetlink.h | 826 +++ src/basic/linux/stddef.h | 46 + src/basic/linux/update.sh | 11 + src/basic/linux/wireguard.h | 196 + src/basic/list.h | 209 + src/basic/locale-util.c | 376 + src/basic/locale-util.h | 
55 + src/basic/lock-util.c | 277 + src/basic/lock-util.h | 45 + src/basic/log.c | 1810 +++++ src/basic/log.h | 537 ++ src/basic/login-util.c | 12 + src/basic/login-util.h | 22 + src/basic/macro.h | 392 + src/basic/mallinfo-util.h | 24 + src/basic/math-util.h | 14 + src/basic/memfd-util.c | 201 + src/basic/memfd-util.h | 23 + src/basic/memory-util.c | 41 + src/basic/memory-util.h | 109 + src/basic/mempool.c | 175 + src/basic/mempool.h | 28 + src/basic/memstream-util.c | 75 + src/basic/memstream-util.h | 27 + src/basic/meson.build | 316 + src/basic/missing_audit.h | 24 + src/basic/missing_capability.h | 39 + src/basic/missing_drm.h | 10 + src/basic/missing_fcntl.h | 94 + src/basic/missing_fs.h | 87 + src/basic/missing_input.h | 45 + src/basic/missing_ioprio.h | 59 + src/basic/missing_keyctl.h | 79 + src/basic/missing_loop.h | 24 + src/basic/missing_magic.h | 194 + src/basic/missing_mman.h | 20 + src/basic/missing_mount.h | 9 + src/basic/missing_network.h | 83 + src/basic/missing_prctl.h | 26 + src/basic/missing_random.h | 20 + src/basic/missing_resource.h | 11 + src/basic/missing_sched.h | 26 + src/basic/missing_securebits.h | 18 + src/basic/missing_socket.h | 81 + src/basic/missing_stat.h | 135 + src/basic/missing_stdlib.h | 13 + src/basic/missing_syscall.h | 680 ++ src/basic/missing_syscall_def.h | 1199 +++ src/basic/missing_syscalls.py | 162 + src/basic/missing_threads.h | 15 + src/basic/missing_timerfd.h | 8 + src/basic/missing_type.h | 12 + src/basic/missing_xfs.h | 42 + src/basic/mkdir.c | 269 + src/basic/mkdir.h | 36 + src/basic/mountpoint-util.c | 786 ++ src/basic/mountpoint-util.h | 71 + src/basic/namespace-util.c | 269 + src/basic/namespace-util.h | 48 + src/basic/nss-util.h | 273 + src/basic/nulstr-util.c | 145 + src/basic/nulstr-util.h | 42 + src/basic/ordered-set.c | 103 + src/basic/ordered-set.h | 109 + src/basic/origin-id.h | 36 + src/basic/os-util.c | 442 ++ src/basic/os-util.h | 59 + src/basic/parse-util.c | 806 ++ src/basic/parse-util.h | 161 + 
src/basic/path-lookup.c | 910 +++ src/basic/path-lookup.h | 76 + src/basic/path-util.c | 1434 ++++ src/basic/path-util.h | 222 + src/basic/pcapng.h | 115 + src/basic/percent-util.c | 157 + src/basic/percent-util.h | 65 + src/basic/pidref.c | 285 + src/basic/pidref.h | 65 + src/basic/prioq.c | 309 + src/basic/prioq.h | 33 + src/basic/proc-cmdline.c | 501 ++ src/basic/proc-cmdline.h | 42 + src/basic/process-util.c | 2060 ++++++ src/basic/process-util.h | 230 + src/basic/procfs-util.c | 268 + src/basic/procfs-util.h | 21 + src/basic/psi-util.c | 128 + src/basic/psi-util.h | 35 + src/basic/pthread-util.h | 16 + src/basic/random-util.c | 251 + src/basic/random-util.h | 32 + src/basic/ratelimit.c | 60 + src/basic/ratelimit.h | 30 + src/basic/raw-clone.h | 85 + src/basic/raw-reboot.h | 14 + src/basic/recurse-dir.c | 503 ++ src/basic/recurse-dir.h | 81 + src/basic/replace-var.c | 93 + src/basic/replace-var.h | 4 + src/basic/rlimit-util.c | 428 ++ src/basic/rlimit-util.h | 27 + src/basic/runtime-scope.c | 20 + src/basic/runtime-scope.h | 19 + src/basic/set.h | 155 + src/basic/sigbus.c | 152 + src/basic/sigbus.h | 7 + src/basic/signal-util.c | 303 + src/basic/signal-util.h | 69 + src/basic/siphash24.c | 201 + src/basic/siphash24.h | 53 + src/basic/socket-util.c | 1696 +++++ src/basic/socket-util.h | 387 + src/basic/sort-util.c | 37 + src/basic/sort-util.h | 79 + src/basic/sparse-endian.h | 90 + src/basic/special.h | 124 + src/basic/stat-util.c | 520 ++ src/basic/stat-util.h | 116 + src/basic/static-destruct.h | 105 + src/basic/stdio-util.h | 74 + src/basic/strbuf.c | 181 + src/basic/strbuf.h | 39 + src/basic/string-table.c | 15 + src/basic/string-table.h | 117 + src/basic/string-util.c | 1521 ++++ src/basic/string-util.h | 324 + src/basic/strv.c | 923 +++ src/basic/strv.h | 256 + src/basic/strxcpyx.c | 145 + src/basic/strxcpyx.h | 33 + src/basic/sync-util.c | 159 + src/basic/sync-util.h | 11 + src/basic/syscall-list.txt | 515 ++ src/basic/syscalls-alpha.txt | 515 ++ 
src/basic/syscalls-arc.txt | 515 ++ src/basic/syscalls-arm.txt | 515 ++ src/basic/syscalls-arm64.txt | 515 ++ src/basic/syscalls-i386.txt | 515 ++ src/basic/syscalls-ia64.txt | 604 ++ src/basic/syscalls-loongarch64.txt | 515 ++ src/basic/syscalls-m68k.txt | 515 ++ src/basic/syscalls-mips64.txt | 515 ++ src/basic/syscalls-mips64n32.txt | 515 ++ src/basic/syscalls-mipso32.txt | 515 ++ src/basic/syscalls-parisc.txt | 515 ++ src/basic/syscalls-powerpc.txt | 515 ++ src/basic/syscalls-powerpc64.txt | 515 ++ src/basic/syscalls-riscv32.txt | 515 ++ src/basic/syscalls-riscv64.txt | 515 ++ src/basic/syscalls-s390.txt | 515 ++ src/basic/syscalls-s390x.txt | 515 ++ src/basic/syscalls-sparc.txt | 515 ++ src/basic/syscalls-x86_64.txt | 515 ++ src/basic/sysctl-util.c | 137 + src/basic/sysctl-util.h | 30 + src/basic/syslog-util.c | 131 + src/basic/syslog-util.h | 16 + src/basic/terminal-util.c | 1553 ++++ src/basic/terminal-util.h | 277 + src/basic/time-util.c | 1773 +++++ src/basic/time-util.h | 244 + src/basic/tmpfile-util.c | 472 ++ src/basic/tmpfile-util.h | 44 + src/basic/uid-alloc-range.c | 131 + src/basic/uid-alloc-range.h | 36 + src/basic/uid-range.c | 237 + src/basic/uid-range.h | 34 + src/basic/umask-util.h | 29 + src/basic/unaligned.h | 83 + src/basic/unit-def.c | 338 + src/basic/unit-def.h | 343 + src/basic/unit-file.c | 833 +++ src/basic/unit-file.h | 62 + src/basic/unit-name.c | 916 +++ src/basic/unit-name.h | 69 + src/basic/user-util.c | 1060 +++ src/basic/user-util.h | 157 + src/basic/utf8.c | 630 ++ src/basic/utf8.h | 59 + src/basic/virt.c | 1071 +++ src/basic/virt.h | 70 + src/basic/xattr-util.c | 379 + src/basic/xattr-util.h | 42 + src/battery-check/battery-check.c | 183 + src/battery-check/meson.build | 9 + src/binfmt/binfmt.c | 264 + src/binfmt/meson.build | 17 + src/boot/bless-boot-generator.c | 56 + src/boot/bless-boot.c | 527 ++ src/boot/boot-check-no-failures.c | 113 + src/boot/bootctl-install.c | 1101 +++ src/boot/bootctl-install.h | 6 + 
src/boot/bootctl-random-seed.c | 239 + src/boot/bootctl-random-seed.h | 6 + src/boot/bootctl-reboot-to-firmware.c | 38 + src/boot/bootctl-reboot-to-firmware.h | 3 + src/boot/bootctl-set-efivar.c | 171 + src/boot/bootctl-set-efivar.h | 4 + src/boot/bootctl-status.c | 829 +++ src/boot/bootctl-status.h | 5 + src/boot/bootctl-systemd-efi-options.c | 43 + src/boot/bootctl-systemd-efi-options.h | 4 + src/boot/bootctl-uki.c | 39 + src/boot/bootctl-uki.h | 4 + src/boot/bootctl-util.c | 132 + src/boot/bootctl-util.h | 14 + src/boot/bootctl.c | 516 ++ src/boot/bootctl.h | 46 + src/boot/efi/UEFI_SECURITY.md | 122 + src/boot/efi/addon.c | 14 + src/boot/efi/bcd.c | 306 + src/boot/efi/bcd.h | 6 + src/boot/efi/boot.c | 2748 +++++++ src/boot/efi/console.c | 312 + src/boot/efi/console.h | 38 + src/boot/efi/cpio.c | 512 ++ src/boot/efi/cpio.h | 31 + src/boot/efi/device-path-util.c | 138 + src/boot/efi/device-path-util.h | 27 + src/boot/efi/devicetree.c | 149 + src/boot/efi/devicetree.h | 15 + src/boot/efi/drivers.c | 115 + src/boot/efi/drivers.h | 11 + src/boot/efi/efi-string.c | 1084 +++ src/boot/efi/efi-string.h | 180 + src/boot/efi/efi.h | 459 ++ src/boot/efi/fuzz-bcd.c | 24 + src/boot/efi/fuzz-efi-osrel.c | 28 + src/boot/efi/fuzz-efi-printf.c | 78 + src/boot/efi/fuzz-efi-string.c | 42 + src/boot/efi/graphics.c | 41 + src/boot/efi/graphics.h | 10 + src/boot/efi/initrd.c | 136 + src/boot/efi/initrd.h | 16 + src/boot/efi/linux.c | 156 + src/boot/efi/linux.h | 19 + src/boot/efi/linux_x86.c | 224 + src/boot/efi/log.c | 115 + src/boot/efi/log.h | 32 + src/boot/efi/measure.c | 296 + src/boot/efi/measure.h | 44 + src/boot/efi/meson.build | 409 ++ src/boot/efi/part-discovery.c | 298 + src/boot/efi/part-discovery.h | 12 + src/boot/efi/pe.c | 332 + src/boot/efi/pe.h | 19 + src/boot/efi/proto/block-io.h | 43 + src/boot/efi/proto/console-control.h | 28 + src/boot/efi/proto/device-path.h | 86 + src/boot/efi/proto/dt-fixup.h | 30 + src/boot/efi/proto/file-io.h | 83 + 
src/boot/efi/proto/graphics-output.h | 78 + src/boot/efi/proto/load-file.h | 21 + src/boot/efi/proto/loaded-image.h | 29 + src/boot/efi/proto/rng.h | 20 + src/boot/efi/proto/security-arch.h | 32 + src/boot/efi/proto/shell-parameters.h | 15 + src/boot/efi/proto/simple-text-io.h | 182 + src/boot/efi/proto/tcg.h | 117 + src/boot/efi/random-seed.c | 325 + src/boot/efi/random-seed.h | 6 + src/boot/efi/secure-boot.c | 223 + src/boot/efi/secure-boot.h | 26 + src/boot/efi/shim.c | 108 + src/boot/efi/shim.h | 16 + src/boot/efi/splash.c | 334 + src/boot/efi/splash.h | 6 + src/boot/efi/stub.c | 816 ++ src/boot/efi/test-bcd.c | 162 + src/boot/efi/test-efi-string.c | 794 ++ src/boot/efi/ticks.c | 111 + src/boot/efi/ticks.h | 6 + src/boot/efi/ubsan.c | 46 + src/boot/efi/util.c | 705 ++ src/boot/efi/util.h | 213 + src/boot/efi/vmm.c | 426 ++ src/boot/efi/vmm.h | 13 + src/boot/measure.c | 1085 +++ src/boot/meson.build | 69 + src/busctl/busctl-introspect.c | 715 ++ src/busctl/busctl-introspect.h | 15 + src/busctl/busctl.c | 2593 +++++++ src/busctl/meson.build | 20 + src/busctl/test-busctl-introspect.c | 364 + src/cgls/cgls.c | 329 + src/cgls/meson.build | 9 + src/cgroups-agent/cgroups-agent.c | 54 + src/cgroups-agent/meson.build | 8 + src/cgtop/cgtop.c | 1110 +++ src/cgtop/meson.build | 9 + src/core/all-units.h | 15 + src/core/apparmor-setup.c | 99 + src/core/apparmor-setup.h | 4 + src/core/audit-fd.c | 62 + src/core/audit-fd.h | 5 + src/core/automount.c | 1149 +++ src/core/automount.h | 45 + src/core/bpf-devices.c | 505 ++ src/core/bpf-devices.h | 21 + src/core/bpf-firewall.c | 974 +++ src/core/bpf-firewall.h | 25 + src/core/bpf-foreign.c | 154 + src/core/bpf-foreign.h | 15 + src/core/bpf-lsm.c | 320 + src/core/bpf-lsm.h | 28 + src/core/bpf-socket-bind.c | 244 + src/core/bpf-socket-bind.h | 15 + src/core/bpf-util.c | 36 + src/core/bpf-util.h | 5 + src/core/bpf/restrict_fs/meson.build | 24 + src/core/bpf/restrict_fs/restrict-fs-skel.h | 14 + 
src/core/bpf/restrict_fs/restrict-fs.bpf.c | 82 + src/core/bpf/restrict_ifaces/meson.build | 24 + .../bpf/restrict_ifaces/restrict-ifaces-skel.h | 14 + src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c | 52 + src/core/bpf/socket_bind/meson.build | 24 + src/core/bpf/socket_bind/socket-bind-api.bpf.h | 51 + src/core/bpf/socket_bind/socket-bind-skel.h | 14 + src/core/bpf/socket_bind/socket-bind.bpf.c | 111 + src/core/cgroup.c | 4665 ++++++++++++ src/core/cgroup.h | 429 ++ src/core/core-varlink.c | 652 ++ src/core/core-varlink.h | 16 + src/core/crash-handler.c | 193 + src/core/crash-handler.h | 7 + src/core/dbus-automount.c | 68 + src/core/dbus-automount.h | 11 + src/core/dbus-cgroup.c | 2287 ++++++ src/core/dbus-cgroup.h | 15 + src/core/dbus-device.c | 11 + src/core/dbus-device.h | 6 + src/core/dbus-execute.c | 3758 ++++++++++ src/core/dbus-execute.h | 35 + src/core/dbus-job.c | 374 + src/core/dbus-job.h | 20 + src/core/dbus-kill.c | 81 + src/core/dbus-kill.h | 12 + src/core/dbus-manager.c | 3628 +++++++++ src/core/dbus-manager.h | 18 + src/core/dbus-mount.c | 174 + src/core/dbus-mount.h | 12 + src/core/dbus-path.c | 164 + src/core/dbus-path.h | 11 + src/core/dbus-scope.c | 318 + src/core/dbus-scope.h | 19 + src/core/dbus-service.c | 791 ++ src/core/dbus-service.h | 15 + src/core/dbus-slice.c | 34 + src/core/dbus-slice.h | 12 + src/core/dbus-socket.c | 470 ++ src/core/dbus-socket.h | 12 + src/core/dbus-swap.c | 55 + src/core/dbus-swap.h | 16 + src/core/dbus-target.c | 9 + src/core/dbus-target.h | 6 + src/core/dbus-timer.c | 364 + src/core/dbus-timer.h | 11 + src/core/dbus-unit.c | 2629 +++++++ src/core/dbus-unit.h | 55 + src/core/dbus-util.c | 286 + src/core/dbus-util.h | 256 + src/core/dbus.c | 1273 ++++ src/core/dbus.h | 37 + src/core/device.c | 1301 ++++ src/core/device.h | 44 + src/core/dynamic-user.c | 871 +++ src/core/dynamic-user.h | 49 + src/core/efi-random.c | 34 + src/core/efi-random.h | 4 + src/core/emergency-action.c | 224 + src/core/emergency-action.h | 
45 + src/core/exec-credential.c | 1023 +++ src/core/exec-credential.h | 54 + src/core/exec-invoke.c | 5235 +++++++++++++ src/core/exec-invoke.h | 16 + src/core/execute-serialize.c | 3896 ++++++++++ src/core/execute-serialize.h | 23 + src/core/execute.c | 2742 +++++++ src/core/execute.h | 701 ++ src/core/executor.c | 272 + src/core/fuzz-execute-serialize.c | 89 + src/core/fuzz-manager-serialize.c | 36 + src/core/fuzz-manager-serialize.options | 2 + src/core/fuzz-unit-file.c | 86 + src/core/fuzz-unit-file.options | 2 + src/core/generator-setup.c | 58 + src/core/generator-setup.h | 8 + src/core/ima-setup.c | 92 + src/core/ima-setup.h | 9 + src/core/import-creds.c | 938 +++ src/core/import-creds.h | 4 + src/core/job.c | 1712 +++++ src/core/job.h | 250 + src/core/kill.c | 56 + src/core/kill.h | 56 + src/core/kmod-setup.c | 201 + src/core/kmod-setup.h | 4 + src/core/load-dropin.c | 130 + src/core/load-dropin.h | 20 + src/core/load-fragment-gperf-nulstr.awk | 16 + src/core/load-fragment-gperf.gperf.in | 595 ++ src/core/load-fragment.c | 6735 +++++++++++++++++ src/core/load-fragment.h | 165 + src/core/main.c | 3227 ++++++++ src/core/main.h | 9 + src/core/manager-dump.c | 119 + src/core/manager-dump.h | 13 + src/core/manager-serialize.c | 539 ++ src/core/manager-serialize.h | 13 + src/core/manager.c | 5039 +++++++++++++ src/core/manager.h | 646 ++ src/core/meson.build | 260 + src/core/mount.c | 2502 +++++++ src/core/mount.h | 110 + src/core/namespace.c | 3047 ++++++++ src/core/namespace.h | 200 + src/core/org.freedesktop.systemd1.conf | 452 ++ src/core/org.freedesktop.systemd1.policy.in | 83 + src/core/org.freedesktop.systemd1.service | 13 + src/core/path.c | 1075 +++ src/core/path.h | 89 + src/core/restrict-ifaces.c | 200 + src/core/restrict-ifaces.h | 16 + src/core/scope.c | 829 +++ src/core/scope.h | 52 + src/core/selinux-access.c | 288 + src/core/selinux-access.h | 14 + src/core/selinux-setup.c | 106 + src/core/selinux-setup.h | 6 + src/core/service.c | 5161 
+++++++++++++ src/core/service.h | 290 + src/core/show-status.c | 128 + src/core/show-status.h | 44 + src/core/slice.c | 462 ++ src/core/slice.h | 18 + src/core/smack-setup.c | 393 + src/core/smack-setup.h | 10 + src/core/socket.c | 3617 +++++++++ src/core/socket.h | 204 + src/core/swap.c | 1680 +++++ src/core/swap.h | 103 + src/core/system.conf.in | 83 + src/core/systemd.pc.in | 108 + src/core/target.c | 216 + src/core/target.h | 16 + src/core/timer.c | 1106 +++ src/core/timer.h | 91 + src/core/transaction.c | 1261 ++++ src/core/transaction.h | 51 + src/core/unit-dependency-atom.c | 251 + src/core/unit-dependency-atom.h | 92 + src/core/unit-printf.c | 265 + src/core/unit-printf.h | 26 + src/core/unit-serialize.c | 890 +++ src/core/unit-serialize.h | 16 + src/core/unit.c | 6617 +++++++++++++++++ src/core/unit.h | 1249 ++++ src/core/user.conf.in | 59 + src/coredump/coredump-vacuum.c | 244 + src/coredump/coredump-vacuum.h | 7 + src/coredump/coredump.c | 1718 +++++ src/coredump/coredump.conf | 27 + src/coredump/coredumpctl.c | 1418 ++++ src/coredump/meson.build | 48 + src/coredump/test-coredump-vacuum.c | 15 + src/creds/creds.c | 967 +++ src/creds/meson.build | 25 + src/cryptenroll/cryptenroll-fido2.c | 151 + src/cryptenroll/cryptenroll-fido2.h | 24 + src/cryptenroll/cryptenroll-list.c | 127 + src/cryptenroll/cryptenroll-list.h | 6 + src/cryptenroll/cryptenroll-password.c | 181 + src/cryptenroll/cryptenroll-password.h | 9 + src/cryptenroll/cryptenroll-pkcs11.c | 100 + src/cryptenroll/cryptenroll-pkcs11.h | 16 + src/cryptenroll/cryptenroll-recovery.c | 101 + src/cryptenroll/cryptenroll-recovery.h | 8 + src/cryptenroll/cryptenroll-tpm2.c | 383 + src/cryptenroll/cryptenroll-tpm2.h | 17 + src/cryptenroll/cryptenroll-wipe.c | 445 ++ src/cryptenroll/cryptenroll-wipe.h | 12 + src/cryptenroll/cryptenroll.c | 762 ++ src/cryptenroll/cryptenroll.h | 36 + src/cryptenroll/meson.build | 36 + src/cryptsetup/cryptsetup-generator.c | 940 +++ src/cryptsetup/cryptsetup-keyfile.c | 62 + 
src/cryptsetup/cryptsetup-keyfile.h | 12 + src/cryptsetup/cryptsetup-pkcs11.c | 173 + src/cryptsetup/cryptsetup-pkcs11.h | 64 + .../cryptsetup-token-systemd-fido2.c | 218 + .../cryptsetup-token-systemd-pkcs11.c | 144 + .../cryptsetup-token-systemd-tpm2.c | 352 + .../cryptsetup-tokens/cryptsetup-token-util.c | 70 + .../cryptsetup-tokens/cryptsetup-token-util.h | 40 + .../cryptsetup-tokens/cryptsetup-token.h | 19 + .../cryptsetup-tokens/cryptsetup-token.sym | 19 + src/cryptsetup/cryptsetup-tokens/luks2-fido2.c | 158 + src/cryptsetup/cryptsetup-tokens/luks2-fido2.h | 24 + src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.c | 272 + src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.h | 21 + src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c | 109 + src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h | 30 + src/cryptsetup/cryptsetup-tokens/meson.build | 75 + src/cryptsetup/cryptsetup-tpm2.c | 314 + src/cryptsetup/cryptsetup-tpm2.h | 126 + src/cryptsetup/cryptsetup.c | 2423 ++++++ src/cryptsetup/meson.build | 42 + src/debug-generator/debug-generator.c | 178 + src/debug-generator/meson.build | 8 + src/delta/delta.c | 637 ++ src/delta/meson.build | 9 + src/detect-virt/detect-virt.c | 202 + src/detect-virt/meson.build | 9 + src/dissect/dissect.c | 1927 +++++ src/dissect/meson.build | 17 + .../environment-d-generator.c | 99 + src/environment-d-generator/meson.build | 17 + src/escape/escape.c | 281 + src/escape/meson.build | 9 + src/firstboot/firstboot.c | 1763 +++++ src/firstboot/meson.build | 11 + src/fsck/fsck.c | 420 ++ src/fsck/meson.build | 8 + src/fstab-generator/fstab-generator.c | 1646 +++++ src/fstab-generator/meson.build | 12 + src/fundamental/bootspec-fundamental.c | 62 + src/fundamental/bootspec-fundamental.h | 17 + src/fundamental/confidential-virt-fundamental.h | 72 + src/fundamental/efivars-fundamental.c | 37 + src/fundamental/efivars-fundamental.h | 50 + src/fundamental/logarithm.h | 59 + src/fundamental/macro-fundamental.h | 515 ++ src/fundamental/memory-util-fundamental.h | 
108 + src/fundamental/meson.build | 11 + src/fundamental/sbat.h | 14 + src/fundamental/sha256.c | 285 + src/fundamental/sha256.h | 39 + src/fundamental/string-util-fundamental.c | 228 + src/fundamental/string-util-fundamental.h | 121 + src/fundamental/tpm2-pcr.h | 51 + src/fundamental/uki.c | 23 + src/fundamental/uki.h | 28 + src/fundamental/unaligned-fundamental.h | 40 + src/fuzz/fuzz-bootspec-gen.py | 21 + src/fuzz/fuzz-bootspec.c | 123 + src/fuzz/fuzz-bootspec.options | 2 + src/fuzz/fuzz-bus-label.c | 20 + src/fuzz/fuzz-calendarspec.c | 57 + src/fuzz/fuzz-catalog.c | 25 + src/fuzz/fuzz-compress.c | 66 + src/fuzz/fuzz-env-file.c | 28 + src/fuzz/fuzz-env-file.options | 2 + src/fuzz/fuzz-hostname-setup.c | 20 + src/fuzz/fuzz-json.c | 115 + src/fuzz/fuzz-main.c | 56 + src/fuzz/fuzz-time-util.c | 26 + src/fuzz/fuzz-udev-database.c | 25 + src/fuzz/fuzz-varlink-idl.c | 34 + src/fuzz/fuzz-varlink.c | 129 + src/fuzz/fuzz.h | 40 + src/fuzz/meson.build | 16 + src/getty-generator/getty-generator.c | 298 + src/getty-generator/meson.build | 8 + src/gpt-auto-generator/gpt-auto-generator.c | 1019 +++ src/gpt-auto-generator/meson.build | 10 + src/hibernate-resume/hibernate-resume-config.c | 266 + src/hibernate-resume/hibernate-resume-config.h | 55 + src/hibernate-resume/hibernate-resume-generator.c | 130 + src/hibernate-resume/hibernate-resume.c | 83 + src/hibernate-resume/meson.build | 20 + src/home/home-util.c | 139 + src/home/home-util.h | 37 + src/home/homectl-fido2.c | 211 + src/home/homectl-fido2.h | 7 + src/home/homectl-pkcs11.c | 218 + src/home/homectl-pkcs11.h | 11 + src/home/homectl-recovery-key.c | 165 + src/home/homectl-recovery-key.h | 6 + src/home/homectl.c | 3875 ++++++++++ src/home/homed-bus.c | 66 + src/home/homed-bus.h | 10 + src/home/homed-conf.c | 42 + src/home/homed-conf.h | 12 + src/home/homed-gperf.gperf | 22 + src/home/homed-home-bus.c | 926 +++ src/home/homed-home-bus.h | 34 + src/home/homed-home.c | 3214 ++++++++ src/home/homed-home.h | 224 + 
src/home/homed-manager-bus.c | 859 +++ src/home/homed-manager-bus.h | 6 + src/home/homed-manager.c | 2224 ++++++ src/home/homed-manager.h | 93 + src/home/homed-operation.c | 76 + src/home/homed-operation.h | 63 + src/home/homed-varlink.c | 359 + src/home/homed-varlink.h | 8 + src/home/homed.c | 51 + src/home/homed.conf | 21 + src/home/homework-cifs.c | 254 + src/home/homework-cifs.h | 11 + src/home/homework-directory.c | 313 + src/home/homework-directory.h | 10 + src/home/homework-fido2.c | 74 + src/home/homework-fido2.h | 6 + src/home/homework-fscrypt.c | 674 ++ src/home/homework-fscrypt.h | 11 + src/home/homework-luks.c | 3925 ++++++++++ src/home/homework-luks.h | 49 + src/home/homework-mount.c | 309 + src/home/homework-mount.h | 10 + src/home/homework-password-cache.c | 57 + src/home/homework-password-cache.h | 28 + src/home/homework-pkcs11.c | 102 + src/home/homework-pkcs11.h | 21 + src/home/homework-quota.c | 118 + src/home/homework-quota.h | 8 + src/home/homework.c | 1979 +++++ src/home/homework.h | 97 + src/home/meson.build | 140 + src/home/org.freedesktop.home1.conf | 201 + src/home/org.freedesktop.home1.policy | 72 + src/home/org.freedesktop.home1.service | 7 + src/home/pam_systemd_home.c | 1064 +++ src/home/pam_systemd_home.sym | 12 + src/home/user-record-password-quality.c | 87 + src/home/user-record-password-quality.h | 7 + src/home/user-record-sign.c | 161 + src/home/user-record-sign.h | 19 + src/home/user-record-util.c | 1512 ++++ src/home/user-record-util.h | 65 + src/hostname/hostnamectl.c | 740 ++ src/hostname/hostnamed.c | 1633 +++++ src/hostname/meson.build | 25 + src/hostname/org.freedesktop.hostname1.conf | 29 + src/hostname/org.freedesktop.hostname1.policy | 80 + src/hostname/org.freedesktop.hostname1.service | 12 + src/hwdb/hwdb.c | 134 + src/hwdb/meson.build | 12 + src/id128/id128.c | 285 + src/id128/meson.build | 9 + src/import/curl-util.c | 384 + src/import/curl-util.h | 39 + src/import/export-raw.c | 324 + src/import/export-raw.h | 18 + 
src/import/export-tar.c | 326 + src/import/export-tar.h | 18 + src/import/export.c | 303 + src/import/import-common.c | 304 + src/import/import-common.h | 28 + src/import/import-compress.c | 478 ++ src/import/import-compress.h | 48 + src/import/import-fs.c | 392 + src/import/import-pubring.gpg | Bin 0 -> 9551 bytes src/import/import-raw.c | 529 ++ src/import/import-raw.h | 19 + src/import/import-tar.c | 380 + src/import/import-tar.h | 19 + src/import/import.c | 492 ++ src/import/importd.c | 1422 ++++ src/import/meson.build | 125 + src/import/org.freedesktop.import1.conf | 84 + src/import/org.freedesktop.import1.policy | 51 + src/import/org.freedesktop.import1.service | 14 + src/import/pull-common.c | 669 ++ src/import/pull-common.h | 49 + src/import/pull-job.c | 784 ++ src/import/pull-job.h | 93 + src/import/pull-raw.c | 983 +++ src/import/pull-raw.h | 19 + src/import/pull-tar.c | 677 ++ src/import/pull-tar.h | 19 + src/import/pull.c | 556 ++ src/import/qcow2-util.c | 333 + src/import/qcow2-util.h | 5 + src/import/test-qcow2.c | 42 + src/initctl/initctl.c | 355 + src/initctl/meson.build | 9 + src/integritysetup/integrity-util.c | 86 + src/integritysetup/integrity-util.h | 19 + src/integritysetup/integritysetup-generator.c | 176 + src/integritysetup/integritysetup.c | 203 + src/integritysetup/meson.build | 21 + src/journal-remote/browse.html | 548 ++ src/journal-remote/fuzz-journal-remote.c | 95 + src/journal-remote/fuzz-journal-remote.options | 2 + src/journal-remote/journal-gatewayd.c | 1092 +++ src/journal-remote/journal-remote-main.c | 1161 +++ src/journal-remote/journal-remote-parse.c | 87 + src/journal-remote/journal-remote-parse.h | 20 + src/journal-remote/journal-remote-write.c | 148 + src/journal-remote/journal-remote-write.h | 40 + src/journal-remote/journal-remote.c | 569 ++ src/journal-remote/journal-remote.conf.in | 28 + src/journal-remote/journal-remote.h | 67 + src/journal-remote/journal-upload-journal.c | 409 ++ src/journal-remote/journal-upload.c | 
877 +++ src/journal-remote/journal-upload.conf.in | 23 + src/journal-remote/journal-upload.h | 74 + src/journal-remote/log-generator.py | 72 + src/journal-remote/meson.build | 116 + src/journal-remote/microhttpd-util.c | 295 + src/journal-remote/microhttpd-util.h | 105 + src/journal/bsod.c | 311 + src/journal/cat.c | 185 + src/journal/fuzz-journald-audit.c | 17 + src/journal/fuzz-journald-kmsg.c | 20 + src/journal/fuzz-journald-native-fd.c | 43 + src/journal/fuzz-journald-native.c | 12 + src/journal/fuzz-journald-stream.c | 36 + src/journal/fuzz-journald-stream.options | 2 + src/journal/fuzz-journald-syslog.c | 12 + src/journal/fuzz-journald.c | 45 + src/journal/fuzz-journald.h | 12 + src/journal/journalctl.c | 2631 +++++++ src/journal/journald-audit.c | 556 ++ src/journal/journald-audit.h | 11 + src/journal/journald-client.c | 113 + src/journal/journald-client.h | 7 + src/journal/journald-console.c | 107 + src/journal/journald-console.h | 6 + src/journal/journald-context.c | 799 ++ src/journal/journald-context.h | 106 + src/journal/journald-gperf.gperf | 53 + src/journal/journald-kmsg.c | 441 ++ src/journal/journald-kmsg.h | 13 + src/journal/journald-native.c | 536 ++ src/journal/journald-native.h | 23 + src/journal/journald-rate-limit.c | 256 + src/journal/journald-rate-limit.h | 10 + src/journal/journald-server.c | 2914 ++++++++ src/journal/journald-server.h | 241 + src/journal/journald-stream.c | 1004 +++ src/journal/journald-stream.h | 15 + src/journal/journald-syslog.c | 533 ++ src/journal/journald-syslog.h | 15 + src/journal/journald-wall.c | 54 + src/journal/journald-wall.h | 8 + src/journal/journald.c | 138 + src/journal/journald.conf | 49 + src/journal/meson.build | 182 + src/journal/test-journald-config.c | 50 + src/journal/test-journald-syslog.c | 70 + src/journal/test-journald-tables.c | 14 + src/kernel-install/50-depmod.install | 53 + src/kernel-install/60-ukify.install.in | 265 + src/kernel-install/90-loaderentry.install.in | 210 + 
src/kernel-install/90-uki-copy.install | 97 + src/kernel-install/install.conf | 12 + src/kernel-install/kernel-install.c | 1743 +++++ src/kernel-install/meson.build | 51 + src/kernel-install/test-kernel-install.sh | 333 + src/libsystemd-network/arp-util.c | 139 + src/libsystemd-network/arp-util.h | 36 + src/libsystemd-network/dhcp-client-internal.h | 50 + src/libsystemd-network/dhcp-identifier.c | 209 + src/libsystemd-network/dhcp-identifier.h | 87 + src/libsystemd-network/dhcp-lease-internal.h | 98 + src/libsystemd-network/dhcp-network.c | 287 + src/libsystemd-network/dhcp-network.h | 35 + src/libsystemd-network/dhcp-option.c | 461 ++ src/libsystemd-network/dhcp-option.h | 46 + src/libsystemd-network/dhcp-packet.c | 193 + src/libsystemd-network/dhcp-packet.h | 31 + src/libsystemd-network/dhcp-protocol.h | 102 + src/libsystemd-network/dhcp-server-internal.h | 139 + src/libsystemd-network/dhcp6-client-internal.h | 10 + src/libsystemd-network/dhcp6-internal.h | 104 + src/libsystemd-network/dhcp6-lease-internal.h | 90 + src/libsystemd-network/dhcp6-network.c | 78 + src/libsystemd-network/dhcp6-option.c | 979 +++ src/libsystemd-network/dhcp6-option.h | 105 + src/libsystemd-network/dhcp6-protocol.c | 96 + src/libsystemd-network/dhcp6-protocol.h | 158 + src/libsystemd-network/fuzz-dhcp-client.c | 83 + src/libsystemd-network/fuzz-dhcp-server-relay.c | 48 + src/libsystemd-network/fuzz-dhcp-server.c | 102 + src/libsystemd-network/fuzz-dhcp6-client.c | 111 + src/libsystemd-network/fuzz-dhcp6-client.options | 2 + src/libsystemd-network/fuzz-lldp-rx.c | 45 + src/libsystemd-network/fuzz-lldp-rx.options | 2 + src/libsystemd-network/fuzz-ndisc-rs.c | 40 + src/libsystemd-network/fuzz-ndisc-rs.options | 2 + src/libsystemd-network/icmp6-util-unix.c | 53 + src/libsystemd-network/icmp6-util-unix.h | 9 + src/libsystemd-network/icmp6-util.c | 203 + src/libsystemd-network/icmp6-util.h | 28 + src/libsystemd-network/lldp-neighbor.c | 795 ++ src/libsystemd-network/lldp-neighbor.h | 92 + 
src/libsystemd-network/lldp-network.c | 70 + src/libsystemd-network/lldp-network.h | 6 + src/libsystemd-network/lldp-rx-internal.h | 48 + src/libsystemd-network/meson.build | 121 + src/libsystemd-network/ndisc-internal.h | 51 + src/libsystemd-network/ndisc-protocol.c | 34 + src/libsystemd-network/ndisc-protocol.h | 31 + src/libsystemd-network/ndisc-router.c | 913 +++ src/libsystemd-network/ndisc-router.h | 49 + src/libsystemd-network/network-common.c | 126 + src/libsystemd-network/network-common.h | 49 + src/libsystemd-network/network-internal.c | 239 + src/libsystemd-network/network-internal.h | 31 + src/libsystemd-network/radv-internal.h | 222 + src/libsystemd-network/sd-dhcp-client.c | 2568 +++++++ src/libsystemd-network/sd-dhcp-lease.c | 1607 ++++ src/libsystemd-network/sd-dhcp-server.c | 1792 +++++ src/libsystemd-network/sd-dhcp6-client.c | 1594 ++++ src/libsystemd-network/sd-dhcp6-lease.c | 964 +++ src/libsystemd-network/sd-ipv4acd.c | 617 ++ src/libsystemd-network/sd-ipv4ll.c | 365 + src/libsystemd-network/sd-lldp-rx.c | 524 ++ src/libsystemd-network/sd-lldp-tx.c | 628 ++ src/libsystemd-network/sd-ndisc.c | 381 + src/libsystemd-network/sd-radv.c | 1161 +++ src/libsystemd-network/test-acd.c | 94 + src/libsystemd-network/test-dhcp-client.c | 562 ++ src/libsystemd-network/test-dhcp-option.c | 386 + src/libsystemd-network/test-dhcp-server.c | 330 + src/libsystemd-network/test-dhcp6-client.c | 1127 +++ src/libsystemd-network/test-ipv4ll-manual.c | 115 + src/libsystemd-network/test-ipv4ll.c | 206 + src/libsystemd-network/test-lldp-rx.c | 378 + src/libsystemd-network/test-ndisc-ra.c | 376 + src/libsystemd-network/test-ndisc-rs.c | 339 + src/libsystemd-network/test-sd-dhcp-lease.c | 86 + src/libsystemd/libsystemd.pc.in | 20 + src/libsystemd/libsystemd.sym | 836 +++ src/libsystemd/meson.build | 265 + src/libsystemd/sd-bus/bus-common-errors.c | 151 + src/libsystemd/sd-bus/bus-common-errors.h | 155 + src/libsystemd/sd-bus/bus-container.c | 103 + 
src/libsystemd/sd-bus/bus-container.h | 6 + src/libsystemd/sd-bus/bus-control.c | 1038 +++ src/libsystemd/sd-bus/bus-control.h | 9 + src/libsystemd/sd-bus/bus-convenience.c | 824 +++ src/libsystemd/sd-bus/bus-creds.c | 1337 ++++ src/libsystemd/sd-bus/bus-creds.h | 72 + src/libsystemd/sd-bus/bus-dump.c | 649 ++ src/libsystemd/sd-bus/bus-dump.h | 12 + src/libsystemd/sd-bus/bus-error.c | 628 ++ src/libsystemd/sd-bus/bus-error.h | 57 + src/libsystemd/sd-bus/bus-internal.c | 338 + src/libsystemd/sd-bus/bus-internal.h | 427 ++ src/libsystemd/sd-bus/bus-introspect.c | 290 + src/libsystemd/sd-bus/bus-introspect.h | 25 + src/libsystemd/sd-bus/bus-kernel.c | 44 + src/libsystemd/sd-bus/bus-kernel.h | 24 + src/libsystemd/sd-bus/bus-match.c | 1058 +++ src/libsystemd/sd-bus/bus-match.h | 82 + src/libsystemd/sd-bus/bus-message.c | 4712 ++++++++++++ src/libsystemd/sd-bus/bus-message.h | 191 + src/libsystemd/sd-bus/bus-objects.c | 3033 ++++++++ src/libsystemd/sd-bus/bus-objects.h | 20 + src/libsystemd/sd-bus/bus-protocol.h | 88 + src/libsystemd/sd-bus/bus-signature.c | 146 + src/libsystemd/sd-bus/bus-signature.h | 10 + src/libsystemd/sd-bus/bus-slot.c | 311 + src/libsystemd/sd-bus/bus-slot.h | 10 + src/libsystemd/sd-bus/bus-socket.c | 1428 ++++ src/libsystemd/sd-bus/bus-socket.h | 20 + src/libsystemd/sd-bus/bus-track.c | 495 ++ src/libsystemd/sd-bus/bus-track.h | 5 + src/libsystemd/sd-bus/bus-type.c | 162 + src/libsystemd/sd-bus/bus-type.h | 16 + src/libsystemd/sd-bus/fuzz-bus-match.c | 86 + src/libsystemd/sd-bus/fuzz-bus-match.options | 2 + src/libsystemd/sd-bus/fuzz-bus-message.c | 42 + src/libsystemd/sd-bus/sd-bus.c | 4441 +++++++++++ src/libsystemd/sd-bus/test-bus-address.c | 61 + src/libsystemd/sd-bus/test-bus-benchmark.c | 326 + src/libsystemd/sd-bus/test-bus-chat.c | 539 ++ src/libsystemd/sd-bus/test-bus-cleanup.c | 105 + src/libsystemd/sd-bus/test-bus-creds.c | 34 + src/libsystemd/sd-bus/test-bus-error.c | 294 + src/libsystemd/sd-bus/test-bus-introspect.c | 33 + 
src/libsystemd/sd-bus/test-bus-marshal.c | 418 ++ src/libsystemd/sd-bus/test-bus-match.c | 145 + src/libsystemd/sd-bus/test-bus-objects.c | 677 ++ src/libsystemd/sd-bus/test-bus-peersockaddr.c | 127 + src/libsystemd/sd-bus/test-bus-queue-ref-cycle.c | 56 + src/libsystemd/sd-bus/test-bus-server.c | 185 + src/libsystemd/sd-bus/test-bus-signature.c | 150 + src/libsystemd/sd-bus/test-bus-track.c | 151 + src/libsystemd/sd-bus/test-bus-vtable-cc.cc | 1 + src/libsystemd/sd-bus/test-bus-vtable.c | 76 + src/libsystemd/sd-bus/test-bus-watch-bind.c | 228 + src/libsystemd/sd-bus/test-vtable-data.h | 132 + src/libsystemd/sd-daemon/sd-daemon.c | 775 ++ .../sd-device/device-enumerator-private.h | 32 + src/libsystemd/sd-device/device-enumerator.c | 1194 +++ src/libsystemd/sd-device/device-filter.c | 115 + src/libsystemd/sd-device/device-filter.h | 13 + src/libsystemd/sd-device/device-internal.h | 117 + src/libsystemd/sd-device/device-monitor-private.h | 22 + src/libsystemd/sd-device/device-monitor.c | 929 +++ src/libsystemd/sd-device/device-private.c | 903 +++ src/libsystemd/sd-device/device-private.h | 74 + src/libsystemd/sd-device/device-util.c | 141 + src/libsystemd/sd-device/device-util.h | 104 + src/libsystemd/sd-device/sd-device.c | 2715 +++++++ src/libsystemd/sd-device/test-device-util.c | 23 + src/libsystemd/sd-device/test-sd-device-monitor.c | 344 + src/libsystemd/sd-device/test-sd-device-thread.c | 51 + src/libsystemd/sd-device/test-sd-device.c | 678 ++ src/libsystemd/sd-event/event-source.h | 239 + src/libsystemd/sd-event/event-util.c | 153 + src/libsystemd/sd-event/event-util.h | 34 + src/libsystemd/sd-event/sd-event.c | 5357 ++++++++++++++ src/libsystemd/sd-event/test-event.c | 902 +++ src/libsystemd/sd-hwdb/hwdb-internal.h | 89 + src/libsystemd/sd-hwdb/sd-hwdb.c | 436 ++ src/libsystemd/sd-id128/id128-util.c | 265 + src/libsystemd/sd-id128/id128-util.h | 58 + src/libsystemd/sd-id128/sd-id128.c | 382 + src/libsystemd/sd-journal/audit-type.c | 6 + 
src/libsystemd/sd-journal/audit-type.h | 22 + src/libsystemd/sd-journal/audit_type-to-name.awk | 14 + src/libsystemd/sd-journal/catalog.c | 743 ++ src/libsystemd/sd-journal/catalog.h | 19 + src/libsystemd/sd-journal/fsprg.c | 381 + src/libsystemd/sd-journal/fsprg.h | 61 + .../sd-journal/generate-audit_type-list.sh | 17 + src/libsystemd/sd-journal/journal-authenticate.c | 525 ++ src/libsystemd/sd-journal/journal-authenticate.h | 23 + src/libsystemd/sd-journal/journal-def.h | 269 + src/libsystemd/sd-journal/journal-file.c | 4696 ++++++++++++ src/libsystemd/sd-journal/journal-file.h | 393 + src/libsystemd/sd-journal/journal-internal.h | 142 + src/libsystemd/sd-journal/journal-send.c | 576 ++ src/libsystemd/sd-journal/journal-send.h | 7 + src/libsystemd/sd-journal/journal-vacuum.c | 330 + src/libsystemd/sd-journal/journal-vacuum.h | 9 + src/libsystemd/sd-journal/journal-verify.c | 1436 ++++ src/libsystemd/sd-journal/journal-verify.h | 6 + src/libsystemd/sd-journal/lookup3.c | 1002 +++ src/libsystemd/sd-journal/lookup3.h | 23 + src/libsystemd/sd-journal/mmap-cache.c | 562 ++ src/libsystemd/sd-journal/mmap-cache.h | 60 + src/libsystemd/sd-journal/sd-journal.c | 3528 +++++++++ src/libsystemd/sd-journal/test-audit-type.c | 24 + src/libsystemd/sd-journal/test-catalog.c | 235 + src/libsystemd/sd-journal/test-journal-append.c | 269 + src/libsystemd/sd-journal/test-journal-enum.c | 37 + src/libsystemd/sd-journal/test-journal-file.c | 45 + src/libsystemd/sd-journal/test-journal-flush.c | 118 + src/libsystemd/sd-journal/test-journal-init.c | 68 + .../sd-journal/test-journal-interleaving.c | 737 ++ src/libsystemd/sd-journal/test-journal-match.c | 61 + src/libsystemd/sd-journal/test-journal-send.c | 111 + src/libsystemd/sd-journal/test-journal-stream.c | 201 + src/libsystemd/sd-journal/test-journal-verify.c | 210 + src/libsystemd/sd-journal/test-journal.c | 280 + src/libsystemd/sd-journal/test-mmap-cache.c | 68 + src/libsystemd/sd-login/sd-login.c | 1323 ++++ 
src/libsystemd/sd-login/test-login.c | 334 + src/libsystemd/sd-netlink/netlink-genl.c | 488 ++ src/libsystemd/sd-netlink/netlink-genl.h | 8 + src/libsystemd/sd-netlink/netlink-internal.h | 212 + src/libsystemd/sd-netlink/netlink-message-nfnl.c | 417 ++ src/libsystemd/sd-netlink/netlink-message-rtnl.c | 1204 +++ src/libsystemd/sd-netlink/netlink-message.c | 1421 ++++ src/libsystemd/sd-netlink/netlink-slot.c | 188 + src/libsystemd/sd-netlink/netlink-slot.h | 14 + src/libsystemd/sd-netlink/netlink-socket.c | 459 ++ src/libsystemd/sd-netlink/netlink-types-genl.c | 251 + src/libsystemd/sd-netlink/netlink-types-internal.h | 66 + src/libsystemd/sd-netlink/netlink-types-nfnl.c | 194 + src/libsystemd/sd-netlink/netlink-types-rtnl.c | 1229 ++++ src/libsystemd/sd-netlink/netlink-types.c | 153 + src/libsystemd/sd-netlink/netlink-types.h | 63 + src/libsystemd/sd-netlink/netlink-util.c | 818 +++ src/libsystemd/sd-netlink/netlink-util.h | 113 + src/libsystemd/sd-netlink/sd-netlink.c | 909 +++ src/libsystemd/sd-netlink/test-netlink.c | 686 ++ src/libsystemd/sd-network/network-util.c | 157 + src/libsystemd/sd-network/network-util.h | 86 + src/libsystemd/sd-network/sd-network.c | 462 ++ src/libsystemd/sd-path/sd-path.c | 693 ++ src/libsystemd/sd-resolve/resolve-private.h | 39 + src/libsystemd/sd-resolve/sd-resolve.c | 1296 ++++ src/libsystemd/sd-resolve/test-resolve.c | 111 + src/libudev/libudev-device-internal.h | 10 + src/libudev/libudev-device.c | 895 +++ src/libudev/libudev-enumerate.c | 458 ++ src/libudev/libudev-hwdb.c | 123 + src/libudev/libudev-list-internal.h | 16 + src/libudev/libudev-list.c | 235 + src/libudev/libudev-monitor.c | 309 + src/libudev/libudev-queue.c | 231 + src/libudev/libudev-util.c | 27 + src/libudev/libudev-util.h | 14 + src/libudev/libudev.c | 154 + src/libudev/libudev.h | 191 + src/libudev/libudev.pc.in | 20 + src/libudev/libudev.sym | 126 + src/libudev/meson.build | 44 + src/libudev/test-libudev.c | 496 ++ src/libudev/test-udev-device-thread.c | 51 + 
src/locale/kbd-model-map | 72 + src/locale/language-fallback-map | 13 + src/locale/localectl.c | 535 ++ src/locale/localed-util.c | 1161 +++ src/locale/localed-util.h | 80 + src/locale/localed.c | 680 ++ src/locale/meson.build | 62 + src/locale/org.freedesktop.locale1.conf | 29 + src/locale/org.freedesktop.locale1.policy | 42 + src/locale/org.freedesktop.locale1.service | 14 + src/locale/test-localed-util.c | 236 + src/locale/xkbcommon-util.c | 80 + src/locale/xkbcommon-util.h | 30 + src/login/inhibit.c | 318 + src/login/loginctl.c | 1653 +++++ src/login/logind-action.c | 331 + src/login/logind-action.h | 72 + src/login/logind-brightness.c | 250 + src/login/logind-brightness.h | 9 + src/login/logind-button.c | 526 ++ src/login/logind-button.h | 26 + src/login/logind-core.c | 850 +++ src/login/logind-dbus.c | 4406 +++++++++++ src/login/logind-dbus.h | 37 + src/login/logind-device.c | 104 + src/login/logind-device.h | 25 + src/login/logind-gperf.gperf | 53 + src/login/logind-inhibit.c | 532 ++ src/login/logind-inhibit.h | 79 + src/login/logind-polkit.c | 24 + src/login/logind-polkit.h | 9 + src/login/logind-seat-dbus.c | 442 ++ src/login/logind-seat-dbus.h | 16 + src/login/logind-seat.c | 682 ++ src/login/logind-seat.h | 74 + src/login/logind-session-dbus.c | 994 +++ src/login/logind-session-dbus.h | 23 + src/login/logind-session-device.c | 507 ++ src/login/logind-session-device.h | 41 + src/login/logind-session.c | 1624 ++++ src/login/logind-session.h | 185 + src/login/logind-user-dbus.c | 421 ++ src/login/logind-user-dbus.h | 16 + src/login/logind-user.c | 940 +++ src/login/logind-user.h | 75 + src/login/logind-wall.c | 166 + src/login/logind.c | 1206 +++ src/login/logind.conf.in | 51 + src/login/logind.h | 188 + src/login/meson.build | 152 + src/login/org.freedesktop.login1.conf | 360 + src/login/org.freedesktop.login1.policy | 415 ++ src/login/org.freedesktop.login1.service | 14 + src/login/pam_systemd.c | 1266 ++++ src/login/pam_systemd.sym | 8 + 
src/login/pam_systemd_loadkey.c | 98 + src/login/pam_systemd_loadkey.sym | 8 + src/login/sysfs-show.c | 164 + src/login/sysfs-show.h | 8 + src/login/systemd-user.in | 23 + src/login/test-inhibit.c | 85 + src/login/test-login-shared.c | 16 + src/login/test-login-tables.c | 20 + src/login/test-session-properties.c | 193 + src/login/user-runtime-dir.c | 216 + src/machine-id-setup/machine-id-setup-main.c | 202 + src/machine-id-setup/meson.build | 8 + src/machine/image-dbus.c | 530 ++ src/machine/image-dbus.h | 19 + src/machine/machine-dbus.c | 1399 ++++ src/machine/machine-dbus.h | 32 + src/machine/machine.c | 910 +++ src/machine/machine.h | 103 + src/machine/machinectl.c | 3007 ++++++++ src/machine/machined-core.c | 106 + src/machine/machined-dbus.c | 1516 ++++ src/machine/machined-varlink.c | 426 ++ src/machine/machined-varlink.h | 7 + src/machine/machined.c | 378 + src/machine/machined.h | 69 + src/machine/meson.build | 61 + src/machine/operation.c | 138 + src/machine/operation.h | 31 + src/machine/org.freedesktop.machine1.conf | 242 + src/machine/org.freedesktop.machine1.policy | 104 + src/machine/org.freedesktop.machine1.service | 14 + src/machine/test-machine-tables.c | 15 + src/modules-load/meson.build | 17 + src/modules-load/modules-load.c | 210 + src/mount/meson.build | 13 + src/mount/mount-tool.c | 1590 ++++ src/network/fuzz-netdev-parser.c | 27 + src/network/fuzz-netdev-parser.options | 2 + src/network/fuzz-network-parser.c | 27 + src/network/fuzz-network-parser.options | 2 + src/network/generator/main.c | 218 + src/network/generator/network-generator.c | 1432 ++++ src/network/generator/network-generator.h | 116 + src/network/generator/test-network-generator.c | 462 ++ src/network/meson.build | 266 + src/network/netdev/bareudp.c | 70 + src/network/netdev/bareudp.h | 34 + src/network/netdev/batadv.c | 208 + src/network/netdev/batadv.h | 47 + src/network/netdev/bond.c | 415 ++ src/network/netdev/bond.h | 60 + src/network/netdev/bridge.c | 253 + 
src/network/netdev/bridge.h | 46 + src/network/netdev/dummy.c | 13 + src/network/netdev/dummy.h | 11 + src/network/netdev/fou-tunnel.c | 265 + src/network/netdev/fou-tunnel.h | 42 + src/network/netdev/geneve.c | 276 + src/network/netdev/geneve.h | 54 + src/network/netdev/ifb.c | 14 + src/network/netdev/ifb.h | 13 + src/network/netdev/ipoib.c | 150 + src/network/netdev/ipoib.h | 30 + src/network/netdev/ipvlan.c | 73 + src/network/netdev/ipvlan.h | 25 + src/network/netdev/l2tp-tunnel.c | 825 +++ src/network/netdev/l2tp-tunnel.h | 80 + src/network/netdev/macsec.c | 1204 +++ src/network/netdev/macsec.h | 87 + src/network/netdev/macvlan.c | 132 + src/network/netdev/macvlan.h | 25 + src/network/netdev/netdev-gperf.gperf | 272 + src/network/netdev/netdev-util.c | 100 + src/network/netdev/netdev-util.h | 27 + src/network/netdev/netdev.c | 957 +++ src/network/netdev/netdev.h | 261 + src/network/netdev/netdevsim.c | 13 + src/network/netdev/netdevsim.h | 13 + src/network/netdev/nlmon.c | 25 + src/network/netdev/nlmon.h | 14 + src/network/netdev/tunnel.c | 1242 ++++ src/network/netdev/tunnel.h | 139 + src/network/netdev/tuntap.c | 261 + src/network/netdev/tuntap.h | 26 + src/network/netdev/vcan.c | 12 + src/network/netdev/vcan.h | 17 + src/network/netdev/veth.c | 82 + src/network/netdev/veth.h | 16 + src/network/netdev/vlan.c | 217 + src/network/netdev/vlan.h | 27 + src/network/netdev/vrf.c | 30 + src/network/netdev/vrf.h | 15 + src/network/netdev/vxcan.c | 58 + src/network/netdev/vxcan.h | 16 + src/network/netdev/vxlan.c | 435 ++ src/network/netdev/vxlan.h | 76 + src/network/netdev/wireguard.c | 1141 +++ src/network/netdev/wireguard.h | 84 + src/network/netdev/wlan.c | 228 + src/network/netdev/wlan.h | 22 + src/network/netdev/xfrm.c | 45 + src/network/netdev/xfrm.h | 14 + src/network/networkctl.c | 3499 +++++++++ src/network/networkd-address-generation.c | 439 ++ src/network/networkd-address-generation.h | 14 + src/network/networkd-address-label.c | 298 + 
src/network/networkd-address-label.h | 30 + src/network/networkd-address-pool.c | 187 + src/network/networkd-address-pool.h | 17 + src/network/networkd-address.c | 2566 +++++++ src/network/networkd-address.h | 145 + src/network/networkd-bridge-fdb.c | 535 ++ src/network/networkd-bridge-fdb.h | 54 + src/network/networkd-bridge-mdb.c | 365 + src/network/networkd-bridge-mdb.h | 29 + src/network/networkd-bridge-vlan.c | 249 + src/network/networkd-bridge-vlan.h | 31 + src/network/networkd-can.c | 336 + src/network/networkd-can.h | 18 + src/network/networkd-conf.c | 35 + src/network/networkd-conf.h | 14 + src/network/networkd-dhcp-common.c | 1489 ++++ src/network/networkd-dhcp-common.h | 114 + src/network/networkd-dhcp-prefix-delegation.c | 1257 ++++ src/network/networkd-dhcp-prefix-delegation.h | 23 + src/network/networkd-dhcp-server-bus.c | 117 + src/network/networkd-dhcp-server-bus.h | 11 + src/network/networkd-dhcp-server-static-lease.c | 210 + src/network/networkd-dhcp-server-static-lease.h | 26 + src/network/networkd-dhcp-server.c | 779 ++ src/network/networkd-dhcp-server.h | 17 + src/network/networkd-dhcp4-bus.c | 77 + src/network/networkd-dhcp4-bus.h | 10 + src/network/networkd-dhcp4.c | 2025 +++++ src/network/networkd-dhcp4.h | 35 + src/network/networkd-dhcp6-bus.c | 76 + src/network/networkd-dhcp6-bus.h | 10 + src/network/networkd-dhcp6.c | 892 +++ src/network/networkd-dhcp6.h | 32 + src/network/networkd-gperf.gperf | 36 + src/network/networkd-ipv4acd.c | 336 + src/network/networkd-ipv4acd.h | 14 + src/network/networkd-ipv4ll.c | 319 + src/network/networkd-ipv4ll.h | 16 + src/network/networkd-ipv6-proxy-ndp.c | 180 + src/network/networkd-ipv6-proxy-ndp.h | 13 + src/network/networkd-ipv6ll.c | 247 + src/network/networkd-ipv6ll.h | 37 + src/network/networkd-json.c | 1434 ++++ src/network/networkd-json.h | 10 + src/network/networkd-link-bus.c | 898 +++ src/network/networkd-link-bus.h | 39 + src/network/networkd-link.c | 2773 +++++++ src/network/networkd-link.h | 
253 + src/network/networkd-lldp-rx.c | 173 + src/network/networkd-lldp-rx.h | 22 + src/network/networkd-lldp-tx.c | 137 + src/network/networkd-lldp-tx.h | 10 + src/network/networkd-manager-bus.c | 425 ++ src/network/networkd-manager-bus.h | 12 + src/network/networkd-manager.c | 1108 +++ src/network/networkd-manager.h | 127 + src/network/networkd-ndisc.c | 1531 ++++ src/network/networkd-ndisc.h | 66 + src/network/networkd-neighbor.c | 756 ++ src/network/networkd-neighbor.h | 44 + src/network/networkd-netlabel.c | 128 + src/network/networkd-netlabel.h | 5 + src/network/networkd-network-bus.c | 144 + src/network/networkd-network-bus.h | 13 + src/network/networkd-network-gperf.gperf | 627 ++ src/network/networkd-network.c | 1349 ++++ src/network/networkd-network.h | 440 ++ src/network/networkd-nexthop.c | 1384 ++++ src/network/networkd-nexthop.h | 59 + src/network/networkd-queue.c | 333 + src/network/networkd-queue.h | 141 + src/network/networkd-radv.c | 1619 ++++ src/network/networkd-radv.h | 101 + src/network/networkd-route-util.c | 586 ++ src/network/networkd-route-util.h | 55 + src/network/networkd-route.c | 3148 ++++++++ src/network/networkd-route.h | 135 + src/network/networkd-routing-policy-rule.c | 1754 +++++ src/network/networkd-routing-policy-rule.h | 90 + src/network/networkd-setlink.c | 1309 ++++ src/network/networkd-setlink.h | 29 + src/network/networkd-speed-meter.c | 111 + src/network/networkd-speed-meter.h | 12 + src/network/networkd-sriov.c | 352 + src/network/networkd-sriov.h | 17 + src/network/networkd-state-file.c | 863 +++ src/network/networkd-state-file.h | 14 + src/network/networkd-sysctl.c | 335 + src/network/networkd-sysctl.h | 39 + src/network/networkd-util.c | 257 + src/network/networkd-util.h | 165 + src/network/networkd-wifi.c | 345 + src/network/networkd-wifi.h | 11 + src/network/networkd-wiphy.c | 495 ++ src/network/networkd-wiphy.h | 71 + src/network/networkd.c | 119 + src/network/networkd.conf | 33 + 
src/network/org.freedesktop.network1.conf | 27 + src/network/org.freedesktop.network1.policy | 186 + src/network/org.freedesktop.network1.service | 14 + src/network/systemd-networkd.pkla | 7 + src/network/systemd-networkd.rules | 13 + src/network/tc/cake.c | 737 ++ src/network/tc/cake.h | 90 + src/network/tc/codel.c | 244 + src/network/tc/codel.h | 24 + src/network/tc/drr.c | 108 + src/network/tc/drr.h | 23 + src/network/tc/ets.c | 342 + src/network/tc/ets.h | 25 + src/network/tc/fifo.c | 183 + src/network/tc/fifo.h | 25 + src/network/tc/fq-codel.c | 343 + src/network/tc/fq-codel.h | 28 + src/network/tc/fq-pie.c | 102 + src/network/tc/fq-pie.h | 17 + src/network/tc/fq.c | 409 ++ src/network/tc/fq.h | 29 + src/network/tc/gred.c | 185 + src/network/tc/gred.h | 20 + src/network/tc/hhf.c | 96 + src/network/tc/hhf.h | 17 + src/network/tc/htb.c | 487 ++ src/network/tc/htb.h | 39 + src/network/tc/netem.c | 227 + src/network/tc/netem.h | 25 + src/network/tc/pie.c | 96 + src/network/tc/pie.h | 17 + src/network/tc/qdisc.c | 715 ++ src/network/tc/qdisc.h | 112 + src/network/tc/qfq.c | 177 + src/network/tc/qfq.h | 26 + src/network/tc/sfb.c | 107 + src/network/tc/sfb.h | 17 + src/network/tc/sfq.c | 91 + src/network/tc/sfq.h | 18 + src/network/tc/tbf.c | 343 + src/network/tc/tbf.h | 26 + src/network/tc/tc-util.c | 133 + src/network/tc/tc-util.h | 14 + src/network/tc/tc.c | 41 + src/network/tc/tc.h | 6 + src/network/tc/tclass.c | 639 ++ src/network/tc/tclass.h | 79 + src/network/tc/teql.c | 97 + src/network/tc/teql.h | 16 + src/network/test-network-tables.c | 54 + src/network/test-network.c | 251 + src/network/test-networkd-address.c | 25 + src/network/test-networkd-conf.c | 278 + src/network/test-networkd-util.c | 19 + src/network/wait-online/link.c | 250 + src/network/wait-online/link.h | 34 + src/network/wait-online/manager.c | 441 ++ src/network/wait-online/manager.h | 44 + src/network/wait-online/wait-online.c | 238 + src/notify/meson.build | 9 + src/notify/notify.c | 473 ++ 
src/nspawn/fuzz-nspawn-oci.c | 23 + src/nspawn/fuzz-nspawn-oci.options | 2 + src/nspawn/fuzz-nspawn-settings.c | 23 + src/nspawn/fuzz-nspawn-settings.options | 2 + src/nspawn/meson.build | 78 + src/nspawn/nspawn-bind-user.c | 474 ++ src/nspawn/nspawn-bind-user.h | 29 + src/nspawn/nspawn-cgroup.c | 621 ++ src/nspawn/nspawn-cgroup.h | 14 + src/nspawn/nspawn-def.h | 9 + src/nspawn/nspawn-expose-ports.c | 214 + src/nspawn/nspawn-expose-ports.h | 27 + src/nspawn/nspawn-gperf.gperf | 82 + src/nspawn/nspawn-mount.c | 1406 ++++ src/nspawn/nspawn-mount.h | 71 + src/nspawn/nspawn-network.c | 815 ++ src/nspawn/nspawn-network.h | 29 + src/nspawn/nspawn-oci.c | 2197 ++++++ src/nspawn/nspawn-oci.h | 6 + src/nspawn/nspawn-patch-uid.c | 477 ++ src/nspawn/nspawn-patch-uid.h | 7 + src/nspawn/nspawn-register.c | 416 ++ src/nspawn/nspawn-register.h | 15 + src/nspawn/nspawn-seccomp.c | 256 + src/nspawn/nspawn-seccomp.h | 6 + src/nspawn/nspawn-settings.c | 1015 +++ src/nspawn/nspawn-settings.h | 287 + src/nspawn/nspawn-setuid.c | 235 + src/nspawn/nspawn-setuid.h | 5 + src/nspawn/nspawn-stub-pid1.c | 199 + src/nspawn/nspawn-stub-pid1.h | 6 + src/nspawn/nspawn-util.c | 76 + src/nspawn/nspawn-util.h | 4 + src/nspawn/nspawn.c | 5870 +++++++++++++++ src/nspawn/nspawn.h | 7 + src/nspawn/test-nspawn-tables.c | 14 + src/nspawn/test-nspawn-util.c | 22 + src/nspawn/test-patch-uid.c | 43 + src/nss-myhostname/meson.build | 10 + src/nss-myhostname/nss-myhostname.c | 523 ++ src/nss-myhostname/nss-myhostname.sym | 19 + src/nss-mymachines/meson.build | 10 + src/nss-mymachines/nss-mymachines.c | 440 ++ src/nss-mymachines/nss-mymachines.sym | 21 + src/nss-resolve/meson.build | 12 + src/nss-resolve/nss-resolve.c | 759 ++ src/nss-resolve/nss-resolve.sym | 19 + src/nss-systemd/meson.build | 13 + src/nss-systemd/nss-systemd.c | 1084 +++ src/nss-systemd/nss-systemd.h | 13 + src/nss-systemd/nss-systemd.sym | 36 + src/nss-systemd/userdb-glue.c | 478 ++ src/nss-systemd/userdb-glue.h | 27 + src/oom/meson.build | 
44 + src/oom/oomctl.c | 133 + src/oom/oomd-manager-bus.c | 52 + src/oom/oomd-manager-bus.h | 8 + src/oom/oomd-manager.c | 851 +++ src/oom/oomd-manager.h | 73 + src/oom/oomd-util.c | 648 ++ src/oom/oomd-util.h | 146 + src/oom/oomd.c | 197 + src/oom/oomd.conf | 22 + src/oom/org.freedesktop.oom1.conf | 47 + src/oom/org.freedesktop.oom1.service | 14 + src/oom/test-oomd-util.c | 513 ++ .../definitions/confext.repart.d/10-root.conf | 16 + .../confext.repart.d/20-root-verity.conf | 14 + .../confext.repart.d/30-root-verity-sig.conf | 13 + .../definitions/portable.repart.d/10-root.conf | 16 + .../portable.repart.d/20-root-verity.conf | 14 + .../portable.repart.d/30-root-verity-sig.conf | 13 + .../definitions/sysext.repart.d/10-root.conf | 17 + .../sysext.repart.d/20-root-verity.conf | 14 + .../sysext.repart.d/30-root-verity-sig.conf | 13 + src/partition/growfs.c | 277 + src/partition/makefs.c | 84 + src/partition/meson.build | 62 + src/partition/repart.c | 7753 ++++++++++++++++++++ src/path/meson.build | 9 + src/path/path.c | 238 + src/pcrextend/meson.build | 19 + src/pcrextend/pcrextend.c | 391 + src/pcrlock/meson.build | 37 + src/pcrlock/pcrlock-firmware.c | 168 + src/pcrlock/pcrlock-firmware.h | 25 + src/pcrlock/pcrlock.c | 5011 +++++++++++++ .../pcrlock.d/350-action-efi-application.pcrlock | 1 + .../300-0x00000000.pcrlock | 1 + .../600-0xffffffff.pcrlock | 1 + .../500-separator.pcrlock.d/300-0x00000000.pcrlock | 1 + .../500-separator.pcrlock.d/600-0xffffffff.pcrlock | 1 + .../300-present.pcrlock | 1 + .../600-absent.pcrlock | 1 + src/pcrlock/pcrlock.d/750-enter-initrd.pcrlock | 1 + src/pcrlock/pcrlock.d/800-leave-initrd.pcrlock | 1 + src/pcrlock/pcrlock.d/850-sysinit.pcrlock | 1 + src/pcrlock/pcrlock.d/900-ready.pcrlock | 1 + src/pcrlock/pcrlock.d/950-shutdown.pcrlock | 1 + src/pcrlock/pcrlock.d/990-final.pcrlock | 1 + src/pcrlock/pehash.c | 246 + src/pcrlock/pehash.h | 11 + src/portable/meson.build | 55 + src/portable/org.freedesktop.portable1.conf | 125 + 
src/portable/org.freedesktop.portable1.policy | 43 + src/portable/org.freedesktop.portable1.service | 7 + src/portable/portable.c | 2105 ++++++ src/portable/portable.h | 86 + src/portable/portablectl.c | 1459 ++++ src/portable/portabled-bus.c | 612 ++ src/portable/portabled-bus.h | 11 + src/portable/portabled-image-bus.c | 1191 +++ src/portable/portabled-image-bus.h | 43 + src/portable/portabled-image.c | 102 + src/portable/portabled-image.h | 12 + src/portable/portabled-operation.c | 129 + src/portable/portabled-operation.h | 29 + src/portable/portabled.c | 177 + src/portable/portabled.h | 28 + src/portable/profile/default/service.conf | 30 + src/portable/profile/nonetwork/service.conf | 30 + src/portable/profile/strict/service.conf | 29 + src/portable/profile/trusted/service.conf | 8 + src/pstore/meson.build | 21 + src/pstore/pstore.c | 367 + src/pstore/pstore.conf | 21 + src/quotacheck/meson.build | 9 + src/quotacheck/quotacheck.c | 101 + src/random-seed/meson.build | 9 + src/random-seed/random-seed.c | 457 ++ src/rc-local-generator/meson.build | 9 + src/rc-local-generator/rc-local-generator.c | 76 + src/remount-fs/meson.build | 8 + src/remount-fs/remount-fs.c | 168 + src/reply-password/meson.build | 8 + src/reply-password/reply-password.c | 80 + src/resolve/RFCs | 60 + src/resolve/dns-type.c | 316 + src/resolve/dns-type.h | 162 + src/resolve/dns_type-to-name.awk | 16 + src/resolve/fuzz-dns-packet.c | 27 + src/resolve/fuzz-dns-packet.options | 2 + src/resolve/fuzz-etc-hosts.c | 19 + src/resolve/fuzz-resource-record.c | 37 + src/resolve/generate-dns_type-gperf.py | 25 + src/resolve/generate-dns_type-list.sed | 2 + src/resolve/meson.build | 240 + src/resolve/org.freedesktop.resolve1.conf | 27 + src/resolve/org.freedesktop.resolve1.policy | 142 + src/resolve/org.freedesktop.resolve1.service | 14 + src/resolve/resolv.conf | 19 + src/resolve/resolvconf-compat.c | 277 + src/resolve/resolvconf-compat.h | 4 + src/resolve/resolvectl.c | 4076 ++++++++++ 
src/resolve/resolvectl.h | 35 + src/resolve/resolved-bus.c | 2285 ++++++ src/resolve/resolved-bus.h | 17 + src/resolve/resolved-conf.c | 603 ++ src/resolve/resolved-conf.h | 22 + src/resolve/resolved-def.h | 82 + src/resolve/resolved-dns-answer.c | 862 +++ src/resolve/resolved-dns-answer.h | 138 + src/resolve/resolved-dns-cache.c | 1486 ++++ src/resolve/resolved-dns-cache.h | 60 + src/resolve/resolved-dns-dnssec.c | 2589 +++++++ src/resolve/resolved-dns-dnssec.h | 88 + src/resolve/resolved-dns-packet.c | 2686 +++++++ src/resolve/resolved-dns-packet.h | 349 + src/resolve/resolved-dns-query.c | 1299 ++++ src/resolve/resolved-dns-query.h | 166 + src/resolve/resolved-dns-question.c | 552 ++ src/resolve/resolved-dns-question.h | 84 + src/resolve/resolved-dns-rr.c | 2159 ++++++ src/resolve/resolved-dns-rr.h | 387 + src/resolve/resolved-dns-scope.c | 1683 +++++ src/resolve/resolved-dns-scope.h | 114 + src/resolve/resolved-dns-search-domain.c | 199 + src/resolve/resolved-dns-search-domain.h | 56 + src/resolve/resolved-dns-server.c | 1122 +++ src/resolve/resolved-dns-server.h | 177 + src/resolve/resolved-dns-stream.c | 595 ++ src/resolve/resolved-dns-stream.h | 128 + src/resolve/resolved-dns-stub.c | 1427 ++++ src/resolve/resolved-dns-stub.h | 48 + src/resolve/resolved-dns-synthesize.c | 571 ++ src/resolve/resolved-dns-synthesize.h | 11 + src/resolve/resolved-dns-transaction.c | 3670 +++++++++ src/resolve/resolved-dns-transaction.h | 219 + src/resolve/resolved-dns-trust-anchor.c | 779 ++ src/resolve/resolved-dns-trust-anchor.h | 25 + src/resolve/resolved-dns-zone.c | 686 ++ src/resolve/resolved-dns-zone.h | 69 + src/resolve/resolved-dnssd-bus.c | 131 + src/resolve/resolved-dnssd-bus.h | 11 + src/resolve/resolved-dnssd-gperf.gperf | 25 + src/resolve/resolved-dnssd.c | 362 + src/resolve/resolved-dnssd.h | 61 + src/resolve/resolved-dnstls-gnutls.c | 253 + src/resolve/resolved-dnstls-gnutls.h | 24 + src/resolve/resolved-dnstls-openssl.c | 422 ++ 
src/resolve/resolved-dnstls-openssl.h | 25 + src/resolve/resolved-dnstls.h | 38 + src/resolve/resolved-etc-hosts.c | 586 ++ src/resolve/resolved-etc-hosts.h | 23 + src/resolve/resolved-gperf.gperf | 35 + src/resolve/resolved-link-bus.c | 907 +++ src/resolve/resolved-link-bus.h | 22 + src/resolve/resolved-link.c | 1445 ++++ src/resolve/resolved-link.h | 127 + src/resolve/resolved-llmnr.c | 471 ++ src/resolve/resolved-llmnr.h | 14 + src/resolve/resolved-manager.c | 1860 +++++ src/resolve/resolved-manager.h | 230 + src/resolve/resolved-mdns.c | 614 ++ src/resolve/resolved-mdns.h | 13 + src/resolve/resolved-resolv-conf.c | 434 ++ src/resolve/resolved-resolv-conf.h | 23 + src/resolve/resolved-socket-graveyard.c | 131 + src/resolve/resolved-socket-graveyard.h | 18 + src/resolve/resolved-util.c | 84 + src/resolve/resolved-util.h | 4 + src/resolve/resolved-varlink.c | 796 ++ src/resolve/resolved-varlink.h | 7 + src/resolve/resolved.c | 99 + src/resolve/resolved.conf.in | 37 + src/resolve/test-dns-packet.c | 155 + src/resolve/test-dnssec-complex.c | 215 + src/resolve/test-dnssec.c | 787 ++ src/resolve/test-resolve-tables.c | 57 + src/resolve/test-resolved-etc-hosts.c | 154 + src/resolve/test-resolved-packet.c | 26 + src/resolve/test-resolved-stream.c | 394 + src/rfkill/meson.build | 9 + src/rfkill/rfkill.c | 378 + src/rpm/macros.systemd.in | 199 + src/rpm/meson.build | 25 + src/rpm/systemd-update-helper.in | 141 + src/rpm/triggers.systemd.in | 82 + src/rpm/triggers.systemd.sh.in | 87 + src/run-generator/meson.build | 8 + src/run-generator/run-generator.c | 138 + src/run/meson.build | 9 + src/run/run.c | 1987 +++++ src/shared/acl-util.c | 652 ++ src/shared/acl-util.h | 60 + src/shared/acpi-fpdt.c | 187 + src/shared/acpi-fpdt.h | 6 + src/shared/apparmor-util.c | 22 + src/shared/apparmor-util.h | 6 + src/shared/ask-password-api.c | 1002 +++ src/shared/ask-password-api.h | 23 + src/shared/async.c | 137 + src/shared/async.h | 26 + src/shared/barrier.c | 394 + 
src/shared/barrier.h | 74 + src/shared/base-filesystem.c | 210 + src/shared/base-filesystem.h | 7 + src/shared/battery-util.c | 283 + src/shared/battery-util.h | 11 + src/shared/binfmt-util.c | 55 + src/shared/binfmt-util.h | 5 + src/shared/bitmap.c | 211 + src/shared/bitmap.h | 36 + src/shared/blkid-util.h | 47 + src/shared/blockdev-util.c | 828 +++ src/shared/blockdev-util.h | 61 + src/shared/bond-util.c | 73 + src/shared/bond-util.h | 106 + src/shared/boot-entry.c | 273 + src/shared/boot-entry.h | 35 + src/shared/boot-timestamps.c | 46 + src/shared/boot-timestamps.h | 6 + src/shared/bootspec.c | 1434 ++++ src/shared/bootspec.h | 129 + src/shared/bpf-compat.h | 54 + src/shared/bpf-dlopen.c | 146 + src/shared/bpf-dlopen.h | 34 + src/shared/bpf-link.c | 43 + src/shared/bpf-link.h | 16 + src/shared/bpf-program.c | 513 ++ src/shared/bpf-program.h | 65 + src/shared/bridge-util.c | 13 + src/shared/bridge-util.h | 20 + src/shared/btrfs-util.c | 2164 ++++++ src/shared/btrfs-util.h | 149 + src/shared/bus-get-properties.c | 166 + src/shared/bus-get-properties.h | 101 + src/shared/bus-locator.c | 231 + src/shared/bus-locator.h | 37 + src/shared/bus-log-control-api.c | 114 + src/shared/bus-log-control-api.h | 19 + src/shared/bus-map-properties.c | 251 + src/shared/bus-map-properties.h | 25 + src/shared/bus-message-util.c | 185 + src/shared/bus-message-util.h | 18 + src/shared/bus-object.c | 177 + src/shared/bus-object.h | 34 + src/shared/bus-polkit.c | 575 ++ src/shared/bus-polkit.h | 11 + src/shared/bus-print-properties.c | 440 ++ src/shared/bus-print-properties.h | 21 + src/shared/bus-unit-procs.c | 402 + src/shared/bus-unit-procs.h | 8 + src/shared/bus-unit-util.c | 2938 ++++++++ src/shared/bus-unit-util.h | 37 + src/shared/bus-util.c | 711 ++ src/shared/bus-util.h | 75 + src/shared/bus-wait-for-jobs.c | 333 + src/shared/bus-wait-for-jobs.h | 16 + src/shared/bus-wait-for-units.c | 426 ++ src/shared/bus-wait-for-units.h | 35 + src/shared/calendarspec.c | 1435 ++++ 
src/shared/calendarspec.h | 44 + src/shared/cgroup-setup.c | 1008 +++ src/shared/cgroup-setup.h | 38 + src/shared/cgroup-show.c | 471 ++ src/shared/cgroup-show.h | 24 + src/shared/chown-recursive.c | 177 + src/shared/chown-recursive.h | 8 + src/shared/clean-ipc.c | 452 ++ src/shared/clean-ipc.h | 17 + src/shared/clock-util.c | 167 + src/shared/clock-util.h | 20 + src/shared/common-signal.c | 85 + src/shared/common-signal.h | 63 + src/shared/compare-operator.c | 119 + src/shared/compare-operator.h | 62 + src/shared/condition.c | 1360 ++++ src/shared/condition.h | 113 + src/shared/conf-parser.c | 1984 +++++ src/shared/conf-parser.h | 481 ++ src/shared/copy.c | 1635 +++++ src/shared/copy.h | 106 + src/shared/coredump-util.c | 179 + src/shared/coredump-util.h | 43 + src/shared/cpu-set-util.c | 292 + src/shared/cpu-set-util.h | 52 + src/shared/creds-util.c | 1395 ++++ src/shared/creds-util.h | 79 + src/shared/cryptsetup-fido2.c | 276 + src/shared/cryptsetup-fido2.h | 82 + src/shared/cryptsetup-util.c | 349 + src/shared/cryptsetup-util.h | 111 + src/shared/daemon-util.c | 76 + src/shared/daemon-util.h | 28 + src/shared/data-fd-util.c | 391 + src/shared/data-fd-util.h | 16 + src/shared/dev-setup.c | 137 + src/shared/dev-setup.h | 10 + src/shared/device-nodes.c | 87 + src/shared/device-nodes.h | 9 + src/shared/devnode-acl.c | 226 + src/shared/devnode-acl.h | 34 + src/shared/discover-image.c | 1385 ++++ src/shared/discover-image.h | 122 + src/shared/dissect-image.c | 4069 ++++++++++ src/shared/dissect-image.h | 230 + src/shared/dlfcn-util.c | 64 + src/shared/dlfcn-util.h | 39 + src/shared/dm-util.c | 45 + src/shared/dm-util.h | 4 + src/shared/dns-domain.c | 1421 ++++ src/shared/dns-domain.h | 104 + src/shared/dropin.c | 278 + src/shared/dropin.h | 26 + src/shared/edit-util.c | 370 + src/shared/edit-util.h | 40 + src/shared/efi-api.c | 556 ++ src/shared/efi-api.h | 74 + src/shared/efi-loader.c | 363 + src/shared/efi-loader.h | 63 + src/shared/elf-util.c | 899 +++ 
src/shared/elf-util.h | 18 + src/shared/enable-mempool.c | 19 + src/shared/env-file-label.c | 35 + src/shared/env-file-label.h | 10 + src/shared/ethtool-link-mode.py | 61 + src/shared/ethtool-util.c | 1423 ++++ src/shared/ethtool-util.h | 205 + src/shared/exec-util.c | 605 ++ src/shared/exec-util.h | 64 + src/shared/exit-status.c | 179 + src/shared/exit-status.h | 113 + src/shared/extension-util.c | 166 + src/shared/extension-util.h | 23 + src/shared/fdisk-util.c | 163 + src/shared/fdisk-util.h | 25 + src/shared/fdset.c | 323 + src/shared/fdset.h | 47 + src/shared/fileio-label.c | 43 + src/shared/fileio-label.h | 15 + src/shared/find-esp.c | 909 +++ src/shared/find-esp.h | 15 + src/shared/firewall-util-iptables.c | 392 + src/shared/firewall-util-nft.c | 1372 ++++ src/shared/firewall-util-private.h | 69 + src/shared/firewall-util.c | 160 + src/shared/firewall-util.h | 104 + src/shared/format-table.c | 3061 ++++++++ src/shared/format-table.h | 165 + src/shared/fsck-util.h | 14 + src/shared/fstab-util.c | 366 + src/shared/fstab-util.h | 59 + src/shared/generate-ip-protocol-list.sh | 9 + src/shared/generate-syscall-list.py | 7 + src/shared/generator.c | 888 +++ src/shared/generator.h | 105 + src/shared/geneve-util.c | 12 + src/shared/geneve-util.h | 17 + src/shared/gpt.c | 361 + src/shared/gpt.h | 102 + src/shared/group-record.c | 347 + src/shared/group-record.h | 46 + src/shared/hibernate-util.c | 520 ++ src/shared/hibernate-util.h | 26 + src/shared/hostname-setup.c | 213 + src/shared/hostname-setup.h | 25 + src/shared/hwdb-util.c | 712 ++ src/shared/hwdb-util.h | 10 + src/shared/id128-print.c | 74 + src/shared/id128-print.h | 19 + src/shared/idn-util.c | 69 + src/shared/idn-util.h | 32 + src/shared/ima-util.c | 15 + src/shared/ima-util.h | 6 + src/shared/image-policy.c | 774 ++ src/shared/image-policy.h | 104 + src/shared/import-util.c | 233 + src/shared/import-util.h | 36 + src/shared/in-addr-prefix-util.c | 325 + src/shared/in-addr-prefix-util.h | 23 + 
src/shared/initreq.h | 74 + src/shared/install-file.c | 270 + src/shared/install-file.h | 14 + src/shared/install-printf.c | 125 + src/shared/install-printf.h | 11 + src/shared/install.c | 3760 ++++++++++ src/shared/install.h | 244 + src/shared/ip-protocol-list.c | 84 + src/shared/ip-protocol-list.h | 14 + src/shared/ip-protocol-to-name.awk | 11 + src/shared/ipvlan-util.c | 22 + src/shared/ipvlan-util.h | 29 + src/shared/journal-file-util.c | 534 ++ src/shared/journal-file-util.h | 29 + src/shared/journal-importer.c | 482 ++ src/shared/journal-importer.h | 60 + src/shared/journal-util.c | 188 + src/shared/journal-util.h | 11 + src/shared/json-internal.h | 76 + src/shared/json.c | 5132 +++++++++++++ src/shared/json.h | 474 ++ src/shared/kbd-util.c | 155 + src/shared/kbd-util.h | 13 + src/shared/kernel-image.c | 178 + src/shared/kernel-image.h | 24 + src/shared/keyring-util.c | 35 + src/shared/keyring-util.h | 11 + src/shared/killall.c | 319 + src/shared/killall.h | 6 + src/shared/label-util.c | 141 + src/shared/label-util.h | 29 + src/shared/libcrypt-util.c | 211 + src/shared/libcrypt-util.h | 13 + src/shared/libfido2-util.c | 1296 ++++ src/shared/libfido2-util.h | 131 + src/shared/libmount-util.c | 59 + src/shared/libmount-util.h | 20 + src/shared/libshared.sym | 3 + src/shared/linux/README | 9 + src/shared/linux/auto_dev-ioctl.h | 220 + src/shared/linux/bpf.h | 7053 ++++++++++++++++++ src/shared/linux/bpf_common.h | 57 + src/shared/linux/bpf_insn.h | 241 + src/shared/linux/dm-ioctl.h | 385 + src/shared/linux/ethtool.h | 2164 ++++++ src/shared/local-addresses.c | 506 ++ src/shared/local-addresses.h | 19 + src/shared/locale-setup.c | 294 + src/shared/locale-setup.h | 29 + src/shared/log-link.h | 59 + src/shared/logs-show.c | 2102 ++++++ src/shared/logs-show.h | 77 + src/shared/loop-util.c | 1209 +++ src/shared/loop-util.h | 59 + src/shared/loopback-setup.c | 232 + src/shared/loopback-setup.h | 4 + src/shared/lsm-util.c | 33 + src/shared/lsm-util.h | 4 + 
src/shared/machine-credential.c | 127 + src/shared/machine-credential.h | 14 + src/shared/machine-id-setup.c | 295 + src/shared/machine-id-setup.h | 7 + src/shared/machine-pool.c | 51 + src/shared/machine-pool.h | 8 + src/shared/macvlan-util.c | 15 + src/shared/macvlan-util.h | 17 + src/shared/main-func.h | 42 + src/shared/meson.build | 375 + src/shared/mkdir-label.c | 42 + src/shared/mkdir-label.h | 26 + src/shared/mkfs-util.c | 684 ++ src/shared/mkfs-util.h | 25 + src/shared/module-util.c | 124 + src/shared/module-util.h | 12 + src/shared/mount-setup.c | 591 ++ src/shared/mount-setup.h | 12 + src/shared/mount-util.c | 1785 +++++ src/shared/mount-util.h | 143 + src/shared/net-condition.c | 399 + src/shared/net-condition.h | 47 + src/shared/netif-naming-scheme.c | 103 + src/shared/netif-naming-scheme.h | 97 + src/shared/netif-sriov.c | 643 ++ src/shared/netif-sriov.h | 50 + src/shared/netif-util.c | 206 + src/shared/netif-util.h | 22 + src/shared/nscd-flush.c | 142 + src/shared/nscd-flush.h | 8 + src/shared/nsflags.c | 67 + src/shared/nsflags.h | 23 + src/shared/numa-util.c | 188 + src/shared/numa-util.h | 35 + src/shared/open-file.c | 147 + src/shared/open-file.h | 36 + src/shared/openssl-util.c | 1149 +++ src/shared/openssl-util.h | 167 + src/shared/output-mode.c | 43 + src/shared/output-mode.h | 57 + src/shared/pager.c | 330 + src/shared/pager.h | 17 + src/shared/pam-util.c | 211 + src/shared/pam-util.h | 41 + src/shared/parse-argument.c | 123 + src/shared/parse-argument.h | 9 + src/shared/parse-helpers.c | 237 + src/shared/parse-helpers.h | 38 + src/shared/password-quality-util-passwdqc.c | 142 + src/shared/password-quality-util-passwdqc.h | 23 + src/shared/password-quality-util-pwquality.c | 163 + src/shared/password-quality-util-pwquality.h | 27 + src/shared/password-quality-util.h | 30 + src/shared/pcre2-util.c | 166 + src/shared/pcre2-util.h | 44 + src/shared/pcrextend-util.c | 152 + src/shared/pcrextend-util.h | 5 + src/shared/pe-binary.c | 241 + 
src/shared/pe-binary.h | 144 + src/shared/pkcs11-util.c | 1371 ++++ src/shared/pkcs11-util.h | 111 + src/shared/plymouth-util.c | 33 + src/shared/plymouth-util.h | 13 + src/shared/pretty-print.c | 421 ++ src/shared/pretty-print.h | 49 + src/shared/ptyfwd.c | 677 ++ src/shared/ptyfwd.h | 42 + src/shared/qrcode-util.c | 221 + src/shared/qrcode-util.h | 22 + src/shared/quota-util.c | 42 + src/shared/quota-util.h | 19 + src/shared/reboot-util.c | 196 + src/shared/reboot-util.h | 17 + src/shared/recovery-key.c | 109 + src/shared/recovery-key.h | 16 + src/shared/resize-fs.c | 126 + src/shared/resize-fs.h | 17 + src/shared/resolve-util.c | 52 + src/shared/resolve-util.h | 99 + src/shared/rm-rf.c | 519 ++ src/shared/rm-rf.h | 59 + src/shared/seccomp-util.c | 2499 +++++++ src/shared/seccomp-util.h | 180 + src/shared/securebits-util.c | 66 + src/shared/securebits-util.h | 20 + src/shared/selinux-util.c | 762 ++ src/shared/selinux-util.h | 50 + src/shared/serialize.c | 552 ++ src/shared/serialize.h | 53 + src/shared/service-util.c | 87 + src/shared/service-util.h | 10 + src/shared/sleep-config.c | 390 + src/shared/sleep-config.h | 59 + src/shared/smack-util.c | 311 + src/shared/smack-util.h | 53 + src/shared/socket-label.c | 132 + src/shared/socket-netlink.c | 409 ++ src/shared/socket-netlink.h | 44 + src/shared/spawn-ask-password-agent.c | 59 + src/shared/spawn-ask-password-agent.h | 11 + src/shared/spawn-polkit-agent.c | 96 + src/shared/spawn-polkit-agent.h | 11 + src/shared/specifier.c | 498 ++ src/shared/specifier.h | 108 + src/shared/switch-root.c | 212 + src/shared/switch-root.h | 13 + src/shared/test-tables.h | 43 + src/shared/tests.c | 346 + src/shared/tests.h | 181 + src/shared/tmpfile-util-label.c | 30 + src/shared/tmpfile-util-label.h | 14 + src/shared/tomoyo-util.c | 15 + src/shared/tomoyo-util.h | 6 + src/shared/tpm2-event-log.c | 67 + src/shared/tpm2-event-log.h | 139 + src/shared/tpm2-util.c | 7664 +++++++++++++++++++ src/shared/tpm2-util.h | 478 ++ 
src/shared/udev-util.c | 439 ++ src/shared/udev-util.h | 34 + src/shared/user-record-nss.c | 529 ++ src/shared/user-record-nss.h | 24 + src/shared/user-record-show.c | 601 ++ src/shared/user-record-show.h | 10 + src/shared/user-record.c | 2319 ++++++ src/shared/user-record.h | 450 ++ src/shared/userdb-dropin.c | 304 + src/shared/userdb-dropin.h | 22 + src/shared/userdb.c | 1465 ++++ src/shared/userdb.h | 58 + src/shared/utmp-wtmp.c | 278 + src/shared/utmp-wtmp.h | 52 + src/shared/varlink-idl.c | 1603 ++++ src/shared/varlink-idl.h | 158 + src/shared/varlink-internal.h | 10 + src/shared/varlink-io.systemd.Journal.c | 19 + src/shared/varlink-io.systemd.Journal.h | 6 + src/shared/varlink-io.systemd.ManagedOOM.c | 23 + src/shared/varlink-io.systemd.ManagedOOM.h | 6 + src/shared/varlink-io.systemd.PCRExtend.c | 14 + src/shared/varlink-io.systemd.PCRExtend.h | 6 + src/shared/varlink-io.systemd.Resolve.Monitor.c | 176 + src/shared/varlink-io.systemd.Resolve.Monitor.h | 6 + src/shared/varlink-io.systemd.Resolve.c | 76 + src/shared/varlink-io.systemd.Resolve.h | 6 + src/shared/varlink-io.systemd.UserDatabase.c | 46 + src/shared/varlink-io.systemd.UserDatabase.h | 6 + src/shared/varlink-io.systemd.c | 21 + src/shared/varlink-io.systemd.h | 6 + src/shared/varlink-io.systemd.oom.c | 25 + src/shared/varlink-io.systemd.oom.h | 7 + src/shared/varlink-io.systemd.service.c | 70 + src/shared/varlink-io.systemd.service.h | 10 + src/shared/varlink-io.systemd.sysext.c | 67 + src/shared/varlink-io.systemd.sysext.h | 6 + src/shared/varlink-org.varlink.service.c | 49 + src/shared/varlink-org.varlink.service.h | 6 + src/shared/varlink.c | 3767 ++++++++++ src/shared/varlink.h | 224 + src/shared/verb-log-control.c | 51 + src/shared/verb-log-control.h | 8 + src/shared/verbs.c | 171 + src/shared/verbs.h | 23 + src/shared/vlan-util.c | 98 + src/shared/vlan-util.h | 21 + src/shared/volatile-util.c | 46 + src/shared/volatile-util.h | 16 + src/shared/wall.c | 187 + src/shared/wall.h | 27 + 
src/shared/watchdog.c | 504 ++ src/shared/watchdog.h | 21 + src/shared/web-util.c | 66 + src/shared/web-util.h | 13 + src/shared/wifi-util.c | 306 + src/shared/wifi-util.h | 16 + src/shared/xml.c | 237 + src/shared/xml.h | 14 + src/shutdown/detach-dm.c | 167 + src/shutdown/detach-dm.h | 10 + src/shutdown/detach-loopback.c | 225 + src/shutdown/detach-loopback.h | 10 + src/shutdown/detach-md.c | 188 + src/shutdown/detach-md.h | 10 + src/shutdown/detach-swap.c | 110 + src/shutdown/detach-swap.h | 21 + src/shutdown/meson.build | 39 + src/shutdown/shutdown.c | 663 ++ src/shutdown/test-umount.c | 68 + src/shutdown/umount.c | 494 ++ src/shutdown/umount.h | 26 + src/sleep/battery-capacity.c | 384 + src/sleep/battery-capacity.h | 18 + src/sleep/meson.build | 22 + src/sleep/sleep.c | 651 ++ src/sleep/sleep.conf | 27 + src/sleep/test-battery-capacity.c | 45 + src/socket-activate/meson.build | 10 + src/socket-activate/socket-activate.c | 495 ++ src/socket-proxy/meson.build | 10 + src/socket-proxy/socket-proxyd.c | 722 ++ src/stdio-bridge/meson.build | 9 + src/stdio-bridge/stdio-bridge.c | 252 + src/storagetm/meson.build | 11 + src/storagetm/storagetm.c | 1244 ++++ src/sulogin-shell/meson.build | 8 + src/sulogin-shell/sulogin-shell.c | 157 + src/sysctl/meson.build | 9 + src/sysctl/sysctl.c | 490 ++ src/sysext/meson.build | 15 + src/sysext/sysext.c | 1568 ++++ src/system-update-generator/meson.build | 8 + .../system-update-generator.c | 82 + src/systemctl/fuzz-systemctl-parse-argv.c | 73 + src/systemctl/meson.build | 82 + src/systemctl/systemctl-add-dependency.c | 86 + src/systemctl/systemctl-add-dependency.h | 4 + src/systemctl/systemctl-cancel-job.c | 44 + src/systemctl/systemctl-cancel-job.h | 4 + src/systemctl/systemctl-clean-or-freeze.c | 100 + src/systemctl/systemctl-clean-or-freeze.h | 4 + src/systemctl/systemctl-compat-halt.c | 203 + src/systemctl/systemctl-compat-halt.h | 6 + src/systemctl/systemctl-compat-runlevel.c | 82 + src/systemctl/systemctl-compat-runlevel.h | 6 
+ src/systemctl/systemctl-compat-shutdown.c | 159 + src/systemctl/systemctl-compat-shutdown.h | 4 + src/systemctl/systemctl-compat-telinit.c | 165 + src/systemctl/systemctl-compat-telinit.h | 7 + src/systemctl/systemctl-daemon-reload.c | 78 + src/systemctl/systemctl-daemon-reload.h | 8 + src/systemctl/systemctl-edit.c | 368 + src/systemctl/systemctl-edit.h | 5 + src/systemctl/systemctl-enable.c | 337 + src/systemctl/systemctl-enable.h | 4 + src/systemctl/systemctl-is-active.c | 96 + src/systemctl/systemctl-is-active.h | 5 + src/systemctl/systemctl-is-enabled.c | 157 + src/systemctl/systemctl-is-enabled.h | 4 + src/systemctl/systemctl-is-system-running.c | 82 + src/systemctl/systemctl-is-system-running.h | 4 + src/systemctl/systemctl-kill.c | 58 + src/systemctl/systemctl-kill.h | 4 + src/systemctl/systemctl-list-dependencies.c | 196 + src/systemctl/systemctl-list-dependencies.h | 4 + src/systemctl/systemctl-list-jobs.c | 174 + src/systemctl/systemctl-list-jobs.h | 4 + src/systemctl/systemctl-list-machines.c | 247 + src/systemctl/systemctl-list-machines.h | 27 + src/systemctl/systemctl-list-unit-files.c | 268 + src/systemctl/systemctl-list-unit-files.h | 4 + src/systemctl/systemctl-list-units.c | 1191 +++ src/systemctl/systemctl-list-units.h | 10 + src/systemctl/systemctl-log-setting.c | 96 + src/systemctl/systemctl-log-setting.h | 5 + src/systemctl/systemctl-logind.c | 449 ++ src/systemctl/systemctl-logind.h | 17 + src/systemctl/systemctl-mount.c | 116 + src/systemctl/systemctl-mount.h | 5 + src/systemctl/systemctl-preset-all.c | 60 + src/systemctl/systemctl-preset-all.h | 4 + src/systemctl/systemctl-reset-failed.c | 40 + src/systemctl/systemctl-reset-failed.h | 4 + src/systemctl/systemctl-service-watchdogs.c | 43 + src/systemctl/systemctl-service-watchdogs.h | 4 + src/systemctl/systemctl-set-default.c | 160 + src/systemctl/systemctl-set-default.h | 5 + src/systemctl/systemctl-set-environment.c | 225 + src/systemctl/systemctl-set-environment.h | 6 + 
src/systemctl/systemctl-set-property.c | 65 + src/systemctl/systemctl-set-property.h | 4 + src/systemctl/systemctl-show.c | 2503 +++++++ src/systemctl/systemctl-show.h | 4 + src/systemctl/systemctl-start-special.c | 261 + src/systemctl/systemctl-start-special.h | 5 + src/systemctl/systemctl-start-unit.c | 409 ++ src/systemctl/systemctl-start-unit.h | 16 + src/systemctl/systemctl-switch-root.c | 118 + src/systemctl/systemctl-switch-root.h | 4 + src/systemctl/systemctl-sysv-compat.c | 275 + src/systemctl/systemctl-sysv-compat.h | 41 + src/systemctl/systemctl-trivial-method.c | 45 + src/systemctl/systemctl-trivial-method.h | 4 + src/systemctl/systemctl-util.c | 996 +++ src/systemctl/systemctl-util.h | 60 + src/systemctl/systemctl-whoami.c | 70 + src/systemctl/systemctl-whoami.h | 4 + src/systemctl/systemctl.c | 1348 ++++ src/systemctl/systemctl.h | 111 + src/systemctl/systemd-sysv-install.SKELETON | 51 + src/systemd/_sd-common.h | 108 + src/systemd/meson.build | 105 + src/systemd/sd-bus-protocol.h | 108 + src/systemd/sd-bus-vtable.h | 353 + src/systemd/sd-bus.h | 541 ++ src/systemd/sd-daemon.h | 347 + src/systemd/sd-device.h | 168 + src/systemd/sd-dhcp-client.h | 182 + src/systemd/sd-dhcp-lease.h | 98 + src/systemd/sd-dhcp-option.h | 40 + src/systemd/sd-dhcp-protocol.h | 203 + src/systemd/sd-dhcp-server.h | 100 + src/systemd/sd-dhcp6-client.h | 142 + src/systemd/sd-dhcp6-lease.h | 91 + src/systemd/sd-dhcp6-option.h | 39 + src/systemd/sd-dhcp6-protocol.h | 174 + src/systemd/sd-event.h | 187 + src/systemd/sd-gpt.h | 369 + src/systemd/sd-hwdb.h | 46 + src/systemd/sd-id128.h | 164 + src/systemd/sd-ipv4acd.h | 64 + src/systemd/sd-ipv4ll.h | 65 + src/systemd/sd-journal.h | 180 + src/systemd/sd-lldp-rx.h | 109 + src/systemd/sd-lldp-tx.h | 70 + src/systemd/sd-lldp.h | 123 + src/systemd/sd-login.h | 270 + src/systemd/sd-messages.h | 277 + src/systemd/sd-ndisc.h | 148 + src/systemd/sd-netlink.h | 250 + src/systemd/sd-network.h | 224 + src/systemd/sd-path.h | 131 + 
src/systemd/sd-radv.h | 110 + src/systemd/sd-resolve.h | 124 + src/systemd/sd-utf8.h | 29 + src/sysupdate/meson.build | 31 + src/sysupdate/sysupdate-cache.c | 88 + src/sysupdate/sysupdate-cache.h | 18 + src/sysupdate/sysupdate-instance.c | 63 + src/sysupdate/sysupdate-instance.h | 67 + src/sysupdate/sysupdate-partition.c | 284 + src/sysupdate/sysupdate-partition.h | 49 + src/sysupdate/sysupdate-pattern.c | 643 ++ src/sysupdate/sysupdate-pattern.h | 18 + src/sysupdate/sysupdate-resource.c | 707 ++ src/sysupdate/sysupdate-resource.h | 111 + src/sysupdate/sysupdate-transfer.c | 1252 ++++ src/sysupdate/sysupdate-transfer.h | 62 + src/sysupdate/sysupdate-update-set.c | 63 + src/sysupdate/sysupdate-update-set.h | 32 + src/sysupdate/sysupdate-util.c | 4 + src/sysupdate/sysupdate.c | 1416 ++++ src/sysupdate/sysupdate.h | 21 + src/sysusers/meson.build | 25 + src/sysusers/sysusers.c | 2394 ++++++ src/sysv-generator/meson.build | 9 + src/sysv-generator/sysv-generator.c | 935 +++ src/test/generate-sym-test.py | 114 + src/test/meson.build | 597 ++ src/test/nss-test-util.c | 42 + src/test/nss-test-util.h | 8 + src/test/test-acl-util.c | 130 + src/test/test-af-list.c | 30 + src/test/test-alloc-util.c | 233 + src/test/test-architecture.c | 53 + src/test/test-argv-util.c | 132 + src/test/test-arphrd-util.c | 26 + src/test/test-ask-password-api.c | 21 + src/test/test-async.c | 100 + src/test/test-barrier.c | 441 ++ src/test/test-bitfield.c | 230 + src/test/test-bitmap.c | 116 + src/test/test-blockdev-util.c | 41 + src/test/test-boot-timestamps.c | 89 + src/test/test-bootspec.c | 211 + src/test/test-bpf-devices.c | 307 + src/test/test-bpf-firewall.c | 217 + src/test/test-bpf-foreign-programs.c | 330 + src/test/test-bpf-lsm.c | 102 + src/test/test-btrfs-physical-offset.c | 37 + src/test/test-btrfs.c | 200 + src/test/test-bus-util.c | 47 + src/test/test-calendarspec.c | 264 + src/test/test-cap-list.c | 177 + src/test/test-capability.c | 332 + src/test/test-cgroup-cpu.c | 34 + 
src/test/test-cgroup-mask.c | 184 + src/test/test-cgroup-setup.c | 73 + src/test/test-cgroup-unit-default.c | 138 + src/test/test-cgroup-util.c | 466 ++ src/test/test-cgroup.c | 132 + src/test/test-chase-manual.c | 116 + src/test/test-chase.c | 756 ++ src/test/test-chown-rec.c | 162 + src/test/test-clock.c | 74 + src/test/test-compare-operator.c | 43 + src/test/test-compress-benchmark.c | 176 + src/test/test-compress.c | 373 + src/test/test-condition.c | 1496 ++++ src/test/test-conf-files.c | 218 + src/test/test-conf-parser.c | 393 + src/test/test-copy.c | 532 ++ src/test/test-core-unit.c | 120 + src/test/test-coredump-util.c | 161 + src/test/test-cpu-set-util.c | 280 + src/test/test-creds.c | 121 + src/test/test-cryptolib.c | 38 + src/test/test-daemon.c | 60 + src/test/test-data-fd-util.c | 148 + src/test/test-date.c | 112 + src/test/test-dev-setup.c | 66 + src/test/test-device-nodes.c | 38 + src/test/test-devnum-util.c | 124 + src/test/test-dlopen-so.c | 76 + src/test/test-dlopen.c | 19 + src/test/test-dns-domain.c | 753 ++ src/test/test-ellipsize.c | 159 + src/test/test-emergency-action.c | 51 + src/test/test-engine.c | 300 + src/test/test-env-file.c | 191 + src/test/test-env-util.c | 563 ++ src/test/test-errno-list.c | 32 + src/test/test-errno-util.c | 112 + src/test/test-escape.c | 242 + src/test/test-ether-addr-util.c | 162 + src/test/test-exec-util.c | 456 ++ src/test/test-execute.c | 1550 ++++ src/test/test-execve.c | 40 + src/test/test-exit-status.c | 38 + src/test/test-extract-word.c | 763 ++ src/test/test-fd-util.c | 765 ++ src/test/test-fdset.c | 212 + src/test/test-fiemap.c | 64 + src/test/test-fileio.c | 1151 +++ src/test/test-firewall-util.c | 123 + src/test/test-format-table.c | 635 ++ src/test/test-format-util.c | 61 + src/test/test-fs-util.c | 796 ++ src/test/test-fstab-util.c | 192 + src/test/test-glob-util.c | 137 + src/test/test-gpt.c | 111 + src/test/test-gunicode.c | 27 + src/test/test-hash-funcs.c | 77 + src/test/test-hashmap-ordered.awk | 
12 + src/test/test-hashmap-plain.c | 1008 +++ src/test/test-hashmap.c | 171 + src/test/test-hexdecoct.c | 548 ++ src/test/test-hmac.c | 68 + src/test/test-hostname-setup.c | 64 + src/test/test-hostname-util.c | 116 + src/test/test-id128.c | 339 + src/test/test-image-policy.c | 132 + src/test/test-import-util.c | 64 + src/test/test-in-addr-prefix-util.c | 121 + src/test/test-in-addr-util.c | 408 + src/test/test-install-file.c | 64 + src/test/test-install-root.c | 1299 ++++ src/test/test-install.c | 270 + src/test/test-io-util.c | 49 + src/test/test-ip-protocol-list.c | 71 + src/test/test-ipcrm.c | 29 + src/test/test-job-type.c | 81 + src/test/test-journal-importer.c | 71 + src/test/test-json.c | 816 ++ src/test/test-kbd-util.c | 27 + src/test/test-libcrypt-util.c | 126 + src/test/test-libmount.c | 110 + src/test/test-limits-util.c | 89 + src/test/test-list.c | 286 + src/test/test-load-fragment.c | 1105 +++ src/test/test-local-addresses.c | 73 + src/test/test-locale-util.c | 132 + src/test/test-lock-util.c | 65 + src/test/test-log.c | 226 + src/test/test-logarithm.c | 95 + src/test/test-loop-block.c | 350 + src/test/test-loopback.c | 46 + src/test/test-macro.c | 1040 +++ src/test/test-manager.c | 19 + src/test/test-math-util.c | 110 + src/test/test-memfd-util.c | 30 + src/test/test-memory-util.c | 125 + src/test/test-mempool.c | 92 + src/test/test-mempress.c | 309 + src/test/test-memstream-util.c | 60 + src/test/test-mkdir.c | 141 + src/test/test-modhex.c | 51 + src/test/test-mount-util.c | 509 ++ src/test/test-mountpoint-util.c | 434 ++ src/test/test-namespace.c | 199 + src/test/test-net-naming-scheme.c | 31 + src/test/test-netlink-manual.c | 126 + src/test/test-nft-set.c | 78 + src/test/test-ns.c | 125 + src/test/test-nscd-flush.c | 20 + src/test/test-nss-hosts.c | 495 ++ src/test/test-nss-users.c | 256 + src/test/test-nulstr-util.c | 184 + src/test/test-open-file.c | 185 + src/test/test-openssl.c | 483 ++ src/test/test-ordered-set.c | 112 + src/test/test-os-util.c 
| 135 + src/test/test-parse-argument.c | 53 + src/test/test-parse-helpers.c | 95 + src/test/test-parse-util.c | 979 +++ src/test/test-path-lookup.c | 126 + src/test/test-path-util.c | 1308 ++++ src/test/test-path.c | 418 ++ src/test/test-percent-util.c | 199 + src/test/test-pretty-print.c | 79 + src/test/test-prioq.c | 123 + src/test/test-proc-cmdline.c | 354 + src/test/test-process-util.c | 954 +++ src/test/test-procfs-util.c | 76 + src/test/test-psi-util.c | 78 + src/test/test-qrcode-util.c | 23 + src/test/test-random-util.c | 79 + src/test/test-ratelimit.c | 43 + src/test/test-raw-clone.c | 41 + src/test/test-recurse-dir.c | 177 + src/test/test-replace-var.c | 32 + src/test/test-rlimit-util.c | 139 + src/test/test-rm-rf.c | 114 + src/test/test-sbat.c | 26 + src/test/test-sched-prio.c | 81 + src/test/test-sd-hwdb.c | 85 + src/test/test-sd-path.c | 59 + src/test/test-seccomp.c | 1234 ++++ src/test/test-secure-bits.c | 97 + src/test/test-selinux.c | 104 + src/test/test-serialize.c | 265 + src/test/test-set-disable-mempool.c | 58 + src/test/test-set.c | 403 + src/test/test-sha256.c | 50 + src/test/test-sigbus.c | 62 + src/test/test-signal-util.c | 175 + src/test/test-siphash24.c | 108 + src/test/test-sizeof.c | 130 + src/test/test-sleep-config.c | 79 + src/test/test-socket-bind.c | 149 + src/test/test-socket-netlink.c | 372 + src/test/test-socket-util.c | 593 ++ src/test/test-specifier.c | 188 + src/test/test-stat-util.c | 188 + src/test/test-static-destruct.c | 67 + src/test/test-strbuf.c | 73 + src/test/test-string-util.c | 1327 ++++ src/test/test-strip-tab-ansi.c | 72 + src/test/test-strv.c | 1009 +++ src/test/test-strxcpyx.c | 175 + src/test/test-sysctl-util.c | 75 + src/test/test-tables.c | 131 + src/test/test-terminal-util.c | 168 + src/test/test-time-util.c | 1195 +++ src/test/test-tmpfile-util.c | 306 + src/test/test-tpm2.c | 1324 ++++ src/test/test-udev-util.c | 63 + src/test/test-uid-alloc-range.c | 93 + src/test/test-uid-range.c | 175 + 
src/test/test-umask-util.c | 57 + src/test/test-unaligned.c | 168 + src/test/test-unit-file.c | 110 + src/test/test-unit-name.c | 1009 +++ src/test/test-unit-serialize.c | 63 + src/test/test-user-util.c | 484 ++ src/test/test-utf8.c | 235 + src/test/test-utmp.c | 58 + src/test/test-varlink-idl.c | 385 + src/test/test-varlink.c | 376 + src/test/test-verbs.c | 59 + src/test/test-watch-pid.c | 102 + src/test/test-watchdog.c | 39 + src/test/test-web-util.c | 21 + src/test/test-xattr-util.c | 129 + src/test/test-xml.c | 68 + src/timedate/meson.build | 26 + src/timedate/org.freedesktop.timedate1.conf | 29 + src/timedate/org.freedesktop.timedate1.policy | 62 + src/timedate/org.freedesktop.timedate1.service | 14 + src/timedate/timedatectl.c | 1039 +++ src/timedate/timedated.c | 1162 +++ src/timesync/80-systemd-timesync.list | 4 + src/timesync/meson.build | 80 + src/timesync/org.freedesktop.timesync1.conf | 46 + src/timesync/org.freedesktop.timesync1.policy | 32 + src/timesync/org.freedesktop.timesync1.service | 14 + src/timesync/test-timesync.c | 28 + src/timesync/timesyncd-bus.c | 264 + src/timesync/timesyncd-bus.h | 6 + src/timesync/timesyncd-conf.c | 127 + src/timesync/timesyncd-conf.h | 14 + src/timesync/timesyncd-gperf.gperf | 28 + src/timesync/timesyncd-manager.c | 1287 ++++ src/timesync/timesyncd-manager.h | 142 + src/timesync/timesyncd-ntp-message.h | 45 + src/timesync/timesyncd-server.c | 177 + src/timesync/timesyncd-server.h | 50 + src/timesync/timesyncd.c | 231 + src/timesync/timesyncd.conf.in | 26 + src/timesync/wait-sync.c | 240 + src/tmpfiles/meson.build | 38 + src/tmpfiles/offline-passwd.c | 168 + src/tmpfiles/offline-passwd.h | 9 + src/tmpfiles/test-offline-passwd.c | 85 + src/tmpfiles/tmpfiles.c | 4576 ++++++++++++ src/tpm2-setup/meson.build | 16 + src/tpm2-setup/tpm2-setup.c | 373 + src/tty-ask-password-agent/meson.build | 9 + .../tty-ask-password-agent.c | 710 ++ src/udev/ata_id/ata_id.c | 635 ++ src/udev/cdrom_id/cdrom_id.c | 1023 +++ 
src/udev/dmi_memory_id/dmi_memory_id.c | 721 ++ src/udev/fido_id/fido_id.c | 129 + src/udev/fido_id/fido_id_desc.c | 92 + src/udev/fido_id/fido_id_desc.h | 8 + src/udev/fido_id/fuzz-fido-id-desc.c | 21 + src/udev/fido_id/test-fido-id-desc.c | 80 + src/udev/fuzz-udev-rule-parse-value.c | 32 + src/udev/fuzz-udev-rules.c | 35 + src/udev/fuzz-udev-rules.options | 2 + src/udev/generate-keyboard-keys-gperf.sh | 20 + src/udev/generate-keyboard-keys-list.sh | 9 + src/udev/iocost/iocost.c | 321 + src/udev/iocost/iocost.conf | 20 + src/udev/meson.build | 273 + src/udev/mtd_probe/mtd_probe.c | 90 + src/udev/mtd_probe/mtd_probe.h | 52 + src/udev/mtd_probe/probe_smartmedia.c | 97 + src/udev/net/fuzz-link-parser.c | 27 + src/udev/net/fuzz-link-parser.options | 2 + src/udev/net/link-config-gperf.gperf | 118 + src/udev/net/link-config.c | 1133 +++ src/udev/net/link-config.h | 117 + src/udev/net/test-link-config-tables.c | 13 + src/udev/scsi_id/README | 4 + src/udev/scsi_id/scsi.h | 100 + src/udev/scsi_id/scsi_id.c | 515 ++ src/udev/scsi_id/scsi_id.h | 63 + src/udev/scsi_id/scsi_serial.c | 892 +++ src/udev/test-udev-builtin.c | 20 + src/udev/test-udev-format.c | 37 + src/udev/test-udev-manager.c | 19 + src/udev/test-udev-node.c | 50 + src/udev/test-udev-rule-runner.c | 178 + src/udev/test-udev-rules.c | 77 + src/udev/test-udev-spawn.c | 113 + src/udev/udev-builtin-blkid.c | 474 ++ src/udev/udev-builtin-btrfs.c | 50 + src/udev/udev-builtin-hwdb.c | 227 + src/udev/udev-builtin-input_id.c | 433 ++ src/udev/udev-builtin-keyboard.c | 252 + src/udev/udev-builtin-kmod.c | 94 + src/udev/udev-builtin-net_driver.c | 43 + src/udev/udev-builtin-net_id.c | 1366 ++++ src/udev/udev-builtin-net_setup_link.c | 113 + src/udev/udev-builtin-path_id.c | 896 +++ src/udev/udev-builtin-uaccess.c | 81 + src/udev/udev-builtin-usb_id.c | 489 ++ src/udev/udev-builtin.c | 156 + src/udev/udev-builtin.h | 88 + src/udev/udev-ctrl.c | 353 + src/udev/udev-ctrl.h | 78 + src/udev/udev-event.c | 411 ++ 
src/udev/udev-event.h | 60 + src/udev/udev-format.c | 550 ++ src/udev/udev-format.h | 21 + src/udev/udev-manager.c | 1352 ++++ src/udev/udev-manager.h | 62 + src/udev/udev-node.c | 790 ++ src/udev/udev-node.h | 29 + src/udev/udev-rules.c | 2965 ++++++++ src/udev/udev-rules.h | 49 + src/udev/udev-spawn.c | 355 + src/udev/udev-spawn.h | 28 + src/udev/udev-trace.h | 35 + src/udev/udev-watch.c | 260 + src/udev/udev-watch.h | 13 + src/udev/udev-worker.c | 352 + src/udev/udev-worker.h | 52 + src/udev/udev.conf | 11 + src/udev/udev.pc.in | 15 + src/udev/udevadm-control.c | 239 + src/udev/udevadm-hwdb.c | 102 + src/udev/udevadm-info.c | 1120 +++ src/udev/udevadm-lock.c | 306 + src/udev/udevadm-monitor.c | 246 + src/udev/udevadm-settle.c | 252 + src/udev/udevadm-test-builtin.c | 117 + src/udev/udevadm-test.c | 152 + src/udev/udevadm-trigger.c | 569 ++ src/udev/udevadm-util.c | 124 + src/udev/udevadm-util.h | 8 + src/udev/udevadm-verify.c | 236 + src/udev/udevadm-wait.c | 456 ++ src/udev/udevadm.c | 140 + src/udev/udevadm.h | 24 + src/udev/udevd.c | 408 + src/udev/udevd.h | 4 + src/udev/v4l_id/v4l_id.c | 113 + src/ukify/test/example.signing.crt.base64 | 23 + src/ukify/test/example.signing.key.base64 | 30 + src/ukify/test/example.tpm2-pcr-private.pem.base64 | 30 + .../test/example.tpm2-pcr-private2.pem.base64 | 30 + src/ukify/test/example.tpm2-pcr-public.pem.base64 | 8 + src/ukify/test/example.tpm2-pcr-public2.pem.base64 | 8 + src/ukify/test/meson.build | 21 + src/ukify/test/test_ukify.py | 876 +++ src/ukify/ukify.py | 1668 +++++ src/update-done/meson.build | 8 + src/update-done/update-done.c | 59 + src/update-utmp/meson.build | 10 + src/update-utmp/update-utmp.c | 266 + src/user-sessions/meson.build | 9 + src/user-sessions/user-sessions.c | 42 + src/userdb/meson.build | 25 + src/userdb/userdbctl.c | 1334 ++++ src/userdb/userdbd-manager.c | 321 + src/userdb/userdbd-manager.h | 32 + src/userdb/userdbd.c | 59 + src/userdb/userwork.c | 575 ++ src/varlinkctl/meson.build | 13 + 
src/varlinkctl/varlinkctl.c | 529 ++ src/vconsole/meson.build | 9 + src/vconsole/vconsole-setup.c | 635 ++ src/veritysetup/meson.build | 15 + src/veritysetup/veritysetup-generator.c | 526 ++ src/veritysetup/veritysetup.c | 431 ++ src/version/version.h.in | 10 + src/vmspawn/meson.build | 27 + src/vmspawn/vmspawn-settings.c | 3 + src/vmspawn/vmspawn-settings.h | 11 + src/vmspawn/vmspawn-util.c | 344 + src/vmspawn/vmspawn-util.h | 26 + src/vmspawn/vmspawn.c | 766 ++ src/volatile-root/meson.build | 9 + src/volatile-root/volatile-root.c | 197 + src/xdg-autostart-generator/fuzz-xdg-desktop.c | 36 + .../fuzz-xdg-desktop.options | 2 + src/xdg-autostart-generator/meson.build | 32 + src/xdg-autostart-generator/test-xdg-autostart.c | 104 + .../xdg-autostart-condition.c | 45 + .../xdg-autostart-generator.c | 118 + .../xdg-autostart-service.c | 695 ++ .../xdg-autostart-service.h | 36 + 2758 files changed, 867920 insertions(+) create mode 100644 src/ac-power/ac-power.c create mode 100644 src/ac-power/meson.build create mode 100644 src/analyze/analyze-blame.c create mode 100644 src/analyze/analyze-blame.h create mode 100644 src/analyze/analyze-calendar.c create mode 100644 src/analyze/analyze-calendar.h create mode 100644 src/analyze/analyze-capability.c create mode 100644 src/analyze/analyze-capability.h create mode 100644 src/analyze/analyze-cat-config.c create mode 100644 src/analyze/analyze-cat-config.h create mode 100644 src/analyze/analyze-compare-versions.c create mode 100644 src/analyze/analyze-compare-versions.h create mode 100644 src/analyze/analyze-condition.c create mode 100644 src/analyze/analyze-condition.h create mode 100644 src/analyze/analyze-critical-chain.c create mode 100644 src/analyze/analyze-critical-chain.h create mode 100644 src/analyze/analyze-dot.c create mode 100644 src/analyze/analyze-dot.h create mode 100644 src/analyze/analyze-dump.c create mode 100644 src/analyze/analyze-dump.h create mode 100644 src/analyze/analyze-exit-status.c create mode 100644 
src/analyze/analyze-exit-status.h create mode 100644 src/analyze/analyze-fdstore.c create mode 100644 src/analyze/analyze-fdstore.h create mode 100644 src/analyze/analyze-filesystems.c create mode 100644 src/analyze/analyze-filesystems.h create mode 100644 src/analyze/analyze-image-policy.c create mode 100644 src/analyze/analyze-image-policy.h create mode 100644 src/analyze/analyze-inspect-elf.c create mode 100644 src/analyze/analyze-inspect-elf.h create mode 100644 src/analyze/analyze-log-control.c create mode 100644 src/analyze/analyze-log-control.h create mode 100644 src/analyze/analyze-malloc.c create mode 100644 src/analyze/analyze-malloc.h create mode 100644 src/analyze/analyze-pcrs.c create mode 100644 src/analyze/analyze-pcrs.h create mode 100644 src/analyze/analyze-plot.c create mode 100644 src/analyze/analyze-plot.h create mode 100644 src/analyze/analyze-security.c create mode 100644 src/analyze/analyze-security.h create mode 100644 src/analyze/analyze-service-watchdogs.c create mode 100644 src/analyze/analyze-service-watchdogs.h create mode 100644 src/analyze/analyze-srk.c create mode 100644 src/analyze/analyze-srk.h create mode 100644 src/analyze/analyze-syscall-filter.c create mode 100644 src/analyze/analyze-syscall-filter.h create mode 100644 src/analyze/analyze-time-data.c create mode 100644 src/analyze/analyze-time-data.h create mode 100644 src/analyze/analyze-time.c create mode 100644 src/analyze/analyze-time.h create mode 100644 src/analyze/analyze-timespan.c create mode 100644 src/analyze/analyze-timespan.h create mode 100644 src/analyze/analyze-timestamp.c create mode 100644 src/analyze/analyze-timestamp.h create mode 100644 src/analyze/analyze-unit-files.c create mode 100644 src/analyze/analyze-unit-files.h create mode 100644 src/analyze/analyze-unit-paths.c create mode 100644 src/analyze/analyze-unit-paths.h create mode 100644 src/analyze/analyze-verify-util.c create mode 100644 src/analyze/analyze-verify-util.h create mode 100644 
src/analyze/analyze-verify.c create mode 100644 src/analyze/analyze-verify.h create mode 100644 src/analyze/analyze.c create mode 100644 src/analyze/analyze.h create mode 100644 src/analyze/meson.build create mode 100644 src/analyze/test-verify.c create mode 100644 src/ask-password/ask-password.c create mode 100644 src/ask-password/meson.build create mode 100644 src/backlight/backlight.c create mode 100644 src/backlight/meson.build create mode 100644 src/basic/MurmurHash2.c create mode 100644 src/basic/MurmurHash2.h create mode 100644 src/basic/af-list.c create mode 100644 src/basic/af-list.h create mode 100644 src/basic/af-to-name.awk create mode 100644 src/basic/alloc-util.c create mode 100644 src/basic/alloc-util.h create mode 100644 src/basic/architecture.c create mode 100644 src/basic/architecture.h create mode 100644 src/basic/argv-util.c create mode 100644 src/basic/argv-util.h create mode 100644 src/basic/arphrd-to-name.awk create mode 100644 src/basic/arphrd-util.c create mode 100644 src/basic/arphrd-util.h create mode 100644 src/basic/audit-util.c create mode 100644 src/basic/audit-util.h create mode 100644 src/basic/bitfield.h create mode 100644 src/basic/btrfs.c create mode 100644 src/basic/btrfs.h create mode 100644 src/basic/build.c create mode 100644 src/basic/build.h create mode 100644 src/basic/bus-label.c create mode 100644 src/basic/bus-label.h create mode 100644 src/basic/cap-list.c create mode 100644 src/basic/cap-list.h create mode 100644 src/basic/cap-to-name.awk create mode 100644 src/basic/capability-util.c create mode 100644 src/basic/capability-util.h create mode 100644 src/basic/cgroup-util.c create mode 100644 src/basic/cgroup-util.h create mode 100644 src/basic/chase.c create mode 100644 src/basic/chase.h create mode 100644 src/basic/chattr-util.c create mode 100644 src/basic/chattr-util.h create mode 100755 src/basic/check-filesystems.sh create mode 100644 src/basic/compress.c create mode 100644 src/basic/compress.h create mode 100644 
src/basic/conf-files.c create mode 100644 src/basic/conf-files.h create mode 100644 src/basic/confidential-virt.c create mode 100644 src/basic/confidential-virt.h create mode 100644 src/basic/constants.h create mode 100644 src/basic/coverage.h create mode 100644 src/basic/devnum-util.c create mode 100644 src/basic/devnum-util.h create mode 100644 src/basic/dirent-util.c create mode 100644 src/basic/dirent-util.h create mode 100644 src/basic/dns-def.h create mode 100644 src/basic/efivars.c create mode 100644 src/basic/efivars.h create mode 100644 src/basic/env-file.c create mode 100644 src/basic/env-file.h create mode 100644 src/basic/env-util.c create mode 100644 src/basic/env-util.h create mode 100644 src/basic/errno-list.c create mode 100644 src/basic/errno-list.h create mode 100644 src/basic/errno-to-name.awk create mode 100644 src/basic/errno-util.h create mode 100644 src/basic/escape.c create mode 100644 src/basic/escape.h create mode 100644 src/basic/ether-addr-util.c create mode 100644 src/basic/ether-addr-util.h create mode 100644 src/basic/extract-word.c create mode 100644 src/basic/extract-word.h create mode 100644 src/basic/fd-util.c create mode 100644 src/basic/fd-util.h create mode 100644 src/basic/fileio.c create mode 100644 src/basic/fileio.h create mode 100644 src/basic/filesystems-gperf.gperf create mode 100644 src/basic/filesystems.c create mode 100644 src/basic/filesystems.h create mode 100644 src/basic/format-util.c create mode 100644 src/basic/format-util.h create mode 100644 src/basic/fs-util.c create mode 100644 src/basic/fs-util.h create mode 100644 src/basic/gcrypt-util.c create mode 100644 src/basic/gcrypt-util.h create mode 100755 src/basic/generate-af-list.sh create mode 100755 src/basic/generate-arphrd-list.sh create mode 100755 src/basic/generate-cap-list.sh create mode 100755 src/basic/generate-errno-list.sh create mode 100755 src/basic/generate-filesystem-list.py create mode 100755 src/basic/generate-filesystem-switch-case.py create 
mode 100644 src/basic/getopt-defs.h create mode 100644 src/basic/glob-util.c create mode 100644 src/basic/glob-util.h create mode 100644 src/basic/glyph-util.c create mode 100644 src/basic/glyph-util.h create mode 100644 src/basic/gunicode.c create mode 100644 src/basic/gunicode.h create mode 100644 src/basic/hash-funcs.c create mode 100644 src/basic/hash-funcs.h create mode 100644 src/basic/hashmap.c create mode 100644 src/basic/hashmap.h create mode 100644 src/basic/hexdecoct.c create mode 100644 src/basic/hexdecoct.h create mode 100644 src/basic/hmac.c create mode 100644 src/basic/hmac.h create mode 100644 src/basic/hostname-util.c create mode 100644 src/basic/hostname-util.h create mode 100644 src/basic/in-addr-util.c create mode 100644 src/basic/in-addr-util.h create mode 100644 src/basic/initrd-util.c create mode 100644 src/basic/initrd-util.h create mode 100644 src/basic/inotify-util.c create mode 100644 src/basic/inotify-util.h create mode 100644 src/basic/io-util.c create mode 100644 src/basic/io-util.h create mode 100644 src/basic/ioprio-util.c create mode 100644 src/basic/ioprio-util.h create mode 100644 src/basic/iovec-util.c create mode 100644 src/basic/iovec-util.h create mode 100644 src/basic/iovec-wrapper.c create mode 100644 src/basic/iovec-wrapper.h create mode 100644 src/basic/label.c create mode 100644 src/basic/label.h create mode 100644 src/basic/limits-util.c create mode 100644 src/basic/limits-util.h create mode 100644 src/basic/linux/README create mode 100644 src/basic/linux/batman_adv.h create mode 100644 src/basic/linux/btrfs.h create mode 100644 src/basic/linux/btrfs_tree.h create mode 100644 src/basic/linux/can/netlink.h create mode 100644 src/basic/linux/can/vxcan.h create mode 100644 src/basic/linux/cfm_bridge.h create mode 100644 src/basic/linux/fib_rules.h create mode 100644 src/basic/linux/fou.h create mode 100644 src/basic/linux/genetlink.h create mode 100644 src/basic/linux/hdlc/ioctl.h create mode 100644 src/basic/linux/if.h 
create mode 100644 src/basic/linux/if_addr.h create mode 100644 src/basic/linux/if_bonding.h create mode 100644 src/basic/linux/if_bridge.h create mode 100644 src/basic/linux/if_ether.h create mode 100644 src/basic/linux/if_link.h create mode 100644 src/basic/linux/if_macsec.h create mode 100644 src/basic/linux/if_tun.h create mode 100644 src/basic/linux/if_tunnel.h create mode 100644 src/basic/linux/in.h create mode 100644 src/basic/linux/in6.h create mode 100644 src/basic/linux/ipv6_route.h create mode 100644 src/basic/linux/l2tp.h create mode 100644 src/basic/linux/libc-compat.h create mode 100644 src/basic/linux/mrp_bridge.h create mode 100644 src/basic/linux/netdevice.h create mode 100644 src/basic/linux/netfilter/nf_tables.h create mode 100644 src/basic/linux/netfilter/nfnetlink.h create mode 100644 src/basic/linux/netlink.h create mode 100644 src/basic/linux/nexthop.h create mode 100644 src/basic/linux/nl80211.h create mode 100644 src/basic/linux/pkt_sched.h create mode 100644 src/basic/linux/rtnetlink.h create mode 100644 src/basic/linux/stddef.h create mode 100755 src/basic/linux/update.sh create mode 100644 src/basic/linux/wireguard.h create mode 100644 src/basic/list.h create mode 100644 src/basic/locale-util.c create mode 100644 src/basic/locale-util.h create mode 100644 src/basic/lock-util.c create mode 100644 src/basic/lock-util.h create mode 100644 src/basic/log.c create mode 100644 src/basic/log.h create mode 100644 src/basic/login-util.c create mode 100644 src/basic/login-util.h create mode 100644 src/basic/macro.h create mode 100644 src/basic/mallinfo-util.h create mode 100644 src/basic/math-util.h create mode 100644 src/basic/memfd-util.c create mode 100644 src/basic/memfd-util.h create mode 100644 src/basic/memory-util.c create mode 100644 src/basic/memory-util.h create mode 100644 src/basic/mempool.c create mode 100644 src/basic/mempool.h create mode 100644 src/basic/memstream-util.c create mode 100644 src/basic/memstream-util.h create mode 
100644 src/basic/meson.build create mode 100644 src/basic/missing_audit.h create mode 100644 src/basic/missing_capability.h create mode 100644 src/basic/missing_drm.h create mode 100644 src/basic/missing_fcntl.h create mode 100644 src/basic/missing_fs.h create mode 100644 src/basic/missing_input.h create mode 100644 src/basic/missing_ioprio.h create mode 100644 src/basic/missing_keyctl.h create mode 100644 src/basic/missing_loop.h create mode 100644 src/basic/missing_magic.h create mode 100644 src/basic/missing_mman.h create mode 100644 src/basic/missing_mount.h create mode 100644 src/basic/missing_network.h create mode 100644 src/basic/missing_prctl.h create mode 100644 src/basic/missing_random.h create mode 100644 src/basic/missing_resource.h create mode 100644 src/basic/missing_sched.h create mode 100644 src/basic/missing_securebits.h create mode 100644 src/basic/missing_socket.h create mode 100644 src/basic/missing_stat.h create mode 100644 src/basic/missing_stdlib.h create mode 100644 src/basic/missing_syscall.h create mode 100644 src/basic/missing_syscall_def.h create mode 100644 src/basic/missing_syscalls.py create mode 100644 src/basic/missing_threads.h create mode 100644 src/basic/missing_timerfd.h create mode 100644 src/basic/missing_type.h create mode 100644 src/basic/missing_xfs.h create mode 100644 src/basic/mkdir.c create mode 100644 src/basic/mkdir.h create mode 100644 src/basic/mountpoint-util.c create mode 100644 src/basic/mountpoint-util.h create mode 100644 src/basic/namespace-util.c create mode 100644 src/basic/namespace-util.h create mode 100644 src/basic/nss-util.h create mode 100644 src/basic/nulstr-util.c create mode 100644 src/basic/nulstr-util.h create mode 100644 src/basic/ordered-set.c create mode 100644 src/basic/ordered-set.h create mode 100644 src/basic/origin-id.h create mode 100644 src/basic/os-util.c create mode 100644 src/basic/os-util.h create mode 100644 src/basic/parse-util.c create mode 100644 src/basic/parse-util.h create 
mode 100644 src/basic/path-lookup.c create mode 100644 src/basic/path-lookup.h create mode 100644 src/basic/path-util.c create mode 100644 src/basic/path-util.h create mode 100644 src/basic/pcapng.h create mode 100644 src/basic/percent-util.c create mode 100644 src/basic/percent-util.h create mode 100644 src/basic/pidref.c create mode 100644 src/basic/pidref.h create mode 100644 src/basic/prioq.c create mode 100644 src/basic/prioq.h create mode 100644 src/basic/proc-cmdline.c create mode 100644 src/basic/proc-cmdline.h create mode 100644 src/basic/process-util.c create mode 100644 src/basic/process-util.h create mode 100644 src/basic/procfs-util.c create mode 100644 src/basic/procfs-util.h create mode 100644 src/basic/psi-util.c create mode 100644 src/basic/psi-util.h create mode 100644 src/basic/pthread-util.h create mode 100644 src/basic/random-util.c create mode 100644 src/basic/random-util.h create mode 100644 src/basic/ratelimit.c create mode 100644 src/basic/ratelimit.h create mode 100644 src/basic/raw-clone.h create mode 100644 src/basic/raw-reboot.h create mode 100644 src/basic/recurse-dir.c create mode 100644 src/basic/recurse-dir.h create mode 100644 src/basic/replace-var.c create mode 100644 src/basic/replace-var.h create mode 100644 src/basic/rlimit-util.c create mode 100644 src/basic/rlimit-util.h create mode 100644 src/basic/runtime-scope.c create mode 100644 src/basic/runtime-scope.h create mode 100644 src/basic/set.h create mode 100644 src/basic/sigbus.c create mode 100644 src/basic/sigbus.h create mode 100644 src/basic/signal-util.c create mode 100644 src/basic/signal-util.h create mode 100644 src/basic/siphash24.c create mode 100644 src/basic/siphash24.h create mode 100644 src/basic/socket-util.c create mode 100644 src/basic/socket-util.h create mode 100644 src/basic/sort-util.c create mode 100644 src/basic/sort-util.h create mode 100644 src/basic/sparse-endian.h create mode 100644 src/basic/special.h create mode 100644 src/basic/stat-util.c 
create mode 100644 src/basic/stat-util.h create mode 100644 src/basic/static-destruct.h create mode 100644 src/basic/stdio-util.h create mode 100644 src/basic/strbuf.c create mode 100644 src/basic/strbuf.h create mode 100644 src/basic/string-table.c create mode 100644 src/basic/string-table.h create mode 100644 src/basic/string-util.c create mode 100644 src/basic/string-util.h create mode 100644 src/basic/strv.c create mode 100644 src/basic/strv.h create mode 100644 src/basic/strxcpyx.c create mode 100644 src/basic/strxcpyx.h create mode 100644 src/basic/sync-util.c create mode 100644 src/basic/sync-util.h create mode 100644 src/basic/syscall-list.txt create mode 100644 src/basic/syscalls-alpha.txt create mode 100644 src/basic/syscalls-arc.txt create mode 100644 src/basic/syscalls-arm.txt create mode 100644 src/basic/syscalls-arm64.txt create mode 100644 src/basic/syscalls-i386.txt create mode 100644 src/basic/syscalls-ia64.txt create mode 100644 src/basic/syscalls-loongarch64.txt create mode 100644 src/basic/syscalls-m68k.txt create mode 100644 src/basic/syscalls-mips64.txt create mode 100644 src/basic/syscalls-mips64n32.txt create mode 100644 src/basic/syscalls-mipso32.txt create mode 100644 src/basic/syscalls-parisc.txt create mode 100644 src/basic/syscalls-powerpc.txt create mode 100644 src/basic/syscalls-powerpc64.txt create mode 100644 src/basic/syscalls-riscv32.txt create mode 100644 src/basic/syscalls-riscv64.txt create mode 100644 src/basic/syscalls-s390.txt create mode 100644 src/basic/syscalls-s390x.txt create mode 100644 src/basic/syscalls-sparc.txt create mode 100644 src/basic/syscalls-x86_64.txt create mode 100644 src/basic/sysctl-util.c create mode 100644 src/basic/sysctl-util.h create mode 100644 src/basic/syslog-util.c create mode 100644 src/basic/syslog-util.h create mode 100644 src/basic/terminal-util.c create mode 100644 src/basic/terminal-util.h create mode 100644 src/basic/time-util.c create mode 100644 src/basic/time-util.h create mode 100644 
src/basic/tmpfile-util.c create mode 100644 src/basic/tmpfile-util.h create mode 100644 src/basic/uid-alloc-range.c create mode 100644 src/basic/uid-alloc-range.h create mode 100644 src/basic/uid-range.c create mode 100644 src/basic/uid-range.h create mode 100644 src/basic/umask-util.h create mode 100644 src/basic/unaligned.h create mode 100644 src/basic/unit-def.c create mode 100644 src/basic/unit-def.h create mode 100644 src/basic/unit-file.c create mode 100644 src/basic/unit-file.h create mode 100644 src/basic/unit-name.c create mode 100644 src/basic/unit-name.h create mode 100644 src/basic/user-util.c create mode 100644 src/basic/user-util.h create mode 100644 src/basic/utf8.c create mode 100644 src/basic/utf8.h create mode 100644 src/basic/virt.c create mode 100644 src/basic/virt.h create mode 100644 src/basic/xattr-util.c create mode 100644 src/basic/xattr-util.h create mode 100644 src/battery-check/battery-check.c create mode 100644 src/battery-check/meson.build create mode 100644 src/binfmt/binfmt.c create mode 100644 src/binfmt/meson.build create mode 100644 src/boot/bless-boot-generator.c create mode 100644 src/boot/bless-boot.c create mode 100644 src/boot/boot-check-no-failures.c create mode 100644 src/boot/bootctl-install.c create mode 100644 src/boot/bootctl-install.h create mode 100644 src/boot/bootctl-random-seed.c create mode 100644 src/boot/bootctl-random-seed.h create mode 100644 src/boot/bootctl-reboot-to-firmware.c create mode 100644 src/boot/bootctl-reboot-to-firmware.h create mode 100644 src/boot/bootctl-set-efivar.c create mode 100644 src/boot/bootctl-set-efivar.h create mode 100644 src/boot/bootctl-status.c create mode 100644 src/boot/bootctl-status.h create mode 100644 src/boot/bootctl-systemd-efi-options.c create mode 100644 src/boot/bootctl-systemd-efi-options.h create mode 100644 src/boot/bootctl-uki.c create mode 100644 src/boot/bootctl-uki.h create mode 100644 src/boot/bootctl-util.c create mode 100644 src/boot/bootctl-util.h create 
mode 100644 src/boot/bootctl.c create mode 100644 src/boot/bootctl.h create mode 100644 src/boot/efi/UEFI_SECURITY.md create mode 100644 src/boot/efi/addon.c create mode 100644 src/boot/efi/bcd.c create mode 100644 src/boot/efi/bcd.h create mode 100644 src/boot/efi/boot.c create mode 100644 src/boot/efi/console.c create mode 100644 src/boot/efi/console.h create mode 100644 src/boot/efi/cpio.c create mode 100644 src/boot/efi/cpio.h create mode 100644 src/boot/efi/device-path-util.c create mode 100644 src/boot/efi/device-path-util.h create mode 100644 src/boot/efi/devicetree.c create mode 100644 src/boot/efi/devicetree.h create mode 100644 src/boot/efi/drivers.c create mode 100644 src/boot/efi/drivers.h create mode 100644 src/boot/efi/efi-string.c create mode 100644 src/boot/efi/efi-string.h create mode 100644 src/boot/efi/efi.h create mode 100644 src/boot/efi/fuzz-bcd.c create mode 100644 src/boot/efi/fuzz-efi-osrel.c create mode 100644 src/boot/efi/fuzz-efi-printf.c create mode 100644 src/boot/efi/fuzz-efi-string.c create mode 100644 src/boot/efi/graphics.c create mode 100644 src/boot/efi/graphics.h create mode 100644 src/boot/efi/initrd.c create mode 100644 src/boot/efi/initrd.h create mode 100644 src/boot/efi/linux.c create mode 100644 src/boot/efi/linux.h create mode 100644 src/boot/efi/linux_x86.c create mode 100644 src/boot/efi/log.c create mode 100644 src/boot/efi/log.h create mode 100644 src/boot/efi/measure.c create mode 100644 src/boot/efi/measure.h create mode 100644 src/boot/efi/meson.build create mode 100644 src/boot/efi/part-discovery.c create mode 100644 src/boot/efi/part-discovery.h create mode 100644 src/boot/efi/pe.c create mode 100644 src/boot/efi/pe.h create mode 100644 src/boot/efi/proto/block-io.h create mode 100644 src/boot/efi/proto/console-control.h create mode 100644 src/boot/efi/proto/device-path.h create mode 100644 src/boot/efi/proto/dt-fixup.h create mode 100644 src/boot/efi/proto/file-io.h create mode 100644 
src/boot/efi/proto/graphics-output.h create mode 100644 src/boot/efi/proto/load-file.h create mode 100644 src/boot/efi/proto/loaded-image.h create mode 100644 src/boot/efi/proto/rng.h create mode 100644 src/boot/efi/proto/security-arch.h create mode 100644 src/boot/efi/proto/shell-parameters.h create mode 100644 src/boot/efi/proto/simple-text-io.h create mode 100644 src/boot/efi/proto/tcg.h create mode 100644 src/boot/efi/random-seed.c create mode 100644 src/boot/efi/random-seed.h create mode 100644 src/boot/efi/secure-boot.c create mode 100644 src/boot/efi/secure-boot.h create mode 100644 src/boot/efi/shim.c create mode 100644 src/boot/efi/shim.h create mode 100644 src/boot/efi/splash.c create mode 100644 src/boot/efi/splash.h create mode 100644 src/boot/efi/stub.c create mode 100644 src/boot/efi/test-bcd.c create mode 100644 src/boot/efi/test-efi-string.c create mode 100644 src/boot/efi/ticks.c create mode 100644 src/boot/efi/ticks.h create mode 100644 src/boot/efi/ubsan.c create mode 100644 src/boot/efi/util.c create mode 100644 src/boot/efi/util.h create mode 100644 src/boot/efi/vmm.c create mode 100644 src/boot/efi/vmm.h create mode 100644 src/boot/measure.c create mode 100644 src/boot/meson.build create mode 100644 src/busctl/busctl-introspect.c create mode 100644 src/busctl/busctl-introspect.h create mode 100644 src/busctl/busctl.c create mode 100644 src/busctl/meson.build create mode 100644 src/busctl/test-busctl-introspect.c create mode 100644 src/cgls/cgls.c create mode 100644 src/cgls/meson.build create mode 100644 src/cgroups-agent/cgroups-agent.c create mode 100644 src/cgroups-agent/meson.build create mode 100644 src/cgtop/cgtop.c create mode 100644 src/cgtop/meson.build create mode 100644 src/core/all-units.h create mode 100644 src/core/apparmor-setup.c create mode 100644 src/core/apparmor-setup.h create mode 100644 src/core/audit-fd.c create mode 100644 src/core/audit-fd.h create mode 100644 src/core/automount.c create mode 100644 
src/core/automount.h create mode 100644 src/core/bpf-devices.c create mode 100644 src/core/bpf-devices.h create mode 100644 src/core/bpf-firewall.c create mode 100644 src/core/bpf-firewall.h create mode 100644 src/core/bpf-foreign.c create mode 100644 src/core/bpf-foreign.h create mode 100644 src/core/bpf-lsm.c create mode 100644 src/core/bpf-lsm.h create mode 100644 src/core/bpf-socket-bind.c create mode 100644 src/core/bpf-socket-bind.h create mode 100644 src/core/bpf-util.c create mode 100644 src/core/bpf-util.h create mode 100644 src/core/bpf/restrict_fs/meson.build create mode 100644 src/core/bpf/restrict_fs/restrict-fs-skel.h create mode 100644 src/core/bpf/restrict_fs/restrict-fs.bpf.c create mode 100644 src/core/bpf/restrict_ifaces/meson.build create mode 100644 src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h create mode 100644 src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c create mode 100644 src/core/bpf/socket_bind/meson.build create mode 100644 src/core/bpf/socket_bind/socket-bind-api.bpf.h create mode 100644 src/core/bpf/socket_bind/socket-bind-skel.h create mode 100644 src/core/bpf/socket_bind/socket-bind.bpf.c create mode 100644 src/core/cgroup.c create mode 100644 src/core/cgroup.h create mode 100644 src/core/core-varlink.c create mode 100644 src/core/core-varlink.h create mode 100644 src/core/crash-handler.c create mode 100644 src/core/crash-handler.h create mode 100644 src/core/dbus-automount.c create mode 100644 src/core/dbus-automount.h create mode 100644 src/core/dbus-cgroup.c create mode 100644 src/core/dbus-cgroup.h create mode 100644 src/core/dbus-device.c create mode 100644 src/core/dbus-device.h create mode 100644 src/core/dbus-execute.c create mode 100644 src/core/dbus-execute.h create mode 100644 src/core/dbus-job.c create mode 100644 src/core/dbus-job.h create mode 100644 src/core/dbus-kill.c create mode 100644 src/core/dbus-kill.h create mode 100644 src/core/dbus-manager.c create mode 100644 src/core/dbus-manager.h create mode 
100644 src/core/dbus-mount.c create mode 100644 src/core/dbus-mount.h create mode 100644 src/core/dbus-path.c create mode 100644 src/core/dbus-path.h create mode 100644 src/core/dbus-scope.c create mode 100644 src/core/dbus-scope.h create mode 100644 src/core/dbus-service.c create mode 100644 src/core/dbus-service.h create mode 100644 src/core/dbus-slice.c create mode 100644 src/core/dbus-slice.h create mode 100644 src/core/dbus-socket.c create mode 100644 src/core/dbus-socket.h create mode 100644 src/core/dbus-swap.c create mode 100644 src/core/dbus-swap.h create mode 100644 src/core/dbus-target.c create mode 100644 src/core/dbus-target.h create mode 100644 src/core/dbus-timer.c create mode 100644 src/core/dbus-timer.h create mode 100644 src/core/dbus-unit.c create mode 100644 src/core/dbus-unit.h create mode 100644 src/core/dbus-util.c create mode 100644 src/core/dbus-util.h create mode 100644 src/core/dbus.c create mode 100644 src/core/dbus.h create mode 100644 src/core/device.c create mode 100644 src/core/device.h create mode 100644 src/core/dynamic-user.c create mode 100644 src/core/dynamic-user.h create mode 100644 src/core/efi-random.c create mode 100644 src/core/efi-random.h create mode 100644 src/core/emergency-action.c create mode 100644 src/core/emergency-action.h create mode 100644 src/core/exec-credential.c create mode 100644 src/core/exec-credential.h create mode 100644 src/core/exec-invoke.c create mode 100644 src/core/exec-invoke.h create mode 100644 src/core/execute-serialize.c create mode 100644 src/core/execute-serialize.h create mode 100644 src/core/execute.c create mode 100644 src/core/execute.h create mode 100644 src/core/executor.c create mode 100644 src/core/fuzz-execute-serialize.c create mode 100644 src/core/fuzz-manager-serialize.c create mode 100644 src/core/fuzz-manager-serialize.options create mode 100644 src/core/fuzz-unit-file.c create mode 100644 src/core/fuzz-unit-file.options create mode 100644 src/core/generator-setup.c create 
mode 100644 src/core/generator-setup.h create mode 100644 src/core/ima-setup.c create mode 100644 src/core/ima-setup.h create mode 100644 src/core/import-creds.c create mode 100644 src/core/import-creds.h create mode 100644 src/core/job.c create mode 100644 src/core/job.h create mode 100644 src/core/kill.c create mode 100644 src/core/kill.h create mode 100644 src/core/kmod-setup.c create mode 100644 src/core/kmod-setup.h create mode 100644 src/core/load-dropin.c create mode 100644 src/core/load-dropin.h create mode 100644 src/core/load-fragment-gperf-nulstr.awk create mode 100644 src/core/load-fragment-gperf.gperf.in create mode 100644 src/core/load-fragment.c create mode 100644 src/core/load-fragment.h create mode 100644 src/core/main.c create mode 100644 src/core/main.h create mode 100644 src/core/manager-dump.c create mode 100644 src/core/manager-dump.h create mode 100644 src/core/manager-serialize.c create mode 100644 src/core/manager-serialize.h create mode 100644 src/core/manager.c create mode 100644 src/core/manager.h create mode 100644 src/core/meson.build create mode 100644 src/core/mount.c create mode 100644 src/core/mount.h create mode 100644 src/core/namespace.c create mode 100644 src/core/namespace.h create mode 100644 src/core/org.freedesktop.systemd1.conf create mode 100644 src/core/org.freedesktop.systemd1.policy.in create mode 100644 src/core/org.freedesktop.systemd1.service create mode 100644 src/core/path.c create mode 100644 src/core/path.h create mode 100644 src/core/restrict-ifaces.c create mode 100644 src/core/restrict-ifaces.h create mode 100644 src/core/scope.c create mode 100644 src/core/scope.h create mode 100644 src/core/selinux-access.c create mode 100644 src/core/selinux-access.h create mode 100644 src/core/selinux-setup.c create mode 100644 src/core/selinux-setup.h create mode 100644 src/core/service.c create mode 100644 src/core/service.h create mode 100644 src/core/show-status.c create mode 100644 src/core/show-status.h create mode 
100644 src/core/slice.c create mode 100644 src/core/slice.h create mode 100644 src/core/smack-setup.c create mode 100644 src/core/smack-setup.h create mode 100644 src/core/socket.c create mode 100644 src/core/socket.h create mode 100644 src/core/swap.c create mode 100644 src/core/swap.h create mode 100644 src/core/system.conf.in create mode 100644 src/core/systemd.pc.in create mode 100644 src/core/target.c create mode 100644 src/core/target.h create mode 100644 src/core/timer.c create mode 100644 src/core/timer.h create mode 100644 src/core/transaction.c create mode 100644 src/core/transaction.h create mode 100644 src/core/unit-dependency-atom.c create mode 100644 src/core/unit-dependency-atom.h create mode 100644 src/core/unit-printf.c create mode 100644 src/core/unit-printf.h create mode 100644 src/core/unit-serialize.c create mode 100644 src/core/unit-serialize.h create mode 100644 src/core/unit.c create mode 100644 src/core/unit.h create mode 100644 src/core/user.conf.in create mode 100644 src/coredump/coredump-vacuum.c create mode 100644 src/coredump/coredump-vacuum.h create mode 100644 src/coredump/coredump.c create mode 100644 src/coredump/coredump.conf create mode 100644 src/coredump/coredumpctl.c create mode 100644 src/coredump/meson.build create mode 100644 src/coredump/test-coredump-vacuum.c create mode 100644 src/creds/creds.c create mode 100644 src/creds/meson.build create mode 100644 src/cryptenroll/cryptenroll-fido2.c create mode 100644 src/cryptenroll/cryptenroll-fido2.h create mode 100644 src/cryptenroll/cryptenroll-list.c create mode 100644 src/cryptenroll/cryptenroll-list.h create mode 100644 src/cryptenroll/cryptenroll-password.c create mode 100644 src/cryptenroll/cryptenroll-password.h create mode 100644 src/cryptenroll/cryptenroll-pkcs11.c create mode 100644 src/cryptenroll/cryptenroll-pkcs11.h create mode 100644 src/cryptenroll/cryptenroll-recovery.c create mode 100644 src/cryptenroll/cryptenroll-recovery.h create mode 100644 
src/cryptenroll/cryptenroll-tpm2.c create mode 100644 src/cryptenroll/cryptenroll-tpm2.h create mode 100644 src/cryptenroll/cryptenroll-wipe.c create mode 100644 src/cryptenroll/cryptenroll-wipe.h create mode 100644 src/cryptenroll/cryptenroll.c create mode 100644 src/cryptenroll/cryptenroll.h create mode 100644 src/cryptenroll/meson.build create mode 100644 src/cryptsetup/cryptsetup-generator.c create mode 100644 src/cryptsetup/cryptsetup-keyfile.c create mode 100644 src/cryptsetup/cryptsetup-keyfile.h create mode 100644 src/cryptsetup/cryptsetup-pkcs11.c create mode 100644 src/cryptsetup/cryptsetup-pkcs11.h create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-fido2.c create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-pkcs11.c create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-tpm2.c create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.c create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.h create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token.h create mode 100644 src/cryptsetup/cryptsetup-tokens/cryptsetup-token.sym create mode 100644 src/cryptsetup/cryptsetup-tokens/luks2-fido2.c create mode 100644 src/cryptsetup/cryptsetup-tokens/luks2-fido2.h create mode 100644 src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.c create mode 100644 src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.h create mode 100644 src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c create mode 100644 src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h create mode 100644 src/cryptsetup/cryptsetup-tokens/meson.build create mode 100644 src/cryptsetup/cryptsetup-tpm2.c create mode 100644 src/cryptsetup/cryptsetup-tpm2.h create mode 100644 src/cryptsetup/cryptsetup.c create mode 100644 src/cryptsetup/meson.build create mode 100644 src/debug-generator/debug-generator.c create mode 100644 src/debug-generator/meson.build create mode 100644 src/delta/delta.c create mode 100644 
src/delta/meson.build create mode 100644 src/detect-virt/detect-virt.c create mode 100644 src/detect-virt/meson.build create mode 100644 src/dissect/dissect.c create mode 100644 src/dissect/meson.build create mode 100644 src/environment-d-generator/environment-d-generator.c create mode 100644 src/environment-d-generator/meson.build create mode 100644 src/escape/escape.c create mode 100644 src/escape/meson.build create mode 100644 src/firstboot/firstboot.c create mode 100644 src/firstboot/meson.build create mode 100644 src/fsck/fsck.c create mode 100644 src/fsck/meson.build create mode 100644 src/fstab-generator/fstab-generator.c create mode 100644 src/fstab-generator/meson.build create mode 100644 src/fundamental/bootspec-fundamental.c create mode 100644 src/fundamental/bootspec-fundamental.h create mode 100644 src/fundamental/confidential-virt-fundamental.h create mode 100644 src/fundamental/efivars-fundamental.c create mode 100644 src/fundamental/efivars-fundamental.h create mode 100644 src/fundamental/logarithm.h create mode 100644 src/fundamental/macro-fundamental.h create mode 100644 src/fundamental/memory-util-fundamental.h create mode 100644 src/fundamental/meson.build create mode 100644 src/fundamental/sbat.h create mode 100644 src/fundamental/sha256.c create mode 100644 src/fundamental/sha256.h create mode 100644 src/fundamental/string-util-fundamental.c create mode 100644 src/fundamental/string-util-fundamental.h create mode 100644 src/fundamental/tpm2-pcr.h create mode 100644 src/fundamental/uki.c create mode 100644 src/fundamental/uki.h create mode 100644 src/fundamental/unaligned-fundamental.h create mode 100644 src/fuzz/fuzz-bootspec-gen.py create mode 100644 src/fuzz/fuzz-bootspec.c create mode 100644 src/fuzz/fuzz-bootspec.options create mode 100644 src/fuzz/fuzz-bus-label.c create mode 100644 src/fuzz/fuzz-calendarspec.c create mode 100644 src/fuzz/fuzz-catalog.c create mode 100644 src/fuzz/fuzz-compress.c create mode 100644 
src/fuzz/fuzz-env-file.c create mode 100644 src/fuzz/fuzz-env-file.options create mode 100644 src/fuzz/fuzz-hostname-setup.c create mode 100644 src/fuzz/fuzz-json.c create mode 100644 src/fuzz/fuzz-main.c create mode 100644 src/fuzz/fuzz-time-util.c create mode 100644 src/fuzz/fuzz-udev-database.c create mode 100644 src/fuzz/fuzz-varlink-idl.c create mode 100644 src/fuzz/fuzz-varlink.c create mode 100644 src/fuzz/fuzz.h create mode 100644 src/fuzz/meson.build create mode 100644 src/getty-generator/getty-generator.c create mode 100644 src/getty-generator/meson.build create mode 100644 src/gpt-auto-generator/gpt-auto-generator.c create mode 100644 src/gpt-auto-generator/meson.build create mode 100644 src/hibernate-resume/hibernate-resume-config.c create mode 100644 src/hibernate-resume/hibernate-resume-config.h create mode 100644 src/hibernate-resume/hibernate-resume-generator.c create mode 100644 src/hibernate-resume/hibernate-resume.c create mode 100644 src/hibernate-resume/meson.build create mode 100644 src/home/home-util.c create mode 100644 src/home/home-util.h create mode 100644 src/home/homectl-fido2.c create mode 100644 src/home/homectl-fido2.h create mode 100644 src/home/homectl-pkcs11.c create mode 100644 src/home/homectl-pkcs11.h create mode 100644 src/home/homectl-recovery-key.c create mode 100644 src/home/homectl-recovery-key.h create mode 100644 src/home/homectl.c create mode 100644 src/home/homed-bus.c create mode 100644 src/home/homed-bus.h create mode 100644 src/home/homed-conf.c create mode 100644 src/home/homed-conf.h create mode 100644 src/home/homed-gperf.gperf create mode 100644 src/home/homed-home-bus.c create mode 100644 src/home/homed-home-bus.h create mode 100644 src/home/homed-home.c create mode 100644 src/home/homed-home.h create mode 100644 src/home/homed-manager-bus.c create mode 100644 src/home/homed-manager-bus.h create mode 100644 src/home/homed-manager.c create mode 100644 src/home/homed-manager.h create mode 100644 
src/home/homed-operation.c create mode 100644 src/home/homed-operation.h create mode 100644 src/home/homed-varlink.c create mode 100644 src/home/homed-varlink.h create mode 100644 src/home/homed.c create mode 100644 src/home/homed.conf create mode 100644 src/home/homework-cifs.c create mode 100644 src/home/homework-cifs.h create mode 100644 src/home/homework-directory.c create mode 100644 src/home/homework-directory.h create mode 100644 src/home/homework-fido2.c create mode 100644 src/home/homework-fido2.h create mode 100644 src/home/homework-fscrypt.c create mode 100644 src/home/homework-fscrypt.h create mode 100644 src/home/homework-luks.c create mode 100644 src/home/homework-luks.h create mode 100644 src/home/homework-mount.c create mode 100644 src/home/homework-mount.h create mode 100644 src/home/homework-password-cache.c create mode 100644 src/home/homework-password-cache.h create mode 100644 src/home/homework-pkcs11.c create mode 100644 src/home/homework-pkcs11.h create mode 100644 src/home/homework-quota.c create mode 100644 src/home/homework-quota.h create mode 100644 src/home/homework.c create mode 100644 src/home/homework.h create mode 100644 src/home/meson.build create mode 100644 src/home/org.freedesktop.home1.conf create mode 100644 src/home/org.freedesktop.home1.policy create mode 100644 src/home/org.freedesktop.home1.service create mode 100644 src/home/pam_systemd_home.c create mode 100644 src/home/pam_systemd_home.sym create mode 100644 src/home/user-record-password-quality.c create mode 100644 src/home/user-record-password-quality.h create mode 100644 src/home/user-record-sign.c create mode 100644 src/home/user-record-sign.h create mode 100644 src/home/user-record-util.c create mode 100644 src/home/user-record-util.h create mode 100644 src/hostname/hostnamectl.c create mode 100644 src/hostname/hostnamed.c create mode 100644 src/hostname/meson.build create mode 100644 src/hostname/org.freedesktop.hostname1.conf create mode 100644 
src/hostname/org.freedesktop.hostname1.policy create mode 100644 src/hostname/org.freedesktop.hostname1.service create mode 100644 src/hwdb/hwdb.c create mode 100644 src/hwdb/meson.build create mode 100644 src/id128/id128.c create mode 100644 src/id128/meson.build create mode 100644 src/import/curl-util.c create mode 100644 src/import/curl-util.h create mode 100644 src/import/export-raw.c create mode 100644 src/import/export-raw.h create mode 100644 src/import/export-tar.c create mode 100644 src/import/export-tar.h create mode 100644 src/import/export.c create mode 100644 src/import/import-common.c create mode 100644 src/import/import-common.h create mode 100644 src/import/import-compress.c create mode 100644 src/import/import-compress.h create mode 100644 src/import/import-fs.c create mode 100644 src/import/import-pubring.gpg create mode 100644 src/import/import-raw.c create mode 100644 src/import/import-raw.h create mode 100644 src/import/import-tar.c create mode 100644 src/import/import-tar.h create mode 100644 src/import/import.c create mode 100644 src/import/importd.c create mode 100644 src/import/meson.build create mode 100644 src/import/org.freedesktop.import1.conf create mode 100644 src/import/org.freedesktop.import1.policy create mode 100644 src/import/org.freedesktop.import1.service create mode 100644 src/import/pull-common.c create mode 100644 src/import/pull-common.h create mode 100644 src/import/pull-job.c create mode 100644 src/import/pull-job.h create mode 100644 src/import/pull-raw.c create mode 100644 src/import/pull-raw.h create mode 100644 src/import/pull-tar.c create mode 100644 src/import/pull-tar.h create mode 100644 src/import/pull.c create mode 100644 src/import/qcow2-util.c create mode 100644 src/import/qcow2-util.h create mode 100644 src/import/test-qcow2.c create mode 100644 src/initctl/initctl.c create mode 100644 src/initctl/meson.build create mode 100644 src/integritysetup/integrity-util.c create mode 100644 
src/integritysetup/integrity-util.h create mode 100644 src/integritysetup/integritysetup-generator.c create mode 100644 src/integritysetup/integritysetup.c create mode 100644 src/integritysetup/meson.build create mode 100644 src/journal-remote/browse.html create mode 100644 src/journal-remote/fuzz-journal-remote.c create mode 100644 src/journal-remote/fuzz-journal-remote.options create mode 100644 src/journal-remote/journal-gatewayd.c create mode 100644 src/journal-remote/journal-remote-main.c create mode 100644 src/journal-remote/journal-remote-parse.c create mode 100644 src/journal-remote/journal-remote-parse.h create mode 100644 src/journal-remote/journal-remote-write.c create mode 100644 src/journal-remote/journal-remote-write.h create mode 100644 src/journal-remote/journal-remote.c create mode 100644 src/journal-remote/journal-remote.conf.in create mode 100644 src/journal-remote/journal-remote.h create mode 100644 src/journal-remote/journal-upload-journal.c create mode 100644 src/journal-remote/journal-upload.c create mode 100644 src/journal-remote/journal-upload.conf.in create mode 100644 src/journal-remote/journal-upload.h create mode 100755 src/journal-remote/log-generator.py create mode 100644 src/journal-remote/meson.build create mode 100644 src/journal-remote/microhttpd-util.c create mode 100644 src/journal-remote/microhttpd-util.h create mode 100644 src/journal/bsod.c create mode 100644 src/journal/cat.c create mode 100644 src/journal/fuzz-journald-audit.c create mode 100644 src/journal/fuzz-journald-kmsg.c create mode 100644 src/journal/fuzz-journald-native-fd.c create mode 100644 src/journal/fuzz-journald-native.c create mode 100644 src/journal/fuzz-journald-stream.c create mode 100644 src/journal/fuzz-journald-stream.options create mode 100644 src/journal/fuzz-journald-syslog.c create mode 100644 src/journal/fuzz-journald.c create mode 100644 src/journal/fuzz-journald.h create mode 100644 src/journal/journalctl.c create mode 100644 
src/journal/journald-audit.c create mode 100644 src/journal/journald-audit.h create mode 100644 src/journal/journald-client.c create mode 100644 src/journal/journald-client.h create mode 100644 src/journal/journald-console.c create mode 100644 src/journal/journald-console.h create mode 100644 src/journal/journald-context.c create mode 100644 src/journal/journald-context.h create mode 100644 src/journal/journald-gperf.gperf create mode 100644 src/journal/journald-kmsg.c create mode 100644 src/journal/journald-kmsg.h create mode 100644 src/journal/journald-native.c create mode 100644 src/journal/journald-native.h create mode 100644 src/journal/journald-rate-limit.c create mode 100644 src/journal/journald-rate-limit.h create mode 100644 src/journal/journald-server.c create mode 100644 src/journal/journald-server.h create mode 100644 src/journal/journald-stream.c create mode 100644 src/journal/journald-stream.h create mode 100644 src/journal/journald-syslog.c create mode 100644 src/journal/journald-syslog.h create mode 100644 src/journal/journald-wall.c create mode 100644 src/journal/journald-wall.h create mode 100644 src/journal/journald.c create mode 100644 src/journal/journald.conf create mode 100644 src/journal/meson.build create mode 100644 src/journal/test-journald-config.c create mode 100644 src/journal/test-journald-syslog.c create mode 100644 src/journal/test-journald-tables.c create mode 100755 src/kernel-install/50-depmod.install create mode 100755 src/kernel-install/60-ukify.install.in create mode 100755 src/kernel-install/90-loaderentry.install.in create mode 100755 src/kernel-install/90-uki-copy.install create mode 100644 src/kernel-install/install.conf create mode 100644 src/kernel-install/kernel-install.c create mode 100644 src/kernel-install/meson.build create mode 100755 src/kernel-install/test-kernel-install.sh create mode 100644 src/libsystemd-network/arp-util.c create mode 100644 src/libsystemd-network/arp-util.h create mode 100644 
src/libsystemd-network/dhcp-client-internal.h create mode 100644 src/libsystemd-network/dhcp-identifier.c create mode 100644 src/libsystemd-network/dhcp-identifier.h create mode 100644 src/libsystemd-network/dhcp-lease-internal.h create mode 100644 src/libsystemd-network/dhcp-network.c create mode 100644 src/libsystemd-network/dhcp-network.h create mode 100644 src/libsystemd-network/dhcp-option.c create mode 100644 src/libsystemd-network/dhcp-option.h create mode 100644 src/libsystemd-network/dhcp-packet.c create mode 100644 src/libsystemd-network/dhcp-packet.h create mode 100644 src/libsystemd-network/dhcp-protocol.h create mode 100644 src/libsystemd-network/dhcp-server-internal.h create mode 100644 src/libsystemd-network/dhcp6-client-internal.h create mode 100644 src/libsystemd-network/dhcp6-internal.h create mode 100644 src/libsystemd-network/dhcp6-lease-internal.h create mode 100644 src/libsystemd-network/dhcp6-network.c create mode 100644 src/libsystemd-network/dhcp6-option.c create mode 100644 src/libsystemd-network/dhcp6-option.h create mode 100644 src/libsystemd-network/dhcp6-protocol.c create mode 100644 src/libsystemd-network/dhcp6-protocol.h create mode 100644 src/libsystemd-network/fuzz-dhcp-client.c create mode 100644 src/libsystemd-network/fuzz-dhcp-server-relay.c create mode 100644 src/libsystemd-network/fuzz-dhcp-server.c create mode 100644 src/libsystemd-network/fuzz-dhcp6-client.c create mode 100644 src/libsystemd-network/fuzz-dhcp6-client.options create mode 100644 src/libsystemd-network/fuzz-lldp-rx.c create mode 100644 src/libsystemd-network/fuzz-lldp-rx.options create mode 100644 src/libsystemd-network/fuzz-ndisc-rs.c create mode 100644 src/libsystemd-network/fuzz-ndisc-rs.options create mode 100644 src/libsystemd-network/icmp6-util-unix.c create mode 100644 src/libsystemd-network/icmp6-util-unix.h create mode 100644 src/libsystemd-network/icmp6-util.c create mode 100644 src/libsystemd-network/icmp6-util.h create mode 100644 
src/libsystemd-network/lldp-neighbor.c create mode 100644 src/libsystemd-network/lldp-neighbor.h create mode 100644 src/libsystemd-network/lldp-network.c create mode 100644 src/libsystemd-network/lldp-network.h create mode 100644 src/libsystemd-network/lldp-rx-internal.h create mode 100644 src/libsystemd-network/meson.build create mode 100644 src/libsystemd-network/ndisc-internal.h create mode 100644 src/libsystemd-network/ndisc-protocol.c create mode 100644 src/libsystemd-network/ndisc-protocol.h create mode 100644 src/libsystemd-network/ndisc-router.c create mode 100644 src/libsystemd-network/ndisc-router.h create mode 100644 src/libsystemd-network/network-common.c create mode 100644 src/libsystemd-network/network-common.h create mode 100644 src/libsystemd-network/network-internal.c create mode 100644 src/libsystemd-network/network-internal.h create mode 100644 src/libsystemd-network/radv-internal.h create mode 100644 src/libsystemd-network/sd-dhcp-client.c create mode 100644 src/libsystemd-network/sd-dhcp-lease.c create mode 100644 src/libsystemd-network/sd-dhcp-server.c create mode 100644 src/libsystemd-network/sd-dhcp6-client.c create mode 100644 src/libsystemd-network/sd-dhcp6-lease.c create mode 100644 src/libsystemd-network/sd-ipv4acd.c create mode 100644 src/libsystemd-network/sd-ipv4ll.c create mode 100644 src/libsystemd-network/sd-lldp-rx.c create mode 100644 src/libsystemd-network/sd-lldp-tx.c create mode 100644 src/libsystemd-network/sd-ndisc.c create mode 100644 src/libsystemd-network/sd-radv.c create mode 100644 src/libsystemd-network/test-acd.c create mode 100644 src/libsystemd-network/test-dhcp-client.c create mode 100644 src/libsystemd-network/test-dhcp-option.c create mode 100644 src/libsystemd-network/test-dhcp-server.c create mode 100644 src/libsystemd-network/test-dhcp6-client.c create mode 100644 src/libsystemd-network/test-ipv4ll-manual.c create mode 100644 src/libsystemd-network/test-ipv4ll.c create mode 100644 
src/libsystemd-network/test-lldp-rx.c create mode 100644 src/libsystemd-network/test-ndisc-ra.c create mode 100644 src/libsystemd-network/test-ndisc-rs.c create mode 100644 src/libsystemd-network/test-sd-dhcp-lease.c create mode 100644 src/libsystemd/libsystemd.pc.in create mode 100644 src/libsystemd/libsystemd.sym create mode 100644 src/libsystemd/meson.build create mode 100644 src/libsystemd/sd-bus/bus-common-errors.c create mode 100644 src/libsystemd/sd-bus/bus-common-errors.h create mode 100644 src/libsystemd/sd-bus/bus-container.c create mode 100644 src/libsystemd/sd-bus/bus-container.h create mode 100644 src/libsystemd/sd-bus/bus-control.c create mode 100644 src/libsystemd/sd-bus/bus-control.h create mode 100644 src/libsystemd/sd-bus/bus-convenience.c create mode 100644 src/libsystemd/sd-bus/bus-creds.c create mode 100644 src/libsystemd/sd-bus/bus-creds.h create mode 100644 src/libsystemd/sd-bus/bus-dump.c create mode 100644 src/libsystemd/sd-bus/bus-dump.h create mode 100644 src/libsystemd/sd-bus/bus-error.c create mode 100644 src/libsystemd/sd-bus/bus-error.h create mode 100644 src/libsystemd/sd-bus/bus-internal.c create mode 100644 src/libsystemd/sd-bus/bus-internal.h create mode 100644 src/libsystemd/sd-bus/bus-introspect.c create mode 100644 src/libsystemd/sd-bus/bus-introspect.h create mode 100644 src/libsystemd/sd-bus/bus-kernel.c create mode 100644 src/libsystemd/sd-bus/bus-kernel.h create mode 100644 src/libsystemd/sd-bus/bus-match.c create mode 100644 src/libsystemd/sd-bus/bus-match.h create mode 100644 src/libsystemd/sd-bus/bus-message.c create mode 100644 src/libsystemd/sd-bus/bus-message.h create mode 100644 src/libsystemd/sd-bus/bus-objects.c create mode 100644 src/libsystemd/sd-bus/bus-objects.h create mode 100644 src/libsystemd/sd-bus/bus-protocol.h create mode 100644 src/libsystemd/sd-bus/bus-signature.c create mode 100644 src/libsystemd/sd-bus/bus-signature.h create mode 100644 src/libsystemd/sd-bus/bus-slot.c create mode 100644 
src/libsystemd/sd-bus/bus-slot.h create mode 100644 src/libsystemd/sd-bus/bus-socket.c create mode 100644 src/libsystemd/sd-bus/bus-socket.h create mode 100644 src/libsystemd/sd-bus/bus-track.c create mode 100644 src/libsystemd/sd-bus/bus-track.h create mode 100644 src/libsystemd/sd-bus/bus-type.c create mode 100644 src/libsystemd/sd-bus/bus-type.h create mode 100644 src/libsystemd/sd-bus/fuzz-bus-match.c create mode 100644 src/libsystemd/sd-bus/fuzz-bus-match.options create mode 100644 src/libsystemd/sd-bus/fuzz-bus-message.c create mode 100644 src/libsystemd/sd-bus/sd-bus.c create mode 100644 src/libsystemd/sd-bus/test-bus-address.c create mode 100644 src/libsystemd/sd-bus/test-bus-benchmark.c create mode 100644 src/libsystemd/sd-bus/test-bus-chat.c create mode 100644 src/libsystemd/sd-bus/test-bus-cleanup.c create mode 100644 src/libsystemd/sd-bus/test-bus-creds.c create mode 100644 src/libsystemd/sd-bus/test-bus-error.c create mode 100644 src/libsystemd/sd-bus/test-bus-introspect.c create mode 100644 src/libsystemd/sd-bus/test-bus-marshal.c create mode 100644 src/libsystemd/sd-bus/test-bus-match.c create mode 100644 src/libsystemd/sd-bus/test-bus-objects.c create mode 100644 src/libsystemd/sd-bus/test-bus-peersockaddr.c create mode 100644 src/libsystemd/sd-bus/test-bus-queue-ref-cycle.c create mode 100644 src/libsystemd/sd-bus/test-bus-server.c create mode 100644 src/libsystemd/sd-bus/test-bus-signature.c create mode 100644 src/libsystemd/sd-bus/test-bus-track.c create mode 120000 src/libsystemd/sd-bus/test-bus-vtable-cc.cc create mode 100644 src/libsystemd/sd-bus/test-bus-vtable.c create mode 100644 src/libsystemd/sd-bus/test-bus-watch-bind.c create mode 100644 src/libsystemd/sd-bus/test-vtable-data.h create mode 100644 src/libsystemd/sd-daemon/sd-daemon.c create mode 100644 src/libsystemd/sd-device/device-enumerator-private.h create mode 100644 src/libsystemd/sd-device/device-enumerator.c create mode 100644 src/libsystemd/sd-device/device-filter.c create mode 
100644 src/libsystemd/sd-device/device-filter.h create mode 100644 src/libsystemd/sd-device/device-internal.h create mode 100644 src/libsystemd/sd-device/device-monitor-private.h create mode 100644 src/libsystemd/sd-device/device-monitor.c create mode 100644 src/libsystemd/sd-device/device-private.c create mode 100644 src/libsystemd/sd-device/device-private.h create mode 100644 src/libsystemd/sd-device/device-util.c create mode 100644 src/libsystemd/sd-device/device-util.h create mode 100644 src/libsystemd/sd-device/sd-device.c create mode 100644 src/libsystemd/sd-device/test-device-util.c create mode 100644 src/libsystemd/sd-device/test-sd-device-monitor.c create mode 100644 src/libsystemd/sd-device/test-sd-device-thread.c create mode 100644 src/libsystemd/sd-device/test-sd-device.c create mode 100644 src/libsystemd/sd-event/event-source.h create mode 100644 src/libsystemd/sd-event/event-util.c create mode 100644 src/libsystemd/sd-event/event-util.h create mode 100644 src/libsystemd/sd-event/sd-event.c create mode 100644 src/libsystemd/sd-event/test-event.c create mode 100644 src/libsystemd/sd-hwdb/hwdb-internal.h create mode 100644 src/libsystemd/sd-hwdb/sd-hwdb.c create mode 100644 src/libsystemd/sd-id128/id128-util.c create mode 100644 src/libsystemd/sd-id128/id128-util.h create mode 100644 src/libsystemd/sd-id128/sd-id128.c create mode 100644 src/libsystemd/sd-journal/audit-type.c create mode 100644 src/libsystemd/sd-journal/audit-type.h create mode 100644 src/libsystemd/sd-journal/audit_type-to-name.awk create mode 100644 src/libsystemd/sd-journal/catalog.c create mode 100644 src/libsystemd/sd-journal/catalog.h create mode 100644 src/libsystemd/sd-journal/fsprg.c create mode 100644 src/libsystemd/sd-journal/fsprg.h create mode 100755 src/libsystemd/sd-journal/generate-audit_type-list.sh create mode 100644 src/libsystemd/sd-journal/journal-authenticate.c create mode 100644 src/libsystemd/sd-journal/journal-authenticate.h create mode 100644 
src/libsystemd/sd-journal/journal-def.h create mode 100644 src/libsystemd/sd-journal/journal-file.c create mode 100644 src/libsystemd/sd-journal/journal-file.h create mode 100644 src/libsystemd/sd-journal/journal-internal.h create mode 100644 src/libsystemd/sd-journal/journal-send.c create mode 100644 src/libsystemd/sd-journal/journal-send.h create mode 100644 src/libsystemd/sd-journal/journal-vacuum.c create mode 100644 src/libsystemd/sd-journal/journal-vacuum.h create mode 100644 src/libsystemd/sd-journal/journal-verify.c create mode 100644 src/libsystemd/sd-journal/journal-verify.h create mode 100644 src/libsystemd/sd-journal/lookup3.c create mode 100644 src/libsystemd/sd-journal/lookup3.h create mode 100644 src/libsystemd/sd-journal/mmap-cache.c create mode 100644 src/libsystemd/sd-journal/mmap-cache.h create mode 100644 src/libsystemd/sd-journal/sd-journal.c create mode 100644 src/libsystemd/sd-journal/test-audit-type.c create mode 100644 src/libsystemd/sd-journal/test-catalog.c create mode 100644 src/libsystemd/sd-journal/test-journal-append.c create mode 100644 src/libsystemd/sd-journal/test-journal-enum.c create mode 100644 src/libsystemd/sd-journal/test-journal-file.c create mode 100644 src/libsystemd/sd-journal/test-journal-flush.c create mode 100644 src/libsystemd/sd-journal/test-journal-init.c create mode 100644 src/libsystemd/sd-journal/test-journal-interleaving.c create mode 100644 src/libsystemd/sd-journal/test-journal-match.c create mode 100644 src/libsystemd/sd-journal/test-journal-send.c create mode 100644 src/libsystemd/sd-journal/test-journal-stream.c create mode 100644 src/libsystemd/sd-journal/test-journal-verify.c create mode 100644 src/libsystemd/sd-journal/test-journal.c create mode 100644 src/libsystemd/sd-journal/test-mmap-cache.c create mode 100644 src/libsystemd/sd-login/sd-login.c create mode 100644 src/libsystemd/sd-login/test-login.c create mode 100644 src/libsystemd/sd-netlink/netlink-genl.c create mode 100644 
src/libsystemd/sd-netlink/netlink-genl.h create mode 100644 src/libsystemd/sd-netlink/netlink-internal.h create mode 100644 src/libsystemd/sd-netlink/netlink-message-nfnl.c create mode 100644 src/libsystemd/sd-netlink/netlink-message-rtnl.c create mode 100644 src/libsystemd/sd-netlink/netlink-message.c create mode 100644 src/libsystemd/sd-netlink/netlink-slot.c create mode 100644 src/libsystemd/sd-netlink/netlink-slot.h create mode 100644 src/libsystemd/sd-netlink/netlink-socket.c create mode 100644 src/libsystemd/sd-netlink/netlink-types-genl.c create mode 100644 src/libsystemd/sd-netlink/netlink-types-internal.h create mode 100644 src/libsystemd/sd-netlink/netlink-types-nfnl.c create mode 100644 src/libsystemd/sd-netlink/netlink-types-rtnl.c create mode 100644 src/libsystemd/sd-netlink/netlink-types.c create mode 100644 src/libsystemd/sd-netlink/netlink-types.h create mode 100644 src/libsystemd/sd-netlink/netlink-util.c create mode 100644 src/libsystemd/sd-netlink/netlink-util.h create mode 100644 src/libsystemd/sd-netlink/sd-netlink.c create mode 100644 src/libsystemd/sd-netlink/test-netlink.c create mode 100644 src/libsystemd/sd-network/network-util.c create mode 100644 src/libsystemd/sd-network/network-util.h create mode 100644 src/libsystemd/sd-network/sd-network.c create mode 100644 src/libsystemd/sd-path/sd-path.c create mode 100644 src/libsystemd/sd-resolve/resolve-private.h create mode 100644 src/libsystemd/sd-resolve/sd-resolve.c create mode 100644 src/libsystemd/sd-resolve/test-resolve.c create mode 100644 src/libudev/libudev-device-internal.h create mode 100644 src/libudev/libudev-device.c create mode 100644 src/libudev/libudev-enumerate.c create mode 100644 src/libudev/libudev-hwdb.c create mode 100644 src/libudev/libudev-list-internal.h create mode 100644 src/libudev/libudev-list.c create mode 100644 src/libudev/libudev-monitor.c create mode 100644 src/libudev/libudev-queue.c create mode 100644 src/libudev/libudev-util.c create mode 100644 
src/libudev/libudev-util.h create mode 100644 src/libudev/libudev.c create mode 100644 src/libudev/libudev.h create mode 100644 src/libudev/libudev.pc.in create mode 100644 src/libudev/libudev.sym create mode 100644 src/libudev/meson.build create mode 100644 src/libudev/test-libudev.c create mode 100644 src/libudev/test-udev-device-thread.c create mode 100644 src/locale/kbd-model-map create mode 100644 src/locale/language-fallback-map create mode 100644 src/locale/localectl.c create mode 100644 src/locale/localed-util.c create mode 100644 src/locale/localed-util.h create mode 100644 src/locale/localed.c create mode 100644 src/locale/meson.build create mode 100644 src/locale/org.freedesktop.locale1.conf create mode 100644 src/locale/org.freedesktop.locale1.policy create mode 100644 src/locale/org.freedesktop.locale1.service create mode 100644 src/locale/test-localed-util.c create mode 100644 src/locale/xkbcommon-util.c create mode 100644 src/locale/xkbcommon-util.h create mode 100644 src/login/inhibit.c create mode 100644 src/login/loginctl.c create mode 100644 src/login/logind-action.c create mode 100644 src/login/logind-action.h create mode 100644 src/login/logind-brightness.c create mode 100644 src/login/logind-brightness.h create mode 100644 src/login/logind-button.c create mode 100644 src/login/logind-button.h create mode 100644 src/login/logind-core.c create mode 100644 src/login/logind-dbus.c create mode 100644 src/login/logind-dbus.h create mode 100644 src/login/logind-device.c create mode 100644 src/login/logind-device.h create mode 100644 src/login/logind-gperf.gperf create mode 100644 src/login/logind-inhibit.c create mode 100644 src/login/logind-inhibit.h create mode 100644 src/login/logind-polkit.c create mode 100644 src/login/logind-polkit.h create mode 100644 src/login/logind-seat-dbus.c create mode 100644 src/login/logind-seat-dbus.h create mode 100644 src/login/logind-seat.c create mode 100644 src/login/logind-seat.h create mode 100644 
src/login/logind-session-dbus.c create mode 100644 src/login/logind-session-dbus.h create mode 100644 src/login/logind-session-device.c create mode 100644 src/login/logind-session-device.h create mode 100644 src/login/logind-session.c create mode 100644 src/login/logind-session.h create mode 100644 src/login/logind-user-dbus.c create mode 100644 src/login/logind-user-dbus.h create mode 100644 src/login/logind-user.c create mode 100644 src/login/logind-user.h create mode 100644 src/login/logind-wall.c create mode 100644 src/login/logind.c create mode 100644 src/login/logind.conf.in create mode 100644 src/login/logind.h create mode 100644 src/login/meson.build create mode 100644 src/login/org.freedesktop.login1.conf create mode 100644 src/login/org.freedesktop.login1.policy create mode 100644 src/login/org.freedesktop.login1.service create mode 100644 src/login/pam_systemd.c create mode 100644 src/login/pam_systemd.sym create mode 100644 src/login/pam_systemd_loadkey.c create mode 100644 src/login/pam_systemd_loadkey.sym create mode 100644 src/login/sysfs-show.c create mode 100644 src/login/sysfs-show.h create mode 100644 src/login/systemd-user.in create mode 100644 src/login/test-inhibit.c create mode 100644 src/login/test-login-shared.c create mode 100644 src/login/test-login-tables.c create mode 100644 src/login/test-session-properties.c create mode 100644 src/login/user-runtime-dir.c create mode 100644 src/machine-id-setup/machine-id-setup-main.c create mode 100644 src/machine-id-setup/meson.build create mode 100644 src/machine/image-dbus.c create mode 100644 src/machine/image-dbus.h create mode 100644 src/machine/machine-dbus.c create mode 100644 src/machine/machine-dbus.h create mode 100644 src/machine/machine.c create mode 100644 src/machine/machine.h create mode 100644 src/machine/machinectl.c create mode 100644 src/machine/machined-core.c create mode 100644 src/machine/machined-dbus.c create mode 100644 src/machine/machined-varlink.c create mode 100644 
src/machine/machined-varlink.h create mode 100644 src/machine/machined.c create mode 100644 src/machine/machined.h create mode 100644 src/machine/meson.build create mode 100644 src/machine/operation.c create mode 100644 src/machine/operation.h create mode 100644 src/machine/org.freedesktop.machine1.conf create mode 100644 src/machine/org.freedesktop.machine1.policy create mode 100644 src/machine/org.freedesktop.machine1.service create mode 100644 src/machine/test-machine-tables.c create mode 100644 src/modules-load/meson.build create mode 100644 src/modules-load/modules-load.c create mode 100644 src/mount/meson.build create mode 100644 src/mount/mount-tool.c create mode 100644 src/network/fuzz-netdev-parser.c create mode 100644 src/network/fuzz-netdev-parser.options create mode 100644 src/network/fuzz-network-parser.c create mode 100644 src/network/fuzz-network-parser.options create mode 100644 src/network/generator/main.c create mode 100644 src/network/generator/network-generator.c create mode 100644 src/network/generator/network-generator.h create mode 100644 src/network/generator/test-network-generator.c create mode 100644 src/network/meson.build create mode 100644 src/network/netdev/bareudp.c create mode 100644 src/network/netdev/bareudp.h create mode 100644 src/network/netdev/batadv.c create mode 100644 src/network/netdev/batadv.h create mode 100644 src/network/netdev/bond.c create mode 100644 src/network/netdev/bond.h create mode 100644 src/network/netdev/bridge.c create mode 100644 src/network/netdev/bridge.h create mode 100644 src/network/netdev/dummy.c create mode 100644 src/network/netdev/dummy.h create mode 100644 src/network/netdev/fou-tunnel.c create mode 100644 src/network/netdev/fou-tunnel.h create mode 100644 src/network/netdev/geneve.c create mode 100644 src/network/netdev/geneve.h create mode 100644 src/network/netdev/ifb.c create mode 100644 src/network/netdev/ifb.h create mode 100644 src/network/netdev/ipoib.c create mode 100644 
src/network/netdev/ipoib.h create mode 100644 src/network/netdev/ipvlan.c create mode 100644 src/network/netdev/ipvlan.h create mode 100644 src/network/netdev/l2tp-tunnel.c create mode 100644 src/network/netdev/l2tp-tunnel.h create mode 100644 src/network/netdev/macsec.c create mode 100644 src/network/netdev/macsec.h create mode 100644 src/network/netdev/macvlan.c create mode 100644 src/network/netdev/macvlan.h create mode 100644 src/network/netdev/netdev-gperf.gperf create mode 100644 src/network/netdev/netdev-util.c create mode 100644 src/network/netdev/netdev-util.h create mode 100644 src/network/netdev/netdev.c create mode 100644 src/network/netdev/netdev.h create mode 100644 src/network/netdev/netdevsim.c create mode 100644 src/network/netdev/netdevsim.h create mode 100644 src/network/netdev/nlmon.c create mode 100644 src/network/netdev/nlmon.h create mode 100644 src/network/netdev/tunnel.c create mode 100644 src/network/netdev/tunnel.h create mode 100644 src/network/netdev/tuntap.c create mode 100644 src/network/netdev/tuntap.h create mode 100644 src/network/netdev/vcan.c create mode 100644 src/network/netdev/vcan.h create mode 100644 src/network/netdev/veth.c create mode 100644 src/network/netdev/veth.h create mode 100644 src/network/netdev/vlan.c create mode 100644 src/network/netdev/vlan.h create mode 100644 src/network/netdev/vrf.c create mode 100644 src/network/netdev/vrf.h create mode 100644 src/network/netdev/vxcan.c create mode 100644 src/network/netdev/vxcan.h create mode 100644 src/network/netdev/vxlan.c create mode 100644 src/network/netdev/vxlan.h create mode 100644 src/network/netdev/wireguard.c create mode 100644 src/network/netdev/wireguard.h create mode 100644 src/network/netdev/wlan.c create mode 100644 src/network/netdev/wlan.h create mode 100644 src/network/netdev/xfrm.c create mode 100644 src/network/netdev/xfrm.h create mode 100644 src/network/networkctl.c create mode 100644 src/network/networkd-address-generation.c create mode 100644 
src/network/networkd-address-generation.h create mode 100644 src/network/networkd-address-label.c create mode 100644 src/network/networkd-address-label.h create mode 100644 src/network/networkd-address-pool.c create mode 100644 src/network/networkd-address-pool.h create mode 100644 src/network/networkd-address.c create mode 100644 src/network/networkd-address.h create mode 100644 src/network/networkd-bridge-fdb.c create mode 100644 src/network/networkd-bridge-fdb.h create mode 100644 src/network/networkd-bridge-mdb.c create mode 100644 src/network/networkd-bridge-mdb.h create mode 100644 src/network/networkd-bridge-vlan.c create mode 100644 src/network/networkd-bridge-vlan.h create mode 100644 src/network/networkd-can.c create mode 100644 src/network/networkd-can.h create mode 100644 src/network/networkd-conf.c create mode 100644 src/network/networkd-conf.h create mode 100644 src/network/networkd-dhcp-common.c create mode 100644 src/network/networkd-dhcp-common.h create mode 100644 src/network/networkd-dhcp-prefix-delegation.c create mode 100644 src/network/networkd-dhcp-prefix-delegation.h create mode 100644 src/network/networkd-dhcp-server-bus.c create mode 100644 src/network/networkd-dhcp-server-bus.h create mode 100644 src/network/networkd-dhcp-server-static-lease.c create mode 100644 src/network/networkd-dhcp-server-static-lease.h create mode 100644 src/network/networkd-dhcp-server.c create mode 100644 src/network/networkd-dhcp-server.h create mode 100644 src/network/networkd-dhcp4-bus.c create mode 100644 src/network/networkd-dhcp4-bus.h create mode 100644 src/network/networkd-dhcp4.c create mode 100644 src/network/networkd-dhcp4.h create mode 100644 src/network/networkd-dhcp6-bus.c create mode 100644 src/network/networkd-dhcp6-bus.h create mode 100644 src/network/networkd-dhcp6.c create mode 100644 src/network/networkd-dhcp6.h create mode 100644 src/network/networkd-gperf.gperf create mode 100644 src/network/networkd-ipv4acd.c create mode 100644 
src/network/networkd-ipv4acd.h create mode 100644 src/network/networkd-ipv4ll.c create mode 100644 src/network/networkd-ipv4ll.h create mode 100644 src/network/networkd-ipv6-proxy-ndp.c create mode 100644 src/network/networkd-ipv6-proxy-ndp.h create mode 100644 src/network/networkd-ipv6ll.c create mode 100644 src/network/networkd-ipv6ll.h create mode 100644 src/network/networkd-json.c create mode 100644 src/network/networkd-json.h create mode 100644 src/network/networkd-link-bus.c create mode 100644 src/network/networkd-link-bus.h create mode 100644 src/network/networkd-link.c create mode 100644 src/network/networkd-link.h create mode 100644 src/network/networkd-lldp-rx.c create mode 100644 src/network/networkd-lldp-rx.h create mode 100644 src/network/networkd-lldp-tx.c create mode 100644 src/network/networkd-lldp-tx.h create mode 100644 src/network/networkd-manager-bus.c create mode 100644 src/network/networkd-manager-bus.h create mode 100644 src/network/networkd-manager.c create mode 100644 src/network/networkd-manager.h create mode 100644 src/network/networkd-ndisc.c create mode 100644 src/network/networkd-ndisc.h create mode 100644 src/network/networkd-neighbor.c create mode 100644 src/network/networkd-neighbor.h create mode 100644 src/network/networkd-netlabel.c create mode 100644 src/network/networkd-netlabel.h create mode 100644 src/network/networkd-network-bus.c create mode 100644 src/network/networkd-network-bus.h create mode 100644 src/network/networkd-network-gperf.gperf create mode 100644 src/network/networkd-network.c create mode 100644 src/network/networkd-network.h create mode 100644 src/network/networkd-nexthop.c create mode 100644 src/network/networkd-nexthop.h create mode 100644 src/network/networkd-queue.c create mode 100644 src/network/networkd-queue.h create mode 100644 src/network/networkd-radv.c create mode 100644 src/network/networkd-radv.h create mode 100644 src/network/networkd-route-util.c create mode 100644 
src/network/networkd-route-util.h create mode 100644 src/network/networkd-route.c create mode 100644 src/network/networkd-route.h create mode 100644 src/network/networkd-routing-policy-rule.c create mode 100644 src/network/networkd-routing-policy-rule.h create mode 100644 src/network/networkd-setlink.c create mode 100644 src/network/networkd-setlink.h create mode 100644 src/network/networkd-speed-meter.c create mode 100644 src/network/networkd-speed-meter.h create mode 100644 src/network/networkd-sriov.c create mode 100644 src/network/networkd-sriov.h create mode 100644 src/network/networkd-state-file.c create mode 100644 src/network/networkd-state-file.h create mode 100644 src/network/networkd-sysctl.c create mode 100644 src/network/networkd-sysctl.h create mode 100644 src/network/networkd-util.c create mode 100644 src/network/networkd-util.h create mode 100644 src/network/networkd-wifi.c create mode 100644 src/network/networkd-wifi.h create mode 100644 src/network/networkd-wiphy.c create mode 100644 src/network/networkd-wiphy.h create mode 100644 src/network/networkd.c create mode 100644 src/network/networkd.conf create mode 100644 src/network/org.freedesktop.network1.conf create mode 100644 src/network/org.freedesktop.network1.policy create mode 100644 src/network/org.freedesktop.network1.service create mode 100644 src/network/systemd-networkd.pkla create mode 100644 src/network/systemd-networkd.rules create mode 100644 src/network/tc/cake.c create mode 100644 src/network/tc/cake.h create mode 100644 src/network/tc/codel.c create mode 100644 src/network/tc/codel.h create mode 100644 src/network/tc/drr.c create mode 100644 src/network/tc/drr.h create mode 100644 src/network/tc/ets.c create mode 100644 src/network/tc/ets.h create mode 100644 src/network/tc/fifo.c create mode 100644 src/network/tc/fifo.h create mode 100644 src/network/tc/fq-codel.c create mode 100644 src/network/tc/fq-codel.h create mode 100644 src/network/tc/fq-pie.c create mode 100644 
src/network/tc/fq-pie.h create mode 100644 src/network/tc/fq.c create mode 100644 src/network/tc/fq.h create mode 100644 src/network/tc/gred.c create mode 100644 src/network/tc/gred.h create mode 100644 src/network/tc/hhf.c create mode 100644 src/network/tc/hhf.h create mode 100644 src/network/tc/htb.c create mode 100644 src/network/tc/htb.h create mode 100644 src/network/tc/netem.c create mode 100644 src/network/tc/netem.h create mode 100644 src/network/tc/pie.c create mode 100644 src/network/tc/pie.h create mode 100644 src/network/tc/qdisc.c create mode 100644 src/network/tc/qdisc.h create mode 100644 src/network/tc/qfq.c create mode 100644 src/network/tc/qfq.h create mode 100644 src/network/tc/sfb.c create mode 100644 src/network/tc/sfb.h create mode 100644 src/network/tc/sfq.c create mode 100644 src/network/tc/sfq.h create mode 100644 src/network/tc/tbf.c create mode 100644 src/network/tc/tbf.h create mode 100644 src/network/tc/tc-util.c create mode 100644 src/network/tc/tc-util.h create mode 100644 src/network/tc/tc.c create mode 100644 src/network/tc/tc.h create mode 100644 src/network/tc/tclass.c create mode 100644 src/network/tc/tclass.h create mode 100644 src/network/tc/teql.c create mode 100644 src/network/tc/teql.h create mode 100644 src/network/test-network-tables.c create mode 100644 src/network/test-network.c create mode 100644 src/network/test-networkd-address.c create mode 100644 src/network/test-networkd-conf.c create mode 100644 src/network/test-networkd-util.c create mode 100644 src/network/wait-online/link.c create mode 100644 src/network/wait-online/link.h create mode 100644 src/network/wait-online/manager.c create mode 100644 src/network/wait-online/manager.h create mode 100644 src/network/wait-online/wait-online.c create mode 100644 src/notify/meson.build create mode 100644 src/notify/notify.c create mode 100644 src/nspawn/fuzz-nspawn-oci.c create mode 100644 src/nspawn/fuzz-nspawn-oci.options create mode 100644 
src/nspawn/fuzz-nspawn-settings.c create mode 100644 src/nspawn/fuzz-nspawn-settings.options create mode 100644 src/nspawn/meson.build create mode 100644 src/nspawn/nspawn-bind-user.c create mode 100644 src/nspawn/nspawn-bind-user.h create mode 100644 src/nspawn/nspawn-cgroup.c create mode 100644 src/nspawn/nspawn-cgroup.h create mode 100644 src/nspawn/nspawn-def.h create mode 100644 src/nspawn/nspawn-expose-ports.c create mode 100644 src/nspawn/nspawn-expose-ports.h create mode 100644 src/nspawn/nspawn-gperf.gperf create mode 100644 src/nspawn/nspawn-mount.c create mode 100644 src/nspawn/nspawn-mount.h create mode 100644 src/nspawn/nspawn-network.c create mode 100644 src/nspawn/nspawn-network.h create mode 100644 src/nspawn/nspawn-oci.c create mode 100644 src/nspawn/nspawn-oci.h create mode 100644 src/nspawn/nspawn-patch-uid.c create mode 100644 src/nspawn/nspawn-patch-uid.h create mode 100644 src/nspawn/nspawn-register.c create mode 100644 src/nspawn/nspawn-register.h create mode 100644 src/nspawn/nspawn-seccomp.c create mode 100644 src/nspawn/nspawn-seccomp.h create mode 100644 src/nspawn/nspawn-settings.c create mode 100644 src/nspawn/nspawn-settings.h create mode 100644 src/nspawn/nspawn-setuid.c create mode 100644 src/nspawn/nspawn-setuid.h create mode 100644 src/nspawn/nspawn-stub-pid1.c create mode 100644 src/nspawn/nspawn-stub-pid1.h create mode 100644 src/nspawn/nspawn-util.c create mode 100644 src/nspawn/nspawn-util.h create mode 100644 src/nspawn/nspawn.c create mode 100644 src/nspawn/nspawn.h create mode 100644 src/nspawn/test-nspawn-tables.c create mode 100644 src/nspawn/test-nspawn-util.c create mode 100644 src/nspawn/test-patch-uid.c create mode 100644 src/nss-myhostname/meson.build create mode 100644 src/nss-myhostname/nss-myhostname.c create mode 100644 src/nss-myhostname/nss-myhostname.sym create mode 100644 src/nss-mymachines/meson.build create mode 100644 src/nss-mymachines/nss-mymachines.c create mode 100644 
src/nss-mymachines/nss-mymachines.sym create mode 100644 src/nss-resolve/meson.build create mode 100644 src/nss-resolve/nss-resolve.c create mode 100644 src/nss-resolve/nss-resolve.sym create mode 100644 src/nss-systemd/meson.build create mode 100644 src/nss-systemd/nss-systemd.c create mode 100644 src/nss-systemd/nss-systemd.h create mode 100644 src/nss-systemd/nss-systemd.sym create mode 100644 src/nss-systemd/userdb-glue.c create mode 100644 src/nss-systemd/userdb-glue.h create mode 100644 src/oom/meson.build create mode 100644 src/oom/oomctl.c create mode 100644 src/oom/oomd-manager-bus.c create mode 100644 src/oom/oomd-manager-bus.h create mode 100644 src/oom/oomd-manager.c create mode 100644 src/oom/oomd-manager.h create mode 100644 src/oom/oomd-util.c create mode 100644 src/oom/oomd-util.h create mode 100644 src/oom/oomd.c create mode 100644 src/oom/oomd.conf create mode 100644 src/oom/org.freedesktop.oom1.conf create mode 100644 src/oom/org.freedesktop.oom1.service create mode 100644 src/oom/test-oomd-util.c create mode 100644 src/partition/definitions/confext.repart.d/10-root.conf create mode 100644 src/partition/definitions/confext.repart.d/20-root-verity.conf create mode 100644 src/partition/definitions/confext.repart.d/30-root-verity-sig.conf create mode 100644 src/partition/definitions/portable.repart.d/10-root.conf create mode 100644 src/partition/definitions/portable.repart.d/20-root-verity.conf create mode 100644 src/partition/definitions/portable.repart.d/30-root-verity-sig.conf create mode 100644 src/partition/definitions/sysext.repart.d/10-root.conf create mode 100644 src/partition/definitions/sysext.repart.d/20-root-verity.conf create mode 100644 src/partition/definitions/sysext.repart.d/30-root-verity-sig.conf create mode 100644 src/partition/growfs.c create mode 100644 src/partition/makefs.c create mode 100644 src/partition/meson.build create mode 100644 src/partition/repart.c create mode 100644 src/path/meson.build create mode 100644 
src/path/path.c create mode 100644 src/pcrextend/meson.build create mode 100644 src/pcrextend/pcrextend.c create mode 100644 src/pcrlock/meson.build create mode 100644 src/pcrlock/pcrlock-firmware.c create mode 100644 src/pcrlock/pcrlock-firmware.h create mode 100644 src/pcrlock/pcrlock.c create mode 100644 src/pcrlock/pcrlock.d/350-action-efi-application.pcrlock create mode 100644 src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/300-0x00000000.pcrlock create mode 100644 src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/600-0xffffffff.pcrlock create mode 100644 src/pcrlock/pcrlock.d/500-separator.pcrlock.d/300-0x00000000.pcrlock create mode 100644 src/pcrlock/pcrlock.d/500-separator.pcrlock.d/600-0xffffffff.pcrlock create mode 100644 src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/300-present.pcrlock create mode 100644 src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/600-absent.pcrlock create mode 100644 src/pcrlock/pcrlock.d/750-enter-initrd.pcrlock create mode 100644 src/pcrlock/pcrlock.d/800-leave-initrd.pcrlock create mode 100644 src/pcrlock/pcrlock.d/850-sysinit.pcrlock create mode 100644 src/pcrlock/pcrlock.d/900-ready.pcrlock create mode 100644 src/pcrlock/pcrlock.d/950-shutdown.pcrlock create mode 100644 src/pcrlock/pcrlock.d/990-final.pcrlock create mode 100644 src/pcrlock/pehash.c create mode 100644 src/pcrlock/pehash.h create mode 100644 src/portable/meson.build create mode 100644 src/portable/org.freedesktop.portable1.conf create mode 100644 src/portable/org.freedesktop.portable1.policy create mode 100644 src/portable/org.freedesktop.portable1.service create mode 100644 src/portable/portable.c create mode 100644 src/portable/portable.h create mode 100644 src/portable/portablectl.c create mode 100644 src/portable/portabled-bus.c create mode 100644 src/portable/portabled-bus.h create mode 100644 src/portable/portabled-image-bus.c create mode 100644 src/portable/portabled-image-bus.h create mode 100644 
src/portable/portabled-image.c create mode 100644 src/portable/portabled-image.h create mode 100644 src/portable/portabled-operation.c create mode 100644 src/portable/portabled-operation.h create mode 100644 src/portable/portabled.c create mode 100644 src/portable/portabled.h create mode 100644 src/portable/profile/default/service.conf create mode 100644 src/portable/profile/nonetwork/service.conf create mode 100644 src/portable/profile/strict/service.conf create mode 100644 src/portable/profile/trusted/service.conf create mode 100644 src/pstore/meson.build create mode 100644 src/pstore/pstore.c create mode 100644 src/pstore/pstore.conf create mode 100644 src/quotacheck/meson.build create mode 100644 src/quotacheck/quotacheck.c create mode 100644 src/random-seed/meson.build create mode 100644 src/random-seed/random-seed.c create mode 100644 src/rc-local-generator/meson.build create mode 100644 src/rc-local-generator/rc-local-generator.c create mode 100644 src/remount-fs/meson.build create mode 100644 src/remount-fs/remount-fs.c create mode 100644 src/reply-password/meson.build create mode 100644 src/reply-password/reply-password.c create mode 100644 src/resolve/RFCs create mode 100644 src/resolve/dns-type.c create mode 100644 src/resolve/dns-type.h create mode 100644 src/resolve/dns_type-to-name.awk create mode 100644 src/resolve/fuzz-dns-packet.c create mode 100644 src/resolve/fuzz-dns-packet.options create mode 100644 src/resolve/fuzz-etc-hosts.c create mode 100644 src/resolve/fuzz-resource-record.c create mode 100755 src/resolve/generate-dns_type-gperf.py create mode 100644 src/resolve/generate-dns_type-list.sed create mode 100644 src/resolve/meson.build create mode 100644 src/resolve/org.freedesktop.resolve1.conf create mode 100644 src/resolve/org.freedesktop.resolve1.policy create mode 100644 src/resolve/org.freedesktop.resolve1.service create mode 100644 src/resolve/resolv.conf create mode 100644 src/resolve/resolvconf-compat.c create mode 100644 
src/resolve/resolvconf-compat.h create mode 100644 src/resolve/resolvectl.c create mode 100644 src/resolve/resolvectl.h create mode 100644 src/resolve/resolved-bus.c create mode 100644 src/resolve/resolved-bus.h create mode 100644 src/resolve/resolved-conf.c create mode 100644 src/resolve/resolved-conf.h create mode 100644 src/resolve/resolved-def.h create mode 100644 src/resolve/resolved-dns-answer.c create mode 100644 src/resolve/resolved-dns-answer.h create mode 100644 src/resolve/resolved-dns-cache.c create mode 100644 src/resolve/resolved-dns-cache.h create mode 100644 src/resolve/resolved-dns-dnssec.c create mode 100644 src/resolve/resolved-dns-dnssec.h create mode 100644 src/resolve/resolved-dns-packet.c create mode 100644 src/resolve/resolved-dns-packet.h create mode 100644 src/resolve/resolved-dns-query.c create mode 100644 src/resolve/resolved-dns-query.h create mode 100644 src/resolve/resolved-dns-question.c create mode 100644 src/resolve/resolved-dns-question.h create mode 100644 src/resolve/resolved-dns-rr.c create mode 100644 src/resolve/resolved-dns-rr.h create mode 100644 src/resolve/resolved-dns-scope.c create mode 100644 src/resolve/resolved-dns-scope.h create mode 100644 src/resolve/resolved-dns-search-domain.c create mode 100644 src/resolve/resolved-dns-search-domain.h create mode 100644 src/resolve/resolved-dns-server.c create mode 100644 src/resolve/resolved-dns-server.h create mode 100644 src/resolve/resolved-dns-stream.c create mode 100644 src/resolve/resolved-dns-stream.h create mode 100644 src/resolve/resolved-dns-stub.c create mode 100644 src/resolve/resolved-dns-stub.h create mode 100644 src/resolve/resolved-dns-synthesize.c create mode 100644 src/resolve/resolved-dns-synthesize.h create mode 100644 src/resolve/resolved-dns-transaction.c create mode 100644 src/resolve/resolved-dns-transaction.h create mode 100644 src/resolve/resolved-dns-trust-anchor.c create mode 100644 src/resolve/resolved-dns-trust-anchor.h create mode 100644 
src/resolve/resolved-dns-zone.c create mode 100644 src/resolve/resolved-dns-zone.h create mode 100644 src/resolve/resolved-dnssd-bus.c create mode 100644 src/resolve/resolved-dnssd-bus.h create mode 100644 src/resolve/resolved-dnssd-gperf.gperf create mode 100644 src/resolve/resolved-dnssd.c create mode 100644 src/resolve/resolved-dnssd.h create mode 100644 src/resolve/resolved-dnstls-gnutls.c create mode 100644 src/resolve/resolved-dnstls-gnutls.h create mode 100644 src/resolve/resolved-dnstls-openssl.c create mode 100644 src/resolve/resolved-dnstls-openssl.h create mode 100644 src/resolve/resolved-dnstls.h create mode 100644 src/resolve/resolved-etc-hosts.c create mode 100644 src/resolve/resolved-etc-hosts.h create mode 100644 src/resolve/resolved-gperf.gperf create mode 100644 src/resolve/resolved-link-bus.c create mode 100644 src/resolve/resolved-link-bus.h create mode 100644 src/resolve/resolved-link.c create mode 100644 src/resolve/resolved-link.h create mode 100644 src/resolve/resolved-llmnr.c create mode 100644 src/resolve/resolved-llmnr.h create mode 100644 src/resolve/resolved-manager.c create mode 100644 src/resolve/resolved-manager.h create mode 100644 src/resolve/resolved-mdns.c create mode 100644 src/resolve/resolved-mdns.h create mode 100644 src/resolve/resolved-resolv-conf.c create mode 100644 src/resolve/resolved-resolv-conf.h create mode 100644 src/resolve/resolved-socket-graveyard.c create mode 100644 src/resolve/resolved-socket-graveyard.h create mode 100644 src/resolve/resolved-util.c create mode 100644 src/resolve/resolved-util.h create mode 100644 src/resolve/resolved-varlink.c create mode 100644 src/resolve/resolved-varlink.h create mode 100644 src/resolve/resolved.c create mode 100644 src/resolve/resolved.conf.in create mode 100644 src/resolve/test-dns-packet.c create mode 100644 src/resolve/test-dnssec-complex.c create mode 100644 src/resolve/test-dnssec.c create mode 100644 src/resolve/test-resolve-tables.c create mode 100644 
src/resolve/test-resolved-etc-hosts.c create mode 100644 src/resolve/test-resolved-packet.c create mode 100644 src/resolve/test-resolved-stream.c create mode 100644 src/rfkill/meson.build create mode 100644 src/rfkill/rfkill.c create mode 100644 src/rpm/macros.systemd.in create mode 100644 src/rpm/meson.build create mode 100755 src/rpm/systemd-update-helper.in create mode 100644 src/rpm/triggers.systemd.in create mode 100644 src/rpm/triggers.systemd.sh.in create mode 100644 src/run-generator/meson.build create mode 100644 src/run-generator/run-generator.c create mode 100644 src/run/meson.build create mode 100644 src/run/run.c create mode 100644 src/shared/acl-util.c create mode 100644 src/shared/acl-util.h create mode 100644 src/shared/acpi-fpdt.c create mode 100644 src/shared/acpi-fpdt.h create mode 100644 src/shared/apparmor-util.c create mode 100644 src/shared/apparmor-util.h create mode 100644 src/shared/ask-password-api.c create mode 100644 src/shared/ask-password-api.h create mode 100644 src/shared/async.c create mode 100644 src/shared/async.h create mode 100644 src/shared/barrier.c create mode 100644 src/shared/barrier.h create mode 100644 src/shared/base-filesystem.c create mode 100644 src/shared/base-filesystem.h create mode 100644 src/shared/battery-util.c create mode 100644 src/shared/battery-util.h create mode 100644 src/shared/binfmt-util.c create mode 100644 src/shared/binfmt-util.h create mode 100644 src/shared/bitmap.c create mode 100644 src/shared/bitmap.h create mode 100644 src/shared/blkid-util.h create mode 100644 src/shared/blockdev-util.c create mode 100644 src/shared/blockdev-util.h create mode 100644 src/shared/bond-util.c create mode 100644 src/shared/bond-util.h create mode 100644 src/shared/boot-entry.c create mode 100644 src/shared/boot-entry.h create mode 100644 src/shared/boot-timestamps.c create mode 100644 src/shared/boot-timestamps.h create mode 100644 src/shared/bootspec.c create mode 100644 src/shared/bootspec.h create mode 100644 
src/shared/bpf-compat.h create mode 100644 src/shared/bpf-dlopen.c create mode 100644 src/shared/bpf-dlopen.h create mode 100644 src/shared/bpf-link.c create mode 100644 src/shared/bpf-link.h create mode 100644 src/shared/bpf-program.c create mode 100644 src/shared/bpf-program.h create mode 100644 src/shared/bridge-util.c create mode 100644 src/shared/bridge-util.h create mode 100644 src/shared/btrfs-util.c create mode 100644 src/shared/btrfs-util.h create mode 100644 src/shared/bus-get-properties.c create mode 100644 src/shared/bus-get-properties.h create mode 100644 src/shared/bus-locator.c create mode 100644 src/shared/bus-locator.h create mode 100644 src/shared/bus-log-control-api.c create mode 100644 src/shared/bus-log-control-api.h create mode 100644 src/shared/bus-map-properties.c create mode 100644 src/shared/bus-map-properties.h create mode 100644 src/shared/bus-message-util.c create mode 100644 src/shared/bus-message-util.h create mode 100644 src/shared/bus-object.c create mode 100644 src/shared/bus-object.h create mode 100644 src/shared/bus-polkit.c create mode 100644 src/shared/bus-polkit.h create mode 100644 src/shared/bus-print-properties.c create mode 100644 src/shared/bus-print-properties.h create mode 100644 src/shared/bus-unit-procs.c create mode 100644 src/shared/bus-unit-procs.h create mode 100644 src/shared/bus-unit-util.c create mode 100644 src/shared/bus-unit-util.h create mode 100644 src/shared/bus-util.c create mode 100644 src/shared/bus-util.h create mode 100644 src/shared/bus-wait-for-jobs.c create mode 100644 src/shared/bus-wait-for-jobs.h create mode 100644 src/shared/bus-wait-for-units.c create mode 100644 src/shared/bus-wait-for-units.h create mode 100644 src/shared/calendarspec.c create mode 100644 src/shared/calendarspec.h create mode 100644 src/shared/cgroup-setup.c create mode 100644 src/shared/cgroup-setup.h create mode 100644 src/shared/cgroup-show.c create mode 100644 src/shared/cgroup-show.h create mode 100644 
src/shared/chown-recursive.c create mode 100644 src/shared/chown-recursive.h create mode 100644 src/shared/clean-ipc.c create mode 100644 src/shared/clean-ipc.h create mode 100644 src/shared/clock-util.c create mode 100644 src/shared/clock-util.h create mode 100644 src/shared/common-signal.c create mode 100644 src/shared/common-signal.h create mode 100644 src/shared/compare-operator.c create mode 100644 src/shared/compare-operator.h create mode 100644 src/shared/condition.c create mode 100644 src/shared/condition.h create mode 100644 src/shared/conf-parser.c create mode 100644 src/shared/conf-parser.h create mode 100644 src/shared/copy.c create mode 100644 src/shared/copy.h create mode 100644 src/shared/coredump-util.c create mode 100644 src/shared/coredump-util.h create mode 100644 src/shared/cpu-set-util.c create mode 100644 src/shared/cpu-set-util.h create mode 100644 src/shared/creds-util.c create mode 100644 src/shared/creds-util.h create mode 100644 src/shared/cryptsetup-fido2.c create mode 100644 src/shared/cryptsetup-fido2.h create mode 100644 src/shared/cryptsetup-util.c create mode 100644 src/shared/cryptsetup-util.h create mode 100644 src/shared/daemon-util.c create mode 100644 src/shared/daemon-util.h create mode 100644 src/shared/data-fd-util.c create mode 100644 src/shared/data-fd-util.h create mode 100644 src/shared/dev-setup.c create mode 100644 src/shared/dev-setup.h create mode 100644 src/shared/device-nodes.c create mode 100644 src/shared/device-nodes.h create mode 100644 src/shared/devnode-acl.c create mode 100644 src/shared/devnode-acl.h create mode 100644 src/shared/discover-image.c create mode 100644 src/shared/discover-image.h create mode 100644 src/shared/dissect-image.c create mode 100644 src/shared/dissect-image.h create mode 100644 src/shared/dlfcn-util.c create mode 100644 src/shared/dlfcn-util.h create mode 100644 src/shared/dm-util.c create mode 100644 src/shared/dm-util.h create mode 100644 src/shared/dns-domain.c create mode 100644 
src/shared/dns-domain.h create mode 100644 src/shared/dropin.c create mode 100644 src/shared/dropin.h create mode 100644 src/shared/edit-util.c create mode 100644 src/shared/edit-util.h create mode 100644 src/shared/efi-api.c create mode 100644 src/shared/efi-api.h create mode 100644 src/shared/efi-loader.c create mode 100644 src/shared/efi-loader.h create mode 100644 src/shared/elf-util.c create mode 100644 src/shared/elf-util.h create mode 100644 src/shared/enable-mempool.c create mode 100644 src/shared/env-file-label.c create mode 100644 src/shared/env-file-label.h create mode 100644 src/shared/ethtool-link-mode.py create mode 100644 src/shared/ethtool-util.c create mode 100644 src/shared/ethtool-util.h create mode 100644 src/shared/exec-util.c create mode 100644 src/shared/exec-util.h create mode 100644 src/shared/exit-status.c create mode 100644 src/shared/exit-status.h create mode 100644 src/shared/extension-util.c create mode 100644 src/shared/extension-util.h create mode 100644 src/shared/fdisk-util.c create mode 100644 src/shared/fdisk-util.h create mode 100644 src/shared/fdset.c create mode 100644 src/shared/fdset.h create mode 100644 src/shared/fileio-label.c create mode 100644 src/shared/fileio-label.h create mode 100644 src/shared/find-esp.c create mode 100644 src/shared/find-esp.h create mode 100644 src/shared/firewall-util-iptables.c create mode 100644 src/shared/firewall-util-nft.c create mode 100644 src/shared/firewall-util-private.h create mode 100644 src/shared/firewall-util.c create mode 100644 src/shared/firewall-util.h create mode 100644 src/shared/format-table.c create mode 100644 src/shared/format-table.h create mode 100644 src/shared/fsck-util.h create mode 100644 src/shared/fstab-util.c create mode 100644 src/shared/fstab-util.h create mode 100755 src/shared/generate-ip-protocol-list.sh create mode 100755 src/shared/generate-syscall-list.py create mode 100644 src/shared/generator.c create mode 100644 src/shared/generator.h create mode 
100644 src/shared/geneve-util.c create mode 100644 src/shared/geneve-util.h create mode 100644 src/shared/gpt.c create mode 100644 src/shared/gpt.h create mode 100644 src/shared/group-record.c create mode 100644 src/shared/group-record.h create mode 100644 src/shared/hibernate-util.c create mode 100644 src/shared/hibernate-util.h create mode 100644 src/shared/hostname-setup.c create mode 100644 src/shared/hostname-setup.h create mode 100644 src/shared/hwdb-util.c create mode 100644 src/shared/hwdb-util.h create mode 100644 src/shared/id128-print.c create mode 100644 src/shared/id128-print.h create mode 100644 src/shared/idn-util.c create mode 100644 src/shared/idn-util.h create mode 100644 src/shared/ima-util.c create mode 100644 src/shared/ima-util.h create mode 100644 src/shared/image-policy.c create mode 100644 src/shared/image-policy.h create mode 100644 src/shared/import-util.c create mode 100644 src/shared/import-util.h create mode 100644 src/shared/in-addr-prefix-util.c create mode 100644 src/shared/in-addr-prefix-util.h create mode 100644 src/shared/initreq.h create mode 100644 src/shared/install-file.c create mode 100644 src/shared/install-file.h create mode 100644 src/shared/install-printf.c create mode 100644 src/shared/install-printf.h create mode 100644 src/shared/install.c create mode 100644 src/shared/install.h create mode 100644 src/shared/ip-protocol-list.c create mode 100644 src/shared/ip-protocol-list.h create mode 100644 src/shared/ip-protocol-to-name.awk create mode 100644 src/shared/ipvlan-util.c create mode 100644 src/shared/ipvlan-util.h create mode 100644 src/shared/journal-file-util.c create mode 100644 src/shared/journal-file-util.h create mode 100644 src/shared/journal-importer.c create mode 100644 src/shared/journal-importer.h create mode 100644 src/shared/journal-util.c create mode 100644 src/shared/journal-util.h create mode 100644 src/shared/json-internal.h create mode 100644 src/shared/json.c create mode 100644 src/shared/json.h 
create mode 100644 src/shared/kbd-util.c create mode 100644 src/shared/kbd-util.h create mode 100644 src/shared/kernel-image.c create mode 100644 src/shared/kernel-image.h create mode 100644 src/shared/keyring-util.c create mode 100644 src/shared/keyring-util.h create mode 100644 src/shared/killall.c create mode 100644 src/shared/killall.h create mode 100644 src/shared/label-util.c create mode 100644 src/shared/label-util.h create mode 100644 src/shared/libcrypt-util.c create mode 100644 src/shared/libcrypt-util.h create mode 100644 src/shared/libfido2-util.c create mode 100644 src/shared/libfido2-util.h create mode 100644 src/shared/libmount-util.c create mode 100644 src/shared/libmount-util.h create mode 100644 src/shared/libshared.sym create mode 100644 src/shared/linux/README create mode 100644 src/shared/linux/auto_dev-ioctl.h create mode 100644 src/shared/linux/bpf.h create mode 100644 src/shared/linux/bpf_common.h create mode 100644 src/shared/linux/bpf_insn.h create mode 100644 src/shared/linux/dm-ioctl.h create mode 100644 src/shared/linux/ethtool.h create mode 100644 src/shared/local-addresses.c create mode 100644 src/shared/local-addresses.h create mode 100644 src/shared/locale-setup.c create mode 100644 src/shared/locale-setup.h create mode 100644 src/shared/log-link.h create mode 100644 src/shared/logs-show.c create mode 100644 src/shared/logs-show.h create mode 100644 src/shared/loop-util.c create mode 100644 src/shared/loop-util.h create mode 100644 src/shared/loopback-setup.c create mode 100644 src/shared/loopback-setup.h create mode 100644 src/shared/lsm-util.c create mode 100644 src/shared/lsm-util.h create mode 100644 src/shared/machine-credential.c create mode 100644 src/shared/machine-credential.h create mode 100644 src/shared/machine-id-setup.c create mode 100644 src/shared/machine-id-setup.h create mode 100644 src/shared/machine-pool.c create mode 100644 src/shared/machine-pool.h create mode 100644 src/shared/macvlan-util.c create mode 100644 
src/shared/macvlan-util.h create mode 100644 src/shared/main-func.h create mode 100644 src/shared/meson.build create mode 100644 src/shared/mkdir-label.c create mode 100644 src/shared/mkdir-label.h create mode 100644 src/shared/mkfs-util.c create mode 100644 src/shared/mkfs-util.h create mode 100644 src/shared/module-util.c create mode 100644 src/shared/module-util.h create mode 100644 src/shared/mount-setup.c create mode 100644 src/shared/mount-setup.h create mode 100644 src/shared/mount-util.c create mode 100644 src/shared/mount-util.h create mode 100644 src/shared/net-condition.c create mode 100644 src/shared/net-condition.h create mode 100644 src/shared/netif-naming-scheme.c create mode 100644 src/shared/netif-naming-scheme.h create mode 100644 src/shared/netif-sriov.c create mode 100644 src/shared/netif-sriov.h create mode 100644 src/shared/netif-util.c create mode 100644 src/shared/netif-util.h create mode 100644 src/shared/nscd-flush.c create mode 100644 src/shared/nscd-flush.h create mode 100644 src/shared/nsflags.c create mode 100644 src/shared/nsflags.h create mode 100644 src/shared/numa-util.c create mode 100644 src/shared/numa-util.h create mode 100644 src/shared/open-file.c create mode 100644 src/shared/open-file.h create mode 100644 src/shared/openssl-util.c create mode 100644 src/shared/openssl-util.h create mode 100644 src/shared/output-mode.c create mode 100644 src/shared/output-mode.h create mode 100644 src/shared/pager.c create mode 100644 src/shared/pager.h create mode 100644 src/shared/pam-util.c create mode 100644 src/shared/pam-util.h create mode 100644 src/shared/parse-argument.c create mode 100644 src/shared/parse-argument.h create mode 100644 src/shared/parse-helpers.c create mode 100644 src/shared/parse-helpers.h create mode 100644 src/shared/password-quality-util-passwdqc.c create mode 100644 src/shared/password-quality-util-passwdqc.h create mode 100644 src/shared/password-quality-util-pwquality.c create mode 100644 
src/shared/password-quality-util-pwquality.h create mode 100644 src/shared/password-quality-util.h create mode 100644 src/shared/pcre2-util.c create mode 100644 src/shared/pcre2-util.h create mode 100644 src/shared/pcrextend-util.c create mode 100644 src/shared/pcrextend-util.h create mode 100644 src/shared/pe-binary.c create mode 100644 src/shared/pe-binary.h create mode 100644 src/shared/pkcs11-util.c create mode 100644 src/shared/pkcs11-util.h create mode 100644 src/shared/plymouth-util.c create mode 100644 src/shared/plymouth-util.h create mode 100644 src/shared/pretty-print.c create mode 100644 src/shared/pretty-print.h create mode 100644 src/shared/ptyfwd.c create mode 100644 src/shared/ptyfwd.h create mode 100644 src/shared/qrcode-util.c create mode 100644 src/shared/qrcode-util.h create mode 100644 src/shared/quota-util.c create mode 100644 src/shared/quota-util.h create mode 100644 src/shared/reboot-util.c create mode 100644 src/shared/reboot-util.h create mode 100644 src/shared/recovery-key.c create mode 100644 src/shared/recovery-key.h create mode 100644 src/shared/resize-fs.c create mode 100644 src/shared/resize-fs.h create mode 100644 src/shared/resolve-util.c create mode 100644 src/shared/resolve-util.h create mode 100644 src/shared/rm-rf.c create mode 100644 src/shared/rm-rf.h create mode 100644 src/shared/seccomp-util.c create mode 100644 src/shared/seccomp-util.h create mode 100644 src/shared/securebits-util.c create mode 100644 src/shared/securebits-util.h create mode 100644 src/shared/selinux-util.c create mode 100644 src/shared/selinux-util.h create mode 100644 src/shared/serialize.c create mode 100644 src/shared/serialize.h create mode 100644 src/shared/service-util.c create mode 100644 src/shared/service-util.h create mode 100644 src/shared/sleep-config.c create mode 100644 src/shared/sleep-config.h create mode 100644 src/shared/smack-util.c create mode 100644 src/shared/smack-util.h create mode 100644 src/shared/socket-label.c create mode 
100644 src/shared/socket-netlink.c create mode 100644 src/shared/socket-netlink.h create mode 100644 src/shared/spawn-ask-password-agent.c create mode 100644 src/shared/spawn-ask-password-agent.h create mode 100644 src/shared/spawn-polkit-agent.c create mode 100644 src/shared/spawn-polkit-agent.h create mode 100644 src/shared/specifier.c create mode 100644 src/shared/specifier.h create mode 100644 src/shared/switch-root.c create mode 100644 src/shared/switch-root.h create mode 100644 src/shared/test-tables.h create mode 100644 src/shared/tests.c create mode 100644 src/shared/tests.h create mode 100644 src/shared/tmpfile-util-label.c create mode 100644 src/shared/tmpfile-util-label.h create mode 100644 src/shared/tomoyo-util.c create mode 100644 src/shared/tomoyo-util.h create mode 100644 src/shared/tpm2-event-log.c create mode 100644 src/shared/tpm2-event-log.h create mode 100644 src/shared/tpm2-util.c create mode 100644 src/shared/tpm2-util.h create mode 100644 src/shared/udev-util.c create mode 100644 src/shared/udev-util.h create mode 100644 src/shared/user-record-nss.c create mode 100644 src/shared/user-record-nss.h create mode 100644 src/shared/user-record-show.c create mode 100644 src/shared/user-record-show.h create mode 100644 src/shared/user-record.c create mode 100644 src/shared/user-record.h create mode 100644 src/shared/userdb-dropin.c create mode 100644 src/shared/userdb-dropin.h create mode 100644 src/shared/userdb.c create mode 100644 src/shared/userdb.h create mode 100644 src/shared/utmp-wtmp.c create mode 100644 src/shared/utmp-wtmp.h create mode 100644 src/shared/varlink-idl.c create mode 100644 src/shared/varlink-idl.h create mode 100644 src/shared/varlink-internal.h create mode 100644 src/shared/varlink-io.systemd.Journal.c create mode 100644 src/shared/varlink-io.systemd.Journal.h create mode 100644 src/shared/varlink-io.systemd.ManagedOOM.c create mode 100644 src/shared/varlink-io.systemd.ManagedOOM.h create mode 100644 
src/shared/varlink-io.systemd.PCRExtend.c create mode 100644 src/shared/varlink-io.systemd.PCRExtend.h create mode 100644 src/shared/varlink-io.systemd.Resolve.Monitor.c create mode 100644 src/shared/varlink-io.systemd.Resolve.Monitor.h create mode 100644 src/shared/varlink-io.systemd.Resolve.c create mode 100644 src/shared/varlink-io.systemd.Resolve.h create mode 100644 src/shared/varlink-io.systemd.UserDatabase.c create mode 100644 src/shared/varlink-io.systemd.UserDatabase.h create mode 100644 src/shared/varlink-io.systemd.c create mode 100644 src/shared/varlink-io.systemd.h create mode 100644 src/shared/varlink-io.systemd.oom.c create mode 100644 src/shared/varlink-io.systemd.oom.h create mode 100644 src/shared/varlink-io.systemd.service.c create mode 100644 src/shared/varlink-io.systemd.service.h create mode 100644 src/shared/varlink-io.systemd.sysext.c create mode 100644 src/shared/varlink-io.systemd.sysext.h create mode 100644 src/shared/varlink-org.varlink.service.c create mode 100644 src/shared/varlink-org.varlink.service.h create mode 100644 src/shared/varlink.c create mode 100644 src/shared/varlink.h create mode 100644 src/shared/verb-log-control.c create mode 100644 src/shared/verb-log-control.h create mode 100644 src/shared/verbs.c create mode 100644 src/shared/verbs.h create mode 100644 src/shared/vlan-util.c create mode 100644 src/shared/vlan-util.h create mode 100644 src/shared/volatile-util.c create mode 100644 src/shared/volatile-util.h create mode 100644 src/shared/wall.c create mode 100644 src/shared/wall.h create mode 100644 src/shared/watchdog.c create mode 100644 src/shared/watchdog.h create mode 100644 src/shared/web-util.c create mode 100644 src/shared/web-util.h create mode 100644 src/shared/wifi-util.c create mode 100644 src/shared/wifi-util.h create mode 100644 src/shared/xml.c create mode 100644 src/shared/xml.h create mode 100644 src/shutdown/detach-dm.c create mode 100644 src/shutdown/detach-dm.h create mode 100644 
src/shutdown/detach-loopback.c create mode 100644 src/shutdown/detach-loopback.h create mode 100644 src/shutdown/detach-md.c create mode 100644 src/shutdown/detach-md.h create mode 100644 src/shutdown/detach-swap.c create mode 100644 src/shutdown/detach-swap.h create mode 100644 src/shutdown/meson.build create mode 100644 src/shutdown/shutdown.c create mode 100644 src/shutdown/test-umount.c create mode 100644 src/shutdown/umount.c create mode 100644 src/shutdown/umount.h create mode 100644 src/sleep/battery-capacity.c create mode 100644 src/sleep/battery-capacity.h create mode 100644 src/sleep/meson.build create mode 100644 src/sleep/sleep.c create mode 100644 src/sleep/sleep.conf create mode 100644 src/sleep/test-battery-capacity.c create mode 100644 src/socket-activate/meson.build create mode 100644 src/socket-activate/socket-activate.c create mode 100644 src/socket-proxy/meson.build create mode 100644 src/socket-proxy/socket-proxyd.c create mode 100644 src/stdio-bridge/meson.build create mode 100644 src/stdio-bridge/stdio-bridge.c create mode 100644 src/storagetm/meson.build create mode 100644 src/storagetm/storagetm.c create mode 100644 src/sulogin-shell/meson.build create mode 100644 src/sulogin-shell/sulogin-shell.c create mode 100644 src/sysctl/meson.build create mode 100644 src/sysctl/sysctl.c create mode 100644 src/sysext/meson.build create mode 100644 src/sysext/sysext.c create mode 100644 src/system-update-generator/meson.build create mode 100644 src/system-update-generator/system-update-generator.c create mode 100644 src/systemctl/fuzz-systemctl-parse-argv.c create mode 100644 src/systemctl/meson.build create mode 100644 src/systemctl/systemctl-add-dependency.c create mode 100644 src/systemctl/systemctl-add-dependency.h create mode 100644 src/systemctl/systemctl-cancel-job.c create mode 100644 src/systemctl/systemctl-cancel-job.h create mode 100644 src/systemctl/systemctl-clean-or-freeze.c create mode 100644 src/systemctl/systemctl-clean-or-freeze.h 
create mode 100644 src/systemctl/systemctl-compat-halt.c create mode 100644 src/systemctl/systemctl-compat-halt.h create mode 100644 src/systemctl/systemctl-compat-runlevel.c create mode 100644 src/systemctl/systemctl-compat-runlevel.h create mode 100644 src/systemctl/systemctl-compat-shutdown.c create mode 100644 src/systemctl/systemctl-compat-shutdown.h create mode 100644 src/systemctl/systemctl-compat-telinit.c create mode 100644 src/systemctl/systemctl-compat-telinit.h create mode 100644 src/systemctl/systemctl-daemon-reload.c create mode 100644 src/systemctl/systemctl-daemon-reload.h create mode 100644 src/systemctl/systemctl-edit.c create mode 100644 src/systemctl/systemctl-edit.h create mode 100644 src/systemctl/systemctl-enable.c create mode 100644 src/systemctl/systemctl-enable.h create mode 100644 src/systemctl/systemctl-is-active.c create mode 100644 src/systemctl/systemctl-is-active.h create mode 100644 src/systemctl/systemctl-is-enabled.c create mode 100644 src/systemctl/systemctl-is-enabled.h create mode 100644 src/systemctl/systemctl-is-system-running.c create mode 100644 src/systemctl/systemctl-is-system-running.h create mode 100644 src/systemctl/systemctl-kill.c create mode 100644 src/systemctl/systemctl-kill.h create mode 100644 src/systemctl/systemctl-list-dependencies.c create mode 100644 src/systemctl/systemctl-list-dependencies.h create mode 100644 src/systemctl/systemctl-list-jobs.c create mode 100644 src/systemctl/systemctl-list-jobs.h create mode 100644 src/systemctl/systemctl-list-machines.c create mode 100644 src/systemctl/systemctl-list-machines.h create mode 100644 src/systemctl/systemctl-list-unit-files.c create mode 100644 src/systemctl/systemctl-list-unit-files.h create mode 100644 src/systemctl/systemctl-list-units.c create mode 100644 src/systemctl/systemctl-list-units.h create mode 100644 src/systemctl/systemctl-log-setting.c create mode 100644 src/systemctl/systemctl-log-setting.h create mode 100644 
src/systemctl/systemctl-logind.c create mode 100644 src/systemctl/systemctl-logind.h create mode 100644 src/systemctl/systemctl-mount.c create mode 100644 src/systemctl/systemctl-mount.h create mode 100644 src/systemctl/systemctl-preset-all.c create mode 100644 src/systemctl/systemctl-preset-all.h create mode 100644 src/systemctl/systemctl-reset-failed.c create mode 100644 src/systemctl/systemctl-reset-failed.h create mode 100644 src/systemctl/systemctl-service-watchdogs.c create mode 100644 src/systemctl/systemctl-service-watchdogs.h create mode 100644 src/systemctl/systemctl-set-default.c create mode 100644 src/systemctl/systemctl-set-default.h create mode 100644 src/systemctl/systemctl-set-environment.c create mode 100644 src/systemctl/systemctl-set-environment.h create mode 100644 src/systemctl/systemctl-set-property.c create mode 100644 src/systemctl/systemctl-set-property.h create mode 100644 src/systemctl/systemctl-show.c create mode 100644 src/systemctl/systemctl-show.h create mode 100644 src/systemctl/systemctl-start-special.c create mode 100644 src/systemctl/systemctl-start-special.h create mode 100644 src/systemctl/systemctl-start-unit.c create mode 100644 src/systemctl/systemctl-start-unit.h create mode 100644 src/systemctl/systemctl-switch-root.c create mode 100644 src/systemctl/systemctl-switch-root.h create mode 100644 src/systemctl/systemctl-sysv-compat.c create mode 100644 src/systemctl/systemctl-sysv-compat.h create mode 100644 src/systemctl/systemctl-trivial-method.c create mode 100644 src/systemctl/systemctl-trivial-method.h create mode 100644 src/systemctl/systemctl-util.c create mode 100644 src/systemctl/systemctl-util.h create mode 100644 src/systemctl/systemctl-whoami.c create mode 100644 src/systemctl/systemctl-whoami.h create mode 100644 src/systemctl/systemctl.c create mode 100644 src/systemctl/systemctl.h create mode 100755 src/systemctl/systemd-sysv-install.SKELETON create mode 100644 src/systemd/_sd-common.h create mode 100644 
src/systemd/meson.build create mode 100644 src/systemd/sd-bus-protocol.h create mode 100644 src/systemd/sd-bus-vtable.h create mode 100644 src/systemd/sd-bus.h create mode 100644 src/systemd/sd-daemon.h create mode 100644 src/systemd/sd-device.h create mode 100644 src/systemd/sd-dhcp-client.h create mode 100644 src/systemd/sd-dhcp-lease.h create mode 100644 src/systemd/sd-dhcp-option.h create mode 100644 src/systemd/sd-dhcp-protocol.h create mode 100644 src/systemd/sd-dhcp-server.h create mode 100644 src/systemd/sd-dhcp6-client.h create mode 100644 src/systemd/sd-dhcp6-lease.h create mode 100644 src/systemd/sd-dhcp6-option.h create mode 100644 src/systemd/sd-dhcp6-protocol.h create mode 100644 src/systemd/sd-event.h create mode 100644 src/systemd/sd-gpt.h create mode 100644 src/systemd/sd-hwdb.h create mode 100644 src/systemd/sd-id128.h create mode 100644 src/systemd/sd-ipv4acd.h create mode 100644 src/systemd/sd-ipv4ll.h create mode 100644 src/systemd/sd-journal.h create mode 100644 src/systemd/sd-lldp-rx.h create mode 100644 src/systemd/sd-lldp-tx.h create mode 100644 src/systemd/sd-lldp.h create mode 100644 src/systemd/sd-login.h create mode 100644 src/systemd/sd-messages.h create mode 100644 src/systemd/sd-ndisc.h create mode 100644 src/systemd/sd-netlink.h create mode 100644 src/systemd/sd-network.h create mode 100644 src/systemd/sd-path.h create mode 100644 src/systemd/sd-radv.h create mode 100644 src/systemd/sd-resolve.h create mode 100644 src/systemd/sd-utf8.h create mode 100644 src/sysupdate/meson.build create mode 100644 src/sysupdate/sysupdate-cache.c create mode 100644 src/sysupdate/sysupdate-cache.h create mode 100644 src/sysupdate/sysupdate-instance.c create mode 100644 src/sysupdate/sysupdate-instance.h create mode 100644 src/sysupdate/sysupdate-partition.c create mode 100644 src/sysupdate/sysupdate-partition.h create mode 100644 src/sysupdate/sysupdate-pattern.c create mode 100644 src/sysupdate/sysupdate-pattern.h create mode 100644 
src/sysupdate/sysupdate-resource.c create mode 100644 src/sysupdate/sysupdate-resource.h create mode 100644 src/sysupdate/sysupdate-transfer.c create mode 100644 src/sysupdate/sysupdate-transfer.h create mode 100644 src/sysupdate/sysupdate-update-set.c create mode 100644 src/sysupdate/sysupdate-update-set.h create mode 100644 src/sysupdate/sysupdate-util.c create mode 100644 src/sysupdate/sysupdate.c create mode 100644 src/sysupdate/sysupdate.h create mode 100644 src/sysusers/meson.build create mode 100644 src/sysusers/sysusers.c create mode 100644 src/sysv-generator/meson.build create mode 100644 src/sysv-generator/sysv-generator.c create mode 100755 src/test/generate-sym-test.py create mode 100644 src/test/meson.build create mode 100644 src/test/nss-test-util.c create mode 100644 src/test/nss-test-util.h create mode 100644 src/test/test-acl-util.c create mode 100644 src/test/test-af-list.c create mode 100644 src/test/test-alloc-util.c create mode 100644 src/test/test-architecture.c create mode 100644 src/test/test-argv-util.c create mode 100644 src/test/test-arphrd-util.c create mode 100644 src/test/test-ask-password-api.c create mode 100644 src/test/test-async.c create mode 100644 src/test/test-barrier.c create mode 100644 src/test/test-bitfield.c create mode 100644 src/test/test-bitmap.c create mode 100644 src/test/test-blockdev-util.c create mode 100644 src/test/test-boot-timestamps.c create mode 100644 src/test/test-bootspec.c create mode 100644 src/test/test-bpf-devices.c create mode 100644 src/test/test-bpf-firewall.c create mode 100644 src/test/test-bpf-foreign-programs.c create mode 100644 src/test/test-bpf-lsm.c create mode 100644 src/test/test-btrfs-physical-offset.c create mode 100644 src/test/test-btrfs.c create mode 100644 src/test/test-bus-util.c create mode 100644 src/test/test-calendarspec.c create mode 100644 src/test/test-cap-list.c create mode 100644 src/test/test-capability.c create mode 100644 src/test/test-cgroup-cpu.c create mode 100644 
src/test/test-cgroup-mask.c create mode 100644 src/test/test-cgroup-setup.c create mode 100644 src/test/test-cgroup-unit-default.c create mode 100644 src/test/test-cgroup-util.c create mode 100644 src/test/test-cgroup.c create mode 100644 src/test/test-chase-manual.c create mode 100644 src/test/test-chase.c create mode 100644 src/test/test-chown-rec.c create mode 100644 src/test/test-clock.c create mode 100644 src/test/test-compare-operator.c create mode 100644 src/test/test-compress-benchmark.c create mode 100644 src/test/test-compress.c create mode 100644 src/test/test-condition.c create mode 100644 src/test/test-conf-files.c create mode 100644 src/test/test-conf-parser.c create mode 100644 src/test/test-copy.c create mode 100644 src/test/test-core-unit.c create mode 100644 src/test/test-coredump-util.c create mode 100644 src/test/test-cpu-set-util.c create mode 100644 src/test/test-creds.c create mode 100644 src/test/test-cryptolib.c create mode 100644 src/test/test-daemon.c create mode 100644 src/test/test-data-fd-util.c create mode 100644 src/test/test-date.c create mode 100644 src/test/test-dev-setup.c create mode 100644 src/test/test-device-nodes.c create mode 100644 src/test/test-devnum-util.c create mode 100644 src/test/test-dlopen-so.c create mode 100644 src/test/test-dlopen.c create mode 100644 src/test/test-dns-domain.c create mode 100644 src/test/test-ellipsize.c create mode 100644 src/test/test-emergency-action.c create mode 100644 src/test/test-engine.c create mode 100644 src/test/test-env-file.c create mode 100644 src/test/test-env-util.c create mode 100644 src/test/test-errno-list.c create mode 100644 src/test/test-errno-util.c create mode 100644 src/test/test-escape.c create mode 100644 src/test/test-ether-addr-util.c create mode 100644 src/test/test-exec-util.c create mode 100644 src/test/test-execute.c create mode 100644 src/test/test-execve.c create mode 100644 src/test/test-exit-status.c create mode 100644 src/test/test-extract-word.c create 
mode 100644 src/test/test-fd-util.c create mode 100644 src/test/test-fdset.c create mode 100644 src/test/test-fiemap.c create mode 100644 src/test/test-fileio.c create mode 100644 src/test/test-firewall-util.c create mode 100644 src/test/test-format-table.c create mode 100644 src/test/test-format-util.c create mode 100644 src/test/test-fs-util.c create mode 100644 src/test/test-fstab-util.c create mode 100644 src/test/test-glob-util.c create mode 100644 src/test/test-gpt.c create mode 100644 src/test/test-gunicode.c create mode 100644 src/test/test-hash-funcs.c create mode 100644 src/test/test-hashmap-ordered.awk create mode 100644 src/test/test-hashmap-plain.c create mode 100644 src/test/test-hashmap.c create mode 100644 src/test/test-hexdecoct.c create mode 100644 src/test/test-hmac.c create mode 100644 src/test/test-hostname-setup.c create mode 100644 src/test/test-hostname-util.c create mode 100644 src/test/test-id128.c create mode 100644 src/test/test-image-policy.c create mode 100644 src/test/test-import-util.c create mode 100644 src/test/test-in-addr-prefix-util.c create mode 100644 src/test/test-in-addr-util.c create mode 100644 src/test/test-install-file.c create mode 100644 src/test/test-install-root.c create mode 100644 src/test/test-install.c create mode 100644 src/test/test-io-util.c create mode 100644 src/test/test-ip-protocol-list.c create mode 100644 src/test/test-ipcrm.c create mode 100644 src/test/test-job-type.c create mode 100644 src/test/test-journal-importer.c create mode 100644 src/test/test-json.c create mode 100644 src/test/test-kbd-util.c create mode 100644 src/test/test-libcrypt-util.c create mode 100644 src/test/test-libmount.c create mode 100644 src/test/test-limits-util.c create mode 100644 src/test/test-list.c create mode 100644 src/test/test-load-fragment.c create mode 100644 src/test/test-local-addresses.c create mode 100644 src/test/test-locale-util.c create mode 100644 src/test/test-lock-util.c create mode 100644 
src/test/test-log.c create mode 100644 src/test/test-logarithm.c create mode 100644 src/test/test-loop-block.c create mode 100644 src/test/test-loopback.c create mode 100644 src/test/test-macro.c create mode 100644 src/test/test-manager.c create mode 100644 src/test/test-math-util.c create mode 100644 src/test/test-memfd-util.c create mode 100644 src/test/test-memory-util.c create mode 100644 src/test/test-mempool.c create mode 100644 src/test/test-mempress.c create mode 100644 src/test/test-memstream-util.c create mode 100644 src/test/test-mkdir.c create mode 100644 src/test/test-modhex.c create mode 100644 src/test/test-mount-util.c create mode 100644 src/test/test-mountpoint-util.c create mode 100644 src/test/test-namespace.c create mode 100644 src/test/test-net-naming-scheme.c create mode 100644 src/test/test-netlink-manual.c create mode 100644 src/test/test-nft-set.c create mode 100644 src/test/test-ns.c create mode 100644 src/test/test-nscd-flush.c create mode 100644 src/test/test-nss-hosts.c create mode 100644 src/test/test-nss-users.c create mode 100644 src/test/test-nulstr-util.c create mode 100644 src/test/test-open-file.c create mode 100644 src/test/test-openssl.c create mode 100644 src/test/test-ordered-set.c create mode 100644 src/test/test-os-util.c create mode 100644 src/test/test-parse-argument.c create mode 100644 src/test/test-parse-helpers.c create mode 100644 src/test/test-parse-util.c create mode 100644 src/test/test-path-lookup.c create mode 100644 src/test/test-path-util.c create mode 100644 src/test/test-path.c create mode 100644 src/test/test-percent-util.c create mode 100644 src/test/test-pretty-print.c create mode 100644 src/test/test-prioq.c create mode 100644 src/test/test-proc-cmdline.c create mode 100644 src/test/test-process-util.c create mode 100644 src/test/test-procfs-util.c create mode 100644 src/test/test-psi-util.c create mode 100644 src/test/test-qrcode-util.c create mode 100644 src/test/test-random-util.c create mode 100644 
src/test/test-ratelimit.c create mode 100644 src/test/test-raw-clone.c create mode 100644 src/test/test-recurse-dir.c create mode 100644 src/test/test-replace-var.c create mode 100644 src/test/test-rlimit-util.c create mode 100644 src/test/test-rm-rf.c create mode 100644 src/test/test-sbat.c create mode 100644 src/test/test-sched-prio.c create mode 100644 src/test/test-sd-hwdb.c create mode 100644 src/test/test-sd-path.c create mode 100644 src/test/test-seccomp.c create mode 100644 src/test/test-secure-bits.c create mode 100644 src/test/test-selinux.c create mode 100644 src/test/test-serialize.c create mode 100644 src/test/test-set-disable-mempool.c create mode 100644 src/test/test-set.c create mode 100644 src/test/test-sha256.c create mode 100644 src/test/test-sigbus.c create mode 100644 src/test/test-signal-util.c create mode 100644 src/test/test-siphash24.c create mode 100644 src/test/test-sizeof.c create mode 100644 src/test/test-sleep-config.c create mode 100644 src/test/test-socket-bind.c create mode 100644 src/test/test-socket-netlink.c create mode 100644 src/test/test-socket-util.c create mode 100644 src/test/test-specifier.c create mode 100644 src/test/test-stat-util.c create mode 100644 src/test/test-static-destruct.c create mode 100644 src/test/test-strbuf.c create mode 100644 src/test/test-string-util.c create mode 100644 src/test/test-strip-tab-ansi.c create mode 100644 src/test/test-strv.c create mode 100644 src/test/test-strxcpyx.c create mode 100644 src/test/test-sysctl-util.c create mode 100644 src/test/test-tables.c create mode 100644 src/test/test-terminal-util.c create mode 100644 src/test/test-time-util.c create mode 100644 src/test/test-tmpfile-util.c create mode 100644 src/test/test-tpm2.c create mode 100644 src/test/test-udev-util.c create mode 100644 src/test/test-uid-alloc-range.c create mode 100644 src/test/test-uid-range.c create mode 100644 src/test/test-umask-util.c create mode 100644 src/test/test-unaligned.c create mode 100644 
src/test/test-unit-file.c create mode 100644 src/test/test-unit-name.c create mode 100644 src/test/test-unit-serialize.c create mode 100644 src/test/test-user-util.c create mode 100644 src/test/test-utf8.c create mode 100644 src/test/test-utmp.c create mode 100644 src/test/test-varlink-idl.c create mode 100644 src/test/test-varlink.c create mode 100644 src/test/test-verbs.c create mode 100644 src/test/test-watch-pid.c create mode 100644 src/test/test-watchdog.c create mode 100644 src/test/test-web-util.c create mode 100644 src/test/test-xattr-util.c create mode 100644 src/test/test-xml.c create mode 100644 src/timedate/meson.build create mode 100644 src/timedate/org.freedesktop.timedate1.conf create mode 100644 src/timedate/org.freedesktop.timedate1.policy create mode 100644 src/timedate/org.freedesktop.timedate1.service create mode 100644 src/timedate/timedatectl.c create mode 100644 src/timedate/timedated.c create mode 100644 src/timesync/80-systemd-timesync.list create mode 100644 src/timesync/meson.build create mode 100644 src/timesync/org.freedesktop.timesync1.conf create mode 100644 src/timesync/org.freedesktop.timesync1.policy create mode 100644 src/timesync/org.freedesktop.timesync1.service create mode 100644 src/timesync/test-timesync.c create mode 100644 src/timesync/timesyncd-bus.c create mode 100644 src/timesync/timesyncd-bus.h create mode 100644 src/timesync/timesyncd-conf.c create mode 100644 src/timesync/timesyncd-conf.h create mode 100644 src/timesync/timesyncd-gperf.gperf create mode 100644 src/timesync/timesyncd-manager.c create mode 100644 src/timesync/timesyncd-manager.h create mode 100644 src/timesync/timesyncd-ntp-message.h create mode 100644 src/timesync/timesyncd-server.c create mode 100644 src/timesync/timesyncd-server.h create mode 100644 src/timesync/timesyncd.c create mode 100644 src/timesync/timesyncd.conf.in create mode 100644 src/timesync/wait-sync.c create mode 100644 src/tmpfiles/meson.build create mode 100644 
src/tmpfiles/offline-passwd.c create mode 100644 src/tmpfiles/offline-passwd.h create mode 100644 src/tmpfiles/test-offline-passwd.c create mode 100644 src/tmpfiles/tmpfiles.c create mode 100644 src/tpm2-setup/meson.build create mode 100644 src/tpm2-setup/tpm2-setup.c create mode 100644 src/tty-ask-password-agent/meson.build create mode 100644 src/tty-ask-password-agent/tty-ask-password-agent.c create mode 100644 src/udev/ata_id/ata_id.c create mode 100644 src/udev/cdrom_id/cdrom_id.c create mode 100644 src/udev/dmi_memory_id/dmi_memory_id.c create mode 100644 src/udev/fido_id/fido_id.c create mode 100644 src/udev/fido_id/fido_id_desc.c create mode 100644 src/udev/fido_id/fido_id_desc.h create mode 100644 src/udev/fido_id/fuzz-fido-id-desc.c create mode 100644 src/udev/fido_id/test-fido-id-desc.c create mode 100644 src/udev/fuzz-udev-rule-parse-value.c create mode 100644 src/udev/fuzz-udev-rules.c create mode 100644 src/udev/fuzz-udev-rules.options create mode 100755 src/udev/generate-keyboard-keys-gperf.sh create mode 100755 src/udev/generate-keyboard-keys-list.sh create mode 100644 src/udev/iocost/iocost.c create mode 100644 src/udev/iocost/iocost.conf create mode 100644 src/udev/meson.build create mode 100644 src/udev/mtd_probe/mtd_probe.c create mode 100644 src/udev/mtd_probe/mtd_probe.h create mode 100644 src/udev/mtd_probe/probe_smartmedia.c create mode 100644 src/udev/net/fuzz-link-parser.c create mode 100644 src/udev/net/fuzz-link-parser.options create mode 100644 src/udev/net/link-config-gperf.gperf create mode 100644 src/udev/net/link-config.c create mode 100644 src/udev/net/link-config.h create mode 100644 src/udev/net/test-link-config-tables.c create mode 100644 src/udev/scsi_id/README create mode 100644 src/udev/scsi_id/scsi.h create mode 100644 src/udev/scsi_id/scsi_id.c create mode 100644 src/udev/scsi_id/scsi_id.h create mode 100644 src/udev/scsi_id/scsi_serial.c create mode 100644 src/udev/test-udev-builtin.c create mode 100644 
src/udev/test-udev-format.c create mode 100644 src/udev/test-udev-manager.c create mode 100644 src/udev/test-udev-node.c create mode 100644 src/udev/test-udev-rule-runner.c create mode 100644 src/udev/test-udev-rules.c create mode 100644 src/udev/test-udev-spawn.c create mode 100644 src/udev/udev-builtin-blkid.c create mode 100644 src/udev/udev-builtin-btrfs.c create mode 100644 src/udev/udev-builtin-hwdb.c create mode 100644 src/udev/udev-builtin-input_id.c create mode 100644 src/udev/udev-builtin-keyboard.c create mode 100644 src/udev/udev-builtin-kmod.c create mode 100644 src/udev/udev-builtin-net_driver.c create mode 100644 src/udev/udev-builtin-net_id.c create mode 100644 src/udev/udev-builtin-net_setup_link.c create mode 100644 src/udev/udev-builtin-path_id.c create mode 100644 src/udev/udev-builtin-uaccess.c create mode 100644 src/udev/udev-builtin-usb_id.c create mode 100644 src/udev/udev-builtin.c create mode 100644 src/udev/udev-builtin.h create mode 100644 src/udev/udev-ctrl.c create mode 100644 src/udev/udev-ctrl.h create mode 100644 src/udev/udev-event.c create mode 100644 src/udev/udev-event.h create mode 100644 src/udev/udev-format.c create mode 100644 src/udev/udev-format.h create mode 100644 src/udev/udev-manager.c create mode 100644 src/udev/udev-manager.h create mode 100644 src/udev/udev-node.c create mode 100644 src/udev/udev-node.h create mode 100644 src/udev/udev-rules.c create mode 100644 src/udev/udev-rules.h create mode 100644 src/udev/udev-spawn.c create mode 100644 src/udev/udev-spawn.h create mode 100644 src/udev/udev-trace.h create mode 100644 src/udev/udev-watch.c create mode 100644 src/udev/udev-watch.h create mode 100644 src/udev/udev-worker.c create mode 100644 src/udev/udev-worker.h create mode 100644 src/udev/udev.conf create mode 100644 src/udev/udev.pc.in create mode 100644 src/udev/udevadm-control.c create mode 100644 src/udev/udevadm-hwdb.c create mode 100644 src/udev/udevadm-info.c create mode 100644 src/udev/udevadm-lock.c 
create mode 100644 src/udev/udevadm-monitor.c create mode 100644 src/udev/udevadm-settle.c create mode 100644 src/udev/udevadm-test-builtin.c create mode 100644 src/udev/udevadm-test.c create mode 100644 src/udev/udevadm-trigger.c create mode 100644 src/udev/udevadm-util.c create mode 100644 src/udev/udevadm-util.h create mode 100644 src/udev/udevadm-verify.c create mode 100644 src/udev/udevadm-wait.c create mode 100644 src/udev/udevadm.c create mode 100644 src/udev/udevadm.h create mode 100644 src/udev/udevd.c create mode 100644 src/udev/udevd.h create mode 100644 src/udev/v4l_id/v4l_id.c create mode 100644 src/ukify/test/example.signing.crt.base64 create mode 100644 src/ukify/test/example.signing.key.base64 create mode 100644 src/ukify/test/example.tpm2-pcr-private.pem.base64 create mode 100644 src/ukify/test/example.tpm2-pcr-private2.pem.base64 create mode 100644 src/ukify/test/example.tpm2-pcr-public.pem.base64 create mode 100644 src/ukify/test/example.tpm2-pcr-public2.pem.base64 create mode 100644 src/ukify/test/meson.build create mode 100755 src/ukify/test/test_ukify.py create mode 100755 src/ukify/ukify.py create mode 100644 src/update-done/meson.build create mode 100644 src/update-done/update-done.c create mode 100644 src/update-utmp/meson.build create mode 100644 src/update-utmp/update-utmp.c create mode 100644 src/user-sessions/meson.build create mode 100644 src/user-sessions/user-sessions.c create mode 100644 src/userdb/meson.build create mode 100644 src/userdb/userdbctl.c create mode 100644 src/userdb/userdbd-manager.c create mode 100644 src/userdb/userdbd-manager.h create mode 100644 src/userdb/userdbd.c create mode 100644 src/userdb/userwork.c create mode 100644 src/varlinkctl/meson.build create mode 100644 src/varlinkctl/varlinkctl.c create mode 100644 src/vconsole/meson.build create mode 100644 src/vconsole/vconsole-setup.c create mode 100644 src/veritysetup/meson.build create mode 100644 src/veritysetup/veritysetup-generator.c create mode 100644 
src/veritysetup/veritysetup.c create mode 100644 src/version/version.h.in create mode 100644 src/vmspawn/meson.build create mode 100644 src/vmspawn/vmspawn-settings.c create mode 100644 src/vmspawn/vmspawn-settings.h create mode 100644 src/vmspawn/vmspawn-util.c create mode 100644 src/vmspawn/vmspawn-util.h create mode 100644 src/vmspawn/vmspawn.c create mode 100644 src/volatile-root/meson.build create mode 100644 src/volatile-root/volatile-root.c create mode 100644 src/xdg-autostart-generator/fuzz-xdg-desktop.c create mode 100644 src/xdg-autostart-generator/fuzz-xdg-desktop.options create mode 100644 src/xdg-autostart-generator/meson.build create mode 100644 src/xdg-autostart-generator/test-xdg-autostart.c create mode 100644 src/xdg-autostart-generator/xdg-autostart-condition.c create mode 100644 src/xdg-autostart-generator/xdg-autostart-generator.c create mode 100644 src/xdg-autostart-generator/xdg-autostart-service.c create mode 100644 src/xdg-autostart-generator/xdg-autostart-service.h (limited to 'src') diff --git a/src/ac-power/ac-power.c b/src/ac-power/ac-power.c new file mode 100644 index 0000000..fadf1da --- /dev/null +++ b/src/ac-power/ac-power.c @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <getopt.h> + +#include "battery-util.h" +#include "build.h" +#include "main-func.h" + +static bool arg_verbose = false; + +static enum { + ACTION_AC_POWER, + ACTION_LOW, +} arg_action = ACTION_AC_POWER; + +static void help(void) { + printf("%s\n\n" + "Report whether we are connected to an external power source.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -v --verbose Show state as text\n" + " --low Check if battery is discharging and low\n", + program_invocation_short_name); +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_LOW, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + {
"verbose", no_argument, NULL, 'v' }, + { "low", no_argument, NULL, ARG_LOW }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hv", options, NULL)) >= 0) + + switch (c) { + + case 'h': + help(); + return 0; + + case ARG_VERSION: + return version(); + + case 'v': + arg_verbose = true; + break; + + case ARG_LOW: + arg_action = ACTION_LOW; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s takes no arguments.", + program_invocation_short_name); + + return 1; +} + +static int run(int argc, char *argv[]) { + int r; + + /* This is mostly intended to be used for scripts which want + * to detect whether AC power is plugged in or not. */ + + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_action == ACTION_AC_POWER) { + r = on_ac_power(); + if (r < 0) + return log_error_errno(r, "Failed to read AC status: %m"); + } else { + r = battery_is_discharging_and_low(); + if (r < 0) + return log_error_errno(r, "Failed to read battery discharging + low status: %m"); + } + + if (arg_verbose) + puts(yes_no(r)); + + return r == 0; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/ac-power/meson.build b/src/ac-power/meson.build new file mode 100644 index 0000000..032c027 --- /dev/null +++ b/src/ac-power/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-ac-power', + 'public' : true, + 'sources' : files('ac-power.c'), + }, +] diff --git a/src/analyze/analyze-blame.c b/src/analyze/analyze-blame.c new file mode 100644 index 0000000..81e5c59 --- /dev/null +++ b/src/analyze/analyze-blame.c @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-blame.h" +#include "analyze-time-data.h" +#include "format-table.h" + 
+int verb_blame(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(unit_times_free_arrayp) UnitTimes *times = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + TableCell *cell; + int n, r; + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + n = acquire_time_data(bus, /* require_finished = */ false, &times); + if (n <= 0) + return n; + + table = table_new("time", "unit"); + if (!table) + return log_oom(); + + table_set_header(table, false); + + assert_se(cell = table_get_cell(table, 0, 0)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + r = table_set_align_percent(table, cell, 100); + if (r < 0) + return r; + + assert_se(cell = table_get_cell(table, 0, 1)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + r = table_set_sort(table, (size_t) 0); + if (r < 0) + return r; + + r = table_set_reverse(table, 0, true); + if (r < 0) + return r; + + for (UnitTimes *u = times; u->has_data; u++) { + if (u->time <= 0) + continue; + + r = table_add_many(table, + TABLE_TIMESPAN_MSEC, u->time, + TABLE_STRING, u->name); + if (r < 0) + return table_log_add_error(r); + } + + pager_open(arg_pager_flags); + + r = table_print(table, NULL); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-blame.h b/src/analyze/analyze-blame.h new file mode 100644 index 0000000..d9aa985 --- /dev/null +++ b/src/analyze/analyze-blame.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_blame(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-calendar.c b/src/analyze/analyze-calendar.c new file mode 100644 index 0000000..6daab08 --- /dev/null +++ b/src/analyze/analyze-calendar.c @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-calendar.h" +#include "calendarspec.h" +#include 
"format-table.h" +#include "terminal-util.h" + +static int test_calendar_one(usec_t n, const char *p) { + _cleanup_(calendar_spec_freep) CalendarSpec *spec = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *t = NULL; + TableCell *cell; + int r; + + r = calendar_spec_from_string(p, &spec); + if (r < 0) { + log_error_errno(r, "Failed to parse calendar specification '%s': %m", p); + time_parsing_hint(p, /* calendar= */ false, /* timestamp= */ true, /* timespan= */ true); + return r; + } + + r = calendar_spec_to_string(spec, &t); + if (r < 0) + return log_error_errno(r, "Failed to format calendar specification '%s': %m", p); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + assert_se(cell = table_get_cell(table, 0, 0)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + assert_se(cell = table_get_cell(table, 0, 1)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + if (!streq(t, p)) { + r = table_add_many(table, + TABLE_FIELD, "Original form", + TABLE_STRING, p); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "Normalized form", + TABLE_STRING, t); + if (r < 0) + return table_log_add_error(r); + + for (unsigned i = 0; i < arg_iterations; i++) { + usec_t next; + + r = calendar_spec_next_usec(spec, n, &next); + if (r == -ENOENT) { + if (i == 0) { + r = table_add_many(table, + TABLE_FIELD, "Next elapse", + TABLE_STRING, "never", + TABLE_SET_COLOR, ansi_highlight_yellow()); + if (r < 0) + return table_log_add_error(r); + } + break; + } + if (r < 0) + return log_error_errno(r, "Failed to determine next elapse for '%s': %m", p); + + if (i == 0) { + r = table_add_many(table, + TABLE_FIELD, "Next elapse", + TABLE_TIMESTAMP, next, + TABLE_SET_COLOR, ansi_highlight_blue()); + if (r < 0) + return table_log_add_error(r); + } else { + int k = DECIMAL_STR_WIDTH(i + 1); + + if (k < 8) + k = 8 - k; + else + k = 0; + + r = 
table_add_cell_stringf_full(table, NULL, TABLE_FIELD, "Iteration #%u", i+1); + if (r < 0) + return table_log_add_error(r); + + r = table_add_many(table, + TABLE_TIMESTAMP, next, + TABLE_SET_COLOR, ansi_highlight_blue()); + if (r < 0) + return table_log_add_error(r); + } + + if (!in_utc_timezone()) { + r = table_add_many(table, + TABLE_FIELD, "(in UTC)", + TABLE_TIMESTAMP_UTC, next); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "From now", + TABLE_TIMESTAMP_RELATIVE, next); + if (r < 0) + return table_log_add_error(r); + + n = next; + } + + return table_print(table, NULL); +} + +int verb_calendar(int argc, char *argv[], void *userdata) { + int r = 0; + usec_t n; + + if (arg_base_time != USEC_INFINITY) + n = arg_base_time; + else + n = now(CLOCK_REALTIME); /* We want to use the same "base" for all expressions */ + + STRV_FOREACH(p, strv_skip(argv, 1)) { + int k; + + k = test_calendar_one(n, *p); + if (r == 0 && k < 0) + r = k; + + if (p[1]) + putchar('\n'); + } + + return r; +} diff --git a/src/analyze/analyze-calendar.h b/src/analyze/analyze-calendar.h new file mode 100644 index 0000000..3d6eac2 --- /dev/null +++ b/src/analyze/analyze-calendar.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_calendar(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-capability.c b/src/analyze/analyze-capability.c new file mode 100644 index 0000000..8072175 --- /dev/null +++ b/src/analyze/analyze-capability.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-capability.h" +#include "cap-list.h" +#include "capability-util.h" +#include "format-table.h" + +int verb_capabilities(int argc, char *argv[], void *userdata) { + _cleanup_(table_unrefp) Table *table = NULL; + unsigned last_cap; + int r; + + table = table_new("name", "number"); + if (!table) + return log_oom(); + + (void) table_set_align_percent(table, 
table_get_cell(table, 0, 1), 100); + + /* Determine the maximum of the last cap known by the kernel and by us */ + last_cap = MAX((unsigned) CAP_LAST_CAP, cap_last_cap()); + + if (strv_isempty(strv_skip(argv, 1))) + for (unsigned c = 0; c <= last_cap; c++) { + r = table_add_many(table, + TABLE_STRING, capability_to_name(c) ?: "cap_???", + TABLE_UINT, c); + if (r < 0) + return table_log_add_error(r); + } + else { + for (int i = 1; i < argc; i++) { + int c; + + c = capability_from_name(argv[i]); + if (c < 0 || (unsigned) c > last_cap) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Capability \"%s\" not known.", argv[i]); + + r = table_add_many(table, + TABLE_STRING, capability_to_name(c) ?: "cap_???", + TABLE_UINT, (unsigned) c); + if (r < 0) + return table_log_add_error(r); + } + + (void) table_set_sort(table, (size_t) 1); + } + + pager_open(arg_pager_flags); + + r = table_print(table, NULL); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-capability.h b/src/analyze/analyze-capability.h new file mode 100644 index 0000000..07ff088 --- /dev/null +++ b/src/analyze/analyze-capability.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_capabilities(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-cat-config.c b/src/analyze/analyze-cat-config.c new file mode 100644 index 0000000..66bbbc1 --- /dev/null +++ b/src/analyze/analyze-cat-config.c @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-cat-config.h" +#include "conf-files.h" +#include "constants.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "strv.h" + +int verb_cat_config(int argc, char *argv[], void *userdata) { + char **list; + int r; + + pager_open(arg_pager_flags); + + list = strv_skip(argv, 1); + STRV_FOREACH(arg, list) { + const char *t = NULL; + + if (arg != list) + print_separator(); + + if 
(path_is_absolute(*arg)) { + NULSTR_FOREACH(dir, CONF_PATHS_NULSTR("")) { + t = path_startswith(*arg, dir); + if (t) + break; + } + + if (!t) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path %s does not start with any known prefix.", *arg); + } else + t = *arg; + + r = conf_files_cat(arg_root, t, arg_cat_flags); + if (r < 0) + return r; + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-cat-config.h b/src/analyze/analyze-cat-config.h new file mode 100644 index 0000000..64e87a3 --- /dev/null +++ b/src/analyze/analyze-cat-config.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_cat_config(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-compare-versions.c b/src/analyze/analyze-compare-versions.c new file mode 100644 index 0000000..94cff18 --- /dev/null +++ b/src/analyze/analyze-compare-versions.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "analyze-compare-versions.h" +#include "compare-operator.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" + +int verb_compare_versions(int argc, char *argv[], void *userdata) { + const char *v1 = ASSERT_PTR(argv[1]), *v2 = ASSERT_PTR(argv[argc-1]); + int r; + + assert(IN_SET(argc, 3, 4)); + assert(argv); + + /* We only output a warning on invalid version strings (instead of failing), since the comparison + * functions try to handle invalid strings gracefully and it's still interesting to see what the + * comparison result will be. */ + if (!version_is_valid_versionspec(v1)) + log_warning("Version string 1 contains disallowed characters, they will be treated as separators: %s", v1); + if (!version_is_valid_versionspec(v2)) + log_warning("Version string 2 contains disallowed characters, they will be treated as separators: %s", v2); + + if (argc == 3) { + r = strverscmp_improved(v1, v2); + printf("%s %s %s\n", + isempty(v1) ? "''" : v1, + comparison_operator(r), + isempty(v2) ? 
"''" : v2); + + /* This matches the exit convention used by rpmdev-vercmp. + * We don't use named values because 11 and 12 don't have names. */ + return r < 0 ? 12 : r > 0 ? 11 : 0; + + } else { + const char *op = ASSERT_PTR(argv[2]); + CompareOperator operator; + assert(argc == 4); + + operator = parse_compare_operator(&op, COMPARE_ALLOW_TEXTUAL); + if (operator < 0 || !isempty(op)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown operator \"%s\".", op); + + r = version_or_fnmatch_compare(operator, v1, v2); + if (r < 0) + return log_error_errno(r, "Failed to compare versions: %m"); + + return r ? EXIT_SUCCESS : EXIT_FAILURE; + } +} diff --git a/src/analyze/analyze-compare-versions.h b/src/analyze/analyze-compare-versions.h new file mode 100644 index 0000000..ac90ede --- /dev/null +++ b/src/analyze/analyze-compare-versions.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +int verb_compare_versions(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-condition.c b/src/analyze/analyze-condition.c new file mode 100644 index 0000000..1e9136d --- /dev/null +++ b/src/analyze/analyze-condition.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdlib.h> + +#include "analyze.h" +#include "analyze-condition.h" +#include "analyze-verify-util.h" +#include "condition.h" +#include "conf-parser.h" +#include "load-fragment.h" +#include "service.h" + +static int parse_condition(Unit *u, const char *line) { + assert(u); + assert(line); + + for (ConditionType t = 0; t < _CONDITION_TYPE_MAX; t++) { + ConfigParserCallback callback; + Condition **target; + const char *p, *name; + + name = condition_type_to_string(t); + p = startswith(line, name); + if (p) + target = &u->conditions; + else { + name = assert_type_to_string(t); + p = startswith(line, name); + if (!p) + continue; + + target = &u->asserts; + } + + p += strspn(p, WHITESPACE); + + if (*p != '=') + continue; + p++; + + p += strspn(p, WHITESPACE); + + if 
(condition_takes_path(t)) + callback = config_parse_unit_condition_path; + else + callback = config_parse_unit_condition_string; + + return callback(NULL, "(cmdline)", 0, NULL, 0, name, t, p, target, u); + } + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot parse \"%s\".", line); +} + +_printf_(7, 8) +static int log_helper(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) { + Unit *u = ASSERT_PTR(userdata); + va_list ap; + int r; + + /* "upgrade" debug messages */ + level = MIN(LOG_INFO, level); + + va_start(ap, format); + r = log_object_internalv(level, error, file, line, func, + NULL, + u->id, + NULL, + NULL, + format, ap); + va_end(ap); + + return r; +} + +static int verify_conditions(char **lines, RuntimeScope scope, const char *unit, const char *root) { + _cleanup_(manager_freep) Manager *m = NULL; + Unit *u; + int r, q = 1; + + if (unit) { + r = verify_set_unit_path(STRV_MAKE(unit)); + if (r < 0) + return log_error_errno(r, "Failed to set unit load path: %m"); + } + + r = manager_new(scope, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m); + if (r < 0) + return log_error_errno(r, "Failed to initialize manager: %m"); + + log_debug("Starting manager..."); + r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, root); + if (r < 0) + return r; + + if (unit) { + _cleanup_free_ char *prepared = NULL; + + r = verify_prepare_filename(unit, &prepared); + if (r < 0) + return log_error_errno(r, "Failed to prepare filename %s: %m", unit); + + r = manager_load_startable_unit_or_warn(m, NULL, prepared, &u); + if (r < 0) + return r; + } else { + r = unit_new_for_name(m, sizeof(Service), "test.service", &u); + if (r < 0) + return log_error_errno(r, "Failed to create test.service: %m"); + + STRV_FOREACH(line, lines) { + r = parse_condition(u, *line); + if (r < 0) + return r; + } + } + + condition_test_logger_t logger = arg_quiet ? 
NULL : log_helper; + r = condition_test_list(u->asserts, environ, assert_type_to_string, logger, u); + if (u->asserts) + log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE, "Asserts %s.", r > 0 ? "succeeded" : "failed"); + + q = condition_test_list(u->conditions, environ, condition_type_to_string, logger, u); + if (u->conditions) + log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE, "Conditions %s.", q > 0 ? "succeeded" : "failed"); + + return r > 0 && q > 0 ? 0 : -EIO; +} + +int verb_condition(int argc, char *argv[], void *userdata) { + int r; + + r = verify_conditions(strv_skip(argv, 1), arg_runtime_scope, arg_unit, arg_root); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-condition.h b/src/analyze/analyze-condition.h new file mode 100644 index 0000000..28ef51a --- /dev/null +++ b/src/analyze/analyze-condition.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_condition(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-critical-chain.c b/src/analyze/analyze-critical-chain.c new file mode 100644 index 0000000..4a7f452 --- /dev/null +++ b/src/analyze/analyze-critical-chain.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-critical-chain.h" +#include "analyze-time-data.h" +#include "analyze.h" +#include "bus-error.h" +#include "copy.h" +#include "path-util.h" +#include "sort-util.h" +#include "special.h" +#include "static-destruct.h" +#include "strv.h" +#include "terminal-util.h" + +static Hashmap *unit_times_hashmap = NULL; +STATIC_DESTRUCTOR_REGISTER(unit_times_hashmap, hashmap_freep); + +static int list_dependencies_print( + const char *name, + unsigned level, + unsigned branches, + bool last, + UnitTimes *times, + BootTimes *boot) { + + for (unsigned i = level; i != 0; i--) + printf("%s", special_glyph(branches & (1 << (i-1)) ? SPECIAL_GLYPH_TREE_VERTICAL : SPECIAL_GLYPH_TREE_SPACE)); + + printf("%s", special_glyph(last ? 
SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH)); + + if (times) { + if (timestamp_is_set(times->time)) + printf("%s%s @%s +%s%s", ansi_highlight_red(), name, + FORMAT_TIMESPAN(times->activating - boot->userspace_time, USEC_PER_MSEC), + FORMAT_TIMESPAN(times->time, USEC_PER_MSEC), ansi_normal()); + else if (times->activated > boot->userspace_time) + printf("%s @%s", name, FORMAT_TIMESPAN(times->activated - boot->userspace_time, USEC_PER_MSEC)); + else + printf("%s", name); + } else + printf("%s", name); + printf("\n"); + + return 0; +} + +static int list_dependencies_get_dependencies(sd_bus *bus, const char *name, char ***deps) { + _cleanup_free_ char *path = NULL; + + assert(bus); + assert(name); + assert(deps); + + path = unit_dbus_path_from_name(name); + if (!path) + return -ENOMEM; + + return bus_get_unit_property_strv(bus, path, "After", deps); +} + +static int list_dependencies_compare(char *const *a, char *const *b) { + usec_t usa = 0, usb = 0; + UnitTimes *times; + + times = hashmap_get(unit_times_hashmap, *a); + if (times) + usa = times->activated; + times = hashmap_get(unit_times_hashmap, *b); + if (times) + usb = times->activated; + + return CMP(usb, usa); +} + +static bool times_in_range(const UnitTimes *times, const BootTimes *boot) { + return times && times->activated > 0 && times->activated <= boot->finish_time; +} + +static int list_dependencies_one(sd_bus *bus, const char *name, unsigned level, char ***units, unsigned branches) { + _cleanup_strv_free_ char **deps = NULL; + int r; + usec_t service_longest = 0; + int to_print = 0; + UnitTimes *times; + BootTimes *boot; + + if (strv_extend(units, name)) + return log_oom(); + + r = list_dependencies_get_dependencies(bus, name, &deps); + if (r < 0) + return r; + + typesafe_qsort(deps, strv_length(deps), list_dependencies_compare); + + r = acquire_boot_times(bus, /* require_finished = */ true, &boot); + if (r < 0) + return r; + + STRV_FOREACH(c, deps) { + times = hashmap_get(unit_times_hashmap, 
*c); + if (times_in_range(times, boot) && times->activated >= service_longest) + service_longest = times->activated; + } + + if (service_longest == 0) + return r; + + STRV_FOREACH(c, deps) { + times = hashmap_get(unit_times_hashmap, *c); + if (times_in_range(times, boot) && service_longest - times->activated <= arg_fuzz) + to_print++; + } + + if (!to_print) + return r; + + STRV_FOREACH(c, deps) { + times = hashmap_get(unit_times_hashmap, *c); + if (!times_in_range(times, boot) || service_longest - times->activated > arg_fuzz) + continue; + + to_print--; + + r = list_dependencies_print(*c, level, branches, to_print == 0, times, boot); + if (r < 0) + return r; + + if (strv_contains(*units, *c)) { + r = list_dependencies_print("...", level + 1, (branches << 1) | (to_print ? 1 : 0), + true, NULL, boot); + if (r < 0) + return r; + continue; + } + + r = list_dependencies_one(bus, *c, level + 1, units, (branches << 1) | (to_print ? 1 : 0)); + if (r < 0) + return r; + + if (to_print == 0) + break; + } + return 0; +} + +static int list_dependencies(sd_bus *bus, const char *name) { + _cleanup_strv_free_ char **units = NULL; + UnitTimes *times; + int r; + const char *id; + _cleanup_free_ char *path = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + BootTimes *boot; + + assert(bus); + + path = unit_dbus_path_from_name(name); + if (!path) + return -ENOMEM; + + r = sd_bus_get_property( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "Id", + &error, + &reply, + "s"); + if (r < 0) + return log_error_errno(r, "Failed to get ID: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &id); + if (r < 0) + return bus_log_parse_error(r); + + times = hashmap_get(unit_times_hashmap, id); + + r = acquire_boot_times(bus, /* require_finished = */ true, &boot); + if (r < 0) + return r; + + if (times) { + if (times->time) + printf("%s%s +%s%s\n", 
ansi_highlight_red(), id, + FORMAT_TIMESPAN(times->time, USEC_PER_MSEC), ansi_normal()); + else if (times->activated > boot->userspace_time) + printf("%s @%s\n", id, + FORMAT_TIMESPAN(times->activated - boot->userspace_time, USEC_PER_MSEC)); + else + printf("%s\n", id); + } + + return list_dependencies_one(bus, name, 0, &units, 0); +} + +int verb_critical_chain(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(unit_times_free_arrayp) UnitTimes *times = NULL; + int n, r; + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + n = acquire_time_data(bus, /* require_finished = */ true, &times); + if (n <= 0) + return n; + + for (UnitTimes *u = times; u->has_data; u++) { + r = hashmap_ensure_put(&unit_times_hashmap, &string_hash_ops, u->name, u); + if (r < 0) + return log_error_errno(r, "Failed to add entry to hashmap: %m"); + } + + pager_open(arg_pager_flags); + + puts("The time when unit became active or started is printed after the \"@\" character.\n" + "The time the unit took to start is printed after the \"+\" character.\n"); + + if (argc > 1) + STRV_FOREACH(name, strv_skip(argv, 1)) + list_dependencies(bus, *name); + else + list_dependencies(bus, SPECIAL_DEFAULT_TARGET); + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-critical-chain.h b/src/analyze/analyze-critical-chain.h new file mode 100644 index 0000000..844249c --- /dev/null +++ b/src/analyze/analyze-critical-chain.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_critical_chain(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-dot.c b/src/analyze/analyze-dot.c new file mode 100644 index 0000000..bf8aa81 --- /dev/null +++ b/src/analyze/analyze-dot.c @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-dot.h" +#include "bus-error.h" +#include "bus-locator.h" +#include 
"bus-unit-util.h" +#include "glob-util.h" +#include "terminal-util.h" + +static int graph_one_property( + sd_bus *bus, + const UnitInfo *u, + const char *prop, + const char *color, + char *patterns[], + char *from_patterns[], + char *to_patterns[]) { + + _cleanup_strv_free_ char **units = NULL; + bool match_patterns; + int r; + + assert(u); + assert(prop); + assert(color); + + match_patterns = strv_fnmatch(patterns, u->id); + + if (!strv_isempty(from_patterns) && !match_patterns && !strv_fnmatch(from_patterns, u->id)) + return 0; + + r = bus_get_unit_property_strv(bus, u->unit_path, prop, &units); + if (r < 0) + return r; + + STRV_FOREACH(unit, units) { + bool match_patterns2; + + match_patterns2 = strv_fnmatch(patterns, *unit); + + if (!strv_isempty(to_patterns) && !match_patterns2 && !strv_fnmatch(to_patterns, *unit)) + continue; + + if (!strv_isempty(patterns) && !match_patterns && !match_patterns2) + continue; + + printf("\t\"%s\"->\"%s\" [color=\"%s\"];\n", u->id, *unit, color); + } + + return 0; +} + +static int graph_one(sd_bus *bus, const UnitInfo *u, char *patterns[], char *from_patterns[], char *to_patterns[]) { + int r; + + assert(bus); + assert(u); + + if (IN_SET(arg_dot, DEP_ORDER, DEP_ALL)) { + r = graph_one_property(bus, u, "After", "green", patterns, from_patterns, to_patterns); + if (r < 0) + return r; + } + + if (IN_SET(arg_dot, DEP_REQUIRE, DEP_ALL)) { + r = graph_one_property(bus, u, "Requires", "black", patterns, from_patterns, to_patterns); + if (r < 0) + return r; + r = graph_one_property(bus, u, "Requisite", "darkblue", patterns, from_patterns, to_patterns); + if (r < 0) + return r; + r = graph_one_property(bus, u, "Wants", "grey66", patterns, from_patterns, to_patterns); + if (r < 0) + return r; + r = graph_one_property(bus, u, "Conflicts", "red", patterns, from_patterns, to_patterns); + if (r < 0) + return r; + } + + return 0; +} + +static int expand_patterns(sd_bus *bus, char **patterns, char ***ret) { + _cleanup_strv_free_ char 
**expanded_patterns = NULL; + int r; + + STRV_FOREACH(pattern, patterns) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *unit = NULL, *unit_id = NULL; + + if (strv_extend(&expanded_patterns, *pattern) < 0) + return log_oom(); + + if (string_is_glob(*pattern)) + continue; + + unit = unit_dbus_path_from_name(*pattern); + if (!unit) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + unit, + "org.freedesktop.systemd1.Unit", + "Id", + &error, + &unit_id); + if (r < 0) + return log_error_errno(r, "Failed to get ID: %s", bus_error_message(&error, r)); + + if (!streq(*pattern, unit_id)) { + if (strv_extend(&expanded_patterns, unit_id) < 0) + return log_oom(); + } + } + + *ret = TAKE_PTR(expanded_patterns); /* do not free */ + + return 0; +} + +int verb_dot(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_strv_free_ char **expanded_patterns = NULL; + _cleanup_strv_free_ char **expanded_from_patterns = NULL; + _cleanup_strv_free_ char **expanded_to_patterns = NULL; + int r; + UnitInfo u; + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + r = expand_patterns(bus, strv_skip(argv, 1), &expanded_patterns); + if (r < 0) + return r; + + r = expand_patterns(bus, arg_dot_from_patterns, &expanded_from_patterns); + if (r < 0) + return r; + + r = expand_patterns(bus, arg_dot_to_patterns, &expanded_to_patterns); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_systemd_mgr, "ListUnits", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list units: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); + if (r < 0) + return bus_log_parse_error(r); + + 
printf("digraph systemd {\n"); + + while ((r = bus_parse_unit_info(reply, &u)) > 0) { + + r = graph_one(bus, &u, expanded_patterns, expanded_from_patterns, expanded_to_patterns); + if (r < 0) + return r; + } + if (r < 0) + return bus_log_parse_error(r); + + printf("}\n"); + + log_info(" Color legend: black = Requires\n" + " dark blue = Requisite\n" + " dark grey = Wants\n" + " red = Conflicts\n" + " green = After\n"); + + if (on_tty() && !arg_quiet) + log_notice("-- You probably want to process this output with graphviz' dot tool.\n" + "-- Try a shell pipeline like 'systemd-analyze dot | dot -Tsvg > systemd.svg'!\n"); + + return 0; +} diff --git a/src/analyze/analyze-dot.h b/src/analyze/analyze-dot.h new file mode 100644 index 0000000..144b43d --- /dev/null +++ b/src/analyze/analyze-dot.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_dot(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-dump.c b/src/analyze/analyze-dump.c new file mode 100644 index 0000000..2642582 --- /dev/null +++ b/src/analyze/analyze-dump.c @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "analyze-dump.h" +#include "analyze.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "copy.h" + +static int dump_fallback(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *text; + int r; + + assert(bus); + + r = bus_call_method(bus, bus_systemd_mgr, "Dump", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to call Dump: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &text); + if (r < 0) + return bus_log_parse_error(r); + + fputs(text, stdout); + return 0; +} + +static int dump(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + 
_cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + r = bus_call_method(bus, bus_systemd_mgr, "DumpByFileDescriptor", &error, &reply, NULL); + if (IN_SET(r, -EACCES, -EBADR)) + return 0; /* Fall back to non-fd method. We need to do this even if the bus supports sending + * fds to cater to very old managers which didn't have the fd-based method. */ + if (r < 0) + return log_error_errno(r, "Failed to call DumpByFileDescriptor: %s", + bus_error_message(&error, r)); + + return dump_fd_reply(reply); +} + +static int dump_patterns_fallback(sd_bus *bus, char **patterns) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; + const char *text; + int r; + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "DumpUnitsMatchingPatterns"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, patterns); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to call DumpUnitsMatchingPatterns: %s", + bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &text); + if (r < 0) + return bus_log_parse_error(r); + + fputs(text, stdout); + return 0; +} + +static int dump_patterns(sd_bus *bus, char **patterns) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; + int r; + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "DumpUnitsMatchingPatternsByFileDescriptor"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, patterns); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to call DumpUnitsMatchingPatternsByFileDescriptor: %s", + bus_error_message(&error, r)); + + return dump_fd_reply(reply); +} 
+ +static int mangle_patterns(char **args, char ***ret) { + _cleanup_strv_free_ char **mangled = NULL; + int r; + + STRV_FOREACH(arg, args) { + char *t; + + r = unit_name_mangle_with_suffix(*arg, NULL, UNIT_NAME_MANGLE_GLOB, ".service", &t); + if (r < 0) + return log_error_errno(r, "Failed to mangle name '%s': %m", *arg); + + r = strv_consume(&mangled, t); + if (r < 0) + return log_oom(); + } + + if (strv_isempty(mangled)) + mangled = strv_free(mangled); + + *ret = TAKE_PTR(mangled); + return 0; +} + +int verb_dump(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_strv_free_ char **patterns = NULL; + int r; + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + pager_open(arg_pager_flags); + + r = mangle_patterns(strv_skip(argv, 1), &patterns); + if (r < 0) + return r; + + r = sd_bus_can_send(bus, SD_BUS_TYPE_UNIX_FD); + if (r < 0) + return log_error_errno(r, "Unable to determine if bus connection supports fd passing: %m"); + if (r > 0) + r = patterns ? dump_patterns(bus, patterns) : dump(bus); + if (r == 0) /* wasn't supported */ + r = patterns ? 
dump_patterns_fallback(bus, patterns) : dump_fallback(bus); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-dump.h b/src/analyze/analyze-dump.h new file mode 100644 index 0000000..5d6107c --- /dev/null +++ b/src/analyze/analyze-dump.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_dump(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-exit-status.c b/src/analyze/analyze-exit-status.c new file mode 100644 index 0000000..3a8d3f4 --- /dev/null +++ b/src/analyze/analyze-exit-status.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-exit-status.h" +#include "exit-status.h" +#include "format-table.h" + +int verb_exit_status(int argc, char *argv[], void *userdata) { /* Prints a name/status/class table: every named entry of exit_status_mappings when no argument is given, otherwise one row per argument. */ + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + table = table_new("name", "status", "class"); + if (!table) + return log_oom(); + + r = table_set_align_percent(table, table_get_cell(table, 0, 1), 100); + if (r < 0) + return log_error_errno(r, "Failed to right-align status: %m"); + + if (strv_isempty(strv_skip(argv, 1))) + for (size_t i = 0; i < ELEMENTSOF(exit_status_mappings); i++) { + if (!exit_status_mappings[i].name) + continue; /* slots without a name are not listed */ + + r = table_add_many(table, + TABLE_STRING, exit_status_mappings[i].name, + TABLE_INT, (int) i, + TABLE_STRING, exit_status_class(i)); + if (r < 0) + return table_log_add_error(r); + } + else + for (int i = 1; i < argc; i++) { + int status; + + status = exit_status_from_string(argv[i]); + if (status < 0) + return log_error_errno(status, "Invalid exit status \"%s\".", argv[i]); + + assert(status >= 0 && (size_t) status < ELEMENTSOF(exit_status_mappings)); /* guards the array index used just below */ + r = table_add_many(table, + TABLE_STRING, exit_status_mappings[status].name ?: "-", + TABLE_INT, status, + TABLE_STRING, exit_status_class(status) ?: "-"); + if (r < 0) + return table_log_add_error(r); + } + + pager_open(arg_pager_flags); + + r =
table_print(table, NULL); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-exit-status.h b/src/analyze/analyze-exit-status.h new file mode 100644 index 0000000..ce14cdb --- /dev/null +++ b/src/analyze/analyze-exit-status.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_exit_status(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-fdstore.c b/src/analyze/analyze-fdstore.c new file mode 100644 index 0000000..13db7f5 --- /dev/null +++ b/src/analyze/analyze-fdstore.c @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-fdstore.h" +#include "analyze.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "fd-util.h" +#include "format-table.h" + +static int dump_fdstore(sd_bus *bus, const char *arg) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *unit = NULL; + int r; + + assert(bus); + assert(arg); + + r = unit_name_mangle_with_suffix(arg, NULL, UNIT_NAME_MANGLE_GLOB, ".service", &unit); + if (r < 0) + return log_error_errno(r, "Failed to mangle name '%s': %m", arg); + + r = bus_call_method( + bus, + bus_systemd_mgr, + "DumpUnitFileDescriptorStore", + &error, + &reply, + "s", unit); + if (r < 0) + return log_error_errno(r, "Failed to call DumpUnitFileDescriptorStore: %s", + bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "(suuutuusu)"); + if (r < 0) + return bus_log_parse_error(r); + + table = table_new("fdname", "type", "devno", "inode", "rdevno", "path", "flags"); + if (!table) + return log_oom(); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + (void) table_set_align_percent(table, TABLE_HEADER_CELL(3), 100); + + for (;;) { + uint32_t mode, major, minor, rmajor, rminor, flags; + const char *fdname, *path; + uint64_t 
inode; + + r = sd_bus_message_read( + reply, + "(suuutuusu)", + &fdname, + &mode, + &major, &minor, + &inode, + &rmajor, &rminor, + &path, + &flags); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = table_add_many( + table, + TABLE_STRING, fdname, + TABLE_MODE_INODE_TYPE, mode, + TABLE_DEVNUM, makedev(major, minor), + TABLE_UINT64, inode, + TABLE_DEVNUM, makedev(rmajor, rminor), + TABLE_PATH, path, + TABLE_STRING, accmode_to_string(flags)); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF) && table_get_rows(table) <= 0) + log_info("No file descriptors in fdstore of '%s'.", unit); + else { + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, /* show_header= */true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + } + + return EXIT_SUCCESS; +} + +int verb_fdstore(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + STRV_FOREACH(arg, strv_skip(argv, 1)) { + r = dump_fdstore(bus, *arg); + if (r < 0) + return r; + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-fdstore.h b/src/analyze/analyze-fdstore.h new file mode 100644 index 0000000..0b990db --- /dev/null +++ b/src/analyze/analyze-fdstore.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +int verb_fdstore(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-filesystems.c b/src/analyze/analyze-filesystems.c new file mode 100644 index 0000000..582e04e --- /dev/null +++ b/src/analyze/analyze-filesystems.c @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-filesystems.h" +#include "fileio.h" +#include "filesystems.h" +#include 
"set.h" +#include "strv.h" +#include "terminal-util.h" + +static int load_available_kernel_filesystems(Set **ret) { + _cleanup_set_free_ Set *filesystems = NULL; + _cleanup_free_ char *t = NULL; + int r; + + assert(ret); + + /* Let's read the available filesystems */ + + r = read_virtual_file("/proc/filesystems", SIZE_MAX, &t, NULL); + if (r < 0) + return r; + + for (int i = 0;;) { + _cleanup_free_ char *line = NULL; + const char *p; + + r = string_extract_line(t, i++, &line); + if (r < 0) + return log_oom(); + if (r == 0) + break; + + if (!line) + line = t; + + p = strchr(line, '\t'); + if (!p) + continue; + + p += strspn(p, WHITESPACE); + + r = set_put_strdup(&filesystems, p); + if (r < 0) + return log_error_errno(r, "Failed to add filesystem to list: %m"); + } + + *ret = TAKE_PTR(filesystems); + return 0; +} + +static void filesystem_set_remove(Set *s, const FilesystemSet *set) { + NULSTR_FOREACH(filesystem, set->value) { + if (filesystem[0] == '@') + continue; + + free(set_remove(s, filesystem)); + } +} + +static void dump_filesystem_set(const FilesystemSet *set) { + int r; + + if (!set) + return; + + printf("%s%s%s\n" + " # %s\n", + ansi_highlight(), + set->name, + ansi_normal(), + set->help); + + NULSTR_FOREACH(filesystem, set->value) { + const statfs_f_type_t *magic; + + if (filesystem[0] == '@') { + printf(" %s%s%s\n", ansi_underline(), filesystem, ansi_normal()); + continue; + } + + r = fs_type_from_string(filesystem, &magic); + assert_se(r >= 0); + + printf(" %s", filesystem); + + for (size_t i = 0; magic[i] != 0; i++) { + const char *primary; + if (i == 0) + printf(" %s(magic: ", ansi_grey()); + else + printf(", "); + + printf("0x%llx", (unsigned long long) magic[i]); + + primary = fs_type_to_string(magic[i]); + if (primary && !streq(primary, filesystem)) + printf("[%s]", primary); + + if (magic[i+1] == 0) + printf(")%s", ansi_normal()); + } + + printf("\n"); + } +} + +int verb_filesystems(int argc, char *argv[], void *userdata) { + bool first = true; + 
+#if ! HAVE_LIBBPF + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Not compiled with libbpf support, sorry."); +#endif + + pager_open(arg_pager_flags); + + if (strv_isempty(strv_skip(argv, 1))) { + _cleanup_set_free_ Set *kernel = NULL, *known = NULL; + int k; /* result of reading the kernel list; only inspected after the groups were printed */ + + NULSTR_FOREACH(fs, filesystem_sets[FILESYSTEM_SET_KNOWN].value) + if (set_put_strdup(&known, fs) < 0) + return log_oom(); + + k = load_available_kernel_filesystems(&kernel); + + for (FilesystemGroups i = 0; i < _FILESYSTEM_SET_MAX; i++) { + const FilesystemSet *set = filesystem_sets + i; + if (!first) + puts(""); + + dump_filesystem_set(set); + filesystem_set_remove(kernel, set); /* whatever is left in 'kernel' after the loop belongs to no group */ + if (i != FILESYSTEM_SET_KNOWN) + filesystem_set_remove(known, set); + first = false; + } + + if (arg_quiet) /* Let's not show the extra stuff in quiet mode */ + return 0; + + if (!set_isempty(known)) { + _cleanup_free_ char **l = NULL; /* only the array is freed here; presumably the strings stay owned by the set (TODO confirm set_get_strv() semantics) */ + + printf("\n" + "# %sUngrouped filesystems%s (known but not included in any of the groups except @known):\n", + ansi_highlight(), ansi_normal()); + + l = set_get_strv(known); + if (!l) + return log_oom(); + + strv_sort(l); + + STRV_FOREACH(filesystem, l) { + const statfs_f_type_t *magic; + bool is_primary = false; + + assert_se(fs_type_from_string(*filesystem, &magic) >= 0); + + for (size_t i = 0; magic[i] != 0; i++) { + const char *primary; + + primary = fs_type_to_string(magic[i]); + assert(primary); + + if (streq(primary, *filesystem)) + is_primary = true; + } + + if (!is_primary) { + log_debug("Skipping ungrouped file system '%s', because it's an alias for another one.", *filesystem); + continue; + } + + printf("# %s\n", *filesystem); + } + } + + if (k < 0) { + fputc('\n', stdout); + fflush(stdout); + log_notice_errno(k, "# Not showing unlisted filesystems, couldn't retrieve kernel filesystem list: %m"); + } else if (!set_isempty(kernel)) { + _cleanup_free_ char **l = NULL; + + printf("\n" + "# %sUnlisted filesystems%s (available to the local kernel, but not included in any of the
groups listed above):\n", + ansi_highlight(), ansi_normal()); + + l = set_get_strv(kernel); + if (!l) + return log_oom(); + + strv_sort(l); + + STRV_FOREACH(filesystem, l) + printf("# %s\n", *filesystem); + } + } else + STRV_FOREACH(name, strv_skip(argv, 1)) { + const FilesystemSet *set; + + if (!first) + puts(""); + + set = filesystem_set_find(*name); + if (!set) { + /* make sure the error appears below normal output */ + fflush(stdout); + + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Filesystem set \"%s\" not found.", *name); + } + + dump_filesystem_set(set); + first = false; + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-filesystems.h b/src/analyze/analyze-filesystems.h new file mode 100644 index 0000000..0904571 --- /dev/null +++ b/src/analyze/analyze-filesystems.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_filesystems(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-image-policy.c b/src/analyze/analyze-image-policy.c new file mode 100644 index 0000000..0146b50 --- /dev/null +++ b/src/analyze/analyze-image-policy.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-image-policy.h" +#include "analyze.h" +#include "format-table.h" +#include "terminal-util.h" + +static int table_add_designator_line(Table *table, PartitionDesignator d, PartitionPolicyFlags f) { /* Appends one row (partition name, use-policy string, read-only flag, growfs flag) to the table; a negative d produces the row labelled "default". */ + _cleanup_free_ char *q = NULL; + const char *color; + int r; + + assert(table); + assert(f >= 0); + + if (partition_policy_flags_to_string(f & _PARTITION_POLICY_USE_MASK, /* simplify= */ true, &q) < 0) + return log_oom(); + + color = (f & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_IGNORE ?
ansi_grey() : + ((f & (PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT)) == + (PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT)) ? ansi_highlight_yellow() : + (f & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_ABSENT ? ansi_highlight_red() : + !(f & PARTITION_POLICY_UNPROTECTED) ? ansi_highlight_green() : NULL; /* precedence: ignore is grey, all five listed bits set is yellow, absent is red, unprotected bit clear is green, anything else gets no color */ + + if (d < 0) + r = table_add_many(table, + TABLE_STRING, "default", + TABLE_SET_COLOR, ansi_highlight_green(), + TABLE_STRING, q, + TABLE_SET_COLOR, color); + else + r = table_add_many(table, + TABLE_STRING, partition_designator_to_string(d), + TABLE_SET_COLOR, ansi_normal(), + TABLE_STRING, q, + TABLE_SET_COLOR, color); + if (r < 0) + return table_log_add_error(r); + + switch (f & _PARTITION_POLICY_READ_ONLY_MASK) { /* third column: on, off, or an empty cell for any other mask value */ + + case PARTITION_POLICY_READ_ONLY_ON: + r = table_add_many(table, TABLE_BOOLEAN, true); + break; + + case PARTITION_POLICY_READ_ONLY_OFF: + r = table_add_many(table, TABLE_BOOLEAN, false); + break; + + default: + r = table_add_many(table, TABLE_EMPTY); + break; + } + if (r < 0) + return table_log_add_error(r); + + switch (f & _PARTITION_POLICY_GROWFS_MASK) { /* fourth column, same three-way scheme */ + + case PARTITION_POLICY_GROWFS_ON: + r = table_add_many(table, TABLE_BOOLEAN, true); + break; + + case PARTITION_POLICY_GROWFS_OFF: + r = table_add_many(table, TABLE_BOOLEAN, false); + break; + + default: + r = table_add_many(table, TABLE_EMPTY); + break; + } + + if (r < 0) + return table_log_add_error(r); + + return 0; +} + +int verb_image_policy(int argc, char *argv[], void *userdata) { /* For every argument: resolves it to an ImagePolicy (one of the built-in '@' names or a parsed policy string), prints its string forms and a table with one row per partition designator plus the default row. */ + int r; + + for (int i = 1; i < argc; i++) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_(image_policy_freep) ImagePolicy *pbuf = NULL; + _cleanup_free_ char *as_string = NULL, *as_string_simplified = NULL; + const ImagePolicy *p; + + /* NB: The magic '@' strings are not officially documented for now, since we might change + *
around defaults (and in particular where precisely to reuse policy). We should document + * them once the dust has settled a bit. For now it's just useful for debugging and + * introspect our own defaults without guaranteeing API safety. */ + if (streq(argv[i], "@sysext")) + p = &image_policy_sysext; + else if (streq(argv[i], "@sysext-strict")) + p = &image_policy_sysext_strict; + else if (streq(argv[i], "@confext")) + p = &image_policy_confext; + else if (streq(argv[i], "@container")) + p = &image_policy_container; + else if (streq(argv[i], "@service")) + p = &image_policy_service; + else if (streq(argv[i], "@host")) + p = &image_policy_host; + else { + r = image_policy_from_string(argv[i], &pbuf); + if (r < 0) + return log_error_errno(r, "Failed to parse image policy '%s': %m", argv[i]); + + p = pbuf; /* pbuf keeps ownership; its cleanup attribute releases it at the end of this iteration */ + } + + r = image_policy_to_string(p, /* simplify= */ false, &as_string); + if (r < 0) + return log_error_errno(r, "Failed to format policy '%s' as string: %m", argv[i]); + + r = image_policy_to_string(p, /* simplify= */ true, &as_string_simplified); + if (r < 0) + return log_error_errno(r, "Failed to format policy '%s' as string: %m", argv[i]); + + pager_open(arg_pager_flags); + + if (streq(as_string, as_string_simplified)) /* print the long form only when it differs from the simplified one */ + printf("Analyzing policy: %s%s%s\n", ansi_highlight_magenta_underline(), as_string, ansi_normal()); + else + printf("Analyzing policy: %s%s%s\n" + " Long form: %s%s%s\n", + ansi_highlight(), as_string_simplified, ansi_normal(), + ansi_grey(), as_string, ansi_normal()); + + table = table_new("partition", "mode", "read-only", "growfs"); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + PartitionPolicyFlags f = image_policy_get_exhaustively(p, d); + assert(f >= 0); + + r = table_add_designator_line(table, d, f); + if (r < 0) + return r; + } + + r = table_add_designator_line(table, _PARTITION_DESIGNATOR_INVALID,
image_policy_default(p)); + if (r < 0) + return r; + + putc('\n', stdout); + + r = table_print(table, NULL); + if (r < 0) + return r; + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-image-policy.h b/src/analyze/analyze-image-policy.h new file mode 100644 index 0000000..fa08447 --- /dev/null +++ b/src/analyze/analyze-image-policy.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +int verb_image_policy(int argc, char *argv[], void *userdata); /* NOTE(review): this header has no #pragma once, unlike analyze-filesystems.h and analyze-inspect-elf.h */ diff --git a/src/analyze/analyze-inspect-elf.c b/src/analyze/analyze-inspect-elf.c new file mode 100644 index 0000000..70226a8 --- /dev/null +++ b/src/analyze/analyze-inspect-elf.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-inspect-elf.h" +#include "elf-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-table.h" +#include "format-util.h" +#include "json.h" +#include "path-util.h" +#include "strv.h" + +static int analyze_elf(char **filenames, JsonFormatFlags json_flags) { /* For every file: makes the path absolute, opens it, parses it with parse_elf_object() and prints either a vertical table (JSON_FORMAT_OFF set) or the metadata as JSON. Returns 0, or a negative value at the first failure. */ + int r; + + STRV_FOREACH(filename, filenames) { + _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL; + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_free_ char *abspath = NULL; + _cleanup_close_ int fd = -EBADF; + + r = path_make_absolute_cwd(*filename, &abspath); + if (r < 0) + return log_error_errno(r, "Could not make an absolute path out of \"%s\": %m", *filename); + + path_simplify(abspath); + + fd = RET_NERRNO(open(abspath, O_RDONLY|O_CLOEXEC)); + if (fd < 0) + return log_error_errno(fd, "Could not open \"%s\": %m", abspath); + + r = parse_elf_object(fd, abspath, /* fork_disable_dump= */false, NULL, &package_metadata); + if (r < 0) + return log_error_errno(r, "Parsing \"%s\" as ELF object failed: %m", abspath); + + t = table_new_vertical(); + if (!t) + return log_oom(); + + r = table_add_many( + t, + TABLE_FIELD, "path", + TABLE_STRING, abspath); + if (r < 0) + return table_log_add_error(r); + + if
(package_metadata) { + JsonVariant *module_json; + const char *module_name; + + JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, package_metadata) { + const char *field_name; + JsonVariant *field; + + /* The ELF type and architecture are added as top-level objects, + * since they are only parsed for the file itself, but the packaging + * metadata is parsed recursively in core files, so there might be + * multiple modules. */ + if (STR_IN_SET(module_name, "elfType", "elfArchitecture")) { + r = table_add_many( + t, + TABLE_FIELD, module_name, + TABLE_STRING, json_variant_string(module_json)); + if (r < 0) + return table_log_add_error(r); + + continue; + } + + /* path/elfType/elfArchitecture come first just once per file, + * then we might have multiple modules, so add a separator between + * them to make the output more readable. */ + r = table_add_many(t, TABLE_EMPTY, TABLE_EMPTY); + if (r < 0) + return table_log_add_error(r); + + /* In case of core files the module name will be the executable, + * but for binaries/libraries it's just the path, so don't print it + * twice.
*/ + if (!streq(abspath, module_name)) { + r = table_add_many( + t, + TABLE_FIELD, "module name", + TABLE_STRING, module_name); + if (r < 0) + return table_log_add_error(r); + } + + JSON_VARIANT_OBJECT_FOREACH(field_name, field, module_json) + if (json_variant_is_string(field)) { /* only string-valued fields become table rows */ + r = table_add_many( + t, + TABLE_FIELD, field_name, + TABLE_STRING, json_variant_string(field)); + if (r < 0) + return table_log_add_error(r); + } + } + } + if (json_flags & JSON_FORMAT_OFF) { + r = table_print(t, NULL); + if (r < 0) + return table_log_print_error(r); + } else + json_variant_dump(package_metadata, json_flags, stdout, NULL); + } + + return 0; +} + +int verb_elf_inspection(int argc, char *argv[], void *userdata) { /* Opens the pager, then runs analyze_elf() on all arguments after the verb name. */ + pager_open(arg_pager_flags); + + return analyze_elf(strv_skip(argv, 1), arg_json_format_flags); +} diff --git a/src/analyze/analyze-inspect-elf.h b/src/analyze/analyze-inspect-elf.h new file mode 100644 index 0000000..a790eae --- /dev/null +++ b/src/analyze/analyze-inspect-elf.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_elf_inspection(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-log-control.c b/src/analyze/analyze-log-control.c new file mode 100644 index 0000000..cead0e8 --- /dev/null +++ b/src/analyze/analyze-log-control.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-log-control.h" +#include "verb-log-control.h" + +int verb_log_control(int argc, char *argv[], void *userdata) { /* Accepts zero or one argument (asserted through argc) and forwards it, or NULL, to verb_log_control_common() for org.freedesktop.systemd1. */ + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + assert(IN_SET(argc, 1, 2)); + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + r = verb_log_control_common(bus, "org.freedesktop.systemd1", argv[0], argc == 2 ?
argv[1] : NULL); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-log-control.h b/src/analyze/analyze-log-control.h new file mode 100644 index 0000000..350c228 --- /dev/null +++ b/src/analyze/analyze-log-control.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_log_control(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-malloc.c b/src/analyze/analyze-malloc.c new file mode 100644 index 0000000..5e6ff5b --- /dev/null +++ b/src/analyze/analyze-malloc.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "analyze-malloc.h" +#include "analyze.h" +#include "bus-error.h" +#include "bus-internal.h" + +static int dump_malloc_info(sd_bus *bus, char *service) { /* Calls GetMallocInfo on the org.freedesktop.MemoryAllocation1 interface of the given service and passes the reply to dump_fd_reply(). */ + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(bus); + assert(service); + + r = sd_bus_call_method(bus, + service, + "/org/freedesktop/MemoryAllocation1", + "org.freedesktop.MemoryAllocation1", + "GetMallocInfo", + &error, + &reply, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to call GetMallocInfo on '%s': %s", service, bus_error_message(&error, r)); + + return dump_fd_reply(reply); +} + +int verb_malloc(int argc, char *argv[], void *userdata) { /* Queries every service named on the command line (each validated first), or org.freedesktop.systemd1 when none is given; fails with EOPNOTSUPP if the bus cannot pass fds. */ + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + char **services = STRV_MAKE("org.freedesktop.systemd1"); + int r; + + if (!strv_isempty(strv_skip(argv, 1))) { + services = strv_skip(argv, 1); + STRV_FOREACH(service, services) + if (!service_name_is_valid(*service)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "D-Bus service name '%s' is not valid.", *service); + } + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + r = sd_bus_can_send(bus, SD_BUS_TYPE_UNIX_FD); + if (r < 0) + return log_error_errno(r, "Unable to determine if bus
connection supports fd passing: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unable to receive FDs over D-Bus."); + + pager_open(arg_pager_flags); + + STRV_FOREACH(service, services) { + r = dump_malloc_info(bus, *service); + if (r < 0) + return r; + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-malloc.h b/src/analyze/analyze-malloc.h new file mode 100644 index 0000000..d3feabd --- /dev/null +++ b/src/analyze/analyze-malloc.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +int verb_malloc(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-pcrs.c b/src/analyze/analyze-pcrs.c new file mode 100644 index 0000000..ed907f7 --- /dev/null +++ b/src/analyze/analyze-pcrs.c @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-pcrs.h" +#include "fileio.h" +#include "format-table.h" +#include "hexdecoct.h" +#include "terminal-util.h" +#include "tpm2-util.h" + +static int get_pcr_alg(const char **ret) { /* Probes /sys/class/tpm/tpm0/pcr-ALG/0 for "sha256", then "sha1". Returns 1 with *ret set to the first algorithm whose file exists, 0 with *ret = NULL if neither exists, negative on other errors. */ + assert(ret); + + FOREACH_STRING(alg, "sha256", "sha1") { + _cleanup_free_ char *p = NULL; + + if (asprintf(&p, "/sys/class/tpm/tpm0/pcr-%s/0", alg) < 0) + return log_oom(); + + if (access(p, F_OK) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine whether %s exists: %m", p); + } else { + *ret = alg; + return 1; + } + } + + log_notice("Kernel does not support reading PCR values."); + *ret = NULL; + return 0; +} + +static int get_current_pcr(const char *alg, uint32_t pcr, void **ret, size_t *ret_size) { /* Reads the sysfs file of the given algorithm and PCR index and hex-decodes its content into a newly allocated buffer returned via *ret and *ret_size. */ + _cleanup_free_ char *p = NULL, *s = NULL; + _cleanup_free_ void *buf = NULL; + size_t ss = 0, bufsize = 0; + int r; + + assert(alg); + assert(ret); + assert(ret_size); + + if (asprintf(&p, "/sys/class/tpm/tpm0/pcr-%s/%" PRIu32, alg, pcr) < 0) + return log_oom(); + + r = read_virtual_file(p, 4096, &s, &ss); + if (r < 0) + return log_error_errno(r, "Failed to read '%s': %m", p); + + r =
unhexmem(s, ss, &buf, &bufsize); + if (r < 0) + return log_error_errno(r, "Failed to decode hex PCR data '%s': %m", s); + + *ret = TAKE_PTR(buf); + *ret_size = bufsize; + return 0; +} + +static int add_pcr_to_table(Table *table, const char *alg, uint32_t pcr) { /* Appends one row (index, name, hex value) for the given PCR; when alg is NULL no value is read and the hash cell stays unset. */ + _cleanup_free_ char *h = NULL; + const char *color = NULL; + int r; + + if (alg) { + _cleanup_free_ void *buf = NULL; + size_t bufsize = 0; + + r = get_current_pcr(alg, pcr, &buf, &bufsize); + if (r < 0) + return r; + + h = hexmem(buf, bufsize); + if (!h) + return log_oom(); + + /* Grey out PCRs that are not sensibly initialized */ + if (memeqbyte(0, buf, bufsize) || + memeqbyte(0xFFU, buf, bufsize)) + color = ANSI_GREY; + } + + r = table_add_many(table, + TABLE_UINT32, pcr, + TABLE_STRING, tpm2_pcr_index_to_string(pcr), + TABLE_STRING, h, + TABLE_SET_COLOR, color); + if (r < 0) + return table_log_add_error(r); + + return 0; +} + +int verb_pcrs(int argc, char *argv[], void *userdata) { /* Prints a table of PCRs: all defined indexes when no argument is given, otherwise the named ones sorted by index. The hash column is hidden when no algorithm could be determined. */ + _cleanup_(table_unrefp) Table *table = NULL; + const char *alg = NULL; + int r; + + if (tpm2_support() != TPM2_SUPPORT_FULL) + log_notice("System lacks full TPM2 support, not showing PCR state."); + else { + r = get_pcr_alg(&alg); + if (r < 0) + return r; + } + + table = table_new("nr", "name", alg ?: "-"); + if (!table) + return log_oom(); + + (void) table_set_align_percent(table, table_get_cell(table, 0, 0), 100); + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + if (!alg) /* hide hash column if we couldn't acquire it */ + (void) table_set_display(table, (size_t) 0, (size_t) 1); /* explicit size_t, matching the table_set_sort() call in this function */ + + if (strv_isempty(strv_skip(argv, 1))) + for (uint32_t pi = 0; pi < _TPM2_PCR_INDEX_MAX_DEFINED; pi++) { + r = add_pcr_to_table(table, alg, pi); + if (r < 0) + return r; + } + else { + for (int i = 1; i < argc; i++) { + int pi; + + pi = tpm2_pcr_index_from_string(argv[i]); + if (pi < 0) + return log_error_errno(pi, "PCR index \"%s\" not known.", argv[i]); + + r = add_pcr_to_table(table, alg, pi); + if (r < 0) + return r; + } + + (void)
table_set_sort(table, (size_t) 0); + } + + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, /* show_header= */true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-pcrs.h b/src/analyze/analyze-pcrs.h new file mode 100644 index 0000000..2a59511 --- /dev/null +++ b/src/analyze/analyze-pcrs.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_pcrs(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-plot.c b/src/analyze/analyze-plot.c new file mode 100644 index 0000000..81fc25b --- /dev/null +++ b/src/analyze/analyze-plot.c @@ -0,0 +1,493 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-plot.h" +#include "analyze-time-data.h" +#include "analyze.h" +#include "bus-error.h" +#include "bus-map-properties.h" +#include "format-table.h" +#include "os-util.h" +#include "sort-util.h" +#include "strv.h" +#include "unit-def.h" +#include "version.h" + +#define SCALE_X (0.1 / 1000.0) /* pixels per us */ +#define SCALE_Y (20.0) + +#define svg(...) printf(__VA_ARGS__) + +#define svg_bar(class, x1, x2, y) \ + svg(" \n", \ + (class), \ + SCALE_X * (x1), SCALE_Y * (y), \ + SCALE_X * ((x2) - (x1)), SCALE_Y - 1.0) + +#define svg_text(b, x, y, format, ...) \ + do { \ + svg(" ", (b) ? "left" : "right", SCALE_X * (x) + (b ? 
5.0 : -5.0), SCALE_Y * (y) + 14.0); \ + svg(format, ## __VA_ARGS__); \ + svg("\n"); \ + } while (false) + + +typedef struct HostInfo { + char *hostname; + char *kernel_name; + char *kernel_release; + char *kernel_version; + char *os_pretty_name; + char *virtualization; + char *architecture; +} HostInfo; + +static HostInfo *free_host_info(HostInfo *hi) { + if (!hi) + return NULL; + + free(hi->hostname); + free(hi->kernel_name); + free(hi->kernel_release); + free(hi->kernel_version); + free(hi->os_pretty_name); + free(hi->virtualization); + free(hi->architecture); + return mfree(hi); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(HostInfo *, free_host_info); + +static int acquire_host_info(sd_bus *bus, HostInfo **hi) { + static const struct bus_properties_map hostname_map[] = { + { "Hostname", "s", NULL, offsetof(HostInfo, hostname) }, + { "KernelName", "s", NULL, offsetof(HostInfo, kernel_name) }, + { "KernelRelease", "s", NULL, offsetof(HostInfo, kernel_release) }, + { "KernelVersion", "s", NULL, offsetof(HostInfo, kernel_version) }, + { "OperatingSystemPrettyName", "s", NULL, offsetof(HostInfo, os_pretty_name) }, + {} + }; + + static const struct bus_properties_map manager_map[] = { + { "Virtualization", "s", NULL, offsetof(HostInfo, virtualization) }, + { "Architecture", "s", NULL, offsetof(HostInfo, architecture) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL; + _cleanup_(free_host_infop) HostInfo *host = NULL; + int r; + + host = new0(HostInfo, 1); + if (!host) + return log_oom(); + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) { + r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, &system_bus); + if (r < 0) { + log_debug_errno(r, "Failed to connect to system bus, ignoring: %m"); + goto manager; + } + } + + r = bus_map_all_properties( + system_bus ?: bus, + "org.freedesktop.hostname1", + "/org/freedesktop/hostname1", + hostname_map, + 
BUS_MAP_STRDUP, + &error, + NULL, + host); + if (r < 0) { + log_debug_errno(r, "Failed to get host information from systemd-hostnamed, ignoring: %s", + bus_error_message(&error, r)); + sd_bus_error_free(&error); /* reset so the same error object can be reused for the manager query */ + } + +manager: + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + manager_map, + BUS_MAP_STRDUP, + &error, + NULL, + host); + if (r < 0) + return log_error_errno(r, "Failed to get host information from systemd: %s", + bus_error_message(&error, r)); + + *hi = TAKE_PTR(host); + return 0; +} + +static int compare_unit_start(const UnitTimes *a, const UnitTimes *b) { /* Orders two UnitTimes entries by their activating timestamp via CMP(). */ + return CMP(a->activating, b->activating); +} + +static void svg_graph_box(double height, double begin, double end) { /* Emits the background box and one grid mark every 100000 us from begin (rounded toward zero to a multiple of 100000) up to end; marks at multiples of 1 s and 5 s also print a "%.01fs" label. NOTE(review): the literals below carry fewer conversions than arguments; the SVG element markup appears to have been lost from this copy and must be restored from a pristine source. */ + /* outside box, fill */ + svg("\n", + SCALE_X * (end - begin), + SCALE_Y * height); + + for (long long i = ((long long) (begin / 100000)) * 100000; i <= end; i += 100000) { + /* lines for each second */ + if (i % 5000000 == 0) + svg(" \n" + " %.01fs\n", + SCALE_X * i, + SCALE_X * i, + SCALE_Y * height, + SCALE_X * i, + -5.0, + 0.000001 * i); + else if (i % 1000000 == 0) + svg(" \n" + " %.01fs\n", + SCALE_X * i, + SCALE_X * i, + SCALE_Y * height, + SCALE_X * i, + -5.0, + 0.000001 * i); + else + svg(" \n", + SCALE_X * i, + SCALE_X * i, + SCALE_Y * height); + } +} + +static void plot_tooltip(const UnitTimes *ut) { /* Prints the unit name followed by, for each listed dependency type with entries, the type name and the dependency names. */ + assert(ut); + assert(ut->name); + + svg("%s:\n", ut->name); + + UnitDependency i; + VA_ARGS_FOREACH(i, UNIT_AFTER, UNIT_BEFORE, UNIT_REQUIRES, UNIT_REQUISITE, UNIT_WANTS, UNIT_CONFLICTS, UNIT_UPHOLDS) + if (!strv_isempty(ut->deps[i])) { + svg("\n%s:\n", unit_dependency_to_string(i)); + STRV_FOREACH(s, ut->deps[i]) + svg(" %s\n", *s); + } +} + +static int plot_unit_times(UnitTimes *u, double width, int y) { /* Draws the tooltip, the three state bars and the label of one unit on row y. Returns 1 if a row was emitted, 0 if the unit has no name. NOTE(review): the bare svg("\n") and svg("") calls look like stripped wrapper markup. */ + bool b; + + if (!u->name) + return 0; + + svg("\n"); + svg(""); + plot_tooltip(u); + svg("\n"); + svg_bar("activating", u->activating, u->activated, y); + svg_bar("active", u->activated, u->deactivating, y); +
svg_bar("deactivating", u->deactivating, u->deactivated, y); + + /* place the text on the left if we have passed the half of the svg width */ + b = u->activating * SCALE_X < width / 2; + if (u->time) + svg_text(b, u->activating, y, "%s (%s)", + u->name, FORMAT_TIMESPAN(u->time, USEC_PER_MSEC)); + else + svg_text(b, u->activating, y, "%s", u->name); + svg("\n"); + + return 1; +} + +static void limit_times_to_boot(const BootTimes *boot, UnitTimes *u) { + if (u->deactivated > u->activating && u->deactivated <= boot->finish_time && u->activated == 0 + && u->deactivating == 0) + u->activated = u->deactivating = u->deactivated; + if (u->activated < u->activating || u->activated > boot->finish_time) + u->activated = boot->finish_time; + if (u->deactivating < u->activated || u->deactivating > boot->finish_time) + u->deactivating = boot->finish_time; + if (u->deactivated < u->deactivating || u->deactivated > boot->finish_time) + u->deactivated = boot->finish_time; +} + +static int produce_plot_as_svg( + UnitTimes *times, + const HostInfo *host, + const BootTimes *boot, + const char *pretty_times) { + int m = 1, y = 0; + UnitTimes *u; + double width; + + width = SCALE_X * (boot->firmware_time + boot->finish_time); + if (width < 800.0) + width = 800.0; + + if (boot->firmware_time > boot->loader_time) + m++; + if (timestamp_is_set(boot->loader_time)) { + m++; + if (width < 1000.0) + width = 1000.0; + } + if (timestamp_is_set(boot->initrd_time)) + m++; + if (timestamp_is_set(boot->kernel_done_time)) + m++; + + for (u = times; u->has_data; u++) { + double text_start, text_width; + + if (u->activating > boot->finish_time) { + unit_times_clear(u); + continue; + } + + /* If the text cannot fit on the left side then + * increase the svg width so it fits on the right. 
+ * TODO: calculate the text width more accurately */ + text_width = 8.0 * strlen(u->name); + text_start = (boot->firmware_time + u->activating) * SCALE_X; + if (text_width > text_start && text_width + text_start > width) + width = text_width + text_start; + + limit_times_to_boot(boot, u); + + m++; + } +/* NOTE(review): the svg() format strings from here on contain no markup and no conversions, although arguments such as the canvas size, GIT_VERSION and the x offset are passed with them; the SVG/XML text appears to have been stripped from this copy and must be restored from upstream. */ + svg("\n" + "\n"); + + svg("\n\n", + 80.0 + width, 150.0 + (m * SCALE_Y) + + 5 * SCALE_Y /* legend */); + + /* write some basic info as a comment, including some help */ + svg("\n" + "\n" + "\n" + "\n" + "\n\n" + "\n\n", GIT_VERSION); + + /* style sheet */ + svg("\n \n\n\n"); + + svg("\n"); + svg("%s", pretty_times); + if (host) + svg("%s %s (%s %s %s) %s %s", + os_release_pretty_name(host->os_pretty_name, NULL), + strempty(host->hostname), + strempty(host->kernel_name), + strempty(host->kernel_release), + strempty(host->kernel_version), + strempty(host->architecture), + strempty(host->virtualization)); + + svg("\n", 20.0 + (SCALE_X * boot->firmware_time)); + svg_graph_box(m, -(double) boot->firmware_time, boot->finish_time); +/* Boot stage rows. Firmware and loader are drawn at negative offsets, so they end at 0 where the kernel bar begins. */ + if (timestamp_is_set(boot->firmware_time)) { + svg_bar("firmware", -(double) boot->firmware_time, -(double) boot->loader_time, y); + svg_text(true, -(double) boot->firmware_time, y, "firmware"); + y++; + } + if (timestamp_is_set(boot->loader_time)) { + svg_bar("loader", -(double) boot->loader_time, 0, y); + svg_text(true, -(double) boot->loader_time, y, "loader"); + y++; + } + if (timestamp_is_set(boot->kernel_done_time)) { + svg_bar("kernel", 0, boot->kernel_done_time, y); + svg_text(true, 0, y, "kernel"); + y++; + } + if (timestamp_is_set(boot->initrd_time)) { + svg_bar("initrd", boot->initrd_time, boot->userspace_time, y); + if (boot->initrd_security_start_time < boot->initrd_security_finish_time) + svg_bar("security", boot->initrd_security_start_time, boot->initrd_security_finish_time, y); + if (boot->initrd_generators_start_time < boot->initrd_generators_finish_time) + svg_bar("generators", boot->initrd_generators_start_time,
boot->initrd_generators_finish_time, y); + if (boot->initrd_unitsload_start_time < boot->initrd_unitsload_finish_time) + svg_bar("unitsload", boot->initrd_unitsload_start_time, boot->initrd_unitsload_finish_time, y); + svg_text(true, boot->initrd_time, y, "initrd"); + y++; + } +/* Units that began activating before userspace_time are drawn above the systemd row; the second loop further down continues from the same position with the rest. This relies on times being ordered by activating, which verb_plot() establishes by sorting before the call. */ + for (u = times; u->has_data; u++) { + if (u->activating >= boot->userspace_time) + break; + + y += plot_unit_times(u, width, y); + } + + svg_bar("active", boot->userspace_time, boot->finish_time, y); + if (timestamp_is_set(boot->security_start_time)) + svg_bar("security", boot->security_start_time, boot->security_finish_time, y); + svg_bar("generators", boot->generators_start_time, boot->generators_finish_time, y); + svg_bar("unitsload", boot->unitsload_start_time, boot->unitsload_finish_time, y); + svg_text(true, boot->userspace_time, y, "systemd"); + y++; + + for (; u->has_data; u++) + y += plot_unit_times(u, width, y); + + svg("\n"); + + /* Legend */ + svg("\n"); + y++; + svg_bar("activating", 0, 300000, y); + svg_text(true, 400000, y, "Activating"); + y++; + svg_bar("active", 0, 300000, y); + svg_text(true, 400000, y, "Active"); + y++; + svg_bar("deactivating", 0, 300000, y); + svg_text(true, 400000, y, "Deactivating"); + y++; + if (timestamp_is_set(boot->security_start_time)) { + svg_bar("security", 0, 300000, y); + svg_text(true, 400000, y, "Setting up security module"); + y++; + } + svg_bar("generators", 0, 300000, y); + svg_text(true, 400000, y, "Generators"); + y++; + svg_bar("unitsload", 0, 300000, y); + svg_text(true, 400000, y, "Loading unit files"); + y++; + + svg("\n\n"); + + svg("\n"); + + return 0; +} +/* Prints table when it has more than one row, as JSON unless arg_json_format_flags has JSON_FORMAT_OFF set, otherwise as text. When arg_legend is set it then prints a summary line using word as the noun; the count shown is the row count minus one (presumably excluding the header row). Returns 0, or the value of table_log_print_error() if printing fails. */ +static int show_table(Table *table, const char *word) { + int r; + + assert(table); + assert(word); + + if (table_get_rows(table) > 1) { + table_set_header(table, arg_legend); + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + r = table_print_json(table, NULL, arg_json_format_flags | JSON_FORMAT_COLOR_AUTO); + else + r = table_print(table, NULL); + if (r < 0) +
return table_log_print_error(r); + } + + if (arg_legend) { + if (table_get_rows(table) > 1) + printf("\n%zu %s listed.\n", table_get_rows(table) - 1, word); + else + printf("No %s.\n", word); + } + + return 0; +} + +static int produce_plot_as_text(UnitTimes *times, const BootTimes *boot) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + table = table_new("name", "activated", "activating", "time", "deactivated", "deactivating"); + if (!table) + return log_oom(); + + for (; times->has_data; times++) { + limit_times_to_boot(boot, times); + + r = table_add_many( + table, + TABLE_STRING, times->name, + TABLE_TIMESPAN_MSEC, times->activated, + TABLE_TIMESPAN_MSEC, times->activating, + TABLE_TIMESPAN_MSEC, times->time, + TABLE_TIMESPAN_MSEC, times->deactivated, + TABLE_TIMESPAN_MSEC, times->deactivating); + if (r < 0) + return table_log_add_error(r); + } + + return show_table(table, "Units"); +} + +int verb_plot(int argc, char *argv[], void *userdata) { + _cleanup_(free_host_infop) HostInfo *host = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(unit_times_free_arrayp) UnitTimes *times = NULL; + _cleanup_free_ char *pretty_times = NULL; + bool use_full_bus = arg_runtime_scope == RUNTIME_SCOPE_SYSTEM; + BootTimes *boot; + int n, r; + + r = acquire_bus(&bus, &use_full_bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + n = acquire_boot_times(bus, /* require_finished = */ true, &boot); + if (n < 0) + return n; + + n = pretty_boot_time(bus, &pretty_times); + if (n < 0) + return n; + + if (use_full_bus || arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) { + n = acquire_host_info(bus, &host); + if (n < 0) + return n; + } + + n = acquire_time_data(bus, /* require_finished = */ true, ×); + if (n <= 0) + return n; + + typesafe_qsort(times, n, compare_unit_start); + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF) || arg_table) + r = produce_plot_as_text(times, boot); + else + r = produce_plot_as_svg(times, host, boot, 
pretty_times); + if (r < 0) + return r; + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-plot.h b/src/analyze/analyze-plot.h new file mode 100644 index 0000000..eb2e398 --- /dev/null +++ b/src/analyze/analyze-plot.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once +/* Handler for the plot verb; argc, argv and userdata follow the same signature as the definition in analyze-plot.c. */ +int verb_plot(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-security.c b/src/analyze/analyze-security.c new file mode 100644 index 0000000..5f1b5e6 --- /dev/null +++ b/src/analyze/analyze-security.c @@ -0,0 +1,2956 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +/* NOTE(review): the #include directive above has no header name in this copy; its angle-bracket argument appears to have been stripped and must be restored from upstream. */ +#include "af-list.h" +#include "analyze.h" +#include "analyze-security.h" +#include "analyze-verify.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "copy.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "in-addr-prefix-util.h" +#include "locale-util.h" +#include "macro.h" +#include "manager.h" +#include "missing_capability.h" +#include "missing_sched.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "seccomp-util.h" +#include "service.h" +#include "set.h" +#include "stdio-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "unit-def.h" +#include "unit-name.h" +#include "unit-serialize.h" +/* Security-relevant settings of a single unit, as consumed by the assess_*() callbacks. The char pointer, string vector and Set members are owned by the structure and released by security_info_free(). */ +typedef struct SecurityInfo { + char *id; + char *type; + char *load_state; + char *fragment_path; + bool default_dependencies; + + uint64_t ambient_capabilities; + uint64_t capability_bounding_set; + + char *user; + char **supplementary_groups; + bool dynamic_user; + + bool ip_address_deny_all; + bool ip_address_allow_localhost; + bool ip_address_allow_other; + + bool ip_filters_custom_ingress; + bool ip_filters_custom_egress; + + char *keyring_mode; + char *protect_proc; + char *proc_subset; + bool lock_personality; +
bool memory_deny_write_execute; + bool no_new_privileges; + char *notify_access; + bool protect_hostname; + + bool private_devices; + bool private_mounts; + bool private_network; + bool private_tmp; + bool private_users; + + bool protect_control_groups; + bool protect_kernel_modules; + bool protect_kernel_tunables; + bool protect_kernel_logs; + bool protect_clock; + + char *protect_home; + char *protect_system; + + bool remove_ipc; + + bool restrict_address_family_inet; + bool restrict_address_family_unix; + bool restrict_address_family_netlink; + bool restrict_address_family_packet; + bool restrict_address_family_other; + + unsigned long long restrict_namespaces; + bool restrict_realtime; + bool restrict_suid_sgid; + + char *root_directory; + char *root_image; + + bool delegate; + char *device_policy; + char **device_allow; + + Set *system_call_architectures; + + bool system_call_filter_allow_list; + Set *system_call_filter; + + mode_t _umask; +} SecurityInfo; +/* Describes one security check: identifier and JSON field name, the description strings and url, weight and range (presumably scoring inputs; their use lies outside this excerpt), and the assess() callback. The callback receives the assessor itself, the SecurityInfo, an opaque data pointer, and output slots for a badness value and an optional heap-allocated description. */ +struct security_assessor { + const char *id; + const char *json_field; + const char *description_good; + const char *description_bad; + const char *description_na; + const char *url; + uint64_t weight; + uint64_t range; + int (*assess)( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description); + size_t offset; + uint64_t parameter; + bool default_dependencies_only; +}; +/* Allocates a SecurityInfo whose fields are zero except for these defaults: default_dependencies true, capability_bounding_set and restrict_namespaces with all bits set, and _umask 0002. Returns NULL if the allocation fails. */ +static SecurityInfo *security_info_new(void) { + SecurityInfo *info = new(SecurityInfo, 1); + if (!info) + return NULL; + + *info = (SecurityInfo) { + .default_dependencies = true, + .capability_bounding_set = UINT64_MAX, + .restrict_namespaces = UINT64_MAX, + ._umask = 0002, + }; + + return info; +} +/* Releases a SecurityInfo together with every string, string vector and Set it owns. NULL is accepted and yields NULL; otherwise the result of mfree() is returned (presumably NULL) so callers can reset their pointer in one statement. */ +static SecurityInfo *security_info_free(SecurityInfo *i) { + if (!i) + return NULL; + + free(i->id); + free(i->type); + free(i->load_state); + free(i->fragment_path); + + free(i->user); + + free(i->protect_home); + free(i->protect_system); + +
free(i->root_directory); + free(i->root_image); + + free(i->keyring_mode); + free(i->protect_proc); + free(i->proc_subset); + free(i->notify_access); + + free(i->device_policy); + strv_free(i->device_allow); + + strv_free(i->supplementary_groups); + set_free(i->system_call_architectures); + set_free(i->system_call_filter); + + return mfree(i); +} +/* Generates the cleanup helper for SecurityInfo pointers on top of security_info_free(). */ +DEFINE_TRIVIAL_CLEANUP_FUNC(SecurityInfo*, security_info_free); +/* Returns true when the user is 0 or root, false when dynamic_user is set, and otherwise true only if no user is configured (isempty). */ +static bool security_info_runs_privileged(const SecurityInfo *i) { + assert(i); + + if (STRPTR_IN_SET(i->user, "0", "root")) + return true; + + if (i->dynamic_user) + return false; + + return isempty(i->user); +} +/* Generic boolean check. data must point at a bool. With a non-zero a->parameter the badness is the value of that bool; with a zero parameter it is the negation. No description is produced. Always returns 0. */ +static int assess_bool( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + const bool *b = ASSERT_PTR(data); + + assert(ret_badness); + assert(ret_description); + + *ret_badness = a->parameter ? *b : !*b; + *ret_description = NULL; + + return 0; +} +/* Rates the configured user: badness 9 for NOBODY_USER_NAME, 0 for a dynamic user or a static user other than 0, root or the empty string, and 10 with no description in every other case. On success the description is a heap copy handed to the caller; returns 0, or the value of log_oom() if that copy cannot be allocated. NOTE(review): the dynamic_user branch passes info->user to STR_IN_SET() without a NULL check, whereas security_info_runs_privileged() uses STRPTR_IN_SET() for the same comparison; confirm STR_IN_SET() tolerates NULL or that user is always set when dynamic_user is. NOTE(review): the first description reads runs under as, which looks like a wording slip. */ +static int assess_user( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + _cleanup_free_ char *d = NULL; + uint64_t b; + + assert(ret_badness); + assert(ret_description); + + if (streq_ptr(info->user, NOBODY_USER_NAME)) { + d = strdup("Service runs under as '" NOBODY_USER_NAME "' user, which should not be used for services"); + b = 9; + } else if (info->dynamic_user && !STR_IN_SET(info->user, "0", "root")) { + d = strdup("Service runs under a transient non-root user identity"); + b = 0; + } else if (info->user && !STR_IN_SET(info->user, "0", "root", "")) { + d = strdup("Service runs under a static non-root user identity"); + b = 0; + } else { + *ret_badness = 10; + *ret_description = NULL; + return 0; + } + + if (!d) + return log_oom(); + + *ret_badness = b; + *ret_description = TAKE_PTR(d); + + return 0; +} +/* Rates info->protect_home: badness 10 by default, 5 for read-only, 1 for tmpfs, and 0 when the value parses as a true boolean. The description is returned as a heap copy owned by the caller. Returns 0, or the value of log_oom() when the copy fails. */ +static int assess_protect_home( + const struct security_assessor *a, + const SecurityInfo *info, + const
void *data, + uint64_t *ret_badness, + char **ret_description) { + + const char *description; + uint64_t badness; + char *copy; + int r; + + assert(ret_badness); + assert(ret_description); + + badness = 10; + description = "Service has full access to home directories"; + + r = parse_boolean(info->protect_home); + if (r < 0) { + if (streq_ptr(info->protect_home, "read-only")) { + badness = 5; + description = "Service has read-only access to home directories"; + } else if (streq_ptr(info->protect_home, "tmpfs")) { + badness = 1; + description = "Service has access to fake empty home directories"; + } + } else if (r > 0) { + badness = 0; + description = "Service has no access to home directories"; + } + + copy = strdup(description); + if (!copy) + return log_oom(); + + *ret_badness = badness; + *ret_description = copy; + + return 0; +} +/* Rates info->protect_system: badness 10 by default, 5 when the value parses as a true boolean, 3 for full, and 0 for strict. The description is returned as a heap copy owned by the caller. Returns 0, or the value of log_oom() when the copy fails. */ +static int assess_protect_system( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + const char *description; + uint64_t badness; + char *copy; + int r; + + assert(ret_badness); + assert(ret_description); + + badness = 10; + description = "Service has full access to the OS file hierarchy"; + + r = parse_boolean(info->protect_system); + if (r < 0) { + if (streq_ptr(info->protect_system, "full")) { + badness = 3; + description = "Service has very limited write access to the OS file hierarchy"; + } else if (streq_ptr(info->protect_system, "strict")) { + badness = 0; + description = "Service has strict read-only access to the OS file hierarchy"; + } + } else if (r > 0) { + badness = 5; + description = "Service has limited write access to the OS file hierarchy"; + } + + copy = strdup(description); + if (!copy) + return log_oom(); + + *ret_badness = badness; + *ret_description = copy; + + return 0; +} +/* Badness is 1 when empty_or_root() holds for both root_directory and root_image, and 0 otherwise. No description is produced. Always returns 0. */ +static int assess_root_directory( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, +
char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + *ret_badness = + empty_or_root(info->root_directory) && + empty_or_root(info->root_image); + *ret_description = NULL; + + return 0; +} + +static int assess_capability_bounding_set( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + *ret_badness = !!(info->capability_bounding_set & a->parameter); + *ret_description = NULL; + + return 0; +} + +static int assess_umask( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + char *copy = NULL; + const char *d; + uint64_t b; + + assert(ret_badness); + assert(ret_description); + + if (!FLAGS_SET(info->_umask, 0002)) { + d = "Files created by service are world-writable by default"; + b = 10; + } else if (!FLAGS_SET(info->_umask, 0004)) { + d = "Files created by service are world-readable by default"; + b = 5; + } else if (!FLAGS_SET(info->_umask, 0020)) { + d = "Files created by service are group-writable by default"; + b = 2; + } else if (!FLAGS_SET(info->_umask, 0040)) { + d = "Files created by service are group-readable by default"; + b = 1; + } else { + d = "Files created by service are accessible only by service's own user by default"; + b = 0; + } + + copy = strdup(d); + if (!copy) + return log_oom(); + + *ret_badness = b; + *ret_description = copy; + + return 0; +} + +static int assess_keyring_mode( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + *ret_badness = !streq_ptr(info->keyring_mode, "private"); + *ret_description = NULL; + + return 0; +} + +static int assess_protect_proc( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + 
uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + if (streq_ptr(info->protect_proc, "noaccess")) + *ret_badness = 1; + else if (STRPTR_IN_SET(info->protect_proc, "invisible", "ptraceable")) + *ret_badness = 0; + else + *ret_badness = 3; + + *ret_description = NULL; + + return 0; +} + +static int assess_proc_subset( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + *ret_badness = !streq_ptr(info->proc_subset, "pid"); + *ret_description = NULL; + + return 0; +} + +static int assess_notify_access( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + *ret_badness = streq_ptr(info->notify_access, "all"); + *ret_description = NULL; + + return 0; +} + +static int assess_remove_ipc( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + if (security_info_runs_privileged(info)) + *ret_badness = UINT64_MAX; + else + *ret_badness = !info->remove_ipc; + + *ret_description = NULL; + return 0; +} + +static int assess_supplementary_groups( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + if (security_info_runs_privileged(info)) + *ret_badness = UINT64_MAX; + else + *ret_badness = !strv_isempty(info->supplementary_groups); + + *ret_description = NULL; + return 0; +} + +static int assess_restrict_namespaces( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); 
+ assert(ret_description); + + *ret_badness = !!(info->restrict_namespaces & a->parameter); + *ret_description = NULL; + + return 0; +} + +#if HAVE_SECCOMP + +static int assess_system_call_architectures( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + char *d; + uint64_t b; + + assert(ret_badness); + assert(ret_description); + + if (set_isempty(info->system_call_architectures)) { + b = 10; + d = strdup("Service may execute system calls with all ABIs"); + } else if (set_contains(info->system_call_architectures, "native") && + set_size(info->system_call_architectures) == 1) { + b = 0; + d = strdup("Service may execute system calls only with native ABI"); + } else { + b = 8; + d = strdup("Service may execute system calls with multiple ABIs"); + } + + if (!d) + return log_oom(); + + *ret_badness = b; + *ret_description = d; + + return 0; +} + +static bool syscall_names_in_filter(Set *s, bool allow_list, const SyscallFilterSet *f, const char **ret_offending_syscall) { + NULSTR_FOREACH(syscall, f->value) { + if (syscall[0] == '@') { + const SyscallFilterSet *g; + + assert_se(g = syscall_filter_set_find(syscall)); + if (syscall_names_in_filter(s, allow_list, g, ret_offending_syscall)) + return true; /* bad! */ + + continue; + } + + /* Let's see if the system call actually exists on this platform, before complaining */ + if (seccomp_syscall_resolve_name(syscall) < 0) + continue; + + if (set_contains(s, syscall) == allow_list) { + log_debug("Offending syscall filter item: %s", syscall); + if (ret_offending_syscall) + *ret_offending_syscall = syscall; + return true; /* bad! 
*/ + } + } + + /* The out-parameter is treated as optional where an offender is reported, so guard this store as well */ + if (ret_offending_syscall) + *ret_offending_syscall = NULL; + return false; +} + +static int assess_system_call_filter( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(a); + assert(info); + assert(ret_badness); + assert(ret_description); + + assert(a->parameter < _SYSCALL_FILTER_SET_MAX); + const SyscallFilterSet *f = syscall_filter_sets + a->parameter; + + _cleanup_free_ char *d = NULL; + uint64_t b; + int r; + + if (!info->system_call_filter_allow_list && set_isempty(info->system_call_filter)) { + r = free_and_strdup(&d, "Service does not filter system calls"); + b = 10; + } else { + bool bad; + const char *offender = NULL; + + log_debug("Analyzing system call filter, checking against: %s", f->name); + bad = syscall_names_in_filter(info->system_call_filter, info->system_call_filter_allow_list, f, &offender); + log_debug("Result: %s", bad ? "bad" : "good"); + + if (info->system_call_filter_allow_list) { + if (bad) { + r = asprintf(&d, "System call allow list defined for service, and %s is included " + "(e.g. %s is allowed)", + f->name, offender); + b = 9; + } else { + r = asprintf(&d, "System call allow list defined for service, and %s is not included", + f->name); + b = 0; + } + } else { + if (bad) { + r = asprintf(&d, "System call deny list defined for service, and %s is not included " + "(e.g.
%s is allowed)", + f->name, offender); + b = 10; + } else { + r = asprintf(&d, "System call deny list defined for service, and %s is included", + f->name); + b = 0; + } + } + } + if (r < 0) + return log_oom(); + + *ret_badness = b; + *ret_description = TAKE_PTR(d); + + return 0; +} + +#endif + +static int assess_ip_address_allow( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + char *d = NULL; + uint64_t b; + + assert(info); + assert(ret_badness); + assert(ret_description); + + if (info->ip_filters_custom_ingress || info->ip_filters_custom_egress) { + d = strdup("Service defines custom ingress/egress IP filters with BPF programs"); + b = 0; + } else if (!info->ip_address_deny_all) { + d = strdup("Service does not define an IP address allow list"); + b = 10; + } else if (info->ip_address_allow_other) { + d = strdup("Service defines IP address allow list with non-localhost entries"); + b = 5; + } else if (info->ip_address_allow_localhost) { + d = strdup("Service defines IP address allow list with only localhost entries"); + b = 2; + } else { + d = strdup("Service blocks all IP address ranges"); + b = 0; + } + + if (!d) + return log_oom(); + + *ret_badness = b; + *ret_description = d; + + return 0; +} + +static int assess_device_allow( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + char *d = NULL; + uint64_t b; + + assert(info); + assert(ret_badness); + assert(ret_description); + + if (STRPTR_IN_SET(info->device_policy, "strict", "closed")) { + + if (!strv_isempty(info->device_allow)) { + _cleanup_free_ char *join = NULL; + + join = strv_join(info->device_allow, " "); + if (!join) + return log_oom(); + + d = strjoin("Service has a device ACL with some special devices: ", join); + b = 5; + } else { + d = strdup("Service has a minimal device ACL"); + b = 0; + } + } else { + d = 
strdup("Service has no device ACL"); + b = 10; + } + + if (!d) + return log_oom(); + + *ret_badness = b; + *ret_description = d; + + return 0; +} + +static int assess_ambient_capabilities( + const struct security_assessor *a, + const SecurityInfo *info, + const void *data, + uint64_t *ret_badness, + char **ret_description) { + + assert(ret_badness); + assert(ret_description); + + *ret_badness = info->ambient_capabilities != 0; + *ret_description = NULL; + + return 0; +} + +static const struct security_assessor security_assessor_table[] = { + { + .id = "User=/DynamicUser=", + .json_field = "UserOrDynamicUser", + .description_bad = "Service runs as root user", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#User=", + .weight = 2000, + .range = 10, + .assess = assess_user, + }, + { + .id = "SupplementaryGroups=", + .json_field = "SupplementaryGroups", + .description_good = "Service has no supplementary groups", + .description_bad = "Service runs with supplementary groups", + .description_na = "Service runs as root, option does not matter", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SupplementaryGroups=", + .weight = 200, + .range = 1, + .assess = assess_supplementary_groups, + }, + { + .id = "PrivateDevices=", + .json_field = "PrivateDevices", + .description_good = "Service has no access to hardware devices", + .description_bad = "Service potentially has access to hardware devices", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#PrivateDevices=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, private_devices), + }, + { + .id = "PrivateMounts=", + .json_field = "PrivateMounts", + .description_good = "Service cannot install system mounts", + .description_bad = "Service may install system mounts", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#PrivateMounts=", + .weight = 1000, + .range = 1, + .assess = 
assess_bool, + .offset = offsetof(SecurityInfo, private_mounts), + }, + { + .id = "PrivateNetwork=", + .json_field = "PrivateNetwork", + .description_good = "Service has no access to the host's network", + .description_bad = "Service has access to the host's network", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#PrivateNetwork=", + .weight = 2500, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, private_network), + }, + { + .id = "PrivateTmp=", + .json_field = "PrivateTmp", + .description_good = "Service has no access to other software's temporary files", + .description_bad = "Service has access to other software's temporary files", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#PrivateTmp=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, private_tmp), + .default_dependencies_only = true, + }, + { + .id = "PrivateUsers=", + .json_field = "PrivateUsers", + .description_good = "Service does not have access to other users", + .description_bad = "Service has access to other users", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#PrivateUsers=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, private_users), + }, + { + .id = "ProtectControlGroups=", + .json_field = "ProtectControlGroups", + .description_good = "Service cannot modify the control group file system", + .description_bad = "Service may modify the control group file system", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectControlGroups=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, protect_control_groups), + }, + { + .id = "ProtectKernelModules=", + .json_field = "ProtectKernelModules", + .description_good = "Service cannot load or read kernel modules", + .description_bad = "Service may load or read kernel modules", + .url = 
"https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectKernelModules=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, protect_kernel_modules), + }, + { + .id = "ProtectKernelTunables=", + .json_field = "ProtectKernelTunables", + .description_good = "Service cannot alter kernel tunables (/proc/sys, …)", + .description_bad = "Service may alter kernel tunables", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectKernelTunables=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, protect_kernel_tunables), + }, + { + .id = "ProtectKernelLogs=", + .json_field = "ProtectKernelLogs", + .description_good = "Service cannot read from or write to the kernel log ring buffer", + .description_bad = "Service may read from or write to the kernel log ring buffer", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectKernelLogs=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, protect_kernel_logs), + }, + { + .id = "ProtectClock=", + .json_field = "ProtectClock", + .description_good = "Service cannot write to the hardware clock or system clock", + .description_bad = "Service may write to the hardware clock or system clock", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectClock=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, protect_clock), + }, + { + .id = "ProtectHome=", + .json_field = "ProtectHome", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectHome=", + .weight = 1000, + .range = 10, + .assess = assess_protect_home, + .default_dependencies_only = true, + }, + { + .id = "ProtectHostname=", + .json_field = "ProtectHostname", + .description_good = "Service cannot change system host/domainname", + .description_bad = "Service may change system 
host/domainname", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectHostname=", + .weight = 50, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, protect_hostname), + }, + { + .id = "ProtectSystem=", + .json_field = "ProtectSystem", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectSystem=", + .weight = 1000, + .range = 10, + .assess = assess_protect_system, + .default_dependencies_only = true, + }, + { + .id = "RootDirectory=/RootImage=", + .json_field = "RootDirectoryOrRootImage", + .description_good = "Service has its own root directory/image", + .description_bad = "Service runs within the host's root directory", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RootDirectory=", + .weight = 200, + .range = 1, + .assess = assess_root_directory, + .default_dependencies_only = true, + }, + { + .id = "LockPersonality=", + .json_field = "LockPersonality", + .description_good = "Service cannot change ABI personality", + .description_bad = "Service may change ABI personality", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#LockPersonality=", + .weight = 100, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, lock_personality), + }, + { + .id = "MemoryDenyWriteExecute=", + .json_field = "MemoryDenyWriteExecute", + .description_good = "Service cannot create writable executable memory mappings", + .description_bad = "Service may create writable executable memory mappings", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#MemoryDenyWriteExecute=", + .weight = 100, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, memory_deny_write_execute), + }, + { + .id = "NoNewPrivileges=", + .json_field = "NoNewPrivileges", + .description_good = "Service processes cannot acquire new privileges", + .description_bad = "Service processes may acquire new privileges", + 
.url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#NoNewPrivileges=", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, no_new_privileges), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_ADMIN", + .json_field = "CapabilityBoundingSet_CAP_SYS_ADMIN", + .description_good = "Service has no administrator privileges", + .description_bad = "Service has administrator privileges", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = UINT64_C(1) << CAP_SYS_ADMIN, + }, + { + .id = "CapabilityBoundingSet=~CAP_SET(UID|GID|PCAP)", + .json_field = "CapabilityBoundingSet_CAP_SET_UID_GID_PCAP", + .description_good = "Service cannot change UID/GID identities/capabilities", + .description_bad = "Service may change UID/GID identities/capabilities", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SETUID)| + (UINT64_C(1) << CAP_SETGID)| + (UINT64_C(1) << CAP_SETPCAP), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_PTRACE", + .json_field = "CapabilityBoundingSet_CAP_SYS_PTRACE", + .description_good = "Service has no ptrace() debugging abilities", + .description_bad = "Service has ptrace() debugging abilities", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_PTRACE), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_TIME", + .json_field = "CapabilityBoundingSet_CAP_SYS_TIME", + .description_good = "Service processes cannot change the system clock", + .description_bad = "Service processes may change the system clock", + .url = 
"https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1000, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = UINT64_C(1) << CAP_SYS_TIME, + }, + { + .id = "CapabilityBoundingSet=~CAP_NET_ADMIN", + .json_field = "CapabilityBoundingSet_CAP_NET_ADMIN", + .description_good = "Service has no network configuration privileges", + .description_bad = "Service has network configuration privileges", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1000, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_NET_ADMIN), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_RAWIO", + .json_field = "CapabilityBoundingSet_CAP_SYS_RAWIO", + .description_good = "Service has no raw I/O access", + .description_bad = "Service has raw I/O access", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1000, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_RAWIO), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_MODULE", + .json_field = "CapabilityBoundingSet_CAP_SYS_MODULE", + .description_good = "Service cannot load kernel modules", + .description_bad = "Service may load kernel modules", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1000, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_MODULE), + }, + { + .id = "CapabilityBoundingSet=~CAP_AUDIT_*", + .json_field = "CapabilityBoundingSet_CAP_AUDIT", + .description_good = "Service has no audit subsystem access", + .description_bad = "Service has audit subsystem access", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 500, + .range = 1, + .assess = assess_capability_bounding_set, + 
.parameter = (UINT64_C(1) << CAP_AUDIT_CONTROL) | + (UINT64_C(1) << CAP_AUDIT_READ) | + (UINT64_C(1) << CAP_AUDIT_WRITE), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYSLOG", + .json_field = "CapabilityBoundingSet_CAP_SYSLOG", + .description_good = "Service has no access to kernel logging", + .description_bad = "Service has access to kernel logging", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYSLOG), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_(NICE|RESOURCE)", + .json_field = "CapabilityBoundingSet_CAP_SYS_NICE_RESOURCE", + .description_good = "Service has no privileges to change resource use parameters", + .description_bad = "Service has privileges to change resource use parameters", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_NICE) | + (UINT64_C(1) << CAP_SYS_RESOURCE), + }, + { + .id = "CapabilityBoundingSet=~CAP_MKNOD", + .json_field = "CapabilityBoundingSet_CAP_MKNOD", + .description_good = "Service cannot create device nodes", + .description_bad = "Service may create device nodes", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_MKNOD), + }, + { + .id = "CapabilityBoundingSet=~CAP_(CHOWN|FSETID|SETFCAP)", + .json_field = "CapabilityBoundingSet_CAP_CHOWN_FSETID_SETFCAP", + .description_good = "Service cannot change file ownership/access mode/capabilities", + .description_bad = "Service may change file ownership/access mode/capabilities unrestricted", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + 
.weight = 1000, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_CHOWN) | + (UINT64_C(1) << CAP_FSETID) | + (UINT64_C(1) << CAP_SETFCAP), + }, + { + .id = "CapabilityBoundingSet=~CAP_(DAC_*|FOWNER|IPC_OWNER)", + .json_field = "CapabilityBoundingSet_CAP_DAC_FOWNER_IPC_OWNER", + .description_good = "Service cannot override UNIX file/IPC permission checks", + .description_bad = "Service may override UNIX file/IPC permission checks", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 1000, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_DAC_OVERRIDE) | + (UINT64_C(1) << CAP_DAC_READ_SEARCH) | + (UINT64_C(1) << CAP_FOWNER) | + (UINT64_C(1) << CAP_IPC_OWNER), + }, + { + .id = "CapabilityBoundingSet=~CAP_KILL", + .json_field = "CapabilityBoundingSet_CAP_KILL", + .description_good = "Service cannot send UNIX signals to arbitrary processes", + .description_bad = "Service may send UNIX signals to arbitrary processes", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_KILL), + }, + { + .id = "CapabilityBoundingSet=~CAP_NET_(BIND_SERVICE|BROADCAST|RAW)", + /* JSON key is a plain identifier like the sibling entries' json_field values */ + .json_field = "CapabilityBoundingSet_CAP_NET_BIND_SERVICE_BROADCAST_RAW", + .description_good = "Service has no elevated networking privileges", + .description_bad = "Service has elevated networking privileges", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 500, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_NET_BIND_SERVICE) | + (UINT64_C(1) << CAP_NET_BROADCAST) | + (UINT64_C(1) << CAP_NET_RAW), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_BOOT", + .json_field =
"CapabilityBoundingSet_CAP_SYS_BOOT", + .description_good = "Service cannot issue reboot()", + .description_bad = "Service may issue reboot()", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 100, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_BOOT), + }, + { + .id = "CapabilityBoundingSet=~CAP_MAC_*", + .json_field = "CapabilityBoundingSet_CAP_MAC", + .description_good = "Service cannot adjust SMACK MAC", + .description_bad = "Service may adjust SMACK MAC", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 100, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_MAC_ADMIN)| + (UINT64_C(1) << CAP_MAC_OVERRIDE), + }, + { + .id = "CapabilityBoundingSet=~CAP_LINUX_IMMUTABLE", + .json_field = "CapabilityBoundingSet_CAP_LINUX_IMMUTABLE", + .description_good = "Service cannot mark files immutable", + .description_bad = "Service may mark files immutable", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 75, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_LINUX_IMMUTABLE), + }, + { + .id = "CapabilityBoundingSet=~CAP_IPC_LOCK", + .json_field = "CapabilityBoundingSet_CAP_IPC_LOCK", + .description_good = "Service cannot lock memory into RAM", + .description_bad = "Service may lock memory into RAM", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 50, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_IPC_LOCK), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_CHROOT", + .json_field = "CapabilityBoundingSet_CAP_SYS_CHROOT", + .description_good = "Service cannot issue chroot()", + .description_bad = "Service may issue chroot()", + .url = 
"https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 50, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_CHROOT), + }, + { + .id = "CapabilityBoundingSet=~CAP_BLOCK_SUSPEND", + .json_field = "CapabilityBoundingSet_CAP_BLOCK_SUSPEND", + .description_good = "Service cannot establish wake locks", + .description_bad = "Service may establish wake locks", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 25, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_BLOCK_SUSPEND), + }, + { + .id = "CapabilityBoundingSet=~CAP_WAKE_ALARM", + .json_field = "CapabilityBoundingSet_CAP_WAKE_ALARM", + .description_good = "Service cannot program timers that wake up the system", + .description_bad = "Service may program timers that wake up the system", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 25, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_WAKE_ALARM), + }, + { + .id = "CapabilityBoundingSet=~CAP_LEASE", + .json_field = "CapabilityBoundingSet_CAP_LEASE", + .description_good = "Service cannot create file leases", + .description_bad = "Service may create file leases", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 25, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_LEASE), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_TTY_CONFIG", + .json_field = "CapabilityBoundingSet_CAP_SYS_TTY_CONFIG", + .description_good = "Service cannot issue vhangup()", + .description_bad = "Service may issue vhangup()", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 25, + .range = 1, + .assess = 
assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_TTY_CONFIG), + }, + { + .id = "CapabilityBoundingSet=~CAP_SYS_PACCT", + .json_field = "CapabilityBoundingSet_CAP_SYS_PACCT", + .description_good = "Service cannot use acct()", + .description_bad = "Service may use acct()", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 25, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_SYS_PACCT), + }, + { + /* Rated bad when CAP_BPF is still in the bounding set, i.e. when the service may load BPF programs */ + .id = "CapabilityBoundingSet=~CAP_BPF", + .json_field = "CapabilityBoundingSet_CAP_BPF", + .description_good = "Service may not load BPF programs", + .description_bad = "Service may load BPF programs", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#CapabilityBoundingSet=", + .weight = 25, + .range = 1, + .assess = assess_capability_bounding_set, + .parameter = (UINT64_C(1) << CAP_BPF), + }, + { + .id = "UMask=", + .json_field = "UMask", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#UMask=", + .weight = 100, + .range = 10, + .assess = assess_umask, + }, + { + .id = "KeyringMode=", + .json_field = "KeyringMode", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#KeyringMode=", + .description_good = "Service doesn't share key material with other services", + .description_bad = "Service shares key material with other service", + .weight = 1000, + .range = 1, + .assess = assess_keyring_mode, + }, + { + .id = "ProtectProc=", + .json_field = "ProtectProc", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectProc=", + .description_good = "Service has restricted access to process tree (/proc hidepid=)", + .description_bad = "Service has full access to process tree (/proc hidepid=)", + .weight = 1000, + .range = 3, + .assess = assess_protect_proc, + }, + { + .id = "ProcSubset=", + .json_field = "ProcSubset", + .url =
"https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProcSubset=", + .description_good = "Service has no access to non-process /proc files (/proc subset=)", + .description_bad = "Service has full access to non-process /proc files (/proc subset=)", + .weight = 10, + .range = 1, + .assess = assess_proc_subset, + }, + { + .id = "NotifyAccess=", + .json_field = "NotifyAccess", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#NotifyAccess=", + .description_good = "Service child processes cannot alter service state", + .description_bad = "Service child processes may alter service state", + .weight = 1000, + .range = 1, + .assess = assess_notify_access, + }, + { + .id = "RemoveIPC=", + .json_field = "RemoveIPC", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RemoveIPC=", + .description_good = "Service user cannot leave SysV IPC objects around", + .description_bad = "Service user may leave SysV IPC objects around", + .description_na = "Service runs as root, option does not apply", + .weight = 100, + .range = 1, + .assess = assess_remove_ipc, + .offset = offsetof(SecurityInfo, remove_ipc), + }, + { + .id = "Delegate=", + .json_field = "Delegate", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#Delegate=", + .description_good = "Service does not maintain its own delegated control group subtree", + .description_bad = "Service maintains its own delegated control group subtree", + .weight = 100, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, delegate), + .parameter = true, /* invert! 
*/ + }, + { + .id = "RestrictRealtime=", + .json_field = "RestrictRealtime", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictRealtime=", + .description_good = "Service realtime scheduling access is restricted", + .description_bad = "Service may acquire realtime scheduling", + .weight = 500, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, restrict_realtime), + }, + { + .id = "RestrictSUIDSGID=", + .json_field = "RestrictSUIDSGID", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictSUIDSGID=", + .description_good = "SUID/SGID file creation by service is restricted", + .description_bad = "Service may create SUID/SGID files", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, restrict_suid_sgid), + }, + { + .id = "RestrictNamespaces=~user", + .json_field = "RestrictNamespaces_user", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create user namespaces", + .description_bad = "Service may create user namespaces", + .weight = 1500, + .range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWUSER, + }, + { + .id = "RestrictNamespaces=~mnt", + .json_field = "RestrictNamespaces_mnt", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create file system namespaces", + .description_bad = "Service may create file system namespaces", + .weight = 500, + .range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWNS, + }, + { + .id = "RestrictNamespaces=~ipc", + .json_field = "RestrictNamespaces_ipc", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create IPC namespaces", + .description_bad = "Service may create IPC namespaces", + .weight = 500, + 
.range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWIPC, + }, + { + .id = "RestrictNamespaces=~pid", + .json_field = "RestrictNamespaces_pid", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create process namespaces", + .description_bad = "Service may create process namespaces", + .weight = 500, + .range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWPID, + }, + { + .id = "RestrictNamespaces=~cgroup", + .json_field = "RestrictNamespaces_cgroup", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create cgroup namespaces", + .description_bad = "Service may create cgroup namespaces", + .weight = 500, + .range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWCGROUP, + }, + { + .id = "RestrictNamespaces=~net", + .json_field = "RestrictNamespaces_net", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create network namespaces", + .description_bad = "Service may create network namespaces", + .weight = 500, + .range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWNET, + }, + { + .id = "RestrictNamespaces=~uts", + .json_field = "RestrictNamespaces_uts", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictNamespaces=", + .description_good = "Service cannot create hostname namespaces", + .description_bad = "Service may create hostname namespaces", + .weight = 100, + .range = 1, + .assess = assess_restrict_namespaces, + .parameter = CLONE_NEWUTS, + }, + { + .id = "RestrictAddressFamilies=~AF_(INET|INET6)", + .json_field = "RestrictAddressFamilies_AF_INET_INET6", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictAddressFamilies=", + .description_good = "Service 
cannot allocate Internet sockets", + .description_bad = "Service may allocate Internet sockets", + .weight = 1500, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, restrict_address_family_inet), + }, + { + .id = "RestrictAddressFamilies=~AF_UNIX", + .json_field = "RestrictAddressFamilies_AF_UNIX", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictAddressFamilies=", + .description_good = "Service cannot allocate local sockets", + .description_bad = "Service may allocate local sockets", + .weight = 25, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, restrict_address_family_unix), + }, + { + .id = "RestrictAddressFamilies=~AF_NETLINK", + .json_field = "RestrictAddressFamilies_AF_NETLINK", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictAddressFamilies=", + .description_good = "Service cannot allocate netlink sockets", + .description_bad = "Service may allocate netlink sockets", + .weight = 200, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, restrict_address_family_netlink), + }, + { + .id = "RestrictAddressFamilies=~AF_PACKET", + .json_field = "RestrictAddressFamilies_AF_PACKET", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictAddressFamilies=", + .description_good = "Service cannot allocate packet sockets", + .description_bad = "Service may allocate packet sockets", + .weight = 1000, + .range = 1, + .assess = assess_bool, + .offset = offsetof(SecurityInfo, restrict_address_family_packet), + }, + { + .id = "RestrictAddressFamilies=~…", + .json_field = "RestrictAddressFamilies_OTHER", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#RestrictAddressFamilies=", + .description_good = "Service cannot allocate exotic sockets", + .description_bad = "Service may allocate exotic sockets", + .weight = 1250, + .range = 1, + .assess = assess_bool, + .offset = 
offsetof(SecurityInfo, restrict_address_family_other), + }, +#if HAVE_SECCOMP + { + .id = "SystemCallArchitectures=", + .json_field = "SystemCallArchitectures", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallArchitectures=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_architectures, + }, + { + .id = "SystemCallFilter=~@swap", + .json_field = "SystemCallFilter_swap", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_SWAP, + }, + { + .id = "SystemCallFilter=~@obsolete", + .json_field = "SystemCallFilter_obsolete", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 250, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_OBSOLETE, + }, + { + .id = "SystemCallFilter=~@clock", + .json_field = "SystemCallFilter_clock", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_CLOCK, + }, + { + .id = "SystemCallFilter=~@cpu-emulation", + .json_field = "SystemCallFilter_cpu_emulation", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 250, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_CPU_EMULATION, + }, + { + .id = "SystemCallFilter=~@debug", + .json_field = "SystemCallFilter_debug", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_DEBUG, + }, + { + .id = "SystemCallFilter=~@mount", + .json_field = "SystemCallFilter_mount", + .url = 
"https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_MOUNT, + }, + { + .id = "SystemCallFilter=~@module", + .json_field = "SystemCallFilter_module", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_MODULE, + }, + { + .id = "SystemCallFilter=~@raw-io", + .json_field = "SystemCallFilter_raw_io", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_RAW_IO, + }, + { + .id = "SystemCallFilter=~@reboot", + .json_field = "SystemCallFilter_reboot", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 1000, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_REBOOT, + }, + { + .id = "SystemCallFilter=~@privileged", + .json_field = "SystemCallFilter_privileged", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 700, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_PRIVILEGED, + }, + { + .id = "SystemCallFilter=~@resources", + .json_field = "SystemCallFilter_resources", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=", + .weight = 700, + .range = 10, + .assess = assess_system_call_filter, + .parameter = SYSCALL_FILTER_SET_RESOURCES, + }, +#endif + { + .id = "IPAddressDeny=", + .json_field = "IPAddressDeny", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#IPAddressDeny=", + .weight = 1000, + .range = 10, + .assess = assess_ip_address_allow, + }, + { + .id = "DeviceAllow=", + 
.json_field = "DeviceAllow", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#DeviceAllow=", + .weight = 1000, + .range = 10, + .assess = assess_device_allow, + }, + { + .id = "AmbientCapabilities=", + .json_field = "AmbientCapabilities", + .url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#AmbientCapabilities=", + .description_good = "Service process does not receive ambient capabilities", + .description_bad = "Service process receives ambient capabilities", + .weight = 500, + .range = 1, + .assess = assess_ambient_capabilities, + }, +}; + +static JsonVariant* security_assessor_find_in_policy(const struct security_assessor *a, JsonVariant *policy, const char *name) { + JsonVariant *item; + assert(a); + + if (!policy) + return NULL; + if (!json_variant_is_object(policy)) { + log_debug("Specified policy is not a JSON object, ignoring."); + return NULL; + } + + item = json_variant_by_key(policy, a->json_field); + if (!item) + return NULL; + if (!json_variant_is_object(item)) { + log_debug("Item for '%s' in policy JSON object is not an object, ignoring.", a->id); + return NULL; + } + + return name ? 
json_variant_by_key(item, name) : item; +} + +static uint64_t access_weight(const struct security_assessor *a, JsonVariant *policy) { + JsonVariant *val; + + assert(a); + + val = security_assessor_find_in_policy(a, policy, "weight"); + if (val) { + if (json_variant_is_unsigned(val)) + return json_variant_unsigned(val); + log_debug("JSON field 'weight' of policy for %s is not an unsigned integer, ignoring.", a->id); + } + + return a->weight; +} + +static uint64_t access_range(const struct security_assessor *a, JsonVariant *policy) { + JsonVariant *val; + + assert(a); + + val = security_assessor_find_in_policy(a, policy, "range"); + if (val) { + if (json_variant_is_unsigned(val)) + return json_variant_unsigned(val); + log_debug("JSON field 'range' of policy for %s is not an unsigned integer, ignoring.", a->id); + } + + return a->range; +} + +static const char *access_description_na(const struct security_assessor *a, JsonVariant *policy) { + JsonVariant *val; + + assert(a); + + val = security_assessor_find_in_policy(a, policy, "description_na"); + if (val) { + if (json_variant_is_string(val)) + return json_variant_string(val); + log_debug("JSON field 'description_na' of policy for %s is not a string, ignoring.", a->id); + } + + return a->description_na; +} + +static const char *access_description_good(const struct security_assessor *a, JsonVariant *policy) { + JsonVariant *val; + + assert(a); + + val = security_assessor_find_in_policy(a, policy, "description_good"); + if (val) { + if (json_variant_is_string(val)) + return json_variant_string(val); + log_debug("JSON field 'description_good' of policy for %s is not a string, ignoring.", a->id); + } + + return a->description_good; +} + +static const char *access_description_bad(const struct security_assessor *a, JsonVariant *policy) { + JsonVariant *val; + + assert(a); + + val = security_assessor_find_in_policy(a, policy, "description_bad"); + if (val) { + if (json_variant_is_string(val)) + return 
json_variant_string(val); + log_debug("JSON field 'description_bad' of policy for %s is not a string, ignoring.", a->id); + } + + return a->description_bad; +} + +static int assess(const SecurityInfo *info, + Table *overview_table, + AnalyzeSecurityFlags flags, + unsigned threshold, + JsonVariant *policy, + PagerFlags pager_flags, + JsonFormatFlags json_format_flags) { + + static const struct { + uint64_t exposure; + const char *name; + const char *color; + SpecialGlyph smiley; + } badness_table[] = { + { 100, "DANGEROUS", ANSI_HIGHLIGHT_RED, SPECIAL_GLYPH_DEPRESSED_SMILEY }, + { 90, "UNSAFE", ANSI_HIGHLIGHT_RED, SPECIAL_GLYPH_UNHAPPY_SMILEY }, + { 75, "EXPOSED", ANSI_HIGHLIGHT_YELLOW, SPECIAL_GLYPH_SLIGHTLY_UNHAPPY_SMILEY }, + { 50, "MEDIUM", NULL, SPECIAL_GLYPH_NEUTRAL_SMILEY }, + { 10, "OK", ANSI_HIGHLIGHT_GREEN, SPECIAL_GLYPH_SLIGHTLY_HAPPY_SMILEY }, + { 1, "SAFE", ANSI_HIGHLIGHT_GREEN, SPECIAL_GLYPH_HAPPY_SMILEY }, + { 0, "PERFECT", ANSI_HIGHLIGHT_GREEN, SPECIAL_GLYPH_ECSTATIC_SMILEY }, + }; + + uint64_t badness_sum = 0, weight_sum = 0, exposure; + _cleanup_(table_unrefp) Table *details_table = NULL; + size_t i; + int r; + + if (!FLAGS_SET(flags, ANALYZE_SECURITY_SHORT)) { + details_table = table_new(" ", "name", "json_field", "description", "weight", "badness", "range", "exposure"); + if (!details_table) + return log_oom(); + + r = table_set_json_field_name(details_table, 0, "set"); + if (r < 0) + return log_error_errno(r, "Failed to set JSON field name of column 0: %m"); + + (void) table_set_sort(details_table, (size_t) 3, (size_t) 1); + (void) table_set_reverse(details_table, 3, true); + + if (getenv_bool("SYSTEMD_ANALYZE_DEBUG") <= 0) + (void) table_set_display(details_table, (size_t) 0, (size_t) 1, (size_t) 2, (size_t) 3, (size_t) 7); + } + + for (i = 0; i < ELEMENTSOF(security_assessor_table); i++) { + const struct security_assessor *a = security_assessor_table + i; + _cleanup_free_ char *d = NULL; + uint64_t badness; + void *data; + uint64_t weight = 
access_weight(a, policy); + uint64_t range = access_range(a, policy); + + data = (uint8_t *) info + a->offset; + + if (a->default_dependencies_only && !info->default_dependencies) { + badness = UINT64_MAX; + d = strdup("Service runs in special boot phase, option is not appropriate"); + if (!d) + return log_oom(); + } else if (weight == 0) { + badness = UINT64_MAX; + d = strdup("Option excluded by policy, skipping"); + if (!d) + return log_oom(); + } else { + r = a->assess(a, info, data, &badness, &d); + if (r < 0) + return r; + } + + assert(range > 0); + + if (badness != UINT64_MAX) { + assert(badness <= range); + + badness_sum += DIV_ROUND_UP(badness * weight, range); + weight_sum += weight; + } + + if (details_table) { + const char *description, *color = NULL; + int checkmark; + + if (badness == UINT64_MAX) { + checkmark = -1; + description = access_description_na(a, policy); + color = NULL; + } else if (badness == a->range) { + checkmark = 0; + description = access_description_bad(a, policy); + color = ansi_highlight_red(); + } else if (badness == 0) { + checkmark = 1; + description = access_description_good(a, policy); + color = ansi_highlight_green(); + } else { + checkmark = 0; + description = NULL; + color = ansi_highlight_red(); + } + + if (d) + description = d; + + if (checkmark < 0) { + r = table_add_many(details_table, TABLE_EMPTY); + if (r < 0) + return table_log_add_error(r); + } else { + r = table_add_many(details_table, + TABLE_BOOLEAN_CHECKMARK, checkmark > 0, + TABLE_SET_MINIMUM_WIDTH, 1, + TABLE_SET_MAXIMUM_WIDTH, 1, + TABLE_SET_ELLIPSIZE_PERCENT, 0, + TABLE_SET_COLOR, color); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(details_table, + TABLE_STRING, a->id, TABLE_SET_URL, a->url, + TABLE_STRING, a->json_field, + TABLE_STRING, description, + TABLE_UINT64, weight, TABLE_SET_ALIGN_PERCENT, 100, + TABLE_UINT64, badness, TABLE_SET_ALIGN_PERCENT, 100, + TABLE_UINT64, range, TABLE_SET_ALIGN_PERCENT, 100, + TABLE_EMPTY, 
TABLE_SET_ALIGN_PERCENT, 100); + if (r < 0) + return table_log_add_error(r); + } + } + + assert(weight_sum > 0); + + if (details_table) { + size_t row; + + for (row = 1; row < table_get_rows(details_table); row++) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1 + DECIMAL_STR_MAX(uint64_t) + 1]; + const uint64_t *weight, *badness, *range; + TableCell *cell; + uint64_t x; + + assert_se(weight = table_get_at(details_table, row, 4)); + assert_se(badness = table_get_at(details_table, row, 5)); + assert_se(range = table_get_at(details_table, row, 6)); + + if (*badness == UINT64_MAX || *badness == 0) + continue; + + assert_se(cell = table_get_cell(details_table, row, 7)); + + x = DIV_ROUND_UP(DIV_ROUND_UP(*badness * *weight * 100U, *range), weight_sum); + xsprintf(buf, "%" PRIu64 ".%" PRIu64, x / 10, x % 10); + + r = table_update(details_table, cell, TABLE_STRING, buf); + if (r < 0) + return log_error_errno(r, "Failed to update cell in table: %m"); + } + + if (json_format_flags & JSON_FORMAT_OFF) { + r = table_hide_column_from_display(details_table, (size_t) 2); + if (r < 0) + return log_error_errno(r, "Failed to set columns to display: %m"); + } + + r = table_print_with_pager(details_table, json_format_flags, pager_flags, /* show_header= */true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + } + + exposure = DIV_ROUND_UP(badness_sum * 100U, weight_sum); + + for (i = 0; i < ELEMENTSOF(badness_table); i++) + if (exposure >= badness_table[i].exposure) + break; + + assert(i < ELEMENTSOF(badness_table)); + + if (details_table && (json_format_flags & JSON_FORMAT_OFF)) { + _cleanup_free_ char *clickable = NULL; + const char *name; + + /* If we shall output the details table, also print the brief summary underneath */ + + if (info->fragment_path) { + r = terminal_urlify_path(info->fragment_path, info->id, &clickable); + if (r < 0) + return log_oom(); + + name = clickable; + } else + name = info->id; + + printf("\n%s %sOverall exposure level for %s%s: %s%" 
PRIu64 ".%" PRIu64 " %s%s %s\n", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + ansi_highlight(), + name, + ansi_normal(), + colors_enabled() ? strempty(badness_table[i].color) : "", + exposure / 10, exposure % 10, + badness_table[i].name, + ansi_normal(), + special_glyph(badness_table[i].smiley)); + } + + fflush(stdout); + + if (overview_table) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1 + DECIMAL_STR_MAX(uint64_t) + 1]; + _cleanup_free_ char *url = NULL; + + if (info->fragment_path) { + r = file_url_from_path(info->fragment_path, &url); + if (r < 0) + return log_error_errno(r, "Failed to generate URL from path: %m"); + } + + xsprintf(buf, "%" PRIu64 ".%" PRIu64, exposure / 10, exposure % 10); + + r = table_add_many(overview_table, + TABLE_STRING, info->id, + TABLE_SET_URL, url, + TABLE_STRING, buf, + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_STRING, badness_table[i].name, + TABLE_SET_COLOR, strempty(badness_table[i].color), + TABLE_STRING, special_glyph(badness_table[i].smiley)); + if (r < 0) + return table_log_add_error(r); + } + + /* Return error when overall exposure level is over threshold */ + if (exposure > threshold) + return -EINVAL; + + return 0; +} + +static int property_read_restrict_namespaces( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = ASSERT_PTR(userdata); + int r; + uint64_t namespaces; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_read(m, "t", &namespaces); + if (r < 0) + return r; + + info->restrict_namespaces = (unsigned long long) namespaces; + + return 0; +} + +static int property_read_umask( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = ASSERT_PTR(userdata); + int r; + uint32_t umask; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_read(m, "u", &umask); + if (r < 0) + return r; + + info->_umask = (mode_t) umask; + + return 0; +} + +static 
int property_read_restrict_address_families( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = userdata; + int allow_list, r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "b", &allow_list); + if (r < 0) + return r; + + info->restrict_address_family_inet = + info->restrict_address_family_unix = + info->restrict_address_family_netlink = + info->restrict_address_family_packet = + info->restrict_address_family_other = allow_list; + + r = sd_bus_message_enter_container(m, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *name; + + r = sd_bus_message_read(m, "s", &name); + if (r < 0) + return r; + if (r == 0) + break; + + if (STR_IN_SET(name, "AF_INET", "AF_INET6")) + info->restrict_address_family_inet = !allow_list; + else if (streq(name, "AF_UNIX")) + info->restrict_address_family_unix = !allow_list; + else if (streq(name, "AF_NETLINK")) + info->restrict_address_family_netlink = !allow_list; + else if (streq(name, "AF_PACKET")) + info->restrict_address_family_packet = !allow_list; + else + info->restrict_address_family_other = !allow_list; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return sd_bus_message_exit_container(m); +} + +static int property_read_syscall_archs( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *name; + + r = sd_bus_message_read(m, "s", &name); + if (r < 0) + return r; + if (r == 0) + break; + + r = set_put_strdup(&info->system_call_architectures, name); + if (r < 0) + return r; + } + + return sd_bus_message_exit_container(m); +} + +static int 
property_read_system_call_filter( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = userdata; + int allow_list, r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'r', "bas"); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "b", &allow_list); + if (r < 0) + return r; + + info->system_call_filter_allow_list = allow_list; + + r = sd_bus_message_enter_container(m, 'a', "s"); + if (r < 0) + return r; + + for (;;) { + const char *name; + + r = sd_bus_message_read(m, "s", &name); + if (r < 0) + return r; + if (r == 0) + break; + + /* ignore errno or action after colon */ + r = set_put_strndup(&info->system_call_filter, name, strchrnul(name, ':') - name); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return sd_bus_message_exit_container(m); +} + +static int property_read_ip_address_allow( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = userdata; + bool deny_ipv4 = false, deny_ipv6 = false; + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', "(iayu)"); + if (r < 0) + return r; + + for (;;) { + const void *data; + size_t size; + int32_t family; + uint32_t prefixlen; + + r = sd_bus_message_enter_container(m, 'r', "iayu"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(m, "i", &family); + if (r < 0) + return r; + + r = sd_bus_message_read_array(m, 'y', &data, &size); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "u", &prefixlen); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + if (streq(member, "IPAddressAllow")) { + union in_addr_union u; + + if (family == AF_INET && size == 4 && prefixlen == 8) + memcpy(&u.in, data, size); + else if (family == AF_INET6 && size == 16 && prefixlen == 
128) + memcpy(&u.in6, data, size); + else { + info->ip_address_allow_other = true; + continue; + } + + if (in_addr_is_localhost(family, &u)) + info->ip_address_allow_localhost = true; + else + info->ip_address_allow_other = true; + } else { + assert(streq(member, "IPAddressDeny")); + + if (family == AF_INET && size == 4 && prefixlen == 0) + deny_ipv4 = true; + else if (family == AF_INET6 && size == 16 && prefixlen == 0) + deny_ipv6 = true; + } + } + + info->ip_address_deny_all = deny_ipv4 && deny_ipv6; + + return sd_bus_message_exit_container(m); +} + +static int property_read_ip_filters( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = userdata; + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_read_strv(m, &l); + if (r < 0) + return r; + + if (streq(member, "IPIngressFilterPath")) + info->ip_filters_custom_ingress = !strv_isempty(l); + else if (streq(member, "IPEgressFilterPath")) + info->ip_filters_custom_egress = !strv_isempty(l); + + return 0; +} + +static int property_read_device_allow( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + SecurityInfo *info = userdata; + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', "(ss)"); + if (r < 0) + return r; + + for (;;) { + const char *name, *policy; + + r = sd_bus_message_read(m, "(ss)", &name, &policy); + if (r < 0) + return r; + if (r == 0) + break; + + r = strv_extendf(&info->device_allow, "%s:%s", name, policy); + if (r < 0) + return r; + } + + return sd_bus_message_exit_container(m); +} + +static int acquire_security_info(sd_bus *bus, const char *name, SecurityInfo *info, AnalyzeSecurityFlags flags) { + + static const struct bus_properties_map security_map[] = { + { "AmbientCapabilities", "t", NULL, offsetof(SecurityInfo, ambient_capabilities) }, + { 
"CapabilityBoundingSet", "t", NULL, offsetof(SecurityInfo, capability_bounding_set) }, + { "DefaultDependencies", "b", NULL, offsetof(SecurityInfo, default_dependencies) }, + { "Delegate", "b", NULL, offsetof(SecurityInfo, delegate) }, + { "DeviceAllow", "a(ss)", property_read_device_allow, 0 }, + { "DevicePolicy", "s", NULL, offsetof(SecurityInfo, device_policy) }, + { "DynamicUser", "b", NULL, offsetof(SecurityInfo, dynamic_user) }, + { "FragmentPath", "s", NULL, offsetof(SecurityInfo, fragment_path) }, + { "IPAddressAllow", "a(iayu)", property_read_ip_address_allow, 0 }, + { "IPAddressDeny", "a(iayu)", property_read_ip_address_allow, 0 }, + { "IPIngressFilterPath", "as", property_read_ip_filters, 0 }, + { "IPEgressFilterPath", "as", property_read_ip_filters, 0 }, + { "Id", "s", NULL, offsetof(SecurityInfo, id) }, + { "KeyringMode", "s", NULL, offsetof(SecurityInfo, keyring_mode) }, + { "ProtectProc", "s", NULL, offsetof(SecurityInfo, protect_proc) }, + { "ProcSubset", "s", NULL, offsetof(SecurityInfo, proc_subset) }, + { "LoadState", "s", NULL, offsetof(SecurityInfo, load_state) }, + { "LockPersonality", "b", NULL, offsetof(SecurityInfo, lock_personality) }, + { "MemoryDenyWriteExecute", "b", NULL, offsetof(SecurityInfo, memory_deny_write_execute) }, + { "NoNewPrivileges", "b", NULL, offsetof(SecurityInfo, no_new_privileges) }, + { "NotifyAccess", "s", NULL, offsetof(SecurityInfo, notify_access) }, + { "PrivateDevices", "b", NULL, offsetof(SecurityInfo, private_devices) }, + { "PrivateMounts", "b", NULL, offsetof(SecurityInfo, private_mounts) }, + { "PrivateNetwork", "b", NULL, offsetof(SecurityInfo, private_network) }, + { "PrivateTmp", "b", NULL, offsetof(SecurityInfo, private_tmp) }, + { "PrivateUsers", "b", NULL, offsetof(SecurityInfo, private_users) }, + { "ProtectControlGroups", "b", NULL, offsetof(SecurityInfo, protect_control_groups) }, + { "ProtectHome", "s", NULL, offsetof(SecurityInfo, protect_home) }, + { "ProtectHostname", "b", NULL, 
offsetof(SecurityInfo, protect_hostname) }, + { "ProtectKernelModules", "b", NULL, offsetof(SecurityInfo, protect_kernel_modules) }, + { "ProtectKernelTunables", "b", NULL, offsetof(SecurityInfo, protect_kernel_tunables) }, + { "ProtectKernelLogs", "b", NULL, offsetof(SecurityInfo, protect_kernel_logs) }, + { "ProtectClock", "b", NULL, offsetof(SecurityInfo, protect_clock) }, + { "ProtectSystem", "s", NULL, offsetof(SecurityInfo, protect_system) }, + { "RemoveIPC", "b", NULL, offsetof(SecurityInfo, remove_ipc) }, + { "RestrictAddressFamilies", "(bas)", property_read_restrict_address_families, 0 }, + { "RestrictNamespaces", "t", property_read_restrict_namespaces, 0 }, + { "RestrictRealtime", "b", NULL, offsetof(SecurityInfo, restrict_realtime) }, + { "RestrictSUIDSGID", "b", NULL, offsetof(SecurityInfo, restrict_suid_sgid) }, + { "RootDirectory", "s", NULL, offsetof(SecurityInfo, root_directory) }, + { "RootImage", "s", NULL, offsetof(SecurityInfo, root_image) }, + { "SupplementaryGroups", "as", NULL, offsetof(SecurityInfo, supplementary_groups) }, + { "SystemCallArchitectures", "as", property_read_syscall_archs, 0 }, + { "SystemCallFilter", "(as)", property_read_system_call_filter, 0 }, + { "Type", "s", NULL, offsetof(SecurityInfo, type) }, + { "UMask", "u", property_read_umask, 0 }, + { "User", "s", NULL, offsetof(SecurityInfo, user) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *path = NULL; + int r; + + /* Note: this mangles *info on failure! 
*/ + + assert(bus); + assert(name); + assert(info); + + path = unit_dbus_path_from_name(name); + if (!path) + return log_oom(); + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + path, + security_map, + BUS_MAP_STRDUP | BUS_MAP_BOOLEAN_AS_BOOL, + &error, + NULL, + info); + if (r < 0) + return log_error_errno(r, "Failed to get unit properties: %s", bus_error_message(&error, r)); + + if (!streq_ptr(info->load_state, "loaded")) { + + if (FLAGS_SET(flags, ANALYZE_SECURITY_ONLY_LOADED)) + return -EMEDIUMTYPE; + + if (streq_ptr(info->load_state, "not-found")) + log_error("Unit %s not found, cannot analyze.", name); + else if (streq_ptr(info->load_state, "masked")) + log_error("Unit %s is masked, cannot analyze.", name); + else + log_error("Unit %s not loaded properly, cannot analyze.", name); + + return -EINVAL; + } + + if (FLAGS_SET(flags, ANALYZE_SECURITY_ONLY_LONG_RUNNING) && streq_ptr(info->type, "oneshot")) + return -EMEDIUMTYPE; + + if (info->private_devices || + info->private_tmp || + info->protect_control_groups || + info->protect_kernel_tunables || + info->protect_kernel_modules || + !streq_ptr(info->protect_home, "no") || + !streq_ptr(info->protect_system, "no") || + info->root_image) + info->private_mounts = true; + + if (info->protect_kernel_modules) + info->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + + if (info->protect_kernel_logs) + info->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYSLOG); + + if (info->protect_clock) + info->capability_bounding_set &= ~((UINT64_C(1) << CAP_SYS_TIME) | + (UINT64_C(1) << CAP_WAKE_ALARM)); + + if (info->private_devices) + info->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | + (UINT64_C(1) << CAP_SYS_RAWIO)); + + return 0; +} + +static int analyze_security_one(sd_bus *bus, + const char *name, + Table *overview_table, + AnalyzeSecurityFlags flags, + unsigned threshold, + JsonVariant *policy, + PagerFlags pager_flags, + JsonFormatFlags json_format_flags) { + + 
_cleanup_(security_info_freep) SecurityInfo *info = security_info_new(); + if (!info) + return log_oom(); + + int r; + + assert(bus); + assert(name); + + r = acquire_security_info(bus, name, info, flags); + if (r == -EMEDIUMTYPE) /* Ignore this one because not loaded or Type is oneshot */ + return 0; + if (r < 0) + return r; + + r = assess(info, overview_table, flags, threshold, policy, pager_flags, json_format_flags); + if (r < 0) + return r; + + return 0; +} + +/* Refactoring SecurityInfo so that it can make use of existing struct variables instead of reading from dbus */ +static int get_security_info(Unit *u, ExecContext *c, CGroupContext *g, SecurityInfo **ret_info) { + assert(ret_info); + + _cleanup_(security_info_freep) SecurityInfo *info = security_info_new(); + if (!info) + return log_oom(); + + if (u) { + if (u->id) { + info->id = strdup(u->id); + if (!info->id) + return log_oom(); + } + if (unit_type_to_string(u->type)) { + info->type = strdup(unit_type_to_string(u->type)); + if (!info->type) + return log_oom(); + } + if (unit_load_state_to_string(u->load_state)) { + info->load_state = strdup(unit_load_state_to_string(u->load_state)); + if (!info->load_state) + return log_oom(); + } + if (u->fragment_path) { + info->fragment_path = strdup(u->fragment_path); + if (!info->fragment_path) + return log_oom(); + } + info->default_dependencies = u->default_dependencies; + if (u->type == UNIT_SERVICE && notify_access_to_string(SERVICE(u)->notify_access)) { + info->notify_access = strdup(notify_access_to_string(SERVICE(u)->notify_access)); + if (!info->notify_access) + return log_oom(); + } + } + + if (c) { + info->ambient_capabilities = c->capability_ambient_set; + info->capability_bounding_set = c->capability_bounding_set; + if (c->user) { + info->user = strdup(c->user); + if (!info->user) + return log_oom(); + } + if (c->supplementary_groups) { + info->supplementary_groups = strv_copy(c->supplementary_groups); + if (!info->supplementary_groups) + return 
log_oom(); + } + info->dynamic_user = c->dynamic_user; + if (exec_keyring_mode_to_string(c->keyring_mode)) { + info->keyring_mode = strdup(exec_keyring_mode_to_string(c->keyring_mode)); + if (!info->keyring_mode) + return log_oom(); + } + if (protect_proc_to_string(c->protect_proc)) { + info->protect_proc = strdup(protect_proc_to_string(c->protect_proc)); + if (!info->protect_proc) + return log_oom(); + } + if (proc_subset_to_string(c->proc_subset)) { + info->proc_subset = strdup(proc_subset_to_string(c->proc_subset)); + if (!info->proc_subset) + return log_oom(); + } + info->lock_personality = c->lock_personality; + info->memory_deny_write_execute = c->memory_deny_write_execute; + info->no_new_privileges = c->no_new_privileges; + info->protect_hostname = c->protect_hostname; + info->private_devices = c->private_devices; + info->private_mounts = c->private_mounts; + info->private_network = c->private_network; + info->private_tmp = c->private_tmp; + info->private_users = c->private_users; + info->protect_control_groups = c->protect_control_groups; + info->protect_kernel_modules = c->protect_kernel_modules; + info->protect_kernel_tunables = c->protect_kernel_tunables; + info->protect_kernel_logs = c->protect_kernel_logs; + info->protect_clock = c->protect_clock; + if (protect_home_to_string(c->protect_home)) { + info->protect_home = strdup(protect_home_to_string(c->protect_home)); + if (!info->protect_home) + return log_oom(); + } + if (protect_system_to_string(c->protect_system)) { + info->protect_system = strdup(protect_system_to_string(c->protect_system)); + if (!info->protect_system) + return log_oom(); + } + info->remove_ipc = c->remove_ipc; + info->restrict_address_family_inet = + info->restrict_address_family_unix = + info->restrict_address_family_netlink = + info->restrict_address_family_packet = + info->restrict_address_family_other = + c->address_families_allow_list; + + void *key; + SET_FOREACH(key, c->address_families) { + int family = PTR_TO_INT(key); + 
if (family == 0) + continue; + if (IN_SET(family, AF_INET, AF_INET6)) + info->restrict_address_family_inet = !c->address_families_allow_list; + else if (family == AF_UNIX) + info->restrict_address_family_unix = !c->address_families_allow_list; + else if (family == AF_NETLINK) + info->restrict_address_family_netlink = !c->address_families_allow_list; + else if (family == AF_PACKET) + info->restrict_address_family_packet = !c->address_families_allow_list; + else + info->restrict_address_family_other = !c->address_families_allow_list; + } + + info->restrict_namespaces = c->restrict_namespaces; + info->restrict_realtime = c->restrict_realtime; + info->restrict_suid_sgid = c->restrict_suid_sgid; + if (c->root_directory) { + info->root_directory = strdup(c->root_directory); + if (!info->root_directory) + return log_oom(); + } + if (c->root_image) { + info->root_image = strdup(c->root_image); + if (!info->root_image) + return log_oom(); + } + info->_umask = c->umask; + +#if HAVE_SECCOMP + SET_FOREACH(key, c->syscall_archs) { + const char *name; + + name = seccomp_arch_to_string(PTR_TO_UINT32(key) - 1); + if (!name) + continue; + + if (set_put_strdup(&info->system_call_architectures, name) < 0) + return log_oom(); + } + + info->system_call_filter_allow_list = c->syscall_allow_list; + + void *id, *num; + HASHMAP_FOREACH_KEY(num, id, c->syscall_filter) { + _cleanup_free_ char *name = NULL; + + if (info->system_call_filter_allow_list && PTR_TO_INT(num) >= 0) + continue; + + name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1); + if (!name) + continue; + + if (set_ensure_consume(&info->system_call_filter, &string_hash_ops_free, TAKE_PTR(name)) < 0) + return log_oom(); + } +#endif + } + + if (g) { + info->delegate = g->delegate; + if (cgroup_device_policy_to_string(g->device_policy)) { + info->device_policy = strdup(cgroup_device_policy_to_string(g->device_policy)); + if (!info->device_policy) + return log_oom(); + } + + struct in_addr_prefix *i; + bool 
deny_ipv4 = false, deny_ipv6 = false; + + SET_FOREACH(i, g->ip_address_deny) { + if (i->family == AF_INET && i->prefixlen == 0) + deny_ipv4 = true; + else if (i->family == AF_INET6 && i->prefixlen == 0) + deny_ipv6 = true; + } + info->ip_address_deny_all = deny_ipv4 && deny_ipv6; + + info->ip_address_allow_localhost = info->ip_address_allow_other = false; + SET_FOREACH(i, g->ip_address_allow) { + if (in_addr_is_localhost(i->family, &i->address)) + info->ip_address_allow_localhost = true; + else + info->ip_address_allow_other = true; + } + + info->ip_filters_custom_ingress = !strv_isempty(g->ip_filters_ingress); + info->ip_filters_custom_egress = !strv_isempty(g->ip_filters_egress); + + LIST_FOREACH(device_allow, a, g->device_allow) + if (strv_extendf(&info->device_allow, + "%s:%s", + a->path, + cgroup_device_permissions_to_string(a->permissions)) < 0) + return log_oom(); + } + + *ret_info = TAKE_PTR(info); + + return 0; +} + +static int offline_security_check(Unit *u, + unsigned threshold, + JsonVariant *policy, + PagerFlags pager_flags, + JsonFormatFlags json_format_flags) { + + _cleanup_(table_unrefp) Table *overview_table = NULL; + AnalyzeSecurityFlags flags = 0; + _cleanup_(security_info_freep) SecurityInfo *info = NULL; + int r; + + assert(u); + + if (DEBUG_LOGGING) + unit_dump(u, stdout, "\t"); + + r = get_security_info(u, unit_get_exec_context(u), unit_get_cgroup_context(u), &info); + if (r < 0) + return r; + + return assess(info, overview_table, flags, threshold, policy, pager_flags, json_format_flags); +} + +static int offline_security_checks( + char **filenames, + JsonVariant *policy, + RuntimeScope scope, + bool check_man, + bool run_generators, + unsigned threshold, + const char *root, + const char *profile, + PagerFlags pager_flags, + JsonFormatFlags json_format_flags) { + + const ManagerTestRunFlags flags = + MANAGER_TEST_RUN_MINIMAL | + MANAGER_TEST_RUN_ENV_GENERATORS | + MANAGER_TEST_RUN_IGNORE_DEPENDENCIES | + MANAGER_TEST_DONT_OPEN_EXECUTOR | + 
run_generators * MANAGER_TEST_RUN_GENERATORS; + + _cleanup_(manager_freep) Manager *m = NULL; + Unit *units[strv_length(filenames)]; + int r, k; + size_t count = 0; + + if (strv_isempty(filenames)) + return 0; + + r = verify_set_unit_path(filenames); + if (r < 0) + return log_error_errno(r, "Failed to set unit load path: %m"); + + r = manager_new(scope, flags, &m); + if (r < 0) + return log_error_errno(r, "Failed to initialize manager: %m"); + + log_debug("Starting manager..."); + + r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, root); + if (r < 0) + return r; + + if (profile) { + /* Ensure the temporary directory is in the search path, so that we can add drop-ins. */ + r = strv_extend(&m->lookup_paths.search_path, m->lookup_paths.temporary_dir); + if (r < 0) + return log_oom(); + } + + log_debug("Loading remaining units from the command line..."); + + STRV_FOREACH(filename, filenames) { + _cleanup_free_ char *prepared = NULL; + + log_debug("Handling %s...", *filename); + + k = verify_prepare_filename(*filename, &prepared); + if (k < 0) { + log_warning_errno(k, "Failed to prepare filename %s: %m", *filename); + RET_GATHER(r, k); + continue; + } + + /* When a portable image is analyzed, the profile is what provides a good chunk of + * the security-related settings, but they are obviously not shipped with the image. + * This allows to take them in consideration. 
*/ + if (profile) { + _cleanup_free_ char *unit_name = NULL, *dropin = NULL, *profile_path = NULL; + + r = path_extract_filename(prepared, &unit_name); + if (r < 0) + return log_oom(); + + dropin = strjoin(m->lookup_paths.temporary_dir, "/", unit_name, ".d/profile.conf"); + if (!dropin) + return log_oom(); + (void) mkdir_parents(dropin, 0755); + + if (!is_path(profile)) { + r = find_portable_profile(profile, unit_name, &profile_path); + if (r < 0) + return log_error_errno(r, "Failed to find portable profile %s: %m", profile); + profile = profile_path; + } + + r = copy_file(profile, dropin, 0, 0644, 0); + if (r < 0) + return log_error_errno(r, "Failed to copy: %m"); + } + + k = manager_load_startable_unit_or_warn(m, NULL, prepared, &units[count]); + if (k < 0) { + RET_GATHER(r, k); + continue; + } + + count++; + } + + for (size_t i = 0; i < count; i++) + RET_GATHER(r, offline_security_check(units[i], threshold, policy, pager_flags, json_format_flags)); + + return r; +} + +static int analyze_security(sd_bus *bus, + char **units, + JsonVariant *policy, + RuntimeScope scope, + bool check_man, + bool run_generators, + bool offline, + unsigned threshold, + const char *root, + const char *profile, + JsonFormatFlags json_format_flags, + PagerFlags pager_flags, + AnalyzeSecurityFlags flags) { + + _cleanup_(table_unrefp) Table *overview_table = NULL; + int ret = 0, r; + + assert(!!bus != offline); + + if (offline) + return offline_security_checks(units, policy, scope, check_man, run_generators, threshold, root, profile, pager_flags, json_format_flags); + + if (strv_length(units) != 1) { + overview_table = table_new("unit", "exposure", "predicate", "happy"); + if (!overview_table) + return log_oom(); + } + + if (strv_isempty(units)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_strv_free_ char **list = NULL; + size_t n = 0; + + r = bus_call_method( + bus, + bus_systemd_mgr, 
+ "ListUnits", + &error, + &reply, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to list units: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + UnitInfo info; + char *copy = NULL; + + r = bus_parse_unit_info(reply, &info); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (!endswith(info.id, ".service")) + continue; + + if (!GREEDY_REALLOC(list, n + 2)) + return log_oom(); + + copy = strdup(info.id); + if (!copy) + return log_oom(); + + list[n++] = copy; + list[n] = NULL; + } + + strv_sort(list); + + flags |= ANALYZE_SECURITY_SHORT|ANALYZE_SECURITY_ONLY_LOADED|ANALYZE_SECURITY_ONLY_LONG_RUNNING; + + STRV_FOREACH(i, list) { + r = analyze_security_one(bus, *i, overview_table, flags, threshold, policy, pager_flags, json_format_flags); + if (r < 0 && ret >= 0) + ret = r; + } + + } else + STRV_FOREACH(i, units) { + _cleanup_free_ char *mangled = NULL, *instance = NULL; + const char *name; + + if (!FLAGS_SET(flags, ANALYZE_SECURITY_SHORT) && i != units) { + putc('\n', stdout); + fflush(stdout); + } + + r = unit_name_mangle(*i, 0, &mangled); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name '%s': %m", *i); + + if (!endswith(mangled, ".service")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unit %s is not a service unit, refusing.", + *i); + + if (unit_name_is_valid(mangled, UNIT_NAME_TEMPLATE)) { + r = unit_name_replace_instance(mangled, "test-instance", &instance); + if (r < 0) + return log_oom(); + + name = instance; + } else + name = mangled; + + r = analyze_security_one(bus, name, overview_table, flags, threshold, policy, pager_flags, json_format_flags); + if (r < 0 && ret >= 0) + ret = r; + } + + if (overview_table) { + if (!FLAGS_SET(flags, ANALYZE_SECURITY_SHORT)) { + putc('\n', stdout); + fflush(stdout); + } + + r = table_print_with_pager(overview_table, 
json_format_flags, pager_flags, /* show_header= */true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + } + return ret; +} + +int verb_security(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *policy = NULL; + int r; + unsigned line, column; + + if (!arg_offline) { + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + } + + pager_open(arg_pager_flags); + + if (arg_security_policy) { + r = json_parse_file(/*f=*/ NULL, arg_security_policy, /*flags=*/ 0, &policy, &line, &column); + if (r < 0) + return log_error_errno(r, "Failed to parse '%s' at %u:%u: %m", arg_security_policy, line, column); + } else { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *pp = NULL; + + r = search_and_fopen_nulstr("systemd-analyze-security.policy", "re", /*root=*/ NULL, CONF_PATHS_NULSTR("systemd"), &f, &pp); + if (r < 0 && r != -ENOENT) + return r; + + if (f) { + r = json_parse_file(f, pp, /*flags=*/ 0, &policy, &line, &column); + if (r < 0) + return log_error_errno(r, "[%s:%u:%u] Failed to parse JSON policy: %m", pp, line, column); + } + } + + return analyze_security( + bus, + strv_skip(argv, 1), + policy, + arg_runtime_scope, + arg_man, + arg_generators, + arg_offline, + arg_threshold, + arg_root, + arg_profile, + arg_json_format_flags, + arg_pager_flags, + /*flags=*/ 0); +} diff --git a/src/analyze/analyze-security.h b/src/analyze/analyze-security.h new file mode 100644 index 0000000..82f4c7f --- /dev/null +++ b/src/analyze/analyze-security.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef enum AnalyzeSecurityFlags { + ANALYZE_SECURITY_SHORT = 1 << 0, + ANALYZE_SECURITY_ONLY_LOADED = 1 << 1, + ANALYZE_SECURITY_ONLY_LONG_RUNNING = 1 << 2, +} AnalyzeSecurityFlags; + +int verb_security(int argc, char *argv[], void *userdata); diff --git 
a/src/analyze/analyze-service-watchdogs.c b/src/analyze/analyze-service-watchdogs.c new file mode 100644 index 0000000..6535eb1 --- /dev/null +++ b/src/analyze/analyze-service-watchdogs.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-service-watchdogs.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "parse-util.h" + +int verb_service_watchdogs(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int b, r; + + assert(IN_SET(argc, 1, 2)); + assert(argv); + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + if (argc == 1) { + /* get ServiceWatchdogs */ + r = bus_get_property_trivial(bus, bus_systemd_mgr, "ServiceWatchdogs", &error, 'b', &b); + if (r < 0) + return log_error_errno(r, "Failed to get service-watchdog state: %s", bus_error_message(&error, r)); + + printf("%s\n", yes_no(!!b)); + + } else { + /* set ServiceWatchdogs */ + b = parse_boolean(argv[1]); + if (b < 0) + return log_error_errno(b, "Failed to parse service-watchdogs argument: %m"); + + r = bus_set_property(bus, bus_systemd_mgr, "ServiceWatchdogs", &error, "b", b); + if (r < 0) + return log_error_errno(r, "Failed to set service-watchdog state: %s", bus_error_message(&error, r)); + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-service-watchdogs.h b/src/analyze/analyze-service-watchdogs.h new file mode 100644 index 0000000..2f59f5a --- /dev/null +++ b/src/analyze/analyze-service-watchdogs.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_service_watchdogs(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-srk.c b/src/analyze/analyze-srk.c new file mode 100644 index 0000000..0e24b41 --- /dev/null +++ b/src/analyze/analyze-srk.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: 
/* Entry point of the "srk" verb: obtains the SRK public key via tpm2_get_srk(), marshals it and
 * writes the binary result to stdout.
 *
 * Fails if no SRK is stored yet, and refuses to write to a terminal. Returns EXIT_SUCCESS or a
 * negative errno-style value. Without TPM2 support compiled in, only an error is returned. */
int verb_srk(int argc, char *argv[], void *userdata) {
#if !HAVE_TPM2
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support not available.");
#else
        _cleanup_(tpm2_context_unrefp) Tpm2Context *ctx = NULL;
        _cleanup_(Esys_Freep) TPM2B_PUBLIC *srk = NULL;
        _cleanup_free_ void *blob = NULL;
        size_t blob_size = 0;
        int r;

        r = tpm2_context_new(/* device= */ NULL, &ctx);
        if (r < 0)
                return log_error_errno(r, "Failed to create TPM2 context: %m");

        r = tpm2_get_srk(
                        ctx,
                        /* session= */ NULL,
                        &srk,
                        /* ret_name= */ NULL,
                        /* ret_qname= */ NULL,
                        /* ret_handle= */ NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to get SRK: %m");
        if (r == 0)
                return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "No SRK stored so far.");

        r = tpm2_marshal_public(srk, &blob, &blob_size);
        if (r < 0)
                return log_error_errno(r, "Failed to marshal SRK: %m");

        /* The output is raw binary data, which must not end up on a terminal */
        if (isatty(STDOUT_FILENO))
                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                       "Refusing to write binary data to TTY, please redirect output to file.");

        if (fwrite(blob, 1, blob_size, stdout) != blob_size)
                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to write SRK to stdout: %m");

        r = fflush_and_check(stdout);
        if (r < 0)
                return log_error_errno(r, "Failed to write SRK to stdout: %m");

        return EXIT_SUCCESS;
#endif
}
+/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-syscall-filter.h" +#include "analyze.h" +#include "fd-util.h" +#include "fileio.h" +#include "nulstr-util.h" +#include "seccomp-util.h" +#include "set.h" +#include "strv.h" +#include "terminal-util.h" + +#if HAVE_SECCOMP +/* Collects the names found after the "syscalls:sys_enter_" prefix in the tracefs available_events file into a string set handed back through ret. Returns 0 on success, otherwise the logged error. */ +static int load_kernel_syscalls(Set **ret) { + _cleanup_set_free_ Set *syscalls = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + /* Let's read the available system calls from the list of available tracing events. Slightly dirty, + * but good enough for analysis purposes. */ + + f = fopen("/sys/kernel/tracing/available_events", "re"); + if (!f) { + /* We tried the non-debugfs mount point and that didn't work. If it wasn't mounted, maybe the + * old debugfs mount point works? */ + f = fopen("/sys/kernel/debug/tracing/available_events", "re"); + if (!f) + return log_full_errno(IN_SET(errno, EPERM, EACCES, ENOENT) ? LOG_DEBUG : LOG_WARNING, errno, + "Can't read open tracefs' available_events file: %m"); + } + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *e; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read system call list: %m"); + if (r == 0) + break; + + e = startswith(line, "syscalls:sys_enter_"); + if (!e) + continue; + + /* These are named differently inside the kernel than their external name for historical + * reasons. Let's hide them here.
*/ + if (STR_IN_SET(e, "newuname", "newfstat", "newstat", "newlstat", "sysctl")) + continue; + + r = set_put_strdup(&syscalls, e); + if (r < 0) + return log_error_errno(r, "Failed to add system call to list: %m"); + } + + *ret = TAKE_PTR(syscalls); + return 0; +} +/* Adds a copy of every entry of the filter set to the string set s, skipping entries that start with '@'. A NULL filter set is a no-op. */ +static int syscall_set_add(Set **s, const SyscallFilterSet *set) { + int r; + + assert(s); + + if (!set) + return 0; + + NULSTR_FOREACH(sc, set->value) { + if (sc[0] == '@') + continue; + + r = set_put_strdup(s, sc); + if (r < 0) + return r; + } + + return 0; +} +/* Removes every entry of the filter set from s and frees the removed strings, skipping entries that start with '@'. A NULL filter set is a no-op. */ +static void syscall_set_remove(Set *s, const SyscallFilterSet *set) { + if (!set) + return; + + NULSTR_FOREACH(sc, set->value) { + if (sc[0] == '@') + continue; + + free(set_remove(s, sc)); + } +} +/* Prints the name of the filter set, its help text and each of its entries; entries starting with '@' are underlined. */ +static void dump_syscall_filter(const SyscallFilterSet *set) { + printf("%s%s%s\n" + " # %s\n", + ansi_highlight(), + set->name, + ansi_normal(), + set->help); + + NULSTR_FOREACH(syscall, set->value) + printf(" %s%s%s\n", syscall[0] == '@' ? ansi_underline() : "", syscall, ansi_normal()); +} +/* Without arguments, dumps all filter sets and, unless arg_quiet is set, also the known-but-ungrouped system calls and the kernel-listed system calls that are in no group; with arguments, dumps only the named filter sets. */ +int verb_syscall_filters(int argc, char *argv[], void *userdata) { + bool first = true; + int r; + + pager_open(arg_pager_flags); + + if (strv_isempty(strv_skip(argv, 1))) { + _cleanup_set_free_ Set *kernel = NULL, *known = NULL; + int k = 0; /* explicit initialization to appease gcc */ + + r = syscall_set_add(&known, syscall_filter_sets + SYSCALL_FILTER_SET_KNOWN); + if (r < 0) + return log_error_errno(r, "Failed to prepare set of known system calls: %m"); + + if (!arg_quiet) + k = load_kernel_syscalls(&kernel); + + for (int i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) { + const SyscallFilterSet *set = syscall_filter_sets + i; + if (!first) + puts(""); + + dump_syscall_filter(set); + syscall_set_remove(kernel, set); + if (i != SYSCALL_FILTER_SET_KNOWN) + syscall_set_remove(known, set); + first = false; + } + + if (arg_quiet) /* Let's not show the extra stuff in quiet mode */ + return 0; + + if (!set_isempty(known)) { + _cleanup_free_ char **l = NULL; + +
printf("\n" + "# %sUngrouped System Calls%s (known but not included in any of the groups except @known):\n", + ansi_highlight(), ansi_normal()); + + l = set_get_strv(known); + if (!l) + return log_oom(); + + strv_sort(l); + + STRV_FOREACH(syscall, l) + printf("# %s\n", *syscall); + } + + if (k < 0) { + fputc('\n', stdout); + fflush(stdout); + if (!arg_quiet) + log_notice_errno(k, "# Not showing unlisted system calls, couldn't retrieve kernel system call list: %m"); + } else if (!set_isempty(kernel)) { + _cleanup_free_ char **l = NULL; + + printf("\n" + "# %sUnlisted System Calls%s (supported by the local kernel, but not included in any of the groups listed above):\n", + ansi_highlight(), ansi_normal()); + + l = set_get_strv(kernel); + if (!l) + return log_oom(); + + strv_sort(l); + + STRV_FOREACH(syscall, l) + printf("# %s\n", *syscall); + } + } else + STRV_FOREACH(name, strv_skip(argv, 1)) { + const SyscallFilterSet *set; + + if (!first) + puts(""); + + set = syscall_filter_set_find(*name); + if (!set) { + /* make sure the error appears below normal output */ + fflush(stdout); + + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Filter set \"%s\" not found.", *name); + } + + dump_syscall_filter(set); + first = false; + } + + return EXIT_SUCCESS; +} + +#else +int verb_syscall_filters(int argc, char *argv[], void *userdata) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Not compiled with syscall filters, sorry."); +} +#endif diff --git a/src/analyze/analyze-syscall-filter.h b/src/analyze/analyze-syscall-filter.h new file mode 100644 index 0000000..3a1af85 --- /dev/null +++ b/src/analyze/analyze-syscall-filter.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_syscall_filters(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-time-data.c b/src/analyze/analyze-time-data.c new file mode 100644 index 0000000..741cab3 --- /dev/null +++ b/src/analyze/analyze-time-data.c @@ -0,0 +1,331 @@ +/*
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-time-data.h" +#include "analyze.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-unit-util.h" +#include "memory-util.h" +#include "special.h" +#include "strv.h" + +static void subtract_timestamp(usec_t *a, usec_t b) { + assert(a); + + if (*a > 0) { + assert(*a >= b); + *a -= b; + } +} + +static int log_not_finished(usec_t finish_time) { + return log_error_errno(SYNTHETIC_ERRNO(EINPROGRESS), + "Bootup is not yet finished (org.freedesktop.systemd1.Manager.FinishTimestampMonotonic=%"PRIu64").\n" + "Please try again later.\n" + "Hint: Use 'systemctl%s list-jobs' to see active jobs", + finish_time, + arg_runtime_scope == RUNTIME_SCOPE_SYSTEM ? "" : " --user"); +} + +int acquire_boot_times(sd_bus *bus, bool require_finished, BootTimes **ret) { + static const struct bus_properties_map property_map[] = { + { "FirmwareTimestampMonotonic", "t", NULL, offsetof(BootTimes, firmware_time) }, + { "LoaderTimestampMonotonic", "t", NULL, offsetof(BootTimes, loader_time) }, + { "KernelTimestamp", "t", NULL, offsetof(BootTimes, kernel_time) }, + { "InitRDTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_time) }, + { "UserspaceTimestampMonotonic", "t", NULL, offsetof(BootTimes, userspace_time) }, + { "FinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, finish_time) }, + { "SecurityStartTimestampMonotonic", "t", NULL, offsetof(BootTimes, security_start_time) }, + { "SecurityFinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, security_finish_time) }, + { "GeneratorsStartTimestampMonotonic", "t", NULL, offsetof(BootTimes, generators_start_time) }, + { "GeneratorsFinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, generators_finish_time) }, + { "UnitsLoadStartTimestampMonotonic", "t", NULL, offsetof(BootTimes, unitsload_start_time) }, + { "UnitsLoadFinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, unitsload_finish_time) }, + { 
"InitRDSecurityStartTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_security_start_time) }, + { "InitRDSecurityFinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_security_finish_time) }, + { "InitRDGeneratorsStartTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_generators_start_time) }, + { "InitRDGeneratorsFinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_generators_finish_time) }, + { "InitRDUnitsLoadStartTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_unitsload_start_time) }, + { "InitRDUnitsLoadFinishTimestampMonotonic", "t", NULL, offsetof(BootTimes, initrd_unitsload_finish_time) }, + {}, + }; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + static BootTimes times; + static bool cached = false; + int r; + + if (cached) { + if (require_finished && times.finish_time <= 0) + return log_not_finished(times.finish_time); + + if (ret) + *ret = × + return 0; + } + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + property_map, + BUS_MAP_STRDUP, + &error, + NULL, + ×); + if (r < 0) + return log_error_errno(r, "Failed to get timestamp properties: %s", bus_error_message(&error, r)); + + if (require_finished && times.finish_time <= 0) + return log_not_finished(times.finish_time); + + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM && times.security_start_time > 0) { + /* security_start_time is set when systemd is not running under container environment. */ + if (times.initrd_time > 0) + times.kernel_done_time = times.initrd_time; + else + times.kernel_done_time = times.userspace_time; + } else { + /* + * User-instance-specific or container-system-specific timestamps processing + * (see comment to reverse_offset in BootTimes). 
+ */ + times.reverse_offset = times.userspace_time; + + times.firmware_time = times.loader_time = times.kernel_time = times.initrd_time = + times.userspace_time = times.security_start_time = times.security_finish_time = 0; + + if (times.finish_time > 0) + subtract_timestamp(×.finish_time, times.reverse_offset); + + subtract_timestamp(×.generators_start_time, times.reverse_offset); + subtract_timestamp(×.generators_finish_time, times.reverse_offset); + + subtract_timestamp(×.unitsload_start_time, times.reverse_offset); + subtract_timestamp(×.unitsload_finish_time, times.reverse_offset); + } + + cached = true; + + if (ret) + *ret = × + return 0; +} + +static int bus_get_uint64_property(sd_bus *bus, const char *path, const char *interface, const char *property, uint64_t *val) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(property); + assert(val); + + r = sd_bus_get_property_trivial( + bus, + "org.freedesktop.systemd1", + path, + interface, + property, + &error, + 't', val); + if (r < 0) + return log_error_errno(r, "Failed to parse reply: %s", bus_error_message(&error, r)); + + return 0; +} + +int pretty_boot_time(sd_bus *bus, char **ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *path = NULL, *unit_id = NULL, *text = NULL; + usec_t activated_time = USEC_INFINITY; + BootTimes *t; + int r; + + r = acquire_boot_times(bus, /* require_finished = */ true, &t); + if (r < 0) + return r; + + path = unit_dbus_path_from_name(SPECIAL_DEFAULT_TARGET); + if (!path) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "Id", + &error, + &unit_id); + if (r < 0) + log_warning_errno(r, "default.target doesn't seem to exist, ignoring: %s", bus_error_message(&error, r)); + + r = bus_get_uint64_property(bus, path, + "org.freedesktop.systemd1.Unit", + 
"ActiveEnterTimestampMonotonic", + &activated_time); + if (r < 0) + log_warning_errno(r, "Could not get time to reach default.target, ignoring: %m"); + + text = strdup("Startup finished in "); + if (!text) + return log_oom(); + + if (timestamp_is_set(t->firmware_time) && !strextend(&text, FORMAT_TIMESPAN(t->firmware_time - t->loader_time, USEC_PER_MSEC), " (firmware) + ")) + return log_oom(); + if (timestamp_is_set(t->loader_time) && !strextend(&text, FORMAT_TIMESPAN(t->loader_time, USEC_PER_MSEC), " (loader) + ")) + return log_oom(); + if (timestamp_is_set(t->kernel_done_time) && !strextend(&text, FORMAT_TIMESPAN(t->kernel_done_time, USEC_PER_MSEC), " (kernel) + ")) + return log_oom(); + if (timestamp_is_set(t->initrd_time) && !strextend(&text, FORMAT_TIMESPAN(t->userspace_time - t->initrd_time, USEC_PER_MSEC), " (initrd) + ")) + return log_oom(); + + if (!strextend(&text, FORMAT_TIMESPAN(t->finish_time - t->userspace_time, USEC_PER_MSEC), " (userspace) ")) + return log_oom(); + + if (timestamp_is_set(t->kernel_done_time)) + if (!strextend(&text, "= ", FORMAT_TIMESPAN(t->firmware_time + t->finish_time, USEC_PER_MSEC), " ")) + return log_oom(); + + if (unit_id && timestamp_is_set(activated_time)) { + usec_t base = timestamp_is_set(t->userspace_time) ? 
t->userspace_time : t->reverse_offset; + + if (!strextend(&text, "\n", unit_id, " reached after ", FORMAT_TIMESPAN(activated_time - base, USEC_PER_MSEC), " in userspace.")) + return log_oom(); + + } else if (unit_id && activated_time == 0) { + + if (!strextend(&text, "\n", unit_id, " was never reached.")) + return log_oom(); + + } else if (unit_id && activated_time == USEC_INFINITY) { + + if (!strextend(&text, "\nCould not get time to reach ", unit_id, ".")) + return log_oom(); + + } else if (!unit_id) { + + if (!strextend(&text, "\ncould not find default.target.")) + return log_oom(); + } + + *ret = TAKE_PTR(text); + return 0; +} + +void unit_times_clear(UnitTimes *t) { + if (!t) + return; + + FOREACH_ARRAY(d, t->deps, ELEMENTSOF(t->deps)) + *d = strv_free(*d); + + t->name = mfree(t->name); +} + +UnitTimes* unit_times_free_array(UnitTimes *t) { + if (!t) + return NULL; + + for (UnitTimes *p = t; p->has_data; p++) + unit_times_clear(p); + + return mfree(t); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(UnitTimes*, unit_times_clear, NULL); + +int acquire_time_data(sd_bus *bus, bool require_finished, UnitTimes **out) { + static const struct bus_properties_map property_map[] = { + { "InactiveExitTimestampMonotonic", "t", NULL, offsetof(UnitTimes, activating) }, + { "ActiveEnterTimestampMonotonic", "t", NULL, offsetof(UnitTimes, activated) }, + { "ActiveExitTimestampMonotonic", "t", NULL, offsetof(UnitTimes, deactivating) }, + { "InactiveEnterTimestampMonotonic", "t", NULL, offsetof(UnitTimes, deactivated) }, + { "After", "as", NULL, offsetof(UnitTimes, deps[UNIT_AFTER]) }, + { "Before", "as", NULL, offsetof(UnitTimes, deps[UNIT_BEFORE]) }, + { "Requires", "as", NULL, offsetof(UnitTimes, deps[UNIT_REQUIRES]) }, + { "Requisite", "as", NULL, offsetof(UnitTimes, deps[UNIT_REQUISITE]) }, + { "Wants", "as", NULL, offsetof(UnitTimes, deps[UNIT_WANTS]) }, + { "Conflicts", "as", NULL, offsetof(UnitTimes, deps[UNIT_CONFLICTS]) }, + { "Upholds", "as", NULL, offsetof(UnitTimes, 
deps[UNIT_UPHOLDS]) }, + {}, + }; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(unit_times_free_arrayp) UnitTimes *unit_times = NULL; + BootTimes *boot_times; + size_t c = 0; + UnitInfo u; + int r; + + r = acquire_boot_times(bus, require_finished, &boot_times); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_systemd_mgr, "ListUnits", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list units: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = bus_parse_unit_info(reply, &u)) > 0) { + _cleanup_(unit_times_clearp) UnitTimes *t = NULL; + + if (!GREEDY_REALLOC0(unit_times, c + 2)) + return log_oom(); + + /* t initially has pointers zeroed by the allocation, and unit_times_clearp will have zeroed + * them if the entry is being reused. */ + t = &unit_times[c]; + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + u.unit_path, + property_map, + BUS_MAP_STRDUP, + &error, + NULL, + t); + if (r < 0) + return log_error_errno(r, "Failed to get timestamp properties of unit %s: %s", + u.id, bus_error_message(&error, r)); + + subtract_timestamp(&t->activating, boot_times->reverse_offset); + subtract_timestamp(&t->activated, boot_times->reverse_offset); + subtract_timestamp(&t->deactivating, boot_times->reverse_offset); + subtract_timestamp(&t->deactivated, boot_times->reverse_offset); + + if (t->activated >= t->activating) + t->time = t->activated - t->activating; + else if (t->deactivated >= t->activating) + t->time = t->deactivated - t->activating; + else + t->time = 0; + + if (t->activating == 0) + continue; + + t->name = strdup(u.id); + if (!t->name) + return log_oom(); + + t->has_data = true; + /* Prevent destructor from running on t reference. 
*/ + TAKE_PTR(t); + c++; + } + if (r < 0) + return bus_log_parse_error(r); + + *out = TAKE_PTR(unit_times); + return c; +} diff --git a/src/analyze/analyze-time-data.h b/src/analyze/analyze-time-data.h new file mode 100644 index 0000000..9049d87 --- /dev/null +++ b/src/analyze/analyze-time-data.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "time-util.h" +#include "unit-def.h" + +typedef struct BootTimes { + usec_t firmware_time; + usec_t loader_time; + usec_t kernel_time; + usec_t kernel_done_time; + usec_t initrd_time; + usec_t userspace_time; + usec_t finish_time; + usec_t security_start_time; + usec_t security_finish_time; + usec_t generators_start_time; + usec_t generators_finish_time; + usec_t unitsload_start_time; + usec_t unitsload_finish_time; + usec_t initrd_security_start_time; + usec_t initrd_security_finish_time; + usec_t initrd_generators_start_time; + usec_t initrd_generators_finish_time; + usec_t initrd_unitsload_start_time; + usec_t initrd_unitsload_finish_time; + + /* + * If we're analyzing the user instance, all timestamps will be offset by its own start-up timestamp, + * which may be arbitrarily big. With "plot", this causes arbitrarily wide output SVG files which + * almost completely consist of empty space. Thus we cancel out this offset. + * + * This offset is subtracted from times above by acquire_boot_times(), but it still needs to be + * subtracted from unit-specific timestamps (so it is stored here for reference). 
+ */ + usec_t reverse_offset; +} BootTimes; + +typedef struct UnitTimes { + bool has_data; + char *name; + usec_t activating; + usec_t activated; + usec_t deactivated; + usec_t deactivating; + usec_t time; + char **deps[_UNIT_DEPENDENCY_MAX]; +} UnitTimes; + +int acquire_boot_times(sd_bus *bus, bool require_finished, BootTimes **ret); +int pretty_boot_time(sd_bus *bus, char **ret); + +void unit_times_clear(UnitTimes *t); +UnitTimes* unit_times_free_array(UnitTimes *t); +DEFINE_TRIVIAL_CLEANUP_FUNC(UnitTimes*, unit_times_free_array); + +int acquire_time_data(sd_bus *bus, bool require_finished, UnitTimes **out); diff --git a/src/analyze/analyze-time.c b/src/analyze/analyze-time.c new file mode 100644 index 0000000..c233b1f --- /dev/null +++ b/src/analyze/analyze-time.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-time.h" +#include "analyze-time-data.h" + +int verb_time(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *buf = NULL; + int r; + + r = acquire_bus(&bus, NULL); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + r = pretty_boot_time(bus, &buf); + if (r < 0) + return r; + + puts(buf); + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-time.h b/src/analyze/analyze-time.h new file mode 100644 index 0000000..a8f8575 --- /dev/null +++ b/src/analyze/analyze-time.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_time(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-timespan.c b/src/analyze/analyze-timespan.c new file mode 100644 index 0000000..3fdf0f9 --- /dev/null +++ b/src/analyze/analyze-timespan.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-timespan.h" +#include "calendarspec.h" +#include "format-table.h" +#include "glyph-util.h" +#include "strv.h" +#include 
"terminal-util.h" + +int verb_timespan(int argc, char *argv[], void *userdata) { + STRV_FOREACH(input_timespan, strv_skip(argv, 1)) { + _cleanup_(table_unrefp) Table *table = NULL; + usec_t output_usecs; + TableCell *cell; + int r; + + r = parse_time(*input_timespan, &output_usecs, USEC_PER_SEC); + if (r < 0) { + log_error_errno(r, "Failed to parse time span '%s': %m", *input_timespan); + time_parsing_hint(*input_timespan, /* calendar= */ true, /* timestamp= */ true, /* timespan= */ false); + return r; + } + + table = table_new_vertical(); + if (!table) + return log_oom(); + + assert_se(cell = table_get_cell(table, 0, 0)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + assert_se(cell = table_get_cell(table, 0, 1)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + r = table_add_many(table, + TABLE_FIELD, "Original", + TABLE_STRING, *input_timespan); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf_full(table, NULL, TABLE_FIELD, "%ss", special_glyph(SPECIAL_GLYPH_MU)); + if (r < 0) + return table_log_add_error(r); + + r = table_add_many(table, + TABLE_UINT64, output_usecs, + TABLE_FIELD, "Human", + TABLE_TIMESPAN, output_usecs, + TABLE_SET_COLOR, ansi_highlight()); + if (r < 0) + return table_log_add_error(r); + + r = table_print(table, NULL); + if (r < 0) + return r; + + if (input_timespan[1]) + putchar('\n'); + } + + return 0; +} diff --git a/src/analyze/analyze-timespan.h b/src/analyze/analyze-timespan.h new file mode 100644 index 0000000..46d2295 --- /dev/null +++ b/src/analyze/analyze-timespan.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_timespan(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-timestamp.c b/src/analyze/analyze-timestamp.c new file mode 100644 index 0000000..97de438 --- /dev/null +++ b/src/analyze/analyze-timestamp.c @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-timestamp.h" +#include "format-table.h" +#include "terminal-util.h" +/* Parses p as a timestamp and prints a vertical table with its original form, its normalized form, the UTC form (only when not already in the UTC timezone), the UNIX seconds and the time relative to now. */ +static int test_timestamp_one(const char *p) { + _cleanup_(table_unrefp) Table *table = NULL; + TableCell *cell; + usec_t usec; + int r; + + r = parse_timestamp(p, &usec); + if (r < 0) { + log_error_errno(r, "Failed to parse \"%s\": %m", p); + time_parsing_hint(p, /* calendar= */ true, /* timestamp= */ false, /* timespan= */ true); + return r; + } + + table = table_new_vertical(); + if (!table) + return log_oom(); + + assert_se(cell = table_get_cell(table, 0, 0)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + assert_se(cell = table_get_cell(table, 0, 1)); + r = table_set_ellipsize_percent(table, cell, 100); + if (r < 0) + return r; + + r = table_add_many(table, + TABLE_FIELD, "Original form", + TABLE_STRING, p, + TABLE_FIELD, "Normalized form", + TABLE_TIMESTAMP, usec, + TABLE_SET_COLOR, ansi_highlight_blue()); + if (r < 0) + return table_log_add_error(r); + + if (!in_utc_timezone()) { + r = table_add_many(table, + TABLE_FIELD, "(in UTC)", + TABLE_TIMESTAMP_UTC, usec); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_cell(table, NULL, TABLE_FIELD, "UNIX seconds"); + if (r < 0) + return table_log_add_error(r); + + if (usec % USEC_PER_SEC == 0) + r = table_add_cell_stringf(table, NULL, "@%"PRI_USEC, + usec / USEC_PER_SEC); + else + r = table_add_cell_stringf(table, NULL, "@%"PRI_USEC".%06"PRI_USEC"", + usec / USEC_PER_SEC, + usec % USEC_PER_SEC); + if (r < 0) + return r; + + r = table_add_many(table, + TABLE_FIELD, "From now", + TABLE_TIMESTAMP_RELATIVE, usec); + if (r < 0) + return table_log_add_error(r); + + return table_print(table, NULL); +} +/* Runs test_timestamp_one() on every argument, printing a blank line between successive arguments; returns the first negative result encountered, otherwise 0. */ +int verb_timestamp(int argc, char *argv[], void *userdata) { + int r = 0; + + STRV_FOREACH(p, strv_skip(argv, 1)) { + int k; + + k = test_timestamp_one(*p); + if (r == 0 && k < 0) + r = k; + + if (p[1]) + putchar('\n'); + } + + return
r; +} diff --git a/src/analyze/analyze-timestamp.h b/src/analyze/analyze-timestamp.h new file mode 100644 index 0000000..43e4b57 --- /dev/null +++ b/src/analyze/analyze-timestamp.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_timestamp(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-unit-files.c b/src/analyze/analyze-unit-files.c new file mode 100644 index 0000000..d9b3313 --- /dev/null +++ b/src/analyze/analyze-unit-files.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-unit-files.h" +#include "path-lookup.h" +#include "strv.h" +/* Returns true as soon as strv_fnmatch_or_empty() accepts any string of strv against patterns, false otherwise. */ +static bool strv_fnmatch_strv_or_empty(char* const* patterns, char **strv, int flags) { + STRV_FOREACH(s, strv) + if (strv_fnmatch_or_empty(patterns, *s, flags)) + return true; + + return false; +} +/* Builds the unit id and unit name maps for the current runtime scope and prints the entries for which strv_fnmatch_or_empty() accepts the key or the value against the command line patterns. NOTE(review): the strv_join() result j is handed to printf() without a NULL check. */ +int verb_unit_files(int argc, char *argv[], void *userdata) { + _cleanup_hashmap_free_ Hashmap *unit_ids = NULL, *unit_names = NULL; + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + char **patterns = strv_skip(argv, 1); + const char *k, *dst; + char **v; + int r; + + r = lookup_paths_init_or_warn(&lp, arg_runtime_scope, 0, NULL); + if (r < 0) + return r; + + r = unit_file_build_name_map(&lp, NULL, &unit_ids, &unit_names, NULL); + if (r < 0) + return log_error_errno(r, "unit_file_build_name_map() failed: %m"); + + HASHMAP_FOREACH_KEY(dst, k, unit_ids) { + if (!strv_fnmatch_or_empty(patterns, k, FNM_NOESCAPE) && + !strv_fnmatch_or_empty(patterns, dst, FNM_NOESCAPE)) + continue; + + printf("ids: %s → %s\n", k, dst); + } + + HASHMAP_FOREACH_KEY(v, k, unit_names) { + if (!strv_fnmatch_or_empty(patterns, k, FNM_NOESCAPE) && + !strv_fnmatch_strv_or_empty(patterns, v, FNM_NOESCAPE)) + continue; + + _cleanup_free_ char *j = strv_join(v, ", "); + printf("aliases: %s ← %s\n", k, j); + } + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-unit-files.h b/src/analyze/analyze-unit-files.h new file mode
100644 index 0000000..c193fd8 --- /dev/null +++ b/src/analyze/analyze-unit-files.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_unit_files(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-unit-paths.c b/src/analyze/analyze-unit-paths.c new file mode 100644 index 0000000..bb00a4f --- /dev/null +++ b/src/analyze/analyze-unit-paths.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-unit-paths.h" +#include "path-lookup.h" +#include "strv.h" + +int verb_unit_paths(int argc, char *argv[], void *userdata) { + _cleanup_(lookup_paths_free) LookupPaths paths = {}; + int r; + + r = lookup_paths_init_or_warn(&paths, arg_runtime_scope, 0, NULL); + if (r < 0) + return r; + + STRV_FOREACH(p, paths.search_path) + puts(*p); + + return EXIT_SUCCESS; +} diff --git a/src/analyze/analyze-unit-paths.h b/src/analyze/analyze-unit-paths.h new file mode 100644 index 0000000..b8d46e8 --- /dev/null +++ b/src/analyze/analyze-unit-paths.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_unit_paths(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze-verify-util.c b/src/analyze/analyze-verify-util.c new file mode 100644 index 0000000..6fbd6fa --- /dev/null +++ b/src/analyze/analyze-verify-util.c @@ -0,0 +1,376 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "all-units.h" +#include "alloc-util.h" +#include "analyze-verify-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "log.h" +#include "manager.h" +#include "pager.h" +#include "path-util.h" +#include "string-table.h" +#include "strv.h" +#include "unit-name.h" +#include "unit-serialize.h" + +static void log_syntax_callback(const char *unit, int level, void *userdata) { + Set **s = ASSERT_PTR(userdata); + int r; + + assert(unit); + + if (level > LOG_WARNING) + return; + + if (*s == POINTER_MAX) + return; + + 
r = set_put_strdup(s, unit); + if (r < 0) { + set_free_free(*s); + *s = POINTER_MAX; + } +} + +int verify_prepare_filename(const char *filename, char **ret) { + _cleanup_free_ char *abspath = NULL, *name = NULL, *dir = NULL, *with_instance = NULL; + char *c; + int r; + + assert(filename); + assert(ret); + + r = path_make_absolute_cwd(filename, &abspath); + if (r < 0) + return r; + + r = path_extract_filename(abspath, &name); + if (r < 0) + return r; + + if (!unit_name_is_valid(name, UNIT_NAME_ANY)) + return -EINVAL; + + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + r = unit_name_replace_instance(name, "i", &with_instance); + if (r < 0) + return r; + } + + r = path_extract_directory(abspath, &dir); + if (r < 0) + return r; + + c = path_join(dir, with_instance ?: name); + if (!c) + return -ENOMEM; + + *ret = c; + return 0; +} + +static int find_unit_directory(const char *p, char **ret) { + _cleanup_free_ char *a = NULL, *u = NULL, *t = NULL, *d = NULL; + int r; + + assert(p); + assert(ret); + + r = path_make_absolute_cwd(p, &a); + if (r < 0) + return r; + + if (access(a, F_OK) >= 0) { + r = path_extract_directory(a, &d); + if (r < 0) + return r; + + *ret = TAKE_PTR(d); + return 0; + } + + r = path_extract_filename(a, &u); + if (r < 0) + return r; + + if (!unit_name_is_valid(u, UNIT_NAME_INSTANCE)) + return -ENOENT; + + /* If the specified unit is an instance of a template unit, then let's try to find the template unit. 
*/ + r = unit_name_template(u, &t); + if (r < 0) + return r; + + r = path_extract_directory(a, &d); + if (r < 0) + return r; + + free(a); + a = path_join(d, t); + if (!a) + return -ENOMEM; + + if (access(a, F_OK) < 0) + return -errno; + + *ret = TAKE_PTR(d); + return 0; +} + +int verify_set_unit_path(char **filenames) { + _cleanup_strv_free_ char **ans = NULL; + _cleanup_free_ char *joined = NULL; + const char *old; + int r; + + STRV_FOREACH(filename, filenames) { + _cleanup_free_ char *t = NULL; + + r = find_unit_directory(*filename, &t); + if (r == -ENOMEM) + return r; + if (r < 0) + continue; + + r = strv_consume(&ans, TAKE_PTR(t)); + if (r < 0) + return r; + } + + if (strv_isempty(ans)) + return 0; + + joined = strv_join(strv_uniq(ans), ":"); + if (!joined) + return -ENOMEM; + + /* First, prepend our directories. Second, if some path was specified, use that, and + * otherwise use the defaults. Any duplicates will be filtered out in path-lookup.c. + * Treat explicit empty path to mean that nothing should be appended. 
*/ + old = getenv("SYSTEMD_UNIT_PATH"); + if (!streq_ptr(old, "") && + !strextend_with_separator(&joined, ":", old ?: "")) + return -ENOMEM; + + assert_se(set_unit_path(joined) >= 0); + return 0; +} + +static int verify_socket(Unit *u) { + Unit *service; + int r; + + assert(u); + + if (u->type != UNIT_SOCKET) + return 0; + + r = socket_load_service_unit(SOCKET(u), -1, &service); + if (r < 0) + return log_unit_error_errno(u, r, "service unit for the socket cannot be loaded: %m"); + + if (service->load_state != UNIT_LOADED) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), + "service %s not loaded, socket cannot be started.", service->id); + + log_unit_debug(u, "using service unit %s.", service->id); + return 0; +} + +int verify_executable(Unit *u, const ExecCommand *exec, const char *root) { + int r; + + if (!exec) + return 0; + + if (exec->flags & EXEC_COMMAND_IGNORE_FAILURE) + return 0; + + r = find_executable_full(exec->path, root, NULL, false, NULL, NULL); + if (r < 0) + return log_unit_error_errno(u, r, "Command %s is not executable: %m", exec->path); + + return 0; +} + +static int verify_executables(Unit *u, const char *root) { + int r = 0; + + assert(u); + + ExecCommand *exec = + u->type == UNIT_SOCKET ? SOCKET(u)->control_command : + u->type == UNIT_MOUNT ? MOUNT(u)->control_command : + u->type == UNIT_SWAP ? 
SWAP(u)->control_command : NULL; + RET_GATHER(r, verify_executable(u, exec, root)); + + if (u->type == UNIT_SERVICE) + FOREACH_ARRAY(i, SERVICE(u)->exec_command, ELEMENTSOF(SERVICE(u)->exec_command)) + RET_GATHER(r, verify_executable(u, *i, root)); + + if (u->type == UNIT_SOCKET) + FOREACH_ARRAY(i, SOCKET(u)->exec_command, ELEMENTSOF(SOCKET(u)->exec_command)) + RET_GATHER(r, verify_executable(u, *i, root)); + + return r; +} + +static int verify_documentation(Unit *u, bool check_man) { + int r = 0, k; + + STRV_FOREACH(p, u->documentation) { + log_unit_debug(u, "Found documentation item: %s", *p); + + if (check_man && startswith(*p, "man:")) { + k = show_man_page(*p + 4, true); + if (k != 0) { + if (k < 0) + log_unit_error_errno(u, k, "Can't show %s: %m", *p + 4); + else { + log_unit_error(u, "Command 'man %s' failed with code %d", *p + 4, k); + k = -ENOEXEC; + } + if (r == 0) + r = k; + } + } + } + + /* Check remote URLs? */ + + return r; +} + +static int verify_unit(Unit *u, bool check_man, const char *root) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(u); + + if (DEBUG_LOGGING) + unit_dump(u, stdout, "\t"); + + log_unit_debug(u, "Creating %s/start job", u->id); + r = manager_add_job(u->manager, JOB_START, u, JOB_REPLACE, NULL, &error, NULL); + if (r < 0) + log_unit_error_errno(u, r, "Failed to create %s/start: %s", u->id, bus_error_message(&error, r)); + + RET_GATHER(r, verify_socket(u)); + RET_GATHER(r, verify_executables(u, root)); + RET_GATHER(r, verify_documentation(u, check_man)); + + return r; +} + +static void set_destroy_ignore_pointer_max(Set **s) { + if (*s == POINTER_MAX) + return; + set_free_free(*s); +} + +int verify_units( + char **filenames, + RuntimeScope scope, + bool check_man, + bool run_generators, + RecursiveErrors recursive_errors, + const char *root) { + + const ManagerTestRunFlags flags = + MANAGER_TEST_RUN_MINIMAL | + MANAGER_TEST_RUN_ENV_GENERATORS | + MANAGER_TEST_DONT_OPEN_EXECUTOR | + 
(recursive_errors == RECURSIVE_ERRORS_NO) * MANAGER_TEST_RUN_IGNORE_DEPENDENCIES | + run_generators * MANAGER_TEST_RUN_GENERATORS; + + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_(set_destroy_ignore_pointer_max) Set *s = NULL; + _unused_ _cleanup_(clear_log_syntax_callback) dummy_t dummy; + Unit *units[strv_length(filenames)]; + int r, k, count = 0; + + if (strv_isempty(filenames)) + return 0; + + /* Allow systemd-analyze to hook in a callback function so that it can get + * all the required log data from the function itself without having to rely + * on a global set variable for the same */ + set_log_syntax_callback(log_syntax_callback, &s); + + /* set the path */ + r = verify_set_unit_path(filenames); + if (r < 0) + return log_error_errno(r, "Failed to set unit load path: %m"); + + r = manager_new(scope, flags, &m); + if (r < 0) + return log_error_errno(r, "Failed to initialize manager: %m"); + + log_debug("Starting manager..."); + + r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, root); + if (r < 0) + return r; + + manager_clear_jobs(m); + + log_debug("Loading remaining units from the command line..."); + + STRV_FOREACH(filename, filenames) { + _cleanup_free_ char *prepared = NULL; + + log_debug("Handling %s...", *filename); + + k = verify_prepare_filename(*filename, &prepared); + if (k < 0) { + log_error_errno(k, "Failed to prepare filename %s: %m", *filename); + RET_GATHER(r, k); + continue; + } + + k = manager_load_startable_unit_or_warn(m, NULL, prepared, &units[count]); + if (k < 0) { + RET_GATHER(r, k); + continue; + } + + count++; + } + + FOREACH_ARRAY(i, units, count) + RET_GATHER(r, verify_unit(*i, check_man, root)); + + if (s == POINTER_MAX) + return log_oom(); + + if (set_isempty(s) || r != 0) + return r; + + /* If all previous verifications succeeded, then either the recursive parsing of all the + * associated dependencies with RECURSIVE_ERRORS_YES or the parsing of the specified unit file + * with RECURSIVE_ERRORS_NO 
must have yielded a syntax warning and hence, a non-empty set. */ + if (IN_SET(recursive_errors, RECURSIVE_ERRORS_YES, RECURSIVE_ERRORS_NO)) + return -ENOTRECOVERABLE; + + /* If all previous verifications succeeded, then the non-empty set could have resulted from + * a syntax warning encountered during the recursive parsing of the specified unit file and + * its direct dependencies. Hence, search for any of the filenames in the set and if found, + * return a non-zero process exit status. */ + if (recursive_errors == RECURSIVE_ERRORS_ONE) + STRV_FOREACH(filename, filenames) + if (set_contains(s, basename(*filename))) + return -ENOTRECOVERABLE; + + return 0; +} + +static const char* const recursive_errors_table[_RECURSIVE_ERRORS_MAX] = { + [RECURSIVE_ERRORS_NO] = "no", + [RECURSIVE_ERRORS_YES] = "yes", + [RECURSIVE_ERRORS_ONE] = "one", +}; + +DEFINE_STRING_TABLE_LOOKUP(recursive_errors, RecursiveErrors); diff --git a/src/analyze/analyze-verify-util.h b/src/analyze/analyze-verify-util.h new file mode 100644 index 0000000..0834c59 --- /dev/null +++ b/src/analyze/analyze-verify-util.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "execute.h" +#include "path-lookup.h" + +typedef enum RecursiveErrors { + RECURSIVE_ERRORS_YES, /* Look for errors in all associated units */ + RECURSIVE_ERRORS_NO, /* Don't look for errors in any but the selected unit */ + RECURSIVE_ERRORS_ONE, /* Look for errors in the selected unit and its direct dependencies */ + _RECURSIVE_ERRORS_MAX, + _RECURSIVE_ERRORS_INVALID = -EINVAL, +} RecursiveErrors; + +int verify_set_unit_path(char **filenames); +int verify_prepare_filename(const char *filename, char **ret); +int verify_executable(Unit *u, const ExecCommand *exec, const char *root); +int verify_units(char **filenames, RuntimeScope scope, bool check_man, bool run_generators, RecursiveErrors recursive_errors, const char *root); + +const char* recursive_errors_to_string(RecursiveErrors i) 
_const_; +RecursiveErrors recursive_errors_from_string(const char *s) _pure_; diff --git a/src/analyze/analyze-verify.c b/src/analyze/analyze-verify.c new file mode 100644 index 0000000..3b463f2 --- /dev/null +++ b/src/analyze/analyze-verify.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze.h" +#include "analyze-verify.h" +#include "analyze-verify-util.h" +#include "copy.h" +#include "rm-rf.h" +#include "tmpfile-util.h" + +static int process_aliases(char *argv[], char *tempdir, char ***ret) { + _cleanup_strv_free_ char **filenames = NULL; + int r; + + assert(argv); + assert(tempdir); + assert(ret); + + STRV_FOREACH(filename, strv_skip(argv, 1)) { + _cleanup_free_ char *src = NULL, *dst = NULL, *base = NULL; + const char *parse_arg; + + parse_arg = *filename; + r = extract_first_word(&parse_arg, &src, ":", EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + + if (!parse_arg) { + r = strv_consume(&filenames, TAKE_PTR(src)); + if (r < 0) + return r; + + continue; + } + + r = path_extract_filename(parse_arg, &base); + if (r < 0) + return r; + + dst = path_join(tempdir, base); + if (!dst) + return -ENOMEM; + + r = copy_file(src, dst, 0, 0644, COPY_REFLINK); + if (r < 0) + return r; + + r = strv_consume(&filenames, TAKE_PTR(dst)); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(filenames); + return 0; +} + +int verb_verify(int argc, char *argv[], void *userdata) { + _cleanup_strv_free_ char **filenames = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tempdir = NULL; + int r; + + r = mkdtemp_malloc("/tmp/systemd-analyze-XXXXXX", &tempdir); + if (r < 0) + return log_error_errno(r, "Failed to setup working directory: %m"); + + r = process_aliases(argv, tempdir, &filenames); + if (r < 0) + return log_error_errno(r, "Couldn't process aliases: %m"); + + return verify_units(filenames, arg_runtime_scope, arg_man, arg_generators, arg_recursive_errors, arg_root); +} diff --git 
a/src/analyze/analyze-verify.h b/src/analyze/analyze-verify.h new file mode 100644 index 0000000..4892c9a --- /dev/null +++ b/src/analyze/analyze-verify.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_verify(int argc, char *argv[], void *userdata); diff --git a/src/analyze/analyze.c b/src/analyze/analyze.c new file mode 100644 index 0000000..021de65 --- /dev/null +++ b/src/analyze/analyze.c @@ -0,0 +1,691 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Simon Peeters +***/ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "analyze.h" +#include "analyze-blame.h" +#include "analyze-calendar.h" +#include "analyze-capability.h" +#include "analyze-cat-config.h" +#include "analyze-compare-versions.h" +#include "analyze-condition.h" +#include "analyze-critical-chain.h" +#include "analyze-dot.h" +#include "analyze-dump.h" +#include "analyze-exit-status.h" +#include "analyze-fdstore.h" +#include "analyze-filesystems.h" +#include "analyze-inspect-elf.h" +#include "analyze-log-control.h" +#include "analyze-malloc.h" +#include "analyze-pcrs.h" +#include "analyze-plot.h" +#include "analyze-security.h" +#include "analyze-service-watchdogs.h" +#include "analyze-srk.h" +#include "analyze-syscall-filter.h" +#include "analyze-time.h" +#include "analyze-time-data.h" +#include "analyze-timespan.h" +#include "analyze-timestamp.h" +#include "analyze-unit-files.h" +#include "analyze-unit-paths.h" +#include "analyze-verify.h" +#include "analyze-image-policy.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-unit-util.h" +#include "calendarspec.h" +#include "cap-list.h" +#include "capability-util.h" +#include "conf-files.h" +#include "copy.h" +#include "constants.h" +#include "exit-status.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "filesystems.h" 
+#include "format-table.h" +#include "glob-util.h" +#include "hashmap.h" +#include "locale-util.h" +#include "log.h" +#include "main-func.h" +#include "mount-util.h" +#include "nulstr-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "rm-rf.h" +#if HAVE_SECCOMP +# include "seccomp-util.h" +#endif +#include "sort-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "strv.h" +#include "strxcpyx.h" +#include "terminal-util.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "verb-log-control.h" +#include "verbs.h" + +DotMode arg_dot = DEP_ALL; +char **arg_dot_from_patterns = NULL, **arg_dot_to_patterns = NULL; +usec_t arg_fuzz = 0; +PagerFlags arg_pager_flags = 0; +CatFlags arg_cat_flags = 0; +BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +const char *arg_host = NULL; +RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; +RecursiveErrors arg_recursive_errors = _RECURSIVE_ERRORS_INVALID; +bool arg_man = true; +bool arg_generators = false; +char *arg_root = NULL; +static char *arg_image = NULL; +char *arg_security_policy = NULL; +bool arg_offline = false; +unsigned arg_threshold = 100; +unsigned arg_iterations = 1; +usec_t arg_base_time = USEC_INFINITY; +char *arg_unit = NULL; +JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; +bool arg_quiet = false; +char *arg_profile = NULL; +bool arg_legend = true; +bool arg_table = false; +ImagePolicy *arg_image_policy = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_dot_from_patterns, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_dot_to_patterns, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_security_policy, freep); +STATIC_DESTRUCTOR_REGISTER(arg_unit, freep); +STATIC_DESTRUCTOR_REGISTER(arg_profile, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, 
image_policy_freep); + +int acquire_bus(sd_bus **bus, bool *use_full_bus) { + int r; + + if (use_full_bus && *use_full_bus) { + r = bus_connect_transport(arg_transport, arg_host, arg_runtime_scope, bus); + if (IN_SET(r, 0, -EHOSTDOWN)) + return r; + + *use_full_bus = false; + } + + return bus_connect_transport_systemd(arg_transport, arg_host, arg_runtime_scope, bus); +} + +int bus_get_unit_property_strv(sd_bus *bus, const char *path, const char *property, char ***strv) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(path); + assert(property); + assert(strv); + + r = sd_bus_get_property_strv( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + property, + &error, + strv); + if (r < 0) + return log_error_errno(r, "Failed to get unit property %s: %s", property, bus_error_message(&error, r)); + + return 0; +} + +void time_parsing_hint(const char *p, bool calendar, bool timestamp, bool timespan) { + if (calendar && calendar_spec_from_string(p, NULL) >= 0) + log_notice("Hint: this expression is a valid calendar specification. " + "Use 'systemd-analyze calendar \"%s\"' instead?", p); + if (timestamp && parse_timestamp(p, NULL) >= 0) + log_notice("Hint: this expression is a valid timestamp. " + "Use 'systemd-analyze timestamp \"%s\"' instead?", p); + if (timespan && parse_time(p, NULL, USEC_PER_SEC) >= 0) + log_notice("Hint: this expression is a valid timespan. 
" + "Use 'systemd-analyze timespan \"%s\"' instead?", p); +} + +int dump_fd_reply(sd_bus_message *message) { + int fd, r; + + assert(message); + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return bus_log_parse_error(r); + + fflush(stdout); + r = copy_bytes(fd, STDOUT_FILENO, UINT64_MAX, 0); + if (r < 0) + return r; + + return 1; /* Success */ +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL, *dot_link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("systemd-analyze", "1", &link); + if (r < 0) + return log_oom(); + + /* Not using terminal_urlify_man() for this, since we don't want the "man page" text suffix in this case. */ + r = terminal_urlify("man:dot(1)", "dot(1)", &dot_link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND ...\n\n" + "%sProfile systemd, show unit dependencies, check unit files.%s\n" + "\nCommands:\n" + " [time] Print time required to boot the machine\n" + " blame Print list of running units ordered by\n" + " time to init\n" + " critical-chain [UNIT...] Print a tree of the time critical chain\n" + " of units\n" + " plot Output SVG graphic showing service\n" + " initialization\n" + " dot [UNIT...] Output dependency graph in %s format\n" + " dump [PATTERN...] Output state serialization of service\n" + " manager\n" + " cat-config NAME|PATH... Show configuration file and drop-ins\n" + " unit-files List files and symlinks for units\n" + " unit-paths List load directories for units\n" + " exit-status [STATUS...] List exit status definitions\n" + " capability [CAP...] List capability definitions\n" + " syscall-filter [NAME...] List syscalls in seccomp filters\n" + " filesystems [NAME...] List known filesystems\n" + " condition CONDITION... Evaluate conditions and asserts\n" + " compare-versions VERSION1 [OP] VERSION2\n" + " Compare two version strings\n" + " verify FILE... Check unit files for correctness\n" + " calendar SPEC... 
Validate repetitive calendar time\n" + " events\n" + " timestamp TIMESTAMP... Validate a timestamp\n" + " timespan SPAN... Validate a time span\n" + " security [UNIT...] Analyze security of unit\n" + " inspect-elf FILE... Parse and print ELF package metadata\n" + " malloc [D-BUS SERVICE...] Dump malloc stats of a D-Bus service\n" + " fdstore SERVICE... Show file descriptor store contents of service\n" + " image-policy POLICY... Analyze image policy string\n" + " pcrs [PCR...] Show TPM2 PCRs and their names\n" + " srk > FILE Write TPM2 SRK to stdout\n" + "\nOptions:\n" + " --recursive-errors=MODE Control which units are verified\n" + " --offline=BOOL Perform a security review on unit file(s)\n" + " --threshold=N Exit with a non-zero status when overall\n" + " exposure level is over threshold value\n" + " --security-policy=PATH Use custom JSON security policy instead\n" + " of built-in one\n" + " --json=pretty|short|off Generate JSON output of the security\n" + " analysis table, or plot's raw time data\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Disable column headers and hints in plot\n" + " with either --table or --json=\n" + " --system Operate on system systemd instance\n" + " --user Operate on user systemd instance\n" + " --global Operate on global user configuration\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " --order Show only order in the graph\n" + " --require Show only requirement in the graph\n" + " --from-pattern=GLOB Show only origins in the graph\n" + " --to-pattern=GLOB Show only destinations in the graph\n" + " --fuzz=SECONDS Also print services which finished SECONDS\n" + " earlier than the latest in the branch\n" + " --man[=BOOL] Do [not] check for existence of man pages\n" + " --generators[=BOOL] Do [not] run unit generators\n" + " (requires privileges)\n" + " --iterations=N Show the specified number of iterations\n" + " --base-time=TIMESTAMP Calculate 
calendar times relative to\n" + " specified time\n" + " --profile=name|PATH Include the specified profile in the\n" + " security review of the unit(s)\n" + " --table Output plot's raw time data as a table\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -q --quiet Do not emit hints\n" + " --tldr Skip comments and empty lines\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + dot_link, + link); + + /* When updating this list, including descriptions, apply changes to + * shell-completion/bash/systemd-analyze and shell-completion/zsh/_systemd-analyze too. */ + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_ORDER, + ARG_REQUIRE, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_SYSTEM, + ARG_USER, + ARG_GLOBAL, + ARG_DOT_FROM_PATTERN, + ARG_DOT_TO_PATTERN, + ARG_FUZZ, + ARG_NO_PAGER, + ARG_MAN, + ARG_GENERATORS, + ARG_ITERATIONS, + ARG_BASE_TIME, + ARG_RECURSIVE_ERRORS, + ARG_OFFLINE, + ARG_THRESHOLD, + ARG_SECURITY_POLICY, + ARG_JSON, + ARG_PROFILE, + ARG_TABLE, + ARG_NO_LEGEND, + ARG_TLDR, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "quiet", no_argument, NULL, 'q' }, + { "order", no_argument, NULL, ARG_ORDER }, + { "require", no_argument, NULL, ARG_REQUIRE }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "recursive-errors", required_argument, NULL, ARG_RECURSIVE_ERRORS }, + { "offline", required_argument, NULL, ARG_OFFLINE }, + { "threshold", required_argument, NULL, ARG_THRESHOLD }, + { "security-policy", required_argument, 
NULL, ARG_SECURITY_POLICY }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "user", no_argument, NULL, ARG_USER }, + { "global", no_argument, NULL, ARG_GLOBAL }, + { "from-pattern", required_argument, NULL, ARG_DOT_FROM_PATTERN }, + { "to-pattern", required_argument, NULL, ARG_DOT_TO_PATTERN }, + { "fuzz", required_argument, NULL, ARG_FUZZ }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "man", optional_argument, NULL, ARG_MAN }, + { "generators", optional_argument, NULL, ARG_GENERATORS }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "iterations", required_argument, NULL, ARG_ITERATIONS }, + { "base-time", required_argument, NULL, ARG_BASE_TIME }, + { "unit", required_argument, NULL, 'U' }, + { "json", required_argument, NULL, ARG_JSON }, + { "profile", required_argument, NULL, ARG_PROFILE }, + { "table", optional_argument, NULL, ARG_TABLE }, + { "no-legend", optional_argument, NULL, ARG_NO_LEGEND }, + { "tldr", no_argument, NULL, ARG_TLDR }, + {} + }; + + int r, c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hH:M:U:q", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case 'q': + arg_quiet = true; + break; + + case ARG_RECURSIVE_ERRORS: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(recursive_errors, RecursiveErrors, _RECURSIVE_ERRORS_MAX); + return 0; + } + r = recursive_errors_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Unknown mode passed to --recursive-errors='%s'.", optarg); + + arg_recursive_errors = r; + break; + + case ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ true, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) 
+ return r; + break; + + case ARG_SYSTEM: + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + break; + + case ARG_USER: + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + + case ARG_GLOBAL: + arg_runtime_scope = RUNTIME_SCOPE_GLOBAL; + break; + + case ARG_ORDER: + arg_dot = DEP_ORDER; + break; + + case ARG_REQUIRE: + arg_dot = DEP_REQUIRE; + break; + + case ARG_DOT_FROM_PATTERN: + if (strv_extend(&arg_dot_from_patterns, optarg) < 0) + return log_oom(); + + break; + + case ARG_DOT_TO_PATTERN: + if (strv_extend(&arg_dot_to_patterns, optarg) < 0) + return log_oom(); + + break; + + case ARG_FUZZ: + r = parse_sec(optarg, &arg_fuzz); + if (r < 0) + return r; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case ARG_MAN: + r = parse_boolean_argument("--man", optarg, &arg_man); + if (r < 0) + return r; + break; + + case ARG_GENERATORS: + r = parse_boolean_argument("--generators", optarg, &arg_generators); + if (r < 0) + return r; + break; + + case ARG_OFFLINE: + r = parse_boolean_argument("--offline", optarg, &arg_offline); + if (r < 0) + return r; + break; + + case ARG_THRESHOLD: + r = safe_atou(optarg, &arg_threshold); + if (r < 0 || arg_threshold > 100) + return log_error_errno(r < 0 ? 
r : SYNTHETIC_ERRNO(EINVAL), "Failed to parse threshold: %s", optarg); + + break; + + case ARG_SECURITY_POLICY: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_security_policy); + if (r < 0) + return r; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + break; + + case ARG_ITERATIONS: + r = safe_atou(optarg, &arg_iterations); + if (r < 0) + return log_error_errno(r, "Failed to parse iterations: %s", optarg); + break; + + case ARG_BASE_TIME: + r = parse_timestamp(optarg, &arg_base_time); + if (r < 0) + return log_error_errno(r, "Failed to parse --base-time= parameter: %s", optarg); + break; + + case ARG_PROFILE: + if (isempty(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Profile file name is empty"); + + if (is_path(optarg)) { + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_profile); + if (r < 0) + return r; + if (!endswith(arg_profile, ".conf")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Profile file name must end with .conf: %s", arg_profile); + } else { + r = free_and_strdup(&arg_profile, optarg); + if (r < 0) + return log_oom(); + } + + break; + + case 'U': { + _cleanup_free_ char *mangled = NULL; + + r = unit_name_mangle(optarg, UNIT_NAME_MANGLE_WARN, &mangled); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name %s: %m", optarg); + + free_and_replace(arg_unit, mangled); + break; + } + + case ARG_TABLE: + arg_table = true; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_TLDR: + arg_cat_flags = CAT_TLDR; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_offline && !streq_ptr(argv[optind], "security")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --offline= is only supported for security right now."); + + if (arg_json_format_flags != JSON_FORMAT_OFF && !STRPTR_IN_SET(argv[optind], "security", "inspect-elf", "plot", "fdstore", "pcrs")) + 
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --json= is only supported for security, inspect-elf, plot, fdstore, pcrs right now."); + + if (arg_threshold != 100 && !streq_ptr(argv[optind], "security")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --threshold= is only supported for security right now."); + + if (arg_runtime_scope == RUNTIME_SCOPE_GLOBAL && + !STR_IN_SET(argv[optind] ?: "time", "dot", "unit-paths", "verify")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --global only makes sense with verbs dot, unit-paths, verify."); + + if (streq_ptr(argv[optind], "cat-config") && arg_runtime_scope == RUNTIME_SCOPE_USER) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --user is not supported for cat-config right now."); + + if (arg_security_policy && !streq_ptr(argv[optind], "security")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --security-policy= is only supported for security."); + + if ((arg_root || arg_image) && (!STRPTR_IN_SET(argv[optind], "cat-config", "verify", "condition")) && + (!(streq_ptr(argv[optind], "security") && arg_offline))) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Options --root= and --image= are only supported for cat-config, verify, condition and security when used with --offline= right now."); + + /* Having both an image and a root is not supported by the code */ + if (arg_root && arg_image) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + + if (arg_unit && !streq_ptr(argv[optind], "condition")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --unit= is only supported for condition"); + + if (streq_ptr(argv[optind], "condition") && !arg_unit && optind >= argc - 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too few arguments for condition"); + + if (streq_ptr(argv[optind], "condition") && arg_unit && optind < argc - 1) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No conditions can be passed if --unit= is used."); + + if ((!arg_legend && !streq_ptr(argv[optind], "plot")) || + (streq_ptr(argv[optind], "plot") && !arg_legend && !arg_table && FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF))) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --no-legend is only supported for plot with either --table or --json=."); + + if (arg_table && !streq_ptr(argv[optind], "plot")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --table is only supported for plot right now."); + + if (arg_table && !FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--table and --json= are mutually exclusive."); + + return 1; /* work to do */ +} + +static int run(int argc, char *argv[]) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "time", VERB_ANY, 1, VERB_DEFAULT, verb_time }, + { "blame", VERB_ANY, 1, 0, verb_blame }, + { "critical-chain", VERB_ANY, VERB_ANY, 0, verb_critical_chain }, + { "plot", VERB_ANY, 1, 0, verb_plot }, + { "dot", VERB_ANY, VERB_ANY, 0, verb_dot }, + /* ↓ The following seven verbs are deprecated, from here … ↓ */ + { "log-level", VERB_ANY, 2, 0, verb_log_control }, + { "log-target", VERB_ANY, 2, 0, verb_log_control }, + { "set-log-level", 2, 2, 0, verb_log_control }, + { "get-log-level", VERB_ANY, 1, 0, verb_log_control }, + { "set-log-target", 2, 2, 0, verb_log_control }, + { "get-log-target", VERB_ANY, 1, 0, verb_log_control }, + { "service-watchdogs", VERB_ANY, 2, 0, verb_service_watchdogs }, + /* ↑ … until here ↑ */ + { "dump", VERB_ANY, VERB_ANY, 0, verb_dump }, + { "cat-config", 2, VERB_ANY, 0, verb_cat_config }, + { "unit-files", VERB_ANY, VERB_ANY, 0, verb_unit_files }, + { "unit-paths", 1, 1, 0, verb_unit_paths }, + { "exit-status", VERB_ANY, VERB_ANY, 0, 
verb_exit_status }, + { "syscall-filter", VERB_ANY, VERB_ANY, 0, verb_syscall_filters }, + { "capability", VERB_ANY, VERB_ANY, 0, verb_capabilities }, + { "filesystems", VERB_ANY, VERB_ANY, 0, verb_filesystems }, + { "condition", VERB_ANY, VERB_ANY, 0, verb_condition }, + { "compare-versions", 3, 4, 0, verb_compare_versions }, + { "verify", 2, VERB_ANY, 0, verb_verify }, + { "calendar", 2, VERB_ANY, 0, verb_calendar }, + { "timestamp", 2, VERB_ANY, 0, verb_timestamp }, + { "timespan", 2, VERB_ANY, 0, verb_timespan }, + { "security", VERB_ANY, VERB_ANY, 0, verb_security }, + { "inspect-elf", 2, VERB_ANY, 0, verb_elf_inspection }, + { "malloc", VERB_ANY, VERB_ANY, 0, verb_malloc }, + { "fdstore", 2, VERB_ANY, 0, verb_fdstore }, + { "image-policy", 2, 2, 0, verb_image_policy }, + { "pcrs", VERB_ANY, VERB_ANY, 0, verb_pcrs }, + { "srk", VERB_ANY, 1, 0, verb_srk }, + {} + }; + + int r; + + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); /* we want to format/parse floats in C style */ + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + /* Open up and mount the image */ + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_READ_ONLY, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } + + return dispatch_verb(argc, argv, verbs, NULL); +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/analyze/analyze.h b/src/analyze/analyze.h new file mode 100644 index 0000000..8a9528c --- /dev/null +++ b/src/analyze/analyze.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "analyze-verify-util.h" +#include "bus-util.h" +#include "json.h" +#include "pager.h" +#include "pretty-print.h" +#include "time-util.h" +#include "unit-file.h" 
+ +typedef enum DotMode { + DEP_ALL, + DEP_ORDER, + DEP_REQUIRE, +} DotMode; + +extern DotMode arg_dot; +extern char **arg_dot_from_patterns, **arg_dot_to_patterns; +extern usec_t arg_fuzz; +extern PagerFlags arg_pager_flags; +extern CatFlags arg_cat_flags; +extern BusTransport arg_transport; +extern const char *arg_host; +extern RuntimeScope arg_runtime_scope; +extern RecursiveErrors arg_recursive_errors; +extern bool arg_man; +extern bool arg_generators; +extern char *arg_root; +extern char *arg_security_policy; +extern bool arg_offline; +extern unsigned arg_threshold; +extern unsigned arg_iterations; +extern usec_t arg_base_time; +extern char *arg_unit; +extern JsonFormatFlags arg_json_format_flags; +extern bool arg_quiet; +extern char *arg_profile; +extern bool arg_legend; +extern bool arg_table; +extern ImagePolicy *arg_image_policy; + +int acquire_bus(sd_bus **bus, bool *use_full_bus); + +int bus_get_unit_property_strv(sd_bus *bus, const char *path, const char *property, char ***strv); + +void time_parsing_hint(const char *p, bool calendar, bool timestamp, bool timespan); + +int dump_fd_reply(sd_bus_message *message); diff --git a/src/analyze/meson.build b/src/analyze/meson.build new file mode 100644 index 0000000..a505447 --- /dev/null +++ b/src/analyze/meson.build @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_analyze_sources = files( + 'analyze-blame.c', + 'analyze-calendar.c', + 'analyze-capability.c', + 'analyze-cat-config.c', + 'analyze-compare-versions.c', + 'analyze-condition.c', + 'analyze-critical-chain.c', + 'analyze-dot.c', + 'analyze-dump.c', + 'analyze-exit-status.c', + 'analyze-fdstore.c', + 'analyze-filesystems.c', + 'analyze-image-policy.c', + 'analyze-inspect-elf.c', + 'analyze-log-control.c', + 'analyze-malloc.c', + 'analyze-pcrs.c', + 'analyze-plot.c', + 'analyze-security.c', + 'analyze-service-watchdogs.c', + 'analyze-srk.c', + 'analyze-syscall-filter.c', + 'analyze-time.c', + 'analyze-time-data.c', + 
'analyze-timespan.c', + 'analyze-timestamp.c', + 'analyze-unit-files.c', + 'analyze-unit-paths.c', + 'analyze-verify.c', + 'analyze-verify-util.c', + 'analyze.c', +) + +executables += [ + executable_template + { + 'name' : 'systemd-analyze', + 'public' : conf.get('ENABLE_ANALYZE') == 1, + 'sources' : systemd_analyze_sources, + 'include_directories' : core_includes, + 'link_with' : [ + libcore, + libshared, + ], + 'dependencies' : libseccomp, + 'install' : conf.get('ENABLE_ANALYZE') == 1, + }, + core_test_template + { + 'sources' : files( + 'test-verify.c', + 'analyze-verify-util.c', + ), + }, +] diff --git a/src/analyze/test-verify.c b/src/analyze/test-verify.c new file mode 100644 index 0000000..d37e54b --- /dev/null +++ b/src/analyze/test-verify.c @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "analyze-verify-util.h" +#include "tests.h" + +TEST(verify_nonexistent) { + /* Negative cases */ + assert_se(verify_executable(NULL, &(ExecCommand) {.flags = EXEC_COMMAND_IGNORE_FAILURE, .path = (char*) "/non/existent"}, NULL) == 0); + assert_se(verify_executable(NULL, &(ExecCommand) {.path = (char*) "/non/existent"}, NULL) < 0); + + /* Ordinary cases */ + assert_se(verify_executable(NULL, &(ExecCommand) {.path = (char*) "/bin/echo"}, NULL) == 0); + assert_se(verify_executable(NULL, &(ExecCommand) {.flags = EXEC_COMMAND_IGNORE_FAILURE, .path = (char*) "/bin/echo"}, NULL) == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/ask-password/ask-password.c b/src/ask-password/ask-password.c new file mode 100644 index 0000000..bf4c93e --- /dev/null +++ b/src/ask-password/ask-password.c @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "ask-password-api.h" +#include "build.h" +#include "constants.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "parse-argument.h" +#include "pretty-print.h" +#include "strv.h" +#include "terminal-util.h" + +static 
const char *arg_icon = NULL; +static const char *arg_id = NULL; /* identifier for 'ask-password' protocol */ +static const char *arg_key_name = NULL; /* name in kernel keyring */ +static const char *arg_credential_name = NULL; /* name in $CREDENTIALS_DIRECTORY directory */ +static char *arg_message = NULL; +static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC; +static bool arg_multiple = false; +static bool arg_no_output = false; +static AskPasswordFlags arg_flags = ASK_PASSWORD_PUSH_CACHE; +static bool arg_newline = true; + +STATIC_DESTRUCTOR_REGISTER(arg_message, freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-ask-password", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] MESSAGE\n\n" + "%3$sQuery the user for a system passphrase, via the TTY or a UI agent.%4$s\n\n" + " -h --help Show this help\n" + " --icon=NAME Icon name\n" + " --id=ID Query identifier (e.g. \"cryptsetup:/dev/sda5\")\n" + " --keyname=NAME Kernel key name for caching passwords (e.g. 
\"cryptsetup\")\n" + " --credential=NAME\n" + " Credential name for ImportCredential=, LoadCredential= or\n" + " SetCredential= credentials\n" + " --timeout=SEC Timeout in seconds\n" + " --echo=yes|no|masked\n" + " Control whether to show password while typing (echo)\n" + " -e --echo Equivalent to --echo=yes\n" + " --emoji=yes|no|auto\n" + " Show a lock and key emoji\n" + " --no-tty Ask question via agent even on TTY\n" + " --accept-cached Accept cached passwords\n" + " --multiple List multiple passwords if available\n" + " --no-output Do not print password to standard output\n" + " -n Do not suffix password written to standard output with\n" + " newline\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_ICON = 0x100, + ARG_TIMEOUT, + ARG_EMOJI, + ARG_NO_TTY, + ARG_ACCEPT_CACHED, + ARG_MULTIPLE, + ARG_ID, + ARG_KEYNAME, + ARG_NO_OUTPUT, + ARG_VERSION, + ARG_CREDENTIAL, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "icon", required_argument, NULL, ARG_ICON }, + { "timeout", required_argument, NULL, ARG_TIMEOUT }, + { "echo", optional_argument, NULL, 'e' }, + { "emoji", required_argument, NULL, ARG_EMOJI }, + { "no-tty", no_argument, NULL, ARG_NO_TTY }, + { "accept-cached", no_argument, NULL, ARG_ACCEPT_CACHED }, + { "multiple", no_argument, NULL, ARG_MULTIPLE }, + { "id", required_argument, NULL, ARG_ID }, + { "keyname", required_argument, NULL, ARG_KEYNAME }, + { "no-output", no_argument, NULL, ARG_NO_OUTPUT }, + { "credential", required_argument, NULL, ARG_CREDENTIAL }, + {} + }; + + const char *emoji = NULL; + int c, r; + + assert(argc >= 0); + assert(argv); + + /* Note the asymmetry: the long option --echo= allows an optional argument, the short option does + * not. 
*/ + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + while ((c = getopt_long(argc, argv, "+hen", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_ICON: + arg_icon = optarg; + break; + + case ARG_TIMEOUT: + r = parse_sec(optarg, &arg_timeout); + if (r < 0) + return log_error_errno(r, "Failed to parse --timeout= parameter: %s", optarg); + + break; + + case 'e': + if (!optarg) { + /* Short option -e is used, or no argument to long option --echo= */ + arg_flags |= ASK_PASSWORD_ECHO; + arg_flags &= ~ASK_PASSWORD_SILENT; + } else if (isempty(optarg) || streq(optarg, "masked")) + /* Empty argument or explicit string "masked" for default behaviour. */ + arg_flags &= ~(ASK_PASSWORD_ECHO|ASK_PASSWORD_SILENT); + else { + r = parse_boolean_argument("--echo=", optarg, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_flags, ASK_PASSWORD_ECHO, r); + SET_FLAG(arg_flags, ASK_PASSWORD_SILENT, !r); + } + break; + + case ARG_EMOJI: + emoji = optarg; + break; + + case ARG_NO_TTY: + arg_flags |= ASK_PASSWORD_NO_TTY; + break; + + case ARG_ACCEPT_CACHED: + arg_flags |= ASK_PASSWORD_ACCEPT_CACHED; + break; + + case ARG_MULTIPLE: + arg_multiple = true; + break; + + case ARG_ID: + arg_id = optarg; + break; + + case ARG_KEYNAME: + arg_key_name = optarg; + break; + + case ARG_NO_OUTPUT: + arg_no_output = true; + break; + + case ARG_CREDENTIAL: + arg_credential_name = optarg; + break; + + case 'n': + arg_newline = false; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (isempty(emoji) || streq(emoji, "auto")) + SET_FLAG(arg_flags, ASK_PASSWORD_HIDE_EMOJI, FLAGS_SET(arg_flags, ASK_PASSWORD_ECHO)); + else { + r = parse_boolean_argument("--emoji=", emoji, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_flags, ASK_PASSWORD_HIDE_EMOJI, !r); 
+ } + + if (argc > optind) { + arg_message = strv_join(argv + optind, " "); + if (!arg_message) + return log_oom(); + } else if (FLAGS_SET(arg_flags, ASK_PASSWORD_ECHO)) { + /* By default ask_password_auto() will query with the string "Password: ", which is not right + * when full echo is on, since then it's unlikely a password. Let's hence default to a less + * confusing string in that case. */ + + arg_message = strdup("Input:"); + if (!arg_message) + return log_oom(); + } + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_strv_free_erase_ char **l = NULL; + usec_t timeout; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_timeout > 0) + timeout = usec_add(now(CLOCK_MONOTONIC), arg_timeout); + else + timeout = 0; + + r = ask_password_auto(arg_message, arg_icon, arg_id, arg_key_name, arg_credential_name ?: "password", timeout, arg_flags, &l); + if (r < 0) + return log_error_errno(r, "Failed to query password: %m"); + + STRV_FOREACH(p, l) { + if (!arg_no_output) { + if (arg_newline) + puts(*p); + else + fputs(*p, stdout); + } + + fflush(stdout); + + if (!arg_multiple) + break; + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/ask-password/meson.build b/src/ask-password/meson.build new file mode 100644 index 0000000..3197112 --- /dev/null +++ b/src/ask-password/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-ask-password', + 'public' : true, + 'sources' : files('ask-password.c'), + }, +] diff --git a/src/backlight/backlight.c b/src/backlight/backlight.c new file mode 100644 index 0000000..5ac9f90 --- /dev/null +++ b/src/backlight/backlight.c @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "escape.h" 
+#include "fileio.h" +#include "main-func.h" +#include "mkdir.h" +#include "parse-util.h" +#include "percent-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "reboot-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +#define PCI_CLASS_GRAPHICS_CARD 0x30000 + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-backlight", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s save [backlight|leds]:DEVICE\n" + "%s load [backlight|leds]:DEVICE\n" + "\n%sSave and restore backlight brightness at shutdown and boot.%s\n\n" + " save Save current brightness\n" + " load Set brightness to be the previously saved value\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int has_multiple_graphics_cards(void) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + bool found = false; + int r; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "pci", /* match = */ true); + if (r < 0) + return r; + + /* class is an unsigned number, let's validate the value later. */ + r = sd_device_enumerator_add_match_sysattr(e, "class", NULL, /* match = */ true); + if (r < 0) + return r; + + FOREACH_DEVICE(e, dev) { + const char *s; + unsigned long c; + + if (sd_device_get_sysattr_value(dev, "class", &s) < 0) + continue; + + if (safe_atolu(s, &c) < 0) + continue; + + if (c != PCI_CLASS_GRAPHICS_CARD) + continue; + + if (found) + return true; /* This is the second device. */ + + found = true; /* Found the first device. 
*/ + } + + return false; +} + +static int find_pci_or_platform_parent(sd_device *device, sd_device **ret) { + const char *subsystem, *sysname, *value; + sd_device *parent; + int r; + + assert(device); + assert(ret); + + r = sd_device_get_parent(device, &parent); + if (r < 0) + return r; + + r = sd_device_get_subsystem(parent, &subsystem); + if (r < 0) + return r; + + r = sd_device_get_sysname(parent, &sysname); + if (r < 0) + return r; + + if (streq(subsystem, "drm")) { + const char *c; + + c = startswith(sysname, "card"); + if (!c) + return -ENODATA; + + c += strspn(c, DIGITS); + if (*c == '-' && !STARTSWITH_SET(c, "-LVDS-", "-Embedded DisplayPort-", "-eDP-")) + /* A connector DRM device, let's ignore all but LVDS and eDP! */ + return -EOPNOTSUPP; + + } else if (streq(subsystem, "pci") && + sd_device_get_sysattr_value(parent, "class", &value) >= 0) { + unsigned long class; + + r = safe_atolu(value, &class); + if (r < 0) + return log_warning_errno(r, "Cannot parse PCI class '%s' of device %s:%s: %m", + value, subsystem, sysname); + + /* Graphics card */ + if (class == PCI_CLASS_GRAPHICS_CARD) { + *ret = parent; + return 0; + } + + } else if (streq(subsystem, "platform")) { + *ret = parent; + return 0; + } + + return find_pci_or_platform_parent(parent, ret); +} + +static int same_device(sd_device *a, sd_device *b) { + const char *a_val, *b_val; + int r; + + assert(a); + assert(b); + + r = sd_device_get_subsystem(a, &a_val); + if (r < 0) + return r; + + r = sd_device_get_subsystem(b, &b_val); + if (r < 0) + return r; + + if (!streq(a_val, b_val)) + return false; + + r = sd_device_get_sysname(a, &a_val); + if (r < 0) + return r; + + r = sd_device_get_sysname(b, &b_val); + if (r < 0) + return r; + + return streq(a_val, b_val); +} + +static int validate_device(sd_device *device) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *enumerate = NULL; + const char *v, *sysname, *subsystem; + sd_device *parent; + int r; + + assert(device); + + /* Verify whether 
we should actually care for a specific backlight device. For backlight devices + * there might be multiple ways to access the same control: "firmware" (i.e. ACPI), "platform" + * (i.e. via the machine's EC) and "raw" (via the graphics card). In general we should prefer + * "firmware" (i.e. ACPI) or "platform" access over "raw" access, in order not to confuse the + * BIOS/EC, and compatibility with possible low-level hotkey handling of screen brightness. The + * kernel will already make sure to expose only one of "firmware" and "platform" for the same + * device to userspace. However, we still need to make sure that we use "raw" only if no + * "firmware" or "platform" device for the same device exists. */ + + r = sd_device_get_sysname(device, &sysname); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get sysname: %m"); + + r = sd_device_get_subsystem(device, &subsystem); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get subsystem: %m"); + if (!streq(subsystem, "backlight")) + return true; + + r = sd_device_get_sysattr_value(device, "type", &v); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to read 'type' sysattr: %m"); + if (!streq(v, "raw")) + return true; + + r = find_pci_or_platform_parent(device, &parent); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to find PCI or platform parent: %m"); + + r = sd_device_get_subsystem(parent, &subsystem); + if (r < 0) + return log_device_debug_errno(parent, r, "Failed to get subsystem: %m"); + + if (DEBUG_LOGGING) { + const char *s = NULL; + + (void) sd_device_get_syspath(parent, &s); + log_device_debug(device, "Found %s parent device: %s", subsystem, strna(s)); + } + + r = sd_device_enumerator_new(&enumerate); + if (r < 0) + return log_oom_debug(); + + r = sd_device_enumerator_allow_uninitialized(enumerate); + if (r < 0) + return log_debug_errno(r, "Failed to allow uninitialized devices: %m"); + + r = 
sd_device_enumerator_add_match_subsystem(enumerate, "backlight", /* match = */ true); + if (r < 0) + return log_debug_errno(r, "Failed to add subsystem match: %m"); + + r = sd_device_enumerator_add_nomatch_sysname(enumerate, sysname); + if (r < 0) + return log_debug_errno(r, "Failed to add sysname unmatch: %m"); + + r = sd_device_enumerator_add_match_sysattr(enumerate, "type", "platform", /* match = */ true); + if (r < 0) + return log_debug_errno(r, "Failed to add sysattr match: %m"); + + r = sd_device_enumerator_add_match_sysattr(enumerate, "type", "firmware", /* match = */ true); + if (r < 0) + return log_debug_errno(r, "Failed to add sysattr match: %m"); + + if (streq(subsystem, "pci")) { + r = has_multiple_graphics_cards(); + if (r < 0) + return log_debug_errno(r, "Failed to check if the system has multiple graphics cards: %m"); + if (r > 0) { + /* If the system has multiple graphics cards, then we cannot associate platform + * devices on non-PCI bus (especially WMI bus) with PCI devices. Let's ignore all + * backlight devices that do not have the same parent PCI device. */ + log_debug("Found multiple graphics cards on PCI bus. " + "Skipping to associate platform backlight devices on non-PCI bus."); + + r = sd_device_enumerator_add_match_parent(enumerate, parent); + if (r < 0) + return log_debug_errno(r, "Failed to add parent match: %m"); + } + } + + FOREACH_DEVICE(enumerate, other) { + const char *other_subsystem; + sd_device *other_parent; + + /* OK, so there's another backlight device, and it's a platform or firmware device. + * Let's see if we can verify it belongs to the same device as ours. */ + r = find_pci_or_platform_parent(other, &other_parent); + if (r < 0) { + log_device_debug_errno(other, r, "Failed to get PCI or platform parent, ignoring: %m"); + continue; + } + + if (same_device(parent, other_parent) > 0) { + /* Both have the same PCI parent, that means we are out. 
*/ + if (DEBUG_LOGGING) { + const char *other_sysname = NULL, *other_type = NULL; + + (void) sd_device_get_sysname(other, &other_sysname); + (void) sd_device_get_sysattr_value(other, "type", &other_type); + log_device_debug(device, + "Found another %s backlight device %s on the same PCI, skipping.", + strna(other_type), strna(other_sysname)); + } + return false; + } + + r = sd_device_get_subsystem(other_parent, &other_subsystem); + if (r < 0) { + log_device_debug_errno(other_parent, r, "Failed to get subsystem, ignoring: %m"); + continue; + } + + if (streq(other_subsystem, "platform") && streq(subsystem, "pci")) { + /* The other is connected to the platform bus and we are a PCI device, that also means we are out. */ + if (DEBUG_LOGGING) { + const char *other_sysname = NULL, *other_type = NULL; + + (void) sd_device_get_sysname(other, &other_sysname); + (void) sd_device_get_sysattr_value(other, "type", &other_type); + log_device_debug(device, + "Found another %s backlight device %s, which has higher precedence, skipping.", + strna(other_type), strna(other_sysname)); + } + return false; + } + } + + return true; +} + +static int get_max_brightness(sd_device *device, unsigned *ret) { + const char *s; + int r; + + assert(device); + assert(ret); + + r = sd_device_get_sysattr_value(device, "max_brightness", &s); + if (r < 0) + return log_device_warning_errno(device, r, "Failed to read 'max_brightness' attribute: %m"); + + r = safe_atou(s, ret); + if (r < 0) + return log_device_warning_errno(device, r, "Failed to parse 'max_brightness' \"%s\": %m", s); + + return 0; +} + +static int clamp_brightness( + sd_device *device, + unsigned percent, + bool saved, + unsigned max_brightness, + unsigned *brightness) { + + unsigned new_brightness, min_brightness; + const char *subsystem; + int r; + + assert(device); + assert(brightness); + + /* Some systems turn the backlight all the way off at the lowest levels. 
This clamps the saved + * brightness to at least 1 or 5% of max_brightness in case of 'backlight' subsystem. This + * avoids preserving an unreadably dim screen, which would otherwise force the user to disable + * state restoration. */ + + r = sd_device_get_subsystem(device, &subsystem); + if (r < 0) + return log_device_warning_errno(device, r, "Failed to get device subsystem: %m"); + + if (streq(subsystem, "backlight")) + min_brightness = MAX(1U, (unsigned) ((double) max_brightness * percent / 100)); + else + min_brightness = 0; + + new_brightness = CLAMP(*brightness, min_brightness, max_brightness); + if (new_brightness != *brightness) + log_device_info(device, "%s brightness %u is %s to %u.", + saved ? "Saved" : "Current", + *brightness, + new_brightness > *brightness ? + "too low; increasing" : "too high; decreasing", + new_brightness); + + *brightness = new_brightness; + return 0; +} + +static bool shall_clamp(sd_device *d, unsigned *ret) { + const char *s; + int r; + + assert(d); + assert(ret); + + r = sd_device_get_property_value(d, "ID_BACKLIGHT_CLAMP", &s); + if (r < 0) { + if (r != -ENOENT) + log_device_debug_errno(d, r, "Failed to get ID_BACKLIGHT_CLAMP property, ignoring: %m"); + *ret = 5; /* defaults to 5% */ + return true; + } + + r = parse_boolean(s); + if (r >= 0) { + *ret = r ? 
5 : 0; + return r; + } + + r = parse_percent(s); + if (r < 0) { + log_device_debug_errno(d, r, "Failed to parse ID_BACKLIGHT_CLAMP property, ignoring: %m"); + *ret = 5; + return true; + } + + *ret = r; + return true; +} + +static int read_brightness(sd_device *device, unsigned max_brightness, unsigned *ret_brightness) { + const char *subsystem, *value; + unsigned brightness; + int r; + + assert(device); + assert(ret_brightness); + + r = sd_device_get_subsystem(device, &subsystem); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get subsystem: %m"); + + if (streq(subsystem, "backlight")) { + r = sd_device_get_sysattr_value(device, "actual_brightness", &value); + if (r == -ENOENT) { + log_device_debug_errno(device, r, "Failed to read 'actual_brightness' attribute, " + "fall back to use 'brightness' attribute: %m"); + goto use_brightness; + } + if (r < 0) + return log_device_debug_errno(device, r, "Failed to read 'actual_brightness' attribute: %m"); + + r = safe_atou(value, &brightness); + if (r < 0) { + log_device_debug_errno(device, r, "Failed to parse 'actual_brightness' attribute, " + "fall back to use 'brightness' attribute: %s", value); + goto use_brightness; + } + + if (brightness > max_brightness) { + log_device_debug(device, "actual_brightness=%u is larger than max_brightness=%u, " + "fall back to use 'brightness' attribute", brightness, max_brightness); + goto use_brightness; + } + + log_device_debug(device, "Current actual_brightness is %u", brightness); + *ret_brightness = brightness; + return 0; + } + +use_brightness: + r = sd_device_get_sysattr_value(device, "brightness", &value); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to read 'brightness' attribute: %m"); + + r = safe_atou(value, &brightness); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to parse 'brightness' attribute: %s", value); + + if (brightness > max_brightness) + return log_device_debug_errno(device, SYNTHETIC_ERRNO(EINVAL), + 
"brightness=%u is larger than max_brightness=%u", + brightness, max_brightness); + + log_device_debug(device, "Current brightness is %u", brightness); + *ret_brightness = brightness; + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _cleanup_free_ char *escaped_ss = NULL, *escaped_sysname = NULL, *escaped_path_id = NULL; + const char *sysname, *path_id, *ss, *saved; + unsigned max_brightness, brightness; + int r; + + log_setup(); + + if (argv_looks_like_help(argc, argv)) + return help(); + + if (argc != 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program requires two arguments."); + + if (!STR_IN_SET(argv[1], "load", "save")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown verb %s.", argv[1]); + + umask(0022); + + r = mkdir_p("/var/lib/systemd/backlight", 0755); + if (r < 0) + return log_error_errno(r, "Failed to create backlight directory /var/lib/systemd/backlight: %m"); + + sysname = strchr(argv[2], ':'); + if (!sysname) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Requires a subsystem and sysname pair specifying a backlight device."); + + ss = strndupa_safe(argv[2], sysname - argv[2]); + + sysname++; + + if (!STR_IN_SET(ss, "backlight", "leds")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Not a backlight or LED device: '%s:%s'", ss, sysname); + + r = sd_device_new_from_subsystem_sysname(&device, ss, sysname); + if (r < 0) { + bool ignore = r == -ENODEV; + + /* Some drivers, e.g. for AMD GPU, removes acpi backlight device soon after it is added. + * See issue #21997. */ + log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r, + "Failed to get backlight or LED device '%s:%s'%s: %m", + ss, sysname, ignore ? ", ignoring" : ""); + return ignore ? 0 : r; + } + + /* If max_brightness is 0, then there is no actual backlight device. This happens on desktops + * with Asus mainboards that load the eeepc-wmi module. 
*/ + if (get_max_brightness(device, &max_brightness) < 0) + return 0; + + if (max_brightness == 0) { + log_device_warning(device, "Maximum brightness is 0, ignoring device."); + return 0; + } + + log_device_debug(device, "Maximum brightness is %u", max_brightness); + + escaped_ss = cescape(ss); + if (!escaped_ss) + return log_oom(); + + escaped_sysname = cescape(sysname); + if (!escaped_sysname) + return log_oom(); + + if (sd_device_get_property_value(device, "ID_PATH", &path_id) >= 0) { + escaped_path_id = cescape(path_id); + if (!escaped_path_id) + return log_oom(); + + saved = strjoina("/var/lib/systemd/backlight/", escaped_path_id, ":", escaped_ss, ":", escaped_sysname); + } else + saved = strjoina("/var/lib/systemd/backlight/", escaped_ss, ":", escaped_sysname); + + /* If there are multiple conflicting backlight devices, then their probing at boot-time might + * happen in any order. This means the validity checking of the device then is not reliable, + * since it might not see other devices conflicting with a specific backlight. To deal with + * this, we will actively delete backlight state files at shutdown (where device probing should + * be complete), so that the validity check at boot time doesn't have to be reliable. 
*/ + + if (streq(argv[1], "load")) { + _cleanup_free_ char *value = NULL; + unsigned percent; + bool clamp; + + if (!shall_restore_state()) + return 0; + + if (validate_device(device) == 0) + return 0; + + clamp = shall_clamp(device, &percent); + + r = read_one_line_file(saved, &value); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to read %s: %m", saved); + if (r > 0) { + r = safe_atou(value, &brightness); + if (r < 0) { + log_warning_errno(r, "Failed to parse saved brightness '%s', removing %s.", + value, saved); + (void) unlink(saved); + } else { + log_debug("Using saved brightness %u.", brightness); + if (clamp) + (void) clamp_brightness(device, percent, /* saved = */ true, max_brightness, &brightness); + + /* Do not fall back to read current brightness below. */ + r = 1; + } + } + if (r <= 0) { + /* Fallback to clamping current brightness or exit early if clamping is not + * supported/enabled. */ + if (!clamp) + return 0; + + r = read_brightness(device, max_brightness, &brightness); + if (r < 0) + return log_device_error_errno(device, r, "Failed to read current brightness: %m"); + + (void) clamp_brightness(device, percent, /* saved = */ false, max_brightness, &brightness); + } + + r = sd_device_set_sysattr_valuef(device, "brightness", "%u", brightness); + if (r < 0) + return log_device_error_errno(device, r, "Failed to write system 'brightness' attribute: %m"); + + } else if (streq(argv[1], "save")) { + if (validate_device(device) == 0) { + (void) unlink(saved); + return 0; + } + + r = read_brightness(device, max_brightness, &brightness); + if (r < 0) + return log_device_error_errno(device, r, "Failed to read current brightness: %m"); + + r = write_string_filef(saved, WRITE_STRING_FILE_CREATE, "%u", brightness); + if (r < 0) + return log_device_error_errno(device, r, "Failed to write %s: %m", saved); + + } else + assert_not_reached(); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/backlight/meson.build 
b/src/backlight/meson.build new file mode 100644 index 0000000..ed4c55e --- /dev/null +++ b/src/backlight/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-backlight', + 'conditions' : ['ENABLE_BACKLIGHT'], + 'sources' : files('backlight.c'), + }, +] diff --git a/src/basic/MurmurHash2.c b/src/basic/MurmurHash2.c new file mode 100644 index 0000000..43a89a0 --- /dev/null +++ b/src/basic/MurmurHash2.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: LicenseRef-murmurhash2-public-domain */ +//----------------------------------------------------------------------------- +// MurmurHash2 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - This code makes a few assumptions about how your machine behaves - + +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 + +// And it has a few limitations - + +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +#include "MurmurHash2.h" + +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. 
+ + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while (len >= 4) + { + uint32_t k = *(uint32_t*)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; /* fall through */ + case 2: h ^= data[1] << 8; /* fall through */ + case 1: h ^= data[0]; /* fall through */ + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} diff --git a/src/basic/MurmurHash2.h b/src/basic/MurmurHash2.h new file mode 100644 index 0000000..5758b86 --- /dev/null +++ b/src/basic/MurmurHash2.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LicenseRef-murmurhash2-public-domain */ +//----------------------------------------------------------------------------- +// MurmurHash2 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+ +#pragma once + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed ); + +//----------------------------------------------------------------------------- diff --git a/src/basic/af-list.c b/src/basic/af-list.c new file mode 100644 index 0000000..a9ab891 --- /dev/null +++ b/src/basic/af-list.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "af-list.h" +#include "macro.h" + +static const struct af_name* lookup_af(register const char *str, register GPERF_LEN_TYPE len); + +#include "af-from-name.h" +#include "af-to-name.h" + +const char *af_to_name(int id) { + + if (id <= 0) + return NULL; + + if ((size_t) id >= ELEMENTSOF(af_names)) + return NULL; + + return af_names[id]; +} + +int af_from_name(const char *name) { + const struct af_name *sc; + + assert(name); + + sc = lookup_af(name, strlen(name)); + if (!sc) + return -EINVAL; + + return sc->id; +} + +int af_max(void) { + return ELEMENTSOF(af_names); +} + +const char *af_to_ipv4_ipv6(int id) { + /* Pretty often we want to map the address family to the typically used protocol name for IPv4 + + * IPv6. Let's add special helpers for that. */ + return id == AF_INET ? "ipv4" : + id == AF_INET6 ? "ipv6" : NULL; +} + +int af_from_ipv4_ipv6(const char *af) { + return streq_ptr(af, "ipv4") ? AF_INET : + streq_ptr(af, "ipv6") ? 
AF_INET6 : AF_UNSPEC; +} diff --git a/src/basic/af-list.h b/src/basic/af-list.h new file mode 100644 index 0000000..9592b9e --- /dev/null +++ b/src/basic/af-list.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "string-util.h" + +const char *af_to_name(int id); +int af_from_name(const char *name); + +static inline const char* af_to_name_short(int id) { + const char *f; + + if (id == AF_UNSPEC) + return "*"; + + f = af_to_name(id); + if (!f) + return "unknown"; + + assert(startswith(f, "AF_")); + return f + 3; +} + +const char* af_to_ipv4_ipv6(int id); +int af_from_ipv4_ipv6(const char *af); + +int af_max(void); diff --git a/src/basic/af-to-name.awk b/src/basic/af-to-name.awk new file mode 100644 index 0000000..b9cfbb7 --- /dev/null +++ b/src/basic/af-to-name.awk @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "static const char* const af_names[] = { " +} +!/AF_FILE/ && !/AF_ROUTE/ && !/AF_LOCAL/ { + printf " [%s] = \"%s\",\n", $1, $1 +} +END{ + print "};" +} diff --git a/src/basic/alloc-util.c b/src/basic/alloc-util.c new file mode 100644 index 0000000..fc98610 --- /dev/null +++ b/src/basic/alloc-util.c @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "macro.h" +#include "memory-util.h" + +void* memdup(const void *p, size_t l) { + void *ret; + + assert(l == 0 || p); + + ret = malloc(l ?: 1); + if (!ret) + return NULL; + + return memcpy_safe(ret, p, l); +} + +void* memdup_suffix0(const void *p, size_t l) { + void *ret; + + assert(l == 0 || p); + + /* The same as memdup() but place a safety NUL byte after the allocated memory */ + + if (_unlikely_(l == SIZE_MAX)) /* prevent overflow */ + return NULL; + + ret = malloc(l + 1); + if (!ret) + return NULL; + + ((uint8_t*) ret)[l] = 0; + return memcpy_safe(ret, p, l); +} + +void* greedy_realloc( + void **p, + size_t need, + size_t size) { + + 
size_t a, newalloc; + void *q; + + assert(p); + + /* We use malloc_usable_size() for determining the current allocated size. On all systems we care + * about this should be safe to rely on. Should there ever arise the need to avoid relying on this we + * can instead locally fall back to realloc() on every call, rounded up to the next exponent of 2 or + * so. */ + + if (*p && (size == 0 || (MALLOC_SIZEOF_SAFE(*p) / size >= need))) + return *p; + + if (_unlikely_(need > SIZE_MAX/2)) /* Overflow check */ + return NULL; + newalloc = need * 2; + + if (size_multiply_overflow(newalloc, size)) + return NULL; + a = newalloc * size; + + if (a < 64) /* Allocate at least 64 bytes */ + a = 64; + + q = realloc(*p, a); + if (!q) + return NULL; + + return *p = q; +} + +void* greedy_realloc0( + void **p, + size_t need, + size_t size) { + + size_t before, after; + uint8_t *q; + + assert(p); + + before = MALLOC_SIZEOF_SAFE(*p); /* malloc_usable_size() will return 0 on NULL input, as per docs */ + + q = greedy_realloc(p, need, size); + if (!q) + return NULL; + + after = MALLOC_SIZEOF_SAFE(q); + + if (size == 0) /* avoid division by zero */ + before = 0; + else + before = (before / size) * size; /* Round down */ + + if (after > before) + memzero(q + before, after - before); + + return q; +} + +void* greedy_realloc_append( + void **p, + size_t *n_p, + const void *from, + size_t n_from, + size_t size) { + + uint8_t *q; + + assert(p); + assert(n_p); + assert(from || n_from == 0); + + if (n_from > SIZE_MAX - *n_p) + return NULL; + + q = greedy_realloc(p, *n_p + n_from, size); + if (!q) + return NULL; + + memcpy_safe(q + *n_p * size, from, n_from * size); + + *n_p += n_from; + + return q; +} + +void *expand_to_usable(void *ptr, size_t newsize _unused_) { + return ptr; +} diff --git a/src/basic/alloc-util.h b/src/basic/alloc-util.h new file mode 100644 index 0000000..136d2b3 --- /dev/null +++ b/src/basic/alloc-util.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ 
+#pragma once + +#include +#include +#include +#include +#include + +#include "macro.h" + +#if HAS_FEATURE_MEMORY_SANITIZER +# include +#endif + +typedef void (*free_func_t)(void *p); +typedef void* (*mfree_func_t)(void *p); + +/* If for some reason more than 4M are allocated on the stack, let's abort immediately. It's better than + * proceeding and smashing the stack limits. Note that by default RLIMIT_STACK is 8M on Linux. */ +#define ALLOCA_MAX (4U*1024U*1024U) + +#define new(t, n) ((t*) malloc_multiply((n), sizeof(t))) + +#define new0(t, n) ((t*) calloc((n) ?: 1, sizeof(t))) + +#define alloca_safe(n) \ + ({ \ + size_t _nn_ = n; \ + assert(_nn_ <= ALLOCA_MAX); \ + alloca(_nn_ == 0 ? 1 : _nn_); \ + }) \ + +#define newa(t, n) \ + ({ \ + size_t _n_ = n; \ + assert(!size_multiply_overflow(sizeof(t), _n_)); \ + (t*) alloca_safe(sizeof(t)*_n_); \ + }) + +#define newa0(t, n) \ + ({ \ + size_t _n_ = n; \ + assert(!size_multiply_overflow(sizeof(t), _n_)); \ + (t*) alloca0((sizeof(t)*_n_)); \ + }) + +#define newdup(t, p, n) ((t*) memdup_multiply(p, (n), sizeof(t))) + +#define newdup_suffix0(t, p, n) ((t*) memdup_suffix0_multiply(p, (n), sizeof(t))) + +#define malloc0(n) (calloc(1, (n) ?: 1)) + +#define free_and_replace_full(a, b, free_func) \ + ({ \ + typeof(a)* _a = &(a); \ + typeof(b)* _b = &(b); \ + free_func(*_a); \ + *_a = *_b; \ + *_b = NULL; \ + 0; \ + }) + +#define free_and_replace(a, b) \ + free_and_replace_full(a, b, free) + +/* This is similar to free_and_replace_full(), but NULL is not assigned to 'b', and its reference counter is + * increased. 
*/ +#define unref_and_replace_full(a, b, ref_func, unref_func) \ + ({ \ + typeof(a)* _a = &(a); \ + typeof(b) _b = ref_func(b); \ + unref_func(*_a); \ + *_a = _b; \ + 0; \ + }) + +void* memdup(const void *p, size_t l) _alloc_(2); +void* memdup_suffix0(const void *p, size_t l); /* We can't use _alloc_() here, since we return a buffer one byte larger than the specified size */ + +#define memdupa(p, l) \ + ({ \ + void *_q_; \ + size_t _l_ = l; \ + _q_ = alloca_safe(_l_); \ + memcpy_safe(_q_, p, _l_); \ + }) + +#define memdupa_suffix0(p, l) \ + ({ \ + void *_q_; \ + size_t _l_ = l; \ + _q_ = alloca_safe(_l_ + 1); \ + ((uint8_t*) _q_)[_l_] = 0; \ + memcpy_safe(_q_, p, _l_); \ + }) + +static inline void unsetp(void *p) { + /* A trivial "destructor" that can be used in cases where we want to + * unset a pointer from a _cleanup_ function. */ + + *(void**)p = NULL; +} + +static inline void freep(void *p) { + *(void**)p = mfree(*(void**) p); +} + +#define _cleanup_free_ _cleanup_(freep) + +static inline bool size_multiply_overflow(size_t size, size_t need) { + return _unlikely_(need != 0 && size > (SIZE_MAX / need)); +} + +_malloc_ _alloc_(1, 2) static inline void *malloc_multiply(size_t need, size_t size) { + if (size_multiply_overflow(size, need)) + return NULL; + + return malloc(size * need ?: 1); +} + +#if !HAVE_REALLOCARRAY +_alloc_(2, 3) static inline void *reallocarray(void *p, size_t need, size_t size) { + if (size_multiply_overflow(size, need)) + return NULL; + + return realloc(p, size * need ?: 1); +} +#endif + +_alloc_(2, 3) static inline void *memdup_multiply(const void *p, size_t need, size_t size) { + if (size_multiply_overflow(size, need)) + return NULL; + + return memdup(p, size * need); +} + +/* Note that we can't decorate this function with _alloc_() since the returned memory area is one byte larger + * than the product of its parameters. 
*/ +static inline void *memdup_suffix0_multiply(const void *p, size_t need, size_t size) { + if (size_multiply_overflow(size, need)) + return NULL; + + return memdup_suffix0(p, size * need); +} + +void* greedy_realloc(void **p, size_t need, size_t size); +void* greedy_realloc0(void **p, size_t need, size_t size); +void* greedy_realloc_append(void **p, size_t *n_p, const void *from, size_t n_from, size_t size); + +#define GREEDY_REALLOC(array, need) \ + greedy_realloc((void**) &(array), (need), sizeof((array)[0])) + +#define GREEDY_REALLOC0(array, need) \ + greedy_realloc0((void**) &(array), (need), sizeof((array)[0])) + +#define GREEDY_REALLOC_APPEND(array, n_array, from, n_from) \ + greedy_realloc_append((void**) &(array), (size_t*) &(n_array), (from), (n_from), sizeof((array)[0])) + +#define alloca0(n) \ + ({ \ + char *_new_; \ + size_t _len_ = n; \ + _new_ = alloca_safe(_len_); \ + memset(_new_, 0, _len_); \ + }) + +/* It's not clear what alignment glibc/gcc alloca() guarantee, hence provide a guaranteed safe version */ +#define alloca_align(size, align) \ + ({ \ + void *_ptr_; \ + size_t _mask_ = (align) - 1; \ + size_t _size_ = size; \ + _ptr_ = alloca_safe(_size_ + _mask_); \ + (void*)(((uintptr_t)_ptr_ + _mask_) & ~_mask_); \ + }) + +#define alloca0_align(size, align) \ + ({ \ + void *_new_; \ + size_t _xsize_ = (size); \ + _new_ = alloca_align(_xsize_, (align)); \ + memset(_new_, 0, _xsize_); \ + }) + +#if HAS_FEATURE_MEMORY_SANITIZER +# define msan_unpoison(r, s) __msan_unpoison(r, s) +#else +# define msan_unpoison(r, s) +#endif + +/* Dummy allocator to tell the compiler that the new size of p is newsize. The implementation returns the + * pointer as is; the only reason for its existence is as a conduit for the _alloc_ attribute. This must not + * be inlined (hence a non-static function with _noinline_ because LTO otherwise tries to inline it) because + * gcc then loses the attributes on the function. 
+ * See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503 */ +void *expand_to_usable(void *p, size_t newsize) _alloc_(2) _returns_nonnull_ _noinline_; + +static inline size_t malloc_sizeof_safe(void **xp) { + if (_unlikely_(!xp || !*xp)) + return 0; + + size_t sz = malloc_usable_size(*xp); + *xp = expand_to_usable(*xp, sz); + /* GCC doesn't see the _returns_nonnull_ when built with ubsan, so yet another hint to make it doubly + * clear that expand_to_usable won't return NULL. + * See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79265 */ + if (!*xp) + assert_not_reached(); + return sz; +} + +/* This returns the number of usable bytes in a malloc()ed region as per malloc_usable_size(), which may + * return a value larger than the size that was actually allocated. Access to that additional memory is + * discouraged because it violates the C standard; a compiler cannot see that this is valid. To help the + * compiler out, the MALLOC_SIZEOF_SAFE macro 'allocates' the usable size using a dummy allocator function + * expand_to_usable. There is a possibility of malloc_usable_size() returning different values during the + * lifetime of an object, which may cause problems, but the glibc allocator does not do that at the moment.
*/ +#define MALLOC_SIZEOF_SAFE(x) \ + malloc_sizeof_safe((void**) &__builtin_choose_expr(__builtin_constant_p(x), (void*) { NULL }, (x))) + +/* Inspired by ELEMENTSOF() but operates on malloc()'ed memory areas: typesafely returns the number of items + * that fit into the specified memory block */ +#define MALLOC_ELEMENTSOF(x) \ + (__builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), typeof(&*(x))), \ + MALLOC_SIZEOF_SAFE(x)/sizeof((x)[0]), \ + VOID_0)) + +/* These are like strdupa()/strndupa(), but honour ALLOCA_MAX */ +#define strdupa_safe(s) \ + ({ \ + const char *_t = (s); \ + (char*) memdupa_suffix0(_t, strlen(_t)); \ + }) + +#define strndupa_safe(s, n) \ + ({ \ + const char *_t = (s); \ + (char*) memdupa_suffix0(_t, strnlen(_t, (n))); \ + }) + +/* Free every element of the array. */ +static inline void free_many(void **p, size_t n) { + assert(p || n == 0); + + FOREACH_ARRAY(i, p, n) + *i = mfree(*i); +} + +/* Typesafe wrapper for char** rather than void**. Unfortunately C won't implicitly cast this. 
*/ +static inline void free_many_charp(char **c, size_t n) { + free_many((void**) c, n); +} + +_alloc_(2) static inline void *realloc0(void *p, size_t new_size) { + size_t old_size; + void *q; + + /* Like realloc(), but initializes anything appended to zero */ + + old_size = MALLOC_SIZEOF_SAFE(p); + + q = realloc(p, new_size); + if (!q) + return NULL; + + new_size = MALLOC_SIZEOF_SAFE(q); /* Update with actually allocated space */ + + if (new_size > old_size) + memset((uint8_t*) q + old_size, 0, new_size - old_size); + + return q; +} + +#include "memory-util.h" diff --git a/src/basic/architecture.c b/src/basic/architecture.c new file mode 100644 index 0000000..488367c --- /dev/null +++ b/src/basic/architecture.c @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "architecture.h" +#include "macro.h" +#include "string-table.h" +#include "string-util.h" + +Architecture uname_architecture(void) { + + /* Return a sanitized enum identifying the architecture we are running on. This + * is based on uname(), and the user may hence control what this returns by using + * personality(). This puts the user in control on systems that can run binaries + * of multiple architectures. + * + * We do not translate the string returned by uname() 1:1. Instead we try to + * clean it up and break down the confusion on x86 and arm in particular. + * + * We try to distinguish CPUs, not CPU features, i.e. actual architectures that + * have genuinely different code. 
*/ + + static const struct { + const char *machine; + Architecture arch; + } arch_map[] = { +#if defined(__aarch64__) || defined(__arm__) + { "aarch64", ARCHITECTURE_ARM64 }, + { "aarch64_be", ARCHITECTURE_ARM64_BE }, + { "armv8l", ARCHITECTURE_ARM }, + { "armv8b", ARCHITECTURE_ARM_BE }, + { "armv7ml", ARCHITECTURE_ARM }, + { "armv7mb", ARCHITECTURE_ARM_BE }, + { "armv7l", ARCHITECTURE_ARM }, + { "armv7b", ARCHITECTURE_ARM_BE }, + { "armv6l", ARCHITECTURE_ARM }, + { "armv6b", ARCHITECTURE_ARM_BE }, + { "armv5tl", ARCHITECTURE_ARM }, + { "armv5tel", ARCHITECTURE_ARM }, + { "armv5tejl", ARCHITECTURE_ARM }, + { "armv5tejb", ARCHITECTURE_ARM_BE }, + { "armv5teb", ARCHITECTURE_ARM_BE }, + { "armv5tb", ARCHITECTURE_ARM_BE }, + { "armv4tl", ARCHITECTURE_ARM }, + { "armv4tb", ARCHITECTURE_ARM_BE }, + { "armv4l", ARCHITECTURE_ARM }, + { "armv4b", ARCHITECTURE_ARM_BE }, + +#elif defined(__alpha__) + { "alpha" , ARCHITECTURE_ALPHA }, + +#elif defined(__arc__) + { "arc", ARCHITECTURE_ARC }, + { "arceb", ARCHITECTURE_ARC_BE }, + +#elif defined(__cris__) + { "crisv32", ARCHITECTURE_CRIS }, + +#elif defined(__i386__) || defined(__x86_64__) + { "x86_64", ARCHITECTURE_X86_64 }, + { "i686", ARCHITECTURE_X86 }, + { "i586", ARCHITECTURE_X86 }, + { "i486", ARCHITECTURE_X86 }, + { "i386", ARCHITECTURE_X86 }, + +#elif defined(__ia64__) + { "ia64", ARCHITECTURE_IA64 }, + +#elif defined(__hppa__) || defined(__hppa64__) + { "parisc64", ARCHITECTURE_PARISC64 }, + { "parisc", ARCHITECTURE_PARISC }, + +#elif defined(__loongarch_lp64) + { "loongarch64", ARCHITECTURE_LOONGARCH64 }, + +#elif defined(__m68k__) + { "m68k", ARCHITECTURE_M68K }, + +#elif defined(__mips__) || defined(__mips64__) + { "mips64", ARCHITECTURE_MIPS64 }, + { "mips", ARCHITECTURE_MIPS }, + +#elif defined(__nios2__) + { "nios2", ARCHITECTURE_NIOS2 }, + +#elif defined(__powerpc__) || defined(__powerpc64__) + { "ppc64le", ARCHITECTURE_PPC64_LE }, + { "ppc64", ARCHITECTURE_PPC64 }, + { "ppcle", ARCHITECTURE_PPC_LE }, + { "ppc", 
ARCHITECTURE_PPC }, + +#elif defined(__riscv) + { "riscv64", ARCHITECTURE_RISCV64 }, + { "riscv32", ARCHITECTURE_RISCV32 }, +# if __SIZEOF_POINTER__ == 4 + { "riscv", ARCHITECTURE_RISCV32 }, +# elif __SIZEOF_POINTER__ == 8 + { "riscv", ARCHITECTURE_RISCV64 }, +# endif + +#elif defined(__s390__) || defined(__s390x__) + { "s390x", ARCHITECTURE_S390X }, + { "s390", ARCHITECTURE_S390 }, + +#elif defined(__sh__) || defined(__sh64__) + { "sh5", ARCHITECTURE_SH64 }, + { "sh4a", ARCHITECTURE_SH }, + { "sh4", ARCHITECTURE_SH }, + { "sh3", ARCHITECTURE_SH }, + { "sh2a", ARCHITECTURE_SH }, + { "sh2", ARCHITECTURE_SH }, + +#elif defined(__sparc__) + { "sparc64", ARCHITECTURE_SPARC64 }, + { "sparc", ARCHITECTURE_SPARC }, + +#elif defined(__tilegx__) + { "tilegx", ARCHITECTURE_TILEGX }, + +#else +# error "Please register your architecture here!" +#endif + }; + + static Architecture cached = _ARCHITECTURE_INVALID; + struct utsname u; + + if (cached != _ARCHITECTURE_INVALID) + return cached; + + assert_se(uname(&u) >= 0); + + for (size_t i = 0; i < ELEMENTSOF(arch_map); i++) + if (streq(arch_map[i].machine, u.machine)) + return cached = arch_map[i].arch; + + assert_not_reached(); + return _ARCHITECTURE_INVALID; +} + +/* Maintain same order as in the table above. 
*/ +static const char *const architecture_table[_ARCHITECTURE_MAX] = { + [ARCHITECTURE_ARM64] = "arm64", + [ARCHITECTURE_ARM64_BE] = "arm64-be", + [ARCHITECTURE_ARM] = "arm", + [ARCHITECTURE_ARM_BE] = "arm-be", + [ARCHITECTURE_ALPHA] = "alpha", + [ARCHITECTURE_ARC] = "arc", + [ARCHITECTURE_ARC_BE] = "arc-be", + [ARCHITECTURE_CRIS] = "cris", + [ARCHITECTURE_X86_64] = "x86-64", + [ARCHITECTURE_X86] = "x86", + [ARCHITECTURE_IA64] = "ia64", + [ARCHITECTURE_LOONGARCH64] = "loongarch64", + [ARCHITECTURE_M68K] = "m68k", + [ARCHITECTURE_MIPS64_LE] = "mips64-le", + [ARCHITECTURE_MIPS64] = "mips64", + [ARCHITECTURE_MIPS_LE] = "mips-le", + [ARCHITECTURE_MIPS] = "mips", + [ARCHITECTURE_NIOS2] = "nios2", + [ARCHITECTURE_PARISC64] = "parisc64", + [ARCHITECTURE_PARISC] = "parisc", + [ARCHITECTURE_PPC64_LE] = "ppc64-le", + [ARCHITECTURE_PPC64] = "ppc64", + [ARCHITECTURE_PPC] = "ppc", + [ARCHITECTURE_PPC_LE] = "ppc-le", + [ARCHITECTURE_RISCV32] = "riscv32", + [ARCHITECTURE_RISCV64] = "riscv64", + [ARCHITECTURE_S390X] = "s390x", + [ARCHITECTURE_S390] = "s390", + [ARCHITECTURE_SH64] = "sh64", + [ARCHITECTURE_SH] = "sh", + [ARCHITECTURE_SPARC64] = "sparc64", + [ARCHITECTURE_SPARC] = "sparc", + [ARCHITECTURE_TILEGX] = "tilegx", +}; + +DEFINE_STRING_TABLE_LOOKUP(architecture, Architecture); diff --git a/src/basic/architecture.h b/src/basic/architecture.h new file mode 100644 index 0000000..788f3ab --- /dev/null +++ b/src/basic/architecture.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +/* A cleaned up architecture definition. We don't want to get lost in + * processor features, models, generations or even ABIs. Hence we + * focus on general family, and distinguish word width and endianness. 
*/ + +typedef enum { + ARCHITECTURE_ALPHA, + ARCHITECTURE_ARC, + ARCHITECTURE_ARC_BE, + ARCHITECTURE_ARM, + ARCHITECTURE_ARM64, + ARCHITECTURE_ARM64_BE, + ARCHITECTURE_ARM_BE, + ARCHITECTURE_CRIS, + ARCHITECTURE_IA64, + ARCHITECTURE_LOONGARCH64, + ARCHITECTURE_M68K, + ARCHITECTURE_MIPS, + ARCHITECTURE_MIPS64, + ARCHITECTURE_MIPS64_LE, + ARCHITECTURE_MIPS_LE, + ARCHITECTURE_NIOS2, + ARCHITECTURE_PARISC, + ARCHITECTURE_PARISC64, + ARCHITECTURE_PPC, + ARCHITECTURE_PPC64, + ARCHITECTURE_PPC64_LE, + ARCHITECTURE_PPC_LE, + ARCHITECTURE_RISCV32, + ARCHITECTURE_RISCV64, + ARCHITECTURE_S390, + ARCHITECTURE_S390X, + ARCHITECTURE_SH, + ARCHITECTURE_SH64, + ARCHITECTURE_SPARC, + ARCHITECTURE_SPARC64, + ARCHITECTURE_TILEGX, + ARCHITECTURE_X86, + ARCHITECTURE_X86_64, + _ARCHITECTURE_MAX, + _ARCHITECTURE_INVALID = -EINVAL, +} Architecture; + +Architecture uname_architecture(void); + +/* + * LIB_ARCH_TUPLE should resolve to the local library path + * architecture tuple systemd is built for, according to the Debian + * tuple list: + * + * https://wiki.debian.org/Multiarch/Tuples + * + * This is used in library search paths that should understand + * Debian's paths on all distributions. 
+ */ + +#if defined(__x86_64__) +# define native_architecture() ARCHITECTURE_X86_64 +# if defined(__ILP32__) +# define LIB_ARCH_TUPLE "x86_64-linux-gnux32" +# else +# define LIB_ARCH_TUPLE "x86_64-linux-gnu" +# endif +# define ARCHITECTURE_SECONDARY ARCHITECTURE_X86 +#elif defined(__i386__) +# define native_architecture() ARCHITECTURE_X86 +# define LIB_ARCH_TUPLE "i386-linux-gnu" +#elif defined(__powerpc64__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_PPC64 +# define LIB_ARCH_TUPLE "ppc64-linux-gnu" +# define ARCHITECTURE_SECONDARY ARCHITECTURE_PPC +# else +# define native_architecture() ARCHITECTURE_PPC64_LE +# define LIB_ARCH_TUPLE "powerpc64le-linux-gnu" +# define ARCHITECTURE_SECONDARY ARCHITECTURE_PPC_LE +# endif +#elif defined(__powerpc__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_PPC +# if defined(__NO_FPRS__) +# define LIB_ARCH_TUPLE "powerpc-linux-gnuspe" +# else +# define LIB_ARCH_TUPLE "powerpc-linux-gnu" +# endif +# else +# define native_architecture() ARCHITECTURE_PPC_LE +# error "Missing LIB_ARCH_TUPLE for PPCLE" +# endif +#elif defined(__ia64__) +# define native_architecture() ARCHITECTURE_IA64 +# define LIB_ARCH_TUPLE "ia64-linux-gnu" +#elif defined(__hppa64__) +# define native_architecture() ARCHITECTURE_PARISC64 +# error "Missing LIB_ARCH_TUPLE for HPPA64" +#elif defined(__hppa__) +# define native_architecture() ARCHITECTURE_PARISC +# define LIB_ARCH_TUPLE "hppa-linux-gnu" +#elif defined(__s390x__) +# define native_architecture() ARCHITECTURE_S390X +# define LIB_ARCH_TUPLE "s390x-linux-gnu" +# define ARCHITECTURE_SECONDARY ARCHITECTURE_S390 +#elif defined(__s390__) +# define native_architecture() ARCHITECTURE_S390 +# define LIB_ARCH_TUPLE "s390-linux-gnu" +#elif defined(__sparc__) && defined (__arch64__) +# define native_architecture() ARCHITECTURE_SPARC64 +# define LIB_ARCH_TUPLE "sparc64-linux-gnu" +#elif defined(__sparc__) +# define native_architecture()
ARCHITECTURE_SPARC +# define LIB_ARCH_TUPLE "sparc-linux-gnu" +#elif defined(__mips64) && defined(__LP64__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_MIPS64 +# define LIB_ARCH_TUPLE "mips64-linux-gnuabi64" +# else +# define native_architecture() ARCHITECTURE_MIPS64_LE +# define LIB_ARCH_TUPLE "mips64el-linux-gnuabi64" +# endif +#elif defined(__mips64) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_MIPS64 +# define LIB_ARCH_TUPLE "mips64-linux-gnuabin32" +# else +# define native_architecture() ARCHITECTURE_MIPS64_LE +# define LIB_ARCH_TUPLE "mips64el-linux-gnuabin32" +# endif +#elif defined(__mips__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_MIPS +# define LIB_ARCH_TUPLE "mips-linux-gnu" +# else +# define native_architecture() ARCHITECTURE_MIPS_LE +# define LIB_ARCH_TUPLE "mipsel-linux-gnu" +# endif +#elif defined(__alpha__) +# define native_architecture() ARCHITECTURE_ALPHA +# define LIB_ARCH_TUPLE "alpha-linux-gnu" +#elif defined(__aarch64__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_ARM64_BE +# define LIB_ARCH_TUPLE "aarch64_be-linux-gnu" +# else +# define native_architecture() ARCHITECTURE_ARM64 +# define LIB_ARCH_TUPLE "aarch64-linux-gnu" +# define ARCHITECTURE_SECONDARY ARCHITECTURE_ARM +# endif +#elif defined(__arm__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_ARM_BE +# if defined(__ARM_EABI__) +# if defined(__ARM_PCS_VFP) +# define LIB_ARCH_TUPLE "armeb-linux-gnueabihf" +# else +# define LIB_ARCH_TUPLE "armeb-linux-gnueabi" +# endif +# else +# define LIB_ARCH_TUPLE "armeb-linux-gnu" +# endif +# else +# define native_architecture() ARCHITECTURE_ARM +# if defined(__ARM_EABI__) +# if defined(__ARM_PCS_VFP) +# define LIB_ARCH_TUPLE "arm-linux-gnueabihf" +# else +# define LIB_ARCH_TUPLE "arm-linux-gnueabi" +# endif +# else +# define LIB_ARCH_TUPLE "arm-linux-gnu" +# endif +# endif 
+#elif defined(__sh64__) +# define native_architecture() ARCHITECTURE_SH64 +# error "Missing LIB_ARCH_TUPLE for SH64" +#elif defined(__sh__) +# define native_architecture() ARCHITECTURE_SH +# if defined(__SH1__) +# define LIB_ARCH_TUPLE "sh1-linux-gnu" +# elif defined(__SH2__) +# define LIB_ARCH_TUPLE "sh2-linux-gnu" +# elif defined(__SH2A__) +# define LIB_ARCH_TUPLE "sh2a-linux-gnu" +# elif defined(__SH2E__) +# define LIB_ARCH_TUPLE "sh2e-linux-gnu" +# elif defined(__SH3__) +# define LIB_ARCH_TUPLE "sh3-linux-gnu" +# elif defined(__SH3E__) +# define LIB_ARCH_TUPLE "sh3e-linux-gnu" +# elif defined(__SH4__) && !defined(__SH4A__) +# define LIB_ARCH_TUPLE "sh4-linux-gnu" +# elif defined(__SH4A__) +# define LIB_ARCH_TUPLE "sh4a-linux-gnu" +# endif +#elif defined(__loongarch_lp64) +# define native_architecture() ARCHITECTURE_LOONGARCH64 +# if defined(__loongarch_double_float) +# define LIB_ARCH_TUPLE "loongarch64-linux-gnu" +# elif defined(__loongarch_single_float) +# define LIB_ARCH_TUPLE "loongarch64-linux-gnuf32" +# elif defined(__loongarch_soft_float) +# define LIB_ARCH_TUPLE "loongarch64-linux-gnusf" +# else +# error "Unrecognized loongarch architecture variant" +# endif +#elif defined(__m68k__) +# define native_architecture() ARCHITECTURE_M68K +# define LIB_ARCH_TUPLE "m68k-linux-gnu" +#elif defined(__tilegx__) +# define native_architecture() ARCHITECTURE_TILEGX +# define LIB_ARCH_TUPLE "tilegx-linux-gnu" +#elif defined(__cris__) +# define native_architecture() ARCHITECTURE_CRIS +# error "Missing LIB_ARCH_TUPLE for CRIS" +#elif defined(__nios2__) +# define native_architecture() ARCHITECTURE_NIOS2 +# define LIB_ARCH_TUPLE "nios2-linux-gnu" +#elif defined(__riscv) +# if __SIZEOF_POINTER__ == 4 +# define native_architecture() ARCHITECTURE_RISCV32 +# define LIB_ARCH_TUPLE "riscv32-linux-gnu" +# elif __SIZEOF_POINTER__ == 8 +# define native_architecture() ARCHITECTURE_RISCV64 +# define LIB_ARCH_TUPLE "riscv64-linux-gnu" +# else +# error "Unrecognized riscv architecture 
variant" +# endif +#elif defined(__arc__) +# if __BYTE_ORDER == __BIG_ENDIAN +# define native_architecture() ARCHITECTURE_ARC_BE +# define LIB_ARCH_TUPLE "arceb-linux" +# else +# define native_architecture() ARCHITECTURE_ARC +# define LIB_ARCH_TUPLE "arc-linux" +# endif +#else +# error "Please register your architecture here!" +#endif + +const char *architecture_to_string(Architecture a) _const_; +Architecture architecture_from_string(const char *s) _pure_; diff --git a/src/basic/argv-util.c b/src/basic/argv-util.c new file mode 100644 index 0000000..a2bcc44 --- /dev/null +++ b/src/basic/argv-util.c @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "argv-util.h" +#include "capability-util.h" +#include "errno-util.h" +#include "missing_sched.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" + +int saved_argc = 0; +char **saved_argv = NULL; + +bool invoked_as(char *argv[], const char *token) { + if (!argv || isempty(argv[0])) + return false; + + if (isempty(token)) + return false; + + return strstr(last_path_component(argv[0]), token); +} + +bool invoked_by_systemd(void) { + int r; + + /* If the process is directly executed by PID1 (e.g. ExecStart= or generator), systemd-importd, + * or systemd-homed, then $SYSTEMD_EXEC_PID= is set, and read the command line. */ + const char *e = getenv("SYSTEMD_EXEC_PID"); + if (!e) + return false; + + if (streq(e, "*")) + /* For testing. */ + return true; + + pid_t p; + r = parse_pid(e, &p); + if (r < 0) { + /* We know that systemd sets the variable correctly. Something else must have set it. */ + log_debug_errno(r, "Failed to parse \"SYSTEMD_EXEC_PID=%s\", ignoring: %m", e); + return false; + } + + return getpid_cached() == p; +} + +bool argv_looks_like_help(int argc, char **argv) { + char **l; + + /* Scans the command line for indications the user asks for help. 
This is supposed to be called by + * tools that do not implement getopt() style command line parsing because they are not primarily + * user-facing. Detects four ways of asking for help: + * + * 1. Passing zero arguments + * 2. Passing "help" as first argument + * 3. Passing --help as any argument + * 4. Passing -h as any argument + */ + + if (argc <= 1) + return true; + + if (streq_ptr(argv[1], "help")) + return true; + + l = strv_skip(argv, 1); + + return strv_contains(l, "--help") || + strv_contains(l, "-h"); +} + +static int update_argv(const char name[], size_t l) { + static int can_do = -1; + int r; + + assert(name); + assert(l < SIZE_MAX); + + if (can_do == 0) + return 0; + can_do = false; /* We'll set it to true only if the whole process works */ + + /* Calling prctl() with PR_SET_MM_ARG_{START,END} requires CAP_SYS_RESOURCE so let's use this as quick bypass + * check, to avoid calling mmap() should PR_SET_MM_ARG_{START,END} fail with EPERM later on anyway. */ + r = have_effective_cap(CAP_SYS_RESOURCE); + if (r < 0) + return log_debug_errno(r, "Failed to check if we have enough privileges: %m"); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), + "Skipping PR_SET_MM, as we don't have privileges."); + + static size_t mm_size = 0; + static char *mm = NULL; + + if (mm_size < l+1) { + size_t nn_size; + char *nn; + + nn_size = PAGE_ALIGN(l+1); + if (nn_size >= SIZE_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "The requested argument is too long."); + + nn = mmap(NULL, nn_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (nn == MAP_FAILED) + return log_debug_errno(errno, "mmap() failed: %m"); + + strncpy(nn, name, nn_size); + + /* Now, let's tell the kernel about this new memory */ + if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long) nn, 0, 0) < 0) { + if (ERRNO_IS_PRIVILEGE(errno)) + return log_debug_errno(errno, "PR_SET_MM_ARG_START failed: %m"); + + /* HACK: prctl() API is kind of dumb on this point. 
The existing end address may already be + * below the desired start address, in which case the kernel may have kicked this back due + * to a range-check failure (see linux/kernel/sys.c:validate_prctl_map() to see this in + * action). The proper solution would be to have a prctl() API that could set both start+end + * simultaneously, or at least let us query the existing address to anticipate this condition + * and respond accordingly. For now, we can only guess at the cause of this failure and try + * a workaround--which will briefly expand the arg space to something potentially huge before + * resizing it to what we want. */ + log_debug_errno(errno, "PR_SET_MM_ARG_START failed, attempting PR_SET_MM_ARG_END hack: %m"); + + if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) nn + l + 1, 0, 0) < 0) { + r = log_debug_errno(errno, "PR_SET_MM_ARG_END hack failed, proceeding without: %m"); + (void) munmap(nn, nn_size); + return r; + } + + if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long) nn, 0, 0) < 0) + return log_debug_errno(errno, "PR_SET_MM_ARG_START still failed, proceeding without: %m"); + } else { + /* And update the end pointer to the new end, too. If this fails, we don't really know what + * to do, it's pretty unlikely that we can rollback, hence we'll just accept the failure, + * and continue. */ + if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) nn + l + 1, 0, 0) < 0) + log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m"); + } + + if (mm) + (void) munmap(mm, mm_size); + + mm = nn; + mm_size = nn_size; + } else { + strncpy(mm, name, mm_size); + + /* Update the end pointer, continuing regardless of any failure. 
*/ + if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long) mm + l + 1, 0, 0) < 0) + log_debug_errno(errno, "PR_SET_MM_ARG_END failed, proceeding without: %m"); + } + + can_do = true; + return 0; +} + +int rename_process(const char name[]) { + bool truncated = false; + + /* This is like a poor man's setproctitle(). It changes the comm field, argv[0], and also the glibc's + * internally used name of the process. For the first one a limit of 16 chars applies; to the second one in + * many cases one of 10 (i.e. length of "/sbin/init") — however if we have CAP_SYS_RESOURCE it is unbounded; + * to the third one 7 (i.e. the length of "systemd"). If you pass a longer string it will likely be + * truncated. + * + * Returns 0 if a name was set but truncated, > 0 if it was set but not truncated. */ + + if (isempty(name)) + return -EINVAL; /* let's not confuse users unnecessarily with an empty name */ + + if (!is_main_thread()) + return -EPERM; /* Let's not allow setting the process name from other threads than the main one, as we + * cache things without locking, and we make assumptions that PR_SET_NAME sets the + * process name that isn't correct on any other threads */ + + size_t l = strlen(name); + + /* First step, change the comm field. The main thread's comm is identical to the process comm. This means we + * can use PR_SET_NAME, which sets the thread name for the calling thread. */ + if (prctl(PR_SET_NAME, name) < 0) + log_debug_errno(errno, "PR_SET_NAME failed: %m"); + if (l >= TASK_COMM_LEN) /* Linux userspace process names can be 15 chars at max */ + truncated = true; + + /* Second step, change glibc's ID of the process name. */ + if (program_invocation_name) { + size_t k; + + k = strlen(program_invocation_name); + strncpy(program_invocation_name, name, k); + if (l > k) + truncated = true; + + /* Also update the short name. */ + char *p = strrchr(program_invocation_name, '/'); + program_invocation_short_name = p ?
p + 1 : program_invocation_name; + } + + /* Third step, completely replace the argv[] array the kernel maintains for us. This requires privileges, but + * has the advantage that the argv[] array is exactly what we want it to be, and not filled up with zeros at + * the end. This is the best option for changing /proc/self/cmdline. */ + (void) update_argv(name, l); + + /* Fourth step: in all cases we'll also update the original argv[], so that our own code gets it right too if + * it still looks here */ + if (saved_argc > 0) { + if (saved_argv[0]) { + size_t k; + + k = strlen(saved_argv[0]); + strncpy(saved_argv[0], name, k); + if (l > k) + truncated = true; + } + + for (int i = 1; i < saved_argc; i++) { + if (!saved_argv[i]) + break; + + memzero(saved_argv[i], strlen(saved_argv[i])); + } + } + + return !truncated; +} diff --git a/src/basic/argv-util.h b/src/basic/argv-util.h new file mode 100644 index 0000000..a20a951 --- /dev/null +++ b/src/basic/argv-util.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +extern int saved_argc; +extern char **saved_argv; + +static inline void save_argc_argv(int argc, char **argv) { + /* Protect against CVE-2021-4034 style attacks */ + assert_se(argc > 0); + assert_se(argv); + assert_se(argv[0]); + + saved_argc = argc; + saved_argv = argv; +} + +bool invoked_as(char *argv[], const char *token); +bool invoked_by_systemd(void); +bool argv_looks_like_help(int argc, char **argv); + +int rename_process(const char name[]); diff --git a/src/basic/arphrd-to-name.awk b/src/basic/arphrd-to-name.awk new file mode 100644 index 0000000..302504b --- /dev/null +++ b/src/basic/arphrd-to-name.awk @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "const char *arphrd_to_name(int id) {" + print " switch (id) {" +} +!/^HDLC$/ { + printf " case ARPHRD_%s: return \"%s\";\n", $1, $1 +} +END{ + print " default: return NULL;" + print " }" + print "}" +} diff 
--git a/src/basic/arphrd-util.c b/src/basic/arphrd-util.c new file mode 100644 index 0000000..3ea2c9d --- /dev/null +++ b/src/basic/arphrd-util.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "arphrd-util.h" +#include "macro.h" + +static const struct arphrd_name* lookup_arphrd(register const char *str, register GPERF_LEN_TYPE len); + +#include "arphrd-from-name.h" +#include "arphrd-to-name.h" + +int arphrd_from_name(const char *name) { + const struct arphrd_name *sc; + + assert(name); + + sc = lookup_arphrd(name, strlen(name)); + if (!sc) + return -EINVAL; + + return sc->id; +} + +size_t arphrd_to_hw_addr_len(uint16_t arphrd) { + switch (arphrd) { + case ARPHRD_ETHER: + return ETH_ALEN; + case ARPHRD_INFINIBAND: + return INFINIBAND_ALEN; + case ARPHRD_TUNNEL: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + return sizeof(struct in_addr); + case ARPHRD_TUNNEL6: + case ARPHRD_IP6GRE: + return sizeof(struct in6_addr); + default: + return 0; + } +} diff --git a/src/basic/arphrd-util.h b/src/basic/arphrd-util.h new file mode 100644 index 0000000..33f5694 --- /dev/null +++ b/src/basic/arphrd-util.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +const char *arphrd_to_name(int id); +int arphrd_from_name(const char *name); + +size_t arphrd_to_hw_addr_len(uint16_t arphrd); diff --git a/src/basic/audit-util.c b/src/basic/audit-util.c new file mode 100644 index 0000000..bf96e08 --- /dev/null +++ b/src/basic/audit-util.c @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "audit-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "iovec-util.h" +#include "macro.h" +#include "parse-util.h" +#include "process-util.h" +#include "socket-util.h" +#include "user-util.h" + +int audit_session_from_pid(pid_t pid, uint32_t *id) { + 
_cleanup_free_ char *s = NULL; + const char *p; + uint32_t u; + int r; + + assert(id); + + /* We don't convert ENOENT to ESRCH here, since we can't + * really distinguish between "audit is not available in the + * kernel" and "the process does not exist", both which will + * result in ENOENT. */ + + p = procfs_file_alloca(pid, "sessionid"); + + r = read_one_line_file(p, &s); + if (r < 0) + return r; + + r = safe_atou32(s, &u); + if (r < 0) + return r; + + if (!audit_session_is_valid(u)) + return -ENODATA; + + *id = u; + return 0; +} + +int audit_loginuid_from_pid(pid_t pid, uid_t *uid) { + _cleanup_free_ char *s = NULL; + const char *p; + uid_t u; + int r; + + assert(uid); + + p = procfs_file_alloca(pid, "loginuid"); + + r = read_one_line_file(p, &s); + if (r < 0) + return r; + + r = parse_uid(s, &u); + if (r == -ENXIO) /* the UID was -1 */ + return -ENODATA; + if (r < 0) + return r; + + *uid = u; + return 0; +} + +static int try_audit_request(int fd) { + struct iovec iov; + struct msghdr mh; + ssize_t n; + + assert(fd >= 0); + + struct { + struct nlmsghdr hdr; + struct nlmsgerr err; + } _packed_ msg = { + .hdr.nlmsg_len = NLMSG_LENGTH(0), + .hdr.nlmsg_type = AUDIT_GET_FEATURE, + .hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + }; + iov = IOVEC_MAKE(&msg, msg.hdr.nlmsg_len); + mh = (struct msghdr) { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + if (sendmsg(fd, &mh, MSG_NOSIGNAL) < 0) + return -errno; + + iov.iov_len = sizeof(msg); + + n = recvmsg_safe(fd, &mh, 0); + if (n < 0) + return -errno; + if (n != NLMSG_LENGTH(sizeof(struct nlmsgerr))) + return -EIO; + + if (msg.hdr.nlmsg_type != NLMSG_ERROR) + return -EINVAL; + + return msg.err.error; +} + +bool use_audit(void) { + static int cached_use = -1; + int r; + + if (cached_use < 0) { + int fd; + + fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_AUDIT); + if (fd < 0) { + cached_use = !IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT, EPERM); + if (!cached_use) + log_debug_errno(errno, "Won't talk to 
audit: %m"); + } else { + /* If we try and use the audit fd but get -ECONNREFUSED, it is because + * we are not in the initial user namespace, and the kernel does not + * have support for audit outside of the initial user namespace + * (see https://elixir.bootlin.com/linux/latest/C/ident/audit_netlink_ok). + * + * If we receive any other error, do not disable audit because we are not + * sure that the error indicates that audit will not work in general. */ + r = try_audit_request(fd); + if (r < 0) { + cached_use = r != -ECONNREFUSED; + log_debug_errno(r, cached_use ? + "Failed to make request on audit fd, ignoring: %m" : + "Won't talk to audit: %m"); + } else + cached_use = true; + + safe_close(fd); + } + } + + return cached_use; +} diff --git a/src/basic/audit-util.h b/src/basic/audit-util.h new file mode 100644 index 0000000..964082b --- /dev/null +++ b/src/basic/audit-util.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#define AUDIT_SESSION_INVALID UINT32_MAX + +int audit_session_from_pid(pid_t pid, uint32_t *id); +int audit_loginuid_from_pid(pid_t pid, uid_t *uid); + +bool use_audit(void); + +static inline bool audit_session_is_valid(uint32_t id) { + return id > 0 && id != AUDIT_SESSION_INVALID; +} diff --git a/src/basic/bitfield.h b/src/basic/bitfield.h new file mode 100644 index 0000000..25bc0eb --- /dev/null +++ b/src/basic/bitfield.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +/* Bit index (0-based) to mask of specified type. Assertion failure if index is out of range. 
*/ +#define _INDEX_TO_MASK(type, i, uniq) \ + ({ \ + int UNIQ_T(_i, uniq) = (i); \ + assert(UNIQ_T(_i, uniq) < (int)sizeof(type) * 8); \ + ((type)1) << UNIQ_T(_i, uniq); \ + }) +#define INDEX_TO_MASK(type, i) \ + ({ \ + assert_cc(sizeof(type) <= sizeof(unsigned long long)); \ + assert_cc(__builtin_choose_expr(__builtin_constant_p(i), i, 0) < (int)(sizeof(type) * 8)); \ + __builtin_choose_expr(__builtin_constant_p(i), \ + ((type)1) << (i), \ + _INDEX_TO_MASK(type, i, UNIQ)); \ + }) + +/* Builds a mask of specified type with multiple bits set. Note the result will not be constant, even if all + * indexes are constant. */ +#define INDEXES_TO_MASK(type, ...) \ + UNIQ_INDEXES_TO_MASK(type, UNIQ, ##__VA_ARGS__) +#define UNIQ_INDEXES_TO_MASK(type, uniq, ...) \ + ({ \ + typeof(type) UNIQ_T(_mask, uniq) = (type)0; \ + int UNIQ_T(_i, uniq); \ + VA_ARGS_FOREACH(UNIQ_T(_i, uniq), ##__VA_ARGS__) \ + UNIQ_T(_mask, uniq) |= INDEX_TO_MASK(type, UNIQ_T(_i, uniq)); \ + UNIQ_T(_mask, uniq); \ + }) + +/* Same as the FLAG macros, but accept a 0-based bit index instead of a mask. Results in assertion failure if + * index is out of range for the type. */ +#define SET_BIT(bits, i) SET_FLAG(bits, INDEX_TO_MASK(typeof(bits), i), true) +#define CLEAR_BIT(bits, i) SET_FLAG(bits, INDEX_TO_MASK(typeof(bits), i), false) +#define BIT_SET(bits, i) FLAGS_SET(bits, INDEX_TO_MASK(typeof(bits), i)) + +/* As above, but accepts multiple indexes. Note the result will not be constant, even if all indexes are + * constant. */ +#define SET_BITS(bits, ...) SET_FLAG(bits, INDEXES_TO_MASK(typeof(bits), ##__VA_ARGS__), true) +#define CLEAR_BITS(bits, ...) SET_FLAG(bits, INDEXES_TO_MASK(typeof(bits), ##__VA_ARGS__), false) +#define BITS_SET(bits, ...) FLAGS_SET(bits, INDEXES_TO_MASK(typeof(bits), ##__VA_ARGS__)) + +/* Iterate through each set bit. Index is 0-based and type int. 
*/ +#define BIT_FOREACH(index, bits) _BIT_FOREACH(index, bits, UNIQ) +#define _BIT_FOREACH(index, bits, uniq) \ + for (int UNIQ_T(_last, uniq) = -1, index; \ + (index = BIT_NEXT_SET(bits, UNIQ_T(_last, uniq))) >= 0; \ + UNIQ_T(_last, uniq) = index) + +/* Find the next set bit after 0-based index 'prev'. Result is 0-based index of next set bit, or -1 if no + * more bits are set. */ +#define BIT_FIRST_SET(bits) BIT_NEXT_SET(bits, -1) +#define BIT_NEXT_SET(bits, prev) \ + UNIQ_BIT_NEXT_SET(bits, prev, UNIQ) +#define UNIQ_BIT_NEXT_SET(bits, prev, uniq) \ + ({ \ + typeof(bits) UNIQ_T(_bits, uniq) = (bits); \ + int UNIQ_T(_prev, uniq) = (prev); \ + int UNIQ_T(_next, uniq); \ + _BIT_NEXT_SET(UNIQ_T(_bits, uniq), \ + UNIQ_T(_prev, uniq), \ + UNIQ_T(_next, uniq)); \ + }) +#define _BIT_NEXT_SET(bits, prev, next) \ + ((int)(prev + 1) == (int)sizeof(bits) * 8 \ + ? -1 /* Prev index was msb. */ \ + : ((next = __builtin_ffsll(((unsigned long long)(bits)) >> (prev + 1))) == 0 \ + ? -1 /* No more bits set. 
*/ \ + : prev + next)) diff --git a/src/basic/btrfs.c b/src/basic/btrfs.c new file mode 100644 index 0000000..bb07374 --- /dev/null +++ b/src/basic/btrfs.c @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "btrfs.h" +#include "fd-util.h" +#include "fs-util.h" +#include "path-util.h" + +int btrfs_validate_subvolume_name(const char *name) { + + if (!filename_is_valid(name)) + return -EINVAL; + + if (strlen(name) > BTRFS_SUBVOL_NAME_MAX) + return -E2BIG; + + return 0; +} + +static int extract_subvolume_name(const char *path, char **ret) { + _cleanup_free_ char *fn = NULL; + int r; + + assert(path); + assert(ret); + + r = path_extract_filename(path, &fn); + if (r < 0) + return r; + + r = btrfs_validate_subvolume_name(fn); + if (r < 0) + return r; + + *ret = TAKE_PTR(fn); + return 0; +} + +int btrfs_subvol_make(int dir_fd, const char *path) { + struct btrfs_ioctl_vol_args args = {}; + _cleanup_free_ char *subvolume = NULL, *parent = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(!isempty(path)); + + r = extract_subvolume_name(path, &subvolume); + if (r < 0) + return r; + + r = path_extract_directory(path, &parent); + if (r < 0) { + if (r != -EDESTADDRREQ) /* Propagate error, unless only a filename was specified, which is OK */ + return r; + + dir_fd = fd_reopen_condition(dir_fd, O_CLOEXEC, O_PATH, &fd); /* drop O_PATH if it is set */ + if (dir_fd < 0) + return dir_fd; + } else { + fd = openat(dir_fd, parent, O_DIRECTORY|O_RDONLY|O_CLOEXEC, 0); + if (fd < 0) + return -errno; + + dir_fd = fd; + } + + strncpy(args.name, subvolume, sizeof(args.name)-1); + + return RET_NERRNO(ioctl(dir_fd, BTRFS_IOC_SUBVOL_CREATE, &args)); +} + +int btrfs_subvol_make_fallback(int dir_fd, const char *path, mode_t mode) { + mode_t old, combined; + int r; + + assert(path); + + /* Let's work like mkdir(), i.e. take the specified mode, and mask it with the current umask. 
*/ + old = umask(~mode); + combined = old | ~mode; + if (combined != ~mode) + umask(combined); + r = btrfs_subvol_make(dir_fd, path); + umask(old); + + if (r >= 0) + return 1; /* subvol worked */ + if (!ERRNO_IS_NOT_SUPPORTED(r)) + return r; + + if (mkdirat(dir_fd, path, mode) < 0) + return -errno; + + return 0; /* plain directory */ +} diff --git a/src/basic/btrfs.h b/src/basic/btrfs.h new file mode 100644 index 0000000..38be9d2 --- /dev/null +++ b/src/basic/btrfs.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +int btrfs_validate_subvolume_name(const char *name); + +int btrfs_subvol_make(int dir_fd, const char *path); + +int btrfs_subvol_make_fallback(int dir_fd, const char *path, mode_t mode); diff --git a/src/basic/build.c b/src/basic/build.c new file mode 100644 index 0000000..c587ada --- /dev/null +++ b/src/basic/build.c @@ -0,0 +1,283 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "build.h" +#include "extract-word.h" +#include "macro.h" +#include "string-util.h" +#include "terminal-util.h" +#include "version.h" + +const char* const systemd_features = + + /* PAM and MAC frameworks */ + +#if HAVE_PAM + "+PAM" +#else + "-PAM" +#endif + +#if HAVE_AUDIT + " +AUDIT" +#else + " -AUDIT" +#endif + +#if HAVE_SELINUX + " +SELINUX" +#else + " -SELINUX" +#endif + +#if HAVE_APPARMOR + " +APPARMOR" +#else + " -APPARMOR" +#endif + +#if ENABLE_IMA + " +IMA" +#else + " -IMA" +#endif + +#if ENABLE_SMACK + " +SMACK" +#else + " -SMACK" +#endif + +#if HAVE_SECCOMP + " +SECCOMP" +#else + " -SECCOMP" +#endif + + /* cryptographic libraries */ + +#if HAVE_GCRYPT + " +GCRYPT" +#else + " -GCRYPT" +#endif + +#if HAVE_GNUTLS + " +GNUTLS" +#else + " -GNUTLS" +#endif + +#if HAVE_OPENSSL + " +OPENSSL" +#else + " -OPENSSL" +#endif + + /* all other libraries, sorted alphabetically */ + +#if HAVE_ACL + " +ACL" +#else + " -ACL" +#endif + +#if HAVE_BLKID + " +BLKID" +#else + " -BLKID" +#endif + +#if 
HAVE_LIBCURL + " +CURL" +#else + " -CURL" +#endif + +#if HAVE_ELFUTILS + " +ELFUTILS" +#else + " -ELFUTILS" +#endif + +#if HAVE_LIBFIDO2 + " +FIDO2" +#else + " -FIDO2" +#endif + +#if HAVE_LIBIDN2 + " +IDN2" +#else + " -IDN2" +#endif + +#if HAVE_LIBIDN + " +IDN" +#else + " -IDN" +#endif + +#if HAVE_LIBIPTC + " +IPTC" +#else + " -IPTC" +#endif + +#if HAVE_KMOD + " +KMOD" +#else + " -KMOD" +#endif + +#if HAVE_LIBCRYPTSETUP + " +LIBCRYPTSETUP" +#else + " -LIBCRYPTSETUP" +#endif + +#if HAVE_LIBFDISK + " +LIBFDISK" +#else + " -LIBFDISK" +#endif + +#if HAVE_PCRE2 + " +PCRE2" +#else + " -PCRE2" +#endif + +#if HAVE_PWQUALITY + " +PWQUALITY" +#else + " -PWQUALITY" +#endif + +#if HAVE_P11KIT + " +P11KIT" +#else + " -P11KIT" +#endif + +#if HAVE_QRENCODE + " +QRENCODE" +#else + " -QRENCODE" +#endif + +#if HAVE_TPM2 + " +TPM2" +#else + " -TPM2" +#endif + + /* compressors */ + +#if HAVE_BZIP2 + " +BZIP2" +#else + " -BZIP2" +#endif + +#if HAVE_LZ4 + " +LZ4" +#else + " -LZ4" +#endif + +#if HAVE_XZ + " +XZ" +#else + " -XZ" +#endif + +#if HAVE_ZLIB + " +ZLIB" +#else + " -ZLIB" +#endif + +#if HAVE_ZSTD + " +ZSTD" +#else + " -ZSTD" +#endif + + /* other stuff that doesn't fit above */ + +#if BPF_FRAMEWORK + " +BPF_FRAMEWORK" +#else + " -BPF_FRAMEWORK" +#endif + +#if HAVE_XKBCOMMON + " +XKBCOMMON" +#else + " -XKBCOMMON" +#endif + +#if ENABLE_UTMP + " +UTMP" +#else + " -UTMP" +#endif + +#if HAVE_SYSV_COMPAT + " +SYSVINIT" +#else + " -SYSVINIT" +#endif + + " default-hierarchy=" DEFAULT_HIERARCHY_NAME + ; + +static char *systemd_features_with_color(void) { + const char *p = systemd_features; + _cleanup_free_ char *ret = NULL; + int r; + + for (;;) { + _cleanup_free_ char *word = NULL; + char *q; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) { + log_warning_errno(r, "Cannot split features string, ignoring: %m"); + return NULL; + } + if (r == 0) + return TAKE_PTR(ret); + + if (ret && !strextend(&ret, " ")) { + log_oom_warning(); + return NULL; + } + + if (word[0] == '+') + q = 
strextend(&ret, ANSI_HIGHLIGHT_GREEN, CHAR_TO_STR(word[0]), ANSI_GREEN, word+1, ANSI_NORMAL); + else if (word[0] == '-') + q = strextend(&ret, ANSI_HIGHLIGHT_RED, CHAR_TO_STR(word[0]), ANSI_RED, word+1, ANSI_NORMAL); + else + q = strextend(&ret, word); + if (!q) { + log_oom_warning(); + return NULL; + } + } +} + +int version(void) { + _cleanup_free_ char *b = NULL; + + if (colors_enabled()) + b = systemd_features_with_color(); + + printf("%ssystemd " STRINGIFY(PROJECT_VERSION) "%s (" GIT_VERSION ")\n%s\n", + ansi_highlight(), ansi_normal(), + b ?: systemd_features); + return 0; +} diff --git a/src/basic/build.h b/src/basic/build.h new file mode 100644 index 0000000..5b7c83c --- /dev/null +++ b/src/basic/build.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +extern const char* const systemd_features; + +int version(void); diff --git a/src/basic/bus-label.c b/src/basic/bus-label.c new file mode 100644 index 0000000..d33fc92 --- /dev/null +++ b/src/basic/bus-label.c @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-label.h" +#include "hexdecoct.h" +#include "macro.h" + +char *bus_label_escape(const char *s) { + char *r, *t; + const char *f; + + assert_return(s, NULL); + + /* Escapes all chars that D-Bus' object path cannot deal + * with. Can be reversed with bus_path_unescape(). We special + * case the empty string. */ + + if (*s == 0) + return strdup("_"); + + r = new(char, strlen(s)*3 + 1); + if (!r) + return NULL; + + for (f = s, t = r; *f; f++) { + + /* Escape everything that is not a-zA-Z0-9. 
We also escape 0-9 if it's the first character */ + + if (!ascii_isalpha(*f) && + !(f > s && ascii_isdigit(*f))) { + *(t++) = '_'; + *(t++) = hexchar(*f >> 4); + *(t++) = hexchar(*f); + } else + *(t++) = *f; + } + + *t = 0; + + return r; +} + +char *bus_label_unescape_n(const char *f, size_t l) { + char *r, *t; + size_t i; + + assert_return(f, NULL); + + /* Special case for the empty string */ + if (l == 1 && *f == '_') + return strdup(""); + + r = new(char, l + 1); + if (!r) + return NULL; + + for (i = 0, t = r; i < l; ++i) { + if (f[i] == '_') { + int a, b; + + if (l - i < 3 || + (a = unhexchar(f[i + 1])) < 0 || + (b = unhexchar(f[i + 2])) < 0) { + /* Invalid escape code, let's take it literal then */ + *(t++) = '_'; + } else { + *(t++) = (char) ((a << 4) | b); + i += 2; + } + } else + *(t++) = f[i]; + } + + *t = 0; + + return r; +} diff --git a/src/basic/bus-label.h b/src/basic/bus-label.h new file mode 100644 index 0000000..446daba --- /dev/null +++ b/src/basic/bus-label.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "string-util.h" + +char *bus_label_escape(const char *s); +char *bus_label_unescape_n(const char *f, size_t l); + +static inline char *bus_label_unescape(const char *f) { + return bus_label_unescape_n(f, strlen_ptr(f)); +} diff --git a/src/basic/cap-list.c b/src/basic/cap-list.c new file mode 100644 index 0000000..80d48c1 --- /dev/null +++ b/src/basic/cap-list.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "capability-util.h" +#include "cap-list.h" +#include "extract-word.h" +#include "macro.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" + +static const struct capability_name* lookup_capability(register const char *str, register GPERF_LEN_TYPE len); + +#include "cap-from-name.h" +#include "cap-to-name.h" + +const char *capability_to_name(int id) { + 
if (id < 0) + return NULL; + if (id >= capability_list_length()) + return NULL; + + return capability_names[id]; +} + +const char *capability_to_string(int id, char buf[static CAPABILITY_TO_STRING_MAX]) { + const char *p; + + if (id < 0) + return NULL; + if (id > CAP_LIMIT) /* refuse caps > 62 since we can't store them in a uint64_t mask anymore, and still retain UINT64_MAX as marker for "unset" */ + return NULL; + + p = capability_to_name(id); + if (p) + return p; + + sprintf(buf, "0x%x", (unsigned) id); /* numerical fallback */ + return buf; +} + +int capability_from_name(const char *name) { + const struct capability_name *sc; + int r, i; + + assert(name); + + /* Try to parse numeric capability */ + r = safe_atoi(name, &i); + if (r >= 0) { + if (i < 0 || i > CAP_LIMIT) + return -EINVAL; + + return i; + } + + /* Try to parse string capability */ + sc = lookup_capability(name, strlen(name)); + if (!sc) + return -EINVAL; + + return sc->id; +} + +/* This is the number of capability names we are *compiled* with. For the max capability number of the + * currently-running kernel, use cap_last_cap(). Note that this one returns the size of the array, i.e. one + * value larger than the last known capability. This is different from cap_last_cap() which returns the + * highest supported capability. Hence with everyone agreeing on the same capabilities list, this function + * will return one higher than cap_last_cap(). 
*/ +int capability_list_length(void) { + return MIN((int) ELEMENTSOF(capability_names), CAP_LIMIT + 1); +} + +int capability_set_to_string(uint64_t set, char **ret) { + _cleanup_free_ char *str = NULL; + + assert(ret); + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + const char *p; + + if (!FLAGS_SET(set, UINT64_C(1) << i)) + continue; + + p = CAPABILITY_TO_STRING(i); + assert(p); + + if (!strextend_with_separator(&str, " ", p)) + return -ENOMEM; + } + + if (!str) { + str = new0(char, 1); + if (!str) + return -ENOMEM; + } + + *ret = TAKE_PTR(str); + return 0; +} + +int capability_set_to_string_negative(uint64_t set, char **ret) { + _cleanup_free_ char *a = NULL, *b = NULL; + int r; + + assert(ret); + + /* Format the specified capability mask both in positive way (i.e. just listing caps) and in negative + * way (i.e. listing only caps that are missing from the full set) and return the shorter version of + * the two. */ + + r = capability_set_to_string(set, &a); + if (r < 0) + return r; + + r = capability_set_to_string(~set & all_capabilities(), &b); + if (r < 0) + return r; + + if (strlen(a) <= 1 + strlen(b)) + *ret = TAKE_PTR(a); + else { + char *c = strjoin("~", b); + if (!c) + return -ENOMEM; + + *ret = c; + } + + return 0; +} + +int capability_set_to_strv(uint64_t set, char ***ret) { + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(ret); + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + const char *p; + + if (!FLAGS_SET(set, UINT64_C(1) << i)) + continue; + + p = CAPABILITY_TO_STRING(i); + assert(p); + + r = strv_extend(&l, p); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(l); + return 0; +} + +int capability_set_from_string(const char *s, uint64_t *ret) { + uint64_t val = 0; + bool good = true; + + for (const char *p = s;;) { + _cleanup_free_ char *word = NULL; + int r; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX); + if (r < 0) + return r; + if (r == 0) + break; + + r = capability_from_name(word); + if (r < 
0) { + log_debug_errno(r, "Failed to parse capability '%s', ignoring: %m", word); + good = false; + } else + val |= UINT64_C(1) << r; + } + + if (ret) + *ret = val; + + return good; +} diff --git a/src/basic/cap-list.h b/src/basic/cap-list.h new file mode 100644 index 0000000..3028197 --- /dev/null +++ b/src/basic/cap-list.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* Space for capability_to_string() in case we write out a numeric capability because we don't know the name + * for it. "0x3e" is the largest string we might output, in both senses of the word "largest": two chars for + * "0x", two bytes for the hex value, and one trailing NUL byte. */ +#define CAPABILITY_TO_STRING_MAX (2 + 2 + 1) + +const char *capability_to_name(int id); +const char *capability_to_string(int id, char buf[static CAPABILITY_TO_STRING_MAX]); +#define CAPABILITY_TO_STRING(id) capability_to_string(id, (char[CAPABILITY_TO_STRING_MAX]) {}) + +int capability_from_name(const char *name); +int capability_list_length(void); + +int capability_set_to_string(uint64_t set, char **ret); +int capability_set_to_string_negative(uint64_t set, char **ret); +int capability_set_to_strv(uint64_t set, char ***ret); +int capability_set_from_string(const char *s, uint64_t *ret); diff --git a/src/basic/cap-to-name.awk b/src/basic/cap-to-name.awk new file mode 100644 index 0000000..bd8a28c --- /dev/null +++ b/src/basic/cap-to-name.awk @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "static const char* const capability_names[] = { " +} +{ + printf " [%s] = \"%s\",\n", $1, tolower($1) +} +END{ + print "};" +} diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c new file mode 100644 index 0000000..c3cf455 --- /dev/null +++ b/src/basic/capability-util.c @@ -0,0 +1,642 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include 
"capability-util.h" +#include "cap-list.h" +#include "fileio.h" +#include "log.h" +#include "logarithm.h" +#include "macro.h" +#include "missing_prctl.h" +#include "missing_threads.h" +#include "parse-util.h" +#include "user-util.h" + +int have_effective_cap(int value) { + _cleanup_cap_free_ cap_t cap = NULL; + cap_flag_value_t fv = CAP_CLEAR; /* To avoid false-positive use-of-uninitialized-value error reported + * by fuzzers. */ + + cap = cap_get_proc(); + if (!cap) + return -errno; + + if (cap_get_flag(cap, value, CAP_EFFECTIVE, &fv) < 0) + return -errno; + + return fv == CAP_SET; +} + +unsigned cap_last_cap(void) { + static thread_local unsigned saved; + static thread_local bool valid = false; + _cleanup_free_ char *content = NULL; + unsigned long p = 0; + int r; + + if (valid) + return saved; + + /* available since linux-3.2 */ + r = read_one_line_file("/proc/sys/kernel/cap_last_cap", &content); + if (r >= 0) { + r = safe_atolu(content, &p); + if (r >= 0) { + + if (p > CAP_LIMIT) /* Safety for the future: if one day the kernel learns more than + * 64 caps, then we are in trouble (since we, as much userspace + * and kernel space store capability masks in uint64_t types). We + * also want to use UINT64_MAX as marker for "unset". Hence let's + * hence protect ourselves against that and always cap at 62 for + * now. 
*/ + p = CAP_LIMIT; + + saved = p; + valid = true; + return p; + } + } + + /* fall back to syscall-probing for pre linux-3.2 */ + p = (unsigned long) MIN(CAP_LAST_CAP, CAP_LIMIT); + + if (prctl(PR_CAPBSET_READ, p) < 0) { + + /* Hmm, look downwards, until we find one that works */ + for (p--; p > 0; p--) + if (prctl(PR_CAPBSET_READ, p) >= 0) + break; + + } else { + + /* Hmm, look upwards, until we find one that doesn't work */ + for (; p < CAP_LIMIT; p++) + if (prctl(PR_CAPBSET_READ, p+1) < 0) + break; + } + + saved = p; + valid = true; + + return p; +} + +int capability_update_inherited_set(cap_t caps, uint64_t set) { + /* Add capabilities in the set to the inheritable caps, and drop capabilities not in the set. + * Do not apply them yet. */ + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + cap_flag_value_t flag = set & (UINT64_C(1) << i) ? CAP_SET : CAP_CLEAR; + cap_value_t v; + + v = (cap_value_t) i; + + if (cap_set_flag(caps, CAP_INHERITABLE, 1, &v, flag) < 0) + return -errno; + } + + return 0; +} + +int capability_ambient_set_apply(uint64_t set, bool also_inherit) { + _cleanup_cap_free_ cap_t caps = NULL; + int r; + + /* Remove capabilities requested in ambient set, but not in the bounding set */ + for (unsigned i = 0; i <= cap_last_cap(); i++) { + if (set == 0) + break; + + if (FLAGS_SET(set, (UINT64_C(1) << i)) && prctl(PR_CAPBSET_READ, i) != 1) { + log_debug("Ambient capability %s requested but missing from bounding set," + " suppressing automatically.", capability_to_name(i)); + set &= ~(UINT64_C(1) << i); + } + } + + /* Add the capabilities to the ambient set (and possibly also the inheritable set) */ + + /* Check that we can use PR_CAP_AMBIENT or quit early. */ + if (!ambient_capabilities_supported()) + return (set & all_capabilities()) == 0 ? 
+ 0 : -EOPNOTSUPP; /* if actually no ambient caps are to be set, be silent, + * otherwise fail recognizably */ + + if (also_inherit) { /* mirror the requested set into the process' inheritable set first */ + caps = cap_get_proc(); + if (!caps) + return -errno; + + r = capability_update_inherited_set(caps, set); + if (r < 0) + return r; /* the helper already returns a negative errno-style code; propagate it as-is */ + + if (cap_set_proc(caps) < 0) + return -errno; + } + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + + if (set & (UINT64_C(1) << i)) { + + /* Add the capability to the ambient set. */ + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0, 0) < 0) + return -errno; + } else { + + /* Drop the capability so we don't inherit capabilities we didn't ask for. */ + r = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i, 0, 0); + if (r < 0) + return -errno; + + if (r) + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, i, 0, 0) < 0) + return -errno; + + } + } + + return 0; +} + +int capability_gain_cap_setpcap(cap_t *ret_before_caps) { + _cleanup_cap_free_ cap_t caps = NULL; + cap_flag_value_t fv; + caps = cap_get_proc(); + if (!caps) + return -errno; + + if (cap_get_flag(caps, CAP_SETPCAP, CAP_EFFECTIVE, &fv) < 0) + return -errno; + + if (fv != CAP_SET) { + _cleanup_cap_free_ cap_t temp_cap = NULL; + static const cap_value_t v = CAP_SETPCAP; + + temp_cap = cap_dup(caps); + if (!temp_cap) + return -errno; + + if (cap_set_flag(temp_cap, CAP_EFFECTIVE, 1, &v, CAP_SET) < 0) + return -errno; + + if (cap_set_proc(temp_cap) < 0) + log_debug_errno(errno, "Can't acquire effective CAP_SETPCAP bit, ignoring: %m"); + + /* If we didn't manage to acquire the CAP_SETPCAP bit, we continue anyway, after all this just means + * we'll fail later, when we actually intend to drop some capabilities or try to set securebits. 
*/ + } + if (ret_before_caps) + /* Return the capabilities as they have been before setting CAP_SETPCAP */ + *ret_before_caps = TAKE_PTR(caps); + + return 0; +} + +int capability_bounding_set_drop(uint64_t keep, bool right_now) { + _cleanup_cap_free_ cap_t before_cap = NULL, after_cap = NULL; + int r; + + /* If we are run as PID 1 we will lack CAP_SETPCAP by default + * in the effective set (yes, the kernel drops that when + * executing init!), so get it back temporarily so that we can + * call PR_CAPBSET_DROP. */ + + r = capability_gain_cap_setpcap(&before_cap); + if (r < 0) + return r; + + after_cap = cap_dup(before_cap); + if (!after_cap) + return -errno; + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + cap_value_t v; + + if ((keep & (UINT64_C(1) << i))) + continue; + + /* Drop it from the bounding set */ + if (prctl(PR_CAPBSET_DROP, i) < 0) { + r = -errno; + + /* If dropping the capability failed, let's see if we didn't have it in the first place. If so, + * continue anyway, as dropping a capability we didn't have in the first place doesn't really + * matter anyway. */ + if (prctl(PR_CAPBSET_READ, i) != 0) + goto finish; + } + v = (cap_value_t) i; + + /* Also drop it from the inheritable set, so + * that anything we exec() loses the + * capability for good. */ + if (cap_set_flag(after_cap, CAP_INHERITABLE, 1, &v, CAP_CLEAR) < 0) { + r = -errno; + goto finish; + } + + /* If we shall apply this right now drop it + * also from our own capability sets. */ + if (right_now) { + if (cap_set_flag(after_cap, CAP_PERMITTED, 1, &v, CAP_CLEAR) < 0 || + cap_set_flag(after_cap, CAP_EFFECTIVE, 1, &v, CAP_CLEAR) < 0) { + r = -errno; + goto finish; + } + } + } + + r = 0; + +finish: + if (cap_set_proc(after_cap) < 0) { + /* If there are no actual changes anyway then let's ignore this error. 
*/ + if (cap_compare(before_cap, after_cap) != 0) + r = -errno; + } + + return r; +} + +static int drop_from_file(const char *fn, uint64_t keep) { + _cleanup_free_ char *p = NULL; + uint64_t current, after; + uint32_t hi, lo; + int r, k; + + r = read_one_line_file(fn, &p); + if (r < 0) + return r; + + k = sscanf(p, "%" PRIu32 " %" PRIu32, &lo, &hi); + if (k != 2) + return -EIO; + + current = (uint64_t) lo | ((uint64_t) hi << 32); + after = current & keep; + + if (current == after) + return 0; + + lo = after & UINT32_MAX; + hi = (after >> 32) & UINT32_MAX; + + return write_string_filef(fn, 0, "%" PRIu32 " %" PRIu32, lo, hi); +} + +int capability_bounding_set_drop_usermode(uint64_t keep) { + int r; + + r = drop_from_file("/proc/sys/kernel/usermodehelper/inheritable", keep); + if (r < 0) + return r; + + r = drop_from_file("/proc/sys/kernel/usermodehelper/bset", keep); + if (r < 0) + return r; + + return r; +} + +int drop_privileges(uid_t uid, gid_t gid, uint64_t keep_capabilities) { + int r; + + /* Unfortunately we cannot leave privilege dropping to PID 1 here, since we want to run as user but + * want to keep some capabilities. Since file capabilities have been introduced this cannot be done + * across exec() anymore, unless our binary has the capability configured in the file system, which + * we want to avoid. */ + + if (setresgid(gid, gid, gid) < 0) + return log_error_errno(errno, "Failed to change group ID: %m"); + + r = maybe_setgroups(0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to drop auxiliary groups list: %m"); + + /* Ensure we keep the permitted caps across the setresuid(). Note that we do this even if we actually + * don't want to keep any capabilities, since we want to be able to drop them from the bounding set + * too, and we can only do that if we have capabilities. 
*/ + if (prctl(PR_SET_KEEPCAPS, 1) < 0) + return log_error_errno(errno, "Failed to enable keep capabilities flag: %m"); + + if (setresuid(uid, uid, uid) < 0) + return log_error_errno(errno, "Failed to change user ID: %m"); + + if (prctl(PR_SET_KEEPCAPS, 0) < 0) + return log_error_errno(errno, "Failed to disable keep capabilities flag: %m"); + + /* Drop all caps from the bounding set (as well as the inheritable/permitted/effective sets), except + * the ones we want to keep */ + r = capability_bounding_set_drop(keep_capabilities, true); + if (r < 0) + return log_error_errno(r, "Failed to drop capabilities: %m"); + + /* Now upgrade the permitted caps we still kept to effective caps */ + if (keep_capabilities != 0) { + cap_value_t bits[log2u64(keep_capabilities) + 1]; + _cleanup_cap_free_ cap_t d = NULL; + unsigned i, j = 0; + + d = cap_init(); + if (!d) + return log_oom(); + + for (i = 0; i < ELEMENTSOF(bits); i++) + if (keep_capabilities & (1ULL << i)) + bits[j++] = i; + + /* use enough bits */ + assert(i == 64 || (keep_capabilities >> i) == 0); + /* don't use too many bits */ + assert(keep_capabilities & (UINT64_C(1) << (i - 1))); + + if (cap_set_flag(d, CAP_EFFECTIVE, j, bits, CAP_SET) < 0 || + cap_set_flag(d, CAP_PERMITTED, j, bits, CAP_SET) < 0) + return log_error_errno(errno, "Failed to enable capabilities bits: %m"); + + if (cap_set_proc(d) < 0) + return log_error_errno(errno, "Failed to increase capabilities: %m"); + } + + return 0; +} + +static int change_capability(cap_value_t cv, cap_flag_value_t flag) { + _cleanup_cap_free_ cap_t tmp_cap = NULL; + + tmp_cap = cap_get_proc(); + if (!tmp_cap) + return -errno; + + if ((cap_set_flag(tmp_cap, CAP_INHERITABLE, 1, &cv, flag) < 0) || + (cap_set_flag(tmp_cap, CAP_PERMITTED, 1, &cv, flag) < 0) || + (cap_set_flag(tmp_cap, CAP_EFFECTIVE, 1, &cv, flag) < 0)) + return -errno; + + if (cap_set_proc(tmp_cap) < 0) + return -errno; + + return 0; +} + +int drop_capability(cap_value_t cv) { + return change_capability(cv, 
CAP_CLEAR); +} + +int keep_capability(cap_value_t cv) { + return change_capability(cv, CAP_SET); +} + +bool ambient_capabilities_supported(void) { + static int cache = -1; + + if (cache >= 0) + return cache; + + /* If PR_CAP_AMBIENT returns something valid, or an unexpected error code we assume that ambient caps are + * available. */ + + cache = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_KILL, 0, 0) >= 0 || + !IN_SET(errno, EINVAL, EOPNOTSUPP, ENOSYS); + + return cache; +} + +bool capability_quintet_mangle(CapabilityQuintet *q) { + uint64_t combined, drop = 0; + bool ambient_supported; + + assert(q); + + combined = q->effective | q->bounding | q->inheritable | q->permitted; + + ambient_supported = q->ambient != CAP_MASK_UNSET; + if (ambient_supported) + combined |= q->ambient; + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + unsigned long bit = UINT64_C(1) << i; + if (!FLAGS_SET(combined, bit)) + continue; + + if (prctl(PR_CAPBSET_READ, i) > 0) + continue; + + drop |= bit; + + log_debug("Not in the current bounding set: %s", capability_to_name(i)); + } + + q->effective &= ~drop; + q->bounding &= ~drop; + q->inheritable &= ~drop; + q->permitted &= ~drop; + + if (ambient_supported) + q->ambient &= ~drop; + + return drop != 0; /* Let the caller know we changed something */ +} + +int capability_quintet_enforce(const CapabilityQuintet *q) { + _cleanup_cap_free_ cap_t c = NULL, modified = NULL; + int r; + + if (q->ambient != CAP_MASK_UNSET) { + bool changed = false; + + c = cap_get_proc(); + if (!c) + return -errno; + + /* In order to raise the ambient caps set we first need to raise the matching + * inheritable + permitted cap */ + for (unsigned i = 0; i <= cap_last_cap(); i++) { + uint64_t m = UINT64_C(1) << i; + cap_value_t cv = (cap_value_t) i; + cap_flag_value_t old_value_inheritable, old_value_permitted; + + if ((q->ambient & m) == 0) + continue; + + if (cap_get_flag(c, cv, CAP_INHERITABLE, &old_value_inheritable) < 0) + return -errno; + if 
(cap_get_flag(c, cv, CAP_PERMITTED, &old_value_permitted) < 0) + return -errno; + /* Both flags are already raised for this capability: nothing to change */ + if (old_value_inheritable == CAP_SET && old_value_permitted == CAP_SET) + continue; + + if (cap_set_flag(c, CAP_INHERITABLE, 1, &cv, CAP_SET) < 0) + return -errno; + if (cap_set_flag(c, CAP_PERMITTED, 1, &cv, CAP_SET) < 0) + return -errno; + + changed = true; + } + /* Install the raised inheritable/permitted flags only if the loop modified something */ + if (changed) + if (cap_set_proc(c) < 0) + return -errno; + + r = capability_ambient_set_apply(q->ambient, false); + if (r < 0) + return r; + } + /* Next stage: bring inheritable/permitted/effective in line with their masks; a mask left at CAP_MASK_UNSET is skipped */ + if (q->inheritable != CAP_MASK_UNSET || q->permitted != CAP_MASK_UNSET || q->effective != CAP_MASK_UNSET) { + bool changed = false; + /* 'c' is still NULL here when the ambient branch above was not taken */ + if (!c) { + c = cap_get_proc(); + if (!c) + return -errno; + } + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + uint64_t m = UINT64_C(1) << i; + cap_value_t cv = (cap_value_t) i; + + if (q->inheritable != CAP_MASK_UNSET) { + cap_flag_value_t old_value, new_value; + + if (cap_get_flag(c, cv, CAP_INHERITABLE, &old_value) < 0) { + if (errno == EINVAL) /* If the kernel knows more caps than this + * version of libcap, then this will return + * EINVAL. In that case, simply ignore it, + * pretend it doesn't exist. */ + continue; + + return -errno; + } + + new_value = (q->inheritable & m) ? CAP_SET : CAP_CLEAR; + /* Touch the flag (and remember that we did) only when it differs from the requested bit */ + if (old_value != new_value) { + changed = true; + + if (cap_set_flag(c, CAP_INHERITABLE, 1, &cv, new_value) < 0) + return -errno; + } + } + + if (q->permitted != CAP_MASK_UNSET) { + cap_flag_value_t old_value, new_value; + + if (cap_get_flag(c, cv, CAP_PERMITTED, &old_value) < 0) { + if (errno == EINVAL) + continue; + + return -errno; + } + + new_value = (q->permitted & m) ? 
CAP_SET : CAP_CLEAR; + + if (old_value != new_value) { + changed = true; + + if (cap_set_flag(c, CAP_PERMITTED, 1, &cv, new_value) < 0) + return -errno; + } + } + /* Effective set: the flag is changed only when it differs from the requested bit */ + if (q->effective != CAP_MASK_UNSET) { + cap_flag_value_t old_value, new_value; + + if (cap_get_flag(c, cv, CAP_EFFECTIVE, &old_value) < 0) { + if (errno == EINVAL) + continue; + + return -errno; + } + + new_value = (q->effective & m) ? CAP_SET : CAP_CLEAR; + + if (old_value != new_value) { + changed = true; + + if (cap_set_flag(c, CAP_EFFECTIVE, 1, &cv, new_value) < 0) + return -errno; + } + } + } + + if (changed) { + /* In order to change the bounding caps, we need to keep CAP_SETPCAP for a bit + * longer. Let's add it to our list hence for now. */ + if (q->bounding != CAP_MASK_UNSET) { + cap_value_t cv = CAP_SETPCAP; + /* Work on a duplicate so that 'c' keeps exactly the requested state */ + modified = cap_dup(c); + if (!modified) + return -ENOMEM; + + if (cap_set_flag(modified, CAP_PERMITTED, 1, &cv, CAP_SET) < 0) + return -errno; + if (cap_set_flag(modified, CAP_EFFECTIVE, 1, &cv, CAP_SET) < 0) + return -errno; + + if (cap_compare(modified, c) == 0) { + /* No change? then drop this nonsense again */ + cap_free(modified); + modified = NULL; + } + } + + /* Now, let's enforce the caps for the first time. Note that this is where we acquire + * caps in any of the sets we currently don't have. We have to do this before + * dropping the bounding caps below, since at that point we can never acquire new + * caps in inherited/permitted/effective anymore, but only lose them. */ + if (cap_set_proc(modified ?: c) < 0) + return -errno; + } + } + /* NOTE(review): presumably reduces the bounding set to q->bounding (the parameter is named 'keep' in the prototype); confirm against capability_bounding_set_drop() */ + if (q->bounding != CAP_MASK_UNSET) { + r = capability_bounding_set_drop(q->bounding, false); + if (r < 0) + return r; + } + + /* If needed, let's now set the caps again, this time in the final version, which differs from what + * we have already set only in the CAP_SETPCAP bit, which we needed for dropping the bounding + * bits. This call only undoes bits and doesn't acquire any which means the bounding caps don't + * matter. 
*/ + if (modified) + if (cap_set_proc(c) < 0) + return -errno; + + return 0; +} +/* Stores in *ret a bitmask with bit i set for every capability i in 0..cap_last_cap() for which prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET) returns non-zero. Returns 1 on success, 0 (with *ret = 0) when ambient_capabilities_supported() is false, -errno if prctl() fails. */ +int capability_get_ambient(uint64_t *ret) { + uint64_t a = 0; + int r; + + assert(ret); + + if (!ambient_capabilities_supported()) { + *ret = 0; + return 0; + } + + for (unsigned i = 0; i <= cap_last_cap(); i++) { + r = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i, 0, 0); + if (r < 0) + return -errno; + + if (r) + a |= UINT64_C(1) << i; + } + + /* Write the result only after every capability was queried successfully */ + *ret = a; + return 1; +} diff --git a/src/basic/capability-util.h b/src/basic/capability-util.h new file mode 100644 index 0000000..f911de8 --- /dev/null +++ b/src/basic/capability-util.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "macro.h" +#include "missing_capability.h" + +/* Special marker used when storing a capabilities mask as "unset" */ +#define CAP_MASK_UNSET UINT64_MAX + +/* All possible capabilities bits on */ +#define CAP_MASK_ALL UINT64_C(0x7fffffffffffffff) + +/* The largest capability we can deal with, given we want to be able to store cap masks in uint64_t but still + * be able to use UINT64_MAX as indicator for "not set". The latter makes capability 63 unavailable. 
*/ +#define CAP_LIMIT 62 + +unsigned cap_last_cap(void); +int have_effective_cap(int value); +int capability_gain_cap_setpcap(cap_t *return_caps); +int capability_bounding_set_drop(uint64_t keep, bool right_now); +int capability_bounding_set_drop_usermode(uint64_t keep); + +int capability_ambient_set_apply(uint64_t set, bool also_inherit); +int capability_update_inherited_set(cap_t caps, uint64_t ambient_set); + +int drop_privileges(uid_t uid, gid_t gid, uint64_t keep_capabilities); + +int drop_capability(cap_value_t cv); +int keep_capability(cap_value_t cv); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(cap_t, cap_free, NULL); +#define _cleanup_cap_free_ _cleanup_(cap_freep) +/* Cleanup helper behind _cleanup_cap_free_charp_: hands *p to cap_free() unless it is NULL */ +static inline void cap_free_charpp(char **p) { + if (*p) + cap_free(*p); +} +#define _cleanup_cap_free_charp_ _cleanup_(cap_free_charpp) +/* Mask with bits 0..cap_last_cap() set and all higher bits clear */ +static inline uint64_t all_capabilities(void) { + return UINT64_MAX >> (63 - cap_last_cap()); +} +/* Tests 'caps' against the all_capabilities() mask via FLAGS_SET(); presumably true only when every one of those bits is set in 'caps' (FLAGS_SET is defined outside this chunk) */ +static inline bool cap_test_all(uint64_t caps) { + return FLAGS_SET(caps, all_capabilities()); +} + +bool ambient_capabilities_supported(void); + +/* Identical to linux/capability.h's CAP_TO_MASK(), but uses an unsigned 1U instead of a signed 1 for shifting left, in + * order to avoid complaints about shifting a signed int left by 31 bits, which would make it negative. */ +#define CAP_TO_MASK_CORRECTED(x) (1U << ((x) & 31U)) + +typedef struct CapabilityQuintet { + /* Stores all five types of capabilities in one go. Note that we use UINT64_MAX for unset here. This hence + * needs to be updated as soon as Linux learns more than 63 caps. 
*/ + uint64_t effective; + uint64_t bounding; + uint64_t inheritable; + uint64_t permitted; + uint64_t ambient; +} CapabilityQuintet; + +assert_cc(CAP_LAST_CAP < 64); +/* Initializer that marks all five masks as CAP_MASK_UNSET */ +#define CAPABILITY_QUINTET_NULL { CAP_MASK_UNSET, CAP_MASK_UNSET, CAP_MASK_UNSET, CAP_MASK_UNSET, CAP_MASK_UNSET } +/* True if at least one of the five masks of *q differs from CAP_MASK_UNSET */ +static inline bool capability_quintet_is_set(const CapabilityQuintet *q) { + return q->effective != CAP_MASK_UNSET || + q->bounding != CAP_MASK_UNSET || + q->inheritable != CAP_MASK_UNSET || + q->permitted != CAP_MASK_UNSET || + q->ambient != CAP_MASK_UNSET; +} + +/* Mangles the specified caps quintet taking the current bounding set into account: + * drops all caps from all five sets if our bounding set doesn't allow them. + * Returns true if the quintet was modified. */ +bool capability_quintet_mangle(CapabilityQuintet *q); + +int capability_quintet_enforce(const CapabilityQuintet *q); + +int capability_get_ambient(uint64_t *ret); diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c new file mode 100644 index 0000000..18b16ec --- /dev/null +++ b/src/basic/cgroup-util.c @@ -0,0 +1,2434 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "constants.h" +#include "dirent-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "log.h" +#include "login-util.h" +#include "macro.h" +#include "missing_magic.h" +#include "missing_threads.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "set.h" +#include "special.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "user-util.h" +#include "xattr-util.h" +/* Resolves the file 'item' of the given cgroup through cg_get_path() and opens it with fopen() in mode "re". On success stores the stream in *ret and returns 0; otherwise returns the cg_get_path() error or -errno. */ +static int cg_enumerate_items(const char *controller, const char *path, FILE **ret, 
const char *item) { + _cleanup_free_ char *fs = NULL; + FILE *f; + int r; + + assert(ret); + + r = cg_get_path(controller, path, item, &fs); + if (r < 0) + return r; + + f = fopen(fs, "re"); + if (!f) + return -errno; + + *ret = f; + return 0; +} + +int cg_enumerate_processes(const char *controller, const char *path, FILE **ret) { + return cg_enumerate_items(controller, path, ret, "cgroup.procs"); +} + +int cg_read_pid(FILE *f, pid_t *ret) { + unsigned long ul; + + /* Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */ + + assert(f); + assert(ret); + + errno = 0; + if (fscanf(f, "%lu", &ul) != 1) { + + if (feof(f)) { + *ret = 0; + return 0; + } + + return errno_or_else(EIO); + } + + if (ul <= 0) + return -EIO; + if (ul > PID_T_MAX) + return -EIO; + + *ret = (pid_t) ul; + return 1; +} + +int cg_read_pidref(FILE *f, PidRef *ret) { + int r; + + assert(f); + assert(ret); + + for (;;) { + pid_t pid; + + r = cg_read_pid(f, &pid); + if (r < 0) + return r; + if (r == 0) { + *ret = PIDREF_NULL; + return 0; + } + + r = pidref_set_pid(ret, pid); + if (r >= 0) + return 1; + if (r != -ESRCH) + return r; + + /* ESRCH → gone by now? 
just skip over it, read the next */ + } +} + +int cg_read_event( + const char *controller, + const char *path, + const char *event, + char **ret) { + + _cleanup_free_ char *events = NULL, *content = NULL; + int r; + + r = cg_get_path(controller, path, "cgroup.events", &events); + if (r < 0) + return r; + + r = read_full_virtual_file(events, &content, NULL); + if (r < 0) + return r; + + for (const char *p = content;;) { + _cleanup_free_ char *line = NULL, *key = NULL, *val = NULL; + const char *q; + + r = extract_first_word(&p, &line, "\n", 0); + if (r < 0) + return r; + if (r == 0) + return -ENOENT; + + q = line; + r = extract_first_word(&q, &key, " ", 0); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if (!streq(key, event)) + continue; + + val = strdup(q); + if (!val) + return -ENOMEM; + + *ret = TAKE_PTR(val); + return 0; + } +} + +bool cg_ns_supported(void) { + static thread_local int enabled = -1; + + if (enabled >= 0) + return enabled; + + if (access("/proc/self/ns/cgroup", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m"); + enabled = false; + } else + enabled = true; + + return enabled; +} + +bool cg_freezer_supported(void) { + static thread_local int supported = -1; + + if (supported >= 0) + return supported; + + supported = cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK) == 0; + + return supported; +} + +bool cg_kill_supported(void) { + static thread_local int supported = -1; + + if (supported >= 0) + return supported; + + if (cg_all_unified() <= 0) + supported = false; + else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if cgroup.kill is available, assuming not: %m"); + supported = false; + } else + supported = true; + + return supported; +} + +int cg_enumerate_subgroups(const char *controller, const char *path, DIR **ret) { + 
_cleanup_free_ char *fs = NULL; + DIR *d; + int r; + + assert(ret); + + /* This is not recursive! */ + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + d = opendir(fs); + if (!d) + return -errno; + + *ret = d; + return 0; +} + +int cg_read_subgroup(DIR *d, char **ret) { + assert(d); + assert(ret); + + FOREACH_DIRENT_ALL(de, d, return -errno) { + char *b; + + if (de->d_type != DT_DIR) + continue; + + if (dot_or_dot_dot(de->d_name)) + continue; + + b = strdup(de->d_name); + if (!b) + return -ENOMEM; + + *ret = b; + return 1; + } + + *ret = NULL; + return 0; +} + +int cg_rmdir(const char *controller, const char *path) { + _cleanup_free_ char *p = NULL; + int r; + + r = cg_get_path(controller, path, NULL, &p); + if (r < 0) + return r; + + r = rmdir(p); + if (r < 0 && errno != ENOENT) + return -errno; + + r = cg_hybrid_unified(); + if (r <= 0) + return r; + + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path); + if (r < 0) + log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path); + } + + return 0; +} + +static int cg_kill_items( + const char *path, + int sig, + CGroupFlags flags, + Set *s, + cg_kill_log_func_t log_kill, + void *userdata, + const char *item) { + + _cleanup_set_free_ Set *allocated_set = NULL; + bool done = false; + int r, ret = 0, ret_log_kill = 0; + + assert(sig >= 0); + + /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send + * SIGCONT on SIGKILL. */ + if (IN_SET(sig, SIGCONT, SIGKILL)) + flags &= ~CGROUP_SIGCONT; + + /* This goes through the tasks list and kills them all. 
This + * is repeated until no further processes are added to the + * tasks list, to properly handle forking processes */ + + if (!s) { + s = allocated_set = set_new(NULL); + if (!s) + return -ENOMEM; + } + + do { + _cleanup_fclose_ FILE *f = NULL; + done = true; + + r = cg_enumerate_items(SYSTEMD_CGROUP_CONTROLLER, path, &f, item); + if (r == -ENOENT) + break; + if (r < 0) + return RET_GATHER(ret, r); + + for (;;) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + + r = cg_read_pidref(f, &pidref); + if (r < 0) + return RET_GATHER(ret, r); + if (r == 0) + break; + + if ((flags & CGROUP_IGNORE_SELF) && pidref_is_self(&pidref)) + continue; + + if (set_get(s, PID_TO_PTR(pidref.pid)) == PID_TO_PTR(pidref.pid)) + continue; + + if (log_kill) + ret_log_kill = log_kill(&pidref, sig, userdata); + + /* If we haven't killed this process yet, kill it */ + r = pidref_kill(&pidref, sig); + if (r < 0 && r != -ESRCH) + RET_GATHER(ret, r); + if (r >= 0) { + if (flags & CGROUP_SIGCONT) + (void) pidref_kill(&pidref, SIGCONT); + + if (ret == 0) { + if (log_kill) + ret = ret_log_kill; + else + ret = 1; + } + } + + done = false; + + r = set_put(s, PID_TO_PTR(pidref.pid)); + if (r < 0) + return RET_GATHER(ret, r); + } + + /* To avoid racing against processes which fork quicker than we can kill them, we repeat this + * until no new pids need to be killed. */ + + } while (!done); + + return ret; +} + +int cg_kill( + const char *path, + int sig, + CGroupFlags flags, + Set *s, + cg_kill_log_func_t log_kill, + void *userdata) { + + int r, ret; + + r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.procs"); + if (r < 0 || sig != SIGKILL) + return r; + + ret = r; + + /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as + a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66 + (4340d175b898) and 4.14.138 (feb6b123b7dd). 
*/ + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r == 0) + return ret; + + r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.threads"); + if (r < 0) + return r; + + return r > 0 || ret > 0; +} + +int cg_kill_kernel_sigkill(const char *path) { + /* Kills the cgroup at `path` directly by writing to its cgroup.kill file. This sends SIGKILL to all + * processes in the cgroup and has the advantage of being completely atomic, unlike cg_kill_items(). */ + + _cleanup_free_ char *killfile = NULL; + int r; + + assert(path); + + if (!cg_kill_supported()) + return -EOPNOTSUPP; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.kill", &killfile); + if (r < 0) + return r; + + r = write_string_file(killfile, "1", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + return 0; +} + +int cg_kill_recursive( + const char *path, + int sig, + CGroupFlags flags, + Set *s, + cg_kill_log_func_t log_kill, + void *userdata) { + + int r, ret; + + assert(path); + assert(sig >= 0); + + if (sig == SIGKILL && cg_kill_supported() && + !FLAGS_SET(flags, CGROUP_IGNORE_SELF) && !s && !log_kill) + /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */ + ret = cg_kill_kernel_sigkill(path); + else { + _cleanup_set_free_ Set *allocated_set = NULL; + _cleanup_closedir_ DIR *d = NULL; + + if (!s) { + s = allocated_set = set_new(NULL); + if (!s) + return -ENOMEM; + } + + ret = cg_kill(path, sig, flags, s, log_kill, userdata); + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) { + if (r != -ENOENT) + RET_GATHER(ret, r); + + return ret; + } + + for (;;) { + _cleanup_free_ char *fn = NULL, *p = NULL; + + r = cg_read_subgroup(d, &fn); + if (r < 0) { + RET_GATHER(ret, r); + break; + } + if (r == 0) + break; + + p = path_join(empty_to_root(path), fn); + if (!p) + return -ENOMEM; + + r = cg_kill_recursive(p, sig, flags, s, log_kill, userdata); + if (r != 0 && ret >= 0) + ret = r; + } + } + + 
if (FLAGS_SET(flags, CGROUP_REMOVE)) { + r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, path); + if (!IN_SET(r, -ENOENT, -EBUSY)) + RET_GATHER(ret, r); + } + + return ret; +} + +static const char *controller_to_dirname(const char *controller) { + assert(controller); + + /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it + * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is + * specified. */ + + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + if (cg_hybrid_unified() > 0) + controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID; + else + controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY; + } + + return startswith(controller, "name=") ?: controller; +} + +static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) { + const char *dn; + char *t = NULL; + + assert(ret); + assert(controller); + + dn = controller_to_dirname(controller); + + if (isempty(path) && isempty(suffix)) + t = path_join("/sys/fs/cgroup", dn); + else if (isempty(path)) + t = path_join("/sys/fs/cgroup", dn, suffix); + else if (isempty(suffix)) + t = path_join("/sys/fs/cgroup", dn, path); + else + t = path_join("/sys/fs/cgroup", dn, path, suffix); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +static int join_path_unified(const char *path, const char *suffix, char **ret) { + char *t; + + assert(ret); + + if (isempty(path) && isempty(suffix)) + t = strdup("/sys/fs/cgroup"); + else if (isempty(path)) + t = path_join("/sys/fs/cgroup", suffix); + else if (isempty(suffix)) + t = path_join("/sys/fs/cgroup", path); + else + t = path_join("/sys/fs/cgroup", path, suffix); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret) { + int r; + + assert(ret); + + if (!controller) { + char *t; + + /* If no controller is specified, we return the path *below* the controllers, without any + * prefix. 
*/ + + if (isempty(path) && isempty(suffix)) + return -EINVAL; + + if (isempty(suffix)) + t = strdup(path); + else if (isempty(path)) + t = strdup(suffix); + else + t = path_join(path, suffix); + if (!t) + return -ENOMEM; + + *ret = path_simplify(t); + return 0; + } + + if (!cg_controller_is_valid(controller)) + return -EINVAL; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + r = join_path_unified(path, suffix, ret); + else + r = join_path_legacy(controller, path, suffix, ret); + if (r < 0) + return r; + + path_simplify(*ret); + return 0; +} + +static int controller_is_v1_accessible(const char *root, const char *controller) { + const char *cpath, *dn; + + assert(controller); + + dn = controller_to_dirname(controller); + + /* If root if specified, we check that: + * - possible subcgroup is created at root, + * - we can modify the hierarchy. */ + + cpath = strjoina("/sys/fs/cgroup/", dn, root, root ? "/cgroup.procs" : NULL); + return laccess(cpath, root ? W_OK : F_OK); +} + +int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **ret) { + int r; + + assert(controller); + assert(ret); + + if (!cg_controller_is_valid(controller)) + return -EINVAL; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) { + /* In the unified hierarchy all controllers are considered accessible, + * except for the named hierarchies */ + if (startswith(controller, "name=")) + return -EOPNOTSUPP; + } else { + /* Check if the specified controller is actually accessible */ + r = controller_is_v1_accessible(NULL, controller); + if (r < 0) + return r; + } + + return cg_get_path(controller, path, suffix, ret); +} + +int cg_set_xattr(const char *path, const char *name, const void *value, size_t size, int flags) { + _cleanup_free_ char *fs = NULL; + int r; + + assert(path); + assert(name); + assert(value || size <= 0); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return r; + + return 
RET_NERRNO(setxattr(fs, name, value, size, flags)); +} + +int cg_get_xattr(const char *path, const char *name, void *value, size_t size) { + _cleanup_free_ char *fs = NULL; + ssize_t n; + int r; + + assert(path); + assert(name); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return r; + + n = getxattr(fs, name, value, size); + if (n < 0) + return -errno; + + return (int) n; +} + +int cg_get_xattr_malloc(const char *path, const char *name, char **ret) { + _cleanup_free_ char *fs = NULL; + int r; + + assert(path); + assert(name); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return r; + + return lgetxattr_malloc(fs, name, ret); +} + +int cg_get_xattr_bool(const char *path, const char *name) { + _cleanup_free_ char *fs = NULL; + int r; + + assert(path); + assert(name); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return r; + + return getxattr_at_bool(AT_FDCWD, fs, name, /* flags= */ 0); +} + +int cg_remove_xattr(const char *path, const char *name) { + _cleanup_free_ char *fs = NULL; + int r; + + assert(path); + assert(name); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return r; + + return RET_NERRNO(removexattr(fs, name)); +} + +int cg_pid_get_path(const char *controller, pid_t pid, char **ret_path) { + _cleanup_fclose_ FILE *f = NULL; + const char *fs, *controller_str = NULL; /* avoid false maybe-uninitialized warning */ + int unified, r; + + assert(pid >= 0); + assert(ret_path); + + if (controller) { + if (!cg_controller_is_valid(controller)) + return -EINVAL; + } else + controller = SYSTEMD_CGROUP_CONTROLLER; + + unified = cg_unified_controller(controller); + if (unified < 0) + return unified; + if (unified == 0) { + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) + controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY; + else + controller_str = controller; + } + + fs = procfs_file_alloca(pid, "cgroup"); + r = fopen_unlocked(fs, 
"re", &f); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *line = NULL; + char *e; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + + if (unified) { + e = startswith(line, "0:"); + if (!e) + continue; + + e = strchr(e, ':'); + if (!e) + continue; + } else { + char *l; + + l = strchr(line, ':'); + if (!l) + continue; + + l++; + e = strchr(l, ':'); + if (!e) + continue; + *e = 0; + + assert(controller_str); + r = string_contains_word(l, ",", controller_str); + if (r < 0) + return r; + if (r == 0) + continue; + } + + char *path = strdup(e + 1); + if (!path) + return -ENOMEM; + + /* Truncate suffix indicating the process is a zombie */ + e = endswith(path, " (deleted)"); + if (e) + *e = 0; + + *ret_path = path; + return 0; + } +} + +int cg_pidref_get_path(const char *controller, const PidRef *pidref, char **ret_path) { + _cleanup_free_ char *path = NULL; + int r; + + assert(ret_path); + + if (!pidref_is_set(pidref)) + return -ESRCH; + + r = cg_pid_get_path(controller, pidref->pid, &path); + if (r < 0) + return r; + + /* Before we return the path, make sure the procfs entry for this pid still matches the pidref */ + r = pidref_verify(pidref); + if (r < 0) + return r; + + *ret_path = TAKE_PTR(path); + return 0; +} + +int cg_install_release_agent(const char *controller, const char *agent) { + _cleanup_free_ char *fs = NULL, *contents = NULL; + const char *sc; + int r; + + assert(agent); + + r = cg_unified_controller(controller); + if (r < 0) + return r; + if (r > 0) /* doesn't apply to unified hierarchy */ + return -EOPNOTSUPP; + + r = cg_get_path(controller, NULL, "release_agent", &fs); + if (r < 0) + return r; + + r = read_one_line_file(fs, &contents); + if (r < 0) + return r; + + sc = strstrip(contents); + if (isempty(sc)) { + r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + } else if (!path_equal(sc, agent)) + return 
-EEXIST; + + fs = mfree(fs); + r = cg_get_path(controller, NULL, "notify_on_release", &fs); + if (r < 0) + return r; + + contents = mfree(contents); + r = read_one_line_file(fs, &contents); + if (r < 0) + return r; + + sc = strstrip(contents); + if (streq(sc, "0")) { + r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + return 1; + } + + if (!streq(sc, "1")) + return -EIO; + + return 0; +} + +int cg_uninstall_release_agent(const char *controller) { + _cleanup_free_ char *fs = NULL; + int r; + + r = cg_unified_controller(controller); + if (r < 0) + return r; + if (r > 0) /* Doesn't apply to unified hierarchy */ + return -EOPNOTSUPP; + + r = cg_get_path(controller, NULL, "notify_on_release", &fs); + if (r < 0) + return r; + + r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + fs = mfree(fs); + + r = cg_get_path(controller, NULL, "release_agent", &fs); + if (r < 0) + return r; + + r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + return 0; +} + +int cg_is_empty(const char *controller, const char *path) { + _cleanup_fclose_ FILE *f = NULL; + pid_t pid; + int r; + + assert(path); + + r = cg_enumerate_processes(controller, path, &f); + if (r == -ENOENT) + return true; + if (r < 0) + return r; + + r = cg_read_pid(f, &pid); + if (r < 0) + return r; + + return r == 0; +} + +int cg_is_empty_recursive(const char *controller, const char *path) { + int r; + + assert(path); + + /* The root cgroup is always populated */ + if (controller && empty_or_root(path)) + return false; + + r = cg_unified_controller(controller); + if (r < 0) + return r; + if (r > 0) { + _cleanup_free_ char *t = NULL; + + /* On the unified hierarchy we can check empty state + * via the "populated" attribute of "cgroup.events". 
*/ + + r = cg_read_event(controller, path, "populated", &t); + if (r == -ENOENT) + return true; + if (r < 0) + return r; + + return streq(t, "0"); + } else { + _cleanup_closedir_ DIR *d = NULL; + char *fn; + + r = cg_is_empty(controller, path); + if (r <= 0) + return r; + + r = cg_enumerate_subgroups(controller, path, &d); + if (r == -ENOENT) + return true; + if (r < 0) + return r; + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = path_join(path, fn); + free(fn); + if (!p) + return -ENOMEM; + + r = cg_is_empty_recursive(controller, p); + if (r <= 0) + return r; + } + if (r < 0) + return r; + + return true; + } +} + +int cg_split_spec(const char *spec, char **ret_controller, char **ret_path) { + _cleanup_free_ char *controller = NULL, *path = NULL; + int r; + + assert(spec); + + if (*spec == '/') { + if (!path_is_normalized(spec)) + return -EINVAL; + + if (ret_path) { + r = path_simplify_alloc(spec, &path); + if (r < 0) + return r; + } + + } else { + const char *e; + + e = strchr(spec, ':'); + if (e) { + controller = strndup(spec, e-spec); + if (!controller) + return -ENOMEM; + if (!cg_controller_is_valid(controller)) + return -EINVAL; + + if (!isempty(e + 1)) { + path = strdup(e+1); + if (!path) + return -ENOMEM; + + if (!path_is_normalized(path) || + !path_is_absolute(path)) + return -EINVAL; + + path_simplify(path); + } + + } else { + if (!cg_controller_is_valid(spec)) + return -EINVAL; + + if (ret_controller) { + controller = strdup(spec); + if (!controller) + return -ENOMEM; + } + } + } + + if (ret_controller) + *ret_controller = TAKE_PTR(controller); + if (ret_path) + *ret_path = TAKE_PTR(path); + return 0; +} + +int cg_mangle_path(const char *path, char **ret) { + _cleanup_free_ char *c = NULL, *p = NULL; + int r; + + assert(path); + assert(ret); + + /* First, check if it already is a filesystem path */ + if (path_startswith(path, "/sys/fs/cgroup")) + return path_simplify_alloc(path, ret); + + /* Otherwise, treat it as 
cg spec */ + r = cg_split_spec(path, &c, &p); + if (r < 0) + return r; + + return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, ret); +} + +int cg_get_root_path(char **ret_path) { + char *p, *e; + int r; + + assert(ret_path); + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p); + if (r < 0) + return r; + + e = endswith(p, "/" SPECIAL_INIT_SCOPE); + if (!e) + e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */ + if (!e) + e = endswith(p, "/system"); /* even more legacy */ + if (e) + *e = 0; + + *ret_path = p; + return 0; +} + +int cg_shift_path(const char *cgroup, const char *root, const char **ret_shifted) { + _cleanup_free_ char *rt = NULL; + char *p; + int r; + + assert(cgroup); + assert(ret_shifted); + + if (!root) { + /* If the root was specified let's use that, otherwise + * let's determine it from PID 1 */ + + r = cg_get_root_path(&rt); + if (r < 0) + return r; + + root = rt; + } + + p = path_startswith(cgroup, root); + if (p && p > cgroup) + *ret_shifted = p - 1; + else + *ret_shifted = cgroup; + + return 0; +} + +int cg_pid_get_path_shifted(pid_t pid, const char *root, char **ret_cgroup) { + _cleanup_free_ char *raw = NULL; + const char *c; + int r; + + assert(pid >= 0); + assert(ret_cgroup); + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw); + if (r < 0) + return r; + + r = cg_shift_path(raw, root, &c); + if (r < 0) + return r; + + if (c == raw) + *ret_cgroup = TAKE_PTR(raw); + else { + char *n; + + n = strdup(c); + if (!n) + return -ENOMEM; + + *ret_cgroup = n; + } + + return 0; +} + +int cg_path_decode_unit(const char *cgroup, char **ret_unit) { + char *c, *s; + size_t n; + + assert(cgroup); + assert(ret_unit); + + n = strcspn(cgroup, "/"); + if (n < 3) + return -ENXIO; + + c = strndupa_safe(cgroup, n); + c = cg_unescape(c); + + if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return -ENXIO; + + s = strdup(c); + if (!s) + return -ENOMEM; + + *ret_unit = s; + return 0; +} + +static bool 
valid_slice_name(const char *p, size_t n) { + + if (!p) + return false; + + if (n < STRLEN("x.slice")) + return false; + + if (memcmp(p + n - 6, ".slice", 6) == 0) { + char buf[n+1], *c; + + memcpy(buf, p, n); + buf[n] = 0; + + c = cg_unescape(buf); + + return unit_name_is_valid(c, UNIT_NAME_PLAIN); + } + + return false; +} + +static const char *skip_slices(const char *p) { + assert(p); + + /* Skips over all slice assignments */ + + for (;;) { + size_t n; + + p += strspn(p, "/"); + + n = strcspn(p, "/"); + if (!valid_slice_name(p, n)) + return p; + + p += n; + } +} + +int cg_path_get_unit(const char *path, char **ret) { + _cleanup_free_ char *unit = NULL; + const char *e; + int r; + + assert(path); + assert(ret); + + e = skip_slices(path); + + r = cg_path_decode_unit(e, &unit); + if (r < 0) + return r; + + /* We skipped over the slices, don't accept any now */ + if (endswith(unit, ".slice")) + return -ENXIO; + + *ret = TAKE_PTR(unit); + return 0; +} + +int cg_path_get_unit_path(const char *path, char **ret) { + _cleanup_free_ char *path_copy = NULL; + char *unit_name; + + assert(path); + assert(ret); + + path_copy = strdup(path); + if (!path_copy) + return -ENOMEM; + + unit_name = (char *)skip_slices(path_copy); + unit_name[strcspn(unit_name, "/")] = 0; + + if (!unit_name_is_valid(cg_unescape(unit_name), UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) + return -ENXIO; + + *ret = TAKE_PTR(path_copy); + + return 0; +} + +int cg_pid_get_unit(pid_t pid, char **ret_unit) { + _cleanup_free_ char *cgroup = NULL; + int r; + + assert(ret_unit); + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_unit(cgroup, ret_unit); +} + +int cg_pidref_get_unit(const PidRef *pidref, char **ret) { + _cleanup_free_ char *unit = NULL; + int r; + + assert(ret); + + if (!pidref_is_set(pidref)) + return -ESRCH; + + r = cg_pid_get_unit(pidref->pid, &unit); + if (r < 0) + return r; + + r = pidref_verify(pidref); + if (r < 0) + return r; + + *ret = 
TAKE_PTR(unit); + return 0; +} + +/** + * Skip session-*.scope, but require it to be there. + */ +static const char *skip_session(const char *p) { + size_t n; + + if (isempty(p)) + return NULL; + + p += strspn(p, "/"); + + n = strcspn(p, "/"); + if (n < STRLEN("session-x.scope")) + return NULL; + + if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) { + char buf[n - 8 - 6 + 1]; + + memcpy(buf, p + 8, n - 8 - 6); + buf[n - 8 - 6] = 0; + + /* Note that session scopes never need unescaping, + * since they cannot conflict with the kernel's own + * names, hence we don't need to call cg_unescape() + * here. */ + + if (!session_id_valid(buf)) + return NULL; + + p += n; + p += strspn(p, "/"); + return p; + } + + return NULL; +} + +/** + * Skip user@*.service, but require it to be there. + */ +static const char *skip_user_manager(const char *p) { + size_t n; + + if (isempty(p)) + return NULL; + + p += strspn(p, "/"); + + n = strcspn(p, "/"); + if (n < STRLEN("user@x.service")) + return NULL; + + if (memcmp(p, "user@", 5) == 0 && memcmp(p + n - 8, ".service", 8) == 0) { + char buf[n - 5 - 8 + 1]; + + memcpy(buf, p + 5, n - 5 - 8); + buf[n - 5 - 8] = 0; + + /* Note that user manager services never need unescaping, + * since they cannot conflict with the kernel's own + * names, hence we don't need to call cg_unescape() + * here. */ + + if (parse_uid(buf, NULL) < 0) + return NULL; + + p += n; + p += strspn(p, "/"); + + return p; + } + + return NULL; +} + +static const char *skip_user_prefix(const char *path) { + const char *e, *t; + + assert(path); + + /* Skip slices, if there are any */ + e = skip_slices(path); + + /* Skip the user manager, if it's in the path now... */ + t = skip_user_manager(e); + if (t) + return t; + + /* Alternatively skip the user session if it is in the path... 
*/ + return skip_session(e); +} + +int cg_path_get_user_unit(const char *path, char **ret) { + const char *t; + + assert(path); + assert(ret); + + t = skip_user_prefix(path); + if (!t) + return -ENXIO; + + /* And from here on it looks pretty much the same as for a system unit, hence let's use the same + * parser. */ + return cg_path_get_unit(t, ret); +} + +int cg_pid_get_user_unit(pid_t pid, char **ret_unit) { + _cleanup_free_ char *cgroup = NULL; + int r; + + assert(ret_unit); + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_user_unit(cgroup, ret_unit); +} + +int cg_path_get_machine_name(const char *path, char **ret_machine) { + _cleanup_free_ char *u = NULL; + const char *sl; + int r; + + r = cg_path_get_unit(path, &u); + if (r < 0) + return r; + + sl = strjoina("/run/systemd/machines/unit:", u); + return readlink_malloc(sl, ret_machine); +} + +int cg_pid_get_machine_name(pid_t pid, char **ret_machine) { + _cleanup_free_ char *cgroup = NULL; + int r; + + assert(ret_machine); + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_machine_name(cgroup, ret_machine); +} + +int cg_path_get_cgroupid(const char *path, uint64_t *ret) { + cg_file_handle fh = CG_FILE_HANDLE_INIT; + int mnt_id = -1; + + assert(path); + assert(ret); + + /* This is cgroupfs so we know the size of the handle, thus no need to loop around like + * name_to_handle_at_loop() does in mountpoint-util.c */ + if (name_to_handle_at(AT_FDCWD, path, &fh.file_handle, &mnt_id, 0) < 0) + return -errno; + + *ret = CG_FILE_HANDLE_CGROUPID(fh); + return 0; +} + +int cg_path_get_session(const char *path, char **ret_session) { + _cleanup_free_ char *unit = NULL; + char *start, *end; + int r; + + assert(path); + + r = cg_path_get_unit(path, &unit); + if (r < 0) + return r; + + start = startswith(unit, "session-"); + if (!start) + return -ENXIO; + end = endswith(start, ".scope"); + if (!end) + return -ENXIO; + + *end = 
0; + if (!session_id_valid(start)) + return -ENXIO; + + if (ret_session) { + char *rr; + + rr = strdup(start); + if (!rr) + return -ENOMEM; + + *ret_session = rr; + } + + return 0; +} + +int cg_pid_get_session(pid_t pid, char **ret_session) { + _cleanup_free_ char *cgroup = NULL; + int r; + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_session(cgroup, ret_session); +} + +int cg_path_get_owner_uid(const char *path, uid_t *ret_uid) { + _cleanup_free_ char *slice = NULL; + char *start, *end; + int r; + + assert(path); + + r = cg_path_get_slice(path, &slice); + if (r < 0) + return r; + + start = startswith(slice, "user-"); + if (!start) + return -ENXIO; + + end = endswith(start, ".slice"); + if (!end) + return -ENXIO; + + *end = 0; + if (parse_uid(start, ret_uid) < 0) + return -ENXIO; + + return 0; +} + +int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid) { + _cleanup_free_ char *cgroup = NULL; + int r; + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_owner_uid(cgroup, ret_uid); +} + +int cg_path_get_slice(const char *p, char **ret_slice) { + const char *e = NULL; + + assert(p); + assert(ret_slice); + + /* Finds the right-most slice unit from the beginning, but + * stops before we come to the first non-slice unit. 
*/ + + for (;;) { + size_t n; + + p += strspn(p, "/"); + + n = strcspn(p, "/"); + if (!valid_slice_name(p, n)) { + + if (!e) { + char *s; + + s = strdup(SPECIAL_ROOT_SLICE); + if (!s) + return -ENOMEM; + + *ret_slice = s; + return 0; + } + + return cg_path_decode_unit(e, ret_slice); + } + + e = p; + p += n; + } +} + +int cg_pid_get_slice(pid_t pid, char **ret_slice) { + _cleanup_free_ char *cgroup = NULL; + int r; + + assert(ret_slice); + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_slice(cgroup, ret_slice); +} + +int cg_path_get_user_slice(const char *p, char **ret_slice) { + const char *t; + assert(p); + assert(ret_slice); + + t = skip_user_prefix(p); + if (!t) + return -ENXIO; + + /* And now it looks pretty much the same as for a system slice, so let's just use the same parser + * from here on. */ + return cg_path_get_slice(t, ret_slice); +} + +int cg_pid_get_user_slice(pid_t pid, char **ret_slice) { + _cleanup_free_ char *cgroup = NULL; + int r; + + assert(ret_slice); + + r = cg_pid_get_path_shifted(pid, NULL, &cgroup); + if (r < 0) + return r; + + return cg_path_get_user_slice(cgroup, ret_slice); +} + +bool cg_needs_escape(const char *p) { + + /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note + * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if + * they shall be used. Also note that various names cannot be made valid by escaping even if we + * return true here (because too long, or contain the forbidden character "/"). 
*/ + + if (!filename_is_valid(p)) + return true; + + if (IN_SET(p[0], '_', '.')) + return true; + + if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks")) + return true; + + if (startswith(p, "cgroup.")) + return true; + + for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + const char *q; + + q = startswith(p, cgroup_controller_to_string(c)); + if (!q) + continue; + + if (q[0] == '.') + return true; + } + + return false; +} + +int cg_escape(const char *p, char **ret) { + _cleanup_free_ char *n = NULL; + + /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any + * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That + * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there + * is one. */ + + /* The return value of this function (unlike cg_unescape()) needs free()! */ + + if (cg_needs_escape(p)) { + n = strjoin("_", p); + if (!n) + return -ENOMEM; + + if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */ + return -EINVAL; + } else { + n = strdup(p); + if (!n) + return -ENOMEM; + } + + *ret = TAKE_PTR(n); + return 0; +} + +char *cg_unescape(const char *p) { + assert(p); + + /* The return value of this function (unlike cg_escape()) + * doesn't need free()! 
*/ + + if (p[0] == '_') + return (char*) p+1; + + return (char*) p; +} + +#define CONTROLLER_VALID \ + DIGITS LETTERS \ + "_" + +bool cg_controller_is_valid(const char *p) { + const char *t, *s; + + if (!p) + return false; + + if (streq(p, SYSTEMD_CGROUP_CONTROLLER)) + return true; + + s = startswith(p, "name="); + if (s) + p = s; + + if (IN_SET(*p, 0, '_')) + return false; + + for (t = p; *t; t++) + if (!strchr(CONTROLLER_VALID, *t)) + return false; + + if (t - p > NAME_MAX) + return false; + + return true; +} + +int cg_slice_to_path(const char *unit, char **ret) { + _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL; + const char *dash; + int r; + + assert(unit); + assert(ret); + + if (streq(unit, SPECIAL_ROOT_SLICE)) { + char *x; + + x = strdup(""); + if (!x) + return -ENOMEM; + *ret = x; + return 0; + } + + if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN)) + return -EINVAL; + + if (!endswith(unit, ".slice")) + return -EINVAL; + + r = unit_name_to_prefix(unit, &p); + if (r < 0) + return r; + + dash = strchr(p, '-'); + + /* Don't allow initial dashes */ + if (dash == p) + return -EINVAL; + + while (dash) { + _cleanup_free_ char *escaped = NULL; + char n[dash - p + sizeof(".slice")]; + +#if HAS_FEATURE_MEMORY_SANITIZER + /* msan doesn't instrument stpncpy, so it thinks + * n is later used uninitialized: + * https://github.com/google/sanitizers/issues/926 + */ + zero(n); +#endif + + /* Don't allow trailing or double dashes */ + if (IN_SET(dash[1], 0, '-')) + return -EINVAL; + + strcpy(stpncpy(n, p, dash - p), ".slice"); + if (!unit_name_is_valid(n, UNIT_NAME_PLAIN)) + return -EINVAL; + + r = cg_escape(n, &escaped); + if (r < 0) + return r; + + if (!strextend(&s, escaped, "/")) + return -ENOMEM; + + dash = strchr(dash+1, '-'); + } + + r = cg_escape(unit, &e); + if (r < 0) + return r; + + if (!strextend(&s, e)) + return -ENOMEM; + + *ret = TAKE_PTR(s); + return 0; +} + +int cg_is_threaded(const char *path) { + _cleanup_free_ char *fs = NULL, *contents = NULL; + 
_cleanup_strv_free_ char **v = NULL; + int r; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.type", &fs); + if (r < 0) + return r; + + r = read_full_virtual_file(fs, &contents, NULL); + if (r == -ENOENT) + return false; /* Assume no. */ + if (r < 0) + return r; + + v = strv_split(contents, NULL); + if (!v) + return -ENOMEM; + + /* If the cgroup is in the threaded mode, it contains "threaded". + * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */ + return strv_contains(v, "threaded") || strv_contains(v, "invalid"); +} + +int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) { + _cleanup_free_ char *p = NULL; + int r; + + r = cg_get_path(controller, path, attribute, &p); + if (r < 0) + return r; + + return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER); +} + +int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + r = cg_get_path(controller, path, attribute, &p); + if (r < 0) + return r; + + return read_one_line_file(p, ret); +} + +int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret) { + _cleanup_free_ char *value = NULL; + uint64_t v; + int r; + + assert(ret); + + r = cg_get_attribute(controller, path, attribute, &value); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + if (streq(value, "max")) { + *ret = CGROUP_LIMIT_MAX; + return 0; + } + + r = safe_atou64(value, &v); + if (r < 0) + return r; + + *ret = v; + return 0; +} + +int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) { + _cleanup_free_ char *value = NULL; + int r; + + assert(ret); + + r = cg_get_attribute(controller, path, attribute, &value); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + r = parse_boolean(value); + if (r < 0) + return r; + + *ret = r; + 
return 0; +} + +int cg_get_owner(const char *path, uid_t *ret_uid) { + _cleanup_free_ char *f = NULL; + struct stat stats; + int r; + + assert(ret_uid); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &f); + if (r < 0) + return r; + + if (stat(f, &stats) < 0) + return -errno; + + r = stat_verify_directory(&stats); + if (r < 0) + return r; + + *ret_uid = stats.st_uid; + return 0; +} + +int cg_get_keyed_attribute_full( + const char *controller, + const char *path, + const char *attribute, + char **keys, + char **ret_values, + CGroupKeyMode mode) { + + _cleanup_free_ char *filename = NULL, *contents = NULL; + const char *p; + size_t n, i, n_done = 0; + char **v; + int r; + + /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with + * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of + * entries as 'keys'. On success each entry will be set to the value of the matching key. + * + * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode + * is set to CG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. */ + + r = cg_get_path(controller, path, attribute, &filename); + if (r < 0) + return r; + + r = read_full_file(filename, &contents, NULL); + if (r < 0) + return r; + + n = strv_length(keys); + if (n == 0) /* No keys to retrieve? 
That's easy, we are done then */ + return 0; + + /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */ + v = newa0(char*, n); + + for (p = contents; *p;) { + const char *w = NULL; + + for (i = 0; i < n; i++) + if (!v[i]) { + w = first_word(p, keys[i]); + if (w) + break; + } + + if (w) { + size_t l; + + l = strcspn(w, NEWLINE); + v[i] = strndup(w, l); + if (!v[i]) { + r = -ENOMEM; + goto fail; + } + + n_done++; + if (n_done >= n) + goto done; + + p = w + l; + } else + p += strcspn(p, NEWLINE); + + p += strspn(p, NEWLINE); + } + + if (mode & CG_KEY_MODE_GRACEFUL) + goto done; + + r = -ENXIO; + +fail: + free_many_charp(v, n); + return r; + +done: + memcpy(ret_values, v, sizeof(char*) * n); + if (mode & CG_KEY_MODE_GRACEFUL) + return n_done; + + return 0; +} + +int cg_mask_to_string(CGroupMask mask, char **ret) { + _cleanup_free_ char *s = NULL; + bool space = false; + CGroupController c; + size_t n = 0; + + assert(ret); + + if (mask == 0) { + *ret = NULL; + return 0; + } + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + const char *k; + size_t l; + + if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c))) + continue; + + k = cgroup_controller_to_string(c); + l = strlen(k); + + if (!GREEDY_REALLOC(s, n + space + l + 1)) + return -ENOMEM; + + if (space) + s[n] = ' '; + memcpy(s + n + space, k, l); + n += space + l; + + space = true; + } + + assert(s); + + s[n] = 0; + *ret = TAKE_PTR(s); + + return 0; +} + +int cg_mask_from_string(const char *value, CGroupMask *ret) { + CGroupMask m = 0; + + assert(ret); + assert(value); + + for (;;) { + _cleanup_free_ char *n = NULL; + CGroupController v; + int r; + + r = extract_first_word(&value, &n, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + v = cgroup_controller_from_string(n); + if (v < 0) + continue; + + m |= CGROUP_CONTROLLER_TO_MASK(v); + } + + *ret = m; + return 0; +} + +int cg_mask_supported_subtree(const char *root, CGroupMask *ret) { + CGroupMask 
mask; + int r; + + /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that + * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz + * pseudo-controllers. */ + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) { + _cleanup_free_ char *controllers = NULL, *path = NULL; + + /* In the unified hierarchy we can read the supported and accessible controllers from + * the top-level cgroup attribute */ + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path); + if (r < 0) + return r; + + r = read_one_line_file(path, &controllers); + if (r < 0) + return r; + + r = cg_mask_from_string(controllers, &mask); + if (r < 0) + return r; + + /* Mask controllers that are not supported in unified hierarchy. */ + mask &= CGROUP_MASK_V2; + + } else { + CGroupController c; + + /* In the legacy hierarchy, we check which hierarchies are accessible. */ + + mask = 0; + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *n; + + if (!FLAGS_SET(CGROUP_MASK_V1, bit)) + continue; + + n = cgroup_controller_to_string(c); + if (controller_is_v1_accessible(root, n) >= 0) + mask |= bit; + } + } + + *ret = mask; + return 0; +} + +int cg_mask_supported(CGroupMask *ret) { + _cleanup_free_ char *root = NULL; + int r; + + r = cg_get_root_path(&root); + if (r < 0) + return r; + + return cg_mask_supported_subtree(root, ret); +} + +int cg_kernel_controllers(Set **ret) { + _cleanup_set_free_ Set *controllers = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(ret); + + /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support + * and controllers that aren't currently accessible (because not mounted). This does not include "name=" + * pseudo-controllers. 
*/ + + r = fopen_unlocked("/proc/cgroups", "re", &f); + if (r == -ENOENT) { + *ret = NULL; + return 0; + } + if (r < 0) + return r; + + /* Ignore the header line */ + (void) read_line(f, SIZE_MAX, NULL); + + for (;;) { + _cleanup_free_ char *controller = NULL; + int enabled = 0; + + errno = 0; + if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) { + + if (feof(f)) + break; + + if (ferror(f)) + return errno_or_else(EIO); + + return -EBADMSG; + } + + if (!enabled) + continue; + + if (!cg_controller_is_valid(controller)) + return -EBADMSG; + + r = set_ensure_consume(&controllers, &string_hash_ops_free, TAKE_PTR(controller)); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(controllers); + + return 0; +} + +/* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on + * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1 + * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on + * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility + * with other tools. + * + * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep + * cgroup v2 process management but disable the compat dual layout, we return true on + * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified(). + */ +static thread_local bool unified_systemd_v232; + +int cg_unified_cached(bool flush) { + static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN; + + struct statfs fs; + + /* Checks if we support the unified hierarchy. Returns an + * error when the cgroup hierarchies aren't mounted yet or we + * have any other trouble determining if the unified hierarchy + * is supported. 
*/ + + if (flush) + unified_cache = CGROUP_UNIFIED_UNKNOWN; + else if (unified_cache >= CGROUP_UNIFIED_NONE) + return unified_cache; + + if (statfs("/sys/fs/cgroup/", &fs) < 0) + return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m"); + + if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) { + log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy"); + unified_cache = CGROUP_UNIFIED_ALL; + } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) { + if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 && + F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) { + log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller"); + unified_cache = CGROUP_UNIFIED_SYSTEMD; + unified_systemd_v232 = false; + } else { + if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0) { + if (errno == ENOENT) { + /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */ + log_debug_errno(errno, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found."); + return -ENOMEDIUM; + } + return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m"); + } + + if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) { + log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)"); + unified_cache = CGROUP_UNIFIED_SYSTEMD; + unified_systemd_v232 = true; + } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) { + log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy"); + unified_cache = CGROUP_UNIFIED_NONE; + } else { + log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy", + (unsigned long long) fs.f_type); + unified_cache = CGROUP_UNIFIED_NONE; + } + } + } else if (F_TYPE_EQUAL(fs.f_type, SYSFS_MAGIC)) { + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "No filesystem is currently mounted on /sys/fs/cgroup."); + } else + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "Unknown 
filesystem type %llx mounted on /sys/fs/cgroup.", + (unsigned long long)fs.f_type); + + return unified_cache; +} + +int cg_unified_controller(const char *controller) { + int r; + + r = cg_unified_cached(false); + if (r < 0) + return r; + + if (r == CGROUP_UNIFIED_NONE) + return false; + + if (r >= CGROUP_UNIFIED_ALL) + return true; + + return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER); +} + +int cg_all_unified(void) { + int r; + + r = cg_unified_cached(false); + if (r < 0) + return r; + + return r >= CGROUP_UNIFIED_ALL; +} + +int cg_hybrid_unified(void) { + int r; + + r = cg_unified_cached(false); + if (r < 0) + return r; + + return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232; +} + +int cg_is_delegated(const char *path) { + int r; + + assert(path); + + r = cg_get_xattr_bool(path, "trusted.delegate"); + if (!ERRNO_IS_NEG_XATTR_ABSENT(r)) + return r; + + /* If the trusted xattr isn't set (preferred), then check the untrusted one. Under the assumption + * that whoever is trusted enough to own the cgroup, is also trusted enough to decide if it is + * delegated or not this should be safe. */ + r = cg_get_xattr_bool(path, "user.delegate"); + return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r; +} + +int cg_is_delegated_fd(int fd) { + int r; + + assert(fd >= 0); + + r = getxattr_at_bool(fd, /* path= */ NULL, "trusted.delegate", /* flags= */ 0); + if (!ERRNO_IS_NEG_XATTR_ABSENT(r)) + return r; + + r = getxattr_at_bool(fd, /* path= */ NULL, "user.delegate", /* flags= */ 0); + return ERRNO_IS_NEG_XATTR_ABSENT(r) ? 
false : r; +} + +int cg_has_coredump_receive(const char *path) { + int r; + + assert(path); + + r = cg_get_xattr_bool(path, "user.coredump_receive"); + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) + return false; + + return r; +} + +const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = { + [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX, + [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX, + [CGROUP_IO_RIOPS_MAX] = CGROUP_LIMIT_MAX, + [CGROUP_IO_WIOPS_MAX] = CGROUP_LIMIT_MAX, +}; + +static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = { + [CGROUP_IO_RBPS_MAX] = "IOReadBandwidthMax", + [CGROUP_IO_WBPS_MAX] = "IOWriteBandwidthMax", + [CGROUP_IO_RIOPS_MAX] = "IOReadIOPSMax", + [CGROUP_IO_WIOPS_MAX] = "IOWriteIOPSMax", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType); + +bool is_cgroup_fs(const struct statfs *s) { + return is_fs_type(s, CGROUP_SUPER_MAGIC) || + is_fs_type(s, CGROUP2_SUPER_MAGIC); +} + +bool fd_is_cgroup_fs(int fd) { + struct statfs s; + + if (fstatfs(fd, &s) < 0) + return false; /* The return type is bool: -errno would convert to true, so treat a failed fstatfs() as "not cgroupfs" */ + + return is_cgroup_fs(&s); +} + +static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = { + [CGROUP_CONTROLLER_CPU] = "cpu", + [CGROUP_CONTROLLER_CPUACCT] = "cpuacct", + [CGROUP_CONTROLLER_CPUSET] = "cpuset", + [CGROUP_CONTROLLER_IO] = "io", + [CGROUP_CONTROLLER_BLKIO] = "blkio", + [CGROUP_CONTROLLER_MEMORY] = "memory", + [CGROUP_CONTROLLER_DEVICES] = "devices", + [CGROUP_CONTROLLER_PIDS] = "pids", + [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall", + [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices", + [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign", + [CGROUP_CONTROLLER_BPF_SOCKET_BIND] = "bpf-socket-bind", + [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES] = "bpf-restrict-network-interfaces", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController); + +CGroupMask get_cpu_accounting_mask(void) { + static CGroupMask needed_mask = (CGroupMask) -1; + + /* On kernel ≥4.15 with unified hierarchy, 
cpu.stat's usage_usec is + * provided externally from the CPU controller, which means we don't + * need to enable the CPU controller just to get metrics. This is good, + * because enabling the CPU controller comes at a minor performance + * hit, especially when it's propagated deep into large hierarchies. + * There's also no separate CPU accounting controller available within + * a unified hierarchy. + * + * This combination of factors results in the desired cgroup mask to + * enable for CPU accounting varying as follows: + * + * ╔═════════════════════╤═════════════════════╗ + * ║ Linux ≥4.15 │ Linux <4.15 ║ + * ╔═══════════════╬═════════════════════╪═════════════════════╣ + * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║ + * ╟───────────────╫─────────────────────┼─────────────────────╢ + * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║ + * ╚═══════════════╩═════════════════════╧═════════════════════╝ + * + * We check kernel version here instead of manually checking whether + * cpu.stat is present for every cgroup, as that check in itself would + * already be fairly expensive. + * + * Kernels where this patch has been backported will therefore have the + * CPU controller enabled unnecessarily. This is more expensive than + * necessary, but harmless. 
☺️ + */ + + if (needed_mask == (CGroupMask) -1) { + if (cg_all_unified()) { + struct utsname u; + assert_se(uname(&u) >= 0); + + if (strverscmp_improved(u.release, "4.15") < 0) + needed_mask = CGROUP_MASK_CPU; + else + needed_mask = 0; + } else + needed_mask = CGROUP_MASK_CPUACCT; + } + + return needed_mask; +} + +bool cpu_accounting_is_cheap(void) { + return get_cpu_accounting_mask() == 0; +} + +static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = { + [MANAGED_OOM_AUTO] = "auto", + [MANAGED_OOM_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode); + +static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = { + [MANAGED_OOM_PREFERENCE_NONE] = "none", + [MANAGED_OOM_PREFERENCE_AVOID] = "avoid", + [MANAGED_OOM_PREFERENCE_OMIT] = "omit", +}; + +DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference); diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h new file mode 100644 index 0000000..d06eb6d --- /dev/null +++ b/src/basic/cgroup-util.h @@ -0,0 +1,356 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "constants.h" +#include "pidref.h" +#include "set.h" + +#define SYSTEMD_CGROUP_CONTROLLER_LEGACY "name=systemd" +#define SYSTEMD_CGROUP_CONTROLLER_HYBRID "name=unified" +#define SYSTEMD_CGROUP_CONTROLLER "_systemd" + +/* An enum of well known cgroup controllers */ +typedef enum CGroupController { + /* Original cgroup controllers */ + CGROUP_CONTROLLER_CPU, + CGROUP_CONTROLLER_CPUACCT, /* v1 only */ + CGROUP_CONTROLLER_CPUSET, /* v2 only */ + CGROUP_CONTROLLER_IO, /* v2 only */ + CGROUP_CONTROLLER_BLKIO, /* v1 only */ + CGROUP_CONTROLLER_MEMORY, + CGROUP_CONTROLLER_DEVICES, /* v1 only */ + CGROUP_CONTROLLER_PIDS, + + /* BPF-based pseudo-controllers, v2 only */ + CGROUP_CONTROLLER_BPF_FIREWALL, + CGROUP_CONTROLLER_BPF_DEVICES, + CGROUP_CONTROLLER_BPF_FOREIGN, + 
CGROUP_CONTROLLER_BPF_SOCKET_BIND, + CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES, + /* The BPF hook implementing RestrictFileSystems= is not defined here. + * It's applied as late as possible in exec_invoke() so we don't block + * our own unit setup code. */ + + _CGROUP_CONTROLLER_MAX, + _CGROUP_CONTROLLER_INVALID = -EINVAL, +} CGroupController; + +#define CGROUP_CONTROLLER_TO_MASK(c) (1U << (c)) + +/* A bit mask of well known cgroup controllers */ +typedef enum CGroupMask { + CGROUP_MASK_CPU = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPU), + CGROUP_MASK_CPUACCT = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUACCT), + CGROUP_MASK_CPUSET = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUSET), + CGROUP_MASK_IO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_IO), + CGROUP_MASK_BLKIO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BLKIO), + CGROUP_MASK_MEMORY = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY), + CGROUP_MASK_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_DEVICES), + CGROUP_MASK_PIDS = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS), + CGROUP_MASK_BPF_FIREWALL = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FIREWALL), + CGROUP_MASK_BPF_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_DEVICES), + CGROUP_MASK_BPF_FOREIGN = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FOREIGN), + CGROUP_MASK_BPF_SOCKET_BIND = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_SOCKET_BIND), + CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES), + + /* All real cgroup v1 controllers */ + CGROUP_MASK_V1 = CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT|CGROUP_MASK_BLKIO|CGROUP_MASK_MEMORY|CGROUP_MASK_DEVICES|CGROUP_MASK_PIDS, + + /* All real cgroup v2 controllers */ + CGROUP_MASK_V2 = CGROUP_MASK_CPU|CGROUP_MASK_CPUSET|CGROUP_MASK_IO|CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, + + /* All controllers we want to delegate in case of Delegate=yes. 
Which are pretty much the v2 controllers only, as delegation on v1 is not safe, and bpf stuff isn't a real controller */ + CGROUP_MASK_DELEGATE = CGROUP_MASK_V2, + + /* All cgroup v2 BPF pseudo-controllers */ + CGROUP_MASK_BPF = CGROUP_MASK_BPF_FIREWALL|CGROUP_MASK_BPF_DEVICES|CGROUP_MASK_BPF_FOREIGN|CGROUP_MASK_BPF_SOCKET_BIND|CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES, + + _CGROUP_MASK_ALL = CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1, +} CGroupMask; + +static inline CGroupMask CGROUP_MASK_EXTEND_JOINED(CGroupMask mask) { + /* We always mount "cpu" and "cpuacct" in the same hierarchy. Hence, when one bit is set also set the other */ + + if (mask & (CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT)) + mask |= (CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT); + + return mask; +} + +CGroupMask get_cpu_accounting_mask(void); +bool cpu_accounting_is_cheap(void); + +/* Special values for all weight knobs on unified hierarchy */ +#define CGROUP_WEIGHT_INVALID UINT64_MAX +#define CGROUP_WEIGHT_IDLE UINT64_C(0) +#define CGROUP_WEIGHT_MIN UINT64_C(1) +#define CGROUP_WEIGHT_MAX UINT64_C(10000) +#define CGROUP_WEIGHT_DEFAULT UINT64_C(100) + +#define CGROUP_LIMIT_MIN UINT64_C(0) +#define CGROUP_LIMIT_MAX UINT64_MAX + +static inline bool CGROUP_WEIGHT_IS_OK(uint64_t x) { + return + x == CGROUP_WEIGHT_INVALID || + (x >= CGROUP_WEIGHT_MIN && x <= CGROUP_WEIGHT_MAX); +} + +/* IO limits on unified hierarchy */ +typedef enum CGroupIOLimitType { + CGROUP_IO_RBPS_MAX, + CGROUP_IO_WBPS_MAX, + CGROUP_IO_RIOPS_MAX, + CGROUP_IO_WIOPS_MAX, + + _CGROUP_IO_LIMIT_TYPE_MAX, + _CGROUP_IO_LIMIT_TYPE_INVALID = -EINVAL, +} CGroupIOLimitType; + +extern const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX]; + +const char* cgroup_io_limit_type_to_string(CGroupIOLimitType t) _const_; +CGroupIOLimitType cgroup_io_limit_type_from_string(const char *s) _pure_; + +/* Special values for the cpu.shares attribute */ +#define CGROUP_CPU_SHARES_INVALID UINT64_MAX +#define CGROUP_CPU_SHARES_MIN UINT64_C(2) 
+#define CGROUP_CPU_SHARES_MAX UINT64_C(262144) +#define CGROUP_CPU_SHARES_DEFAULT UINT64_C(1024) + +static inline bool CGROUP_CPU_SHARES_IS_OK(uint64_t x) { + return + x == CGROUP_CPU_SHARES_INVALID || + (x >= CGROUP_CPU_SHARES_MIN && x <= CGROUP_CPU_SHARES_MAX); +} + +/* Special values for the special {blkio,io}.bfq.weight attribute */ +#define CGROUP_BFQ_WEIGHT_INVALID UINT64_MAX +#define CGROUP_BFQ_WEIGHT_MIN UINT64_C(1) +#define CGROUP_BFQ_WEIGHT_MAX UINT64_C(1000) +#define CGROUP_BFQ_WEIGHT_DEFAULT UINT64_C(100) + +/* Convert the normal io.weight value to io.bfq.weight */ +static inline uint64_t BFQ_WEIGHT(uint64_t io_weight) { + return + io_weight <= CGROUP_WEIGHT_DEFAULT ? + CGROUP_BFQ_WEIGHT_DEFAULT - (CGROUP_WEIGHT_DEFAULT - io_weight) * (CGROUP_BFQ_WEIGHT_DEFAULT - CGROUP_BFQ_WEIGHT_MIN) / (CGROUP_WEIGHT_DEFAULT - CGROUP_WEIGHT_MIN) : + CGROUP_BFQ_WEIGHT_DEFAULT + (io_weight - CGROUP_WEIGHT_DEFAULT) * (CGROUP_BFQ_WEIGHT_MAX - CGROUP_BFQ_WEIGHT_DEFAULT) / (CGROUP_WEIGHT_MAX - CGROUP_WEIGHT_DEFAULT); +} + +/* Special values for the blkio.weight attribute */ +#define CGROUP_BLKIO_WEIGHT_INVALID UINT64_MAX +#define CGROUP_BLKIO_WEIGHT_MIN UINT64_C(10) +#define CGROUP_BLKIO_WEIGHT_MAX UINT64_C(1000) +#define CGROUP_BLKIO_WEIGHT_DEFAULT UINT64_C(500) + +static inline bool CGROUP_BLKIO_WEIGHT_IS_OK(uint64_t x) { + return + x == CGROUP_BLKIO_WEIGHT_INVALID || + (x >= CGROUP_BLKIO_WEIGHT_MIN && x <= CGROUP_BLKIO_WEIGHT_MAX); +} + +typedef enum CGroupUnified { + CGROUP_UNIFIED_UNKNOWN = -1, + CGROUP_UNIFIED_NONE = 0, /* Both systemd and controllers on legacy */ + CGROUP_UNIFIED_SYSTEMD = 1, /* Only systemd on unified */ + CGROUP_UNIFIED_ALL = 2, /* Both systemd and controllers on unified */ +} CGroupUnified; + +/* + * General rules: + * + * We accept named hierarchies in the syntax "foo" and "name=foo". + * + * We expect that named hierarchies do not conflict in name with a + * kernel hierarchy, modulo the "name=" prefix. 
+ * + * We always generate "normalized" controller names, i.e. without the + * "name=" prefix. + * + * We require absolute cgroup paths. When returning, we will always + * generate paths with multiple adjacent / removed. + */ + +int cg_enumerate_processes(const char *controller, const char *path, FILE **ret); +int cg_read_pid(FILE *f, pid_t *ret); +int cg_read_pidref(FILE *f, PidRef *ret); +int cg_read_event(const char *controller, const char *path, const char *event, char **ret); + +int cg_enumerate_subgroups(const char *controller, const char *path, DIR **ret); +int cg_read_subgroup(DIR *d, char **ret); + +typedef enum CGroupFlags { + CGROUP_SIGCONT = 1 << 0, + CGROUP_IGNORE_SELF = 1 << 1, + CGROUP_REMOVE = 1 << 2, +} CGroupFlags; + +typedef int (*cg_kill_log_func_t)(const PidRef *pid, int sig, void *userdata); + +int cg_kill(const char *path, int sig, CGroupFlags flags, Set *s, cg_kill_log_func_t kill_log, void *userdata); +int cg_kill_kernel_sigkill(const char *path); +int cg_kill_recursive(const char *path, int sig, CGroupFlags flags, Set *s, cg_kill_log_func_t kill_log, void *userdata); + +int cg_split_spec(const char *spec, char **ret_controller, char **ret_path); +int cg_mangle_path(const char *path, char **ret); + +int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret); +int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **ret); + +int cg_pid_get_path(const char *controller, pid_t pid, char **ret); +int cg_pidref_get_path(const char *controller, const PidRef *pidref, char **ret); + +int cg_rmdir(const char *controller, const char *path); + +int cg_is_threaded(const char *path); + +int cg_is_delegated(const char *path); +int cg_is_delegated_fd(int fd); + +int cg_has_coredump_receive(const char *path); + +typedef enum { + CG_KEY_MODE_GRACEFUL = 1 << 0, +} CGroupKeyMode; + +int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value); +int 
cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret); +int cg_get_keyed_attribute_full(const char *controller, const char *path, const char *attribute, char **keys, char **values, CGroupKeyMode mode); + +static inline int cg_get_keyed_attribute( + const char *controller, + const char *path, + const char *attribute, + char **keys, + char **ret_values) { + return cg_get_keyed_attribute_full(controller, path, attribute, keys, ret_values, 0); +} + +static inline int cg_get_keyed_attribute_graceful( + const char *controller, + const char *path, + const char *attribute, + char **keys, + char **ret_values) { + return cg_get_keyed_attribute_full(controller, path, attribute, keys, ret_values, CG_KEY_MODE_GRACEFUL); +} + +int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret); + +/* Does a parse_boolean() on the attribute contents and sets ret accordingly */ +int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret); + +int cg_get_owner(const char *path, uid_t *ret_uid); + +int cg_set_xattr(const char *path, const char *name, const void *value, size_t size, int flags); +int cg_get_xattr(const char *path, const char *name, void *value, size_t size); +int cg_get_xattr_malloc(const char *path, const char *name, char **ret); +/* Returns negative on error, and 0 or 1 on success for the bool value */ +int cg_get_xattr_bool(const char *path, const char *name); +int cg_remove_xattr(const char *path, const char *name); + +int cg_install_release_agent(const char *controller, const char *agent); +int cg_uninstall_release_agent(const char *controller); + +int cg_is_empty(const char *controller, const char *path); +int cg_is_empty_recursive(const char *controller, const char *path); + +int cg_get_root_path(char **path); + +int cg_path_get_cgroupid(const char *path, uint64_t *ret); +int cg_path_get_session(const char *path, char **ret_session); 
+int cg_path_get_owner_uid(const char *path, uid_t *ret_uid); +int cg_path_get_unit(const char *path, char **ret_unit); +int cg_path_get_unit_path(const char *path, char **ret_unit); +int cg_path_get_user_unit(const char *path, char **ret_unit); +int cg_path_get_machine_name(const char *path, char **ret_machine); +int cg_path_get_slice(const char *path, char **ret_slice); +int cg_path_get_user_slice(const char *path, char **ret_slice); + +int cg_shift_path(const char *cgroup, const char *cached_root, const char **ret_shifted); +int cg_pid_get_path_shifted(pid_t pid, const char *cached_root, char **ret_cgroup); + +int cg_pid_get_session(pid_t pid, char **ret_session); +int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid); +int cg_pid_get_unit(pid_t pid, char **ret_unit); +int cg_pidref_get_unit(const PidRef *pidref, char **ret); +int cg_pid_get_user_unit(pid_t pid, char **ret_unit); +int cg_pid_get_machine_name(pid_t pid, char **ret_machine); +int cg_pid_get_slice(pid_t pid, char **ret_slice); +int cg_pid_get_user_slice(pid_t pid, char **ret_slice); + +int cg_path_decode_unit(const char *cgroup, char **ret_unit); + +bool cg_needs_escape(const char *p); +int cg_escape(const char *p, char **ret); +char *cg_unescape(const char *p) _pure_; + +bool cg_controller_is_valid(const char *p); + +int cg_slice_to_path(const char *unit, char **ret); + +typedef const char* (*cg_migrate_callback_t)(CGroupMask mask, void *userdata); + +int cg_mask_supported(CGroupMask *ret); +int cg_mask_supported_subtree(const char *root, CGroupMask *ret); +int cg_mask_from_string(const char *s, CGroupMask *ret); +int cg_mask_to_string(CGroupMask mask, char **ret); + +int cg_kernel_controllers(Set **controllers); + +bool cg_ns_supported(void); +bool cg_freezer_supported(void); +bool cg_kill_supported(void); + +int cg_all_unified(void); +int cg_hybrid_unified(void); +int cg_unified_controller(const char *controller); +int cg_unified_cached(bool flush); +static inline int cg_unified(void) { + return 
cg_unified_cached(true); +} + +const char* cgroup_controller_to_string(CGroupController c) _const_; +CGroupController cgroup_controller_from_string(const char *s) _pure_; + +bool is_cgroup_fs(const struct statfs *s); +bool fd_is_cgroup_fs(int fd); + +typedef enum ManagedOOMMode { + MANAGED_OOM_AUTO, + MANAGED_OOM_KILL, + _MANAGED_OOM_MODE_MAX, + _MANAGED_OOM_MODE_INVALID = -EINVAL, +} ManagedOOMMode; + +const char* managed_oom_mode_to_string(ManagedOOMMode m) _const_; +ManagedOOMMode managed_oom_mode_from_string(const char *s) _pure_; + +typedef enum ManagedOOMPreference { + MANAGED_OOM_PREFERENCE_NONE = 0, + MANAGED_OOM_PREFERENCE_AVOID = 1, + MANAGED_OOM_PREFERENCE_OMIT = 2, + _MANAGED_OOM_PREFERENCE_MAX, + _MANAGED_OOM_PREFERENCE_INVALID = -EINVAL, +} ManagedOOMPreference; + +const char* managed_oom_preference_to_string(ManagedOOMPreference a) _const_; +ManagedOOMPreference managed_oom_preference_from_string(const char *s) _pure_; + +/* The structure to pass to name_to_handle_at() on cgroupfs2 */ +typedef union { + struct file_handle file_handle; + uint8_t space[offsetof(struct file_handle, f_handle) + sizeof(uint64_t)]; +} cg_file_handle; + +#define CG_FILE_HANDLE_INIT { .file_handle.handle_bytes = sizeof(uint64_t) } +#define CG_FILE_HANDLE_CGROUPID(fh) (*(uint64_t*) (fh).file_handle.f_handle) diff --git a/src/basic/chase.c b/src/basic/chase.c new file mode 100644 index 0000000..26bc2d6 --- /dev/null +++ b/src/basic/chase.c @@ -0,0 +1,1156 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "chase.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "log.h" +#include "path-util.h" +#include "string-util.h" +#include "user-util.h" + +bool unsafe_transition(const struct stat *a, const struct stat *b) { + /* Returns true if the transition from a to b is unsafe, i.e. if a is not owned by root and b is owned by + * a different user. We never want to transition from unprivileged to privileged files or directories. Why bother? 
So that unprivileged code can't symlink to privileged files + * making us believe we read something safe even though it isn't safe in the specific context we open it in. */ + + if (a->st_uid == 0) /* Transitioning from privileged to unprivileged is always fine */ + return false; + + return a->st_uid != b->st_uid; /* Otherwise we need to stay within the same UID */ +} + +static int log_unsafe_transition(int a, int b, const char *path, ChaseFlags flags) { + _cleanup_free_ char *n1 = NULL, *n2 = NULL, *user_a = NULL, *user_b = NULL; + struct stat st; + + if (!FLAGS_SET(flags, CHASE_WARN)) + return -ENOLINK; + + (void) fd_get_path(a, &n1); + (void) fd_get_path(b, &n2); + + if (fstat(a, &st) == 0) + user_a = uid_to_name(st.st_uid); + if (fstat(b, &st) == 0) + user_b = uid_to_name(st.st_uid); + + return log_warning_errno(SYNTHETIC_ERRNO(ENOLINK), + "Detected unsafe path transition %s (owned by %s) %s %s (owned by %s) during canonicalization of %s.", + strna(n1), strna(user_a), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), strna(n2), strna(user_b), path); +} + +static int log_autofs_mount_point(int fd, const char *path, ChaseFlags flags) { + _cleanup_free_ char *n1 = NULL; + + if (!FLAGS_SET(flags, CHASE_WARN)) + return -EREMOTE; + + (void) fd_get_path(fd, &n1); + + return log_warning_errno(SYNTHETIC_ERRNO(EREMOTE), + "Detected autofs mount point %s during canonicalization of %s.", + strna(n1), path); +} + +static int log_prohibited_symlink(int fd, ChaseFlags flags) { + _cleanup_free_ char *n1 = NULL; + + assert(fd >= 0); + + if (!FLAGS_SET(flags, CHASE_WARN)) + return -EREMCHG; + + (void) fd_get_path(fd, &n1); + + return log_warning_errno(SYNTHETIC_ERRNO(EREMCHG), + "Detected symlink where not symlink is allowed at %s, refusing.", + strna(n1)); +} + +static int chaseat_needs_absolute(int dir_fd, const char *path) { + if (dir_fd < 0) + return path_is_absolute(path); + + return dir_fd_is_root(dir_fd); +} + +int chaseat(int dir_fd, const char *path, ChaseFlags flags, char 
**ret_path, int *ret_fd) { + _cleanup_free_ char *buffer = NULL, *done = NULL; + _cleanup_close_ int fd = -EBADF, root_fd = -EBADF; + unsigned max_follow = CHASE_MAX; /* how many symlinks to follow before giving up and returning ELOOP */ + bool exists = true, append_trail_slash = false; + struct stat st; /* stat obtained from fd */ + const char *todo; + int r; + + assert(!FLAGS_SET(flags, CHASE_PREFIX_ROOT)); + assert(!FLAGS_SET(flags, CHASE_STEP|CHASE_EXTRACT_FILENAME)); + assert(!FLAGS_SET(flags, CHASE_TRAIL_SLASH|CHASE_EXTRACT_FILENAME)); + assert(!FLAGS_SET(flags, CHASE_MKDIR_0755) || (flags & (CHASE_NONEXISTENT | CHASE_PARENT)) != 0); + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + /* Either the file may be missing, or we return an fd to the final object, but both make no sense */ + if (FLAGS_SET(flags, CHASE_NONEXISTENT)) + assert(!ret_fd); + + if (FLAGS_SET(flags, CHASE_STEP)) + assert(!ret_fd); + + if (isempty(path)) + path = "."; + + /* This function resolves symlinks of the path relative to the given directory file descriptor. If + * CHASE_AT_RESOLVE_IN_ROOT is specified and a directory file descriptor is provided, symlinks + * are resolved relative to the given directory file descriptor. Otherwise, they are resolved + * relative to the root directory of the host. + * + * Note that when a positive directory file descriptor is provided and CHASE_AT_RESOLVE_IN_ROOT is + * specified and we find an absolute symlink, it is resolved relative to given directory file + * descriptor and not the root of the host. Also, when following relative symlinks, this functions + * ensures they cannot be used to "escape" the given directory file descriptor. If a positive + * directory file descriptor is provided, the "path" parameter is always interpreted relative to the + * given directory file descriptor, even if it is absolute. If the given directory file descriptor is + * AT_FDCWD and "path" is absolute, it is interpreted relative to the root directory of the host. 
+ * + * When "dir_fd" points to a non-root directory and CHASE_AT_RESOLVE_IN_ROOT is set, this function + * always returns a relative path in "ret_path", even if "path" is an absolute path, because openat() + * like functions generally ignore the directory fd if they are provided with an absolute path. When + * CHASE_AT_RESOLVE_IN_ROOT is not set, then this returns relative path to the specified file + * descriptor if all resolved symlinks are relative, otherwise absolute path will be returned. When + * "dir_fd" is AT_FDCWD and "path" is an absolute path, we return an absolute path in "ret_path" + * because otherwise, if the caller passes the returned relative path to another openat() like + * function, it would be resolved relative to the current working directory instead of to "/". + * + * Summary about the result path: + * - "dir_fd" points to the root directory + * → result will be absolute + * - "dir_fd" points to a non-root directory, and CHASE_AT_RESOLVE_IN_ROOT is set + * → relative + * - "dir_fd" points to a non-root directory, and CHASE_AT_RESOLVE_IN_ROOT is not set + * → relative when all resolved symlinks are relative, otherwise absolute + * - "dir_fd" is AT_FDCWD, and "path" is absolute + * → absolute + * - "dir_fd" is AT_FDCWD, and "path" is relative + * → relative when all resolved symlinks are relative, otherwise absolute + * + * Algorithmically this operates on two path buffers: "done" are the components of the path we + * already processed and resolved symlinks, "." and ".." of. "todo" are the components of the path we + * still need to process. On each iteration, we move one component from "todo" to "done", processing + * its special meaning each time. We always keep an O_PATH fd to the component we are currently + * processing, thus keeping lookup races to a minimum. + * + * Suggested usage: whenever you want to canonicalize a path, use this function. Pass the absolute + * path you got as-is: fully qualified and relative to your host's root. 
Optionally, specify the + * "dir_fd" parameter to tell this function what to do when encountering a symlink with an absolute + * path as directory: resolve it relative to the given directory file descriptor. + * + * There are five ways to invoke this function: + * + * 1. Without CHASE_STEP or ret_fd: in this case the path is resolved and the normalized path is + * returned in `ret_path`. The return value is < 0 on error. If CHASE_NONEXISTENT is also set, 0 + * is returned if the file doesn't exist, > 0 otherwise. If CHASE_NONEXISTENT is not set, >= 0 is + * returned if the destination was found, -ENOENT if it wasn't. + * + * 2. With ret_fd: in this case the destination is opened after chasing it as O_PATH and this file + * descriptor is returned in `ret_fd`. This is useful to open files relative to some root + * directory. Note that the returned O_PATH file descriptor must be converted into a regular one + * (using fd_reopen() or such) before it can be used for reading/writing. ret_fd may not be + * combined with CHASE_NONEXISTENT or CHASE_STEP. + * + * 3. With CHASE_STEP: in this case only a single step of the normalization is executed, i.e. only + * the first symlink or ".." component of the path is resolved, and the resulting path is + * returned. This is useful if a caller wants to trace the path through the file system verbosely. + * Returns < 0 on error, > 0 if the path is fully normalized, and == 0 for each normalization + * step. This may be combined with CHASE_NONEXISTENT, in which case 1 is returned when a component + * is not found. + * + * 4. With CHASE_SAFE: in this case the path must not contain unsafe transitions, i.e. transitions + * from unprivileged to privileged files or directories. In such cases the return value is + * -ENOLINK. If CHASE_WARN is also set, a warning describing the unsafe transition is emitted. + * CHASE_WARN cannot be used in PID 1. + * + * 5.
With CHASE_NO_AUTOFS: in this case if an autofs mount point is encountered, path normalization + * is aborted and -EREMOTE is returned. If CHASE_WARN is also set, a warning showing the path of + * the mount point is emitted. CHASE_WARN cannot be used in PID 1. + */ + + if (FLAGS_SET(flags, CHASE_AT_RESOLVE_IN_ROOT)) { + /* If we get AT_FDCWD or dir_fd points to "/", then we always resolve symlinks relative to + * the host's root. Hence, CHASE_AT_RESOLVE_IN_ROOT is meaningless. */ + + r = dir_fd_is_root_or_cwd(dir_fd); + if (r < 0) + return r; + if (r > 0) + flags &= ~CHASE_AT_RESOLVE_IN_ROOT; + } + + if (!(flags & + (CHASE_AT_RESOLVE_IN_ROOT|CHASE_NONEXISTENT|CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_STEP| + CHASE_PROHIBIT_SYMLINKS|CHASE_MKDIR_0755)) && + !ret_path && ret_fd) { + + /* Shortcut the ret_fd case if the caller isn't interested in the actual path and has no root + * set and doesn't care about any of the other special features we provide either. */ + r = openat(dir_fd, path, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, CHASE_NOFOLLOW) ? O_NOFOLLOW : 0)); + if (r < 0) + return -errno; + + *ret_fd = r; + return 0; + } + + buffer = strdup(path); + if (!buffer) + return -ENOMEM; + + /* If we receive an absolute path together with AT_FDCWD, we need to return an absolute path, because + * a relative path would be interpreted relative to the current working directory. Also, let's make + * the result absolute when the file descriptor of the root directory is specified. */ + r = chaseat_needs_absolute(dir_fd, path); + if (r < 0) + return r; + + bool need_absolute = r; + if (need_absolute) { + done = strdup("/"); + if (!done) + return -ENOMEM; + } + + /* If a positive directory file descriptor is provided, always resolve the given path relative to it, + * regardless of whether it is absolute or not. If we get AT_FDCWD, follow regular openat() + * semantics, if the path is relative, resolve against the current working directory. Otherwise, + * resolve against root. 
*/ + fd = openat(dir_fd, done ?: ".", O_CLOEXEC|O_DIRECTORY|O_PATH); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + /* If we get AT_FDCWD, we always resolve symlinks relative to the host's root. Only if a positive + * directory file descriptor is provided we will look at CHASE_AT_RESOLVE_IN_ROOT to determine + * whether to resolve symlinks in it or not. */ + if (dir_fd >= 0 && FLAGS_SET(flags, CHASE_AT_RESOLVE_IN_ROOT)) + root_fd = openat(dir_fd, ".", O_CLOEXEC|O_DIRECTORY|O_PATH); + else + root_fd = open("/", O_CLOEXEC|O_DIRECTORY|O_PATH); + if (root_fd < 0) + return -errno; + + if (FLAGS_SET(flags, CHASE_TRAIL_SLASH)) + append_trail_slash = ENDSWITH_SET(buffer, "/", "/."); + + for (todo = buffer;;) { + _cleanup_free_ char *first = NULL; + _cleanup_close_ int child = -EBADF; + struct stat st_child; + const char *e; + + r = path_find_first_component(&todo, /* accept_dot_dot= */ true, &e); + if (r < 0) + return r; + if (r == 0) { /* We reached the end. */ + if (append_trail_slash) + if (!strextend(&done, "/")) + return -ENOMEM; + break; + } + + first = strndup(e, r); + if (!first) + return -ENOMEM; + + /* Two dots? Then chop off the last bit of what we already found out. */ + if (path_equal(first, "..")) { + _cleanup_free_ char *parent = NULL; + _cleanup_close_ int fd_parent = -EBADF; + struct stat st_parent; + + /* If we already are at the top, then going up will not change anything. This is + * in-line with how the kernel handles this. */ + if (empty_or_root(done) && FLAGS_SET(flags, CHASE_AT_RESOLVE_IN_ROOT)) { + if (FLAGS_SET(flags, CHASE_STEP)) + goto chased_one; + continue; + } + + fd_parent = openat(fd, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH|O_DIRECTORY); + if (fd_parent < 0) + return -errno; + + if (fstat(fd_parent, &st_parent) < 0) + return -errno; + + /* If we opened the same directory, that _may_ indicate that we're at the host root + * directory. Let's confirm that in more detail with dir_fd_is_root(). 
And if so, + * going up won't change anything. */ + if (stat_inode_same(&st_parent, &st)) { + r = dir_fd_is_root(fd); + if (r < 0) + return r; + if (r > 0) { + if (FLAGS_SET(flags, CHASE_STEP)) + goto chased_one; + continue; + } + } + + r = path_extract_directory(done, &parent); + if (r >= 0) { + assert(!need_absolute || path_is_absolute(parent)); + free_and_replace(done, parent); + } else if (r == -EDESTADDRREQ) { + /* 'done' contains filename only (i.e. no slash). */ + assert(!need_absolute); + done = mfree(done); + } else if (r == -EADDRNOTAVAIL) { + /* 'done' is "/". This branch should be already handled in the above. */ + assert(!FLAGS_SET(flags, CHASE_AT_RESOLVE_IN_ROOT)); + assert_not_reached(); + } else if (r == -EINVAL) { + /* 'done' is an empty string, ends with '..', or an invalid path. */ + assert(!need_absolute); + assert(!FLAGS_SET(flags, CHASE_AT_RESOLVE_IN_ROOT)); + + if (!path_is_valid(done)) + return -EINVAL; + + /* If we're at the top of "dir_fd", start appending ".." to "done". */ + if (!path_extend(&done, "..")) + return -ENOMEM; + } else + return r; + + if (FLAGS_SET(flags, CHASE_STEP)) + goto chased_one; + + if (FLAGS_SET(flags, CHASE_SAFE) && + unsafe_transition(&st, &st_parent)) + return log_unsafe_transition(fd, fd_parent, path, flags); + + /* If the path ends on a "..", and CHASE_PARENT is specified then our current 'fd' is + * the child of the returned normalized path, not the parent as requested. To correct + * this we have to go *two* levels up. 
*/ + if (FLAGS_SET(flags, CHASE_PARENT) && isempty(todo)) { + _cleanup_close_ int fd_grandparent = -EBADF; + struct stat st_grandparent; + + fd_grandparent = openat(fd_parent, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH|O_DIRECTORY); + if (fd_grandparent < 0) + return -errno; + + if (fstat(fd_grandparent, &st_grandparent) < 0) + return -errno; + + if (FLAGS_SET(flags, CHASE_SAFE) && + unsafe_transition(&st_parent, &st_grandparent)) + return log_unsafe_transition(fd_parent, fd_grandparent, path, flags); + + st = st_grandparent; + close_and_replace(fd, fd_grandparent); + break; + } + + /* update fd and stat */ + st = st_parent; + close_and_replace(fd, fd_parent); + continue; + } + + /* Otherwise let's see what this is. */ + child = r = RET_NERRNO(openat(fd, first, O_CLOEXEC|O_NOFOLLOW|O_PATH)); + if (r < 0) { + if (r != -ENOENT) + return r; + + if (!isempty(todo) && !path_is_safe(todo)) + return r; + + if (FLAGS_SET(flags, CHASE_MKDIR_0755) && !isempty(todo)) { + child = xopenat(fd, + first, + O_DIRECTORY|O_CREAT|O_EXCL|O_NOFOLLOW|O_CLOEXEC, + /* xopen_flags = */ 0, + 0755); + if (child < 0) + return child; + } else if (FLAGS_SET(flags, CHASE_PARENT) && isempty(todo)) { + if (!path_extend(&done, first)) + return -ENOMEM; + + break; + } else if (FLAGS_SET(flags, CHASE_NONEXISTENT)) { + if (!path_extend(&done, first, todo)) + return -ENOMEM; + + exists = false; + break; + } else + return r; + } + + if (fstat(child, &st_child) < 0) + return -errno; + + if (FLAGS_SET(flags, CHASE_SAFE) && + unsafe_transition(&st, &st_child)) + return log_unsafe_transition(fd, child, path, flags); + + if (FLAGS_SET(flags, CHASE_NO_AUTOFS) && + fd_is_fs_type(child, AUTOFS_SUPER_MAGIC) > 0) + return log_autofs_mount_point(child, path, flags); + + if (S_ISLNK(st_child.st_mode) && !(FLAGS_SET(flags, CHASE_NOFOLLOW) && isempty(todo))) { + _cleanup_free_ char *destination = NULL; + + if (FLAGS_SET(flags, CHASE_PROHIBIT_SYMLINKS)) + return log_prohibited_symlink(child, flags); + + /* This is a symlink, in 
this case read the destination. But let's make sure we + * don't follow symlinks without bounds. */ + if (--max_follow <= 0) + return -ELOOP; + + r = readlinkat_malloc(fd, first, &destination); + if (r < 0) + return r; + if (isempty(destination)) + return -EINVAL; + + if (path_is_absolute(destination)) { + + /* An absolute destination. Start the loop from the beginning, but use the + * root file descriptor as base. */ + + safe_close(fd); + fd = fd_reopen(root_fd, O_CLOEXEC|O_PATH|O_DIRECTORY); + if (fd < 0) + return fd; + + if (fstat(fd, &st) < 0) + return -errno; + + if (FLAGS_SET(flags, CHASE_SAFE) && + unsafe_transition(&st_child, &st)) + return log_unsafe_transition(child, fd, path, flags); + + /* When CHASE_AT_RESOLVE_IN_ROOT is not set, now the chased path may be + * outside of the specified dir_fd. Let's make the result absolute. */ + if (!FLAGS_SET(flags, CHASE_AT_RESOLVE_IN_ROOT)) + need_absolute = true; + + r = free_and_strdup(&done, need_absolute ? "/" : NULL); + if (r < 0) + return r; + } + + /* Prefix what's left to do with what we just read, and start the loop again, but + * remain in the current directory. */ + if (!path_extend(&destination, todo)) + return -ENOMEM; + + free_and_replace(buffer, destination); + todo = buffer; + + if (FLAGS_SET(flags, CHASE_STEP)) + goto chased_one; + + continue; + } + + /* If this is not a symlink, then let's just add the name we read to what we already verified. */ + if (!path_extend(&done, first)) + return -ENOMEM; + + if (FLAGS_SET(flags, CHASE_PARENT) && isempty(todo)) + break; + + /* And iterate again, but go one directory further down. 
*/ + st = st_child; + close_and_replace(fd, child); + } + + if (FLAGS_SET(flags, CHASE_PARENT)) { + r = stat_verify_directory(&st); + if (r < 0) + return r; + } + + if (ret_path) { + if (FLAGS_SET(flags, CHASE_EXTRACT_FILENAME) && done) { + _cleanup_free_ char *f = NULL; + + r = path_extract_filename(done, &f); + if (r < 0 && r != -EADDRNOTAVAIL) + return r; + + /* If we get EADDRNOTAVAIL we clear done and it will get reinitialized by the next block. */ + free_and_replace(done, f); + } + + if (!done) { + assert(!need_absolute || FLAGS_SET(flags, CHASE_EXTRACT_FILENAME)); + done = strdup(append_trail_slash ? "./" : "."); + if (!done) + return -ENOMEM; + } + + *ret_path = TAKE_PTR(done); + } + + if (ret_fd) { + /* Return the O_PATH fd we currently are looking to the caller. It can translate it to a + * proper fd by opening /proc/self/fd/xyz. */ + + assert(fd >= 0); + *ret_fd = TAKE_FD(fd); + } + + if (FLAGS_SET(flags, CHASE_STEP)) + return 1; + + return exists; + +chased_one: + if (ret_path) { + const char *e; + + if (!done) { + assert(!need_absolute); + done = strdup(append_trail_slash ? "./" : "."); + if (!done) + return -ENOMEM; + } + + /* todo may contain slashes at the beginning. */ + r = path_find_first_component(&todo, /* accept_dot_dot= */ true, &e); + if (r < 0) + return r; + if (r == 0) + *ret_path = TAKE_PTR(done); + else { + char *c; + + c = path_join(done, e); + if (!c) + return -ENOMEM; + + *ret_path = c; + } + } + + return 0; +} + +static int empty_or_root_to_null(const char **path) { + int r; + + assert(path); + + /* This nullifies the input path when the path is empty or points to "/". 
*/ + + if (empty_or_root(*path)) { + *path = NULL; + return 0; + } + + r = path_is_root(*path); + if (r < 0) + return r; + if (r > 0) + *path = NULL; + + return 0; +} + +int chase(const char *path, const char *root, ChaseFlags flags, char **ret_path, int *ret_fd) { + _cleanup_free_ char *root_abs = NULL, *absolute = NULL, *p = NULL; + _cleanup_close_ int fd = -EBADF, pfd = -EBADF; + int r; + + assert(path); + + if (isempty(path)) + return -EINVAL; + + r = empty_or_root_to_null(&root); + if (r < 0) + return r; + + /* A root directory of "/" or "" is identical to "/". */ + if (empty_or_root(root)) { + root = "/"; + + /* When the root directory is "/", we will drop CHASE_AT_RESOLVE_IN_ROOT in chaseat(), + * hence below is not necessary, but let's shortcut. */ + flags &= ~CHASE_AT_RESOLVE_IN_ROOT; + + } else { + r = path_make_absolute_cwd(root, &root_abs); + if (r < 0) + return r; + + /* Simplify the root directory, so that it has no duplicate slashes and nothing at the + * end. While we won't resolve the root path we still simplify it. */ + root = path_simplify(root_abs); + + assert(path_is_absolute(root)); + assert(!empty_or_root(root)); + + if (FLAGS_SET(flags, CHASE_PREFIX_ROOT)) { + absolute = path_join(root, path); + if (!absolute) + return -ENOMEM; + } + + flags |= CHASE_AT_RESOLVE_IN_ROOT; + } + + if (!absolute) { + r = path_make_absolute_cwd(path, &absolute); + if (r < 0) + return r; + } + + path = path_startswith(absolute, root); + if (!path) + return log_full_errno(FLAGS_SET(flags, CHASE_WARN) ? LOG_WARNING : LOG_DEBUG, + SYNTHETIC_ERRNO(ECHRNG), + "Specified path '%s' is outside of specified root directory '%s', refusing to resolve.", + absolute, root); + + fd = open(root, O_CLOEXEC|O_DIRECTORY|O_PATH); + if (fd < 0) + return -errno; + + r = chaseat(fd, path, flags & ~CHASE_PREFIX_ROOT, ret_path ? &p : NULL, ret_fd ? 
&pfd : NULL); + if (r < 0) + return r; + + if (ret_path) { + if (!FLAGS_SET(flags, CHASE_EXTRACT_FILENAME)) { + + /* When "root" points to the root directory, the result of chaseat() is always + * absolute, hence it is not necessary to prefix with the root. When "root" points to + * a non-root directory, the result path is always normalized and relative, hence + * we can simply call path_join() and not necessary to call path_simplify(). + * Note that the result of chaseat() may start with "." (more specifically, it may be + * "." or "./"), and we need to drop "." in that case. */ + + if (empty_or_root(root)) + assert(path_is_absolute(p)); + else { + char *q; + + assert(!path_is_absolute(p)); + + q = path_join(root, p + (*p == '.')); + if (!q) + return -ENOMEM; + + free_and_replace(p, q); + } + } + + *ret_path = TAKE_PTR(p); + } + + if (ret_fd) + *ret_fd = TAKE_FD(pfd); + + return r; +} + +int chaseat_prefix_root(const char *path, const char *root, char **ret) { + char *q; + int r; + + assert(path); + assert(ret); + + /* This is mostly for prefixing the result of chaseat(). */ + + if (!path_is_absolute(path)) { + _cleanup_free_ char *root_abs = NULL; + + r = empty_or_root_to_null(&root); + if (r < 0 && r != -ENOENT) + return r; + + /* If the dir_fd points to the root directory, chaseat() always returns an absolute path. */ + if (empty_or_root(root)) + return -EINVAL; + + r = path_make_absolute_cwd(root, &root_abs); + if (r < 0) + return r; + + root = path_simplify(root_abs); + + q = path_join(root, path + (path[0] == '.' && IN_SET(path[1], '/', '\0'))); + } else + q = strdup(path); + if (!q) + return -ENOMEM; + + *ret = q; + return 0; +} + +int chase_extract_filename(const char *path, const char *root, char **ret) { + int r; + + /* This is similar to path_extract_filename(), but takes root directory. + * The result should be consistent with chase() with CHASE_EXTRACT_FILENAME. 
*/ + + assert(path); + assert(ret); + + if (isempty(path)) + return -EINVAL; + + if (!path_is_absolute(path)) + return -EINVAL; + + r = empty_or_root_to_null(&root); + if (r < 0 && r != -ENOENT) + return r; + + if (!empty_or_root(root)) { + _cleanup_free_ char *root_abs = NULL; + + r = path_make_absolute_cwd(root, &root_abs); + if (r < 0) + return r; + + path = path_startswith(path, root_abs); + if (!path) + return -EINVAL; + } + + if (!isempty(path)) { + r = path_extract_filename(path, ret); + if (r != -EADDRNOTAVAIL) + return r; + } + + char *fname = strdup("."); + if (!fname) + return -ENOMEM; + + *ret = fname; + return 0; +} + +int chase_and_open(const char *path, const char *root, ChaseFlags chase_flags, int open_flags, char **ret_path) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL, *fname = NULL; + mode_t mode = open_flags & O_DIRECTORY ? 0755 : 0644; + int r; + + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + + if (empty_or_root(root) && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) + /* Shortcut this call if none of the special features of this call are requested */ + return xopenat(AT_FDCWD, path, + open_flags | (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? 
O_NOFOLLOW : 0), + /* xopen_flags = */ 0, + mode); + + r = chase(path, root, CHASE_PARENT|chase_flags, &p, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + if (!FLAGS_SET(chase_flags, CHASE_PARENT) && + !FLAGS_SET(chase_flags, CHASE_EXTRACT_FILENAME)) { + r = chase_extract_filename(p, root, &fname); + if (r < 0) + return r; + } + + r = xopenat(path_fd, strempty(fname), open_flags|O_NOFOLLOW, /* xopen_flags = */ 0, mode); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return r; +} + +int chase_and_opendir(const char *path, const char *root, ChaseFlags chase_flags, char **ret_path, DIR **ret_dir) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL; + DIR *d; + int r; + + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + assert(ret_dir); + + if (empty_or_root(root) && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) { + /* Shortcut this call if none of the special features of this call are requested */ + d = opendir(path); + if (!d) + return -errno; + + *ret_dir = d; + return 0; + } + + r = chase(path, root, chase_flags, ret_path ? 
&p : NULL, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + d = xopendirat(path_fd, ".", O_NOFOLLOW); + if (!d) + return -errno; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + *ret_dir = d; + return 0; +} + +int chase_and_stat(const char *path, const char *root, ChaseFlags chase_flags, char **ret_path, struct stat *ret_stat) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + assert(ret_stat); + + if (empty_or_root(root) && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) + /* Shortcut this call if none of the special features of this call are requested */ + return RET_NERRNO(fstatat(AT_FDCWD, path, ret_stat, + FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0)); + + r = chase(path, root, chase_flags, ret_path ? &p : NULL, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + if (fstat(path_fd, ret_stat) < 0) + return -errno; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return 0; +} + +int chase_and_access(const char *path, const char *root, ChaseFlags chase_flags, int access_mode, char **ret_path) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + + if (empty_or_root(root) && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) + /* Shortcut this call if none of the special features of this call are requested */ + return RET_NERRNO(faccessat(AT_FDCWD, path, access_mode, + FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0)); + + r = chase(path, root, chase_flags, ret_path ? 
&p : NULL, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + r = access_fd(path_fd, access_mode); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return 0; +} + +int chase_and_fopen_unlocked( + const char *path, + const char *root, + ChaseFlags chase_flags, + const char *open_flags, + char **ret_path, + FILE **ret_file) { + + _cleanup_free_ char *final_path = NULL; + _cleanup_close_ int fd = -EBADF; + int mode_flags, r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP|CHASE_PARENT))); + assert(open_flags); + assert(ret_file); + + mode_flags = fopen_mode_to_flags(open_flags); + if (mode_flags < 0) + return mode_flags; + + fd = chase_and_open(path, root, chase_flags, mode_flags, ret_path ? &final_path : NULL); + if (fd < 0) + return fd; + + r = take_fdopen_unlocked(&fd, open_flags, ret_file); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(final_path); + + return 0; +} + +int chase_and_unlink(const char *path, const char *root, ChaseFlags chase_flags, int unlink_flags, char **ret_path) { + _cleanup_free_ char *p = NULL, *fname = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP|CHASE_PARENT))); + + fd = chase_and_open(path, root, chase_flags|CHASE_PARENT|CHASE_NOFOLLOW, O_PATH|O_DIRECTORY|O_CLOEXEC, &p); + if (fd < 0) + return fd; + + r = path_extract_filename(p, &fname); + if (r < 0) + return r; + + if (unlinkat(fd, fname, unlink_flags) < 0) + return -errno; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return 0; +} + +int chase_and_open_parent(const char *path, const char *root, ChaseFlags chase_flags, char **ret_filename) { + int pfd, r; + + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + + r = chase(path, root, CHASE_PARENT|CHASE_EXTRACT_FILENAME|chase_flags, ret_filename, &pfd); + if (r < 0) + return r; + + return pfd; +} + +int chase_and_openat(int dir_fd, const char *path, ChaseFlags 
chase_flags, int open_flags, char **ret_path) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL, *fname = NULL; + mode_t mode = open_flags & O_DIRECTORY ? 0755 : 0644; + int r; + + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + + if (dir_fd == AT_FDCWD && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) + /* Shortcut this call if none of the special features of this call are requested */ + return xopenat(dir_fd, path, + open_flags | (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? O_NOFOLLOW : 0), + /* xopen_flags = */ 0, + mode); + + r = chaseat(dir_fd, path, chase_flags|CHASE_PARENT, &p, &path_fd); + if (r < 0) + return r; + + if (!FLAGS_SET(chase_flags, CHASE_PARENT)) { + r = path_extract_filename(p, &fname); + if (r < 0 && r != -EADDRNOTAVAIL) + return r; + } + + r = xopenat(path_fd, strempty(fname), open_flags|O_NOFOLLOW, /* xopen_flags = */ 0, mode); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return r; +} + +int chase_and_opendirat(int dir_fd, const char *path, ChaseFlags chase_flags, char **ret_path, DIR **ret_dir) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL; + DIR *d; + int r; + + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + assert(ret_dir); + + if (dir_fd == AT_FDCWD && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) { + /* Shortcut this call if none of the special features of this call are requested */ + d = opendir(path); + if (!d) + return -errno; + + *ret_dir = d; + return 0; + } + + r = chaseat(dir_fd, path, chase_flags, ret_path ? 
&p : NULL, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + d = xopendirat(path_fd, ".", O_NOFOLLOW); + if (!d) + return -errno; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + *ret_dir = d; + return 0; +} + +int chase_and_statat(int dir_fd, const char *path, ChaseFlags chase_flags, char **ret_path, struct stat *ret_stat) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + assert(ret_stat); + + if (dir_fd == AT_FDCWD && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) + /* Shortcut this call if none of the special features of this call are requested */ + return RET_NERRNO(fstatat(AT_FDCWD, path, ret_stat, + FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0)); + + r = chaseat(dir_fd, path, chase_flags, ret_path ? &p : NULL, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + if (fstat(path_fd, ret_stat) < 0) + return -errno; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return 0; +} + +int chase_and_accessat(int dir_fd, const char *path, ChaseFlags chase_flags, int access_mode, char **ret_path) { + _cleanup_close_ int path_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + + if (dir_fd == AT_FDCWD && !ret_path && + (chase_flags & (CHASE_NO_AUTOFS|CHASE_SAFE|CHASE_PROHIBIT_SYMLINKS|CHASE_PARENT|CHASE_MKDIR_0755)) == 0) + /* Shortcut this call if none of the special features of this call are requested */ + return RET_NERRNO(faccessat(AT_FDCWD, path, access_mode, + FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0)); + + r = chaseat(dir_fd, path, chase_flags, ret_path ? 
&p : NULL, &path_fd); + if (r < 0) + return r; + assert(path_fd >= 0); + + r = access_fd(path_fd, access_mode); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return 0; +} +/* Opens 'path' via chase_and_openat() and wraps the resulting fd in a stdio stream via take_fdopen_unlocked(). 'open_flags' is a mode string that is translated to open() flags with fopen_mode_to_flags(). Returns 0 on success, storing the stream in *ret_file and, if ret_path is non-NULL, the path reported by chase_and_openat() in *ret_path; otherwise returns the negative error of the helper that failed. CHASE_NONEXISTENT, CHASE_STEP and CHASE_PARENT are refused by assertion. */ +int chase_and_fopenat_unlocked( + int dir_fd, + const char *path, + ChaseFlags chase_flags, + const char *open_flags, + char **ret_path, + FILE **ret_file) { + + _cleanup_free_ char *final_path = NULL; + _cleanup_close_ int fd = -EBADF; + int mode_flags, r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP|CHASE_PARENT))); + assert(open_flags); + assert(ret_file); + + mode_flags = fopen_mode_to_flags(open_flags); + if (mode_flags < 0) + return mode_flags; + + fd = chase_and_openat(dir_fd, path, chase_flags, mode_flags, ret_path ? &final_path : NULL); + if (fd < 0) + return fd; + + r = take_fdopen_unlocked(&fd, open_flags, ret_file); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(final_path); + + return 0; +} +/* Unlinks the last component of 'path'. The parent directory is opened through chase_and_openat() with CHASE_PARENT|CHASE_NOFOLLOW always added, the file name is extracted from the returned path with path_extract_filename(), and unlinkat() is then called on that name relative to the parent fd with 'unlink_flags'. Returns 0 on success (handing the chased path back in *ret_path if requested) or a negative error. */ +int chase_and_unlinkat(int dir_fd, const char *path, ChaseFlags chase_flags, int unlink_flags, char **ret_path) { + _cleanup_free_ char *p = NULL, *fname = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(path); + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP|CHASE_PARENT))); + + fd = chase_and_openat(dir_fd, path, chase_flags|CHASE_PARENT|CHASE_NOFOLLOW, O_PATH|O_DIRECTORY|O_CLOEXEC, &p); + if (fd < 0) + return fd; + + r = path_extract_filename(p, &fname); + if (r < 0) + return r; + + if (unlinkat(fd, fname, unlink_flags) < 0) + return -errno; + + if (ret_path) + *ret_path = TAKE_PTR(p); + + return 0; +} +/* Calls chaseat() with CHASE_PARENT|CHASE_EXTRACT_FILENAME added to 'chase_flags'. On success returns the file descriptor chaseat() stored, with chaseat()'s path output written to *ret_filename; on failure returns chaseat()'s negative error. */ +int chase_and_open_parent_at(int dir_fd, const char *path, ChaseFlags chase_flags, char **ret_filename) { + int pfd, r; + + assert(!(chase_flags & (CHASE_NONEXISTENT|CHASE_STEP))); + + r = chaseat(dir_fd, path, CHASE_PARENT|CHASE_EXTRACT_FILENAME|chase_flags, ret_filename, &pfd); + if (r < 0) + return r; + + return pfd; +} diff --git a/src/basic/chase.h b/src/basic/chase.h new file mode 100644 
index 0000000..cfc714b --- /dev/null +++ b/src/basic/chase.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <dirent.h> +#include <stdio.h> + +#include "stat-util.h" + +typedef enum ChaseFlags { + CHASE_PREFIX_ROOT = 1 << 0, /* The specified path will be prefixed by the specified root before beginning the iteration */ + CHASE_NONEXISTENT = 1 << 1, /* It's OK if the path doesn't actually exist. */ + CHASE_NO_AUTOFS = 1 << 2, /* Return -EREMOTE if autofs mount point found */ + CHASE_SAFE = 1 << 3, /* Return -EPERM if we ever traverse from unprivileged to privileged files or directories */ + CHASE_TRAIL_SLASH = 1 << 4, /* Any trailing slash will be preserved */ + CHASE_STEP = 1 << 5, /* Just execute a single step of the normalization */ + CHASE_NOFOLLOW = 1 << 6, /* Do not follow the path's right-most component. With ret_fd, when the path's + * right-most component refers to a symlink, return an O_PATH fd of the symlink. */ + CHASE_WARN = 1 << 7, /* Emit an appropriate warning when an error is encountered. + * Note: this may do an NSS lookup, hence this flag cannot be used in PID 1. */ + CHASE_AT_RESOLVE_IN_ROOT = 1 << 8, /* Same as openat2()'s RESOLVE_IN_ROOT flag, symlinks are resolved + * relative to the given directory fd instead of root. */ + CHASE_PROHIBIT_SYMLINKS = 1 << 9, /* Refuse all symlinks */ + CHASE_PARENT = 1 << 10, /* Chase the parent directory of the given path. Note that the + * full path is still stored in ret_path and only the returned + * file descriptor will point to the parent directory. Note that + * if the result path is the root or '.', then the file descriptor + * also points to the result path even if this flag is set. + * When this is specified, chase() will succeed with 1 even if the + * file the last path component refers to does not exist. */ + CHASE_MKDIR_0755 = 1 << 11, /* Create any missing parent directories in the given path. This + * needs to be set with CHASE_NONEXISTENT and/or CHASE_PARENT. 
+ * Note, chase_and_open() or friends always add CHASE_PARENT flag + * when internally call chase(), hence CHASE_MKDIR_0755 can be + * safely set without CHASE_NONEXISTENT and CHASE_PARENT. */ + CHASE_EXTRACT_FILENAME = 1 << 12, /* Only return the last component of the resolved path */ +} ChaseFlags; + +bool unsafe_transition(const struct stat *a, const struct stat *b); + +/* How many iterations to execute before returning -ELOOP */ +#define CHASE_MAX 32 + +int chase(const char *path_with_prefix, const char *root, ChaseFlags chase_flags, char **ret_path, int *ret_fd); + +int chaseat_prefix_root(const char *path, const char *root, char **ret); +int chase_extract_filename(const char *path, const char *root, char **ret); + +int chase_and_open(const char *path, const char *root, ChaseFlags chase_flags, int open_flags, char **ret_path); +int chase_and_opendir(const char *path, const char *root, ChaseFlags chase_flags, char **ret_path, DIR **ret_dir); +int chase_and_stat(const char *path, const char *root, ChaseFlags chase_flags, char **ret_path, struct stat *ret_stat); +int chase_and_access(const char *path, const char *root, ChaseFlags chase_flags, int access_mode, char **ret_path); +int chase_and_fopen_unlocked(const char *path, const char *root, ChaseFlags chase_flags, const char *open_flags, char **ret_path, FILE **ret_file); +int chase_and_unlink(const char *path, const char *root, ChaseFlags chase_flags, int unlink_flags, char **ret_path); +int chase_and_open_parent(const char *path, const char *root, ChaseFlags chase_flags, char **ret_filename); + +int chaseat(int dir_fd, const char *path, ChaseFlags flags, char **ret_path, int *ret_fd); + +int chase_and_openat(int dir_fd, const char *path, ChaseFlags chase_flags, int open_flags, char **ret_path); +int chase_and_opendirat(int dir_fd, const char *path, ChaseFlags chase_flags, char **ret_path, DIR **ret_dir); +int chase_and_statat(int dir_fd, const char *path, ChaseFlags chase_flags, char **ret_path, struct stat 
*ret_stat); +int chase_and_accessat(int dir_fd, const char *path, ChaseFlags chase_flags, int access_mode, char **ret_path); +int chase_and_fopenat_unlocked(int dir_fd, const char *path, ChaseFlags chase_flags, const char *open_flags, char **ret_path, FILE **ret_file); +int chase_and_unlinkat(int dir_fd, const char *path, ChaseFlags chase_flags, int unlink_flags, char **ret_path); +int chase_and_open_parent_at(int dir_fd, const char *path, ChaseFlags chase_flags, char **ret_filename); diff --git a/src/basic/chattr-util.c b/src/basic/chattr-util.c new file mode 100644 index 0000000..fe8b9ab --- /dev/null +++ b/src/basic/chattr-util.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <linux/fs.h> + +#include "chattr-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "string-util.h" + +int chattr_full( + int dir_fd, + const char *path, + unsigned value, + unsigned mask, + unsigned *ret_previous, + unsigned *ret_final, + ChattrApplyFlags flags) { + + _cleanup_close_ int fd = -EBADF; + unsigned old_attr, new_attr; + int set_flags_errno = 0; + struct stat st; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0) + return fd; + + if (fstat(fd, &st) < 0) + return -errno; + + /* Explicitly check whether this is a regular file or directory. If it is anything else (such + * as a device node or fifo), then the ioctl will not hit the file systems but possibly + * drivers, where the ioctl might have different effects. Notably, DRM is using the same + * ioctl() number. 
*/ + + if (!S_ISDIR(st.st_mode) && !S_ISREG(st.st_mode)) + return -ENOTTY; + + if (mask == 0 && !ret_previous && !ret_final) + return 0; + + if (ioctl(fd, FS_IOC_GETFLAGS, &old_attr) < 0) + return -errno; + + new_attr = (old_attr & ~mask) | (value & mask); + if (new_attr == old_attr) { + if (ret_previous) + *ret_previous = old_attr; + if (ret_final) + *ret_final = old_attr; + return 0; + } + + if (ioctl(fd, FS_IOC_SETFLAGS, &new_attr) >= 0) { + unsigned attr; + + /* Some filesystems (BTRFS) silently fail when a flag cannot be set. Let's make sure our + * changes actually went through by querying the flags again and verifying they're equal to + * the flags we tried to configure. */ + + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) < 0) + return -errno; + + if (new_attr == attr) { + if (ret_previous) + *ret_previous = old_attr; + if (ret_final) + *ret_final = new_attr; + return 1; + } + + /* Trigger the fallback logic. */ + errno = EINVAL; + } + + if ((errno != EINVAL && !ERRNO_IS_NOT_SUPPORTED(errno)) || + !FLAGS_SET(flags, CHATTR_FALLBACK_BITWISE)) + return -errno; + + /* When -EINVAL is returned, we assume that incompatible attributes are simultaneously + * specified. E.g., compress(c) and nocow(C) attributes cannot be set to files on btrfs. + * As a fallback, let's try to set attributes one by one. + * + * Also, when we get EOPNOTSUPP (or a similar error code) we assume a flag might just not be + * supported, and we can ignore it too */ + + unsigned current_attr = old_attr; + for (unsigned i = 0; i < sizeof(unsigned) * 8; i++) { + unsigned new_one, mask_one = 1u << i; + + if (!FLAGS_SET(mask, mask_one)) + continue; + + new_one = UPDATE_FLAG(current_attr, mask_one, FLAGS_SET(value, mask_one)); + if (new_one == current_attr) + continue; + + if (ioctl(fd, FS_IOC_SETFLAGS, &new_one) < 0) { + if (errno != EINVAL && !ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + + log_full_errno(FLAGS_SET(flags, CHATTR_WARN_UNSUPPORTED_FLAGS) ? 
LOG_WARNING : LOG_DEBUG, + errno, + "Unable to set file attribute 0x%x on %s, ignoring: %m", mask_one, strna(path)); + + /* Ensures that we record whether only EOPNOTSUPP&friends are encountered, or if a more serious + * error (thus worth logging at a different level, etc) was seen too. */ + if (set_flags_errno == 0 || !ERRNO_IS_NOT_SUPPORTED(errno)) + set_flags_errno = -errno; + + continue; + } + + if (ioctl(fd, FS_IOC_GETFLAGS, &current_attr) < 0) + return -errno; + } + + if (ret_previous) + *ret_previous = old_attr; + if (ret_final) + *ret_final = current_attr; + + /* -ENOANO indicates that some attributes cannot be set. ERRNO_IS_NOT_SUPPORTED indicates that all + * encountered failures were due to flags not supported by the FS, so return a specific error in + * that case, so callers can handle it properly (e.g.: tmpfiles.d can use debug level logging). */ + return current_attr == new_attr ? 1 : ERRNO_IS_NOT_SUPPORTED(set_flags_errno) ? set_flags_errno : -ENOANO; +} + +int read_attr_fd(int fd, unsigned *ret) { + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISDIR(st.st_mode) && !S_ISREG(st.st_mode)) + return -ENOTTY; + + return RET_NERRNO(ioctl(fd, FS_IOC_GETFLAGS, ret)); +} + +int read_attr_path(const char *p, unsigned *ret) { + _cleanup_close_ int fd = -EBADF; + + assert(p); + assert(ret); + + fd = open(p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return read_attr_fd(fd, ret); +} diff --git a/src/basic/chattr-util.h b/src/basic/chattr-util.h new file mode 100644 index 0000000..c1ee63b --- /dev/null +++ b/src/basic/chattr-util.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <fcntl.h> +#include <stdbool.h> +#include <stddef.h> +#include <sys/types.h> + +#include "missing_fs.h" + +/* The chattr() flags to apply when creating a new file *before* writing to it. In particular, flags such as + * FS_NOCOW_FL don't work if applied a-posteriori. 
All other flags are fine (or even necessary, think + * FS_IMMUTABLE_FL!) to apply after writing to the files. */ +#define CHATTR_EARLY_FL \ + (FS_NOATIME_FL | \ + FS_COMPR_FL | \ + FS_NOCOW_FL | \ + FS_NOCOMP_FL | \ + FS_PROJINHERIT_FL) + +#define CHATTR_ALL_FL \ + (FS_NOATIME_FL | \ + FS_SYNC_FL | \ + FS_DIRSYNC_FL | \ + FS_APPEND_FL | \ + FS_COMPR_FL | \ + FS_NODUMP_FL | \ + FS_EXTENT_FL | \ + FS_IMMUTABLE_FL | \ + FS_JOURNAL_DATA_FL | \ + FS_SECRM_FL | \ + FS_UNRM_FL | \ + FS_NOTAIL_FL | \ + FS_TOPDIR_FL | \ + FS_NOCOW_FL | \ + FS_PROJINHERIT_FL) + +typedef enum ChattrApplyFlags { + CHATTR_FALLBACK_BITWISE = 1 << 0, + CHATTR_WARN_UNSUPPORTED_FLAGS = 1 << 1, +} ChattrApplyFlags; + +int chattr_full(int dir_fd, const char *path, unsigned value, unsigned mask, unsigned *ret_previous, unsigned *ret_final, ChattrApplyFlags flags); +static inline int chattr_at(int dir_fd, const char *path, unsigned value, unsigned mask, unsigned *previous) { + return chattr_full(dir_fd, path, value, mask, previous, NULL, 0); +} +static inline int chattr_fd(int fd, unsigned value, unsigned mask, unsigned *previous) { + return chattr_full(fd, NULL, value, mask, previous, NULL, 0); +} +static inline int chattr_path(const char *path, unsigned value, unsigned mask, unsigned *previous) { + return chattr_full(AT_FDCWD, path, value, mask, previous, NULL, 0); +} + +int read_attr_fd(int fd, unsigned *ret); +int read_attr_path(const char *p, unsigned *ret); + +/* Combination of chattr flags, that should be appropriate for secrets stored on disk: Secure Remove + + * Exclusion from Dumping + Synchronous Writing (i.e. not caching in memory) + In-Place Updating (i.e. not + * spurious copies). 
*/ +#define CHATTR_SECRET_FLAGS (FS_SECRM_FL|FS_NODUMP_FL|FS_SYNC_FL|FS_NOCOW_FL) + +static inline int chattr_secret(int fd, ChattrApplyFlags flags) { + return chattr_full(fd, NULL, CHATTR_SECRET_FLAGS, CHATTR_SECRET_FLAGS, NULL, NULL, flags|CHATTR_FALLBACK_BITWISE); +} diff --git a/src/basic/check-filesystems.sh b/src/basic/check-filesystems.sh new file mode 100755 index 0000000..696ef61 --- /dev/null +++ b/src/basic/check-filesystems.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu +set -o pipefail + +cpp="$1" +filesystems_gperf="$2" +shift 2 + +includes="" +for i in "$@"; do + includes="$includes -include $i" +done + +error=false + +# shellcheck disable=SC2086 +for fs in $($cpp -dM $includes - </dev/null | \ + grep -E '_MAGIC' | \ + grep -vE 'LINUX_MAGIC' | \ + awk '/^#define[ \t]+[A-Z0-9_]+MAGIC[ \t]+/ { print $2; }'); do + if ! grep -E "\{.*$fs.*\}" "$filesystems_gperf" >/dev/null; then + # STACK_END_MAGIC doesn't refer to a filesystem + # mtd_inode was removed in 2015 + # futexfs was removed in 2018 + if [[ "$fs" =~ ^(STACK_END_MAGIC|MTD_INODE_FS_MAGIC|FUTEXFS_SUPER_MAGIC)$ ]]; then + continue + fi + echo "Filesystem found in kernel header but not in $(basename "$filesystems_gperf"): $fs"; + error=true + fi +done + +if $error; then + exit 1 +fi diff --git a/src/basic/compress.c b/src/basic/compress.c new file mode 100644 index 0000000..ac0bfdf --- /dev/null +++ b/src/basic/compress.c @@ -0,0 +1,1088 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <inttypes.h> +#include <malloc.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#if HAVE_XZ +#include <lzma.h> +#endif + +#if HAVE_LZ4 +#include <lz4.h> +#include <lz4frame.h> +#endif + +#if HAVE_ZSTD +#include <zstd.h> +#include <zstd_errors.h> +#endif + +#include "alloc-util.h" +#include "compress.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "macro.h" +#include "sparse-endian.h" +#include "string-table.h" +#include "string-util.h" +#include "unaligned.h" + +#if HAVE_LZ4 +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(LZ4F_compressionContext_t, LZ4F_freeCompressionContext, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(LZ4F_decompressionContext_t, LZ4F_freeDecompressionContext, NULL); 
+#endif + +#if HAVE_ZSTD +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(ZSTD_CCtx*, ZSTD_freeCCtx, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(ZSTD_DCtx*, ZSTD_freeDCtx, NULL); + +static int zstd_ret_to_errno(size_t ret) { + switch (ZSTD_getErrorCode(ret)) { + case ZSTD_error_dstSize_tooSmall: + return -ENOBUFS; + case ZSTD_error_memory_allocation: + return -ENOMEM; + default: + return -EBADMSG; + } +} +#endif + +#define ALIGN_8(l) ALIGN_TO(l, sizeof(size_t)) + +static const char* const compression_table[_COMPRESSION_MAX] = { + [COMPRESSION_NONE] = "NONE", + [COMPRESSION_XZ] = "XZ", + [COMPRESSION_LZ4] = "LZ4", + [COMPRESSION_ZSTD] = "ZSTD", +}; + +DEFINE_STRING_TABLE_LOOKUP(compression, Compression); + +bool compression_supported(Compression c) { + static const unsigned supported = + (1U << COMPRESSION_NONE) | + (1U << COMPRESSION_XZ) * HAVE_XZ | + (1U << COMPRESSION_LZ4) * HAVE_LZ4 | + (1U << COMPRESSION_ZSTD) * HAVE_ZSTD; + + return c >= 0 && c < _COMPRESSION_MAX && FLAGS_SET(supported, 1U << c); +} + +int compress_blob_xz(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { +#if HAVE_XZ + static const lzma_options_lzma opt = { + 1u << 20u, NULL, 0, LZMA_LC_DEFAULT, LZMA_LP_DEFAULT, + LZMA_PB_DEFAULT, LZMA_MODE_FAST, 128, LZMA_MF_HC3, 4 + }; + static const lzma_filter filters[] = { + { LZMA_FILTER_LZMA2, (lzma_options_lzma*) &opt }, + { LZMA_VLI_UNKNOWN, NULL } + }; + lzma_ret ret; + size_t out_pos = 0; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size > 0); + assert(dst_size); + + /* Returns < 0 if we couldn't compress the data or the + * compressed result is longer than the original */ + + if (src_size < 80) + return -ENOBUFS; + + ret = lzma_stream_buffer_encode((lzma_filter*) filters, LZMA_CHECK_NONE, NULL, + src, src_size, dst, &out_pos, dst_alloc_size); + if (ret != LZMA_OK) + return -ENOBUFS; + + *dst_size = out_pos; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int compress_blob_lz4(const 
void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { /* Blob layout produced here: 8 bytes holding src_size (written with unaligned_write_le64()), followed by the LZ4 block. */ +#if HAVE_LZ4 + int r; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size > 0); + assert(dst_size); + + /* Returns < 0 if we couldn't compress the data or the + * compressed result is longer than the original */ + + if (src_size < 9) /* inputs shorter than 9 bytes are never attempted */ + return -ENOBUFS; + + r = LZ4_compress_default(src, (char*)dst + 8, src_size, (int) dst_alloc_size - 8); + if (r <= 0) + return -ENOBUFS; + + unaligned_write_le64(dst, src_size); + *dst_size = r + 8; /* compressed payload plus the 8-byte size header */ + + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int compress_blob_zstd( + const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { /* One-shot ZSTD_compress() with compression level argument 0; failures are translated by zstd_ret_to_errno(). */ +#if HAVE_ZSTD + size_t k; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_alloc_size > 0); + assert(dst_size); + + k = ZSTD_compress(dst, dst_alloc_size, src, src_size, 0); + if (ZSTD_isError(k)) + return zstd_ret_to_errno(k); + + *dst_size = k; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob_xz( + const void *src, + uint64_t src_size, + void **dst, + size_t* dst_size, + size_t dst_max) { /* Decodes an XZ blob into *dst, enlarging the buffer with greedy_realloc() as needed. dst_max == 0 means no limit (see the "dst_max ?: SIZE_MAX" fallbacks). */ + +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + size_t space; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_size); + + ret = lzma_stream_decoder(&s, UINT64_MAX, 0); + if (ret != LZMA_OK) + return -ENOMEM; + + space = MIN(src_size * 2, dst_max ?: SIZE_MAX); /* initial guess: twice the compressed size, capped at dst_max */ + if (!greedy_realloc(dst, space, 1)) + return -ENOMEM; + + s.next_in = src; + s.avail_in = src_size; + + s.next_out = *dst; + s.avail_out = space; + + for (;;) { + size_t used; + + ret = lzma_code(&s, LZMA_FINISH); + + if (ret == LZMA_STREAM_END) + break; + else if (ret != LZMA_OK) + return -ENOMEM; + + if (dst_max > 0 && (space - s.avail_out) >= dst_max) + break; /* limit reached: stop and report what was decoded so far */ + else if (dst_max > 0 && space == dst_max) + return -ENOBUFS; + + used = space - s.avail_out; + space = MIN(2 * space, dst_max ?:
SIZE_MAX); + if (!greedy_realloc(dst, space, 1)) + return -ENOMEM; + + s.avail_out = space - used; + s.next_out = *(uint8_t**)dst + used; /* re-read *dst: greedy_realloc() may have moved the buffer */ + } + + *dst_size = space - s.avail_out; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob_lz4( + const void *src, + uint64_t src_size, + void **dst, + size_t* dst_size, + size_t dst_max) { /* Expects the layout written by compress_blob_lz4(): an 8-byte size header followed by the LZ4 block. NOTE(review): dst_max is not consulted anywhere in this function. */ + +#if HAVE_LZ4 + char* out; + int r, size; /* LZ4 uses int for size */ + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_size); + + if (src_size <= 8) /* nothing follows the size header */ + return -EBADMSG; + + size = unaligned_read_le64(src); + if (size < 0 || (unsigned) size != unaligned_read_le64(src)) /* the stored 64-bit size must be representable as a non-negative int */ + return -EFBIG; + out = greedy_realloc(dst, size, 1); + if (!out) + return -ENOMEM; + + r = LZ4_decompress_safe((char*)src + 8, out, src_size - 8, size); + if (r < 0 || r != size) /* the block must decode to exactly the advertised size */ + return -EBADMSG; + + *dst_size = size; + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_blob_zstd( + const void *src, + uint64_t src_size, + void **dst, + size_t *dst_size, + size_t dst_max) { /* Decodes a zstd frame whose content size is recorded in the frame header (otherwise -EBADMSG); the reported size is capped at dst_max when dst_max > 0. */ + +#if HAVE_ZSTD + uint64_t size; + + assert(src); + assert(src_size > 0); + assert(dst); + assert(dst_size); + + size = ZSTD_getFrameContentSize(src, src_size); + if (IN_SET(size, ZSTD_CONTENTSIZE_ERROR, ZSTD_CONTENTSIZE_UNKNOWN)) + return -EBADMSG; + + if (dst_max > 0 && size > dst_max) + size = dst_max; + if (size > SIZE_MAX) + return -E2BIG; + + if (!(greedy_realloc(dst, MAX(ZSTD_DStreamOutSize(), size), 1))) + return -ENOMEM; + + _cleanup_(ZSTD_freeDCtxp) ZSTD_DCtx *dctx = ZSTD_createDCtx(); + if (!dctx) + return -ENOMEM; + + ZSTD_inBuffer input = { + .src = src, + .size = src_size, + }; + ZSTD_outBuffer output = { + .dst = *dst, + .size = MALLOC_SIZEOF_SAFE(*dst), + }; + + size_t k = ZSTD_decompressStream(dctx, &output, &input); /* a single call; no loop over remaining input */ + if (ZSTD_isError(k)) { + log_debug("ZSTD decoder failed: %s", ZSTD_getErrorName(k)); + return zstd_ret_to_errno(k); + } + assert(output.pos >= size); + + *dst_size = size; + return 0; +#else + return
-EPROTONOSUPPORT; +#endif +} + +int decompress_blob( + Compression compression, + const void *src, + uint64_t src_size, + void **dst, + size_t* dst_size, + size_t dst_max) { /* Dispatches to the XZ, LZ4 or ZSTD blob decompressor; any other 'compression' value yields -EPROTONOSUPPORT. */ + + if (compression == COMPRESSION_XZ) + return decompress_blob_xz( + src, src_size, + dst, dst_size, dst_max); + else if (compression == COMPRESSION_LZ4) + return decompress_blob_lz4( + src, src_size, + dst, dst_size, dst_max); + else if (compression == COMPRESSION_ZSTD) + return decompress_blob_zstd( + src, src_size, + dst, dst_size, dst_max); + else + return -EPROTONOSUPPORT; +} + +int decompress_startswith_xz( + const void *src, + uint64_t src_size, + void **buffer, + const void *prefix, + size_t prefix_len, + uint8_t extra) { + +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + size_t allocated; + lzma_ret ret; + + /* Checks whether the decompressed blob starts with the mentioned prefix. The byte extra needs to + * follow the prefix. Returns 1 on a match, 0 on a mismatch or if the stream ends before prefix_len + 1 bytes were produced, and a negative errno on failure. */ + + assert(src); + assert(src_size > 0); + assert(buffer); + assert(prefix); + + ret = lzma_stream_decoder(&s, UINT64_MAX, 0); + if (ret != LZMA_OK) + return -EBADMSG; + + if (!(greedy_realloc(buffer, ALIGN_8(prefix_len + 1), 1))) + return -ENOMEM; + + allocated = MALLOC_SIZEOF_SAFE(*buffer); + + s.next_in = src; + s.avail_in = src_size; + + s.next_out = *buffer; + s.avail_out = allocated; + + for (;;) { + ret = lzma_code(&s, LZMA_FINISH); + + if (!IN_SET(ret, LZMA_OK, LZMA_STREAM_END)) + return -EBADMSG; + + if (allocated - s.avail_out >= prefix_len + 1) /* enough output to decide */ + return memcmp(*buffer, prefix, prefix_len) == 0 && + ((const uint8_t*) *buffer)[prefix_len] == extra; + + if (ret == LZMA_STREAM_END) /* stream ended before prefix_len + 1 bytes were produced */ + return 0; + + s.avail_out += allocated; + + if (!(greedy_realloc(buffer, allocated * 2, 1))) + return -ENOMEM; + + allocated = MALLOC_SIZEOF_SAFE(*buffer); + s.next_out = *(uint8_t**)buffer + allocated - s.avail_out; + } + +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_startswith_lz4( + const void *src, + uint64_t src_size, + void
**buffer, + const void *prefix, + size_t prefix_len, + uint8_t extra) { + +#if HAVE_LZ4 + /* Checks whether the decompressed blob starts with the mentioned prefix. The byte extra needs to + * follow the prefix */ + + size_t allocated; + int r; + + assert(src); + assert(src_size > 0); + assert(buffer); + assert(prefix); + + if (src_size <= 8) /* the first 8 bytes of src are skipped below, so there must be data after them */ + return -EBADMSG; + + if (!(greedy_realloc(buffer, ALIGN_8(prefix_len + 1), 1))) + return -ENOMEM; + allocated = MALLOC_SIZEOF_SAFE(*buffer); + + r = LZ4_decompress_safe_partial( + (char*)src + 8, + *buffer, + src_size - 8, + prefix_len + 1, /* number of bytes we want decoded */ + allocated); /* capacity of *buffer */ + + /* On lz4 < 1.8.3, we might get "failure" (r < 0), or "success" where just a part of the buffer is + * decompressed. But if we get a smaller amount of bytes than requested, we don't know whether there + * isn't enough data to fill the requested size or whether we just got a partial answer. + */ + if (r < 0 || (size_t) r < prefix_len + 1) { + size_t size; + + if (LZ4_versionNumber() >= 10803) + /* We trust that the newer lz4 decompresses the number of bytes we + * requested if available in the compressed string. */ + return 0; + + if (r > 0) + /* Compare what we have first, in case of mismatch we can + * shortcut the full comparison. */ + if (memcmp(*buffer, prefix, r) != 0) + return 0; + + /* Before version 1.8.3, lz4 always tries to decode a full "sequence", + * so in pathological cases might need to decompress the full field.
*/ + r = decompress_blob_lz4(src, src_size, buffer, &size, 0); + if (r < 0) + return r; + + if (size < prefix_len + 1) /* decompressed data too short to contain prefix plus extra */ + return 0; + } + + return memcmp(*buffer, prefix, prefix_len) == 0 && + ((const uint8_t*) *buffer)[prefix_len] == extra; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_startswith_zstd( + const void *src, + uint64_t src_size, + void **buffer, + const void *prefix, + size_t prefix_len, + uint8_t extra) { /* Returns 1 if the decompressed data begins with prefix followed by the byte extra, 0 if not, and a negative errno on failure. */ +#if HAVE_ZSTD + assert(src); + assert(src_size > 0); + assert(buffer); + assert(prefix); + + uint64_t size = ZSTD_getFrameContentSize(src, src_size); + if (IN_SET(size, ZSTD_CONTENTSIZE_ERROR, ZSTD_CONTENTSIZE_UNKNOWN)) + return -EBADMSG; + + if (size < prefix_len + 1) + return 0; /* Decompressed text too short to match the prefix and extra */ + + _cleanup_(ZSTD_freeDCtxp) ZSTD_DCtx *dctx = ZSTD_createDCtx(); + if (!dctx) + return -ENOMEM; + + if (!(greedy_realloc(buffer, MAX(ZSTD_DStreamOutSize(), prefix_len + 1), 1))) + return -ENOMEM; + + ZSTD_inBuffer input = { + .src = src, + .size = src_size, + }; + ZSTD_outBuffer output = { + .dst = *buffer, + .size = MALLOC_SIZEOF_SAFE(*buffer), + }; + size_t k; + + k = ZSTD_decompressStream(dctx, &output, &input); /* a single call; no loop over remaining input */ + if (ZSTD_isError(k)) { + log_debug("ZSTD decoder failed: %s", ZSTD_getErrorName(k)); + return zstd_ret_to_errno(k); + } + assert(output.pos >= prefix_len + 1); + + return memcmp(*buffer, prefix, prefix_len) == 0 && + ((const uint8_t*) *buffer)[prefix_len] == extra; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_startswith( + Compression compression, + const void *src, + uint64_t src_size, + void **buffer, + const void *prefix, + size_t prefix_len, + uint8_t extra) { /* Dispatches to the XZ, LZ4 or ZSTD prefix check. NOTE(review): other 'compression' values yield -EBADMSG here, whereas decompress_blob() reports -EPROTONOSUPPORT for the same situation. */ + + if (compression == COMPRESSION_XZ) + return decompress_startswith_xz( + src, src_size, + buffer, + prefix, prefix_len, + extra); + + else if (compression == COMPRESSION_LZ4) + return decompress_startswith_lz4( + src, src_size, + buffer, + prefix, prefix_len, + extra); + else if (compression ==
COMPRESSION_ZSTD) + return decompress_startswith_zstd( + src, src_size, + buffer, + prefix, prefix_len, + extra); + else + return -EBADMSG; +} + +int compress_stream_xz(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size) { /* Reads from fdf until EOF, or until max_bytes input bytes were consumed (UINT64_MAX disables the limit), and writes the XZ stream to fdt. */ +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + uint8_t buf[BUFSIZ], out[BUFSIZ]; + lzma_action action = LZMA_RUN; + + assert(fdf >= 0); + assert(fdt >= 0); + + ret = lzma_easy_encoder(&s, LZMA_PRESET_DEFAULT, LZMA_CHECK_CRC64); + if (ret != LZMA_OK) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to initialize XZ encoder: code %u", + ret); + + for (;;) { + if (s.avail_in == 0 && action == LZMA_RUN) { /* encoder drained its input: refill from fdf */ + size_t m = sizeof(buf); + ssize_t n; + + if (max_bytes != UINT64_MAX && (uint64_t) m > max_bytes) + m = (size_t) max_bytes; + + n = read(fdf, buf, m); + if (n < 0) + return -errno; + if (n == 0) /* EOF, or the max_bytes budget is used up (m == 0) */ + action = LZMA_FINISH; + else { + s.next_in = buf; + s.avail_in = n; + + if (max_bytes != UINT64_MAX) { + assert(max_bytes >= (uint64_t) n); + max_bytes -= n; + } + } + } + + if (s.avail_out == 0) { + s.next_out = out; + s.avail_out = sizeof(out); + } + + ret = lzma_code(&s, action); + if (!IN_SET(ret, LZMA_OK, LZMA_STREAM_END)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "Compression failed: code %u", + ret); + + if (s.avail_out == 0 || ret == LZMA_STREAM_END) { /* output buffer full or stream finished: flush to fdt */ + ssize_t n, k; + + n = sizeof(out) - s.avail_out; + + k = loop_write(fdt, out, n); + if (k < 0) + return k; + + if (ret == LZMA_STREAM_END) { + if (ret_uncompressed_size) + *ret_uncompressed_size = s.total_in; + + log_debug("XZ compression finished (%"PRIu64" -> %"PRIu64" bytes, %.1f%%)", + s.total_in, s.total_out, + (double) s.total_out / s.total_in * 100); + + return 0; + } + } + } +#else + return -EPROTONOSUPPORT; +#endif +} + +#define LZ4_BUFSIZE (512*1024u) /* chunk buffer size used by the LZ4 stream functions below */ + +int compress_stream_lz4(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size) { + +#if HAVE_LZ4 + LZ4F_errorCode_t c; +
_cleanup_(LZ4F_freeCompressionContextp) LZ4F_compressionContext_t ctx = NULL; + _cleanup_free_ void *in_buff = NULL; + _cleanup_free_ char *out_buff = NULL; + size_t out_allocsize, n, offset = 0, frame_size; + uint64_t total_in = 0, total_out; + int r; + static const LZ4F_preferences_t preferences = { + .frameInfo.blockSizeID = 5, + }; + + c = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION); + if (LZ4F_isError(c)) + return -ENOMEM; + + frame_size = LZ4F_compressBound(LZ4_BUFSIZE, &preferences); /* bound for compressing one LZ4_BUFSIZE input chunk */ + out_allocsize = frame_size + 64*1024; /* add some space for header and trailer */ + out_buff = malloc(out_allocsize); + if (!out_buff) + return -ENOMEM; + + in_buff = malloc(LZ4_BUFSIZE); + if (!in_buff) + return -ENOMEM; + + n = offset = total_out = LZ4F_compressBegin(ctx, out_buff, out_allocsize, &preferences); /* the result is logged below as the header size */ + if (LZ4F_isError(n)) + return -EINVAL; + + log_debug("Buffer size is %zu bytes, header size %zu bytes.", out_allocsize, n); + + for (;;) { + ssize_t k; + + k = loop_read(fdf, in_buff, LZ4_BUFSIZE, true); + if (k < 0) + return k; + if (k == 0) + break; + n = LZ4F_compressUpdate(ctx, out_buff + offset, out_allocsize - offset, + in_buff, k, NULL); + if (LZ4F_isError(n)) + return -ENOTRECOVERABLE; + + total_in += k; + offset += n; + total_out += n; + + /* NOTE(review): the limit is applied to the compressed output here, whereas compress_stream_xz() limits the input it reads. */ + if (max_bytes != UINT64_MAX && total_out > (size_t) max_bytes) + return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), + "Compressed stream longer than %" PRIu64 " bytes", max_bytes); + + if (out_allocsize - offset < frame_size + 4) { /* flush once less than frame_size + 4 bytes remain free */ + k = loop_write(fdt, out_buff, offset); + if (k < 0) + return k; + offset = 0; + } + } + + n = LZ4F_compressEnd(ctx, out_buff + offset, out_allocsize - offset, NULL); + if (LZ4F_isError(n)) + return -ENOTRECOVERABLE; + + offset += n; + total_out += n; + r = loop_write(fdt, out_buff, offset); + if (r < 0) + return r; + + if (ret_uncompressed_size) + *ret_uncompressed_size = total_in; + + log_debug("LZ4 compression finished (%" PRIu64 " -> %" PRIu64 " bytes, %.1f%%)", + total_in, total_out, +
(double) total_out / total_in * 100); /* NOTE(review): total_in is 0 when the input was empty */ + + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_stream_xz(int fdf, int fdt, uint64_t max_bytes) { /* Decompresses the XZ stream read from fdf into fdt; fails with -EFBIG once more than max_bytes output bytes would be written (UINT64_MAX disables the limit). */ + +#if HAVE_XZ + _cleanup_(lzma_end) lzma_stream s = LZMA_STREAM_INIT; + lzma_ret ret; + + uint8_t buf[BUFSIZ], out[BUFSIZ]; + lzma_action action = LZMA_RUN; + + assert(fdf >= 0); + assert(fdt >= 0); + + ret = lzma_stream_decoder(&s, UINT64_MAX, 0); + if (ret != LZMA_OK) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEM), + "Failed to initialize XZ decoder: code %u", + ret); + + for (;;) { + if (s.avail_in == 0 && action == LZMA_RUN) { /* decoder drained its input: refill from fdf */ + ssize_t n; + + n = read(fdf, buf, sizeof(buf)); + if (n < 0) + return -errno; + if (n == 0) + action = LZMA_FINISH; + else { + s.next_in = buf; + s.avail_in = n; + } + } + + if (s.avail_out == 0) { + s.next_out = out; + s.avail_out = sizeof(out); + } + + ret = lzma_code(&s, action); + if (!IN_SET(ret, LZMA_OK, LZMA_STREAM_END)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Decompression failed: code %u", + ret); + + if (s.avail_out == 0 || ret == LZMA_STREAM_END) { /* output buffer full or stream finished: flush to fdt */ + ssize_t n, k; + + n = sizeof(out) - s.avail_out; + + if (max_bytes != UINT64_MAX) { + if (max_bytes < (uint64_t) n) /* checked before writing, so fdt never receives more than max_bytes */ + return -EFBIG; + + max_bytes -= n; + } + + k = loop_write(fdt, out, n); + if (k < 0) + return k; + + if (ret == LZMA_STREAM_END) { + log_debug("XZ decompression finished (%"PRIu64" -> %"PRIu64" bytes, %.1f%%)", + s.total_in, s.total_out, + (double) s.total_out / s.total_in * 100); + + return 0; + } + } + } +#else + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), + "Cannot decompress file.
Compiled without XZ support."); +#endif +} + +int decompress_stream_lz4(int in, int out, uint64_t max_bytes) { /* Maps the whole input file read-only and feeds it to the LZ4 frame decoder, writing decoded chunks of at most LZ4_BUFSIZE bytes to 'out'. */ +#if HAVE_LZ4 + size_t c; + _cleanup_(LZ4F_freeDecompressionContextp) LZ4F_decompressionContext_t ctx = NULL; + _cleanup_free_ char *buf = NULL; + char *src; + struct stat st; + int r = 0; + size_t total_in = 0, total_out = 0; + + c = LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION); + if (LZ4F_isError(c)) + return -ENOMEM; + + if (fstat(in, &st) < 0) + return log_debug_errno(errno, "fstat() failed: %m"); + + if (file_offset_beyond_memory_size(st.st_size)) + return -EFBIG; + + buf = malloc(LZ4_BUFSIZE); + if (!buf) + return -ENOMEM; + + src = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, in, 0); + if (src == MAP_FAILED) + return -errno; + + /* From here on every exit goes through the cleanup label so that the mapping is released. */ + while (total_in < (size_t) st.st_size) { + size_t produced = LZ4_BUFSIZE; /* in: capacity of buf; added to total_out after the call */ + size_t used = st.st_size - total_in; /* in: bytes still unread; added to total_in after the call */ + + c = LZ4F_decompress(ctx, buf, &produced, src + total_in, &used, NULL); + if (LZ4F_isError(c)) { + r = -EBADMSG; + goto cleanup; + } + + total_in += used; + total_out += produced; + + if (max_bytes != UINT64_MAX && total_out > (size_t) max_bytes) { + log_debug("Decompressed stream longer than %"PRIu64" bytes", max_bytes); + r = -EFBIG; + goto cleanup; + } + + r = loop_write(out, buf, produced); + if (r < 0) + goto cleanup; + } + + log_debug("LZ4 decompression finished (%zu -> %zu bytes, %.1f%%)", + total_in, total_out, + total_in > 0 ? (double) total_out / total_in * 100 : 0.0); + cleanup: + munmap(src, st.st_size); + return r; +#else + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), + "Cannot decompress file.
Compiled without LZ4 support."); +#endif +} + +int compress_stream_zstd(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size) { /* Compresses everything read from fdf into a zstd stream on fdt; returns -EFBIG if the compressed output would exceed max_bytes. */ +#if HAVE_ZSTD + _cleanup_(ZSTD_freeCCtxp) ZSTD_CCtx *cctx = NULL; + _cleanup_free_ void *in_buff = NULL, *out_buff = NULL; + size_t in_allocsize, out_allocsize; + size_t z; + uint64_t left = max_bytes, in_bytes = 0; /* left: remaining output budget; in_bytes: uncompressed bytes read so far */ + + assert(fdf >= 0); + assert(fdt >= 0); + + /* Create the context and buffers */ + in_allocsize = ZSTD_CStreamInSize(); + out_allocsize = ZSTD_CStreamOutSize(); + in_buff = malloc(in_allocsize); + out_buff = malloc(out_allocsize); + cctx = ZSTD_createCCtx(); + if (!cctx || !out_buff || !in_buff) + return -ENOMEM; + + z = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1); + if (ZSTD_isError(z)) + log_debug("Failed to enable ZSTD checksum, ignoring: %s", ZSTD_getErrorName(z)); + + /* This loop reads from the input file, compresses that entire chunk, + * and writes all output produced to the output file. + */ + for (;;) { + bool is_last_chunk; + ZSTD_inBuffer input = { + .src = in_buff, + .size = 0, + .pos = 0 + }; + ssize_t red; + + red = loop_read(fdf, in_buff, in_allocsize, true); + if (red < 0) + return red; + is_last_chunk = red == 0; /* a zero-length read marks the final, empty chunk that ends the frame */ + + in_bytes += (size_t) red; + input.size = (size_t) red; + + for (bool finished = false; !finished;) { + ZSTD_outBuffer output = { + .dst = out_buff, + .size = out_allocsize, + .pos = 0 + }; + size_t remaining; + ssize_t wrote; + + /* Compress into the output buffer and write all of the + * output to the file so we can reuse the buffer next + * iteration. + */ + remaining = ZSTD_compressStream2( + cctx, &output, &input, + is_last_chunk ?
ZSTD_e_end : ZSTD_e_continue); + + if (ZSTD_isError(remaining)) { + log_debug("ZSTD encoder failed: %s", ZSTD_getErrorName(remaining)); + return zstd_ret_to_errno(remaining); + } + + if (left < output.pos) /* checked before writing, so fdt never receives more than max_bytes */ + return -EFBIG; + + wrote = loop_write_full(fdt, output.dst, output.pos, USEC_INFINITY); + if (wrote < 0) + return wrote; + + left -= output.pos; + + /* If we're on the last chunk we're finished when zstd + * returns 0, which means it has consumed all the input AND + * finished the frame. Otherwise, we're finished when + * we've consumed all the input. + */ + finished = is_last_chunk ? (remaining == 0) : (input.pos == input.size); + } + + /* zstd only returns 0 when the input is completely consumed */ + assert(input.pos == input.size); + if (is_last_chunk) + break; + } + + if (ret_uncompressed_size) + *ret_uncompressed_size = in_bytes; + + if (in_bytes > 0) /* avoid dividing by zero in the ratio for empty input */ + log_debug("ZSTD compression finished (%" PRIu64 " -> %" PRIu64 " bytes, %.1f%%)", + in_bytes, max_bytes - left, (double) (max_bytes - left) / in_bytes * 100); + else + log_debug("ZSTD compression finished (%" PRIu64 " -> %" PRIu64 " bytes)", + in_bytes, max_bytes - left); + + return 0; +#else + return -EPROTONOSUPPORT; +#endif +} + +int decompress_stream_zstd(int fdf, int fdt, uint64_t max_bytes) { /* Decompresses the zstd stream read from fdf into fdt; returns -EFBIG if the output would exceed max_bytes. */ +#if HAVE_ZSTD + _cleanup_(ZSTD_freeDCtxp) ZSTD_DCtx *dctx = NULL; + _cleanup_free_ void *in_buff = NULL, *out_buff = NULL; + size_t in_allocsize, out_allocsize; + size_t last_result = 0; + uint64_t left = max_bytes, in_bytes = 0; /* left: remaining output budget; in_bytes: compressed bytes read so far */ + + assert(fdf >= 0); + assert(fdt >= 0); + + /* Create the context and buffers */ + in_allocsize = ZSTD_DStreamInSize(); + out_allocsize = ZSTD_DStreamOutSize(); + in_buff = malloc(in_allocsize); + out_buff = malloc(out_allocsize); + dctx = ZSTD_createDCtx(); + if (!dctx || !out_buff || !in_buff) + return -ENOMEM; + + /* This loop assumes that the input file is one or more concatenated + * zstd streams.
This example won't work if there is trailing non-zstd + * data at the end, but streaming decompression in general handles this + * case. ZSTD_decompressStream() returns 0 exactly when the frame is + * completed, and doesn't consume input after the frame. + */ + for (;;) { + bool has_error = false; + ZSTD_inBuffer input = { + .src = in_buff, + .size = 0, + .pos = 0 + }; + ssize_t red; + + red = loop_read(fdf, in_buff, in_allocsize, true); + if (red < 0) + return red; + if (red == 0) + break; + + in_bytes += (size_t) red; + input.size = (size_t) red; + input.pos = 0; + + /* Given a valid frame, zstd won't consume the last byte of the + * frame until it has flushed all of the decompressed data of + * the frame. So input.pos < input.size means frame is not done + * or there is still output available. + */ + while (input.pos < input.size) { + ZSTD_outBuffer output = { + .dst = out_buff, + .size = out_allocsize, + .pos = 0 + }; + ssize_t wrote; + /* The return code is zero if the frame is complete, but + * there may be multiple frames concatenated together. + * Zstd will automatically reset the context when a + * frame is complete. Still, calling ZSTD_DCtx_reset() + * can be useful to reset the context to a clean state, + * for instance if the last decompression call returned + * an error. + */ + last_result = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(last_result)) { + has_error = true; + break; + } + + if (left < output.pos) /* checked before writing, so fdt never receives more than max_bytes */ + return -EFBIG; + + wrote = loop_write_full(fdt, output.dst, output.pos, USEC_INFINITY); + if (wrote < 0) + return wrote; + + left -= output.pos; + } + if (has_error) /* leave the read loop as well; the error is reported through last_result below */ + break; + } + + if (in_bytes == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "ZSTD decoder failed: no data read"); + + if (last_result != 0) { + /* The last return value from ZSTD_decompressStream did not end + * on a frame, but we reached the end of the file! We assume + * this is an error, and the input was truncated.
+ */ + log_debug("ZSTD decoder failed: %s", ZSTD_getErrorName(last_result)); + return zstd_ret_to_errno(last_result); + } + + log_debug( + "ZSTD decompression finished (%" PRIu64 " -> %" PRIu64 " bytes, %.1f%%)", + in_bytes, + max_bytes - left, + (double) (max_bytes - left) / in_bytes * 100); + return 0; +#else + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), + "Cannot decompress file. Compiled without ZSTD support."); +#endif +} + +int decompress_stream(const char *filename, int fdf, int fdt, uint64_t max_bytes) { /* Selects the stream decompressor from the file name suffix (.lz4, .xz, .zst); any other name yields -EPROTONOSUPPORT. */ + + if (endswith(filename, ".lz4")) + return decompress_stream_lz4(fdf, fdt, max_bytes); + else if (endswith(filename, ".xz")) + return decompress_stream_xz(fdf, fdt, max_bytes); + else if (endswith(filename, ".zst")) + return decompress_stream_zstd(fdf, fdt, max_bytes); + else + return -EPROTONOSUPPORT; +} diff --git a/src/basic/compress.h b/src/basic/compress.h new file mode 100644 index 0000000..1b5c645 --- /dev/null +++ b/src/basic/compress.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +typedef enum Compression { /* Compression algorithm identifiers; the values index compression_table[] in compress.c. */ + COMPRESSION_NONE, + COMPRESSION_XZ, + COMPRESSION_LZ4, + COMPRESSION_ZSTD, + _COMPRESSION_MAX, /* number of real entries; not an algorithm itself */ + _COMPRESSION_INVALID = -EINVAL, +} Compression; + +const char* compression_to_string(Compression compression); +Compression compression_from_string(const char *compression); + +bool compression_supported(Compression c); + +/* Blob compressors: write into the caller-provided dst and store the compressed length in *dst_size. */ int compress_blob_xz(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); +int compress_blob_lz4(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); +int compress_blob_zstd(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); + +/* Blob decompressors: (re)allocate *dst as needed and store the decompressed length in *dst_size. */ int decompress_blob_xz(const void *src, uint64_t src_size, + void **dst, size_t* dst_size, size_t dst_max); +int decompress_blob_lz4(const void *src, uint64_t src_size, + void **dst, size_t*
dst_size, size_t dst_max); +int decompress_blob_zstd(const void *src, uint64_t src_size, + void **dst, size_t* dst_size, size_t dst_max); +int decompress_blob(Compression compression, + const void *src, uint64_t src_size, + void **dst, size_t* dst_size, size_t dst_max); + +/* Prefix checks: test whether the decompressed data starts with prefix followed by the byte extra. */ int decompress_startswith_xz(const void *src, uint64_t src_size, + void **buffer, + const void *prefix, size_t prefix_len, + uint8_t extra); +int decompress_startswith_lz4(const void *src, uint64_t src_size, + void **buffer, + const void *prefix, size_t prefix_len, + uint8_t extra); +int decompress_startswith_zstd(const void *src, uint64_t src_size, + void **buffer, + const void *prefix, size_t prefix_len, + uint8_t extra); +int decompress_startswith(Compression compression, + const void *src, uint64_t src_size, + void **buffer, + const void *prefix, size_t prefix_len, + uint8_t extra); + +/* Stream compressors: read from fdf, write to fdt. */ int compress_stream_xz(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size); +int compress_stream_lz4(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size); +int compress_stream_zstd(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size); + +/* NOTE(review): the definitions in compress.c name the last parameter max_bytes, not max_size. */ int decompress_stream_xz(int fdf, int fdt, uint64_t max_size); +int decompress_stream_lz4(int fdf, int fdt, uint64_t max_size); +int decompress_stream_zstd(int fdf, int fdt, uint64_t max_size); + +static inline int compress_blob( + Compression compression, + const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size) { /* Dispatches to the blob compressor for 'compression'; COMPRESSION_NONE and unknown values yield -EOPNOTSUPP. */ + + switch (compression) { + case COMPRESSION_ZSTD: + return compress_blob_zstd(src, src_size, dst, dst_alloc_size, dst_size); + case COMPRESSION_LZ4: + return compress_blob_lz4(src, src_size, dst, dst_alloc_size, dst_size); + case COMPRESSION_XZ: + return compress_blob_xz(src, src_size, dst, dst_alloc_size, dst_size); + default: + return -EOPNOTSUPP; + } +} + +static inline int compress_stream(int fdf, int fdt, uint64_t max_bytes, uint64_t *ret_uncompressed_size) { /* Stream-compresses with the algorithm selected by DEFAULT_COMPRESSION. */ + switch
(DEFAULT_COMPRESSION) { + case COMPRESSION_ZSTD: + return compress_stream_zstd(fdf, fdt, max_bytes, ret_uncompressed_size); + case COMPRESSION_LZ4: + return compress_stream_lz4(fdf, fdt, max_bytes, ret_uncompressed_size); + case COMPRESSION_XZ: + return compress_stream_xz(fdf, fdt, max_bytes, ret_uncompressed_size); + default: + return -EOPNOTSUPP; + } +} + +static inline const char* default_compression_extension(void) { /* File name suffix matching DEFAULT_COMPRESSION, or "" when it is none of ZSTD, LZ4 or XZ. */ + switch (DEFAULT_COMPRESSION) { + case COMPRESSION_ZSTD: + return ".zst"; + case COMPRESSION_LZ4: + return ".lz4"; + case COMPRESSION_XZ: + return ".xz"; + default: + return ""; + } +} + +int decompress_stream(const char *filename, int fdf, int fdt, uint64_t max_bytes); diff --git a/src/basic/conf-files.c b/src/basic/conf-files.c new file mode 100644 index 0000000..a56f82f --- /dev/null +++ b/src/basic/conf-files.c @@ -0,0 +1,374 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "chase.h" +#include "conf-files.h" +#include "constants.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "hashmap.h" +#include "log.h" +#include "macro.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "set.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +static int files_add( + DIR *dir, + const char *dirpath, + Hashmap **files, + Set **masked, + const char *suffix, + unsigned flags) { /* Scans one directory and records matching entries in *files, keyed by file name; names of mask entries are collected in *masked. */ + + int r; + + assert(dir); + assert(dirpath); + assert(files); + assert(masked); + + FOREACH_DIRENT(de, dir, return -errno) { + _cleanup_free_ char *n = NULL, *p = NULL; + struct stat st; + + /* Does this match the suffix? */ + if (suffix && !endswith(de->d_name, suffix)) + continue; + + /* Has this file already been found in an earlier directory? */ + if (hashmap_contains(*files, de->d_name)) { + log_debug("Skipping overridden file '%s/%s'.", dirpath, de->d_name); + continue; + } + + /* Has this been masked in an earlier directory?
*/ + if ((flags & CONF_FILES_FILTER_MASKED) && set_contains(*masked, de->d_name)) { + log_debug("File '%s/%s' is masked by previous entry.", dirpath, de->d_name); + continue; + } + + /* Read file metadata if we shall validate the check for file masks, for node types or whether the node is marked executable. */ + if (flags & (CONF_FILES_FILTER_MASKED|CONF_FILES_REGULAR|CONF_FILES_DIRECTORY|CONF_FILES_EXECUTABLE)) + if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) { /* an entry that cannot be stat'ed is skipped, it does not fail the scan */ + log_debug_errno(errno, "Failed to stat '%s/%s', ignoring: %m", dirpath, de->d_name); + continue; + } + + /* Is this a masking entry? */ + if ((flags & CONF_FILES_FILTER_MASKED)) + if (null_or_empty(&st)) { + /* Mark this one as masked */ + r = set_put_strdup(masked, de->d_name); + if (r < 0) + return r; + + log_debug("File '%s/%s' is a mask.", dirpath, de->d_name); + continue; + } + + /* Does this node have the right type? */ + if (flags & (CONF_FILES_REGULAR|CONF_FILES_DIRECTORY)) + if (!((flags & CONF_FILES_DIRECTORY) && S_ISDIR(st.st_mode)) && + !((flags & CONF_FILES_REGULAR) && S_ISREG(st.st_mode))) { /* neither requested type matches this node */ + log_debug("Ignoring '%s/%s', as it does not have the right type.", dirpath, de->d_name); + continue; + } + + /* Does this node have the executable bit set? */ + if (flags & CONF_FILES_EXECUTABLE) + /* As requested: check if the file is marked executable. Note that we don't check access(X_OK) + * here, as we care about whether the file is marked executable at all, and not whether it is + * executable for us, because if so, such errors are stuff we should log about.
*/ + + if ((st.st_mode & 0111) == 0) { /* not executable */ + log_debug("Ignoring '%s/%s', as it is not marked executable.", dirpath, de->d_name); + continue; + } + + n = strdup(de->d_name); + if (!n) + return -ENOMEM; + + if ((flags & CONF_FILES_BASENAME)) /* basename mode: the file name serves as both key and value */ + r = hashmap_ensure_put(files, &string_hash_ops_free, n, n); + else { + p = path_join(dirpath, de->d_name); + if (!p) + return -ENOMEM; + + r = hashmap_ensure_put(files, &string_hash_ops_free_free, n, p); + } + if (r < 0) + return r; + assert(r > 0); /* the name was checked with hashmap_contains() above */ + + TAKE_PTR(n); /* presumably now owned by the hashmap (see the *_free hash ops) */ + TAKE_PTR(p); + } + + return 0; +} + +static int base_cmp(char * const *a, char * const *b) { /* Sort comparator: orders two paths with path_compare_filename(). */ + assert(a); + assert(b); + return path_compare_filename(*a, *b); +} + +static int copy_and_sort_files_from_hashmap(Hashmap *fh, char ***ret) { /* Returns a newly allocated copy of the hashmap's strings in *ret, sorted with base_cmp(). */ + _cleanup_free_ char **sv = NULL; + char **files; + + assert(ret); + + sv = hashmap_get_strv(fh); + if (!sv) + return -ENOMEM; + + /* The entries in the array given by hashmap_get_strv() are still owned by the hashmap. */ + files = strv_copy(sv); + if (!files) + return -ENOMEM; + + typesafe_qsort(files, strv_length(files), base_cmp); + + *ret = files; + return 0; +} + +int conf_files_list_strv( + char ***ret, + const char *suffix, + const char *root, + unsigned flags, + const char * const *dirs) { /* Collects matching files from all of dirs (opened with root and CHASE_PREFIX_ROOT) and returns them sorted in *ret. Directories that cannot be opened or scanned are skipped, except that -ENOMEM from the scan is returned. */ + + _cleanup_hashmap_free_ Hashmap *fh = NULL; + _cleanup_set_free_ Set *masked = NULL; + int r; + + assert(ret); + + STRV_FOREACH(p, dirs) { + _cleanup_closedir_ DIR *dir = NULL; + _cleanup_free_ char *path = NULL; + + r = chase_and_opendir(*p, root, CHASE_PREFIX_ROOT, &path, &dir); + if (r < 0) { + if (r != -ENOENT) /* a missing directory is skipped without logging */ + log_debug_errno(r, "Failed to chase and open directory '%s', ignoring: %m", *p); + continue; + } + + r = files_add(dir, path, &fh, &masked, suffix, flags); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to search for files in '%s', ignoring: %m", path); + } + + return copy_and_sort_files_from_hashmap(fh, ret); +} + +int conf_files_list_strv_at( + char ***ret, + const char
*suffix, + int rfd, + unsigned flags, + const char * const *dirs) { /* Same flow as conf_files_list_strv(), but each directory is opened via chase_and_opendirat() on rfd with CHASE_AT_RESOLVE_IN_ROOT. */ + + _cleanup_hashmap_free_ Hashmap *fh = NULL; + _cleanup_set_free_ Set *masked = NULL; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(ret); + + STRV_FOREACH(p, dirs) { + _cleanup_closedir_ DIR *dir = NULL; + _cleanup_free_ char *path = NULL; + + r = chase_and_opendirat(rfd, *p, CHASE_AT_RESOLVE_IN_ROOT, &path, &dir); + if (r < 0) { + if (r != -ENOENT) /* a missing directory is skipped without logging */ + log_debug_errno(r, "Failed to chase and open directory '%s', ignoring: %m", *p); + continue; + } + + r = files_add(dir, path, &fh, &masked, suffix, flags); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to search for files in '%s', ignoring: %m", path); + } + + return copy_and_sort_files_from_hashmap(fh, ret); +} + +int conf_files_insert(char ***strv, const char *root, char **dirs, const char *path) { + /* Insert a path into strv, at the place honouring the usual sorting rules: + * - we first compare by the basename + * - and then we compare by dirname, allowing just one file with the given + * basename. + * This means that we will + * - add a new entry if basename(path) was not on the list, + * - do nothing if an entry with higher priority was already present, + * - do nothing if our new entry matches the existing entry, + * - replace the existing entry if our new entry has higher priority. + */ + size_t i, n; + char *t; + int r; + + n = strv_length(*strv); + for (i = 0; i < n; i++) { + int c; + + c = base_cmp((char* const*) *strv + i, (char* const*) &path); + if (c == 0) + /* Oh, there already is an entry with a matching name (the last component). */ + STRV_FOREACH(dir, dirs) { /* the first directory that contains either the existing entry or the new path decides */ + _cleanup_free_ char *rdir = NULL; + char *p1, *p2; + + rdir = path_join(root, *dir); + if (!rdir) + return -ENOMEM; + + p1 = path_startswith((*strv)[i], rdir); + if (p1) + /* Existing entry with higher priority + * or same priority, no need to do anything.
*/ + return 0; + + p2 = path_startswith(path, *dir); /* the new path is matched against the directory as given, without root */ + if (p2) { + /* Our new entry has higher priority */ + + t = path_join(root, path); + if (!t) + return log_oom(); /* NOTE(review): this failure goes through log_oom(), while the path_join() failure further down returns -ENOMEM directly; confirm whether the difference is intended */ + + return free_and_replace((*strv)[i], t); + } + } + + else if (c > 0) + /* Following files have lower priority, let's go insert our + * new entry. */ + break; + + /* … we are not there yet, let's continue */ + } + + /* The new file has lower priority than all the existing entries */ + t = path_join(root, path); + if (!t) + return -ENOMEM; + + r = strv_insert(strv, i, t); /* i is either the index at which the loop broke out or n */ + if (r < 0) + free(t); + + return r; +} + +int conf_files_list(char ***ret, const char *suffix, const char *root, unsigned flags, const char *dir) { /* Single-directory wrapper: wraps dir with STRV_MAKE_CONST() and forwards to conf_files_list_strv(). */ + return conf_files_list_strv(ret, suffix, root, flags, STRV_MAKE_CONST(dir)); +} + +int conf_files_list_at(char ***ret, const char *suffix, int rfd, unsigned flags, const char *dir) { /* Single-directory wrapper: wraps dir with STRV_MAKE_CONST() and forwards to conf_files_list_strv_at(). */ + return conf_files_list_strv_at(ret, suffix, rfd, flags, STRV_MAKE_CONST(dir)); +} + +int conf_files_list_nulstr(char ***ret, const char *suffix, const char *root, unsigned flags, const char *dirs) { /* Splits dirs with strv_split_nulstr() and forwards the resulting list to conf_files_list_strv(); returns -ENOMEM if the split yields NULL. */ + _cleanup_strv_free_ char **d = NULL; + + assert(ret); + + d = strv_split_nulstr(dirs); + if (!d) + return -ENOMEM; + + return conf_files_list_strv(ret, suffix, root, flags, (const char**) d); +} + +int conf_files_list_nulstr_at(char ***ret, const char *suffix, int rfd, unsigned flags, const char *dirs) { /* Splits dirs with strv_split_nulstr() and forwards the resulting list to conf_files_list_strv_at(); returns -ENOMEM if the split yields NULL. */ + _cleanup_strv_free_ char **d = NULL; + + assert(ret); + + d = strv_split_nulstr(dirs); + if (!d) + return -ENOMEM; + + return conf_files_list_strv_at(ret, suffix, rfd, flags, (const char**) d); +} + +int conf_files_list_with_replacement( + const char *root, + char **config_dirs, + const char *replacement, + char ***ret_files, + char **ret_replace_file) { /* Lists the ".conf" files of config_dirs under root into *ret_files. When replacement is non-NULL it is also merged into that list with conf_files_insert(), and its root-prefixed path is handed back through ret_replace_file, which must then be non-NULL (see the assert below). */ + + _cleanup_strv_free_ char **f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(config_dirs); + assert(ret_files); + assert(ret_replace_file || !replacement); + + r = conf_files_list_strv(&f, ".conf", root, 0, (const char* const*)
config_dirs); + if (r < 0) + return log_error_errno(r, "Failed to enumerate config files: %m"); + + if (replacement) { + r = conf_files_insert(&f, root, config_dirs, replacement); + if (r < 0) + return log_error_errno(r, "Failed to extend config file list: %m"); + + p = path_join(root, replacement); + if (!p) + return log_oom(); + } + + *ret_files = TAKE_PTR(f); + if (ret_replace_file) + *ret_replace_file = TAKE_PTR(p); /* p is only assigned inside the replacement branch above */ + + return 0; +} + +int conf_files_list_dropins( + char ***ret, + const char *dropin_dirname, + const char *root, + const char * const *dirs) { /* Appends "/" followed by dropin_dirname to every entry of dirs via strv_extend_strv_concat(), then lists the ".conf" files of the resulting directories with conf_files_list_strv() (flags 0). Returns the error of strv_extend_strv_concat() if that fails, otherwise the result of conf_files_list_strv(). */ + + _cleanup_strv_free_ char **dropin_dirs = NULL; + const char *suffix; + int r; + + assert(ret); + assert(dropin_dirname); + assert(dirs); + + suffix = strjoina("/", dropin_dirname); + r = strv_extend_strv_concat(&dropin_dirs, (char**) dirs, suffix); + if (r < 0) + return r; + + return conf_files_list_strv(ret, ".conf", root, 0, (const char* const*) dropin_dirs); +} diff --git a/src/basic/conf-files.h b/src/basic/conf-files.h new file mode 100644 index 0000000..566cc8f --- /dev/null +++ b/src/basic/conf-files.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +enum { /* Bit values for the 'flags' argument of the conf_files_list functions declared below. */ + CONF_FILES_EXECUTABLE = 1 << 0, + CONF_FILES_REGULAR = 1 << 1, + CONF_FILES_DIRECTORY = 1 << 2, + CONF_FILES_BASENAME = 1 << 3, /* files_add() in conf-files.c then stores the file name as both key and value instead of the joined path */ + CONF_FILES_FILTER_MASKED = 1 << 4, +}; + +int conf_files_list(char ***ret, const char *suffix, const char *root, unsigned flags, const char *dir); +int conf_files_list_at(char ***ret, const char *suffix, int rfd, unsigned flags, const char *dir); +int conf_files_list_strv(char ***ret, const char *suffix, const char *root, unsigned flags, const char* const* dirs); +int conf_files_list_strv_at(char ***ret, const char *suffix, int rfd, unsigned flags, const char * const *dirs); +int conf_files_list_nulstr(char ***ret, const char *suffix, const char *root, unsigned flags, const char *dirs); +int conf_files_list_nulstr_at(char ***ret, const char *suffix, int rfd,
unsigned flags, const char *dirs); +int conf_files_insert(char ***strv, const char *root, char **dirs, const char *path); +int conf_files_list_with_replacement( + const char *root, + char **config_dirs, + const char *replacement, + char ***files, + char **replace_file); +int conf_files_list_dropins( + char ***ret, + const char *dropin_dirname, + const char *root, + const char * const *dirs); diff --git a/src/basic/confidential-virt.c b/src/basic/confidential-virt.c new file mode 100644 index 0000000..b6521cf --- /dev/null +++ b/src/basic/confidential-virt.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if defined(__i386__) || defined(__x86_64__) +#include +#endif +#include +#include +#include +#include + +#include "confidential-virt-fundamental.h" +#include "confidential-virt.h" +#include "fd-util.h" +#include "missing_threads.h" +#include "string-table.h" +#include "utf8.h" + + +#if defined(__x86_64__) + +static void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { + log_debug("CPUID func %" PRIx32 " %" PRIx32, *eax, *ecx); + __cpuid_count(*eax, *ecx, *eax, *ebx, *ecx, *edx); + log_debug("CPUID result %" PRIx32 " %" PRIx32 " %" PRIx32 " %" PRIx32, *eax, *ebx, *ecx, *edx); +} + +static uint32_t cpuid_leaf(uint32_t eax, char ret_sig[static 13], bool swapped) { + /* zero-init as some queries explicitly require subleaf == 0 */ + uint32_t sig[3] = {}; + + if (swapped) + cpuid(&eax, &sig[0], &sig[2], &sig[1]); + else + cpuid(&eax, &sig[0], &sig[1], &sig[2]); + memcpy(ret_sig, sig, sizeof(sig)); + ret_sig[12] = 0; /* \0-terminate the string to make string comparison possible */ + + /* In some CI tests ret_sig doesn't contain valid UTF8 and prints garbage to the console */ + log_debug("CPUID sig '%s'", strna(utf8_is_valid(ret_sig))); + + return eax; +} + +#define MSR_DEVICE "/dev/cpu/0/msr" + +static uint64_t msr(uint64_t index) { + uint64_t ret; + ssize_t rv; + _cleanup_close_ int fd = -EBADF; + + fd = open(MSR_DEVICE, 
O_RDONLY|O_CLOEXEC); + if (fd < 0) { + log_debug_errno(errno, + "Cannot open MSR device %s (index %" PRIu64 "), ignoring: %m", + MSR_DEVICE, + index); + return 0; + } + + rv = pread(fd, &ret, sizeof(ret), index); + if (rv < 0) { + log_debug_errno(errno, + "Cannot read MSR device %s (index %" PRIu64 "), ignoring: %m", + MSR_DEVICE, + index); + return 0; + } else if (rv != sizeof(ret)) { + log_debug("Short read %zd bytes from MSR device %s (index %" PRIu64 "), ignoring", + rv, + MSR_DEVICE, + index); + return 0; + } + + log_debug("MSR %" PRIu64 " result %" PRIu64 "", index, ret); + return ret; +} + +static bool detect_hyperv_sev(void) { + uint32_t eax, ebx, ecx, edx, feat; + char sig[13] = {}; + + feat = cpuid_leaf(CPUID_HYPERV_VENDOR_AND_MAX_FUNCTIONS, sig, false); + + if (feat < CPUID_HYPERV_MIN || feat > CPUID_HYPERV_MAX) + return false; + + if (memcmp(sig, CPUID_SIG_HYPERV, sizeof(sig)) != 0) + return false; + + log_debug("CPUID is on hyperv"); + eax = CPUID_HYPERV_FEATURES; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + + if (ebx & CPUID_HYPERV_ISOLATION && !(ebx & CPUID_HYPERV_CPU_MANAGEMENT)) { + + eax = CPUID_HYPERV_ISOLATION_CONFIG; + ebx = ecx = edx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + if ((ebx & CPUID_HYPERV_ISOLATION_TYPE_MASK) == CPUID_HYPERV_ISOLATION_TYPE_SNP) + return true; + } + + return false; +} + +static ConfidentialVirtualization detect_sev(void) { + uint32_t eax, ebx, ecx, edx; + uint64_t msrval; + + eax = CPUID_GET_HIGHEST_FUNCTION; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + + if (eax < CPUID_AMD_GET_ENCRYPTED_MEMORY_CAPABILITIES) + return CONFIDENTIAL_VIRTUALIZATION_NONE; + + eax = CPUID_AMD_GET_ENCRYPTED_MEMORY_CAPABILITIES; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + + /* bit 1 == CPU supports SEV feature + * + * Note, Azure blocks this CPUID leaf from its SEV-SNP + * guests, so we must fallback to trying some HyperV + * specific CPUID checks. 
+ */ + if (!(eax & EAX_SEV)) { + log_debug("No sev in CPUID, trying hyperv CPUID"); + + if (detect_hyperv_sev()) + return CONFIDENTIAL_VIRTUALIZATION_SEV_SNP; + + log_debug("No hyperv CPUID"); + return CONFIDENTIAL_VIRTUALIZATION_NONE; + } + + msrval = msr(MSR_AMD64_SEV); + + /* Test reverse order, since the SEV-SNP bit implies + * the SEV-ES bit, which implies the SEV bit */ + if (msrval & MSR_SEV_SNP) + return CONFIDENTIAL_VIRTUALIZATION_SEV_SNP; + if (msrval & MSR_SEV_ES) + return CONFIDENTIAL_VIRTUALIZATION_SEV_ES; + if (msrval & MSR_SEV) + return CONFIDENTIAL_VIRTUALIZATION_SEV; + + return CONFIDENTIAL_VIRTUALIZATION_NONE; +} + +static ConfidentialVirtualization detect_tdx(void) { + uint32_t eax, ebx, ecx, edx; + char sig[13] = {}; + + eax = CPUID_GET_HIGHEST_FUNCTION; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + + if (eax < CPUID_INTEL_TDX_ENUMERATION) + return CONFIDENTIAL_VIRTUALIZATION_NONE; + + cpuid_leaf(CPUID_INTEL_TDX_ENUMERATION, sig, true); + + if (memcmp(sig, CPUID_SIG_INTEL_TDX, sizeof(sig)) == 0) + return CONFIDENTIAL_VIRTUALIZATION_TDX; + + return CONFIDENTIAL_VIRTUALIZATION_NONE; +} + +static bool detect_hypervisor(void) { + uint32_t eax, ebx, ecx, edx; + bool is_hv; + + eax = CPUID_PROCESSOR_INFO_AND_FEATURE_BITS; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + + is_hv = ecx & CPUID_FEATURE_HYPERVISOR; + + log_debug("CPUID is hypervisor: %s", yes_no(is_hv)); + return is_hv; +} + +ConfidentialVirtualization detect_confidential_virtualization(void) { + static thread_local ConfidentialVirtualization cached_found = _CONFIDENTIAL_VIRTUALIZATION_INVALID; + char sig[13] = {}; + ConfidentialVirtualization cv = CONFIDENTIAL_VIRTUALIZATION_NONE; + + if (cached_found >= 0) + return cached_found; + + /* Skip everything on bare metal */ + if (detect_hypervisor()) { + cpuid_leaf(0, sig, true); + + if (memcmp(sig, CPUID_SIG_AMD, sizeof(sig)) == 0) + cv = detect_sev(); + else if (memcmp(sig, CPUID_SIG_INTEL, sizeof(sig)) == 0) + cv = 
detect_tdx(); + } + + cached_found = cv; + return cv; +} +#else /* ! x86_64 */ +ConfidentialVirtualization detect_confidential_virtualization(void) { + log_debug("No confidential virtualization detection on this architecture"); + return CONFIDENTIAL_VIRTUALIZATION_NONE; +} +#endif /* ! x86_64 */ + +static const char *const confidential_virtualization_table[_CONFIDENTIAL_VIRTUALIZATION_MAX] = { + [CONFIDENTIAL_VIRTUALIZATION_NONE] = "none", + [CONFIDENTIAL_VIRTUALIZATION_SEV] = "sev", + [CONFIDENTIAL_VIRTUALIZATION_SEV_ES] = "sev-es", + [CONFIDENTIAL_VIRTUALIZATION_SEV_SNP] = "sev-snp", + [CONFIDENTIAL_VIRTUALIZATION_TDX] = "tdx", +}; + +DEFINE_STRING_TABLE_LOOKUP(confidential_virtualization, ConfidentialVirtualization); diff --git a/src/basic/confidential-virt.h b/src/basic/confidential-virt.h new file mode 100644 index 0000000..c02f3b2 --- /dev/null +++ b/src/basic/confidential-virt.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "errno-list.h" +#include "macro.h" + +typedef enum ConfidentialVirtualization { + CONFIDENTIAL_VIRTUALIZATION_NONE = 0, + + CONFIDENTIAL_VIRTUALIZATION_SEV, + CONFIDENTIAL_VIRTUALIZATION_SEV_ES, + CONFIDENTIAL_VIRTUALIZATION_SEV_SNP, + CONFIDENTIAL_VIRTUALIZATION_TDX, + + _CONFIDENTIAL_VIRTUALIZATION_MAX, + _CONFIDENTIAL_VIRTUALIZATION_INVALID = -EINVAL, + _CONFIDENTIAL_VIRTUALIZATION_ERRNO_MAX = -ERRNO_MAX, /* ensure full range of errno fits into this enum */ +} ConfidentialVirtualization; + +ConfidentialVirtualization detect_confidential_virtualization(void); + +const char *confidential_virtualization_to_string(ConfidentialVirtualization v) _const_; +ConfidentialVirtualization confidential_virtualization_from_string(const char *s) _pure_; diff --git a/src/basic/constants.h b/src/basic/constants.h new file mode 100644 index 0000000..6bb5f3c --- /dev/null +++ b/src/basic/constants.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if 
!defined(HAS_FEATURE_MEMORY_SANITIZER) +# if defined(__has_feature) +# if __has_feature(memory_sanitizer) +# define HAS_FEATURE_MEMORY_SANITIZER 1 +# endif +# endif +# if !defined(HAS_FEATURE_MEMORY_SANITIZER) +# define HAS_FEATURE_MEMORY_SANITIZER 0 +# endif +#endif + +#if !defined(HAS_FEATURE_ADDRESS_SANITIZER) +# ifdef __SANITIZE_ADDRESS__ +# define HAS_FEATURE_ADDRESS_SANITIZER 1 +# elif defined(__has_feature) +# if __has_feature(address_sanitizer) +# define HAS_FEATURE_ADDRESS_SANITIZER 1 +# endif +# endif +# if !defined(HAS_FEATURE_ADDRESS_SANITIZER) +# define HAS_FEATURE_ADDRESS_SANITIZER 0 +# endif +#endif + +#define DEFAULT_RESTART_USEC (100*USEC_PER_MSEC) + +/* Many different things, but also system unit start/stop */ +#define DEFAULT_TIMEOUT_USEC (DEFAULT_TIMEOUT_SEC*USEC_PER_SEC) +/* User unit start/stop */ +#define DEFAULT_USER_TIMEOUT_USEC (DEFAULT_USER_TIMEOUT_SEC*USEC_PER_SEC) +/* Timeout for user confirmation on the console */ +#define DEFAULT_CONFIRM_USEC (30*USEC_PER_SEC) + +/* We use an extra-long timeout for the reload. This is because a reload or reexec means generators are rerun + * which are timed out after DEFAULT_TIMEOUT_USEC. Let's use twice that time here, so that the generators can + * have their timeout, and for everything else there's the same time budget in place. */ +#define DAEMON_RELOAD_TIMEOUT_SEC (DEFAULT_TIMEOUT_USEC * 2) + +#define DEFAULT_START_LIMIT_INTERVAL (10*USEC_PER_SEC) +#define DEFAULT_START_LIMIT_BURST 5 + +/* Wait for 1.5 seconds at maximum for freeze operation */ +#define FREEZE_TIMEOUT (1500 * USEC_PER_MSEC) + +/* The default time after which exit-on-idle services exit. This + * should be kept lower than the watchdog timeout, because otherwise + * the watchdog pings will keep the loop busy. 
*/ +#define DEFAULT_EXIT_USEC (30*USEC_PER_SEC) + +/* The default value for the net.unix.max_dgram_qlen sysctl */ +#define DEFAULT_UNIX_MAX_DGRAM_QLEN 512 + +#define SIGNALS_CRASH_HANDLER SIGSEGV,SIGILL,SIGFPE,SIGBUS,SIGQUIT,SIGABRT +#define SIGNALS_IGNORE SIGPIPE + +#define NOTIFY_FD_MAX 768 +#define NOTIFY_BUFFER_MAX PIPE_BUF + +/* Return a nulstr for a standard cascade of configuration paths, suitable to pass to + * conf_files_list_nulstr() to implement drop-in directories for extending configuration files. */ +#define CONF_PATHS_NULSTR(n) \ + "/etc/" n "\0" \ + "/run/" n "\0" \ + "/usr/local/lib/" n "\0" \ + "/usr/lib/" n "\0" + +#define CONF_PATHS_USR(n) \ + "/etc/" n, \ + "/run/" n, \ + "/usr/local/lib/" n, \ + "/usr/lib/" n + +#define CONF_PATHS(n) \ + CONF_PATHS_USR(n) + +#define CONF_PATHS_USR_STRV(n) \ + STRV_MAKE(CONF_PATHS_USR(n)) + +#define CONF_PATHS_STRV(n) \ + STRV_MAKE(CONF_PATHS(n)) + +/* The limit for PID 1 itself (which is not inherited to children) */ +#define HIGH_RLIMIT_MEMLOCK (1024ULL*1024ULL*64ULL) + +/* Since kernel 5.16 the kernel default limit was raised to 8M. Let's adjust things on old kernels too, and + * in containers so that our children inherit that. */ +#define DEFAULT_RLIMIT_MEMLOCK (1024ULL*1024ULL*8ULL) + +/* Path where PID1 listens for varlink subscriptions from systemd-oomd to notify of changes in ManagedOOM settings. */ +#define VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM "/run/systemd/io.systemd.ManagedOOM" +/* Path where systemd-oomd listens for varlink connections from user managers to report changes in ManagedOOM settings. 
*/ +#define VARLINK_ADDR_PATH_MANAGED_OOM_USER "/run/systemd/oom/io.systemd.ManagedOOM" + +#define KERNEL_BASELINE_VERSION "4.15" diff --git a/src/basic/coverage.h b/src/basic/coverage.h new file mode 100644 index 0000000..5c30482 --- /dev/null +++ b/src/basic/coverage.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* Use the coverage-related tweaks below only for C stuff as they're not really + * C++ compatible, and the only thing that is built with a C++ compiler is + * the lone test-bus-vtable-cc unit test. + */ +#ifndef __cplusplus + +void __gcov_dump(void); +void __gcov_reset(void); + +/* When built with --coverage (gcov) we need to explicitly call __gcov_dump() + * in places where we use _exit(), since _exit() skips at-exit hooks resulting + * in lost coverage. + * + * To make sure we don't miss any _exit() calls, this header file is included + * explicitly on the compiler command line via the -include directive (only + * when built with -Db_coverage=true) + */ +void _exit(int); + +static inline _Noreturn void _coverage__exit(int status) { + __gcov_dump(); + _exit(status); +} +#define _exit(x) _coverage__exit(x) + +/* gcov provides wrappers for the exec*() calls but there's none for execveat() + * and execvpe() which means we lose all coverage prior to such call. To mitigate + * this, let's add simple wrappers in gcov's style[0] for these exec*() calls, + * which dump and reset the coverage data as needed. 
+ * + * [0] https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libgcc/libgcov-interface.c;h=b2ee930864183b78c8826255183ca86e15e21ded;hb=HEAD + */ + +int execveat(int, const char *, char * const [], char * const [], int); +int execvpe(const char *, char * const [], char * const []); + +static inline int _coverage_execveat( + int dirfd, + const char *pathname, + char * const argv[], + char * const envp[], + int flags) { + __gcov_dump(); + int r = execveat(dirfd, pathname, argv, envp, flags); + __gcov_reset(); + + return r; +} +#define execveat(d,p,a,e,f) _coverage_execveat(d, p, a, e, f) + +static inline int _coverage_execvpe( + const char *file, + char * const argv[], + char * const envp[]) { + __gcov_dump(); + int r = execvpe(file, argv, envp); + __gcov_reset(); + + return r; +} +#define execvpe(f,a,e) _coverage_execvpe(f, a, e) + +#endif diff --git a/src/basic/devnum-util.c b/src/basic/devnum-util.c new file mode 100644 index 0000000..f82e13b --- /dev/null +++ b/src/basic/devnum-util.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "chase.h" +#include "devnum-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "string-util.h" + +int parse_devnum(const char *s, dev_t *ret) { + const char *major; + unsigned x, y; + size_t n; + int r; + + n = strspn(s, DIGITS); + if (n == 0) + return -EINVAL; + if (n > DECIMAL_STR_MAX(dev_t)) + return -EINVAL; + if (s[n] != ':') + return -EINVAL; + + major = strndupa_safe(s, n); + r = safe_atou(major, &x); + if (r < 0) + return r; + + r = safe_atou(s + n + 1, &y); + if (r < 0) + return r; + + if (!DEVICE_MAJOR_VALID(x) || !DEVICE_MINOR_VALID(y)) + return -ERANGE; + + *ret = makedev(x, y); + return 0; +} + +int device_path_make_major_minor(mode_t mode, dev_t devnum, char **ret) { + const char *t; + + /* Generates the /dev/{char|block}/MAJOR:MINOR path for a dev_t */ + + if (S_ISCHR(mode)) + t = "char"; + else if (S_ISBLK(mode)) + t = "block"; + else + return -ENODEV; + + 
if (asprintf(ret, "/dev/%s/" DEVNUM_FORMAT_STR, t, DEVNUM_FORMAT_VAL(devnum)) < 0) + return -ENOMEM; + + return 0; +} + +int device_path_make_inaccessible(mode_t mode, char **ret) { + char *s; + + assert(ret); + + if (S_ISCHR(mode)) + s = strdup("/run/systemd/inaccessible/chr"); + else if (S_ISBLK(mode)) + s = strdup("/run/systemd/inaccessible/blk"); + else + return -ENODEV; + if (!s) + return -ENOMEM; + + *ret = s; + return 0; +} + +int device_path_make_canonical(mode_t mode, dev_t devnum, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + /* Finds the canonical path for a device, i.e. resolves the /dev/{char|block}/MAJOR:MINOR path to the end. */ + + assert(ret); + + if (devnum_is_zero(devnum)) + /* A special hack to make sure our 'inaccessible' device nodes work. They won't have symlinks in + * /dev/block/ and /dev/char/, hence we handle them specially here. */ + return device_path_make_inaccessible(mode, ret); + + r = device_path_make_major_minor(mode, devnum, &p); + if (r < 0) + return r; + + return chase(p, NULL, 0, ret, NULL); +} + +int device_path_parse_major_minor(const char *path, mode_t *ret_mode, dev_t *ret_devnum) { + mode_t mode; + dev_t devnum; + int r; + + /* Tries to extract the major/minor directly from the device path if we can. Handles /dev/block/ and /dev/char/ + * paths, as well as our synthetic inaccessible device nodes. Never goes to disk. Returns -ENODEV if the device + * path cannot be parsed like this. 
*/ + + if (path_equal(path, "/run/systemd/inaccessible/chr")) { + mode = S_IFCHR; + devnum = makedev(0, 0); + } else if (path_equal(path, "/run/systemd/inaccessible/blk")) { + mode = S_IFBLK; + devnum = makedev(0, 0); + } else { + const char *w; + + w = path_startswith(path, "/dev/block/"); + if (w) + mode = S_IFBLK; + else { + w = path_startswith(path, "/dev/char/"); + if (!w) + return -ENODEV; + + mode = S_IFCHR; + } + + r = parse_devnum(w, &devnum); + if (r < 0) + return r; + } + + if (ret_mode) + *ret_mode = mode; + if (ret_devnum) + *ret_devnum = devnum; + + return 0; +} diff --git a/src/basic/devnum-util.h b/src/basic/devnum-util.h new file mode 100644 index 0000000..e109de9 --- /dev/null +++ b/src/basic/devnum-util.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "stdio-util.h" + +int parse_devnum(const char *s, dev_t *ret); + +/* glibc and the Linux kernel have different ideas about the major/minor size. These calls will check whether the + * specified major is valid by the Linux kernel's standards, not by glibc's. Linux has 20bits of minor, and 12 bits of + * major space. See MINORBITS in linux/kdev_t.h in the kernel sources. (If you wonder why we define _y here, instead of + * comparing directly >= 0: it's to trick out -Wtype-limits, which would otherwise complain if the type is unsigned, as + * such a test would be pointless in such a case.) 
*/ + +#define DEVICE_MAJOR_VALID(x) \ + ({ \ + typeof(x) _x = (x), _y = 0; \ + _x >= _y && _x < (UINT32_C(1) << 12); \ + \ + }) + +#define DEVICE_MINOR_VALID(x) \ + ({ \ + typeof(x) _x = (x), _y = 0; \ + _x >= _y && _x < (UINT32_C(1) << 20); \ + }) + +int device_path_make_major_minor(mode_t mode, dev_t devnum, char **ret); +int device_path_make_inaccessible(mode_t mode, char **ret); +int device_path_make_canonical(mode_t mode, dev_t devnum, char **ret); +int device_path_parse_major_minor(const char *path, mode_t *ret_mode, dev_t *ret_devnum); + +static inline bool devnum_set_and_equal(dev_t a, dev_t b) { + /* Returns true if a and b definitely refer to the same device. If either is zero, this means "don't + * know" and we'll return false */ + return a == b && a != 0; +} + +/* Maximum string length for a major:minor string. (Note that DECIMAL_STR_MAX includes space for a trailing NUL) */ +#define DEVNUM_STR_MAX (DECIMAL_STR_MAX(dev_t)-1+1+DECIMAL_STR_MAX(dev_t)) + +#define DEVNUM_FORMAT_STR "%u:%u" +#define DEVNUM_FORMAT_VAL(d) major(d), minor(d) + +static inline char *format_devnum(dev_t d, char buf[static DEVNUM_STR_MAX]) { + return ASSERT_PTR(snprintf_ok(buf, DEVNUM_STR_MAX, DEVNUM_FORMAT_STR, DEVNUM_FORMAT_VAL(d))); +} + +#define FORMAT_DEVNUM(d) format_devnum((d), (char[DEVNUM_STR_MAX]) {}) + +static inline bool devnum_is_zero(dev_t d) { + return major(d) == 0 && minor(d) == 0; +} diff --git a/src/basic/dirent-util.c b/src/basic/dirent-util.c new file mode 100644 index 0000000..17df6a2 --- /dev/null +++ b/src/basic/dirent-util.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "dirent-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-util.h" + +int dirent_ensure_type(int dir_fd, struct dirent *de) { + STRUCT_STATX_DEFINE(sx); + int r; + + assert(dir_fd >= 0); + assert(de); + + if (de->d_type != DT_UNKNOWN) + return 0; + + if (dot_or_dot_dot(de->d_name)) { + de->d_type = DT_DIR; + 
return 0; + } + + /* Let's ask only for the type, nothing else. */ + r = statx_fallback(dir_fd, de->d_name, AT_SYMLINK_NOFOLLOW|AT_NO_AUTOMOUNT, STATX_TYPE, &sx); + if (r < 0) + return r; + + assert(FLAGS_SET(sx.stx_mask, STATX_TYPE)); + de->d_type = IFTODT(sx.stx_mode); + + /* If the inode is passed too, update the field, i.e. report most recent data */ + if (FLAGS_SET(sx.stx_mask, STATX_INO)) + de->d_ino = sx.stx_ino; + + return 0; +} + +bool dirent_is_file(const struct dirent *de) { + assert(de); + + if (!IN_SET(de->d_type, DT_REG, DT_LNK, DT_UNKNOWN)) + return false; + + if (hidden_or_backup_file(de->d_name)) + return false; + + return true; +} + +bool dirent_is_file_with_suffix(const struct dirent *de, const char *suffix) { + assert(de); + + if (!IN_SET(de->d_type, DT_REG, DT_LNK, DT_UNKNOWN)) + return false; + + if (de->d_name[0] == '.') + return false; + + if (!suffix) + return true; + + return endswith(de->d_name, suffix); +} + +struct dirent *readdir_ensure_type(DIR *d) { + int r; + + assert(d); + + /* Like readdir(), but fills in .d_type if it is DT_UNKNOWN */ + + for (;;) { + struct dirent *de; + + errno = 0; + de = readdir(d); + if (!de) + return NULL; + + r = dirent_ensure_type(dirfd(d), de); + if (r >= 0) + return de; + if (r != -ENOENT) { + errno = -r; /* We want to be compatible with readdir(), hence propagate error via errno here */ + return NULL; + } + + /* Vanished by now? 
Then skip immediately to next */ + } +} + +struct dirent *readdir_no_dot(DIR *d) { + assert(d); + + for (;;) { + struct dirent *de; + + de = readdir_ensure_type(d); + if (!de || !dot_or_dot_dot(de->d_name)) + return de; + } +} diff --git a/src/basic/dirent-util.h b/src/basic/dirent-util.h new file mode 100644 index 0000000..0a2fcbf --- /dev/null +++ b/src/basic/dirent-util.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" +#include "path-util.h" + +bool dirent_is_file(const struct dirent *de) _pure_; +bool dirent_is_file_with_suffix(const struct dirent *de, const char *suffix) _pure_; +int dirent_ensure_type(int dir_fd, struct dirent *de); + +struct dirent *readdir_ensure_type(DIR *d); +struct dirent *readdir_no_dot(DIR *dirp); + +#define FOREACH_DIRENT_ALL(de, d, on_error) \ + for (struct dirent *(de) = readdir_ensure_type(d);; (de) = readdir_ensure_type(d)) \ + if (!de) { \ + if (errno > 0) { \ + on_error; \ + } \ + break; \ + } else + +#define FOREACH_DIRENT(de, d, on_error) \ + FOREACH_DIRENT_ALL(de, d, on_error) \ + if (hidden_or_backup_file((de)->d_name)) \ + continue; \ + else + +/* Maximum space one dirent structure might require at most */ +#define DIRENT_SIZE_MAX CONST_MAX(sizeof(struct dirent), offsetof(struct dirent, d_name) + NAME_MAX + 1) + +/* Only if 64-bit off_t is enabled struct dirent + struct dirent64 are actually the same. We require this, and + * we want them to be interchangeable to make getdents64() work, hence verify that. */ +assert_cc(_FILE_OFFSET_BITS == 64); +/* These asserts would fail on musl where the LFS extensions don't exist. They should + * always be present on glibc however. 
*/ +#if HAVE_STRUCT_DIRENT64 +assert_cc(sizeof(struct dirent) == sizeof(struct dirent64)); +assert_cc(offsetof(struct dirent, d_ino) == offsetof(struct dirent64, d_ino)); +assert_cc(sizeof_field(struct dirent, d_ino) == sizeof_field(struct dirent64, d_ino)); +assert_cc(offsetof(struct dirent, d_off) == offsetof(struct dirent64, d_off)); +assert_cc(sizeof_field(struct dirent, d_off) == sizeof_field(struct dirent64, d_off)); +assert_cc(offsetof(struct dirent, d_reclen) == offsetof(struct dirent64, d_reclen)); +assert_cc(sizeof_field(struct dirent, d_reclen) == sizeof_field(struct dirent64, d_reclen)); +assert_cc(offsetof(struct dirent, d_type) == offsetof(struct dirent64, d_type)); +assert_cc(sizeof_field(struct dirent, d_type) == sizeof_field(struct dirent64, d_type)); +assert_cc(offsetof(struct dirent, d_name) == offsetof(struct dirent64, d_name)); +assert_cc(sizeof_field(struct dirent, d_name) == sizeof_field(struct dirent64, d_name)); +#endif + +#define FOREACH_DIRENT_IN_BUFFER(de, buf, sz) \ + for (void *_end = (uint8_t*) ({ (de) = (buf); }) + (sz); \ + (uint8_t*) (de) < (uint8_t*) _end; \ + (de) = (struct dirent*) ((uint8_t*) (de) + (de)->d_reclen)) + +#define DEFINE_DIRENT_BUFFER(name, sz) \ + union { \ + struct dirent de; \ + uint8_t data[(sz) * DIRENT_SIZE_MAX]; \ + } name diff --git a/src/basic/dns-def.h b/src/basic/dns-def.h new file mode 100644 index 0000000..d70220b --- /dev/null +++ b/src/basic/dns-def.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* Length of a single label, with all escaping removed, excluding any trailing dot or NUL byte */ +#define DNS_LABEL_MAX 63 + +/* Worst case length of a single label, with all escaping applied and room for a trailing NUL byte. 
*/ +#define DNS_LABEL_ESCAPED_MAX (DNS_LABEL_MAX*4+1) + +/* Maximum length of a full hostname, consisting of a series of unescaped labels, and no trailing dot or NUL byte */ +#define DNS_HOSTNAME_MAX 253 + +/* Maximum length of a full hostname, on the wire, including the final NUL byte */ +#define DNS_WIRE_FORMAT_HOSTNAME_MAX 255 + +/* Maximum number of labels per valid hostname */ +#define DNS_N_LABELS_MAX 127 diff --git a/src/basic/efivars.c b/src/basic/efivars.c new file mode 100644 index 0000000..9011ae2 --- /dev/null +++ b/src/basic/efivars.c @@ -0,0 +1,446 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "efivars.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "macro.h" +#include "memory-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "time-util.h" +#include "utf8.h" +#include "virt.h" + +#if ENABLE_EFI + +/* Reads from efivarfs sometimes fail with EINTR. Retry that many times. */ +#define EFI_N_RETRIES_NO_DELAY 20 +#define EFI_N_RETRIES_TOTAL 25 +#define EFI_RETRY_DELAY (50 * USEC_PER_MSEC) + +int efi_get_variable( + const char *variable, + uint32_t *ret_attribute, + void **ret_value, + size_t *ret_size) { + + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ void *buf = NULL; + struct stat st; + usec_t begin = 0; /* Unnecessary initialization to appease gcc */ + uint32_t a; + ssize_t n; + + assert(variable); + + const char *p = strjoina("/sys/firmware/efi/efivars/", variable); + + if (!ret_value && !ret_size && !ret_attribute) { + /* If caller is not interested in anything, just check if the variable exists and is + * readable. 
*/ + if (access(p, R_OK) < 0) + return -errno; + + return 0; + } + + if (DEBUG_LOGGING) { + log_debug("Reading EFI variable %s.", p); + begin = now(CLOCK_MONOTONIC); + } + + fd = open(p, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return log_debug_errno(errno, "open(\"%s\") failed: %m", p); + + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "fstat(\"%s\") failed: %m", p); + if (st.st_size < 4) + return log_debug_errno(SYNTHETIC_ERRNO(ENODATA), "EFI variable %s is shorter than 4 bytes, refusing.", p); + if (st.st_size > 4*1024*1024 + 4) + return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "EFI variable %s is ridiculously large, refusing.", p); + + if (ret_value || ret_attribute) { + /* The kernel ratelimits reads from the efivarfs because EFI is inefficient, and we'll + * occasionally fail with EINTR here. A slowdown is better than a failure for us, so + * retry a few times and eventually fail with -EBUSY. + * + * See https://github.com/torvalds/linux/blob/master/fs/efivarfs/file.c#L75 + * and + * https://github.com/torvalds/linux/commit/bef3efbeb897b56867e271cdbc5f8adaacaeb9cd. 
+ */ + for (unsigned try = 0;; try++) { + n = read(fd, &a, sizeof(a)); + if (n >= 0) + break; + log_debug_errno(errno, "Reading from \"%s\" failed: %m", p); + if (errno != EINTR) + return -errno; + if (try >= EFI_N_RETRIES_TOTAL) + return -EBUSY; + + if (try >= EFI_N_RETRIES_NO_DELAY) + (void) usleep_safe(EFI_RETRY_DELAY); + } + + if (n != sizeof(a)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Read %zi bytes from EFI variable %s, expected %zu.", n, p, sizeof(a)); + } + + if (ret_value) { + buf = malloc(st.st_size - 4 + 3); + if (!buf) + return -ENOMEM; + + n = read(fd, buf, (size_t) st.st_size - 4); + if (n < 0) + return log_debug_errno(errno, "Failed to read value of EFI variable %s: %m", p); + assert(n <= st.st_size - 4); + + /* Always NUL-terminate (3 bytes, to properly protect UTF-16, even if truncated in the middle + * of a character) */ + ((char*) buf)[n] = 0; + ((char*) buf)[n + 1] = 0; + ((char*) buf)[n + 2] = 0; + } else + /* Assume that the reported size is accurate */ + n = st.st_size - 4; + + if (DEBUG_LOGGING) { + usec_t end = now(CLOCK_MONOTONIC); + if (end > begin + EFI_RETRY_DELAY) + log_debug("Detected slow EFI variable read access on %s: %s", + variable, FORMAT_TIMESPAN(end - begin, 1)); + } + + /* Note that efivarfs interestingly doesn't require ftruncate() to update an existing EFI variable + * with a smaller value. 
*/ + + if (ret_attribute) + *ret_attribute = a; + + if (ret_value) + *ret_value = TAKE_PTR(buf); + + if (ret_size) + *ret_size = n; + + return 0; +} + +int efi_get_variable_string(const char *variable, char **ret) { + _cleanup_free_ void *s = NULL; + size_t ss = 0; + int r; + char *x; + + r = efi_get_variable(variable, NULL, &s, &ss); + if (r < 0) + return r; + + x = utf16_to_utf8(s, ss); + if (!x) + return -ENOMEM; + + *ret = x; + return 0; +} + +static int efi_verify_variable(const char *variable, uint32_t attr, const void *value, size_t size) { + _cleanup_free_ void *buf = NULL; + size_t n; + uint32_t a; + int r; + + assert(variable); + assert(value || size == 0); + + r = efi_get_variable(variable, &a, &buf, &n); + if (r < 0) + return r; + + return a == attr && memcmp_nn(buf, n, value, size) == 0; +} + +int efi_set_variable(const char *variable, const void *value, size_t size) { + struct var { + uint32_t attr; + char buf[]; + } _packed_ * _cleanup_free_ buf = NULL; + _cleanup_close_ int fd = -EBADF; + uint32_t attr = EFI_VARIABLE_NON_VOLATILE|EFI_VARIABLE_BOOTSERVICE_ACCESS|EFI_VARIABLE_RUNTIME_ACCESS; + bool saved_flags_valid = false; + unsigned saved_flags; + int r; + + assert(variable); + assert(value || size == 0); + + const char *p = strjoina("/sys/firmware/efi/efivars/", variable); + + /* size 0 means removal, empty variable would not be enough for that */ + if (size > 0 && efi_verify_variable(variable, attr, value, size) > 0) { + log_debug("Variable '%s' is already in wanted state, skipping write.", variable); + return 0; + } + + /* Newer efivarfs protects variables that are not in an allow list with FS_IMMUTABLE_FL by default, + * to protect them from accidental removal and modification. We are not changing these variables + * accidentally however, hence let's unset the bit first. 
*/ + + r = chattr_path(p, 0, FS_IMMUTABLE_FL, &saved_flags); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to drop FS_IMMUTABLE_FL flag from '%s', ignoring: %m", p); + + saved_flags_valid = r >= 0; + + if (size == 0) { + if (unlink(p) < 0) { + r = -errno; + goto finish; + } + + return 0; + } + + fd = open(p, O_WRONLY|O_CREAT|O_NOCTTY|O_CLOEXEC, 0644); + if (fd < 0) { + r = -errno; + goto finish; + } + + buf = malloc(sizeof(uint32_t) + size); + if (!buf) { + r = -ENOMEM; + goto finish; + } + + buf->attr = attr; + memcpy(buf->buf, value, size); + + r = loop_write(fd, buf, sizeof(uint32_t) + size); + if (r < 0) + goto finish; + + /* For some reason efivarfs doesn't update mtime automatically. Let's do it manually then. This is + * useful for processes that cache EFI variables to detect when changes occurred. */ + if (futimens(fd, (struct timespec[2]) { + { .tv_nsec = UTIME_NOW }, + { .tv_nsec = UTIME_NOW } + }) < 0) + log_debug_errno(errno, "Failed to update mtime/atime on %s, ignoring: %m", p); + + r = 0; + +finish: + if (saved_flags_valid) { + int q; + + /* Restore the original flags field, just in case */ + if (fd < 0) + q = chattr_path(p, saved_flags, FS_IMMUTABLE_FL, NULL); + else + q = chattr_fd(fd, saved_flags, FS_IMMUTABLE_FL, NULL); + if (q < 0) + log_debug_errno(q, "Failed to restore FS_IMMUTABLE_FL on '%s', ignoring: %m", p); + } + + return r; +} + +int efi_set_variable_string(const char *variable, const char *value) { + _cleanup_free_ char16_t *u16 = NULL; + + u16 = utf8_to_utf16(value, SIZE_MAX); + if (!u16) + return -ENOMEM; + + return efi_set_variable(variable, u16, (char16_strlen(u16) + 1) * sizeof(char16_t)); +} + +bool is_efi_boot(void) { + static int cache = -1; + + if (cache < 0) { + if (detect_container() > 0) + cache = false; + else { + cache = access("/sys/firmware/efi/", F_OK) >= 0; + if (!cache && errno != ENOENT) + log_debug_errno(errno, "Unable to test whether /sys/firmware/efi/ exists, assuming EFI not available: %m"); + } + } + 
+        return cache;
+}
+
+/* Reads the EFI variable 'variable' and interprets it as a boolean flag.
+ *
+ * Returns 0 right away if this is not an EFI boot, a negative errno-style value if efi_get_variable()
+ * fails, -EINVAL if the variable's payload is not exactly one byte long, and otherwise 1 if that byte is
+ * non-zero or 0 if it is zero. */
+static int read_flag(const char *variable) {
+        _cleanup_free_ void *v = NULL;
+        uint8_t b;
+        size_t s;
+        int r;
+
+        if (!is_efi_boot()) /* If this is not an EFI boot, assume the queried flags are zero */
+                return 0;
+
+        r = efi_get_variable(variable, NULL, &v, &s);
+        if (r < 0)
+                return r;
+
+        if (s != 1)
+                return -EINVAL;
+
+        b = *(uint8_t *)v;
+        return !!b;
+}
+
+/* Returns true if the SecureBoot EFI variable reads as set.
+ *
+ * Only definite answers are remembered in the static cache: a missing variable (-ENOENT) is cached as
+ * false, a successful read is cached as-is. Any other read error is logged and leaves the cache at -1, so
+ * this call returns false and the next call tries the read again. */
+bool is_efi_secure_boot(void) {
+        static int cache = -1;
+        int r;
+
+        if (cache < 0) {
+                r = read_flag(EFI_GLOBAL_VARIABLE(SecureBoot));
+                if (r == -ENOENT)
+                        cache = false;
+                else if (r < 0)
+                        log_debug_errno(r, "Error reading SecureBoot EFI variable, assuming not in SecureBoot mode: %m");
+                else
+                        cache = r;
+        }
+
+        return cache > 0;
+}
+
+/* Determines the secure boot mode from the SecureBoot, AuditMode, DeployedMode and SetupMode flags and
+ * caches the result in a static variable, so the variables are read at most once.
+ *
+ * If SecureBoot itself cannot be read the result is SECURE_BOOT_UNSUPPORTED (the failure is logged unless
+ * it is -ENOENT). Otherwise the four flags are handed to decode_secure_boot_mode(). */
+SecureBootMode efi_get_secure_boot_mode(void) {
+        static SecureBootMode cache = _SECURE_BOOT_INVALID;
+
+        if (cache != _SECURE_BOOT_INVALID)
+                return cache;
+
+        int secure = read_flag(EFI_GLOBAL_VARIABLE(SecureBoot));
+        if (secure < 0) {
+                if (secure != -ENOENT)
+                        log_debug_errno(secure, "Error reading SecureBoot EFI variable, assuming not in SecureBoot mode: %m");
+
+                return (cache = SECURE_BOOT_UNSUPPORTED);
+        }
+
+        /* We can assume false for all these if they are absent (AuditMode and
+         * DeployedMode may not exist on older firmware). Read errors are negative and hence
+         * also count as false in the "> 0" tests below. */
+        int audit = read_flag(EFI_GLOBAL_VARIABLE(AuditMode));
+        int deployed = read_flag(EFI_GLOBAL_VARIABLE(DeployedMode));
+        int setup = read_flag(EFI_GLOBAL_VARIABLE(SetupMode));
+        log_debug("Secure boot variables: SecureBoot=%d AuditMode=%d DeployedMode=%d SetupMode=%d",
+                  secure, audit, deployed, setup);
+
+        return (cache = decode_secure_boot_mode(secure, audit > 0, deployed > 0, setup > 0));
+}
+
+static int read_efi_options_variable(char **ret) {
+        int r;
+
+        /* In SecureBoot mode this is probably not what you want.
As your cmdline is cryptographically signed + * like when using Type #2 EFI Unified Kernel Images (https://uapi-group.org/specifications/specs/boot_loader_specification) + * The user's intention is then that the cmdline should not be modified. You want to make sure that + * the system starts up as exactly specified in the signed artifact. + * + * (NB: For testing purposes, we still check the $SYSTEMD_EFI_OPTIONS env var before accessing this + * cache, even when in SecureBoot mode.) */ + if (is_efi_secure_boot()) { + /* Let's be helpful with the returned error and check if the variable exists at all. If it + * does, let's return a recognizable error (EPERM), and if not ENODATA. */ + + if (access(EFIVAR_PATH(EFI_SYSTEMD_VARIABLE(SystemdOptions)), F_OK) < 0) + return errno == ENOENT ? -ENODATA : -errno; + + return -EPERM; + } + + r = efi_get_variable_string(EFI_SYSTEMD_VARIABLE(SystemdOptions), ret); + if (r == -ENOENT) + return -ENODATA; + return r; +} + +int cache_efi_options_variable(void) { + _cleanup_free_ char *line = NULL; + int r; + + r = read_efi_options_variable(&line); + if (r < 0) + return r; + + return write_string_file(EFIVAR_CACHE_PATH(EFI_SYSTEMD_VARIABLE(SystemdOptions)), line, + WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755); +} + +int systemd_efi_options_variable(char **ret) { + const char *e; + int r; + + /* Returns the contents of the variable for current boot from the cache. 
*/ + + assert(ret); + + /* For testing purposes it is sometimes useful to be able to override this */ + e = secure_getenv("SYSTEMD_EFI_OPTIONS"); + if (e) { + char *m; + + m = strdup(e); + if (!m) + return -ENOMEM; + + *ret = m; + return 0; + } + + r = read_one_line_file(EFIVAR_CACHE_PATH(EFI_SYSTEMD_VARIABLE(SystemdOptions)), ret); + if (r == -ENOENT) + return -ENODATA; + return r; +} + +static int compare_stat_mtime(const struct stat *a, const struct stat *b) { + return CMP(timespec_load(&a->st_mtim), timespec_load(&b->st_mtim)); +} + +int systemd_efi_options_efivarfs_if_newer(char **ret) { + struct stat a = {}, b; + int r; + + if (stat(EFIVAR_PATH(EFI_SYSTEMD_VARIABLE(SystemdOptions)), &a) < 0 && errno != ENOENT) + return log_debug_errno(errno, "Failed to stat EFI variable SystemdOptions: %m"); + + if (stat(EFIVAR_CACHE_PATH(EFI_SYSTEMD_VARIABLE(SystemdOptions)), &b) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to stat "EFIVAR_CACHE_PATH(EFI_SYSTEMD_VARIABLE(SystemdOptions))": %m"); + } else if (compare_stat_mtime(&a, &b) > 0) + log_debug("Variable SystemdOptions in evifarfs is newer than in cache."); + else { + log_debug("Variable SystemdOptions in cache is up to date."); + *ret = NULL; + return 0; + } + + r = read_efi_options_variable(ret); + if (r < 0) + return log_debug_errno(r, "Failed to read SystemdOptions EFI variable: %m"); + + return 0; +} +#endif diff --git a/src/basic/efivars.h b/src/basic/efivars.h new file mode 100644 index 0000000..34d697f --- /dev/null +++ b/src/basic/efivars.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if !ENABLE_EFI +# include +#endif +#include +#include +#include + +#include "sd-id128.h" + +#include "efivars-fundamental.h" +#include "time-util.h" + +#define EFI_VENDOR_LOADER SD_ID128_MAKE(4a,67,b0,82,0a,4c,41,cf,b6,c7,44,0b,29,bb,8c,4f) +#define EFI_VENDOR_LOADER_STR SD_ID128_MAKE_UUID_STR(4a,67,b0,82,0a,4c,41,cf,b6,c7,44,0b,29,bb,8c,4f) +#define EFI_VENDOR_GLOBAL 
SD_ID128_MAKE(8b,e4,df,61,93,ca,11,d2,aa,0d,00,e0,98,03,2b,8c)
+#define EFI_VENDOR_GLOBAL_STR SD_ID128_MAKE_UUID_STR(8b,e4,df,61,93,ca,11,d2,aa,0d,00,e0,98,03,2b,8c)
+#define EFI_VENDOR_DATABASE SD_ID128_MAKE(d7,19,b2,cb,3d,3a,45,96,a3,bc,da,d0,0e,67,65,6f)
+#define EFI_VENDOR_DATABASE_STR SD_ID128_MAKE_UUID_STR(d7,19,b2,cb,3d,3a,45,96,a3,bc,da,d0,0e,67,65,6f)
+#define EFI_VENDOR_SYSTEMD SD_ID128_MAKE(8c,f2,64,4b,4b,0b,42,8f,93,87,6d,87,60,50,dc,67)
+#define EFI_VENDOR_SYSTEMD_STR SD_ID128_MAKE_UUID_STR(8c,f2,64,4b,4b,0b,42,8f,93,87,6d,87,60,50,dc,67)
+
+#define EFI_VARIABLE_NON_VOLATILE UINT32_C(0x00000001)
+#define EFI_VARIABLE_BOOTSERVICE_ACCESS UINT32_C(0x00000002)
+#define EFI_VARIABLE_RUNTIME_ACCESS UINT32_C(0x00000004)
+
+/* Note that the <name>-<vendor UUID> naming scheme is an efivarfs convention, i.e. part of the Linux
+ * API file system implementation for EFI. EFI itself processes UUIDs in binary form.
+ */
+
+/* Builds the efivarfs file name of a variable by string-literal concatenation: name, a dash, then the
+ * vendor UUID string. Both arguments must be string literals. */
+#define EFI_VENDOR_VARIABLE_STR(vendor, name) name "-" vendor
+
+/* Same, with the vendor fixed; 'name' is a string literal. */
+#define EFI_GLOBAL_VARIABLE_STR(name) EFI_VENDOR_VARIABLE_STR(EFI_VENDOR_GLOBAL_STR, name)
+#define EFI_LOADER_VARIABLE_STR(name) EFI_VENDOR_VARIABLE_STR(EFI_VENDOR_LOADER_STR, name)
+#define EFI_SYSTEMD_VARIABLE_STR(name) EFI_VENDOR_VARIABLE_STR(EFI_VENDOR_SYSTEMD_STR, name)
+
+/* Same, but 'name' is a bare identifier that gets stringified first. */
+#define EFI_GLOBAL_VARIABLE(name) EFI_GLOBAL_VARIABLE_STR(STRINGIFY(name))
+#define EFI_LOADER_VARIABLE(name) EFI_LOADER_VARIABLE_STR(STRINGIFY(name))
+#define EFI_SYSTEMD_VARIABLE(name) EFI_SYSTEMD_VARIABLE_STR(STRINGIFY(name))
+
+/* Prefix a variable file name (a string literal, as produced by the macros above) with the efivarfs
+ * directory resp. the directory of the cached copies under /run. */
+#define EFIVAR_PATH(variable) "/sys/firmware/efi/efivars/" variable
+#define EFIVAR_CACHE_PATH(variable) "/run/systemd/efivars/" variable
+
+#if ENABLE_EFI
+
+int efi_get_variable(const char *variable, uint32_t *attribute, void **ret_value, size_t *ret_size);
+int efi_get_variable_string(const char *variable, char **ret);
+int efi_set_variable(const char *variable, const void *value, size_t size);
+int efi_set_variable_string(const char *variable, const char *p);
+
+bool is_efi_boot(void);
+bool
is_efi_secure_boot(void); +SecureBootMode efi_get_secure_boot_mode(void); + +int cache_efi_options_variable(void); +int systemd_efi_options_variable(char **ret); +int systemd_efi_options_efivarfs_if_newer(char **ret); + +#else + +static inline int efi_get_variable(const char *variable, uint32_t *attribute, void **value, size_t *size) { + return -EOPNOTSUPP; +} + +static inline int efi_get_variable_string(const char *variable, char **ret) { + return -EOPNOTSUPP; +} + +static inline int efi_set_variable(const char *variable, const void *value, size_t size) { + return -EOPNOTSUPP; +} + +static inline int efi_set_variable_string(const char *variable, const char *p) { + return -EOPNOTSUPP; +} + +static inline bool is_efi_boot(void) { + return false; +} + +static inline bool is_efi_secure_boot(void) { + return false; +} + +static inline SecureBootMode efi_get_secure_boot_mode(void) { + return SECURE_BOOT_UNKNOWN; +} + +static inline int cache_efi_options_variable(void) { + return -EOPNOTSUPP; +} + +static inline int systemd_efi_options_variable(char **line) { + return -ENODATA; +} + +static inline int systemd_efi_options_efivarfs_if_newer(char **line) { + return -ENODATA; +} +#endif diff --git a/src/basic/env-file.c b/src/basic/env-file.c new file mode 100644 index 0000000..c2cbff4 --- /dev/null +++ b/src/basic/env-file.c @@ -0,0 +1,647 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "env-file.h" +#include "env-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "utf8.h" + +typedef int (*push_env_func_t)( + const char *filename, + unsigned line, + const char *key, + char *value, + void *userdata); + +static int parse_env_file_internal( + FILE *f, + const char *fname, + push_env_func_t push, + void *userdata) { + + size_t n_key = 0, n_value = 0, last_value_whitespace = SIZE_MAX, last_key_whitespace = 
SIZE_MAX; + _cleanup_free_ char *contents = NULL, *key = NULL, *value = NULL; + unsigned line = 1; + int r; + + enum { + PRE_KEY, + KEY, + PRE_VALUE, + VALUE, + VALUE_ESCAPE, + SINGLE_QUOTE_VALUE, + DOUBLE_QUOTE_VALUE, + DOUBLE_QUOTE_VALUE_ESCAPE, + COMMENT, + COMMENT_ESCAPE + } state = PRE_KEY; + + assert(f || fname); + assert(push); + + if (f) + r = read_full_stream(f, &contents, NULL); + else + r = read_full_file(fname, &contents, NULL); + if (r < 0) + return r; + + for (char *p = contents; *p; p++) { + char c = *p; + + switch (state) { + + case PRE_KEY: + if (strchr(COMMENTS, c)) + state = COMMENT; + else if (!strchr(WHITESPACE, c)) { + state = KEY; + last_key_whitespace = SIZE_MAX; + + if (!GREEDY_REALLOC(key, n_key+2)) + return -ENOMEM; + + key[n_key++] = c; + } + break; + + case KEY: + if (strchr(NEWLINE, c)) { + state = PRE_KEY; + line++; + n_key = 0; + } else if (c == '=') { + state = PRE_VALUE; + last_value_whitespace = SIZE_MAX; + } else { + if (!strchr(WHITESPACE, c)) + last_key_whitespace = SIZE_MAX; + else if (last_key_whitespace == SIZE_MAX) + last_key_whitespace = n_key; + + if (!GREEDY_REALLOC(key, n_key+2)) + return -ENOMEM; + + key[n_key++] = c; + } + + break; + + case PRE_VALUE: + if (strchr(NEWLINE, c)) { + state = PRE_KEY; + line++; + key[n_key] = 0; + + if (value) + value[n_value] = 0; + + /* strip trailing whitespace from key */ + if (last_key_whitespace != SIZE_MAX) + key[last_key_whitespace] = 0; + + r = push(fname, line, key, value, userdata); + if (r < 0) + return r; + + n_key = 0; + value = NULL; + n_value = 0; + + } else if (c == '\'') + state = SINGLE_QUOTE_VALUE; + else if (c == '"') + state = DOUBLE_QUOTE_VALUE; + else if (c == '\\') + state = VALUE_ESCAPE; + else if (!strchr(WHITESPACE, c)) { + state = VALUE; + + if (!GREEDY_REALLOC(value, n_value+2)) + return -ENOMEM; + + value[n_value++] = c; + } + + break; + + case VALUE: + if (strchr(NEWLINE, c)) { + state = PRE_KEY; + line++; + + key[n_key] = 0; + + if (value) + value[n_value] 
= 0; + + /* Chomp off trailing whitespace from value */ + if (last_value_whitespace != SIZE_MAX) + value[last_value_whitespace] = 0; + + /* strip trailing whitespace from key */ + if (last_key_whitespace != SIZE_MAX) + key[last_key_whitespace] = 0; + + r = push(fname, line, key, value, userdata); + if (r < 0) + return r; + + n_key = 0; + value = NULL; + n_value = 0; + + } else if (c == '\\') { + state = VALUE_ESCAPE; + last_value_whitespace = SIZE_MAX; + } else { + if (!strchr(WHITESPACE, c)) + last_value_whitespace = SIZE_MAX; + else if (last_value_whitespace == SIZE_MAX) + last_value_whitespace = n_value; + + if (!GREEDY_REALLOC(value, n_value+2)) + return -ENOMEM; + + value[n_value++] = c; + } + + break; + + case VALUE_ESCAPE: + state = VALUE; + + if (!strchr(NEWLINE, c)) { + /* Escaped newlines we eat up entirely */ + if (!GREEDY_REALLOC(value, n_value+2)) + return -ENOMEM; + + value[n_value++] = c; + } + break; + + case SINGLE_QUOTE_VALUE: + if (c == '\'') + state = PRE_VALUE; + else { + if (!GREEDY_REALLOC(value, n_value+2)) + return -ENOMEM; + + value[n_value++] = c; + } + + break; + + case DOUBLE_QUOTE_VALUE: + if (c == '"') + state = PRE_VALUE; + else if (c == '\\') + state = DOUBLE_QUOTE_VALUE_ESCAPE; + else { + if (!GREEDY_REALLOC(value, n_value+2)) + return -ENOMEM; + + value[n_value++] = c; + } + + break; + + case DOUBLE_QUOTE_VALUE_ESCAPE: + state = DOUBLE_QUOTE_VALUE; + + if (strchr(SHELL_NEED_ESCAPE, c)) { + /* If this is a char that needs escaping, just unescape it. */ + if (!GREEDY_REALLOC(value, n_value+2)) + return -ENOMEM; + value[n_value++] = c; + } else if (c != '\n') { + /* If other char than what needs escaping, keep the "\" in place, like the + * real shell does. 
*/ + if (!GREEDY_REALLOC(value, n_value+3)) + return -ENOMEM; + value[n_value++] = '\\'; + value[n_value++] = c; + } + + /* Escaped newlines (aka "continuation lines") are eaten up entirely */ + break; + + case COMMENT: + if (c == '\\') + state = COMMENT_ESCAPE; + else if (strchr(NEWLINE, c)) { + state = PRE_KEY; + line++; + } + break; + + case COMMENT_ESCAPE: + log_debug("The line which doesn't begin with \";\" or \"#\", but follows a comment" \ + " line trailing with escape is now treated as a non comment line since v254."); + if (strchr(NEWLINE, c)) { + state = PRE_KEY; + line++; + } else + state = COMMENT; + break; + } + } + + if (IN_SET(state, + PRE_VALUE, + VALUE, + VALUE_ESCAPE, + SINGLE_QUOTE_VALUE, + DOUBLE_QUOTE_VALUE, + DOUBLE_QUOTE_VALUE_ESCAPE)) { + + key[n_key] = 0; + + if (value) + value[n_value] = 0; + + if (state == VALUE) + if (last_value_whitespace != SIZE_MAX) + value[last_value_whitespace] = 0; + + /* strip trailing whitespace from key */ + if (last_key_whitespace != SIZE_MAX) + key[last_key_whitespace] = 0; + + r = push(fname, line, key, value, userdata); + if (r < 0) + return r; + + value = NULL; + } + + return 0; +} + +static int check_utf8ness_and_warn( + const char *filename, unsigned line, + const char *key, char *value) { + + assert(key); + + if (!utf8_is_valid(key)) { + _cleanup_free_ char *p = NULL; + + p = utf8_escape_invalid(key); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s:%u: invalid UTF-8 in key '%s', ignoring.", + strna(filename), line, p); + } + + if (value && !utf8_is_valid(value)) { + _cleanup_free_ char *p = NULL; + + p = utf8_escape_invalid(value); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s:%u: invalid UTF-8 value for key %s: '%s', ignoring.", + strna(filename), line, key, p); + } + + return 0; +} + +static int parse_env_file_push( + const char *filename, unsigned line, + const char *key, char *value, + void *userdata) { + + const char *k; + va_list aq, *ap = userdata; + int r; + + assert(key); + + r 
= check_utf8ness_and_warn(filename, line, key, value); + if (r < 0) + return r; + + va_copy(aq, *ap); + + while ((k = va_arg(aq, const char *))) { + char **v; + + v = va_arg(aq, char **); + + if (streq(key, k)) { + va_end(aq); + free_and_replace(*v, value); + + return 1; + } + } + + va_end(aq); + free(value); + + return 0; +} + +int parse_env_filev( + FILE *f, + const char *fname, + va_list ap) { + + int r; + va_list aq; + + assert(f || fname); + + va_copy(aq, ap); + r = parse_env_file_internal(f, fname, parse_env_file_push, &aq); + va_end(aq); + return r; +} + +int parse_env_file_fdv(int fd, const char *fname, va_list ap) { + _cleanup_fclose_ FILE *f = NULL; + va_list aq; + int r; + + assert(fd >= 0); + + r = fdopen_independent(fd, "re", &f); + if (r < 0) + return r; + + va_copy(aq, ap); + r = parse_env_file_internal(f, fname, parse_env_file_push, &aq); + va_end(aq); + return r; +} + +int parse_env_file_sentinel( + FILE *f, + const char *fname, + ...) { + + va_list ap; + int r; + + assert(f || fname); + + va_start(ap, fname); + r = parse_env_filev(f, fname, ap); + va_end(ap); + + return r; +} + +int parse_env_file_fd_sentinel( + int fd, + const char *fname, /* only used for logging */ + ...) 
{ + + va_list ap; + int r; + + assert(fd >= 0); + + va_start(ap, fname); + r = parse_env_file_fdv(fd, fname, ap); + va_end(ap); + + return r; +} + +static int load_env_file_push( + const char *filename, unsigned line, + const char *key, char *value, + void *userdata) { + + char ***m = userdata; + char *p; + int r; + + assert(key); + + r = check_utf8ness_and_warn(filename, line, key, value); + if (r < 0) + return r; + + p = strjoin(key, "=", value); + if (!p) + return -ENOMEM; + + r = strv_env_replace_consume(m, p); + if (r < 0) + return r; + + free(value); + return 0; +} + +int load_env_file(FILE *f, const char *fname, char ***ret) { + _cleanup_strv_free_ char **m = NULL; + int r; + + assert(f || fname); + assert(ret); + + r = parse_env_file_internal(f, fname, load_env_file_push, &m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +static int load_env_file_push_pairs( + const char *filename, unsigned line, + const char *key, char *value, + void *userdata) { + + char ***m = ASSERT_PTR(userdata); + int r; + + assert(key); + + r = check_utf8ness_and_warn(filename, line, key, value); + if (r < 0) + return r; + + /* Check if the key is present */ + for (char **t = *m; t && *t; t += 2) + if (streq(t[0], key)) { + if (value) + return free_and_replace(t[1], value); + else + return free_and_strdup(t+1, ""); + } + + r = strv_extend(m, key); + if (r < 0) + return r; + + if (value) + return strv_push(m, value); + else + return strv_extend(m, ""); +} + +int load_env_file_pairs(FILE *f, const char *fname, char ***ret) { + _cleanup_strv_free_ char **m = NULL; + int r; + + assert(f || fname); + assert(ret); + + r = parse_env_file_internal(f, fname, load_env_file_push_pairs, &m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +int load_env_file_pairs_fd(int fd, const char *fname, char ***ret) { + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(fd >= 0); + + r = fdopen_independent(fd, "re", &f); + if (r < 0) + return r; + + return 
load_env_file_pairs(f, fname, ret);
+}
+
+/* Push callback used by merge_env_file(): expands variable references in 'value' against the environment
+ * block accumulated so far (and the process environment), then stores key=<expanded value> in that block.
+ *
+ * 'userdata' points to the strv being built. Entries with no value or with an invalid variable name are
+ * logged and skipped (return 0).
+ *
+ * Ownership follows the contract of parse_env_file_internal(): 'value' is consumed by this function only
+ * if it returns >= 0. On a negative return the parser still owns 'value' and frees it itself, hence we
+ * must not release it on any error path. */
+static int merge_env_file_push(
+                const char *filename, unsigned line,
+                const char *key, char *value,
+                void *userdata) {
+
+        char ***env = ASSERT_PTR(userdata);
+        char *expanded_value;
+        int r;
+
+        assert(key);
+
+        if (!value) {
+                log_error("%s:%u: invalid syntax (around \"%s\"), ignoring.", strna(filename), line, key);
+                return 0;
+        }
+
+        if (!env_name_is_valid(key)) {
+                log_error("%s:%u: invalid variable name \"%s\", ignoring.", strna(filename), line, key);
+                free(value);
+                return 0;
+        }
+
+        r = replace_env(value,
+                        *env,
+                        REPLACE_ENV_USE_ENVIRONMENT|REPLACE_ENV_ALLOW_BRACELESS|REPLACE_ENV_ALLOW_EXTENDED,
+                        &expanded_value);
+        if (r < 0)
+                return log_error_errno(r, "%s:%u: Failed to expand variable '%s': %m", strna(filename), line, value);
+
+        /* filename may be NULL when parsing from a stream, hence strna() like in the messages above */
+        log_debug("%s:%u: setting %s=%s", strna(filename), line, key, expanded_value);
+
+        r = load_env_file_push(filename, line, key, expanded_value, env);
+        if (r < 0) {
+                /* load_env_file_push() frees its value only on success. Drop our expanded copy here, and
+                 * leave the original 'value' alone: our caller will free it, and freeing it here too
+                 * would be a double free. */
+                free(expanded_value);
+                return r;
+        }
+
+        /* Success: the expanded copy was consumed by load_env_file_push(), the original is now ours. */
+        free(value);
+        return r;
+}
+
+int merge_env_file(
+                char ***env,
+                FILE *f,
+                const char *fname) {
+
+        assert(env);
+        assert(f || fname);
+
+        /* NOTE: this function supports braceful and braceless variable expansions,
+         * plus "extended" substitutions, unlike other exported parsing functions.
+ */ + + return parse_env_file_internal(f, fname, merge_env_file_push, env); +} + +static void write_env_var(FILE *f, const char *v) { + const char *p; + + assert(f); + assert(v); + + p = strchr(v, '='); + if (!p) { + /* Fallback */ + fputs_unlocked(v, f); + fputc_unlocked('\n', f); + return; + } + + p++; + fwrite_unlocked(v, 1, p-v, f); + + if (string_has_cc(p, NULL) || chars_intersect(p, WHITESPACE SHELL_NEED_QUOTES)) { + fputc_unlocked('"', f); + + for (; *p; p++) { + if (strchr(SHELL_NEED_ESCAPE, *p)) + fputc_unlocked('\\', f); + + fputc_unlocked(*p, f); + } + + fputc_unlocked('"', f); + } else + fputs_unlocked(p, f); + + fputc_unlocked('\n', f); +} + +int write_env_file(int dir_fd, const char *fname, char **headers, char **l) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(fname); + + r = fopen_temporary_at(dir_fd, fname, &f, &p); + if (r < 0) + return r; + + (void) fchmod_umask(fileno(f), 0644); + + STRV_FOREACH(i, headers) { + assert(isempty(*i) || startswith(*i, "#")); + fputs_unlocked(*i, f); + fputc_unlocked('\n', f); + } + + STRV_FOREACH(i, l) + write_env_var(f, *i); + + r = fflush_and_check(f); + if (r >= 0) { + if (renameat(dir_fd, p, dir_fd, fname) >= 0) + return 0; + + r = -errno; + } + + (void) unlinkat(dir_fd, p, 0); + return r; +} + +int write_vconsole_conf(int dir_fd, const char *fname, char **l) { + char **headers = STRV_MAKE( + "# Written by systemd-localed(8) or systemd-firstboot(1), read by systemd-localed", + "# and systemd-vconsole-setup(8). 
Use localectl(1) to update this file."); + + return write_env_file(dir_fd, fname, headers, l); +} diff --git a/src/basic/env-file.h b/src/basic/env-file.h new file mode 100644 index 0000000..37db307 --- /dev/null +++ b/src/basic/env-file.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" + +int parse_env_filev(FILE *f, const char *fname, va_list ap); +int parse_env_file_fdv(int fd, const char *fname, va_list ap); +int parse_env_file_sentinel(FILE *f, const char *fname, ...) _sentinel_; +#define parse_env_file(f, fname, ...) parse_env_file_sentinel(f, fname, __VA_ARGS__, NULL) +int parse_env_file_fd_sentinel(int fd, const char *fname, ...) _sentinel_; +#define parse_env_file_fd(fd, fname, ...) parse_env_file_fd_sentinel(fd, fname, __VA_ARGS__, NULL) +int load_env_file(FILE *f, const char *fname, char ***ret); +int load_env_file_pairs(FILE *f, const char *fname, char ***ret); +int load_env_file_pairs_fd(int fd, const char *fname, char ***ret); + +int merge_env_file(char ***env, FILE *f, const char *fname); + +int write_env_file(int dir_fd, const char *fname, char **headers, char **l); + +int write_vconsole_conf(int dir_fd, const char *fname, char **l); diff --git a/src/basic/env-util.c b/src/basic/env-util.c new file mode 100644 index 0000000..d3bf733 --- /dev/null +++ b/src/basic/env-util.c @@ -0,0 +1,1095 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "env-util.h" +#include "errno-util.h" +#include "escape.h" +#include "extract-word.h" +#include "macro.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +/* We follow bash for the character set. Different shells have different rules. 
*/ +#define VALID_BASH_ENV_NAME_CHARS \ + DIGITS LETTERS \ + "_" + +static bool env_name_is_valid_n(const char *e, size_t n) { + + if (n == SIZE_MAX) + n = strlen_ptr(e); + + if (n <= 0) + return false; + + assert(e); + + if (ascii_isdigit(e[0])) + return false; + + /* POSIX says the overall size of the environment block cannot be > ARG_MAX, an individual assignment + * hence cannot be either. Discounting the equal sign and trailing NUL this hence leaves ARG_MAX-2 as + * longest possible variable name. */ + if (n > (size_t) sysconf(_SC_ARG_MAX) - 2) + return false; + + for (const char *p = e; p < e + n; p++) + if (!strchr(VALID_BASH_ENV_NAME_CHARS, *p)) + return false; + + return true; +} + +bool env_name_is_valid(const char *e) { + return env_name_is_valid_n(e, strlen_ptr(e)); +} + +bool env_value_is_valid(const char *e) { + if (!e) + return false; + + if (!utf8_is_valid(e)) + return false; + + /* Note that variable *values* may contain control characters, in particular NL, TAB, BS, DEL, ESC… + * When printing those variables with show-environment, we'll escape them. Make sure to print + * environment variables carefully! */ + + /* POSIX says the overall size of the environment block cannot be > ARG_MAX, an individual assignment + * hence cannot be either. Discounting the shortest possible variable name of length 1, the equal + * sign and trailing NUL this hence leaves ARG_MAX-3 as longest possible variable value. */ + if (strlen(e) > sc_arg_max() - 3) + return false; + + return true; +} + +bool env_assignment_is_valid(const char *e) { + const char *eq; + + eq = strchr(e, '='); + if (!eq) + return false; + + if (!env_name_is_valid_n(e, eq - e)) + return false; + + if (!env_value_is_valid(eq + 1)) + return false; + + /* POSIX says the overall size of the environment block cannot be > ARG_MAX, hence the individual + * variable assignments cannot be either, but let's leave room for one trailing NUL byte. 
*/ + if (strlen(e) > sc_arg_max() - 1) + return false; + + return true; +} + +bool strv_env_is_valid(char **e) { + STRV_FOREACH(p, e) { + size_t k; + + if (!env_assignment_is_valid(*p)) + return false; + + /* Check if there are duplicate assignments */ + k = strcspn(*p, "="); + STRV_FOREACH(q, p + 1) + if (strneq(*p, *q, k) && (*q)[k] == '=') + return false; + } + + return true; +} + +bool strv_env_name_is_valid(char **l) { + STRV_FOREACH(p, l) { + if (!env_name_is_valid(*p)) + return false; + + if (strv_contains(p + 1, *p)) + return false; + } + + return true; +} + +bool strv_env_name_or_assignment_is_valid(char **l) { + STRV_FOREACH(p, l) { + if (!env_assignment_is_valid(*p) && !env_name_is_valid(*p)) + return false; + + if (strv_contains(p + 1, *p)) + return false; + } + + return true; +} + +static int env_append(char **e, char ***k, char **a) { + assert(e); + assert(k); + assert(*k >= e); + + if (!a) + return 0; + + /* Expects the following arguments: 'e' shall point to the beginning of an strv we are going to append to, 'k' + * to a pointer pointing to the NULL entry at the end of the same array. 'a' shall point to another strv. + * + * This call adds every entry of 'a' to 'e', either overriding an existing matching entry, or appending to it. + * + * This call assumes 'e' has enough pre-allocated space to grow by all of 'a''s items. */ + + for (; *a; a++) { + char **j, *c; + size_t n; + + n = strcspn(*a, "="); + if ((*a)[n] == '=') + n++; + + for (j = e; j < *k; j++) + if (strneq(*j, *a, n)) + break; + + c = strdup(*a); + if (!c) + return -ENOMEM; + + if (j >= *k) { /* Append to the end? */ + (*k)[0] = c; + (*k)[1] = NULL; + (*k)++; + } else + free_and_replace(*j, c); /* Override existing item */ + } + + return 0; +} + +char** _strv_env_merge(char **first, ...) 
{ + _cleanup_strv_free_ char **merged = NULL; + char **k; + va_list ap; + + /* Merges an arbitrary number of environment sets */ + + size_t n = strv_length(first); + + va_start(ap, first); + for (;;) { + char **l; + + l = va_arg(ap, char**); + if (l == POINTER_MAX) + break; + + n += strv_length(l); + } + va_end(ap); + + k = merged = new(char*, n + 1); + if (!merged) + return NULL; + merged[0] = NULL; + + if (env_append(merged, &k, first) < 0) + return NULL; + + va_start(ap, first); + for (;;) { + char **l; + + l = va_arg(ap, char**); + if (l == POINTER_MAX) + break; + + if (env_append(merged, &k, l) < 0) { + va_end(ap); + return NULL; + } + } + va_end(ap); + + return TAKE_PTR(merged); +} + +static bool env_match(const char *t, const char *pattern) { + assert(t); + assert(pattern); + + /* pattern a matches string a + * a matches a= + * a matches a=b + * a= matches a= + * a=b matches a=b + * a= does not match a + * a=b does not match a= + * a=b does not match a + * a=b does not match a=c */ + + if (streq(t, pattern)) + return true; + + if (!strchr(pattern, '=')) { + size_t l = strlen(pattern); + + return strneq(t, pattern, l) && t[l] == '='; + } + + return false; +} + +static bool env_entry_has_name(const char *entry, const char *name) { + const char *t; + + assert(entry); + assert(name); + + t = startswith(entry, name); + if (!t) + return false; + + return *t == '='; +} + +char **strv_env_delete(char **x, size_t n_lists, ...) 
{ + size_t n, i = 0; + _cleanup_strv_free_ char **t = NULL; + va_list ap; + + /* Deletes every entry from x that is mentioned in the other + * string lists */ + + n = strv_length(x); + + t = new(char*, n+1); + if (!t) + return NULL; + + STRV_FOREACH(k, x) { + va_start(ap, n_lists); + for (size_t v = 0; v < n_lists; v++) { + char **l; + + l = va_arg(ap, char**); + STRV_FOREACH(j, l) + if (env_match(*k, *j)) + goto skip; + } + va_end(ap); + + t[i] = strdup(*k); + if (!t[i]) + return NULL; + + i++; + continue; + + skip: + va_end(ap); + } + + t[i] = NULL; + + assert(i <= n); + + return TAKE_PTR(t); +} + +char **strv_env_unset(char **l, const char *p) { + char **f, **t; + + if (!l) + return NULL; + + assert(p); + + /* Drops every occurrence of the env var setting p in the + * string list. Edits in-place. */ + + for (f = t = l; *f; f++) { + + if (env_match(*f, p)) { + free(*f); + continue; + } + + *(t++) = *f; + } + + *t = NULL; + return l; +} + +char **strv_env_unset_many(char **l, ...) { + char **f, **t; + + if (!l) + return NULL; + + /* Like strv_env_unset() but applies many at once. Edits in-place. */ + + for (f = t = l; *f; f++) { + bool found = false; + const char *p; + va_list ap; + + va_start(ap, l); + + while ((p = va_arg(ap, const char*))) { + if (env_match(*f, p)) { + found = true; + break; + } + } + + va_end(ap); + + if (found) { + free(*f); + continue; + } + + *(t++) = *f; + } + + *t = NULL; + return l; +} + +int strv_env_replace_consume(char ***l, char *p) { + const char *t, *name; + int r; + + assert(p); + + /* Replace first occurrence of the env var or add a new one in the string list. Drop other + * occurrences. Edits in-place. Does not copy p and CONSUMES p EVEN ON FAILURE. + * + * p must be a valid key=value assignment. 
*/ + + t = strchr(p, '='); + if (!t) { + free(p); + return -EINVAL; + } + + name = strndupa_safe(p, t - p); + + STRV_FOREACH(f, *l) + if (env_entry_has_name(*f, name)) { + free_and_replace(*f, p); + strv_env_unset(f + 1, *f); + return 0; + } + + /* We didn't find a match, we need to append p or create a new strv */ + r = strv_consume(l, p); + if (r < 0) + return r; + + return 1; +} + +int strv_env_replace_strdup(char ***l, const char *assignment) { + /* Like strv_env_replace_consume(), but copies the argument. */ + + char *p = strdup(assignment); + if (!p) + return -ENOMEM; + + return strv_env_replace_consume(l, p); +} + +int strv_env_replace_strdup_passthrough(char ***l, const char *assignment) { + /* Like strv_env_replace_strdup(), but pulls the variable from the environment of + * the calling program, if a variable name without value is specified. + */ + char *p; + + if (strchr(assignment, '=')) { + if (!env_assignment_is_valid(assignment)) + return -EINVAL; + + p = strdup(assignment); + } else { + if (!env_name_is_valid(assignment)) + return -EINVAL; + + /* If we can't find the variable in our environment, we will use + * the empty string. This way "passthrough" is equivalent to passing + * --setenv=FOO=$FOO in the shell. */ + p = strjoin(assignment, "=", secure_getenv(assignment)); + } + if (!p) + return -ENOMEM; + + return strv_env_replace_consume(l, p); +} + +int strv_env_assign(char ***l, const char *key, const char *value) { + if (!env_name_is_valid(key)) + return -EINVAL; + + /* NULL removes assignment, "" creates an empty assignment. */ + + if (!value) { + strv_env_unset(*l, key); + return 0; + } + + char *p = strjoin(key, "=", value); + if (!p) + return -ENOMEM; + + return strv_env_replace_consume(l, p); +} + +int _strv_env_assign_many(char ***l, ...) 
{ + va_list ap; + int r; + + assert(l); + + va_start(ap, l); + for (;;) { + const char *key, *value; + + key = va_arg(ap, const char *); + if (!key) + break; + + if (!env_name_is_valid(key)) { + va_end(ap); + return -EINVAL; + } + + value = va_arg(ap, const char *); + if (!value) { + strv_env_unset(*l, key); + continue; + } + + char *p = strjoin(key, "=", value); + if (!p) { + va_end(ap); + return -ENOMEM; + } + + r = strv_env_replace_consume(l, p); + if (r < 0) { + va_end(ap); + return r; + } + } + va_end(ap); + + return 0; +} + +char *strv_env_get_n(char **l, const char *name, size_t k, ReplaceEnvFlags flags) { + assert(name); + + if (k == SIZE_MAX) + k = strlen_ptr(name); + if (k <= 0) + return NULL; + + STRV_FOREACH_BACKWARDS(i, l) + if (strneq(*i, name, k) && + (*i)[k] == '=') + return *i + k + 1; + + if (flags & REPLACE_ENV_USE_ENVIRONMENT) { + const char *t; + + /* Safety check that the name is not overly long, before we do a stack allocation */ + if (k > (size_t) sysconf(_SC_ARG_MAX) - 2) + return NULL; + + t = strndupa_safe(name, k); + return getenv(t); + }; + + return NULL; +} + +char *strv_env_pairs_get(char **l, const char *name) { + char *result = NULL; + + assert(name); + + STRV_FOREACH_PAIR(key, value, l) + if (streq(*key, name)) + result = *value; + + return result; +} + +char **strv_env_clean_with_callback(char **e, void (*invalid_callback)(const char *p, void *userdata), void *userdata) { + int k = 0; + + STRV_FOREACH(p, e) { + size_t n; + bool duplicate = false; + + if (!env_assignment_is_valid(*p)) { + if (invalid_callback) + invalid_callback(*p, userdata); + free(*p); + continue; + } + + n = strcspn(*p, "="); + STRV_FOREACH(q, p + 1) + if (strneq(*p, *q, n) && (*q)[n] == '=') { + duplicate = true; + break; + } + + if (duplicate) { + free(*p); + continue; + } + + e[k++] = *p; + } + + if (e) + e[k] = NULL; + + return e; +} + +static int strv_extend_with_length(char ***l, const char *s, size_t n) { + char *c; + + c = strndup(s, n); + if (!c) + 
return -ENOMEM; + + return strv_consume(l, c); +} + +static int strv_env_get_n_validated( + char **env, + const char *name, + size_t l, + ReplaceEnvFlags flags, + char **ret, /* points into the env block! do not free! */ + char ***unset_variables, /* updated in place */ + char ***bad_variables) { /* ditto */ + + char *e; + int r; + + assert(l == 0 || name); + assert(ret); + + if (env_name_is_valid_n(name, l)) { + e = strv_env_get_n(env, name, l, flags); + if (!e && unset_variables) { + r = strv_extend_with_length(unset_variables, name, l); + if (r < 0) + return r; + } + } else { + e = NULL; /* Resolve invalid variable names the same way as unset ones */ + + if (bad_variables) { + r = strv_extend_with_length(bad_variables, name, l); + if (r < 0) + return r; + } + } + + *ret = e; + return !!e; +} + +int replace_env_full( + const char *format, + size_t n, + char **env, + ReplaceEnvFlags flags, + char **ret, + char ***ret_unset_variables, + char ***ret_bad_variables) { + + enum { + WORD, + CURLY, + VARIABLE, + VARIABLE_RAW, + TEST, + DEFAULT_VALUE, + ALTERNATE_VALUE, + } state = WORD; + + _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL; + const char *e, *word = format, *test_value = NULL; /* test_value is initialized to appease gcc */ + _cleanup_free_ char *s = NULL; + char ***pu, ***pb, *k; + size_t i, len = 0; /* len is initialized to appease gcc */ + int nest = 0, r; + + assert(format); + + if (n == SIZE_MAX) + n = strlen(format); + + pu = ret_unset_variables ? &unset_variables : NULL; + pb = ret_bad_variables ? 
&bad_variables : NULL; + + for (e = format, i = 0; *e && i < n; e ++, i ++) + switch (state) { + + case WORD: + if (*e == '$') + state = CURLY; + break; + + case CURLY: + if (*e == '{') { + k = strnappend(s, word, e-word-1); + if (!k) + return -ENOMEM; + + free_and_replace(s, k); + + word = e-1; + state = VARIABLE; + nest++; + + } else if (*e == '$') { + k = strnappend(s, word, e-word); + if (!k) + return -ENOMEM; + + free_and_replace(s, k); + + word = e+1; + state = WORD; + + } else if (FLAGS_SET(flags, REPLACE_ENV_ALLOW_BRACELESS) && strchr(VALID_BASH_ENV_NAME_CHARS, *e)) { + k = strnappend(s, word, e-word-1); + if (!k) + return -ENOMEM; + + free_and_replace(s, k); + + word = e-1; + state = VARIABLE_RAW; + + } else + state = WORD; + break; + + case VARIABLE: + if (*e == '}') { + char *t; + + r = strv_env_get_n_validated(env, word+2, e-word-2, flags, &t, pu, pb); + if (r < 0) + return r; + + if (!strextend(&s, t)) + return -ENOMEM; + + word = e+1; + state = WORD; + nest--; + } else if (*e == ':') { + if (flags & REPLACE_ENV_ALLOW_EXTENDED) { + len = e - word - 2; + state = TEST; + } else + /* Treat this as unsupported syntax, i.e. do no replacement */ + state = WORD; + } + break; + + case TEST: + if (*e == '-') + state = DEFAULT_VALUE; + else if (*e == '+') + state = ALTERNATE_VALUE; + else { + state = WORD; + break; + } + + test_value = e+1; + break; + + case DEFAULT_VALUE: /* fall through */ + case ALTERNATE_VALUE: + assert(flags & REPLACE_ENV_ALLOW_EXTENDED); + + if (*e == '{') { + nest++; + break; + } + + if (*e != '}') + break; + + nest--; + if (nest == 0) { + _cleanup_strv_free_ char **u = NULL, **b = NULL; + _cleanup_free_ char *v = NULL; + char *t = NULL; + + r = strv_env_get_n_validated(env, word+2, len, flags, &t, pu, pb); + if (r < 0) + return r; + + if (t && state == ALTERNATE_VALUE) { + r = replace_env_full(test_value, e-test_value, env, flags, &v, pu ? &u : NULL, pb ? 
&b : NULL); + if (r < 0) + return r; + + t = v; + } else if (!t && state == DEFAULT_VALUE) { + r = replace_env_full(test_value, e-test_value, env, flags, &v, pu ? &u : NULL, pb ? &b : NULL); + if (r < 0) + return r; + + t = v; + } + + r = strv_extend_strv(&unset_variables, u, /* filter_duplicates= */ true); + if (r < 0) + return r; + r = strv_extend_strv(&bad_variables, b, /* filter_duplicates= */ true); + if (r < 0) + return r; + + if (!strextend(&s, t)) + return -ENOMEM; + + word = e+1; + state = WORD; + } + break; + + case VARIABLE_RAW: + assert(flags & REPLACE_ENV_ALLOW_BRACELESS); + + if (!strchr(VALID_BASH_ENV_NAME_CHARS, *e)) { + char *t = NULL; + + r = strv_env_get_n_validated(env, word+1, e-word-1, flags, &t, &unset_variables, &bad_variables); + if (r < 0) + return r; + + if (!strextend(&s, t)) + return -ENOMEM; + + word = e--; + i--; + state = WORD; + } + break; + } + + if (state == VARIABLE_RAW) { + char *t; + + assert(flags & REPLACE_ENV_ALLOW_BRACELESS); + + r = strv_env_get_n_validated(env, word+1, e-word-1, flags, &t, &unset_variables, &bad_variables); + if (r < 0) + return r; + + if (!strextend(&s, t)) + return -ENOMEM; + + } else if (!strextendn(&s, word, e-word)) + return -ENOMEM; + + if (ret_unset_variables) + *ret_unset_variables = TAKE_PTR(unset_variables); + if (ret_bad_variables) + *ret_bad_variables = TAKE_PTR(bad_variables); + + if (ret) + *ret = TAKE_PTR(s); + + return 0; +} + +int replace_env_argv( + char **argv, + char **env, + char ***ret, + char ***ret_unset_variables, + char ***ret_bad_variables) { + + _cleanup_strv_free_ char **n = NULL, **unset_variables = NULL, **bad_variables = NULL; + size_t k = 0, l = 0; + int r; + + l = strv_length(argv); + + n = new(char*, l+1); + if (!n) + return -ENOMEM; + + STRV_FOREACH(i, argv) { + const char *word = *i; + + /* If $FOO appears as single word, replace it by the split up variable */ + if (word[0] == '$' && !IN_SET(word[1], '{', '$')) { + _cleanup_strv_free_ char **m = NULL; + const char 
*name = word + 1; + char *e, **w; + size_t q; + + if (env_name_is_valid(name)) { + e = strv_env_get(env, name); + if (e) + r = strv_split_full(&m, e, WHITESPACE, EXTRACT_RELAX|EXTRACT_UNQUOTE); + else if (ret_unset_variables) + r = strv_extend(&unset_variables, name); + else + r = 0; + } else if (ret_bad_variables) + r = strv_extend(&bad_variables, name); + else + r = 0; + if (r < 0) + return r; + + q = strv_length(m); + l = l + q - 1; + + w = reallocarray(n, l + 1, sizeof(char*)); + if (!w) + return -ENOMEM; + + n = w; + if (m) { + memcpy(n + k, m, (q + 1) * sizeof(char*)); + m = mfree(m); + } + + k += q; + continue; + } + + _cleanup_strv_free_ char **u = NULL, **b = NULL; + + /* If ${FOO} appears as part of a word, replace it by the variable as-is */ + r = replace_env_full( + word, + /* length= */ SIZE_MAX, + env, + /* flags= */ 0, + n + k, + ret_unset_variables ? &u : NULL, + ret_bad_variables ? &b : NULL); + if (r < 0) + return r; + n[++k] = NULL; + + r = strv_extend_strv(&unset_variables, u, /* filter_duplicates= */ true); + if (r < 0) + return r; + + r = strv_extend_strv(&bad_variables, b, /*filter_duplicates= */ true); + if (r < 0) + return r; + } + + if (ret_unset_variables) { + strv_uniq(strv_sort(unset_variables)); + *ret_unset_variables = TAKE_PTR(unset_variables); + } + if (ret_bad_variables) { + strv_uniq(strv_sort(bad_variables)); + *ret_bad_variables = TAKE_PTR(bad_variables); + } + + *ret = TAKE_PTR(n); + return 0; +} + +int getenv_bool(const char *p) { + const char *e; + + e = getenv(p); + if (!e) + return -ENXIO; + + return parse_boolean(e); +} + +int getenv_bool_secure(const char *p) { + const char *e; + + e = secure_getenv(p); + if (!e) + return -ENXIO; + + return parse_boolean(e); +} + +int getenv_uint64_secure(const char *p, uint64_t *ret) { + const char *e; + + assert(p); + + e = secure_getenv(p); + if (!e) + return -ENXIO; + + return safe_atou64(e, ret); +} + +int set_unset_env(const char *name, const char *value, bool overwrite) { + 
assert(name); + + if (value) + return RET_NERRNO(setenv(name, value, overwrite)); + + return RET_NERRNO(unsetenv(name)); +} + +int putenv_dup(const char *assignment, bool override) { + const char *e, *n; + + e = strchr(assignment, '='); + if (!e) + return -EINVAL; + + n = strndupa_safe(assignment, e - assignment); + + /* This is like putenv(), but uses setenv() so that our memory doesn't become part of environ[]. */ + return RET_NERRNO(setenv(n, e + 1, override)); +} + +int setenv_systemd_exec_pid(bool update_only) { + char str[DECIMAL_STR_MAX(pid_t)]; + const char *e; + + /* Update $SYSTEMD_EXEC_PID=pid except when '*' is set for the variable. */ + + e = secure_getenv("SYSTEMD_EXEC_PID"); + if (!e && update_only) + return 0; + + if (streq_ptr(e, "*")) + return 0; + + xsprintf(str, PID_FMT, getpid_cached()); + + if (setenv("SYSTEMD_EXEC_PID", str, 1) < 0) + return -errno; + + return 1; +} + +int getenv_path_list(const char *name, char ***ret_paths) { + _cleanup_strv_free_ char **l = NULL; + const char *e; + int r; + + assert(name); + assert(ret_paths); + + e = secure_getenv(name); + if (!e) + return -ENXIO; + + r = strv_split_full(&l, e, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_debug_errno(r, "Failed to parse $%s: %m", name); + + STRV_FOREACH(p, l) { + if (!path_is_absolute(*p)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path '%s' is not absolute, refusing.", *p); + + if (!path_is_normalized(*p)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path '%s' is not normalized, refusing.", *p); + + if (path_equal(*p, "/")) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Path '%s' is the root fs, refusing.", *p); + } + + if (strv_isempty(l)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "No paths specified, refusing."); + + *ret_paths = TAKE_PTR(l); + return 1; +} + +int getenv_steal_erase(const char *name, char **ret) { + _cleanup_(erase_and_freep) char *a = NULL; + char *e; + + assert(name); + + /* Reads an environment 
variable, makes a copy of it, erases its memory in the environment block and removes + * it from there. Usecase: reading passwords from the env block (which is a bad idea, but useful for + * testing, and given that people are likely going to misuse this, be thorough) */ + + e = getenv(name); + if (!e) { + if (ret) + *ret = NULL; + return 0; + } + + if (ret) { + a = strdup(e); + if (!a) + return -ENOMEM; + } + + string_erase(e); + + if (unsetenv(name) < 0) + return -errno; + + if (ret) + *ret = TAKE_PTR(a); + + return 1; +} + +int set_full_environment(char **env) { + int r; + + clearenv(); + + STRV_FOREACH(e, env) { + _cleanup_free_ char *k = NULL, *v = NULL; + + r = split_pair(*e, "=", &k, &v); + if (r < 0) + return r; + + if (setenv(k, v, /* overwrite= */ true) < 0) + return -errno; + } + + return 0; +} diff --git a/src/basic/env-util.h b/src/basic/env-util.h new file mode 100644 index 0000000..f7fb1e9 --- /dev/null +++ b/src/basic/env-util.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "macro.h" +#include "string.h" + +static inline size_t sc_arg_max(void) { + long l = sysconf(_SC_ARG_MAX); + assert(l > 0); + return (size_t) l; +} + +bool env_name_is_valid(const char *e); +bool env_value_is_valid(const char *e); +bool env_assignment_is_valid(const char *e); + +typedef enum ReplaceEnvFlags { + REPLACE_ENV_USE_ENVIRONMENT = 1 << 0, + REPLACE_ENV_ALLOW_BRACELESS = 1 << 1, + REPLACE_ENV_ALLOW_EXTENDED = 1 << 2, +} ReplaceEnvFlags; + +int replace_env_full(const char *format, size_t n, char **env, ReplaceEnvFlags flags, char **ret, char ***ret_unset_variables, char ***ret_bad_variables); +static inline int replace_env(const char *format, char **env, ReplaceEnvFlags flags, char **ret) { + return replace_env_full(format, SIZE_MAX, env, flags, ret, NULL, NULL); +} + +int replace_env_argv(char **argv, char **env, char ***ret, char ***ret_unset_variables, char ***ret_bad_variables); + 
+bool strv_env_is_valid(char **e); +#define strv_env_clean(l) strv_env_clean_with_callback(l, NULL, NULL) +char **strv_env_clean_with_callback(char **l, void (*invalid_callback)(const char *p, void *userdata), void *userdata); + +bool strv_env_name_is_valid(char **l); +bool strv_env_name_or_assignment_is_valid(char **l); + +char** _strv_env_merge(char **first, ...); +#define strv_env_merge(first, ...) _strv_env_merge(first, __VA_ARGS__, POINTER_MAX) +char **strv_env_delete(char **x, size_t n_lists, ...); /* New copy */ + +char **strv_env_unset(char **l, const char *p); /* In place ... */ +char **strv_env_unset_many(char **l, ...) _sentinel_; +int strv_env_replace_consume(char ***l, char *p); /* In place ... */ +int strv_env_replace_strdup(char ***l, const char *assignment); +int strv_env_replace_strdup_passthrough(char ***l, const char *assignment); +int strv_env_assign(char ***l, const char *key, const char *value); +int _strv_env_assign_many(char ***l, ...) _sentinel_; +#define strv_env_assign_many(l, ...) _strv_env_assign_many(l, __VA_ARGS__, NULL) + +char *strv_env_get_n(char **l, const char *name, size_t k, ReplaceEnvFlags flags) _pure_; +static inline char *strv_env_get(char **x, const char *n) { + return strv_env_get_n(x, n, SIZE_MAX, 0); +} + +char *strv_env_pairs_get(char **l, const char *name) _pure_; + +int getenv_bool(const char *p); +int getenv_bool_secure(const char *p); + +int getenv_uint64_secure(const char *p, uint64_t *ret); + +/* Like setenv, but calls unsetenv if value == NULL. */ +int set_unset_env(const char *name, const char *value, bool overwrite); + +/* Like putenv, but duplicates the memory like setenv. 
*/ +int putenv_dup(const char *assignment, bool override); + +int setenv_systemd_exec_pid(bool update_only); + +/* Parses and does sanity checks on an environment variable containing + * PATH-like colon-separated absolute paths */ +int getenv_path_list(const char *name, char ***ret_paths); + +int getenv_steal_erase(const char *name, char **ret); + +int set_full_environment(char **env); diff --git a/src/basic/errno-list.c b/src/basic/errno-list.c new file mode 100644 index 0000000..2aeb38c --- /dev/null +++ b/src/basic/errno-list.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "errno-list.h" +#include "macro.h" + +static const struct errno_name* lookup_errno(register const char *str, + register GPERF_LEN_TYPE len); + +#include "errno-from-name.h" +#include "errno-to-name.h" + +const char *errno_to_name(int id) { + + if (id < 0) + id = -id; + + if ((size_t) id >= ELEMENTSOF(errno_names)) + return NULL; + + return errno_names[id]; +} + +int errno_from_name(const char *name) { + const struct errno_name *sc; + + assert(name); + + sc = lookup_errno(name, strlen(name)); + if (!sc) + return -EINVAL; + + assert(sc->id > 0); + return sc->id; +} diff --git a/src/basic/errno-list.h b/src/basic/errno-list.h new file mode 100644 index 0000000..082b833 --- /dev/null +++ b/src/basic/errno-list.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +/* + * MAX_ERRNO is defined as 4095 in linux/err.h + * We use the same value here. 
+ */ +#define ERRNO_MAX 4095 + +const char *errno_to_name(int id); +int errno_from_name(const char *name); +static inline bool errno_is_valid(int n) { + return n > 0 && n <= ERRNO_MAX; +} diff --git a/src/basic/errno-to-name.awk b/src/basic/errno-to-name.awk new file mode 100644 index 0000000..8442124 --- /dev/null +++ b/src/basic/errno-to-name.awk @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "static const char* const errno_names[] = { " +} +!/(EDEADLOCK|EWOULDBLOCK|ENOTSUP)/ { + printf " [%s] = \"%s\",\n", $1, $1 +} +END{ + print "};" +} diff --git a/src/basic/errno-util.h b/src/basic/errno-util.h new file mode 100644 index 0000000..27804e6 --- /dev/null +++ b/src/basic/errno-util.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" + +/* strerror(3) says that glibc uses a maximum length of 1024 bytes. */ +#define ERRNO_BUF_LEN 1024 + +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks + * + * Note that we use the GNU variant of strerror_r() here. */ +#define STRERROR(errnum) strerror_r(abs(errnum), (char[ERRNO_BUF_LEN]){}, ERRNO_BUF_LEN) + +/* A helper to print an error message or message for functions that return 0 on EOF. + * Note that we can't use ({ … }) to define a temporary variable, so errnum is + * evaluated twice. */ +#define STRERROR_OR_EOF(errnum) ((errnum) != 0 ? STRERROR(errnum) : "Unexpected EOF") + +static inline void _reset_errno_(int *saved_errno) { + if (*saved_errno < 0) /* Invalidated by UNPROTECT_ERRNO? 
*/ + return; + + errno = *saved_errno; +} + +#define PROTECT_ERRNO \ + _cleanup_(_reset_errno_) _unused_ int _saved_errno_ = errno + +#define UNPROTECT_ERRNO \ + do { \ + errno = _saved_errno_; \ + _saved_errno_ = -1; \ + } while (false) + +#define LOCAL_ERRNO(value) \ + PROTECT_ERRNO; \ + errno = abs(value) + +static inline int negative_errno(void) { + /* This helper should be used to shut up gcc if you know 'errno' is + * negative. Instead of "return -errno;", use "return negative_errno();" + * It will suppress bogus gcc warnings in case it assumes 'errno' might + * be 0 and thus the caller's error-handling might not be triggered. */ + assert_return(errno > 0, -EINVAL); + return -errno; +} + +static inline int RET_NERRNO(int ret) { + + /* Helper to wrap system calls in to make them return negative errno errors. This brings system call + * error handling in sync with how we usually handle errors in our own code, i.e. with immediate + * returning of negative errno. Usage is like this: + * + * … + * r = RET_NERRNO(unlink(t)); + * … + * + * or + * + * … + * fd = RET_NERRNO(open("/etc/fstab", O_RDONLY|O_CLOEXEC)); + * … + */ + + if (ret < 0) + return negative_errno(); + + return ret; +} + +/* Collect possible errors in , so that the first error can be returned. + * Returns (possibly updated) . */ +#define RET_GATHER(acc, err) \ + ({ \ + int *__a = &(acc), __e = (err); \ + if (*__a >= 0 && __e < 0) \ + *__a = __e; \ + *__a; \ + }) + +static inline int errno_or_else(int fallback) { + /* To be used when invoking library calls where errno handling is not defined clearly: we return + * errno if it is set, and the specified error otherwise. The idea is that the caller initializes + * errno to zero before doing an API call, and then uses this helper to retrieve a somewhat useful + * error code */ + if (errno > 0) + return -errno; + + return -abs(fallback); +} + +/* abs(3) says: Trying to take the absolute value of the most negative integer is not defined. 
*/ +#define _DEFINE_ABS_WRAPPER(name) \ + static inline bool ERRNO_IS_##name(intmax_t r) { \ + if (r == INTMAX_MIN) \ + return false; \ + return ERRNO_IS_NEG_##name(-imaxabs(r)); \ + } + +assert_cc(INT_MAX <= INTMAX_MAX); + +/* For send()/recv() or read()/write(). */ +static inline bool ERRNO_IS_NEG_TRANSIENT(intmax_t r) { + return IN_SET(r, + -EAGAIN, + -EINTR); +} +_DEFINE_ABS_WRAPPER(TRANSIENT); + +/* Hint #1: ENETUNREACH happens if we try to connect to "non-existing" special IP addresses, such as ::5. + * + * Hint #2: The kernel sends e.g., EHOSTUNREACH or ENONET to userspace in some ICMP error cases. See the + * icmp_err_convert[] in net/ipv4/icmp.c in the kernel sources. + * + * Hint #3: When asynchronous connect() on TCP fails because the host never acknowledges a single packet, + * kernel tells us that with ETIMEDOUT, see tcp(7). */ +static inline bool ERRNO_IS_NEG_DISCONNECT(intmax_t r) { + return IN_SET(r, + -ECONNABORTED, + -ECONNREFUSED, + -ECONNRESET, + -EHOSTDOWN, + -EHOSTUNREACH, + -ENETDOWN, + -ENETRESET, + -ENETUNREACH, + -ENONET, + -ENOPROTOOPT, + -ENOTCONN, + -EPIPE, + -EPROTO, + -ESHUTDOWN, + -ETIMEDOUT); +} +_DEFINE_ABS_WRAPPER(DISCONNECT); + +/* Transient errors we might get on accept() that we should ignore. As per error handling comment in + * the accept(2) man page. 
*/ +static inline bool ERRNO_IS_NEG_ACCEPT_AGAIN(intmax_t r) { + return ERRNO_IS_NEG_DISCONNECT(r) || + ERRNO_IS_NEG_TRANSIENT(r) || + r == -EOPNOTSUPP; +} +_DEFINE_ABS_WRAPPER(ACCEPT_AGAIN); + +/* Resource exhaustion, could be our fault or general system trouble */ +static inline bool ERRNO_IS_NEG_RESOURCE(intmax_t r) { + return IN_SET(r, + -EMFILE, + -ENFILE, + -ENOMEM); +} +_DEFINE_ABS_WRAPPER(RESOURCE); + +/* Seven different errors for "operation/system call/ioctl/socket feature not supported" */ +static inline bool ERRNO_IS_NEG_NOT_SUPPORTED(intmax_t r) { + return IN_SET(r, + -EOPNOTSUPP, + -ENOTTY, + -ENOSYS, + -EAFNOSUPPORT, + -EPFNOSUPPORT, + -EPROTONOSUPPORT, + -ESOCKTNOSUPPORT); +} +_DEFINE_ABS_WRAPPER(NOT_SUPPORTED); + +/* Two different errors for access problems */ +static inline bool ERRNO_IS_NEG_PRIVILEGE(intmax_t r) { + return IN_SET(r, + -EACCES, + -EPERM); +} +_DEFINE_ABS_WRAPPER(PRIVILEGE); + +/* Three different errors for "not enough disk space" */ +static inline bool ERRNO_IS_NEG_DISK_SPACE(intmax_t r) { + return IN_SET(r, + -ENOSPC, + -EDQUOT, + -EFBIG); +} +_DEFINE_ABS_WRAPPER(DISK_SPACE); + +/* Three different errors for "this device does not quite exist" */ +static inline bool ERRNO_IS_NEG_DEVICE_ABSENT(intmax_t r) { + return IN_SET(r, + -ENODEV, + -ENXIO, + -ENOENT); +} +_DEFINE_ABS_WRAPPER(DEVICE_ABSENT); + +/* Quite often we want to handle cases where the backing FS doesn't support extended attributes at all and + * where it simply doesn't have the requested xattr the same way */ +static inline bool ERRNO_IS_NEG_XATTR_ABSENT(intmax_t r) { + return r == -ENODATA || + ERRNO_IS_NEG_NOT_SUPPORTED(r); +} +_DEFINE_ABS_WRAPPER(XATTR_ABSENT); diff --git a/src/basic/escape.c b/src/basic/escape.c new file mode 100644 index 0000000..75a1d68 --- /dev/null +++ b/src/basic/escape.c @@ -0,0 +1,576 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "escape.h" +#include "hexdecoct.h" 
+#include "macro.h" +#include "strv.h" +#include "utf8.h" + +int cescape_char(char c, char *buf) { + char *buf_old = buf; + + /* Needs space for 4 characters in the buffer */ + + switch (c) { + + case '\a': + *(buf++) = '\\'; + *(buf++) = 'a'; + break; + case '\b': + *(buf++) = '\\'; + *(buf++) = 'b'; + break; + case '\f': + *(buf++) = '\\'; + *(buf++) = 'f'; + break; + case '\n': + *(buf++) = '\\'; + *(buf++) = 'n'; + break; + case '\r': + *(buf++) = '\\'; + *(buf++) = 'r'; + break; + case '\t': + *(buf++) = '\\'; + *(buf++) = 't'; + break; + case '\v': + *(buf++) = '\\'; + *(buf++) = 'v'; + break; + case '\\': + *(buf++) = '\\'; + *(buf++) = '\\'; + break; + case '"': + *(buf++) = '\\'; + *(buf++) = '"'; + break; + case '\'': + *(buf++) = '\\'; + *(buf++) = '\''; + break; + + default: + /* For special chars we prefer octal over + * hexadecimal encoding, simply because glib's + * g_strescape() does the same */ + if ((c < ' ') || (c >= 127)) { + *(buf++) = '\\'; + *(buf++) = octchar((unsigned char) c >> 6); + *(buf++) = octchar((unsigned char) c >> 3); + *(buf++) = octchar((unsigned char) c); + } else + *(buf++) = c; + break; + } + + return buf - buf_old; +} + +char* cescape_length(const char *s, size_t n) { + const char *f; + char *r, *t; + + assert(s || n == 0); + + /* Does C style string escaping. May be reversed with + * cunescape(). */ + + r = new(char, n*4 + 1); + if (!r) + return NULL; + + for (f = s, t = r; f < s + n; f++) + t += cescape_char(*f, t); + + *t = 0; + + return r; +} + +char* cescape(const char *s) { + assert(s); + + return cescape_length(s, strlen(s)); +} + +int cunescape_one(const char *p, size_t length, char32_t *ret, bool *eight_bit, bool accept_nul) { + int r = 1; + + assert(p); + assert(ret); + + /* Unescapes C style. Returns the unescaped character in ret. + * Sets *eight_bit to true if the escaped sequence either fits in + * one byte in UTF-8 or is a non-unicode literal byte and should + * instead be copied directly. 
+ */ + + if (length != SIZE_MAX && length < 1) + return -EINVAL; + + switch (p[0]) { + + case 'a': + *ret = '\a'; + break; + case 'b': + *ret = '\b'; + break; + case 'f': + *ret = '\f'; + break; + case 'n': + *ret = '\n'; + break; + case 'r': + *ret = '\r'; + break; + case 't': + *ret = '\t'; + break; + case 'v': + *ret = '\v'; + break; + case '\\': + *ret = '\\'; + break; + case '"': + *ret = '"'; + break; + case '\'': + *ret = '\''; + break; + + case 's': + /* This is an extension of the XDG syntax files */ + *ret = ' '; + break; + + case 'x': { + /* hexadecimal encoding */ + int a, b; + + if (length != SIZE_MAX && length < 3) + return -EINVAL; + + a = unhexchar(p[1]); + if (a < 0) + return -EINVAL; + + b = unhexchar(p[2]); + if (b < 0) + return -EINVAL; + + /* Don't allow NUL bytes */ + if (a == 0 && b == 0 && !accept_nul) + return -EINVAL; + + *ret = (a << 4U) | b; + *eight_bit = true; + r = 3; + break; + } + + case 'u': { + /* C++11 style 16-bit unicode */ + + int a[4]; + size_t i; + uint32_t c; + + if (length != SIZE_MAX && length < 5) + return -EINVAL; + + for (i = 0; i < 4; i++) { + a[i] = unhexchar(p[1 + i]); + if (a[i] < 0) + return a[i]; + } + + c = ((uint32_t) a[0] << 12U) | ((uint32_t) a[1] << 8U) | ((uint32_t) a[2] << 4U) | (uint32_t) a[3]; + + /* Don't allow 0 chars */ + if (c == 0 && !accept_nul) + return -EINVAL; + + *ret = c; + r = 5; + break; + } + + case 'U': { + /* C++11 style 32-bit unicode */ + + int a[8]; + size_t i; + char32_t c; + + if (length != SIZE_MAX && length < 9) + return -EINVAL; + + for (i = 0; i < 8; i++) { + a[i] = unhexchar(p[1 + i]); + if (a[i] < 0) + return a[i]; + } + + c = ((uint32_t) a[0] << 28U) | ((uint32_t) a[1] << 24U) | ((uint32_t) a[2] << 20U) | ((uint32_t) a[3] << 16U) | + ((uint32_t) a[4] << 12U) | ((uint32_t) a[5] << 8U) | ((uint32_t) a[6] << 4U) | (uint32_t) a[7]; + + /* Don't allow 0 chars */ + if (c == 0 && !accept_nul) + return -EINVAL; + + /* Don't allow invalid code points */ + if (!unichar_is_valid(c)) + 
return -EINVAL; + + *ret = c; + r = 9; + break; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + /* octal encoding */ + int a, b, c; + char32_t m; + + if (length != SIZE_MAX && length < 3) + return -EINVAL; + + a = unoctchar(p[0]); + if (a < 0) + return -EINVAL; + + b = unoctchar(p[1]); + if (b < 0) + return -EINVAL; + + c = unoctchar(p[2]); + if (c < 0) + return -EINVAL; + + /* don't allow NUL bytes */ + if (a == 0 && b == 0 && c == 0 && !accept_nul) + return -EINVAL; + + /* Don't allow bytes above 255 */ + m = ((uint32_t) a << 6U) | ((uint32_t) b << 3U) | (uint32_t) c; + if (m > 255) + return -EINVAL; + + *ret = m; + *eight_bit = true; + r = 3; + break; + } + + default: + return -EINVAL; + } + + return r; +} + +ssize_t cunescape_length_with_prefix(const char *s, size_t length, const char *prefix, UnescapeFlags flags, char **ret) { + _cleanup_free_ char *ans = NULL; + char *t; + const char *f; + size_t pl; + int r; + + assert(s); + assert(ret); + + /* Undoes C style string escaping, and optionally prefixes it. */ + + pl = strlen_ptr(prefix); + + ans = new(char, pl+length+1); + if (!ans) + return -ENOMEM; + + if (prefix) + memcpy(ans, prefix, pl); + + for (f = s, t = ans + pl; f < s + length; f++) { + size_t remaining; + bool eight_bit = false; + char32_t u; + + remaining = s + length - f; + assert(remaining > 0); + + if (*f != '\\') { + /* A literal, copy verbatim */ + *(t++) = *f; + continue; + } + + if (remaining == 1) { + if (flags & UNESCAPE_RELAX) { + /* A trailing backslash, copy verbatim */ + *(t++) = *f; + continue; + } + + return -EINVAL; + } + + r = cunescape_one(f + 1, remaining - 1, &u, &eight_bit, flags & UNESCAPE_ACCEPT_NUL); + if (r < 0) { + if (flags & UNESCAPE_RELAX) { + /* Invalid escape code, let's take it literal then */ + *(t++) = '\\'; + continue; + } + + return r; + } + + f += r; + if (eight_bit) + /* One byte? 
Set directly as specified */ + *(t++) = u; + else + /* Otherwise encode as multi-byte UTF-8 */ + t += utf8_encode_unichar(t, u); + } + + *t = 0; + + assert(t >= ans); /* Let static analyzers know that the answer is non-negative. */ + *ret = TAKE_PTR(ans); + return t - *ret; +} + +char* xescape_full(const char *s, const char *bad, size_t console_width, XEscapeFlags flags) { + char *ans, *t, *prev, *prev2; + const char *f; + + /* Escapes all chars in bad, in addition to \ and all special chars, in \xFF style escaping. May be + * reversed with cunescape(). If XESCAPE_8_BIT is specified, characters >= 127 are let through + * unchanged. This corresponds to non-ASCII printable characters in pre-unicode encodings. + * + * If console_width is reached, or XESCAPE_FORCE_ELLIPSIS is set, output is truncated and "..." is + * appended. */ + + if (console_width == 0) + return strdup(""); + + ans = new(char, MIN(strlen(s), console_width) * 4 + 1); + if (!ans) + return NULL; + + memset(ans, '_', MIN(strlen(s), console_width) * 4); + ans[MIN(strlen(s), console_width) * 4] = 0; + + bool force_ellipsis = FLAGS_SET(flags, XESCAPE_FORCE_ELLIPSIS); + + for (f = s, t = prev = prev2 = ans; ; f++) { + char *tmp_t = t; + + if (!*f) { + if (force_ellipsis) + break; + + *t = 0; + return ans; + } + + if ((unsigned char) *f < ' ' || + (!FLAGS_SET(flags, XESCAPE_8_BIT) && (unsigned char) *f >= 127) || + *f == '\\' || strchr(bad, *f)) { + if ((size_t) (t - ans) + 4 + 3 * force_ellipsis > console_width) + break; + + *(t++) = '\\'; + *(t++) = 'x'; + *(t++) = hexchar(*f >> 4); + *(t++) = hexchar(*f); + } else { + if ((size_t) (t - ans) + 1 + 3 * force_ellipsis > console_width) + break; + + *(t++) = *f; + } + + /* We might need to go back two cycles to fit three dots, so remember two positions */ + prev2 = prev; + prev = tmp_t; + } + + /* We can just write where we want, since chars are one-byte */ + size_t c = MIN(console_width, 3u); /* If the console is too narrow, write fewer dots */ + size_t off; 
+ if (console_width - c >= (size_t) (t - ans)) + off = (size_t) (t - ans); + else if (console_width - c >= (size_t) (prev - ans)) + off = (size_t) (prev - ans); + else if (console_width - c >= (size_t) (prev2 - ans)) + off = (size_t) (prev2 - ans); + else + off = console_width - c; + assert(off <= (size_t) (t - ans)); + + memcpy(ans + off, "...", c); + ans[off + c] = '\0'; + return ans; +} + +char* escape_non_printable_full(const char *str, size_t console_width, XEscapeFlags flags) { + if (FLAGS_SET(flags, XESCAPE_8_BIT)) + return xescape_full(str, "", console_width, flags); + else + return utf8_escape_non_printable_full(str, + console_width, + FLAGS_SET(flags, XESCAPE_FORCE_ELLIPSIS)); +} + +char* octescape(const char *s, size_t len) { + char *buf, *t; + + /* Escapes all chars in bad, in addition to \ and " chars, in \nnn style escaping. */ + + assert(s || len == 0); + + t = buf = new(char, len * 4 + 1); + if (!buf) + return NULL; + + for (size_t i = 0; i < len; i++) { + uint8_t u = (uint8_t) s[i]; + + if (u < ' ' || u >= 127 || IN_SET(u, '\\', '"')) { + *(t++) = '\\'; + *(t++) = '0' + (u >> 6); + *(t++) = '0' + ((u >> 3) & 7); + *(t++) = '0' + (u & 7); + } else + *(t++) = u; + } + + *t = 0; + return buf; +} + +static char* strcpy_backslash_escaped(char *t, const char *s, const char *bad) { + assert(bad); + assert(t); + assert(s); + + while (*s) { + int l = utf8_encoded_valid_unichar(s, SIZE_MAX); + + if (char_is_cc(*s) || l < 0) + t += cescape_char(*(s++), t); + else if (l == 1) { + if (*s == '\\' || strchr(bad, *s)) + *(t++) = '\\'; + *(t++) = *(s++); + } else { + t = mempcpy(t, s, l); + s += l; + } + } + + return t; +} + +char* shell_escape(const char *s, const char *bad) { + char *buf, *t; + + buf = new(char, strlen(s)*4+1); + if (!buf) + return NULL; + + t = strcpy_backslash_escaped(buf, s, bad); + *t = 0; + + return buf; +} + +char* shell_maybe_quote(const char *s, ShellEscapeFlags flags) { + const char *p; + char *buf, *t; + + assert(s); + + /* Encloses a 
string in quotes if necessary to make it OK as a shell string. */ + + if (FLAGS_SET(flags, SHELL_ESCAPE_EMPTY) && isempty(s)) + return strdup("\"\""); /* We don't use $'' here in the POSIX mode. "" is fine too. */ + + for (p = s; *p; ) { + int l = utf8_encoded_valid_unichar(p, SIZE_MAX); + + if (char_is_cc(*p) || l < 0 || + strchr(WHITESPACE SHELL_NEED_QUOTES, *p)) + break; + + p += l; + } + + if (!*p) + return strdup(s); + + buf = new(char, FLAGS_SET(flags, SHELL_ESCAPE_POSIX) + 1 + strlen(s)*4 + 1 + 1); + if (!buf) + return NULL; + + t = buf; + if (FLAGS_SET(flags, SHELL_ESCAPE_POSIX)) { + *(t++) = '$'; + *(t++) = '\''; + } else + *(t++) = '"'; + + t = mempcpy(t, s, p - s); + + t = strcpy_backslash_escaped(t, p, + FLAGS_SET(flags, SHELL_ESCAPE_POSIX) ? SHELL_NEED_ESCAPE_POSIX : SHELL_NEED_ESCAPE); + + if (FLAGS_SET(flags, SHELL_ESCAPE_POSIX)) + *(t++) = '\''; + else + *(t++) = '"'; + *t = 0; + + return str_realloc(buf); +} + +char* quote_command_line(char **argv, ShellEscapeFlags flags) { + _cleanup_free_ char *result = NULL; + + assert(argv); + + STRV_FOREACH(a, argv) { + _cleanup_free_ char *t = NULL; + + t = shell_maybe_quote(*a, flags); + if (!t) + return NULL; + + if (!strextend_with_separator(&result, " ", t)) + return NULL; + } + + return str_realloc(TAKE_PTR(result)); +} diff --git a/src/basic/escape.h b/src/basic/escape.h new file mode 100644 index 0000000..318da6f --- /dev/null +++ b/src/basic/escape.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "string-util.h" +#include "missing_type.h" + +/* What characters are special in the shell? */ +/* must be escaped outside and inside double-quotes */ +#define SHELL_NEED_ESCAPE "\"\\`$" + +/* Those that can be escaped or double-quoted. + * + * Strictly speaking, ! does not need to be escaped, except in interactive + * mode, but let's be extra nice to the user and quote ! 
in case this + * output is ever used in interactive mode. */ +#define SHELL_NEED_QUOTES SHELL_NEED_ESCAPE GLOB_CHARS "'()<>|&;!" + +/* Note that we assume control characters would need to be escaped too in + * addition to the "special" characters listed here, if they appear in the + * string. Current users disallow control characters. Also '"' shall not + * be escaped. + */ +#define SHELL_NEED_ESCAPE_POSIX "\\\'" + +typedef enum UnescapeFlags { + UNESCAPE_RELAX = 1 << 0, + UNESCAPE_ACCEPT_NUL = 1 << 1, +} UnescapeFlags; + +typedef enum ShellEscapeFlags { + /* The default is to add shell quotes ("") so the shell will consider this a single argument. + * Tabs and newlines are escaped. */ + + SHELL_ESCAPE_POSIX = 1 << 1, /* Use POSIX shell escape syntax (a string enclosed in $'') instead of plain quotes. */ + SHELL_ESCAPE_EMPTY = 1 << 2, /* Format empty arguments as "". */ +} ShellEscapeFlags; + +char* cescape(const char *s); +char* cescape_length(const char *s, size_t n); +int cescape_char(char c, char *buf); + +int cunescape_one(const char *p, size_t length, char32_t *ret, bool *eight_bit, bool accept_nul); + +ssize_t cunescape_length_with_prefix(const char *s, size_t length, const char *prefix, UnescapeFlags flags, char **ret); +static inline ssize_t cunescape_length(const char *s, size_t length, UnescapeFlags flags, char **ret) { + return cunescape_length_with_prefix(s, length, NULL, flags, ret); +} +static inline ssize_t cunescape(const char *s, UnescapeFlags flags, char **ret) { + return cunescape_length(s, strlen(s), flags, ret); +} + +typedef enum XEscapeFlags { + XESCAPE_8_BIT = 1 << 0, + XESCAPE_FORCE_ELLIPSIS = 1 << 1, +} XEscapeFlags; + +char* xescape_full(const char *s, const char *bad, size_t console_width, XEscapeFlags flags); +static inline char* xescape(const char *s, const char *bad) { + return xescape_full(s, bad, SIZE_MAX, 0); +} +char* octescape(const char *s, size_t len); +char* escape_non_printable_full(const char *str, size_t console_width, 
XEscapeFlags flags); + +char* shell_escape(const char *s, const char *bad); +char* shell_maybe_quote(const char *s, ShellEscapeFlags flags); +char* quote_command_line(char **argv, ShellEscapeFlags flags); diff --git a/src/basic/ether-addr-util.c b/src/basic/ether-addr-util.c new file mode 100644 index 0000000..0a6a54f --- /dev/null +++ b/src/basic/ether-addr-util.c @@ -0,0 +1,272 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "ether-addr-util.h" +#include "hexdecoct.h" +#include "macro.h" +#include "string-util.h" + +char *hw_addr_to_string_full( + const struct hw_addr_data *addr, + HardwareAddressToStringFlags flags, + char buffer[static HW_ADDR_TO_STRING_MAX]) { + + assert(addr); + assert(buffer); + assert(addr->length <= HW_ADDR_MAX_SIZE); + + for (size_t i = 0, j = 0; i < addr->length; i++) { + buffer[j++] = hexchar(addr->bytes[i] >> 4); + buffer[j++] = hexchar(addr->bytes[i] & 0x0f); + if (!FLAGS_SET(flags, HW_ADDR_TO_STRING_NO_COLON)) + buffer[j++] = ':'; + } + + buffer[addr->length == 0 || FLAGS_SET(flags, HW_ADDR_TO_STRING_NO_COLON) ? 
+ addr->length * 2 : + addr->length * 3 - 1] = '\0'; + return buffer; +} + +struct hw_addr_data *hw_addr_set(struct hw_addr_data *addr, const uint8_t *bytes, size_t length) { + assert(addr); + assert(length <= HW_ADDR_MAX_SIZE); + + addr->length = length; + memcpy_safe(addr->bytes, bytes, length); + return addr; +} + +int hw_addr_compare(const struct hw_addr_data *a, const struct hw_addr_data *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->length, b->length); + if (r != 0) + return r; + + return memcmp(a->bytes, b->bytes, a->length); +} + +void hw_addr_hash_func(const struct hw_addr_data *p, struct siphash *state) { + assert(p); + assert(state); + + siphash24_compress(&p->length, sizeof(p->length), state); + siphash24_compress(p->bytes, p->length, state); +} + +DEFINE_HASH_OPS(hw_addr_hash_ops, struct hw_addr_data, hw_addr_hash_func, hw_addr_compare); +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(hw_addr_hash_ops_free, struct hw_addr_data, hw_addr_hash_func, hw_addr_compare, free); + +char* ether_addr_to_string(const struct ether_addr *addr, char buffer[ETHER_ADDR_TO_STRING_MAX]) { + assert(addr); + assert(buffer); + + /* Like ether_ntoa() but uses %02x instead of %x to print + * ethernet addresses, which makes them look less funny. Also, + * doesn't use a static buffer. 
*/ + + sprintf(buffer, "%02x:%02x:%02x:%02x:%02x:%02x", + addr->ether_addr_octet[0], + addr->ether_addr_octet[1], + addr->ether_addr_octet[2], + addr->ether_addr_octet[3], + addr->ether_addr_octet[4], + addr->ether_addr_octet[5]); + + return buffer; +} + +int ether_addr_to_string_alloc(const struct ether_addr *addr, char **ret) { + char *buf; + + assert(addr); + assert(ret); + + buf = new(char, ETHER_ADDR_TO_STRING_MAX); + if (!buf) + return -ENOMEM; + + ether_addr_to_string(addr, buf); + + *ret = buf; + return 0; +} + +int ether_addr_compare(const struct ether_addr *a, const struct ether_addr *b) { + return memcmp(a, b, ETH_ALEN); +} + +static void ether_addr_hash_func(const struct ether_addr *p, struct siphash *state) { + siphash24_compress(p, sizeof(struct ether_addr), state); +} + +DEFINE_HASH_OPS(ether_addr_hash_ops, struct ether_addr, ether_addr_hash_func, ether_addr_compare); +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(ether_addr_hash_ops_free, struct ether_addr, ether_addr_hash_func, ether_addr_compare, free); + +static int parse_hw_addr_one_field(const char **s, char sep, size_t len, uint8_t *buf) { + const char *hex = HEXDIGITS, *p; + uint16_t data = 0; + bool cont; + + assert(s); + assert(*s); + assert(IN_SET(len, 1, 2)); + assert(buf); + + p = *s; + + for (size_t i = 0; i < len * 2; i++) { + const char *hexoff; + size_t x; + + if (*p == '\0' || *p == sep) { + if (i == 0) + return -EINVAL; + break; + } + + hexoff = strchr(hex, *p); + if (!hexoff) + return -EINVAL; + + assert(hexoff >= hex); + x = hexoff - hex; + if (x >= 16) + x -= 6; /* A-F */ + + assert(x < 16); + data <<= 4; + data += x; + + p++; + } + + if (*p != '\0' && *p != sep) + return -EINVAL; + + switch (len) { + case 1: + buf[0] = data; + break; + case 2: + buf[0] = (data & 0xff00) >> 8; + buf[1] = data & 0xff; + break; + default: + assert_not_reached(); + } + + cont = *p == sep; + *s = p + cont; + return cont; +} + +int parse_hw_addr_full(const char *s, size_t expected_len, struct hw_addr_data 
*ret) { + size_t field_size, max_len, len = 0; + uint8_t bytes[HW_ADDR_MAX_SIZE]; + char sep; + int r; + + assert(s); + assert(expected_len <= HW_ADDR_MAX_SIZE || expected_len == SIZE_MAX); + assert(ret); + + /* This accepts the following formats: + * + * Dot separated 2 bytes format: xxyy.zzaa.bbcc + * Colon separated 1 bytes format: xx:yy:zz:aa:bb:cc + * Hyphen separated 1 bytes format: xx-yy-zz-aa-bb-cc + * + * Moreover, if expected_len == 0, 4, or 16, this also accepts: + * + * IPv4 format: used by IPv4 tunnel, e.g. ipgre + * IPv6 format: used by IPv6 tunnel, e.g. ip6gre + * + * The expected_len argument controls the length of acceptable addresses: + * + * 0: accepts 4 (AF_INET), 16 (AF_INET6), 6 (ETH_ALEN), or 20 (INFINIBAND_ALEN). + * SIZE_MAX: accepts arbitrary length, but at least one separator must be included. + * Otherwise: accepts addresses with matching length. + */ + + if (IN_SET(expected_len, 0, sizeof(struct in_addr), sizeof(struct in6_addr))) { + union in_addr_union a; + int family; + + if (expected_len == 0) + r = in_addr_from_string_auto(s, &family, &a); + else { + family = expected_len == sizeof(struct in_addr) ? AF_INET : AF_INET6; + r = in_addr_from_string(family, s, &a); + } + if (r >= 0) { + ret->length = FAMILY_ADDRESS_SIZE(family); + memcpy(ret->bytes, a.bytes, ret->length); + return 0; + } + } + + max_len = + expected_len == 0 ? INFINIBAND_ALEN : + expected_len == SIZE_MAX ? 
HW_ADDR_MAX_SIZE : expected_len; + sep = s[strspn(s, HEXDIGITS)]; + + if (sep == '.') + field_size = 2; + else if (IN_SET(sep, ':', '-')) + field_size = 1; + else + return -EINVAL; + + if (max_len % field_size != 0) + return -EINVAL; + + for (size_t i = 0; i < max_len / field_size; i++) { + r = parse_hw_addr_one_field(&s, sep, field_size, bytes + i * field_size); + if (r < 0) + return r; + if (r == 0) { + len = (i + 1) * field_size; + break; + } + } + + if (len == 0) + return -EINVAL; + + if (expected_len == 0) { + if (!IN_SET(len, 4, 16, ETH_ALEN, INFINIBAND_ALEN)) + return -EINVAL; + } else if (expected_len != SIZE_MAX) { + if (len != expected_len) + return -EINVAL; + } + + ret->length = len; + memcpy(ret->bytes, bytes, ret->length); + return 0; +} + +int parse_ether_addr(const char *s, struct ether_addr *ret) { + struct hw_addr_data a; + int r; + + assert(s); + assert(ret); + + r = parse_hw_addr_full(s, ETH_ALEN, &a); + if (r < 0) + return r; + + *ret = a.ether; + return 0; +} diff --git a/src/basic/ether-addr-util.h b/src/basic/ether-addr-util.h new file mode 100644 index 0000000..83ed77d --- /dev/null +++ b/src/basic/ether-addr-util.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "hash-funcs.h" +#include "in-addr-util.h" +#include "macro.h" +#include "memory-util.h" + +/* This is MAX_ADDR_LEN as defined in linux/netdevice.h, but net/if_arp.h + * defines a macro of the same name with a much lower size. 
*/ +#define HW_ADDR_MAX_SIZE 32 + +struct hw_addr_data { + size_t length; + union { + struct ether_addr ether; + uint8_t infiniband[INFINIBAND_ALEN]; + struct in_addr in; + struct in6_addr in6; + uint8_t bytes[HW_ADDR_MAX_SIZE]; + }; +}; + +int parse_hw_addr_full(const char *s, size_t expected_len, struct hw_addr_data *ret); +static inline int parse_hw_addr(const char *s, struct hw_addr_data *ret) { + return parse_hw_addr_full(s, 0, ret); +} +int parse_ether_addr(const char *s, struct ether_addr *ret); + +typedef enum HardwareAddressToStringFlags { + HW_ADDR_TO_STRING_NO_COLON = 1 << 0, +} HardwareAddressToStringFlags; + +#define HW_ADDR_TO_STRING_MAX (3*HW_ADDR_MAX_SIZE) +char *hw_addr_to_string_full( + const struct hw_addr_data *addr, + HardwareAddressToStringFlags flags, + char buffer[static HW_ADDR_TO_STRING_MAX]); +static inline char *hw_addr_to_string(const struct hw_addr_data *addr, char buffer[static HW_ADDR_TO_STRING_MAX]) { + return hw_addr_to_string_full(addr, 0, buffer); +} + +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define HW_ADDR_TO_STR_FULL(hw_addr, flags) hw_addr_to_string_full((hw_addr), flags, (char[HW_ADDR_TO_STRING_MAX]){}) +#define HW_ADDR_TO_STR(hw_addr) HW_ADDR_TO_STR_FULL(hw_addr, 0) + +#define HW_ADDR_NULL ((const struct hw_addr_data){}) + +struct hw_addr_data *hw_addr_set(struct hw_addr_data *addr, const uint8_t *bytes, size_t length); + +void hw_addr_hash_func(const struct hw_addr_data *p, struct siphash *state); +int hw_addr_compare(const struct hw_addr_data *a, const struct hw_addr_data *b); +static inline bool hw_addr_equal(const struct hw_addr_data *a, const struct hw_addr_data *b) { + return hw_addr_compare(a, b) == 0; +} +static inline bool hw_addr_is_null(const struct hw_addr_data *addr) { + assert(addr); + return addr->length == 0 || memeqzero(addr->bytes, addr->length); 
+} + +extern const struct hash_ops hw_addr_hash_ops; +extern const struct hash_ops hw_addr_hash_ops_free; + +#define ETHER_ADDR_FORMAT_STR "%02X%02X%02X%02X%02X%02X" +#define ETHER_ADDR_FORMAT_VAL(x) (x).ether_addr_octet[0], (x).ether_addr_octet[1], (x).ether_addr_octet[2], (x).ether_addr_octet[3], (x).ether_addr_octet[4], (x).ether_addr_octet[5] + +#define ETHER_ADDR_TO_STRING_MAX (3*6) +char* ether_addr_to_string(const struct ether_addr *addr, char buffer[ETHER_ADDR_TO_STRING_MAX]); +int ether_addr_to_string_alloc(const struct ether_addr *addr, char **ret); +/* Use only as function argument, never stand-alone! */ +#define ETHER_ADDR_TO_STR(addr) ether_addr_to_string((addr), (char[ETHER_ADDR_TO_STRING_MAX]){}) + +int ether_addr_compare(const struct ether_addr *a, const struct ether_addr *b); +static inline bool ether_addr_equal(const struct ether_addr *a, const struct ether_addr *b) { + return ether_addr_compare(a, b) == 0; +} + +#define ETHER_ADDR_NULL ((const struct ether_addr){}) + +static inline bool ether_addr_is_null(const struct ether_addr *addr) { + return ether_addr_equal(addr, &ETHER_ADDR_NULL); /* i.e. equal to the zero-initialized address */ +} + +static inline bool ether_addr_is_broadcast(const struct ether_addr *addr) { + assert(addr); + return memeqbyte(0xff, addr->ether_addr_octet, ETH_ALEN); +} + +static inline bool ether_addr_is_multicast(const struct ether_addr *addr) { + assert(addr); + return FLAGS_SET(addr->ether_addr_octet[0], 0x01); +} + +static inline bool ether_addr_is_unicast(const struct ether_addr *addr) { + return !ether_addr_is_multicast(addr); +} + +static inline bool ether_addr_is_local(const struct ether_addr *addr) { + /* Determine if the Ethernet address is a locally-assigned one (IEEE 802) */ + assert(addr); + return FLAGS_SET(addr->ether_addr_octet[0], 0x02); +} + +static inline bool ether_addr_is_global(const struct ether_addr *addr) { + return !ether_addr_is_local(addr); +} + +extern const struct hash_ops ether_addr_hash_ops; +extern const struct hash_ops
ether_addr_hash_ops_free; diff --git a/src/basic/extract-word.c b/src/basic/extract-word.c new file mode 100644 index 0000000..160f771 --- /dev/null +++ b/src/basic/extract-word.c @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "escape.h" +#include "extract-word.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +int extract_first_word(const char **p, char **ret, const char *separators, ExtractFlags flags) { + _cleanup_free_ char *s = NULL; + size_t sz = 0; + char quote = 0; /* 0 or ' or " */ + bool backslash = false; /* whether we've just seen a backslash */ + char c; + int r; + + assert(p); + assert(ret); + assert(!FLAGS_SET(flags, EXTRACT_KEEP_QUOTE | EXTRACT_UNQUOTE)); + + /* Bail early if called after last value or with no input */ + if (!*p) + goto finish; + c = **p; + + if (!separators) + separators = WHITESPACE; + + /* Parses the first word of a string, and returns it in + * *ret. Removes all quotes in the process. When parsing fails + * (because of an uneven number of quotes or similar), leaves + * the pointer *p at the first invalid character. */ + + if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) + if (!GREEDY_REALLOC(s, sz+1)) + return -ENOMEM; + + for (;; (*p)++, c = **p) { + if (c == 0) + goto finish_force_terminate; + else if (strchr(separators, c)) { + if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) { + if (!(flags & EXTRACT_RETAIN_SEPARATORS)) + (*p)++; + goto finish_force_next; + } + } else { + /* We found a non-blank character, so we will always + * want to return a string (even if it is empty), + * allocate it here. 
*/ + if (!GREEDY_REALLOC(s, sz+1)) + return -ENOMEM; + break; + } + } + + for (;; (*p)++, c = **p) { + if (backslash) { + if (!GREEDY_REALLOC(s, sz+7)) + return -ENOMEM; + + if (c == 0) { + if ((flags & EXTRACT_UNESCAPE_RELAX) && + (quote == 0 || flags & EXTRACT_RELAX)) { + /* If we find an unquoted trailing backslash and we're in + * EXTRACT_UNESCAPE_RELAX mode, keep it verbatim in the + * output. + * + * Unbalanced quotes will only be allowed in EXTRACT_RELAX + * mode, EXTRACT_UNESCAPE_RELAX mode does not allow them. + */ + s[sz++] = '\\'; + goto finish_force_terminate; + } + if (flags & EXTRACT_RELAX) + goto finish_force_terminate; + return -EINVAL; + } + + if (flags & (EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS)) { + bool eight_bit = false; + char32_t u; + + if ((flags & EXTRACT_CUNESCAPE) && + (r = cunescape_one(*p, SIZE_MAX, &u, &eight_bit, false)) >= 0) { + /* A valid escaped sequence */ + assert(r >= 1); + + (*p) += r - 1; + + if (eight_bit) + s[sz++] = u; + else + sz += utf8_encode_unichar(s + sz, u); + } else if ((flags & EXTRACT_UNESCAPE_SEPARATORS) && + (strchr(separators, **p) || **p == '\\')) + /* An escaped separator char or the escape char itself */ + s[sz++] = c; + else if (flags & EXTRACT_UNESCAPE_RELAX) { + s[sz++] = '\\'; + s[sz++] = c; + } else + return -EINVAL; + } else + s[sz++] = c; + + backslash = false; + + } else if (quote != 0) { /* inside either single or double quotes */ + for (;; (*p)++, c = **p) { + if (c == 0) { + if (flags & EXTRACT_RELAX) + goto finish_force_terminate; + return -EINVAL; + } else if (c == quote) { /* found the end quote */ + quote = 0; + if (flags & EXTRACT_UNQUOTE) + break; + } else if (c == '\\' && !(flags & EXTRACT_RETAIN_ESCAPE)) { + backslash = true; + break; + } + + if (!GREEDY_REALLOC(s, sz+2)) + return -ENOMEM; + + s[sz++] = c; + + if (quote == 0) + break; + } + + } else { + for (;; (*p)++, c = **p) { + if (c == 0) + goto finish_force_terminate; + else if (IN_SET(c, '\'', '"') && (flags & 
(EXTRACT_KEEP_QUOTE | EXTRACT_UNQUOTE))) { + quote = c; + if (flags & EXTRACT_UNQUOTE) + break; + } else if (c == '\\' && !(flags & EXTRACT_RETAIN_ESCAPE)) { + backslash = true; + break; + } else if (strchr(separators, c)) { + if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) { + if (!(flags & EXTRACT_RETAIN_SEPARATORS)) + (*p)++; + goto finish_force_next; + } + if (!(flags & EXTRACT_RETAIN_SEPARATORS)) + /* Skip additional coalesced separators. */ + for (;; (*p)++, c = **p) { + if (c == 0) + goto finish_force_terminate; + if (!strchr(separators, c)) + break; + } + goto finish; + + } + + if (!GREEDY_REALLOC(s, sz+2)) + return -ENOMEM; + + s[sz++] = c; + + if (quote != 0) + break; + } + } + } + +finish_force_terminate: + *p = NULL; +finish: + if (!s) { + *p = NULL; + *ret = NULL; + return 0; + } + +finish_force_next: + s[sz] = 0; + *ret = TAKE_PTR(s); + + return 1; +} + +int extract_first_word_and_warn( + const char **p, + char **ret, + const char *separators, + ExtractFlags flags, + const char *unit, + const char *filename, + unsigned line, + const char *rvalue) { + + /* Try to unquote it, if it fails, warn about it and try again + * but this time using EXTRACT_UNESCAPE_RELAX to keep the + * backslashes verbatim in invalid escape sequences. */ + + const char *save; + int r; + + save = *p; + r = extract_first_word(p, ret, separators, flags); + if (r >= 0) + return r; + + if (r == -EINVAL && !(flags & EXTRACT_UNESCAPE_RELAX)) { + + /* Retry it with EXTRACT_UNESCAPE_RELAX. */ + *p = save; + r = extract_first_word(p, ret, separators, flags|EXTRACT_UNESCAPE_RELAX); + if (r >= 0) { + /* It worked this time, hence it must have been an invalid escape sequence. */ + log_syntax(unit, LOG_WARNING, filename, line, EINVAL, "Ignoring unknown escape sequences: \"%s\"", *ret); + return r; + } + + /* If it's still EINVAL; then it must be unbalanced quoting, report this. 
*/ + if (r == -EINVAL) + return log_syntax(unit, LOG_ERR, filename, line, r, "Unbalanced quoting, ignoring: \"%s\"", rvalue); + } + + /* Can be any error, report it */ + return log_syntax(unit, LOG_ERR, filename, line, r, "Unable to decode word \"%s\", ignoring: %m", rvalue); +} + +/* We pass ExtractFlags as unsigned int (to avoid undefined behaviour when passing + * an object that undergoes default argument promotion as an argument to va_start). + * Let's make sure that ExtractFlags fits into an unsigned int. */ +assert_cc(sizeof(enum ExtractFlags) <= sizeof(unsigned)); + +int extract_many_words(const char **p, const char *separators, unsigned flags, ...) { + va_list ap; + char **l; + int n = 0, i, c, r; + + /* Parses a number of words from a string, stripping any + * quotes if necessary. */ + + assert(p); + + /* Count how many words are expected */ + va_start(ap, flags); + for (;;) { + if (!va_arg(ap, char **)) + break; + n++; + } + va_end(ap); + + if (n <= 0) + return 0; + + /* Read all words into a temporary array */ + l = newa0(char*, n); + for (c = 0; c < n; c++) { + + r = extract_first_word(p, &l[c], separators, flags); + if (r < 0) { + free_many_charp(l, c); + return r; + } + + if (r == 0) + break; + } + + /* If we managed to parse all words, return them in the passed + * in parameters */ + va_start(ap, flags); + for (i = 0; i < n; i++) { + char **v; + + v = va_arg(ap, char **); + assert(v); + + *v = l[i]; + } + va_end(ap); + + return c; +} diff --git a/src/basic/extract-word.h b/src/basic/extract-word.h new file mode 100644 index 0000000..c82ad76 --- /dev/null +++ b/src/basic/extract-word.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +typedef enum ExtractFlags { + EXTRACT_RELAX = 1 << 0, /* Allow unbalanced quote and eat up trailing backslash. */ + EXTRACT_CUNESCAPE = 1 << 1, /* Unescape known escape sequences. 
*/ + EXTRACT_UNESCAPE_RELAX = 1 << 2, /* Allow and keep unknown escape sequences, allow and keep trailing backslash. */ + EXTRACT_UNESCAPE_SEPARATORS = 1 << 3, /* Unescape separators (those specified, or whitespace by default). */ + EXTRACT_KEEP_QUOTE = 1 << 4, /* Ignore separators in quoting with "" and ''. */ + EXTRACT_UNQUOTE = 1 << 5, /* Ignore separators in quoting with "" and '', and remove the quotes. */ + EXTRACT_DONT_COALESCE_SEPARATORS = 1 << 6, /* Don't treat multiple adjacent separators as one */ + EXTRACT_RETAIN_ESCAPE = 1 << 7, /* Treat escape character '\' as any other character without special meaning */ + EXTRACT_RETAIN_SEPARATORS = 1 << 8, /* Do not advance the original string pointer past the separator(s) */ + + /* Note that if no flags are specified, escaped escape characters will be silently stripped. */ +} ExtractFlags; + +int extract_first_word(const char **p, char **ret, const char *separators, ExtractFlags flags); +int extract_first_word_and_warn(const char **p, char **ret, const char *separators, ExtractFlags flags, const char *unit, const char *filename, unsigned line, const char *rvalue); +int extract_many_words(const char **p, const char *separators, unsigned flags, ...) 
_sentinel_; diff --git a/src/basic/fd-util.c b/src/basic/fd-util.c new file mode 100644 index 0000000..542acca --- /dev/null +++ b/src/basic/fd-util.c @@ -0,0 +1,992 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#if WANT_LINUX_FS_H +#include +#endif +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "macro.h" +#include "missing_fcntl.h" +#include "missing_fs.h" +#include "missing_syscall.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "socket-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "tmpfile-util.h" + +/* The maximum number of iterations in the loop to close descriptors in the fallback case + * when /proc/self/fd/ is inaccessible. */ +#define MAX_FD_LOOP_LIMIT (1024*1024) + +int close_nointr(int fd) { + assert(fd >= 0); + + if (close(fd) >= 0) + return 0; + + /* + * Just ignore EINTR; a retry loop is the wrong thing to do on + * Linux. + * + * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html + * https://bugzilla.gnome.org/show_bug.cgi?id=682819 + * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR + * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain + */ + if (errno == EINTR) + return 0; + + return -errno; +} + +int safe_close(int fd) { + /* + * Like close_nointr() but cannot fail. Guarantees errno is unchanged. Is a noop for negative fds, + * and returns -EBADF, so that it can be used in this syntax: + * + * fd = safe_close(fd); + */ + + if (fd >= 0) { + PROTECT_ERRNO; + + /* The kernel might return pretty much any error code + * via close(), but the fd will be closed anyway. The + * only condition we want to check for here is whether + * the fd was invalid at all... 
*/ + + assert_se(close_nointr(fd) != -EBADF); + } + + return -EBADF; +} + +void safe_close_pair(int p[static 2]) { + assert(p); + + if (p[0] == p[1]) { + /* Special case pairs which use the same fd in both + * directions... */ + p[0] = p[1] = safe_close(p[0]); + return; + } + + p[0] = safe_close(p[0]); + p[1] = safe_close(p[1]); +} + +void close_many(const int fds[], size_t n_fds) { + assert(fds || n_fds == 0); + + FOREACH_ARRAY(fd, fds, n_fds) + safe_close(*fd); +} + +void close_many_unset(int fds[], size_t n_fds) { + assert(fds || n_fds == 0); + + FOREACH_ARRAY(fd, fds, n_fds) + *fd = safe_close(*fd); +} + +void close_many_and_free(int *fds, size_t n_fds) { + assert(fds || n_fds == 0); + + close_many(fds, n_fds); + free(fds); +} + +int fclose_nointr(FILE *f) { + assert(f); + + /* Same as close_nointr(), but for fclose() */ + + errno = 0; /* Extra safety: if the FILE* object is not encapsulating an fd, it might not set errno + * correctly. Let's hence initialize it to zero first, so that we aren't confused by any + * prior errno here */ + if (fclose(f) == 0) + return 0; + + if (errno == EINTR) + return 0; + + return errno_or_else(EIO); +} + +FILE* safe_fclose(FILE *f) { + + /* Same as safe_close(), but for fclose() */ + + if (f) { + PROTECT_ERRNO; + + assert_se(fclose_nointr(f) != -EBADF); + } + + return NULL; +} + +DIR* safe_closedir(DIR *d) { + + if (d) { + PROTECT_ERRNO; + + assert_se(closedir(d) >= 0 || errno != EBADF); + } + + return NULL; +} + +int fd_nonblock(int fd, bool nonblock) { + int flags, nflags; + + assert(fd >= 0); + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) + return -errno; + + nflags = UPDATE_FLAG(flags, O_NONBLOCK, nonblock); + if (nflags == flags) + return 0; + + return RET_NERRNO(fcntl(fd, F_SETFL, nflags)); +} + +int fd_cloexec(int fd, bool cloexec) { + int flags, nflags; + + assert(fd >= 0); + + flags = fcntl(fd, F_GETFD, 0); + if (flags < 0) + return -errno; + + nflags = UPDATE_FLAG(flags, FD_CLOEXEC, cloexec); + if (nflags == 
flags) + return 0; + + return RET_NERRNO(fcntl(fd, F_SETFD, nflags)); +} + +int fd_cloexec_many(const int fds[], size_t n_fds, bool cloexec) { + int r = 0; + + assert(fds || n_fds == 0); + + FOREACH_ARRAY(fd, fds, n_fds) { + if (*fd < 0) /* Skip gracefully over already invalidated fds */ + continue; + + RET_GATHER(r, fd_cloexec(*fd, cloexec)); + + if (r >= 0) + r = 1; /* report if we did anything */ + } + + return r; +} + +static bool fd_in_set(int fd, const int fds[], size_t n_fds) { + assert(fd >= 0); + assert(fds || n_fds == 0); + + FOREACH_ARRAY(i, fds, n_fds) { + if (*i < 0) + continue; + + if (*i == fd) + return true; + } + + return false; +} + +int get_max_fd(void) { + struct rlimit rl; + rlim_t m; + + /* Return the highest possible fd, based on RLIMIT_NOFILE, but enforcing FD_SETSIZE-1 as lower boundary + * and INT_MAX as upper boundary. */ + + if (getrlimit(RLIMIT_NOFILE, &rl) < 0) + return -errno; + + m = MAX(rl.rlim_cur, rl.rlim_max); + if (m < FD_SETSIZE) /* Let's always cover at least 1024 fds */ + return FD_SETSIZE-1; + + if (m == RLIM_INFINITY || m > INT_MAX) /* Saturate on overflow. After all fds are "int", hence can + * never be above INT_MAX */ + return INT_MAX; + + return (int) (m - 1); +} + +static int close_all_fds_frugal(const int except[], size_t n_except) { + int max_fd, r = 0; + + assert(except || n_except == 0); + + /* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so + * and hence is safe to be called in signal handler context. Most users should call close_all_fds(), + * but when we assume we are called from signal handler context, then use this simpler call + * instead. */ + + max_fd = get_max_fd(); + if (max_fd < 0) + return max_fd; + + /* Refuse to loop over too many elements. It's better to fail immediately than to + * spin the CPU for a long time.
*/ + if (max_fd > MAX_FD_LOOP_LIMIT) + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), + "Refusing to loop over %d potential fds.", max_fd); + + for (int fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -EBADF) { + int q; + + if (fd_in_set(fd, except, n_except)) + continue; + + q = close_nointr(fd); + if (q != -EBADF) + RET_GATHER(r, q); + } + + return r; +} + +static bool have_close_range = true; /* Assume we live in the future */ + +static int close_all_fds_special_case(const int except[], size_t n_except) { + assert(n_except == 0 || except); + + /* Handles a few common special cases separately, since they are common and can be optimized really + * nicely, since we won't need sorting for them. Returns > 0 if the special casing worked, 0 + * otherwise. */ + + if (!have_close_range) + return 0; + + if (n_except == 1 && except[0] < 0) /* Minor optimization: if we only got one fd, and it's invalid, + * we got none */ + n_except = 0; + + switch (n_except) { + + case 0: + /* Close everything. Yay! */ + + if (close_range(3, INT_MAX, 0) >= 0) + return 1; + + if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) { + have_close_range = false; + return 0; + } + + return -errno; + + case 1: + /* Close all but exactly one, then we don't need no sorting. This is a pretty common + * case, hence let's handle it specially. */ + + if ((except[0] <= 3 || close_range(3, except[0]-1, 0) >= 0) && + (except[0] >= INT_MAX || close_range(MAX(3, except[0]+1), -1, 0) >= 0)) + return 1; + + if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) { + have_close_range = false; + return 0; + } + + return -errno; + + default: + return 0; + } +} + +int close_all_fds_without_malloc(const int except[], size_t n_except) { + int r; + + assert(n_except == 0 || except); + + r = close_all_fds_special_case(except, n_except); + if (r < 0) + return r; + if (r > 0) /* special case worked! 
*/ + return 0; + + return close_all_fds_frugal(except, n_except); +} + +int close_all_fds(const int except[], size_t n_except) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + + assert(n_except == 0 || except); + + r = close_all_fds_special_case(except, n_except); + if (r < 0) + return r; + if (r > 0) /* special case worked! */ + return 0; + + if (have_close_range) { + _cleanup_free_ int *sorted_malloc = NULL; + size_t n_sorted; + int *sorted; + + /* In the best case we have close_range() to close all fds between a start and an end fd, + * which we can use on the "inverted" exception array, i.e. all intervals between all + * adjacent pairs from the sorted exception array. This changes loop complexity from O(n) + * where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep + * open. Given that we assume n ≫ m that's preferable to us. */ + + assert(n_except < SIZE_MAX); + n_sorted = n_except + 1; + + if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */ + sorted = sorted_malloc = new(int, n_sorted); + else + sorted = newa(int, n_sorted); + + if (sorted) { + memcpy(sorted, except, n_except * sizeof(int)); + + /* Let's add fd 2 to the list of fds, to simplify the loop below, as this + * allows us to cover the head of the array the same way as the body */ + sorted[n_sorted-1] = 2; + + typesafe_qsort(sorted, n_sorted, cmp_int); + + for (size_t i = 0; i < n_sorted-1; i++) { + int start, end; + + start = MAX(sorted[i], 2); /* The first three fds shall always remain open */ + end = MAX(sorted[i+1], 2); + + assert(end >= start); + + if (end - start <= 1) + continue; + + /* Close everything between the start and end fds (both of which shall stay open) */ + if (close_range(start + 1, end - 1, 0) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + + have_close_range = false; + break; + } + } + + if (have_close_range) { + /* The loop succeeded. 
Let's now close everything beyond the end */ + + if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */ + return 0; + + if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) >= 0) + return 0; + + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + + have_close_range = false; + } + } + + /* Fallback on OOM or if close_range() is not supported */ + } + + d = opendir("/proc/self/fd"); + if (!d) + return close_all_fds_frugal(except, n_except); /* ultimate fallback if /proc/ is not available */ + + FOREACH_DIRENT(de, d, return -errno) { + int fd = -EBADF, q; + + if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN)) + continue; + + fd = parse_fd(de->d_name); + if (fd < 0) + /* Let's better ignore this, just in case */ + continue; + + if (fd < 3) + continue; + + if (fd == dirfd(d)) + continue; + + if (fd_in_set(fd, except, n_except)) + continue; + + q = close_nointr(fd); + if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */ + r = q; + } + + return r; +} + +int same_fd(int a, int b) { + struct stat sta, stb; + pid_t pid; + int r, fa, fb; + + assert(a >= 0); + assert(b >= 0); + + /* Compares two file descriptors. Note that semantics are quite different depending on whether we + * have kcmp() or we don't. If we have kcmp() this will only return true for dup()ed file + * descriptors, but not otherwise. If we don't have kcmp() this will also return true for two fds of + * the same file, created by separate open() calls. Since we use this call mostly for filtering out + * duplicates in the fd store this difference hopefully doesn't matter too much. */ + + if (a == b) + return true; + + /* Try to use kcmp() if we have it. */ + pid = getpid_cached(); + r = kcmp(pid, pid, KCMP_FILE, a, b); + if (r == 0) + return true; + if (r > 0) + return false; + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + + /* We don't have kcmp(), use fstat() instead. 
*/ + if (fstat(a, &sta) < 0) + return -errno; + + if (fstat(b, &stb) < 0) + return -errno; + + if (!stat_inode_same(&sta, &stb)) + return false; + + /* We consider all device fds different, since two device fds might refer to quite different device + * contexts even though they share the same inode and backing dev_t. */ + + if (S_ISCHR(sta.st_mode) || S_ISBLK(sta.st_mode)) + return false; + + /* The fds refer to the same inode on disk, let's also check if they have the same fd flags. This is + * useful to distinguish the read and write side of a pipe created with pipe(). */ + fa = fcntl(a, F_GETFL); + if (fa < 0) + return -errno; + + fb = fcntl(b, F_GETFL); + if (fb < 0) + return -errno; + + return fa == fb; +} + +void cmsg_close_all(struct msghdr *mh) { + struct cmsghdr *cmsg; + + assert(mh); + + CMSG_FOREACH(cmsg, mh) + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) + close_many(CMSG_TYPED_DATA(cmsg, int), + (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int)); +} + +bool fdname_is_valid(const char *s) { + const char *p; + + /* Validates a name for $LISTEN_FDNAMES. We basically allow + * everything ASCII that's not a control character. Also, as + * special exception the ":" character is not allowed, as we + * use that as field separator in $LISTEN_FDNAMES. + * + * Note that the empty string is explicitly allowed + * here. However, we limit the length of the names to 255 + * characters. */ + + if (!s) + return false; + + for (p = s; *p; p++) { + if (*p < ' ') + return false; + if (*p >= 127) + return false; + if (*p == ':') + return false; + } + + return p - s <= FDNAME_MAX; +} + +int fd_get_path(int fd, char **ret) { + int r; + + assert(fd >= 0 || fd == AT_FDCWD); + + if (fd == AT_FDCWD) + return safe_getcwd(ret); + + r = readlink_malloc(FORMAT_PROC_FD_PATH(fd), ret); + if (r == -ENOENT) { + /* ENOENT can mean two things: that the fd does not exist or that /proc is not mounted. Let's make + * things debuggable and distinguish the two. 
*/ + + if (proc_mounted() == 0) + return -ENOSYS; /* /proc is not available or not set up properly, we're most likely in some chroot + * environment. */ + return -EBADF; /* The directory exists, hence it's the fd that doesn't. */ + } + + return r; +} + +int move_fd(int from, int to, int cloexec) { + int r; + + /* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If + * 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned + * off, if it is > 0 it is turned on. */ + + if (from < 0) + return -EBADF; + if (to < 0) + return -EBADF; + + if (from == to) { + + if (cloexec >= 0) { + r = fd_cloexec(to, cloexec); + if (r < 0) + return r; + } + + return to; + } + + if (cloexec < 0) { + int fl; + + fl = fcntl(from, F_GETFD, 0); + if (fl < 0) + return -errno; + + cloexec = FLAGS_SET(fl, FD_CLOEXEC); + } + + r = dup3(from, to, cloexec ? O_CLOEXEC : 0); + if (r < 0) + return -errno; + + assert(r == to); + + safe_close(from); + + return to; +} + +int fd_move_above_stdio(int fd) { + int flags, copy; + PROTECT_ERRNO; + + /* Moves the specified file descriptor if possible out of the range [0…2], i.e. the range of + * stdin/stdout/stderr. If it can't be moved outside of this range the original file descriptor is + * returned. This call is supposed to be used for long-lasting file descriptors we allocate in our code that + * might get loaded into foreign code, and where we want ensure our fds are unlikely used accidentally as + * stdin/stdout/stderr of unrelated code. + * + * Note that this doesn't fix any real bugs, it just makes it less likely that our code will be affected by + * buggy code from others that mindlessly invokes 'fprintf(stderr, …' or similar in places where stderr has + * been closed before. + * + * This function is written in a "best-effort" and "least-impact" style. 
This means whenever we encounter an + * error we simply return the original file descriptor, and we do not touch errno. */ + + if (fd < 0 || fd > 2) + return fd; + + flags = fcntl(fd, F_GETFD, 0); + if (flags < 0) + return fd; + + if (flags & FD_CLOEXEC) + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + else + copy = fcntl(fd, F_DUPFD, 3); + if (copy < 0) + return fd; + + assert(copy > 2); + + (void) close(fd); + return copy; +} + +int rearrange_stdio(int original_input_fd, int original_output_fd, int original_error_fd) { + int fd[3] = { original_input_fd, /* Put together an array of fds we work on */ + original_output_fd, + original_error_fd }, + null_fd = -EBADF, /* If we open /dev/null, we store the fd to it here */ + copy_fd[3] = EBADF_TRIPLET, /* This contains all fds we duplicate here + * temporarily, and hence need to close at the end. */ + r; + bool null_readable, null_writable; + + /* Sets up stdin, stdout, stderr with the three file descriptors passed in. If any of the descriptors + * is specified as -EBADF it will be connected with /dev/null instead. If any of the file descriptors + * is passed as itself (e.g. stdin as STDIN_FILENO) it is left unmodified, but the O_CLOEXEC bit is + * turned off should it be on. + * + * Note that if any of the passed file descriptors are > 2 they will be closed — both on success and + * on failure! Thus, callers should assume that when this function returns the input fds are + * invalidated. + * + * Note that when this function fails stdin/stdout/stderr might remain half set up! + * + * O_CLOEXEC is turned off for all three file descriptors (which is how it should be for + * stdin/stdout/stderr). */ + + null_readable = original_input_fd < 0; + null_writable = original_output_fd < 0 || original_error_fd < 0; + + /* First step, open /dev/null once, if we need it */ + if (null_readable || null_writable) { + + /* Let's open this with O_CLOEXEC first, and convert it to non-O_CLOEXEC when we move the fd to the final position. 
*/ + null_fd = open("/dev/null", (null_readable && null_writable ? O_RDWR : + null_readable ? O_RDONLY : O_WRONLY) | O_CLOEXEC); + if (null_fd < 0) { + r = -errno; + goto finish; + } + + /* If this fd is in the 0…2 range, let's move it out of it */ + if (null_fd < 3) { + int copy; + + copy = fcntl(null_fd, F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */ + if (copy < 0) { + r = -errno; + goto finish; + } + + close_and_replace(null_fd, copy); + } + } + + /* Let's assemble fd[] with the fds to install in place of stdin/stdout/stderr */ + for (int i = 0; i < 3; i++) { + + if (fd[i] < 0) + fd[i] = null_fd; /* A negative parameter means: connect this one to /dev/null */ + else if (fd[i] != i && fd[i] < 3) { + /* This fd is in the 0…2 territory, but not at its intended place, move it out of there, so that we can work there. */ + copy_fd[i] = fcntl(fd[i], F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */ + if (copy_fd[i] < 0) { + r = -errno; + goto finish; + } + + fd[i] = copy_fd[i]; + } + } + + /* At this point we now have the fds to use in fd[], and they are all above the stdio range, so that + * we have freedom to move them around. If the fds already were at the right places then the specific + * fds are -EBADF. Let's now move them to the right places. This is the point of no return. */ + for (int i = 0; i < 3; i++) { + + if (fd[i] == i) { + + /* fd is already in place, but let's make sure O_CLOEXEC is off */ + r = fd_cloexec(i, false); + if (r < 0) + goto finish; + + } else { + assert(fd[i] > 2); + + if (dup2(fd[i], i) < 0) { /* Turns off O_CLOEXEC on the new fd. */ + r = -errno; + goto finish; + } + } + } + + r = 0; + +finish: + /* Close the original fds, but only if they were outside of the stdio range. Also, properly check for the same + * fd passed in multiple times. 
*/ + safe_close_above_stdio(original_input_fd); + if (original_output_fd != original_input_fd) + safe_close_above_stdio(original_output_fd); + if (original_error_fd != original_input_fd && original_error_fd != original_output_fd) + safe_close_above_stdio(original_error_fd); + + /* Close the copies we moved > 2 */ + close_many(copy_fd, 3); + + /* Close our null fd, if it's > 2 */ + safe_close_above_stdio(null_fd); + + return r; +} + +int fd_reopen(int fd, int flags) { + int r; + + assert(fd >= 0 || fd == AT_FDCWD); + assert(!FLAGS_SET(flags, O_CREAT)); + + /* Reopens the specified fd with new flags. This is useful for convert an O_PATH fd into a regular one, or to + * turn O_RDWR fds into O_RDONLY fds. + * + * This doesn't work on sockets (since they cannot be open()ed, ever). + * + * This implicitly resets the file read index to 0. + * + * If AT_FDCWD is specified as file descriptor gets an fd to the current cwd. + * + * If the specified file descriptor refers to a symlink via O_PATH, then this function cannot be used + * to follow that symlink. Because we cannot have non-O_PATH fds to symlinks reopening it without + * O_PATH will always result in -ELOOP. Or in other words: if you have an O_PATH fd to a symlink you + * can reopen it only if you pass O_PATH again. */ + + if (FLAGS_SET(flags, O_NOFOLLOW)) + /* O_NOFOLLOW is not allowed in fd_reopen(), because after all this is primarily implemented + * via a symlink-based interface in /proc/self/fd. Let's refuse this here early. Note that + * the kernel would generate ELOOP here too, hence this manual check is mostly redundant – + * the only reason we add it here is so that the O_DIRECTORY special case (see below) behaves + * the same way as the non-O_DIRECTORY case. */ + return -ELOOP; + + if (FLAGS_SET(flags, O_DIRECTORY) || fd == AT_FDCWD) + /* If we shall reopen the fd as directory we can just go via "." and thus bypass the whole + * magic /proc/ directory, and make ourselves independent of that being mounted. 
*/ + return RET_NERRNO(openat(fd, ".", flags | O_DIRECTORY)); + + int new_fd = open(FORMAT_PROC_FD_PATH(fd), flags); + if (new_fd < 0) { + if (errno != ENOENT) + return -errno; + + r = proc_mounted(); + if (r == 0) + return -ENOSYS; /* if we have no /proc/, the concept is not implementable */ + + return r > 0 ? -EBADF : -ENOENT; /* If /proc/ is definitely around then this means the fd is + * not valid, otherwise let's propagate the original + * error */ + } + + return new_fd; +} + +int fd_reopen_condition( + int fd, + int flags, + int mask, + int *ret_new_fd) { + + int r, new_fd; + + assert(fd >= 0); + assert(!FLAGS_SET(flags, O_CREAT)); + + /* Invokes fd_reopen(fd, flags), but only if the existing F_GETFL flags don't match the specified + * flags (masked by the specified mask). This is useful for converting O_PATH fds into real fds if + * needed, but only then. */ + + r = fcntl(fd, F_GETFL); + if (r < 0) + return -errno; + + if ((r & mask) == (flags & mask)) { + *ret_new_fd = -EBADF; + return fd; + } + + new_fd = fd_reopen(fd, flags); + if (new_fd < 0) + return new_fd; + + *ret_new_fd = new_fd; + return new_fd; +} + +int fd_is_opath(int fd) { + int r; + + assert(fd >= 0); + + r = fcntl(fd, F_GETFL); + if (r < 0) + return -errno; + + return FLAGS_SET(r, O_PATH); +} + +int read_nr_open(void) { + _cleanup_free_ char *nr_open = NULL; + int r; + + /* Returns the kernel's current fd limit, either by reading it of /proc/sys if that works, or using the + * hard-coded default compiled-in value of current kernels (1M) if not. This call will never fail. */ + + r = read_one_line_file("/proc/sys/fs/nr_open", &nr_open); + if (r < 0) + log_debug_errno(r, "Failed to read /proc/sys/fs/nr_open, ignoring: %m"); + else { + int v; + + r = safe_atoi(nr_open, &v); + if (r < 0) + log_debug_errno(r, "Failed to parse /proc/sys/fs/nr_open value '%s', ignoring: %m", nr_open); + else + return v; + } + + /* If we fail, fall back to the hard-coded kernel limit of 1024 * 1024. 
*/ + return 1024 * 1024; +} + +int fd_get_diskseq(int fd, uint64_t *ret) { + uint64_t diskseq; + + assert(fd >= 0); + assert(ret); + + if (ioctl(fd, BLKGETDISKSEQ, &diskseq) < 0) { + /* Note that the kernel is weird: non-existing ioctls currently return EINVAL + * rather than ENOTTY on loopback block devices. They should fix that in the kernel, + * but in the meantime we accept both here. */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) + return -errno; + + return -EOPNOTSUPP; + } + + *ret = diskseq; + + return 0; +} + +int path_is_root_at(int dir_fd, const char *path) { + STRUCT_NEW_STATX_DEFINE(st); + STRUCT_NEW_STATX_DEFINE(pst); + _cleanup_close_ int fd = -EBADF; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + if (!isempty(path)) { + fd = openat(dir_fd, path, O_PATH|O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return errno == ENOTDIR ? false : -errno; + + dir_fd = fd; + } + + r = statx_fallback(dir_fd, ".", 0, STATX_TYPE|STATX_INO|STATX_MNT_ID, &st.sx); + if (r == -ENOTDIR) + return false; + if (r < 0) + return r; + + r = statx_fallback(dir_fd, "..", 0, STATX_TYPE|STATX_INO|STATX_MNT_ID, &pst.sx); + if (r < 0) + return r; + + /* First, compare inode. If these are different, the fd does not point to the root directory "/". */ + if (!statx_inode_same(&st.sx, &pst.sx)) + return false; + + /* Even if the parent directory has the same inode, the fd may not point to the root directory "/", + * and we also need to check that the mount ids are the same. Otherwise, a construct like the + * following could be used to trick us: + * + * $ mkdir /tmp/x /tmp/x/y + * $ mount --bind /tmp/x /tmp/x/y + * + * Note, statx() does not provide the mount ID and path_get_mnt_id_at() does not work when an old + * kernel is used. In that case, let's assume that we do not have such spurious mount points in an + * early boot stage, and silently skip the following check. 
*/ + + if (!FLAGS_SET(st.nsx.stx_mask, STATX_MNT_ID)) { + int mntid; + + r = path_get_mnt_id_at_fallback(dir_fd, "", &mntid); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + return true; /* skip the mount ID check */ + if (r < 0) + return r; + assert(mntid >= 0); + + st.nsx.stx_mnt_id = mntid; + st.nsx.stx_mask |= STATX_MNT_ID; + } + + if (!FLAGS_SET(pst.nsx.stx_mask, STATX_MNT_ID)) { + int mntid; + + r = path_get_mnt_id_at_fallback(dir_fd, "..", &mntid); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + return true; /* skip the mount ID check */ + if (r < 0) + return r; + assert(mntid >= 0); + + pst.nsx.stx_mnt_id = mntid; + pst.nsx.stx_mask |= STATX_MNT_ID; + } + + return statx_mount_same(&st.nsx, &pst.nsx); +} + +const char *accmode_to_string(int flags) { + switch (flags & O_ACCMODE) { + case O_RDONLY: + return "ro"; + case O_WRONLY: + return "wo"; + case O_RDWR: + return "rw"; + default: + return NULL; + } +} + +char *format_proc_pid_fd_path(char buf[static PROC_PID_FD_PATH_MAX], pid_t pid, int fd) { + assert(buf); + assert(fd >= 0); + assert(pid >= 0); + assert_se(snprintf_ok(buf, PROC_PID_FD_PATH_MAX, "/proc/" PID_FMT "/fd/%i", pid == 0 ? 
getpid_cached() : pid, fd)); + return buf; +} diff --git a/src/basic/fd-util.h b/src/basic/fd-util.h new file mode 100644 index 0000000..d3e9192 --- /dev/null +++ b/src/basic/fd-util.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "macro.h" +#include "stdio-util.h" + +/* maximum length of fdname */ +#define FDNAME_MAX 255 + +/* Make sure we can distinguish fd 0 and NULL */ +#define FD_TO_PTR(fd) INT_TO_PTR((fd)+1) +#define PTR_TO_FD(p) (PTR_TO_INT(p)-1) + +/* Useful helpers for initializing pipe(), socketpair() or stdio fd arrays */ +#define EBADF_PAIR { -EBADF, -EBADF } +#define EBADF_TRIPLET { -EBADF, -EBADF, -EBADF } + +int close_nointr(int fd); +int safe_close(int fd); +void safe_close_pair(int p[static 2]); + +static inline int safe_close_above_stdio(int fd) { + if (fd < 3) /* Don't close stdin/stdout/stderr, but still invalidate the fd by returning -EBADF. */ + return -EBADF; + + return safe_close(fd); +} + +void close_many(const int fds[], size_t n_fds); +void close_many_unset(int fds[], size_t n_fds); +void close_many_and_free(int *fds, size_t n_fds); + +int fclose_nointr(FILE *f); +FILE* safe_fclose(FILE *f); +DIR* safe_closedir(DIR *f); + +static inline void closep(int *fd) { + safe_close(*fd); +} + +static inline void close_pairp(int (*p)[2]) { + safe_close_pair(*p); +} + +static inline void fclosep(FILE **f) { + safe_fclose(*f); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(FILE*, pclose, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(DIR*, closedir, NULL); + +#define _cleanup_close_ _cleanup_(closep) +#define _cleanup_fclose_ _cleanup_(fclosep) +#define _cleanup_pclose_ _cleanup_(pclosep) +#define _cleanup_closedir_ _cleanup_(closedirp) +#define _cleanup_close_pair_ _cleanup_(close_pairp) + +int fd_nonblock(int fd, bool nonblock); +int fd_cloexec(int fd, bool cloexec); +int fd_cloexec_many(const int fds[], size_t n_fds, bool cloexec); + +int get_max_fd(void); + +int 
close_all_fds(const int except[], size_t n_except); +int close_all_fds_without_malloc(const int except[], size_t n_except); + +int same_fd(int a, int b); + +void cmsg_close_all(struct msghdr *mh); + +bool fdname_is_valid(const char *s); + +int fd_get_path(int fd, char **ret); + +int move_fd(int from, int to, int cloexec); + +int fd_move_above_stdio(int fd); + +int rearrange_stdio(int original_input_fd, int original_output_fd, int original_error_fd); + +static inline int make_null_stdio(void) { + return rearrange_stdio(-EBADF, -EBADF, -EBADF); +} + +/* Like TAKE_PTR() but for file descriptors, resetting them to -EBADF */ +#define TAKE_FD(fd) TAKE_GENERIC(fd, int, -EBADF) + +/* Like free_and_replace(), but for file descriptors */ +#define close_and_replace(a, b) \ + ({ \ + int *_fdp_ = &(a); \ + safe_close(*_fdp_); \ + *_fdp_ = TAKE_FD(b); \ + 0; \ + }) + +int fd_reopen(int fd, int flags); +int fd_reopen_condition(int fd, int flags, int mask, int *ret_new_fd); +int fd_is_opath(int fd); +int read_nr_open(void); +int fd_get_diskseq(int fd, uint64_t *ret); + +int path_is_root_at(int dir_fd, const char *path); +static inline int path_is_root(const char *path) { + return path_is_root_at(AT_FDCWD, path); +} +static inline int dir_fd_is_root(int dir_fd) { + return path_is_root_at(dir_fd, NULL); +} +static inline int dir_fd_is_root_or_cwd(int dir_fd) { + return dir_fd == AT_FDCWD ? 
true : path_is_root_at(dir_fd, NULL); +} + +/* The maximum length a buffer for a /proc/self/fd/ path needs */ +#define PROC_FD_PATH_MAX \ + (STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)) + +static inline char *format_proc_fd_path(char buf[static PROC_FD_PATH_MAX], int fd) { + assert(buf); + assert(fd >= 0); + assert_se(snprintf_ok(buf, PROC_FD_PATH_MAX, "/proc/self/fd/%i", fd)); + return buf; +} + +#define FORMAT_PROC_FD_PATH(fd) \ + format_proc_fd_path((char[PROC_FD_PATH_MAX]) {}, (fd)) + +/* The maximum length a buffer for a /proc//fd/ path needs */ +#define PROC_PID_FD_PATH_MAX \ + (STRLEN("/proc//fd/") + DECIMAL_STR_MAX(pid_t) + DECIMAL_STR_MAX(int)) + +char *format_proc_pid_fd_path(char buf[static PROC_PID_FD_PATH_MAX], pid_t pid, int fd); + +/* Kinda the same as FORMAT_PROC_FD_PATH(), but goes by PID rather than "self" symlink */ +#define FORMAT_PROC_PID_FD_PATH(pid, fd) \ + format_proc_pid_fd_path((char[PROC_PID_FD_PATH_MAX]) {}, (pid), (fd)) + +const char *accmode_to_string(int flags); + +/* Like ASSERT_PTR, but for fds */ +#define ASSERT_FD(fd) \ + ({ \ + int _fd_ = (fd); \ + assert(_fd_ >= 0); \ + _fd_; \ + }) diff --git a/src/basic/fileio.c b/src/basic/fileio.c new file mode 100644 index 0000000..a050b61 --- /dev/null +++ b/src/basic/fileio.c @@ -0,0 +1,1573 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "log.h" +#include "macro.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "sync-util.h" +#include "tmpfile-util.h" + +/* The maximum size of the file we'll read in one go in read_full_file() (64M). 
*/ +#define READ_FULL_BYTES_MAX (64U*1024U*1024U - 1U) +/* Used when a size is specified for read_full_file() with READ_FULL_FILE_UNBASE64 or _UNHEX */ +#define READ_FULL_FILE_ENCODED_STRING_AMPLIFICATION_BOUNDARY 3 + +/* The maximum size of virtual files (i.e. procfs, sysfs, and other virtual "API" files) we'll read in one go + * in read_virtual_file(). Note that this limit is different (and much lower) than the READ_FULL_BYTES_MAX + * limit. This reflects the fact that we use different strategies for reading virtual and regular files: + * virtual files we generally have to read in a single read() syscall since the kernel doesn't support + * continuation read()s for them. Thankfully they are somewhat size constrained. Thus we can allocate the + * full potential buffer in advance. Regular files OTOH can be much larger, and there we grow the allocations + * exponentially in a loop. We use a size limit of 4M-2 because 4M-1 is the maximum buffer that /proc/sys/ + * allows us to read() (larger reads will fail with ENOMEM), and we want to read one extra byte so that we + * can detect EOFs. 
*/ +#define READ_VIRTUAL_BYTES_MAX (4U*1024U*1024U - 2U) + +int fdopen_unlocked(int fd, const char *options, FILE **ret) { + assert(ret); + + FILE *f = fdopen(fd, options); + if (!f) + return -errno; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + *ret = f; + return 0; +} + +int take_fdopen_unlocked(int *fd, const char *options, FILE **ret) { + int r; + + assert(fd); + + r = fdopen_unlocked(*fd, options, ret); + if (r < 0) + return r; + + *fd = -EBADF; + + return 0; +} + +FILE* take_fdopen(int *fd, const char *options) { + assert(fd); + + FILE *f = fdopen(*fd, options); + if (!f) + return NULL; + + *fd = -EBADF; + + return f; +} + +DIR* take_fdopendir(int *dfd) { + assert(dfd); + + DIR *d = fdopendir(*dfd); + if (!d) + return NULL; + + *dfd = -EBADF; + + return d; +} + +FILE* open_memstream_unlocked(char **ptr, size_t *sizeloc) { + FILE *f = open_memstream(ptr, sizeloc); + if (!f) + return NULL; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + return f; +} + +FILE* fmemopen_unlocked(void *buf, size_t size, const char *mode) { + FILE *f = fmemopen(buf, size, mode); + if (!f) + return NULL; + + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + return f; +} + +int write_string_stream_ts( + FILE *f, + const char *line, + WriteStringFileFlags flags, + const struct timespec *ts) { + + bool needs_nl; + int r, fd = -EBADF; + + assert(f); + assert(line); + + if (ferror(f)) + return -EIO; + + if (ts) { + /* If we shall set the timestamp we need the fd. But fmemopen() streams generally don't have + * an fd. Let's fail early in that case. */ + fd = fileno(f); + if (fd < 0) + return -EBADF; + } + + if (flags & WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL) { + _cleanup_free_ char *t = NULL; + + /* If value to be written is same as that of the existing value, then suppress the write. */ + + if (fd < 0) { + fd = fileno(f); + if (fd < 0) + return -EBADF; + } + + /* Read an additional byte to detect cases where the prefix matches but the rest + * doesn't. 
Also, 0 returned by read_virtual_file_fd() means the read was truncated and + * it won't be equal to the new value. */ + if (read_virtual_file_fd(fd, strlen(line)+1, &t, NULL) > 0 && + streq_skip_trailing_chars(line, t, NEWLINE)) { + log_debug("No change in value '%s', suppressing write", line); + return 0; + } + + if (lseek(fd, 0, SEEK_SET) < 0) + return -errno; + } + + needs_nl = !(flags & WRITE_STRING_FILE_AVOID_NEWLINE) && !endswith(line, "\n"); + + if (needs_nl && (flags & WRITE_STRING_FILE_DISABLE_BUFFER)) { + /* If STDIO buffering was disabled, then let's append the newline character to the string + * itself, so that the write goes out in one go, instead of two */ + + line = strjoina(line, "\n"); + needs_nl = false; + } + + if (fputs(line, f) == EOF) + return -errno; + + if (needs_nl) + if (fputc('\n', f) == EOF) + return -errno; + + if (flags & WRITE_STRING_FILE_SYNC) + r = fflush_sync_and_check(f); + else + r = fflush_and_check(f); + if (r < 0) + return r; + + if (ts) { + const struct timespec twice[2] = {*ts, *ts}; + + assert(fd >= 0); + if (futimens(fd, twice) < 0) + return -errno; + } + + return 0; +} + +static int write_string_file_atomic_at( + int dir_fd, + const char *fn, + const char *line, + WriteStringFileFlags flags, + const struct timespec *ts) { + + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(fn); + assert(line); + + /* Note that we'd really like to use O_TMPFILE here, but can't really, since we want replacement + * semantics here, and O_TMPFILE can't offer that. i.e. rename() replaces but linkat() doesn't. */ + + r = fopen_temporary_at(dir_fd, fn, &f, &p); + if (r < 0) + return r; + + r = write_string_stream_ts(f, line, flags, ts); + if (r < 0) + goto fail; + + r = fchmod_umask(fileno(f), FLAGS_SET(flags, WRITE_STRING_FILE_MODE_0600) ? 
0600 : 0644); + if (r < 0) + goto fail; + + if (renameat(dir_fd, p, dir_fd, fn) < 0) { + r = -errno; + goto fail; + } + + if (FLAGS_SET(flags, WRITE_STRING_FILE_SYNC)) { + /* Sync the rename, too */ + r = fsync_directory_of_file(fileno(f)); + if (r < 0) + return r; + } + + return 0; + +fail: + (void) unlinkat(dir_fd, p, 0); + return r; +} + +int write_string_file_ts_at( + int dir_fd, + const char *fn, + const char *line, + WriteStringFileFlags flags, + const struct timespec *ts) { + + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -EBADF; + int q, r; + + assert(fn); + assert(line); + + /* We don't know how to verify whether the file contents was already on-disk. */ + assert(!((flags & WRITE_STRING_FILE_VERIFY_ON_FAILURE) && (flags & WRITE_STRING_FILE_SYNC))); + + if (flags & WRITE_STRING_FILE_MKDIR_0755) { + r = mkdirat_parents(dir_fd, fn, 0755); + if (r < 0) + return r; + } + + if (flags & WRITE_STRING_FILE_ATOMIC) { + assert(flags & WRITE_STRING_FILE_CREATE); + + r = write_string_file_atomic_at(dir_fd, fn, line, flags, ts); + if (r < 0) + goto fail; + + return r; + } else + assert(!ts); + + /* We manually build our own version of fopen(..., "we") that works without O_CREAT and with O_NOFOLLOW if needed. */ + fd = openat(dir_fd, fn, O_CLOEXEC|O_NOCTTY | + (FLAGS_SET(flags, WRITE_STRING_FILE_NOFOLLOW) ? O_NOFOLLOW : 0) | + (FLAGS_SET(flags, WRITE_STRING_FILE_CREATE) ? O_CREAT : 0) | + (FLAGS_SET(flags, WRITE_STRING_FILE_TRUNCATE) ? O_TRUNC : 0) | + (FLAGS_SET(flags, WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL) ? O_RDWR : O_WRONLY), + (FLAGS_SET(flags, WRITE_STRING_FILE_MODE_0600) ? 
0600 : 0666)); + if (fd < 0) { + r = -errno; + goto fail; + } + + r = take_fdopen_unlocked(&fd, "w", &f); + if (r < 0) + goto fail; + + if (flags & WRITE_STRING_FILE_DISABLE_BUFFER) + setvbuf(f, NULL, _IONBF, 0); + + r = write_string_stream_ts(f, line, flags, ts); + if (r < 0) + goto fail; + + return 0; + +fail: + if (!(flags & WRITE_STRING_FILE_VERIFY_ON_FAILURE)) + return r; + + f = safe_fclose(f); + + /* OK, the operation failed, but let's see if the right + * contents in place already. If so, eat up the error. */ + + q = verify_file(fn, line, !(flags & WRITE_STRING_FILE_AVOID_NEWLINE) || (flags & WRITE_STRING_FILE_VERIFY_IGNORE_NEWLINE)); + if (q <= 0) + return r; + + return 0; +} + +int write_string_filef( + const char *fn, + WriteStringFileFlags flags, + const char *format, ...) { + + _cleanup_free_ char *p = NULL; + va_list ap; + int r; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return write_string_file(fn, p, flags); +} + +int read_one_line_file_at(int dir_fd, const char *filename, char **ret) { + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(filename); + assert(ret); + + r = fopen_unlocked_at(dir_fd, filename, "re", 0, &f); + if (r < 0) + return r; + + return read_line(f, LONG_LINE_MAX, ret); +} + +int verify_file_at(int dir_fd, const char *fn, const char *blob, bool accept_extra_nl) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *buf = NULL; + size_t l, k; + int r; + + assert(fn); + assert(blob); + + l = strlen(blob); + + if (accept_extra_nl && endswith(blob, "\n")) + accept_extra_nl = false; + + buf = malloc(l + accept_extra_nl + 1); + if (!buf) + return -ENOMEM; + + r = fopen_unlocked_at(dir_fd, fn, "re", 0, &f); + if (r < 0) + return r; + + /* We try to read one byte more than we need, so that we know whether we hit eof */ + errno = 0; + k = fread(buf, 1, l + accept_extra_nl + 1, f); + if (ferror(f)) + return 
errno_or_else(EIO); + + if (k != l && k != l + accept_extra_nl) + return 0; + if (memcmp(buf, blob, l) != 0) + return 0; + if (k > l && buf[l] != '\n') + return 0; + + return 1; +} + +int read_virtual_file_fd(int fd, size_t max_size, char **ret_contents, size_t *ret_size) { + _cleanup_free_ char *buf = NULL; + size_t n, size; + int n_retries; + bool truncated = false; + + /* Virtual filesystems such as sysfs or procfs use kernfs, and kernfs can work with two sorts of + * virtual files. One sort uses "seq_file", and the results of the first read are buffered for the + * second read. The other sort uses "raw" reads which always go direct to the device. In the latter + * case, the content of the virtual file must be retrieved with a single read otherwise a second read + * might get the new value instead of finding EOF immediately. That's the reason why the usage of + * fread(3) is prohibited in this case as it always performs a second call to read(2) looking for + * EOF. See issue #13585. + * + * max_size specifies a limit on the bytes read. If max_size is SIZE_MAX, the full file is read. If + * the full file is too large to read, an error is returned. For other values of max_size, *partial + * contents* may be returned. (Though the read is still done using one syscall.) Returns 0 on + * partial success, 1 if untruncated contents were read. */ + + assert(fd >= 0); + assert(max_size <= READ_VIRTUAL_BYTES_MAX || max_size == SIZE_MAX); + + /* Limit the number of attempts to read the number of bytes returned by fstat(). */ + n_retries = 3; + + for (;;) { + struct stat st; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISREG(st.st_mode)) + return -EBADF; + + /* Be prepared for files from /proc which generally report a file size of 0. */ + assert_cc(READ_VIRTUAL_BYTES_MAX < SSIZE_MAX); + if (st.st_size > 0 && n_retries > 1) { + /* Let's use the file size if we have more than 1 attempt left. 
On the last attempt + * we'll ignore the file size */ + + if (st.st_size > SSIZE_MAX) { /* Avoid overflow with 32-bit size_t and 64-bit off_t. */ + + if (max_size == SIZE_MAX) + return -EFBIG; + + size = max_size; + } else { + size = MIN((size_t) st.st_size, max_size); + + if (size > READ_VIRTUAL_BYTES_MAX) + return -EFBIG; + } + + n_retries--; + } else if (n_retries > 1) { + /* Files in /proc are generally smaller than the page size so let's start with + * a page size buffer from malloc and only use the max buffer on the final try. */ + size = MIN3(page_size() - 1, READ_VIRTUAL_BYTES_MAX, max_size); + n_retries = 1; + } else { + size = MIN(READ_VIRTUAL_BYTES_MAX, max_size); + n_retries = 0; + } + + buf = malloc(size + 1); + if (!buf) + return -ENOMEM; + + /* Use a bigger allocation if we got it anyway, but not more than the limit. */ + size = MIN3(MALLOC_SIZEOF_SAFE(buf) - 1, max_size, READ_VIRTUAL_BYTES_MAX); + + for (;;) { + ssize_t k; + + /* Read one more byte so we can detect whether the content of the + * file has already changed or the guessed size for files from /proc + * wasn't large enough . */ + k = read(fd, buf, size + 1); + if (k >= 0) { + n = k; + break; + } + + if (errno != EINTR) + return -errno; + } + + /* Consider a short read as EOF */ + if (n <= size) + break; + + /* If a maximum size is specified and we already read more we know the file is larger, and + * can handle this as truncation case. Note that if the size of what we read equals the + * maximum size then this doesn't mean truncation, the file might or might not end on that + * byte. We need to rerun the loop in that case, with a larger buffer size, so that we read + * at least one more byte to be able to distinguish EOF from truncation. */ + if (max_size != SIZE_MAX && n > max_size) { + n = size; /* Make sure we never use more than what we sized the buffer for (so that + * we have one free byte in it for the trailing NUL we add below). 
*/ + truncated = true; + break; + } + + /* We have no further attempts left? Then the file is apparently larger than our limits. Give up. */ + if (n_retries <= 0) + return -EFBIG; + + /* Hmm... either we read too few bytes from /proc or less likely the content of the file + * might have been changed (and is now bigger) while we were processing, let's try again + * either with the new file size. */ + + if (lseek(fd, 0, SEEK_SET) < 0) + return -errno; + + buf = mfree(buf); + } + + if (ret_contents) { + + /* Safety check: if the caller doesn't want to know the size of what we just read it will + * rely on the trailing NUL byte. But if there's an embedded NUL byte, then we should refuse + * operation as otherwise there'd be ambiguity about what we just read. */ + if (!ret_size && memchr(buf, 0, n)) + return -EBADMSG; + + if (n < size) { + char *p; + + /* Return rest of the buffer to libc */ + p = realloc(buf, n + 1); + if (!p) + return -ENOMEM; + buf = p; + } + + buf[n] = 0; + *ret_contents = TAKE_PTR(buf); + } + + if (ret_size) + *ret_size = n; + + return !truncated; +} + +int read_virtual_file_at( + int dir_fd, + const char *filename, + size_t max_size, + char **ret_contents, + size_t *ret_size) { + + _cleanup_close_ int fd = -EBADF; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + if (!filename) { + if (dir_fd == AT_FDCWD) + return -EBADF; + + return read_virtual_file_fd(dir_fd, max_size, ret_contents, ret_size); + } + + fd = openat(dir_fd, filename, O_RDONLY | O_NOCTTY | O_CLOEXEC); + if (fd < 0) + return -errno; + + return read_virtual_file_fd(fd, max_size, ret_contents, ret_size); +} + +int read_full_stream_full( + FILE *f, + const char *filename, + uint64_t offset, + size_t size, + ReadFullFileFlags flags, + char **ret_contents, + size_t *ret_size) { + + _cleanup_free_ char *buf = NULL; + size_t n, n_next = 0, l, expected_decoded_size = size; + int fd, r; + + assert(f); + assert(ret_contents); + assert(!FLAGS_SET(flags, READ_FULL_FILE_UNBASE64 | 
READ_FULL_FILE_UNHEX)); + assert(size != SIZE_MAX || !FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER)); + + if (offset != UINT64_MAX && offset > LONG_MAX) /* fseek() can only deal with "long" offsets */ + return -ERANGE; + + if ((flags & (READ_FULL_FILE_UNBASE64 | READ_FULL_FILE_UNHEX)) != 0) { + if (size <= SIZE_MAX / READ_FULL_FILE_ENCODED_STRING_AMPLIFICATION_BOUNDARY) + size *= READ_FULL_FILE_ENCODED_STRING_AMPLIFICATION_BOUNDARY; + else + size = SIZE_MAX; + } + + fd = fileno(f); + if (fd >= 0) { /* If the FILE* object is backed by an fd (as opposed to memory or such, see + * fmemopen()), let's optimize our buffering */ + struct stat st; + + if (fstat(fd, &st) < 0) + return -errno; + + if (S_ISREG(st.st_mode)) { + + /* Try to start with the right file size if we shall read the file in full. Note + * that we increase the size to read here by one, so that the first read attempt + * already makes us notice the EOF. If the reported size of the file is zero, we + * avoid this logic however, since quite likely it might be a virtual file in procfs + * that all report a zero file size. */ + + if (st.st_size > 0 && + (size == SIZE_MAX || FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER))) { + + uint64_t rsize = + LESS_BY((uint64_t) st.st_size, offset == UINT64_MAX ? 0 : offset); + + if (rsize < SIZE_MAX) /* overflow check */ + n_next = rsize + 1; + } + + if (flags & READ_FULL_FILE_WARN_WORLD_READABLE) + (void) warn_file_is_world_accessible(filename, &st, NULL, 0); + } + } + + /* If we don't know how much to read, figure it out now. If we shall read a part of the file, then + * allocate the requested size. If we shall load the full file start with LINE_MAX. Note that if + * READ_FULL_FILE_FAIL_WHEN_LARGER we consider the specified size a safety limit, and thus also start + * with LINE_MAX, under assumption the file is most likely much shorter. */ + if (n_next == 0) + n_next = size != SIZE_MAX && !FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER) ? 
size : LINE_MAX; + + /* Never read more than we need to determine that our own limit is hit */ + if (n_next > READ_FULL_BYTES_MAX) + n_next = READ_FULL_BYTES_MAX + 1; + + if (offset != UINT64_MAX && fseek(f, offset, SEEK_SET) < 0) + return -errno; + + n = l = 0; + for (;;) { + char *t; + size_t k; + + /* If we shall fail when reading overly large data, then read exactly one byte more than the + * specified size at max, since that'll tell us if there's anymore data beyond the limit*/ + if (FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER) && n_next > size) + n_next = size + 1; + + if (flags & READ_FULL_FILE_SECURE) { + t = malloc(n_next + 1); + if (!t) { + r = -ENOMEM; + goto finalize; + } + memcpy_safe(t, buf, n); + explicit_bzero_safe(buf, n); + free(buf); + } else { + t = realloc(buf, n_next + 1); + if (!t) + return -ENOMEM; + } + + buf = t; + /* Unless a size has been explicitly specified, try to read as much as fits into the memory + * we allocated (minus 1, to leave one byte for the safety NUL byte) */ + n = size == SIZE_MAX ? 
MALLOC_SIZEOF_SAFE(buf) - 1 : n_next; + + errno = 0; + k = fread(buf + l, 1, n - l, f); + + assert(k <= n - l); + l += k; + + if (ferror(f)) { + r = errno_or_else(EIO); + goto finalize; + } + if (feof(f)) + break; + + if (size != SIZE_MAX && !FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER)) { /* If we got asked to read some specific size, we already sized the buffer right, hence leave */ + assert(l == size); + break; + } + + assert(k > 0); /* we can't have read zero bytes because that would have been EOF */ + + if (FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER) && l > size) { + r = -E2BIG; + goto finalize; + } + + if (n >= READ_FULL_BYTES_MAX) { + r = -E2BIG; + goto finalize; + } + + n_next = MIN(n * 2, READ_FULL_BYTES_MAX); + } + + if (flags & (READ_FULL_FILE_UNBASE64 | READ_FULL_FILE_UNHEX)) { + _cleanup_free_ void *decoded = NULL; + size_t decoded_size; + + buf[l++] = 0; + if (flags & READ_FULL_FILE_UNBASE64) + r = unbase64mem_full(buf, l, flags & READ_FULL_FILE_SECURE, &decoded, &decoded_size); + else + r = unhexmem_full(buf, l, flags & READ_FULL_FILE_SECURE, &decoded, &decoded_size); + if (r < 0) + goto finalize; + + if (flags & READ_FULL_FILE_SECURE) + explicit_bzero_safe(buf, n); + free_and_replace(buf, decoded); + n = l = decoded_size; + + if (FLAGS_SET(flags, READ_FULL_FILE_FAIL_WHEN_LARGER) && l > expected_decoded_size) { + r = -E2BIG; + goto finalize; + } + } + + if (!ret_size) { + /* Safety check: if the caller doesn't want to know the size of what we just read it will rely on the + * trailing NUL byte. But if there's an embedded NUL byte, then we should refuse operation as otherwise + * there'd be ambiguity about what we just read. 
*/ + + if (memchr(buf, 0, l)) { + r = -EBADMSG; + goto finalize; + } + } + + buf[l] = 0; + *ret_contents = TAKE_PTR(buf); + + if (ret_size) + *ret_size = l; + + return 0; + +finalize: + if (flags & READ_FULL_FILE_SECURE) + explicit_bzero_safe(buf, n); + + return r; +} + +int read_full_file_full( + int dir_fd, + const char *filename, + uint64_t offset, + size_t size, + ReadFullFileFlags flags, + const char *bind_name, + char **ret_contents, + size_t *ret_size) { + + _cleanup_fclose_ FILE *f = NULL; + XfopenFlags xflags = XFOPEN_UNLOCKED; + int r; + + assert(filename); + assert(ret_contents); + + if (FLAGS_SET(flags, READ_FULL_FILE_CONNECT_SOCKET) && /* If this is enabled, let's try to connect to it */ + offset == UINT64_MAX) /* Seeking is not supported on AF_UNIX sockets */ + xflags |= XFOPEN_SOCKET; + + r = xfopenat_full(dir_fd, filename, "re", 0, xflags, bind_name, &f); + if (r < 0) + return r; + + return read_full_stream_full(f, filename, offset, size, flags, ret_contents, ret_size); +} + +int executable_is_script(const char *path, char **interpreter) { + _cleanup_free_ char *line = NULL; + size_t len; + char *ans; + int r; + + assert(path); + + r = read_one_line_file(path, &line); + if (r == -ENOBUFS) /* First line overly long? if so, then it's not a script */ + return 0; + if (r < 0) + return r; + + if (!startswith(line, "#!")) + return 0; + + ans = strstrip(line + 2); + len = strcspn(ans, " \t"); + + if (len == 0) + return 0; + + ans = strndup(ans, len); + if (!ans) + return -ENOMEM; + + *interpreter = ans; + return 1; +} + +/** + * Retrieve one field from a file like /proc/self/status. pattern + * should not include whitespace or the delimiter (':'). pattern matches only + * the beginning of a line. Whitespace before ':' is skipped. Whitespace and + * zeros after the ':' will be skipped. field must be freed afterwards. + * terminator specifies the terminating characters of the field value (not + * included in the value). 
+ */ +int get_proc_field(const char *filename, const char *pattern, const char *terminator, char **field) { + _cleanup_free_ char *status = NULL; + char *t, *f; + int r; + + assert(terminator); + assert(filename); + assert(pattern); + assert(field); + + r = read_full_virtual_file(filename, &status, NULL); + if (r < 0) + return r; + + t = status; + + do { + bool pattern_ok; + + do { + t = strstr(t, pattern); + if (!t) + return -ENOENT; + + /* Check that pattern occurs in beginning of line. */ + pattern_ok = (t == status || t[-1] == '\n'); + + t += strlen(pattern); + + } while (!pattern_ok); + + t += strspn(t, " \t"); + if (!*t) + return -ENOENT; + + } while (*t != ':'); + + t++; + + if (*t) { + t += strspn(t, " \t"); + + /* Also skip zeros, because when this is used for + * capabilities, we don't want the zeros. This way the + * same capability set always maps to the same string, + * irrespective of the total capability set size. For + * other numbers it shouldn't matter. */ + t += strspn(t, "0"); + /* Back off one char if there's nothing but whitespace + and zeros */ + if (!*t || isspace(*t)) + t--; + } + + f = strdupcspn(t, terminator); + if (!f) + return -ENOMEM; + + *field = f; + return 0; +} + +DIR *xopendirat(int fd, const char *name, int flags) { + _cleanup_close_ int nfd = -EBADF; + + assert(!(flags & O_CREAT)); + + if (fd == AT_FDCWD && flags == 0) + return opendir(name); + + nfd = openat(fd, name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|flags, 0); + if (nfd < 0) + return NULL; + + return take_fdopendir(&nfd); +} + +int fopen_mode_to_flags(const char *mode) { + const char *p; + int flags; + + assert(mode); + + if ((p = startswith(mode, "r+"))) + flags = O_RDWR; + else if ((p = startswith(mode, "r"))) + flags = O_RDONLY; + else if ((p = startswith(mode, "w+"))) + flags = O_RDWR|O_CREAT|O_TRUNC; + else if ((p = startswith(mode, "w"))) + flags = O_WRONLY|O_CREAT|O_TRUNC; + else if ((p = startswith(mode, "a+"))) + flags = O_RDWR|O_CREAT|O_APPEND; + else if 
((p = startswith(mode, "a"))) + flags = O_WRONLY|O_CREAT|O_APPEND; + else + return -EINVAL; + + for (; *p != 0; p++) { + + switch (*p) { + + case 'e': + flags |= O_CLOEXEC; + break; + + case 'x': + flags |= O_EXCL; + break; + + case 'm': + /* ignore this here, fdopen() might care later though */ + break; + + case 'c': /* not sure what to do about this one */ + default: + return -EINVAL; + } + } + + return flags; +} + +static int xfopenat_regular(int dir_fd, const char *path, const char *mode, int open_flags, FILE **ret) { + FILE *f; + + /* A combination of fopen() with openat() */ + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + assert(mode); + assert(ret); + + if (dir_fd == AT_FDCWD && open_flags == 0) + f = fopen(path, mode); + else { + _cleanup_close_ int fd = -EBADF; + int mode_flags; + + mode_flags = fopen_mode_to_flags(mode); + if (mode_flags < 0) + return mode_flags; + + fd = openat(dir_fd, path, mode_flags | open_flags); + if (fd < 0) + return -errno; + + f = take_fdopen(&fd, mode); + } + if (!f) + return -errno; + + *ret = f; + return 0; +} + +static int xfopenat_unix_socket(int dir_fd, const char *path, const char *bind_name, FILE **ret) { + _cleanup_close_ int sk = -EBADF; + FILE *f; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + assert(ret); + + sk = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (sk < 0) + return -errno; + + if (bind_name) { + /* If the caller specified a socket name to bind to, do so before connecting. This is + * useful to communicate some minor, short meta-information token from the client to + * the server. 
*/ + union sockaddr_union bsa; + + r = sockaddr_un_set_path(&bsa.un, bind_name); + if (r < 0) + return r; + + if (bind(sk, &bsa.sa, r) < 0) + return -errno; + } + + r = connect_unix_path(sk, dir_fd, path); + if (r < 0) + return r; + + if (shutdown(sk, SHUT_WR) < 0) + return -errno; + + f = take_fdopen(&sk, "r"); + if (!f) + return -errno; + + *ret = f; + return 0; +} + +int xfopenat_full( + int dir_fd, + const char *path, + const char *mode, + int open_flags, + XfopenFlags flags, + const char *bind_name, + FILE **ret) { + + FILE *f = NULL; /* avoid false maybe-uninitialized warning */ + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + assert(mode); + assert(ret); + + r = xfopenat_regular(dir_fd, path, mode, open_flags, &f); + if (r == -ENXIO && FLAGS_SET(flags, XFOPEN_SOCKET)) { + /* ENXIO is what Linux returns if we open a node that is an AF_UNIX socket */ + r = xfopenat_unix_socket(dir_fd, path, bind_name, &f); + if (IN_SET(r, -ENOTSOCK, -EINVAL)) + return -ENXIO; /* propagate original error if this is not a socket after all */ + } + if (r < 0) + return r; + + if (FLAGS_SET(flags, XFOPEN_UNLOCKED)) + (void) __fsetlocking(f, FSETLOCKING_BYCALLER); + + *ret = f; + return 0; +} + +int fdopen_independent(int fd, const char *mode, FILE **ret) { + _cleanup_close_ int copy_fd = -EBADF; + _cleanup_fclose_ FILE *f = NULL; + int mode_flags; + + assert(fd >= 0); + assert(mode); + assert(ret); + + /* A combination of fdopen() + fd_reopen(). i.e. reopens the inode the specified fd points to and + * returns a FILE* for it */ + + mode_flags = fopen_mode_to_flags(mode); + if (mode_flags < 0) + return mode_flags; + + /* Flags returned by fopen_mode_to_flags might contain O_CREAT, but it doesn't make sense for fd_reopen + * since we're working on an existing fd anyway. Let's drop it here to avoid triggering assertion. 
*/ + copy_fd = fd_reopen(fd, mode_flags & ~O_CREAT); + if (copy_fd < 0) + return copy_fd; + + f = take_fdopen(&copy_fd, mode); + if (!f) + return -errno; + + *ret = TAKE_PTR(f); + return 0; +} + +static int search_and_open_internal( + const char *path, + int mode, /* if ret_fd is NULL this is an [FRWX]_OK mode for access(), otherwise an open mode for open() */ + const char *root, + char **search, + int *ret_fd, + char **ret_path) { + + int r; + + assert(!ret_fd || !FLAGS_SET(mode, O_CREAT)); /* We don't support O_CREAT for this */ + assert(path); + + if (path_is_absolute(path)) { + _cleanup_close_ int fd = -EBADF; + + if (ret_fd) + /* We only specify 0777 here to appease static analyzers, it's never used since we + * don't support O_CREAT here */ + r = fd = RET_NERRNO(open(path, mode, 0777)); + else + r = RET_NERRNO(access(path, mode)); + if (r < 0) + return r; + + if (ret_path) { + r = path_simplify_alloc(path, ret_path); + if (r < 0) + return r; + } + + if (ret_fd) + *ret_fd = TAKE_FD(fd); + + return 0; + } + + if (!path_strv_resolve_uniq(search, root)) + return -ENOMEM; + + STRV_FOREACH(i, search) { + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *p = NULL; + + p = path_join(root, *i, path); + if (!p) + return -ENOMEM; + + if (ret_fd) + /* as above, 0777 is static analyzer appeasement */ + r = fd = RET_NERRNO(open(p, mode, 0777)); + else + r = RET_NERRNO(access(p, F_OK)); + if (r >= 0) { + if (ret_path) + *ret_path = path_simplify(TAKE_PTR(p)); + + if (ret_fd) + *ret_fd = TAKE_FD(fd); + + return 0; + } + if (r != -ENOENT) + return r; + } + + return -ENOENT; +} + +int search_and_open( + const char *path, + int mode, + const char *root, + char **search, + int *ret_fd, + char **ret_path) { + + _cleanup_strv_free_ char **copy = NULL; + + assert(path); + + copy = strv_copy((char**) search); + if (!copy) + return -ENOMEM; + + return search_and_open_internal(path, mode, root, copy, ret_fd, ret_path); +} + +static int search_and_fopen_internal( + const char *path, 
+ const char *mode, + const char *root, + char **search, + FILE **ret_file, + char **ret_path) { + + _cleanup_free_ char *found_path = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(path); + assert(mode || !ret_file); + + r = search_and_open( + path, + mode ? fopen_mode_to_flags(mode) : 0, + root, + search, + ret_file ? &fd : NULL, + ret_path ? &found_path : NULL); + if (r < 0) + return r; + + if (ret_file) { + FILE *f = take_fdopen(&fd, mode); + if (!f) + return -errno; + + *ret_file = f; + } + + if (ret_path) + *ret_path = TAKE_PTR(found_path); + + return 0; +} + +int search_and_fopen( + const char *path, + const char *mode, + const char *root, + const char **search, + FILE **ret_file, + char **ret_path) { + + _cleanup_strv_free_ char **copy = NULL; + + assert(path); + assert(mode || !ret_file); + + copy = strv_copy((char**) search); + if (!copy) + return -ENOMEM; + + return search_and_fopen_internal(path, mode, root, copy, ret_file, ret_path); +} + +int search_and_fopen_nulstr( + const char *path, + const char *mode, + const char *root, + const char *search, + FILE **ret_file, + char **ret_path) { + + _cleanup_strv_free_ char **l = NULL; + + assert(path); + assert(mode || !ret_file); + + l = strv_split_nulstr(search); + if (!l) + return -ENOMEM; + + return search_and_fopen_internal(path, mode, root, l, ret_file, ret_path); +} + +int fflush_and_check(FILE *f) { + assert(f); + + errno = 0; + fflush(f); + + if (ferror(f)) + return errno_or_else(EIO); + + return 0; +} + +int fflush_sync_and_check(FILE *f) { + int r, fd; + + assert(f); + + r = fflush_and_check(f); + if (r < 0) + return r; + + /* Not all file streams have an fd associated (think: fmemopen()), let's handle this gracefully and + * assume that in that case we need no explicit syncing */ + fd = fileno(f); + if (fd < 0) + return 0; + + r = fsync_full(fd); + if (r < 0) + return r; + + return 0; +} + +int write_timestamp_file_atomic(const char *fn, usec_t n) { + char ln[DECIMAL_STR_MAX(n)+2]; + 
+ /* Creates a "timestamp" file, that contains nothing but a + * usec_t timestamp, formatted in ASCII. */ + + if (!timestamp_is_set(n)) + return -ERANGE; + + xsprintf(ln, USEC_FMT "\n", n); + + return write_string_file(fn, ln, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC); +} + +int read_timestamp_file(const char *fn, usec_t *ret) { + _cleanup_free_ char *ln = NULL; + uint64_t t; + int r; + + r = read_one_line_file(fn, &ln); + if (r < 0) + return r; + + r = safe_atou64(ln, &t); + if (r < 0) + return r; + + if (!timestamp_is_set(t)) + return -ERANGE; + + *ret = (usec_t) t; + return 0; +} + +int fputs_with_space(FILE *f, const char *s, const char *separator, bool *space) { + int r; + + assert(s); + + /* Outputs the specified string with fputs(), but optionally prefixes it with a separator. The *space parameter + * when specified shall initially point to a boolean variable initialized to false. It is set to true after the + * first invocation. This call is supposed to be used in loops, where a separator shall be inserted between each + * element, but not before the first one. 
*/ + + if (!f) + f = stdout; + + if (space) { + if (!separator) + separator = " "; + + if (*space) { + r = fputs(separator, f); + if (r < 0) + return r; + } + + *space = true; + } + + return fputs(s, f); +} + +/* A bitmask of the EOL markers we know */ +typedef enum EndOfLineMarker { + EOL_NONE = 0, + EOL_ZERO = 1 << 0, /* \0 (aka NUL) */ + EOL_TEN = 1 << 1, /* \n (aka NL, aka LF) */ + EOL_THIRTEEN = 1 << 2, /* \r (aka CR) */ +} EndOfLineMarker; + +static EndOfLineMarker categorize_eol(char c, ReadLineFlags flags) { + + if (!FLAGS_SET(flags, READ_LINE_ONLY_NUL)) { + if (c == '\n') + return EOL_TEN; + if (c == '\r') + return EOL_THIRTEEN; + } + + if (c == '\0') + return EOL_ZERO; + + return EOL_NONE; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(FILE*, funlockfile, NULL); + +int read_line_full(FILE *f, size_t limit, ReadLineFlags flags, char **ret) { + _cleanup_free_ char *buffer = NULL; + size_t n = 0, count = 0; + int r; + + assert(f); + + /* Something like a bounded version of getline(). + * + * Considers EOF, \n, \r and \0 end of line delimiters (or combinations of these), and does not include these + * delimiters in the string returned. Specifically, recognizes the following combinations of markers as line + * endings: + * + * • \n (UNIX) + * • \r (old MacOS) + * • \0 (C strings) + * • \n\0 + * • \r\0 + * • \r\n (Windows) + * • \n\r + * • \r\n\0 + * • \n\r\0 + * + * Returns the number of bytes read from the files (i.e. including delimiters — this hence usually differs from + * the number of characters in the returned string). When EOF is hit, 0 is returned. + * + * The input parameter limit is the maximum numbers of characters in the returned string, i.e. excluding + * delimiters. If the limit is hit we fail and return -ENOBUFS. + * + * If a line shall be skipped ret may be initialized as NULL. 
*/ + + if (ret) { + if (!GREEDY_REALLOC(buffer, 1)) + return -ENOMEM; + } + + { + _unused_ _cleanup_(funlockfilep) FILE *flocked = f; + EndOfLineMarker previous_eol = EOL_NONE; + flockfile(f); + + for (;;) { + EndOfLineMarker eol; + char c; + + if (n >= limit) + return -ENOBUFS; + + if (count >= INT_MAX) /* We couldn't return the counter anymore as "int", hence refuse this */ + return -ENOBUFS; + + r = safe_fgetc(f, &c); + if (r < 0) + return r; + if (r == 0) /* EOF is definitely EOL */ + break; + + eol = categorize_eol(c, flags); + + if (FLAGS_SET(previous_eol, EOL_ZERO) || + (eol == EOL_NONE && previous_eol != EOL_NONE) || + (eol != EOL_NONE && (previous_eol & eol) != 0)) { + /* Previous char was a NUL? This is not an EOL, but the previous char was? This type of + * EOL marker has been seen right before? In any of these three cases we are + * done. But first, let's put this character back in the queue. (Note that we have to + * cast this to (unsigned char) here as ungetc() expects a positive 'int', and if we + * are on an architecture where 'char' equals 'signed char' we need to ensure we don't + * pass a negative value here. That said, to complicate things further ungetc() is + * actually happy with most negative characters and implicitly casts them back to + * positive ones as needed, except for \xff (aka -1, aka EOF), which it refuses. What a + * godawful API!) */ + assert_se(ungetc((unsigned char) c, f) != EOF); + break; + } + + count++; + + if (eol != EOL_NONE) { + /* If we are on a tty, we shouldn't wait for more input, because that + * generally means waiting for the user, interactively. In the case of a TTY + * we expect only \n as the single EOL marker, so we are in the lucky + * position that there is no need to wait. We check this condition last, to + * avoid isatty() check if not necessary. */ + + if ((flags & (READ_LINE_IS_A_TTY|READ_LINE_NOT_A_TTY)) == 0) { + int fd; + + fd = fileno(f); + if (fd < 0) /* Maybe an fmemopen() stream? 
Handle this gracefully, + * and don't call isatty() on an invalid fd */ + flags |= READ_LINE_NOT_A_TTY; + else + flags |= isatty(fd) ? READ_LINE_IS_A_TTY : READ_LINE_NOT_A_TTY; + } + if (FLAGS_SET(flags, READ_LINE_IS_A_TTY)) + break; + } + + if (eol != EOL_NONE) { + previous_eol |= eol; + continue; + } + + if (ret) { + if (!GREEDY_REALLOC(buffer, n + 2)) + return -ENOMEM; + + buffer[n] = c; + } + + n++; + } + } + + if (ret) { + buffer[n] = 0; + + *ret = TAKE_PTR(buffer); + } + + return (int) count; +} + +int read_stripped_line(FILE *f, size_t limit, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + assert(f); + + r = read_line(f, limit, ret ? &s : NULL); + if (r < 0) + return r; + + if (ret) { + const char *p; + + p = strstrip(s); + if (p == s) + *ret = TAKE_PTR(s); + else { + char *copy; + + copy = strdup(p); + if (!copy) + return -ENOMEM; + + *ret = copy; + } + } + + return r; +} + +int safe_fgetc(FILE *f, char *ret) { + int k; + + assert(f); + + /* A safer version of plain fgetc(): let's propagate the error that happened while reading as such, and + * separate the EOF condition from the byte read, to avoid those confusing signed/unsigned issues fgetc() + * has. 
*/ + + errno = 0; + k = fgetc(f); + if (k == EOF) { + if (ferror(f)) + return errno_or_else(EIO); + + if (ret) + *ret = 0; + + return 0; + } + + if (ret) + *ret = k; + + return 1; +} + +int warn_file_is_world_accessible(const char *filename, struct stat *st, const char *unit, unsigned line) { + struct stat _st; + + if (!filename) + return 0; + + if (!st) { + if (stat(filename, &_st) < 0) + return -errno; + st = &_st; + } + + if ((st->st_mode & S_IRWXO) == 0) + return 0; + + if (unit) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s has %04o mode that is too permissive, please adjust the ownership and access mode.", + filename, st->st_mode & 07777); + else + log_warning("%s has %04o mode that is too permissive, please adjust the ownership and access mode.", + filename, st->st_mode & 07777); + return 0; +} diff --git a/src/basic/fileio.h b/src/basic/fileio.h new file mode 100644 index 0000000..e0e0a45 --- /dev/null +++ b/src/basic/fileio.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <dirent.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "macro.h" +#include "time-util.h" + +#define LONG_LINE_MAX (1U*1024U*1024U) + +typedef enum { + WRITE_STRING_FILE_CREATE = 1 << 0, + WRITE_STRING_FILE_TRUNCATE = 1 << 1, + WRITE_STRING_FILE_ATOMIC = 1 << 2, + WRITE_STRING_FILE_AVOID_NEWLINE = 1 << 3, + WRITE_STRING_FILE_VERIFY_ON_FAILURE = 1 << 4, + WRITE_STRING_FILE_VERIFY_IGNORE_NEWLINE = 1 << 5, + WRITE_STRING_FILE_SYNC = 1 << 6, + WRITE_STRING_FILE_DISABLE_BUFFER = 1 << 7, + WRITE_STRING_FILE_NOFOLLOW = 1 << 8, + WRITE_STRING_FILE_MKDIR_0755 = 1 << 9, + WRITE_STRING_FILE_MODE_0600 = 1 << 10, + WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL = 1 << 11, + + /* And before you wonder, why write_string_file_atomic_label_ts() is a separate function instead of just one + more flag here: it's about linking: we don't want to pull -lselinux into all users of write_string_file() + and friends. 
*/ + +} WriteStringFileFlags; + +typedef enum { + READ_FULL_FILE_SECURE = 1 << 0, /* erase any buffers we employ internally, after use */ + READ_FULL_FILE_UNBASE64 = 1 << 1, /* base64 decode what we read */ + READ_FULL_FILE_UNHEX = 1 << 2, /* hex decode what we read */ + READ_FULL_FILE_WARN_WORLD_READABLE = 1 << 3, /* if regular file, log at LOG_WARNING level if access mode above 0700 */ + READ_FULL_FILE_CONNECT_SOCKET = 1 << 4, /* if socket inode, connect to it and read off it */ + READ_FULL_FILE_FAIL_WHEN_LARGER = 1 << 5, /* fail loading if file is larger than specified size */ +} ReadFullFileFlags; + +int fdopen_unlocked(int fd, const char *options, FILE **ret); +int take_fdopen_unlocked(int *fd, const char *options, FILE **ret); +FILE* take_fdopen(int *fd, const char *options); +DIR* take_fdopendir(int *dfd); +FILE* open_memstream_unlocked(char **ptr, size_t *sizeloc); +FILE* fmemopen_unlocked(void *buf, size_t size, const char *mode); + +int write_string_stream_ts(FILE *f, const char *line, WriteStringFileFlags flags, const struct timespec *ts); +static inline int write_string_stream(FILE *f, const char *line, WriteStringFileFlags flags) { + return write_string_stream_ts(f, line, flags, NULL); +} +int write_string_file_ts_at(int dir_fd, const char *fn, const char *line, WriteStringFileFlags flags, const struct timespec *ts); +static inline int write_string_file_ts(const char *fn, const char *line, WriteStringFileFlags flags, const struct timespec *ts) { + return write_string_file_ts_at(AT_FDCWD, fn, line, flags, ts); +} +static inline int write_string_file_at(int dir_fd, const char *fn, const char *line, WriteStringFileFlags flags) { + return write_string_file_ts_at(dir_fd, fn, line, flags, NULL); +} +static inline int write_string_file(const char *fn, const char *line, WriteStringFileFlags flags) { + return write_string_file_ts(fn, line, flags, NULL); +} + +int write_string_filef(const char *fn, WriteStringFileFlags flags, const char *format, ...) 
_printf_(3, 4); + +int read_one_line_file_at(int dir_fd, const char *filename, char **ret); +static inline int read_one_line_file(const char *filename, char **ret) { + return read_one_line_file_at(AT_FDCWD, filename, ret); +} +int read_full_file_full(int dir_fd, const char *filename, uint64_t offset, size_t size, ReadFullFileFlags flags, const char *bind_name, char **ret_contents, size_t *ret_size); +static inline int read_full_file_at(int dir_fd, const char *filename, char **ret_contents, size_t *ret_size) { + return read_full_file_full(dir_fd, filename, UINT64_MAX, SIZE_MAX, 0, NULL, ret_contents, ret_size); +} +static inline int read_full_file(const char *filename, char **ret_contents, size_t *ret_size) { + return read_full_file_full(AT_FDCWD, filename, UINT64_MAX, SIZE_MAX, 0, NULL, ret_contents, ret_size); +} + +int read_virtual_file_fd(int fd, size_t max_size, char **ret_contents, size_t *ret_size); +int read_virtual_file_at(int dir_fd, const char *filename, size_t max_size, char **ret_contents, size_t *ret_size); +static inline int read_virtual_file(const char *filename, size_t max_size, char **ret_contents, size_t *ret_size) { + return read_virtual_file_at(AT_FDCWD, filename, max_size, ret_contents, ret_size); +} +static inline int read_full_virtual_file(const char *filename, char **ret_contents, size_t *ret_size) { + return read_virtual_file(filename, SIZE_MAX, ret_contents, ret_size); +} + +int read_full_stream_full(FILE *f, const char *filename, uint64_t offset, size_t size, ReadFullFileFlags flags, char **ret_contents, size_t *ret_size); +static inline int read_full_stream(FILE *f, char **ret_contents, size_t *ret_size) { + return read_full_stream_full(f, NULL, UINT64_MAX, SIZE_MAX, 0, ret_contents, ret_size); +} + +int verify_file_at(int dir_fd, const char *fn, const char *blob, bool accept_extra_nl); +static inline int verify_file(const char *fn, const char *blob, bool accept_extra_nl) { + return verify_file_at(AT_FDCWD, fn, blob, accept_extra_nl); +} 
+ +int executable_is_script(const char *path, char **interpreter); + +int get_proc_field(const char *filename, const char *pattern, const char *terminator, char **field); + +DIR *xopendirat(int dirfd, const char *name, int flags); + +typedef enum XfopenFlags { + XFOPEN_UNLOCKED = 1 << 0, /* call __fsetlocking(FSETLOCKING_BYCALLER) after opened */ + XFOPEN_SOCKET = 1 << 1, /* also try to open unix socket */ +} XfopenFlags; + +int xfopenat_full( + int dir_fd, + const char *path, + const char *mode, + int open_flags, + XfopenFlags flags, + const char *bind_name, + FILE **ret); +static inline int xfopenat(int dir_fd, const char *path, const char *mode, int open_flags, FILE **ret) { + return xfopenat_full(dir_fd, path, mode, open_flags, 0, NULL, ret); +} +static inline int fopen_unlocked_at(int dir_fd, const char *path, const char *mode, int open_flags, FILE **ret) { + return xfopenat_full(dir_fd, path, mode, open_flags, XFOPEN_UNLOCKED, NULL, ret); +} +static inline int fopen_unlocked(const char *path, const char *mode, FILE **ret) { + return fopen_unlocked_at(AT_FDCWD, path, mode, 0, ret); +} + +int fdopen_independent(int fd, const char *mode, FILE **ret); + +int search_and_open(const char *path, int mode, const char *root, char **search, int *ret_fd, char **ret_path); +static inline int search_and_access(const char *path, int mode, const char *root, char**search, char **ret_path) { + return search_and_open(path, mode, root, search, NULL, ret_path); +} +int search_and_fopen(const char *path, const char *mode, const char *root, const char **search, FILE **ret_file, char **ret_path); +int search_and_fopen_nulstr(const char *path, const char *mode, const char *root, const char *search, FILE **ret_file, char **ret_path); + +int fflush_and_check(FILE *f); +int fflush_sync_and_check(FILE *f); + +int write_timestamp_file_atomic(const char *fn, usec_t n); +int read_timestamp_file(const char *fn, usec_t *ret); + +int fputs_with_space(FILE *f, const char *s, const char 
*separator, bool *space); + +typedef enum ReadLineFlags { + READ_LINE_ONLY_NUL = 1 << 0, + READ_LINE_IS_A_TTY = 1 << 1, + READ_LINE_NOT_A_TTY = 1 << 2, +} ReadLineFlags; + +int read_line_full(FILE *f, size_t limit, ReadLineFlags flags, char **ret); + +static inline bool file_offset_beyond_memory_size(off_t x) { + if (x < 0) /* off_t is signed, filter that out */ + return false; + return (uint64_t) x > (uint64_t) SIZE_MAX; +} + +static inline int read_line(FILE *f, size_t limit, char **ret) { + return read_line_full(f, limit, 0, ret); +} + +static inline int read_nul_string(FILE *f, size_t limit, char **ret) { + return read_line_full(f, limit, READ_LINE_ONLY_NUL, ret); +} + +int read_stripped_line(FILE *f, size_t limit, char **ret); + +int safe_fgetc(FILE *f, char *ret); + +int warn_file_is_world_accessible(const char *filename, struct stat *st, const char *unit, unsigned line); + +int fopen_mode_to_flags(const char *mode); diff --git a/src/basic/filesystems-gperf.gperf b/src/basic/filesystems-gperf.gperf new file mode 100644 index 0000000..e8c5357 --- /dev/null +++ b/src/basic/filesystems-gperf.gperf @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#include + +#include "filesystems.h" +#include "missing_magic.h" +#include "stat-util.h" + +struct FilesystemMagic { + const char *name; + statfs_f_type_t magic[FILESYSTEM_MAGIC_MAX]; +}; +%} +struct FilesystemMagic; +%language=ANSI-C +%define hash-function-name filesystems_gperf_hash +%define lookup-function-name filesystems_gperf_lookup +%define slot-name name +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +apparmorfs, {AAFS_MAGIC} +adfs, {ADFS_SUPER_MAGIC} +affs, {AFFS_SUPER_MAGIC} +afs, {AFS_FS_MAGIC, AFS_SUPER_MAGIC} +anon_inodefs, {ANON_INODE_FS_MAGIC} +autofs, {AUTOFS_SUPER_MAGIC} +balloon-kvm, {BALLOON_KVM_MAGIC} +bdev, {BDEVFS_MAGIC} +binder, {BINDERFS_SUPER_MAGIC} +binfmt_misc, {BINFMTFS_MAGIC} +bpf, {BPF_FS_MAGIC} +btrfs, {BTRFS_SUPER_MAGIC} +btrfs_test_fs, 
{BTRFS_TEST_MAGIC} +# cpuset's magic got reassigned to cgroupfs +cpuset, {CGROUP_SUPER_MAGIC} +ceph, {CEPH_SUPER_MAGIC} +cgroup2, {CGROUP2_SUPER_MAGIC} +# note that the cgroupfs magic got reassigned from cpuset +cgroup, {CGROUP_SUPER_MAGIC} +cifs, {CIFS_SUPER_MAGIC, SMB2_SUPER_MAGIC} +coda, {CODA_SUPER_MAGIC} +configfs, {CONFIGFS_MAGIC} +cramfs, {CRAMFS_MAGIC} +dax, {DAXFS_MAGIC} +debugfs, {DEBUGFS_MAGIC} +devmem, {DEVMEM_MAGIC} +devpts, {DEVPTS_SUPER_MAGIC} +# devtmpfs is just a special instance of tmpfs, hence it reports its magic +devtmpfs, {TMPFS_MAGIC} +dmabuf, {DMA_BUF_MAGIC} +ecryptfs, {ECRYPTFS_SUPER_MAGIC} +efivarfs, {EFIVARFS_MAGIC} +efs, {EFS_SUPER_MAGIC} +erofs, {EROFS_SUPER_MAGIC_V1} +# ext2 + ext3 + ext4 use the same magic +ext2, {EXT2_SUPER_MAGIC} +ext3, {EXT3_SUPER_MAGIC} +ext4, {EXT4_SUPER_MAGIC} +exfat, {EXFAT_SUPER_MAGIC} +f2fs, {F2FS_SUPER_MAGIC} +# fuseblk is so closely related to fuse that it shares the same magic +fuseblk, {FUSE_SUPER_MAGIC} +fuse, {FUSE_SUPER_MAGIC} +fusectl, {FUSE_CTL_SUPER_MAGIC} +# gfs is an old version of gfs2 and reuses the magic +gfs, {GFS2_MAGIC} +gfs2, {GFS2_MAGIC} +hostfs, {HOSTFS_SUPER_MAGIC} +hpfs, {HPFS_SUPER_MAGIC} +hugetlbfs, {HUGETLBFS_MAGIC} +iso9660, {ISOFS_SUPER_MAGIC} +jffs2, {JFFS2_SUPER_MAGIC} +minix, {MINIX_SUPER_MAGIC, MINIX_SUPER_MAGIC2, MINIX2_SUPER_MAGIC, MINIX2_SUPER_MAGIC2, MINIX3_SUPER_MAGIC} +mqueue, {MQUEUE_MAGIC} +# msdos is an older legacy version of vfat, shares the magic +msdos, {MSDOS_SUPER_MAGIC} +# ncp/ncpfs have been removed from the kernel, but ncpfs was the official name +ncp, {NCP_SUPER_MAGIC} +ncpfs, {NCP_SUPER_MAGIC} +# nfs is the old version of nfs4, and they share the same magic +nfs, {NFS_SUPER_MAGIC} +nfs4, {NFS_SUPER_MAGIC} +nilfs2, {NILFS_SUPER_MAGIC} +nsfs, {NSFS_MAGIC} +ntfs, {NTFS_SB_MAGIC} +ntfs3, {NTFS3_SUPER_MAGIC} +ocfs2, {OCFS2_SUPER_MAGIC} +openpromfs, {OPENPROM_SUPER_MAGIC} +orangefs, {ORANGEFS_DEVREQ_MAGIC} +overlay, {OVERLAYFS_SUPER_MAGIC} +pipefs, {PIPEFS_MAGIC} 
+ppc-cmm, {PPC_CMM_MAGIC} +proc, {PROC_SUPER_MAGIC} +pstore, {PSTOREFS_MAGIC} +# pvfs2 is the old version of orangefs +pvfs2, {ORANGEFS_DEVREQ_MAGIC} +qnx4, {QNX4_SUPER_MAGIC} +qnx6, {QNX6_SUPER_MAGIC} +ramfs, {RAMFS_MAGIC} +resctrl, {RDTGROUP_SUPER_MAGIC} +reiserfs, {REISERFS_SUPER_MAGIC} +rpc_pipefs, {RPC_PIPEFS_SUPER_MAGIC} +secretmem, {SECRETMEM_MAGIC} +securityfs, {SECURITYFS_MAGIC} +selinuxfs, {SELINUX_MAGIC} +shiftfs, {SHIFTFS_MAGIC} +smackfs, {SMACK_MAGIC} +# smb3 is an alias for cifs +smb3, {CIFS_SUPER_MAGIC} +# smbfs was removed from the kernel in 2010, the magic remains +smbfs, {SMB_SUPER_MAGIC} +sockfs, {SOCKFS_MAGIC} +squashfs, {SQUASHFS_MAGIC} +sysfs, {SYSFS_MAGIC} +# note that devtmpfs shares the same magic with tmpfs, given it is just a special named instance of it. +tmpfs, {TMPFS_MAGIC} +tracefs, {TRACEFS_MAGIC} +udf, {UDF_SUPER_MAGIC} +usbdevfs, {USBDEVICE_SUPER_MAGIC} +vboxsf, {VBOXSF_SUPER_MAGIC} +# note that msdos shares the same magic (and is the older version) +vfat, {MSDOS_SUPER_MAGIC} +v9fs, {V9FS_MAGIC} +xenfs, {XENFS_SUPER_MAGIC} +xfs, {XFS_SUPER_MAGIC} +z3fold, {Z3FOLD_MAGIC} +zonefs, {ZONEFS_MAGIC} +zsmalloc, {ZSMALLOC_MAGIC} diff --git a/src/basic/filesystems.c b/src/basic/filesystems.c new file mode 100644 index 0000000..7d34e4e --- /dev/null +++ b/src/basic/filesystems.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "filesystems-gperf.h" +#include "stat-util.h" + +const char *fs_type_to_string(statfs_f_type_t magic) { + + switch (magic) { +#include "filesystem-switch-case.h" + } + + return NULL; +} + + +int fs_type_from_string(const char *name, const statfs_f_type_t **ret) { + const struct FilesystemMagic *fs_magic; + + assert(name); + assert(ret); + + fs_magic = filesystems_gperf_lookup(name, strlen(name)); + if (!fs_magic) + return -EINVAL; + + *ret = fs_magic->magic; + return 0; +} + +bool fs_in_group(const struct statfs *s, FilesystemGroups fs_group) { + int r; + + NULSTR_FOREACH(fs, 
filesystem_sets[fs_group].value) { + const statfs_f_type_t *magic; + + r = fs_type_from_string(fs, &magic); + if (r >= 0) + for (size_t i = 0; i < FILESYSTEM_MAGIC_MAX; i++) { + if (magic[i] == 0) + break; + + if (is_fs_type(s, magic[i])) + return true; + } + } + + return false; +} + +const FilesystemSet filesystem_sets[_FILESYSTEM_SET_MAX] = { + [FILESYSTEM_SET_BASIC_API] = { + .name = "@basic-api", + .help = "Basic filesystem API", + .value = + "cgroup\0" + "cgroup2\0" + "devpts\0" + "devtmpfs\0" + "mqueue\0" + "proc\0" + "sysfs\0" + }, + [FILESYSTEM_SET_ANONYMOUS] = { + .name = "@anonymous", + .help = "Anonymous inodes", + .value = + "anon_inodefs\0" + "pipefs\0" + "sockfs\0" + }, + [FILESYSTEM_SET_APPLICATION] = { + .name = "@application", + .help = "Application virtual filesystems", + .value = + "autofs\0" + "fuse\0" + "overlay\0" + }, + [FILESYSTEM_SET_AUXILIARY_API] = { + .name = "@auxiliary-api", + .help = "Auxiliary filesystem API", + .value = + "binfmt_misc\0" + "configfs\0" + "efivarfs\0" + "fusectl\0" + "hugetlbfs\0" + "rpc_pipefs\0" + "securityfs\0" + }, + [FILESYSTEM_SET_COMMON_BLOCK] = { + .name = "@common-block", + .help = "Common block device filesystems", + .value = + "btrfs\0" + "erofs\0" + "exfat\0" + "ext4\0" + "f2fs\0" + "iso9660\0" + "ntfs3\0" + "squashfs\0" + "udf\0" + "vfat\0" + "xfs\0" + }, + [FILESYSTEM_SET_HISTORICAL_BLOCK] = { + .name = "@historical-block", + .help = "Historical block device filesystems", + .value = + "ext2\0" + "ext3\0" + "minix\0" + }, + [FILESYSTEM_SET_NETWORK] = { + .name = "@network", + .help = "Well-known network filesystems", + .value = + "afs\0" + "ceph\0" + "cifs\0" + "gfs\0" + "gfs2\0" + "ncp\0" + "ncpfs\0" + "nfs\0" + "nfs4\0" + "ocfs2\0" + "orangefs\0" + "pvfs2\0" + "smb3\0" + "smbfs\0" + }, + [FILESYSTEM_SET_PRIVILEGED_API] = { + .name = "@privileged-api", + .help = "Privileged filesystem API", + .value = + "bpf\0" + "debugfs\0" + "pstore\0" + "tracefs\0" + }, + [FILESYSTEM_SET_SECURITY] = { + .name = 
"@security", + .help = "Security/MAC API VFS", + .value = + "apparmorfs\0" + "selinuxfs\0" + "smackfs\0" + }, + [FILESYSTEM_SET_TEMPORARY] = { + .name = "@temporary", + .help = "Temporary filesystems", + .value = + "ramfs\0" + "tmpfs\0" + }, + [FILESYSTEM_SET_KNOWN] = { + .name = "@known", + .help = "All known filesystems declared in the kernel", + .value = +#include "filesystem-list.h" + }, +}; + +const FilesystemSet *filesystem_set_find(const char *name) { + if (isempty(name) || name[0] != '@') + return NULL; + + for (FilesystemGroups i = 0; i < _FILESYSTEM_SET_MAX; i++) + if (streq(filesystem_sets[i].name, name)) + return filesystem_sets + i; + + return NULL; +} diff --git a/src/basic/filesystems.h b/src/basic/filesystems.h new file mode 100644 index 0000000..f9edbc1 --- /dev/null +++ b/src/basic/filesystems.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "nulstr-util.h" +#include "stat-util.h" +#include "string-util.h" + +#define FILESYSTEM_MAGIC_MAX 10 + +typedef enum FilesystemGroups { + /* Please leave BASIC_API first and KNOWN last, but sort the rest alphabetically */ + FILESYSTEM_SET_BASIC_API, + FILESYSTEM_SET_ANONYMOUS, + FILESYSTEM_SET_APPLICATION, + FILESYSTEM_SET_AUXILIARY_API, + FILESYSTEM_SET_COMMON_BLOCK, + FILESYSTEM_SET_HISTORICAL_BLOCK, + FILESYSTEM_SET_NETWORK, + FILESYSTEM_SET_PRIVILEGED_API, + FILESYSTEM_SET_SECURITY, + FILESYSTEM_SET_TEMPORARY, + FILESYSTEM_SET_KNOWN, + _FILESYSTEM_SET_MAX, + _FILESYSTEM_SET_INVALID = -EINVAL, +} FilesystemGroups; + +typedef struct FilesystemSet { + const char *name; + const char *help; + const char *value; +} FilesystemSet; + +extern const FilesystemSet filesystem_sets[]; + +const FilesystemSet *filesystem_set_find(const char *name); + +const char *fs_type_to_string(statfs_f_type_t magic); +int fs_type_from_string(const char *name, const statfs_f_type_t **ret); +bool fs_in_group(const struct statfs *s, enum FilesystemGroups fs_group); + +/* gperf prototypes */ 
+const struct FilesystemMagic* filesystems_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/basic/format-util.c b/src/basic/format-util.c new file mode 100644 index 0000000..9450185 --- /dev/null +++ b/src/basic/format-util.c @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "format-util.h" +#include "memory-util.h" +#include "stdio-util.h" +#include "strxcpyx.h" + +assert_cc(STRLEN("%") + DECIMAL_STR_MAX(int) <= IF_NAMESIZE); +int format_ifname_full(int ifindex, FormatIfnameFlag flag, char buf[static IF_NAMESIZE]) { + if (ifindex <= 0) + return -EINVAL; + + if (if_indextoname(ifindex, buf)) + return 0; + + if (!FLAGS_SET(flag, FORMAT_IFNAME_IFINDEX)) + return -errno; + + if (FLAGS_SET(flag, FORMAT_IFNAME_IFINDEX_WITH_PERCENT)) + assert(snprintf_ok(buf, IF_NAMESIZE, "%%%d", ifindex)); + else + assert(snprintf_ok(buf, IF_NAMESIZE, "%d", ifindex)); + + return 0; +} + +int format_ifname_full_alloc(int ifindex, FormatIfnameFlag flag, char **ret) { + char buf[IF_NAMESIZE], *copy; + int r; + + assert(ret); + + r = format_ifname_full(ifindex, flag, buf); + if (r < 0) + return r; + + copy = strdup(buf); + if (!copy) + return -ENOMEM; + + *ret = copy; + return 0; +} + +char *format_bytes_full(char *buf, size_t l, uint64_t t, FormatBytesFlag flag) { + typedef struct { + const char *suffix; + uint64_t factor; + } suffix_table; + static const suffix_table table_iec[] = { + { "E", UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024) }, + { "P", UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024) }, + { "T", UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024) }, + { "G", UINT64_C(1024)*UINT64_C(1024)*UINT64_C(1024) }, + { "M", UINT64_C(1024)*UINT64_C(1024) }, + { "K", UINT64_C(1024) }, + }, table_si[] = { + { "E", UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000) }, + { "P", 
UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000) }, + { "T", UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000) }, + { "G", UINT64_C(1000)*UINT64_C(1000)*UINT64_C(1000) }, + { "M", UINT64_C(1000)*UINT64_C(1000) }, + { "K", UINT64_C(1000) }, + }; + const suffix_table *table; + size_t n; + + assert_cc(ELEMENTSOF(table_iec) == ELEMENTSOF(table_si)); + + if (t == UINT64_MAX) + return NULL; + + table = flag & FORMAT_BYTES_USE_IEC ? table_iec : table_si; + n = ELEMENTSOF(table_iec); + + for (size_t i = 0; i < n; i++) + if (t >= table[i].factor) { + if (flag & FORMAT_BYTES_BELOW_POINT) { + (void) snprintf(buf, l, + "%" PRIu64 ".%" PRIu64 "%s", + t / table[i].factor, + i != n - 1 ? + (t / table[i + 1].factor * UINT64_C(10) / table[n - 1].factor) % UINT64_C(10): + (t * UINT64_C(10) / table[i].factor) % UINT64_C(10), + table[i].suffix); + } else + (void) snprintf(buf, l, + "%" PRIu64 "%s", + t / table[i].factor, + table[i].suffix); + + goto finish; + } + + (void) snprintf(buf, l, "%" PRIu64 "%s", t, flag & FORMAT_BYTES_TRAILING_B ? 
"B" : ""); + +finish: + buf[l-1] = 0; + return buf; + +} diff --git a/src/basic/format-util.h b/src/basic/format-util.h new file mode 100644 index 0000000..8719df3 --- /dev/null +++ b/src/basic/format-util.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "cgroup-util.h" +#include "macro.h" + +assert_cc(sizeof(pid_t) == sizeof(int32_t)); +#define PID_PRI PRIi32 +#define PID_FMT "%" PID_PRI + +assert_cc(sizeof(uid_t) == sizeof(uint32_t)); +#define UID_FMT "%" PRIu32 + +assert_cc(sizeof(gid_t) == sizeof(uint32_t)); +#define GID_FMT "%" PRIu32 + +#if SIZEOF_TIME_T == 8 +# define PRI_TIME PRIi64 +#elif SIZEOF_TIME_T == 4 +# define PRI_TIME "li" +#else +# error Unknown time_t size +#endif + +#if SIZEOF_TIMEX_MEMBER == 8 +# define PRI_TIMEX PRIi64 +#elif SIZEOF_TIMEX_MEMBER == 4 +# define PRI_TIMEX "li" +#else +# error Unknown timex member size +#endif + +#if SIZEOF_RLIM_T == 8 +# define RLIM_FMT "%" PRIu64 +#elif SIZEOF_RLIM_T == 4 +# define RLIM_FMT "%" PRIu32 +#else +# error Unknown rlim_t size +#endif + +#if SIZEOF_DEV_T == 8 +# define DEV_FMT "%" PRIu64 +#elif SIZEOF_DEV_T == 4 +# define DEV_FMT "%" PRIu32 +#else +# error Unknown dev_t size +#endif + +#if SIZEOF_INO_T == 8 +# define INO_FMT "%" PRIu64 +#elif SIZEOF_INO_T == 4 +# define INO_FMT "%" PRIu32 +#else +# error Unknown ino_t size +#endif + +typedef enum { + FORMAT_IFNAME_IFINDEX = 1 << 0, + FORMAT_IFNAME_IFINDEX_WITH_PERCENT = (1 << 1) | FORMAT_IFNAME_IFINDEX, +} FormatIfnameFlag; + +int format_ifname_full(int ifindex, FormatIfnameFlag flag, char buf[static IF_NAMESIZE]); +int format_ifname_full_alloc(int ifindex, FormatIfnameFlag flag, char **ret); + +static inline int format_ifname(int ifindex, char buf[static IF_NAMESIZE]) { + return format_ifname_full(ifindex, 0, buf); +} +static inline int format_ifname_alloc(int ifindex, char **ret) { + return format_ifname_full_alloc(ifindex, 0, ret); +} + +static inline char 
*_format_ifname_full(int ifindex, FormatIfnameFlag flag, char buf[static IF_NAMESIZE]) { + (void) format_ifname_full(ifindex, flag, buf); + return buf; +} + +#define FORMAT_IFNAME_FULL(index, flag) _format_ifname_full(index, flag, (char[IF_NAMESIZE]){}) +#define FORMAT_IFNAME(index) _format_ifname_full(index, 0, (char[IF_NAMESIZE]){}) + +typedef enum { + FORMAT_BYTES_USE_IEC = 1 << 0, + FORMAT_BYTES_BELOW_POINT = 1 << 1, + FORMAT_BYTES_TRAILING_B = 1 << 2, +} FormatBytesFlag; + +#define FORMAT_BYTES_MAX 16U + +char *format_bytes_full(char *buf, size_t l, uint64_t t, FormatBytesFlag flag) _warn_unused_result_; + +_warn_unused_result_ +static inline char *format_bytes(char *buf, size_t l, uint64_t t) { + return format_bytes_full(buf, l, t, FORMAT_BYTES_USE_IEC | FORMAT_BYTES_BELOW_POINT | FORMAT_BYTES_TRAILING_B); +} + +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define FORMAT_BYTES(t) format_bytes((char[FORMAT_BYTES_MAX]){}, FORMAT_BYTES_MAX, t) +#define FORMAT_BYTES_FULL(t, flag) format_bytes_full((char[FORMAT_BYTES_MAX]){}, FORMAT_BYTES_MAX, t, flag) + +#define FORMAT_BYTES_CGROUP_PROTECTION(t) (t == CGROUP_LIMIT_MAX ? 
"infinity" : FORMAT_BYTES(t)) diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c new file mode 100644 index 0000000..9ba9268 --- /dev/null +++ b/src/basic/fs-util.c @@ -0,0 +1,1238 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "btrfs.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "label.h" +#include "lock-util.h" +#include "log.h" +#include "macro.h" +#include "missing_fcntl.h" +#include "missing_fs.h" +#include "missing_syscall.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "ratelimit.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "user-util.h" + +int rmdir_parents(const char *path, const char *stop) { + char *p; + int r; + + assert(path); + assert(stop); + + if (!path_is_safe(path)) + return -EINVAL; + + if (!path_is_safe(stop)) + return -EINVAL; + + p = strdupa_safe(path); + + for (;;) { + char *slash = NULL; + + /* skip the last component. */ + r = path_find_last_component(p, /* accept_dot_dot= */ false, (const char **) &slash, NULL); + if (r <= 0) + return r; + if (slash == p) + return 0; + + assert(*slash == '/'); + *slash = '\0'; + + if (path_startswith_full(stop, p, /* accept_dot_dot= */ false)) + return 0; + + if (rmdir(p) < 0 && errno != ENOENT) + return -errno; + } +} + +int rename_noreplace(int olddirfd, const char *oldpath, int newdirfd, const char *newpath) { + int r; + + /* Try the ideal approach first */ + if (renameat2(olddirfd, oldpath, newdirfd, newpath, RENAME_NOREPLACE) >= 0) + return 0; + + /* renameat2() exists since Linux 3.15, btrfs and FAT added support for it later. 
If it is not implemented, + * fall back to a different method. */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) + return -errno; + + /* Let's try to use linkat()+unlinkat() as fallback. This doesn't work on directories and on some file systems + * that do not support hard links (such as FAT, most prominently), but for files it's pretty close to what we + * want — though not atomic (i.e. for a short period both the new and the old filename will exist). */ + if (linkat(olddirfd, oldpath, newdirfd, newpath, 0) >= 0) { + + r = RET_NERRNO(unlinkat(olddirfd, oldpath, 0)); + if (r < 0) { + (void) unlinkat(newdirfd, newpath, 0); + return r; + } + + return 0; + } + + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !IN_SET(errno, EINVAL, EPERM)) /* FAT returns EPERM on link()… */ + return -errno; + + /* OK, neither RENAME_NOREPLACE nor linkat()+unlinkat() worked. Let's then fall back to the racy TOCTOU + * vulnerable accessat(F_OK) check followed by classic, replacing renameat(), we have nothing better. */ + + if (faccessat(newdirfd, newpath, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) + return -EEXIST; + if (errno != ENOENT) + return -errno; + + return RET_NERRNO(renameat(olddirfd, oldpath, newdirfd, newpath)); +} + +int readlinkat_malloc(int fd, const char *p, char **ret) { + size_t l = PATH_MAX; + + assert(fd >= 0 || fd == AT_FDCWD); + + if (fd < 0 && isempty(p)) + return -EISDIR; /* In this case, the fd points to the current working directory, and is + * definitely not a symlink. Let's return earlier. 
*/ + + for (;;) { + _cleanup_free_ char *c = NULL; + ssize_t n; + + c = new(char, l+1); + if (!c) + return -ENOMEM; + + n = readlinkat(fd, strempty(p), c, l); + if (n < 0) + return -errno; + + if ((size_t) n < l) { + c[n] = 0; + + if (ret) + *ret = TAKE_PTR(c); + + return 0; + } + + if (l > (SSIZE_MAX-1)/2) /* readlinkat() returns an ssize_t, and we want an extra byte for a + * trailing NUL, hence do an overflow check relative to SSIZE_MAX-1 + * here */ + return -EFBIG; + + l *= 2; + } +} + +int readlink_malloc(const char *p, char **ret) { + return readlinkat_malloc(AT_FDCWD, p, ret); +} + +int readlink_value(const char *p, char **ret) { + _cleanup_free_ char *link = NULL, *name = NULL; + int r; + + assert(p); + assert(ret); + + r = readlink_malloc(p, &link); + if (r < 0) + return r; + + r = path_extract_filename(link, &name); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return -EINVAL; + + *ret = TAKE_PTR(name); + return 0; +} + +int readlink_and_make_absolute(const char *p, char **ret) { + _cleanup_free_ char *target = NULL; + int r; + + assert(p); + assert(ret); + + r = readlink_malloc(p, &target); + if (r < 0) + return r; + + return file_in_same_dir(p, target, ret); +} + +int chmod_and_chown_at(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid) { + _cleanup_close_ int fd = -EBADF; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + if (path) { + /* Let's acquire an O_PATH fd, as precaution to change mode/owner on the same file */ + fd = openat(dir_fd, path, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (fd < 0) + return -errno; + dir_fd = fd; + + } else if (dir_fd == AT_FDCWD) { + /* Let's acquire an O_PATH fd of the current directory */ + fd = openat(dir_fd, ".", O_PATH|O_CLOEXEC|O_NOFOLLOW|O_DIRECTORY); + if (fd < 0) + return -errno; + dir_fd = fd; + } + + return fchmod_and_chown(dir_fd, mode, uid, gid); +} + +int fchmod_and_chown_with_fallback(int fd, const char *path, mode_t mode, uid_t uid, gid_t gid) { + bool do_chown, do_chmod; + struct stat st; 
+ int r; + + /* Change ownership and access mode of the specified fd. Tries to do so safely, ensuring that at no + * point in time the access mode is above the old access mode under the old ownership or the new + * access mode under the new ownership. Note: this call tries hard to leave the access mode + * unaffected if the uid/gid is changed, i.e. it undoes implicit suid/sgid dropping the kernel does + * on chown(). + * + * This call is happy with O_PATH fds. + * + * If path is given, allow a fallback path which does not use /proc/self/fd/. On any normal system + * /proc will be mounted, but in certain improperly assembled environments it might not be. This is + * less secure (potential TOCTOU), so should only be used after consideration. */ + + if (fstat(fd, &st) < 0) + return -errno; + + do_chown = + (uid != UID_INVALID && st.st_uid != uid) || + (gid != GID_INVALID && st.st_gid != gid); + + do_chmod = + !S_ISLNK(st.st_mode) && /* chmod is not defined on symlinks */ + ((mode != MODE_INVALID && ((st.st_mode ^ mode) & 07777) != 0) || + do_chown); /* If we change ownership, make sure we reset the mode afterwards, since chown() + * modifies the access mode too */ + + if (mode == MODE_INVALID) + mode = st.st_mode; /* If we only shall do a chown(), save original mode, since chown() might break it. */ + else if ((mode & S_IFMT) != 0 && ((mode ^ st.st_mode) & S_IFMT) != 0) + return -EINVAL; /* insist on the right file type if it was specified */ + + if (do_chown && do_chmod) { + mode_t minimal = st.st_mode & mode; /* the subset of the old and the new mask */ + + if (((minimal ^ st.st_mode) & 07777) != 0) { + r = fchmod_opath(fd, minimal & 07777); + if (r < 0) { + if (!path || r != -ENOSYS) + return r; + + /* Fallback path which doesn't use /proc/self/fd/. 
*/ + if (chmod(path, minimal & 07777) < 0) + return -errno; + } + } + } + + if (do_chown) + if (fchownat(fd, "", uid, gid, AT_EMPTY_PATH) < 0) + return -errno; + + if (do_chmod) { + r = fchmod_opath(fd, mode & 07777); + if (r < 0) { + if (!path || r != -ENOSYS) + return r; + + /* Fallback path which doesn't use /proc/self/fd/. */ + if (chmod(path, mode & 07777) < 0) + return -errno; + } + } + + return do_chown || do_chmod; +} + +int fchmod_umask(int fd, mode_t m) { + _cleanup_umask_ mode_t u = umask(0777); + + return RET_NERRNO(fchmod(fd, m & (~u))); +} + +int fchmod_opath(int fd, mode_t m) { + /* This function operates also on fd that might have been opened with + * O_PATH. The tool set we have is non-intuitive: + * - fchmod(2) only operates on open files (i. e., fds with an open file description); + * - fchmodat(2) does not have a flag arg like fchownat(2) does, so no way to pass AT_EMPTY_PATH; + * + it should not be confused with the libc fchmodat(3) interface, which adds 4th flag argument, + * but does not support AT_EMPTY_PATH (only supports AT_SYMLINK_NOFOLLOW); + * - fchmodat2(2) supports all the AT_* flags, but is still very recent. + * + * We try to use fchmodat2(), and, if it is not supported, resort + * to the /proc/self/fd dance. 
*/ + + assert(fd >= 0); + + if (fchmodat2(fd, "", m, AT_EMPTY_PATH) >= 0) + return 0; + if (!IN_SET(errno, ENOSYS, EPERM)) /* Some container managers block unknown syscalls with EPERM */ + return -errno; + + if (chmod(FORMAT_PROC_FD_PATH(fd), m) < 0) { + if (errno != ENOENT) + return -errno; + + if (proc_mounted() == 0) + return -ENOSYS; /* if we have no /proc/, the concept is not implementable */ + + return -ENOENT; + } + + return 0; +} + +int futimens_opath(int fd, const struct timespec ts[2]) { + /* Similar to fchmod_opath() but for futimens() */ + + if (utimensat(AT_FDCWD, FORMAT_PROC_FD_PATH(fd), ts, 0) < 0) { + if (errno != ENOENT) + return -errno; + + if (proc_mounted() == 0) + return -ENOSYS; /* if we have no /proc/, the concept is not implementable */ + + return -ENOENT; + } + + return 0; +} + +int stat_warn_permissions(const char *path, const struct stat *st) { + assert(path); + assert(st); + + /* Don't complain if we are reading something that is not a file, for example /dev/null */ + if (!S_ISREG(st->st_mode)) + return 0; + + if (st->st_mode & 0111) + log_warning("Configuration file %s is marked executable. Please remove executable permission bits. Proceeding anyway.", path); + + if (st->st_mode & 0002) + log_warning("Configuration file %s is marked world-writable. Please remove world writability permission bits. Proceeding anyway.", path); + + if (getpid_cached() == 1 && (st->st_mode & 0044) != 0044) + log_warning("Configuration file %s is marked world-inaccessible. This has no effect as configuration data is accessible via APIs without restrictions. 
Proceeding anyway.", path); + + return 0; +} + +int fd_warn_permissions(const char *path, int fd) { + struct stat st; + + assert(path); + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + return stat_warn_permissions(path, &st); +} + +int touch_file(const char *path, bool parents, usec_t stamp, uid_t uid, gid_t gid, mode_t mode) { + _cleanup_close_ int fd = -EBADF; + int r, ret; + + assert(path); + + /* Note that touch_file() does not follow symlinks: if invoked on an existing symlink, then it is the symlink + * itself which is updated, not its target + * + * Returns the first error we encounter, but tries to apply as much as possible. */ + + if (parents) + (void) mkdir_parents(path, 0755); + + /* Initially, we try to open the node with O_PATH, so that we get a reference to the node. This is useful in + * case the path refers to an existing device or socket node, as we can open it successfully in all cases, and + * won't trigger any driver magic or so. */ + fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (fd < 0) { + if (errno != ENOENT) + return -errno; + + /* if the node doesn't exist yet, we create it, but with O_EXCL, so that we only create a regular file + * here, and nothing else */ + fd = open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, IN_SET(mode, 0, MODE_INVALID) ? 0644 : mode); + if (fd < 0) + return -errno; + } + + /* Let's make a path from the fd, and operate on that. With this logic, we can adjust the access mode, + * ownership and time of the file node in all cases, even if the fd refers to an O_PATH object — which is + * something fchown(), fchmod(), futimensat() don't allow. 
*/ + ret = fchmod_and_chown(fd, mode, uid, gid); + + if (stamp != USEC_INFINITY) { + struct timespec ts[2]; + + timespec_store(&ts[0], stamp); + ts[1] = ts[0]; + r = futimens_opath(fd, ts); + } else + r = futimens_opath(fd, NULL); + if (r < 0 && ret >= 0) + return r; + + return ret; +} + +int symlink_idempotent(const char *from, const char *to, bool make_relative) { + _cleanup_free_ char *relpath = NULL; + int r; + + assert(from); + assert(to); + + if (make_relative) { + r = path_make_relative_parent(to, from, &relpath); + if (r < 0) + return r; + + from = relpath; + } + + if (symlink(from, to) < 0) { + _cleanup_free_ char *p = NULL; + + if (errno != EEXIST) + return -errno; + + r = readlink_malloc(to, &p); + if (r == -EINVAL) /* Not a symlink? In that case return the original error we encountered: -EEXIST */ + return -EEXIST; + if (r < 0) /* Any other error? In that case propagate it as is */ + return r; + + if (!streq(p, from)) /* Not the symlink we want it to be? In that case, propagate the original -EEXIST */ + return -EEXIST; + } + + return 0; +} + +int symlinkat_atomic_full(const char *from, int atfd, const char *to, bool make_relative) { + _cleanup_free_ char *relpath = NULL, *t = NULL; + int r; + + assert(from); + assert(to); + + if (make_relative) { + r = path_make_relative_parent(to, from, &relpath); + if (r < 0) + return r; + + from = relpath; + } + + r = tempfn_random(to, NULL, &t); + if (r < 0) + return r; + + if (symlinkat(from, atfd, t) < 0) + return -errno; + + r = RET_NERRNO(renameat(atfd, t, atfd, to)); + if (r < 0) { + (void) unlinkat(atfd, t, 0); + return r; + } + + return 0; +} + +int mknodat_atomic(int atfd, const char *path, mode_t mode, dev_t dev) { + _cleanup_free_ char *t = NULL; + int r; + + assert(path); + + r = tempfn_random(path, NULL, &t); + if (r < 0) + return r; + + if (mknodat(atfd, t, mode, dev) < 0) + return -errno; + + r = RET_NERRNO(renameat(atfd, t, atfd, path)); + if (r < 0) { + (void) unlinkat(atfd, t, 0); + return r; + } + 
+ return 0; +} + +int mkfifoat_atomic(int atfd, const char *path, mode_t mode) { + _cleanup_free_ char *t = NULL; + int r; + + assert(path); + + /* We're only interested in the (random) filename. */ + r = tempfn_random(path, NULL, &t); + if (r < 0) + return r; + + if (mkfifoat(atfd, t, mode) < 0) + return -errno; + + r = RET_NERRNO(renameat(atfd, t, atfd, path)); + if (r < 0) { + (void) unlinkat(atfd, t, 0); + return r; + } + + return 0; +} + +int get_files_in_directory(const char *path, char ***list) { + _cleanup_strv_free_ char **l = NULL; + _cleanup_closedir_ DIR *d = NULL; + size_t n = 0; + + assert(path); + + /* Returns all files in a directory in *list, and the number + * of files as return value. If list is NULL returns only the + * number. */ + + d = opendir(path); + if (!d) + return -errno; + + FOREACH_DIRENT_ALL(de, d, return -errno) { + if (!dirent_is_file(de)) + continue; + + if (list) { + /* one extra slot is needed for the terminating NULL */ + if (!GREEDY_REALLOC(l, n + 2)) + return -ENOMEM; + + l[n] = strdup(de->d_name); + if (!l[n]) + return -ENOMEM; + + l[++n] = NULL; + } else + n++; + } + + if (list) + *list = TAKE_PTR(l); + + return n; +} + +static int getenv_tmp_dir(const char **ret_path) { + int r, ret = 0; + + assert(ret_path); + + /* We use the same order of environment variables python uses in tempfile.gettempdir(): + * https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir */ + FOREACH_STRING(n, "TMPDIR", "TEMP", "TMP") { + const char *e; + + e = secure_getenv(n); + if (!e) + continue; + if (!path_is_absolute(e)) { + r = -ENOTDIR; + goto next; + } + if (!path_is_normalized(e)) { + r = -EPERM; + goto next; + } + + r = is_dir(e, true); + if (r < 0) + goto next; + if (r == 0) { + r = -ENOTDIR; + goto next; + } + + *ret_path = e; + return 1; + + next: + /* Remember first error, to make this more debuggable */ + if (ret >= 0) + ret = r; + } + + if (ret < 0) + return ret; + + *ret_path = NULL; + return ret; +} + +static int 
tmp_dir_internal(const char *def, const char **ret) { + const char *e; + int r, k; + + assert(def); + assert(ret); + + r = getenv_tmp_dir(&e); + if (r > 0) { + *ret = e; + return 0; + } + + k = is_dir(def, true); + if (k == 0) + k = -ENOTDIR; + if (k < 0) + return r < 0 ? r : k; + + *ret = def; + return 0; +} + +int var_tmp_dir(const char **ret) { + + /* Returns the location for "larger" temporary files, that is backed by physical storage if available, and thus + * even might survive a boot: /var/tmp. If $TMPDIR (or related environment variables) are set, its value is + * returned preferably however. Note that both this function and tmp_dir() below are affected by $TMPDIR, + * making it a variable that overrides all temporary file storage locations. */ + + return tmp_dir_internal("/var/tmp", ret); +} + +int tmp_dir(const char **ret) { + + /* Similar to var_tmp_dir() above, but returns the location for "smaller" temporary files, which is usually + * backed by an in-memory file system: /tmp. */ + + return tmp_dir_internal("/tmp", ret); +} + +int unlink_or_warn(const char *filename) { + if (unlink(filename) < 0 && errno != ENOENT) + /* If the file doesn't exist and the fs simply was read-only (in which + * case unlink() returns EROFS even if the file doesn't exist), don't + * complain */ + if (errno != EROFS || access(filename, F_OK) >= 0) + return log_error_errno(errno, "Failed to remove \"%s\": %m", filename); + + return 0; +} + +int access_fd(int fd, int mode) { + /* Like access() but operates on an already open fd */ + + if (access(FORMAT_PROC_FD_PATH(fd), mode) < 0) { + if (errno != ENOENT) + return -errno; + + /* ENOENT can mean two things: that the fd does not exist or that /proc is not mounted. Let's + * make things debuggable and distinguish the two. */ + + if (proc_mounted() == 0) + return -ENOSYS; /* /proc is not available or not set up properly, we're most likely in some chroot + * environment. 
*/
+
+                return -EBADF; /* The directory exists, hence it's the fd that doesn't. */
+        }
+
+        return 0;
+}
+
+void unlink_tempfilep(char (*p)[]) {
+        /* If the file is created with mkstemp(), it will (almost always)
+         * change the suffix. Treat this as a sign that the file was
+         * successfully created. We ignore both the rare case where the
+         * original suffix is used and unlink failures. */
+        if (!endswith(*p, ".XXXXXX"))
+                (void) unlink(*p);
+}
+
+int unlinkat_deallocate(int fd, const char *name, UnlinkDeallocateFlags flags) {
+        _cleanup_close_ int truncate_fd = -EBADF;
+        struct stat st;
+        off_t l, bs;
+
+        assert((flags & ~(UNLINK_REMOVEDIR|UNLINK_ERASE)) == 0);
+
+        /* Operates like unlinkat() but also deallocates the file contents if it is a regular file and there's no other
+         * link to it. This is useful to ensure that other processes that might have the file open for reading won't be
+         * able to keep the data pinned on disk forever. This call is particularly useful whenever we execute clean-up
+         * jobs ("vacuuming"), where we want to make sure the data is really gone and the disk space released and
+         * returned to the free pool.
+         *
+         * Deallocation is preferably done by FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE (👊) if supported, which means
+         * the file won't change size. That's a good thing since we shouldn't needlessly trigger SIGBUS in other
+         * programs that have mmap()ed the file. (The assumption here is that changing file contents to all zeroes
+         * underneath those programs is the better choice than simply triggering SIGBUS in them which truncation does.)
+         * However if hole punching is not implemented in the kernel or file system we'll fall back to normal file
+         * truncation (🔪), as our goal of deallocating the data space trumps our goal of being nice to readers (💐).
+         *
+         * Note that we attempt deallocation, but failure to succeed with that is not considered fatal, as long as the
+         * primary job – to delete the file – is accomplished.
+         *
+         * Returns 0 once the name has been removed (regardless of whether erasing/deallocation worked), and a
+         * negative errno if opening the file reported ENOENT/EISDIR or if unlinkat() itself failed. */
+
+        if (!FLAGS_SET(flags, UNLINK_REMOVEDIR)) {
+                truncate_fd = openat(fd, name, O_WRONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|O_NONBLOCK);
+                if (truncate_fd < 0) {
+
+                        /* If this failed because the file doesn't exist propagate the error right-away. Also,
+                         * AT_REMOVEDIR wasn't set, and we tried to open the file for writing, which means EISDIR is
+                         * returned when this is a directory but we are not supposed to delete those, hence propagate
+                         * the error right-away too. */
+                        if (IN_SET(errno, ENOENT, EISDIR))
+                                return -errno;
+
+                        if (errno != ELOOP) /* don't complain if this is a symlink */
+                                log_debug_errno(errno, "Failed to open file '%s' for deallocation, ignoring: %m", name);
+                }
+        }
+
+        if (unlinkat(fd, name, FLAGS_SET(flags, UNLINK_REMOVEDIR) ? AT_REMOVEDIR : 0) < 0)
+                return -errno;
+
+        if (truncate_fd < 0) /* Don't have a file handle, can't do more ☹️ */
+                return 0;
+
+        if (fstat(truncate_fd, &st) < 0) {
+                log_debug_errno(errno, "Failed to stat file '%s' for deallocation, ignoring: %m", name);
+                return 0;
+        }
+
+        if (!S_ISREG(st.st_mode))
+                return 0;
+
+        if (FLAGS_SET(flags, UNLINK_ERASE) && st.st_size > 0 && st.st_nlink == 0) {
+                uint64_t left = st.st_size;
+                char buffer[64 * 1024];
+
+                /* If erasing is requested, let's overwrite the file with random data once before deleting
+                 * it. This isn't going to give you shred(1) semantics, but hopefully should be good enough
+                 * for stuff backed by tmpfs at least.
+                 *
+                 * Note that we only erase like this if the link count of the file is zero. If it is higher it
+                 * is still linked by someone else and we'll leave it to them to remove it securely
+                 * eventually! */
+
+                random_bytes(buffer, sizeof(buffer));
+
+                while (left > 0) {
+                        ssize_t n;
+
+                        n = write(truncate_fd, buffer, MIN(sizeof(buffer), left));
+                        if (n < 0) {
+                                /* Include %m so that the errno we pass in actually shows up in the log */
+                                log_debug_errno(errno, "Failed to erase data in file '%s', ignoring: %m", name);
+                                break;
+                        }
+
+                        assert(left >= (size_t) n);
+                        left -= n;
+                }
+
+                /* Let's refresh metadata */
+                if (fstat(truncate_fd, &st) < 0) {
+                        log_debug_errno(errno, "Failed to stat file '%s' for deallocation, ignoring: %m", name);
+                        return 0;
+                }
+        }
+
+        /* Don't deallocate if there's nothing to deallocate or if the file is linked elsewhere */
+        if (st.st_blocks == 0 || st.st_nlink > 0)
+                return 0;
+
+        /* If this is a regular file, it actually took up space on disk and there are no other links it's time to
+         * punch-hole/truncate this to release the disk space. */
+
+        bs = MAX(st.st_blksize, 512);
+        l = ROUND_UP(st.st_size, bs); /* Round up to next block size */
+
+        if (fallocate(truncate_fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, 0, l) >= 0)
+                return 0; /* Successfully punched a hole! 😊 */
+
+        /* Fall back to truncation */
+        if (ftruncate(truncate_fd, 0) < 0) {
+                log_debug_errno(errno, "Failed to truncate file to 0, ignoring: %m");
+                return 0;
+        }
+
+        return 0;
+}
+
+int open_parent_at(int dir_fd, const char *path, int flags, mode_t mode) {
+        _cleanup_free_ char *parent = NULL;
+        int r;
+
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(path);
+
+        r = path_extract_directory(path, &parent);
+        if (r == -EDESTADDRREQ) {
+                parent = strdup(".");
+                if (!parent)
+                        return -ENOMEM;
+        } else if (r == -EADDRNOTAVAIL) {
+                parent = strdup(path);
+                if (!parent)
+                        return -ENOMEM;
+        } else if (r < 0)
+                return r;
+
+        /* Let's insist on O_DIRECTORY since the parent of a file or directory is a directory. Except if we open an
+         * O_TMPFILE file, because in that case we actually create a regular file below the parent directory.
*/ + + if (FLAGS_SET(flags, O_PATH)) + flags |= O_DIRECTORY; + else if (!FLAGS_SET(flags, O_TMPFILE)) + flags |= O_DIRECTORY|O_RDONLY; + + return RET_NERRNO(openat(dir_fd, parent, flags, mode)); +} + +int conservative_renameat( + int olddirfd, const char *oldpath, + int newdirfd, const char *newpath) { + + _cleanup_close_ int old_fd = -EBADF, new_fd = -EBADF; + struct stat old_stat, new_stat; + + /* Renames the old path to the new path, much like renameat() — except if both are regular files and + * have the exact same contents and basic file attributes already. In that case remove the new file + * instead. This call is useful for reducing inotify wakeups on files that are updated but don't + * actually change. This function is written in a style that we rather rename too often than suppress + * too much. I.e. whenever we are in doubt, we rather rename than fail. After all reducing inotify + * events is an optimization only, not more. */ + + old_fd = openat(olddirfd, oldpath, O_CLOEXEC|O_RDONLY|O_NOCTTY|O_NOFOLLOW); + if (old_fd < 0) + goto do_rename; + + new_fd = openat(newdirfd, newpath, O_CLOEXEC|O_RDONLY|O_NOCTTY|O_NOFOLLOW); + if (new_fd < 0) + goto do_rename; + + if (fstat(old_fd, &old_stat) < 0) + goto do_rename; + + if (!S_ISREG(old_stat.st_mode)) + goto do_rename; + + if (fstat(new_fd, &new_stat) < 0) + goto do_rename; + + if (stat_inode_same(&new_stat, &old_stat)) + goto is_same; + + if (old_stat.st_mode != new_stat.st_mode || + old_stat.st_size != new_stat.st_size || + old_stat.st_uid != new_stat.st_uid || + old_stat.st_gid != new_stat.st_gid) + goto do_rename; + + for (;;) { + uint8_t buf1[16*1024]; + uint8_t buf2[sizeof(buf1)]; + ssize_t l1, l2; + + l1 = read(old_fd, buf1, sizeof(buf1)); + if (l1 < 0) + goto do_rename; + + if (l1 == sizeof(buf1)) + /* Read the full block, hence read a full block in the other file too */ + + l2 = read(new_fd, buf2, l1); + else { + assert((size_t) l1 < sizeof(buf1)); + + /* Short read. 
This hence was the last block in the first file, and then came
+                         * EOF. Read one byte more in the second file, so that we can verify we hit EOF there
+                         * too. */
+
+                        assert((size_t) (l1 + 1) <= sizeof(buf2));
+                        l2 = read(new_fd, buf2, l1 + 1);
+                }
+                if (l2 != l1)
+                        goto do_rename;
+
+                if (memcmp(buf1, buf2, l1) != 0)
+                        goto do_rename;
+
+                if ((size_t) l1 < sizeof(buf1)) /* We hit EOF on the first file, and the second file too, hence exit
+                                                 * now. */
+                        break;
+        }
+
+is_same:
+        /* Everything matches? Then don't rename, instead remove the source file, and leave the existing
+         * destination in place */
+
+        if (unlinkat(olddirfd, oldpath, 0) < 0)
+                goto do_rename;
+
+        return 0;
+
+do_rename:
+        if (renameat(olddirfd, oldpath, newdirfd, newpath) < 0)
+                return -errno;
+
+        return 1;
+}
+
+int posix_fallocate_loop(int fd, uint64_t offset, uint64_t size) {
+        RateLimit rl;
+        int r;
+
+        r = posix_fallocate(fd, offset, size); /* returns positive errnos on error */
+        if (r != EINTR)
+                return -r; /* Let's return negative errnos, like common in our codebase */
+
+        /* On EINTR try a couple of times more, but protect against busy looping
+         * (not more than 16 times per 10s) */
+        rl = (const RateLimit) { 10 * USEC_PER_SEC, 16 };
+        while (ratelimit_below(&rl)) {
+                r = posix_fallocate(fd, offset, size);
+                if (r != EINTR)
+                        return -r;
+        }
+
+        return -EINTR;
+}
+
+int parse_cifs_service(
+                const char *s,
+                char **ret_host,
+                char **ret_service,
+                char **ret_path) {
+
+        _cleanup_free_ char *h = NULL, *ss = NULL, *x = NULL;
+        const char *p, *e, *d;
+        char delimiter;
+
+        /* Parses a CIFS service in form of //host/service/path… and splits it in three parts. The last
+         * part is optional, in which case NULL is returned there. To maximize compatibility syntax with
+         * backslashes instead of slashes is accepted too.
+         *
+         * Each of the ret_* pointers may be NULL if the caller is not interested in that part. Returns 0
+         * on success, -EINVAL if the string is malformed or one of its parts fails validation, and -ENOMEM
+         * if a memory allocation fails. */
+
+        if (!s)
+                return -EINVAL;
+
+        p = startswith(s, "//");
+        if (!p) {
+                p = startswith(s, "\\\\");
+                if (!p)
+                        return -EINVAL;
+        }
+
+        delimiter = s[0];
+        e = strchr(p, delimiter);
+        if (!e)
+                return -EINVAL;
+
+        h = strndup(p, e - p);
+        if (!h)
+                return -ENOMEM;
+
+        if (!hostname_is_valid(h, 0))
+                return -EINVAL;
+
+        e++;
+
+        d = strchrnul(e, delimiter);
+
+        ss = strndup(e, d - e);
+        if (!ss)
+                return -ENOMEM;
+
+        if (!filename_is_valid(ss))
+                return -EINVAL;
+
+        if (!isempty(d)) {
+                x = strdup(skip_leading_chars(d, CHAR_TO_STR(delimiter)));
+                if (!x) /* Allocation failure, not a parse error: report it like the strndup() calls above */
+                        return -ENOMEM;
+
+                /* Make sure to convert Windows-style "\" → Unix-style / */
+                for (char *i = x; *i; i++)
+                        if (*i == delimiter)
+                                *i = '/';
+
+                if (!path_is_valid(x))
+                        return -EINVAL;
+
+                path_simplify(x);
+                if (!path_is_normalized(x))
+                        return -EINVAL;
+        }
+
+        if (ret_host)
+                *ret_host = TAKE_PTR(h);
+        if (ret_service)
+                *ret_service = TAKE_PTR(ss);
+        if (ret_path)
+                *ret_path = TAKE_PTR(x);
+
+        return 0;
+}
+
+int open_mkdir_at(int dirfd, const char *path, int flags, mode_t mode) {
+        _cleanup_close_ int fd = -EBADF, parent_fd = -EBADF;
+        _cleanup_free_ char *fname = NULL, *parent = NULL;
+        int r;
+
+        /* Creates a directory with mkdirat() and then opens it, in the "most atomic" fashion we can
+         * do. Guarantees that the returned fd refers to a directory. If O_EXCL is specified will fail if the
+         * dir already exists. Otherwise will open an existing dir, but only if it is one. */
+
+        if (flags & ~(O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_EXCL|O_NOATIME|O_NOFOLLOW|O_PATH))
+                return -EINVAL;
+        if ((flags & O_ACCMODE) != O_RDONLY)
+                return -EINVAL;
+
+        /* Note that O_DIRECTORY|O_NOFOLLOW is implied, but we allow specifying it anyway. The following
+         * flags actually make sense to specify: O_CLOEXEC, O_EXCL, O_NOATIME, O_PATH */
+
+        /* If this is not a valid filename, it's a path. Let's open the parent directory then, so
+         * that we can pin it, and operate below it.
*/ + r = path_extract_directory(path, &parent); + if (r < 0) { + if (!IN_SET(r, -EDESTADDRREQ, -EADDRNOTAVAIL)) + return r; + } else { + r = path_extract_filename(path, &fname); + if (r < 0) + return r; + + parent_fd = openat(dirfd, parent, O_PATH|O_DIRECTORY|O_CLOEXEC); + if (parent_fd < 0) + return -errno; + + dirfd = parent_fd; + path = fname; + } + + fd = xopenat(dirfd, path, flags|O_CREAT|O_DIRECTORY|O_NOFOLLOW, /* xopen_flags = */ 0, mode); + if (IN_SET(fd, -ELOOP, -ENOTDIR)) + return -EEXIST; + if (fd < 0) + return fd; + + return TAKE_FD(fd); +} + +int openat_report_new(int dirfd, const char *pathname, int flags, mode_t mode, bool *ret_newly_created) { + unsigned attempts = 7; + int fd; + + /* Just like openat(), but adds one thing: optionally returns whether we created the file anew or if + * it already existed before. This is only relevant if O_CREAT is set without O_EXCL, and thus will + * shortcut to openat() otherwise */ + + if (!ret_newly_created) + return RET_NERRNO(openat(dirfd, pathname, flags, mode)); + + if (!FLAGS_SET(flags, O_CREAT) || FLAGS_SET(flags, O_EXCL)) { + fd = openat(dirfd, pathname, flags, mode); + if (fd < 0) + return -errno; + + *ret_newly_created = FLAGS_SET(flags, O_CREAT); + return fd; + } + + for (;;) { + /* First, attempt to open without O_CREAT/O_EXCL, i.e. open existing file */ + fd = openat(dirfd, pathname, flags & ~(O_CREAT | O_EXCL), mode); + if (fd >= 0) { + *ret_newly_created = false; + return fd; + } + if (errno != ENOENT) + return -errno; + + /* So the file didn't exist yet, hence create it with O_CREAT/O_EXCL. */ + fd = openat(dirfd, pathname, flags | O_CREAT | O_EXCL, mode); + if (fd >= 0) { + *ret_newly_created = true; + return fd; + } + if (errno != EEXIST) + return -errno; + + /* Hmm, so now we got EEXIST? So it apparently exists now? If so, let's try to open again + * without the two flags. 
But let's not spin forever, hence put a limit on things */ + + if (--attempts == 0) /* Give up eventually, somebody is playing with us */ + return -EEXIST; + } +} + +int xopenat(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode) { + _cleanup_close_ int fd = -EBADF; + bool made = false; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + /* This is like openat(), but has a few tricks up its sleeves, extending behaviour: + * + * • O_DIRECTORY|O_CREAT is supported, which causes a directory to be created, and immediately + * opened. When used with the XO_SUBVOLUME flag this will even create a btrfs subvolume. + * + * • If O_CREAT is used with XO_LABEL, any created file will be immediately relabelled. + * + * • If the path is specified NULL or empty, behaves like fd_reopen(). + */ + + if (isempty(path)) { + assert(!FLAGS_SET(open_flags, O_CREAT|O_EXCL)); + return fd_reopen(dir_fd, open_flags & ~O_NOFOLLOW); + } + + if (FLAGS_SET(open_flags, O_CREAT) && FLAGS_SET(xopen_flags, XO_LABEL)) { + r = label_ops_pre(dir_fd, path, FLAGS_SET(open_flags, O_DIRECTORY) ? S_IFDIR : S_IFREG); + if (r < 0) + return r; + } + + if (FLAGS_SET(open_flags, O_DIRECTORY|O_CREAT)) { + if (FLAGS_SET(xopen_flags, XO_SUBVOLUME)) + r = btrfs_subvol_make_fallback(dir_fd, path, mode); + else + r = RET_NERRNO(mkdirat(dir_fd, path, mode)); + if (r == -EEXIST) { + if (FLAGS_SET(open_flags, O_EXCL)) + return -EEXIST; + + made = false; + } else if (r < 0) + return r; + else + made = true; + + if (FLAGS_SET(xopen_flags, XO_LABEL)) { + r = label_ops_post(dir_fd, path); + if (r < 0) + return r; + } + + open_flags &= ~(O_EXCL|O_CREAT); + xopen_flags &= ~XO_LABEL; + } + + fd = RET_NERRNO(openat(dir_fd, path, open_flags, mode)); + if (fd < 0) { + if (IN_SET(fd, + /* We got ENOENT? then someone else immediately removed it after we + * created it. In that case let's return immediately without unlinking + * anything, because there simply isn't anything to unlink anymore. 
*/ + -ENOENT, + /* is a symlink? exists already → created by someone else, don't unlink */ + -ELOOP, + /* not a directory? exists already → created by someone else, don't unlink */ + -ENOTDIR)) + return fd; + + if (made) + (void) unlinkat(dir_fd, path, AT_REMOVEDIR); + + return fd; + } + + if (FLAGS_SET(open_flags, O_CREAT) && FLAGS_SET(xopen_flags, XO_LABEL)) { + r = label_ops_post(dir_fd, path); + if (r < 0) + return r; + } + + return TAKE_FD(fd); +} + +int xopenat_lock( + int dir_fd, + const char *path, + int open_flags, + XOpenFlags xopen_flags, + mode_t mode, + LockType locktype, + int operation) { + + _cleanup_close_ int fd = -EBADF; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(IN_SET(operation & ~LOCK_NB, LOCK_EX, LOCK_SH)); + + /* POSIX/UNPOSIX locks don't work on directories (errno is set to -EBADF so let's return early with + * the same error here). */ + if (FLAGS_SET(open_flags, O_DIRECTORY) && !IN_SET(locktype, LOCK_BSD, LOCK_NONE)) + return -EBADF; + + for (;;) { + struct stat st; + + fd = xopenat(dir_fd, path, open_flags, xopen_flags, mode); + if (fd < 0) + return fd; + + r = lock_generic(fd, locktype, operation); + if (r < 0) + return r; + + /* If we acquired the lock, let's check if the file/directory still exists in the file + * system. If not, then the previous exclusive owner removed it and then closed it. In such a + * case our acquired lock is worthless, hence try again. 
*/ + + if (fstat(fd, &st) < 0) + return -errno; + if (st.st_nlink > 0) + break; + + fd = safe_close(fd); + } + + return TAKE_FD(fd); +} diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h new file mode 100644 index 0000000..1023ab7 --- /dev/null +++ b/src/basic/fs-util.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "lock-util.h" +#include "time-util.h" +#include "user-util.h" + +#define MODE_INVALID ((mode_t) -1) + +/* The following macros add 1 when converting things, since 0 is a valid mode, while the pointer + * NULL is special */ +#define PTR_TO_MODE(p) ((mode_t) ((uintptr_t) (p)-1)) +#define MODE_TO_PTR(u) ((void *) ((uintptr_t) (u)+1)) + +int rmdir_parents(const char *path, const char *stop); + +int rename_noreplace(int olddirfd, const char *oldpath, int newdirfd, const char *newpath); + +int readlinkat_malloc(int fd, const char *p, char **ret); +int readlink_malloc(const char *p, char **r); +int readlink_value(const char *p, char **ret); +int readlink_and_make_absolute(const char *p, char **r); + +int chmod_and_chown_at(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid); +static inline int chmod_and_chown(const char *path, mode_t mode, uid_t uid, gid_t gid) { + return chmod_and_chown_at(AT_FDCWD, path, mode, uid, gid); +} +int fchmod_and_chown_with_fallback(int fd, const char *path, mode_t mode, uid_t uid, gid_t gid); +static inline int fchmod_and_chown(int fd, mode_t mode, uid_t uid, gid_t gid) { + return fchmod_and_chown_with_fallback(fd, NULL, mode, uid, gid); /* no fallback */ +} + +int fchmod_umask(int fd, mode_t mode); +int fchmod_opath(int fd, mode_t m); + +int futimens_opath(int fd, const struct timespec ts[2]); + +int fd_warn_permissions(const char *path, int fd); +int stat_warn_permissions(const char *path, const struct stat *st); + +#define 
laccess(path, mode) \
+        RET_NERRNO(faccessat(AT_FDCWD, (path), (mode), AT_SYMLINK_NOFOLLOW))
+
+int touch_file(const char *path, bool parents, usec_t stamp, uid_t uid, gid_t gid, mode_t mode);
+
+static inline int touch(const char *path) {
+        return touch_file(path, false, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
+}
+
+int symlink_idempotent(const char *from, const char *to, bool make_relative);
+
+int symlinkat_atomic_full(const char *from, int atfd, const char *to, bool make_relative);
+static inline int symlink_atomic(const char *from, const char *to) {
+        return symlinkat_atomic_full(from, AT_FDCWD, to, false);
+}
+
+int mknodat_atomic(int atfd, const char *path, mode_t mode, dev_t dev);
+static inline int mknod_atomic(const char *path, mode_t mode, dev_t dev) {
+        return mknodat_atomic(AT_FDCWD, path, mode, dev);
+}
+
+int mkfifoat_atomic(int dir_fd, const char *path, mode_t mode);
+static inline int mkfifo_atomic(const char *path, mode_t mode) {
+        return mkfifoat_atomic(AT_FDCWD, path, mode);
+}
+
+int get_files_in_directory(const char *path, char ***list);
+
+int tmp_dir(const char **ret);
+int var_tmp_dir(const char **ret);
+
+int unlink_or_warn(const char *filename);
+
+/* Useful for usage with _cleanup_(), removes a directory and frees the pointer */
+static inline char *rmdir_and_free(char *p) {
+        PROTECT_ERRNO;
+
+        if (!p)
+                return NULL;
+
+        (void) rmdir(p);
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, rmdir_and_free);
+
+/* Same, but unlinks a file instead of removing a directory. Like rmdir_and_free() this uses PROTECT_ERRNO, so
+ * that a failing unlink() during scope exit does not overwrite the errno the caller is about to look at. */
+static inline char* unlink_and_free(char *p) {
+        PROTECT_ERRNO;
+
+        if (!p)
+                return NULL;
+
+        (void) unlink(p);
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, unlink_and_free);
+
+int access_fd(int fd, int mode);
+
+void unlink_tempfilep(char (*p)[]);
+
+typedef enum UnlinkDeallocateFlags {
+        UNLINK_REMOVEDIR = 1 << 0,
+        UNLINK_ERASE = 1 << 1,
+} UnlinkDeallocateFlags;
+
+int unlinkat_deallocate(int fd, const char *name, UnlinkDeallocateFlags flags);
+
+int open_parent_at(int dir_fd, const char *path, int flags,
mode_t mode); +static inline int open_parent(const char *path, int flags, mode_t mode) { + return open_parent_at(AT_FDCWD, path, flags, mode); +} + +int conservative_renameat(int olddirfd, const char *oldpath, int newdirfd, const char *newpath); +static inline int conservative_rename(const char *oldpath, const char *newpath) { + return conservative_renameat(AT_FDCWD, oldpath, AT_FDCWD, newpath); +} + +int posix_fallocate_loop(int fd, uint64_t offset, uint64_t size); + +int parse_cifs_service(const char *s, char **ret_host, char **ret_service, char **ret_path); + +int open_mkdir_at(int dirfd, const char *path, int flags, mode_t mode); + +int openat_report_new(int dirfd, const char *pathname, int flags, mode_t mode, bool *ret_newly_created); + +typedef enum XOpenFlags { + XO_LABEL = 1 << 0, + XO_SUBVOLUME = 1 << 1, +} XOpenFlags; + +int xopenat(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode); + +int xopenat_lock(int dir_fd, const char *path, int open_flags, XOpenFlags xopen_flags, mode_t mode, LockType locktype, int operation); diff --git a/src/basic/gcrypt-util.c b/src/basic/gcrypt-util.c new file mode 100644 index 0000000..41c9362 --- /dev/null +++ b/src/basic/gcrypt-util.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_GCRYPT + +#include "gcrypt-util.h" +#include "hexdecoct.h" + +void initialize_libgcrypt(bool secmem) { + if (gcry_control(GCRYCTL_INITIALIZATION_FINISHED_P)) + return; + + gcry_control(GCRYCTL_SET_PREFERRED_RNG_TYPE, GCRY_RNG_TYPE_SYSTEM); + assert_se(gcry_check_version("1.4.5")); + + /* Turn off "secmem". 
Clients which wish to make use of this + * feature should initialize the library manually */ + if (!secmem) + gcry_control(GCRYCTL_DISABLE_SECMEM); + + gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0); +} + +# if !PREFER_OPENSSL +int string_hashsum(const char *s, size_t len, int md_algorithm, char **out) { + _cleanup_(gcry_md_closep) gcry_md_hd_t md = NULL; + gcry_error_t err; + size_t hash_size; + void *hash; + char *enc; + + initialize_libgcrypt(false); + + hash_size = gcry_md_get_algo_dlen(md_algorithm); + assert(hash_size > 0); + + err = gcry_md_open(&md, md_algorithm, 0); + if (gcry_err_code(err) != GPG_ERR_NO_ERROR || !md) + return -EIO; + + gcry_md_write(md, s, len); + + hash = gcry_md_read(md, 0); + if (!hash) + return -EIO; + + enc = hexmem(hash, hash_size); + if (!enc) + return -ENOMEM; + + *out = enc; + return 0; +} +# endif +#endif diff --git a/src/basic/gcrypt-util.h b/src/basic/gcrypt-util.h new file mode 100644 index 0000000..4c40cef --- /dev/null +++ b/src/basic/gcrypt-util.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include +#include +#include + +#if HAVE_GCRYPT +#include + +#include "macro.h" + +void initialize_libgcrypt(bool secmem); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(gcry_md_hd_t, gcry_md_close, NULL); +#endif + +#if !PREFER_OPENSSL +# if HAVE_GCRYPT +int string_hashsum(const char *s, size_t len, int md_algorithm, char **out); +# endif + +static inline int string_hashsum_sha224(const char *s, size_t len, char **out) { +# if HAVE_GCRYPT + return string_hashsum(s, len, GCRY_MD_SHA224, out); +# else + return -EOPNOTSUPP; +# endif +} + +static inline int string_hashsum_sha256(const char *s, size_t len, char **out) { +# if HAVE_GCRYPT + return string_hashsum(s, len, GCRY_MD_SHA256, out); +# else + return -EOPNOTSUPP; +# endif +} +#endif diff --git a/src/basic/generate-af-list.sh b/src/basic/generate-af-list.sh new file mode 100755 index 0000000..b081485 --- /dev/null +++ b/src/basic/generate-af-list.sh @@ 
-0,0 +1,8 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu +set -o pipefail + +${1:?} -E -dM -include sys/socket.h -include "${2:?}" -include "${3:?}" - + +#define SYSTEMD_GETOPT_SHORT_OPTIONS "hDbsz:" + +#define COMMON_GETOPT_ARGS \ + ARG_LOG_LEVEL = 0x100, \ + ARG_LOG_TARGET, \ + ARG_LOG_COLOR, \ + ARG_LOG_LOCATION, \ + ARG_LOG_TIME + +#define SYSTEMD_GETOPT_ARGS \ + ARG_UNIT, \ + ARG_SYSTEM, \ + ARG_USER, \ + ARG_TEST, \ + ARG_NO_PAGER, \ + ARG_VERSION, \ + ARG_DUMP_CONFIGURATION_ITEMS, \ + ARG_DUMP_BUS_PROPERTIES, \ + ARG_BUS_INTROSPECT, \ + ARG_DUMP_CORE, \ + ARG_CRASH_CHVT, \ + ARG_CRASH_SHELL, \ + ARG_CRASH_REBOOT, \ + ARG_CONFIRM_SPAWN, \ + ARG_SHOW_STATUS, \ + ARG_DESERIALIZE, \ + ARG_SWITCHED_ROOT, \ + ARG_DEFAULT_STD_OUTPUT, \ + ARG_DEFAULT_STD_ERROR, \ + ARG_MACHINE_ID, \ + ARG_SERVICE_WATCHDOGS + +#define SHUTDOWN_GETOPT_ARGS \ + ARG_EXIT_CODE, \ + ARG_TIMEOUT + +#define COMMON_GETOPT_OPTIONS \ + { "log-level", required_argument, NULL, ARG_LOG_LEVEL }, \ + { "log-target", required_argument, NULL, ARG_LOG_TARGET }, \ + { "log-color", optional_argument, NULL, ARG_LOG_COLOR }, \ + { "log-location", optional_argument, NULL, ARG_LOG_LOCATION }, \ + { "log-time", optional_argument, NULL, ARG_LOG_TIME } + +#define SYSTEMD_GETOPT_OPTIONS \ + { "unit", required_argument, NULL, ARG_UNIT }, \ + { "system", no_argument, NULL, ARG_SYSTEM }, \ + { "user", no_argument, NULL, ARG_USER }, \ + { "test", no_argument, NULL, ARG_TEST }, \ + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, \ + { "help", no_argument, NULL, 'h' }, \ + { "version", no_argument, NULL, ARG_VERSION }, \ + { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS }, \ + { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES }, \ + { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT }, \ + { "dump-core", optional_argument, NULL, ARG_DUMP_CORE }, \ + { "crash-chvt", required_argument, NULL, ARG_CRASH_CHVT }, \ + { 
"crash-shell", optional_argument, NULL, ARG_CRASH_SHELL }, \ + { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT }, \ + { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN }, \ + { "show-status", optional_argument, NULL, ARG_SHOW_STATUS }, \ + { "deserialize", required_argument, NULL, ARG_DESERIALIZE }, \ + { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT }, \ + { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, }, \ + { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, }, \ + { "machine-id", required_argument, NULL, ARG_MACHINE_ID }, \ + { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS } + +#define SHUTDOWN_GETOPT_OPTIONS \ + { "exit-code", required_argument, NULL, ARG_EXIT_CODE }, \ + { "timeout", required_argument, NULL, ARG_TIMEOUT } diff --git a/src/basic/glob-util.c b/src/basic/glob-util.c new file mode 100644 index 0000000..802ca8c --- /dev/null +++ b/src/basic/glob-util.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "dirent-util.h" +#include "errno-util.h" +#include "glob-util.h" +#include "macro.h" +#include "path-util.h" +#include "strv.h" + +static void closedir_wrapper(void* v) { + (void) closedir(v); +} + +int safe_glob(const char *path, int flags, glob_t *pglob) { + int k; + + /* We want to set GLOB_ALTDIRFUNC ourselves, don't allow it to be set. 
*/ + assert(!(flags & GLOB_ALTDIRFUNC)); + + if (!pglob->gl_closedir) + pglob->gl_closedir = closedir_wrapper; + if (!pglob->gl_readdir) + pglob->gl_readdir = (struct dirent *(*)(void *)) readdir_no_dot; + if (!pglob->gl_opendir) + pglob->gl_opendir = (void *(*)(const char *)) opendir; + if (!pglob->gl_lstat) + pglob->gl_lstat = lstat; + if (!pglob->gl_stat) + pglob->gl_stat = stat; + + errno = 0; + k = glob(path, flags | GLOB_ALTDIRFUNC, NULL, pglob); + if (k == GLOB_NOMATCH) + return -ENOENT; + if (k == GLOB_NOSPACE) + return -ENOMEM; + if (k != 0) + return errno_or_else(EIO); + if (strv_isempty(pglob->gl_pathv)) + return -ENOENT; + + return 0; +} + +int glob_first(const char *path, char **ret_first) { + _cleanup_globfree_ glob_t g = {}; + int k; + + assert(path); + + k = safe_glob(path, GLOB_NOSORT|GLOB_BRACE, &g); + if (k == -ENOENT) { + if (ret_first) + *ret_first = NULL; + return false; + } + if (k < 0) + return k; + + if (ret_first) { + assert(g.gl_pathv && g.gl_pathv[0]); + + char *first = strdup(g.gl_pathv[0]); + if (!first) + return log_oom_debug(); + *ret_first = first; + } + + return true; +} + +int glob_extend(char ***strv, const char *path, int flags) { + _cleanup_globfree_ glob_t g = {}; + int k; + + k = safe_glob(path, GLOB_NOSORT|GLOB_BRACE|flags, &g); + if (k < 0) + return k; + + return strv_extend_strv(strv, g.gl_pathv, false); +} + +int glob_non_glob_prefix(const char *path, char **ret) { + /* Return the path of the path that has no glob characters. 
*/ + + size_t n = strcspn(path, GLOB_CHARS); + + if (path[n] != '\0') + while (n > 0 && path[n-1] != '/') + n--; + + if (n == 0) + return -ENOENT; + + char *ans = strndup(path, n); + if (!ans) + return -ENOMEM; + *ret = ans; + return 0; +} diff --git a/src/basic/glob-util.h b/src/basic/glob-util.h new file mode 100644 index 0000000..7ca26cc --- /dev/null +++ b/src/basic/glob-util.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" +#include "string-util.h" + +/* Note: this function modifies pglob to set various functions. */ +int safe_glob(const char *path, int flags, glob_t *pglob); + +/* Note: which match is returned depends on the implementation/system and not guaranteed to be stable */ +int glob_first(const char *path, char **ret_first); +#define glob_exists(path) glob_first(path, NULL) +int glob_extend(char ***strv, const char *path, int flags); + +int glob_non_glob_prefix(const char *path, char **ret); + +#define _cleanup_globfree_ _cleanup_(globfree) + +_pure_ static inline bool string_is_glob(const char *p) { + /* Check if a string contains any glob patterns. 
*/ + return !!strpbrk(p, GLOB_CHARS); +} diff --git a/src/basic/glyph-util.c b/src/basic/glyph-util.c new file mode 100644 index 0000000..803bdd9 --- /dev/null +++ b/src/basic/glyph-util.c @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "env-util.h" +#include "glyph-util.h" +#include "locale-util.h" +#include "strv.h" + +bool emoji_enabled(void) { + static int cached_emoji_enabled = -1; + + if (cached_emoji_enabled < 0) { + int val = getenv_bool("SYSTEMD_EMOJI"); + if (val >= 0) + return (cached_emoji_enabled = val); + + const char *term = getenv("TERM"); + if (!term || STR_IN_SET(term, "dumb", "linux")) + return (cached_emoji_enabled = false); + + cached_emoji_enabled = is_locale_utf8(); + } + + return cached_emoji_enabled; +} + +const char *special_glyph_full(SpecialGlyph code, bool force_utf) { + + /* A list of a number of interesting unicode glyphs we can use to decorate our output. It's probably wise to be + * conservative here, and primarily stick to the glyphs defined in the eurlatgr font, so that display still + * works reasonably well on the Linux console. 
For details see: + * + * http://git.altlinux.org/people/legion/packages/kbd.git?p=kbd.git;a=blob;f=data/consolefonts/README.eurlatgr + */ + + static const char* const draw_table[2][_SPECIAL_GLYPH_MAX] = { + /* ASCII fallback */ + [false] = { + [SPECIAL_GLYPH_TREE_VERTICAL] = "| ", + [SPECIAL_GLYPH_TREE_BRANCH] = "|-", + [SPECIAL_GLYPH_TREE_RIGHT] = "`-", + [SPECIAL_GLYPH_TREE_SPACE] = " ", + [SPECIAL_GLYPH_TREE_TOP] = ",-", + [SPECIAL_GLYPH_VERTICAL_DOTTED] = ":", + [SPECIAL_GLYPH_TRIANGULAR_BULLET] = ">", + [SPECIAL_GLYPH_BLACK_CIRCLE] = "*", + [SPECIAL_GLYPH_WHITE_CIRCLE] = "*", + [SPECIAL_GLYPH_MULTIPLICATION_SIGN] = "x", + [SPECIAL_GLYPH_CIRCLE_ARROW] = "*", + [SPECIAL_GLYPH_BULLET] = "*", + [SPECIAL_GLYPH_MU] = "u", + [SPECIAL_GLYPH_CHECK_MARK] = "+", + [SPECIAL_GLYPH_CROSS_MARK] = "-", + [SPECIAL_GLYPH_LIGHT_SHADE] = "-", + [SPECIAL_GLYPH_DARK_SHADE] = "X", + [SPECIAL_GLYPH_FULL_BLOCK] = "#", + [SPECIAL_GLYPH_SIGMA] = "S", + [SPECIAL_GLYPH_ARROW_UP] = "^", + [SPECIAL_GLYPH_ARROW_DOWN] = "v", + [SPECIAL_GLYPH_ARROW_LEFT] = "<-", + [SPECIAL_GLYPH_ARROW_RIGHT] = "->", + [SPECIAL_GLYPH_ELLIPSIS] = "...", + [SPECIAL_GLYPH_EXTERNAL_LINK] = "[LNK]", + [SPECIAL_GLYPH_ECSTATIC_SMILEY] = ":-]", + [SPECIAL_GLYPH_HAPPY_SMILEY] = ":-}", + [SPECIAL_GLYPH_SLIGHTLY_HAPPY_SMILEY] = ":-)", + [SPECIAL_GLYPH_NEUTRAL_SMILEY] = ":-|", + [SPECIAL_GLYPH_SLIGHTLY_UNHAPPY_SMILEY] = ":-(", + [SPECIAL_GLYPH_UNHAPPY_SMILEY] = ":-{", + [SPECIAL_GLYPH_DEPRESSED_SMILEY] = ":-[", + [SPECIAL_GLYPH_LOCK_AND_KEY] = "o-,", + [SPECIAL_GLYPH_TOUCH] = "O=", /* Yeah, not very convincing, can you do it better? 
*/ + [SPECIAL_GLYPH_RECYCLING] = "~", + [SPECIAL_GLYPH_DOWNLOAD] = "\\", + [SPECIAL_GLYPH_SPARKLES] = "*", + [SPECIAL_GLYPH_LOW_BATTERY] = "!", + [SPECIAL_GLYPH_WARNING_SIGN] = "!", + }, + + /* UTF-8 */ + [true] = { + /* The following are multiple glyphs in both ASCII and in UNICODE */ + [SPECIAL_GLYPH_TREE_VERTICAL] = u8"│ ", + [SPECIAL_GLYPH_TREE_BRANCH] = u8"├─", + [SPECIAL_GLYPH_TREE_RIGHT] = u8"└─", + [SPECIAL_GLYPH_TREE_SPACE] = u8" ", + [SPECIAL_GLYPH_TREE_TOP] = u8"┌─", + + /* Single glyphs in both cases */ + [SPECIAL_GLYPH_VERTICAL_DOTTED] = u8"┆", + [SPECIAL_GLYPH_TRIANGULAR_BULLET] = u8"‣", + [SPECIAL_GLYPH_BLACK_CIRCLE] = u8"●", + [SPECIAL_GLYPH_WHITE_CIRCLE] = u8"○", + [SPECIAL_GLYPH_MULTIPLICATION_SIGN] = u8"×", + [SPECIAL_GLYPH_CIRCLE_ARROW] = u8"↻", + [SPECIAL_GLYPH_BULLET] = u8"•", + [SPECIAL_GLYPH_MU] = u8"μ", /* actually called: GREEK SMALL LETTER MU */ + [SPECIAL_GLYPH_CHECK_MARK] = u8"✓", + [SPECIAL_GLYPH_CROSS_MARK] = u8"✗", /* actually called: BALLOT X */ + [SPECIAL_GLYPH_LIGHT_SHADE] = u8"░", + [SPECIAL_GLYPH_DARK_SHADE] = u8"▒", + [SPECIAL_GLYPH_FULL_BLOCK] = u8"█", + [SPECIAL_GLYPH_SIGMA] = u8"Σ", + [SPECIAL_GLYPH_ARROW_UP] = u8"↑", /* actually called: UPWARDS ARROW */ + [SPECIAL_GLYPH_ARROW_DOWN] = u8"↓", /* actually called: DOWNWARDS ARROW */ + + /* Single glyph in Unicode, two in ASCII */ + [SPECIAL_GLYPH_ARROW_LEFT] = u8"←", /* actually called: LEFTWARDS ARROW */ + [SPECIAL_GLYPH_ARROW_RIGHT] = u8"→", /* actually called: RIGHTWARDS ARROW */ + + /* Single glyph in Unicode, three in ASCII */ + [SPECIAL_GLYPH_ELLIPSIS] = u8"…", /* actually called: HORIZONTAL ELLIPSIS */ + + /* Three glyphs in Unicode, five in ASCII */ + [SPECIAL_GLYPH_EXTERNAL_LINK] = u8"[🡕]", /* actually called: NORTH EAST SANS-SERIF ARROW, enclosed in [] */ + + /* These smileys are a single glyph in Unicode, and three in ASCII */ + [SPECIAL_GLYPH_ECSTATIC_SMILEY] = u8"😇", /* actually called: SMILING FACE WITH HALO */ + [SPECIAL_GLYPH_HAPPY_SMILEY] = u8"😀", /* actually 
called: GRINNING FACE */ + [SPECIAL_GLYPH_SLIGHTLY_HAPPY_SMILEY] = u8"🙂", /* actually called: SLIGHTLY SMILING FACE */ + [SPECIAL_GLYPH_NEUTRAL_SMILEY] = u8"😐", /* actually called: NEUTRAL FACE */ + [SPECIAL_GLYPH_SLIGHTLY_UNHAPPY_SMILEY] = u8"🙁", /* actually called: SLIGHTLY FROWNING FACE */ + [SPECIAL_GLYPH_UNHAPPY_SMILEY] = u8"😨", /* actually called: FEARFUL FACE */ + [SPECIAL_GLYPH_DEPRESSED_SMILEY] = u8"🤢", /* actually called: NAUSEATED FACE */ + + /* This emoji is a single character cell glyph in Unicode, and three in ASCII */ + [SPECIAL_GLYPH_LOCK_AND_KEY] = u8"🔐", /* actually called: CLOSED LOCK WITH KEY */ + + /* This emoji is a single character cell glyph in Unicode, and two in ASCII */ + [SPECIAL_GLYPH_TOUCH] = u8"👆", /* actually called: BACKHAND INDEX POINTING UP */ + + /* These four emojis are single character cell glyphs in Unicode and also in ASCII. */ + [SPECIAL_GLYPH_RECYCLING] = u8"♻️", /* actually called: UNIVERSAL RECYCLNG SYMBOL */ + [SPECIAL_GLYPH_DOWNLOAD] = u8"⤵️", /* actually called: RIGHT ARROW CURVING DOWN */ + [SPECIAL_GLYPH_SPARKLES] = u8"✨", + [SPECIAL_GLYPH_LOW_BATTERY] = u8"🪫", + [SPECIAL_GLYPH_WARNING_SIGN] = u8"⚠️", + [SPECIAL_GLYPH_COMPUTER_DISK] = u8"💽", + [SPECIAL_GLYPH_WORLD] = u8"🌍", + }, + }; + + if (code < 0) + return NULL; + + assert(code < _SPECIAL_GLYPH_MAX); + return draw_table[force_utf || (code >= _SPECIAL_GLYPH_FIRST_EMOJI ? 
emoji_enabled() : is_locale_utf8())][code]; +} diff --git a/src/basic/glyph-util.h b/src/basic/glyph-util.h new file mode 100644 index 0000000..a770997 --- /dev/null +++ b/src/basic/glyph-util.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" + +typedef enum SpecialGlyph { + SPECIAL_GLYPH_TREE_VERTICAL, + SPECIAL_GLYPH_TREE_BRANCH, + SPECIAL_GLYPH_TREE_RIGHT, + SPECIAL_GLYPH_TREE_SPACE, + SPECIAL_GLYPH_TREE_TOP, + SPECIAL_GLYPH_VERTICAL_DOTTED, + SPECIAL_GLYPH_TRIANGULAR_BULLET, + SPECIAL_GLYPH_BLACK_CIRCLE, + SPECIAL_GLYPH_WHITE_CIRCLE, + SPECIAL_GLYPH_MULTIPLICATION_SIGN, + SPECIAL_GLYPH_CIRCLE_ARROW, + SPECIAL_GLYPH_BULLET, + SPECIAL_GLYPH_MU, + SPECIAL_GLYPH_CHECK_MARK, + SPECIAL_GLYPH_CROSS_MARK, + SPECIAL_GLYPH_LIGHT_SHADE, + SPECIAL_GLYPH_DARK_SHADE, + SPECIAL_GLYPH_FULL_BLOCK, + SPECIAL_GLYPH_SIGMA, + SPECIAL_GLYPH_ARROW_UP, + SPECIAL_GLYPH_ARROW_DOWN, + SPECIAL_GLYPH_ARROW_LEFT, + SPECIAL_GLYPH_ARROW_RIGHT, + SPECIAL_GLYPH_ELLIPSIS, + SPECIAL_GLYPH_EXTERNAL_LINK, + _SPECIAL_GLYPH_FIRST_EMOJI, + SPECIAL_GLYPH_ECSTATIC_SMILEY = _SPECIAL_GLYPH_FIRST_EMOJI, + SPECIAL_GLYPH_HAPPY_SMILEY, + SPECIAL_GLYPH_SLIGHTLY_HAPPY_SMILEY, + SPECIAL_GLYPH_NEUTRAL_SMILEY, + SPECIAL_GLYPH_SLIGHTLY_UNHAPPY_SMILEY, + SPECIAL_GLYPH_UNHAPPY_SMILEY, + SPECIAL_GLYPH_DEPRESSED_SMILEY, + SPECIAL_GLYPH_LOCK_AND_KEY, + SPECIAL_GLYPH_TOUCH, + SPECIAL_GLYPH_RECYCLING, + SPECIAL_GLYPH_DOWNLOAD, + SPECIAL_GLYPH_SPARKLES, + SPECIAL_GLYPH_LOW_BATTERY, + SPECIAL_GLYPH_WARNING_SIGN, + SPECIAL_GLYPH_COMPUTER_DISK, + SPECIAL_GLYPH_WORLD, + _SPECIAL_GLYPH_MAX, + _SPECIAL_GLYPH_INVALID = -EINVAL, +} SpecialGlyph; + +bool emoji_enabled(void); + +const char *special_glyph_full(SpecialGlyph code, bool force_utf) _const_; + +static inline const char *special_glyph(SpecialGlyph code) { + return special_glyph_full(code, false); +} + +static inline const char *special_glyph_check_mark(bool b) { + return b ? 
special_glyph(SPECIAL_GLYPH_CHECK_MARK) : special_glyph(SPECIAL_GLYPH_CROSS_MARK); +} + +static inline const char *special_glyph_check_mark_space(bool b) { + return b ? special_glyph(SPECIAL_GLYPH_CHECK_MARK) : " "; +} diff --git a/src/basic/gunicode.c b/src/basic/gunicode.c new file mode 100644 index 0000000..36beb95 --- /dev/null +++ b/src/basic/gunicode.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* gunicode.c - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright © 2000, 2005 Red Hat, Inc. + */ + +#include "gunicode.h" + +#define unichar uint32_t + +/** + * g_utf8_prev_char: + * @p: a pointer to a position within a UTF-8 encoded string + * + * Finds the previous UTF-8 character in the string before @p. + * + * @p does not have to be at the beginning of a UTF-8 character. No check + * is made to see if the character found is actually valid other than + * it starts with an appropriate byte. If @p might be the first + * character of the string, you must use g_utf8_find_prev_char() instead. + * + * Return value: a pointer to the found character. + **/ +char * +utf8_prev_char (const char *p) +{ + for (;;) + { + p--; + if ((*p & 0xc0) != 0x80) + return (char *)p; + } +} + +struct Interval +{ + unichar start, end; +}; + +static int +interval_compare (const void *key, const void *elt) +{ + unichar c = (unichar) (long) (key); + struct Interval *interval = (struct Interval *)elt; + + if (c < interval->start) + return -1; + if (c > interval->end) + return +1; + + return 0; +} + +/* + * NOTE: + * + * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are + * generated from the Unicode Character Database's file + * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py + * in this way: + * + * ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt + * + * Last update for Unicode 6.0. 
+ */ + +/** + * g_unichar_iswide: + * @c: a Unicode character + * + * Determines if a character is typically rendered in a double-width + * cell. + * + * Return value: %TRUE if the character is wide + **/ +bool +unichar_iswide (unichar c) +{ + /* See NOTE earlier for how to update this table. */ + static const struct Interval wide[] = { + {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, + {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096}, + {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA}, + {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE}, + {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C}, + {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, + {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, + {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, + {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, + {0x1F300, 0x1F567}, /* Miscellaneous Symbols and Pictographs */ + {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}, + }; + + if (bsearch ((void *)(uintptr_t)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0], + interval_compare)) + return true; + + return false; +} + +const char utf8_skip_data[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +}; diff --git a/src/basic/gunicode.h b/src/basic/gunicode.h new file mode 100644 index 0000000..6b71839 --- /dev/null +++ b/src/basic/gunicode.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ +/* gunicode.h - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright © 2000, 2005 Red Hat, Inc. + */ +#pragma once + +#include +#include +#include + +char *utf8_prev_char (const char *p); + +extern const char utf8_skip_data[256]; + +/** + * g_utf8_next_char: + * @p: Pointer to the start of a valid UTF-8 character + * + * Skips to the next character in a UTF-8 string. The string must be + * valid; this macro is as fast as possible, and has no error-checking. + * You would use this macro to iterate over a string character by + * character. The macro returns the start of the next UTF-8 character. + * Before using this macro, use g_utf8_validate() to validate strings + * that may contain invalid UTF-8. + */ +#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const unsigned char *)(p)]) + +bool unichar_iswide (uint32_t c); diff --git a/src/basic/hash-funcs.c b/src/basic/hash-funcs.c new file mode 100644 index 0000000..5fac467 --- /dev/null +++ b/src/basic/hash-funcs.c @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "hash-funcs.h" +#include "path-util.h" +#include "strv.h" + +void string_hash_func(const char *p, struct siphash *state) { + siphash24_compress(p, strlen(p) + 1, state); +} + +DEFINE_HASH_OPS(string_hash_ops, char, string_hash_func, string_compare_func); +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(string_hash_ops_free, + char, string_hash_func, string_compare_func, free); +DEFINE_HASH_OPS_FULL(string_hash_ops_free_free, + char, string_hash_func, string_compare_func, free, + void, free); +DEFINE_HASH_OPS_FULL(string_hash_ops_free_strv_free, + char, string_hash_func, string_compare_func, free, + char*, strv_free); + +void path_hash_func(const char *q, struct siphash *state) { + bool add_slash = false; + + assert(q); + assert(state); + + /* Calculates a hash for a path in a way that duplicate inner slashes don't make a difference, and also + * whether there's a 
trailing slash or not. This fits well with the semantics of path_compare(), which does + * similar checks and also doesn't care for trailing slashes. Note that relative and absolute paths (i.e. those + * which begin in a slash or not) will hash differently though. */ + + /* if path is absolute, add one "/" to the hash. */ + if (path_is_absolute(q)) + siphash24_compress("/", 1, state); + + for (;;) { + const char *e; + int r; + + r = path_find_first_component(&q, true, &e); + if (r == 0) + return; + + if (add_slash) + siphash24_compress_byte('/', state); + + if (r < 0) { + /* if a component is invalid, then add remaining part as a string. */ + string_hash_func(q, state); + return; + } + + /* Add this component to the hash. */ + siphash24_compress(e, r, state); + + add_slash = true; + } +} + +DEFINE_HASH_OPS(path_hash_ops, char, path_hash_func, path_compare); +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(path_hash_ops_free, + char, path_hash_func, path_compare, free); +DEFINE_HASH_OPS_FULL(path_hash_ops_free_free, + char, path_hash_func, path_compare, free, + void, free); + +void trivial_hash_func(const void *p, struct siphash *state) { + siphash24_compress(&p, sizeof(p), state); +} + +int trivial_compare_func(const void *a, const void *b) { + return CMP(a, b); +} + +const struct hash_ops trivial_hash_ops = { + .hash = trivial_hash_func, + .compare = trivial_compare_func, +}; + +const struct hash_ops trivial_hash_ops_free = { + .hash = trivial_hash_func, + .compare = trivial_compare_func, + .free_key = free, +}; + +const struct hash_ops trivial_hash_ops_free_free = { + .hash = trivial_hash_func, + .compare = trivial_compare_func, + .free_key = free, + .free_value = free, +}; + +void uint64_hash_func(const uint64_t *p, struct siphash *state) { + siphash24_compress(p, sizeof(uint64_t), state); +} + +int uint64_compare_func(const uint64_t *a, const uint64_t *b) { + return CMP(*a, *b); +} + +DEFINE_HASH_OPS(uint64_hash_ops, uint64_t, uint64_hash_func, uint64_compare_func); + +#if 
SIZEOF_DEV_T != 8 +void devt_hash_func(const dev_t *p, struct siphash *state) { + siphash24_compress(p, sizeof(dev_t), state); +} +#endif + +int devt_compare_func(const dev_t *a, const dev_t *b) { + int r; + + r = CMP(major(*a), major(*b)); + if (r != 0) + return r; + + return CMP(minor(*a), minor(*b)); +} + +DEFINE_HASH_OPS(devt_hash_ops, dev_t, devt_hash_func, devt_compare_func); diff --git a/src/basic/hash-funcs.h b/src/basic/hash-funcs.h new file mode 100644 index 0000000..3804e94 --- /dev/null +++ b/src/basic/hash-funcs.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "alloc-util.h" +#include "macro.h" +#include "siphash24.h" + +typedef void (*hash_func_t)(const void *p, struct siphash *state); +typedef int (*compare_func_t)(const void *a, const void *b); + +struct hash_ops { + hash_func_t hash; + compare_func_t compare; + free_func_t free_key; + free_func_t free_value; +}; + +#define _DEFINE_HASH_OPS(uq, name, type, hash_func, compare_func, free_key_func, free_value_func, scope) \ + _unused_ static void (* UNIQ_T(static_hash_wrapper, uq))(const type *, struct siphash *) = hash_func; \ + _unused_ static int (* UNIQ_T(static_compare_wrapper, uq))(const type *, const type *) = compare_func; \ + scope const struct hash_ops name = { \ + .hash = (hash_func_t) hash_func, \ + .compare = (compare_func_t) compare_func, \ + .free_key = free_key_func, \ + .free_value = free_value_func, \ + } + +#define _DEFINE_FREE_FUNC(uq, type, wrapper_name, func) \ + /* Type-safe free function */ \ + static void UNIQ_T(wrapper_name, uq)(void *a) { \ + type *_a = a; \ + func(_a); \ + } + +#define _DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(uq, name, type, hash_func, compare_func, free_func, scope) \ + _DEFINE_FREE_FUNC(uq, type, static_free_wrapper, free_func); \ + _DEFINE_HASH_OPS(uq, name, type, hash_func, compare_func, \ + UNIQ_T(static_free_wrapper, uq), NULL, scope) + +#define _DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(uq, name, type, hash_func, 
compare_func, type_value, free_func, scope) \ + _DEFINE_FREE_FUNC(uq, type_value, static_free_wrapper, free_func); \ + _DEFINE_HASH_OPS(uq, name, type, hash_func, compare_func, \ + NULL, UNIQ_T(static_free_wrapper, uq), scope) + +#define _DEFINE_HASH_OPS_FULL(uq, name, type, hash_func, compare_func, free_key_func, type_value, free_value_func, scope) \ + _DEFINE_FREE_FUNC(uq, type, static_free_key_wrapper, free_key_func); \ + _DEFINE_FREE_FUNC(uq, type_value, static_free_value_wrapper, free_value_func); \ + _DEFINE_HASH_OPS(uq, name, type, hash_func, compare_func, \ + UNIQ_T(static_free_key_wrapper, uq), \ + UNIQ_T(static_free_value_wrapper, uq), scope) + +#define DEFINE_HASH_OPS(name, type, hash_func, compare_func) \ + _DEFINE_HASH_OPS(UNIQ, name, type, hash_func, compare_func, NULL, NULL,) + +#define DEFINE_PRIVATE_HASH_OPS(name, type, hash_func, compare_func) \ + _DEFINE_HASH_OPS(UNIQ, name, type, hash_func, compare_func, NULL, NULL, static) + +#define DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(name, type, hash_func, compare_func, free_func) \ + _DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(UNIQ, name, type, hash_func, compare_func, free_func,) + +#define DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(name, type, hash_func, compare_func, free_func) \ + _DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(UNIQ, name, type, hash_func, compare_func, free_func, static) + +#define DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(name, type, hash_func, compare_func, value_type, free_func) \ + _DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(UNIQ, name, type, hash_func, compare_func, value_type, free_func,) + +#define DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(name, type, hash_func, compare_func, value_type, free_func) \ + _DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(UNIQ, name, type, hash_func, compare_func, value_type, free_func, static) + +#define DEFINE_HASH_OPS_FULL(name, type, hash_func, compare_func, free_key_func, value_type, free_value_func) \ + _DEFINE_HASH_OPS_FULL(UNIQ, name, type, hash_func, compare_func, 
free_key_func, value_type, free_value_func,) + +#define DEFINE_PRIVATE_HASH_OPS_FULL(name, type, hash_func, compare_func, free_key_func, value_type, free_value_func) \ + _DEFINE_HASH_OPS_FULL(UNIQ, name, type, hash_func, compare_func, free_key_func, value_type, free_value_func, static) + +void string_hash_func(const char *p, struct siphash *state); +#define string_compare_func strcmp +extern const struct hash_ops string_hash_ops; +extern const struct hash_ops string_hash_ops_free; +extern const struct hash_ops string_hash_ops_free_free; +extern const struct hash_ops string_hash_ops_free_strv_free; + +void path_hash_func(const char *p, struct siphash *state); +extern const struct hash_ops path_hash_ops; +extern const struct hash_ops path_hash_ops_free; +extern const struct hash_ops path_hash_ops_free_free; + +/* This will compare the passed pointers directly, and will not dereference them. This is hence not useful for strings + * or suchlike. */ +void trivial_hash_func(const void *p, struct siphash *state); +int trivial_compare_func(const void *a, const void *b) _const_; +extern const struct hash_ops trivial_hash_ops; +extern const struct hash_ops trivial_hash_ops_free; +extern const struct hash_ops trivial_hash_ops_free_free; + +/* 32-bit values we can always just embed in the pointer itself, but in order to support 32-bit archs we need store 64-bit + * values indirectly, since they don't fit in a pointer. */ +void uint64_hash_func(const uint64_t *p, struct siphash *state); +int uint64_compare_func(const uint64_t *a, const uint64_t *b) _pure_; +extern const struct hash_ops uint64_hash_ops; + +/* On some archs dev_t is 32-bit, and on others 64-bit. And sometimes it's 64-bit on 32-bit archs, and sometimes 32-bit on + * 64-bit archs. Yuck! 
*/ +#if SIZEOF_DEV_T != 8 +void devt_hash_func(const dev_t *p, struct siphash *state); +#else +#define devt_hash_func uint64_hash_func +#endif + +int devt_compare_func(const dev_t *a, const dev_t *b) _pure_; +extern const struct hash_ops devt_hash_ops; diff --git a/src/basic/hashmap.c b/src/basic/hashmap.c new file mode 100644 index 0000000..894760c --- /dev/null +++ b/src/basic/hashmap.c @@ -0,0 +1,2160 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +# include +#endif + +#include "alloc-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "logarithm.h" +#include "macro.h" +#include "memory-util.h" +#include "mempool.h" +#include "missing_syscall.h" +#include "process-util.h" +#include "random-util.h" +#include "set.h" +#include "siphash24.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" + +#if ENABLE_DEBUG_HASHMAP +#include "list.h" +#endif + +/* + * Implementation of hashmaps. + * Addressing: open + * - uses less RAM compared to closed addressing (chaining), because + * our entries are small (especially in Sets, which tend to contain + * the majority of entries in systemd). + * Collision resolution: Robin Hood + * - tends to equalize displacement of entries from their optimal buckets. + * Probe sequence: linear + * - though theoretically worse than random probing/uniform hashing/double + * hashing, it is good for cache locality. + * + * References: + * Celis, P. 1986. Robin Hood Hashing. + * Ph.D. Dissertation. University of Waterloo, Waterloo, Ont., Canada, Canada. + * https://cs.uwaterloo.ca/research/tr/1986/CS-86-14.pdf + * - The results are derived for random probing. Suggests deletion with + * tombstones and two mean-centered search methods. None of that works + * well for linear probing. + * + * Janson, S. 2005. Individual displacements for linear probing hashing with different insertion policies. + * ACM Trans. 
Algorithms 1, 2 (October 2005), 177-213. + * DOI=10.1145/1103963.1103964 http://doi.acm.org/10.1145/1103963.1103964 + * http://www.math.uu.se/~svante/papers/sj157.pdf + * - Applies to Robin Hood with linear probing. Contains remarks on + * the unsuitability of mean-centered search with linear probing. + * + * Viola, A. 2005. Exact distribution of individual displacements in linear probing hashing. + * ACM Trans. Algorithms 1, 2 (October 2005), 214-242. + * DOI=10.1145/1103963.1103965 http://doi.acm.org/10.1145/1103963.1103965 + * - Similar to Janson. Note that Viola writes about C_{m,n} (number of probes + * in a successful search), and Janson writes about displacement. C = d + 1. + * + * Goossaert, E. 2013. Robin Hood hashing: backward shift deletion. + * http://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion/ + * - Explanation of backward shift deletion with pictures. + * + * Khuong, P. 2013. The Other Robin Hood Hashing. + * http://www.pvk.ca/Blog/2013/11/26/the-other-robin-hood-hashing/ + * - Short summary of random vs. linear probing, and tombstones vs. backward shift. + */ + +/* + * XXX Ideas for improvement: + * For unordered hashmaps, randomize iteration order, similarly to Perl: + * http://blog.booking.com/hardening-perls-hash-function.html + */ + +/* INV_KEEP_FREE = 1 / (1 - max_load_factor) + * e.g. 1 / (1 - 0.8) = 5 ... keep one fifth of the buckets free. */ +#define INV_KEEP_FREE 5U + +/* Fields common to entries of all hashmap/set types */ +struct hashmap_base_entry { + const void *key; +}; + +/* Entry types for specific hashmap/set types + * hashmap_base_entry must be at the beginning of each entry struct. 
*/ + +struct plain_hashmap_entry { + struct hashmap_base_entry b; + void *value; +}; + +struct ordered_hashmap_entry { + struct plain_hashmap_entry p; + unsigned iterate_next, iterate_previous; +}; + +struct set_entry { + struct hashmap_base_entry b; +}; + +/* In several functions it is advantageous to have the hash table extended + * virtually by a couple of additional buckets. We reserve special index values + * for these "swap" buckets. */ +#define _IDX_SWAP_BEGIN (UINT_MAX - 3) +#define IDX_PUT (_IDX_SWAP_BEGIN + 0) +#define IDX_TMP (_IDX_SWAP_BEGIN + 1) +#define _IDX_SWAP_END (_IDX_SWAP_BEGIN + 2) + +#define IDX_FIRST (UINT_MAX - 1) /* special index for freshly initialized iterators */ +#define IDX_NIL UINT_MAX /* special index value meaning "none" or "end" */ + +assert_cc(IDX_FIRST == _IDX_SWAP_END); +assert_cc(IDX_FIRST == _IDX_ITERATOR_FIRST); + +/* Storage space for the "swap" buckets. + * All entry types can fit into an ordered_hashmap_entry. */ +struct swap_entries { + struct ordered_hashmap_entry e[_IDX_SWAP_END - _IDX_SWAP_BEGIN]; +}; + +/* Distance from Initial Bucket */ +typedef uint8_t dib_raw_t; +#define DIB_RAW_OVERFLOW ((dib_raw_t)0xfdU) /* indicates DIB value is greater than representable */ +#define DIB_RAW_REHASH ((dib_raw_t)0xfeU) /* entry yet to be rehashed during in-place resize */ +#define DIB_RAW_FREE ((dib_raw_t)0xffU) /* a free bucket */ +#define DIB_RAW_INIT ((char)DIB_RAW_FREE) /* a byte to memset a DIB store with when initializing */ + +#define DIB_FREE UINT_MAX + +#if ENABLE_DEBUG_HASHMAP +struct hashmap_debug_info { + LIST_FIELDS(struct hashmap_debug_info, debug_list); + unsigned max_entries; /* high watermark of n_entries */ + + /* who allocated this hashmap */ + int line; + const char *file; + const char *func; + + /* fields to detect modification while iterating */ + unsigned put_count; /* counts puts into the hashmap */ + unsigned rem_count; /* counts removals from hashmap */ + unsigned last_rem_idx; /* remembers last removal 
index */ +}; + +/* Tracks all existing hashmaps. Get at it from gdb. See sd_dump_hashmaps.py */ +static LIST_HEAD(struct hashmap_debug_info, hashmap_debug_list); +static pthread_mutex_t hashmap_debug_list_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +enum HashmapType { + HASHMAP_TYPE_PLAIN, + HASHMAP_TYPE_ORDERED, + HASHMAP_TYPE_SET, + _HASHMAP_TYPE_MAX +}; + +struct _packed_ indirect_storage { + void *storage; /* where buckets and DIBs are stored */ + uint8_t hash_key[HASH_KEY_SIZE]; /* hash key; changes during resize */ + + unsigned n_entries; /* number of stored entries */ + unsigned n_buckets; /* number of buckets */ + + unsigned idx_lowest_entry; /* Index below which all buckets are free. + Makes "while (hashmap_steal_first())" loops + O(n) instead of O(n^2) for unordered hashmaps. */ + uint8_t _pad[3]; /* padding for the whole HashmapBase */ + /* The bitfields in HashmapBase complete the alignment of the whole thing. */ +}; + +struct direct_storage { + /* This gives us 39 bytes on 64-bit, or 35 bytes on 32-bit. + * That's room for 4 set_entries + 4 DIB bytes + 3 unused bytes on 64-bit, + * or 7 set_entries + 7 DIB bytes + 0 unused bytes on 32-bit. */ + uint8_t storage[sizeof(struct indirect_storage)]; +}; + +#define DIRECT_BUCKETS(entry_t) \ + (sizeof(struct direct_storage) / (sizeof(entry_t) + sizeof(dib_raw_t))) + +/* We should be able to store at least one entry directly. */ +assert_cc(DIRECT_BUCKETS(struct ordered_hashmap_entry) >= 1); + +/* We have 3 bits for n_direct_entries. */ +assert_cc(DIRECT_BUCKETS(struct set_entry) < (1 << 3)); + +/* Hashmaps with directly stored entries all use this shared hash key. + * It's no big deal if the key is guessed, because there can be only + * a handful of directly stored entries in a hashmap. When a hashmap + * outgrows direct storage, it gets its own key for indirect storage. 
*/ +static uint8_t shared_hash_key[HASH_KEY_SIZE]; + +/* Fields that all hashmap/set types must have */ +struct HashmapBase { + const struct hash_ops *hash_ops; /* hash and compare ops to use */ + + union _packed_ { + struct indirect_storage indirect; /* if has_indirect */ + struct direct_storage direct; /* if !has_indirect */ + }; + + enum HashmapType type:2; /* HASHMAP_TYPE_* */ + bool has_indirect:1; /* whether indirect storage is used */ + unsigned n_direct_entries:3; /* Number of entries in direct storage. + * Only valid if !has_indirect. */ + bool from_pool:1; /* whether was allocated from mempool */ + bool dirty:1; /* whether dirtied since last iterated_cache_get() */ + bool cached:1; /* whether this hashmap is being cached */ + +#if ENABLE_DEBUG_HASHMAP + struct hashmap_debug_info debug; +#endif +}; + +/* Specific hash types + * HashmapBase must be at the beginning of each hashmap struct. */ + +struct Hashmap { + struct HashmapBase b; +}; + +struct OrderedHashmap { + struct HashmapBase b; + unsigned iterate_list_head, iterate_list_tail; +}; + +struct Set { + struct HashmapBase b; +}; + +typedef struct CacheMem { + const void **ptr; + size_t n_populated; + bool active:1; +} CacheMem; + +struct IteratedCache { + HashmapBase *hashmap; + CacheMem keys, values; +}; + +DEFINE_MEMPOOL(hashmap_pool, Hashmap, 8); +DEFINE_MEMPOOL(ordered_hashmap_pool, OrderedHashmap, 8); +/* No need for a separate Set pool */ +assert_cc(sizeof(Hashmap) == sizeof(Set)); + +struct hashmap_type_info { + size_t head_size; + size_t entry_size; + struct mempool *mempool; + unsigned n_direct_buckets; +}; + +static _used_ const struct hashmap_type_info hashmap_type_info[_HASHMAP_TYPE_MAX] = { + [HASHMAP_TYPE_PLAIN] = { + .head_size = sizeof(Hashmap), + .entry_size = sizeof(struct plain_hashmap_entry), + .mempool = &hashmap_pool, + .n_direct_buckets = DIRECT_BUCKETS(struct plain_hashmap_entry), + }, + [HASHMAP_TYPE_ORDERED] = { + .head_size = sizeof(OrderedHashmap), + .entry_size = 
sizeof(struct ordered_hashmap_entry), + .mempool = &ordered_hashmap_pool, + .n_direct_buckets = DIRECT_BUCKETS(struct ordered_hashmap_entry), + }, + [HASHMAP_TYPE_SET] = { + .head_size = sizeof(Set), + .entry_size = sizeof(struct set_entry), + .mempool = &hashmap_pool, + .n_direct_buckets = DIRECT_BUCKETS(struct set_entry), + }, +}; + +void hashmap_trim_pools(void) { + int r; + + /* The pool is only allocated by the main thread, but the memory can be passed to other + * threads. Let's clean up if we are the main thread and no other threads are live. */ + + /* We build our own is_main_thread() here, which doesn't use C11 TLS based caching of the + * result. That's because valgrind apparently doesn't like TLS to be used from a GCC destructor. */ + if (getpid() != gettid()) + return (void) log_debug("Not cleaning up memory pools, not in main thread."); + + r = get_process_threads(0); + if (r < 0) + return (void) log_debug_errno(r, "Failed to determine number of threads, not cleaning up memory pools: %m"); + if (r != 1) + return (void) log_debug("Not cleaning up memory pools, running in multi-threaded process."); + + mempool_trim(&hashmap_pool); + mempool_trim(&ordered_hashmap_pool); +} + +#if HAVE_VALGRIND_VALGRIND_H +_destructor_ static void cleanup_pools(void) { + /* Be nice to valgrind */ + if (RUNNING_ON_VALGRIND) + hashmap_trim_pools(); +} +#endif + +static unsigned n_buckets(HashmapBase *h) { + return h->has_indirect ? h->indirect.n_buckets + : hashmap_type_info[h->type].n_direct_buckets; +} + +static unsigned n_entries(HashmapBase *h) { + return h->has_indirect ? h->indirect.n_entries + : h->n_direct_entries; +} + +static void n_entries_inc(HashmapBase *h) { + if (h->has_indirect) + h->indirect.n_entries++; + else + h->n_direct_entries++; +} + +static void n_entries_dec(HashmapBase *h) { + if (h->has_indirect) + h->indirect.n_entries--; + else + h->n_direct_entries--; +} + +static void* storage_ptr(HashmapBase *h) { + return h->has_indirect ? 
h->indirect.storage + : h->direct.storage; +} + +static uint8_t* hash_key(HashmapBase *h) { + return h->has_indirect ? h->indirect.hash_key + : shared_hash_key; +} + +static unsigned base_bucket_hash(HashmapBase *h, const void *p) { + struct siphash state; + uint64_t hash; + + siphash24_init(&state, hash_key(h)); + + h->hash_ops->hash(p, &state); + + hash = siphash24_finalize(&state); + + return (unsigned) (hash % n_buckets(h)); +} +#define bucket_hash(h, p) base_bucket_hash(HASHMAP_BASE(h), p) + +static void base_set_dirty(HashmapBase *h) { + h->dirty = true; +} +#define hashmap_set_dirty(h) base_set_dirty(HASHMAP_BASE(h)) + +static void get_hash_key(uint8_t hash_key[HASH_KEY_SIZE], bool reuse_is_ok) { + static uint8_t current[HASH_KEY_SIZE]; + static bool current_initialized = false; + + /* Returns a hash function key to use. In order to keep things + * fast we will not generate a new key each time we allocate a + * new hash table. Instead, we'll just reuse the most recently + * generated one, except if we never generated one or when we + * are rehashing an entire hash table because we reached a + * fill level */ + + if (!current_initialized || !reuse_is_ok) { + random_bytes(current, sizeof(current)); + current_initialized = true; + } + + memcpy(hash_key, current, sizeof(current)); +} + +static struct hashmap_base_entry* bucket_at(HashmapBase *h, unsigned idx) { + return CAST_ALIGN_PTR( + struct hashmap_base_entry, + (uint8_t *) storage_ptr(h) + idx * hashmap_type_info[h->type].entry_size); +} + +static struct plain_hashmap_entry* plain_bucket_at(Hashmap *h, unsigned idx) { + return (struct plain_hashmap_entry*) bucket_at(HASHMAP_BASE(h), idx); +} + +static struct ordered_hashmap_entry* ordered_bucket_at(OrderedHashmap *h, unsigned idx) { + return (struct ordered_hashmap_entry*) bucket_at(HASHMAP_BASE(h), idx); +} + +static struct set_entry *set_bucket_at(Set *h, unsigned idx) { + return (struct set_entry*) bucket_at(HASHMAP_BASE(h), idx); +} + +static struct 
ordered_hashmap_entry* bucket_at_swap(struct swap_entries *swap, unsigned idx) { + return &swap->e[idx - _IDX_SWAP_BEGIN]; +} + +/* Returns a pointer to the bucket at index idx. + * Understands real indexes and swap indexes, hence "_virtual". */ +static struct hashmap_base_entry* bucket_at_virtual(HashmapBase *h, struct swap_entries *swap, + unsigned idx) { + if (idx < _IDX_SWAP_BEGIN) + return bucket_at(h, idx); + + if (idx < _IDX_SWAP_END) + return &bucket_at_swap(swap, idx)->p.b; + + assert_not_reached(); +} + +static dib_raw_t* dib_raw_ptr(HashmapBase *h) { + return (dib_raw_t*) + ((uint8_t*) storage_ptr(h) + hashmap_type_info[h->type].entry_size * n_buckets(h)); +} + +static unsigned bucket_distance(HashmapBase *h, unsigned idx, unsigned from) { + return idx >= from ? idx - from + : n_buckets(h) + idx - from; +} + +static unsigned bucket_calculate_dib(HashmapBase *h, unsigned idx, dib_raw_t raw_dib) { + unsigned initial_bucket; + + if (raw_dib == DIB_RAW_FREE) + return DIB_FREE; + + if (_likely_(raw_dib < DIB_RAW_OVERFLOW)) + return raw_dib; + + /* + * Having an overflow DIB value is very unlikely. The hash function + * would have to be bad. For example, in a table of size 2^24 filled + * to load factor 0.9 the maximum observed DIB is only about 60. + * In theory (assuming I used Maxima correctly), for an infinite size + * hash table with load factor 0.8 the probability of a given entry + * having DIB > 40 is 1.9e-8. + * This returns the correct DIB value by recomputing the hash value in + * the unlikely case. XXX Hitting this case could be a hint to rehash. + */ + initial_bucket = bucket_hash(h, bucket_at(h, idx)->key); + return bucket_distance(h, idx, initial_bucket); +} + +static void bucket_set_dib(HashmapBase *h, unsigned idx, unsigned dib) { + dib_raw_ptr(h)[idx] = dib != DIB_FREE ? 
MIN(dib, DIB_RAW_OVERFLOW) : DIB_RAW_FREE; +} + +static unsigned skip_free_buckets(HashmapBase *h, unsigned idx) { + dib_raw_t *dibs; + + dibs = dib_raw_ptr(h); + + for ( ; idx < n_buckets(h); idx++) + if (dibs[idx] != DIB_RAW_FREE) + return idx; + + return IDX_NIL; +} + +static void bucket_mark_free(HashmapBase *h, unsigned idx) { + memzero(bucket_at(h, idx), hashmap_type_info[h->type].entry_size); + bucket_set_dib(h, idx, DIB_FREE); +} + +static void bucket_move_entry(HashmapBase *h, struct swap_entries *swap, + unsigned from, unsigned to) { + struct hashmap_base_entry *e_from, *e_to; + + assert(from != to); + + e_from = bucket_at_virtual(h, swap, from); + e_to = bucket_at_virtual(h, swap, to); + + memcpy(e_to, e_from, hashmap_type_info[h->type].entry_size); + + if (h->type == HASHMAP_TYPE_ORDERED) { + OrderedHashmap *lh = (OrderedHashmap*) h; + struct ordered_hashmap_entry *le, *le_to; + + le_to = (struct ordered_hashmap_entry*) e_to; + + if (le_to->iterate_next != IDX_NIL) { + le = (struct ordered_hashmap_entry*) + bucket_at_virtual(h, swap, le_to->iterate_next); + le->iterate_previous = to; + } + + if (le_to->iterate_previous != IDX_NIL) { + le = (struct ordered_hashmap_entry*) + bucket_at_virtual(h, swap, le_to->iterate_previous); + le->iterate_next = to; + } + + if (lh->iterate_list_head == from) + lh->iterate_list_head = to; + if (lh->iterate_list_tail == from) + lh->iterate_list_tail = to; + } +} + +static unsigned next_idx(HashmapBase *h, unsigned idx) { + return (idx + 1U) % n_buckets(h); +} + +static unsigned prev_idx(HashmapBase *h, unsigned idx) { + return (n_buckets(h) + idx - 1U) % n_buckets(h); +} + +static void* entry_value(HashmapBase *h, struct hashmap_base_entry *e) { + switch (h->type) { + + case HASHMAP_TYPE_PLAIN: + case HASHMAP_TYPE_ORDERED: + return ((struct plain_hashmap_entry*)e)->value; + + case HASHMAP_TYPE_SET: + return (void*) e->key; + + default: + assert_not_reached(); + } +} + +static void base_remove_entry(HashmapBase *h, 
unsigned idx) { + unsigned left, right, prev, dib; + dib_raw_t raw_dib, *dibs; + + dibs = dib_raw_ptr(h); + assert(dibs[idx] != DIB_RAW_FREE); + +#if ENABLE_DEBUG_HASHMAP + h->debug.rem_count++; + h->debug.last_rem_idx = idx; +#endif + + left = idx; + /* Find the stop bucket ("right"). It is either free or has DIB == 0. */ + for (right = next_idx(h, left); ; right = next_idx(h, right)) { + raw_dib = dibs[right]; + if (IN_SET(raw_dib, 0, DIB_RAW_FREE)) + break; + + /* The buckets are not supposed to be all occupied and with DIB > 0. + * That would mean we could make everyone better off by shifting them + * backward. This scenario is impossible. */ + assert(left != right); + } + + if (h->type == HASHMAP_TYPE_ORDERED) { + OrderedHashmap *lh = (OrderedHashmap*) h; + struct ordered_hashmap_entry *le = ordered_bucket_at(lh, idx); + + if (le->iterate_next != IDX_NIL) + ordered_bucket_at(lh, le->iterate_next)->iterate_previous = le->iterate_previous; + else + lh->iterate_list_tail = le->iterate_previous; + + if (le->iterate_previous != IDX_NIL) + ordered_bucket_at(lh, le->iterate_previous)->iterate_next = le->iterate_next; + else + lh->iterate_list_head = le->iterate_next; + } + + /* Now shift all buckets in the interval (left, right) one step backwards */ + for (prev = left, left = next_idx(h, left); left != right; + prev = left, left = next_idx(h, left)) { + dib = bucket_calculate_dib(h, left, dibs[left]); + assert(dib != 0); + bucket_move_entry(h, NULL, left, prev); + bucket_set_dib(h, prev, dib - 1); + } + + bucket_mark_free(h, prev); + n_entries_dec(h); + base_set_dirty(h); +} +#define remove_entry(h, idx) base_remove_entry(HASHMAP_BASE(h), idx) + +static unsigned hashmap_iterate_in_insertion_order(OrderedHashmap *h, Iterator *i) { + struct ordered_hashmap_entry *e; + unsigned idx; + + assert(h); + assert(i); + + if (i->idx == IDX_NIL) + goto at_end; + + if (i->idx == IDX_FIRST && h->iterate_list_head == IDX_NIL) + goto at_end; + + if (i->idx == IDX_FIRST) { + idx = 
h->iterate_list_head; + e = ordered_bucket_at(h, idx); + } else { + idx = i->idx; + e = ordered_bucket_at(h, idx); + /* + * We allow removing the current entry while iterating, but removal may cause + * a backward shift. The next entry may thus move one bucket to the left. + * To detect when it happens, we remember the key pointer of the entry we were + * going to iterate next. If it does not match, there was a backward shift. + */ + if (e->p.b.key != i->next_key) { + idx = prev_idx(HASHMAP_BASE(h), idx); + e = ordered_bucket_at(h, idx); + } + assert(e->p.b.key == i->next_key); + } + +#if ENABLE_DEBUG_HASHMAP + i->prev_idx = idx; +#endif + + if (e->iterate_next != IDX_NIL) { + struct ordered_hashmap_entry *n; + i->idx = e->iterate_next; + n = ordered_bucket_at(h, i->idx); + i->next_key = n->p.b.key; + } else + i->idx = IDX_NIL; + + return idx; + +at_end: + i->idx = IDX_NIL; + return IDX_NIL; +} + +static unsigned hashmap_iterate_in_internal_order(HashmapBase *h, Iterator *i) { + unsigned idx; + + assert(h); + assert(i); + + if (i->idx == IDX_NIL) + goto at_end; + + if (i->idx == IDX_FIRST) { + /* fast forward to the first occupied bucket */ + if (h->has_indirect) { + i->idx = skip_free_buckets(h, h->indirect.idx_lowest_entry); + h->indirect.idx_lowest_entry = i->idx; + } else + i->idx = skip_free_buckets(h, 0); + + if (i->idx == IDX_NIL) + goto at_end; + } else { + struct hashmap_base_entry *e; + + assert(i->idx > 0); + + e = bucket_at(h, i->idx); + /* + * We allow removing the current entry while iterating, but removal may cause + * a backward shift. The next entry may thus move one bucket to the left. + * To detect when it happens, we remember the key pointer of the entry we were + * going to iterate next. If it does not match, there was a backward shift. 
+ */ + if (e->key != i->next_key) + e = bucket_at(h, --i->idx); + + assert(e->key == i->next_key); + } + + idx = i->idx; +#if ENABLE_DEBUG_HASHMAP + i->prev_idx = idx; +#endif + + i->idx = skip_free_buckets(h, i->idx + 1); + if (i->idx != IDX_NIL) + i->next_key = bucket_at(h, i->idx)->key; + else + i->idx = IDX_NIL; + + return idx; + +at_end: + i->idx = IDX_NIL; + return IDX_NIL; +} + +static unsigned hashmap_iterate_entry(HashmapBase *h, Iterator *i) { + if (!h) { + i->idx = IDX_NIL; + return IDX_NIL; + } + +#if ENABLE_DEBUG_HASHMAP + if (i->idx == IDX_FIRST) { + i->put_count = h->debug.put_count; + i->rem_count = h->debug.rem_count; + } else { + /* While iterating, must not add any new entries */ + assert(i->put_count == h->debug.put_count); + /* ... or remove entries other than the current one */ + assert(i->rem_count == h->debug.rem_count || + (i->rem_count == h->debug.rem_count - 1 && + i->prev_idx == h->debug.last_rem_idx)); + /* Reset our removals counter */ + i->rem_count = h->debug.rem_count; + } +#endif + + return h->type == HASHMAP_TYPE_ORDERED ? 
hashmap_iterate_in_insertion_order((OrderedHashmap*) h, i) + : hashmap_iterate_in_internal_order(h, i); +} + +bool _hashmap_iterate(HashmapBase *h, Iterator *i, void **value, const void **key) { + struct hashmap_base_entry *e; + void *data; + unsigned idx; + + idx = hashmap_iterate_entry(h, i); + if (idx == IDX_NIL) { + if (value) + *value = NULL; + if (key) + *key = NULL; + + return false; + } + + e = bucket_at(h, idx); + data = entry_value(h, e); + if (value) + *value = data; + if (key) + *key = e->key; + + return true; +} + +#define HASHMAP_FOREACH_IDX(idx, h, i) \ + for ((i) = ITERATOR_FIRST, (idx) = hashmap_iterate_entry((h), &(i)); \ + (idx != IDX_NIL); \ + (idx) = hashmap_iterate_entry((h), &(i))) + +IteratedCache* _hashmap_iterated_cache_new(HashmapBase *h) { + IteratedCache *cache; + + assert(h); + assert(!h->cached); + + if (h->cached) + return NULL; + + cache = new0(IteratedCache, 1); + if (!cache) + return NULL; + + cache->hashmap = h; + h->cached = true; + + return cache; +} + +static void reset_direct_storage(HashmapBase *h) { + const struct hashmap_type_info *hi = &hashmap_type_info[h->type]; + void *p; + + assert(!h->has_indirect); + + p = mempset(h->direct.storage, 0, hi->entry_size * hi->n_direct_buckets); + memset(p, DIB_RAW_INIT, sizeof(dib_raw_t) * hi->n_direct_buckets); +} + +static void shared_hash_key_initialize(void) { + random_bytes(shared_hash_key, sizeof(shared_hash_key)); +} + +static struct HashmapBase* hashmap_base_new(const struct hash_ops *hash_ops, enum HashmapType type HASHMAP_DEBUG_PARAMS) { + HashmapBase *h; + const struct hashmap_type_info *hi = &hashmap_type_info[type]; + + bool use_pool = mempool_enabled && mempool_enabled(); /* mempool_enabled is a weak symbol */ + + h = use_pool ? 
mempool_alloc0_tile(hi->mempool) : malloc0(hi->head_size); + if (!h) + return NULL; + + h->type = type; + h->from_pool = use_pool; + h->hash_ops = hash_ops ?: &trivial_hash_ops; + + if (type == HASHMAP_TYPE_ORDERED) { + OrderedHashmap *lh = (OrderedHashmap*)h; + lh->iterate_list_head = lh->iterate_list_tail = IDX_NIL; + } + + reset_direct_storage(h); + + static pthread_once_t once = PTHREAD_ONCE_INIT; + assert_se(pthread_once(&once, shared_hash_key_initialize) == 0); + +#if ENABLE_DEBUG_HASHMAP + h->debug.func = func; + h->debug.file = file; + h->debug.line = line; + assert_se(pthread_mutex_lock(&hashmap_debug_list_mutex) == 0); + LIST_PREPEND(debug_list, hashmap_debug_list, &h->debug); + assert_se(pthread_mutex_unlock(&hashmap_debug_list_mutex) == 0); +#endif + + return h; +} + +Hashmap *_hashmap_new(const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS) { + return (Hashmap*) hashmap_base_new(hash_ops, HASHMAP_TYPE_PLAIN HASHMAP_DEBUG_PASS_ARGS); +} + +OrderedHashmap *_ordered_hashmap_new(const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS) { + return (OrderedHashmap*) hashmap_base_new(hash_ops, HASHMAP_TYPE_ORDERED HASHMAP_DEBUG_PASS_ARGS); +} + +Set *_set_new(const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS) { + return (Set*) hashmap_base_new(hash_ops, HASHMAP_TYPE_SET HASHMAP_DEBUG_PASS_ARGS); +} + +static int hashmap_base_ensure_allocated(HashmapBase **h, const struct hash_ops *hash_ops, + enum HashmapType type HASHMAP_DEBUG_PARAMS) { + HashmapBase *q; + + assert(h); + + if (*h) + return 0; + + q = hashmap_base_new(hash_ops, type HASHMAP_DEBUG_PASS_ARGS); + if (!q) + return -ENOMEM; + + *h = q; + return 1; +} + +int _hashmap_ensure_allocated(Hashmap **h, const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS) { + return hashmap_base_ensure_allocated((HashmapBase**)h, hash_ops, HASHMAP_TYPE_PLAIN HASHMAP_DEBUG_PASS_ARGS); +} + +int _ordered_hashmap_ensure_allocated(OrderedHashmap **h, const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS) { + return 
hashmap_base_ensure_allocated((HashmapBase**)h, hash_ops, HASHMAP_TYPE_ORDERED HASHMAP_DEBUG_PASS_ARGS); +} + +int _set_ensure_allocated(Set **s, const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS) { + return hashmap_base_ensure_allocated((HashmapBase**)s, hash_ops, HASHMAP_TYPE_SET HASHMAP_DEBUG_PASS_ARGS); +} + +int _hashmap_ensure_put(Hashmap **h, const struct hash_ops *hash_ops, const void *key, void *value HASHMAP_DEBUG_PARAMS) { + int r; + + r = _hashmap_ensure_allocated(h, hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + return hashmap_put(*h, key, value); +} + +int _ordered_hashmap_ensure_put(OrderedHashmap **h, const struct hash_ops *hash_ops, const void *key, void *value HASHMAP_DEBUG_PARAMS) { + int r; + + r = _ordered_hashmap_ensure_allocated(h, hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + return ordered_hashmap_put(*h, key, value); +} + +static void hashmap_free_no_clear(HashmapBase *h) { + assert(!h->has_indirect); + assert(h->n_direct_entries == 0); + +#if ENABLE_DEBUG_HASHMAP + assert_se(pthread_mutex_lock(&hashmap_debug_list_mutex) == 0); + LIST_REMOVE(debug_list, hashmap_debug_list, &h->debug); + assert_se(pthread_mutex_unlock(&hashmap_debug_list_mutex) == 0); +#endif + + if (h->from_pool) { + /* Ensure that the object didn't get migrated between threads. 
*/ + assert_se(is_main_thread()); + mempool_free_tile(hashmap_type_info[h->type].mempool, h); + } else + free(h); +} + +HashmapBase* _hashmap_free(HashmapBase *h, free_func_t default_free_key, free_func_t default_free_value) { + if (h) { + _hashmap_clear(h, default_free_key, default_free_value); + hashmap_free_no_clear(h); + } + + return NULL; +} + +void _hashmap_clear(HashmapBase *h, free_func_t default_free_key, free_func_t default_free_value) { + free_func_t free_key, free_value; + if (!h) + return; + + free_key = h->hash_ops->free_key ?: default_free_key; + free_value = h->hash_ops->free_value ?: default_free_value; + + if (free_key || free_value) { + + /* If destructor calls are defined, let's destroy things defensively: let's take the item out of the + * hash table, and only then call the destructor functions. If these destructors then try to unregister + * themselves from our hash table a second time, the entry is already gone. */ + + while (_hashmap_size(h) > 0) { + void *k = NULL; + void *v; + + v = _hashmap_first_key_and_value(h, true, &k); + + if (free_key) + free_key(k); + + if (free_value) + free_value(v); + } + } + + if (h->has_indirect) { + free(h->indirect.storage); + h->has_indirect = false; + } + + h->n_direct_entries = 0; + reset_direct_storage(h); + + if (h->type == HASHMAP_TYPE_ORDERED) { + OrderedHashmap *lh = (OrderedHashmap*) h; + lh->iterate_list_head = lh->iterate_list_tail = IDX_NIL; + } + + base_set_dirty(h); +} + +static int resize_buckets(HashmapBase *h, unsigned entries_add); + +/* + * Finds an empty bucket to put an entry into, starting the scan at 'idx'. + * Performs Robin Hood swaps as it goes. The entry to put must be placed + * by the caller into swap slot IDX_PUT. + * If used for in-place resizing, may leave a displaced entry in swap slot + * IDX_PUT. Caller must rehash it next. + * Returns: true if it left a displaced entry to rehash next in IDX_PUT, + * false otherwise. 
+ */ +static bool hashmap_put_robin_hood(HashmapBase *h, unsigned idx, + struct swap_entries *swap) { + dib_raw_t raw_dib, *dibs; + unsigned dib, distance; + +#if ENABLE_DEBUG_HASHMAP + h->debug.put_count++; +#endif + + dibs = dib_raw_ptr(h); + + for (distance = 0; ; distance++) { + raw_dib = dibs[idx]; + if (IN_SET(raw_dib, DIB_RAW_FREE, DIB_RAW_REHASH)) { + if (raw_dib == DIB_RAW_REHASH) + bucket_move_entry(h, swap, idx, IDX_TMP); + + if (h->has_indirect && h->indirect.idx_lowest_entry > idx) + h->indirect.idx_lowest_entry = idx; + + bucket_set_dib(h, idx, distance); + bucket_move_entry(h, swap, IDX_PUT, idx); + if (raw_dib == DIB_RAW_REHASH) { + bucket_move_entry(h, swap, IDX_TMP, IDX_PUT); + return true; + } + + return false; + } + + dib = bucket_calculate_dib(h, idx, raw_dib); + + if (dib < distance) { + /* Found a wealthier entry. Go Robin Hood! */ + bucket_set_dib(h, idx, distance); + + /* swap the entries */ + bucket_move_entry(h, swap, idx, IDX_TMP); + bucket_move_entry(h, swap, IDX_PUT, idx); + bucket_move_entry(h, swap, IDX_TMP, IDX_PUT); + + distance = dib; + } + + idx = next_idx(h, idx); + } +} + +/* + * Puts an entry into a hashmap, boldly - no check whether key already exists. + * The caller must place the entry (only its key and value, not link indexes) + * in swap slot IDX_PUT. + * Caller must ensure: the key does not exist yet in the hashmap. + * that resize is not needed if !may_resize. + * Returns: 1 if entry was put successfully. + * -ENOMEM if may_resize==true and resize failed with -ENOMEM. + * Cannot return -ENOMEM if !may_resize. 
+ */ +static int hashmap_base_put_boldly(HashmapBase *h, unsigned idx, + struct swap_entries *swap, bool may_resize) { + struct ordered_hashmap_entry *new_entry; + int r; + + assert(idx < n_buckets(h)); + + new_entry = bucket_at_swap(swap, IDX_PUT); + + if (may_resize) { + r = resize_buckets(h, 1); + if (r < 0) + return r; + if (r > 0) + idx = bucket_hash(h, new_entry->p.b.key); + } + assert(n_entries(h) < n_buckets(h)); + + if (h->type == HASHMAP_TYPE_ORDERED) { + OrderedHashmap *lh = (OrderedHashmap*) h; + + new_entry->iterate_next = IDX_NIL; + new_entry->iterate_previous = lh->iterate_list_tail; + + if (lh->iterate_list_tail != IDX_NIL) { + struct ordered_hashmap_entry *old_tail; + + old_tail = ordered_bucket_at(lh, lh->iterate_list_tail); + assert(old_tail->iterate_next == IDX_NIL); + old_tail->iterate_next = IDX_PUT; + } + + lh->iterate_list_tail = IDX_PUT; + if (lh->iterate_list_head == IDX_NIL) + lh->iterate_list_head = IDX_PUT; + } + + assert_se(hashmap_put_robin_hood(h, idx, swap) == false); + + n_entries_inc(h); +#if ENABLE_DEBUG_HASHMAP + h->debug.max_entries = MAX(h->debug.max_entries, n_entries(h)); +#endif + + base_set_dirty(h); + + return 1; +} +#define hashmap_put_boldly(h, idx, swap, may_resize) \ + hashmap_base_put_boldly(HASHMAP_BASE(h), idx, swap, may_resize) + +/* + * Returns 0 if resize is not needed. + * 1 if successfully resized. + * -ENOMEM on allocation failure. + */ +static int resize_buckets(HashmapBase *h, unsigned entries_add) { + struct swap_entries swap; + void *new_storage; + dib_raw_t *old_dibs, *new_dibs; + const struct hashmap_type_info *hi; + unsigned idx, optimal_idx; + unsigned old_n_buckets, new_n_buckets, n_rehashed, new_n_entries; + uint8_t new_shift; + bool rehash_next; + + assert(h); + + hi = &hashmap_type_info[h->type]; + new_n_entries = n_entries(h) + entries_add; + + /* overflow? */ + if (_unlikely_(new_n_entries < entries_add)) + return -ENOMEM; + + /* For direct storage we allow 100% load, because it's tiny. 
*/ + if (!h->has_indirect && new_n_entries <= hi->n_direct_buckets) + return 0; + + /* + * Load factor = n/m = 1 - (1/INV_KEEP_FREE). + * From it follows: m = n + n/(INV_KEEP_FREE - 1) + */ + new_n_buckets = new_n_entries + new_n_entries / (INV_KEEP_FREE - 1); + /* overflow? */ + if (_unlikely_(new_n_buckets < new_n_entries)) + return -ENOMEM; + + if (_unlikely_(new_n_buckets > UINT_MAX / (hi->entry_size + sizeof(dib_raw_t)))) + return -ENOMEM; + + old_n_buckets = n_buckets(h); + + if (_likely_(new_n_buckets <= old_n_buckets)) + return 0; + + new_shift = log2u_round_up(MAX( + new_n_buckets * (hi->entry_size + sizeof(dib_raw_t)), + 2 * sizeof(struct direct_storage))); + + /* Realloc storage (buckets and DIB array). */ + new_storage = realloc(h->has_indirect ? h->indirect.storage : NULL, + 1U << new_shift); + if (!new_storage) + return -ENOMEM; + + /* Must upgrade direct to indirect storage. */ + if (!h->has_indirect) { + memcpy(new_storage, h->direct.storage, + old_n_buckets * (hi->entry_size + sizeof(dib_raw_t))); + h->indirect.n_entries = h->n_direct_entries; + h->indirect.idx_lowest_entry = 0; + h->n_direct_entries = 0; + } + + /* Get a new hash key. If we've just upgraded to indirect storage, + * allow reusing a previously generated key. It's still a different key + * from the shared one that we used for direct storage. */ + get_hash_key(h->indirect.hash_key, !h->has_indirect); + + h->has_indirect = true; + h->indirect.storage = new_storage; + h->indirect.n_buckets = (1U << new_shift) / + (hi->entry_size + sizeof(dib_raw_t)); + + old_dibs = (dib_raw_t*)((uint8_t*) new_storage + hi->entry_size * old_n_buckets); + new_dibs = dib_raw_ptr(h); + + /* + * Move the DIB array to the new place, replacing valid DIB values with + * DIB_RAW_REHASH to indicate all of the used buckets need rehashing. + * Note: Overlap is not possible, because we have at least doubled the + * number of buckets and dib_raw_t is smaller than any entry type. 
+ */ + for (idx = 0; idx < old_n_buckets; idx++) { + assert(old_dibs[idx] != DIB_RAW_REHASH); + new_dibs[idx] = old_dibs[idx] == DIB_RAW_FREE ? DIB_RAW_FREE + : DIB_RAW_REHASH; + } + + /* Zero the area of newly added entries (including the old DIB area) */ + memzero(bucket_at(h, old_n_buckets), + (n_buckets(h) - old_n_buckets) * hi->entry_size); + + /* The upper half of the new DIB array needs initialization */ + memset(&new_dibs[old_n_buckets], DIB_RAW_INIT, + (n_buckets(h) - old_n_buckets) * sizeof(dib_raw_t)); + + /* Rehash entries that need it */ + n_rehashed = 0; + for (idx = 0; idx < old_n_buckets; idx++) { + if (new_dibs[idx] != DIB_RAW_REHASH) + continue; + + optimal_idx = bucket_hash(h, bucket_at(h, idx)->key); + + /* + * Not much to do if by luck the entry hashes to its current + * location. Just set its DIB. + */ + if (optimal_idx == idx) { + new_dibs[idx] = 0; + n_rehashed++; + continue; + } + + new_dibs[idx] = DIB_RAW_FREE; + bucket_move_entry(h, &swap, idx, IDX_PUT); + /* bucket_move_entry does not clear the source */ + memzero(bucket_at(h, idx), hi->entry_size); + + do { + /* + * Find the new bucket for the current entry. This may make + * another entry homeless and load it into IDX_PUT. + */ + rehash_next = hashmap_put_robin_hood(h, optimal_idx, &swap); + n_rehashed++; + + /* Did the current entry displace another one? */ + if (rehash_next) + optimal_idx = bucket_hash(h, bucket_at_swap(&swap, IDX_PUT)->p.b.key); + } while (rehash_next); + } + + assert_se(n_rehashed == n_entries(h)); + + return 1; +} + +/* + * Finds an entry with a matching key + * Returns: index of the found entry, or IDX_NIL if not found. 
+ */ +static unsigned base_bucket_scan(HashmapBase *h, unsigned idx, const void *key) { + struct hashmap_base_entry *e; + unsigned dib, distance; + dib_raw_t *dibs = dib_raw_ptr(h); + + assert(idx < n_buckets(h)); + + for (distance = 0; ; distance++) { + if (dibs[idx] == DIB_RAW_FREE) + return IDX_NIL; + + dib = bucket_calculate_dib(h, idx, dibs[idx]); + + if (dib < distance) + return IDX_NIL; + if (dib == distance) { + e = bucket_at(h, idx); + if (h->hash_ops->compare(e->key, key) == 0) + return idx; + } + + idx = next_idx(h, idx); + } +} +#define bucket_scan(h, idx, key) base_bucket_scan(HASHMAP_BASE(h), idx, key) + +int hashmap_put(Hashmap *h, const void *key, void *value) { + struct swap_entries swap; + struct plain_hashmap_entry *e; + unsigned hash, idx; + + assert(h); + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx != IDX_NIL) { + e = plain_bucket_at(h, idx); + if (e->value == value) + return 0; + return -EEXIST; + } + + e = &bucket_at_swap(&swap, IDX_PUT)->p; + e->b.key = key; + e->value = value; + return hashmap_put_boldly(h, hash, &swap, true); +} + +int set_put(Set *s, const void *key) { + struct swap_entries swap; + struct hashmap_base_entry *e; + unsigned hash, idx; + + assert(s); + + hash = bucket_hash(s, key); + idx = bucket_scan(s, hash, key); + if (idx != IDX_NIL) + return 0; + + e = &bucket_at_swap(&swap, IDX_PUT)->p.b; + e->key = key; + return hashmap_put_boldly(s, hash, &swap, true); +} + +int _set_ensure_put(Set **s, const struct hash_ops *hash_ops, const void *key HASHMAP_DEBUG_PARAMS) { + int r; + + r = _set_ensure_allocated(s, hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + return set_put(*s, key); +} + +int _set_ensure_consume(Set **s, const struct hash_ops *hash_ops, void *key HASHMAP_DEBUG_PARAMS) { + int r; + + r = _set_ensure_put(s, hash_ops, key HASHMAP_DEBUG_PASS_ARGS); + if (r <= 0) { + if (hash_ops && hash_ops->free_key) + hash_ops->free_key(key); + else + free(key); + } + + return r; +} + 
+int hashmap_replace(Hashmap *h, const void *key, void *value) { + struct swap_entries swap; + struct plain_hashmap_entry *e; + unsigned hash, idx; + + assert(h); + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx != IDX_NIL) { + e = plain_bucket_at(h, idx); +#if ENABLE_DEBUG_HASHMAP + /* Although the key is equal, the key pointer may have changed, + * and this would break our assumption for iterating. So count + * this operation as incompatible with iteration. */ + if (e->b.key != key) { + h->b.debug.put_count++; + h->b.debug.rem_count++; + h->b.debug.last_rem_idx = idx; + } +#endif + e->b.key = key; + e->value = value; + hashmap_set_dirty(h); + + return 0; + } + + e = &bucket_at_swap(&swap, IDX_PUT)->p; + e->b.key = key; + e->value = value; + return hashmap_put_boldly(h, hash, &swap, true); +} + +int hashmap_update(Hashmap *h, const void *key, void *value) { + struct plain_hashmap_entry *e; + unsigned hash, idx; + + assert(h); + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) + return -ENOENT; + + e = plain_bucket_at(h, idx); + e->value = value; + hashmap_set_dirty(h); + + return 0; +} + +void* _hashmap_get(HashmapBase *h, const void *key) { + struct hashmap_base_entry *e; + unsigned hash, idx; + + if (!h) + return NULL; + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) + return NULL; + + e = bucket_at(h, idx); + return entry_value(h, e); +} + +void* hashmap_get2(Hashmap *h, const void *key, void **key2) { + struct plain_hashmap_entry *e; + unsigned hash, idx; + + if (!h) + return NULL; + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) + return NULL; + + e = plain_bucket_at(h, idx); + if (key2) + *key2 = (void*) e->b.key; + + return e->value; +} + +bool _hashmap_contains(HashmapBase *h, const void *key) { + unsigned hash; + + if (!h) + return false; + + hash = bucket_hash(h, key); + return bucket_scan(h, hash, key) != 
IDX_NIL; +} + +void* _hashmap_remove(HashmapBase *h, const void *key) { + struct hashmap_base_entry *e; + unsigned hash, idx; + void *data; + + if (!h) + return NULL; + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) + return NULL; + + e = bucket_at(h, idx); + data = entry_value(h, e); + remove_entry(h, idx); + + return data; +} + +void* hashmap_remove2(Hashmap *h, const void *key, void **rkey) { + struct plain_hashmap_entry *e; + unsigned hash, idx; + void *data; + + if (!h) { + if (rkey) + *rkey = NULL; + return NULL; + } + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) { + if (rkey) + *rkey = NULL; + return NULL; + } + + e = plain_bucket_at(h, idx); + data = e->value; + if (rkey) + *rkey = (void*) e->b.key; + + remove_entry(h, idx); + + return data; +} + +int hashmap_remove_and_put(Hashmap *h, const void *old_key, const void *new_key, void *value) { + struct swap_entries swap; + struct plain_hashmap_entry *e; + unsigned old_hash, new_hash, idx; + + if (!h) + return -ENOENT; + + old_hash = bucket_hash(h, old_key); + idx = bucket_scan(h, old_hash, old_key); + if (idx == IDX_NIL) + return -ENOENT; + + new_hash = bucket_hash(h, new_key); + if (bucket_scan(h, new_hash, new_key) != IDX_NIL) + return -EEXIST; + + remove_entry(h, idx); + + e = &bucket_at_swap(&swap, IDX_PUT)->p; + e->b.key = new_key; + e->value = value; + assert_se(hashmap_put_boldly(h, new_hash, &swap, false) == 1); + + return 0; +} + +int set_remove_and_put(Set *s, const void *old_key, const void *new_key) { + struct swap_entries swap; + struct hashmap_base_entry *e; + unsigned old_hash, new_hash, idx; + + if (!s) + return -ENOENT; + + old_hash = bucket_hash(s, old_key); + idx = bucket_scan(s, old_hash, old_key); + if (idx == IDX_NIL) + return -ENOENT; + + new_hash = bucket_hash(s, new_key); + if (bucket_scan(s, new_hash, new_key) != IDX_NIL) + return -EEXIST; + + remove_entry(s, idx); + + e = &bucket_at_swap(&swap, 
IDX_PUT)->p.b; + e->key = new_key; + assert_se(hashmap_put_boldly(s, new_hash, &swap, false) == 1); + + return 0; +} + +int hashmap_remove_and_replace(Hashmap *h, const void *old_key, const void *new_key, void *value) { + struct swap_entries swap; + struct plain_hashmap_entry *e; + unsigned old_hash, new_hash, idx_old, idx_new; + + if (!h) + return -ENOENT; + + old_hash = bucket_hash(h, old_key); + idx_old = bucket_scan(h, old_hash, old_key); + if (idx_old == IDX_NIL) + return -ENOENT; + + old_key = bucket_at(HASHMAP_BASE(h), idx_old)->key; + + new_hash = bucket_hash(h, new_key); + idx_new = bucket_scan(h, new_hash, new_key); + if (idx_new != IDX_NIL) + if (idx_old != idx_new) { + remove_entry(h, idx_new); + /* Compensate for a possible backward shift. */ + if (old_key != bucket_at(HASHMAP_BASE(h), idx_old)->key) + idx_old = prev_idx(HASHMAP_BASE(h), idx_old); + assert(old_key == bucket_at(HASHMAP_BASE(h), idx_old)->key); + } + + remove_entry(h, idx_old); + + e = &bucket_at_swap(&swap, IDX_PUT)->p; + e->b.key = new_key; + e->value = value; + assert_se(hashmap_put_boldly(h, new_hash, &swap, false) == 1); + + return 0; +} + +void* _hashmap_remove_value(HashmapBase *h, const void *key, void *value) { + struct hashmap_base_entry *e; + unsigned hash, idx; + + if (!h) + return NULL; + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) + return NULL; + + e = bucket_at(h, idx); + if (entry_value(h, e) != value) + return NULL; + + remove_entry(h, idx); + + return value; +} + +static unsigned find_first_entry(HashmapBase *h) { + Iterator i = ITERATOR_FIRST; + + if (!h || !n_entries(h)) + return IDX_NIL; + + return hashmap_iterate_entry(h, &i); +} + +void* _hashmap_first_key_and_value(HashmapBase *h, bool remove, void **ret_key) { + struct hashmap_base_entry *e; + void *key, *data; + unsigned idx; + + idx = find_first_entry(h); + if (idx == IDX_NIL) { + if (ret_key) + *ret_key = NULL; + return NULL; + } + + e = bucket_at(h, idx); + key = 
(void*) e->key; + data = entry_value(h, e); + + if (remove) + remove_entry(h, idx); + + if (ret_key) + *ret_key = key; + + return data; +} + /* Returns the number of entries in h; a NULL h counts as empty. */ +unsigned _hashmap_size(HashmapBase *h) { + if (!h) + return 0; + + return n_entries(h); +} + /* Returns the number of buckets of h, or 0 if h is NULL. */ +unsigned _hashmap_buckets(HashmapBase *h) { + if (!h) + return 0; + + return n_buckets(h); +} + /* Puts every (key, value) pair of other into h with hashmap_put(). A put that fails with -EEXIST is ignored and the iteration continues; any other negative result aborts the merge and is returned. Returns 0 otherwise. */ +int _hashmap_merge(Hashmap *h, Hashmap *other) { + Iterator i; + unsigned idx; + + assert(h); + + HASHMAP_FOREACH_IDX(idx, HASHMAP_BASE(other), i) { + struct plain_hashmap_entry *pe = plain_bucket_at(other, idx); + int r; + + r = hashmap_put(h, pe->b.key, pe->value); + if (r < 0 && r != -EEXIST) + return r; + } + + return 0; +} + /* Puts every key of other into s with set_put(). The first negative result aborts the merge and is returned; returns 0 otherwise. */ +int set_merge(Set *s, Set *other) { + Iterator i; + unsigned idx; + + assert(s); + + HASHMAP_FOREACH_IDX(idx, HASHMAP_BASE(other), i) { + struct set_entry *se = set_bucket_at(other, idx); + int r; + + r = set_put(s, se->b.key); + if (r < 0) + return r; + } + + return 0; +} + /* Calls resize_buckets() for entries_add additional entries. Returns its negative result on failure, 0 otherwise. */ +int _hashmap_reserve(HashmapBase *h, unsigned entries_add) { + int r; + + assert(h); + + r = resize_buckets(h, entries_add); + if (r < 0) + return r; + + return 0; +} + +/* + * The same as hashmap_merge(), but every new item from other is moved to h. + * Keys already in h are skipped and stay in other. + * Returns: 0 on success. + * -ENOMEM on alloc failure, in which case no move has been done. + */ +int _hashmap_move(HashmapBase *h, HashmapBase *other) { + struct swap_entries swap; + struct hashmap_base_entry *e, *n; + Iterator i; + unsigned idx; + int r; + + assert(h); + + if (!other) + return 0; + + assert(other->type == h->type); + + /* + * This reserves buckets for the worst case, where none of other's + * entries are yet present in h. This is preferable to risking + * an allocation failure in the middle of the moving and having to + * rollback or return a partial result. 
+ */ + r = resize_buckets(h, n_entries(other)); + if (r < 0) + return r; + + HASHMAP_FOREACH_IDX(idx, other, i) { + unsigned h_hash; + + e = bucket_at(other, idx); + h_hash = bucket_hash(h, e->key); + if (bucket_scan(h, h_hash, e->key) != IDX_NIL) + continue; + + n = &bucket_at_swap(&swap, IDX_PUT)->p.b; + n->key = e->key; /* the value is copied below only for the non-set types */ + if (h->type != HASHMAP_TYPE_SET) + ((struct plain_hashmap_entry*) n)->value = + ((struct plain_hashmap_entry*) e)->value; + assert_se(hashmap_put_boldly(h, h_hash, &swap, false) == 1); + + remove_entry(other, idx); + } + + return 0; +} + /* Moves the single entry for key from other to h. Returns -EEXIST if h already contains key, -ENOENT if other is NULL or does not contain key, the negative result of hashmap_put_boldly() if the insertion fails, 0 on success. The entry is removed from other only after the insertion succeeded. */ +int _hashmap_move_one(HashmapBase *h, HashmapBase *other, const void *key) { + struct swap_entries swap; + unsigned h_hash, other_hash, idx; + struct hashmap_base_entry *e, *n; + int r; + + assert(h); + + h_hash = bucket_hash(h, key); + if (bucket_scan(h, h_hash, key) != IDX_NIL) + return -EEXIST; + + if (!other) + return -ENOENT; + + assert(other->type == h->type); + + other_hash = bucket_hash(other, key); + idx = bucket_scan(other, other_hash, key); + if (idx == IDX_NIL) + return -ENOENT; + + e = bucket_at(other, idx); + + n = &bucket_at_swap(&swap, IDX_PUT)->p.b; + n->key = e->key; + if (h->type != HASHMAP_TYPE_SET) + ((struct plain_hashmap_entry*) n)->value = + ((struct plain_hashmap_entry*) e)->value; + r = hashmap_put_boldly(h, h_hash, &swap, true); + if (r < 0) + return r; + + remove_entry(other, idx); + return 0; +} + /* Creates a new hashmap with the hash_ops and type of h and merges all entries of h into it (hashmap_merge() or set_merge(), depending on the type). Returns NULL if the new hashmap cannot be created; if merging fails, the partial copy is handed to _hashmap_free() and that call's result is returned. */ +HashmapBase* _hashmap_copy(HashmapBase *h HASHMAP_DEBUG_PARAMS) { + HashmapBase *copy; + int r; + + assert(h); + + copy = hashmap_base_new(h->hash_ops, h->type HASHMAP_DEBUG_PASS_ARGS); + if (!copy) + return NULL; + + switch (h->type) { + case HASHMAP_TYPE_PLAIN: + case HASHMAP_TYPE_ORDERED: + r = hashmap_merge((Hashmap*)copy, (Hashmap*)h); + break; + case HASHMAP_TYPE_SET: + r = set_merge((Set*)copy, (Set*)h); + break; + default: + assert_not_reached(); + } + + if (r < 0) + return _hashmap_free(copy, NULL, NULL); + + return copy; +} + /* Returns a NULL-terminated array containing the value pointers of all entries of h; the pointed-to values themselves are not duplicated. For a NULL h an array holding only the terminator is returned. Returns NULL if the array cannot be allocated. */ +char** _hashmap_get_strv(HashmapBase *h) { + char 
**sv; + Iterator i; + unsigned idx, n; + + if (!h) + return new0(char*, 1); + + sv = new(char*, n_entries(h)+1); + if (!sv) + return NULL; + + n = 0; + HASHMAP_FOREACH_IDX(idx, h, i) + sv[n++] = entry_value(h, bucket_at(h, idx)); + sv[n] = NULL; + + return sv; +} + /* Returns the value of the entry that follows the entry for key in the iterate_next chain. Returns NULL if h is NULL, key is not found, or the entry has no successor. */ +void* ordered_hashmap_next(OrderedHashmap *h, const void *key) { + struct ordered_hashmap_entry *e; + unsigned hash, idx; + + if (!h) + return NULL; + + hash = bucket_hash(h, key); + idx = bucket_scan(h, hash, key); + if (idx == IDX_NIL) + return NULL; + + e = ordered_bucket_at(h, idx); + if (e->iterate_next == IDX_NIL) + return NULL; + return ordered_bucket_at(h, e->iterate_next)->p.value; +} + /* Puts value into s and takes ownership of it: if set_put() returns 0 or a negative result, value is freed. Returns the result of set_put(). */ +int set_consume(Set *s, void *value) { + int r; + + assert(s); + assert(value); + + r = set_put(s, value); + if (r <= 0) + free(value); + + return r; +} + /* Allocates *h if needed, then stores strdup() copies of k and (if non-NULL) v. Returns a negative result on failure (-ENOMEM if a copy cannot be allocated). If hashmap_put() fails with -EEXIST and the value already stored for k equals v according to streq_ptr(), 0 is returned instead. Otherwise the result of hashmap_put() is returned. */ +int _hashmap_put_strdup_full(Hashmap **h, const struct hash_ops *hash_ops, const char *k, const char *v HASHMAP_DEBUG_PARAMS) { + int r; + + r = _hashmap_ensure_allocated(h, hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + _cleanup_free_ char *kdup = NULL, *vdup = NULL; + + kdup = strdup(k); + if (!kdup) + return -ENOMEM; + + if (v) { + vdup = strdup(v); + if (!vdup) + return -ENOMEM; + } + + r = hashmap_put(*h, kdup, vdup); + if (r < 0) { + if (r == -EEXIST && streq_ptr(v, hashmap_get(*h, kdup))) + return 0; + return r; + } + + /* 0 with non-null vdup would mean vdup is already in the hashmap, which cannot be */ + assert(vdup == NULL || r > 0); + if (r > 0) + kdup = vdup = NULL; /* the hashmap references the copies now: clear the _cleanup_free_ pointers */ + + return r; +} + /* Allocates *s if needed and adds a copy of p: the whole string if n is SIZE_MAX (returning 0 early if p is already contained), otherwise a strndup() copy limited to n bytes. Returns a negative result if the set cannot be allocated, -ENOMEM if the copy cannot be allocated, otherwise the result of set_consume(). */ +int _set_put_strndup_full(Set **s, const struct hash_ops *hash_ops, const char *p, size_t n HASHMAP_DEBUG_PARAMS) { + char *c; + int r; + + assert(s); + assert(p); + + r = _set_ensure_allocated(s, hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + if (n == SIZE_MAX) { + if (set_contains(*s, (char*) p)) + return 0; + + c = strdup(p); + } else + c = strndup(p, n); + if (!c) + return -ENOMEM; + + return set_consume(*s, c); +} + /* Adds a copy of every string in l via _set_put_strndup_full(). Stops at and returns the first negative result; otherwise returns the sum of the individual results. */ +int 
_set_put_strdupv_full(Set **s, const struct hash_ops *hash_ops, char **l HASHMAP_DEBUG_PARAMS) { + int n = 0, r; + + assert(s); + + STRV_FOREACH(i, l) { + r = _set_put_strndup_full(s, hash_ops, *i, SIZE_MAX HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + n += r; + } + + return n; +} + /* Splits v into words with extract_first_word() (using separators and flags) and hands each word to set_consume(). Returns the result of extract_first_word() as soon as it is zero or negative, or the negative result of a failed set_consume(). */ +int set_put_strsplit(Set *s, const char *v, const char *separators, ExtractFlags flags) { + const char *p = ASSERT_PTR(v); + int r; + + assert(s); + + for (;;) { + char *word; + + r = extract_first_word(&p, &word, separators, flags); + if (r <= 0) + return r; + + r = set_consume(s, word); + if (r < 0) + return r; + } +} + +/* Expands the cachemem to hold size elements if needed. Returns true if it was newly (re)activated, false if it was already active, -ENOMEM if a non-empty buffer cannot be allocated. */ +static int cachemem_maintain(CacheMem *mem, size_t size) { + assert(mem); + + if (!GREEDY_REALLOC(mem->ptr, size)) { + if (size > 0) + return -ENOMEM; + } + + if (!mem->active) { + mem->active = true; + return true; + } + + return false; +} + /* Hands out the cached arrays of keys and/or values of the underlying hashmap together with the entry count. An array is (re)filled when its cache was just activated or the hashmap is marked dirty; passing NULL for res_keys or res_values deactivates the respective cache. Returns 0 on success or the negative result of cachemem_maintain(). */ +int iterated_cache_get(IteratedCache *cache, const void ***res_keys, const void ***res_values, unsigned *res_n_entries) { + bool sync_keys = false, sync_values = false; + size_t size; + int r; + + assert(cache); + assert(cache->hashmap); + + size = n_entries(cache->hashmap); + + if (res_keys) { + r = cachemem_maintain(&cache->keys, size); + if (r < 0) + return r; + + sync_keys = r; + } else + cache->keys.active = false; + + if (res_values) { + r = cachemem_maintain(&cache->values, size); + if (r < 0) + return r; + + sync_values = r; + } else + cache->values.active = false; + + if (cache->hashmap->dirty) { + if (cache->keys.active) + sync_keys = true; + if (cache->values.active) + sync_values = true; + + cache->hashmap->dirty = false; + } + + if (sync_keys || sync_values) { + unsigned i, idx; + Iterator iter; + + i = 0; + HASHMAP_FOREACH_IDX(idx, cache->hashmap, iter) { + struct hashmap_base_entry *e; + + e = bucket_at(cache->hashmap, idx); + + if (sync_keys) + cache->keys.ptr[i] = e->key; + if (sync_values) + cache->values.ptr[i] = 
entry_value(cache->hashmap, e); + i++; + } + } + + if (res_keys) + *res_keys = cache->keys.ptr; + if (res_values) + *res_values = cache->values.ptr; + if (res_n_entries) + *res_n_entries = size; + + return 0; +} + /* Frees the cached key and value arrays and the cache object itself; a NULL cache is accepted. Returns the result of mfree(). */ +IteratedCache* iterated_cache_free(IteratedCache *cache) { + if (cache) { + free(cache->keys.ptr); + free(cache->values.ptr); + } + + return mfree(cache); +} + /* Joins the non-empty strings of s into one newly allocated string, separated by separator; with wrap_with_separator (and a non-empty separator) the result also starts and ends with separator. Stores the string in *ret and returns 0; *ret is NULL if s is empty or holds only empty strings. Returns -ENOMEM if the buffer cannot be grown. */ +int set_strjoin(Set *s, const char *separator, bool wrap_with_separator, char **ret) { + _cleanup_free_ char *str = NULL; + size_t separator_len, len = 0; + const char *value; + bool first; + + assert(ret); + + if (set_isempty(s)) { + *ret = NULL; + return 0; + } + + separator_len = strlen_ptr(separator); + + if (separator_len == 0) + wrap_with_separator = false; + + first = !wrap_with_separator; + + SET_FOREACH(value, s) { + size_t l = strlen_ptr(value); + + if (l == 0) + continue; + + if (!GREEDY_REALLOC(str, len + l + (first ? 0 : separator_len) + (wrap_with_separator ? separator_len : 0) + 1)) + return -ENOMEM; + + if (separator_len > 0 && !first) { + memcpy(str + len, separator, separator_len); + len += separator_len; + } + + memcpy(str + len, value, l); + len += l; + first = false; + } + + /* Every entry was an empty string, so no buffer was ever allocated. Report "nothing joined" the same way as for an empty set instead of writing through a NULL pointer below. */ + if (!str) { + *ret = NULL; + return 0; + } + + if (wrap_with_separator) { + memcpy(str + len, separator, separator_len); + len += separator_len; + } + + str[len] = '\0'; + + *ret = TAKE_PTR(str); + return 0; +} + +bool set_equal(Set *a, Set *b) { + void *p; + + /* Checks whether each entry of 'a' is also in 'b' and vice versa, i.e. the two sets contain the same + * entries */ + + if (a == b) + return true; + + if (set_isempty(a) && set_isempty(b)) + return true; + + if (set_size(a) != set_size(b)) /* Cheap check that hopefully catches a lot of inequality cases + * already */ + return false; + + SET_FOREACH(p, a) + if (!set_contains(b, p)) + return false; + + /* If we have the same hashops, then we don't need to check things backwards given we compared the + * size and that all of a is in b. 
*/ + if (a->b.hash_ops == b->b.hash_ops) + return true; + + SET_FOREACH(p, b) + if (!set_contains(a, p)) + return false; + + return true; +} + /* Returns true if needle matches at least one of the fnmatch() patterns stored in the set, false otherwise. */ +static bool set_fnmatch_one(Set *patterns, const char *needle) { + const char *p; + + assert(needle); + + /* Any failure of fnmatch() is treated as equivalent to FNM_NOMATCH, i.e. as non-matching pattern */ + + SET_FOREACH(p, patterns) + if (fnmatch(p, needle, 0) == 0) + return true; + + return false; +} + /* Returns false if needle matches any of exclude_patterns. Otherwise returns true if include_patterns is empty or needle matches one of include_patterns. */ +bool set_fnmatch(Set *include_patterns, Set *exclude_patterns, const char *needle) { + assert(needle); + + if (set_fnmatch_one(exclude_patterns, needle)) + return false; + + if (set_isempty(include_patterns)) + return true; + + return set_fnmatch_one(include_patterns, needle); +} + /* Comparison callback for typesafe_qsort_r(): orders two entries by applying compare to their keys. */ +static int hashmap_entry_compare( + struct hashmap_base_entry * const *a, + struct hashmap_base_entry * const *b, + compare_func_t compare) { + + assert(a && *a); + assert(b && *b); + assert(compare); + + return compare((*a)->key, (*b)->key); +} + /* Stores in *ret a NULL-terminated array with the values of all entries of h, ordered by key using the compare function of h's hash_ops. The number of values (excluding the terminator) is stored in *ret_n if ret_n is non-NULL. If h has no entries, *ret is set to NULL. Returns 0 on success, -ENOMEM if the array cannot be allocated. */ +int _hashmap_dump_sorted(HashmapBase *h, void ***ret, size_t *ret_n) { + _cleanup_free_ struct hashmap_base_entry **entries = NULL; + Iterator iter; + unsigned idx; + size_t n = 0; + + assert(ret); + + if (_hashmap_size(h) == 0) { + *ret = NULL; + if (ret_n) + *ret_n = 0; + return 0; + } + + /* We append one more element than needed so that the resulting array can be used as a strv. We + * don't count this entry in the returned size. */ + entries = new(struct hashmap_base_entry*, _hashmap_size(h) + 1); + if (!entries) + return -ENOMEM; + + HASHMAP_FOREACH_IDX(idx, h, iter) + entries[n++] = bucket_at(h, idx); + + assert(n == _hashmap_size(h)); + entries[n] = NULL; + + typesafe_qsort_r(entries, n, hashmap_entry_compare, h->hash_ops->compare); + + /* Reuse the array. 
*/ + FOREACH_ARRAY(e, entries, n) + *e = entry_value(h, *e); + + *ret = (void**) TAKE_PTR(entries); + if (ret_n) + *ret_n = n; + return 0; +} diff --git a/src/basic/hashmap.h b/src/basic/hashmap.h new file mode 100644 index 0000000..233f1d7 --- /dev/null +++ b/src/basic/hashmap.h @@ -0,0 +1,468 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <limits.h> +#include <stdbool.h> +#include <stddef.h> + +#include "hash-funcs.h" +#include "macro.h" + +/* + * A hash table implementation. As a minor optimization a NULL hashmap object + * will be treated as empty hashmap for all read operations. That way it is not + * necessary to instantiate an object for each Hashmap use. + * + * If ENABLE_DEBUG_HASHMAP is defined (by configuring with -Ddebug-extra=hashmap), + * the implementation will: + * - store extra data for debugging and statistics (see tools/gdb-sd_dump_hashmaps.py) + * - perform extra checks for invalid use of iterators + */ + +#define HASH_KEY_SIZE 16 + +typedef void* (*hashmap_destroy_t)(void *p); + +/* The base type for all hashmap and set types. Many functions in the implementation take (HashmapBase*) + * parameters and are run-time polymorphic, though the API is not meant to be polymorphic (do not call + * underscore-prefixed functions directly). */ +typedef struct HashmapBase HashmapBase; + +/* Specific hashmap/set types */ +typedef struct Hashmap Hashmap; /* Maps keys to values */ +typedef struct OrderedHashmap OrderedHashmap; /* Like Hashmap, but also remembers entry insertion order */ +typedef struct Set Set; /* Stores just keys */ + +typedef struct IteratedCache IteratedCache; /* Caches the iterated order of one of the above */ + +/* Ideally the Iterator would be an opaque struct, but it is instantiated + * by hashmap users, so the definition has to be here. Do not use its fields + * directly. 
*/ +typedef struct { + unsigned idx; /* index of an entry to be iterated next */ + const void *next_key; /* expected value of that entry's key pointer */ +#if ENABLE_DEBUG_HASHMAP + unsigned put_count; /* hashmap's put_count recorded at start of iteration */ + unsigned rem_count; /* hashmap's rem_count in previous iteration */ + unsigned prev_idx; /* idx in previous iteration */ +#endif +} Iterator; + /* _IDX_ITERATOR_FIRST is the idx value of an iterator positioned at the start of an iteration; ITERATOR_FIRST is the matching initializer and ITERATOR_IS_FIRST() the matching test. */ +#define _IDX_ITERATOR_FIRST (UINT_MAX - 1) +#define ITERATOR_FIRST ((Iterator) { .idx = _IDX_ITERATOR_FIRST, .next_key = NULL }) +#define ITERATOR_IS_FIRST(i) ((i).idx == _IDX_ITERATOR_FIRST) + +/* Macros for type checking */ +#define PTR_COMPATIBLE_WITH_HASHMAP_BASE(h) \ + (__builtin_types_compatible_p(typeof(h), HashmapBase*) || \ + __builtin_types_compatible_p(typeof(h), Hashmap*) || \ + __builtin_types_compatible_p(typeof(h), OrderedHashmap*) || \ + __builtin_types_compatible_p(typeof(h), Set*)) + /* No line continuation after the last line of this macro: a trailing backslash would splice whatever follows into the definition if the separating blank line were ever removed. */ +#define PTR_COMPATIBLE_WITH_PLAIN_HASHMAP(h) \ + (__builtin_types_compatible_p(typeof(h), Hashmap*) || \ + __builtin_types_compatible_p(typeof(h), OrderedHashmap*)) + /* Converts h to HashmapBase* if its static type is accepted by PTR_COMPATIBLE_WITH_HASHMAP_BASE(); for any other type the macro yields (void)0. */ +#define HASHMAP_BASE(h) \ + __builtin_choose_expr(PTR_COMPATIBLE_WITH_HASHMAP_BASE(h), \ + (HashmapBase*)(h), \ + (void)0) + /* Converts h to Hashmap* if its static type is accepted by PTR_COMPATIBLE_WITH_PLAIN_HASHMAP(); for any other type the macro yields (void)0. */ +#define PLAIN_HASHMAP(h) \ + __builtin_choose_expr(PTR_COMPATIBLE_WITH_PLAIN_HASHMAP(h), \ + (Hashmap*)(h), \ + (void)0) + /* With ENABLE_DEBUG_HASHMAP the caller's function name, file and line travel as extra arguments: _PARAMS declares them in prototypes, _SRC_ARGS supplies them at the call site, _PASS_ARGS forwards them. Otherwise all three expand to nothing. */ +#if ENABLE_DEBUG_HASHMAP +# define HASHMAP_DEBUG_PARAMS , const char *func, const char *file, int line +# define HASHMAP_DEBUG_SRC_ARGS , __func__, PROJECT_FILE, __LINE__ +# define HASHMAP_DEBUG_PASS_ARGS , func, file, line +#else +# define HASHMAP_DEBUG_PARAMS +# define HASHMAP_DEBUG_SRC_ARGS +# define HASHMAP_DEBUG_PASS_ARGS +#endif + /* Constructors; call them through the hashmap_new()/ordered_hashmap_new() macros, which append the debug source arguments. */ +Hashmap* _hashmap_new(const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS); +OrderedHashmap* _ordered_hashmap_new(const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS); +#define hashmap_new(ops) _hashmap_new(ops HASHMAP_DEBUG_SRC_ARGS) +#define ordered_hashmap_new(ops) _ordered_hashmap_new(ops HASHMAP_DEBUG_SRC_ARGS) + 
+#define hashmap_free_and_replace(a, b) \ + free_and_replace_full(a, b, hashmap_free) + /* Typed wrappers around _hashmap_free(). The name suffix tells which default free functions are passed: none (_free), free() for values (_free_free), free() for keys (_free_free_key), free() for both (_free_free_free). */ +HashmapBase* _hashmap_free(HashmapBase *h, free_func_t default_free_key, free_func_t default_free_value); +static inline Hashmap* hashmap_free(Hashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), NULL, NULL); +} +static inline OrderedHashmap* ordered_hashmap_free(OrderedHashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), NULL, NULL); +} + +static inline Hashmap* hashmap_free_free(Hashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), NULL, free); +} +static inline OrderedHashmap* ordered_hashmap_free_free(OrderedHashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), NULL, free); +} + +static inline Hashmap* hashmap_free_free_key(Hashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), free, NULL); +} +static inline OrderedHashmap* ordered_hashmap_free_free_key(OrderedHashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), free, NULL); +} + +static inline Hashmap* hashmap_free_free_free(Hashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), free, free); +} +static inline OrderedHashmap* ordered_hashmap_free_free_free(OrderedHashmap *h) { + return (void*) _hashmap_free(HASHMAP_BASE(h), free, free); +} + /* IteratedCache accessors: iterated_cache_get() stores the cached key/value arrays and the entry count in the non-NULL output arguments. */ +IteratedCache* iterated_cache_free(IteratedCache *cache); +int iterated_cache_get(IteratedCache *cache, const void ***res_keys, const void ***res_values, unsigned *res_n_entries); + /* Copying; the macros cast the result of _hashmap_copy() back to the caller's hashmap type. */ +HashmapBase* _hashmap_copy(HashmapBase *h HASHMAP_DEBUG_PARAMS); +#define hashmap_copy(h) ((Hashmap*) _hashmap_copy(HASHMAP_BASE(h) HASHMAP_DEBUG_SRC_ARGS)) +#define ordered_hashmap_copy(h) ((OrderedHashmap*) _hashmap_copy(HASHMAP_BASE(h) HASHMAP_DEBUG_SRC_ARGS)) + /* Underscore-prefixed implementations; callers use the *_ensure_allocated()/*_ensure_put() macros, which append the debug source arguments. */ +int _hashmap_ensure_allocated(Hashmap **h, const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS); +int _hashmap_ensure_put(Hashmap **h, const struct hash_ops *hash_ops, const void *key, void *value HASHMAP_DEBUG_PARAMS); +int _ordered_hashmap_ensure_allocated(OrderedHashmap **h, const 
struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS); +/* Convenience macros that append HASHMAP_DEBUG_SRC_ARGS when calling the underscore-prefixed functions. */ +#define hashmap_ensure_allocated(h, ops) _hashmap_ensure_allocated(h, ops HASHMAP_DEBUG_SRC_ARGS) +#define hashmap_ensure_put(s, ops, key, value) _hashmap_ensure_put(s, ops, key, value HASHMAP_DEBUG_SRC_ARGS) +#define ordered_hashmap_ensure_allocated(h, ops) _ordered_hashmap_ensure_allocated(h, ops HASHMAP_DEBUG_SRC_ARGS) + +int _ordered_hashmap_ensure_put(OrderedHashmap **h, const struct hash_ops *hash_ops, const void *key, void *value HASHMAP_DEBUG_PARAMS); +#define ordered_hashmap_ensure_put(s, ops, key, value) _ordered_hashmap_ensure_put(s, ops, key, value HASHMAP_DEBUG_SRC_ARGS) + +IteratedCache* _hashmap_iterated_cache_new(HashmapBase *h); +static inline IteratedCache* hashmap_iterated_cache_new(Hashmap *h) { + return (IteratedCache*) _hashmap_iterated_cache_new(HASHMAP_BASE(h)); +} +static inline IteratedCache* ordered_hashmap_iterated_cache_new(OrderedHashmap *h) { + return (IteratedCache*) _hashmap_iterated_cache_new(HASHMAP_BASE(h)); +} +/* ordered_hashmap_put() reuses hashmap_put() after a PLAIN_HASHMAP() cast. */ +int hashmap_put(Hashmap *h, const void *key, void *value); +static inline int ordered_hashmap_put(OrderedHashmap *h, const void *key, void *value) { + return hashmap_put(PLAIN_HASHMAP(h), key, value); +} +/* hashmap_put_strdup() is hashmap_put_strdup_full() with the address of string_hash_ops_free_free as hash ops. */ +int _hashmap_put_strdup_full(Hashmap **h, const struct hash_ops *hash_ops, const char *k, const char *v HASHMAP_DEBUG_PARAMS); +#define hashmap_put_strdup_full(h, hash_ops, k, v) _hashmap_put_strdup_full(h, hash_ops, k, v HASHMAP_DEBUG_SRC_ARGS) +#define hashmap_put_strdup(h, k, v) hashmap_put_strdup_full(h, &string_hash_ops_free_free, k, v) +/* ordered_hashmap_update() reuses hashmap_update() after a PLAIN_HASHMAP() cast. */ +int hashmap_update(Hashmap *h, const void *key, void *value); +static inline int ordered_hashmap_update(OrderedHashmap *h, const void *key, void *value) { + return hashmap_update(PLAIN_HASHMAP(h), key, value); +} + +int hashmap_replace(Hashmap *h, const void *key, void *value); +static inline int ordered_hashmap_replace(OrderedHashmap *h, const void *key, void *value) { + return hashmap_replace(PLAIN_HASHMAP(h), key, value); 
+} +/* hashmap_get() and ordered_hashmap_get() both forward to _hashmap_get() through HASHMAP_BASE(). */ +void* _hashmap_get(HashmapBase *h, const void *key); +static inline void *hashmap_get(Hashmap *h, const void *key) { + return _hashmap_get(HASHMAP_BASE(h), key); +} +static inline void *ordered_hashmap_get(OrderedHashmap *h, const void *key) { + return _hashmap_get(HASHMAP_BASE(h), key); +} +/* ordered_hashmap_get2() reuses hashmap_get2() after a PLAIN_HASHMAP() cast. */ +void* hashmap_get2(Hashmap *h, const void *key, void **rkey); +static inline void *ordered_hashmap_get2(OrderedHashmap *h, const void *key, void **rkey) { + return hashmap_get2(PLAIN_HASHMAP(h), key, rkey); +} + +bool _hashmap_contains(HashmapBase *h, const void *key); +static inline bool hashmap_contains(Hashmap *h, const void *key) { + return _hashmap_contains(HASHMAP_BASE(h), key); +} +static inline bool ordered_hashmap_contains(OrderedHashmap *h, const void *key) { + return _hashmap_contains(HASHMAP_BASE(h), key); +} + +void* _hashmap_remove(HashmapBase *h, const void *key); +static inline void *hashmap_remove(Hashmap *h, const void *key) { + return _hashmap_remove(HASHMAP_BASE(h), key); +} +static inline void *ordered_hashmap_remove(OrderedHashmap *h, const void *key) { + return _hashmap_remove(HASHMAP_BASE(h), key); +} + +void* hashmap_remove2(Hashmap *h, const void *key, void **rkey); +static inline void *ordered_hashmap_remove2(OrderedHashmap *h, const void *key, void **rkey) { + return hashmap_remove2(PLAIN_HASHMAP(h), key, rkey); +} + +void* _hashmap_remove_value(HashmapBase *h, const void *key, void *value); +static inline void *hashmap_remove_value(Hashmap *h, const void *key, void *value) { + return _hashmap_remove_value(HASHMAP_BASE(h), key, value); +} +/* Routed through hashmap_remove_value() with a PLAIN_HASHMAP() cast rather than calling _hashmap_remove_value() directly. */ +static inline void* ordered_hashmap_remove_value(OrderedHashmap *h, const void *key, void *value) { + return hashmap_remove_value(PLAIN_HASHMAP(h), key, value); +} + +int hashmap_remove_and_put(Hashmap *h, const void *old_key, const void *new_key, void *value); +static inline int ordered_hashmap_remove_and_put(OrderedHashmap *h, const void *old_key, const void *new_key, void *value) { + return 
hashmap_remove_and_put(PLAIN_HASHMAP(h), old_key, new_key, value); +} +/* ordered_hashmap_remove_and_replace() reuses hashmap_remove_and_replace() after a PLAIN_HASHMAP() cast. */ +int hashmap_remove_and_replace(Hashmap *h, const void *old_key, const void *new_key, void *value); +static inline int ordered_hashmap_remove_and_replace(OrderedHashmap *h, const void *old_key, const void *new_key, void *value) { + return hashmap_remove_and_replace(PLAIN_HASHMAP(h), old_key, new_key, value); +} + +/* Since merging data from an OrderedHashmap into a Hashmap or vice-versa + * should just work, allow this by having looser type-checking here. */ +int _hashmap_merge(Hashmap *h, Hashmap *other); +#define hashmap_merge(h, other) _hashmap_merge(PLAIN_HASHMAP(h), PLAIN_HASHMAP(other)) +#define ordered_hashmap_merge(h, other) hashmap_merge(h, other) +/* hashmap_reserve() and ordered_hashmap_reserve() both forward to _hashmap_reserve() through HASHMAP_BASE(). */ +int _hashmap_reserve(HashmapBase *h, unsigned entries_add); +static inline int hashmap_reserve(Hashmap *h, unsigned entries_add) { + return _hashmap_reserve(HASHMAP_BASE(h), entries_add); +} +static inline int ordered_hashmap_reserve(OrderedHashmap *h, unsigned entries_add) { + return _hashmap_reserve(HASHMAP_BASE(h), entries_add); +} + +int _hashmap_move(HashmapBase *h, HashmapBase *other); +/* Unlike hashmap_merge, hashmap_move does not allow mixing the types. 
*/ +static inline int hashmap_move(Hashmap *h, Hashmap *other) { + return _hashmap_move(HASHMAP_BASE(h), HASHMAP_BASE(other)); +} +static inline int ordered_hashmap_move(OrderedHashmap *h, OrderedHashmap *other) { + return _hashmap_move(HASHMAP_BASE(h), HASHMAP_BASE(other)); +} +/* As with hashmap_move(), both arguments of each wrapper have the same map type. */ +int _hashmap_move_one(HashmapBase *h, HashmapBase *other, const void *key); +static inline int hashmap_move_one(Hashmap *h, Hashmap *other, const void *key) { + return _hashmap_move_one(HASHMAP_BASE(h), HASHMAP_BASE(other), key); +} +static inline int ordered_hashmap_move_one(OrderedHashmap *h, OrderedHashmap *other, const void *key) { + return _hashmap_move_one(HASHMAP_BASE(h), HASHMAP_BASE(other), key); +} + +unsigned _hashmap_size(HashmapBase *h) _pure_; +static inline unsigned hashmap_size(Hashmap *h) { + return _hashmap_size(HASHMAP_BASE(h)); +} +static inline unsigned ordered_hashmap_size(OrderedHashmap *h) { + return _hashmap_size(HASHMAP_BASE(h)); +} +/* A map counts as empty when its size wrapper returns 0. */ +static inline bool hashmap_isempty(Hashmap *h) { + return hashmap_size(h) == 0; +} +static inline bool ordered_hashmap_isempty(OrderedHashmap *h) { + return ordered_hashmap_size(h) == 0; +} + +unsigned _hashmap_buckets(HashmapBase *h) _pure_; +static inline unsigned hashmap_buckets(Hashmap *h) { + return _hashmap_buckets(HASHMAP_BASE(h)); +} +static inline unsigned ordered_hashmap_buckets(OrderedHashmap *h) { + return _hashmap_buckets(HASHMAP_BASE(h)); +} +/* Typed front ends for _hashmap_iterate(); the FOREACH macros of this header call them with an Iterator initialised to ITERATOR_FIRST. */ +bool _hashmap_iterate(HashmapBase *h, Iterator *i, void **value, const void **key); +static inline bool hashmap_iterate(Hashmap *h, Iterator *i, void **value, const void **key) { + return _hashmap_iterate(HASHMAP_BASE(h), i, value, key); +} +static inline bool ordered_hashmap_iterate(OrderedHashmap *h, Iterator *i, void **value, const void **key) { + return _hashmap_iterate(HASHMAP_BASE(h), i, value, key); +} +/* The clear wrappers differ only in whether free is passed as default_free_key and/or default_free_value to _hashmap_clear(). */ +void _hashmap_clear(HashmapBase *h, free_func_t default_free_key, free_func_t default_free_value); +static inline void hashmap_clear(Hashmap *h) { + 
_hashmap_clear(HASHMAP_BASE(h), NULL, NULL); +} +static inline void ordered_hashmap_clear(OrderedHashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), NULL, NULL); +} +/* Passes free as default_free_value only. */ +static inline void hashmap_clear_free(Hashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), NULL, free); +} +static inline void ordered_hashmap_clear_free(OrderedHashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), NULL, free); +} +/* Passes free as default_free_key only. */ +static inline void hashmap_clear_free_key(Hashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), free, NULL); +} +static inline void ordered_hashmap_clear_free_key(OrderedHashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), free, NULL); +} +/* Passes free for both default_free_key and default_free_value. */ +static inline void hashmap_clear_free_free(Hashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), free, free); +} +static inline void ordered_hashmap_clear_free_free(OrderedHashmap *h) { + _hashmap_clear(HASHMAP_BASE(h), free, free); +} + +/* + * Note about all *_first*() functions + * + * For plain Hashmaps and Sets the order of entries is undefined. + * The functions find whatever entry is first in the implementation + * internal order. + * + * Only for OrderedHashmaps the order is well defined and finding + * the first entry is O(1). 
+ */ +/* The steal wrappers pass remove=true and the plain wrappers remove=false; ret is handed on as ret_key. */ +void *_hashmap_first_key_and_value(HashmapBase *h, bool remove, void **ret_key); +static inline void *hashmap_steal_first_key_and_value(Hashmap *h, void **ret) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), true, ret); +} +static inline void *ordered_hashmap_steal_first_key_and_value(OrderedHashmap *h, void **ret) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), true, ret); +} +static inline void *hashmap_first_key_and_value(Hashmap *h, void **ret) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), false, ret); +} +static inline void *ordered_hashmap_first_key_and_value(OrderedHashmap *h, void **ret) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), false, ret); +} +/* Same calls as the *_first_key_and_value() wrappers, but with NULL passed as ret_key. */ +static inline void *hashmap_steal_first(Hashmap *h) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), true, NULL); +} +static inline void *ordered_hashmap_steal_first(OrderedHashmap *h) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), true, NULL); +} +static inline void *hashmap_first(Hashmap *h) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), false, NULL); +} +static inline void *ordered_hashmap_first(OrderedHashmap *h) { + return _hashmap_first_key_and_value(HASHMAP_BASE(h), false, NULL); +} +/* Returns whatever _hashmap_first_key_and_value() stored through ret_key, or NULL if it left the local key untouched; the value result is discarded. */ +static inline void *_hashmap_first_key(HashmapBase *h, bool remove) { + void *key = NULL; + + (void) _hashmap_first_key_and_value(HASHMAP_BASE(h), remove, &key); + return key; +} +static inline void *hashmap_steal_first_key(Hashmap *h) { + return _hashmap_first_key(HASHMAP_BASE(h), true); +} +static inline void *ordered_hashmap_steal_first_key(OrderedHashmap *h) { + return _hashmap_first_key(HASHMAP_BASE(h), true); +} +static inline void *hashmap_first_key(Hashmap *h) { + return _hashmap_first_key(HASHMAP_BASE(h), false); +} +static inline void *ordered_hashmap_first_key(OrderedHashmap *h) { + return _hashmap_first_key(HASHMAP_BASE(h), false); +} +/* Calls f on each value returned by hashmap_steal_first() until that returns NULL, then evaluates to the map pointer. */ +#define hashmap_clear_with_destructor(h, f) \ + ({ \ + Hashmap *_h = (h); \ + void *_item; \ + 
while ((_item = hashmap_steal_first(_h))) \ + f(_item); \ + _h; \ + }) +#define hashmap_free_with_destructor(h, f) \ + hashmap_free(hashmap_clear_with_destructor(h, f)) +#define ordered_hashmap_clear_with_destructor(h, f) \ + ({ \ + OrderedHashmap *_h = (h); \ + void *_item; \ + while ((_item = ordered_hashmap_steal_first(_h))) \ + f(_item); \ + _h; \ + }) +#define ordered_hashmap_free_with_destructor(h, f) \ + ordered_hashmap_free(ordered_hashmap_clear_with_destructor(h, f)) + +/* no hashmap_next */ +void* ordered_hashmap_next(OrderedHashmap *h, const void *key); +/* Typed front ends for _hashmap_get_strv(). */ +char** _hashmap_get_strv(HashmapBase *h); +static inline char** hashmap_get_strv(Hashmap *h) { + return _hashmap_get_strv(HASHMAP_BASE(h)); +} +static inline char** ordered_hashmap_get_strv(OrderedHashmap *h) { + return _hashmap_get_strv(HASHMAP_BASE(h)); +} +/* Typed front ends for _hashmap_dump_sorted(), including one that accepts a Set. */ +int _hashmap_dump_sorted(HashmapBase *h, void ***ret, size_t *ret_n); +static inline int hashmap_dump_sorted(Hashmap *h, void ***ret, size_t *ret_n) { + return _hashmap_dump_sorted(HASHMAP_BASE(h), ret, ret_n); +} +static inline int ordered_hashmap_dump_sorted(OrderedHashmap *h, void ***ret, size_t *ret_n) { + return _hashmap_dump_sorted(HASHMAP_BASE(h), ret, ret_n); +} +static inline int set_dump_sorted(Set *h, void ***ret, size_t *ret_n) { + return _hashmap_dump_sorted(HASHMAP_BASE(h), ret, ret_n); +} + +/* + * Hashmaps are iterated in unpredictable order. + * OrderedHashmaps are an exception to this. They are iterated in the order + * the entries were inserted. + * It is safe to remove the current entry. 
+ */ +#define _HASHMAP_BASE_FOREACH(e, h, i) \ + for (Iterator i = ITERATOR_FIRST; _hashmap_iterate((h), &i, (void**)&(e), NULL); ) +#define HASHMAP_BASE_FOREACH(e, h) \ + _HASHMAP_BASE_FOREACH(e, h, UNIQ_T(i, UNIQ)) +/* Each public FOREACH macro passes a UNIQ_T() iterator name to its underscore-prefixed helper, which declares the Iterator inside the for statement. */ +#define _HASHMAP_FOREACH(e, h, i) \ + for (Iterator i = ITERATOR_FIRST; hashmap_iterate((h), &i, (void**)&(e), NULL); ) +#define HASHMAP_FOREACH(e, h) \ + _HASHMAP_FOREACH(e, h, UNIQ_T(i, UNIQ)) + +#define _ORDERED_HASHMAP_FOREACH(e, h, i) \ + for (Iterator i = ITERATOR_FIRST; ordered_hashmap_iterate((h), &i, (void**)&(e), NULL); ) +#define ORDERED_HASHMAP_FOREACH(e, h) \ + _ORDERED_HASHMAP_FOREACH(e, h, UNIQ_T(i, UNIQ)) +/* The _KEY variants additionally pass the address of k as the key output argument of the iterate call. */ +#define _HASHMAP_BASE_FOREACH_KEY(e, k, h, i) \ + for (Iterator i = ITERATOR_FIRST; _hashmap_iterate((h), &i, (void**)&(e), (const void**) &(k)); ) +#define HASHMAP_BASE_FOREACH_KEY(e, k, h) \ + _HASHMAP_BASE_FOREACH_KEY(e, k, h, UNIQ_T(i, UNIQ)) + +#define _HASHMAP_FOREACH_KEY(e, k, h, i) \ + for (Iterator i = ITERATOR_FIRST; hashmap_iterate((h), &i, (void**)&(e), (const void**) &(k)); ) +#define HASHMAP_FOREACH_KEY(e, k, h) \ + _HASHMAP_FOREACH_KEY(e, k, h, UNIQ_T(i, UNIQ)) + +#define _ORDERED_HASHMAP_FOREACH_KEY(e, k, h, i) \ + for (Iterator i = ITERATOR_FIRST; ordered_hashmap_iterate((h), &i, (void**)&(e), (const void**) &(k)); ) +#define ORDERED_HASHMAP_FOREACH_KEY(e, k, h) \ + _ORDERED_HASHMAP_FOREACH_KEY(e, k, h, UNIQ_T(i, UNIQ)) +/* Cleanup helpers for each free variant; presumably DEFINE_TRIVIAL_CLEANUP_FUNC generates the *_freep functions named by the _cleanup_*_ macros - confirm in macro.h. */ +DEFINE_TRIVIAL_CLEANUP_FUNC(Hashmap*, hashmap_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(Hashmap*, hashmap_free_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(Hashmap*, hashmap_free_free_key); +DEFINE_TRIVIAL_CLEANUP_FUNC(Hashmap*, hashmap_free_free_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(OrderedHashmap*, ordered_hashmap_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(OrderedHashmap*, ordered_hashmap_free_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(OrderedHashmap*, ordered_hashmap_free_free_key); +DEFINE_TRIVIAL_CLEANUP_FUNC(OrderedHashmap*, ordered_hashmap_free_free_free); + +#define _cleanup_hashmap_free_ _cleanup_(hashmap_freep) 
+#define _cleanup_hashmap_free_free_ _cleanup_(hashmap_free_freep) +#define _cleanup_hashmap_free_free_free_ _cleanup_(hashmap_free_free_freep) +#define _cleanup_ordered_hashmap_free_ _cleanup_(ordered_hashmap_freep) +#define _cleanup_ordered_hashmap_free_free_ _cleanup_(ordered_hashmap_free_freep) +#define _cleanup_ordered_hashmap_free_free_free_ _cleanup_(ordered_hashmap_free_free_freep) + +DEFINE_TRIVIAL_CLEANUP_FUNC(IteratedCache*, iterated_cache_free); + +#define _cleanup_iterated_cache_free_ _cleanup_(iterated_cache_freep) + +void hashmap_trim_pools(void); diff --git a/src/basic/hexdecoct.c b/src/basic/hexdecoct.c new file mode 100644 index 0000000..ea683eb --- /dev/null +++ b/src/basic/hexdecoct.c @@ -0,0 +1,907 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "hexdecoct.h" +#include "macro.h" +#include "memory-util.h" +#include "string-util.h" +/* Maps the low three bits of x to an octal digit character. */ +char octchar(int x) { + return '0' + (x & 7); +} +/* Returns the value of octal digit c, or -EINVAL if c is not in the range 0-7. */ +int unoctchar(char c) { + + if (c >= '0' && c <= '7') + return c - '0'; + + return -EINVAL; +} +/* Maps x % 10 to a decimal digit character. NOTE(review): presumably callers pass a non-negative x, since a negative remainder would land below the digit range - confirm. */ +char decchar(int x) { + return '0' + (x % 10); +} +/* Returns the value of decimal digit c, or -EINVAL if c is not in the range 0-9. */ +int undecchar(char c) { + + if (c >= '0' && c <= '9') + return c - '0'; + + return -EINVAL; +} +/* Maps the low four bits of x to a lowercase hexadecimal digit. */ +char hexchar(int x) { + static const char table[16] = "0123456789abcdef"; + + return table[x & 15]; +} +/* Returns the value of hexadecimal digit c (either case), or -EINVAL for any other character. */ +int unhexchar(char c) { + + if (c >= '0' && c <= '9') + return c - '0'; + + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -EINVAL; +} +/* Returns a newly allocated NUL-terminated lowercase hex string for the l bytes at p (two digits per byte), or NULL if the allocation fails. */ +char *hexmem(const void *p, size_t l) { + const uint8_t *x; + char *r, *z; + + assert(p || l == 0); + + z = r = new(char, l * 2 + 1); + if (!r) + return NULL; + + for (x = p; x && x < (const uint8_t*) p + l; x++) { + *(z++) = hexchar(*x >> 4); + *(z++) = hexchar(*x & 15); + } + + *z = 0; + return r; +} +/* Consumes one hex digit from *p (at most *l characters), advancing both; returns its value, -EPIPE when the input is exhausted, or the unhexchar() error. */ +static int unhex_next(const char **p, size_t *l) { + int r; + + assert(p); + assert(l); + + /* Find the next non-whitespace 
character, and decode it. We + * greedily skip all preceding and all following whitespace. */ + + for (;;) { + if (*l == 0) + return -EPIPE; + + if (!strchr(WHITESPACE, **p)) + break; + + /* Skip leading whitespace */ + (*p)++, (*l)--; + } + + r = unhexchar(**p); + if (r < 0) + return r; + + for (;;) { + (*p)++, (*l)--; + + if (*l == 0 || !strchr(WHITESPACE, **p)) + break; + + /* Skip following whitespace */ + } + + return r; +} +/* Decodes hex text p of length l (SIZE_MAX means strlen(p)), ignoring whitespace, into a new buffer; ret and ret_len may be NULL. Returns 0, -ENOMEM, or a decoding error; a dangling single digit yields -EPIPE from the second unhex_next() call. */ +int unhexmem_full( + const char *p, + size_t l, + bool secure, + void **ret, + size_t *ret_len) { + + _cleanup_free_ uint8_t *buf = NULL; + size_t buf_size; + const char *x; + uint8_t *z; + + assert(p || l == 0); + + if (l == SIZE_MAX) + l = strlen(p); + + /* Note that the calculation of memory size is an upper boundary, as we ignore whitespace while decoding */ + buf_size = (l + 1) / 2 + 1; + buf = malloc(buf_size); + if (!buf) + return -ENOMEM; + + CLEANUP_ERASE_PTR(secure ? &buf : NULL, buf_size); + + for (x = p, z = buf;;) { + int a, b; + + a = unhex_next(&x, &l); + if (a == -EPIPE) /* End of string */ + break; + if (a < 0) + return a; + + b = unhex_next(&x, &l); + if (b < 0) + return b; + + *(z++) = (uint8_t) a << 4 | (uint8_t) b; + } + + *z = 0; + + if (ret_len) + *ret_len = (size_t) (z - buf); + if (ret) + *ret = TAKE_PTR(buf); + + return 0; +} + +/* https://tools.ietf.org/html/rfc4648#section-6 + * Notice that base32hex differs from base32 in the alphabet it uses. + * The distinction is that the base32hex representation preserves the + * order of the underlying data when compared as bytestrings, this is + * useful when representing NSEC3 hashes, as one can then verify the + * order of hashes directly from their representation. 
*/ +char base32hexchar(int x) { + static const char table[32] = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUV"; + + return table[x & 31]; +} +/* Returns the value of base32hex digit c (0-9, then uppercase A-V), or -EINVAL for any other character. */ +int unbase32hexchar(char c) { + unsigned offset; + + if (c >= '0' && c <= '9') + return c - '0'; + + offset = '9' - '0' + 1; + + if (c >= 'A' && c <= 'V') + return c - 'A' + offset; + + return -EINVAL; +} +/* Encodes the l bytes at p as a newly allocated NUL-terminated base32hex string, with = padding if requested; returns NULL if malloc fails. NOTE(review): unlike hexmem() and base64mem_full(), the main loop has no non-NULL guard on x, so p == NULL with l == 0 evaluates NULL + 0 - confirm whether that matters. */ +char *base32hexmem(const void *p, size_t l, bool padding) { + char *r, *z; + const uint8_t *x; + size_t len; + + assert(p || l == 0); + + if (padding) + /* five input bytes makes eight output bytes, padding is added so we must round up */ + len = 8 * (l + 4) / 5; + else { + /* same, but round down as there is no padding */ + len = 8 * l / 5; + + switch (l % 5) { + case 4: + len += 7; + break; + case 3: + len += 5; + break; + case 2: + len += 4; + break; + case 1: + len += 2; + break; + } + } + + z = r = malloc(len + 1); + if (!r) + return NULL; + + for (x = p; x < (const uint8_t*) p + (l / 5) * 5; x += 5) { + /* x[0] == XXXXXXXX; x[1] == YYYYYYYY; x[2] == ZZZZZZZZ + * x[3] == QQQQQQQQ; x[4] == WWWWWWWW */ + *(z++) = base32hexchar(x[0] >> 3); /* 000XXXXX */ + *(z++) = base32hexchar((x[0] & 7) << 2 | x[1] >> 6); /* 000XXXYY */ + *(z++) = base32hexchar((x[1] & 63) >> 1); /* 000YYYYY */ + *(z++) = base32hexchar((x[1] & 1) << 4 | x[2] >> 4); /* 000YZZZZ */ + *(z++) = base32hexchar((x[2] & 15) << 1 | x[3] >> 7); /* 000ZZZZQ */ + *(z++) = base32hexchar((x[3] & 127) >> 2); /* 000QQQQQ */ + *(z++) = base32hexchar((x[3] & 3) << 3 | x[4] >> 5); /* 000QQWWW */ + *(z++) = base32hexchar((x[4] & 31)); /* 000WWWWW */ + } + + switch (l % 5) { + case 4: + *(z++) = base32hexchar(x[0] >> 3); /* 000XXXXX */ + *(z++) = base32hexchar((x[0] & 7) << 2 | x[1] >> 6); /* 000XXXYY */ + *(z++) = base32hexchar((x[1] & 63) >> 1); /* 000YYYYY */ + *(z++) = base32hexchar((x[1] & 1) << 4 | x[2] >> 4); /* 000YZZZZ */ + *(z++) = base32hexchar((x[2] & 15) << 1 | x[3] >> 7); /* 000ZZZZQ */ + *(z++) = base32hexchar((x[3] & 127) >> 2); /* 000QQQQQ */ + *(z++) = 
base32hexchar((x[3] & 3) << 3); /* 000QQ000 */ + if (padding) + *(z++) = '='; + + break; + + case 3: + *(z++) = base32hexchar(x[0] >> 3); /* 000XXXXX */ + *(z++) = base32hexchar((x[0] & 7) << 2 | x[1] >> 6); /* 000XXXYY */ + *(z++) = base32hexchar((x[1] & 63) >> 1); /* 000YYYYY */ + *(z++) = base32hexchar((x[1] & 1) << 4 | x[2] >> 4); /* 000YZZZZ */ + *(z++) = base32hexchar((x[2] & 15) << 1); /* 000ZZZZ0 */ + if (padding) { + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + } + + break; + + case 2: + *(z++) = base32hexchar(x[0] >> 3); /* 000XXXXX */ + *(z++) = base32hexchar((x[0] & 7) << 2 | x[1] >> 6); /* 000XXXYY */ + *(z++) = base32hexchar((x[1] & 63) >> 1); /* 000YYYYY */ + *(z++) = base32hexchar((x[1] & 1) << 4); /* 000Y0000 */ + if (padding) { + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + } + + break; + + case 1: + *(z++) = base32hexchar(x[0] >> 3); /* 000XXXXX */ + *(z++) = base32hexchar((x[0] & 7) << 2); /* 000XXX00 */ + if (padding) { + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + *(z++) = '='; + } + + break; + } + + *z = 0; + return r; +} +/* Decodes base32hex text p of length l (SIZE_MAX means strlen(p)) into a new buffer stored in *mem, with its size in *_len; returns 0, -EINVAL on malformed input, or -ENOMEM. NOTE(review): pad counts the stripped = characters but is not checked afterwards in the visible code - confirm this is intended. */ +int unbase32hexmem(const char *p, size_t l, bool padding, void **mem, size_t *_len) { + _cleanup_free_ uint8_t *r = NULL; + int a, b, c, d, e, f, g, h; + uint8_t *z; + const char *x; + size_t len; + unsigned pad = 0; + + assert(p || l == 0); + assert(mem); + assert(_len); + + if (l == SIZE_MAX) + l = strlen(p); + + /* padding ensures any base32hex input has input divisible by 8 */ + if (padding && l % 8 != 0) + return -EINVAL; + + if (padding) { + /* strip the padding */ + while (l > 0 && p[l - 1] == '=' && pad < 7) { + pad++; + l--; + } + } + + /* a group of eight input bytes needs five output bytes, in case of + * padding we need to add some extra bytes */ + len = (l / 8) * 5; + + switch (l % 8) { + case 7: + len += 4; + break; + case 5: + len += 3; + break; + case 4: + len += 2; + break; + case 2: + len += 1; + break; + case 0: + break; + default: + return 
-EINVAL; + } + + z = r = malloc(len + 1); + if (!r) + return -ENOMEM; +/* Every full group of eight input characters decodes to five output bytes. */ + for (x = p; x < p + (l / 8) * 8; x += 8) { + /* a == 000XXXXX; b == 000YYYYY; c == 000ZZZZZ; d == 000WWWWW + * e == 000SSSSS; f == 000QQQQQ; g == 000VVVVV; h == 000RRRRR */ + a = unbase32hexchar(x[0]); + if (a < 0) + return -EINVAL; + + b = unbase32hexchar(x[1]); + if (b < 0) + return -EINVAL; + + c = unbase32hexchar(x[2]); + if (c < 0) + return -EINVAL; + + d = unbase32hexchar(x[3]); + if (d < 0) + return -EINVAL; + + e = unbase32hexchar(x[4]); + if (e < 0) + return -EINVAL; + + f = unbase32hexchar(x[5]); + if (f < 0) + return -EINVAL; + + g = unbase32hexchar(x[6]); + if (g < 0) + return -EINVAL; + + h = unbase32hexchar(x[7]); + if (h < 0) + return -EINVAL; + + *(z++) = (uint8_t) a << 3 | (uint8_t) b >> 2; /* XXXXXYYY */ + *(z++) = (uint8_t) b << 6 | (uint8_t) c << 1 | (uint8_t) d >> 4; /* YYZZZZZW */ + *(z++) = (uint8_t) d << 4 | (uint8_t) e >> 1; /* WWWWSSSS */ + *(z++) = (uint8_t) e << 7 | (uint8_t) f << 2 | (uint8_t) g >> 3; /* SQQQQQVV */ + *(z++) = (uint8_t) g << 5 | (uint8_t) h; /* VVVRRRRR */ + } +/* Trailing partial group: the low bits of its last character that do not fit into the output must be zero, otherwise -EINVAL is returned. */ + switch (l % 8) { + case 7: + a = unbase32hexchar(x[0]); + if (a < 0) + return -EINVAL; + + b = unbase32hexchar(x[1]); + if (b < 0) + return -EINVAL; + + c = unbase32hexchar(x[2]); + if (c < 0) + return -EINVAL; + + d = unbase32hexchar(x[3]); + if (d < 0) + return -EINVAL; + + e = unbase32hexchar(x[4]); + if (e < 0) + return -EINVAL; + + f = unbase32hexchar(x[5]); + if (f < 0) + return -EINVAL; + + g = unbase32hexchar(x[6]); + if (g < 0) + return -EINVAL; + + /* g == 000VV000 */ + if (g & 7) + return -EINVAL; + + *(z++) = (uint8_t) a << 3 | (uint8_t) b >> 2; /* XXXXXYYY */ + *(z++) = (uint8_t) b << 6 | (uint8_t) c << 1 | (uint8_t) d >> 4; /* YYZZZZZW */ + *(z++) = (uint8_t) d << 4 | (uint8_t) e >> 1; /* WWWWSSSS */ + *(z++) = (uint8_t) e << 7 | (uint8_t) f << 2 | (uint8_t) g >> 3; /* SQQQQQVV */ + + break; + case 5: + a = unbase32hexchar(x[0]); + if (a < 0) + return -EINVAL; + + b = 
unbase32hexchar(x[1]); + if (b < 0) + return -EINVAL; + + c = unbase32hexchar(x[2]); + if (c < 0) + return -EINVAL; + + d = unbase32hexchar(x[3]); + if (d < 0) + return -EINVAL; + + e = unbase32hexchar(x[4]); + if (e < 0) + return -EINVAL; + + /* e == 000SSSS0 */ + if (e & 1) + return -EINVAL; + + *(z++) = (uint8_t) a << 3 | (uint8_t) b >> 2; /* XXXXXYYY */ + *(z++) = (uint8_t) b << 6 | (uint8_t) c << 1 | (uint8_t) d >> 4; /* YYZZZZZW */ + *(z++) = (uint8_t) d << 4 | (uint8_t) e >> 1; /* WWWWSSSS */ + + break; + case 4: + a = unbase32hexchar(x[0]); + if (a < 0) + return -EINVAL; + + b = unbase32hexchar(x[1]); + if (b < 0) + return -EINVAL; + + c = unbase32hexchar(x[2]); + if (c < 0) + return -EINVAL; + + d = unbase32hexchar(x[3]); + if (d < 0) + return -EINVAL; + + /* d == 000W0000 */ + if (d & 15) + return -EINVAL; + + *(z++) = (uint8_t) a << 3 | (uint8_t) b >> 2; /* XXXXXYYY */ + *(z++) = (uint8_t) b << 6 | (uint8_t) c << 1 | (uint8_t) d >> 4; /* YYZZZZZW */ + + break; + case 2: + a = unbase32hexchar(x[0]); + if (a < 0) + return -EINVAL; + + b = unbase32hexchar(x[1]); + if (b < 0) + return -EINVAL; + + /* b == 000YYY00 */ + if (b & 3) + return -EINVAL; + + *(z++) = (uint8_t) a << 3 | (uint8_t) b >> 2; /* XXXXXYYY */ + + break; + case 0: + break; + default: + return -EINVAL; + } +/* The extra byte allocated beyond len holds this zero terminator; it is not counted in the size stored in *_len. */ + *z = 0; + + *mem = TAKE_PTR(r); + *_len = len; + + return 0; +} + +/* https://tools.ietf.org/html/rfc4648#section-4 */ +char base64char(int x) { + static const char table[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + return table[x & 63]; +} + +/* This is almost base64char(), but not entirely, as it uses the "url and filename safe" alphabet, + * since we don't want "/" appear in interface names (since interfaces appear in sysfs as filenames). + * See section #5 of RFC 4648. 
*/ +char urlsafe_base64char(int x) { + static const char table[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789-_"; + return table[x & 63]; +} +/* Returns the 6-bit value of base64 character c, accepting both the standard and the URL-safe alphabet, or -EINVAL for any other character. */ +int unbase64char(char c) { + unsigned offset; + + if (c >= 'A' && c <= 'Z') + return c - 'A'; + + offset = 'Z' - 'A' + 1; + + if (c >= 'a' && c <= 'z') + return c - 'a' + offset; + + offset += 'z' - 'a' + 1; + + if (c >= '0' && c <= '9') + return c - '0' + offset; + + offset += '9' - '0' + 1; + + if (IN_SET(c, '+', '-')) /* Support both the regular and the URL safe character set (see above) */ + return offset; + + offset++; + + if (IN_SET(c, '/', '_')) /* ditto */ + return offset; + + return -EINVAL; +} +/* Writes a newline at *x and advances it when the offset from start modulo (line_break + 1) equals line_break, i.e. after every line_break output characters; does nothing if line_break is SIZE_MAX. */ +static void maybe_line_break(char **x, char *start, size_t line_break) { + size_t n; + + assert(x); + assert(*x); + assert(start); + assert(*x >= start); + + if (line_break == SIZE_MAX) + return; + + n = *x - start; + + if (n % (line_break + 1) == line_break) + *((*x)++) = '\n'; +} +/* Encodes the l bytes at p as padded base64 into a newly allocated NUL-terminated string stored in *ret, inserting newlines according to line_break; returns the number of characters written (excluding the terminator) or -ENOMEM. */ +ssize_t base64mem_full( + const void *p, + size_t l, + size_t line_break, + char **ret) { + + const uint8_t *x; + char *b, *z; + size_t m; + + assert(p || l == 0); + assert(line_break > 0); + assert(ret); + + /* three input bytes makes four output bytes, padding is added so we must round up */ + m = 4 * (l + 2) / 3 + 1; + if (line_break != SIZE_MAX) + m += m / line_break; + + z = b = malloc(m); + if (!b) + return -ENOMEM; + + for (x = p; x && x < (const uint8_t*) p + (l / 3) * 3; x += 3) { + /* x[0] == XXXXXXXX; x[1] == YYYYYYYY; x[2] == ZZZZZZZZ */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char(x[0] >> 2); /* 00XXXXXX */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char((x[0] & 3) << 4 | x[1] >> 4); /* 00XXYYYY */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char((x[1] & 15) << 2 | x[2] >> 6); /* 00YYYYZZ */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char(x[2] & 63); /* 00ZZZZZZ */ + } + + switch (l % 3) { + case 2: + maybe_line_break(&z, b, line_break); + 
*(z++) = base64char(x[0] >> 2); /* 00XXXXXX */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char((x[0] & 3) << 4 | x[1] >> 4); /* 00XXYYYY */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char((x[1] & 15) << 2); /* 00YYYY00 */ + maybe_line_break(&z, b, line_break); + *(z++) = '='; + break; + + case 1: + maybe_line_break(&z, b, line_break); + *(z++) = base64char(x[0] >> 2); /* 00XXXXXX */ + maybe_line_break(&z, b, line_break); + *(z++) = base64char((x[0] & 3) << 4); /* 00XX0000 */ + maybe_line_break(&z, b, line_break); + *(z++) = '='; + maybe_line_break(&z, b, line_break); + *(z++) = '='; + break; + } + + *z = 0; + *ret = b; + + assert(z >= b); /* Let static analyzers know that the answer is non-negative. */ + return z - b; +} +/* Base64-encodes p and appends it to *prefix (current length plen, reallocated here) in chunks of at most width characters. The first chunk is preceded by sep unless plen is 0, later chunks by a newline; every newline separator is followed by indent spaces. Returns the new length, plen unchanged if nothing was encoded, or a negative error. */ +static ssize_t base64_append_width( + char **prefix, + size_t plen, + char sep, + size_t indent, + const void *p, + size_t l, + size_t width) { + + _cleanup_free_ char *x = NULL; + char *t, *s; + size_t lines; + ssize_t len; + + assert(prefix); + assert(*prefix || plen == 0); + assert(p || l == 0); + + len = base64mem(p, l, &x); + if (len < 0) + return len; + if (len == 0) + return plen; + + lines = DIV_ROUND_UP(len, width); + + if (plen >= SSIZE_MAX - 1 - 1 || + lines > (SSIZE_MAX - plen - 1 - 1) / (indent + width + 1)) + return -ENOMEM; + + t = realloc(*prefix, plen + 1 + 1 + (indent + width + 1) * lines); + if (!t) + return -ENOMEM; + + s = t + plen; + for (size_t line = 0; line < lines; line++) { + size_t act = MIN(width, (size_t) len); + + if (line > 0) + sep = '\n'; + + if (s > t) { + *s++ = sep; + if (sep == '\n') + s = mempset(s, ' ', indent); + } + + s = mempcpy(s, x + width * line, act); + len -= act; + } + assert(len == 0); + + *s = '\0'; + *prefix = t; + return s - t; +} +/* Chooses the layout by prefix length: if plen exceeds half of width or plen + indent exceeds width, the data starts on a new line indented by indent; otherwise it continues after the prefix, separated by a space, with later lines indented by plen + 1. */ +ssize_t base64_append( + char **prefix, + size_t plen, + const void *p, + size_t l, + size_t indent, + size_t width) { + + if (plen > width / 2 || plen + indent > width) + /* leave indent on the left, keep last column free */ + return 
base64_append_width(prefix, plen, '\n', indent, p, l, width - indent); + else + /* leave plen on the left, keep last column free */ + return base64_append_width(prefix, plen, ' ', plen + 1, p, l, width - plen - 1); +} +/* Consumes one base64 character from *p (at most *l characters), advancing both; returns its value, INT_MAX for = padding, -EPIPE when the input is exhausted, or the unbase64char() error. */ +static int unbase64_next(const char **p, size_t *l) { + int ret; + + assert(p); + assert(l); + + /* Find the next non-whitespace character, and decode it. If we find padding, we return it as INT_MAX. We + * greedily skip all preceding and all following whitespace. */ + + for (;;) { + if (*l == 0) + return -EPIPE; + + if (!strchr(WHITESPACE, **p)) + break; + + /* Skip leading whitespace */ + (*p)++, (*l)--; + } + + if (**p == '=') + ret = INT_MAX; /* return padding as INT_MAX */ + else { + ret = unbase64char(**p); + if (ret < 0) + return ret; + } + + for (;;) { + (*p)++, (*l)--; + + if (*l == 0) + break; + if (!strchr(WHITESPACE, **p)) + break; + + /* Skip following whitespace */ + } + + return ret; +} +/* Decodes base64 text p of length l (SIZE_MAX means strlen(p)), ignoring whitespace, into a new buffer; ret and ret_size may be NULL. Returns 0, -ENOMEM, -EINVAL for misplaced padding or nonzero leftover bits, -ENAMETOOLONG for data after the padding, or an error from unbase64_next(). */ +int unbase64mem_full( + const char *p, + size_t l, + bool secure, + void **ret, + size_t *ret_size) { + + _cleanup_free_ uint8_t *buf = NULL; + const char *x; + uint8_t *z; + size_t len; + + assert(p || l == 0); + + if (l == SIZE_MAX) + l = strlen(p); + + /* A group of four input bytes needs three output bytes, in case of padding we need to add two or three extra + * bytes. Note that this calculation is an upper boundary, as we ignore whitespace while decoding */ + len = (l / 4) * 3 + (l % 4 != 0 ? (l % 4) - 1 : 0); + + buf = malloc(len + 1); + if (!buf) + return -ENOMEM; + + CLEANUP_ERASE_PTR(secure ? 
&buf : NULL, len); + + for (x = p, z = buf;;) { + int a, b, c, d; /* a == 00XXXXXX; b == 00YYYYYY; c == 00ZZZZZZ; d == 00WWWWWW */ + + a = unbase64_next(&x, &l); + if (a == -EPIPE) /* End of string */ + break; + if (a < 0) + return a; + if (a == INT_MAX) /* Padding is not allowed at the beginning of a 4ch block */ + return -EINVAL; + + b = unbase64_next(&x, &l); + if (b < 0) + return b; + if (b == INT_MAX) /* Padding is not allowed at the second character of a 4ch block either */ + return -EINVAL; +/* From here on a negative result includes -EPIPE, i.e. input that ends in the middle of a four-character group. */ + c = unbase64_next(&x, &l); + if (c < 0) + return c; + + d = unbase64_next(&x, &l); + if (d < 0) + return d; + + if (c == INT_MAX) { /* Padding at the third character */ + + if (d != INT_MAX) /* If the third character is padding, the fourth must be too */ + return -EINVAL; + + /* b == 00YY0000 */ + if (b & 15) + return -EINVAL; + + if (l > 0) /* Trailing rubbish? */ + return -ENAMETOOLONG; + + *(z++) = (uint8_t) a << 2 | (uint8_t) (b >> 4); /* XXXXXXYY */ + break; + } + + if (d == INT_MAX) { + /* c == 00ZZZZ00 */ + if (c & 3) + return -EINVAL; + + if (l > 0) /* Trailing rubbish? 
*/ + return -ENAMETOOLONG; + + *(z++) = (uint8_t) a << 2 | (uint8_t) b >> 4; /* XXXXXXYY */ + *(z++) = (uint8_t) b << 4 | (uint8_t) c >> 2; /* YYYYZZZZ */ + break; + } + + *(z++) = (uint8_t) a << 2 | (uint8_t) b >> 4; /* XXXXXXYY */ + *(z++) = (uint8_t) b << 4 | (uint8_t) c >> 2; /* YYYYZZZZ */ + *(z++) = (uint8_t) c << 6 | (uint8_t) d; /* ZZWWWWWW */ + } + + *z = 0; + + assert((size_t) (z - buf) <= len); + + if (ret_size) + *ret_size = (size_t) (z - buf); + if (ret) + *ret = TAKE_PTR(buf); + + return 0; +} + +void hexdump(FILE *f, const void *p, size_t s) { + const uint8_t *b = p; + unsigned n = 0; + + assert(b || s == 0); + + if (!f) + f = stdout; + + while (s > 0) { + size_t i; + + fprintf(f, "%04x ", n); + + for (i = 0; i < 16; i++) { + + if (i >= s) + fputs(" ", f); + else + fprintf(f, "%02x ", b[i]); + + if (i == 7) + fputc(' ', f); + } + + fputc(' ', f); + + for (i = 0; i < 16; i++) { + + if (i >= s) + fputc(' ', f); + else + fputc(isprint(b[i]) ? (char) b[i] : '.', f); + } + + fputc('\n', f); + + if (s < 16) + break; + + n += 16; + b += 16; + s -= 16; + } +} diff --git a/src/basic/hexdecoct.h b/src/basic/hexdecoct.h new file mode 100644 index 0000000..319b21a --- /dev/null +++ b/src/basic/hexdecoct.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "macro.h" + +char octchar(int x) _const_; +int unoctchar(char c) _const_; + +char decchar(int x) _const_; +int undecchar(char c) _const_; + +char hexchar(int x) _const_; +int unhexchar(char c) _const_; + +char *hexmem(const void *p, size_t l); +int unhexmem_full(const char *p, size_t l, bool secure, void **mem, size_t *len); +static inline int unhexmem(const char *p, size_t l, void **mem, size_t *len) { + return unhexmem_full(p, l, false, mem, len); +} + +char base32hexchar(int x) _const_; +int unbase32hexchar(char c) _const_; + +char base64char(int x) _const_; +char urlsafe_base64char(int x) _const_; +int unbase64char(char c) 
_const_; + +char *base32hexmem(const void *p, size_t l, bool padding); +int unbase32hexmem(const char *p, size_t l, bool padding, void **mem, size_t *len); + +ssize_t base64mem_full(const void *p, size_t l, size_t line_break, char **ret); +static inline ssize_t base64mem(const void *p, size_t l, char **ret) { + return base64mem_full(p, l, SIZE_MAX, ret); +} + +ssize_t base64_append( + char **prefix, + size_t plen, + const void *p, + size_t l, + size_t margin, + size_t width); +int unbase64mem_full(const char *p, size_t l, bool secure, void **mem, size_t *len); +static inline int unbase64mem(const char *p, size_t l, void **mem, size_t *len) { + return unbase64mem_full(p, l, false, mem, len); +} + +void hexdump(FILE *f, const void *p, size_t s); diff --git a/src/basic/hmac.c b/src/basic/hmac.c new file mode 100644 index 0000000..a5f66d5 --- /dev/null +++ b/src/basic/hmac.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "hmac.h" +#include "sha256.h" + +#define HMAC_BLOCK_SIZE 64 +#define INNER_PADDING_BYTE 0x36 +#define OUTER_PADDING_BYTE 0x5c + +void hmac_sha256(const void *key, + size_t key_size, + const void *input, + size_t input_size, + uint8_t res[static SHA256_DIGEST_SIZE]) { + + uint8_t inner_padding[HMAC_BLOCK_SIZE] = { }; + uint8_t outer_padding[HMAC_BLOCK_SIZE] = { }; + uint8_t replacement_key[SHA256_DIGEST_SIZE]; + struct sha256_ctx hash; + + assert(key); + assert(key_size > 0); + assert(res); + + /* Implement algorithm as described by FIPS 198. */ + + /* The key needs to be block size length or less, hash it if it's longer. */ + if (key_size > HMAC_BLOCK_SIZE) { + sha256_direct(key, key_size, replacement_key); + key = replacement_key; + key_size = SHA256_DIGEST_SIZE; + } + + /* First, copy the key into the padding arrays. If it's shorter than + * the block size, the arrays are already initialized to 0. 
*/ + memcpy(inner_padding, key, key_size); + memcpy(outer_padding, key, key_size); + + /* Then, XOR the provided key and any padding leftovers with the fixed + * padding bytes as defined in FIPS 198. */ + for (size_t i = 0; i < HMAC_BLOCK_SIZE; i++) { + inner_padding[i] ^= INNER_PADDING_BYTE; + outer_padding[i] ^= OUTER_PADDING_BYTE; + } + + /* First pass: hash the inner padding array and the input. */ + sha256_init_ctx(&hash); + sha256_process_bytes(inner_padding, HMAC_BLOCK_SIZE, &hash); + sha256_process_bytes(input, input_size, &hash); + sha256_finish_ctx(&hash, res); + + /* Second pass: hash the outer padding array and the result of the first pass. */ + sha256_init_ctx(&hash); + sha256_process_bytes(outer_padding, HMAC_BLOCK_SIZE, &hash); + sha256_process_bytes(res, SHA256_DIGEST_SIZE, &hash); + sha256_finish_ctx(&hash, res); +} diff --git a/src/basic/hmac.h b/src/basic/hmac.h new file mode 100644 index 0000000..e58c183 --- /dev/null +++ b/src/basic/hmac.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sha256.h" + +/* Unoptimized implementation based on FIPS 198. 'res' has to be allocated by + * the caller. Prefer external OpenSSL functions, and use this only when + * linking to OpenSSL is not desirable (eg: libsystemd.so). 
*/ +void hmac_sha256(const void *key, size_t key_size, const void *input, size_t input_size, uint8_t res[static SHA256_DIGEST_SIZE]); diff --git a/src/basic/hostname-util.c b/src/basic/hostname-util.c new file mode 100644 index 0000000..e743033 --- /dev/null +++ b/src/basic/hostname-util.c @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "env-file.h" +#include "hostname-util.h" +#include "os-util.h" +#include "string-util.h" +#include "strv.h" + +char* get_default_hostname(void) { + int r; + + const char *e = secure_getenv("SYSTEMD_DEFAULT_HOSTNAME"); + if (e) { + if (hostname_is_valid(e, 0)) + return strdup(e); + log_debug("Invalid hostname in $SYSTEMD_DEFAULT_HOSTNAME, ignoring: %s", e); + } + + _cleanup_free_ char *f = NULL; + r = parse_os_release(NULL, "DEFAULT_HOSTNAME", &f); + if (r < 0) + log_debug_errno(r, "Failed to parse os-release, ignoring: %m"); + else if (f) { + if (hostname_is_valid(f, 0)) + return TAKE_PTR(f); + log_debug("Invalid hostname in os-release, ignoring: %s", f); + } + + return strdup(FALLBACK_HOSTNAME); +} + +int gethostname_full(GetHostnameFlags flags, char **ret) { + _cleanup_free_ char *buf = NULL, *fallback = NULL; + struct utsname u; + const char *s; + + assert(ret); + + assert_se(uname(&u) >= 0); + + s = u.nodename; + if (isempty(s) || streq(s, "(none)") || + (!FLAGS_SET(flags, GET_HOSTNAME_ALLOW_LOCALHOST) && is_localhost(s)) || + (FLAGS_SET(flags, GET_HOSTNAME_SHORT) && s[0] == '.')) { + if (!FLAGS_SET(flags, GET_HOSTNAME_FALLBACK_DEFAULT)) + return -ENXIO; + + s = fallback = get_default_hostname(); + if (!s) + return -ENOMEM; + + if (FLAGS_SET(flags, GET_HOSTNAME_SHORT) && s[0] == '.') + return -ENXIO; + } + + if (FLAGS_SET(flags, GET_HOSTNAME_SHORT)) + buf = strdupcspn(s, "."); + else + buf = strdup(s); + if (!buf) + return -ENOMEM; + + *ret = TAKE_PTR(buf); + return 0; +} + +bool valid_ldh_char(char c) { + /* 
"LDH" → "Letters, digits, hyphens", as per RFC 5890, Section 2.3.1 */ + + return ascii_isalpha(c) || + ascii_isdigit(c) || + c == '-'; +} + +bool hostname_is_valid(const char *s, ValidHostnameFlags flags) { + unsigned n_dots = 0; + const char *p; + bool dot, hyphen; + + /* Check if s looks like a valid hostname or FQDN. This does not do full DNS validation, but only + * checks if the name is composed of allowed characters and the length is not above the maximum + * allowed by Linux (c.f. dns_name_is_valid()). A trailing dot is allowed if + * VALID_HOSTNAME_TRAILING_DOT flag is set and at least two components are present in the name. Note + * that due to the restricted charset and length this call is substantially more conservative than + * dns_name_is_valid(). Doesn't accept empty hostnames, hostnames with leading dots, and hostnames + * with multiple dots in a sequence. Doesn't allow hyphens at the beginning or end of label. */ + + if (isempty(s)) + return false; + + if (streq(s, ".host")) /* Used by the container logic to denote the "root container" */ + return FLAGS_SET(flags, VALID_HOSTNAME_DOT_HOST); + + for (p = s, dot = hyphen = true; *p; p++) + if (*p == '.') { + if (dot || hyphen) + return false; + + dot = true; + hyphen = false; + n_dots++; + + } else if (*p == '-') { + if (dot) + return false; + + dot = false; + hyphen = true; + + } else { + if (!valid_ldh_char(*p)) + return false; + + dot = false; + hyphen = false; + } + + if (dot && (n_dots < 2 || !FLAGS_SET(flags, VALID_HOSTNAME_TRAILING_DOT))) + return false; + if (hyphen) + return false; + + if (p-s > HOST_NAME_MAX) /* Note that HOST_NAME_MAX is 64 on Linux, but DNS allows domain names up to + * 255 characters */ + return false; + + return true; +} + +char* hostname_cleanup(char *s) { + char *p, *d; + bool dot, hyphen; + + assert(s); + + for (p = s, d = s, dot = hyphen = true; *p && d - s < HOST_NAME_MAX; p++) + if (*p == '.') { + if (dot || hyphen) + continue; + + *(d++) = '.'; + dot = true; + 
hyphen = false; + + } else if (*p == '-') { + if (dot) + continue; + + *(d++) = '-'; + dot = false; + hyphen = true; + + } else if (valid_ldh_char(*p)) { + *(d++) = *p; + dot = false; + hyphen = false; + } + + while (d > s && IN_SET(d[-1], '-', '.')) + /* The dot can occur at most once, but we might have multiple + * hyphens, hence the loop */ + d--; + *d = 0; + + return s; +} + +bool is_localhost(const char *hostname) { + assert(hostname); + + /* This tries to identify local host and domain names + * described in RFC6761 plus the redhatism of localdomain */ + + return STRCASE_IN_SET( + hostname, + "localhost", + "localhost.", + "localhost.localdomain", + "localhost.localdomain.") || + endswith_no_case(hostname, ".localhost") || + endswith_no_case(hostname, ".localhost.") || + endswith_no_case(hostname, ".localhost.localdomain") || + endswith_no_case(hostname, ".localhost.localdomain."); +} + +int get_pretty_hostname(char **ret) { + _cleanup_free_ char *n = NULL; + int r; + + assert(ret); + + r = parse_env_file(NULL, "/etc/machine-info", "PRETTY_HOSTNAME", &n); + if (r < 0) + return r; + + if (isempty(n)) + return -ENXIO; + + *ret = TAKE_PTR(n); + return 0; +} diff --git a/src/basic/hostname-util.h b/src/basic/hostname-util.h new file mode 100644 index 0000000..bcac3d9 --- /dev/null +++ b/src/basic/hostname-util.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" +#include "strv.h" + +typedef enum GetHostnameFlags { + GET_HOSTNAME_ALLOW_LOCALHOST = 1 << 0, /* accepts "localhost" or friends. */ + GET_HOSTNAME_FALLBACK_DEFAULT = 1 << 1, /* use default hostname if no hostname is set. */ + GET_HOSTNAME_SHORT = 1 << 2, /* kills the FQDN part if present. 
*/ +} GetHostnameFlags; + +int gethostname_full(GetHostnameFlags flags, char **ret); +static inline int gethostname_strict(char **ret) { + return gethostname_full(0, ret); +} + +static inline char* gethostname_malloc(void) { + char *s; + + if (gethostname_full(GET_HOSTNAME_ALLOW_LOCALHOST | GET_HOSTNAME_FALLBACK_DEFAULT, &s) < 0) + return NULL; + + return s; +} + +static inline char* gethostname_short_malloc(void) { + char *s; + + if (gethostname_full(GET_HOSTNAME_ALLOW_LOCALHOST | GET_HOSTNAME_FALLBACK_DEFAULT | GET_HOSTNAME_SHORT, &s) < 0) + return NULL; + + return s; +} + +char* get_default_hostname(void); + +bool valid_ldh_char(char c) _const_; + +typedef enum ValidHostnameFlags { + VALID_HOSTNAME_TRAILING_DOT = 1 << 0, /* Accept trailing dot on multi-label names */ + VALID_HOSTNAME_DOT_HOST = 1 << 1, /* Accept ".host" as valid hostname */ +} ValidHostnameFlags; + +bool hostname_is_valid(const char *s, ValidHostnameFlags flags) _pure_; +char* hostname_cleanup(char *s); + +bool is_localhost(const char *hostname); + +static inline bool is_gateway_hostname(const char *hostname) { + /* This tries to identify the valid syntaxes for the our synthetic "gateway" host. */ + return STRCASE_IN_SET(hostname, "_gateway", "_gateway."); +} + +static inline bool is_outbound_hostname(const char *hostname) { + /* This tries to identify the valid syntaxes for the our synthetic "outbound" host. 
*/ + return STRCASE_IN_SET(hostname, "_outbound", "_outbound."); +} + +static inline bool is_dns_stub_hostname(const char *hostname) { + return STRCASE_IN_SET(hostname, "_localdnsstub", "_localdnsstub."); +} + +static inline bool is_dns_proxy_stub_hostname(const char *hostname) { + return STRCASE_IN_SET(hostname, "_localdnsproxy", "_localdnsproxy."); +} + +int get_pretty_hostname(char **ret); diff --git a/src/basic/in-addr-util.c b/src/basic/in-addr-util.c new file mode 100644 index 0000000..ee4ea67 --- /dev/null +++ b/src/basic/in-addr-util.c @@ -0,0 +1,984 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "in-addr-util.h" +#include "logarithm.h" +#include "macro.h" +#include "parse-util.h" +#include "random-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strxcpyx.h" + +bool in4_addr_is_null(const struct in_addr *a) { + assert(a); + + return a->s_addr == 0; +} + +bool in6_addr_is_null(const struct in6_addr *a) { + assert(a); + + return IN6_IS_ADDR_UNSPECIFIED(a); +} + +int in_addr_is_null(int family, const union in_addr_union *u) { + assert(u); + + if (family == AF_INET) + return in4_addr_is_null(&u->in); + + if (family == AF_INET6) + return in6_addr_is_null(&u->in6); + + return -EAFNOSUPPORT; +} + +bool in4_addr_is_link_local(const struct in_addr *a) { + assert(a); + + return (be32toh(a->s_addr) & UINT32_C(0xFFFF0000)) == (UINT32_C(169) << 24 | UINT32_C(254) << 16); +} + +bool in4_addr_is_link_local_dynamic(const struct in_addr *a) { + assert(a); + + if (!in4_addr_is_link_local(a)) + return false; + + /* 169.254.0.0/24 and 169.254.255.0/24 must not be used for the dynamic IPv4LL assignment. + * See RFC 3927 Section 2.1: + * The IPv4 prefix 169.254/16 is registered with the IANA for this purpose. 
The first 256 and last + * 256 addresses in the 169.254/16 prefix are reserved for future use and MUST NOT be selected by a + * host using this dynamic configuration mechanism. */ + return !IN_SET(be32toh(a->s_addr) & 0x0000FF00U, 0x0000U, 0xFF00U); +} + +bool in6_addr_is_link_local(const struct in6_addr *a) { + assert(a); + + return IN6_IS_ADDR_LINKLOCAL(a); +} + +int in_addr_is_link_local(int family, const union in_addr_union *u) { + assert(u); + + if (family == AF_INET) + return in4_addr_is_link_local(&u->in); + + if (family == AF_INET6) + return in6_addr_is_link_local(&u->in6); + + return -EAFNOSUPPORT; +} + +bool in6_addr_is_link_local_all_nodes(const struct in6_addr *a) { + assert(a); + + /* ff02::1 */ + return be32toh(a->s6_addr32[0]) == UINT32_C(0xff020000) && + a->s6_addr32[1] == 0 && + a->s6_addr32[2] == 0 && + be32toh(a->s6_addr32[3]) == UINT32_C(0x00000001); +} + +int in_addr_is_multicast(int family, const union in_addr_union *u) { + assert(u); + + if (family == AF_INET) + return IN_MULTICAST(be32toh(u->in.s_addr)); + + if (family == AF_INET6) + return IN6_IS_ADDR_MULTICAST(&u->in6); + + return -EAFNOSUPPORT; +} + +bool in4_addr_is_local_multicast(const struct in_addr *a) { + assert(a); + + return (be32toh(a->s_addr) & UINT32_C(0xffffff00)) == UINT32_C(0xe0000000); +} + +bool in4_addr_is_localhost(const struct in_addr *a) { + assert(a); + + /* All of 127.x.x.x is localhost. */ + return (be32toh(a->s_addr) & UINT32_C(0xFF000000)) == UINT32_C(127) << 24; +} + +bool in4_addr_is_non_local(const struct in_addr *a) { + /* Whether the address is not null and not localhost. + * + * As such, it is suitable to configure as DNS/NTP server from DHCP. 
*/ + return !in4_addr_is_null(a) && + !in4_addr_is_localhost(a); +} + +int in_addr_is_localhost(int family, const union in_addr_union *u) { + assert(u); + + if (family == AF_INET) + return in4_addr_is_localhost(&u->in); + + if (family == AF_INET6) + return IN6_IS_ADDR_LOOPBACK(&u->in6); + + return -EAFNOSUPPORT; +} + +int in_addr_is_localhost_one(int family, const union in_addr_union *u) { + assert(u); + + if (family == AF_INET) + /* 127.0.0.1 */ + return be32toh(u->in.s_addr) == UINT32_C(0x7F000001); + + if (family == AF_INET6) + return IN6_IS_ADDR_LOOPBACK(&u->in6); + + return -EAFNOSUPPORT; +} + +bool in6_addr_is_ipv4_mapped_address(const struct in6_addr *a) { + return a->s6_addr32[0] == 0 && + a->s6_addr32[1] == 0 && + a->s6_addr32[2] == htobe32(UINT32_C(0x0000ffff)); +} + +bool in4_addr_equal(const struct in_addr *a, const struct in_addr *b) { + assert(a); + assert(b); + + return a->s_addr == b->s_addr; +} + +bool in6_addr_equal(const struct in6_addr *a, const struct in6_addr *b) { + assert(a); + assert(b); + + return IN6_ARE_ADDR_EQUAL(a, b); +} + +int in_addr_equal(int family, const union in_addr_union *a, const union in_addr_union *b) { + assert(a); + assert(b); + + if (family == AF_INET) + return in4_addr_equal(&a->in, &b->in); + + if (family == AF_INET6) + return in6_addr_equal(&a->in6, &b->in6); + + return -EAFNOSUPPORT; +} + +int in_addr_prefix_intersect( + int family, + const union in_addr_union *a, + unsigned aprefixlen, + const union in_addr_union *b, + unsigned bprefixlen) { + + unsigned m; + + assert(a); + assert(b); + + /* Checks whether there are any addresses that are in both networks */ + + m = MIN(aprefixlen, bprefixlen); + + if (family == AF_INET) { + uint32_t x, nm; + + x = be32toh(a->in.s_addr ^ b->in.s_addr); + nm = m == 0 ? 
0 : 0xFFFFFFFFUL << (32 - m); + + return (x & nm) == 0; + } + + if (family == AF_INET6) { + unsigned i; + + if (m > 128) + m = 128; + + for (i = 0; i < 16; i++) { + uint8_t x, nm; + + x = a->in6.s6_addr[i] ^ b->in6.s6_addr[i]; + + if (m < 8) + nm = 0xFF << (8 - m); + else + nm = 0xFF; + + if ((x & nm) != 0) + return 0; + + if (m > 8) + m -= 8; + else + m = 0; + } + + return 1; + } + + return -EAFNOSUPPORT; +} + +int in_addr_prefix_next(int family, union in_addr_union *u, unsigned prefixlen) { + assert(u); + + /* Increases the network part of an address by one. Returns 0 if that succeeds, or -ERANGE if + * this overflows. */ + + return in_addr_prefix_nth(family, u, prefixlen, 1); +} + +/* + * Calculates the nth prefix of size prefixlen starting from the address denoted by u. + * + * On success 0 will be returned and the calculated prefix will be available in + * u. In case the calculation cannot be performed (invalid prefix length, + * overflows would occur) -ERANGE is returned. If the address family given isn't + * supported -EAFNOSUPPORT will be returned. 
+ * + * Examples: + * - in_addr_prefix_nth(AF_INET, 192.168.0.0, 24, 2), returns 0, writes 192.168.2.0 to u + * - in_addr_prefix_nth(AF_INET, 192.168.0.0, 24, 0), returns 0, no data written + * - in_addr_prefix_nth(AF_INET, 255.255.255.0, 24, 1), returns -ERANGE, no data written + * - in_addr_prefix_nth(AF_INET, 255.255.255.0, 0, 1), returns -ERANGE, no data written + * - in_addr_prefix_nth(AF_INET6, 2001:db8, 64, 0xff00) returns 0, writes 2001:0db8:0000:ff00:: to u + */ +int in_addr_prefix_nth(int family, union in_addr_union *u, unsigned prefixlen, uint64_t nth) { + assert(u); + + if (prefixlen <= 0) + return -ERANGE; + + if (family == AF_INET) { + uint32_t c, n, t; + + if (prefixlen > 32) + return -ERANGE; + + c = be32toh(u->in.s_addr); + + t = nth << (32 - prefixlen); + + /* Check for wrap */ + if (c > UINT32_MAX - t) + return -ERANGE; + + n = c + t; + + n &= UINT32_C(0xFFFFFFFF) << (32 - prefixlen); + u->in.s_addr = htobe32(n); + return 0; + } + + if (family == AF_INET6) { + bool overflow = false; + + if (prefixlen > 128) + return -ERANGE; + + for (unsigned i = 16; i > 0; i--) { + unsigned t, j = i - 1, p = j * 8; + + if (p >= prefixlen) { + u->in6.s6_addr[j] = 0; + continue; + } + + if (prefixlen - p < 8) { + u->in6.s6_addr[j] &= 0xff << (8 - (prefixlen - p)); + t = u->in6.s6_addr[j] + ((nth & 0xff) << (8 - (prefixlen - p))); + nth >>= prefixlen - p; + } else { + t = u->in6.s6_addr[j] + (nth & 0xff) + overflow; + nth >>= 8; + } + + overflow = t > UINT8_MAX; + u->in6.s6_addr[j] = (uint8_t) (t & 0xff); + } + + if (overflow || nth != 0) + return -ERANGE; + + return 0; + } + + return -EAFNOSUPPORT; +} + +int in_addr_random_prefix( + int family, + union in_addr_union *u, + unsigned prefixlen_fixed_part, + unsigned prefixlen) { + + assert(u); + + /* Random network part of an address by one. 
*/ + + if (prefixlen <= 0) + return 0; + + if (family == AF_INET) { + uint32_t c, n; + + if (prefixlen_fixed_part > 32) + prefixlen_fixed_part = 32; + if (prefixlen > 32) + prefixlen = 32; + if (prefixlen_fixed_part >= prefixlen) + return -EINVAL; + + c = be32toh(u->in.s_addr); + c &= ((UINT32_C(1) << prefixlen_fixed_part) - 1) << (32 - prefixlen_fixed_part); + + random_bytes(&n, sizeof(n)); + n &= ((UINT32_C(1) << (prefixlen - prefixlen_fixed_part)) - 1) << (32 - prefixlen); + + u->in.s_addr = htobe32(n | c); + return 1; + } + + if (family == AF_INET6) { + struct in6_addr n; + unsigned i, j; + + if (prefixlen_fixed_part > 128) + prefixlen_fixed_part = 128; + if (prefixlen > 128) + prefixlen = 128; + if (prefixlen_fixed_part >= prefixlen) + return -EINVAL; + + random_bytes(&n, sizeof(n)); + + for (i = 0; i < 16; i++) { + uint8_t mask_fixed_part = 0, mask = 0; + + if (i < (prefixlen_fixed_part + 7) / 8) { + if (i < prefixlen_fixed_part / 8) + mask_fixed_part = 0xffu; + else { + j = prefixlen_fixed_part % 8; + mask_fixed_part = ((UINT8_C(1) << (j + 1)) - 1) << (8 - j); + } + } + + if (i < (prefixlen + 7) / 8) { + if (i < prefixlen / 8) + mask = 0xffu ^ mask_fixed_part; + else { + j = prefixlen % 8; + mask = (((UINT8_C(1) << (j + 1)) - 1) << (8 - j)) ^ mask_fixed_part; + } + } + + u->in6.s6_addr[i] &= mask_fixed_part; + u->in6.s6_addr[i] |= n.s6_addr[i] & mask; + } + + return 1; + } + + return -EAFNOSUPPORT; +} + +int in_addr_prefix_range( + int family, + const union in_addr_union *in, + unsigned prefixlen, + union in_addr_union *ret_start, + union in_addr_union *ret_end) { + + union in_addr_union start, end; + int r; + + assert(in); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + if (ret_start) { + start = *in; + r = in_addr_prefix_nth(family, &start, prefixlen, 0); + if (r < 0) + return r; + } + + if (ret_end) { + end = *in; + r = in_addr_prefix_nth(family, &end, prefixlen, 1); + if (r < 0) + return r; + } + + if (ret_start) + *ret_start = 
start; + if (ret_end) + *ret_end = end; + + return 0; +} + +int in_addr_to_string(int family, const union in_addr_union *u, char **ret) { + _cleanup_free_ char *x = NULL; + size_t l; + + assert(u); + assert(ret); + + if (family == AF_INET) + l = INET_ADDRSTRLEN; + else if (family == AF_INET6) + l = INET6_ADDRSTRLEN; + else + return -EAFNOSUPPORT; + + x = new(char, l); + if (!x) + return -ENOMEM; + + errno = 0; + if (!typesafe_inet_ntop(family, u, x, l)) + return errno_or_else(EINVAL); + + *ret = TAKE_PTR(x); + return 0; +} + +int in_addr_prefix_to_string( + int family, + const union in_addr_union *u, + unsigned prefixlen, + char *buf, + size_t buf_len) { + + assert(u); + assert(buf); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + errno = 0; + if (!typesafe_inet_ntop(family, u, buf, buf_len)) + return errno_or_else(ENOSPC); + + size_t l = strlen(buf); + if (!snprintf_ok(buf + l, buf_len - l, "/%u", prefixlen)) + return -ENOSPC; + return 0; +} + +int in_addr_port_ifindex_name_to_string(int family, const union in_addr_union *u, uint16_t port, int ifindex, const char *server_name, char **ret) { + _cleanup_free_ char *ip_str = NULL, *x = NULL; + int r; + + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(u); + assert(ret); + + /* Much like in_addr_to_string(), but optionally appends the zone interface index to the address, to properly + * handle IPv6 link-local addresses. */ + + r = in_addr_to_string(family, u, &ip_str); + if (r < 0) + return r; + + if (family == AF_INET6) { + r = in_addr_is_link_local(family, u); + if (r < 0) + return r; + if (r == 0) + ifindex = 0; + } else + ifindex = 0; /* For IPv4 address, ifindex is always ignored. */ + + if (port == 0 && ifindex == 0 && isempty(server_name)) { + *ret = TAKE_PTR(ip_str); + return 0; + } + + const char *separator = isempty(server_name) ? 
"" : "#"; + server_name = strempty(server_name); + + if (port > 0) { + if (family == AF_INET6) { + if (ifindex > 0) + r = asprintf(&x, "[%s]:%"PRIu16"%%%i%s%s", ip_str, port, ifindex, separator, server_name); + else + r = asprintf(&x, "[%s]:%"PRIu16"%s%s", ip_str, port, separator, server_name); + } else + r = asprintf(&x, "%s:%"PRIu16"%s%s", ip_str, port, separator, server_name); + } else { + if (ifindex > 0) + r = asprintf(&x, "%s%%%i%s%s", ip_str, ifindex, separator, server_name); + else { + x = strjoin(ip_str, separator, server_name); + r = x ? 0 : -ENOMEM; + } + } + if (r < 0) + return -ENOMEM; + + *ret = TAKE_PTR(x); + return 0; +} + +int in_addr_from_string(int family, const char *s, union in_addr_union *ret) { + union in_addr_union buffer; + assert(s); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + errno = 0; + if (inet_pton(family, s, ret ?: &buffer) <= 0) + return errno_or_else(EINVAL); + + return 0; +} + +int in_addr_from_string_auto(const char *s, int *ret_family, union in_addr_union *ret) { + int r; + + assert(s); + + r = in_addr_from_string(AF_INET, s, ret); + if (r >= 0) { + if (ret_family) + *ret_family = AF_INET; + return 0; + } + + r = in_addr_from_string(AF_INET6, s, ret); + if (r >= 0) { + if (ret_family) + *ret_family = AF_INET6; + return 0; + } + + return -EINVAL; +} + +unsigned char in4_addr_netmask_to_prefixlen(const struct in_addr *addr) { + assert(addr); + + return 32U - u32ctz(be32toh(addr->s_addr)); +} + +/* Calculate an IPv4 netmask from prefix length, for example /8 -> 255.0.0.0. */ +struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen) { + assert(addr); + assert(prefixlen <= 32); + + /* Shifting beyond 32 is not defined, handle this specially. */ + if (prefixlen == 0) + addr->s_addr = 0; + else + addr->s_addr = htobe32((0xffffffff << (32 - prefixlen)) & 0xffffffff); + + return addr; +} + +/* Calculate an IPv6 netmask from prefix length, for example /16 -> ffff::. 
*/ +struct in6_addr* in6_addr_prefixlen_to_netmask(struct in6_addr *addr, unsigned char prefixlen) { + assert(addr); + assert(prefixlen <= 128); + + for (unsigned i = 0; i < 16; i++) { + uint8_t mask; + + if (prefixlen >= 8) { + mask = 0xFF; + prefixlen -= 8; + } else if (prefixlen > 0) { + mask = 0xFF << (8 - prefixlen); + prefixlen = 0; + } else { + assert(prefixlen == 0); + mask = 0; + } + + addr->s6_addr[i] = mask; + } + + return addr; +} + +/* Calculate an IPv4 or IPv6 netmask from prefix length, for example /8 -> 255.0.0.0 or /16 -> ffff::. */ +int in_addr_prefixlen_to_netmask(int family, union in_addr_union *addr, unsigned char prefixlen) { + assert(addr); + + switch (family) { + case AF_INET: + in4_addr_prefixlen_to_netmask(&addr->in, prefixlen); + return 0; + case AF_INET6: + in6_addr_prefixlen_to_netmask(&addr->in6, prefixlen); + return 0; + default: + return -EAFNOSUPPORT; + } +} + +int in4_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen) { + uint8_t msb_octet = *(uint8_t*) addr; + + /* addr may not be aligned, so make sure we only access it byte-wise */ + + assert(addr); + assert(prefixlen); + + if (msb_octet < 128) + /* class A, leading bits: 0 */ + *prefixlen = 8; + else if (msb_octet < 192) + /* class B, leading bits 10 */ + *prefixlen = 16; + else if (msb_octet < 224) + /* class C, leading bits 110 */ + *prefixlen = 24; + else + /* class D or E, no default prefixlen */ + return -ERANGE; + + return 0; +} + +int in4_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask) { + unsigned char prefixlen; + int r; + + assert(addr); + assert(mask); + + r = in4_addr_default_prefixlen(addr, &prefixlen); + if (r < 0) + return r; + + in4_addr_prefixlen_to_netmask(mask, prefixlen); + return 0; +} + +int in4_addr_mask(struct in_addr *addr, unsigned char prefixlen) { + struct in_addr mask; + + assert(addr); + + if (!in4_addr_prefixlen_to_netmask(&mask, prefixlen)) + return -EINVAL; + + addr->s_addr &= mask.s_addr; + 
return 0; +} + +int in6_addr_mask(struct in6_addr *addr, unsigned char prefixlen) { + unsigned i; + + for (i = 0; i < 16; i++) { + uint8_t mask; + + if (prefixlen >= 8) { + mask = 0xFF; + prefixlen -= 8; + } else if (prefixlen > 0) { + mask = 0xFF << (8 - prefixlen); + prefixlen = 0; + } else { + assert(prefixlen == 0); + mask = 0; + } + + addr->s6_addr[i] &= mask; + } + + return 0; +} + +int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen) { + assert(addr); + + switch (family) { + case AF_INET: + return in4_addr_mask(&addr->in, prefixlen); + case AF_INET6: + return in6_addr_mask(&addr->in6, prefixlen); + default: + return -EAFNOSUPPORT; + } +} + +int in4_addr_prefix_covers_full( + const struct in_addr *prefix, + unsigned char prefixlen, + const struct in_addr *address, + unsigned char address_prefixlen) { + + struct in_addr masked_prefix, masked_address; + int r; + + assert(prefix); + assert(address); + + if (prefixlen > address_prefixlen) + return false; + + masked_prefix = *prefix; + r = in4_addr_mask(&masked_prefix, prefixlen); + if (r < 0) + return r; + + masked_address = *address; + r = in4_addr_mask(&masked_address, prefixlen); + if (r < 0) + return r; + + return in4_addr_equal(&masked_prefix, &masked_address); +} + +int in6_addr_prefix_covers_full( + const struct in6_addr *prefix, + unsigned char prefixlen, + const struct in6_addr *address, + unsigned char address_prefixlen) { + + struct in6_addr masked_prefix, masked_address; + int r; + + assert(prefix); + assert(address); + + if (prefixlen > address_prefixlen) + return false; + + masked_prefix = *prefix; + r = in6_addr_mask(&masked_prefix, prefixlen); + if (r < 0) + return r; + + masked_address = *address; + r = in6_addr_mask(&masked_address, prefixlen); + if (r < 0) + return r; + + return in6_addr_equal(&masked_prefix, &masked_address); +} + +int in_addr_prefix_covers_full( + int family, + const union in_addr_union *prefix, + unsigned char prefixlen, + const union 
in_addr_union *address, + unsigned char address_prefixlen) { + + assert(prefix); + assert(address); + + switch (family) { + case AF_INET: + return in4_addr_prefix_covers_full(&prefix->in, prefixlen, &address->in, address_prefixlen); + case AF_INET6: + return in6_addr_prefix_covers_full(&prefix->in6, prefixlen, &address->in6, address_prefixlen); + default: + return -EAFNOSUPPORT; + } +} + +int in_addr_parse_prefixlen(int family, const char *p, unsigned char *ret) { + uint8_t u; + int r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + r = safe_atou8(p, &u); + if (r < 0) + return r; + + if (u > FAMILY_ADDRESS_SIZE(family) * 8) + return -ERANGE; + + *ret = u; + return 0; +} + +int in_addr_prefix_from_string( + const char *p, + int family, + union in_addr_union *ret_prefix, + unsigned char *ret_prefixlen) { + + _cleanup_free_ char *str = NULL; + union in_addr_union buffer; + const char *e, *l; + unsigned char k; + int r; + + assert(p); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + e = strchr(p, '/'); + if (e) { + str = strndup(p, e - p); + if (!str) + return -ENOMEM; + + l = str; + } else + l = p; + + r = in_addr_from_string(family, l, &buffer); + if (r < 0) + return r; + + if (e) { + r = in_addr_parse_prefixlen(family, e+1, &k); + if (r < 0) + return r; + } else + k = FAMILY_ADDRESS_SIZE(family) * 8; + + if (ret_prefix) + *ret_prefix = buffer; + if (ret_prefixlen) + *ret_prefixlen = k; + + return 0; +} + +int in_addr_prefix_from_string_auto_internal( + const char *p, + InAddrPrefixLenMode mode, + int *ret_family, + union in_addr_union *ret_prefix, + unsigned char *ret_prefixlen) { + + _cleanup_free_ char *str = NULL; + union in_addr_union buffer; + const char *e, *l; + unsigned char k; + int family, r; + + assert(p); + + e = strchr(p, '/'); + if (e) { + str = strndup(p, e - p); + if (!str) + return -ENOMEM; + + l = str; + } else + l = p; + + r = in_addr_from_string_auto(l, &family, &buffer); + if (r < 0) + return r; + + 
if (e) { + r = in_addr_parse_prefixlen(family, e+1, &k); + if (r < 0) + return r; + } else + switch (mode) { + case PREFIXLEN_FULL: + k = FAMILY_ADDRESS_SIZE(family) * 8; + break; + case PREFIXLEN_REFUSE: + return -ENOANO; /* To distinguish this error from others. */ + default: + assert_not_reached(); + } + + if (ret_family) + *ret_family = family; + if (ret_prefix) + *ret_prefix = buffer; + if (ret_prefixlen) + *ret_prefixlen = k; + + return 0; + +} + +void in_addr_data_hash_func(const struct in_addr_data *a, struct siphash *state) { + assert(a); + assert(state); + + siphash24_compress(&a->family, sizeof(a->family), state); + siphash24_compress(&a->address, FAMILY_ADDRESS_SIZE(a->family), state); +} + +int in_addr_data_compare_func(const struct in_addr_data *x, const struct in_addr_data *y) { + int r; + + assert(x); + assert(y); + + r = CMP(x->family, y->family); + if (r != 0) + return r; + + return memcmp(&x->address, &y->address, FAMILY_ADDRESS_SIZE(x->family)); +} + +DEFINE_HASH_OPS( + in_addr_data_hash_ops, + struct in_addr_data, + in_addr_data_hash_func, + in_addr_data_compare_func); + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + in_addr_data_hash_ops_free, + struct in_addr_data, + in_addr_data_hash_func, + in_addr_data_compare_func, + free); + +void in6_addr_hash_func(const struct in6_addr *addr, struct siphash *state) { + assert(addr); + assert(state); + + siphash24_compress(addr, sizeof(*addr), state); +} + +int in6_addr_compare_func(const struct in6_addr *a, const struct in6_addr *b) { + assert(a); + assert(b); + + return memcmp(a, b, sizeof(*a)); +} + +DEFINE_HASH_OPS( + in6_addr_hash_ops, + struct in6_addr, + in6_addr_hash_func, + in6_addr_compare_func); + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + in6_addr_hash_ops_free, + struct in6_addr, + in6_addr_hash_func, + in6_addr_compare_func, + free); diff --git a/src/basic/in-addr-util.h b/src/basic/in-addr-util.h new file mode 100644 index 0000000..12720ca --- /dev/null +++ b/src/basic/in-addr-util.h @@ -0,0 +1,213 
@@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "hash-funcs.h" +#include "macro.h" + +/* Holds either an IPv4 or an IPv6 address; 'bytes' overlays the same storage as raw octets and is sized to the larger of the two. */ union in_addr_union { + struct in_addr in; + struct in6_addr in6; + uint8_t bytes[CONST_MAX(sizeof(struct in_addr), sizeof(struct in6_addr))]; +}; + +/* An address union together with a family value. */ struct in_addr_data { + int family; + union in_addr_union address; +}; + +bool in4_addr_is_null(const struct in_addr *a); +static inline bool in4_addr_is_set(const struct in_addr *a) { + return !in4_addr_is_null(a); +} +bool in6_addr_is_null(const struct in6_addr *a); +static inline bool in6_addr_is_set(const struct in6_addr *a) { + return !in6_addr_is_null(a); +} +int in_addr_is_null(int family, const union in_addr_union *u); +/* in_addr_is_null() returns int rather than bool, so "set" means it returned exactly 0. */ static inline bool in_addr_is_set(int family, const union in_addr_union *u) { + return in_addr_is_null(family, u) == 0; +} +/* Like in_addr_is_null(), with family and address taken from the struct. */ static inline int in_addr_data_is_null(const struct in_addr_data *a) { + assert(a); + return in_addr_is_null(a->family, &a->address); +} +/* Inverse of in_addr_data_is_null(), following in_addr_is_set(): true only when the null check returns exactly 0, so a null address (and any other non-zero result) counts as "not set". */ static inline bool in_addr_data_is_set(const struct in_addr_data *a) { + return in_addr_data_is_null(a) == 0; +} + +int in_addr_is_multicast(int family, const union in_addr_union *u); + +bool in4_addr_is_link_local(const struct in_addr *a); +bool in4_addr_is_link_local_dynamic(const struct in_addr *a); +bool in6_addr_is_link_local(const struct in6_addr *a); +int in_addr_is_link_local(int family, const union in_addr_union *u); +bool in6_addr_is_link_local_all_nodes(const struct in6_addr *a); + +bool in4_addr_is_localhost(const struct in_addr *a); +int in_addr_is_localhost(int family, const union in_addr_union *u); +int in_addr_is_localhost_one(int family, const union in_addr_union *u); + +bool in4_addr_is_local_multicast(const struct in_addr *a); +bool in4_addr_is_non_local(const struct in_addr *a); +bool in6_addr_is_ipv4_mapped_address(const struct in6_addr *a); + +bool in4_addr_equal(const struct in_addr *a, const struct in_addr *b); +bool in6_addr_equal(const struct in6_addr *a, const
struct in6_addr *b); +int in_addr_equal(int family, const union in_addr_union *a, const union in_addr_union *b); +int in_addr_prefix_intersect(int family, const union in_addr_union *a, unsigned aprefixlen, const union in_addr_union *b, unsigned bprefixlen); +int in_addr_prefix_next(int family, union in_addr_union *u, unsigned prefixlen); +int in_addr_prefix_nth(int family, union in_addr_union *u, unsigned prefixlen, uint64_t nth); +int in_addr_random_prefix(int family, union in_addr_union *u, unsigned prefixlen_fixed_part, unsigned prefixlen); +int in_addr_prefix_range( + int family, + const union in_addr_union *in, + unsigned prefixlen, + union in_addr_union *ret_start, + union in_addr_union *ret_end); + +int in_addr_to_string(int family, const union in_addr_union *u, char **ret); +static inline int in6_addr_to_string(const struct in6_addr *u, char **ret) { + return in_addr_to_string(AF_INET6, (const union in_addr_union*) u, ret); +} + +static inline const char* typesafe_inet_ntop(int family, const union in_addr_union *a, char *buf, size_t len) { + return inet_ntop(family, a, buf, len); +} +static inline const char* typesafe_inet_ntop4(const struct in_addr *a, char *buf, size_t len) { + return inet_ntop(AF_INET, a, buf, len); +} +static inline const char* typesafe_inet_ntop6(const struct in6_addr *a, char *buf, size_t len) { + return inet_ntop(AF_INET6, a, buf, len); +} + +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define IN_ADDR_MAX CONST_MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) +#define IN_ADDR_TO_STRING(family, addr) typesafe_inet_ntop(family, addr, (char[IN_ADDR_MAX]){}, IN_ADDR_MAX) +#define IN4_ADDR_TO_STRING(addr) typesafe_inet_ntop4(addr, (char[INET_ADDRSTRLEN]){}, INET_ADDRSTRLEN) +#define IN6_ADDR_TO_STRING(addr) typesafe_inet_ntop6(addr, (char[INET6_ADDRSTRLEN]){}, INET6_ADDRSTRLEN) + +int 
in_addr_prefix_to_string( + int family, + const union in_addr_union *u, + unsigned prefixlen, + char *buf, + size_t buf_len); + +static inline const char* _in_addr_prefix_to_string( + int family, + const union in_addr_union *u, + unsigned prefixlen, + char *buf, + size_t buf_len) { + /* We assume that this is called with an appropriately sized buffer and can never fail. */ + assert_se(in_addr_prefix_to_string(family, u, prefixlen, buf, buf_len) == 0); + return buf; +} +static inline const char* _in4_addr_prefix_to_string(const struct in_addr *a, unsigned prefixlen, char *buf, size_t buf_len) { + return _in_addr_prefix_to_string(AF_INET, (const union in_addr_union *) a, prefixlen, buf, buf_len); +} +static inline const char* _in6_addr_prefix_to_string(const struct in6_addr *a, unsigned prefixlen, char *buf, size_t buf_len) { + return _in_addr_prefix_to_string(AF_INET6, (const union in_addr_union *) a, prefixlen, buf, buf_len); +} + +#define PREFIX_SUFFIX_MAX (1 + DECIMAL_STR_MAX(unsigned)) +#define IN_ADDR_PREFIX_TO_STRING(family, addr, prefixlen) \ + _in_addr_prefix_to_string(family, addr, prefixlen, (char[IN_ADDR_MAX + PREFIX_SUFFIX_MAX]){}, IN_ADDR_MAX + PREFIX_SUFFIX_MAX) +#define IN4_ADDR_PREFIX_TO_STRING(addr, prefixlen) \ + _in4_addr_prefix_to_string(addr, prefixlen, (char[INET_ADDRSTRLEN + PREFIX_SUFFIX_MAX]){}, INET_ADDRSTRLEN + PREFIX_SUFFIX_MAX) +#define IN6_ADDR_PREFIX_TO_STRING(addr, prefixlen) \ + _in6_addr_prefix_to_string(addr, prefixlen, (char[INET6_ADDRSTRLEN + PREFIX_SUFFIX_MAX]){}, INET6_ADDRSTRLEN + PREFIX_SUFFIX_MAX) + +int in_addr_port_ifindex_name_to_string(int family, const union in_addr_union *u, uint16_t port, int ifindex, const char *server_name, char **ret); +static inline int in_addr_ifindex_to_string(int family, const union in_addr_union *u, int ifindex, char **ret) { + return in_addr_port_ifindex_name_to_string(family, u, 0, ifindex, NULL, ret); +} +static inline int in_addr_port_to_string(int family, const union in_addr_union *u, 
uint16_t port, char **ret) { + return in_addr_port_ifindex_name_to_string(family, u, port, 0, NULL, ret); +} +int in_addr_from_string(int family, const char *s, union in_addr_union *ret); +int in_addr_from_string_auto(const char *s, int *ret_family, union in_addr_union *ret); + +unsigned char in4_addr_netmask_to_prefixlen(const struct in_addr *addr); +struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen); +struct in6_addr* in6_addr_prefixlen_to_netmask(struct in6_addr *addr, unsigned char prefixlen); +int in_addr_prefixlen_to_netmask(int family, union in_addr_union *addr, unsigned char prefixlen); +int in4_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen); +int in4_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask); +int in4_addr_mask(struct in_addr *addr, unsigned char prefixlen); +int in6_addr_mask(struct in6_addr *addr, unsigned char prefixlen); +int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen); +int in4_addr_prefix_covers_full(const struct in_addr *prefix, unsigned char prefixlen, const struct in_addr *address, unsigned char address_prefixlen); +int in6_addr_prefix_covers_full(const struct in6_addr *prefix, unsigned char prefixlen, const struct in6_addr *address, unsigned char address_prefixlen); +int in_addr_prefix_covers_full(int family, const union in_addr_union *prefix, unsigned char prefixlen, const union in_addr_union *address, unsigned char address_prefixlen); +static inline int in4_addr_prefix_covers(const struct in_addr *prefix, unsigned char prefixlen, const struct in_addr *address) { + return in4_addr_prefix_covers_full(prefix, prefixlen, address, 32); +} +static inline int in6_addr_prefix_covers(const struct in6_addr *prefix, unsigned char prefixlen, const struct in6_addr *address) { + return in6_addr_prefix_covers_full(prefix, prefixlen, address, 128); +} +static inline int in_addr_prefix_covers(int family, const union 
in_addr_union *prefix, unsigned char prefixlen, const union in_addr_union *address) { + return in_addr_prefix_covers_full(family, prefix, prefixlen, address, family == AF_INET ? 32 : family == AF_INET6 ? 128 : 0); +} +int in_addr_parse_prefixlen(int family, const char *p, unsigned char *ret); +int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen); + +typedef enum InAddrPrefixLenMode { + PREFIXLEN_FULL, /* Default to prefixlen of address size, 32 for IPv4 or 128 for IPv6, if not specified. */ + PREFIXLEN_REFUSE, /* Fail with -ENOANO if prefixlen is not specified. */ +} InAddrPrefixLenMode; + +int in_addr_prefix_from_string_auto_internal(const char *p, InAddrPrefixLenMode mode, int *ret_family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen); +static inline int in_addr_prefix_from_string_auto(const char *p, int *ret_family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen) { + return in_addr_prefix_from_string_auto_internal(p, PREFIXLEN_FULL, ret_family, ret_prefix, ret_prefixlen); +} + +static inline size_t FAMILY_ADDRESS_SIZE(int family) { + assert(IN_SET(family, AF_INET, AF_INET6)); + return family == AF_INET6 ? 16 : 4; +} + +#define FAMILY_ADDRESS_SIZE_SAFE(f) \ + ({ \ + int _f = (f); \ + _f == AF_INET ? sizeof(struct in_addr) : \ + _f == AF_INET6 ? sizeof(struct in6_addr) : 0; \ + }) + +/* Workaround for clang, explicitly specify the maximum-size element here. + * See also oss-fuzz#11344. 
*/ +#define IN_ADDR_NULL ((union in_addr_union) { .in6 = {} }) + +void in_addr_data_hash_func(const struct in_addr_data *a, struct siphash *state); +int in_addr_data_compare_func(const struct in_addr_data *x, const struct in_addr_data *y); +void in6_addr_hash_func(const struct in6_addr *addr, struct siphash *state); +int in6_addr_compare_func(const struct in6_addr *a, const struct in6_addr *b); + +extern const struct hash_ops in_addr_data_hash_ops; +extern const struct hash_ops in_addr_data_hash_ops_free; +extern const struct hash_ops in6_addr_hash_ops; +extern const struct hash_ops in6_addr_hash_ops_free; + +static inline void PTR_TO_IN4_ADDR(const void *p, struct in_addr *ret) { + assert(ret); + ret->s_addr = (uint32_t) ((uintptr_t) p); +} + +static inline void* IN4_ADDR_TO_PTR(const struct in_addr *a) { + assert(a); + return (void*) ((uintptr_t) a->s_addr); +} + +#define IPV4_ADDRESS_FMT_STR "%u.%u.%u.%u" +#define IPV4_ADDRESS_FMT_VAL(address) \ + be32toh((address).s_addr) >> 24, \ + (be32toh((address).s_addr) >> 16) & 0xFFu, \ + (be32toh((address).s_addr) >> 8) & 0xFFu, \ + be32toh((address).s_addr) & 0xFFu diff --git a/src/basic/initrd-util.c b/src/basic/initrd-util.c new file mode 100644 index 0000000..03ccfbe --- /dev/null +++ b/src/basic/initrd-util.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "env-util.h" +#include "errno-util.h" +#include "initrd-util.h" +#include "parse-util.h" +#include "stat-util.h" +#include "string-util.h" + +static int saved_in_initrd = -1; + +bool in_initrd(void) { + int r; + + if (saved_in_initrd >= 0) + return saved_in_initrd; + + /* If /etc/initrd-release exists, we're in an initrd. + * This can be overridden by setting SYSTEMD_IN_INITRD=0|1. 
+ */ + + r = getenv_bool_secure("SYSTEMD_IN_INITRD"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_IN_INITRD, ignoring: %m"); + + if (r >= 0) + saved_in_initrd = r > 0; + else { + r = RET_NERRNO(access("/etc/initrd-release", F_OK)); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to check if /etc/initrd-release exists, assuming it does not: %m"); + saved_in_initrd = r >= 0; + } + + return saved_in_initrd; +} + +void in_initrd_force(bool value) { + saved_in_initrd = value; +} diff --git a/src/basic/initrd-util.h b/src/basic/initrd-util.h new file mode 100644 index 0000000..173093c --- /dev/null +++ b/src/basic/initrd-util.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +bool in_initrd(void); +void in_initrd_force(bool value); diff --git a/src/basic/inotify-util.c b/src/basic/inotify-util.c new file mode 100644 index 0000000..ee9b416 --- /dev/null +++ b/src/basic/inotify-util.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "inotify-util.h" +#include "stat-util.h" + +bool inotify_event_next( + union inotify_event_buffer *buffer, + size_t size, + struct inotify_event **iterator, + int log_level) { + + struct inotify_event *e; + size_t offset = 0; + + assert(buffer); + assert(iterator); + + if (*iterator) { + assert((uint8_t*) *iterator >= buffer->raw); + offset = (uint8_t*) *iterator - buffer->raw; + offset += offsetof(struct inotify_event, name) + (*iterator)->len; + } + + if (size == offset) + return false; /* reached end of list */ + + if (size < offset || + size - offset < offsetof(struct inotify_event, name)) { + log_full(log_level, "Received invalid inotify event, ignoring."); + return false; + } + + e = CAST_ALIGN_PTR(struct inotify_event, buffer->raw + offset); + if (size - offset - offsetof(struct inotify_event, name) < e->len) { + log_full(log_level, "Received invalid inotify event, ignoring."); + return false; + } + + 
*iterator = e; + return true; +} + +int inotify_add_watch_fd(int fd, int what, uint32_t mask) { + int wd, r; + + /* This is like inotify_add_watch(), except that the file to watch is not referenced by a path, but by an fd */ + wd = inotify_add_watch(fd, FORMAT_PROC_FD_PATH(what), mask); + if (wd < 0) { + if (errno != ENOENT) + return -errno; + + /* Didn't work with ENOENT? If so, then either /proc/ isn't mounted, or the fd is bad */ + r = proc_mounted(); + if (r == 0) + return -ENOSYS; + if (r > 0) + return -EBADF; + + return -ENOENT; /* OK, no clue, let's propagate the original error */ + } + + return wd; +} + +int inotify_add_watch_and_warn(int fd, const char *pathname, uint32_t mask) { + int wd; + + wd = inotify_add_watch(fd, pathname, mask); + if (wd < 0) { + if (errno == ENOSPC) + return log_error_errno(errno, "Failed to add a watch for %s: inotify watch limit reached", pathname); + + return log_error_errno(errno, "Failed to add a watch for %s: %m", pathname); + } + + return wd; +} diff --git a/src/basic/inotify-util.h b/src/basic/inotify-util.h new file mode 100644 index 0000000..665fdac --- /dev/null +++ b/src/basic/inotify-util.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "log.h" + +#define INOTIFY_EVENT_MAX (offsetof(struct inotify_event, name) + NAME_MAX + 1) + +/* This evaluates arguments multiple times */ +#define FOREACH_INOTIFY_EVENT_FULL(e, buffer, sz, log_level) \ + for (struct inotify_event *e = NULL; \ + inotify_event_next(&buffer, sz, &e, log_level); ) + +#define FOREACH_INOTIFY_EVENT(e, buffer, sz) \ + FOREACH_INOTIFY_EVENT_FULL(e, buffer, sz, LOG_DEBUG) + +#define FOREACH_INOTIFY_EVENT_WARN(e, buffer, sz) \ + FOREACH_INOTIFY_EVENT_FULL(e, buffer, sz, LOG_WARNING) + +union inotify_event_buffer { + struct inotify_event ev; + uint8_t raw[INOTIFY_EVENT_MAX]; +}; + +bool inotify_event_next( + union inotify_event_buffer *buffer, + size_t size, + struct 
inotify_event **iterator, + int log_level); + +int inotify_add_watch_fd(int fd, int what, uint32_t mask); +int inotify_add_watch_and_warn(int fd, const char *pathname, uint32_t mask); diff --git a/src/basic/io-util.c b/src/basic/io-util.c new file mode 100644 index 0000000..6bcbef3 --- /dev/null +++ b/src/basic/io-util.c @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "errno-util.h" +#include "io-util.h" +#include "iovec-util.h" +#include "string-util.h" +#include "time-util.h" + +int flush_fd(int fd) { + int count = 0; + + /* Read from the specified file descriptor, until POLLIN is not set anymore, throwing away everything + * read. Note that some file descriptors (notably IP sockets) will trigger POLLIN even when no data can be read + * (due to IP packet checksum mismatches), hence this function is only safe to be non-blocking if the fd used + * was set to non-blocking too. */ + + for (;;) { + char buf[LINE_MAX]; + ssize_t l; + int r; + + r = fd_wait_for_event(fd, POLLIN, 0); + if (r < 0) { + if (r == -EINTR) + continue; + + return r; + } + if (r == 0) + return count; + + l = read(fd, buf, sizeof(buf)); + if (l < 0) { + if (errno == EINTR) + continue; + + if (errno == EAGAIN) + return count; + + return -errno; + } else if (l == 0) + return count; + + count = l > INT_MAX - count ? INT_MAX : count + (int) l; /* Saturate: signed overflow would be undefined behaviour */ + } +} + +ssize_t loop_read(int fd, void *buf, size_t nbytes, bool do_poll) { + uint8_t *p = ASSERT_PTR(buf); + ssize_t n = 0; + + assert(fd >= 0); + + /* If called with nbytes == 0, let's call read() at least once, to validate the operation */ + + if (nbytes > (size_t) SSIZE_MAX) + return -EINVAL; + + do { + ssize_t k; + + k = read(fd, p, nbytes); + if (k < 0) { + if (errno == EINTR) + continue; + + if (errno == EAGAIN && do_poll) { + + /* We knowingly ignore any return value here, + * and expect that any error/EOF is reported + * via read() */ + + (void) fd_wait_for_event(fd, POLLIN, USEC_INFINITY); + continue; + } + +
return n > 0 ? n : -errno; + } + + if (k == 0) + return n; + + assert((size_t) k <= nbytes); + + p += k; + nbytes -= k; + n += k; + } while (nbytes > 0); + + return n; +} + +int loop_read_exact(int fd, void *buf, size_t nbytes, bool do_poll) { + ssize_t n; + + n = loop_read(fd, buf, nbytes, do_poll); + if (n < 0) + return (int) n; + if ((size_t) n != nbytes) + return -EIO; + + return 0; +} + +int loop_write_full(int fd, const void *buf, size_t nbytes, usec_t timeout) { + const uint8_t *p; + usec_t end; + int r; + + assert(fd >= 0); + assert(buf || nbytes == 0); + + if (nbytes == 0) { + static const dummy_t dummy[0]; + assert_cc(sizeof(dummy) == 0); + p = (const void*) dummy; /* Some valid pointer, in case NULL was specified */ + } else { + if (nbytes == SIZE_MAX) + nbytes = strlen(buf); + else if (_unlikely_(nbytes > (size_t) SSIZE_MAX)) + return -EINVAL; + + p = buf; + } + + /* When timeout is 0 or USEC_INFINITY this is not used. But we initialize it to a sensible value. */ + end = timestamp_is_set(timeout) ? 
usec_add(now(CLOCK_MONOTONIC), timeout) : USEC_INFINITY; + + do { + ssize_t k; + + k = write(fd, p, nbytes); + if (k < 0) { + if (errno == EINTR) + continue; + + if (errno != EAGAIN || timeout == 0) + return -errno; + + usec_t wait_for; + + if (timeout == USEC_INFINITY) + wait_for = USEC_INFINITY; + else { + usec_t t = now(CLOCK_MONOTONIC); + if (t >= end) + return -ETIME; + + wait_for = usec_sub_unsigned(end, t); + } + + r = fd_wait_for_event(fd, POLLOUT, wait_for); + if (timeout == USEC_INFINITY || ERRNO_IS_NEG_TRANSIENT(r)) + /* If timeout == USEC_INFINITY we knowingly ignore any return value + * here, and expect that any error/EOF is reported via write() */ + continue; + if (r < 0) + return r; + if (r == 0) + return -ETIME; + continue; + } + + if (_unlikely_(nbytes > 0 && k == 0)) /* Can't really happen */ + return -EIO; + + assert((size_t) k <= nbytes); + + p += k; + nbytes -= k; + } while (nbytes > 0); + + return 0; +} + +int pipe_eof(int fd) { + int r; + + r = fd_wait_for_event(fd, POLLIN, 0); + if (r <= 0) + return r; + + return !!(r & POLLHUP); +} + +int ppoll_usec(struct pollfd *fds, size_t nfds, usec_t timeout) { + int r; + + assert(fds || nfds == 0); + + /* This is a wrapper around ppoll() that does primarily two things: + * + * ✅ Takes a usec_t instead of a struct timespec + * + * ✅ Guarantees that if an invalid fd is specified we return EBADF (i.e. converts POLLNVAL to + * EBADF). This is done because EBADF is a programming error usually, and hence should bubble up + * as error, and not be eaten up as non-error POLLNVAL event. + * + * ⚠️ ⚠️ ⚠️ Note that this function does not add any special handling for EINTR. Don't forget + * poll()/ppoll() will return with EINTR on any received signal always, there is no automatic + * restarting via SA_RESTART available. 
Thus, typically you want to handle EINTR not as an error, + * but just as reason to restart things, under the assumption you use a more appropriate mechanism + * to handle signals, such as signalfd() or signal handlers. ⚠️ ⚠️ ⚠️ + */ + + if (nfds == 0) + return 0; + + r = ppoll(fds, nfds, timeout == USEC_INFINITY ? NULL : TIMESPEC_STORE(timeout), NULL); + if (r < 0) + return -errno; + if (r == 0) + return 0; + + for (size_t i = 0, n = r; i < nfds && n > 0; i++) { + if (fds[i].revents == 0) + continue; + if (fds[i].revents & POLLNVAL) + return -EBADF; + n--; + } + + return r; +} + +int fd_wait_for_event(int fd, int event, usec_t timeout) { + struct pollfd pollfd = { + .fd = fd, + .events = event, + }; + int r; + + /* ⚠️ ⚠️ ⚠️ Keep in mind you almost certainly want to handle -EINTR gracefully in the caller, see + * ppoll_usec() above! ⚠️ ⚠️ ⚠️ */ + + r = ppoll_usec(&pollfd, 1, timeout); + if (r <= 0) + return r; + + return pollfd.revents; +} + +static size_t nul_length(const uint8_t *p, size_t sz) { + size_t n = 0; + + while (sz > 0) { + if (*p != 0) + break; + + n++; + p++; + sz--; + } + + return n; +} + +ssize_t sparse_write(int fd, const void *p, size_t sz, size_t run_length) { + const uint8_t *q, *w, *e; + ssize_t l; + + q = w = p; + e = q + sz; + while (q < e) { + size_t n; + + n = nul_length(q, e - q); + + /* If there are more than the specified run length of + * NUL bytes, or if this is the beginning or the end + * of the buffer, then seek instead of write */ + if ((n > run_length) || + (n > 0 && q == p) || + (n > 0 && q + n >= e)) { + if (q > w) { + l = write(fd, w, q - w); + if (l < 0) + return -errno; + if (l != q -w) + return -EIO; + } + + if (lseek(fd, n, SEEK_CUR) < 0) + return -errno; + + q += n; + w = q; + } else if (n > 0) + q += n; + else + q++; + } + + if (q > w) { + l = write(fd, w, q - w); + if (l < 0) + return -errno; + if (l != q - w) + return -EIO; + } + + return q - (const uint8_t*) p; +} diff --git a/src/basic/io-util.h b/src/basic/io-util.h 
new file mode 100644 index 0000000..e027c1a --- /dev/null +++ b/src/basic/io-util.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "macro.h" +#include "time-util.h" + +int flush_fd(int fd); + +ssize_t loop_read(int fd, void *buf, size_t nbytes, bool do_poll); +int loop_read_exact(int fd, void *buf, size_t nbytes, bool do_poll); + +int loop_write_full(int fd, const void *buf, size_t nbytes, usec_t timeout); +static inline int loop_write(int fd, const void *buf, size_t nbytes) { + return loop_write_full(fd, buf, nbytes, 0); +} + +int pipe_eof(int fd); + +int ppoll_usec(struct pollfd *fds, size_t nfds, usec_t timeout); +int fd_wait_for_event(int fd, int event, usec_t timeout); + +ssize_t sparse_write(int fd, const void *p, size_t sz, size_t run_length); + +static inline bool FILE_SIZE_VALID(uint64_t l) { + /* ftruncate() and friends take an unsigned file size, but actually cannot deal with file sizes larger than + * 2^63 since the kernel internally handles it as signed value. This call allows checking for this early. */ + + return (l >> 63) == 0; +} + +static inline bool FILE_SIZE_VALID_OR_INFINITY(uint64_t l) { + + /* Same as above, but allows one extra value: -1 as indication for infinity. 
*/ + + if (l == UINT64_MAX) + return true; + + return FILE_SIZE_VALID(l); + +} diff --git a/src/basic/ioprio-util.c b/src/basic/ioprio-util.c new file mode 100644 index 0000000..b63650b --- /dev/null +++ b/src/basic/ioprio-util.c @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "ioprio-util.h" +#include "parse-util.h" +#include "string-table.h" + +int ioprio_parse_priority(const char *s, int *ret) { + int i, r; + + assert(s); + assert(ret); + + r = safe_atoi(s, &i); + if (r < 0) + return r; + + if (!ioprio_priority_is_valid(i)) + return -EINVAL; + + *ret = i; + return 0; +} + +static const char *const ioprio_class_table[] = { + [IOPRIO_CLASS_NONE] = "none", + [IOPRIO_CLASS_RT] = "realtime", + [IOPRIO_CLASS_BE] = "best-effort", + [IOPRIO_CLASS_IDLE] = "idle", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(ioprio_class, int, IOPRIO_N_CLASSES); diff --git a/src/basic/ioprio-util.h b/src/basic/ioprio-util.h new file mode 100644 index 0000000..b8c9b7d --- /dev/null +++ b/src/basic/ioprio-util.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" +#include "missing_ioprio.h" + +int ioprio_class_to_string_alloc(int i, char **s); +int ioprio_class_from_string(const char *s); + +static inline bool ioprio_class_is_valid(int i) { + return IN_SET(i, IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE); +} + +static inline bool ioprio_priority_is_valid(int i) { + return i >= 0 && i < IOPRIO_BE_NR; +} + +int ioprio_parse_priority(const char *s, int *ret); + +/* IOPRIO_CLASS_NONE with any prio value is another way to say IOPRIO_CLASS_BE with level 4. Encode that in a + * proper macro. */ +#define IOPRIO_DEFAULT_CLASS_AND_PRIO ioprio_prio_value(IOPRIO_CLASS_BE, 4) + +static inline int ioprio_normalize(int v) { + /* Converts IOPRIO_CLASS_NONE to what it actually means */ + return ioprio_prio_class(v) == IOPRIO_CLASS_NONE ? 
IOPRIO_DEFAULT_CLASS_AND_PRIO : v; +} diff --git a/src/basic/iovec-util.c b/src/basic/iovec-util.c new file mode 100644 index 0000000..991889a --- /dev/null +++ b/src/basic/iovec-util.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "iovec-util.h" +#include "string-util.h" + +size_t iovec_total_size(const struct iovec *iovec, size_t n) { + size_t sum = 0; + + assert(iovec || n == 0); + + FOREACH_ARRAY(j, iovec, n) + sum += j->iov_len; + + return sum; +} + +bool iovec_increment(struct iovec *iovec, size_t n, size_t k) { + assert(iovec || n == 0); + + /* Returns true if there is nothing else to send (bytes written cover all of the iovec), + * false if there's still work to do. */ + + FOREACH_ARRAY(j, iovec, n) { + size_t sub; + + if (j->iov_len == 0) + continue; + if (k == 0) + return false; + + sub = MIN(j->iov_len, k); + j->iov_len -= sub; + j->iov_base = (uint8_t*) j->iov_base + sub; + k -= sub; + } + + assert(k == 0); /* Anything else would mean that we wrote more bytes than available, + * or the kernel reported writing more bytes than sent. 
*/ + return true; +} + +char* set_iovec_string_field(struct iovec *iovec, size_t *n_iovec, const char *field, const char *value) { + char *x; + + assert(iovec); + assert(n_iovec); + + x = strjoin(field, value); + if (x) + iovec[(*n_iovec)++] = IOVEC_MAKE_STRING(x); + return x; +} + +char* set_iovec_string_field_free(struct iovec *iovec, size_t *n_iovec, const char *field, char *value) { + char *x; + + assert(iovec); + assert(n_iovec); + + x = set_iovec_string_field(iovec, n_iovec, field, value); + free(value); + return x; +} + +void iovec_array_free(struct iovec *iovec, size_t n) { + FOREACH_ARRAY(i, iovec, n) + free(i->iov_base); + + free(iovec); +} diff --git a/src/basic/iovec-util.h b/src/basic/iovec-util.h new file mode 100644 index 0000000..39feabd --- /dev/null +++ b/src/basic/iovec-util.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "alloc-util.h" +#include "macro.h" + +size_t iovec_total_size(const struct iovec *iovec, size_t n); + +bool iovec_increment(struct iovec *iovec, size_t n, size_t k); + +#define IOVEC_MAKE(base, len) (struct iovec) { .iov_base = (base), .iov_len = (len) } +#define IOVEC_MAKE_STRING(string) \ + ({ \ + const char *_s = (string); \ + IOVEC_MAKE((char*) _s, strlen(_s)); \ + }) + +static inline void iovec_done(struct iovec *iovec) { + /* A _cleanup_() helper that frees the iov_base in the iovec */ + assert(iovec); + + iovec->iov_base = mfree(iovec->iov_base); + iovec->iov_len = 0; +} + +static inline void iovec_done_erase(struct iovec *iovec) { + assert(iovec); + + iovec->iov_base = erase_and_free(iovec->iov_base); + iovec->iov_len = 0; +} + +static inline bool iovec_is_set(const struct iovec *iovec) { + return iovec && iovec->iov_len > 0 && iovec->iov_base; +} + +char* set_iovec_string_field(struct iovec *iovec, size_t *n_iovec, const char *field, const char *value); +char* set_iovec_string_field_free(struct iovec *iovec, size_t *n_iovec, const char 
*field, char *value); + +void iovec_array_free(struct iovec *iovec, size_t n); diff --git a/src/basic/iovec-wrapper.c b/src/basic/iovec-wrapper.c new file mode 100644 index 0000000..b335acd --- /dev/null +++ b/src/basic/iovec-wrapper.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "iovec-util.h" +#include "iovec-wrapper.h" +#include "string-util.h" + +struct iovec_wrapper *iovw_new(void) { + return new0(struct iovec_wrapper, 1); +} + +void iovw_free_contents(struct iovec_wrapper *iovw, bool free_vectors) { + assert(iovw); + + if (free_vectors) + for (size_t i = 0; i < iovw->count; i++) + free(iovw->iovec[i].iov_base); + + iovw->iovec = mfree(iovw->iovec); + iovw->count = 0; +} + +struct iovec_wrapper *iovw_free_free(struct iovec_wrapper *iovw) { + if (!iovw) + return NULL; + + iovw_free_contents(iovw, /* free_vectors= */ true); + return mfree(iovw); +} + +struct iovec_wrapper *iovw_free(struct iovec_wrapper *iovw) { + if (!iovw) + return NULL; + + iovw_free_contents(iovw, /* free_vectors= */ false); + return mfree(iovw); +} + +int iovw_put(struct iovec_wrapper *iovw, void *data, size_t len) { + assert(iovw); + + if (len == 0) + return 0; + + assert(data); + + if (iovw->count >= IOV_MAX) + return -E2BIG; + + if (!GREEDY_REALLOC(iovw->iovec, iovw->count + 1)) + return -ENOMEM; + + iovw->iovec[iovw->count++] = IOVEC_MAKE(data, len); + return 0; +} + +int iovw_put_string_field(struct iovec_wrapper *iovw, const char *field, const char *value) { + _cleanup_free_ char *x = NULL; + int r; + + assert(iovw); + + x = strjoin(field, value); + if (!x) + return -ENOMEM; + + r = iovw_put(iovw, x, strlen(x)); + if (r >= 0) + TAKE_PTR(x); + + return r; +} + +int iovw_put_string_field_free(struct iovec_wrapper *iovw, const char *field, char *value) { + _cleanup_free_ _unused_ char *free_ptr = value; + + return iovw_put_string_field(iovw, field, value); +} + +void iovw_rebase(struct iovec_wrapper *iovw, void *old, void *new) 
{ + assert(iovw); + + FOREACH_ARRAY(i, iovw->iovec, iovw->count) { + assert(i->iov_base >= old); + i->iov_base = (uint8_t*) i->iov_base - (uint8_t*) old + (uint8_t*) new; + } +} + +size_t iovw_size(const struct iovec_wrapper *iovw) { + if (!iovw) + return 0; + + return iovec_total_size(iovw->iovec, iovw->count); +} + +int iovw_append(struct iovec_wrapper *target, const struct iovec_wrapper *source) { + size_t original_count; + int r; + + assert(target); + + /* This duplicates the source and merges it into the target. */ + + if (iovw_isempty(source)) + return 0; + + original_count = target->count; + + FOREACH_ARRAY(iovec, source->iovec, source->count) { + void *dup; + + dup = memdup(iovec->iov_base, iovec->iov_len); + if (!dup) { + r = -ENOMEM; + goto rollback; + } + + r = iovw_consume(target, dup, iovec->iov_len); + if (r < 0) + goto rollback; + } + + return 0; + +rollback: + for (size_t i = original_count; i < target->count; i++) + free(target->iovec[i].iov_base); + + target->count = original_count; + return r; +} diff --git a/src/basic/iovec-wrapper.h b/src/basic/iovec-wrapper.h new file mode 100644 index 0000000..05e220c --- /dev/null +++ b/src/basic/iovec-wrapper.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" + +struct iovec_wrapper { + struct iovec *iovec; + size_t count; +}; + +struct iovec_wrapper *iovw_new(void); +struct iovec_wrapper *iovw_free(struct iovec_wrapper *iovw); +struct iovec_wrapper *iovw_free_free(struct iovec_wrapper *iovw); + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct iovec_wrapper*, iovw_free_free); + +void iovw_free_contents(struct iovec_wrapper *iovw, bool free_vectors); + +int iovw_put(struct iovec_wrapper *iovw, void *data, size_t len); +static inline int iovw_consume(struct iovec_wrapper *iovw, void *data, size_t len) { + /* Move data into iovw or free on error */ + int r; + + r = iovw_put(iovw, data, len); + if (r < 0) + free(data); + + return r; +} + +static 
inline bool iovw_isempty(const struct iovec_wrapper *iovw) { + return !iovw || iovw->count == 0; +} + +int iovw_put_string_field(struct iovec_wrapper *iovw, const char *field, const char *value); +int iovw_put_string_field_free(struct iovec_wrapper *iovw, const char *field, char *value); +void iovw_rebase(struct iovec_wrapper *iovw, void *old, void *new); +size_t iovw_size(const struct iovec_wrapper *iovw); +int iovw_append(struct iovec_wrapper *target, const struct iovec_wrapper *source); diff --git a/src/basic/label.c b/src/basic/label.c new file mode 100644 index 0000000..f134e77 --- /dev/null +++ b/src/basic/label.c @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "label.h" + +static const LabelOps *label_ops = NULL; + +int label_ops_set(const LabelOps *ops) { + if (label_ops) + return -EBUSY; + + label_ops = ops; + return 0; +} + +int label_ops_pre(int dir_fd, const char *path, mode_t mode) { + if (!label_ops || !label_ops->pre) + return 0; + + return label_ops->pre(dir_fd, path, mode); +} + +int label_ops_post(int dir_fd, const char *path) { + if (!label_ops || !label_ops->post) + return 0; + + return label_ops->post(dir_fd, path); +} diff --git a/src/basic/label.h b/src/basic/label.h new file mode 100644 index 0000000..9644e43 --- /dev/null +++ b/src/basic/label.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef struct LabelOps { + int (*pre)(int dir_fd, const char *path, mode_t mode); + int (*post)(int dir_fd, const char *path); +} LabelOps; + +int label_ops_set(const LabelOps *label_ops); + +int label_ops_pre(int dir_fd, const char *path, mode_t mode); +int label_ops_post(int dir_fd, const char *path); diff --git a/src/basic/limits-util.c b/src/basic/limits-util.c new file mode 100644 index 0000000..9597c4c --- /dev/null +++ b/src/basic/limits-util.c @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include 
"alloc-util.h" +#include "cgroup-util.h" +#include "limits-util.h" +#include "memory-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "string-util.h" + +uint64_t physical_memory(void) { + _cleanup_free_ char *root = NULL, *value = NULL; + uint64_t mem, lim; + size_t ps; + long sc; + int r; + + /* We return this as uint64_t in case we are running as 32-bit process on a 64-bit kernel with huge amounts of + * memory. + * + * In order to support containers nicely that have a configured memory limit we'll take the minimum of the + * physically reported amount of memory and the limit configured for the root cgroup, if there is any. */ + + sc = sysconf(_SC_PHYS_PAGES); + assert(sc > 0); + + ps = page_size(); + mem = (uint64_t) sc * (uint64_t) ps; + + r = cg_get_root_path(&root); + if (r < 0) { + log_debug_errno(r, "Failed to determine root cgroup, ignoring cgroup memory limit: %m"); + return mem; + } + + r = cg_all_unified(); + if (r < 0) { + log_debug_errno(r, "Failed to determine root unified mode, ignoring cgroup memory limit: %m"); + return mem; + } + if (r > 0) { + r = cg_get_attribute("memory", root, "memory.max", &value); + if (r == -ENOENT) /* Field does not exist on the system's top-level cgroup, hence don't + * complain. (Note that it might exist on our own root though, if we live + * in a cgroup namespace, hence check anyway instead of not even + * trying.) 
*/ + return mem; + if (r < 0) { + log_debug_errno(r, "Failed to read memory.max cgroup attribute, ignoring cgroup memory limit: %m"); + return mem; + } + + if (streq(value, "max")) + return mem; + } else { + r = cg_get_attribute("memory", root, "memory.limit_in_bytes", &value); + if (r < 0) { + log_debug_errno(r, "Failed to read memory.limit_in_bytes cgroup attribute, ignoring cgroup memory limit: %m"); + return mem; + } + } + + r = safe_atou64(value, &lim); + if (r < 0) { + log_debug_errno(r, "Failed to parse cgroup memory limit '%s', ignoring: %m", value); + return mem; + } + if (lim == UINT64_MAX) + return mem; + + /* Make sure the limit is a multiple of our own page size */ + lim /= ps; + lim *= ps; + + return MIN(mem, lim); +} + +uint64_t physical_memory_scale(uint64_t v, uint64_t max) { + uint64_t p, m, ps; + + /* Shortcut two special cases */ + if (v == 0) + return 0; + if (v == max) + return physical_memory(); + + assert(max > 0); + + /* Returns the physical memory size, multiplied by v divided by max. Returns UINT64_MAX on overflow. On success + * the result is a multiple of the page size (rounds down). */ + + ps = page_size(); + assert(ps > 0); + + p = physical_memory() / ps; + assert(p > 0); + + if (v > UINT64_MAX / p) + return UINT64_MAX; + + m = p * v; + m /= max; + + if (m > UINT64_MAX / ps) + return UINT64_MAX; + + return m * ps; +} + +uint64_t system_tasks_max(void) { + uint64_t a = TASKS_MAX, b = TASKS_MAX, c = TASKS_MAX; + _cleanup_free_ char *root = NULL; + int r; + + /* Determine the maximum number of tasks that may run on this system. We check three sources to + * determine this limit: + * + * a) kernel.threads-max sysctl: the maximum number of tasks (threads) the kernel allows. + * + * This puts a direct limit on the number of concurrent tasks. + * + * b) kernel.pid_max sysctl: the maximum PID value. + * + * This limits the numeric range PIDs can take, and thus indirectly also limits the number of + * concurrent threads. 
It's primarily a compatibility concept: some crappy old code used a signed + * 16-bit type for PIDs, hence the kernel provides a way to ensure the PIDs never go beyond + * INT16_MAX by default. + * + * Also note the weird definition: PIDs assigned will be kept below this value, which means + * the number of tasks that can be created is one lower, as PID 0 is not a valid process ID. + * + * c) pids.max on the root cgroup: the kernel's configured maximum number of tasks. + * + * and then pick the smallest of the three. + * + * By default pid_max is set to much lower values than threads-max, hence the limit people come into + * contact with first, as it's the lowest boundary they need to bump when they want higher number of + * processes. + */ + + r = procfs_get_threads_max(&a); + if (r < 0) + log_debug_errno(r, "Failed to read kernel.threads-max, ignoring: %m"); + + r = procfs_get_pid_max(&b); + if (r < 0) + log_debug_errno(r, "Failed to read kernel.pid_max, ignoring: %m"); + else if (b > 0) + /* Subtract one from pid_max, since PID 0 is not a valid PID */ + b--; + + r = cg_get_root_path(&root); + if (r < 0) + log_debug_errno(r, "Failed to determine cgroup root path, ignoring: %m"); + else { + /* We'll have the "pids.max" attribute on our root cgroup only if we are in a + * CLONE_NEWCGROUP namespace. On the top-level namespace this attribute is missing, hence + * suppress any message about that */ + r = cg_get_attribute_as_uint64("pids", root, "pids.max", &c); + if (r < 0 && r != -ENODATA) + log_debug_errno(r, "Failed to read pids.max attribute of root cgroup, ignoring: %m"); + } + + return MIN3(a, b, c); +} + +uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) { + uint64_t t, m; + + /* Shortcut two special cases */ + if (v == 0) + return 0; + if (v == max) + return system_tasks_max(); + + assert(max > 0); + + /* Multiply the system's task value by the fraction v/max.
Hence, if max==100 this calculates percentages + * relative to the system's maximum number of tasks. Returns UINT64_MAX on overflow. */ + + t = system_tasks_max(); + assert(t > 0); + + if (v > UINT64_MAX / t) /* overflow? */ + return UINT64_MAX; + + m = t * v; + return m / max; +} diff --git a/src/basic/limits-util.h b/src/basic/limits-util.h new file mode 100644 index 0000000..d267fcf --- /dev/null +++ b/src/basic/limits-util.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +uint64_t physical_memory(void); +uint64_t physical_memory_scale(uint64_t v, uint64_t max); + +uint64_t system_tasks_max(void); +uint64_t system_tasks_max_scale(uint64_t v, uint64_t max); diff --git a/src/basic/linux/README b/src/basic/linux/README new file mode 100644 index 0000000..790b4fe --- /dev/null +++ b/src/basic/linux/README @@ -0,0 +1,8 @@ +The files in this directory are copied from current kernel master +(b06ed1e7a2fa9b636f368a9e97c3c8877623f8b2) or WireGuard master +(8416093498ac2c754536dad4757c5d86c9ba8809), and the following +modifications are applied: +- btrfs.h: drop '__user' attributes +- if.h: drop '#include ' and '__user' attributes +- stddef.h: drop '#include ' +- guard linux/fs.h include to avoid conflict with glibc 2.36 diff --git a/src/basic/linux/batman_adv.h b/src/basic/linux/batman_adv.h new file mode 100644 index 0000000..35dc016 --- /dev/null +++ b/src/basic/linux/batman_adv.h @@ -0,0 +1,704 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright (C) B.A.T.M.A.N. contributors: + * + * Matthias Schiffer + */ + +#ifndef _UAPI_LINUX_BATMAN_ADV_H_ +#define _UAPI_LINUX_BATMAN_ADV_H_ + +#define BATADV_NL_NAME "batadv" + +#define BATADV_NL_MCAST_GROUP_CONFIG "config" +#define BATADV_NL_MCAST_GROUP_TPMETER "tpmeter" + +/** + * enum batadv_tt_client_flags - TT client specific flags + * + * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. 
+ * Bits from 8 to 15 are called _local flags_ because they are used for local + * computations only. + * + * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with + * the other nodes in the network. To achieve this goal these flags are included + * in the TT CRC computation. + */ +enum batadv_tt_client_flags { + /** + * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table + */ + BATADV_TT_CLIENT_DEL = (1 << 0), + + /** + * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and + * the new update telling its new real location has not been + * received/sent yet + */ + BATADV_TT_CLIENT_ROAM = (1 << 1), + + /** + * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi + * interface. This information is used by the "AP Isolation" feature + */ + BATADV_TT_CLIENT_WIFI = (1 << 4), + + /** + * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This + * information is used by the Extended Isolation feature + */ + BATADV_TT_CLIENT_ISOLA = (1 << 5), + + /** + * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from + * the table + */ + BATADV_TT_CLIENT_NOPURGE = (1 << 8), + + /** + * @BATADV_TT_CLIENT_NEW: this client has been added to the local table + * but has not been announced yet + */ + BATADV_TT_CLIENT_NEW = (1 << 9), + + /** + * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it + * is kept in the table for one more originator interval for consistency + * purposes + */ + BATADV_TT_CLIENT_PENDING = (1 << 10), + + /** + * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be + * part of the network but no node has already announced it + */ + BATADV_TT_CLIENT_TEMP = (1 << 11), +}; + +/** + * enum batadv_mcast_flags_priv - Private, own multicast flags + * + * These are internal, multicast related flags. Currently they describe certain + * multicast related attributes of the segment this originator bridges into the + * mesh. 
+ * + * Those attributes are used to determine the public multicast flags this + * originator is going to announce via TT. + * + * For netlink, if BATADV_MCAST_FLAGS_BRIDGED is unset then all querier + * related flags are undefined. + */ +enum batadv_mcast_flags_priv { + /** + * @BATADV_MCAST_FLAGS_BRIDGED: There is a bridge on top of the mesh + * interface. + */ + BATADV_MCAST_FLAGS_BRIDGED = (1 << 0), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS: Whether an IGMP querier + * exists in the mesh + */ + BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS = (1 << 1), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS: Whether an MLD querier + * exists in the mesh + */ + BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS = (1 << 2), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING: If an IGMP querier + * exists, whether it is potentially shadowing multicast listeners + * (i.e. querier is behind our own bridge segment) + */ + BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING = (1 << 3), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING: If an MLD querier + * exists, whether it is potentially shadowing multicast listeners + * (i.e. 
querier is behind our own bridge segment) + */ + BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING = (1 << 4), +}; + +/** + * enum batadv_gw_modes - gateway mode of node + */ +enum batadv_gw_modes { + /** @BATADV_GW_MODE_OFF: gw mode disabled */ + BATADV_GW_MODE_OFF, + + /** @BATADV_GW_MODE_CLIENT: send DHCP requests to gw servers */ + BATADV_GW_MODE_CLIENT, + + /** @BATADV_GW_MODE_SERVER: announce itself as gateway server */ + BATADV_GW_MODE_SERVER, +}; + +/** + * enum batadv_nl_attrs - batman-adv netlink attributes + */ +enum batadv_nl_attrs { + /** + * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors + */ + BATADV_ATTR_UNSPEC, + + /** + * @BATADV_ATTR_VERSION: batman-adv version string + */ + BATADV_ATTR_VERSION, + + /** + * @BATADV_ATTR_ALGO_NAME: name of routing algorithm + */ + BATADV_ATTR_ALGO_NAME, + + /** + * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface + */ + BATADV_ATTR_MESH_IFINDEX, + + /** + * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface + */ + BATADV_ATTR_MESH_IFNAME, + + /** + * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface + */ + BATADV_ATTR_MESH_ADDRESS, + + /** + * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface + */ + BATADV_ATTR_HARD_IFINDEX, + + /** + * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface + */ + BATADV_ATTR_HARD_IFNAME, + + /** + * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv + * interface + */ + BATADV_ATTR_HARD_ADDRESS, + + /** + * @BATADV_ATTR_ORIG_ADDRESS: originator mac address + */ + BATADV_ATTR_ORIG_ADDRESS, + + /** + * @BATADV_ATTR_TPMETER_RESULT: result of run (see + * batadv_tp_meter_status) + */ + BATADV_ATTR_TPMETER_RESULT, + + /** + * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took + */ + BATADV_ATTR_TPMETER_TEST_TIME, + + /** + * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run + */ + BATADV_ATTR_TPMETER_BYTES, + + /** + * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session + */ + 
BATADV_ATTR_TPMETER_COOKIE, + + /** + * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment + */ + BATADV_ATTR_PAD, + + /** + * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active + */ + BATADV_ATTR_ACTIVE, + + /** + * @BATADV_ATTR_TT_ADDRESS: Client MAC address + */ + BATADV_ATTR_TT_ADDRESS, + + /** + * @BATADV_ATTR_TT_TTVN: Translation table version + */ + BATADV_ATTR_TT_TTVN, + + /** + * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version + */ + BATADV_ATTR_TT_LAST_TTVN, + + /** + * @BATADV_ATTR_TT_CRC32: CRC32 over translation table + */ + BATADV_ATTR_TT_CRC32, + + /** + * @BATADV_ATTR_TT_VID: VLAN ID + */ + BATADV_ATTR_TT_VID, + + /** + * @BATADV_ATTR_TT_FLAGS: Translation table client flags + */ + BATADV_ATTR_TT_FLAGS, + + /** + * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best + */ + BATADV_ATTR_FLAG_BEST, + + /** + * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen + */ + BATADV_ATTR_LAST_SEEN_MSECS, + + /** + * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address + */ + BATADV_ATTR_NEIGH_ADDRESS, + + /** + * @BATADV_ATTR_TQ: TQ to neighbour + */ + BATADV_ATTR_TQ, + + /** + * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour + */ + BATADV_ATTR_THROUGHPUT, + + /** + * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth + */ + BATADV_ATTR_BANDWIDTH_UP, + + /** + * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth + */ + BATADV_ATTR_BANDWIDTH_DOWN, + + /** + * @BATADV_ATTR_ROUTER: Gateway router MAC address + */ + BATADV_ATTR_ROUTER, + + /** + * @BATADV_ATTR_BLA_OWN: Flag indicating own originator + */ + BATADV_ATTR_BLA_OWN, + + /** + * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address + */ + BATADV_ATTR_BLA_ADDRESS, + + /** + * @BATADV_ATTR_BLA_VID: BLA VLAN ID + */ + BATADV_ATTR_BLA_VID, + + /** + * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address + */ + BATADV_ATTR_BLA_BACKBONE, + + /** + * @BATADV_ATTR_BLA_CRC: BLA CRC + */ + 
BATADV_ATTR_BLA_CRC, + + /** + * @BATADV_ATTR_DAT_CACHE_IP4ADDRESS: Client IPv4 address + */ + BATADV_ATTR_DAT_CACHE_IP4ADDRESS, + + /** + * @BATADV_ATTR_DAT_CACHE_HWADDRESS: Client MAC address + */ + BATADV_ATTR_DAT_CACHE_HWADDRESS, + + /** + * @BATADV_ATTR_DAT_CACHE_VID: VLAN ID + */ + BATADV_ATTR_DAT_CACHE_VID, + + /** + * @BATADV_ATTR_MCAST_FLAGS: Per originator multicast flags + */ + BATADV_ATTR_MCAST_FLAGS, + + /** + * @BATADV_ATTR_MCAST_FLAGS_PRIV: Private, own multicast flags + */ + BATADV_ATTR_MCAST_FLAGS_PRIV, + + /** + * @BATADV_ATTR_VLANID: VLAN id on top of soft interface + */ + BATADV_ATTR_VLANID, + + /** + * @BATADV_ATTR_AGGREGATED_OGMS_ENABLED: whether the batman protocol + * messages of the mesh interface shall be aggregated or not. + */ + BATADV_ATTR_AGGREGATED_OGMS_ENABLED, + + /** + * @BATADV_ATTR_AP_ISOLATION_ENABLED: whether the data traffic going + * from a wireless client to another wireless client will be silently + * dropped. + */ + BATADV_ATTR_AP_ISOLATION_ENABLED, + + /** + * @BATADV_ATTR_ISOLATION_MARK: the isolation mark which is used to + * classify clients as "isolated" by the Extended Isolation feature. + */ + BATADV_ATTR_ISOLATION_MARK, + + /** + * @BATADV_ATTR_ISOLATION_MASK: the isolation (bit)mask which is used to + * classify clients as "isolated" by the Extended Isolation feature. + */ + BATADV_ATTR_ISOLATION_MASK, + + /** + * @BATADV_ATTR_BONDING_ENABLED: whether the data traffic going through + * the mesh will be sent using multiple interfaces at the same time. + */ + BATADV_ATTR_BONDING_ENABLED, + + /** + * @BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED: whether the bridge loop + * avoidance feature is enabled. This feature detects and avoids loops + * between the mesh and devices bridged with the soft interface + */ + BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, + + /** + * @BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED: whether the distributed + * arp table feature is enabled. 
This feature uses a distributed hash + * table to answer ARP requests without flooding the request through + * the whole mesh. + */ + BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, + + /** + * @BATADV_ATTR_FRAGMENTATION_ENABLED: whether the data traffic going + * through the mesh will be fragmented or silently discarded if the + * packet size exceeds the outgoing interface MTU. + */ + BATADV_ATTR_FRAGMENTATION_ENABLED, + + /** + * @BATADV_ATTR_GW_BANDWIDTH_DOWN: defines the download bandwidth which + * is propagated by this node if %BATADV_ATTR_GW_BANDWIDTH_MODE was set + * to 'server'. + */ + BATADV_ATTR_GW_BANDWIDTH_DOWN, + + /** + * @BATADV_ATTR_GW_BANDWIDTH_UP: defines the upload bandwidth which + * is propagated by this node if %BATADV_ATTR_GW_BANDWIDTH_MODE was set + * to 'server'. + */ + BATADV_ATTR_GW_BANDWIDTH_UP, + + /** + * @BATADV_ATTR_GW_MODE: defines the state of the gateway features. + * Possible values are specified in enum batadv_gw_modes + */ + BATADV_ATTR_GW_MODE, + + /** + * @BATADV_ATTR_GW_SEL_CLASS: defines the selection criteria this node + * will use to choose a gateway if gw_mode was set to 'client'. + */ + BATADV_ATTR_GW_SEL_CLASS, + + /** + * @BATADV_ATTR_HOP_PENALTY: defines the penalty which will be applied + * to an originator message's tq-field on every hop and/or per + * hard interface + */ + BATADV_ATTR_HOP_PENALTY, + + /** + * @BATADV_ATTR_LOG_LEVEL: bitmask with to define which debug messages + * should be send to the debug log/trace ring buffer + */ + BATADV_ATTR_LOG_LEVEL, + + /** + * @BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED: whether multicast + * optimizations should be replaced by simple broadcast-like flooding + * of multicast packets. If set to non-zero then all nodes in the mesh + * are going to use classic flooding for any multicast packet with no + * optimizations. 
+ */ + BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, + + /** + * @BATADV_ATTR_NETWORK_CODING_ENABLED: whether Network Coding (using + * some magic to send fewer wifi packets but still the same content) is + * enabled or not. + */ + BATADV_ATTR_NETWORK_CODING_ENABLED, + + /** + * @BATADV_ATTR_ORIG_INTERVAL: defines the interval in milliseconds in + * which batman sends its protocol messages. + */ + BATADV_ATTR_ORIG_INTERVAL, + + /** + * @BATADV_ATTR_ELP_INTERVAL: defines the interval in milliseconds in + * which batman emits probing packets for neighbor sensing (ELP). + */ + BATADV_ATTR_ELP_INTERVAL, + + /** + * @BATADV_ATTR_THROUGHPUT_OVERRIDE: defines the throughput value to be + * used by B.A.T.M.A.N. V when estimating the link throughput using + * this interface. If the value is set to 0 then batman-adv will try to + * estimate the throughput by itself. + */ + BATADV_ATTR_THROUGHPUT_OVERRIDE, + + /** + * @BATADV_ATTR_MULTICAST_FANOUT: defines the maximum number of packet + * copies that may be generated for a multicast-to-unicast conversion. + * Once this limit is exceeded distribution will fall back to broadcast. 
+ */ + BATADV_ATTR_MULTICAST_FANOUT, + + /* add attributes above here, update the policy in netlink.c */ + + /** + * @__BATADV_ATTR_AFTER_LAST: internal use + */ + __BATADV_ATTR_AFTER_LAST, + + /** + * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available + */ + NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, + + /** + * @BATADV_ATTR_MAX: highest attribute number currently defined + */ + BATADV_ATTR_MAX = __BATADV_ATTR_AFTER_LAST - 1 +}; + +/** + * enum batadv_nl_commands - supported batman-adv netlink commands + */ +enum batadv_nl_commands { + /** + * @BATADV_CMD_UNSPEC: unspecified command to catch errors + */ + BATADV_CMD_UNSPEC, + + /** + * @BATADV_CMD_GET_MESH: Get attributes from softif/mesh + */ + BATADV_CMD_GET_MESH, + + /** + * @BATADV_CMD_GET_MESH_INFO: Alias for @BATADV_CMD_GET_MESH + */ + BATADV_CMD_GET_MESH_INFO = BATADV_CMD_GET_MESH, + + /** + * @BATADV_CMD_TP_METER: Start a tp meter session + */ + BATADV_CMD_TP_METER, + + /** + * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session + */ + BATADV_CMD_TP_METER_CANCEL, + + /** + * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. 
+ */ + BATADV_CMD_GET_ROUTING_ALGOS, + + /** + * @BATADV_CMD_GET_HARDIF: Get attributes from a hardif of the + * current softif + */ + BATADV_CMD_GET_HARDIF, + + /** + * @BATADV_CMD_GET_HARDIFS: Alias for @BATADV_CMD_GET_HARDIF + */ + BATADV_CMD_GET_HARDIFS = BATADV_CMD_GET_HARDIF, + + /** + * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations + */ + BATADV_CMD_GET_TRANSTABLE_LOCAL, + + /** + * @BATADV_CMD_GET_TRANSTABLE_GLOBAL: Query list of global translations + */ + BATADV_CMD_GET_TRANSTABLE_GLOBAL, + + /** + * @BATADV_CMD_GET_ORIGINATORS: Query list of originators + */ + BATADV_CMD_GET_ORIGINATORS, + + /** + * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours + */ + BATADV_CMD_GET_NEIGHBORS, + + /** + * @BATADV_CMD_GET_GATEWAYS: Query list of gateways + */ + BATADV_CMD_GET_GATEWAYS, + + /** + * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims + */ + BATADV_CMD_GET_BLA_CLAIM, + + /** + * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance + * backbones + */ + BATADV_CMD_GET_BLA_BACKBONE, + + /** + * @BATADV_CMD_GET_DAT_CACHE: Query list of DAT cache entries + */ + BATADV_CMD_GET_DAT_CACHE, + + /** + * @BATADV_CMD_GET_MCAST_FLAGS: Query list of multicast flags + */ + BATADV_CMD_GET_MCAST_FLAGS, + + /** + * @BATADV_CMD_SET_MESH: Set attributes for softif/mesh + */ + BATADV_CMD_SET_MESH, + + /** + * @BATADV_CMD_SET_HARDIF: Set attributes for hardif of the + * current softif + */ + BATADV_CMD_SET_HARDIF, + + /** + * @BATADV_CMD_GET_VLAN: Get attributes from a VLAN of the + * current softif + */ + BATADV_CMD_GET_VLAN, + + /** + * @BATADV_CMD_SET_VLAN: Set attributes for VLAN of the + * current softif + */ + BATADV_CMD_SET_VLAN, + + /* add new commands above here */ + + /** + * @__BATADV_CMD_AFTER_LAST: internal use + */ + __BATADV_CMD_AFTER_LAST, + + /** + * @BATADV_CMD_MAX: highest used command number + */ + BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 +}; + +/** + * enum batadv_tp_meter_reason - reason of a tp 
meter test run stop + */ +enum batadv_tp_meter_reason { + /** + * @BATADV_TP_REASON_COMPLETE: sender finished tp run + */ + BATADV_TP_REASON_COMPLETE = 3, + + /** + * @BATADV_TP_REASON_CANCEL: sender was stopped during run + */ + BATADV_TP_REASON_CANCEL = 4, + + /* error status >= 128 */ + + /** + * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or + * didn't answer + */ + BATADV_TP_REASON_DST_UNREACHABLE = 128, + + /** + * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit + */ + BATADV_TP_REASON_RESEND_LIMIT = 129, + + /** + * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node + * already ongoing + */ + BATADV_TP_REASON_ALREADY_ONGOING = 130, + + /** + * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory + */ + BATADV_TP_REASON_MEMORY_ERROR = 131, + + /** + * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface + */ + BATADV_TP_REASON_CANT_SEND = 132, + + /** + * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions + */ + BATADV_TP_REASON_TOO_MANY = 133, +}; + +/** + * enum batadv_ifla_attrs - batman-adv ifla nested attributes + */ +enum batadv_ifla_attrs { + /** + * @IFLA_BATADV_UNSPEC: unspecified attribute which is not parsed by + * rtnetlink + */ + IFLA_BATADV_UNSPEC, + + /** + * @IFLA_BATADV_ALGO_NAME: routing algorithm (name) which should be + * used by the newly registered batadv net_device. + */ + IFLA_BATADV_ALGO_NAME, + + /* add attributes above here, update the policy in soft-interface.c */ + + /** + * @__IFLA_BATADV_MAX: internal use + */ + __IFLA_BATADV_MAX, +}; + +#define IFLA_BATADV_MAX (__IFLA_BATADV_MAX - 1) + +#endif /* _UAPI_LINUX_BATMAN_ADV_H_ */ diff --git a/src/basic/linux/btrfs.h b/src/basic/linux/btrfs.h new file mode 100644 index 0000000..74ed908 --- /dev/null +++ b/src/basic/linux/btrfs.h @@ -0,0 +1,1173 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _UAPI_LINUX_BTRFS_H +#define _UAPI_LINUX_BTRFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#if WANT_LINUX_FS_H +#include +#endif + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_LABEL_SIZE 256 + +/* this should be 4k */ +#define BTRFS_PATH_NAME_MAX 4087 +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 +#define BTRFS_SUBVOL_NAME_MAX 4039 + +#ifndef __KERNEL__ +/* Deprecated since 5.7 */ +# define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#endif +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) +#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) + +#define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3) + +#define BTRFS_SUBVOL_SPEC_BY_ID (1ULL << 4) + +#define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ + (BTRFS_SUBVOL_RDONLY | \ + BTRFS_SUBVOL_QGROUP_INHERIT | \ + BTRFS_DEVICE_SPEC_BY_ID | \ + BTRFS_SUBVOL_SPEC_BY_ID) + +#define BTRFS_FSID_SIZE 16 +#define BTRFS_UUID_SIZE 16 +#define BTRFS_UUID_UNPARSED_SIZE 37 + +/* + * flags definition for qgroup limits + * + * Used by: + * struct btrfs_qgroup_limit.flags + * struct btrfs_qgroup_limit_item.flags + */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) 
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) + +struct btrfs_qgroup_limit { + __u64 flags; + __u64 max_rfer; + __u64 max_excl; + __u64 rsv_rfer; + __u64 rsv_excl; +}; + +/* + * flags definition for qgroup inheritance + * + * Used by: + * struct btrfs_qgroup_inherit.flags + */ +#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) + +struct btrfs_qgroup_inherit { + __u64 flags; + __u64 num_qgroups; + __u64 num_ref_copies; + __u64 num_excl_copies; + struct btrfs_qgroup_limit lim; + __u64 qgroups[]; +}; + +struct btrfs_ioctl_qgroup_limit_args { + __u64 qgroupid; + struct btrfs_qgroup_limit lim; +}; + +/* + * Arguments for specification of subvolumes or devices, supporting by-name or + * by-id and flags + * + * The set of supported flags depends on the ioctl + * + * BTRFS_SUBVOL_RDONLY is also provided/consumed by the following ioctls: + * - BTRFS_IOC_SUBVOL_GETFLAGS + * - BTRFS_IOC_SUBVOL_SETFLAGS + */ + +/* Supported flags for BTRFS_IOC_RM_DEV_V2 */ +#define BTRFS_DEVICE_REMOVE_ARGS_MASK \ + (BTRFS_DEVICE_SPEC_BY_ID) + +/* Supported flags for BTRFS_IOC_SNAP_CREATE_V2 and BTRFS_IOC_SUBVOL_CREATE_V2 */ +#define BTRFS_SUBVOL_CREATE_ARGS_MASK \ + (BTRFS_SUBVOL_RDONLY | \ + BTRFS_SUBVOL_QGROUP_INHERIT) + +/* Supported flags for BTRFS_IOC_SNAP_DESTROY_V2 */ +#define BTRFS_SUBVOL_DELETE_ARGS_MASK \ + (BTRFS_SUBVOL_SPEC_BY_ID) + +struct btrfs_ioctl_vol_args_v2 { + __s64 fd; + __u64 transid; + __u64 flags; + union { + struct { + __u64 size; + struct btrfs_qgroup_inherit *qgroup_inherit; + }; + __u64 unused[4]; + }; + union { + char name[BTRFS_SUBVOL_NAME_MAX + 1]; + __u64 devid; + __u64 subvolid; + }; +}; + +/* + * structure to report errors and progress to userspace, either as a + * result of a finished scrub, a canceled scrub or a progress inquiry + */ +struct btrfs_scrub_progress { + __u64 data_extents_scrubbed; /* # of data extents scrubbed */ + __u64 
tree_extents_scrubbed; /* # of tree extents scrubbed */ + __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ + __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ + __u64 read_errors; /* # of read errors encountered (EIO) */ + __u64 csum_errors; /* # of failed csum checks */ + __u64 verify_errors; /* # of occurrences, where the metadata + * of a tree block did not match the + * expected values, like generation or + * logical */ + __u64 no_csum; /* # of 4k data block for which no csum + * is present, probably the result of + * data written with nodatasum */ + __u64 csum_discards; /* # of csum for which no data was found + * in the extent tree. */ + __u64 super_errors; /* # of bad super blocks encountered */ + __u64 malloc_errors; /* # of internal kmalloc errors. These + * will likely cause an incomplete + * scrub */ + __u64 uncorrectable_errors; /* # of errors where either no intact + * copy was found or the writeback + * failed */ + __u64 corrected_errors; /* # of errors corrected */ + __u64 last_physical; /* last physical address scrubbed. In + * case a scrub was aborted, this can + * be used to restart the scrub */ + __u64 unverified_errors; /* # of occurrences where a read for a + * full (64k) bio failed, but the re- + * check succeeded for each 4k piece. + * Intermittent error. 
*/ +}; + +#define BTRFS_SCRUB_READONLY 1 +struct btrfs_ioctl_scrub_args { + __u64 devid; /* in */ + __u64 start; /* in */ + __u64 end; /* in */ + __u64 flags; /* in */ + struct btrfs_scrub_progress progress; /* out */ + /* pad to 1k */ + __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; +}; + +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +struct btrfs_ioctl_dev_replace_start_params { + __u64 srcdevid; /* in, if 0, use srcdev_name instead */ + __u64 cont_reading_from_srcdev_mode; /* in, see #define + * above */ + __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 +struct btrfs_ioctl_dev_replace_status_params { + __u64 replace_state; /* out, see #define above */ + __u64 progress_1000; /* out, 0 <= x <= 1000 */ + __u64 time_started; /* out, seconds since 1-Jan-1970 */ + __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ + __u64 num_write_errors; /* out */ + __u64 num_uncorrectable_read_errors; /* out */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS 3 +struct btrfs_ioctl_dev_replace_args { + __u64 cmd; /* in */ + __u64 result; /* out */ + + union { + struct btrfs_ioctl_dev_replace_start_params start; + struct btrfs_ioctl_dev_replace_status_params status; + }; /* in/out */ + + __u64 spare[64]; +}; + +struct 
btrfs_ioctl_dev_info_args { + __u64 devid; /* in/out */ + __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ + __u64 bytes_used; /* out */ + __u64 total_bytes; /* out */ + __u64 unused[379]; /* pad to 4k */ + __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ +}; + +/* + * Retrieve information about the filesystem + */ + +/* Request information about checksum type and size */ +#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) + +/* Request information about filesystem generation */ +#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) +/* Request information about filesystem metadata UUID */ +#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2) + +struct btrfs_ioctl_fs_info_args { + __u64 max_id; /* out */ + __u64 num_devices; /* out */ + __u8 fsid[BTRFS_FSID_SIZE]; /* out */ + __u32 nodesize; /* out */ + __u32 sectorsize; /* out */ + __u32 clone_alignment; /* out */ + /* See BTRFS_FS_INFO_FLAG_* */ + __u16 csum_type; /* out */ + __u16 csum_size; /* out */ + __u64 flags; /* in/out */ + __u64 generation; /* out */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */ + __u8 reserved[944]; /* pad to 1k */ +}; + +/* + * feature flags + * + * Used by: + * struct btrfs_ioctl_feature_flags + */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) +/* + * Older kernels (< 4.9) on big-endian systems produced broken free space tree + * bitmaps, and btrfs-progs also used to corrupt the free space tree (versions + * < 4.7.3). If this bit is clear, then the free space tree cannot be trusted. + * btrfs-progs can also intentionally clear this bit to ask the kernel to + * rebuild the free space tree, however this might not work on older kernels + * that do not know about this bit. If not sure, clear the cache manually on + * first mount when booting older kernel versions. 
+ */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID (1ULL << 1) +#define BTRFS_FEATURE_COMPAT_RO_VERITY (1ULL << 2) + +/* + * Put all block group items into a dedicated block group tree, greatly + * reducing mount time for large filesystem due to better locality. + */ +#define BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE (1ULL << 3) + +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) + +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) +#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) +#define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) +#define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) +#define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12) +#define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) + +struct btrfs_ioctl_feature_flags { + __u64 compat_flags; + __u64 compat_ro_flags; + __u64 incompat_flags; +}; + +/* balance control ioctl modes */ +#define BTRFS_BALANCE_CTL_PAUSE 1 +#define BTRFS_BALANCE_CTL_CANCEL 2 + +/* + * this is packed, because it should be exactly the same as its disk + * byte order counterpart (struct btrfs_disk_balance_args) + */ +struct btrfs_balance_args { + __u64 profiles; + + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __u64 usage; + struct { + __u32 usage_min; + __u32 usage_max; + }; + }; + __u64 devid; + __u64 pstart; + __u64 pend; + __u64 vstart; + __u64 vend; + + __u64 
target; + + __u64 flags; + + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum + * and maximum + */ + union { + __u64 limit; /* limit number of processed chunks */ + struct { + __u32 limit_min; + __u32 limit_max; + }; + }; + + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __u32 stripes_min; + __u32 stripes_max; + + __u64 unused[6]; +} __attribute__ ((__packed__)); + +/* report balance progress to userspace */ +struct btrfs_balance_progress { + __u64 expected; /* estimated # of chunks that will be + * relocated to fulfill the request */ + __u64 considered; /* # of chunks we have considered so far */ + __u64 completed; /* # of chunks relocated so far */ +}; + +/* + * flags definition for balance + * + * Restriper's general type filter + * + * Used by: + * btrfs_ioctl_balance_args.flags + * btrfs_balance_control.flags (internal) + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * flags definitions for per-type balance args + * + * Balance filters + * + * Used by: + * struct btrfs_balance_args + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) + +#define BTRFS_BALANCE_ARGS_MASK \ + (BTRFS_BALANCE_ARGS_PROFILES | \ + BTRFS_BALANCE_ARGS_USAGE | \ + 
BTRFS_BALANCE_ARGS_DEVID | \ + BTRFS_BALANCE_ARGS_DRANGE | \ + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + + +/* + * flags definition for balance state + * + * Used by: + * struct btrfs_ioctl_balance_args.state + */ +#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) +#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) +#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) + +struct btrfs_ioctl_balance_args { + __u64 flags; /* in/out */ + __u64 state; /* out */ + + struct btrfs_balance_args data; /* in/out */ + struct btrfs_balance_args meta; /* in/out */ + struct btrfs_balance_args sys; /* in/out */ + + struct btrfs_balance_progress stat; /* out */ + + __u64 unused[72]; /* pad to 1k */ +}; + +#define BTRFS_INO_LOOKUP_PATH_MAX 4080 +struct btrfs_ioctl_ino_lookup_args { + __u64 treeid; + __u64 objectid; + char name[BTRFS_INO_LOOKUP_PATH_MAX]; +}; + +#define BTRFS_INO_LOOKUP_USER_PATH_MAX (4080 - BTRFS_VOL_NAME_MAX - 1) +struct btrfs_ioctl_ino_lookup_user_args { + /* in, inode number containing the subvolume of 'subvolid' */ + __u64 dirid; + /* in */ + __u64 treeid; + /* out, name of the subvolume of 'treeid' */ + char name[BTRFS_VOL_NAME_MAX + 1]; + /* + * out, constructed path from the directory with which the ioctl is + * called to dirid + */ + char path[BTRFS_INO_LOOKUP_USER_PATH_MAX]; +}; + +/* Search criteria for the btrfs SEARCH ioctl family. */ +struct btrfs_ioctl_search_key { + /* + * The tree we're searching in. 1 is the tree of tree roots, 2 is the + * extent tree, etc... 
+ * + * A special tree_id value of 0 will cause a search in the subvolume + * tree that the inode which is passed to the ioctl is part of. + */ + __u64 tree_id; /* in */ + + /* + * When doing a tree search, we're actually taking a slice from a + * linear search space of 136-bit keys. + * + * A full 136-bit tree key is composed as: + * (objectid << 72) + (type << 64) + offset + * + * The individual min and max values for objectid, type and offset + * define the min_key and max_key values for the search range. All + * metadata items with a key in the interval [min_key, max_key] will be + * returned. + * + * Additionally, we can filter the items returned on transaction id of + * the metadata block they're stored in by specifying a transid range. + * Be aware that this transaction id only denotes when the metadata + * page that currently contains the item got written the last time as + * result of a COW operation. The number does not have any meaning + * related to the transaction in which an individual item that is being + * returned was created or changed. + */ + __u64 min_objectid; /* in */ + __u64 max_objectid; /* in */ + __u64 min_offset; /* in */ + __u64 max_offset; /* in */ + __u64 min_transid; /* in */ + __u64 max_transid; /* in */ + __u32 min_type; /* in */ + __u32 max_type; /* in */ + + /* + * input: The maximum amount of results desired. 
+ * output: The actual amount of items returned, restricted by any of: + * - reaching the upper bound of the search range + * - reaching the input nr_items amount of items + * - completely filling the supplied memory buffer + */ + __u32 nr_items; /* in/out */ + + /* align to 64 bits */ + __u32 unused; + + /* some extra for later */ + __u64 unused1; + __u64 unused2; + __u64 unused3; + __u64 unused4; +}; + +struct btrfs_ioctl_search_header { + __u64 transid; + __u64 objectid; + __u64 offset; + __u32 type; + __u32 len; +} __attribute__ ((__may_alias__)); + +#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) +/* + * the buf is an array of search headers where + * each header is followed by the actual item + * the type field is expanded to 32 bits for alignment + */ +struct btrfs_ioctl_search_args { + struct btrfs_ioctl_search_key key; + char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; +}; + +/* + * Extended version of TREE_SEARCH ioctl that can return more than 4k of bytes. + * The allocated size of the buffer is set in buf_size. + */ +struct btrfs_ioctl_search_args_v2 { + struct btrfs_ioctl_search_key key; /* in/out - search parameters */ + __u64 buf_size; /* in - size of buffer + * out - on EOVERFLOW: needed size + * to store item */ + __u64 buf[]; /* out - found items */ +}; + +/* With a @src_length of zero, the range from @src_offset->EOF is cloned! 
*/ +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +/* + * flags definition for the defrag range ioctl + * + * Used by: + * struct btrfs_ioctl_defrag_range_args.flags + */ +#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +#define BTRFS_DEFRAG_RANGE_START_IO 2 +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; +}; + + +#define BTRFS_SAME_DATA_DIFFERS 1 +/* For extent-same ioctl */ +struct btrfs_ioctl_same_extent_info { + __s64 fd; /* in - destination file */ + __u64 logical_offset; /* in - start of extent in destination */ + __u64 bytes_deduped; /* out - total # of bytes we were able + * to dedupe from this file */ + /* status of this dedupe operation: + * 0 if dedup succeeds + * < 0 for error + * == BTRFS_SAME_DATA_DIFFERS if data differs + */ + __s32 status; /* out - see above description */ + __u32 reserved; +}; + +struct btrfs_ioctl_same_args { + __u64 logical_offset; /* in - start of extent in source */ + __u64 length; /* in - length of extent */ + __u16 dest_count; /* in - total elements in info array */ + __u16 reserved1; + __u32 reserved2; + struct btrfs_ioctl_same_extent_info info[]; +}; + +struct btrfs_ioctl_space_info { + __u64 flags; + __u64 total_bytes; + __u64 used_bytes; +}; + +struct btrfs_ioctl_space_args { + __u64 space_slots; + __u64 total_spaces; + struct 
btrfs_ioctl_space_info spaces[]; +}; + +struct btrfs_data_container { + __u32 bytes_left; /* out -- bytes not needed to deliver output */ + __u32 bytes_missing; /* out -- additional bytes needed for result */ + __u32 elem_cnt; /* out */ + __u32 elem_missed; /* out */ + __u64 val[]; /* out */ +}; + +struct btrfs_ioctl_ino_path_args { + __u64 inum; /* in */ + __u64 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *fspath; out */ + __u64 fspath; /* out */ +}; + +struct btrfs_ioctl_logical_ino_args { + __u64 logical; /* in */ + __u64 size; /* in */ + __u64 reserved[3]; /* must be 0 for now */ + __u64 flags; /* in, v2 only */ + /* struct btrfs_data_container *inodes; out */ + __u64 inodes; +}; + +/* + * Return every ref to the extent, not just those containing logical block. + * Requires logical == extent bytenr. + */ +#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0) + +enum btrfs_dev_stat_values { + /* disk I/O failure stats */ + BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not + * been written */ + + BTRFS_DEV_STAT_VALUES_MAX +}; + +/* Reset statistics after reading; needs SYS_ADMIN capability */ +#define BTRFS_DEV_STATS_RESET (1ULL << 0) + +struct btrfs_ioctl_get_dev_stats { + __u64 devid; /* in */ + __u64 nr_items; /* in/out */ + __u64 flags; /* in/out */ + + /* out values: */ + __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; + + /* + * This pads the struct to 1032 bytes. 
It was originally meant to pad to + * 1024 bytes, but when adding the flags field, the padding calculation + * was not adjusted. + */ + __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; +}; + +#define BTRFS_QUOTA_CTL_ENABLE 1 +#define BTRFS_QUOTA_CTL_DISABLE 2 +#define BTRFS_QUOTA_CTL_RESCAN__NOTUSED 3 +struct btrfs_ioctl_quota_ctl_args { + __u64 cmd; + __u64 status; +}; + +struct btrfs_ioctl_quota_rescan_args { + __u64 flags; + __u64 progress; + __u64 reserved[6]; +}; + +struct btrfs_ioctl_qgroup_assign_args { + __u64 assign; + __u64 src; + __u64 dst; +}; + +struct btrfs_ioctl_qgroup_create_args { + __u64 create; + __u64 qgroupid; +}; +struct btrfs_ioctl_timespec { + __u64 sec; + __u32 nsec; +}; + +struct btrfs_ioctl_received_subvol_args { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec stime; /* in */ + struct btrfs_ioctl_timespec rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +}; + +/* + * Caller doesn't want file data in the send stream, even if the + * search of clone sources doesn't find an extent. UPDATE_EXTENT + * commands will be sent instead of WRITE commands. + */ +#define BTRFS_SEND_FLAG_NO_FILE_DATA 0x1 + +/* + * Do not add the leading stream header. Used when multiple snapshots + * are sent back to back. + */ +#define BTRFS_SEND_FLAG_OMIT_STREAM_HEADER 0x2 + +/* + * Omit the command at the end of the stream that indicated the end + * of the stream. This option is used when multiple snapshots are + * sent back to back. + */ +#define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4 + +/* + * Read the protocol version in the structure + */ +#define BTRFS_SEND_FLAG_VERSION 0x8 + +/* + * Send compressed data using the ENCODED_WRITE command instead of decompressing + * the data and sending it with the WRITE command. This requires protocol + * version >= 2. 
+ */ +#define BTRFS_SEND_FLAG_COMPRESSED 0x10 + +#define BTRFS_SEND_FLAG_MASK \ + (BTRFS_SEND_FLAG_NO_FILE_DATA | \ + BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ + BTRFS_SEND_FLAG_OMIT_END_CMD | \ + BTRFS_SEND_FLAG_VERSION | \ + BTRFS_SEND_FLAG_COMPRESSED) + +struct btrfs_ioctl_send_args { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + __u64 *clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ +}; + +/* + * Information about a fs tree root. + * + * All items are filled by the ioctl + */ +struct btrfs_ioctl_get_subvol_info_args { + /* Id of this subvolume */ + __u64 treeid; + + /* Name of this subvolume, used to get the real name at mount point */ + char name[BTRFS_VOL_NAME_MAX + 1]; + + /* + * Id of the subvolume which contains this subvolume. + * Zero for top-level subvolume or a deleted subvolume. + */ + __u64 parent_id; + + /* + * Inode number of the directory which contains this subvolume. + * Zero for top-level subvolume or a deleted subvolume + */ + __u64 dirid; + + /* Latest transaction id of this subvolume */ + __u64 generation; + + /* Flags of this subvolume */ + __u64 flags; + + /* UUID of this subvolume */ + __u8 uuid[BTRFS_UUID_SIZE]; + + /* + * UUID of the subvolume of which this subvolume is a snapshot. + * All zero for a non-snapshot subvolume. + */ + __u8 parent_uuid[BTRFS_UUID_SIZE]; + + /* + * UUID of the subvolume from which this subvolume was received. + * All zero for non-received subvolume. 
+ */ + __u8 received_uuid[BTRFS_UUID_SIZE]; + + /* Transaction id indicating when change/create/send/receive happened */ + __u64 ctransid; + __u64 otransid; + __u64 stransid; + __u64 rtransid; + /* Time corresponding to c/o/s/rtransid */ + struct btrfs_ioctl_timespec ctime; + struct btrfs_ioctl_timespec otime; + struct btrfs_ioctl_timespec stime; + struct btrfs_ioctl_timespec rtime; + + /* Must be zero */ + __u64 reserved[8]; +}; + +#define BTRFS_MAX_ROOTREF_BUFFER_NUM 255 +struct btrfs_ioctl_get_subvol_rootref_args { + /* in/out, minimum id of rootref's treeid to be searched */ + __u64 min_treeid; + + /* out */ + struct { + __u64 treeid; + __u64 dirid; + } rootref[BTRFS_MAX_ROOTREF_BUFFER_NUM]; + + /* out, number of found items */ + __u8 num_items; + __u8 align[7]; +}; + +/* + * Data and metadata for an encoded read or write. + * + * Encoded I/O bypasses any encoding automatically done by the filesystem (e.g., + * compression). This can be used to read the compressed contents of a file or + * write pre-compressed data directly to a file. + * + * BTRFS_IOC_ENCODED_READ and BTRFS_IOC_ENCODED_WRITE are essentially + * preadv/pwritev with additional metadata about how the data is encoded and the + * size of the unencoded data. + * + * BTRFS_IOC_ENCODED_READ fills the given iovecs with the encoded data, fills + * the metadata fields, and returns the size of the encoded data. It reads one + * extent per call. It can also read data which is not encoded. + * + * BTRFS_IOC_ENCODED_WRITE uses the metadata fields, writes the encoded data + * from the iovecs, and returns the size of the encoded data. Note that the + * encoded data is not validated when it is written; if it is not valid (e.g., + * it cannot be decompressed), then a subsequent read may return an error. + * + * Since the filesystem page cache contains decoded data, encoded I/O bypasses + * the page cache. Encoded I/O requires CAP_SYS_ADMIN. 
+ */ +struct btrfs_ioctl_encoded_io_args { + /* Input parameters for both reads and writes. */ + + /* + * iovecs containing encoded data. + * + * For reads, if the size of the encoded data is larger than the sum of + * iov[n].iov_len for 0 <= n < iovcnt, then the ioctl fails with + * ENOBUFS. + * + * For writes, the size of the encoded data is the sum of iov[n].iov_len + * for 0 <= n < iovcnt. This must be less than 128 KiB (this limit may + * increase in the future). This must also be less than or equal to + * unencoded_len. + */ + const struct iovec *iov; + /* Number of iovecs. */ + unsigned long iovcnt; + /* + * Offset in file. + * + * For writes, must be aligned to the sector size of the filesystem. + */ + __s64 offset; + /* Currently must be zero. */ + __u64 flags; + + /* + * For reads, the following members are output parameters that will + * contain the returned metadata for the encoded data. + * For writes, the following members must be set to the metadata for the + * encoded data. + */ + + /* + * Length of the data in the file. + * + * Must be less than or equal to unencoded_len - unencoded_offset. For + * writes, must be aligned to the sector size of the filesystem unless + * the data ends at or beyond the current end of the file. + */ + __u64 len; + /* + * Length of the unencoded (i.e., decrypted and decompressed) data. + * + * For writes, must be no more than 128 KiB (this limit may increase in + * the future). If the unencoded data is actually longer than + * unencoded_len, then it is truncated; if it is shorter, then it is + * extended with zeroes. + */ + __u64 unencoded_len; + /* + * Offset from the first byte of the unencoded data to the first byte of + * logical data in the file. + * + * Must be less than unencoded_len. + */ + __u64 unencoded_offset; + /* + * BTRFS_ENCODED_IO_COMPRESSION_* type. + * + * For writes, must not be BTRFS_ENCODED_IO_COMPRESSION_NONE. + */ + __u32 compression; + /* Currently always BTRFS_ENCODED_IO_ENCRYPTION_NONE. 
*/ + __u32 encryption; + /* + * Reserved for future expansion. + * + * For reads, always returned as zero. Users should check for non-zero + * bytes. If there are any, then the kernel has a newer version of this + * structure with additional information that the user definition is + * missing. + * + * For writes, must be zeroed. + */ + __u8 reserved[64]; +}; + +/* Data is not compressed. */ +#define BTRFS_ENCODED_IO_COMPRESSION_NONE 0 +/* Data is compressed as a single zlib stream. */ +#define BTRFS_ENCODED_IO_COMPRESSION_ZLIB 1 +/* + * Data is compressed as a single zstd frame with the windowLog compression + * parameter set to no more than 17. + */ +#define BTRFS_ENCODED_IO_COMPRESSION_ZSTD 2 +/* + * Data is compressed sector by sector (using the sector size indicated by the + * name of the constant) with LZO1X and wrapped in the format documented in + * fs/btrfs/lzo.c. For writes, the compression sector size must match the + * filesystem sector size. + */ +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_4K 3 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_8K 4 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_16K 5 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_32K 6 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_64K 7 +#define BTRFS_ENCODED_IO_COMPRESSION_TYPES 8 + +/* Data is not encrypted. 
*/ +#define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 +#define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 + +/* Error codes as returned by the kernel */ +enum btrfs_err_code { + BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, + BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, + BTRFS_ERROR_DEV_TGT_REPLACE, + BTRFS_ERROR_DEV_MISSING_NOT_FOUND, + BTRFS_ERROR_DEV_ONLY_WRITABLE, + BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS, + BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ + struct btrfs_ioctl_defrag_range_args) 
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args) +#define BTRFS_IOC_TREE_SEARCH_V2 _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args_v2) +#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ + struct btrfs_ioctl_ino_lookup_args) +#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64) +#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ + struct btrfs_ioctl_space_args) +#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) +#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ + struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) +#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ + struct btrfs_ioctl_scrub_args) +#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) +#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ + struct btrfs_ioctl_scrub_args) +#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ + struct btrfs_ioctl_dev_info_args) +#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ + struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ + struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_logical_ino_args) +#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args) +#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct 
btrfs_ioctl_send_args) +#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ + struct btrfs_ioctl_quota_ctl_args) +#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ + struct btrfs_ioctl_qgroup_assign_args) +#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ + struct btrfs_ioctl_qgroup_create_args) +#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ + struct btrfs_ioctl_qgroup_limit_args) +#define BTRFS_IOC_QUOTA_RESCAN _IOW(BTRFS_IOCTL_MAGIC, 44, \ + struct btrfs_ioctl_quota_rescan_args) +#define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \ + struct btrfs_ioctl_quota_rescan_args) +#define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46) +#define BTRFS_IOC_GET_FSLABEL FS_IOC_GETFSLABEL +#define BTRFS_IOC_SET_FSLABEL FS_IOC_SETFSLABEL +#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ + struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_dev_replace_args) +#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \ + struct btrfs_ioctl_same_args) +#define BTRFS_IOC_GET_FEATURES _IOR(BTRFS_IOCTL_MAGIC, 57, \ + struct btrfs_ioctl_feature_flags) +#define BTRFS_IOC_SET_FEATURES _IOW(BTRFS_IOCTL_MAGIC, 57, \ + struct btrfs_ioctl_feature_flags[2]) +#define BTRFS_IOC_GET_SUPPORTED_FEATURES _IOR(BTRFS_IOCTL_MAGIC, 57, \ + struct btrfs_ioctl_feature_flags[3]) +#define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ + struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \ + struct btrfs_ioctl_logical_ino_args) +#define BTRFS_IOC_GET_SUBVOL_INFO _IOR(BTRFS_IOCTL_MAGIC, 60, \ + struct btrfs_ioctl_get_subvol_info_args) +#define BTRFS_IOC_GET_SUBVOL_ROOTREF _IOWR(BTRFS_IOCTL_MAGIC, 61, \ + struct btrfs_ioctl_get_subvol_rootref_args) +#define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \ + 
struct btrfs_ioctl_ino_lookup_user_args) +#define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \ + struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_ENCODED_READ _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args) +#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args) + +#ifdef __cplusplus +} +#endif + +#endif /* _UAPI_LINUX_BTRFS_H */ diff --git a/src/basic/linux/btrfs_tree.h b/src/basic/linux/btrfs_tree.h new file mode 100644 index 0000000..ab38d0f --- /dev/null +++ b/src/basic/linux/btrfs_tree.h @@ -0,0 +1,1260 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BTRFS_CTREE_H_ +#define _BTRFS_CTREE_H_ + +#include <linux/btrfs.h> +#include <linux/types.h> +#ifdef __KERNEL__ +#include <linux/stddef.h> +#else +#include <stddef.h> +#endif + +/* ASCII for _BHRfS_M, no terminating nul */ +#define BTRFS_MAGIC 0x4D5F53665248425FULL + +#define BTRFS_MAX_LEVEL 8 + +/* + * We can actually store much bigger names, but lets not confuse the rest of + * linux. + */ +#define BTRFS_NAME_LEN 255 + +/* + * Theoretical limit is larger, but we keep this down to a sane value. That + * should limit greatly the possibility of collisions on inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + +/* + * This header contains the structure definitions and constants used + * by file system objects that can be retrieved using + * the BTRFS_IOC_TREE_SEARCH ioctl. That means basically anything that + * is needed to describe a leaf node's key or item contents. + */ + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device.
The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + +/* for storing items that use the BTRFS_UUID_KEY* types */ +#define BTRFS_UUID_TREE_OBJECTID 9ULL + +/* tracks free space in block groups. */ +#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + +/* Holds the block group items for extent tree v2. */ +#define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL + +/* device stats in the device tree */ +#define BTRFS_DEV_STATS_OBJECTID 0ULL + +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + +/* orphan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + +/* + * The inode number assigned to the special inode for storing + * free ino cache + */ +#define BTRFS_FREE_INO_OBJECTID -12ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. 
The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + +#define BTRFS_DEV_REPLACE_DEVID 0ULL + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 +#define BTRFS_XATTR_ITEM_KEY 24 + +/* + * fs verity items are stored under two different key types on disk. + * The descriptor items: + * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ] + * + * At offset 0, we store a btrfs_verity_descriptor_item which tracks the size + * of the descriptor item and some extra data for encryption. + * Starting at offset 1, these hold the generic fs verity descriptor. The + * latter are opaque to btrfs, we just read and write them as a blob for the + * higher level verity code. The most common descriptor size is 256 bytes. + * + * The merkle tree items: + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ] + * + * These also start at offset 0, and correspond to the merkle tree bytes. When + * fsverity asks for page 0 of the merkle tree, we pull up one page starting at + * offset 0 for this key type. These are also opaque to btrfs, we're blindly + * storing whatever fsverity sends down. + */ +#define BTRFS_VERITY_DESC_ITEM_KEY 36 +#define BTRFS_VERITY_MERKLE_ITEM_KEY 37 + +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. BTRFS_DIR_LOG_ITEM_KEY is no longer used + * but it's still defined here for documentation purposes and to help avoid + * having its numerical value reused in the future. 
+ */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. They are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 + +/* + * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know + * the length, so we save the level in key->offset instead of the length. + */ +#define BTRFS_METADATA_ITEM_KEY 169 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +/* + * Every block group is represented in the free space tree by a free space info + * item, which stores some accounting information. It is keyed on + * (block_group_start, FREE_SPACE_INFO, block_group_length). + */ +#define BTRFS_FREE_SPACE_INFO_KEY 198 + +/* + * A free space extent tracks an extent of space that is free in a block group. 
+ * It is keyed on (start, FREE_SPACE_EXTENT, length). + */ +#define BTRFS_FREE_SPACE_EXTENT_KEY 199 + +/* + * When a block group becomes very fragmented, we convert it to use bitmaps + * instead of extents. A free space bitmap is keyed on + * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with + * (length / sectorsize) bits. + */ +#define BTRFS_FREE_SPACE_BITMAP_KEY 200 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + +/* + * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. + */ +#define BTRFS_BALANCE_ITEM_KEY 248 + +/* + * The key type for tree items that are stored persistently, but do not need to + * exist for extended period of time. The items can exist in any tree. + * + * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] + * + * Existing items: + * + * - balance status item + * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) + */ +#define BTRFS_TEMPORARY_ITEM_KEY 248 + +/* + * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY + */ +#define BTRFS_DEV_STATS_KEY 249 + +/* + * The key type for tree items that are stored persistently and usually exist + * for a long period, eg. filesystem lifetime. 
The item kinds can be status + * information, stats or preference values. The item can exist in any tree. + * + * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] + * + * Existing items: + * + * - device statistics, store IO stats in the device tree, one key for all + * stats + * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) + */ +#define BTRFS_PERSISTENT_ITEM_KEY 249 + +/* + * Persistently stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + +/* + * Stores items that allow to quickly map UUIDs to something else. + * These items are part of the filesystem UUID tree. + * The key is built like this: + * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). + */ +#if BTRFS_UUID_SIZE != 16 +#error "UUID items require BTRFS_UUID_SIZE == 16!" +#endif +#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ +#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to + * received subvols */ + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +/* Maximum metadata block size (nodesize) */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +enum btrfs_csum_type { + BTRFS_CSUM_TYPE_CRC32 = 0, + BTRFS_CSUM_TYPE_XXHASH = 1, + BTRFS_CSUM_TYPE_SHA256 = 2, + BTRFS_CSUM_TYPE_BLAKE2 = 3, +}; + +/* + * flags definitions for directory entry item type + * + * Used by: + * struct btrfs_dir_item.type + * + * Values 0..7 must match common file type values in fs_types.h. 
+ */ +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 +/* Directory contains encrypted data */ +#define BTRFS_FT_ENCRYPTED 0x80 +/* Clear the BTRFS_FT_ENCRYPTED bit from a dir item's flags and return what is left (the BTRFS_FT_* file type value). */ +static inline __u8 btrfs_dir_flags_to_ftype(__u8 flags) +{ + return flags & ~BTRFS_FT_ENCRYPTED; +} + +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1U << 0) +#define BTRFS_INODE_NODATACOW (1U << 1) +#define BTRFS_INODE_READONLY (1U << 2) +#define BTRFS_INODE_NOCOMPRESS (1U << 3) +#define BTRFS_INODE_PREALLOC (1U << 4) +#define BTRFS_INODE_SYNC (1U << 5) +#define BTRFS_INODE_IMMUTABLE (1U << 6) +#define BTRFS_INODE_APPEND (1U << 7) +#define BTRFS_INODE_NODUMP (1U << 8) +#define BTRFS_INODE_NOATIME (1U << 9) +#define BTRFS_INODE_DIRSYNC (1U << 10) +#define BTRFS_INODE_COMPRESS (1U << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) + +#define BTRFS_INODE_FLAG_MASK \ + (BTRFS_INODE_NODATASUM | \ + BTRFS_INODE_NODATACOW | \ + BTRFS_INODE_READONLY | \ + BTRFS_INODE_NOCOMPRESS | \ + BTRFS_INODE_PREALLOC | \ + BTRFS_INODE_SYNC | \ + BTRFS_INODE_IMMUTABLE | \ + BTRFS_INODE_APPEND | \ + BTRFS_INODE_NODUMP | \ + BTRFS_INODE_NOATIME | \ + BTRFS_INODE_DIRSYNC | \ + BTRFS_INODE_COMPRESS | \ + BTRFS_INODE_ROOT_ITEM_INIT) + +#define BTRFS_INODE_RO_VERITY (1U << 0) + +#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) + +/* + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. + * + * offset is the starting byte offset for this key in the stream.
+ * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + __u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + __u64 objectid; + __u8 type; + __u64 offset; +} __attribute__ ((__packed__)); + +/* + * Every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* These first four must match the super block */ + __u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific uuid */ + __u8 fsid[BTRFS_FSID_SIZE]; + /* Which block this node is supposed to live in */ + __le64 bytenr; + __le64 flags; + + /* Allowed to be different from the super from here on down */ + __u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + __u8 level; +} __attribute__ ((__packed__)); + +/* + * This is a very generous portion of the super block, giving us room to + * translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 + +/* + * Just in case we somehow lose the roots and are not able to mount, we store + * an array of the roots from previous transactions in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unused_64[4]; + + __u8 tree_root_level; + __u8 chunk_root_level; + __u8 extent_root_level; + __u8 fs_root_level; + __u8 dev_root_level; + __u8 csum_root_level; + /* future and to align */ + __u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* + * A leaf is full of items. 
offset and size tell us where to find the item in + * the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * Leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together during + * searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * All non-leaf blocks are nodes, they hold only keys and pointers to other + * blocks. + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allow for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + __u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + __u8 bandwidth; + + /* btrfs generated uuid for this device */ + __u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + __u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + __u8 
dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +/* + * The super block basically lists the main trees of the FS. + */ +struct btrfs_super_block { + /* The first 4 fields must match struct btrfs_header */ + __u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + __u8 fsid[BTRFS_FSID_SIZE]; + /* This block number */ + __le64 bytenr; + __le64 flags; + + /* Allowed to be different from the btrfs_header from here on down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated.
+ */ + __le64 __unused_log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 __unused_leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + __u8 root_level; + __u8 chunk_root_level; + __u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + __le64 cache_generation; + __le64 uuid_tree_generation; + + /* The UUID written into btree blocks */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; + + __u64 nr_global_roots; + + /* Future expansion */ + __le64 reserved[27]; + __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; + + /* Padded to 4096 bytes */ + __u8 padding[565]; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + __u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* Super block flags */ +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) +#define BTRFS_SUPER_FLAG_METADUMP_V2 (1ULL << 34) +#define BTRFS_SUPER_FLAG_CHANGING_FSID (1ULL << 35) +#define BTRFS_SUPER_FLAG_CHANGING_FSID_V2 (1ULL << 36) + + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ + +struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct 
btrfs_extent_item_v0 { + __le32 refs; +} __attribute__ ((__packed__)); + + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) +/* Backref revision mask: (BTRFS_BACKREF_REV_MAX - 1) placed at bits 56..63. Cast with __u64, the type this header uses everywhere else; the kernel-internal u64 is not available to userspace includers. */ +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((__u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + +/* + * this flag is only used internally by scrub and may be changed at any time + * it is only declared here to avoid collisions + */ +#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + __u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + __u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent.
The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + __u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[]; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + __u8 type; +} __attribute__ ((__packed__)); + +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + __u8 
drop_level; + __u8 level; + + /* + * The following fields appear after subvol_uuids+subvol_times + * were introduced. + */ + + /* + * This generation number is used to test if the new fields are valid + * and up to date while reading the root item. Every time the root item + * is written out, the "generation" field is copied into this field. If + * anyone ever mounted the fs with an older kernel, we will have + * mismatching generation values here and thus must invalidate the + * new fields. See btrfs_update_root and btrfs_find_last_root for + * details. + * the offset of generation_v2 is also used as the start for the memset + * when invalidating the fields. + */ + __le64 generation_v2; + __u8 uuid[BTRFS_UUID_SIZE]; + __u8 parent_uuid[BTRFS_UUID_SIZE]; + __u8 received_uuid[BTRFS_UUID_SIZE]; + __le64 ctransid; /* updated when an inode changes */ + __le64 otransid; /* trans when created */ + __le64 stransid; /* trans when sent. non-zero for received subvol */ + __le64 rtransid; /* trans when received. non-zero for received subvol */ + struct btrfs_timespec ctime; + struct btrfs_timespec otime; + struct btrfs_timespec stime; + struct btrfs_timespec rtime; + __le64 reserved[8]; /* for future */ +} __attribute__ ((__packed__)); + +/* + * Btrfs root item used to be smaller than current size. The old format ends + * at where member generation_v2 is. 
+ */ +static inline __u32 btrfs_legacy_root_item_size(void) +{ /* byte offset of generation_v2 within struct btrfs_root_item, i.e. the size of everything that precedes it */ + return offsetof(struct btrfs_root_item, generation_v2); +} + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; + + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + +enum { + BTRFS_FILE_EXTENT_INLINE = 0, + BTRFS_FILE_EXTENT_REG = 1, + BTRFS_FILE_EXTENT_PREALLOC = 2, + BTRFS_NR_FILE_EXTENT_TYPES
= 3, +}; + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + __u8 compression; + __u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + __u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + * + * At this offset in the structure, the inline extent data start. + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. 
+ */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + __u8 csum; +} __attribute__ ((__packed__)); + +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 + +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) +#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) +#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID1C3 | \ + BTRFS_BLOCK_GROUP_RAID1C4 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) 
+#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + +#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID1C3 | \ + BTRFS_BLOCK_GROUP_RAID1C4) + +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) + +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) +/* Return flags with BTRFS_AVAIL_ALLOC_BIT_SINGLE added when none of the BTRFS_BLOCK_GROUP_PROFILE_MASK bits is set; otherwise return flags unchanged. */ +static inline __u64 chunk_to_extended(__u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline __u64 extended_to_chunk(__u64 flags) +{ /* Return flags with BTRFS_AVAIL_ALLOC_BIT_SINGLE cleared. */ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_info { + __le32 extent_count; + __le32 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) +/* btrfs_qgroup_level() returns the upper 16 bits of a 64-bit qgroupid (bits 48..63). */ +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline __u16 btrfs_qgroup_level(__u64 qgroupid) +{ + return (__u16)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT); +} + +/* + * is subvolume quota turned on?
+ */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * RESCAN is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning quota off and on again makes it inconsistent, too. + */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_FLAGS_MASK (BTRFS_QGROUP_STATUS_FLAG_ON | \ + BTRFS_QGROUP_STATUS_FLAG_RESCAN | \ + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 rescan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + +struct btrfs_verity_descriptor_item { + /* Size of the verity descriptor in bytes */ + __le64 size; + /* + * When we implement support for fscrypt, we will need to encrypt the + * Merkle tree for encrypted verity files. These 128 bits are for the + * eventual storage of an fscrypt initialization vector.
+ */ + __le64 reserved[2]; + __u8 encryption; +} __attribute__ ((__packed__)); + +#endif /* _BTRFS_CTREE_H_ */ diff --git a/src/basic/linux/can/netlink.h b/src/basic/linux/can/netlink.h new file mode 100644 index 0000000..02ec32d --- /dev/null +++ b/src/basic/linux/can/netlink.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * linux/can/netlink.h + * + * Definitions for the CAN netlink interface + * + * Copyright (c) 2009 Wolfgang Grandegger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _UAPI_CAN_NETLINK_H +#define _UAPI_CAN_NETLINK_H + +#include <linux/types.h> + +/* + * CAN bit-timing parameters + * + * For further information, please read chapter "8 BIT TIMING + * REQUIREMENTS" of the "Bosch CAN Specification version 2.0" + * at http://www.semiconductors.bosch.de/pdf/can2spec.pdf.
+ */ +struct can_bittiming { + __u32 bitrate; /* Bit-rate in bits/second */ + __u32 sample_point; /* Sample point in one-tenth of a percent */ + __u32 tq; /* Time quanta (TQ) in nanoseconds */ + __u32 prop_seg; /* Propagation segment in TQs */ + __u32 phase_seg1; /* Phase buffer segment 1 in TQs */ + __u32 phase_seg2; /* Phase buffer segment 2 in TQs */ + __u32 sjw; /* Synchronisation jump width in TQs */ + __u32 brp; /* Bit-rate prescaler */ +}; + +/* + * CAN hardware-dependent bit-timing constant + * + * Used for calculating and checking bit-timing parameters + */ +struct can_bittiming_const { + char name[16]; /* Name of the CAN controller hardware */ + __u32 tseg1_min; /* Time segment 1 = prop_seg + phase_seg1 */ + __u32 tseg1_max; + __u32 tseg2_min; /* Time segment 2 = phase_seg2 */ + __u32 tseg2_max; + __u32 sjw_max; /* Synchronisation jump width */ + __u32 brp_min; /* Bit-rate prescaler */ + __u32 brp_max; + __u32 brp_inc; +}; + +/* + * CAN clock parameters + */ +struct can_clock { + __u32 freq; /* CAN system clock frequency in Hz */ +}; + +/* + * CAN operational and error states + */ +enum can_state { + CAN_STATE_ERROR_ACTIVE = 0, /* RX/TX error count < 96 */ + CAN_STATE_ERROR_WARNING, /* RX/TX error count < 128 */ + CAN_STATE_ERROR_PASSIVE, /* RX/TX error count < 256 */ + CAN_STATE_BUS_OFF, /* RX/TX error count >= 256 */ + CAN_STATE_STOPPED, /* Device is stopped */ + CAN_STATE_SLEEPING, /* Device is sleeping */ + CAN_STATE_MAX +}; + +/* + * CAN bus error counters + */ +struct can_berr_counter { + __u16 txerr; + __u16 rxerr; +}; + +/* + * CAN controller mode + */ +struct can_ctrlmode { + __u32 mask; + __u32 flags; +}; + +#define CAN_CTRLMODE_LOOPBACK 0x01 /* Loopback mode */ +#define CAN_CTRLMODE_LISTENONLY 0x02 /* Listen-only mode */ +#define CAN_CTRLMODE_3_SAMPLES 0x04 /* Triple sampling mode */ +#define CAN_CTRLMODE_ONE_SHOT 0x08 /* One-Shot mode */ +#define CAN_CTRLMODE_BERR_REPORTING 0x10 /* Bus-error reporting */ +#define CAN_CTRLMODE_FD 0x20 /* CAN FD 
mode */ +#define CAN_CTRLMODE_PRESUME_ACK 0x40 /* Ignore missing CAN ACKs */ +#define CAN_CTRLMODE_FD_NON_ISO 0x80 /* CAN FD in non-ISO mode */ +#define CAN_CTRLMODE_CC_LEN8_DLC 0x100 /* Classic CAN DLC option */ +#define CAN_CTRLMODE_TDC_AUTO 0x200 /* CAN transceiver automatically calculates TDCV */ +#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* TDCV is manually set up by user */ + +/* + * CAN device statistics + */ +struct can_device_stats { + __u32 bus_error; /* Bus errors */ + __u32 error_warning; /* Changes to error warning state */ + __u32 error_passive; /* Changes to error passive state */ + __u32 bus_off; /* Changes to bus off state */ + __u32 arbitration_lost; /* Arbitration lost errors */ + __u32 restarts; /* CAN controller re-starts */ +}; + +/* + * CAN netlink interface + */ +enum { + IFLA_CAN_UNSPEC, + IFLA_CAN_BITTIMING, + IFLA_CAN_BITTIMING_CONST, + IFLA_CAN_CLOCK, + IFLA_CAN_STATE, + IFLA_CAN_CTRLMODE, + IFLA_CAN_RESTART_MS, + IFLA_CAN_RESTART, + IFLA_CAN_BERR_COUNTER, + IFLA_CAN_DATA_BITTIMING, + IFLA_CAN_DATA_BITTIMING_CONST, + IFLA_CAN_TERMINATION, + IFLA_CAN_TERMINATION_CONST, + IFLA_CAN_BITRATE_CONST, + IFLA_CAN_DATA_BITRATE_CONST, + IFLA_CAN_BITRATE_MAX, + IFLA_CAN_TDC, + IFLA_CAN_CTRLMODE_EXT, + + /* add new constants above here */ + __IFLA_CAN_MAX, + IFLA_CAN_MAX = __IFLA_CAN_MAX - 1 +}; + +/* + * CAN FD Transmitter Delay Compensation (TDC) + * + * Please refer to struct can_tdc_const and can_tdc in + * include/linux/can/bittiming.h for further details.
+ */ +enum { + IFLA_CAN_TDC_UNSPEC, + IFLA_CAN_TDC_TDCV_MIN, /* u32 */ + IFLA_CAN_TDC_TDCV_MAX, /* u32 */ + IFLA_CAN_TDC_TDCO_MIN, /* u32 */ + IFLA_CAN_TDC_TDCO_MAX, /* u32 */ + IFLA_CAN_TDC_TDCF_MIN, /* u32 */ + IFLA_CAN_TDC_TDCF_MAX, /* u32 */ + IFLA_CAN_TDC_TDCV, /* u32 */ + IFLA_CAN_TDC_TDCO, /* u32 */ + IFLA_CAN_TDC_TDCF, /* u32 */ + + /* add new constants above here */ + __IFLA_CAN_TDC, + IFLA_CAN_TDC_MAX = __IFLA_CAN_TDC - 1 +}; + +/* + * IFLA_CAN_CTRLMODE_EXT nest: controller mode extended parameters + */ +enum { + IFLA_CAN_CTRLMODE_UNSPEC, + IFLA_CAN_CTRLMODE_SUPPORTED, /* u32 */ + + /* add new constants above here */ + __IFLA_CAN_CTRLMODE, + IFLA_CAN_CTRLMODE_MAX = __IFLA_CAN_CTRLMODE - 1 +}; + +/* u16 termination range: 1..65535 Ohms */ +#define CAN_TERMINATION_DISABLED 0 + +#endif /* !_UAPI_CAN_NETLINK_H */ diff --git a/src/basic/linux/can/vxcan.h b/src/basic/linux/can/vxcan.h new file mode 100644 index 0000000..4fa9d87 --- /dev/null +++ b/src/basic/linux/can/vxcan.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +#ifndef _UAPI_CAN_VXCAN_H +#define _UAPI_CAN_VXCAN_H + +enum { + VXCAN_INFO_UNSPEC, + VXCAN_INFO_PEER, + + __VXCAN_INFO_MAX +#define VXCAN_INFO_MAX (__VXCAN_INFO_MAX - 1) +}; + +#endif diff --git a/src/basic/linux/cfm_bridge.h b/src/basic/linux/cfm_bridge.h new file mode 100644 index 0000000..3c1cbd1 --- /dev/null +++ b/src/basic/linux/cfm_bridge.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_CFM_BRIDGE_H_ +#define _UAPI_LINUX_CFM_BRIDGE_H_ + +#include <linux/types.h> +#include <linux/if_ether.h> + +#define ETHER_HEADER_LENGTH (6+6+4+2) +#define CFM_MAID_LENGTH 48 +#define CFM_CCM_PDU_LENGTH 75 +#define CFM_PORT_STATUS_TLV_LENGTH 4 +#define CFM_IF_STATUS_TLV_LENGTH 4 +#define CFM_IF_STATUS_TLV_TYPE 4 +#define CFM_PORT_STATUS_TLV_TYPE 2 +#define CFM_ENDE_TLV_TYPE 0 +#define CFM_CCM_MAX_FRAME_LENGTH (ETHER_HEADER_LENGTH+\ + CFM_CCM_PDU_LENGTH+\ + CFM_PORT_STATUS_TLV_LENGTH+\ +
CFM_IF_STATUS_TLV_LENGTH) +#define CFM_FRAME_PRIO 7 +#define CFM_CCM_TLV_OFFSET 70 +#define CFM_CCM_PDU_MAID_OFFSET 10 +#define CFM_CCM_PDU_MEPID_OFFSET 8 +#define CFM_CCM_PDU_SEQNR_OFFSET 4 +#define CFM_CCM_PDU_TLV_OFFSET 74 +#define CFM_CCM_ITU_RESERVED_SIZE 16 + +struct br_cfm_common_hdr { + __u8 mdlevel_version; + __u8 opcode; + __u8 flags; + __u8 tlv_offset; +}; + +enum br_cfm_opcodes { + BR_CFM_OPCODE_CCM = 0x1, +}; + +/* MEP domain */ +enum br_cfm_domain { + BR_CFM_PORT, + BR_CFM_VLAN, +}; + +/* MEP direction */ +enum br_cfm_mep_direction { + BR_CFM_MEP_DIRECTION_DOWN, + BR_CFM_MEP_DIRECTION_UP, +}; + +/* CCM interval supported. */ +enum br_cfm_ccm_interval { + BR_CFM_CCM_INTERVAL_NONE, + BR_CFM_CCM_INTERVAL_3_3_MS, + BR_CFM_CCM_INTERVAL_10_MS, + BR_CFM_CCM_INTERVAL_100_MS, + BR_CFM_CCM_INTERVAL_1_SEC, + BR_CFM_CCM_INTERVAL_10_SEC, + BR_CFM_CCM_INTERVAL_1_MIN, + BR_CFM_CCM_INTERVAL_10_MIN, +}; + +#endif diff --git a/src/basic/linux/fib_rules.h b/src/basic/linux/fib_rules.h new file mode 100644 index 0000000..232df14 --- /dev/null +++ b/src/basic/linux/fib_rules.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_FIB_RULES_H +#define __LINUX_FIB_RULES_H + +#include +#include + +/* rule is permanent, and cannot be deleted */ +#define FIB_RULE_PERMANENT 0x00000001 +#define FIB_RULE_INVERT 0x00000002 +#define FIB_RULE_UNRESOLVED 0x00000004 +#define FIB_RULE_IIF_DETACHED 0x00000008 +#define FIB_RULE_DEV_DETACHED FIB_RULE_IIF_DETACHED +#define FIB_RULE_OIF_DETACHED 0x00000010 + +/* try to find source address in routing lookups */ +#define FIB_RULE_FIND_SADDR 0x00010000 + +struct fib_rule_hdr { + __u8 family; + __u8 dst_len; + __u8 src_len; + __u8 tos; + + __u8 table; + __u8 res1; /* reserved */ + __u8 res2; /* reserved */ + __u8 action; + + __u32 flags; +}; + +struct fib_rule_uid_range { + __u32 start; + __u32 end; +}; + +struct fib_rule_port_range { + __u16 start; + __u16 end; +}; + +enum { + FRA_UNSPEC, + FRA_DST, 
/* destination address */ + FRA_SRC, /* source address */ + FRA_IIFNAME, /* interface name */ +#define FRA_IFNAME FRA_IIFNAME + FRA_GOTO, /* target to jump to (FR_ACT_GOTO) */ + FRA_UNUSED2, + FRA_PRIORITY, /* priority/preference */ + FRA_UNUSED3, + FRA_UNUSED4, + FRA_UNUSED5, + FRA_FWMARK, /* mark */ + FRA_FLOW, /* flow/class id */ + FRA_TUN_ID, + FRA_SUPPRESS_IFGROUP, + FRA_SUPPRESS_PREFIXLEN, + FRA_TABLE, /* Extended table id */ + FRA_FWMASK, /* mask for netfilter mark */ + FRA_OIFNAME, + FRA_PAD, + FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ + FRA_UID_RANGE, /* UID range */ + FRA_PROTOCOL, /* Originator of the rule */ + FRA_IP_PROTO, /* ip proto */ + FRA_SPORT_RANGE, /* sport */ + FRA_DPORT_RANGE, /* dport */ + __FRA_MAX +}; + +#define FRA_MAX (__FRA_MAX - 1) + +enum { + FR_ACT_UNSPEC, + FR_ACT_TO_TBL, /* Pass to fixed table */ + FR_ACT_GOTO, /* Jump to another rule */ + FR_ACT_NOP, /* No operation */ + FR_ACT_RES3, + FR_ACT_RES4, + FR_ACT_BLACKHOLE, /* Drop without notification */ + FR_ACT_UNREACHABLE, /* Drop with ENETUNREACH */ + FR_ACT_PROHIBIT, /* Drop with EACCES */ + __FR_ACT_MAX, +}; + +#define FR_ACT_MAX (__FR_ACT_MAX - 1) + +#endif diff --git a/src/basic/linux/fou.h b/src/basic/linux/fou.h new file mode 100644 index 0000000..87c2c9f --- /dev/null +++ b/src/basic/linux/fou.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* fou.h - FOU Interface */ + +#ifndef _UAPI_LINUX_FOU_H +#define _UAPI_LINUX_FOU_H + +/* NETLINK_GENERIC related info + */ +#define FOU_GENL_NAME "fou" +#define FOU_GENL_VERSION 0x1 + +enum { + FOU_ATTR_UNSPEC, + FOU_ATTR_PORT, /* u16 */ + FOU_ATTR_AF, /* u8 */ + FOU_ATTR_IPPROTO, /* u8 */ + FOU_ATTR_TYPE, /* u8 */ + FOU_ATTR_REMCSUM_NOPARTIAL, /* flag */ + FOU_ATTR_LOCAL_V4, /* u32 */ + FOU_ATTR_LOCAL_V6, /* in6_addr */ + FOU_ATTR_PEER_V4, /* u32 */ + FOU_ATTR_PEER_V6, /* in6_addr */ + FOU_ATTR_PEER_PORT, /* u16 */ + FOU_ATTR_IFINDEX, /* s32 */ + + __FOU_ATTR_MAX, +}; + +#define 
FOU_ATTR_MAX (__FOU_ATTR_MAX - 1) + +enum { + FOU_CMD_UNSPEC, + FOU_CMD_ADD, + FOU_CMD_DEL, + FOU_CMD_GET, + + __FOU_CMD_MAX, +}; + +enum { + FOU_ENCAP_UNSPEC, + FOU_ENCAP_DIRECT, + FOU_ENCAP_GUE, +}; + +#define FOU_CMD_MAX (__FOU_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_FOU_H */ diff --git a/src/basic/linux/genetlink.h b/src/basic/linux/genetlink.h new file mode 100644 index 0000000..ddba3ca --- /dev/null +++ b/src/basic/linux/genetlink.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_GENERIC_NETLINK_H +#define _UAPI__LINUX_GENERIC_NETLINK_H + +#include <linux/types.h> +#include <linux/netlink.h> + +#define GENL_NAMSIZ 16 /* length of family name */ + +#define GENL_MIN_ID NLMSG_MIN_TYPE +#define GENL_MAX_ID 1023 + +struct genlmsghdr { + __u8 cmd; + __u8 version; + __u16 reserved; +}; + +#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr)) + +#define GENL_ADMIN_PERM 0x01 +#define GENL_CMD_CAP_DO 0x02 +#define GENL_CMD_CAP_DUMP 0x04 +#define GENL_CMD_CAP_HASPOL 0x08 +#define GENL_UNS_ADMIN_PERM 0x10 + +/* + * List of reserved static generic netlink identifiers: + */ +#define GENL_ID_CTRL NLMSG_MIN_TYPE +#define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1) +#define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2) +/* must be last reserved + 1 */ +#define GENL_START_ALLOC (NLMSG_MIN_TYPE + 3) + +/************************************************************************** + * Controller + **************************************************************************/ + +enum { + CTRL_CMD_UNSPEC, + CTRL_CMD_NEWFAMILY, + CTRL_CMD_DELFAMILY, + CTRL_CMD_GETFAMILY, + CTRL_CMD_NEWOPS, + CTRL_CMD_DELOPS, + CTRL_CMD_GETOPS, + CTRL_CMD_NEWMCAST_GRP, + CTRL_CMD_DELMCAST_GRP, + CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, + __CTRL_CMD_MAX, +}; + +#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1) + +enum { + CTRL_ATTR_UNSPEC, + CTRL_ATTR_FAMILY_ID, + CTRL_ATTR_FAMILY_NAME, + CTRL_ATTR_VERSION, + CTRL_ATTR_HDRSIZE, + CTRL_ATTR_MAXATTR, + CTRL_ATTR_OPS, +
CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, + CTRL_ATTR_OP_POLICY, + CTRL_ATTR_OP, + __CTRL_ATTR_MAX, +}; + +#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1) + +enum { + CTRL_ATTR_OP_UNSPEC, + CTRL_ATTR_OP_ID, + CTRL_ATTR_OP_FLAGS, + __CTRL_ATTR_OP_MAX, +}; + +#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1) + +enum { + CTRL_ATTR_MCAST_GRP_UNSPEC, + CTRL_ATTR_MCAST_GRP_NAME, + CTRL_ATTR_MCAST_GRP_ID, + __CTRL_ATTR_MCAST_GRP_MAX, +}; + +#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) + +enum { + CTRL_ATTR_POLICY_UNSPEC, + CTRL_ATTR_POLICY_DO, + CTRL_ATTR_POLICY_DUMP, + + __CTRL_ATTR_POLICY_DUMP_MAX, + CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 +}; + +#define CTRL_ATTR_POLICY_MAX (__CTRL_ATTR_POLICY_DUMP_MAX - 1) + +#endif /* _UAPI__LINUX_GENERIC_NETLINK_H */ diff --git a/src/basic/linux/hdlc/ioctl.h b/src/basic/linux/hdlc/ioctl.h new file mode 100644 index 0000000..b06341a --- /dev/null +++ b/src/basic/linux/hdlc/ioctl.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __HDLC_IOCTL_H__ +#define __HDLC_IOCTL_H__ + + +#define GENERIC_HDLC_VERSION 4 /* For synchronization with sethdlc utility */ + +#define CLOCK_DEFAULT 0 /* Default setting */ +#define CLOCK_EXT 1 /* External TX and RX clock - DTE */ +#define CLOCK_INT 2 /* Internal TX and RX clock - DCE */ +#define CLOCK_TXINT 3 /* Internal TX and external RX clock */ +#define CLOCK_TXFROMRX 4 /* TX clock derived from external RX clock */ + + +#define ENCODING_DEFAULT 0 /* Default setting */ +#define ENCODING_NRZ 1 +#define ENCODING_NRZI 2 +#define ENCODING_FM_MARK 3 +#define ENCODING_FM_SPACE 4 +#define ENCODING_MANCHESTER 5 + + +#define PARITY_DEFAULT 0 /* Default setting */ +#define PARITY_NONE 1 /* No parity */ +#define PARITY_CRC16_PR0 2 /* CRC16, initial value 0x0000 */ +#define PARITY_CRC16_PR1 3 /* CRC16, initial value 0xFFFF */ +#define PARITY_CRC16_PR0_CCITT 4 /* CRC16, initial 0x0000, ITU-T version */ +#define PARITY_CRC16_PR1_CCITT 5 /* 
CRC16, initial 0xFFFF, ITU-T version */ +#define PARITY_CRC32_PR0_CCITT 6 /* CRC32, initial value 0x00000000 */ +#define PARITY_CRC32_PR1_CCITT 7 /* CRC32, initial value 0xFFFFFFFF */ + +#define LMI_DEFAULT 0 /* Default setting */ +#define LMI_NONE 1 /* No LMI, all PVCs are static */ +#define LMI_ANSI 2 /* ANSI Annex D */ +#define LMI_CCITT 3 /* ITU-T Annex A */ +#define LMI_CISCO 4 /* The "original" LMI, aka Gang of Four */ + +#ifndef __ASSEMBLY__ + +typedef struct { + unsigned int clock_rate; /* bits per second */ + unsigned int clock_type; /* internal, external, TX-internal etc. */ + unsigned short loopback; +} sync_serial_settings; /* V.35, V.24, X.21 */ + +typedef struct { + unsigned int clock_rate; /* bits per second */ + unsigned int clock_type; /* internal, external, TX-internal etc. */ + unsigned short loopback; + unsigned int slot_map; +} te1_settings; /* T1, E1 */ + +typedef struct { + unsigned short encoding; + unsigned short parity; +} raw_hdlc_proto; + +typedef struct { + unsigned int t391; + unsigned int t392; + unsigned int n391; + unsigned int n392; + unsigned int n393; + unsigned short lmi; + unsigned short dce; /* 1 for DCE (network side) operation */ +} fr_proto; + +typedef struct { + unsigned int dlci; +} fr_proto_pvc; /* for creating/deleting FR PVCs */ + +typedef struct { + unsigned int dlci; + char master[IFNAMSIZ]; /* Name of master FRAD device */ +}fr_proto_pvc_info; /* for returning PVC information only */ + +typedef struct { + unsigned int interval; + unsigned int timeout; +} cisco_proto; + +typedef struct { + unsigned short dce; /* 1 for DCE (network side) operation */ + unsigned int modulo; /* modulo (8 = basic / 128 = extended) */ + unsigned int window; /* frame window size */ + unsigned int t1; /* timeout t1 */ + unsigned int t2; /* timeout t2 */ + unsigned int n2; /* frame retry counter */ +} x25_hdlc_proto; + +/* PPP doesn't need any info now - supply length = 0 to ioctl */ + +#endif /* __ASSEMBLY__ */ +#endif /* __HDLC_IOCTL_H__ 
*/ diff --git a/src/basic/linux/if.h b/src/basic/linux/if.h new file mode 100644 index 0000000..e79f5c8 --- /dev/null +++ b/src/basic/linux/if.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Global definitions for the INET interface module. + * + * Version: @(#)if.h 1.0.2 04/18/93 + * + * Authors: Original taken from Berkeley UNIX 4.3, (c) UCB 1982-1988 + * Ross Biro + * Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _LINUX_IF_H +#define _LINUX_IF_H + +#include <linux/libc-compat.h> /* for compatibility with glibc */ +#include <linux/types.h> /* for "__kernel_caddr_t" et al */ +#include <linux/socket.h> /* for "struct sockaddr" et al */ + +#ifndef __KERNEL__ +#include <sys/socket.h> /* for struct sockaddr. */ +#endif + +#if __UAPI_DEF_IF_IFNAMSIZ +#define IFNAMSIZ 16 +#endif /* __UAPI_DEF_IF_IFNAMSIZ */ +#define IFALIASZ 256 +#define ALTIFNAMSIZ 128 +#include <linux/hdlc/ioctl.h> + +/* For glibc compatibility. An empty enum does not compile. */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO != 0 || \ + __UAPI_DEF_IF_NET_DEVICE_FLAGS != 0 +/** + * enum net_device_flags - &struct net_device flags + * + * These are the &struct net_device flags, they can be set by drivers, the + * kernel and some can be triggered by userspace. Userspace can query and + * set these flags using userspace utilities but there is also a sysfs + * entry available for all dev flags which can be queried and set. These flags + * are shared for all types of net_devices. The sysfs entries are available + * via /sys/class/net/<dev>/flags.
Flags which can be toggled through sysfs + * are annotated below, note that only a few flags can be toggled and some + * other flags are always preserved from the original net_device flags + * even if you try to set them via sysfs. Flags which are always preserved + * are kept under the flag grouping @IFF_VOLATILE. Flags which are volatile + * are annotated below as such. + * + * You should have a pretty good reason to be extending these flags. + * + * @IFF_UP: interface is up. Can be toggled through sysfs. + * @IFF_BROADCAST: broadcast address valid. Volatile. + * @IFF_DEBUG: turn on debugging. Can be toggled through sysfs. + * @IFF_LOOPBACK: is a loopback net. Volatile. + * @IFF_POINTOPOINT: interface has p-p link. Volatile. + * @IFF_NOTRAILERS: avoid use of trailers. Can be toggled through sysfs. + * Volatile. + * @IFF_RUNNING: interface RFC2863 OPER_UP. Volatile. + * @IFF_NOARP: no ARP protocol. Can be toggled through sysfs. Volatile. + * @IFF_PROMISC: receive all packets. Can be toggled through sysfs. + * @IFF_ALLMULTI: receive all multicast packets. Can be toggled through + * sysfs. + * @IFF_MASTER: master of a load balancer. Volatile. + * @IFF_SLAVE: slave of a load balancer. Volatile. + * @IFF_MULTICAST: Supports multicast. Can be toggled through sysfs. + * @IFF_PORTSEL: can set media type. Can be toggled through sysfs. + * @IFF_AUTOMEDIA: auto media select active. Can be toggled through sysfs. + * @IFF_DYNAMIC: dialup device with changing addresses. Can be toggled + * through sysfs. + * @IFF_LOWER_UP: driver signals L1 up. Volatile. + * @IFF_DORMANT: driver signals dormant. Volatile. + * @IFF_ECHO: echo sent packets. Volatile.
+ */ +enum net_device_flags { +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS + IFF_UP = 1<<0, /* sysfs */ + IFF_BROADCAST = 1<<1, /* volatile */ + IFF_DEBUG = 1<<2, /* sysfs */ + IFF_LOOPBACK = 1<<3, /* volatile */ + IFF_POINTOPOINT = 1<<4, /* volatile */ + IFF_NOTRAILERS = 1<<5, /* sysfs */ + IFF_RUNNING = 1<<6, /* volatile */ + IFF_NOARP = 1<<7, /* sysfs */ + IFF_PROMISC = 1<<8, /* sysfs */ + IFF_ALLMULTI = 1<<9, /* sysfs */ + IFF_MASTER = 1<<10, /* volatile */ + IFF_SLAVE = 1<<11, /* volatile */ + IFF_MULTICAST = 1<<12, /* sysfs */ + IFF_PORTSEL = 1<<13, /* sysfs */ + IFF_AUTOMEDIA = 1<<14, /* sysfs */ + IFF_DYNAMIC = 1<<15, /* sysfs */ +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO + IFF_LOWER_UP = 1<<16, /* volatile */ + IFF_DORMANT = 1<<17, /* volatile */ + IFF_ECHO = 1<<18, /* volatile */ +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ +}; +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO != 0 || __UAPI_DEF_IF_NET_DEVICE_FLAGS != 0 */ + +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS +#define IFF_UP IFF_UP +#define IFF_BROADCAST IFF_BROADCAST +#define IFF_DEBUG IFF_DEBUG +#define IFF_LOOPBACK IFF_LOOPBACK +#define IFF_POINTOPOINT IFF_POINTOPOINT +#define IFF_NOTRAILERS IFF_NOTRAILERS +#define IFF_RUNNING IFF_RUNNING +#define IFF_NOARP IFF_NOARP +#define IFF_PROMISC IFF_PROMISC +#define IFF_ALLMULTI IFF_ALLMULTI +#define IFF_MASTER IFF_MASTER +#define IFF_SLAVE IFF_SLAVE +#define IFF_MULTICAST IFF_MULTICAST +#define IFF_PORTSEL IFF_PORTSEL +#define IFF_AUTOMEDIA IFF_AUTOMEDIA +#define IFF_DYNAMIC IFF_DYNAMIC +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS */ + +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO +#define IFF_LOWER_UP IFF_LOWER_UP +#define IFF_DORMANT IFF_DORMANT +#define IFF_ECHO IFF_ECHO +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ + +#define IFF_VOLATILE 
(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) + +#define IF_GET_IFACE 0x0001 /* for querying only */ +#define IF_GET_PROTO 0x0002 + +/* For definitions see hdlc.h */ +#define IF_IFACE_V35 0x1000 /* V.35 serial interface */ +#define IF_IFACE_V24 0x1001 /* V.24 serial interface */ +#define IF_IFACE_X21 0x1002 /* X.21 serial interface */ +#define IF_IFACE_T1 0x1003 /* T1 telco serial interface */ +#define IF_IFACE_E1 0x1004 /* E1 telco serial interface */ +#define IF_IFACE_SYNC_SERIAL 0x1005 /* can't be set by software */ +#define IF_IFACE_X21D 0x1006 /* X.21 Dual Clocking (FarSite) */ + +/* For definitions see hdlc.h */ +#define IF_PROTO_HDLC 0x2000 /* raw HDLC protocol */ +#define IF_PROTO_PPP 0x2001 /* PPP protocol */ +#define IF_PROTO_CISCO 0x2002 /* Cisco HDLC protocol */ +#define IF_PROTO_FR 0x2003 /* Frame Relay protocol */ +#define IF_PROTO_FR_ADD_PVC 0x2004 /* Create FR PVC */ +#define IF_PROTO_FR_DEL_PVC 0x2005 /* Delete FR PVC */ +#define IF_PROTO_X25 0x2006 /* X.25 */ +#define IF_PROTO_HDLC_ETH 0x2007 /* raw HDLC, Ethernet emulation */ +#define IF_PROTO_FR_ADD_ETH_PVC 0x2008 /* Create FR Ethernet-bridged PVC */ +#define IF_PROTO_FR_DEL_ETH_PVC 0x2009 /* Delete FR Ethernet-bridged PVC */ +#define IF_PROTO_FR_PVC 0x200A /* for reading PVC status */ +#define IF_PROTO_FR_ETH_PVC 0x200B +#define IF_PROTO_RAW 0x200C /* RAW Socket */ + +/* RFC 2863 operational status */ +enum { + IF_OPER_UNKNOWN, + IF_OPER_NOTPRESENT, + IF_OPER_DOWN, + IF_OPER_LOWERLAYERDOWN, + IF_OPER_TESTING, + IF_OPER_DORMANT, + IF_OPER_UP, +}; + +/* link modes */ +enum { + IF_LINK_MODE_DEFAULT, + IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ +}; + +/* + * Device mapping structure. 
I'd just gone off and designed a + * beautiful scheme using only loadable modules with arguments + * for driver options and along come the PCMCIA people 8) + * + * Ah well. The get() side of this is good for WDSETUP, and it'll + * be handy for debugging things. The set side is fine for now and + * being very small might be worth keeping for clean configuration. + */ + +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_IFMAP +struct ifmap { + unsigned long mem_start; + unsigned long mem_end; + unsigned short base_addr; + unsigned char irq; + unsigned char dma; + unsigned char port; + /* 3 bytes spare */ +}; +#endif /* __UAPI_DEF_IF_IFMAP */ + +struct if_settings { + unsigned int type; /* Type of physical device or protocol */ + unsigned int size; /* Size of the data allocated by the caller */ + union { + /* {atm/eth/dsl}_settings anyone ? */ + raw_hdlc_proto *raw_hdlc; + cisco_proto *cisco; + fr_proto *fr; + fr_proto_pvc *fr_pvc; + fr_proto_pvc_info *fr_pvc_info; + x25_hdlc_proto *x25; + + /* interface settings */ + sync_serial_settings *sync; + te1_settings *te1; + } ifs_ifsu; +}; + +/* + * Interface request structure used for socket + * ioctl's. All interface ioctl's must have parameter + * definitions which begin with ifr_name. The + * remainder may be interface specific. + */ + +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_IFREQ +struct ifreq { +#define IFHWADDRLEN 6 + union + { + char ifrn_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ + } ifr_ifrn; + + union { + struct sockaddr ifru_addr; + struct sockaddr ifru_dstaddr; + struct sockaddr ifru_broadaddr; + struct sockaddr ifru_netmask; + struct sockaddr ifru_hwaddr; + short ifru_flags; + int ifru_ivalue; + int ifru_mtu; + struct ifmap ifru_map; + char ifru_slave[IFNAMSIZ]; /* Just fits the size */ + char ifru_newname[IFNAMSIZ]; + void * ifru_data; + struct if_settings ifru_settings; + } ifr_ifru; +}; +#endif /* __UAPI_DEF_IF_IFREQ */ + +#define ifr_name ifr_ifrn.ifrn_name /* interface name */ +#define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */ +#define ifr_addr ifr_ifru.ifru_addr /* address */ +#define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-p lnk */ +#define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */ +#define ifr_netmask ifr_ifru.ifru_netmask /* interface net mask */ +#define ifr_flags ifr_ifru.ifru_flags /* flags */ +#define ifr_metric ifr_ifru.ifru_ivalue /* metric */ +#define ifr_mtu ifr_ifru.ifru_mtu /* mtu */ +#define ifr_map ifr_ifru.ifru_map /* device map */ +#define ifr_slave ifr_ifru.ifru_slave /* slave device */ +#define ifr_data ifr_ifru.ifru_data /* for use by interface */ +#define ifr_ifindex ifr_ifru.ifru_ivalue /* interface index */ +#define ifr_bandwidth ifr_ifru.ifru_ivalue /* link bandwidth */ +#define ifr_qlen ifr_ifru.ifru_ivalue /* Queue length */ +#define ifr_newname ifr_ifru.ifru_newname /* New name */ +#define ifr_settings ifr_ifru.ifru_settings /* Device/proto settings*/ + +/* + * Structure used in SIOCGIFCONF request. + * Used to retrieve interface configuration + * for machine (useful for programs which + * must know all networks accessible). 
+ */ + +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_IFCONF +struct ifconf { + int ifc_len; /* size of buffer */ + union { + char *ifcu_buf; + struct ifreq *ifcu_req; + } ifc_ifcu; +}; +#endif /* __UAPI_DEF_IF_IFCONF */ + +#define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ +#define ifc_req ifc_ifcu.ifcu_req /* array of structures */ + +#endif /* _LINUX_IF_H */ diff --git a/src/basic/linux/if_addr.h b/src/basic/linux/if_addr.h new file mode 100644 index 0000000..1c392dd --- /dev/null +++ b/src/basic/linux/if_addr.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_IF_ADDR_H +#define __LINUX_IF_ADDR_H + +#include <linux/types.h> +#include <linux/netlink.h> + +struct ifaddrmsg { + __u8 ifa_family; + __u8 ifa_prefixlen; /* The prefix length */ + __u8 ifa_flags; /* Flags */ + __u8 ifa_scope; /* Address scope */ + __u32 ifa_index; /* Link index */ +}; + +/* + * Important comment: + * IFA_ADDRESS is prefix address, rather than local interface address. + * It makes no difference for normally configured broadcast interfaces, + * but for point-to-point IFA_ADDRESS is DESTINATION address, + * local address is supplied in IFA_LOCAL attribute. + * + * IFA_FLAGS is a u32 attribute that extends the u8 field ifa_flags. + * If present, the value from struct ifaddrmsg will be ignored.
+ */ +enum { + IFA_UNSPEC, + IFA_ADDRESS, + IFA_LOCAL, + IFA_LABEL, + IFA_BROADCAST, + IFA_ANYCAST, + IFA_CACHEINFO, + IFA_MULTICAST, + IFA_FLAGS, + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ + IFA_TARGET_NETNSID, + IFA_PROTO, /* u8, address protocol */ + __IFA_MAX, +}; + +#define IFA_MAX (__IFA_MAX - 1) + +/* ifa_flags */ +#define IFA_F_SECONDARY 0x01 +#define IFA_F_TEMPORARY IFA_F_SECONDARY + +#define IFA_F_NODAD 0x02 +#define IFA_F_OPTIMISTIC 0x04 +#define IFA_F_DADFAILED 0x08 +#define IFA_F_HOMEADDRESS 0x10 +#define IFA_F_DEPRECATED 0x20 +#define IFA_F_TENTATIVE 0x40 +#define IFA_F_PERMANENT 0x80 +#define IFA_F_MANAGETEMPADDR 0x100 +#define IFA_F_NOPREFIXROUTE 0x200 +#define IFA_F_MCAUTOJOIN 0x400 +#define IFA_F_STABLE_PRIVACY 0x800 + +struct ifa_cacheinfo { + __u32 ifa_prefered; + __u32 ifa_valid; + __u32 cstamp; /* created timestamp, hundredths of seconds */ + __u32 tstamp; /* updated timestamp, hundredths of seconds */ +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifaddrmsg)))) +#define IFA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifaddrmsg)) +#endif + +/* ifa_proto */ +#define IFAPROT_UNSPEC 0 +#define IFAPROT_KERNEL_LO 1 /* loopback */ +#define IFAPROT_KERNEL_RA 2 /* set by kernel from router announcement */ +#define IFAPROT_KERNEL_LL 3 /* link-local set by kernel */ + +#endif diff --git a/src/basic/linux/if_bonding.h b/src/basic/linux/if_bonding.h new file mode 100644 index 0000000..d174914 --- /dev/null +++ b/src/basic/linux/if_bonding.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ +/* + * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. + * + * + * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes + * NCM: Network and Communications Management, Inc. 
+ * + * BUT, I'm the one who modified it for ethernet, so: + * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov + * + * This software may be used and distributed according to the terms + * of the GNU Public License, incorporated herein by reference. + * + * 2003/03/18 - Amir Noam + * - Added support for getting slave's speed and duplex via ethtool. + * Needed for 802.3ad and other future modes. + * + * 2003/03/18 - Tsippy Mendelson and + * Shmulik Hen + * - Enable support of modes that need to use the unique mac address of + * each slave. + * + * 2003/03/18 - Tsippy Mendelson and + * Amir Noam + * - Moved driver's private data types to bonding.h + * + * 2003/03/18 - Amir Noam , + * Tsippy Mendelson and + * Shmulik Hen + * - Added support for IEEE 802.3ad Dynamic link aggregation mode. + * + * 2003/05/01 - Amir Noam + * - Added ABI version control to restore compatibility between + * new/old ifenslave and new/old bonding. + * + * 2003/12/01 - Shmulik Hen + * - Code cleanup and style changes + * + * 2005/05/05 - Jason Gabler + * - added definitions for various XOR hashing policies + */ + +#ifndef _LINUX_IF_BONDING_H +#define _LINUX_IF_BONDING_H + +#include <linux/if.h> +#include <linux/types.h> +#include <linux/if_ether.h> + +/* userland - kernel ABI version (2003/05/08) */ +#define BOND_ABI_VERSION 2 + +/* + * We can remove these ioctl definitions in 2.5. 
People should use the + * SIOC*** versions of them instead + */ +#define BOND_ENSLAVE_OLD (SIOCDEVPRIVATE) +#define BOND_RELEASE_OLD (SIOCDEVPRIVATE + 1) +#define BOND_SETHWADDR_OLD (SIOCDEVPRIVATE + 2) +#define BOND_SLAVE_INFO_QUERY_OLD (SIOCDEVPRIVATE + 11) +#define BOND_INFO_QUERY_OLD (SIOCDEVPRIVATE + 12) +#define BOND_CHANGE_ACTIVE_OLD (SIOCDEVPRIVATE + 13) + +#define BOND_CHECK_MII_STATUS (SIOCGMIIPHY) + +#define BOND_MODE_ROUNDROBIN 0 +#define BOND_MODE_ACTIVEBACKUP 1 +#define BOND_MODE_XOR 2 +#define BOND_MODE_BROADCAST 3 +#define BOND_MODE_8023AD 4 +#define BOND_MODE_TLB 5 +#define BOND_MODE_ALB 6 /* TLB + RLB (receive load balancing) */ + +/* each slave's link has 4 states */ +#define BOND_LINK_UP 0 /* link is up and running */ +#define BOND_LINK_FAIL 1 /* link has just gone down */ +#define BOND_LINK_DOWN 2 /* link has been down for too long time */ +#define BOND_LINK_BACK 3 /* link is going back */ + +/* each slave has several states */ +#define BOND_STATE_ACTIVE 0 /* link is active */ +#define BOND_STATE_BACKUP 1 /* link is backup */ + +#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ + +#define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ + +#define BOND_DEFAULT_RESEND_IGMP 1 /* Default number of IGMP membership reports */ + +/* hashing types */ +#define BOND_XMIT_POLICY_LAYER2 0 /* layer 2 (MAC only), default */ +#define BOND_XMIT_POLICY_LAYER34 1 /* layer 3+4 (IP ^ (TCP || UDP)) */ +#define BOND_XMIT_POLICY_LAYER23 2 /* layer 2+3 (IP ^ MAC) */ +#define BOND_XMIT_POLICY_ENCAP23 3 /* encapsulated layer 2+3 */ +#define BOND_XMIT_POLICY_ENCAP34 4 /* encapsulated layer 3+4 */ +#define BOND_XMIT_POLICY_VLAN_SRCMAC 5 /* vlan + source MAC */ + +/* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */ +#define LACP_STATE_LACP_ACTIVITY 0x1 +#define LACP_STATE_LACP_TIMEOUT 0x2 +#define LACP_STATE_AGGREGATION 0x4 +#define LACP_STATE_SYNCHRONIZATION 0x8 +#define LACP_STATE_COLLECTING 
0x10 +#define LACP_STATE_DISTRIBUTING 0x20 +#define LACP_STATE_DEFAULTED 0x40 +#define LACP_STATE_EXPIRED 0x80 + +typedef struct ifbond { + __s32 bond_mode; + __s32 num_slaves; + __s32 miimon; +} ifbond; + +typedef struct ifslave { + __s32 slave_id; /* Used as an IN param to the BOND_SLAVE_INFO_QUERY ioctl */ + char slave_name[IFNAMSIZ]; + __s8 link; + __s8 state; + __u32 link_failure_count; +} ifslave; + +struct ad_info { + __u16 aggregator_id; + __u16 ports; + __u16 actor_key; + __u16 partner_key; + __u8 partner_system[ETH_ALEN]; +}; + +/* Embedded inside LINK_XSTATS_TYPE_BOND */ +enum { + BOND_XSTATS_UNSPEC, + BOND_XSTATS_3AD, + __BOND_XSTATS_MAX +}; +#define BOND_XSTATS_MAX (__BOND_XSTATS_MAX - 1) + +/* Embedded inside BOND_XSTATS_3AD */ +enum { + BOND_3AD_STAT_LACPDU_RX, + BOND_3AD_STAT_LACPDU_TX, + BOND_3AD_STAT_LACPDU_UNKNOWN_RX, + BOND_3AD_STAT_LACPDU_ILLEGAL_RX, + BOND_3AD_STAT_MARKER_RX, + BOND_3AD_STAT_MARKER_TX, + BOND_3AD_STAT_MARKER_RESP_RX, + BOND_3AD_STAT_MARKER_RESP_TX, + BOND_3AD_STAT_MARKER_UNKNOWN_RX, + BOND_3AD_STAT_PAD, + __BOND_3AD_STAT_MAX +}; +#define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) + +#endif /* _LINUX_IF_BONDING_H */ diff --git a/src/basic/linux/if_bridge.h b/src/basic/linux/if_bridge.h new file mode 100644 index 0000000..d9de241 --- /dev/null +++ b/src/basic/linux/if_bridge.h @@ -0,0 +1,826 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _UAPI_LINUX_IF_BRIDGE_H +#define _UAPI_LINUX_IF_BRIDGE_H + +#include <linux/types.h> +#include <linux/if_ether.h> +#include <linux/in6.h> + +#define SYSFS_BRIDGE_ATTR "bridge" +#define SYSFS_BRIDGE_FDB "brforward" +#define SYSFS_BRIDGE_PORT_SUBDIR "brif" +#define SYSFS_BRIDGE_PORT_ATTR "brport" +#define SYSFS_BRIDGE_PORT_LINK "bridge" + +#define BRCTL_VERSION 1 + +#define BRCTL_GET_VERSION 0 +#define BRCTL_GET_BRIDGES 1 +#define BRCTL_ADD_BRIDGE 2 +#define BRCTL_DEL_BRIDGE 3 +#define BRCTL_ADD_IF 4 +#define BRCTL_DEL_IF 5 +#define BRCTL_GET_BRIDGE_INFO 6 +#define BRCTL_GET_PORT_LIST 7 +#define BRCTL_SET_BRIDGE_FORWARD_DELAY 8 +#define BRCTL_SET_BRIDGE_HELLO_TIME 9 +#define BRCTL_SET_BRIDGE_MAX_AGE 10 +#define BRCTL_SET_AGEING_TIME 11 +#define BRCTL_SET_GC_INTERVAL 12 +#define BRCTL_GET_PORT_INFO 13 +#define BRCTL_SET_BRIDGE_STP_STATE 14 +#define BRCTL_SET_BRIDGE_PRIORITY 15 +#define BRCTL_SET_PORT_PRIORITY 16 +#define BRCTL_SET_PATH_COST 17 +#define BRCTL_GET_FDB_ENTRIES 18 + +#define BR_STATE_DISABLED 0 +#define BR_STATE_LISTENING 1 +#define BR_STATE_LEARNING 2 +#define BR_STATE_FORWARDING 3 +#define BR_STATE_BLOCKING 4 + +struct __bridge_info { + __u64 designated_root; + __u64 bridge_id; + __u32 root_path_cost; + __u32 max_age; + __u32 hello_time; + __u32 forward_delay; + __u32 bridge_max_age; + __u32 bridge_hello_time; + __u32 bridge_forward_delay; + __u8 topology_change; + __u8 topology_change_detected; + __u8 root_port; + __u8 stp_enabled; + __u32 ageing_time; + __u32 gc_interval; + __u32 hello_timer_value; + __u32 tcn_timer_value; + __u32 topology_change_timer_value; + __u32 gc_timer_value; +}; + +struct __port_info { + __u64 designated_root; + __u64 designated_bridge; + __u16 port_id; + __u16 designated_port; + __u32 path_cost; + __u32 designated_cost; + __u8 state; + __u8 top_change_ack; + __u8 config_pending; + __u8 unused0; + __u32 message_age_timer_value; + __u32 forward_delay_timer_value; + __u32 hold_timer_value; +}; + +struct __fdb_entry { + __u8 mac_addr[ETH_ALEN]; + __u8 port_no; + 
__u8 is_local; + __u32 ageing_timer_value; + __u8 port_hi; + __u8 pad0; + __u16 unused; +}; + +/* Bridge Flags */ +#define BRIDGE_FLAGS_MASTER 1 /* Bridge command to/from master */ +#define BRIDGE_FLAGS_SELF 2 /* Bridge command to/from lowerdev */ + +#define BRIDGE_MODE_VEB 0 /* Default loopback mode */ +#define BRIDGE_MODE_VEPA 1 /* 802.1Qbg defined VEPA mode */ +#define BRIDGE_MODE_UNDEF 0xFFFF /* mode undefined */ + +/* Bridge management nested attributes + * [IFLA_AF_SPEC] = { + * [IFLA_BRIDGE_FLAGS] + * [IFLA_BRIDGE_MODE] + * [IFLA_BRIDGE_VLAN_INFO] + * } + */ +enum { + IFLA_BRIDGE_FLAGS, + IFLA_BRIDGE_MODE, + IFLA_BRIDGE_VLAN_INFO, + IFLA_BRIDGE_VLAN_TUNNEL_INFO, + IFLA_BRIDGE_MRP, + IFLA_BRIDGE_CFM, + IFLA_BRIDGE_MST, + __IFLA_BRIDGE_MAX, +}; +#define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) + +#define BRIDGE_VLAN_INFO_MASTER (1<<0) /* Operate on Bridge device as well */ +#define BRIDGE_VLAN_INFO_PVID (1<<1) /* VLAN is PVID, ingress untagged */ +#define BRIDGE_VLAN_INFO_UNTAGGED (1<<2) /* VLAN egresses untagged */ +#define BRIDGE_VLAN_INFO_RANGE_BEGIN (1<<3) /* VLAN is start of vlan range */ +#define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */ +#define BRIDGE_VLAN_INFO_BRENTRY (1<<5) /* Global bridge VLAN entry */ +#define BRIDGE_VLAN_INFO_ONLY_OPTS (1<<6) /* Skip create/delete/flags */ + +struct bridge_vlan_info { + __u16 flags; + __u16 vid; +}; + +enum { + IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC, + IFLA_BRIDGE_VLAN_TUNNEL_ID, + IFLA_BRIDGE_VLAN_TUNNEL_VID, + IFLA_BRIDGE_VLAN_TUNNEL_FLAGS, + __IFLA_BRIDGE_VLAN_TUNNEL_MAX, +}; + +#define IFLA_BRIDGE_VLAN_TUNNEL_MAX (__IFLA_BRIDGE_VLAN_TUNNEL_MAX - 1) + +struct bridge_vlan_xstats { + __u64 rx_bytes; + __u64 rx_packets; + __u64 tx_bytes; + __u64 tx_packets; + __u16 vid; + __u16 flags; + __u32 pad2; +}; + +enum { + IFLA_BRIDGE_MRP_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE, + IFLA_BRIDGE_MRP_PORT_STATE, + IFLA_BRIDGE_MRP_PORT_ROLE, + IFLA_BRIDGE_MRP_RING_STATE, + IFLA_BRIDGE_MRP_RING_ROLE, + 
IFLA_BRIDGE_MRP_START_TEST, + IFLA_BRIDGE_MRP_INFO, + IFLA_BRIDGE_MRP_IN_ROLE, + IFLA_BRIDGE_MRP_IN_STATE, + IFLA_BRIDGE_MRP_START_IN_TEST, + __IFLA_BRIDGE_MRP_MAX, +}; + +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_INSTANCE_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE_RING_ID, + IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_PRIO, + __IFLA_BRIDGE_MRP_INSTANCE_MAX, +}; + +#define IFLA_BRIDGE_MRP_INSTANCE_MAX (__IFLA_BRIDGE_MRP_INSTANCE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_STATE_STATE, + __IFLA_BRIDGE_MRP_PORT_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_STATE_MAX (__IFLA_BRIDGE_MRP_PORT_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_ROLE_ROLE, + __IFLA_BRIDGE_MRP_PORT_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_ROLE_MAX (__IFLA_BRIDGE_MRP_PORT_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_STATE_UNSPEC, + IFLA_BRIDGE_MRP_RING_STATE_RING_ID, + IFLA_BRIDGE_MRP_RING_STATE_STATE, + __IFLA_BRIDGE_MRP_RING_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_STATE_MAX (__IFLA_BRIDGE_MRP_RING_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_RING_ROLE_RING_ID, + IFLA_BRIDGE_MRP_RING_ROLE_ROLE, + __IFLA_BRIDGE_MRP_RING_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_ROLE_MAX (__IFLA_BRIDGE_MRP_RING_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_TEST_RING_ID, + IFLA_BRIDGE_MRP_START_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_TEST_PERIOD, + IFLA_BRIDGE_MRP_START_TEST_MONITOR, + __IFLA_BRIDGE_MRP_START_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_INFO_UNSPEC, + IFLA_BRIDGE_MRP_INFO_RING_ID, + IFLA_BRIDGE_MRP_INFO_P_IFINDEX, + IFLA_BRIDGE_MRP_INFO_S_IFINDEX, + IFLA_BRIDGE_MRP_INFO_PRIO, + IFLA_BRIDGE_MRP_INFO_RING_STATE, + 
IFLA_BRIDGE_MRP_INFO_RING_ROLE, + IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + IFLA_BRIDGE_MRP_INFO_I_IFINDEX, + IFLA_BRIDGE_MRP_INFO_IN_STATE, + IFLA_BRIDGE_MRP_INFO_IN_ROLE, + IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, + __IFLA_BRIDGE_MRP_INFO_MAX, +}; + +#define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_IN_STATE_UNSPEC, + IFLA_BRIDGE_MRP_IN_STATE_IN_ID, + IFLA_BRIDGE_MRP_IN_STATE_STATE, + __IFLA_BRIDGE_MRP_IN_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_STATE_MAX (__IFLA_BRIDGE_MRP_IN_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_IN_ROLE_RING_ID, + IFLA_BRIDGE_MRP_IN_ROLE_IN_ID, + IFLA_BRIDGE_MRP_IN_ROLE_ROLE, + IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX, + __IFLA_BRIDGE_MRP_IN_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_ROLE_MAX (__IFLA_BRIDGE_MRP_IN_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID, + IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_IN_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_IN_TEST_MAX (__IFLA_BRIDGE_MRP_START_IN_TEST_MAX - 1) + +struct br_mrp_instance { + __u32 ring_id; + __u32 p_ifindex; + __u32 s_ifindex; + __u16 prio; +}; + +struct br_mrp_ring_state { + __u32 ring_id; + __u32 ring_state; +}; + +struct br_mrp_ring_role { + __u32 ring_id; + __u32 ring_role; +}; + +struct br_mrp_start_test { + __u32 ring_id; + __u32 interval; + __u32 max_miss; + __u32 period; + __u32 monitor; +}; + +struct br_mrp_in_state { + __u32 in_state; + __u16 in_id; +}; + +struct br_mrp_in_role { + __u32 ring_id; + __u32 in_role; + __u32 i_ifindex; + __u16 in_id; +}; + +struct br_mrp_start_in_test { + __u32 interval; + __u32 max_miss; + __u32 period; + __u16 in_id; +}; + +enum { + IFLA_BRIDGE_CFM_UNSPEC, + IFLA_BRIDGE_CFM_MEP_CREATE, + 
IFLA_BRIDGE_CFM_MEP_DELETE, + IFLA_BRIDGE_CFM_MEP_CONFIG, + IFLA_BRIDGE_CFM_CC_CONFIG, + IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD, + IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE, + IFLA_BRIDGE_CFM_CC_RDI, + IFLA_BRIDGE_CFM_CC_CCM_TX, + IFLA_BRIDGE_CFM_MEP_CREATE_INFO, + IFLA_BRIDGE_CFM_MEP_CONFIG_INFO, + IFLA_BRIDGE_CFM_CC_CONFIG_INFO, + IFLA_BRIDGE_CFM_CC_RDI_INFO, + IFLA_BRIDGE_CFM_CC_CCM_TX_INFO, + IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO, + IFLA_BRIDGE_CFM_MEP_STATUS_INFO, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO, + __IFLA_BRIDGE_CFM_MAX, +}; + +#define IFLA_BRIDGE_CFM_MAX (__IFLA_BRIDGE_CFM_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC, + IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, + IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, + IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, + IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, + __IFLA_BRIDGE_CFM_MEP_CREATE_MAX, +}; + +#define IFLA_BRIDGE_CFM_MEP_CREATE_MAX (__IFLA_BRIDGE_CFM_MEP_CREATE_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC, + IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE, + __IFLA_BRIDGE_CFM_MEP_DELETE_MAX, +}; + +#define IFLA_BRIDGE_CFM_MEP_DELETE_MAX (__IFLA_BRIDGE_CFM_MEP_DELETE_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC, + IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE, + IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC, + IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL, + IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID, + __IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, +}; + +#define IFLA_BRIDGE_CFM_MEP_CONFIG_MAX (__IFLA_BRIDGE_CFM_MEP_CONFIG_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC, + IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE, + IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE, + IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL, + IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID, + __IFLA_BRIDGE_CFM_CC_CONFIG_MAX, +}; + +#define IFLA_BRIDGE_CFM_CC_CONFIG_MAX (__IFLA_BRIDGE_CFM_CC_CONFIG_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC, + IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE, + IFLA_BRIDGE_CFM_CC_PEER_MEPID, + __IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, +}; + +#define IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX (__IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX - 1) + 
+enum { + IFLA_BRIDGE_CFM_CC_RDI_UNSPEC, + IFLA_BRIDGE_CFM_CC_RDI_INSTANCE, + IFLA_BRIDGE_CFM_CC_RDI_RDI, + __IFLA_BRIDGE_CFM_CC_RDI_MAX, +}; + +#define IFLA_BRIDGE_CFM_CC_RDI_MAX (__IFLA_BRIDGE_CFM_CC_RDI_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC, + IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE, + IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC, + IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE, + IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD, + IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV, + IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE, + IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV, + IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE, + __IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, +}; + +#define IFLA_BRIDGE_CFM_CC_CCM_TX_MAX (__IFLA_BRIDGE_CFM_CC_CCM_TX_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_MEP_STATUS_UNSPEC, + IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE, + IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN, + IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN, + IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN, + __IFLA_BRIDGE_CFM_MEP_STATUS_MAX, +}; + +#define IFLA_BRIDGE_CFM_MEP_STATUS_MAX (__IFLA_BRIDGE_CFM_MEP_STATUS_MAX - 1) + +enum { + IFLA_BRIDGE_CFM_CC_PEER_STATUS_UNSPEC, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN, + IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN, + __IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX, +}; + +#define IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX (__IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX - 1) + +enum { + IFLA_BRIDGE_MST_UNSPEC, + IFLA_BRIDGE_MST_ENTRY, + __IFLA_BRIDGE_MST_MAX, +}; +#define IFLA_BRIDGE_MST_MAX (__IFLA_BRIDGE_MST_MAX - 1) + +enum { + IFLA_BRIDGE_MST_ENTRY_UNSPEC, + IFLA_BRIDGE_MST_ENTRY_MSTI, + IFLA_BRIDGE_MST_ENTRY_STATE, + __IFLA_BRIDGE_MST_ENTRY_MAX, +}; +#define IFLA_BRIDGE_MST_ENTRY_MAX (__IFLA_BRIDGE_MST_ENTRY_MAX - 1) + +struct 
bridge_stp_xstats { + __u64 transition_blk; + __u64 transition_fwd; + __u64 rx_bpdu; + __u64 tx_bpdu; + __u64 rx_tcn; + __u64 tx_tcn; +}; + +/* Bridge vlan RTM header */ +struct br_vlan_msg { + __u8 family; + __u8 reserved1; + __u16 reserved2; + __u32 ifindex; +}; + +enum { + BRIDGE_VLANDB_DUMP_UNSPEC, + BRIDGE_VLANDB_DUMP_FLAGS, + __BRIDGE_VLANDB_DUMP_MAX, +}; +#define BRIDGE_VLANDB_DUMP_MAX (__BRIDGE_VLANDB_DUMP_MAX - 1) + +/* flags used in BRIDGE_VLANDB_DUMP_FLAGS attribute to affect dumps */ +#define BRIDGE_VLANDB_DUMPF_STATS (1 << 0) /* Include stats in the dump */ +#define BRIDGE_VLANDB_DUMPF_GLOBAL (1 << 1) /* Dump global vlan options only */ + +/* Bridge vlan RTM attributes + * [BRIDGE_VLANDB_ENTRY] = { + * [BRIDGE_VLANDB_ENTRY_INFO] + * ... + * } + * [BRIDGE_VLANDB_GLOBAL_OPTIONS] = { + * [BRIDGE_VLANDB_GOPTS_ID] + * ... + * } + */ +enum { + BRIDGE_VLANDB_UNSPEC, + BRIDGE_VLANDB_ENTRY, + BRIDGE_VLANDB_GLOBAL_OPTIONS, + __BRIDGE_VLANDB_MAX, +}; +#define BRIDGE_VLANDB_MAX (__BRIDGE_VLANDB_MAX - 1) + +enum { + BRIDGE_VLANDB_ENTRY_UNSPEC, + BRIDGE_VLANDB_ENTRY_INFO, + BRIDGE_VLANDB_ENTRY_RANGE, + BRIDGE_VLANDB_ENTRY_STATE, + BRIDGE_VLANDB_ENTRY_TUNNEL_INFO, + BRIDGE_VLANDB_ENTRY_STATS, + BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, + __BRIDGE_VLANDB_ENTRY_MAX, +}; +#define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) + +/* [BRIDGE_VLANDB_ENTRY] = { + * [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { + * [BRIDGE_VLANDB_TINFO_ID] + * ... + * } + * } + */ +enum { + BRIDGE_VLANDB_TINFO_UNSPEC, + BRIDGE_VLANDB_TINFO_ID, + BRIDGE_VLANDB_TINFO_CMD, + __BRIDGE_VLANDB_TINFO_MAX, +}; +#define BRIDGE_VLANDB_TINFO_MAX (__BRIDGE_VLANDB_TINFO_MAX - 1) + +/* [BRIDGE_VLANDB_ENTRY] = { + * [BRIDGE_VLANDB_ENTRY_STATS] = { + * [BRIDGE_VLANDB_STATS_RX_BYTES] + * ... + * } + * ... 
+ * } + */ +enum { + BRIDGE_VLANDB_STATS_UNSPEC, + BRIDGE_VLANDB_STATS_RX_BYTES, + BRIDGE_VLANDB_STATS_RX_PACKETS, + BRIDGE_VLANDB_STATS_TX_BYTES, + BRIDGE_VLANDB_STATS_TX_PACKETS, + BRIDGE_VLANDB_STATS_PAD, + __BRIDGE_VLANDB_STATS_MAX, +}; +#define BRIDGE_VLANDB_STATS_MAX (__BRIDGE_VLANDB_STATS_MAX - 1) + +enum { + BRIDGE_VLANDB_GOPTS_UNSPEC, + BRIDGE_VLANDB_GOPTS_ID, + BRIDGE_VLANDB_GOPTS_RANGE, + BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, + BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, + BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, + BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, + BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, + BRIDGE_VLANDB_GOPTS_PAD, + BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, + BRIDGE_VLANDB_GOPTS_MSTI, + __BRIDGE_VLANDB_GOPTS_MAX +}; +#define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) + +/* Bridge multicast database attributes + * [MDBA_MDB] = { + * [MDBA_MDB_ENTRY] = { + * [MDBA_MDB_ENTRY_INFO] { + * struct br_mdb_entry + * [MDBA_MDB_EATTR attributes] + * } + * } + * } + * [MDBA_ROUTER] = { + * [MDBA_ROUTER_PORT] = { + * u32 ifindex + * [MDBA_ROUTER_PATTR attributes] + * } + * } + */ +enum { + MDBA_UNSPEC, + MDBA_MDB, + MDBA_ROUTER, + __MDBA_MAX, +}; +#define MDBA_MAX (__MDBA_MAX - 1) + +enum { + MDBA_MDB_UNSPEC, + MDBA_MDB_ENTRY, + __MDBA_MDB_MAX, +}; +#define MDBA_MDB_MAX (__MDBA_MDB_MAX - 1) + +enum { + MDBA_MDB_ENTRY_UNSPEC, + MDBA_MDB_ENTRY_INFO, + __MDBA_MDB_ENTRY_MAX, +}; +#define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1) + +/* per mdb entry additional attributes */ +enum { + MDBA_MDB_EATTR_UNSPEC, + MDBA_MDB_EATTR_TIMER, + MDBA_MDB_EATTR_SRC_LIST, + MDBA_MDB_EATTR_GROUP_MODE, + 
MDBA_MDB_EATTR_SOURCE, + MDBA_MDB_EATTR_RTPROT, + __MDBA_MDB_EATTR_MAX +}; +#define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) + +/* per mdb entry source */ +enum { + MDBA_MDB_SRCLIST_UNSPEC, + MDBA_MDB_SRCLIST_ENTRY, + __MDBA_MDB_SRCLIST_MAX +}; +#define MDBA_MDB_SRCLIST_MAX (__MDBA_MDB_SRCLIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBA_MDB_SRCLIST_ENTRY + */ +enum { + MDBA_MDB_SRCATTR_UNSPEC, + MDBA_MDB_SRCATTR_ADDRESS, + MDBA_MDB_SRCATTR_TIMER, + __MDBA_MDB_SRCATTR_MAX +}; +#define MDBA_MDB_SRCATTR_MAX (__MDBA_MDB_SRCATTR_MAX - 1) + +/* multicast router types */ +enum { + MDB_RTR_TYPE_DISABLED, + MDB_RTR_TYPE_TEMP_QUERY, + MDB_RTR_TYPE_PERM, + MDB_RTR_TYPE_TEMP +}; + +enum { + MDBA_ROUTER_UNSPEC, + MDBA_ROUTER_PORT, + __MDBA_ROUTER_MAX, +}; +#define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1) + +/* router port attributes */ +enum { + MDBA_ROUTER_PATTR_UNSPEC, + MDBA_ROUTER_PATTR_TIMER, + MDBA_ROUTER_PATTR_TYPE, + MDBA_ROUTER_PATTR_INET_TIMER, + MDBA_ROUTER_PATTR_INET6_TIMER, + MDBA_ROUTER_PATTR_VID, + __MDBA_ROUTER_PATTR_MAX +}; +#define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1) + +struct br_port_msg { + __u8 family; + __u32 ifindex; +}; + +struct br_mdb_entry { + __u32 ifindex; +#define MDB_TEMPORARY 0 +#define MDB_PERMANENT 1 + __u8 state; +#define MDB_FLAGS_OFFLOAD (1 << 0) +#define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) + __u8 flags; + __u16 vid; + struct { + union { + __be32 ip4; + struct in6_addr ip6; + unsigned char mac_addr[ETH_ALEN]; + } u; + __be16 proto; + } addr; +}; + +enum { + MDBA_SET_ENTRY_UNSPEC, + MDBA_SET_ENTRY, + MDBA_SET_ENTRY_ATTRS, + __MDBA_SET_ENTRY_MAX, +}; +#define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1) + +/* [MDBA_SET_ENTRY_ATTRS] = { + * [MDBE_ATTR_xxx] + * ... 
+ * } + */ +enum { + MDBE_ATTR_UNSPEC, + MDBE_ATTR_SOURCE, + MDBE_ATTR_SRC_LIST, + MDBE_ATTR_GROUP_MODE, + MDBE_ATTR_RTPROT, + __MDBE_ATTR_MAX, +}; +#define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) + +/* per mdb entry source */ +enum { + MDBE_SRC_LIST_UNSPEC, + MDBE_SRC_LIST_ENTRY, + __MDBE_SRC_LIST_MAX, +}; +#define MDBE_SRC_LIST_MAX (__MDBE_SRC_LIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBE_SRC_LIST_ENTRY + */ +enum { + MDBE_SRCATTR_UNSPEC, + MDBE_SRCATTR_ADDRESS, + __MDBE_SRCATTR_MAX, +}; +#define MDBE_SRCATTR_MAX (__MDBE_SRCATTR_MAX - 1) + +/* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ +enum { + BRIDGE_XSTATS_UNSPEC, + BRIDGE_XSTATS_VLAN, + BRIDGE_XSTATS_MCAST, + BRIDGE_XSTATS_PAD, + BRIDGE_XSTATS_STP, + __BRIDGE_XSTATS_MAX +}; +#define BRIDGE_XSTATS_MAX (__BRIDGE_XSTATS_MAX - 1) + +enum { + BR_MCAST_DIR_RX, + BR_MCAST_DIR_TX, + BR_MCAST_DIR_SIZE +}; + +/* IGMP/MLD statistics */ +struct br_mcast_stats { + __u64 igmp_v1queries[BR_MCAST_DIR_SIZE]; + __u64 igmp_v2queries[BR_MCAST_DIR_SIZE]; + __u64 igmp_v3queries[BR_MCAST_DIR_SIZE]; + __u64 igmp_leaves[BR_MCAST_DIR_SIZE]; + __u64 igmp_v1reports[BR_MCAST_DIR_SIZE]; + __u64 igmp_v2reports[BR_MCAST_DIR_SIZE]; + __u64 igmp_v3reports[BR_MCAST_DIR_SIZE]; + __u64 igmp_parse_errors; + + __u64 mld_v1queries[BR_MCAST_DIR_SIZE]; + __u64 mld_v2queries[BR_MCAST_DIR_SIZE]; + __u64 mld_leaves[BR_MCAST_DIR_SIZE]; + __u64 mld_v1reports[BR_MCAST_DIR_SIZE]; + __u64 mld_v2reports[BR_MCAST_DIR_SIZE]; + __u64 mld_parse_errors; + + __u64 mcast_bytes[BR_MCAST_DIR_SIZE]; + __u64 mcast_packets[BR_MCAST_DIR_SIZE]; +}; + +/* bridge boolean options + * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets + * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping + * + * IMPORTANT: if adding a new option do not forget to handle + * it in br_boolopt_toggle/get and bridge sysfs + */ +enum br_boolopt_id { + BR_BOOLOPT_NO_LL_LEARN, + BR_BOOLOPT_MCAST_VLAN_SNOOPING, + BR_BOOLOPT_MST_ENABLE, 
+ BR_BOOLOPT_MAX +}; + +/* struct br_boolopt_multi - change multiple bridge boolean options + * + * @optval: new option values (bit per option) + * @optmask: options to change (bit per option) + */ +struct br_boolopt_multi { + __u32 optval; + __u32 optmask; +}; + +enum { + BRIDGE_QUERIER_UNSPEC, + BRIDGE_QUERIER_IP_ADDRESS, + BRIDGE_QUERIER_IP_PORT, + BRIDGE_QUERIER_IP_OTHER_TIMER, + BRIDGE_QUERIER_PAD, + BRIDGE_QUERIER_IPV6_ADDRESS, + BRIDGE_QUERIER_IPV6_PORT, + BRIDGE_QUERIER_IPV6_OTHER_TIMER, + __BRIDGE_QUERIER_MAX +}; +#define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1) +#endif /* _UAPI_LINUX_IF_BRIDGE_H */ diff --git a/src/basic/linux/if_ether.h b/src/basic/linux/if_ether.h new file mode 100644 index 0000000..69e0457 --- /dev/null +++ b/src/basic/linux/if_ether.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Global definitions for the Ethernet IEEE 802.3 interface. + * + * Version: @(#)if_ether.h 1.0.1a 02/08/94 + * + * Author: Fred N. van Kempen, + * Donald Becker, + * Alan Cox, + * Steve Whitehouse, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_IF_ETHER_H +#define _UAPI_LINUX_IF_ETHER_H + +#include <linux/types.h> + +/* + * IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble + * and FCS/CRC (frame check sequence). + */ + +#define ETH_ALEN 6 /* Octets in one ethernet addr */ +#define ETH_TLEN 2 /* Octets in ethernet type field */ +#define ETH_HLEN 14 /* Total octets in header. */ +#define ETH_ZLEN 60 /* Min. octets in frame sans FCS */ +#define ETH_DATA_LEN 1500 /* Max. 
octets in payload */ +#define ETH_FRAME_LEN 1514 /* Max. octets in frame sans FCS */ +#define ETH_FCS_LEN 4 /* Octets in the FCS */ + +#define ETH_MIN_MTU 68 /* Min IPv4 MTU per RFC791 */ +#define ETH_MAX_MTU 0xFFFFU /* 65535, same as IP_MAX_MTU */ + +/* + * These are the defined Ethernet Protocol ID's. + */ + +#define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ +#define ETH_P_PUP 0x0200 /* Xerox PUP packet */ +#define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ +#define ETH_P_ERSPAN2 0x22EB /* ERSPAN version 2 (type III) */ +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ +#define ETH_P_X25 0x0805 /* CCITT X.25 */ +#define ETH_P_ARP 0x0806 /* Address Resolution packet */ +#define ETH_P_BPQ 0x08FF /* G8BPQ AX.25 Ethernet Packet [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IEEEPUP 0x0a00 /* Xerox IEEE802.3 PUP packet */ +#define ETH_P_IEEEPUPAT 0x0a01 /* Xerox IEEE802.3 PUP Addr Trans packet */ +#define ETH_P_BATMAN 0x4305 /* B.A.T.M.A.N.-Advanced packet [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DEC 0x6000 /* DEC Assigned proto */ +#define ETH_P_DNA_DL 0x6001 /* DEC DNA Dump/Load */ +#define ETH_P_DNA_RC 0x6002 /* DEC DNA Remote Console */ +#define ETH_P_DNA_RT 0x6003 /* DEC DNA Routing */ +#define ETH_P_LAT 0x6004 /* DEC LAT */ +#define ETH_P_DIAG 0x6005 /* DEC Diagnostics */ +#define ETH_P_CUST 0x6006 /* DEC Customer use */ +#define ETH_P_SCA 0x6007 /* DEC Systems Comms Arch */ +#define ETH_P_TEB 0x6558 /* Trans Ether Bridging */ +#define ETH_P_RARP 0x8035 /* Reverse Addr Res packet */ +#define ETH_P_ATALK 0x809B /* Appletalk DDP */ +#define ETH_P_AARP 0x80F3 /* Appletalk AARP */ +#define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */ +#define ETH_P_IPX 0x8137 /* IPX over DIX */ +#define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ +#define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. 
See 802.3 31B */ +#define ETH_P_SLOW 0x8809 /* Slow Protocol. See 802.3ad 43B */ +#define ETH_P_WCCP 0x883E /* Web-cache coordination protocol + * defined in draft-wilson-wrec-wccp-v2-00.txt */ +#define ETH_P_MPLS_UC 0x8847 /* MPLS Unicast traffic */ +#define ETH_P_MPLS_MC 0x8848 /* MPLS Multicast traffic */ +#define ETH_P_ATMMPOA 0x884c /* MultiProtocol Over ATM */ +#define ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages */ +#define ETH_P_PPP_SES 0x8864 /* PPPoE session messages */ +#define ETH_P_LINK_CTL 0x886c /* HPNA, wlan link local tunnel */ +#define ETH_P_ATMFATE 0x8884 /* Frame-based ATM Transport + * over Ethernet + */ +#define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ +#define ETH_P_PROFINET 0x8892 /* PROFINET */ +#define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ +#define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ +#define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */ +#define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ +#define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. 
*/ +#define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ +#define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ +#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */ +#define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ +#define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ +#define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ +#define ETH_P_1588 0x88F7 /* IEEE 1588 Timesync */ +#define ETH_P_NCSI 0x88F8 /* NCSI protocol */ +#define ETH_P_PRP 0x88FB /* IEC 62439-3 PRP/HSRv0 */ +#define ETH_P_CFM 0x8902 /* Connectivity Fault Management */ +#define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ +#define ETH_P_IBOE 0x8915 /* Infiniband over Ethernet */ +#define ETH_P_TDLS 0x890D /* TDLS */ +#define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ +#define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */ +#define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */ +#define ETH_P_NSH 0x894F /* Network Service Header */ +#define ETH_P_LOOPBACK 0x9000 /* Ethernet loopback packet, per IEEE 802.3 */ +#define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_A5PSW 0xE001 /* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ +#define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ + +#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is more than this value + * then the frame is Ethernet II. Else it is 802.3 */ + +/* + * Non DIX types. Won't clash for 1500 types. 
+ */ + +#define ETH_P_802_3 0x0001 /* Dummy type for 802.3 frames */ +#define ETH_P_AX25 0x0002 /* Dummy protocol id for AX.25 */ +#define ETH_P_ALL 0x0003 /* Every packet (be careful!!!) */ +#define ETH_P_802_2 0x0004 /* 802.2 frames */ +#define ETH_P_SNAP 0x0005 /* Internal only */ +#define ETH_P_DDCMP 0x0006 /* DEC DDCMP: Internal only */ +#define ETH_P_WAN_PPP 0x0007 /* Dummy type for WAN PPP frames*/ +#define ETH_P_PPP_MP 0x0008 /* Dummy type for PPP MP frames */ +#define ETH_P_LOCALTALK 0x0009 /* Localtalk pseudo type */ +#define ETH_P_CAN 0x000C /* CAN: Controller Area Network */ +#define ETH_P_CANFD 0x000D /* CANFD: CAN flexible data rate*/ +#define ETH_P_CANXL 0x000E /* CANXL: eXtended frame Length */ +#define ETH_P_PPPTALK 0x0010 /* Dummy type for Atalk over PPP*/ +#define ETH_P_TR_802_2 0x0011 /* 802.2 frames */ +#define ETH_P_MOBITEX 0x0015 /* Mobitex (kaz@cafe.net) */ +#define ETH_P_CONTROL 0x0016 /* Card specific control frames */ +#define ETH_P_IRDA 0x0017 /* Linux-IrDA */ +#define ETH_P_ECONET 0x0018 /* Acorn Econet */ +#define ETH_P_HDLC 0x0019 /* HDLC frames */ +#define ETH_P_ARCNET 0x001A /* 1A for ArcNet :-) */ +#define ETH_P_DSA 0x001B /* Distributed Switch Arch. */ +#define ETH_P_TRAILER 0x001C /* Trailer switch tagging */ +#define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ +#define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ +#define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ +#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ +#define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and + * aggregation protocol + */ +#define ETH_P_MCTP 0x00FA /* Management component transport + * protocol packets + */ + +/* + * This is an Ethernet frame header. + */ + +/* allow libcs like musl to deactivate this, glibc does not implement this. 
*/ +#ifndef __UAPI_DEF_ETHHDR +#define __UAPI_DEF_ETHHDR 1 +#endif + +#if __UAPI_DEF_ETHHDR +struct ethhdr { + unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ + unsigned char h_source[ETH_ALEN]; /* source ether addr */ + __be16 h_proto; /* packet type ID field */ +} __attribute__((packed)); +#endif + + +#endif /* _UAPI_LINUX_IF_ETHER_H */ diff --git a/src/basic/linux/if_link.h b/src/basic/linux/if_link.h new file mode 100644 index 0000000..1021a7e --- /dev/null +++ b/src/basic/linux/if_link.h @@ -0,0 +1,1392 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IF_LINK_H +#define _UAPI_LINUX_IF_LINK_H + +#include <linux/types.h> +#include <linux/netlink.h> + +/* This struct should be in sync with struct rtnl_link_stats64 */ +struct rtnl_link_stats { + __u32 rx_packets; + __u32 tx_packets; + __u32 rx_bytes; + __u32 tx_bytes; + __u32 rx_errors; + __u32 tx_errors; + __u32 rx_dropped; + __u32 tx_dropped; + __u32 multicast; + __u32 collisions; + /* detailed rx_errors: */ + __u32 rx_length_errors; + __u32 rx_over_errors; + __u32 rx_crc_errors; + __u32 rx_frame_errors; + __u32 rx_fifo_errors; + __u32 rx_missed_errors; + + /* detailed tx_errors */ + __u32 tx_aborted_errors; + __u32 tx_carrier_errors; + __u32 tx_fifo_errors; + __u32 tx_heartbeat_errors; + __u32 tx_window_errors; + + /* for cslip etc */ + __u32 rx_compressed; + __u32 tx_compressed; + + __u32 rx_nohandler; +}; + +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted.
+ * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. + * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counted by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. + * For hardware interfaces this counter may include packets discarded + * due to L2 address filtering but should not include packets dropped + * by the device due to buffer exhaustion which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host.
+ * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistic was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events.
Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * This statistic was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`.
+ * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counter is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counter is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. + */ +struct rtnl_link_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; + __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; +}; + +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64.
+ */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; +}; + +/* The struct should be in sync with struct ifmap */ +struct rtnl_link_ifmap { + __u64 mem_start; + __u64 mem_end; + __u64 base_addr; + __u16 irq; + __u8 dma; + __u8 port; +}; + +/* + * IFLA_AF_SPEC + * Contains nested attributes for address family specific attributes. + * Each address family may create a attribute with the address family + * number as type and create its own attribute structure in it. + * + * Example: + * [IFLA_AF_SPEC] = { + * [AF_INET] = { + * [IFLA_INET_CONF] = ..., + * }, + * [AF_INET6] = { + * [IFLA_INET6_FLAGS] = ..., + * [IFLA_INET6_CONF] = ..., + * } + * } + */ + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + 
IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + + /* device (sysfs) name as parent, used instead + * of IFLA_LINK where there's no parent netdev + */ + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + + IFLA_DEVLINK_PORT, + + __IFLA_MAX +}; + + +#define IFLA_MAX (__IFLA_MAX - 1) + +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) +#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) +#endif + +enum { + IFLA_INET_UNSPEC, + IFLA_INET_CONF, + __IFLA_INET_MAX, +}; + +#define IFLA_INET_MAX (__IFLA_INET_MAX - 1) + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neither of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. 
+ IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* IFLA_LINK. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. tunnel), ifi_link + can point to real physical interface (f.e. for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +/* Subtype attributes for IFLA_PROTINFO */ +enum { + IFLA_INET6_UNSPEC, + IFLA_INET6_FLAGS, /* link flags */ + IFLA_INET6_CONF, /* sysctl parameters */ + IFLA_INET6_STATS, /* statistics */ + IFLA_INET6_MCAST, /* MC things. What of them? */ + IFLA_INET6_CACHEINFO, /* time values and max reasm size */ + IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ + IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ + __IFLA_INET6_MAX +}; + +#define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) + +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, +}; + +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + 
IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, + IFLA_BR_PAD, + IFLA_BR_VLAN_STATS_ENABLED, + IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + +enum { + BRIDGE_MODE_UNSPEC, + BRIDGE_MODE_HAIRPIN, +}; + +enum { + IFLA_BRPORT_UNSPEC, + IFLA_BRPORT_STATE, /* Spanning tree state */ + IFLA_BRPORT_PRIORITY, /* " priority */ + IFLA_BRPORT_COST, /* " cost */ + IFLA_BRPORT_MODE, /* mode (hairpin) */ + IFLA_BRPORT_GUARD, /* bpdu guard */ + IFLA_BRPORT_PROTECT, /* root port protection */ + IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, + IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, + IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, + 
IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, + __IFLA_BRPORT_MAX +}; +#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) + +struct ifla_cacheinfo { + __u32 max_reasm_len; + __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ + __u32 reachable_time; + __u32 retrans_time; +}; + +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND, + IFLA_INFO_DATA, + IFLA_INFO_XSTATS, + IFLA_INFO_SLAVE_KIND, + IFLA_INFO_SLAVE_DATA, + __IFLA_INFO_MAX, +}; + +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* VLAN section */ + +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) + +struct ifla_vlan_flags { + __u32 flags; + __u32 mask; +}; + +enum { + IFLA_VLAN_QOS_UNSPEC, + IFLA_VLAN_QOS_MAPPING, + __IFLA_VLAN_QOS_MAX +}; + +#define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) + +struct ifla_vlan_qos_mapping { + __u32 from; + __u32 to; +}; + +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, + IFLA_MACVLAN_BC_QUEUE_LEN, + IFLA_MACVLAN_BC_QUEUE_LEN_USED, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ + MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, +}; + +#define MACVLAN_FLAG_NOPROMISC 
1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ + +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + +/* IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, + IPVLAN_MODE_MAX +}; + +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 flags; + __u16 reserved2; + __u32 ifindex; +}; + +/* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS 
TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + VXLAN_VNIFILTER_ENTRY_STATS, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + +enum { + IFLA_VXLAN_UNSPEC, + IFLA_VXLAN_ID, + IFLA_VXLAN_GROUP, /* group or remote address */ + IFLA_VXLAN_LINK, + IFLA_VXLAN_LOCAL, + IFLA_VXLAN_TTL, + IFLA_VXLAN_TOS, + IFLA_VXLAN_LEARNING, + IFLA_VXLAN_AGEING, + IFLA_VXLAN_LIMIT, + IFLA_VXLAN_PORT_RANGE, /* source port */ + IFLA_VXLAN_PROXY, + IFLA_VXLAN_RSC, + IFLA_VXLAN_L2MISS, + IFLA_VXLAN_L3MISS, + IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, + IFLA_VXLAN_UDP_CSUM, + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, + IFLA_VXLAN_GBP, + IFLA_VXLAN_REMCSUM_NOPARTIAL, + IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + __IFLA_VXLAN_MAX +}; +#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) + +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, 
+ __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, + IFLA_GENEVE_INNER_PROTO_INHERIT, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + +/* Bareudp section */ +enum { + IFLA_BAREUDP_UNSPEC, + IFLA_BAREUDP_PORT, + IFLA_BAREUDP_ETHERTYPE, + IFLA_BAREUDP_SRCPORT_MIN, + IFLA_BAREUDP_MULTIPROTO_MODE, + __IFLA_BAREUDP_MAX +}; + +#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) + +/* PPP section */ +enum { + IFLA_PPP_UNSPEC, + IFLA_PPP_DEV_FD, + __IFLA_PPP_MAX +}; +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) + +/* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + +enum { + IFLA_GTP_UNSPEC, + IFLA_GTP_FD0, + IFLA_GTP_FD1, + IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, + IFLA_GTP_CREATE_SOCKETS, + IFLA_GTP_RESTART_COUNT, + __IFLA_GTP_MAX, +}; +#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) + +/* Bonding section */ + +enum { + IFLA_BOND_UNSPEC, + IFLA_BOND_MODE, + IFLA_BOND_ACTIVE_SLAVE, + IFLA_BOND_MIIMON, + IFLA_BOND_UPDELAY, + IFLA_BOND_DOWNDELAY, + IFLA_BOND_USE_CARRIER, + IFLA_BOND_ARP_INTERVAL, + IFLA_BOND_ARP_IP_TARGET, + IFLA_BOND_ARP_VALIDATE, + IFLA_BOND_ARP_ALL_TARGETS, + IFLA_BOND_PRIMARY, + IFLA_BOND_PRIMARY_RESELECT, + IFLA_BOND_FAIL_OVER_MAC, + IFLA_BOND_XMIT_HASH_POLICY, + IFLA_BOND_RESEND_IGMP, + IFLA_BOND_NUM_PEER_NOTIF, + IFLA_BOND_ALL_SLAVES_ACTIVE, + IFLA_BOND_MIN_LINKS, + IFLA_BOND_LP_INTERVAL, + IFLA_BOND_PACKETS_PER_SLAVE, + IFLA_BOND_AD_LACP_RATE, + IFLA_BOND_AD_SELECT, + 
IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, + IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, + __IFLA_BOND_MAX, +}; + +#define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) + +enum { + IFLA_BOND_AD_INFO_UNSPEC, + IFLA_BOND_AD_INFO_AGGREGATOR, + IFLA_BOND_AD_INFO_NUM_PORTS, + IFLA_BOND_AD_INFO_ACTOR_KEY, + IFLA_BOND_AD_INFO_PARTNER_KEY, + IFLA_BOND_AD_INFO_PARTNER_MAC, + __IFLA_BOND_AD_INFO_MAX, +}; + +#define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) + +enum { + IFLA_BOND_SLAVE_UNSPEC, + IFLA_BOND_SLAVE_STATE, + IFLA_BOND_SLAVE_MII_STATUS, + IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, + IFLA_BOND_SLAVE_PERM_HWADDR, + IFLA_BOND_SLAVE_QUEUE_ID, + IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, + __IFLA_BOND_SLAVE_MAX, +}; + +#define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) + +/* SR-IOV virtual function management section */ + +enum { + IFLA_VF_INFO_UNSPEC, + IFLA_VF_INFO, + __IFLA_VF_INFO_MAX, +}; + +#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) + +enum { + IFLA_VF_UNSPEC, + IFLA_VF_MAC, /* Hardware queue specific attributes */ + IFLA_VF_VLAN, /* VLAN ID and QoS */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ + IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ + IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ + IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + IFLA_VF_BROADCAST, /* VF broadcast */ + __IFLA_VF_MAX, +}; + +#define IFLA_VF_MAX 
(__IFLA_VF_MAX - 1) + +struct ifla_vf_mac { + __u32 vf; + __u8 mac[32]; /* MAX_ADDR_LEN */ +}; + +struct ifla_vf_broadcast { + __u8 broadcast[32]; +}; + +struct ifla_vf_vlan { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; +}; + +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + +struct ifla_vf_tx_rate { + __u32 vf; + __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ +}; + +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + +struct ifla_vf_spoofchk { + __u32 vf; + __u32 setting; +}; + +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + +/* VF ports management section + * + * Nested layout of set/get msg is: + * + * [IFLA_NUM_VF] + * [IFLA_VF_PORTS] + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * ... 
+ * [IFLA_PORT_SELF] + * [IFLA_PORT_*], ... + */ + +enum { + IFLA_VF_PORT_UNSPEC, + IFLA_VF_PORT, /* nest */ + __IFLA_VF_PORT_MAX, +}; + +#define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) + +enum { + IFLA_PORT_UNSPEC, + IFLA_PORT_VF, /* __u32 */ + IFLA_PORT_PROFILE, /* string */ + IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ + IFLA_PORT_INSTANCE_UUID, /* binary UUID */ + IFLA_PORT_HOST_UUID, /* binary UUID */ + IFLA_PORT_REQUEST, /* __u8 */ + IFLA_PORT_RESPONSE, /* __u16, output only */ + __IFLA_PORT_MAX, +}; + +#define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) + +#define PORT_PROFILE_MAX 40 +#define PORT_UUID_MAX 16 +#define PORT_SELF_VF -1 + +enum { + PORT_REQUEST_PREASSOCIATE = 0, + PORT_REQUEST_PREASSOCIATE_RR, + PORT_REQUEST_ASSOCIATE, + PORT_REQUEST_DISASSOCIATE, +}; + +enum { + PORT_VDP_RESPONSE_SUCCESS = 0, + PORT_VDP_RESPONSE_INVALID_FORMAT, + PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_VDP_RESPONSE_UNUSED_VTID, + PORT_VDP_RESPONSE_VTID_VIOLATION, + PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, + PORT_VDP_RESPONSE_OUT_OF_SYNC, + /* 0x08-0xFF reserved for future VDP use */ + PORT_PROFILE_RESPONSE_SUCCESS = 0x100, + PORT_PROFILE_RESPONSE_INPROGRESS, + PORT_PROFILE_RESPONSE_INVALID, + PORT_PROFILE_RESPONSE_BADSTATE, + PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_PROFILE_RESPONSE_ERROR, +}; + +struct ifla_port_vsi { + __u8 vsi_mgr_id; + __u8 vsi_type_id[3]; + __u8 vsi_type_version; + __u8 pad[3]; +}; + + +/* IPoIB section */ + +enum { + IFLA_IPOIB_UNSPEC, + IFLA_IPOIB_PKEY, + IFLA_IPOIB_MODE, + IFLA_IPOIB_UMCAST, + __IFLA_IPOIB_MAX +}; + +enum { + IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ + IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ +}; + +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) + + +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; + +enum { + IFLA_HSR_UNSPEC, + IFLA_HSR_SLAVE1, + 
IFLA_HSR_SLAVE2, + IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ + IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ + IFLA_HSR_SEQ_NR, + IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ + __IFLA_HSR_MAX, +}; + +#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) + +/* STATS section */ + +struct if_stats_msg { + __u8 family; + __u8 pad1; + __u16 pad2; + __u32 ifindex; + __u32 filter_mask; +}; + +/* A stats attribute can be netdev specific or a global stat. + * For netdev stats, lets use the prefix IFLA_STATS_LINK_* + */ +enum { + IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ + IFLA_STATS_LINK_64, + IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, + __IFLA_STATS_MAX, +}; + +#define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) + +#define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) + +enum { + IFLA_STATS_GETSET_UNSPEC, + IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with + * a filter mask for the corresponding group. + */ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ + __IFLA_STATS_GETSET_MAX, +}; + +#define IFLA_STATS_GETSET_MAX (__IFLA_STATS_GETSET_MAX - 1) + +/* These are embedded into IFLA_STATS_LINK_XSTATS: + * [IFLA_STATS_LINK_XSTATS] + * -> [LINK_XSTATS_TYPE_xxx] + * -> [rtnl link type specific attributes] + */ +enum { + LINK_XSTATS_TYPE_UNSPEC, + LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, + __LINK_XSTATS_TYPE_MAX +}; +#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) + +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO, /* HW stats info. 
A nest */ + IFLA_OFFLOAD_XSTATS_L3_STATS, /* struct rtnl_hw_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + +enum { + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC, + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, /* u8 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, /* u8 */ + __IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX, +}; +#define IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX \ + (__IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX - 1) + +/* XDP section */ + +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) +#define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE | \ + XDP_FLAGS_HW_MODE) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, + XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, +}; + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. 
arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + +/* tun section */ + +enum { + IFLA_TUN_UNSPEC, + IFLA_TUN_OWNER, + IFLA_TUN_GROUP, + IFLA_TUN_TYPE, + IFLA_TUN_PI, + IFLA_TUN_VNET_HDR, + IFLA_TUN_PERSIST, + IFLA_TUN_MULTI_QUEUE, + IFLA_TUN_NUM_QUEUES, + IFLA_TUN_NUM_DISABLED_QUEUES, + __IFLA_TUN_MAX, +}; + +#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) + +/* rmnet section */ + +#define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) +#define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) + +enum { + IFLA_RMNET_UNSPEC, + IFLA_RMNET_MUX_ID, + IFLA_RMNET_FLAGS, + __IFLA_RMNET_MAX, +}; + +#define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) + +struct ifla_rmnet_flags { + __u32 flags; + __u32 mask; +}; + +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + +/* DSA section */ + +enum { + IFLA_DSA_UNSPEC, + IFLA_DSA_MASTER, + __IFLA_DSA_MAX, +}; + +#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + +#endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/src/basic/linux/if_macsec.h b/src/basic/linux/if_macsec.h new file mode 100644 index 0000000..d5b6d1f --- /dev/null +++ b/src/basic/linux/if_macsec.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * include/uapi/linux/if_macsec.h - MACsec device + * + * Copyright (c) 2015 Sabrina Dubroca + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef _UAPI_MACSEC_H +#define _UAPI_MACSEC_H + +#include + +#define MACSEC_GENL_NAME "macsec" +#define MACSEC_GENL_VERSION 1 + +#define MACSEC_MAX_KEY_LEN 128 + +#define MACSEC_KEYID_LEN 16 + +#define MACSEC_SALT_LEN 12 + +/* cipher IDs as per IEEE802.1AE-2018 (Table 14-1) */ +#define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL +#define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL +#define MACSEC_CIPHER_ID_GCM_AES_XPN_128 0x0080C20001000003ULL +#define MACSEC_CIPHER_ID_GCM_AES_XPN_256 0x0080C20001000004ULL + +/* deprecated cipher ID for GCM-AES-128 */ +#define MACSEC_DEFAULT_CIPHER_ID 0x0080020001000001ULL +#define MACSEC_DEFAULT_CIPHER_ALT MACSEC_CIPHER_ID_GCM_AES_128 + +#define MACSEC_MIN_ICV_LEN 8 +#define MACSEC_MAX_ICV_LEN 32 +/* upper limit for ICV length as recommended by IEEE802.1AE-2006 */ +#define MACSEC_STD_ICV_LEN 16 + +enum macsec_attrs { + MACSEC_ATTR_UNSPEC, + MACSEC_ATTR_IFINDEX, /* u32, ifindex of the MACsec netdevice */ + MACSEC_ATTR_RXSC_CONFIG, /* config, nested macsec_rxsc_attrs */ + MACSEC_ATTR_SA_CONFIG, /* config, nested macsec_sa_attrs */ + MACSEC_ATTR_SECY, /* dump, nested macsec_secy_attrs */ + MACSEC_ATTR_TXSA_LIST, /* dump, nested, macsec_sa_attrs for each TXSA */ + MACSEC_ATTR_RXSC_LIST, /* dump, nested, macsec_rxsc_attrs for each RXSC */ + MACSEC_ATTR_TXSC_STATS, /* dump, nested, macsec_txsc_stats_attr */ + MACSEC_ATTR_SECY_STATS, /* dump, nested, macsec_secy_stats_attr */ + MACSEC_ATTR_OFFLOAD, /* config, nested, macsec_offload_attrs */ + __MACSEC_ATTR_END, + NUM_MACSEC_ATTR = __MACSEC_ATTR_END, + MACSEC_ATTR_MAX = __MACSEC_ATTR_END - 1, +}; + +enum macsec_secy_attrs { + MACSEC_SECY_ATTR_UNSPEC, + MACSEC_SECY_ATTR_SCI, + MACSEC_SECY_ATTR_ENCODING_SA, + MACSEC_SECY_ATTR_WINDOW, + MACSEC_SECY_ATTR_CIPHER_SUITE, + MACSEC_SECY_ATTR_ICV_LEN, + MACSEC_SECY_ATTR_PROTECT, + MACSEC_SECY_ATTR_REPLAY, + MACSEC_SECY_ATTR_OPER, + MACSEC_SECY_ATTR_VALIDATE, + MACSEC_SECY_ATTR_ENCRYPT, + MACSEC_SECY_ATTR_INC_SCI, + 
MACSEC_SECY_ATTR_ES, + MACSEC_SECY_ATTR_SCB, + MACSEC_SECY_ATTR_PAD, + __MACSEC_SECY_ATTR_END, + NUM_MACSEC_SECY_ATTR = __MACSEC_SECY_ATTR_END, + MACSEC_SECY_ATTR_MAX = __MACSEC_SECY_ATTR_END - 1, +}; + +enum macsec_rxsc_attrs { + MACSEC_RXSC_ATTR_UNSPEC, + MACSEC_RXSC_ATTR_SCI, /* config/dump, u64 */ + MACSEC_RXSC_ATTR_ACTIVE, /* config/dump, u8 0..1 */ + MACSEC_RXSC_ATTR_SA_LIST, /* dump, nested */ + MACSEC_RXSC_ATTR_STATS, /* dump, nested, macsec_rxsc_stats_attr */ + MACSEC_RXSC_ATTR_PAD, + __MACSEC_RXSC_ATTR_END, + NUM_MACSEC_RXSC_ATTR = __MACSEC_RXSC_ATTR_END, + MACSEC_RXSC_ATTR_MAX = __MACSEC_RXSC_ATTR_END - 1, +}; + +enum macsec_sa_attrs { + MACSEC_SA_ATTR_UNSPEC, + MACSEC_SA_ATTR_AN, /* config/dump, u8 0..3 */ + MACSEC_SA_ATTR_ACTIVE, /* config/dump, u8 0..1 */ + MACSEC_SA_ATTR_PN, /* config/dump, u32/u64 (u64 if XPN) */ + MACSEC_SA_ATTR_KEY, /* config, data */ + MACSEC_SA_ATTR_KEYID, /* config/dump, 128-bit */ + MACSEC_SA_ATTR_STATS, /* dump, nested, macsec_sa_stats_attr */ + MACSEC_SA_ATTR_PAD, + MACSEC_SA_ATTR_SSCI, /* config/dump, u32 - XPN only */ + MACSEC_SA_ATTR_SALT, /* config, 96-bit - XPN only */ + __MACSEC_SA_ATTR_END, + NUM_MACSEC_SA_ATTR = __MACSEC_SA_ATTR_END, + MACSEC_SA_ATTR_MAX = __MACSEC_SA_ATTR_END - 1, +}; + +enum macsec_offload_attrs { + MACSEC_OFFLOAD_ATTR_UNSPEC, + MACSEC_OFFLOAD_ATTR_TYPE, /* config/dump, u8 0..2 */ + MACSEC_OFFLOAD_ATTR_PAD, + __MACSEC_OFFLOAD_ATTR_END, + NUM_MACSEC_OFFLOAD_ATTR = __MACSEC_OFFLOAD_ATTR_END, + MACSEC_OFFLOAD_ATTR_MAX = __MACSEC_OFFLOAD_ATTR_END - 1, +}; + +enum macsec_nl_commands { + MACSEC_CMD_GET_TXSC, + MACSEC_CMD_ADD_RXSC, + MACSEC_CMD_DEL_RXSC, + MACSEC_CMD_UPD_RXSC, + MACSEC_CMD_ADD_TXSA, + MACSEC_CMD_DEL_TXSA, + MACSEC_CMD_UPD_TXSA, + MACSEC_CMD_ADD_RXSA, + MACSEC_CMD_DEL_RXSA, + MACSEC_CMD_UPD_RXSA, + MACSEC_CMD_UPD_OFFLOAD, +}; + +/* u64 per-RXSC stats */ +enum macsec_rxsc_stats_attr { + MACSEC_RXSC_STATS_ATTR_UNSPEC, + MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, + 
MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, + MACSEC_RXSC_STATS_ATTR_PAD, + __MACSEC_RXSC_STATS_ATTR_END, + NUM_MACSEC_RXSC_STATS_ATTR = __MACSEC_RXSC_STATS_ATTR_END, + MACSEC_RXSC_STATS_ATTR_MAX = __MACSEC_RXSC_STATS_ATTR_END - 1, +}; + +/* u32 per-{RX,TX}SA stats */ +enum macsec_sa_stats_attr { + MACSEC_SA_STATS_ATTR_UNSPEC, + MACSEC_SA_STATS_ATTR_IN_PKTS_OK, + MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, + MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, + MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, + MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, + MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, + MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, + __MACSEC_SA_STATS_ATTR_END, + NUM_MACSEC_SA_STATS_ATTR = __MACSEC_SA_STATS_ATTR_END, + MACSEC_SA_STATS_ATTR_MAX = __MACSEC_SA_STATS_ATTR_END - 1, +}; + +/* u64 per-TXSC stats */ +enum macsec_txsc_stats_attr { + MACSEC_TXSC_STATS_ATTR_UNSPEC, + MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, + MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, + MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, + MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, + MACSEC_TXSC_STATS_ATTR_PAD, + __MACSEC_TXSC_STATS_ATTR_END, + NUM_MACSEC_TXSC_STATS_ATTR = __MACSEC_TXSC_STATS_ATTR_END, + MACSEC_TXSC_STATS_ATTR_MAX = __MACSEC_TXSC_STATS_ATTR_END - 1, +}; + +/* u64 per-SecY stats */ +enum macsec_secy_stats_attr { + MACSEC_SECY_STATS_ATTR_UNSPEC, + MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, + MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, + MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, + MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, + MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, + MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, + MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, + 
MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, + MACSEC_SECY_STATS_ATTR_PAD, + __MACSEC_SECY_STATS_ATTR_END, + NUM_MACSEC_SECY_STATS_ATTR = __MACSEC_SECY_STATS_ATTR_END, + MACSEC_SECY_STATS_ATTR_MAX = __MACSEC_SECY_STATS_ATTR_END - 1, +}; + +#endif /* _UAPI_MACSEC_H */ diff --git a/src/basic/linux/if_tun.h b/src/basic/linux/if_tun.h new file mode 100644 index 0000000..287cdc8 --- /dev/null +++ b/src/basic/linux/if_tun.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Universal TUN/TAP device driver. + * Copyright (C) 1999-2000 Maxim Krasnyansky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _UAPI__IF_TUN_H +#define _UAPI__IF_TUN_H + +#include +#include +#include + +/* Read queue size */ +#define TUN_READQ_SIZE 500 +/* TUN device type flags: deprecated. Use IFF_TUN/IFF_TAP instead. 
*/ +#define TUN_TUN_DEV IFF_TUN +#define TUN_TAP_DEV IFF_TAP +#define TUN_TYPE_MASK 0x000f + +/* Ioctl defines */ +#define TUNSETNOCSUM _IOW('T', 200, int) +#define TUNSETDEBUG _IOW('T', 201, int) +#define TUNSETIFF _IOW('T', 202, int) +#define TUNSETPERSIST _IOW('T', 203, int) +#define TUNSETOWNER _IOW('T', 204, int) +#define TUNSETLINK _IOW('T', 205, int) +#define TUNSETGROUP _IOW('T', 206, int) +#define TUNGETFEATURES _IOR('T', 207, unsigned int) +#define TUNSETOFFLOAD _IOW('T', 208, unsigned int) +#define TUNSETTXFILTER _IOW('T', 209, unsigned int) +#define TUNGETIFF _IOR('T', 210, unsigned int) +#define TUNGETSNDBUF _IOR('T', 211, int) +#define TUNSETSNDBUF _IOW('T', 212, int) +#define TUNATTACHFILTER _IOW('T', 213, struct sock_fprog) +#define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog) +#define TUNGETVNETHDRSZ _IOR('T', 215, int) +#define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETIFINDEX _IOW('T', 218, unsigned int) +#define TUNGETFILTER _IOR('T', 219, struct sock_fprog) +#define TUNSETVNETLE _IOW('T', 220, int) +#define TUNGETVNETLE _IOR('T', 221, int) +/* The TUNSETVNETBE and TUNGETVNETBE ioctls are for cross-endian support on + * little-endian hosts. Not all kernel configurations support them, but all + * configurations that support SET also support GET. 
+ */ +#define TUNSETVNETBE _IOW('T', 222, int) +#define TUNGETVNETBE _IOR('T', 223, int) +#define TUNSETSTEERINGEBPF _IOR('T', 224, int) +#define TUNSETFILTEREBPF _IOR('T', 225, int) +#define TUNSETCARRIER _IOW('T', 226, int) +#define TUNGETDEVNETNS _IO('T', 227) + +/* TUNSETIFF ifr flags */ +#define IFF_TUN 0x0001 +#define IFF_TAP 0x0002 +#define IFF_NAPI 0x0010 +#define IFF_NAPI_FRAGS 0x0020 +/* Used in TUNSETIFF to bring up tun/tap without carrier */ +#define IFF_NO_CARRIER 0x0040 +#define IFF_NO_PI 0x1000 +/* This flag has no real effect */ +#define IFF_ONE_QUEUE 0x2000 +#define IFF_VNET_HDR 0x4000 +#define IFF_TUN_EXCL 0x8000 +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 +/* read-only flag */ +#define IFF_PERSIST 0x0800 +#define IFF_NOFILTER 0x1000 + +/* Socket options */ +#define TUN_TX_TIMESTAMP 1 + +/* Features for GSO (TUNSETOFFLOAD). */ +#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ +#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ +#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ +#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ +#define TUN_F_UFO 0x10 /* I can handle UFO packets */ +#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ +#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ + +/* Protocol info prepended to the packets (when IFF_NO_PI is not set) */ +#define TUN_PKT_STRIP 0x0001 +struct tun_pi { + __u16 flags; + __be16 proto; +}; + +/* + * Filter spec (used for SETXXFILTER ioctls) + * This stuff is applicable only to the TAP (Ethernet) devices. + * If the count is zero the filter is disabled and the driver accepts + * all packets (promisc mode). + * If the filter is enabled in order to accept broadcast packets + * broadcast addr must be explicitly included in the addr list. 
+ */ +#define TUN_FLT_ALLMULTI 0x0001 /* Accept all multicast packets */ +struct tun_filter { + __u16 flags; /* TUN_FLT_ flags see above */ + __u16 count; /* Number of addresses */ + __u8 addr[][ETH_ALEN]; +}; + +#endif /* _UAPI__IF_TUN_H */ diff --git a/src/basic/linux/if_tunnel.h b/src/basic/linux/if_tunnel.h new file mode 100644 index 0000000..1021196 --- /dev/null +++ b/src/basic/linux/if_tunnel.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_IF_TUNNEL_H_ +#define _UAPI_IF_TUNNEL_H_ + +#include +#include +#include +#include +#include + + +#define SIOCGETTUNNEL (SIOCDEVPRIVATE + 0) +#define SIOCADDTUNNEL (SIOCDEVPRIVATE + 1) +#define SIOCDELTUNNEL (SIOCDEVPRIVATE + 2) +#define SIOCCHGTUNNEL (SIOCDEVPRIVATE + 3) +#define SIOCGETPRL (SIOCDEVPRIVATE + 4) +#define SIOCADDPRL (SIOCDEVPRIVATE + 5) +#define SIOCDELPRL (SIOCDEVPRIVATE + 6) +#define SIOCCHGPRL (SIOCDEVPRIVATE + 7) +#define SIOCGET6RD (SIOCDEVPRIVATE + 8) +#define SIOCADD6RD (SIOCDEVPRIVATE + 9) +#define SIOCDEL6RD (SIOCDEVPRIVATE + 10) +#define SIOCCHG6RD (SIOCDEVPRIVATE + 11) + +#define GRE_CSUM __cpu_to_be16(0x8000) +#define GRE_ROUTING __cpu_to_be16(0x4000) +#define GRE_KEY __cpu_to_be16(0x2000) +#define GRE_SEQ __cpu_to_be16(0x1000) +#define GRE_STRICT __cpu_to_be16(0x0800) +#define GRE_REC __cpu_to_be16(0x0700) +#define GRE_ACK __cpu_to_be16(0x0080) +#define GRE_FLAGS __cpu_to_be16(0x0078) +#define GRE_VERSION __cpu_to_be16(0x0007) + +#define GRE_IS_CSUM(f) ((f) & GRE_CSUM) +#define GRE_IS_ROUTING(f) ((f) & GRE_ROUTING) +#define GRE_IS_KEY(f) ((f) & GRE_KEY) +#define GRE_IS_SEQ(f) ((f) & GRE_SEQ) +#define GRE_IS_STRICT(f) ((f) & GRE_STRICT) +#define GRE_IS_REC(f) ((f) & GRE_REC) +#define GRE_IS_ACK(f) ((f) & GRE_ACK) + +#define GRE_VERSION_0 __cpu_to_be16(0x0000) +#define GRE_VERSION_1 __cpu_to_be16(0x0001) +#define GRE_PROTO_PPP __cpu_to_be16(0x880b) +#define GRE_PPTP_KEY_MASK __cpu_to_be32(0xffff) + +struct ip_tunnel_parm { + char name[IFNAMSIZ]; 
+ int link; + __be16 i_flags; + __be16 o_flags; + __be32 i_key; + __be32 o_key; + struct iphdr iph; +}; + +enum { + IFLA_IPTUN_UNSPEC, + IFLA_IPTUN_LINK, + IFLA_IPTUN_LOCAL, + IFLA_IPTUN_REMOTE, + IFLA_IPTUN_TTL, + IFLA_IPTUN_TOS, + IFLA_IPTUN_ENCAP_LIMIT, + IFLA_IPTUN_FLOWINFO, + IFLA_IPTUN_FLAGS, + IFLA_IPTUN_PROTO, + IFLA_IPTUN_PMTUDISC, + IFLA_IPTUN_6RD_PREFIX, + IFLA_IPTUN_6RD_RELAY_PREFIX, + IFLA_IPTUN_6RD_PREFIXLEN, + IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + IFLA_IPTUN_ENCAP_TYPE, + IFLA_IPTUN_ENCAP_FLAGS, + IFLA_IPTUN_ENCAP_SPORT, + IFLA_IPTUN_ENCAP_DPORT, + IFLA_IPTUN_COLLECT_METADATA, + IFLA_IPTUN_FWMARK, + __IFLA_IPTUN_MAX, +}; +#define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) + +enum tunnel_encap_types { + TUNNEL_ENCAP_NONE, + TUNNEL_ENCAP_FOU, + TUNNEL_ENCAP_GUE, + TUNNEL_ENCAP_MPLS, +}; + +#define TUNNEL_ENCAP_FLAG_CSUM (1<<0) +#define TUNNEL_ENCAP_FLAG_CSUM6 (1<<1) +#define TUNNEL_ENCAP_FLAG_REMCSUM (1<<2) + +/* SIT-mode i_flags */ +#define SIT_ISATAP 0x0001 + +struct ip_tunnel_prl { + __be32 addr; + __u16 flags; + __u16 __reserved; + __u32 datalen; + __u32 __reserved2; + /* data follows */ +}; + +/* PRL flags */ +#define PRL_DEFAULT 0x0001 + +struct ip_tunnel_6rd { + struct in6_addr prefix; + __be32 relay_prefix; + __u16 prefixlen; + __u16 relay_prefixlen; +}; + +enum { + IFLA_GRE_UNSPEC, + IFLA_GRE_LINK, + IFLA_GRE_IFLAGS, + IFLA_GRE_OFLAGS, + IFLA_GRE_IKEY, + IFLA_GRE_OKEY, + IFLA_GRE_LOCAL, + IFLA_GRE_REMOTE, + IFLA_GRE_TTL, + IFLA_GRE_TOS, + IFLA_GRE_PMTUDISC, + IFLA_GRE_ENCAP_LIMIT, + IFLA_GRE_FLOWINFO, + IFLA_GRE_FLAGS, + IFLA_GRE_ENCAP_TYPE, + IFLA_GRE_ENCAP_FLAGS, + IFLA_GRE_ENCAP_SPORT, + IFLA_GRE_ENCAP_DPORT, + IFLA_GRE_COLLECT_METADATA, + IFLA_GRE_IGNORE_DF, + IFLA_GRE_FWMARK, + IFLA_GRE_ERSPAN_INDEX, + IFLA_GRE_ERSPAN_VER, + IFLA_GRE_ERSPAN_DIR, + IFLA_GRE_ERSPAN_HWID, + __IFLA_GRE_MAX, +}; + +#define IFLA_GRE_MAX (__IFLA_GRE_MAX - 1) + +/* VTI-mode i_flags */ +#define VTI_ISVTI ((__force __be16)0x0001) + +enum { + IFLA_VTI_UNSPEC, + 
IFLA_VTI_LINK, + IFLA_VTI_IKEY, + IFLA_VTI_OKEY, + IFLA_VTI_LOCAL, + IFLA_VTI_REMOTE, + IFLA_VTI_FWMARK, + __IFLA_VTI_MAX, +}; + +#define IFLA_VTI_MAX (__IFLA_VTI_MAX - 1) + +#define TUNNEL_CSUM __cpu_to_be16(0x01) +#define TUNNEL_ROUTING __cpu_to_be16(0x02) +#define TUNNEL_KEY __cpu_to_be16(0x04) +#define TUNNEL_SEQ __cpu_to_be16(0x08) +#define TUNNEL_STRICT __cpu_to_be16(0x10) +#define TUNNEL_REC __cpu_to_be16(0x20) +#define TUNNEL_VERSION __cpu_to_be16(0x40) +#define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) +#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) +#define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) +#define TUNNEL_GTP_OPT __cpu_to_be16(0x8000) + +#define TUNNEL_OPTIONS_PRESENT \ + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \ + TUNNEL_GTP_OPT) + +#endif /* _UAPI_IF_TUNNEL_H_ */ diff --git a/src/basic/linux/in.h b/src/basic/linux/in.h new file mode 100644 index 0000000..07a4cb1 --- /dev/null +++ b/src/basic/linux/in.h @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions of the Internet Protocol. + * + * Version: @(#)in.h 1.0.1 04/21/93 + * + * Authors: Original taken from the GNU Project file. + * Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#ifndef _UAPI_LINUX_IN_H +#define _UAPI_LINUX_IN_H + +#include +#include +#include +#include + +#if __UAPI_DEF_IN_IPPROTO +/* Standard well-defined IP protocols. */ +enum { + IPPROTO_IP = 0, /* Dummy protocol for TCP */ +#define IPPROTO_IP IPPROTO_IP + IPPROTO_ICMP = 1, /* Internet Control Message Protocol */ +#define IPPROTO_ICMP IPPROTO_ICMP + IPPROTO_IGMP = 2, /* Internet Group Management Protocol */ +#define IPPROTO_IGMP IPPROTO_IGMP + IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */ +#define IPPROTO_IPIP IPPROTO_IPIP + IPPROTO_TCP = 6, /* Transmission Control Protocol */ +#define IPPROTO_TCP IPPROTO_TCP + IPPROTO_EGP = 8, /* Exterior Gateway Protocol */ +#define IPPROTO_EGP IPPROTO_EGP + IPPROTO_PUP = 12, /* PUP protocol */ +#define IPPROTO_PUP IPPROTO_PUP + IPPROTO_UDP = 17, /* User Datagram Protocol */ +#define IPPROTO_UDP IPPROTO_UDP + IPPROTO_IDP = 22, /* XNS IDP protocol */ +#define IPPROTO_IDP IPPROTO_IDP + IPPROTO_TP = 29, /* ISO Transport Protocol Class 4 */ +#define IPPROTO_TP IPPROTO_TP + IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */ +#define IPPROTO_DCCP IPPROTO_DCCP + IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ +#define IPPROTO_IPV6 IPPROTO_IPV6 + IPPROTO_RSVP = 46, /* RSVP Protocol */ +#define IPPROTO_RSVP IPPROTO_RSVP + IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ +#define IPPROTO_GRE IPPROTO_GRE + IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */ +#define IPPROTO_ESP IPPROTO_ESP + IPPROTO_AH = 51, /* Authentication Header protocol */ +#define IPPROTO_AH IPPROTO_AH + IPPROTO_MTP = 92, /* Multicast Transport Protocol */ +#define IPPROTO_MTP IPPROTO_MTP + IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */ +#define IPPROTO_BEETPH IPPROTO_BEETPH + IPPROTO_ENCAP = 98, /* Encapsulation Header */ +#define IPPROTO_ENCAP IPPROTO_ENCAP + IPPROTO_PIM = 103, /* Protocol Independent Multicast */ +#define IPPROTO_PIM IPPROTO_PIM + IPPROTO_COMP = 108, /* Compression Header Protocol */
+#define IPPROTO_COMP IPPROTO_COMP + IPPROTO_L2TP = 115, /* Layer 2 Tunnelling Protocol */ +#define IPPROTO_L2TP IPPROTO_L2TP + IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ +#define IPPROTO_SCTP IPPROTO_SCTP + IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */ +#define IPPROTO_UDPLITE IPPROTO_UDPLITE + IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */ +#define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ +#define IPPROTO_ETHERNET IPPROTO_ETHERNET + IPPROTO_RAW = 255, /* Raw IP packets */ +#define IPPROTO_RAW IPPROTO_RAW + IPPROTO_MPTCP = 262, /* Multipath TCP connection */ +#define IPPROTO_MPTCP IPPROTO_MPTCP + IPPROTO_MAX +}; +#endif + +#if __UAPI_DEF_IN_ADDR +/* Internet address. */ +struct in_addr { + __be32 s_addr; +}; +#endif + +#define IP_TOS 1 +#define IP_TTL 2 +#define IP_HDRINCL 3 +#define IP_OPTIONS 4 +#define IP_ROUTER_ALERT 5 +#define IP_RECVOPTS 6 +#define IP_RETOPTS 7 +#define IP_PKTINFO 8 +#define IP_PKTOPTIONS 9 +#define IP_MTU_DISCOVER 10 +#define IP_RECVERR 11 +#define IP_RECVTTL 12 +#define IP_RECVTOS 13 +#define IP_MTU 14 +#define IP_FREEBIND 15 +#define IP_IPSEC_POLICY 16 +#define IP_XFRM_POLICY 17 +#define IP_PASSSEC 18 +#define IP_TRANSPARENT 19 + +/* BSD compatibility */ +#define IP_RECVRETOPTS IP_RETOPTS + +/* TProxy original addresses */ +#define IP_ORIGDSTADDR 20 +#define IP_RECVORIGDSTADDR IP_ORIGDSTADDR + +#define IP_MINTTL 21 +#define IP_NODEFRAG 22 +#define IP_CHECKSUM 23 +#define IP_BIND_ADDRESS_NO_PORT 24 +#define IP_RECVFRAGSIZE 25 +#define IP_RECVERR_RFC4884 26 + +/* IP_MTU_DISCOVER values */ +#define IP_PMTUDISC_DONT 0 /* Never send DF frames */ +#define IP_PMTUDISC_WANT 1 /* Use per route hints */ +#define IP_PMTUDISC_DO 2 /* Always DF */ +#define IP_PMTUDISC_PROBE 3 /* Ignore dst pmtu */ +/* Always use interface mtu (ignores dst pmtu) but don't set DF flag. 
+ * Also incoming ICMP frag_needed notifications will be ignored on + * this socket to prevent accepting spoofed ones. + */ +#define IP_PMTUDISC_INTERFACE 4 +/* weaker version of IP_PMTUDISC_INTERFACE, which allows packets to get + * fragmented if they exceed the interface mtu + */ +#define IP_PMTUDISC_OMIT 5 + +#define IP_MULTICAST_IF 32 +#define IP_MULTICAST_TTL 33 +#define IP_MULTICAST_LOOP 34 +#define IP_ADD_MEMBERSHIP 35 +#define IP_DROP_MEMBERSHIP 36 +#define IP_UNBLOCK_SOURCE 37 +#define IP_BLOCK_SOURCE 38 +#define IP_ADD_SOURCE_MEMBERSHIP 39 +#define IP_DROP_SOURCE_MEMBERSHIP 40 +#define IP_MSFILTER 41 +#define MCAST_JOIN_GROUP 42 +#define MCAST_BLOCK_SOURCE 43 +#define MCAST_UNBLOCK_SOURCE 44 +#define MCAST_LEAVE_GROUP 45 +#define MCAST_JOIN_SOURCE_GROUP 46 +#define MCAST_LEAVE_SOURCE_GROUP 47 +#define MCAST_MSFILTER 48 +#define IP_MULTICAST_ALL 49 +#define IP_UNICAST_IF 50 + +#define MCAST_EXCLUDE 0 +#define MCAST_INCLUDE 1 + +/* These need to appear somewhere around here */ +#define IP_DEFAULT_MULTICAST_TTL 1 +#define IP_DEFAULT_MULTICAST_LOOP 1 + +/* Request struct for multicast socket ops */ + +#if __UAPI_DEF_IP_MREQ +struct ip_mreq { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_interface; /* local IP address of interface */ +}; + +struct ip_mreqn { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_address; /* local IP address of interface */ + int imr_ifindex; /* Interface index */ +}; + +struct ip_mreq_source { + __be32 imr_multiaddr; + __be32 imr_interface; + __be32 imr_sourceaddr; +}; + +struct ip_msfilter { + __be32 imsf_multiaddr; + __be32 imsf_interface; + __u32 imsf_fmode; + __u32 imsf_numsrc; + union { + __be32 imsf_slist[1]; + __DECLARE_FLEX_ARRAY(__be32, imsf_slist_flex); + }; +}; + +#define IP_MSFILTER_SIZE(numsrc) \ + (sizeof(struct ip_msfilter) - sizeof(__u32) \ + + (numsrc) * sizeof(__u32)) + +struct group_req { + __u32 gr_interface; /* interface index */ +
struct __kernel_sockaddr_storage gr_group; /* group address */ +}; + +struct group_source_req { + __u32 gsr_interface; /* interface index */ + struct __kernel_sockaddr_storage gsr_group; /* group address */ + struct __kernel_sockaddr_storage gsr_source; /* source address */ +}; + +struct group_filter { + union { + struct { + __u32 gf_interface_aux; /* interface index */ + struct __kernel_sockaddr_storage gf_group_aux; /* multicast address */ + __u32 gf_fmode_aux; /* filter mode */ + __u32 gf_numsrc_aux; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist[1]; /* interface index */ + }; + struct { + __u32 gf_interface; /* interface index */ + struct __kernel_sockaddr_storage gf_group; /* multicast address */ + __u32 gf_fmode; /* filter mode */ + __u32 gf_numsrc; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist_flex[]; /* interface index */ + }; + }; +}; + +#define GROUP_FILTER_SIZE(numsrc) \ + (sizeof(struct group_filter) - sizeof(struct __kernel_sockaddr_storage) \ + + (numsrc) * sizeof(struct __kernel_sockaddr_storage)) +#endif + +#if __UAPI_DEF_IN_PKTINFO +struct in_pktinfo { + int ipi_ifindex; + struct in_addr ipi_spec_dst; + struct in_addr ipi_addr; +}; +#endif + +/* Structure describing an Internet (IP) socket address. */ +#if __UAPI_DEF_SOCKADDR_IN +#define __SOCK_SIZE__ 16 /* sizeof(struct sockaddr) */ +struct sockaddr_in { + __kernel_sa_family_t sin_family; /* Address family */ + __be16 sin_port; /* Port number */ + struct in_addr sin_addr; /* Internet address */ + + /* Pad to size of `struct sockaddr'. */ + unsigned char __pad[__SOCK_SIZE__ - sizeof(short int) - + sizeof(unsigned short int) - sizeof(struct in_addr)]; +}; +#define sin_zero __pad /* for BSD UNIX comp. -FvK */ +#endif + +#if __UAPI_DEF_IN_CLASS +/* + * Definitions of the bits in an Internet address integer. + * On subnets, host and network parts are found according + * to the subnet mask, not these masks. 
+ */ +#define IN_CLASSA(a) ((((long int) (a)) & 0x80000000) == 0) +#define IN_CLASSA_NET 0xff000000 +#define IN_CLASSA_NSHIFT 24 +#define IN_CLASSA_HOST (0xffffffff & ~IN_CLASSA_NET) +#define IN_CLASSA_MAX 128 + +#define IN_CLASSB(a) ((((long int) (a)) & 0xc0000000) == 0x80000000) +#define IN_CLASSB_NET 0xffff0000 +#define IN_CLASSB_NSHIFT 16 +#define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET) +#define IN_CLASSB_MAX 65536 + +#define IN_CLASSC(a) ((((long int) (a)) & 0xe0000000) == 0xc0000000) +#define IN_CLASSC_NET 0xffffff00 +#define IN_CLASSC_NSHIFT 8 +#define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET) + +#define IN_CLASSD(a) ((((long int) (a)) & 0xf0000000) == 0xe0000000) +#define IN_MULTICAST(a) IN_CLASSD(a) +#define IN_MULTICAST_NET 0xe0000000 + +#define IN_BADCLASS(a) (((long int) (a) ) == (long int)0xffffffff) +#define IN_EXPERIMENTAL(a) IN_BADCLASS((a)) + +#define IN_CLASSE(a) ((((long int) (a)) & 0xf0000000) == 0xf0000000) +#define IN_CLASSE_NET 0xffffffff +#define IN_CLASSE_NSHIFT 0 + +/* Address to accept any incoming messages. */ +#define INADDR_ANY ((unsigned long int) 0x00000000) + +/* Address to send to all hosts. */ +#define INADDR_BROADCAST ((unsigned long int) 0xffffffff) + +/* Address indicating an error return. */ +#define INADDR_NONE ((unsigned long int) 0xffffffff) + +/* Dummy address for src of ICMP replies if no real address is set (RFC7600). */ +#define INADDR_DUMMY ((unsigned long int) 0xc0000008) + +/* Network number for local host loopback. */ +#define IN_LOOPBACKNET 127 + +/* Address to loopback in software to local host. 
*/ +#define INADDR_LOOPBACK 0x7f000001 /* 127.0.0.1 */ +#define IN_LOOPBACK(a) ((((long int) (a)) & 0xff000000) == 0x7f000000) + +/* Defines for Multicast INADDR */ +#define INADDR_UNSPEC_GROUP 0xe0000000U /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP 0xe0000001U /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP 0xe0000002U /* 224.0.0.2 */ +#define INADDR_ALLSNOOPERS_GROUP 0xe000006aU /* 224.0.0.106 */ +#define INADDR_MAX_LOCAL_GROUP 0xe00000ffU /* 224.0.0.255 */ +#endif + +/* <asm/byteorder.h> contains the htonl type stuff.. */ +#include <asm/byteorder.h> + + +#endif /* _UAPI_LINUX_IN_H */ diff --git a/src/basic/linux/in6.h b/src/basic/linux/in6.h new file mode 100644 index 0000000..c4c53a9 --- /dev/null +++ b/src/basic/linux/in6.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Types and definitions for AF_INET6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Sources: + * IPv6 Program Interfaces for BSD Systems + * + * + * Advanced Sockets API for IPv6 + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#ifndef _UAPI_LINUX_IN6_H +#define _UAPI_LINUX_IN6_H + +#include <linux/types.h> +#include <linux/libc-compat.h> + +/* + * IPv6 address structure + */ + +#if __UAPI_DEF_IN6_ADDR +struct in6_addr { + union { + __u8 u6_addr8[16]; +#if __UAPI_DEF_IN6_ADDR_ALT + __be16 u6_addr16[8]; + __be32 u6_addr32[4]; +#endif + } in6_u; +#define s6_addr in6_u.u6_addr8 +#if __UAPI_DEF_IN6_ADDR_ALT +#define s6_addr16 in6_u.u6_addr16 +#define s6_addr32 in6_u.u6_addr32 +#endif +}; +#endif /* __UAPI_DEF_IN6_ADDR */ + +#if __UAPI_DEF_SOCKADDR_IN6 +struct sockaddr_in6 { + unsigned short int sin6_family; /* AF_INET6 */ + __be16 sin6_port; /* Transport layer port # */ + __be32 sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + __u32 sin6_scope_id; /* scope id (new in RFC2553) */ +}; +#endif /* __UAPI_DEF_SOCKADDR_IN6 */ + +#if __UAPI_DEF_IPV6_MREQ +struct ipv6_mreq { + /* IPv6 multicast address of group */ + struct in6_addr ipv6mr_multiaddr; + + /* local IPv6 address of interface */ + int ipv6mr_ifindex; +}; +#endif /* __UAPI_DEF_IPV6_MREQ */ + +#define ipv6mr_acaddr ipv6mr_multiaddr + +struct in6_flowlabel_req { + struct in6_addr flr_dst; + __be32 flr_label; + __u8 flr_action; + __u8 flr_share; + __u16 flr_flags; + __u16 flr_expires; + __u16 flr_linger; + __u32 __flr_pad; + /* Options in format of IPV6_PKTOPTIONS */ +}; + +#define IPV6_FL_A_GET 0 +#define IPV6_FL_A_PUT 1 +#define IPV6_FL_A_RENEW 2 + +#define IPV6_FL_F_CREATE 1 +#define IPV6_FL_F_EXCL 2 +#define IPV6_FL_F_REFLECT 4 +#define IPV6_FL_F_REMOTE 8 + +#define IPV6_FL_S_NONE 0 +#define IPV6_FL_S_EXCL 1 +#define IPV6_FL_S_PROCESS 2 +#define IPV6_FL_S_USER 3 +#define IPV6_FL_S_ANY 255 + + +/* + * Bitmask constant declarations to help applications select out the + * flow label and priority fields. + * + * Note that these are in host byte order while the flowinfo field of + * sockaddr_in6 is in network byte order.
+ */ + +#define IPV6_FLOWINFO_FLOWLABEL 0x000fffff +#define IPV6_FLOWINFO_PRIORITY 0x0ff00000 + +/* These definitions are obsolete */ +#define IPV6_PRIORITY_UNCHARACTERIZED 0x0000 +#define IPV6_PRIORITY_FILLER 0x0100 +#define IPV6_PRIORITY_UNATTENDED 0x0200 +#define IPV6_PRIORITY_RESERVED1 0x0300 +#define IPV6_PRIORITY_BULK 0x0400 +#define IPV6_PRIORITY_RESERVED2 0x0500 +#define IPV6_PRIORITY_INTERACTIVE 0x0600 +#define IPV6_PRIORITY_CONTROL 0x0700 +#define IPV6_PRIORITY_8 0x0800 +#define IPV6_PRIORITY_9 0x0900 +#define IPV6_PRIORITY_10 0x0a00 +#define IPV6_PRIORITY_11 0x0b00 +#define IPV6_PRIORITY_12 0x0c00 +#define IPV6_PRIORITY_13 0x0d00 +#define IPV6_PRIORITY_14 0x0e00 +#define IPV6_PRIORITY_15 0x0f00 + +/* + * IPV6 extension headers + */ +#if __UAPI_DEF_IPPROTO_V6 +#define IPPROTO_HOPOPTS 0 /* IPv6 hop-by-hop options */ +#define IPPROTO_ROUTING 43 /* IPv6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IPv6 fragmentation header */ +#define IPPROTO_ICMPV6 58 /* ICMPv6 */ +#define IPPROTO_NONE 59 /* IPv6 no next header */ +#define IPPROTO_DSTOPTS 60 /* IPv6 destination options */ +#define IPPROTO_MH 135 /* IPv6 mobility header */ +#endif /* __UAPI_DEF_IPPROTO_V6 */ + +/* + * IPv6 TLV options. 
+ */ +#define IPV6_TLV_PAD1 0 +#define IPV6_TLV_PADN 1 +#define IPV6_TLV_ROUTERALERT 5 +#define IPV6_TLV_CALIPSO 7 /* RFC 5570 */ +#define IPV6_TLV_IOAM 49 /* TEMPORARY IANA allocation for IOAM */ +#define IPV6_TLV_JUMBO 194 +#define IPV6_TLV_HAO 201 /* home address option */ + +/* + * IPV6 socket options + */ +#if __UAPI_DEF_IPV6_OPTIONS +#define IPV6_ADDRFORM 1 +#define IPV6_2292PKTINFO 2 +#define IPV6_2292HOPOPTS 3 +#define IPV6_2292DSTOPTS 4 +#define IPV6_2292RTHDR 5 +#define IPV6_2292PKTOPTIONS 6 +#define IPV6_CHECKSUM 7 +#define IPV6_2292HOPLIMIT 8 +#define IPV6_NEXTHOP 9 +#define IPV6_AUTHHDR 10 /* obsolete */ +#define IPV6_FLOWINFO 11 + +#define IPV6_UNICAST_HOPS 16 +#define IPV6_MULTICAST_IF 17 +#define IPV6_MULTICAST_HOPS 18 +#define IPV6_MULTICAST_LOOP 19 +#define IPV6_ADD_MEMBERSHIP 20 +#define IPV6_DROP_MEMBERSHIP 21 +#define IPV6_ROUTER_ALERT 22 +#define IPV6_MTU_DISCOVER 23 +#define IPV6_MTU 24 +#define IPV6_RECVERR 25 +#define IPV6_V6ONLY 26 +#define IPV6_JOIN_ANYCAST 27 +#define IPV6_LEAVE_ANYCAST 28 +#define IPV6_MULTICAST_ALL 29 +#define IPV6_ROUTER_ALERT_ISOLATE 30 +#define IPV6_RECVERR_RFC4884 31 + +/* IPV6_MTU_DISCOVER values */ +#define IPV6_PMTUDISC_DONT 0 +#define IPV6_PMTUDISC_WANT 1 +#define IPV6_PMTUDISC_DO 2 +#define IPV6_PMTUDISC_PROBE 3 +/* same as IPV6_PMTUDISC_PROBE, provided for symmetry with IPv4 + * also see comments on IP_PMTUDISC_INTERFACE + */ +#define IPV6_PMTUDISC_INTERFACE 4 +/* weaker version of IPV6_PMTUDISC_INTERFACE, which allows packets to + * get fragmented if they exceed the interface mtu + */ +#define IPV6_PMTUDISC_OMIT 5 + +/* Flowlabel */ +#define IPV6_FLOWLABEL_MGR 32 +#define IPV6_FLOWINFO_SEND 33 + +#define IPV6_IPSEC_POLICY 34 +#define IPV6_XFRM_POLICY 35 +#define IPV6_HDRINCL 36 +#endif + +/* + * Multicast: + * Following socket options are shared between IPv4 and IPv6.
+ * + * MCAST_JOIN_GROUP 42 + * MCAST_BLOCK_SOURCE 43 + * MCAST_UNBLOCK_SOURCE 44 + * MCAST_LEAVE_GROUP 45 + * MCAST_JOIN_SOURCE_GROUP 46 + * MCAST_LEAVE_SOURCE_GROUP 47 + * MCAST_MSFILTER 48 + */ + +/* + * Advanced API (RFC3542) (1) + * + * Note: IPV6_RECVRTHDRDSTOPTS does not exist. see net/ipv6/datagram.c. + */ + +#define IPV6_RECVPKTINFO 49 +#define IPV6_PKTINFO 50 +#define IPV6_RECVHOPLIMIT 51 +#define IPV6_HOPLIMIT 52 +#define IPV6_RECVHOPOPTS 53 +#define IPV6_HOPOPTS 54 +#define IPV6_RTHDRDSTOPTS 55 +#define IPV6_RECVRTHDR 56 +#define IPV6_RTHDR 57 +#define IPV6_RECVDSTOPTS 58 +#define IPV6_DSTOPTS 59 +#define IPV6_RECVPATHMTU 60 +#define IPV6_PATHMTU 61 +#define IPV6_DONTFRAG 62 +#if 0 /* not yet */ +#define IPV6_USE_MIN_MTU 63 +#endif + +/* + * Netfilter (1) + * + * Following socket options are used in ip6_tables; + * see include/linux/netfilter_ipv6/ip6_tables.h. + * + * IP6T_SO_SET_REPLACE / IP6T_SO_GET_INFO 64 + * IP6T_SO_SET_ADD_COUNTERS / IP6T_SO_GET_ENTRIES 65 + */ + +/* + * Advanced API (RFC3542) (2) + */ +#define IPV6_RECVTCLASS 66 +#define IPV6_TCLASS 67 + +/* + * Netfilter (2) + * + * Following socket options are used in ip6_tables; + * see include/linux/netfilter_ipv6/ip6_tables.h. 
+ * + * IP6T_SO_GET_REVISION_MATCH 68 + * IP6T_SO_GET_REVISION_TARGET 69 + * IP6T_SO_ORIGINAL_DST 80 + */ + +#define IPV6_AUTOFLOWLABEL 70 +/* RFC5014: Source address selection */ +#define IPV6_ADDR_PREFERENCES 72 + +#define IPV6_PREFER_SRC_TMP 0x0001 +#define IPV6_PREFER_SRC_PUBLIC 0x0002 +#define IPV6_PREFER_SRC_PUBTMP_DEFAULT 0x0100 +#define IPV6_PREFER_SRC_COA 0x0004 +#define IPV6_PREFER_SRC_HOME 0x0400 +#define IPV6_PREFER_SRC_CGA 0x0008 +#define IPV6_PREFER_SRC_NONCGA 0x0800 + +/* RFC5082: Generalized Ttl Security Mechanism */ +#define IPV6_MINHOPCOUNT 73 + +#define IPV6_ORIGDSTADDR 74 +#define IPV6_RECVORIGDSTADDR IPV6_ORIGDSTADDR +#define IPV6_TRANSPARENT 75 +#define IPV6_UNICAST_IF 76 +#define IPV6_RECVFRAGSIZE 77 +#define IPV6_FREEBIND 78 + +/* + * Multicast Routing: + * see include/uapi/linux/mroute6.h. + * + * MRT6_BASE 200 + * ... + * MRT6_MAX + */ +#endif /* _UAPI_LINUX_IN6_H */ diff --git a/src/basic/linux/ipv6_route.h b/src/basic/linux/ipv6_route.h new file mode 100644 index 0000000..593800a --- /dev/null +++ b/src/basic/linux/ipv6_route.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_IPV6_ROUTE_H +#define _UAPI_LINUX_IPV6_ROUTE_H + +#include <linux/types.h> +#include <linux/in6.h> /* For struct in6_addr.
*/ + +#define RTF_DEFAULT 0x00010000 /* default - learned via ND */ +#define RTF_ALLONLINK 0x00020000 /* (deprecated and will be removed) + fallback, no routers on link */ +#define RTF_ADDRCONF 0x00040000 /* addrconf route - RA */ +#define RTF_PREFIX_RT 0x00080000 /* A prefix only route - RA */ +#define RTF_ANYCAST 0x00100000 /* Anycast */ + +#define RTF_NONEXTHOP 0x00200000 /* route with no nexthop */ +#define RTF_EXPIRES 0x00400000 + +#define RTF_ROUTEINFO 0x00800000 /* route information - RA */ + +#define RTF_CACHE 0x01000000 /* read-only: can not be set by user */ +#define RTF_FLOW 0x02000000 /* flow significant route */ +#define RTF_POLICY 0x04000000 /* policy route */ + +#define RTF_PREF(pref) ((pref) << 27) +#define RTF_PREF_MASK 0x18000000 + +#define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ +#define RTF_LOCAL 0x80000000 + + +struct in6_rtmsg { + struct in6_addr rtmsg_dst; + struct in6_addr rtmsg_src; + struct in6_addr rtmsg_gateway; + __u32 rtmsg_type; + __u16 rtmsg_dst_len; + __u16 rtmsg_src_len; + __u32 rtmsg_metric; + unsigned long rtmsg_info; + __u32 rtmsg_flags; + int rtmsg_ifindex; +}; + +#define RTMSG_NEWDEVICE 0x11 +#define RTMSG_DELDEVICE 0x12 +#define RTMSG_NEWROUTE 0x21 +#define RTMSG_DELROUTE 0x22 + +#define IP6_RT_PRIO_USER 1024 +#define IP6_RT_PRIO_ADDRCONF 256 + +#endif /* _UAPI_LINUX_IPV6_ROUTE_H */ diff --git a/src/basic/linux/l2tp.h b/src/basic/linux/l2tp.h new file mode 100644 index 0000000..7d81c3e --- /dev/null +++ b/src/basic/linux/l2tp.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * L2TP-over-IP socket for L2TPv3. + * + * Author: James Chapman + */ + +#ifndef _UAPI_LINUX_L2TP_H_ +#define _UAPI_LINUX_L2TP_H_ + +#include +#include +#include +#include + +/** + * struct sockaddr_l2tpip - the sockaddr structure for L2TP-over-IP sockets + * @l2tp_family: address family number AF_L2TPIP. 
+ * @l2tp_addr: protocol specific address information + * @l2tp_conn_id: connection id of tunnel + */ +#define __SOCK_SIZE__ 16 /* sizeof(struct sockaddr) */ +struct sockaddr_l2tpip { + /* The first fields must match struct sockaddr_in */ + __kernel_sa_family_t l2tp_family; /* AF_INET */ + __be16 l2tp_unused; /* INET port number (unused) */ + struct in_addr l2tp_addr; /* Internet address */ + + __u32 l2tp_conn_id; /* Connection ID of tunnel */ + + /* Pad to size of `struct sockaddr'. */ + unsigned char __pad[__SOCK_SIZE__ - + sizeof(__kernel_sa_family_t) - + sizeof(__be16) - sizeof(struct in_addr) - + sizeof(__u32)]; +}; + +/** + * struct sockaddr_l2tpip6 - the sockaddr structure for L2TP-over-IPv6 sockets + * @l2tp_family: address family number AF_L2TPIP. + * @l2tp_addr: protocol specific address information + * @l2tp_conn_id: connection id of tunnel + */ +struct sockaddr_l2tpip6 { + /* The first fields must match struct sockaddr_in6 */ + __kernel_sa_family_t l2tp_family; /* AF_INET6 */ + __be16 l2tp_unused; /* INET port number (unused) */ + __be32 l2tp_flowinfo; /* IPv6 flow information */ + struct in6_addr l2tp_addr; /* IPv6 address */ + __u32 l2tp_scope_id; /* scope id (new in RFC2553) */ + __u32 l2tp_conn_id; /* Connection ID of tunnel */ +}; + +/***************************************************************************** + * NETLINK_GENERIC netlink family. + *****************************************************************************/ + +/* + * Commands. + * Valid TLVs of each command are:- + * TUNNEL_CREATE - CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, udpcsum + * TUNNEL_DELETE - CONN_ID + * TUNNEL_MODIFY - CONN_ID, udpcsum + * TUNNEL_GETSTATS - CONN_ID, (stats) + * TUNNEL_GET - CONN_ID, (...) + * SESSION_CREATE - SESSION_ID, PW_TYPE, cookie, peer_cookie, l2spec + * SESSION_DELETE - SESSION_ID + * SESSION_MODIFY - SESSION_ID + * SESSION_GET - SESSION_ID, (...) 
+ * SESSION_GETSTATS - SESSION_ID, (stats) + * + */ +enum { + L2TP_CMD_NOOP, + L2TP_CMD_TUNNEL_CREATE, + L2TP_CMD_TUNNEL_DELETE, + L2TP_CMD_TUNNEL_MODIFY, + L2TP_CMD_TUNNEL_GET, + L2TP_CMD_SESSION_CREATE, + L2TP_CMD_SESSION_DELETE, + L2TP_CMD_SESSION_MODIFY, + L2TP_CMD_SESSION_GET, + __L2TP_CMD_MAX, +}; + +#define L2TP_CMD_MAX (__L2TP_CMD_MAX - 1) + +/* + * ATTR types defined for L2TP + */ +enum { + L2TP_ATTR_NONE, /* no data */ + L2TP_ATTR_PW_TYPE, /* u16, enum l2tp_pwtype */ + L2TP_ATTR_ENCAP_TYPE, /* u16, enum l2tp_encap_type */ + L2TP_ATTR_OFFSET, /* u16 (not used) */ + L2TP_ATTR_DATA_SEQ, /* u16 (not used) */ + L2TP_ATTR_L2SPEC_TYPE, /* u8, enum l2tp_l2spec_type */ + L2TP_ATTR_L2SPEC_LEN, /* u8 (not used) */ + L2TP_ATTR_PROTO_VERSION, /* u8 */ + L2TP_ATTR_IFNAME, /* string */ + L2TP_ATTR_CONN_ID, /* u32 */ + L2TP_ATTR_PEER_CONN_ID, /* u32 */ + L2TP_ATTR_SESSION_ID, /* u32 */ + L2TP_ATTR_PEER_SESSION_ID, /* u32 */ + L2TP_ATTR_UDP_CSUM, /* u8 */ + L2TP_ATTR_VLAN_ID, /* u16 (not used) */ + L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ + L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ + L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags (not used) */ + L2TP_ATTR_RECV_SEQ, /* u8 */ + L2TP_ATTR_SEND_SEQ, /* u8 */ + L2TP_ATTR_LNS_MODE, /* u8 */ + L2TP_ATTR_USING_IPSEC, /* u8 */ + L2TP_ATTR_RECV_TIMEOUT, /* msec */ + L2TP_ATTR_FD, /* int */ + L2TP_ATTR_IP_SADDR, /* u32 */ + L2TP_ATTR_IP_DADDR, /* u32 */ + L2TP_ATTR_UDP_SPORT, /* u16 */ + L2TP_ATTR_UDP_DPORT, /* u16 */ + L2TP_ATTR_MTU, /* u16 (not used) */ + L2TP_ATTR_MRU, /* u16 (not used) */ + L2TP_ATTR_STATS, /* nested */ + L2TP_ATTR_IP6_SADDR, /* struct in6_addr */ + L2TP_ATTR_IP6_DADDR, /* struct in6_addr */ + L2TP_ATTR_UDP_ZERO_CSUM6_TX, /* flag */ + L2TP_ATTR_UDP_ZERO_CSUM6_RX, /* flag */ + L2TP_ATTR_PAD, + __L2TP_ATTR_MAX, +}; + +#define L2TP_ATTR_MAX (__L2TP_ATTR_MAX - 1) + +/* Nested in L2TP_ATTR_STATS */ +enum { + L2TP_ATTR_STATS_NONE, /* no data */ + L2TP_ATTR_TX_PACKETS, /* u64 */ + L2TP_ATTR_TX_BYTES, /* u64 */ + 
L2TP_ATTR_TX_ERRORS, /* u64 */ + L2TP_ATTR_RX_PACKETS, /* u64 */ + L2TP_ATTR_RX_BYTES, /* u64 */ + L2TP_ATTR_RX_SEQ_DISCARDS, /* u64 */ + L2TP_ATTR_RX_OOS_PACKETS, /* u64 */ + L2TP_ATTR_RX_ERRORS, /* u64 */ + L2TP_ATTR_STATS_PAD, + L2TP_ATTR_RX_COOKIE_DISCARDS, /* u64 */ + L2TP_ATTR_RX_INVALID, /* u64 */ + __L2TP_ATTR_STATS_MAX, +}; + +#define L2TP_ATTR_STATS_MAX (__L2TP_ATTR_STATS_MAX - 1) + +enum l2tp_pwtype { + L2TP_PWTYPE_NONE = 0x0000, + L2TP_PWTYPE_ETH_VLAN = 0x0004, + L2TP_PWTYPE_ETH = 0x0005, + L2TP_PWTYPE_PPP = 0x0007, + L2TP_PWTYPE_PPP_AC = 0x0008, + L2TP_PWTYPE_IP = 0x000b, + __L2TP_PWTYPE_MAX +}; + +enum l2tp_l2spec_type { + L2TP_L2SPECTYPE_NONE, + L2TP_L2SPECTYPE_DEFAULT, +}; + +enum l2tp_encap_type { + L2TP_ENCAPTYPE_UDP, + L2TP_ENCAPTYPE_IP, +}; + +/* For L2TP_ATTR_DATA_SEQ. Unused. */ +enum l2tp_seqmode { + L2TP_SEQ_NONE = 0, + L2TP_SEQ_IP = 1, + L2TP_SEQ_ALL = 2, +}; + +/** + * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions. + * + * Unused. + * + * @L2TP_MSG_DEBUG: verbose debug (if compiled in) + * @L2TP_MSG_CONTROL: userspace - kernel interface + * @L2TP_MSG_SEQ: sequence numbers + * @L2TP_MSG_DATA: data packets + */ +enum l2tp_debug_flags { + L2TP_MSG_DEBUG = (1 << 0), + L2TP_MSG_CONTROL = (1 << 1), + L2TP_MSG_SEQ = (1 << 2), + L2TP_MSG_DATA = (1 << 3), +}; + +/* + * NETLINK_GENERIC related info + */ +#define L2TP_GENL_NAME "l2tp" +#define L2TP_GENL_VERSION 0x1 +#define L2TP_GENL_MCGROUP "l2tp" + +#endif /* _UAPI_LINUX_L2TP_H_ */ diff --git a/src/basic/linux/libc-compat.h b/src/basic/linux/libc-compat.h new file mode 100644 index 0000000..8254c93 --- /dev/null +++ b/src/basic/linux/libc-compat.h @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Compatibility interface for userspace libc header coordination: + * + * Define compatibility macros that are used to control the inclusion or + * exclusion of UAPI structures and definitions in coordination with another + * userspace C 
library. + * + * This header is intended to solve the problem of UAPI definitions that + * conflict with userspace definitions. If a UAPI header has such conflicting + * definitions then the solution is as follows: + * + * * Synchronize the UAPI header and the libc headers so either one can be + * used and such that the ABI is preserved. If this is not possible then + * no simple compatibility interface exists (you need to write translating + * wrappers and rename things) and you can't use this interface. + * + * Then follow this process: + * + * (a) Include libc-compat.h in the UAPI header. + * e.g. #include <linux/libc-compat.h> + * This include must be as early as possible. + * + * (b) In libc-compat.h add enough code to detect that the conflicting + * userspace libc header has been included first. + * + * (c) If the userspace libc header has been included first define a set of + * guard macros of the form __UAPI_DEF_FOO and set their values to 0, else + * set their values to 1. + * + * (d) Back in the UAPI header with the conflicting definitions, guard the + * definitions with: + * #if __UAPI_DEF_FOO + * ... + * #endif + * + * This fixes the situation where the linux headers are included *after* the + * libc headers. To fix the problem with the inclusion in the other order the + * userspace libc headers must be fixed like this: + * + * * For all definitions that conflict with kernel definitions wrap those + * defines in the following: + * #if !__UAPI_DEF_FOO + * ... + * #endif + * + * This prevents the redefinition of a construct already defined by the kernel. + */ +#ifndef _UAPI_LIBC_COMPAT_H +#define _UAPI_LIBC_COMPAT_H + +/* We have included glibc headers... */ +#if defined(__GLIBC__) + +/* Coordinate with glibc net/if.h header. */ +#if defined(_NET_IF_H) && defined(__USE_MISC) + +/* GLIBC headers included first so don't define anything + * that would already be defined.
*/ + +#define __UAPI_DEF_IF_IFCONF 0 +#define __UAPI_DEF_IF_IFMAP 0 +#define __UAPI_DEF_IF_IFNAMSIZ 0 +#define __UAPI_DEF_IF_IFREQ 0 +/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 0 +/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ + +#else /* _NET_IF_H */ + +/* Linux headers included first, and we must define everything + * we need. The expectation is that glibc will check the + * __UAPI_DEF_* defines and adjust appropriately. */ + +#define __UAPI_DEF_IF_IFCONF 1 +#define __UAPI_DEF_IF_IFMAP 1 +#define __UAPI_DEF_IF_IFNAMSIZ 1 +#define __UAPI_DEF_IF_IFREQ 1 +/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1 +/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 + +#endif /* _NET_IF_H */ + +/* Coordinate with glibc netinet/in.h header. */ +#if defined(_NETINET_IN_H) + +/* GLIBC headers included first so don't define anything + * that would already be defined. */ +#define __UAPI_DEF_IN_ADDR 0 +#define __UAPI_DEF_IN_IPPROTO 0 +#define __UAPI_DEF_IN_PKTINFO 0 +#define __UAPI_DEF_IP_MREQ 0 +#define __UAPI_DEF_SOCKADDR_IN 0 +#define __UAPI_DEF_IN_CLASS 0 + +#define __UAPI_DEF_IN6_ADDR 0 +/* The exception is the in6_addr macros which must be defined + * if the glibc code didn't define them. This guard matches + * the guard in glibc/inet/netinet/in.h which defines the + * additional in6_addr macros e.g. s6_addr16, and s6_addr32. 
*/ +#if defined(__USE_MISC) || defined (__USE_GNU) +#define __UAPI_DEF_IN6_ADDR_ALT 0 +#else +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#endif +#define __UAPI_DEF_SOCKADDR_IN6 0 +#define __UAPI_DEF_IPV6_MREQ 0 +#define __UAPI_DEF_IPPROTO_V6 0 +#define __UAPI_DEF_IPV6_OPTIONS 0 +#define __UAPI_DEF_IN6_PKTINFO 0 +#define __UAPI_DEF_IP6_MTUINFO 0 + +#else + +/* Linux headers included first, and we must define everything + * we need. The expectation is that glibc will check the + * __UAPI_DEF_* defines and adjust appropriately. */ +#define __UAPI_DEF_IN_ADDR 1 +#define __UAPI_DEF_IN_IPPROTO 1 +#define __UAPI_DEF_IN_PKTINFO 1 +#define __UAPI_DEF_IP_MREQ 1 +#define __UAPI_DEF_SOCKADDR_IN 1 +#define __UAPI_DEF_IN_CLASS 1 + +#define __UAPI_DEF_IN6_ADDR 1 +/* We unconditionally define the in6_addr macros and glibc must + * coordinate. */ +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#define __UAPI_DEF_SOCKADDR_IN6 1 +#define __UAPI_DEF_IPV6_MREQ 1 +#define __UAPI_DEF_IPPROTO_V6 1 +#define __UAPI_DEF_IPV6_OPTIONS 1 +#define __UAPI_DEF_IN6_PKTINFO 1 +#define __UAPI_DEF_IP6_MTUINFO 1 + +#endif /* _NETINET_IN_H */ + +/* Coordinate with glibc netipx/ipx.h header. */ +#if defined(__NETIPX_IPX_H) + +#define __UAPI_DEF_SOCKADDR_IPX 0 +#define __UAPI_DEF_IPX_ROUTE_DEFINITION 0 +#define __UAPI_DEF_IPX_INTERFACE_DEFINITION 0 +#define __UAPI_DEF_IPX_CONFIG_DATA 0 +#define __UAPI_DEF_IPX_ROUTE_DEF 0 + +#else /* defined(__NETIPX_IPX_H) */ + +#define __UAPI_DEF_SOCKADDR_IPX 1 +#define __UAPI_DEF_IPX_ROUTE_DEFINITION 1 +#define __UAPI_DEF_IPX_INTERFACE_DEFINITION 1 +#define __UAPI_DEF_IPX_CONFIG_DATA 1 +#define __UAPI_DEF_IPX_ROUTE_DEF 1 + +#endif /* defined(__NETIPX_IPX_H) */ + +/* Definitions for xattr.h */ +#if defined(_SYS_XATTR_H) +#define __UAPI_DEF_XATTR 0 +#else +#define __UAPI_DEF_XATTR 1 +#endif + +/* If we did not see any headers from any supported C libraries, + * or we are being included in the kernel, then define everything + * that we need. 
Check for previous __UAPI_* definitions to give + * unsupported C libraries a way to opt out of any kernel definition. */ +#else /* !defined(__GLIBC__) */ + +/* Definitions for if.h */ +#ifndef __UAPI_DEF_IF_IFCONF +#define __UAPI_DEF_IF_IFCONF 1 +#endif +#ifndef __UAPI_DEF_IF_IFMAP +#define __UAPI_DEF_IF_IFMAP 1 +#endif +#ifndef __UAPI_DEF_IF_IFNAMSIZ +#define __UAPI_DEF_IF_IFNAMSIZ 1 +#endif +#ifndef __UAPI_DEF_IF_IFREQ +#define __UAPI_DEF_IF_IFREQ 1 +#endif +/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1 +#endif +/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 +#endif + +/* Definitions for in.h */ +#ifndef __UAPI_DEF_IN_ADDR +#define __UAPI_DEF_IN_ADDR 1 +#endif +#ifndef __UAPI_DEF_IN_IPPROTO +#define __UAPI_DEF_IN_IPPROTO 1 +#endif +#ifndef __UAPI_DEF_IN_PKTINFO +#define __UAPI_DEF_IN_PKTINFO 1 +#endif +#ifndef __UAPI_DEF_IP_MREQ +#define __UAPI_DEF_IP_MREQ 1 +#endif +#ifndef __UAPI_DEF_SOCKADDR_IN +#define __UAPI_DEF_SOCKADDR_IN 1 +#endif +#ifndef __UAPI_DEF_IN_CLASS +#define __UAPI_DEF_IN_CLASS 1 +#endif + +/* Definitions for in6.h */ +#ifndef __UAPI_DEF_IN6_ADDR +#define __UAPI_DEF_IN6_ADDR 1 +#endif +#ifndef __UAPI_DEF_IN6_ADDR_ALT +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#endif +#ifndef __UAPI_DEF_SOCKADDR_IN6 +#define __UAPI_DEF_SOCKADDR_IN6 1 +#endif +#ifndef __UAPI_DEF_IPV6_MREQ +#define __UAPI_DEF_IPV6_MREQ 1 +#endif +#ifndef __UAPI_DEF_IPPROTO_V6 +#define __UAPI_DEF_IPPROTO_V6 1 +#endif +#ifndef __UAPI_DEF_IPV6_OPTIONS +#define __UAPI_DEF_IPV6_OPTIONS 1 +#endif +#ifndef __UAPI_DEF_IN6_PKTINFO +#define __UAPI_DEF_IN6_PKTINFO 1 +#endif +#ifndef __UAPI_DEF_IP6_MTUINFO +#define __UAPI_DEF_IP6_MTUINFO 1 +#endif + +/* Definitions for ipx.h */ +#ifndef __UAPI_DEF_SOCKADDR_IPX +#define __UAPI_DEF_SOCKADDR_IPX 1 
+#endif +#ifndef __UAPI_DEF_IPX_ROUTE_DEFINITION +#define __UAPI_DEF_IPX_ROUTE_DEFINITION 1 +#endif +#ifndef __UAPI_DEF_IPX_INTERFACE_DEFINITION +#define __UAPI_DEF_IPX_INTERFACE_DEFINITION 1 +#endif +#ifndef __UAPI_DEF_IPX_CONFIG_DATA +#define __UAPI_DEF_IPX_CONFIG_DATA 1 +#endif +#ifndef __UAPI_DEF_IPX_ROUTE_DEF +#define __UAPI_DEF_IPX_ROUTE_DEF 1 +#endif + +/* Definitions for xattr.h */ +#ifndef __UAPI_DEF_XATTR +#define __UAPI_DEF_XATTR 1 +#endif + +#endif /* __GLIBC__ */ + +#endif /* _UAPI_LIBC_COMPAT_H */ diff --git a/src/basic/linux/mrp_bridge.h b/src/basic/linux/mrp_bridge.h new file mode 100644 index 0000000..bd4424d --- /dev/null +++ b/src/basic/linux/mrp_bridge.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_MRP_BRIDGE_H_ +#define _UAPI_LINUX_MRP_BRIDGE_H_ + +#include +#include + +#define MRP_MAX_FRAME_LENGTH 200 +#define MRP_DEFAULT_PRIO 0x8000 +#define MRP_DOMAIN_UUID_LENGTH 16 +#define MRP_VERSION 1 +#define MRP_FRAME_PRIO 7 +#define MRP_OUI_LENGTH 3 +#define MRP_MANUFACTURE_DATA_LENGTH 2 + +enum br_mrp_ring_role_type { + BR_MRP_RING_ROLE_DISABLED, + BR_MRP_RING_ROLE_MRC, + BR_MRP_RING_ROLE_MRM, + BR_MRP_RING_ROLE_MRA, +}; + +enum br_mrp_in_role_type { + BR_MRP_IN_ROLE_DISABLED, + BR_MRP_IN_ROLE_MIC, + BR_MRP_IN_ROLE_MIM, +}; + +enum br_mrp_ring_state_type { + BR_MRP_RING_STATE_OPEN, + BR_MRP_RING_STATE_CLOSED, +}; + +enum br_mrp_in_state_type { + BR_MRP_IN_STATE_OPEN, + BR_MRP_IN_STATE_CLOSED, +}; + +enum br_mrp_port_state_type { + BR_MRP_PORT_STATE_DISABLED, + BR_MRP_PORT_STATE_BLOCKED, + BR_MRP_PORT_STATE_FORWARDING, + BR_MRP_PORT_STATE_NOT_CONNECTED, +}; + +enum br_mrp_port_role_type { + BR_MRP_PORT_ROLE_PRIMARY, + BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_INTER, +}; + +enum br_mrp_tlv_header_type { + BR_MRP_TLV_HEADER_END = 0x0, + BR_MRP_TLV_HEADER_COMMON = 0x1, + BR_MRP_TLV_HEADER_RING_TEST = 0x2, + BR_MRP_TLV_HEADER_RING_TOPO = 0x3, + BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, + 
BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_IN_TEST = 0x6, + BR_MRP_TLV_HEADER_IN_TOPO = 0x7, + BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8, + BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9, + BR_MRP_TLV_HEADER_IN_LINK_STATUS = 0xa, + BR_MRP_TLV_HEADER_OPTION = 0x7f, +}; + +enum br_mrp_sub_tlv_header_type { + BR_MRP_SUB_TLV_HEADER_TEST_MGR_NACK = 0x1, + BR_MRP_SUB_TLV_HEADER_TEST_PROPAGATE = 0x2, + BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR = 0x3, +}; + +#endif diff --git a/src/basic/linux/netdevice.h b/src/basic/linux/netdevice.h new file mode 100644 index 0000000..f3770c5 --- /dev/null +++ b/src/basic/linux/netdevice.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the Interfaces handler. + * + * Version: @(#)dev.h 1.0.10 08/12/93 + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Corey Minyard + * Donald J. Becker, + * Alan Cox, + * Bjorn Ekwall. + * Pekka Riikonen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Moved to /usr/include/linux for NET3 + */ +#ifndef _UAPI_LINUX_NETDEVICE_H +#define _UAPI_LINUX_NETDEVICE_H + +#include +#include +#include +#include + + +#define MAX_ADDR_LEN 32 /* Largest hardware address length */ + +/* Initial net device group. All devices belong to group 0 by default. 
*/ +#define INIT_NETDEV_GROUP 0 + + +/* interface name assignment types (sysfs name_assign_type attribute) */ +#define NET_NAME_UNKNOWN 0 /* unknown origin (not exposed to userspace) */ +#define NET_NAME_ENUM 1 /* enumerated by kernel */ +#define NET_NAME_PREDICTABLE 2 /* predictably named by the kernel */ +#define NET_NAME_USER 3 /* provided by user-space */ +#define NET_NAME_RENAMED 4 /* renamed by user-space */ + +/* Media selection options. */ +enum { + IF_PORT_UNKNOWN = 0, + IF_PORT_10BASE2, + IF_PORT_10BASET, + IF_PORT_AUI, + IF_PORT_100BASET, + IF_PORT_100BASETX, + IF_PORT_100BASEFX +}; + +/* hardware address assignment types */ +#define NET_ADDR_PERM 0 /* address is permanent (default) */ +#define NET_ADDR_RANDOM 1 /* address is generated randomly */ +#define NET_ADDR_STOLEN 2 /* address is stolen from other device */ +#define NET_ADDR_SET 3 /* address is set using + * dev_set_mac_address() */ + +#endif /* _UAPI_LINUX_NETDEVICE_H */ diff --git a/src/basic/linux/netfilter/nf_tables.h b/src/basic/linux/netfilter/nf_tables.h new file mode 100644 index 0000000..cfa844d --- /dev/null +++ b/src/basic/linux/netfilter/nf_tables.h @@ -0,0 +1,1963 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_NF_TABLES_H +#define _LINUX_NF_TABLES_H + +#define NFT_NAME_MAXLEN 256 +#define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_OBJ_MAXNAMELEN NFT_NAME_MAXLEN +#define NFT_USERDATA_MAXLEN 256 +#define NFT_OSF_MAXGENRELEN 16 + +/** + * enum nft_registers - nf_tables registers + * + * nf_tables used to have five registers: a verdict register and four data + * registers of size 16. The data registers have been changed to 16 registers + * of size 4. For compatibility reasons, the NFT_REG_[1-4] registers still + * map to areas of size 16, the 4 byte registers are addressed using + * NFT_REG32_00 - NFT_REG32_15. 
+ */ +enum nft_registers { + NFT_REG_VERDICT, + NFT_REG_1, + NFT_REG_2, + NFT_REG_3, + NFT_REG_4, + __NFT_REG_MAX, + + NFT_REG32_00 = 8, + NFT_REG32_01, + NFT_REG32_02, + NFT_REG32_03, + NFT_REG32_04, + NFT_REG32_05, + NFT_REG32_06, + NFT_REG32_07, + NFT_REG32_08, + NFT_REG32_09, + NFT_REG32_10, + NFT_REG32_11, + NFT_REG32_12, + NFT_REG32_13, + NFT_REG32_14, + NFT_REG32_15, +}; +#define NFT_REG_MAX (__NFT_REG_MAX - 1) + +#define NFT_REG_SIZE 16 +#define NFT_REG32_SIZE 4 +#define NFT_REG32_COUNT (NFT_REG32_15 - NFT_REG32_00 + 1) + +/** + * enum nft_verdicts - nf_tables internal verdicts + * + * @NFT_CONTINUE: continue evaluation of the current rule + * @NFT_BREAK: terminate evaluation of the current rule + * @NFT_JUMP: push the current chain on the jump stack and jump to a chain + * @NFT_GOTO: jump to a chain without pushing the current chain on the jump stack + * @NFT_RETURN: return to the topmost chain on the jump stack + * + * The nf_tables verdicts share their numeric space with the netfilter verdicts. 
+ */ +enum nft_verdicts { + NFT_CONTINUE = -1, + NFT_BREAK = -2, + NFT_JUMP = -3, + NFT_GOTO = -4, + NFT_RETURN = -5, +}; + +/** + * enum nf_tables_msg_types - nf_tables netlink message types + * + * @NFT_MSG_NEWTABLE: create a new table (enum nft_table_attributes) + * @NFT_MSG_GETTABLE: get a table (enum nft_table_attributes) + * @NFT_MSG_DELTABLE: delete a table (enum nft_table_attributes) + * @NFT_MSG_NEWCHAIN: create a new chain (enum nft_chain_attributes) + * @NFT_MSG_GETCHAIN: get a chain (enum nft_chain_attributes) + * @NFT_MSG_DELCHAIN: delete a chain (enum nft_chain_attributes) + * @NFT_MSG_NEWRULE: create a new rule (enum nft_rule_attributes) + * @NFT_MSG_GETRULE: get a rule (enum nft_rule_attributes) + * @NFT_MSG_DELRULE: delete a rule (enum nft_rule_attributes) + * @NFT_MSG_NEWSET: create a new set (enum nft_set_attributes) + * @NFT_MSG_GETSET: get a set (enum nft_set_attributes) + * @NFT_MSG_DELSET: delete a set (enum nft_set_attributes) + * @NFT_MSG_NEWSETELEM: create a new set element (enum nft_set_elem_attributes) + * @NFT_MSG_GETSETELEM: get a set element (enum nft_set_elem_attributes) + * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) + * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) + * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) + * @NFT_MSG_TRACE: trace event (enum nft_trace_attributes) + * @NFT_MSG_NEWOBJ: create a stateful object (enum nft_obj_attributes) + * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes) + * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes) + * @NFT_MSG_GETOBJ_RESET: get and reset a stateful object (enum nft_obj_attributes) + * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) + * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETRULE_RESET: get rules and reset 
stateful expressions (enum nft_obj_attributes) + */ +enum nf_tables_msg_types { + NFT_MSG_NEWTABLE, + NFT_MSG_GETTABLE, + NFT_MSG_DELTABLE, + NFT_MSG_NEWCHAIN, + NFT_MSG_GETCHAIN, + NFT_MSG_DELCHAIN, + NFT_MSG_NEWRULE, + NFT_MSG_GETRULE, + NFT_MSG_DELRULE, + NFT_MSG_NEWSET, + NFT_MSG_GETSET, + NFT_MSG_DELSET, + NFT_MSG_NEWSETELEM, + NFT_MSG_GETSETELEM, + NFT_MSG_DELSETELEM, + NFT_MSG_NEWGEN, + NFT_MSG_GETGEN, + NFT_MSG_TRACE, + NFT_MSG_NEWOBJ, + NFT_MSG_GETOBJ, + NFT_MSG_DELOBJ, + NFT_MSG_GETOBJ_RESET, + NFT_MSG_NEWFLOWTABLE, + NFT_MSG_GETFLOWTABLE, + NFT_MSG_DELFLOWTABLE, + NFT_MSG_GETRULE_RESET, + NFT_MSG_MAX, +}; + +/** + * enum nft_list_attributes - nf_tables generic list netlink attributes + * + * @NFTA_LIST_ELEM: list element (NLA_NESTED) + */ +enum nft_list_attributes { + NFTA_LIST_UNSPEC, + NFTA_LIST_ELEM, + __NFTA_LIST_MAX +}; +#define NFTA_LIST_MAX (__NFTA_LIST_MAX - 1) + +/** + * enum nft_hook_attributes - nf_tables netfilter hook netlink attributes + * + * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) + * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) + * @NFTA_HOOK_DEV: netdevice name (NLA_STRING) + * @NFTA_HOOK_DEVS: list of netdevices (NLA_NESTED) + */ +enum nft_hook_attributes { + NFTA_HOOK_UNSPEC, + NFTA_HOOK_HOOKNUM, + NFTA_HOOK_PRIORITY, + NFTA_HOOK_DEV, + NFTA_HOOK_DEVS, + __NFTA_HOOK_MAX +}; +#define NFTA_HOOK_MAX (__NFTA_HOOK_MAX - 1) + +/** + * enum nft_table_flags - nf_tables table flags + * + * @NFT_TABLE_F_DORMANT: this table is not active + */ +enum nft_table_flags { + NFT_TABLE_F_DORMANT = 0x1, + NFT_TABLE_F_OWNER = 0x2, +}; +#define NFT_TABLE_F_MASK (NFT_TABLE_F_DORMANT | \ + NFT_TABLE_F_OWNER) + +/** + * enum nft_table_attributes - nf_tables table netlink attributes + * + * @NFTA_TABLE_NAME: name of the table (NLA_STRING) + * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) + * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) + * @NFTA_TABLE_USERDATA: user data (NLA_BINARY) + * @NFTA_TABLE_OWNER: 
owner of this table through netlink portID (NLA_U32) + */ +enum nft_table_attributes { + NFTA_TABLE_UNSPEC, + NFTA_TABLE_NAME, + NFTA_TABLE_FLAGS, + NFTA_TABLE_USE, + NFTA_TABLE_HANDLE, + NFTA_TABLE_PAD, + NFTA_TABLE_USERDATA, + NFTA_TABLE_OWNER, + __NFTA_TABLE_MAX +}; +#define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) + +enum nft_chain_flags { + NFT_CHAIN_BASE = (1 << 0), + NFT_CHAIN_HW_OFFLOAD = (1 << 1), + NFT_CHAIN_BINDING = (1 << 2), +}; +#define NFT_CHAIN_FLAGS (NFT_CHAIN_BASE | \ + NFT_CHAIN_HW_OFFLOAD | \ + NFT_CHAIN_BINDING) + +/** + * enum nft_chain_attributes - nf_tables chain netlink attributes + * + * @NFTA_CHAIN_TABLE: name of the table containing the chain (NLA_STRING) + * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64) + * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING) + * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes) + * @NFTA_CHAIN_POLICY: numeric policy of the chain (NLA_U32) + * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32) + * @NFTA_CHAIN_TYPE: type name of the chain (NLA_NUL_STRING) + * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) + * @NFTA_CHAIN_FLAGS: chain flags + * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) + * @NFTA_CHAIN_USERDATA: user data (NLA_BINARY) + */ +enum nft_chain_attributes { + NFTA_CHAIN_UNSPEC, + NFTA_CHAIN_TABLE, + NFTA_CHAIN_HANDLE, + NFTA_CHAIN_NAME, + NFTA_CHAIN_HOOK, + NFTA_CHAIN_POLICY, + NFTA_CHAIN_USE, + NFTA_CHAIN_TYPE, + NFTA_CHAIN_COUNTERS, + NFTA_CHAIN_PAD, + NFTA_CHAIN_FLAGS, + NFTA_CHAIN_ID, + NFTA_CHAIN_USERDATA, + __NFTA_CHAIN_MAX +}; +#define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) + +/** + * enum nft_rule_attributes - nf_tables rule netlink attributes + * + * @NFTA_RULE_TABLE: name of the table containing the rule (NLA_STRING) + * @NFTA_RULE_CHAIN: name of the chain containing the rule (NLA_STRING) + * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64) + * 
@NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes) + * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes) + * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64) + * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN) + * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32) + * @NFTA_RULE_POSITION_ID: transaction unique identifier of the previous rule (NLA_U32) + */ +enum nft_rule_attributes { + NFTA_RULE_UNSPEC, + NFTA_RULE_TABLE, + NFTA_RULE_CHAIN, + NFTA_RULE_HANDLE, + NFTA_RULE_EXPRESSIONS, + NFTA_RULE_COMPAT, + NFTA_RULE_POSITION, + NFTA_RULE_USERDATA, + NFTA_RULE_PAD, + NFTA_RULE_ID, + NFTA_RULE_POSITION_ID, + NFTA_RULE_CHAIN_ID, + __NFTA_RULE_MAX +}; +#define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) + +/** + * enum nft_rule_compat_flags - nf_tables rule compat flags + * + * @NFT_RULE_COMPAT_F_INV: invert the check result + */ +enum nft_rule_compat_flags { + NFT_RULE_COMPAT_F_INV = (1 << 1), + NFT_RULE_COMPAT_F_MASK = NFT_RULE_COMPAT_F_INV, +}; + +/** + * enum nft_rule_compat_attributes - nf_tables rule compat attributes + * + * @NFTA_RULE_COMPAT_PROTO: numeric value of handled protocol (NLA_U32) + * @NFTA_RULE_COMPAT_FLAGS: bitmask of enum nft_rule_compat_flags (NLA_U32) + */ +enum nft_rule_compat_attributes { + NFTA_RULE_COMPAT_UNSPEC, + NFTA_RULE_COMPAT_PROTO, + NFTA_RULE_COMPAT_FLAGS, + __NFTA_RULE_COMPAT_MAX +}; +#define NFTA_RULE_COMPAT_MAX (__NFTA_RULE_COMPAT_MAX - 1) + +/** + * enum nft_set_flags - nf_tables set flags + * + * @NFT_SET_ANONYMOUS: name allocation, automatic cleanup on unlink + * @NFT_SET_CONSTANT: set contents may not change while bound + * @NFT_SET_INTERVAL: set contains intervals + * @NFT_SET_MAP: set is used as a dictionary + * @NFT_SET_TIMEOUT: set uses timeouts + * @NFT_SET_EVAL: set can be updated from the evaluation path + * @NFT_SET_OBJECT: set contains stateful objects + * @NFT_SET_CONCAT: set contains a 
concatenation + * @NFT_SET_EXPR: set contains expressions + */ +enum nft_set_flags { + NFT_SET_ANONYMOUS = 0x1, + NFT_SET_CONSTANT = 0x2, + NFT_SET_INTERVAL = 0x4, + NFT_SET_MAP = 0x8, + NFT_SET_TIMEOUT = 0x10, + NFT_SET_EVAL = 0x20, + NFT_SET_OBJECT = 0x40, + NFT_SET_CONCAT = 0x80, + NFT_SET_EXPR = 0x100, +}; + +/** + * enum nft_set_policies - set selection policy + * + * @NFT_SET_POL_PERFORMANCE: prefer high performance over low memory use + * @NFT_SET_POL_MEMORY: prefer low memory use over high performance + */ +enum nft_set_policies { + NFT_SET_POL_PERFORMANCE, + NFT_SET_POL_MEMORY, +}; + +/** + * enum nft_set_desc_attributes - set element description + * + * @NFTA_SET_DESC_SIZE: number of elements in set (NLA_U32) + * @NFTA_SET_DESC_CONCAT: description of field concatenation (NLA_NESTED) + */ +enum nft_set_desc_attributes { + NFTA_SET_DESC_UNSPEC, + NFTA_SET_DESC_SIZE, + NFTA_SET_DESC_CONCAT, + __NFTA_SET_DESC_MAX +}; +#define NFTA_SET_DESC_MAX (__NFTA_SET_DESC_MAX - 1) + +/** + * enum nft_set_field_attributes - attributes of concatenated fields + * + * @NFTA_SET_FIELD_LEN: length of single field, in bits (NLA_U32) + */ +enum nft_set_field_attributes { + NFTA_SET_FIELD_UNSPEC, + NFTA_SET_FIELD_LEN, + __NFTA_SET_FIELD_MAX +}; +#define NFTA_SET_FIELD_MAX (__NFTA_SET_FIELD_MAX - 1) + +/** + * enum nft_set_attributes - nf_tables set netlink attributes + * + * @NFTA_SET_TABLE: table name (NLA_STRING) + * @NFTA_SET_NAME: set name (NLA_STRING) + * @NFTA_SET_FLAGS: bitmask of enum nft_set_flags (NLA_U32) + * @NFTA_SET_KEY_TYPE: key data type, informational purpose only (NLA_U32) + * @NFTA_SET_KEY_LEN: key data length (NLA_U32) + * @NFTA_SET_DATA_TYPE: mapping data type (NLA_U32) + * @NFTA_SET_DATA_LEN: mapping data length (NLA_U32) + * @NFTA_SET_POLICY: selection policy (NLA_U32) + * @NFTA_SET_DESC: set description (NLA_NESTED) + * @NFTA_SET_ID: uniquely identifies a set in a transaction (NLA_U32) + * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64) + * 
@NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) + * @NFTA_SET_USERDATA: user data (NLA_BINARY) + * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*) + * @NFTA_SET_HANDLE: set handle (NLA_U64) + * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_SET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) + */ +enum nft_set_attributes { + NFTA_SET_UNSPEC, + NFTA_SET_TABLE, + NFTA_SET_NAME, + NFTA_SET_FLAGS, + NFTA_SET_KEY_TYPE, + NFTA_SET_KEY_LEN, + NFTA_SET_DATA_TYPE, + NFTA_SET_DATA_LEN, + NFTA_SET_POLICY, + NFTA_SET_DESC, + NFTA_SET_ID, + NFTA_SET_TIMEOUT, + NFTA_SET_GC_INTERVAL, + NFTA_SET_USERDATA, + NFTA_SET_PAD, + NFTA_SET_OBJ_TYPE, + NFTA_SET_HANDLE, + NFTA_SET_EXPR, + NFTA_SET_EXPRESSIONS, + __NFTA_SET_MAX +}; +#define NFTA_SET_MAX (__NFTA_SET_MAX - 1) + +/** + * enum nft_set_elem_flags - nf_tables set element flags + * + * @NFT_SET_ELEM_INTERVAL_END: element ends the previous interval + * @NFT_SET_ELEM_CATCHALL: special catch-all element + */ +enum nft_set_elem_flags { + NFT_SET_ELEM_INTERVAL_END = 0x1, + NFT_SET_ELEM_CATCHALL = 0x2, +}; + +/** + * enum nft_set_elem_attributes - nf_tables set element netlink attributes + * + * @NFTA_SET_ELEM_KEY: key value (NLA_NESTED: nft_data) + * @NFTA_SET_ELEM_DATA: data value of mapping (NLA_NESTED: nft_data_attributes) + * @NFTA_SET_ELEM_FLAGS: bitmask of nft_set_elem_flags (NLA_U32) + * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) + * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) + * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) + * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING) + * @NFTA_SET_ELEM_KEY_END: closing key value (NLA_NESTED: nft_data) + * @NFTA_SET_ELEM_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) + */ +enum nft_set_elem_attributes { + NFTA_SET_ELEM_UNSPEC, + NFTA_SET_ELEM_KEY, + NFTA_SET_ELEM_DATA, + NFTA_SET_ELEM_FLAGS, + 
NFTA_SET_ELEM_TIMEOUT, + NFTA_SET_ELEM_EXPIRATION, + NFTA_SET_ELEM_USERDATA, + NFTA_SET_ELEM_EXPR, + NFTA_SET_ELEM_PAD, + NFTA_SET_ELEM_OBJREF, + NFTA_SET_ELEM_KEY_END, + NFTA_SET_ELEM_EXPRESSIONS, + __NFTA_SET_ELEM_MAX +}; +#define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) + +/** + * enum nft_set_elem_list_attributes - nf_tables set element list netlink attributes + * + * @NFTA_SET_ELEM_LIST_TABLE: table of the set to be changed (NLA_STRING) + * @NFTA_SET_ELEM_LIST_SET: name of the set to be changed (NLA_STRING) + * @NFTA_SET_ELEM_LIST_ELEMENTS: list of set elements (NLA_NESTED: nft_set_elem_attributes) + * @NFTA_SET_ELEM_LIST_SET_ID: uniquely identifies a set in a transaction (NLA_U32) + */ +enum nft_set_elem_list_attributes { + NFTA_SET_ELEM_LIST_UNSPEC, + NFTA_SET_ELEM_LIST_TABLE, + NFTA_SET_ELEM_LIST_SET, + NFTA_SET_ELEM_LIST_ELEMENTS, + NFTA_SET_ELEM_LIST_SET_ID, + __NFTA_SET_ELEM_LIST_MAX +}; +#define NFTA_SET_ELEM_LIST_MAX (__NFTA_SET_ELEM_LIST_MAX - 1) + +/** + * enum nft_data_types - nf_tables data types + * + * @NFT_DATA_VALUE: generic data + * @NFT_DATA_VERDICT: netfilter verdict + * + * The type of data is usually determined by the kernel directly and is not + * explicitly specified by userspace. The only difference are sets, where + * userspace specifies the key and mapping data types. + * + * The values 0xffffff00-0xffffffff are reserved for internally used types. + * The remaining range can be freely used by userspace to encode types, all + * values are equivalent to NFT_DATA_VALUE. 
+ */ +enum nft_data_types { + NFT_DATA_VALUE, + NFT_DATA_VERDICT = 0xffffff00U, +}; + +#define NFT_DATA_RESERVED_MASK 0xffffff00U + +/** + * enum nft_data_attributes - nf_tables data netlink attributes + * + * @NFTA_DATA_VALUE: generic data (NLA_BINARY) + * @NFTA_DATA_VERDICT: nf_tables verdict (NLA_NESTED: nft_verdict_attributes) + */ +enum nft_data_attributes { + NFTA_DATA_UNSPEC, + NFTA_DATA_VALUE, + NFTA_DATA_VERDICT, + __NFTA_DATA_MAX +}; +#define NFTA_DATA_MAX (__NFTA_DATA_MAX - 1) + +/* Maximum length of a value */ +#define NFT_DATA_VALUE_MAXLEN 64 + +/** + * enum nft_verdict_attributes - nf_tables verdict netlink attributes + * + * @NFTA_VERDICT_CODE: nf_tables verdict (NLA_U32: enum nft_verdicts) + * @NFTA_VERDICT_CHAIN: jump target chain name (NLA_STRING) + * @NFTA_VERDICT_CHAIN_ID: jump target chain ID (NLA_U32) + */ +enum nft_verdict_attributes { + NFTA_VERDICT_UNSPEC, + NFTA_VERDICT_CODE, + NFTA_VERDICT_CHAIN, + NFTA_VERDICT_CHAIN_ID, + __NFTA_VERDICT_MAX +}; +#define NFTA_VERDICT_MAX (__NFTA_VERDICT_MAX - 1) + +/** + * enum nft_expr_attributes - nf_tables expression netlink attributes + * + * @NFTA_EXPR_NAME: name of the expression type (NLA_STRING) + * @NFTA_EXPR_DATA: type specific data (NLA_NESTED) + */ +enum nft_expr_attributes { + NFTA_EXPR_UNSPEC, + NFTA_EXPR_NAME, + NFTA_EXPR_DATA, + __NFTA_EXPR_MAX +}; +#define NFTA_EXPR_MAX (__NFTA_EXPR_MAX - 1) + +/** + * enum nft_immediate_attributes - nf_tables immediate expression netlink attributes + * + * @NFTA_IMMEDIATE_DREG: destination register to load data into (NLA_U32) + * @NFTA_IMMEDIATE_DATA: data to load (NLA_NESTED: nft_data_attributes) + */ +enum nft_immediate_attributes { + NFTA_IMMEDIATE_UNSPEC, + NFTA_IMMEDIATE_DREG, + NFTA_IMMEDIATE_DATA, + __NFTA_IMMEDIATE_MAX +}; +#define NFTA_IMMEDIATE_MAX (__NFTA_IMMEDIATE_MAX - 1) + +/** + * enum nft_bitwise_ops - nf_tables bitwise operations + * + * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and + * XOR boolean 
operations + * @NFT_BITWISE_LSHIFT: left-shift operation + * @NFT_BITWISE_RSHIFT: right-shift operation + */ +enum nft_bitwise_ops { + NFT_BITWISE_BOOL, + NFT_BITWISE_LSHIFT, + NFT_BITWISE_RSHIFT, +}; + +/** + * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes + * + * @NFTA_BITWISE_SREG: source register (NLA_U32: nft_registers) + * @NFTA_BITWISE_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_BITWISE_LEN: length of operands (NLA_U32) + * @NFTA_BITWISE_MASK: mask value (NLA_NESTED: nft_data_attributes) + * @NFTA_BITWISE_XOR: xor value (NLA_NESTED: nft_data_attributes) + * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops) + * @NFTA_BITWISE_DATA: argument for non-boolean operations + * (NLA_NESTED: nft_data_attributes) + * + * The bitwise expression supports boolean and shift operations. It implements + * the boolean operations by performing the following operation: + * + * dreg = (sreg & mask) ^ xor + * + * with these mask and xor values: + * + * mask xor + * NOT: 1 1 + * OR: ~x x + * XOR: 1 x + * AND: x 0 + */ +enum nft_bitwise_attributes { + NFTA_BITWISE_UNSPEC, + NFTA_BITWISE_SREG, + NFTA_BITWISE_DREG, + NFTA_BITWISE_LEN, + NFTA_BITWISE_MASK, + NFTA_BITWISE_XOR, + NFTA_BITWISE_OP, + NFTA_BITWISE_DATA, + __NFTA_BITWISE_MAX +}; +#define NFTA_BITWISE_MAX (__NFTA_BITWISE_MAX - 1) + +/** + * enum nft_byteorder_ops - nf_tables byteorder operators + * + * @NFT_BYTEORDER_NTOH: network to host operator + * @NFT_BYTEORDER_HTON: host to network operator + */ +enum nft_byteorder_ops { + NFT_BYTEORDER_NTOH, + NFT_BYTEORDER_HTON, +}; + +/** + * enum nft_byteorder_attributes - nf_tables byteorder expression netlink attributes + * + * @NFTA_BYTEORDER_SREG: source register (NLA_U32: nft_registers) + * @NFTA_BYTEORDER_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_BYTEORDER_OP: operator (NLA_U32: enum nft_byteorder_ops) + * @NFTA_BYTEORDER_LEN: length of the data (NLA_U32) + * @NFTA_BYTEORDER_SIZE: data size in 
bytes (NLA_U32: 2 or 4) + */ +enum nft_byteorder_attributes { + NFTA_BYTEORDER_UNSPEC, + NFTA_BYTEORDER_SREG, + NFTA_BYTEORDER_DREG, + NFTA_BYTEORDER_OP, + NFTA_BYTEORDER_LEN, + NFTA_BYTEORDER_SIZE, + __NFTA_BYTEORDER_MAX +}; +#define NFTA_BYTEORDER_MAX (__NFTA_BYTEORDER_MAX - 1) + +/** + * enum nft_cmp_ops - nf_tables relational operator + * + * @NFT_CMP_EQ: equal + * @NFT_CMP_NEQ: not equal + * @NFT_CMP_LT: less than + * @NFT_CMP_LTE: less than or equal to + * @NFT_CMP_GT: greater than + * @NFT_CMP_GTE: greater than or equal to + */ +enum nft_cmp_ops { + NFT_CMP_EQ, + NFT_CMP_NEQ, + NFT_CMP_LT, + NFT_CMP_LTE, + NFT_CMP_GT, + NFT_CMP_GTE, +}; + +/** + * enum nft_cmp_attributes - nf_tables cmp expression netlink attributes + * + * @NFTA_CMP_SREG: source register of data to compare (NLA_U32: nft_registers) + * @NFTA_CMP_OP: cmp operation (NLA_U32: nft_cmp_ops) + * @NFTA_CMP_DATA: data to compare against (NLA_NESTED: nft_data_attributes) + */ +enum nft_cmp_attributes { + NFTA_CMP_UNSPEC, + NFTA_CMP_SREG, + NFTA_CMP_OP, + NFTA_CMP_DATA, + __NFTA_CMP_MAX +}; +#define NFTA_CMP_MAX (__NFTA_CMP_MAX - 1) + +/** + * enum nft_range_ops - nf_tables range operator + * + * @NFT_RANGE_EQ: equal + * @NFT_RANGE_NEQ: not equal + */ +enum nft_range_ops { + NFT_RANGE_EQ, + NFT_RANGE_NEQ, +}; + +/** + * enum nft_range_attributes - nf_tables range expression netlink attributes + * + * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers) + * @NFTA_RANGE_OP: range operation (NLA_U32: nft_range_ops) + * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes) + * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes) + */ +enum nft_range_attributes { + NFTA_RANGE_UNSPEC, + NFTA_RANGE_SREG, + NFTA_RANGE_OP, + NFTA_RANGE_FROM_DATA, + NFTA_RANGE_TO_DATA, + __NFTA_RANGE_MAX +}; +#define NFTA_RANGE_MAX (__NFTA_RANGE_MAX - 1) + +enum nft_lookup_flags { + NFT_LOOKUP_F_INV = (1 << 0), +}; + +/** + * enum nft_lookup_attributes - nf_tables set 
lookup expression netlink attributes + * + * @NFTA_LOOKUP_SET: name of the set where to look for (NLA_STRING) + * @NFTA_LOOKUP_SREG: source register of the data to look for (NLA_U32: nft_registers) + * @NFTA_LOOKUP_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_LOOKUP_SET_ID: uniquely identifies a set in a transaction (NLA_U32) + * @NFTA_LOOKUP_FLAGS: flags (NLA_U32: enum nft_lookup_flags) + */ +enum nft_lookup_attributes { + NFTA_LOOKUP_UNSPEC, + NFTA_LOOKUP_SET, + NFTA_LOOKUP_SREG, + NFTA_LOOKUP_DREG, + NFTA_LOOKUP_SET_ID, + NFTA_LOOKUP_FLAGS, + __NFTA_LOOKUP_MAX +}; +#define NFTA_LOOKUP_MAX (__NFTA_LOOKUP_MAX - 1) + +enum nft_dynset_ops { + NFT_DYNSET_OP_ADD, + NFT_DYNSET_OP_UPDATE, + NFT_DYNSET_OP_DELETE, +}; + +enum nft_dynset_flags { + NFT_DYNSET_F_INV = (1 << 0), + NFT_DYNSET_F_EXPR = (1 << 1), +}; + +/** + * enum nft_dynset_attributes - dynset expression attributes + * + * @NFTA_DYNSET_SET_NAME: name of the set to add data to (NLA_STRING) + * @NFTA_DYNSET_SET_ID: unique identifier of the set in the transaction (NLA_U32) + * @NFTA_DYNSET_OP: operation (NLA_U32) + * @NFTA_DYNSET_SREG_KEY: source register of the key (NLA_U32) + * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) + * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) + * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_DYNSET_FLAGS: flags (NLA_U32) + * @NFTA_DYNSET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) + */ +enum nft_dynset_attributes { + NFTA_DYNSET_UNSPEC, + NFTA_DYNSET_SET_NAME, + NFTA_DYNSET_SET_ID, + NFTA_DYNSET_OP, + NFTA_DYNSET_SREG_KEY, + NFTA_DYNSET_SREG_DATA, + NFTA_DYNSET_TIMEOUT, + NFTA_DYNSET_EXPR, + NFTA_DYNSET_PAD, + NFTA_DYNSET_FLAGS, + NFTA_DYNSET_EXPRESSIONS, + __NFTA_DYNSET_MAX, +}; +#define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) + +/** + * enum nft_payload_bases - nf_tables payload expression offset bases + * + * @NFT_PAYLOAD_LL_HEADER: link layer header + * 
@NFT_PAYLOAD_NETWORK_HEADER: network header + * @NFT_PAYLOAD_TRANSPORT_HEADER: transport header + * @NFT_PAYLOAD_INNER_HEADER: inner header / payload + */ +enum nft_payload_bases { + NFT_PAYLOAD_LL_HEADER, + NFT_PAYLOAD_NETWORK_HEADER, + NFT_PAYLOAD_TRANSPORT_HEADER, + NFT_PAYLOAD_INNER_HEADER, + NFT_PAYLOAD_TUN_HEADER, +}; + +/** + * enum nft_payload_csum_types - nf_tables payload expression checksum types + * + * @NFT_PAYLOAD_CSUM_NONE: no checksumming + * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + * @NFT_PAYLOAD_CSUM_SCTP: CRC-32c, for use in SCTP header (RFC 3309) + */ +enum nft_payload_csum_types { + NFT_PAYLOAD_CSUM_NONE, + NFT_PAYLOAD_CSUM_INET, + NFT_PAYLOAD_CSUM_SCTP, +}; + +enum nft_payload_csum_flags { + NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), +}; + +enum nft_inner_type { + NFT_INNER_UNSPEC = 0, + NFT_INNER_VXLAN, + NFT_INNER_GENEVE, +}; + +enum nft_inner_flags { + NFT_INNER_HDRSIZE = (1 << 0), + NFT_INNER_LL = (1 << 1), + NFT_INNER_NH = (1 << 2), + NFT_INNER_TH = (1 << 3), +}; +#define NFT_INNER_MASK (NFT_INNER_HDRSIZE | NFT_INNER_LL | \ + NFT_INNER_NH | NFT_INNER_TH) + +enum nft_inner_attributes { + NFTA_INNER_UNSPEC, + NFTA_INNER_NUM, + NFTA_INNER_TYPE, + NFTA_INNER_FLAGS, + NFTA_INNER_HDRSIZE, + NFTA_INNER_EXPR, + __NFTA_INNER_MAX +}; +#define NFTA_INNER_MAX (__NFTA_INNER_MAX - 1) + +/** + * enum nft_payload_attributes - nf_tables payload expression netlink attributes + * + * @NFTA_PAYLOAD_DREG: destination register to load data into (NLA_U32: nft_registers) + * @NFTA_PAYLOAD_BASE: payload base (NLA_U32: nft_payload_bases) + * @NFTA_PAYLOAD_OFFSET: payload offset relative to base (NLA_U32) + * @NFTA_PAYLOAD_LEN: payload length (NLA_U32) + * @NFTA_PAYLOAD_SREG: source register to load data from (NLA_U32: nft_registers) + * @NFTA_PAYLOAD_CSUM_TYPE: checksum type (NLA_U32) + * @NFTA_PAYLOAD_CSUM_OFFSET: checksum offset relative to base (NLA_U32) + * @NFTA_PAYLOAD_CSUM_FLAGS: checksum flags (NLA_U32) + */ +enum nft_payload_attributes { + 
NFTA_PAYLOAD_UNSPEC, + NFTA_PAYLOAD_DREG, + NFTA_PAYLOAD_BASE, + NFTA_PAYLOAD_OFFSET, + NFTA_PAYLOAD_LEN, + NFTA_PAYLOAD_SREG, + NFTA_PAYLOAD_CSUM_TYPE, + NFTA_PAYLOAD_CSUM_OFFSET, + NFTA_PAYLOAD_CSUM_FLAGS, + __NFTA_PAYLOAD_MAX +}; +#define NFTA_PAYLOAD_MAX (__NFTA_PAYLOAD_MAX - 1) + +enum nft_exthdr_flags { + NFT_EXTHDR_F_PRESENT = (1 << 0), +}; + +/** + * enum nft_exthdr_op - nf_tables match options + * + * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options + * @NFT_EXTHDR_OP_IPV4: match against ipv4 options + * @NFT_EXTHDR_OP_SCTP: match against sctp chunks + */ +enum nft_exthdr_op { + NFT_EXTHDR_OP_IPV6, + NFT_EXTHDR_OP_TCPOPT, + NFT_EXTHDR_OP_IPV4, + NFT_EXTHDR_OP_SCTP, + __NFT_EXTHDR_OP_MAX +}; +#define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) + +/** + * enum nft_exthdr_attributes - nf_tables extension header expression netlink attributes + * + * @NFTA_EXTHDR_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8) + * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) + * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) + * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) + * @NFTA_EXTHDR_OP: option match type (NLA_U32) + * @NFTA_EXTHDR_SREG: source register (NLA_U32: nft_registers) + */ +enum nft_exthdr_attributes { + NFTA_EXTHDR_UNSPEC, + NFTA_EXTHDR_DREG, + NFTA_EXTHDR_TYPE, + NFTA_EXTHDR_OFFSET, + NFTA_EXTHDR_LEN, + NFTA_EXTHDR_FLAGS, + NFTA_EXTHDR_OP, + NFTA_EXTHDR_SREG, + __NFTA_EXTHDR_MAX +}; +#define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) + +/** + * enum nft_meta_keys - nf_tables meta expression keys + * + * @NFT_META_LEN: packet length (skb->len) + * @NFT_META_PROTOCOL: packet ethertype protocol (skb->protocol), invalid in OUTPUT + * @NFT_META_PRIORITY: packet priority (skb->priority) + * @NFT_META_MARK: packet mark (skb->mark) + * @NFT_META_IIF: packet input interface index (dev->ifindex) + * @NFT_META_OIF: packet output interface index 
(dev->ifindex) + * @NFT_META_IIFNAME: packet input interface name (dev->name) + * @NFT_META_OIFNAME: packet output interface name (dev->name) + * @NFT_META_IIFTYPE: packet input interface type (dev->type) + * @NFT_META_OIFTYPE: packet output interface type (dev->type) + * @NFT_META_SKUID: originating socket UID (fsuid) + * @NFT_META_SKGID: originating socket GID (fsgid) + * @NFT_META_NFTRACE: packet nftrace bit + * @NFT_META_RTCLASSID: realm value of packet's route (skb->dst->tclassid) + * @NFT_META_SECMARK: packet secmark (skb->secmark) + * @NFT_META_NFPROTO: netfilter protocol + * @NFT_META_L4PROTO: layer 4 protocol number + * @NFT_META_BRI_IIFNAME: packet input bridge interface name + * @NFT_META_BRI_OIFNAME: packet output bridge interface name + * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback + * @NFT_META_CPU: cpu id through smp_processor_id() + * @NFT_META_IIFGROUP: packet input interface group + * @NFT_META_OIFGROUP: packet output interface group + * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid) + * @NFT_META_PRANDOM: a 32bit pseudo-random number + * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp) + * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind) + * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind) + * @NFT_META_BRI_IIFPVID: packet input bridge port pvid + * @NFT_META_BRI_IIFVPROTO: packet input bridge vlan proto + * @NFT_META_TIME_NS: time since epoch (in nanoseconds) + * @NFT_META_TIME_DAY: day of week (from 0 = Sunday to 6 = Saturday) + * @NFT_META_TIME_HOUR: hour of day (in seconds) + * @NFT_META_SDIF: slave device interface index + * @NFT_META_SDIFNAME: slave device interface name + */ +enum nft_meta_keys { + NFT_META_LEN, + NFT_META_PROTOCOL, + NFT_META_PRIORITY, + NFT_META_MARK, + NFT_META_IIF, + NFT_META_OIF, + NFT_META_IIFNAME, + NFT_META_OIFNAME, + NFT_META_IFTYPE, +#define NFT_META_IIFTYPE NFT_META_IFTYPE + NFT_META_OIFTYPE, + 
NFT_META_SKUID, + NFT_META_SKGID, + NFT_META_NFTRACE, + NFT_META_RTCLASSID, + NFT_META_SECMARK, + NFT_META_NFPROTO, + NFT_META_L4PROTO, + NFT_META_BRI_IIFNAME, + NFT_META_BRI_OIFNAME, + NFT_META_PKTTYPE, + NFT_META_CPU, + NFT_META_IIFGROUP, + NFT_META_OIFGROUP, + NFT_META_CGROUP, + NFT_META_PRANDOM, + NFT_META_SECPATH, + NFT_META_IIFKIND, + NFT_META_OIFKIND, + NFT_META_BRI_IIFPVID, + NFT_META_BRI_IIFVPROTO, + NFT_META_TIME_NS, + NFT_META_TIME_DAY, + NFT_META_TIME_HOUR, + NFT_META_SDIF, + NFT_META_SDIFNAME, + __NFT_META_IIFTYPE, +}; + +/** + * enum nft_rt_keys - nf_tables routing expression keys + * + * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid) + * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 + * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 + * @NFT_RT_TCPMSS: fetch current path tcp mss + * @NFT_RT_XFRM: boolean, skb->dst->xfrm != NULL + */ +enum nft_rt_keys { + NFT_RT_CLASSID, + NFT_RT_NEXTHOP4, + NFT_RT_NEXTHOP6, + NFT_RT_TCPMSS, + NFT_RT_XFRM, + __NFT_RT_MAX +}; +#define NFT_RT_MAX (__NFT_RT_MAX - 1) + +/** + * enum nft_hash_types - nf_tables hash expression types + * + * @NFT_HASH_JENKINS: Jenkins Hash + * @NFT_HASH_SYM: Symmetric Hash + */ +enum nft_hash_types { + NFT_HASH_JENKINS, + NFT_HASH_SYM, +}; + +/** + * enum nft_hash_attributes - nf_tables hash expression netlink attributes + * + * @NFTA_HASH_SREG: source register (NLA_U32) + * @NFTA_HASH_DREG: destination register (NLA_U32) + * @NFTA_HASH_LEN: source data length (NLA_U32) + * @NFTA_HASH_MODULUS: modulus value (NLA_U32) + * @NFTA_HASH_SEED: seed value (NLA_U32) + * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32) + * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types) + * @NFTA_HASH_SET_NAME: name of the map to lookup (NLA_STRING) + * @NFTA_HASH_SET_ID: id of the map (NLA_U32) + */ +enum nft_hash_attributes { + NFTA_HASH_UNSPEC, + NFTA_HASH_SREG, + NFTA_HASH_DREG, + NFTA_HASH_LEN, + NFTA_HASH_MODULUS, + NFTA_HASH_SEED, + NFTA_HASH_OFFSET, + NFTA_HASH_TYPE, + 
NFTA_HASH_SET_NAME, /* deprecated */ + NFTA_HASH_SET_ID, /* deprecated */ + __NFTA_HASH_MAX, +}; +#define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) + +/** + * enum nft_meta_attributes - nf_tables meta expression netlink attributes + * + * @NFTA_META_DREG: destination register (NLA_U32) + * @NFTA_META_KEY: meta data item to load (NLA_U32: nft_meta_keys) + * @NFTA_META_SREG: source register (NLA_U32) + */ +enum nft_meta_attributes { + NFTA_META_UNSPEC, + NFTA_META_DREG, + NFTA_META_KEY, + NFTA_META_SREG, + __NFTA_META_MAX +}; +#define NFTA_META_MAX (__NFTA_META_MAX - 1) + +/** + * enum nft_rt_attributes - nf_tables routing expression netlink attributes + * + * @NFTA_RT_DREG: destination register (NLA_U32) + * @NFTA_RT_KEY: routing data item to load (NLA_U32: nft_rt_keys) + */ +enum nft_rt_attributes { + NFTA_RT_UNSPEC, + NFTA_RT_DREG, + NFTA_RT_KEY, + __NFTA_RT_MAX +}; +#define NFTA_RT_MAX (__NFTA_RT_MAX - 1) + +/** + * enum nft_socket_attributes - nf_tables socket expression netlink attributes + * + * @NFTA_SOCKET_KEY: socket key to match + * @NFTA_SOCKET_DREG: destination register + * @NFTA_SOCKET_LEVEL: cgroups2 ancestor level (only for cgroupsv2) + */ +enum nft_socket_attributes { + NFTA_SOCKET_UNSPEC, + NFTA_SOCKET_KEY, + NFTA_SOCKET_DREG, + NFTA_SOCKET_LEVEL, + __NFTA_SOCKET_MAX +}; +#define NFTA_SOCKET_MAX (__NFTA_SOCKET_MAX - 1) + +/* + * enum nft_socket_keys - nf_tables socket expression keys + * + * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option + * @NFT_SOCKET_MARK: Value of the socket mark + * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 
0.0.0.0 or ::0) + * @NFT_SOCKET_CGROUPV2: Match on cgroups version 2 + */ +enum nft_socket_keys { + NFT_SOCKET_TRANSPARENT, + NFT_SOCKET_MARK, + NFT_SOCKET_WILDCARD, + NFT_SOCKET_CGROUPV2, + __NFT_SOCKET_MAX +}; +#define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) + +/** + * enum nft_ct_keys - nf_tables ct expression keys + * + * @NFT_CT_STATE: conntrack state (bitmask of enum ip_conntrack_info) + * @NFT_CT_DIRECTION: conntrack direction (enum ip_conntrack_dir) + * @NFT_CT_STATUS: conntrack status (bitmask of enum ip_conntrack_status) + * @NFT_CT_MARK: conntrack mark value + * @NFT_CT_SECMARK: conntrack secmark value + * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms + * @NFT_CT_HELPER: connection tracking helper assigned to conntrack + * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol + * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address, deprecated) + * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address, deprecated) + * @NFT_CT_PROTOCOL: conntrack layer 4 protocol + * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source + * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination + * @NFT_CT_LABELS: conntrack labels + * @NFT_CT_PKTS: conntrack packets + * @NFT_CT_BYTES: conntrack bytes + * @NFT_CT_AVGPKT: conntrack average bytes per packet + * @NFT_CT_ZONE: conntrack zone + * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack + * @NFT_CT_SRC_IP: conntrack layer 3 protocol source (IPv4 address) + * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address) + * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) + * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) + * @NFT_CT_ID: conntrack id + */ +enum nft_ct_keys { + NFT_CT_STATE, + NFT_CT_DIRECTION, + NFT_CT_STATUS, + NFT_CT_MARK, + NFT_CT_SECMARK, + NFT_CT_EXPIRATION, + NFT_CT_HELPER, + NFT_CT_L3PROTOCOL, + NFT_CT_SRC, + NFT_CT_DST, + NFT_CT_PROTOCOL, + NFT_CT_PROTO_SRC, + NFT_CT_PROTO_DST, + 
NFT_CT_LABELS, + NFT_CT_PKTS, + NFT_CT_BYTES, + NFT_CT_AVGPKT, + NFT_CT_ZONE, + NFT_CT_EVENTMASK, + NFT_CT_SRC_IP, + NFT_CT_DST_IP, + NFT_CT_SRC_IP6, + NFT_CT_DST_IP6, + NFT_CT_ID, + __NFT_CT_MAX +}; +#define NFT_CT_MAX (__NFT_CT_MAX - 1) + +/** + * enum nft_ct_attributes - nf_tables ct expression netlink attributes + * + * @NFTA_CT_DREG: destination register (NLA_U32) + * @NFTA_CT_KEY: conntrack data item to load (NLA_U32: nft_ct_keys) + * @NFTA_CT_DIRECTION: direction in case of directional keys (NLA_U8) + * @NFTA_CT_SREG: source register (NLA_U32) + */ +enum nft_ct_attributes { + NFTA_CT_UNSPEC, + NFTA_CT_DREG, + NFTA_CT_KEY, + NFTA_CT_DIRECTION, + NFTA_CT_SREG, + __NFTA_CT_MAX +}; +#define NFTA_CT_MAX (__NFTA_CT_MAX - 1) + +/** + * enum nft_offload_attributes - ct offload expression attributes + * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) + */ +enum nft_offload_attributes { + NFTA_FLOW_UNSPEC, + NFTA_FLOW_TABLE_NAME, + __NFTA_FLOW_MAX, +}; +#define NFTA_FLOW_MAX (__NFTA_FLOW_MAX - 1) + +enum nft_limit_type { + NFT_LIMIT_PKTS, + NFT_LIMIT_PKT_BYTES +}; + +enum nft_limit_flags { + NFT_LIMIT_F_INV = (1 << 0), +}; + +/** + * enum nft_limit_attributes - nf_tables limit expression netlink attributes + * + * @NFTA_LIMIT_RATE: refill rate (NLA_U64) + * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) + * @NFTA_LIMIT_BURST: burst (NLA_U32) + * @NFTA_LIMIT_TYPE: type of limit (NLA_U32: enum nft_limit_type) + * @NFTA_LIMIT_FLAGS: flags (NLA_U32: enum nft_limit_flags) + */ +enum nft_limit_attributes { + NFTA_LIMIT_UNSPEC, + NFTA_LIMIT_RATE, + NFTA_LIMIT_UNIT, + NFTA_LIMIT_BURST, + NFTA_LIMIT_TYPE, + NFTA_LIMIT_FLAGS, + NFTA_LIMIT_PAD, + __NFTA_LIMIT_MAX +}; +#define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) + +enum nft_connlimit_flags { + NFT_CONNLIMIT_F_INV = (1 << 0), +}; + +/** + * enum nft_connlimit_attributes - nf_tables connlimit expression netlink attributes + * + * @NFTA_CONNLIMIT_COUNT: number of connections (NLA_U32) + * @NFTA_CONNLIMIT_FLAGS: flags (NLA_U32: enum 
nft_connlimit_flags) + */ +enum nft_connlimit_attributes { + NFTA_CONNLIMIT_UNSPEC, + NFTA_CONNLIMIT_COUNT, + NFTA_CONNLIMIT_FLAGS, + __NFTA_CONNLIMIT_MAX +}; +#define NFTA_CONNLIMIT_MAX (__NFTA_CONNLIMIT_MAX - 1) + +/** + * enum nft_counter_attributes - nf_tables counter expression netlink attributes + * + * @NFTA_COUNTER_BYTES: number of bytes (NLA_U64) + * @NFTA_COUNTER_PACKETS: number of packets (NLA_U64) + */ +enum nft_counter_attributes { + NFTA_COUNTER_UNSPEC, + NFTA_COUNTER_BYTES, + NFTA_COUNTER_PACKETS, + NFTA_COUNTER_PAD, + __NFTA_COUNTER_MAX +}; +#define NFTA_COUNTER_MAX (__NFTA_COUNTER_MAX - 1) + +/** + * enum nft_last_attributes - nf_tables last expression netlink attributes + * + * @NFTA_LAST_SET: last update has been set, zero means never updated (NLA_U32) + * @NFTA_LAST_MSECS: milliseconds since last update (NLA_U64) + */ +enum nft_last_attributes { + NFTA_LAST_UNSPEC, + NFTA_LAST_SET, + NFTA_LAST_MSECS, + NFTA_LAST_PAD, + __NFTA_LAST_MAX +}; +#define NFTA_LAST_MAX (__NFTA_LAST_MAX - 1) + +/** + * enum nft_log_attributes - nf_tables log expression netlink attributes + * + * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U16) + * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING) + * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32) + * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U16) + * @NFTA_LOG_LEVEL: log level (NLA_U32) + * @NFTA_LOG_FLAGS: logging flags (NLA_U32) + */ +enum nft_log_attributes { + NFTA_LOG_UNSPEC, + NFTA_LOG_GROUP, + NFTA_LOG_PREFIX, + NFTA_LOG_SNAPLEN, + NFTA_LOG_QTHRESHOLD, + NFTA_LOG_LEVEL, + NFTA_LOG_FLAGS, + __NFTA_LOG_MAX +}; +#define NFTA_LOG_MAX (__NFTA_LOG_MAX - 1) + +/** + * enum nft_log_level - nf_tables log levels + * + * @NFT_LOGLEVEL_EMERG: system is unusable + * @NFT_LOGLEVEL_ALERT: action must be taken immediately + * @NFT_LOGLEVEL_CRIT: critical conditions + * @NFT_LOGLEVEL_ERR: error conditions + * @NFT_LOGLEVEL_WARNING: warning conditions + * 
@NFT_LOGLEVEL_NOTICE: normal but significant condition + * @NFT_LOGLEVEL_INFO: informational + * @NFT_LOGLEVEL_DEBUG: debug-level messages + * @NFT_LOGLEVEL_AUDIT: enabling audit logging + */ +enum nft_log_level { + NFT_LOGLEVEL_EMERG, + NFT_LOGLEVEL_ALERT, + NFT_LOGLEVEL_CRIT, + NFT_LOGLEVEL_ERR, + NFT_LOGLEVEL_WARNING, + NFT_LOGLEVEL_NOTICE, + NFT_LOGLEVEL_INFO, + NFT_LOGLEVEL_DEBUG, + NFT_LOGLEVEL_AUDIT, + __NFT_LOGLEVEL_MAX +}; +#define NFT_LOGLEVEL_MAX (__NFT_LOGLEVEL_MAX - 1) + +/** + * enum nft_queue_attributes - nf_tables queue expression netlink attributes + * + * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16) + * @NFTA_QUEUE_TOTAL: number of queues to load balance packets on (NLA_U16) + * @NFTA_QUEUE_FLAGS: various flags (NLA_U16) + * @NFTA_QUEUE_SREG_QNUM: source register of queue number (NLA_U32: nft_registers) + */ +enum nft_queue_attributes { + NFTA_QUEUE_UNSPEC, + NFTA_QUEUE_NUM, + NFTA_QUEUE_TOTAL, + NFTA_QUEUE_FLAGS, + NFTA_QUEUE_SREG_QNUM, + __NFTA_QUEUE_MAX +}; +#define NFTA_QUEUE_MAX (__NFTA_QUEUE_MAX - 1) + +#define NFT_QUEUE_FLAG_BYPASS 0x01 /* for compatibility with v2 */ +#define NFT_QUEUE_FLAG_CPU_FANOUT 0x02 /* use current CPU (no hashing) */ +#define NFT_QUEUE_FLAG_MASK 0x03 + +enum nft_quota_flags { + NFT_QUOTA_F_INV = (1 << 0), + NFT_QUOTA_F_DEPLETED = (1 << 1), +}; + +/** + * enum nft_quota_attributes - nf_tables quota expression netlink attributes + * + * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U64) + * @NFTA_QUOTA_FLAGS: flags (NLA_U32) + * @NFTA_QUOTA_CONSUMED: quota already consumed in bytes (NLA_U64) + */ +enum nft_quota_attributes { + NFTA_QUOTA_UNSPEC, + NFTA_QUOTA_BYTES, + NFTA_QUOTA_FLAGS, + NFTA_QUOTA_PAD, + NFTA_QUOTA_CONSUMED, + __NFTA_QUOTA_MAX +}; +#define NFTA_QUOTA_MAX (__NFTA_QUOTA_MAX - 1) + +/** + * enum nft_secmark_attributes - nf_tables secmark object netlink attributes + * + * @NFTA_SECMARK_CTX: security context (NLA_STRING) + */ +enum nft_secmark_attributes { + NFTA_SECMARK_UNSPEC, + 
NFTA_SECMARK_CTX, + __NFTA_SECMARK_MAX, +}; +#define NFTA_SECMARK_MAX (__NFTA_SECMARK_MAX - 1) + +/* Max security context length */ +#define NFT_SECMARK_CTX_MAXLEN 256 + +/** + * enum nft_reject_types - nf_tables reject expression reject types + * + * @NFT_REJECT_ICMP_UNREACH: reject using ICMP unreachable + * @NFT_REJECT_TCP_RST: reject using TCP RST + * @NFT_REJECT_ICMPX_UNREACH: abstracted ICMP unreachable for bridge and inet + */ +enum nft_reject_types { + NFT_REJECT_ICMP_UNREACH, + NFT_REJECT_TCP_RST, + NFT_REJECT_ICMPX_UNREACH, +}; + +/** + * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6 + * + * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable + * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable + * @NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable + * @NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratively prohibited + * + * These codes are mapped to real ICMP and ICMPv6 codes. + */ +enum nft_reject_inet_code { + NFT_REJECT_ICMPX_NO_ROUTE = 0, + NFT_REJECT_ICMPX_PORT_UNREACH, + NFT_REJECT_ICMPX_HOST_UNREACH, + NFT_REJECT_ICMPX_ADMIN_PROHIBITED, + __NFT_REJECT_ICMPX_MAX +}; +#define NFT_REJECT_ICMPX_MAX (__NFT_REJECT_ICMPX_MAX - 1) + +/** + * enum nft_reject_attributes - nf_tables reject expression netlink attributes + * + * @NFTA_REJECT_TYPE: packet type to use (NLA_U32: nft_reject_types) + * @NFTA_REJECT_ICMP_CODE: ICMP code to use (NLA_U8) + */ +enum nft_reject_attributes { + NFTA_REJECT_UNSPEC, + NFTA_REJECT_TYPE, + NFTA_REJECT_ICMP_CODE, + __NFTA_REJECT_MAX +}; +#define NFTA_REJECT_MAX (__NFTA_REJECT_MAX - 1) + +/** + * enum nft_nat_types - nf_tables nat expression NAT types + * + * @NFT_NAT_SNAT: source NAT + * @NFT_NAT_DNAT: destination NAT + */ +enum nft_nat_types { + NFT_NAT_SNAT, + NFT_NAT_DNAT, +}; + +/** + * enum nft_nat_attributes - nf_tables nat expression netlink attributes + * + * @NFTA_NAT_TYPE: NAT type (NLA_U32: nft_nat_types) + * @NFTA_NAT_FAMILY: NAT family (NLA_U32) + * @NFTA_NAT_REG_ADDR_MIN: source register of 
address range start (NLA_U32: nft_registers) + * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers) + * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_NAT_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_nat_attributes { + NFTA_NAT_UNSPEC, + NFTA_NAT_TYPE, + NFTA_NAT_FAMILY, + NFTA_NAT_REG_ADDR_MIN, + NFTA_NAT_REG_ADDR_MAX, + NFTA_NAT_REG_PROTO_MIN, + NFTA_NAT_REG_PROTO_MAX, + NFTA_NAT_FLAGS, + __NFTA_NAT_MAX +}; +#define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) + +/** + * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes + * + * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32) + * @NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) + * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) + */ +enum nft_tproxy_attributes { + NFTA_TPROXY_UNSPEC, + NFTA_TPROXY_FAMILY, + NFTA_TPROXY_REG_ADDR, + NFTA_TPROXY_REG_PORT, + __NFTA_TPROXY_MAX +}; +#define NFTA_TPROXY_MAX (__NFTA_TPROXY_MAX - 1) + +/** + * enum nft_masq_attributes - nf_tables masquerade expression attributes + * + * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + * @NFTA_MASQ_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_MASQ_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + */ +enum nft_masq_attributes { + NFTA_MASQ_UNSPEC, + NFTA_MASQ_FLAGS, + NFTA_MASQ_REG_PROTO_MIN, + NFTA_MASQ_REG_PROTO_MAX, + __NFTA_MASQ_MAX +}; +#define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) + +/** + * enum nft_redir_attributes - nf_tables redirect expression netlink attributes + * + * @NFTA_REDIR_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_REDIR_REG_PROTO_MAX: source register of proto range end (NLA_U32: 
nft_registers) + * @NFTA_REDIR_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_redir_attributes { + NFTA_REDIR_UNSPEC, + NFTA_REDIR_REG_PROTO_MIN, + NFTA_REDIR_REG_PROTO_MAX, + NFTA_REDIR_FLAGS, + __NFTA_REDIR_MAX +}; +#define NFTA_REDIR_MAX (__NFTA_REDIR_MAX - 1) + +/** + * enum nft_dup_attributes - nf_tables dup expression netlink attributes + * + * @NFTA_DUP_SREG_ADDR: source register of address (NLA_U32: nft_registers) + * @NFTA_DUP_SREG_DEV: source register of output interface (NLA_U32: nft_registers) + */ +enum nft_dup_attributes { + NFTA_DUP_UNSPEC, + NFTA_DUP_SREG_ADDR, + NFTA_DUP_SREG_DEV, + __NFTA_DUP_MAX +}; +#define NFTA_DUP_MAX (__NFTA_DUP_MAX - 1) + +/** + * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes + * + * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_registers) + * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_registers) + * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto) + */ +enum nft_fwd_attributes { + NFTA_FWD_UNSPEC, + NFTA_FWD_SREG_DEV, + NFTA_FWD_SREG_ADDR, + NFTA_FWD_NFPROTO, + __NFTA_FWD_MAX +}; +#define NFTA_FWD_MAX (__NFTA_FWD_MAX - 1) + +/** + * enum nft_objref_attributes - nf_tables stateful object expression netlink attributes + * + * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32) + * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING) + * @NFTA_OBJREF_SET_SREG: source register of the data to look for (NLA_U32: nft_registers) + * @NFTA_OBJREF_SET_NAME: name of the set where to look for (NLA_STRING) + * @NFTA_OBJREF_SET_ID: id of the set where to look for in this transaction (NLA_U32) + */ +enum nft_objref_attributes { + NFTA_OBJREF_UNSPEC, + NFTA_OBJREF_IMM_TYPE, + NFTA_OBJREF_IMM_NAME, + NFTA_OBJREF_SET_SREG, + NFTA_OBJREF_SET_NAME, + NFTA_OBJREF_SET_ID, + __NFTA_OBJREF_MAX +}; +#define NFTA_OBJREF_MAX (__NFTA_OBJREF_MAX - 1) 
+ +/** + * enum nft_gen_attributes - nf_tables ruleset generation attributes + * + * @NFTA_GEN_ID: Ruleset generation ID (NLA_U32) + */ +enum nft_gen_attributes { + NFTA_GEN_UNSPEC, + NFTA_GEN_ID, + NFTA_GEN_PROC_PID, + NFTA_GEN_PROC_NAME, + __NFTA_GEN_MAX +}; +#define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) + +/** + * enum nft_fib_attributes - nf_tables fib expression netlink attributes + * + * @NFTA_FIB_DREG: destination register (NLA_U32) + * @NFTA_FIB_RESULT: desired result (NLA_U32) + * @NFTA_FIB_FLAGS: flowi fields to initialize when querying the FIB (NLA_U32) + * + * The FIB expression performs a route lookup according + * to the packet data. + */ +enum nft_fib_attributes { + NFTA_FIB_UNSPEC, + NFTA_FIB_DREG, + NFTA_FIB_RESULT, + NFTA_FIB_FLAGS, + __NFTA_FIB_MAX +}; +#define NFTA_FIB_MAX (__NFTA_FIB_MAX - 1) + +enum nft_fib_result { + NFT_FIB_RESULT_UNSPEC, + NFT_FIB_RESULT_OIF, + NFT_FIB_RESULT_OIFNAME, + NFT_FIB_RESULT_ADDRTYPE, + __NFT_FIB_RESULT_MAX +}; +#define NFT_FIB_RESULT_MAX (__NFT_FIB_RESULT_MAX - 1) + +enum nft_fib_flags { + NFTA_FIB_F_SADDR = 1 << 0, /* look up src */ + NFTA_FIB_F_DADDR = 1 << 1, /* look up dst */ + NFTA_FIB_F_MARK = 1 << 2, /* use skb->mark */ + NFTA_FIB_F_IIF = 1 << 3, /* restrict to iif */ + NFTA_FIB_F_OIF = 1 << 4, /* restrict to oif */ + NFTA_FIB_F_PRESENT = 1 << 5, /* check existence only */ +}; + +enum nft_ct_helper_attributes { + NFTA_CT_HELPER_UNSPEC, + NFTA_CT_HELPER_NAME, + NFTA_CT_HELPER_L3PROTO, + NFTA_CT_HELPER_L4PROTO, + __NFTA_CT_HELPER_MAX, +}; +#define NFTA_CT_HELPER_MAX (__NFTA_CT_HELPER_MAX - 1) + +enum nft_ct_timeout_timeout_attributes { + NFTA_CT_TIMEOUT_UNSPEC, + NFTA_CT_TIMEOUT_L3PROTO, + NFTA_CT_TIMEOUT_L4PROTO, + NFTA_CT_TIMEOUT_DATA, + __NFTA_CT_TIMEOUT_MAX, +}; +#define NFTA_CT_TIMEOUT_MAX (__NFTA_CT_TIMEOUT_MAX - 1) + +enum nft_ct_expectation_attributes { + NFTA_CT_EXPECT_UNSPEC, + NFTA_CT_EXPECT_L3PROTO, + NFTA_CT_EXPECT_L4PROTO, + NFTA_CT_EXPECT_DPORT, + NFTA_CT_EXPECT_TIMEOUT, + NFTA_CT_EXPECT_SIZE, + 
__NFTA_CT_EXPECT_MAX, +}; +#define NFTA_CT_EXPECT_MAX (__NFTA_CT_EXPECT_MAX - 1) + +#define NFT_OBJECT_UNSPEC 0 +#define NFT_OBJECT_COUNTER 1 +#define NFT_OBJECT_QUOTA 2 +#define NFT_OBJECT_CT_HELPER 3 +#define NFT_OBJECT_LIMIT 4 +#define NFT_OBJECT_CONNLIMIT 5 +#define NFT_OBJECT_TUNNEL 6 +#define NFT_OBJECT_CT_TIMEOUT 7 +#define NFT_OBJECT_SECMARK 8 +#define NFT_OBJECT_CT_EXPECT 9 +#define NFT_OBJECT_SYNPROXY 10 +#define __NFT_OBJECT_MAX 11 +#define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) + +/** + * enum nft_object_attributes - nf_tables stateful object netlink attributes + * + * @NFTA_OBJ_TABLE: name of the table containing the object (NLA_STRING) + * @NFTA_OBJ_NAME: name of this object (NLA_STRING) + * @NFTA_OBJ_TYPE: stateful object type (NLA_U32) + * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) + * @NFTA_OBJ_USE: number of references to this object (NLA_U32) + * @NFTA_OBJ_HANDLE: object handle (NLA_U64) + * @NFTA_OBJ_USERDATA: user data (NLA_BINARY) + */ +enum nft_object_attributes { + NFTA_OBJ_UNSPEC, + NFTA_OBJ_TABLE, + NFTA_OBJ_NAME, + NFTA_OBJ_TYPE, + NFTA_OBJ_DATA, + NFTA_OBJ_USE, + NFTA_OBJ_HANDLE, + NFTA_OBJ_PAD, + NFTA_OBJ_USERDATA, + __NFTA_OBJ_MAX +}; +#define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) + +/** + * enum nft_flowtable_flags - nf_tables flowtable flags + * + * @NFT_FLOWTABLE_HW_OFFLOAD: flowtable hardware offload is enabled + * @NFT_FLOWTABLE_COUNTER: enable flow counters + */ +enum nft_flowtable_flags { + NFT_FLOWTABLE_HW_OFFLOAD = 0x1, + NFT_FLOWTABLE_COUNTER = 0x2, + NFT_FLOWTABLE_MASK = (NFT_FLOWTABLE_HW_OFFLOAD | + NFT_FLOWTABLE_COUNTER) +}; + +/** + * enum nft_flowtable_attributes - nf_tables flow table netlink attributes + * + * @NFTA_FLOWTABLE_TABLE: name of the table containing the flow table (NLA_STRING) + * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING) + * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration (NLA_NESTED) + * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) + * 
@NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32) + */ +enum nft_flowtable_attributes { + NFTA_FLOWTABLE_UNSPEC, + NFTA_FLOWTABLE_TABLE, + NFTA_FLOWTABLE_NAME, + NFTA_FLOWTABLE_HOOK, + NFTA_FLOWTABLE_USE, + NFTA_FLOWTABLE_HANDLE, + NFTA_FLOWTABLE_PAD, + NFTA_FLOWTABLE_FLAGS, + __NFTA_FLOWTABLE_MAX +}; +#define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) + +/** + * enum nft_flowtable_hook_attributes - nf_tables flow table hook netlink attributes + * + * @NFTA_FLOWTABLE_HOOK_NUM: netfilter hook number (NLA_U32) + * @NFTA_FLOWTABLE_HOOK_PRIORITY: netfilter hook priority (NLA_U32) + * @NFTA_FLOWTABLE_HOOK_DEVS: input devices this flow table is bound to (NLA_NESTED) + */ +enum nft_flowtable_hook_attributes { + NFTA_FLOWTABLE_HOOK_UNSPEC, + NFTA_FLOWTABLE_HOOK_NUM, + NFTA_FLOWTABLE_HOOK_PRIORITY, + NFTA_FLOWTABLE_HOOK_DEVS, + __NFTA_FLOWTABLE_HOOK_MAX +}; +#define NFTA_FLOWTABLE_HOOK_MAX (__NFTA_FLOWTABLE_HOOK_MAX - 1) + +/** + * enum nft_osf_attributes - nftables osf expression netlink attributes + * + * @NFTA_OSF_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_OSF_TTL: Value of the TTL osf option (NLA_U8) + * @NFTA_OSF_FLAGS: flags (NLA_U32) + */ +enum nft_osf_attributes { + NFTA_OSF_UNSPEC, + NFTA_OSF_DREG, + NFTA_OSF_TTL, + NFTA_OSF_FLAGS, + __NFTA_OSF_MAX, +}; +#define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1) + +enum nft_osf_flags { + NFT_OSF_F_VERSION = (1 << 0), +}; + +/** + * enum nft_synproxy_attributes - nf_tables synproxy expression netlink attributes + * + * @NFTA_SYNPROXY_MSS: mss value sent to the backend (NLA_U16) + * @NFTA_SYNPROXY_WSCALE: wscale value sent to the backend (NLA_U8) + * @NFTA_SYNPROXY_FLAGS: flags (NLA_U32) + */ +enum nft_synproxy_attributes { + NFTA_SYNPROXY_UNSPEC, + NFTA_SYNPROXY_MSS, + NFTA_SYNPROXY_WSCALE, + NFTA_SYNPROXY_FLAGS, + __NFTA_SYNPROXY_MAX, +}; +#define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) + +/** + * enum nft_devices_attributes - nf_tables device netlink attributes + *
+ * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + */ +enum nft_devices_attributes { + NFTA_DEVICE_UNSPEC, + NFTA_DEVICE_NAME, + __NFTA_DEVICE_MAX +}; +#define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) + +/* + * enum nft_xfrm_attributes - nf_tables xfrm expr netlink attributes + * + * @NFTA_XFRM_DREG: destination register (NLA_U32) + * @NFTA_XFRM_KEY: enum nft_xfrm_keys (NLA_U32) + * @NFTA_XFRM_DIR: direction (NLA_U8) + * @NFTA_XFRM_SPNUM: index in secpath array (NLA_U32) + */ +enum nft_xfrm_attributes { + NFTA_XFRM_UNSPEC, + NFTA_XFRM_DREG, + NFTA_XFRM_KEY, + NFTA_XFRM_DIR, + NFTA_XFRM_SPNUM, + __NFTA_XFRM_MAX +}; +#define NFTA_XFRM_MAX (__NFTA_XFRM_MAX - 1) + +enum nft_xfrm_keys { + NFT_XFRM_KEY_UNSPEC, + NFT_XFRM_KEY_DADDR_IP4, + NFT_XFRM_KEY_DADDR_IP6, + NFT_XFRM_KEY_SADDR_IP4, + NFT_XFRM_KEY_SADDR_IP6, + NFT_XFRM_KEY_REQID, + NFT_XFRM_KEY_SPI, + __NFT_XFRM_KEY_MAX, +}; +#define NFT_XFRM_KEY_MAX (__NFT_XFRM_KEY_MAX - 1) + +/** + * enum nft_trace_attributes - nf_tables trace netlink attributes + * + * @NFTA_TRACE_TABLE: name of the table (NLA_STRING) + * @NFTA_TRACE_CHAIN: name of the chain (NLA_STRING) + * @NFTA_TRACE_RULE_HANDLE: numeric handle of the rule (NLA_U64) + * @NFTA_TRACE_TYPE: type of the event (NLA_U32: nft_trace_types) + * @NFTA_TRACE_VERDICT: verdict returned by hook (NLA_NESTED: nft_verdicts) + * @NFTA_TRACE_ID: pseudo-id, same for each skb traced (NLA_U32) + * @NFTA_TRACE_LL_HEADER: linklayer header (NLA_BINARY) + * @NFTA_TRACE_NETWORK_HEADER: network header (NLA_BINARY) + * @NFTA_TRACE_TRANSPORT_HEADER: transport header (NLA_BINARY) + * @NFTA_TRACE_IIF: indev ifindex (NLA_U32) + * @NFTA_TRACE_IIFTYPE: netdev->type of indev (NLA_U16) + * @NFTA_TRACE_OIF: outdev ifindex (NLA_U32) + * @NFTA_TRACE_OIFTYPE: netdev->type of outdev (NLA_U16) + * @NFTA_TRACE_MARK: nfmark (NLA_U32) + * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) + * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + */ +enum nft_trace_attributes { + 
NFTA_TRACE_UNSPEC, + NFTA_TRACE_TABLE, + NFTA_TRACE_CHAIN, + NFTA_TRACE_RULE_HANDLE, + NFTA_TRACE_TYPE, + NFTA_TRACE_VERDICT, + NFTA_TRACE_ID, + NFTA_TRACE_LL_HEADER, + NFTA_TRACE_NETWORK_HEADER, + NFTA_TRACE_TRANSPORT_HEADER, + NFTA_TRACE_IIF, + NFTA_TRACE_IIFTYPE, + NFTA_TRACE_OIF, + NFTA_TRACE_OIFTYPE, + NFTA_TRACE_MARK, + NFTA_TRACE_NFPROTO, + NFTA_TRACE_POLICY, + NFTA_TRACE_PAD, + __NFTA_TRACE_MAX +}; +#define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) + +enum nft_trace_types { + NFT_TRACETYPE_UNSPEC, + NFT_TRACETYPE_POLICY, + NFT_TRACETYPE_RETURN, + NFT_TRACETYPE_RULE, + __NFT_TRACETYPE_MAX +}; +#define NFT_TRACETYPE_MAX (__NFT_TRACETYPE_MAX - 1) + +/** + * enum nft_ng_attributes - nf_tables number generator expression netlink attributes + * + * @NFTA_NG_DREG: destination register (NLA_U32) + * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) + * @NFTA_NG_TYPE: operation type (NLA_U32) + * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32) + * @NFTA_NG_SET_NAME: name of the map to lookup (NLA_STRING) + * @NFTA_NG_SET_ID: id of the map (NLA_U32) + */ +enum nft_ng_attributes { + NFTA_NG_UNSPEC, + NFTA_NG_DREG, + NFTA_NG_MODULUS, + NFTA_NG_TYPE, + NFTA_NG_OFFSET, + NFTA_NG_SET_NAME, /* deprecated */ + NFTA_NG_SET_ID, /* deprecated */ + __NFTA_NG_MAX +}; +#define NFTA_NG_MAX (__NFTA_NG_MAX - 1) + +enum nft_ng_types { + NFT_NG_INCREMENTAL, + NFT_NG_RANDOM, + __NFT_NG_MAX +}; +#define NFT_NG_MAX (__NFT_NG_MAX - 1) + +enum nft_tunnel_key_ip_attributes { + NFTA_TUNNEL_KEY_IP_UNSPEC, + NFTA_TUNNEL_KEY_IP_SRC, + NFTA_TUNNEL_KEY_IP_DST, + __NFTA_TUNNEL_KEY_IP_MAX +}; +#define NFTA_TUNNEL_KEY_IP_MAX (__NFTA_TUNNEL_KEY_IP_MAX - 1) + +enum nft_tunnel_ip6_attributes { + NFTA_TUNNEL_KEY_IP6_UNSPEC, + NFTA_TUNNEL_KEY_IP6_SRC, + NFTA_TUNNEL_KEY_IP6_DST, + NFTA_TUNNEL_KEY_IP6_FLOWLABEL, + __NFTA_TUNNEL_KEY_IP6_MAX +}; +#define NFTA_TUNNEL_KEY_IP6_MAX (__NFTA_TUNNEL_KEY_IP6_MAX - 1) + +enum nft_tunnel_opts_attributes { + NFTA_TUNNEL_KEY_OPTS_UNSPEC, + 
NFTA_TUNNEL_KEY_OPTS_VXLAN, + NFTA_TUNNEL_KEY_OPTS_ERSPAN, + NFTA_TUNNEL_KEY_OPTS_GENEVE, + __NFTA_TUNNEL_KEY_OPTS_MAX +}; +#define NFTA_TUNNEL_KEY_OPTS_MAX (__NFTA_TUNNEL_KEY_OPTS_MAX - 1) + +enum nft_tunnel_opts_vxlan_attributes { + NFTA_TUNNEL_KEY_VXLAN_UNSPEC, + NFTA_TUNNEL_KEY_VXLAN_GBP, + __NFTA_TUNNEL_KEY_VXLAN_MAX +}; +#define NFTA_TUNNEL_KEY_VXLAN_MAX (__NFTA_TUNNEL_KEY_VXLAN_MAX - 1) + +enum nft_tunnel_opts_erspan_attributes { + NFTA_TUNNEL_KEY_ERSPAN_UNSPEC, + NFTA_TUNNEL_KEY_ERSPAN_VERSION, + NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, + NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, + NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, + __NFTA_TUNNEL_KEY_ERSPAN_MAX +}; +#define NFTA_TUNNEL_KEY_ERSPAN_MAX (__NFTA_TUNNEL_KEY_ERSPAN_MAX - 1) + +enum nft_tunnel_opts_geneve_attributes { + NFTA_TUNNEL_KEY_GENEVE_UNSPEC, + NFTA_TUNNEL_KEY_GENEVE_CLASS, + NFTA_TUNNEL_KEY_GENEVE_TYPE, + NFTA_TUNNEL_KEY_GENEVE_DATA, + __NFTA_TUNNEL_KEY_GENEVE_MAX +}; +#define NFTA_TUNNEL_KEY_GENEVE_MAX (__NFTA_TUNNEL_KEY_GENEVE_MAX - 1) + +enum nft_tunnel_flags { + NFT_TUNNEL_F_ZERO_CSUM_TX = (1 << 0), + NFT_TUNNEL_F_DONT_FRAGMENT = (1 << 1), + NFT_TUNNEL_F_SEQ_NUMBER = (1 << 2), +}; +#define NFT_TUNNEL_F_MASK (NFT_TUNNEL_F_ZERO_CSUM_TX | \ + NFT_TUNNEL_F_DONT_FRAGMENT | \ + NFT_TUNNEL_F_SEQ_NUMBER) + +enum nft_tunnel_key_attributes { + NFTA_TUNNEL_KEY_UNSPEC, + NFTA_TUNNEL_KEY_ID, + NFTA_TUNNEL_KEY_IP, + NFTA_TUNNEL_KEY_IP6, + NFTA_TUNNEL_KEY_FLAGS, + NFTA_TUNNEL_KEY_TOS, + NFTA_TUNNEL_KEY_TTL, + NFTA_TUNNEL_KEY_SPORT, + NFTA_TUNNEL_KEY_DPORT, + NFTA_TUNNEL_KEY_OPTS, + __NFTA_TUNNEL_KEY_MAX +}; +#define NFTA_TUNNEL_KEY_MAX (__NFTA_TUNNEL_KEY_MAX - 1) + +enum nft_tunnel_keys { + NFT_TUNNEL_PATH, + NFT_TUNNEL_ID, + __NFT_TUNNEL_MAX +}; +#define NFT_TUNNEL_MAX (__NFT_TUNNEL_MAX - 1) + +enum nft_tunnel_mode { + NFT_TUNNEL_MODE_NONE, + NFT_TUNNEL_MODE_RX, + NFT_TUNNEL_MODE_TX, + __NFT_TUNNEL_MODE_MAX +}; +#define NFT_TUNNEL_MODE_MAX (__NFT_TUNNEL_MODE_MAX - 1) + +enum nft_tunnel_attributes { + NFTA_TUNNEL_UNSPEC, + 
NFTA_TUNNEL_KEY, + NFTA_TUNNEL_DREG, + NFTA_TUNNEL_MODE, + __NFTA_TUNNEL_MAX +}; +#define NFTA_TUNNEL_MAX (__NFTA_TUNNEL_MAX - 1) + +#endif /* _LINUX_NF_TABLES_H */ diff --git a/src/basic/linux/netfilter/nfnetlink.h b/src/basic/linux/netfilter/nfnetlink.h new file mode 100644 index 0000000..6cd58cd --- /dev/null +++ b/src/basic/linux/netfilter/nfnetlink.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_NFNETLINK_H +#define _UAPI_NFNETLINK_H +#include <linux/types.h> +#include <linux/netfilter/nfnetlink_compat.h> + +enum nfnetlink_groups { + NFNLGRP_NONE, +#define NFNLGRP_NONE NFNLGRP_NONE + NFNLGRP_CONNTRACK_NEW, +#define NFNLGRP_CONNTRACK_NEW NFNLGRP_CONNTRACK_NEW + NFNLGRP_CONNTRACK_UPDATE, +#define NFNLGRP_CONNTRACK_UPDATE NFNLGRP_CONNTRACK_UPDATE + NFNLGRP_CONNTRACK_DESTROY, +#define NFNLGRP_CONNTRACK_DESTROY NFNLGRP_CONNTRACK_DESTROY + NFNLGRP_CONNTRACK_EXP_NEW, +#define NFNLGRP_CONNTRACK_EXP_NEW NFNLGRP_CONNTRACK_EXP_NEW + NFNLGRP_CONNTRACK_EXP_UPDATE, +#define NFNLGRP_CONNTRACK_EXP_UPDATE NFNLGRP_CONNTRACK_EXP_UPDATE + NFNLGRP_CONNTRACK_EXP_DESTROY, +#define NFNLGRP_CONNTRACK_EXP_DESTROY NFNLGRP_CONNTRACK_EXP_DESTROY + NFNLGRP_NFTABLES, +#define NFNLGRP_NFTABLES NFNLGRP_NFTABLES + NFNLGRP_ACCT_QUOTA, +#define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA + NFNLGRP_NFTRACE, +#define NFNLGRP_NFTRACE NFNLGRP_NFTRACE + __NFNLGRP_MAX, +}; +#define NFNLGRP_MAX (__NFNLGRP_MAX - 1) + +/* General form of address family dependent message. + */ +struct nfgenmsg { + __u8 nfgen_family; /* AF_xxx */ + __u8 version; /* nfnetlink version */ + __be16 res_id; /* resource id */ +}; + +#define NFNETLINK_V0 0 + +/* netfilter netlink message types are split in two pieces: + * 8 bit subsystem, 8bit operation.
+ */ + +#define NFNL_SUBSYS_ID(x) (((x) & 0xff00) >> 8) /* upper byte of the 16-bit type */ +#define NFNL_MSG_TYPE(x) ((x) & 0x00ff) /* lower byte of the 16-bit type */ + +/* No enum here, otherwise __stringify() trick of MODULE_ALIAS_NFNL_SUBSYS() + * won't work anymore */ +#define NFNL_SUBSYS_NONE 0 +#define NFNL_SUBSYS_CTNETLINK 1 +#define NFNL_SUBSYS_CTNETLINK_EXP 2 +#define NFNL_SUBSYS_QUEUE 3 +#define NFNL_SUBSYS_ULOG 4 +#define NFNL_SUBSYS_OSF 5 +#define NFNL_SUBSYS_IPSET 6 +#define NFNL_SUBSYS_ACCT 7 +#define NFNL_SUBSYS_CTNETLINK_TIMEOUT 8 +#define NFNL_SUBSYS_CTHELPER 9 +#define NFNL_SUBSYS_NFTABLES 10 +#define NFNL_SUBSYS_NFT_COMPAT 11 +#define NFNL_SUBSYS_HOOK 12 +#define NFNL_SUBSYS_COUNT 13 + +/* Reserved control nfnetlink messages */ +#define NFNL_MSG_BATCH_BEGIN NLMSG_MIN_TYPE +#define NFNL_MSG_BATCH_END (NLMSG_MIN_TYPE+1) + +/** + * enum nfnl_batch_attributes - nfnetlink batch netlink attributes + * + * @NFNL_BATCH_GENID: generation ID for this changeset (NLA_U32) + */ +enum nfnl_batch_attributes { + NFNL_BATCH_UNSPEC, + NFNL_BATCH_GENID, + __NFNL_BATCH_MAX +}; +#define NFNL_BATCH_MAX (__NFNL_BATCH_MAX - 1) + +#endif /* _UAPI_NFNETLINK_H */ diff --git a/src/basic/linux/netlink.h b/src/basic/linux/netlink.h new file mode 100644 index 0000000..e2ae82e --- /dev/null +++ b/src/basic/linux/netlink.h @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETLINK_H +#define _UAPI__LINUX_NETLINK_H + +#include <linux/const.h> +#include <linux/socket.h> /* for __kernel_sa_family_t */ +#include <linux/types.h> + +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* Unused number */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define
NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 +#define NETLINK_DNRTMSG 14 /* DECnet routing messages (obsolete) */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 +/* leave room for NETLINK_DM (DM Events) */ +#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */ +#define NETLINK_ECRYPTFS 19 +#define NETLINK_RDMA 20 +#define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ + +#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +#define MAX_LINKS 32 + +struct sockaddr_nl { + __kernel_sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __u32 nl_pid; /* port ID */ + __u32 nl_groups; /* multicast groups mask */ +}; + +/** + * struct nlmsghdr - fixed format metadata header of Netlink messages + * @nlmsg_len: Length of message including header + * @nlmsg_type: Message content type + * @nlmsg_flags: Additional flags + * @nlmsg_seq: Sequence number + * @nlmsg_pid: Sending process port ID + */ +struct nlmsghdr { + __u32 nlmsg_len; + __u16 nlmsg_type; + __u16 nlmsg_flags; + __u32 nlmsg_seq; + __u32 nlmsg_pid; +}; + +/* Flags values */ + +#define NLM_F_REQUEST 0x01 /* It is request message. 
*/ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Receive resulting notifications */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ +#define NLM_F_BULK 0x200 /* Delete multiple objects */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void *)(((char *)nlh) + NLMSG_HDRLEN)) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr *)(((char *)(nlh)) + \ + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) +#define NLMSG_PAYLOAD(nlh,len) 
((nlh)->nlmsg_len - NLMSG_SPACE((len))) + +#define NLMSG_NOOP 0x1 /* Nothing. */ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +struct nlmsgerr { + int error; + struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @NLMSGERR_ATTR_POLICY: policy for a rejected attribute + * @NLMSGERR_ATTR_MISS_TYPE: type of a missing required attribute, + * %NLMSGERR_ATTR_MISS_NEST will not be present if the attribute was + * missing at the message level + * @NLMSGERR_ATTR_MISS_NEST: offset of the nest where attribute was missing + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + NLMSGERR_ATTR_POLICY, + NLMSGERR_ATTR_MISS_TYPE, + NLMSGERR_ATTR_MISS_NEST, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#endif +#define NETLINK_LISTEN_ALL_NSID 
8 +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 +#define NETLINK_GET_STRICT_CHK 12 + +struct nl_pktinfo { + __u32 group; +}; + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +#ifndef __KERNEL__ +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +enum { + NETLINK_UNCONNECTED = 0, + NETLINK_CONNECTED, +}; + +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr { + __u16 nla_len; + __u16 nla_type; +}; + +/* + * nla_type (16 bits) + * +---+---+-------------------------------+ + * | N | O | Attribute Type | + * +---+---+-------------------------------+ + * N := Carries nested attributes + * O := Payload stored in network byte order + * + * Note: The N and O flag are mutually exclusive. + */ +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +/* Generic 32 bitflags attribute content sent to the kernel. 
+ * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; + +/* + * policy descriptions - it's specific to each family how this is used + * Normally, it should be retrieved via a dump inside another attribute + * specifying where it applies. + */ + +/** + * enum netlink_attribute_type - type of an attribute + * @NL_ATTR_TYPE_INVALID: unused + * @NL_ATTR_TYPE_FLAG: flag attribute (present/not present) + * @NL_ATTR_TYPE_U8: 8-bit unsigned attribute + * @NL_ATTR_TYPE_U16: 16-bit unsigned attribute + * @NL_ATTR_TYPE_U32: 32-bit unsigned attribute + * @NL_ATTR_TYPE_U64: 64-bit unsigned attribute + * @NL_ATTR_TYPE_S8: 8-bit signed attribute + * @NL_ATTR_TYPE_S16: 16-bit signed attribute + * @NL_ATTR_TYPE_S32: 32-bit signed attribute + * @NL_ATTR_TYPE_S64: 64-bit signed attribute + * @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified + * @NL_ATTR_TYPE_STRING: string, min/max length may be specified + * @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string, + * min/max length may be specified + * @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute + * consists of sub-attributes. The nested policy and maxtype + * inside may be specified. + * @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this + * attribute contains sub-attributes whose type is irrelevant + * (just used to separate the array entries) and each such array + * entry has attributes again, the policy for those inner ones + * and the corresponding maxtype may be specified. 
+ * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + */ +enum netlink_attribute_type { + NL_ATTR_TYPE_INVALID, + + NL_ATTR_TYPE_FLAG, + + NL_ATTR_TYPE_U8, + NL_ATTR_TYPE_U16, + NL_ATTR_TYPE_U32, + NL_ATTR_TYPE_U64, + + NL_ATTR_TYPE_S8, + NL_ATTR_TYPE_S16, + NL_ATTR_TYPE_S32, + NL_ATTR_TYPE_S64, + + NL_ATTR_TYPE_BINARY, + NL_ATTR_TYPE_STRING, + NL_ATTR_TYPE_NUL_STRING, + + NL_ATTR_TYPE_NESTED, + NL_ATTR_TYPE_NESTED_ARRAY, + + NL_ATTR_TYPE_BITFIELD32, +}; + +/** + * enum netlink_policy_type_attr - policy type attributes + * @NL_POLICY_TYPE_ATTR_UNSPEC: unused + * @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute, + * &enum netlink_attribute_type (U32) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary + * attributes, no minimum if not given (U32) + * @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary + * attributes, no maximum if not given (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and + * nested array types (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy + * attribute for nested and nested array types, this can + * in theory be < the size of the policy pointed to by + * the index, if limited inside the nesting (U32) + * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the + * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) + * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + * + * @__NL_POLICY_TYPE_ATTR_MAX: number of attributes + * @NL_POLICY_TYPE_ATTR_MAX: highest attribute number + */ +enum netlink_policy_type_attr { + NL_POLICY_TYPE_ATTR_UNSPEC, + NL_POLICY_TYPE_ATTR_TYPE, + 
NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + NL_POLICY_TYPE_ATTR_MIN_LENGTH, + NL_POLICY_TYPE_ATTR_MAX_LENGTH, + NL_POLICY_TYPE_ATTR_POLICY_IDX, + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + NL_POLICY_TYPE_ATTR_PAD, + NL_POLICY_TYPE_ATTR_MASK, + + /* keep last */ + __NL_POLICY_TYPE_ATTR_MAX, + NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1 +}; + +#endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/src/basic/linux/nexthop.h b/src/basic/linux/nexthop.h new file mode 100644 index 0000000..d8ffa8c --- /dev/null +++ b/src/basic/linux/nexthop.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_NEXTHOP_H +#define _UAPI_LINUX_NEXTHOP_H + +#include <linux/types.h> + +struct nhmsg { + unsigned char nh_family; + unsigned char nh_scope; /* return only */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F flags */ +}; + +/* entry in a nexthop group */ +struct nexthop_grp { + __u32 id; /* nexthop id - must exist */ + __u8 weight; /* weight of this nexthop */ + __u8 resvd1; + __u16 resvd2; +}; + +enum { + NEXTHOP_GRP_TYPE_MPATH, /* hash-threshold nexthop group + * default type if not specified + */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ + __NEXTHOP_GRP_TYPE_MAX, +}; + +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + +enum { + NHA_UNSPEC, + NHA_ID, /* u32; id for nexthop.
id == 0 means auto-assign */ + + NHA_GROUP, /* array of nexthop_grp */ + NHA_GROUP_TYPE, /* u16 one of NEXTHOP_GRP_TYPE */ + /* if NHA_GROUP attribute is added, no other attributes can be set */ + + NHA_BLACKHOLE, /* flag; nexthop used to blackhole packets */ + /* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */ + + NHA_OIF, /* u32; nexthop device */ + NHA_GATEWAY, /* be32 (IPv4) or in6_addr (IPv6) gw address */ + NHA_ENCAP_TYPE, /* u16; lwt encap type */ + NHA_ENCAP, /* lwt encap data */ + + /* NHA_OIF can be appended to dump request to return only + * nexthops using given device + */ + NHA_GROUPS, /* flag; only return nexthop groups in dump */ + NHA_MASTER, /* u32; only return nexthops with given master dev */ + + NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ + /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + + /* nested; resilient nexthop group attributes */ + NHA_RES_GROUP, + /* nested; nexthop bucket attributes */ + NHA_RES_BUCKET, + + __NHA_MAX, +}; + +#define NHA_MAX (__NHA_MAX - 1) + +enum { + NHA_RES_GROUP_UNSPEC, + /* Pad attribute for 64-bit alignment. */ + NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, + + /* u16; number of nexthop buckets in a resilient nexthop group */ + NHA_RES_GROUP_BUCKETS, + /* clock_t as u32; nexthop bucket idle timer (per-group) */ + NHA_RES_GROUP_IDLE_TIMER, + /* clock_t as u32; nexthop unbalanced timer */ + NHA_RES_GROUP_UNBALANCED_TIMER, + /* clock_t as u64; nexthop unbalanced time */ + NHA_RES_GROUP_UNBALANCED_TIME, + + __NHA_RES_GROUP_MAX, +}; + +#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1) + +enum { + NHA_RES_BUCKET_UNSPEC, + /* Pad attribute for 64-bit alignment. 
*/ + NHA_RES_BUCKET_PAD = NHA_RES_BUCKET_UNSPEC, + + /* u16; nexthop bucket index */ + NHA_RES_BUCKET_INDEX, + /* clock_t as u64; nexthop bucket idle time */ + NHA_RES_BUCKET_IDLE_TIME, + /* u32; nexthop id assigned to the nexthop bucket */ + NHA_RES_BUCKET_NH_ID, + + __NHA_RES_BUCKET_MAX, +}; + +#define NHA_RES_BUCKET_MAX (__NHA_RES_BUCKET_MAX - 1) + +#endif diff --git a/src/basic/linux/nl80211.h b/src/basic/linux/nl80211.h new file mode 100644 index 0000000..c14a91b --- /dev/null +++ b/src/basic/linux/nl80211.h @@ -0,0 +1,7726 @@ +#ifndef __LINUX_NL80211_H +#define __LINUX_NL80211_H +/* + * 802.11 netlink interface public header + * + * Copyright 2006-2010 Johannes Berg + * Copyright 2008 Michael Wu + * Copyright 2008 Luis Carlos Cobo + * Copyright 2008 Michael Buesch + * Copyright 2008, 2009 Luis R. Rodriguez + * Copyright 2008 Jouni Malinen + * Copyright 2008 Colin McCabe + * Copyright 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018-2022 Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +/* + * This header file defines the userspace API to the wireless stack. Please + * be careful not to break things - i.e. don't move anything around or so + * unless you can demonstrate that it breaks neither API nor ABI. 
+ * + * Additions to the API should be accompanied by actual implementations in + * an upstream driver, so that example implementations exist in case there + * are ever concerns about the precise semantics of the API or changes are + * needed, and to ensure that code for dead (no longer implemented) API + * can actually be identified and removed. + * Nonetheless, semantics should also be documented carefully in this file. + */ + +#include <linux/types.h> + +#define NL80211_GENL_NAME "nl80211" + +#define NL80211_MULTICAST_GROUP_CONFIG "config" +#define NL80211_MULTICAST_GROUP_SCAN "scan" +#define NL80211_MULTICAST_GROUP_REG "regulatory" +#define NL80211_MULTICAST_GROUP_MLME "mlme" +#define NL80211_MULTICAST_GROUP_VENDOR "vendor" +#define NL80211_MULTICAST_GROUP_NAN "nan" +#define NL80211_MULTICAST_GROUP_TESTMODE "testmode" + +#define NL80211_EDMG_BW_CONFIG_MIN 4 +#define NL80211_EDMG_BW_CONFIG_MAX 15 +#define NL80211_EDMG_CHANNELS_MIN 1 +#define NL80211_EDMG_CHANNELS_MAX 0x3c /* 0b00111100 */ + +/** + * DOC: Station handling + * + * Stations are added per interface, but a special case exists with VLAN + * interfaces. When a station is bound to an AP interface, it may be moved + * into a VLAN identified by a VLAN interface index (%NL80211_ATTR_STA_VLAN). + * The station is still assumed to belong to the AP interface it was added + * to. + * + * Station handling varies per interface type and depending on the driver's + * capabilities. + * + * For drivers supporting TDLS with external setup (WIPHY_FLAG_SUPPORTS_TDLS + * and WIPHY_FLAG_TDLS_EXTERNAL_SETUP), the station lifetime is as follows: + * - a setup station entry is added, not yet authorized, without any rate + * or capability information, this just exists to avoid race conditions + * - when the TDLS setup is done, a single NL80211_CMD_SET_STATION is valid + * to add rate and capability information to the station and at the same + * time mark it authorized.
+ * - %NL80211_TDLS_ENABLE_LINK is then used + * - after this, the only valid operation is to remove it by tearing down + * the TDLS link (%NL80211_TDLS_DISABLE_LINK) + * + * TODO: need more info for other interface types + */ + +/** + * DOC: Frame transmission/registration support + * + * Frame transmission and registration support exists to allow userspace + * management entities such as wpa_supplicant react to management frames + * that are not being handled by the kernel. This includes, for example, + * certain classes of action frames that cannot be handled in the kernel + * for various reasons. + * + * Frame registration is done on a per-interface basis and registrations + * cannot be removed other than by closing the socket. It is possible to + * specify a registration filter to register, for example, only for a + * certain type of action frame. In particular with action frames, those + * that userspace registers for will not be returned as unhandled by the + * driver, so that the registered application has to take responsibility + * for doing that. + * + * The type of frame that can be registered for is also dependent on the + * driver and interface type. The frame types are advertised in wiphy + * attributes so applications know what to expect. + * + * NOTE: When an interface changes type while registrations are active, + * these registrations are ignored until the interface type is + * changed again. This means that changing the interface type can + * lead to a situation that couldn't otherwise be produced, but + * any such registrations will be dormant in the sense that they + * will not be serviced, i.e. they will not receive any frames. + * + * Frame transmission allows userspace to send for example the required + * responses to action frames. It is subject to some sanity checking, + * but many frames can be transmitted. When a frame was transmitted, its + * status is indicated to the sending socket. 
+ * + * For more technical details, see the corresponding command descriptions + * below. + */ + +/** + * DOC: Virtual interface / concurrency capabilities + * + * Some devices are able to operate with virtual MACs, they can have + * more than one virtual interface. The capability handling for this + * is a bit complex though, as there may be a number of restrictions + * on the types of concurrency that are supported. + * + * To start with, each device supports the interface types listed in + * the %NL80211_ATTR_SUPPORTED_IFTYPES attribute, but by listing the + * types there no concurrency is implied. + * + * Once concurrency is desired, more attributes must be observed: + * To start with, since some interface types are purely managed in + * software, like the AP-VLAN type in mac80211 for example, there's + * an additional list of these, they can be added at any time and + * are only restricted by some semantic restrictions (e.g. AP-VLAN + * cannot be added without a corresponding AP interface). This list + * is exported in the %NL80211_ATTR_SOFTWARE_IFTYPES attribute. + * + * Further, the list of supported combinations is exported. This is + * in the %NL80211_ATTR_INTERFACE_COMBINATIONS attribute. Basically, + * it exports a list of "groups", and at any point in time the + * interfaces that are currently active must fall into any one of + * the advertised groups. Within each group, there are restrictions + * on the number of interfaces of different types that are supported + * and also the number of different channels, along with potentially + * some other restrictions. See &enum nl80211_if_combination_attrs. + * + * All together, these attributes define the concurrency of virtual + * interfaces that a given device supports. + */ + +/** + * DOC: packet coalesce support + * + * In most cases, host that receives IPv4 and IPv6 multicast/broadcast + * packets does not do anything with these packets. 
Therefore the + * reception of these unwanted packets causes unnecessary processing + * and power consumption. + * + * Packet coalesce feature helps to reduce number of received interrupts + * to host by buffering these packets in firmware/hardware for some + * predefined time. Received interrupt will be generated when one of the + * following events occurs. + * a) Expiration of hardware timer whose expiration time is set to maximum + * coalescing delay of matching coalesce rule. + * b) Coalescing buffer in hardware reaches its limit. + * c) Packet doesn't match any of the configured coalesce rules. + * + * User needs to configure following parameters for creating a coalesce + * rule. + * a) Maximum coalescing delay + * b) List of packet patterns which need to be matched + * c) Condition for coalescence. pattern 'match' or 'no match' + * Multiple such rules can be created. + */ + +/** + * DOC: WPA/WPA2 EAPOL handshake offload + * + * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK flag drivers + * can indicate they support offloading EAPOL handshakes for WPA/WPA2 + * preshared key authentication in station mode. In %NL80211_CMD_CONNECT + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_CONNECT when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. + * + * Similarly @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X flag can be + * set by drivers indicating offload support of the PTK/GTK EAPOL + * handshakes during 802.1X authentication in station mode. In order to + * use the offload the %NL80211_CMD_CONNECT should have + * %NL80211_ATTR_WANT_1X_4WAY_HS attribute flag. Drivers supporting this + * offload may reject the %NL80211_CMD_CONNECT when the attribute flag is + * not present.
+ * + * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag drivers + * can indicate they support offloading EAPOL handshakes for WPA/WPA2 + * preshared key authentication in AP mode. In %NL80211_CMD_START_AP + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_START_AP when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. + * + * For 802.1X the PMK or PMK-R0 are set by providing %NL80211_ATTR_PMK + * using %NL80211_CMD_SET_PMK. For offloaded FT support also + * %NL80211_ATTR_PMKR0_NAME must be provided. + */ + +/** + * DOC: FILS shared key authentication offload + * + * FILS shared key authentication offload can be advertized by drivers by + * setting @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD flag. The drivers that support + * FILS shared key authentication offload should be able to construct the + * authentication and association frames for FILS shared key authentication and + * eventually do a key derivation as per IEEE 802.11ai. The below additional + * parameters should be given to driver in %NL80211_CMD_CONNECT and/or in + * %NL80211_CMD_UPDATE_CONNECT_PARAMS. + * %NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message + * %NL80211_ATTR_FILS_ERP_RRK - used to generate the rIK and rMSK + * rIK should be used to generate an authentication tag on the ERP message and + * rMSK should be used to derive a PMKSA. + * rIK, rMSK should be generated and keyname_nai, sequence number should be used + * as specified in IETF RFC 6696. + * + * When FILS shared key authentication is completed, driver needs to provide the + * below additional parameters to userspace, which can be either after setting + * up a connection or after roaming. 
+ * %NL80211_ATTR_FILS_KEK - used for key renewal + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges + * %NL80211_ATTR_PMKID - used to identify the PMKSA used/generated + * %NL80211_ATTR_PMK - used to update PMKSA cache in userspace + * The PMKSA can be maintained in userspace persistently so that it can be used + * later after reboots or wifi turn off/on also. + * + * %NL80211_ATTR_FILS_CACHE_ID is the cache identifier advertized by a FILS + * capable AP supporting PMK caching. It specifies the scope within which the + * PMKSAs are cached in an ESS. %NL80211_CMD_SET_PMKSA and + * %NL80211_CMD_DEL_PMKSA are enhanced to allow support for PMKSA caching based + * on FILS cache identifier. Additionally %NL80211_ATTR_PMK is used with + * %NL80211_CMD_SET_PMKSA to specify the PMK corresponding to a PMKSA for driver to + * use in a FILS shared key connection with PMKSA caching. + */ + +/** + * DOC: SAE authentication offload + * + * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they + * support offloading SAE authentication for WPA3-Personal networks in station + * mode. Similarly @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag can be set by + * drivers indicating the offload support in AP mode. + * + * The password for SAE should be specified using %NL80211_ATTR_SAE_PASSWORD in + * %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP for station and AP mode + * respectively. + */ + +/** + * DOC: VLAN offload support for setting group keys and binding STAs to VLANs + * + * By setting @NL80211_EXT_FEATURE_VLAN_OFFLOAD flag drivers can indicate they + * support offloading VLAN functionality in a manner where the driver exposes a + * single netdev that uses VLAN tagged frames and separate VLAN-specific netdevs + * can then be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to the Ethernet + * case.
Frames received from stations that are not assigned to any VLAN are + * delivered on the main netdev and frames to such stations can be sent through + * that main netdev. + * + * %NL80211_CMD_NEW_KEY (for group keys), %NL80211_CMD_NEW_STATION, and + * %NL80211_CMD_SET_STATION will optionally specify vlan_id using + * %NL80211_ATTR_VLAN_ID. + */ + +/** + * DOC: TID configuration + * + * TID config support can be checked in the %NL80211_ATTR_TID_CONFIG + * attribute given in wiphy capabilities. + * + * The necessary configuration parameters are mentioned in + * &enum nl80211_tid_config_attr and it will be passed to the + * %NL80211_CMD_SET_TID_CONFIG command in %NL80211_ATTR_TID_CONFIG. + * + * If the configuration needs to be applied for specific peer then the MAC + * address of the peer needs to be passed in %NL80211_ATTR_MAC, otherwise the + * configuration will be applied for all the connected peers in the vif except + * any peers that have peer specific configuration for the TID by default; if + * the %NL80211_TID_CONFIG_ATTR_OVERRIDE flag is set, peer specific values + * will be overwritten. + * + * All this configuration is valid only for STA's current connection + * i.e. the configuration will be reset to default when the STA connects back + * after disconnection/roaming, and this configuration will be cleared when + * the interface goes down. + */ + +/** + * DOC: FILS shared key crypto offload + * + * This feature is applicable to drivers running in AP mode. + * + * FILS shared key crypto offload can be advertised by drivers by setting + * @NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD flag. The drivers that support + * FILS shared key crypto offload should be able to encrypt and decrypt + * association frames for FILS shared key authentication as per IEEE 802.11ai. + * With this capability, for FILS key derivation, drivers depend on userspace. 
+ * + * After FILS key derivation, userspace shares the FILS AAD details with the + * driver and the driver stores the same to use in decryption of association + * request and in encryption of association response. The below parameters + * should be given to the driver in %NL80211_CMD_SET_FILS_AAD. + * %NL80211_ATTR_MAC - STA MAC address, used for storing FILS AAD per STA + * %NL80211_ATTR_FILS_KEK - Used for encryption or decryption + * %NL80211_ATTR_FILS_NONCES - Used for encryption or decryption + * (STA Nonce 16 bytes followed by AP Nonce 16 bytes) + * + * Once the association is done, the driver cleans the FILS AAD data. + */ + +/** + * DOC: Multi-Link Operation + * + * In Multi-Link Operation, a connection between two MLDs utilizes multiple + * links. To use this in nl80211, various commands and responses now need + * to or will include the new %NL80211_ATTR_MLO_LINKS attribute. + * Additionally, various commands that need to operate on a specific link + * now need to be given the %NL80211_ATTR_MLO_LINK_ID attribute, e.g. to + * use %NL80211_CMD_START_AP or similar functions. + */ + +/** + * enum nl80211_commands - supported nl80211 commands + * + * @NL80211_CMD_UNSPEC: unspecified command to catch errors + * + * @NL80211_CMD_GET_WIPHY: request information about a wiphy or dump request + * to get a list of all present wiphys. + * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or + * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET (and the attributes determining the + * channel width; this is used for setting monitor mode channel), + * %NL80211_ATTR_WIPHY_RETRY_SHORT, %NL80211_ATTR_WIPHY_RETRY_LONG, + * %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, and/or + * %NL80211_ATTR_WIPHY_RTS_THRESHOLD. However, for setting the channel, + * see %NL80211_CMD_SET_CHANNEL instead, the support here is for backward + * compatibility only.
+ * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request + * or rename notification. Has attributes %NL80211_ATTR_WIPHY and + * %NL80211_ATTR_WIPHY_NAME. + * @NL80211_CMD_DEL_WIPHY: Wiphy deleted. Has attributes + * %NL80211_ATTR_WIPHY and %NL80211_ATTR_WIPHY_NAME. + * + * @NL80211_CMD_GET_INTERFACE: Request an interface's configuration; + * either a dump request for all interfaces or a specific get with a + * single %NL80211_ATTR_IFINDEX is supported. + * @NL80211_CMD_SET_INTERFACE: Set type of a virtual interface, requires + * %NL80211_ATTR_IFINDEX and %NL80211_ATTR_IFTYPE. + * @NL80211_CMD_NEW_INTERFACE: Newly created virtual interface or response + * to %NL80211_CMD_GET_INTERFACE. Has %NL80211_ATTR_IFINDEX, + * %NL80211_ATTR_WIPHY and %NL80211_ATTR_IFTYPE attributes. Can also + * be sent from userspace to request creation of a new virtual interface, + * then requires attributes %NL80211_ATTR_WIPHY, %NL80211_ATTR_IFTYPE and + * %NL80211_ATTR_IFNAME. + * @NL80211_CMD_DEL_INTERFACE: Virtual interface was deleted, has attributes + * %NL80211_ATTR_IFINDEX and %NL80211_ATTR_WIPHY. Can also be sent from + * userspace to request deletion of a virtual interface, then requires + * attribute %NL80211_ATTR_IFINDEX. If multiple BSSID advertisements are + * enabled using %NL80211_ATTR_MBSSID_CONFIG, %NL80211_ATTR_MBSSID_ELEMS, + * and if this command is used for the transmitting interface, then all + * the non-transmitting interfaces are deleted as well. + * + * @NL80211_CMD_GET_KEY: Get sequence counter information for a key specified + * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC + * represents peer's MLD address for MLO pairwise key. For MLO group key, + * the link is identified by %NL80211_ATTR_MLO_LINK_ID. + * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT, + * %NL80211_ATTR_KEY_DEFAULT_MGMT, or %NL80211_ATTR_KEY_THRESHOLD. 
+ * For MLO connection, the link to set default key is identified by + * %NL80211_ATTR_MLO_LINK_ID. + * @NL80211_CMD_NEW_KEY: add a key with given %NL80211_ATTR_KEY_DATA, + * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC, %NL80211_ATTR_KEY_CIPHER, + * and %NL80211_ATTR_KEY_SEQ attributes. %NL80211_ATTR_MAC represents + * peer's MLD address for MLO pairwise key. The link to add MLO + * group key is identified by %NL80211_ATTR_MLO_LINK_ID. + * @NL80211_CMD_DEL_KEY: delete a key identified by %NL80211_ATTR_KEY_IDX + * or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC represents peer's MLD address + * for MLO pairwise key. The link to delete group key is identified by + * %NL80211_ATTR_MLO_LINK_ID. + * + * @NL80211_CMD_GET_BEACON: (not used) + * @NL80211_CMD_SET_BEACON: change the beacon on an access point interface + * using the %NL80211_ATTR_BEACON_HEAD and %NL80211_ATTR_BEACON_TAIL + * attributes. For drivers that generate the beacon and probe responses + * internally, the following attributes must be provided: %NL80211_ATTR_IE, + * %NL80211_ATTR_IE_PROBE_RESP and %NL80211_ATTR_IE_ASSOC_RESP. + * @NL80211_CMD_START_AP: Start AP operation on an AP interface, parameters + * are like for %NL80211_CMD_SET_BEACON, and additionally parameters that + * do not change are used, these include %NL80211_ATTR_BEACON_INTERVAL, + * %NL80211_ATTR_DTIM_PERIOD, %NL80211_ATTR_SSID, + * %NL80211_ATTR_HIDDEN_SSID, %NL80211_ATTR_CIPHERS_PAIRWISE, + * %NL80211_ATTR_CIPHER_GROUP, %NL80211_ATTR_WPA_VERSIONS, + * %NL80211_ATTR_AKM_SUITES, %NL80211_ATTR_PRIVACY, + * %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_INACTIVITY_TIMEOUT, + * %NL80211_ATTR_ACL_POLICY and %NL80211_ATTR_MAC_ADDRS. + * The channel to use can be set on the interface or be given using the + * %NL80211_ATTR_WIPHY_FREQ and %NL80211_ATTR_WIPHY_FREQ_OFFSET, and the + * attributes determining channel width. 
+ * @NL80211_CMD_NEW_BEACON: old alias for %NL80211_CMD_START_AP + * @NL80211_CMD_STOP_AP: Stop AP operation on the given interface + * @NL80211_CMD_DEL_BEACON: old alias for %NL80211_CMD_STOP_AP + * + * @NL80211_CMD_GET_STATION: Get station attributes for station identified by + * %NL80211_ATTR_MAC on the interface identified by %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_SET_STATION: Set station attributes for station identified by + * %NL80211_ATTR_MAC on the interface identified by %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_NEW_STATION: Add a station with given attributes to the + * interface identified by %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_DEL_STATION: Remove a station identified by %NL80211_ATTR_MAC + * or, if no MAC address given, all stations, on the interface identified + * by %NL80211_ATTR_IFINDEX. %NL80211_ATTR_MGMT_SUBTYPE and + * %NL80211_ATTR_REASON_CODE can optionally be used to specify which type + * of disconnection indication should be sent to the station + * (Deauthentication or Disassociation frame and reason code for that + * frame). + * + * @NL80211_CMD_GET_MPATH: Get mesh path attributes for mesh path to + * destination %NL80211_ATTR_MAC on the interface identified by + * %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_SET_MPATH: Set mesh path attributes for mesh path to + * destination %NL80211_ATTR_MAC on the interface identified by + * %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_NEW_MPATH: Create a new mesh path for the destination given by + * %NL80211_ATTR_MAC via %NL80211_ATTR_MPATH_NEXT_HOP. + * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by + * %NL80211_ATTR_MAC. + * @NL80211_CMD_NEW_PATH: Add a mesh path with given attributes to the + * interface identified by %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_DEL_PATH: Remove a mesh path identified by %NL80211_ATTR_MAC + * or, if no MAC address given, all mesh paths, on the interface identified + * by %NL80211_ATTR_IFINDEX. 
+ * @NL80211_CMD_SET_BSS: Set BSS attributes for BSS identified by + * %NL80211_ATTR_IFINDEX. + * + * @NL80211_CMD_GET_REG: ask the wireless core to send us its currently set + * regulatory domain. If %NL80211_ATTR_WIPHY is specified and the device + * has a private regulatory domain, it will be returned. Otherwise, the + * global regdomain will be returned. + * A device will have a private regulatory domain if it uses the + * regulatory_hint() API. Even when a private regdomain is used the channel + * information will still be mended according to further hints from + * the regulatory core to help with compliance. A dump version of this API + * is now available which will return the global regdomain as well as + * all private regdomains of present wiphys (for those that have it). + * If a wiphy is self-managed (%NL80211_ATTR_WIPHY_SELF_MANAGED_REG), then + * its private regdomain is the only valid one for it. The regulatory + * core is not used to help with compliance in this case. + * @NL80211_CMD_SET_REG: Set current regulatory domain. CRDA sends this command + * after being queried by the kernel. CRDA replies by sending a regulatory + * domain structure which consists of %NL80211_ATTR_REG_ALPHA2 set to our + * current alpha2 if it found a match. It also provides + * NL80211_ATTR_REG_RULE_FLAGS, and a set of regulatory rules. Each + * regulatory rule is a nested set of attributes given by + * %NL80211_ATTR_REG_RULE_FREQ_[START|END] and + * %NL80211_ATTR_FREQ_RANGE_MAX_BW with an attached power rule given by + * %NL80211_ATTR_REG_RULE_POWER_MAX_ANT_GAIN and + * %NL80211_ATTR_REG_RULE_POWER_MAX_EIRP. + * @NL80211_CMD_REQ_SET_REG: ask the wireless core to set the regulatory domain + * to the specified ISO/IEC 3166-1 alpha2 country code. The core will + * store this as a valid request and then query userspace for it.
+ * + * @NL80211_CMD_GET_MESH_CONFIG: Get mesh networking properties for the + * interface identified by %NL80211_ATTR_IFINDEX + * + * @NL80211_CMD_SET_MESH_CONFIG: Set mesh networking properties for the + * interface identified by %NL80211_ATTR_IFINDEX + * + * @NL80211_CMD_SET_MGMT_EXTRA_IE: Set extra IEs for management frames. The + * interface is identified with %NL80211_ATTR_IFINDEX and the management + * frame subtype with %NL80211_ATTR_MGMT_SUBTYPE. The extra IE data to be + * added to the end of the specified management frame is specified with + * %NL80211_ATTR_IE. If the command succeeds, the requested data will be + * added to all specified management frames generated by + * kernel/firmware/driver. + * Note: This command has been removed and it is only reserved at this + * point to avoid re-using existing command number. The functionality this + * command was planned for has been provided with cleaner design with the + * option to specify additional IEs in NL80211_CMD_TRIGGER_SCAN, + * NL80211_CMD_AUTHENTICATE, NL80211_CMD_ASSOCIATE, + * NL80211_CMD_DEAUTHENTICATE, and NL80211_CMD_DISASSOCIATE. + * + * @NL80211_CMD_GET_SCAN: get scan results + * @NL80211_CMD_TRIGGER_SCAN: trigger a new scan with the given parameters + * %NL80211_ATTR_TX_NO_CCK_RATE is used to decide whether to send the + * probe requests at CCK rate or not. %NL80211_ATTR_BSSID can be used to + * specify a BSSID to scan for; if not included, the wildcard BSSID will + * be used. + * @NL80211_CMD_NEW_SCAN_RESULTS: scan notification (as a reply to + * NL80211_CMD_GET_SCAN and on the "scan" multicast group) + * @NL80211_CMD_SCAN_ABORTED: scan was aborted, for unspecified reasons, + * partial scan results may be available + * + * @NL80211_CMD_START_SCHED_SCAN: start a scheduled scan at certain + * intervals and certain number of cycles, as specified by + * %NL80211_ATTR_SCHED_SCAN_PLANS. 
If %NL80211_ATTR_SCHED_SCAN_PLANS is + * not specified and only %NL80211_ATTR_SCHED_SCAN_INTERVAL is specified, + * scheduled scan will run in an infinite loop with the specified interval. + * These attributes are mutually exclusive, + * i.e. NL80211_ATTR_SCHED_SCAN_INTERVAL must not be passed if + * NL80211_ATTR_SCHED_SCAN_PLANS is defined. + * If for some reason scheduled scan is aborted by the driver, all scan + * plans are canceled (including scan plans that did not start yet). + * Like with normal scans, if SSIDs (%NL80211_ATTR_SCAN_SSIDS) + * are passed, they are used in the probe requests. For + * broadcast, a broadcast SSID must be passed (ie. an empty + * string). If no SSID is passed, no probe requests are sent and + * a passive scan is performed. %NL80211_ATTR_SCAN_FREQUENCIES, + * if passed, define which channels should be scanned; if not + * passed, all channels allowed for the current regulatory domain + * are used. Extra IEs can also be passed from the userspace by + * using the %NL80211_ATTR_IE attribute. The first cycle of the + * scheduled scan can be delayed by %NL80211_ATTR_SCHED_SCAN_DELAY + * if it is supplied. If the device supports multiple concurrent scheduled + * scans, it will allow such when the caller provides the flag attribute + * %NL80211_ATTR_SCHED_SCAN_MULTI to indicate user-space support for it. + * @NL80211_CMD_STOP_SCHED_SCAN: stop a scheduled scan. Returns -ENOENT if + * scheduled scan is not running. The caller may assume that as soon + * as the call returns, it is safe to start a new scheduled scan again. + * @NL80211_CMD_SCHED_SCAN_RESULTS: indicates that there are scheduled scan + * results available. + * @NL80211_CMD_SCHED_SCAN_STOPPED: indicates that the scheduled scan has + * stopped. The driver may issue this event at any time during a + * scheduled scan. One reason for stopping the scan is if the hardware + * does not support starting an association or a normal scan while running + * a scheduled scan.
This event is also sent when the + * %NL80211_CMD_STOP_SCHED_SCAN command is received or when the interface + * is brought down while a scheduled scan was running. + * + * @NL80211_CMD_GET_SURVEY: get survey results, e.g. channel occupation + * or noise level + * @NL80211_CMD_NEW_SURVEY_RESULTS: survey data notification (as a reply to + * NL80211_CMD_GET_SURVEY and on the "scan" multicast group) + * + * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry using %NL80211_ATTR_MAC + * (for the BSSID), %NL80211_ATTR_PMKID, and optionally %NL80211_ATTR_PMK + * (PMK is used for PTKSA derivation in case of FILS shared key offload) or + * using %NL80211_ATTR_SSID, %NL80211_ATTR_FILS_CACHE_ID, + * %NL80211_ATTR_PMKID, and %NL80211_ATTR_PMK in case of FILS + * authentication where %NL80211_ATTR_FILS_CACHE_ID is the identifier + * advertized by a FILS capable AP identifying the scope of PMKSA in an + * ESS. + * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC + * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, + * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS + * authentication. + * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. + * + * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain + * has been changed and provides details of the request information + * that caused the change such as who initiated the regulatory request + * (%NL80211_ATTR_REG_INITIATOR), the wiphy_idx + * (%NL80211_ATTR_REG_ALPHA2) on which the request was made from if + * the initiator was %NL80211_REGDOM_SET_BY_COUNTRY_IE or + * %NL80211_REGDOM_SET_BY_DRIVER, the type of regulatory domain + * set (%NL80211_ATTR_REG_TYPE), if the type of regulatory domain is + * %NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on + * to (%NL80211_ATTR_REG_ALPHA2).
+ * @NL80211_CMD_REG_BEACON_HINT: indicates to userspace that an AP beacon + * has been found while world roaming thus enabling active scan or + * any mode of operation that initiates TX (beacons) on a channel + * where we would not have been able to do either before. As an example + * if you are world roaming (regulatory domain set to world or if your + * driver is using a custom world roaming regulatory domain) and while + * doing a passive scan on the 5 GHz band you find an AP there (if not + * on a DFS channel) you will now be able to actively scan for that AP + * or use AP mode on your card on that same channel. Note that this will + * never be used for channels 1-11 on the 2 GHz band as they are always + * enabled world wide. This beacon hint is only sent if your device had + * either disabled active scanning or beaconing on a channel. We send to + * userspace the wiphy on which we removed a restriction from + * (%NL80211_ATTR_WIPHY) and the channel on which this occurred + * before (%NL80211_ATTR_FREQ_BEFORE) and after (%NL80211_ATTR_FREQ_AFTER) + * the beacon hint was processed. + * + * @NL80211_CMD_AUTHENTICATE: authentication request and notification. + * This command is used both as a command (request to authenticate) and + * as an event on the "mlme" multicast group indicating completion of the + * authentication process. + * When used as a command, %NL80211_ATTR_IFINDEX is used to identify the + * interface. %NL80211_ATTR_MAC is used to specify PeerSTAAddress (and + * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify + * the SSID (mainly for association, but is included in authentication + * request, too, to help BSS selection). %NL80211_ATTR_WIPHY_FREQ + + * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequency of the + * channel in MHz. %NL80211_ATTR_AUTH_TYPE is used to specify the + * authentication type.
%NL80211_ATTR_IE is used to define IEs + * (VendorSpecificInfo, but also including RSN IE and FT IEs) to be added + * to the frame. + * When used as an event, this reports reception of an Authentication + * frame in station and IBSS modes when the local MLME processed the + * frame, i.e., it was for the local STA and was received in correct + * state. This is similar to MLME-AUTHENTICATE.confirm primitive in the + * MLME SAP interface (kernel providing MLME, userspace SME). The + * included %NL80211_ATTR_FRAME attribute contains the management frame + * (including both the header and frame body, but not FCS). This event is + * also used to indicate if the authentication attempt timed out. In that + * case the %NL80211_ATTR_FRAME attribute is replaced with a + * %NL80211_ATTR_TIMED_OUT flag (and %NL80211_ATTR_MAC to indicate which + * pending authentication timed out). + * @NL80211_CMD_ASSOCIATE: association request and notification; like + * NL80211_CMD_AUTHENTICATE but for Association and Reassociation + * (similar to MLME-ASSOCIATE.request, MLME-REASSOCIATE.request, + * MLME-ASSOCIATE.confirm or MLME-REASSOCIATE.confirm primitives). The + * %NL80211_ATTR_PREV_BSSID attribute is used to specify whether the + * request is for the initial association to an ESS (that attribute not + * included) or for reassociation within the ESS (that attribute is + * included). + * @NL80211_CMD_DEAUTHENTICATE: deauthentication request and notification; like + * NL80211_CMD_AUTHENTICATE but for Deauthentication frames (similar to + * MLME-DEAUTHENTICATION.request and MLME-DEAUTHENTICATE.indication + * primitives). + * @NL80211_CMD_DISASSOCIATE: disassociation request and notification; like + * NL80211_CMD_AUTHENTICATE but for Disassociation frames (similar to + * MLME-DISASSOCIATE.request and MLME-DISASSOCIATE.indication primitives). 
+ * + * @NL80211_CMD_MICHAEL_MIC_FAILURE: notification of a locally detected Michael + * MIC (part of TKIP) failure; sent on the "mlme" multicast group; the + * event includes %NL80211_ATTR_MAC to describe the source MAC address of + * the frame with invalid MIC, %NL80211_ATTR_KEY_TYPE to show the key + * type, %NL80211_ATTR_KEY_IDX to indicate the key identifier, and + * %NL80211_ATTR_KEY_SEQ to indicate the TSC value of the frame; this + * event matches with MLME-MICHAELMICFAILURE.indication() primitive + * + * @NL80211_CMD_JOIN_IBSS: Join a new IBSS -- given at least an SSID and a + * FREQ attribute (for the initial frequency if no peer can be found) + * and optionally a MAC (as BSSID) and FREQ_FIXED attribute if those + * should be fixed rather than automatically determined. Can only be + * executed on a network interface that is UP, and fixed BSSID/FREQ + * may be rejected. Another optional parameter is the beacon interval, + * given in the %NL80211_ATTR_BEACON_INTERVAL attribute, which if not + * given defaults to 100 TU (102.4ms). + * @NL80211_CMD_LEAVE_IBSS: Leave the IBSS -- no special arguments, the IBSS is + * determined by the network interface. + * + * @NL80211_CMD_TESTMODE: testmode command, takes a wiphy (or ifindex) attribute + * to identify the device, and the TESTDATA blob attribute to pass through + * to the driver. + * + * @NL80211_CMD_CONNECT: connection request and notification; this command + * requests to connect to a specified network but without separating + * auth and assoc steps. 
For this, you need to specify the SSID in a + * %NL80211_ATTR_SSID attribute, and can optionally specify the association + * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, + * %NL80211_ATTR_USE_MFP, %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET, %NL80211_ATTR_CONTROL_PORT, + * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, + * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, + * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and + * %NL80211_ATTR_WIPHY_FREQ_HINT. + * If included, %NL80211_ATTR_MAC and %NL80211_ATTR_WIPHY_FREQ are + * restrictions on BSS selection, i.e., they effectively prevent roaming + * within the ESS. %NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT + * can be included to provide a recommendation of the initial BSS while + * allowing the driver to roam to other BSSes within the ESS and also to + * ignore this recommendation if the indicated BSS is not ideal. Only one + * set of BSSID,frequency parameters is used (i.e., either the enforcing + * %NL80211_ATTR_MAC,%NL80211_ATTR_WIPHY_FREQ or the less strict + * %NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT). + * Driver shall not modify the IEs specified through %NL80211_ATTR_IE if + * %NL80211_ATTR_MAC is included. However, if %NL80211_ATTR_MAC_HINT is + * included, these IEs through %NL80211_ATTR_IE are specified by the user + * space based on the best possible BSS selected. Thus, if the driver ends + * up selecting a different BSS, it can modify these IEs accordingly (e.g. + * userspace asks the driver to perform PMKSA caching with BSS1 and the + * driver ends up selecting BSS2 with different PMKSA cache entry; RSNIE + * has to get updated with the apt PMKID). + * %NL80211_ATTR_PREV_BSSID can be used to request a reassociation within + * the ESS in case the device is already associated and an association with + * a different BSS is desired. 
+ * Background scan period can optionally be + * specified in %NL80211_ATTR_BG_SCAN_PERIOD, + * if not specified default background scan configuration + * in driver is used and if period value is 0, bg scan will be disabled. + * This attribute is ignored if driver does not support roam scan. + * It is also sent as an event, with the BSSID and response IEs when the + * connection is established or failed to be established. This can be + * determined by the %NL80211_ATTR_STATUS_CODE attribute (0 = success, + * non-zero = failure). If %NL80211_ATTR_TIMED_OUT is included in the + * event, the connection attempt failed due to not being able to initiate + * authentication/association or not receiving a response from the AP. + * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as + * well to remain backwards compatible. + * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. + * When a security association was established on an 802.1X network using + * fast transition, this event should be followed by an + * %NL80211_CMD_PORT_AUTHORIZED event. + * Following a %NL80211_CMD_ROAM event userspace can issue + * %NL80211_CMD_GET_SCAN in order to obtain the scan information for the + * new BSS the card/driver roamed to. + * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify + * userspace that a connection was dropped by the AP or due to other + * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and + * %NL80211_ATTR_REASON_CODE attributes are used. + * + * @NL80211_CMD_SET_WIPHY_NETNS: Set a wiphy's netns. Note that all devices + * associated with this wiphy must be down and will follow. + * + * @NL80211_CMD_REMAIN_ON_CHANNEL: Request to remain awake on the specified + * channel for the specified amount of time. This can be used to do + * off-channel operations like transmit a Public Action frame and wait for + * a response while being associated to an AP on another channel. 
+ * %NL80211_ATTR_IFINDEX is used to specify which interface (and thus + * radio) is used. %NL80211_ATTR_WIPHY_FREQ is used to specify the + * frequency for the operation. + * %NL80211_ATTR_DURATION is used to specify the duration in milliseconds + * to remain on the channel. This command is also used as an event to + * notify when the requested duration starts (it may take a while for the + * driver to schedule this time due to other concurrent needs for the + * radio). + * When called, this operation returns a cookie (%NL80211_ATTR_COOKIE) + * that will be included with any events pertaining to this request; + * the cookie is also used to cancel the request. + * @NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL: This command can be used to cancel a + * pending remain-on-channel duration if the desired operation has been + * completed prior to expiration of the originally requested duration. + * %NL80211_ATTR_WIPHY or %NL80211_ATTR_IFINDEX is used to specify the + * radio. The %NL80211_ATTR_COOKIE attribute must be given as well to + * uniquely identify the request. + * This command is also used as an event to notify when a requested + * remain-on-channel duration has expired. + * + * @NL80211_CMD_SET_TX_BITRATE_MASK: Set the mask of rates to be used in TX + * rate selection. %NL80211_ATTR_IFINDEX is used to specify the interface + * and @NL80211_ATTR_TX_RATES the set of allowed rates. + * + * @NL80211_CMD_REGISTER_FRAME: Register for receiving certain mgmt frames + * (via @NL80211_CMD_FRAME) for processing in userspace. This command + * requires an interface index, a frame type attribute (optional for + * backward compatibility reasons, if not given assumes action frames) + * and a match attribute containing the first few bytes of the frame + * that should match, e.g. a single byte for only a category match or + * four bytes for vendor frames including the OUI. The registration + * cannot be dropped, but is removed automatically when the netlink + * socket is closed. 
Multiple registrations can be made. + * The %NL80211_ATTR_RECEIVE_MULTICAST flag attribute can be given if + * %NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS is available, in which + * case the registration can also be modified to include/exclude the + * flag, rather than requiring unregistration to change it. + * @NL80211_CMD_REGISTER_ACTION: Alias for @NL80211_CMD_REGISTER_FRAME for + * backward compatibility + * @NL80211_CMD_FRAME: Management frame TX request and RX notification. This + * command is used both as a request to transmit a management frame and + * as an event indicating reception of a frame that was not processed in + * kernel code, but is for us (i.e., which may need to be processed in a + * user space application). %NL80211_ATTR_FRAME is used to specify the + * frame contents (including header). %NL80211_ATTR_WIPHY_FREQ is used + * to indicate on which channel the frame is to be transmitted or was + * received. If this channel is not the current channel (remain-on-channel + * or the operational channel) the device will switch to the given channel + * and transmit the frame, optionally waiting for a response for the time + * specified using %NL80211_ATTR_DURATION. When called, this operation + * returns a cookie (%NL80211_ATTR_COOKIE) that will be included with the + * TX status event pertaining to the TX request. + * %NL80211_ATTR_TX_NO_CCK_RATE is used to decide whether to send the + * management frames at CCK rate or not in 2GHz band. + * %NL80211_ATTR_CSA_C_OFFSETS_TX is an array of offsets to CSA + * counters which will be updated to the current value. This attribute + * is used during CSA period. + * For TX on an MLD, the frequency can be omitted and the link ID be + * specified, or if transmitting to a known peer MLD (with MLD addresses + * in the frame) both can be omitted and the link will be selected by + * lower layers. 
+ * For RX notification, %NL80211_ATTR_RX_HW_TIMESTAMP may be included to + * indicate the frame RX timestamp and %NL80211_ATTR_TX_HW_TIMESTAMP may + * be included to indicate the ack TX timestamp. + * @NL80211_CMD_FRAME_WAIT_CANCEL: When an off-channel TX was requested, this + * command may be used with the corresponding cookie to cancel the wait + * time if it is known that it is no longer necessary. This command is + * also sent as an event whenever the driver has completed the off-channel + * wait time. + * @NL80211_CMD_ACTION: Alias for @NL80211_CMD_FRAME for backward compatibility. + * @NL80211_CMD_FRAME_TX_STATUS: Report TX status of a management frame + * transmitted with %NL80211_CMD_FRAME. %NL80211_ATTR_COOKIE identifies + * the TX command and %NL80211_ATTR_FRAME includes the contents of the + * frame. %NL80211_ATTR_ACK flag is included if the recipient acknowledged + * the frame. %NL80211_ATTR_TX_HW_TIMESTAMP may be included to indicate the + * tx timestamp and %NL80211_ATTR_RX_HW_TIMESTAMP may be included to + * indicate the ack RX timestamp. + * @NL80211_CMD_ACTION_TX_STATUS: Alias for @NL80211_CMD_FRAME_TX_STATUS for + * backward compatibility. + * + * @NL80211_CMD_SET_POWER_SAVE: Set powersave, using %NL80211_ATTR_PS_STATE + * @NL80211_CMD_GET_POWER_SAVE: Get powersave status in %NL80211_ATTR_PS_STATE + * + * @NL80211_CMD_SET_CQM: Connection quality monitor configuration. This command + * is used to configure connection quality monitoring notification trigger + * levels. + * @NL80211_CMD_NOTIFY_CQM: Connection quality monitor notification. This + * command is used as an event to indicate that a trigger level was + * reached. + * @NL80211_CMD_SET_CHANNEL: Set the channel (using %NL80211_ATTR_WIPHY_FREQ + * and the attributes determining channel width) the given interface + * (identified by %NL80211_ATTR_IFINDEX) shall operate on.
+ * In case multiple channels are supported by the device, the mechanism + * with which it switches channels is implementation-defined. + * When a monitor interface is given, it can only switch channel while + * no other interfaces are operating to avoid disturbing the operation + * of any other interfaces, and other interfaces will again take + * precedence when they are used. + * + * @NL80211_CMD_SET_WDS_PEER: Set the MAC address of the peer on a WDS interface + * (no longer supported). + * + * @NL80211_CMD_SET_MULTICAST_TO_UNICAST: Configure if this AP should perform + * multicast to unicast conversion. When enabled, all multicast packets + * with ethertype ARP, IPv4 or IPv6 (possibly within an 802.1Q header) + * will be sent out to each station once with the destination (multicast) + * MAC address replaced by the station's MAC address. Note that this may + * break certain expectations of the receiver, e.g. the ability to drop + * unicast IP packets encapsulated in multicast L2 frames, or the ability + * to not send destination unreachable messages in such cases. + * This can only be toggled per BSS. Configure this on an interface of + * type %NL80211_IFTYPE_AP. It applies to all its VLAN interfaces + * (%NL80211_IFTYPE_AP_VLAN), except for those in 4addr (WDS) mode. + * If %NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED is not present with this + * command, the feature is disabled. + * + * @NL80211_CMD_JOIN_MESH: Join a mesh. The mesh ID must be given, and initial + * mesh config parameters may be given. + * @NL80211_CMD_LEAVE_MESH: Leave the mesh network -- no special arguments, the + * network is determined by the network interface. + * + * @NL80211_CMD_UNPROT_DEAUTHENTICATE: Unprotected deauthentication frame + * notification. This event is used to indicate that an unprotected + * deauthentication frame was dropped when MFP is in use. + * @NL80211_CMD_UNPROT_DISASSOCIATE: Unprotected disassociation frame + * notification. 
This event is used to indicate that an unprotected + * disassociation frame was dropped when MFP is in use. + * + * @NL80211_CMD_NEW_PEER_CANDIDATE: Notification on the reception of a + * beacon or probe response from a compatible mesh peer. This is only + * sent while no station information (sta_info) exists for the new peer + * candidate and when @NL80211_MESH_SETUP_USERSPACE_AUTH, + * @NL80211_MESH_SETUP_USERSPACE_AMPE, or + * @NL80211_MESH_SETUP_USERSPACE_MPM is set. On reception of this + * notification, userspace may decide to create a new station + * (@NL80211_CMD_NEW_STATION). To stop this notification from + * reoccurring, the userspace authentication daemon may want to create the + * new station with the AUTHENTICATED flag unset and maybe change it later + * depending on the authentication result. + * + * @NL80211_CMD_GET_WOWLAN: get Wake-on-Wireless-LAN (WoWLAN) settings. + * @NL80211_CMD_SET_WOWLAN: set Wake-on-Wireless-LAN (WoWLAN) settings. + * Since wireless is more complex than wired ethernet, it supports + * various triggers. These triggers can be configured through this + * command with the %NL80211_ATTR_WOWLAN_TRIGGERS attribute. For + * more background information, see + * https://wireless.wiki.kernel.org/en/users/Documentation/WoWLAN. + * The @NL80211_CMD_SET_WOWLAN command can also be used as a notification + * from the driver reporting the wakeup reason. In this case, the + * @NL80211_ATTR_WOWLAN_TRIGGERS attribute will contain the reason + * for the wakeup, if it was caused by wireless. If it is not present + * in the wakeup notification, the wireless device didn't cause the + * wakeup but reports that it was woken up. + * + * @NL80211_CMD_SET_REKEY_OFFLOAD: This command is used to give the driver + * the necessary information for supporting GTK rekey offload. This + * feature is typically used during WoWLAN. The configuration data + * is contained in %NL80211_ATTR_REKEY_DATA (which is nested and + * contains the data in sub-attributes).
After rekeying happened, + * this command may also be sent by the driver as an MLME event to + * inform userspace of the new replay counter. + * + * @NL80211_CMD_PMKSA_CANDIDATE: This is used as an event to inform userspace + * of PMKSA caching candidates. + * + * @NL80211_CMD_TDLS_OPER: Perform a high-level TDLS command (e.g. link setup). + * In addition, this can be used as an event to request userspace to take + * actions on TDLS links (set up a new link or tear down an existing one). + * In such events, %NL80211_ATTR_TDLS_OPERATION indicates the requested + * operation, %NL80211_ATTR_MAC contains the peer MAC address, and + * %NL80211_ATTR_REASON_CODE the reason code to be used (only with + * %NL80211_TDLS_TEARDOWN). + * @NL80211_CMD_TDLS_MGMT: Send a TDLS management frame. The + * %NL80211_ATTR_TDLS_ACTION attribute determines the type of frame to be + * sent. Public Action codes (802.11-2012 8.1.5.1) will be sent as + * 802.11 management frames, while TDLS action codes (802.11-2012 + * 8.5.13.1) will be encapsulated and sent as data frames. The currently + * supported Public Action code is %WLAN_PUB_ACTION_TDLS_DISCOVER_RES + * and the currently supported TDLS action codes are given in + * &enum ieee80211_tdls_actioncode. + * + * @NL80211_CMD_UNEXPECTED_FRAME: Used by an application controlling an AP + * (or GO) interface (i.e. hostapd) to ask for unexpected frames to + * implement sending deauth to stations that send unexpected class 3 + * frames. Also used as the event sent by the kernel when such a frame + * is received. + * For the event, the %NL80211_ATTR_MAC attribute carries the TA and + * other attributes like the interface index are present. + * If used as the command it must have an interface index and you can + * only unsubscribe from the event by closing the socket. Subscription + * is also for %NL80211_CMD_UNEXPECTED_4ADDR_FRAME events.
+ * + * @NL80211_CMD_UNEXPECTED_4ADDR_FRAME: Sent as an event indicating that the + * associated station identified by %NL80211_ATTR_MAC sent a 4addr frame + * and wasn't already in a 4-addr VLAN. The event will be sent similarly + * to the %NL80211_CMD_UNEXPECTED_FRAME event, to the same listener. + * + * @NL80211_CMD_PROBE_CLIENT: Probe an associated station on an AP interface + * by sending a null data frame to it and reporting when the frame is + * acknowledged. This is used to allow timing out inactive clients. Uses + * %NL80211_ATTR_IFINDEX and %NL80211_ATTR_MAC. The command returns a + * direct reply with an %NL80211_ATTR_COOKIE that is later used to match + * up the event with the request. The event includes the same data and + * has %NL80211_ATTR_ACK set if the frame was ACKed. + * + * @NL80211_CMD_REGISTER_BEACONS: Register this socket to receive beacons from + * other BSSes when any interfaces are in AP mode. This helps implement + * OLBC handling in hostapd. Beacons are reported in %NL80211_CMD_FRAME + * messages. Note that per PHY only one application may register. + * + * @NL80211_CMD_SET_NOACK_MAP: sets a bitmap for the individual TIDs whether + * No Acknowledgement Policy should be applied. + * + * @NL80211_CMD_CH_SWITCH_NOTIFY: An AP or GO may decide to switch channels + * independently of the userspace SME, send this event indicating + * %NL80211_ATTR_IFINDEX is now on %NL80211_ATTR_WIPHY_FREQ and the + * attributes determining channel width. This indication may also be + * sent when a remotely-initiated switch (e.g., when a STA receives a CSA + * from the remote AP) is completed. + * + * @NL80211_CMD_CH_SWITCH_STARTED_NOTIFY: Notify that a channel switch + * has been started on an interface, regardless of the initiator + * (i.e. whether it was requested from a remote device or + * initiated on our own). It indicates that + * %NL80211_ATTR_IFINDEX will be on %NL80211_ATTR_WIPHY_FREQ + * after %NL80211_ATTR_CH_SWITCH_COUNT TBTT's.
The userspace may + * decide to react to this indication by requesting other + * interfaces to change channel as well. + * + * @NL80211_CMD_START_P2P_DEVICE: Start the given P2P Device, identified by + * its %NL80211_ATTR_WDEV identifier. It must have been created with + * %NL80211_CMD_NEW_INTERFACE previously. After it has been started, the + * P2P Device can be used for P2P operations, e.g. remain-on-channel and + * public action frame TX. + * @NL80211_CMD_STOP_P2P_DEVICE: Stop the given P2P Device, identified by + * its %NL80211_ATTR_WDEV identifier. + * + * @NL80211_CMD_CONN_FAILED: connection request to an AP failed; used to + * notify userspace that AP has rejected the connection request from a + * station, due to particular reason. %NL80211_ATTR_CONN_FAILED_REASON + * is used for this. + * + * @NL80211_CMD_SET_MCAST_RATE: Change the rate used to send multicast frames + * for IBSS or MESH vif. + * + * @NL80211_CMD_SET_MAC_ACL: sets ACL for MAC address based access control. + * This is to be used with the drivers advertising the support of MAC + * address based access control. List of MAC addresses is passed in + * %NL80211_ATTR_MAC_ADDRS and ACL policy is passed in + * %NL80211_ATTR_ACL_POLICY. Driver will enable ACL with this list, if it + * is not already done. The new list will replace any existing list. Driver + * will clear its ACL when the list of MAC addresses passed is empty. This + * command is used in AP/P2P GO mode. Driver has to make sure to clear its + * ACL list during %NL80211_CMD_STOP_AP. + * + * @NL80211_CMD_RADAR_DETECT: Start a Channel availability check (CAC). Once + * a radar is detected or the channel availability scan (CAC) has finished + * or was aborted, or a radar was detected, usermode will be notified with + * this event. This command is also used to notify userspace about radars + * while operating on this channel. + * %NL80211_ATTR_RADAR_EVENT is used to inform about the type of the + * event. 
+ * + * @NL80211_CMD_GET_PROTOCOL_FEATURES: Get global nl80211 protocol features, + * i.e. features for the nl80211 protocol rather than device features. + * Returns the features in the %NL80211_ATTR_PROTOCOL_FEATURES bitmap. + * + * @NL80211_CMD_UPDATE_FT_IES: Pass down the most up-to-date Fast Transition + * Information Element to the WLAN driver + * + * @NL80211_CMD_FT_EVENT: Send a Fast transition event from the WLAN driver + * to the supplicant. This will carry the target AP's MAC address along + * with the relevant Information Elements. This event is used to report + * received FT IEs (MDIE, FTIE, RSN IE, TIE, RICIE). + * + * @NL80211_CMD_CRIT_PROTOCOL_START: Indicates user-space will start running + * a critical protocol that needs more reliability in the connection to + * complete. + * + * @NL80211_CMD_CRIT_PROTOCOL_STOP: Indicates the connection reliability can + * return back to normal. + * + * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules. + * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. + * + * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the + * new channel information (Channel Switch Announcement - CSA) + * in the beacon for some time (as defined in the + * %NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the + * new channel. Userspace provides the new channel information (using + * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel + * width). %NL80211_ATTR_CH_SWITCH_BLOCK_TX may be supplied to inform + * other station that transmission must be blocked until the channel + * switch is complete. + * + * @NL80211_CMD_VENDOR: Vendor-specified command/event. The command is specified + * by the %NL80211_ATTR_VENDOR_ID attribute and a sub-command in + * %NL80211_ATTR_VENDOR_SUBCMD. Parameter(s) can be transported in + * %NL80211_ATTR_VENDOR_DATA. 
+ * For feature advertisement, the %NL80211_ATTR_VENDOR_DATA attribute is + * used in the wiphy data as a nested attribute containing descriptions + * (&struct nl80211_vendor_cmd_info) of the supported vendor commands. + * This may also be sent as an event with the same attributes. + * + * @NL80211_CMD_SET_QOS_MAP: Set Interworking QoS mapping for IP DSCP values. + * The QoS mapping information is included in %NL80211_ATTR_QOS_MAP. If + * that attribute is not included, QoS mapping is disabled. Since this + * QoS mapping is relevant for IP packets, it is only valid during an + * association. This is cleared on disassociation and AP restart. + * + * @NL80211_CMD_ADD_TX_TS: Ask the kernel to add a traffic stream for the given + * %NL80211_ATTR_TSID and %NL80211_ATTR_MAC with %NL80211_ATTR_USER_PRIO + * and %NL80211_ATTR_ADMITTED_TIME parameters. + * Note that the action frame handshake with the AP shall be handled by + * userspace via the normal management RX/TX framework, this only sets + * up the TX TS in the driver/device. + * If the admitted time attribute is not added then the request just checks + * if a subsequent setup could be successful, the intent is to use this to + * avoid setting up a session with the AP when local restrictions would + * make that impossible. However, the subsequent "real" setup may still + * fail even if the check was successful. + * @NL80211_CMD_DEL_TX_TS: Remove an existing TS with the %NL80211_ATTR_TSID + * and %NL80211_ATTR_MAC parameters. It isn't necessary to call this + * before removing a station entry entirely, or before disassociating + * or similar, cleanup will happen in the driver/device in this case. + * + * @NL80211_CMD_GET_MPP: Get mesh path attributes for mesh proxy path to + * destination %NL80211_ATTR_MAC on the interface identified by + * %NL80211_ATTR_IFINDEX. + * + * @NL80211_CMD_JOIN_OCB: Join the OCB network. The center frequency and + * bandwidth of a channel must be given. 
+ * @NL80211_CMD_LEAVE_OCB: Leave the OCB network -- no special arguments, the + * network is determined by the network interface. + * + * @NL80211_CMD_TDLS_CHANNEL_SWITCH: Start channel-switching with a TDLS peer, + * identified by the %NL80211_ATTR_MAC parameter. A target channel is + * provided via %NL80211_ATTR_WIPHY_FREQ and other attributes determining + * channel width/type. The target operating class is given via + * %NL80211_ATTR_OPER_CLASS. + * The driver is responsible for continually initiating channel-switching + * operations and returning to the base channel for communication with the + * AP. + * @NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH: Stop channel-switching with a TDLS + * peer given by %NL80211_ATTR_MAC. Both peers must be on the base channel + * when this command completes. + * + * @NL80211_CMD_WIPHY_REG_CHANGE: Similar to %NL80211_CMD_REG_CHANGE, but used + * as an event to indicate changes for devices with wiphy-specific regdom + * management. + * + * @NL80211_CMD_ABORT_SCAN: Stop an ongoing scan. Returns -ENOENT if a scan is + * not running. The driver indicates the status of the scan through + * cfg80211_scan_done(). + * + * @NL80211_CMD_START_NAN: Start NAN operation, identified by its + * %NL80211_ATTR_WDEV interface. This interface must have been + * previously created with %NL80211_CMD_NEW_INTERFACE. After it + * has been started, the NAN interface will create or join a + * cluster. This command must have a valid + * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional + * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is + * omitted or set to 0, it means don't-care and the device will + * decide what to use. After this command NAN functions can be + * added. + * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by + * its %NL80211_ATTR_WDEV interface. + * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined + * with %NL80211_ATTR_NAN_FUNC nested attribute. 
When called, this + * operation returns the strictly positive and unique instance id + * (%NL80211_ATTR_NAN_FUNC_INST_ID) and a cookie (%NL80211_ATTR_COOKIE) + * of the function upon success. + * Since instance ID's can be re-used, this cookie is the right + * way to identify the function. This will avoid races when a termination + * event is handled by the user space after it has already added a new + * function that got the same instance id from the kernel as the one + * which just terminated. + * This cookie may be used in NAN events even before the command + * returns, so userspace shouldn't process NAN events until it processes + * the response to this command. + * Look at %NL80211_ATTR_SOCKET_OWNER as well. + * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie. + * This command is also used as a notification sent when a NAN function is + * terminated. This will contain a %NL80211_ATTR_NAN_FUNC_INST_ID + * and %NL80211_ATTR_COOKIE attributes. + * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN + * configuration. NAN must be operational (%NL80211_CMD_START_NAN + * was executed). It must contain at least one of the following + * attributes: %NL80211_ATTR_NAN_MASTER_PREF, + * %NL80211_ATTR_BANDS. If %NL80211_ATTR_BANDS is omitted, the + * current configuration is not changed. If it is present but + * set to zero, the configuration is changed to don't-care + * (i.e. the device can decide what to do). + * @NL80211_CMD_NAN_FUNC_MATCH: Notification sent when a match is reported. + * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and + * %NL80211_ATTR_COOKIE. + * + * @NL80211_CMD_UPDATE_CONNECT_PARAMS: Update one or more connect parameters + * for subsequent roaming cases if the driver or firmware uses internal + * BSS selection. This command can be issued only while connected and it + * does not result in a change for the current association. Currently, + * only the %NL80211_ATTR_IE data is used and updated with this command. 
+ * + * @NL80211_CMD_SET_PMK: For offloaded 4-Way handshake, set the PMK or PMK-R0 + * for the given authenticator address (specified with %NL80211_ATTR_MAC). + * When %NL80211_ATTR_PMKR0_NAME is set, %NL80211_ATTR_PMK specifies the + * PMK-R0, otherwise it specifies the PMK. + * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously + * configured PMK for the authenticator address identified by + * %NL80211_ATTR_MAC. + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates an 802.1X FT roam was + * completed successfully. Drivers that support 4 way handshake offload + * should send this event after indicating 802.1X FT association with + * %NL80211_CMD_ROAM. If the 4 way handshake failed %NL80211_CMD_DISCONNECT + * should be indicated instead. + * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request + * and RX notification. This command is used both as a request to transmit + * a control port frame and as a notification that a control port frame + * has been received. %NL80211_ATTR_FRAME is used to specify the + * frame contents. The frame is the raw EAPoL data, without ethernet or + * 802.11 headers. + * For an MLD transmitter, the %NL80211_ATTR_MLO_LINK_ID may be given and + * its effect will depend on the destination: If the destination is known + * to be an MLD, this will be used as a hint to select the link to transmit + * the frame on. If the destination is not an MLD, this will select both + * the link to transmit on and the source address will be set to the link + * address of that link. + * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, + * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added + * indicating the protocol type of the received frame; whether the frame + * was received unencrypted and the MAC address of the peer respectively. + * + * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded.
+ * + * @NL80211_CMD_EXTERNAL_AUTH: This interface is exclusively defined for host + * drivers that do not define separate commands for authentication and + * association, but rely on user space for the authentication to happen. + * This interface acts both as the event request (driver to user space) + * to trigger the authentication and command response (userspace to + * driver) to indicate the authentication status. + * + * User space uses the %NL80211_CMD_CONNECT command to the host driver to + * trigger a connection. The host driver selects a BSS and further uses + * this interface to offload only the authentication part to the user + * space. Authentication frames are passed between the driver and user + * space through the %NL80211_CMD_FRAME interface. Host driver proceeds + * further with the association after getting successful authentication + * status. User space indicates the authentication status through + * %NL80211_ATTR_STATUS_CODE attribute in %NL80211_CMD_EXTERNAL_AUTH + * command interface. + * + * Host driver reports this status on an authentication failure to the + * user space through the connect result as the user space would have + * initiated the connection through the connect request. + * + * @NL80211_CMD_STA_OPMODE_CHANGED: An event that notifies of a station's + * ht opmode or vht opmode changes using any of %NL80211_ATTR_SMPS_MODE, + * %NL80211_ATTR_CHANNEL_WIDTH, %NL80211_ATTR_NSS attributes with its + * address (specified in %NL80211_ATTR_MAC). + * + * @NL80211_CMD_GET_FTM_RESPONDER_STATS: Retrieve FTM responder statistics, in + * the %NL80211_ATTR_FTM_RESPONDER_STATS attribute. + * + * @NL80211_CMD_PEER_MEASUREMENT_START: start a (set of) peer measurement(s) + * with the given parameters, which are encapsulated in the nested + * %NL80211_ATTR_PEER_MEASUREMENTS attribute. Optionally, MAC address + * randomization may be enabled and configured by specifying the + * %NL80211_ATTR_MAC and %NL80211_ATTR_MAC_MASK attributes.
+ * If a timeout is requested, use the %NL80211_ATTR_TIMEOUT attribute. + * A u64 cookie for further %NL80211_ATTR_COOKIE use is returned in + * the netlink extended ack message. + * + * To cancel a measurement, close the socket that requested it. + * + * Measurement results are reported to the socket that requested the + * measurement using @NL80211_CMD_PEER_MEASUREMENT_RESULT when they + * become available, so applications must ensure a large enough socket + * buffer size. + * + * Depending on driver support it may or may not be possible to start + * multiple concurrent measurements. + * @NL80211_CMD_PEER_MEASUREMENT_RESULT: This command number is used for the + * result notification from the driver to the requesting socket. + * @NL80211_CMD_PEER_MEASUREMENT_COMPLETE: Notification only, indicating that + * the measurement completed, using the measurement cookie + * (%NL80211_ATTR_COOKIE). + * + * @NL80211_CMD_NOTIFY_RADAR: Notify the kernel that a radar signal was + * detected and reported by a neighboring device on the channel + * indicated by %NL80211_ATTR_WIPHY_FREQ and other attributes + * determining the width and type. + * + * @NL80211_CMD_UPDATE_OWE_INFO: This interface allows the host driver to + * offload OWE processing to user space. This intends to support + * OWE AKM by the host drivers that implement SME but rely + * on the user space for the cryptographic/DH IE processing in AP mode. + * + * @NL80211_CMD_PROBE_MESH_LINK: The requirement for mesh link metric + * refreshing, is that from one mesh point we be able to send some data + * frames to other mesh points which are not currently selected as a + * primary traffic path, but which are only 1 hop away. The absence of + * the primary path to the chosen node makes it necessary to apply some + * form of marking on a chosen packet stream so that the packets can be + * properly steered to the selected node for testing, and not by the + * regular mesh path lookup. 
Further, the packets must be of type data + * so that the rate control (often embedded in firmware) is used for + * rate selection. + * + * Here attribute %NL80211_ATTR_MAC is used to specify connected mesh + * peer MAC address and %NL80211_ATTR_FRAME is used to specify the frame + * content. The frame is ethernet data. + * + * @NL80211_CMD_SET_TID_CONFIG: Data frame TID specific configuration + * is passed using %NL80211_ATTR_TID_CONFIG attribute. + * + * @NL80211_CMD_UNPROT_BEACON: Unprotected or incorrectly protected Beacon + * frame. This event is used to indicate that a received Beacon frame was + * dropped because it did not include a valid MME MIC while beacon + * protection was enabled (BIGTK configured in station mode). + * + * @NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS: Report TX status of a control + * port frame transmitted with %NL80211_CMD_CONTROL_PORT_FRAME. + * %NL80211_ATTR_COOKIE identifies the TX command and %NL80211_ATTR_FRAME + * includes the contents of the frame. %NL80211_ATTR_ACK flag is included + * if the recipient acknowledged the frame. + * + * @NL80211_CMD_SET_SAR_SPECS: SAR power limitation configuration is + * passed using %NL80211_ATTR_SAR_SPEC. %NL80211_ATTR_WIPHY is used to + * specify the wiphy index to be applied to. + * + * @NL80211_CMD_OBSS_COLOR_COLLISION: This notification is sent out whenever + * mac80211/drv detects a bss color collision. + * + * @NL80211_CMD_COLOR_CHANGE_REQUEST: This command is used to indicate that + * userspace wants to change the BSS color. 
+ * + * @NL80211_CMD_COLOR_CHANGE_STARTED: Notify userland, that a color change has + * started + * + * @NL80211_CMD_COLOR_CHANGE_ABORTED: Notify userland, that the color change has + * been aborted + * + * @NL80211_CMD_COLOR_CHANGE_COMPLETED: Notify userland that the color change + * has completed + * + * @NL80211_CMD_SET_FILS_AAD: Set FILS AAD data to the driver using - + * &NL80211_ATTR_MAC - for STA MAC address + * &NL80211_ATTR_FILS_KEK - for KEK + * &NL80211_ATTR_FILS_NONCES - for FILS Nonces + * (STA Nonce 16 bytes followed by AP Nonce 16 bytes) + * + * @NL80211_CMD_ASSOC_COMEBACK: notification about an association + * temporal rejection with comeback. The event includes %NL80211_ATTR_MAC + * to describe the BSSID address of the AP and %NL80211_ATTR_TIMEOUT to + * specify the timeout value. + * + * @NL80211_CMD_ADD_LINK: Add a new link to an interface. The + * %NL80211_ATTR_MLO_LINK_ID attribute is used for the new link. + * @NL80211_CMD_REMOVE_LINK: Remove a link from an interface. This may come + * without %NL80211_ATTR_MLO_LINK_ID as an easy way to remove all links + * in preparation for e.g. roaming to a regular (non-MLO) AP. + * + * @NL80211_CMD_ADD_LINK_STA: Add a link to an MLD station + * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station + * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station + * + * @NL80211_CMD_MAX: highest used command number + * @__NL80211_CMD_AFTER_LAST: internal use + */ +enum nl80211_commands { +/* don't change the order or add anything between, this is ABI!
*/ + NL80211_CMD_UNSPEC, + + NL80211_CMD_GET_WIPHY, /* can dump */ + NL80211_CMD_SET_WIPHY, + NL80211_CMD_NEW_WIPHY, + NL80211_CMD_DEL_WIPHY, + + NL80211_CMD_GET_INTERFACE, /* can dump */ + NL80211_CMD_SET_INTERFACE, + NL80211_CMD_NEW_INTERFACE, + NL80211_CMD_DEL_INTERFACE, + + NL80211_CMD_GET_KEY, + NL80211_CMD_SET_KEY, + NL80211_CMD_NEW_KEY, + NL80211_CMD_DEL_KEY, + + NL80211_CMD_GET_BEACON, + NL80211_CMD_SET_BEACON, + NL80211_CMD_START_AP, + NL80211_CMD_NEW_BEACON = NL80211_CMD_START_AP, /* alias: shares the value of NL80211_CMD_START_AP */ + NL80211_CMD_STOP_AP, + NL80211_CMD_DEL_BEACON = NL80211_CMD_STOP_AP, /* alias: shares the value of NL80211_CMD_STOP_AP */ + + NL80211_CMD_GET_STATION, + NL80211_CMD_SET_STATION, + NL80211_CMD_NEW_STATION, + NL80211_CMD_DEL_STATION, + + NL80211_CMD_GET_MPATH, + NL80211_CMD_SET_MPATH, + NL80211_CMD_NEW_MPATH, + NL80211_CMD_DEL_MPATH, + + NL80211_CMD_SET_BSS, + + NL80211_CMD_SET_REG, + NL80211_CMD_REQ_SET_REG, + + NL80211_CMD_GET_MESH_CONFIG, + NL80211_CMD_SET_MESH_CONFIG, + + NL80211_CMD_SET_MGMT_EXTRA_IE /* reserved; not used */, + + NL80211_CMD_GET_REG, + + NL80211_CMD_GET_SCAN, + NL80211_CMD_TRIGGER_SCAN, + NL80211_CMD_NEW_SCAN_RESULTS, + NL80211_CMD_SCAN_ABORTED, + + NL80211_CMD_REG_CHANGE, + + NL80211_CMD_AUTHENTICATE, + NL80211_CMD_ASSOCIATE, + NL80211_CMD_DEAUTHENTICATE, + NL80211_CMD_DISASSOCIATE, + + NL80211_CMD_MICHAEL_MIC_FAILURE, + + NL80211_CMD_REG_BEACON_HINT, + + NL80211_CMD_JOIN_IBSS, + NL80211_CMD_LEAVE_IBSS, + + NL80211_CMD_TESTMODE, + + NL80211_CMD_CONNECT, + NL80211_CMD_ROAM, + NL80211_CMD_DISCONNECT, + + NL80211_CMD_SET_WIPHY_NETNS, + + NL80211_CMD_GET_SURVEY, + NL80211_CMD_NEW_SURVEY_RESULTS, + + NL80211_CMD_SET_PMKSA, + NL80211_CMD_DEL_PMKSA, + NL80211_CMD_FLUSH_PMKSA, + + NL80211_CMD_REMAIN_ON_CHANNEL, + NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + + NL80211_CMD_SET_TX_BITRATE_MASK, + + NL80211_CMD_REGISTER_FRAME, + NL80211_CMD_REGISTER_ACTION = NL80211_CMD_REGISTER_FRAME, /* backward-compatibility alias */ + NL80211_CMD_FRAME, + NL80211_CMD_ACTION = NL80211_CMD_FRAME, /* backward-compatibility alias */ + NL80211_CMD_FRAME_TX_STATUS, + NL80211_CMD_ACTION_TX_STATUS =
NL80211_CMD_FRAME_TX_STATUS, /* backward-compatibility alias */ + + NL80211_CMD_SET_POWER_SAVE, + NL80211_CMD_GET_POWER_SAVE, + + NL80211_CMD_SET_CQM, + NL80211_CMD_NOTIFY_CQM, + + NL80211_CMD_SET_CHANNEL, + NL80211_CMD_SET_WDS_PEER, + + NL80211_CMD_FRAME_WAIT_CANCEL, + + NL80211_CMD_JOIN_MESH, + NL80211_CMD_LEAVE_MESH, + + NL80211_CMD_UNPROT_DEAUTHENTICATE, + NL80211_CMD_UNPROT_DISASSOCIATE, + + NL80211_CMD_NEW_PEER_CANDIDATE, + + NL80211_CMD_GET_WOWLAN, + NL80211_CMD_SET_WOWLAN, + + NL80211_CMD_START_SCHED_SCAN, + NL80211_CMD_STOP_SCHED_SCAN, + NL80211_CMD_SCHED_SCAN_RESULTS, + NL80211_CMD_SCHED_SCAN_STOPPED, + + NL80211_CMD_SET_REKEY_OFFLOAD, + + NL80211_CMD_PMKSA_CANDIDATE, + + NL80211_CMD_TDLS_OPER, + NL80211_CMD_TDLS_MGMT, + + NL80211_CMD_UNEXPECTED_FRAME, + + NL80211_CMD_PROBE_CLIENT, + + NL80211_CMD_REGISTER_BEACONS, + + NL80211_CMD_UNEXPECTED_4ADDR_FRAME, + + NL80211_CMD_SET_NOACK_MAP, + + NL80211_CMD_CH_SWITCH_NOTIFY, + + NL80211_CMD_START_P2P_DEVICE, + NL80211_CMD_STOP_P2P_DEVICE, + + NL80211_CMD_CONN_FAILED, + + NL80211_CMD_SET_MCAST_RATE, + + NL80211_CMD_SET_MAC_ACL, + + NL80211_CMD_RADAR_DETECT, + + NL80211_CMD_GET_PROTOCOL_FEATURES, + + NL80211_CMD_UPDATE_FT_IES, + NL80211_CMD_FT_EVENT, + + NL80211_CMD_CRIT_PROTOCOL_START, + NL80211_CMD_CRIT_PROTOCOL_STOP, + + NL80211_CMD_GET_COALESCE, + NL80211_CMD_SET_COALESCE, + + NL80211_CMD_CHANNEL_SWITCH, + + NL80211_CMD_VENDOR, + + NL80211_CMD_SET_QOS_MAP, + + NL80211_CMD_ADD_TX_TS, + NL80211_CMD_DEL_TX_TS, + + NL80211_CMD_GET_MPP, + + NL80211_CMD_JOIN_OCB, + NL80211_CMD_LEAVE_OCB, + + NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, + + NL80211_CMD_TDLS_CHANNEL_SWITCH, + NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, + + NL80211_CMD_WIPHY_REG_CHANGE, + + NL80211_CMD_ABORT_SCAN, + + NL80211_CMD_START_NAN, + NL80211_CMD_STOP_NAN, + NL80211_CMD_ADD_NAN_FUNCTION, + NL80211_CMD_DEL_NAN_FUNCTION, + NL80211_CMD_CHANGE_NAN_CONFIG, + NL80211_CMD_NAN_MATCH, + + NL80211_CMD_SET_MULTICAST_TO_UNICAST, + + NL80211_CMD_UPDATE_CONNECT_PARAMS, + + NL80211_CMD_SET_PMK, +
NL80211_CMD_DEL_PMK, + + NL80211_CMD_PORT_AUTHORIZED, + + NL80211_CMD_RELOAD_REGDB, + + NL80211_CMD_EXTERNAL_AUTH, + + NL80211_CMD_STA_OPMODE_CHANGED, + + NL80211_CMD_CONTROL_PORT_FRAME, + + NL80211_CMD_GET_FTM_RESPONDER_STATS, + + NL80211_CMD_PEER_MEASUREMENT_START, + NL80211_CMD_PEER_MEASUREMENT_RESULT, + NL80211_CMD_PEER_MEASUREMENT_COMPLETE, + + NL80211_CMD_NOTIFY_RADAR, + + NL80211_CMD_UPDATE_OWE_INFO, + + NL80211_CMD_PROBE_MESH_LINK, + + NL80211_CMD_SET_TID_CONFIG, + + NL80211_CMD_UNPROT_BEACON, + + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS, + + NL80211_CMD_SET_SAR_SPECS, + + NL80211_CMD_OBSS_COLOR_COLLISION, + + NL80211_CMD_COLOR_CHANGE_REQUEST, + + NL80211_CMD_COLOR_CHANGE_STARTED, + NL80211_CMD_COLOR_CHANGE_ABORTED, + NL80211_CMD_COLOR_CHANGE_COMPLETED, + + NL80211_CMD_SET_FILS_AAD, + + NL80211_CMD_ASSOC_COMEBACK, + + NL80211_CMD_ADD_LINK, + NL80211_CMD_REMOVE_LINK, + + NL80211_CMD_ADD_LINK_STA, + NL80211_CMD_MODIFY_LINK_STA, + NL80211_CMD_REMOVE_LINK_STA, + + /* add new commands above here */ + + /* used to define NL80211_CMD_MAX below */ + __NL80211_CMD_AFTER_LAST, + NL80211_CMD_MAX = __NL80211_CMD_AFTER_LAST - 1 /* one less than the sentinel, i.e. the last command listed above it */ +}; + +/* + * Allow user space programs to use #ifdef on new commands by defining them + * here + */ +#define NL80211_CMD_SET_BSS NL80211_CMD_SET_BSS +#define NL80211_CMD_SET_MGMT_EXTRA_IE NL80211_CMD_SET_MGMT_EXTRA_IE +#define NL80211_CMD_REG_CHANGE NL80211_CMD_REG_CHANGE +#define NL80211_CMD_AUTHENTICATE NL80211_CMD_AUTHENTICATE +#define NL80211_CMD_ASSOCIATE NL80211_CMD_ASSOCIATE +#define NL80211_CMD_DEAUTHENTICATE NL80211_CMD_DEAUTHENTICATE +#define NL80211_CMD_DISASSOCIATE NL80211_CMD_DISASSOCIATE +#define NL80211_CMD_REG_BEACON_HINT NL80211_CMD_REG_BEACON_HINT + +#define NL80211_ATTR_FEATURE_FLAGS NL80211_ATTR_FEATURE_FLAGS + +/* source-level API compatibility */ +#define NL80211_CMD_GET_MESH_PARAMS NL80211_CMD_GET_MESH_CONFIG +#define NL80211_CMD_SET_MESH_PARAMS NL80211_CMD_SET_MESH_CONFIG +#define NL80211_MESH_SETUP_VENDOR_PATH_SEL_IE
NL80211_MESH_SETUP_IE + +/** + * enum nl80211_attrs - nl80211 netlink attributes + * + * @NL80211_ATTR_UNSPEC: unspecified attribute to catch errors + * + * @NL80211_ATTR_WIPHY: index of wiphy to operate on, cf. + * /sys/class/ieee80211/<phyname>/index + * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming) + * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters + * @NL80211_ATTR_WIPHY_FREQ: frequency of the selected channel in MHz, + * defines the channel together with the (deprecated) + * %NL80211_ATTR_WIPHY_CHANNEL_TYPE attribute or the attributes + * %NL80211_ATTR_CHANNEL_WIDTH and if needed %NL80211_ATTR_CENTER_FREQ1 + * and %NL80211_ATTR_CENTER_FREQ2 + * @NL80211_ATTR_CHANNEL_WIDTH: u32 attribute containing one of the values + * of &enum nl80211_chan_width, describing the channel width. See the + * documentation of the enum for more information. + * @NL80211_ATTR_CENTER_FREQ1: Center frequency of the first part of the + * channel, used for anything but 20 MHz bandwidth. In S1G this is the + * operating channel center frequency. + * @NL80211_ATTR_CENTER_FREQ2: Center frequency of the second part of the + * channel, used only for 80+80 MHz bandwidth + * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ + * if HT20 or HT40 are to be used (i.e., HT disabled if not included): + * NL80211_CHAN_NO_HT = HT not allowed (i.e., same as not including + * this attribute) + * NL80211_CHAN_HT20 = HT20 only + * NL80211_CHAN_HT40MINUS = secondary channel is below the primary channel + * NL80211_CHAN_HT40PLUS = secondary channel is above the primary channel + * This attribute is now deprecated. 
+ * @NL80211_ATTR_WIPHY_RETRY_SHORT: TX retry limit for frames whose length is + * less than or equal to the RTS threshold; allowed range: 1..255; + * dot11ShortRetryLimit; u8 + * @NL80211_ATTR_WIPHY_RETRY_LONG: TX retry limit for frames whose length is + * greater than the RTS threshold; allowed range: 1..255; + * dot11LongRetryLimit; u8 + * @NL80211_ATTR_WIPHY_FRAG_THRESHOLD: fragmentation threshold, i.e., maximum + * length in octets for frames; allowed range: 256..8000, disable + * fragmentation with (u32)-1; dot11FragmentationThreshold; u32 + * @NL80211_ATTR_WIPHY_RTS_THRESHOLD: RTS threshold (TX frames with length + * larger than or equal to this use RTS/CTS handshake); allowed range: + * 0..65536, disable with (u32)-1; dot11RTSThreshold; u32 + * @NL80211_ATTR_WIPHY_COVERAGE_CLASS: Coverage Class as defined by IEEE 802.11 + * section 7.3.2.9; dot11CoverageClass; u8 + * + * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on + * @NL80211_ATTR_IFNAME: network interface name + * @NL80211_ATTR_IFTYPE: type of virtual interface, see &enum nl80211_iftype + * + * @NL80211_ATTR_WDEV: wireless device identifier, used for pseudo-devices + * that don't have a netdev (u64) + * + * @NL80211_ATTR_MAC: MAC address (various uses) + * + * @NL80211_ATTR_KEY_DATA: (temporal) key data; for TKIP this consists of + * 16 bytes encryption key followed by 8 bytes each for TX and RX MIC + * keys + * @NL80211_ATTR_KEY_IDX: key ID (u8, 0-3) + * @NL80211_ATTR_KEY_CIPHER: key cipher suite (u32, as defined by IEEE 802.11 + * section 7.3.2.25.1, e.g. 
0x000FAC04) + * @NL80211_ATTR_KEY_SEQ: transmit key sequence number (IV/PN) for TKIP and + * CCMP keys, each six bytes in little endian + * @NL80211_ATTR_KEY_DEFAULT: Flag attribute indicating the key is default key + * @NL80211_ATTR_KEY_DEFAULT_MGMT: Flag attribute indicating the key is the + * default management key + * @NL80211_ATTR_CIPHER_SUITES_PAIRWISE: For crypto settings for connect or + * other commands, indicates which pairwise cipher suites are used + * @NL80211_ATTR_CIPHER_SUITE_GROUP: For crypto settings for connect or + * other commands, indicates which group cipher suite is used + * + * @NL80211_ATTR_BEACON_INTERVAL: beacon interval in TU + * @NL80211_ATTR_DTIM_PERIOD: DTIM period for beaconing + * @NL80211_ATTR_BEACON_HEAD: portion of the beacon before the TIM IE + * @NL80211_ATTR_BEACON_TAIL: portion of the beacon after the TIM IE + * + * @NL80211_ATTR_STA_AID: Association ID for the station (u16) + * @NL80211_ATTR_STA_FLAGS: flags, nested element with NLA_FLAG attributes of + * &enum nl80211_sta_flags (deprecated, use %NL80211_ATTR_STA_FLAGS2) + * @NL80211_ATTR_STA_LISTEN_INTERVAL: listen interval as defined by + * IEEE 802.11 7.3.1.6 (u16). + * @NL80211_ATTR_STA_SUPPORTED_RATES: supported rates, array of supported + * rates as defined by IEEE 802.11 7.3.2.2 but without the length + * restriction (at most %NL80211_MAX_SUPP_RATES). + * @NL80211_ATTR_STA_VLAN: interface index of VLAN interface to move station + * to, or the AP interface the station was originally added to. + * @NL80211_ATTR_STA_INFO: information about a station, part of station info + * given for %NL80211_CMD_GET_STATION, nested attribute containing + * info as possible, see &enum nl80211_sta_info. + * + * @NL80211_ATTR_WIPHY_BANDS: Information about the operating bands, + * consisting of a nested array. + * + * @NL80211_ATTR_MESH_ID: mesh id (1-32 bytes). + * @NL80211_ATTR_STA_PLINK_ACTION: action to perform on the mesh peer link + * (see &enum nl80211_plink_action). 
+ * @NL80211_ATTR_MPATH_NEXT_HOP: MAC address of the next hop for a mesh path. + * @NL80211_ATTR_MPATH_INFO: information about a mesh_path, part of mesh path + * info given for %NL80211_CMD_GET_MPATH, nested attribute described at + * &enum nl80211_mpath_info. + * + * @NL80211_ATTR_MNTR_FLAGS: flags, nested element with NLA_FLAG attributes of + * &enum nl80211_mntr_flags. + * + * @NL80211_ATTR_REG_ALPHA2: an ISO-3166-alpha2 country code for which the + * current regulatory domain should be set to or is already set to. + * For example, 'CR', for Costa Rica. This attribute is used by the kernel + * to query the CRDA to retrieve one regulatory domain. This attribute can + * also be used by userspace to query the kernel for the currently set + * regulatory domain. We chose an alpha2 as that is also used by the + * IEEE-802.11 country information element to identify a country. + * Users can also simply ask the wireless core to set regulatory domain + * to a specific alpha2. + * @NL80211_ATTR_REG_RULES: a nested array of regulatory domain regulatory + * rules. + * + * @NL80211_ATTR_BSS_CTS_PROT: whether CTS protection is enabled (u8, 0 or 1) + * @NL80211_ATTR_BSS_SHORT_PREAMBLE: whether short preamble is enabled + * (u8, 0 or 1) + * @NL80211_ATTR_BSS_SHORT_SLOT_TIME: whether short slot time enabled + * (u8, 0 or 1) + * @NL80211_ATTR_BSS_BASIC_RATES: basic rates, array of basic + * rates in format defined by IEEE 802.11 7.3.2.2 but without the length + * restriction (at most %NL80211_MAX_SUPP_RATES). + * + * @NL80211_ATTR_HT_CAPABILITY: HT Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION) + * + * @NL80211_ATTR_SUPPORTED_IFTYPES: nested attribute containing all + * supported interface types, each a flag attribute with the number + * of the interface mode. + * + * @NL80211_ATTR_MGMT_SUBTYPE: Management frame subtype for + * %NL80211_CMD_SET_MGMT_EXTRA_IE. 
+ * + * @NL80211_ATTR_IE: Information element(s) data (used, e.g., with + * %NL80211_CMD_SET_MGMT_EXTRA_IE). + * + * @NL80211_ATTR_MAX_NUM_SCAN_SSIDS: number of SSIDs you can scan with + * a single scan request, a wiphy attribute. + * @NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS: number of SSIDs you can + * scan with a single scheduled scan request, a wiphy attribute. + * @NL80211_ATTR_MAX_SCAN_IE_LEN: maximum length of information elements + * that can be added to a scan request + * @NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN: maximum length of information + * elements that can be added to a scheduled scan request + * @NL80211_ATTR_MAX_MATCH_SETS: maximum number of sets that can be + * used with @NL80211_ATTR_SCHED_SCAN_MATCH, a wiphy attribute. + * + * @NL80211_ATTR_SCAN_FREQUENCIES: nested attribute with frequencies (in MHz) + * @NL80211_ATTR_SCAN_SSIDS: nested attribute with SSIDs, leave out for passive + * scanning and include a zero-length SSID (wildcard) for wildcard scan + * @NL80211_ATTR_BSS: scan result BSS + * + * @NL80211_ATTR_REG_INITIATOR: indicates who requested the regulatory domain + * currently in effect. This could be any of the %NL80211_REGDOM_SET_BY_* + * @NL80211_ATTR_REG_TYPE: indicates the type of the regulatory domain currently + * set. This can be one of the nl80211_reg_type (%NL80211_REGDOM_TYPE_*) + * + * @NL80211_ATTR_SUPPORTED_COMMANDS: wiphy attribute that specifies + * an array of command numbers (i.e. a mapping index to command number) + * that the driver for the given wiphy supports. 
+ * + * @NL80211_ATTR_FRAME: frame data (binary attribute), including frame header + * and body, but not FCS; used, e.g., with NL80211_CMD_AUTHENTICATE and + * NL80211_CMD_ASSOCIATE events + * @NL80211_ATTR_SSID: SSID (binary attribute, 0..32 octets) + * @NL80211_ATTR_AUTH_TYPE: AuthenticationType, see &enum nl80211_auth_type, + * represented as a u32 + * @NL80211_ATTR_REASON_CODE: ReasonCode for %NL80211_CMD_DEAUTHENTICATE and + * %NL80211_CMD_DISASSOCIATE, u16 + * + * @NL80211_ATTR_KEY_TYPE: Key Type, see &enum nl80211_key_type, represented as + * a u32 + * + * @NL80211_ATTR_FREQ_BEFORE: A channel which has suffered a regulatory change + * due to considerations from a beacon hint. This attribute reflects + * the state of the channel _before_ the beacon hint processing. This + * attribute consists of a nested attribute containing + * NL80211_FREQUENCY_ATTR_* + * @NL80211_ATTR_FREQ_AFTER: A channel which has suffered a regulatory change + * due to considerations from a beacon hint. This attribute reflects + * the state of the channel _after_ the beacon hint processing. This + * attribute consists of a nested attribute containing + * NL80211_FREQUENCY_ATTR_* + * + * @NL80211_ATTR_CIPHER_SUITES: a set of u32 values indicating the supported + * cipher suites + * + * @NL80211_ATTR_FREQ_FIXED: a flag indicating the IBSS should not try to look + * for other networks on different channels + * + * @NL80211_ATTR_TIMED_OUT: a flag indicating that an operation timed out; this + * is used, e.g., with %NL80211_CMD_AUTHENTICATE event + * + * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is + * used for the association (&enum nl80211_mfp, represented as a u32); + * this attribute can be used with %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_CONNECT requests. %NL80211_MFP_OPTIONAL is not allowed for + * %NL80211_CMD_ASSOCIATE since user space SME is expected and hence, it + * must have decided whether to use management frame protection or not. 
+ * Setting %NL80211_MFP_OPTIONAL with a %NL80211_CMD_CONNECT request will + * let the driver (or the firmware) decide whether to use MFP or not. + * + * @NL80211_ATTR_STA_FLAGS2: Attribute containing a + * &struct nl80211_sta_flag_update. + * + * @NL80211_ATTR_CONTROL_PORT: A flag indicating whether user space controls + * IEEE 802.1X port, i.e., sets/clears %NL80211_STA_FLAG_AUTHORIZED, in + * station mode. If the flag is included in %NL80211_CMD_ASSOCIATE + * request, the driver will assume that the port is unauthorized until + * authorized by user space. Otherwise, port is marked authorized by + * default in station mode. + * @NL80211_ATTR_CONTROL_PORT_ETHERTYPE: A 16-bit value indicating the + * ethertype that will be used for key negotiation. It can be + * specified with the associate and connect commands. If it is not + * specified, the value defaults to 0x888E (PAE, 802.1X). This + * attribute is also used as a flag in the wiphy information to + * indicate that protocols other than PAE are supported. + * @NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT: When included along with + * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, indicates that the custom + * ethertype frames used for key negotiation must not be encrypted. + * @NL80211_ATTR_CONTROL_PORT_OVER_NL80211: A flag indicating whether control + * port frames (e.g. of type given in %NL80211_ATTR_CONTROL_PORT_ETHERTYPE) + * will be sent directly to the network interface or sent via the NL80211 + * socket. If this attribute is missing, then legacy behavior of sending + * control port frames directly to the network interface is used. If the + * flag is included, then control port frames are sent over NL80211 instead + * using %NL80211_CMD_CONTROL_PORT_FRAME. If control port routing over NL80211 is + * to be used then userspace must also use the %NL80211_ATTR_SOCKET_OWNER + * flag. When used with %NL80211_ATTR_CONTROL_PORT_NO_PREAUTH, pre-auth + * frames are not forwarded over the control port. 
+ * + * @NL80211_ATTR_TESTDATA: Testmode data blob, passed through to the driver. + * We recommend using nested, driver-specific attributes within this. + * + * @NL80211_ATTR_DISCONNECTED_BY_AP: A flag indicating that the DISCONNECT + * event was due to the AP disconnecting the station, and not due to + * a local disconnect request. + * @NL80211_ATTR_STATUS_CODE: StatusCode for the %NL80211_CMD_CONNECT + * event (u16) + * @NL80211_ATTR_PRIVACY: Flag attribute, used with connect(), indicating + * that protected APs should be used. This is also used with NEW_BEACON to + * indicate that the BSS is to use protection. + * + * @NL80211_ATTR_CIPHERS_PAIRWISE: Used with CONNECT, ASSOCIATE, and NEW_BEACON + * to indicate which unicast key ciphers will be used with the connection + * (an array of u32). + * @NL80211_ATTR_CIPHER_GROUP: Used with CONNECT, ASSOCIATE, and NEW_BEACON to + * indicate which group key cipher will be used with the connection (a + * u32). + * @NL80211_ATTR_WPA_VERSIONS: Used with CONNECT, ASSOCIATE, and NEW_BEACON to + * indicate which WPA version(s) the AP we want to associate with is using + * (a u32 with flags from &enum nl80211_wpa_versions). + * @NL80211_ATTR_AKM_SUITES: Used with CONNECT, ASSOCIATE, and NEW_BEACON to + * indicate which key management algorithm(s) to use (an array of u32). + * This attribute is also sent in response to @NL80211_CMD_GET_WIPHY, + * indicating the supported AKM suites, intended for specific drivers which + * implement SME and have constraints on which AKMs are supported and also + * the cases where an AKM support is offloaded to the driver/firmware. + * If there is no such notification from the driver, user space should + * assume the driver supports all the AKM suites. + * + * @NL80211_ATTR_REQ_IE: (Re)association request information elements as + * sent out by the card, for ROAM and successful CONNECT events. 
+ * @NL80211_ATTR_RESP_IE: (Re)association response information elements as + * sent by peer, for ROAM and successful CONNECT events. + * + * @NL80211_ATTR_PREV_BSSID: previous BSSID, to be used in ASSOCIATE and CONNECT + * commands to specify a request to reassociate within an ESS, i.e., to use + * Reassociate Request frame (with the value of this attribute in the + * Current AP address field) instead of Association Request frame which is + * used for the initial association to an ESS. + * + * @NL80211_ATTR_KEY: key information in a nested attribute with + * %NL80211_KEY_* sub-attributes + * @NL80211_ATTR_KEYS: array of keys for static WEP keys for connect() + * and join_ibss(), key information is in a nested attribute each + * with %NL80211_KEY_* sub-attributes + * + * @NL80211_ATTR_PID: Process ID of a network namespace. + * + * @NL80211_ATTR_GENERATION: Used to indicate consistent snapshots for + * dumps. This number increases whenever the object list being + * dumped changes, and as such userspace can verify that it has + * obtained a complete and consistent snapshot by verifying that + * all dump messages contain the same generation number. If it + * changed then the list changed and the dump should be repeated + * completely from scratch. + * + * @NL80211_ATTR_4ADDR: Use 4-address frames on a virtual interface + * + * @NL80211_ATTR_SURVEY_INFO: survey information about a channel, part of + * the survey response for %NL80211_CMD_GET_SURVEY, nested attribute + * containing info as possible, see &enum survey_info. + * + * @NL80211_ATTR_PMKID: PMK material for PMKSA caching. + * @NL80211_ATTR_MAX_NUM_PMKIDS: maximum number of PMKIDs a firmware can + * cache, a wiphy attribute. + * + * @NL80211_ATTR_DURATION: Duration of an operation in milliseconds, u32. + * @NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION: Device attribute that + * specifies the maximum duration that can be requested with the + * remain-on-channel operation, in milliseconds, u32. 
+ * + * @NL80211_ATTR_COOKIE: Generic 64-bit cookie to identify objects. + * + * @NL80211_ATTR_TX_RATES: Nested set of attributes + * (enum nl80211_tx_rate_attributes) describing TX rates per band. The + * enum nl80211_band value is used as the index (nla_type()) of the nested + * data. If a band is not included, it will be configured to allow all + * rates based on negotiated supported rates information. This attribute + * is used with %NL80211_CMD_SET_TX_BITRATE_MASK and with starting AP, + * and joining mesh networks (not IBSS yet). In the latter case, it must + * specify just a single bitrate, which is to be used for the beacon. + * The driver must also specify support for this with the extended + * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, + * NL80211_EXT_FEATURE_BEACON_RATE_HT, + * NL80211_EXT_FEATURE_BEACON_RATE_VHT and + * NL80211_EXT_FEATURE_BEACON_RATE_HE. + * + * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain + * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. + * @NL80211_ATTR_FRAME_TYPE: A u16 indicating the frame type/subtype for the + * @NL80211_CMD_REGISTER_FRAME command. + * @NL80211_ATTR_TX_FRAME_TYPES: wiphy capability attribute, which is a + * nested attribute of %NL80211_ATTR_FRAME_TYPE attributes, containing + * information about which frame types can be transmitted with + * %NL80211_CMD_FRAME. + * @NL80211_ATTR_RX_FRAME_TYPES: wiphy capability attribute, which is a + * nested attribute of %NL80211_ATTR_FRAME_TYPE attributes, containing + * information about which frame types can be registered for RX. + * + * @NL80211_ATTR_ACK: Flag attribute indicating that the frame was + * acknowledged by the recipient. + * + * @NL80211_ATTR_PS_STATE: powersave state, using &enum nl80211_ps_state values. + * + * @NL80211_ATTR_CQM: connection quality monitor configuration in a + * nested attribute with %NL80211_ATTR_CQM_* sub-attributes. 
+ * + * @NL80211_ATTR_LOCAL_STATE_CHANGE: Flag attribute to indicate that a command + * is requesting a local authentication/association state change without + * invoking actual management frame exchange. This can be used with + * NL80211_CMD_AUTHENTICATE, NL80211_CMD_DEAUTHENTICATE, + * NL80211_CMD_DISASSOCIATE. + * + * @NL80211_ATTR_AP_ISOLATE: (AP mode) Do not forward traffic between stations + * connected to this BSS. + * + * @NL80211_ATTR_WIPHY_TX_POWER_SETTING: Transmit power setting type. See + * &enum nl80211_tx_power_setting for possible values. + * @NL80211_ATTR_WIPHY_TX_POWER_LEVEL: Transmit power level in signed mBm units. + * This is used in association with @NL80211_ATTR_WIPHY_TX_POWER_SETTING + * for non-automatic settings. + * + * @NL80211_ATTR_SUPPORT_IBSS_RSN: The device supports IBSS RSN, which mostly + * means support for per-station GTKs. + * + * @NL80211_ATTR_WIPHY_ANTENNA_TX: Bitmap of allowed antennas for transmitting. + * This can be used to mask out antennas which are not attached or should + * not be used for transmitting. If an antenna is not selected in this + * bitmap the hardware is not allowed to transmit on this antenna. + * + * Each bit represents one antenna, starting with antenna 1 at the first + * bit. Depending on which antennas are selected in the bitmap, 802.11n + * drivers can derive which chainmasks to use (if all antennas belonging to + * a particular chain are disabled this chain should be disabled) and if + * a chain has diversity antennas whether diversity should be used or not. + * HT capabilities (STBC, TX Beamforming, Antenna selection) can be + * derived from the available chains after applying the antenna mask. + * Non-802.11n drivers can derive whether to use diversity or not. + * Drivers may reject configurations or RX/TX mask combinations they cannot + * support by returning -EINVAL. + * + * @NL80211_ATTR_WIPHY_ANTENNA_RX: Bitmap of allowed antennas for receiving. 
+ * This can be used to mask out antennas which are not attached or should + * not be used for receiving. If an antenna is not selected in this bitmap + * the hardware should not be configured to receive on this antenna. + * For a more detailed description see @NL80211_ATTR_WIPHY_ANTENNA_TX. + * + * @NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX: Bitmap of antennas which are available + * for configuration as TX antennas via the above parameters. + * + * @NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX: Bitmap of antennas which are available + * for configuration as RX antennas via the above parameters. + * + * @NL80211_ATTR_MCAST_RATE: Multicast tx rate (in 100 kbps) for IBSS + * + * @NL80211_ATTR_OFFCHANNEL_TX_OK: For management frame TX, the frame may be + * transmitted on another channel when the channel given doesn't match + * the current channel. If the current channel doesn't match and this + * flag isn't set, the frame will be rejected. This is also used as an + * nl80211 capability flag. + * + * @NL80211_ATTR_BSS_HT_OPMODE: HT operation mode (u16) + * + * @NL80211_ATTR_KEY_DEFAULT_TYPES: A nested attribute containing flags + * attributes, specifying what a key should be set as default as. + * See &enum nl80211_key_default_types. + * + * @NL80211_ATTR_MESH_SETUP: Optional mesh setup parameters. These cannot be + * changed once the mesh is active. + * @NL80211_ATTR_MESH_CONFIG: Mesh configuration parameters, a nested attribute + * containing attributes from &enum nl80211_meshconf_params. + * @NL80211_ATTR_SUPPORT_MESH_AUTH: Currently, this means the underlying driver + * allows auth frames in a mesh to be passed to userspace for processing via + * the @NL80211_MESH_SETUP_USERSPACE_AUTH flag. + * @NL80211_ATTR_STA_PLINK_STATE: The state of a mesh peer link as defined in + * &enum nl80211_plink_state. Used when userspace is driving the peer link + * management state machine. @NL80211_MESH_SETUP_USERSPACE_AMPE or + * @NL80211_MESH_SETUP_USERSPACE_MPM must be enabled. 
+ * + * @NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED: indicates, as part of the wiphy + * capabilities, the supported WoWLAN triggers + * @NL80211_ATTR_WOWLAN_TRIGGERS: used by %NL80211_CMD_SET_WOWLAN to + * indicate which WoW triggers should be enabled. This is also + * used by %NL80211_CMD_GET_WOWLAN to get the currently enabled WoWLAN + * triggers. + * + * @NL80211_ATTR_SCHED_SCAN_INTERVAL: Interval between scheduled scan + * cycles, in msecs. + * + * @NL80211_ATTR_SCHED_SCAN_MATCH: Nested attribute with one or more + * sets of attributes to match during scheduled scans. Only BSSs + * that match any of the sets will be reported. These are + * pass-thru filter rules. + * For a match to succeed, the BSS must match all attributes of a + * set. Since not every hardware supports matching all types of + * attributes, there is no guarantee that the reported BSSs are + * fully complying with the match sets and userspace needs to be + * able to ignore them by itself. + * Thus, the implementation is somewhat hardware-dependent, but + * this is only an optimization and the userspace application + * needs to handle all the non-filtered results anyway. + * If the match attributes don't make sense when combined with + * the values passed in @NL80211_ATTR_SCAN_SSIDS (eg. if an SSID + * is included in the probe request, but the match attributes + * will never let it go through), -EINVAL may be returned. + * If omitted, no filtering is done. + * + * @NL80211_ATTR_INTERFACE_COMBINATIONS: Nested attribute listing the supported + * interface combinations. In each nested item, it contains attributes + * defined in &enum nl80211_if_combination_attrs. + * @NL80211_ATTR_SOFTWARE_IFTYPES: Nested attribute (just like + * %NL80211_ATTR_SUPPORTED_IFTYPES) containing the interface types that + * are managed in software: interfaces of these types aren't subject to + * any restrictions in their number or combinations. 
+ * + * @NL80211_ATTR_REKEY_DATA: nested attribute containing the information + * necessary for GTK rekeying in the device, see &enum nl80211_rekey_data. + * + * @NL80211_ATTR_SCAN_SUPP_RATES: rates per band to be advertised as supported in scan, + * nested array attribute containing an entry for each band, with the entry + * being a list of supported rates as defined by IEEE 802.11 7.3.2.2 but + * without the length restriction (at most %NL80211_MAX_SUPP_RATES). + * + * @NL80211_ATTR_HIDDEN_SSID: indicates whether SSID is to be hidden from Beacon + * and Probe Response (when response to wildcard Probe Request); see + * &enum nl80211_hidden_ssid, represented as a u32 + * + * @NL80211_ATTR_IE_PROBE_RESP: Information element(s) for Probe Response frame. + * This is used with %NL80211_CMD_NEW_BEACON and %NL80211_CMD_SET_BEACON to + * provide extra IEs (e.g., WPS/P2P IE) into Probe Response frames when the + * driver (or firmware) replies to Probe Request frames. + * @NL80211_ATTR_IE_ASSOC_RESP: Information element(s) for (Re)Association + * Response frames. This is used with %NL80211_CMD_NEW_BEACON and + * %NL80211_CMD_SET_BEACON to provide extra IEs (e.g., WPS/P2P IE) into + * (Re)Association Response frames when the driver (or firmware) replies to + * (Re)Association Request frames. + * + * @NL80211_ATTR_STA_WME: Nested attribute containing the wme configuration + * of the station, see &enum nl80211_sta_wme_attr. + * @NL80211_ATTR_SUPPORT_AP_UAPSD: the device supports uapsd when working + * as AP. + * + * @NL80211_ATTR_ROAM_SUPPORT: Indicates whether the firmware is capable of + * roaming to another AP in the same ESS if the signal level is low. + * + * @NL80211_ATTR_PMKSA_CANDIDATE: Nested attribute containing the PMKSA caching + * candidate information, see &enum nl80211_pmksa_candidate_attr. + * + * @NL80211_ATTR_TX_NO_CCK_RATE: Indicates whether to use CCK rate or not + * for management frames transmission. 
In order to avoid p2p probe/action + * frames being transmitted at CCK rate in the 2GHz band, the user space + * applications use this attribute. + * This attribute is used with %NL80211_CMD_TRIGGER_SCAN and + * %NL80211_CMD_FRAME commands. + * + * @NL80211_ATTR_TDLS_ACTION: Low level TDLS action code (e.g. link setup + * request, link setup confirm, link teardown, etc.). Values are + * described in the TDLS (802.11z) specification. + * @NL80211_ATTR_TDLS_DIALOG_TOKEN: Non-zero token for uniquely identifying a + * TDLS conversation between two devices. + * @NL80211_ATTR_TDLS_OPERATION: High level TDLS operation; see + * &enum nl80211_tdls_operation, represented as a u8. + * @NL80211_ATTR_TDLS_SUPPORT: A flag indicating the device can operate + * as a TDLS peer sta. + * @NL80211_ATTR_TDLS_EXTERNAL_SETUP: The TDLS discovery/setup and teardown + * procedures should be performed by sending TDLS packets via + * %NL80211_CMD_TDLS_MGMT. Otherwise %NL80211_CMD_TDLS_OPER should be + * used for asking the driver to perform a TDLS operation. + * + * @NL80211_ATTR_DEVICE_AP_SME: This u32 attribute may be listed for devices + * that have AP support to indicate that they have the AP SME integrated + * with support for the features listed in this attribute, see + * &enum nl80211_ap_sme_features. + * + * @NL80211_ATTR_DONT_WAIT_FOR_ACK: Used with %NL80211_CMD_FRAME, this tells + * the driver to not wait for an acknowledgement. Note that due to this, + * it will also not give a status callback nor return a cookie. This is + * mostly useful for probe responses to save airtime. + * + * @NL80211_ATTR_FEATURE_FLAGS: This u32 attribute contains flags from + * &enum nl80211_feature_flags and is advertised in wiphy information. + * @NL80211_ATTR_PROBE_RESP_OFFLOAD: Indicates that the HW responds to probe + * requests while operating in AP-mode. + * This attribute holds a bitmap of the supported protocols for + * offloading (see &enum nl80211_probe_resp_offload_support_attr). 
+ * + * @NL80211_ATTR_PROBE_RESP: Probe Response template data. Contains the entire + * probe-response frame. The DA field in the 802.11 header is zero-ed out, + * to be filled by the FW. + * @NL80211_ATTR_DISABLE_HT: Force HT capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_VHT: Force VHT capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_HE: Force HE capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_HT_CAPABILITY_MASK: Specify which bits of the + * ATTR_HT_CAPABILITY to which attention should be paid. + * Currently, only mac80211 NICs support this feature. + * The values that may be configured are: + * MCS rates, MAX-AMSDU, HT-20-40 and HT_CAP_SGI_40 + * AMPDU density and AMPDU factor. + * All values are treated as suggestions and may be ignored + * by the driver as required. The actual values may be seen in + * the station debugfs ht_caps file. + * + * @NL80211_ATTR_DFS_REGION: region for regulatory rules which this country + * abides by when initiating radiation on DFS channels. A country maps + * to one DFS region. + * + * @NL80211_ATTR_NOACK_MAP: This u16 bitmap contains the No Ack Policy of + * up to 16 TIDs. + * + * @NL80211_ATTR_INACTIVITY_TIMEOUT: timeout value in seconds, this can be + * used by the drivers which have MLME in firmware and do not have support + * to report per station tx/rx activity to free up the station entry from + * the list. This needs to be used when the driver advertises the + * capability to timeout the stations.
+ * + * @NL80211_ATTR_RX_SIGNAL_DBM: signal strength in dBm (as a 32-bit int); + * this attribute is (depending on the driver capabilities) added to + * received frames indicated with %NL80211_CMD_FRAME. + * + * @NL80211_ATTR_BG_SCAN_PERIOD: Background scan period in seconds + * or 0 to disable background scan. + * + * @NL80211_ATTR_USER_REG_HINT_TYPE: type of regulatory hint passed from + * userspace. If unset it is assumed the hint comes directly from + * a user. If set code could specify exactly what type of source + * was used to provide the hint. For the different types of + * allowed user regulatory hints see nl80211_user_reg_hint_type. + * + * @NL80211_ATTR_CONN_FAILED_REASON: The reason for which AP has rejected + * the connection request from a station. nl80211_connect_failed_reason + * enum has different reasons of connection failure. + * + * @NL80211_ATTR_AUTH_DATA: Fields and elements in Authentication frames. + * This contains the authentication frame body (non-IE and IE data), + * excluding the Authentication algorithm number, i.e., starting at the + * Authentication transaction sequence number field. It is used with + * authentication algorithms that need special fields to be added into + * the frames (SAE and FILS). Currently, only the SAE cases use the + * initial two fields (Authentication transaction sequence number and + * Status code). However, those fields are included in the attribute data + * for all authentication algorithms to keep the attribute definition + * consistent. + * + * @NL80211_ATTR_VHT_CAPABILITY: VHT Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION) + * + * @NL80211_ATTR_SCAN_FLAGS: scan request control flags (u32) + * + * @NL80211_ATTR_P2P_CTWINDOW: P2P GO Client Traffic Window (u8), used with + * the START_AP and SET_BSS commands + * @NL80211_ATTR_P2P_OPPPS: P2P GO opportunistic PS (u8), used with the + * START_AP and SET_BSS commands. 
This can have the values 0 or 1; + * if not given in START_AP 0 is assumed, if not given in SET_BSS + * no change is made. + * + * @NL80211_ATTR_LOCAL_MESH_POWER_MODE: local mesh STA link-specific power mode + * defined in &enum nl80211_mesh_power_mode. + * + * @NL80211_ATTR_ACL_POLICY: ACL policy, see &enum nl80211_acl_policy, + * carried in a u32 attribute + * + * @NL80211_ATTR_MAC_ADDRS: Array of nested MAC addresses, used for + * MAC ACL. + * + * @NL80211_ATTR_MAC_ACL_MAX: u32 attribute to advertise the maximum + * number of MAC addresses that a device can support for MAC + * ACL. + * + * @NL80211_ATTR_RADAR_EVENT: Type of radar event for notification to userspace, + * contains a value of enum nl80211_radar_event (u32). + * + * @NL80211_ATTR_EXT_CAPA: 802.11 extended capabilities that the kernel driver + * has and handles. The format is the same as the IE contents. See + * 802.11-2012 8.4.2.29 for more information. + * @NL80211_ATTR_EXT_CAPA_MASK: Extended capabilities that the kernel driver + * has set in the %NL80211_ATTR_EXT_CAPA value, for multibit fields. + * + * @NL80211_ATTR_STA_CAPABILITY: Station capabilities (u16) are advertised to + * the driver, e.g., to enable TDLS power save (PU-APSD). + * + * @NL80211_ATTR_STA_EXT_CAPABILITY: Station extended capabilities are + * advertised to the driver, e.g., to enable TDLS off channel operations + * and PU-APSD. + * + * @NL80211_ATTR_PROTOCOL_FEATURES: global nl80211 feature flags, see + * &enum nl80211_protocol_features, the attribute is a u32. + * + * @NL80211_ATTR_SPLIT_WIPHY_DUMP: flag attribute, userspace supports + * receiving the data for a single wiphy split across multiple + * messages, given with wiphy dump message + * + * @NL80211_ATTR_MDID: Mobility Domain Identifier + * + * @NL80211_ATTR_IE_RIC: Resource Information Container Information + * Element + * + * @NL80211_ATTR_CRIT_PROT_ID: critical protocol identifier requiring increased + * reliability, see &enum nl80211_crit_proto_id (u16). 
+ * @NL80211_ATTR_MAX_CRIT_PROT_DURATION: duration in milliseconds in which + * the connection should have increased reliability (u16). + * + * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16). + * This is similar to @NL80211_ATTR_STA_AID but with a difference of being + * allowed to be used with the first @NL80211_CMD_SET_STATION command to + * update a TDLS peer STA entry. + * + * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. + * + * @NL80211_ATTR_CH_SWITCH_COUNT: u32 attribute specifying the number of TBTT's + * until the channel switch event. + * @NL80211_ATTR_CH_SWITCH_BLOCK_TX: flag attribute specifying that transmission + * must be blocked on the current channel (before the channel switch + * operation). Also included in the channel switch started event if quiet + * was requested by the AP. + * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information + * for the time while performing a channel switch. + * @NL80211_ATTR_CNTDWN_OFFS_BEACON: An array of offsets (u16) to the channel + * switch or color change counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CNTDWN_OFFS_PRESP: An array of offsets (u16) to the channel + * switch or color change counters in the probe response (%NL80211_ATTR_PROBE_RESP). + * + * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. + * As specified in the &enum nl80211_rxmgmt_flags. + * + * @NL80211_ATTR_STA_SUPPORTED_CHANNELS: array of supported channels. + * + * @NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES: array of supported + * operating classes. + * + * @NL80211_ATTR_HANDLE_DFS: A flag indicating whether user space + * controls DFS operation in IBSS mode. If the flag is included in + * %NL80211_CMD_JOIN_IBSS request, the driver will allow use of DFS + * channels and reports radar events to userspace. Userspace is required + * to react to radar events, e.g. initiate a channel switch or leave the + * IBSS network. 
+ * + * @NL80211_ATTR_SUPPORT_5_MHZ: A flag indicating that the device supports + * 5 MHz channel bandwidth. + * @NL80211_ATTR_SUPPORT_10_MHZ: A flag indicating that the device supports + * 10 MHz channel bandwidth. + * + * @NL80211_ATTR_OPMODE_NOTIF: Operating mode field from Operating Mode + * Notification Element based on association request when used with + * %NL80211_CMD_NEW_STATION or %NL80211_CMD_SET_STATION (only when + * %NL80211_FEATURE_FULL_AP_CLIENT_STATE is supported, or with TDLS); + * u8 attribute. + * + * @NL80211_ATTR_VENDOR_ID: The vendor ID, either a 24-bit OUI or, if + * %NL80211_VENDOR_ID_IS_LINUX is set, a special Linux ID (not used yet) + * @NL80211_ATTR_VENDOR_SUBCMD: vendor sub-command + * @NL80211_ATTR_VENDOR_DATA: data for the vendor command, if any; this + * attribute is also used for vendor command feature advertisement + * @NL80211_ATTR_VENDOR_EVENTS: used for event list advertising in the wiphy + * info, containing a nested array of possible events + * + * @NL80211_ATTR_QOS_MAP: IP DSCP mapping for Interworking QoS mapping. This + * data is in the format defined for the payload of the QoS Map Set element + * in IEEE Std 802.11-2012, 8.4.2.97. + * + * @NL80211_ATTR_MAC_HINT: MAC address recommendation as initial BSS + * @NL80211_ATTR_WIPHY_FREQ_HINT: frequency of the recommended initial BSS + * + * @NL80211_ATTR_MAX_AP_ASSOC_STA: Device attribute that indicates how many + * associated stations are supported in AP mode (including P2P GO); u32. + * Since drivers may not have a fixed limit on the maximum number (e.g., + * other concurrent operations may affect this), drivers are allowed to + * advertise values that cannot always be met. In such cases, an attempt + * to add a new station entry with @NL80211_CMD_NEW_STATION may fail. + * + * @NL80211_ATTR_CSA_C_OFFSETS_TX: An array of csa counter offsets (u16) which + * should be updated when the frame is transmitted. 
+ * @NL80211_ATTR_MAX_CSA_COUNTERS: U8 attribute used to advertise the maximum + * supported number of csa counters. + * + * @NL80211_ATTR_TDLS_PEER_CAPABILITY: flags for TDLS peer capabilities, u32. + * As specified in the &enum nl80211_tdls_peer_capability. + * + * @NL80211_ATTR_SOCKET_OWNER: Flag attribute, if set during interface + * creation then the new interface will be owned by the netlink socket + * that created it and will be destroyed when the socket is closed. + * If set during scheduled scan start then the new scan req will be + * owned by the netlink socket that created it and the scheduled scan will + * be stopped when the socket is closed. + * If set during configuration of regulatory indoor operation then the + * regulatory indoor configuration would be owned by the netlink socket + * that configured the indoor setting, and the indoor operation would be + * cleared when the socket is closed. + * If set during NAN interface creation, the interface will be destroyed + * if the socket is closed just like any other interface. Moreover, NAN + * notifications will be sent in unicast to that socket. Without this + * attribute, the notifications will be sent to the %NL80211_MCGRP_NAN + * multicast group. + * If set during %NL80211_CMD_ASSOCIATE or %NL80211_CMD_CONNECT the + * station will deauthenticate when the socket is closed. + * If set during %NL80211_CMD_JOIN_IBSS the IBSS will be automatically + * torn down when the socket is closed. + * If set during %NL80211_CMD_JOIN_MESH the mesh setup will be + * automatically torn down when the socket is closed. + * If set during %NL80211_CMD_START_AP the AP will be automatically + * disabled when the socket is closed. + * + * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is + * the TDLS link initiator. + * + * @NL80211_ATTR_USE_RRM: flag for indicating whether the current connection + * shall support Radio Resource Measurements (11k). 
This attribute can be + * used with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests. + * User space applications are expected to use this flag only if the + * underlying device supports these minimal RRM features: + * %NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES, + * %NL80211_FEATURE_QUIET, + * Or, if global RRM is supported, see: + * %NL80211_EXT_FEATURE_RRM + * If this flag is used, driver must add the Power Capabilities IE to the + * association request. In addition, it must also set the RRM capability + * flag in the association request's Capability Info field. + * + * @NL80211_ATTR_WIPHY_DYN_ACK: flag attribute used to enable ACK timeout + * estimation algorithm (dynack). In order to activate dynack + * %NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower + * drivers to indicate dynack capability. Dynack is automatically disabled + * setting valid value for coverage class. + * + * @NL80211_ATTR_TSID: a TSID value (u8 attribute) + * @NL80211_ATTR_USER_PRIO: user priority value (u8 attribute) + * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds + * (per second) (u16 attribute) + * + * @NL80211_ATTR_SMPS_MODE: SMPS mode to use (ap mode). see + * &enum nl80211_smps_mode. + * + * @NL80211_ATTR_OPER_CLASS: operating class + * + * @NL80211_ATTR_MAC_MASK: MAC address mask + * + * @NL80211_ATTR_WIPHY_SELF_MANAGED_REG: flag attribute indicating this device + * is self-managing its regulatory information and any regulatory domain + * obtained from it is coming from the device's wiphy and not the global + * cfg80211 regdomain. + * + * @NL80211_ATTR_EXT_FEATURES: extended feature flags contained in a byte + * array. The feature flags are identified by their bit index (see &enum + * nl80211_ext_feature_index). The bit index is ordered starting at the + * least-significant bit of the first byte in the array, ie. bit index 0 + * is located at bit 0 of byte 0. bit index 25 would be located at bit 1 + * of byte 3 (u8 array). 
+ * + * @NL80211_ATTR_SURVEY_RADIO_STATS: Request overall radio statistics to be + * returned along with other survey data. If set, @NL80211_CMD_GET_SURVEY + * may return a survey entry without a channel indicating global radio + * statistics (only some values are valid and make sense.) + * For devices that don't return such an entry even then, the information + * should be contained in the result as the sum of the respective counters + * over all channels. + * + * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before the first cycle of a + * scheduled scan is started. Or the delay before a WoWLAN + * net-detect scan is started, counting from the moment the + * system is suspended. This value is a u32, in seconds. + * + * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device + * is operating in an indoor environment. + * + * @NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS: maximum number of scan plans for + * scheduled scan supported by the device (u32), a wiphy attribute. + * @NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL: maximum interval (in seconds) for + * a scan plan (u32), a wiphy attribute. + * @NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS: maximum number of iterations in + * a scan plan (u32), a wiphy attribute. + * @NL80211_ATTR_SCHED_SCAN_PLANS: a list of scan plans for scheduled scan. + * Each scan plan defines the number of scan iterations and the interval + * between scans. The last scan plan will always run infinitely, + * thus it must not specify the number of iterations, only the interval + * between scans. The scan plans are executed sequentially. + * Each scan plan is a nested attribute of &enum nl80211_sched_scan_plan. + * @NL80211_ATTR_PBSS: flag attribute. If set it means operate + * in a PBSS. Specified in %NL80211_CMD_CONNECT to request + * connecting to a PCP, and in %NL80211_CMD_START_AP to start + * a PCP instead of AP. Relevant for DMG networks only. + * @NL80211_ATTR_BSS_SELECT: nested attribute for driver supporting the + * BSS selection feature.
When used with %NL80211_CMD_GET_WIPHY it contains + * attributes according to &enum nl80211_bss_select_attr to indicate what + * BSS selection behaviours are supported. When used with %NL80211_CMD_CONNECT + * it contains the behaviour-specific attribute containing the parameters for + * BSS selection to be done by driver and/or firmware. + * + * @NL80211_ATTR_STA_SUPPORT_P2P_PS: whether P2P PS mechanism is supported + * or not. u8, one of the values of &enum nl80211_sta_p2p_ps_status + * + * @NL80211_ATTR_PAD: attribute used for padding for 64-bit alignment + * + * @NL80211_ATTR_IFTYPE_EXT_CAPA: Nested attribute of the following attributes: + * %NL80211_ATTR_IFTYPE, %NL80211_ATTR_EXT_CAPA, + * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities and + * other interface-type specific capabilities per interface type. For MLO, + * %NL80211_ATTR_EML_CAPABILITY and %NL80211_ATTR_MLD_CAPA_AND_OPS are + * present. + * + * @NL80211_ATTR_MU_MIMO_GROUP_DATA: array of 24 bytes that defines a MU-MIMO + * groupID for monitor mode. + * The first 8 bytes are a mask that defines the membership in each + * group (there are 64 groups, group 0 and 63 are reserved), + * each bit represents a group and set to 1 for being a member in + * that group and 0 for not being a member. + * The remaining 16 bytes define the position in each group: 2 bits for + * each group. + * (smaller group numbers represented on most significant bits and bigger + * group numbers on least significant bits.) + * This attribute is used only if all interfaces are in monitor mode. + * Set this attribute in order to monitor packets using the given MU-MIMO + * groupID data. + * to turn off that feature set all the bits of the groupID to zero. + * @NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR: mac address for the sniffer to follow + * when using MU-MIMO air sniffer. + * to turn that feature off set an invalid mac address + * (e.g.
FF:FF:FF:FF:FF:FF) + * + * @NL80211_ATTR_SCAN_START_TIME_TSF: The time at which the scan was actually + * started (u64). The time is the TSF of the BSS the interface that + * requested the scan is connected to (if available, otherwise this + * attribute must not be included). + * @NL80211_ATTR_SCAN_START_TIME_TSF_BSSID: The BSS according to which + * %NL80211_ATTR_SCAN_START_TIME_TSF is set. + * @NL80211_ATTR_MEASUREMENT_DURATION: measurement duration in TUs (u16). If + * %NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY is not set, this is the + * maximum measurement duration allowed. This attribute is used with + * measurement requests. It can also be used with %NL80211_CMD_TRIGGER_SCAN + * if the scan is used for beacon report radio measurement. + * @NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY: flag attribute that indicates + * that the duration specified with %NL80211_ATTR_MEASUREMENT_DURATION is + * mandatory. If this flag is not set, the duration is the maximum duration + * and the actual measurement duration may be shorter. + * + * @NL80211_ATTR_MESH_PEER_AID: Association ID for the mesh peer (u16). This is + * used to pull the stored data for mesh peer in power save state. + * + * @NL80211_ATTR_NAN_MASTER_PREF: the master preference to be used by + * %NL80211_CMD_START_NAN and optionally with + * %NL80211_CMD_CHANGE_NAN_CONFIG. Its type is u8 and it can't be 0. + * Also, values 1 and 255 are reserved for certification purposes and + * should not be used during a normal device operation. + * @NL80211_ATTR_BANDS: operating bands configuration. This is a u32 + * bitmask of BIT(NL80211_BAND_*) as described in %enum + * nl80211_band. For instance, for NL80211_BAND_2GHZ, bit 0 + * would be set. This attribute is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG, and + * it is optional. If no bands are set, it means don't-care and + * the device will decide what to use. + * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. 
See + * &enum nl80211_nan_func_attributes for description of this nested + * attribute. + * @NL80211_ATTR_NAN_MATCH: used to report a match. This is a nested attribute. + * See &enum nl80211_nan_match_attributes. + * @NL80211_ATTR_FILS_KEK: KEK for FILS (Re)Association Request/Response frame + * protection. + * @NL80211_ATTR_FILS_NONCES: Nonces (part of AAD) for FILS (Re)Association + * Request/Response frame protection. This attribute contains the 16 octet + * STA Nonce followed by 16 octets of AP Nonce. + * + * @NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED: Indicates whether or not multicast + * packets should be sent out as unicast to all stations (flag attribute). + * + * @NL80211_ATTR_BSSID: The BSSID of the AP. Note that %NL80211_ATTR_MAC is also + * used in various commands/events for specifying the BSSID. + * + * @NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI: Relative RSSI threshold by which + * other BSSs have to be better or slightly worse than the current + * connected BSS so that they get reported to user space. + * This will give an opportunity to userspace to consider connecting to + * other matching BSSs which have better or slightly worse RSSI than + * the current connected BSS by using an offloaded operation to avoid + * unnecessary wakeups. + * + * @NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST: When present the RSSI level for BSSs in + * the specified band is to be adjusted before doing + * %NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI based comparison to figure out + * better BSSs. The attribute value is a packed structure + * value as specified by &struct nl80211_bss_select_rssi_adjust. + * + * @NL80211_ATTR_TIMEOUT_REASON: The reason for which an operation timed out. + * u32 attribute with an &enum nl80211_timeout_reason value. This is used, + * e.g., with %NL80211_CMD_CONNECT event. + * + * @NL80211_ATTR_FILS_ERP_USERNAME: EAP Re-authentication Protocol (ERP) + * username part of NAI used to refer to keys rRK and rIK. This is used with + * %NL80211_CMD_CONNECT.
+ * + * @NL80211_ATTR_FILS_ERP_REALM: EAP Re-authentication Protocol (ERP) realm part + * of NAI specifying the domain name of the ER server. This is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM: Unsigned 16-bit ERP next sequence number + * to use in ERP messages. This is used in generating the FILS wrapped data + * for FILS authentication and is used with %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_RRK: ERP re-authentication Root Key (rRK) for the + * NAI specified by %NL80211_ATTR_FILS_ERP_USERNAME and + * %NL80211_ATTR_FILS_ERP_REALM. This is used for generating rIK and rMSK + * from successful FILS authentication and is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_CACHE_ID: A 2-octet identifier advertized by a FILS AP + * identifying the scope of PMKSAs. This is used with + * @NL80211_CMD_SET_PMKSA and @NL80211_CMD_DEL_PMKSA. + * + * @NL80211_ATTR_PMK: attribute for passing PMK key material. Used with + * %NL80211_CMD_SET_PMKSA for the PMKSA identified by %NL80211_ATTR_PMKID. + * For %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP it is used to provide + * PSK for offloading 4-way handshake for WPA/WPA2-PSK networks. For 802.1X + * authentication it is used with %NL80211_CMD_SET_PMK. For offloaded FT + * support this attribute specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME + * is included as well. + * + * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to + * indicate that it supports multiple active scheduled scan requests. + * @NL80211_ATTR_SCHED_SCAN_MAX_REQS: indicates maximum number of scheduled + * scan request that may be active for the device (u32). + * + * @NL80211_ATTR_WANT_1X_4WAY_HS: flag attribute which user-space can include + * in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it + * wants to use the supported offload of the 4-way handshake. + * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. 
+ * @NL80211_ATTR_PORT_AUTHORIZED: (reserved) + * + * @NL80211_ATTR_EXTERNAL_AUTH_ACTION: Identify the requested external + * authentication operation (u32 attribute with an + * &enum nl80211_external_auth_action value). This is used with the + * %NL80211_CMD_EXTERNAL_AUTH request event. + * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user + * space supports external authentication. This attribute shall be used + * with %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP request. The driver + * may offload authentication processing to user space if this capability + * is indicated in the respective requests from the user space. (This flag + * attribute is deprecated for %NL80211_CMD_START_AP, use + * %NL80211_ATTR_AP_SETTINGS_FLAGS) + * + * @NL80211_ATTR_NSS: Station's New/updated RX_NSS value notified using this + * u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED. + * + * @NL80211_ATTR_TXQ_STATS: TXQ statistics (nested attribute, see &enum + * nl80211_txq_stats) + * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy. + * The smaller of this and the memory limit is enforced. + * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory limit (in bytes) for the + * TXQ queues for this phy. The smaller of this and the packet limit is + * enforced. + * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes + * a flow is assigned on each round of the DRR scheduler. + * @NL80211_ATTR_HE_CAPABILITY: HE Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set.
+ * + * @NL80211_ATTR_FTM_RESPONDER: nested attribute which user-space can include + * in %NL80211_CMD_START_AP or %NL80211_CMD_SET_BEACON for fine timing + * measurement (FTM) responder functionality and containing parameters as + * possible, see &enum nl80211_ftm_responder_attr + * + * @NL80211_ATTR_FTM_RESPONDER_STATS: Nested attribute with FTM responder + * statistics, see &enum nl80211_ftm_responder_stats. + * + * @NL80211_ATTR_TIMEOUT: Timeout for the given operation in milliseconds (u32), + * if the attribute is not given no timeout is requested. Note that 0 is an + * invalid value. + * + * @NL80211_ATTR_PEER_MEASUREMENTS: peer measurements request (and result) + * data, uses nested attributes specified in + * &enum nl80211_peer_measurement_attrs. + * This is also used for capability advertisement in the wiphy information, + * with the appropriate sub-attributes. + * + * @NL80211_ATTR_AIRTIME_WEIGHT: Station's weight when scheduled by the airtime + * scheduler. + * + * @NL80211_ATTR_STA_TX_POWER_SETTING: Transmit power setting type (u8) for + * station associated with the AP. See &enum nl80211_tx_power_setting for + * possible values. + * @NL80211_ATTR_STA_TX_POWER: Transmit power level (s16) in dBm units. This + * allows to set Tx power for a station. If this attribute is not included, + * the default per-interface tx power setting will be overriding. Driver + * should be picking up the lowest tx power, either tx power per-interface + * or per-station. + * + * @NL80211_ATTR_SAE_PASSWORD: attribute for passing SAE password material. It + * is used with %NL80211_CMD_CONNECT to provide password for offloading + * SAE authentication for WPA3-Personal networks. + * + * @NL80211_ATTR_TWT_RESPONDER: Enable target wait time responder support. + * + * @NL80211_ATTR_HE_OBSS_PD: nested attribute for OBSS Packet Detection + * functionality. 
+ * + * @NL80211_ATTR_WIPHY_EDMG_CHANNELS: bitmap that indicates the 2.16 GHz + * channel(s) that are allowed to be used for EDMG transmissions. + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251. (u8 attribute) + * @NL80211_ATTR_WIPHY_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes + * the allowed channel bandwidth configurations. (u8 attribute) + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. + * + * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key + * (u16). + * + * @NL80211_ATTR_HE_BSS_COLOR: nested attribute for BSS Color Settings. + * + * @NL80211_ATTR_IFTYPE_AKM_SUITES: nested array attribute, with each entry + * using attributes from &enum nl80211_iftype_akm_attributes. This + * attribute is sent in a response to %NL80211_CMD_GET_WIPHY indicating + * supported AKM suites capability per interface. AKMs advertised in + * %NL80211_ATTR_AKM_SUITES are default capabilities if AKM suites not + * advertised for a specific interface type. + * + * @NL80211_ATTR_TID_CONFIG: TID specific configuration in a + * nested attribute with &enum nl80211_tid_config_attr sub-attributes; + * on output (in wiphy attributes) it contains only the feature sub- + * attributes. + * + * @NL80211_ATTR_CONTROL_PORT_NO_PREAUTH: disable preauth frame rx on control + * port in order to forward/receive them as ordinary data frames. + * + * @NL80211_ATTR_PMK_LIFETIME: Maximum lifetime for PMKSA in seconds (u32, + * dot11RSNAConfigPMKReauthThreshold; 0 is not a valid value). + * An optional parameter configured through %NL80211_CMD_SET_PMKSA. + * Drivers that trigger roaming need to know the lifetime of the + * configured PMKSA for triggering the full vs. PMKSA caching based + * authentication. This timeout helps authentication methods like SAE, + * where PMK gets updated only by going through a full (new SAE) + * authentication instead of getting updated during an association for EAP + * authentication. 
No new full authentication within the PMK expiry shall + * result in a disassociation at the end of the lifetime. + * + * @NL80211_ATTR_PMK_REAUTH_THRESHOLD: Reauthentication threshold time, in + * terms of percentage of %NL80211_ATTR_PMK_LIFETIME + * (u8, dot11RSNAConfigPMKReauthThreshold, 1..100). This is an optional + * parameter configured through %NL80211_CMD_SET_PMKSA. Requests the + * driver to trigger a full authentication roam (without PMKSA caching) + * after the reauthentication threshold time, but before the PMK lifetime + * has expired. + * + * Authentication methods like SAE need to be able to generate a new PMKSA + * entry without having to force a disconnection after the PMK timeout. If + * no roaming occurs between the reauth threshold and PMK expiration, + * disassociation is still forced. + * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the + * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * @NL80211_ATTR_WIPHY_FREQ_OFFSET: offset of the associated + * %NL80211_ATTR_WIPHY_FREQ in positive KHz. Only valid when supplied with + * an %NL80211_ATTR_WIPHY_FREQ. + * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the + * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. + * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies + * + * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from + * association request when used with NL80211_CMD_NEW_STATION). + * + * @NL80211_ATTR_FILS_DISCOVERY: Optional parameter to configure FILS + * discovery. It is a nested attribute, see + * &enum nl80211_fils_discovery_attributes. + * + * @NL80211_ATTR_UNSOL_BCAST_PROBE_RESP: Optional parameter to configure + * unsolicited broadcast probe response. It is a nested attribute, see + * &enum nl80211_unsol_bcast_probe_resp_attributes.
+ * + * @NL80211_ATTR_S1G_CAPABILITY: S1G Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION) + * @NL80211_ATTR_S1G_CAPABILITY_MASK: S1G Capability Information element + * override mask. Used with NL80211_ATTR_S1G_CAPABILITY in + * NL80211_CMD_ASSOCIATE or NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_SAE_PWE: Indicates the mechanism(s) allowed for SAE PWE + * derivation in WPA3-Personal networks which are using SAE authentication. + * This is a u8 attribute that encapsulates one of the values from + * &enum nl80211_sae_pwe_mechanism. + * + * @NL80211_ATTR_SAR_SPEC: SAR power limitation specification when + * used with %NL80211_CMD_SET_SAR_SPECS. The message contains fields + * of %nl80211_sar_attrs which specifies the sar type and related + * sar specs. Sar specs contains array of %nl80211_sar_specs_attrs. + * + * @NL80211_ATTR_RECONNECT_REQUESTED: flag attribute, used with deauth and + * disassoc events to indicate that an immediate reconnect to the AP + * is desired. + * + * @NL80211_ATTR_OBSS_COLOR_BITMAP: bitmap of the u64 BSS colors for the + * %NL80211_CMD_OBSS_COLOR_COLLISION event. + * + * @NL80211_ATTR_COLOR_CHANGE_COUNT: u8 attribute specifying the number of TBTT's + * until the color switch event. + * @NL80211_ATTR_COLOR_CHANGE_COLOR: u8 attribute specifying the color that we are + * switching to + * @NL80211_ATTR_COLOR_CHANGE_ELEMS: Nested set of attributes containing the IE + * information for the time while performing a color switch. + * + * @NL80211_ATTR_MBSSID_CONFIG: Nested attribute for multiple BSSID + * advertisements (MBSSID) parameters in AP mode. + * Kernel uses this attribute to indicate the driver's support for MBSSID + * and enhanced multi-BSSID advertisements (EMA AP) to the userspace. + * Userspace should use this attribute to configure per interface MBSSID + * parameters. + * See &enum nl80211_mbssid_config_attributes for details. 
+ * + * @NL80211_ATTR_MBSSID_ELEMS: Nested parameter to pass multiple BSSID elements. + * Mandatory parameter for the transmitting interface to enable MBSSID. + * Optional for the non-transmitting interfaces. + * + * @NL80211_ATTR_RADAR_BACKGROUND: Configure dedicated offchannel chain + * available for radar/CAC detection on some hw. This chain can't be used + * to transmit or receive frames and it is bounded to a running wdev. + * Background radar/CAC detection allows to avoid the CAC downtime + * switching on a different channel during CAC detection on the selected + * radar channel. + * + * @NL80211_ATTR_AP_SETTINGS_FLAGS: u32 attribute contains ap settings flags, + * enumerated in &enum nl80211_ap_settings_flags. This attribute shall be + * used with %NL80211_CMD_START_AP request. + * + * @NL80211_ATTR_EHT_CAPABILITY: EHT Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set. + * + * @NL80211_ATTR_MLO_LINK_ID: A (u8) link ID for use with MLO, to be used with + * various commands that need a link ID to operate. + * @NL80211_ATTR_MLO_LINKS: A nested array of links, each containing some + * per-link information and a link ID. + * @NL80211_ATTR_MLD_ADDR: An MLD address, used with various commands such as + * authenticate/associate. + * + * @NL80211_ATTR_MLO_SUPPORT: Flag attribute to indicate user space supports MLO + * connection. Used with %NL80211_CMD_CONNECT. If this attribute is not + * included in NL80211_CMD_CONNECT drivers must not perform MLO connection. + * + * @NL80211_ATTR_MAX_NUM_AKM_SUITES: U16 attribute. Indicates maximum number of + * AKM suites allowed for %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_START_AP in %NL80211_CMD_GET_WIPHY response. 
If this + * attribute is not present userspace shall consider maximum number of AKM + * suites allowed as %NL80211_MAX_NR_AKM_SUITES which is the legacy maximum + * number prior to the introduction of this attribute. + * + * @NL80211_ATTR_EML_CAPABILITY: EML Capability information (u16) + * @NL80211_ATTR_MLD_CAPA_AND_OPS: MLD Capabilities and Operations (u16) + * + * @NL80211_ATTR_TX_HW_TIMESTAMP: Hardware timestamp for TX operation in + * nanoseconds (u64). This is the device clock timestamp so it will + * probably reset when the device is stopped or the firmware is reset. + * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the frame TX + * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates + * the ack TX timestamp. + * @NL80211_ATTR_RX_HW_TIMESTAMP: Hardware timestamp for RX operation in + * nanoseconds (u64). This is the device clock timestamp so it will + * probably reset when the device is stopped or the firmware is reset. + * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX + * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates + * the incoming frame RX timestamp. + * @NL80211_ATTR_TD_BITMAP: Transition Disable bitmap, for subsequent + * (re)associations. + * @NUM_NL80211_ATTR: total number of nl80211_attrs available + * @NL80211_ATTR_MAX: highest attribute number currently defined + * @__NL80211_ATTR_AFTER_LAST: internal use + */ +enum nl80211_attrs { +/* don't change the order or add anything between, this is ABI! 
*/ + NL80211_ATTR_UNSPEC, + + NL80211_ATTR_WIPHY, + NL80211_ATTR_WIPHY_NAME, + + NL80211_ATTR_IFINDEX, + NL80211_ATTR_IFNAME, + NL80211_ATTR_IFTYPE, + + NL80211_ATTR_MAC, + + NL80211_ATTR_KEY_DATA, + NL80211_ATTR_KEY_IDX, + NL80211_ATTR_KEY_CIPHER, + NL80211_ATTR_KEY_SEQ, + NL80211_ATTR_KEY_DEFAULT, + + NL80211_ATTR_BEACON_INTERVAL, + NL80211_ATTR_DTIM_PERIOD, + NL80211_ATTR_BEACON_HEAD, + NL80211_ATTR_BEACON_TAIL, + + NL80211_ATTR_STA_AID, + NL80211_ATTR_STA_FLAGS, + NL80211_ATTR_STA_LISTEN_INTERVAL, + NL80211_ATTR_STA_SUPPORTED_RATES, + NL80211_ATTR_STA_VLAN, + NL80211_ATTR_STA_INFO, + + NL80211_ATTR_WIPHY_BANDS, + + NL80211_ATTR_MNTR_FLAGS, + + NL80211_ATTR_MESH_ID, + NL80211_ATTR_STA_PLINK_ACTION, + NL80211_ATTR_MPATH_NEXT_HOP, + NL80211_ATTR_MPATH_INFO, + + NL80211_ATTR_BSS_CTS_PROT, + NL80211_ATTR_BSS_SHORT_PREAMBLE, + NL80211_ATTR_BSS_SHORT_SLOT_TIME, + + NL80211_ATTR_HT_CAPABILITY, + + NL80211_ATTR_SUPPORTED_IFTYPES, + + NL80211_ATTR_REG_ALPHA2, + NL80211_ATTR_REG_RULES, + + NL80211_ATTR_MESH_CONFIG, + + NL80211_ATTR_BSS_BASIC_RATES, + + NL80211_ATTR_WIPHY_TXQ_PARAMS, + NL80211_ATTR_WIPHY_FREQ, + NL80211_ATTR_WIPHY_CHANNEL_TYPE, + + NL80211_ATTR_KEY_DEFAULT_MGMT, + + NL80211_ATTR_MGMT_SUBTYPE, + NL80211_ATTR_IE, + + NL80211_ATTR_MAX_NUM_SCAN_SSIDS, + + NL80211_ATTR_SCAN_FREQUENCIES, + NL80211_ATTR_SCAN_SSIDS, + NL80211_ATTR_GENERATION, /* replaces old SCAN_GENERATION */ + NL80211_ATTR_BSS, + + NL80211_ATTR_REG_INITIATOR, + NL80211_ATTR_REG_TYPE, + + NL80211_ATTR_SUPPORTED_COMMANDS, + + NL80211_ATTR_FRAME, + NL80211_ATTR_SSID, + NL80211_ATTR_AUTH_TYPE, + NL80211_ATTR_REASON_CODE, + + NL80211_ATTR_KEY_TYPE, + + NL80211_ATTR_MAX_SCAN_IE_LEN, + NL80211_ATTR_CIPHER_SUITES, + + NL80211_ATTR_FREQ_BEFORE, + NL80211_ATTR_FREQ_AFTER, + + NL80211_ATTR_FREQ_FIXED, + + + NL80211_ATTR_WIPHY_RETRY_SHORT, + NL80211_ATTR_WIPHY_RETRY_LONG, + NL80211_ATTR_WIPHY_FRAG_THRESHOLD, + NL80211_ATTR_WIPHY_RTS_THRESHOLD, + + NL80211_ATTR_TIMED_OUT, + + NL80211_ATTR_USE_MFP, + + 
NL80211_ATTR_STA_FLAGS2, + + NL80211_ATTR_CONTROL_PORT, + + NL80211_ATTR_TESTDATA, + + NL80211_ATTR_PRIVACY, + + NL80211_ATTR_DISCONNECTED_BY_AP, + NL80211_ATTR_STATUS_CODE, + + NL80211_ATTR_CIPHER_SUITES_PAIRWISE, + NL80211_ATTR_CIPHER_SUITE_GROUP, + NL80211_ATTR_WPA_VERSIONS, + NL80211_ATTR_AKM_SUITES, + + NL80211_ATTR_REQ_IE, + NL80211_ATTR_RESP_IE, + + NL80211_ATTR_PREV_BSSID, + + NL80211_ATTR_KEY, + NL80211_ATTR_KEYS, + + NL80211_ATTR_PID, + + NL80211_ATTR_4ADDR, + + NL80211_ATTR_SURVEY_INFO, + + NL80211_ATTR_PMKID, + NL80211_ATTR_MAX_NUM_PMKIDS, + + NL80211_ATTR_DURATION, + + NL80211_ATTR_COOKIE, + + NL80211_ATTR_WIPHY_COVERAGE_CLASS, + + NL80211_ATTR_TX_RATES, + + NL80211_ATTR_FRAME_MATCH, + + NL80211_ATTR_ACK, + + NL80211_ATTR_PS_STATE, + + NL80211_ATTR_CQM, + + NL80211_ATTR_LOCAL_STATE_CHANGE, + + NL80211_ATTR_AP_ISOLATE, + + NL80211_ATTR_WIPHY_TX_POWER_SETTING, + NL80211_ATTR_WIPHY_TX_POWER_LEVEL, + + NL80211_ATTR_TX_FRAME_TYPES, + NL80211_ATTR_RX_FRAME_TYPES, + NL80211_ATTR_FRAME_TYPE, + + NL80211_ATTR_CONTROL_PORT_ETHERTYPE, + NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, + + NL80211_ATTR_SUPPORT_IBSS_RSN, + + NL80211_ATTR_WIPHY_ANTENNA_TX, + NL80211_ATTR_WIPHY_ANTENNA_RX, + + NL80211_ATTR_MCAST_RATE, + + NL80211_ATTR_OFFCHANNEL_TX_OK, + + NL80211_ATTR_BSS_HT_OPMODE, + + NL80211_ATTR_KEY_DEFAULT_TYPES, + + NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION, + + NL80211_ATTR_MESH_SETUP, + + NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX, + NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX, + + NL80211_ATTR_SUPPORT_MESH_AUTH, + NL80211_ATTR_STA_PLINK_STATE, + + NL80211_ATTR_WOWLAN_TRIGGERS, + NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED, + + NL80211_ATTR_SCHED_SCAN_INTERVAL, + + NL80211_ATTR_INTERFACE_COMBINATIONS, + NL80211_ATTR_SOFTWARE_IFTYPES, + + NL80211_ATTR_REKEY_DATA, + + NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS, + NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN, + + NL80211_ATTR_SCAN_SUPP_RATES, + + NL80211_ATTR_HIDDEN_SSID, + + NL80211_ATTR_IE_PROBE_RESP, + NL80211_ATTR_IE_ASSOC_RESP, + + 
NL80211_ATTR_STA_WME, + NL80211_ATTR_SUPPORT_AP_UAPSD, + + NL80211_ATTR_ROAM_SUPPORT, + + NL80211_ATTR_SCHED_SCAN_MATCH, + NL80211_ATTR_MAX_MATCH_SETS, + + NL80211_ATTR_PMKSA_CANDIDATE, + + NL80211_ATTR_TX_NO_CCK_RATE, + + NL80211_ATTR_TDLS_ACTION, + NL80211_ATTR_TDLS_DIALOG_TOKEN, + NL80211_ATTR_TDLS_OPERATION, + NL80211_ATTR_TDLS_SUPPORT, + NL80211_ATTR_TDLS_EXTERNAL_SETUP, + + NL80211_ATTR_DEVICE_AP_SME, + + NL80211_ATTR_DONT_WAIT_FOR_ACK, + + NL80211_ATTR_FEATURE_FLAGS, + + NL80211_ATTR_PROBE_RESP_OFFLOAD, + + NL80211_ATTR_PROBE_RESP, + + NL80211_ATTR_DFS_REGION, + + NL80211_ATTR_DISABLE_HT, + NL80211_ATTR_HT_CAPABILITY_MASK, + + NL80211_ATTR_NOACK_MAP, + + NL80211_ATTR_INACTIVITY_TIMEOUT, + + NL80211_ATTR_RX_SIGNAL_DBM, + + NL80211_ATTR_BG_SCAN_PERIOD, + + NL80211_ATTR_WDEV, + + NL80211_ATTR_USER_REG_HINT_TYPE, + + NL80211_ATTR_CONN_FAILED_REASON, + + NL80211_ATTR_AUTH_DATA, + + NL80211_ATTR_VHT_CAPABILITY, + + NL80211_ATTR_SCAN_FLAGS, + + NL80211_ATTR_CHANNEL_WIDTH, + NL80211_ATTR_CENTER_FREQ1, + NL80211_ATTR_CENTER_FREQ2, + + NL80211_ATTR_P2P_CTWINDOW, + NL80211_ATTR_P2P_OPPPS, + + NL80211_ATTR_LOCAL_MESH_POWER_MODE, + + NL80211_ATTR_ACL_POLICY, + + NL80211_ATTR_MAC_ADDRS, + + NL80211_ATTR_MAC_ACL_MAX, + + NL80211_ATTR_RADAR_EVENT, + + NL80211_ATTR_EXT_CAPA, + NL80211_ATTR_EXT_CAPA_MASK, + + NL80211_ATTR_STA_CAPABILITY, + NL80211_ATTR_STA_EXT_CAPABILITY, + + NL80211_ATTR_PROTOCOL_FEATURES, + NL80211_ATTR_SPLIT_WIPHY_DUMP, + + NL80211_ATTR_DISABLE_VHT, + NL80211_ATTR_VHT_CAPABILITY_MASK, + + NL80211_ATTR_MDID, + NL80211_ATTR_IE_RIC, + + NL80211_ATTR_CRIT_PROT_ID, + NL80211_ATTR_MAX_CRIT_PROT_DURATION, + + NL80211_ATTR_PEER_AID, + + NL80211_ATTR_COALESCE_RULE, + + NL80211_ATTR_CH_SWITCH_COUNT, + NL80211_ATTR_CH_SWITCH_BLOCK_TX, + NL80211_ATTR_CSA_IES, + NL80211_ATTR_CNTDWN_OFFS_BEACON, + NL80211_ATTR_CNTDWN_OFFS_PRESP, + + NL80211_ATTR_RXMGMT_FLAGS, + + NL80211_ATTR_STA_SUPPORTED_CHANNELS, + + NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES, + + 
NL80211_ATTR_HANDLE_DFS, + + NL80211_ATTR_SUPPORT_5_MHZ, + NL80211_ATTR_SUPPORT_10_MHZ, + + NL80211_ATTR_OPMODE_NOTIF, + + NL80211_ATTR_VENDOR_ID, + NL80211_ATTR_VENDOR_SUBCMD, + NL80211_ATTR_VENDOR_DATA, + NL80211_ATTR_VENDOR_EVENTS, + + NL80211_ATTR_QOS_MAP, + + NL80211_ATTR_MAC_HINT, + NL80211_ATTR_WIPHY_FREQ_HINT, + + NL80211_ATTR_MAX_AP_ASSOC_STA, + + NL80211_ATTR_TDLS_PEER_CAPABILITY, + + NL80211_ATTR_SOCKET_OWNER, + + NL80211_ATTR_CSA_C_OFFSETS_TX, + NL80211_ATTR_MAX_CSA_COUNTERS, + + NL80211_ATTR_TDLS_INITIATOR, + + NL80211_ATTR_USE_RRM, + + NL80211_ATTR_WIPHY_DYN_ACK, + + NL80211_ATTR_TSID, + NL80211_ATTR_USER_PRIO, + NL80211_ATTR_ADMITTED_TIME, + + NL80211_ATTR_SMPS_MODE, + + NL80211_ATTR_OPER_CLASS, + + NL80211_ATTR_MAC_MASK, + + NL80211_ATTR_WIPHY_SELF_MANAGED_REG, + + NL80211_ATTR_EXT_FEATURES, + + NL80211_ATTR_SURVEY_RADIO_STATS, + + NL80211_ATTR_NETNS_FD, + + NL80211_ATTR_SCHED_SCAN_DELAY, + + NL80211_ATTR_REG_INDOOR, + + NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS, + NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL, + NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS, + NL80211_ATTR_SCHED_SCAN_PLANS, + + NL80211_ATTR_PBSS, + + NL80211_ATTR_BSS_SELECT, + + NL80211_ATTR_STA_SUPPORT_P2P_PS, + + NL80211_ATTR_PAD, + + NL80211_ATTR_IFTYPE_EXT_CAPA, + + NL80211_ATTR_MU_MIMO_GROUP_DATA, + NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR, + + NL80211_ATTR_SCAN_START_TIME_TSF, + NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, + NL80211_ATTR_MEASUREMENT_DURATION, + NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY, + + NL80211_ATTR_MESH_PEER_AID, + + NL80211_ATTR_NAN_MASTER_PREF, + NL80211_ATTR_BANDS, + NL80211_ATTR_NAN_FUNC, + NL80211_ATTR_NAN_MATCH, + + NL80211_ATTR_FILS_KEK, + NL80211_ATTR_FILS_NONCES, + + NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED, + + NL80211_ATTR_BSSID, + + NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI, + NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST, + + NL80211_ATTR_TIMEOUT_REASON, + + NL80211_ATTR_FILS_ERP_USERNAME, + NL80211_ATTR_FILS_ERP_REALM, + NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + 
NL80211_ATTR_FILS_ERP_RRK, + NL80211_ATTR_FILS_CACHE_ID, + + NL80211_ATTR_PMK, + + NL80211_ATTR_SCHED_SCAN_MULTI, + NL80211_ATTR_SCHED_SCAN_MAX_REQS, + + NL80211_ATTR_WANT_1X_4WAY_HS, + NL80211_ATTR_PMKR0_NAME, + NL80211_ATTR_PORT_AUTHORIZED, + + NL80211_ATTR_EXTERNAL_AUTH_ACTION, + NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, + + NL80211_ATTR_NSS, + NL80211_ATTR_ACK_SIGNAL, + + NL80211_ATTR_CONTROL_PORT_OVER_NL80211, + + NL80211_ATTR_TXQ_STATS, + NL80211_ATTR_TXQ_LIMIT, + NL80211_ATTR_TXQ_MEMORY_LIMIT, + NL80211_ATTR_TXQ_QUANTUM, + + NL80211_ATTR_HE_CAPABILITY, + + NL80211_ATTR_FTM_RESPONDER, + + NL80211_ATTR_FTM_RESPONDER_STATS, + + NL80211_ATTR_TIMEOUT, + + NL80211_ATTR_PEER_MEASUREMENTS, + + NL80211_ATTR_AIRTIME_WEIGHT, + NL80211_ATTR_STA_TX_POWER_SETTING, + NL80211_ATTR_STA_TX_POWER, + + NL80211_ATTR_SAE_PASSWORD, + + NL80211_ATTR_TWT_RESPONDER, + + NL80211_ATTR_HE_OBSS_PD, + + NL80211_ATTR_WIPHY_EDMG_CHANNELS, + NL80211_ATTR_WIPHY_EDMG_BW_CONFIG, + + NL80211_ATTR_VLAN_ID, + + NL80211_ATTR_HE_BSS_COLOR, + + NL80211_ATTR_IFTYPE_AKM_SUITES, + + NL80211_ATTR_TID_CONFIG, + + NL80211_ATTR_CONTROL_PORT_NO_PREAUTH, + + NL80211_ATTR_PMK_LIFETIME, + NL80211_ATTR_PMK_REAUTH_THRESHOLD, + + NL80211_ATTR_RECEIVE_MULTICAST, + NL80211_ATTR_WIPHY_FREQ_OFFSET, + NL80211_ATTR_CENTER_FREQ1_OFFSET, + NL80211_ATTR_SCAN_FREQ_KHZ, + + NL80211_ATTR_HE_6GHZ_CAPABILITY, + + NL80211_ATTR_FILS_DISCOVERY, + + NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + + NL80211_ATTR_S1G_CAPABILITY, + NL80211_ATTR_S1G_CAPABILITY_MASK, + + NL80211_ATTR_SAE_PWE, + + NL80211_ATTR_RECONNECT_REQUESTED, + + NL80211_ATTR_SAR_SPEC, + + NL80211_ATTR_DISABLE_HE, + + NL80211_ATTR_OBSS_COLOR_BITMAP, + + NL80211_ATTR_COLOR_CHANGE_COUNT, + NL80211_ATTR_COLOR_CHANGE_COLOR, + NL80211_ATTR_COLOR_CHANGE_ELEMS, + + NL80211_ATTR_MBSSID_CONFIG, + NL80211_ATTR_MBSSID_ELEMS, + + NL80211_ATTR_RADAR_BACKGROUND, + + NL80211_ATTR_AP_SETTINGS_FLAGS, + + NL80211_ATTR_EHT_CAPABILITY, + + NL80211_ATTR_DISABLE_EHT, + + NL80211_ATTR_MLO_LINKS, + 
NL80211_ATTR_MLO_LINK_ID, + NL80211_ATTR_MLD_ADDR, + + NL80211_ATTR_MLO_SUPPORT, + + NL80211_ATTR_MAX_NUM_AKM_SUITES, + + NL80211_ATTR_EML_CAPABILITY, + NL80211_ATTR_MLD_CAPA_AND_OPS, + + NL80211_ATTR_TX_HW_TIMESTAMP, + NL80211_ATTR_RX_HW_TIMESTAMP, + NL80211_ATTR_TD_BITMAP, + + /* add attributes here, update the policy in nl80211.c */ + + __NL80211_ATTR_AFTER_LAST, + NUM_NL80211_ATTR = __NL80211_ATTR_AFTER_LAST, + NL80211_ATTR_MAX = __NL80211_ATTR_AFTER_LAST - 1 +}; + +/* source-level API compatibility */ +#define NL80211_ATTR_SCAN_GENERATION NL80211_ATTR_GENERATION +#define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG +#define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER +#define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA +#define NL80211_ATTR_CSA_C_OFF_BEACON NL80211_ATTR_CNTDWN_OFFS_BEACON +#define NL80211_ATTR_CSA_C_OFF_PRESP NL80211_ATTR_CNTDWN_OFFS_PRESP + +/* + * Allow user space programs to use #ifdef on new attributes by defining them + * here + */ +#define NL80211_CMD_CONNECT NL80211_CMD_CONNECT +#define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY +#define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES +#define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS +#define NL80211_ATTR_WIPHY_FREQ NL80211_ATTR_WIPHY_FREQ +#define NL80211_ATTR_WIPHY_CHANNEL_TYPE NL80211_ATTR_WIPHY_CHANNEL_TYPE +#define NL80211_ATTR_MGMT_SUBTYPE NL80211_ATTR_MGMT_SUBTYPE +#define NL80211_ATTR_IE NL80211_ATTR_IE +#define NL80211_ATTR_REG_INITIATOR NL80211_ATTR_REG_INITIATOR +#define NL80211_ATTR_REG_TYPE NL80211_ATTR_REG_TYPE +#define NL80211_ATTR_FRAME NL80211_ATTR_FRAME +#define NL80211_ATTR_SSID NL80211_ATTR_SSID +#define NL80211_ATTR_AUTH_TYPE NL80211_ATTR_AUTH_TYPE +#define NL80211_ATTR_REASON_CODE NL80211_ATTR_REASON_CODE +#define NL80211_ATTR_CIPHER_SUITES_PAIRWISE NL80211_ATTR_CIPHER_SUITES_PAIRWISE +#define NL80211_ATTR_CIPHER_SUITE_GROUP NL80211_ATTR_CIPHER_SUITE_GROUP +#define NL80211_ATTR_WPA_VERSIONS 
NL80211_ATTR_WPA_VERSIONS +#define NL80211_ATTR_AKM_SUITES NL80211_ATTR_AKM_SUITES +#define NL80211_ATTR_KEY NL80211_ATTR_KEY +#define NL80211_ATTR_KEYS NL80211_ATTR_KEYS +#define NL80211_ATTR_FEATURE_FLAGS NL80211_ATTR_FEATURE_FLAGS + +#define NL80211_WIPHY_NAME_MAXLEN 64 + +#define NL80211_MAX_SUPP_RATES 32 +#define NL80211_MAX_SUPP_HT_RATES 77 +#define NL80211_MAX_SUPP_REG_RULES 128 +#define NL80211_TKIP_DATA_OFFSET_ENCR_KEY 0 +#define NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY 16 +#define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY 24 +#define NL80211_HT_CAPABILITY_LEN 26 +#define NL80211_VHT_CAPABILITY_LEN 12 +#define NL80211_HE_MIN_CAPABILITY_LEN 16 +#define NL80211_HE_MAX_CAPABILITY_LEN 54 +#define NL80211_MAX_NR_CIPHER_SUITES 5 + +/* + * NL80211_MAX_NR_AKM_SUITES is obsolete when %NL80211_ATTR_MAX_NUM_AKM_SUITES + * present in %NL80211_CMD_GET_WIPHY response. + */ +#define NL80211_MAX_NR_AKM_SUITES 2 +#define NL80211_EHT_MIN_CAPABILITY_LEN 13 +#define NL80211_EHT_MAX_CAPABILITY_LEN 51 + +#define NL80211_MIN_REMAIN_ON_CHANNEL_TIME 10 + +/* default RSSI threshold for scan results if none specified. */ +#define NL80211_SCAN_RSSI_THOLD_OFF -300 + +#define NL80211_CQM_TXE_MAX_INTVL 1800 + +/** + * enum nl80211_iftype - (virtual) interface types + * + * @NL80211_IFTYPE_UNSPECIFIED: unspecified type, driver decides + * @NL80211_IFTYPE_ADHOC: independent BSS member + * @NL80211_IFTYPE_STATION: managed BSS member + * @NL80211_IFTYPE_AP: access point + * @NL80211_IFTYPE_AP_VLAN: VLAN interface for access points; VLAN interfaces + * are a bit special in that they must always be tied to a pre-existing + * AP type interface. 
+ * @NL80211_IFTYPE_WDS: wireless distribution interface + * @NL80211_IFTYPE_MONITOR: monitor interface receiving all frames + * @NL80211_IFTYPE_MESH_POINT: mesh point + * @NL80211_IFTYPE_P2P_CLIENT: P2P client + * @NL80211_IFTYPE_P2P_GO: P2P group owner + * @NL80211_IFTYPE_P2P_DEVICE: P2P device interface type, this is not a netdev + * and therefore can't be created in the normal ways, use the + * %NL80211_CMD_START_P2P_DEVICE and %NL80211_CMD_STOP_P2P_DEVICE + * commands to create and destroy one + * @NL80211_IFTYPE_OCB: Outside Context of a BSS + * This mode corresponds to the MIB variable dot11OCBActivated=true + * @NL80211_IFTYPE_NAN: NAN device interface type (not a netdev) + * @NL80211_IFTYPE_MAX: highest interface type number currently defined + * @NUM_NL80211_IFTYPES: number of defined interface types + * + * These values are used with the %NL80211_ATTR_IFTYPE + * to set the type of an interface. + * + */ +enum nl80211_iftype { + NL80211_IFTYPE_UNSPECIFIED, + NL80211_IFTYPE_ADHOC, + NL80211_IFTYPE_STATION, + NL80211_IFTYPE_AP, + NL80211_IFTYPE_AP_VLAN, + NL80211_IFTYPE_WDS, + NL80211_IFTYPE_MONITOR, + NL80211_IFTYPE_MESH_POINT, + NL80211_IFTYPE_P2P_CLIENT, + NL80211_IFTYPE_P2P_GO, + NL80211_IFTYPE_P2P_DEVICE, + NL80211_IFTYPE_OCB, + NL80211_IFTYPE_NAN, + + /* keep last */ + NUM_NL80211_IFTYPES, + NL80211_IFTYPE_MAX = NUM_NL80211_IFTYPES - 1 +}; + +/** + * enum nl80211_sta_flags - station flags + * + * Station flags. When a station is added to an AP interface, it is + * assumed to be already associated (and hence authenticated.) 
+ * + * @__NL80211_STA_FLAG_INVALID: attribute number 0 is reserved + * @NL80211_STA_FLAG_AUTHORIZED: station is authorized (802.1X) + * @NL80211_STA_FLAG_SHORT_PREAMBLE: station is capable of receiving frames + * with short barker preamble + * @NL80211_STA_FLAG_WME: station is WME/QoS capable + * @NL80211_STA_FLAG_MFP: station uses management frame protection + * @NL80211_STA_FLAG_AUTHENTICATED: station is authenticated + * @NL80211_STA_FLAG_TDLS_PEER: station is a TDLS peer -- this flag should + * only be used in managed mode (even in the flags mask). Note that the + * flag can't be changed, it is only valid while adding a station, and + * attempts to change it will silently be ignored (rather than rejected + * as errors.) + * @NL80211_STA_FLAG_ASSOCIATED: station is associated; used with drivers + * that support %NL80211_FEATURE_FULL_AP_CLIENT_STATE to transition a + * previously added station into associated state + * @NL80211_STA_FLAG_MAX: highest station flag number currently defined + * @__NL80211_STA_FLAG_AFTER_LAST: internal use + */ +enum nl80211_sta_flags { + __NL80211_STA_FLAG_INVALID, + NL80211_STA_FLAG_AUTHORIZED, + NL80211_STA_FLAG_SHORT_PREAMBLE, + NL80211_STA_FLAG_WME, + NL80211_STA_FLAG_MFP, + NL80211_STA_FLAG_AUTHENTICATED, + NL80211_STA_FLAG_TDLS_PEER, + NL80211_STA_FLAG_ASSOCIATED, + + /* keep last */ + __NL80211_STA_FLAG_AFTER_LAST, + NL80211_STA_FLAG_MAX = __NL80211_STA_FLAG_AFTER_LAST - 1 +}; + +/** + * enum nl80211_sta_p2p_ps_status - station support of P2P PS + * + * @NL80211_P2P_PS_UNSUPPORTED: station doesn't support P2P PS mechanism + * @NL80211_P2P_PS_SUPPORTED: station supports P2P PS mechanism + * @NUM_NL80211_P2P_PS_STATUS: number of values + */ +enum nl80211_sta_p2p_ps_status { + NL80211_P2P_PS_UNSUPPORTED = 0, + NL80211_P2P_PS_SUPPORTED, + + NUM_NL80211_P2P_PS_STATUS, +}; + +#define NL80211_STA_FLAG_MAX_OLD_API NL80211_STA_FLAG_TDLS_PEER + +/** + * struct nl80211_sta_flag_update - station flags mask/set + * @mask: mask of station
flags to set + * @set: which values to set them to + * + * Both mask and set contain bits as per &enum nl80211_sta_flags. + */ +struct nl80211_sta_flag_update { + __u32 mask; + __u32 set; +} __attribute__((packed)); + +/** + * enum nl80211_he_gi - HE guard interval + * @NL80211_RATE_INFO_HE_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_HE_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_HE_GI_3_2: 3.2 usec + */ +enum nl80211_he_gi { + NL80211_RATE_INFO_HE_GI_0_8, + NL80211_RATE_INFO_HE_GI_1_6, + NL80211_RATE_INFO_HE_GI_3_2, +}; + +/** + * enum nl80211_he_ltf - HE long training field + * @NL80211_RATE_INFO_HE_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_HE_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_HE_4XLTF: 12.8 usec + */ +enum nl80211_he_ltf { + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_2XLTF, + NL80211_RATE_INFO_HE_4XLTF, +}; + +/** + * enum nl80211_he_ru_alloc - HE RU allocation values + * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_484: 484-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_2x996: 2x996-tone RU allocation + */ +enum nl80211_he_ru_alloc { + NL80211_RATE_INFO_HE_RU_ALLOC_26, + NL80211_RATE_INFO_HE_RU_ALLOC_52, + NL80211_RATE_INFO_HE_RU_ALLOC_106, + NL80211_RATE_INFO_HE_RU_ALLOC_242, + NL80211_RATE_INFO_HE_RU_ALLOC_484, + NL80211_RATE_INFO_HE_RU_ALLOC_996, + NL80211_RATE_INFO_HE_RU_ALLOC_2x996, +}; + +/** + * enum nl80211_eht_gi - EHT guard interval + * @NL80211_RATE_INFO_EHT_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_EHT_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_EHT_GI_3_2: 3.2 usec + */ +enum nl80211_eht_gi { + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_1_6, + NL80211_RATE_INFO_EHT_GI_3_2, +}; + +/** + * enum nl80211_eht_ru_alloc - EHT RU allocation values
+ * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52P26: 52+26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106P26: 106+26 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484: 484-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484P242: 484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484: 996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242: 996+484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996: 2x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484: 2x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996: 3x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484: 3x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_4x996: 4x996-tone RU allocation + */ +enum nl80211_eht_ru_alloc { + NL80211_RATE_INFO_EHT_RU_ALLOC_26, + NL80211_RATE_INFO_EHT_RU_ALLOC_52, + NL80211_RATE_INFO_EHT_RU_ALLOC_52P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_106, + NL80211_RATE_INFO_EHT_RU_ALLOC_106P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_242, + NL80211_RATE_INFO_EHT_RU_ALLOC_484, + NL80211_RATE_INFO_EHT_RU_ALLOC_484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_996, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_4x996, +}; + +/** + * enum nl80211_rate_info - bitrate information + * + * These attribute types are used with %NL80211_STA_INFO_TXRATE + * when getting information about the bitrate of a station. 
+ * There are 2 attributes for bitrate, a legacy one that represents + * a 16-bit value, and a new one that represents a 32-bit value. + * If the rate value fits into 16 bit, both attributes are reported + * with the same value. If the rate is too high to fit into 16 bits + * (>6.5535Gbps) only the 32-bit attribute is included. + * User space tools are encouraged to use the 32-bit attribute and fall + * back to the 16-bit one for compatibility with older kernels. + * + * @__NL80211_RATE_INFO_INVALID: attribute number 0 is reserved + * @NL80211_RATE_INFO_BITRATE: total bitrate (u16, 100kbit/s) + * @NL80211_RATE_INFO_MCS: mcs index for 802.11n (u8) + * @NL80211_RATE_INFO_40_MHZ_WIDTH: 40 MHz dualchannel bitrate + * @NL80211_RATE_INFO_SHORT_GI: 400ns guard interval + * @NL80211_RATE_INFO_BITRATE32: total bitrate (u32, 100kbit/s) + * @NL80211_RATE_INFO_MAX: highest rate_info number currently defined + * @NL80211_RATE_INFO_VHT_MCS: MCS index for VHT (u8) + * @NL80211_RATE_INFO_VHT_NSS: number of streams in VHT (u8) + * @NL80211_RATE_INFO_80_MHZ_WIDTH: 80 MHz VHT rate + * @NL80211_RATE_INFO_80P80_MHZ_WIDTH: unused - 80+80 is treated the + * same as 160 for purposes of the bitrates + * @NL80211_RATE_INFO_160_MHZ_WIDTH: 160 MHz VHT rate + * @NL80211_RATE_INFO_10_MHZ_WIDTH: 10 MHz width - note that this is + * a legacy rate and will be reported as the actual bitrate, i.e. + * half the base (20 MHz) rate + * @NL80211_RATE_INFO_5_MHZ_WIDTH: 5 MHz width - note that this is + * a legacy rate and will be reported as the actual bitrate, i.e.
+ * a quarter of the base (20 MHz) rate + * @NL80211_RATE_INFO_HE_MCS: HE MCS index (u8, 0-11) + * @NL80211_RATE_INFO_HE_NSS: HE NSS value (u8, 1-8) + * @NL80211_RATE_INFO_HE_GI: HE guard interval identifier + * (u8, see &enum nl80211_he_gi) + * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1) + * @NL80211_RATE_INFO_HE_RU_ALLOC: HE RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc) + * @NL80211_RATE_INFO_320_MHZ_WIDTH: 320 MHz bitrate + * @NL80211_RATE_INFO_EHT_MCS: EHT MCS index (u8, 0-15) + * @NL80211_RATE_INFO_EHT_NSS: EHT NSS value (u8, 1-8) + * @NL80211_RATE_INFO_EHT_GI: EHT guard interval identifier + * (u8, see &enum nl80211_eht_gi) + * @NL80211_RATE_INFO_EHT_RU_ALLOC: EHT RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_eht_ru_alloc) + * @__NL80211_RATE_INFO_AFTER_LAST: internal use + */ +enum nl80211_rate_info { + __NL80211_RATE_INFO_INVALID, + NL80211_RATE_INFO_BITRATE, + NL80211_RATE_INFO_MCS, + NL80211_RATE_INFO_40_MHZ_WIDTH, + NL80211_RATE_INFO_SHORT_GI, + NL80211_RATE_INFO_BITRATE32, + NL80211_RATE_INFO_VHT_MCS, + NL80211_RATE_INFO_VHT_NSS, + NL80211_RATE_INFO_80_MHZ_WIDTH, + NL80211_RATE_INFO_80P80_MHZ_WIDTH, + NL80211_RATE_INFO_160_MHZ_WIDTH, + NL80211_RATE_INFO_10_MHZ_WIDTH, + NL80211_RATE_INFO_5_MHZ_WIDTH, + NL80211_RATE_INFO_HE_MCS, + NL80211_RATE_INFO_HE_NSS, + NL80211_RATE_INFO_HE_GI, + NL80211_RATE_INFO_HE_DCM, + NL80211_RATE_INFO_HE_RU_ALLOC, + NL80211_RATE_INFO_320_MHZ_WIDTH, + NL80211_RATE_INFO_EHT_MCS, + NL80211_RATE_INFO_EHT_NSS, + NL80211_RATE_INFO_EHT_GI, + NL80211_RATE_INFO_EHT_RU_ALLOC, + + /* keep last */ + __NL80211_RATE_INFO_AFTER_LAST, + NL80211_RATE_INFO_MAX = __NL80211_RATE_INFO_AFTER_LAST - 1 +}; + +/** + * enum nl80211_sta_bss_param - BSS information collected by STA + * + * These attribute types are used with %NL80211_STA_INFO_BSS_PARAM + * when getting information about the bitrate of a station.
+ * + * @__NL80211_STA_BSS_PARAM_INVALID: attribute number 0 is reserved + * @NL80211_STA_BSS_PARAM_CTS_PROT: whether CTS protection is enabled (flag) + * @NL80211_STA_BSS_PARAM_SHORT_PREAMBLE: whether short preamble is enabled + * (flag) + * @NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME: whether short slot time is enabled + * (flag) + * @NL80211_STA_BSS_PARAM_DTIM_PERIOD: DTIM period for beaconing (u8) + * @NL80211_STA_BSS_PARAM_BEACON_INTERVAL: Beacon interval (u16) + * @NL80211_STA_BSS_PARAM_MAX: highest sta_bss_param number currently defined + * @__NL80211_STA_BSS_PARAM_AFTER_LAST: internal use + */ +enum nl80211_sta_bss_param { + __NL80211_STA_BSS_PARAM_INVALID, + NL80211_STA_BSS_PARAM_CTS_PROT, + NL80211_STA_BSS_PARAM_SHORT_PREAMBLE, + NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME, + NL80211_STA_BSS_PARAM_DTIM_PERIOD, + NL80211_STA_BSS_PARAM_BEACON_INTERVAL, + + /* keep last */ + __NL80211_STA_BSS_PARAM_AFTER_LAST, + NL80211_STA_BSS_PARAM_MAX = __NL80211_STA_BSS_PARAM_AFTER_LAST - 1 +}; + +/** + * enum nl80211_sta_info - station information + * + * These attribute types are used with %NL80211_ATTR_STA_INFO + * when getting information about a station. 
+ * + * @__NL80211_STA_INFO_INVALID: attribute number 0 is reserved + * @NL80211_STA_INFO_INACTIVE_TIME: time since last activity (u32, msecs) + * @NL80211_STA_INFO_RX_BYTES: total received bytes (MPDU length) + * (u32, from this station) + * @NL80211_STA_INFO_TX_BYTES: total transmitted bytes (MPDU length) + * (u32, to this station) + * @NL80211_STA_INFO_RX_BYTES64: total received bytes (MPDU length) + * (u64, from this station) + * @NL80211_STA_INFO_TX_BYTES64: total transmitted bytes (MPDU length) + * (u64, to this station) + * @NL80211_STA_INFO_SIGNAL: signal strength of last received PPDU (u8, dBm) + * @NL80211_STA_INFO_TX_BITRATE: current unicast tx rate, nested attribute + * containing info as possible, see &enum nl80211_rate_info + * @NL80211_STA_INFO_RX_PACKETS: total received packet (MSDUs and MMPDUs) + * (u32, from this station) + * @NL80211_STA_INFO_TX_PACKETS: total transmitted packets (MSDUs and MMPDUs) + * (u32, to this station) + * @NL80211_STA_INFO_TX_RETRIES: total retries (MPDUs) (u32, to this station) + * @NL80211_STA_INFO_TX_FAILED: total failed packets (MPDUs) + * (u32, to this station) + * @NL80211_STA_INFO_SIGNAL_AVG: signal strength average (u8, dBm) + * @NL80211_STA_INFO_LLID: the station's mesh LLID + * @NL80211_STA_INFO_PLID: the station's mesh PLID + * @NL80211_STA_INFO_PLINK_STATE: peer link state for the station + * (see %enum nl80211_plink_state) + * @NL80211_STA_INFO_RX_BITRATE: last unicast data frame rx rate, nested + * attribute, like NL80211_STA_INFO_TX_BITRATE. + * @NL80211_STA_INFO_BSS_PARAM: current station's view of BSS, nested attribute + * containing info as possible, see &enum nl80211_sta_bss_param + * @NL80211_STA_INFO_CONNECTED_TIME: time since the station is last connected + * @NL80211_STA_INFO_STA_FLAGS: Contains a struct nl80211_sta_flag_update. 
+ * @NL80211_STA_INFO_BEACON_LOSS: count of times beacon loss was detected (u32) + * @NL80211_STA_INFO_T_OFFSET: timing offset with respect to this STA (s64) + * @NL80211_STA_INFO_LOCAL_PM: local mesh STA link-specific power mode + * @NL80211_STA_INFO_PEER_PM: peer mesh STA link-specific power mode + * @NL80211_STA_INFO_NONPEER_PM: neighbor mesh STA power save mode towards + * non-peer STA + * @NL80211_STA_INFO_CHAIN_SIGNAL: per-chain signal strength of last PPDU + * Contains a nested array of signal strength attributes (u8, dBm) + * @NL80211_STA_INFO_CHAIN_SIGNAL_AVG: per-chain signal strength average + * Same format as NL80211_STA_INFO_CHAIN_SIGNAL. + * @NL80211_STA_INFO_EXPECTED_THROUGHPUT: expected throughput considering also the + * 802.11 header (u32, kbps) + * @NL80211_STA_INFO_RX_DROP_MISC: RX packets dropped for unspecified reasons + * (u64) + * @NL80211_STA_INFO_BEACON_RX: number of beacons received from this peer (u64) + * @NL80211_STA_INFO_BEACON_SIGNAL_AVG: signal strength average + * for beacons only (u8, dBm) + * @NL80211_STA_INFO_TID_STATS: per-TID statistics (see &enum nl80211_tid_stats) + * This is a nested attribute where each inner attribute number is the + * TID+1 and the special TID 16 (i.e. value 17) is used for non-QoS frames; + * each one of those is again nested with &enum nl80211_tid_stats + * attributes carrying the actual values. + * @NL80211_STA_INFO_RX_DURATION: aggregate PPDU duration for all frames + * received from the station (u64, usec) + * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment + * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm) + * @NL80211_STA_INFO_ACK_SIGNAL_AVG: avg signal strength of ACK frames (s8, dBm) + * @NL80211_STA_INFO_RX_MPDUS: total number of received packets (MPDUs) + * (u32, from this station) + * @NL80211_STA_INFO_FCS_ERROR_COUNT: total number of packets (MPDUs) received + * with an FCS error (u32, from this station).
This count may not include + * some packets with an FCS error due to TA corruption. Hence this counter + * might not be fully accurate. + * @NL80211_STA_INFO_CONNECTED_TO_GATE: set to true if STA has a path to a + * mesh gate (u8, 0 or 1) + * @NL80211_STA_INFO_TX_DURATION: aggregate PPDU duration for all frames + * sent to the station (u64, usec) + * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16) + * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station + * @NL80211_STA_INFO_ASSOC_AT_BOOTTIME: Timestamp (CLOCK_BOOTTIME, nanoseconds) + * of STA's association + * @NL80211_STA_INFO_CONNECTED_TO_AS: set to true if STA has a path to a + * authentication server (u8, 0 or 1) + * @__NL80211_STA_INFO_AFTER_LAST: internal + * @NL80211_STA_INFO_MAX: highest possible station info attribute + */ +enum nl80211_sta_info { + __NL80211_STA_INFO_INVALID, + NL80211_STA_INFO_INACTIVE_TIME, + NL80211_STA_INFO_RX_BYTES, + NL80211_STA_INFO_TX_BYTES, + NL80211_STA_INFO_LLID, + NL80211_STA_INFO_PLID, + NL80211_STA_INFO_PLINK_STATE, + NL80211_STA_INFO_SIGNAL, + NL80211_STA_INFO_TX_BITRATE, + NL80211_STA_INFO_RX_PACKETS, + NL80211_STA_INFO_TX_PACKETS, + NL80211_STA_INFO_TX_RETRIES, + NL80211_STA_INFO_TX_FAILED, + NL80211_STA_INFO_SIGNAL_AVG, + NL80211_STA_INFO_RX_BITRATE, + NL80211_STA_INFO_BSS_PARAM, + NL80211_STA_INFO_CONNECTED_TIME, + NL80211_STA_INFO_STA_FLAGS, + NL80211_STA_INFO_BEACON_LOSS, + NL80211_STA_INFO_T_OFFSET, + NL80211_STA_INFO_LOCAL_PM, + NL80211_STA_INFO_PEER_PM, + NL80211_STA_INFO_NONPEER_PM, + NL80211_STA_INFO_RX_BYTES64, + NL80211_STA_INFO_TX_BYTES64, + NL80211_STA_INFO_CHAIN_SIGNAL, + NL80211_STA_INFO_CHAIN_SIGNAL_AVG, + NL80211_STA_INFO_EXPECTED_THROUGHPUT, + NL80211_STA_INFO_RX_DROP_MISC, + NL80211_STA_INFO_BEACON_RX, + NL80211_STA_INFO_BEACON_SIGNAL_AVG, + NL80211_STA_INFO_TID_STATS, + NL80211_STA_INFO_RX_DURATION, + NL80211_STA_INFO_PAD, + NL80211_STA_INFO_ACK_SIGNAL, + NL80211_STA_INFO_ACK_SIGNAL_AVG, + 
NL80211_STA_INFO_RX_MPDUS, + NL80211_STA_INFO_FCS_ERROR_COUNT, + NL80211_STA_INFO_CONNECTED_TO_GATE, + NL80211_STA_INFO_TX_DURATION, + NL80211_STA_INFO_AIRTIME_WEIGHT, + NL80211_STA_INFO_AIRTIME_LINK_METRIC, + NL80211_STA_INFO_ASSOC_AT_BOOTTIME, + NL80211_STA_INFO_CONNECTED_TO_AS, + + /* keep last */ + __NL80211_STA_INFO_AFTER_LAST, + NL80211_STA_INFO_MAX = __NL80211_STA_INFO_AFTER_LAST - 1 +}; + +/* we renamed this - stay compatible */ +#define NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG NL80211_STA_INFO_ACK_SIGNAL_AVG + + +/** + * enum nl80211_tid_stats - per TID statistics attributes + * @__NL80211_TID_STATS_INVALID: attribute number 0 is reserved + * @NL80211_TID_STATS_RX_MSDU: number of MSDUs received (u64) + * @NL80211_TID_STATS_TX_MSDU: number of MSDUs transmitted (or + * attempted to transmit; u64) + * @NL80211_TID_STATS_TX_MSDU_RETRIES: number of retries for + * transmitted MSDUs (not counting the first attempt; u64) + * @NL80211_TID_STATS_TX_MSDU_FAILED: number of failed transmitted + * MSDUs (u64) + * @NL80211_TID_STATS_PAD: attribute used for padding for 64-bit alignment + * @NL80211_TID_STATS_TXQ_STATS: TXQ stats (nested attribute) + * @NUM_NL80211_TID_STATS: number of attributes here + * @NL80211_TID_STATS_MAX: highest numbered attribute here + */ +enum nl80211_tid_stats { + __NL80211_TID_STATS_INVALID, + NL80211_TID_STATS_RX_MSDU, + NL80211_TID_STATS_TX_MSDU, + NL80211_TID_STATS_TX_MSDU_RETRIES, + NL80211_TID_STATS_TX_MSDU_FAILED, + NL80211_TID_STATS_PAD, + NL80211_TID_STATS_TXQ_STATS, + + /* keep last */ + NUM_NL80211_TID_STATS, + NL80211_TID_STATS_MAX = NUM_NL80211_TID_STATS - 1 +}; + +/** + * enum nl80211_txq_stats - per TXQ statistics attributes + * @__NL80211_TXQ_STATS_INVALID: attribute number 0 is reserved + * @NUM_NL80211_TXQ_STATS: number of attributes here + * @NL80211_TXQ_STATS_BACKLOG_BYTES: number of bytes currently backlogged + * @NL80211_TXQ_STATS_BACKLOG_PACKETS: number of packets currently + * backlogged + * @NL80211_TXQ_STATS_FLOWS: total 
number of new flows seen + * @NL80211_TXQ_STATS_DROPS: total number of packet drops + * @NL80211_TXQ_STATS_ECN_MARKS: total number of packet ECN marks + * @NL80211_TXQ_STATS_OVERLIMIT: number of drops due to queue space overflow + * @NL80211_TXQ_STATS_OVERMEMORY: number of drops due to memory limit overflow + * (only for per-phy stats) + * @NL80211_TXQ_STATS_COLLISIONS: number of hash collisions + * @NL80211_TXQ_STATS_TX_BYTES: total number of bytes dequeued from TXQ + * @NL80211_TXQ_STATS_TX_PACKETS: total number of packets dequeued from TXQ + * @NL80211_TXQ_STATS_MAX_FLOWS: number of flow buckets for PHY + * @NL80211_TXQ_STATS_MAX: highest numbered attribute here + */ +enum nl80211_txq_stats { + __NL80211_TXQ_STATS_INVALID, + NL80211_TXQ_STATS_BACKLOG_BYTES, + NL80211_TXQ_STATS_BACKLOG_PACKETS, + NL80211_TXQ_STATS_FLOWS, + NL80211_TXQ_STATS_DROPS, + NL80211_TXQ_STATS_ECN_MARKS, + NL80211_TXQ_STATS_OVERLIMIT, + NL80211_TXQ_STATS_OVERMEMORY, + NL80211_TXQ_STATS_COLLISIONS, + NL80211_TXQ_STATS_TX_BYTES, + NL80211_TXQ_STATS_TX_PACKETS, + NL80211_TXQ_STATS_MAX_FLOWS, + + /* keep last */ + NUM_NL80211_TXQ_STATS, + NL80211_TXQ_STATS_MAX = NUM_NL80211_TXQ_STATS - 1 +}; + +/** + * enum nl80211_mpath_flags - nl80211 mesh path flags + * + * @NL80211_MPATH_FLAG_ACTIVE: the mesh path is active + * @NL80211_MPATH_FLAG_RESOLVING: the mesh path discovery process is running + * @NL80211_MPATH_FLAG_SN_VALID: the mesh path contains a valid SN + * @NL80211_MPATH_FLAG_FIXED: the mesh path has been manually set + * @NL80211_MPATH_FLAG_RESOLVED: the mesh path discovery process succeeded + */ +enum nl80211_mpath_flags { + NL80211_MPATH_FLAG_ACTIVE = 1<<0, + NL80211_MPATH_FLAG_RESOLVING = 1<<1, + NL80211_MPATH_FLAG_SN_VALID = 1<<2, + NL80211_MPATH_FLAG_FIXED = 1<<3, + NL80211_MPATH_FLAG_RESOLVED = 1<<4, +}; + +/** + * enum nl80211_mpath_info - mesh path information + * + * These attribute types are used with %NL80211_ATTR_MPATH_INFO when getting + * information about a mesh path. 
+ * + * @__NL80211_MPATH_INFO_INVALID: attribute number 0 is reserved + * @NL80211_MPATH_INFO_FRAME_QLEN: number of queued frames for this destination + * @NL80211_MPATH_INFO_SN: destination sequence number + * @NL80211_MPATH_INFO_METRIC: metric (cost) of this mesh path + * @NL80211_MPATH_INFO_EXPTIME: expiration time for the path, in msec from now + * @NL80211_MPATH_INFO_FLAGS: mesh path flags, enumerated in + * &enum nl80211_mpath_flags; + * @NL80211_MPATH_INFO_DISCOVERY_TIMEOUT: total path discovery timeout, in msec + * @NL80211_MPATH_INFO_DISCOVERY_RETRIES: mesh path discovery retries + * @NL80211_MPATH_INFO_HOP_COUNT: hop count to destination + * @NL80211_MPATH_INFO_PATH_CHANGE: total number of path changes to destination + * @NL80211_MPATH_INFO_MAX: highest mesh path information attribute number + * currently defined + * @__NL80211_MPATH_INFO_AFTER_LAST: internal use + */ +enum nl80211_mpath_info { + __NL80211_MPATH_INFO_INVALID, + NL80211_MPATH_INFO_FRAME_QLEN, + NL80211_MPATH_INFO_SN, + NL80211_MPATH_INFO_METRIC, + NL80211_MPATH_INFO_EXPTIME, + NL80211_MPATH_INFO_FLAGS, + NL80211_MPATH_INFO_DISCOVERY_TIMEOUT, + NL80211_MPATH_INFO_DISCOVERY_RETRIES, + NL80211_MPATH_INFO_HOP_COUNT, + NL80211_MPATH_INFO_PATH_CHANGE, + + /* keep last */ + __NL80211_MPATH_INFO_AFTER_LAST, + NL80211_MPATH_INFO_MAX = __NL80211_MPATH_INFO_AFTER_LAST - 1 +}; + +/** + * enum nl80211_band_iftype_attr - Interface type data attributes + * + * @__NL80211_BAND_IFTYPE_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_BAND_IFTYPE_ATTR_IFTYPES: nested attribute containing a flag attribute + * for each interface type that supports the band data + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC: HE MAC capabilities as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY: HE PHY capabilities as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET: HE supported NSS/MCS as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE: HE PPE thresholds 
information as + * defined in HE capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), + * given for all 6 GHz band channels + * @NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS: vendor element capabilities that are + * advertised on this band/for this iftype (binary) + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC: EHT MAC capabilities as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY: EHT PHY capabilities as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET: EHT supported NSS/MCS as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE: EHT PPE thresholds information as + * defined in EHT capabilities element + * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use + * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band attribute currently defined + */ +enum nl80211_band_iftype_attr { + __NL80211_BAND_IFTYPE_ATTR_INVALID, + + NL80211_BAND_IFTYPE_ATTR_IFTYPES, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE, + + /* keep last */ + __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, + NL80211_BAND_IFTYPE_ATTR_MAX = __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_band_attr - band attributes + * @__NL80211_BAND_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_BAND_ATTR_FREQS: supported frequencies in this band, + * an array of nested frequency attributes + * @NL80211_BAND_ATTR_RATES: supported bitrates in this band, + * an array of nested bitrate attributes + * @NL80211_BAND_ATTR_HT_MCS_SET: 16-byte attribute containing the MCS set as + * defined in 802.11n + * @NL80211_BAND_ATTR_HT_CAPA: HT capabilities, as 
in the HT information IE + * @NL80211_BAND_ATTR_HT_AMPDU_FACTOR: A-MPDU factor, as in 11n + * @NL80211_BAND_ATTR_HT_AMPDU_DENSITY: A-MPDU density, as in 11n + * @NL80211_BAND_ATTR_VHT_MCS_SET: 32-byte attribute containing the MCS set as + * defined in 802.11ac + * @NL80211_BAND_ATTR_VHT_CAPA: VHT capabilities, as in the HT information IE + * @NL80211_BAND_ATTR_IFTYPE_DATA: nested array attribute, with each entry using + * attributes from &enum nl80211_band_iftype_attr + * @NL80211_BAND_ATTR_EDMG_CHANNELS: bitmap that indicates the 2.16 GHz + * channel(s) that are allowed to be used for EDMG transmissions. + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251. + * @NL80211_BAND_ATTR_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes + * the allowed channel bandwidth configurations. + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. + * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined + * @__NL80211_BAND_ATTR_AFTER_LAST: internal use + */ +enum nl80211_band_attr { + __NL80211_BAND_ATTR_INVALID, + NL80211_BAND_ATTR_FREQS, + NL80211_BAND_ATTR_RATES, + + NL80211_BAND_ATTR_HT_MCS_SET, + NL80211_BAND_ATTR_HT_CAPA, + NL80211_BAND_ATTR_HT_AMPDU_FACTOR, + NL80211_BAND_ATTR_HT_AMPDU_DENSITY, + + NL80211_BAND_ATTR_VHT_MCS_SET, + NL80211_BAND_ATTR_VHT_CAPA, + NL80211_BAND_ATTR_IFTYPE_DATA, + + NL80211_BAND_ATTR_EDMG_CHANNELS, + NL80211_BAND_ATTR_EDMG_BW_CONFIG, + + /* keep last */ + __NL80211_BAND_ATTR_AFTER_LAST, + NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1 +}; + +#define NL80211_BAND_ATTR_HT_CAPA NL80211_BAND_ATTR_HT_CAPA + +/** + * enum nl80211_wmm_rule - regulatory wmm rule + * + * @__NL80211_WMMR_INVALID: attribute number 0 is reserved + * @NL80211_WMMR_CW_MIN: Minimum contention window slot. + * @NL80211_WMMR_CW_MAX: Maximum contention window slot. + * @NL80211_WMMR_AIFSN: Arbitration Inter Frame Space. + * @NL80211_WMMR_TXOP: Maximum allowed tx operation time. + * @NL80211_WMMR_MAX: highest possible wmm rule. 
+ * @__NL80211_WMMR_LAST: Internal use. + */ +enum nl80211_wmm_rule { + __NL80211_WMMR_INVALID, + NL80211_WMMR_CW_MIN, + NL80211_WMMR_CW_MAX, + NL80211_WMMR_AIFSN, + NL80211_WMMR_TXOP, + + /* keep last */ + __NL80211_WMMR_LAST, + NL80211_WMMR_MAX = __NL80211_WMMR_LAST - 1 +}; + +/** + * enum nl80211_frequency_attr - frequency attributes + * @__NL80211_FREQUENCY_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_FREQUENCY_ATTR_FREQ: Frequency in MHz + * @NL80211_FREQUENCY_ATTR_DISABLED: Channel is disabled in current + * regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_IR: no mechanisms that initiate radiation + * are permitted on this channel, this includes sending probe + * requests, or modes of operation that require beaconing. + * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in mBm + * (100 * dBm). + * @NL80211_FREQUENCY_ATTR_DFS_STATE: current state for DFS + * (enum nl80211_dfs_state) + * @NL80211_FREQUENCY_ATTR_DFS_TIME: time in milliseconds for how long + * this channel is in this DFS state. + * @NL80211_FREQUENCY_ATTR_NO_HT40_MINUS: HT40- isn't possible with this + * channel as the control channel + * @NL80211_FREQUENCY_ATTR_NO_HT40_PLUS: HT40+ isn't possible with this + * channel as the control channel + * @NL80211_FREQUENCY_ATTR_NO_80MHZ: any 80 MHz channel using this channel + * as the primary or any of the secondary channels isn't possible, + * this includes 80+80 channels + * @NL80211_FREQUENCY_ATTR_NO_160MHZ: any 160 MHz (but not 80+80) channel + * using this channel as the primary or any of the secondary channels + * isn't possible + * @NL80211_FREQUENCY_ATTR_DFS_CAC_TIME: DFS CAC time in milliseconds. + * @NL80211_FREQUENCY_ATTR_INDOOR_ONLY: Only indoor use is permitted on this + * channel. 
A channel that has the INDOOR_ONLY attribute can only be + * used when there is a clear assessment that the device is operating in + * an indoor surroundings, i.e., it is connected to AC power (and not + * through portable DC inverters) or is under the control of a master + * that is acting as an AP and is connected to AC power. + * @NL80211_FREQUENCY_ATTR_IR_CONCURRENT: IR operation is allowed on this + * channel if it's connected concurrently to a BSS on the same channel on + * the 2 GHz band or to a channel in the same UNII band (on the 5 GHz + * band), and IEEE80211_CHAN_RADAR is not set. Instantiating a GO or TDLS + * off-channel on a channel that has the IR_CONCURRENT attribute set can be + * done when there is a clear assessment that the device is operating under + * the guidance of an authorized master, i.e., setting up a GO or TDLS + * off-channel while the device is also connected to an AP with DFS and + * radar detection on the UNII band (it is up to user-space, i.e., + * wpa_supplicant to perform the required verifications). Using this + * attribute for IR is disallowed for master interfaces (IBSS, AP). + * @NL80211_FREQUENCY_ATTR_NO_20MHZ: 20 MHz operation is not allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_10MHZ: 10 MHz operation is not allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_WMM: this channel has wmm limitations. + * This is a nested attribute that contains the wmm limitation per AC. + * (see &enum nl80211_wmm_rule) + * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel + * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz + * @NL80211_FREQUENCY_ATTR_1MHZ: 1 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_2MHZ: 2 MHz operation is allowed + * on this channel in current regulatory domain. 
+ * @NL80211_FREQUENCY_ATTR_4MHZ: 4 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_8MHZ: 8 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_320MHZ: any 320 MHz channel using this channel + * as the primary or any of the secondary channels isn't possible + * @NL80211_FREQUENCY_ATTR_NO_EHT: EHT operation is not allowed on this channel + * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number + * currently defined + * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use + * + * See https://apps.fcc.gov/eas/comments/GetPublishedDocument.html?id=327&tn=528122 + * for more information on the FCC description of the relaxations allowed + * by NL80211_FREQUENCY_ATTR_INDOOR_ONLY and + * NL80211_FREQUENCY_ATTR_IR_CONCURRENT. + */ +enum nl80211_frequency_attr { + __NL80211_FREQUENCY_ATTR_INVALID, + NL80211_FREQUENCY_ATTR_FREQ, + NL80211_FREQUENCY_ATTR_DISABLED, + NL80211_FREQUENCY_ATTR_NO_IR, + __NL80211_FREQUENCY_ATTR_NO_IBSS, + NL80211_FREQUENCY_ATTR_RADAR, + NL80211_FREQUENCY_ATTR_MAX_TX_POWER, + NL80211_FREQUENCY_ATTR_DFS_STATE, + NL80211_FREQUENCY_ATTR_DFS_TIME, + NL80211_FREQUENCY_ATTR_NO_HT40_MINUS, + NL80211_FREQUENCY_ATTR_NO_HT40_PLUS, + NL80211_FREQUENCY_ATTR_NO_80MHZ, + NL80211_FREQUENCY_ATTR_NO_160MHZ, + NL80211_FREQUENCY_ATTR_DFS_CAC_TIME, + NL80211_FREQUENCY_ATTR_INDOOR_ONLY, + NL80211_FREQUENCY_ATTR_IR_CONCURRENT, + NL80211_FREQUENCY_ATTR_NO_20MHZ, + NL80211_FREQUENCY_ATTR_NO_10MHZ, + NL80211_FREQUENCY_ATTR_WMM, + NL80211_FREQUENCY_ATTR_NO_HE, + NL80211_FREQUENCY_ATTR_OFFSET, + NL80211_FREQUENCY_ATTR_1MHZ, + NL80211_FREQUENCY_ATTR_2MHZ, + NL80211_FREQUENCY_ATTR_4MHZ, + NL80211_FREQUENCY_ATTR_8MHZ, + NL80211_FREQUENCY_ATTR_16MHZ, + NL80211_FREQUENCY_ATTR_NO_320MHZ, + NL80211_FREQUENCY_ATTR_NO_EHT, + 
+ /* keep last */ + __NL80211_FREQUENCY_ATTR_AFTER_LAST, + NL80211_FREQUENCY_ATTR_MAX = __NL80211_FREQUENCY_ATTR_AFTER_LAST - 1 +}; + +#define NL80211_FREQUENCY_ATTR_MAX_TX_POWER NL80211_FREQUENCY_ATTR_MAX_TX_POWER +#define NL80211_FREQUENCY_ATTR_PASSIVE_SCAN NL80211_FREQUENCY_ATTR_NO_IR +#define NL80211_FREQUENCY_ATTR_NO_IBSS NL80211_FREQUENCY_ATTR_NO_IR +#define NL80211_FREQUENCY_ATTR_NO_IR NL80211_FREQUENCY_ATTR_NO_IR +#define NL80211_FREQUENCY_ATTR_GO_CONCURRENT \ + NL80211_FREQUENCY_ATTR_IR_CONCURRENT + +/** + * enum nl80211_bitrate_attr - bitrate attributes + * @__NL80211_BITRATE_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_BITRATE_ATTR_RATE: Bitrate in units of 100 kbps + * @NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE: Short preamble supported + * in 2.4 GHz band. + * @NL80211_BITRATE_ATTR_MAX: highest bitrate attribute number + * currently defined + * @__NL80211_BITRATE_ATTR_AFTER_LAST: internal use + */ +enum nl80211_bitrate_attr { + __NL80211_BITRATE_ATTR_INVALID, + NL80211_BITRATE_ATTR_RATE, + NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE, + + /* keep last */ + __NL80211_BITRATE_ATTR_AFTER_LAST, + NL80211_BITRATE_ATTR_MAX = __NL80211_BITRATE_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_reg_initiator - Indicates the initiator of a reg domain request + * @NL80211_REGDOM_SET_BY_CORE: Core queried CRDA for a dynamic world + * regulatory domain. + * @NL80211_REGDOM_SET_BY_USER: User asked the wireless core to set the + * regulatory domain. + * @NL80211_REGDOM_SET_BY_DRIVER: a wireless driver has hinted to the + * wireless core it thinks it knows the regulatory domain we should be in. + * @NL80211_REGDOM_SET_BY_COUNTRY_IE: the wireless core has received an + * 802.11 country information element with regulatory information it + * thinks we should consider. cfg80211 only processes the country + * code from the IE, and relies on the regulatory domain information + * structure passed by userspace (CRDA) from our wireless-regdb. 
+ * If a channel is enabled but the country code indicates it should + * be disabled we disable the channel and re-enable it upon disassociation. + */ +enum nl80211_reg_initiator { + NL80211_REGDOM_SET_BY_CORE, + NL80211_REGDOM_SET_BY_USER, + NL80211_REGDOM_SET_BY_DRIVER, + NL80211_REGDOM_SET_BY_COUNTRY_IE, +}; + +/** + * enum nl80211_reg_type - specifies the type of regulatory domain + * @NL80211_REGDOM_TYPE_COUNTRY: the regulatory domain set is one that pertains + * to a specific country. When this is set you can count on the + * ISO / IEC 3166 alpha2 country code being valid. + * @NL80211_REGDOM_TYPE_WORLD: the regulatory domain set is the world regulatory + * domain. + * @NL80211_REGDOM_TYPE_CUSTOM_WORLD: the regulatory domain set is a custom + * driver specific world regulatory domain. These do not apply system-wide + * and are only applicable to the individual devices which have requested + * them to be applied. + * @NL80211_REGDOM_TYPE_INTERSECTION: the regulatory domain set is the product + * of an intersection between two regulatory domains -- the previously + * set regulatory domain on the system and the last accepted regulatory + * domain request to be processed. + */ +enum nl80211_reg_type { + NL80211_REGDOM_TYPE_COUNTRY, + NL80211_REGDOM_TYPE_WORLD, + NL80211_REGDOM_TYPE_CUSTOM_WORLD, + NL80211_REGDOM_TYPE_INTERSECTION, +}; + +/** + * enum nl80211_reg_rule_attr - regulatory rule attributes + * @__NL80211_REG_RULE_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_ATTR_REG_RULE_FLAGS: a set of flags which specify additional + * considerations for a given frequency range. These are the + * &enum nl80211_reg_rule_flags. + * @NL80211_ATTR_FREQ_RANGE_START: starting frequency for the regulatory + * rule in KHz. This is not a center of frequency but an actual regulatory + * band edge. + * @NL80211_ATTR_FREQ_RANGE_END: ending frequency for the regulatory rule + * in KHz. This is not a center of frequency but an actual regulatory + * band edge. 
+ * @NL80211_ATTR_FREQ_RANGE_MAX_BW: maximum allowed bandwidth for this + * frequency range, in KHz. + * @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN: the maximum allowed antenna gain + * for a given frequency range. The value is in mBi (100 * dBi). + * If you don't have one then don't send this. + * @NL80211_ATTR_POWER_RULE_MAX_EIRP: the maximum allowed EIRP for + * a given frequency range. The value is in mBm (100 * dBm). + * @NL80211_ATTR_DFS_CAC_TIME: DFS CAC time in milliseconds. + * If not present or 0 default CAC time will be used. + * @NL80211_REG_RULE_ATTR_MAX: highest regulatory rule attribute number + * currently defined + * @__NL80211_REG_RULE_ATTR_AFTER_LAST: internal use + */ +enum nl80211_reg_rule_attr { + __NL80211_REG_RULE_ATTR_INVALID, + NL80211_ATTR_REG_RULE_FLAGS, + + NL80211_ATTR_FREQ_RANGE_START, + NL80211_ATTR_FREQ_RANGE_END, + NL80211_ATTR_FREQ_RANGE_MAX_BW, + + NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN, + NL80211_ATTR_POWER_RULE_MAX_EIRP, + + NL80211_ATTR_DFS_CAC_TIME, + + /* keep last */ + __NL80211_REG_RULE_ATTR_AFTER_LAST, + NL80211_REG_RULE_ATTR_MAX = __NL80211_REG_RULE_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_sched_scan_match_attr - scheduled scan match attributes + * @__NL80211_SCHED_SCAN_MATCH_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_SCHED_SCAN_MATCH_ATTR_SSID: SSID to be used for matching, + * only report BSS with matching SSID. + * (This cannot be used together with BSSID.) + * @NL80211_SCHED_SCAN_MATCH_ATTR_RSSI: RSSI threshold (in dBm) for reporting a + * BSS in scan results. Filtering is turned off if not specified. Note that + * if this attribute is in a match set of its own, then it is treated as + * the default value for all matchsets with an SSID, rather than being a + * matchset of its own without an RSSI filter. This is due to problems with + * how this API was implemented in the past. 
Also, due to the same problem, + * the only way to create a matchset with only an RSSI filter (with this + * attribute) is if there's only a single matchset with the RSSI attribute. + * @NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI: Flag indicating whether + * %NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to be used as absolute RSSI or + * relative to current bss's RSSI. + * @NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST: When present the RSSI level for + * BSS-es in the specified band is to be adjusted before doing + * RSSI-based BSS selection. The attribute value is a packed structure + * value as specified by &struct nl80211_bss_select_rssi_adjust. + * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching + * (this cannot be used together with SSID). + * @NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI: Nested attribute that carries the + * band specific minimum rssi thresholds for the bands defined in + * enum nl80211_band. The minimum rssi threshold value(s32) specific to a + * band shall be encapsulated in attribute with type value equals to one + * of the NL80211_BAND_* defined in enum nl80211_band. For example, the + * minimum rssi threshold value for 2.4GHZ band shall be encapsulated + * within an attribute of type NL80211_BAND_2GHZ. And one or more of such + * attributes will be nested within this attribute. 
+ * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter + * attribute number currently defined + * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use + */ +enum nl80211_sched_scan_match_attr { + __NL80211_SCHED_SCAN_MATCH_ATTR_INVALID, + + NL80211_SCHED_SCAN_MATCH_ATTR_SSID, + NL80211_SCHED_SCAN_MATCH_ATTR_RSSI, + NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI, + NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST, + NL80211_SCHED_SCAN_MATCH_ATTR_BSSID, + NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI, + + /* keep last */ + __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST, + NL80211_SCHED_SCAN_MATCH_ATTR_MAX = + __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST - 1 +}; + +/* only for backward compatibility */ +#define NL80211_ATTR_SCHED_SCAN_MATCH_SSID NL80211_SCHED_SCAN_MATCH_ATTR_SSID + +/** + * enum nl80211_reg_rule_flags - regulatory rule flags + * + * @NL80211_RRF_NO_OFDM: OFDM modulation not allowed + * @NL80211_RRF_NO_CCK: CCK modulation not allowed + * @NL80211_RRF_NO_INDOOR: indoor operation not allowed + * @NL80211_RRF_NO_OUTDOOR: outdoor operation not allowed + * @NL80211_RRF_DFS: DFS support is required to be used + * @NL80211_RRF_PTP_ONLY: this is only for Point To Point links + * @NL80211_RRF_PTMP_ONLY: this is only for Point To Multi Point links + * @NL80211_RRF_NO_IR: no mechanisms that initiate radiation are allowed, + * this includes probe requests or modes of operation that require + * beaconing. + * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated + * based on contiguous rules and wider channels will be allowed to cross + * multiple contiguous/overlapping frequency ranges. 
+ * @NL80211_RRF_IR_CONCURRENT: See %NL80211_FREQUENCY_ATTR_IR_CONCURRENT + * @NL80211_RRF_NO_HT40MINUS: channels can't be used in HT40- operation + * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation + * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed + * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed + * @NL80211_RRF_NO_HE: HE operation not allowed + * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed + */ +enum nl80211_reg_rule_flags { + NL80211_RRF_NO_OFDM = 1<<0, + NL80211_RRF_NO_CCK = 1<<1, + NL80211_RRF_NO_INDOOR = 1<<2, + NL80211_RRF_NO_OUTDOOR = 1<<3, + NL80211_RRF_DFS = 1<<4, + NL80211_RRF_PTP_ONLY = 1<<5, + NL80211_RRF_PTMP_ONLY = 1<<6, + NL80211_RRF_NO_IR = 1<<7, + __NL80211_RRF_NO_IBSS = 1<<8, + NL80211_RRF_AUTO_BW = 1<<11, + NL80211_RRF_IR_CONCURRENT = 1<<12, + NL80211_RRF_NO_HT40MINUS = 1<<13, + NL80211_RRF_NO_HT40PLUS = 1<<14, + NL80211_RRF_NO_80MHZ = 1<<15, + NL80211_RRF_NO_160MHZ = 1<<16, + NL80211_RRF_NO_HE = 1<<17, + NL80211_RRF_NO_320MHZ = 1<<18, +}; + +#define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR +#define NL80211_RRF_NO_IBSS NL80211_RRF_NO_IR +#define NL80211_RRF_NO_IR NL80211_RRF_NO_IR +#define NL80211_RRF_NO_HT40 (NL80211_RRF_NO_HT40MINUS |\ + NL80211_RRF_NO_HT40PLUS) +#define NL80211_RRF_GO_CONCURRENT NL80211_RRF_IR_CONCURRENT + +/* For backport compatibility with older userspace */ +#define NL80211_RRF_NO_IR_ALL (NL80211_RRF_NO_IR | __NL80211_RRF_NO_IBSS) + +/** + * enum nl80211_dfs_regions - regulatory DFS regions + * + * @NL80211_DFS_UNSET: Country has no DFS master region specified + * @NL80211_DFS_FCC: Country follows DFS master rules from FCC + * @NL80211_DFS_ETSI: Country follows DFS master rules from ETSI + * @NL80211_DFS_JP: Country follows DFS master rules from JP/MKK/Telec + */ +enum nl80211_dfs_regions { + NL80211_DFS_UNSET = 0, + NL80211_DFS_FCC = 1, + NL80211_DFS_ETSI = 2, + NL80211_DFS_JP = 3, +}; + +/** + * enum nl80211_user_reg_hint_type - type of user regulatory hint + * + * 
@NL80211_USER_REG_HINT_USER: a user sent the hint. This is always + * assumed if the attribute is not set. + * @NL80211_USER_REG_HINT_CELL_BASE: the hint comes from a cellular + * base station. Device drivers that have been tested to work + * properly to support this type of hint can enable these hints + * by setting the NL80211_FEATURE_CELL_BASE_REG_HINTS feature + * capability on the struct wiphy. The wireless core will + * ignore all cell base station hints until at least one device + * present has been registered with the wireless core that + * has listed NL80211_FEATURE_CELL_BASE_REG_HINTS as a + * supported feature. + * @NL80211_USER_REG_HINT_INDOOR: a user sent a hint indicating that the + * platform is operating in an indoor environment. + */ +enum nl80211_user_reg_hint_type { + NL80211_USER_REG_HINT_USER = 0, + NL80211_USER_REG_HINT_CELL_BASE = 1, + NL80211_USER_REG_HINT_INDOOR = 2, +}; + +/** + * enum nl80211_survey_info - survey information + * + * These attribute types are used with %NL80211_ATTR_SURVEY_INFO + * when getting information about a survey.
+ * + * @__NL80211_SURVEY_INFO_INVALID: attribute number 0 is reserved + * @NL80211_SURVEY_INFO_FREQUENCY: center frequency of channel + * @NL80211_SURVEY_INFO_NOISE: noise level of channel (u8, dBm) + * @NL80211_SURVEY_INFO_IN_USE: channel is currently being used + * @NL80211_SURVEY_INFO_TIME: amount of time (in ms) that the radio + * was turned on (on channel or globally) + * @NL80211_SURVEY_INFO_TIME_BUSY: amount of the time the primary + * channel was sensed busy (either due to activity or energy detect) + * @NL80211_SURVEY_INFO_TIME_EXT_BUSY: amount of time the extension + * channel was sensed busy + * @NL80211_SURVEY_INFO_TIME_RX: amount of time the radio spent + * receiving data (on channel or globally) + * @NL80211_SURVEY_INFO_TIME_TX: amount of time the radio spent + * transmitting data (on channel or globally) + * @NL80211_SURVEY_INFO_TIME_SCAN: time the radio spent for scan + * (on this channel or globally) + * @NL80211_SURVEY_INFO_PAD: attribute used for padding for 64-bit alignment + * @NL80211_SURVEY_INFO_TIME_BSS_RX: amount of time the radio spent + * receiving frames destined to the local BSS + * @NL80211_SURVEY_INFO_MAX: highest survey info attribute number + * currently defined + * @NL80211_SURVEY_INFO_FREQUENCY_OFFSET: center frequency offset in KHz + * @__NL80211_SURVEY_INFO_AFTER_LAST: internal use + */ +enum nl80211_survey_info { + __NL80211_SURVEY_INFO_INVALID, + NL80211_SURVEY_INFO_FREQUENCY, + NL80211_SURVEY_INFO_NOISE, + NL80211_SURVEY_INFO_IN_USE, + NL80211_SURVEY_INFO_TIME, + NL80211_SURVEY_INFO_TIME_BUSY, + NL80211_SURVEY_INFO_TIME_EXT_BUSY, + NL80211_SURVEY_INFO_TIME_RX, + NL80211_SURVEY_INFO_TIME_TX, + NL80211_SURVEY_INFO_TIME_SCAN, + NL80211_SURVEY_INFO_PAD, + NL80211_SURVEY_INFO_TIME_BSS_RX, + NL80211_SURVEY_INFO_FREQUENCY_OFFSET, + + /* keep last */ + __NL80211_SURVEY_INFO_AFTER_LAST, + NL80211_SURVEY_INFO_MAX = __NL80211_SURVEY_INFO_AFTER_LAST - 1 +}; + +/* keep old names for compatibility */ +#define 
NL80211_SURVEY_INFO_CHANNEL_TIME NL80211_SURVEY_INFO_TIME +#define NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY NL80211_SURVEY_INFO_TIME_BUSY +#define NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY NL80211_SURVEY_INFO_TIME_EXT_BUSY +#define NL80211_SURVEY_INFO_CHANNEL_TIME_RX NL80211_SURVEY_INFO_TIME_RX +#define NL80211_SURVEY_INFO_CHANNEL_TIME_TX NL80211_SURVEY_INFO_TIME_TX + +/** + * enum nl80211_mntr_flags - monitor configuration flags + * + * Monitor configuration flags. + * + * @__NL80211_MNTR_FLAG_INVALID: reserved + * + * @NL80211_MNTR_FLAG_FCSFAIL: pass frames with bad FCS + * @NL80211_MNTR_FLAG_PLCPFAIL: pass frames with bad PLCP + * @NL80211_MNTR_FLAG_CONTROL: pass control frames + * @NL80211_MNTR_FLAG_OTHER_BSS: disable BSSID filtering + * @NL80211_MNTR_FLAG_COOK_FRAMES: report frames after processing. + * overrides all other flags. + * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address + * and ACK incoming unicast packets. + * + * @__NL80211_MNTR_FLAG_AFTER_LAST: internal use + * @NL80211_MNTR_FLAG_MAX: highest possible monitor flag + */ +enum nl80211_mntr_flags { + __NL80211_MNTR_FLAG_INVALID, + NL80211_MNTR_FLAG_FCSFAIL, + NL80211_MNTR_FLAG_PLCPFAIL, + NL80211_MNTR_FLAG_CONTROL, + NL80211_MNTR_FLAG_OTHER_BSS, + NL80211_MNTR_FLAG_COOK_FRAMES, + NL80211_MNTR_FLAG_ACTIVE, + + /* keep last */ + __NL80211_MNTR_FLAG_AFTER_LAST, + NL80211_MNTR_FLAG_MAX = __NL80211_MNTR_FLAG_AFTER_LAST - 1 +}; + +/** + * enum nl80211_mesh_power_mode - mesh power save modes + * + * @NL80211_MESH_POWER_UNKNOWN: The mesh power mode of the mesh STA is + * not known or has not been set yet. + * @NL80211_MESH_POWER_ACTIVE: Active mesh power mode. The mesh STA is + * in Awake state all the time. + * @NL80211_MESH_POWER_LIGHT_SLEEP: Light sleep mode. The mesh STA will + * alternate between Active and Doze states, but will wake up for + * neighbor's beacons. + * @NL80211_MESH_POWER_DEEP_SLEEP: Deep sleep mode. 
The mesh STA will + * alternate between Active and Doze states, but may not wake up + * for neighbor's beacons. + * + * @__NL80211_MESH_POWER_AFTER_LAST: internal use + * @NL80211_MESH_POWER_MAX: highest possible power save level + */ + +enum nl80211_mesh_power_mode { + NL80211_MESH_POWER_UNKNOWN, + NL80211_MESH_POWER_ACTIVE, + NL80211_MESH_POWER_LIGHT_SLEEP, + NL80211_MESH_POWER_DEEP_SLEEP, + + __NL80211_MESH_POWER_AFTER_LAST, + NL80211_MESH_POWER_MAX = __NL80211_MESH_POWER_AFTER_LAST - 1 +}; + +/** + * enum nl80211_meshconf_params - mesh configuration parameters + * + * Mesh configuration parameters. These can be changed while the mesh is + * active. + * + * @__NL80211_MESHCONF_INVALID: internal use + * + * @NL80211_MESHCONF_RETRY_TIMEOUT: specifies the initial retry timeout in + * millisecond units, used by the Peer Link Open message + * + * @NL80211_MESHCONF_CONFIRM_TIMEOUT: specifies the initial confirm timeout, in + * millisecond units, used by the peer link management to close a peer link + * + * @NL80211_MESHCONF_HOLDING_TIMEOUT: specifies the holding timeout, in + * millisecond units + * + * @NL80211_MESHCONF_MAX_PEER_LINKS: maximum number of peer links allowed + * on this mesh interface + * + * @NL80211_MESHCONF_MAX_RETRIES: specifies the maximum number of peer link + * open retries that can be sent to establish a new peer link instance in a + * mesh + * + * @NL80211_MESHCONF_TTL: specifies the value of TTL field set at a source mesh + * point. + * + * @NL80211_MESHCONF_AUTO_OPEN_PLINKS: whether we should automatically open + * peer links when we detect compatible mesh peers. Disabled if + * @NL80211_MESH_SETUP_USERSPACE_MPM or @NL80211_MESH_SETUP_USERSPACE_AMPE are + * set.
+ * + * @NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES: the number of action frames + * containing a PREQ that an MP can send to a particular destination (path + * target) + * + * @NL80211_MESHCONF_PATH_REFRESH_TIME: how frequently to refresh mesh paths + * (in milliseconds) + * + * @NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT: minimum length of time to wait + * until giving up on a path discovery (in milliseconds) + * + * @NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT: The time (in TUs) for which mesh + * points receiving a PREQ shall consider the forwarding information from + * the root to be valid. (TU = time unit) + * + * @NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL: The minimum interval of time (in + * TUs) during which an MP can send only one action frame containing a PREQ + * reference element + * + * @NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME: The interval of time (in TUs) + * that it takes for an HWMP information element to propagate across the + * mesh + * + * @NL80211_MESHCONF_HWMP_ROOTMODE: whether root mode is enabled or not + * + * @NL80211_MESHCONF_ELEMENT_TTL: specifies the value of TTL field set at a + * source mesh point for path selection elements. + * + * @NL80211_MESHCONF_HWMP_RANN_INTERVAL: The interval of time (in TUs) between + * root announcements are transmitted. + * + * @NL80211_MESHCONF_GATE_ANNOUNCEMENTS: Advertise that this mesh station has + * access to a broader network beyond the MBSS. This is done via Root + * Announcement frames. + * + * @NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL: The minimum interval of time (in + * TUs) during which a mesh STA can send only one Action frame containing a + * PERR element. + * + * @NL80211_MESHCONF_FORWARDING: set Mesh STA as forwarding or non-forwarding + * or forwarding entity (default is TRUE - forwarding entity) + * + * @NL80211_MESHCONF_RSSI_THRESHOLD: RSSI threshold in dBm. This specifies the + * threshold for average signal strength of candidate station to establish + * a peer link. 
+ * + * @NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR: maximum number of neighbors + * to synchronize to for 11s default synchronization method + * (see 11C.12.2.2) + * + * @NL80211_MESHCONF_HT_OPMODE: set mesh HT protection mode. + * + * @NL80211_MESHCONF_ATTR_MAX: highest possible mesh configuration attribute + * + * @NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT: The time (in TUs) for + * which mesh STAs receiving a proactive PREQ shall consider the forwarding + * information to the root mesh STA to be valid. + * + * @NL80211_MESHCONF_HWMP_ROOT_INTERVAL: The interval of time (in TUs) between + * proactive PREQs are transmitted. + * + * @NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL: The minimum interval of time + * (in TUs) during which a mesh STA can send only one Action frame + * containing a PREQ element for root path confirmation. + * + * @NL80211_MESHCONF_POWER_MODE: Default mesh power mode for new peer links. + * type &enum nl80211_mesh_power_mode (u32) + * + * @NL80211_MESHCONF_AWAKE_WINDOW: awake window duration (in TUs) + * + * @NL80211_MESHCONF_PLINK_TIMEOUT: If no tx activity is seen from a STA we've + * established peering with for longer than this time (in seconds), then + * remove it from the STA's list of peers. You may set this to 0 to disable + * the removal of the STA. Default is 30 minutes. + * + * @NL80211_MESHCONF_CONNECTED_TO_GATE: If set to true then this mesh STA + * will advertise that it is connected to a gate in the mesh formation + * field. If left unset then the mesh formation field will only + * advertise such if there is an active root mesh path. + * + * @NL80211_MESHCONF_NOLEARN: Try to avoid multi-hop path discovery (e.g. + * PREQ/PREP for HWMP) if the destination is a direct neighbor. Note that + * this might not be the optimal decision as a multi-hop route might be + * better. So if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top. 
+ * + * @NL80211_MESHCONF_CONNECTED_TO_AS: If set to true then this mesh STA + * will advertise that it is connected to an authentication server + * in the mesh formation field. + * + * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use + */ +enum nl80211_meshconf_params { + __NL80211_MESHCONF_INVALID, + NL80211_MESHCONF_RETRY_TIMEOUT, + NL80211_MESHCONF_CONFIRM_TIMEOUT, + NL80211_MESHCONF_HOLDING_TIMEOUT, + NL80211_MESHCONF_MAX_PEER_LINKS, + NL80211_MESHCONF_MAX_RETRIES, + NL80211_MESHCONF_TTL, + NL80211_MESHCONF_AUTO_OPEN_PLINKS, + NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, + NL80211_MESHCONF_PATH_REFRESH_TIME, + NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, + NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, + NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, + NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, + NL80211_MESHCONF_HWMP_ROOTMODE, + NL80211_MESHCONF_ELEMENT_TTL, + NL80211_MESHCONF_HWMP_RANN_INTERVAL, + NL80211_MESHCONF_GATE_ANNOUNCEMENTS, + NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, + NL80211_MESHCONF_FORWARDING, + NL80211_MESHCONF_RSSI_THRESHOLD, + NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, + NL80211_MESHCONF_HT_OPMODE, + NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, + NL80211_MESHCONF_HWMP_ROOT_INTERVAL, + NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, + NL80211_MESHCONF_POWER_MODE, + NL80211_MESHCONF_AWAKE_WINDOW, + NL80211_MESHCONF_PLINK_TIMEOUT, + NL80211_MESHCONF_CONNECTED_TO_GATE, + NL80211_MESHCONF_NOLEARN, + NL80211_MESHCONF_CONNECTED_TO_AS, + + /* keep last */ + __NL80211_MESHCONF_ATTR_AFTER_LAST, + NL80211_MESHCONF_ATTR_MAX = __NL80211_MESHCONF_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_mesh_setup_params - mesh setup parameters + * + * Mesh setup parameters. These are used to start/join a mesh and cannot be + * changed while the mesh is active. + * + * @__NL80211_MESH_SETUP_INVALID: Internal use + * + * @NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL: Enable this option to use a + * vendor specific path selection algorithm or disable it to use the + * default HWMP.
+ * + * @NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC: Enable this option to use a + * vendor specific path metric or disable it to use the default Airtime + * metric. + * + * @NL80211_MESH_SETUP_IE: Information elements for this mesh, for instance, a + * robust security network ie, or a vendor specific information element + * that vendors will use to identify the path selection methods and + * metrics in use. + * + * @NL80211_MESH_SETUP_USERSPACE_AUTH: Enable this option if an authentication + * daemon will be authenticating mesh candidates. + * + * @NL80211_MESH_SETUP_USERSPACE_AMPE: Enable this option if an authentication + * daemon will be securing peer link frames. AMPE is a secured version of + * Mesh Peering Management (MPM) and is implemented with the assistance of + * a userspace daemon. When this flag is set, the kernel will send peer + * management frames to a userspace daemon that will implement AMPE + * functionality (security capabilities selection, key confirmation, and + * key management). When the flag is unset (default), the kernel can + * autonomously complete (unsecured) mesh peering without the need of a + * userspace daemon. + * + * @NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC: Enable this option to use a + * vendor specific synchronization method or disable it to use the default + * neighbor offset synchronization + * + * @NL80211_MESH_SETUP_USERSPACE_MPM: Enable this option if userspace will + * implement an MPM which handles peer allocation and state. + * + * @NL80211_MESH_SETUP_AUTH_PROTOCOL: Inform the kernel of the authentication + * method (u8, as defined in IEEE 8.4.2.100.6, e.g. 0x1 for SAE). + * Default is no authentication method required. 
+ * + * @NL80211_MESH_SETUP_ATTR_MAX: highest possible mesh setup attribute number + * + * @__NL80211_MESH_SETUP_ATTR_AFTER_LAST: Internal use + */ +enum nl80211_mesh_setup_params { + __NL80211_MESH_SETUP_INVALID, + NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL, + NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC, + NL80211_MESH_SETUP_IE, + NL80211_MESH_SETUP_USERSPACE_AUTH, + NL80211_MESH_SETUP_USERSPACE_AMPE, + NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC, + NL80211_MESH_SETUP_USERSPACE_MPM, + NL80211_MESH_SETUP_AUTH_PROTOCOL, + + /* keep last */ + __NL80211_MESH_SETUP_ATTR_AFTER_LAST, + NL80211_MESH_SETUP_ATTR_MAX = __NL80211_MESH_SETUP_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_txq_attr - TX queue parameter attributes + * @__NL80211_TXQ_ATTR_INVALID: Attribute number 0 is reserved + * @NL80211_TXQ_ATTR_AC: AC identifier (NL80211_AC_*) + * @NL80211_TXQ_ATTR_TXOP: Maximum burst time in units of 32 usecs, 0 meaning + * disabled + * @NL80211_TXQ_ATTR_CWMIN: Minimum contention window [a value of the form + * 2^n-1 in the range 1..32767] + * @NL80211_TXQ_ATTR_CWMAX: Maximum contention window [a value of the form + * 2^n-1 in the range 1..32767] + * @NL80211_TXQ_ATTR_AIFS: Arbitration interframe space [0..255] + * @__NL80211_TXQ_ATTR_AFTER_LAST: Internal + * @NL80211_TXQ_ATTR_MAX: Maximum TXQ attribute number + */ +enum nl80211_txq_attr { + __NL80211_TXQ_ATTR_INVALID, + NL80211_TXQ_ATTR_AC, + NL80211_TXQ_ATTR_TXOP, + NL80211_TXQ_ATTR_CWMIN, + NL80211_TXQ_ATTR_CWMAX, + NL80211_TXQ_ATTR_AIFS, + + /* keep last */ + __NL80211_TXQ_ATTR_AFTER_LAST, + NL80211_TXQ_ATTR_MAX = __NL80211_TXQ_ATTR_AFTER_LAST - 1 +}; + +enum nl80211_ac { + NL80211_AC_VO, + NL80211_AC_VI, + NL80211_AC_BE, + NL80211_AC_BK, + NL80211_NUM_ACS +}; + +/* backward compat */ +#define NL80211_TXQ_ATTR_QUEUE NL80211_TXQ_ATTR_AC +#define NL80211_TXQ_Q_VO NL80211_AC_VO +#define NL80211_TXQ_Q_VI NL80211_AC_VI +#define NL80211_TXQ_Q_BE NL80211_AC_BE +#define NL80211_TXQ_Q_BK NL80211_AC_BK + +/** + * enum nl80211_channel_type - 
channel type + * @NL80211_CHAN_NO_HT: 20 MHz, non-HT channel + * @NL80211_CHAN_HT20: 20 MHz HT channel + * @NL80211_CHAN_HT40MINUS: HT40 channel, secondary channel + * below the control channel + * @NL80211_CHAN_HT40PLUS: HT40 channel, secondary channel + * above the control channel + */ +enum nl80211_channel_type { + NL80211_CHAN_NO_HT, + NL80211_CHAN_HT20, + NL80211_CHAN_HT40MINUS, + NL80211_CHAN_HT40PLUS +}; + +/** + * enum nl80211_key_mode - Key mode + * + * @NL80211_KEY_RX_TX: (Default) + * Key can be used for Rx and Tx immediately + * + * The following modes can only be selected for unicast keys and when the + * driver supports @NL80211_EXT_FEATURE_EXT_KEY_ID: + * + * @NL80211_KEY_NO_TX: Only allowed in combination with @NL80211_CMD_NEW_KEY: + * Unicast key can only be used for Rx, Tx not allowed, yet + * @NL80211_KEY_SET_TX: Only allowed in combination with @NL80211_CMD_SET_KEY: + * The unicast key identified by idx and mac is cleared for Tx and becomes + * the preferred Tx key for the station. + */ +enum nl80211_key_mode { + NL80211_KEY_RX_TX, + NL80211_KEY_NO_TX, + NL80211_KEY_SET_TX +}; + +/** + * enum nl80211_chan_width - channel width definitions + * + * These values are used with the %NL80211_ATTR_CHANNEL_WIDTH + * attribute. 
+ * + * @NL80211_CHAN_WIDTH_20_NOHT: 20 MHz, non-HT channel + * @NL80211_CHAN_WIDTH_20: 20 MHz HT channel + * @NL80211_CHAN_WIDTH_40: 40 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well + * @NL80211_CHAN_WIDTH_80: 80 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well + * @NL80211_CHAN_WIDTH_80P80: 80+80 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * and %NL80211_ATTR_CENTER_FREQ2 attributes must be provided as well + * @NL80211_CHAN_WIDTH_160: 160 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well + * @NL80211_CHAN_WIDTH_5: 5 MHz OFDM channel + * @NL80211_CHAN_WIDTH_10: 10 MHz OFDM channel + * @NL80211_CHAN_WIDTH_1: 1 MHz OFDM channel + * @NL80211_CHAN_WIDTH_2: 2 MHz OFDM channel + * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel + * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel + * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel + * @NL80211_CHAN_WIDTH_320: 320 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well + */ +enum nl80211_chan_width { + NL80211_CHAN_WIDTH_20_NOHT, + NL80211_CHAN_WIDTH_20, + NL80211_CHAN_WIDTH_40, + NL80211_CHAN_WIDTH_80, + NL80211_CHAN_WIDTH_80P80, + NL80211_CHAN_WIDTH_160, + NL80211_CHAN_WIDTH_5, + NL80211_CHAN_WIDTH_10, + NL80211_CHAN_WIDTH_1, + NL80211_CHAN_WIDTH_2, + NL80211_CHAN_WIDTH_4, + NL80211_CHAN_WIDTH_8, + NL80211_CHAN_WIDTH_16, + NL80211_CHAN_WIDTH_320, +}; + +/** + * enum nl80211_bss_scan_width - control channel width for a BSS + * + * These values are used with the %NL80211_BSS_CHAN_WIDTH attribute. 
+ * + * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible + * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide + * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide + * @NL80211_BSS_CHAN_WIDTH_1: control channel is 1 MHz wide + * @NL80211_BSS_CHAN_WIDTH_2: control channel is 2 MHz wide + */ +enum nl80211_bss_scan_width { + NL80211_BSS_CHAN_WIDTH_20, + NL80211_BSS_CHAN_WIDTH_10, + NL80211_BSS_CHAN_WIDTH_5, + NL80211_BSS_CHAN_WIDTH_1, + NL80211_BSS_CHAN_WIDTH_2, +}; + +/** + * enum nl80211_bss - netlink attributes for a BSS + * + * @__NL80211_BSS_INVALID: invalid + * @NL80211_BSS_BSSID: BSSID of the BSS (6 octets) + * @NL80211_BSS_FREQUENCY: frequency in MHz (u32) + * @NL80211_BSS_TSF: TSF of the received probe response/beacon (u64) + * (if @NL80211_BSS_PRESP_DATA is present then this is known to be + * from a probe response, otherwise it may be from the same beacon + * that the NL80211_BSS_BEACON_TSF will be from) + * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16) + * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16) + * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the + * raw information elements from the probe response/beacon (bin); + * if the %NL80211_BSS_BEACON_IES attribute is present and the data is + * different then the IEs here are from a Probe Response frame; otherwise + * they are from a Beacon frame. + * However, if the driver does not indicate the source of the IEs, these + * IEs may be from either frame subtype. + * If present, the @NL80211_BSS_PRESP_DATA attribute indicates that the + * data here is known to be from a probe response, without any heuristics. 
+ * @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon + * in mBm (100 * dBm) (s32) + * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon + * in unspecified units, scaled to 0..100 (u8) + * @NL80211_BSS_STATUS: status, if this BSS is "used" + * @NL80211_BSS_SEEN_MS_AGO: age of this BSS entry in ms + * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information + * elements from a Beacon frame (bin); not present if no Beacon frame has + * yet been received + * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel + * (u32, enum nl80211_bss_scan_width) + * @NL80211_BSS_BEACON_TSF: TSF of the last received beacon (u64) + * (not present if no beacon frame has been received yet) + * @NL80211_BSS_PRESP_DATA: the data in @NL80211_BSS_INFORMATION_ELEMENTS and + * @NL80211_BSS_TSF is known to be from a probe response (flag attribute) + * @NL80211_BSS_LAST_SEEN_BOOTTIME: CLOCK_BOOTTIME timestamp when this entry + * was last updated by a received frame. The value is expected to be + * accurate to about 10ms. (u64, nanoseconds) + * @NL80211_BSS_PAD: attribute used for padding for 64-bit alignment + * @NL80211_BSS_PARENT_TSF: the time at the start of reception of the first + * octet of the timestamp field of the last beacon/probe received for + * this BSS. The time is the TSF of the BSS specified by + * @NL80211_BSS_PARENT_BSSID. (u64). + * @NL80211_BSS_PARENT_BSSID: the BSS according to which @NL80211_BSS_PARENT_TSF + * is set. + * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update. + * Contains a nested array of signal strength attributes (u8, dBm), + * using the nesting index as the antenna number. + * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz + * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8). + * @NL80211_BSS_MLD_ADDR: MLD address of this BSS if connected to it. 
+ * @__NL80211_BSS_AFTER_LAST: internal + * @NL80211_BSS_MAX: highest BSS attribute + */ +enum nl80211_bss { + __NL80211_BSS_INVALID, + NL80211_BSS_BSSID, + NL80211_BSS_FREQUENCY, + NL80211_BSS_TSF, + NL80211_BSS_BEACON_INTERVAL, + NL80211_BSS_CAPABILITY, + NL80211_BSS_INFORMATION_ELEMENTS, + NL80211_BSS_SIGNAL_MBM, + NL80211_BSS_SIGNAL_UNSPEC, + NL80211_BSS_STATUS, + NL80211_BSS_SEEN_MS_AGO, + NL80211_BSS_BEACON_IES, + NL80211_BSS_CHAN_WIDTH, + NL80211_BSS_BEACON_TSF, + NL80211_BSS_PRESP_DATA, + NL80211_BSS_LAST_SEEN_BOOTTIME, + NL80211_BSS_PAD, + NL80211_BSS_PARENT_TSF, + NL80211_BSS_PARENT_BSSID, + NL80211_BSS_CHAIN_SIGNAL, + NL80211_BSS_FREQUENCY_OFFSET, + NL80211_BSS_MLO_LINK_ID, + NL80211_BSS_MLD_ADDR, + + /* keep last */ + __NL80211_BSS_AFTER_LAST, + NL80211_BSS_MAX = __NL80211_BSS_AFTER_LAST - 1 +}; + +/** + * enum nl80211_bss_status - BSS "status" + * @NL80211_BSS_STATUS_AUTHENTICATED: Authenticated with this BSS. + * Note that this is no longer used since cfg80211 no longer + * keeps track of whether or not authentication was done with + * a given BSS. + * @NL80211_BSS_STATUS_ASSOCIATED: Associated with this BSS. + * @NL80211_BSS_STATUS_IBSS_JOINED: Joined to this IBSS. + * + * The BSS status is a BSS attribute in scan dumps, which + * indicates the status the interface has wrt. this BSS. 
+ */ +enum nl80211_bss_status { + NL80211_BSS_STATUS_AUTHENTICATED, + NL80211_BSS_STATUS_ASSOCIATED, + NL80211_BSS_STATUS_IBSS_JOINED, +}; + +/** + * enum nl80211_auth_type - AuthenticationType + * + * @NL80211_AUTHTYPE_OPEN_SYSTEM: Open System authentication + * @NL80211_AUTHTYPE_SHARED_KEY: Shared Key authentication (WEP only) + * @NL80211_AUTHTYPE_FT: Fast BSS Transition (IEEE 802.11r) + * @NL80211_AUTHTYPE_NETWORK_EAP: Network EAP (some Cisco APs and mainly LEAP) + * @NL80211_AUTHTYPE_SAE: Simultaneous authentication of equals + * @NL80211_AUTHTYPE_FILS_SK: Fast Initial Link Setup shared key + * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS + * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key + * @__NL80211_AUTHTYPE_NUM: internal + * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm + * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by + * trying multiple times); this is invalid in netlink -- leave out + * the attribute for this on CONNECT commands. 
+ */ +enum nl80211_auth_type { + NL80211_AUTHTYPE_OPEN_SYSTEM, + NL80211_AUTHTYPE_SHARED_KEY, + NL80211_AUTHTYPE_FT, + NL80211_AUTHTYPE_NETWORK_EAP, + NL80211_AUTHTYPE_SAE, + NL80211_AUTHTYPE_FILS_SK, + NL80211_AUTHTYPE_FILS_SK_PFS, + NL80211_AUTHTYPE_FILS_PK, + + /* keep last */ + __NL80211_AUTHTYPE_NUM, + NL80211_AUTHTYPE_MAX = __NL80211_AUTHTYPE_NUM - 1, + NL80211_AUTHTYPE_AUTOMATIC +}; + +/** + * enum nl80211_key_type - Key Type + * @NL80211_KEYTYPE_GROUP: Group (broadcast/multicast) key + * @NL80211_KEYTYPE_PAIRWISE: Pairwise (unicast/individual) key + * @NL80211_KEYTYPE_PEERKEY: PeerKey (DLS) + * @NUM_NL80211_KEYTYPES: number of defined key types + */ +enum nl80211_key_type { + NL80211_KEYTYPE_GROUP, + NL80211_KEYTYPE_PAIRWISE, + NL80211_KEYTYPE_PEERKEY, + + NUM_NL80211_KEYTYPES +}; + +/** + * enum nl80211_mfp - Management frame protection state + * @NL80211_MFP_NO: Management frame protection not used + * @NL80211_MFP_REQUIRED: Management frame protection required + * @NL80211_MFP_OPTIONAL: Management frame protection is optional + */ +enum nl80211_mfp { + NL80211_MFP_NO, + NL80211_MFP_REQUIRED, + NL80211_MFP_OPTIONAL, +}; + +enum nl80211_wpa_versions { + NL80211_WPA_VERSION_1 = 1 << 0, + NL80211_WPA_VERSION_2 = 1 << 1, + NL80211_WPA_VERSION_3 = 1 << 2, +}; + +/** + * enum nl80211_key_default_types - key default types + * @__NL80211_KEY_DEFAULT_TYPE_INVALID: invalid + * @NL80211_KEY_DEFAULT_TYPE_UNICAST: key should be used as default + * unicast key + * @NL80211_KEY_DEFAULT_TYPE_MULTICAST: key should be used as default + * multicast key + * @NUM_NL80211_KEY_DEFAULT_TYPES: number of default types + */ +enum nl80211_key_default_types { + __NL80211_KEY_DEFAULT_TYPE_INVALID, + NL80211_KEY_DEFAULT_TYPE_UNICAST, + NL80211_KEY_DEFAULT_TYPE_MULTICAST, + + NUM_NL80211_KEY_DEFAULT_TYPES +}; + +/** + * enum nl80211_key_attributes - key attributes + * @__NL80211_KEY_INVALID: invalid + * @NL80211_KEY_DATA: (temporal) key data; for TKIP this consists of + * 16 bytes 
encryption key followed by 8 bytes each for TX and RX MIC + * keys + * @NL80211_KEY_IDX: key ID (u8, 0-3) + * @NL80211_KEY_CIPHER: key cipher suite (u32, as defined by IEEE 802.11 + * section 7.3.2.25.1, e.g. 0x000FAC04) + * @NL80211_KEY_SEQ: transmit key sequence number (IV/PN) for TKIP and + * CCMP keys, each six bytes in little endian + * @NL80211_KEY_DEFAULT: flag indicating default key + * @NL80211_KEY_DEFAULT_MGMT: flag indicating default management key + * @NL80211_KEY_TYPE: the key type from enum nl80211_key_type, if not + * specified the default depends on whether a MAC address was + * given with the command using the key or not (u32) + * @NL80211_KEY_DEFAULT_TYPES: A nested attribute containing flags + * attributes, specifying what a key should be set as default as. + * See &enum nl80211_key_default_types. + * @NL80211_KEY_MODE: the mode from enum nl80211_key_mode. + * Defaults to @NL80211_KEY_RX_TX. + * @NL80211_KEY_DEFAULT_BEACON: flag indicating default Beacon frame key + * + * @__NL80211_KEY_AFTER_LAST: internal + * @NL80211_KEY_MAX: highest key attribute + */ +enum nl80211_key_attributes { + __NL80211_KEY_INVALID, + NL80211_KEY_DATA, + NL80211_KEY_IDX, + NL80211_KEY_CIPHER, + NL80211_KEY_SEQ, + NL80211_KEY_DEFAULT, + NL80211_KEY_DEFAULT_MGMT, + NL80211_KEY_TYPE, + NL80211_KEY_DEFAULT_TYPES, + NL80211_KEY_MODE, + NL80211_KEY_DEFAULT_BEACON, + + /* keep last */ + __NL80211_KEY_AFTER_LAST, + NL80211_KEY_MAX = __NL80211_KEY_AFTER_LAST - 1 +}; + +/** + * enum nl80211_tx_rate_attributes - TX rate set attributes + * @__NL80211_TXRATE_INVALID: invalid + * @NL80211_TXRATE_LEGACY: Legacy (non-MCS) rates allowed for TX rate selection + * in an array of rates as defined in IEEE 802.11 7.3.2.2 (u8 values with + * 1 = 500 kbps) but without the IE length restriction (at most + * %NL80211_MAX_SUPP_RATES in a single array). + * @NL80211_TXRATE_HT: HT (MCS) rates allowed for TX rate selection + * in an array of MCS numbers. 
+ * @NL80211_TXRATE_VHT: VHT rates allowed for TX rate selection, + * see &struct nl80211_txrate_vht + * @NL80211_TXRATE_GI: configure GI, see &enum nl80211_txrate_gi + * @NL80211_TXRATE_HE: HE rates allowed for TX rate selection, + * see &struct nl80211_txrate_he + * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. + * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. + * @__NL80211_TXRATE_AFTER_LAST: internal + * @NL80211_TXRATE_MAX: highest TX rate attribute + */ +enum nl80211_tx_rate_attributes { + __NL80211_TXRATE_INVALID, + NL80211_TXRATE_LEGACY, + NL80211_TXRATE_HT, + NL80211_TXRATE_VHT, + NL80211_TXRATE_GI, + NL80211_TXRATE_HE, + NL80211_TXRATE_HE_GI, + NL80211_TXRATE_HE_LTF, + + /* keep last */ + __NL80211_TXRATE_AFTER_LAST, + NL80211_TXRATE_MAX = __NL80211_TXRATE_AFTER_LAST - 1 +}; + +#define NL80211_TXRATE_MCS NL80211_TXRATE_HT +#define NL80211_VHT_NSS_MAX 8 + +/** + * struct nl80211_txrate_vht - VHT MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_vht { + __u16 mcs[NL80211_VHT_NSS_MAX]; +}; + +#define NL80211_HE_NSS_MAX 8 +/** + * struct nl80211_txrate_he - HE MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) 
+ */ +struct nl80211_txrate_he { + __u16 mcs[NL80211_HE_NSS_MAX]; +}; + +enum nl80211_txrate_gi { + NL80211_TXRATE_DEFAULT_GI, + NL80211_TXRATE_FORCE_SGI, + NL80211_TXRATE_FORCE_LGI, +}; + +/** + * enum nl80211_band - Frequency band + * @NL80211_BAND_2GHZ: 2.4 GHz ISM band + * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz) + * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) + * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) + * @NL80211_BAND_S1GHZ: around 900MHz, supported by S1G PHYs + * @NL80211_BAND_LC: light communication band (placeholder) + * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace + * since newer kernel versions may support more bands + */ +enum nl80211_band { + NL80211_BAND_2GHZ, + NL80211_BAND_5GHZ, + NL80211_BAND_60GHZ, + NL80211_BAND_6GHZ, + NL80211_BAND_S1GHZ, + NL80211_BAND_LC, + + NUM_NL80211_BANDS, +}; + +/** + * enum nl80211_ps_state - powersave state + * @NL80211_PS_DISABLED: powersave is disabled + * @NL80211_PS_ENABLED: powersave is enabled + */ +enum nl80211_ps_state { + NL80211_PS_DISABLED, + NL80211_PS_ENABLED, +}; + +/** + * enum nl80211_attr_cqm - connection quality monitor attributes + * @__NL80211_ATTR_CQM_INVALID: invalid + * @NL80211_ATTR_CQM_RSSI_THOLD: RSSI threshold in dBm. This value specifies + * the threshold for the RSSI level at which an event will be sent. Zero + * to disable. Alternatively, if %NL80211_EXT_FEATURE_CQM_RSSI_LIST is + * set, multiple values can be supplied as a low-to-high sorted array of + * threshold values in dBm. Events will be sent when the RSSI value + * crosses any of the thresholds. + * @NL80211_ATTR_CQM_RSSI_HYST: RSSI hysteresis in dBm. This value specifies + * the minimum amount the RSSI level must change after an event before a + * new event may be issued (to reduce effects of RSSI oscillation). 
+ * @NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT: RSSI threshold event + * @NL80211_ATTR_CQM_PKT_LOSS_EVENT: a u32 value indicating that this many + * consecutive packets were not acknowledged by the peer + * @NL80211_ATTR_CQM_TXE_RATE: TX error rate in %. Minimum % of TX failures + * during the given %NL80211_ATTR_CQM_TXE_INTVL before an + * %NL80211_CMD_NOTIFY_CQM with reported %NL80211_ATTR_CQM_TXE_RATE and + * %NL80211_ATTR_CQM_TXE_PKTS is generated. + * @NL80211_ATTR_CQM_TXE_PKTS: number of attempted packets in a given + * %NL80211_ATTR_CQM_TXE_INTVL before %NL80211_ATTR_CQM_TXE_RATE is + * checked. + * @NL80211_ATTR_CQM_TXE_INTVL: interval in seconds. Specifies the periodic + * interval in which %NL80211_ATTR_CQM_TXE_PKTS and + * %NL80211_ATTR_CQM_TXE_RATE must be satisfied before generating an + * %NL80211_CMD_NOTIFY_CQM. Set to 0 to turn off TX error reporting. + * @NL80211_ATTR_CQM_BEACON_LOSS_EVENT: flag attribute that's set in a beacon + * loss event + * @NL80211_ATTR_CQM_RSSI_LEVEL: the RSSI value in dBm that triggered the + * RSSI threshold event. 
+ * @__NL80211_ATTR_CQM_AFTER_LAST: internal + * @NL80211_ATTR_CQM_MAX: highest key attribute + */ +enum nl80211_attr_cqm { + __NL80211_ATTR_CQM_INVALID, + NL80211_ATTR_CQM_RSSI_THOLD, + NL80211_ATTR_CQM_RSSI_HYST, + NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT, + NL80211_ATTR_CQM_PKT_LOSS_EVENT, + NL80211_ATTR_CQM_TXE_RATE, + NL80211_ATTR_CQM_TXE_PKTS, + NL80211_ATTR_CQM_TXE_INTVL, + NL80211_ATTR_CQM_BEACON_LOSS_EVENT, + NL80211_ATTR_CQM_RSSI_LEVEL, + + /* keep last */ + __NL80211_ATTR_CQM_AFTER_LAST, + NL80211_ATTR_CQM_MAX = __NL80211_ATTR_CQM_AFTER_LAST - 1 +}; + +/** + * enum nl80211_cqm_rssi_threshold_event - RSSI threshold event + * @NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW: The RSSI level is lower than the + * configured threshold + * @NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH: The RSSI is higher than the + * configured threshold + * @NL80211_CQM_RSSI_BEACON_LOSS_EVENT: (reserved, never sent) + */ +enum nl80211_cqm_rssi_threshold_event { + NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, + NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, + NL80211_CQM_RSSI_BEACON_LOSS_EVENT, +}; + + +/** + * enum nl80211_tx_power_setting - TX power adjustment + * @NL80211_TX_POWER_AUTOMATIC: automatically determine transmit power + * @NL80211_TX_POWER_LIMITED: limit TX power by the mBm parameter + * @NL80211_TX_POWER_FIXED: fix TX power to the mBm parameter + */ +enum nl80211_tx_power_setting { + NL80211_TX_POWER_AUTOMATIC, + NL80211_TX_POWER_LIMITED, + NL80211_TX_POWER_FIXED, +}; + +/** + * enum nl80211_tid_config - TID config state + * @NL80211_TID_CONFIG_ENABLE: Enable config for the TID + * @NL80211_TID_CONFIG_DISABLE: Disable config for the TID + */ +enum nl80211_tid_config { + NL80211_TID_CONFIG_ENABLE, + NL80211_TID_CONFIG_DISABLE, +}; + +/* enum nl80211_tx_rate_setting - TX rate configuration type + * @NL80211_TX_RATE_AUTOMATIC: automatically determine TX rate + * @NL80211_TX_RATE_LIMITED: limit the TX rate by the TX rate parameter + * @NL80211_TX_RATE_FIXED: fix TX rate to the TX rate parameter + */ 
+enum nl80211_tx_rate_setting { + NL80211_TX_RATE_AUTOMATIC, + NL80211_TX_RATE_LIMITED, + NL80211_TX_RATE_FIXED, +}; + +/* enum nl80211_tid_config_attr - TID specific configuration. + * @NL80211_TID_CONFIG_ATTR_PAD: pad attribute for 64-bit values + * @NL80211_TID_CONFIG_ATTR_VIF_SUPP: a bitmap (u64) of attributes supported + * for per-vif configuration; doesn't list the ones that are generic + * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). + * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but + * per peer instead. + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribute, if set indicates + * that the new configuration overrides all previous peer + * configurations, otherwise previous peer specific configurations + * should be left untouched. + * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs (bit 0 to 7) + * Its type is u16. + * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TIDs + * specified in %NL80211_TID_CONFIG_ATTR_TIDS, see &enum nl80211_tid_config. + * Its type is u8. + * @NL80211_TID_CONFIG_ATTR_RETRY_SHORT: Number of retries used with data frame + * transmission, user-space sets this configuration in + * &NL80211_CMD_SET_TID_CONFIG. It is u8 type, min value is 1 and + * the max value is advertised by the driver in this attribute on + * output in wiphy capabilities. + * @NL80211_TID_CONFIG_ATTR_RETRY_LONG: Number of retries used with data frame + * transmission, user-space sets this configuration in + * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and + * the max value is advertised by the driver in this attribute on + * output in wiphy capabilities. + * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable MPDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL: Enable/Disable RTS_CTS for the TIDs + * specified in %NL80211_TID_CONFIG_ATTR_TIDS. 
It is u8 type, using + * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE: This attribute will be useful + * to notify the driver what type of txrate should be used + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS, using + * the values from &nl80211_tx_rate_setting. + * @NL80211_TID_CONFIG_ATTR_TX_RATE: Data frame TX rate mask should be applied + * with the parameters passed through %NL80211_ATTR_TX_RATES. + * configuration is applied to the data frame for the tid to that connected + * station. + */ +enum nl80211_tid_config_attr { + __NL80211_TID_CONFIG_ATTR_INVALID, + NL80211_TID_CONFIG_ATTR_PAD, + NL80211_TID_CONFIG_ATTR_VIF_SUPP, + NL80211_TID_CONFIG_ATTR_PEER_SUPP, + NL80211_TID_CONFIG_ATTR_OVERRIDE, + NL80211_TID_CONFIG_ATTR_TIDS, + NL80211_TID_CONFIG_ATTR_NOACK, + NL80211_TID_CONFIG_ATTR_RETRY_SHORT, + NL80211_TID_CONFIG_ATTR_RETRY_LONG, + NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, + NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, + NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, + NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, + NL80211_TID_CONFIG_ATTR_TX_RATE, + + /* keep last */ + __NL80211_TID_CONFIG_ATTR_AFTER_LAST, + NL80211_TID_CONFIG_ATTR_MAX = __NL80211_TID_CONFIG_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_packet_pattern_attr - packet pattern attribute + * @__NL80211_PKTPAT_INVALID: invalid number for nested attribute + * @NL80211_PKTPAT_PATTERN: the pattern, values where the mask has + * a zero bit are ignored + * @NL80211_PKTPAT_MASK: pattern mask, must be long enough to have + * a bit for each byte in the pattern. The lowest-order bit corresponds + * to the first byte of the pattern, but the bytes of the pattern are + * in a little-endian-like format, i.e.
the 9th byte of the pattern + * corresponds to the lowest-order bit in the second byte of the mask. + * For example: The match 00:xx:00:00:xx:00:00:00:00:xx:xx:xx (where + * xx indicates "don't care") would be represented by a pattern of + * twelve zero bytes, and a mask of "0xed,0x01". + * Note that the pattern matching is done as though frames were not + * 802.11 frames but 802.3 frames, i.e. the frame is fully unpacked + * first (including SNAP header unpacking) and then matched. + * @NL80211_PKTPAT_OFFSET: packet offset, pattern is matched after + * these fixed number of bytes of received packet + * @NUM_NL80211_PKTPAT: number of attributes + * @MAX_NL80211_PKTPAT: max attribute number + */ +enum nl80211_packet_pattern_attr { + __NL80211_PKTPAT_INVALID, + NL80211_PKTPAT_MASK, + NL80211_PKTPAT_PATTERN, + NL80211_PKTPAT_OFFSET, + + NUM_NL80211_PKTPAT, + MAX_NL80211_PKTPAT = NUM_NL80211_PKTPAT - 1, +}; + +/** + * struct nl80211_pattern_support - packet pattern support information + * @max_patterns: maximum number of patterns supported + * @min_pattern_len: minimum length of each pattern + * @max_pattern_len: maximum length of each pattern + * @max_pkt_offset: maximum Rx packet offset + * + * This struct is carried in %NL80211_WOWLAN_TRIG_PKT_PATTERN when + * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED or in + * %NL80211_ATTR_COALESCE_RULE_PKT_PATTERN when that is part of + * %NL80211_ATTR_COALESCE_RULE in the capability information given + * by the kernel to userspace. 
+ */ +struct nl80211_pattern_support { + __u32 max_patterns; + __u32 min_pattern_len; + __u32 max_pattern_len; + __u32 max_pkt_offset; +} __attribute__((packed)); + +/* only for backward compatibility */ +#define __NL80211_WOWLAN_PKTPAT_INVALID __NL80211_PKTPAT_INVALID +#define NL80211_WOWLAN_PKTPAT_MASK NL80211_PKTPAT_MASK +#define NL80211_WOWLAN_PKTPAT_PATTERN NL80211_PKTPAT_PATTERN +#define NL80211_WOWLAN_PKTPAT_OFFSET NL80211_PKTPAT_OFFSET +#define NUM_NL80211_WOWLAN_PKTPAT NUM_NL80211_PKTPAT +#define MAX_NL80211_WOWLAN_PKTPAT MAX_NL80211_PKTPAT +#define nl80211_wowlan_pattern_support nl80211_pattern_support + +/** + * enum nl80211_wowlan_triggers - WoWLAN trigger definitions + * @__NL80211_WOWLAN_TRIG_INVALID: invalid number for nested attributes + * @NL80211_WOWLAN_TRIG_ANY: wake up on any activity, do not really put + * the chip into a special state -- works best with chips that have + * support for low-power operation already (flag) + * Note that this mode is incompatible with all of the others, if + * any others are even supported by the device. + * @NL80211_WOWLAN_TRIG_DISCONNECT: wake up on disconnect, the way disconnect + * is detected is implementation-specific (flag) + * @NL80211_WOWLAN_TRIG_MAGIC_PKT: wake up on magic packet (6x 0xff, followed + * by 16 repetitions of MAC addr, anywhere in payload) (flag) + * @NL80211_WOWLAN_TRIG_PKT_PATTERN: wake up on the specified packet patterns + * which are passed in an array of nested attributes, each nested attribute + * defining a pattern with attributes from &enum nl80211_packet_pattern_attr. + * Each pattern defines a wakeup packet. Packet offset is associated with + * each pattern which is used while matching the pattern. The matching is + * done on the MSDU, i.e. as though the packet was an 802.3 packet, so the + * pattern matching is done after the packet is converted to the MSDU. + * + * In %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED, it is a binary attribute + * carrying a &struct nl80211_pattern_support.
+ * + * When reporting wakeup, it is a u32 attribute containing the 0-based + * index of the pattern that caused the wakeup, in the patterns passed + * to the kernel when configuring. + * @NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED: Not a real trigger, and cannot be + * used when setting, used only to indicate that GTK rekeying is supported + * by the device (flag) + * @NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE: wake up on GTK rekey failure (if + * done by the device) (flag) + * @NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST: wake up on EAP Identity Request + * packet (flag) + * @NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE: wake up on 4-way handshake (flag) + * @NL80211_WOWLAN_TRIG_RFKILL_RELEASE: wake up when rfkill is released + * (on devices that have rfkill in the device) (flag) + * @NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211: For wakeup reporting only, contains + * the 802.11 packet that caused the wakeup, e.g. a deauth frame. The frame + * may be truncated, the @NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN + * attribute contains the original length. + * @NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN: Original length of the 802.11 + * packet, may be bigger than the @NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211 + * attribute if the packet was truncated somewhere. + * @NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023: For wakeup reporting only, contains the + * 802.3 packet that caused the wakeup, e.g. a magic packet. The frame may + * be truncated, the @NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023_LEN attribute + * contains the original length. + * @NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023_LEN: Original length of the 802.3 + * packet, may be bigger than the @NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023 + * attribute if the packet was truncated somewhere. + * @NL80211_WOWLAN_TRIG_TCP_CONNECTION: TCP connection wake, see DOC section + * "TCP connection wakeup" for more details. This is a nested attribute + * containing the exact information for establishing and keeping alive + * the TCP connection.
+ * @NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH: For wakeup reporting only, the + * wakeup packet was received on the TCP connection + * @NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST: For wakeup reporting only, the + * TCP connection was lost or failed to be established + * @NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS: For wakeup reporting only, + * the TCP connection ran out of tokens to use for data to send to the + * service + * @NL80211_WOWLAN_TRIG_NET_DETECT: wake up when a configured network + * is detected. This is a nested attribute that contains the + * same attributes used with @NL80211_CMD_START_SCHED_SCAN. It + * specifies how the scan is performed (e.g. the interval, the + * channels to scan and the initial delay) as well as the scan + * results that will trigger a wake (i.e. the matchsets). This + * attribute is also sent in a response to + * @NL80211_CMD_GET_WIPHY, indicating the number of match sets + * supported by the driver (u32). + * @NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS: nested attribute + * containing an array with information about what triggered the + * wake up. If no elements are present in the array, it means + * that the information is not available. If more than one + * element is present, it means that more than one match + * occurred. + * Each element in the array is a nested attribute that contains + * one optional %NL80211_ATTR_SSID attribute and one optional + * %NL80211_ATTR_SCAN_FREQUENCIES attribute. At least one of + * these attributes must be present. If + * %NL80211_ATTR_SCAN_FREQUENCIES contains more than one + * frequency, it means that the match occurred in more than one + * channel. + * @NUM_NL80211_WOWLAN_TRIG: number of wake on wireless triggers + * @MAX_NL80211_WOWLAN_TRIG: highest wowlan trigger attribute number + * + * These nested attributes are used to configure the wakeup triggers and + * to report the wakeup reason(s).
+ */ +enum nl80211_wowlan_triggers { + __NL80211_WOWLAN_TRIG_INVALID, + NL80211_WOWLAN_TRIG_ANY, + NL80211_WOWLAN_TRIG_DISCONNECT, + NL80211_WOWLAN_TRIG_MAGIC_PKT, + NL80211_WOWLAN_TRIG_PKT_PATTERN, + NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED, + NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE, + NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST, + NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE, + NL80211_WOWLAN_TRIG_RFKILL_RELEASE, + NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211, + NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN, + NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023, + NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023_LEN, + NL80211_WOWLAN_TRIG_TCP_CONNECTION, + NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH, + NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST, + NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS, + NL80211_WOWLAN_TRIG_NET_DETECT, + NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS, + + /* keep last */ + NUM_NL80211_WOWLAN_TRIG, + MAX_NL80211_WOWLAN_TRIG = NUM_NL80211_WOWLAN_TRIG - 1 +}; + +/** + * DOC: TCP connection wakeup + * + * Some devices can establish a TCP connection in order to be woken up by a + * packet coming in from outside their network segment, or behind NAT. If + * configured, the device will establish a TCP connection to the given + * service, and periodically send data to that service. The first data + * packet is usually transmitted after SYN/ACK, also ACKing the SYN/ACK. + * The data packets can optionally include a (little endian) sequence + * number (in the TCP payload!) that is generated by the device, and, also + * optionally, a token from a list of tokens. This serves as a keep-alive + * with the service, and for NATed connections, etc. + * + * During this keep-alive period, the server doesn't send any data to the + * client. When receiving data, it is compared against the wakeup pattern + * (and mask) and if it matches, the host is woken up. Similarly, if the + * connection breaks or cannot be established to start with, the host is + * also woken up. 
+ * + * Developer's note: ARP offload is required for this, otherwise TCP + * response packets might not go through correctly. + */ + +/** + * struct nl80211_wowlan_tcp_data_seq - WoWLAN TCP data sequence + * @start: starting value + * @offset: offset of sequence number in packet + * @len: length of the sequence value to write, 1 through 4 + * + * Note: don't confuse with the TCP sequence number(s), this is for the + * keepalive packet payload. The actual value is written into the packet + * in little endian. + */ +struct nl80211_wowlan_tcp_data_seq { + __u32 start, offset, len; +}; + +/** + * struct nl80211_wowlan_tcp_data_token - WoWLAN TCP data token config + * @offset: offset of token in packet + * @len: length of each token + * @token_stream: stream of data to be used for the tokens, the length must + * be a multiple of @len for this to make sense + */ +struct nl80211_wowlan_tcp_data_token { + __u32 offset, len; + __u8 token_stream[]; +}; + +/** + * struct nl80211_wowlan_tcp_data_token_feature - data token features + * @min_len: minimum token length + * @max_len: maximum token length + * @bufsize: total available token buffer size (max size of @token_stream) + */ +struct nl80211_wowlan_tcp_data_token_feature { + __u32 min_len, max_len, bufsize; +}; + +/** + * enum nl80211_wowlan_tcp_attrs - WoWLAN TCP connection parameters + * @__NL80211_WOWLAN_TCP_INVALID: invalid number for nested attributes + * @NL80211_WOWLAN_TCP_SRC_IPV4: source IPv4 address (in network byte order) + * @NL80211_WOWLAN_TCP_DST_IPV4: destination IPv4 address + * (in network byte order) + * @NL80211_WOWLAN_TCP_DST_MAC: destination MAC address, this is given because + * route lookup when configured might be invalid by the time we suspend, + * and doing a route lookup when suspending is no longer possible as it + * might require ARP querying. 
+ * @NL80211_WOWLAN_TCP_SRC_PORT: source port (u16); optional, if not given a + * socket and port will be allocated + * @NL80211_WOWLAN_TCP_DST_PORT: destination port (u16) + * @NL80211_WOWLAN_TCP_DATA_PAYLOAD: data packet payload, at least one byte. + * For feature advertising, a u32 attribute holding the maximum length + * of the data payload. + * @NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ: data packet sequence configuration + * (if desired), a &struct nl80211_wowlan_tcp_data_seq. For feature + * advertising it is just a flag + * @NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN: data packet token configuration, + * see &struct nl80211_wowlan_tcp_data_token and for advertising see + * &struct nl80211_wowlan_tcp_data_token_feature. + * @NL80211_WOWLAN_TCP_DATA_INTERVAL: data interval in seconds, maximum + * interval in feature advertising (u32) + * @NL80211_WOWLAN_TCP_WAKE_PAYLOAD: wake packet payload, for advertising a + * u32 attribute holding the maximum length + * @NL80211_WOWLAN_TCP_WAKE_MASK: Wake packet payload mask, not used for + * feature advertising. The mask works like @NL80211_PKTPAT_MASK + * but on the TCP payload only. 
+ * @NUM_NL80211_WOWLAN_TCP: number of TCP attributes + * @MAX_NL80211_WOWLAN_TCP: highest attribute number + */ +enum nl80211_wowlan_tcp_attrs { + __NL80211_WOWLAN_TCP_INVALID, + NL80211_WOWLAN_TCP_SRC_IPV4, + NL80211_WOWLAN_TCP_DST_IPV4, + NL80211_WOWLAN_TCP_DST_MAC, + NL80211_WOWLAN_TCP_SRC_PORT, + NL80211_WOWLAN_TCP_DST_PORT, + NL80211_WOWLAN_TCP_DATA_PAYLOAD, + NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ, + NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN, + NL80211_WOWLAN_TCP_DATA_INTERVAL, + NL80211_WOWLAN_TCP_WAKE_PAYLOAD, + NL80211_WOWLAN_TCP_WAKE_MASK, + + /* keep last */ + NUM_NL80211_WOWLAN_TCP, + MAX_NL80211_WOWLAN_TCP = NUM_NL80211_WOWLAN_TCP - 1 +}; + +/** + * struct nl80211_coalesce_rule_support - coalesce rule support information + * @max_rules: maximum number of rules supported + * @pat: packet pattern support information + * @max_delay: maximum supported coalescing delay in msecs + * + * This struct is carried in %NL80211_ATTR_COALESCE_RULE in the + * capability information given by the kernel to userspace. + */ +struct nl80211_coalesce_rule_support { + __u32 max_rules; + struct nl80211_pattern_support pat; + __u32 max_delay; +} __attribute__((packed)); + +/** + * enum nl80211_attr_coalesce_rule - coalesce rule attribute + * @__NL80211_COALESCE_RULE_INVALID: invalid number for nested attribute + * @NL80211_ATTR_COALESCE_RULE_DELAY: delay in msecs used for packet coalescing + * @NL80211_ATTR_COALESCE_RULE_CONDITION: condition for packet coalescence, + * see &enum nl80211_coalesce_condition. 
+ * @NL80211_ATTR_COALESCE_RULE_PKT_PATTERN: packet offset, pattern is matched + * after these fixed number of bytes of received packet + * @NUM_NL80211_ATTR_COALESCE_RULE: number of attributes + * @NL80211_ATTR_COALESCE_RULE_MAX: max attribute number + */ +enum nl80211_attr_coalesce_rule { + __NL80211_COALESCE_RULE_INVALID, + NL80211_ATTR_COALESCE_RULE_DELAY, + NL80211_ATTR_COALESCE_RULE_CONDITION, + NL80211_ATTR_COALESCE_RULE_PKT_PATTERN, + + /* keep last */ + NUM_NL80211_ATTR_COALESCE_RULE, + NL80211_ATTR_COALESCE_RULE_MAX = NUM_NL80211_ATTR_COALESCE_RULE - 1 +}; + +/** + * enum nl80211_coalesce_condition - coalesce rule conditions + * @NL80211_COALESCE_CONDITION_MATCH: coalesce Rx packets when patterns + * in a rule are matched. + * @NL80211_COALESCE_CONDITION_NO_MATCH: coalesce Rx packets when patterns + * in a rule are not matched. + */ +enum nl80211_coalesce_condition { + NL80211_COALESCE_CONDITION_MATCH, + NL80211_COALESCE_CONDITION_NO_MATCH +}; + +/** + * enum nl80211_iface_limit_attrs - limit attributes + * @NL80211_IFACE_LIMIT_UNSPEC: (reserved) + * @NL80211_IFACE_LIMIT_MAX: maximum number of interfaces that + * can be chosen from this set of interface types (u32) + * @NL80211_IFACE_LIMIT_TYPES: nested attribute containing a + * flag attribute for each interface type in this set + * @NUM_NL80211_IFACE_LIMIT: number of attributes + * @MAX_NL80211_IFACE_LIMIT: highest attribute number + */ +enum nl80211_iface_limit_attrs { + NL80211_IFACE_LIMIT_UNSPEC, + NL80211_IFACE_LIMIT_MAX, + NL80211_IFACE_LIMIT_TYPES, + + /* keep last */ + NUM_NL80211_IFACE_LIMIT, + MAX_NL80211_IFACE_LIMIT = NUM_NL80211_IFACE_LIMIT - 1 +}; + +/** + * enum nl80211_if_combination_attrs -- interface combination attributes + * + * @NL80211_IFACE_COMB_UNSPEC: (reserved) + * @NL80211_IFACE_COMB_LIMITS: Nested attributes containing the limits + * for given interface types, see &enum nl80211_iface_limit_attrs.
+ * @NL80211_IFACE_COMB_MAXNUM: u32 attribute giving the total number of + * interfaces that can be created in this group. This number doesn't + * apply to interfaces purely managed in software, which are listed + * in a separate attribute %NL80211_ATTR_INTERFACES_SOFTWARE. + * @NL80211_IFACE_COMB_STA_AP_BI_MATCH: flag attribute specifying that + * beacon intervals within this group must be all the same even for + * infrastructure and AP/GO combinations, i.e. the GO(s) must adopt + * the infrastructure network's beacon interval. + * @NL80211_IFACE_COMB_NUM_CHANNELS: u32 attribute specifying how many + * different channels may be used within this group. + * @NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS: u32 attribute containing the bitmap + * of supported channel widths for radar detection. + * @NL80211_IFACE_COMB_RADAR_DETECT_REGIONS: u32 attribute containing the bitmap + * of supported regulatory regions for radar detection. + * @NL80211_IFACE_COMB_BI_MIN_GCD: u32 attribute specifying the minimum GCD of + * different beacon intervals supported by all the interface combinations + * in this group (if not present, all beacon intervals must be identical). + * @NUM_NL80211_IFACE_COMB: number of attributes + * @MAX_NL80211_IFACE_COMB: highest attribute number + * + * Examples: + * limits = [ #{STA} <= 1, #{AP} <= 1 ], matching BI, channels = 1, max = 2 + * => allows an AP and a STA that must match BIs + * + * numbers = [ #{AP, P2P-GO} <= 8 ], BI min gcd, channels = 1, max = 8, + * => allows 8 of AP/GO that can have BI gcd >= min gcd + * + * numbers = [ #{STA} <= 2 ], channels = 2, max = 2 + * => allows two STAs on the same or on different channels + * + * numbers = [ #{STA} <= 1, #{P2P-client,P2P-GO} <= 3 ], max = 4 + * => allows a STA plus three P2P interfaces + * + * The list of these four possibilities could completely be contained + * within the %NL80211_ATTR_INTERFACE_COMBINATIONS attribute to indicate + * that any of these groups must match.
+ * + * "Combinations" of just a single interface will not be listed here, + * a single interface of any valid interface type is assumed to always + * be possible by itself. This means that implicitly, for each valid + * interface type, the following group always exists: + * numbers = [ #{} <= 1 ], channels = 1, max = 1 + */ +enum nl80211_if_combination_attrs { + NL80211_IFACE_COMB_UNSPEC, + NL80211_IFACE_COMB_LIMITS, + NL80211_IFACE_COMB_MAXNUM, + NL80211_IFACE_COMB_STA_AP_BI_MATCH, + NL80211_IFACE_COMB_NUM_CHANNELS, + NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS, + NL80211_IFACE_COMB_RADAR_DETECT_REGIONS, + NL80211_IFACE_COMB_BI_MIN_GCD, + + /* keep last */ + NUM_NL80211_IFACE_COMB, + MAX_NL80211_IFACE_COMB = NUM_NL80211_IFACE_COMB - 1 +}; + + +/** + * enum nl80211_plink_state - state of a mesh peer link finite state machine + * + * @NL80211_PLINK_LISTEN: initial state, considered the implicit + * state of non existent mesh peer links + * @NL80211_PLINK_OPN_SNT: mesh plink open frame has been sent to + * this mesh peer + * @NL80211_PLINK_OPN_RCVD: mesh plink open frame has been received + * from this mesh peer + * @NL80211_PLINK_CNF_RCVD: mesh plink confirm frame has been + * received from this mesh peer + * @NL80211_PLINK_ESTAB: mesh peer link is established + * @NL80211_PLINK_HOLDING: mesh peer link is being closed or cancelled + * @NL80211_PLINK_BLOCKED: all frames transmitted from this mesh + * plink are discarded, except for authentication frames + * @NUM_NL80211_PLINK_STATES: number of peer link states + * @MAX_NL80211_PLINK_STATES: highest numerical value of plink states + */ +enum nl80211_plink_state { + NL80211_PLINK_LISTEN, + NL80211_PLINK_OPN_SNT, + NL80211_PLINK_OPN_RCVD, + NL80211_PLINK_CNF_RCVD, + NL80211_PLINK_ESTAB, + NL80211_PLINK_HOLDING, + NL80211_PLINK_BLOCKED, + + /* keep last */ + NUM_NL80211_PLINK_STATES, + MAX_NL80211_PLINK_STATES = NUM_NL80211_PLINK_STATES - 1 +}; + +/** + * enum plink_actions - actions to perform in mesh peers + * + * 
@NL80211_PLINK_ACTION_NO_ACTION: perform no action + * @NL80211_PLINK_ACTION_OPEN: start mesh peer link establishment + * @NL80211_PLINK_ACTION_BLOCK: block traffic from this mesh peer + * @NUM_NL80211_PLINK_ACTIONS: number of possible actions + */ +enum plink_actions { + NL80211_PLINK_ACTION_NO_ACTION, + NL80211_PLINK_ACTION_OPEN, + NL80211_PLINK_ACTION_BLOCK, + + NUM_NL80211_PLINK_ACTIONS, +}; + + +#define NL80211_KCK_LEN 16 +#define NL80211_KEK_LEN 16 +#define NL80211_KCK_EXT_LEN 24 +#define NL80211_KEK_EXT_LEN 32 +#define NL80211_REPLAY_CTR_LEN 8 + +/** + * enum nl80211_rekey_data - attributes for GTK rekey offload + * @__NL80211_REKEY_DATA_INVALID: invalid number for nested attributes + * @NL80211_REKEY_DATA_KEK: key encryption key (binary) + * @NL80211_REKEY_DATA_KCK: key confirmation key (binary) + * @NL80211_REKEY_DATA_REPLAY_CTR: replay counter (binary) + * @NL80211_REKEY_DATA_AKM: AKM data (OUI, suite type) + * @NUM_NL80211_REKEY_DATA: number of rekey attributes (internal) + * @MAX_NL80211_REKEY_DATA: highest rekey attribute (internal) + */ +enum nl80211_rekey_data { + __NL80211_REKEY_DATA_INVALID, + NL80211_REKEY_DATA_KEK, + NL80211_REKEY_DATA_KCK, + NL80211_REKEY_DATA_REPLAY_CTR, + NL80211_REKEY_DATA_AKM, + + /* keep last */ + NUM_NL80211_REKEY_DATA, + MAX_NL80211_REKEY_DATA = NUM_NL80211_REKEY_DATA - 1 +}; + +/** + * enum nl80211_hidden_ssid - values for %NL80211_ATTR_HIDDEN_SSID + * @NL80211_HIDDEN_SSID_NOT_IN_USE: do not hide SSID (i.e., broadcast it in + * Beacon frames) + * @NL80211_HIDDEN_SSID_ZERO_LEN: hide SSID by using zero-length SSID element + * in Beacon frames + * @NL80211_HIDDEN_SSID_ZERO_CONTENTS: hide SSID by using correct length of SSID + * element in Beacon frames but zero out each byte in the SSID + */ +enum nl80211_hidden_ssid { + NL80211_HIDDEN_SSID_NOT_IN_USE, + NL80211_HIDDEN_SSID_ZERO_LEN, + NL80211_HIDDEN_SSID_ZERO_CONTENTS +}; + +/** + * enum nl80211_sta_wme_attr - station WME attributes + * @__NL80211_STA_WME_INVALID: invalid 
number for nested attribute + * @NL80211_STA_WME_UAPSD_QUEUES: bitmap of uapsd queues. the format + * is the same as the AC bitmap in the QoS info field. + * @NL80211_STA_WME_MAX_SP: max service period. the format is the same + * as the MAX_SP field in the QoS info field (but already shifted down). + * @__NL80211_STA_WME_AFTER_LAST: internal + * @NL80211_STA_WME_MAX: highest station WME attribute + */ +enum nl80211_sta_wme_attr { + __NL80211_STA_WME_INVALID, + NL80211_STA_WME_UAPSD_QUEUES, + NL80211_STA_WME_MAX_SP, + + /* keep last */ + __NL80211_STA_WME_AFTER_LAST, + NL80211_STA_WME_MAX = __NL80211_STA_WME_AFTER_LAST - 1 +}; + +/** + * enum nl80211_pmksa_candidate_attr - attributes for PMKSA caching candidates + * @__NL80211_PMKSA_CANDIDATE_INVALID: invalid number for nested attributes + * @NL80211_PMKSA_CANDIDATE_INDEX: candidate index (u32; the smaller, the higher + * priority) + * @NL80211_PMKSA_CANDIDATE_BSSID: candidate BSSID (6 octets) + * @NL80211_PMKSA_CANDIDATE_PREAUTH: RSN pre-authentication supported (flag) + * @NUM_NL80211_PMKSA_CANDIDATE: number of PMKSA caching candidate attributes + * (internal) + * @MAX_NL80211_PMKSA_CANDIDATE: highest PMKSA caching candidate attribute + * (internal) + */ +enum nl80211_pmksa_candidate_attr { + __NL80211_PMKSA_CANDIDATE_INVALID, + NL80211_PMKSA_CANDIDATE_INDEX, + NL80211_PMKSA_CANDIDATE_BSSID, + NL80211_PMKSA_CANDIDATE_PREAUTH, + + /* keep last */ + NUM_NL80211_PMKSA_CANDIDATE, + MAX_NL80211_PMKSA_CANDIDATE = NUM_NL80211_PMKSA_CANDIDATE - 1 +}; + +/** + * enum nl80211_tdls_operation - values for %NL80211_ATTR_TDLS_OPERATION + * @NL80211_TDLS_DISCOVERY_REQ: Send a TDLS discovery request + * @NL80211_TDLS_SETUP: Setup TDLS link + * @NL80211_TDLS_TEARDOWN: Teardown a TDLS link which is already established + * @NL80211_TDLS_ENABLE_LINK: Enable TDLS link + * @NL80211_TDLS_DISABLE_LINK: Disable TDLS link + */ +enum nl80211_tdls_operation { + NL80211_TDLS_DISCOVERY_REQ, + NL80211_TDLS_SETUP, + NL80211_TDLS_TEARDOWN, + 
NL80211_TDLS_ENABLE_LINK, + NL80211_TDLS_DISABLE_LINK, +}; + +/** + * enum nl80211_ap_sme_features - device-integrated AP features + * @NL80211_AP_SME_SA_QUERY_OFFLOAD: SA Query procedures offloaded to driver + * when user space indicates support for SA Query procedures offload during + * "start ap" with %NL80211_AP_SETTINGS_SA_QUERY_OFFLOAD_SUPPORT. + */ +enum nl80211_ap_sme_features { + NL80211_AP_SME_SA_QUERY_OFFLOAD = 1 << 0, +}; + +/** + * enum nl80211_feature_flags - device/driver features + * @NL80211_FEATURE_SK_TX_STATUS: This driver supports reflecting back + * TX status to the socket error queue when requested with the + * socket option. + * @NL80211_FEATURE_HT_IBSS: This driver supports IBSS with HT datarates. + * @NL80211_FEATURE_INACTIVITY_TIMER: This driver takes care of freeing up + * the connected inactive stations in AP mode. + * @NL80211_FEATURE_CELL_BASE_REG_HINTS: This driver has been tested + * to work properly to support receiving regulatory hints from + * cellular base stations. + * @NL80211_FEATURE_P2P_DEVICE_NEEDS_CHANNEL: (no longer available, only + * here to reserve the value for API/ABI compatibility) + * @NL80211_FEATURE_SAE: This driver supports simultaneous authentication of + * equals (SAE) with user space SME (NL80211_CMD_AUTHENTICATE) in station + * mode + * @NL80211_FEATURE_LOW_PRIORITY_SCAN: This driver supports low priority scan + * @NL80211_FEATURE_SCAN_FLUSH: Scan flush is supported + * @NL80211_FEATURE_AP_SCAN: Support scanning using an AP vif + * @NL80211_FEATURE_VIF_TXPOWER: The driver supports per-vif TX power setting + * @NL80211_FEATURE_NEED_OBSS_SCAN: The driver expects userspace to perform + * OBSS scans and generate 20/40 BSS coex reports. This flag is used only + * for drivers implementing the CONNECT API, for AUTH/ASSOC it is implied. 
+ * @NL80211_FEATURE_P2P_GO_CTWIN: P2P GO implementation supports CT Window + * setting + * @NL80211_FEATURE_P2P_GO_OPPPS: P2P GO implementation supports opportunistic + * powersave + * @NL80211_FEATURE_FULL_AP_CLIENT_STATE: The driver supports full state + * transitions for AP clients. Without this flag (and if the driver + * doesn't have the AP SME in the device) the driver supports adding + * stations only when they're associated and adds them in associated + * state (to later be transitioned into authorized), with this flag + * they should be added before even sending the authentication reply + * and then transitioned into authenticated, associated and authorized + * states using station flags. + * Note that even for drivers that support this, the default is to add + * stations in authenticated/associated state, so to add unauthenticated + * stations the authenticated/associated bits have to be set in the mask. + * @NL80211_FEATURE_ADVERTISE_CHAN_LIMITS: cfg80211 advertises channel limits + * (HT40, VHT 80/160 MHz) if this flag is set + * @NL80211_FEATURE_USERSPACE_MPM: This driver supports a userspace Mesh + * Peering Management entity which may be implemented by registering for + * beacons or NL80211_CMD_NEW_PEER_CANDIDATE events. The mesh beacon is + * still generated by the driver. + * @NL80211_FEATURE_ACTIVE_MONITOR: This driver supports an active monitor + * interface. An active monitor interface behaves like a normal monitor + * interface, but gets added to the driver. It ensures that incoming + * unicast packets directed at the configured interface address get ACKed. + * @NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE: This driver supports dynamic + * channel bandwidth change (e.g., HT 20 <-> 40 MHz channel) during the + * lifetime of a BSS. + * @NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES: This device adds a DS Parameter + * Set IE to probe requests. + * @NL80211_FEATURE_WFA_TPC_IE_IN_PROBES: This device adds a WFA TPC Report IE + * to probe requests. 
+ * @NL80211_FEATURE_QUIET: This device, in client mode, supports Quiet Period + * requests sent to it by an AP. + * @NL80211_FEATURE_TX_POWER_INSERTION: This device is capable of inserting the + * current tx power value into the TPC Report IE in the spectrum + * management TPC Report action frame, and in the Radio Measurement Link + * Measurement Report action frame. + * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout + * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used + * to enable dynack. + * @NL80211_FEATURE_STATIC_SMPS: Device supports static spatial + * multiplexing powersave, ie. can turn off all but one chain + * even on HT connections that should be using more chains. + * @NL80211_FEATURE_DYNAMIC_SMPS: Device supports dynamic spatial + * multiplexing powersave, ie. can turn off all but one chain + * and then wake the rest up as required after, for example, + * rts/cts handshake. + * @NL80211_FEATURE_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM + * TSPEC sessions (TID aka TSID 0-7) with the %NL80211_CMD_ADD_TX_TS + * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it + * needs to be able to handle Block-Ack agreements and other things. + * @NL80211_FEATURE_MAC_ON_CREATE: Device supports configuring + * the vif's MAC address upon creation. + * See 'macaddr' field in the vif_params (cfg80211.h). + * @NL80211_FEATURE_TDLS_CHANNEL_SWITCH: Driver supports channel switching when + * operating as a TDLS peer. + * @NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR: This device/driver supports using a + * random MAC address during scan (if the device is unassociated); the + * %NL80211_SCAN_FLAG_RANDOM_ADDR flag may be set for scans and the MAC + * address mask/value will be used. 
+ * @NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR: This device/driver supports + * using a random MAC address for every scan iteration during scheduled + * scan (while not associated), the %NL80211_SCAN_FLAG_RANDOM_ADDR may + * be set for scheduled scan and the MAC address mask/value will be used. + * @NL80211_FEATURE_ND_RANDOM_MAC_ADDR: This device/driver supports using a + * random MAC address for every scan iteration during "net detect", i.e. + * scan in unassociated WoWLAN, the %NL80211_SCAN_FLAG_RANDOM_ADDR may + * be set for scheduled scan and the MAC address mask/value will be used. + */ +enum nl80211_feature_flags { + NL80211_FEATURE_SK_TX_STATUS = 1 << 0, + NL80211_FEATURE_HT_IBSS = 1 << 1, + NL80211_FEATURE_INACTIVITY_TIMER = 1 << 2, + NL80211_FEATURE_CELL_BASE_REG_HINTS = 1 << 3, + NL80211_FEATURE_P2P_DEVICE_NEEDS_CHANNEL = 1 << 4, + NL80211_FEATURE_SAE = 1 << 5, + NL80211_FEATURE_LOW_PRIORITY_SCAN = 1 << 6, + NL80211_FEATURE_SCAN_FLUSH = 1 << 7, + NL80211_FEATURE_AP_SCAN = 1 << 8, + NL80211_FEATURE_VIF_TXPOWER = 1 << 9, + NL80211_FEATURE_NEED_OBSS_SCAN = 1 << 10, + NL80211_FEATURE_P2P_GO_CTWIN = 1 << 11, + NL80211_FEATURE_P2P_GO_OPPPS = 1 << 12, + /* bit 13 is reserved */ + NL80211_FEATURE_ADVERTISE_CHAN_LIMITS = 1 << 14, + NL80211_FEATURE_FULL_AP_CLIENT_STATE = 1 << 15, + NL80211_FEATURE_USERSPACE_MPM = 1 << 16, + NL80211_FEATURE_ACTIVE_MONITOR = 1 << 17, + NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE = 1 << 18, + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES = 1 << 19, + NL80211_FEATURE_WFA_TPC_IE_IN_PROBES = 1 << 20, + NL80211_FEATURE_QUIET = 1 << 21, + NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, + NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, + NL80211_FEATURE_STATIC_SMPS = 1 << 24, + NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, + NL80211_FEATURE_SUPPORTS_WMM_ADMISSION = 1 << 26, + NL80211_FEATURE_MAC_ON_CREATE = 1 << 27, + NL80211_FEATURE_TDLS_CHANNEL_SWITCH = 1 << 28, + NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR = 1 << 29, + NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR 
= 1 << 30, + NL80211_FEATURE_ND_RANDOM_MAC_ADDR = 1U << 31, +}; + +/** + * enum nl80211_ext_feature_index - bit index of extended features. + * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. + * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can + * request to use RRM (see %NL80211_ATTR_USE_RRM) with + * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set + * the ASSOC_REQ_USE_RRM flag in the association request even if + * NL80211_FEATURE_QUIET is not advertized. + * @NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER: This device supports MU-MIMO air + * sniffer which means that it can be configured to hear packets from + * certain groups which can be configured by the + * %NL80211_ATTR_MU_MIMO_GROUP_DATA attribute, + * or can be configured to follow a station by configuring the + * %NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR attribute. + * @NL80211_EXT_FEATURE_SCAN_START_TIME: This driver includes the actual + * time the scan started in scan results event. The time is the TSF of + * the BSS that the interface that requested the scan is connected to + * (if available). + * @NL80211_EXT_FEATURE_BSS_PARENT_TSF: Per BSS, this driver reports the + * time the last beacon/probe was received. The time is the TSF of the + * BSS that the interface that requested the scan is connected to + * (if available). + * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of + * channel dwell time. + * @NL80211_EXT_FEATURE_BEACON_RATE_LEGACY: Driver supports beacon rate + * configuration (AP/mesh), supporting a legacy (non HT/VHT) rate. + * @NL80211_EXT_FEATURE_BEACON_RATE_HT: Driver supports beacon rate + * configuration (AP/mesh) with HT rates. + * @NL80211_EXT_FEATURE_BEACON_RATE_VHT: Driver supports beacon rate + * configuration (AP/mesh) with VHT rates. + * @NL80211_EXT_FEATURE_FILS_STA: This driver supports Fast Initial Link Setup + * with user space SME (NL80211_CMD_AUTHENTICATE) in station mode. 
+ * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA: This driver supports randomized TA + * in @NL80211_CMD_FRAME while not associated. + * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED: This driver supports + * randomized TA in @NL80211_CMD_FRAME while associated. + * @NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI: The driver supports sched_scan + * for reporting BSSs with better RSSI than the current connected BSS + * (%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI). + * @NL80211_EXT_FEATURE_CQM_RSSI_LIST: With this driver the + * %NL80211_ATTR_CQM_RSSI_THOLD attribute accepts a list of zero or more + * RSSI threshold values to monitor rather than exactly one threshold. + * @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD: Driver SME supports FILS shared key + * authentication with %NL80211_CMD_CONNECT. + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK: Device wants to do 4-way + * handshake with PSK in station mode (PSK is passed as part of the connect + * and associate commands), doing it in the host might not be supported. + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X: Device wants to do 4-way + * handshake with 802.1X in station mode (will pass EAP frames to the host + * and accept the set_pmk/del_pmk commands), doing it in the host might not + * be supported. + * @NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME: Driver is capable of overriding + * the max channel attribute in the FILS request params IE with the + * actual dwell time. + * @NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP: Driver accepts broadcast probe + * response + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE: Driver supports sending + * the first probe request in each channel at rate of at least 5.5Mbps. + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports + * probe request tx deferral and suppression + * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL + * value in %NL80211_ATTR_USE_MFP. + * @NL80211_EXT_FEATURE_LOW_SPAN_SCAN: Driver supports low span scan.
+ * @NL80211_EXT_FEATURE_LOW_POWER_SCAN: Driver supports low power scan. + * @NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN: Driver supports high accuracy scan. + * @NL80211_EXT_FEATURE_DFS_OFFLOAD: HW/driver will offload DFS actions. + * Device or driver will do all DFS-related actions by itself, + * informing user-space about CAC progress, radar detection event, + * channel change triggered by radar detection event. + * No need to start CAC from user-space, no need to react to + * "radar detected" event. + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and + * receiving control port frames over nl80211 instead of the netdevice. + * @NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT: This driver/device supports + * (average) ACK signal strength reporting. + * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate + * TXQs. + * @NL80211_EXT_FEATURE_SCAN_RANDOM_SN: Driver/device supports randomizing the + * SN in probe request frames if requested by %NL80211_SCAN_FLAG_RANDOM_SN. + * @NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT: Driver/device can omit all data + * except for supported rates from the probe request content if requested + * by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag. + * @NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER: Driver supports enabling fine + * timing measurement responder role. + * + * @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0: Driver/device confirm that they are + * able to rekey an in-use key correctly. Userspace must not rekey PTK keys + * if this flag is not set. Ignoring this can leak clear text packets and/or + * freeze the connection. + * @NL80211_EXT_FEATURE_EXT_KEY_ID: Driver supports "Extended Key ID for + * Individually Addressed Frames" from IEEE802.11-2016. + * + * @NL80211_EXT_FEATURE_AIRTIME_FAIRNESS: Driver supports getting airtime + * fairness for transmitted packets and has enabled airtime fairness + * scheduling. 
+ * + * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching + * (set/del PMKSA operations) in AP mode. + * + * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports + * filtering of sched scan results using band specific RSSI thresholds. + * + * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power + * to a station. + * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in + * station mode (SAE password is passed as part of the connect command). + * + * @NL80211_EXT_FEATURE_VLAN_OFFLOAD: The driver supports a single netdev + * with VLAN tagged frames and separate VLAN-specific netdevs added using + * vconfig similarly to the Ethernet case. + * + * @NL80211_EXT_FEATURE_AQL: The driver supports the Airtime Queue Limit (AQL) + * feature, which prevents bufferbloat by using the expected transmission + * time to limit the amount of data buffered in the hardware. + * + * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection + * and can receive key configuration for BIGTK using key indexes 6 and 7. + * @NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT: The driver supports Beacon + * protection as a client only and cannot transmit protected beacons. + * + * @NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH: The driver can disable the + * forwarding of preauth frames over the control port. They are then + * handled as ordinary data frames. + * + * @NL80211_EXT_FEATURE_PROTECTED_TWT: Driver supports protected TWT frames + * + * @NL80211_EXT_FEATURE_DEL_IBSS_STA: The driver supports removing stations + * in IBSS mode, essentially by dropping their state. + * + * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations + * are possible for multicast frames and those will be reported properly. + * + * @NL80211_EXT_FEATURE_SCAN_FREQ_KHZ: This driver supports receiving and + * reporting scan request with %NL80211_ATTR_SCAN_FREQ_KHZ. 
In order to + * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be + * included in the scan request. + * + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver + * can report tx status for control port over nl80211 tx operations. + * + * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating + * Channel Validation (OCV) when using driver's SME for RSNA handshakes. + * + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK: Device wants to do 4-way + * handshake with PSK in AP mode (PSK is passed as part of the start AP + * command). + * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication + * in AP mode (SAE password is passed as part of the start AP command). + * + * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery + * frames transmission + * + * @NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP: Driver/device supports + * unsolicited broadcast probe response transmission + * + * @NL80211_EXT_FEATURE_BEACON_RATE_HE: Driver supports beacon rate + * configuration (AP/mesh) with HE rates. + * + * @NL80211_EXT_FEATURE_SECURE_LTF: Device supports secure LTF measurement + * exchange protocol. + * + * @NL80211_EXT_FEATURE_SECURE_RTT: Device supports secure RTT measurement + * exchange protocol. + * + * @NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE: Device supports management + * frame protection for all management frames exchanged during the + * negotiation and range measurement procedure. + * + * @NL80211_EXT_FEATURE_BSS_COLOR: The driver supports BSS color collision + * detection and change announcements. + * + * @NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD: Driver running in AP mode supports + * FILS encryption and decryption for (Re)Association Request and Response + * frames. Userspace has to share FILS AAD details to the driver by using + * @NL80211_CMD_SET_FILS_AAD. + * + * @NL80211_EXT_FEATURE_RADAR_BACKGROUND: Device supports background radar/CAC + * detection.
+ * + * @NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE: Device can perform a MAC address + * change without having to bring the underlying network device down + * first. For example, in station mode this can be used to vary the + * origin MAC address prior to a connection to a new AP for privacy + * or other reasons. Note that certain driver specific restrictions + * might apply, e.g. no scans in progress, no offchannel operations + * in progress, and no active connections. + * + * @NUM_NL80211_EXT_FEATURES: number of extended features. + * @MAX_NL80211_EXT_FEATURES: highest extended feature index. + */ +enum nl80211_ext_feature_index { + NL80211_EXT_FEATURE_VHT_IBSS, + NL80211_EXT_FEATURE_RRM, + NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER, + NL80211_EXT_FEATURE_SCAN_START_TIME, + NL80211_EXT_FEATURE_BSS_PARENT_TSF, + NL80211_EXT_FEATURE_SET_SCAN_DWELL, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, + NL80211_EXT_FEATURE_BEACON_RATE_HT, + NL80211_EXT_FEATURE_BEACON_RATE_VHT, + NL80211_EXT_FEATURE_FILS_STA, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, + NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, + NL80211_EXT_FEATURE_CQM_RSSI_LIST, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_MFP_OPTIONAL, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN, + NL80211_EXT_FEATURE_LOW_POWER_SCAN, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, + NL80211_EXT_FEATURE_DFS_OFFLOAD, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, + NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT, + /* we renamed this - stay compatible */ + NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT = NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT, + NL80211_EXT_FEATURE_TXQS, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN, + 
NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT, + NL80211_EXT_FEATURE_CAN_REPLACE_PTK0, + NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING, + NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, + NL80211_EXT_FEATURE_EXT_KEY_ID, + NL80211_EXT_FEATURE_STA_TX_PWR, + NL80211_EXT_FEATURE_SAE_OFFLOAD, + NL80211_EXT_FEATURE_VLAN_OFFLOAD, + NL80211_EXT_FEATURE_AQL, + NL80211_EXT_FEATURE_BEACON_PROTECTION, + NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH, + NL80211_EXT_FEATURE_PROTECTED_TWT, + NL80211_EXT_FEATURE_DEL_IBSS_STA, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, + NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, + NL80211_EXT_FEATURE_FILS_DISCOVERY, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_BEACON_RATE_HE, + NL80211_EXT_FEATURE_SECURE_LTF, + NL80211_EXT_FEATURE_SECURE_RTT, + NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, + NL80211_EXT_FEATURE_BSS_COLOR, + NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD, + NL80211_EXT_FEATURE_RADAR_BACKGROUND, + NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE, + + /* add new features before the definition below */ + NUM_NL80211_EXT_FEATURES, + MAX_NL80211_EXT_FEATURES = NUM_NL80211_EXT_FEATURES - 1 +}; + +/** + * enum nl80211_probe_resp_offload_support_attr - optional supported + * protocols for probe-response offloading by the driver/FW. + * To be used with the %NL80211_ATTR_PROBE_RESP_OFFLOAD attribute. + * Each enum value represents a bit in the bitmap of supported + * protocols. Typically a subset of probe-requests belonging to a + * supported protocol will be excluded from offload and uploaded + * to the host. + * + * @NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS: Support for WPS ver. 
1 + * @NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS2: Support for WPS ver. 2 + * @NL80211_PROBE_RESP_OFFLOAD_SUPPORT_P2P: Support for P2P + * @NL80211_PROBE_RESP_OFFLOAD_SUPPORT_80211U: Support for 802.11u + */ +enum nl80211_probe_resp_offload_support_attr { + NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS = 1<<0, + NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS2 = 1<<1, + NL80211_PROBE_RESP_OFFLOAD_SUPPORT_P2P = 1<<2, + NL80211_PROBE_RESP_OFFLOAD_SUPPORT_80211U = 1<<3, +}; + +/** + * enum nl80211_connect_failed_reason - connection request failed reasons + * @NL80211_CONN_FAIL_MAX_CLIENTS: Maximum number of clients that can be + * handled by the AP is reached. + * @NL80211_CONN_FAIL_BLOCKED_CLIENT: Connection request is rejected due to ACL. + */ +enum nl80211_connect_failed_reason { + NL80211_CONN_FAIL_MAX_CLIENTS, + NL80211_CONN_FAIL_BLOCKED_CLIENT, +}; + +/** + * enum nl80211_timeout_reason - timeout reasons + * + * @NL80211_TIMEOUT_UNSPECIFIED: Timeout reason unspecified. + * @NL80211_TIMEOUT_SCAN: Scan (AP discovery) timed out. + * @NL80211_TIMEOUT_AUTH: Authentication timed out. + * @NL80211_TIMEOUT_ASSOC: Association timed out. + */ +enum nl80211_timeout_reason { + NL80211_TIMEOUT_UNSPECIFIED, + NL80211_TIMEOUT_SCAN, + NL80211_TIMEOUT_AUTH, + NL80211_TIMEOUT_ASSOC, +}; + +/** + * enum nl80211_scan_flags - scan request control flags + * + * Scan request control flags are used to control the handling + * of NL80211_CMD_TRIGGER_SCAN and NL80211_CMD_START_SCHED_SCAN + * requests. + * + * NL80211_SCAN_FLAG_LOW_SPAN, NL80211_SCAN_FLAG_LOW_POWER, and + * NL80211_SCAN_FLAG_HIGH_ACCURACY flags are exclusive of each other, i.e., only + * one of them can be used in the request. + * + * @NL80211_SCAN_FLAG_LOW_PRIORITY: scan request has low priority + * @NL80211_SCAN_FLAG_FLUSH: flush cache before scanning + * @NL80211_SCAN_FLAG_AP: force a scan even if the interface is configured + * as AP and the beaconing has already been configured. 
This attribute is + * dangerous because it will destroy stations' performance as a lot of frames + * will be lost while scanning off-channel, therefore it must be used only + * when really needed + * @NL80211_SCAN_FLAG_RANDOM_ADDR: use a random MAC address for this scan (or + * for scheduled scan: a different one for every scan iteration). When the + * flag is set, depending on device capabilities the @NL80211_ATTR_MAC and + * @NL80211_ATTR_MAC_MASK attributes may also be given in which case only + * the masked bits will be preserved from the MAC address and the remainder + * randomised. If the attributes are not given full randomisation (46 bits, + * locally administered 1, multicast 0) is assumed. + * This flag must not be requested when the feature isn't supported, check + * the nl80211 feature flags for the device. + * @NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME: fill the dwell time in the FILS + * request parameters IE in the probe request + * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at + * rate of at least 5.5M. In case non OCE AP is discovered in the channel, + * only the first probe req in the channel will be sent in high rate. + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request + * tx deferral (dot11FILSProbeDelay shall be set to 15ms) + * and suppression (if it has received a broadcast Probe Response frame, + * Beacon frame or FILS Discovery frame from an AP that the STA considers + * a suitable candidate for (re-)association - suitable in terms of + * SSID and/or RSSI). + * @NL80211_SCAN_FLAG_LOW_SPAN: Span corresponds to the total time taken to + * accomplish the scan. Thus, this flag intends the driver to perform the + * scan request with lesser span/duration. It is specific to the driver + * implementations on how this is accomplished. Scan accuracy may get + * impacted with this flag.
+ * @NL80211_SCAN_FLAG_LOW_POWER: This flag intends the scan attempts to consume + * optimal possible power. Drivers can resort to their specific means to + * optimize the power. Scan accuracy may get impacted with this flag. + * @NL80211_SCAN_FLAG_HIGH_ACCURACY: Accuracy here intends to the extent of scan + * results obtained. Thus HIGH_ACCURACY scan flag aims to get maximum + * possible scan results. This flag hints the driver to use the best + * possible scan configuration to improve the accuracy in scanning. + * Latency and power use may get impacted with this flag. + * @NL80211_SCAN_FLAG_RANDOM_SN: randomize the sequence number in probe + * request frames from this scan to avoid correlation/tracking being + * possible. + * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to + * only have supported rates and no additional capabilities (unless + * added by userspace explicitly.) + * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with + * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means + * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. 
+ * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by + * 2.4/5 GHz APs + */ +enum nl80211_scan_flags { + NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, + NL80211_SCAN_FLAG_FLUSH = 1<<1, + NL80211_SCAN_FLAG_AP = 1<<2, + NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME = 1<<4, + NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP = 1<<5, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE = 1<<6, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION = 1<<7, + NL80211_SCAN_FLAG_LOW_SPAN = 1<<8, + NL80211_SCAN_FLAG_LOW_POWER = 1<<9, + NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, + NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, + NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, + NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, + NL80211_SCAN_FLAG_COLOCATED_6GHZ = 1<<14, +}; + +/** + * enum nl80211_acl_policy - access control policy + * + * Access control policy is applied on a MAC list set by + * %NL80211_CMD_START_AP and %NL80211_CMD_SET_MAC_ACL, to + * be used with %NL80211_ATTR_ACL_POLICY. + * + * @NL80211_ACL_POLICY_ACCEPT_UNLESS_LISTED: Deny stations which are + * listed in ACL, i.e. allow all the stations which are not listed + * in ACL to authenticate. + * @NL80211_ACL_POLICY_DENY_UNLESS_LISTED: Allow the stations which are listed + * in ACL, i.e. deny all the stations which are not listed in ACL. + */ +enum nl80211_acl_policy { + NL80211_ACL_POLICY_ACCEPT_UNLESS_LISTED, + NL80211_ACL_POLICY_DENY_UNLESS_LISTED, +}; + +/** + * enum nl80211_smps_mode - SMPS mode + * + * Requested SMPS mode (for AP mode) + * + * @NL80211_SMPS_OFF: SMPS off (use all antennas). + * @NL80211_SMPS_STATIC: static SMPS (use a single antenna) + * @NL80211_SMPS_DYNAMIC: dynamic smps (start with a single antenna and + * turn on other antennas after CTS/RTS). 
+ */ +enum nl80211_smps_mode { + NL80211_SMPS_OFF, + NL80211_SMPS_STATIC, + NL80211_SMPS_DYNAMIC, + + __NL80211_SMPS_AFTER_LAST, + NL80211_SMPS_MAX = __NL80211_SMPS_AFTER_LAST - 1 +}; + +/** + * enum nl80211_radar_event - type of radar event for DFS operation + * + * Type of event to be used with NL80211_ATTR_RADAR_EVENT to inform userspace + * about detected radars or success of the channel available check (CAC) + * + * @NL80211_RADAR_DETECTED: A radar pattern has been detected. The channel is + * now unusable. + * @NL80211_RADAR_CAC_FINISHED: Channel Availability Check has been finished, + * the channel is now available. + * @NL80211_RADAR_CAC_ABORTED: Channel Availability Check has been aborted, no + * change to the channel status. + * @NL80211_RADAR_NOP_FINISHED: The Non-Occupancy Period for this channel is + * over, channel becomes usable. + * @NL80211_RADAR_PRE_CAC_EXPIRED: Channel Availability Check done on this + * non-operating channel is expired and no longer valid. New CAC must + * be done on this channel before starting the operation. This is not + * applicable for ETSI dfs domain where pre-CAC is valid for ever. + * @NL80211_RADAR_CAC_STARTED: Channel Availability Check has been started, + * should be generated by HW if NL80211_EXT_FEATURE_DFS_OFFLOAD is enabled. + */ +enum nl80211_radar_event { + NL80211_RADAR_DETECTED, + NL80211_RADAR_CAC_FINISHED, + NL80211_RADAR_CAC_ABORTED, + NL80211_RADAR_NOP_FINISHED, + NL80211_RADAR_PRE_CAC_EXPIRED, + NL80211_RADAR_CAC_STARTED, +}; + +/** + * enum nl80211_dfs_state - DFS states for channels + * + * Channel states used by the DFS code. + * + * @NL80211_DFS_USABLE: The channel can be used, but channel availability + * check (CAC) must be performed before using it for AP or IBSS. + * @NL80211_DFS_UNAVAILABLE: A radar has been detected on this channel, it + * is therefore marked as not available. + * @NL80211_DFS_AVAILABLE: The channel has been CAC checked and is available. 
+ */ +enum nl80211_dfs_state { + NL80211_DFS_USABLE, + NL80211_DFS_UNAVAILABLE, + NL80211_DFS_AVAILABLE, +}; + +/** + * enum nl80211_protocol_features - nl80211 protocol features + * @NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP: nl80211 supports splitting + * wiphy dumps (if requested by the application with the attribute + * %NL80211_ATTR_SPLIT_WIPHY_DUMP. Also supported is filtering the + * wiphy dump by %NL80211_ATTR_WIPHY, %NL80211_ATTR_IFINDEX or + * %NL80211_ATTR_WDEV. + */ +enum nl80211_protocol_features { + NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP = 1 << 0, +}; + +/** + * enum nl80211_crit_proto_id - nl80211 critical protocol identifiers + * + * @NL80211_CRIT_PROTO_UNSPEC: protocol unspecified. + * @NL80211_CRIT_PROTO_DHCP: BOOTP or DHCPv6 protocol. + * @NL80211_CRIT_PROTO_EAPOL: EAPOL protocol. + * @NL80211_CRIT_PROTO_APIPA: APIPA protocol. + * @NUM_NL80211_CRIT_PROTO: must be kept last. + */ +enum nl80211_crit_proto_id { + NL80211_CRIT_PROTO_UNSPEC, + NL80211_CRIT_PROTO_DHCP, + NL80211_CRIT_PROTO_EAPOL, + NL80211_CRIT_PROTO_APIPA, + /* add other protocols before this one */ + NUM_NL80211_CRIT_PROTO +}; + +/* maximum duration for critical protocol measures */ +#define NL80211_CRIT_PROTO_MAX_DURATION 5000 /* msec */ + +/** + * enum nl80211_rxmgmt_flags - flags for received management frame. + * + * Used by cfg80211_rx_mgmt() + * + * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver. + * @NL80211_RXMGMT_FLAG_EXTERNAL_AUTH: Host driver intends to offload + * the authentication. Exclusively defined for host drivers that + * advertises the SME functionality but would like the userspace + * to handle certain authentication algorithms (e.g. SAE). 
+ */ +enum nl80211_rxmgmt_flags { + NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0, + NL80211_RXMGMT_FLAG_EXTERNAL_AUTH = 1 << 1, +}; + +/* + * If this flag is unset, the lower 24 bits are an OUI, if set + * a Linux nl80211 vendor ID is used (no such IDs are allocated + * yet, so that's not valid so far) + */ +#define NL80211_VENDOR_ID_IS_LINUX 0x80000000 + +/** + * struct nl80211_vendor_cmd_info - vendor command data + * @vendor_id: If the %NL80211_VENDOR_ID_IS_LINUX flag is clear, then the + * value is a 24-bit OUI; if it is set then a separately allocated ID + * may be used, but no such IDs are allocated yet. New IDs should be + * added to this file when needed. + * @subcmd: sub-command ID for the command + */ +struct nl80211_vendor_cmd_info { + __u32 vendor_id; + __u32 subcmd; +}; + +/** + * enum nl80211_tdls_peer_capability - TDLS peer flags. + * + * Used by tdls_mgmt() to determine which conditional elements need + * to be added to TDLS Setup frames. + * + * @NL80211_TDLS_PEER_HT: TDLS peer is HT capable. + * @NL80211_TDLS_PEER_VHT: TDLS peer is VHT capable. + * @NL80211_TDLS_PEER_WMM: TDLS peer is WMM capable. + * @NL80211_TDLS_PEER_HE: TDLS peer is HE capable. + */ +enum nl80211_tdls_peer_capability { + NL80211_TDLS_PEER_HT = 1<<0, + NL80211_TDLS_PEER_VHT = 1<<1, + NL80211_TDLS_PEER_WMM = 1<<2, + NL80211_TDLS_PEER_HE = 1<<3, +}; + +/** + * enum nl80211_sched_scan_plan - scanning plan for scheduled scan + * @__NL80211_SCHED_SCAN_PLAN_INVALID: attribute number 0 is reserved + * @NL80211_SCHED_SCAN_PLAN_INTERVAL: interval between scan iterations. In + * seconds (u32). + * @NL80211_SCHED_SCAN_PLAN_ITERATIONS: number of scan iterations in this + * scan plan (u32). The last scan plan must not specify this attribute + * because it will run infinitely. A value of zero is invalid as it will + * make the scan plan meaningless. 
+ * @NL80211_SCHED_SCAN_PLAN_MAX: highest scheduled scan plan attribute number + * currently defined + * @__NL80211_SCHED_SCAN_PLAN_AFTER_LAST: internal use + */ +enum nl80211_sched_scan_plan { + __NL80211_SCHED_SCAN_PLAN_INVALID, + NL80211_SCHED_SCAN_PLAN_INTERVAL, + NL80211_SCHED_SCAN_PLAN_ITERATIONS, + + /* keep last */ + __NL80211_SCHED_SCAN_PLAN_AFTER_LAST, + NL80211_SCHED_SCAN_PLAN_MAX = + __NL80211_SCHED_SCAN_PLAN_AFTER_LAST - 1 +}; + +/** + * struct nl80211_bss_select_rssi_adjust - RSSI adjustment parameters. + * + * @band: band of BSS that must match for RSSI value adjustment. The value + * of this field is according to &enum nl80211_band. + * @delta: value used to adjust the RSSI value of matching BSS in dB. + */ +struct nl80211_bss_select_rssi_adjust { + __u8 band; + __s8 delta; +} __attribute__((packed)); + +/** + * enum nl80211_bss_select_attr - attributes for bss selection. + * + * @__NL80211_BSS_SELECT_ATTR_INVALID: reserved. + * @NL80211_BSS_SELECT_ATTR_RSSI: Flag indicating only RSSI-based BSS selection + * is requested. + * @NL80211_BSS_SELECT_ATTR_BAND_PREF: attribute indicating BSS + * selection should be done such that the specified band is preferred. + * When there are multiple BSS-es in the preferred band, the driver + * shall use RSSI-based BSS selection as a second step. The value of + * this attribute is according to &enum nl80211_band (u32). + * @NL80211_BSS_SELECT_ATTR_RSSI_ADJUST: When present the RSSI level for + * BSS-es in the specified band is to be adjusted before doing + * RSSI-based BSS selection. The attribute value is a packed structure + * value as specified by &struct nl80211_bss_select_rssi_adjust. + * @NL80211_BSS_SELECT_ATTR_MAX: highest bss select attribute number. + * @__NL80211_BSS_SELECT_ATTR_AFTER_LAST: internal use. + * + * One and only one of these attributes are found within %NL80211_ATTR_BSS_SELECT + * for %NL80211_CMD_CONNECT. It specifies the required BSS selection behaviour + * which the driver shall use. 
+ */ +enum nl80211_bss_select_attr { + __NL80211_BSS_SELECT_ATTR_INVALID, + NL80211_BSS_SELECT_ATTR_RSSI, + NL80211_BSS_SELECT_ATTR_BAND_PREF, + NL80211_BSS_SELECT_ATTR_RSSI_ADJUST, + + /* keep last */ + __NL80211_BSS_SELECT_ATTR_AFTER_LAST, + NL80211_BSS_SELECT_ATTR_MAX = __NL80211_BSS_SELECT_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_nan_function_type - NAN function type + * + * Defines the function type of a NAN function + * + * @NL80211_NAN_FUNC_PUBLISH: function is publish + * @NL80211_NAN_FUNC_SUBSCRIBE: function is subscribe + * @NL80211_NAN_FUNC_FOLLOW_UP: function is follow-up + */ +enum nl80211_nan_function_type { + NL80211_NAN_FUNC_PUBLISH, + NL80211_NAN_FUNC_SUBSCRIBE, + NL80211_NAN_FUNC_FOLLOW_UP, + + /* keep last */ + __NL80211_NAN_FUNC_TYPE_AFTER_LAST, + NL80211_NAN_FUNC_MAX_TYPE = __NL80211_NAN_FUNC_TYPE_AFTER_LAST - 1, +}; + +/** + * enum nl80211_nan_publish_type - NAN publish tx type + * + * Defines how to send publish Service Discovery Frames + * + * @NL80211_NAN_SOLICITED_PUBLISH: publish function is solicited + * @NL80211_NAN_UNSOLICITED_PUBLISH: publish function is unsolicited + */ +enum nl80211_nan_publish_type { + NL80211_NAN_SOLICITED_PUBLISH = 1 << 0, + NL80211_NAN_UNSOLICITED_PUBLISH = 1 << 1, +}; + +/** + * enum nl80211_nan_func_term_reason - NAN functions termination reason + * + * Defines termination reasons of a NAN function + * + * @NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST: requested by user + * @NL80211_NAN_FUNC_TERM_REASON_TTL_EXPIRED: timeout + * @NL80211_NAN_FUNC_TERM_REASON_ERROR: errored + */ +enum nl80211_nan_func_term_reason { + NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST, + NL80211_NAN_FUNC_TERM_REASON_TTL_EXPIRED, + NL80211_NAN_FUNC_TERM_REASON_ERROR, +}; + +#define NL80211_NAN_FUNC_SERVICE_ID_LEN 6 +#define NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN 0xff +#define NL80211_NAN_FUNC_SRF_MAX_LEN 0xff + +/** + * enum nl80211_nan_func_attributes - NAN function attributes + * @__NL80211_NAN_FUNC_INVALID: invalid + * 
@NL80211_NAN_FUNC_TYPE: &enum nl80211_nan_function_type (u8). + * @NL80211_NAN_FUNC_SERVICE_ID: 6 bytes of the service ID hash as + * specified in NAN spec. This is a binary attribute. + * @NL80211_NAN_FUNC_PUBLISH_TYPE: relevant if the function's type is + * publish. Defines the transmission type for the publish Service Discovery + * Frame, see &enum nl80211_nan_publish_type. Its type is u8. + * @NL80211_NAN_FUNC_PUBLISH_BCAST: relevant if the function is a solicited + * publish. Should the solicited publish Service Discovery Frame be sent to + * the NAN Broadcast address. This is a flag. + * @NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE: relevant if the function's type is + * subscribe. Is the subscribe active. This is a flag. + * @NL80211_NAN_FUNC_FOLLOW_UP_ID: relevant if the function's type is follow up. + * The instance ID for the follow up Service Discovery Frame. This is u8. + * @NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID: relevant if the function's type + * is follow up. This is a u8. + * The requestor instance ID for the follow up Service Discovery Frame. + * @NL80211_NAN_FUNC_FOLLOW_UP_DEST: the MAC address of the recipient of the + * follow up Service Discovery Frame. This is a binary attribute. + * @NL80211_NAN_FUNC_CLOSE_RANGE: is this function limited for devices in a + * close range. The range itself (RSSI) is defined by the device. + * This is a flag. + * @NL80211_NAN_FUNC_TTL: strictly positive number of DWs this function should + * stay active. If not present infinite TTL is assumed. This is a u32. + * @NL80211_NAN_FUNC_SERVICE_INFO: array of bytes describing the service + * specific info. This is a binary attribute. + * @NL80211_NAN_FUNC_SRF: Service Receive Filter. This is a nested attribute. + * See &enum nl80211_nan_srf_attributes. + * @NL80211_NAN_FUNC_RX_MATCH_FILTER: Receive Matching filter. This is a nested + * attribute. It is a list of binary values. + * @NL80211_NAN_FUNC_TX_MATCH_FILTER: Transmit Matching filter. This is a + * nested attribute. 
It is a list of binary values. + * @NL80211_NAN_FUNC_INSTANCE_ID: The instance ID of the function. + * Its type is u8 and it cannot be 0. + * @NL80211_NAN_FUNC_TERM_REASON: NAN function termination reason. + * See &enum nl80211_nan_func_term_reason. + * + * @NUM_NL80211_NAN_FUNC_ATTR: internal + * @NL80211_NAN_FUNC_ATTR_MAX: highest NAN function attribute + */ +enum nl80211_nan_func_attributes { + __NL80211_NAN_FUNC_INVALID, + NL80211_NAN_FUNC_TYPE, + NL80211_NAN_FUNC_SERVICE_ID, + NL80211_NAN_FUNC_PUBLISH_TYPE, + NL80211_NAN_FUNC_PUBLISH_BCAST, + NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE, + NL80211_NAN_FUNC_FOLLOW_UP_ID, + NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID, + NL80211_NAN_FUNC_FOLLOW_UP_DEST, + NL80211_NAN_FUNC_CLOSE_RANGE, + NL80211_NAN_FUNC_TTL, + NL80211_NAN_FUNC_SERVICE_INFO, + NL80211_NAN_FUNC_SRF, + NL80211_NAN_FUNC_RX_MATCH_FILTER, + NL80211_NAN_FUNC_TX_MATCH_FILTER, + NL80211_NAN_FUNC_INSTANCE_ID, + NL80211_NAN_FUNC_TERM_REASON, + + /* keep last */ + NUM_NL80211_NAN_FUNC_ATTR, + NL80211_NAN_FUNC_ATTR_MAX = NUM_NL80211_NAN_FUNC_ATTR - 1 +}; + +/** + * enum nl80211_nan_srf_attributes - NAN Service Response filter attributes + * @__NL80211_NAN_SRF_INVALID: invalid + * @NL80211_NAN_SRF_INCLUDE: present if the include bit of the SRF set. + * This is a flag. + * @NL80211_NAN_SRF_BF: Bloom Filter. Present if and only if + * %NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary. + * @NL80211_NAN_SRF_BF_IDX: index of the Bloom Filter. Mandatory if + * %NL80211_NAN_SRF_BF is present. This is a u8. + * @NL80211_NAN_SRF_MAC_ADDRS: list of MAC addresses for the SRF. Present if + * and only if %NL80211_NAN_SRF_BF isn't present. This is a nested + * attribute. Each nested attribute is a MAC address. 
+ * @NUM_NL80211_NAN_SRF_ATTR: internal + * @NL80211_NAN_SRF_ATTR_MAX: highest NAN SRF attribute + */ +enum nl80211_nan_srf_attributes { + __NL80211_NAN_SRF_INVALID, + NL80211_NAN_SRF_INCLUDE, + NL80211_NAN_SRF_BF, + NL80211_NAN_SRF_BF_IDX, + NL80211_NAN_SRF_MAC_ADDRS, + + /* keep last */ + NUM_NL80211_NAN_SRF_ATTR, + NL80211_NAN_SRF_ATTR_MAX = NUM_NL80211_NAN_SRF_ATTR - 1, +}; + +/** + * enum nl80211_nan_match_attributes - NAN match attributes + * @__NL80211_NAN_MATCH_INVALID: invalid + * @NL80211_NAN_MATCH_FUNC_LOCAL: the local function that had the + * match. This is a nested attribute. + * See &enum nl80211_nan_func_attributes. + * @NL80211_NAN_MATCH_FUNC_PEER: the peer function + * that caused the match. This is a nested attribute. + * See &enum nl80211_nan_func_attributes. + * + * @NUM_NL80211_NAN_MATCH_ATTR: internal + * @NL80211_NAN_MATCH_ATTR_MAX: highest NAN match attribute + */ +enum nl80211_nan_match_attributes { + __NL80211_NAN_MATCH_INVALID, + NL80211_NAN_MATCH_FUNC_LOCAL, + NL80211_NAN_MATCH_FUNC_PEER, + + /* keep last */ + NUM_NL80211_NAN_MATCH_ATTR, + NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 +}; + +/** + * enum nl80211_external_auth_action - Action to perform with external + * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. + * @NL80211_EXTERNAL_AUTH_START: Start the authentication. + * @NL80211_EXTERNAL_AUTH_ABORT: Abort the ongoing authentication. + */ +enum nl80211_external_auth_action { + NL80211_EXTERNAL_AUTH_START, + NL80211_EXTERNAL_AUTH_ABORT, +}; + +/** + * enum nl80211_ftm_responder_attributes - fine timing measurement + * responder attributes + * @__NL80211_FTM_RESP_ATTR_INVALID: Invalid + * @NL80211_FTM_RESP_ATTR_ENABLED: FTM responder is enabled + * @NL80211_FTM_RESP_ATTR_LCI: The content of Measurement Report Element + * (9.4.2.22 in 802.11-2016) with type 8 - LCI (9.4.2.22.10), + * i.e. 
starting with the measurement token + * @NL80211_FTM_RESP_ATTR_CIVICLOC: The content of Measurement Report Element + * (9.4.2.22 in 802.11-2016) with type 11 - Civic (Section 9.4.2.22.13), + * i.e. starting with the measurement token + * @__NL80211_FTM_RESP_ATTR_LAST: Internal + * @NL80211_FTM_RESP_ATTR_MAX: highest FTM responder attribute. + */ +enum nl80211_ftm_responder_attributes { + __NL80211_FTM_RESP_ATTR_INVALID, + + NL80211_FTM_RESP_ATTR_ENABLED, + NL80211_FTM_RESP_ATTR_LCI, + NL80211_FTM_RESP_ATTR_CIVICLOC, + + /* keep last */ + __NL80211_FTM_RESP_ATTR_LAST, + NL80211_FTM_RESP_ATTR_MAX = __NL80211_FTM_RESP_ATTR_LAST - 1, +}; + +/* + * enum nl80211_ftm_responder_stats - FTM responder statistics + * + * These attribute types are used with %NL80211_ATTR_FTM_RESPONDER_STATS + * when getting FTM responder statistics. + * + * @__NL80211_FTM_STATS_INVALID: attribute number 0 is reserved + * @NL80211_FTM_STATS_SUCCESS_NUM: number of FTM sessions in which all frames + * were successfully answered (u32) + * @NL80211_FTM_STATS_PARTIAL_NUM: number of FTM sessions in which part of the + * frames were successfully answered (u32) + * @NL80211_FTM_STATS_FAILED_NUM: number of failed FTM sessions (u32) + * @NL80211_FTM_STATS_ASAP_NUM: number of ASAP sessions (u32) + * @NL80211_FTM_STATS_NON_ASAP_NUM: number of non-ASAP sessions (u32) + * @NL80211_FTM_STATS_TOTAL_DURATION_MSEC: total sessions durations - gives an + * indication of how much time the responder was busy (u64, msec) + * @NL80211_FTM_STATS_UNKNOWN_TRIGGERS_NUM: number of unknown FTM triggers - + * triggers from initiators that didn't finish successfully the negotiation + * phase with the responder (u32) + * @NL80211_FTM_STATS_RESCHEDULE_REQUESTS_NUM: number of FTM reschedule requests + * - initiator asks for a new scheduling although it already has scheduled + * FTM slot (u32) + * @NL80211_FTM_STATS_OUT_OF_WINDOW_TRIGGERS_NUM: number of FTM triggers out of + * scheduled window (u32) + * @NL80211_FTM_STATS_PAD: used for 
padding, ignore + * @__NL80211_FTM_STATS_AFTER_LAST: Internal + * @NL80211_FTM_STATS_MAX: highest possible FTM responder stats attribute + */ +enum nl80211_ftm_responder_stats { + __NL80211_FTM_STATS_INVALID, + NL80211_FTM_STATS_SUCCESS_NUM, + NL80211_FTM_STATS_PARTIAL_NUM, + NL80211_FTM_STATS_FAILED_NUM, + NL80211_FTM_STATS_ASAP_NUM, + NL80211_FTM_STATS_NON_ASAP_NUM, + NL80211_FTM_STATS_TOTAL_DURATION_MSEC, + NL80211_FTM_STATS_UNKNOWN_TRIGGERS_NUM, + NL80211_FTM_STATS_RESCHEDULE_REQUESTS_NUM, + NL80211_FTM_STATS_OUT_OF_WINDOW_TRIGGERS_NUM, + NL80211_FTM_STATS_PAD, + + /* keep last */ + __NL80211_FTM_STATS_AFTER_LAST, + NL80211_FTM_STATS_MAX = __NL80211_FTM_STATS_AFTER_LAST - 1 +}; + +/** + * enum nl80211_preamble - frame preamble types + * @NL80211_PREAMBLE_LEGACY: legacy (HR/DSSS, OFDM, ERP PHY) preamble + * @NL80211_PREAMBLE_HT: HT preamble + * @NL80211_PREAMBLE_VHT: VHT preamble + * @NL80211_PREAMBLE_DMG: DMG preamble + * @NL80211_PREAMBLE_HE: HE preamble + */ +enum nl80211_preamble { + NL80211_PREAMBLE_LEGACY, + NL80211_PREAMBLE_HT, + NL80211_PREAMBLE_VHT, + NL80211_PREAMBLE_DMG, + NL80211_PREAMBLE_HE, +}; + +/** + * enum nl80211_peer_measurement_type - peer measurement types + * @NL80211_PMSR_TYPE_INVALID: invalid/unused, needed as we use + * these numbers also for attributes + * + * @NL80211_PMSR_TYPE_FTM: flight time measurement + * + * @NUM_NL80211_PMSR_TYPES: internal + * @NL80211_PMSR_TYPE_MAX: highest type number + */ +enum nl80211_peer_measurement_type { + NL80211_PMSR_TYPE_INVALID, + + NL80211_PMSR_TYPE_FTM, + + NUM_NL80211_PMSR_TYPES, + NL80211_PMSR_TYPE_MAX = NUM_NL80211_PMSR_TYPES - 1 +}; + +/** + * enum nl80211_peer_measurement_status - peer measurement status + * @NL80211_PMSR_STATUS_SUCCESS: measurement completed successfully + * @NL80211_PMSR_STATUS_REFUSED: measurement was locally refused + * @NL80211_PMSR_STATUS_TIMEOUT: measurement timed out + * @NL80211_PMSR_STATUS_FAILURE: measurement failed, a type-dependent + * reason may be available in 
the response data + */ +enum nl80211_peer_measurement_status { + NL80211_PMSR_STATUS_SUCCESS, + NL80211_PMSR_STATUS_REFUSED, + NL80211_PMSR_STATUS_TIMEOUT, + NL80211_PMSR_STATUS_FAILURE, +}; + +/** + * enum nl80211_peer_measurement_req - peer measurement request attributes + * @__NL80211_PMSR_REQ_ATTR_INVALID: invalid + * + * @NL80211_PMSR_REQ_ATTR_DATA: This is a nested attribute with measurement + * type-specific request data inside. The attributes used are from the + * enums named nl80211_peer_measurement_<type>_req. + * @NL80211_PMSR_REQ_ATTR_GET_AP_TSF: include AP TSF timestamp, if supported + * (flag attribute) + * + * @NUM_NL80211_PMSR_REQ_ATTRS: internal + * @NL80211_PMSR_REQ_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_req { + __NL80211_PMSR_REQ_ATTR_INVALID, + + NL80211_PMSR_REQ_ATTR_DATA, + NL80211_PMSR_REQ_ATTR_GET_AP_TSF, + + /* keep last */ + NUM_NL80211_PMSR_REQ_ATTRS, + NL80211_PMSR_REQ_ATTR_MAX = NUM_NL80211_PMSR_REQ_ATTRS - 1 +}; + +/** + * enum nl80211_peer_measurement_resp - peer measurement response attributes + * @__NL80211_PMSR_RESP_ATTR_INVALID: invalid + * + * @NL80211_PMSR_RESP_ATTR_DATA: This is a nested attribute with measurement + * type-specific results inside. The attributes used are from the enums + * named nl80211_peer_measurement_<type>_resp. + * @NL80211_PMSR_RESP_ATTR_STATUS: u32 value with the measurement status + * (using values from &enum nl80211_peer_measurement_status.) + * @NL80211_PMSR_RESP_ATTR_HOST_TIME: host time (%CLOCK_BOOTTIME) when the + * result was measured; this value is not expected to be accurate to + * more than 20ms. (u64, nanoseconds) + * @NL80211_PMSR_RESP_ATTR_AP_TSF: TSF of the AP that the interface + * doing the measurement is connected to when the result was measured. + * This shall be accurately reported if supported and requested + * (u64, usec) + * @NL80211_PMSR_RESP_ATTR_FINAL: If results are sent to the host partially + * (e.g. 
with FTM per-burst data) this flag will be cleared on all but + * the last result; if all results are combined it's set on the single + * result. + * @NL80211_PMSR_RESP_ATTR_PAD: padding for 64-bit attributes, ignore + * + * @NUM_NL80211_PMSR_RESP_ATTRS: internal + * @NL80211_PMSR_RESP_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_resp { + __NL80211_PMSR_RESP_ATTR_INVALID, + + NL80211_PMSR_RESP_ATTR_DATA, + NL80211_PMSR_RESP_ATTR_STATUS, + NL80211_PMSR_RESP_ATTR_HOST_TIME, + NL80211_PMSR_RESP_ATTR_AP_TSF, + NL80211_PMSR_RESP_ATTR_FINAL, + NL80211_PMSR_RESP_ATTR_PAD, + + /* keep last */ + NUM_NL80211_PMSR_RESP_ATTRS, + NL80211_PMSR_RESP_ATTR_MAX = NUM_NL80211_PMSR_RESP_ATTRS - 1 +}; + +/** + * enum nl80211_peer_measurement_peer_attrs - peer attributes for measurement + * @__NL80211_PMSR_PEER_ATTR_INVALID: invalid + * + * @NL80211_PMSR_PEER_ATTR_ADDR: peer's MAC address + * @NL80211_PMSR_PEER_ATTR_CHAN: channel definition, nested, using top-level + * attributes like %NL80211_ATTR_WIPHY_FREQ etc. + * @NL80211_PMSR_PEER_ATTR_REQ: This is a nested attribute indexed by + * measurement type, with attributes from the + * &enum nl80211_peer_measurement_req inside. + * @NL80211_PMSR_PEER_ATTR_RESP: This is a nested attribute indexed by + * measurement type, with attributes from the + * &enum nl80211_peer_measurement_resp inside. 
+ * + * @NUM_NL80211_PMSR_PEER_ATTRS: internal + * @NL80211_PMSR_PEER_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_peer_attrs { + __NL80211_PMSR_PEER_ATTR_INVALID, + + NL80211_PMSR_PEER_ATTR_ADDR, + NL80211_PMSR_PEER_ATTR_CHAN, + NL80211_PMSR_PEER_ATTR_REQ, + NL80211_PMSR_PEER_ATTR_RESP, + + /* keep last */ + NUM_NL80211_PMSR_PEER_ATTRS, + NL80211_PMSR_PEER_ATTR_MAX = NUM_NL80211_PMSR_PEER_ATTRS - 1, +}; + +/** + * enum nl80211_peer_measurement_attrs - peer measurement attributes + * @__NL80211_PMSR_ATTR_INVALID: invalid + * + * @NL80211_PMSR_ATTR_MAX_PEERS: u32 attribute used for capability + * advertisement only, indicates the maximum number of peers + * measurements can be done with in a single request + * @NL80211_PMSR_ATTR_REPORT_AP_TSF: flag attribute in capability + * indicating that the connected AP's TSF can be reported in + * measurement results + * @NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR: flag attribute in capability + * indicating that MAC address randomization is supported. + * @NL80211_PMSR_ATTR_TYPE_CAPA: capabilities reported by the device, + * this contains a nesting indexed by measurement type, and + * type-specific capabilities inside, which are from the enums + * named nl80211_peer_measurement__capa. + * @NL80211_PMSR_ATTR_PEERS: nested attribute, the nesting index is + * meaningless, just a list of peers to measure with, with the + * sub-attributes taken from + * &enum nl80211_peer_measurement_peer_attrs. 
+ * + * @NUM_NL80211_PMSR_ATTR: internal + * @NL80211_PMSR_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_attrs { + __NL80211_PMSR_ATTR_INVALID, + + NL80211_PMSR_ATTR_MAX_PEERS, + NL80211_PMSR_ATTR_REPORT_AP_TSF, + NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR, + NL80211_PMSR_ATTR_TYPE_CAPA, + NL80211_PMSR_ATTR_PEERS, + + /* keep last */ + NUM_NL80211_PMSR_ATTR, + NL80211_PMSR_ATTR_MAX = NUM_NL80211_PMSR_ATTR - 1 +}; + +/** + * enum nl80211_peer_measurement_ftm_capa - FTM capabilities + * @__NL80211_PMSR_FTM_CAPA_ATTR_INVALID: invalid + * + * @NL80211_PMSR_FTM_CAPA_ATTR_ASAP: flag attribute indicating ASAP mode + * is supported + * @NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP: flag attribute indicating non-ASAP + * mode is supported + * @NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI: flag attribute indicating if LCI + * data can be requested during the measurement + * @NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC: flag attribute indicating if civic + * location data can be requested during the measurement + * @NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES: u32 bitmap attribute of bits + * from &enum nl80211_preamble. + * @NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS: bitmap of values from + * &enum nl80211_chan_width indicating the supported channel + * bandwidths for FTM. Note that a higher channel bandwidth may be + * configured to allow for other measurements types with different + * bandwidth requirement in the same measurement. 
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT: u32 attribute indicating + * the maximum bursts exponent that can be used (if not present anything + * is valid) + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST: u32 attribute indicating + * the maximum FTMs per burst (if not present anything is valid) + * @NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED: flag attribute indicating if + * trigger based ranging measurement is supported + * @NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED: flag attribute indicating + * if non trigger based ranging measurement is supported + * + * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_ftm_capa { + __NL80211_PMSR_FTM_CAPA_ATTR_INVALID, + + NL80211_PMSR_FTM_CAPA_ATTR_ASAP, + NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP, + NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI, + NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC, + NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES, + NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST, + NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED, + NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED, + + /* keep last */ + NUM_NL80211_PMSR_FTM_CAPA_ATTR, + NL80211_PMSR_FTM_CAPA_ATTR_MAX = NUM_NL80211_PMSR_FTM_CAPA_ATTR - 1 +}; + +/** + * enum nl80211_peer_measurement_ftm_req - FTM request attributes + * @__NL80211_PMSR_FTM_REQ_ATTR_INVALID: invalid + * + * @NL80211_PMSR_FTM_REQ_ATTR_ASAP: ASAP mode requested (flag) + * @NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE: preamble type (see + * &enum nl80211_preamble), optional for DMG (u32) + * @NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP: number of bursts exponent as in + * 802.11-2016 9.4.2.168 "Fine Timing Measurement Parameters element" + * (u8, 0-15, optional with default 15 i.e. 
"no preference") + * @NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD: interval between bursts in units + * of 100ms (u16, optional with default 0) + * @NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION: burst duration, as in 802.11-2016 + * Table 9-257 "Burst Duration field encoding" (u8, 0-15, optional with + * default 15 i.e. "no preference") + * @NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST: number of successful FTM frames + * requested per burst + * (u8, 0-31, optional with default 0 i.e. "no preference") + * @NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES: number of FTMR frame retries + * (u8, default 3) + * @NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI: request LCI data (flag) + * @NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC: request civic location data + * (flag) + * @NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED: request trigger based ranging + * measurement (flag). + * This attribute and %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED are + * mutually exclusive. + * if neither %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED nor + * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set, EDCA based + * ranging will be used. + * @NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED: request non trigger based + * ranging measurement (flag) + * This attribute and %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED are + * mutually exclusive. + * if neither %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED nor + * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set, EDCA based + * ranging will be used. + * @NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK: negotiate for LMR feedback. Only + * valid if either %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED or + * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set. + * @NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR: optional. The BSS color of the + * responder. Only valid if %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED + * or %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED is set. 
+ * + * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal + * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_ftm_req { + __NL80211_PMSR_FTM_REQ_ATTR_INVALID, + + NL80211_PMSR_FTM_REQ_ATTR_ASAP, + NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE, + NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP, + NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD, + NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, + NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST, + NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES, + NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI, + NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC, + NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED, + NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED, + NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK, + NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR, + + /* keep last */ + NUM_NL80211_PMSR_FTM_REQ_ATTR, + NL80211_PMSR_FTM_REQ_ATTR_MAX = NUM_NL80211_PMSR_FTM_REQ_ATTR - 1 +}; + +/** + * enum nl80211_peer_measurement_ftm_failure_reasons - FTM failure reasons + * @NL80211_PMSR_FTM_FAILURE_UNSPECIFIED: unspecified failure, not used + * @NL80211_PMSR_FTM_FAILURE_NO_RESPONSE: no response from the FTM responder + * @NL80211_PMSR_FTM_FAILURE_REJECTED: FTM responder rejected measurement + * @NL80211_PMSR_FTM_FAILURE_WRONG_CHANNEL: we already know the peer is + * on a different channel, so can't measure (if we didn't know, we'd + * try and get no response) + * @NL80211_PMSR_FTM_FAILURE_PEER_NOT_CAPABLE: peer can't actually do FTM + * @NL80211_PMSR_FTM_FAILURE_INVALID_TIMESTAMP: invalid T1/T4 timestamps + * received + * @NL80211_PMSR_FTM_FAILURE_PEER_BUSY: peer reports busy, you may retry + * later (see %NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME) + * @NL80211_PMSR_FTM_FAILURE_BAD_CHANGED_PARAMS: parameters were changed + * by the peer and are no longer supported + */ +enum nl80211_peer_measurement_ftm_failure_reasons { + NL80211_PMSR_FTM_FAILURE_UNSPECIFIED, + NL80211_PMSR_FTM_FAILURE_NO_RESPONSE, + NL80211_PMSR_FTM_FAILURE_REJECTED, + NL80211_PMSR_FTM_FAILURE_WRONG_CHANNEL, + 
NL80211_PMSR_FTM_FAILURE_PEER_NOT_CAPABLE, + NL80211_PMSR_FTM_FAILURE_INVALID_TIMESTAMP, + NL80211_PMSR_FTM_FAILURE_PEER_BUSY, + NL80211_PMSR_FTM_FAILURE_BAD_CHANGED_PARAMS, +}; + +/** + * enum nl80211_peer_measurement_ftm_resp - FTM response attributes + * @__NL80211_PMSR_FTM_RESP_ATTR_INVALID: invalid + * + * @NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON: FTM-specific failure reason + * (u32, optional) + * @NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX: optional, if bursts are reported + * as separate results then it will be the burst index 0...(N-1) and + * the top level will indicate partial results (u32) + * @NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS: number of FTM Request frames + * transmitted (u32, optional) + * @NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES: number of FTM Request frames + * that were acknowledged (u32, optional) + * @NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME: retry time received from the + * busy peer (u32, seconds) + * @NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP: actual number of bursts exponent + * used by the responder (similar to request, u8) + * @NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION: actual burst duration used by + * the responder (similar to request, u8) + * @NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST: actual FTMs per burst used + * by the responder (similar to request, u8) + * @NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG: average RSSI across all FTM action + * frames (optional, s32, 1/2 dBm) + * @NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD: RSSI spread across all FTM action + * frames (optional, s32, 1/2 dBm) + * @NL80211_PMSR_FTM_RESP_ATTR_TX_RATE: bitrate we used for the response to the + * FTM action frame (optional, nested, using &enum nl80211_rate_info + * attributes) + * @NL80211_PMSR_FTM_RESP_ATTR_RX_RATE: bitrate the responder used for the FTM + * action frame (optional, nested, using &enum nl80211_rate_info attrs) + * @NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG: average RTT (s64, picoseconds, optional + * but one of RTT/DIST must be present) + * 
@NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE: RTT variance (u64, ps^2, note that + * standard deviation is the square root of variance, optional) + * @NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD: RTT spread (u64, picoseconds, + * optional) + * @NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG: average distance (s64, mm, optional + * but one of RTT/DIST must be present) + * @NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE: distance variance (u64, mm^2, note + * that standard deviation is the square root of variance, optional) + * @NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD: distance spread (u64, mm, optional) + * @NL80211_PMSR_FTM_RESP_ATTR_LCI: LCI data from peer (binary, optional); + * this is the contents of the Measurement Report Element (802.11-2016 + * 9.4.2.22.1) starting with the Measurement Token, with Measurement + * Type 8. + * @NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC: civic location data from peer + * (binary, optional); + * this is the contents of the Measurement Report Element (802.11-2016 + * 9.4.2.22.1) starting with the Measurement Token, with Measurement + * Type 11. 
+ * @NL80211_PMSR_FTM_RESP_ATTR_PAD: ignore, for u64/s64 padding only + * + * @NUM_NL80211_PMSR_FTM_RESP_ATTR: internal + * @NL80211_PMSR_FTM_RESP_ATTR_MAX: highest attribute number + */ +enum nl80211_peer_measurement_ftm_resp { + __NL80211_PMSR_FTM_RESP_ATTR_INVALID, + + NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON, + NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX, + NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS, + NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES, + NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME, + NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP, + NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION, + NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST, + NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG, + NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD, + NL80211_PMSR_FTM_RESP_ATTR_TX_RATE, + NL80211_PMSR_FTM_RESP_ATTR_RX_RATE, + NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG, + NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE, + NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD, + NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG, + NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE, + NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD, + NL80211_PMSR_FTM_RESP_ATTR_LCI, + NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC, + NL80211_PMSR_FTM_RESP_ATTR_PAD, + + /* keep last */ + NUM_NL80211_PMSR_FTM_RESP_ATTR, + NL80211_PMSR_FTM_RESP_ATTR_MAX = NUM_NL80211_PMSR_FTM_RESP_ATTR - 1 +}; + +/** + * enum nl80211_obss_pd_attributes - OBSS packet detection attributes + * @__NL80211_HE_OBSS_PD_ATTR_INVALID: Invalid + * + * @NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET: the OBSS PD minimum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET: the OBSS PD maximum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET: the non-SRG OBSS PD maximum + * tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP: bitmap that indicates the BSS color + * values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP: bitmap that indicates the partial + * BSSID values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_SR_CTRL: The SR Control field of SRP element. 
+ * + * @__NL80211_HE_OBSS_PD_ATTR_LAST: Internal + * @NL80211_HE_OBSS_PD_ATTR_MAX: highest OBSS PD attribute. + */ +enum nl80211_obss_pd_attributes { + __NL80211_HE_OBSS_PD_ATTR_INVALID, + + NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET, + NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP, + NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP, + NL80211_HE_OBSS_PD_ATTR_SR_CTRL, + + /* keep last */ + __NL80211_HE_OBSS_PD_ATTR_LAST, + NL80211_HE_OBSS_PD_ATTR_MAX = __NL80211_HE_OBSS_PD_ATTR_LAST - 1, +}; + +/** + * enum nl80211_bss_color_attributes - BSS Color attributes + * @__NL80211_HE_BSS_COLOR_ATTR_INVALID: Invalid + * + * @NL80211_HE_BSS_COLOR_ATTR_COLOR: the current BSS Color. + * @NL80211_HE_BSS_COLOR_ATTR_DISABLED: is BSS coloring disabled. + * @NL80211_HE_BSS_COLOR_ATTR_PARTIAL: the AID equation to be used. + * + * @__NL80211_HE_BSS_COLOR_ATTR_LAST: Internal + * @NL80211_HE_BSS_COLOR_ATTR_MAX: highest BSS Color attribute. + */ +enum nl80211_bss_color_attributes { + __NL80211_HE_BSS_COLOR_ATTR_INVALID, + + NL80211_HE_BSS_COLOR_ATTR_COLOR, + NL80211_HE_BSS_COLOR_ATTR_DISABLED, + NL80211_HE_BSS_COLOR_ATTR_PARTIAL, + + /* keep last */ + __NL80211_HE_BSS_COLOR_ATTR_LAST, + NL80211_HE_BSS_COLOR_ATTR_MAX = __NL80211_HE_BSS_COLOR_ATTR_LAST - 1, +}; + +/** + * enum nl80211_iftype_akm_attributes - interface type AKM attributes + * @__NL80211_IFTYPE_AKM_ATTR_INVALID: Invalid + * + * @NL80211_IFTYPE_AKM_ATTR_IFTYPES: nested attribute containing a flag + * attribute for each interface type that supports AKM suites specified in + * %NL80211_IFTYPE_AKM_ATTR_SUITES + * @NL80211_IFTYPE_AKM_ATTR_SUITES: an array of u32. Used to indicate supported + * AKM suites for the specified interface types. + * + * @__NL80211_IFTYPE_AKM_ATTR_LAST: Internal + * @NL80211_IFTYPE_AKM_ATTR_MAX: highest interface type AKM attribute. 
+ */ +enum nl80211_iftype_akm_attributes { + __NL80211_IFTYPE_AKM_ATTR_INVALID, + + NL80211_IFTYPE_AKM_ATTR_IFTYPES, + NL80211_IFTYPE_AKM_ATTR_SUITES, + + /* keep last */ + __NL80211_IFTYPE_AKM_ATTR_LAST, + NL80211_IFTYPE_AKM_ATTR_MAX = __NL80211_IFTYPE_AKM_ATTR_LAST - 1, +}; + +/** + * enum nl80211_fils_discovery_attributes - FILS discovery configuration + * from IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @__NL80211_FILS_DISCOVERY_ATTR_INVALID: Invalid + * + * @NL80211_FILS_DISCOVERY_ATTR_INT_MIN: Minimum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_INT_MAX: Maximum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_TMPL: Template data for FILS discovery action + * frame including the headers. + * + * @__NL80211_FILS_DISCOVERY_ATTR_LAST: Internal + * @NL80211_FILS_DISCOVERY_ATTR_MAX: highest attribute + */ +enum nl80211_fils_discovery_attributes { + __NL80211_FILS_DISCOVERY_ATTR_INVALID, + + NL80211_FILS_DISCOVERY_ATTR_INT_MIN, + NL80211_FILS_DISCOVERY_ATTR_INT_MAX, + NL80211_FILS_DISCOVERY_ATTR_TMPL, + + /* keep last */ + __NL80211_FILS_DISCOVERY_ATTR_LAST, + NL80211_FILS_DISCOVERY_ATTR_MAX = __NL80211_FILS_DISCOVERY_ATTR_LAST - 1 +}; + +/* + * FILS discovery template minimum length with action frame headers and + * mandatory fields. + */ +#define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 + +/** + * enum nl80211_unsol_bcast_probe_resp_attributes - Unsolicited broadcast probe + * response configuration. Applicable only in 6GHz. + * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID: Invalid + * + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT: Maximum packet interval (u32, TU). + * Allowed range: 0..20 (TU = Time Unit). IEEE P802.11ax/D6.0 + * 26.17.2.3.2 (AP behavior for fast passive scanning). + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL: Unsolicited broadcast probe response + * frame template (binary). 
+ * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST: Internal + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX: highest attribute + */ +enum nl80211_unsol_bcast_probe_resp_attributes { + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID, + + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL, + + /* keep last */ + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX = + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST - 1 +}; + +/** + * enum nl80211_sae_pwe_mechanism - The mechanism(s) allowed for SAE PWE + * derivation. Applicable only when WPA3-Personal SAE authentication is + * used. + * + * @NL80211_SAE_PWE_UNSPECIFIED: not specified, used internally to indicate that + * attribute is not present from userspace. + * @NL80211_SAE_PWE_HUNT_AND_PECK: hunting-and-pecking loop only + * @NL80211_SAE_PWE_HASH_TO_ELEMENT: hash-to-element only + * @NL80211_SAE_PWE_BOTH: both hunting-and-pecking loop and hash-to-element + * can be used. + */ +enum nl80211_sae_pwe_mechanism { + NL80211_SAE_PWE_UNSPECIFIED, + NL80211_SAE_PWE_HUNT_AND_PECK, + NL80211_SAE_PWE_HASH_TO_ELEMENT, + NL80211_SAE_PWE_BOTH, +}; + +/** + * enum nl80211_sar_type - type of SAR specs + * + * @NL80211_SAR_TYPE_POWER: power limitation specified in 0.25dBm unit + * + */ +enum nl80211_sar_type { + NL80211_SAR_TYPE_POWER, + + /* add new type here */ + + /* Keep last */ + NUM_NL80211_SAR_TYPE, +}; + +/** + * enum nl80211_sar_attrs - Attributes for SAR spec + * + * @NL80211_SAR_ATTR_TYPE: the SAR type as defined in &enum nl80211_sar_type. + * + * @NL80211_SAR_ATTR_SPECS: Nested array of SAR power + * limit specifications. Each specification contains a set + * of %nl80211_sar_specs_attrs. + * + * For SET operation, it contains array of %NL80211_SAR_ATTR_SPECS_POWER + * and %NL80211_SAR_ATTR_SPECS_RANGE_INDEX. + * + * For sar_capa dump, it contains array of + * %NL80211_SAR_ATTR_SPECS_START_FREQ + * and %NL80211_SAR_ATTR_SPECS_END_FREQ. 
+ * + * @__NL80211_SAR_ATTR_LAST: Internal + * @NL80211_SAR_ATTR_MAX: highest sar attribute + * + * These attributes are used with %NL80211_CMD_SET_SAR_SPEC + */ +enum nl80211_sar_attrs { + __NL80211_SAR_ATTR_INVALID, + + NL80211_SAR_ATTR_TYPE, + NL80211_SAR_ATTR_SPECS, + + __NL80211_SAR_ATTR_LAST, + NL80211_SAR_ATTR_MAX = __NL80211_SAR_ATTR_LAST - 1, +}; + +/** + * enum nl80211_sar_specs_attrs - Attributes for SAR power limit specs + * + * @NL80211_SAR_ATTR_SPECS_POWER: Required (s32)value to specify the actual + * power limit value in units of 0.25 dBm if type is + * NL80211_SAR_TYPE_POWER. (i.e., a value of 44 represents 11 dBm). + * 0 means userspace doesn't have SAR limitation on this associated range. + * + * @NL80211_SAR_ATTR_SPECS_RANGE_INDEX: Required (u32) value to specify the + * index of exported freq range table and the associated power limitation + * is applied to this range. + * + * Userspace isn't required to set all the ranges advertised by WLAN driver, + * and userspace can skip some certain ranges. These skipped ranges don't + * have SAR limitations, and they are same as setting the + * %NL80211_SAR_ATTR_SPECS_POWER to any unreasonable high value because any + * value higher than regulatory allowed value just means SAR power + * limitation is removed, but it's required to set at least one range. + * It's not allowed to set duplicated range in one SET operation. + * + * Every SET operation overwrites previous SET operation. + * + * @NL80211_SAR_ATTR_SPECS_START_FREQ: Required (u32) value to specify the start + * frequency of this range edge when registering SAR capability to wiphy. + * It's not a channel center frequency. The unit is kHz. + * + * @NL80211_SAR_ATTR_SPECS_END_FREQ: Required (u32) value to specify the end + * frequency of this range edge when registering SAR capability to wiphy. + * It's not a channel center frequency. The unit is kHz. 
+ * + * @__NL80211_SAR_ATTR_SPECS_LAST: Internal + * @NL80211_SAR_ATTR_SPECS_MAX: highest sar specs attribute + */ +enum nl80211_sar_specs_attrs { + __NL80211_SAR_ATTR_SPECS_INVALID, + + NL80211_SAR_ATTR_SPECS_POWER, + NL80211_SAR_ATTR_SPECS_RANGE_INDEX, + NL80211_SAR_ATTR_SPECS_START_FREQ, + NL80211_SAR_ATTR_SPECS_END_FREQ, + + __NL80211_SAR_ATTR_SPECS_LAST, + NL80211_SAR_ATTR_SPECS_MAX = __NL80211_SAR_ATTR_SPECS_LAST - 1, +}; + +/** + * enum nl80211_mbssid_config_attributes - multiple BSSID (MBSSID) and enhanced + * multi-BSSID advertisements (EMA) in AP mode. + * Kernel uses some of these attributes to advertise driver's support for + * MBSSID and EMA. + * Remaining attributes should be used by the userspace to configure the + * features. + * + * @__NL80211_MBSSID_CONFIG_ATTR_INVALID: Invalid + * + * @NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES: Used by the kernel to advertise + * the maximum number of MBSSID interfaces supported by the driver. + * Driver should indicate MBSSID support by setting + * wiphy->mbssid_max_interfaces to a value more than or equal to 2. + * + * @NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY: Used by the kernel + * to advertise the maximum profile periodicity supported by the driver + * if EMA is enabled. Driver should indicate EMA support to the userspace + * by setting wiphy->ema_max_profile_periodicity to + * a non-zero value. + * + * @NL80211_MBSSID_CONFIG_ATTR_INDEX: Mandatory parameter to pass the index of + * this BSS (u8) in the multiple BSSID set. + * Value must be set to 0 for the transmitting interface and non-zero for + * all non-transmitting interfaces. The userspace will be responsible + * for using unique indices for the interfaces. + * Range: 0 to wiphy->mbssid_max_interfaces-1. + * + * @NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX: Mandatory parameter for + * a non-transmitted profile which provides the interface index (u32) of + * the transmitted profile. 
The value must match one of the interface + * indices advertised by the kernel. Optional if the interface being set up + * is the transmitting one, however, if provided then the value must match + * the interface index of the same. + * + * @NL80211_MBSSID_CONFIG_ATTR_EMA: Flag used to enable EMA AP feature. + * Setting this flag is permitted only if the driver advertises EMA support + * by setting wiphy->ema_max_profile_periodicity to non-zero. + * + * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal + * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute + */ +enum nl80211_mbssid_config_attributes { + __NL80211_MBSSID_CONFIG_ATTR_INVALID, + + NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES, + NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY, + NL80211_MBSSID_CONFIG_ATTR_INDEX, + NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX, + NL80211_MBSSID_CONFIG_ATTR_EMA, + + /* keep last */ + __NL80211_MBSSID_CONFIG_ATTR_LAST, + NL80211_MBSSID_CONFIG_ATTR_MAX = __NL80211_MBSSID_CONFIG_ATTR_LAST - 1, +}; + +/** + * enum nl80211_ap_settings_flags - AP settings flags + * + * @NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external + * authentication. + * @NL80211_AP_SETTINGS_SA_QUERY_OFFLOAD_SUPPORT: Userspace supports SA Query + * procedures offload to driver. If driver advertises + * %NL80211_AP_SME_SA_QUERY_OFFLOAD in AP SME features, userspace shall + * ignore SA Query procedures and validations when this flag is set by + * userspace. 
+ */ +enum nl80211_ap_settings_flags { + NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = 1 << 0, + NL80211_AP_SETTINGS_SA_QUERY_OFFLOAD_SUPPORT = 1 << 1, +}; + +#endif /* __LINUX_NL80211_H */ diff --git a/src/basic/linux/pkt_sched.h b/src/basic/linux/pkt_sched.h new file mode 100644 index 0000000..000eec1 --- /dev/null +++ b/src/basic/linux/pkt_sched.h @@ -0,0 +1,1281 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include <linux/const.h> +#include <linux/types.h> + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". 
+ + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to correspond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). 
+ * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Insert a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). 
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + 
TCA_RED_MAX_P, + TCA_RED_FLAGS, /* bitfield32 */ + TCA_RED_EARLY_DROP_BLOCK, /* u32 */ + TCA_RED_MARK_BLOCK, /* u32 */ + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + + /* This field can be used for flags that a RED-like qdisc has + * historically supported. E.g. when configuring RED, it can be used for + * ECN, HARDDROP and ADAPTATIVE. For SFQ it can be used for ECN, + * HARDDROP. Etc. Because this field has not been validated, and is + * copied back on dump, any bits besides those to which a given qdisc + * has assigned a historical meaning need to be considered for free use + * by userspace tools. + * + * Any further flags need to be passed differently, e.g. through an + * attribute (such as TCA_RED_FLAGS above). Such attribute should allow + * passing both recent and historic flags in one value. 
+ */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +#define TC_RED_NODROP 8 +}; + +#define TC_RED_HISTORIC_FLAGS (TC_RED_ECN | TC_RED_HARDDROP | TC_RED_ADAPTATIVE) + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + 
__u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + TCA_HTB_OFFLOAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' 
*/ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt { + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt { + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl { + unsigned char strategy; +#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u16 pad; + __u32 penalty; +}; + +struct tc_cbq_police { + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt { + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats { + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, 
+ TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, + __TCA_CBQ_MAX, +}; + +#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) + +/* dsmark section */ + +enum { + TCA_DSMARK_UNSPEC, + TCA_DSMARK_INDICES, + TCA_DSMARK_DEFAULT_INDEX, + TCA_DSMARK_SET_TC_INDEX, + TCA_DSMARK_MASK, + TCA_DSMARK_VALUE, + __TCA_DSMARK_MAX, +}; + +#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) + +/* ATM section */ + +enum { + TCA_ATM_UNSPEC, + TCA_ATM_FD, /* file/socket descriptor */ + TCA_ATM_PTR, /* pointer to descriptor - later */ + TCA_ATM_HDR, /* LL header */ + TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ + TCA_ATM_ADDR, /* PVC address (for output only) */ + TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ + __TCA_ATM_MAX, +}; + +#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec 
*/ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) /* NOTE(review): self-referential macro shadows the enumerator of the same name, so after this point the name evaluates to 1 (enumerator 2, minus 1); sibling sections define an unprefixed *_MAX instead, so TC_MQPRIO_MODE_MAX was presumably intended - kept as-is, confirm against the upstream kernel header before changing */ + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) /* NOTE(review): same self-referential pattern as __TC_MQPRIO_MODE_MAX; evaluates to 1 after this point, and no unprefixed TC_MQPRIO_SHAPER_MAX is defined here - confirm against upstream */ + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + 
TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. + */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? 
*/ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +#define FQ_CODEL_QUANTUM_MAX (1 << 20) + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, + TCA_FQ_CODEL_CE_THRESHOLD_MASK, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* 
log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + TCA_FQ_TIMER_SLACK, /* timer slack */ + + TCA_FQ_HORIZON, /* time horizon in us */ + + TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ + __u64 horizon_drops; + __u64 horizon_caps; +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + TCA_PIE_DQ_RATE_ESTIMATOR, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u64 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in + * bits/pie_time + */ + __u32 dq_rate_estimating; /* is 
avg_dq_rate being calculated? */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space + * in queue + */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* FQ PIE */ +enum { + TCA_FQ_PIE_UNSPEC, + TCA_FQ_PIE_LIMIT, + TCA_FQ_PIE_FLOWS, + TCA_FQ_PIE_TARGET, + TCA_FQ_PIE_TUPDATE, + TCA_FQ_PIE_ALPHA, + TCA_FQ_PIE_BETA, + TCA_FQ_PIE_QUANTUM, + TCA_FQ_PIE_MEMORY_LIMIT, + TCA_FQ_PIE_ECN_PROB, + TCA_FQ_PIE_ECN, + TCA_FQ_PIE_BYTEMODE, + TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + __TCA_FQ_PIE_MAX +}; +#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1) + +struct tc_fq_pie_xstats { + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to fq_pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 overmemory; /* dropped due to lack of memory in queue */ + __u32 ecn_mark; /* packets marked with ecn */ + __u32 new_flow_count; /* count of new flows created by packets */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 memory_usage; /* total memory across all queues */ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON _BITUL(0) +#define TC_ETF_OFFLOAD_ON _BITUL(1) +#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + 
TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + TCA_CAKE_FWMARK, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + __TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + 
CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +/* The format for the admin sched (dump only): + * [TCA_TAPRIO_SCHED_ADMIN_SCHED] + * [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] + */ + +#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST _BITUL(0) +#define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD _BITUL(1) + +enum { + TCA_TAPRIO_TC_ENTRY_UNSPEC, + 
TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */ + + /* add new constants above here */ + __TCA_TAPRIO_TC_ENTRY_CNT, + TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1) +}; + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ + TCA_TAPRIO_ATTR_FLAGS, /* u32 */ + TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */ + TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */ + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +/* ETS */ + +#define TCQ_ETS_MAX_BANDS 16 + +enum { + TCA_ETS_UNSPEC, + TCA_ETS_NBANDS, /* u8 */ + TCA_ETS_NSTRICT, /* u8 */ + TCA_ETS_QUANTA, /* nested TCA_ETS_QUANTA_BAND */ + TCA_ETS_QUANTA_BAND, /* u32 */ + TCA_ETS_PRIOMAP, /* nested TCA_ETS_PRIOMAP_BAND */ + TCA_ETS_PRIOMAP_BAND, /* u8 */ + __TCA_ETS_MAX, +}; + +#define TCA_ETS_MAX (__TCA_ETS_MAX - 1) + +#endif diff --git a/src/basic/linux/rtnetlink.h b/src/basic/linux/rtnetlink.h new file mode 100644 index 0000000..eb2747d --- /dev/null +++ b/src/basic/linux/rtnetlink.h @@ -0,0 +1,826 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_RTNETLINK_H +#define _UAPI__LINUX_RTNETLINK_H + +#include <linux/types.h> +#include <linux/netlink.h> +#include <linux/if_link.h> +#include <linux/if_addr.h> +#include <linux/neighbour.h> + +/* rtnetlink families. Values up to 127 are reserved for real address + * families, values above 128 may be used arbitrarily. + */ +#define RTNL_FAMILY_IPMR 128 +#define RTNL_FAMILY_IP6MR 129 +#define RTNL_FAMILY_MAX 129 + +/**** + * Routing/neighbour discovery messages.
+ ****/ + +/* Types of messages */ + +enum { + RTM_BASE = 16, +#define RTM_BASE RTM_BASE + + RTM_NEWLINK = 16, +#define RTM_NEWLINK RTM_NEWLINK + RTM_DELLINK, +#define RTM_DELLINK RTM_DELLINK + RTM_GETLINK, +#define RTM_GETLINK RTM_GETLINK + RTM_SETLINK, +#define RTM_SETLINK RTM_SETLINK + + RTM_NEWADDR = 20, +#define RTM_NEWADDR RTM_NEWADDR + RTM_DELADDR, +#define RTM_DELADDR RTM_DELADDR + RTM_GETADDR, +#define RTM_GETADDR RTM_GETADDR + + RTM_NEWROUTE = 24, +#define RTM_NEWROUTE RTM_NEWROUTE + RTM_DELROUTE, +#define RTM_DELROUTE RTM_DELROUTE + RTM_GETROUTE, +#define RTM_GETROUTE RTM_GETROUTE + + RTM_NEWNEIGH = 28, +#define RTM_NEWNEIGH RTM_NEWNEIGH + RTM_DELNEIGH, +#define RTM_DELNEIGH RTM_DELNEIGH + RTM_GETNEIGH, +#define RTM_GETNEIGH RTM_GETNEIGH + + RTM_NEWRULE = 32, +#define RTM_NEWRULE RTM_NEWRULE + RTM_DELRULE, +#define RTM_DELRULE RTM_DELRULE + RTM_GETRULE, +#define RTM_GETRULE RTM_GETRULE + + RTM_NEWQDISC = 36, +#define RTM_NEWQDISC RTM_NEWQDISC + RTM_DELQDISC, +#define RTM_DELQDISC RTM_DELQDISC + RTM_GETQDISC, +#define RTM_GETQDISC RTM_GETQDISC + + RTM_NEWTCLASS = 40, +#define RTM_NEWTCLASS RTM_NEWTCLASS + RTM_DELTCLASS, +#define RTM_DELTCLASS RTM_DELTCLASS + RTM_GETTCLASS, +#define RTM_GETTCLASS RTM_GETTCLASS + + RTM_NEWTFILTER = 44, +#define RTM_NEWTFILTER RTM_NEWTFILTER + RTM_DELTFILTER, +#define RTM_DELTFILTER RTM_DELTFILTER + RTM_GETTFILTER, +#define RTM_GETTFILTER RTM_GETTFILTER + + RTM_NEWACTION = 48, +#define RTM_NEWACTION RTM_NEWACTION + RTM_DELACTION, +#define RTM_DELACTION RTM_DELACTION + RTM_GETACTION, +#define RTM_GETACTION RTM_GETACTION + + RTM_NEWPREFIX = 52, +#define RTM_NEWPREFIX RTM_NEWPREFIX + + RTM_GETMULTICAST = 58, +#define RTM_GETMULTICAST RTM_GETMULTICAST + + RTM_GETANYCAST = 62, +#define RTM_GETANYCAST RTM_GETANYCAST + + RTM_NEWNEIGHTBL = 64, +#define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL + RTM_GETNEIGHTBL = 66, +#define RTM_GETNEIGHTBL RTM_GETNEIGHTBL + RTM_SETNEIGHTBL, +#define RTM_SETNEIGHTBL RTM_SETNEIGHTBL + + RTM_NEWNDUSEROPT = 68, 
+#define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT + + RTM_NEWADDRLABEL = 72, +#define RTM_NEWADDRLABEL RTM_NEWADDRLABEL + RTM_DELADDRLABEL, +#define RTM_DELADDRLABEL RTM_DELADDRLABEL + RTM_GETADDRLABEL, +#define RTM_GETADDRLABEL RTM_GETADDRLABEL + + RTM_GETDCB = 78, +#define RTM_GETDCB RTM_GETDCB + RTM_SETDCB, +#define RTM_SETDCB RTM_SETDCB + + RTM_NEWNETCONF = 80, +#define RTM_NEWNETCONF RTM_NEWNETCONF + RTM_DELNETCONF, +#define RTM_DELNETCONF RTM_DELNETCONF + RTM_GETNETCONF = 82, +#define RTM_GETNETCONF RTM_GETNETCONF + + RTM_NEWMDB = 84, +#define RTM_NEWMDB RTM_NEWMDB + RTM_DELMDB = 85, +#define RTM_DELMDB RTM_DELMDB + RTM_GETMDB = 86, +#define RTM_GETMDB RTM_GETMDB + + RTM_NEWNSID = 88, +#define RTM_NEWNSID RTM_NEWNSID + RTM_DELNSID = 89, +#define RTM_DELNSID RTM_DELNSID + RTM_GETNSID = 90, +#define RTM_GETNSID RTM_GETNSID + + RTM_NEWSTATS = 92, +#define RTM_NEWSTATS RTM_NEWSTATS + RTM_GETSTATS = 94, +#define RTM_GETSTATS RTM_GETSTATS + RTM_SETSTATS, +#define RTM_SETSTATS RTM_SETSTATS + + RTM_NEWCACHEREPORT = 96, +#define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT + + RTM_NEWCHAIN = 100, +#define RTM_NEWCHAIN RTM_NEWCHAIN + RTM_DELCHAIN, +#define RTM_DELCHAIN RTM_DELCHAIN + RTM_GETCHAIN, +#define RTM_GETCHAIN RTM_GETCHAIN + + RTM_NEWNEXTHOP = 104, +#define RTM_NEWNEXTHOP RTM_NEWNEXTHOP + RTM_DELNEXTHOP, +#define RTM_DELNEXTHOP RTM_DELNEXTHOP + RTM_GETNEXTHOP, +#define RTM_GETNEXTHOP RTM_GETNEXTHOP + + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + + RTM_NEWVLAN = 112, +#define RTM_NEWNVLAN RTM_NEWVLAN /* NOTE(review): alias is spelled RTM_NEWNVLAN (extra N) while the enumerator is RTM_NEWVLAN; every other alias in this enum matches its enumerator, so no RTM_NEWVLAN macro is defined here. Presumably a typo - confirm before renaming. */ + RTM_DELVLAN, +#define RTM_DELVLAN RTM_DELVLAN + RTM_GETVLAN, +#define RTM_GETVLAN RTM_GETVLAN + + RTM_NEWNEXTHOPBUCKET = 116, +#define RTM_NEWNEXTHOPBUCKET RTM_NEWNEXTHOPBUCKET + RTM_DELNEXTHOPBUCKET, +#define RTM_DELNEXTHOPBUCKET RTM_DELNEXTHOPBUCKET + RTM_GETNEXTHOPBUCKET, +#define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + +
RTM_NEWTUNNEL = 120, +#define RTM_NEWTUNNEL RTM_NEWTUNNEL + RTM_DELTUNNEL, +#define RTM_DELTUNNEL RTM_DELTUNNEL + RTM_GETTUNNEL, +#define RTM_GETTUNNEL RTM_GETTUNNEL + + __RTM_MAX, +#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) +}; + +#define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) +#define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) +#define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) + +/* + Generic structure for encapsulation of optional route information. + It is reminiscent of sockaddr, but with sa_family replaced + with attribute type. + */ + +struct rtattr { + unsigned short rta_len; + unsigned short rta_type; +}; + +/* Macros to handle rtattributes */ + +#define RTA_ALIGNTO 4U +#define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) +#define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ + (rta)->rta_len >= sizeof(struct rtattr) && \ + (rta)->rta_len <= (len)) +#define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ + (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) +#define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) +#define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) +#define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) +#define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0)) + + + + +/****************************************************************************** + * Definitions used in routing table administration. 
+ ****/ + +struct rtmsg { + unsigned char rtm_family; + unsigned char rtm_dst_len; + unsigned char rtm_src_len; + unsigned char rtm_tos; + + unsigned char rtm_table; /* Routing table id */ + unsigned char rtm_protocol; /* Routing protocol; see below */ + unsigned char rtm_scope; /* See below */ + unsigned char rtm_type; /* See below */ + + unsigned rtm_flags; +}; + +/* rtm_type */ + +enum { + RTN_UNSPEC, + RTN_UNICAST, /* Gateway or direct route */ + RTN_LOCAL, /* Accept locally */ + RTN_BROADCAST, /* Accept locally as broadcast, + send as broadcast */ + RTN_ANYCAST, /* Accept locally as broadcast, + but send as unicast */ + RTN_MULTICAST, /* Multicast route */ + RTN_BLACKHOLE, /* Drop */ + RTN_UNREACHABLE, /* Destination is unreachable */ + RTN_PROHIBIT, /* Administratively prohibited */ + RTN_THROW, /* Not in this table */ + RTN_NAT, /* Translate this address */ + RTN_XRESOLVE, /* Use external resolver */ + __RTN_MAX +}; + +#define RTN_MAX (__RTN_MAX - 1) + + +/* rtm_protocol */ + +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; + not used by current IPv4 */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ + +/* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; + they are just passed from user and back as is. + It will be used by hypothetical multiple routing daemons. + Note that protocol values should be standardized in order to + avoid conflicts. 
+ */ + +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC/ND router advertisements */ +#define RTPROT_MRT 10 /* Merit MRT */ +#define RTPROT_ZEBRA 11 /* Zebra */ +#define RTPROT_BIRD 12 /* BIRD */ +#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ +#define RTPROT_XORP 14 /* XORP */ +#define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ +#define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ + +/* rtm_scope + + Really it is not scope, but sort of distance to the destination. + NOWHERE are reserved for not existing destinations, HOST is our + local addresses, LINK are destinations, located on directly attached + link and UNIVERSE is everywhere in the Universe. + + Intermediate values are also possible f.e. interior routes + could be assigned a value between UNIVERSE and LINK. 
+*/ + +enum rt_scope_t { + RT_SCOPE_UNIVERSE=0, +/* User defined values */ + RT_SCOPE_SITE=200, + RT_SCOPE_LINK=253, + RT_SCOPE_HOST=254, + RT_SCOPE_NOWHERE=255 +}; + +/* rtm_flags */ + +#define RTM_F_NOTIFY 0x100 /* Notify user of route change */ +#define RTM_F_CLONED 0x200 /* This route is cloned */ +#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ +#define RTM_F_PREFIX 0x800 /* Prefix addresses */ +#define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ +#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ +#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ +#define RTM_F_TRAP 0x8000 /* route is trapping packets */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed, this value + * is chosen to avoid conflicts with + * other flags defined in + * include/uapi/linux/ipv6_route.h + */ + +/* Reserved table identifiers */ + +enum rt_class_t { + RT_TABLE_UNSPEC=0, +/* User defined values */ + RT_TABLE_COMPAT=252, + RT_TABLE_DEFAULT=253, + RT_TABLE_MAIN=254, + RT_TABLE_LOCAL=255, + RT_TABLE_MAX=0xFFFFFFFF +}; + + +/* Routing message attributes */ + +enum rtattr_type_t { + RTA_UNSPEC, + RTA_DST, + RTA_SRC, + RTA_IIF, + RTA_OIF, + RTA_GATEWAY, + RTA_PRIORITY, + RTA_PREFSRC, + RTA_METRICS, + RTA_MULTIPATH, + RTA_PROTOINFO, /* no longer used */ + RTA_FLOW, + RTA_CACHEINFO, + RTA_SESSION, /* no longer used */ + RTA_MP_ALGO, /* no longer used */ + RTA_TABLE, + RTA_MARK, + RTA_MFC_STATS, + RTA_VIA, + RTA_NEWDST, + RTA_PREF, + RTA_ENCAP_TYPE, + RTA_ENCAP, + RTA_EXPIRES, + RTA_PAD, + RTA_UID, + RTA_TTL_PROPAGATE, + RTA_IP_PROTO, + RTA_SPORT, + RTA_DPORT, + RTA_NH_ID, + __RTA_MAX +}; + +#define RTA_MAX (__RTA_MAX - 1) + +#define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) +#define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) + +/* RTM_MULTIPATH --- array of struct rtnexthop. + * + * "struct rtnexthop" describes all necessary nexthop information, + * i.e. 
parameters of path to a destination via this nexthop. + * + * At the moment it is impossible to set different prefsrc, mtu, window + * and rtt for different paths from multipath. + */ + +struct rtnexthop { + unsigned short rtnh_len; + unsigned char rtnh_flags; + unsigned char rtnh_hops; + int rtnh_ifindex; +}; + +/* rtnh_flags */ + +#define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ +#define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ +#define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_OFFLOAD 8 /* Nexthop is offloaded */ +#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ +#define RTNH_F_UNRESOLVED 32 /* The entry is unresolved (ipmr) */ +#define RTNH_F_TRAP 64 /* Nexthop is trapping packets */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \ + RTNH_F_OFFLOAD | RTNH_F_TRAP) + +/* Macros to handle nexthops */ + +#define RTNH_ALIGNTO 4 +#define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) +#define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ + ((int)(rtnh)->rtnh_len) <= (len)) +#define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) +#define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) +#define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) +#define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) + +/* RTA_VIA */ +struct rtvia { + __kernel_sa_family_t rtvia_family; + __u8 rtvia_addr[]; +}; + +/* RTA_CACHEINFO */ + +struct rta_cacheinfo { + __u32 rta_clntref; + __u32 rta_lastuse; + __s32 rta_expires; + __u32 rta_error; + __u32 rta_used; + +#define RTNETLINK_HAVE_PEERINFO 1 + __u32 rta_id; + __u32 rta_ts; + __u32 rta_tsage; +}; + +/* RTA_METRICS --- array of struct rtattr with types of RTAX_* */ + +enum { + RTAX_UNSPEC, +#define RTAX_UNSPEC RTAX_UNSPEC + RTAX_LOCK, +#define RTAX_LOCK RTAX_LOCK + RTAX_MTU, +#define RTAX_MTU RTAX_MTU + RTAX_WINDOW, +#define RTAX_WINDOW RTAX_WINDOW +
RTAX_RTT, +#define RTAX_RTT RTAX_RTT + RTAX_RTTVAR, +#define RTAX_RTTVAR RTAX_RTTVAR + RTAX_SSTHRESH, +#define RTAX_SSTHRESH RTAX_SSTHRESH + RTAX_CWND, +#define RTAX_CWND RTAX_CWND + RTAX_ADVMSS, +#define RTAX_ADVMSS RTAX_ADVMSS + RTAX_REORDERING, +#define RTAX_REORDERING RTAX_REORDERING + RTAX_HOPLIMIT, +#define RTAX_HOPLIMIT RTAX_HOPLIMIT + RTAX_INITCWND, +#define RTAX_INITCWND RTAX_INITCWND + RTAX_FEATURES, +#define RTAX_FEATURES RTAX_FEATURES + RTAX_RTO_MIN, +#define RTAX_RTO_MIN RTAX_RTO_MIN + RTAX_INITRWND, +#define RTAX_INITRWND RTAX_INITRWND + RTAX_QUICKACK, +#define RTAX_QUICKACK RTAX_QUICKACK + RTAX_CC_ALGO, +#define RTAX_CC_ALGO RTAX_CC_ALGO + RTAX_FASTOPEN_NO_COOKIE, +#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE + __RTAX_MAX +}; + +#define RTAX_MAX (__RTAX_MAX - 1) + +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) + +struct rta_session { + __u8 proto; + __u8 pad1; + __u16 pad2; + + union { + struct { + __u16 sport; + __u16 dport; + } ports; + + struct { + __u8 type; + __u8 code; + __u16 ident; + } icmpt; + + __u32 spi; + } u; +}; + +struct rta_mfc_stats { + __u64 mfcs_packets; + __u64 mfcs_bytes; + __u64 mfcs_wrong_if; +}; + +/**** + * General form of address family dependent message. + ****/ + +struct rtgenmsg { + unsigned char rtgen_family; +}; + +/***************************************************************** + * Link layer specific messages. + ****/ + +/* struct ifinfomsg + * passes link level specific information, not dependent + * on network protocol. 
+ */ + +struct ifinfomsg { + unsigned char ifi_family; + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ +}; + +/******************************************************************** + * prefix information + ****/ + +struct prefixmsg { + unsigned char prefix_family; + unsigned char prefix_pad1; + unsigned short prefix_pad2; + int prefix_ifindex; + unsigned char prefix_type; + unsigned char prefix_len; + unsigned char prefix_flags; + unsigned char prefix_pad3; +}; + +enum +{ + PREFIX_UNSPEC, + PREFIX_ADDRESS, + PREFIX_CACHEINFO, + __PREFIX_MAX +}; + +#define PREFIX_MAX (__PREFIX_MAX - 1) + +struct prefix_cacheinfo { + __u32 preferred_time; + __u32 valid_time; +}; + + +/***************************************************************** + * Traffic control messages. + ****/ + +struct tcmsg { + unsigned char tcm_family; + unsigned char tcm__pad1; + unsigned short tcm__pad2; + int tcm_ifindex; + __u32 tcm_handle; + __u32 tcm_parent; +/* tcm_block_index is used instead of tcm_parent + * in case tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK + */ +#define tcm_block_index tcm_parent + __u32 tcm_info; +}; + +/* For manipulation of filters in shared block, tcm_ifindex is set to + * TCM_IFINDEX_MAGIC_BLOCK, and tcm_parent is aliased to tcm_block_index + * which is the block index. + */ +#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU) + +enum { + TCA_UNSPEC, + TCA_KIND, + TCA_OPTIONS, + TCA_STATS, + TCA_XSTATS, + TCA_RATE, + TCA_FCNT, + TCA_STATS2, + TCA_STAB, + TCA_PAD, + TCA_DUMP_INVISIBLE, + TCA_CHAIN, + TCA_HW_OFFLOAD, + TCA_INGRESS_BLOCK, + TCA_EGRESS_BLOCK, + TCA_DUMP_FLAGS, + __TCA_MAX +}; + +#define TCA_MAX (__TCA_MAX - 1) + +#define TCA_DUMP_FLAGS_TERSE (1 << 0) /* Means that in dump user gets only basic + * data necessary to identify the objects + * (handle, cookie, etc.) and stats. 
+ */ + +#define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) +#define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) + +/******************************************************************** + * Neighbor Discovery userland options + ****/ + +struct nduseroptmsg { + unsigned char nduseropt_family; + unsigned char nduseropt_pad1; + unsigned short nduseropt_opts_len; /* Total length of options */ + int nduseropt_ifindex; + __u8 nduseropt_icmp_type; + __u8 nduseropt_icmp_code; + unsigned short nduseropt_pad2; + unsigned int nduseropt_pad3; + /* Followed by one or more ND options */ +}; + +enum { + NDUSEROPT_UNSPEC, + NDUSEROPT_SRCADDR, + __NDUSEROPT_MAX +}; + +#define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) + +#ifndef __KERNEL__ +/* RTnetlink multicast groups - backwards compatibility for userspace */ +#define RTMGRP_LINK 1 +#define RTMGRP_NOTIFY 2 +#define RTMGRP_NEIGH 4 +#define RTMGRP_TC 8 + +#define RTMGRP_IPV4_IFADDR 0x10 +#define RTMGRP_IPV4_MROUTE 0x20 +#define RTMGRP_IPV4_ROUTE 0x40 +#define RTMGRP_IPV4_RULE 0x80 + +#define RTMGRP_IPV6_IFADDR 0x100 +#define RTMGRP_IPV6_MROUTE 0x200 +#define RTMGRP_IPV6_ROUTE 0x400 +#define RTMGRP_IPV6_IFINFO 0x800 + +#define RTMGRP_DECnet_IFADDR 0x1000 +#define RTMGRP_DECnet_ROUTE 0x4000 + +#define RTMGRP_IPV6_PREFIX 0x20000 +#endif + +/* RTnetlink multicast groups */ +enum rtnetlink_groups { + RTNLGRP_NONE, +#define RTNLGRP_NONE RTNLGRP_NONE + RTNLGRP_LINK, +#define RTNLGRP_LINK RTNLGRP_LINK + RTNLGRP_NOTIFY, +#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY + RTNLGRP_NEIGH, +#define RTNLGRP_NEIGH RTNLGRP_NEIGH + RTNLGRP_TC, +#define RTNLGRP_TC RTNLGRP_TC + RTNLGRP_IPV4_IFADDR, +#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR + RTNLGRP_IPV4_MROUTE, +#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE + RTNLGRP_IPV4_ROUTE, +#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE + RTNLGRP_IPV4_RULE, +#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE + RTNLGRP_IPV6_IFADDR, +#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR + 
RTNLGRP_IPV6_MROUTE, +#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE + RTNLGRP_IPV6_ROUTE, +#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE + RTNLGRP_IPV6_IFINFO, +#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO + RTNLGRP_DECnet_IFADDR, +#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR + RTNLGRP_NOP2, + RTNLGRP_DECnet_ROUTE, +#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE + RTNLGRP_DECnet_RULE, +#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE + RTNLGRP_NOP4, + RTNLGRP_IPV6_PREFIX, +#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX + RTNLGRP_IPV6_RULE, +#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE + RTNLGRP_ND_USEROPT, +#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT + RTNLGRP_PHONET_IFADDR, +#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR + RTNLGRP_PHONET_ROUTE, +#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE + RTNLGRP_DCB, +#define RTNLGRP_DCB RTNLGRP_DCB + RTNLGRP_IPV4_NETCONF, +#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF + RTNLGRP_IPV6_NETCONF, +#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF + RTNLGRP_MDB, +#define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID + RTNLGRP_MPLS_NETCONF, +#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF + RTNLGRP_IPV4_MROUTE_R, +#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R + RTNLGRP_IPV6_MROUTE_R, +#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN + RTNLGRP_MCTP_IFADDR, +#define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR + RTNLGRP_TUNNEL, +#define RTNLGRP_TUNNEL RTNLGRP_TUNNEL + RTNLGRP_STATS, +#define RTNLGRP_STATS RTNLGRP_STATS + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) + +/* TC action piece */ +struct tcamsg { + unsigned char tca_family; + unsigned char tca__pad1; + unsigned short tca__pad2; +}; + +enum { + TCA_ROOT_UNSPEC, + TCA_ROOT_TAB, +#define TCA_ACT_TAB TCA_ROOT_TAB 
+#define TCAA_MAX TCA_ROOT_TAB + TCA_ROOT_FLAGS, + TCA_ROOT_COUNT, + TCA_ROOT_TIME_DELTA, /* in msecs */ + __TCA_ROOT_MAX, +#define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) +}; + +#define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) +#define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS + * + * TCA_ACT_FLAG_LARGE_DUMP_ON user->kernel to request for larger than + * TCA_ACT_MAX_PRIO actions in a dump. All dump responses will contain the + * number of actions being dumped stored in for user app's consumption in + * TCA_ROOT_COUNT + * + * TCA_ACT_FLAG_TERSE_DUMP user->kernel to request terse (brief) dump that only + * includes essential action info (kind, index, etc.) + * + */ +#define TCA_FLAG_LARGE_DUMP_ON (1 << 0) +#define TCA_ACT_FLAG_LARGE_DUMP_ON TCA_FLAG_LARGE_DUMP_ON +#define TCA_ACT_FLAG_TERSE_DUMP (1 << 1) + +/* New extended info filters for IFLA_EXT_MASK */ +#define RTEXT_FILTER_VF (1 << 0) +#define RTEXT_FILTER_BRVLAN (1 << 1) +#define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2) +#define RTEXT_FILTER_SKIP_STATS (1 << 3) +#define RTEXT_FILTER_MRP (1 << 4) +#define RTEXT_FILTER_CFM_CONFIG (1 << 5) +#define RTEXT_FILTER_CFM_STATUS (1 << 6) +#define RTEXT_FILTER_MST (1 << 7) + +/* End of information exported to user level */ + + + +#endif /* _UAPI__LINUX_RTNETLINK_H */ diff --git a/src/basic/linux/stddef.h b/src/basic/linux/stddef.h new file mode 100644 index 0000000..1a73963 --- /dev/null +++ b/src/basic/linux/stddef.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_STDDEF_H +#define _UAPI_LINUX_STDDEF_H + + +#ifndef __always_inline +#define __always_inline inline +#endif + +/** + * __struct_group() - Create a mirrored named and anonymous struct + * + * @TAG: The tag name for the named sub-struct (usually empty) + * @NAME: The identifier name of the mirrored sub-struct + * @ATTRS: Any struct attributes (usually empty) + *
@MEMBERS: The member declarations for the mirrored structs + * + * Used to create an anonymous union of two structs with identical layout + * and size: one anonymous and one named. The former's members can be used + * normally without sub-struct naming, and the latter can be used to + * reason about the start, end, and size of the group of struct members. + * The named struct can also be explicitly tagged for layer reuse, as well + * as both having struct attributes appended. + */ +#define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ + union { \ + struct { MEMBERS } ATTRS; \ + struct TAG { MEMBERS } ATTRS NAME; \ + } + +/** + * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union + * + * @TYPE: The type of each flexible array element + * @NAME: The name of the flexible array member + * + * In order to have a flexible array member in a union or alone in a + * struct, it needs to be wrapped in an anonymous struct with at least 1 + * named member, but that member can be empty. + */ +#define __DECLARE_FLEX_ARRAY(TYPE, NAME) \ + struct { \ + struct { } __empty_ ## NAME; \ + TYPE NAME[]; \ + } +#endif diff --git a/src/basic/linux/update.sh b/src/basic/linux/update.sh new file mode 100755 index 0000000..6155766 --- /dev/null +++ b/src/basic/linux/update.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu +set -o pipefail + +for i in *.h */*.h; do + curl --fail "https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/$i" -o "$i" + + sed -r -i -e 's/__user //g' -e '/^#include / d' "$i" + sed -r -i 's/^(#include )/#if WANT_LINUX_FS_H\n\1\n#endif/' "$i" +done diff --git a/src/basic/linux/wireguard.h b/src/basic/linux/wireguard.h new file mode 100644 index 0000000..ae88be1 --- /dev/null +++ b/src/basic/linux/wireguard.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
+ * + * Documentation + * ============= + * + * The below enums and macros are for interfacing with WireGuard, using generic + * netlink, with family WG_GENL_NAME and version WG_GENL_VERSION. It defines two + * methods: get and set. Note that while they share many common attributes, + * these two functions actually accept a slightly different set of inputs and + * outputs. + * + * WG_CMD_GET_DEVICE + * ----------------- + * + * May only be called via NLM_F_REQUEST | NLM_F_DUMP. The command should contain + * one but not both of: + * + * WGDEVICE_A_IFINDEX: NLA_U32 + * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1 + * + * The kernel will then return several messages (NLM_F_MULTI) containing the + * following tree of nested items: + * + * WGDEVICE_A_IFINDEX: NLA_U32 + * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1 + * WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN + * WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN + * WGDEVICE_A_LISTEN_PORT: NLA_U16 + * WGDEVICE_A_FWMARK: NLA_U32 + * WGDEVICE_A_PEERS: NLA_NESTED + * 0: NLA_NESTED + * WGPEER_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN + * WGPEER_A_PRESHARED_KEY: NLA_EXACT_LEN, len WG_KEY_LEN + * WGPEER_A_ENDPOINT: NLA_MIN_LEN(struct sockaddr), struct sockaddr_in or struct sockaddr_in6 + * WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16 + * WGPEER_A_LAST_HANDSHAKE_TIME: NLA_EXACT_LEN, struct __kernel_timespec + * WGPEER_A_RX_BYTES: NLA_U64 + * WGPEER_A_TX_BYTES: NLA_U64 + * WGPEER_A_ALLOWEDIPS: NLA_NESTED + * 0: NLA_NESTED + * WGALLOWEDIP_A_FAMILY: NLA_U16 + * WGALLOWEDIP_A_IPADDR: NLA_MIN_LEN(struct in_addr), struct in_addr or struct in6_addr + * WGALLOWEDIP_A_CIDR_MASK: NLA_U8 + * 0: NLA_NESTED + * ... + * 0: NLA_NESTED + * ... + * ... + * WGPEER_A_PROTOCOL_VERSION: NLA_U32 + * 0: NLA_NESTED + * ... + * ... + * + * It is possible that all of the allowed IPs of a single peer will not + * fit within a single netlink message. 
In that case, the same peer will + * be written in the following message, except it will only contain + * WGPEER_A_PUBLIC_KEY and WGPEER_A_ALLOWEDIPS. This may occur several + * times in a row for the same peer. It is then up to the receiver to + * coalesce adjacent peers. Likewise, it is possible that all peers will + * not fit within a single message. So, subsequent peers will be sent + * in following messages, except those will only contain WGDEVICE_A_IFNAME + * and WGDEVICE_A_PEERS. It is then up to the receiver to coalesce these + * messages to form the complete list of peers. + * + * Since this is an NLA_F_DUMP command, the final message will always be + * NLMSG_DONE, even if an error occurs. However, this NLMSG_DONE message + * contains an integer error code. It is either zero or a negative error + * code corresponding to the errno. + * + * WG_CMD_SET_DEVICE + * ----------------- + * + * May only be called via NLM_F_REQUEST. The command should contain the + * following tree of nested items, containing one but not both of + * WGDEVICE_A_IFINDEX and WGDEVICE_A_IFNAME: + * + * WGDEVICE_A_IFINDEX: NLA_U32 + * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1 + * WGDEVICE_A_FLAGS: NLA_U32, 0 or WGDEVICE_F_REPLACE_PEERS if all current + * peers should be removed prior to adding the list below. + * WGDEVICE_A_PRIVATE_KEY: len WG_KEY_LEN, all zeros to remove + * WGDEVICE_A_LISTEN_PORT: NLA_U16, 0 to choose randomly + * WGDEVICE_A_FWMARK: NLA_U32, 0 to disable + * WGDEVICE_A_PEERS: NLA_NESTED + * 0: NLA_NESTED + * WGPEER_A_PUBLIC_KEY: len WG_KEY_LEN + * WGPEER_A_FLAGS: NLA_U32, 0 and/or WGPEER_F_REMOVE_ME if the + * specified peer should not exist at the end of the + * operation, rather than added/updated and/or + * WGPEER_F_REPLACE_ALLOWEDIPS if all current allowed + * IPs of this peer should be removed prior to adding + * the list below and/or WGPEER_F_UPDATE_ONLY if the + * peer should only be set if it already exists. 
+ * WGPEER_A_PRESHARED_KEY: len WG_KEY_LEN, all zeros to remove + * WGPEER_A_ENDPOINT: struct sockaddr_in or struct sockaddr_in6 + * WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16, 0 to disable + * WGPEER_A_ALLOWEDIPS: NLA_NESTED + * 0: NLA_NESTED + * WGALLOWEDIP_A_FAMILY: NLA_U16 + * WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr + * WGALLOWEDIP_A_CIDR_MASK: NLA_U8 + * 0: NLA_NESTED + * ... + * 0: NLA_NESTED + * ... + * ... + * WGPEER_A_PROTOCOL_VERSION: NLA_U32, should not be set or used at + * all by most users of this API, as the + * most recent protocol will be used when + * this is unset. Otherwise, must be set + * to 1. + * 0: NLA_NESTED + * ... + * ... + * + * It is possible that the amount of configuration data exceeds that of + * the maximum message length accepted by the kernel. In that case, several + * messages should be sent one after another, with each successive one + * filling in information not contained in the prior. Note that if + * WGDEVICE_F_REPLACE_PEERS is specified in the first message, it probably + * should not be specified in fragments that come after, so that the list + * of peers is only cleared the first time but appended after. Likewise for + * peers, if WGPEER_F_REPLACE_ALLOWEDIPS is specified in the first message + * of a peer, it likely should not be specified in subsequent fragments. + * + * If an error occurs, NLMSG_ERROR will reply containing an errno. 
+ */ + +#ifndef _WG_UAPI_WIREGUARD_H +#define _WG_UAPI_WIREGUARD_H + +#define WG_GENL_NAME "wireguard" +#define WG_GENL_VERSION 1 + +#define WG_KEY_LEN 32 + +enum wg_cmd { + WG_CMD_GET_DEVICE, + WG_CMD_SET_DEVICE, + __WG_CMD_MAX +}; +#define WG_CMD_MAX (__WG_CMD_MAX - 1) + +enum wgdevice_flag { + WGDEVICE_F_REPLACE_PEERS = 1U << 0, + __WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS +}; +enum wgdevice_attribute { + WGDEVICE_A_UNSPEC, + WGDEVICE_A_IFINDEX, + WGDEVICE_A_IFNAME, + WGDEVICE_A_PRIVATE_KEY, + WGDEVICE_A_PUBLIC_KEY, + WGDEVICE_A_FLAGS, + WGDEVICE_A_LISTEN_PORT, + WGDEVICE_A_FWMARK, + WGDEVICE_A_PEERS, + __WGDEVICE_A_LAST +}; +#define WGDEVICE_A_MAX (__WGDEVICE_A_LAST - 1) + +enum wgpeer_flag { + WGPEER_F_REMOVE_ME = 1U << 0, + WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1, + WGPEER_F_UPDATE_ONLY = 1U << 2, + __WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS | + WGPEER_F_UPDATE_ONLY +}; +enum wgpeer_attribute { + WGPEER_A_UNSPEC, + WGPEER_A_PUBLIC_KEY, + WGPEER_A_PRESHARED_KEY, + WGPEER_A_FLAGS, + WGPEER_A_ENDPOINT, + WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, + WGPEER_A_LAST_HANDSHAKE_TIME, + WGPEER_A_RX_BYTES, + WGPEER_A_TX_BYTES, + WGPEER_A_ALLOWEDIPS, + WGPEER_A_PROTOCOL_VERSION, + __WGPEER_A_LAST +}; +#define WGPEER_A_MAX (__WGPEER_A_LAST - 1) + +enum wgallowedip_attribute { + WGALLOWEDIP_A_UNSPEC, + WGALLOWEDIP_A_FAMILY, + WGALLOWEDIP_A_IPADDR, + WGALLOWEDIP_A_CIDR_MASK, + __WGALLOWEDIP_A_LAST +}; +#define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1) + +#endif /* _WG_UAPI_WIREGUARD_H */ diff --git a/src/basic/list.h b/src/basic/list.h new file mode 100644 index 0000000..10e6954 --- /dev/null +++ b/src/basic/list.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* The head of the linked list. Use this in the structure that shall + * contain the head of the linked list */ +#define LIST_HEAD(t,name) \ + t *name + +/* The pointers in the linked list's items. 
Use this in the item structure */ +#define LIST_FIELDS(t,name) \ + t *name##_next, *name##_prev + +/* Initialize the list's head */ +#define LIST_HEAD_INIT(head) \ + do { \ + (head) = NULL; \ + } while (false) + +/* Initialize a list item */ +#define LIST_INIT(name,item) \ + do { \ + typeof(*(item)) *_item = (item); \ + assert(_item); \ + _item->name##_prev = _item->name##_next = NULL; \ + } while (false) + +/* Prepend an item to the list */ +#define LIST_PREPEND(name,head,item) \ + ({ \ + typeof(*(head)) **_head = &(head), *_item = (item); \ + assert(_item); \ + if ((_item->name##_next = *_head)) \ + _item->name##_next->name##_prev = _item; \ + _item->name##_prev = NULL; \ + *_head = _item; \ + _item; \ + }) + +/* Append an item to the list */ +#define LIST_APPEND(name,head,item) \ + ({ \ + typeof(*(head)) **_hhead = &(head), *_tail; \ + _tail = LIST_FIND_TAIL(name, *_hhead); \ + LIST_INSERT_AFTER(name, *_hhead, _tail, item); \ + }) + +/* Remove an item from the list */ +#define LIST_REMOVE(name,head,item) \ + ({ \ + typeof(*(head)) **_head = &(head), *_item = (item); \ + assert(_item); \ + if (_item->name##_next) \ + _item->name##_next->name##_prev = _item->name##_prev; \ + if (_item->name##_prev) \ + _item->name##_prev->name##_next = _item->name##_next; \ + else { \ + assert(*_head == _item); \ + *_head = _item->name##_next; \ + } \ + _item->name##_next = _item->name##_prev = NULL; \ + _item; \ + }) + +/* Find the head of the list */ +#define LIST_FIND_HEAD(name,item) \ + ({ \ + typeof(*(item)) *_item = (item); \ + while (_item && _item->name##_prev) \ + _item = _item->name##_prev; \ + _item; \ + }) + +/* Find the tail of the list */ +#define LIST_FIND_TAIL(name,item) \ + ({ \ + typeof(*(item)) *_item = (item); \ + while (_item && _item->name##_next) \ + _item = _item->name##_next; \ + _item; \ + }) + +/* Insert an item after another one (a = where, b = what) */ +#define LIST_INSERT_AFTER(name,head,a,b) \ + ({ \ + typeof(*(head)) **_head = &(head), *_a = (a), 
*_b = (b); \ + assert(_b); \ + if (!_a) { \ + if ((_b->name##_next = *_head)) \ + _b->name##_next->name##_prev = _b; \ + _b->name##_prev = NULL; \ + *_head = _b; \ + } else { \ + if ((_b->name##_next = _a->name##_next)) \ + _b->name##_next->name##_prev = _b; \ + _b->name##_prev = _a; \ + _a->name##_next = _b; \ + } \ + _b; \ + }) + +/* Insert an item before another one (a = where, b = what) */ +#define LIST_INSERT_BEFORE(name,head,a,b) \ + ({ \ + typeof(*(head)) **_head = &(head), *_a = (a), *_b = (b); \ + assert(_b); \ + if (!_a) { \ + if (!*_head) { \ + _b->name##_next = NULL; \ + _b->name##_prev = NULL; \ + *_head = _b; \ + } else { \ + typeof(*(head)) *_tail = (head); \ + while (_tail->name##_next) \ + _tail = _tail->name##_next; \ + _b->name##_next = NULL; \ + _b->name##_prev = _tail; \ + _tail->name##_next = _b; \ + } \ + } else { \ + if ((_b->name##_prev = _a->name##_prev)) \ + _b->name##_prev->name##_next = _b; \ + else \ + *_head = _b; \ + _b->name##_next = _a; \ + _a->name##_prev = _b; \ + } \ + _b; \ + }) + +#define LIST_JUST_US(name, item) \ + ({ \ + typeof(*(item)) *_item = (item); \ + !(_item)->name##_prev && !(_item)->name##_next; \ + }) + +/* The type of the iterator 'i' is automatically determined by the type of 'head', and declared in the + * loop. Hence, do not declare the same variable in the outer scope. Sometimes, we set 'head' through + * hashmap_get(). In that case, you need to explicitly cast the result. 
*/ +#define LIST_FOREACH_WITH_NEXT(name,i,n,head) \ + for (typeof(*(head)) *n, *i = (head); i && (n = i->name##_next, true); i = n) + +#define LIST_FOREACH(name,i,head) \ + LIST_FOREACH_WITH_NEXT(name, i, UNIQ_T(n, UNIQ), head) + +#define _LIST_FOREACH_WITH_PREV(name,i,p,start) \ + for (typeof(*(start)) *p, *i = (start); i && (p = i->name##_prev, true); i = p) + +#define LIST_FOREACH_BACKWARDS(name,i,start) \ + _LIST_FOREACH_WITH_PREV(name, i, UNIQ_T(p, UNIQ), start) + +/* Iterate through all the members of the list p is included in, but skip over p */ +#define LIST_FOREACH_OTHERS(name,i,p) \ + for (typeof(*(p)) *_p = (p), *i = ({ \ + typeof(*_p) *_j = _p; \ + while (_j && _j->name##_prev) \ + _j = _j->name##_prev; \ + if (_j == _p) \ + _j = _p->name##_next; \ + _j; \ + }); \ + i; \ + i = i->name##_next == _p ? _p->name##_next : i->name##_next) + +/* Loop starting from p->next until p->prev. p can be adjusted meanwhile. */ +#define LIST_LOOP_BUT_ONE(name,i,head,p) \ + for (typeof(*(p)) *i = (p)->name##_next ? (p)->name##_next : (head); \ + i != (p); \ + i = i->name##_next ? 
i->name##_next : (head)) + +/* Join two lists tail to head: a->b, c->d to a->b->c->d and de-initialise second list */ +#define LIST_JOIN(name,a,b) \ + ({ \ + assert(b); \ + if (!(a)) \ + (a) = (b); \ + else { \ + typeof(*(a)) *_head = (b), *_tail; \ + _tail = LIST_FIND_TAIL(name, (a)); \ + _tail->name##_next = _head; \ + _head->name##_prev = _tail; \ + } \ + (b) = NULL; \ + a; \ + }) + +#define LIST_POP(name, a) \ + ({ \ + typeof(a)* _a = &(a); \ + typeof(a) _p = *_a; \ + if (_p) \ + LIST_REMOVE(name, *_a, _p); \ + _p; \ + }) + +#define LIST_CLEAR(name, head, free_func) \ + _LIST_CLEAR(name, head, free_func, UNIQ_T(elem, UNIQ)) + +/* Clear the list, destroying each element with free_func */ +#define _LIST_CLEAR(name, head, free_func, elem) \ + ({ \ + typeof(head) elem; \ + while ((elem = LIST_POP(name, head))) \ + free_func(elem); \ + head; \ + }) + +/* Now include "macro.h", because we want our definition of assert() which the macros above use. We include + * it down here instead of up top, since macro.h pulls in log.h which in turn needs our own definitions. */ +#include "macro.h" diff --git a/src/basic/locale-util.c b/src/basic/locale-util.c new file mode 100644 index 0000000..d3fef01 --- /dev/null +++ b/src/basic/locale-util.c @@ -0,0 +1,376 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "constants.h" +#include "dirent-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "locale-util.h" +#include "missing_syscall.h" +#include "path-util.h" +#include "set.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +static char *normalize_locale(const char *name) { + const char *e; + + /* Locale names are weird: glibc has some magic rules when looking for the charset name on disk: it + * lowercases everything, and removes most special chars. 
This means the official .UTF-8 suffix + * becomes .utf8 when looking things up on disk. When enumerating locales, let's do the reverse + * operation, and go back to ".UTF-8" which appears to be the more commonly accepted name. We only do + * that for UTF-8 however, since it's kinda the only charset that matters. */ + + e = endswith(name, ".utf8"); + if (e) { + _cleanup_free_ char *prefix = NULL; + + prefix = strndup(name, e - name); + if (!prefix) + return NULL; + + return strjoin(prefix, ".UTF-8"); + } + + e = strstr(name, ".utf8@"); + if (e) { + _cleanup_free_ char *prefix = NULL; + + prefix = strndup(name, e - name); + if (!prefix) + return NULL; + + return strjoin(prefix, ".UTF-8@", e + 6); + } + + return strdup(name); +} + +static int add_locales_from_archive(Set *locales) { + /* Adds the normalized names found in the name hash table of glibc's locale archive to 'locales', skipping entries without a locale record offset and names that are not valid UTF-8. Returns 0 on success or if the archive does not exist, and a negative errno-style value otherwise. The on-disk structures below are stolen from glibc... */ + + struct locarhead { + uint32_t magic; + /* Serial number. */ + uint32_t serial; + /* Name hash table. */ + uint32_t namehash_offset; + uint32_t namehash_used; + uint32_t namehash_size; + /* String table. */ + uint32_t string_offset; + uint32_t string_used; + uint32_t string_size; + /* Table with locale records. */ + uint32_t locrectab_offset; + uint32_t locrectab_used; + uint32_t locrectab_size; + /* MD5 sum hash table. */ + uint32_t sumhash_offset; + uint32_t sumhash_used; + uint32_t sumhash_size; + }; + + struct namehashent { + /* Hash value of the name. */ + uint32_t hashval; + /* Offset of the name in the string table. */ + uint32_t name_offset; + /* Offset of the locale record. */ + uint32_t locrec_offset; + }; + + const struct locarhead *h; + const struct namehashent *e; + const void *p = MAP_FAILED; + _cleanup_close_ int fd = -EBADF; + size_t sz = 0; + struct stat st; + int r; + + fd = open("/usr/lib/locale/locale-archive", O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return errno == ENOENT ?
0 : -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISREG(st.st_mode)) + return -EBADMSG; + + if (st.st_size < (off_t) sizeof(struct locarhead)) + return -EBADMSG; + + if (file_offset_beyond_memory_size(st.st_size)) + return -EFBIG; + sz = (size_t) st.st_size; /* Remember the mapping length: munmap() at 'finish' needs it, a length of 0 fails with EINVAL and leaks the mapping */ + p = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + return -errno; + + h = (const struct locarhead *) p; + if (h->magic != 0xde020109 || + h->namehash_offset + h->namehash_size > st.st_size || + h->string_offset + h->string_size > st.st_size || + h->locrectab_offset + h->locrectab_size > st.st_size || + h->sumhash_offset + h->sumhash_size > st.st_size) { + r = -EBADMSG; + goto finish; + } + + e = (const struct namehashent*) ((const uint8_t*) p + h->namehash_offset); + for (size_t i = 0; i < h->namehash_size; i++) { + char *z; + + if (e[i].locrec_offset == 0) + continue; + + if (!utf8_is_valid((char*) p + e[i].name_offset)) + continue; + + z = normalize_locale((char*) p + e[i].name_offset); + if (!z) { + r = -ENOMEM; + goto finish; + } + + r = set_consume(locales, z); + if (r < 0) + goto finish; + } + + r = 0; + + finish: + if (p != MAP_FAILED) + munmap((void*) p, sz); + + return r; +} + +static int add_locales_from_libdir(Set *locales) { + _cleanup_closedir_ DIR *dir = NULL; + int r; + + dir = opendir("/usr/lib/locale"); + if (!dir) + return errno == ENOENT ?
0 : -errno; + + FOREACH_DIRENT(de, dir, return -errno) { + char *z; + + if (de->d_type != DT_DIR) + continue; + + z = normalize_locale(de->d_name); + if (!z) + return -ENOMEM; + + r = set_consume(locales, z); + if (r < 0 && r != -EEXIST) + return r; + } + + return 0; +} + +int get_locales(char ***ret) { + _cleanup_set_free_free_ Set *locales = NULL; + _cleanup_strv_free_ char **l = NULL; + int r; + + locales = set_new(&string_hash_ops); + if (!locales) + return -ENOMEM; + + r = add_locales_from_archive(locales); + if (r < 0 && r != -ENOENT) + return r; + + r = add_locales_from_libdir(locales); + if (r < 0) + return r; + + char *locale; + SET_FOREACH(locale, locales) { + r = locale_is_installed(locale); + if (r < 0) + return r; + if (r == 0) + free(set_remove(locales, locale)); + } + + l = set_get_strv(locales); + if (!l) + return -ENOMEM; + + /* Now, all elements are owned by strv 'l'. Hence, do not call set_free_free(). */ + locales = set_free(locales); + + r = getenv_bool("SYSTEMD_LIST_NON_UTF8_LOCALES"); + if (r == -ENXIO || r == 0) { + char **a, **b; + + /* Filter out non-UTF-8 locales, because it's 2019, by default */ + for (a = b = l; *a; a++) { + + if (endswith(*a, "UTF-8") || + strstr(*a, ".UTF-8@")) + *(b++) = *a; + else + free(*a); + } + + *b = NULL; + + } else if (r < 0) + log_debug_errno(r, "Failed to parse $SYSTEMD_LIST_NON_UTF8_LOCALES as boolean"); + + strv_sort(l); + + *ret = TAKE_PTR(l); + + return 0; +} + +bool locale_is_valid(const char *name) { + + if (isempty(name)) + return false; + + if (strlen(name) >= 128) + return false; + + if (!utf8_is_valid(name)) + return false; + + if (!filename_is_valid(name)) + return false; + + if (!string_is_safe(name)) + return false; + + return true; +} + +int locale_is_installed(const char *name) { + if (!locale_is_valid(name)) + return false; + + if (STR_IN_SET(name, "C", "POSIX")) /* These ones are always OK */ + return true; + + _cleanup_(freelocalep) locale_t loc = + newlocale(LC_ALL_MASK, name, 0); + if 
(loc == (locale_t) 0) + return errno == ENOMEM ? -ENOMEM : false; + + return true; +} + +bool is_locale_utf8(void) { + static int cached_answer = -1; + const char *set; + int r; + + /* Returns whether the locale in effect uses UTF-8; the answer is determined once and then cached. + * Note that we default to 'true' here, since today UTF-8 is pretty much supported everywhere. */ + + if (cached_answer >= 0) + goto out; + + r = getenv_bool_secure("SYSTEMD_UTF8"); + if (r >= 0) { + cached_answer = r; + goto out; + } else if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_UTF8, ignoring: %m"); + + /* This function may be called from libsystemd, and setlocale() is not thread safe. Hence, when we are not running in the main thread, skip the locale checks below and assume yes. */ + if (gettid() != raw_getpid()) { + cached_answer = true; + goto out; + } + + if (!setlocale(LC_ALL, "")) { + cached_answer = true; + goto out; + } + + set = nl_langinfo(CODESET); + if (!set) { + cached_answer = true; + goto out; + } + + if (streq(set, "UTF-8")) { + cached_answer = true; + goto out; + } + + /* For LC_CTYPE=="C" return true, because CTYPE is effectively + * unset and everything can do UTF-8 nowadays. */ + set = setlocale(LC_CTYPE, NULL); + if (!set) { + cached_answer = true; + goto out; + } + + /* Only answer yes if the C/POSIX locale is in effect because nothing was configured, i.e. none of + * $LC_ALL, $LC_CTYPE and $LANG is set. If C was set explicitly, honour that and answer no.
*/ + cached_answer = + STR_IN_SET(set, "C", "POSIX") && + !getenv("LC_ALL") && + !getenv("LC_CTYPE") && + !getenv("LANG"); + +out: + return (bool) cached_answer; +} + +void locale_variables_free(char *l[_VARIABLE_LC_MAX]) { + free_many_charp(l, _VARIABLE_LC_MAX); +} + +void locale_variables_simplify(char *l[_VARIABLE_LC_MAX]) { + assert(l); + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) { + if (p == VARIABLE_LANG) + continue; + if (isempty(l[p]) || streq_ptr(l[VARIABLE_LANG], l[p])) + l[p] = mfree(l[p]); + } +} + +static const char * const locale_variable_table[_VARIABLE_LC_MAX] = { + [VARIABLE_LANG] = "LANG", + [VARIABLE_LANGUAGE] = "LANGUAGE", + [VARIABLE_LC_CTYPE] = "LC_CTYPE", + [VARIABLE_LC_NUMERIC] = "LC_NUMERIC", + [VARIABLE_LC_TIME] = "LC_TIME", + [VARIABLE_LC_COLLATE] = "LC_COLLATE", + [VARIABLE_LC_MONETARY] = "LC_MONETARY", + [VARIABLE_LC_MESSAGES] = "LC_MESSAGES", + [VARIABLE_LC_PAPER] = "LC_PAPER", + [VARIABLE_LC_NAME] = "LC_NAME", + [VARIABLE_LC_ADDRESS] = "LC_ADDRESS", + [VARIABLE_LC_TELEPHONE] = "LC_TELEPHONE", + [VARIABLE_LC_MEASUREMENT] = "LC_MEASUREMENT", + [VARIABLE_LC_IDENTIFICATION] = "LC_IDENTIFICATION" +}; + +DEFINE_STRING_TABLE_LOOKUP(locale_variable, LocaleVariable); diff --git a/src/basic/locale-util.h b/src/basic/locale-util.h new file mode 100644 index 0000000..81fe8d1 --- /dev/null +++ b/src/basic/locale-util.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" + +typedef enum LocaleVariable { + /* We don't list LC_ALL here on purpose. People should be + * using LANG instead.
*/ + + VARIABLE_LANG, + VARIABLE_LANGUAGE, + VARIABLE_LC_CTYPE, + VARIABLE_LC_NUMERIC, + VARIABLE_LC_TIME, + VARIABLE_LC_COLLATE, + VARIABLE_LC_MONETARY, + VARIABLE_LC_MESSAGES, + VARIABLE_LC_PAPER, + VARIABLE_LC_NAME, + VARIABLE_LC_ADDRESS, + VARIABLE_LC_TELEPHONE, + VARIABLE_LC_MEASUREMENT, + VARIABLE_LC_IDENTIFICATION, + _VARIABLE_LC_MAX, + _VARIABLE_LC_INVALID = -EINVAL, +} LocaleVariable; + +int get_locales(char ***l); +bool locale_is_valid(const char *name); +int locale_is_installed(const char *name); + +#define _(String) dgettext(GETTEXT_PACKAGE, String) +#define N_(String) String + +bool is_locale_utf8(void); + +const char* locale_variable_to_string(LocaleVariable i) _const_; +LocaleVariable locale_variable_from_string(const char *s) _pure_; + +static inline void freelocalep(locale_t *p) { + if (*p == (locale_t) 0) + return; + + freelocale(*p); +} + +void locale_variables_free(char* l[_VARIABLE_LC_MAX]); +static inline void locale_variables_freep(char*(*l)[_VARIABLE_LC_MAX]) { + locale_variables_free(*l); +} +void locale_variables_simplify(char *l[_VARIABLE_LC_MAX]); diff --git a/src/basic/lock-util.c b/src/basic/lock-util.c new file mode 100644 index 0000000..047fd01 --- /dev/null +++ b/src/basic/lock-util.c @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "lock-util.h" +#include "macro.h" +#include "missing_fcntl.h" +#include "path-util.h" +#include "process-util.h" + +int make_lock_file_at(int dir_fd, const char *p, int operation, LockFile *ret) { + _cleanup_close_ int fd = -EBADF, dfd = -EBADF; + _cleanup_free_ char *t = NULL; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(p); + assert(IN_SET(operation & ~LOCK_NB, LOCK_EX, LOCK_SH)); + assert(ret); + + if (isempty(p)) + return -EINVAL; + + /* We use UNPOSIX locks as they have nice semantics, and are mostly 
compatible with NFS. */ + + dfd = fd_reopen(dir_fd, O_CLOEXEC|O_PATH|O_DIRECTORY); + if (dfd < 0) + return dfd; + + t = strdup(p); + if (!t) + return -ENOMEM; + + fd = xopenat_lock(dfd, + p, + O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, + /* xopen_flags = */ 0, + 0600, + LOCK_UNPOSIX, + operation); + if (fd < 0) + return fd == -EAGAIN ? -EBUSY : fd; + + *ret = (LockFile) { + .dir_fd = TAKE_FD(dfd), + .path = TAKE_PTR(t), + .fd = TAKE_FD(fd), + .operation = operation, + }; + + return 0; +} + +int make_lock_file_for(const char *p, int operation, LockFile *ret) { + _cleanup_free_ char *fn = NULL, *dn = NULL, *t = NULL; + int r; + + assert(p); + assert(ret); + + r = path_extract_filename(p, &fn); + if (r < 0) + return r; + + r = path_extract_directory(p, &dn); + if (r < 0) + return r; + + t = strjoin(dn, "/.#", fn, ".lck"); + if (!t) + return -ENOMEM; + + return make_lock_file(t, operation, ret); +} + +void release_lock_file(LockFile *f) { + if (!f) + return; + + if (f->path) { + + /* If we are the exclusive owner we can safely delete + * the lock file itself. If we are not the exclusive + * owner, we can try becoming it. */ + + if (f->fd >= 0 && + (f->operation & ~LOCK_NB) == LOCK_SH && + unposix_lock(f->fd, LOCK_EX|LOCK_NB) >= 0) + f->operation = LOCK_EX|LOCK_NB; + + if ((f->operation & ~LOCK_NB) == LOCK_EX) + (void) unlinkat(f->dir_fd, f->path, 0); + + f->path = mfree(f->path); + } + + f->dir_fd = safe_close(f->dir_fd); + f->fd = safe_close(f->fd); + f->operation = 0; +} + +static int fcntl_lock(int fd, int operation, bool ofd) { + int cmd, type, r; + + assert(fd >= 0); + + if (ofd) + cmd = (operation & LOCK_NB) ? F_OFD_SETLK : F_OFD_SETLKW; + else + cmd = (operation & LOCK_NB) ? 
F_SETLK : F_SETLKW; + + switch (operation & ~LOCK_NB) { + case LOCK_EX: + type = F_WRLCK; + break; + case LOCK_SH: + type = F_RDLCK; + break; + case LOCK_UN: + type = F_UNLCK; + break; + default: + assert_not_reached(); + } + + r = RET_NERRNO(fcntl(fd, cmd, &(struct flock) { + .l_type = type, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + })); + + if (r == -EACCES) /* Treat EACCESS/EAGAIN the same as per man page. */ + r = -EAGAIN; + + return r; +} + +int posix_lock(int fd, int operation) { + return fcntl_lock(fd, operation, /*ofd=*/ false); +} + +int unposix_lock(int fd, int operation) { + return fcntl_lock(fd, operation, /*ofd=*/ true); +} + +void posix_unlockpp(int **fd) { + assert(fd); + + if (!*fd || **fd < 0) + return; + + (void) fcntl_lock(**fd, LOCK_UN, /*ofd=*/ false); + *fd = NULL; +} + +void unposix_unlockpp(int **fd) { + assert(fd); + + if (!*fd || **fd < 0) + return; + + (void) fcntl_lock(**fd, LOCK_UN, /*ofd=*/ true); + *fd = NULL; +} + +int lock_generic(int fd, LockType type, int operation) { + assert(fd >= 0); + + switch (type) { + case LOCK_NONE: + return 0; + case LOCK_BSD: + return RET_NERRNO(flock(fd, operation)); + case LOCK_POSIX: + return posix_lock(fd, operation); + case LOCK_UNPOSIX: + return unposix_lock(fd, operation); + default: + assert_not_reached(); + } +} + +int lock_generic_with_timeout(int fd, LockType type, int operation, usec_t timeout) { + _cleanup_(sigkill_waitp) pid_t pid = 0; + int r; + + assert(fd >= 0); + + /* A version of lock_generic(), but with a time-out. We do this in a child process, since the kernel + * APIs natively don't support a timeout. We set a SIGALRM timer that will kill the child after the + * timeout is hit. Returns -ETIMEDOUT if the time-out is hit, and 0 on success. + * + * This only works for BSD and UNPOSIX locks, as only those are fd-bound, and hence can be acquired + * from any process that has access to the fd. 
POSIX locks OTOH are process-bound, and hence if we'd + * acquire them in a child process they'd remain unlocked in the parent. */ + + if (type == LOCK_NONE) + return 0; + if (!IN_SET(type, LOCK_BSD, LOCK_UNPOSIX)) /* Not for POSIX locks, see above. */ + return -EOPNOTSUPP; + + /* First, try without forking anything off */ + r = lock_generic(fd, type, operation | (timeout == USEC_INFINITY ? 0 : LOCK_NB)); + if (r != -EAGAIN || timeout == 0 || FLAGS_SET(operation, LOCK_NB)) + return r; + + /* If that didn't work, try with a child */ + + r = safe_fork("(sd-flock)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid); + if (r < 0) + return log_error_errno(r, "Failed to flock block device in child process: %m"); + if (r == 0) { + struct sigevent sev = { + .sigev_notify = SIGEV_SIGNAL, + .sigev_signo = SIGALRM, + }; + timer_t id = 0; + + if (timer_create(CLOCK_MONOTONIC, &sev, &id) < 0) { + log_error_errno(errno, "Failed to allocate CLOCK_MONOTONIC timer: %m"); + _exit(EXIT_FAILURE); + } + + struct itimerspec its = {}; + timespec_store(&its.it_value, timeout); + + if (timer_settime(id, /* flags= */ 0, &its, NULL) < 0) { + log_error_errno(errno, "Failed to start CLOCK_MONOTONIC timer: %m"); + _exit(EXIT_FAILURE); + } + + if (lock_generic(fd, type, operation) < 0) { + log_error_errno(errno, "Unable to get an exclusive lock on the device: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + siginfo_t status; + r = wait_for_terminate(pid, &status); + if (r < 0) + return r; + + TAKE_PID(pid); + + switch (status.si_code) { + + case CLD_EXITED: + if (status.si_status != EXIT_SUCCESS) + return -EPROTO; + + return 0; + + case CLD_KILLED: + if (status.si_status == SIGALRM) + return -ETIMEDOUT; + + _fallthrough_; + + case CLD_DUMPED: + return -EPROTO; + + default: + assert_not_reached(); + } +} diff --git a/src/basic/lock-util.h b/src/basic/lock-util.h new file mode 100644 index 0000000..91b332f --- /dev/null +++ b/src/basic/lock-util.h @@ -0,0 +1,45 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef struct LockFile { + int dir_fd; + char *path; + int fd; + int operation; +} LockFile; + +int make_lock_file_at(int dir_fd, const char *p, int operation, LockFile *ret); +static inline int make_lock_file(const char *p, int operation, LockFile *ret) { + return make_lock_file_at(AT_FDCWD, p, operation, ret); +} +int make_lock_file_for(const char *p, int operation, LockFile *ret); +void release_lock_file(LockFile *f); + +#define LOCK_FILE_INIT { .dir_fd = -EBADF, .fd = -EBADF } + +/* POSIX locks with the same interface as flock(). */ +int posix_lock(int fd, int operation); +void posix_unlockpp(int **fd); + +#define CLEANUP_POSIX_UNLOCK(fd) \ + _cleanup_(posix_unlockpp) _unused_ int *CONCATENATE(_cleanup_posix_unlock_, UNIQ) = &(fd) + +/* Open File Description locks with the same interface as flock(). */ +int unposix_lock(int fd, int operation); +void unposix_unlockpp(int **fd); + +#define CLEANUP_UNPOSIX_UNLOCK(fd) \ + _cleanup_(unposix_unlockpp) _unused_ int *CONCATENATE(_cleanup_unposix_unlock_, UNIQ) = &(fd) + +typedef enum LockType { + LOCK_NONE, /* Don't lock the file descriptor. Useful if you need to conditionally lock a file. 
*/ + LOCK_BSD, + LOCK_POSIX, + LOCK_UNPOSIX, +} LockType; + +int lock_generic(int fd, LockType type, int operation); + +int lock_generic_with_timeout(int fd, LockType type, int operation, usec_t timeout); diff --git a/src/basic/log.c b/src/basic/log.c new file mode 100644 index 0000000..1470611 --- /dev/null +++ b/src/basic/log.c @@ -0,0 +1,1810 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "argv-util.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "iovec-util.h" +#include "log.h" +#include "macro.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "ratelimit.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "time-util.h" +#include "utf8.h" + +#define SNDBUF_SIZE (8*1024*1024) +#define IOVEC_MAX 256U + +static log_syntax_callback_t log_syntax_callback = NULL; +static void *log_syntax_callback_userdata = NULL; + +static LogTarget log_target = LOG_TARGET_CONSOLE; +static int log_max_level = LOG_INFO; +static int log_facility = LOG_DAEMON; +static bool ratelimit_kmsg = true; + +static int console_fd = STDERR_FILENO; +static int console_fd_is_tty = -1; /* tri-state: -1 means don't know */ +static int syslog_fd = -EBADF; +static int kmsg_fd = -EBADF; +static int journal_fd = -EBADF; + +static bool syslog_is_stream = false; + +static int show_color = -1; /* tristate */ +static bool show_location = false; +static bool show_time = false; +static bool show_tid = false; + +static bool upgrade_syslog_to_journal = false; +static bool always_reopen_console = false; 
+static bool open_when_needed = false; +static bool prohibit_ipc = false; + +/* Akin to glibc's __abort_msg; which is private and we hence cannot + * use here. */ +static char *log_abort_msg = NULL; + +typedef struct LogContext { + unsigned n_ref; + /* Depending on which destructor is used (log_context_free() or log_context_detach()) the memory + * referenced by this is freed or not */ + char **fields; + struct iovec *input_iovec; + size_t n_input_iovec; + char *key; + char *value; + bool owned; + LIST_FIELDS(struct LogContext, ll); +} LogContext; + +static thread_local LIST_HEAD(LogContext, _log_context) = NULL; +static thread_local size_t _log_context_num_fields = 0; + +static thread_local const char *log_prefix = NULL; + +#if LOG_MESSAGE_VERIFICATION || defined(__COVERITY__) +bool _log_message_dummy = false; /* Always false */ +#endif + +/* An assert to use in logging functions that does not call recursively + * into our logging functions (since that might lead to a loop). */ +#define assert_raw(expr) \ + do { \ + if (_unlikely_(!(expr))) { \ + fputs(#expr "\n", stderr); \ + abort(); \ + } \ + } while (false) + +static void log_close_console(void) { + /* See comment in log_close_journal() */ + (void) safe_close_above_stdio(TAKE_FD(console_fd)); + console_fd_is_tty = -1; +} + +static int log_open_console(void) { + + if (!always_reopen_console) { + console_fd = STDERR_FILENO; + console_fd_is_tty = -1; + return 0; + } + + if (console_fd < 3) { + int fd; + + fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return fd; + + console_fd = fd_move_above_stdio(fd); + console_fd_is_tty = true; + } + + return 0; +} + +static void log_close_kmsg(void) { + /* See comment in log_close_journal() */ + (void) safe_close(TAKE_FD(kmsg_fd)); +} + +static int log_open_kmsg(void) { + + if (kmsg_fd >= 0) + return 0; + + kmsg_fd = open("/dev/kmsg", O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (kmsg_fd < 0) + return -errno; + + kmsg_fd = fd_move_above_stdio(kmsg_fd); 
+ return 0; +} + +static void log_close_syslog(void) { + /* See comment in log_close_journal() */ + (void) safe_close(TAKE_FD(syslog_fd)); +} + +static int create_log_socket(int type) { + struct timeval tv; + int fd; + + fd = socket(AF_UNIX, type|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + fd = fd_move_above_stdio(fd); + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + /* We need a blocking fd here since we'd otherwise lose messages way too early. However, let's not hang forever + * in the unlikely case of a deadlock. */ + if (getpid_cached() == 1) + timeval_store(&tv, 10 * USEC_PER_MSEC); + else + timeval_store(&tv, 10 * USEC_PER_SEC); + (void) setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + + return fd; +} + +static int log_open_syslog(void) { + int r; + + if (syslog_fd >= 0) + return 0; + + syslog_fd = create_log_socket(SOCK_DGRAM); + if (syslog_fd < 0) { + r = syslog_fd; + goto fail; + } + + r = connect_unix_path(syslog_fd, AT_FDCWD, "/dev/log"); + if (r < 0) { + safe_close(syslog_fd); + + /* Some legacy syslog systems still use stream sockets. They really shouldn't. But what can + * we do... */ + syslog_fd = create_log_socket(SOCK_STREAM); + if (syslog_fd < 0) { + r = syslog_fd; + goto fail; + } + + r = connect_unix_path(syslog_fd, AT_FDCWD, "/dev/log"); + if (r < 0) + goto fail; + + syslog_is_stream = true; + } else + syslog_is_stream = false; + + return 0; + +fail: + log_close_syslog(); + return r; +} + +static void log_close_journal(void) { + /* If the journal FD is bad, safe_close will fail, and will try to log, which will fail, so we'll + * try to close the journal FD, which is bad, so safe_close will fail... Whether we can close it + * or not, invalidate it immediately so that we don't get in a recursive loop until we run out of + * stack. 
*/ + (void) safe_close(TAKE_FD(journal_fd)); +} + +static int log_open_journal(void) { + int r; + + if (journal_fd >= 0) + return 0; + + journal_fd = create_log_socket(SOCK_DGRAM); + if (journal_fd < 0) { + r = journal_fd; + goto fail; + } + + r = connect_unix_path(journal_fd, AT_FDCWD, "/run/systemd/journal/socket"); + if (r < 0) + goto fail; + + return 0; + +fail: + log_close_journal(); + return r; +} + +static bool stderr_is_journal(void) { + _cleanup_free_ char *w = NULL; + const char *e; + uint64_t dev, ino; + struct stat st; + + e = getenv("JOURNAL_STREAM"); + if (!e) + return false; + + if (extract_first_word(&e, &w, ":", EXTRACT_DONT_COALESCE_SEPARATORS) <= 0) + return false; + if (!e) + return false; + + if (safe_atou64(w, &dev) < 0) + return false; + if (safe_atou64(e, &ino) < 0) + return false; + + if (fstat(STDERR_FILENO, &st) < 0) + return false; + + return st.st_dev == dev && st.st_ino == ino; +} + +int log_open(void) { + int r; + + /* Do not call from library code. */ + + /* This function is often called in preparation for logging. Let's make sure we don't clobber errno, + * so that a call to a logging function immediately following a log_open() call can still easily + * reference an error that happened immediately before the log_open() call. */ + PROTECT_ERRNO; + + /* If we don't use the console, we close it here to not get killed by SAK. If we don't use syslog, we + * close it here too, so that we are not confused by somebody deleting the socket in the fs, and to + * make sure we don't use it if prohibit_ipc is set. If we don't use /dev/kmsg we still keep it open, + * because there is no reason to close it. 
*/ + + if (log_target == LOG_TARGET_NULL) { + log_close_journal(); + log_close_syslog(); + log_close_console(); + return 0; + } + + if (getpid_cached() == 1 || + stderr_is_journal() || + IN_SET(log_target, + LOG_TARGET_KMSG, + LOG_TARGET_JOURNAL, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_SYSLOG, + LOG_TARGET_SYSLOG_OR_KMSG)) { + + if (!prohibit_ipc) { + if (IN_SET(log_target, + LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_JOURNAL)) { + + r = log_open_journal(); + if (r >= 0) { + log_close_syslog(); + log_close_console(); + return r; + } + } + + if (IN_SET(log_target, + LOG_TARGET_SYSLOG_OR_KMSG, + LOG_TARGET_SYSLOG)) { + + r = log_open_syslog(); + if (r >= 0) { + log_close_journal(); + log_close_console(); + return r; + } + } + } + + if (IN_SET(log_target, LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_SYSLOG_OR_KMSG, + LOG_TARGET_KMSG)) { + r = log_open_kmsg(); + if (r >= 0) { + log_close_journal(); + log_close_syslog(); + log_close_console(); + return r; + } + } + } + + log_close_journal(); + log_close_syslog(); + + return log_open_console(); +} + +void log_set_target(LogTarget target) { + assert(target >= 0); + assert(target < _LOG_TARGET_MAX); + + if (upgrade_syslog_to_journal) { + if (target == LOG_TARGET_SYSLOG) + target = LOG_TARGET_JOURNAL; + else if (target == LOG_TARGET_SYSLOG_OR_KMSG) + target = LOG_TARGET_JOURNAL_OR_KMSG; + } + + log_target = target; +} + +void log_set_target_and_open(LogTarget target) { + log_set_target(target); + log_open(); +} + +void log_close(void) { + /* Do not call from library code. */ + + log_close_journal(); + log_close_syslog(); + log_close_kmsg(); + log_close_console(); +} + +void log_forget_fds(void) { + /* Do not call from library code. 
*/ + + console_fd = kmsg_fd = syslog_fd = journal_fd = -EBADF; + console_fd_is_tty = -1; +} + +void log_set_max_level(int level) { + assert(level == LOG_NULL || (level & LOG_PRIMASK) == level); + + log_max_level = level; + + /* Also propagate max log level to libc's syslog(), just in case some other component loaded into our + * process logs directly via syslog(). You might wonder why we maintain our own log level variable if + * libc has the same functionality. This has multiple reasons, first and foremost that we want to + * apply this to all our log targets, not just syslog and console. Moreover, we cannot query the + * current log mask from glibc without changing it, but that's useful for testing the current log + * level before even entering the log functions like we do in our macros. */ + setlogmask(LOG_UPTO(level)); + + /* Ensure that our own LOG_NULL define maps sanely to the log mask */ + assert_cc(LOG_UPTO(LOG_NULL) == 0); +} + +void log_set_facility(int facility) { + log_facility = facility; +} + +static bool check_console_fd_is_tty(void) { + if (console_fd < 0) + return false; + + if (console_fd_is_tty < 0) + console_fd_is_tty = isatty(console_fd) > 0; + + return console_fd_is_tty; +} + +static int write_to_console( + int level, + int error, + const char *file, + int line, + const char *func, + const char *buffer) { + + char location[256], + header_time[FORMAT_TIMESTAMP_MAX], + prefix[1 + DECIMAL_STR_MAX(int) + 2], + tid_string[3 + DECIMAL_STR_MAX(pid_t) + 1]; + struct iovec iovec[11]; + const char *on = NULL, *off = NULL; + size_t n = 0; + + if (console_fd < 0) + return 0; + + if (log_target == LOG_TARGET_CONSOLE_PREFIXED) { + xsprintf(prefix, "<%i>", level); + iovec[n++] = IOVEC_MAKE_STRING(prefix); + } + + if (show_time && + format_timestamp(header_time, sizeof(header_time), now(CLOCK_REALTIME))) { + iovec[n++] = IOVEC_MAKE_STRING(header_time); + iovec[n++] = IOVEC_MAKE_STRING(" "); + } + + if (show_tid) { + xsprintf(tid_string, "(" PID_FMT ") ", 
gettid()); + iovec[n++] = IOVEC_MAKE_STRING(tid_string); + } + + if (log_get_show_color()) + get_log_colors(LOG_PRI(level), &on, &off, NULL); + + if (show_location) { + const char *lon = "", *loff = ""; + if (log_get_show_color()) { + lon = ansi_highlight_yellow4(); + loff = ansi_normal(); + } + + (void) snprintf(location, sizeof location, "%s%s:%i%s: ", lon, file, line, loff); + iovec[n++] = IOVEC_MAKE_STRING(location); + } + + if (on) + iovec[n++] = IOVEC_MAKE_STRING(on); + if (log_prefix) { + iovec[n++] = IOVEC_MAKE_STRING(log_prefix); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + iovec[n++] = IOVEC_MAKE_STRING(buffer); + if (off) + iovec[n++] = IOVEC_MAKE_STRING(off); + + /* When writing to a TTY we output an extra '\r' (i.e. CR) first, to generate CRNL rather than just + * NL. This is a robustness thing in case the TTY is currently in raw mode (specifically: has the + * ONLCR flag off). We want that subsequent output definitely starts at the beginning of the line + * again, after all. If the TTY is not in raw mode the extra CR should not hurt. */ + iovec[n++] = IOVEC_MAKE_STRING(check_console_fd_is_tty() ? "\r\n" : "\n"); + + if (writev(console_fd, iovec, n) < 0) { + + if (errno == EIO && getpid_cached() == 1) { + + /* If somebody tried to kick us from our console tty (via vhangup() or suchlike), try + * to reconnect. 
*/ + + log_close_console(); + (void) log_open_console(); + if (console_fd < 0) + return 0; + + if (writev(console_fd, iovec, n) < 0) + return -errno; + } else + return -errno; + } + + return 1; +} + +static int write_to_syslog( + int level, + int error, + const char *file, + int line, + const char *func, + const char *buffer) { + + char header_priority[2 + DECIMAL_STR_MAX(int) + 1], + header_time[64], + header_pid[4 + DECIMAL_STR_MAX(pid_t) + 1]; + time_t t; + struct tm tm; + + if (syslog_fd < 0) + return 0; + + xsprintf(header_priority, "<%i>", level); + + t = (time_t) (now(CLOCK_REALTIME) / USEC_PER_SEC); + if (!localtime_r(&t, &tm)) + return -EINVAL; + + if (strftime(header_time, sizeof(header_time), "%h %e %T ", &tm) <= 0) + return -EINVAL; + + xsprintf(header_pid, "["PID_FMT"]: ", getpid_cached()); + + struct iovec iovec[] = { + IOVEC_MAKE_STRING(header_priority), + IOVEC_MAKE_STRING(header_time), + IOVEC_MAKE_STRING(program_invocation_short_name), + IOVEC_MAKE_STRING(header_pid), + IOVEC_MAKE_STRING(strempty(log_prefix)), + IOVEC_MAKE_STRING(log_prefix ? ": " : ""), + IOVEC_MAKE_STRING(buffer), + }; + const struct msghdr msghdr = { + .msg_iov = iovec, + .msg_iovlen = ELEMENTSOF(iovec), + }; + + /* When using syslog via SOCK_STREAM separate the messages by NUL chars */ + if (syslog_is_stream) + iovec[ELEMENTSOF(iovec) - 1].iov_len++; + + for (;;) { + ssize_t n; + + n = sendmsg(syslog_fd, &msghdr, MSG_NOSIGNAL); + if (n < 0) + return -errno; + + if (!syslog_is_stream) + break; + + if (iovec_increment(iovec, ELEMENTSOF(iovec), n)) + break; + } + + return 1; +} + +static int write_to_kmsg( + int level, + int error, + const char *file, + int line, + const char *func, + const char *buffer) { + + /* Set a ratelimit on the amount of messages logged to /dev/kmsg. This is mostly supposed to be a + * safety catch for the case where start indiscriminately logging in a loop. It will not catch cases + * where we log excessively, but not in a tight loop. 
+ * + * Note that this ratelimit is per-emitter, so we might still overwhelm /dev/kmsg with multiple + * loggers. + */ + static thread_local RateLimit ratelimit = { 5 * USEC_PER_SEC, 200 }; + + char header_priority[2 + DECIMAL_STR_MAX(int) + 1], + header_pid[4 + DECIMAL_STR_MAX(pid_t) + 1]; + + if (kmsg_fd < 0) + return 0; + + if (ratelimit_kmsg && !ratelimit_below(&ratelimit)) { + if (ratelimit_num_dropped(&ratelimit) > 1) + return 0; + + buffer = "Too many messages being logged to kmsg, ignoring"; + } + + xsprintf(header_priority, "<%i>", level); + xsprintf(header_pid, "["PID_FMT"]: ", getpid_cached()); + + const struct iovec iovec[] = { + IOVEC_MAKE_STRING(header_priority), + IOVEC_MAKE_STRING(program_invocation_short_name), + IOVEC_MAKE_STRING(header_pid), + IOVEC_MAKE_STRING(strempty(log_prefix)), + IOVEC_MAKE_STRING(log_prefix ? ": " : ""), + IOVEC_MAKE_STRING(buffer), + IOVEC_MAKE_STRING("\n"), + }; + + if (writev(kmsg_fd, iovec, ELEMENTSOF(iovec)) < 0) + return -errno; + + return 1; +} + +static int log_do_header( + char *header, + size_t size, + int level, + int error, + const char *file, int line, const char *func, + const char *object_field, const char *object, + const char *extra_field, const char *extra) { + int r; + + error = IS_SYNTHETIC_ERRNO(error) ? 0 : ERRNO_VALUE(error); + + r = snprintf(header, size, + "PRIORITY=%i\n" + "SYSLOG_FACILITY=%i\n" + "TID=" PID_FMT "\n" + "%s%.256s%s" /* CODE_FILE */ + "%s%.*i%s" /* CODE_LINE */ + "%s%.256s%s" /* CODE_FUNC */ + "%s%.*i%s" /* ERRNO */ + "%s%.256s%s" /* object */ + "%s%.256s%s" /* extra */ + "SYSLOG_IDENTIFIER=%.256s\n", + LOG_PRI(level), + LOG_FAC(level), + gettid(), + isempty(file) ? "" : "CODE_FILE=", + isempty(file) ? "" : file, + isempty(file) ? "" : "\n", + line ? "CODE_LINE=" : "", + line ? 1 : 0, line, /* %.0d means no output too, special case for 0 */ + line ? "\n" : "", + isempty(func) ? "" : "CODE_FUNC=", + isempty(func) ? "" : func, + isempty(func) ? "" : "\n", + error ? 
"ERRNO=" : "", + error ? 1 : 0, error, + error ? "\n" : "", + isempty(object) ? "" : object_field, + isempty(object) ? "" : object, + isempty(object) ? "" : "\n", + isempty(extra) ? "" : extra_field, + isempty(extra) ? "" : extra, + isempty(extra) ? "" : "\n", + program_invocation_short_name); + assert_raw((size_t) r < size); + + return 0; +} + +static void log_do_context(struct iovec *iovec, size_t iovec_len, size_t *n) { + assert(iovec); + assert(n); + + LIST_FOREACH(ll, c, _log_context) { + STRV_FOREACH(s, c->fields) { + if (*n + 2 >= iovec_len) + return; + + iovec[(*n)++] = IOVEC_MAKE_STRING(*s); + iovec[(*n)++] = IOVEC_MAKE_STRING("\n"); + } + + for (size_t i = 0; i < c->n_input_iovec; i++) { + if (*n + 2 >= iovec_len) + return; + + iovec[(*n)++] = c->input_iovec[i]; + iovec[(*n)++] = IOVEC_MAKE_STRING("\n"); + } + + if (c->key && c->value) { + if (*n + 3 >= iovec_len) + return; + + iovec[(*n)++] = IOVEC_MAKE_STRING(c->key); + iovec[(*n)++] = IOVEC_MAKE_STRING(c->value); + iovec[(*n)++] = IOVEC_MAKE_STRING("\n"); + } + } +} + +static int write_to_journal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra_field, + const char *extra, + const char *buffer) { + + char header[LINE_MAX]; + size_t n = 0, iovec_len; + struct iovec *iovec; + + if (journal_fd < 0) + return 0; + + iovec_len = MIN(6 + _log_context_num_fields * 2, IOVEC_MAX); + iovec = newa(struct iovec, iovec_len); + + log_do_header(header, sizeof(header), level, error, file, line, func, object_field, object, extra_field, extra); + + iovec[n++] = IOVEC_MAKE_STRING(header); + iovec[n++] = IOVEC_MAKE_STRING("MESSAGE="); + if (log_prefix) { + iovec[n++] = IOVEC_MAKE_STRING(log_prefix); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + iovec[n++] = IOVEC_MAKE_STRING(buffer); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + log_do_context(iovec, iovec_len, &n); + + const struct msghdr msghdr = { + .msg_iov = iovec, + 
.msg_iovlen = n, + }; + + if (sendmsg(journal_fd, &msghdr, MSG_NOSIGNAL) < 0) + return -errno; + + return 1; +} + +int log_dispatch_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra_field, + const char *extra, + char *buffer) { + + assert_raw(buffer); + + if (log_target == LOG_TARGET_NULL) + return -ERRNO_VALUE(error); + + /* Patch in LOG_DAEMON facility if necessary */ + if ((level & LOG_FACMASK) == 0) + level |= log_facility; + + if (open_when_needed) + (void) log_open(); + + do { + char *e; + int k = 0; + + buffer += strspn(buffer, NEWLINE); + + if (buffer[0] == 0) + break; + + if ((e = strpbrk(buffer, NEWLINE))) + *(e++) = 0; + + if (IN_SET(log_target, LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_JOURNAL)) { + + k = write_to_journal(level, error, file, line, func, object_field, object, extra_field, extra, buffer); + if (k < 0 && k != -EAGAIN) + log_close_journal(); + } + + if (IN_SET(log_target, LOG_TARGET_SYSLOG_OR_KMSG, + LOG_TARGET_SYSLOG)) { + + k = write_to_syslog(level, error, file, line, func, buffer); + if (k < 0 && k != -EAGAIN) + log_close_syslog(); + } + + if (k <= 0 && + IN_SET(log_target, LOG_TARGET_AUTO, + LOG_TARGET_SYSLOG_OR_KMSG, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_KMSG)) { + + if (k < 0) + log_open_kmsg(); + + k = write_to_kmsg(level, error, file, line, func, buffer); + if (k < 0) { + log_close_kmsg(); + (void) log_open_console(); + } + } + + if (k <= 0) + (void) write_to_console(level, error, file, line, func, buffer); + + buffer = e; + } while (buffer); + + if (open_when_needed) + log_close(); + + return -ERRNO_VALUE(error); +} + +int log_dump_internal( + int level, + int error, + const char *file, + int line, + const char *func, + char *buffer) { + + PROTECT_ERRNO; + + /* This modifies the buffer... 
*/ + + if (_likely_(LOG_PRI(level) > log_max_level)) + return -ERRNO_VALUE(error); + + return log_dispatch_internal(level, error, file, line, func, NULL, NULL, NULL, NULL, buffer); +} + +int log_internalv( + int level, + int error, + const char *file, + int line, + const char *func, + const char *format, + va_list ap) { + + if (_likely_(LOG_PRI(level) > log_max_level)) + return -ERRNO_VALUE(error); + + /* Make sure that %m maps to the specified error (or "Success"). */ + char buffer[LINE_MAX]; + LOCAL_ERRNO(ERRNO_VALUE(error)); + + (void) vsnprintf(buffer, sizeof buffer, format, ap); + + return log_dispatch_internal(level, error, file, line, func, NULL, NULL, NULL, NULL, buffer); +} + +int log_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *format, ...) { + + va_list ap; + int r; + + va_start(ap, format); + r = log_internalv(level, error, file, line, func, format, ap); + va_end(ap); + + return r; +} + +int log_object_internalv( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra_field, + const char *extra, + const char *format, + va_list ap) { + + char *buffer, *b; + + if (_likely_(LOG_PRI(level) > log_max_level)) + return -ERRNO_VALUE(error); + + /* Make sure that %m maps to the specified error (or "Success"). */ + LOCAL_ERRNO(ERRNO_VALUE(error)); + + LOG_SET_PREFIX(object); + + b = buffer = newa(char, LINE_MAX); + (void) vsnprintf(b, LINE_MAX, format, ap); + + return log_dispatch_internal(level, error, file, line, func, + object_field, object, extra_field, extra, buffer); +} + +int log_object_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra_field, + const char *extra, + const char *format, ...) 
{ + + va_list ap; + int r; + + va_start(ap, format); + r = log_object_internalv(level, error, file, line, func, object_field, object, extra_field, extra, format, ap); + va_end(ap); + + return r; +} + +static void log_assert( + int level, + const char *text, + const char *file, + int line, + const char *func, + const char *format) { + + static char buffer[LINE_MAX]; + + if (_likely_(LOG_PRI(level) > log_max_level)) + return; + + DISABLE_WARNING_FORMAT_NONLITERAL; + (void) snprintf(buffer, sizeof buffer, format, text, file, line, func); + REENABLE_WARNING; + + log_abort_msg = buffer; + + log_dispatch_internal(level, 0, file, line, func, NULL, NULL, NULL, NULL, buffer); +} + +_noreturn_ void log_assert_failed( + const char *text, + const char *file, + int line, + const char *func) { + log_assert(LOG_CRIT, text, file, line, func, + "Assertion '%s' failed at %s:%u, function %s(). Aborting."); + abort(); +} + +_noreturn_ void log_assert_failed_unreachable( + const char *file, + int line, + const char *func) { + log_assert(LOG_CRIT, "Code should not be reached", file, line, func, + "%s at %s:%u, function %s(). Aborting. 💥"); + abort(); +} + +void log_assert_failed_return( + const char *text, + const char *file, + int line, + const char *func) { + PROTECT_ERRNO; + log_assert(LOG_DEBUG, text, file, line, func, + "Assertion '%s' failed at %s:%u, function %s(). 
Ignoring."); +} + +int log_oom_internal(int level, const char *file, int line, const char *func) { + return log_internal(level, ENOMEM, file, line, func, "Out of memory."); +} + +int log_format_iovec( + struct iovec *iovec, + size_t iovec_len, + size_t *n, + bool newline_separator, + int error, + const char *format, + va_list ap) { + + static const char nl = '\n'; + + while (format && *n + 1 < iovec_len) { + va_list aq; + char *m; + int r; + + /* We need to copy the va_list structure, + * since vasprintf() leaves it afterwards at + * an undefined location */ + + errno = ERRNO_VALUE(error); + + va_copy(aq, ap); + r = vasprintf(&m, format, aq); + va_end(aq); + if (r < 0) + return -EINVAL; + + /* Now, jump enough ahead, so that we point to + * the next format string */ + VA_FORMAT_ADVANCE(format, ap); + + iovec[(*n)++] = IOVEC_MAKE_STRING(m); + if (newline_separator) + iovec[(*n)++] = IOVEC_MAKE((char *)&nl, 1); + + format = va_arg(ap, char *); + } + return 0; +} + +int log_struct_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *format, ...) { + + char buf[LINE_MAX]; + bool found = false; + PROTECT_ERRNO; + va_list ap; + + if (_likely_(LOG_PRI(level) > log_max_level) || + log_target == LOG_TARGET_NULL) + return -ERRNO_VALUE(error); + + if ((level & LOG_FACMASK) == 0) + level |= log_facility; + + if (IN_SET(log_target, + LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_JOURNAL)) { + + if (open_when_needed) + log_open_journal(); + + if (journal_fd >= 0) { + char header[LINE_MAX]; + struct iovec *iovec; + size_t n = 0, m, iovec_len; + int r; + bool fallback = false; + + iovec_len = MIN(17 + _log_context_num_fields * 2, IOVEC_MAX); + iovec = newa(struct iovec, iovec_len); + + /* If the journal is available do structured logging. + * Do not report the errno if it is synthetic. 
*/ + log_do_header(header, sizeof(header), level, error, file, line, func, NULL, NULL, NULL, NULL); + iovec[n++] = IOVEC_MAKE_STRING(header); + + va_start(ap, format); + r = log_format_iovec(iovec, iovec_len, &n, true, error, format, ap); + m = n; + if (r < 0) + fallback = true; + else { + log_do_context(iovec, iovec_len, &n); + + const struct msghdr msghdr = { + .msg_iov = iovec, + .msg_iovlen = n, + }; + + (void) sendmsg(journal_fd, &msghdr, MSG_NOSIGNAL); + } + + va_end(ap); + for (size_t i = 1; i < m; i += 2) + free(iovec[i].iov_base); + + if (!fallback) { + if (open_when_needed) + log_close(); + + return -ERRNO_VALUE(error); + } + } + } + + /* Fallback if journal logging is not available or didn't work. */ + + va_start(ap, format); + while (format) { + va_list aq; + + errno = ERRNO_VALUE(error); + + va_copy(aq, ap); + (void) vsnprintf(buf, sizeof buf, format, aq); + va_end(aq); + + if (startswith(buf, "MESSAGE=")) { + found = true; + break; + } + + VA_FORMAT_ADVANCE(format, ap); + + format = va_arg(ap, char *); + } + va_end(ap); + + if (!found) { + if (open_when_needed) + log_close(); + + return -ERRNO_VALUE(error); + } + + return log_dispatch_internal(level, error, file, line, func, NULL, NULL, NULL, NULL, buf + 8); +} + +int log_struct_iovec_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const struct iovec input_iovec[], + size_t n_input_iovec) { + + PROTECT_ERRNO; + + if (_likely_(LOG_PRI(level) > log_max_level) || + log_target == LOG_TARGET_NULL) + return -ERRNO_VALUE(error); + + if ((level & LOG_FACMASK) == 0) + level |= log_facility; + + if (IN_SET(log_target, LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_JOURNAL) && + journal_fd >= 0) { + + char header[LINE_MAX]; + struct iovec *iovec; + size_t n = 0, iovec_len; + + iovec_len = MIN(1 + n_input_iovec * 2 + _log_context_num_fields * 2, IOVEC_MAX); + iovec = newa(struct iovec, iovec_len); + + log_do_header(header, sizeof(header), level, error, file, 
line, func, NULL, NULL, NULL, NULL); + + iovec[n++] = IOVEC_MAKE_STRING(header); + for (size_t i = 0; i < n_input_iovec; i++) { + iovec[n++] = input_iovec[i]; + iovec[n++] = IOVEC_MAKE_STRING("\n"); + } + + log_do_context(iovec, iovec_len, &n); + + const struct msghdr msghdr = { + .msg_iov = iovec, + .msg_iovlen = n, + }; + + if (sendmsg(journal_fd, &msghdr, MSG_NOSIGNAL) >= 0) + return -ERRNO_VALUE(error); + } + + for (size_t i = 0; i < n_input_iovec; i++) + if (memory_startswith(input_iovec[i].iov_base, input_iovec[i].iov_len, "MESSAGE=")) { + char *m; + + m = strndupa_safe((char*) input_iovec[i].iov_base + STRLEN("MESSAGE="), + input_iovec[i].iov_len - STRLEN("MESSAGE=")); + + return log_dispatch_internal(level, error, file, line, func, NULL, NULL, NULL, NULL, m); + } + + /* Couldn't find MESSAGE=. */ + return -ERRNO_VALUE(error); +} + +int log_set_target_from_string(const char *e) { + LogTarget t; + + t = log_target_from_string(e); + if (t < 0) + return t; + + log_set_target(t); + return 0; +} + +int log_set_max_level_from_string(const char *e) { + int r; + + r = log_level_from_string(e); + if (r < 0) + return r; + + log_set_max_level(r); + return 0; +} + +static int log_set_ratelimit_kmsg_from_string(const char *e) { + int r; + + r = parse_boolean(e); + if (r < 0) + return r; + + ratelimit_kmsg = r; + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + + /* + * The systemd.log_xyz= settings are parsed by all tools, and + * so is "debug". + * + * However, "quiet" is only parsed by PID 1, and only turns of + * status output to /dev/console, but does not alter the log + * level. + */ + + if (streq(key, "debug") && !value) + log_set_max_level(LOG_DEBUG); + + else if (proc_cmdline_key_streq(key, "systemd.log_target")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (log_set_target_from_string(value) < 0) + log_warning("Failed to parse log target '%s'. 
Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.log_level")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (log_set_max_level_from_string(value) < 0) + log_warning("Failed to parse log level '%s'. Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.log_color")) { + + if (log_show_color_from_string(value ?: "1") < 0) + log_warning("Failed to parse log color setting '%s'. Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.log_location")) { + + if (log_show_location_from_string(value ?: "1") < 0) + log_warning("Failed to parse log location setting '%s'. Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.log_tid")) { + + if (log_show_tid_from_string(value ?: "1") < 0) + log_warning("Failed to parse log tid setting '%s'. Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.log_time")) { + + if (log_show_time_from_string(value ?: "1") < 0) + log_warning("Failed to parse log time setting '%s'. Ignoring.", value); + + } else if (proc_cmdline_key_streq(key, "systemd.log_ratelimit_kmsg")) { + + if (log_set_ratelimit_kmsg_from_string(value ?: "1") < 0) + log_warning("Failed to parse log ratelimit kmsg boolean '%s'. Ignoring.", value); + } + + return 0; +} + +static bool should_parse_proc_cmdline(void) { + /* PID1 always reads the kernel command line. */ + if (getpid_cached() == 1) + return true; + + /* Otherwise, parse the command line if invoked directly by systemd. */ + return invoked_by_systemd(); +} + +void log_parse_environment_variables(void) { + const char *e; + + e = getenv("SYSTEMD_LOG_TARGET"); + if (e && log_set_target_from_string(e) < 0) + log_warning("Failed to parse log target '%s'. Ignoring.", e); + + e = getenv("SYSTEMD_LOG_LEVEL"); + if (e && log_set_max_level_from_string(e) < 0) + log_warning("Failed to parse log level '%s'. 
Ignoring.", e); + + e = getenv("SYSTEMD_LOG_COLOR"); + if (e && log_show_color_from_string(e) < 0) + log_warning("Failed to parse log color '%s'. Ignoring.", e); + + e = getenv("SYSTEMD_LOG_LOCATION"); + if (e && log_show_location_from_string(e) < 0) + log_warning("Failed to parse log location '%s'. Ignoring.", e); + + e = getenv("SYSTEMD_LOG_TIME"); + if (e && log_show_time_from_string(e) < 0) + log_warning("Failed to parse log time '%s'. Ignoring.", e); + + e = getenv("SYSTEMD_LOG_TID"); + if (e && log_show_tid_from_string(e) < 0) + log_warning("Failed to parse log tid '%s'. Ignoring.", e); + + e = getenv("SYSTEMD_LOG_RATELIMIT_KMSG"); + if (e && log_set_ratelimit_kmsg_from_string(e) < 0) + log_warning("Failed to parse log ratelimit kmsg boolean '%s'. Ignoring.", e); +} + +void log_parse_environment(void) { + /* Do not call from library code. */ + + if (should_parse_proc_cmdline()) + (void) proc_cmdline_parse(parse_proc_cmdline_item, NULL, PROC_CMDLINE_STRIP_RD_PREFIX); + + log_parse_environment_variables(); +} + +LogTarget log_get_target(void) { + return log_target; +} + +void log_settle_target(void) { + + /* If we're using LOG_TARGET_AUTO and opening the log again on every single log call, we'll check if + * stderr is attached to the journal every single log call. However, if we then close all file + * descriptors later, that will stop working because stderr will be closed as well. To avoid that + * problem, this function is used to permanently change the log target depending on whether stderr is + * connected to the journal or not. */ + + LogTarget t = log_get_target(); + + if (t != LOG_TARGET_AUTO) + return; + + t = getpid_cached() == 1 || stderr_is_journal() ? (prohibit_ipc ? 
LOG_TARGET_KMSG : LOG_TARGET_JOURNAL_OR_KMSG) + : LOG_TARGET_CONSOLE; + log_set_target(t); +} + +int log_get_max_level(void) { + return log_max_level; +} + +void log_show_color(bool b) { + show_color = b; +} + +bool log_get_show_color(void) { + return show_color > 0; /* Defaults to false. */ +} + +void log_show_location(bool b) { + show_location = b; +} + +bool log_get_show_location(void) { + return show_location; +} + +void log_show_time(bool b) { + show_time = b; +} + +bool log_get_show_time(void) { + return show_time; +} + +void log_show_tid(bool b) { + show_tid = b; +} + +bool log_get_show_tid(void) { + return show_tid; +} + +int log_show_color_from_string(const char *e) { + int r; + + r = parse_boolean(e); + if (r < 0) + return r; + + log_show_color(r); + return 0; +} + +int log_show_location_from_string(const char *e) { + int r; + + r = parse_boolean(e); + if (r < 0) + return r; + + log_show_location(r); + return 0; +} + +int log_show_time_from_string(const char *e) { + int r; + + r = parse_boolean(e); + if (r < 0) + return r; + + log_show_time(r); + return 0; +} + +int log_show_tid_from_string(const char *e) { + int r; + + r = parse_boolean(e); + if (r < 0) + return r; + + log_show_tid(r); + return 0; +} + +bool log_on_console(void) { + if (IN_SET(log_target, LOG_TARGET_CONSOLE, + LOG_TARGET_CONSOLE_PREFIXED)) + return true; + + return syslog_fd < 0 && kmsg_fd < 0 && journal_fd < 0; +} + +static const char *const log_target_table[_LOG_TARGET_MAX] = { + [LOG_TARGET_CONSOLE] = "console", + [LOG_TARGET_CONSOLE_PREFIXED] = "console-prefixed", + [LOG_TARGET_KMSG] = "kmsg", + [LOG_TARGET_JOURNAL] = "journal", + [LOG_TARGET_JOURNAL_OR_KMSG] = "journal-or-kmsg", + [LOG_TARGET_SYSLOG] = "syslog", + [LOG_TARGET_SYSLOG_OR_KMSG] = "syslog-or-kmsg", + [LOG_TARGET_AUTO] = "auto", + [LOG_TARGET_NULL] = "null", +}; + +DEFINE_STRING_TABLE_LOOKUP(log_target, LogTarget); + +void log_received_signal(int level, const struct signalfd_siginfo *si) { + assert(si); + + if 
(pid_is_valid(si->ssi_pid)) { + _cleanup_free_ char *p = NULL; + + (void) pid_get_comm(si->ssi_pid, &p); + + log_full(level, + "Received SIG%s from PID %"PRIu32" (%s).", + signal_to_string(si->ssi_signo), + si->ssi_pid, strna(p)); + } else + log_full(level, + "Received SIG%s.", + signal_to_string(si->ssi_signo)); +} + +void set_log_syntax_callback(log_syntax_callback_t cb, void *userdata) { + assert(!log_syntax_callback || !cb); + assert(!log_syntax_callback_userdata || !userdata); + + log_syntax_callback = cb; + log_syntax_callback_userdata = userdata; +} + +int log_syntax_internal( + const char *unit, + int level, + const char *config_file, + unsigned config_line, + int error, + const char *file, + int line, + const char *func, + const char *format, ...) { + + PROTECT_ERRNO; + + if (log_syntax_callback) + log_syntax_callback(unit, level, log_syntax_callback_userdata); + + if (_likely_(LOG_PRI(level) > log_max_level) || + log_target == LOG_TARGET_NULL) + return -ERRNO_VALUE(error); + + char buffer[LINE_MAX]; + va_list ap; + const char *unit_fmt = NULL; + + errno = ERRNO_VALUE(error); + + va_start(ap, format); + (void) vsnprintf(buffer, sizeof buffer, format, ap); + va_end(ap); + + if (unit) + unit_fmt = getpid_cached() == 1 ? 
"UNIT=%s" : "USER_UNIT=%s"; + + if (config_file) { + if (config_line > 0) + return log_struct_internal( + level, + error, + file, line, func, + "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR, + "CONFIG_FILE=%s", config_file, + "CONFIG_LINE=%u", config_line, + LOG_MESSAGE("%s:%u: %s", config_file, config_line, buffer), + unit_fmt, unit, + NULL); + else + return log_struct_internal( + level, + error, + file, line, func, + "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR, + "CONFIG_FILE=%s", config_file, + LOG_MESSAGE("%s: %s", config_file, buffer), + unit_fmt, unit, + NULL); + } else if (unit) + return log_struct_internal( + level, + error, + file, line, func, + "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR, + LOG_MESSAGE("%s: %s", unit, buffer), + unit_fmt, unit, + NULL); + else + return log_struct_internal( + level, + error, + file, line, func, + "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR, + LOG_MESSAGE("%s", buffer), + NULL); +} + +int log_syntax_invalid_utf8_internal( + const char *unit, + int level, + const char *config_file, + unsigned config_line, + const char *file, + int line, + const char *func, + const char *rvalue) { + + _cleanup_free_ char *p = NULL; + + if (rvalue) + p = utf8_escape_invalid(rvalue); + + return log_syntax_internal(unit, level, config_file, config_line, + SYNTHETIC_ERRNO(EINVAL), file, line, func, + "String is not UTF-8 clean, ignoring assignment: %s", strna(p)); +} + +void log_set_upgrade_syslog_to_journal(bool b) { + upgrade_syslog_to_journal = b; + + /* Make the change effective immediately */ + if (b) { + if (log_target == LOG_TARGET_SYSLOG) + log_target = LOG_TARGET_JOURNAL; + else if (log_target == LOG_TARGET_SYSLOG_OR_KMSG) + log_target = LOG_TARGET_JOURNAL_OR_KMSG; + } +} + +void log_set_always_reopen_console(bool b) { + always_reopen_console = b; +} + +void log_set_open_when_needed(bool b) { + open_when_needed = b; +} + +void log_set_prohibit_ipc(bool b) { + prohibit_ipc = b; +} + +int log_emergency_level(void) 
{ + /* Returns the log level to use for log_emergency() logging. We use LOG_EMERG only when we are PID 1, as only + * then the whole system is obviously affected. */ + + return getpid_cached() == 1 ? LOG_EMERG : LOG_ERR; +} + +int log_dup_console(void) { + int copy; + + /* Duplicate the fd we use for fd logging if it's < 3 and use the copy from now on. This call is useful + * whenever we want to continue logging through the original fd, but want to rearrange stderr. */ + + if (console_fd < 0 || console_fd >= 3) + return 0; + + copy = fcntl(console_fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + console_fd = copy; + return 0; +} + +void log_setup(void) { + log_set_target(LOG_TARGET_AUTO); + log_parse_environment(); + (void) log_open(); + if (log_on_console() && show_color < 0) + log_show_color(true); +} + +const char *_log_set_prefix(const char *prefix, bool force) { + const char *old = log_prefix; + + if (prefix || force) + log_prefix = prefix; + + return old; +} + +static int saved_log_context_enabled = -1; + +bool log_context_enabled(void) { + int r; + + if (log_get_max_level() == LOG_DEBUG) + return true; + + if (saved_log_context_enabled >= 0) + return saved_log_context_enabled; + + r = getenv_bool_secure("SYSTEMD_ENABLE_LOG_CONTEXT"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_ENABLE_LOG_CONTEXT, ignoring: %m"); + + saved_log_context_enabled = r > 0; + + return saved_log_context_enabled; +} + +static LogContext* log_context_attach(LogContext *c) { + assert(c); + + _log_context_num_fields += strv_length(c->fields); + _log_context_num_fields += c->n_input_iovec; + _log_context_num_fields += !!c->key; + + return LIST_PREPEND(ll, _log_context, c); +} + +static LogContext* log_context_detach(LogContext *c) { + if (!c) + return NULL; + + assert(_log_context_num_fields >= strv_length(c->fields) + c->n_input_iovec +!!c->key); + _log_context_num_fields -= strv_length(c->fields); + _log_context_num_fields -= 
c->n_input_iovec; + _log_context_num_fields -= !!c->key; + + LIST_REMOVE(ll, _log_context, c); + return NULL; +} + +LogContext* log_context_new(const char *key, const char *value) { + assert(key); + assert(endswith(key, "=")); + assert(value); + + LIST_FOREACH(ll, i, _log_context) + if (i->key == key && i->value == value) + return log_context_ref(i); + + LogContext *c = new(LogContext, 1); + if (!c) + return NULL; + + *c = (LogContext) { + .n_ref = 1, + .key = (char *) key, + .value = (char *) value, + }; + + return log_context_attach(c); +} + +LogContext* log_context_new_strv(char **fields, bool owned) { + if (!fields) + return NULL; + + LIST_FOREACH(ll, i, _log_context) + if (i->fields == fields) { + assert(!owned); + return log_context_ref(i); + } + + LogContext *c = new(LogContext, 1); + if (!c) + return NULL; + + *c = (LogContext) { + .n_ref = 1, + .fields = fields, + .owned = owned, + }; + + return log_context_attach(c); +} + +LogContext* log_context_new_iov(struct iovec *input_iovec, size_t n_input_iovec, bool owned) { + if (!input_iovec || n_input_iovec == 0) + return NULL; + + LIST_FOREACH(ll, i, _log_context) + if (i->input_iovec == input_iovec && i->n_input_iovec == n_input_iovec) { + assert(!owned); + return log_context_ref(i); + } + + LogContext *c = new(LogContext, 1); + if (!c) + return NULL; + + *c = (LogContext) { + .n_ref = 1, + .input_iovec = input_iovec, + .n_input_iovec = n_input_iovec, + .owned = owned, + }; + + return log_context_attach(c); +} + +static LogContext* log_context_free(LogContext *c) { + if (!c) + return NULL; + + log_context_detach(c); + + if (c->owned) { + strv_free(c->fields); + iovec_array_free(c->input_iovec, c->n_input_iovec); + free(c->key); + free(c->value); + } + + return mfree(c); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(LogContext, log_context, log_context_free); + +LogContext* log_context_new_strv_consume(char **fields) { + LogContext *c = log_context_new_strv(fields, /*owned=*/ true); + if (!c) + strv_free(fields); + + 
return c; +} + +LogContext* log_context_new_iov_consume(struct iovec *input_iovec, size_t n_input_iovec) { + LogContext *c = log_context_new_iov(input_iovec, n_input_iovec, /*owned=*/ true); + if (!c) + iovec_array_free(input_iovec, n_input_iovec); + + return c; +} + +size_t log_context_num_contexts(void) { + size_t n = 0; + + LIST_FOREACH(ll, c, _log_context) + n++; + + return n; +} + +size_t log_context_num_fields(void) { + return _log_context_num_fields; +} diff --git a/src/basic/log.h b/src/basic/log.h new file mode 100644 index 0000000..9008d47 --- /dev/null +++ b/src/basic/log.h @@ -0,0 +1,537 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "list.h" +#include "macro.h" +#include "ratelimit.h" +#include "stdio-util.h" + +/* Some structures we reference but don't want to pull in headers for */ +struct iovec; +struct signalfd_siginfo; + +typedef enum LogTarget{ + LOG_TARGET_CONSOLE, + LOG_TARGET_CONSOLE_PREFIXED, + LOG_TARGET_KMSG, + LOG_TARGET_JOURNAL, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_SYSLOG, + LOG_TARGET_SYSLOG_OR_KMSG, + LOG_TARGET_AUTO, /* console if stderr is not journal, JOURNAL_OR_KMSG otherwise */ + LOG_TARGET_NULL, + _LOG_TARGET_MAX, + _LOG_TARGET_INVALID = -EINVAL, +} LogTarget; + +/* This log level disables logging completely. It can only be passed to log_set_max_level() and cannot be + * used as a regular log level. */ +#define LOG_NULL (LOG_EMERG - 1) + +/* Note to readers: << and >> have higher precedence (bind more tightly) than & and | */ +#define SYNTHETIC_ERRNO(num) (1 << 30 | (num)) +#define IS_SYNTHETIC_ERRNO(val) ((val) >> 30 & 1) +#define ERRNO_VALUE(val) (abs(val) & ~(1 << 30)) + +/* The callback function to be invoked when syntax warnings are seen + * in the unit files. 
*/ +typedef void (*log_syntax_callback_t)(const char *unit, int level, void *userdata); +void set_log_syntax_callback(log_syntax_callback_t cb, void *userdata); + +static inline void clear_log_syntax_callback(dummy_t *dummy) { + set_log_syntax_callback(/* cb= */ NULL, /* userdata= */ NULL); +} + +const char *log_target_to_string(LogTarget target) _const_; +LogTarget log_target_from_string(const char *s) _pure_; +void log_set_target(LogTarget target); +void log_set_target_and_open(LogTarget target); +int log_set_target_from_string(const char *e); +LogTarget log_get_target(void) _pure_; +void log_settle_target(void); + +void log_set_max_level(int level); +int log_set_max_level_from_string(const char *e); +int log_get_max_level(void) _pure_; + +void log_set_facility(int facility); + +void log_show_color(bool b); +bool log_get_show_color(void) _pure_; +void log_show_location(bool b); +bool log_get_show_location(void) _pure_; +void log_show_time(bool b); +bool log_get_show_time(void) _pure_; +void log_show_tid(bool b); +bool log_get_show_tid(void) _pure_; + +int log_show_color_from_string(const char *e); +int log_show_location_from_string(const char *e); +int log_show_time_from_string(const char *e); +int log_show_tid_from_string(const char *e); + +/* Functions below that open and close logs or configure logging based on the + * environment should not be called from library code — this is always a job + * for the application itself. 
*/ + +assert_cc(STRLEN(__FILE__) > STRLEN(RELATIVE_SOURCE_PATH) + 1); +#define PROJECT_FILE (&__FILE__[STRLEN(RELATIVE_SOURCE_PATH) + 1]) + +int log_open(void); +void log_close(void); +void log_forget_fds(void); + +void log_parse_environment_variables(void); +void log_parse_environment(void); + +int log_dispatch_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra, + const char *extra_field, + char *buffer); + +int log_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *format, ...) _printf_(6,7); + +int log_internalv( + int level, + int error, + const char *file, + int line, + const char *func, + const char *format, + va_list ap) _printf_(6,0); + +int log_object_internalv( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra_field, + const char *extra, + const char *format, + va_list ap) _printf_(10,0); + +int log_object_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *object_field, + const char *object, + const char *extra_field, + const char *extra, + const char *format, ...) _printf_(10,11); + +int log_struct_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const char *format, ...) _printf_(6,0) _sentinel_; + +int log_oom_internal( + int level, + const char *file, + int line, + const char *func); + +int log_format_iovec( + struct iovec *iovec, + size_t iovec_len, + size_t *n, + bool newline_separator, + int error, + const char *format, + va_list ap) _printf_(6, 0); + +int log_struct_iovec_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const struct iovec *input_iovec, + size_t n_input_iovec); + +/* This modifies the buffer passed! 
*/ +int log_dump_internal( + int level, + int error, + const char *file, + int line, + const char *func, + char *buffer); + +/* Logging for various assertions */ +_noreturn_ void log_assert_failed( + const char *text, + const char *file, + int line, + const char *func); + +_noreturn_ void log_assert_failed_unreachable( + const char *file, + int line, + const char *func); + +void log_assert_failed_return( + const char *text, + const char *file, + int line, + const char *func); + +#define log_dispatch(level, error, buffer) \ + log_dispatch_internal(level, error, PROJECT_FILE, __LINE__, __func__, NULL, NULL, NULL, NULL, buffer) + +/* Logging with level */ +#define log_full_errno_zerook(level, error, ...) \ + ({ \ + int _level = (level), _e = (error); \ + _e = (log_get_max_level() >= LOG_PRI(_level)) \ + ? log_internal(_level, _e, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__) \ + : -ERRNO_VALUE(_e); \ + _e < 0 ? _e : -ESTRPIPE; \ + }) + +#if BUILD_MODE_DEVELOPER && !defined(TEST_CODE) +# define ASSERT_NON_ZERO(x) assert((x) != 0) +#else +# define ASSERT_NON_ZERO(x) +#endif + +#define log_full_errno(level, error, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_full_errno_zerook(level, _error, __VA_ARGS__); \ + }) + +#define log_full(level, fmt, ...) \ + ({ \ + if (BUILD_MODE_DEVELOPER) \ + assert(!strstr(fmt, "%m")); \ + (void) log_full_errno_zerook(level, 0, fmt, ##__VA_ARGS__); \ + }) + +int log_emergency_level(void); + +/* Normal logging */ +#define log_debug(...) log_full(LOG_DEBUG, __VA_ARGS__) +#define log_info(...) log_full(LOG_INFO, __VA_ARGS__) +#define log_notice(...) log_full(LOG_NOTICE, __VA_ARGS__) +#define log_warning(...) log_full(LOG_WARNING, __VA_ARGS__) +#define log_error(...) log_full(LOG_ERR, __VA_ARGS__) +#define log_emergency(...) log_full(log_emergency_level(), __VA_ARGS__) + +/* Logging triggered by an errno-like error */ +#define log_debug_errno(error, ...) 
log_full_errno(LOG_DEBUG, error, __VA_ARGS__) +#define log_info_errno(error, ...) log_full_errno(LOG_INFO, error, __VA_ARGS__) +#define log_notice_errno(error, ...) log_full_errno(LOG_NOTICE, error, __VA_ARGS__) +#define log_warning_errno(error, ...) log_full_errno(LOG_WARNING, error, __VA_ARGS__) +#define log_error_errno(error, ...) log_full_errno(LOG_ERR, error, __VA_ARGS__) +#define log_emergency_errno(error, ...) log_full_errno(log_emergency_level(), error, __VA_ARGS__) + +/* This logs at the specified level the first time it is called, and then + * logs at debug. If the specified level is debug, this logs only the first + * time it is called. */ +#define log_once(level, ...) \ + ({ \ + if (ONCE) \ + log_full(level, __VA_ARGS__); \ + else if (LOG_PRI(level) != LOG_DEBUG) \ + log_debug(__VA_ARGS__); \ + }) + +#define log_once_errno(level, error, ...) \ + ({ \ + int _err = (error); \ + if (ONCE) \ + _err = log_full_errno(level, _err, __VA_ARGS__); \ + else if (LOG_PRI(level) != LOG_DEBUG) \ + _err = log_debug_errno(_err, __VA_ARGS__); \ + else \ + _err = -ERRNO_VALUE(_err); \ + _err; \ + }) + +#if LOG_TRACE +# define log_trace(...) log_debug(__VA_ARGS__) +# define log_trace_errno(...) log_debug_errno(__VA_ARGS__) +#else +# define log_trace(...) do {} while (0) +# define log_trace_errno(e, ...) (-ERRNO_VALUE(e)) +#endif + +/* Structured logging */ +#define log_struct_errno(level, error, ...) \ + log_struct_internal(level, error, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__, NULL) +#define log_struct(level, ...) log_struct_errno(level, 0, __VA_ARGS__) + +#define log_struct_iovec_errno(level, error, iovec, n_iovec) \ + log_struct_iovec_internal(level, error, PROJECT_FILE, __LINE__, __func__, iovec, n_iovec) +#define log_struct_iovec(level, iovec, n_iovec) log_struct_iovec_errno(level, 0, iovec, n_iovec) + +/* This modifies the buffer passed! 
*/ +#define log_dump(level, buffer) \ + log_dump_internal(level, 0, PROJECT_FILE, __LINE__, __func__, buffer) + +#define log_oom() log_oom_internal(LOG_ERR, PROJECT_FILE, __LINE__, __func__) +#define log_oom_debug() log_oom_internal(LOG_DEBUG, PROJECT_FILE, __LINE__, __func__) +#define log_oom_warning() log_oom_internal(LOG_WARNING, PROJECT_FILE, __LINE__, __func__) + +bool log_on_console(void) _pure_; + +/* Helper to wrap the main message in structured logging. The macro doesn't do much, + * except to provide visual grouping of the format string and its arguments. */ +#if LOG_MESSAGE_VERIFICATION || defined(__COVERITY__) +/* Do a fake formatting of the message string to let the scanner verify the arguments against the format + * message. The variable will never be set to true, but we don't tell the compiler that :) */ +extern bool _log_message_dummy; +# define LOG_MESSAGE(fmt, ...) "MESSAGE=%.0d" fmt, (_log_message_dummy && printf(fmt, ##__VA_ARGS__)), ##__VA_ARGS__ +#else +# define LOG_MESSAGE(fmt, ...) "MESSAGE=" fmt, ##__VA_ARGS__ +#endif + +void log_received_signal(int level, const struct signalfd_siginfo *si); + +/* If turned on, any requests for a log target involving "syslog" will be implicitly upgraded to the equivalent journal target */ +void log_set_upgrade_syslog_to_journal(bool b); + +/* If turned on, and log_open() is called, we'll not use STDERR_FILENO for logging ever, but rather open /dev/console */ +void log_set_always_reopen_console(bool b); + +/* If turned on, we'll open the log stream implicitly if needed on each individual log call. This is normally not + * desired as we want to reuse our logging streams. It is useful however */ +void log_set_open_when_needed(bool b); + +/* If turned on, then we'll never use IPC-based logging, i.e. never log to syslog or the journal. 
We'll only log to + * stderr, the console or kmsg */ +void log_set_prohibit_ipc(bool b); + +int log_dup_console(void); + +int log_syntax_internal( + const char *unit, + int level, + const char *config_file, + unsigned config_line, + int error, + const char *file, + int line, + const char *func, + const char *format, ...) _printf_(9, 10); + +int log_syntax_invalid_utf8_internal( + const char *unit, + int level, + const char *config_file, + unsigned config_line, + const char *file, + int line, + const char *func, + const char *rvalue); + +#define log_syntax(unit, level, config_file, config_line, error, ...) \ + ({ \ + int _level = (level), _e = (error); \ + (log_get_max_level() >= LOG_PRI(_level)) \ + ? log_syntax_internal(unit, _level, config_file, config_line, _e, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__) \ + : -ERRNO_VALUE(_e); \ + }) + +#define log_syntax_invalid_utf8(unit, level, config_file, config_line, rvalue) \ + ({ \ + int _level = (level); \ + (log_get_max_level() >= LOG_PRI(_level)) \ + ? log_syntax_invalid_utf8_internal(unit, _level, config_file, config_line, PROJECT_FILE, __LINE__, __func__, rvalue) \ + : -EINVAL; \ + }) + +#define DEBUG_LOGGING _unlikely_(log_get_max_level() >= LOG_DEBUG) + +void log_setup(void); + +typedef struct LogRateLimit { + int error; + int level; + RateLimit ratelimit; +} LogRateLimit; + +#define log_ratelimit_internal(_level, _error, _ratelimit, _format, _file, _line, _func, ...) 
\ +({ \ + int _log_ratelimit_error = (_error); \ + int _log_ratelimit_level = (_level); \ + static LogRateLimit _log_ratelimit = { \ + .ratelimit = (_ratelimit), \ + }; \ + unsigned _num_dropped_errors = ratelimit_num_dropped(&_log_ratelimit.ratelimit); \ + if (_log_ratelimit_error != _log_ratelimit.error || _log_ratelimit_level != _log_ratelimit.level) { \ + ratelimit_reset(&_log_ratelimit.ratelimit); \ + _log_ratelimit.error = _log_ratelimit_error; \ + _log_ratelimit.level = _log_ratelimit_level; \ + } \ + if (log_get_max_level() == LOG_DEBUG || ratelimit_below(&_log_ratelimit.ratelimit)) \ + _log_ratelimit_error = _num_dropped_errors > 0 \ + ? log_internal(_log_ratelimit_level, _log_ratelimit_error, _file, _line, _func, _format " (Dropped %u similar message(s))", ##__VA_ARGS__, _num_dropped_errors) \ + : log_internal(_log_ratelimit_level, _log_ratelimit_error, _file, _line, _func, _format, ##__VA_ARGS__); \ + _log_ratelimit_error; \ +}) + +#define log_ratelimit_full_errno(level, error, _ratelimit, format, ...) \ + ({ \ + int _level = (level), _e = (error); \ + _e = (log_get_max_level() >= LOG_PRI(_level)) \ + ? log_ratelimit_internal(_level, _e, _ratelimit, format, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__) \ + : -ERRNO_VALUE(_e); \ + _e < 0 ? _e : -ESTRPIPE; \ + }) + +#define log_ratelimit_full(level, _ratelimit, format, ...) \ + log_ratelimit_full_errno(level, 0, _ratelimit, format, ##__VA_ARGS__) + +/* Normal logging */ +#define log_ratelimit_info(...) log_ratelimit_full(LOG_INFO, __VA_ARGS__) +#define log_ratelimit_notice(...) log_ratelimit_full(LOG_NOTICE, __VA_ARGS__) +#define log_ratelimit_warning(...) log_ratelimit_full(LOG_WARNING, __VA_ARGS__) +#define log_ratelimit_error(...) log_ratelimit_full(LOG_ERR, __VA_ARGS__) +#define log_ratelimit_emergency(...) log_ratelimit_full(log_emergency_level(), __VA_ARGS__) + +/* Logging triggered by an errno-like error */ +#define log_ratelimit_info_errno(error, ...) 
log_ratelimit_full_errno(LOG_INFO, error, __VA_ARGS__) +#define log_ratelimit_notice_errno(error, ...) log_ratelimit_full_errno(LOG_NOTICE, error, __VA_ARGS__) +#define log_ratelimit_warning_errno(error, ...) log_ratelimit_full_errno(LOG_WARNING, error, __VA_ARGS__) +#define log_ratelimit_error_errno(error, ...) log_ratelimit_full_errno(LOG_ERR, error, __VA_ARGS__) +#define log_ratelimit_emergency_errno(error, ...) log_ratelimit_full_errno(log_emergency_level(), error, __VA_ARGS__) + +const char *_log_set_prefix(const char *prefix, bool force); +static inline const char *_log_unset_prefixp(const char **p) { + assert(p); + _log_set_prefix(*p, true); + return NULL; +} + +#define LOG_SET_PREFIX(prefix) \ + _cleanup_(_log_unset_prefixp) _unused_ const char *CONCATENATE(_cleanup_log_unset_prefix_, UNIQ) = _log_set_prefix(prefix, false); + +/* + * The log context allows attaching extra metadata to log messages written to the journal via log.h. We keep + * track of a thread local log context onto which we can push extra metadata fields that should be logged. + * + * LOG_CONTEXT_PUSH() will add the provided field to the log context and will remove it again when the + * current block ends. LOG_CONTEXT_PUSH_STRV() will do the same but for all fields in the given strv. + * LOG_CONTEXT_PUSHF() is like LOG_CONTEXT_PUSH() but takes a format string and arguments. + * + * Using the macros is as simple as putting them anywhere inside a block to add a field to all following log + * messages logged from inside that block. + * + * void myfunction(...) { + * ... + * + * LOG_CONTEXT_PUSHF("MYMETADATA=%s", "abc"); + * + * // Every journal message logged will now have the MYMETADATA=abc + * // field included. + * } + * + * One special case to note is async code, where we use callbacks that are invoked to continue processing + * when some event occurs. For async code, there's usually an associated "userdata" struct containing all the + * information associated with the async operation. 
In this "userdata" struct, we can store a log context + * allocated with log_context_new() and released with log_context_unref(). We can then add and remove fields to + * the `fields` member of the log context object and all those fields will be logged along with each log + * message. + */ + +typedef struct LogContext LogContext; + +bool log_context_enabled(void); + +LogContext* log_context_new(const char *key, const char *value); +LogContext* log_context_new_strv(char **fields, bool owned); +LogContext* log_context_new_iov(struct iovec *input_iovec, size_t n_input_iovec, bool owned); + +/* Same as log_context_new_strv()/log_context_new_iov() with owned=true, but free the given strv/iovec on failure. */ +LogContext* log_context_new_strv_consume(char **fields); +LogContext* log_context_new_iov_consume(struct iovec *input_iovec, size_t n_input_iovec); + +LogContext *log_context_ref(LogContext *c); +LogContext *log_context_unref(LogContext *c); + +DEFINE_TRIVIAL_CLEANUP_FUNC(LogContext*, log_context_unref); + +/* Returns the number of attached log context objects. */ +size_t log_context_num_contexts(void); +/* Returns the number of fields in all attached log contexts. */ +size_t log_context_num_fields(void); + +#define LOG_CONTEXT_PUSH(...) \ + LOG_CONTEXT_PUSH_STRV(STRV_MAKE(__VA_ARGS__)) + +#define LOG_CONTEXT_PUSHF(...) 
\ + LOG_CONTEXT_PUSH(snprintf_ok((char[LINE_MAX]) {}, LINE_MAX, __VA_ARGS__)) + +#define _LOG_CONTEXT_PUSH_KEY_VALUE(key, value, c) \ + _unused_ _cleanup_(log_context_unrefp) LogContext *c = log_context_new(key, value); + +#define LOG_CONTEXT_PUSH_KEY_VALUE(key, value) \ + _LOG_CONTEXT_PUSH_KEY_VALUE(key, value, UNIQ_T(c, UNIQ)) + +#define _LOG_CONTEXT_PUSH_STRV(strv, c) \ + _unused_ _cleanup_(log_context_unrefp) LogContext *c = log_context_new_strv(strv, /*owned=*/ false); + +#define LOG_CONTEXT_PUSH_STRV(strv) \ + _LOG_CONTEXT_PUSH_STRV(strv, UNIQ_T(c, UNIQ)) + +#define _LOG_CONTEXT_PUSH_IOV(input_iovec, n_input_iovec, c) \ + _unused_ _cleanup_(log_context_unrefp) LogContext *c = log_context_new_iov(input_iovec, n_input_iovec, /*owned=*/ false); + +#define LOG_CONTEXT_PUSH_IOV(input_iovec, n_input_iovec) \ + _LOG_CONTEXT_PUSH_IOV(input_iovec, n_input_iovec, UNIQ_T(c, UNIQ)) + +/* LOG_CONTEXT_CONSUME_STR()/LOG_CONTEXT_CONSUME_STRV()/LOG_CONTEXT_CONSUME_IOV() are identical to + * LOG_CONTEXT_PUSH()/LOG_CONTEXT_PUSH_STRV()/LOG_CONTEXT_PUSH_IOV() except they take ownership of the + * given str/strv/iovec argument. 
+ */ + +#define _LOG_CONTEXT_CONSUME_STR(s, c, strv) \ + _unused_ _cleanup_strv_free_ char **strv = strv_new(s); \ + if (!strv) \ + free(s); \ + _unused_ _cleanup_(log_context_unrefp) LogContext *c = log_context_new_strv_consume(TAKE_PTR(strv)) + +#define LOG_CONTEXT_CONSUME_STR(s) \ + _LOG_CONTEXT_CONSUME_STR(s, UNIQ_T(c, UNIQ), UNIQ_T(sv, UNIQ)) + +#define _LOG_CONTEXT_CONSUME_STRV(strv, c) \ + _unused_ _cleanup_(log_context_unrefp) LogContext *c = log_context_new_strv_consume(strv); + +#define LOG_CONTEXT_CONSUME_STRV(strv) \ + _LOG_CONTEXT_CONSUME_STRV(strv, UNIQ_T(c, UNIQ)) + +#define _LOG_CONTEXT_CONSUME_IOV(input_iovec, n_input_iovec, c) \ + _unused_ _cleanup_(log_context_unrefp) LogContext *c = log_context_new_iov_consume(input_iovec, n_input_iovec); + +#define LOG_CONTEXT_CONSUME_IOV(input_iovec, n_input_iovec) \ + _LOG_CONTEXT_CONSUME_IOV(input_iovec, n_input_iovec, UNIQ_T(c, UNIQ)) diff --git a/src/basic/login-util.c b/src/basic/login-util.c new file mode 100644 index 0000000..044e8b7 --- /dev/null +++ b/src/basic/login-util.c @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "login-util.h" +#include "string-util.h" + +bool session_id_valid(const char *id) { + + if (isempty(id)) + return false; + + return id[strspn(id, LETTERS DIGITS)] == '\0'; +} diff --git a/src/basic/login-util.h b/src/basic/login-util.h new file mode 100644 index 0000000..4c9cae0 --- /dev/null +++ b/src/basic/login-util.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#define SD_LOGIND_ROOT_CHECK_INHIBITORS (UINT64_C(1) << 0) +#define SD_LOGIND_REBOOT_VIA_KEXEC (UINT64_C(1) << 1) +#define SD_LOGIND_SOFT_REBOOT (UINT64_C(1) << 2) +#define SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP (UINT64_C(1) << 3) + +/* For internal use only */ +#define SD_LOGIND_INTERACTIVE (UINT64_C(1) << 63) + +#define SD_LOGIND_SHUTDOWN_AND_SLEEP_FLAGS_PUBLIC 
(SD_LOGIND_ROOT_CHECK_INHIBITORS|SD_LOGIND_REBOOT_VIA_KEXEC|SD_LOGIND_SOFT_REBOOT|SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP) +#define SD_LOGIND_SHUTDOWN_AND_SLEEP_FLAGS_ALL (SD_LOGIND_SHUTDOWN_AND_SLEEP_FLAGS_PUBLIC|SD_LOGIND_INTERACTIVE) + +bool session_id_valid(const char *id); + +static inline bool logind_running(void) { + return access("/run/systemd/seats/", F_OK) >= 0; +} diff --git a/src/basic/macro.h b/src/basic/macro.h new file mode 100644 index 0000000..d63aa81 --- /dev/null +++ b/src/basic/macro.h @@ -0,0 +1,392 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "constants.h" +#include "macro-fundamental.h" + +/* Note: on GCC "no_sanitize_address" is a function attribute only, on llvm it may also be applied to global + * variables. We define a specific macro which knows this. Note that on GCC we don't need this decorator so much, since + * our primary use case for this attribute is registration structures placed in named ELF sections which shall not be + * padded, but GCC doesn't pad those anyway if AddressSanitizer is enabled. 
*/ +#if HAS_FEATURE_ADDRESS_SANITIZER && defined(__clang__) +#define _variable_no_sanitize_address_ __attribute__((__no_sanitize_address__)) +#else +#define _variable_no_sanitize_address_ +#endif + +/* Apparently there's no has_feature() call defined to check for ubsan, hence let's define this + * unconditionally on llvm */ +#if defined(__clang__) +#define _function_no_sanitize_float_cast_overflow_ __attribute__((no_sanitize("float-cast-overflow"))) +#else +#define _function_no_sanitize_float_cast_overflow_ +#endif + +#if HAVE_WSTRINGOP_TRUNCATION +# define DISABLE_WARNING_STRINGOP_TRUNCATION \ + _Pragma("GCC diagnostic push"); \ + _Pragma("GCC diagnostic ignored \"-Wstringop-truncation\"") +#else +# define DISABLE_WARNING_STRINGOP_TRUNCATION \ + _Pragma("GCC diagnostic push") +#endif + +/* test harness */ +#define EXIT_TEST_SKIP 77 + +/* builtins */ +#if __SIZEOF_INT__ == 4 +#define BUILTIN_FFS_U32(x) __builtin_ffs(x); +#elif __SIZEOF_LONG__ == 4 +#define BUILTIN_FFS_U32(x) __builtin_ffsl(x); +#else +#error "neither int nor long are four bytes long?!?" +#endif + +static inline uint64_t u64_multiply_safe(uint64_t a, uint64_t b) { + if (_unlikely_(a != 0 && b > (UINT64_MAX / a))) + return 0; /* overflow */ + + return a * b; +} + +/* round up to the nearest power-of-2; exact powers of 2 are returned unchanged (0 => 0, overflow => 0) */ +static inline unsigned long ALIGN_POWER2(unsigned long u) { + + /* Avoid subtraction overflow */ + if (u == 0) + return 0; + + /* clz(0) is undefined */ + if (u == 1) + return 1; + + /* left-shift overflow is undefined */ + if (__builtin_clzl(u - 1UL) < 1) + return 0; + + return 1UL << (sizeof(u) * 8 - __builtin_clzl(u - 1UL)); +} + +static inline size_t GREEDY_ALLOC_ROUND_UP(size_t l) { + size_t m; + + /* Round up allocation sizes a bit to some reasonable, likely larger value. This is supposed to be + * used for cases which are likely called in an allocation loop of some form, i.e. that repetitively + * grow stuff, for example strv_extend() and suchlike. 
+ * + * Note the difference to GREEDY_REALLOC() here, as this helper operates on a single size value only, + * and rounds up to the next power of 2, needing no further counter. + * + * Note the benefits over direct ALIGN_POWER2() usage: type-safety for size_t, sane handling for very + * small (i.e. <= 2) and safe handling for very large (i.e. > SSIZE_MAX) values. */ + + if (l <= 2) + return 2; /* Never allocate less than 2 of something. */ + + m = ALIGN_POWER2(l); + if (m == 0) /* overflow? */ + return l; + + return m; +} + +/* + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + */ +#define container_of(ptr, type, member) __container_of(UNIQ, (ptr), type, member) +#define __container_of(uniq, ptr, type, member) \ + ({ \ + const typeof( ((type*)0)->member ) *UNIQ_T(A, uniq) = (ptr); \ + (type*)( (char *)UNIQ_T(A, uniq) - offsetof(type, member) ); \ + }) + +#ifdef __COVERITY__ + +/* Use special definitions of assertion macros in order to prevent + * false positives of ASSERT_SIDE_EFFECT on Coverity static analyzer + * for uses of assert_se() and assert_return(). + * + * These definitions make expression go through a (trivial) function + * call to ensure they are not discarded. Also use ! or !! to ensure + * the boolean expressions are seen as such. 
+ * + * This technique has been described and recommended in: + * https://community.synopsys.com/s/question/0D534000046Yuzb/suppressing-assertsideeffect-for-functions-that-allow-for-sideeffects + */ + +extern void __coverity_panic__(void); + +static inline void __coverity_check__(int condition) { + if (!condition) + __coverity_panic__(); +} + +static inline int __coverity_check_and_return__(int condition) { + return condition; +} + +#define assert_message_se(expr, message) __coverity_check__(!!(expr)) + +#define assert_log(expr, message) __coverity_check_and_return__(!!(expr)) + +#else /* ! __COVERITY__ */ + +#define assert_message_se(expr, message) \ + do { \ + if (_unlikely_(!(expr))) \ + log_assert_failed(message, PROJECT_FILE, __LINE__, __func__); \ + } while (false) + +#define assert_log(expr, message) ((_likely_(expr)) \ + ? (true) \ + : (log_assert_failed_return(message, PROJECT_FILE, __LINE__, __func__), false)) + +#endif /* __COVERITY__ */ + +#define assert_se(expr) assert_message_se(expr, #expr) + +/* We override the glibc assert() here. 
*/ +#undef assert +#ifdef NDEBUG +#define assert(expr) ({ if (!(expr)) __builtin_unreachable(); }) +#else +#define assert(expr) assert_message_se(expr, #expr) +#endif + +#define assert_not_reached() \ + log_assert_failed_unreachable(PROJECT_FILE, __LINE__, __func__) + +#define assert_return(expr, r) \ + do { \ + if (!assert_log(expr, #expr)) \ + return (r); \ + } while (false) + +#define assert_return_errno(expr, r, err) \ + do { \ + if (!assert_log(expr, #expr)) { \ + errno = err; \ + return (r); \ + } \ + } while (false) + +#define return_with_errno(r, err) \ + do { \ + errno = abs(err); \ + return r; \ + } while (false) + +#define PTR_TO_INT(p) ((int) ((intptr_t) (p))) +#define INT_TO_PTR(u) ((void *) ((intptr_t) (u))) +#define PTR_TO_UINT(p) ((unsigned) ((uintptr_t) (p))) +#define UINT_TO_PTR(u) ((void *) ((uintptr_t) (u))) + +#define PTR_TO_LONG(p) ((long) ((intptr_t) (p))) +#define LONG_TO_PTR(u) ((void *) ((intptr_t) (u))) +#define PTR_TO_ULONG(p) ((unsigned long) ((uintptr_t) (p))) +#define ULONG_TO_PTR(u) ((void *) ((uintptr_t) (u))) + +#define PTR_TO_UINT8(p) ((uint8_t) ((uintptr_t) (p))) +#define UINT8_TO_PTR(u) ((void *) ((uintptr_t) (u))) + +#define PTR_TO_INT32(p) ((int32_t) ((intptr_t) (p))) +#define INT32_TO_PTR(u) ((void *) ((intptr_t) (u))) +#define PTR_TO_UINT32(p) ((uint32_t) ((uintptr_t) (p))) +#define UINT32_TO_PTR(u) ((void *) ((uintptr_t) (u))) + +#define PTR_TO_INT64(p) ((int64_t) ((intptr_t) (p))) +#define INT64_TO_PTR(u) ((void *) ((intptr_t) (u))) +#define PTR_TO_UINT64(p) ((uint64_t) ((uintptr_t) (p))) +#define UINT64_TO_PTR(u) ((void *) ((uintptr_t) (u))) + +#define PTR_TO_SIZE(p) ((size_t) ((uintptr_t) (p))) +#define SIZE_TO_PTR(u) ((void *) ((uintptr_t) (u))) + +#define CHAR_TO_STR(x) ((char[2]) { x, 0 }) + +#define char_array_0(x) x[sizeof(x)-1] = 0; + +#define sizeof_field(struct_type, member) sizeof(((struct_type *) 0)->member) +#define endoffsetof_field(struct_type, member) (offsetof(struct_type, member) + 
sizeof_field(struct_type, member)) + +/* Maximum buffer size needed for formatting an unsigned integer type as hex, including space for '0x' + * prefix and trailing NUL suffix. */ +#define HEXADECIMAL_STR_MAX(type) (2 + sizeof(type) * 2 + 1) + +/* Returns the number of chars needed to format variables of the specified type as a decimal string. Adds in + * extra space for a negative '-' prefix for signed types. Includes space for the trailing NUL. */ +#define DECIMAL_STR_MAX(type) \ + ((size_t) IS_SIGNED_INTEGER_TYPE(type) + 1U + \ + (sizeof(type) <= 1 ? 3U : \ + sizeof(type) <= 2 ? 5U : \ + sizeof(type) <= 4 ? 10U : \ + sizeof(type) <= 8 ? (IS_SIGNED_INTEGER_TYPE(type) ? 19U : 20U) : sizeof(int[-2*(sizeof(type) > 8)]))) + +/* Returns the number of chars needed to format the specified integer value. It's hence more specific than + * DECIMAL_STR_MAX() which answers the same question for all possible values of the specified type. Does + * *not* include space for a trailing NUL. (If you wonder why we special case _x_ == 0 here: it's to trick + * out gcc's -Wtype-limits, which would complain on comparing an unsigned type with < 0, otherwise. By + * special-casing == 0 here first, we can use <= 0 instead of < 0 to trick out gcc.) */ +#define DECIMAL_STR_WIDTH(x) \ + ({ \ + typeof(x) _x_ = (x); \ + size_t ans; \ + if (_x_ == 0) \ + ans = 1; \ + else { \ + ans = _x_ <= 0 ? 2 : 1; \ + while ((_x_ /= 10) != 0) \ + ans++; \ + } \ + ans; \ + }) + +#define SWAP_TWO(x, y) do { \ + typeof(x) _t = (x); \ + (x) = (y); \ + (y) = (_t); \ + } while (false) + +#define STRV_MAKE(...) ((char**) ((const char*[]) { __VA_ARGS__, NULL })) +#define STRV_MAKE_EMPTY ((char*[1]) { NULL }) +#define STRV_MAKE_CONST(...) ((const char* const*) ((const char*[]) { __VA_ARGS__, NULL })) + +/* Pointers range from NULL to POINTER_MAX */ +#define POINTER_MAX ((void*) UINTPTR_MAX) + +/* Iterates through a specified list of pointers. Accepts NULL pointers, but uses POINTER_MAX as internal marker for EOL. 
*/ +#define FOREACH_POINTER(p, x, ...) \ + for (typeof(p) *_l = (typeof(p)[]) { ({ p = x; }), ##__VA_ARGS__, POINTER_MAX }; \ + p != (typeof(p)) POINTER_MAX; \ + p = *(++_l)) + +#define _FOREACH_ARRAY(i, array, num, m, end) \ + for (typeof(array[0]) *i = (array), *end = ({ \ + typeof(num) m = (num); \ + (i && m > 0) ? i + m : NULL; \ + }); end && i < end; i++) + +#define FOREACH_ARRAY(i, array, num) \ + _FOREACH_ARRAY(i, array, num, UNIQ_T(m, UNIQ), UNIQ_T(end, UNIQ)) + +#define _DEFINE_TRIVIAL_REF_FUNC(type, name, scope) \ + scope type *name##_ref(type *p) { \ + if (!p) \ + return NULL; \ + \ + /* For type check. */ \ + unsigned *q = &p->n_ref; \ + assert(*q > 0); \ + assert_se(*q < UINT_MAX); \ + \ + (*q)++; \ + return p; \ + } + +#define _DEFINE_TRIVIAL_UNREF_FUNC(type, name, free_func, scope) \ + scope type *name##_unref(type *p) { \ + if (!p) \ + return NULL; \ + \ + assert(p->n_ref > 0); \ + p->n_ref--; \ + if (p->n_ref > 0) \ + return NULL; \ + \ + return free_func(p); \ + } + +#define DEFINE_TRIVIAL_REF_FUNC(type, name) \ + _DEFINE_TRIVIAL_REF_FUNC(type, name,) +#define DEFINE_PRIVATE_TRIVIAL_REF_FUNC(type, name) \ + _DEFINE_TRIVIAL_REF_FUNC(type, name, static) +#define DEFINE_PUBLIC_TRIVIAL_REF_FUNC(type, name) \ + _DEFINE_TRIVIAL_REF_FUNC(type, name, _public_) + +#define DEFINE_TRIVIAL_UNREF_FUNC(type, name, free_func) \ + _DEFINE_TRIVIAL_UNREF_FUNC(type, name, free_func,) +#define DEFINE_PRIVATE_TRIVIAL_UNREF_FUNC(type, name, free_func) \ + _DEFINE_TRIVIAL_UNREF_FUNC(type, name, free_func, static) +#define DEFINE_PUBLIC_TRIVIAL_UNREF_FUNC(type, name, free_func) \ + _DEFINE_TRIVIAL_UNREF_FUNC(type, name, free_func, _public_) + +#define DEFINE_TRIVIAL_REF_UNREF_FUNC(type, name, free_func) \ + DEFINE_TRIVIAL_REF_FUNC(type, name); \ + DEFINE_TRIVIAL_UNREF_FUNC(type, name, free_func); + +#define DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(type, name, free_func) \ + DEFINE_PRIVATE_TRIVIAL_REF_FUNC(type, name); \ + DEFINE_PRIVATE_TRIVIAL_UNREF_FUNC(type, name, 
free_func); + +#define DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(type, name, free_func) \ + DEFINE_PUBLIC_TRIVIAL_REF_FUNC(type, name); \ + DEFINE_PUBLIC_TRIVIAL_UNREF_FUNC(type, name, free_func); + +/* A macro to force copying of a variable from memory. This is useful whenever we want to read something from + * memory and want to make sure the compiler won't optimize away the destination variable for us. It's not + * supposed to be a full CPU memory barrier, i.e. CPU is still allowed to reorder the reads, but it is not + * allowed to remove our local copies of the variables. We want this to work for unaligned memory, hence + * memcpy() is great for our purposes. */ +#define READ_NOW(x) \ + ({ \ + typeof(x) _copy; \ + memcpy(&_copy, &(x), sizeof(_copy)); \ + asm volatile ("" : : : "memory"); \ + _copy; \ + }) + +#define saturate_add(x, y, limit) \ + ({ \ + typeof(limit) _x = (x); \ + typeof(limit) _y = (y); \ + _x > (limit) || _y >= (limit) - _x ? (limit) : _x + _y; \ + }) + +static inline size_t size_add(size_t x, size_t y) { + return saturate_add(x, y, SIZE_MAX); +} + +typedef struct { + int _empty[0]; +} dummy_t; + +assert_cc(sizeof(dummy_t) == 0); + +/* A little helper for subtracting 1 off a pointer in a safe UB-free way. This is intended to be used for + * loops that count down from a high pointer until some base. A naive loop would implement this like this: + * + * for (p = end-1; p >= base; p--) … + * + * But this is not safe because p before the base is UB in C. With this macro the loop becomes this instead: + * + * for (p = PTR_SUB1(end, base); p; p = PTR_SUB1(p, base)) … + * + * And is free from UB! */ +#define PTR_SUB1(p, base) \ + ({ \ + typeof(p) _q = (p); \ + _q && _q > (base) ? &_q[-1] : NULL; \ + }) + +/* Iterate through each variadic arg. All must be the same type as 'entry' or must be implicitly + * convertible. The iteration variable 'entry' must already be defined. */ +#define VA_ARGS_FOREACH(entry, ...) 
\ + _VA_ARGS_FOREACH(entry, UNIQ_T(_entries_, UNIQ), UNIQ_T(_current_, UNIQ), UNIQ_T(_va_sentinel_, UNIQ), ##__VA_ARGS__) +#define _VA_ARGS_FOREACH(entry, _entries_, _current_, _va_sentinel_, ...) \ + for (typeof(entry) _va_sentinel_[1] = {}, _entries_[] = { __VA_ARGS__ __VA_OPT__(,) _va_sentinel_[0] }, *_current_ = _entries_; \ + ((long)(_current_ - _entries_) < (long)(ELEMENTSOF(_entries_) - 1)) && ({ entry = *_current_; true; }); \ + _current_++) + +#include "log.h" diff --git a/src/basic/mallinfo-util.h b/src/basic/mallinfo-util.h new file mode 100644 index 0000000..7fa9dd5 --- /dev/null +++ b/src/basic/mallinfo-util.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#if HAVE_MALLINFO2 +# define HAVE_GENERIC_MALLINFO 1 +typedef struct mallinfo2 generic_mallinfo; +static inline generic_mallinfo generic_mallinfo_get(void) { + return mallinfo2(); +} +#elif HAVE_MALLINFO +# define HAVE_GENERIC_MALLINFO 1 +typedef struct mallinfo generic_mallinfo; +static inline generic_mallinfo generic_mallinfo_get(void) { + /* glibc has deprecated mallinfo(), let's suppress the deprecation warning if mallinfo2() doesn't + * exist yet. */ +DISABLE_WARNING_DEPRECATED_DECLARATIONS + return mallinfo(); +REENABLE_WARNING +} +#else +# define HAVE_GENERIC_MALLINFO 0 +#endif diff --git a/src/basic/math-util.h b/src/basic/math-util.h new file mode 100644 index 0000000..24023cd --- /dev/null +++ b/src/basic/math-util.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +/* At some optimization levels, iszero(x) is converted to (x == 0.0), which triggers the warning -Wfloat-equal. + * The argument must be a floating point, i.e. one of float, double, or long double. */ +#define iszero_safe(x) (fpclassify(x) == FP_ZERO) + +/* To avoid x == y and triggering compile warning -Wfloat-equal. This returns false if one of the arguments is + * NaN or infinity.
One of the arguments must be a floating point. */ +#define fp_equal(x, y) iszero_safe((x) - (y)) diff --git a/src/basic/memfd-util.c b/src/basic/memfd-util.c new file mode 100644 index 0000000..92b84f9 --- /dev/null +++ b/src/basic/memfd-util.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#if HAVE_LINUX_MEMFD_H +#include +#endif +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "macro.h" +#include "memfd-util.h" +#include "missing_fcntl.h" +#include "missing_mman.h" +#include "missing_syscall.h" +#include "string-util.h" +#include "utf8.h" + +int memfd_create_wrapper(const char *name, unsigned mode) { + unsigned mode_compat; + int mfd; + + mfd = RET_NERRNO(memfd_create(name, mode)); + if (mfd != -EINVAL) + return mfd; + + mode_compat = mode & ~(MFD_EXEC | MFD_NOEXEC_SEAL); + + if (mode == mode_compat) + return mfd; + + return RET_NERRNO(memfd_create(name, mode_compat)); +} + +int memfd_new(const char *name) { + _cleanup_free_ char *g = NULL; + + if (!name) { + char pr[17] = {}; + + /* If no name is specified we generate one.
We include + * a hint indicating our library implementation, and + * add the thread name to it */ + + assert_se(prctl(PR_GET_NAME, (unsigned long) pr) >= 0); + + if (isempty(pr)) + name = "sd"; + else { + _cleanup_free_ char *e = NULL; + + e = utf8_escape_invalid(pr); + if (!e) + return -ENOMEM; + + g = strjoin("sd-", e); + if (!g) + return -ENOMEM; + + name = g; + } + } + + return memfd_create_wrapper(name, MFD_ALLOW_SEALING | MFD_CLOEXEC | MFD_NOEXEC_SEAL); +} + +int memfd_add_seals(int fd, unsigned int seals) { + assert(fd >= 0); + + return RET_NERRNO(fcntl(fd, F_ADD_SEALS, seals)); +} + +int memfd_get_seals(int fd, unsigned int *ret_seals) { + int r; + + assert(fd >= 0); + + r = RET_NERRNO(fcntl(fd, F_GET_SEALS)); + if (r < 0) + return r; + + if (ret_seals) + *ret_seals = r; + return 0; +} + +int memfd_map(int fd, uint64_t offset, size_t size, void **p) { + unsigned int seals; + void *q; + int r; + + assert(fd >= 0); + assert(size > 0); + assert(p); + + r = memfd_get_seals(fd, &seals); + if (r < 0) + return r; + + if (seals & F_SEAL_WRITE) + q = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, offset); + else + q = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset); + if (q == MAP_FAILED) + return -errno; + + *p = q; + return 0; +} + +int memfd_set_sealed(int fd) { + return memfd_add_seals(fd, F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE); +} + +int memfd_get_sealed(int fd) { + unsigned int seals; + int r; + + r = memfd_get_seals(fd, &seals); + if (r < 0) + return r; + + /* We ignore F_SEAL_EXEC here to support older kernels. 
*/ + return FLAGS_SET(seals, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE); +} + +int memfd_get_size(int fd, uint64_t *sz) { + struct stat stat; + + assert(fd >= 0); + assert(sz); + + if (fstat(fd, &stat) < 0) + return -errno; + + *sz = stat.st_size; + return 0; +} + +int memfd_set_size(int fd, uint64_t sz) { + assert(fd >= 0); + + return RET_NERRNO(ftruncate(fd, sz)); +} + +int memfd_new_and_map(const char *name, size_t sz, void **p) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(sz > 0); + assert(p); + + fd = memfd_new(name); + if (fd < 0) + return fd; + + r = memfd_set_size(fd, sz); + if (r < 0) + return r; + + r = memfd_map(fd, 0, sz, p); + if (r < 0) + return r; + + return TAKE_FD(fd); +} + +int memfd_new_and_seal(const char *name, const void *data, size_t sz) { + _cleanup_close_ int fd = -EBADF; + ssize_t n; + off_t f; + int r; + + assert(data || sz == 0); + + fd = memfd_new(name); + if (fd < 0) + return fd; + + if (sz > 0) { + n = write(fd, data, sz); + if (n < 0) + return -errno; + if ((size_t) n != sz) + return -EIO; + + f = lseek(fd, 0, SEEK_SET); + if (f != 0) + return -errno; + } + + r = memfd_set_sealed(fd); + if (r < 0) + return r; + + return TAKE_FD(fd); +} diff --git a/src/basic/memfd-util.h b/src/basic/memfd-util.h new file mode 100644 index 0000000..9b2103e --- /dev/null +++ b/src/basic/memfd-util.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +int memfd_create_wrapper(const char *name, unsigned mode); + +int memfd_new(const char *name); +int memfd_new_and_map(const char *name, size_t sz, void **p); +int memfd_new_and_seal(const char *name, const void *data, size_t sz); + +int memfd_add_seals(int fd, unsigned int seals); +int memfd_get_seals(int fd, unsigned int *ret_seals); +int memfd_map(int fd, uint64_t offset, size_t size, void **p); + +int memfd_set_sealed(int fd); +int memfd_get_sealed(int fd); + +int memfd_get_size(int fd, uint64_t *sz); +int 
memfd_set_size(int fd, uint64_t sz); diff --git a/src/basic/memory-util.c b/src/basic/memory-util.c new file mode 100644 index 0000000..fcedae2 --- /dev/null +++ b/src/basic/memory-util.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "memory-util.h" +#include "missing_threads.h" + +size_t page_size(void) { + static thread_local size_t pgsz = 0; + long r; + + if (_likely_(pgsz > 0)) + return pgsz; + + r = sysconf(_SC_PAGESIZE); + assert(r > 0); + + pgsz = (size_t) r; + return pgsz; +} + +bool memeqbyte(uint8_t byte, const void *data, size_t length) { + /* Does the buffer consist entirely of the same specific byte value? + * Copied from https://github.com/systemd/casync/, copied in turn from + * https://github.com/rustyrussell/ccan/blob/master/ccan/mem/mem.c#L92, + * which is licensed CC-0. + */ + + const uint8_t *p = data; + + /* Check first 16 bytes manually */ + for (size_t i = 0; i < 16; i++, length--) { + if (length == 0) + return true; + if (p[i] != byte) + return false; + } + + /* Now we know first 16 bytes match, memcmp() with self. */ + return memcmp(data, p + 16, length) == 0; +} diff --git a/src/basic/memory-util.h b/src/basic/memory-util.h new file mode 100644 index 0000000..1179513 --- /dev/null +++ b/src/basic/memory-util.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "macro.h" +#include "memory-util-fundamental.h" + +size_t page_size(void) _pure_; +#define PAGE_ALIGN(l) ALIGN_TO(l, page_size()) +#define PAGE_ALIGN_U64(l) ALIGN_TO_U64(l, page_size()) +#define PAGE_ALIGN_DOWN(l) ALIGN_DOWN(l, page_size()) +#define PAGE_ALIGN_DOWN_U64(l) ALIGN_DOWN_U64(l, page_size()) +#define PAGE_OFFSET(l) ALIGN_OFFSET(l, page_size()) +#define PAGE_OFFSET_U64(l) ALIGN_OFFSET_U64(l, page_size()) + +/* Normal memcpy() requires src to be nonnull. We do nothing if n is 0. 
*/ +static inline void *memcpy_safe(void *dst, const void *src, size_t n) { + if (n == 0) + return dst; + assert(src); + return memcpy(dst, src, n); +} + +/* Normal mempcpy() requires src to be nonnull. We do nothing if n is 0. */ +static inline void *mempcpy_safe(void *dst, const void *src, size_t n) { + if (n == 0) + return dst; + assert(src); + return mempcpy(dst, src, n); +} + +/* Normal memcmp() requires s1 and s2 to be nonnull. We do nothing if n is 0. */ +static inline int memcmp_safe(const void *s1, const void *s2, size_t n) { + if (n == 0) + return 0; + assert(s1); + assert(s2); + return memcmp(s1, s2, n); +} + +/* Compare s1 (length n1) with s2 (length n2) in lexicographic order. */ +static inline int memcmp_nn(const void *s1, size_t n1, const void *s2, size_t n2) { + return memcmp_safe(s1, s2, MIN(n1, n2)) + ?: CMP(n1, n2); +} + +#define zero(x) (memzero(&(x), sizeof(x))) + +bool memeqbyte(uint8_t byte, const void *data, size_t length); + +#define memeqzero(data, length) memeqbyte(0x00, data, length) + +#define eqzero(x) memeqzero(x, sizeof(x)) + +static inline void *mempset(void *s, int c, size_t n) { + memset(s, c, n); + return (uint8_t*)s + n; +} + +/* Normal memmem() requires haystack to be nonnull, which is annoying for zero-length buffers */ +static inline void *memmem_safe(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen) { + + if (needlelen <= 0) + return (void*) haystack; + + if (haystacklen < needlelen) + return NULL; + + assert(haystack); + assert(needle); + + return memmem(haystack, haystacklen, needle, needlelen); +} + +static inline void *mempmem_safe(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen) { + const uint8_t *p; + + p = memmem_safe(haystack, haystacklen, needle, needlelen); + if (!p) + return NULL; + + return (uint8_t*) p + needlelen; +} + +static inline void* erase_and_free(void *p) { + size_t l; + + if (!p) + return NULL; + + l = MALLOC_SIZEOF_SAFE(p); + 
explicit_bzero_safe(p, l); + return mfree(p); +} + +static inline void erase_and_freep(void *p) { + erase_and_free(*(void**) p); +} + +/* Use with _cleanup_ to erase a single 'char' when leaving scope */ +static inline void erase_char(char *p) { + explicit_bzero_safe(p, sizeof(char)); +} diff --git a/src/basic/mempool.c b/src/basic/mempool.c new file mode 100644 index 0000000..391f29b --- /dev/null +++ b/src/basic/mempool.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "format-util.h" +#include "macro.h" +#include "memory-util.h" +#include "mempool.h" + +struct pool { + struct pool *next; + size_t n_tiles; + size_t n_used; +}; + +static void* pool_ptr(struct pool *p) { + return ((uint8_t*) ASSERT_PTR(p)) + ALIGN(sizeof(struct pool)); +} + +void* mempool_alloc_tile(struct mempool *mp) { + size_t i; + + /* When a tile is released we add it to the list and simply + * place the next pointer at its offset 0. */ + + assert(mp); + assert(mp->tile_size >= sizeof(void*)); + assert(mp->at_least > 0); + + if (mp->freelist) { + void *t; + + t = mp->freelist; + mp->freelist = *(void**) mp->freelist; + return t; + } + + if (_unlikely_(!mp->first_pool) || + _unlikely_(mp->first_pool->n_used >= mp->first_pool->n_tiles)) { + size_t size, n; + struct pool *p; + + n = mp->first_pool ? 
mp->first_pool->n_tiles : 0; + n = MAX(mp->at_least, n * 2); + size = PAGE_ALIGN(ALIGN(sizeof(struct pool)) + n*mp->tile_size); + n = (size - ALIGN(sizeof(struct pool))) / mp->tile_size; + + p = malloc(size); + if (!p) + return NULL; + + p->next = mp->first_pool; + p->n_tiles = n; + p->n_used = 0; + + mp->first_pool = p; + } + + i = mp->first_pool->n_used++; + + return (uint8_t*) pool_ptr(mp->first_pool) + i*mp->tile_size; +} + +void* mempool_alloc0_tile(struct mempool *mp) { + void *p; + + p = mempool_alloc_tile(mp); + if (p) + memzero(p, mp->tile_size); + return p; +} + +void* mempool_free_tile(struct mempool *mp, void *p) { + assert(mp); + + if (!p) + return NULL; + + *(void**) p = mp->freelist; + mp->freelist = p; + + return NULL; +} + +static bool pool_contains(struct mempool *mp, struct pool *p, void *ptr) { + size_t off; + void *a; + + assert(mp); + assert(p); + + if (!ptr) + return false; + + a = pool_ptr(p); + if ((uint8_t*) ptr < (uint8_t*) a) + return false; + + off = (uint8_t*) ptr - (uint8_t*) a; + if (off >= mp->tile_size * p->n_tiles) + return false; + + assert(off % mp->tile_size == 0); + return true; +} + +static bool pool_is_unused(struct mempool *mp, struct pool *p) { + assert(mp); + assert(p); + + if (p->n_used == 0) + return true; + + /* Check if all tiles in this specific pool are in the freelist. 
*/ + size_t n = 0; + void *i = mp->freelist; + while (i) { + if (pool_contains(mp, p, i)) + n++; + + i = *(void**) i; + } + + assert(n <= p->n_used); + + return n == p->n_used; +} + +static void pool_unlink(struct mempool *mp, struct pool *p) { + size_t m = 0; + + assert(mp); + assert(p); + + if (p->n_used == 0) + return; + + void **i = &mp->freelist; + while (*i) { + void *d = *i; + + if (pool_contains(mp, p, d)) { + *i = *(void**) d; + m++; + + if (m == p->n_used) + break; + } else + i = (void**) d; + } +} + +void mempool_trim(struct mempool *mp) { + size_t trimmed = 0, left = 0; + + assert(mp); + + struct pool **p = &mp->first_pool; + while (*p) { + struct pool *d = *p; + + if (pool_is_unused(mp, d)) { + trimmed += d->n_tiles * mp->tile_size; + pool_unlink(mp, d); + *p = d->next; + free(d); + } else { + left += d->n_tiles * mp->tile_size; + p = &d->next; + } + } + + log_debug("Trimmed %s from memory pool %p. (%s left)", FORMAT_BYTES(trimmed), mp, FORMAT_BYTES(left)); +} diff --git a/src/basic/mempool.h b/src/basic/mempool.h new file mode 100644 index 0000000..ba588af --- /dev/null +++ b/src/basic/mempool.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +struct pool; + +struct mempool { + struct pool *first_pool; + void *freelist; + size_t tile_size; + size_t at_least; +}; + +void* mempool_alloc_tile(struct mempool *mp); +void* mempool_alloc0_tile(struct mempool *mp); +void* mempool_free_tile(struct mempool *mp, void *p); + +#define DEFINE_MEMPOOL(pool_name, tile_type, alloc_at_least) \ +static struct mempool pool_name = { \ + .tile_size = sizeof(tile_type), \ + .at_least = alloc_at_least, \ +} + +__attribute__((weak)) bool mempool_enabled(void); + +void mempool_trim(struct mempool *mp); diff --git a/src/basic/memstream-util.c b/src/basic/memstream-util.c new file mode 100644 index 0000000..4e147fd --- /dev/null +++ b/src/basic/memstream-util.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "memstream-util.h" + +void memstream_done(MemStream *m) { + assert(m); + + /* First, close file stream, as the buffer may be reallocated on close. */ + safe_fclose(m->f); + + /* Then, free buffer. */ + free(m->buf); +} + +FILE* memstream_init(MemStream *m) { + assert(m); + assert(!m->f); + + m->f = open_memstream_unlocked(&m->buf, &m->sz); + return m->f; +} + +int memstream_finalize(MemStream *m, char **ret_buf, size_t *ret_size) { + int r; + + assert(m); + assert(m->f); + assert(ret_buf); + + /* Add terminating NUL, so that the output buffer is a valid string. */ + fputc('\0', m->f); + + r = fflush_and_check(m->f); + if (r < 0) + return r; + + m->f = safe_fclose(m->f); + + /* On fclose(), the buffer may be reallocated, and may trigger OOM. */ + if (!m->buf) + return -ENOMEM; + + assert(m->sz > 0); + + *ret_buf = TAKE_PTR(m->buf); + if (ret_size) + *ret_size = m->sz - 1; + + m->sz = 0; /* For safety when the MemStream object will be reused later. 
*/ + return 0; +} + +int memstream_dump_internal( + int level, + int error, + const char *file, + int line, + const char *func, + MemStream *m) { + + _cleanup_free_ char *buf = NULL; + int r; + + assert(m); + + r = memstream_finalize(m, &buf, NULL); + if (r < 0) + return log_full_errno(level, r, "Failed to flush memstream: %m: %m"); + + return log_dump_internal(level, error, file, line, func, buf); +} diff --git a/src/basic/memstream-util.h b/src/basic/memstream-util.h new file mode 100644 index 0000000..1aa5651 --- /dev/null +++ b/src/basic/memstream-util.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +typedef struct MemStream { + FILE *f; + char *buf; + size_t sz; +} MemStream; + +void memstream_done(MemStream *m); +FILE* memstream_init(MemStream *m); +int memstream_finalize(MemStream *m, char **ret_buf, size_t *ret_size); + +/* This finalizes the passed memstream. */ +int memstream_dump_internal( + int level, + int error, + const char *file, + int line, + const char *func, + MemStream *m); +#define memstream_dump(level, m) \ + memstream_dump_internal(level, 0, PROJECT_FILE, __LINE__, __func__, m) diff --git a/src/basic/meson.build b/src/basic/meson.build new file mode 100644 index 0000000..d7450d8 --- /dev/null +++ b/src/basic/meson.build @@ -0,0 +1,316 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +basic_sources = files( + 'MurmurHash2.c', + 'af-list.c', + 'alloc-util.c', + 'architecture.c', + 'argv-util.c', + 'arphrd-util.c', + 'audit-util.c', + 'btrfs.c', + 'build.c', + 'bus-label.c', + 'cap-list.c', + 'capability-util.c', + 'cgroup-util.c', + 'chase.c', + 'chattr-util.c', + 'conf-files.c', + 'confidential-virt.c', + 'devnum-util.c', + 'dirent-util.c', + 'efivars.c', + 'env-file.c', + 'env-util.c', + 'errno-list.c', + 'escape.c', + 'ether-addr-util.c', + 'extract-word.c', + 'fd-util.c', + 'fileio.c', + 'filesystems.c', + 'format-util.c', + 'fs-util.c', + 'glob-util.c', + 
'glyph-util.c', + 'gunicode.c', + 'hash-funcs.c', + 'hashmap.c', + 'hexdecoct.c', + 'hmac.c', + 'hostname-util.c', + 'in-addr-util.c', + 'initrd-util.c', + 'inotify-util.c', + 'io-util.c', + 'ioprio-util.c', + 'iovec-util.c', + 'iovec-wrapper.c', + 'label.c', + 'limits-util.c', + 'locale-util.c', + 'lock-util.c', + 'log.c', + 'login-util.c', + 'memfd-util.c', + 'memory-util.c', + 'mempool.c', + 'memstream-util.c', + 'mkdir.c', + 'mountpoint-util.c', + 'namespace-util.c', + 'nulstr-util.c', + 'ordered-set.c', + 'os-util.c', + 'parse-util.c', + 'path-lookup.c', + 'path-util.c', + 'percent-util.c', + 'pidref.c', + 'prioq.c', + 'proc-cmdline.c', + 'process-util.c', + 'procfs-util.c', + 'psi-util.c', + 'random-util.c', + 'ratelimit.c', + 'recurse-dir.c', + 'replace-var.c', + 'rlimit-util.c', + 'runtime-scope.c', + 'sigbus.c', + 'signal-util.c', + 'siphash24.c', + 'socket-util.c', + 'sort-util.c', + 'stat-util.c', + 'strbuf.c', + 'string-table.c', + 'string-util.c', + 'strv.c', + 'strxcpyx.c', + 'sync-util.c', + 'sysctl-util.c', + 'syslog-util.c', + 'terminal-util.c', + 'time-util.c', + 'tmpfile-util.c', + 'uid-alloc-range.c', + 'uid-range.c', + 'unit-def.c', + 'unit-file.c', + 'unit-name.c', + 'user-util.c', + 'utf8.c', + 'virt.c', + 'xattr-util.c', +) + +missing_audit_h = files('missing_audit.h') +missing_capability_h = files('missing_capability.h') +missing_socket_h = files('missing_socket.h') + +missing_syscall_def_h = files('missing_syscall_def.h') +basic_sources += missing_syscall_def_h + +generate_af_list = find_program('generate-af-list.sh') +af_list_txt = custom_target( + 'af-list.txt', + output : 'af-list.txt', + command : [generate_af_list, cpp, config_h, missing_socket_h], + capture : true) + +generate_arphrd_list = find_program('generate-arphrd-list.sh') +arphrd_list_txt = custom_target( + 'arphrd-list.txt', + output : 'arphrd-list.txt', + command : [generate_arphrd_list, cpp, config_h], + capture : true) + +generate_cap_list = 
find_program('generate-cap-list.sh') +cap_list_txt = custom_target( + 'cap-list.txt', + output : 'cap-list.txt', + command : [generate_cap_list, cpp, config_h, missing_capability_h], + capture : true) + +generate_errno_list = find_program('generate-errno-list.sh') +errno_list_txt = custom_target( + 'errno-list.txt', + output : 'errno-list.txt', + command : [generate_errno_list, cpp], + capture : true) + +generated_gperf_headers = [] +foreach item : [['af', af_list_txt, 'af', ''], + ['arphrd', arphrd_list_txt, 'arphrd', 'ARPHRD_'], + ['cap', cap_list_txt, 'capability', ''], + ['errno', errno_list_txt, 'errno', '']] + + fname = '@0@-from-name.gperf'.format(item[0]) + gperf_file = custom_target( + fname, + input : item[1], + output : fname, + command : [generate_gperfs, item[2], item[3], '@INPUT@'], + capture : true) + + fname = '@0@-from-name.h'.format(item[0]) + target1 = custom_target( + fname, + input : gperf_file, + output : fname, + command : [gperf, + '-L', 'ANSI-C', '-t', '--ignore-case', + '-N', 'lookup_@0@'.format(item[2]), + '-H', 'hash_@0@_name'.format(item[2]), + '-p', '-C', + '@INPUT@'], + capture : true) + + fname = '@0@-to-name.h'.format(item[0]) + awkscript = '@0@-to-name.awk'.format(item[0]) + target2 = custom_target( + fname, + input : [awkscript, item[1]], + output : fname, + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + + generated_gperf_headers += [target1, target2] +endforeach + +basic_sources += generated_gperf_headers + +############################################################ + +arch_list = [ + 'alpha', + 'arc', + 'arm', + 'arm64', + 'i386', + 'ia64', + 'loongarch64', + 'm68k', + 'mips64', + 'mips64n32', + 'mipso32', + 'parisc', + 'powerpc', + 'powerpc64', + 'riscv32', + 'riscv64', + 's390', + 's390x', + 'sparc', + 'x86_64' +] + +run_target( + 'update-syscall-tables', + command : [update_syscall_tables_sh, meson.current_source_dir()] + arch_list) + +syscall_list_txt = files('syscall-list.txt') + +syscall_lists = [] 
+foreach arch: arch_list + syscall_lists += files('syscalls-@0@.txt'.format(arch)) +endforeach + +missing_syscalls_py = find_program('missing_syscalls.py') + +run_target( + 'update-syscall-header', + command : [missing_syscalls_py, + missing_syscall_def_h, + syscall_lists]) + +############################################################ + +filesystem_includes = ['linux/magic.h', + 'linux/gfs2_ondisk.h'] + +check_filesystems = find_program('check-filesystems.sh') +r = run_command([check_filesystems, cpp, files('filesystems-gperf.gperf')] + filesystem_includes, check: false) +if r.returncode() != 0 + error('Unknown filesystems defined in kernel headers:\n\n' + r.stdout()) +endif + +filesystems_gperf_h = custom_target( + 'filesystems-gperf.h', + input : 'filesystems-gperf.gperf', + output : 'filesystems-gperf.h', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +generate_filesystem_list = find_program('generate-filesystem-list.py') +fname = 'filesystem-list.h' +filesystem_list_h = custom_target( + fname, + input : 'filesystems-gperf.gperf', + output : fname, + command : [generate_filesystem_list, + '@INPUT@'], + capture : true) + +generate_filesystem_switch_case_h = find_program('generate-filesystem-switch-case.py') +fname = 'filesystem-switch-case.h' +filesystem_switch_case_h = custom_target( + fname, + input : 'filesystems-gperf.gperf', + output : 'filesystem-switch-case.h', + command : [generate_filesystem_switch_case_h, + '@INPUT@'], + capture : true) + +basic_sources += [filesystem_list_h, filesystem_switch_case_h, filesystems_gperf_h] + +libbasic = static_library( + 'basic', + basic_sources, + fundamental_sources, + include_directories : basic_includes, + dependencies : [libcap, + libm, + librt, + threads, + userspace], + c_args : ['-fvisibility=default'], + build_by_default : false) + +############################################################ + +basic_gcrypt_sources = files( + 'gcrypt-util.c', +) + +# A convenience library that is separate from 
libbasic to avoid +# unnecessary linking to libgcrypt. +libbasic_gcrypt = static_library( + 'basic-gcrypt', + basic_gcrypt_sources, + include_directories : basic_includes, + dependencies : [libgcrypt, + userspace], + c_args : ['-fvisibility=default'], + build_by_default : false) + +############################################################ + +basic_compress_sources = files( + 'compress.c', +) + +# A convenience library that is separate from libbasic to avoid unnecessary +# linking to the compression libraries. +libbasic_compress = static_library( + 'basic-compress', + basic_compress_sources, + include_directories : basic_includes, + dependencies : [liblz4, + libxz, + libzstd, + userspace], + c_args : ['-fvisibility=default'], + build_by_default : false) diff --git a/src/basic/missing_audit.h b/src/basic/missing_audit.h new file mode 100644 index 0000000..62e3c29 --- /dev/null +++ b/src/basic/missing_audit.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#if HAVE_AUDIT +#include +#endif + +#ifndef AUDIT_SERVICE_START +#define AUDIT_SERVICE_START 1130 /* Service (daemon) start */ +#endif + +#ifndef AUDIT_SERVICE_STOP +#define AUDIT_SERVICE_STOP 1131 /* Service (daemon) stop */ +#endif + +#ifndef MAX_AUDIT_MESSAGE_LENGTH +#define MAX_AUDIT_MESSAGE_LENGTH 8970 +#endif + +#ifndef AUDIT_NLGRP_MAX /* NOTE(review): the guard tests AUDIT_NLGRP_MAX but the body defines AUDIT_NLGRP_READLOG; presumably READLOG is not a preprocessor macro in the kernel header and so cannot be tested directly - confirm against linux/audit.h */ +#define AUDIT_NLGRP_READLOG 1 +#endif diff --git a/src/basic/missing_capability.h b/src/basic/missing_capability.h new file mode 100644 index 0000000..5adda55 --- /dev/null +++ b/src/basic/missing_capability.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* 3a101b8de0d39403b2c7e5c23fd0b005668acf48 (3.16) */ +#ifndef CAP_AUDIT_READ +# define CAP_AUDIT_READ 37 +#endif + +/* 980737282232b752bb14dab96d77665c15889c36 (5.8) */ +#ifndef CAP_PERFMON +# define CAP_PERFMON 38 +#endif + +/* a17b53c4a4b55ec322c132b6670743612229ee9c (5.8) */ +#ifndef CAP_BPF +# define CAP_BPF 39 +#endif + 
+/* 124ea650d3072b005457faed69909221c2905a1f (5.9) */ +#ifndef CAP_CHECKPOINT_RESTORE +# define CAP_CHECKPOINT_RESTORE 40 +#endif + +#define SYSTEMD_CAP_LAST_CAP CAP_CHECKPOINT_RESTORE + +#ifdef CAP_LAST_CAP +# if CAP_LAST_CAP > SYSTEMD_CAP_LAST_CAP +# if BUILD_MODE_DEVELOPER && defined(TEST_CAPABILITY_C) +# warning "The capability list here is outdated" +# endif +# else +# undef CAP_LAST_CAP +# endif +#endif +#ifndef CAP_LAST_CAP +# define CAP_LAST_CAP SYSTEMD_CAP_LAST_CAP +#endif diff --git a/src/basic/missing_drm.h b/src/basic/missing_drm.h new file mode 100644 index 0000000..0dec591 --- /dev/null +++ b/src/basic/missing_drm.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#ifndef DRM_IOCTL_SET_MASTER +#define DRM_IOCTL_SET_MASTER _IO('d', 0x1e) +#endif + +#ifndef DRM_IOCTL_DROP_MASTER +#define DRM_IOCTL_DROP_MASTER _IO('d', 0x1f) +#endif diff --git a/src/basic/missing_fcntl.h b/src/basic/missing_fcntl.h new file mode 100644 index 0000000..3c85bef --- /dev/null +++ b/src/basic/missing_fcntl.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_SETPIPE_SZ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#endif + +#ifndef F_GETPIPE_SZ +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +#ifndef F_SEAL_FUTURE_WRITE +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#endif + +#ifndef F_SEAL_EXEC +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ +#endif + +#ifndef F_OFD_GETLK 
+#define F_OFD_GETLK 36 +#define F_OFD_SETLK 37 +#define F_OFD_SETLKW 38 +#endif + +#ifndef MAX_HANDLE_SZ +#define MAX_HANDLE_SZ 128 +#endif + +/* The precise definition of __O_TMPFILE is arch specific; use the + * values defined by the kernel (note: some are hexa, some are octal, + * duplicated as-is from the kernel definitions): + * - alpha, parisc, sparc: each has a specific value; + * - others: they use the "generic" value. + */ + +#ifndef __O_TMPFILE +#if defined(__alpha__) +#define __O_TMPFILE 0100000000 +#elif defined(__parisc__) || defined(__hppa__) +#define __O_TMPFILE 0400000000 +#elif defined(__sparc__) || defined(__sparc64__) +#define __O_TMPFILE 0x2000000 +#else +#define __O_TMPFILE 020000000 +#endif +#endif + +/* a horrid kludge trying to make sure that this will fail on old kernels */ +#ifndef O_TMPFILE +#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +#endif + +/* So O_LARGEFILE is generally implied by glibc, and defined to zero hence, because we only build in LFS + * mode. However, when invoking fcntl(F_GETFL) the flag is ORed into the result anyway — glibc does not mask + * it away. Which sucks. Let's define the actual value here, so that we can mask it ourselves. 
+ * + * The precise definition is arch specific, so we use the values defined in the kernel (note that some + * are hexa and others are octal; duplicated as-is from the kernel definitions): + * - alpha, arm, arm64, m68k, mips, parisc, powerpc, sparc: each has a specific value; + * - others: they use the "generic" value (defined in include/uapi/asm-generic/fcntl.h) */ +#if O_LARGEFILE != 0 +#define RAW_O_LARGEFILE O_LARGEFILE +#else +#if defined(__alpha__) || defined(__arm__) || defined(__aarch64__) || defined(__m68k__) +#define RAW_O_LARGEFILE 0400000 +#elif defined(__mips__) +#define RAW_O_LARGEFILE 0x2000 +#elif defined(__parisc__) || defined(__hppa__) +#define RAW_O_LARGEFILE 000004000 +#elif defined(__powerpc__) +#define RAW_O_LARGEFILE 0200000 +#elif defined(__sparc__) +#define RAW_O_LARGEFILE 0x40000 +#else +#define RAW_O_LARGEFILE 00100000 +#endif +#endif diff --git a/src/basic/missing_fs.h b/src/basic/missing_fs.h new file mode 100644 index 0000000..9b03bba --- /dev/null +++ b/src/basic/missing_fs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* linux/fs.h */ +#ifndef RENAME_NOREPLACE /* 0a7c3937a1f23f8cb5fc77ae01661e9968a51d0c (3.15) */ +#define RENAME_NOREPLACE (1 << 0) +#endif + +#ifndef BLKGETDISKSEQ /* 7957d93bf32bc211415827e44fdd9cdf1388df59 (5.15) */ +#define BLKGETDISKSEQ _IOR(0x12,128,__u64) +#endif + +#ifndef FICLONE /* 04b38d601239b4d9be641b412cf4b7456a041c67 (4.5) */ +#define FICLONE _IOW(0x94, 9, int) +#endif + +#ifndef FICLONERANGE /* 04b38d601239b4d9be641b412cf4b7456a041c67 (4.5) */ +#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range) +struct file_clone_range { + __s64 src_fd; + __u64 src_offset; + __u64 src_length; + __u64 dest_offset; +}; +#endif + +/* linux/fs.h or sys/mount.h */ +#ifndef MS_MOVE +#define MS_MOVE 8192 +#endif + +#ifndef MS_REC +#define MS_REC 16384 +#endif + +#ifndef MS_PRIVATE +#define MS_PRIVATE (1<<18) +#endif + +#ifndef MS_SLAVE +#define MS_SLAVE (1<<19) 
+#endif + +#ifndef MS_SHARED +#define MS_SHARED (1<<20) +#endif + +#ifndef MS_RELATIME +#define MS_RELATIME (1<<21) +#endif + +#ifndef MS_KERNMOUNT +#define MS_KERNMOUNT (1<<22) +#endif + +#ifndef MS_I_VERSION +#define MS_I_VERSION (1<<23) +#endif + +#ifndef MS_STRICTATIME +#define MS_STRICTATIME (1<<24) +#endif + +#ifndef MS_LAZYTIME +#define MS_LAZYTIME (1<<25) +#endif + +/* Not exposed yet. Defined at fs/ext4/ext4.h */ +#ifndef EXT4_IOC_RESIZE_FS +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#endif + +/* linux/nsfs.h */ +#ifndef NS_GET_NSTYPE /* d95fa3c76a66b6d76b1e109ea505c55e66360f3c (4.11) */ +#define NS_GET_NSTYPE _IO(0xb7, 0x3) +#endif + +#ifndef FS_PROJINHERIT_FL +#define FS_PROJINHERIT_FL 0x20000000 +#endif + +/* linux/fscrypt.h */ +#ifndef FS_KEY_DESCRIPTOR_SIZE +#define FS_KEY_DESCRIPTOR_SIZE 8 +#endif diff --git a/src/basic/missing_input.h b/src/basic/missing_input.h new file mode 100644 index 0000000..6cf16ff --- /dev/null +++ b/src/basic/missing_input.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +/* linux@c7dc65737c9a607d3e6f8478659876074ad129b8 (3.12) */ +#ifndef EVIOCREVOKE +#define EVIOCREVOKE _IOW('E', 0x91, int) +#endif + +/* linux@06a16293f71927f756dcf37558a79c0b05a91641 (4.4) */ +#ifndef EVIOCSMASK +struct input_mask { + __u32 type; + __u32 codes_size; + __u64 codes_ptr; +}; + +#define EVIOCGMASK _IOR('E', 0x92, struct input_mask) +#define EVIOCSMASK _IOW('E', 0x93, struct input_mask) +#endif + +/* linux@7611392fe8ff95ecae528b01a815ae3d72ca6b95 (3.17) */ +#ifndef INPUT_PROP_POINTING_STICK +#define INPUT_PROP_POINTING_STICK 0x05 +#endif + +/* linux@500d4160abe9a2e88b12e319c13ae3ebd1e18108 (4.0) */ +#ifndef INPUT_PROP_ACCELEROMETER +#define INPUT_PROP_ACCELEROMETER 0x06 +#endif + +/* linux@d09bbfd2a8408a995419dff0d2ba906013cf4cc9 (3.11) */ +#ifndef BTN_DPAD_UP +#define BTN_DPAD_UP 0x220 +#define BTN_DPAD_DOWN 0x221 +#define BTN_DPAD_LEFT 0x222 +#define BTN_DPAD_RIGHT 0x223 +#endif 
+ +/* linux@358f24704f2f016af7d504b357cdf32606091d07 (3.13) */ +#ifndef KEY_ALS_TOGGLE +#define KEY_ALS_TOGGLE 0x230 +#endif diff --git a/src/basic/missing_ioprio.h b/src/basic/missing_ioprio.h new file mode 100644 index 0000000..9cbd172 --- /dev/null +++ b/src/basic/missing_ioprio.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* Match values used by the kernel internally, as no public header seems to exist. */ + +#ifndef IOPRIO_N_CLASSES +# define IOPRIO_N_CLASSES 8 +#endif + +#ifndef IOPRIO_BE_NR +# define IOPRIO_BE_NR 8 +#endif + +#ifndef IOPRIO_CLASS_NONE +# define IOPRIO_CLASS_NONE 0 +#endif +#ifndef IOPRIO_CLASS_RT +# define IOPRIO_CLASS_RT 1 +#endif +#ifndef IOPRIO_CLASS_BE +# define IOPRIO_CLASS_BE 2 +#endif +#ifndef IOPRIO_CLASS_IDLE +# define IOPRIO_CLASS_IDLE 3 +#endif + +#ifndef IOPRIO_WHO_PROCESS +# define IOPRIO_WHO_PROCESS 1 +#endif +#ifndef IOPRIO_WHO_PGRP +# define IOPRIO_WHO_PGRP 2 +#endif +#ifndef IOPRIO_WHO_USER +# define IOPRIO_WHO_USER 3 +#endif + +#ifndef IOPRIO_BITS +# define IOPRIO_BITS 16 +#endif +#ifndef IOPRIO_N_CLASSES /* repeated guard: IOPRIO_N_CLASSES is always defined by this point, so this block never fires */ +# define IOPRIO_N_CLASSES 8 +#endif +#ifndef IOPRIO_CLASS_SHIFT +# define IOPRIO_CLASS_SHIFT 13 +#endif + +static inline int ioprio_prio_class(int value) { /* class part: the bits of value at and above IOPRIO_CLASS_SHIFT */ + return value >> IOPRIO_CLASS_SHIFT; +} + +static inline int ioprio_prio_data(int value) { /* data part: the low IOPRIO_CLASS_SHIFT bits of value */ + return value & ((1 << IOPRIO_CLASS_SHIFT) - 1); +} + +static inline int ioprio_prio_value(int class, int data) { /* packs class above IOPRIO_CLASS_SHIFT and ORs in data; data is not masked here, so it must already fit in the low bits */ + return (class << IOPRIO_CLASS_SHIFT) | data; +} diff --git a/src/basic/missing_keyctl.h b/src/basic/missing_keyctl.h new file mode 100644 index 0000000..081003a --- /dev/null +++ b/src/basic/missing_keyctl.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#ifndef KEYCTL_JOIN_SESSION_KEYRING +#define KEYCTL_JOIN_SESSION_KEYRING 1 +#endif + +#ifndef KEYCTL_CHOWN +#define KEYCTL_CHOWN 4 +#endif + +#ifndef KEYCTL_SETPERM +#define KEYCTL_SETPERM 5 +#endif + 
+#ifndef KEYCTL_DESCRIBE +#define KEYCTL_DESCRIBE 6 +#endif + +#ifndef KEYCTL_LINK +#define KEYCTL_LINK 8 +#endif + +#ifndef KEYCTL_READ +#define KEYCTL_READ 11 +#endif + +#ifndef KEYCTL_SET_TIMEOUT +#define KEYCTL_SET_TIMEOUT 15 +#endif + +#ifndef KEY_SPEC_USER_KEYRING +#define KEY_SPEC_USER_KEYRING -4 +#endif + +#ifndef KEY_SPEC_SESSION_KEYRING +#define KEY_SPEC_SESSION_KEYRING -3 +#endif + +/* From linux/key.h */ +#ifndef KEY_POS_VIEW + +typedef int32_t key_serial_t; + +#define KEY_POS_VIEW 0x01000000 +#define KEY_POS_READ 0x02000000 +#define KEY_POS_WRITE 0x04000000 +#define KEY_POS_SEARCH 0x08000000 +#define KEY_POS_LINK 0x10000000 +#define KEY_POS_SETATTR 0x20000000 +#define KEY_POS_ALL 0x3f000000 + +#define KEY_USR_VIEW 0x00010000 +#define KEY_USR_READ 0x00020000 +#define KEY_USR_WRITE 0x00040000 +#define KEY_USR_SEARCH 0x00080000 +#define KEY_USR_LINK 0x00100000 +#define KEY_USR_SETATTR 0x00200000 +#define KEY_USR_ALL 0x003f0000 + +#define KEY_GRP_VIEW 0x00000100 +#define KEY_GRP_READ 0x00000200 +#define KEY_GRP_WRITE 0x00000400 +#define KEY_GRP_SEARCH 0x00000800 +#define KEY_GRP_LINK 0x00001000 +#define KEY_GRP_SETATTR 0x00002000 +#define KEY_GRP_ALL 0x00003f00 + +#define KEY_OTH_VIEW 0x00000001 +#define KEY_OTH_READ 0x00000002 +#define KEY_OTH_WRITE 0x00000004 +#define KEY_OTH_SEARCH 0x00000008 +#define KEY_OTH_LINK 0x00000010 +#define KEY_OTH_SETATTR 0x00000020 +#define KEY_OTH_ALL 0x0000003f +#endif diff --git a/src/basic/missing_loop.h b/src/basic/missing_loop.h new file mode 100644 index 0000000..7141544 --- /dev/null +++ b/src/basic/missing_loop.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#ifndef LOOP_CONFIGURE +struct loop_config { + __u32 fd; + __u32 block_size; + struct loop_info64 info; + __u64 __reserved[8]; +}; + +#define LOOP_CONFIGURE 0x4C0A +#endif + +#ifndef LO_FLAGS_DIRECT_IO +#define LO_FLAGS_DIRECT_IO 16 +#define LOOP_SET_DIRECT_IO 0x4C08 +#endif + +#ifndef 
LOOP_SET_STATUS_SETTABLE_FLAGS +#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR | LO_FLAGS_PARTSCAN | LO_FLAGS_DIRECT_IO) +#endif diff --git a/src/basic/missing_magic.h b/src/basic/missing_magic.h new file mode 100644 index 0000000..27a33ad --- /dev/null +++ b/src/basic/missing_magic.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* 62aa81d7c4c24b90fdb61da70ac0dbbc414f9939 (4.13) */ +#ifndef OCFS2_SUPER_MAGIC +#define OCFS2_SUPER_MAGIC 0x7461636f +#endif + +/* 67e9c74b8a873408c27ac9a8e4c1d1c8d72c93ff (4.5) */ +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC 0x63677270 +#endif + +/* 4282d60689d4f21b40692029080440cc58e8a17d (4.1) */ +#ifndef TRACEFS_MAGIC +#define TRACEFS_MAGIC 0x74726163 +#endif + +/* e149ed2b805fefdccf7ccdfc19eca22fdd4514ac (3.19) */ +#ifndef NSFS_MAGIC +#define NSFS_MAGIC 0x6e736673 +#endif + +/* b2197755b2633e164a439682fb05a9b5ea48f706 (4.4) */ +#ifndef BPF_FS_MAGIC +#define BPF_FS_MAGIC 0xcafe4a11 +#endif + +/* Not exposed yet (4.20). Defined at ipc/mqueue.c */ +#ifndef MQUEUE_MAGIC +#define MQUEUE_MAGIC 0x19800202 +#endif + +/* Not exposed yet (as of Linux 5.4). 
Defined in fs/xfs/libxfs/xfs_format.h */ +#ifndef XFS_SB_MAGIC +#define XFS_SB_MAGIC 0x58465342 +#endif + +/* dea2903719283c156b53741126228c4a1b40440f (5.17) */ +#ifndef CIFS_SUPER_MAGIC +#define CIFS_SUPER_MAGIC 0xFF534D42 +#endif + +/* dea2903719283c156b53741126228c4a1b40440f (5.17) */ +#ifndef SMB2_SUPER_MAGIC +#define SMB2_SUPER_MAGIC 0xFE534D42 +#endif + +/* 257f871993474e2bde6c497b54022c362cf398e1 (4.5) */ +#ifndef OVERLAYFS_SUPER_MAGIC +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#endif + +/* 2a28900be20640fcd1e548b1e3bad79e8221fcf9 (4.7) */ +#ifndef UDF_SUPER_MAGIC +#define UDF_SUPER_MAGIC 0x15013346 +#endif + +/* b1123ea6d3b3da25af5c8a9d843bd07ab63213f4 (4.8) */ +#ifndef BALLOON_KVM_MAGIC +#define BALLOON_KVM_MAGIC 0x13661366 +#endif + +/* 48b4800a1c6af2cdda344ea4e2c843dcc1f6afc9 (4.8) */ +#ifndef ZSMALLOC_MAGIC +#define ZSMALLOC_MAGIC 0x58295829 +#endif + +/* 3bc52c45bac26bf7ed1dc8d287ad1aeaed1250b6 (4.9) */ +#ifndef DAXFS_MAGIC +#define DAXFS_MAGIC 0x64646178 +#endif + +/* 5ff193fbde20df5d80fec367cea3e7856c057320 (4.10) */ +#ifndef RDTGROUP_SUPER_MAGIC +#define RDTGROUP_SUPER_MAGIC 0x7655821 +#endif + +/* a481f4d917835cad86701fc0d1e620c74bb5cd5f (4.13) */ +#ifndef AAFS_MAGIC +#define AAFS_MAGIC 0x5a3c69f0 +#endif + +/* f044c8847bb61eff5e1e95b6f6bb950e7f4a73a4 (4.15) */ +#ifndef AFS_FS_MAGIC +#define AFS_FS_MAGIC 0x6b414653 +#endif + +/* dddde68b8f06dd83486124b8d245e7bfb15c185d (4.20) */ +#ifndef XFS_SUPER_MAGIC +#define XFS_SUPER_MAGIC 0x58465342 +#endif + +/* 3ad20fe393b31025bebfc2d76964561f65df48aa (5.0) */ +#ifndef BINDERFS_SUPER_MAGIC +#define BINDERFS_SUPER_MAGIC 0x6c6f6f70 +#endif + +/* ed63bb1d1f8469586006a9ca63c42344401aa2ab (5.3) */ +#ifndef DMA_BUF_MAGIC +#define DMA_BUF_MAGIC 0x444d4142 +#endif + +/* ea8157ab2ae5e914dd427e5cfab533b6da3819cd (5.3) */ +#ifndef Z3FOLD_MAGIC +#define Z3FOLD_MAGIC 0x33 +#endif + +/* 47e4937a4a7ca4184fd282791dfee76c6799966a (5.4) */ +#ifndef EROFS_SUPER_MAGIC_V1 +#define EROFS_SUPER_MAGIC_V1 0xe0f5e1e2 +#endif + +/* 
fe030c9b85e6783bc52fe86449c0a4b8aa16c753 (5.5) */ +#ifndef PPC_CMM_MAGIC +#define PPC_CMM_MAGIC 0xc7571590 +#endif + +/* 8dcc1a9d90c10fa4143e5c17821082e5e60e46a1 (5.6) */ +#ifndef ZONEFS_MAGIC +#define ZONEFS_MAGIC 0x5a4f4653 +#endif + +/* 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 (5.8) */ +#ifndef DEVMEM_MAGIC +#define DEVMEM_MAGIC 0x454d444d +#endif + +/* Not in mainline but included in Ubuntu */ +#ifndef SHIFTFS_MAGIC +#define SHIFTFS_MAGIC 0x6a656a62 +#endif + +/* 1507f51255c9ff07d75909a84e7c0d7f3c4b2f49 (5.14) */ +#ifndef SECRETMEM_MAGIC +#define SECRETMEM_MAGIC 0x5345434d +#endif + +/* Not exposed yet. Defined at fs/fuse/inode.c */ +#ifndef FUSE_SUPER_MAGIC +#define FUSE_SUPER_MAGIC 0x65735546 +#endif + +/* Not exposed yet. Defined at fs/fuse/control.c */ +#ifndef FUSE_CTL_SUPER_MAGIC +#define FUSE_CTL_SUPER_MAGIC 0x65735543 +#endif + +/* Not exposed yet. Defined at fs/ceph/super.h */ +#ifndef CEPH_SUPER_MAGIC +#define CEPH_SUPER_MAGIC 0x00c36400 +#endif + +/* Not exposed yet. Defined at fs/orangefs/orangefs-kernel.h */ +#ifndef ORANGEFS_DEVREQ_MAGIC +#define ORANGEFS_DEVREQ_MAGIC 0x20030529 +#endif + +/* linux/gfs2_ondisk.h */ +#ifndef GFS2_MAGIC +#define GFS2_MAGIC 0x01161970 +#endif + +/* Not exposed yet. Defined at fs/configfs/mount.c */ +#ifndef CONFIGFS_MAGIC +#define CONFIGFS_MAGIC 0x62656570 +#endif + +/* Not exposed yet. Defined at fs/vboxsf/super.c */ +#ifndef VBOXSF_SUPER_MAGIC +#define VBOXSF_SUPER_MAGIC 0x786f4256 +#endif + +/* Not exposed yet. Defined at fs/exfat/exfat_fs.h */ +#ifndef EXFAT_SUPER_MAGIC +#define EXFAT_SUPER_MAGIC 0x2011BAB0UL +#endif + +/* Not exposed yet, internally actually called RPCAUTH_GSSMAGIC. Defined in net/sunrpc/rpc_pipe.c */ +#ifndef RPC_PIPEFS_SUPER_MAGIC +#define RPC_PIPEFS_SUPER_MAGIC 0x67596969 +#endif + +/* Not exposed yet, defined at fs/ntfs/ntfs.h */ +#ifndef NTFS_SB_MAGIC +#define NTFS_SB_MAGIC 0x5346544e +#endif + +/* Not exposed yet, encoded literally in fs/ntfs3/super.c. 
*/ +#ifndef NTFS3_SUPER_MAGIC +#define NTFS3_SUPER_MAGIC 0x7366746e +#endif diff --git a/src/basic/missing_mman.h b/src/basic/missing_mman.h new file mode 100644 index 0000000..f48c436 --- /dev/null +++ b/src/basic/missing_mman.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#ifndef MFD_ALLOW_SEALING +#define MFD_ALLOW_SEALING 0x0002U +#endif + +#ifndef MFD_CLOEXEC +#define MFD_CLOEXEC 0x0001U +#endif + +#ifndef MFD_NOEXEC_SEAL +#define MFD_NOEXEC_SEAL 0x0008U +#endif + +#ifndef MFD_EXEC +#define MFD_EXEC 0x0010U +#endif diff --git a/src/basic/missing_mount.h b/src/basic/missing_mount.h new file mode 100644 index 0000000..69b0bcf --- /dev/null +++ b/src/basic/missing_mount.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* dab741e0e02bd3c4f5e2e97be74b39df2523fc6e (5.10) */ +#ifndef MS_NOSYMFOLLOW +#define MS_NOSYMFOLLOW 256 +#endif diff --git a/src/basic/missing_network.h b/src/basic/missing_network.h new file mode 100644 index 0000000..776c7c8 --- /dev/null +++ b/src/basic/missing_network.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* linux/in6.h or netinet/in.h */ +#ifndef IPV6_UNICAST_IF +#define IPV6_UNICAST_IF 76 +#endif + +/* linux/in6.h or netinet/in.h */ +#ifndef IPV6_TRANSPARENT +#define IPV6_TRANSPARENT 75 +#endif + +/* Not exposed but defined at include/net/ip.h */ +#ifndef IPV4_MIN_MTU +#define IPV4_MIN_MTU 68 +#endif + +/* linux/ipv6.h */ +#ifndef IPV6_MIN_MTU +#define IPV6_MIN_MTU 1280 +#endif + +/* Note that LOOPBACK_IFINDEX is currently not exposed by the + * kernel/glibc, but hardcoded internally by the kernel. However, as + * it is exported to userspace indirectly via rtnetlink and the + * ioctls, and made use of widely we define it here too, in a way that + * is compatible with the kernel's internal definition. */ +#ifndef LOOPBACK_IFINDEX +#define LOOPBACK_IFINDEX 1 +#endif + +/* Not exposed yet. 
Similar values are defined in net/ethernet.h */ +#ifndef ETHERTYPE_LLDP +#define ETHERTYPE_LLDP 0x88cc +#endif + +/* Not exposed but defined in linux/netdevice.h */ +#ifndef MAX_PHYS_ITEM_ID_LEN +#define MAX_PHYS_ITEM_ID_LEN 32 +#endif + +/* Not exposed but defined in include/net/bonding.h */ +#ifndef BOND_MAX_ARP_TARGETS +#define BOND_MAX_ARP_TARGETS 16 +#endif + +/* Not exposed but defined in include/linux/ieee80211.h */ +#ifndef IEEE80211_MAX_SSID_LEN +#define IEEE80211_MAX_SSID_LEN 32 +#endif + +/* Not exposed but defined in include/net/netlabel.h */ +#ifndef NETLBL_NLTYPE_UNLABELED_NAME +#define NETLBL_NLTYPE_UNLABELED_NAME "NLBL_UNLBL" +#endif + +/* Not exposed but defined in net/netlabel/netlabel_unlabeled.h */ +enum { + NLBL_UNLABEL_C_UNSPEC, + NLBL_UNLABEL_C_ACCEPT, + NLBL_UNLABEL_C_LIST, + NLBL_UNLABEL_C_STATICADD, + NLBL_UNLABEL_C_STATICREMOVE, + NLBL_UNLABEL_C_STATICLIST, + NLBL_UNLABEL_C_STATICADDDEF, + NLBL_UNLABEL_C_STATICREMOVEDEF, + NLBL_UNLABEL_C_STATICLISTDEF, + __NLBL_UNLABEL_C_MAX, +}; + +/* Not exposed but defined in net/netlabel/netlabel_unlabeled.h */ +enum { + NLBL_UNLABEL_A_UNSPEC, + NLBL_UNLABEL_A_ACPTFLG, + NLBL_UNLABEL_A_IPV6ADDR, + NLBL_UNLABEL_A_IPV6MASK, + NLBL_UNLABEL_A_IPV4ADDR, + NLBL_UNLABEL_A_IPV4MASK, + NLBL_UNLABEL_A_IFACE, + NLBL_UNLABEL_A_SECCTX, + __NLBL_UNLABEL_A_MAX, +}; diff --git a/src/basic/missing_prctl.h b/src/basic/missing_prctl.h new file mode 100644 index 0000000..7d9e395 --- /dev/null +++ b/src/basic/missing_prctl.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* 58319057b7847667f0c9585b9de0e8932b0fdb08 (4.3) */ +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 + +#define PR_CAP_AMBIENT_IS_SET 1 +#define PR_CAP_AMBIENT_RAISE 2 +#define PR_CAP_AMBIENT_LOWER 3 +#define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +/* b507808ebce23561d4ff8c2aa1fb949fe402bc61 (6.3) */ +#ifndef PR_SET_MDWE +#define PR_SET_MDWE 65 +#endif +#ifndef PR_MDWE_REFUSE_EXEC_GAIN +#define 
PR_MDWE_REFUSE_EXEC_GAIN 1 +#endif + +#ifndef PR_SET_MEMORY_MERGE +#define PR_SET_MEMORY_MERGE 67 +#endif diff --git a/src/basic/missing_random.h b/src/basic/missing_random.h new file mode 100644 index 0000000..443b913 --- /dev/null +++ b/src/basic/missing_random.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if USE_SYS_RANDOM_H +# include +#else +# include +#endif + +#ifndef GRND_NONBLOCK +#define GRND_NONBLOCK 0x0001 +#endif + +#ifndef GRND_RANDOM +#define GRND_RANDOM 0x0002 +#endif + +#ifndef GRND_INSECURE +#define GRND_INSECURE 0x0004 +#endif diff --git a/src/basic/missing_resource.h b/src/basic/missing_resource.h new file mode 100644 index 0000000..6e76765 --- /dev/null +++ b/src/basic/missing_resource.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#ifndef RLIMIT_RTTIME +#define RLIMIT_RTTIME 15 +#endif + +/* If RLIMIT_RTTIME is not defined, then we cannot use RLIMIT_NLIMITS as is */ +#define _RLIMIT_MAX (RLIMIT_RTTIME+1 > RLIMIT_NLIMITS ? RLIMIT_RTTIME+1 : RLIMIT_NLIMITS) diff --git a/src/basic/missing_sched.h b/src/basic/missing_sched.h new file mode 100644 index 0000000..bcd5b77 --- /dev/null +++ b/src/basic/missing_sched.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0x02000000 +#endif + +/* 769071ac9f20b6a447410c7eaa55d1a5233ef40c (5.8) */ +#ifndef CLONE_NEWTIME +#define CLONE_NEWTIME 0x00000080 +#endif + +/* Not exposed yet. Defined at include/linux/sched.h */ +#ifndef PF_KTHREAD +#define PF_KTHREAD 0x00200000 +#endif + +/* The maximum thread/process name length including trailing NUL byte. This mimics the kernel definition of the same + * name, which we need in userspace at various places but is not defined in userspace currently, neither under this + * name nor any other. */ +/* Not exposed yet. 
Defined at include/linux/sched.h */ +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif diff --git a/src/basic/missing_securebits.h b/src/basic/missing_securebits.h new file mode 100644 index 0000000..03fad6f --- /dev/null +++ b/src/basic/missing_securebits.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* 746bf6d64275be0c65b0631d8a72b16f1454cfa1 (4.3) */ +#ifndef SECURE_NO_CAP_AMBIENT_RAISE +#define SECURE_NO_CAP_AMBIENT_RAISE 6 +#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */ +#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) +#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED)) + +#undef SECURE_ALL_BITS +#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ + issecure_mask(SECURE_NO_SETUID_FIXUP) | \ + issecure_mask(SECURE_KEEP_CAPS) | \ + issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) +#endif diff --git a/src/basic/missing_socket.h b/src/basic/missing_socket.h new file mode 100644 index 0000000..30ac297 --- /dev/null +++ b/src/basic/missing_socket.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#if HAVE_LINUX_VM_SOCKETS_H +#include +#else +#define VMADDR_CID_ANY -1U +struct sockaddr_vm { + unsigned short svm_family; + unsigned short svm_reserved1; + unsigned int svm_port; + unsigned int svm_cid; + unsigned char svm_zero[sizeof(struct sockaddr) - + sizeof(unsigned short) - + sizeof(unsigned short) - + sizeof(unsigned int) - + sizeof(unsigned int)]; +}; +#endif /* !HAVE_LINUX_VM_SOCKETS_H */ + +#ifndef AF_VSOCK +#define AF_VSOCK 40 +#endif + +#ifndef SO_REUSEPORT +#define SO_REUSEPORT 15 +#endif + +#ifndef SO_PEERGROUPS +#define SO_PEERGROUPS 59 +#endif + +#ifndef SO_BINDTOIFINDEX +#define SO_BINDTOIFINDEX 62 +#endif + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +#ifndef SOL_ALG +#define SOL_ALG 279 +#endif + +/* Not exposed yet. 
Defined in include/linux/socket.h. */ +#ifndef SOL_SCTP +#define SOL_SCTP 132 +#endif + +/* Not exposed yet. Defined in include/linux/socket.h */ +#ifndef SCM_SECURITY +#define SCM_SECURITY 0x03 +#endif + +/* netinet/in.h */ +#ifndef IP_FREEBIND +#define IP_FREEBIND 15 +#endif + +#ifndef IP_TRANSPARENT +#define IP_TRANSPARENT 19 +#endif + +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + +#ifndef IP_RECVFRAGSIZE +#define IP_RECVFRAGSIZE 25 +#endif + +#ifndef IPV6_RECVFRAGSIZE +#define IPV6_RECVFRAGSIZE 77 +#endif + +/* linux/sockios.h */ +#ifndef SIOCGSKNS +#define SIOCGSKNS 0x894C +#endif diff --git a/src/basic/missing_stat.h b/src/basic/missing_stat.h new file mode 100644 index 0000000..eba1a38 --- /dev/null +++ b/src/basic/missing_stat.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#if WANT_LINUX_STAT_H +#include +#endif + +/* The newest definition we are aware of (fa2fcf4f1df1559a0a4ee0f46915b496cc2ebf60; 5.8) */ +#define STATX_DEFINITION { \ + __u32 stx_mask; \ + __u32 stx_blksize; \ + __u64 stx_attributes; \ + __u32 stx_nlink; \ + __u32 stx_uid; \ + __u32 stx_gid; \ + __u16 stx_mode; \ + __u16 __spare0[1]; \ + __u64 stx_ino; \ + __u64 stx_size; \ + __u64 stx_blocks; \ + __u64 stx_attributes_mask; \ + struct statx_timestamp stx_atime; \ + struct statx_timestamp stx_btime; \ + struct statx_timestamp stx_ctime; \ + struct statx_timestamp stx_mtime; \ + __u32 stx_rdev_major; \ + __u32 stx_rdev_minor; \ + __u32 stx_dev_major; \ + __u32 stx_dev_minor; \ + __u64 stx_mnt_id; \ + __u64 __spare2; \ + __u64 __spare3[12]; \ +} + +#if !HAVE_STRUCT_STATX +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +struct statx STATX_DEFINITION; +#endif + +/* Always define the newest version we are aware of as a distinct type, so that we can use it even if glibc + * defines an older definition */ +struct new_statx STATX_DEFINITION; + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f 
(4.11) */ +#ifndef AT_STATX_SYNC_AS_STAT +#define AT_STATX_SYNC_AS_STAT 0x0000 +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef AT_STATX_FORCE_SYNC +#define AT_STATX_FORCE_SYNC 0x2000 +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef AT_STATX_DONT_SYNC +#define AT_STATX_DONT_SYNC 0x4000 +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_TYPE +#define STATX_TYPE 0x00000001U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_MODE +#define STATX_MODE 0x00000002U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_NLINK +#define STATX_NLINK 0x00000004U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_UID +#define STATX_UID 0x00000008U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_GID +#define STATX_GID 0x00000010U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_ATIME +#define STATX_ATIME 0x00000020U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_MTIME +#define STATX_MTIME 0x00000040U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_CTIME +#define STATX_CTIME 0x00000080U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_INO +#define STATX_INO 0x00000100U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_SIZE +#define STATX_SIZE 0x00000200U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_BLOCKS +#define STATX_BLOCKS 0x00000400U +#endif + +/* a528d35e8bfcc521d7cb70aaf03e1bd296c8493f (4.11) */ +#ifndef STATX_BTIME +#define STATX_BTIME 0x00000800U +#endif + +/* fa2fcf4f1df1559a0a4ee0f46915b496cc2ebf60 (5.8) */ +#ifndef STATX_MNT_ID +#define STATX_MNT_ID 0x00001000U +#endif + +/* 80340fe3605c0e78cfe496c3b3878be828cfdbfe (5.8) */ +#ifndef STATX_ATTR_MOUNT_ROOT +#define STATX_ATTR_MOUNT_ROOT 
0x00002000 /* Root of a mount */ +#endif diff --git a/src/basic/missing_stdlib.h b/src/basic/missing_stdlib.h new file mode 100644 index 0000000..8c76f93 --- /dev/null +++ b/src/basic/missing_stdlib.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* stdlib.h */ +#if !HAVE_SECURE_GETENV +# if HAVE___SECURE_GETENV +# define secure_getenv __secure_getenv +# else +# error "neither secure_getenv nor __secure_getenv are available" +# endif +#endif diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h new file mode 100644 index 0000000..8628077 --- /dev/null +++ b/src/basic/missing_syscall.h @@ -0,0 +1,680 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* Missing glibc definitions to access certain kernel APIs */ + +#include +#include +#if HAVE_LINUX_TIME_TYPES_H +/* This header defines __kernel_timespec for us, but is only available since Linux 5.1, hence conditionally + * include this. */ +#include +#endif +#include +#include +#include +#include +#include + +#ifdef ARCH_MIPS +#include +#endif + +#include "macro.h" +#include "missing_keyctl.h" +#include "missing_stat.h" +#include "missing_syscall_def.h" + +/* linux/kcmp.h */ +#ifndef KCMP_FILE /* 3f4994cfc15f38a3159c6e3a4b3ab2e1481a6b02 (3.19) */ +#define KCMP_FILE 0 +#endif + +/* ======================================================================= */ + +#if !HAVE_FCHMODAT2 +static inline int missing_fchmodat2(int dirfd, const char *path, mode_t mode, int flags) { +# ifdef __NR_fchmodat2 + return syscall(__NR_fchmodat2, dirfd, path, mode, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define fchmodat2 missing_fchmodat2 +#endif + +/* ======================================================================= */ + +#if !HAVE_PIVOT_ROOT +static inline int missing_pivot_root(const char *new_root, const char *put_old) { + return syscall(__NR_pivot_root, new_root, put_old); +} + +# define pivot_root missing_pivot_root 
+#endif + +/* ======================================================================= */ + +#if !HAVE_IOPRIO_GET /* Compiled only when HAVE_IOPRIO_GET is 0: passes its arguments straight to syscall(__NR_ioprio_get, ...) with no #ifdef on the syscall number, then the #define below aliases the plain name to this wrapper. */ +static inline int missing_ioprio_get(int which, int who) { + return syscall(__NR_ioprio_get, which, who); +} + +# define ioprio_get missing_ioprio_get +#endif + +/* ======================================================================= */ + +#if !HAVE_IOPRIO_SET /* Same shape as the ioprio_get wrapper above, for __NR_ioprio_set. */ +static inline int missing_ioprio_set(int which, int who, int ioprio) { + return syscall(__NR_ioprio_set, which, who, ioprio); +} + +# define ioprio_set missing_ioprio_set +#endif + +/* ======================================================================= */ + +#if !HAVE_MEMFD_CREATE /* If __NR_memfd_create is not defined at compile time, the wrapper issues no syscall: it sets errno to ENOSYS and returns -1. */ +static inline int missing_memfd_create(const char *name, unsigned int flags) { +# ifdef __NR_memfd_create + return syscall(__NR_memfd_create, name, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define memfd_create missing_memfd_create +#endif + +/* ======================================================================= */ + +#if !HAVE_GETRANDOM +/* glibc says getrandom() returns ssize_t */ +static inline ssize_t missing_getrandom(void *buffer, size_t count, unsigned flags) { +# ifdef __NR_getrandom + return syscall(__NR_getrandom, buffer, count, flags); +# else + errno = ENOSYS; /* same ENOSYS / -1 fallback as memfd_create above */ + return -1; +# endif +} + +# define getrandom missing_getrandom +#endif + +/* ======================================================================= */ + +/* The syscall has been defined since forever, but the glibc wrapper was missing. 
*/ +#if !HAVE_GETTID /* Unlike most wrappers in this file there is no ENOSYS fallback: a missing or negative __NR_gettid stops the build with #error. */ +static inline pid_t missing_gettid(void) { +# if defined __NR_gettid && __NR_gettid >= 0 + return (pid_t) syscall(__NR_gettid); +# else +# error "__NR_gettid not defined" +# endif +} + +# define gettid missing_gettid +#endif + +/* ======================================================================= */ + +#if !HAVE_NAME_TO_HANDLE_AT /* Besides the wrapper, this branch also declares struct file_handle itself; f_handle[0] is a zero-length trailing array (a GNU extension). */ +struct file_handle { + unsigned int handle_bytes; + int handle_type; + unsigned char f_handle[0]; +}; + +static inline int missing_name_to_handle_at(int fd, const char *name, struct file_handle *handle, int *mnt_id, int flags) { +# ifdef __NR_name_to_handle_at + return syscall(__NR_name_to_handle_at, fd, name, handle, mnt_id, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define name_to_handle_at missing_name_to_handle_at +#endif + +/* ======================================================================= */ + +#if !HAVE_SETNS /* Forwards to syscall(__NR_setns, ...); fails with errno = ENOSYS and -1 when the number is not defined at build time. */ +static inline int missing_setns(int fd, int nstype) { +# ifdef __NR_setns + return syscall(__NR_setns, fd, nstype); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define setns missing_setns +#endif + +/* ======================================================================= */ + +static inline pid_t raw_getpid(void) { /* Always defined (no HAVE_ guard): invokes the getpid syscall directly through syscall(); on alpha the number used is __NR_getxpid. */ +#if defined(__alpha__) + return (pid_t) syscall(__NR_getxpid); +#else + return (pid_t) syscall(__NR_getpid); +#endif +} + +/* ======================================================================= */ + +#if !HAVE_RENAMEAT2 /* Same ENOSYS / -1 fallback pattern as setns above. */ +static inline int missing_renameat2(int oldfd, const char *oldname, int newfd, const char *newname, unsigned flags) { +# ifdef __NR_renameat2 + return syscall(__NR_renameat2, oldfd, oldname, newfd, newname, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define renameat2 missing_renameat2 +#endif + +/* ======================================================================= */ + +#if !HAVE_KCMP +static inline int missing_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) { +# if 
defined __NR_kcmp && __NR_kcmp >= 0 + return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define kcmp missing_kcmp +#endif + +/* ======================================================================= */ + +#if !HAVE_KEYCTL /* One guard covers three wrappers: keyctl, add_key and request_key. NOTE(review): each "# define" alias below sits inside its function body, before the closing brace, unlike the other sections of this file. The preprocessor does not care where the directive appears, but confirm the placement is intentional. */ +static inline long missing_keyctl(int cmd, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { +# if defined __NR_keyctl && __NR_keyctl >= 0 + return syscall(__NR_keyctl, cmd, arg2, arg3, arg4, arg5); +# else + errno = ENOSYS; + return -1; +# endif + +# define keyctl missing_keyctl +} + +static inline key_serial_t missing_add_key(const char *type, const char *description, const void *payload, size_t plen, key_serial_t ringid) { +# if defined __NR_add_key && __NR_add_key >= 0 + return syscall(__NR_add_key, type, description, payload, plen, ringid); +# else + errno = ENOSYS; + return -1; +# endif + +# define add_key missing_add_key +} + +static inline key_serial_t missing_request_key(const char *type, const char *description, const char * callout_info, key_serial_t destringid) { +# if defined __NR_request_key && __NR_request_key >= 0 + return syscall(__NR_request_key, type, description, callout_info, destringid); +# else + errno = ENOSYS; + return -1; +# endif + +# define request_key missing_request_key +} +#endif + +/* ======================================================================= */ + +#if !HAVE_COPY_FILE_RANGE /* Returns ssize_t; fails with errno = ENOSYS and -1 when __NR_copy_file_range is not defined at build time. */ +static inline ssize_t missing_copy_file_range(int fd_in, loff_t *off_in, + int fd_out, loff_t *off_out, + size_t len, + unsigned int flags) { +# ifdef __NR_copy_file_range + return syscall(__NR_copy_file_range, fd_in, off_in, fd_out, off_out, len, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define copy_file_range missing_copy_file_range +#endif + +/* ======================================================================= */ + +#if !HAVE_BPF +union bpf_attr; /* forward declaration only: the wrapper just passes the pointer through */ + +static inline int missing_bpf(int cmd, union bpf_attr 
*attr, size_t size) { +#ifdef __NR_bpf + return (int) syscall(__NR_bpf, cmd, attr, size); +#else + errno = ENOSYS; + return -1; +#endif +} + +# define bpf missing_bpf +#endif + +/* ======================================================================= */ + +#if !HAVE_STATX /* NOTE(review): this wrapper is declared to return ssize_t, while the other fd/path-style wrappers in this file return int; confirm callers only test the result for < 0. */ +struct statx; + +static inline ssize_t missing_statx(int dfd, const char *filename, unsigned flags, unsigned int mask, struct statx *buffer) { +# ifdef __NR_statx + return syscall(__NR_statx, dfd, filename, flags, mask, buffer); +# else + errno = ENOSYS; + return -1; +# endif +} +#endif + +/* This typedef is supposed to be always defined. */ +typedef struct statx struct_statx; + +#if !HAVE_STATX /* Being a function-like macro, the alias below rewrites only call expressions statx(...); the tag in 'struct statx' is not followed by '(' and is therefore left untouched. */ +# define statx(dfd, filename, flags, mask, buffer) missing_statx(dfd, filename, flags, mask, buffer) +#endif + +/* ======================================================================= */ + +#if !HAVE_SET_MEMPOLICY /* Also supplies the MPOL_* mode constants as an anonymous enum, numbered from MPOL_DEFAULT = 0 to MPOL_LOCAL = 4. */ +enum { + MPOL_DEFAULT, + MPOL_PREFERRED, + MPOL_BIND, + MPOL_INTERLEAVE, + MPOL_LOCAL, +}; + +static inline long missing_set_mempolicy(int mode, const unsigned long *nodemask, + unsigned long maxnode) { + long i; +# if defined __NR_set_mempolicy && __NR_set_mempolicy >= 0 + i = syscall(__NR_set_mempolicy, mode, nodemask, maxnode); +# else + errno = ENOSYS; + i = -1; +# endif + return i; +} + +# define set_mempolicy missing_set_mempolicy +#endif + +#if !HAVE_GET_MEMPOLICY /* Same structure as set_mempolicy above: the result is stored in a local and returned after the #if/#else. */ +static inline long missing_get_mempolicy(int *mode, unsigned long *nodemask, + unsigned long maxnode, void *addr, + unsigned long flags) { + long i; +# if defined __NR_get_mempolicy && __NR_get_mempolicy >= 0 + i = syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags); +# else + errno = ENOSYS; + i = -1; +# endif + return i; +} + +# define get_mempolicy missing_get_mempolicy +#endif + +/* ======================================================================= */ + +#if !HAVE_PIDFD_SEND_SIGNAL +static inline int missing_pidfd_send_signal(int fd, int sig, siginfo_t *info, unsigned flags) { +# ifdef 
__NR_pidfd_send_signal + return syscall(__NR_pidfd_send_signal, fd, sig, info, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define pidfd_send_signal missing_pidfd_send_signal +#endif + +#if !HAVE_PIDFD_OPEN /* Forwards to syscall(__NR_pidfd_open, ...); fails with errno = ENOSYS and -1 when the number is not defined at build time. */ +static inline int missing_pidfd_open(pid_t pid, unsigned flags) { +# ifdef __NR_pidfd_open + return syscall(__NR_pidfd_open, pid, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define pidfd_open missing_pidfd_open +#endif + +/* ======================================================================= */ + +#if !HAVE_RT_SIGQUEUEINFO /* No ENOSYS fallback: a missing or negative __NR_rt_sigqueueinfo stops the build with #error. */ +static inline int missing_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info) { +# if defined __NR_rt_sigqueueinfo && __NR_rt_sigqueueinfo >= 0 + return syscall(__NR_rt_sigqueueinfo, tgid, sig, info); +# else +# error "__NR_rt_sigqueueinfo not defined" +# endif +} + +# define rt_sigqueueinfo missing_rt_sigqueueinfo +#endif + +/* ======================================================================= */ + +#if !HAVE_RT_TGSIGQUEUEINFO /* Likewise a hard #error when __NR_rt_tgsigqueueinfo is unavailable. */ +static inline int missing_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *info) { +# if defined __NR_rt_tgsigqueueinfo && __NR_rt_tgsigqueueinfo >= 0 + return syscall(__NR_rt_tgsigqueueinfo, tgid, tid, sig, info); +# else +# error "__NR_rt_tgsigqueueinfo not defined" +# endif +} + +# define rt_tgsigqueueinfo missing_rt_tgsigqueueinfo +#endif + +/* ======================================================================= */ + +#if !HAVE_EXECVEAT /* In addition to aliasing execveat, this branch unconditionally redefines AT_EMPTY_PATH to 0x1000 (after an #undef). */ +static inline int missing_execveat(int dirfd, const char *pathname, + char *const argv[], char *const envp[], + int flags) { +# if defined __NR_execveat && __NR_execveat >= 0 + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# undef AT_EMPTY_PATH +# define AT_EMPTY_PATH 0x1000 +# define execveat missing_execveat +#endif + +/* ======================================================================= */ + +#if !HAVE_CLOSE_RANGE +static inline int 
missing_close_range(unsigned first_fd, unsigned end_fd, unsigned flags) { +# ifdef __NR_close_range + /* Kernel-side the syscall expects fds as unsigned integers (just like close() actually), while + * userspace exclusively uses signed integers for fds. glibc chose to expose it 1:1 however, hence we + * do so here too, even if we end up passing signed fds to it most of the time. */ + return syscall(__NR_close_range, + first_fd, + end_fd, + flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define close_range missing_close_range +#endif + +/* ======================================================================= */ + +#if !HAVE_MOUNT_SETATTR /* Everything from here to the matching #endif (struct, MOUNT_ATTR_* flags, AT_RECURSIVE and the wrapper) is compiled only when HAVE_MOUNT_SETATTR is 0. */ + +#if !HAVE_STRUCT_MOUNT_ATTR /* Local definition: four uint64_t fields, i.e. 32 bytes, which matches MOUNT_ATTR_SIZE_VER0 defined further down. */ +struct mount_attr { + uint64_t attr_set; + uint64_t attr_clr; + uint64_t propagation; + uint64_t userns_fd; +}; +#else /* only a forward declaration here; the complete type presumably comes from a system header */ +struct mount_attr; +#endif + +#ifndef MOUNT_ATTR_RDONLY +#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ +#endif + +#ifndef MOUNT_ATTR_NOSUID +#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ +#endif + +#ifndef MOUNT_ATTR_NODEV +#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ +#endif + +#ifndef MOUNT_ATTR_NOEXEC +#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ +#endif + +#ifndef MOUNT_ATTR__ATIME +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#endif + +#ifndef MOUNT_ATTR_RELATIME +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#endif + +#ifndef MOUNT_ATTR_NOATIME +#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ +#endif + +#ifndef MOUNT_ATTR_STRICTATIME +#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ +#endif + +#ifndef MOUNT_ATTR_NODIRATIME +#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +#endif + +#ifndef MOUNT_ATTR_IDMAP +#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. 
*/ +#endif + +#ifndef MOUNT_ATTR_NOSYMFOLLOW +#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */ +#endif + +#ifndef MOUNT_ATTR_SIZE_VER0 +#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 +#endif + +static inline int missing_mount_setattr( + int dfd, + const char *path, + unsigned flags, + struct mount_attr *attr, + size_t size) { + +# if defined __NR_mount_setattr && __NR_mount_setattr >= 0 + return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); +# else + errno = ENOSYS; /* syscall number unknown (or negative) at build time: report "not implemented" without issuing a syscall */ + return -1; +# endif +} + +# define mount_setattr missing_mount_setattr +#endif + +/* ======================================================================= */ + +#if !HAVE_OPEN_TREE + +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* reuses the O_CLOEXEC value, so O_CLOEXEC must be in scope wherever this is expanded */ +#endif + +static inline int missing_open_tree( + int dfd, + const char *filename, + unsigned flags) { + +# if defined __NR_open_tree && __NR_open_tree >= 0 + return syscall(__NR_open_tree, dfd, filename, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define open_tree missing_open_tree +#endif + +/* ======================================================================= */ + +#ifndef MOVE_MOUNT_BENEATH /* Sits outside the HAVE_MOVE_MOUNT guard below, so it is supplied even when move_mount() itself is available. */ +#define MOVE_MOUNT_BENEATH 0x00000200 +#endif + +#if !HAVE_MOVE_MOUNT + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif + +#ifndef MOVE_MOUNT_T_EMPTY_PATH +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#endif + +static inline int missing_move_mount( + int from_dfd, + const char *from_pathname, + int to_dfd, + const char *to_pathname, + unsigned flags) { + +# if defined __NR_move_mount && __NR_move_mount >= 0 + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define move_mount 
missing_move_mount +#endif + +/* ======================================================================= */ + +#if !HAVE_FSOPEN /* Supplies FSOPEN_CLOEXEC (if missing) and a wrapper that forwards to syscall(__NR_fsopen, ...), or fails with errno = ENOSYS and -1 when that number is undefined or negative. */ + +#ifndef FSOPEN_CLOEXEC +#define FSOPEN_CLOEXEC 0x00000001 +#endif + +static inline int missing_fsopen(const char *fsname, unsigned flags) { +# if defined __NR_fsopen && __NR_fsopen >= 0 + return syscall(__NR_fsopen, fsname, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define fsopen missing_fsopen +#endif + +/* ======================================================================= */ + +#if !HAVE_FSCONFIG /* Only the four FSCONFIG_* command values listed here (0, 1, 5 and 6) get fallback definitions. */ + +#ifndef FSCONFIG_SET_FLAG +#define FSCONFIG_SET_FLAG 0 /* Set parameter, supplying no value */ +#endif + +#ifndef FSCONFIG_SET_STRING +#define FSCONFIG_SET_STRING 1 /* Set parameter, supplying a string value */ +#endif + +#ifndef FSCONFIG_SET_FD +#define FSCONFIG_SET_FD 5 /* Set parameter, supplying an object by fd */ +#endif + +#ifndef FSCONFIG_CMD_CREATE +#define FSCONFIG_CMD_CREATE 6 /* Invoke superblock creation */ +#endif + +static inline int missing_fsconfig(int fd, unsigned cmd, const char *key, const void *value, int aux) { +# if defined __NR_fsconfig && __NR_fsconfig >= 0 + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define fsconfig missing_fsconfig +#endif + +/* ======================================================================= */ + +#if !HAVE_FSMOUNT /* Same shape as the fsopen section above, for __NR_fsmount and FSMOUNT_CLOEXEC. */ + +#ifndef FSMOUNT_CLOEXEC +#define FSMOUNT_CLOEXEC 0x00000001 +#endif + +static inline int missing_fsmount(int fd, unsigned flags, unsigned ms_flags) { +# if defined __NR_fsmount && __NR_fsmount >= 0 + return syscall(__NR_fsmount, fd, flags, ms_flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# define fsmount missing_fsmount +#endif + +/* ======================================================================= */ + +#if !HAVE_GETDENTS64 /* This wrapper returns ssize_t rather than int. */ + +static inline ssize_t missing_getdents64(int fd, void *buffer, size_t length) { +# if defined __NR_getdents64 && __NR_getdents64 >= 0 
+ return syscall(__NR_getdents64, fd, buffer, length); +# else + errno = ENOSYS; /* syscall number unknown (or negative) at build time: fail without issuing a syscall */ + return -1; +# endif +} + +# define getdents64 missing_getdents64 +#endif + +/* ======================================================================= */ + +/* glibc does not provide clone() on ia64, only clone2(). Not only that, but it also doesn't provide a + * prototype, only the symbol in the shared library (it provides a prototype for clone(), but not the + * symbol in the shared library). */ +#if defined(__ia64__) /* HAVE_CLONE is hard-coded in this block rather than probed: 0 on ia64, where only the __clone2 prototype is declared, and 1 everywhere else. */ +int __clone2(int (*fn)(void *), void *stack_base, size_t stack_size, int flags, void *arg); +#define HAVE_CLONE 0 +#else +/* We know that everywhere else clone() is available, so we don't bother with a meson check (that takes time + * at build time) and just define it. Once the kernel drops ia64 support, we can drop this too. */ +#define HAVE_CLONE 1 +#endif diff --git a/src/basic/missing_syscall_def.h b/src/basic/missing_syscall_def.h new file mode 100644 index 0000000..f679422 --- /dev/null +++ b/src/basic/missing_syscall_def.h @@ -0,0 +1,1199 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * This file is generated by src/basic/missing_syscalls.py. Do not edit! + * + * Use 'ninja -C build update-syscall-tables' to download new syscall tables, + * and 'ninja -C build update-syscall-header' to regenerate this file. + */ +#pragma once + +/* Note: if this code looks strange, this is because it is derived from the same + * template as the per-syscall blocks below. 
*/ +# if defined(__aarch64__) +# elif defined(__alpha__) +# elif defined(__arc__) || defined(__tilegx__) +# elif defined(__arm__) +# elif defined(__i386__) +# elif defined(__ia64__) +# elif defined(__loongarch_lp64) +# elif defined(__m68k__) +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# elif defined(__powerpc__) +# elif defined(__riscv) +# if __riscv_xlen == 32 +# elif __riscv_xlen == 64 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# elif defined(__sparc__) +# elif defined(__x86_64__) +# if defined(__ILP32__) +# else +# endif +# elif !defined(missing_arch_template) +# warning "Current architecture is missing from the template" +# define missing_arch_template 1 +# endif + +#ifndef __IGNORE_bpf +# if defined(__aarch64__) +# define systemd_NR_bpf 280 +# elif defined(__alpha__) +# define systemd_NR_bpf 515 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_bpf 280 +# elif defined(__arm__) +# define systemd_NR_bpf 386 +# elif defined(__i386__) +# define systemd_NR_bpf 357 +# elif defined(__ia64__) +# define systemd_NR_bpf 1341 +# elif defined(__loongarch_lp64) +# define systemd_NR_bpf 280 +# elif defined(__m68k__) +# define systemd_NR_bpf 354 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_bpf 4355 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_bpf 6319 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_bpf 5315 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_bpf 341 +# elif defined(__powerpc__) +# define systemd_NR_bpf 361 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_bpf 280 +# elif __riscv_xlen == 64 +# define systemd_NR_bpf 280 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_bpf 351 +# elif 
defined(__sparc__) +# define systemd_NR_bpf 349 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_bpf (321 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_bpf 321 +# endif +# elif !defined(missing_arch_template) +# warning "bpf() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_bpf && __NR_bpf >= 0 +# if defined systemd_NR_bpf +assert_cc(__NR_bpf == systemd_NR_bpf); +# endif +# else +# if defined __NR_bpf +# undef __NR_bpf +# endif +# if defined systemd_NR_bpf && systemd_NR_bpf >= 0 +# define __NR_bpf systemd_NR_bpf +# endif +# endif +#endif + +#ifndef __IGNORE_close_range +# if defined(__aarch64__) +# define systemd_NR_close_range 436 +# elif defined(__alpha__) +# define systemd_NR_close_range 546 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_close_range 436 +# elif defined(__arm__) +# define systemd_NR_close_range 436 +# elif defined(__i386__) +# define systemd_NR_close_range 436 +# elif defined(__ia64__) +# define systemd_NR_close_range 1460 +# elif defined(__loongarch_lp64) +# define systemd_NR_close_range 436 +# elif defined(__m68k__) +# define systemd_NR_close_range 436 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_close_range 4436 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_close_range 6436 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_close_range 5436 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_close_range 436 +# elif defined(__powerpc__) +# define systemd_NR_close_range 436 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_close_range 436 +# elif __riscv_xlen == 64 +# define systemd_NR_close_range 436 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_close_range 436 +# elif defined(__sparc__) +# define 
systemd_NR_close_range 436 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_close_range (436 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_close_range 436 +# endif +# elif !defined(missing_arch_template) +# warning "close_range() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_close_range && __NR_close_range >= 0 +# if defined systemd_NR_close_range +assert_cc(__NR_close_range == systemd_NR_close_range); +# endif +# else +# if defined __NR_close_range +# undef __NR_close_range +# endif +# if defined systemd_NR_close_range && systemd_NR_close_range >= 0 +# define __NR_close_range systemd_NR_close_range +# endif +# endif +#endif + +#ifndef __IGNORE_copy_file_range +# if defined(__aarch64__) +# define systemd_NR_copy_file_range 285 +# elif defined(__alpha__) +# define systemd_NR_copy_file_range 519 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_copy_file_range 285 +# elif defined(__arm__) +# define systemd_NR_copy_file_range 391 +# elif defined(__i386__) +# define systemd_NR_copy_file_range 377 +# elif defined(__ia64__) +# define systemd_NR_copy_file_range 1347 +# elif defined(__loongarch_lp64) +# define systemd_NR_copy_file_range 285 +# elif defined(__m68k__) +# define systemd_NR_copy_file_range 376 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_copy_file_range 4360 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_copy_file_range 6324 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_copy_file_range 5320 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_copy_file_range 346 +# elif defined(__powerpc__) +# define systemd_NR_copy_file_range 379 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_copy_file_range 285 +# elif __riscv_xlen == 64 +# define systemd_NR_copy_file_range 285 +# 
else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_copy_file_range 375 +# elif defined(__sparc__) +# define systemd_NR_copy_file_range 357 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_copy_file_range (326 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_copy_file_range 326 +# endif +# elif !defined(missing_arch_template) +# warning "copy_file_range() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_copy_file_range && __NR_copy_file_range >= 0 +# if defined systemd_NR_copy_file_range +assert_cc(__NR_copy_file_range == systemd_NR_copy_file_range); +# endif +# else +# if defined __NR_copy_file_range +# undef __NR_copy_file_range +# endif +# if defined systemd_NR_copy_file_range && systemd_NR_copy_file_range >= 0 +# define __NR_copy_file_range systemd_NR_copy_file_range +# endif +# endif +#endif + +#ifndef __IGNORE_fchmodat2 +# if defined(__aarch64__) +# define systemd_NR_fchmodat2 452 +# elif defined(__alpha__) +# define systemd_NR_fchmodat2 562 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_fchmodat2 452 +# elif defined(__arm__) +# define systemd_NR_fchmodat2 452 +# elif defined(__i386__) +# define systemd_NR_fchmodat2 452 +# elif defined(__ia64__) +# define systemd_NR_fchmodat2 1476 +# elif defined(__loongarch_lp64) +# define systemd_NR_fchmodat2 452 +# elif defined(__m68k__) +# define systemd_NR_fchmodat2 452 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_fchmodat2 4452 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_fchmodat2 6452 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_fchmodat2 5452 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_fchmodat2 452 +# elif defined(__powerpc__) +# define systemd_NR_fchmodat2 452 +# elif defined(__riscv) +# if 
__riscv_xlen == 32 +# define systemd_NR_fchmodat2 452 +# elif __riscv_xlen == 64 +# define systemd_NR_fchmodat2 452 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_fchmodat2 452 +# elif defined(__sparc__) +# define systemd_NR_fchmodat2 452 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_fchmodat2 (452 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_fchmodat2 452 +# endif +# elif !defined(missing_arch_template) +# warning "fchmodat2() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_fchmodat2 && __NR_fchmodat2 >= 0 +# if defined systemd_NR_fchmodat2 +assert_cc(__NR_fchmodat2 == systemd_NR_fchmodat2); +# endif +# else +# if defined __NR_fchmodat2 +# undef __NR_fchmodat2 +# endif +# if defined systemd_NR_fchmodat2 && systemd_NR_fchmodat2 >= 0 +# define __NR_fchmodat2 systemd_NR_fchmodat2 +# endif +# endif +#endif + +#ifndef __IGNORE_getrandom +# if defined(__aarch64__) +# define systemd_NR_getrandom 278 +# elif defined(__alpha__) +# define systemd_NR_getrandom 511 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_getrandom 278 +# elif defined(__arm__) +# define systemd_NR_getrandom 384 +# elif defined(__i386__) +# define systemd_NR_getrandom 355 +# elif defined(__ia64__) +# define systemd_NR_getrandom 1339 +# elif defined(__loongarch_lp64) +# define systemd_NR_getrandom 278 +# elif defined(__m68k__) +# define systemd_NR_getrandom 352 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_getrandom 4353 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_getrandom 6317 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_getrandom 5313 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_getrandom 339 +# elif defined(__powerpc__) +# define systemd_NR_getrandom 359 +# elif 
defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_getrandom 278 +# elif __riscv_xlen == 64 +# define systemd_NR_getrandom 278 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_getrandom 349 +# elif defined(__sparc__) +# define systemd_NR_getrandom 347 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_getrandom (318 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_getrandom 318 +# endif +# elif !defined(missing_arch_template) +# warning "getrandom() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_getrandom && __NR_getrandom >= 0 +# if defined systemd_NR_getrandom +assert_cc(__NR_getrandom == systemd_NR_getrandom); +# endif +# else +# if defined __NR_getrandom +# undef __NR_getrandom +# endif +# if defined systemd_NR_getrandom && systemd_NR_getrandom >= 0 +# define __NR_getrandom systemd_NR_getrandom +# endif +# endif +#endif + +#ifndef __IGNORE_memfd_create +# if defined(__aarch64__) +# define systemd_NR_memfd_create 279 +# elif defined(__alpha__) +# define systemd_NR_memfd_create 512 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_memfd_create 279 +# elif defined(__arm__) +# define systemd_NR_memfd_create 385 +# elif defined(__i386__) +# define systemd_NR_memfd_create 356 +# elif defined(__ia64__) +# define systemd_NR_memfd_create 1340 +# elif defined(__loongarch_lp64) +# define systemd_NR_memfd_create 279 +# elif defined(__m68k__) +# define systemd_NR_memfd_create 353 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_memfd_create 4354 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_memfd_create 6318 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_memfd_create 5314 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_memfd_create 340 +# elif 
defined(__powerpc__) +# define systemd_NR_memfd_create 360 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_memfd_create 279 +# elif __riscv_xlen == 64 +# define systemd_NR_memfd_create 279 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_memfd_create 350 +# elif defined(__sparc__) +# define systemd_NR_memfd_create 348 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_memfd_create (319 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_memfd_create 319 +# endif +# elif !defined(missing_arch_template) +# warning "memfd_create() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_memfd_create && __NR_memfd_create >= 0 +# if defined systemd_NR_memfd_create +assert_cc(__NR_memfd_create == systemd_NR_memfd_create); +# endif +# else +# if defined __NR_memfd_create +# undef __NR_memfd_create +# endif +# if defined systemd_NR_memfd_create && systemd_NR_memfd_create >= 0 +# define __NR_memfd_create systemd_NR_memfd_create +# endif +# endif +#endif + +#ifndef __IGNORE_mount_setattr +# if defined(__aarch64__) +# define systemd_NR_mount_setattr 442 +# elif defined(__alpha__) +# define systemd_NR_mount_setattr 552 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_mount_setattr 442 +# elif defined(__arm__) +# define systemd_NR_mount_setattr 442 +# elif defined(__i386__) +# define systemd_NR_mount_setattr 442 +# elif defined(__ia64__) +# define systemd_NR_mount_setattr 1466 +# elif defined(__loongarch_lp64) +# define systemd_NR_mount_setattr 442 +# elif defined(__m68k__) +# define systemd_NR_mount_setattr 442 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_mount_setattr 4442 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_mount_setattr 6442 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_mount_setattr 
5442 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_mount_setattr 442 +# elif defined(__powerpc__) +# define systemd_NR_mount_setattr 442 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_mount_setattr 442 +# elif __riscv_xlen == 64 +# define systemd_NR_mount_setattr 442 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_mount_setattr 442 +# elif defined(__sparc__) +# define systemd_NR_mount_setattr 442 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_mount_setattr (442 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_mount_setattr 442 +# endif +# elif !defined(missing_arch_template) +# warning "mount_setattr() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_mount_setattr && __NR_mount_setattr >= 0 +# if defined systemd_NR_mount_setattr +assert_cc(__NR_mount_setattr == systemd_NR_mount_setattr); +# endif +# else +# if defined __NR_mount_setattr +# undef __NR_mount_setattr +# endif +# if defined systemd_NR_mount_setattr && systemd_NR_mount_setattr >= 0 +# define __NR_mount_setattr systemd_NR_mount_setattr +# endif +# endif +#endif + +#ifndef __IGNORE_move_mount +# if defined(__aarch64__) +# define systemd_NR_move_mount 429 +# elif defined(__alpha__) +# define systemd_NR_move_mount 539 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_move_mount 429 +# elif defined(__arm__) +# define systemd_NR_move_mount 429 +# elif defined(__i386__) +# define systemd_NR_move_mount 429 +# elif defined(__ia64__) +# define systemd_NR_move_mount 1453 +# elif defined(__loongarch_lp64) +# define systemd_NR_move_mount 429 +# elif defined(__m68k__) +# define systemd_NR_move_mount 429 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_move_mount 4429 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 
+# define systemd_NR_move_mount 6429 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_move_mount 5429 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_move_mount 429 +# elif defined(__powerpc__) +# define systemd_NR_move_mount 429 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_move_mount 429 +# elif __riscv_xlen == 64 +# define systemd_NR_move_mount 429 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_move_mount 429 +# elif defined(__sparc__) +# define systemd_NR_move_mount 429 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_move_mount (429 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_move_mount 429 +# endif +# elif !defined(missing_arch_template) +# warning "move_mount() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_move_mount && __NR_move_mount >= 0 +# if defined systemd_NR_move_mount +assert_cc(__NR_move_mount == systemd_NR_move_mount); +# endif +# else +# if defined __NR_move_mount +# undef __NR_move_mount +# endif +# if defined systemd_NR_move_mount && systemd_NR_move_mount >= 0 +# define __NR_move_mount systemd_NR_move_mount +# endif +# endif +#endif + +#ifndef __IGNORE_name_to_handle_at +# if defined(__aarch64__) +# define systemd_NR_name_to_handle_at 264 +# elif defined(__alpha__) +# define systemd_NR_name_to_handle_at 497 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_name_to_handle_at 264 +# elif defined(__arm__) +# define systemd_NR_name_to_handle_at 370 +# elif defined(__i386__) +# define systemd_NR_name_to_handle_at 341 +# elif defined(__ia64__) +# define systemd_NR_name_to_handle_at 1326 +# elif defined(__loongarch_lp64) +# define systemd_NR_name_to_handle_at 264 +# elif defined(__m68k__) +# define systemd_NR_name_to_handle_at 340 +# elif defined(_MIPS_SIM) 
+# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_name_to_handle_at 4339 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_name_to_handle_at 6303 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_name_to_handle_at 5298 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_name_to_handle_at 325 +# elif defined(__powerpc__) +# define systemd_NR_name_to_handle_at 345 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_name_to_handle_at 264 +# elif __riscv_xlen == 64 +# define systemd_NR_name_to_handle_at 264 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_name_to_handle_at 335 +# elif defined(__sparc__) +# define systemd_NR_name_to_handle_at 332 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_name_to_handle_at (303 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_name_to_handle_at 303 +# endif +# elif !defined(missing_arch_template) +# warning "name_to_handle_at() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_name_to_handle_at && __NR_name_to_handle_at >= 0 +# if defined systemd_NR_name_to_handle_at +assert_cc(__NR_name_to_handle_at == systemd_NR_name_to_handle_at); +# endif +# else +# if defined __NR_name_to_handle_at +# undef __NR_name_to_handle_at +# endif +# if defined systemd_NR_name_to_handle_at && systemd_NR_name_to_handle_at >= 0 +# define __NR_name_to_handle_at systemd_NR_name_to_handle_at +# endif +# endif +#endif + +#ifndef __IGNORE_open_tree +# if defined(__aarch64__) +# define systemd_NR_open_tree 428 +# elif defined(__alpha__) +# define systemd_NR_open_tree 538 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_open_tree 428 +# elif defined(__arm__) +# define systemd_NR_open_tree 428 +# elif defined(__i386__) +# define systemd_NR_open_tree 428 +# elif 
defined(__ia64__) +# define systemd_NR_open_tree 1452 +# elif defined(__loongarch_lp64) +# define systemd_NR_open_tree 428 +# elif defined(__m68k__) +# define systemd_NR_open_tree 428 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_open_tree 4428 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_open_tree 6428 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_open_tree 5428 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_open_tree 428 +# elif defined(__powerpc__) +# define systemd_NR_open_tree 428 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_open_tree 428 +# elif __riscv_xlen == 64 +# define systemd_NR_open_tree 428 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_open_tree 428 +# elif defined(__sparc__) +# define systemd_NR_open_tree 428 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_open_tree (428 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_open_tree 428 +# endif +# elif !defined(missing_arch_template) +# warning "open_tree() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_open_tree && __NR_open_tree >= 0 +# if defined systemd_NR_open_tree +assert_cc(__NR_open_tree == systemd_NR_open_tree); +# endif +# else +# if defined __NR_open_tree +# undef __NR_open_tree +# endif +# if defined systemd_NR_open_tree && systemd_NR_open_tree >= 0 +# define __NR_open_tree systemd_NR_open_tree +# endif +# endif +#endif + +#ifndef __IGNORE_openat2 +# if defined(__aarch64__) +# define systemd_NR_openat2 437 +# elif defined(__alpha__) +# define systemd_NR_openat2 547 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_openat2 437 +# elif defined(__arm__) +# define systemd_NR_openat2 437 +# elif defined(__i386__) +# define systemd_NR_openat2 437 +# 
elif defined(__ia64__) +# define systemd_NR_openat2 1461 +# elif defined(__loongarch_lp64) +# define systemd_NR_openat2 437 +# elif defined(__m68k__) +# define systemd_NR_openat2 437 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_openat2 4437 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_openat2 6437 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_openat2 5437 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_openat2 437 +# elif defined(__powerpc__) +# define systemd_NR_openat2 437 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_openat2 437 +# elif __riscv_xlen == 64 +# define systemd_NR_openat2 437 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_openat2 437 +# elif defined(__sparc__) +# define systemd_NR_openat2 437 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_openat2 (437 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_openat2 437 +# endif +# elif !defined(missing_arch_template) +# warning "openat2() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_openat2 && __NR_openat2 >= 0 +# if defined systemd_NR_openat2 +assert_cc(__NR_openat2 == systemd_NR_openat2); +# endif +# else +# if defined __NR_openat2 +# undef __NR_openat2 +# endif +# if defined systemd_NR_openat2 && systemd_NR_openat2 >= 0 +# define __NR_openat2 systemd_NR_openat2 +# endif +# endif +#endif + +#ifndef __IGNORE_pidfd_open +# if defined(__aarch64__) +# define systemd_NR_pidfd_open 434 +# elif defined(__alpha__) +# define systemd_NR_pidfd_open 544 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_pidfd_open 434 +# elif defined(__arm__) +# define systemd_NR_pidfd_open 434 +# elif defined(__i386__) +# define systemd_NR_pidfd_open 434 +# elif defined(__ia64__) +# 
define systemd_NR_pidfd_open 1458 +# elif defined(__loongarch_lp64) +# define systemd_NR_pidfd_open 434 +# elif defined(__m68k__) +# define systemd_NR_pidfd_open 434 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_pidfd_open 4434 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_pidfd_open 6434 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_pidfd_open 5434 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_pidfd_open 434 +# elif defined(__powerpc__) +# define systemd_NR_pidfd_open 434 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_pidfd_open 434 +# elif __riscv_xlen == 64 +# define systemd_NR_pidfd_open 434 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_pidfd_open 434 +# elif defined(__sparc__) +# define systemd_NR_pidfd_open 434 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_pidfd_open (434 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_pidfd_open 434 +# endif +# elif !defined(missing_arch_template) +# warning "pidfd_open() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_pidfd_open && __NR_pidfd_open >= 0 +# if defined systemd_NR_pidfd_open +assert_cc(__NR_pidfd_open == systemd_NR_pidfd_open); +# endif +# else +# if defined __NR_pidfd_open +# undef __NR_pidfd_open +# endif +# if defined systemd_NR_pidfd_open && systemd_NR_pidfd_open >= 0 +# define __NR_pidfd_open systemd_NR_pidfd_open +# endif +# endif +#endif + +#ifndef __IGNORE_pidfd_send_signal +# if defined(__aarch64__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__alpha__) +# define systemd_NR_pidfd_send_signal 534 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__arm__) +# define systemd_NR_pidfd_send_signal 424 +# elif 
defined(__i386__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__ia64__) +# define systemd_NR_pidfd_send_signal 1448 +# elif defined(__loongarch_lp64) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__m68k__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_pidfd_send_signal 4424 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_pidfd_send_signal 6424 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_pidfd_send_signal 5424 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__powerpc__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_pidfd_send_signal 424 +# elif __riscv_xlen == 64 +# define systemd_NR_pidfd_send_signal 424 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__sparc__) +# define systemd_NR_pidfd_send_signal 424 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_pidfd_send_signal (424 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_pidfd_send_signal 424 +# endif +# elif !defined(missing_arch_template) +# warning "pidfd_send_signal() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_pidfd_send_signal && __NR_pidfd_send_signal >= 0 +# if defined systemd_NR_pidfd_send_signal +assert_cc(__NR_pidfd_send_signal == systemd_NR_pidfd_send_signal); +# endif +# else +# if defined __NR_pidfd_send_signal +# undef __NR_pidfd_send_signal +# endif +# if defined systemd_NR_pidfd_send_signal && systemd_NR_pidfd_send_signal >= 0 +# define __NR_pidfd_send_signal systemd_NR_pidfd_send_signal +# endif +# endif +#endif + +#ifndef __IGNORE_pkey_mprotect +# if defined(__aarch64__) +# define 
systemd_NR_pkey_mprotect 288 +# elif defined(__alpha__) +# define systemd_NR_pkey_mprotect 524 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_pkey_mprotect 288 +# elif defined(__arm__) +# define systemd_NR_pkey_mprotect 394 +# elif defined(__i386__) +# define systemd_NR_pkey_mprotect 380 +# elif defined(__ia64__) +# define systemd_NR_pkey_mprotect 1354 +# elif defined(__loongarch_lp64) +# define systemd_NR_pkey_mprotect 288 +# elif defined(__m68k__) +# define systemd_NR_pkey_mprotect 381 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_pkey_mprotect 4363 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_pkey_mprotect 6327 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_pkey_mprotect 5323 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_pkey_mprotect 351 +# elif defined(__powerpc__) +# define systemd_NR_pkey_mprotect 386 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_pkey_mprotect 288 +# elif __riscv_xlen == 64 +# define systemd_NR_pkey_mprotect 288 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_pkey_mprotect 384 +# elif defined(__sparc__) +# define systemd_NR_pkey_mprotect 362 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_pkey_mprotect (329 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_pkey_mprotect 329 +# endif +# elif !defined(missing_arch_template) +# warning "pkey_mprotect() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_pkey_mprotect && __NR_pkey_mprotect >= 0 +# if defined systemd_NR_pkey_mprotect +assert_cc(__NR_pkey_mprotect == systemd_NR_pkey_mprotect); +# endif +# else +# if defined __NR_pkey_mprotect +# undef __NR_pkey_mprotect +# endif +# if defined systemd_NR_pkey_mprotect && systemd_NR_pkey_mprotect >= 0 +# 
define __NR_pkey_mprotect systemd_NR_pkey_mprotect +# endif +# endif +#endif + +#ifndef __IGNORE_renameat2 +# if defined(__aarch64__) +# define systemd_NR_renameat2 276 +# elif defined(__alpha__) +# define systemd_NR_renameat2 510 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_renameat2 276 +# elif defined(__arm__) +# define systemd_NR_renameat2 382 +# elif defined(__i386__) +# define systemd_NR_renameat2 353 +# elif defined(__ia64__) +# define systemd_NR_renameat2 1338 +# elif defined(__loongarch_lp64) +# define systemd_NR_renameat2 276 +# elif defined(__m68k__) +# define systemd_NR_renameat2 351 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_renameat2 4351 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_renameat2 6315 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_renameat2 5311 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_renameat2 337 +# elif defined(__powerpc__) +# define systemd_NR_renameat2 357 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_renameat2 276 +# elif __riscv_xlen == 64 +# define systemd_NR_renameat2 276 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_renameat2 347 +# elif defined(__sparc__) +# define systemd_NR_renameat2 345 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_renameat2 (316 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_renameat2 316 +# endif +# elif !defined(missing_arch_template) +# warning "renameat2() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_renameat2 && __NR_renameat2 >= 0 +# if defined systemd_NR_renameat2 +assert_cc(__NR_renameat2 == systemd_NR_renameat2); +# endif +# else +# if defined __NR_renameat2 +# undef __NR_renameat2 +# endif +# if defined systemd_NR_renameat2 && 
systemd_NR_renameat2 >= 0 +# define __NR_renameat2 systemd_NR_renameat2 +# endif +# endif +#endif + +#ifndef __IGNORE_setns +# if defined(__aarch64__) +# define systemd_NR_setns 268 +# elif defined(__alpha__) +# define systemd_NR_setns 501 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_setns 268 +# elif defined(__arm__) +# define systemd_NR_setns 375 +# elif defined(__i386__) +# define systemd_NR_setns 346 +# elif defined(__ia64__) +# define systemd_NR_setns 1330 +# elif defined(__loongarch_lp64) +# define systemd_NR_setns 268 +# elif defined(__m68k__) +# define systemd_NR_setns 344 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_setns 4344 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_setns 6308 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_setns 5303 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_setns 328 +# elif defined(__powerpc__) +# define systemd_NR_setns 350 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_setns 268 +# elif __riscv_xlen == 64 +# define systemd_NR_setns 268 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_setns 339 +# elif defined(__sparc__) +# define systemd_NR_setns 337 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_setns (308 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_setns 308 +# endif +# elif !defined(missing_arch_template) +# warning "setns() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_setns && __NR_setns >= 0 +# if defined systemd_NR_setns +assert_cc(__NR_setns == systemd_NR_setns); +# endif +# else +# if defined __NR_setns +# undef __NR_setns +# endif +# if defined systemd_NR_setns && systemd_NR_setns >= 0 +# define __NR_setns systemd_NR_setns +# endif +# endif +#endif + +#ifndef 
__IGNORE_statx +# if defined(__aarch64__) +# define systemd_NR_statx 291 +# elif defined(__alpha__) +# define systemd_NR_statx 522 +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_statx 291 +# elif defined(__arm__) +# define systemd_NR_statx 397 +# elif defined(__i386__) +# define systemd_NR_statx 383 +# elif defined(__ia64__) +# define systemd_NR_statx 1350 +# elif defined(__loongarch_lp64) +# define systemd_NR_statx 291 +# elif defined(__m68k__) +# define systemd_NR_statx 379 +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_statx 4366 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_statx 6330 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_statx 5326 +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_statx 349 +# elif defined(__powerpc__) +# define systemd_NR_statx 383 +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_statx 291 +# elif __riscv_xlen == 64 +# define systemd_NR_statx 291 +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_statx 379 +# elif defined(__sparc__) +# define systemd_NR_statx 360 +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_statx (332 | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_statx 332 +# endif +# elif !defined(missing_arch_template) +# warning "statx() syscall number is unknown for your architecture" +# endif + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_statx && __NR_statx >= 0 +# if defined systemd_NR_statx +assert_cc(__NR_statx == systemd_NR_statx); +# endif +# else +# if defined __NR_statx +# undef __NR_statx +# endif +# if defined systemd_NR_statx && systemd_NR_statx >= 0 +# define __NR_statx systemd_NR_statx +# endif +# endif +#endif diff --git a/src/basic/missing_syscalls.py b/src/basic/missing_syscalls.py new file mode 100644 index 0000000..3749e89 --- 
/dev/null +++ b/src/basic/missing_syscalls.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-2.1-or-later + +import sys +import functools + +# We only generate numbers for a dozen or so syscalls +SYSCALLS = [ + 'bpf', + 'close_range', + 'copy_file_range', + 'fchmodat2', + 'getrandom', + 'memfd_create', + 'mount_setattr', + 'move_mount', + 'name_to_handle_at', + 'open_tree', + 'openat2', + 'pidfd_open', + 'pidfd_send_signal', + 'pkey_mprotect', + 'renameat2', + 'setns', + 'statx', +] + +def dictify(f): + def wrap(*args, **kwargs): + return dict(f(*args, **kwargs)) + return functools.update_wrapper(wrap, f) + +@dictify +def parse_syscall_table(filename): + print(f'Reading {filename}…') + for line in open(filename): + items = line.split() + if len(items) >= 2: + yield items[0], int(items[1]) + +def parse_syscall_tables(filenames): + return {filename.split('-')[-1][:-4]: parse_syscall_table(filename) + for filename in filenames} + +DEF_TEMPLATE_A = '''\ + +#ifndef __IGNORE_{syscall} +''' + +DEF_TEMPLATE_B = '''\ +# if defined(__aarch64__) +# define systemd_NR_{syscall} {nr_arm64} +# elif defined(__alpha__) +# define systemd_NR_{syscall} {nr_alpha} +# elif defined(__arc__) || defined(__tilegx__) +# define systemd_NR_{syscall} {nr_arc} +# elif defined(__arm__) +# define systemd_NR_{syscall} {nr_arm} +# elif defined(__i386__) +# define systemd_NR_{syscall} {nr_i386} +# elif defined(__ia64__) +# define systemd_NR_{syscall} {nr_ia64} +# elif defined(__loongarch_lp64) +# define systemd_NR_{syscall} {nr_loongarch64} +# elif defined(__m68k__) +# define systemd_NR_{syscall} {nr_m68k} +# elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# define systemd_NR_{syscall} {nr_mipso32} +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# define systemd_NR_{syscall} {nr_mips64n32} +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# define systemd_NR_{syscall} {nr_mips64} +# else +# error "Unknown MIPS ABI" +# endif +# elif defined(__hppa__) +# define systemd_NR_{syscall} 
{nr_parisc} +# elif defined(__powerpc__) +# define systemd_NR_{syscall} {nr_powerpc} +# elif defined(__riscv) +# if __riscv_xlen == 32 +# define systemd_NR_{syscall} {nr_riscv32} +# elif __riscv_xlen == 64 +# define systemd_NR_{syscall} {nr_riscv64} +# else +# error "Unknown RISC-V ABI" +# endif +# elif defined(__s390__) +# define systemd_NR_{syscall} {nr_s390} +# elif defined(__sparc__) +# define systemd_NR_{syscall} {nr_sparc} +# elif defined(__x86_64__) +# if defined(__ILP32__) +# define systemd_NR_{syscall} ({nr_x86_64} | /* __X32_SYSCALL_BIT */ 0x40000000) +# else +# define systemd_NR_{syscall} {nr_x86_64} +# endif +# elif !defined(missing_arch_template) +%s +# endif +''' + +DEF_TEMPLATE_C = '''\ + +/* may be an (invalid) negative number due to libseccomp, see PR 13319 */ +# if defined __NR_{syscall} && __NR_{syscall} >= 0 +# if defined systemd_NR_{syscall} +assert_cc(__NR_{syscall} == systemd_NR_{syscall}); +# endif +# else +# if defined __NR_{syscall} +# undef __NR_{syscall} +# endif +# if defined systemd_NR_{syscall} && systemd_NR_{syscall} >= 0 +# define __NR_{syscall} systemd_NR_{syscall} +# endif +# endif +#endif''' + +DEF_TEMPLATE = (DEF_TEMPLATE_A + + DEF_TEMPLATE_B % '# warning "{syscall}() syscall number is unknown for your architecture"' + + DEF_TEMPLATE_C) + +ARCH_CHECK = '''\ +/* Note: if this code looks strange, this is because it is derived from the same + * template as the per-syscall blocks below. 
*/ +''' + '\n'.join(line for line in DEF_TEMPLATE_B.splitlines() + if ' define ' not in line) % '''\ +# warning "Current architecture is missing from the template" +# define missing_arch_template 1''' + +def print_syscall_def(syscall, tables, out): + mappings = {f'nr_{arch}':t.get(syscall, -1) + for arch, t in tables.items()} + print(DEF_TEMPLATE.format(syscall=syscall, **mappings), + file=out) + +def print_syscall_defs(syscalls, tables, out): + print('''\ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * This file is generated by src/basic/missing_syscalls.py. Do not edit! + * + * Use 'ninja -C build update-syscall-tables' to download new syscall tables, + * and 'ninja -C build update-syscall-header' to regenerate this file. + */ +#pragma once +''', + file=out) + print(ARCH_CHECK, file=out) + for syscall in syscalls: + print_syscall_def(syscall, tables, out) + +if __name__ == '__main__': + output_file = sys.argv[1] + arch_files = sys.argv[2:] + out = open(output_file, 'wt') + + tables = parse_syscall_tables(arch_files) + print_syscall_defs(SYSCALLS, tables, out) + + print(f'Wrote {output_file}') diff --git a/src/basic/missing_threads.h b/src/basic/missing_threads.h new file mode 100644 index 0000000..fb3b722 --- /dev/null +++ b/src/basic/missing_threads.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* If threads.h doesn't exist, then define our own thread_local to match C11's thread_local. 
*/ +#if HAVE_THREADS_H +# include +#elif !(defined(thread_local)) +/* Don't break on glibc < 2.16 that doesn't define __STDC_NO_THREADS__ + * see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53769 */ +# if __STDC_VERSION__ >= 201112L && !(defined(__STDC_NO_THREADS__) || (defined(__GNU_LIBRARY__) && __GLIBC__ == 2 && __GLIBC_MINOR__ < 16)) +# define thread_local _Thread_local +# else +# define thread_local __thread +# endif +#endif diff --git a/src/basic/missing_timerfd.h b/src/basic/missing_timerfd.h new file mode 100644 index 0000000..dba3043 --- /dev/null +++ b/src/basic/missing_timerfd.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#ifndef TFD_TIMER_CANCEL_ON_SET +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#endif diff --git a/src/basic/missing_type.h b/src/basic/missing_type.h new file mode 100644 index 0000000..f623309 --- /dev/null +++ b/src/basic/missing_type.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#if !HAVE_CHAR32_T +#define char32_t uint32_t +#endif + +#if !HAVE_CHAR16_T +#define char16_t uint16_t +#endif diff --git a/src/basic/missing_xfs.h b/src/basic/missing_xfs.h new file mode 100644 index 0000000..ba5fe81 --- /dev/null +++ b/src/basic/missing_xfs.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* This is currently not exported in the public kernel headers, but the libxfs library code part of xfsprogs + * defines it as public header */ + +#ifndef XFS_IOC_FSGEOMETRY +#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) + +typedef struct xfs_fsop_geom { + uint32_t blocksize; + uint32_t rtextsize; + uint32_t agblocks; + uint32_t agcount; + uint32_t logblocks; + uint32_t sectsize; + uint32_t inodesize; + uint32_t imaxpct; + uint64_t datablocks; + uint64_t rtblocks; + uint64_t rtextents; + uint64_t logstart; + unsigned char uuid[16]; + uint32_t sunit; + uint32_t swidth; + int32_t version; + uint32_t 
flags; + uint32_t logsectsize; + uint32_t rtsectsize; + uint32_t dirblocksize; + uint32_t logsunit; +} xfs_fsop_geom_t; +#endif + +#ifndef XFS_IOC_FSGROWFSDATA +#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data) + +typedef struct xfs_growfs_data { + uint64_t newblocks; + uint32_t imaxpct; +} xfs_growfs_data_t; +#endif diff --git a/src/basic/mkdir.c b/src/basic/mkdir.c new file mode 100644 index 0000000..c770e5e --- /dev/null +++ b/src/basic/mkdir.c @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "btrfs.h" +#include "chase.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "macro.h" +#include "mkdir.h" +#include "path-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "user-util.h" + +int mkdirat_safe_internal( + int dir_fd, + const char *path, + mode_t mode, + uid_t uid, + gid_t gid, + MkdirFlags flags, + mkdirat_func_t _mkdirat) { + + struct stat st; + int r; + + assert(path); + assert(mode != MODE_INVALID); + assert(_mkdirat && _mkdirat != mkdirat); + + r = _mkdirat(dir_fd, path, mode); + if (r >= 0) + return chmod_and_chown_at(dir_fd, path, mode, uid, gid); + if (r != -EEXIST) + return r; + + if (fstatat(dir_fd, path, &st, AT_SYMLINK_NOFOLLOW) < 0) + return -errno; + + if ((flags & MKDIR_FOLLOW_SYMLINK) && S_ISLNK(st.st_mode)) { + _cleanup_free_ char *p = NULL; + + r = chaseat(dir_fd, path, CHASE_NONEXISTENT, &p, NULL); + if (r < 0) + return r; + if (r == 0) + return mkdirat_safe_internal(dir_fd, p, mode, uid, gid, + flags & ~MKDIR_FOLLOW_SYMLINK, + _mkdirat); + + if (fstatat(dir_fd, p, &st, AT_SYMLINK_NOFOLLOW) < 0) + return -errno; + } + + if (flags & MKDIR_IGNORE_EXISTING) + return 0; + + if (!S_ISDIR(st.st_mode)) + return log_full_errno(flags & MKDIR_WARN_MODE ? 
LOG_WARNING : LOG_DEBUG, SYNTHETIC_ERRNO(ENOTDIR), + "Path \"%s\" already exists and is not a directory, refusing.", path); + + if ((st.st_mode & ~mode & 0777) != 0) + return log_full_errno(flags & MKDIR_WARN_MODE ? LOG_WARNING : LOG_DEBUG, SYNTHETIC_ERRNO(EEXIST), + "Directory \"%s\" already exists, but has mode %04o that is too permissive (%04o was requested), refusing.", + path, st.st_mode & 0777, mode); + + if ((uid != UID_INVALID && st.st_uid != uid) || + (gid != GID_INVALID && st.st_gid != gid)) { + char u[DECIMAL_STR_MAX(uid_t)] = "-", g[DECIMAL_STR_MAX(gid_t)] = "-"; + + if (uid != UID_INVALID) + xsprintf(u, UID_FMT, uid); + if (gid != GID_INVALID) /* gid is a gid_t: use the gid sentinel, as the ownership check above does */ + xsprintf(g, GID_FMT, gid); + return log_full_errno(flags & MKDIR_WARN_MODE ? LOG_WARNING : LOG_DEBUG, SYNTHETIC_ERRNO(EEXIST), + "Directory \"%s\" already exists, but is owned by "UID_FMT":"GID_FMT" (%s:%s was requested), refusing.", + path, st.st_uid, st.st_gid, u, g); + } + + return 0; +} + +int mkdirat_errno_wrapper(int dirfd, const char *pathname, mode_t mode) { + return RET_NERRNO(mkdirat(dirfd, pathname, mode)); +} + +int mkdirat_safe(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) { + return mkdirat_safe_internal(dir_fd, path, mode, uid, gid, flags, mkdirat_errno_wrapper); +} + +int mkdirat_parents_internal(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdirat) { + const char *e = NULL; + int r; + + assert(path); + assert(_mkdirat != mkdirat); + + if (isempty(path)) + return 0; + + if (!path_is_safe(path)) + return -ENOTDIR; + + /* return immediately if directory exists */ + r = path_find_last_component(path, /* accept_dot_dot= */ false, &e, NULL); + if (r <= 0) /* r == 0 means path is equivalent to prefix.
*/ + return r; + if (e == path) + return 0; + + assert(e > path); + assert(*e == '/'); + + /* drop the last component */ + path = strndupa_safe(path, e - path); + r = is_dir_full(dir_fd, path, true); + if (r > 0) + return 0; + if (r == 0) + return -ENOTDIR; + + /* create every parent directory in the path, except the last component */ + for (const char *p = path;;) { + char *s; + int n; + + n = path_find_first_component(&p, /* accept_dot_dot= */ false, (const char **) &s); + if (n <= 0) + return n; + + assert(p); + assert(s >= path); + assert(IN_SET(s[n], '/', '\0')); + + s[n] = '\0'; + + r = mkdirat_safe_internal(dir_fd, path, mode, uid, gid, flags | MKDIR_IGNORE_EXISTING, _mkdirat); + if (r < 0 && r != -EEXIST) + return r; + + s[n] = *p == '\0' ? '\0' : '/'; + } +} + +int mkdir_parents_internal(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdirat) { + _cleanup_close_ int fd = AT_FDCWD; + const char *p; + + assert(path); + assert(_mkdirat != mkdirat); + + if (prefix) { + p = path_startswith_full(path, prefix, /* accept_dot_dot= */ false); + if (!p) + return -ENOTDIR; + } else + p = path; + + if (prefix) { + fd = open(prefix, O_PATH|O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return -errno; + } + + return mkdirat_parents_internal(fd, p, mode, uid, gid, flags, _mkdirat); +} + +int mkdirat_parents(int dir_fd, const char *path, mode_t mode) { + return mkdirat_parents_internal(dir_fd, path, mode, UID_INVALID, UID_INVALID, 0, mkdirat_errno_wrapper); +} + +int mkdir_parents_safe(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) { + return mkdir_parents_internal(prefix, path, mode, uid, gid, flags, mkdirat_errno_wrapper); +} + +int mkdir_p_internal(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdirat) { + int r; + + /* Like mkdir -p */ + + assert(_mkdirat != mkdirat); + + r = mkdir_parents_internal(prefix, path, 
mode, uid, gid, flags | MKDIR_FOLLOW_SYMLINK, _mkdirat); + if (r < 0) + return r; + + if (!uid_is_valid(uid) && !gid_is_valid(gid) && flags == 0) { + r = _mkdirat(AT_FDCWD, path, mode); + if (r < 0 && (r != -EEXIST || is_dir(path, true) <= 0)) + return r; + } else { + r = mkdir_safe_internal(path, mode, uid, gid, flags, _mkdirat); + if (r < 0 && r != -EEXIST) + return r; + } + + return 0; +} + +int mkdir_p(const char *path, mode_t mode) { + return mkdir_p_internal(NULL, path, mode, UID_INVALID, UID_INVALID, 0, mkdirat_errno_wrapper); +} + +int mkdir_p_safe(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) { + return mkdir_p_internal(prefix, path, mode, uid, gid, flags, mkdirat_errno_wrapper); +} + +int mkdir_p_root(const char *root, const char *p, uid_t uid, gid_t gid, mode_t m, char **subvolumes) { + _cleanup_free_ char *pp = NULL, *bn = NULL; + _cleanup_close_ int dfd = -EBADF; + int r; + + r = path_extract_directory(p, &pp); + if (r == -EDESTADDRREQ) { + /* only fname is passed, no prefix to operate on */ + dfd = open(".", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (dfd < 0) + return -errno; + } else if (r == -EADDRNOTAVAIL) + /* only root dir or "." was passed, i.e. there is no parent to extract, in that case there's nothing to do. */ + return 0; + else if (r < 0) + return r; + else { + /* Extracting the parent dir worked, hence we aren't top-level? Recurse up first. 
*/ + r = mkdir_p_root(root, pp, uid, gid, m, subvolumes); + if (r < 0) + return r; + + dfd = chase_and_open(pp, root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_DIRECTORY, NULL); + if (dfd < 0) + return dfd; + } + + r = path_extract_filename(p, &bn); + if (r == -EADDRNOTAVAIL) /* Already top-level */ + return 0; + if (r < 0) + return r; + + if (path_strv_contains(subvolumes, p)) + r = btrfs_subvol_make_fallback(dfd, bn, m); + else + r = RET_NERRNO(mkdirat(dfd, bn, m)); + if (r < 0) { + if (r == -EEXIST) + return 0; + + return r; + } + + if (uid_is_valid(uid) || gid_is_valid(gid)) { + _cleanup_close_ int nfd = -EBADF; + + nfd = openat(dfd, bn, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); + if (nfd < 0) + return -errno; + + if (fchown(nfd, uid, gid) < 0) + return -errno; + } + + return 1; +} diff --git a/src/basic/mkdir.h b/src/basic/mkdir.h new file mode 100644 index 0000000..e538748 --- /dev/null +++ b/src/basic/mkdir.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +typedef enum MkdirFlags { + MKDIR_FOLLOW_SYMLINK = 1 << 0, + MKDIR_IGNORE_EXISTING = 1 << 1, /* Quietly accept a preexisting directory (or file) */ + MKDIR_WARN_MODE = 1 << 2, /* Log at LOG_WARNING when mode doesn't match */ +} MkdirFlags; + +int mkdirat_errno_wrapper(int dirfd, const char *pathname, mode_t mode); + +int mkdirat_safe(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags); +static inline int mkdir_safe(const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) { + return mkdirat_safe(AT_FDCWD, path, mode, uid, gid, flags); +} +int mkdirat_parents(int dir_fd, const char *path, mode_t mode); +static inline int mkdir_parents(const char *path, mode_t mode) { + return mkdirat_parents(AT_FDCWD, path, mode); +} +int mkdir_parents_safe(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags); +int mkdir_p(const char *path, mode_t mode); +int mkdir_p_safe(const char 
*prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags); +int mkdir_p_root(const char *root, const char *p, uid_t uid, gid_t gid, mode_t m, char **subvolumes); + +/* The following are used to implement the mkdir_xyz_label() calls, don't use otherwise. */ +typedef int (*mkdirat_func_t)(int dir_fd, const char *pathname, mode_t mode); +int mkdirat_safe_internal(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdir); +static inline int mkdir_safe_internal(const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdir) { + return mkdirat_safe_internal(AT_FDCWD, path, mode, uid, gid, flags, _mkdir); +} +int mkdirat_parents_internal(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdirat); +int mkdir_parents_internal(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdir); +int mkdir_p_internal(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags, mkdirat_func_t _mkdir); diff --git a/src/basic/mountpoint-util.c b/src/basic/mountpoint-util.c new file mode 100644 index 0000000..bf67f7e --- /dev/null +++ b/src/basic/mountpoint-util.c @@ -0,0 +1,786 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#if WANT_LINUX_FS_H +#include +#endif + +#include "alloc-util.h" +#include "chase.h" +#include "fd-util.h" +#include "fileio.h" +#include "filesystems.h" +#include "fs-util.h" +#include "missing_fs.h" +#include "missing_mount.h" +#include "missing_stat.h" +#include "missing_syscall.h" +#include "mkdir.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "user-util.h" + +/* This is the original MAX_HANDLE_SZ definition from the kernel, when the API was introduced. 
We use that in place of + * any more currently defined value to future-proof things: if the size is increased in the API headers, and our code + * is recompiled then it would cease working on old kernels, as those refuse any sizes larger than this value with + * EINVAL right-away. Hence, let's disconnect ourselves from any such API changes, and stick to the original definition + * from when it was introduced. We use it as a start value only anyway (see below), and hence should be able to deal + * with large file handles anyway. */ +#define ORIGINAL_MAX_HANDLE_SZ 128 + +int name_to_handle_at_loop( + int fd, + const char *path, + struct file_handle **ret_handle, + int *ret_mnt_id, + int flags) { + + size_t n = ORIGINAL_MAX_HANDLE_SZ; + + assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0); + + /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified + * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a + * start value, it is not an upper bound on the buffer size required. + * + * This improves on raw name_to_handle_at() also in one other regard: ret_handle and ret_mnt_id can be passed + * as NULL if there's no interest in either. */ + + for (;;) { + _cleanup_free_ struct file_handle *h = NULL; + int mnt_id = -1; + + h = malloc0(offsetof(struct file_handle, f_handle) + n); + if (!h) + return -ENOMEM; + + h->handle_bytes = n; + + if (name_to_handle_at(fd, strempty(path), h, &mnt_id, flags) >= 0) { + + if (ret_handle) + *ret_handle = TAKE_PTR(h); + + if (ret_mnt_id) + *ret_mnt_id = mnt_id; + + return 0; + } + if (errno != EOVERFLOW) + return -errno; + + if (!ret_handle && ret_mnt_id && mnt_id >= 0) { + + /* As it appears, name_to_handle_at() fills in mnt_id even when it returns EOVERFLOW when the + * buffer is too small, but that's undocumented. 
Hence, let's make use of this if it appears to + * be filled in, and the caller was interested in only the mount ID and nothing else. */ + + *ret_mnt_id = mnt_id; + return 0; + } + + /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something + * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small + * buffer. In that case propagate EOVERFLOW */ + if (h->handle_bytes <= n) + return -EOVERFLOW; + + /* The buffer was too small. Size the new buffer by what name_to_handle_at() returned. */ + n = h->handle_bytes; + + /* paranoia: check for overflow (note that .handle_bytes is unsigned only) */ + if (n > UINT_MAX - offsetof(struct file_handle, f_handle)) + return -EOVERFLOW; + } +} + +static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) { + char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)]; + _cleanup_free_ char *fdinfo = NULL; + _cleanup_close_ int subfd = -EBADF; + char *p; + int r; + + assert(ret_mnt_id); + assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0); + + if ((flags & AT_EMPTY_PATH) && isempty(filename)) + xsprintf(path, "/proc/self/fdinfo/%i", fd); + else { + subfd = openat(fd, filename, O_CLOEXEC|O_PATH|(flags & AT_SYMLINK_FOLLOW ? 0 : O_NOFOLLOW)); + if (subfd < 0) + return -errno; + + xsprintf(path, "/proc/self/fdinfo/%i", subfd); + } + + r = read_full_virtual_file(path, &fdinfo, NULL); + if (r == -ENOENT) /* The fdinfo directory is a relatively new addition */ + return proc_mounted() > 0 ? 
-EOPNOTSUPP : -ENOSYS; + if (r < 0) + return r; + + p = find_line_startswith(fdinfo, "mnt_id:"); + if (!p) /* The mnt_id field is a relatively new addition */ + return -EOPNOTSUPP; + + p += strspn(p, WHITESPACE); + p[strcspn(p, WHITESPACE)] = 0; + + return safe_atoi(p, ret_mnt_id); +} + +static bool filename_possibly_with_slash_suffix(const char *s) { + const char *slash, *copied; + + /* Checks whether the specified string is either a filename, or a filename with a suffix of + * slashes. But nothing else. + * + * this is OK: foo, bar, foo/, bar/, foo//, bar/// + * this is not OK: "", "/", "/foo", "foo/bar", ".", ".." … */ + + slash = strchr(s, '/'); + if (!slash) + return filename_is_valid(s); + + if (slash - s > PATH_MAX) /* We want to allocate on the stack below, hence do a size check first */ + return false; + + if (slash[strspn(slash, "/")] != 0) /* Check that the suffix consists only of one or more slashes */ + return false; + + copied = strndupa_safe(s, slash - s); + return filename_is_valid(copied); +} + +static bool is_name_to_handle_at_fatal_error(int err) { + /* name_to_handle_at() can return "acceptable" errors that are due to the context. For + * example the kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall + * was blocked (EACCES/EPERM; maybe through seccomp, because we are running inside of a + * container), or the mount point is not triggered yet (EOVERFLOW, think nfs4), or some + * general name_to_handle_at() flakiness (EINVAL). However other errors are not supposed to + * happen and therefore are considered fatal ones. 
*/ + + assert(err < 0); + + return !IN_SET(err, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL); +} + +int fd_is_mount_point(int fd, const char *filename, int flags) { + _cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL; + int mount_id = -1, mount_id_parent = -1; + bool nosupp = false, check_st_dev = true; + STRUCT_STATX_DEFINE(sx); + struct stat a, b; + int r; + + assert(fd >= 0); + assert((flags & ~AT_SYMLINK_FOLLOW) == 0); + + if (!filename) { + /* If the file name is specified as NULL we'll see if the specified 'fd' is a mount + * point. That's only supported if the kernel supports statx(), or if the inode specified via + * 'fd' refers to a directory. Otherwise, we'll have to fail (ENOTDIR), because we have no + * kernel API to query the information we need. */ + flags |= AT_EMPTY_PATH; + filename = ""; + } else if (!filename_possibly_with_slash_suffix(filename)) + /* Insist that the specified filename is actually a filename, and not a path, i.e. some inode further + * up or down the tree than immediately below the specified directory fd. */ + return -EINVAL; + + /* First we will try statx()'s STATX_ATTR_MOUNT_ROOT attribute, which is our ideal API, available + * since kernel 5.8. + * + * If that fails, our second try is the name_to_handle_at() syscall, which tells us the mount id and + * an opaque file "handle". It is not supported everywhere though (kernel compile-time option, not + * all file systems are hooked up). If it works the mount id is usually good enough to tell us + * whether something is a mount point. + * + * If that didn't work we will try to read the mount id from /proc/self/fdinfo/. This is almost + * as good as name_to_handle_at(), however, does not return the opaque file handle. The opaque file + * handle is pretty useful to detect the root directory, which we should always consider a mount + * point. Hence we use this only as fallback. Exporting the mnt_id in fdinfo is a pretty recent + * kernel addition. 
+ * + * As last fallback we do traditional fstat() based st_dev comparisons. This is how things were + * traditionally done, but unionfs breaks this since it exposes file systems with a variety of st_dev + * reported. Also, btrfs subvolumes have different st_dev, even though they aren't real mounts of + * their own. */ + + if (statx(fd, + filename, + (FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : AT_SYMLINK_NOFOLLOW) | + (flags & AT_EMPTY_PATH) | + AT_NO_AUTOMOUNT | /* don't trigger automounts – mounts are a local concept, hence no need to trigger automounts to determine STATX_ATTR_MOUNT_ROOT */ + AT_STATX_DONT_SYNC, /* don't go to the network for this – for similar reasons */ + STATX_TYPE, + &sx) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno) && /* statx() is not supported by the kernel. */ + !ERRNO_IS_PRIVILEGE(errno) && /* maybe filtered by seccomp. */ + errno != EINVAL) /* glibc's fallback method returns EINVAL when AT_STATX_DONT_SYNC is set. */ + return -errno; + + /* If statx() is not available or forbidden, fall back to name_to_handle_at() below */ + } else if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) /* yay! 
*/ + return FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT); + else if (FLAGS_SET(sx.stx_mask, STATX_TYPE) && S_ISLNK(sx.stx_mode)) + return false; /* symlinks are never mount points */ + + r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags); + if (r < 0) { + if (is_name_to_handle_at_fatal_error(r)) + return r; + if (r != -EOPNOTSUPP) + goto fallback_fdinfo; + + /* This kernel or file system does not support name_to_handle_at(), hence let's see + * if the upper fs supports it (in which case it is a mount point), otherwise fall + * back to the traditional stat() logic */ + nosupp = true; + } + + if (isempty(filename)) + r = name_to_handle_at_loop(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */ + else + r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH); + if (r < 0) { + if (is_name_to_handle_at_fatal_error(r)) + return r; + if (r != -EOPNOTSUPP) + goto fallback_fdinfo; + if (nosupp) + /* Both the parent and the directory can't do name_to_handle_at() */ + goto fallback_fdinfo; + + /* The parent can't do name_to_handle_at() but the directory we are + * interested in can? If so, it must be a mount point. */ + return 1; + } + + /* The parent can do name_to_handle_at() but the directory we are interested in can't? If + * so, it must be a mount point. */ + if (nosupp) + return 1; + + /* If the file handle for the directory we are interested in and its parent are identical, + * we assume this is the root directory, which is a mount point. 
*/ + + if (h->handle_type == h_parent->handle_type && + memcmp_nn(h->f_handle, h->handle_bytes, + h_parent->f_handle, h_parent->handle_bytes) == 0) + return 1; + + return mount_id != mount_id_parent; + +fallback_fdinfo: + r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id); + if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM, -ENOSYS)) + goto fallback_fstat; + if (r < 0) + return r; + + if (isempty(filename)) + r = fd_fdinfo_mnt_id(fd, "..", 0, &mount_id_parent); /* can't work for non-directories 😢 */ + else + r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent); + if (r < 0) + return r; + + if (mount_id != mount_id_parent) + return 1; + + /* Hmm, so, the mount ids are the same. This leaves one special case though for the root file + * system. For that, let's see if the parent directory has the same inode as we are interested + * in. Hence, let's also do fstat() checks now, too, but avoid the st_dev comparisons, since they + * aren't that useful on unionfs mounts. */ + check_st_dev = false; + +fallback_fstat: + /* yay for fstatat() taking a different set of flags than the other _at() above */ + if (flags & AT_SYMLINK_FOLLOW) + flags &= ~AT_SYMLINK_FOLLOW; + else + flags |= AT_SYMLINK_NOFOLLOW; + if (fstatat(fd, filename, &a, flags) < 0) + return -errno; + if (S_ISLNK(a.st_mode)) /* Symlinks are never mount points */ + return false; + + if (isempty(filename)) + r = fstatat(fd, "..", &b, 0); + else + r = fstatat(fd, "", &b, AT_EMPTY_PATH); + if (r < 0) + return -errno; + + /* A directory with same device and inode as its parent? 
Must be the root directory */ + if (stat_inode_same(&a, &b)) + return 1; + + return check_st_dev && (a.st_dev != b.st_dev); +} + +/* flags can be AT_SYMLINK_FOLLOW or 0 */ +int path_is_mount_point(const char *t, const char *root, int flags) { + _cleanup_free_ char *canonical = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(t); + assert((flags & ~AT_SYMLINK_FOLLOW) == 0); + + if (path_equal(t, "/")) + return 1; + + /* we need to resolve symlinks manually, we can't just rely on + * fd_is_mount_point() to do that for us; if we have a structure like + * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we + * look at needs to be /usr, not /. */ + if (flags & AT_SYMLINK_FOLLOW) { + r = chase(t, root, CHASE_TRAIL_SLASH, &canonical, NULL); + if (r < 0) + return r; + + t = canonical; + } + + fd = open_parent(t, O_PATH|O_CLOEXEC, 0); + if (fd < 0) + return fd; + + return fd_is_mount_point(fd, last_path_component(t), flags); +} + +int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret) { + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(ret); + + r = name_to_handle_at_loop(dir_fd, path, NULL, ret, isempty(path) ? AT_EMPTY_PATH : 0); + if (r == 0 || is_name_to_handle_at_fatal_error(r)) + return r; + + return fd_fdinfo_mnt_id(dir_fd, path, isempty(path) ? AT_EMPTY_PATH : 0, ret); +} + +int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) { + STRUCT_NEW_STATX_DEFINE(buf); + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(ret); + + if (statx(dir_fd, + strempty(path), + (isempty(path) ? AT_EMPTY_PATH : AT_SYMLINK_NOFOLLOW) | + AT_NO_AUTOMOUNT | /* don't trigger automounts, mnt_id is a local concept */ + AT_STATX_DONT_SYNC, /* don't go to the network, mnt_id is a local concept */ + STATX_MNT_ID, + &buf.sx) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno) && /* statx() is not supported by the kernel. */ + !ERRNO_IS_PRIVILEGE(errno) && /* maybe filtered by seccomp. 
*/ + errno != EINVAL) /* glibc's fallback method returns EINVAL when AT_STATX_DONT_SYNC is set. */ + return -errno; + + /* Fall back to name_to_handle_at() and then fdinfo if statx is not supported or we lack + * privileges */ + + } else if (FLAGS_SET(buf.nsx.stx_mask, STATX_MNT_ID)) { + *ret = buf.nsx.stx_mnt_id; + return 0; + } + + return path_get_mnt_id_at_fallback(dir_fd, path, ret); +} + +bool fstype_is_network(const char *fstype) { + const char *x; + + x = startswith(fstype, "fuse."); + if (x) + fstype = x; + + if (nulstr_contains(filesystem_sets[FILESYSTEM_SET_NETWORK].value, fstype)) + return true; + + /* Filesystems not present in the internal database */ + return STR_IN_SET(fstype, + "davfs", + "glusterfs", + "lustre", + "sshfs"); +} + +bool fstype_needs_quota(const char *fstype) { + /* 1. quotacheck needs to be run for some filesystems after they are mounted + * if the filesystem was not unmounted cleanly. + * 2. You may need to run quotaon to enable quota usage tracking and/or + * enforcement. + * ext2 - needs 1) and 2) + * ext3 - needs 2) if configured using usrjquota/grpjquota mount options + * ext4 - needs 1) if created without journal, needs 2) if created without QUOTA + * filesystem feature + * reiserfs - needs 2). 
+ * jfs - needs 2) + * f2fs - needs 2) if configured using usrjquota/grpjquota/prjjquota mount options + * xfs - nothing needed + * gfs2 - nothing needed + * ocfs2 - nothing needed + * btrfs - nothing needed + * for reference see filesystem and quota manpages */ + return STR_IN_SET(fstype, + "ext2", + "ext3", + "ext4", + "reiserfs", + "jfs", + "f2fs"); +} + +bool fstype_is_api_vfs(const char *fstype) { + const FilesystemSet *fs; + + FOREACH_POINTER(fs, + filesystem_sets + FILESYSTEM_SET_BASIC_API, + filesystem_sets + FILESYSTEM_SET_AUXILIARY_API, + filesystem_sets + FILESYSTEM_SET_PRIVILEGED_API, + filesystem_sets + FILESYSTEM_SET_TEMPORARY) + if (nulstr_contains(fs->value, fstype)) + return true; + + /* Filesystems not present in the internal database */ + return STR_IN_SET(fstype, + "autofs", + "cpuset", + "devtmpfs"); +} + +bool fstype_is_blockdev_backed(const char *fstype) { + const char *x; + + x = startswith(fstype, "fuse."); + if (x) + fstype = x; + + return !streq(fstype, "9p") && !fstype_is_network(fstype) && !fstype_is_api_vfs(fstype); +} + +bool fstype_is_ro(const char *fstype) { + /* All Linux file systems that are necessarily read-only */ + return STR_IN_SET(fstype, + "DM_verity_hash", + "cramfs", + "erofs", + "iso9660", + "squashfs"); +} + +bool fstype_can_discard(const char *fstype) { + assert(fstype); + + /* Use a curated list as first check, to avoid calling fsopen() which might load kmods, which might + * not be allowed in our MAC context. */ + if (STR_IN_SET(fstype, "btrfs", "f2fs", "ext4", "vfat", "xfs")) + return true; + + /* On new kernels we can just ask the kernel */ + return mount_option_supported(fstype, "discard", NULL) > 0; +} + +bool fstype_can_norecovery(const char *fstype) { + assert(fstype); + + /* Use a curated list as first check, to avoid calling fsopen() which might load kmods, which might + * not be allowed in our MAC context. 
*/ + if (STR_IN_SET(fstype, "ext3", "ext4", "xfs", "btrfs")) + return true; + + /* On new kernels we can just ask the kernel */ + return mount_option_supported(fstype, "norecovery", NULL) > 0; +} + +bool fstype_can_umask(const char *fstype) { + assert(fstype); + + /* Use a curated list as first check, to avoid calling fsopen() which might load kmods, which might + * not be allowed in our MAC context. If we don't know ourselves, on new kernels we can just ask the + * kernel. */ + return streq(fstype, "vfat") || mount_option_supported(fstype, "umask", "0077") > 0; +} + +bool fstype_can_uid_gid(const char *fstype) { + /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and + * directories, current and future. Note that this does *not* ask the kernel via + * mount_option_supported() here because the uid=/gid= setting of various file systems mean different + * things: some apply it only to the root dir inode, others to all inodes in the file system. Thus we + * maintain the curated list below. 😢 */ + + return STR_IN_SET(fstype, + "adfs", + "exfat", + "fat", + "hfs", + "hpfs", + "iso9660", + "msdos", + "ntfs", + "vfat"); +} + +int dev_is_devtmpfs(void) { + _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; + int mount_id, r; + char *e; + + r = path_get_mnt_id("/dev", &mount_id); + if (r < 0) + return r; + + r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo); + if (r == -ENOENT) + return proc_mounted() > 0 ? 
-ENOENT : -ENOSYS; + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *line = NULL; + int mid; + + r = read_line(proc_self_mountinfo, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + if (sscanf(line, "%i", &mid) != 1) + continue; + + if (mid != mount_id) + continue; + + e = strstrafter(line, " - "); + if (!e) + continue; + + /* accept any name that starts with the currently expected type */ + if (startswith(e, "devtmpfs")) + return true; + } + + return false; +} + +int mount_fd(const char *source, + int target_fd, + const char *filesystemtype, + unsigned long mountflags, + const void *data) { + + if (mount(source, FORMAT_PROC_FD_PATH(target_fd), filesystemtype, mountflags, data) < 0) { + if (errno != ENOENT) + return -errno; + + /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't + * mounted. Check for the latter to generate better error messages. */ + if (proc_mounted() == 0) + return -ENOSYS; + + return -ENOENT; + } + + return 0; +} + +int mount_nofollow( + const char *source, + const char *target, + const char *filesystemtype, + unsigned long mountflags, + const void *data) { + + _cleanup_close_ int fd = -EBADF; + + /* In almost all cases we want to manipulate the mount table without following symlinks, hence + * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is + * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early + * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the + * fs to mount) we can only use traditional mount() directly. + * + * Note that this disables following only for the final component of the target, i.e symlinks within + * the path of the target are honoured, as are symlinks in the source path everywhere. 
*/ + + fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return mount_fd(source, fd, filesystemtype, mountflags, data); +} + +const char *mount_propagation_flag_to_string(unsigned long flags) { + + switch (flags & (MS_SHARED|MS_SLAVE|MS_PRIVATE)) { + case 0: + return ""; + case MS_SHARED: + return "shared"; + case MS_SLAVE: + return "slave"; + case MS_PRIVATE: + return "private"; + } + + return NULL; +} + +int mount_propagation_flag_from_string(const char *name, unsigned long *ret) { + + if (isempty(name)) + *ret = 0; + else if (streq(name, "shared")) + *ret = MS_SHARED; + else if (streq(name, "slave")) + *ret = MS_SLAVE; + else if (streq(name, "private")) + *ret = MS_PRIVATE; + else + return -EINVAL; + return 0; +} + +bool mount_propagation_flag_is_valid(unsigned long flag) { + return IN_SET(flag, 0, MS_SHARED, MS_PRIVATE, MS_SLAVE); +} + +bool mount_new_api_supported(void) { + static int cache = -1; + int r; + + if (cache >= 0) + return cache; + + /* This is the newer API among the ones we use, so use it as boundary */ + r = RET_NERRNO(mount_setattr(-EBADF, NULL, 0, NULL, 0)); + if (r == 0 || ERRNO_IS_NOT_SUPPORTED(r)) /* This should return an error if it is working properly */ + return (cache = false); + + return (cache = true); +} + +unsigned long ms_nosymfollow_supported(void) { + _cleanup_close_ int fsfd = -EBADF, mntfd = -EBADF; + static int cache = -1; + + /* Returns MS_NOSYMFOLLOW if it is supported, zero otherwise. */ + + if (cache >= 0) + return cache ? MS_NOSYMFOLLOW : 0; + + if (!mount_new_api_supported()) + goto not_supported; + + /* Checks if MS_NOSYMFOLLOW is supported (which was added in 5.10). We use the new mount API's + * mount_setattr() call for that, which was added in 5.12, which is close enough. 
*/ + + fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fsfd < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + goto not_supported; + + log_debug_errno(errno, "Failed to open superblock context for tmpfs: %m"); + return 0; + } + + if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + goto not_supported; + + log_debug_errno(errno, "Failed to create tmpfs superblock: %m"); + return 0; + } + + mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + if (mntfd < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + goto not_supported; + + log_debug_errno(errno, "Failed to turn superblock fd into mount fd: %m"); + return 0; + } + + if (mount_setattr(mntfd, "", AT_EMPTY_PATH|AT_RECURSIVE, + &(struct mount_attr) { + .attr_set = MOUNT_ATTR_NOSYMFOLLOW, + }, sizeof(struct mount_attr)) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + goto not_supported; + + log_debug_errno(errno, "Failed to set MOUNT_ATTR_NOSYMFOLLOW mount attribute: %m"); + return 0; + } + + cache = true; + return MS_NOSYMFOLLOW; + +not_supported: + cache = false; + return 0; +} + +int mount_option_supported(const char *fstype, const char *key, const char *value) { + _cleanup_close_ int fd = -EBADF; + int r; + + /* Checks if the specified file system supports a mount option. Returns > 0 if it supports it, == 0 if + * it does not. Returns -EAGAIN if we can't determine it. And any other error otherwise. */ + + assert(fstype); + assert(key); + + fd = fsopen(fstype, FSOPEN_CLOEXEC); + if (fd < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + return -EAGAIN; /* new mount API not available → don't know */ + + return log_debug_errno(errno, "Failed to open superblock context for '%s': %m", fstype); + } + + /* Various file systems have not been converted to the new mount API yet. For such file systems + * fsconfig() with FSCONFIG_SET_STRING/FSCONFIG_SET_FLAG never fails. Which sucks, because we want to + * use it for testing support, after all. 
Let's hence do a check if the file system got converted yet + * first. */ + if (fsconfig(fd, FSCONFIG_SET_FD, "adefinitelynotexistingmountoption", NULL, fd) < 0) { + /* If FSCONFIG_SET_FD is not supported for the fs, then the file system was not converted to + * the new mount API yet. If it returns EINVAL the mount option doesn't exist, but the fstype + * is converted. */ + if (errno == EOPNOTSUPP) + return -EAGAIN; /* FSCONFIG_SET_FD not supported on the fs, hence not converted to new mount API → don't know */ + if (errno != EINVAL) + return log_debug_errno(errno, "Failed to check if file system has been converted to new mount API: %m"); + + /* So FSCONFIG_SET_FD worked, but the option didn't exist (we got EINVAL), this means the fs + * is converted. Let's now ask the actual question we wonder about. */ + } else + return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), "FSCONFIG_SET_FD worked unexpectedly for '%s', whoa!", fstype); + + if (value) + r = fsconfig(fd, FSCONFIG_SET_STRING, key, value, 0); + else + r = fsconfig(fd, FSCONFIG_SET_FLAG, key, NULL, 0); + if (r < 0) { + if (errno == EINVAL) + return false; /* EINVAL means option not supported. */ + + return log_debug_errno(errno, "Failed to set '%s%s%s' on '%s' superblock context: %m", + key, value ? "=" : "", strempty(value), fstype); + } + + return true; /* works! */ +} diff --git a/src/basic/mountpoint-util.h b/src/basic/mountpoint-util.h new file mode 100644 index 0000000..499403a --- /dev/null +++ b/src/basic/mountpoint-util.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +/* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't + * consume any space and udev isn't supposed to create regular file either. There's no limit on the + * max number of inodes since such limit is hard to guess especially on large storage array + * systems. 
*/ +#define TMPFS_LIMITS_DEV ",size=4m" + +/* The limit used for /dev in private namespaces. 4MB for contents of regular files. The number of + * inodes should be relatively low in private namespaces but for now use a 64k limit. */ +#define TMPFS_LIMITS_PRIVATE_DEV ",size=4m,nr_inodes=64k" + +/* Very little, if any use expected */ +#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k" +#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST +#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST + +/* On an extremely small device with only 256MB of RAM, 20% of RAM should be enough for the re-execution of + * PID1 because 16MB of free space is required. */ +#define TMPFS_LIMITS_RUN ",size=20%,nr_inodes=800k" + +/* The limit used for various nested tmpfs mounts, in particular for guests started by systemd-nspawn. + * 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25% + * translates to 1M inodes. + * (On the host, /tmp is configured through a .mount unit file.) 
*/ +#define NESTED_TMPFS_LIMITS ",size=10%,nr_inodes=400k" + +/* More space for volatile root and /var */ +#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m" +#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR +#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR + +int name_to_handle_at_loop(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags); + +int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret); +int path_get_mnt_id_at(int dir_fd, const char *path, int *ret); +static inline int path_get_mnt_id(const char *path, int *ret) { + return path_get_mnt_id_at(AT_FDCWD, path, ret); +} + +int fd_is_mount_point(int fd, const char *filename, int flags); +int path_is_mount_point(const char *path, const char *root, int flags); + +bool fstype_is_network(const char *fstype); +bool fstype_needs_quota(const char *fstype); +bool fstype_is_api_vfs(const char *fstype); +bool fstype_is_blockdev_backed(const char *fstype); +bool fstype_is_ro(const char *fsype); +bool fstype_can_discard(const char *fstype); +bool fstype_can_uid_gid(const char *fstype); +bool fstype_can_norecovery(const char *fstype); +bool fstype_can_umask(const char *fstype); + +int dev_is_devtmpfs(void); + +int mount_fd(const char *source, int target_fd, const char *filesystemtype, unsigned long mountflags, const void *data); +int mount_nofollow(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data); + +const char *mount_propagation_flag_to_string(unsigned long flags); +int mount_propagation_flag_from_string(const char *name, unsigned long *ret); +bool mount_propagation_flag_is_valid(unsigned long flag); + +bool mount_new_api_supported(void); +unsigned long ms_nosymfollow_supported(void); + +int mount_option_supported(const char *fstype, const char *key, const char *value); diff --git a/src/basic/namespace-util.c b/src/basic/namespace-util.c new file mode 100644 index 0000000..2101f61 --- /dev/null +++ 
b/src/basic/namespace-util.c @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "missing_fs.h" +#include "missing_magic.h" +#include "missing_sched.h" +#include "namespace-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "user-util.h" + +const struct namespace_info namespace_info[] = { + [NAMESPACE_CGROUP] = { "cgroup", "ns/cgroup", CLONE_NEWCGROUP, }, + [NAMESPACE_IPC] = { "ipc", "ns/ipc", CLONE_NEWIPC, }, + [NAMESPACE_NET] = { "net", "ns/net", CLONE_NEWNET, }, + /* So, the mount namespace flag is called CLONE_NEWNS for historical + * reasons. Let's expose it here under a more explanatory name: "mnt". + * This is in-line with how the kernel exposes namespaces in /proc/$PID/ns. */ + [NAMESPACE_MOUNT] = { "mnt", "ns/mnt", CLONE_NEWNS, }, + [NAMESPACE_PID] = { "pid", "ns/pid", CLONE_NEWPID, }, + [NAMESPACE_USER] = { "user", "ns/user", CLONE_NEWUSER, }, + [NAMESPACE_UTS] = { "uts", "ns/uts", CLONE_NEWUTS, }, + [NAMESPACE_TIME] = { "time", "ns/time", CLONE_NEWTIME, }, + { /* Allow callers to iterate over the array without using _NAMESPACE_TYPE_MAX. 
*/ }, +}; + +#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path) + +int namespace_open(pid_t pid, int *pidns_fd, int *mntns_fd, int *netns_fd, int *userns_fd, int *root_fd) { + _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF; + int rfd = -EBADF; + + assert(pid >= 0); + + if (mntns_fd) { + const char *mntns; + + mntns = pid_namespace_path(pid, NAMESPACE_MOUNT); + mntnsfd = open(mntns, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (mntnsfd < 0) + return -errno; + } + + if (pidns_fd) { + const char *pidns; + + pidns = pid_namespace_path(pid, NAMESPACE_PID); + pidnsfd = open(pidns, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (pidnsfd < 0) + return -errno; + } + + if (netns_fd) { + const char *netns; + + netns = pid_namespace_path(pid, NAMESPACE_NET); + netnsfd = open(netns, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (netnsfd < 0) + return -errno; + } + + if (userns_fd) { + const char *userns; + + userns = pid_namespace_path(pid, NAMESPACE_USER); + usernsfd = open(userns, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (usernsfd < 0 && errno != ENOENT) + return -errno; + } + + if (root_fd) { + const char *root; + + root = procfs_file_alloca(pid, "root"); + rfd = open(root, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + if (rfd < 0) + return -errno; + } + + if (pidns_fd) + *pidns_fd = TAKE_FD(pidnsfd); + + if (mntns_fd) + *mntns_fd = TAKE_FD(mntnsfd); + + if (netns_fd) + *netns_fd = TAKE_FD(netnsfd); + + if (userns_fd) + *userns_fd = TAKE_FD(usernsfd); + + if (root_fd) + *root_fd = TAKE_FD(rfd); + + return 0; +} + +int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd) { + int r; + + if (userns_fd >= 0) { + /* Can't setns to your own userns, since then you could escalate from non-root to root in + * your own namespace, so check if namespaces are equal before attempting to enter. 
*/ + + r = inode_same_at(userns_fd, "", AT_FDCWD, "/proc/self/ns/user", AT_EMPTY_PATH); + if (r < 0) + return r; + if (r) + userns_fd = -EBADF; + } + + if (pidns_fd >= 0) + if (setns(pidns_fd, CLONE_NEWPID) < 0) + return -errno; + + if (mntns_fd >= 0) + if (setns(mntns_fd, CLONE_NEWNS) < 0) + return -errno; + + if (netns_fd >= 0) + if (setns(netns_fd, CLONE_NEWNET) < 0) + return -errno; + + if (userns_fd >= 0) + if (setns(userns_fd, CLONE_NEWUSER) < 0) + return -errno; + + if (root_fd >= 0) { + if (fchdir(root_fd) < 0) + return -errno; + + if (chroot(".") < 0) + return -errno; + } + + return reset_uid_gid(); +} + +int fd_is_ns(int fd, unsigned long nsflag) { + struct statfs s; + int r; + + /* Checks whether the specified file descriptor refers to a namespace created by specifying nsflag in clone(). + * On old kernels there's no nice way to detect that, hence on those we'll return a recognizable error (EUCLEAN), + * so that callers can handle this somewhat nicely. + * + * This function returns > 0 if the fd definitely refers to a network namespace, 0 if it definitely does not + * refer to a network namespace, -EUCLEAN if we can't determine, and other negative error codes on error. */ + + if (fstatfs(fd, &s) < 0) + return -errno; + + if (!is_fs_type(&s, NSFS_MAGIC)) { + /* On really old kernels, there was no "nsfs", and network namespace sockets belonged to procfs + * instead. Handle that in a somewhat smart way. */ + + if (is_fs_type(&s, PROC_SUPER_MAGIC)) { + struct statfs t; + + /* OK, so it is procfs. Let's see if our own network namespace is procfs, too. If so, then the + * passed fd might refer to a network namespace, but we can't know for sure. In that case, + * return a recognizable error. */ + + if (statfs("/proc/self/ns/net", &t) < 0) + return -errno; + + if (s.f_type == t.f_type) + return -EUCLEAN; /* It's possible, we simply don't know */ + } + + return 0; /* No! 
*/ + } + + r = ioctl(fd, NS_GET_NSTYPE); + if (r < 0) { + if (errno == ENOTTY) /* Old kernels didn't know this ioctl, let's also return a recognizable error in that case */ + return -EUCLEAN; + + return -errno; + } + + return (unsigned long) r == nsflag; +} + +int detach_mount_namespace(void) { + /* Detaches the mount namespace, disabling propagation from our namespace to the host. Sets + * propagation first to MS_SLAVE for all mounts (disabling propagation), and then back to MS_SHARED + * (so that we create a new peer group). */ + + if (unshare(CLONE_NEWNS) < 0) + return log_debug_errno(errno, "Failed to acquire mount namespace: %m"); + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to set mount propagation to MS_SLAVE for all mounts: %m"); + + if (mount(NULL, "/", NULL, MS_SHARED | MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to set mount propagation back to MS_SHARED for all mounts: %m"); + + return 0; +} + +int userns_acquire(const char *uid_map, const char *gid_map) { + char path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1]; + _cleanup_(sigkill_waitp) pid_t pid = 0; + _cleanup_close_ int userns_fd = -EBADF; + int r; + + assert(uid_map); + assert(gid_map); + + /* Forks off a process in a new userns, configures the specified uidmap/gidmap, acquires an fd to it, + * and then kills the process again. This way we have a userns fd that is not bound to any + * process. We can use that for file system mounts and similar. */ + + r = safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_USERNS, &pid); + if (r < 0) + return r; + if (r == 0) + /* Child. We do nothing here, just freeze until somebody kills us. 
*/ + freeze(); + + xsprintf(path, "/proc/" PID_FMT "/uid_map", pid); + r = write_string_file(path, uid_map, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write UID map: %m"); + + xsprintf(path, "/proc/" PID_FMT "/gid_map", pid); + r = write_string_file(path, gid_map, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write GID map: %m"); + + r = namespace_open(pid, NULL, NULL, NULL, &userns_fd, NULL); + if (r < 0) + return log_error_errno(r, "Failed to open userns fd: %m"); + + return TAKE_FD(userns_fd); + +} + +int in_same_namespace(pid_t pid1, pid_t pid2, NamespaceType type) { + const char *ns_path; + struct stat ns_st1, ns_st2; + + if (pid1 == 0) + pid1 = getpid_cached(); + + if (pid2 == 0) + pid2 = getpid_cached(); + + if (pid1 == pid2) + return 1; + + ns_path = pid_namespace_path(pid1, type); + if (stat(ns_path, &ns_st1) < 0) + return -errno; + + ns_path = pid_namespace_path(pid2, type); + if (stat(ns_path, &ns_st2) < 0) + return -errno; + + return stat_inode_same(&ns_st1, &ns_st2); +} diff --git a/src/basic/namespace-util.h b/src/basic/namespace-util.h new file mode 100644 index 0000000..be5b228 --- /dev/null +++ b/src/basic/namespace-util.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef enum NamespaceType { + NAMESPACE_CGROUP, + NAMESPACE_IPC, + NAMESPACE_NET, + NAMESPACE_MOUNT, + NAMESPACE_PID, + NAMESPACE_USER, + NAMESPACE_UTS, + NAMESPACE_TIME, + _NAMESPACE_TYPE_MAX, + _NAMESPACE_TYPE_INVALID = -EINVAL, +} NamespaceType; + +extern const struct namespace_info { + const char *proc_name; + const char *proc_path; + unsigned int clone_flag; +} namespace_info[_NAMESPACE_TYPE_MAX + 1]; + +int namespace_open(pid_t pid, int *pidns_fd, int *mntns_fd, int *netns_fd, int *userns_fd, int *root_fd); +int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd); + +int fd_is_ns(int fd, unsigned long nsflag); + 
static inline bool userns_shift_range_valid(uid_t shift, uid_t range) {
        /* Validates a user namespace UID range: it must cover at least one UID, and shift + range must
         * not exceed the largest value uid_t can hold. */

        assert_cc((uid_t) -1 > 0); /* the overflow check below relies on uid_t being unsigned */

        return range > 0 && shift <= (uid_t) -1 - range;
}
addr, socklen_t len, \ + int af, \ + struct hostent *host, \ + char *buffer, size_t buflen, \ + int *errnop, int *h_errnop, \ + int32_t *ttlp) _public_; \ +enum nss_status _nss_##module##_gethostbyaddr_r( \ + const void* addr, socklen_t len, \ + int af, \ + struct hostent *host, \ + char *buffer, size_t buflen, \ + int *errnop, int *h_errnop) _public_ + +#define NSS_GETHOSTBYNAME_FALLBACKS(module) \ +enum nss_status _nss_##module##_gethostbyname2_r( \ + const char *name, \ + int af, \ + struct hostent *host, \ + char *buffer, size_t buflen, \ + int *errnop, int *h_errnop) { \ + return _nss_##module##_gethostbyname3_r( \ + name, \ + af, \ + host, \ + buffer, buflen, \ + errnop, h_errnop, \ + NULL, \ + NULL); \ +} \ +enum nss_status _nss_##module##_gethostbyname_r( \ + const char *name, \ + struct hostent *host, \ + char *buffer, size_t buflen, \ + int *errnop, int *h_errnop) { \ + enum nss_status ret = NSS_STATUS_NOTFOUND; \ + \ + if (_res.options & DEPRECATED_RES_USE_INET6) \ + ret = _nss_##module##_gethostbyname3_r( \ + name, \ + AF_INET6, \ + host, \ + buffer, buflen, \ + errnop, h_errnop, \ + NULL, \ + NULL); \ + if (ret == NSS_STATUS_NOTFOUND) \ + ret = _nss_##module##_gethostbyname3_r( \ + name, \ + AF_INET, \ + host, \ + buffer, buflen, \ + errnop, h_errnop, \ + NULL, \ + NULL); \ + return ret; \ +} + +#define NSS_GETHOSTBYADDR_FALLBACKS(module) \ +enum nss_status _nss_##module##_gethostbyaddr_r( \ + const void* addr, socklen_t len, \ + int af, \ + struct hostent *host, \ + char *buffer, size_t buflen, \ + int *errnop, int *h_errnop) { \ + return _nss_##module##_gethostbyaddr2_r( \ + addr, len, \ + af, \ + host, \ + buffer, buflen, \ + errnop, h_errnop, \ + NULL); \ +} + +#define NSS_GETPW_PROTOTYPES(module) \ +enum nss_status _nss_##module##_getpwnam_r( \ + const char *name, \ + struct passwd *pwd, \ + char *buffer, size_t buflen, \ + int *errnop) _public_; \ +enum nss_status _nss_##module##_getpwuid_r( \ + uid_t uid, \ + struct passwd *pwd, \ + char 
*buffer, size_t buflen, \ + int *errnop) _public_ + +#define NSS_GETSP_PROTOTYPES(module) \ +enum nss_status _nss_##module##_getspnam_r( \ + const char *name, \ + struct spwd *spwd, \ + char *buffer, size_t buflen, \ + int *errnop) _public_ + +#define NSS_GETSG_PROTOTYPES(module) \ +enum nss_status _nss_##module##_getsgnam_r( \ + const char *name, \ + struct sgrp *sgrp, \ + char *buffer, size_t buflen, \ + int *errnop) _public_ + +#define NSS_GETGR_PROTOTYPES(module) \ +enum nss_status _nss_##module##_getgrnam_r( \ + const char *name, \ + struct group *gr, \ + char *buffer, size_t buflen, \ + int *errnop) _public_; \ +enum nss_status _nss_##module##_getgrgid_r( \ + gid_t gid, \ + struct group *gr, \ + char *buffer, size_t buflen, \ + int *errnop) _public_ + +#define NSS_PWENT_PROTOTYPES(module) \ +enum nss_status _nss_##module##_endpwent( \ + void) _public_; \ +enum nss_status _nss_##module##_setpwent( \ + int stayopen) _public_; \ +enum nss_status _nss_##module##_getpwent_r( \ + struct passwd *result, \ + char *buffer, \ + size_t buflen, \ + int *errnop) _public_; + +#define NSS_SPENT_PROTOTYPES(module) \ +enum nss_status _nss_##module##_endspent( \ + void) _public_; \ +enum nss_status _nss_##module##_setspent( \ + int stayopen) _public_; \ +enum nss_status _nss_##module##_getspent_r( \ + struct spwd *spwd, \ + char *buffer, \ + size_t buflen, \ + int *errnop) _public_; + +#define NSS_GRENT_PROTOTYPES(module) \ +enum nss_status _nss_##module##_endgrent( \ + void) _public_; \ +enum nss_status _nss_##module##_setgrent( \ + int stayopen) _public_; \ +enum nss_status _nss_##module##_getgrent_r( \ + struct group *result, \ + char *buffer, \ + size_t buflen, \ + int *errnop) _public_; + +#define NSS_SGENT_PROTOTYPES(module) \ +enum nss_status _nss_##module##_endsgent( \ + void) _public_; \ +enum nss_status _nss_##module##_setsgent( \ + int stayopen) _public_; \ +enum nss_status _nss_##module##_getsgent_r( \ + struct sgrp *sgrp, \ + char *buffer, \ + size_t buflen, \ + 
int *errnop) _public_; + +#define NSS_INITGROUPS_PROTOTYPE(module) \ +enum nss_status _nss_##module##_initgroups_dyn( \ + const char *user, \ + gid_t group, \ + long int *start, \ + long int *size, \ + gid_t **groupsp, \ + long int limit, \ + int *errnop) _public_; + +typedef enum nss_status (*_nss_gethostbyname4_r_t)( + const char *name, + struct gaih_addrtuple **pat, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp); + +typedef enum nss_status (*_nss_gethostbyname3_r_t)( + const char *name, + int af, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp, + char **canonp); + +typedef enum nss_status (*_nss_gethostbyname2_r_t)( + const char *name, + int af, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop); + +typedef enum nss_status (*_nss_gethostbyname_r_t)( + const char *name, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop); + +typedef enum nss_status (*_nss_gethostbyaddr2_r_t)( + const void* addr, socklen_t len, + int af, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp); +typedef enum nss_status (*_nss_gethostbyaddr_r_t)( + const void* addr, socklen_t len, + int af, + struct hostent *host, + char *buffer, size_t buflen, + int *errnop, int *h_errnop); + +typedef enum nss_status (*_nss_getpwnam_r_t)( + const char *name, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop); +typedef enum nss_status (*_nss_getpwuid_r_t)( + uid_t uid, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop); + +typedef enum nss_status (*_nss_getgrnam_r_t)( + const char *name, + struct group *gr, + char *buffer, size_t buflen, + int *errnop); +typedef enum nss_status (*_nss_getgrgid_r_t)( + gid_t gid, + struct group *gr, + char *buffer, size_t buflen, + int *errnop); diff --git a/src/basic/nulstr-util.c b/src/basic/nulstr-util.c new file mode 100644 
index 0000000..06fa219 --- /dev/null +++ b/src/basic/nulstr-util.c @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "nulstr-util.h" +#include "string-util.h" +#include "strv.h" + +char** strv_parse_nulstr_full(const char *s, size_t l, bool drop_trailing_nuls) { + /* l is the length of the input data, which will be split at NULs into elements of the resulting + * strv. Hence, the number of items in the resulting strv will be equal to one plus the number of NUL + * bytes in the l bytes starting at s, unless s[l-1] is NUL, in which case the final empty string is + * not stored in the resulting strv, and length is equal to the number of NUL bytes. + * + * Note that contrary to a normal nulstr which cannot contain empty strings, because the input data + * is terminated by any two consecutive NUL bytes, this parser accepts empty strings in s. */ + + _cleanup_strv_free_ char **v = NULL; + size_t c = 0, i = 0; + + assert(s || l <= 0); + + if (drop_trailing_nuls) + while (l > 0 && s[l-1] == '\0') + l--; + + if (l <= 0) + return new0(char*, 1); + + for (const char *p = s; p < s + l; p++) + if (*p == 0) + c++; + + if (s[l-1] != 0) + c++; + + v = new0(char*, c+1); + if (!v) + return NULL; + + for (const char *p = s; p < s + l; ) { + const char *e; + + e = memchr(p, 0, s + l - p); + + v[i] = memdup_suffix0(p, e ? e - p : s + l - p); + if (!v[i]) + return NULL; + + i++; + + if (!e) + break; + + p = e + 1; + } + + assert(i == c); + + return TAKE_PTR(v); +} + +char** strv_split_nulstr(const char *s) { + _cleanup_strv_free_ char **l = NULL; + + /* This parses a nulstr, without specification of size, and stops at an empty string. This cannot + * parse nulstrs with embedded empty strings hence, as an empty string is an end marker. Use + * strv_parse_nulstr() above to parse a nulstr with embedded empty strings (which however requires a + * size to be specified) */ + + NULSTR_FOREACH(i, s) + if (strv_extend(&l, i) < 0) + return NULL; + + return l ? 
TAKE_PTR(l) : strv_new(NULL); +} + +int strv_make_nulstr(char * const *l, char **ret, size_t *ret_size) { + /* Builds a nulstr and returns it together with the size. An extra NUL byte will be appended (⚠️ but + * not included in the size! ⚠️). This is done so that the nulstr can be used both in + * strv_parse_nulstr() and in NULSTR_FOREACH()/strv_split_nulstr() contexts, i.e. with and without a + * size parameter. In the former case we can include empty strings, in the latter case we cannot (as + * that is the end marker). + * + * When NULSTR_FOREACH()/strv_split_nulstr() is used it is often assumed that the nulstr ends in two + * NUL bytes (which it will, if not empty). To ensure that this assumption *always* holds, we'll + * return a buffer with two NUL bytes in that case, but return a size of zero. */ + + _cleanup_free_ char *m = NULL; + size_t n = 0; + + assert(ret); + + STRV_FOREACH(i, l) { + size_t z; + + z = strlen(*i); + + if (!GREEDY_REALLOC(m, n + z + 2)) + return -ENOMEM; + + memcpy(m + n, *i, z + 1); + n += z + 1; + } + + if (!m) { + /* return a buffer with an extra NUL, so that the assumption that we always have two trailing NULs holds */ + m = new0(char, 2); + if (!m) + return -ENOMEM; + + n = 0; + } else + /* Make sure there is a second extra NUL at the end of resulting nulstr (not counted in return size) */ + m[n] = '\0'; + + *ret = TAKE_PTR(m); + if (ret_size) + *ret_size = n; + + return 0; +} + +int set_make_nulstr(Set *s, char **ret, size_t *ret_size) { + /* Use _cleanup_free_ instead of _cleanup_strv_free_ because we need to clean the strv only, not + * the strings owned by the set. 
*/ + _cleanup_free_ char **strv = NULL; + + assert(ret); + + strv = set_get_strv(s); + if (!strv) + return -ENOMEM; + + return strv_make_nulstr(strv, ret, ret_size); +} + +const char* nulstr_get(const char *nulstr, const char *needle) { + if (!nulstr) + return NULL; + + NULSTR_FOREACH(i, nulstr) + if (streq(i, needle)) + return i; + + return NULL; +} diff --git a/src/basic/nulstr-util.h b/src/basic/nulstr-util.h new file mode 100644 index 0000000..d7bc5fd --- /dev/null +++ b/src/basic/nulstr-util.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "set.h" + +#define NULSTR_FOREACH(i, l) \ + for (typeof(*(l)) *(i) = (l); (i) && *(i); (i) = strchr((i), 0)+1) + +#define NULSTR_FOREACH_PAIR(i, j, l) \ + for (typeof(*(l)) *(i) = (l), *(j) = strchr((i), 0)+1; (i) && *(i); (i) = strchr((j), 0)+1, (j) = *(i) ? strchr((i), 0)+1 : (i)) + +const char* nulstr_get(const char *nulstr, const char *needle); + +static inline bool nulstr_contains(const char *nulstr, const char *needle) { + return nulstr_get(nulstr, needle); +} + +char** strv_parse_nulstr_full(const char *s, size_t l, bool drop_trailing_nuls); +static inline char** strv_parse_nulstr(const char *s, size_t l) { + return strv_parse_nulstr_full(s, l, false); +} +char** strv_split_nulstr(const char *s); +int strv_make_nulstr(char * const *l, char **p, size_t *n); +int set_make_nulstr(Set *s, char **ret, size_t *ret_size); + +static inline int strv_from_nulstr(char ***ret, const char *nulstr) { + char **t; + + assert(ret); + + t = strv_split_nulstr(nulstr); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} diff --git a/src/basic/ordered-set.c b/src/basic/ordered-set.c new file mode 100644 index 0000000..b4c2588 --- /dev/null +++ b/src/basic/ordered-set.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fileio.h" +#include "ordered-set.h" +#include "strv.h" + +int 
_ordered_set_ensure_allocated(OrderedSet **s, const struct hash_ops *ops HASHMAP_DEBUG_PARAMS) { + if (*s) + return 0; + + *s = _ordered_set_new(ops HASHMAP_DEBUG_PASS_ARGS); + if (!*s) + return -ENOMEM; + + return 0; +} + +int _ordered_set_ensure_put(OrderedSet **s, const struct hash_ops *ops, void *p HASHMAP_DEBUG_PARAMS) { + int r; + + r = _ordered_set_ensure_allocated(s, ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + return ordered_set_put(*s, p); +} + +int ordered_set_consume(OrderedSet *s, void *p) { + int r; + + r = ordered_set_put(s, p); + if (r <= 0) + free(p); + + return r; +} + +int _ordered_set_put_strdup(OrderedSet **s, const char *p HASHMAP_DEBUG_PARAMS) { + char *c; + int r; + + assert(s); + assert(p); + + r = _ordered_set_ensure_allocated(s, &string_hash_ops_free HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + if (ordered_set_contains(*s, p)) + return 0; + + c = strdup(p); + if (!c) + return -ENOMEM; + + return ordered_set_consume(*s, c); +} + +int _ordered_set_put_strdupv(OrderedSet **s, char **l HASHMAP_DEBUG_PARAMS) { + int n = 0, r; + + STRV_FOREACH(i, l) { + r = _ordered_set_put_strdup(s, *i HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + n += r; + } + + return n; +} + +int ordered_set_put_string_set(OrderedSet **s, OrderedSet *l) { + int n = 0, r; + char *p; + + /* Like ordered_set_put_strdupv(), but for an OrderedSet of strings */ + + ORDERED_SET_FOREACH(p, l) { + r = ordered_set_put_strdup(s, p); + if (r < 0) + return r; + + n += r; + } + + return n; +} + +void ordered_set_print(FILE *f, const char *field, OrderedSet *s) { + bool space = false; + char *p; + + if (ordered_set_isempty(s)) + return; + + fputs(field, f); + + ORDERED_SET_FOREACH(p, s) + fputs_with_space(f, p, NULL, &space); + + fputc('\n', f); +} diff --git a/src/basic/ordered-set.h b/src/basic/ordered-set.h new file mode 100644 index 0000000..e73da20 --- /dev/null +++ b/src/basic/ordered-set.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later 
*/ +#pragma once + +#include + +#include "hashmap.h" + +typedef struct OrderedSet OrderedSet; + +static inline OrderedSet* _ordered_set_new(const struct hash_ops *ops HASHMAP_DEBUG_PARAMS) { + return (OrderedSet*) _ordered_hashmap_new(ops HASHMAP_DEBUG_PASS_ARGS); +} +#define ordered_set_new(ops) _ordered_set_new(ops HASHMAP_DEBUG_SRC_ARGS) + +int _ordered_set_ensure_allocated(OrderedSet **s, const struct hash_ops *ops HASHMAP_DEBUG_PARAMS); +#define ordered_set_ensure_allocated(s, ops) _ordered_set_ensure_allocated(s, ops HASHMAP_DEBUG_SRC_ARGS) + +int _ordered_set_ensure_put(OrderedSet **s, const struct hash_ops *ops, void *p HASHMAP_DEBUG_PARAMS); +#define ordered_set_ensure_put(s, hash_ops, key) _ordered_set_ensure_put(s, hash_ops, key HASHMAP_DEBUG_SRC_ARGS) + +static inline void ordered_set_clear(OrderedSet *s) { + return ordered_hashmap_clear((OrderedHashmap*) s); +} + +static inline void ordered_set_clear_free(OrderedSet *s) { + return ordered_hashmap_clear_free((OrderedHashmap*) s); +} + +static inline OrderedSet* ordered_set_free(OrderedSet *s) { + return (OrderedSet*) ordered_hashmap_free((OrderedHashmap*) s); +} + +static inline OrderedSet* ordered_set_free_free(OrderedSet *s) { + return (OrderedSet*) ordered_hashmap_free_free((OrderedHashmap*) s); +} + +static inline int ordered_set_contains(OrderedSet *s, const void *p) { + return ordered_hashmap_contains((OrderedHashmap*) s, p); +} + +static inline int ordered_set_put(OrderedSet *s, void *p) { + return ordered_hashmap_put((OrderedHashmap*) s, p, p); +} + +static inline void *ordered_set_get(OrderedSet *s, const void *p) { + return ordered_hashmap_get((OrderedHashmap*) s, p); +} + +static inline unsigned ordered_set_size(OrderedSet *s) { + return ordered_hashmap_size((OrderedHashmap*) s); +} + +static inline bool ordered_set_isempty(OrderedSet *s) { + return ordered_hashmap_isempty((OrderedHashmap*) s); +} + +static inline bool ordered_set_iterate(OrderedSet *s, Iterator *i, void **value) { + return 
ordered_hashmap_iterate((OrderedHashmap*) s, i, value, NULL); +} + +static inline void* ordered_set_remove(OrderedSet *s, void *p) { + return ordered_hashmap_remove((OrderedHashmap*) s, p); +} + +static inline void* ordered_set_first(OrderedSet *s) { + return ordered_hashmap_first((OrderedHashmap*) s); +} + +static inline void* ordered_set_steal_first(OrderedSet *s) { + return ordered_hashmap_steal_first((OrderedHashmap*) s); +} + +static inline char** ordered_set_get_strv(OrderedSet *s) { + return _hashmap_get_strv(HASHMAP_BASE((OrderedHashmap*) s)); +} + +static inline int ordered_set_reserve(OrderedSet *s, unsigned entries_add) { + return ordered_hashmap_reserve((OrderedHashmap*) s, entries_add); +} + +int ordered_set_consume(OrderedSet *s, void *p); +int _ordered_set_put_strdup(OrderedSet **s, const char *p HASHMAP_DEBUG_PARAMS); +#define ordered_set_put_strdup(s, p) _ordered_set_put_strdup(s, p HASHMAP_DEBUG_SRC_ARGS) +int _ordered_set_put_strdupv(OrderedSet **s, char **l HASHMAP_DEBUG_PARAMS); +#define ordered_set_put_strdupv(s, l) _ordered_set_put_strdupv(s, l HASHMAP_DEBUG_SRC_ARGS) +int ordered_set_put_string_set(OrderedSet **s, OrderedSet *l); +void ordered_set_print(FILE *f, const char *field, OrderedSet *s); + +#define _ORDERED_SET_FOREACH(e, s, i) \ + for (Iterator i = ITERATOR_FIRST; ordered_set_iterate((s), &i, (void**)&(e)); ) +#define ORDERED_SET_FOREACH(e, s) \ + _ORDERED_SET_FOREACH(e, s, UNIQ_T(i, UNIQ)) + +#define ordered_set_clear_with_destructor(s, f) \ + ({ \ + OrderedSet *_s = (s); \ + void *_item; \ + while ((_item = ordered_set_steal_first(_s))) \ + f(_item); \ + _s; \ + }) +#define ordered_set_free_with_destructor(s, f) \ + ordered_set_free(ordered_set_clear_with_destructor(s, f)) + +DEFINE_TRIVIAL_CLEANUP_FUNC(OrderedSet*, ordered_set_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(OrderedSet*, ordered_set_free_free); + +#define _cleanup_ordered_set_free_ _cleanup_(ordered_set_freep) +#define _cleanup_ordered_set_free_free_ 
_cleanup_(ordered_set_free_freep) diff --git a/src/basic/origin-id.h b/src/basic/origin-id.h new file mode 100644 index 0000000..c55b0a3 --- /dev/null +++ b/src/basic/origin-id.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "random-util.h" + +/* This pattern needs to be repeated exactly in multiple modules, so macro it. + * To ensure an object is not passed into a different module (e.g.: when two shared objects statically + * linked to libsystemd get loaded in the same process, and the object created by one is passed to the + * other, see https://github.com/systemd/systemd/issues/27216), create a random static global ID + * (mixed with PID, so that we can also check for reuse after fork) that is stored in the object and + * checked by public API on use. */ +#define _DEFINE_ORIGIN_ID_HELPERS(type, name, scope) \ +static uint64_t origin_id; \ + \ +static void origin_id_initialize(void) { \ + origin_id = random_u64(); \ +} \ + \ +static uint64_t origin_id_query(void) { \ + static pthread_once_t once = PTHREAD_ONCE_INIT; \ + assert_se(pthread_once(&once, origin_id_initialize) == 0); \ + return origin_id ^ getpid_cached(); \ +} \ + \ +scope bool name##_origin_changed(type *p) { \ + assert(p); \ + return p->origin_id != origin_id_query(); \ +} + +#define DEFINE_ORIGIN_ID_HELPERS(type, name) \ + _DEFINE_ORIGIN_ID_HELPERS(type, name,); + +#define DEFINE_PRIVATE_ORIGIN_ID_HELPERS(type, name) \ + _DEFINE_ORIGIN_ID_HELPERS(type, name, static); diff --git a/src/basic/os-util.c b/src/basic/os-util.c new file mode 100644 index 0000000..dbd067f --- /dev/null +++ b/src/basic/os-util.c @@ -0,0 +1,442 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "chase.h" +#include "dirent-util.h" +#include "env-file.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "macro.h" +#include "os-util.h" +#include 
"parse-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" +#include "xattr-util.h" + +static const char* const image_class_table[_IMAGE_CLASS_MAX] = { + [IMAGE_MACHINE] = "machine", + [IMAGE_PORTABLE] = "portable", + [IMAGE_SYSEXT] = "sysext", + [IMAGE_CONFEXT] = "confext", +}; + +DEFINE_STRING_TABLE_LOOKUP(image_class, ImageClass); + +/* Helper struct for naming simplicity and reusability */ +static const struct { + const char *release_file_directory; + const char *release_file_path_prefix; +} image_class_release_info[_IMAGE_CLASS_MAX] = { + [IMAGE_SYSEXT] = { + .release_file_directory = "/usr/lib/extension-release.d/", + .release_file_path_prefix = "/usr/lib/extension-release.d/extension-release.", + }, + [IMAGE_CONFEXT] = { + .release_file_directory = "/etc/extension-release.d/", + .release_file_path_prefix = "/etc/extension-release.d/extension-release.", + } +}; + +bool image_name_is_valid(const char *s) { + if (!filename_is_valid(s)) + return false; + + if (string_has_cc(s, NULL)) + return false; + + if (!utf8_is_valid(s)) + return false; + + /* Temporary files for atomically creating new files */ + if (startswith(s, ".#")) + return false; + + return true; +} + +int path_is_extension_tree(ImageClass image_class, const char *path, const char *extension, bool relax_extension_release_check) { + int r; + + assert(path); + + /* Does the path exist at all? If not, generate an error immediately. This is useful so that a missing root dir + * always results in -ENOENT, and we can properly distinguish the case where the whole root doesn't exist from + * the case where just the os-release file is missing. 
*/ + if (laccess(path, F_OK) < 0) + return -errno; + + /* We use /usr/lib/extension-release.d/extension-release[.NAME] as flag for something being a system extension, + * /etc/extension-release.d/extension-release[.NAME] as flag for something being a system configuration, and finally, + * {/etc|/usr/lib}/os-release as a flag for something being an OS (when not an extension). */ + r = open_extension_release(path, image_class, extension, relax_extension_release_check, NULL, NULL); + if (r == -ENOENT) /* We got nothing */ + return 0; + if (r < 0) + return r; + + return 1; +} + +static int extension_release_strict_xattr_value(int extension_release_fd, const char *extension_release_dir_path, const char *filename) { + int r; + + assert(extension_release_fd >= 0); + assert(extension_release_dir_path); + assert(filename); + + /* No xattr or cannot parse it? Then skip this. */ + r = getxattr_at_bool(extension_release_fd, /* path= */ NULL, "user.extension-release.strict", /* flags= */ 0); + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) + return log_debug_errno(r, "%s/%s does not have user.extension-release.strict xattr, ignoring.", + extension_release_dir_path, filename); + if (r < 0) + return log_debug_errno(r, "%s/%s: Failed to read 'user.extension-release.strict' extended attribute from file, ignoring: %m", + extension_release_dir_path, filename); + + /* Explicitly set to request strict matching? Skip it. 
*/ + if (r > 0) { + log_debug("%s/%s: 'user.extension-release.strict' attribute is true, ignoring file.", + extension_release_dir_path, filename); + return true; + } + + log_debug("%s/%s: 'user.extension-release.strict' attribute is false%s", + extension_release_dir_path, filename, + special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + return false; +} + +int open_os_release_at(int rfd, char **ret_path, int *ret_fd) { + const char *e; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + + e = secure_getenv("SYSTEMD_OS_RELEASE"); + if (e) + return chaseat(rfd, e, CHASE_AT_RESOLVE_IN_ROOT, ret_path, ret_fd); + + FOREACH_STRING(path, "/etc/os-release", "/usr/lib/os-release") { + r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT, ret_path, ret_fd); + if (r != -ENOENT) + return r; + } + + return -ENOENT; +} + +int open_os_release(const char *root, char **ret_path, int *ret_fd) { + _cleanup_close_ int rfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + rfd = open(empty_to_root(root), O_CLOEXEC | O_DIRECTORY | O_PATH); + if (rfd < 0) + return -errno; + + r = open_os_release_at(rfd, ret_path ? &p : NULL, ret_fd ? 
&fd : NULL); + if (r < 0) + return r; + + if (ret_path) { + r = chaseat_prefix_root(p, root, ret_path); + if (r < 0) + return r; + } + + if (ret_fd) + *ret_fd = TAKE_FD(fd); + + return 0; +} + +int open_extension_release_at( + int rfd, + ImageClass image_class, + const char *extension, + bool relax_extension_release_check, + char **ret_path, + int *ret_fd) { + + _cleanup_free_ char *dir_path = NULL, *path_found = NULL; + _cleanup_close_ int fd_found = -EBADF; + _cleanup_closedir_ DIR *dir = NULL; + bool found = false; + const char *p; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(!extension || (image_class >= 0 && image_class < _IMAGE_CLASS_MAX)); + + if (!extension) + return open_os_release_at(rfd, ret_path, ret_fd); + + if (!IN_SET(image_class, IMAGE_SYSEXT, IMAGE_CONFEXT)) + return -EINVAL; + + if (!image_name_is_valid(extension)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "The extension name %s is invalid.", extension); + + p = strjoina(image_class_release_info[image_class].release_file_path_prefix, extension); + r = chaseat(rfd, p, CHASE_AT_RESOLVE_IN_ROOT, ret_path, ret_fd); + log_full_errno_zerook(LOG_DEBUG, MIN(r, 0), "Checking for %s: %m", p); + if (r != -ENOENT) + return r; + + /* Cannot find the expected extension-release file? The image filename might have been mangled on + * deployment, so fallback to checking for any file in the extension-release.d directory, and return + * the first one with a user.extension-release xattr instead. The user.extension-release.strict + * xattr is checked to ensure the author of the image considers it OK if names do not match. 
*/ + + p = image_class_release_info[image_class].release_file_directory; + r = chase_and_opendirat(rfd, p, CHASE_AT_RESOLVE_IN_ROOT, &dir_path, &dir); + if (r < 0) + return log_debug_errno(r, "Cannot open %s, ignoring: %m", p); + + FOREACH_DIRENT(de, dir, return -errno) { + _cleanup_close_ int fd = -EBADF; + const char *image_name; + + if (!IN_SET(de->d_type, DT_REG, DT_UNKNOWN)) + continue; + + image_name = startswith(de->d_name, "extension-release."); + if (!image_name) + continue; + + if (!image_name_is_valid(image_name)) { + log_debug("%s/%s is not a valid release file name, ignoring.", dir_path, de->d_name); + continue; + } + + /* We already chased the directory, and checked that this is a real file, so we shouldn't + * fail to open it. */ + fd = openat(dirfd(dir), de->d_name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (fd < 0) + return log_debug_errno(errno, "Failed to open release file %s/%s: %m", dir_path, de->d_name); + + /* Really ensure it is a regular file after we open it. */ + r = fd_verify_regular(fd); + if (r < 0) { + log_debug_errno(r, "%s/%s is not a regular file, ignoring: %m", dir_path, de->d_name); + continue; + } + + if (!relax_extension_release_check && + extension_release_strict_xattr_value(fd, dir_path, de->d_name) != 0) + continue; + + /* We already found what we were looking for, but there's another candidate? We treat this as + * an error, as we want to enforce that there are no ambiguities in case we are in the + * fallback path. 
*/ + if (found) + return -ENOTUNIQ; + + found = true; + + if (ret_fd) + fd_found = TAKE_FD(fd); + + if (ret_path) { + path_found = path_join(dir_path, de->d_name); + if (!path_found) + return -ENOMEM; + } + } + if (!found) + return -ENOENT; + + if (ret_fd) + *ret_fd = TAKE_FD(fd_found); + if (ret_path) + *ret_path = TAKE_PTR(path_found); + + return 0; +} + +int open_extension_release( + const char *root, + ImageClass image_class, + const char *extension, + bool relax_extension_release_check, + char **ret_path, + int *ret_fd) { + + _cleanup_close_ int rfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + rfd = open(empty_to_root(root), O_CLOEXEC | O_DIRECTORY | O_PATH); + if (rfd < 0) + return -errno; + + r = open_extension_release_at(rfd, image_class, extension, relax_extension_release_check, + ret_path ? &p : NULL, ret_fd ? &fd : NULL); + if (r < 0) + return r; + + if (ret_path) { + r = chaseat_prefix_root(p, root, ret_path); + if (r < 0) + return r; + } + + if (ret_fd) + *ret_fd = TAKE_FD(fd); + + return 0; +} + +static int parse_extension_release_atv( + int rfd, + ImageClass image_class, + const char *extension, + bool relax_extension_release_check, + va_list ap) { + + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + + r = open_extension_release_at(rfd, image_class, extension, relax_extension_release_check, &p, &fd); + if (r < 0) + return r; + + return parse_env_file_fdv(fd, p, ap); +} + +int parse_extension_release_at_sentinel( + int rfd, + ImageClass image_class, + bool relax_extension_release_check, + const char *extension, + ...) 
{ + + va_list ap; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + + va_start(ap, extension); + r = parse_extension_release_atv(rfd, image_class, extension, relax_extension_release_check, ap); + va_end(ap); + return r; +} + +int parse_extension_release_sentinel( + const char *root, + ImageClass image_class, + bool relax_extension_release_check, + const char *extension, + ...) { + + _cleanup_close_ int rfd = -EBADF; + va_list ap; + int r; + + rfd = open(empty_to_root(root), O_CLOEXEC | O_DIRECTORY | O_PATH); + if (rfd < 0) + return -errno; + + va_start(ap, extension); + r = parse_extension_release_atv(rfd, image_class, extension, relax_extension_release_check, ap); + va_end(ap); + return r; +} + +int load_os_release_pairs_with_prefix(const char *root, const char *prefix, char ***ret) { + _cleanup_strv_free_ char **os_release_pairs = NULL, **os_release_pairs_prefixed = NULL; + int r; + + r = load_os_release_pairs(root, &os_release_pairs); + if (r < 0) + return r; + + STRV_FOREACH_PAIR(p, q, os_release_pairs) { + char *line; + + /* We strictly return only the four main ID fields and ignore the rest */ + if (!STR_IN_SET(*p, "ID", "VERSION_ID", "BUILD_ID", "VARIANT_ID")) + continue; + + ascii_strlower(*p); + line = strjoin(prefix, *p, "=", *q); + if (!line) + return -ENOMEM; + r = strv_consume(&os_release_pairs_prefixed, line); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(os_release_pairs_prefixed); + + return 0; +} + +int load_extension_release_pairs(const char *root, ImageClass image_class, const char *extension, bool relax_extension_release_check, char ***ret) { + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + r = open_extension_release(root, image_class, extension, relax_extension_release_check, &p, &fd); + if (r < 0) + return r; + + return load_env_file_pairs_fd(fd, p, ret); +} + +int os_release_support_ended(const char *support_end, bool quiet, usec_t *ret_eol) { + _cleanup_free_ char *_support_end_alloc = NULL; + int r; + + 
if (!support_end) { + /* If the caller has the variable handy, they can pass it in. If not, we'll read it + * ourselves. */ + + r = parse_os_release(NULL, + "SUPPORT_END", &_support_end_alloc); + if (r < 0 && r != -ENOENT) + return log_full_errno(quiet ? LOG_DEBUG : LOG_WARNING, r, + "Failed to read os-release file, ignoring: %m"); + + support_end = _support_end_alloc; + } + + if (isempty(support_end)) /* An empty string is an explicit way to say "no EOL exists" */ + return false; /* no end date defined */ + + struct tm tm = {}; + const char *k = strptime(support_end, "%Y-%m-%d", &tm); + if (!k || *k) + return log_full_errno(quiet ? LOG_DEBUG : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL), + "Failed to parse SUPPORT_END= in os-release file, ignoring: %m"); + + time_t eol = timegm(&tm); + if (eol == (time_t) -1) + return log_full_errno(quiet ? LOG_DEBUG : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL), + "Failed to convert SUPPORT_END= in os-release file, ignoring: %m"); + + if (ret_eol) + *ret_eol = eol * USEC_PER_SEC; + + return DIV_ROUND_UP(now(CLOCK_REALTIME), USEC_PER_SEC) > (usec_t) eol; +} + +const char *os_release_pretty_name(const char *pretty_name, const char *name) { + /* Distills a "pretty" name to show from os-release data. First argument is supposed to be the + * PRETTY_NAME= field, the second one the NAME= field. This function is trivial, of course, and + * exists mostly to ensure we use the same logic wherever possible. 
*/ + + return empty_to_null(pretty_name) ?: + empty_to_null(name) ?: "Linux"; +} diff --git a/src/basic/os-util.h b/src/basic/os-util.h new file mode 100644 index 0000000..7cee3dd --- /dev/null +++ b/src/basic/os-util.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "time-util.h" + +typedef enum ImageClass { + IMAGE_MACHINE, + IMAGE_PORTABLE, + IMAGE_SYSEXT, + _IMAGE_CLASS_EXTENSION_FIRST = IMAGE_SYSEXT, /* First "extension" image type, so that we can easily generically iterate through them */ + IMAGE_CONFEXT, + _IMAGE_CLASS_EXTENSION_LAST = IMAGE_CONFEXT, /* Last "extension" image type */ + _IMAGE_CLASS_MAX, + _IMAGE_CLASS_INVALID = -EINVAL, +} ImageClass; + +const char* image_class_to_string(ImageClass cl) _const_; +ImageClass image_class_from_string(const char *s) _pure_; + +/* The *_extension_release flavours will look for /usr/lib/extension-release.d/extension-release.NAME + * for sysext images and for /etc/extension-release.d/extension-release.NAME for confext images + * in accordance with the OS extension specification, rather than for /usr/lib/ or /etc/os-release. 
*/ + +bool image_name_is_valid(const char *s) _pure_; + +int path_is_extension_tree(ImageClass image_class, const char *path, const char *extension, bool relax_extension_release_check); +static inline int path_is_os_tree(const char *path) { + return path_is_extension_tree(_IMAGE_CLASS_INVALID, path, NULL, false); +} + +int open_extension_release(const char *root, ImageClass image_class, const char *extension, bool relax_extension_release_check, char **ret_path, int *ret_fd); +int open_extension_release_at(int rfd, ImageClass image_class, const char *extension, bool relax_extension_release_check, char **ret_path, int *ret_fd); +int open_os_release(const char *root, char **ret_path, int *ret_fd); +int open_os_release_at(int rfd, char **ret_path, int *ret_fd); + +int parse_extension_release_sentinel(const char *root, ImageClass image_class, bool relax_extension_release_check, const char *extension, ...) _sentinel_; +#define parse_extension_release(root, image_class, extension, relax_extension_release_check, ...) \ + parse_extension_release_sentinel(root, image_class, relax_extension_release_check, extension, __VA_ARGS__, NULL) +#define parse_os_release(root, ...) \ + parse_extension_release_sentinel(root, _IMAGE_CLASS_INVALID, false, NULL, __VA_ARGS__, NULL) + +int parse_extension_release_at_sentinel(int rfd, ImageClass image_class, bool relax_extension_release_check, const char *extension, ...) _sentinel_; +#define parse_extension_release_at(rfd, image_class, extension, relax_extension_release_check, ...) \ + parse_extension_release_at_sentinel(rfd, image_class, relax_extension_release_check, extension, __VA_ARGS__, NULL) +#define parse_os_release_at(rfd, ...) 
\ + parse_extension_release_at_sentinel(rfd, _IMAGE_CLASS_INVALID, false, NULL, __VA_ARGS__, NULL) + +int load_extension_release_pairs(const char *root, ImageClass image_class, const char *extension, bool relax_extension_release_check, char ***ret); +static inline int load_os_release_pairs(const char *root, char ***ret) { + return load_extension_release_pairs(root, _IMAGE_CLASS_INVALID, NULL, false, ret); +} +int load_os_release_pairs_with_prefix(const char *root, const char *prefix, char ***ret); + +int os_release_support_ended(const char *support_end, bool quiet, usec_t *ret_eol); + +const char *os_release_pretty_name(const char *pretty_name, const char *name); diff --git a/src/basic/parse-util.c b/src/basic/parse-util.c new file mode 100644 index 0000000..0430e33 --- /dev/null +++ b/src/basic/parse-util.c @@ -0,0 +1,806 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-list.h" +#include "extract-word.h" +#include "locale-util.h" +#include "macro.h" +#include "missing_network.h" +#include "parse-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" + +int parse_boolean(const char *v) { + if (!v) + return -EINVAL; + + if (STRCASE_IN_SET(v, + "1", + "yes", + "y", + "true", + "t", + "on")) + return 1; + + if (STRCASE_IN_SET(v, + "0", + "no", + "n", + "false", + "f", + "off")) + return 0; + + return -EINVAL; +} + +int parse_tristate_full(const char *v, const char *third, int *ret) { + int r; + + if (isempty(v) || streq_ptr(v, third)) { /* Empty string is always taken as the third/invalid/auto state */ + if (ret) + *ret = -1; + } else { + r = parse_boolean(v); + if (r < 0) + return r; + + if (ret) + *ret = r; + } + + return 0; +} + +int parse_pid(const char *s, pid_t* ret_pid) { + unsigned long ul = 0; + pid_t pid; + int r; + + assert(s); + + r = safe_atolu(s, &ul); + if (r < 0) + return r; + + pid = (pid_t) ul; 
+ + if ((unsigned long) pid != ul) + return -ERANGE; + + if (!pid_is_valid(pid)) + return -ERANGE; + + if (ret_pid) + *ret_pid = pid; + return 0; +} + +int parse_mode(const char *s, mode_t *ret) { + unsigned m; + int r; + + assert(s); + + r = safe_atou_full(s, 8 | + SAFE_ATO_REFUSE_PLUS_MINUS, /* Leading '+' or even '-' char? that's just weird, + * refuse. User might have wanted to add mode flags or + * so, but this parser doesn't allow that, so let's + * better be safe. */ + &m); + if (r < 0) + return r; + if (m > 07777) + return -ERANGE; + + if (ret) + *ret = m; + return 0; +} + +int parse_ifindex(const char *s) { + int ifi, r; + + assert(s); + + r = safe_atoi(s, &ifi); + if (r < 0) + return r; + if (ifi <= 0) + return -EINVAL; + + return ifi; +} + +int parse_mtu(int family, const char *s, uint32_t *ret) { + uint64_t u, m; + int r; + + r = parse_size(s, 1024, &u); + if (r < 0) + return r; + + if (u > UINT32_MAX) + return -ERANGE; + + switch (family) { + case AF_INET: + m = IPV4_MIN_MTU; /* This is 68 */ + break; + case AF_INET6: + m = IPV6_MIN_MTU; /* This is 1280 */ + break; + default: + m = 0; + } + + if (u < m) + return -ERANGE; + + *ret = (uint32_t) u; + return 0; +} + +int parse_size(const char *t, uint64_t base, uint64_t *size) { + + /* Soo, sometimes we want to parse IEC binary suffixes, and + * sometimes SI decimal suffixes. This function can parse + * both. Which one is the right way depends on the + * context. Wikipedia suggests that SI is customary for + * hardware metrics and network speeds, while IEC is + * customary for most data sizes used by software and volatile + * (RAM) memory. Hence be careful which one you pick! + * + * In either case we use just K, M, G as suffix, and not Ki, + * Mi, Gi or so (as IEC would suggest). That's because that's + * frickin' ugly. But this means you really need to make sure + * to document which base you are parsing when you use this + * call. 
*/ + + struct table { + const char *suffix; + unsigned long long factor; + }; + + static const struct table iec[] = { + { "E", 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL }, + { "P", 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL }, + { "T", 1024ULL*1024ULL*1024ULL*1024ULL }, + { "G", 1024ULL*1024ULL*1024ULL }, + { "M", 1024ULL*1024ULL }, + { "K", 1024ULL }, + { "B", 1ULL }, + { "", 1ULL }, + }; + + static const struct table si[] = { + { "E", 1000ULL*1000ULL*1000ULL*1000ULL*1000ULL*1000ULL }, + { "P", 1000ULL*1000ULL*1000ULL*1000ULL*1000ULL }, + { "T", 1000ULL*1000ULL*1000ULL*1000ULL }, + { "G", 1000ULL*1000ULL*1000ULL }, + { "M", 1000ULL*1000ULL }, + { "K", 1000ULL }, + { "B", 1ULL }, + { "", 1ULL }, + }; + + const struct table *table; + const char *p; + unsigned long long r = 0; + unsigned n_entries, start_pos = 0; + + assert(t); + assert(IN_SET(base, 1000, 1024)); + assert(size); + + if (base == 1000) { + table = si; + n_entries = ELEMENTSOF(si); + } else { + table = iec; + n_entries = ELEMENTSOF(iec); + } + + p = t; + do { + unsigned long long l, tmp; + double frac = 0; + char *e; + unsigned i; + + p += strspn(p, WHITESPACE); + + errno = 0; + l = strtoull(p, &e, 10); + if (errno > 0) + return -errno; + if (e == p) + return -EINVAL; + if (*p == '-') + return -ERANGE; + + if (*e == '.') { + e++; + + /* strtoull() itself would accept space/+/- */ + if (ascii_isdigit(*e)) { + unsigned long long l2; + char *e2; + + l2 = strtoull(e, &e2, 10); + if (errno > 0) + return -errno; + + /* Ignore failure. E.g. 
10.M is valid */ + frac = l2; + for (; e < e2; e++) + frac /= 10; + } + } + + e += strspn(e, WHITESPACE); + + for (i = start_pos; i < n_entries; i++) + if (startswith(e, table[i].suffix)) + break; + + if (i >= n_entries) + return -EINVAL; + + if (l + (frac > 0) > ULLONG_MAX / table[i].factor) + return -ERANGE; + + tmp = l * table[i].factor + (unsigned long long) (frac * table[i].factor); + if (tmp > ULLONG_MAX - r) + return -ERANGE; + + r += tmp; + if ((unsigned long long) (uint64_t) r != r) + return -ERANGE; + + p = e + strlen(table[i].suffix); + + start_pos = i + 1; + + } while (*p); + + *size = r; + + return 0; +} + +int parse_sector_size(const char *t, uint64_t *ret) { + int r; + + assert(t); + assert(ret); + + uint64_t ss; + + r = safe_atou64(t, &ss); + if (r < 0) + return log_error_errno(r, "Failed to parse sector size parameter %s", t); + if (ss < 512 || ss > 4096) /* Allow up to 4K due to dm-crypt support and 4K alignment by the homed LUKS backend */ + return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Sector size not between 512 and 4096: %s", t); + if (!ISPOWEROF2(ss)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Sector size not power of 2: %s", t); + + *ret = ss; + return 0; +} + +int parse_range(const char *t, unsigned *lower, unsigned *upper) { + _cleanup_free_ char *word = NULL; + unsigned l, u; + int r; + + assert(lower); + assert(upper); + + /* Extract the lower bound. */ + r = extract_first_word(&t, &word, "-", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + r = safe_atou(word, &l); + if (r < 0) + return r; + + /* Check for the upper bound and extract it if needed */ + if (!t) + /* Single number with no dashes. */ + u = l; + else if (!*t) + /* Trailing dash is an error. 
*/ + return -EINVAL; + else { + r = safe_atou(t, &u); + if (r < 0) + return r; + } + + *lower = l; + *upper = u; + return 0; +} + +int parse_errno(const char *t) { + int r, e; + + assert(t); + + r = errno_from_name(t); + if (r > 0) + return r; + + r = safe_atoi(t, &e); + if (r < 0) + return r; + + /* 0 is also allowed here */ + if (!errno_is_valid(e) && e != 0) + return -ERANGE; + + return e; +} + +int parse_fd(const char *t) { + int r, fd; + + assert(t); + + r = safe_atoi(t, &fd); + if (r < 0) + return r; + + if (fd < 0) + return -EBADF; + + return fd; +} + +static const char *mangle_base(const char *s, unsigned *base) { + const char *k; + + assert(s); + assert(base); + + /* Base already explicitly specified, then don't do anything. */ + if (SAFE_ATO_MASK_FLAGS(*base) != 0) + return s; + + /* Support Python 3 style "0b" and "0o" prefixes, because they truly make sense, much more than C's "0" prefix for octal. */ + k = STARTSWITH_SET(s, "0b", "0B"); + if (k) { + *base = 2 | (*base & SAFE_ATO_ALL_FLAGS); + return k; + } + + k = STARTSWITH_SET(s, "0o", "0O"); + if (k) { + *base = 8 | (*base & SAFE_ATO_ALL_FLAGS); + return k; + } + + return s; +} + +int safe_atou_full(const char *s, unsigned base, unsigned *ret_u) { + char *x = NULL; + unsigned long l; + + assert(s); + assert(SAFE_ATO_MASK_FLAGS(base) <= 16); + + /* strtoul() is happy to parse negative values, and silently converts them to unsigned values without + * generating an error. We want a clean error, hence let's look for the "-" prefix on our own, and + * generate an error. But let's do so only after strtoul() validated that the string is clean + * otherwise, so that we return EINVAL preferably over ERANGE. 
*/ + + if (FLAGS_SET(base, SAFE_ATO_REFUSE_LEADING_WHITESPACE) && + strchr(WHITESPACE, s[0])) + return -EINVAL; + + s += strspn(s, WHITESPACE); + + if (FLAGS_SET(base, SAFE_ATO_REFUSE_PLUS_MINUS) && + IN_SET(s[0], '+', '-')) + return -EINVAL; /* Note that we check the "-" prefix again a second time below, but return a + * different error. I.e. if the SAFE_ATO_REFUSE_PLUS_MINUS flag is set we + * blanket refuse +/- prefixed integers, while if it is missing we'll just + * return ERANGE, because the string actually parses correctly, but doesn't + * fit in the return type. */ + + if (FLAGS_SET(base, SAFE_ATO_REFUSE_LEADING_ZERO) && + s[0] == '0' && !streq(s, "0")) + return -EINVAL; /* This is particularly useful to avoid ambiguities between C's octal + * notation and assumed-to-be-decimal integers with a leading zero. */ + + s = mangle_base(s, &base); + + errno = 0; + l = strtoul(s, &x, SAFE_ATO_MASK_FLAGS(base) /* Let's mask off the flags bits so that only the actual + * base is left */); + if (errno > 0) + return -errno; + if (!x || x == s || *x != 0) + return -EINVAL; + if (l != 0 && s[0] == '-') + return -ERANGE; + if ((unsigned long) (unsigned) l != l) + return -ERANGE; + + if (ret_u) + *ret_u = (unsigned) l; + + return 0; +} + +int safe_atou_bounded(const char *s, unsigned min, unsigned max, unsigned *ret) { + unsigned v; + int r; + + r = safe_atou(s, &v); + if (r < 0) + return r; + + if (v < min || v > max) + return -ERANGE; + + *ret = v; + return 0; +} + +int safe_atoi(const char *s, int *ret_i) { + unsigned base = 0; + char *x = NULL; + long l; + + assert(s); + + s += strspn(s, WHITESPACE); + s = mangle_base(s, &base); + + errno = 0; + l = strtol(s, &x, base); + if (errno > 0) + return -errno; + if (!x || x == s || *x != 0) + return -EINVAL; + if ((long) (int) l != l) + return -ERANGE; + + if (ret_i) + *ret_i = (int) l; + + return 0; +} + +int safe_atollu_full(const char *s, unsigned base, unsigned long long *ret_llu) { + char *x = NULL; + unsigned long long 
l; + + assert(s); + assert(SAFE_ATO_MASK_FLAGS(base) <= 16); + + if (FLAGS_SET(base, SAFE_ATO_REFUSE_LEADING_WHITESPACE) && + strchr(WHITESPACE, s[0])) + return -EINVAL; + + s += strspn(s, WHITESPACE); + + if (FLAGS_SET(base, SAFE_ATO_REFUSE_PLUS_MINUS) && + IN_SET(s[0], '+', '-')) + return -EINVAL; + + if (FLAGS_SET(base, SAFE_ATO_REFUSE_LEADING_ZERO) && + s[0] == '0' && s[1] != 0) + return -EINVAL; + + s = mangle_base(s, &base); + + errno = 0; + l = strtoull(s, &x, SAFE_ATO_MASK_FLAGS(base)); + if (errno > 0) + return -errno; + if (!x || x == s || *x != 0) + return -EINVAL; + if (l != 0 && s[0] == '-') + return -ERANGE; + + if (ret_llu) + *ret_llu = l; + + return 0; +} + +int safe_atolli(const char *s, long long int *ret_lli) { + unsigned base = 0; + char *x = NULL; + long long l; + + assert(s); + + s += strspn(s, WHITESPACE); + s = mangle_base(s, &base); + + errno = 0; + l = strtoll(s, &x, base); + if (errno > 0) + return -errno; + if (!x || x == s || *x != 0) + return -EINVAL; + + if (ret_lli) + *ret_lli = l; + + return 0; +} + +int safe_atou8_full(const char *s, unsigned base, uint8_t *ret) { + unsigned u; + int r; + + r = safe_atou_full(s, base, &u); + if (r < 0) + return r; + if (u > UINT8_MAX) + return -ERANGE; + + *ret = (uint8_t) u; + return 0; +} + +int safe_atou16_full(const char *s, unsigned base, uint16_t *ret) { + unsigned u; + int r; + + r = safe_atou_full(s, base, &u); + if (r < 0) + return r; + if (u > UINT16_MAX) + return -ERANGE; + + *ret = (uint16_t) u; + return 0; +} + +int safe_atoi16(const char *s, int16_t *ret) { + unsigned base = 0; + char *x = NULL; + long l; + + assert(s); + + s += strspn(s, WHITESPACE); + s = mangle_base(s, &base); + + errno = 0; + l = strtol(s, &x, base); + if (errno > 0) + return -errno; + if (!x || x == s || *x != 0) + return -EINVAL; + if ((long) (int16_t) l != l) + return -ERANGE; + + if (ret) + *ret = (int16_t) l; + + return 0; +} + +int safe_atod(const char *s, double *ret_d) { + _cleanup_(freelocalep) locale_t 
loc = (locale_t) 0; + char *x = NULL; + double d = 0; + + assert(s); + + loc = newlocale(LC_NUMERIC_MASK, "C", (locale_t) 0); + if (loc == (locale_t) 0) + return -errno; + + errno = 0; + d = strtod_l(s, &x, loc); + if (errno > 0) + return -errno; + if (!x || x == s || *x != 0) + return -EINVAL; + + if (ret_d) + *ret_d = (double) d; + + return 0; +} + +int parse_fractional_part_u(const char **p, size_t digits, unsigned *res) { + unsigned val = 0; + const char *s; + + s = *p; + + /* accept any number of digits, strtoull is limited to 19 */ + for (size_t i = 0; i < digits; i++,s++) { + if (!ascii_isdigit(*s)) { + if (i == 0) + return -EINVAL; + + /* too few digits, pad with 0 */ + for (; i < digits; i++) + val *= 10; + + break; + } + + val *= 10; + val += *s - '0'; + } + + /* maybe round up */ + if (*s >= '5' && *s <= '9') + val++; + + s += strspn(s, DIGITS); + + *p = s; + *res = val; + + return 0; +} + +int parse_nice(const char *p, int *ret) { + int n, r; + + r = safe_atoi(p, &n); + if (r < 0) + return r; + + if (!nice_is_valid(n)) + return -ERANGE; + + *ret = n; + return 0; +} + +int parse_ip_port(const char *s, uint16_t *ret) { + uint16_t l; + int r; + + r = safe_atou16_full(s, SAFE_ATO_REFUSE_LEADING_WHITESPACE, &l); + if (r < 0) + return r; + + if (l == 0) + return -EINVAL; + + *ret = (uint16_t) l; + + return 0; +} + +int parse_ip_port_range(const char *s, uint16_t *low, uint16_t *high) { + unsigned l, h; + int r; + + r = parse_range(s, &l, &h); + if (r < 0) + return r; + + if (l <= 0 || l > 65535 || h <= 0 || h > 65535) + return -EINVAL; + + if (h < l) + return -EINVAL; + + *low = l; + *high = h; + + return 0; +} + +int parse_ip_prefix_length(const char *s, int *ret) { + unsigned l; + int r; + + r = safe_atou(s, &l); + if (r < 0) + return r; + + if (l > 128) + return -ERANGE; + + *ret = (int) l; + + return 0; +} + +int parse_oom_score_adjust(const char *s, int *ret) { + int r, v; + + assert(s); + assert(ret); + + r = safe_atoi(s, &v); + if (r < 0) + return r; + 
+ if (!oom_score_adjust_is_valid(v)) + return -ERANGE; + + *ret = v; + return 0; +} + +int store_loadavg_fixed_point(unsigned long i, unsigned long f, loadavg_t *ret) { + assert(ret); + + if (i >= (~0UL << LOADAVG_PRECISION_BITS)) + return -ERANGE; + + i = i << LOADAVG_PRECISION_BITS; + f = DIV_ROUND_UP((f << LOADAVG_PRECISION_BITS), 100); + + if (f >= LOADAVG_FIXED_POINT_1_0) + return -ERANGE; + + *ret = i | f; + return 0; +} + +int parse_loadavg_fixed_point(const char *s, loadavg_t *ret) { + const char *d, *f_str, *i_str; + unsigned long i, f; + int r; + + assert(s); + assert(ret); + + d = strchr(s, '.'); + if (!d) + return -EINVAL; + + i_str = strndupa_safe(s, d - s); + f_str = d + 1; + + r = safe_atolu_full(i_str, 10, &i); + if (r < 0) + return r; + + r = safe_atolu_full(f_str, 10, &f); + if (r < 0) + return r; + + return store_loadavg_fixed_point(i, f, ret); +} + +/* Limitations are described in https://www.netfilter.org/projects/nftables/manpage.html and + * https://bugzilla.netfilter.org/show_bug.cgi?id=1175 */ +bool nft_identifier_valid(const char *id) { + if (!id) + return false; + + size_t len = strlen(id); + if (len == 0 || len > 31) + return false; + + if (!ascii_isalpha(id[0])) + return false; + + for (size_t i = 1; i < len; i++) + if (!ascii_isalpha(id[i]) && !ascii_isdigit(id[i]) && !IN_SET(id[i], '/', '\\', '_', '.')) + return false; + return true; +} diff --git a/src/basic/parse-util.h b/src/basic/parse-util.h new file mode 100644 index 0000000..1845f0a --- /dev/null +++ b/src/basic/parse-util.h @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "macro.h" + +typedef unsigned long loadavg_t; + +int parse_boolean(const char *v) _pure_; +int parse_tristate_full(const char *v, const char *third, int *ret); +static inline int parse_tristate(const char *v, int *ret) { + return parse_tristate_full(v, NULL, ret); +} +int parse_pid(const char *s, pid_t* ret_pid); 
+int parse_mode(const char *s, mode_t *ret); +int parse_ifindex(const char *s); +int parse_mtu(int family, const char *s, uint32_t *ret); + +int parse_size(const char *t, uint64_t base, uint64_t *size); +int parse_sector_size(const char *t, uint64_t *ret); +int parse_range(const char *t, unsigned *lower, unsigned *upper); +int parse_errno(const char *t); +int parse_fd(const char *t); + +#define SAFE_ATO_REFUSE_PLUS_MINUS (1U << 30) +#define SAFE_ATO_REFUSE_LEADING_ZERO (1U << 29) +#define SAFE_ATO_REFUSE_LEADING_WHITESPACE (1U << 28) +#define SAFE_ATO_ALL_FLAGS (SAFE_ATO_REFUSE_PLUS_MINUS|SAFE_ATO_REFUSE_LEADING_ZERO|SAFE_ATO_REFUSE_LEADING_WHITESPACE) +#define SAFE_ATO_MASK_FLAGS(base) ((base) & ~SAFE_ATO_ALL_FLAGS) + +int safe_atou_full(const char *s, unsigned base, unsigned *ret_u); +static inline int safe_atou(const char *s, unsigned *ret_u) { + return safe_atou_full(s, 0, ret_u); +} + +int safe_atou_bounded(const char *s, unsigned min, unsigned max, unsigned *ret); + +int safe_atoi(const char *s, int *ret_i); +int safe_atolli(const char *s, long long int *ret_i); + +int safe_atou8_full(const char *s, unsigned base, uint8_t *ret); + +static inline int safe_atou8(const char *s, uint8_t *ret) { + return safe_atou8_full(s, 0, ret); +} + +int safe_atou16_full(const char *s, unsigned base, uint16_t *ret); + +static inline int safe_atou16(const char *s, uint16_t *ret) { + return safe_atou16_full(s, 0, ret); +} + +static inline int safe_atoux16(const char *s, uint16_t *ret) { + return safe_atou16_full(s, 16, ret); +} + +int safe_atoi16(const char *s, int16_t *ret); + +static inline int safe_atou32_full(const char *s, unsigned base, uint32_t *ret_u) { + assert_cc(sizeof(uint32_t) == sizeof(unsigned)); + return safe_atou_full(s, base, (unsigned*) ret_u); +} + +static inline int safe_atou32(const char *s, uint32_t *ret_u) { + return safe_atou32_full(s, 0, (unsigned*) ret_u); +} + +static inline int safe_atoi32(const char *s, int32_t *ret_i) { + assert_cc(sizeof(int32_t) 
== sizeof(int)); + return safe_atoi(s, (int*) ret_i); +} + +int safe_atollu_full(const char *s, unsigned base, unsigned long long *ret_llu); + +static inline int safe_atollu(const char *s, unsigned long long *ret_llu) { + return safe_atollu_full(s, 0, ret_llu); +} + +static inline int safe_atou64(const char *s, uint64_t *ret_u) { + assert_cc(sizeof(uint64_t) == sizeof(unsigned long long)); + return safe_atollu(s, (unsigned long long*) ret_u); +} + +static inline int safe_atoi64(const char *s, int64_t *ret_i) { + assert_cc(sizeof(int64_t) == sizeof(long long int)); + return safe_atolli(s, (long long int*) ret_i); +} + +static inline int safe_atoux64(const char *s, uint64_t *ret) { + assert_cc(sizeof(uint64_t) == sizeof(unsigned long long)); /* guards the uint64_t* to unsigned long long* cast below */ + return safe_atollu_full(s, 16, (unsigned long long*) ret); +} + +#if LONG_MAX == INT_MAX +static inline int safe_atolu_full(const char *s, unsigned base, unsigned long *ret_u) { + assert_cc(sizeof(unsigned long) == sizeof(unsigned)); + return safe_atou_full(s, base, (unsigned*) ret_u); +} +static inline int safe_atoli(const char *s, long int *ret_u) { + assert_cc(sizeof(long int) == sizeof(int)); + return safe_atoi(s, (int*) ret_u); +} +#else +static inline int safe_atolu_full(const char *s, unsigned base, unsigned long *ret_u) { + assert_cc(sizeof(unsigned long) == sizeof(unsigned long long)); + return safe_atollu_full(s, base, (unsigned long long*) ret_u); +} +static inline int safe_atoli(const char *s, long int *ret_u) { + assert_cc(sizeof(long int) == sizeof(long long int)); + return safe_atolli(s, (long long int*) ret_u); +} +#endif + +static inline int safe_atolu(const char *s, unsigned long *ret_u) { + return safe_atolu_full(s, 0, ret_u); +} + +#if SIZE_MAX == UINT_MAX +static inline int safe_atozu(const char *s, size_t *ret_u) { + assert_cc(sizeof(size_t) == sizeof(unsigned)); + return safe_atou(s, (unsigned *) ret_u); +} +#else +static inline int safe_atozu(const char *s, size_t *ret_u) { + assert_cc(sizeof(size_t) == 
sizeof(unsigned long)); + return safe_atolu(s, ret_u); +} +#endif + +int safe_atod(const char *s, double *ret_d); + +int parse_fractional_part_u(const char **s, size_t digits, unsigned *res); + +int parse_nice(const char *p, int *ret); + +int parse_ip_port(const char *s, uint16_t *ret); +int parse_ip_port_range(const char *s, uint16_t *low, uint16_t *high); + +int parse_ip_prefix_length(const char *s, int *ret); + +int parse_oom_score_adjust(const char *s, int *ret); + +/* Implement floating point using fixed integers, to improve performance when + * calculating load averages. These macros can be used to extract the integer + * and decimal parts of a value. */ +#define LOADAVG_PRECISION_BITS 11 +#define LOADAVG_FIXED_POINT_1_0 (1 << LOADAVG_PRECISION_BITS) +#define LOADAVG_INT_SIDE(x) ((x) >> LOADAVG_PRECISION_BITS) +#define LOADAVG_DECIMAL_SIDE(x) LOADAVG_INT_SIDE(((x) & (LOADAVG_FIXED_POINT_1_0 - 1)) * 100) + +/* Given a Linux load average (e.g. decimal number 34.89 where 34 is passed as i and 89 is passed as f), convert it + * to a loadavg_t. 
*/ +int store_loadavg_fixed_point(unsigned long i, unsigned long f, loadavg_t *ret); +int parse_loadavg_fixed_point(const char *s, loadavg_t *ret); + +bool nft_identifier_valid(const char *id); diff --git a/src/basic/path-lookup.c b/src/basic/path-lookup.c new file mode 100644 index 0000000..4e3d59f --- /dev/null +++ b/src/basic/path-lookup.c @@ -0,0 +1,910 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "nulstr-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "user-util.h" + +int xdg_user_runtime_dir(char **ret, const char *suffix) { + const char *e; + char *j; + + assert(ret); + assert(suffix); + + e = getenv("XDG_RUNTIME_DIR"); + if (!e) + return -ENXIO; + + j = path_join(e, suffix); + if (!j) + return -ENOMEM; + + *ret = j; + return 0; +} + +int xdg_user_config_dir(char **ret, const char *suffix) { + _cleanup_free_ char *j = NULL; + const char *e; + int r; + + assert(ret); + + e = getenv("XDG_CONFIG_HOME"); + if (e) { + j = path_join(e, suffix); + if (!j) + return -ENOMEM; + } else { + r = get_home_dir(&j); + if (r < 0) + return r; + + if (!path_extend(&j, "/.config", suffix)) + return -ENOMEM; + } + + *ret = TAKE_PTR(j); + return 0; +} + +int xdg_user_data_dir(char **ret, const char *suffix) { + _cleanup_free_ char *j = NULL; + const char *e; + int r; + + assert(ret); + assert(suffix); + + /* We don't treat /etc/xdg/systemd here as the spec + * suggests because we assume that is a link to + * /etc/systemd/ anyway. 
*/ + + e = getenv("XDG_DATA_HOME"); + if (e) { + j = path_join(e, suffix); + if (!j) + return -ENOMEM; + } else { + r = get_home_dir(&j); + if (r < 0) + return r; + + if (!path_extend(&j, "/.local/share", suffix)) + return -ENOMEM; + } + + *ret = TAKE_PTR(j); + return 1; +} + +static const char* const user_data_unit_paths[] = { + "/usr/local/lib/systemd/user", + "/usr/local/share/systemd/user", + USER_DATA_UNIT_DIR, + "/usr/lib/systemd/user", + "/usr/share/systemd/user", + NULL +}; + +static const char* const user_config_unit_paths[] = { + USER_CONFIG_UNIT_DIR, + "/etc/systemd/user", + NULL +}; + +int xdg_user_dirs(char ***ret_config_dirs, char ***ret_data_dirs) { + /* Implement the mechanisms defined in + * + * https://standards.freedesktop.org/basedir-spec/basedir-spec-0.6.html + * + * We look in both the config and the data dirs because we + * want to encourage that distributors ship their unit files + * as data, and allow overriding as configuration. + */ + const char *e; + _cleanup_strv_free_ char **config_dirs = NULL, **data_dirs = NULL; + + e = getenv("XDG_CONFIG_DIRS"); + if (e) + config_dirs = strv_split(e, ":"); + else + config_dirs = strv_new("/etc/xdg"); + if (!config_dirs) + return -ENOMEM; + + e = getenv("XDG_DATA_DIRS"); + if (e) + data_dirs = strv_split(e, ":"); + else + data_dirs = strv_new("/usr/local/share", + "/usr/share"); + if (!data_dirs) + return -ENOMEM; + + *ret_config_dirs = TAKE_PTR(config_dirs); + *ret_data_dirs = TAKE_PTR(data_dirs); + + return 0; +} + +static char** user_dirs( + const char *persistent_config, + const char *runtime_config, + const char *global_persistent_config, + const char *global_runtime_config, + const char *generator, + const char *generator_early, + const char *generator_late, + const char *transient, + const char *persistent_control, + const char *runtime_control) { + + _cleanup_strv_free_ char **config_dirs = NULL, **data_dirs = NULL; + _cleanup_free_ char *data_home = NULL; + _cleanup_strv_free_ char **res = 
NULL; + int r; + + r = xdg_user_dirs(&config_dirs, &data_dirs); + if (r < 0) + return NULL; + + r = xdg_user_data_dir(&data_home, "/systemd/user"); + if (r < 0 && r != -ENXIO) + return NULL; + + /* Now merge everything we found. */ + if (strv_extend(&res, persistent_control) < 0) + return NULL; + + if (strv_extend(&res, runtime_control) < 0) + return NULL; + + if (strv_extend(&res, transient) < 0) + return NULL; + + if (strv_extend(&res, generator_early) < 0) + return NULL; + + if (strv_extend(&res, persistent_config) < 0) + return NULL; + + if (strv_extend_strv_concat(&res, config_dirs, "/systemd/user") < 0) + return NULL; + + /* global config has lower priority than the user config of the same type */ + if (strv_extend(&res, global_persistent_config) < 0) + return NULL; + + if (strv_extend_strv(&res, (char**) user_config_unit_paths, false) < 0) + return NULL; + + if (strv_extend(&res, runtime_config) < 0) + return NULL; + + if (strv_extend(&res, global_runtime_config) < 0) + return NULL; + + if (strv_extend(&res, generator) < 0) + return NULL; + + if (strv_extend(&res, data_home) < 0) + return NULL; + + if (strv_extend_strv_concat(&res, data_dirs, "/systemd/user") < 0) + return NULL; + + if (strv_extend_strv(&res, (char**) user_data_unit_paths, false) < 0) + return NULL; + + if (strv_extend(&res, generator_late) < 0) + return NULL; + + if (path_strv_make_absolute_cwd(res) < 0) + return NULL; + + return TAKE_PTR(res); +} + +bool path_is_user_data_dir(const char *path) { + assert(path); + + return strv_contains((char**) user_data_unit_paths, path); +} + +bool path_is_user_config_dir(const char *path) { + assert(path); + + return strv_contains((char**) user_config_unit_paths, path); +} + +static int acquire_generator_dirs( + RuntimeScope scope, + const char *tempdir, + char **generator, + char **generator_early, + char **generator_late) { + + _cleanup_free_ char *x = NULL, *y = NULL, *z = NULL, *p = NULL; + const char *prefix; + + assert(generator); + 
assert(generator_early); + assert(generator_late); + assert(IN_SET(scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER, RUNTIME_SCOPE_GLOBAL)); + + if (scope == RUNTIME_SCOPE_GLOBAL) + return -EOPNOTSUPP; + + if (tempdir) + prefix = tempdir; + else if (scope == RUNTIME_SCOPE_SYSTEM) + prefix = "/run/systemd"; + else { + /* RUNTIME_SCOPE_USER */ + const char *e; + + e = getenv("XDG_RUNTIME_DIR"); + if (!e) + return -ENXIO; + + p = path_join(e, "/systemd"); + if (!p) + return -ENOMEM; + + prefix = p; + } + + x = path_join(prefix, "generator"); + if (!x) + return -ENOMEM; + + y = path_join(prefix, "generator.early"); + if (!y) + return -ENOMEM; + + z = path_join(prefix, "generator.late"); + if (!z) + return -ENOMEM; + + *generator = TAKE_PTR(x); + *generator_early = TAKE_PTR(y); + *generator_late = TAKE_PTR(z); + + return 0; +} + +static int acquire_transient_dir( + RuntimeScope scope, + const char *tempdir, + char **ret) { + + char *transient; + + assert(ret); + assert(IN_SET(scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER, RUNTIME_SCOPE_GLOBAL)); + + if (scope == RUNTIME_SCOPE_GLOBAL) + return -EOPNOTSUPP; + + if (tempdir) + transient = path_join(tempdir, "transient"); + else if (scope == RUNTIME_SCOPE_SYSTEM) + transient = strdup("/run/systemd/transient"); + else + return xdg_user_runtime_dir(ret, "/systemd/transient"); + + if (!transient) + return -ENOMEM; + *ret = transient; + return 0; +} + +static int acquire_config_dirs(RuntimeScope scope, char **persistent, char **runtime) { + _cleanup_free_ char *a = NULL, *b = NULL; + int r; + + assert(persistent); + assert(runtime); + + switch (scope) { + + case RUNTIME_SCOPE_SYSTEM: + a = strdup(SYSTEM_CONFIG_UNIT_DIR); + b = strdup("/run/systemd/system"); + break; + + case RUNTIME_SCOPE_GLOBAL: + a = strdup(USER_CONFIG_UNIT_DIR); + b = strdup("/run/systemd/user"); + break; + + case RUNTIME_SCOPE_USER: + r = xdg_user_config_dir(&a, "/systemd/user"); + if (r < 0 && r != -ENXIO) + return r; + + r = xdg_user_runtime_dir(runtime, 
"/systemd/user"); + if (r < 0) { + if (r != -ENXIO) + return r; + + /* If XDG_RUNTIME_DIR is not set, don't consider that fatal, simply initialize the runtime + * directory to NULL */ + *runtime = NULL; + } + + *persistent = TAKE_PTR(a); + + return 0; + + default: + assert_not_reached(); + } + + if (!a || !b) + return -ENOMEM; + + *persistent = TAKE_PTR(a); + *runtime = TAKE_PTR(b); + + return 0; +} + +static int acquire_control_dirs(RuntimeScope scope, char **persistent, char **runtime) { + _cleanup_free_ char *a = NULL; + int r; + + assert(persistent); + assert(runtime); + + switch (scope) { + + case RUNTIME_SCOPE_SYSTEM: { + _cleanup_free_ char *b = NULL; + + a = strdup("/etc/systemd/system.control"); + if (!a) + return -ENOMEM; + + b = strdup("/run/systemd/system.control"); + if (!b) + return -ENOMEM; + + *runtime = TAKE_PTR(b); + + break; + } + + case RUNTIME_SCOPE_USER: + r = xdg_user_config_dir(&a, "/systemd/user.control"); + if (r < 0 && r != -ENXIO) + return r; + + r = xdg_user_runtime_dir(runtime, "/systemd/user.control"); + if (r < 0) { + if (r != -ENXIO) + return r; + + /* If XDG_RUNTIME_DIR is not set, don't consider this fatal, simply initialize the directory to + * NULL */ + *runtime = NULL; + } + + break; + + case RUNTIME_SCOPE_GLOBAL: + return -EOPNOTSUPP; + + default: + assert_not_reached(); + } + + *persistent = TAKE_PTR(a); + + return 0; +} + +static int acquire_attached_dirs( + RuntimeScope scope, + char **ret_persistent, + char **ret_runtime) { + + _cleanup_free_ char *a = NULL, *b = NULL; + + assert(ret_persistent); + assert(ret_runtime); + + /* Portable services are not available to regular users for now. 
*/ + if (scope != RUNTIME_SCOPE_SYSTEM) + return -EOPNOTSUPP; + + a = strdup("/etc/systemd/system.attached"); + if (!a) + return -ENOMEM; + + b = strdup("/run/systemd/system.attached"); + if (!b) + return -ENOMEM; + + *ret_persistent = TAKE_PTR(a); + *ret_runtime = TAKE_PTR(b); + + return 0; +} + +static int patch_root_prefix(char **p, const char *root_dir) { + char *c; + + assert(p); + + if (!*p) + return 0; + + c = path_join(root_dir, *p); + if (!c) + return -ENOMEM; + + free_and_replace(*p, c); + return 0; +} + +static int patch_root_prefix_strv(char **l, const char *root_dir) { + int r; + + if (!root_dir) + return 0; + + STRV_FOREACH(i, l) { + r = patch_root_prefix(i, root_dir); + if (r < 0) + return r; + } + + return 0; +} + +static int get_paths_from_environ(const char *var, char ***paths, bool *append) { + const char *e; + int r; + + assert(var); + assert(paths); + assert(append); + + *append = false; + + e = getenv(var); + if (e) { + const char *k; + + k = endswith(e, ":"); + if (k) { + e = strndupa_safe(e, k - e); + *append = true; + } + + /* FIXME: empty components in other places should be rejected. 
*/ + + r = path_split_and_make_absolute(e, paths); + if (r < 0) + return r; + } + + return 0; +} + +int lookup_paths_init( + LookupPaths *lp, + RuntimeScope scope, + LookupPathsFlags flags, + const char *root_dir) { + + _cleanup_(rmdir_and_freep) char *tempdir = NULL; + _cleanup_free_ char + *root = NULL, + *persistent_config = NULL, *runtime_config = NULL, + *global_persistent_config = NULL, *global_runtime_config = NULL, + *generator = NULL, *generator_early = NULL, *generator_late = NULL, + *transient = NULL, + *persistent_control = NULL, *runtime_control = NULL, + *persistent_attached = NULL, *runtime_attached = NULL; + bool append = false; /* Add items from SYSTEMD_UNIT_PATH before normal directories */ + _cleanup_strv_free_ char **paths = NULL; + int r; + + assert(lp); + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + if (!empty_or_root(root_dir)) { + if (scope == RUNTIME_SCOPE_USER) + return -EINVAL; + + r = is_dir(root_dir, true); + if (r < 0) + return r; + if (r == 0) + return -ENOTDIR; + + root = strdup(root_dir); + if (!root) + return -ENOMEM; + } + + if (flags & LOOKUP_PATHS_TEMPORARY_GENERATED) { + r = mkdtemp_malloc("/tmp/systemd-temporary-XXXXXX", &tempdir); + if (r < 0) + return log_debug_errno(r, "Failed to create temporary directory: %m"); + } + + /* Note: when XDG_RUNTIME_DIR is not set this will not return -ENXIO, but simply set runtime_config to NULL */ + r = acquire_config_dirs(scope, &persistent_config, &runtime_config); + if (r < 0) + return r; + + if (scope == RUNTIME_SCOPE_USER) { + r = acquire_config_dirs(RUNTIME_SCOPE_GLOBAL, &global_persistent_config, &global_runtime_config); + if (r < 0) + return r; + } + + if ((flags & LOOKUP_PATHS_EXCLUDE_GENERATED) == 0) { + /* Note: if XDG_RUNTIME_DIR is not set, this will fail completely with ENXIO */ + r = acquire_generator_dirs(scope, tempdir, + &generator, &generator_early, &generator_late); + if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -ENXIO)) + return r; + } + + /* Note: if 
XDG_RUNTIME_DIR is not set, this will fail completely with ENXIO */ + r = acquire_transient_dir(scope, tempdir, &transient); + if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -ENXIO)) + return r; + + /* Note: when XDG_RUNTIME_DIR is not set this will not return -ENXIO, but simply set runtime_control to NULL */ + r = acquire_control_dirs(scope, &persistent_control, &runtime_control); + if (r < 0 && r != -EOPNOTSUPP) + return r; + + r = acquire_attached_dirs(scope, &persistent_attached, &runtime_attached); + if (r < 0 && r != -EOPNOTSUPP) + return r; + + /* First priority is whatever has been passed to us via env vars */ + r = get_paths_from_environ("SYSTEMD_UNIT_PATH", &paths, &append); + if (r < 0) + return r; + + if (!paths || append) { + /* Let's figure something out. */ + + _cleanup_strv_free_ char **add = NULL; + + /* For the user units we include share/ in the search + * path in order to comply with the XDG basedir spec. + * For the system stuff we avoid such nonsense. OTOH + * we include /lib in the search path for the system + * stuff but avoid it for user stuff. */ + + switch (scope) { + + case RUNTIME_SCOPE_SYSTEM: + add = strv_new( + /* If you modify this you also want to modify + * systemdsystemunitpath= in systemd.pc.in! */ + STRV_IFNOTNULL(persistent_control), + STRV_IFNOTNULL(runtime_control), + STRV_IFNOTNULL(transient), + STRV_IFNOTNULL(generator_early), + persistent_config, + SYSTEM_CONFIG_UNIT_DIR, + "/etc/systemd/system", + STRV_IFNOTNULL(persistent_attached), + runtime_config, + "/run/systemd/system", + STRV_IFNOTNULL(runtime_attached), + STRV_IFNOTNULL(generator), + "/usr/local/lib/systemd/system", + SYSTEM_DATA_UNIT_DIR, + "/usr/lib/systemd/system", + /* To be used ONLY for images which might be legacy split-usr */ + STRV_IFNOTNULL(flags & LOOKUP_PATHS_SPLIT_USR ? 
"/lib/systemd/system" : NULL), + STRV_IFNOTNULL(generator_late)); + break; + + case RUNTIME_SCOPE_GLOBAL: + add = strv_new( + /* If you modify this you also want to modify + * systemduserunitpath= in systemd.pc.in, and + * the arrays in user_dirs() above! */ + STRV_IFNOTNULL(persistent_control), + STRV_IFNOTNULL(runtime_control), + STRV_IFNOTNULL(transient), + STRV_IFNOTNULL(generator_early), + persistent_config, + USER_CONFIG_UNIT_DIR, + "/etc/systemd/user", + runtime_config, + "/run/systemd/user", + STRV_IFNOTNULL(generator), + "/usr/local/share/systemd/user", + "/usr/share/systemd/user", + "/usr/local/lib/systemd/user", + USER_DATA_UNIT_DIR, + "/usr/lib/systemd/user", + STRV_IFNOTNULL(generator_late)); + break; + + case RUNTIME_SCOPE_USER: + add = user_dirs(persistent_config, runtime_config, + global_persistent_config, global_runtime_config, + generator, generator_early, generator_late, + transient, + persistent_control, runtime_control); + break; + + default: + assert_not_reached(); + } + + if (!add) + return -ENOMEM; + + if (paths) { + r = strv_extend_strv(&paths, add, true); + if (r < 0) + return r; + } else + /* Small optimization: if paths is NULL (and it usually is), we can simply assign 'add' to it, + * and don't have to copy anything */ + paths = TAKE_PTR(add); + } + + r = patch_root_prefix(&persistent_config, root); + if (r < 0) + return r; + r = patch_root_prefix(&runtime_config, root); + if (r < 0) + return r; + + r = patch_root_prefix(&generator, root); + if (r < 0) + return r; + r = patch_root_prefix(&generator_early, root); + if (r < 0) + return r; + r = patch_root_prefix(&generator_late, root); + if (r < 0) + return r; + + r = patch_root_prefix(&transient, root); + if (r < 0) + return r; + + r = patch_root_prefix(&persistent_control, root); + if (r < 0) + return r; + r = patch_root_prefix(&runtime_control, root); + if (r < 0) + return r; + + r = patch_root_prefix(&persistent_attached, root); + if (r < 0) + return r; + r = 
patch_root_prefix(&runtime_attached, root); + if (r < 0) + return r; + + r = patch_root_prefix_strv(paths, root); + if (r < 0) + return -ENOMEM; + + *lp = (LookupPaths) { + .search_path = strv_uniq(TAKE_PTR(paths)), + + .persistent_config = TAKE_PTR(persistent_config), + .runtime_config = TAKE_PTR(runtime_config), + + .generator = TAKE_PTR(generator), + .generator_early = TAKE_PTR(generator_early), + .generator_late = TAKE_PTR(generator_late), + + .transient = TAKE_PTR(transient), + + .persistent_control = TAKE_PTR(persistent_control), + .runtime_control = TAKE_PTR(runtime_control), + + .persistent_attached = TAKE_PTR(persistent_attached), + .runtime_attached = TAKE_PTR(runtime_attached), + + .root_dir = TAKE_PTR(root), + .temporary_dir = TAKE_PTR(tempdir), + }; + + return 0; +} + +int lookup_paths_init_or_warn(LookupPaths *lp, RuntimeScope scope, LookupPathsFlags flags, const char *root_dir) { + int r; + + r = lookup_paths_init(lp, scope, flags, root_dir); + if (r < 0) + return log_error_errno(r, "Failed to initialize unit search paths%s%s: %m", + isempty(root_dir) ? 
"" : " for root directory ", strempty(root_dir)); + return r; +} + +void lookup_paths_free(LookupPaths *lp) { + if (!lp) + return; + + lp->search_path = strv_free(lp->search_path); + + lp->persistent_config = mfree(lp->persistent_config); + lp->runtime_config = mfree(lp->runtime_config); + + lp->persistent_attached = mfree(lp->persistent_attached); + lp->runtime_attached = mfree(lp->runtime_attached); + + lp->generator = mfree(lp->generator); + lp->generator_early = mfree(lp->generator_early); + lp->generator_late = mfree(lp->generator_late); + + lp->transient = mfree(lp->transient); + + lp->persistent_control = mfree(lp->persistent_control); + lp->runtime_control = mfree(lp->runtime_control); + + lp->root_dir = mfree(lp->root_dir); + lp->temporary_dir = mfree(lp->temporary_dir); +} + +void lookup_paths_log(LookupPaths *lp) { + assert(lp); + + if (strv_isempty(lp->search_path)) { + log_debug("Ignoring unit files."); + lp->search_path = strv_free(lp->search_path); + } else { + _cleanup_free_ char *t = NULL; + + t = strv_join(lp->search_path, "\n\t"); + log_debug("Looking for unit files in (higher priority first):\n\t%s", strna(t)); + } +} + +char **generator_binary_paths(RuntimeScope scope) { + bool append = false; /* Add items from SYSTEMD_GENERATOR_PATH before normal directories */ + _cleanup_strv_free_ char **paths = NULL; + int r; + + /* First priority is whatever has been passed to us via env vars */ + r = get_paths_from_environ("SYSTEMD_GENERATOR_PATH", &paths, &append); + if (r < 0) + return NULL; + + if (!paths || append) { + _cleanup_strv_free_ char **add = NULL; + + switch (scope) { + + case RUNTIME_SCOPE_SYSTEM: + add = strv_new("/run/systemd/system-generators", + "/etc/systemd/system-generators", + "/usr/local/lib/systemd/system-generators", + SYSTEM_GENERATOR_DIR); + break; + + case RUNTIME_SCOPE_GLOBAL: + case RUNTIME_SCOPE_USER: + add = strv_new("/run/systemd/user-generators", + "/etc/systemd/user-generators", + 
"/usr/local/lib/systemd/user-generators", + USER_GENERATOR_DIR); + break; + + default: + assert_not_reached(); + } + if (!add) + return NULL; + + if (paths) { + r = strv_extend_strv(&paths, add, true); + if (r < 0) + return NULL; + } else + /* Small optimization: if paths is NULL (and it usually is), we can simply assign 'add' to it, + * and don't have to copy anything */ + paths = TAKE_PTR(add); + } + + return TAKE_PTR(paths); +} + +char **env_generator_binary_paths(RuntimeScope runtime_scope) { + _cleanup_strv_free_ char **paths = NULL, **add = NULL; + bool append = false; /* Add items from SYSTEMD_ENVIRONMENT_GENERATOR_PATH before normal directories */ + int r; + + /* First priority is whatever has been passed to us via env vars */ + r = get_paths_from_environ("SYSTEMD_ENVIRONMENT_GENERATOR_PATH", &paths, &append); + if (r < 0) + return NULL; + + if (!paths || append) { + switch (runtime_scope) { + + case RUNTIME_SCOPE_SYSTEM: + add = strv_new("/run/systemd/system-environment-generators", + "/etc/systemd/system-environment-generators", + "/usr/local/lib/systemd/system-environment-generators", + SYSTEM_ENV_GENERATOR_DIR); + break; + + case RUNTIME_SCOPE_USER: + add = strv_new("/run/systemd/user-environment-generators", + "/etc/systemd/user-environment-generators", + "/usr/local/lib/systemd/user-environment-generators", + USER_ENV_GENERATOR_DIR); + break; + + default: + assert_not_reached(); + } + if (!add) + return NULL; + } + + if (paths) { + r = strv_extend_strv(&paths, add, true); + if (r < 0) + return NULL; + } else + /* Small optimization: if paths is NULL (and it usually is), we can simply assign 'add' to it, + * and don't have to copy anything */ + paths = TAKE_PTR(add); + + return TAKE_PTR(paths); +} + +int find_portable_profile(const char *name, const char *unit, char **ret_path) { + const char *dot; + + assert(name); + assert(ret_path); + + assert_se(dot = strrchr(unit, '.')); + + NULSTR_FOREACH(p, PORTABLE_PROFILE_DIRS) { + _cleanup_free_ char *joined 
= NULL; + + joined = strjoin(p, "/", name, "/", dot + 1, ".conf"); + if (!joined) + return -ENOMEM; + + if (laccess(joined, F_OK) >= 0) { + *ret_path = TAKE_PTR(joined); + return 0; + } + + if (errno != ENOENT) + return -errno; + } + + return -ENOENT; +} diff --git a/src/basic/path-lookup.h b/src/basic/path-lookup.h new file mode 100644 index 0000000..1601787 --- /dev/null +++ b/src/basic/path-lookup.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "constants.h" +#include "macro.h" +#include "runtime-scope.h" + +typedef enum LookupPathsFlags { + LOOKUP_PATHS_EXCLUDE_GENERATED = 1 << 0, + LOOKUP_PATHS_TEMPORARY_GENERATED = 1 << 1, + LOOKUP_PATHS_SPLIT_USR = 1 << 2, /* Legacy, use ONLY for image payloads which might be old */ +} LookupPathsFlags; + +typedef struct LookupPaths { + /* Where we look for unit files. This includes the individual special paths below, but also any vendor + * supplied, static unit file paths. */ + char **search_path; + + /* Where we shall create or remove our installation symlinks, aka "configuration", and where the user/admin + * shall place their own unit files. */ + char *persistent_config; + char *runtime_config; + + /* Where units from a portable service image shall be placed. */ + char *persistent_attached; + char *runtime_attached; + + /* Where to place generated unit files (i.e. those a "generator" tool generated). Note the special semantics of + * this directory: the generators are flushed each time a "systemctl daemon-reload" is issued. The user should + * not alter these directories directly. */ + char *generator; + char *generator_early; + char *generator_late; + + /* Where to place transient unit files (i.e. those created dynamically via the bus API). Note the special + * semantics of this directory: all units created transiently have their unit files removed as the transient + * unit is unloaded. The user should not alter this directory directly.
*/ + char *transient; + + /* Where the snippets created by "systemctl set-property" are placed. Note that for transient units, the + * snippets are placed in the transient directory though (see above). The user should not alter this directory + * directly. */ + char *persistent_control; + char *runtime_control; + + /* The root directory prepended to all items above, or NULL */ + char *root_dir; + + /* A temporary directory when running in test mode, to be nuked */ + char *temporary_dir; +} LookupPaths; + +int lookup_paths_init(LookupPaths *lp, RuntimeScope scope, LookupPathsFlags flags, const char *root_dir); +int lookup_paths_init_or_warn(LookupPaths *lp, RuntimeScope scope, LookupPathsFlags flags, const char *root_dir); + +int xdg_user_dirs(char ***ret_config_dirs, char ***ret_data_dirs); +int xdg_user_runtime_dir(char **ret, const char *suffix); +int xdg_user_config_dir(char **ret, const char *suffix); +int xdg_user_data_dir(char **ret, const char *suffix); + +bool path_is_user_data_dir(const char *path); +bool path_is_user_config_dir(const char *path); + +void lookup_paths_log(LookupPaths *p); +void lookup_paths_free(LookupPaths *p); + +char **generator_binary_paths(RuntimeScope scope); +char **env_generator_binary_paths(RuntimeScope scope); + +#define NETWORK_DIRS ((const char* const*) CONF_PATHS_STRV("systemd/network")) +#define NETWORK_DIRS_NULSTR CONF_PATHS_NULSTR("systemd/network") + +#define PORTABLE_PROFILE_DIRS CONF_PATHS_NULSTR("systemd/portable/profile") +int find_portable_profile(const char *name, const char *unit, char **ret_path); diff --git a/src/basic/path-util.c b/src/basic/path-util.c new file mode 100644 index 0000000..6810bf6 --- /dev/null +++ b/src/basic/path-util.c @@ -0,0 +1,1434 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fnmatch.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "chase.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fs-util.h" +#include "glob-util.h"
+#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +/* Splits the ":"-separated list p and makes every entry absolute via path_strv_make_absolute_cwd(). On + * success the new list is returned in *ret. Returns -ENOMEM if splitting fails, or the error returned by + * path_strv_make_absolute_cwd(). */ +int path_split_and_make_absolute(const char *p, char ***ret) { + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(p); + assert(ret); + + l = strv_split(p, ":"); + if (!l) + return -ENOMEM; + + r = path_strv_make_absolute_cwd(l); + if (r < 0) + return r; + + *ret = TAKE_PTR(l); + return r; +} + +char* path_make_absolute(const char *p, const char *prefix) { + assert(p); + + /* Makes the path absolute by prepending the prefix, if specified and necessary: an already + * absolute path, or any path when the prefix is empty, is simply duplicated. Returns a newly + * allocated string. */ + + if (path_is_absolute(p) || isempty(prefix)) + return strdup(p); + + return path_join(prefix, p); +} + +/* Returns the current working directory in *ret (if ret is non-NULL). Fails with -ENOMEDIUM if the + * reported directory does not start with "/". */ +int safe_getcwd(char **ret) { + _cleanup_free_ char *cwd = NULL; + + cwd = get_current_dir_name(); + if (!cwd) + return negative_errno(); + + /* Let's make sure the directory is really absolute, to protect us from the logic behind + * CVE-2018-1000001 */ + if (cwd[0] != '/') + return -ENOMEDIUM; + + if (ret) + *ret = TAKE_PTR(cwd); + + return 0; +} + +int path_make_absolute_cwd(const char *p, char **ret) { + char *c; + int r; + + assert(p); + assert(ret); + + /* Similar to path_make_absolute(), but prefixes with the + * current working directory. Returns 0 and a newly allocated string in *ret on success, a + * negative errno otherwise. */ + + if (path_is_absolute(p)) + c = strdup(p); + else { + _cleanup_free_ char *cwd = NULL; + + r = safe_getcwd(&cwd); + if (r < 0) + return r; + + c = path_join(cwd, p); + } + if (!c) + return -ENOMEM; + + *ret = c; + return 0; +} + +int path_make_relative(const char *from, const char *to, char **ret) { + _cleanup_free_ char *result = NULL; + unsigned n_parents; + const char *f, *t; + int r, k; + char *p; + + assert(from); + assert(to); + assert(ret); + + /* Strips the common part, and adds ".." elements as necessary.
*/ + + if (!path_is_absolute(from) || !path_is_absolute(to)) + return -EINVAL; + + /* Walk both paths component by component for as long as the components are identical. */ + for (;;) { + r = path_find_first_component(&from, true, &f); + if (r < 0) + return r; + + k = path_find_first_component(&to, true, &t); + if (k < 0) + return k; + + if (r == 0) { + /* end of 'from' */ + if (k == 0) { + /* from and to are equivalent. */ + result = strdup("."); + if (!result) + return -ENOMEM; + } else { + /* 'to' is inside of 'from'. */ + r = path_simplify_alloc(t, &result); + if (r < 0) + return r; + + if (!path_is_valid(result)) + return -EINVAL; + } + + *ret = TAKE_PTR(result); + return 0; + } + + if (r != k || !strneq(f, t, r)) + break; + } + + /* If we're here, then 'from' has one or more remaining elements that need to + * be replaced with "..". The component in 'f' that just failed to match is the first of them, + * hence the counter starts at 1. */ + + for (n_parents = 1;; n_parents++) { + /* If this includes ".." we can't do a simple series of "..". */ + r = path_find_first_component(&from, false, &f); + if (r < 0) + return r; + if (r == 0) + break; + } + + if (isempty(t) && n_parents * 3 > PATH_MAX) + /* PATH_MAX is counted *with* the trailing NUL byte */ + return -EINVAL; + + /* Three bytes ("../") per parent. If 't' is empty the last slash is overwritten by the NUL + * terminator, otherwise we need room for 't' plus one extra byte for the terminator. */ + result = new(char, n_parents * 3 + !isempty(t) + strlen_ptr(t)); + if (!result) + return -ENOMEM; + + for (p = result; n_parents > 0; n_parents--) + p = mempcpy(p, "../", 3); + + if (isempty(t)) { + /* Remove trailing slash and terminate string. */ + *(--p) = '\0'; + *ret = TAKE_PTR(result); + return 0; + } + + strcpy(p, t); + + path_simplify(result); + + if (!path_is_valid(result)) + return -EINVAL; + + *ret = TAKE_PTR(result); + return 0; +} + +int path_make_relative_parent(const char *from_child, const char *to, char **ret) { + _cleanup_free_ char *from = NULL; + int r; + + assert(from_child); + assert(to); + assert(ret); + + /* Similar to path_make_relative(), but provides the relative path from the parent directory of + * 'from_child'. This may be useful when creating a relative symlink. + * + * E.g.
+ * - from = "/path/to/aaa", to = "/path/to/bbb" + * path_make_relative(from, to) = "../bbb" + * path_make_relative_parent(from, to) = "bbb" + * + * - from = "/path/to/aaa/bbb", to = "/path/to/ccc/ddd" + * path_make_relative(from, to) = "../../ccc/ddd" + * path_make_relative_parent(from, to) = "../ccc/ddd" + */ + + r = path_extract_directory(from_child, &from); + if (r < 0) + return r; + + return path_make_relative(from, to, ret); +} + +char* path_startswith_strv(const char *p, char **set) { + STRV_FOREACH(s, set) { + char *t; + + t = path_startswith(p, *s); + if (t) + return t; + } + + return NULL; +} + +int path_strv_make_absolute_cwd(char **l) { + int r; + + /* Goes through every item in the string list and makes it + * absolute. This works in place and won't rollback any + * changes on failure. */ + + STRV_FOREACH(s, l) { + char *t; + + r = path_make_absolute_cwd(*s, &t); + if (r < 0) + return r; + + path_simplify(t); + free_and_replace(*s, t); + } + + return 0; +} + +char** path_strv_resolve(char **l, const char *root) { + unsigned k = 0; + bool enomem = false; + int r; + + if (strv_isempty(l)) + return l; + + /* Goes through every item in the string list and canonicalize + * the path. This works in place and won't rollback any + * changes on failure. 
*/ + + STRV_FOREACH(s, l) { + _cleanup_free_ char *orig = NULL; + char *t, *u; + + if (!path_is_absolute(*s)) { + free(*s); + continue; + } + + if (root) { + orig = *s; + t = path_join(root, orig); + if (!t) { + enomem = true; + continue; + } + } else + t = *s; + + r = chase(t, root, 0, &u, NULL); + if (r == -ENOENT) { + if (root) { + u = TAKE_PTR(orig); + free(t); + } else + u = t; + } else if (r < 0) { + free(t); + + if (r == -ENOMEM) + enomem = true; + + continue; + } else if (root) { + char *x; + + free(t); + x = path_startswith(u, root); + if (x) { + /* restore the slash if it was lost */ + if (!startswith(x, "/")) + *(--x) = '/'; + + t = strdup(x); + free(u); + if (!t) { + enomem = true; + continue; + } + u = t; + } else { + /* canonicalized path goes outside of + * prefix, keep the original path instead */ + free_and_replace(u, orig); + } + } else + free(t); + + l[k++] = u; + } + + l[k] = NULL; + + if (enomem) + return NULL; + + return l; +} + +char** path_strv_resolve_uniq(char **l, const char *root) { + + if (strv_isempty(l)) + return l; + + if (!path_strv_resolve(l, root)) + return NULL; + + return strv_uniq(l); +} + +char* path_simplify_full(char *path, PathSimplifyFlags flags) { + bool add_slash = false, keep_trailing_slash, absolute, beginning = true; + char *f = path; + int r; + + /* Removes redundant inner and trailing slashes. Also removes unnecessary dots. + * Modifies the passed string in-place. + * + * ///foo//./bar/. becomes /foo/bar + * .//./foo//./bar/. becomes foo/bar + * /../foo/bar becomes /foo/bar + * /../foo/bar/.. becomes /foo/bar/.. + */ + + if (isempty(path)) + return path; + + keep_trailing_slash = FLAGS_SET(flags, PATH_SIMPLIFY_KEEP_TRAILING_SLASH) && endswith(path, "/"); + + absolute = path_is_absolute(path); + f += absolute; /* Keep leading /, if present. 
*/ + + for (const char *p = f;;) { + const char *e; + + r = path_find_first_component(&p, true, &e); + if (r == 0) + break; + + if (r > 0 && absolute && beginning && path_startswith(e, "..")) + /* If we're at the beginning of an absolute path, we can safely skip ".." */ + continue; + + beginning = false; + + if (add_slash) + *f++ = '/'; + + if (r < 0) { + /* if path is invalid, then refuse to simplify the remaining part. */ + memmove(f, p, strlen(p) + 1); + return path; + } + + memmove(f, e, r); + f += r; + + add_slash = true; + } + + /* Special rule, if we stripped everything, we need a "." for the current directory. */ + if (f == path) + *f++ = '.'; + + if (*(f-1) != '/' && keep_trailing_slash) + *f++ = '/'; + + *f = '\0'; + return path; +} + +char* path_startswith_full(const char *path, const char *prefix, bool accept_dot_dot) { + assert(path); + assert(prefix); + + /* Returns a pointer to the start of the first component after the parts matched by + * the prefix, iff + * - both paths are absolute or both paths are relative, + * and + * - each component in prefix in turn matches a component in path at the same position. + * An empty string will be returned when the prefix and path are equivalent. + * + * Returns NULL otherwise. + */ + + if ((path[0] == '/') != (prefix[0] == '/')) + return NULL; + + for (;;) { + const char *p, *q; + int r, k; + + r = path_find_first_component(&path, accept_dot_dot, &p); + if (r < 0) + return NULL; + + k = path_find_first_component(&prefix, accept_dot_dot, &q); + if (k < 0) + return NULL; + + if (k == 0) + return (char*) (p ?: path); + + if (r != k) + return NULL; + + if (!strneq(p, q, r)) + return NULL; + } +} + +int path_compare(const char *a, const char *b) { + int r; + + /* Order NULL before non-NULL */ + r = CMP(!!a, !!b); + if (r != 0) + return r; + + /* A relative path and an absolute path must not compare as equal. + * Which one is sorted before the other does not really matter. 
+ * Here a relative path is ordered before an absolute path. */ + r = CMP(path_is_absolute(a), path_is_absolute(b)); + if (r != 0) + return r; + + for (;;) { + const char *aa, *bb; + int j, k; + + j = path_find_first_component(&a, true, &aa); + k = path_find_first_component(&b, true, &bb); + + if (j < 0 || k < 0) { + /* When one of paths is invalid, order invalid path after valid one. */ + r = CMP(j < 0, k < 0); + if (r != 0) + return r; + + /* fallback to use strcmp() if both paths are invalid. */ + return strcmp(a, b); + } + + /* Order prefixes first: "/foo" before "/foo/bar" */ + if (j == 0) { + if (k == 0) + return 0; + return -1; + } + if (k == 0) + return 1; + + /* Alphabetical sort: "/foo/aaa" before "/foo/b" */ + r = memcmp(aa, bb, MIN(j, k)); + if (r != 0) + return r; + + /* Sort "/foo/a" before "/foo/aaa" */ + r = CMP(j, k); + if (r != 0) + return r; + } +} + +int path_compare_filename(const char *a, const char *b) { + _cleanup_free_ char *fa = NULL, *fb = NULL; + int r, j, k; + + /* Order NULL before non-NULL */ + r = CMP(!!a, !!b); + if (r != 0) + return r; + + j = path_extract_filename(a, &fa); + k = path_extract_filename(b, &fb); + + /* When one of paths is "." or root, then order it earlier. */ + r = CMP(j != -EADDRNOTAVAIL, k != -EADDRNOTAVAIL); + if (r != 0) + return r; + + /* When one of paths is invalid (or we get OOM), order invalid path after valid one. */ + r = CMP(j < 0, k < 0); + if (r != 0) + return r; + + /* fallback to use strcmp() if both paths are invalid. */ + if (j < 0) + return strcmp(a, b); + + return strcmp(fa, fb); +} + +char* path_extend_internal(char **x, ...) { + size_t sz, old_sz; + char *q, *nx; + const char *p; + va_list ap; + bool slash; + + /* Joins all listed strings until the sentinel and places a "/" between them unless the strings + * end/begin already with one so that it is unnecessary. Note that slashes which are already + * duplicate won't be removed. 
The string returned is hence always equal to or longer than the sum of + * the lengths of the individual strings. + * + * The first argument may be an already allocated string that is extended via realloc() if + * non-NULL. path_extend() and path_join() are macro wrappers around this function, making use of the + * first parameter to distinguish the two operations. + * + * Note: any listed empty string is simply skipped. This can be useful for concatenating strings of + * which some are optional. + * + * Examples: + * + * path_join("foo", "bar") → "foo/bar" + * path_join("foo/", "bar") → "foo/bar" + * path_join("", "foo", "", "bar", "") → "foo/bar" */ + + sz = old_sz = x ? strlen_ptr(*x) : 0; + va_start(ap, x); + while ((p = va_arg(ap, char*)) != POINTER_MAX) { + size_t add; + + if (isempty(p)) + continue; + + add = 1 + strlen(p); + if (sz > SIZE_MAX - add) { /* overflow check */ + va_end(ap); + return NULL; + } + + sz += add; + } + va_end(ap); + + nx = realloc(x ? *x : NULL, GREEDY_ALLOC_ROUND_UP(sz+1)); + if (!nx) + return NULL; + if (x) + *x = nx; + + if (old_sz > 0) + slash = nx[old_sz-1] == '/'; + else { + nx[old_sz] = 0; + slash = true; /* no need to generate a slash anymore */ + } + + q = nx + old_sz; + + va_start(ap, x); + while ((p = va_arg(ap, char*)) != POINTER_MAX) { + if (isempty(p)) + continue; + + if (!slash && p[0] != '/') + *(q++) = '/'; + + q = stpcpy(q, p); + slash = endswith(p, "/"); + } + va_end(ap); + + return nx; +} + +static int check_x_access(const char *path, int *ret_fd) { + _cleanup_close_ int fd = -EBADF; + int r; + + /* We need to use O_PATH because there may be executables for which we have only exec + * permissions, but not read (usually suid executables). */ + fd = open(path, O_PATH|O_CLOEXEC); + if (fd < 0) + return -errno; + + r = fd_verify_regular(fd); + if (r < 0) + return r; + + r = access_fd(fd, X_OK); + if (r == -ENOSYS) { + /* /proc is not mounted. Fallback to access(). 
*/ + if (access(path, X_OK) < 0) + return -errno; + } else if (r < 0) + return r; + + if (ret_fd) + *ret_fd = TAKE_FD(fd); + + return 0; +} + +/* Checks whether 'name' (resolved below 'root' if root is non-NULL) passes check_x_access(). On success + * optionally returns the absolute path in *ret_filename and an open fd in *ret_fd. Returns 0 on success, + * a negative errno otherwise. */ +static int find_executable_impl(const char *name, const char *root, char **ret_filename, int *ret_fd) { + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *path_name = NULL; + int r; + + assert(name); + + /* chase() is invoked only when root is not NULL, as using it regardless of the root value would + * alter the behavior of existing callers. For example, /bin/sleep would become /usr/bin/sleep + * when find_executable() is called. Hence chase() is only used when needed, to avoid unforeseen + * regressions or other complicated changes. */ + if (root) { + /* prefix root to name in case full paths are not specified */ + r = chase(name, root, CHASE_PREFIX_ROOT, &path_name, /* ret_fd= */ NULL); + if (r < 0) + return r; + + name = path_name; + } + + r = check_x_access(name, ret_fd ? &fd : NULL); + if (r < 0) + return r; + + if (ret_filename) { + r = path_make_absolute_cwd(name, ret_filename); + if (r < 0) + return r; + } + + if (ret_fd) + *ret_fd = TAKE_FD(fd); + + return 0; +} + +/* Looks up the executable 'name'. If 'name' is a path (per is_path()) it is checked directly. Otherwise + * it is searched for in the absolute entries of exec_search_path if that is non-NULL, and else in the + * entries of $PATH (only consulted if use_path_envvar is true) or DEFAULT_PATH. -EACCES failures of + * individual entries are skipped without being remembered; if nothing is found the last other error, + * or -ENOENT, is returned. */ +int find_executable_full( + const char *name, + const char *root, + char **exec_search_path, + bool use_path_envvar, + char **ret_filename, + int *ret_fd) { + + int last_error = -ENOENT, r = 0; + const char *p = NULL; + + assert(name); + + if (is_path(name)) + return find_executable_impl(name, root, ret_filename, ret_fd); + + if (use_path_envvar) + /* Plain getenv, not secure_getenv, because we want to actually allow the user to pick the + * binary.
*/ + p = getenv("PATH"); + if (!p) + p = DEFAULT_PATH; + + if (exec_search_path) { + STRV_FOREACH(element, exec_search_path) { + _cleanup_free_ char *full_path = NULL; + + if (!path_is_absolute(*element)) + continue; + + full_path = path_join(*element, name); + if (!full_path) + return -ENOMEM; + + r = find_executable_impl(full_path, root, ret_filename, ret_fd); + if (r < 0) { + if (r != -EACCES) + last_error = r; + continue; + } + return 0; + } + return last_error; + } + + /* Resolve a single-component name to a full path */ + for (;;) { + _cleanup_free_ char *element = NULL; + + r = extract_first_word(&p, &element, ":", EXTRACT_RELAX|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + if (!path_is_absolute(element)) + continue; + + if (!path_extend(&element, name)) + return -ENOMEM; + + r = find_executable_impl(element, root, ret_filename, ret_fd); + if (r < 0) { + /* PATH entries which we don't have access to are ignored, as per tradition. */ + if (r != -EACCES) + last_error = r; + continue; + } + + /* Found it! */ + return 0; + } + + return last_error; +} + +bool paths_check_timestamp(const char* const* paths, usec_t *timestamp, bool update) { + bool changed = false, originally_unset; + + assert(timestamp); + + if (!paths) + return false; + + originally_unset = *timestamp == 0; + + STRV_FOREACH(i, paths) { + struct stat stats; + usec_t u; + + if (stat(*i, &stats) < 0) + continue; + + u = timespec_load(&stats.st_mtim); + + /* check first */ + if (*timestamp >= u) + continue; + + log_debug(originally_unset ? "Loaded timestamp for '%s'." 
: "Timestamp of '%s' changed.", *i); + + /* update timestamp */ + if (update) { + *timestamp = u; + changed = true; + } else + return true; + } + + return changed; +} + +/* Returns 1 if 'executable' can be found and is usable, 0 if it cannot be found or is merely a symlink + * to "true" or /dev/null (i.e. a stub), and a negative errno on other failures. */ +static int executable_is_good(const char *executable) { + _cleanup_free_ char *p = NULL, *d = NULL; + int r; + + r = find_executable(executable, &p); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + /* An fsck that is linked to /bin/true is a non-existent fsck */ + + r = readlink_malloc(p, &d); + if (r == -EINVAL) /* not a symlink */ + return 1; + if (r < 0) + return r; + + /* Note: every entry needs its own comma, adjacent string literals are silently concatenated. */ + return !PATH_IN_SET(d, "true", + "/bin/true", + "/usr/bin/true", + "/dev/null"); +} + +int fsck_exists(void) { + return executable_is_good("fsck"); +} + +/* Checks whether a usable "fsck.<fstype>" helper exists. Returns -EINVAL for the pseudo type "auto", + * the result of fsck_exists() if that is zero or negative, and otherwise the result of + * executable_is_good() for the type-specific checker. */ +int fsck_exists_for_fstype(const char *fstype) { + const char *checker; + int r; + + assert(fstype); + + if (streq(fstype, "auto")) + return -EINVAL; + + r = fsck_exists(); + if (r <= 0) + return r; + + checker = strjoina("fsck.", fstype); + return executable_is_good(checker); +} + +/* Skips any leading run of "/" characters and "./" sequences and returns a pointer to what follows. */ +static const char* skip_slash_or_dot(const char *p) { + for (; !isempty(p); p++) { + if (*p == '/') + continue; + if (startswith(p, "./")) { + p++; + continue; + } + break; + } + return p; +} + +int path_find_first_component(const char **p, bool accept_dot_dot, const char **ret) { + const char *q, *first, *end_first, *next; + size_t len; + + assert(p); + + /* When a path is input, then returns the pointer to the first component and its length, and + * move the input pointer to the next component or nul. This skips both over any '/' + * immediately *before* and *after* the first component before returning.
+ * + * Examples + * Input: p: "//.//aaa///bbbbb/cc" + * Output: p: "bbbbb/cc" + * ret: "aaa///bbbbb/cc" + * return value: 3 (== strlen("aaa")) + * + * Input: p: "aaa//" + * Output: p: (pointer to NUL) + * ret: "aaa//" + * return value: 3 (== strlen("aaa")) + * + * Input: p: "/", ".", "" + * Output: p: (pointer to NUL) + * ret: NULL + * return value: 0 + * + * Input: p: NULL + * Output: p: NULL + * ret: NULL + * return value: 0 + * + * Input: p: "(too long component)" + * Output: return value: -EINVAL + * + * (when accept_dot_dot is false) + * Input: p: "//..//aaa///bbbbb/cc" + * Output: return value: -EINVAL + */ + + q = *p; + + first = skip_slash_or_dot(q); + if (isempty(first)) { + *p = first; + if (ret) + *ret = NULL; + return 0; + } + if (streq(first, ".")) { + /* Only a final "." is left: step over it so that *p points to the terminating NUL. */ + *p = first + 1; + if (ret) + *ret = NULL; + return 0; + } + + end_first = strchrnul(first, '/'); + len = end_first - first; + + if (len > NAME_MAX) + return -EINVAL; + if (!accept_dot_dot && len == 2 && first[0] == '.' && first[1] == '.') + return -EINVAL; + + next = skip_slash_or_dot(end_first); + + /* If all that follows is a final ".", skip it too, so that *p points to the terminating NUL. */ + *p = next + streq(next, "."); + if (ret) + *ret = first; + return len; +} + +/* Walks backwards from q towards the start of 'path', skipping "/" characters, a "." that directly + * follows a "/", and a "." at the very start of the path. Returns the position where it stopped, or + * NULL if q was or became NULL (presumably PTR_SUB1() yields NULL once the start of 'path' is reached; + * verify against its definition). */ +static const char* skip_slash_or_dot_backward(const char *path, const char *q) { + assert(path); + assert(!q || q >= path); + + for (; q; q = PTR_SUB1(q, path)) { + if (*q == '/') + continue; + if (q > path && strneq(q - 1, "/.", 2)) + continue; + if (q == path && *q == '.') + continue; + break; + } + return q; +} + +int path_find_last_component(const char *path, bool accept_dot_dot, const char **next, const char **ret) { + const char *q, *last_end, *last_begin; + size_t len; + + /* Similar to path_find_first_component(), but searches components from the end.
+ * + * Examples + * Input: path: "//.//aaa///bbbbb/cc//././" + * next: NULL + * Output: next: "/cc//././" + * ret: "cc//././" + * return value: 2 (== strlen("cc")) + * + * Input: path: "//.//aaa///bbbbb/cc//././" + * next: "/cc//././" + * Output: next: "///bbbbb/cc//././" + * ret: "bbbbb/cc//././" + * return value: 5 (== strlen("bbbbb")) + * + * Input: path: "//.//aaa///bbbbb/cc//././" + * next: "///bbbbb/cc//././" + * Output: next: "//.//aaa///bbbbb/cc//././" (next == path) + * ret: "aaa///bbbbb/cc//././" + * return value: 3 (== strlen("aaa")) + * + * Input: path: "/", ".", "", or NULL + * Output: next: equivalent to path + * ret: NULL + * return value: 0 + * + * Input: path: "(too long component)" + * Output: return value: -EINVAL + * + * (when accept_dot_dot is false) + * Input: path: "//..//aaa///bbbbb/cc/..//" + * Output: return value: -EINVAL + */ + + if (isempty(path)) { + if (next) + *next = path; + if (ret) + *ret = NULL; + return 0; + } + + if (next && *next) { + if (*next < path || *next > path + strlen(path)) + return -EINVAL; + if (*next == path) { + if (ret) + *ret = NULL; + return 0; + } + if (!IN_SET(**next, '\0', '/')) + return -EINVAL; + q = *next - 1; + } else + q = path + strlen(path) - 1; + + q = skip_slash_or_dot_backward(path, q); + if (!q || /* the root directory */ + (q == path && *q == '.')) { /* path is "." or "./" */ + if (next) + *next = path; + if (ret) + *ret = NULL; + return 0; + } + + last_end = q + 1; + + while (q && *q != '/') + q = PTR_SUB1(q, path); + + last_begin = q ? q + 1 : path; + len = last_end - last_begin; + + if (len > NAME_MAX) + return -EINVAL; + if (!accept_dot_dot && len == 2 && strneq(last_begin, "..", 2)) + return -EINVAL; + + if (next) { + q = skip_slash_or_dot_backward(path, q); + *next = q ? 
q + 1 : path; + } + + if (ret) + *ret = last_begin; + return len; +} + +const char* last_path_component(const char *path) { + + /* Finds the last component of the path, preserving the optional trailing slash that signifies a directory. + * + * a/b/c → c + * a/b/c/ → c/ + * x → x + * x/ → x/ + * /y → y + * /y/ → y/ + * / → / + * // → / + * /foo/a → a + * /foo/a/ → a/ + * + * Also, the empty string is mapped to itself. + * + * This is different than basename(), which returns "" when a trailing slash is present. + * + * This always succeeds (except if you pass NULL in which case it returns NULL, too). + */ + + size_t l, k; /* size_t, so that the result of strlen() is never truncated */ + + if (!path) + return NULL; + + l = k = strlen(path); + if (l == 0) /* special case — an empty string */ + return path; + + while (k > 0 && path[k-1] == '/') + k--; + + if (k == 0) /* the root directory */ + return path + l - 1; + + while (k > 0 && path[k-1] != '/') + k--; + + return path + k; +} + +int path_extract_filename(const char *path, char **ret) { + _cleanup_free_ char *a = NULL; + const char *c, *next = NULL; + int r; + + /* Extracts the filename part (i.e. right-most component) from a path, i.e. string that passes + * filename_is_valid(). A wrapper around path_find_last_component(), but eats up trailing + * slashes. Returns: + * + * -EINVAL → if the path is not valid + * -EADDRNOTAVAIL → if only a directory was specified, but no filename, i.e. the root dir + * itself or "." is specified + * -ENOMEM → no memory + * + * Returns >= 0 on success. If the input path has a trailing slash, returns O_DIRECTORY, to + * indicate the referenced file must be a directory. + * + * This function guarantees to return a fully valid filename, i.e. one that passes + * filename_is_valid() – this means "." and ".." are not accepted. 
*/ + + if (!path_is_valid(path)) + return -EINVAL; + + r = path_find_last_component(path, false, &next, &c); + if (r < 0) + return r; + if (r == 0) /* root directory */ + return -EADDRNOTAVAIL; + + if (ret) { /* ret is optional: only allocate the copy if the caller actually wants it */ + a = strndup(c, r); + if (!a) + return -ENOMEM; + + *ret = TAKE_PTR(a); + } + + return strlen(c) > (size_t) r ? O_DIRECTORY : 0; +} + +int path_extract_directory(const char *path, char **ret) { + _cleanup_free_ char *a = NULL; + const char *c, *next = NULL; + int r; + + /* The inverse of path_extract_filename(), i.e. returns the directory path prefix. Returns: + * + * -EINVAL → if the path is not valid + * -EDESTADDRREQ → if no directory was specified in the passed in path, i.e. only a filename was passed + * -EADDRNOTAVAIL → if the passed in parameter had no filename but did have a directory, i.e. + * the root dir itself or "." was specified + * -ENOMEM → no memory (surprise!) + * + * This function guarantees to return a fully valid path, i.e. one that passes path_is_valid(). + * + * ret may be NULL, in which case the path is only checked and nothing is returned. + */ + + r = path_find_last_component(path, false, &next, &c); + if (r < 0) + return r; + if (r == 0) /* empty or root */ + return isempty(path) ? -EINVAL : -EADDRNOTAVAIL; + if (next == path) { + if (*path != '/') /* filename only */ + return -EDESTADDRREQ; + + if (ret) { /* the only component sits directly below the root directory */ + a = strdup("/"); + if (!a) + return -ENOMEM; + *ret = TAKE_PTR(a); + } + + return 0; + } + + a = strndup(path, next - path); + if (!a) + return -ENOMEM; + + path_simplify(a); + + if (!path_is_valid(a)) + return -EINVAL; + + if (ret) + *ret = TAKE_PTR(a); + + return 0; +} + +bool filename_part_is_valid(const char *p) { + const char *e; + + /* Checks if the specified string is OK to be *part* of a filename. This is different from + * filename_is_valid() as "." and ".." and "" are OK by this call, but not by filename_is_valid(). 
*/ + + if (!p) + return false; + + e = strchrnul(p, '/'); + if (*e != 0) + return false; + + if (e - p > NAME_MAX) /* NAME_MAX is counted *without* the trailing NUL byte */ + return false; + + return true; +} + +bool filename_is_valid(const char *p) { + + if (isempty(p)) + return false; + + if (dot_or_dot_dot(p)) /* Yes, in this context we consider "." and ".." invalid */ + return false; + + return filename_part_is_valid(p); +} + +bool path_is_valid_full(const char *p, bool accept_dot_dot) { + if (isempty(p)) + return false; + + for (const char *e = p;;) { + int r; + + r = path_find_first_component(&e, accept_dot_dot, NULL); + if (r < 0) + return false; + + if (e - p >= PATH_MAX) /* Already reached the maximum length for a path? (PATH_MAX is counted + * *with* the trailing NUL byte) */ + return false; + if (*e == 0) /* End of string? Yay! */ + return true; + } +} + +bool path_is_normalized(const char *p) { + if (!path_is_safe(p)) + return false; + + if (streq(p, ".") || startswith(p, "./") || endswith(p, "/.") || strstr(p, "/./")) + return false; + + if (strstr(p, "//")) + return false; + + return true; +} + +int file_in_same_dir(const char *path, const char *filename, char **ret) { + _cleanup_free_ char *b = NULL; + int r; + + assert(path); + assert(filename); + assert(ret); + + /* This removes the last component of path and appends filename, unless the latter is absolute anyway + * or the former isn't */ + + if (path_is_absolute(filename)) + b = strdup(filename); + else { + _cleanup_free_ char *dn = NULL; + + r = path_extract_directory(path, &dn); + if (r == -EDESTADDRREQ) /* no path prefix */ + b = strdup(filename); + else if (r < 0) + return r; + else + b = path_join(dn, filename); + } + if (!b) + return -ENOMEM; + + *ret = TAKE_PTR(b); + return 0; +} + +bool hidden_or_backup_file(const char *filename) { + assert(filename); + + if (filename[0] == '.' 
|| + STR_IN_SET(filename, + "lost+found", + "aquota.user", + "aquota.group") || + endswith(filename, "~")) + return true; + + const char *dot = strrchr(filename, '.'); + if (!dot) + return false; + + /* Please, let's not add more entries to the list below. If external projects think it's a good idea + * to come up with always new suffixes and that everybody else should just adjust to that, then it + * really should be on them. Hence, in future, let's not add any more entries. Instead, let's ask + * those packages to instead adopt one of the generic suffixes/prefixes for hidden files or backups, + * possibly augmented with an additional string. Specifically: there's now: + * + * The generic suffixes "~" and ".bak" for backup files + * The generic prefix "." for hidden files + * + * Thus, if a new package manager "foopkg" wants its own set of ".foopkg-new", ".foopkg-old", + * ".foopkg-dist" or so registered, let's refuse that and ask them to use ".foopkg.new", + * ".foopkg.old" or ".foopkg~" instead. + */ + + return STR_IN_SET(dot + 1, + "rpmnew", + "rpmsave", + "rpmorig", + "dpkg-old", + "dpkg-new", + "dpkg-tmp", + "dpkg-dist", + "dpkg-bak", + "dpkg-backup", + "dpkg-remove", + "ucf-new", + "ucf-old", + "ucf-dist", + "swp", + "bak", + "old", + "new"); +} + +bool is_device_path(const char *path) { + + /* Returns true for paths that likely refer to a device, either by path in sysfs or to something in + * /dev. This accepts any path that starts with /dev/ or /sys/ and has something after that prefix. + * It does not actually resolve the path. + * + * Examples: + * /dev/sda, /dev/sda/foo, /sys/class, /dev/.., /sys/.., /./dev/foo → yes. + * /../dev/sda, /dev, /sys, /usr/path, /usr/../dev/sda → no. 
+ */ + + const char *p = PATH_STARTSWITH_SET(ASSERT_PTR(path), "/dev/", "/sys/"); + return !isempty(p); +} + +bool valid_device_node_path(const char *path) { + + /* Some superficial checks whether the specified path is a valid device node path, all without + * looking at the actual device node. */ + + if (!PATH_STARTSWITH_SET(path, "/dev/", "/run/systemd/inaccessible/")) + return false; + + if (endswith(path, "/")) /* can't be a device node if it ends in a slash */ + return false; + + return path_is_normalized(path); +} + +bool valid_device_allow_pattern(const char *path) { + assert(path); + + /* Like valid_device_node_path(), but also allows full-subsystem expressions like those accepted by + * DeviceAllow= and DeviceDeny=. */ + + if (STARTSWITH_SET(path, "block-", "char-")) + return true; + + return valid_device_node_path(path); +} + +bool dot_or_dot_dot(const char *path) { + if (!path) + return false; + if (path[0] != '.') + return false; + if (path[1] == 0) + return true; + if (path[1] != '.') + return false; + + return path[2] == 0; +} + +bool empty_or_root(const char *path) { + + /* For operations relative to some root directory, returns true if the specified root directory is + * redundant, i.e. either / or NULL or the empty string or any equivalent. 
*/ + + if (isempty(path)) + return true; + + return path_equal(path, "/"); +} + +bool path_strv_contains(char **l, const char *path) { + STRV_FOREACH(i, l) + if (path_equal(*i, path)) + return true; + + return false; +} + +bool prefixed_path_strv_contains(char **l, const char *path) { + STRV_FOREACH(i, l) { + const char *j = *i; + + if (*j == '-') + j++; + if (*j == '+') + j++; + if (path_equal(j, path)) + return true; + } + + return false; +} + +int path_glob_can_match(const char *pattern, const char *prefix, char **ret) { + assert(pattern); + assert(prefix); + + for (const char *a = pattern, *b = prefix;;) { + _cleanup_free_ char *g = NULL, *h = NULL; + const char *p, *q; + int r, s; + + r = path_find_first_component(&a, /* accept_dot_dot = */ false, &p); + if (r < 0) + return r; + + s = path_find_first_component(&b, /* accept_dot_dot = */ false, &q); + if (s < 0) + return s; + + if (s == 0) { + /* The pattern matches the prefix. */ + if (ret) { + char *t; + + t = path_join(prefix, p); + if (!t) + return -ENOMEM; + + *ret = t; + } + return true; + } + + if (r == 0) + break; + + if (r == s && strneq(p, q, r)) + continue; /* common component. Check next. */ + + g = strndup(p, r); + if (!g) + return -ENOMEM; + + if (!string_is_glob(g)) + break; + + /* We found a glob component. Check if the glob pattern matches the prefix component. */ + + h = strndup(q, s); + if (!h) + return -ENOMEM; + + r = fnmatch(g, h, 0); + if (r == FNM_NOMATCH) + break; + if (r != 0) /* Failure to process pattern? */ + return -EINVAL; + } + + /* The pattern does not match the prefix. 
*/ + if (ret) + *ret = NULL; + return false; +} diff --git a/src/basic/path-util.h b/src/basic/path-util.h new file mode 100644 index 0000000..6d943e9 --- /dev/null +++ b/src/basic/path-util.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +#define PATH_SPLIT_SBIN_BIN(x) x "sbin:" x "bin" +#define PATH_SPLIT_SBIN_BIN_NULSTR(x) x "sbin\0" x "bin\0" + +#define PATH_NORMAL_SBIN_BIN(x) x "bin" +#define PATH_NORMAL_SBIN_BIN_NULSTR(x) x "bin\0" + +#if HAVE_SPLIT_BIN +# define PATH_SBIN_BIN(x) PATH_SPLIT_SBIN_BIN(x) +# define PATH_SBIN_BIN_NULSTR(x) PATH_SPLIT_SBIN_BIN_NULSTR(x) +#else +# define PATH_SBIN_BIN(x) PATH_NORMAL_SBIN_BIN(x) +# define PATH_SBIN_BIN_NULSTR(x) PATH_NORMAL_SBIN_BIN_NULSTR(x) +#endif + +#define DEFAULT_PATH PATH_SBIN_BIN("/usr/local/") ":" PATH_SBIN_BIN("/usr/") +#define DEFAULT_PATH_NULSTR PATH_SBIN_BIN_NULSTR("/usr/local/") PATH_SBIN_BIN_NULSTR("/usr/") +#define DEFAULT_PATH_COMPAT PATH_SPLIT_SBIN_BIN("/usr/local/") ":" PATH_SPLIT_SBIN_BIN("/usr/") ":" PATH_SPLIT_SBIN_BIN("/") + +#ifndef DEFAULT_USER_PATH +# define DEFAULT_USER_PATH DEFAULT_PATH +#endif + +static inline bool is_path(const char *p) { + if (!p) /* A NULL pointer is definitely not a path */ + return false; + + return strchr(p, '/'); +} + +static inline bool path_is_absolute(const char *p) { + if (!p) /* A NULL pointer is definitely not an absolute path */ + return false; + + return p[0] == '/'; +} + +int path_split_and_make_absolute(const char *p, char ***ret); +char* path_make_absolute(const char *p, const char *prefix); +int safe_getcwd(char **ret); +int path_make_absolute_cwd(const char *p, char **ret); +int path_make_relative(const char *from, const char *to, char **ret); +int path_make_relative_parent(const char *from_child, const char *to, char **ret); +char* path_startswith_full(const char 
*path, const char *prefix, bool accept_dot_dot) _pure_; +static inline char* path_startswith(const char *path, const char *prefix) { + return path_startswith_full(path, prefix, true); +} + +int path_compare(const char *a, const char *b) _pure_; +static inline bool path_equal(const char *a, const char *b) { + return path_compare(a, b) == 0; +} + +int path_compare_filename(const char *a, const char *b); +static inline bool path_equal_filename(const char *a, const char *b) { + return path_compare_filename(a, b) == 0; +} + +static inline bool path_equal_or_inode_same(const char *a, const char *b, int flags) { + return path_equal(a, b) || inode_same(a, b, flags) > 0; +} + +char* path_extend_internal(char **x, ...); +#define path_extend(x, ...) path_extend_internal(x, __VA_ARGS__, POINTER_MAX) +#define path_join(...) path_extend_internal(NULL, __VA_ARGS__, POINTER_MAX) + +typedef enum PathSimplifyFlags { + PATH_SIMPLIFY_KEEP_TRAILING_SLASH = 1 << 0, +} PathSimplifyFlags; + +char* path_simplify_full(char *path, PathSimplifyFlags flags); +static inline char* path_simplify(char *path) { + return path_simplify_full(path, 0); +} + +static inline int path_simplify_alloc(const char *path, char **ret) { + assert(ret); + + if (!path) { + *ret = NULL; + return 0; + } + + char *t = strdup(path); + if (!t) + return -ENOMEM; + + *ret = path_simplify(t); + return 0; +} + +static inline bool path_equal_ptr(const char *a, const char *b) { + return !!a == !!b && (!a || path_equal(a, b)); +} + +/* Note: the search terminates on the first NULL item. */ +#define PATH_IN_SET(p, ...) path_strv_contains(STRV_MAKE(__VA_ARGS__), p) + +char* path_startswith_strv(const char *p, char **set); +#define PATH_STARTSWITH_SET(p, ...) 
path_startswith_strv(p, STRV_MAKE(__VA_ARGS__)) + +int path_strv_make_absolute_cwd(char **l); +char** path_strv_resolve(char **l, const char *root); +char** path_strv_resolve_uniq(char **l, const char *root); + +int find_executable_full(const char *name, const char *root, char **exec_search_path, bool use_path_envvar, char **ret_filename, int *ret_fd); +static inline int find_executable(const char *name, char **ret_filename) { + return find_executable_full(name, /* root= */ NULL, NULL, true, ret_filename, NULL); +} + +bool paths_check_timestamp(const char* const* paths, usec_t *paths_ts_usec, bool update); + +int fsck_exists(void); +int fsck_exists_for_fstype(const char *fstype); + +/* Iterates through the path prefixes of the specified path, going up + * the tree, to root. Also returns "" (and not "/"!) for the root + * directory. Excludes the specified directory itself */ +#define PATH_FOREACH_PREFIX(prefix, path) \ + for (char *_slash = ({ \ + path_simplify(strcpy(prefix, path)); \ + streq(prefix, "/") ? NULL : strrchr(prefix, '/'); \ + }); \ + _slash && ((*_slash = 0), true); \ + _slash = strrchr((prefix), '/')) + +/* Same as PATH_FOREACH_PREFIX but also includes the specified path itself */ +#define PATH_FOREACH_PREFIX_MORE(prefix, path) \ + for (char *_slash = ({ \ + path_simplify(strcpy(prefix, path)); \ + if (streq(prefix, "/")) \ + prefix[0] = 0; \ + strrchr(prefix, 0); \ + }); \ + _slash && ((*_slash = 0), true); \ + _slash = strrchr((prefix), '/')) + +/* Similar to path_join(), but only works for two components, and only the first one may be NULL and returns + * an alloca() buffer, or possibly a const pointer into the path parameter. 
*/ +/* DEPRECATED: use path_join() instead */ +#define prefix_roota(root, path) \ + ({ \ + const char* _path = (path), *_root = (root), *_ret; \ + char *_p, *_n; \ + size_t _l; \ + while (_path[0] == '/' && _path[1] == '/') \ + _path ++; \ + if (isempty(_root)) \ + _ret = _path; \ + else { \ + _l = strlen(_root) + 1 + strlen(_path) + 1; \ + _n = newa(char, _l); \ + _p = stpcpy(_n, _root); \ + while (_p > _n && _p[-1] == '/') \ + _p--; \ + if (_path[0] != '/') \ + *(_p++) = '/'; \ + strcpy(_p, _path); \ + _ret = _n; \ + } \ + _ret; \ + }) + +int path_find_first_component(const char **p, bool accept_dot_dot, const char **ret); +int path_find_last_component(const char *path, bool accept_dot_dot, const char **next, const char **ret); +const char* last_path_component(const char *path); +int path_extract_filename(const char *path, char **ret); +int path_extract_directory(const char *path, char **ret); + +bool filename_part_is_valid(const char *p) _pure_; +bool filename_is_valid(const char *p) _pure_; +bool path_is_valid_full(const char *p, bool accept_dot_dot) _pure_; +static inline bool path_is_valid(const char *p) { + return path_is_valid_full(p, /* accept_dot_dot= */ true); +} +static inline bool path_is_safe(const char *p) { + return path_is_valid_full(p, /* accept_dot_dot= */ false); +} +bool path_is_normalized(const char *p) _pure_; + +int file_in_same_dir(const char *path, const char *filename, char **ret); + +bool hidden_or_backup_file(const char *filename) _pure_; + +bool is_device_path(const char *path); + +bool valid_device_node_path(const char *path); +bool valid_device_allow_pattern(const char *path); + +bool dot_or_dot_dot(const char *path); + +static inline const char *skip_dev_prefix(const char *p) { + const char *e; + + /* Drop any /dev prefix if there is any */ + + e = path_startswith(p, "/dev/"); + + return e ?: p; +} + +bool empty_or_root(const char *path); +static inline const char* empty_to_root(const char *path) { + return isempty(path) ? 
"/" : path; +} + +bool path_strv_contains(char **l, const char *path); +bool prefixed_path_strv_contains(char **l, const char *path); + +int path_glob_can_match(const char *pattern, const char *prefix, char **ret); diff --git a/src/basic/pcapng.h b/src/basic/pcapng.h new file mode 100644 index 0000000..57c3af5 --- /dev/null +++ b/src/basic/pcapng.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* + * For details about the file format see RFC: + * https://www.ietf.org/id/draft-tuexen-opsawg-pcapng-03.html + * and + * https://github.com/pcapng/pcapng/ + */ +enum pcapng_block_types { + PCAPNG_INTERFACE_BLOCK = 1, + PCAPNG_PACKET_BLOCK, /* Obsolete */ + PCAPNG_SIMPLE_PACKET_BLOCK, + PCAPNG_NAME_RESOLUTION_BLOCK, + PCAPNG_INTERFACE_STATS_BLOCK, + PCAPNG_ENHANCED_PACKET_BLOCK, + + PCAPNG_SECTION_BLOCK = 0x0A0D0D0A, +}; + +struct pcapng_option { + uint16_t code; + uint16_t length; + uint8_t data[]; +}; + +#define PCAPNG_BYTE_ORDER_MAGIC 0x1A2B3C4D +#define PCAPNG_MAJOR_VERS 1 +#define PCAPNG_MINOR_VERS 0 + +enum pcapng_opt { + PCAPNG_OPT_END = 0, + PCAPNG_OPT_COMMENT = 1, +}; + +struct pcapng_section { + uint32_t block_type; + uint32_t block_length; + uint32_t byte_order_magic; + uint16_t major_version; + uint16_t minor_version; + uint64_t section_length; +}; + +enum pcapng_section_opt { + PCAPNG_SHB_HARDWARE = 2, + PCAPNG_SHB_OS = 3, + PCAPNG_SHB_USERAPPL = 4, +}; + +struct pcapng_interface_block { + uint32_t block_type; /* 1 */ + uint32_t block_length; + uint16_t link_type; + uint16_t reserved; + uint32_t snap_len; +}; + +enum pcapng_interface_options { + PCAPNG_IFB_NAME = 2, + PCAPNG_IFB_DESCRIPTION, + PCAPNG_IFB_IPV4ADDR, + PCAPNG_IFB_IPV6ADDR, + PCAPNG_IFB_MACADDR, + PCAPNG_IFB_EUIADDR, + PCAPNG_IFB_SPEED, + PCAPNG_IFB_TSRESOL, + PCAPNG_IFB_TZONE, + PCAPNG_IFB_FILTER, + PCAPNG_IFB_OS, + PCAPNG_IFB_FCSLEN, + PCAPNG_IFB_TSOFFSET, + PCAPNG_IFB_HARDWARE, +}; + +struct pcapng_enhance_packet_block { + uint32_t block_type; /* 6 */ + 
uint32_t block_length; + uint32_t interface_id; + uint32_t timestamp_hi; + uint32_t timestamp_lo; + uint32_t capture_length; + uint32_t original_length; +}; + +/* Flags values */ +#define PCAPNG_IFB_INBOUND 0b01 +#define PCAPNG_IFB_OUTBOUND 0b10 + +enum pcapng_epb_options { + PCAPNG_EPB_FLAGS = 2, + PCAPNG_EPB_HASH, + PCAPNG_EPB_DROPCOUNT, + PCAPNG_EPB_PACKETID, + PCAPNG_EPB_QUEUE, + PCAPNG_EPB_VERDICT, +}; + +struct pcapng_statistics_block { + uint32_t block_type; /* 5 */ + uint32_t block_length; + uint32_t interface_id; + uint32_t timestamp_hi; + uint32_t timestamp_lo; +}; + +enum pcapng_isb_options { + PCAPNG_ISB_STARTTIME = 2, + PCAPNG_ISB_ENDTIME, + PCAPNG_ISB_IFRECV, + PCAPNG_ISB_IFDROP, + PCAPNG_ISB_FILTERACCEPT, + PCAPNG_ISB_OSDROP, + PCAPNG_ISB_USRDELIV, +}; diff --git a/src/basic/percent-util.c b/src/basic/percent-util.c new file mode 100644 index 0000000..cab9d0e --- /dev/null +++ b/src/basic/percent-util.c @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "percent-util.h" +#include "string-util.h" +#include "parse-util.h" + +static int parse_parts_value_whole(const char *p, const char *symbol) { + const char *pc, *n; + int r, v; + + pc = endswith(p, symbol); + if (!pc) + return -EINVAL; + + n = strndupa_safe(p, pc - p); + r = safe_atoi(n, &v); + if (r < 0) + return r; + if (v < 0) + return -ERANGE; + + return v; +} + +static int parse_parts_value_with_tenths_place(const char *p, const char *symbol) { + const char *pc, *dot, *n; + int r, q, v; + + pc = endswith(p, symbol); + if (!pc) + return -EINVAL; + + dot = memchr(p, '.', pc - p); + if (dot) { + if (dot + 2 != pc) + return -EINVAL; + if (dot[1] < '0' || dot[1] > '9') + return -EINVAL; + q = dot[1] - '0'; + n = strndupa_safe(p, dot - p); + } else { + q = 0; + n = strndupa_safe(p, pc - p); + } + r = safe_atoi(n, &v); + if (r < 0) + return r; + if (v < 0) + return -ERANGE; + if (v > (INT_MAX - q) / 10) + return -ERANGE; + + v = v * 10 + q; + return v; +} + +static int 
parse_parts_value_with_hundredths_place(const char *p, const char *symbol) { + const char *pc, *dot, *n; + int r, q, v; + + pc = endswith(p, symbol); + if (!pc) + return -EINVAL; + + dot = memchr(p, '.', pc - p); + if (dot) { + if (dot + 3 == pc) { + /* Support two places after the dot */ + + if (dot[1] < '0' || dot[1] > '9' || dot[2] < '0' || dot[2] > '9') + return -EINVAL; + q = (dot[1] - '0') * 10 + (dot[2] - '0'); + + } else if (dot + 2 == pc) { + /* Support one place after the dot */ + + if (dot[1] < '0' || dot[1] > '9') + return -EINVAL; + q = (dot[1] - '0') * 10; + } else + /* We do not support zero or more than two places */ + return -EINVAL; + + n = strndupa_safe(p, dot - p); + } else { + q = 0; + n = strndupa_safe(p, pc - p); + } + r = safe_atoi(n, &v); + if (r < 0) + return r; + if (v < 0) + return -ERANGE; + if (v > (INT_MAX - q) / 100) + return -ERANGE; + + v = v * 100 + q; + return v; +} + +int parse_percent_unbounded(const char *p) { + return parse_parts_value_whole(p, "%"); +} + +int parse_percent(const char *p) { + int v; + + v = parse_percent_unbounded(p); + if (v > 100) + return -ERANGE; + + return v; +} + +int parse_permille_unbounded(const char *p) { + const char *pm; + + pm = endswith(p, "‰"); + if (pm) + return parse_parts_value_whole(p, "‰"); + + return parse_parts_value_with_tenths_place(p, "%"); +} + +int parse_permille(const char *p) { + int v; + + v = parse_permille_unbounded(p); + if (v > 1000) + return -ERANGE; + + return v; +} + +int parse_permyriad_unbounded(const char *p) { + const char *pm; + + pm = endswith(p, "‱"); + if (pm) + return parse_parts_value_whole(p, "‱"); + + pm = endswith(p, "‰"); + if (pm) + return parse_parts_value_with_tenths_place(p, "‰"); + + return parse_parts_value_with_hundredths_place(p, "%"); +} + +int parse_permyriad(const char *p) { + int v; + + v = parse_permyriad_unbounded(p); + if (v > 10000) + return -ERANGE; + + return v; +} diff --git a/src/basic/percent-util.h b/src/basic/percent-util.h new file 
mode 100644 index 0000000..e975d6e --- /dev/null +++ b/src/basic/percent-util.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" + +int parse_percent_unbounded(const char *p); +int parse_percent(const char *p); + +int parse_permille_unbounded(const char *p); +int parse_permille(const char *p); + +int parse_permyriad_unbounded(const char *p); +int parse_permyriad(const char *p); + +/* Some macro-like helpers that convert a percent/permille/permyriad value (as parsed by parse_percent()) to + * a value relative to 100% == 2^32-1. Rounds to closest. */ +static inline uint32_t UINT32_SCALE_FROM_PERCENT(int percent) { + assert_cc(INT_MAX <= UINT32_MAX); + + return (uint32_t) (((uint64_t) CLAMP(percent, 0, 100) * UINT32_MAX + 50) / 100U); +} + +static inline uint32_t UINT32_SCALE_FROM_PERMILLE(int permille) { + return (uint32_t) (((uint64_t) CLAMP(permille, 0, 1000) * UINT32_MAX + 500) / 1000U); +} + +static inline uint32_t UINT32_SCALE_FROM_PERMYRIAD(int permyriad) { + return (uint32_t) (((uint64_t) CLAMP(permyriad, 0, 10000) * UINT32_MAX + 5000) / 10000U); +} + +static inline int UINT32_SCALE_TO_PERCENT(uint32_t scale) { + uint32_t u; + + u = (uint32_t) ((((uint64_t) scale) * 100U + UINT32_MAX/2) / UINT32_MAX); + if (u > INT_MAX) + return -ERANGE; + + return (int) u; +} + +static inline int UINT32_SCALE_TO_PERMILLE(uint32_t scale) { + uint32_t u; + + u = (uint32_t) ((((uint64_t) scale) * 1000U + UINT32_MAX/2) / UINT32_MAX); + if (u > INT_MAX) + return -ERANGE; + + return (int) u; +} + +static inline int UINT32_SCALE_TO_PERMYRIAD(uint32_t scale) { + uint32_t u; + + u = (uint32_t) ((((uint64_t) scale) * 10000U + UINT32_MAX/2) / UINT32_MAX); + if (u > INT_MAX) + return -ERANGE; + + return (int) u; +} + +#define PERMYRIAD_AS_PERCENT_FORMAT_STR "%i.%02i%%" +#define PERMYRIAD_AS_PERCENT_FORMAT_VAL(x) ((x)/100), ((x)%100) diff --git a/src/basic/pidref.c b/src/basic/pidref.c new file mode 100644 index 
0000000..69b5cad --- /dev/null +++ b/src/basic/pidref.c @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "errno-util.h" +#include "fd-util.h" +#include "missing_syscall.h" +#include "parse-util.h" +#include "pidref.h" +#include "process-util.h" +#include "signal-util.h" + +int pidref_set_pid(PidRef *pidref, pid_t pid) { + int fd; + + assert(pidref); + + if (pid < 0) + return -ESRCH; + if (pid == 0) + pid = getpid_cached(); + + fd = pidfd_open(pid, 0); + if (fd < 0) { + /* Graceful fallback in case the kernel doesn't support pidfds or is out of fds */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno) && !ERRNO_IS_RESOURCE(errno)) + return -errno; + + fd = -EBADF; + } + + *pidref = (PidRef) { + .fd = fd, + .pid = pid, + }; + + return 0; +} + +int pidref_set_pidstr(PidRef *pidref, const char *pid) { + pid_t nr; + int r; + + assert(pidref); + + r = parse_pid(pid, &nr); + if (r < 0) + return r; + + return pidref_set_pid(pidref, nr); +} + +int pidref_set_pidfd(PidRef *pidref, int fd) { + int r; + + assert(pidref); + + if (fd < 0) + return -EBADF; + + int fd_copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (fd_copy < 0) { + pid_t pid; + + if (!ERRNO_IS_RESOURCE(errno)) + return -errno; + + /* Graceful fallback if we are out of fds */ + r = pidfd_get_pid(fd, &pid); + if (r < 0) + return r; + + *pidref = PIDREF_MAKE_FROM_PID(pid); + return 0; + } + + return pidref_set_pidfd_consume(pidref, fd_copy); +} + +int pidref_set_pidfd_take(PidRef *pidref, int fd) { + pid_t pid; + int r; + + assert(pidref); + + if (fd < 0) + return -EBADF; + + r = pidfd_get_pid(fd, &pid); + if (r < 0) + return r; + + *pidref = (PidRef) { + .fd = fd, + .pid = pid, + }; + + return 0; +} + +int pidref_set_pidfd_consume(PidRef *pidref, int fd) { + int r; + + r = pidref_set_pidfd_take(pidref, fd); + if (r < 0) + safe_close(fd); + + return r; +} + +void pidref_done(PidRef *pidref) { + assert(pidref); + + *pidref = (PidRef) { + .fd = safe_close(pidref->fd), + }; +} 
+ +PidRef *pidref_free(PidRef *pidref) { + /* Regularly, this is an embedded structure. But sometimes we want it on the heap too */ + if (!pidref) + return NULL; + /* Release what the structure references first, then the heap object itself */ + pidref_done(pidref); + return mfree(pidref); +} + +int pidref_dup(const PidRef *pidref, PidRef **ret) { + _cleanup_close_ int dup_fd = -EBADF; + pid_t dup_pid = 0; + + assert(ret); + + /* Allocates a new PidRef on the heap, making it a copy of the specified pidref. This does not try to + * acquire a pidfd if we don't have one yet! + * + * If NULL is passed we'll generate a PidRef that refers to no process. This makes it easy to copy + * pidref fields that might or might not reference a process yet. */ + + if (pidref) { + if (pidref->fd >= 0) { + dup_fd = fcntl(pidref->fd, F_DUPFD_CLOEXEC, 3); + if (dup_fd < 0) { + if (!ERRNO_IS_RESOURCE(errno)) + return -errno; + /* Resource shortage: continue without a pidfd, the copy then carries the PID only */ + dup_fd = -EBADF; + } + } + + if (pidref->pid > 0) + dup_pid = pidref->pid; + } + + PidRef *dup_pidref = new(PidRef, 1); + if (!dup_pidref) + return -ENOMEM; + + *dup_pidref = (PidRef) { + .fd = TAKE_FD(dup_fd), + .pid = dup_pid, + }; + + *ret = TAKE_PTR(dup_pidref); + return 0; +} + +int pidref_new_from_pid(pid_t pid, PidRef **ret) { + _cleanup_(pidref_freep) PidRef *n = 0; + int r; + /* Allocates a PidRef on the heap and initializes it for 'pid' via pidref_set_pid(). *ret is only written on success. */ + assert(ret); + + if (pid < 0) + return -ESRCH; + + n = new(PidRef, 1); + if (!n) + return -ENOMEM; + + *n = PIDREF_NULL; + + r = pidref_set_pid(n, pid); + if (r < 0) + return r; + + *ret = TAKE_PTR(n); + return 0; +} + +int pidref_kill(const PidRef *pidref, int sig) { + /* Delivers 'sig' through the pidfd if we hold one, otherwise through the numeric PID. Returns -ESRCH if neither is set. */ + if (!pidref) + return -ESRCH; + + if (pidref->fd >= 0) + return RET_NERRNO(pidfd_send_signal(pidref->fd, sig, NULL, 0)); + + if (pidref->pid > 0) + return RET_NERRNO(kill(pidref->pid, sig)); + + return -ESRCH; +} + +int pidref_kill_and_sigcont(const PidRef *pidref, int sig) { + int r; + /* Sends 'sig' and afterwards, unless 'sig' is SIGCONT or SIGKILL, a SIGCONT whose result is ignored. */ + r = pidref_kill(pidref, sig); + if (r < 0) + return r; + + if (!IN_SET(sig, SIGCONT, SIGKILL)) + (void) pidref_kill(pidref, SIGCONT); + + return 0; +} + +int pidref_sigqueue(const PidRef *pidref, int sig, 
int value) { + /* Like pidref_kill(), but passes 'value' along as the integer payload of the queued signal. */ + if (!pidref) + return -ESRCH; + + if (pidref->fd >= 0) { + siginfo_t si; + + /* We can't use structured initialization here, since the structure contains various unions + * and these fields lie in overlapping (carefully aligned) unions that LLVM is allergic to + * allow assignments to */ + zero(si); + si.si_signo = sig; + si.si_code = SI_QUEUE; + si.si_pid = getpid_cached(); + si.si_uid = getuid(); + si.si_value.sival_int = value; + + return RET_NERRNO(pidfd_send_signal(pidref->fd, sig, &si, 0)); + } + + if (pidref->pid > 0) + return RET_NERRNO(sigqueue(pidref->pid, sig, (const union sigval) { .sival_int = value })); + + return -ESRCH; +} + +int pidref_verify(const PidRef *pidref) { + int r; + + /* This is a helper that is supposed to be called after reading information from procfs via a + * PidRef. It ensures that the PID we track still matches the PIDFD we pin. If this value differs + * after a procfs read, we might have read the data from a recycled PID. */ + + if (!pidref_is_set(pidref)) + return -ESRCH; + + if (pidref->pid == 1) + return 1; /* PID 1 can never go away, hence never be recycled to a different process → return 1 */ + + if (pidref->fd < 0) + return 0; /* If we don't have a pidfd we cannot validate it, hence we assume it's all OK → return 0 */ + + r = pidfd_verify_pid(pidref->fd, pidref->pid); + if (r < 0) + return r; + + return 1; /* We have a pidfd and it still points to the PID we have, hence all is *really* OK → return 1 */ +} + +bool pidref_is_self(const PidRef *pidref) { /* Only the numeric PID is compared with our own; the pidfd is not looked at */ + if (!pidref) + return false; + + return pidref->pid == getpid_cached(); +} + /* Hashing and ordering below are keyed on the pid field only; the fd is not taken into account */ +static void pidref_hash_func(const PidRef *pidref, struct siphash *state) { + siphash24_compress(&pidref->pid, sizeof(pidref->pid), state); +} + +static int pidref_compare_func(const PidRef *a, const PidRef *b) { + return CMP(a->pid, b->pid); +} + +DEFINE_HASH_OPS(pidref_hash_ops, PidRef, pidref_hash_func, pidref_compare_func); +
+DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(pidref_hash_ops_free, + PidRef, pidref_hash_func, pidref_compare_func, + pidref_free); diff --git a/src/basic/pidref.h b/src/basic/pidref.h new file mode 100644 index 0000000..dada069 --- /dev/null +++ b/src/basic/pidref.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +/* An embeddable structure carrying a reference to a process. Supposed to be used when tracking processes continuously. */ +typedef struct PidRef { + pid_t pid; /* always valid */ + int fd; /* only valid if pidfd are available in the kernel, and we manage to get an fd */ +} PidRef; + +#define PIDREF_NULL (const PidRef) { .fd = -EBADF } + +/* Turns a pid_t into a PidRef structure on-the-fly *without* acquiring a pidfd for it. (As opposed to + * pidref_set_pid() which does so *with* acquiring one, see below) */ +#define PIDREF_MAKE_FROM_PID(x) (PidRef) { .pid = (x), .fd = -EBADF } + +static inline bool pidref_is_set(const PidRef *pidref) { + return pidref && pidref->pid > 0; +} + +static inline bool pidref_equal(const PidRef *a, const PidRef *b) { + + if (pidref_is_set(a)) { + if (!pidref_is_set(b)) + return false; + + return a->pid == b->pid; + } + + return !pidref_is_set(b); +} + +/* This turns a pid_t into a PidRef structure, and acquires a pidfd for it, if possible. (As opposed to + * PIDREF_MAKE_FROM_PID() above, which does not acquire a pidfd.) 
*/ +int pidref_set_pid(PidRef *pidref, pid_t pid); +int pidref_set_pidstr(PidRef *pidref, const char *pid); +int pidref_set_pidfd(PidRef *pidref, int fd); +int pidref_set_pidfd_take(PidRef *pidref, int fd); /* takes ownership of the passed pidfd on success*/ +int pidref_set_pidfd_consume(PidRef *pidref, int fd); /* takes ownership of the passed pidfd in both success and failure */ + +static inline int pidref_set_self(PidRef *pidref) { + return pidref_set_pid(pidref, 0); +} + +bool pidref_is_self(const PidRef *pidref); + +void pidref_done(PidRef *pidref); +PidRef *pidref_free(PidRef *pidref); +DEFINE_TRIVIAL_CLEANUP_FUNC(PidRef*, pidref_free); + +int pidref_dup(const PidRef *pidref, PidRef **ret); + +int pidref_new_from_pid(pid_t pid, PidRef **ret); + +int pidref_kill(const PidRef *pidref, int sig); +int pidref_kill_and_sigcont(const PidRef *pidref, int sig); +int pidref_sigqueue(const PidRef *pidfref, int sig, int value); + +int pidref_verify(const PidRef *pidref); + +#define TAKE_PIDREF(p) TAKE_GENERIC((p), PidRef, PIDREF_NULL) + +extern const struct hash_ops pidref_hash_ops; +extern const struct hash_ops pidref_hash_ops_free; /* Has destructor call for pidref_free(), i.e. expects heap allocated PidRef as keys */ diff --git a/src/basic/prioq.c b/src/basic/prioq.c new file mode 100644 index 0000000..5fbb999 --- /dev/null +++ b/src/basic/prioq.c @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* + * Priority Queue + * The prioq object implements a priority queue. That is, it orders objects by + * their priority and allows O(1) access to the object with the highest + * priority. Insertion and removal are Θ(log n). Optionally, the caller can + * provide a pointer to an index which will be kept up-to-date by the prioq. + * + * The underlying algorithm used in this implementation is a Heap. 
+ */ + +#include +#include + +#include "alloc-util.h" +#include "hashmap.h" +#include "prioq.h" + +struct prioq_item { + void *data; + unsigned *idx; +}; + +struct Prioq { + compare_func_t compare_func; + unsigned n_items, n_allocated; + + struct prioq_item *items; +}; + +Prioq *prioq_new(compare_func_t compare_func) { + Prioq *q; + + q = new(Prioq, 1); + if (!q) + return q; + + *q = (Prioq) { + .compare_func = compare_func, + }; + + return q; +} + +Prioq* prioq_free(Prioq *q) { + if (!q) + return NULL; + + free(q->items); + return mfree(q); +} + +int prioq_ensure_allocated(Prioq **q, compare_func_t compare_func) { + assert(q); + + if (*q) + return 0; + + *q = prioq_new(compare_func); + if (!*q) + return -ENOMEM; + + return 0; +} + +static void swap(Prioq *q, unsigned j, unsigned k) { + assert(q); + assert(j < q->n_items); + assert(k < q->n_items); + + assert(!q->items[j].idx || *(q->items[j].idx) == j); + assert(!q->items[k].idx || *(q->items[k].idx) == k); + + SWAP_TWO(q->items[j].data, q->items[k].data); + SWAP_TWO(q->items[j].idx, q->items[k].idx); + + if (q->items[j].idx) + *q->items[j].idx = j; + + if (q->items[k].idx) + *q->items[k].idx = k; +} + +static unsigned shuffle_up(Prioq *q, unsigned idx) { + assert(q); + assert(idx < q->n_items); + + while (idx > 0) { + unsigned k; + + k = (idx-1)/2; + + if (q->compare_func(q->items[k].data, q->items[idx].data) <= 0) + break; + + swap(q, idx, k); + idx = k; + } + + return idx; +} + +static unsigned shuffle_down(Prioq *q, unsigned idx) { + assert(q); + + for (;;) { + unsigned j, k, s; + + k = (idx+1)*2; /* right child */ + j = k-1; /* left child */ + + if (j >= q->n_items) + break; + + if (q->compare_func(q->items[j].data, q->items[idx].data) < 0) + + /* So our left child is smaller than we are, let's + * remember this fact */ + s = j; + else + s = idx; + + if (k < q->n_items && + q->compare_func(q->items[k].data, q->items[s].data) < 0) + + /* So our right child is smaller than we are, let's + * remember this fact 
*/ + s = k; + + /* s now points to the smallest of the three items */ + + if (s == idx) + /* No swap necessary, we're done */ + break; + + swap(q, idx, s); + idx = s; + } + + return idx; +} + +int prioq_put(Prioq *q, void *data, unsigned *idx) { + struct prioq_item *i; + unsigned k; + + assert(q); + + if (q->n_items >= q->n_allocated) { + unsigned n; + struct prioq_item *j; + + n = MAX((q->n_items+1) * 2, 16u); + j = reallocarray(q->items, n, sizeof(struct prioq_item)); + if (!j) + return -ENOMEM; + + q->items = j; + q->n_allocated = n; + } + + k = q->n_items++; + i = q->items + k; + i->data = data; + i->idx = idx; + + if (idx) + *idx = k; + + shuffle_up(q, k); + + return 0; +} + +int prioq_ensure_put(Prioq **q, compare_func_t compare_func, void *data, unsigned *idx) { + int r; + + r = prioq_ensure_allocated(q, compare_func); + if (r < 0) + return r; + + return prioq_put(*q, data, idx); +} + +static void remove_item(Prioq *q, struct prioq_item *i) { + struct prioq_item *l; + + assert(q); + assert(i); + + l = q->items + q->n_items - 1; + + if (i == l) + /* Last entry, let's just remove it */ + q->n_items--; + else { + unsigned k; + + /* Not last entry, let's replace the last entry with + * this one, and reshuffle */ + + k = i - q->items; + + i->data = l->data; + i->idx = l->idx; + if (i->idx) + *i->idx = k; + q->n_items--; + + k = shuffle_down(q, k); + shuffle_up(q, k); + } +} + +static struct prioq_item* find_item(Prioq *q, void *data, unsigned *idx) { + struct prioq_item *i; + + assert(q); + + if (q->n_items <= 0) + return NULL; + + if (idx) { + if (*idx == PRIOQ_IDX_NULL || + *idx >= q->n_items) + return NULL; + + i = q->items + *idx; + if (i->data != data) + return NULL; + + return i; + } else { + for (i = q->items; i < q->items + q->n_items; i++) + if (i->data == data) + return i; + return NULL; + } +} + +int prioq_remove(Prioq *q, void *data, unsigned *idx) { + struct prioq_item *i; + + if (!q) + return 0; + + i = find_item(q, data, idx); + if (!i) + return 0; 
+ + remove_item(q, i); + return 1; +} + +void prioq_reshuffle(Prioq *q, void *data, unsigned *idx) { + struct prioq_item *i; + unsigned k; + + assert(q); + + i = find_item(q, data, idx); + if (!i) + return; + + k = i - q->items; + k = shuffle_down(q, k); + shuffle_up(q, k); +} + +void *prioq_peek_by_index(Prioq *q, unsigned idx) { + if (!q) + return NULL; + + if (idx >= q->n_items) + return NULL; + + return q->items[idx].data; +} + +void *prioq_pop(Prioq *q) { + void *data; + + if (!q) + return NULL; + + if (q->n_items <= 0) + return NULL; + + data = q->items[0].data; + remove_item(q, q->items); + return data; +} + +unsigned prioq_size(Prioq *q) { + + if (!q) + return 0; + + return q->n_items; +} + +bool prioq_isempty(Prioq *q) { + + if (!q) + return true; + + return q->n_items <= 0; +} diff --git a/src/basic/prioq.h b/src/basic/prioq.h new file mode 100644 index 0000000..f66562f --- /dev/null +++ b/src/basic/prioq.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "hashmap.h" +#include "macro.h" + +typedef struct Prioq Prioq; + +#define PRIOQ_IDX_NULL (UINT_MAX) + +Prioq *prioq_new(compare_func_t compare); +Prioq *prioq_free(Prioq *q); +DEFINE_TRIVIAL_CLEANUP_FUNC(Prioq*, prioq_free); +int prioq_ensure_allocated(Prioq **q, compare_func_t compare_func); + +int prioq_put(Prioq *q, void *data, unsigned *idx); +int prioq_ensure_put(Prioq **q, compare_func_t compare_func, void *data, unsigned *idx); +int prioq_remove(Prioq *q, void *data, unsigned *idx); +void prioq_reshuffle(Prioq *q, void *data, unsigned *idx); + +void *prioq_peek_by_index(Prioq *q, unsigned idx) _pure_; +static inline void *prioq_peek(Prioq *q) { + return prioq_peek_by_index(q, 0); +} +void *prioq_pop(Prioq *q); + +#define PRIOQ_FOREACH_ITEM(q, p) \ + for (unsigned _i = 0; (p = prioq_peek_by_index(q, _i)); _i++) + +unsigned prioq_size(Prioq *q) _pure_; +bool prioq_isempty(Prioq *q) _pure_; diff --git a/src/basic/proc-cmdline.c 
b/src/basic/proc-cmdline.c new file mode 100644 index 0000000..522d8de --- /dev/null +++ b/src/basic/proc-cmdline.c @@ -0,0 +1,501 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "efivars.h" +#include "extract-word.h" +#include "fileio.h" +#include "getopt-defs.h" +#include "initrd-util.h" +#include "macro.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" +#include "virt.h" + +int proc_cmdline_filter_pid1_args(char **argv, char ***ret) { + enum { + COMMON_GETOPT_ARGS, + SYSTEMD_GETOPT_ARGS, + SHUTDOWN_GETOPT_ARGS, + }; + static const struct option options[] = { + COMMON_GETOPT_OPTIONS, + SYSTEMD_GETOPT_OPTIONS, + SHUTDOWN_GETOPT_OPTIONS, + }; + static const char *short_options = SYSTEMD_GETOPT_SHORT_OPTIONS; + + _cleanup_strv_free_ char **filtered = NULL; + int state, r; + + assert(argv); + assert(ret); + + /* Currently, we do not support '-', '+', and ':' at the beginning. */ + assert(!IN_SET(short_options[0], '-', '+', ':')); + + /* Filter out all known options. */ + state = no_argument; + STRV_FOREACH(p, strv_skip(argv, 1)) { + int prev_state = state; + const char *a = *p; + + /* Reset the state for the next step. */ + state = no_argument; + + if (prev_state == required_argument || + (prev_state == optional_argument && a[0] != '-')) + /* Handled as an argument of the previous option, filtering out the string. */ + continue; + + if (a[0] != '-') { + /* Not an option, accepting the string. */ + r = strv_extend(&filtered, a); + if (r < 0) + return r; + continue; + } + + if (a[1] == '-') { + if (a[2] == '\0') { + /* "--" is specified, accepting remaining strings. */ + r = strv_extend_strv(&filtered, strv_skip(p, 1), /* filter_duplicates = */ false); + if (r < 0) + return r; + break; + } + + /* long option, e.g. 
--foo */ + for (size_t i = 0; i < ELEMENTSOF(options); i++) { + const char *q = startswith(a + 2, options[i].name); + if (!q || !IN_SET(q[0], '=', '\0')) + continue; + + /* Found matching option, updating the state if necessary. */ + if (q[0] == '\0' && options[i].has_arg == required_argument) + state = required_argument; + + break; + } + continue; + } + + /* short option(s), e.g. -x or -xyz */ + while (a && *++a != '\0') + for (const char *q = short_options; *q != '\0'; q++) { + if (*q != *a) + continue; + + /* Found matching short option. */ + + if (q[1] == ':') { + /* An argument is required or optional, and remaining part + * is handled as argument if exists. */ + state = a[1] != '\0' ? no_argument : + q[2] == ':' ? optional_argument : required_argument; + + a = NULL; /* Not necessary to parse remaining part. */ + } + break; + } + } + + *ret = TAKE_PTR(filtered); + return 0; +} + +int proc_cmdline(char **ret) { + const char *e; + + assert(ret); + + /* For testing purposes it is sometimes useful to be able to override what we consider /proc/cmdline to be */ + e = secure_getenv("SYSTEMD_PROC_CMDLINE"); + if (e) { + char *m; + + m = strdup(e); + if (!m) + return -ENOMEM; + + *ret = m; + return 0; + } + + if (detect_container() > 0) + return pid_get_cmdline(1, SIZE_MAX, 0, ret); + + return read_virtual_file("/proc/cmdline", SIZE_MAX, ret, NULL); +} + +static int proc_cmdline_strv_internal(char ***ret, bool filter_pid1_args) { + const char *e; + int r; + + assert(ret); + + /* For testing purposes it is sometimes useful to be able to override what we consider /proc/cmdline to be */ + e = secure_getenv("SYSTEMD_PROC_CMDLINE"); + if (e) + return strv_split_full(ret, e, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX|EXTRACT_RETAIN_ESCAPE); + + if (detect_container() > 0) { + _cleanup_strv_free_ char **args = NULL; + + r = pid_get_cmdline_strv(1, /* flags = */ 0, &args); + if (r < 0) + return r; + + if (filter_pid1_args) + return proc_cmdline_filter_pid1_args(args, ret); + + *ret = 
TAKE_PTR(args); + return 0; + + } else { + _cleanup_free_ char *s = NULL; + + r = read_full_file("/proc/cmdline", &s, NULL); + if (r < 0) + return r; + + return strv_split_full(ret, s, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX|EXTRACT_RETAIN_ESCAPE); + } +} + +int proc_cmdline_strv(char ***ret) { + return proc_cmdline_strv_internal(ret, /* filter_pid1_args = */ false); +} + +static char *mangle_word(const char *word, ProcCmdlineFlags flags) { + char *c; + + c = startswith(word, "rd."); + if (c) { + /* Filter out arguments that are intended only for the initrd */ + + if (!in_initrd()) + return NULL; + + if (FLAGS_SET(flags, PROC_CMDLINE_STRIP_RD_PREFIX)) + return c; + + } else if (FLAGS_SET(flags, PROC_CMDLINE_RD_STRICT) && in_initrd()) + /* And optionally filter out arguments that are intended only for the host */ + return NULL; + + return (char*) word; +} + +static int proc_cmdline_parse_strv(char **args, proc_cmdline_parse_t parse_item, void *data, ProcCmdlineFlags flags) { + int r; + + assert(parse_item); + + STRV_FOREACH(word, args) { + char *key, *value; + + key = mangle_word(*word, flags); + if (!key) + continue; + + value = strchr(key, '='); + if (value) + *(value++) = '\0'; + + r = parse_item(key, value, data); + if (r < 0) + return r; + } + + return 0; +} + +int proc_cmdline_parse(proc_cmdline_parse_t parse_item, void *data, ProcCmdlineFlags flags) { + _cleanup_strv_free_ char **args = NULL; + int r; + + assert(parse_item); + + /* The PROC_CMDLINE_VALUE_OPTIONAL and PROC_CMDLINE_TRUE_WHEN_MISSING flags don't really make sense + * for proc_cmdline_parse(), let's make this clear. */ + assert(!(flags & (PROC_CMDLINE_VALUE_OPTIONAL|PROC_CMDLINE_TRUE_WHEN_MISSING))); + + /* We parse the EFI variable first, because later settings have higher priority. 
*/ + + if (!FLAGS_SET(flags, PROC_CMDLINE_IGNORE_EFI_OPTIONS)) { + _cleanup_free_ char *line = NULL; + + r = systemd_efi_options_variable(&line); + if (r < 0) { + if (r != -ENODATA) + log_debug_errno(r, "Failed to get SystemdOptions EFI variable, ignoring: %m"); + } else { + r = strv_split_full(&args, line, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + + r = proc_cmdline_parse_strv(args, parse_item, data, flags); + if (r < 0) + return r; + + args = strv_free(args); + } + } + + r = proc_cmdline_strv_internal(&args, /* filter_pid1_args = */ true); + if (r < 0) + return r; + + return proc_cmdline_parse_strv(args, parse_item, data, flags); +} + +static bool relaxed_equal_char(char a, char b) { + return a == b || + (a == '_' && b == '-') || + (a == '-' && b == '_'); +} + +char *proc_cmdline_key_startswith(const char *s, const char *prefix) { + assert(s); + assert(prefix); + + /* Much like startswith(), but considers "-" and "_" the same */ + + for (; *prefix != 0; s++, prefix++) + if (!relaxed_equal_char(*s, *prefix)) + return NULL; + + return (char*) s; +} + +bool proc_cmdline_key_streq(const char *x, const char *y) { + assert(x); + assert(y); + + /* Much like streq(), but considers "-" and "_" the same */ + + for (; *x != 0 || *y != 0; x++, y++) + if (!relaxed_equal_char(*x, *y)) + return false; + + return true; +} + +static int cmdline_get_key(char **args, const char *key, ProcCmdlineFlags flags, char **ret_value) { + _cleanup_free_ char *v = NULL; + bool found = false; + int r; + + assert(key); + + STRV_FOREACH(p, args) { + const char *word; + + word = mangle_word(*p, flags); + if (!word) + continue; + + if (ret_value) { + const char *e; + + e = proc_cmdline_key_startswith(word, key); + if (!e) + continue; + + if (*e == '=') { + r = free_and_strdup(&v, e+1); + if (r < 0) + return r; + + found = true; + + } else if (*e == 0 && FLAGS_SET(flags, PROC_CMDLINE_VALUE_OPTIONAL)) + found = true; + + } else { + if 
(proc_cmdline_key_streq(word, key)) { + found = true; + break; /* we found what we were looking for */ + } + } + } + + if (ret_value) + *ret_value = TAKE_PTR(v); + + return found; +} + +int proc_cmdline_get_key(const char *key, ProcCmdlineFlags flags, char **ret_value) { + _cleanup_strv_free_ char **args = NULL; + _cleanup_free_ char *line = NULL, *v = NULL; + int r; + + /* Looks for a specific key on the kernel command line and (with lower priority) the EFI variable. + * Supports three modes: + * + * a) The "ret_value" parameter is used. In this case a parameter beginning with the "key" string followed by + * "=" is searched for, and the value following it is returned in "ret_value". + * + * b) as above, but the PROC_CMDLINE_VALUE_OPTIONAL flag is set. In this case if the key is found as a separate + * word (i.e. not followed by "=" but instead by whitespace or the end of the command line), then this is + * also accepted, and "value" is returned as NULL. + * + * c) The "ret_value" parameter is NULL. In this case a search for the exact "key" parameter is performed. + * + * In all three cases, > 0 is returned if the key is found, 0 if not. */ + + /* PROC_CMDLINE_TRUE_WHEN_MISSING doesn't really make sense for proc_cmdline_get_key(). */ + assert(!FLAGS_SET(flags, PROC_CMDLINE_TRUE_WHEN_MISSING)); + + if (isempty(key)) + return -EINVAL; + + if (FLAGS_SET(flags, PROC_CMDLINE_VALUE_OPTIONAL) && !ret_value) + return -EINVAL; + + r = proc_cmdline_strv_internal(&args, /* filter_pid1_args = */ true); + if (r < 0) + return r; + + if (FLAGS_SET(flags, PROC_CMDLINE_IGNORE_EFI_OPTIONS)) /* Shortcut */ + return cmdline_get_key(args, key, flags, ret_value); + + r = cmdline_get_key(args, key, flags, ret_value ? 
&v : NULL); + if (r < 0) + return r; + if (r > 0) { + if (ret_value) + *ret_value = TAKE_PTR(v); + + return r; + } + + r = systemd_efi_options_variable(&line); + if (r == -ENODATA) { + if (ret_value) + *ret_value = NULL; + + return false; /* Not found */ + } + if (r < 0) + return r; + + args = strv_free(args); + r = strv_split_full(&args, line, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + + return cmdline_get_key(args, key, flags, ret_value); +} + +int proc_cmdline_get_bool(const char *key, ProcCmdlineFlags flags, bool *ret) { + _cleanup_free_ char *v = NULL; + int r; + + assert(ret); + + r = proc_cmdline_get_key(key, (flags & ~PROC_CMDLINE_TRUE_WHEN_MISSING) | PROC_CMDLINE_VALUE_OPTIONAL, &v); + if (r < 0) + return r; + if (r == 0) { /* key not specified at all */ + *ret = FLAGS_SET(flags, PROC_CMDLINE_TRUE_WHEN_MISSING); + return 0; + } + + if (v) { /* key with parameter passed */ + r = parse_boolean(v); + if (r < 0) + return r; + *ret = r; + } else /* key without parameter passed */ + *ret = true; + + return 1; +} + +static int cmdline_get_key_ap(ProcCmdlineFlags flags, char* const* args, va_list ap) { + int r, ret = 0; + + for (;;) { + char **v; + const char *k, *e; + + k = va_arg(ap, const char*); + if (!k) + break; + + assert_se(v = va_arg(ap, char**)); + + STRV_FOREACH(p, args) { + const char *word; + + word = mangle_word(*p, flags); + if (!word) + continue; + + e = proc_cmdline_key_startswith(word, k); + if (e && *e == '=') { + r = free_and_strdup(v, e + 1); + if (r < 0) + return r; + + ret++; + } + } + } + + return ret; +} + +int proc_cmdline_get_key_many_internal(ProcCmdlineFlags flags, ...) { + _cleanup_strv_free_ char **args = NULL; + int r, ret = 0; + va_list ap; + + /* The PROC_CMDLINE_VALUE_OPTIONAL and PROC_CMDLINE_TRUE_WHEN_MISSING flags don't really make sense + * for proc_cmdline_get_key_many, let's make this clear. 
*/ + assert(!(flags & (PROC_CMDLINE_VALUE_OPTIONAL|PROC_CMDLINE_TRUE_WHEN_MISSING))); + + /* This call may clobber arguments on failure! */ + + if (!FLAGS_SET(flags, PROC_CMDLINE_IGNORE_EFI_OPTIONS)) { + _cleanup_free_ char *line = NULL; + + r = systemd_efi_options_variable(&line); + if (r < 0 && r != -ENODATA) + log_debug_errno(r, "Failed to get SystemdOptions EFI variable, ignoring: %m"); + if (r >= 0) { + r = strv_split_full(&args, line, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + + va_start(ap, flags); + r = cmdline_get_key_ap(flags, args, ap); + va_end(ap); + if (r < 0) + return r; + + ret = r; + args = strv_free(args); + } + } + + r = proc_cmdline_strv(&args); + if (r < 0) + return r; + + va_start(ap, flags); + r = cmdline_get_key_ap(flags, args, ap); + va_end(ap); + if (r < 0) + return r; + + return ret + r; +} diff --git a/src/basic/proc-cmdline.h b/src/basic/proc-cmdline.h new file mode 100644 index 0000000..9502fb8 --- /dev/null +++ b/src/basic/proc-cmdline.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "log.h" + +typedef enum ProcCmdlineFlags { + PROC_CMDLINE_STRIP_RD_PREFIX = 1 << 0, /* automatically strip "rd." 
prefix if it is set (and we are in the initrd, since otherwise we'd not consider it anyway) */ + PROC_CMDLINE_VALUE_OPTIONAL = 1 << 1, /* the value is optional (for boolean switches that can omit the value) */ + PROC_CMDLINE_RD_STRICT = 1 << 2, /* ignore this in the initrd */ + PROC_CMDLINE_IGNORE_EFI_OPTIONS = 1 << 3, /* don't check systemd's private EFI variable */ + PROC_CMDLINE_TRUE_WHEN_MISSING = 1 << 4, /* default to true when the key is missing for bool */ +} ProcCmdlineFlags; + +typedef int (*proc_cmdline_parse_t)(const char *key, const char *value, void *data); + +int proc_cmdline_filter_pid1_args(char **argv, char ***ret); + +int proc_cmdline(char **ret); +int proc_cmdline_strv(char ***ret); + +int proc_cmdline_parse(const proc_cmdline_parse_t parse, void *userdata, ProcCmdlineFlags flags); + +int proc_cmdline_get_key(const char *parameter, ProcCmdlineFlags flags, char **value); +int proc_cmdline_get_bool(const char *key, ProcCmdlineFlags flags, bool *ret); + +int proc_cmdline_get_key_many_internal(ProcCmdlineFlags flags, ...); +#define proc_cmdline_get_key_many(flags, ...) 
proc_cmdline_get_key_many_internal(flags, __VA_ARGS__, NULL) + +char *proc_cmdline_key_startswith(const char *s, const char *prefix); +bool proc_cmdline_key_streq(const char *x, const char *y); + +/* A little helper call, to be used in proc_cmdline_parse_t callbacks */ +static inline bool proc_cmdline_value_missing(const char *key, const char *value) { + if (!value) { + log_warning("Missing argument for %s= kernel command line switch, ignoring.", key); + return true; + } + + return false; +} diff --git a/src/basic/process-util.c b/src/basic/process-util.c new file mode 100644 index 0000000..4492e7d --- /dev/null +++ b/src/basic/process-util.c @@ -0,0 +1,2060 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +#include +#endif + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "architecture.h" +#include "argv-util.h" +#include "dirent-util.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "locale-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "missing_sched.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "raw-clone.h" +#include "rlimit-util.h" +#include "signal-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "terminal-util.h" +#include "user-util.h" +#include "utf8.h" + +/* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own + * workers, e.g. "kworker/u9:3-kcryptd/253:0". 
Let's pick a fixed smallish limit that will work for the kernel. + */ +#define COMM_MAX_LEN 128 + +static int get_process_state(pid_t pid) { + _cleanup_free_ char *line = NULL; + const char *p; + char state; + int r; + + assert(pid >= 0); + + /* Shortcut: if we are enquired about our own state, we are obviously running */ + if (pid == 0 || pid == getpid_cached()) + return (unsigned char) 'R'; + + p = procfs_file_alloca(pid, "stat"); + + r = read_one_line_file(p, &line); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + p = strrchr(line, ')'); + if (!p) + return -EIO; + + p++; + + if (sscanf(p, " %c", &state) != 1) + return -EIO; + + return (unsigned char) state; +} + +int pid_get_comm(pid_t pid, char **ret) { + _cleanup_free_ char *escaped = NULL, *comm = NULL; + int r; + + assert(ret); + assert(pid >= 0); + + if (pid == 0 || pid == getpid_cached()) { + comm = new0(char, TASK_COMM_LEN + 1); /* Must fit in 16 byte according to prctl(2) */ + if (!comm) + return -ENOMEM; + + if (prctl(PR_GET_NAME, comm) < 0) + return -errno; + } else { + const char *p; + + p = procfs_file_alloca(pid, "comm"); + + /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */ + r = read_one_line_file(p, &comm); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + } + + escaped = new(char, COMM_MAX_LEN); + if (!escaped) + return -ENOMEM; + + /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */ + cellescape(escaped, COMM_MAX_LEN, comm); + + *ret = TAKE_PTR(escaped); + return 0; +} + +int pidref_get_comm(const PidRef *pid, char **ret) { + _cleanup_free_ char *comm = NULL; + int r; + + if (!pidref_is_set(pid)) + return -ESRCH; + + r = pid_get_comm(pid->pid, &comm); + if (r < 0) + return r; + + r = pidref_verify(pid); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(comm); + return 0; +} + +static int pid_get_cmdline_nulstr( + pid_t pid, + size_t max_size, + ProcessCmdlineFlags flags, 
+ char **ret, + size_t *ret_size) { + + _cleanup_free_ char *t = NULL; + const char *p; + size_t k; + int r; + + /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but + * with a specified size. + * + * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set + * (the case for kernel threads), or has a command line that resolves to the empty string, will + * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input + * data. + * + * Returns an error, 0 if output was read but is truncated, 1 otherwise. + */ + + p = procfs_file_alloca(pid, "cmdline"); + r = read_virtual_file(p, max_size, &t, &k); /* Let's assume that each input byte results in >= 1 + * columns of output. We ignore zero-width codepoints. */ + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + if (k == 0) { + if (!(flags & PROCESS_CMDLINE_COMM_FALLBACK)) + return -ENOENT; + + /* Kernel threads have no argv[] */ + _cleanup_free_ char *comm = NULL; + + r = pid_get_comm(pid, &comm); + if (r < 0) + return r; + + free(t); + t = strjoin("[", comm, "]"); + if (!t) + return -ENOMEM; + + k = strlen(t); + r = k <= max_size; + if (r == 0) /* truncation */ + t[max_size] = '\0'; + } + + if (ret) + *ret = TAKE_PTR(t); + if (ret_size) + *ret_size = k; + + return r; +} + +int pid_get_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) { + _cleanup_free_ char *t = NULL; + size_t k; + char *ans; + + assert(pid >= 0); + assert(ret); + + /* Retrieve and format a command line. See above for discussion of retrieval options. + * + * There are two main formatting modes: + * + * - when PROCESS_CMDLINE_QUOTE is specified, output is quoted in C/Python style. If no shell special + * characters are present, this output can be copy-pasted into the terminal to execute. UTF-8 + * output is assumed. + * + * - otherwise, a compact non-roundtrippable form is returned. 
Non-UTF8 bytes are replaced by �. The + * returned string is of the specified console width at most, abbreviated with an ellipsis. + * + * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and + * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *line otherwise. */ + + int full = pid_get_cmdline_nulstr(pid, max_columns, flags, &t, &k); + if (full < 0) + return full; + + if (flags & (PROCESS_CMDLINE_QUOTE | PROCESS_CMDLINE_QUOTE_POSIX)) { + ShellEscapeFlags shflags = SHELL_ESCAPE_EMPTY | + FLAGS_SET(flags, PROCESS_CMDLINE_QUOTE_POSIX) * SHELL_ESCAPE_POSIX; + + assert(!(flags & PROCESS_CMDLINE_USE_LOCALE)); + + _cleanup_strv_free_ char **args = NULL; + + /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end. + * See also issue #21186. */ + args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true); + if (!args) + return -ENOMEM; + + ans = quote_command_line(args, shflags); + if (!ans) + return -ENOMEM; + } else { + /* Arguments are separated by NULs. Let's replace those with spaces. 
*/ + for (size_t i = 0; i < k - 1; i++) + if (t[i] == '\0') + t[i] = ' '; + + delete_trailing_chars(t, WHITESPACE); + + bool eight_bit = (flags & PROCESS_CMDLINE_USE_LOCALE) && !is_locale_utf8(); + + ans = escape_non_printable_full(t, max_columns, + eight_bit * XESCAPE_8_BIT | !full * XESCAPE_FORCE_ELLIPSIS); + if (!ans) + return -ENOMEM; + + ans = str_realloc(ans); + } + + *ret = ans; + return 0; +} + +int pidref_get_cmdline(const PidRef *pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + if (!pidref_is_set(pid)) + return -ESRCH; + + r = pid_get_cmdline(pid->pid, max_columns, flags, &s); + if (r < 0) + return r; + + r = pidref_verify(pid); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(s); + return 0; +} + +int pid_get_cmdline_strv(pid_t pid, ProcessCmdlineFlags flags, char ***ret) { + _cleanup_free_ char *t = NULL; + char **args; + size_t k; + int r; + + assert(pid >= 0); + assert((flags & ~PROCESS_CMDLINE_COMM_FALLBACK) == 0); + assert(ret); + + r = pid_get_cmdline_nulstr(pid, SIZE_MAX, flags, &t, &k); + if (r < 0) + return r; + + args = strv_parse_nulstr_full(t, k, /* drop_trailing_nuls = */ true); + if (!args) + return -ENOMEM; + + *ret = args; + return 0; +} + +int pidref_get_cmdline_strv(const PidRef *pid, ProcessCmdlineFlags flags, char ***ret) { + _cleanup_strv_free_ char **args = NULL; + int r; + + if (!pidref_is_set(pid)) + return -ESRCH; + + r = pid_get_cmdline_strv(pid->pid, flags, &args); + if (r < 0) + return r; + + r = pidref_verify(pid); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(args); + + return 0; +} + +int container_get_leader(const char *machine, pid_t *pid) { + _cleanup_free_ char *s = NULL, *class = NULL; + const char *p; + pid_t leader; + int r; + + assert(machine); + assert(pid); + + if (streq(machine, ".host")) { + *pid = 1; + return 0; + } + + if (!hostname_is_valid(machine, 0)) + return -EINVAL; + + p = strjoina("/run/systemd/machines/", machine); + r = 
parse_env_file(NULL, p, + "LEADER", &s, + "CLASS", &class); + if (r == -ENOENT) + return -EHOSTDOWN; + if (r < 0) + return r; + if (!s) + return -EIO; + + if (!streq_ptr(class, "container")) + return -EIO; + + r = parse_pid(s, &leader); + if (r < 0) + return r; + if (leader <= 1) + return -EIO; + + *pid = leader; + return 0; +} + +int namespace_get_leader(pid_t pid, NamespaceType type, pid_t *ret) { + int r; + + assert(ret); + + for (;;) { + pid_t ppid; + + r = get_process_ppid(pid, &ppid); + if (r < 0) + return r; + + r = in_same_namespace(pid, ppid, type); + if (r < 0) + return r; + if (r == 0) { + /* If the parent and the child are not in the same + * namespace, then the child is the leader we are + * looking for. */ + *ret = pid; + return 0; + } + + pid = ppid; + } +} + +int pid_is_kernel_thread(pid_t pid) { + _cleanup_free_ char *line = NULL; + unsigned long long flags; + size_t l, i; + const char *p; + char *q; + int r; + + if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */ + return 0; + if (!pid_is_valid(pid)) + return -EINVAL; + + p = procfs_file_alloca(pid, "stat"); + r = read_one_line_file(p, &line); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + /* Skip past the comm field */ + q = strrchr(line, ')'); + if (!q) + return -EINVAL; + q++; + + /* Skip 6 fields to reach the flags field */ + for (i = 0; i < 6; i++) { + l = strspn(q, WHITESPACE); + if (l < 1) + return -EINVAL; + q += l; + + l = strcspn(q, WHITESPACE); + if (l < 1) + return -EINVAL; + q += l; + } + + /* Skip preceding whitespace */ + l = strspn(q, WHITESPACE); + if (l < 1) + return -EINVAL; + q += l; + + /* Truncate the rest */ + l = strcspn(q, WHITESPACE); + if (l < 1) + return -EINVAL; + q[l] = 0; + + r = safe_atollu(q, &flags); + if (r < 0) + return r; + + return !!(flags & PF_KTHREAD); +} + +int pidref_is_kernel_thread(const PidRef *pid) { + int result, r; + + if (!pidref_is_set(pid)) + return -ESRCH; + + result 
= pid_is_kernel_thread(pid->pid); + if (result < 0) + return result; + + r = pidref_verify(pid); /* Verify that the PID wasn't reused since */ + if (r < 0) + return r; + + return result; +} + +int get_process_capeff(pid_t pid, char **ret) { + const char *p; + int r; + + assert(pid >= 0); + assert(ret); + + p = procfs_file_alloca(pid, "status"); + + r = get_proc_field(p, "CapEff", WHITESPACE, ret); + if (r == -ENOENT) + return -ESRCH; + + return r; +} + +static int get_process_link_contents(pid_t pid, const char *proc_file, char **ret) { + const char *p; + int r; + + assert(proc_file); + + p = procfs_file_alloca(pid, proc_file); + + r = readlink_malloc(p, ret); + return r == -ENOENT ? -ESRCH : r; +} + +int get_process_exe(pid_t pid, char **ret) { + char *d; + int r; + + assert(pid >= 0); + + r = get_process_link_contents(pid, "exe", ret); + if (r < 0) + return r; + + if (ret) { + d = endswith(*ret, " (deleted)"); + if (d) + *d = '\0'; + } + + return 0; +} + +static int get_process_id(pid_t pid, const char *field, uid_t *ret) { + _cleanup_fclose_ FILE *f = NULL; + const char *p; + int r; + + assert(field); + assert(ret); + + if (pid < 0) + return -EINVAL; + + p = procfs_file_alloca(pid, "status"); + r = fopen_unlocked(p, "re", &f); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + l = startswith(line, field); + if (l) { + l += strspn(l, WHITESPACE); + + l[strcspn(l, WHITESPACE)] = 0; + + return parse_uid(l, ret); + } + } + + return -EIO; +} + +int pid_get_uid(pid_t pid, uid_t *ret) { + assert(ret); + + if (pid == 0 || pid == getpid_cached()) { + *ret = getuid(); + return 0; + } + + return get_process_id(pid, "Uid:", ret); +} + +int pidref_get_uid(const PidRef *pid, uid_t *ret) { + uid_t uid; + int r; + + if (!pidref_is_set(pid)) + return -ESRCH; + + r = pid_get_uid(pid->pid, &uid); + if (r < 
0) + return r; + + r = pidref_verify(pid); + if (r < 0) + return r; + + if (ret) + *ret = uid; + return 0; +} + +int get_process_gid(pid_t pid, gid_t *ret) { + + if (pid == 0 || pid == getpid_cached()) { + *ret = getgid(); + return 0; + } + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + return get_process_id(pid, "Gid:", ret); +} + +int get_process_cwd(pid_t pid, char **ret) { + assert(pid >= 0); + + if (pid == 0 || pid == getpid_cached()) + return safe_getcwd(ret); + + return get_process_link_contents(pid, "cwd", ret); +} + +int get_process_root(pid_t pid, char **ret) { + assert(pid >= 0); + return get_process_link_contents(pid, "root", ret); +} + +#define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U) + +int get_process_environ(pid_t pid, char **ret) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *outcome = NULL; + size_t sz = 0; + const char *p; + int r; + + assert(pid >= 0); + assert(ret); + + p = procfs_file_alloca(pid, "environ"); + + r = fopen_unlocked(p, "re", &f); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + for (;;) { + char c; + + if (sz >= ENVIRONMENT_BLOCK_MAX) + return -ENOBUFS; + + if (!GREEDY_REALLOC(outcome, sz + 5)) + return -ENOMEM; + + r = safe_fgetc(f, &c); + if (r < 0) + return r; + if (r == 0) + break; + + if (c == '\0') + outcome[sz++] = '\n'; + else + sz += cescape_char(c, outcome + sz); + } + + outcome[sz] = '\0'; + *ret = TAKE_PTR(outcome); + + return 0; +} + +int get_process_ppid(pid_t pid, pid_t *ret) { + _cleanup_free_ char *line = NULL; + unsigned long ppid; + const char *p; + int r; + + assert(pid >= 0); + + if (pid == 0 || pid == getpid_cached()) { + if (ret) + *ret = getppid(); + return 0; + } + + if (pid == 1) /* PID 1 has no parent, shortcut this case */ + return -EADDRNOTAVAIL; + + p = procfs_file_alloca(pid, "stat"); + r = read_one_line_file(p, &line); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + /* Let's skip the pid and comm fields. 
The latter is enclosed in () but does not escape any () in its + * value, so let's skip over it manually */ + + p = strrchr(line, ')'); + if (!p) + return -EIO; + + p++; + + if (sscanf(p, " " + "%*c " /* state */ + "%lu ", /* ppid */ + &ppid) != 1) + return -EIO; + + /* If ppid is zero the process has no parent. Which might be the case for PID 1 but also for + * processes originating in other namespaces that are inserted into a pidns. Return a recognizable + * error in this case. */ + if (ppid == 0) + return -EADDRNOTAVAIL; + + if ((pid_t) ppid < 0 || (unsigned long) (pid_t) ppid != ppid) + return -ERANGE; + + if (ret) + *ret = (pid_t) ppid; + + return 0; +} + +int get_process_umask(pid_t pid, mode_t *ret) { + _cleanup_free_ char *m = NULL; + const char *p; + int r; + + assert(pid >= 0); + assert(ret); + + p = procfs_file_alloca(pid, "status"); + + r = get_proc_field(p, "Umask", WHITESPACE, &m); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + return parse_mode(m, ret); +} + +int wait_for_terminate(pid_t pid, siginfo_t *status) { + siginfo_t dummy; + + assert(pid >= 1); + + if (!status) + status = &dummy; + + for (;;) { + zero(*status); + + if (waitid(P_PID, pid, status, WEXITED) < 0) { + + if (errno == EINTR) + continue; + + return negative_errno(); + } + + return 0; + } +} + +/* + * Return values: + * < 0 : wait_for_terminate() failed to get the state of the + * process, the process was terminated by a signal, or + * failed for an unknown reason. + * >=0 : The process terminated normally, and its exit code is + * returned. + * + * That is, success is indicated by a return value of zero, and an + * error is indicated by a non-zero value. + * + * Abnormal termination is logged at LOG_ERR if WAIT_LOG_ABNORMAL is set in flags, and a + * non-zero exit status is logged at LOG_ERR if WAIT_LOG_NON_ZERO_EXIT_STATUS is set; + * otherwise both are only logged at debug level. 
+ */ +int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags) { + _cleanup_free_ char *buffer = NULL; + siginfo_t status; + int r, prio; + + assert(pid > 1); + + if (!name) { + r = pid_get_comm(pid, &buffer); + if (r < 0) + log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pid); + else + name = buffer; + } + + prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG; + + r = wait_for_terminate(pid, &status); + if (r < 0) + return log_full_errno(prio, r, "Failed to wait for %s: %m", strna(name)); + + if (status.si_code == CLD_EXITED) { + if (status.si_status != EXIT_SUCCESS) + log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG, + "%s failed with exit status %i.", strna(name), status.si_status); + else + /* name is still NULL if pid_get_comm() failed above; guard it like the other log calls */ + log_debug("%s succeeded.", strna(name)); + + return status.si_status; + + } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED)) { + + log_full(prio, "%s terminated by signal %s.", strna(name), signal_to_string(status.si_status)); + return -EPROTO; + } + + log_full(prio, "%s failed due to unknown reason.", strna(name)); + return -EPROTO; +} + +/* + * Return values: + * + * < 0 : wait_for_terminate_with_timeout() failed to get the state of the process, the process timed out, the process + * was terminated by a signal, or failed for an unknown reason. + * + * >=0 : The process terminated normally with no failures. + * + * Success is indicated by a return value of zero, a timeout is indicated by -ETIMEDOUT, and all other child failure + * states are indicated by a negative errno-style value. + * + * This call assumes SIGCHLD has been blocked already, in particular before the child to wait for has been forked off + * to remain entirely race-free. + */ +int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) { + sigset_t mask; + int r; + usec_t until; + + assert_se(sigemptyset(&mask) == 0); + assert_se(sigaddset(&mask, SIGCHLD) == 0); + + /* Drop into a sigtimedwait()-based timeout. 
Waiting for the + * pid to exit. */ + until = usec_add(now(CLOCK_MONOTONIC), timeout); + for (;;) { + usec_t n; + siginfo_t status = {}; + + n = now(CLOCK_MONOTONIC); + if (n >= until) + break; + + r = RET_NERRNO(sigtimedwait(&mask, NULL, TIMESPEC_STORE(until - n))); + /* Assuming we woke due to the child exiting. */ + if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) { + if (status.si_pid == pid) { + /* This is the correct child. */ + if (status.si_code == CLD_EXITED) + return status.si_status == 0 ? 0 : -EPROTO; + else + return -EPROTO; + } + } + /* Not the child, check for errors and proceed appropriately */ + if (r < 0) { + switch (r) { + case -EAGAIN: + /* Timed out, child is likely hung. */ + return -ETIMEDOUT; + case -EINTR: + /* Received a different signal and should retry */ + continue; + default: + /* Return any unexpected errors */ + return r; + } + } + } + + return -EPROTO; +} + +void sigkill_wait(pid_t pid) { + assert(pid > 1); + + (void) kill(pid, SIGKILL); + (void) wait_for_terminate(pid, NULL); +} + +void sigkill_waitp(pid_t *pid) { + PROTECT_ERRNO; + + if (!pid) + return; + if (*pid <= 1) + return; + + sigkill_wait(*pid); +} + +void sigterm_wait(pid_t pid) { + assert(pid > 1); + + (void) kill_and_sigcont(pid, SIGTERM); + (void) wait_for_terminate(pid, NULL); +} + +void sigkill_nowait(pid_t pid) { + assert(pid > 1); + + (void) kill(pid, SIGKILL); +} + +void sigkill_nowaitp(pid_t *pid) { + PROTECT_ERRNO; + + if (!pid) + return; + if (*pid <= 1) + return; + + sigkill_nowait(*pid); +} + +int kill_and_sigcont(pid_t pid, int sig) { + int r; + + r = RET_NERRNO(kill(pid, sig)); + + /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't + * affected by a process being suspended anyway. 
*/ + if (r >= 0 && !IN_SET(sig, SIGCONT, SIGKILL)) + (void) kill(pid, SIGCONT); + + return r; +} + +int getenv_for_pid(pid_t pid, const char *field, char **ret) { + _cleanup_fclose_ FILE *f = NULL; + char *value = NULL; + const char *path; + size_t l, sum = 0; + int r; + + assert(pid >= 0); + assert(field); + assert(ret); + + if (pid == 0 || pid == getpid_cached()) { + const char *e; + + e = getenv(field); + if (!e) { + *ret = NULL; + return 0; + } + + value = strdup(e); + if (!value) + return -ENOMEM; + + *ret = value; + return 1; + } + + if (!pid_is_valid(pid)) + return -EINVAL; + + path = procfs_file_alloca(pid, "environ"); + + r = fopen_unlocked(path, "re", &f); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + l = strlen(field); + for (;;) { + _cleanup_free_ char *line = NULL; + + if (sum > ENVIRONMENT_BLOCK_MAX) /* Give up searching eventually */ + return -ENOBUFS; + + r = read_nul_string(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) /* EOF */ + break; + + sum += r; + + if (strneq(line, field, l) && line[l] == '=') { + value = strdup(line + l + 1); + if (!value) + return -ENOMEM; + + *ret = value; + return 1; + } + } + + *ret = NULL; + return 0; +} + +int pid_is_my_child(pid_t pid) { + pid_t ppid; + int r; + + if (pid < 0) + return -ESRCH; + + if (pid <= 1) + return false; + + r = get_process_ppid(pid, &ppid); + if (r < 0) + return r; + + return ppid == getpid_cached(); +} + +int pidref_is_my_child(const PidRef *pid) { + int r, result; + + if (!pidref_is_set(pid)) + return -ESRCH; + + result = pid_is_my_child(pid->pid); + if (result < 0) + return result; + + r = pidref_verify(pid); + if (r < 0) + return r; + + return result; +} + +int pid_is_unwaited(pid_t pid) { + /* Checks whether a PID is still valid at all, including a zombie */ + + if (pid < 0) + return -ESRCH; + + if (pid <= 1) /* If we or PID 1 would be dead and have been waited for, this code would not be running */ + return true; + + if (pid == getpid_cached()) + 
return true; + + if (kill(pid, 0) >= 0) + return true; + + return errno != ESRCH; +} + +int pidref_is_unwaited(const PidRef *pid) { + int r; + + if (!pidref_is_set(pid)) + return -ESRCH; + + if (pid->pid == 1 || pidref_is_self(pid)) + return true; + + r = pidref_kill(pid, 0); + if (r == -ESRCH) + return false; + if (r < 0) + return r; + + return true; +} + +int pid_is_alive(pid_t pid) { + int r; + + /* Checks whether a PID is still valid and not a zombie */ + + if (pid < 0) + return -ESRCH; + + if (pid <= 1) /* If we or PID 1 would be a zombie, this code would not be running */ + return true; + + if (pid == getpid_cached()) + return true; + + r = get_process_state(pid); + if (r == -ESRCH) + return false; + if (r < 0) + return r; + + return r != 'Z'; +} + +int pidref_is_alive(const PidRef *pidref) { + int r, result; + + if (!pidref_is_set(pidref)) + return -ESRCH; + + result = pid_is_alive(pidref->pid); + if (result < 0) + return result; + + r = pidref_verify(pidref); + if (r == -ESRCH) + return false; + if (r < 0) + return r; + + return result; +} + +int pid_from_same_root_fs(pid_t pid) { + const char *root; + + if (pid < 0) + return false; + + if (pid == 0 || pid == getpid_cached()) + return true; + + root = procfs_file_alloca(pid, "root"); + + return inode_same(root, "/proc/1/root", 0); +} + +bool is_main_thread(void) { + static thread_local int cached = 0; + + if (_unlikely_(cached == 0)) + cached = getpid_cached() == gettid() ? 1 : -1; + + return cached > 0; +} + +bool oom_score_adjust_is_valid(int oa) { + return oa >= OOM_SCORE_ADJ_MIN && oa <= OOM_SCORE_ADJ_MAX; +} + +unsigned long personality_from_string(const char *p) { + Architecture architecture; + + if (!p) + return PERSONALITY_INVALID; + + /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just + * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for + * the same register size. 
*/ + + architecture = architecture_from_string(p); + if (architecture < 0) + return PERSONALITY_INVALID; + + if (architecture == native_architecture()) + return PER_LINUX; +#ifdef ARCHITECTURE_SECONDARY + if (architecture == ARCHITECTURE_SECONDARY) + return PER_LINUX32; +#endif + + return PERSONALITY_INVALID; +} + +const char* personality_to_string(unsigned long p) { + Architecture architecture = _ARCHITECTURE_INVALID; + + if (p == PER_LINUX) + architecture = native_architecture(); +#ifdef ARCHITECTURE_SECONDARY + else if (p == PER_LINUX32) + architecture = ARCHITECTURE_SECONDARY; +#endif + + if (architecture < 0) + return NULL; + + return architecture_to_string(architecture); +} + +int safe_personality(unsigned long p) { + int ret; + + /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a failure via errno, + * and in others as negative return value containing an errno-like value. Let's work around this: this is a + * wrapper that uses errno if it is set, and uses the return value otherwise. And then it sets both errno and + * the return value indicating the same issue, so that we are definitely on the safe side. + * + * See https://github.com/systemd/systemd/issues/6737 */ + + errno = 0; + ret = personality(p); + if (ret < 0) { + if (errno != 0) + return -errno; + + errno = -ret; + } + + return ret; +} + +int opinionated_personality(unsigned long *ret) { + int current; + + /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit + * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the + * two most relevant personalities: PER_LINUX and PER_LINUX32. 
*/ + + current = safe_personality(PERSONALITY_INVALID); + if (current < 0) + return current; + + if (((unsigned long) current & 0xffff) == PER_LINUX32) + *ret = PER_LINUX32; + else + *ret = PER_LINUX; + + return 0; +} + +void valgrind_summary_hack(void) { +#if HAVE_VALGRIND_VALGRIND_H + if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) { + pid_t pid; + pid = raw_clone(SIGCHLD); + if (pid < 0) + log_struct_errno( + LOG_EMERG, errno, + "MESSAGE_ID=" SD_MESSAGE_VALGRIND_HELPER_FORK_STR, + LOG_MESSAGE( "Failed to fork off valgrind helper: %m")); + else if (pid == 0) + exit(EXIT_SUCCESS); + else { + log_info("Spawned valgrind helper as PID "PID_FMT".", pid); + (void) wait_for_terminate(pid, NULL); + } + } +#endif +} + +int pid_compare_func(const pid_t *a, const pid_t *b) { + /* Suitable for usage in qsort() */ + return CMP(*a, *b); +} + +/* The cached PID, possible values: + * + * == UNSET [0] → cache not initialized yet + * == BUSY [-1] → some thread is initializing it at the moment + * any other → the cached PID + */ + +#define CACHED_PID_UNSET ((pid_t) 0) +#define CACHED_PID_BUSY ((pid_t) -1) + +static pid_t cached_pid = CACHED_PID_UNSET; + +void reset_cached_pid(void) { + /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */ + cached_pid = CACHED_PID_UNSET; +} + +pid_t getpid_cached(void) { + static bool installed = false; + pid_t current_value = CACHED_PID_UNSET; + + /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a + * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally + * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when + * objects were used across fork()s. With this caching the old behaviour is somewhat restored. 
+ * + * https://bugzilla.redhat.com/show_bug.cgi?id=1443976 + * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e + */ + + (void) __atomic_compare_exchange_n( + &cached_pid, + ¤t_value, + CACHED_PID_BUSY, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); + + switch (current_value) { + + case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */ + pid_t new_pid; + + new_pid = raw_getpid(); + + if (!installed) { + /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's + * only half-documented (glibc doesn't document it but LSB does — though only superficially) + * we'll check for errors only in the most generic fashion possible. */ + + if (pthread_atfork(NULL, NULL, reset_cached_pid) != 0) { + /* OOM? Let's try again later */ + cached_pid = CACHED_PID_UNSET; + return new_pid; + } + + installed = true; + } + + cached_pid = new_pid; + return new_pid; + } + + case CACHED_PID_BUSY: /* Somebody else is currently initializing */ + return raw_getpid(); + + default: /* Properly initialized */ + return current_value; + } +} + +int must_be_root(void) { + + if (geteuid() == 0) + return 0; + + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be root."); +} + +static void restore_sigsetp(sigset_t **ssp) { + if (*ssp) + (void) sigprocmask(SIG_SETMASK, *ssp, NULL); +} + +pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) { + size_t ps; + pid_t pid; + void *mystack; + + /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports + * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping. + * + * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent + * is threaded these locks will be in an undefined state in the child, and hence memory allocations + * are likely going to run into deadlocks. 
Hence: if you use this function make sure your parent is + * strictly single-threaded or your child never calls malloc(). */ + + assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID| + CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0); + + /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that + * the net effect is that the child will have the start of its stack inside the stack of the parent, + * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But + * since we don't want to deal with differences between systems where the stack grows backwards or + * forwards we'll allocate one more and place the stack address in the middle. Except that we also + * want it page aligned, hence we'll allocate one page more. Makes 3. */ + + ps = page_size(); + mystack = alloca(ps*3); + mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */ + mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */ + +#if HAVE_CLONE + pid = clone(fn, mystack, flags, userdata); +#else + pid = __clone2(fn, mystack, ps, flags, userdata); +#endif + if (pid < 0) + return -errno; + + return pid; +} + +static int fork_flags_to_signal(ForkFlags flags) { + return (flags & FORK_DEATHSIG_SIGTERM) ? SIGTERM : + (flags & FORK_DEATHSIG_SIGINT) ? 
SIGINT : + SIGKILL; +} + +int safe_fork_full( + const char *name, + const int stdio_fds[3], + const int except_fds[], + size_t n_except_fds, + ForkFlags flags, + pid_t *ret_pid) { + + pid_t original_pid, pid; + sigset_t saved_ss, ss; + _unused_ _cleanup_(restore_sigsetp) sigset_t *saved_ssp = NULL; + bool block_signals = false, block_all = false, intermediary = false; + int prio, r; + + assert(!FLAGS_SET(flags, FORK_DETACH) || !ret_pid); + assert(!FLAGS_SET(flags, FORK_DETACH|FORK_WAIT)); + + /* A wrapper around fork(), that does a couple of important initializations in addition to mere forking. Always + * returns the child's PID in *ret_pid. Returns == 0 in the child, and > 0 in the parent. */ + + prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG; + + original_pid = getpid_cached(); + + if (flags & FORK_FLUSH_STDIO) { + fflush(stdout); + fflush(stderr); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */ + } + + if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT)) { + /* We temporarily block all signals, so that the new child has them blocked initially. This + * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for + * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) 
*/ + + assert_se(sigfillset(&ss) >= 0); + block_signals = block_all = true; + + } else if (flags & FORK_WAIT) { + /* Let's block SIGCHLD at least, so that we can safely watch for the child process */ + + assert_se(sigemptyset(&ss) >= 0); + assert_se(sigaddset(&ss, SIGCHLD) >= 0); + block_signals = true; + } + + if (block_signals) { + if (sigprocmask(SIG_SETMASK, &ss, &saved_ss) < 0) + return log_full_errno(prio, errno, "Failed to set signal mask: %m"); + saved_ssp = &saved_ss; + } + + if (FLAGS_SET(flags, FORK_DETACH)) { + assert(!FLAGS_SET(flags, FORK_WAIT)); + assert(!ret_pid); + + /* Fork off intermediary child if needed */ + + r = is_reaper_process(); + if (r < 0) + return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m"); + + if (!r) { + /* Not a reaper process, hence do a double fork() so we are reparented to one */ + + pid = fork(); + if (pid < 0) + return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name)); + if (pid > 0) { + log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid); + return 1; /* return in the parent */ + } + + intermediary = true; + } + } + + if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS)) != 0) + pid = raw_clone(SIGCHLD| + (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) | + (FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0)); + else + pid = fork(); + if (pid < 0) + return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name)); + if (pid > 0) { + + /* If we are in the intermediary process, exit now */ + if (intermediary) + _exit(EXIT_SUCCESS); + + /* We are in the parent process */ + log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid); + + if (flags & FORK_WAIT) { + if (block_all) { + /* undo everything except SIGCHLD */ + ss = saved_ss; + assert_se(sigaddset(&ss, SIGCHLD) >= 0); + (void) sigprocmask(SIG_SETMASK, &ss, NULL); + } + + r = wait_for_terminate_and_check(name, pid, (flags & FORK_LOG ? 
WAIT_LOG : 0)); + if (r < 0) + return r; + if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */ + return -EPROTO; + } + + if (ret_pid) + *ret_pid = pid; + + return 1; + } + + /* We are in the child process */ + + /* Restore signal mask manually */ + saved_ssp = NULL; + + if (flags & FORK_REOPEN_LOG) { + /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */ + log_close(); + log_set_open_when_needed(true); + log_settle_target(); + } + + if (name) { + r = rename_process(name); + if (r < 0) + log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG, + r, "Failed to rename process, ignoring: %m"); + } + + if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL)) + if (prctl(PR_SET_PDEATHSIG, fork_flags_to_signal(flags)) < 0) { + log_full_errno(prio, errno, "Failed to set death signal: %m"); + _exit(EXIT_FAILURE); + } + + if (flags & FORK_RESET_SIGNALS) { + r = reset_all_signal_handlers(); + if (r < 0) { + log_full_errno(prio, r, "Failed to reset signal handlers: %m"); + _exit(EXIT_FAILURE); + } + + /* This implicitly undoes the signal mask stuff we did before the fork()ing above */ + r = reset_signal_mask(); + if (r < 0) { + log_full_errno(prio, r, "Failed to reset signal mask: %m"); + _exit(EXIT_FAILURE); + } + } else if (block_signals) { /* undo what we did above */ + if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) { + log_full_errno(prio, errno, "Failed to restore signal mask: %m"); + _exit(EXIT_FAILURE); + } + } + + if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL|FORK_DEATHSIG_SIGINT)) { + pid_t ppid; + /* Let's see if the parent PID is still the one we started from? If not, then the parent + * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */ + + ppid = getppid(); + if (ppid == 0) + /* Parent is in a different PID namespace. 
*/; + else if (ppid != original_pid) { + int sig = fork_flags_to_signal(flags); + log_debug("Parent died early, raising %s.", signal_to_string(sig)); + (void) raise(sig); + _exit(EXIT_FAILURE); + } + } + + if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) { + /* Optionally, make sure we never propagate mounts to the host. */ + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) { + log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m"); + _exit(EXIT_FAILURE); + } + } + + if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) { + assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS)); + + /* Optionally, overmount new tmpfs instance on /tmp/. */ + r = mount_nofollow("tmpfs", "/tmp", "tmpfs", + MS_NOSUID|MS_NODEV, + "mode=01777" TMPFS_LIMITS_RUN); + if (r < 0) { + log_full_errno(prio, r, "Failed to overmount /tmp/: %m"); + _exit(EXIT_FAILURE); + } + } + + if (flags & FORK_REARRANGE_STDIO) { + if (stdio_fds) { + r = rearrange_stdio(stdio_fds[0], stdio_fds[1], stdio_fds[2]); + if (r < 0) { + log_full_errno(prio, r, "Failed to rearrange stdio fds: %m"); + _exit(EXIT_FAILURE); + } + } else { + r = make_null_stdio(); + if (r < 0) { + log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m"); + _exit(EXIT_FAILURE); + } + } + } else if (flags & FORK_STDOUT_TO_STDERR) { + if (dup2(STDERR_FILENO, STDOUT_FILENO) < 0) { + log_full_errno(prio, errno, "Failed to connect stdout to stderr: %m"); + _exit(EXIT_FAILURE); + } + } + + if (flags & FORK_CLOSE_ALL_FDS) { + /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */ + log_close(); + + r = close_all_fds(except_fds, n_except_fds); + if (r < 0) { + log_full_errno(prio, r, "Failed to close all file descriptors: %m"); + _exit(EXIT_FAILURE); + } + } + + if (flags & FORK_CLOEXEC_OFF) { + r = fd_cloexec_many(except_fds, n_except_fds, false); + if (r < 0) { + log_full_errno(prio, r, "Failed to turn off O_CLOEXEC on file descriptors: %m"); + _exit(EXIT_FAILURE); + 
} + } + + /* When we were asked to reopen the logs, do so again now */ + if (flags & FORK_REOPEN_LOG) { + log_open(); + log_set_open_when_needed(false); + } + + if (flags & FORK_RLIMIT_NOFILE_SAFE) { + r = rlimit_nofile_safe(); + if (r < 0) { + log_full_errno(prio, r, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m"); + _exit(EXIT_FAILURE); + } + } + + if (!FLAGS_SET(flags, FORK_KEEP_NOTIFY_SOCKET)) { + r = RET_NERRNO(unsetenv("NOTIFY_SOCKET")); + if (r < 0) { + log_full_errno(prio, r, "Failed to unset $NOTIFY_SOCKET: %m"); + _exit(EXIT_FAILURE); + } + } + + if (ret_pid) + *ret_pid = getpid_cached(); + + return 0; +} + +int namespace_fork( + const char *outer_name, + const char *inner_name, + const int except_fds[], + size_t n_except_fds, + ForkFlags flags, + int pidns_fd, + int mntns_fd, + int netns_fd, + int userns_fd, + int root_fd, + pid_t *ret_pid) { + + int r; + + /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle + * process. This ensures that we are fully a member of the destination namespace, with pidns and all, so that + * /proc/self/fd works correctly. */ + + r = safe_fork_full(outer_name, + NULL, + except_fds, n_except_fds, + (flags|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid); + if (r < 0) + return r; + if (r == 0) { + pid_t pid; + + /* Child */ + + r = namespace_enter(pidns_fd, mntns_fd, netns_fd, userns_fd, root_fd); + if (r < 0) { + log_full_errno(FLAGS_SET(flags, FORK_LOG) ? 
LOG_ERR : LOG_DEBUG, r, "Failed to join namespace: %m"); + _exit(EXIT_FAILURE); + } + + /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */ + r = safe_fork_full(inner_name, + NULL, + except_fds, n_except_fds, + flags & ~(FORK_WAIT|FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO), &pid); + if (r < 0) + _exit(EXIT_FAILURE); + if (r == 0) { + /* Child */ + if (ret_pid) + *ret_pid = pid; + return 0; + } + + r = wait_for_terminate_and_check(inner_name, pid, FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(r); + } + + return 1; +} + +int set_oom_score_adjust(int value) { + char t[DECIMAL_STR_MAX(int)]; + + xsprintf(t, "%i", value); + + return write_string_file("/proc/self/oom_score_adj", t, + WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER); +} + +int get_oom_score_adjust(int *ret) { + _cleanup_free_ char *t = NULL; + int r, a; + + r = read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX, &t, NULL); + if (r < 0) + return r; + + delete_trailing_chars(t, WHITESPACE); + + assert_se(safe_atoi(t, &a) >= 0); + assert_se(oom_score_adjust_is_valid(a)); + + if (ret) + *ret = a; + return 0; +} + +int pidfd_get_pid(int fd, pid_t *ret) { + char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)]; + _cleanup_free_ char *fdinfo = NULL; + char *p; + int r; + + /* Converts a pidfd into a pid. Well known errors: + * + * -EBADF → fd invalid + * -ENOSYS → /proc/ not mounted + * -ENOTTY → fd valid, but not a pidfd + * -EREMOTE → fd valid, but pid is in another namespace we cannot translate to the local one + * -ESRCH → fd valid, but process is already reaped + */ + + if (fd < 0) + return -EBADF; + + xsprintf(path, "/proc/self/fdinfo/%i", fd); + + r = read_full_virtual_file(path, &fdinfo, NULL); + if (r == -ENOENT) /* if fdinfo doesn't exist we assume the process does not exist */ + return proc_mounted() > 0 ? 
-EBADF : -ENOSYS; + if (r < 0) + return r; + + p = find_line_startswith(fdinfo, "Pid:"); + if (!p) + return -ENOTTY; /* not a pidfd? */ + + p += strspn(p, WHITESPACE); + p[strcspn(p, WHITESPACE)] = 0; + + if (streq(p, "0")) + return -EREMOTE; /* PID is in foreign PID namespace? */ + if (streq(p, "-1")) + return -ESRCH; /* refers to reaped process? */ + + return parse_pid(p, ret); +} + +int pidfd_verify_pid(int pidfd, pid_t pid) { + pid_t current_pid; + int r; + + assert(pidfd >= 0); + assert(pid > 0); + + r = pidfd_get_pid(pidfd, &current_pid); + if (r < 0) + return r; + + return current_pid != pid ? -ESRCH : 0; +} + +static int rlimit_to_nice(rlim_t limit) { + if (limit <= 1) + return PRIO_MAX-1; /* i.e. 19 */ + + if (limit >= -PRIO_MIN + PRIO_MAX) + return PRIO_MIN; /* i.e. -20 */ + + return PRIO_MAX - (int) limit; +} + +int setpriority_closest(int priority) { + int current, limit, saved_errno; + struct rlimit highest; + + /* Try to set requested nice level */ + if (setpriority(PRIO_PROCESS, 0, priority) >= 0) + return 1; + + /* Permission failed */ + saved_errno = -errno; + if (!ERRNO_IS_PRIVILEGE(saved_errno)) + return saved_errno; + + errno = 0; + current = getpriority(PRIO_PROCESS, 0); + if (errno != 0) + return -errno; + + if (priority == current) + return 1; + + /* Hmm, we'd expect that raising the nice level from our status quo would always work. 
If it doesn't, + * then the whole setpriority() system call is blocked to us, hence let's propagate the error + * right-away */ + if (priority > current) + return saved_errno; + + if (getrlimit(RLIMIT_NICE, &highest) < 0) + return -errno; + + limit = rlimit_to_nice(highest.rlim_cur); + + /* We are already less nice than limit allows us */ + if (current < limit) { + log_debug("Cannot raise nice level, permissions and the resource limit do not allow it."); + return 0; + } + + /* Push to the allowed limit */ + if (setpriority(PRIO_PROCESS, 0, limit) < 0) + return -errno; + + log_debug("Cannot set requested nice level (%i), used next best (%i).", priority, limit); + return 0; +} + +_noreturn_ void freeze(void) { + log_close(); + + /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use + * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function + * to be compatible with being called from signal handlers. */ + (void) close_all_fds_without_malloc(NULL, 0); + + /* Let's not freeze right away, but keep reaping zombies. */ + for (;;) { + siginfo_t si = {}; + + if (waitid(P_ALL, 0, &si, WEXITED) < 0 && errno != EINTR) + break; + } + + /* waitid() failed with an unexpected error, things are really borked. Freeze now! */ + for (;;) + pause(); +} + +int get_process_threads(pid_t pid) { + _cleanup_free_ char *t = NULL; + const char *p; + int n, r; + + if (pid < 0) + return -EINVAL; + + p = procfs_file_alloca(pid, "status"); + + r = get_proc_field(p, "Threads", WHITESPACE, &t); + if (r == -ENOENT) + return proc_mounted() == 0 ? -ENOSYS : -ESRCH; + if (r < 0) + return r; + + r = safe_atoi(t, &n); + if (r < 0) + return r; + if (n < 0) + return -EINVAL; + + return n; +} + +int is_reaper_process(void) { + int b = 0; + + /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes + * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. 
*/ + + if (getpid_cached() == 1) + return true; + + if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0) + return -errno; + + return b != 0; +} + +int make_reaper_process(bool b) { + + if (getpid_cached() == 1) { + + if (!b) + return -EINVAL; + + return 0; + } + + /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all, + * to avoid any ambiguities */ + if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0) + return -errno; + + return 0; +} + +int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, pid_t *ret_pid) { + posix_spawnattr_t attr; + sigset_t mask; + pid_t pid; + int r; + + /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the + * caller will be blocked until the child either exits or exec's. The memory of the child will be + * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max + * issues. 
*/ + + assert(path); + assert(argv); + assert(ret_pid); + + assert_se(sigfillset(&mask) >= 0); + + r = posix_spawnattr_init(&attr); + if (r != 0) + return -r; /* These functions return a positive errno on failure */ + /* Set all signals to SIG_DFL */ + r = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF); + if (r != 0) + goto fail; + r = posix_spawnattr_setsigmask(&attr, &mask); + if (r != 0) + goto fail; + + r = posix_spawn(&pid, path, NULL, &attr, argv, envp); + if (r != 0) + goto fail; + + *ret_pid = pid; + + posix_spawnattr_destroy(&attr); + return 0; + +fail: + assert(r > 0); + posix_spawnattr_destroy(&attr); + return -r; +} + +int proc_dir_open(DIR **ret) { + DIR *d; + + assert(ret); + + d = opendir("/proc"); + if (!d) + return -errno; + + *ret = d; + return 0; +} + +int proc_dir_read(DIR *d, pid_t *ret) { + assert(d); + + for (;;) { + struct dirent *de; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + return -errno; + + break; + } + + if (!IN_SET(de->d_type, DT_DIR, DT_UNKNOWN)) + continue; + + if (parse_pid(de->d_name, ret) >= 0) + return 1; + } + + if (ret) + *ret = 0; + return 0; +} + +int proc_dir_read_pidref(DIR *d, PidRef *ret) { + int r; + + assert(d); + + for (;;) { + pid_t pid; + + r = proc_dir_read(d, &pid); + if (r < 0) + return r; + if (r == 0) + break; + + r = pidref_set_pid(ret, pid); + if (r == -ESRCH) /* gone by now? 
skip it */ + continue; + if (r < 0) + return r; + + return 1; + } + + if (ret) + *ret = PIDREF_NULL; + return 0; +} + +static const char *const sigchld_code_table[] = { + [CLD_EXITED] = "exited", + [CLD_KILLED] = "killed", + [CLD_DUMPED] = "dumped", + [CLD_TRAPPED] = "trapped", + [CLD_STOPPED] = "stopped", + [CLD_CONTINUED] = "continued", +}; + +DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int); + +static const char* const sched_policy_table[] = { + [SCHED_OTHER] = "other", + [SCHED_BATCH] = "batch", + [SCHED_IDLE] = "idle", + [SCHED_FIFO] = "fifo", + [SCHED_RR] = "rr", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX); diff --git a/src/basic/process-util.h b/src/basic/process-util.h new file mode 100644 index 0000000..af6cba1 --- /dev/null +++ b/src/basic/process-util.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "format-util.h" +#include "macro.h" +#include "namespace-util.h" +#include "time-util.h" + +#define procfs_file_alloca(pid, field) \ + ({ \ + pid_t _pid_ = (pid); \ + const char *_field_ = (field); \ + char *_r_; \ + if (_pid_ == 0) { \ + _r_ = newa(char, STRLEN("/proc/self/") + strlen(_field_) + 1); \ + strcpy(stpcpy(_r_, "/proc/self/"), _field_); \ + } else { \ + _r_ = newa(char, STRLEN("/proc/") + DECIMAL_STR_MAX(pid_t) + 1 + strlen(_field_) + 1); \ + sprintf(_r_, "/proc/" PID_FMT "/%s", _pid_, _field_); \ + } \ + (const char*) _r_; \ + }) + +typedef enum ProcessCmdlineFlags { + PROCESS_CMDLINE_COMM_FALLBACK = 1 << 0, + PROCESS_CMDLINE_USE_LOCALE = 1 << 1, + PROCESS_CMDLINE_QUOTE = 1 << 2, + PROCESS_CMDLINE_QUOTE_POSIX = 1 << 3, +} ProcessCmdlineFlags; + +int pid_get_comm(pid_t pid, char **ret); +int pidref_get_comm(const PidRef *pid, char **ret); +int pid_get_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret); +int pidref_get_cmdline(const 
PidRef *pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret); +int pid_get_cmdline_strv(pid_t pid, ProcessCmdlineFlags flags, char ***ret); +int pidref_get_cmdline_strv(const PidRef *pid, ProcessCmdlineFlags flags, char ***ret); +int get_process_exe(pid_t pid, char **ret); +int pid_get_uid(pid_t pid, uid_t *ret); +int pidref_get_uid(const PidRef *pid, uid_t *ret); +int get_process_gid(pid_t pid, gid_t *ret); +int get_process_capeff(pid_t pid, char **ret); +int get_process_cwd(pid_t pid, char **ret); +int get_process_root(pid_t pid, char **ret); +int get_process_environ(pid_t pid, char **ret); +int get_process_ppid(pid_t pid, pid_t *ret); +int get_process_umask(pid_t pid, mode_t *ret); + +int container_get_leader(const char *machine, pid_t *pid); + +int namespace_get_leader(pid_t pid, NamespaceType type, pid_t *ret); + +int wait_for_terminate(pid_t pid, siginfo_t *status); + +typedef enum WaitFlags { + WAIT_LOG_ABNORMAL = 1 << 0, + WAIT_LOG_NON_ZERO_EXIT_STATUS = 1 << 1, + + /* A shortcut for requesting the most complete logging */ + WAIT_LOG = WAIT_LOG_ABNORMAL|WAIT_LOG_NON_ZERO_EXIT_STATUS, +} WaitFlags; + +int wait_for_terminate_and_check(const char *name, pid_t pid, WaitFlags flags); +int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout); + +void sigkill_wait(pid_t pid); +void sigkill_waitp(pid_t *pid); +void sigterm_wait(pid_t pid); +void sigkill_nowait(pid_t pid); +void sigkill_nowaitp(pid_t *pid); + +int kill_and_sigcont(pid_t pid, int sig); + +int pid_is_kernel_thread(pid_t pid); +int pidref_is_kernel_thread(const PidRef *pid); + +int getenv_for_pid(pid_t pid, const char *field, char **_value); + +int pid_is_alive(pid_t pid); +int pidref_is_alive(const PidRef *pidref); +int pid_is_unwaited(pid_t pid); +int pidref_is_unwaited(const PidRef *pidref); +int pid_is_my_child(pid_t pid); +int pidref_is_my_child(const PidRef *pidref); +int pid_from_same_root_fs(pid_t pid); + +bool is_main_thread(void); + +bool oom_score_adjust_is_valid(int oa); + 
+#ifndef PERSONALITY_INVALID +/* personality(7) documents that 0xffffffffUL is used for querying the + * current personality, hence let's use that here as error + * indicator. */ +#define PERSONALITY_INVALID 0xffffffffLU +#endif + +unsigned long personality_from_string(const char *p); +const char *personality_to_string(unsigned long); + +int safe_personality(unsigned long p); +int opinionated_personality(unsigned long *ret); + +const char *sigchld_code_to_string(int i) _const_; +int sigchld_code_from_string(const char *s) _pure_; + +int sched_policy_to_string_alloc(int i, char **s); +int sched_policy_from_string(const char *s); + +static inline pid_t PTR_TO_PID(const void *p) { + return (pid_t) ((uintptr_t) p); +} + +static inline void* PID_TO_PTR(pid_t pid) { + return (void*) ((uintptr_t) pid); +} + +void valgrind_summary_hack(void); + +int pid_compare_func(const pid_t *a, const pid_t *b); + +static inline bool nice_is_valid(int n) { + return n >= PRIO_MIN && n < PRIO_MAX; +} + +static inline bool sched_policy_is_valid(int i) { + return IN_SET(i, SCHED_OTHER, SCHED_BATCH, SCHED_IDLE, SCHED_FIFO, SCHED_RR); +} + +static inline bool sched_priority_is_valid(int i) { + return i >= 0 && i <= sched_get_priority_max(SCHED_RR); +} + +static inline bool pid_is_valid(pid_t p) { + return p > 0; +} + +pid_t getpid_cached(void); +void reset_cached_pid(void); + +int must_be_root(void); + +pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata); + +/* 💣 Note that FORK_NEW_USERNS + FORK_NEW_MOUNTNS should not be called in threaded programs, because they + * cause us to use raw_clone() which does not synchronize the glibc malloc() locks, and thus will cause + * deadlocks if the parent uses threads and the child does memory allocations. Hence: if the parent is + * threaded these flags may not be used. These flags cannot be used if the parent uses threads or the child + * uses malloc(). 
💣 */ +typedef enum ForkFlags { + FORK_RESET_SIGNALS = 1 << 0, /* Reset all signal handlers and signal mask */ + FORK_CLOSE_ALL_FDS = 1 << 1, /* Close all open file descriptors in the child, except for 0,1,2 */ + FORK_DEATHSIG_SIGTERM = 1 << 2, /* Set PR_DEATHSIG in the child to SIGTERM */ + FORK_DEATHSIG_SIGINT = 1 << 3, /* Set PR_DEATHSIG in the child to SIGINT */ + FORK_DEATHSIG_SIGKILL = 1 << 4, /* Set PR_DEATHSIG in the child to SIGKILL */ + FORK_REARRANGE_STDIO = 1 << 5, /* Connect 0,1,2 to specified fds or /dev/null */ + FORK_REOPEN_LOG = 1 << 6, /* Reopen log connection */ + FORK_LOG = 1 << 7, /* Log above LOG_DEBUG log level about failures */ + FORK_WAIT = 1 << 8, /* Wait until child exited */ + FORK_NEW_MOUNTNS = 1 << 9, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ + FORK_MOUNTNS_SLAVE = 1 << 10, /* Make child's mount namespace MS_SLAVE */ + FORK_PRIVATE_TMP = 1 << 11, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */ + FORK_RLIMIT_NOFILE_SAFE = 1 << 12, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */ + FORK_STDOUT_TO_STDERR = 1 << 13, /* Make stdout a copy of stderr */ + FORK_FLUSH_STDIO = 1 << 14, /* fflush() stdout (and stderr) before forking */ + FORK_NEW_USERNS = 1 << 15, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ + FORK_CLOEXEC_OFF = 1 << 16, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */ + FORK_KEEP_NOTIFY_SOCKET = 1 << 17, /* Unless this specified, $NOTIFY_SOCKET will be unset. 
*/ + FORK_DETACH = 1 << 18, /* Double fork if needed to ensure PID1/subreaper is parent */ +} ForkFlags; + +int safe_fork_full( + const char *name, + const int stdio_fds[3], + const int except_fds[], + size_t n_except_fds, + ForkFlags flags, + pid_t *ret_pid); + +static inline int safe_fork(const char *name, ForkFlags flags, pid_t *ret_pid) { + return safe_fork_full(name, NULL, NULL, 0, flags, ret_pid); +} + +int namespace_fork(const char *outer_name, const char *inner_name, const int except_fds[], size_t n_except_fds, ForkFlags flags, int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd, pid_t *ret_pid); + +int set_oom_score_adjust(int value); +int get_oom_score_adjust(int *ret); + +/* The highest possible (theoretic) pid_t value on this architecture. */ +#define PID_T_MAX ((pid_t) INT32_MAX) +/* The maximum number of concurrent processes Linux allows on this architecture, as well as the highest valid PID value + * the kernel will potentially assign. This reflects a value compiled into the kernel (PID_MAX_LIMIT), and sets the + * upper boundary on what may be written to the /proc/sys/kernel/pid_max sysctl (but do note that the sysctl is off by + * 1, since PID 0 can never exist and there can hence only be one process less than the limit would suggest). Since + * these values are documented in proc(5) we feel quite confident that they are stable enough for the near future at + * least to define them here too. 
*/ +#define TASKS_MAX 4194303U + +assert_cc(TASKS_MAX <= (unsigned long) PID_T_MAX); + +/* Like TAKE_PTR() but for pid_t, resetting them to 0 */ +#define TAKE_PID(pid) TAKE_GENERIC(pid, pid_t, 0) + +int pidfd_get_pid(int fd, pid_t *ret); +int pidfd_verify_pid(int pidfd, pid_t pid); + +int setpriority_closest(int priority); + +_noreturn_ void freeze(void); + +int get_process_threads(pid_t pid); + +int is_reaper_process(void); +int make_reaper_process(bool b); + +int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, pid_t *ret_pid); + +int proc_dir_open(DIR **ret); +int proc_dir_read(DIR *d, pid_t *ret); +int proc_dir_read_pidref(DIR *d, PidRef *ret); diff --git a/src/basic/procfs-util.c b/src/basic/procfs-util.c new file mode 100644 index 0000000..d7cfcd9 --- /dev/null +++ b/src/basic/procfs-util.c @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "constants.h" +#include "fd-util.h" +#include "fileio.h" +#include "parse-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "stdio-util.h" +#include "string-util.h" + +int procfs_get_pid_max(uint64_t *ret) { + _cleanup_free_ char *value = NULL; + int r; + + assert(ret); + + r = read_one_line_file("/proc/sys/kernel/pid_max", &value); + if (r < 0) + return r; + + return safe_atou64(value, ret); +} + +int procfs_get_threads_max(uint64_t *ret) { + _cleanup_free_ char *value = NULL; + int r; + + assert(ret); + + r = read_one_line_file("/proc/sys/kernel/threads-max", &value); + if (r < 0) + return r; + + return safe_atou64(value, ret); +} + +int procfs_tasks_set_limit(uint64_t limit) { + char buffer[DECIMAL_STR_MAX(uint64_t)+1]; + uint64_t pid_max; + int r; + + if (limit == 0) /* This makes no sense, we are userspace and hence count as tasks too, and we want to live, + * hence the limit conceptually has to be above 0. 
Also, most likely if anyone asks for a zero + * limit they probably mean "no limit", hence let's better refuse this to avoid + * confusion. */ + return -EINVAL; + + /* The Linux kernel doesn't allow this value to go below 20, hence don't allow this either, higher values than + * TASKS_MAX are not accepted by the pid_max sysctl. We'll treat anything this high as "unbounded" and hence + * set it to the maximum. */ + limit = CLAMP(limit, 20U, TASKS_MAX); + + r = procfs_get_pid_max(&pid_max); + if (r < 0) + return r; + + /* As pid_max is about the numeric pid_t range we'll bump it if necessary, but only ever increase it, never + * decrease it, as threads-max is the much more relevant sysctl. */ + if (limit > pid_max-1) { + sprintf(buffer, "%" PRIu64, limit+1); /* Add one, since PID 0 is not a valid PID */ + r = write_string_file("/proc/sys/kernel/pid_max", buffer, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + } + + sprintf(buffer, "%" PRIu64, limit); + r = write_string_file("/proc/sys/kernel/threads-max", buffer, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) { + uint64_t threads_max; + + /* Hmm, we couldn't write this? If so, maybe it was already set properly? In that case let's not + * generate an error */ + + if (procfs_get_threads_max(&threads_max) < 0) + return r; /* return original error */ + + if (MIN(pid_max - 1, threads_max) != limit) + return r; /* return original error */ + + /* Yay! Value set already matches what we were trying to set, hence consider this a success. */ + } + + return 0; +} + +int procfs_tasks_get_current(uint64_t *ret) { + _cleanup_free_ char *value = NULL; + const char *p, *nr; + size_t n; + int r; + + assert(ret); + + r = read_one_line_file("/proc/loadavg", &value); + if (r < 0) + return r; + + /* Look for the second part of the fourth field, which is separated by a slash from the first part. None of the + * earlier fields use a slash, hence let's use this to find the right spot. 
*/ + p = strchr(value, '/'); + if (!p) + return -EINVAL; + + p++; + n = strspn(p, DIGITS); + nr = strndupa_safe(p, n); + + return safe_atou64(nr, ret); +} + +static uint64_t calc_gcd64(uint64_t a, uint64_t b) { + + while (b > 0) { + uint64_t t; + + t = a % b; + + a = b; + b = t; + } + + return a; +} + +int procfs_cpu_get_usage(nsec_t *ret) { + _cleanup_free_ char *first_line = NULL; + unsigned long user_ticks, nice_ticks, system_ticks, irq_ticks, softirq_ticks, + guest_ticks = 0, guest_nice_ticks = 0; + long ticks_per_second; + uint64_t sum, gcd, a, b; + const char *p; + int r; + + assert(ret); + + r = read_one_line_file("/proc/stat", &first_line); + if (r < 0) + return r; + + p = first_word(first_line, "cpu"); + if (!p) + return -EINVAL; + + if (sscanf(p, "%lu %lu %lu %*u %*u %lu %lu %*u %lu %lu", + &user_ticks, + &nice_ticks, + &system_ticks, + &irq_ticks, + &softirq_ticks, + &guest_ticks, + &guest_nice_ticks) < 5) /* we only insist on the first five fields */ + return -EINVAL; + + ticks_per_second = sysconf(_SC_CLK_TCK); + if (ticks_per_second < 0) + return -errno; + assert(ticks_per_second > 0); + + sum = (uint64_t) user_ticks + (uint64_t) nice_ticks + (uint64_t) system_ticks + + (uint64_t) irq_ticks + (uint64_t) softirq_ticks + + (uint64_t) guest_ticks + (uint64_t) guest_nice_ticks; + + /* Let's reduce this fraction before we apply it to avoid overflows when converting this to nsec */ + gcd = calc_gcd64(NSEC_PER_SEC, ticks_per_second); + + a = (uint64_t) NSEC_PER_SEC / gcd; + b = (uint64_t) ticks_per_second / gcd; + + *ret = DIV_ROUND_UP((nsec_t) sum * (nsec_t) a, (nsec_t) b); + return 0; +} + +int convert_meminfo_value_to_uint64_bytes(const char *word, uint64_t *ret) { + _cleanup_free_ char *w = NULL; + char *digits, *e; + uint64_t v; + size_t n; + int r; + + assert(word); + assert(ret); + + w = strdup(word); + if (!w) + return -ENOMEM; + + /* Determine length of numeric value */ + n = strspn(w, WHITESPACE); + digits = w + n; + n = strspn(digits, DIGITS); + 
if (n == 0) + return -EINVAL; + e = digits + n; + + /* Ensure the line ends in " kB" */ + n = strspn(e, WHITESPACE); + if (n == 0) + return -EINVAL; + if (!streq(e + n, "kB")) + return -EINVAL; + + *e = 0; + r = safe_atou64(digits, &v); + if (r < 0) + return r; + if (v == UINT64_MAX) + return -EINVAL; + + if (v > UINT64_MAX/1024) + return -EOVERFLOW; + + *ret = v * 1024U; + return 0; +} + +int procfs_memory_get(uint64_t *ret_total, uint64_t *ret_used) { + uint64_t mem_total = UINT64_MAX, mem_available = UINT64_MAX; + _cleanup_fclose_ FILE *f = NULL; + int r; + + f = fopen("/proc/meminfo", "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + uint64_t *v; + char *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; /* EOF: Couldn't find one or both fields? */ + + p = first_word(line, "MemTotal:"); + if (p) + v = &mem_total; + else { + p = first_word(line, "MemAvailable:"); + if (p) + v = &mem_available; + else + continue; + } + + r = convert_meminfo_value_to_uint64_bytes(p, v); + if (r < 0) + return r; + + if (mem_total != UINT64_MAX && mem_available != UINT64_MAX) + break; + } + + if (mem_available > mem_total) + return -EINVAL; + + if (ret_total) + *ret_total = mem_total; + if (ret_used) + *ret_used = mem_total - mem_available; + return 0; +} diff --git a/src/basic/procfs-util.h b/src/basic/procfs-util.h new file mode 100644 index 0000000..eb8c773 --- /dev/null +++ b/src/basic/procfs-util.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "time-util.h" + +int procfs_get_pid_max(uint64_t *ret); +int procfs_get_threads_max(uint64_t *ret); + +int procfs_tasks_set_limit(uint64_t limit); +int procfs_tasks_get_current(uint64_t *ret); + +int procfs_cpu_get_usage(nsec_t *ret); + +int procfs_memory_get(uint64_t *ret_total, uint64_t *ret_used); +static inline int procfs_memory_get_used(uint64_t *ret) { + return procfs_memory_get(NULL, 
ret); +} + +int convert_meminfo_value_to_uint64_bytes(const char *word, uint64_t *ret); diff --git a/src/basic/psi-util.c b/src/basic/psi-util.c new file mode 100644 index 0000000..2a43b03 --- /dev/null +++ b/src/basic/psi-util.c @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "missing_threads.h" +#include "parse-util.h" +#include "psi-util.h" +#include "string-util.h" +#include "stat-util.h" +#include "strv.h" + +int read_resource_pressure(const char *path, PressureType type, ResourcePressure *ret) { + _cleanup_free_ char *line = NULL; + _cleanup_fclose_ FILE *f = NULL; + unsigned field_filled = 0; + ResourcePressure rp = {}; + const char *t, *cline; + char *word; + int r; + + assert(path); + assert(IN_SET(type, PRESSURE_TYPE_SOME, PRESSURE_TYPE_FULL)); + assert(ret); + + if (type == PRESSURE_TYPE_SOME) + t = "some"; + else if (type == PRESSURE_TYPE_FULL) + t = "full"; + else + return -EINVAL; + + r = fopen_unlocked(path, "re", &f); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *l = NULL; + char *w; + + r = read_line(f, LONG_LINE_MAX, &l); + if (r < 0) + return r; + if (r == 0) + break; + + w = first_word(l, t); + if (w) { + line = TAKE_PTR(l); + cline = w; + break; + } + } + + if (!line) + return -ENODATA; + + /* extracts either avgX=Y.Z or total=X */ + while ((r = extract_first_word(&cline, &word, NULL, 0)) > 0) { + _cleanup_free_ char *w = word; + const char *v; + + if ((v = startswith(w, "avg10="))) { + if (field_filled & (1U << 0)) + return -EINVAL; + + field_filled |= 1U << 0; + r = parse_loadavg_fixed_point(v, &rp.avg10); + } else if ((v = startswith(w, "avg60="))) { + if (field_filled & (1U << 1)) + return -EINVAL; + + field_filled |= 1U << 1; + r = parse_loadavg_fixed_point(v, &rp.avg60); + } else if ((v = startswith(w, "avg300="))) { + if (field_filled & (1U << 2)) + 
return -EINVAL; + + field_filled |= 1U << 2; + r = parse_loadavg_fixed_point(v, &rp.avg300); + } else if ((v = startswith(w, "total="))) { + if (field_filled & (1U << 3)) + return -EINVAL; + + field_filled |= 1U << 3; + r = safe_atou64(v, &rp.total); + } else + continue; + + if (r < 0) + return r; + } + + if (r < 0) + return r; + + if (field_filled != 15U) + return -EINVAL; + + *ret = rp; + return 0; +} + +int is_pressure_supported(void) { + static thread_local int cached = -1; + int r; + + /* The pressure files, both under /proc/ and in cgroups, will exist even if the kernel has PSI + * support disabled; we have to read the file to make sure it doesn't return -EOPNOTSUPP */ + + if (cached >= 0) + return cached; + + FOREACH_STRING(p, "/proc/pressure/cpu", "/proc/pressure/io", "/proc/pressure/memory") { + r = read_virtual_file(p, 0, NULL, NULL); + if (r == -ENOENT || ERRNO_IS_NEG_NOT_SUPPORTED(r)) + return (cached = false); + if (r < 0) + return r; + } + + return (cached = true); +} diff --git a/src/basic/psi-util.h b/src/basic/psi-util.h new file mode 100644 index 0000000..bf8f4fe --- /dev/null +++ b/src/basic/psi-util.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "parse-util.h" +#include "time-util.h" + +typedef enum PressureType { + PRESSURE_TYPE_SOME, + PRESSURE_TYPE_FULL, +} PressureType; + +/* Averages are stored in fixed-point with 11 bit fractions */ +typedef struct ResourcePressure { + loadavg_t avg10; + loadavg_t avg60; + loadavg_t avg300; + usec_t total; +} ResourcePressure; + +/** Upstream 4.20+ format + * + * some avg10=0.22 avg60=0.17 avg300=1.11 total=58761459 + * full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525 + */ +int read_resource_pressure(const char *path, PressureType type, ResourcePressure *ret); + +/* Was the kernel compiled with CONFIG_PSI=y? 1 if yes, 0 if not, negative on error. 
*/ +int is_pressure_supported(void); + +/* Default parameters for memory pressure watch logic in sd-event and PID 1 */ +#define MEMORY_PRESSURE_DEFAULT_TYPE "some" +#define MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC (200 * USEC_PER_MSEC) +#define MEMORY_PRESSURE_DEFAULT_WINDOW_USEC (2 * USEC_PER_SEC) diff --git a/src/basic/pthread-util.h b/src/basic/pthread-util.h new file mode 100644 index 0000000..113485d --- /dev/null +++ b/src/basic/pthread-util.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +static inline pthread_mutex_t* pthread_mutex_lock_assert(pthread_mutex_t *mutex) { + assert_se(pthread_mutex_lock(mutex) == 0); + return mutex; +} + +static inline void pthread_mutex_unlock_assertp(pthread_mutex_t **mutexp) { + if (*mutexp) + assert_se(pthread_mutex_unlock(*mutexp) == 0); +} diff --git a/src/basic/random-util.c b/src/basic/random-util.c new file mode 100644 index 0000000..c7277ad --- /dev/null +++ b/src/basic/random-util.c @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_SYS_AUXV_H +# include +#endif + +#include "alloc-util.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "missing_random.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "parse-util.h" +#include "process-util.h" +#include "random-util.h" +#include "sha256.h" +#include "time-util.h" + +/* This is a "best effort" kind of thing, but has no real security value. So, this should only be used by + * random_bytes(), which is not meant for crypto. This could be made better, but we're *not* trying to roll a + * userspace prng here, or even have forward secrecy, but rather just do the shortest thing that is at least + * better than libc rand(). 
*/ +static void fallback_random_bytes(void *p, size_t n) { + static thread_local uint64_t fallback_counter = 0; + struct { + char label[32]; + uint64_t call_id, block_id; + usec_t stamp_mono, stamp_real; + pid_t pid, tid; + uint8_t auxval[16]; + } state = { + /* Arbitrary domain separation to prevent other usage of AT_RANDOM from clashing. */ + .label = "systemd fallback random bytes v1", + .call_id = fallback_counter++, + .stamp_mono = now(CLOCK_MONOTONIC), + .stamp_real = now(CLOCK_REALTIME), + .pid = getpid_cached(), + .tid = gettid(), + }; + +#if HAVE_SYS_AUXV_H + memcpy(state.auxval, ULONG_TO_PTR(getauxval(AT_RANDOM)), sizeof(state.auxval)); +#endif + + while (n > 0) { + struct sha256_ctx ctx; + + sha256_init_ctx(&ctx); + sha256_process_bytes(&state, sizeof(state), &ctx); + if (n < SHA256_DIGEST_SIZE) { + uint8_t partial[SHA256_DIGEST_SIZE]; + sha256_finish_ctx(&ctx, partial); + memcpy(p, partial, n); + break; + } + sha256_finish_ctx(&ctx, p); + p = (uint8_t *) p + SHA256_DIGEST_SIZE; + n -= SHA256_DIGEST_SIZE; + ++state.block_id; + } +} + +void random_bytes(void *p, size_t n) { + static bool have_getrandom = true, have_grndinsecure = true; + _cleanup_close_ int fd = -EBADF; + + if (n == 0) + return; + + for (;;) { + ssize_t l; + + if (!have_getrandom) + break; + + l = getrandom(p, n, have_grndinsecure ? GRND_INSECURE : GRND_NONBLOCK); + if (l > 0) { + if ((size_t) l == n) + return; /* Done reading, success. */ + p = (uint8_t *) p + l; + n -= l; + continue; /* Interrupted by a signal; keep going. */ + } else if (l == 0) + break; /* Weird, so fallback to /dev/urandom. */ + else if (ERRNO_IS_NOT_SUPPORTED(errno)) { + have_getrandom = false; + break; /* No syscall, so fallback to /dev/urandom. */ + } else if (errno == EINVAL && have_grndinsecure) { + have_grndinsecure = false; + continue; /* No GRND_INSECURE; fallback to GRND_NONBLOCK. */ + } else if (errno == EAGAIN && !have_grndinsecure) + break; /* Will block, but no GRND_INSECURE, so fallback to /dev/urandom. 
*/ + + break; /* Unexpected, so just give up and fallback to /dev/urandom. */ + } + + fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd >= 0 && loop_read_exact(fd, p, n, false) == 0) + return; + + /* This is a terrible fallback. Oh well. */ + fallback_random_bytes(p, n); +} + +int crypto_random_bytes(void *p, size_t n) { + static bool have_getrandom = true, seen_initialized = false; + _cleanup_close_ int fd = -EBADF; + + if (n == 0) + return 0; + + for (;;) { + ssize_t l; + + if (!have_getrandom) + break; + + l = getrandom(p, n, 0); + if (l > 0) { + if ((size_t) l == n) + return 0; /* Done reading, success. */ + p = (uint8_t *) p + l; + n -= l; + continue; /* Interrupted by a signal; keep going. */ + } else if (l == 0) + return -EIO; /* Weird, should never happen. */ + else if (ERRNO_IS_NOT_SUPPORTED(errno)) { + have_getrandom = false; + break; /* No syscall, so fallback to /dev/urandom. */ + } + return -errno; + } + + if (!seen_initialized) { + _cleanup_close_ int ready_fd = -EBADF; + int r; + + ready_fd = open("/dev/random", O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (ready_fd < 0) + return -errno; + r = fd_wait_for_event(ready_fd, POLLIN, USEC_INFINITY); + if (r < 0) + return r; + seen_initialized = true; + } + + fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return -errno; + return loop_read_exact(fd, p, n, false); +} + +size_t random_pool_size(void) { + _cleanup_free_ char *s = NULL; + int r; + + /* Read pool size, if possible */ + r = read_one_line_file("/proc/sys/kernel/random/poolsize", &s); + if (r < 0) + log_debug_errno(r, "Failed to read pool size from kernel: %m"); + else { + unsigned sz; + + r = safe_atou(s, &sz); + if (r < 0) + log_debug_errno(r, "Failed to parse pool size: %s", s); + else + /* poolsize is in bits on 2.6, but we want bytes */ + return CLAMP(sz / 8, RANDOM_POOL_SIZE_MIN, RANDOM_POOL_SIZE_MAX); + } + + /* Use the minimum as default, if we can't retrieve the correct value */ + return RANDOM_POOL_SIZE_MIN; 
+} + +int random_write_entropy(int fd, const void *seed, size_t size, bool credit) { + _cleanup_close_ int opened_fd = -EBADF; + int r; + + assert(seed || size == 0); + + if (size == 0) + return 0; + + if (fd < 0) { + opened_fd = open("/dev/urandom", O_WRONLY|O_CLOEXEC|O_NOCTTY); + if (opened_fd < 0) + return -errno; + + fd = opened_fd; + } + + if (credit) { + _cleanup_free_ struct rand_pool_info *info = NULL; + + /* The kernel API only accepts "int" as entropy count (which is in bits), let's avoid any + * chance for confusion here. */ + if (size > INT_MAX / 8) + return -EOVERFLOW; + + info = malloc(offsetof(struct rand_pool_info, buf) + size); + if (!info) + return -ENOMEM; + + info->entropy_count = size * 8; + info->buf_size = size; + memcpy(info->buf, seed, size); + + if (ioctl(fd, RNDADDENTROPY, info) < 0) + return -errno; + } else { + r = loop_write(fd, seed, size); + if (r < 0) + return r; + } + + return 1; +} + +uint64_t random_u64_range(uint64_t m) { + uint64_t x, remainder; + + /* Generates a random number in the range 0…m-1, unbiased. (Java's algorithm) */ + + if (m == 0) /* Let's take m == 0 as special case to return an integer from the full range */ + return random_u64(); + if (m == 1) + return 0; + + remainder = UINT64_MAX % m; + + do { + x = random_u64(); + } while (x >= UINT64_MAX - remainder); + + return x % m; +} diff --git a/src/basic/random-util.h b/src/basic/random-util.h new file mode 100644 index 0000000..b1a4d10 --- /dev/null +++ b/src/basic/random-util.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +void random_bytes(void *p, size_t n); /* Returns random bytes suitable for most uses, but may be insecure sometimes. */ +int crypto_random_bytes(void *p, size_t n); /* Returns secure random bytes after waiting for the RNG to initialize. 
*/ + +static inline uint64_t random_u64(void) { + uint64_t u; + random_bytes(&u, sizeof(u)); + return u; +} + +static inline uint32_t random_u32(void) { + uint32_t u; + random_bytes(&u, sizeof(u)); + return u; +} + +/* Some limits on the pool sizes when we deal with the kernel random pool */ +#define RANDOM_POOL_SIZE_MIN 32U +#define RANDOM_POOL_SIZE_MAX (10U*1024U*1024U) +#define RANDOM_EFI_SEED_SIZE 32U + +size_t random_pool_size(void); + +int random_write_entropy(int fd, const void *seed, size_t size, bool credit); + +uint64_t random_u64_range(uint64_t max); diff --git a/src/basic/ratelimit.c b/src/basic/ratelimit.c new file mode 100644 index 0000000..41ca070 --- /dev/null +++ b/src/basic/ratelimit.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "macro.h" +#include "ratelimit.h" + +/* Modelled after Linux' lib/ratelimit.c by Dave Young + * , which is licensed GPLv2. */ + +bool ratelimit_below(RateLimit *r) { + usec_t ts; + + assert(r); + + if (!ratelimit_configured(r)) + return true; + + ts = now(CLOCK_MONOTONIC); + + if (r->begin <= 0 || + usec_sub_unsigned(ts, r->begin) > r->interval) { + r->begin = ts; /* Start a new time window */ + r->num = 1; /* Reset counter */ + return true; + } + + if (_unlikely_(r->num == UINT_MAX)) + return false; + + r->num++; + return r->num <= r->burst; +} + +unsigned ratelimit_num_dropped(RateLimit *r) { + assert(r); + + if (r->num == UINT_MAX) /* overflow, return as special case */ + return UINT_MAX; + + return LESS_BY(r->num, r->burst); +} + +usec_t ratelimit_end(const RateLimit *rl) { + assert(rl); + + if (rl->begin == 0) + return 0; + + return usec_add(rl->begin, rl->interval); +} + +usec_t ratelimit_left(const RateLimit *rl) { + assert(rl); + + if (rl->begin == 0) + return 0; + + return usec_sub_unsigned(ratelimit_end(rl), now(CLOCK_MONOTONIC)); +} diff --git a/src/basic/ratelimit.h b/src/basic/ratelimit.h new file mode 100644 index 0000000..492ea3b --- /dev/null +++ 
b/src/basic/ratelimit.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "time-util.h" + +typedef struct RateLimit { + usec_t interval; /* Keep those two fields first so they can be initialized easily: */ + unsigned burst; /* RateLimit rl = { INTERVAL, BURST }; */ + unsigned num; + usec_t begin; +} RateLimit; + +#define RATELIMIT_OFF (const RateLimit) { .interval = USEC_INFINITY, .burst = UINT_MAX } + +static inline void ratelimit_reset(RateLimit *rl) { + rl->num = rl->begin = 0; +} + +static inline bool ratelimit_configured(RateLimit *rl) { + return rl->interval > 0 && rl->burst > 0; +} + +bool ratelimit_below(RateLimit *r); + +unsigned ratelimit_num_dropped(RateLimit *r); + +usec_t ratelimit_end(const RateLimit *rl); +usec_t ratelimit_left(const RateLimit *rl); diff --git a/src/basic/raw-clone.h b/src/basic/raw-clone.h new file mode 100644 index 0000000..6de67ab --- /dev/null +++ b/src/basic/raw-clone.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2016 Michael Karcher +***/ + +#include +#include +#include + +#include "log.h" +#include "macro.h" +#include "process-util.h" + +/** + * raw_clone() - uses clone to create a new process with clone flags + * @flags: Flags to pass to the clone system call + * + * Uses the clone system call to create a new process with the cloning flags and termination signal passed in the flags + * parameter. Opposed to glibc's clone function, using this function does not set up a separate stack for the child, but + * relies on copy-on-write semantics on the one stack at a common virtual address, just as fork does. + * + * To obtain copy-on-write semantics, flags must not contain CLONE_VM, and thus CLONE_THREAD and CLONE_SIGHAND (which + * require CLONE_VM) are not usable. 
+ * + * Additionally, as this function does not pass the ptid, newtls and ctid parameters to the kernel, flags must not + * contain CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS. + * + * WARNING: 💣 this call (just like glibc's own clone() wrapper) will not synchronize on glibc's malloc + * locks, which means they will be in an undefined state in the child if the parent is + * threaded. This means: the parent must either never use threads, or the child cannot use memory + * allocation itself. This is a major pitfall, hence be careful! 💣 + * + * Returns: 0 in the child process and the child process id in the parent. + */ +static inline pid_t raw_clone(unsigned long flags) { + pid_t ret; + + assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID| + CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0); +#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) + /* On s390/s390x and cris the order of the first and second arguments + * of the raw clone() system call is reversed. */ + ret = (pid_t) syscall(__NR_clone, NULL, flags); +#elif defined(__sparc__) + { + /** + * sparc always returns the other process id in %o0, and + * a boolean flag whether this is the child or the parent in + * %o1. Inline assembly is needed to get the flag returned + * in %o1. + */ + int in_child, child_pid, error; + + asm volatile("mov %3, %%g1\n\t" + "mov %4, %%o0\n\t" + "mov 0 , %%o1\n\t" +#if defined(__arch64__) + "t 0x6d\n\t" +#else + "t 0x10\n\t" +#endif + "addx %%g0, 0, %2\n\t" + "mov %%o1, %0\n\t" + "mov %%o0, %1" : + "=r"(in_child), "=r"(child_pid), "=r"(error) : + "i"(__NR_clone), "r"(flags) : + "%o1", "%o0", "%g1", "cc" ); + + if (error) { + errno = child_pid; + ret = -1; + } else + ret = in_child ? 
0 : child_pid; + } +#else + ret = (pid_t) syscall(__NR_clone, flags, NULL); +#endif + + if (ret == 0) + reset_cached_pid(); + + return ret; +} diff --git a/src/basic/raw-reboot.h b/src/basic/raw-reboot.h new file mode 100644 index 0000000..e6bff30 --- /dev/null +++ b/src/basic/raw-reboot.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +/* glibc defines the reboot() API call, which is a wrapper around the system call of the same name, but without the + * extra "arg" parameter. Since we need that parameter for some calls, let's add a "raw" wrapper that is defined the + * same way, except it takes the additional argument. */ + +static inline int raw_reboot(int cmd, const void *arg) { + return (int) syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, arg); +} diff --git a/src/basic/recurse-dir.c b/src/basic/recurse-dir.c new file mode 100644 index 0000000..5e98b7a --- /dev/null +++ b/src/basic/recurse-dir.c @@ -0,0 +1,503 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "missing_syscall.h" +#include "mountpoint-util.h" +#include "recurse-dir.h" +#include "sort-util.h" + +#define DEFAULT_RECURSION_MAX 100 + +static int sort_func(struct dirent * const *a, struct dirent * const *b) { + return strcmp((*a)->d_name, (*b)->d_name); +} + +static bool ignore_dirent(const struct dirent *de, RecurseDirFlags flags) { + assert(de); + + /* Depending on flag either ignore everything starting with ".", or just "." itself and ".." */ + + return FLAGS_SET(flags, RECURSE_DIR_IGNORE_DOT) ? + de->d_name[0] == '.' 
: + dot_or_dot_dot(de->d_name); +} + +int readdir_all(int dir_fd, + RecurseDirFlags flags, + DirectoryEntries **ret) { + + _cleanup_free_ DirectoryEntries *de = NULL; + struct dirent *entry; + DirectoryEntries *nde; + size_t add, sz, j; + int r; + + assert(dir_fd >= 0); + + /* Returns an array with pointers to "struct dirent" directory entries, optionally sorted. Free the + * array with readdir_all_freep(). + * + * Start with space for up to 8 directory entries. We expect at least 2 ("." + ".."), hence hopefully + * 8 will cover most cases comprehensively. (Note that most likely a lot more entries will actually + * fit in the buffer, given we calculate maximum file name length here.) */ + de = malloc(offsetof(DirectoryEntries, buffer) + DIRENT_SIZE_MAX * 8); + if (!de) + return -ENOMEM; + + de->buffer_size = 0; + for (;;) { + size_t bs; + ssize_t n; + + bs = MIN(MALLOC_SIZEOF_SAFE(de) - offsetof(DirectoryEntries, buffer), (size_t) SSIZE_MAX); + assert(bs > de->buffer_size); + + n = getdents64(dir_fd, (uint8_t*) de->buffer + de->buffer_size, bs - de->buffer_size); + if (n < 0) + return -errno; + if (n == 0) + break; + + msan_unpoison((uint8_t*) de->buffer + de->buffer_size, n); + + de->buffer_size += n; + + if (de->buffer_size < bs - DIRENT_SIZE_MAX) /* Still room for one more entry, then try to + * fill it up without growing the structure. */ + continue; + + if (bs >= SSIZE_MAX - offsetof(DirectoryEntries, buffer)) + return -EFBIG; + bs = bs >= (SSIZE_MAX - offsetof(DirectoryEntries, buffer))/2 ? SSIZE_MAX - offsetof(DirectoryEntries, buffer) : bs * 2; + + nde = realloc(de, bs); + if (!nde) + return -ENOMEM; + + de = nde; + } + + de->n_entries = 0; + FOREACH_DIRENT_IN_BUFFER(entry, de->buffer, de->buffer_size) { + if (ignore_dirent(entry, flags)) + continue; + + if (FLAGS_SET(flags, RECURSE_DIR_ENSURE_TYPE)) { + r = dirent_ensure_type(dir_fd, entry); + if (r == -ENOENT) + /* dentry gone by now? 
no problem, let's just suppress it */ + continue; + if (r < 0) + return r; + } + + de->n_entries++; + } + + sz = ALIGN(offsetof(DirectoryEntries, buffer) + de->buffer_size); + add = sizeof(struct dirent*) * de->n_entries; + if (add > SIZE_MAX - add) + return -ENOMEM; + + nde = realloc(de, sz + add); + if (!nde) + return -ENOMEM; + + de = nde; + de->entries = (struct dirent**) ((uint8_t*) de + ALIGN(offsetof(DirectoryEntries, buffer) + de->buffer_size)); + + j = 0; + FOREACH_DIRENT_IN_BUFFER(entry, de->buffer, de->buffer_size) { + if (ignore_dirent(entry, flags)) + continue; + + /* If d_type == DT_UNKNOWN that means we failed to ensure the type in the earlier loop and + * didn't include the dentry in de->n_entries and as such should skip it here as well. */ + if (FLAGS_SET(flags, RECURSE_DIR_ENSURE_TYPE) && entry->d_type == DT_UNKNOWN) + continue; + + de->entries[j++] = entry; + } + assert(j == de->n_entries); + + if (FLAGS_SET(flags, RECURSE_DIR_SORT)) + typesafe_qsort(de->entries, de->n_entries, sort_func); + + if (ret) + *ret = TAKE_PTR(de); + + return 0; +} + +int recurse_dir( + int dir_fd, + const char *path, + unsigned statx_mask, + unsigned n_depth_max, + RecurseDirFlags flags, + recurse_dir_func_t func, + void *userdata) { + + _cleanup_free_ DirectoryEntries *de = NULL; + STRUCT_STATX_DEFINE(root_sx); + int r; + + assert(dir_fd >= 0); + assert(func); + + /* This is a lot like ftw()/nftw(), but a lot more modern, i.e. built around openat()/statx()/O_PATH, + * and under the assumption that fds are not as 'expensive' as they used to be. 
*/ + + if (n_depth_max == 0) + return -EOVERFLOW; + if (n_depth_max == UINT_MAX) /* special marker for "default" */ + n_depth_max = DEFAULT_RECURSION_MAX; + + if (FLAGS_SET(flags, RECURSE_DIR_TOPLEVEL)) { + if (statx_mask != 0) { + r = statx_fallback(dir_fd, "", AT_EMPTY_PATH, statx_mask, &root_sx); + if (r < 0) + return r; + } + + r = func(RECURSE_DIR_ENTER, + path, + -1, /* we have no parent fd */ + dir_fd, + NULL, /* we have no dirent */ + statx_mask != 0 ? &root_sx : NULL, + userdata); + if (IN_SET(r, RECURSE_DIR_LEAVE_DIRECTORY, RECURSE_DIR_SKIP_ENTRY)) + return 0; + if (r != RECURSE_DIR_CONTINUE) + return r; + } + + /* Mask out RECURSE_DIR_ENSURE_TYPE so we can do it ourselves and avoid an extra statx() call. */ + r = readdir_all(dir_fd, flags & ~RECURSE_DIR_ENSURE_TYPE, &de); + if (r < 0) + return r; + + for (size_t i = 0; i < de->n_entries; i++) { + _cleanup_close_ int inode_fd = -EBADF, subdir_fd = -EBADF; + _cleanup_free_ char *joined = NULL; + STRUCT_STATX_DEFINE(sx); + bool sx_valid = false; + const char *p; + + /* For each directory entry we'll do one of the following: + * + * 1) If the entry refers to a directory, we'll open it as O_DIRECTORY 'subdir_fd' and then statx() the opened directory via that new fd (if requested) + * 2) Otherwise, if RECURSE_DIR_INODE_FD is set we'll open it as O_PATH 'inode_fd' and then statx() the opened inode via that new fd (if requested) + * 3) Otherwise, we'll statx() the directory entry via the directory fd we are currently looking at (if requested) + */ + + if (path) { + joined = path_join(path, de->entries[i]->d_name); + if (!joined) + return -ENOMEM; + + p = joined; + } else + p = de->entries[i]->d_name; + + if (IN_SET(de->entries[i]->d_type, DT_UNKNOWN, DT_DIR)) { + subdir_fd = openat(dir_fd, de->entries[i]->d_name, O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC); + if (subdir_fd < 0) { + if (errno == ENOENT) /* Vanished by now, go for next file immediately */ + continue; + + /* If it is a subdir but we failed to open it, then 
fail */ + if (!IN_SET(errno, ENOTDIR, ELOOP)) { + log_debug_errno(errno, "Failed to open directory '%s': %m", p); + + assert(errno <= RECURSE_DIR_SKIP_OPEN_DIR_ERROR_MAX - RECURSE_DIR_SKIP_OPEN_DIR_ERROR_BASE); + + r = func(RECURSE_DIR_SKIP_OPEN_DIR_ERROR_BASE + errno, + p, + dir_fd, + -1, + de->entries[i], + NULL, + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_CONTINUE, RECURSE_DIR_SKIP_ENTRY)) + return r; + + continue; + } + + /* If it's not a subdir, then let's handle it like a regular inode below */ + + } else { + /* If we managed to get a DIR* off the inode, it's definitely a directory. */ + de->entries[i]->d_type = DT_DIR; + + if (statx_mask != 0 || (flags & RECURSE_DIR_SAME_MOUNT)) { + r = statx_fallback(subdir_fd, "", AT_EMPTY_PATH, statx_mask, &sx); + if (r < 0) + return r; + + sx_valid = true; + } + } + } + + if (subdir_fd < 0) { + /* It's not a subdirectory. */ + + if (flags & RECURSE_DIR_INODE_FD) { + + inode_fd = openat(dir_fd, de->entries[i]->d_name, O_PATH|O_NOFOLLOW|O_CLOEXEC); + if (inode_fd < 0) { + if (errno == ENOENT) /* Vanished by now, go for next file immediately */ + continue; + + log_debug_errno(errno, "Failed to open directory entry '%s': %m", p); + + assert(errno <= RECURSE_DIR_SKIP_OPEN_INODE_ERROR_MAX - RECURSE_DIR_SKIP_OPEN_INODE_ERROR_BASE); + + r = func(RECURSE_DIR_SKIP_OPEN_INODE_ERROR_BASE + errno, + p, + dir_fd, + -1, + de->entries[i], + NULL, + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_CONTINUE, RECURSE_DIR_SKIP_ENTRY)) + return r; + + continue; + } + + /* If we open the inode, then verify it's actually a non-directory, like we + * assume. 
Let's guarantee that we never pass statx data of a directory where + * caller expects a non-directory */ + + r = statx_fallback(inode_fd, "", AT_EMPTY_PATH, statx_mask | STATX_TYPE, &sx); + if (r < 0) + return r; + + assert(sx.stx_mask & STATX_TYPE); + sx_valid = true; + + if (S_ISDIR(sx.stx_mode)) { + /* What? It's a directory now? Then someone must have quickly + * replaced it. Let's handle that gracefully: convert it to a + * directory fd — which should be riskless now that we pinned the + * inode. */ + + subdir_fd = fd_reopen(inode_fd, O_DIRECTORY|O_CLOEXEC); + if (subdir_fd < 0) + return subdir_fd; + + inode_fd = safe_close(inode_fd); + } + + } else if (statx_mask != 0 || (de->entries[i]->d_type == DT_UNKNOWN && (flags & RECURSE_DIR_ENSURE_TYPE))) { + + r = statx_fallback(dir_fd, de->entries[i]->d_name, AT_SYMLINK_NOFOLLOW, statx_mask | STATX_TYPE, &sx); + if (r == -ENOENT) /* Vanished by now? Go for next file immediately */ + continue; + if (r < 0) { + log_debug_errno(r, "Failed to stat directory entry '%s': %m", p); + + assert(errno <= RECURSE_DIR_SKIP_STAT_INODE_ERROR_MAX - RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE); + + r = func(RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE + -r, + p, + dir_fd, + -1, + de->entries[i], + NULL, + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_CONTINUE, RECURSE_DIR_SKIP_ENTRY)) + return r; + + continue; + } + + assert(sx.stx_mask & STATX_TYPE); + sx_valid = true; + + if (S_ISDIR(sx.stx_mode)) { + /* So it suddenly is a directory, but we couldn't open it as such + * earlier? That is weird, and probably means somebody is racing + * against us. We could of course retry and open it as a directory + * again, but the chance to win here is limited. Hence, let's + * propagate this as EISDIR error instead. That way we make this + * something that can be reasonably handled, even though we give the + * guarantee that RECURSE_DIR_ENTRY is strictly issued for + * non-directory dirents. 
*/ + + log_debug_errno(r, "Non-directory entry '%s' suddenly became a directory: %m", p); + + r = func(RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE + EISDIR, + p, + dir_fd, + -1, + de->entries[i], + NULL, + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_CONTINUE, RECURSE_DIR_SKIP_ENTRY)) + return r; + + continue; + } + } + } + + if (sx_valid) { + /* Copy over the data we acquired through statx() if we acquired any */ + if (sx.stx_mask & STATX_TYPE) { + assert((subdir_fd < 0) == !S_ISDIR(sx.stx_mode)); + de->entries[i]->d_type = IFTODT(sx.stx_mode); + } + + if (sx.stx_mask & STATX_INO) + de->entries[i]->d_ino = sx.stx_ino; + } + + if (subdir_fd >= 0) { + if (FLAGS_SET(flags, RECURSE_DIR_SAME_MOUNT)) { + bool is_mount; + + if (sx_valid && FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) + is_mount = FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT); + else { + r = fd_is_mount_point(dir_fd, de->entries[i]->d_name, 0); + if (r < 0) + log_debug_errno(r, "Failed to determine whether %s is a submount, assuming not: %m", p); + + is_mount = r > 0; + } + + if (is_mount) { + r = func(RECURSE_DIR_SKIP_MOUNT, + p, + dir_fd, + subdir_fd, + de->entries[i], + statx_mask != 0 ? &sx : NULL, /* only pass sx if user asked for it */ + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_CONTINUE, RECURSE_DIR_SKIP_ENTRY)) + return r; + + continue; + } + } + + if (n_depth_max <= 1) { + /* When we reached max depth, generate a special event */ + + r = func(RECURSE_DIR_SKIP_DEPTH, + p, + dir_fd, + subdir_fd, + de->entries[i], + statx_mask != 0 ? &sx : NULL, /* only pass sx if user asked for it */ + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_CONTINUE, RECURSE_DIR_SKIP_ENTRY)) + return r; + + continue; + } + + r = func(RECURSE_DIR_ENTER, + p, + dir_fd, + subdir_fd, + de->entries[i], + statx_mask != 0 ? 
&sx : NULL, /* only pass sx if user asked for it */ + userdata); + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (r == RECURSE_DIR_SKIP_ENTRY) + continue; + if (r != RECURSE_DIR_CONTINUE) + return r; + + r = recurse_dir(subdir_fd, + p, + statx_mask, + n_depth_max - 1, + flags &~ RECURSE_DIR_TOPLEVEL, /* we already called the callback for this entry */ + func, + userdata); + if (r != 0) + return r; + + r = func(RECURSE_DIR_LEAVE, + p, + dir_fd, + subdir_fd, + de->entries[i], + statx_mask != 0 ? &sx : NULL, /* only pass sx if user asked for it */ + userdata); + } else + /* Non-directory inode */ + r = func(RECURSE_DIR_ENTRY, + p, + dir_fd, + inode_fd, + de->entries[i], + statx_mask != 0 ? &sx : NULL, /* only pass sx if user asked for it */ + userdata); + + + if (r == RECURSE_DIR_LEAVE_DIRECTORY) + break; + if (!IN_SET(r, RECURSE_DIR_SKIP_ENTRY, RECURSE_DIR_CONTINUE)) + return r; + } + + if (FLAGS_SET(flags, RECURSE_DIR_TOPLEVEL)) { + + r = func(RECURSE_DIR_LEAVE, + path, + -1, + dir_fd, + NULL, + statx_mask != 0 ? 
&root_sx : NULL, + userdata); + if (!IN_SET(r, RECURSE_DIR_LEAVE_DIRECTORY, RECURSE_DIR_SKIP_ENTRY, RECURSE_DIR_CONTINUE)) + return r; + } + + return 0; +} + +int recurse_dir_at( + int atfd, + const char *path, + unsigned statx_mask, + unsigned n_depth_max, + RecurseDirFlags flags, + recurse_dir_func_t func, + void *userdata) { + + _cleanup_close_ int fd = -EBADF; + + assert(atfd >= 0 || atfd == AT_FDCWD); + assert(func); + + fd = openat(atfd, path ?: ".", O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return -errno; + + return recurse_dir(fd, path, statx_mask, n_depth_max, flags, func, userdata); +} diff --git a/src/basic/recurse-dir.h b/src/basic/recurse-dir.h new file mode 100644 index 0000000..9f6a7ad --- /dev/null +++ b/src/basic/recurse-dir.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "errno-list.h" +#include "stat-util.h" +#include "macro.h" + +typedef enum RecurseDirEvent { + RECURSE_DIR_ENTER, /* only for dir inodes */ + RECURSE_DIR_LEAVE, /* only for dir inodes */ + RECURSE_DIR_ENTRY, /* only for non-dir inodes */ + RECURSE_DIR_SKIP_MOUNT, /* only for dir inodes: when we don't descent into submounts */ + RECURSE_DIR_SKIP_DEPTH, /* only for dir inodes: when we reached the max depth */ + + /* If we hit an error opening/stating an entry, then we'll fire a + * 'RECURSE_DIR_SKIP_{OPEN_DIR|OPEN_INODE|STAT_INODE}_ERROR_BASE + errno' event. In this case 'de' + * will be valid, but the statx data NULL and the inode fd -1. 
*/ + RECURSE_DIR_SKIP_OPEN_DIR_ERROR_BASE, + RECURSE_DIR_SKIP_OPEN_DIR_ERROR_MAX = RECURSE_DIR_SKIP_OPEN_DIR_ERROR_BASE + ERRNO_MAX, + + RECURSE_DIR_SKIP_OPEN_INODE_ERROR_BASE, + RECURSE_DIR_SKIP_OPEN_INODE_ERROR_MAX = RECURSE_DIR_SKIP_OPEN_INODE_ERROR_BASE + ERRNO_MAX, + + RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE, + RECURSE_DIR_SKIP_STAT_INODE_ERROR_MAX = RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE + ERRNO_MAX, + + _RECURSE_DIR_EVENT_MAX, + _RECURSE_DIR_EVENT_INVALID = -EINVAL, +} RecurseDirEvent; + +#define RECURSE_DIR_CONTINUE 0 +#define RECURSE_DIR_LEAVE_DIRECTORY INT_MIN +#define RECURSE_DIR_SKIP_ENTRY (INT_MIN+1) + +/* Make sure that the negative errno range and these two special returns don't overlap */ +assert_cc(RECURSE_DIR_LEAVE_DIRECTORY < -ERRNO_MAX); +assert_cc(RECURSE_DIR_SKIP_ENTRY < -ERRNO_MAX); + +/* Prototype for the callback function that is called whenever we enter or leave a dir inode, or find another dir entry. Return values are: + * + * RECURSE_DIR_CONTINUE (i.e. 0) → continue with next entry + * RECURSE_DIR_LEAVE_DIRECTORY → leave current directory immediately, don't process further siblings + * RECURSE_DIR_SKIP_ENTRY → skip this entry otherwise (only makes sense on RECURSE_DIR_ENTER) + * others → terminate iteration entirely, return the specified value (idea is that + * < 0 indicates errors and > 0 indicates various forms of success) + */ +typedef int (*recurse_dir_func_t)( + RecurseDirEvent event, + const char *path, /* Full non-normalized path, i.e. 
the path specified during recurise_dir() with what we found appended */ + int dir_fd, /* fd of the current dir */ + int inode_fd, /* fd of the current entry in the current dir (O_DIRECTORY if directory, and O_PATH otherwise, but only if RECURSE_DIR_INODE_FD was set) */ + const struct dirent *de, /* directory entry (always valid) */ + const struct statx *sx, /* statx data (only if statx_mask was non-zero) */ + void *userdata); + +typedef enum RecurseDirFlags { + /* Interpreted by readdir_all() */ + RECURSE_DIR_SORT = 1 << 0, /* sort file directory entries before processing them */ + RECURSE_DIR_IGNORE_DOT = 1 << 1, /* ignore all dot files ("." and ".." are always ignored) */ + RECURSE_DIR_ENSURE_TYPE = 1 << 2, /* guarantees that 'd_type' field of 'de' is not DT_UNKNOWN */ + + /* Interpreted by recurse_dir() */ + RECURSE_DIR_SAME_MOUNT = 1 << 3, /* skips over subdirectories that are submounts */ + RECURSE_DIR_INODE_FD = 1 << 4, /* passes an opened inode fd (O_DIRECTORY fd in case of dirs, O_PATH otherwise) */ + RECURSE_DIR_TOPLEVEL = 1 << 5, /* call RECURSE_DIR_ENTER/RECURSE_DIR_LEAVE once for top-level dir, too, with dir_fd=-1 and NULL dirent */ +} RecurseDirFlags; + +typedef struct DirectoryEntries { + size_t n_entries; + struct dirent** entries; + size_t buffer_size; + struct dirent buffer[]; +} DirectoryEntries; + +int readdir_all(int dir_fd, RecurseDirFlags flags, DirectoryEntries **ret); + +int recurse_dir(int dir_fd, const char *path, unsigned statx_mask, unsigned n_depth_max, RecurseDirFlags flags, recurse_dir_func_t func, void *userdata); +int recurse_dir_at(int atfd, const char *path, unsigned statx_mask, unsigned n_depth_max, RecurseDirFlags flags, recurse_dir_func_t func, void *userdata); diff --git a/src/basic/replace-var.c b/src/basic/replace-var.c new file mode 100644 index 0000000..01c26ce --- /dev/null +++ b/src/basic/replace-var.c @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include 
"alloc-util.h" +#include "macro.h" +#include "replace-var.h" +#include "string-util.h" + +/* + * Generic infrastructure for replacing @FOO@ style variables in + * strings. Will call a callback for each replacement. + */ + +static int get_variable(const char *b, char **r) { + size_t k; + char *t; + + assert(b); + assert(r); + + if (*b != '@') + return 0; + + k = strspn(b + 1, UPPERCASE_LETTERS "_"); + if (k <= 0 || b[k+1] != '@') + return 0; + + t = strndup(b + 1, k); + if (!t) + return -ENOMEM; + + *r = t; + return 1; +} + +char *replace_var(const char *text, char *(*lookup)(const char *variable, void *userdata), void *userdata) { + char *r, *t; + const char *f; + size_t l; + + assert(text); + assert(lookup); + + l = strlen(text); + r = new(char, l+1); + if (!r) + return NULL; + + f = text; + t = r; + while (*f) { + _cleanup_free_ char *v = NULL, *n = NULL; + char *a; + int k; + size_t skip, d, nl; + + k = get_variable(f, &v); + if (k < 0) + goto oom; + if (k == 0) { + *(t++) = *(f++); + continue; + } + + n = lookup(v, userdata); + if (!n) + goto oom; + + skip = strlen(v) + 2; + + d = t - r; + nl = l - skip + strlen(n); + a = realloc(r, nl + 1); + if (!a) + goto oom; + + l = nl; + r = a; + t = r + d; + + t = stpcpy(t, n); + f += skip; + } + + *t = 0; + return r; + +oom: + return mfree(r); +} diff --git a/src/basic/replace-var.h b/src/basic/replace-var.h new file mode 100644 index 0000000..644d9df --- /dev/null +++ b/src/basic/replace-var.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +char *replace_var(const char *text, char *(*lookup)(const char *variable, void *userdata), void *userdata); diff --git a/src/basic/rlimit-util.c b/src/basic/rlimit-util.c new file mode 100644 index 0000000..c1f0b2b --- /dev/null +++ b/src/basic/rlimit-util.c @@ -0,0 +1,428 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include 
"format-util.h" +#include "macro.h" +#include "missing_resource.h" +#include "rlimit-util.h" +#include "string-table.h" +#include "time-util.h" + +int setrlimit_closest(int resource, const struct rlimit *rlim) { + struct rlimit highest, fixed; + + assert(rlim); + + if (setrlimit(resource, rlim) >= 0) + return 0; + + if (errno != EPERM) + return -errno; + + /* So we failed to set the desired setrlimit, then let's try + * to get as close as we can */ + if (getrlimit(resource, &highest) < 0) + return -errno; + + /* If the hard limit is unbounded anyway, then the EPERM had other reasons, let's propagate the original EPERM + * then */ + if (highest.rlim_max == RLIM_INFINITY) + return -EPERM; + + fixed = (struct rlimit) { + .rlim_cur = MIN(rlim->rlim_cur, highest.rlim_max), + .rlim_max = MIN(rlim->rlim_max, highest.rlim_max), + }; + + /* Shortcut things if we wouldn't change anything. */ + if (fixed.rlim_cur == highest.rlim_cur && + fixed.rlim_max == highest.rlim_max) + return 0; + + log_debug("Failed at setting rlimit " RLIM_FMT " for resource RLIMIT_%s. Will attempt setting value " RLIM_FMT " instead.", rlim->rlim_max, rlimit_to_string(resource), fixed.rlim_max); + + return RET_NERRNO(setrlimit(resource, &fixed)); +} + +int setrlimit_closest_all(const struct rlimit *const *rlim, int *which_failed) { + int r; + + assert(rlim); + + /* On failure returns the limit's index that failed in *which_failed, but only if non-NULL */ + + for (int i = 0; i < _RLIMIT_MAX; i++) { + if (!rlim[i]) + continue; + + r = setrlimit_closest(i, rlim[i]); + if (r < 0) { + if (which_failed) + *which_failed = i; + + return r; + } + } + + if (which_failed) + *which_failed = -1; + + return 0; +} + +static int rlimit_parse_u64(const char *val, rlim_t *ret) { + uint64_t u; + int r; + + assert(val); + assert(ret); + + if (streq(val, "infinity")) { + *ret = RLIM_INFINITY; + return 0; + } + + /* setrlimit(2) suggests rlim_t is always 64-bit on Linux. 
*/ + assert_cc(sizeof(rlim_t) == sizeof(uint64_t)); + + r = safe_atou64(val, &u); + if (r < 0) + return r; + if (u >= (uint64_t) RLIM_INFINITY) + return -ERANGE; + + *ret = (rlim_t) u; + return 0; +} + +static int rlimit_parse_size(const char *val, rlim_t *ret) { + uint64_t u; + int r; + + assert(val); + assert(ret); + + if (streq(val, "infinity")) { + *ret = RLIM_INFINITY; + return 0; + } + + r = parse_size(val, 1024, &u); + if (r < 0) + return r; + if (u >= (uint64_t) RLIM_INFINITY) + return -ERANGE; + + *ret = (rlim_t) u; + return 0; +} + +static int rlimit_parse_sec(const char *val, rlim_t *ret) { + uint64_t u; + usec_t t; + int r; + + assert(val); + assert(ret); + + if (streq(val, "infinity")) { + *ret = RLIM_INFINITY; + return 0; + } + + r = parse_sec(val, &t); + if (r < 0) + return r; + if (t == USEC_INFINITY) { + *ret = RLIM_INFINITY; + return 0; + } + + u = (uint64_t) DIV_ROUND_UP(t, USEC_PER_SEC); + if (u >= (uint64_t) RLIM_INFINITY) + return -ERANGE; + + *ret = (rlim_t) u; + return 0; +} + +static int rlimit_parse_usec(const char *val, rlim_t *ret) { + usec_t t; + int r; + + assert(val); + assert(ret); + + if (streq(val, "infinity")) { + *ret = RLIM_INFINITY; + return 0; + } + + r = parse_time(val, &t, 1); + if (r < 0) + return r; + if (t == USEC_INFINITY) { + *ret = RLIM_INFINITY; + return 0; + } + + *ret = (rlim_t) t; + return 0; +} + +static int rlimit_parse_nice(const char *val, rlim_t *ret) { + uint64_t rl; + int r; + + /* So, Linux is weird. The range for RLIMIT_NICE is 40..1, mapping to the nice levels -20..19. However, the + * RLIMIT_NICE limit defaults to 0 by the kernel, i.e. a value that maps to nice level 20, which of course is + * bogus and does not exist. In order to permit parsing the RLIMIT_NICE of 0 here we hence implement a slight + * asymmetry: when parsing as positive nice level we permit 0..19. When parsing as negative nice level, we + * permit -20..0. 
But when parsing as raw resource limit value then we also allow the special value 0. + * + * Yeah, Linux is quality engineering sometimes... */ + + if (val[0] == '+') { + + /* Prefixed with "+": Parse as positive user-friendly nice value */ + r = safe_atou64(val + 1, &rl); + if (r < 0) + return r; + + if (rl >= PRIO_MAX) + return -ERANGE; + + rl = 20 - rl; + + } else if (val[0] == '-') { + + /* Prefixed with "-": Parse as negative user-friendly nice value */ + r = safe_atou64(val + 1, &rl); + if (r < 0) + return r; + + if (rl > (uint64_t) (-PRIO_MIN)) + return -ERANGE; + + rl = 20 + rl; + } else { + + /* Not prefixed: parse as raw resource limit value */ + r = safe_atou64(val, &rl); + if (r < 0) + return r; + + if (rl > (uint64_t) (20 - PRIO_MIN)) + return -ERANGE; + } + + *ret = (rlim_t) rl; + return 0; +} + +static int (*const rlimit_parse_table[_RLIMIT_MAX])(const char *val, rlim_t *ret) = { + [RLIMIT_CPU] = rlimit_parse_sec, + [RLIMIT_FSIZE] = rlimit_parse_size, + [RLIMIT_DATA] = rlimit_parse_size, + [RLIMIT_STACK] = rlimit_parse_size, + [RLIMIT_CORE] = rlimit_parse_size, + [RLIMIT_RSS] = rlimit_parse_size, + [RLIMIT_NOFILE] = rlimit_parse_u64, + [RLIMIT_AS] = rlimit_parse_size, + [RLIMIT_NPROC] = rlimit_parse_u64, + [RLIMIT_MEMLOCK] = rlimit_parse_size, + [RLIMIT_LOCKS] = rlimit_parse_u64, + [RLIMIT_SIGPENDING] = rlimit_parse_u64, + [RLIMIT_MSGQUEUE] = rlimit_parse_size, + [RLIMIT_NICE] = rlimit_parse_nice, + [RLIMIT_RTPRIO] = rlimit_parse_u64, + [RLIMIT_RTTIME] = rlimit_parse_usec, +}; + +int rlimit_parse_one(int resource, const char *val, rlim_t *ret) { + assert(val); + assert(ret); + + if (resource < 0) + return -EINVAL; + if (resource >= _RLIMIT_MAX) + return -EINVAL; + + return rlimit_parse_table[resource](val, ret); +} + +int rlimit_parse(int resource, const char *val, struct rlimit *ret) { + _cleanup_free_ char *hard = NULL, *soft = NULL; + rlim_t hl, sl; + int r; + + assert(val); + assert(ret); + + r = extract_first_word(&val, &soft, ":", 
EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + r = rlimit_parse_one(resource, soft, &sl); + if (r < 0) + return r; + + r = extract_first_word(&val, &hard, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (!isempty(val)) + return -EINVAL; + if (r == 0) + hl = sl; + else { + r = rlimit_parse_one(resource, hard, &hl); + if (r < 0) + return r; + if (sl > hl) + return -EILSEQ; + } + + *ret = (struct rlimit) { + .rlim_cur = sl, + .rlim_max = hl, + }; + + return 0; +} + +int rlimit_format(const struct rlimit *rl, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + assert(rl); + assert(ret); + + if (rl->rlim_cur >= RLIM_INFINITY && rl->rlim_max >= RLIM_INFINITY) + r = free_and_strdup(&s, "infinity"); + else if (rl->rlim_cur >= RLIM_INFINITY) + r = asprintf(&s, "infinity:" RLIM_FMT, rl->rlim_max); + else if (rl->rlim_max >= RLIM_INFINITY) + r = asprintf(&s, RLIM_FMT ":infinity", rl->rlim_cur); + else if (rl->rlim_cur == rl->rlim_max) + r = asprintf(&s, RLIM_FMT, rl->rlim_cur); + else + r = asprintf(&s, RLIM_FMT ":" RLIM_FMT, rl->rlim_cur, rl->rlim_max); + if (r < 0) + return -ENOMEM; + + *ret = TAKE_PTR(s); + return 0; +} + +static const char* const rlimit_table[_RLIMIT_MAX] = { + [RLIMIT_AS] = "AS", + [RLIMIT_CORE] = "CORE", + [RLIMIT_CPU] = "CPU", + [RLIMIT_DATA] = "DATA", + [RLIMIT_FSIZE] = "FSIZE", + [RLIMIT_LOCKS] = "LOCKS", + [RLIMIT_MEMLOCK] = "MEMLOCK", + [RLIMIT_MSGQUEUE] = "MSGQUEUE", + [RLIMIT_NICE] = "NICE", + [RLIMIT_NOFILE] = "NOFILE", + [RLIMIT_NPROC] = "NPROC", + [RLIMIT_RSS] = "RSS", + [RLIMIT_RTPRIO] = "RTPRIO", + [RLIMIT_RTTIME] = "RTTIME", + [RLIMIT_SIGPENDING] = "SIGPENDING", + [RLIMIT_STACK] = "STACK", +}; + +DEFINE_STRING_TABLE_LOOKUP(rlimit, int); + +int rlimit_from_string_harder(const char *s) { + const char *suffix; + + /* The official prefix */ + suffix = startswith(s, "RLIMIT_"); + if (suffix) + return rlimit_from_string(suffix); + + /* Our own unit file setting prefix 
*/ + suffix = startswith(s, "Limit"); + if (suffix) + return rlimit_from_string(suffix); + + return rlimit_from_string(s); +} + +void rlimit_free_all(struct rlimit **rl) { + free_many((void**) rl, _RLIMIT_MAX); +} + +int rlimit_copy_all(struct rlimit* target[static _RLIMIT_MAX], struct rlimit* const source[static _RLIMIT_MAX]) { + struct rlimit* copy[_RLIMIT_MAX] = {}; + + assert(target); + assert(source); + + for (int i = 0; i < _RLIMIT_MAX; i++) { + if (!source[i]) + continue; + + copy[i] = newdup(struct rlimit, source[i], 1); + if (!copy[i]) { + rlimit_free_all(copy); + return -ENOMEM; + } + } + + memcpy(target, copy, sizeof(struct rlimit*) * _RLIMIT_MAX); + return 0; +} + +int rlimit_nofile_bump(int limit) { + int r; + + /* Bumps the (soft) RLIMIT_NOFILE resource limit as close as possible to the specified limit. If a negative + * limit is specified, bumps it to the maximum the kernel and the hard resource limit allows. This call should + * be used by all our programs that might need a lot of fds, and that know how to deal with high fd numbers + * (i.e. do not use select() — which chokes on fds >= 1024) */ + + if (limit < 0) + limit = read_nr_open(); + + if (limit < 3) + limit = 3; + + r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(limit)); + if (r < 0) + return log_debug_errno(r, "Failed to set RLIMIT_NOFILE: %m"); + + return 0; +} + +int rlimit_nofile_safe(void) { + struct rlimit rl; + + /* Resets RLIMIT_NOFILE's soft limit FD_SETSIZE (i.e. 1024), for compatibility with software still using + * select() */ + + if (getrlimit(RLIMIT_NOFILE, &rl) < 0) + return log_debug_errno(errno, "Failed to query RLIMIT_NOFILE: %m"); + + if (rl.rlim_cur <= FD_SETSIZE) + return 0; + + /* So we might have inherited a hard limit that's larger than the kernel's maximum limit as stored in + * /proc/sys/fs/nr_open. If we pass this hard limit unmodified to setrlimit(), we'll get EPERM. 
To + * make sure that doesn't happen, let's limit our hard limit to the value from nr_open. */ + rl.rlim_max = MIN(rl.rlim_max, (rlim_t) read_nr_open()); + rl.rlim_cur = MIN((rlim_t) FD_SETSIZE, rl.rlim_max); + if (setrlimit(RLIMIT_NOFILE, &rl) < 0) + return log_debug_errno(errno, "Failed to lower RLIMIT_NOFILE's soft limit to " RLIM_FMT ": %m", rl.rlim_cur); + + return 1; +} diff --git a/src/basic/rlimit-util.h b/src/basic/rlimit-util.h new file mode 100644 index 0000000..202c3fd --- /dev/null +++ b/src/basic/rlimit-util.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" +#include "missing_resource.h" + +const char *rlimit_to_string(int i) _const_; +int rlimit_from_string(const char *s) _pure_; +int rlimit_from_string_harder(const char *s) _pure_; + +int setrlimit_closest(int resource, const struct rlimit *rlim); +int setrlimit_closest_all(const struct rlimit * const *rlim, int *which_failed); + +int rlimit_parse_one(int resource, const char *val, rlim_t *ret); +int rlimit_parse(int resource, const char *val, struct rlimit *ret); + +int rlimit_format(const struct rlimit *rl, char **ret); + +int rlimit_copy_all(struct rlimit* target[static _RLIMIT_MAX], struct rlimit* const source[static _RLIMIT_MAX]); +void rlimit_free_all(struct rlimit **rl); + +#define RLIMIT_MAKE_CONST(lim) ((struct rlimit) { lim, lim }) + +int rlimit_nofile_bump(int limit); +int rlimit_nofile_safe(void); diff --git a/src/basic/runtime-scope.c b/src/basic/runtime-scope.c new file mode 100644 index 0000000..3d653d6 --- /dev/null +++ b/src/basic/runtime-scope.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "runtime-scope.h" +#include "string-table.h" + +static const char* const runtime_scope_table[_RUNTIME_SCOPE_MAX] = { + [RUNTIME_SCOPE_SYSTEM] = "system", + [RUNTIME_SCOPE_USER] = "user", + [RUNTIME_SCOPE_GLOBAL] = "global", +}; + +DEFINE_STRING_TABLE_LOOKUP(runtime_scope, RuntimeScope); + +static 
const char* const runtime_scope_cmdline_option_table[_RUNTIME_SCOPE_MAX] = { + [RUNTIME_SCOPE_SYSTEM] = "--system", + [RUNTIME_SCOPE_USER] = "--user", + [RUNTIME_SCOPE_GLOBAL] = "--global", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(runtime_scope_cmdline_option, RuntimeScope); diff --git a/src/basic/runtime-scope.h b/src/basic/runtime-scope.h new file mode 100644 index 0000000..6553e4c --- /dev/null +++ b/src/basic/runtime-scope.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +typedef enum RuntimeScope { + RUNTIME_SCOPE_SYSTEM, /* for the system */ + RUNTIME_SCOPE_USER, /* for a user */ + RUNTIME_SCOPE_GLOBAL, /* for all users */ + _RUNTIME_SCOPE_MAX, + _RUNTIME_SCOPE_INVALID = -EINVAL, +} RuntimeScope; + +const char *runtime_scope_to_string(RuntimeScope scope) _const_; +RuntimeScope runtime_scope_from_string(const char *s) _const_; + +const char *runtime_scope_cmdline_option_to_string(RuntimeScope scope) _const_; diff --git a/src/basic/set.h b/src/basic/set.h new file mode 100644 index 0000000..618e729 --- /dev/null +++ b/src/basic/set.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "extract-word.h" +#include "hashmap.h" +#include "macro.h" + +#define set_free_and_replace(a, b) \ + free_and_replace_full(a, b, set_free) + +Set* _set_new(const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS); +#define set_new(ops) _set_new(ops HASHMAP_DEBUG_SRC_ARGS) + +static inline Set* set_free(Set *s) { + return (Set*) _hashmap_free(HASHMAP_BASE(s), NULL, NULL); +} + +static inline Set* set_free_free(Set *s) { + return (Set*) _hashmap_free(HASHMAP_BASE(s), free, NULL); +} + +/* no set_free_free_free */ + +#define set_copy(s) ((Set*) _hashmap_copy(HASHMAP_BASE(s) HASHMAP_DEBUG_SRC_ARGS)) + +int _set_ensure_allocated(Set **s, const struct hash_ops *hash_ops HASHMAP_DEBUG_PARAMS); +#define set_ensure_allocated(h, ops) _set_ensure_allocated(h, ops 
HASHMAP_DEBUG_SRC_ARGS) + +int set_put(Set *s, const void *key); +/* no set_update */ +/* no set_replace */ +static inline void *set_get(const Set *s, const void *key) { + return _hashmap_get(HASHMAP_BASE((Set *) s), key); +} +/* no set_get2 */ + +static inline bool set_contains(const Set *s, const void *key) { + return _hashmap_contains(HASHMAP_BASE((Set *) s), key); +} + +static inline void *set_remove(Set *s, const void *key) { + return _hashmap_remove(HASHMAP_BASE(s), key); +} + +/* no set_remove2 */ +/* no set_remove_value */ +int set_remove_and_put(Set *s, const void *old_key, const void *new_key); +/* no set_remove_and_replace */ +int set_merge(Set *s, Set *other); + +static inline int set_reserve(Set *h, unsigned entries_add) { + return _hashmap_reserve(HASHMAP_BASE(h), entries_add); +} + +static inline int set_move(Set *s, Set *other) { + return _hashmap_move(HASHMAP_BASE(s), HASHMAP_BASE(other)); +} + +static inline int set_move_one(Set *s, Set *other, const void *key) { + return _hashmap_move_one(HASHMAP_BASE(s), HASHMAP_BASE(other), key); +} + +static inline unsigned set_size(const Set *s) { + return _hashmap_size(HASHMAP_BASE((Set *) s)); +} + +static inline bool set_isempty(const Set *s) { + return set_size(s) == 0; +} + +static inline unsigned set_buckets(const Set *s) { + return _hashmap_buckets(HASHMAP_BASE((Set *) s)); +} + +static inline bool set_iterate(const Set *s, Iterator *i, void **value) { + return _hashmap_iterate(HASHMAP_BASE((Set*) s), i, value, NULL); +} + +static inline void set_clear(Set *s) { + _hashmap_clear(HASHMAP_BASE(s), NULL, NULL); +} + +static inline void set_clear_free(Set *s) { + _hashmap_clear(HASHMAP_BASE(s), free, NULL); +} + +/* no set_clear_free_free */ + +static inline void *set_steal_first(Set *s) { + return _hashmap_first_key_and_value(HASHMAP_BASE(s), true, NULL); +} + +#define set_clear_with_destructor(s, f) \ + ({ \ + Set *_s = (s); \ + void *_item; \ + while ((_item = set_steal_first(_s))) \ + f(_item); \ + _s; 
\ + }) +#define set_free_with_destructor(s, f) \ + set_free(set_clear_with_destructor(s, f)) + +/* no set_steal_first_key */ +/* no set_first_key */ + +static inline void *set_first(const Set *s) { + return _hashmap_first_key_and_value(HASHMAP_BASE((Set *) s), false, NULL); +} + +/* no set_next */ + +static inline char **set_get_strv(Set *s) { + return _hashmap_get_strv(HASHMAP_BASE(s)); +} + +int _set_ensure_put(Set **s, const struct hash_ops *hash_ops, const void *key HASHMAP_DEBUG_PARAMS); +#define set_ensure_put(s, hash_ops, key) _set_ensure_put(s, hash_ops, key HASHMAP_DEBUG_SRC_ARGS) + +int _set_ensure_consume(Set **s, const struct hash_ops *hash_ops, void *key HASHMAP_DEBUG_PARAMS); +#define set_ensure_consume(s, hash_ops, key) _set_ensure_consume(s, hash_ops, key HASHMAP_DEBUG_SRC_ARGS) + +int set_consume(Set *s, void *value); + +int _set_put_strndup_full(Set **s, const struct hash_ops *hash_ops, const char *p, size_t n HASHMAP_DEBUG_PARAMS); +#define set_put_strndup_full(s, hash_ops, p, n) _set_put_strndup_full(s, hash_ops, p, n HASHMAP_DEBUG_SRC_ARGS) +#define set_put_strdup_full(s, hash_ops, p) set_put_strndup_full(s, hash_ops, p, SIZE_MAX) +#define set_put_strndup(s, p, n) set_put_strndup_full(s, &string_hash_ops_free, p, n) +#define set_put_strdup(s, p) set_put_strndup(s, p, SIZE_MAX) + +int _set_put_strdupv_full(Set **s, const struct hash_ops *hash_ops, char **l HASHMAP_DEBUG_PARAMS); +#define set_put_strdupv_full(s, hash_ops, l) _set_put_strdupv_full(s, hash_ops, l HASHMAP_DEBUG_SRC_ARGS) +#define set_put_strdupv(s, l) set_put_strdupv_full(s, &string_hash_ops_free, l) + +int set_put_strsplit(Set *s, const char *v, const char *separators, ExtractFlags flags); + +#define _SET_FOREACH(e, s, i) \ + for (Iterator i = ITERATOR_FIRST; set_iterate((s), &i, (void**)&(e)); ) +#define SET_FOREACH(e, s) \ + _SET_FOREACH(e, s, UNIQ_T(i, UNIQ)) + +#define SET_FOREACH_MOVE(e, d, s) \ + for (; ({ e = set_first(s); assert_se(!e || set_move_one(d, s, e) >= 0); e; }); 
) + +DEFINE_TRIVIAL_CLEANUP_FUNC(Set*, set_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(Set*, set_free_free); + +#define _cleanup_set_free_ _cleanup_(set_freep) +#define _cleanup_set_free_free_ _cleanup_(set_free_freep) + +int set_strjoin(Set *s, const char *separator, bool wrap_with_separator, char **ret); + +bool set_equal(Set *a, Set *b); + +bool set_fnmatch(Set *include_patterns, Set *exclude_patterns, const char *needle); diff --git a/src/basic/sigbus.c b/src/basic/sigbus.c new file mode 100644 index 0000000..47ab0b8 --- /dev/null +++ b/src/basic/sigbus.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "macro.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "process-util.h" +#include "sigbus.h" +#include "signal-util.h" + +#define SIGBUS_QUEUE_MAX 64 + +static struct sigaction old_sigaction; +static unsigned n_installed = 0; + +/* We maintain a fixed size list of page addresses that triggered a + SIGBUS. We access with list with atomic operations, so that we + don't have to deal with locks between signal handler and main + programs in possibly multiple threads. 
*/ + +static void* volatile sigbus_queue[SIGBUS_QUEUE_MAX]; +static volatile sig_atomic_t n_sigbus_queue = 0; + +static void sigbus_push(void *addr) { + assert(addr); + + /* Find a free place, increase the number of entries and leave, if we can */ + for (size_t u = 0; u < SIGBUS_QUEUE_MAX; u++) { + /* OK to initialize this here since we haven't started the atomic ops yet */ + void *tmp = NULL; + if (__atomic_compare_exchange_n(&sigbus_queue[u], &tmp, addr, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { + __atomic_fetch_add(&n_sigbus_queue, 1, __ATOMIC_SEQ_CST); + return; + } + } + + /* If we can't, make sure the queue size is out of bounds, to + * mark it as overflowed */ + for (;;) { + sig_atomic_t c; + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + c = n_sigbus_queue; + + if (c > SIGBUS_QUEUE_MAX) /* already overflowed */ + return; + + /* OK if we clobber c here, since we either immediately return + * or it will be immediately reinitialized on next loop */ + if (__atomic_compare_exchange_n(&n_sigbus_queue, &c, c + SIGBUS_QUEUE_MAX, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + return; + } +} + +int sigbus_pop(void **ret) { + assert(ret); + + for (;;) { + unsigned u, c; + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + c = n_sigbus_queue; + + if (_likely_(c == 0)) + return 0; + + if (_unlikely_(c > SIGBUS_QUEUE_MAX)) + return -EOVERFLOW; + + for (u = 0; u < SIGBUS_QUEUE_MAX; u++) { + void *addr; + + addr = sigbus_queue[u]; + if (!addr) + continue; + + /* OK if we clobber addr here, since we either immediately return + * or it will be immediately reinitialized on next loop */ + if (__atomic_compare_exchange_n(&sigbus_queue[u], &addr, NULL, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { + __atomic_fetch_sub(&n_sigbus_queue, 1, __ATOMIC_SEQ_CST); + /* If we successfully entered this if condition, addr won't + * have been modified since its assignment, so safe to use it */ + *ret = addr; + return 1; + } + } + } +} + +static void sigbus_handler(int sn, siginfo_t *si, 
void *data) { + unsigned long ul; + void *aligned; + + assert(sn == SIGBUS); + assert(si); + + if (si->si_code != BUS_ADRERR || !si->si_addr) { + assert_se(sigaction(SIGBUS, &old_sigaction, NULL) == 0); + propagate_signal(sn, si); + return; + } + + ul = (unsigned long) si->si_addr; + ul = ul / page_size(); + ul = ul * page_size(); + aligned = (void*) ul; + + /* Let's remember which address failed */ + sigbus_push(aligned); + + /* Replace mapping with an anonymous page, so that the + * execution can continue, however with a zeroed out page */ + assert_se(mmap(aligned, page_size(), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == aligned); +} + +void sigbus_install(void) { + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + /* make sure that sysconf() is not called from a signal handler because + * it is not guaranteed to be async-signal-safe since POSIX.1-2008 */ + (void) page_size(); + + n_installed++; + + if (n_installed == 1) + assert_se(sigaction(SIGBUS, &sa, &old_sigaction) == 0); + + return; +} + +void sigbus_reset(void) { + + if (n_installed <= 0) + return; + + n_installed--; + + if (n_installed == 0) + assert_se(sigaction(SIGBUS, &old_sigaction, NULL) == 0); + + return; +} diff --git a/src/basic/sigbus.h b/src/basic/sigbus.h new file mode 100644 index 0000000..a40b1a8 --- /dev/null +++ b/src/basic/sigbus.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +void sigbus_install(void); +void sigbus_reset(void); + +int sigbus_pop(void **ret); diff --git a/src/basic/signal-util.c b/src/basic/signal-util.c new file mode 100644 index 0000000..5d948462 --- /dev/null +++ b/src/basic/signal-util.c @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "errno-util.h" +#include "macro.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "parse-util.h" +#include "signal-util.h" +#include "stdio-util.h" +#include 
"string-table.h" +#include "string-util.h" + +int reset_all_signal_handlers(void) { + static const struct sigaction sa = { + .sa_handler = SIG_DFL, + .sa_flags = SA_RESTART, + }; + int r = 0; + + for (int sig = 1; sig < _NSIG; sig++) { + + /* These two cannot be caught... */ + if (IN_SET(sig, SIGKILL, SIGSTOP)) + continue; + + /* On Linux the first two RT signals are reserved by + * glibc, and sigaction() will return EINVAL for them. */ + if (sigaction(sig, &sa, NULL) < 0) + if (errno != EINVAL && r >= 0) + r = -errno; + } + + return r; +} + +int reset_signal_mask(void) { + sigset_t ss; + + if (sigemptyset(&ss) < 0) + return -errno; + + return RET_NERRNO(sigprocmask(SIG_SETMASK, &ss, NULL)); +} + +int sigaction_many_internal(const struct sigaction *sa, ...) { + int sig, r = 0; + va_list ap; + + va_start(ap, sa); + + /* negative signal ends the list. 0 signal is skipped. */ + while ((sig = va_arg(ap, int)) >= 0) { + + if (sig == 0) + continue; + + if (sigaction(sig, sa, NULL) < 0) { + if (r >= 0) + r = -errno; + } + } + + va_end(ap); + + return r; +} + +static int sigset_add_many_ap(sigset_t *ss, va_list ap) { + int sig, r = 0; + + assert(ss); + + while ((sig = va_arg(ap, int)) >= 0) { + + if (sig == 0) + continue; + + if (sigaddset(ss, sig) < 0) { + if (r >= 0) + r = -errno; + } + } + + return r; +} + +int sigset_add_many(sigset_t *ss, ...) { + va_list ap; + int r; + + va_start(ap, ss); + r = sigset_add_many_ap(ss, ap); + va_end(ap); + + return r; +} + +int sigprocmask_many(int how, sigset_t *old, ...) 
{ + va_list ap; + sigset_t ss; + int r; + + if (sigemptyset(&ss) < 0) + return -errno; + + va_start(ap, old); + r = sigset_add_many_ap(&ss, ap); + va_end(ap); + + if (r < 0) + return r; + + if (sigprocmask(how, &ss, old) < 0) + return -errno; + + return 0; +} + +static const char *const static_signal_table[] = { + [SIGHUP] = "HUP", + [SIGINT] = "INT", + [SIGQUIT] = "QUIT", + [SIGILL] = "ILL", + [SIGTRAP] = "TRAP", + [SIGABRT] = "ABRT", + [SIGBUS] = "BUS", + [SIGFPE] = "FPE", + [SIGKILL] = "KILL", + [SIGUSR1] = "USR1", + [SIGSEGV] = "SEGV", + [SIGUSR2] = "USR2", + [SIGPIPE] = "PIPE", + [SIGALRM] = "ALRM", + [SIGTERM] = "TERM", +#ifdef SIGSTKFLT + [SIGSTKFLT] = "STKFLT", /* Linux on SPARC doesn't know SIGSTKFLT */ +#endif + [SIGCHLD] = "CHLD", + [SIGCONT] = "CONT", + [SIGSTOP] = "STOP", + [SIGTSTP] = "TSTP", + [SIGTTIN] = "TTIN", + [SIGTTOU] = "TTOU", + [SIGURG] = "URG", + [SIGXCPU] = "XCPU", + [SIGXFSZ] = "XFSZ", + [SIGVTALRM] = "VTALRM", + [SIGPROF] = "PROF", + [SIGWINCH] = "WINCH", + [SIGIO] = "IO", + [SIGPWR] = "PWR", + [SIGSYS] = "SYS" +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(static_signal, int); + +const char *signal_to_string(int signo) { + static thread_local char buf[STRLEN("RTMIN+") + DECIMAL_STR_MAX(int)]; + const char *name; + + name = static_signal_to_string(signo); + if (name) + return name; + + if (signo >= SIGRTMIN && signo <= SIGRTMAX) + xsprintf(buf, "RTMIN+%d", signo - SIGRTMIN); + else + xsprintf(buf, "%d", signo); + + return buf; +} + +int signal_from_string(const char *s) { + const char *p; + int signo, r; + + /* Check that the input is a signal number. */ + if (safe_atoi(s, &signo) >= 0) { + if (SIGNAL_VALID(signo)) + return signo; + else + return -ERANGE; + } + + /* Drop "SIG" prefix. */ + if (startswith(s, "SIG")) + s += 3; + + /* Check that the input is a signal name. */ + signo = static_signal_from_string(s); + if (signo > 0) + return signo; + + /* Check that the input is RTMIN or + * RTMIN+n (0 <= n <= SIGRTMAX-SIGRTMIN). 
*/ + p = startswith(s, "RTMIN"); + if (p) { + if (*p == '\0') + return SIGRTMIN; + if (*p != '+') + return -EINVAL; + + r = safe_atoi(p, &signo); + if (r < 0) + return r; + + if (signo < 0 || signo > SIGRTMAX - SIGRTMIN) + return -ERANGE; + + return signo + SIGRTMIN; + } + + /* Check that the input is RTMAX or + * RTMAX-n (0 <= n <= SIGRTMAX-SIGRTMIN). */ + p = startswith(s, "RTMAX"); + if (p) { + if (*p == '\0') + return SIGRTMAX; + if (*p != '-') + return -EINVAL; + + r = safe_atoi(p, &signo); + if (r < 0) + return r; + + if (signo > 0 || signo < SIGRTMIN - SIGRTMAX) + return -ERANGE; + + return signo + SIGRTMAX; + } + + return -EINVAL; +} + +void nop_signal_handler(int sig) { + /* nothing here */ +} + +int signal_is_blocked(int sig) { + sigset_t ss; + int r; + + r = pthread_sigmask(SIG_SETMASK, NULL, &ss); + if (r != 0) + return -r; + + return RET_NERRNO(sigismember(&ss, sig)); +} + +int pop_pending_signal_internal(int sig, ...) { + sigset_t ss; + va_list ap; + int r; + + if (sig < 0) /* Empty list? */ + return -EINVAL; + + if (sigemptyset(&ss) < 0) + return -errno; + + /* Add first signal (if the signal is zero, we'll silently skip it, to make it easier to build + * parameter lists where some element are sometimes off, similar to how sigset_add_many_ap() handles + * this.) */ + if (sig > 0 && sigaddset(&ss, sig) < 0) + return -errno; + + /* Add all other signals */ + va_start(ap, sig); + r = sigset_add_many_ap(&ss, ap); + va_end(ap); + if (r < 0) + return r; + + r = sigtimedwait(&ss, NULL, &(struct timespec) { 0, 0 }); + if (r < 0) { + if (errno == EAGAIN) + return 0; + + return -errno; + } + + return r; /* Returns the signal popped */ +} + +void propagate_signal(int sig, siginfo_t *siginfo) { + pid_t p; + + /* To be called from a signal handler. Will raise the same signal again, in our process + in our threads. + * + * Note that we use raw_getpid() instead of getpid_cached(). 
We might have forked with raw_clone() + * earlier (see PID 1), and hence let's go to the raw syscall here. In particular as this is not + * performance sensitive code. + * + * Note that we use kill() rather than raise() as fallback, for similar reasons. */ + + p = raw_getpid(); + + if (rt_tgsigqueueinfo(p, gettid(), sig, siginfo) < 0) + assert_se(kill(p, sig) >= 0); +} diff --git a/src/basic/signal-util.h b/src/basic/signal-util.h new file mode 100644 index 0000000..ad2ba84 --- /dev/null +++ b/src/basic/signal-util.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +int reset_all_signal_handlers(void); +int reset_signal_mask(void); + +int sigaction_many_internal(const struct sigaction *sa, ...); + +#define ignore_signals(...) \ + sigaction_many_internal( \ + &(const struct sigaction) { \ + .sa_handler = SIG_IGN, \ + .sa_flags = SA_RESTART \ + }, \ + __VA_ARGS__, \ + -1) + +#define default_signals(...) \ + sigaction_many_internal( \ + &(const struct sigaction) { \ + .sa_handler = SIG_DFL, \ + .sa_flags = SA_RESTART \ + }, \ + __VA_ARGS__, \ + -1) + +#define sigaction_many(sa, ...) \ + sigaction_many_internal(sa, __VA_ARGS__, -1) + +int sigset_add_many(sigset_t *ss, ...); +int sigprocmask_many(int how, sigset_t *old, ...); + +const char *signal_to_string(int i) _const_; +int signal_from_string(const char *s) _pure_; + +void nop_signal_handler(int sig); + +static inline void block_signals_reset(sigset_t *ss) { + assert_se(sigprocmask(SIG_SETMASK, ss, NULL) >= 0); +} + +#define BLOCK_SIGNALS(...) 
\ + _cleanup_(block_signals_reset) _unused_ sigset_t _saved_sigset = ({ \ + sigset_t _t; \ + assert_se(sigprocmask_many(SIG_BLOCK, &_t, __VA_ARGS__, -1) >= 0); \ + _t; \ + }) + +static inline bool SIGNAL_VALID(int signo) { + return signo > 0 && signo < _NSIG; +} + +static inline const char* signal_to_string_with_check(int n) { + if (!SIGNAL_VALID(n)) + return NULL; + + return signal_to_string(n); +} + +int signal_is_blocked(int sig); + +int pop_pending_signal_internal(int sig, ...); +#define pop_pending_signal(...) pop_pending_signal_internal(__VA_ARGS__, -1) + +void propagate_signal(int sig, siginfo_t *siginfo); diff --git a/src/basic/siphash24.c b/src/basic/siphash24.c new file mode 100644 index 0000000..b614ecf --- /dev/null +++ b/src/basic/siphash24.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: CC0-1.0 */ + +/* + SipHash reference C implementation + + Written in 2012 by + Jean-Philippe Aumasson + Daniel J. Bernstein + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+ + (Minimal changes made by Lennart Poettering, to make clean for inclusion in systemd) + (Refactored by Tom Gundersen to split up in several functions and follow systemd + coding style) +*/ + +#include + +#include "macro.h" +#include "siphash24.h" +#include "unaligned.h" + +static uint64_t rotate_left(uint64_t x, uint8_t b) { + assert(b < 64); + + return (x << b) | (x >> (64 - b)); +} + +static void sipround(struct siphash *state) { + assert(state); + + state->v0 += state->v1; + state->v1 = rotate_left(state->v1, 13); + state->v1 ^= state->v0; + state->v0 = rotate_left(state->v0, 32); + state->v2 += state->v3; + state->v3 = rotate_left(state->v3, 16); + state->v3 ^= state->v2; + state->v0 += state->v3; + state->v3 = rotate_left(state->v3, 21); + state->v3 ^= state->v0; + state->v2 += state->v1; + state->v1 = rotate_left(state->v1, 17); + state->v1 ^= state->v2; + state->v2 = rotate_left(state->v2, 32); +} + +void siphash24_init(struct siphash *state, const uint8_t k[static 16]) { + uint64_t k0, k1; + + assert(state); + assert(k); + + k0 = unaligned_read_le64(k); + k1 = unaligned_read_le64(k + 8); + + *state = (struct siphash) { + /* "somepseudorandomlygeneratedbytes" */ + .v0 = 0x736f6d6570736575ULL ^ k0, + .v1 = 0x646f72616e646f6dULL ^ k1, + .v2 = 0x6c7967656e657261ULL ^ k0, + .v3 = 0x7465646279746573ULL ^ k1, + .padding = 0, + .inlen = 0, + }; +} + +void siphash24_compress(const void *_in, size_t inlen, struct siphash *state) { + + const uint8_t *in = ASSERT_PTR(_in); + const uint8_t *end = in + inlen; + size_t left = state->inlen & 7; + uint64_t m; + + assert(state); + + /* Update total length */ + state->inlen += inlen; + + /* If padding exists, fill it out */ + if (left > 0) { + for ( ; in < end && left < 8; in ++, left ++) + state->padding |= ((uint64_t) *in) << (left * 8); + + if (in == end && left < 8) + /* We did not have enough input to fill out the padding completely */ + return; + +#if ENABLE_DEBUG_SIPHASH + printf("(%3zu) v0 %08x %08x\n", 
state->inlen, (uint32_t) (state->v0 >> 32), (uint32_t) state->v0); + printf("(%3zu) v1 %08x %08x\n", state->inlen, (uint32_t) (state->v1 >> 32), (uint32_t) state->v1); + printf("(%3zu) v2 %08x %08x\n", state->inlen, (uint32_t) (state->v2 >> 32), (uint32_t) state->v2); + printf("(%3zu) v3 %08x %08x\n", state->inlen, (uint32_t) (state->v3 >> 32), (uint32_t) state->v3); + printf("(%3zu) compress padding %08x %08x\n", state->inlen, (uint32_t) (state->padding >> 32), (uint32_t)state->padding); +#endif + + state->v3 ^= state->padding; + sipround(state); + sipround(state); + state->v0 ^= state->padding; + + state->padding = 0; + } + + end -= (state->inlen % sizeof(uint64_t)); + + for ( ; in < end; in += 8) { + m = unaligned_read_le64(in); +#if ENABLE_DEBUG_SIPHASH + printf("(%3zu) v0 %08x %08x\n", state->inlen, (uint32_t) (state->v0 >> 32), (uint32_t) state->v0); + printf("(%3zu) v1 %08x %08x\n", state->inlen, (uint32_t) (state->v1 >> 32), (uint32_t) state->v1); + printf("(%3zu) v2 %08x %08x\n", state->inlen, (uint32_t) (state->v2 >> 32), (uint32_t) state->v2); + printf("(%3zu) v3 %08x %08x\n", state->inlen, (uint32_t) (state->v3 >> 32), (uint32_t) state->v3); + printf("(%3zu) compress %08x %08x\n", state->inlen, (uint32_t) (m >> 32), (uint32_t) m); +#endif + state->v3 ^= m; + sipround(state); + sipround(state); + state->v0 ^= m; + } + + left = state->inlen & 7; + switch (left) { + case 7: + state->padding |= ((uint64_t) in[6]) << 48; + _fallthrough_; + case 6: + state->padding |= ((uint64_t) in[5]) << 40; + _fallthrough_; + case 5: + state->padding |= ((uint64_t) in[4]) << 32; + _fallthrough_; + case 4: + state->padding |= ((uint64_t) in[3]) << 24; + _fallthrough_; + case 3: + state->padding |= ((uint64_t) in[2]) << 16; + _fallthrough_; + case 2: + state->padding |= ((uint64_t) in[1]) << 8; + _fallthrough_; + case 1: + state->padding |= ((uint64_t) in[0]); + _fallthrough_; + case 0: + break; + } +} + +uint64_t siphash24_finalize(struct siphash *state) { + uint64_t b; + + 
assert(state); + + b = state->padding | (((uint64_t) state->inlen) << 56); + +#if ENABLE_DEBUG_SIPHASH + printf("(%3zu) v0 %08x %08x\n", state->inlen, (uint32_t) (state->v0 >> 32), (uint32_t) state->v0); + printf("(%3zu) v1 %08x %08x\n", state->inlen, (uint32_t) (state->v1 >> 32), (uint32_t) state->v1); + printf("(%3zu) v2 %08x %08x\n", state->inlen, (uint32_t) (state->v2 >> 32), (uint32_t) state->v2); + printf("(%3zu) v3 %08x %08x\n", state->inlen, (uint32_t) (state->v3 >> 32), (uint32_t) state->v3); + printf("(%3zu) padding %08x %08x\n", state->inlen, (uint32_t) (state->padding >> 32), (uint32_t) state->padding); +#endif + + state->v3 ^= b; + sipround(state); + sipround(state); + state->v0 ^= b; + +#if ENABLE_DEBUG_SIPHASH + printf("(%3zu) v0 %08x %08x\n", state->inlen, (uint32_t) (state->v0 >> 32), (uint32_t) state->v0); + printf("(%3zu) v1 %08x %08x\n", state->inlen, (uint32_t) (state->v1 >> 32), (uint32_t) state->v1); + printf("(%3zu) v2 %08x %08x\n", state->inlen, (uint32_t) (state->v2 >> 32), (uint32_t) state->v2); + printf("(%3zu) v3 %08x %08x\n", state->inlen, (uint32_t) (state->v3 >> 32), (uint32_t) state->v3); +#endif + state->v2 ^= 0xff; + + sipround(state); + sipround(state); + sipround(state); + sipround(state); + + return state->v0 ^ state->v1 ^ state->v2 ^ state->v3; +} + +uint64_t siphash24(const void *in, size_t inlen, const uint8_t k[static 16]) { + struct siphash state; + + assert(in); + assert(k); + + siphash24_init(&state, k); + siphash24_compress(in, inlen, &state); + + return siphash24_finalize(&state); +} diff --git a/src/basic/siphash24.h b/src/basic/siphash24.h new file mode 100644 index 0000000..0b3e845 --- /dev/null +++ b/src/basic/siphash24.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: CC0-1.0 */ + +#pragma once + +#include +#include +#include +#include + +#include "string-util.h" +#include "time-util.h" + +struct siphash { + uint64_t v0; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t padding; + size_t inlen; +}; + +void 
siphash24_init(struct siphash *state, const uint8_t k[static 16]); +void siphash24_compress(const void *in, size_t inlen, struct siphash *state); +#define siphash24_compress_byte(byte, state) siphash24_compress((const uint8_t[]) { (byte) }, 1, (state)) + +static inline void siphash24_compress_boolean(bool in, struct siphash *state) { + uint8_t i = in; + + siphash24_compress(&i, sizeof i, state); +} + +static inline void siphash24_compress_usec_t(usec_t in, struct siphash *state) { + siphash24_compress(&in, sizeof in, state); +} + +static inline void siphash24_compress_safe(const void *in, size_t inlen, struct siphash *state) { + if (inlen == 0) + return; + + siphash24_compress(in, inlen, state); +} + +static inline void siphash24_compress_string(const char *in, struct siphash *state) { + siphash24_compress_safe(in, strlen_ptr(in), state); +} + +uint64_t siphash24_finalize(struct siphash *state); + +uint64_t siphash24(const void *in, size_t inlen, const uint8_t k[static 16]); + +static inline uint64_t siphash24_string(const char *s, const uint8_t k[static 16]) { + return siphash24(s, strlen(s) + 1, k); +} diff --git a/src/basic/socket-util.c b/src/basic/socket-util.c new file mode 100644 index 0000000..beb64d8 --- /dev/null +++ b/src/basic/socket-util.c @@ -0,0 +1,1696 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "io-util.h" +#include "log.h" +#include "memory-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "sysctl-util.h" +#include "user-util.h" +#include "utf8.h" + +#if ENABLE_IDN +# define IDN_FLAGS NI_IDN +#else +# define 
IDN_FLAGS 0 +#endif + +/* From the kernel's include/net/scm.h */ +#ifndef SCM_MAX_FD +# define SCM_MAX_FD 253 +#endif + +static const char* const socket_address_type_table[] = { + [SOCK_STREAM] = "Stream", + [SOCK_DGRAM] = "Datagram", + [SOCK_RAW] = "Raw", + [SOCK_RDM] = "ReliableDatagram", + [SOCK_SEQPACKET] = "SequentialPacket", + [SOCK_DCCP] = "DatagramCongestionControl", +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_address_type, int); + +int socket_address_verify(const SocketAddress *a, bool strict) { + assert(a); + + /* With 'strict' we enforce additional sanity constraints which are not set by the standard, + * but should only apply to sockets we create ourselves. */ + + switch (socket_address_family(a)) { + + case AF_INET: + if (a->size != sizeof(struct sockaddr_in)) + return -EINVAL; + + if (a->sockaddr.in.sin_port == 0) + return -EINVAL; + + if (!IN_SET(a->type, 0, SOCK_STREAM, SOCK_DGRAM)) + return -EINVAL; + + return 0; + + case AF_INET6: + if (a->size != sizeof(struct sockaddr_in6)) + return -EINVAL; + + if (a->sockaddr.in6.sin6_port == 0) + return -EINVAL; + + if (!IN_SET(a->type, 0, SOCK_STREAM, SOCK_DGRAM)) + return -EINVAL; + + return 0; + + case AF_UNIX: + if (a->size < offsetof(struct sockaddr_un, sun_path)) + return -EINVAL; + if (a->size > sizeof(struct sockaddr_un) + !strict) + /* If !strict, allow one extra byte, since getsockname() on Linux will append + * a NUL byte if we have path sockets that are above sun_path's full size. 
*/ + return -EINVAL; + + if (a->size > offsetof(struct sockaddr_un, sun_path) && + a->sockaddr.un.sun_path[0] != 0 && + strict) { + /* Only validate file system sockets here, and only in strict mode */ + const char *e; + + e = memchr(a->sockaddr.un.sun_path, 0, sizeof(a->sockaddr.un.sun_path)); + if (e) { + /* If there's an embedded NUL byte, make sure the size of the socket address matches it */ + if (a->size != offsetof(struct sockaddr_un, sun_path) + (e - a->sockaddr.un.sun_path) + 1) + return -EINVAL; + } else { + /* If there's no embedded NUL byte, then the size needs to match the whole + * structure or the structure with one extra NUL byte suffixed. (Yeah, Linux is awful, + * and considers both equivalent: getsockname() even extends sockaddr_un beyond its + * size if the path is non NUL terminated.) */ + if (!IN_SET(a->size, sizeof(a->sockaddr.un.sun_path), sizeof(a->sockaddr.un.sun_path)+1)) + return -EINVAL; + } + } + + if (!IN_SET(a->type, 0, SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET)) + return -EINVAL; + + return 0; + + case AF_NETLINK: + + if (a->size != sizeof(struct sockaddr_nl)) + return -EINVAL; + + if (!IN_SET(a->type, 0, SOCK_RAW, SOCK_DGRAM)) + return -EINVAL; + + return 0; + + case AF_VSOCK: + if (a->size != sizeof(struct sockaddr_vm)) + return -EINVAL; + + if (!IN_SET(a->type, 0, SOCK_STREAM, SOCK_DGRAM)) + return -EINVAL; + + return 0; + + default: + return -EAFNOSUPPORT; + } +} + +int socket_address_print(const SocketAddress *a, char **ret) { + int r; + + assert(a); + assert(ret); + + r = socket_address_verify(a, false); /* We do non-strict validation, because we want to be + * able to pretty-print any socket the kernel considers + * valid. We still need to do validation to know if we + * can meaningfully print the address. 
*/ + if (r < 0) + return r; + + if (socket_address_family(a) == AF_NETLINK) { + _cleanup_free_ char *sfamily = NULL; + + r = netlink_family_to_string_alloc(a->protocol, &sfamily); + if (r < 0) + return r; + + r = asprintf(ret, "%s %u", sfamily, a->sockaddr.nl.nl_groups); + if (r < 0) + return -ENOMEM; + + return 0; + } + + return sockaddr_pretty(&a->sockaddr.sa, a->size, false, true, ret); +} + +bool socket_address_can_accept(const SocketAddress *a) { + assert(a); + + return + IN_SET(a->type, SOCK_STREAM, SOCK_SEQPACKET); +} + +bool socket_address_equal(const SocketAddress *a, const SocketAddress *b) { + assert(a); + assert(b); + + /* Invalid addresses are unequal to all */ + if (socket_address_verify(a, false) < 0 || + socket_address_verify(b, false) < 0) + return false; + + if (a->type != b->type) + return false; + + if (socket_address_family(a) != socket_address_family(b)) + return false; + + switch (socket_address_family(a)) { + + case AF_INET: + if (a->sockaddr.in.sin_addr.s_addr != b->sockaddr.in.sin_addr.s_addr) + return false; + + if (a->sockaddr.in.sin_port != b->sockaddr.in.sin_port) + return false; + + break; + + case AF_INET6: + if (memcmp(&a->sockaddr.in6.sin6_addr, &b->sockaddr.in6.sin6_addr, sizeof(a->sockaddr.in6.sin6_addr)) != 0) + return false; + + if (a->sockaddr.in6.sin6_port != b->sockaddr.in6.sin6_port) + return false; + + break; + + case AF_UNIX: + if (a->size <= offsetof(struct sockaddr_un, sun_path) || + b->size <= offsetof(struct sockaddr_un, sun_path)) + return false; + + if ((a->sockaddr.un.sun_path[0] == 0) != (b->sockaddr.un.sun_path[0] == 0)) + return false; + + if (a->sockaddr.un.sun_path[0]) { + if (!path_equal_or_inode_same(a->sockaddr.un.sun_path, b->sockaddr.un.sun_path, 0)) + return false; + } else { + if (a->size != b->size) + return false; + + if (memcmp(a->sockaddr.un.sun_path, b->sockaddr.un.sun_path, a->size - offsetof(struct sockaddr_un, sun_path)) != 0) /* a->size also counts the bytes in front of sun_path (checked above to be smaller than a->size), so compare the name bytes only */ + return false; + } + + break; + + case AF_NETLINK: + if (a->protocol != b->protocol) + return false; + + if 
(a->sockaddr.nl.nl_groups != b->sockaddr.nl.nl_groups) + return false; + + break; + + case AF_VSOCK: + if (a->sockaddr.vm.svm_cid != b->sockaddr.vm.svm_cid) + return false; + + if (a->sockaddr.vm.svm_port != b->sockaddr.vm.svm_port) + return false; + + break; + + default: + /* Cannot compare, so we assume the addresses are different */ + return false; + } + + return true; +} + +const char* socket_address_get_path(const SocketAddress *a) { + assert(a); + + if (socket_address_family(a) != AF_UNIX) + return NULL; + + if (a->sockaddr.un.sun_path[0] == 0) + return NULL; + + /* Note that this is only safe because we know that there's an extra NUL byte after the sockaddr_un + * structure. On Linux AF_UNIX file system socket addresses don't have to be NUL terminated if they take up the + * full sun_path space. */ + assert_cc(sizeof(union sockaddr_union) >= sizeof(struct sockaddr_un)+1); + return a->sockaddr.un.sun_path; +} + +bool socket_ipv6_is_supported(void) { + static int cached = -1; + + if (cached < 0) { + + if (access("/proc/net/if_inet6", F_OK) < 0) { + + if (errno != ENOENT) { + log_debug_errno(errno, "Unexpected error when checking whether /proc/net/if_inet6 exists: %m"); + return false; + } + + cached = false; + } else + cached = true; + } + + return cached; +} + +bool socket_ipv6_is_enabled(void) { + _cleanup_free_ char *v = NULL; + int r; + + /* Much like socket_ipv6_is_supported(), but also checks that the sysctl that disables IPv6 on all + * interfaces isn't turned on */ + + if (!socket_ipv6_is_supported()) + return false; + + r = sysctl_read_ip_property(AF_INET6, "all", "disable_ipv6", &v); + if (r < 0) { + log_debug_errno(r, "Unexpected error reading 'net.ipv6.conf.all.disable_ipv6' sysctl: %m"); + return true; + } + + r = parse_boolean(v); + if (r < 0) { + log_debug_errno(r, "Failed to pare 'net.ipv6.conf.all.disable_ipv6' sysctl: %m"); + return true; + } + + return !r; +} + +bool socket_address_matches_fd(const SocketAddress *a, int fd) { + SocketAddress 
b; + socklen_t solen; + + assert(a); + assert(fd >= 0); + + b.size = sizeof(b.sockaddr); + if (getsockname(fd, &b.sockaddr.sa, &b.size) < 0) + return false; + + if (b.sockaddr.sa.sa_family != a->sockaddr.sa.sa_family) + return false; + + solen = sizeof(b.type); + if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &b.type, &solen) < 0) + return false; + + if (b.type != a->type) + return false; + + if (a->protocol != 0) { + solen = sizeof(b.protocol); + if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &b.protocol, &solen) < 0) + return false; + + if (b.protocol != a->protocol) + return false; + } + + return socket_address_equal(a, &b); +} + +int sockaddr_port(const struct sockaddr *_sa, unsigned *ret_port) { + const union sockaddr_union *sa = (const union sockaddr_union*) _sa; + + /* Note, this returns the port as 'unsigned' rather than 'uint16_t', as AF_VSOCK knows larger ports */ + + assert(sa); + + switch (sa->sa.sa_family) { + + case AF_INET: + *ret_port = be16toh(sa->in.sin_port); + return 0; + + case AF_INET6: + *ret_port = be16toh(sa->in6.sin6_port); + return 0; + + case AF_VSOCK: + *ret_port = sa->vm.svm_port; + return 0; + + default: + return -EAFNOSUPPORT; + } +} + +const union in_addr_union *sockaddr_in_addr(const struct sockaddr *_sa) { + const union sockaddr_union *sa = (const union sockaddr_union*) _sa; + + if (!sa) + return NULL; + + switch (sa->sa.sa_family) { + + case AF_INET: + return (const union in_addr_union*) &sa->in.sin_addr; + + case AF_INET6: + return (const union in_addr_union*) &sa->in6.sin6_addr; + + default: + return NULL; + } +} + +int sockaddr_set_in_addr( + union sockaddr_union *u, + int family, + const union in_addr_union *a, + uint16_t port) { + + assert(u); + assert(a); + + switch (family) { + + case AF_INET: + u->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr = a->in, + .sin_port = htobe16(port), + }; + + return 0; + + case AF_INET6: + u->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = a->in6, + .sin6_port 
= htobe16(port), + }; + + return 0; + + default: + return -EAFNOSUPPORT; + + } +} + +int sockaddr_pretty( + const struct sockaddr *_sa, + socklen_t salen, + bool translate_ipv6, + bool include_port, + char **ret) { + + union sockaddr_union *sa = (union sockaddr_union*) _sa; + char *p; + int r; + + assert(sa); + assert(salen >= sizeof(sa->sa.sa_family)); + + switch (sa->sa.sa_family) { + + case AF_INET: { + uint32_t a; + + a = be32toh(sa->in.sin_addr.s_addr); + + if (include_port) + r = asprintf(&p, + "%u.%u.%u.%u:%u", + a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF, + be16toh(sa->in.sin_port)); + else + r = asprintf(&p, + "%u.%u.%u.%u", + a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF); + if (r < 0) + return -ENOMEM; + break; + } + + case AF_INET6: { + static const unsigned char ipv4_prefix[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF + }; + + if (translate_ipv6 && + memcmp(&sa->in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) { + const uint8_t *a = sa->in6.sin6_addr.s6_addr+12; + if (include_port) + r = asprintf(&p, + "%u.%u.%u.%u:%u", + a[0], a[1], a[2], a[3], + be16toh(sa->in6.sin6_port)); + else + r = asprintf(&p, + "%u.%u.%u.%u", + a[0], a[1], a[2], a[3]); + if (r < 0) + return -ENOMEM; + } else { + const char *a = IN6_ADDR_TO_STRING(&sa->in6.sin6_addr); + + if (include_port) { + if (asprintf(&p, + "[%s]:%u%s%s", + a, + be16toh(sa->in6.sin6_port), + sa->in6.sin6_scope_id != 0 ? 
"%" : "", + FORMAT_IFNAME_FULL(sa->in6.sin6_scope_id, FORMAT_IFNAME_IFINDEX)) < 0) + return -ENOMEM; + } else { + if (sa->in6.sin6_scope_id != 0) + p = strjoin(a, "%", FORMAT_IFNAME_FULL(sa->in6.sin6_scope_id, FORMAT_IFNAME_IFINDEX)); + else + p = strdup(a); + if (!p) + return -ENOMEM; + } + } + + break; + } + + case AF_UNIX: + if (salen <= offsetof(struct sockaddr_un, sun_path) || + (sa->un.sun_path[0] == 0 && salen == offsetof(struct sockaddr_un, sun_path) + 1)) + /* The name must have at least one character (and the leading NUL does not count) */ + p = strdup(""); + else { + /* Note that we calculate the path pointer here through the .un_buffer[] field, in order to + * outtrick bounds checking tools such as ubsan, which are too smart for their own good: on + * Linux the kernel may return sun_path[] data one byte longer than the declared size of the + * field. */ + char *path = (char*) sa->un_buffer + offsetof(struct sockaddr_un, sun_path); + size_t path_len = salen - offsetof(struct sockaddr_un, sun_path); + + if (path[0] == 0) { + /* Abstract socket. When parsing address information from, we + * explicitly reject overly long paths and paths with embedded NULs. + * But we might get such a socket from the outside. Let's return + * something meaningful and printable in this case. 
*/ + + _cleanup_free_ char *e = NULL; + + e = cescape_length(path + 1, path_len - 1); + if (!e) + return -ENOMEM; + + p = strjoin("@", e); + } else { + if (path[path_len - 1] == '\0') + /* We expect a terminating NUL and don't print it */ + path_len --; + + p = cescape_length(path, path_len); + } + } + if (!p) + return -ENOMEM; + + break; + + case AF_VSOCK: + if (include_port) { + if (sa->vm.svm_cid == VMADDR_CID_ANY) + r = asprintf(&p, "vsock::%u", sa->vm.svm_port); + else + r = asprintf(&p, "vsock:%u:%u", sa->vm.svm_cid, sa->vm.svm_port); + } else + r = asprintf(&p, "vsock:%u", sa->vm.svm_cid); + if (r < 0) + return -ENOMEM; + break; + + default: + return -EOPNOTSUPP; + } + + *ret = p; + return 0; +} + +int getpeername_pretty(int fd, bool include_port, char **ret) { + union sockaddr_union sa; + socklen_t salen = sizeof(sa); + int r; + + assert(fd >= 0); + assert(ret); + + if (getpeername(fd, &sa.sa, &salen) < 0) + return -errno; + + if (sa.sa.sa_family == AF_UNIX) { + struct ucred ucred = UCRED_INVALID; + + /* UNIX connection sockets are anonymous, so let's use + * PID/UID as pretty credentials instead */ + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + if (asprintf(ret, "PID "PID_FMT"/UID "UID_FMT, ucred.pid, ucred.uid) < 0) + return -ENOMEM; + + return 0; + } + + /* For remote sockets we translate IPv6 addresses back to IPv4 + * if applicable, since that's nicer. */ + + return sockaddr_pretty(&sa.sa, salen, true, include_port, ret); +} + +int getsockname_pretty(int fd, char **ret) { + union sockaddr_union sa; + socklen_t salen = sizeof(sa); + + assert(fd >= 0); + assert(ret); + + if (getsockname(fd, &sa.sa, &salen) < 0) + return -errno; + + /* For local sockets we do not translate IPv6 addresses back + * to IPv4 if applicable, since this is usually used for + * listening sockets where the difference between IPv4 and + * IPv6 matters. 
*/ + + return sockaddr_pretty(&sa.sa, salen, false, true, ret); +} + +int socknameinfo_pretty(union sockaddr_union *sa, socklen_t salen, char **_ret) { /* Resolves sa to a host name via getnameinfo(); if that fails, falls back to the numeric form produced by sockaddr_pretty(). The newly allocated result is stored in *_ret. */ + int r; + char host[NI_MAXHOST], *ret; + + assert(_ret); + + r = getnameinfo(&sa->sa, salen, host, sizeof(host), NULL, 0, IDN_FLAGS); + if (r != 0) { + int saved_errno = errno; /* save errno now, as sockaddr_pretty() below may overwrite it before we log it. NOTE(review): getnameinfo() reports failures through its EAI_* return value; errno is presumably only meaningful for EAI_SYSTEM - confirm */ + + r = sockaddr_pretty(&sa->sa, salen, true, true, &ret); + if (r < 0) + return r; + + log_debug_errno(saved_errno, "getnameinfo(%s) failed: %m", ret); + } else { + ret = strdup(host); + if (!ret) + return -ENOMEM; + } + + *_ret = ret; + return 0; +} + +static const char* const netlink_family_table[] = { + [NETLINK_ROUTE] = "route", + [NETLINK_FIREWALL] = "firewall", + [NETLINK_INET_DIAG] = "inet-diag", + [NETLINK_NFLOG] = "nflog", + [NETLINK_XFRM] = "xfrm", + [NETLINK_SELINUX] = "selinux", + [NETLINK_ISCSI] = "iscsi", + [NETLINK_AUDIT] = "audit", + [NETLINK_FIB_LOOKUP] = "fib-lookup", + [NETLINK_CONNECTOR] = "connector", + [NETLINK_NETFILTER] = "netfilter", + [NETLINK_IP6_FW] = "ip6-fw", + [NETLINK_DNRTMSG] = "dnrtmsg", + [NETLINK_KOBJECT_UEVENT] = "kobject-uevent", + [NETLINK_GENERIC] = "generic", + [NETLINK_SCSITRANSPORT] = "scsitransport", + [NETLINK_ECRYPTFS] = "ecryptfs", + [NETLINK_RDMA] = "rdma", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(netlink_family, int, INT_MAX); + +static const char* const socket_address_bind_ipv6_only_table[_SOCKET_ADDRESS_BIND_IPV6_ONLY_MAX] = { + [SOCKET_ADDRESS_DEFAULT] = "default", + [SOCKET_ADDRESS_BOTH] = "both", + [SOCKET_ADDRESS_IPV6_ONLY] = "ipv6-only" +}; + +DEFINE_STRING_TABLE_LOOKUP(socket_address_bind_ipv6_only, SocketAddressBindIPv6Only); + +SocketAddressBindIPv6Only socket_address_bind_ipv6_only_or_bool_from_string(const char *n) { /* Accepts either a boolean or one of the names from socket_address_bind_ipv6_only_table[]. */ + int r; + + r = parse_boolean(n); + if (r > 0) + return SOCKET_ADDRESS_IPV6_ONLY; /* boolean true */ + if (r == 0) + return SOCKET_ADDRESS_BOTH; /* boolean false */ + + return socket_address_bind_ipv6_only_from_string(n); /* not a boolean: try the symbolic names */ +} + +bool sockaddr_equal(const union sockaddr_union *a, const union 
sockaddr_union *b) { + assert(a); + assert(b); + + if (a->sa.sa_family != b->sa.sa_family) + return false; + + if (a->sa.sa_family == AF_INET) + return a->in.sin_addr.s_addr == b->in.sin_addr.s_addr; + + if (a->sa.sa_family == AF_INET6) + return memcmp(&a->in6.sin6_addr, &b->in6.sin6_addr, sizeof(a->in6.sin6_addr)) == 0; + + if (a->sa.sa_family == AF_VSOCK) + return a->vm.svm_cid == b->vm.svm_cid; + + return false; +} + +int fd_set_sndbuf(int fd, size_t n, bool increase) { /* Sets the send buffer size of fd to n bytes. If increase is true an already larger buffer is left alone, otherwise the size has to match exactly. Returns 0 if nothing had to be changed, 1 if the size was set, a negative errno-style value on failure. Sizes are compared against n*2 since the kernel doubles the value passed to SO_SNDBUF, see socket(7). */ + int r, value; + socklen_t l = sizeof(value); + + if (n > INT_MAX) + return -ERANGE; + + r = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &value, &l); + if (r >= 0 && l == sizeof(value) && (increase ? (size_t) value >= n*2 : (size_t) value == n*2)) + return 0; /* The ?: must stay parenthesized: it binds weaker than &&, and value may only be looked at if getsockopt() succeeded */ + + /* First, try to set the buffer size with SO_SNDBUF. */ + r = setsockopt_int(fd, SOL_SOCKET, SO_SNDBUF, n); + if (r < 0) + return r; + + /* SO_SNDBUF above may set to the kernel limit, instead of the requested size. + * So, we need to check the actual buffer size here. */ + l = sizeof(value); + r = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &value, &l); + if (r >= 0 && l == sizeof(value) && (increase ? (size_t) value >= n*2 : (size_t) value == n*2)) + return 1; + + /* If we have the privileges we will ignore the kernel limit. */ + r = setsockopt_int(fd, SOL_SOCKET, SO_SNDBUFFORCE, n); + if (r < 0) + return r; + + return 1; +} + +int fd_set_rcvbuf(int fd, size_t n, bool increase) { /* Same as fd_set_sndbuf(), but for the receive buffer (SO_RCVBUF/SO_RCVBUFFORCE). */ + int r, value; + socklen_t l = sizeof(value); + + if (n > INT_MAX) + return -ERANGE; + + r = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &value, &l); + if (r >= 0 && l == sizeof(value) && (increase ? (size_t) value >= n*2 : (size_t) value == n*2)) + return 0; /* The ?: must stay parenthesized: it binds weaker than &&, and value may only be looked at if getsockopt() succeeded */ + + /* First, try to set the buffer size with SO_RCVBUF. */ + r = setsockopt_int(fd, SOL_SOCKET, SO_RCVBUF, n); + if (r < 0) + return r; + + /* SO_RCVBUF above may set to the kernel limit, instead of the requested size. + * So, we need to check the actual buffer size here.
*/ + l = sizeof(value); + r = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &value, &l); + if (r >= 0 && l == sizeof(value) && (increase ? (size_t) value >= n*2 : (size_t) value == n*2)) + return 1; + + /* If we have the privileges we will ignore the kernel limit. */ + r = setsockopt_int(fd, SOL_SOCKET, SO_RCVBUFFORCE, n); + if (r < 0) + return r; + + return 1; +} + +static const char* const ip_tos_table[] = { + [IPTOS_LOWDELAY] = "low-delay", + [IPTOS_THROUGHPUT] = "throughput", + [IPTOS_RELIABILITY] = "reliability", + [IPTOS_LOWCOST] = "low-cost", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(ip_tos, int, 0xff); + +bool ifname_valid_char(char a) { + if ((unsigned char) a >= 127U) + return false; + + if ((unsigned char) a <= 32U) + return false; + + if (IN_SET(a, + ':', /* colons are used by the legacy "alias" interface logic */ + '/', /* slashes cannot work, since we need to use network interfaces in sysfs paths, and in paths slashes are separators */ + '%')) /* %d is used in the kernel's weird foo%d format string naming feature which we really really don't want to ever run into by accident */ + return false; + + return true; +} + +bool ifname_valid_full(const char *p, IfnameValidFlags flags) { + bool numeric = true; + + /* Checks whether a network interface name is valid. This is inspired by dev_valid_name() in the kernel sources + * but slightly stricter, as we only allow non-control, non-space ASCII characters in the interface name. We + * also don't permit names that only contain numbers, to avoid confusion with numeric interface indexes. */ + + assert(!(flags & ~_IFNAME_VALID_ALL)); + + if (isempty(p)) + return false; + + /* A valid ifindex?
If so, it's valid iff IFNAME_VALID_NUMERIC is set */ + if (parse_ifindex(p) >= 0) + return flags & IFNAME_VALID_NUMERIC; + + if (flags & IFNAME_VALID_ALTERNATIVE) { + if (strlen(p) >= ALTIFNAMSIZ) + return false; + } else { + if (strlen(p) >= IFNAMSIZ) + return false; + } + + if (dot_or_dot_dot(p)) + return false; + + /* Let's refuse "all" and "default" as interface name, to avoid collisions with the special sysctl + * directories /proc/sys/net/{ipv4,ipv6}/conf/{all,default} */ + if (!FLAGS_SET(flags, IFNAME_VALID_SPECIAL) && STR_IN_SET(p, "all", "default")) + return false; + + for (const char *t = p; *t; t++) { + if (!ifname_valid_char(*t)) + return false; + + numeric = numeric && ascii_isdigit(*t); + } + + /* It's fully numeric but didn't parse as valid ifindex above? if so, it must be too large or zero or + * so, let's refuse that. */ + if (numeric) + return false; + + return true; +} + +bool address_label_valid(const char *p) { /* Checks that p is non-empty, shorter than IFNAMSIZ and consists only of bytes in the range 32...126. Unlike ifname_valid_char() this accepts spaces, ':', '/' and '%'. */ + + if (isempty(p)) + return false; + + if (strlen(p) >= IFNAMSIZ) + return false; + + while (*p) { + if ((uint8_t) *p >= 127U) + return false; + + if ((uint8_t) *p <= 31U) + return false; + p++; + } + + return true; +} + +int getpeercred(int fd, struct ucred *ucred) { /* Queries the peer credentials of socket fd via SO_PEERCRED. *ucred is only written on success. */ + socklen_t n = sizeof(struct ucred); + struct ucred u; + int r; + + assert(fd >= 0); + assert(ucred); + + r = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &u, &n); + if (r < 0) + return -errno; + + if (n != sizeof(struct ucred)) + return -EIO; /* the kernel returned a structure of unexpected size */ + + /* Check if the data is actually useful and not suppressed due to namespacing issues */ + if (!pid_is_valid(u.pid)) + return -ENODATA; + + /* Note that we don't check UID/GID here, as namespace translation works differently there: instead of + * receiving an "invalid" user/group we get the overflow UID/GID.
*/ + + *ucred = u; + return 0; +} + +int getpeersec(int fd, char **ret) { /* Reads the security label of the peer (SO_PEERSEC) into a newly allocated string stored in *ret. Returns -EOPNOTSUPP if the label is empty. */ + _cleanup_free_ char *s = NULL; + socklen_t n = 64; + + assert(fd >= 0); + assert(ret); + + for (;;) { + s = new0(char, n+1); /* n+1: presumably new0() zero-fills, so the extra byte keeps the label NUL terminated - confirm */ + if (!s) + return -ENOMEM; + + if (getsockopt(fd, SOL_SOCKET, SO_PEERSEC, s, &n) >= 0) + break; + + if (errno != ERANGE) + return -errno; + + s = mfree(s); /* ERANGE: buffer too small. Presumably getsockopt() updated n to the required size, so retry with that */ + } + + if (isempty(s)) + return -EOPNOTSUPP; + + *ret = TAKE_PTR(s); + + return 0; +} + +int getpeergroups(int fd, gid_t **ret) { /* Reads the supplementary groups of the peer (SO_PEERGROUPS) into a newly allocated array stored in *ret. Returns the number of entries on success, a negative errno-style value on failure. */ + socklen_t n = sizeof(gid_t) * 64; + _cleanup_free_ gid_t *d = NULL; + + assert(fd >= 0); + assert(ret); + + for (;;) { + d = malloc(n); + if (!d) + return -ENOMEM; + + if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, d, &n) >= 0) + break; + + if (errno != ERANGE) + return -errno; + + d = mfree(d); /* same retry scheme as in getpeersec() above */ + } + + assert_se(n % sizeof(gid_t) == 0); + n /= sizeof(gid_t); /* from here on n is a number of entries, not of bytes */ + + if ((socklen_t) (int) n != n) + return -E2BIG; /* the count doesn't fit into the int we return */ + + *ret = TAKE_PTR(d); + + return (int) n; +} + +ssize_t send_many_fds_iov_sa( + int transport_fd, + int *fds_array, size_t n_fds_array, + const struct iovec *iov, size_t iovlen, + const struct sockaddr *sa, socklen_t len, + int flags) { + + _cleanup_free_ struct cmsghdr *cmsg = NULL; + struct msghdr mh = { + .msg_name = (struct sockaddr*) sa, + .msg_namelen = len, + .msg_iov = (struct iovec *)iov, + .msg_iovlen = iovlen, + }; + ssize_t k; + + assert(transport_fd >= 0); + assert(fds_array || n_fds_array == 0); + + /* The kernel will reject sending more than SCM_MAX_FD FDs at once */ + if (n_fds_array > SCM_MAX_FD) + return -E2BIG; + + /* We need either an FD array or data to send. If there's nothing, return an error.
*/ + if (n_fds_array == 0 && !iov) + return -EINVAL; + + if (n_fds_array > 0) { /* pack all fds into one SCM_RIGHTS control message */ + mh.msg_controllen = CMSG_SPACE(sizeof(int) * n_fds_array); + mh.msg_control = cmsg = malloc(mh.msg_controllen); /* heap allocated, as the size depends on n_fds_array */ + if (!cmsg) + return -ENOMEM; + + *cmsg = (struct cmsghdr) { + .cmsg_len = CMSG_LEN(sizeof(int) * n_fds_array), + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_RIGHTS, + }; + memcpy(CMSG_DATA(cmsg), fds_array, sizeof(int) * n_fds_array); + } + k = sendmsg(transport_fd, &mh, MSG_NOSIGNAL | flags); + if (k < 0) + return (ssize_t) -errno; + + return k; +} + +ssize_t send_one_fd_iov_sa( + int transport_fd, + int fd, + const struct iovec *iov, size_t iovlen, + const struct sockaddr *sa, socklen_t len, + int flags) { /* Sends fd (if it is >= 0) as SCM_RIGHTS control message and/or the data in iov over transport_fd, optionally to the address sa. Returns the result of sendmsg(), or a negative errno-style value on failure. */ + + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control = {}; + struct msghdr mh = { + .msg_name = (struct sockaddr*) sa, + .msg_namelen = len, + .msg_iov = (struct iovec *)iov, + .msg_iovlen = iovlen, + }; + ssize_t k; + + assert(transport_fd >= 0); + + /* + * We need either an FD or data to send. + * If there's nothing, return an error.
+ */ + if (fd < 0 && !iov) + return -EINVAL; + + if (fd >= 0) { + struct cmsghdr *cmsg; + + mh.msg_control = &control; + mh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&mh); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); + } + k = sendmsg(transport_fd, &mh, MSG_NOSIGNAL | flags); + if (k < 0) + return (ssize_t) -errno; + + return k; +} + +int send_one_fd_sa( + int transport_fd, + int fd, + const struct sockaddr *sa, socklen_t len, + int flags) { + + assert(fd >= 0); + + return (int) send_one_fd_iov_sa(transport_fd, fd, NULL, 0, sa, len, flags); +} + +ssize_t receive_many_fds_iov( + int transport_fd, + struct iovec *iov, size_t iovlen, + int **ret_fds_array, size_t *ret_n_fds_array, + int flags) { + + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int) * SCM_MAX_FD)) control; + struct msghdr mh = { + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_iov = iov, + .msg_iovlen = iovlen, + }; + _cleanup_free_ int *fds_array = NULL; + size_t n_fds_array = 0; + struct cmsghdr *cmsg; + ssize_t k; + + assert(transport_fd >= 0); + assert(ret_fds_array); + assert(ret_n_fds_array); + + /* + * Receive many FDs via @transport_fd. We don't care for the transport-type. We retrieve all the FDs + * at once. This is best used in combination with send_many_fds(). + */ + + k = recvmsg_safe(transport_fd, &mh, MSG_CMSG_CLOEXEC | flags); + if (k < 0) + return k; + + CMSG_FOREACH(cmsg, &mh) + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + size_t n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + fds_array = GREEDY_REALLOC(fds_array, n_fds_array + n); + if (!fds_array) { + cmsg_close_all(&mh); + return -ENOMEM; + } + + memcpy(fds_array + n_fds_array, CMSG_TYPED_DATA(cmsg, int), sizeof(int) * n); + n_fds_array += n; + } + + if (n_fds_array == 0) { + cmsg_close_all(&mh); + + /* If didn't receive an FD or any data, return an error. 
*/ + if (k == 0) + return -EIO; + } + + *ret_fds_array = TAKE_PTR(fds_array); + *ret_n_fds_array = n_fds_array; + + return k; +} + +int receive_many_fds(int transport_fd, int **ret_fds_array, size_t *ret_n_fds_array, int flags) { /* Like receive_many_fds_iov(), but without receiving any payload. Returns 0 on success, a negative errno-style value on failure. */ + ssize_t k; + + k = receive_many_fds_iov(transport_fd, NULL, 0, ret_fds_array, ret_n_fds_array, flags); + if (k == 0) + return 0; + + /* k must be negative, since receive_many_fds_iov() only returns a positive value if data was received + * through the iov. */ + assert(k < 0); + return (int) k; +} + +ssize_t receive_one_fd_iov( + int transport_fd, + struct iovec *iov, size_t iovlen, + int flags, + int *ret_fd) { + + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control; + struct msghdr mh = { + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_iov = iov, + .msg_iovlen = iovlen, + }; + struct cmsghdr *found; + ssize_t k; + + assert(transport_fd >= 0); + assert(ret_fd); + + /* + * Receive a single FD via @transport_fd. We don't care for + * the transport-type. We retrieve a single FD at most, so for + * packet-based transports, the caller must ensure to send + * only a single FD per packet. This is best used in + * combination with send_one_fd(). + */ + + k = recvmsg_safe(transport_fd, &mh, MSG_CMSG_CLOEXEC | flags); + if (k < 0) + return k; + + found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int))); + if (!found) { + cmsg_close_all(&mh); + + /* If we didn't receive an FD or any data, return an error. */ + if (k == 0) + return -EIO; + } + + if (found) + *ret_fd = *CMSG_TYPED_DATA(found, int); + else + *ret_fd = -EBADF; /* data arrived, but no fd came along with it */ + + return k; +} + +int receive_one_fd(int transport_fd, int flags) { /* Receives exactly one fd and no payload. Returns the fd on success, a negative errno-style value on failure. */ + int fd; + ssize_t k; + + k = receive_one_fd_iov(transport_fd, NULL, 0, flags, &fd); + if (k == 0) + return fd; + + /* k must be negative, since receive_one_fd_iov() only returns + * a positive value if data was received through the iov.
*/ + assert(k < 0); + return (int) k; +} + +ssize_t next_datagram_size_fd(int fd) { /* Returns the size of the next datagram queued on fd, or a negative errno-style value on failure. */ + ssize_t l; + int k; + + /* This is a bit like FIONREAD/SIOCINQ, however a bit more powerful. The difference being: recv(MSG_PEEK) will + * actually cause the next datagram in the queue to be validated regarding checksums, which FIONREAD doesn't + * do. This difference is actually of major importance as we need to be sure that the size returned here + * actually matches what we will read with recvmsg() next, as otherwise we might end up allocating a buffer of + * the wrong size. */ + + l = recv(fd, NULL, 0, MSG_PEEK|MSG_TRUNC); + if (l < 0) { + if (IN_SET(errno, EOPNOTSUPP, EFAULT)) + goto fallback; + + return -errno; + } + if (l == 0) + goto fallback; + + return l; + +fallback: + k = 0; + + /* Some sockets (AF_PACKET) do not support null-sized recv() with MSG_TRUNC set, let's fall back to FIONREAD + * for them. Checksums don't matter for raw sockets anyway, hence this should be fine. */ + + if (ioctl(fd, FIONREAD, &k) < 0) + return -errno; + + return (ssize_t) k; +} + +/* Put a limit on how many times we will attempt to call accept4(). We loop + * only on "transient" errors, but let's make sure we don't loop forever. */ +#define MAX_FLUSH_ITERATIONS 1024 + +int flush_accept(int fd) { + + int r, b; + socklen_t l = sizeof(b); + + /* Similar to flush_fd() but flushes all incoming connections by accepting and immediately closing + * them. */ + + if (getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &b, &l) < 0) + return -errno; + + assert(l == sizeof(b)); + if (!b) /* Let's check if this socket accepts connections before calling accept(). accept4() can + * return EOPNOTSUPP if the fd is not a listening socket, which we should treat as a fatal + * error, or in case the incoming TCP connection triggered a network issue, which we want to + * treat as a transient error. Thus, let's rule out the first reason for EOPNOTSUPP early, so + * we can loop safely on transient errors below.
*/ + return -ENOTTY; + + for (unsigned iteration = 0;; iteration++) { + int cfd; + + r = fd_wait_for_event(fd, POLLIN, 0); + if (r < 0) { + if (r == -EINTR) + continue; + + return r; + } + if (r == 0) + return 0; /* presumably: nothing is waiting to be accepted (anymore) */ + + if (iteration >= MAX_FLUSH_ITERATIONS) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), + "Failed to flush connections within " STRINGIFY(MAX_FLUSH_ITERATIONS) " iterations."); + + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (cfd < 0) { + if (errno == EAGAIN) + return 0; /* the queue was drained in the meantime */ + + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + continue; + + return -errno; + } + + safe_close(cfd); + } +} + +struct cmsghdr* cmsg_find(struct msghdr *mh, int level, int type, socklen_t length) { /* Returns the first control message in mh that matches level and type and - unless length is (socklen_t) -1 - has exactly that cmsg_len. Returns NULL if there is none. */ + struct cmsghdr *cmsg; + + assert(mh); + + CMSG_FOREACH(cmsg, mh) + if (cmsg->cmsg_level == level && + cmsg->cmsg_type == type && + (length == (socklen_t) -1 || length == cmsg->cmsg_len)) + return cmsg; + + return NULL; +} + +void* cmsg_find_and_copy_data(struct msghdr *mh, int level, int type, void *buf, size_t buf_len) { + struct cmsghdr *cmsg; + + assert(mh); + assert(buf); + assert(buf_len > 0); + + /* This is similar to CMSG_FIND_DATA(), but copies the found data to buf. This should be typically used + * when reading possibly unaligned data such as timestamp, as time_t is 64-bit and size_t is 32-bit on + * RISCV32. See issue #27241. */ + + cmsg = cmsg_find(mh, level, type, CMSG_LEN(buf_len)); + if (!cmsg) + return NULL; + + return memcpy_safe(buf, CMSG_DATA(cmsg), buf_len); +} + +int socket_ioctl_fd(void) { + int fd; + + /* Create a socket to invoke the various network interface ioctl()s on. Traditionally only AF_INET was good for + * that. Since kernel 4.6 AF_NETLINK works for this too. We first try to use AF_INET hence, but if that's not + * available (for example, because it is made unavailable via SECCOMP or such), we'll fall back to the more + * generic AF_NETLINK.
*/ + + fd = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) + fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_GENERIC); + if (fd < 0) + return -errno; /* this is the errno of the AF_NETLINK attempt */ + + return fd; +} + +int sockaddr_un_unlink(const struct sockaddr_un *sa) { /* Removes the file system node of an AF_UNIX socket address. Returns 1 if a node was unlinked, 0 if the address is abstract, a negative errno-style value on failure. */ + const char *p, * nul; + + assert(sa); + + if (sa->sun_family != AF_UNIX) + return -EPROTOTYPE; + + if (sa->sun_path[0] == 0) /* Nothing to do for abstract sockets */ + return 0; + + /* The path in .sun_path is not necessarily NUL terminated. Let's fix that. */ + nul = memchr(sa->sun_path, 0, sizeof(sa->sun_path)); + if (nul) + p = sa->sun_path; + else + p = memdupa_suffix0(sa->sun_path, sizeof(sa->sun_path)); /* presumably a temporary copy with a NUL byte appended - confirm */ + + if (unlink(p) < 0) + return -errno; + + return 1; +} + +int sockaddr_un_set_path(struct sockaddr_un *ret, const char *path) { + size_t l; + + assert(ret); + assert(path); + + /* Initialize ret->sun_path from the specified argument. This will interpret paths starting with '@' as + * abstract namespace sockets, and those starting with '/' as regular filesystem sockets. It won't accept + * anything else (i.e. no relative paths), to avoid ambiguities. Note that this function cannot be used to + * reference paths in the abstract namespace that include NUL bytes in the name. On success the return + * value is the address length to pass along with *ret. */ + + l = strlen(path); + if (l < 2) + return -EINVAL; + if (!IN_SET(path[0], '/', '@')) + return -EINVAL; + + /* Don't allow paths larger than the space in sockaddr_un. Note that we are a tiny bit more restrictive than + * the kernel is: we insist on NUL termination (both for abstract namespace and regular file system socket + * addresses!), which the kernel doesn't. We do this to reduce chance of incompatibility with other apps that + * do not expect non-NUL terminated file system path. */ + if (l+1 > sizeof(ret->sun_path)) + return path[0] == '@' ? -EINVAL : -ENAMETOOLONG; /* return a recognizable error if this is + * too long to fit into a sockaddr_un, but + * is a file system path, and thus might be + * connectible via O_PATH indirection.
*/ + + *ret = (struct sockaddr_un) { + .sun_family = AF_UNIX, + }; + + if (path[0] == '@') { + /* Abstract namespace socket */ + memcpy(ret->sun_path + 1, path + 1, l); /* copy *with* trailing NUL byte; sun_path[0] stays NUL from the initializer above, which is what marks the address as abstract */ + return (int) (offsetof(struct sockaddr_un, sun_path) + l); /* 🔥 *don't* 🔥 include trailing NUL in size */ + + } else { + assert(path[0] == '/'); + + /* File system socket */ + memcpy(ret->sun_path, path, l + 1); /* copy *with* trailing NUL byte */ + return (int) (offsetof(struct sockaddr_un, sun_path) + l + 1); /* include trailing NUL in size */ + } +} + +int socket_bind_to_ifname(int fd, const char *ifname) { + assert(fd >= 0); + + /* Call with NULL to drop binding */ + + return RET_NERRNO(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen_ptr(ifname))); +} + +int socket_bind_to_ifindex(int fd, int ifindex) { /* Binds fd to the interface with the given index. An ifindex <= 0 drops any existing binding. */ + char ifname[IF_NAMESIZE]; + int r; + + assert(fd >= 0); + + if (ifindex <= 0) + /* Drop binding */ + return RET_NERRNO(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, NULL, 0)); + + r = setsockopt_int(fd, SOL_SOCKET, SO_BINDTOIFINDEX, ifindex); + if (r != -ENOPROTOOPT) + return r; /* success, or any failure other than "option not known to the kernel" */ + + /* Fall back to SO_BINDTODEVICE on kernels < 5.0 which didn't have SO_BINDTOIFINDEX */ + r = format_ifname(ifindex, ifname); + if (r < 0) + return r; + + return socket_bind_to_ifname(fd, ifname); +} + +ssize_t recvmsg_safe(int sockfd, struct msghdr *msg, int flags) { + ssize_t n; + + /* A wrapper around recvmsg() that checks for MSG_CTRUNC, and turns it into an error, in a reasonably + * safe way, closing any SCM_RIGHTS fds in the error path. + * + * Note that unlike our usual coding style this might modify *msg on failure.
*/ + + n = recvmsg(sockfd, msg, flags); + if (n < 0) + return -errno; + + if (FLAGS_SET(msg->msg_flags, MSG_CTRUNC)) { + cmsg_close_all(msg); + return -EXFULL; /* a recognizable error code */ + } + + return n; +} + +int socket_get_family(int fd) { /* Returns the address family of the socket as reported by SO_DOMAIN, or a negative errno-style value on failure. */ + int af; + socklen_t sl = sizeof(af); + + if (getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &af, &sl) < 0) + return -errno; + + if (sl != sizeof(af)) + return -EINVAL; + + return af; +} + +int socket_set_recvpktinfo(int fd, int af, bool b) { /* Turns the per-family packet info socket option on or off (b). Pass AF_UNSPEC as af to have the family determined from fd. */ + + if (af == AF_UNSPEC) { + af = socket_get_family(fd); + if (af < 0) + return af; + } + + switch (af) { + + case AF_INET: + return setsockopt_int(fd, IPPROTO_IP, IP_PKTINFO, b); + + case AF_INET6: + return setsockopt_int(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, b); + + case AF_NETLINK: + return setsockopt_int(fd, SOL_NETLINK, NETLINK_PKTINFO, b); + + case AF_PACKET: + return setsockopt_int(fd, SOL_PACKET, PACKET_AUXDATA, b); + + default: + return -EAFNOSUPPORT; + } +} + +int socket_set_unicast_if(int fd, int af, int ifi) { + be32_t ifindex_be = htobe32(ifi); /* both options below are handed the interface index in big endian byte order */ + + if (af == AF_UNSPEC) { + af = socket_get_family(fd); + if (af < 0) + return af; + } + + switch (af) { + + case AF_INET: + return RET_NERRNO(setsockopt(fd, IPPROTO_IP, IP_UNICAST_IF, &ifindex_be, sizeof(ifindex_be))); + + case AF_INET6: + return RET_NERRNO(setsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_IF, &ifindex_be, sizeof(ifindex_be))); + + default: + return -EAFNOSUPPORT; + } +} + +int socket_set_option(int fd, int af, int opt_ipv4, int opt_ipv6, int val) { /* Sets opt_ipv4 (at level IPPROTO_IP) or opt_ipv6 (at level IPPROTO_IPV6) to val, depending on the address family. */ + if (af == AF_UNSPEC) { + af = socket_get_family(fd); + if (af < 0) + return af; + } + + switch (af) { + + case AF_INET: + return setsockopt_int(fd, IPPROTO_IP, opt_ipv4, val); + + case AF_INET6: + return setsockopt_int(fd, IPPROTO_IPV6, opt_ipv6, val); + + default: + return -EAFNOSUPPORT; + } +} + +int socket_get_mtu(int fd, int af, size_t *ret) { /* Queries IP_MTU or IPV6_MTU and stores the result in *ret. Non-positive values are refused with -EINVAL. */ + int mtu, r; + + if (af == AF_UNSPEC) { + af = socket_get_family(fd); + if (af < 0) + return af; + } + + switch (af) { + + case AF_INET: 
+ r = getsockopt_int(fd, IPPROTO_IP, IP_MTU, &mtu); + break; + + case AF_INET6: + r = getsockopt_int(fd, IPPROTO_IPV6, IPV6_MTU, &mtu); + break; + + default: + return -EAFNOSUPPORT; + } + + if (r < 0) + return r; + if (mtu <= 0) + return -EINVAL; + + *ret = (size_t) mtu; + return 0; +} + +static int connect_unix_path_simple(int fd, const char *path) { /* Connects fd to path directly. The caller guarantees (see the asserts) that path is non-empty and fits into sun_path including its trailing NUL. */ + union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + }; + size_t l; + + assert(fd >= 0); + assert(path); + + l = strlen(path); + assert(l > 0); + assert(l < sizeof(sa.un.sun_path)); + + memcpy(sa.un.sun_path, path, l + 1); + return RET_NERRNO(connect(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + l + 1)); +} + +static int connect_unix_inode(int fd, int inode_fd) { /* Connects fd to the socket inode that inode_fd refers to, via the path FORMAT_PROC_FD_PATH() generates for it. */ + assert(fd >= 0); + assert(inode_fd >= 0); + + return connect_unix_path_simple(fd, FORMAT_PROC_FD_PATH(inode_fd)); +} + +int connect_unix_path(int fd, int dir_fd, const char *path) { + _cleanup_close_ int inode_fd = -EBADF; + + assert(fd >= 0); + assert(dir_fd == AT_FDCWD || dir_fd >= 0); + + /* Connects to the specified AF_UNIX socket in the file system. Works around the 108 byte size limit + * in sockaddr_un, by going via O_PATH if needed. This hence works for any kind of path. */ + + if (!path) + return connect_unix_inode(fd, dir_fd); /* If no path is specified, then dir_fd refers to the socket inode to connect to. NOTE(review): dir_fd has to be a real fd in this case; AT_FDCWD (negative on Linux) would trip the assert in connect_unix_inode(). */ + + /* Refuse zero length path early, to make sure AF_UNIX stack won't mistake this for an abstract + * namespace path, since first char is NUL */ + if (isempty(path)) + return -EINVAL; + + /* Shortcut for the simple case */ + if (dir_fd == AT_FDCWD && strlen(path) < sizeof_field(struct sockaddr_un, sun_path)) + return connect_unix_path_simple(fd, path); + + /* If dir_fd is specified, then we need to go the indirect O_PATH route, because connectat() does not + * exist. If the path is too long, we also need to take the indirect route, since we can't fit this + * into a sockaddr_un directly.
*/ + + inode_fd = openat(dir_fd, path, O_PATH|O_CLOEXEC); + if (inode_fd < 0) + return -errno; + + return connect_unix_inode(fd, inode_fd); +} + +int socket_address_parse_unix(SocketAddress *ret_address, const char *s) { /* Parses an AF_UNIX address: "/..." is a file system path, "@..." an abstract namespace name. Returns -EPROTO if s is neither, so callers can tell "not this kind of address" apart from other failures. */ + struct sockaddr_un un; + int r; + + assert(ret_address); + assert(s); + + if (!IN_SET(*s, '/', '@')) + return -EPROTO; + + r = sockaddr_un_set_path(&un, s); + if (r < 0) + return r; + + *ret_address = (SocketAddress) { + .sockaddr.un = un, + .size = r, + }; + + return 0; +} + +int socket_address_parse_vsock(SocketAddress *ret_address, const char *s) { + /* AF_VSOCK socket in vsock:cid:port notation. The cid may be left empty, which means VMADDR_CID_ANY. Instead of "vsock:" the prefix may be "vsock-dgram:", "vsock-seqpacket:" or "vsock-stream:" to pin the socket type. Returns -EPROTO if s carries none of these prefixes. */ + _cleanup_free_ char *n = NULL; + char *e, *cid_start; + unsigned port, cid; + int type, r; + + assert(ret_address); + assert(s); + + if ((cid_start = startswith(s, "vsock:"))) + type = 0; + else if ((cid_start = startswith(s, "vsock-dgram:"))) + type = SOCK_DGRAM; + else if ((cid_start = startswith(s, "vsock-seqpacket:"))) + type = SOCK_SEQPACKET; + else if ((cid_start = startswith(s, "vsock-stream:"))) + type = SOCK_STREAM; + else + return -EPROTO; + + e = strchr(cid_start, ':'); + if (!e) + return -EINVAL; + + r = safe_atou(e+1, &port); + if (r < 0) + return r; + + n = strndup(cid_start, e - cid_start); + if (!n) + return -ENOMEM; + + if (isempty(n)) + cid = VMADDR_CID_ANY; + else { + r = safe_atou(n, &cid); + if (r < 0) + return r; + } + + *ret_address = (SocketAddress) { + .sockaddr.vm = { + .svm_cid = cid, + .svm_family = AF_VSOCK, + .svm_port = port, + }, + .type = type, + .size = sizeof(struct sockaddr_vm), + }; + + return 0; +} diff --git a/src/basic/socket-util.h b/src/basic/socket-util.h new file mode 100644 index 0000000..9a11df8 --- /dev/null +++ b/src/basic/socket-util.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "errno-util.h" +#include "in-addr-util.h" +#include 
"macro.h" +#include "missing_network.h" +#include "missing_socket.h" +#include "sparse-endian.h" + +union sockaddr_union { + /* The minimal, abstract version */ + struct sockaddr sa; + + /* The libc provided version that allocates "enough room" for every protocol */ + struct sockaddr_storage storage; + + /* Protocol-specific implementations */ + struct sockaddr_in in; + struct sockaddr_in6 in6; + struct sockaddr_un un; + struct sockaddr_nl nl; + struct sockaddr_ll ll; + struct sockaddr_vm vm; + + /* Ensure there is enough space to store Infiniband addresses */ + uint8_t ll_buffer[offsetof(struct sockaddr_ll, sll_addr) + CONST_MAX(ETH_ALEN, INFINIBAND_ALEN)]; + + /* Ensure there is enough space after the AF_UNIX sun_path for one more NUL byte, just to be sure that the path + * component is always followed by at least one NUL byte. */ + uint8_t un_buffer[sizeof(struct sockaddr_un) + 1]; +}; + +#define SUN_PATH_LEN (sizeof(((struct sockaddr_un){}).sun_path)) + +typedef struct SocketAddress { + union sockaddr_union sockaddr; + + /* We store the size here explicitly due to the weird + * sockaddr_un semantics for abstract sockets */ + socklen_t size; + + /* Socket type, e.g. SOCK_STREAM, SOCK_DGRAM, ... */ + int type; + + /* Socket protocol, IPPROTO_xxx, usually 0, except for netlink */ + int protocol; +} SocketAddress; + +typedef enum SocketAddressBindIPv6Only { + SOCKET_ADDRESS_DEFAULT, + SOCKET_ADDRESS_BOTH, + SOCKET_ADDRESS_IPV6_ONLY, + _SOCKET_ADDRESS_BIND_IPV6_ONLY_MAX, + _SOCKET_ADDRESS_BIND_IPV6_ONLY_INVALID = -EINVAL, +} SocketAddressBindIPv6Only; + +#define socket_address_family(a) ((a)->sockaddr.sa.sa_family) + +const char* socket_address_type_to_string(int t) _const_; +int socket_address_type_from_string(const char *s) _pure_; + +int sockaddr_un_unlink(const struct sockaddr_un *sa); + +static inline int socket_address_unlink(const SocketAddress *a) { + return socket_address_family(a) == AF_UNIX ? 
sockaddr_un_unlink(&a->sockaddr.un) : 0; +} + +bool socket_address_can_accept(const SocketAddress *a) _pure_; + +int socket_address_listen( + const SocketAddress *a, + int flags, + int backlog, + SocketAddressBindIPv6Only only, + const char *bind_to_device, + bool reuse_port, + bool free_bind, + bool transparent, + mode_t directory_mode, + mode_t socket_mode, + const char *label); + +int socket_address_verify(const SocketAddress *a, bool strict) _pure_; +int socket_address_print(const SocketAddress *a, char **p); +bool socket_address_matches_fd(const SocketAddress *a, int fd); + +bool socket_address_equal(const SocketAddress *a, const SocketAddress *b) _pure_; + +const char* socket_address_get_path(const SocketAddress *a); + +bool socket_ipv6_is_supported(void); +bool socket_ipv6_is_enabled(void); + +int sockaddr_port(const struct sockaddr *_sa, unsigned *port); +const union in_addr_union *sockaddr_in_addr(const struct sockaddr *sa); +int sockaddr_set_in_addr(union sockaddr_union *u, int family, const union in_addr_union *a, uint16_t port); + +int sockaddr_pretty(const struct sockaddr *_sa, socklen_t salen, bool translate_ipv6, bool include_port, char **ret); +int getpeername_pretty(int fd, bool include_port, char **ret); /* AF_UNIX peers are described by their PID/UID credentials instead of an address */ +int getsockname_pretty(int fd, char **ret); /* always includes the port; IPv6 addresses are not translated back to IPv4 */ + +int socknameinfo_pretty(union sockaddr_union *sa, socklen_t salen, char **_ret); + +const char* socket_address_bind_ipv6_only_to_string(SocketAddressBindIPv6Only b) _const_; +SocketAddressBindIPv6Only socket_address_bind_ipv6_only_from_string(const char *s) _pure_; +SocketAddressBindIPv6Only socket_address_bind_ipv6_only_or_bool_from_string(const char *s); + +int netlink_family_to_string_alloc(int b, char **s); +int netlink_family_from_string(const char *s) _pure_; + +bool sockaddr_equal(const union sockaddr_union *a, const union sockaddr_union *b); /* compares the address part only (AF_INET, AF_INET6; for AF_VSOCK the CID); any other family compares as unequal */ + +int fd_set_sndbuf(int fd, size_t n, bool increase); /* returns 0 if the size was already fine, 1 if it was changed, negative on error */ +static inline int fd_inc_sndbuf(int fd, size_t n) { + return fd_set_sndbuf(fd, n, true); +} +int 
fd_set_rcvbuf(int fd, size_t n, bool increase); +static inline int fd_increase_rxbuf(int fd, size_t n) { /* receive-side counterpart of fd_inc_sndbuf() */ + return fd_set_rcvbuf(fd, n, true); +} + +int ip_tos_to_string_alloc(int i, char **s); +int ip_tos_from_string(const char *s); + +typedef enum { + IFNAME_VALID_ALTERNATIVE = 1 << 0, /* Allow "altnames" too */ + IFNAME_VALID_NUMERIC = 1 << 1, /* Allow decimal formatted ifindexes too */ + IFNAME_VALID_SPECIAL = 1 << 2, /* Allow the special names "all" and "default" */ + _IFNAME_VALID_ALL = IFNAME_VALID_ALTERNATIVE | IFNAME_VALID_NUMERIC | IFNAME_VALID_SPECIAL, +} IfnameValidFlags; +bool ifname_valid_char(char a); +bool ifname_valid_full(const char *p, IfnameValidFlags flags); +static inline bool ifname_valid(const char *p) { /* strictest check: no altnames, no numeric ifindexes, no "all"/"default" */ + return ifname_valid_full(p, 0); +} +bool address_label_valid(const char *p); + +int getpeercred(int fd, struct ucred *ucred); +int getpeersec(int fd, char **ret); +int getpeergroups(int fd, gid_t **ret); /* returns the number of GIDs stored in *ret */ + +ssize_t send_many_fds_iov_sa( + int transport_fd, + int *fds_array, size_t n_fds_array, + const struct iovec *iov, size_t iovlen, + const struct sockaddr *sa, socklen_t len, + int flags); +static inline ssize_t send_many_fds_iov( + int transport_fd, + int *fds_array, size_t n_fds_array, + const struct iovec *iov, size_t iovlen, + int flags) { /* same as send_many_fds_iov_sa(), without a destination address */ + + return send_many_fds_iov_sa(transport_fd, fds_array, n_fds_array, iov, iovlen, NULL, 0, flags); +} +static inline int send_many_fds( + int transport_fd, + int *fds_array, + size_t n_fds_array, + int flags) { /* fds only: no payload, no destination address. The ssize_t result is narrowed to int. */ + + return send_many_fds_iov_sa(transport_fd, fds_array, n_fds_array, NULL, 0, NULL, 0, flags); +} +ssize_t send_one_fd_iov_sa( + int transport_fd, + int fd, + const struct iovec *iov, size_t iovlen, + const struct sockaddr *sa, socklen_t len, + int flags); +int send_one_fd_sa(int transport_fd, + int fd, + const struct sockaddr *sa, socklen_t len, + int flags); +#define send_one_fd_iov(transport_fd, fd, iov, iovlen, flags) send_one_fd_iov_sa(transport_fd, fd, iov, iovlen, NULL, 0, flags) 
+#define send_one_fd(transport_fd, fd, flags) send_one_fd_iov_sa(transport_fd, fd, NULL, 0, NULL, 0, flags) +ssize_t receive_one_fd_iov(int transport_fd, struct iovec *iov, size_t iovlen, int flags, int *ret_fd); +int receive_one_fd(int transport_fd, int flags); +ssize_t receive_many_fds_iov(int transport_fd, struct iovec *iov, size_t iovlen, int **ret_fds_array, size_t *ret_n_fds_array, int flags); +int receive_many_fds(int transport_fd, int **ret_fds_array, size_t *ret_n_fds_array, int flags); + +ssize_t next_datagram_size_fd(int fd); + +int flush_accept(int fd); + +#define CMSG_FOREACH(cmsg, mh) \ + for ((cmsg) = CMSG_FIRSTHDR(mh); (cmsg); (cmsg) = CMSG_NXTHDR((mh), (cmsg))) + +/* Returns the cmsghdr's data pointer, but safely cast to the specified type. Does two alignment checks: one + * at compile time, that the requested type has a smaller or same alignment as 'struct cmsghdr', and one + * during runtime, that the actual pointer matches the alignment too. This is supposed to catch cases such as + * 'struct timeval' is embedded into 'struct cmsghdr' on architectures where the alignment of the former is 8 + * bytes (because of a 64-bit time_t), but of the latter is 4 bytes (because size_t is 32 bits), such as + * riscv32. */ +#define CMSG_TYPED_DATA(cmsg, type) \ + ({ \ + struct cmsghdr *_cmsg = (cmsg); \ + assert_cc(alignof(type) <= alignof(struct cmsghdr)); \ + _cmsg ? 
CAST_ALIGN_PTR(type, CMSG_DATA(_cmsg)) : (type*) NULL; \ + }) + +struct cmsghdr* cmsg_find(struct msghdr *mh, int level, int type, socklen_t length); +void* cmsg_find_and_copy_data(struct msghdr *mh, int level, int type, void *buf, size_t buf_len); + +/* Type-safe, dereferencing version of cmsg_find() */ +#define CMSG_FIND_DATA(mh, level, type, ctype) \ + CMSG_TYPED_DATA(cmsg_find(mh, level, type, CMSG_LEN(sizeof(ctype))), ctype) + +/* Type-safe version of cmsg_find_and_copy_data() */ +#define CMSG_FIND_AND_COPY_DATA(mh, level, type, ctype) \ + (ctype*) cmsg_find_and_copy_data(mh, level, type, &(ctype){}, sizeof(ctype)) + +/* Resolves to a type that can carry cmsghdr structures. Make sure things are properly aligned, i.e. the type + * itself is placed properly in memory and the size is also aligned to what's appropriate for "cmsghdr" + * structures. */ +#define CMSG_BUFFER_TYPE(size) \ + union { \ + struct cmsghdr cmsghdr; \ + uint8_t buf[size]; \ + uint8_t align_check[(size) >= CMSG_SPACE(0) && \ + (size) == CMSG_ALIGN(size) ? 1 : -1]; \ + } + +/* + * Certain hardware address types (e.g Infiniband) do not fit into sll_addr + * (8 bytes) and run over the structure. This macro returns the correct size that + * must be passed to kernel. + */ +#define SOCKADDR_LL_LEN(sa) \ + ({ \ + const struct sockaddr_ll *_sa = &(sa); \ + size_t _mac_len = sizeof(_sa->sll_addr); \ + assert(_sa->sll_family == AF_PACKET); \ + if (be16toh(_sa->sll_hatype) == ARPHRD_ETHER) \ + _mac_len = MAX(_mac_len, (size_t) ETH_ALEN); \ + if (be16toh(_sa->sll_hatype) == ARPHRD_INFINIBAND) \ + _mac_len = MAX(_mac_len, (size_t) INFINIBAND_ALEN); \ + offsetof(struct sockaddr_ll, sll_addr) + _mac_len; \ + }) + +/* Covers only file system and abstract AF_UNIX socket addresses, but not unnamed socket addresses. */ +#define SOCKADDR_UN_LEN(sa) \ + ({ \ + const struct sockaddr_un *_sa = &(sa); \ + assert(_sa->sun_family == AF_UNIX); \ + offsetof(struct sockaddr_un, sun_path) + \ + (_sa->sun_path[0] == 0 ? 
\ + 1 + strnlen(_sa->sun_path+1, sizeof(_sa->sun_path)-1) : \ + strnlen(_sa->sun_path, sizeof(_sa->sun_path))+1); \ + }) + +#define SOCKADDR_LEN(saddr) \ + ({ \ + const union sockaddr_union *__sa = &(saddr); \ + size_t _len; \ + switch (__sa->sa.sa_family) { \ + case AF_INET: \ + _len = sizeof(struct sockaddr_in); \ + break; \ + case AF_INET6: \ + _len = sizeof(struct sockaddr_in6); \ + break; \ + case AF_UNIX: \ + _len = SOCKADDR_UN_LEN(__sa->un); \ + break; \ + case AF_PACKET: \ + _len = SOCKADDR_LL_LEN(__sa->ll); \ + break; \ + case AF_NETLINK: \ + _len = sizeof(struct sockaddr_nl); \ + break; \ + case AF_VSOCK: \ + _len = sizeof(struct sockaddr_vm); \ + break; \ + default: \ + assert_not_reached(); \ + } \ + _len; \ + }) + +int socket_ioctl_fd(void); + +int sockaddr_un_set_path(struct sockaddr_un *ret, const char *path); + +static inline int setsockopt_int(int fd, int level, int optname, int value) { + if (setsockopt(fd, level, optname, &value, sizeof(value)) < 0) + return -errno; + + return 0; +} + +static inline int getsockopt_int(int fd, int level, int optname, int *ret) { + int v; + socklen_t sl = sizeof(v); + + if (getsockopt(fd, level, optname, &v, &sl) < 0) + return negative_errno(); + if (sl != sizeof(v)) + return -EIO; + + *ret = v; + return 0; +} + +int socket_bind_to_ifname(int fd, const char *ifname); +int socket_bind_to_ifindex(int fd, int ifindex); + +/* Define a 64-bit version of timeval/timespec in any case, even on 32-bit userspace. */ +struct timeval_large { + uint64_t tvl_sec, tvl_usec; +}; +struct timespec_large { + uint64_t tvl_sec, tvl_nsec; +}; + +/* glibc duplicates timespec/timeval on certain 32-bit arches, once in 32-bit and once in 64-bit. + * See __convert_scm_timestamps() in glibc source code. Hence, we need additional buffer space for them + * to prevent from recvmsg_safe() returning -EXFULL. */ +#define CMSG_SPACE_TIMEVAL \ + ((sizeof(struct timeval) == sizeof(struct timeval_large)) ? 
\ + CMSG_SPACE(sizeof(struct timeval)) : \ + CMSG_SPACE(sizeof(struct timeval)) + \ + CMSG_SPACE(sizeof(struct timeval_large))) +#define CMSG_SPACE_TIMESPEC \ + ((sizeof(struct timespec) == sizeof(struct timespec_large)) ? \ + CMSG_SPACE(sizeof(struct timespec)) : \ + CMSG_SPACE(sizeof(struct timespec)) + \ + CMSG_SPACE(sizeof(struct timespec_large))) + +ssize_t recvmsg_safe(int sockfd, struct msghdr *msg, int flags); + +int socket_get_family(int fd); +int socket_set_recvpktinfo(int fd, int af, bool b); +int socket_set_unicast_if(int fd, int af, int ifi); + +int socket_set_option(int fd, int af, int opt_ipv4, int opt_ipv6, int val); +static inline int socket_set_recverr(int fd, int af, bool b) { + return socket_set_option(fd, af, IP_RECVERR, IPV6_RECVERR, b); +} +static inline int socket_set_recvttl(int fd, int af, bool b) { + return socket_set_option(fd, af, IP_RECVTTL, IPV6_RECVHOPLIMIT, b); +} +static inline int socket_set_ttl(int fd, int af, int ttl) { + return socket_set_option(fd, af, IP_TTL, IPV6_UNICAST_HOPS, ttl); +} +static inline int socket_set_freebind(int fd, int af, bool b) { + return socket_set_option(fd, af, IP_FREEBIND, IPV6_FREEBIND, b); +} +static inline int socket_set_transparent(int fd, int af, bool b) { + return socket_set_option(fd, af, IP_TRANSPARENT, IPV6_TRANSPARENT, b); +} +static inline int socket_set_recvfragsize(int fd, int af, bool b) { + return socket_set_option(fd, af, IP_RECVFRAGSIZE, IPV6_RECVFRAGSIZE, b); +} + +int socket_get_mtu(int fd, int af, size_t *ret); + +/* an initializer for struct ucred that initialized all fields to the invalid value appropriate for each */ +#define UCRED_INVALID { .pid = 0, .uid = UID_INVALID, .gid = GID_INVALID } + +int connect_unix_path(int fd, int dir_fd, const char *path); + +/* Parses AF_UNIX and AF_VSOCK addresses. AF_INET[6] require some netlink calls, so it cannot be in + * src/basic/ and is done from 'socket_local_address from src/shared/. Return -EPROTO in case of + * protocol mismatch. 
*/ +int socket_address_parse_unix(SocketAddress *ret_address, const char *s); +int socket_address_parse_vsock(SocketAddress *ret_address, const char *s); + +/* libc's SOMAXCONN is defined to 128 or 4096 (at least on glibc). But actually, the value can be much + * larger. In our codebase we want to set it to the max usually, since nowadays socket memory is properly + * tracked by memcg, and hence we don't need to enforce extra limits here. Moreover, the kernel caps it to + * /proc/sys/net/core/somaxconn anyway, thus by setting this to unbounded we just make that sysctl file + * authoritative. */ +#define SOMAXCONN_DELUXE INT_MAX diff --git a/src/basic/sort-util.c b/src/basic/sort-util.c new file mode 100644 index 0000000..9eadb96 --- /dev/null +++ b/src/basic/sort-util.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sort-util.h" +#include "alloc-util.h" + +/* hey glibc, APIs with callbacks without a user pointer are so useless */ +void *xbsearch_r(const void *key, const void *base, size_t nmemb, size_t size, + comparison_userdata_fn_t compar, void *arg) { + size_t l, u, idx; + const void *p; + int comparison; + + assert(!size_multiply_overflow(nmemb, size)); + + l = 0; + u = nmemb; + while (l < u) { + idx = (l + u) / 2; + p = (const uint8_t*) base + idx * size; + comparison = compar(key, p, arg); + if (comparison < 0) + u = idx; + else if (comparison > 0) + l = idx + 1; + else + return (void *)p; + } + return NULL; +} + +int cmp_int(const int *a, const int *b) { + return CMP(*a, *b); +} + +int cmp_uint16(const uint16_t *a, const uint16_t *b) { + return CMP(*a, *b); +} diff --git a/src/basic/sort-util.h b/src/basic/sort-util.h new file mode 100644 index 0000000..9c818bd --- /dev/null +++ b/src/basic/sort-util.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +/* This is the same as glibc's internal __compar_d_fn_t type. 
glibc exports a public comparison_fn_t, for the + * external type __compar_fn_t, but doesn't do anything similar for __compar_d_fn_t. Let's hence do that + * ourselves, picking a name that is obvious, but likely enough to not clash with glibc's choice of naming if + * they should ever add one. */ +typedef int (*comparison_userdata_fn_t)(const void *, const void *, void *); + +void *xbsearch_r(const void *key, const void *base, size_t nmemb, size_t size, + comparison_userdata_fn_t compar, void *arg); + +#define typesafe_bsearch_r(k, b, n, func, userdata) \ + ({ \ + const typeof((b)[0]) *_k = k; \ + int (*_func_)(const typeof((b)[0])*, const typeof((b)[0])*, typeof(userdata)) = func; \ + (typeof((b)[0])*) xbsearch_r((const void*) _k, (b), (n), sizeof((b)[0]), (comparison_userdata_fn_t) _func_, userdata); \ + }) + +/** + * Normal bsearch requires base to be nonnull. Here we require + * that only if nmemb > 0. + */ +static inline void* bsearch_safe(const void *key, const void *base, + size_t nmemb, size_t size, comparison_fn_t compar) { + if (nmemb <= 0) + return NULL; + + assert(base); + return bsearch(key, base, nmemb, size, compar); +} + +#define typesafe_bsearch(k, b, n, func) \ + ({ \ + const typeof((b)[0]) *_k = k; \ + int (*_func_)(const typeof((b)[0])*, const typeof((b)[0])*) = func; \ + (typeof((b)[0])*) bsearch_safe((const void*) _k, (b), (n), sizeof((b)[0]), (comparison_fn_t) _func_); \ + }) + +/** + * Normal qsort requires base to be nonnull. Here were require + * that only if nmemb > 0. 
+ */ +static inline void _qsort_safe(void *base, size_t nmemb, size_t size, comparison_fn_t compar) { + if (nmemb <= 1) + return; + + assert(base); + qsort(base, nmemb, size, compar); +} + +/* A wrapper around the above, but that adds typesafety: the element size is automatically derived from the type and so + * is the prototype for the comparison function */ +#define typesafe_qsort(p, n, func) \ + ({ \ + int (*_func_)(const typeof((p)[0])*, const typeof((p)[0])*) = func; \ + _qsort_safe((p), (n), sizeof((p)[0]), (comparison_fn_t) _func_); \ + }) + +static inline void qsort_r_safe(void *base, size_t nmemb, size_t size, comparison_userdata_fn_t compar, void *userdata) { + if (nmemb <= 1) + return; + + assert(base); + qsort_r(base, nmemb, size, compar, userdata); +} + +#define typesafe_qsort_r(p, n, func, userdata) \ + ({ \ + int (*_func_)(const typeof((p)[0])*, const typeof((p)[0])*, typeof(userdata)) = func; \ + qsort_r_safe((p), (n), sizeof((p)[0]), (comparison_userdata_fn_t) _func_, userdata); \ + }) + +int cmp_int(const int *a, const int *b); +int cmp_uint16(const uint16_t *a, const uint16_t *b); diff --git a/src/basic/sparse-endian.h b/src/basic/sparse-endian.h new file mode 100644 index 0000000..c795d3d --- /dev/null +++ b/src/basic/sparse-endian.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: MIT + * + * Copyright (c) 2012 Josh Triplett + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include +#include +#include + +#ifdef __CHECKER__ +#define __sd_bitwise __attribute__((__bitwise__)) +#define __sd_force __attribute__((__force__)) +#else +#define __sd_bitwise +#define __sd_force +#endif + +typedef uint16_t __sd_bitwise le16_t; +typedef uint16_t __sd_bitwise be16_t; +typedef uint32_t __sd_bitwise le32_t; +typedef uint32_t __sd_bitwise be32_t; +typedef uint64_t __sd_bitwise le64_t; +typedef uint64_t __sd_bitwise be64_t; + +#undef htobe16 +#undef htole16 +#undef be16toh +#undef le16toh +#undef htobe32 +#undef htole32 +#undef be32toh +#undef le32toh +#undef htobe64 +#undef htole64 +#undef be64toh +#undef le64toh + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define bswap_16_on_le(x) bswap_16(x) +#define bswap_32_on_le(x) bswap_32(x) +#define bswap_64_on_le(x) bswap_64(x) +#define bswap_16_on_be(x) (x) +#define bswap_32_on_be(x) (x) +#define bswap_64_on_be(x) (x) +#elif __BYTE_ORDER == __BIG_ENDIAN +#define bswap_16_on_le(x) (x) +#define bswap_32_on_le(x) (x) +#define bswap_64_on_le(x) (x) +#define bswap_16_on_be(x) bswap_16(x) +#define bswap_32_on_be(x) bswap_32(x) +#define bswap_64_on_be(x) bswap_64(x) +#endif + +static inline le16_t htole16(uint16_t value) { return (le16_t __sd_force) bswap_16_on_be(value); } +static inline le32_t htole32(uint32_t value) { return (le32_t __sd_force) bswap_32_on_be(value); } +static inline le64_t htole64(uint64_t value) { return (le64_t __sd_force) bswap_64_on_be(value); } + +static inline be16_t htobe16(uint16_t 
value) { return (be16_t __sd_force) bswap_16_on_le(value); } +static inline be32_t htobe32(uint32_t value) { return (be32_t __sd_force) bswap_32_on_le(value); } +static inline be64_t htobe64(uint64_t value) { return (be64_t __sd_force) bswap_64_on_le(value); } + +static inline uint16_t le16toh(le16_t value) { return bswap_16_on_be((uint16_t __sd_force)value); } +static inline uint32_t le32toh(le32_t value) { return bswap_32_on_be((uint32_t __sd_force)value); } +static inline uint64_t le64toh(le64_t value) { return bswap_64_on_be((uint64_t __sd_force)value); } + +static inline uint16_t be16toh(be16_t value) { return bswap_16_on_le((uint16_t __sd_force)value); } +static inline uint32_t be32toh(be32_t value) { return bswap_32_on_le((uint32_t __sd_force)value); } +static inline uint64_t be64toh(be64_t value) { return bswap_64_on_le((uint64_t __sd_force)value); } + +#undef __sd_bitwise +#undef __sd_force diff --git a/src/basic/special.h b/src/basic/special.h new file mode 100644 index 0000000..a625e75 --- /dev/null +++ b/src/basic/special.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#define SPECIAL_DEFAULT_TARGET "default.target" +#define SPECIAL_INITRD_TARGET "initrd.target" + +/* Shutdown targets */ +#define SPECIAL_UMOUNT_TARGET "umount.target" +/* This is not really intended to be started by directly. This is + * mostly so that other targets (reboot/halt/poweroff) can depend on + * it to bring all services down that want to be brought down on + * system shutdown. 
*/ +#define SPECIAL_SHUTDOWN_TARGET "shutdown.target" +#define SPECIAL_HALT_TARGET "halt.target" +#define SPECIAL_POWEROFF_TARGET "poweroff.target" +#define SPECIAL_REBOOT_TARGET "reboot.target" +#define SPECIAL_SOFT_REBOOT_TARGET "soft-reboot.target" +#define SPECIAL_KEXEC_TARGET "kexec.target" +#define SPECIAL_EXIT_TARGET "exit.target" +#define SPECIAL_SUSPEND_TARGET "suspend.target" +#define SPECIAL_HIBERNATE_TARGET "hibernate.target" +#define SPECIAL_HYBRID_SLEEP_TARGET "hybrid-sleep.target" +#define SPECIAL_SUSPEND_THEN_HIBERNATE_TARGET "suspend-then-hibernate.target" +#define SPECIAL_FACTORY_RESET_TARGET "factory-reset.target" + +/* Special boot targets */ +#define SPECIAL_RESCUE_TARGET "rescue.target" +#define SPECIAL_EMERGENCY_TARGET "emergency.target" +#define SPECIAL_MULTI_USER_TARGET "multi-user.target" +#define SPECIAL_GRAPHICAL_TARGET "graphical.target" + +/* Early boot targets */ +#define SPECIAL_SYSINIT_TARGET "sysinit.target" +#define SPECIAL_SOCKETS_TARGET "sockets.target" +#define SPECIAL_TIMERS_TARGET "timers.target" +#define SPECIAL_PATHS_TARGET "paths.target" +#define SPECIAL_LOCAL_FS_TARGET "local-fs.target" +#define SPECIAL_LOCAL_FS_PRE_TARGET "local-fs-pre.target" +#define SPECIAL_INITRD_FS_TARGET "initrd-fs.target" +#define SPECIAL_INITRD_ROOT_DEVICE_TARGET "initrd-root-device.target" +#define SPECIAL_INITRD_ROOT_FS_TARGET "initrd-root-fs.target" +#define SPECIAL_INITRD_USR_FS_TARGET "initrd-usr-fs.target" +#define SPECIAL_REMOTE_FS_TARGET "remote-fs.target" /* LSB's $remote_fs */ +#define SPECIAL_REMOTE_FS_PRE_TARGET "remote-fs-pre.target" +#define SPECIAL_SWAP_TARGET "swap.target" +#define SPECIAL_NETWORK_ONLINE_TARGET "network-online.target" +#define SPECIAL_TIME_SYNC_TARGET "time-sync.target" /* LSB's $time */ +#define SPECIAL_TIME_SET_TARGET "time-set.target" +#define SPECIAL_BASIC_TARGET "basic.target" + +/* LSB compatibility */ +#define SPECIAL_NETWORK_TARGET "network.target" /* LSB's $network */ +#define SPECIAL_NSS_LOOKUP_TARGET 
"nss-lookup.target" /* LSB's $named */ +#define SPECIAL_RPCBIND_TARGET "rpcbind.target" /* LSB's $portmap */ + +/* + * Rules regarding adding further high level targets like the above: + * + * - Be conservative, only add more of these when we really need + * them. We need strong use cases for further additions. + * + * - When there can be multiple implementations running side-by-side, + * it needs to be a .target unit which can pull in all + * implementations. + * + * - If something can be implemented with socket activation, and + * without, it needs to be a .target unit, so that it can pull in + * the appropriate unit. + * + * - Otherwise, it should be a .service unit. + * + * - In some cases it is OK to have both a .service and a .target + * unit, i.e. if there can be multiple parallel implementations, but + * only one is the "system" one. Example: syslog. + * + * Or to put this in other words: .service symlinks can be used to + * arbitrate between multiple implementations if there can be only one + * of a kind. .target units can be used to support multiple + * implementations that can run side-by-side. 
+ */ + +/* Magic early boot services */ +#define SPECIAL_FSCK_SERVICE "systemd-fsck@.service" +#define SPECIAL_FSCK_ROOT_SERVICE "systemd-fsck-root.service" +#define SPECIAL_FSCK_USR_SERVICE "systemd-fsck-usr.service" +#define SPECIAL_QUOTACHECK_SERVICE "systemd-quotacheck.service" +#define SPECIAL_QUOTAON_SERVICE "quotaon.service" +#define SPECIAL_REMOUNT_FS_SERVICE "systemd-remount-fs.service" +#define SPECIAL_VOLATILE_ROOT_SERVICE "systemd-volatile-root.service" +#define SPECIAL_UDEVD_SERVICE "systemd-udevd.service" +#define SPECIAL_GROWFS_SERVICE "systemd-growfs@.service" +#define SPECIAL_GROWFS_ROOT_SERVICE "systemd-growfs-root.service" +#define SPECIAL_PCRFS_SERVICE "systemd-pcrfs@.service" +#define SPECIAL_PCRFS_ROOT_SERVICE "systemd-pcrfs-root.service" +#define SPECIAL_HIBERNATE_RESUME_SERVICE "systemd-hibernate-resume.service" + +/* Services systemd relies on */ +#define SPECIAL_DBUS_SERVICE "dbus.service" +#define SPECIAL_DBUS_SOCKET "dbus.socket" +#define SPECIAL_JOURNALD_SOCKET "systemd-journald.socket" +#define SPECIAL_JOURNALD_SERVICE "systemd-journald.service" +#define SPECIAL_TMPFILES_SETUP_SERVICE "systemd-tmpfiles-setup.service" + +/* Magic init signals */ +#define SPECIAL_KBREQUEST_TARGET "kbrequest.target" +#define SPECIAL_SIGPWR_TARGET "sigpwr.target" +#define SPECIAL_CTRL_ALT_DEL_TARGET "ctrl-alt-del.target" + +/* Where we add all our system units, users and machines by default */ +#define SPECIAL_SYSTEM_SLICE "system.slice" +#define SPECIAL_USER_SLICE "user.slice" +#define SPECIAL_MACHINE_SLICE "machine.slice" +#define SPECIAL_ROOT_SLICE "-.slice" + +/* The scope unit systemd itself lives in. */ +#define SPECIAL_INIT_SCOPE "init.scope" + +/* The root directory. 
*/ +#define SPECIAL_ROOT_MOUNT "-.mount" + +/* Special slices valid for the user instance */ +#define SPECIAL_SESSION_SLICE "session.slice" +#define SPECIAL_APP_SLICE "app.slice" +#define SPECIAL_BACKGROUND_SLICE "background.slice" diff --git a/src/basic/stat-util.c b/src/basic/stat-util.c new file mode 100644 index 0000000..c54374b --- /dev/null +++ b/src/basic/stat-util.c @@ -0,0 +1,520 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "dirent-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "filesystems.h" +#include "fs-util.h" +#include "hash-funcs.h" +#include "macro.h" +#include "missing_fs.h" +#include "missing_magic.h" +#include "missing_syscall.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "stat-util.h" +#include "string-util.h" + +int is_symlink(const char *path) { + struct stat info; + + assert(path); + + if (lstat(path, &info) < 0) + return -errno; + + return !!S_ISLNK(info.st_mode); +} + +int is_dir_full(int atfd, const char* path, bool follow) { + struct stat st; + int r; + + assert(atfd >= 0 || atfd == AT_FDCWD); + assert(atfd >= 0 || path); + + if (path) + r = fstatat(atfd, path, &st, follow ? 
0 : AT_SYMLINK_NOFOLLOW); + else + r = fstat(atfd, &st); + if (r < 0) + return -errno; + + return !!S_ISDIR(st.st_mode); +} + +int is_device_node(const char *path) { + struct stat info; + + assert(path); + + if (lstat(path, &info) < 0) + return -errno; + + return !!(S_ISBLK(info.st_mode) || S_ISCHR(info.st_mode)); +} + +int dir_is_empty_at(int dir_fd, const char *path, bool ignore_hidden_or_backup) { + _cleanup_close_ int fd = -EBADF; + struct dirent *buf; + size_t m; + + if (path) { + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + fd = openat(dir_fd, path, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return -errno; + } else if (dir_fd == AT_FDCWD) { + fd = open(".", O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return -errno; + } else { + /* Note that DUPing is not enough, as the internal pointer would still be shared and moved by + * getdents64(). */ + assert(dir_fd >= 0); + + fd = fd_reopen(dir_fd, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return fd; + } + + /* Allocate space for at least 3 full dirents, since every dir has at least two entries ("." + + * ".."), and only once we have seen if there's a third we know whether the dir is empty or not. If + * 'ignore_hidden_or_backup' is true we'll allocate a bit more, since we might skip over a bunch of + * entries that we end up ignoring. */ + m = (ignore_hidden_or_backup ? 16 : 3) * DIRENT_SIZE_MAX; + buf = alloca(m); + + for (;;) { + struct dirent *de; + ssize_t n; + + n = getdents64(fd, buf, m); + if (n < 0) + return -errno; + if (n == 0) + break; + + assert((size_t) n <= m); + msan_unpoison(buf, n); + + FOREACH_DIRENT_IN_BUFFER(de, buf, n) + if (!(ignore_hidden_or_backup ? 
hidden_or_backup_file(de->d_name) : dot_or_dot_dot(de->d_name))) + return 0; + } + + return 1; +} + +bool null_or_empty(struct stat *st) { + assert(st); + + if (S_ISREG(st->st_mode) && st->st_size <= 0) + return true; + + /* We don't want to hardcode the major/minor of /dev/null, hence we do a simpler "is this a character + * device node?" check. */ + + if (S_ISCHR(st->st_mode)) + return true; + + return false; +} + +int null_or_empty_path_with_root(const char *fn, const char *root) { + struct stat st; + int r; + + assert(fn); + + /* A symlink to /dev/null or an empty file? + * When looking under root_dir, we can't expect /dev/ to be mounted, + * so let's see if the path is a (possibly dangling) symlink to /dev/null. */ + + if (path_equal_ptr(path_startswith(fn, root ?: "/"), "dev/null")) + return true; + + r = chase_and_stat(fn, root, CHASE_PREFIX_ROOT, NULL, &st); + if (r < 0) + return r; + + return null_or_empty(&st); +} + +static int fd_is_read_only_fs(int fd) { + struct statvfs st; + + assert(fd >= 0); + + if (fstatvfs(fd, &st) < 0) + return -errno; + + if (st.f_flag & ST_RDONLY) + return true; + + /* On NFS, fstatvfs() might not reflect whether we can actually write to the remote share. Let's try + * again with access(W_OK) which is more reliable, at least sometimes. 
*/ + if (access_fd(fd, W_OK) == -EROFS) + return true; + + return false; +} + +int path_is_read_only_fs(const char *path) { + _cleanup_close_ int fd = -EBADF; + + assert(path); + + fd = open(path, O_CLOEXEC | O_PATH); + if (fd < 0) + return -errno; + + return fd_is_read_only_fs(fd); +} + +int inode_same_at(int fda, const char *filea, int fdb, const char *fileb, int flags) { + struct stat a, b; + + assert(fda >= 0 || fda == AT_FDCWD); + assert(filea); + assert(fdb >= 0 || fdb == AT_FDCWD); + assert(fileb); + + if (fstatat(fda, filea, &a, flags) < 0) + return log_debug_errno(errno, "Cannot stat %s: %m", filea); + + if (fstatat(fdb, fileb, &b, flags) < 0) + return log_debug_errno(errno, "Cannot stat %s: %m", fileb); + + return stat_inode_same(&a, &b); +} + +bool is_fs_type(const struct statfs *s, statfs_f_type_t magic_value) { + assert(s); + assert_cc(sizeof(statfs_f_type_t) >= sizeof(s->f_type)); + + return F_TYPE_EQUAL(s->f_type, magic_value); +} + +int is_fs_type_at(int dir_fd, const char *path, statfs_f_type_t magic_value) { + struct statfs s; + int r; + + r = xstatfsat(dir_fd, path, &s); + if (r < 0) + return r; + + return is_fs_type(&s, magic_value); +} + +bool is_temporary_fs(const struct statfs *s) { + return fs_in_group(s, FILESYSTEM_SET_TEMPORARY); +} + +bool is_network_fs(const struct statfs *s) { + return fs_in_group(s, FILESYSTEM_SET_NETWORK); +} + +int fd_is_temporary_fs(int fd) { + struct statfs s; + + if (fstatfs(fd, &s) < 0) + return -errno; + + return is_temporary_fs(&s); +} + +int fd_is_network_fs(int fd) { + struct statfs s; + + if (fstatfs(fd, &s) < 0) + return -errno; + + return is_network_fs(&s); +} + +int path_is_temporary_fs(const char *path) { + struct statfs s; + + if (statfs(path, &s) < 0) + return -errno; + + return is_temporary_fs(&s); +} + +int path_is_network_fs(const char *path) { + struct statfs s; + + if (statfs(path, &s) < 0) + return -errno; + + return is_network_fs(&s); +} + +int stat_verify_regular(const struct stat *st) { + 
assert(st); + + /* Checks whether the specified stat() structure refers to a regular file. If not returns an appropriate error + * code. */ + + if (S_ISDIR(st->st_mode)) + return -EISDIR; + + if (S_ISLNK(st->st_mode)) + return -ELOOP; + + if (!S_ISREG(st->st_mode)) + return -EBADFD; + + return 0; +} + +int fd_verify_regular(int fd) { + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + return stat_verify_regular(&st); +} + +int verify_regular_at(int dir_fd, const char *path, bool follow) { + struct stat st; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + if (fstatat(dir_fd, path, &st, (isempty(path) ? AT_EMPTY_PATH : 0) | (follow ? 0 : AT_SYMLINK_NOFOLLOW)) < 0) + return -errno; + + return stat_verify_regular(&st); +} + +int stat_verify_directory(const struct stat *st) { + assert(st); + + if (S_ISLNK(st->st_mode)) + return -ELOOP; + + if (!S_ISDIR(st->st_mode)) + return -ENOTDIR; + + return 0; +} + +int fd_verify_directory(int fd) { + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + return stat_verify_directory(&st); +} + +int proc_mounted(void) { + int r; + + /* A quick check of procfs is properly mounted */ + + r = path_is_fs_type("/proc/", PROC_SUPER_MAGIC); + if (r == -ENOENT) /* not mounted at all */ + return false; + + return r; +} + +bool stat_inode_same(const struct stat *a, const struct stat *b) { + + /* Returns if the specified stat structure references the same (though possibly modified) inode. Does + * a thorough check, comparing inode nr, backing device and if the inode is still of the same type. 
*/ + + return a && b && + (a->st_mode & S_IFMT) != 0 && /* We use the check for .st_mode if the structure was ever initialized */ + ((a->st_mode ^ b->st_mode) & S_IFMT) == 0 && /* same inode type */ + a->st_dev == b->st_dev && + a->st_ino == b->st_ino; +} + +bool stat_inode_unmodified(const struct stat *a, const struct stat *b) { + + /* Returns if the specified stat structures reference the same, unmodified inode. This check tries to + * be reasonably careful when detecting changes: we check both inode and mtime, to cater for file + * systems where mtimes are fixed to 0 (think: ostree/nixos type installations). We also check file + * size, backing device, inode type and if this refers to a device not the major/minor. + * + * Note that we don't care if file attributes such as ownership or access mode change, this here is + * about contents of the file. The purpose here is to detect file contents changes, and nothing + * else. */ + + return stat_inode_same(a, b) && + a->st_mtim.tv_sec == b->st_mtim.tv_sec && + a->st_mtim.tv_nsec == b->st_mtim.tv_nsec && + (!S_ISREG(a->st_mode) || a->st_size == b->st_size) && /* if regular file, compare file size */ + (!(S_ISCHR(a->st_mode) || S_ISBLK(a->st_mode)) || a->st_rdev == b->st_rdev); /* if device node, also compare major/minor, because we can */ +} + +bool statx_inode_same(const struct statx *a, const struct statx *b) { + + /* Same as stat_inode_same() but for struct statx */ + + return a && b && + FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO) && FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO) && + (a->stx_mode & S_IFMT) != 0 && + ((a->stx_mode ^ b->stx_mode) & S_IFMT) == 0 && + a->stx_dev_major == b->stx_dev_major && + a->stx_dev_minor == b->stx_dev_minor && + a->stx_ino == b->stx_ino; +} + +bool statx_mount_same(const struct new_statx *a, const struct new_statx *b) { + if (!a || !b) + return false; + + /* if we have the mount ID, that's all we need */ + if (FLAGS_SET(a->stx_mask, STATX_MNT_ID) && FLAGS_SET(b->stx_mask, 
STATX_MNT_ID)) + return a->stx_mnt_id == b->stx_mnt_id; + + /* Otherwise, major/minor of backing device must match */ + return a->stx_dev_major == b->stx_dev_major && + a->stx_dev_minor == b->stx_dev_minor; +} + +static bool is_statx_fatal_error(int err, int flags) { + assert(err < 0); + + /* If statx() is not supported or if we see EPERM (which might indicate seccomp filtering or so), + * let's do a fallback. Note that on EACCES we'll not fall back, since that is likely an indication of + * fs access issues, which we should propagate. */ + if (ERRNO_IS_NOT_SUPPORTED(err) || err == -EPERM) + return false; + + /* When unsupported flags are specified, glibc's fallback function returns -EINVAL. + * See statx_generic() in glibc. */ + if (err != -EINVAL) + return true; + + if ((flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | AT_STATX_SYNC_AS_STAT)) != 0) + return false; /* Unsupported flags are specified. Let's try to use our implementation. */ + + return true; +} + +int statx_fallback(int dfd, const char *path, int flags, unsigned mask, struct statx *sx) { + static bool avoid_statx = false; + struct stat st; + int r; + + if (!avoid_statx) { + r = RET_NERRNO(statx(dfd, path, flags, mask, sx)); + if (r >= 0 || is_statx_fatal_error(r, flags)) + return r; + + avoid_statx = true; + } + + /* Only do fallback if fstatat() supports the flag too, or if it's one of the sync flags, which are + * OK to ignore */ + if ((flags & ~(AT_EMPTY_PATH|AT_NO_AUTOMOUNT|AT_SYMLINK_NOFOLLOW| + AT_STATX_SYNC_AS_STAT|AT_STATX_FORCE_SYNC|AT_STATX_DONT_SYNC)) != 0) + return -EOPNOTSUPP; + + if (fstatat(dfd, path, &st, flags & (AT_EMPTY_PATH|AT_NO_AUTOMOUNT|AT_SYMLINK_NOFOLLOW)) < 0) + return -errno; + + *sx = (struct statx) { + .stx_mask = STATX_TYPE|STATX_MODE| + STATX_NLINK|STATX_UID|STATX_GID| + STATX_ATIME|STATX_MTIME|STATX_CTIME| + STATX_INO|STATX_SIZE|STATX_BLOCKS, + .stx_blksize = st.st_blksize, + .stx_nlink = st.st_nlink, + .stx_uid = st.st_uid, + .stx_gid = st.st_gid, + 
.stx_mode = st.st_mode, + .stx_ino = st.st_ino, + .stx_size = st.st_size, + .stx_blocks = st.st_blocks, + .stx_rdev_major = major(st.st_rdev), + .stx_rdev_minor = minor(st.st_rdev), + .stx_dev_major = major(st.st_dev), + .stx_dev_minor = minor(st.st_dev), + .stx_atime.tv_sec = st.st_atim.tv_sec, + .stx_atime.tv_nsec = st.st_atim.tv_nsec, + .stx_mtime.tv_sec = st.st_mtim.tv_sec, + .stx_mtime.tv_nsec = st.st_mtim.tv_nsec, + .stx_ctime.tv_sec = st.st_ctim.tv_sec, + .stx_ctime.tv_nsec = st.st_ctim.tv_nsec, + }; + + return 0; +} + +int xstatfsat(int dir_fd, const char *path, struct statfs *ret) { + _cleanup_close_ int fd = -EBADF; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(ret); + + fd = xopenat(dir_fd, path, O_PATH|O_CLOEXEC|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0) + return fd; + + return RET_NERRNO(fstatfs(fd, ret)); +} + +void inode_hash_func(const struct stat *q, struct siphash *state) { + siphash24_compress(&q->st_dev, sizeof(q->st_dev), state); + siphash24_compress(&q->st_ino, sizeof(q->st_ino), state); +} + +int inode_compare_func(const struct stat *a, const struct stat *b) { + int r; + + r = CMP(a->st_dev, b->st_dev); + if (r != 0) + return r; + + return CMP(a->st_ino, b->st_ino); +} + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(inode_hash_ops, struct stat, inode_hash_func, inode_compare_func, free); + +const char* inode_type_to_string(mode_t m) { + + /* Returns a short string for the inode type. We use the same name as the underlying macros for each + * inode type. 
*/ + + switch (m & S_IFMT) { + case S_IFREG: + return "reg"; + case S_IFDIR: + return "dir"; + case S_IFLNK: + return "lnk"; + case S_IFCHR: + return "chr"; + case S_IFBLK: + return "blk"; + case S_IFIFO: + return "fifo"; + case S_IFSOCK: + return "sock"; + } + + return NULL; +} diff --git a/src/basic/stat-util.h b/src/basic/stat-util.h new file mode 100644 index 0000000..ae0aaf8 --- /dev/null +++ b/src/basic/stat-util.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "macro.h" +#include "missing_stat.h" +#include "siphash24.h" + +int is_symlink(const char *path); +int is_dir_full(int atfd, const char *fname, bool follow); +static inline int is_dir(const char *path, bool follow) { + return is_dir_full(AT_FDCWD, path, follow); +} +static inline int is_dir_fd(int fd) { + return is_dir_full(fd, NULL, false); +} +int is_device_node(const char *path); + +int dir_is_empty_at(int dir_fd, const char *path, bool ignore_hidden_or_backup); +static inline int dir_is_empty(const char *path, bool ignore_hidden_or_backup) { + return dir_is_empty_at(AT_FDCWD, path, ignore_hidden_or_backup); +} + +bool null_or_empty(struct stat *st) _pure_; +int null_or_empty_path_with_root(const char *fn, const char *root); + +static inline int null_or_empty_path(const char *fn) { + return null_or_empty_path_with_root(fn, NULL); +} + +int path_is_read_only_fs(const char *path); + +int inode_same_at(int fda, const char *filea, int fdb, const char *fileb, int flags); + +static inline int inode_same(const char *filea, const char *fileb, int flags) { + return inode_same_at(AT_FDCWD, filea, AT_FDCWD, fileb, flags); +} + +/* The .f_type field of struct statfs is really weird defined on + * different archs. Let's give its type a name. 
*/ +typedef typeof(((struct statfs*)NULL)->f_type) statfs_f_type_t; + +bool is_fs_type(const struct statfs *s, statfs_f_type_t magic_value) _pure_; +int is_fs_type_at(int dir_fd, const char *path, statfs_f_type_t magic_value); +static inline int fd_is_fs_type(int fd, statfs_f_type_t magic_value) { + return is_fs_type_at(fd, NULL, magic_value); +} +static inline int path_is_fs_type(const char *path, statfs_f_type_t magic_value) { + return is_fs_type_at(AT_FDCWD, path, magic_value); +} + +bool is_temporary_fs(const struct statfs *s) _pure_; +bool is_network_fs(const struct statfs *s) _pure_; + +int fd_is_temporary_fs(int fd); +int fd_is_network_fs(int fd); + +int path_is_temporary_fs(const char *path); +int path_is_network_fs(const char *path); + +/* Because statfs.f_type can be int on some architectures, we have to cast + * the const magic to the type, otherwise the compiler warns about + * signed/unsigned comparison, because the magic can be 32 bit unsigned. Both arguments are parenthesized, so that compound expressions may be passed: the cast applies to all of b, not just to its first operand. + */ +#define F_TYPE_EQUAL(a, b) ((a) == (typeof(a)) (b)) + +int stat_verify_regular(const struct stat *st); +int fd_verify_regular(int fd); +int verify_regular_at(int dir_fd, const char *path, bool follow); + +int stat_verify_directory(const struct stat *st); +int fd_verify_directory(int fd); + +int proc_mounted(void); + +bool stat_inode_same(const struct stat *a, const struct stat *b); +bool stat_inode_unmodified(const struct stat *a, const struct stat *b); + +bool statx_inode_same(const struct statx *a, const struct statx *b); +bool statx_mount_same(const struct new_statx *a, const struct new_statx *b); + +int statx_fallback(int dfd, const char *path, int flags, unsigned mask, struct statx *sx); + +int xstatfsat(int dir_fd, const char *path, struct statfs *ret); + +#if HAS_FEATURE_MEMORY_SANITIZER +# warning "Explicitly initializing struct statx, to work around msan limitation. Please remove as soon as msan has been updated to not require this."
+# define STRUCT_STATX_DEFINE(var) \ + struct statx var = {} +# define STRUCT_NEW_STATX_DEFINE(var) \ + union { \ + struct statx sx; \ + struct new_statx nsx; \ + } var = {} +#else +# define STRUCT_STATX_DEFINE(var) \ + struct statx var +# define STRUCT_NEW_STATX_DEFINE(var) \ + union { \ + struct statx sx; \ + struct new_statx nsx; \ + } var +#endif + +void inode_hash_func(const struct stat *q, struct siphash *state); +int inode_compare_func(const struct stat *a, const struct stat *b); +extern const struct hash_ops inode_hash_ops; + +const char* inode_type_to_string(mode_t m); diff --git a/src/basic/static-destruct.h b/src/basic/static-destruct.h new file mode 100644 index 0000000..2ffc651 --- /dev/null +++ b/src/basic/static-destruct.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include "alloc-util.h" +#include "macro.h" +#include "memory-util.h" + +/* A framework for registering static variables that shall be freed on shutdown of a process. It's a bit like gcc's + * destructor attribute, but allows us to precisely schedule when we want to free the variables. This is supposed to + * feel a bit like the gcc cleanup attribute, but for static variables. Note that this does not work for static + * variables declared in .so's, as the list is private to the same linking unit. But maybe that's a good thing. */ + +#define _common_static_destruct_attrs_ \ + /* Older compilers don't know "retain" attribute. */ \ + _Pragma("GCC diagnostic ignored \"-Wattributes\"") \ + /* The actual destructor structure we place in a special section to find it. */ \ + _section_("SYSTEMD_STATIC_DESTRUCT") \ + /* Use pointer alignment, since that is apparently what gcc does for static variables. */ \ + _alignptr_ \ + /* Make sure this is not dropped from the image despite not being explicitly referenced. */ \ + _used_ \ + /* Prevent garbage collection by the linker. 
*/ \ + _retain_ \ + /* Make sure that AddressSanitizer doesn't pad this variable: we want everything in this section + * packed next to each other so that we can enumerate it. */ \ + _variable_no_sanitize_address_ + +typedef enum StaticDestructorType { + STATIC_DESTRUCTOR_SIMPLE, + STATIC_DESTRUCTOR_ARRAY, + _STATIC_DESTRUCTOR_TYPE_MAX, + _STATIC_DESTRUCTOR_INVALID = -EINVAL, +} StaticDestructorType; + +typedef struct SimpleCleanup { + void *data; + free_func_t destroy; +} SimpleCleanup; + +typedef struct StaticDestructor { + StaticDestructorType type; + union { + SimpleCleanup simple; + ArrayCleanup array; + }; +} StaticDestructor; + +#define STATIC_DESTRUCTOR_REGISTER(variable, func) \ + _STATIC_DESTRUCTOR_REGISTER(UNIQ, variable, func) + +#define _STATIC_DESTRUCTOR_REGISTER(uq, variable, func) \ + /* Type-safe destructor */ \ + static void UNIQ_T(static_destructor_wrapper, uq)(void *p) { \ + typeof(variable) *q = p; \ + func(q); \ + } \ + _common_static_destruct_attrs_ \ + static const StaticDestructor UNIQ_T(static_destructor_entry, uq) = { \ + .type = STATIC_DESTRUCTOR_SIMPLE, \ + .simple.data = &(variable), \ + .simple.destroy = UNIQ_T(static_destructor_wrapper, uq), \ + } + +#define STATIC_ARRAY_DESTRUCTOR_REGISTER(a, n, func) \ + _STATIC_ARRAY_DESTRUCTOR_REGISTER(UNIQ, a, n, func) + +#define _STATIC_ARRAY_DESTRUCTOR_REGISTER(uq, a, n, func) \ + /* Type-safety check */ \ + _unused_ static void (* UNIQ_T(static_destructor_wrapper, uq))(typeof(a[0]) *x, size_t y) = (func); \ + _common_static_destruct_attrs_ \ + static const StaticDestructor UNIQ_T(static_destructor_entry, uq) = { \ + .type = STATIC_DESTRUCTOR_ARRAY, \ + .array.parray = (void**) &(a), \ + .array.pn = &(n), \ + .array.pfunc = (free_array_func_t) (func), \ + }; + +/* Beginning and end of our section listing the destructors. We define these as weak as we want this to work + * even if no destructors are defined and the section is missing. 
*/ +extern const StaticDestructor _weak_ __start_SYSTEMD_STATIC_DESTRUCT[]; +extern const StaticDestructor _weak_ __stop_SYSTEMD_STATIC_DESTRUCT[]; + +/* The function to destroy everything. (Note that this must be static inline, as it's key that it remains in + * the same linking unit as the variables we want to destroy.) */ +static inline void static_destruct(void) { + if (!__start_SYSTEMD_STATIC_DESTRUCT) + return; + + for (const StaticDestructor *d = ALIGN_PTR(__start_SYSTEMD_STATIC_DESTRUCT); + d < __stop_SYSTEMD_STATIC_DESTRUCT; + d = ALIGN_PTR(d + 1)) + switch (d->type) { + case STATIC_DESTRUCTOR_SIMPLE: + d->simple.destroy(d->simple.data); + break; + + case STATIC_DESTRUCTOR_ARRAY: + array_cleanup(&d->array); + break; + + default: + assert_not_reached(); + } +} diff --git a/src/basic/stdio-util.h b/src/basic/stdio-util.h new file mode 100644 index 0000000..4e93ac9 --- /dev/null +++ b/src/basic/stdio-util.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "macro.h" + +_printf_(3, 4) +static inline char *snprintf_ok(char *buf, size_t len, const char *format, ...) { + va_list ap; + int r; + + va_start(ap, format); + DISABLE_WARNING_FORMAT_NONLITERAL; + r = vsnprintf(buf, len, format, ap); + REENABLE_WARNING; + va_end(ap); + + return r >= 0 && (size_t) r < len ? buf : NULL; +} + +#define xsprintf(buf, fmt, ...) 
\ + assert_message_se(snprintf_ok(buf, ELEMENTSOF(buf), fmt, ##__VA_ARGS__), "xsprintf: " #buf "[] must be big enough") + +#define VA_FORMAT_ADVANCE(format, ap) \ +do { \ + int _argtypes[128]; \ + size_t _i, _k; \ + /* See https://github.com/google/sanitizers/issues/992 */ \ + if (HAS_FEATURE_MEMORY_SANITIZER) \ + memset(_argtypes, 0, sizeof(_argtypes)); \ + _k = parse_printf_format((format), ELEMENTSOF(_argtypes), _argtypes); \ + assert(_k < ELEMENTSOF(_argtypes)); \ + for (_i = 0; _i < _k; _i++) { \ + if (_argtypes[_i] & PA_FLAG_PTR) { \ + (void) va_arg(ap, void*); \ + continue; \ + } \ + \ + switch (_argtypes[_i]) { \ + case PA_INT: \ + case PA_INT|PA_FLAG_SHORT: \ + case PA_CHAR: \ + (void) va_arg(ap, int); \ + break; \ + case PA_INT|PA_FLAG_LONG: \ + (void) va_arg(ap, long int); \ + break; \ + case PA_INT|PA_FLAG_LONG_LONG: \ + (void) va_arg(ap, long long int); \ + break; \ + case PA_WCHAR: \ + (void) va_arg(ap, wchar_t); \ + break; \ + case PA_WSTRING: \ + case PA_STRING: \ + case PA_POINTER: \ + (void) va_arg(ap, void*); \ + break; \ + case PA_FLOAT: \ + case PA_DOUBLE: \ + (void) va_arg(ap, double); \ + break; \ + case PA_DOUBLE|PA_FLAG_LONG_DOUBLE: \ + (void) va_arg(ap, long double); \ + break; \ + default: \ + assert_not_reached(); \ + } \ + } \ +} while (false) diff --git a/src/basic/strbuf.c b/src/basic/strbuf.c new file mode 100644 index 0000000..0617acc --- /dev/null +++ b/src/basic/strbuf.c @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "sort-util.h" +#include "strbuf.h" + +/* + * Strbuf stores given strings in a single continuous allocated memory + * area. Identical strings are de-duplicated and return the same offset + * as the first string stored. If the tail of a string already exists + * in the buffer, the tail is returned. + * + * A trie (http://en.wikipedia.org/wiki/Trie) is used to maintain the + * information about the stored strings. 
+ * + * Example of udev rules: + * $ ./udevadm test . + * ... + * read rules file: /usr/lib/udev/rules.d/99-systemd.rules + * rules contain 196608 bytes tokens (16384 * 12 bytes), 39742 bytes strings + * 23939 strings (207859 bytes), 20404 de-duplicated (171653 bytes), 3536 trie nodes used + * ... + */ + +struct strbuf* strbuf_new(void) { + struct strbuf *str; + + str = new(struct strbuf, 1); + if (!str) + return NULL; + *str = (struct strbuf) { + .buf = new0(char, 1), + .root = new0(struct strbuf_node, 1), + .len = 1, + .nodes_count = 1, + }; + if (!str->buf || !str->root) { + free(str->buf); + free(str->root); + return mfree(str); + } + + return str; +} + +static struct strbuf_node* strbuf_node_cleanup(struct strbuf_node *node) { + size_t i; + + for (i = 0; i < node->children_count; i++) + strbuf_node_cleanup(node->children[i].child); + free(node->children); + return mfree(node); +} + +/* clean up trie data, leave only the string buffer */ +void strbuf_complete(struct strbuf *str) { + if (!str) + return; + if (str->root) + str->root = strbuf_node_cleanup(str->root); +} + +/* clean up everything */ +struct strbuf* strbuf_free(struct strbuf *str) { + if (!str) + return NULL; + + strbuf_complete(str); + free(str->buf); + return mfree(str); +} + +static int strbuf_children_cmp(const struct strbuf_child_entry *n1, + const struct strbuf_child_entry *n2) { + return n1->c - n2->c; +} + +static void bubbleinsert(struct strbuf_node *node, + uint8_t c, + struct strbuf_node *node_child) { + + struct strbuf_child_entry new = { + .c = c, + .child = node_child, + }; + int left = 0, right = node->children_count; + + while (right > left) { + int middle = (right + left) / 2 ; + if (strbuf_children_cmp(&node->children[middle], &new) <= 0) + left = middle + 1; + else + right = middle; + } + + memmove(node->children + left + 1, node->children + left, + sizeof(struct strbuf_child_entry) * (node->children_count - left)); + node->children[left] = new; + + node->children_count++; +} + 
+/* add string, return the index/offset into the buffer */ +ssize_t strbuf_add_string(struct strbuf *str, const char *s, size_t len) { + uint8_t c; + char *buf_new; + struct strbuf_child_entry *child; + struct strbuf_node *node; + ssize_t off; + + if (!str->root) + return -EINVAL; + + /* search string; start from last character to find possibly matching tails */ + + str->in_count++; + if (len == 0) { + str->dedup_count++; + return 0; + } + str->in_len += len; + + node = str->root; + for (size_t depth = 0; depth <= len; depth++) { + struct strbuf_child_entry search; + + /* match against current node */ + off = node->value_off + node->value_len - len; + if (depth == len || (node->value_len >= len && memcmp(str->buf + off, s, len) == 0)) { + str->dedup_len += len; + str->dedup_count++; + return off; + } + + c = s[len - 1 - depth]; + + /* lookup child node */ + search.c = c; + child = typesafe_bsearch(&search, node->children, node->children_count, strbuf_children_cmp); + if (!child) + break; + node = child->child; + } + + /* add new string */ + buf_new = realloc(str->buf, str->len + len+1); + if (!buf_new) + return -ENOMEM; + str->buf = buf_new; + off = str->len; + memcpy(str->buf + off, s, len); + str->len += len; + str->buf[str->len++] = '\0'; + + /* new node */ + _cleanup_free_ struct strbuf_node *node_child = NULL; + + node_child = new(struct strbuf_node, 1); + if (!node_child) + return -ENOMEM; + *node_child = (struct strbuf_node) { + .value_off = off, + .value_len = len, + }; + + /* extend array, add new entry, sort for bisection */ + child = reallocarray(node->children, node->children_count + 1, sizeof(struct strbuf_child_entry)); + if (!child) + return -ENOMEM; + + str->nodes_count++; + + node->children = child; + bubbleinsert(node, c, TAKE_PTR(node_child)); + + return off; +} diff --git a/src/basic/strbuf.h b/src/basic/strbuf.h new file mode 100644 index 0000000..6187c08 --- /dev/null +++ b/src/basic/strbuf.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" + +struct strbuf { + char *buf; + size_t len; + struct strbuf_node *root; + + size_t nodes_count; + size_t in_count; + size_t in_len; + size_t dedup_len; + size_t dedup_count; +}; + +struct strbuf_node { + size_t value_off; + size_t value_len; + + struct strbuf_child_entry *children; + uint8_t children_count; +}; + +struct strbuf_child_entry { + uint8_t c; + struct strbuf_node *child; +}; + +struct strbuf* strbuf_new(void); +ssize_t strbuf_add_string(struct strbuf *str, const char *s, size_t len); +void strbuf_complete(struct strbuf *str); +struct strbuf* strbuf_free(struct strbuf *str); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct strbuf*, strbuf_free); diff --git a/src/basic/string-table.c b/src/basic/string-table.c new file mode 100644 index 0000000..3a63767 --- /dev/null +++ b/src/basic/string-table.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "string-table.h" +#include "string-util.h" + +ssize_t string_table_lookup(const char * const *table, size_t len, const char *key) { + if (!key) + return -EINVAL; + + for (size_t i = 0; i < len; ++i) + if (streq_ptr(table[i], key)) + return (ssize_t) i; + + return -EINVAL; +} diff --git a/src/basic/string-table.h b/src/basic/string-table.h new file mode 100644 index 0000000..3be70df --- /dev/null +++ b/src/basic/string-table.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include +#include +#include +#include + +#include "macro.h" +#include "parse-util.h" +#include "string-util.h" + +ssize_t string_table_lookup(const char * const *table, size_t len, const char *key); + +/* For basic lookup tables with strictly enumerated entries */ +#define _DEFINE_STRING_TABLE_LOOKUP_TO_STRING(name,type,scope) \ + scope const char *name##_to_string(type i) { \ + if (i < 0 || i >= (type) ELEMENTSOF(name##_table)) \ + return NULL; \ + return name##_table[i]; \ + } + +#define 
_DEFINE_STRING_TABLE_LOOKUP_FROM_STRING(name,type,scope) \ + scope type name##_from_string(const char *s) { \ + return (type) string_table_lookup(name##_table, ELEMENTSOF(name##_table), s); \ + } + +#define _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(name,type,yes,scope) \ + scope type name##_from_string(const char *s) { \ + if (!s) \ + return -EINVAL; \ + int b = parse_boolean(s); \ + if (b == 0) \ + return (type) 0; \ + if (b > 0) \ + return yes; \ + return (type) string_table_lookup(name##_table, ELEMENTSOF(name##_table), s); \ + } + +#define _DEFINE_STRING_TABLE_LOOKUP_TO_STRING_FALLBACK(name,type,max,scope) \ + scope int name##_to_string_alloc(type i, char **str) { \ + char *s; \ + if (i < 0 || i > max) \ + return -ERANGE; \ + if (i < (type) ELEMENTSOF(name##_table) && name##_table[i]) { \ + s = strdup(name##_table[i]); \ + if (!s) \ + return -ENOMEM; \ + } else { \ + if (asprintf(&s, "%i", i) < 0) \ + return -ENOMEM; \ + } \ + *str = s; \ + return 0; \ + } + +#define _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_FALLBACK(name,type,max,scope) \ + scope type name##_from_string(const char *s) { \ + unsigned u = 0; \ + type i; \ + if (!s) \ + return -EINVAL; \ + i = (type) string_table_lookup(name##_table, ELEMENTSOF(name##_table), s); \ + if (i >= 0) \ + return i; \ + if (safe_atou(s, &u) < 0) \ + return -EINVAL; \ + if (u > max) \ + return -EINVAL; \ + return (type) u; \ + } + +#define _DEFINE_STRING_TABLE_LOOKUP(name,type,scope) \ + _DEFINE_STRING_TABLE_LOOKUP_TO_STRING(name,type,scope) \ + _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING(name,type,scope) + +#define _DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(name,type,yes,scope) \ + _DEFINE_STRING_TABLE_LOOKUP_TO_STRING(name,type,scope) \ + _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(name,type,yes,scope) + +#define DEFINE_STRING_TABLE_LOOKUP(name,type) _DEFINE_STRING_TABLE_LOOKUP(name,type,) +#define DEFINE_STRING_TABLE_LOOKUP_TO_STRING(name,type) _DEFINE_STRING_TABLE_LOOKUP_TO_STRING(name,type,) +#define 
DEFINE_STRING_TABLE_LOOKUP_FROM_STRING(name,type) _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING(name,type,) +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP(name,type) _DEFINE_STRING_TABLE_LOOKUP(name,type,static) +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(name,type) _DEFINE_STRING_TABLE_LOOKUP_TO_STRING(name,type,static) +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(name,type) _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING(name,type,static) + +#define DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(name,type,yes) _DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(name,type,yes,) +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(name,type,yes) _DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(name,type,yes,static) +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(name,type,yes) \ + _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(name,type,yes,static) + +/* For string conversions where numbers are also acceptable */ +#define DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(name,type,max) \ + _DEFINE_STRING_TABLE_LOOKUP_TO_STRING_FALLBACK(name,type,max,) \ + _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_FALLBACK(name,type,max,) +#define DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_FALLBACK(name,type,max) _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_FALLBACK(name,type,max,) + +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING_FALLBACK(name,type,max) \ + _DEFINE_STRING_TABLE_LOOKUP_TO_STRING_FALLBACK(name,type,max,static) +#define DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING_FALLBACK(name,type,max) \ + _DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_FALLBACK(name,type,max,static) + +#define DUMP_STRING_TABLE(name,type,max) \ + do { \ + flockfile(stdout); \ + for (type _k = 0; _k < (max); _k++) { \ + const char *_t; \ + _t = name##_to_string(_k); \ + if (!_t) \ + continue; \ + fputs_unlocked(_t, stdout); \ + fputc_unlocked('\n', stdout); \ + } \ + funlockfile(stdout); \ + } while (false) diff --git a/src/basic/string-util.c b/src/basic/string-util.c new file mode 100644 index 
0000000..7329bfa --- /dev/null +++ b/src/basic/string-util.c @@ -0,0 +1,1521 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "escape.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "gunicode.h" +#include "locale-util.h" +#include "macro.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "utf8.h" + +char* first_word(const char *s, const char *word) { + size_t sl, wl; + const char *p; + + assert(s); + assert(word); + + /* Checks if the string starts with the specified word, either + * followed by NUL or by whitespace. Returns a pointer to the + * NUL or the first character after the whitespace. */ + + sl = strlen(s); + wl = strlen(word); + + if (sl < wl) + return NULL; + + if (wl == 0) + return (char*) s; + + if (memcmp(s, word, wl) != 0) + return NULL; + + p = s + wl; + if (*p == 0) + return (char*) p; + + if (!strchr(WHITESPACE, *p)) + return NULL; + + p += strspn(p, WHITESPACE); + return (char*) p; +} + +char *strnappend(const char *s, const char *suffix, size_t b) { + size_t a; + char *r; + + if (!s && !suffix) + return strdup(""); + + if (!s) + return strndup(suffix, b); + + if (!suffix) + return strdup(s); + + assert(s); + assert(suffix); + + a = strlen(s); + if (b > SIZE_MAX - a) + return NULL; + + r = new(char, a+b+1); + if (!r) + return NULL; + + memcpy(r, s, a); + memcpy(r+a, suffix, b); + r[a+b] = 0; + + return r; +} + +char *strjoin_real(const char *x, ...) 
{ + va_list ap; + size_t l = 1; + char *r, *p; + + va_start(ap, x); + for (const char *t = x; t; t = va_arg(ap, const char *)) { + size_t n; + + n = strlen(t); + if (n > SIZE_MAX - l) { + va_end(ap); + return NULL; + } + l += n; + } + va_end(ap); + + p = r = new(char, l); + if (!r) + return NULL; + + va_start(ap, x); + for (const char *t = x; t; t = va_arg(ap, const char *)) + p = stpcpy(p, t); + va_end(ap); + + *p = 0; + + return r; +} + +char *strstrip(char *s) { + if (!s) + return NULL; + + /* Drops trailing whitespace. Modifies the string in place. Returns pointer to first non-space character */ + + return delete_trailing_chars(skip_leading_chars(s, WHITESPACE), WHITESPACE); +} + +char *delete_chars(char *s, const char *bad) { + char *f, *t; + + /* Drops all specified bad characters, regardless where in the string */ + + if (!s) + return NULL; + + if (!bad) + bad = WHITESPACE; + + for (f = s, t = s; *f; f++) { + if (strchr(bad, *f)) + continue; + + *(t++) = *f; + } + + *t = 0; + + return s; +} + +char *delete_trailing_chars(char *s, const char *bad) { + char *c = s; + + /* Drops all specified bad characters, at the end of the string */ + + if (!s) + return NULL; + + if (!bad) + bad = WHITESPACE; + + for (char *p = s; *p; p++) + if (!strchr(bad, *p)) + c = p + 1; + + *c = 0; + + return s; +} + +char *truncate_nl_full(char *s, size_t *ret_len) { + size_t n; + + assert(s); + + n = strcspn(s, NEWLINE); + s[n] = '\0'; + if (ret_len) + *ret_len = n; + return s; +} + +char ascii_tolower(char x) { + + if (x >= 'A' && x <= 'Z') + return x - 'A' + 'a'; + + return x; +} + +char ascii_toupper(char x) { + + if (x >= 'a' && x <= 'z') + return x - 'a' + 'A'; + + return x; +} + +char *ascii_strlower(char *t) { + assert(t); + + for (char *p = t; *p; p++) + *p = ascii_tolower(*p); + + return t; +} + +char *ascii_strupper(char *t) { + assert(t); + + for (char *p = t; *p; p++) + *p = ascii_toupper(*p); + + return t; +} + +char *ascii_strlower_n(char *t, size_t n) { + if (n <= 0) + 
return t; + + for (size_t i = 0; i < n; i++) + t[i] = ascii_tolower(t[i]); + + return t; +} + +int ascii_strcasecmp_n(const char *a, const char *b, size_t n) { + + for (; n > 0; a++, b++, n--) { + int x, y; + + x = (int) (uint8_t) ascii_tolower(*a); + y = (int) (uint8_t) ascii_tolower(*b); + + if (x != y) + return x - y; + } + + return 0; +} + +int ascii_strcasecmp_nn(const char *a, size_t n, const char *b, size_t m) { + int r; + + r = ascii_strcasecmp_n(a, b, MIN(n, m)); + if (r != 0) + return r; + + return CMP(n, m); +} + +bool chars_intersect(const char *a, const char *b) { + /* Returns true if any of the chars in a are in b. */ + for (const char *p = a; *p; p++) + if (strchr(b, *p)) + return true; + + return false; +} + +bool string_has_cc(const char *p, const char *ok) { + assert(p); + + /* + * Check if a string contains control characters. If 'ok' is + * non-NULL it may be a string containing additional CCs to be + * considered OK. + */ + + for (const char *t = p; *t; t++) { + if (ok && strchr(ok, *t)) + continue; + + if (char_is_cc(*t)) + return true; + } + + return false; +} + +static int write_ellipsis(char *buf, bool unicode) { + if (unicode || is_locale_utf8()) { + buf[0] = 0xe2; /* tri-dot ellipsis: … */ + buf[1] = 0x80; + buf[2] = 0xa6; + } else { + buf[0] = '.'; + buf[1] = '.'; + buf[2] = '.'; + } + + return 3; +} + +static size_t ansi_sequence_length(const char *s, size_t len) { + assert(s); + + if (len < 2) + return 0; + + if (s[0] != 0x1B) /* ASCII 27, aka ESC, aka Ctrl-[ */ + return 0; /* Not the start of a sequence */ + + if (s[1] == 0x5B) { /* [, start of CSI sequence */ + size_t i = 2; + + if (i == len) + return 0; + + while (s[i] >= 0x30 && s[i] <= 0x3F) /* Parameter bytes */ + if (++i == len) + return 0; + while (s[i] >= 0x20 && s[i] <= 0x2F) /* Intermediate bytes */ + if (++i == len) + return 0; + if (s[i] >= 0x40 && s[i] <= 0x7E) /* Final byte */ + return i + 1; + return 0; /* Bad sequence */ + + } else if (s[1] >= 0x40 && s[1] <= 0x5F) /* 
other non-CSI Fe sequence */ + return 2; + + return 0; /* Bad escape? */ +} + +static bool string_has_ansi_sequence(const char *s, size_t len) { + const char *t = s; + /* Returns true if the first len bytes of s contain at least one complete ANSI sequence. Every search resumes one byte past the previously found ESC: searching from s again would keep finding the same ESC forever whenever it does not start a valid sequence. */ + for (; (t = memchr(t, 0x1B, len - (t - s))); t++) + if (ansi_sequence_length(t, len - (t - s)) > 0) + return true; + return false; +} + +static size_t previous_ansi_sequence(const char *s, size_t length, const char **ret_where) { + /* Locate the previous ANSI sequence and save its start in *ret_where and return length. If no sequence is found, *ret_where is set to NULL and 0 is returned. */ + + for (size_t i = length < 2 ? 0 : length - 2; i > 0; i--) { /* -2 because at least two bytes are needed; shorter inputs are not scanned at all, since length - 2 would wrap around and start the scan outside of the buffer */ + size_t slen = ansi_sequence_length(s + (i - 1), length - (i - 1)); + if (slen == 0) + continue; + + *ret_where = s + (i - 1); + return slen; + } + + *ret_where = NULL; + return 0; +} + +static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { + size_t x, need_space, suffix_len; + char *t; + + assert(s); + assert(percent <= 100); + assert(new_length != SIZE_MAX); + + if (old_length <= new_length) + return strndup(s, old_length); + + /* Special case short ellipsations */ + switch (new_length) { + + case 0: + return strdup(""); + + case 1: + if (is_locale_utf8()) + return strdup("…"); + else + return strdup("."); + + case 2: + if (!is_locale_utf8()) + return strdup(".."); + + break; + + default: + break; + } + + /* Calculate how much space the ellipsis will take up. If we are in UTF-8 mode we only need space for one + * character ("…"), otherwise for three characters ("..."). Note that in both cases we need 3 bytes of storage, + * either for the UTF-8 encoded character or for three ASCII characters. */ + need_space = is_locale_utf8() ?
1 : 3; + + t = new(char, new_length+3); + if (!t) + return NULL; + + assert(new_length >= need_space); + + x = ((new_length - need_space) * percent + 50) / 100; + assert(x <= new_length - need_space); + + memcpy(t, s, x); + write_ellipsis(t + x, false); + suffix_len = new_length - x - need_space; + memcpy(t + x + 3, s + old_length - suffix_len, suffix_len); + *(t + x + 3 + suffix_len) = '\0'; + + return t; +} + +char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { + size_t x, k, len, len2; + const char *i, *j; + int r; + + /* Note that 'old_length' refers to bytes in the string, while 'new_length' refers to character cells taken up + * on screen. This distinction doesn't matter for ASCII strings, but it does matter for non-ASCII UTF-8 + * strings. + * + * Ellipsation is done in a locale-dependent way: + * 1. If the string passed in is fully ASCII and the current locale is not UTF-8, three dots are used ("...") + * 2. Otherwise, a unicode ellipsis is used ("…") + * + * In other words: you'll get a unicode ellipsis as soon as either the string contains non-ASCII characters or + * the current locale is UTF-8. + */ + + assert(s); + assert(percent <= 100); + + if (new_length == SIZE_MAX) + return strndup(s, old_length); + + if (new_length == 0) + return strdup(""); + + bool has_ansi_seq = string_has_ansi_sequence(s, old_length); + + /* If no multibyte characters or ANSI sequences, use ascii_ellipsize_mem for speed */ + if (!has_ansi_seq && ascii_is_valid_n(s, old_length)) + return ascii_ellipsize_mem(s, old_length, new_length, percent); + + x = (new_length - 1) * percent / 100; + assert(x <= new_length - 1); + + k = 0; + for (i = s; i < s + old_length; ) { + size_t slen = has_ansi_seq ? 
ansi_sequence_length(i, old_length - (i - s)) : 0; + if (slen > 0) { + i += slen; + continue; /* ANSI sequences don't take up any space in output */ + } + + char32_t c; + r = utf8_encoded_to_unichar(i, &c); + if (r < 0) + return NULL; + + int w = unichar_iswide(c) ? 2 : 1; + if (k + w > x) + break; + + k += w; + i += r; + } + + const char *ansi_start = s + old_length; + size_t ansi_len = 0; + + for (const char *t = j = s + old_length; t > i && k < new_length; ) { + char32_t c; + int w; + const char *tt; + + if (has_ansi_seq && ansi_start >= t) + /* Figure out the previous ANSI sequence, if any */ + ansi_len = previous_ansi_sequence(s, t - s, &ansi_start); + + /* If the sequence extends all the way to the current position, skip it. */ + if (has_ansi_seq && ansi_len > 0 && ansi_start + ansi_len == t) { + t = ansi_start; + continue; + } + + tt = utf8_prev_char(t); + r = utf8_encoded_to_unichar(tt, &c); + if (r < 0) + return NULL; + + w = unichar_iswide(c) ? 2 : 1; + if (k + w > new_length) + break; + + k += w; + j = t = tt; /* j should always point to the first "real" character */ + } + + /* We don't actually need to ellipsize */ + if (i >= j) + return memdup_suffix0(s, old_length); + + if (k >= new_length) { + /* Make space for ellipsis, if required and possible. We know that the edge character is not + * part of an ANSI sequence (because then we'd skip it). If the last character we looked at + * was wide, we don't need to make space. */ + if (j < s + old_length) + j = utf8_next_char(j); + else if (i > s) + i = utf8_prev_char(i); + } + + len = i - s; + len2 = s + old_length - j; + + /* If we have ANSI, allow the same length as the source string + ellipsis. It'd be too involved to + * figure out what exact space is needed. Strings with ANSI sequences are most likely to be fairly + * short anyway. */ + size_t alloc_len = has_ansi_seq ? 
old_length + 3 + 1 : len + 3 + len2 + 1; + + char *e = new(char, alloc_len); + if (!e) + return NULL; + + /* + printf("old_length=%zu new_length=%zu x=%zu len=%zu len2=%zu k=%zu\n", + old_length, new_length, x, len, len2, k); + */ + + memcpy_safe(e, s, len); + write_ellipsis(e + len, true); + + char *dst = e + len + 3; + + if (has_ansi_seq) + /* Copy over any ANSI sequences in full */ + for (const char *p = s + len; p < j; ) { + size_t slen = ansi_sequence_length(p, j - p); + if (slen > 0) { + dst = mempcpy(dst, p, slen); + p += slen; + } else + p = utf8_next_char(p); + } + + memcpy_safe(dst, j, len2); + dst[len2] = '\0'; + + return e; +} + +char *cellescape(char *buf, size_t len, const char *s) { + /* Escape and ellipsize s into buffer buf of size len. Only non-control ASCII + * characters are copied as they are, everything else is escaped. The result + * is different than if escaping and ellipsization were performed in two + * separate steps, because each escaped sequence is either stored in full or skipped. + * + * This function should be used for safely logging strings which are expected to + * be plain ASCII. + * + * An ellipsis will be used if s is too long. It is always placed at the + * very end. + * + * Always returns buf, which is NUL-terminated on return. + */ + + size_t i = 0, last_char_width[4] = {}, k = 0; + + assert(len > 0); /* at least a terminating NUL */ + + for (;;) { + char four[4]; + int w; + + if (*s == 0) /* terminating NUL detected? then we are done! */ + goto done; + + w = cescape_char(*s, four); + if (i + w + 1 > len) /* This character doesn't fit into the buffer anymore? In that case let's + * ellipsize at the previous location */ + break; + + /* OK, there was space, let's add this escaped character to the buffer */ + memcpy(buf + i, four, w); + i += w; + + /* And remember its width in the ring buffer */ + last_char_width[k] = w; + k = (k + 1) % 4; + + s++; + } + + /* Ellipsation is necessary.
This means we might need to truncate the string again to make space for 4 + * characters ideally, but if the buffer is shorter than that in the first place, take what we can get */ + for (size_t j = 0; j < ELEMENTSOF(last_char_width); j++) { + + if (i + 4 <= len) /* nice, we reached our space goal */ + break; + + k = k == 0 ? 3 : k - 1; + if (last_char_width[k] == 0) /* bummer, we reached the beginning of the strings */ + break; + + assert(i >= last_char_width[k]); + i -= last_char_width[k]; + } + + if (i + 4 <= len) /* yay, enough space */ + i += write_ellipsis(buf + i, false); + else if (i + 3 <= len) { /* only space for ".." */ + buf[i++] = '.'; + buf[i++] = '.'; + } else if (i + 2 <= len) /* only space for a single "." */ + buf[i++] = '.'; + else + assert(i + 1 <= len); + + done: + buf[i] = '\0'; + return buf; +} + +char* strshorten(char *s, size_t l) { + assert(s); + + if (strnlen(s, l+1) > l) + s[l] = 0; + + return s; +} + +int strgrowpad0(char **s, size_t l) { + size_t sz; + + assert(s); + + if (*s) { + sz = strlen(*s) + 1; + if (sz >= l) /* never shrink */ + return 0; + } else + sz = 0; + + char *q = realloc(*s, l); + if (!q) + return -ENOMEM; + + *s = q; + + memzero(*s + sz, l - sz); + return 0; +} + +char *strreplace(const char *text, const char *old_string, const char *new_string) { + size_t l, old_len, new_len; + char *t, *ret = NULL; + const char *f; + + /* Returns a newly allocated copy of 'text' in which every occurrence of 'old_string' is replaced + * by 'new_string'. Returns NULL if 'text' is NULL or if memory allocation fails. */ + + assert(old_string); + assert(new_string); + + if (!text) + return NULL; + + old_len = strlen(old_string); + new_len = strlen(new_string); + + /* An empty 'old_string' matches at every position without consuming any input, so the loop below + * would never advance. There is nothing to replace in that case, hence return a plain copy. */ + if (old_len == 0) + return strdup(text); + + l = strlen(text); + if (!GREEDY_REALLOC(ret, l+1)) + return NULL; + + f = text; + t = ret; + while (*f) { + size_t d, nl; + + if (!startswith(f, old_string)) { + *(t++) = *(f++); + continue; + } + + /* 'ret' may change when it is enlarged, hence remember the write position as an offset */ + d = t - ret; + nl = l - old_len + new_len; + + if (!GREEDY_REALLOC(ret, nl + 1)) + return mfree(ret); + + l = nl; + t = ret + d; + + t = stpcpy(t, new_string); + f += old_len; + } + + *t = 0; + return ret; +} + +static void advance_offsets( + ssize_t diff, + size_t
offsets[2], /* note: we can't use [static 2] here, since this may be NULL */ + size_t shift[static 2], + size_t size) { + + if (!offsets) + return; + + assert(shift); + + if ((size_t) diff < offsets[0]) + shift[0] += size; + if ((size_t) diff < offsets[1]) + shift[1] += size; +} + +char *strip_tab_ansi(char **ibuf, size_t *_isz, size_t highlight[2]) { + const char *begin = NULL; + enum { + STATE_OTHER, + STATE_ESCAPE, + STATE_CSI, + STATE_CSO, + } state = STATE_OTHER; + _cleanup_(memstream_done) MemStream m = {}; + size_t isz, shift[2] = {}, n_carriage_returns = 0; + FILE *f; + + assert(ibuf); + assert(*ibuf); + + /* This does four things: + * + * 1. Replaces TABs by 8 spaces + * 2. Strips ANSI color sequences (a subset of CSI), i.e. ESC '[' … 'm' sequences + * 3. Strips ANSI operating system sequences (CSO), i.e. ESC ']' … BEL sequences + * 4. Strips trailing \r characters (since they would "move the cursor", but have no + * other effect). + * + * Everything else will be left as it is. In particular other ANSI sequences are left as they are, as + * are any other special characters. Truncated ANSI sequences are left as-is too. This call is + * supposed to suppress the most basic formatting noise, but nothing else. + * + * Why care for CSO sequences? Well, to undo what terminal_urlify() and friends generate. + * + * On success *ibuf is replaced by the processed buffer (and *_isz, if given, by its size), the two + * 'highlight' offsets, if given, are shifted to account for the bytes added, and the new *ibuf is + * returned. Returns NULL on failure. */ + + isz = _isz ? *_isz : strlen(*ibuf); + + /* Note we turn off internal locking on f for performance reasons. It's safe to do so since we + * created f here and it doesn't leave our scope.
*/ + f = memstream_init(&m); + if (!f) + return NULL; + + for (const char *i = *ibuf; i < *ibuf + isz + 1; i++) { + + switch (state) { + + case STATE_OTHER: + if (i >= *ibuf + isz) /* EOT */ + break; + + if (*i == '\r') { + n_carriage_returns++; + break; + } else if (*i == '\n') + /* Ignore carriage returns before new line */ + n_carriage_returns = 0; + for (; n_carriage_returns > 0; n_carriage_returns--) + fputc('\r', f); + + if (*i == '\x1B') + state = STATE_ESCAPE; + else if (*i == '\t') { + /* One input byte becomes eight output bytes, hence offsets behind it move by 7 */ + fputs("        ", f); + advance_offsets(i - *ibuf, highlight, shift, 7); + } else + fputc(*i, f); + + break; + + case STATE_ESCAPE: + assert(n_carriage_returns == 0); + + if (i >= *ibuf + isz) { /* EOT */ + fputc('\x1B', f); + advance_offsets(i - *ibuf, highlight, shift, 1); + break; + } else if (*i == '[') { /* ANSI CSI */ + state = STATE_CSI; + begin = i + 1; + } else if (*i == ']') { /* ANSI CSO */ + state = STATE_CSO; + begin = i + 1; + } else { + fputc('\x1B', f); + fputc(*i, f); + advance_offsets(i - *ibuf, highlight, shift, 1); + state = STATE_OTHER; + } + + break; + + case STATE_CSI: + assert(n_carriage_returns == 0); + + /* Note that strchr() also matches the terminating NUL of its first argument, hence NUL + * bytes have to be refused explicitly, like the CSO branch below does. */ + if (i >= *ibuf + isz || /* EOT … */ + *i == '\0' || !strchr("0123456789;m", *i)) { /* … or invalid chars in sequence */ + fputc('\x1B', f); + fputc('[', f); + advance_offsets(i - *ibuf, highlight, shift, 2); + state = STATE_OTHER; + i = begin-1; + } else if (*i == 'm') + state = STATE_OTHER; + + break; + + case STATE_CSO: + assert(n_carriage_returns == 0); + + if (i >= *ibuf + isz || /* EOT … */ + (*i != '\a' && (uint8_t) *i < 32U) || (uint8_t) *i > 126U) { /* … or invalid chars in sequence */ + fputc('\x1B', f); + fputc(']', f); + advance_offsets(i - *ibuf, highlight, shift, 2); + state = STATE_OTHER; + i = begin-1; + } else if (*i == '\a') + state = STATE_OTHER; + + break; + } + } + + char *obuf; + if (memstream_finalize(&m, &obuf, _isz) < 0) + return NULL; + + free_and_replace(*ibuf, obuf); + + if (highlight) { + highlight[0] += shift[0]; + highlight[1] += shift[1];
+ } + + return *ibuf; +} + +char *strextend_with_separator_internal(char **x, const char *separator, ...) { + size_t f, l, l_separator; + bool need_separator; + char *nr, *p; + va_list ap; + + assert(x); + + l = f = strlen_ptr(*x); + + need_separator = !isempty(*x); + l_separator = strlen_ptr(separator); + + va_start(ap, separator); + for (;;) { + const char *t; + size_t n; + + t = va_arg(ap, const char *); + if (!t) + break; + + n = strlen(t); + + if (need_separator) + n += l_separator; + + if (n >= SIZE_MAX - l) { + va_end(ap); + return NULL; + } + + l += n; + need_separator = true; + } + va_end(ap); + + need_separator = !isempty(*x); + + nr = realloc(*x, GREEDY_ALLOC_ROUND_UP(l+1)); + if (!nr) + return NULL; + + *x = nr; + p = nr + f; + + va_start(ap, separator); + for (;;) { + const char *t; + + t = va_arg(ap, const char *); + if (!t) + break; + + if (need_separator && separator) + p = stpcpy(p, separator); + + p = stpcpy(p, t); + + need_separator = true; + } + va_end(ap); + + assert(p == nr + l); + + *p = 0; + + return p; +} + +int strextendf_with_separator(char **x, const char *separator, const char *format, ...) { + size_t m, a, l_separator; + va_list ap; + int l; + + /* Appends a formatted string to the specified string. Don't use this in inner loops, since then + * we'll spend a tonload of time in determining the length of the string passed in, over and over + * again. */ + + assert(x); + assert(format); + + l_separator = isempty(*x) ? 0 : strlen_ptr(separator); + + /* Let's try to use the allocated buffer, if there's room at the end still. Otherwise let's extend by 64 chars. 
*/ + if (*x) { + m = strlen(*x); + a = MALLOC_SIZEOF_SAFE(*x); + assert(a >= m + 1); + } else + m = a = 0; + + if (a - m < 17 + l_separator) { /* if there's less than 16 chars space, then enlarge the buffer first */ + char *n; + + if (_unlikely_(l_separator > SIZE_MAX - 64)) /* overflow check #1 */ + return -ENOMEM; + if (_unlikely_(m > SIZE_MAX - 64 - l_separator)) /* overflow check #2 */ + return -ENOMEM; + + n = realloc(*x, m + 64 + l_separator); + if (!n) + return -ENOMEM; + + *x = n; + a = MALLOC_SIZEOF_SAFE(*x); + } + + /* Now, let's try to format the string into it */ + memcpy_safe(*x + m, separator, l_separator); + va_start(ap, format); + l = vsnprintf(*x + m + l_separator, a - m - l_separator, format, ap); + va_end(ap); + + assert(l >= 0); + + if ((size_t) l < a - m - l_separator) { + char *n; + + /* Nice! This worked. We are done. But first, let's return the extra space we don't + * need. This should be a cheap operation, since we only lower the allocation size here, + * never increase. */ + n = realloc(*x, m + (size_t) l + l_separator + 1); + if (n) + *x = n; + } else { + char *n; + + /* Wasn't enough. Then let's allocate exactly what we need. 
*/ + + if (_unlikely_((size_t) l > SIZE_MAX - (l_separator + 1))) /* overflow check #1 */ + goto oom; + if (_unlikely_(m > SIZE_MAX - ((size_t) l + l_separator + 1))) /* overflow check #2 */ + goto oom; + + a = m + (size_t) l + l_separator + 1; + n = realloc(*x, a); + if (!n) + goto oom; + *x = n; + + va_start(ap, format); + l = vsnprintf(*x + m + l_separator, a - m - l_separator, format, ap); + va_end(ap); + + assert((size_t) l < a - m - l_separator); + } + + return 0; + +oom: + /* truncate the bytes added after the first vsnprintf() attempt again */ + (*x)[m] = 0; + return -ENOMEM; +} + +char *strextendn(char **x, const char *s, size_t l) { + assert(x); + assert(s || l == 0); + + if (l == SIZE_MAX) + l = strlen_ptr(s); + else if (l > 0) + l = strnlen(s, l); /* ignore trailing noise */ + + if (l > 0 || !*x) { + size_t q; + char *m; + + q = strlen_ptr(*x); + m = realloc(*x, q + l + 1); + if (!m) + return NULL; + + memcpy_safe(m + q, s, l); + m[q + l] = 0; + + *x = m; + } + + return *x; +} + +char *strrep(const char *s, unsigned n) { + char *r, *p; + size_t l; + + /* Returns a newly allocated string consisting of 's' repeated 'n' times (the empty string if n is + * 0), or NULL if the result would not fit into size_t or if memory allocation fails. */ + + assert(s); + + l = strlen(s); + + /* Refuse sizes where l * n + 1 would wrap around, since malloc() would then return a buffer that + * is too small for the copy loop below. */ + if (n > 0 && l > (SIZE_MAX - 1) / n) + return NULL; + + p = r = malloc(l * n + 1); + if (!r) + return NULL; + + for (unsigned i = 0; i < n; i++) + p = stpcpy(p, s); + + *p = 0; + return r; +} + +int split_pair(const char *s, const char *sep, char **l, char **r) { + char *x, *a, *b; + + assert(s); + assert(sep); + assert(l); + assert(r); + + if (isempty(sep)) + return -EINVAL; + + x = strstr(s, sep); + if (!x) + return -EINVAL; + + a = strndup(s, x - s); + if (!a) + return -ENOMEM; + + b = strdup(x + strlen(sep)); + if (!b) { + free(a); + return -ENOMEM; + } + + *l = a; + *r = b; + + return 0; +} + +int free_and_strdup(char **p, const char *s) { + char *t; + + assert(p); + + /* Replaces a string pointer with a strdup()ed new string, + * possibly freeing the old one.
*/ + + if (streq_ptr(*p, s)) + return 0; + + if (s) { + t = strdup(s); + if (!t) + return -ENOMEM; + } else + t = NULL; + + free_and_replace(*p, t); + + return 1; +} + +int free_and_strndup(char **p, const char *s, size_t l) { + char *t; + + assert(p); + assert(s || l == 0); + + /* Replaces a string pointer with a strndup()ed new string, + * freeing the old one. */ + + if (!*p && !s) + return 0; + + if (*p && s && strneq(*p, s, l) && (l > strlen(*p) || (*p)[l] == '\0')) + return 0; + + if (s) { + t = strndup(s, l); + if (!t) + return -ENOMEM; + } else + t = NULL; + + free_and_replace(*p, t); + return 1; +} + +bool string_is_safe(const char *p) { + if (!p) + return false; + + /* Checks if the specified string contains no quotes or control characters */ + + for (const char *t = p; *t; t++) { + if (*t > 0 && *t < ' ') /* no control characters */ + return false; + + if (strchr(QUOTES "\\\x7f", *t)) + return false; + } + + return true; +} + +char* string_erase(char *x) { + if (!x) + return NULL; + + /* A delicious drop of snake-oil! To be called on memory where we stored passphrases or so, after we + * used them. */ + explicit_bzero_safe(x, strlen(x)); + return x; +} + +int string_truncate_lines(const char *s, size_t n_lines, char **ret) { + const char *p = s, *e = s; + bool truncation_applied = false; + char *copy; + size_t n = 0; + + assert(s); + + /* Truncate after the specified number of lines. Returns > 0 if a truncation was applied or == 0 if + * there were fewer lines in the string anyway. Trailing newlines on input are ignored, and not + * generated either. 
*/ + + for (;;) { + size_t k; + + k = strcspn(p, "\n"); + + if (p[k] == 0) { + if (k == 0) /* final empty line */ + break; + + if (n >= n_lines) /* above threshold */ + break; + + e = p + k; /* last line to include */ + break; + } + + assert(p[k] == '\n'); + + if (n >= n_lines) + break; + + if (k > 0) + e = p + k; + + p += k + 1; + n++; + } + + /* e points after the last character we want to keep */ + if (isempty(e)) + copy = strdup(s); + else { + if (!in_charset(e, "\n")) /* We only consider things truncated if we remove something that + * isn't a new-line or a series of them */ + truncation_applied = true; + + copy = strndup(s, e - s); + } + if (!copy) + return -ENOMEM; + + *ret = copy; + return truncation_applied; +} + +int string_extract_line(const char *s, size_t i, char **ret) { + const char *p = s; + size_t c = 0; + + /* Extract the i'nth line from the specified string. Returns > 0 if there are more lines after that, + * and == 0 if we are looking at the last line or already beyond the last line. As special + * optimization, if the first line is requested and the string only consists of one line we return + * NULL, indicating the input string should be used as is, and avoid a memory allocation for a very + * common case. */ + + for (;;) { + const char *q; + + q = strchr(p, '\n'); + if (i == c) { + /* The line we are looking for! */ + + if (q) { + char *m; + + m = strndup(p, q - p); + if (!m) + return -ENOMEM; + + *ret = m; + return !isempty(q + 1); /* more coming? 
*/ + } else { + if (p == s) + *ret = NULL; /* Just use the input string */ + else { + char *m; + + m = strdup(p); + if (!m) + return -ENOMEM; + + *ret = m; + } + + return 0; /* The end */ + } + } + + if (!q) { + char *m; + + /* No more lines, return empty line */ + + m = strdup(""); + if (!m) + return -ENOMEM; + + *ret = m; + return 0; /* The end */ + } + + p = q + 1; + c++; + } +} + +int string_contains_word_strv(const char *string, const char *separators, char **words, const char **ret_word) { + /* In the default mode with no separators specified, we split on whitespace and + * don't coalesce separators. */ + const ExtractFlags flags = separators ? EXTRACT_DONT_COALESCE_SEPARATORS : 0; + + const char *found = NULL; + + for (const char *p = string;;) { + _cleanup_free_ char *w = NULL; + int r; + + r = extract_first_word(&p, &w, separators, flags); + if (r < 0) + return r; + if (r == 0) + break; + + found = strv_find(words, w); + if (found) + break; + } + + if (ret_word) + *ret_word = found; + return !!found; +} + +bool streq_skip_trailing_chars(const char *s1, const char *s2, const char *ok) { + if (!s1 && !s2) + return true; + if (!s1 || !s2) + return false; + + if (!ok) + ok = WHITESPACE; + + for (; *s1 && *s2; s1++, s2++) + if (*s1 != *s2) + break; + + return in_charset(s1, ok) && in_charset(s2, ok); +} + +char *string_replace_char(char *str, char old_char, char new_char) { + assert(str); + assert(old_char != '\0'); + assert(new_char != '\0'); + assert(old_char != new_char); + + for (char *p = strchr(str, old_char); p; p = strchr(p + 1, old_char)) + *p = new_char; + + return str; +} + +int make_cstring(const char *s, size_t n, MakeCStringMode mode, char **ret) { + char *b; + + assert(s || n == 0); + assert(mode >= 0); + assert(mode < _MAKE_CSTRING_MODE_MAX); + + /* Converts a sized character buffer into a NUL-terminated NUL string, refusing if there are embedded + * NUL bytes. 
Whether to expect a trailing NUL byte can be specified via 'mode' */ + + if (n == 0) { + if (mode == MAKE_CSTRING_REQUIRE_TRAILING_NUL) + return -EINVAL; + + if (!ret) + return 0; + + b = new0(char, 1); + } else { + const char *nul; + + nul = memchr(s, 0, n); + if (nul) { + if (nul < s + n - 1 || /* embedded NUL? */ + mode == MAKE_CSTRING_REFUSE_TRAILING_NUL) + return -EINVAL; + + n--; + } else if (mode == MAKE_CSTRING_REQUIRE_TRAILING_NUL) + return -EINVAL; + + if (!ret) + return 0; + + b = memdup_suffix0(s, n); + } + if (!b) + return -ENOMEM; + + *ret = b; + return 0; +} + +size_t strspn_from_end(const char *str, const char *accept) { + size_t n = 0; + + if (isempty(str)) + return 0; + + if (isempty(accept)) + return 0; + + for (const char *p = str + strlen(str); p > str && strchr(accept, p[-1]); p--) + n++; + + return n; +} + +char *strdupspn(const char *a, const char *accept) { + if (isempty(a) || isempty(accept)) + return strdup(""); + + return strndup(a, strspn(a, accept)); +} + +char *strdupcspn(const char *a, const char *reject) { + if (isempty(a)) + return strdup(""); + if (isempty(reject)) + return strdup(a); + + return strndup(a, strcspn(a, reject)); +} + +char *find_line_startswith(const char *haystack, const char *needle) { + char *p; + + assert(haystack); + assert(needle); + + /* Finds the first line in 'haystack' that starts with the specified string. 
Returns a pointer to the + * first character after it */ + + p = strstr(haystack, needle); + if (!p) + return NULL; + + if (p > haystack) + while (p[-1] != '\n') { + p = strstr(p + 1, needle); + if (!p) + return NULL; + } + + return p + strlen(needle); +} + +char *startswith_strv(const char *string, char **strv) { + char *found = NULL; + + STRV_FOREACH(i, strv) { + found = startswith(string, *i); + if (found) + break; + } + + return found; +} + +bool version_is_valid(const char *s) { + if (isempty(s)) + return false; + + if (!filename_part_is_valid(s)) + return false; + + /* This is a superset of the characters used by semver. We additionally allow "," and "_". */ + if (!in_charset(s, ALPHANUMERICAL ".,_-+")) + return false; + + return true; +} + +bool version_is_valid_versionspec(const char *s) { + if (!filename_part_is_valid(s)) + return false; + + if (!in_charset(s, ALPHANUMERICAL "-.~^")) + return false; + + return true; +} + +ssize_t strlevenshtein(const char *x, const char *y) { + _cleanup_free_ size_t *t0 = NULL, *t1 = NULL, *t2 = NULL; + size_t xl, yl; + + /* This is inspired from the Linux kernel's Levenshtein implementation */ + + if (streq_ptr(x, y)) + return 0; + + xl = strlen_ptr(x); + if (xl > SSIZE_MAX) + return -E2BIG; + + yl = strlen_ptr(y); + if (yl > SSIZE_MAX) + return -E2BIG; + + if (isempty(x)) + return yl; + if (isempty(y)) + return xl; + + t0 = new0(size_t, yl + 1); + if (!t0) + return -ENOMEM; + t1 = new0(size_t, yl + 1); + if (!t1) + return -ENOMEM; + t2 = new0(size_t, yl + 1); + if (!t2) + return -ENOMEM; + + for (size_t i = 0; i <= yl; i++) + t1[i] = i; + + for (size_t i = 0; i < xl; i++) { + t2[0] = i + 1; + + for (size_t j = 0; j < yl; j++) { + /* Substitution */ + t2[j+1] = t1[j] + (x[i] != y[j]); + + /* Swap */ + if (i > 0 && j > 0 && x[i-1] == y[j] && x[i] == y[j-1] && t2[j+1] > t0[j-1] + 1) + t2[j+1] = t0[j-1] + 1; + + /* Deletion */ + if (t2[j+1] > t1[j+1] + 1) + t2[j+1] = t1[j+1] + 1; + + /* Insertion */ + if (t2[j+1] > t2[j] + 
1) + t2[j+1] = t2[j] + 1; + } + + size_t *dummy = t0; + t0 = t1; + t1 = t2; + t2 = dummy; + } + + return t1[yl]; +} diff --git a/src/basic/string-util.h b/src/basic/string-util.h new file mode 100644 index 0000000..b6d8be3 --- /dev/null +++ b/src/basic/string-util.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "alloc-util.h" +#include "macro.h" +#include "string-util-fundamental.h" + +/* What is interpreted as whitespace? */ +#define WHITESPACE " \t\n\r" +#define NEWLINE "\n\r" +#define QUOTES "\"\'" +#define COMMENTS "#;" +#define GLOB_CHARS "*?[" +#define DIGITS "0123456789" +#define LOWERCASE_LETTERS "abcdefghijklmnopqrstuvwxyz" +#define UPPERCASE_LETTERS "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +#define LETTERS LOWERCASE_LETTERS UPPERCASE_LETTERS +#define ALPHANUMERICAL LETTERS DIGITS +#define HEXDIGITS DIGITS "abcdefABCDEF" +#define LOWERCASE_HEXDIGITS DIGITS "abcdef" +#define URI_RESERVED ":/?#[]@!$&'()*+;=" /* [RFC3986] */ +#define URI_UNRESERVED ALPHANUMERICAL "-._~" /* [RFC3986] */ +#define URI_VALID URI_RESERVED URI_UNRESERVED /* [RFC3986] */ + +static inline char* strstr_ptr(const char *haystack, const char *needle) { + if (!haystack || !needle) + return NULL; + return strstr(haystack, needle); +} + +static inline char *strstrafter(const char *haystack, const char *needle) { + char *p; + + /* Returns NULL if not found, or pointer to first character after needle if found */ + + p = strstr_ptr(haystack, needle); + if (!p) + return NULL; + + return p + strlen(needle); +} + +static inline const char* strnull(const char *s) { + return s ?: "(null)"; +} + +static inline const char *strna(const char *s) { + return s ?: "n/a"; +} + +static inline const char* true_false(bool b) { + return b ? "true" : "false"; +} + +static inline const char* plus_minus(bool b) { + return b ? "+" : "-"; +} + +static inline const char* one_zero(bool b) { + return b ? 
"1" : "0"; +} + +static inline const char* enable_disable(bool b) { + return b ? "enable" : "disable"; +} + +static inline const char* enabled_disabled(bool b) { + return b ? "enabled" : "disabled"; +} + +/* This macro's return pointer will have the "const" qualifier set or unset the same way as the input + * pointer. */ +#define empty_to_null(p) \ + ({ \ + const char *_p = (p); \ + (typeof(p)) (isempty(_p) ? NULL : _p); \ + }) + +static inline const char *empty_to_na(const char *p) { + return isempty(p) ? "n/a" : p; +} + +static inline const char *empty_to_dash(const char *str) { + return isempty(str) ? "-" : str; +} + +static inline bool empty_or_dash(const char *str) { + return !str || + str[0] == 0 || + (str[0] == '-' && str[1] == 0); +} + +static inline const char *empty_or_dash_to_null(const char *p) { + return empty_or_dash(p) ? NULL : p; +} +#define empty_or_dash_to_null(p) \ + ({ \ + const char *_p = (p); \ + (typeof(p)) (empty_or_dash(_p) ? NULL : _p); \ + }) + +char *first_word(const char *s, const char *word) _pure_; + +char *strnappend(const char *s, const char *suffix, size_t length); + +char *strjoin_real(const char *x, ...) _sentinel_; +#define strjoin(a, ...) strjoin_real((a), __VA_ARGS__, NULL) + +#define strjoina(a, ...) 
\ + ({ \ + const char *_appendees_[] = { a, __VA_ARGS__ }; \ + char *_d_, *_p_; \ + size_t _len_ = 0; \ + size_t _i_; \ + for (_i_ = 0; _i_ < ELEMENTSOF(_appendees_) && _appendees_[_i_]; _i_++) \ + _len_ += strlen(_appendees_[_i_]); \ + _p_ = _d_ = newa(char, _len_ + 1); \ + for (_i_ = 0; _i_ < ELEMENTSOF(_appendees_) && _appendees_[_i_]; _i_++) \ + _p_ = stpcpy(_p_, _appendees_[_i_]); \ + *_p_ = 0; \ + _d_; \ + }) + +char *strstrip(char *s); +char *delete_chars(char *s, const char *bad); +char *delete_trailing_chars(char *s, const char *bad); +char *truncate_nl_full(char *s, size_t *ret_len); +static inline char *truncate_nl(char *s) { + return truncate_nl_full(s, NULL); +} + +static inline char *skip_leading_chars(const char *s, const char *bad) { + if (!s) + return NULL; + + if (!bad) + bad = WHITESPACE; + + return (char*) s + strspn(s, bad); +} + +char ascii_tolower(char x); +char *ascii_strlower(char *s); +char *ascii_strlower_n(char *s, size_t n); + +char ascii_toupper(char x); +char *ascii_strupper(char *s); + +int ascii_strcasecmp_n(const char *a, const char *b, size_t n); +int ascii_strcasecmp_nn(const char *a, size_t n, const char *b, size_t m); + +bool chars_intersect(const char *a, const char *b) _pure_; + +static inline bool _pure_ in_charset(const char *s, const char* charset) { + assert(s); + assert(charset); + return s[strspn(s, charset)] == '\0'; +} + +static inline bool char_is_cc(char p) { + /* char is unsigned on some architectures, e.g. aarch64. So, compiler may warn the condition + * p >= 0 is always true. See #19543. Hence, let's cast to unsigned before the comparison. Note + * that the cast in the right hand side is redundant, as according to the C standard, compilers + * automatically cast a signed value to unsigned when comparing with an unsigned variable. Just + * for safety and readability. 
*/ + return (uint8_t) p < (uint8_t) ' ' || p == 127; +} +bool string_has_cc(const char *p, const char *ok) _pure_; + +char *ellipsize_mem(const char *s, size_t old_length_bytes, size_t new_length_columns, unsigned percent); +static inline char *ellipsize(const char *s, size_t length, unsigned percent) { + return ellipsize_mem(s, strlen(s), length, percent); +} + +char *cellescape(char *buf, size_t len, const char *s); + +/* This limit is arbitrary, enough to give some idea what the string contains */ +#define CELLESCAPE_DEFAULT_LENGTH 64 + +char* strshorten(char *s, size_t l); + +int strgrowpad0(char **s, size_t l); + +char *strreplace(const char *text, const char *old_string, const char *new_string); + +char *strip_tab_ansi(char **ibuf, size_t *_isz, size_t highlight[2]); + +char *strextend_with_separator_internal(char **x, const char *separator, ...) _sentinel_; +#define strextend_with_separator(x, separator, ...) strextend_with_separator_internal(x, separator, __VA_ARGS__, NULL) +#define strextend(x, ...) strextend_with_separator_internal(x, NULL, __VA_ARGS__, NULL) + +char *strextendn(char **x, const char *s, size_t l); + +int strextendf_with_separator(char **x, const char *separator, const char *format, ...) _printf_(3,4); +#define strextendf(x, ...) 
strextendf_with_separator(x, NULL, __VA_ARGS__) + +char *strrep(const char *s, unsigned n); + +#define strrepa(s, n) \ + ({ \ + char *_d_, *_p_; \ + size_t _len_ = strlen(s) * n; \ + _p_ = _d_ = newa(char, _len_ + 1); \ + for (unsigned _i_ = 0; _i_ < n; _i_++) \ + _p_ = stpcpy(_p_, s); \ + *_p_ = 0; \ + _d_; \ + }) + +int split_pair(const char *s, const char *sep, char **l, char **r); + +int free_and_strdup(char **p, const char *s); +static inline int free_and_strdup_warn(char **p, const char *s) { + int r; + + r = free_and_strdup(p, s); + if (r < 0) + return log_oom(); + return r; +} +int free_and_strndup(char **p, const char *s, size_t l); + +bool string_is_safe(const char *p) _pure_; + +DISABLE_WARNING_STRINGOP_TRUNCATION; +static inline void strncpy_exact(char *buf, const char *src, size_t buf_len) { + strncpy(buf, src, buf_len); +} +REENABLE_WARNING; + +/* Like startswith_no_case(), but operates on arbitrary memory blocks. + * It works only for ASCII strings. + */ +static inline void *memory_startswith_no_case(const void *p, size_t sz, const char *token) { + assert(token); + + size_t n = strlen(token); + if (sz < n) + return NULL; + + assert(p); + + for (size_t i = 0; i < n; i++) + if (ascii_tolower(((char *)p)[i]) != ascii_tolower(token[i])) + return NULL; + + return (uint8_t*) p + n; +} + +static inline char* str_realloc(char *p) { + /* Reallocate *p to actual size. Ignore failure, and return the original string on error. 
*/ + + if (!p) + return NULL; + + return realloc(p, strlen(p) + 1) ?: p; +} + +char* string_erase(char *x); + +int string_truncate_lines(const char *s, size_t n_lines, char **ret); +int string_extract_line(const char *s, size_t i, char **ret); + +int string_contains_word_strv(const char *string, const char *separators, char **words, const char **ret_word); +static inline int string_contains_word(const char *string, const char *separators, const char *word) { + return string_contains_word_strv(string, separators, STRV_MAKE(word), NULL); +} + +bool streq_skip_trailing_chars(const char *s1, const char *s2, const char *ok); + +char *string_replace_char(char *str, char old_char, char new_char); + +typedef enum MakeCStringMode { + MAKE_CSTRING_REFUSE_TRAILING_NUL, + MAKE_CSTRING_ALLOW_TRAILING_NUL, + MAKE_CSTRING_REQUIRE_TRAILING_NUL, + _MAKE_CSTRING_MODE_MAX, + _MAKE_CSTRING_MODE_INVALID = -1, +} MakeCStringMode; + +int make_cstring(const char *s, size_t n, MakeCStringMode mode, char **ret); + +size_t strspn_from_end(const char *str, const char *accept); + +char *strdupspn(const char *a, const char *accept); +char *strdupcspn(const char *a, const char *reject); + +char *find_line_startswith(const char *haystack, const char *needle); + +char *startswith_strv(const char *string, char **strv); + +#define STARTSWITH_SET(p, ...) \ + startswith_strv(p, STRV_MAKE(__VA_ARGS__)) + +bool version_is_valid(const char *s); + +bool version_is_valid_versionspec(const char *s); + +ssize_t strlevenshtein(const char *x, const char *y); + +static inline int strdup_or_null(const char *s, char **ret) { + char *c; + + assert(ret); + + /* This is a lot like strdup(), but is happy with NULL strings, and does not treat that as error, but + * copies the NULL value. 
*/ + + if (!s) { + *ret = NULL; + return 0; + } + + c = strdup(s); + if (!c) + return -ENOMEM; + + *ret = c; + return 1; +} diff --git a/src/basic/strv.c b/src/basic/strv.c new file mode 100644 index 0000000..1065e1b --- /dev/null +++ b/src/basic/strv.c @@ -0,0 +1,923 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "env-util.h" +#include "escape.h" +#include "extract-word.h" +#include "fileio.h" +#include "memory-util.h" +#include "nulstr-util.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" + +char* strv_find(char * const *l, const char *name) { + assert(name); + + STRV_FOREACH(i, l) + if (streq(*i, name)) + return *i; + + return NULL; +} + +char* strv_find_case(char * const *l, const char *name) { + assert(name); + + STRV_FOREACH(i, l) + if (strcaseeq(*i, name)) + return *i; + + return NULL; +} + +char* strv_find_prefix(char * const *l, const char *name) { + assert(name); + + STRV_FOREACH(i, l) + if (startswith(*i, name)) + return *i; + + return NULL; +} + +char* strv_find_startswith(char * const *l, const char *name) { + assert(name); + + /* Like strv_find_prefix, but actually returns only the + * suffix, not the whole item */ + + STRV_FOREACH(i, l) { + char *e; + + e = startswith(*i, name); + if (e) + return e; + } + + return NULL; +} + +char* strv_find_first_field(char * const *needles, char * const *haystack) { + STRV_FOREACH(k, needles) { + char *value = strv_env_pairs_get((char **)haystack, *k); + if (value) + return value; + } + + return NULL; +} + +char** strv_free(char **l) { + STRV_FOREACH(k, l) + free(*k); + + return mfree(l); +} + +char** strv_free_erase(char **l) { + STRV_FOREACH(i, l) + erase_and_freep(i); + + return mfree(l); +} + +void strv_free_many(char ***strvs, size_t n) { + assert(strvs || n == 0); + + FOREACH_ARRAY (i, strvs, n) + strv_free(*i); + + free(strvs); +} + +char** strv_copy_n(char * const *l, size_t m) { + 
_cleanup_strv_free_ char **result = NULL; + char **k; + + result = new(char*, MIN(strv_length(l), m) + 1); + if (!result) + return NULL; + + k = result; + STRV_FOREACH(i, l) { + if (m == 0) + break; + + *k = strdup(*i); + if (!*k) + return NULL; + k++; + + if (m != SIZE_MAX) + m--; + } + + *k = NULL; + return TAKE_PTR(result); +} + +int strv_copy_unless_empty(char * const *l, char ***ret) { + assert(ret); + + if (strv_isempty(l)) { + *ret = NULL; + return 0; + } + + char **copy = strv_copy(l); + if (!copy) + return -ENOMEM; + + *ret = TAKE_PTR(copy); + return 1; +} + +size_t strv_length(char * const *l) { + size_t n = 0; + + STRV_FOREACH(i, l) + n++; + + return n; +} + +char** strv_new_ap(const char *x, va_list ap) { + _cleanup_strv_free_ char **a = NULL; + size_t n = 0, i = 0; + va_list aq; + + /* As a special trick we ignore all listed strings that equal + * STRV_IGNORE. This is supposed to be used with the + * STRV_IFNOTNULL() macro to include possibly NULL strings in + * the string list. */ + + va_copy(aq, ap); + for (const char *s = x; s; s = va_arg(aq, const char*)) { + if (s == STRV_IGNORE) + continue; + + n++; + } + va_end(aq); + + a = new(char*, n+1); + if (!a) + return NULL; + + for (const char *s = x; s; s = va_arg(ap, const char*)) { + if (s == STRV_IGNORE) + continue; + + a[i] = strdup(s); + if (!a[i]) + return NULL; + + i++; + } + + a[i] = NULL; + + return TAKE_PTR(a); +} + +char** strv_new_internal(const char *x, ...) 
{ /* Variadic front end: passes x and the remaining arguments to strv_new_ap() and returns its result. */ + char **r; + va_list ap; + + va_start(ap, x); + r = strv_new_ap(x, ap); + va_end(ap); + + return r; +} + +int strv_extend_strv(char ***a, char * const *b, bool filter_duplicates) { /* Appends strdup()ed copies of the entries of b to the list *a; with filter_duplicates, entries already contained in the list are skipped. Returns the number of entries appended, or -ENOMEM. */ + size_t p, q, i = 0; + char **t; + + assert(a); + + if (strv_isempty(b)) + return 0; + + p = strv_length(*a); + q = strv_length(b); + + if (p >= SIZE_MAX - q) + return -ENOMEM; + + t = reallocarray(*a, GREEDY_ALLOC_ROUND_UP(p + q + 1), sizeof(char *)); + if (!t) + return -ENOMEM; + + t[p] = NULL; + *a = t; + + STRV_FOREACH(s, b) { + if (filter_duplicates && strv_contains(t, *s)) + continue; + + t[p+i] = strdup(*s); + if (!t[p+i]) + goto rollback; + + i++; + t[p+i] = NULL; + } + + assert(i <= q); + + return (int) i; + +rollback: /* undo: hand the i strings appended by this call to free_many_charp() and re-terminate the list at its original length p; the caller's pointer already refers to the reallocated array */ + free_many_charp(t + p, i); + t[p] = NULL; + return -ENOMEM; +} + +int strv_extend_strv_concat(char ***a, char * const *b, const char *suffix) { /* For every entry of b, pushes strjoin(entry, suffix) onto the list *a. Returns 0 on success or a negative error; entries pushed before a failure are not removed again. */ + int r; + + STRV_FOREACH(s, b) { + char *v; + + v = strjoin(*s, suffix); + if (!v) + return -ENOMEM; + + r = strv_push(a, v); + if (r < 0) { + free(v); + return r; + } + } + + return 0; +} + +int strv_split_newlines_full(char ***ret, const char *s, ExtractFlags flags) { + _cleanup_strv_free_ char **l = NULL; + size_t n; + int r; + + assert(s); + + /* Special version of strv_split_full() that splits on newlines and + * suppresses an empty string at the end.
*/ + + r = strv_split_full(&l, s, NEWLINE, flags); + if (r < 0) + return r; + + n = strv_length(l); + if (n > 0 && isempty(l[n - 1])) { /* a trailing empty entry is freed and not counted in the return value */ + l[n - 1] = mfree(l[n - 1]); + n--; + } + + *ret = TAKE_PTR(l); + return n; +} + +int strv_split_full(char ***t, const char *s, const char *separators, ExtractFlags flags) { /* Splits s into words by calling extract_first_word() repeatedly with the given separators and flags, and stores the resulting list in *t. Returns the number of words, or a negative error. */ + _cleanup_strv_free_ char **l = NULL; + size_t n = 0; + int r; + + assert(t); + assert(s); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&s, &word, separators, flags); + if (r < 0) + return r; + if (r == 0) + break; + + if (!GREEDY_REALLOC(l, n + 2)) + return -ENOMEM; + + l[n++] = TAKE_PTR(word); + l[n] = NULL; + } + + if (!l) { /* no word was extracted: still hand back an allocated, empty list instead of NULL */ + l = new0(char*, 1); + if (!l) + return -ENOMEM; + } + + *t = TAKE_PTR(l); + + return (int) n; +} + +int strv_split_and_extend_full(char ***t, const char *s, const char *separators, bool filter_duplicates, ExtractFlags flags) { /* Splits s as strv_split_full() does and appends the words to the list *t via strv_extend_strv(). Returns the resulting length of that list, or a negative error. */ + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(t); + assert(s); + + r = strv_split_full(&l, s, separators, flags); + if (r < 0) + return r; + + r = strv_extend_strv(t, l, filter_duplicates); + if (r < 0) + return r; + + return (int) strv_length(*t); +} + +int strv_split_colon_pairs(char ***t, const char *s) { /* Extracts one tuple at a time from s (separators NULL, i.e. whatever extract_first_word() uses by default), splits each tuple on ':' into at most two words and stores them as two consecutive entries, the second being strempty(second) when absent. Returns the number of strings stored, -EINVAL if a tuple has text left after two words, or another negative error. */ + _cleanup_strv_free_ char **l = NULL; + size_t n = 0; + int r; + + assert(t); + assert(s); + + for (;;) { + _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL, *second_or_empty = NULL; + + r = extract_first_word(&s, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + if (r == 0) + break; + + const char *p = tuple; + r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, + &first, &second, NULL); + if (r < 0) + return r; + if (r == 0) + continue; + /* Enforce that at most 2 colon-separated words are contained in each group */ + if (!isempty(p)) + return -EINVAL; + + second_or_empty = strdup(strempty(second)); + if (!second_or_empty) + return -ENOMEM; + + if (!GREEDY_REALLOC(l, n + 3)) + return
-ENOMEM; + + l[n++] = TAKE_PTR(first); + l[n++] = TAKE_PTR(second_or_empty); + + l[n] = NULL; + } + + if (!l) { + l = new0(char*, 1); + if (!l) + return -ENOMEM; + } + + *t = TAKE_PTR(l); + + return (int) n; +} + +char* strv_join_full(char * const *l, const char *separator, const char *prefix, bool escape_separator) { /* Concatenates the entries of l into one newly allocated string, separated by separator (a single space if NULL) and each preceded by prefix when one is given. With escape_separator, every occurrence of the one-character separator inside an entry is preceded by a backslash. Returns NULL on allocation failure. */ + char *r, *e; + size_t n, k, m; + + if (!separator) + separator = " "; + + k = strlen(separator); + m = strlen_ptr(prefix); + + if (escape_separator) /* If the separator was multi-char, we wouldn't know how to escape it. */ + assert(k == 1); + + n = 0; /* first pass: compute the output length; an entry that needs escaping is budgeted at twice its length */ + STRV_FOREACH(s, l) { + if (s != l) + n += k; + + bool needs_escaping = escape_separator && strchr(*s, *separator); + + n += m + strlen(*s) * (1 + needs_escaping); + } + + r = new(char, n+1); + if (!r) + return NULL; + + e = r; /* second pass: e is the write cursor into the output buffer */ + STRV_FOREACH(s, l) { + if (s != l) + e = stpcpy(e, separator); + + if (prefix) + e = stpcpy(e, prefix); + + bool needs_escaping = escape_separator && strchr(*s, *separator); + + if (needs_escaping) + for (size_t i = 0; (*s)[i]; i++) { + if ((*s)[i] == *separator) + *(e++) = '\\'; + *(e++) = (*s)[i]; + } + else + e = stpcpy(e, *s); + } + + *e = 0; + + return r; +} + +int strv_push_with_size(char ***l, size_t *n, char *value) { /* Appends value itself (the pointer is stored, no copy is made) to the list. Returns 0 on success or -ENOMEM. */ + /* n is a pointer to a variable to store the size of l. + * If not given (i.e. n is NULL or *n is SIZE_MAX), size will be calculated using strv_length(). + * If n is not NULL, the size after the push is stored in *n. + * If value is NULL, no action is taken and *n is not set. */ + + if (!value) + return 0; + + size_t size = n ?
*n : SIZE_MAX; + if (size == SIZE_MAX) + size = strv_length(*l); + + /* Check for overflow */ + if (size > SIZE_MAX-2) + return -ENOMEM; + + char **c = reallocarray(*l, GREEDY_ALLOC_ROUND_UP(size + 2), sizeof(char*)); + if (!c) + return -ENOMEM; + + c[size] = value; + c[size+1] = NULL; + + *l = c; + if (n) + *n = size + 1; + return 0; +} + +int strv_push_pair(char ***l, char *a, char *b) { /* Appends a and/or b, whichever is non-NULL, to the list; the pointers are stored as-is, no copies are made. Returns 0 on success (also when both are NULL) or -ENOMEM. */ + char **c; + size_t n; + + if (!a && !b) + return 0; + + n = strv_length(*l); + + /* Check for overflow */ + if (n > SIZE_MAX-3) + return -ENOMEM; + + /* increase and check for overflow */ + c = reallocarray(*l, GREEDY_ALLOC_ROUND_UP(n + !!a + !!b + 1), sizeof(char*)); + if (!c) + return -ENOMEM; + + if (a) + c[n++] = a; + if (b) + c[n++] = b; + c[n] = NULL; + + *l = c; + return 0; +} + +int strv_insert(char ***l, size_t position, char *value) { /* Inserts value (the pointer itself, not a copy) at index position; a position beyond the end is clamped to the current length. A NULL value is a no-op returning 0. Returns -ENOMEM on allocation failure, otherwise the result of free_and_replace(). */ + char **c; + size_t n, m; + + if (!value) + return 0; + + n = strv_length(*l); + position = MIN(position, n); + + /* increase and check for overflow */ + m = n + 2; + if (m < n) + return -ENOMEM; + + c = new(char*, m); + if (!c) + return -ENOMEM; + + for (size_t i = 0; i < position; i++) + c[i] = (*l)[i]; + c[position] = value; + for (size_t i = position; i < n; i++) + c[i+1] = (*l)[i]; + c[n+1] = NULL; + + return free_and_replace(*l, c); +} + +int strv_consume_with_size(char ***l, size_t *n, char *value) { /* Like strv_push_with_size(), but frees value if the push fails. */ + int r; + + r = strv_push_with_size(l, n, value); + if (r < 0) + free(value); + + return r; +} + +int strv_consume_pair(char ***l, char *a, char *b) { /* Like strv_push_pair(), but frees both a and b if the push fails. */ + int r; + + r = strv_push_pair(l, a, b); + if (r < 0) { + free(a); + free(b); + } + + return r; +} + +int strv_consume_prepend(char ***l, char *value) { /* Like strv_push_prepend(), but frees value if that call fails. */ + int r; + + r = strv_push_prepend(l, value); + if (r < 0) + free(value); + + return r; +} + +int strv_prepend(char ***l, const char *value) { /* strdup()s value and passes the copy to strv_consume_prepend(). A NULL value is a no-op returning 0; returns -ENOMEM if the copy cannot be allocated. */ + char *v; + + if (!value) + return 0; + + v = strdup(value); + if (!v) + return -ENOMEM; + + return strv_consume_prepend(l, v); +} + +int strv_extend_with_size(char ***l, size_t *n, const
char *value) { + char *v; + + if (!value) + return 0; + + v = strdup(value); + if (!v) + return -ENOMEM; + + return strv_consume_with_size(l, n, v); +} + +int strv_extend_front(char ***l, const char *value) { + size_t n, m; + char *v, **c; + + assert(l); + + /* Like strv_extend(), but prepends rather than appends the new entry */ + + if (!value) + return 0; + + n = strv_length(*l); + + /* Increase and overflow check. */ + m = n + 2; + if (m < n) + return -ENOMEM; + + v = strdup(value); + if (!v) + return -ENOMEM; + + c = reallocarray(*l, m, sizeof(char*)); + if (!c) { + free(v); + return -ENOMEM; + } + + memmove(c+1, c, n * sizeof(char*)); + c[0] = v; + c[n+1] = NULL; + + *l = c; + return 0; +} + +char** strv_uniq(char **l) { + /* Drops duplicate entries. The first identical string will be + * kept, the others dropped */ + + STRV_FOREACH(i, l) + strv_remove(i+1, *i); + + return l; +} + +bool strv_is_uniq(char * const *l) { + STRV_FOREACH(i, l) + if (strv_contains(i+1, *i)) + return false; + + return true; +} + +char** strv_remove(char **l, const char *s) { + char **f, **t; + + if (!l) + return NULL; + + assert(s); + + /* Drops every occurrence of s in the string list, edits + * in-place. 
*/ + + for (f = t = l; *f; f++) + if (streq(*f, s)) + free(*f); + else + *(t++) = *f; + + *t = NULL; + return l; +} + +bool strv_overlap(char * const *a, char * const *b) { + STRV_FOREACH(i, a) + if (strv_contains(b, *i)) + return true; + + return false; +} + +static int str_compare(char * const *a, char * const *b) { + return strcmp(*a, *b); +} + +char** strv_sort(char **l) { + typesafe_qsort(l, strv_length(l), str_compare); + return l; +} + +int strv_compare(char * const *a, char * const *b) { + int r; + + if (strv_isempty(a)) { + if (strv_isempty(b)) + return 0; + else + return -1; + } + + if (strv_isempty(b)) + return 1; + + for ( ; *a || *b; ++a, ++b) { + r = strcmp_ptr(*a, *b); + if (r != 0) + return r; + } + + return 0; +} + +void strv_print_full(char * const *l, const char *prefix) { + STRV_FOREACH(s, l) + printf("%s%s\n", strempty(prefix), *s); +} + +int strv_extendf(char ***l, const char *format, ...) { + va_list ap; + char *x; + int r; + + va_start(ap, format); + r = vasprintf(&x, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return strv_consume(l, x); +} + +char** strv_reverse(char **l) { + size_t n; + + n = strv_length(l); + if (n <= 1) + return l; + + for (size_t i = 0; i < n / 2; i++) + SWAP_TWO(l[i], l[n-1-i]); + + return l; +} + +char** strv_shell_escape(char **l, const char *bad) { + /* Escapes every character in every string in l that is in bad, + * edits in-place, does not roll-back on error. */ + + STRV_FOREACH(s, l) { + char *v; + + v = shell_escape(*s, bad); + if (!v) + return NULL; + + free_and_replace(*s, v); + } + + return l; +} + +bool strv_fnmatch_full( + char* const* patterns, + const char *s, + int flags, + size_t *ret_matched_pos) { + + assert(s); + + if (patterns) + for (size_t i = 0; patterns[i]; i++) + /* NB: We treat all fnmatch() errors as equivalent to FNM_NOMATCH, i.e. if fnmatch() fails to + * process the pattern for some reason we'll consider this equivalent to non-matching. 
*/ + if (fnmatch(patterns[i], s, flags) == 0) { + if (ret_matched_pos) + *ret_matched_pos = i; + return true; + } + + if (ret_matched_pos) + *ret_matched_pos = SIZE_MAX; + + return false; +} + +char** strv_skip(char **l, size_t n) { + + while (n > 0) { + if (strv_isempty(l)) + return l; + + l++, n--; + } + + return l; +} + +int strv_extend_n(char ***l, const char *value, size_t n) { + size_t i, k; + char **nl; + + assert(l); + + if (!value) + return 0; + if (n == 0) + return 0; + + /* Adds the value n times to l */ + + k = strv_length(*l); + if (n >= SIZE_MAX - k) + return -ENOMEM; + + nl = reallocarray(*l, GREEDY_ALLOC_ROUND_UP(k + n + 1), sizeof(char *)); + if (!nl) + return -ENOMEM; + + *l = nl; + + for (i = k; i < k + n; i++) { + nl[i] = strdup(value); + if (!nl[i]) + goto rollback; + } + nl[i] = NULL; + + return 0; + +rollback: + for (size_t j = k; j < i; j++) + free(nl[j]); + nl[k] = NULL; + + return -ENOMEM; +} + +int strv_extend_assignment(char ***l, const char *lhs, const char *rhs) { + char *j; + + assert(l); + assert(lhs); + + if (!rhs) /* value is optional, in which case we suppress the field */ + return 0; + + j = strjoin(lhs, "=", rhs); + if (!j) + return -ENOMEM; + + return strv_consume(l, j); +} + +int fputstrv(FILE *f, char * const *l, const char *separator, bool *space) { + bool b = false; + int r; + + /* Like fputs(), but for strv, and with a less stupid argument order */ + + if (!space) + space = &b; + + STRV_FOREACH(s, l) { + r = fputs_with_space(f, *s, separator, space); + if (r < 0) + return r; + } + + return 0; +} + +static int string_strv_hashmap_put_internal(Hashmap *h, const char *key, const char *value) { + char **l; + int r; + + l = hashmap_get(h, key); + if (l) { + /* A list for this key already exists, let's append to it if it is not listed yet */ + if (strv_contains(l, value)) + return 0; + + r = strv_extend(&l, value); + if (r < 0) + return r; + + assert_se(hashmap_update(h, key, l) >= 0); + } else { + /* No list for this key exists 
yet, create one */ + _cleanup_strv_free_ char **l2 = NULL; + _cleanup_free_ char *t = NULL; + + t = strdup(key); + if (!t) + return -ENOMEM; + + r = strv_extend(&l2, value); + if (r < 0) + return r; + + r = hashmap_put(h, t, l2); + if (r < 0) + return r; + TAKE_PTR(t); + TAKE_PTR(l2); + } + + return 1; +} + +int _string_strv_hashmap_put(Hashmap **h, const char *key, const char *value HASHMAP_DEBUG_PARAMS) { + int r; + + r = _hashmap_ensure_allocated(h, &string_strv_hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + return string_strv_hashmap_put_internal(*h, key, value); +} + +int _string_strv_ordered_hashmap_put(OrderedHashmap **h, const char *key, const char *value HASHMAP_DEBUG_PARAMS) { + int r; + + r = _ordered_hashmap_ensure_allocated(h, &string_strv_hash_ops HASHMAP_DEBUG_PASS_ARGS); + if (r < 0) + return r; + + return string_strv_hashmap_put_internal(PLAIN_HASHMAP(*h), key, value); +} + +DEFINE_HASH_OPS_FULL(string_strv_hash_ops, char, string_hash_func, string_compare_func, free, char*, strv_free); diff --git a/src/basic/strv.h b/src/basic/strv.h new file mode 100644 index 0000000..03089d5 --- /dev/null +++ b/src/basic/strv.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "extract-word.h" +#include "hashmap.h" +#include "macro.h" +#include "string-util.h" + +char* strv_find(char * const *l, const char *name) _pure_; +char* strv_find_case(char * const *l, const char *name) _pure_; +char* strv_find_prefix(char * const *l, const char *name) _pure_; +char* strv_find_startswith(char * const *l, const char *name) _pure_; +/* Given two vectors, the first a list of keys and the second a list of key-value pairs, returns the value + * of the first key from the first vector that is found in the second vector. 
*/ +char* strv_find_first_field(char * const *needles, char * const *haystack) _pure_; + +#define strv_contains(l, s) (!!strv_find((l), (s))) +#define strv_contains_case(l, s) (!!strv_find_case((l), (s))) + +char** strv_free(char **l); +DEFINE_TRIVIAL_CLEANUP_FUNC(char**, strv_free); +#define _cleanup_strv_free_ _cleanup_(strv_freep) + +char** strv_free_erase(char **l); +DEFINE_TRIVIAL_CLEANUP_FUNC(char**, strv_free_erase); +#define _cleanup_strv_free_erase_ _cleanup_(strv_free_erasep) + +void strv_free_many(char ***strvs, size_t n); + +char** strv_copy_n(char * const *l, size_t n); +static inline char** strv_copy(char * const *l) { + return strv_copy_n(l, SIZE_MAX); +} +int strv_copy_unless_empty(char * const *l, char ***ret); + +size_t strv_length(char * const *l) _pure_; + +int strv_extend_strv(char ***a, char * const *b, bool filter_duplicates); +int strv_extend_strv_concat(char ***a, char * const *b, const char *suffix); +int strv_prepend(char ***l, const char *value); + +/* _with_size() are lower-level functions where the size can be provided externally, + * which allows us to skip iterating over the strv to find the end, which saves + * a bit of time and reduces the complexity of appending from O(n²) to O(n). */ + +int strv_extend_with_size(char ***l, size_t *n, const char *value); +static inline int strv_extend(char ***l, const char *value) { + return strv_extend_with_size(l, NULL, value); +} + +int strv_extendf(char ***l, const char *format, ...) 
_printf_(2,3); +int strv_extend_front(char ***l, const char *value); + +int strv_push_with_size(char ***l, size_t *n, char *value); +static inline int strv_push(char ***l, char *value) { + return strv_push_with_size(l, NULL, value); +} +int strv_push_pair(char ***l, char *a, char *b); + +int strv_insert(char ***l, size_t position, char *value); + +static inline int strv_push_prepend(char ***l, char *value) { + return strv_insert(l, 0, value); +} + +int strv_consume_with_size(char ***l, size_t *n, char *value); +static inline int strv_consume(char ***l, char *value) { + return strv_consume_with_size(l, NULL, value); +} + +int strv_consume_pair(char ***l, char *a, char *b); +int strv_consume_prepend(char ***l, char *value); + +char** strv_remove(char **l, const char *s); +char** strv_uniq(char **l); +bool strv_is_uniq(char * const *l); + +int strv_compare(char * const *a, char * const *b); +static inline bool strv_equal(char * const *a, char * const *b) { + return strv_compare(a, b) == 0; +} + +char** strv_new_internal(const char *x, ...) _sentinel_; +char** strv_new_ap(const char *x, va_list ap); +#define strv_new(...) 
strv_new_internal(__VA_ARGS__, NULL) + +#define STRV_IGNORE ((const char *) POINTER_MAX) + +static inline const char* STRV_IFNOTNULL(const char *x) { + return x ?: STRV_IGNORE; +} + +static inline bool strv_isempty(char * const *l) { + return !l || !*l; +} + +int strv_split_full(char ***t, const char *s, const char *separators, ExtractFlags flags); +static inline char** strv_split(const char *s, const char *separators) { + char **ret; + + if (strv_split_full(&ret, s, separators, EXTRACT_RETAIN_ESCAPE) < 0) + return NULL; + + return ret; +} + +int strv_split_and_extend_full(char ***t, const char *s, const char *separators, bool filter_duplicates, ExtractFlags flags); +#define strv_split_and_extend(t, s, sep, dup) strv_split_and_extend_full(t, s, sep, dup, 0) + +int strv_split_newlines_full(char ***ret, const char *s, ExtractFlags flags); +static inline char** strv_split_newlines(const char *s) { + char **ret; + + if (strv_split_newlines_full(&ret, s, 0) < 0) + return NULL; + + return ret; +} + +/* Given a string containing white-space separated tuples of words themselves separated by ':', + * returns a vector of strings. If the second element in a tuple is missing, the corresponding + * string in the vector is an empty string. */ +int strv_split_colon_pairs(char ***t, const char *s); + +char* strv_join_full(char * const *l, const char *separator, const char *prefix, bool escape_separator); +static inline char *strv_join(char * const *l, const char *separator) { + return strv_join_full(l, separator, NULL, false); +} + +bool strv_overlap(char * const *a, char * const *b) _pure_; + +#define _STRV_FOREACH_BACKWARDS(s, l, h, i) \ + for (typeof(*(l)) *s, *h = (l), *i = ({ \ + size_t _len = strv_length(h); \ + _len > 0 ? 
h + _len - 1 : NULL; \ + }); \ + (s = i); \ + i = PTR_SUB1(i, h)) + +#define STRV_FOREACH_BACKWARDS(s, l) \ + _STRV_FOREACH_BACKWARDS(s, l, UNIQ_T(h, UNIQ), UNIQ_T(i, UNIQ)) + +#define _STRV_FOREACH_PAIR(x, y, l, i) \ + for (typeof(*l) *x, *y, *i = (l); \ + i && *(x = i) && *(y = i + 1); \ + i += 2) + +#define STRV_FOREACH_PAIR(x, y, l) \ + _STRV_FOREACH_PAIR(x, y, l, UNIQ_T(i, UNIQ)) + +char** strv_sort(char **l); +void strv_print_full(char * const *l, const char *prefix); +static inline void strv_print(char * const *l) { + strv_print_full(l, NULL); +} + +#define strv_from_stdarg_alloca(first) \ + ({ \ + char **_l; \ + \ + if (!first) \ + _l = (char**) &first; \ + else { \ + size_t _n; \ + va_list _ap; \ + \ + _n = 1; \ + va_start(_ap, first); \ + while (va_arg(_ap, char*)) \ + _n++; \ + va_end(_ap); \ + \ + _l = newa(char*, _n+1); \ + _l[_n = 0] = (char*) first; \ + va_start(_ap, first); \ + for (;;) { \ + _l[++_n] = va_arg(_ap, char*); \ + if (!_l[_n]) \ + break; \ + } \ + va_end(_ap); \ + } \ + _l; \ + }) + +#define STR_IN_SET(x, ...) strv_contains(STRV_MAKE(__VA_ARGS__), x) +#define STRPTR_IN_SET(x, ...) \ + ({ \ + const char* _x = (x); \ + _x && strv_contains(STRV_MAKE(__VA_ARGS__), _x); \ + }) + +#define STRCASE_IN_SET(x, ...) strv_contains_case(STRV_MAKE(__VA_ARGS__), x) +#define STRCASEPTR_IN_SET(x, ...) \ + ({ \ + const char* _x = (x); \ + _x && strv_contains_case(STRV_MAKE(__VA_ARGS__), _x); \ + }) + +#define ENDSWITH_SET(p, ...) \ + ({ \ + const char *_p = (p); \ + char *_found = NULL; \ + STRV_FOREACH(_i, STRV_MAKE(__VA_ARGS__)) { \ + _found = endswith(_p, *_i); \ + if (_found) \ + break; \ + } \ + _found; \ + }) + +#define _FOREACH_STRING(uniq, x, y, ...) \ + for (const char *x, * const*UNIQ_T(l, uniq) = STRV_MAKE_CONST(({ x = y; }), ##__VA_ARGS__); \ + x; \ + x = *(++UNIQ_T(l, uniq))) + +#define FOREACH_STRING(x, y, ...) 
\ + _FOREACH_STRING(UNIQ, x, y, ##__VA_ARGS__) + +char** strv_reverse(char **l); +char** strv_shell_escape(char **l, const char *bad); + +bool strv_fnmatch_full(char* const* patterns, const char *s, int flags, size_t *ret_matched_pos); +static inline bool strv_fnmatch(char* const* patterns, const char *s) { + return strv_fnmatch_full(patterns, s, 0, NULL); +} + +static inline bool strv_fnmatch_or_empty(char* const* patterns, const char *s, int flags) { + assert(s); + return strv_isempty(patterns) || + strv_fnmatch_full(patterns, s, flags, NULL); +} + +char** strv_skip(char **l, size_t n); + +int strv_extend_n(char ***l, const char *value, size_t n); + +int strv_extend_assignment(char ***l, const char *lhs, const char *rhs); + +int fputstrv(FILE *f, char * const *l, const char *separator, bool *space); + +#define strv_free_and_replace(a, b) \ + free_and_replace_full(a, b, strv_free) + +extern const struct hash_ops string_strv_hash_ops; +int _string_strv_hashmap_put(Hashmap **h, const char *key, const char *value HASHMAP_DEBUG_PARAMS); +int _string_strv_ordered_hashmap_put(OrderedHashmap **h, const char *key, const char *value HASHMAP_DEBUG_PARAMS); +#define string_strv_hashmap_put(h, k, v) _string_strv_hashmap_put(h, k, v HASHMAP_DEBUG_SRC_ARGS) +#define string_strv_ordered_hashmap_put(h, k, v) _string_strv_ordered_hashmap_put(h, k, v HASHMAP_DEBUG_SRC_ARGS) diff --git a/src/basic/strxcpyx.c b/src/basic/strxcpyx.c new file mode 100644 index 0000000..52b9565 --- /dev/null +++ b/src/basic/strxcpyx.c @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* + * Concatenates/copies strings. In any case, terminates in all cases + * with '\0' and moves the @dest pointer forward to the added '\0'. + * Returns the remaining size, and 0 if the string was truncated. + * + * Due to the intended usage, these helpers silently noop invocations + * having zero size. This is technically an exception to the above + * statement "terminates in all cases". 
It's unexpected for such calls to + * occur outside of a loop where this is the preferred behavior. + */ + +#include +#include +#include + +#include "string-util.h" +#include "strxcpyx.h" + +size_t strnpcpy_full(char **dest, size_t size, const char *src, size_t len, bool *ret_truncated) { + bool truncated = false; + + assert(dest); + assert(src); + + if (size == 0) { + if (ret_truncated) + *ret_truncated = len > 0; + return 0; + } + + if (len >= size) { + if (size > 1) + *dest = mempcpy(*dest, src, size-1); + size = 0; + truncated = true; + } else if (len > 0) { + *dest = mempcpy(*dest, src, len); + size -= len; + } + + if (ret_truncated) + *ret_truncated = truncated; + + *dest[0] = '\0'; + return size; +} + +size_t strpcpy_full(char **dest, size_t size, const char *src, bool *ret_truncated) { + assert(dest); + assert(src); + + return strnpcpy_full(dest, size, src, strlen(src), ret_truncated); +} + +size_t strpcpyf_full(char **dest, size_t size, bool *ret_truncated, const char *src, ...) { + bool truncated = false; + va_list va; + int i; + + assert(dest); + assert(src); + + va_start(va, src); + i = vsnprintf(*dest, size, src, va); + va_end(va); + + if (i < (int) size) { + *dest += i; + size -= i; + } else { + size = 0; + truncated = i > 0; + } + + if (ret_truncated) + *ret_truncated = truncated; + + return size; +} + +size_t strpcpyl_full(char **dest, size_t size, bool *ret_truncated, const char *src, ...) 
{ + bool truncated = false; + va_list va; + + assert(dest); + assert(src); + + va_start(va, src); + do { + bool t; + + size = strpcpy_full(dest, size, src, &t); + truncated = truncated || t; + src = va_arg(va, char *); + } while (src); + va_end(va); + + if (ret_truncated) + *ret_truncated = truncated; + return size; +} + +size_t strnscpy_full(char *dest, size_t size, const char *src, size_t len, bool *ret_truncated) { + char *s; + + assert(dest); + assert(src); + + s = dest; + return strnpcpy_full(&s, size, src, len, ret_truncated); +} + +size_t strscpy_full(char *dest, size_t size, const char *src, bool *ret_truncated) { + assert(dest); + assert(src); + + return strnscpy_full(dest, size, src, strlen(src), ret_truncated); +} + +size_t strscpyl_full(char *dest, size_t size, bool *ret_truncated, const char *src, ...) { + bool truncated = false; + va_list va; + char *s; + + assert(dest); + assert(src); + + va_start(va, src); + s = dest; + do { + bool t; + + size = strpcpy_full(&s, size, src, &t); + truncated = truncated || t; + src = va_arg(va, char *); + } while (src); + va_end(va); + + if (ret_truncated) + *ret_truncated = truncated; + + return size; +} diff --git a/src/basic/strxcpyx.h b/src/basic/strxcpyx.h new file mode 100644 index 0000000..4a648ed --- /dev/null +++ b/src/basic/strxcpyx.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" + +size_t strnpcpy_full(char **dest, size_t size, const char *src, size_t len, bool *ret_truncated); +static inline size_t strnpcpy(char **dest, size_t size, const char *src, size_t len) { + return strnpcpy_full(dest, size, src, len, NULL); +} +size_t strpcpy_full(char **dest, size_t size, const char *src, bool *ret_truncated); +static inline size_t strpcpy(char **dest, size_t size, const char *src) { + return strpcpy_full(dest, size, src, NULL); +} +size_t strpcpyf_full(char **dest, size_t size, bool *ret_truncated, const char *src, ...) 
_printf_(4, 5); +#define strpcpyf(dest, size, src, ...) \ + strpcpyf_full((dest), (size), NULL, (src), ##__VA_ARGS__) +size_t strpcpyl_full(char **dest, size_t size, bool *ret_truncated, const char *src, ...) _sentinel_; +#define strpcpyl(dest, size, src, ...) \ + strpcpyl_full((dest), (size), NULL, (src), ##__VA_ARGS__) +size_t strnscpy_full(char *dest, size_t size, const char *src, size_t len, bool *ret_truncated); +static inline size_t strnscpy(char *dest, size_t size, const char *src, size_t len) { + return strnscpy_full(dest, size, src, len, NULL); +} +size_t strscpy_full(char *dest, size_t size, const char *src, bool *ret_truncated); +static inline size_t strscpy(char *dest, size_t size, const char *src) { + return strscpy_full(dest, size, src, NULL); +} +size_t strscpyl_full(char *dest, size_t size, bool *ret_truncated, const char *src, ...) _sentinel_; +#define strscpyl(dest, size, src, ...) \ + strscpyl_full(dest, size, NULL, src, ##__VA_ARGS__) diff --git a/src/basic/sync-util.c b/src/basic/sync-util.c new file mode 100644 index 0000000..a17ab2c --- /dev/null +++ b/src/basic/sync-util.c @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "fs-util.h" +#include "path-util.h" +#include "sync-util.h" + +int fsync_directory_of_file(int fd) { + _cleanup_close_ int dfd = -EBADF; + struct stat st; + int r; + + assert(fd >= 0); + + /* We only reasonably can do this for regular files and directories, or for O_PATH fds, hence check + * for the inode type first */ + if (fstat(fd, &st) < 0) + return -errno; + + if (S_ISDIR(st.st_mode)) { + dfd = openat(fd, "..", O_RDONLY|O_DIRECTORY|O_CLOEXEC, 0); + if (dfd < 0) + return -errno; + + } else if (!S_ISREG(st.st_mode)) { /* Regular files are OK regardless if O_PATH or not, for all other + * types check O_PATH flag */ + r = fd_is_opath(fd); + if (r < 0) + return r; + if (!r) /* If O_PATH this refers to the inode in the fs, in which case we can sensibly 
do + * what is requested. Otherwise this refers to a socket, fifo or device node, where + * the concept of a containing directory doesn't make too much sense. */ + return -ENOTTY; + } + + if (dfd < 0) { + _cleanup_free_ char *path = NULL; + + r = fd_get_path(fd, &path); + if (r < 0) { + log_debug_errno(r, "Failed to query /proc/self/fd/%d%s: %m", + fd, + r == -ENOSYS ? ", ignoring" : ""); + + if (r == -ENOSYS) + /* If /proc is not available, we're most likely running in some + * chroot environment, and syncing the directory is not very + * important in that case. Let's just silently do nothing. */ + return 0; + + return r; + } + + if (!path_is_absolute(path)) + return -EINVAL; + + dfd = open_parent(path, O_CLOEXEC|O_NOFOLLOW, 0); + if (dfd < 0) + return dfd; + } + + return RET_NERRNO(fsync(dfd)); +} + +int fsync_full(int fd) { + int r, q; + + /* Sync both the file and the directory */ + + r = RET_NERRNO(fsync(fd)); + + q = fsync_directory_of_file(fd); + if (r < 0) /* Return earlier error */ + return r; + if (q == -ENOTTY) /* Ignore if the 'fd' refers to a block device or so which doesn't really have a + * parent dir */ + return 0; + return q; +} + +int fsync_path_at(int at_fd, const char *path) { + _cleanup_close_ int opened_fd = -EBADF; + int fd; + + if (isempty(path)) { + if (at_fd == AT_FDCWD) { + opened_fd = open(".", O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (opened_fd < 0) + return -errno; + + fd = opened_fd; + } else + fd = at_fd; + } else { + opened_fd = openat(at_fd, path, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (opened_fd < 0) + return -errno; + + fd = opened_fd; + } + + return RET_NERRNO(fsync(fd)); +} + +int fsync_parent_at(int at_fd, const char *path) { + _cleanup_close_ int opened_fd = -EBADF; + + if (isempty(path)) { + if (at_fd != AT_FDCWD) + return fsync_directory_of_file(at_fd); + + opened_fd = open("..", O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (opened_fd < 0) + return -errno; + + return RET_NERRNO(fsync(opened_fd)); + } + + opened_fd = openat(at_fd, path, 
O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (opened_fd < 0) + return -errno; + + return fsync_directory_of_file(opened_fd); +} + +int fsync_path_and_parent_at(int at_fd, const char *path) { + _cleanup_close_ int opened_fd = -EBADF; + + if (isempty(path)) { + if (at_fd != AT_FDCWD) + return fsync_full(at_fd); + + opened_fd = open(".", O_RDONLY|O_DIRECTORY|O_CLOEXEC); + } else + opened_fd = openat(at_fd, path, O_RDONLY|O_NOFOLLOW|O_NONBLOCK|O_CLOEXEC); + if (opened_fd < 0) + return -errno; + + return fsync_full(opened_fd); +} + +int syncfs_path(int at_fd, const char *path) { + _cleanup_close_ int fd = -EBADF; + + if (isempty(path)) { + if (at_fd != AT_FDCWD) + return RET_NERRNO(syncfs(at_fd)); + + fd = open(".", O_RDONLY|O_DIRECTORY|O_CLOEXEC); + } else + fd = openat(at_fd, path, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return -errno; + + return RET_NERRNO(syncfs(fd)); +} diff --git a/src/basic/sync-util.h b/src/basic/sync-util.h new file mode 100644 index 0000000..e449440 --- /dev/null +++ b/src/basic/sync-util.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int fsync_directory_of_file(int fd); +int fsync_full(int fd); + +int fsync_path_at(int at_fd, const char *path); +int fsync_parent_at(int at_fd, const char *path); +int fsync_path_and_parent_at(int at_fd, const char *path); + +int syncfs_path(int at_fd, const char *path); diff --git a/src/basic/syscall-list.txt b/src/basic/syscall-list.txt new file mode 100644 index 0000000..1c335bb --- /dev/null +++ b/src/basic/syscall-list.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept +accept4 +access +acct +add_key +adjtimex +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind +bpf +brk +cachectl +cacheflush +cachestat +capget +capset +chdir +chmod +chown +chown32 +chroot +clock_adjtime +clock_adjtime64 +clock_getres +clock_getres_time64 +clock_gettime +clock_gettime64 +clock_nanosleep +clock_nanosleep_time64 
+clock_settime +clock_settime64 +clone +clone3 +close +close_range +connect +copy_file_range +creat +delete_module +dipc +dup +dup2 +dup3 +epoll_create +epoll_create1 +epoll_ctl +epoll_ctl_old +epoll_pwait +epoll_pwait2 +epoll_wait +epoll_wait_old +eventfd +eventfd2 +exec_with_loader +execv +execve +execveat +exit +exit_group +faccessat +faccessat2 +fadvise64 +fadvise64_64 +fallocate +fanotify_init +fanotify_mark +fchdir +fchmod +fchmodat +fchmodat2 +fchown +fchown32 +fchownat +fcntl +fcntl64 +fdatasync +fgetxattr +finit_module +flistxattr +flock +fork +fremovexattr +fsconfig +fsetxattr +fsmount +fsopen +fspick +fstat +fstat64 +fstatat64 +fstatfs +fstatfs64 +fsync +ftruncate +ftruncate64 +futex +futex_requeue +futex_time64 +futex_wait +futex_waitv +futex_wake +futimesat +get_mempolicy +get_robust_list +get_thread_area +getcpu +getcwd +getdents +getdents64 +getdomainname +getdtablesize +getegid +getegid32 +geteuid +geteuid32 +getgid +getgid32 +getgroups +getgroups32 +gethostname +getitimer +getpagesize +getpeername +getpgid +getpgrp +getpid +getppid +getpriority +getrandom +getresgid +getresgid32 +getresuid +getresuid32 +getrlimit +getrusage +getsid +getsockname +getsockopt +gettid +gettimeofday +getuid +getuid32 +getxattr +getxgid +getxpid +getxuid +init_module +inotify_add_watch +inotify_init +inotify_init1 +inotify_rm_watch +io_cancel +io_destroy +io_getevents +io_pgetevents +io_pgetevents_time64 +io_setup +io_submit +io_uring_enter +io_uring_register +io_uring_setup +ioctl +ioperm +iopl +ioprio_get +ioprio_set +ipc +kcmp +kern_features +kexec_file_load +kexec_load +keyctl +kill +landlock_add_rule +landlock_create_ruleset +landlock_restrict_self +lchown +lchown32 +lgetxattr +link +linkat +listen +listxattr +llistxattr +lookup_dcookie +lremovexattr +lseek +lsetxattr +lstat +lstat64 +madvise +map_shadow_stack +mbind +membarrier +memfd_create +memfd_secret +memory_ordering +migrate_pages +mincore +mkdir +mkdirat +mknod +mknodat +mlock +mlock2 +mlockall +mmap +mmap2 
+modify_ldt +mount +mount_setattr +move_mount +move_pages +mprotect +mq_getsetattr +mq_notify +mq_open +mq_timedreceive +mq_timedreceive_time64 +mq_timedsend +mq_timedsend_time64 +mq_unlink +mremap +msgctl +msgget +msgrcv +msgsnd +msync +multiplexer +munlock +munlockall +munmap +name_to_handle_at +nanosleep +newfstatat +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open +open_by_handle_at +open_tree +openat +openat2 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open +perfctr +personality +pidfd_getfd +pidfd_open +pidfd_send_signal +pipe +pipe2 +pivot_root +pkey_alloc +pkey_free +pkey_mprotect +poll +ppoll +ppoll_time64 +prctl +pread64 +preadv +preadv2 +prlimit64 +process_madvise +process_mrelease +process_vm_readv +process_vm_writev +pselect6 +pselect6_time64 +ptrace +pwrite64 +pwritev +pwritev2 +quotactl +quotactl_fd +read +readahead +readdir +readlink +readlinkat +readv +reboot +recv +recvfrom +recvmmsg +recvmmsg_time64 +recvmsg +remap_file_pages +removexattr +rename +renameat +renameat2 +request_key +restart_syscall +riscv_flush_icache +riscv_hwprobe +rmdir +rseq +rt_sigaction +rt_sigpending +rt_sigprocmask +rt_sigqueueinfo +rt_sigreturn +rt_sigsuspend +rt_sigtimedwait +rt_sigtimedwait_time64 +rt_tgsigqueueinfo +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max +sched_get_priority_min +sched_getaffinity +sched_getattr +sched_getparam +sched_getscheduler 
+sched_rr_get_interval +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity +sched_setattr +sched_setparam +sched_setscheduler +sched_yield +seccomp +select +semctl +semget +semop +semtimedop +semtimedop_time64 +send +sendfile +sendfile64 +sendmmsg +sendmsg +sendto +set_mempolicy +set_mempolicy_home_node +set_robust_list +set_thread_area +set_tid_address +setdomainname +setfsgid +setfsgid32 +setfsuid +setfsuid32 +setgid +setgid32 +setgroups +setgroups32 +sethae +sethostname +setitimer +setns +setpgid +setpgrp +setpriority +setregid +setregid32 +setresgid +setresgid32 +setresuid +setresuid32 +setreuid +setreuid32 +setrlimit +setsid +setsockopt +settimeofday +setuid +setuid32 +setxattr +sgetmask +shmat +shmctl +shmdt +shmget +shutdown +sigaction +sigaltstack +signal +signalfd +signalfd4 +sigpending +sigprocmask +sigreturn +sigsuspend +socket +socketcall +socketpair +splice +spu_create +spu_run +ssetmask +stat +stat64 +statfs +statfs64 +statx +stime +subpage_prot +swapcontext +swapoff +swapon +switch_endian +symlink +symlinkat +sync +sync_file_range +sync_file_range2 +syncfs +sys_debug_setcontext +syscall +sysfs +sysinfo +syslog +sysmips +tee +tgkill +time +timer_create +timer_delete +timer_getoverrun +timer_gettime +timer_gettime64 +timer_settime +timer_settime64 +timerfd +timerfd_create +timerfd_gettime +timerfd_gettime64 +timerfd_settime +timerfd_settime64 +times +tkill +truncate +truncate64 +ugetrlimit +umask +umount +umount2 +uname +unlink +unlinkat +unshare +userfaultfd +ustat +utime +utimensat +utimensat_time64 +utimes +utrap_install +vfork +vhangup +vm86 +vm86old +vmsplice +wait4 +waitid +waitpid +write +writev diff --git a/src/basic/syscalls-alpha.txt b/src/basic/syscalls-alpha.txt new file mode 100644 index 0000000..d3ed3a4 --- /dev/null +++ b/src/basic/syscalls-alpha.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 99 +accept4 502 +access 33 +acct 51 +add_key 439 +adjtimex 366 +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl 
+arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 104 +bpf 515 +brk 17 +cachectl +cacheflush +cachestat 561 +capget 368 +capset 369 +chdir 12 +chmod 15 +chown 16 +chown32 +chroot 61 +clock_adjtime 499 +clock_adjtime64 +clock_getres 421 +clock_getres_time64 +clock_gettime 420 +clock_gettime64 +clock_nanosleep 422 +clock_nanosleep_time64 +clock_settime 419 +clock_settime64 +clone 312 +clone3 +close 6 +close_range 546 +connect 98 +copy_file_range 519 +creat +delete_module 308 +dipc 373 +dup 41 +dup2 90 +dup3 487 +epoll_create 407 +epoll_create1 486 +epoll_ctl 408 +epoll_ctl_old +epoll_pwait 474 +epoll_pwait2 551 +epoll_wait 409 +epoll_wait_old +eventfd 478 +eventfd2 485 +exec_with_loader 25 +execv +execve 59 +execveat 513 +exit 1 +exit_group 405 +faccessat 462 +faccessat2 549 +fadvise64 413 +fadvise64_64 +fallocate 480 +fanotify_init 494 +fanotify_mark 495 +fchdir 13 +fchmod 124 +fchmodat 461 +fchmodat2 562 +fchown 123 +fchown32 +fchownat 453 +fcntl 92 +fcntl64 +fdatasync 447 +fgetxattr 387 +finit_module 507 +flistxattr 390 +flock 131 +fork 2 +fremovexattr 393 +fsconfig 541 +fsetxattr 384 +fsmount 542 +fsopen 540 +fspick 543 +fstat 91 +fstat64 427 +fstatat64 455 +fstatfs 329 +fstatfs64 529 +fsync 95 +ftruncate 130 +ftruncate64 +futex 394 +futex_requeue 566 +futex_time64 +futex_wait 565 +futex_waitv 559 +futex_wake 564 +futimesat 454 +get_mempolicy 430 +get_robust_list 467 +get_thread_area +getcpu 473 +getcwd 367 +getdents 305 +getdents64 377 +getdomainname +getdtablesize 89 +getegid 530 +getegid32 +geteuid 531 +geteuid32 +getgid 47 +getgid32 +getgroups 79 +getgroups32 +gethostname 87 +getitimer 361 +getpagesize 64 +getpeername 141 +getpgid 233 +getpgrp 63 +getpid 20 +getppid 532 +getpriority 100 +getrandom 511 +getresgid 372 +getresgid32 +getresuid 344 +getresuid32 +getrlimit 144 +getrusage 364 +getsid 234 +getsockname 150 +getsockopt 118 +gettid 378 +gettimeofday 359 +getuid 24 +getuid32 +getxattr 385 +getxgid 47 +getxpid 20 +getxuid 24 +init_module 307 
+inotify_add_watch 445 +inotify_init 444 +inotify_init1 489 +inotify_rm_watch 446 +io_cancel 402 +io_destroy 399 +io_getevents 400 +io_pgetevents 523 +io_pgetevents_time64 +io_setup 398 +io_submit 401 +io_uring_enter 536 +io_uring_register 537 +io_uring_setup 535 +ioctl 54 +ioperm +iopl +ioprio_get 443 +ioprio_set 442 +ipc +kcmp 506 +kern_features +kexec_file_load +kexec_load 448 +keyctl 441 +kill 37 +landlock_add_rule 555 +landlock_create_ruleset 554 +landlock_restrict_self 556 +lchown 208 +lchown32 +lgetxattr 386 +link 9 +linkat 458 +listen 106 +listxattr 388 +llistxattr 389 +lookup_dcookie 406 +lremovexattr 392 +lseek 19 +lsetxattr 383 +lstat 68 +lstat64 426 +madvise 75 +map_shadow_stack 563 +mbind 429 +membarrier 517 +memfd_create 512 +memfd_secret +memory_ordering +migrate_pages 449 +mincore 375 +mkdir 136 +mkdirat 451 +mknod 14 +mknodat 452 +mlock 314 +mlock2 518 +mlockall 316 +mmap 71 +mmap2 +modify_ldt +mount 302 +mount_setattr 552 +move_mount 539 +move_pages 472 +mprotect 74 +mq_getsetattr 437 +mq_notify 436 +mq_open 432 +mq_timedreceive 435 +mq_timedreceive_time64 +mq_timedsend 434 +mq_timedsend_time64 +mq_unlink 433 +mremap 341 +msgctl 200 +msgget 201 +msgrcv 202 +msgsnd 203 +msync 217 +multiplexer +munlock 315 +munlockall 317 +munmap 73 +name_to_handle_at 497 +nanosleep 340 +newfstatat +nice +old_adjtimex 303 +oldfstat +oldlstat +oldolduname +oldstat +oldumount 321 +olduname +open 45 +open_by_handle_at 498 +open_tree 538 +openat 450 +openat2 547 +or1k_atomic +osf_fstat 226 +osf_fstatfs 161 +osf_fstatfs64 228 +osf_getdirentries 159 +osf_getdomainname 165 +osf_getitimer 86 +osf_getrusage 117 +osf_getsysinfo 256 +osf_gettimeofday 116 +osf_lstat 225 +osf_mount 21 +osf_proplist_syscall 244 +osf_select 93 +osf_set_program_attributes 43 +osf_setitimer 83 +osf_setsysinfo 257 +osf_settimeofday 122 +osf_shmat 209 +osf_sigprocmask 48 +osf_sigstack 112 +osf_stat 224 +osf_statfs 160 +osf_statfs64 227 +osf_swapon 199 +osf_syscall 0 +osf_sysinfo 241 +osf_usleep_thread 
251 +osf_utimes 138 +osf_utsname 207 +osf_wait4 7 +pause +pciconfig_iobase 376 +pciconfig_read 345 +pciconfig_write 346 +perf_event_open 493 +perfctr +personality 324 +pidfd_getfd 548 +pidfd_open 544 +pidfd_send_signal 534 +pipe 42 +pipe2 488 +pivot_root 374 +pkey_alloc 525 +pkey_free 526 +pkey_mprotect 524 +poll 94 +ppoll 464 +ppoll_time64 +prctl 348 +pread64 349 +preadv 490 +preadv2 520 +prlimit64 496 +process_madvise 550 +process_mrelease 558 +process_vm_readv 504 +process_vm_writev 505 +pselect6 463 +pselect6_time64 +ptrace 26 +pwrite64 350 +pwritev 491 +pwritev2 521 +quotactl 148 +quotactl_fd 553 +read 3 +readahead 379 +readdir +readlink 58 +readlinkat 460 +readv 120 +reboot 311 +recv 102 +recvfrom 125 +recvmmsg 479 +recvmmsg_time64 +recvmsg 113 +remap_file_pages 410 +removexattr 391 +rename 128 +renameat 457 +renameat2 510 +request_key 440 +restart_syscall 412 +riscv_flush_icache +riscv_hwprobe +rmdir 137 +rseq 527 +rt_sigaction 352 +rt_sigpending 354 +rt_sigprocmask 353 +rt_sigqueueinfo 356 +rt_sigreturn 351 +rt_sigsuspend 357 +rt_sigtimedwait 355 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 492 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 335 +sched_get_priority_min 336 +sched_getaffinity 396 +sched_getattr 509 +sched_getparam 331 +sched_getscheduler 333 +sched_rr_get_interval 337 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 395 +sched_setattr 508 +sched_setparam 330 +sched_setscheduler 332 +sched_yield 334 +seccomp 514 +select 358 +semctl 204 +semget 205 +semop 206 +semtimedop 423 +semtimedop_time64 +send 101 +sendfile 370 +sendfile64 +sendmmsg 503 +sendmsg 114 +sendto 133 +set_mempolicy 431 +set_mempolicy_home_node 560 +set_robust_list 466 +set_thread_area +set_tid_address 411 +setdomainname 166 +setfsgid 326 +setfsgid32 +setfsuid 325 +setfsuid32 +setgid 132 +setgid32 +setgroups 80 +setgroups32 +sethae 301 +sethostname 88 +setitimer 
362 +setns 501 +setpgid 39 +setpgrp 82 +setpriority 96 +setregid 127 +setregid32 +setresgid 371 +setresgid32 +setresuid 343 +setresuid32 +setreuid 126 +setreuid32 +setrlimit 145 +setsid 147 +setsockopt 105 +settimeofday 360 +setuid 23 +setuid32 +setxattr 382 +sgetmask +shmat 209 +shmctl 210 +shmdt 211 +shmget 212 +shutdown 134 +sigaction 156 +sigaltstack 235 +signal +signalfd 476 +signalfd4 484 +sigpending 52 +sigprocmask +sigreturn 103 +sigsuspend 111 +socket 97 +socketcall +socketpair 135 +splice 468 +spu_create +spu_run +ssetmask +stat 67 +stat64 425 +statfs 328 +statfs64 528 +statx 522 +stime +subpage_prot +swapcontext +swapoff 304 +swapon 322 +switch_endian +symlink 57 +symlinkat 459 +sync 36 +sync_file_range 469 +sync_file_range2 +syncfs 500 +sys_debug_setcontext +syscall +sysfs 254 +sysinfo 318 +syslog 310 +sysmips +tee 470 +tgkill 424 +time +timer_create 414 +timer_delete 418 +timer_getoverrun 417 +timer_gettime 416 +timer_gettime64 +timer_settime 415 +timer_settime64 +timerfd 477 +timerfd_create 481 +timerfd_gettime 483 +timerfd_gettime64 +timerfd_settime 482 +timerfd_settime64 +times 323 +tkill 381 +truncate 129 +truncate64 +ugetrlimit +umask 60 +umount 22 +umount2 22 +uname 339 +unlink 10 +unlinkat 456 +unshare 465 +userfaultfd 516 +ustat 327 +utime +utimensat 475 +utimensat_time64 +utimes 363 +utrap_install +vfork 66 +vhangup 76 +vm86 +vm86old +vmsplice 471 +wait4 365 +waitid 438 +waitpid +write 4 +writev 121 diff --git a/src/basic/syscalls-arc.txt b/src/basic/syscalls-arc.txt new file mode 100644 index 0000000..951ef56 --- /dev/null +++ b/src/basic/syscalls-arc.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 202 +accept4 242 +access +acct 89 +add_key 217 +adjtimex 171 +alarm +arc_gettls 246 +arc_settls 245 +arc_usr_cmpxchg 248 +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 200 +bpf 280 +brk 214 +cachectl +cacheflush 244 +cachestat 451 +capget 90 +capset 91 +chdir 49 +chmod +chown +chown32 +chroot 51 +clock_adjtime 266 
+clock_adjtime64 405 +clock_getres 114 +clock_getres_time64 406 +clock_gettime 113 +clock_gettime64 403 +clock_nanosleep 115 +clock_nanosleep_time64 407 +clock_settime 112 +clock_settime64 404 +clone 220 +clone3 435 +close 57 +close_range 436 +connect 203 +copy_file_range 285 +creat +delete_module 106 +dipc +dup 23 +dup2 +dup3 24 +epoll_create +epoll_create1 20 +epoll_ctl 21 +epoll_ctl_old +epoll_pwait 22 +epoll_pwait2 441 +epoll_wait +epoll_wait_old +eventfd +eventfd2 19 +exec_with_loader +execv +execve 221 +execveat 281 +exit 93 +exit_group 94 +faccessat 48 +faccessat2 439 +fadvise64 +fadvise64_64 223 +fallocate 47 +fanotify_init 262 +fanotify_mark 263 +fchdir 50 +fchmod 52 +fchmodat 53 +fchmodat2 452 +fchown 55 +fchown32 +fchownat 54 +fcntl +fcntl64 25 +fdatasync 83 +fgetxattr 10 +finit_module 273 +flistxattr 13 +flock 32 +fork +fremovexattr 16 +fsconfig 431 +fsetxattr 7 +fsmount 432 +fsopen 430 +fspick 433 +fstat +fstat64 80 +fstatat64 79 +fstatfs +fstatfs64 44 +fsync 82 +ftruncate +ftruncate64 46 +futex 98 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat +get_mempolicy 236 +get_robust_list 100 +get_thread_area +getcpu 168 +getcwd 17 +getdents +getdents64 61 +getdomainname +getdtablesize +getegid 177 +getegid32 +geteuid 175 +geteuid32 +getgid 176 +getgid32 +getgroups 158 +getgroups32 +gethostname +getitimer 102 +getpagesize +getpeername 205 +getpgid 155 +getpgrp +getpid 172 +getppid 173 +getpriority 141 +getrandom 278 +getresgid 150 +getresgid32 +getresuid 148 +getresuid32 +getrlimit 163 +getrusage 165 +getsid 156 +getsockname 204 +getsockopt 209 +gettid 178 +gettimeofday 169 +getuid 174 +getuid32 +getxattr 8 +getxgid +getxpid +getxuid +init_module 105 +inotify_add_watch 27 +inotify_init +inotify_init1 26 +inotify_rm_watch 28 +io_cancel 3 +io_destroy 1 +io_getevents 4 +io_pgetevents 292 +io_pgetevents_time64 416 +io_setup 0 +io_submit 2 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 29 
+ioperm +iopl +ioprio_get 31 +ioprio_set 30 +ipc +kcmp 272 +kern_features +kexec_file_load 294 +kexec_load 104 +keyctl 219 +kill 129 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown +lchown32 +lgetxattr 9 +link +linkat 37 +listen 201 +listxattr 11 +llistxattr 12 +lookup_dcookie 18 +lremovexattr 15 +lseek +lsetxattr 6 +lstat +lstat64 +madvise 233 +map_shadow_stack 453 +mbind 235 +membarrier 283 +memfd_create 279 +memfd_secret +memory_ordering +migrate_pages 238 +mincore 232 +mkdir +mkdirat 34 +mknod +mknodat 33 +mlock 228 +mlock2 284 +mlockall 230 +mmap +mmap2 222 +modify_ldt +mount 40 +mount_setattr 442 +move_mount 429 +move_pages 239 +mprotect 226 +mq_getsetattr 185 +mq_notify 184 +mq_open 180 +mq_timedreceive 183 +mq_timedreceive_time64 419 +mq_timedsend 182 +mq_timedsend_time64 418 +mq_unlink 181 +mremap 216 +msgctl 187 +msgget 186 +msgrcv 188 +msgsnd 189 +msync 227 +multiplexer +munlock 229 +munlockall 231 +munmap 215 +name_to_handle_at 264 +nanosleep 101 +newfstatat +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open +open_by_handle_at 265 +open_tree 428 +openat 56 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 241 +perfctr +personality 92 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe +pipe2 59 +pivot_root 41 +pkey_alloc 289 +pkey_free 290 +pkey_mprotect 288 +poll +ppoll 73 +ppoll_time64 414 +prctl 167 +pread64 67 +preadv 69 +preadv2 286 +prlimit64 261 +process_madvise 440 
+process_mrelease 448 +process_vm_readv 270 +process_vm_writev 271 +pselect6 72 +pselect6_time64 413 +ptrace 117 +pwrite64 68 +pwritev 70 +pwritev2 287 +quotactl 60 +quotactl_fd 443 +read 63 +readahead 213 +readdir +readlink +readlinkat 78 +readv 65 +reboot 142 +recv +recvfrom 207 +recvmmsg 243 +recvmmsg_time64 417 +recvmsg 212 +remap_file_pages 234 +removexattr 14 +rename +renameat 38 +renameat2 276 +request_key 218 +restart_syscall 128 +riscv_flush_icache +riscv_hwprobe +rmdir +rseq 293 +rt_sigaction 134 +rt_sigpending 136 +rt_sigprocmask 135 +rt_sigqueueinfo 138 +rt_sigreturn 139 +rt_sigsuspend 133 +rt_sigtimedwait 137 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 240 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 125 +sched_get_priority_min 126 +sched_getaffinity 123 +sched_getattr 275 +sched_getparam 121 +sched_getscheduler 120 +sched_rr_get_interval 127 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 122 +sched_setattr 274 +sched_setparam 118 +sched_setscheduler 119 +sched_yield 124 +seccomp 277 +select +semctl 191 +semget 190 +semop 193 +semtimedop 192 +semtimedop_time64 420 +send +sendfile +sendfile64 71 +sendmmsg 269 +sendmsg 211 +sendto 206 +set_mempolicy 237 +set_mempolicy_home_node 450 +set_robust_list 99 +set_thread_area +set_tid_address 96 +setdomainname 162 +setfsgid 152 +setfsgid32 +setfsuid 151 +setfsuid32 +setgid 144 +setgid32 +setgroups 159 +setgroups32 +sethae +sethostname 161 +setitimer 103 +setns 268 +setpgid 154 +setpgrp +setpriority 140 +setregid 143 +setregid32 +setresgid 149 +setresgid32 +setresuid 147 +setresuid32 +setreuid 145 +setreuid32 +setrlimit 164 +setsid 157 +setsockopt 208 +settimeofday 170 +setuid 146 +setuid32 +setxattr 5 +sgetmask +shmat 196 +shmctl 195 +shmdt 197 +shmget 194 +shutdown 210 +sigaction +sigaltstack 132 +signal +signalfd +signalfd4 74 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 
198 +socketcall +socketpair 199 +splice 76 +spu_create +spu_run +ssetmask +stat +stat64 +statfs +statfs64 43 +statx 291 +stime +subpage_prot +swapcontext +swapoff 225 +swapon 224 +switch_endian +symlink +symlinkat 36 +sync 81 +sync_file_range 84 +sync_file_range2 +syncfs 267 +sys_debug_setcontext +syscall +sysfs 247 +sysinfo 179 +syslog 116 +sysmips +tee 77 +tgkill 131 +time +timer_create 107 +timer_delete 111 +timer_getoverrun 109 +timer_gettime 108 +timer_gettime64 408 +timer_settime 110 +timer_settime64 409 +timerfd +timerfd_create 85 +timerfd_gettime 87 +timerfd_gettime64 410 +timerfd_settime 86 +timerfd_settime64 411 +times 153 +tkill 130 +truncate +truncate64 45 +ugetrlimit +umask 166 +umount +umount2 39 +uname 160 +unlink +unlinkat 35 +unshare 97 +userfaultfd 282 +ustat +utime +utimensat 88 +utimensat_time64 412 +utimes +utrap_install +vfork +vhangup 58 +vm86 +vm86old +vmsplice 75 +wait4 260 +waitid 95 +waitpid +write 64 +writev 66 diff --git a/src/basic/syscalls-arm.txt b/src/basic/syscalls-arm.txt new file mode 100644 index 0000000..1c0e66f --- /dev/null +++ b/src/basic/syscalls-arm.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept 285 +accept4 366 +access 33 +acct 51 +add_key 309 +adjtimex 124 +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 270 +atomic_barrier +atomic_cmpxchg_32 +bind 282 +bpf 386 +brk 45 +cachectl +cacheflush +cachestat 451 +capget 184 +capset 185 +chdir 12 +chmod 15 +chown 182 +chown32 212 +chroot 61 +clock_adjtime 372 +clock_adjtime64 405 +clock_getres 264 +clock_getres_time64 406 +clock_gettime 263 +clock_gettime64 403 +clock_nanosleep 265 +clock_nanosleep_time64 407 +clock_settime 262 +clock_settime64 404 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 283 +copy_file_range 391 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 358 +epoll_create 250 +epoll_create1 357 +epoll_ctl 251 +epoll_ctl_old +epoll_pwait 346 +epoll_pwait2 441 +epoll_wait 252 +epoll_wait_old +eventfd 351 
+eventfd2 356 +exec_with_loader +execv +execve 11 +execveat 387 +exit 1 +exit_group 248 +faccessat 334 +faccessat2 439 +fadvise64 +fadvise64_64 +fallocate 352 +fanotify_init 367 +fanotify_mark 368 +fchdir 133 +fchmod 94 +fchmodat 333 +fchmodat2 452 +fchown 95 +fchown32 207 +fchownat 325 +fcntl 55 +fcntl64 221 +fdatasync 148 +fgetxattr 231 +finit_module 379 +flistxattr 234 +flock 143 +fork 2 +fremovexattr 237 +fsconfig 431 +fsetxattr 228 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 197 +fstatat64 327 +fstatfs 100 +fstatfs64 267 +fsync 118 +ftruncate 93 +ftruncate64 194 +futex 240 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 326 +get_mempolicy 320 +get_robust_list 339 +get_thread_area +getcpu 345 +getcwd 183 +getdents 141 +getdents64 217 +getdomainname +getdtablesize +getegid 50 +getegid32 202 +geteuid 49 +geteuid32 201 +getgid 47 +getgid32 200 +getgroups 80 +getgroups32 205 +gethostname +getitimer 105 +getpagesize +getpeername 287 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 384 +getresgid 171 +getresgid32 211 +getresuid 165 +getresuid32 209 +getrlimit +getrusage 77 +getsid 147 +getsockname 286 +getsockopt 295 +gettid 224 +gettimeofday 78 +getuid 24 +getuid32 199 +getxattr 229 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 317 +inotify_init 316 +inotify_init1 360 +inotify_rm_watch 318 +io_cancel 247 +io_destroy 244 +io_getevents 245 +io_pgetevents 399 +io_pgetevents_time64 416 +io_setup 243 +io_submit 246 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm +iopl +ioprio_get 315 +ioprio_set 314 +ipc +kcmp 378 +kern_features +kexec_file_load 401 +kexec_load 347 +keyctl 311 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 198 +lgetxattr 230 +link 9 +linkat 330 +listen 284 +listxattr 232 +llistxattr 233 +lookup_dcookie 249 +lremovexattr 236 +lseek 19 +lsetxattr 227 +lstat 
107 +lstat64 196 +madvise 220 +map_shadow_stack 453 +mbind 319 +membarrier 389 +memfd_create 385 +memfd_secret +memory_ordering +migrate_pages 400 +mincore 219 +mkdir 39 +mkdirat 323 +mknod 14 +mknodat 324 +mlock 150 +mlock2 390 +mlockall 152 +mmap +mmap2 192 +modify_ldt +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 344 +mprotect 125 +mq_getsetattr 279 +mq_notify 278 +mq_open 274 +mq_timedreceive 277 +mq_timedreceive_time64 419 +mq_timedsend 276 +mq_timedsend_time64 418 +mq_unlink 275 +mremap 163 +msgctl 304 +msgget 303 +msgrcv 302 +msgsnd 301 +msync 144 +multiplexer +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 370 +nanosleep 162 +newfstatat +nice 34 +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 5 +open_by_handle_at 371 +open_tree 428 +openat 322 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase 271 +pciconfig_read 272 +pciconfig_write 273 +perf_event_open 364 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 359 +pivot_root 218 +pkey_alloc 395 +pkey_free 396 +pkey_mprotect 394 +poll 168 +ppoll 336 +ppoll_time64 414 +prctl 172 +pread64 180 +preadv 361 +preadv2 392 +prlimit64 369 +process_madvise 440 +process_mrelease 448 +process_vm_readv 376 +process_vm_writev 377 +pselect6 335 +pselect6_time64 413 +ptrace 26 +pwrite64 181 +pwritev 362 +pwritev2 393 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 225 +readdir +readlink 85 +readlinkat 332 +readv 145 +reboot 88 +recv 291 +recvfrom 292 +recvmmsg 365 +recvmmsg_time64 
417 +recvmsg 297 +remap_file_pages 253 +removexattr 235 +rename 38 +renameat 329 +renameat2 382 +request_key 310 +restart_syscall 0 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 398 +rt_sigaction 174 +rt_sigpending 176 +rt_sigprocmask 175 +rt_sigqueueinfo 178 +rt_sigreturn 173 +rt_sigsuspend 179 +rt_sigtimedwait 177 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 363 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 242 +sched_getattr 381 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 241 +sched_setattr 380 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 383 +select +semctl 300 +semget 299 +semop 298 +semtimedop 312 +semtimedop_time64 420 +send 289 +sendfile 187 +sendfile64 239 +sendmmsg 374 +sendmsg 296 +sendto 290 +set_mempolicy 321 +set_mempolicy_home_node 450 +set_robust_list 338 +set_thread_area +set_tid_address 256 +setdomainname 121 +setfsgid 139 +setfsgid32 216 +setfsuid 138 +setfsuid32 215 +setgid 46 +setgid32 214 +setgroups 81 +setgroups32 206 +sethae +sethostname 74 +setitimer 104 +setns 375 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 204 +setresgid 170 +setresgid32 210 +setresuid 164 +setresuid32 208 +setreuid 70 +setreuid32 203 +setrlimit 75 +setsid 66 +setsockopt 294 +settimeofday 79 +setuid 23 +setuid32 213 +setxattr 226 +sgetmask +shmat 305 +shmctl 308 +shmdt 306 +shmget 307 +shutdown 293 +sigaction 67 +sigaltstack 186 +signal +signalfd 349 +signalfd4 355 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 281 +socketcall +socketpair 288 +splice 340 +spu_create +spu_run +ssetmask +stat 106 +stat64 195 +statfs 99 +statfs64 266 +statx 397 +stime +subpage_prot +swapcontext +swapoff 115 +swapon 87 +switch_endian +symlink 83 +symlinkat 331 +sync 36 
+sync_file_range +sync_file_range2 341 +syncfs 373 +sys_debug_setcontext +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 342 +tgkill 268 +time +timer_create 257 +timer_delete 261 +timer_getoverrun 260 +timer_gettime 259 +timer_gettime64 408 +timer_settime 258 +timer_settime64 409 +timerfd +timerfd_create 350 +timerfd_gettime 354 +timerfd_gettime64 410 +timerfd_settime 353 +timerfd_settime64 411 +times 43 +tkill 238 +truncate 92 +truncate64 193 +ugetrlimit 191 +umask 60 +umount +umount2 52 +uname 122 +unlink 10 +unlinkat 328 +unshare 337 +userfaultfd 388 +ustat 62 +utime +utimensat 348 +utimensat_time64 412 +utimes 269 +utrap_install +vfork 190 +vhangup 111 +vm86 +vm86old +vmsplice 343 +wait4 114 +waitid 280 +waitpid +write 4 +writev 146 diff --git a/src/basic/syscalls-arm64.txt b/src/basic/syscalls-arm64.txt new file mode 100644 index 0000000..b8602a1 --- /dev/null +++ b/src/basic/syscalls-arm64.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 202 +accept4 242 +access +acct 89 +add_key 217 +adjtimex 171 +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 200 +bpf 280 +brk 214 +cachectl +cacheflush +cachestat 451 +capget 90 +capset 91 +chdir 49 +chmod +chown +chown32 +chroot 51 +clock_adjtime 266 +clock_adjtime64 +clock_getres 114 +clock_getres_time64 +clock_gettime 113 +clock_gettime64 +clock_nanosleep 115 +clock_nanosleep_time64 +clock_settime 112 +clock_settime64 +clone 220 +clone3 435 +close 57 +close_range 436 +connect 203 +copy_file_range 285 +creat +delete_module 106 +dipc +dup 23 +dup2 +dup3 24 +epoll_create +epoll_create1 20 +epoll_ctl 21 +epoll_ctl_old +epoll_pwait 22 +epoll_pwait2 441 +epoll_wait +epoll_wait_old +eventfd +eventfd2 19 +exec_with_loader +execv +execve 221 +execveat 281 +exit 93 +exit_group 94 +faccessat 48 +faccessat2 439 +fadvise64 223 +fadvise64_64 +fallocate 47 +fanotify_init 262 +fanotify_mark 263 +fchdir 50 +fchmod 52 +fchmodat 53 +fchmodat2 452 +fchown 55 
+fchown32 +fchownat 54 +fcntl 25 +fcntl64 +fdatasync 83 +fgetxattr 10 +finit_module 273 +flistxattr 13 +flock 32 +fork +fremovexattr 16 +fsconfig 431 +fsetxattr 7 +fsmount 432 +fsopen 430 +fspick 433 +fstat 80 +fstat64 +fstatat64 +fstatfs 44 +fstatfs64 +fsync 82 +ftruncate 46 +ftruncate64 +futex 98 +futex_requeue 456 +futex_time64 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat +get_mempolicy 236 +get_robust_list 100 +get_thread_area +getcpu 168 +getcwd 17 +getdents +getdents64 61 +getdomainname +getdtablesize +getegid 177 +getegid32 +geteuid 175 +geteuid32 +getgid 176 +getgid32 +getgroups 158 +getgroups32 +gethostname +getitimer 102 +getpagesize +getpeername 205 +getpgid 155 +getpgrp +getpid 172 +getppid 173 +getpriority 141 +getrandom 278 +getresgid 150 +getresgid32 +getresuid 148 +getresuid32 +getrlimit 163 +getrusage 165 +getsid 156 +getsockname 204 +getsockopt 209 +gettid 178 +gettimeofday 169 +getuid 174 +getuid32 +getxattr 8 +getxgid +getxpid +getxuid +init_module 105 +inotify_add_watch 27 +inotify_init +inotify_init1 26 +inotify_rm_watch 28 +io_cancel 3 +io_destroy 1 +io_getevents 4 +io_pgetevents 292 +io_pgetevents_time64 +io_setup 0 +io_submit 2 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 29 +ioperm +iopl +ioprio_get 31 +ioprio_set 30 +ipc +kcmp 272 +kern_features +kexec_file_load 294 +kexec_load 104 +keyctl 219 +kill 129 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown +lchown32 +lgetxattr 9 +link +linkat 37 +listen 201 +listxattr 11 +llistxattr 12 +lookup_dcookie 18 +lremovexattr 15 +lseek 62 +lsetxattr 6 +lstat +lstat64 +madvise 233 +map_shadow_stack 453 +mbind 235 +membarrier 283 +memfd_create 279 +memfd_secret 447 +memory_ordering +migrate_pages 238 +mincore 232 +mkdir +mkdirat 34 +mknod +mknodat 33 +mlock 228 +mlock2 284 +mlockall 230 +mmap 222 +mmap2 +modify_ldt +mount 40 +mount_setattr 442 +move_mount 429 +move_pages 239 +mprotect 226 +mq_getsetattr 185 +mq_notify 184 
+mq_open 180 +mq_timedreceive 183 +mq_timedreceive_time64 +mq_timedsend 182 +mq_timedsend_time64 +mq_unlink 181 +mremap 216 +msgctl 187 +msgget 186 +msgrcv 188 +msgsnd 189 +msync 227 +multiplexer +munlock 229 +munlockall 231 +munmap 215 +name_to_handle_at 264 +nanosleep 101 +newfstatat 79 +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open +open_by_handle_at 265 +open_tree 428 +openat 56 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 241 +perfctr +personality 92 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe +pipe2 59 +pivot_root 41 +pkey_alloc 289 +pkey_free 290 +pkey_mprotect 288 +poll +ppoll 73 +ppoll_time64 +prctl 167 +pread64 67 +preadv 69 +preadv2 286 +prlimit64 261 +process_madvise 440 +process_mrelease 448 +process_vm_readv 270 +process_vm_writev 271 +pselect6 72 +pselect6_time64 +ptrace 117 +pwrite64 68 +pwritev 70 +pwritev2 287 +quotactl 60 +quotactl_fd 443 +read 63 +readahead 213 +readdir +readlink +readlinkat 78 +readv 65 +reboot 142 +recv +recvfrom 207 +recvmmsg 243 +recvmmsg_time64 +recvmsg 212 +remap_file_pages 234 +removexattr 14 +rename +renameat 38 +renameat2 276 +request_key 218 +restart_syscall 128 +riscv_flush_icache +riscv_hwprobe +rmdir +rseq 293 +rt_sigaction 134 +rt_sigpending 136 +rt_sigprocmask 135 +rt_sigqueueinfo 138 +rt_sigreturn 139 +rt_sigsuspend 133 +rt_sigtimedwait 137 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 240 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write 
+s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 125 +sched_get_priority_min 126 +sched_getaffinity 123 +sched_getattr 275 +sched_getparam 121 +sched_getscheduler 120 +sched_rr_get_interval 127 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 122 +sched_setattr 274 +sched_setparam 118 +sched_setscheduler 119 +sched_yield 124 +seccomp 277 +select +semctl 191 +semget 190 +semop 193 +semtimedop 192 +semtimedop_time64 +send +sendfile 71 +sendfile64 +sendmmsg 269 +sendmsg 211 +sendto 206 +set_mempolicy 237 +set_mempolicy_home_node 450 +set_robust_list 99 +set_thread_area +set_tid_address 96 +setdomainname 162 +setfsgid 152 +setfsgid32 +setfsuid 151 +setfsuid32 +setgid 144 +setgid32 +setgroups 159 +setgroups32 +sethae +sethostname 161 +setitimer 103 +setns 268 +setpgid 154 +setpgrp +setpriority 140 +setregid 143 +setregid32 +setresgid 149 +setresgid32 +setresuid 147 +setresuid32 +setreuid 145 +setreuid32 +setrlimit 164 +setsid 157 +setsockopt 208 +settimeofday 170 +setuid 146 +setuid32 +setxattr 5 +sgetmask +shmat 196 +shmctl 195 +shmdt 197 +shmget 194 +shutdown 210 +sigaction +sigaltstack 132 +signal +signalfd +signalfd4 74 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 198 +socketcall +socketpair 199 +splice 76 +spu_create +spu_run +ssetmask +stat +stat64 +statfs 43 +statfs64 +statx 291 +stime +subpage_prot +swapcontext +swapoff 225 +swapon 224 +switch_endian +symlink +symlinkat 36 +sync 81 +sync_file_range 84 +sync_file_range2 +syncfs 267 +sys_debug_setcontext +syscall +sysfs +sysinfo 179 +syslog 116 +sysmips +tee 77 +tgkill 131 +time +timer_create 107 +timer_delete 111 +timer_getoverrun 109 +timer_gettime 108 +timer_gettime64 +timer_settime 110 +timer_settime64 +timerfd +timerfd_create 85 +timerfd_gettime 87 +timerfd_gettime64 +timerfd_settime 86 +timerfd_settime64 +times 153 +tkill 130 +truncate 45 +truncate64 +ugetrlimit +umask 166 +umount +umount2 39 +uname 160 +unlink +unlinkat 35 +unshare 97 +userfaultfd 
282 +ustat +utime +utimensat 88 +utimensat_time64 +utimes +utrap_install +vfork +vhangup 58 +vm86 +vm86old +vmsplice 75 +wait4 260 +waitid 95 +waitpid +write 64 +writev 66 diff --git a/src/basic/syscalls-i386.txt b/src/basic/syscalls-i386.txt new file mode 100644 index 0000000..6d0c57f --- /dev/null +++ b/src/basic/syscalls-i386.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept +accept4 364 +access 33 +acct 51 +add_key 286 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl 384 +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 361 +bpf 357 +brk 45 +cachectl +cacheflush +cachestat 451 +capget 184 +capset 185 +chdir 12 +chmod 15 +chown 182 +chown32 212 +chroot 61 +clock_adjtime 343 +clock_adjtime64 405 +clock_getres 266 +clock_getres_time64 406 +clock_gettime 265 +clock_gettime64 403 +clock_nanosleep 267 +clock_nanosleep_time64 407 +clock_settime 264 +clock_settime64 404 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 362 +copy_file_range 377 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 330 +epoll_create 254 +epoll_create1 329 +epoll_ctl 255 +epoll_ctl_old +epoll_pwait 319 +epoll_pwait2 441 +epoll_wait 256 +epoll_wait_old +eventfd 323 +eventfd2 328 +exec_with_loader +execv +execve 11 +execveat 358 +exit 1 +exit_group 252 +faccessat 307 +faccessat2 439 +fadvise64 250 +fadvise64_64 272 +fallocate 324 +fanotify_init 338 +fanotify_mark 339 +fchdir 133 +fchmod 94 +fchmodat 306 +fchmodat2 452 +fchown 95 +fchown32 207 +fchownat 298 +fcntl 55 +fcntl64 221 +fdatasync 148 +fgetxattr 231 +finit_module 350 +flistxattr 234 +flock 143 +fork 2 +fremovexattr 237 +fsconfig 431 +fsetxattr 228 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 197 +fstatat64 300 +fstatfs 100 +fstatfs64 269 +fsync 118 +ftruncate 93 +ftruncate64 194 +futex 240 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 299 +get_mempolicy 275 +get_robust_list 312 +get_thread_area 244 +getcpu 318 
+getcwd 183 +getdents 141 +getdents64 220 +getdomainname +getdtablesize +getegid 50 +getegid32 202 +geteuid 49 +geteuid32 201 +getgid 47 +getgid32 200 +getgroups 80 +getgroups32 205 +gethostname +getitimer 105 +getpagesize +getpeername 368 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 355 +getresgid 171 +getresgid32 211 +getresuid 165 +getresuid32 209 +getrlimit 76 +getrusage 77 +getsid 147 +getsockname 367 +getsockopt 365 +gettid 224 +gettimeofday 78 +getuid 24 +getuid32 199 +getxattr 229 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 292 +inotify_init 291 +inotify_init1 332 +inotify_rm_watch 293 +io_cancel 249 +io_destroy 246 +io_getevents 247 +io_pgetevents 385 +io_pgetevents_time64 416 +io_setup 245 +io_submit 248 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm 101 +iopl 110 +ioprio_get 290 +ioprio_set 289 +ipc 117 +kcmp 349 +kern_features +kexec_file_load +kexec_load 283 +keyctl 288 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 198 +lgetxattr 230 +link 9 +linkat 303 +listen 363 +listxattr 232 +llistxattr 233 +lookup_dcookie 253 +lremovexattr 236 +lseek 19 +lsetxattr 227 +lstat 107 +lstat64 196 +madvise 219 +map_shadow_stack 453 +mbind 274 +membarrier 375 +memfd_create 356 +memfd_secret 447 +memory_ordering +migrate_pages 294 +mincore 218 +mkdir 39 +mkdirat 296 +mknod 14 +mknodat 297 +mlock 150 +mlock2 376 +mlockall 152 +mmap 90 +mmap2 192 +modify_ldt 123 +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 317 +mprotect 125 +mq_getsetattr 282 +mq_notify 281 +mq_open 277 +mq_timedreceive 280 +mq_timedreceive_time64 419 +mq_timedsend 279 +mq_timedsend_time64 418 +mq_unlink 278 +mremap 163 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 144 +multiplexer +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 341 +nanosleep 162 +newfstatat +nice 34 +old_adjtimex +oldfstat 28 +oldlstat 84 +oldolduname 59 
+oldstat 18 +oldumount +olduname 109 +open 5 +open_by_handle_at 342 +open_tree 428 +openat 295 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 336 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 331 +pivot_root 217 +pkey_alloc 381 +pkey_free 382 +pkey_mprotect 380 +poll 168 +ppoll 309 +ppoll_time64 414 +prctl 172 +pread64 180 +preadv 333 +preadv2 378 +prlimit64 340 +process_madvise 440 +process_mrelease 448 +process_vm_readv 347 +process_vm_writev 348 +pselect6 308 +pselect6_time64 413 +ptrace 26 +pwrite64 181 +pwritev 334 +pwritev2 379 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 225 +readdir 89 +readlink 85 +readlinkat 305 +readv 145 +reboot 88 +recv +recvfrom 371 +recvmmsg 337 +recvmmsg_time64 417 +recvmsg 372 +remap_file_pages 257 +removexattr 235 +rename 38 +renameat 302 +renameat2 353 +request_key 287 +restart_syscall 0 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 386 +rt_sigaction 174 +rt_sigpending 176 +rt_sigprocmask 175 +rt_sigqueueinfo 178 +rt_sigreturn 173 +rt_sigsuspend 179 +rt_sigtimedwait 177 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 335 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 242 +sched_getattr 352 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 
241 +sched_setattr 351 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 354 +select 82 +semctl 394 +semget 393 +semop +semtimedop +semtimedop_time64 420 +send +sendfile 187 +sendfile64 239 +sendmmsg 345 +sendmsg 370 +sendto 369 +set_mempolicy 276 +set_mempolicy_home_node 450 +set_robust_list 311 +set_thread_area 243 +set_tid_address 258 +setdomainname 121 +setfsgid 139 +setfsgid32 216 +setfsuid 138 +setfsuid32 215 +setgid 46 +setgid32 214 +setgroups 81 +setgroups32 206 +sethae +sethostname 74 +setitimer 104 +setns 346 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 204 +setresgid 170 +setresgid32 210 +setresuid 164 +setresuid32 208 +setreuid 70 +setreuid32 203 +setrlimit 75 +setsid 66 +setsockopt 366 +settimeofday 79 +setuid 23 +setuid32 213 +setxattr 226 +sgetmask 68 +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 373 +sigaction 67 +sigaltstack 186 +signal 48 +signalfd 321 +signalfd4 327 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 359 +socketcall 102 +socketpair 360 +splice 313 +spu_create +spu_run +ssetmask 69 +stat 106 +stat64 195 +statfs 99 +statfs64 268 +statx 383 +stime 25 +subpage_prot +swapcontext +swapoff 115 +swapon 87 +switch_endian +symlink 83 +symlinkat 304 +sync 36 +sync_file_range 314 +sync_file_range2 +syncfs 344 +sys_debug_setcontext +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 315 +tgkill 270 +time 13 +timer_create 259 +timer_delete 263 +timer_getoverrun 262 +timer_gettime 261 +timer_gettime64 408 +timer_settime 260 +timer_settime64 409 +timerfd +timerfd_create 322 +timerfd_gettime 326 +timerfd_gettime64 410 +timerfd_settime 325 +timerfd_settime64 411 +times 43 +tkill 238 +truncate 92 +truncate64 193 +ugetrlimit 191 +umask 60 +umount 22 +umount2 52 +uname 122 +unlink 10 +unlinkat 301 +unshare 310 +userfaultfd 374 +ustat 62 +utime 30 +utimensat 320 +utimensat_time64 412 +utimes 271 +utrap_install +vfork 190 +vhangup 111 +vm86 166 +vm86old 113 +vmsplice 316 +wait4 114 
+waitid 284 +waitpid 7 +write 4 +writev 146 diff --git a/src/basic/syscalls-ia64.txt b/src/basic/syscalls-ia64.txt new file mode 100644 index 0000000..e6adcce --- /dev/null +++ b/src/basic/syscalls-ia64.txt @@ -0,0 +1,604 @@ +_llseek +_newselect +_sysctl 1150 +accept 1194 +accept4 1334 +access 1049 +acct 1064 +add_key 1271 +adjtimex 1131 +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bdflush 1138 +bind 1191 +bpf 1341 +brk 1060 +cachectl +cacheflush +cachestat 1475 +capget 1185 +capset 1186 +chdir 1034 +chmod 1038 +chown 1039 +chown32 +chroot 1068 +clock_adjtime 1328 +clock_adjtime64 +clock_getres 1255 +clock_getres_time64 +clock_gettime 1254 +clock_gettime64 +clock_nanosleep 1256 +clock_nanosleep_time64 +clock_settime 1253 +clock_settime64 +clone 1128 +clone2 1213 +clone3 +close 1029 +close_range 1460 +connect 1192 +copy_file_range 1347 +creat 1030 +create_module +delete_module 1134 +dipc +dup 1057 +dup2 1070 +dup3 1316 +epoll_create 1243 +epoll_create1 1315 +epoll_ctl 1244 +epoll_ctl_old +epoll_pwait 1305 +epoll_pwait2 1465 +epoll_wait 1245 +epoll_wait_old +eventfd 1309 +eventfd2 1314 +exec_with_loader +execv +execve 1033 +execveat 1342 +exit 1025 +exit_group 1236 +faccessat 1293 +faccessat2 1463 +fadvise64 1234 +fadvise64_64 +fallocate 1303 +fanotify_init 1323 +fanotify_mark 1324 +fchdir 1035 +fchmod 1099 +fchmodat 1292 +fchmodat2 1476 +fchown 1100 +fchown32 +fchownat 1284 +fcntl 1066 +fcntl64 +fdatasync 1052 +fgetxattr 1222 +finit_module 1335 +flistxattr 1225 +flock 1145 +fork +fremovexattr 1228 +fsconfig 1455 +fsetxattr 1219 +fsmount 1456 +fsopen 1454 +fspick 1457 +fstat 1212 +fstat64 +fstatat64 +fstatfs 1104 +fstatfs64 1257 +fsync 1051 +ftruncate 1098 +ftruncate64 +futex 1230 +futex_time64 +futex_waitv 1473 +futimesat 1285 +get_kernel_syms +get_mempolicy 1260 +get_robust_list 1299 +get_thread_area +getcpu 1304 +getcwd 1184 +getdents 1144 +getdents64 1214 +getdomainname +getdtablesize +getegid 
1063 +getegid32 +geteuid 1047 +geteuid32 +getgid 1062 +getgid32 +getgroups 1077 +getgroups32 +gethostname +getitimer 1119 +getpagesize +getpeername 1196 +getpgid 1079 +getpgrp +getpid 1041 +getpmsg 1188 +getppid 1042 +getpriority 1101 +getrandom 1339 +getresgid 1075 +getresgid32 +getresuid 1073 +getresuid32 +getrlimit 1085 +getrusage 1086 +getsid 1082 +getsockname 1195 +getsockopt 1204 +gettid 1105 +gettimeofday 1087 +getuid 1046 +getuid32 +getunwind 1215 +getxattr 1220 +getxgid +getxpid +getxuid +idle +init_module 1133 +inotify_add_watch 1278 +inotify_init 1277 +inotify_init1 1318 +inotify_rm_watch 1279 +io_cancel 1242 +io_destroy 1239 +io_getevents 1240 +io_pgetevents 1351 +io_pgetevents_time64 +io_setup 1238 +io_submit 1241 +io_uring_enter 1450 +io_uring_register 1451 +io_uring_setup 1449 +ioctl 1065 +ioperm +iopl +ioprio_get 1275 +ioprio_set 1274 +ipc +kcmp 1345 +kern_features +kexec_file_load +kexec_load 1268 +keyctl 1273 +kill 1053 +landlock_add_rule 1469 +landlock_create_ruleset 1468 +landlock_restrict_self 1470 +lchown 1124 +lchown32 +lgetxattr 1221 +link 1031 +linkat 1289 +listen 1193 +listxattr 1223 +llistxattr 1224 +lookup_dcookie 1237 +lremovexattr 1227 +lseek 1040 +lsetxattr 1218 +lstat 1211 +lstat64 +madvise 1209 +map_shadow_stack +mbind 1259 +membarrier 1344 +memfd_create 1340 +memfd_secret +memory_ordering +migrate_pages 1280 +mincore 1208 +mkdir 1055 +mkdirat 1282 +mknod 1037 +mknodat 1283 +mlock 1153 +mlock2 1346 +mlockall 1154 +mmap 1151 +mmap2 1172 +modify_ldt +mount 1043 +mount_setattr 1466 +move_mount 1453 +move_pages 1276 +mprotect 1155 +mq_getsetattr 1267 +mq_notify 1266 +mq_open 1262 +mq_timedreceive 1265 +mq_timedreceive_time64 +mq_timedsend 1264 +mq_timedsend_time64 +mq_unlink 1263 +mremap 1156 +msgctl 1112 +msgget 1109 +msgrcv 1111 +msgsnd 1110 +msync 1157 +multiplexer +munlock 1158 +munlockall 1159 +munmap 1152 +name_to_handle_at 1326 +nanosleep 1168 +newfstatat 1286 +nfsservctl 1169 +nice +old_adjtimex +old_getpagesize 1171 +oldfstat 
+oldlstat +oldolduname +oldstat +oldumount +olduname +open 1028 +open_by_handle_at 1327 +open_tree 1452 +openat 1281 +openat2 1461 +or1k_atomic +osf_adjtime +osf_afs_syscall +osf_alt_plock +osf_alt_setsid +osf_alt_sigpending +osf_asynch_daemon +osf_audcntl +osf_audgen +osf_chflags +osf_execve +osf_exportfs +osf_fchflags +osf_fdatasync +osf_fpathconf +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_fuser +osf_getaddressconf +osf_getdirentries +osf_getdomainname +osf_getfh +osf_getfsstat +osf_gethostid +osf_getitimer +osf_getlogin +osf_getmnt +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_kloadcall +osf_kmodcall +osf_lstat +osf_memcntl +osf_mincore +osf_mount +osf_mremap +osf_msfs_syscall +osf_msleep +osf_mvalid +osf_mwakeup +osf_naccept +osf_nfssvc +osf_ngetpeername +osf_ngetsockname +osf_nrecvfrom +osf_nrecvmsg +osf_nsendmsg +osf_ntp_adjtime +osf_ntp_gettime +osf_old_creat +osf_old_fstat +osf_old_getpgrp +osf_old_killpg +osf_old_lstat +osf_old_open +osf_old_sigaction +osf_old_sigblock +osf_old_sigreturn +osf_old_sigsetmask +osf_old_sigvec +osf_old_stat +osf_old_vadvise +osf_old_vtrace +osf_old_wait +osf_oldquota +osf_pathconf +osf_pid_block +osf_pid_unblock +osf_plock +osf_priocntlset +osf_profil +osf_proplist_syscall +osf_reboot +osf_revoke +osf_sbrk +osf_security +osf_select +osf_set_program_attributes +osf_set_speculative +osf_sethostid +osf_setitimer +osf_setlogin +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_signal +osf_sigprocmask +osf_sigsendset +osf_sigstack +osf_sigwaitprim +osf_sstk +osf_stat +osf_statfs +osf_statfs64 +osf_subsys_info +osf_swapctl +osf_swapon +osf_syscall +osf_sysinfo +osf_table +osf_uadmin +osf_usleep_thread +osf_uswitch +osf_utc_adjtime +osf_utc_gettime +osf_utimes +osf_utsname +osf_wait4 +osf_waitid +pause +pciconfig_iobase +pciconfig_read 1173 +pciconfig_write 1174 +perf_event_open 1352 +perfctr +personality 1140 +pidfd_getfd 1462 +pidfd_open 1458 +pidfd_send_signal 1448 +pipe 1058 +pipe2 1317 +pivot_root 1207 +pkey_alloc 1355 
+pkey_free 1356 +pkey_mprotect 1354 +poll 1090 +ppoll 1295 +ppoll_time64 +prctl 1170 +pread64 1148 +preadv 1319 +preadv2 1348 +prlimit64 1325 +process_madvise 1464 +process_mrelease 1472 +process_vm_readv 1332 +process_vm_writev 1333 +pselect6 1294 +pselect6_time64 +ptrace 1048 +pwrite64 1149 +pwritev 1320 +pwritev2 1349 +query_module +quotactl 1137 +quotactl_fd 1467 +read 1026 +readahead 1216 +readdir +readlink 1092 +readlinkat 1291 +readv 1146 +reboot 1096 +recv 1200 +recvfrom 1201 +recvmmsg 1322 +recvmmsg_time64 +recvmsg 1206 +remap_file_pages 1125 +removexattr 1226 +rename 1054 +renameat 1288 +renameat2 1338 +request_key 1272 +restart_syscall 1246 +riscv_flush_icache +riscv_hwprobe +rmdir 1056 +rseq 1357 +rt_sigaction 1177 +rt_sigpending 1178 +rt_sigprocmask 1179 +rt_sigqueueinfo 1180 +rt_sigreturn 1181 +rt_sigsuspend 1182 +rt_sigtimedwait 1183 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 1321 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 1165 +sched_get_priority_min 1166 +sched_getaffinity 1232 +sched_getattr 1337 +sched_getparam 1160 +sched_getscheduler 1162 +sched_rr_get_interval 1167 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 1231 +sched_setattr 1336 +sched_setparam 1161 +sched_setscheduler 1163 +sched_yield 1164 +seccomp 1353 +select 1089 +semctl 1108 +semget 1106 +semop 1107 +semtimedop 1247 +semtimedop_time64 +send 1198 +sendfile 1187 +sendfile64 +sendmmsg 1331 +sendmsg 1205 +sendto 1199 +set_mempolicy 1261 +set_mempolicy_home_node 1474 +set_robust_list 1298 +set_thread_area +set_tid_address 1233 +setdomainname 1129 +setfsgid 1143 +setfsgid32 +setfsuid 1142 +setfsuid32 +setgid 1061 +setgid32 +setgroups 1078 +setgroups32 +sethae +sethostname 1083 +setitimer 1118 +setns 1330 +setpgid 1080 +setpgrp +setpriority 1102 +setregid 1072 +setregid32 +setresgid 1076 +setresgid32 +setresuid 1074 +setresuid32 +setreuid 1071 +setreuid32 +setrlimit 
1084 +setsid 1081 +setsockopt 1203 +settimeofday 1088 +setuid 1045 +setuid32 +setxattr 1217 +sgetmask +shmat 1114 +shmctl 1116 +shmdt 1115 +shmget 1113 +shutdown 1202 +sigaction +sigaltstack 1176 +signal +signalfd 1307 +signalfd4 1313 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 1190 +socketcall +socketpair 1197 +splice 1297 +spu_create +spu_run +ssetmask +stat 1210 +stat64 +statfs 1103 +statfs64 1258 +statx 1350 +stime +subpage_prot +swapcontext +swapoff 1095 +swapon 1094 +switch_endian +symlink 1091 +symlinkat 1290 +sync 1050 +sync_file_range 1300 +sync_file_range2 +syncfs 1329 +sys_debug_setcontext +syscall +sysfs 1139 +sysinfo 1127 +syslog 1117 +sysmips +tee 1301 +tgkill 1235 +time +timer_create 1248 +timer_delete 1252 +timer_getoverrun 1251 +timer_gettime 1250 +timer_gettime64 +timer_settime 1249 +timer_settime64 +timerfd 1308 +timerfd_create 1310 +timerfd_gettime 1312 +timerfd_gettime64 +timerfd_settime 1311 +timerfd_settime64 +times 1059 +tkill 1229 +truncate 1097 +truncate64 +ugetrlimit +umask 1067 +umount 1044 +umount2 1044 +uname 1130 +unlink 1032 +unlinkat 1287 +unshare 1296 +uselib 1093 +userfaultfd 1343 +ustat 1069 +utime +utimensat 1306 +utimensat_time64 +utimes 1036 +utrap_install +vfork +vhangup 1123 +vm86 +vm86old +vmsplice 1302 +wait4 1126 +waitid 1270 +waitpid +write 1027 +writev 1147 diff --git a/src/basic/syscalls-loongarch64.txt b/src/basic/syscalls-loongarch64.txt new file mode 100644 index 0000000..34a45cb --- /dev/null +++ b/src/basic/syscalls-loongarch64.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 202 +accept4 242 +access +acct 89 +add_key 217 +adjtimex 171 +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 200 +bpf 280 +brk 214 +cachectl +cacheflush +cachestat 451 +capget 90 +capset 91 +chdir 49 +chmod +chown +chown32 +chroot 51 +clock_adjtime 266 +clock_adjtime64 +clock_getres 114 +clock_getres_time64 +clock_gettime 113 +clock_gettime64 +clock_nanosleep 
115 +clock_nanosleep_time64 +clock_settime 112 +clock_settime64 +clone 220 +clone3 435 +close 57 +close_range 436 +connect 203 +copy_file_range 285 +creat +delete_module 106 +dipc +dup 23 +dup2 +dup3 24 +epoll_create +epoll_create1 20 +epoll_ctl 21 +epoll_ctl_old +epoll_pwait 22 +epoll_pwait2 441 +epoll_wait +epoll_wait_old +eventfd +eventfd2 19 +exec_with_loader +execv +execve 221 +execveat 281 +exit 93 +exit_group 94 +faccessat 48 +faccessat2 439 +fadvise64 223 +fadvise64_64 +fallocate 47 +fanotify_init 262 +fanotify_mark 263 +fchdir 50 +fchmod 52 +fchmodat 53 +fchmodat2 452 +fchown 55 +fchown32 +fchownat 54 +fcntl 25 +fcntl64 +fdatasync 83 +fgetxattr 10 +finit_module 273 +flistxattr 13 +flock 32 +fork +fremovexattr 16 +fsconfig 431 +fsetxattr 7 +fsmount 432 +fsopen 430 +fspick 433 +fstat +fstat64 +fstatat64 +fstatfs 44 +fstatfs64 +fsync 82 +ftruncate 46 +ftruncate64 +futex 98 +futex_requeue 456 +futex_time64 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat +get_mempolicy 236 +get_robust_list 100 +get_thread_area +getcpu 168 +getcwd 17 +getdents +getdents64 61 +getdomainname +getdtablesize +getegid 177 +getegid32 +geteuid 175 +geteuid32 +getgid 176 +getgid32 +getgroups 158 +getgroups32 +gethostname +getitimer 102 +getpagesize +getpeername 205 +getpgid 155 +getpgrp +getpid 172 +getppid 173 +getpriority 141 +getrandom 278 +getresgid 150 +getresgid32 +getresuid 148 +getresuid32 +getrlimit +getrusage 165 +getsid 156 +getsockname 204 +getsockopt 209 +gettid 178 +gettimeofday 169 +getuid 174 +getuid32 +getxattr 8 +getxgid +getxpid +getxuid +init_module 105 +inotify_add_watch 27 +inotify_init +inotify_init1 26 +inotify_rm_watch 28 +io_cancel 3 +io_destroy 1 +io_getevents 4 +io_pgetevents 292 +io_pgetevents_time64 +io_setup 0 +io_submit 2 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 29 +ioperm +iopl +ioprio_get 31 +ioprio_set 30 +ipc +kcmp 272 +kern_features +kexec_file_load 294 +kexec_load 104 +keyctl 219 +kill 129 +landlock_add_rule 
445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown +lchown32 +lgetxattr 9 +link +linkat 37 +listen 201 +listxattr 11 +llistxattr 12 +lookup_dcookie 18 +lremovexattr 15 +lseek 62 +lsetxattr 6 +lstat +lstat64 +madvise 233 +map_shadow_stack 453 +mbind 235 +membarrier 283 +memfd_create 279 +memfd_secret +memory_ordering +migrate_pages 238 +mincore 232 +mkdir +mkdirat 34 +mknod +mknodat 33 +mlock 228 +mlock2 284 +mlockall 230 +mmap 222 +mmap2 +modify_ldt +mount 40 +mount_setattr 442 +move_mount 429 +move_pages 239 +mprotect 226 +mq_getsetattr 185 +mq_notify 184 +mq_open 180 +mq_timedreceive 183 +mq_timedreceive_time64 +mq_timedsend 182 +mq_timedsend_time64 +mq_unlink 181 +mremap 216 +msgctl 187 +msgget 186 +msgrcv 188 +msgsnd 189 +msync 227 +multiplexer +munlock 229 +munlockall 231 +munmap 215 +name_to_handle_at 264 +nanosleep 101 +newfstatat +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open +open_by_handle_at 265 +open_tree 428 +openat 56 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 241 +perfctr +personality 92 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe +pipe2 59 +pivot_root 41 +pkey_alloc 289 +pkey_free 290 +pkey_mprotect 288 +poll +ppoll 73 +ppoll_time64 +prctl 167 +pread64 67 +preadv 69 +preadv2 286 +prlimit64 261 +process_madvise 440 +process_mrelease 448 +process_vm_readv 270 +process_vm_writev 271 +pselect6 72 +pselect6_time64 +ptrace 117 +pwrite64 68 +pwritev 70 +pwritev2 287 +quotactl 60 
+quotactl_fd 443 +read 63 +readahead 213 +readdir +readlink +readlinkat 78 +readv 65 +reboot 142 +recv +recvfrom 207 +recvmmsg 243 +recvmmsg_time64 +recvmsg 212 +remap_file_pages 234 +removexattr 14 +rename +renameat +renameat2 276 +request_key 218 +restart_syscall 128 +riscv_flush_icache +riscv_hwprobe +rmdir +rseq 293 +rt_sigaction 134 +rt_sigpending 136 +rt_sigprocmask 135 +rt_sigqueueinfo 138 +rt_sigreturn 139 +rt_sigsuspend 133 +rt_sigtimedwait 137 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 240 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 125 +sched_get_priority_min 126 +sched_getaffinity 123 +sched_getattr 275 +sched_getparam 121 +sched_getscheduler 120 +sched_rr_get_interval 127 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 122 +sched_setattr 274 +sched_setparam 118 +sched_setscheduler 119 +sched_yield 124 +seccomp 277 +select +semctl 191 +semget 190 +semop 193 +semtimedop 192 +semtimedop_time64 +send +sendfile 71 +sendfile64 +sendmmsg 269 +sendmsg 211 +sendto 206 +set_mempolicy 237 +set_mempolicy_home_node 450 +set_robust_list 99 +set_thread_area +set_tid_address 96 +setdomainname 162 +setfsgid 152 +setfsgid32 +setfsuid 151 +setfsuid32 +setgid 144 +setgid32 +setgroups 159 +setgroups32 +sethae +sethostname 161 +setitimer 103 +setns 268 +setpgid 154 +setpgrp +setpriority 140 +setregid 143 +setregid32 +setresgid 149 +setresgid32 +setresuid 147 +setresuid32 +setreuid 145 +setreuid32 +setrlimit +setsid 157 +setsockopt 208 +settimeofday 170 +setuid 146 +setuid32 +setxattr 5 +sgetmask +shmat 196 +shmctl 195 +shmdt 197 +shmget 194 +shutdown 210 +sigaction +sigaltstack 132 +signal +signalfd +signalfd4 74 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 198 +socketcall +socketpair 199 +splice 76 +spu_create +spu_run +ssetmask +stat +stat64 +statfs 43 +statfs64 +statx 291 +stime +subpage_prot +swapcontext +swapoff 225 +swapon 224 
+switch_endian +symlink +symlinkat 36 +sync 81 +sync_file_range 84 +sync_file_range2 +syncfs 267 +sys_debug_setcontext +syscall +sysfs +sysinfo 179 +syslog 116 +sysmips +tee 77 +tgkill 131 +time +timer_create 107 +timer_delete 111 +timer_getoverrun 109 +timer_gettime 108 +timer_gettime64 +timer_settime 110 +timer_settime64 +timerfd +timerfd_create 85 +timerfd_gettime 87 +timerfd_gettime64 +timerfd_settime 86 +timerfd_settime64 +times 153 +tkill 130 +truncate 45 +truncate64 +ugetrlimit +umask 166 +umount +umount2 39 +uname 160 +unlink +unlinkat 35 +unshare 97 +userfaultfd 282 +ustat +utime +utimensat 88 +utimensat_time64 +utimes +utrap_install +vfork +vhangup 58 +vm86 +vm86old +vmsplice 75 +wait4 260 +waitid 95 +waitpid +write 64 +writev 66 diff --git a/src/basic/syscalls-m68k.txt b/src/basic/syscalls-m68k.txt new file mode 100644 index 0000000..712f272 --- /dev/null +++ b/src/basic/syscalls-m68k.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept +accept4 361 +access 33 +acct 51 +add_key 279 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier 336 +atomic_cmpxchg_32 335 +bind 358 +bpf 354 +brk 45 +cachectl +cacheflush 123 +cachestat 451 +capget 184 +capset 185 +chdir 12 +chmod 15 +chown 16 +chown32 198 +chroot 61 +clock_adjtime 342 +clock_adjtime64 405 +clock_getres 261 +clock_getres_time64 406 +clock_gettime 260 +clock_gettime64 403 +clock_nanosleep 262 +clock_nanosleep_time64 407 +clock_settime 259 +clock_settime64 404 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 359 +copy_file_range 376 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 326 +epoll_create 249 +epoll_create1 325 +epoll_ctl 250 +epoll_ctl_old +epoll_pwait 315 +epoll_pwait2 441 +epoll_wait 251 +epoll_wait_old +eventfd 319 +eventfd2 324 +exec_with_loader +execv +execve 11 +execveat 355 +exit 1 +exit_group 247 +faccessat 300 +faccessat2 439 +fadvise64 246 +fadvise64_64 267 +fallocate 320 +fanotify_init 337 
+fanotify_mark 338 +fchdir 133 +fchmod 94 +fchmodat 299 +fchmodat2 452 +fchown 95 +fchown32 207 +fchownat 291 +fcntl 55 +fcntl64 239 +fdatasync 148 +fgetxattr 228 +finit_module 348 +flistxattr 231 +flock 143 +fork 2 +fremovexattr 234 +fsconfig 431 +fsetxattr 225 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 197 +fstatat64 293 +fstatfs 100 +fstatfs64 264 +fsync 118 +ftruncate 93 +ftruncate64 194 +futex 235 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 292 +get_mempolicy 269 +get_robust_list 305 +get_thread_area 333 +getcpu 314 +getcwd 183 +getdents 141 +getdents64 220 +getdomainname +getdtablesize +getegid 50 +getegid32 202 +geteuid 49 +geteuid32 201 +getgid 47 +getgid32 200 +getgroups 80 +getgroups32 205 +gethostname +getitimer 105 +getpagesize 166 +getpeername 365 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 352 +getresgid 171 +getresgid32 211 +getresuid 165 +getresuid32 209 +getrlimit 76 +getrusage 77 +getsid 147 +getsockname 364 +getsockopt 362 +gettid 221 +gettimeofday 78 +getuid 24 +getuid32 199 +getxattr 226 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 285 +inotify_init 284 +inotify_init1 328 +inotify_rm_watch 286 +io_cancel 245 +io_destroy 242 +io_getevents 243 +io_pgetevents +io_pgetevents_time64 416 +io_setup 241 +io_submit 244 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm +iopl +ioprio_get 283 +ioprio_set 282 +ipc 117 +kcmp 347 +kern_features +kexec_file_load +kexec_load 313 +keyctl 281 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 182 +lchown32 212 +lgetxattr 227 +link 9 +linkat 296 +listen 360 +listxattr 229 +llistxattr 230 +lookup_dcookie 248 +lremovexattr 233 +lseek 19 +lsetxattr 224 +lstat 107 +lstat64 196 +madvise 238 +map_shadow_stack 453 +mbind 268 +membarrier 374 +memfd_create 353 +memfd_secret +memory_ordering +migrate_pages 287 +mincore 237 +mkdir 39 
+mkdirat 289 +mknod 14 +mknodat 290 +mlock 150 +mlock2 375 +mlockall 152 +mmap 90 +mmap2 192 +modify_ldt +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 310 +mprotect 125 +mq_getsetattr 276 +mq_notify 275 +mq_open 271 +mq_timedreceive 274 +mq_timedreceive_time64 419 +mq_timedsend 273 +mq_timedsend_time64 418 +mq_unlink 272 +mremap 163 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 144 +multiplexer +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 340 +nanosleep 162 +newfstatat +nice 34 +old_adjtimex +oldfstat 28 +oldlstat 84 +oldolduname +oldstat 18 +oldumount +olduname +open 5 +open_by_handle_at 341 +open_tree 428 +openat 288 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 332 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 327 +pivot_root 217 +pkey_alloc 382 +pkey_free 383 +pkey_mprotect 381 +poll 168 +ppoll 302 +ppoll_time64 414 +prctl 172 +pread64 180 +preadv 329 +preadv2 377 +prlimit64 339 +process_madvise 440 +process_mrelease 448 +process_vm_readv 345 +process_vm_writev 346 +pselect6 301 +pselect6_time64 413 +ptrace 26 +pwrite64 181 +pwritev 330 +pwritev2 378 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 240 +readdir 89 +readlink 85 +readlinkat 298 +readv 145 +reboot 88 +recv +recvfrom 368 +recvmmsg 371 +recvmmsg_time64 417 +recvmsg 369 +remap_file_pages 252 +removexattr 232 +rename 38 +renameat 295 +renameat2 351 +request_key 280 +restart_syscall 0 +riscv_flush_icache +riscv_hwprobe 
+rmdir 40 +rseq 384 +rt_sigaction 174 +rt_sigpending 176 +rt_sigprocmask 175 +rt_sigqueueinfo 178 +rt_sigreturn 173 +rt_sigsuspend 179 +rt_sigtimedwait 177 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 331 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 312 +sched_getattr 350 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 311 +sched_setattr 349 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 380 +select 82 +semctl 394 +semget 393 +semop +semtimedop +semtimedop_time64 420 +send +sendfile 187 +sendfile64 236 +sendmmsg 372 +sendmsg 367 +sendto 366 +set_mempolicy 270 +set_mempolicy_home_node 450 +set_robust_list 304 +set_thread_area 334 +set_tid_address 253 +setdomainname 121 +setfsgid 139 +setfsgid32 216 +setfsuid 138 +setfsuid32 215 +setgid 46 +setgid32 214 +setgroups 81 +setgroups32 206 +sethae +sethostname 74 +setitimer 104 +setns 344 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 204 +setresgid 170 +setresgid32 210 +setresuid 164 +setresuid32 208 +setreuid 70 +setreuid32 203 +setrlimit 75 +setsid 66 +setsockopt 363 +settimeofday 79 +setuid 23 +setuid32 213 +setxattr 223 +sgetmask 68 +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 370 +sigaction 67 +sigaltstack 186 +signal 48 +signalfd 317 +signalfd4 323 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 356 +socketcall 102 +socketpair 357 +splice 306 +spu_create +spu_run +ssetmask 69 +stat 106 +stat64 195 +statfs 99 +statfs64 263 +statx 379 +stime 25 +subpage_prot +swapcontext +swapoff 115 +swapon 87 +switch_endian +symlink 83 +symlinkat 297 +sync 36 +sync_file_range 307 +sync_file_range2 +syncfs 343 +sys_debug_setcontext +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 308 +tgkill 265 +time 13 
+timer_create 254 +timer_delete 258 +timer_getoverrun 257 +timer_gettime 256 +timer_gettime64 408 +timer_settime 255 +timer_settime64 409 +timerfd +timerfd_create 318 +timerfd_gettime 322 +timerfd_gettime64 410 +timerfd_settime 321 +timerfd_settime64 411 +times 43 +tkill 222 +truncate 92 +truncate64 193 +ugetrlimit 191 +umask 60 +umount 22 +umount2 52 +uname 122 +unlink 10 +unlinkat 294 +unshare 303 +userfaultfd 373 +ustat 62 +utime 30 +utimensat 316 +utimensat_time64 412 +utimes 266 +utrap_install +vfork 190 +vhangup 111 +vm86 +vm86old +vmsplice 309 +wait4 114 +waitid 277 +waitpid 7 +write 4 +writev 146 diff --git a/src/basic/syscalls-mips64.txt b/src/basic/syscalls-mips64.txt new file mode 100644 index 0000000..2d0984e --- /dev/null +++ b/src/basic/syscalls-mips64.txt @@ -0,0 +1,515 @@ +_llseek +_newselect 5022 +accept 5042 +accept4 5293 +access 5020 +acct 5158 +add_key 5239 +adjtimex 5154 +alarm 5037 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 5048 +bpf 5315 +brk 5012 +cachectl 5198 +cacheflush 5197 +cachestat 5451 +capget 5123 +capset 5124 +chdir 5078 +chmod 5088 +chown 5090 +chown32 +chroot 5156 +clock_adjtime 5300 +clock_adjtime64 +clock_getres 5223 +clock_getres_time64 +clock_gettime 5222 +clock_gettime64 +clock_nanosleep 5224 +clock_nanosleep_time64 +clock_settime 5221 +clock_settime64 +clone 5055 +clone3 5435 +close 5003 +close_range 5436 +connect 5041 +copy_file_range 5320 +creat 5083 +delete_module 5169 +dipc +dup 5031 +dup2 5032 +dup3 5286 +epoll_create 5207 +epoll_create1 5285 +epoll_ctl 5208 +epoll_ctl_old +epoll_pwait 5272 +epoll_pwait2 5441 +epoll_wait 5209 +epoll_wait_old +eventfd 5278 +eventfd2 5284 +exec_with_loader +execv +execve 5057 +execveat 5316 +exit 5058 +exit_group 5205 +faccessat 5259 +faccessat2 5439 +fadvise64 5215 +fadvise64_64 +fallocate 5279 +fanotify_init 5295 +fanotify_mark 5296 +fchdir 5079 +fchmod 5089 +fchmodat 5258 +fchmodat2 5452 +fchown 5091 +fchown32 
+fchownat 5250 +fcntl 5070 +fcntl64 +fdatasync 5073 +fgetxattr 5185 +finit_module 5307 +flistxattr 5188 +flock 5071 +fork 5056 +fremovexattr 5191 +fsconfig 5431 +fsetxattr 5182 +fsmount 5432 +fsopen 5430 +fspick 5433 +fstat 5005 +fstat64 +fstatat64 +fstatfs 5135 +fstatfs64 +fsync 5072 +ftruncate 5075 +ftruncate64 +futex 5194 +futex_requeue 5456 +futex_time64 +futex_wait 5455 +futex_waitv 5449 +futex_wake 5454 +futimesat 5251 +get_mempolicy 5228 +get_robust_list 5269 +get_thread_area +getcpu 5271 +getcwd 5077 +getdents 5076 +getdents64 5308 +getdomainname +getdtablesize +getegid 5106 +getegid32 +geteuid 5105 +geteuid32 +getgid 5102 +getgid32 +getgroups 5113 +getgroups32 +gethostname +getitimer 5035 +getpagesize +getpeername 5051 +getpgid 5119 +getpgrp 5109 +getpid 5038 +getppid 5108 +getpriority 5137 +getrandom 5313 +getresgid 5118 +getresgid32 +getresuid 5116 +getresuid32 +getrlimit 5095 +getrusage 5096 +getsid 5122 +getsockname 5050 +getsockopt 5054 +gettid 5178 +gettimeofday 5094 +getuid 5100 +getuid32 +getxattr 5183 +getxgid +getxpid +getxuid +init_module 5168 +inotify_add_watch 5244 +inotify_init 5243 +inotify_init1 5288 +inotify_rm_watch 5245 +io_cancel 5204 +io_destroy 5201 +io_getevents 5202 +io_pgetevents 5328 +io_pgetevents_time64 +io_setup 5200 +io_submit 5203 +io_uring_enter 5426 +io_uring_register 5427 +io_uring_setup 5425 +ioctl 5015 +ioperm +iopl +ioprio_get 5274 +ioprio_set 5273 +ipc +kcmp 5306 +kern_features +kexec_file_load +kexec_load 5270 +keyctl 5241 +kill 5060 +landlock_add_rule 5445 +landlock_create_ruleset 5444 +landlock_restrict_self 5446 +lchown 5092 +lchown32 +lgetxattr 5184 +link 5084 +linkat 5255 +listen 5049 +listxattr 5186 +llistxattr 5187 +lookup_dcookie 5206 +lremovexattr 5190 +lseek 5008 +lsetxattr 5181 +lstat 5006 +lstat64 +madvise 5027 +map_shadow_stack 5453 +mbind 5227 +membarrier 5318 +memfd_create 5314 +memfd_secret +memory_ordering +migrate_pages 5246 +mincore 5026 +mkdir 5081 +mkdirat 5248 +mknod 5131 +mknodat 5249 +mlock 
5146 +mlock2 5319 +mlockall 5148 +mmap 5009 +mmap2 +modify_ldt +mount 5160 +mount_setattr 5442 +move_mount 5429 +move_pages 5267 +mprotect 5010 +mq_getsetattr 5235 +mq_notify 5234 +mq_open 5230 +mq_timedreceive 5233 +mq_timedreceive_time64 +mq_timedsend 5232 +mq_timedsend_time64 +mq_unlink 5231 +mremap 5024 +msgctl 5069 +msgget 5066 +msgrcv 5068 +msgsnd 5067 +msync 5025 +multiplexer +munlock 5147 +munlockall 5149 +munmap 5011 +name_to_handle_at 5298 +nanosleep 5034 +newfstatat 5252 +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 5002 +open_by_handle_at 5299 +open_tree 5428 +openat 5247 +openat2 5437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 5033 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 5292 +perfctr +personality 5132 +pidfd_getfd 5438 +pidfd_open 5434 +pidfd_send_signal 5424 +pipe 5021 +pipe2 5287 +pivot_root 5151 +pkey_alloc 5324 +pkey_free 5325 +pkey_mprotect 5323 +poll 5007 +ppoll 5261 +ppoll_time64 +prctl 5153 +pread64 5016 +preadv 5289 +preadv2 5321 +prlimit64 5297 +process_madvise 5440 +process_mrelease 5448 +process_vm_readv 5304 +process_vm_writev 5305 +pselect6 5260 +pselect6_time64 +ptrace 5099 +pwrite64 5017 +pwritev 5290 +pwritev2 5322 +quotactl 5172 +quotactl_fd 5443 +read 5000 +readahead 5179 +readdir +readlink 5087 +readlinkat 5257 +readv 5018 +reboot 5164 +recv +recvfrom 5044 +recvmmsg 5294 +recvmmsg_time64 +recvmsg 5046 +remap_file_pages 5210 +removexattr 5189 +rename 5080 +renameat 5254 +renameat2 5311 +request_key 5240 +restart_syscall 5213 +riscv_flush_icache 
+riscv_hwprobe +rmdir 5082 +rseq 5327 +rt_sigaction 5013 +rt_sigpending 5125 +rt_sigprocmask 5014 +rt_sigqueueinfo 5127 +rt_sigreturn 5211 +rt_sigsuspend 5128 +rt_sigtimedwait 5126 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 5291 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 5143 +sched_get_priority_min 5144 +sched_getaffinity 5196 +sched_getattr 5310 +sched_getparam 5140 +sched_getscheduler 5142 +sched_rr_get_interval 5145 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 5195 +sched_setattr 5309 +sched_setparam 5139 +sched_setscheduler 5141 +sched_yield 5023 +seccomp 5312 +select +semctl 5064 +semget 5062 +semop 5063 +semtimedop 5214 +semtimedop_time64 +send +sendfile 5039 +sendfile64 +sendmmsg 5302 +sendmsg 5045 +sendto 5043 +set_mempolicy 5229 +set_mempolicy_home_node 5450 +set_robust_list 5268 +set_thread_area 5242 +set_tid_address 5212 +setdomainname 5166 +setfsgid 5121 +setfsgid32 +setfsuid 5120 +setfsuid32 +setgid 5104 +setgid32 +setgroups 5114 +setgroups32 +sethae +sethostname 5165 +setitimer 5036 +setns 5303 +setpgid 5107 +setpgrp +setpriority 5138 +setregid 5112 +setregid32 +setresgid 5117 +setresgid32 +setresuid 5115 +setresuid32 +setreuid 5111 +setreuid32 +setrlimit 5155 +setsid 5110 +setsockopt 5053 +settimeofday 5159 +setuid 5103 +setuid32 +setxattr 5180 +sgetmask +shmat 5029 +shmctl 5030 +shmdt 5065 +shmget 5028 +shutdown 5047 +sigaction +sigaltstack 5129 +signal +signalfd 5276 +signalfd4 5283 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 5040 +socketcall +socketpair 5052 +splice 5263 +spu_create +spu_run +ssetmask +stat 5004 +stat64 +statfs 5134 +statfs64 +statx 5326 +stime +subpage_prot +swapcontext +swapoff 5163 +swapon 5162 +switch_endian +symlink 5086 +symlinkat 5256 +sync 5157 +sync_file_range 5264 +sync_file_range2 +syncfs 5301 +sys_debug_setcontext +syscall +sysfs 5136 +sysinfo 5097 +syslog 5101 +sysmips 5199 +tee 
5265 +tgkill 5225 +time +timer_create 5216 +timer_delete 5220 +timer_getoverrun 5219 +timer_gettime 5218 +timer_gettime64 +timer_settime 5217 +timer_settime64 +timerfd 5277 +timerfd_create 5280 +timerfd_gettime 5281 +timerfd_gettime64 +timerfd_settime 5282 +timerfd_settime64 +times 5098 +tkill 5192 +truncate 5074 +truncate64 +ugetrlimit +umask 5093 +umount +umount2 5161 +uname 5061 +unlink 5085 +unlinkat 5253 +unshare 5262 +userfaultfd 5317 +ustat 5133 +utime 5130 +utimensat 5275 +utimensat_time64 +utimes 5226 +utrap_install +vfork +vhangup 5150 +vm86 +vm86old +vmsplice 5266 +wait4 5059 +waitid 5237 +waitpid +write 5001 +writev 5019 diff --git a/src/basic/syscalls-mips64n32.txt b/src/basic/syscalls-mips64n32.txt new file mode 100644 index 0000000..4475867 --- /dev/null +++ b/src/basic/syscalls-mips64n32.txt @@ -0,0 +1,515 @@ +_llseek +_newselect 6022 +accept 6042 +accept4 6297 +access 6020 +acct 6158 +add_key 6243 +adjtimex 6154 +alarm 6037 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 6048 +bpf 6319 +brk 6012 +cachectl 6198 +cacheflush 6197 +cachestat 6451 +capget 6123 +capset 6124 +chdir 6078 +chmod 6088 +chown 6090 +chown32 +chroot 6156 +clock_adjtime 6305 +clock_adjtime64 6405 +clock_getres 6227 +clock_getres_time64 6406 +clock_gettime 6226 +clock_gettime64 6403 +clock_nanosleep 6228 +clock_nanosleep_time64 6407 +clock_settime 6225 +clock_settime64 6404 +clone 6055 +clone3 6435 +close 6003 +close_range 6436 +connect 6041 +copy_file_range 6324 +creat 6083 +delete_module 6169 +dipc +dup 6031 +dup2 6032 +dup3 6290 +epoll_create 6207 +epoll_create1 6289 +epoll_ctl 6208 +epoll_ctl_old +epoll_pwait 6276 +epoll_pwait2 6441 +epoll_wait 6209 +epoll_wait_old +eventfd 6282 +eventfd2 6288 +exec_with_loader +execv +execve 6057 +execveat 6320 +exit 6058 +exit_group 6205 +faccessat 6263 +faccessat2 6439 +fadvise64 6216 +fadvise64_64 +fallocate 6283 +fanotify_init 6300 +fanotify_mark 6301 +fchdir 6079 +fchmod 
6089 +fchmodat 6262 +fchmodat2 6452 +fchown 6091 +fchown32 +fchownat 6254 +fcntl 6070 +fcntl64 6212 +fdatasync 6073 +fgetxattr 6185 +finit_module 6312 +flistxattr 6188 +flock 6071 +fork 6056 +fremovexattr 6191 +fsconfig 6431 +fsetxattr 6182 +fsmount 6432 +fsopen 6430 +fspick 6433 +fstat 6005 +fstat64 +fstatat64 +fstatfs 6135 +fstatfs64 6218 +fsync 6072 +ftruncate 6075 +ftruncate64 +futex 6194 +futex_requeue 6456 +futex_time64 6422 +futex_wait 6455 +futex_waitv 6449 +futex_wake 6454 +futimesat 6255 +get_mempolicy 6232 +get_robust_list 6273 +get_thread_area +getcpu 6275 +getcwd 6077 +getdents 6076 +getdents64 6299 +getdomainname +getdtablesize +getegid 6106 +getegid32 +geteuid 6105 +geteuid32 +getgid 6102 +getgid32 +getgroups 6113 +getgroups32 +gethostname +getitimer 6035 +getpagesize +getpeername 6051 +getpgid 6119 +getpgrp 6109 +getpid 6038 +getppid 6108 +getpriority 6137 +getrandom 6317 +getresgid 6118 +getresgid32 +getresuid 6116 +getresuid32 +getrlimit 6095 +getrusage 6096 +getsid 6122 +getsockname 6050 +getsockopt 6054 +gettid 6178 +gettimeofday 6094 +getuid 6100 +getuid32 +getxattr 6183 +getxgid +getxpid +getxuid +init_module 6168 +inotify_add_watch 6248 +inotify_init 6247 +inotify_init1 6292 +inotify_rm_watch 6249 +io_cancel 6204 +io_destroy 6201 +io_getevents 6202 +io_pgetevents 6332 +io_pgetevents_time64 6416 +io_setup 6200 +io_submit 6203 +io_uring_enter 6426 +io_uring_register 6427 +io_uring_setup 6425 +ioctl 6015 +ioperm +iopl +ioprio_get 6278 +ioprio_set 6277 +ipc +kcmp 6311 +kern_features +kexec_file_load +kexec_load 6274 +keyctl 6245 +kill 6060 +landlock_add_rule 6445 +landlock_create_ruleset 6444 +landlock_restrict_self 6446 +lchown 6092 +lchown32 +lgetxattr 6184 +link 6084 +linkat 6259 +listen 6049 +listxattr 6186 +llistxattr 6187 +lookup_dcookie 6206 +lremovexattr 6190 +lseek 6008 +lsetxattr 6181 +lstat 6006 +lstat64 +madvise 6027 +map_shadow_stack 6453 +mbind 6231 +membarrier 6322 +memfd_create 6318 +memfd_secret +memory_ordering +migrate_pages 
6250 +mincore 6026 +mkdir 6081 +mkdirat 6252 +mknod 6131 +mknodat 6253 +mlock 6146 +mlock2 6323 +mlockall 6148 +mmap 6009 +mmap2 +modify_ldt +mount 6160 +mount_setattr 6442 +move_mount 6429 +move_pages 6271 +mprotect 6010 +mq_getsetattr 6239 +mq_notify 6238 +mq_open 6234 +mq_timedreceive 6237 +mq_timedreceive_time64 6419 +mq_timedsend 6236 +mq_timedsend_time64 6418 +mq_unlink 6235 +mremap 6024 +msgctl 6069 +msgget 6066 +msgrcv 6068 +msgsnd 6067 +msync 6025 +multiplexer +munlock 6147 +munlockall 6149 +munmap 6011 +name_to_handle_at 6303 +nanosleep 6034 +newfstatat 6256 +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 6002 +open_by_handle_at 6304 +open_tree 6428 +openat 6251 +openat2 6437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 6033 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 6296 +perfctr +personality 6132 +pidfd_getfd 6438 +pidfd_open 6434 +pidfd_send_signal 6424 +pipe 6021 +pipe2 6291 +pivot_root 6151 +pkey_alloc 6328 +pkey_free 6329 +pkey_mprotect 6327 +poll 6007 +ppoll 6265 +ppoll_time64 6414 +prctl 6153 +pread64 6016 +preadv 6293 +preadv2 6325 +prlimit64 6302 +process_madvise 6440 +process_mrelease 6448 +process_vm_readv 6309 +process_vm_writev 6310 +pselect6 6264 +pselect6_time64 6413 +ptrace 6099 +pwrite64 6017 +pwritev 6294 +pwritev2 6326 +quotactl 6172 +quotactl_fd 6443 +read 6000 +readahead 6179 +readdir +readlink 6087 +readlinkat 6261 +readv 6018 +reboot 6164 +recv +recvfrom 6044 +recvmmsg 6298 +recvmmsg_time64 6417 +recvmsg 6046 +remap_file_pages 6210 +removexattr 6189 
+rename 6080 +renameat 6258 +renameat2 6315 +request_key 6244 +restart_syscall 6214 +riscv_flush_icache +riscv_hwprobe +rmdir 6082 +rseq 6331 +rt_sigaction 6013 +rt_sigpending 6125 +rt_sigprocmask 6014 +rt_sigqueueinfo 6127 +rt_sigreturn 6211 +rt_sigsuspend 6128 +rt_sigtimedwait 6126 +rt_sigtimedwait_time64 6421 +rt_tgsigqueueinfo 6295 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 6143 +sched_get_priority_min 6144 +sched_getaffinity 6196 +sched_getattr 6314 +sched_getparam 6140 +sched_getscheduler 6142 +sched_rr_get_interval 6145 +sched_rr_get_interval_time64 6423 +sched_set_affinity +sched_setaffinity 6195 +sched_setattr 6313 +sched_setparam 6139 +sched_setscheduler 6141 +sched_yield 6023 +seccomp 6316 +select +semctl 6064 +semget 6062 +semop 6063 +semtimedop 6215 +semtimedop_time64 6420 +send +sendfile 6039 +sendfile64 6219 +sendmmsg 6307 +sendmsg 6045 +sendto 6043 +set_mempolicy 6233 +set_mempolicy_home_node 6450 +set_robust_list 6272 +set_thread_area 6246 +set_tid_address 6213 +setdomainname 6166 +setfsgid 6121 +setfsgid32 +setfsuid 6120 +setfsuid32 +setgid 6104 +setgid32 +setgroups 6114 +setgroups32 +sethae +sethostname 6165 +setitimer 6036 +setns 6308 +setpgid 6107 +setpgrp +setpriority 6138 +setregid 6112 +setregid32 +setresgid 6117 +setresgid32 +setresuid 6115 +setresuid32 +setreuid 6111 +setreuid32 +setrlimit 6155 +setsid 6110 +setsockopt 6053 +settimeofday 6159 +setuid 6103 +setuid32 +setxattr 6180 +sgetmask +shmat 6029 +shmctl 6030 +shmdt 6065 +shmget 6028 +shutdown 6047 +sigaction +sigaltstack 6129 +signal +signalfd 6280 +signalfd4 6287 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 6040 +socketcall +socketpair 6052 +splice 6267 +spu_create +spu_run +ssetmask +stat 6004 +stat64 +statfs 6134 +statfs64 6217 +statx 6330 +stime +subpage_prot +swapcontext +swapoff 6163 +swapon 6162 +switch_endian +symlink 6086 +symlinkat 6260 +sync 6157 
+sync_file_range 6268 +sync_file_range2 +syncfs 6306 +sys_debug_setcontext +syscall +sysfs 6136 +sysinfo 6097 +syslog 6101 +sysmips 6199 +tee 6269 +tgkill 6229 +time +timer_create 6220 +timer_delete 6224 +timer_getoverrun 6223 +timer_gettime 6222 +timer_gettime64 6408 +timer_settime 6221 +timer_settime64 6409 +timerfd 6281 +timerfd_create 6284 +timerfd_gettime 6285 +timerfd_gettime64 6410 +timerfd_settime 6286 +timerfd_settime64 6411 +times 6098 +tkill 6192 +truncate 6074 +truncate64 +ugetrlimit +umask 6093 +umount +umount2 6161 +uname 6061 +unlink 6085 +unlinkat 6257 +unshare 6266 +userfaultfd 6321 +ustat 6133 +utime 6130 +utimensat 6279 +utimensat_time64 6412 +utimes 6230 +utrap_install +vfork +vhangup 6150 +vm86 +vm86old +vmsplice 6270 +wait4 6059 +waitid 6241 +waitpid +write 6001 +writev 6019 diff --git a/src/basic/syscalls-mipso32.txt b/src/basic/syscalls-mipso32.txt new file mode 100644 index 0000000..0254cb3 --- /dev/null +++ b/src/basic/syscalls-mipso32.txt @@ -0,0 +1,515 @@ +_llseek 4140 +_newselect 4142 +accept 4168 +accept4 4334 +access 4033 +acct 4051 +add_key 4280 +adjtimex 4124 +alarm 4027 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 4169 +bpf 4355 +brk 4045 +cachectl 4148 +cacheflush 4147 +cachestat 4451 +capget 4204 +capset 4205 +chdir 4012 +chmod 4015 +chown 4202 +chown32 +chroot 4061 +clock_adjtime 4341 +clock_adjtime64 4405 +clock_getres 4264 +clock_getres_time64 4406 +clock_gettime 4263 +clock_gettime64 4403 +clock_nanosleep 4265 +clock_nanosleep_time64 4407 +clock_settime 4262 +clock_settime64 4404 +clone 4120 +clone3 4435 +close 4006 +close_range 4436 +connect 4170 +copy_file_range 4360 +creat 4008 +delete_module 4129 +dipc +dup 4041 +dup2 4063 +dup3 4327 +epoll_create 4248 +epoll_create1 4326 +epoll_ctl 4249 +epoll_ctl_old +epoll_pwait 4313 +epoll_pwait2 4441 +epoll_wait 4250 +epoll_wait_old +eventfd 4319 +eventfd2 4325 +exec_with_loader +execv +execve 4011 +execveat 4356 
+exit 4001 +exit_group 4246 +faccessat 4300 +faccessat2 4439 +fadvise64 4254 +fadvise64_64 +fallocate 4320 +fanotify_init 4336 +fanotify_mark 4337 +fchdir 4133 +fchmod 4094 +fchmodat 4299 +fchmodat2 4452 +fchown 4095 +fchown32 +fchownat 4291 +fcntl 4055 +fcntl64 4220 +fdatasync 4152 +fgetxattr 4229 +finit_module 4348 +flistxattr 4232 +flock 4143 +fork 4002 +fremovexattr 4235 +fsconfig 4431 +fsetxattr 4226 +fsmount 4432 +fsopen 4430 +fspick 4433 +fstat 4108 +fstat64 4215 +fstatat64 4293 +fstatfs 4100 +fstatfs64 4256 +fsync 4118 +ftruncate 4093 +ftruncate64 4212 +futex 4238 +futex_requeue 4456 +futex_time64 4422 +futex_wait 4455 +futex_waitv 4449 +futex_wake 4454 +futimesat 4292 +get_mempolicy 4269 +get_robust_list 4310 +get_thread_area +getcpu 4312 +getcwd 4203 +getdents 4141 +getdents64 4219 +getdomainname +getdtablesize +getegid 4050 +getegid32 +geteuid 4049 +geteuid32 +getgid 4047 +getgid32 +getgroups 4080 +getgroups32 +gethostname +getitimer 4105 +getpagesize +getpeername 4171 +getpgid 4132 +getpgrp 4065 +getpid 4020 +getppid 4064 +getpriority 4096 +getrandom 4353 +getresgid 4191 +getresgid32 +getresuid 4186 +getresuid32 +getrlimit 4076 +getrusage 4077 +getsid 4151 +getsockname 4172 +getsockopt 4173 +gettid 4222 +gettimeofday 4078 +getuid 4024 +getuid32 +getxattr 4227 +getxgid +getxpid +getxuid +init_module 4128 +inotify_add_watch 4285 +inotify_init 4284 +inotify_init1 4329 +inotify_rm_watch 4286 +io_cancel 4245 +io_destroy 4242 +io_getevents 4243 +io_pgetevents 4368 +io_pgetevents_time64 4416 +io_setup 4241 +io_submit 4244 +io_uring_enter 4426 +io_uring_register 4427 +io_uring_setup 4425 +ioctl 4054 +ioperm 4101 +iopl 4110 +ioprio_get 4315 +ioprio_set 4314 +ipc 4117 +kcmp 4347 +kern_features +kexec_file_load +kexec_load 4311 +keyctl 4282 +kill 4037 +landlock_add_rule 4445 +landlock_create_ruleset 4444 +landlock_restrict_self 4446 +lchown 4016 +lchown32 +lgetxattr 4228 +link 4009 +linkat 4296 +listen 4174 +listxattr 4230 +llistxattr 4231 +lookup_dcookie 4247 
+lremovexattr 4234 +lseek 4019 +lsetxattr 4225 +lstat 4107 +lstat64 4214 +madvise 4218 +map_shadow_stack 4453 +mbind 4268 +membarrier 4358 +memfd_create 4354 +memfd_secret +memory_ordering +migrate_pages 4287 +mincore 4217 +mkdir 4039 +mkdirat 4289 +mknod 4014 +mknodat 4290 +mlock 4154 +mlock2 4359 +mlockall 4156 +mmap 4090 +mmap2 4210 +modify_ldt 4123 +mount 4021 +mount_setattr 4442 +move_mount 4429 +move_pages 4308 +mprotect 4125 +mq_getsetattr 4276 +mq_notify 4275 +mq_open 4271 +mq_timedreceive 4274 +mq_timedreceive_time64 4419 +mq_timedsend 4273 +mq_timedsend_time64 4418 +mq_unlink 4272 +mremap 4167 +msgctl 4402 +msgget 4399 +msgrcv 4401 +msgsnd 4400 +msync 4144 +multiplexer +munlock 4155 +munlockall 4157 +munmap 4091 +name_to_handle_at 4339 +nanosleep 4166 +newfstatat +nice 4034 +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 4005 +open_by_handle_at 4340 +open_tree 4428 +openat 4288 +openat2 4437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 4029 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 4333 +perfctr +personality 4136 +pidfd_getfd 4438 +pidfd_open 4434 +pidfd_send_signal 4424 +pipe 4042 +pipe2 4328 +pivot_root 4216 +pkey_alloc 4364 +pkey_free 4365 +pkey_mprotect 4363 +poll 4188 +ppoll 4302 +ppoll_time64 4414 +prctl 4192 +pread64 4200 +preadv 4330 +preadv2 4361 +prlimit64 4338 +process_madvise 4440 +process_mrelease 4448 +process_vm_readv 4345 +process_vm_writev 4346 +pselect6 4301 +pselect6_time64 4413 +ptrace 4026 +pwrite64 4201 +pwritev 4331 +pwritev2 4362 +quotactl 4131 +quotactl_fd 
4443 +read 4003 +readahead 4223 +readdir 4089 +readlink 4085 +readlinkat 4298 +readv 4145 +reboot 4088 +recv 4175 +recvfrom 4176 +recvmmsg 4335 +recvmmsg_time64 4417 +recvmsg 4177 +remap_file_pages 4251 +removexattr 4233 +rename 4038 +renameat 4295 +renameat2 4351 +request_key 4281 +restart_syscall 4253 +riscv_flush_icache +riscv_hwprobe +rmdir 4040 +rseq 4367 +rt_sigaction 4194 +rt_sigpending 4196 +rt_sigprocmask 4195 +rt_sigqueueinfo 4198 +rt_sigreturn 4193 +rt_sigsuspend 4199 +rt_sigtimedwait 4197 +rt_sigtimedwait_time64 4421 +rt_tgsigqueueinfo 4332 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 4163 +sched_get_priority_min 4164 +sched_getaffinity 4240 +sched_getattr 4350 +sched_getparam 4159 +sched_getscheduler 4161 +sched_rr_get_interval 4165 +sched_rr_get_interval_time64 4423 +sched_set_affinity +sched_setaffinity 4239 +sched_setattr 4349 +sched_setparam 4158 +sched_setscheduler 4160 +sched_yield 4162 +seccomp 4352 +select +semctl 4394 +semget 4393 +semop +semtimedop +semtimedop_time64 4420 +send 4178 +sendfile 4207 +sendfile64 4237 +sendmmsg 4343 +sendmsg 4179 +sendto 4180 +set_mempolicy 4270 +set_mempolicy_home_node 4450 +set_robust_list 4309 +set_thread_area 4283 +set_tid_address 4252 +setdomainname 4121 +setfsgid 4139 +setfsgid32 +setfsuid 4138 +setfsuid32 +setgid 4046 +setgid32 +setgroups 4081 +setgroups32 +sethae +sethostname 4074 +setitimer 4104 +setns 4344 +setpgid 4057 +setpgrp +setpriority 4097 +setregid 4071 +setregid32 +setresgid 4190 +setresgid32 +setresuid 4185 +setresuid32 +setreuid 4070 +setreuid32 +setrlimit 4075 +setsid 4066 +setsockopt 4181 +settimeofday 4079 +setuid 4023 +setuid32 +setxattr 4224 +sgetmask 4068 +shmat 4397 +shmctl 4396 +shmdt 4398 +shmget 4395 +shutdown 4182 +sigaction 4067 +sigaltstack 4206 +signal 4048 +signalfd 4317 +signalfd4 4324 +sigpending 4073 +sigprocmask 4126 +sigreturn 4119 +sigsuspend 4072 +socket 4183 +socketcall 
4102 +socketpair 4184 +splice 4304 +spu_create +spu_run +ssetmask 4069 +stat 4106 +stat64 4213 +statfs 4099 +statfs64 4255 +statx 4366 +stime 4025 +subpage_prot +swapcontext +swapoff 4115 +swapon 4087 +switch_endian +symlink 4083 +symlinkat 4297 +sync 4036 +sync_file_range 4305 +sync_file_range2 +syncfs 4342 +sys_debug_setcontext +syscall 4000 +sysfs 4135 +sysinfo 4116 +syslog 4103 +sysmips 4149 +tee 4306 +tgkill 4266 +time 4013 +timer_create 4257 +timer_delete 4261 +timer_getoverrun 4260 +timer_gettime 4259 +timer_gettime64 4408 +timer_settime 4258 +timer_settime64 4409 +timerfd 4318 +timerfd_create 4321 +timerfd_gettime 4322 +timerfd_gettime64 4410 +timerfd_settime 4323 +timerfd_settime64 4411 +times 4043 +tkill 4236 +truncate 4092 +truncate64 4211 +ugetrlimit +umask 4060 +umount 4022 +umount2 4052 +uname 4122 +unlink 4010 +unlinkat 4294 +unshare 4303 +userfaultfd 4357 +ustat 4062 +utime 4030 +utimensat 4316 +utimensat_time64 4412 +utimes 4267 +utrap_install +vfork +vhangup 4111 +vm86 4113 +vm86old +vmsplice 4307 +wait4 4114 +waitid 4278 +waitpid 4007 +write 4004 +writev 4146 diff --git a/src/basic/syscalls-parisc.txt b/src/basic/syscalls-parisc.txt new file mode 100644 index 0000000..2bb1de5 --- /dev/null +++ b/src/basic/syscalls-parisc.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept 35 +accept4 320 +access 33 +acct 51 +add_key 264 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 22 +bpf 341 +brk 45 +cachectl +cacheflush 356 +cachestat 451 +capget 106 +capset 107 +chdir 12 +chmod 15 +chown 180 +chown32 +chroot 61 +clock_adjtime 324 +clock_adjtime64 405 +clock_getres 257 +clock_getres_time64 406 +clock_gettime 256 +clock_gettime64 403 +clock_nanosleep 258 +clock_nanosleep_time64 407 +clock_settime 255 +clock_settime64 404 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 31 +copy_file_range 346 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 312 
+epoll_create 224 +epoll_create1 311 +epoll_ctl 225 +epoll_ctl_old +epoll_pwait 297 +epoll_pwait2 441 +epoll_wait 226 +epoll_wait_old +eventfd 304 +eventfd2 310 +exec_with_loader +execv +execve 11 +execveat 342 +exit 1 +exit_group 222 +faccessat 287 +faccessat2 439 +fadvise64 +fadvise64_64 236 +fallocate 305 +fanotify_init 322 +fanotify_mark 323 +fchdir 133 +fchmod 94 +fchmodat 286 +fchmodat2 452 +fchown 95 +fchown32 +fchownat 278 +fcntl 55 +fcntl64 202 +fdatasync 148 +fgetxattr 243 +finit_module 333 +flistxattr 246 +flock 143 +fork 2 +fremovexattr 249 +fsconfig 431 +fsetxattr 240 +fsmount 432 +fsopen 430 +fspick 433 +fstat 28 +fstat64 112 +fstatat64 280 +fstatfs 100 +fstatfs64 299 +fsync 118 +ftruncate 93 +ftruncate64 200 +futex 210 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 279 +get_mempolicy 261 +get_robust_list 290 +get_thread_area +getcpu 296 +getcwd 110 +getdents 141 +getdents64 201 +getdomainname +getdtablesize +getegid 50 +getegid32 +geteuid 49 +geteuid32 +getgid 47 +getgid32 +getgroups 80 +getgroups32 +gethostname +getitimer 105 +getpagesize +getpeername 53 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 339 +getresgid 171 +getresgid32 +getresuid 165 +getresuid32 +getrlimit 76 +getrusage 77 +getsid 147 +getsockname 44 +getsockopt 182 +gettid 206 +gettimeofday 78 +getuid 24 +getuid32 +getxattr 241 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 270 +inotify_init 269 +inotify_init1 314 +inotify_rm_watch 271 +io_cancel 219 +io_destroy 216 +io_getevents 217 +io_pgetevents 350 +io_pgetevents_time64 416 +io_setup 215 +io_submit 218 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm +iopl +ioprio_get 268 +ioprio_set 267 +ipc +kcmp 332 +kern_features +kexec_file_load 355 +kexec_load 300 +keyctl 266 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 +lgetxattr 242 +link 9 +linkat 283 
+listen 32 +listxattr 244 +llistxattr 245 +lookup_dcookie 223 +lremovexattr 248 +lseek 19 +lsetxattr 239 +lstat 84 +lstat64 198 +madvise 119 +map_shadow_stack 453 +mbind 260 +membarrier 343 +memfd_create 340 +memfd_secret +memory_ordering +migrate_pages 272 +mincore 72 +mkdir 39 +mkdirat 276 +mknod 14 +mknodat 277 +mlock 150 +mlock2 345 +mlockall 152 +mmap 90 +mmap2 89 +modify_ldt +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 295 +mprotect 125 +mq_getsetattr 234 +mq_notify 233 +mq_open 229 +mq_timedreceive 232 +mq_timedreceive_time64 419 +mq_timedsend 231 +mq_timedsend_time64 418 +mq_unlink 230 +mremap 163 +msgctl 191 +msgget 190 +msgrcv 189 +msgsnd 188 +msync 144 +multiplexer +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 325 +nanosleep 162 +newfstatat +nice 34 +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 5 +open_by_handle_at 326 +open_tree 428 +openat 275 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 318 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 313 +pivot_root 67 +pkey_alloc 352 +pkey_free 353 +pkey_mprotect 351 +poll 168 +ppoll 274 +ppoll_time64 414 +prctl 172 +pread64 108 +preadv 315 +preadv2 347 +prlimit64 321 +process_madvise 440 +process_mrelease 448 +process_vm_readv 330 +process_vm_writev 331 +pselect6 273 +pselect6_time64 413 +ptrace 26 +pwrite64 109 +pwritev 316 +pwritev2 348 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 207 +readdir 
+readlink 85 +readlinkat 285 +readv 145 +reboot 88 +recv 98 +recvfrom 123 +recvmmsg 319 +recvmmsg_time64 417 +recvmsg 184 +remap_file_pages 227 +removexattr 247 +rename 38 +renameat 282 +renameat2 337 +request_key 265 +restart_syscall 0 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 354 +rt_sigaction 174 +rt_sigpending 176 +rt_sigprocmask 175 +rt_sigqueueinfo 178 +rt_sigreturn 173 +rt_sigsuspend 179 +rt_sigtimedwait 177 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 317 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 212 +sched_getattr 335 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 211 +sched_setattr 334 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 338 +select +semctl 187 +semget 186 +semop 185 +semtimedop 228 +semtimedop_time64 420 +send 58 +sendfile 122 +sendfile64 209 +sendmmsg 329 +sendmsg 183 +sendto 82 +set_mempolicy 262 +set_mempolicy_home_node 450 +set_robust_list 289 +set_thread_area +set_tid_address 237 +setdomainname 121 +setfsgid 139 +setfsgid32 +setfsuid 138 +setfsuid32 +setgid 46 +setgid32 +setgroups 81 +setgroups32 +sethae +sethostname 74 +setitimer 104 +setns 328 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 +setresgid 170 +setresgid32 +setresuid 164 +setresuid32 +setreuid 70 +setreuid32 +setrlimit 75 +setsid 66 +setsockopt 181 +settimeofday 79 +setuid 23 +setuid32 +setxattr 238 +sgetmask 68 +shmat 192 +shmctl 195 +shmdt 193 +shmget 194 +shutdown 117 +sigaction +sigaltstack 166 +signal 48 +signalfd 302 +signalfd4 309 +sigpending 73 +sigprocmask 126 +sigreturn +sigsuspend +socket 17 +socketcall +socketpair 56 +splice 291 +spu_create +spu_run +ssetmask 69 +stat 18 +stat64 101 +statfs 99 +statfs64 298 +statx 349 +stime 25 +subpage_prot +swapcontext +swapoff 115 
+swapon 87 +switch_endian +symlink 83 +symlinkat 284 +sync 36 +sync_file_range 292 +sync_file_range2 +syncfs 327 +sys_debug_setcontext +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 293 +tgkill 259 +time 13 +timer_create 250 +timer_delete 254 +timer_getoverrun 253 +timer_gettime 252 +timer_gettime64 408 +timer_settime 251 +timer_settime64 409 +timerfd +timerfd_create 306 +timerfd_gettime 308 +timerfd_gettime64 410 +timerfd_settime 307 +timerfd_settime64 411 +times 43 +tkill 208 +truncate 92 +truncate64 199 +ugetrlimit +umask 60 +umount +umount2 52 +uname 59 +unlink 10 +unlinkat 281 +unshare 288 +userfaultfd 344 +ustat 62 +utime 30 +utimensat 301 +utimensat_time64 412 +utimes 336 +utrap_install +vfork 113 +vhangup 111 +vm86 +vm86old +vmsplice 294 +wait4 114 +waitid 235 +waitpid 7 +write 4 +writev 146 diff --git a/src/basic/syscalls-powerpc.txt b/src/basic/syscalls-powerpc.txt new file mode 100644 index 0000000..a8c1b1b --- /dev/null +++ b/src/basic/syscalls-powerpc.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept 330 +accept4 344 +access 33 +acct 51 +add_key 269 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 327 +bpf 361 +brk 45 +cachectl +cacheflush +cachestat 451 +capget 183 +capset 184 +chdir 12 +chmod 15 +chown 181 +chown32 +chroot 61 +clock_adjtime 347 +clock_adjtime64 405 +clock_getres 247 +clock_getres_time64 406 +clock_gettime 246 +clock_gettime64 403 +clock_nanosleep 248 +clock_nanosleep_time64 407 +clock_settime 245 +clock_settime64 404 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 328 +copy_file_range 379 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 316 +epoll_create 236 +epoll_create1 315 +epoll_ctl 237 +epoll_ctl_old +epoll_pwait 303 +epoll_pwait2 441 +epoll_wait 238 +epoll_wait_old +eventfd 307 +eventfd2 314 +exec_with_loader +execv +execve 11 +execveat 362 +exit 1 +exit_group 234 +faccessat 298 +faccessat2 439 
+fadvise64 233 +fadvise64_64 254 +fallocate 309 +fanotify_init 323 +fanotify_mark 324 +fchdir 133 +fchmod 94 +fchmodat 297 +fchmodat2 452 +fchown 95 +fchown32 +fchownat 289 +fcntl 55 +fcntl64 204 +fdatasync 148 +fgetxattr 214 +finit_module 353 +flistxattr 217 +flock 143 +fork 2 +fremovexattr 220 +fsconfig 431 +fsetxattr 211 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 197 +fstatat64 291 +fstatfs 100 +fstatfs64 253 +fsync 118 +ftruncate 93 +ftruncate64 194 +futex 221 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 290 +get_mempolicy 260 +get_robust_list 299 +get_thread_area +getcpu 302 +getcwd 182 +getdents 141 +getdents64 202 +getdomainname +getdtablesize +getegid 50 +getegid32 +geteuid 49 +geteuid32 +getgid 47 +getgid32 +getgroups 80 +getgroups32 +gethostname +getitimer 105 +getpagesize +getpeername 332 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 359 +getresgid 170 +getresgid32 +getresuid 165 +getresuid32 +getrlimit 76 +getrusage 77 +getsid 147 +getsockname 331 +getsockopt 340 +gettid 207 +gettimeofday 78 +getuid 24 +getuid32 +getxattr 212 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 276 +inotify_init 275 +inotify_init1 318 +inotify_rm_watch 277 +io_cancel 231 +io_destroy 228 +io_getevents 229 +io_pgetevents 388 +io_pgetevents_time64 416 +io_setup 227 +io_submit 230 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm 101 +iopl 110 +ioprio_get 274 +ioprio_set 273 +ipc 117 +kcmp 354 +kern_features +kexec_file_load 382 +kexec_load 268 +keyctl 271 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 +lgetxattr 213 +link 9 +linkat 294 +listen 329 +listxattr 215 +llistxattr 216 +lookup_dcookie 235 +lremovexattr 219 +lseek 19 +lsetxattr 210 +lstat 107 +lstat64 196 +madvise 205 +map_shadow_stack 453 +mbind 259 +membarrier 365 +memfd_create 360 +memfd_secret +memory_ordering 
+migrate_pages 258 +mincore 206 +mkdir 39 +mkdirat 287 +mknod 14 +mknodat 288 +mlock 150 +mlock2 378 +mlockall 152 +mmap 90 +mmap2 192 +modify_ldt 123 +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 301 +mprotect 125 +mq_getsetattr 267 +mq_notify 266 +mq_open 262 +mq_timedreceive 265 +mq_timedreceive_time64 419 +mq_timedsend 264 +mq_timedsend_time64 418 +mq_unlink 263 +mremap 163 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 144 +multiplexer 201 +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 345 +nanosleep 162 +newfstatat +nice 34 +old_adjtimex +oldfstat 28 +oldlstat 84 +oldolduname 59 +oldstat 18 +oldumount +olduname 109 +open 5 +open_by_handle_at 346 +open_tree 428 +openat 286 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase 200 +pciconfig_read 198 +pciconfig_write 199 +perf_event_open 319 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 317 +pivot_root 203 +pkey_alloc 384 +pkey_free 385 +pkey_mprotect 386 +poll 167 +ppoll 281 +ppoll_time64 414 +prctl 171 +pread64 179 +preadv 320 +preadv2 380 +prlimit64 325 +process_madvise 440 +process_mrelease 448 +process_vm_readv 351 +process_vm_writev 352 +pselect6 280 +pselect6_time64 413 +ptrace 26 +pwrite64 180 +pwritev 321 +pwritev2 381 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 191 +readdir 89 +readlink 85 +readlinkat 296 +readv 145 +reboot 88 +recv 336 +recvfrom 337 +recvmmsg 343 +recvmmsg_time64 417 +recvmsg 342 +remap_file_pages 239 +removexattr 218 +rename 38 +renameat 293 +renameat2 357 
+request_key 270 +restart_syscall 0 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 387 +rt_sigaction 173 +rt_sigpending 175 +rt_sigprocmask 174 +rt_sigqueueinfo 177 +rt_sigreturn 172 +rt_sigsuspend 178 +rt_sigtimedwait 176 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 322 +rtas 255 +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 223 +sched_getattr 356 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 222 +sched_setattr 355 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 358 +select 82 +semctl 394 +semget 393 +semop +semtimedop +semtimedop_time64 420 +send 334 +sendfile 186 +sendfile64 226 +sendmmsg 349 +sendmsg 341 +sendto 335 +set_mempolicy 261 +set_mempolicy_home_node 450 +set_robust_list 300 +set_thread_area +set_tid_address 232 +setdomainname 121 +setfsgid 139 +setfsgid32 +setfsuid 138 +setfsuid32 +setgid 46 +setgid32 +setgroups 81 +setgroups32 +sethae +sethostname 74 +setitimer 104 +setns 350 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 +setresgid 169 +setresgid32 +setresuid 164 +setresuid32 +setreuid 70 +setreuid32 +setrlimit 75 +setsid 66 +setsockopt 339 +settimeofday 79 +setuid 23 +setuid32 +setxattr 209 +sgetmask 68 +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 338 +sigaction 67 +sigaltstack 185 +signal 48 +signalfd 305 +signalfd4 313 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 326 +socketcall 102 +socketpair 333 +splice 283 +spu_create 279 +spu_run 278 +ssetmask 69 +stat 106 +stat64 195 +statfs 99 +statfs64 252 +statx 383 +stime 25 +subpage_prot 310 +swapcontext 249 +swapoff 115 +swapon 87 +switch_endian 363 +symlink 83 +symlinkat 295 +sync 36 +sync_file_range +sync_file_range2 308 +syncfs 348 +sys_debug_setcontext 256 +syscall +sysfs 135 
+sysinfo 116 +syslog 103 +sysmips +tee 284 +tgkill 250 +time 13 +timer_create 240 +timer_delete 244 +timer_getoverrun 243 +timer_gettime 242 +timer_gettime64 408 +timer_settime 241 +timer_settime64 409 +timerfd +timerfd_create 306 +timerfd_gettime 312 +timerfd_gettime64 410 +timerfd_settime 311 +timerfd_settime64 411 +times 43 +tkill 208 +truncate 92 +truncate64 193 +ugetrlimit 190 +umask 60 +umount 22 +umount2 52 +uname 122 +unlink 10 +unlinkat 292 +unshare 282 +userfaultfd 364 +ustat 62 +utime 30 +utimensat 304 +utimensat_time64 412 +utimes 251 +utrap_install +vfork 189 +vhangup 111 +vm86 113 +vm86old +vmsplice 285 +wait4 114 +waitid 272 +waitpid 7 +write 4 +writev 146 diff --git a/src/basic/syscalls-powerpc64.txt b/src/basic/syscalls-powerpc64.txt new file mode 100644 index 0000000..824cc61 --- /dev/null +++ b/src/basic/syscalls-powerpc64.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept 330 +accept4 344 +access 33 +acct 51 +add_key 269 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 327 +bpf 361 +brk 45 +cachectl +cacheflush +cachestat 451 +capget 183 +capset 184 +chdir 12 +chmod 15 +chown 181 +chown32 +chroot 61 +clock_adjtime 347 +clock_adjtime64 +clock_getres 247 +clock_getres_time64 +clock_gettime 246 +clock_gettime64 +clock_nanosleep 248 +clock_nanosleep_time64 +clock_settime 245 +clock_settime64 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 328 +copy_file_range 379 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 316 +epoll_create 236 +epoll_create1 315 +epoll_ctl 237 +epoll_ctl_old +epoll_pwait 303 +epoll_pwait2 441 +epoll_wait 238 +epoll_wait_old +eventfd 307 +eventfd2 314 +exec_with_loader +execv +execve 11 +execveat 362 +exit 1 +exit_group 234 +faccessat 298 +faccessat2 439 +fadvise64 233 +fadvise64_64 +fallocate 309 +fanotify_init 323 +fanotify_mark 324 +fchdir 133 +fchmod 94 +fchmodat 297 +fchmodat2 452 +fchown 95 +fchown32 +fchownat 
289 +fcntl 55 +fcntl64 +fdatasync 148 +fgetxattr 214 +finit_module 353 +flistxattr 217 +flock 143 +fork 2 +fremovexattr 220 +fsconfig 431 +fsetxattr 211 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 +fstatat64 +fstatfs 100 +fstatfs64 253 +fsync 118 +ftruncate 93 +ftruncate64 +futex 221 +futex_requeue 456 +futex_time64 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 290 +get_mempolicy 260 +get_robust_list 299 +get_thread_area +getcpu 302 +getcwd 182 +getdents 141 +getdents64 202 +getdomainname +getdtablesize +getegid 50 +getegid32 +geteuid 49 +geteuid32 +getgid 47 +getgid32 +getgroups 80 +getgroups32 +gethostname +getitimer 105 +getpagesize +getpeername 332 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 359 +getresgid 170 +getresgid32 +getresuid 165 +getresuid32 +getrlimit 76 +getrusage 77 +getsid 147 +getsockname 331 +getsockopt 340 +gettid 207 +gettimeofday 78 +getuid 24 +getuid32 +getxattr 212 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 276 +inotify_init 275 +inotify_init1 318 +inotify_rm_watch 277 +io_cancel 231 +io_destroy 228 +io_getevents 229 +io_pgetevents 388 +io_pgetevents_time64 +io_setup 227 +io_submit 230 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm 101 +iopl 110 +ioprio_get 274 +ioprio_set 273 +ipc 117 +kcmp 354 +kern_features +kexec_file_load 382 +kexec_load 268 +keyctl 271 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 +lgetxattr 213 +link 9 +linkat 294 +listen 329 +listxattr 215 +llistxattr 216 +lookup_dcookie 235 +lremovexattr 219 +lseek 19 +lsetxattr 210 +lstat 107 +lstat64 +madvise 205 +map_shadow_stack 453 +mbind 259 +membarrier 365 +memfd_create 360 +memfd_secret +memory_ordering +migrate_pages 258 +mincore 206 +mkdir 39 +mkdirat 287 +mknod 14 +mknodat 288 +mlock 150 +mlock2 378 +mlockall 152 +mmap 90 +mmap2 +modify_ldt 123 +mount 21 +mount_setattr 442 +move_mount 429 
+move_pages 301 +mprotect 125 +mq_getsetattr 267 +mq_notify 266 +mq_open 262 +mq_timedreceive 265 +mq_timedreceive_time64 +mq_timedsend 264 +mq_timedsend_time64 +mq_unlink 263 +mremap 163 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 144 +multiplexer 201 +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 345 +nanosleep 162 +newfstatat 291 +nice 34 +old_adjtimex +oldfstat 28 +oldlstat 84 +oldolduname 59 +oldstat 18 +oldumount +olduname 109 +open 5 +open_by_handle_at 346 +open_tree 428 +openat 286 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase 200 +pciconfig_read 198 +pciconfig_write 199 +perf_event_open 319 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 317 +pivot_root 203 +pkey_alloc 384 +pkey_free 385 +pkey_mprotect 386 +poll 167 +ppoll 281 +ppoll_time64 +prctl 171 +pread64 179 +preadv 320 +preadv2 380 +prlimit64 325 +process_madvise 440 +process_mrelease 448 +process_vm_readv 351 +process_vm_writev 352 +pselect6 280 +pselect6_time64 +ptrace 26 +pwrite64 180 +pwritev 321 +pwritev2 381 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 191 +readdir 89 +readlink 85 +readlinkat 296 +readv 145 +reboot 88 +recv 336 +recvfrom 337 +recvmmsg 343 +recvmmsg_time64 +recvmsg 342 +remap_file_pages 239 +removexattr 218 +rename 38 +renameat 293 +renameat2 357 +request_key 270 +restart_syscall 0 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 387 +rt_sigaction 173 +rt_sigpending 175 +rt_sigprocmask 174 +rt_sigqueueinfo 177 +rt_sigreturn 172 +rt_sigsuspend 178 
+rt_sigtimedwait 176 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 322 +rtas 255 +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 223 +sched_getattr 356 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 222 +sched_setattr 355 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 358 +select 82 +semctl 394 +semget 393 +semop +semtimedop 392 +semtimedop_time64 +send 334 +sendfile 186 +sendfile64 +sendmmsg 349 +sendmsg 341 +sendto 335 +set_mempolicy 261 +set_mempolicy_home_node 450 +set_robust_list 300 +set_thread_area +set_tid_address 232 +setdomainname 121 +setfsgid 139 +setfsgid32 +setfsuid 138 +setfsuid32 +setgid 46 +setgid32 +setgroups 81 +setgroups32 +sethae +sethostname 74 +setitimer 104 +setns 350 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 +setresgid 169 +setresgid32 +setresuid 164 +setresuid32 +setreuid 70 +setreuid32 +setrlimit 75 +setsid 66 +setsockopt 339 +settimeofday 79 +setuid 23 +setuid32 +setxattr 209 +sgetmask 68 +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 338 +sigaction 67 +sigaltstack 185 +signal 48 +signalfd 305 +signalfd4 313 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 326 +socketcall 102 +socketpair 333 +splice 283 +spu_create 279 +spu_run 278 +ssetmask 69 +stat 106 +stat64 +statfs 99 +statfs64 252 +statx 383 +stime 25 +subpage_prot 310 +swapcontext 249 +swapoff 115 +swapon 87 +switch_endian 363 +symlink 83 +symlinkat 295 +sync 36 +sync_file_range +sync_file_range2 308 +syncfs 348 +sys_debug_setcontext 256 +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 284 +tgkill 250 +time 13 +timer_create 240 +timer_delete 244 +timer_getoverrun 243 +timer_gettime 242 +timer_gettime64 +timer_settime 241 +timer_settime64 +timerfd +timerfd_create 306 
+timerfd_gettime 312 +timerfd_gettime64 +timerfd_settime 311 +timerfd_settime64 +times 43 +tkill 208 +truncate 92 +truncate64 +ugetrlimit 190 +umask 60 +umount 22 +umount2 52 +uname 122 +unlink 10 +unlinkat 292 +unshare 282 +userfaultfd 364 +ustat 62 +utime 30 +utimensat 304 +utimensat_time64 +utimes 251 +utrap_install +vfork 189 +vhangup 111 +vm86 113 +vm86old +vmsplice 285 +wait4 114 +waitid 272 +waitpid 7 +write 4 +writev 146 diff --git a/src/basic/syscalls-riscv32.txt b/src/basic/syscalls-riscv32.txt new file mode 100644 index 0000000..5011956 --- /dev/null +++ b/src/basic/syscalls-riscv32.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 202 +accept4 242 +access +acct 89 +add_key 217 +adjtimex +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 200 +bpf 280 +brk 214 +cachectl +cacheflush +cachestat 451 +capget 90 +capset 91 +chdir 49 +chmod +chown +chown32 +chroot 51 +clock_adjtime +clock_adjtime64 405 +clock_getres +clock_getres_time64 406 +clock_gettime +clock_gettime64 403 +clock_nanosleep +clock_nanosleep_time64 407 +clock_settime +clock_settime64 404 +clone 220 +clone3 435 +close 57 +close_range 436 +connect 203 +copy_file_range 285 +creat +delete_module 106 +dipc +dup 23 +dup2 +dup3 24 +epoll_create +epoll_create1 20 +epoll_ctl 21 +epoll_ctl_old +epoll_pwait 22 +epoll_pwait2 441 +epoll_wait +epoll_wait_old +eventfd +eventfd2 19 +exec_with_loader +execv +execve 221 +execveat 281 +exit 93 +exit_group 94 +faccessat 48 +faccessat2 439 +fadvise64 +fadvise64_64 223 +fallocate 47 +fanotify_init 262 +fanotify_mark 263 +fchdir 50 +fchmod 52 +fchmodat 53 +fchmodat2 452 +fchown 55 +fchown32 +fchownat 54 +fcntl +fcntl64 25 +fdatasync 83 +fgetxattr 10 +finit_module 273 +flistxattr 13 +flock 32 +fork +fremovexattr 16 +fsconfig 431 +fsetxattr 7 +fsmount 432 +fsopen 430 +fspick 433 +fstat +fstat64 +fstatat64 +fstatfs +fstatfs64 44 +fsync 82 +ftruncate +ftruncate64 46 +futex +futex_requeue 456 
+futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat +get_mempolicy 236 +get_robust_list 100 +get_thread_area +getcpu 168 +getcwd 17 +getdents +getdents64 61 +getdomainname +getdtablesize +getegid 177 +getegid32 +geteuid 175 +geteuid32 +getgid 176 +getgid32 +getgroups 158 +getgroups32 +gethostname +getitimer 102 +getpagesize +getpeername 205 +getpgid 155 +getpgrp +getpid 172 +getppid 173 +getpriority 141 +getrandom 278 +getresgid 150 +getresgid32 +getresuid 148 +getresuid32 +getrlimit +getrusage 165 +getsid 156 +getsockname 204 +getsockopt 209 +gettid 178 +gettimeofday +getuid 174 +getuid32 +getxattr 8 +getxgid +getxpid +getxuid +init_module 105 +inotify_add_watch 27 +inotify_init +inotify_init1 26 +inotify_rm_watch 28 +io_cancel 3 +io_destroy 1 +io_getevents +io_pgetevents +io_pgetevents_time64 416 +io_setup 0 +io_submit 2 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 29 +ioperm +iopl +ioprio_get 31 +ioprio_set 30 +ipc +kcmp 272 +kern_features +kexec_file_load 294 +kexec_load 104 +keyctl 219 +kill 129 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown +lchown32 +lgetxattr 9 +link +linkat 37 +listen 201 +listxattr 11 +llistxattr 12 +lookup_dcookie 18 +lremovexattr 15 +lseek +lsetxattr 6 +lstat +lstat64 +madvise 233 +map_shadow_stack 453 +mbind 235 +membarrier 283 +memfd_create 279 +memfd_secret 447 +memory_ordering +migrate_pages 238 +mincore 232 +mkdir +mkdirat 34 +mknod +mknodat 33 +mlock 228 +mlock2 284 +mlockall 230 +mmap +mmap2 222 +modify_ldt +mount 40 +mount_setattr 442 +move_mount 429 +move_pages 239 +mprotect 226 +mq_getsetattr 185 +mq_notify 184 +mq_open 180 +mq_timedreceive +mq_timedreceive_time64 419 +mq_timedsend +mq_timedsend_time64 418 +mq_unlink 181 +mremap 216 +msgctl 187 +msgget 186 +msgrcv 188 +msgsnd 189 +msync 227 +multiplexer +munlock 229 +munlockall 231 +munmap 215 +name_to_handle_at 264 +nanosleep +newfstatat +nice +old_adjtimex +oldfstat +oldlstat 
+oldolduname +oldstat +oldumount +olduname +open +open_by_handle_at 265 +open_tree 428 +openat 56 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 241 +perfctr +personality 92 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe +pipe2 59 +pivot_root 41 +pkey_alloc 289 +pkey_free 290 +pkey_mprotect 288 +poll +ppoll +ppoll_time64 414 +prctl 167 +pread64 67 +preadv 69 +preadv2 286 +prlimit64 261 +process_madvise 440 +process_mrelease 448 +process_vm_readv 270 +process_vm_writev 271 +pselect6 +pselect6_time64 413 +ptrace 117 +pwrite64 68 +pwritev 70 +pwritev2 287 +quotactl 60 +quotactl_fd 443 +read 63 +readahead 213 +readdir +readlink +readlinkat 78 +readv 65 +reboot 142 +recv +recvfrom 207 +recvmmsg +recvmmsg_time64 417 +recvmsg 212 +remap_file_pages 234 +removexattr 14 +rename +renameat +renameat2 276 +request_key 218 +restart_syscall 128 +riscv_flush_icache 259 +riscv_hwprobe 258 +rmdir +rseq 293 +rt_sigaction 134 +rt_sigpending 136 +rt_sigprocmask 135 +rt_sigqueueinfo 138 +rt_sigreturn 139 +rt_sigsuspend 133 +rt_sigtimedwait +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 240 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 125 +sched_get_priority_min 126 +sched_getaffinity 123 +sched_getattr 275 +sched_getparam 121 +sched_getscheduler 120 +sched_rr_get_interval +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 122 +sched_setattr 274 +sched_setparam 
118 +sched_setscheduler 119 +sched_yield 124 +seccomp 277 +select +semctl 191 +semget 190 +semop 193 +semtimedop +semtimedop_time64 420 +send +sendfile +sendfile64 71 +sendmmsg 269 +sendmsg 211 +sendto 206 +set_mempolicy 237 +set_mempolicy_home_node 450 +set_robust_list 99 +set_thread_area +set_tid_address 96 +setdomainname 162 +setfsgid 152 +setfsgid32 +setfsuid 151 +setfsuid32 +setgid 144 +setgid32 +setgroups 159 +setgroups32 +sethae +sethostname 161 +setitimer 103 +setns 268 +setpgid 154 +setpgrp +setpriority 140 +setregid 143 +setregid32 +setresgid 149 +setresgid32 +setresuid 147 +setresuid32 +setreuid 145 +setreuid32 +setrlimit +setsid 157 +setsockopt 208 +settimeofday +setuid 146 +setuid32 +setxattr 5 +sgetmask +shmat 196 +shmctl 195 +shmdt 197 +shmget 194 +shutdown 210 +sigaction +sigaltstack 132 +signal +signalfd +signalfd4 74 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 198 +socketcall +socketpair 199 +splice 76 +spu_create +spu_run +ssetmask +stat +stat64 +statfs +statfs64 43 +statx 291 +stime +subpage_prot +swapcontext +swapoff 225 +swapon 224 +switch_endian +symlink +symlinkat 36 +sync 81 +sync_file_range 84 +sync_file_range2 +syncfs 267 +sys_debug_setcontext +syscall +sysfs +sysinfo 179 +syslog 116 +sysmips +tee 77 +tgkill 131 +time +timer_create 107 +timer_delete 111 +timer_getoverrun 109 +timer_gettime +timer_gettime64 408 +timer_settime +timer_settime64 409 +timerfd +timerfd_create 85 +timerfd_gettime +timerfd_gettime64 410 +timerfd_settime +timerfd_settime64 411 +times 153 +tkill 130 +truncate +truncate64 45 +ugetrlimit +umask 166 +umount +umount2 39 +uname 160 +unlink +unlinkat 35 +unshare 97 +userfaultfd 282 +ustat +utime +utimensat +utimensat_time64 412 +utimes +utrap_install +vfork +vhangup 58 +vm86 +vm86old +vmsplice 75 +wait4 +waitid 95 +waitpid +write 64 +writev 66 diff --git a/src/basic/syscalls-riscv64.txt b/src/basic/syscalls-riscv64.txt new file mode 100644 index 0000000..ba00b90 --- /dev/null +++ 
b/src/basic/syscalls-riscv64.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 202 +accept4 242 +access +acct 89 +add_key 217 +adjtimex 171 +alarm +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 200 +bpf 280 +brk 214 +cachectl +cacheflush +cachestat 451 +capget 90 +capset 91 +chdir 49 +chmod +chown +chown32 +chroot 51 +clock_adjtime 266 +clock_adjtime64 +clock_getres 114 +clock_getres_time64 +clock_gettime 113 +clock_gettime64 +clock_nanosleep 115 +clock_nanosleep_time64 +clock_settime 112 +clock_settime64 +clone 220 +clone3 435 +close 57 +close_range 436 +connect 203 +copy_file_range 285 +creat +delete_module 106 +dipc +dup 23 +dup2 +dup3 24 +epoll_create +epoll_create1 20 +epoll_ctl 21 +epoll_ctl_old +epoll_pwait 22 +epoll_pwait2 441 +epoll_wait +epoll_wait_old +eventfd +eventfd2 19 +exec_with_loader +execv +execve 221 +execveat 281 +exit 93 +exit_group 94 +faccessat 48 +faccessat2 439 +fadvise64 223 +fadvise64_64 +fallocate 47 +fanotify_init 262 +fanotify_mark 263 +fchdir 50 +fchmod 52 +fchmodat 53 +fchmodat2 452 +fchown 55 +fchown32 +fchownat 54 +fcntl 25 +fcntl64 +fdatasync 83 +fgetxattr 10 +finit_module 273 +flistxattr 13 +flock 32 +fork +fremovexattr 16 +fsconfig 431 +fsetxattr 7 +fsmount 432 +fsopen 430 +fspick 433 +fstat 80 +fstat64 +fstatat64 +fstatfs 44 +fstatfs64 +fsync 82 +ftruncate 46 +ftruncate64 +futex 98 +futex_requeue 456 +futex_time64 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat +get_mempolicy 236 +get_robust_list 100 +get_thread_area +getcpu 168 +getcwd 17 +getdents +getdents64 61 +getdomainname +getdtablesize +getegid 177 +getegid32 +geteuid 175 +geteuid32 +getgid 176 +getgid32 +getgroups 158 +getgroups32 +gethostname +getitimer 102 +getpagesize +getpeername 205 +getpgid 155 +getpgrp +getpid 172 +getppid 173 +getpriority 141 +getrandom 278 +getresgid 150 +getresgid32 +getresuid 148 +getresuid32 +getrlimit 163 +getrusage 165 +getsid 156 +getsockname 204 +getsockopt 
209 +gettid 178 +gettimeofday 169 +getuid 174 +getuid32 +getxattr 8 +getxgid +getxpid +getxuid +init_module 105 +inotify_add_watch 27 +inotify_init +inotify_init1 26 +inotify_rm_watch 28 +io_cancel 3 +io_destroy 1 +io_getevents 4 +io_pgetevents 292 +io_pgetevents_time64 +io_setup 0 +io_submit 2 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 29 +ioperm +iopl +ioprio_get 31 +ioprio_set 30 +ipc +kcmp 272 +kern_features +kexec_file_load 294 +kexec_load 104 +keyctl 219 +kill 129 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown +lchown32 +lgetxattr 9 +link +linkat 37 +listen 201 +listxattr 11 +llistxattr 12 +lookup_dcookie 18 +lremovexattr 15 +lseek 62 +lsetxattr 6 +lstat +lstat64 +madvise 233 +map_shadow_stack 453 +mbind 235 +membarrier 283 +memfd_create 279 +memfd_secret 447 +memory_ordering +migrate_pages 238 +mincore 232 +mkdir +mkdirat 34 +mknod +mknodat 33 +mlock 228 +mlock2 284 +mlockall 230 +mmap 222 +mmap2 +modify_ldt +mount 40 +mount_setattr 442 +move_mount 429 +move_pages 239 +mprotect 226 +mq_getsetattr 185 +mq_notify 184 +mq_open 180 +mq_timedreceive 183 +mq_timedreceive_time64 +mq_timedsend 182 +mq_timedsend_time64 +mq_unlink 181 +mremap 216 +msgctl 187 +msgget 186 +msgrcv 188 +msgsnd 189 +msync 227 +multiplexer +munlock 229 +munlockall 231 +munmap 215 +name_to_handle_at 264 +nanosleep 101 +newfstatat 79 +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open +open_by_handle_at 265 +open_tree 428 +openat 56 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname 
+osf_wait4 +pause +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 241 +perfctr +personality 92 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe +pipe2 59 +pivot_root 41 +pkey_alloc 289 +pkey_free 290 +pkey_mprotect 288 +poll +ppoll 73 +ppoll_time64 +prctl 167 +pread64 67 +preadv 69 +preadv2 286 +prlimit64 261 +process_madvise 440 +process_mrelease 448 +process_vm_readv 270 +process_vm_writev 271 +pselect6 72 +pselect6_time64 +ptrace 117 +pwrite64 68 +pwritev 70 +pwritev2 287 +quotactl 60 +quotactl_fd 443 +read 63 +readahead 213 +readdir +readlink +readlinkat 78 +readv 65 +reboot 142 +recv +recvfrom 207 +recvmmsg 243 +recvmmsg_time64 +recvmsg 212 +remap_file_pages 234 +removexattr 14 +rename +renameat +renameat2 276 +request_key 218 +restart_syscall 128 +riscv_flush_icache 259 +riscv_hwprobe 258 +rmdir +rseq 293 +rt_sigaction 134 +rt_sigpending 136 +rt_sigprocmask 135 +rt_sigqueueinfo 138 +rt_sigreturn 139 +rt_sigsuspend 133 +rt_sigtimedwait 137 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 240 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 125 +sched_get_priority_min 126 +sched_getaffinity 123 +sched_getattr 275 +sched_getparam 121 +sched_getscheduler 120 +sched_rr_get_interval 127 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 122 +sched_setattr 274 +sched_setparam 118 +sched_setscheduler 119 +sched_yield 124 +seccomp 277 +select +semctl 191 +semget 190 +semop 193 +semtimedop 192 +semtimedop_time64 +send +sendfile 71 +sendfile64 +sendmmsg 269 +sendmsg 211 +sendto 206 +set_mempolicy 237 +set_mempolicy_home_node 450 +set_robust_list 99 +set_thread_area +set_tid_address 96 +setdomainname 162 +setfsgid 152 +setfsgid32 +setfsuid 151 +setfsuid32 +setgid 144 +setgid32 +setgroups 159 +setgroups32 +sethae +sethostname 161 +setitimer 103 +setns 268 +setpgid 154 +setpgrp +setpriority 140 +setregid 143 +setregid32 +setresgid 149 
+setresgid32 +setresuid 147 +setresuid32 +setreuid 145 +setreuid32 +setrlimit 164 +setsid 157 +setsockopt 208 +settimeofday 170 +setuid 146 +setuid32 +setxattr 5 +sgetmask +shmat 196 +shmctl 195 +shmdt 197 +shmget 194 +shutdown 210 +sigaction +sigaltstack 132 +signal +signalfd +signalfd4 74 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 198 +socketcall +socketpair 199 +splice 76 +spu_create +spu_run +ssetmask +stat +stat64 +statfs 43 +statfs64 +statx 291 +stime +subpage_prot +swapcontext +swapoff 225 +swapon 224 +switch_endian +symlink +symlinkat 36 +sync 81 +sync_file_range 84 +sync_file_range2 +syncfs 267 +sys_debug_setcontext +syscall +sysfs +sysinfo 179 +syslog 116 +sysmips +tee 77 +tgkill 131 +time +timer_create 107 +timer_delete 111 +timer_getoverrun 109 +timer_gettime 108 +timer_gettime64 +timer_settime 110 +timer_settime64 +timerfd +timerfd_create 85 +timerfd_gettime 87 +timerfd_gettime64 +timerfd_settime 86 +timerfd_settime64 +times 153 +tkill 130 +truncate 45 +truncate64 +ugetrlimit +umask 166 +umount +umount2 39 +uname 160 +unlink +unlinkat 35 +unshare 97 +userfaultfd 282 +ustat +utime +utimensat 88 +utimensat_time64 +utimes +utrap_install +vfork +vhangup 58 +vm86 +vm86old +vmsplice 75 +wait4 260 +waitid 95 +waitpid +write 64 +writev 66 diff --git a/src/basic/syscalls-s390.txt b/src/basic/syscalls-s390.txt new file mode 100644 index 0000000..c81f795 --- /dev/null +++ b/src/basic/syscalls-s390.txt @@ -0,0 +1,515 @@ +_llseek 140 +_newselect 142 +accept +accept4 364 +access 33 +acct 51 +add_key 278 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 361 +bpf 351 +brk 45 +cachectl +cacheflush +cachestat 451 +capget 184 +capset 185 +chdir 12 +chmod 15 +chown 182 +chown32 212 +chroot 61 +clock_adjtime 337 +clock_adjtime64 405 +clock_getres 261 +clock_getres_time64 406 +clock_gettime 260 +clock_gettime64 403 +clock_nanosleep 262 +clock_nanosleep_time64 407 
+clock_settime 259 +clock_settime64 404 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 362 +copy_file_range 375 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 326 +epoll_create 249 +epoll_create1 327 +epoll_ctl 250 +epoll_ctl_old +epoll_pwait 312 +epoll_pwait2 441 +epoll_wait 251 +epoll_wait_old +eventfd 318 +eventfd2 323 +exec_with_loader +execv +execve 11 +execveat 354 +exit 1 +exit_group 248 +faccessat 300 +faccessat2 439 +fadvise64 253 +fadvise64_64 264 +fallocate 314 +fanotify_init 332 +fanotify_mark 333 +fchdir 133 +fchmod 94 +fchmodat 299 +fchmodat2 452 +fchown 95 +fchown32 207 +fchownat 291 +fcntl 55 +fcntl64 221 +fdatasync 148 +fgetxattr 229 +finit_module 344 +flistxattr 232 +flock 143 +fork 2 +fremovexattr 235 +fsconfig 431 +fsetxattr 226 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 197 +fstatat64 293 +fstatfs 100 +fstatfs64 266 +fsync 118 +ftruncate 93 +ftruncate64 194 +futex 238 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 292 +get_mempolicy 269 +get_robust_list 305 +get_thread_area +getcpu 311 +getcwd 183 +getdents 141 +getdents64 220 +getdomainname +getdtablesize +getegid 50 +getegid32 202 +geteuid 49 +geteuid32 201 +getgid 47 +getgid32 200 +getgroups 80 +getgroups32 205 +gethostname +getitimer 105 +getpagesize +getpeername 368 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 349 +getresgid 171 +getresgid32 211 +getresuid 165 +getresuid32 209 +getrlimit 76 +getrusage 77 +getsid 147 +getsockname 367 +getsockopt 365 +gettid 236 +gettimeofday 78 +getuid 24 +getuid32 199 +getxattr 227 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 285 +inotify_init 284 +inotify_init1 324 +inotify_rm_watch 286 +io_cancel 247 +io_destroy 244 +io_getevents 245 +io_pgetevents 382 +io_pgetevents_time64 416 +io_setup 243 +io_submit 246 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm 101 +iopl +ioprio_get 283 +ioprio_set 
282 +ipc 117 +kcmp 343 +kern_features +kexec_file_load 381 +kexec_load 277 +keyctl 280 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 198 +lgetxattr 228 +link 9 +linkat 296 +listen 363 +listxattr 230 +llistxattr 231 +lookup_dcookie 110 +lremovexattr 234 +lseek 19 +lsetxattr 225 +lstat 107 +lstat64 196 +madvise 219 +map_shadow_stack 453 +mbind 268 +membarrier 356 +memfd_create 350 +memfd_secret 447 +memory_ordering +migrate_pages 287 +mincore 218 +mkdir 39 +mkdirat 289 +mknod 14 +mknodat 290 +mlock 150 +mlock2 374 +mlockall 152 +mmap 90 +mmap2 192 +modify_ldt +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 310 +mprotect 125 +mq_getsetattr 276 +mq_notify 275 +mq_open 271 +mq_timedreceive 274 +mq_timedreceive_time64 419 +mq_timedsend 273 +mq_timedsend_time64 418 +mq_unlink 272 +mremap 163 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 144 +multiplexer +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 335 +nanosleep 162 +newfstatat +nice 34 +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 5 +open_by_handle_at 336 +open_tree 428 +openat 288 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 331 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 325 +pivot_root 217 +pkey_alloc 385 +pkey_free 386 +pkey_mprotect 384 +poll 168 +ppoll 302 +ppoll_time64 414 +prctl 172 +pread64 180 +preadv 328 +preadv2 376 +prlimit64 334 
+process_madvise 440 +process_mrelease 448 +process_vm_readv 340 +process_vm_writev 341 +pselect6 301 +pselect6_time64 413 +ptrace 26 +pwrite64 181 +pwritev 329 +pwritev2 377 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 222 +readdir 89 +readlink 85 +readlinkat 298 +readv 145 +reboot 88 +recv +recvfrom 371 +recvmmsg 357 +recvmmsg_time64 417 +recvmsg 372 +remap_file_pages 267 +removexattr 233 +rename 38 +renameat 295 +renameat2 347 +request_key 279 +restart_syscall 7 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 383 +rt_sigaction 174 +rt_sigpending 176 +rt_sigprocmask 175 +rt_sigqueueinfo 178 +rt_sigreturn 173 +rt_sigsuspend 179 +rt_sigtimedwait 177 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 330 +rtas +s390_guarded_storage 378 +s390_pci_mmio_read 353 +s390_pci_mmio_write 352 +s390_runtime_instr 342 +s390_sthyi 380 +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 240 +sched_getattr 346 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 423 +sched_set_affinity +sched_setaffinity 239 +sched_setattr 345 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 348 +select +semctl 394 +semget 393 +semop +semtimedop +semtimedop_time64 420 +send +sendfile 187 +sendfile64 223 +sendmmsg 358 +sendmsg 370 +sendto 369 +set_mempolicy 270 +set_mempolicy_home_node 450 +set_robust_list 304 +set_thread_area +set_tid_address 252 +setdomainname 121 +setfsgid 139 +setfsgid32 216 +setfsuid 138 +setfsuid32 215 +setgid 46 +setgid32 214 +setgroups 81 +setgroups32 206 +sethae +sethostname 74 +setitimer 104 +setns 339 +setpgid 57 +setpgrp +setpriority 97 +setregid 71 +setregid32 204 +setresgid 170 +setresgid32 210 +setresuid 164 +setresuid32 208 +setreuid 70 +setreuid32 203 +setrlimit 75 +setsid 66 +setsockopt 366 +settimeofday 79 +setuid 23 +setuid32 213 +setxattr 224 +sgetmask +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 373 +sigaction 67 +sigaltstack 186 
+signal 48 +signalfd 316 +signalfd4 322 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 359 +socketcall 102 +socketpair 360 +splice 306 +spu_create +spu_run +ssetmask +stat 106 +stat64 195 +statfs 99 +statfs64 265 +statx 379 +stime 25 +subpage_prot +swapcontext +swapoff 115 +swapon 87 +switch_endian +symlink 83 +symlinkat 297 +sync 36 +sync_file_range 307 +sync_file_range2 +syncfs 338 +sys_debug_setcontext +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 308 +tgkill 241 +time 13 +timer_create 254 +timer_delete 258 +timer_getoverrun 257 +timer_gettime 256 +timer_gettime64 408 +timer_settime 255 +timer_settime64 409 +timerfd 317 +timerfd_create 319 +timerfd_gettime 321 +timerfd_gettime64 410 +timerfd_settime 320 +timerfd_settime64 411 +times 43 +tkill 237 +truncate 92 +truncate64 193 +ugetrlimit 191 +umask 60 +umount 22 +umount2 52 +uname 122 +unlink 10 +unlinkat 294 +unshare 303 +userfaultfd 355 +ustat 62 +utime 30 +utimensat 315 +utimensat_time64 412 +utimes 313 +utrap_install +vfork 190 +vhangup 111 +vm86 +vm86old +vmsplice 309 +wait4 114 +waitid 281 +waitpid +write 4 +writev 146 diff --git a/src/basic/syscalls-s390x.txt b/src/basic/syscalls-s390x.txt new file mode 100644 index 0000000..c999fd6 --- /dev/null +++ b/src/basic/syscalls-s390x.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept +accept4 364 +access 33 +acct 51 +add_key 278 +adjtimex 124 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 361 +bpf 351 +brk 45 +cachectl +cacheflush +cachestat 451 +capget 184 +capset 185 +chdir 12 +chmod 15 +chown 212 +chown32 +chroot 61 +clock_adjtime 337 +clock_adjtime64 +clock_getres 261 +clock_getres_time64 +clock_gettime 260 +clock_gettime64 +clock_nanosleep 262 +clock_nanosleep_time64 +clock_settime 259 +clock_settime64 +clone 120 +clone3 435 +close 6 +close_range 436 +connect 362 +copy_file_range 375 +creat 8 +delete_module 129 +dipc +dup 41 +dup2 63 +dup3 326 
+epoll_create 249 +epoll_create1 327 +epoll_ctl 250 +epoll_ctl_old +epoll_pwait 312 +epoll_pwait2 441 +epoll_wait 251 +epoll_wait_old +eventfd 318 +eventfd2 323 +exec_with_loader +execv +execve 11 +execveat 354 +exit 1 +exit_group 248 +faccessat 300 +faccessat2 439 +fadvise64 253 +fadvise64_64 +fallocate 314 +fanotify_init 332 +fanotify_mark 333 +fchdir 133 +fchmod 94 +fchmodat 299 +fchmodat2 452 +fchown 207 +fchown32 +fchownat 291 +fcntl 55 +fcntl64 +fdatasync 148 +fgetxattr 229 +finit_module 344 +flistxattr 232 +flock 143 +fork 2 +fremovexattr 235 +fsconfig 431 +fsetxattr 226 +fsmount 432 +fsopen 430 +fspick 433 +fstat 108 +fstat64 +fstatat64 +fstatfs 100 +fstatfs64 266 +fsync 118 +ftruncate 93 +ftruncate64 +futex 238 +futex_requeue 456 +futex_time64 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 292 +get_mempolicy 269 +get_robust_list 305 +get_thread_area +getcpu 311 +getcwd 183 +getdents 141 +getdents64 220 +getdomainname +getdtablesize +getegid 202 +getegid32 +geteuid 201 +geteuid32 +getgid 200 +getgid32 +getgroups 205 +getgroups32 +gethostname +getitimer 105 +getpagesize +getpeername 368 +getpgid 132 +getpgrp 65 +getpid 20 +getppid 64 +getpriority 96 +getrandom 349 +getresgid 211 +getresgid32 +getresuid 209 +getresuid32 +getrlimit 191 +getrusage 77 +getsid 147 +getsockname 367 +getsockopt 365 +gettid 236 +gettimeofday 78 +getuid 199 +getuid32 +getxattr 227 +getxgid +getxpid +getxuid +init_module 128 +inotify_add_watch 285 +inotify_init 284 +inotify_init1 324 +inotify_rm_watch 286 +io_cancel 247 +io_destroy 244 +io_getevents 245 +io_pgetevents 382 +io_pgetevents_time64 +io_setup 243 +io_submit 246 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm +iopl +ioprio_get 283 +ioprio_set 282 +ipc 117 +kcmp 343 +kern_features +kexec_file_load 381 +kexec_load 277 +keyctl 280 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 198 +lchown32 +lgetxattr 228 +link 9 +linkat 296 +listen 
363 +listxattr 230 +llistxattr 231 +lookup_dcookie 110 +lremovexattr 234 +lseek 19 +lsetxattr 225 +lstat 107 +lstat64 +madvise 219 +map_shadow_stack 453 +mbind 268 +membarrier 356 +memfd_create 350 +memfd_secret 447 +memory_ordering +migrate_pages 287 +mincore 218 +mkdir 39 +mkdirat 289 +mknod 14 +mknodat 290 +mlock 150 +mlock2 374 +mlockall 152 +mmap 90 +mmap2 +modify_ldt +mount 21 +mount_setattr 442 +move_mount 429 +move_pages 310 +mprotect 125 +mq_getsetattr 276 +mq_notify 275 +mq_open 271 +mq_timedreceive 274 +mq_timedreceive_time64 +mq_timedsend 273 +mq_timedsend_time64 +mq_unlink 272 +mremap 163 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 144 +multiplexer +munlock 151 +munlockall 153 +munmap 91 +name_to_handle_at 335 +nanosleep 162 +newfstatat 293 +nice 34 +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 5 +open_by_handle_at 336 +open_tree 428 +openat 288 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 331 +perfctr +personality 136 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 325 +pivot_root 217 +pkey_alloc 385 +pkey_free 386 +pkey_mprotect 384 +poll 168 +ppoll 302 +ppoll_time64 +prctl 172 +pread64 180 +preadv 328 +preadv2 376 +prlimit64 334 +process_madvise 440 +process_mrelease 448 +process_vm_readv 340 +process_vm_writev 341 +pselect6 301 +pselect6_time64 +ptrace 26 +pwrite64 181 +pwritev 329 +pwritev2 377 +quotactl 131 +quotactl_fd 443 +read 3 +readahead 222 +readdir 89 +readlink 85 
+readlinkat 298 +readv 145 +reboot 88 +recv +recvfrom 371 +recvmmsg 357 +recvmmsg_time64 +recvmsg 372 +remap_file_pages 267 +removexattr 233 +rename 38 +renameat 295 +renameat2 347 +request_key 279 +restart_syscall 7 +riscv_flush_icache +riscv_hwprobe +rmdir 40 +rseq 383 +rt_sigaction 174 +rt_sigpending 176 +rt_sigprocmask 175 +rt_sigqueueinfo 178 +rt_sigreturn 173 +rt_sigsuspend 179 +rt_sigtimedwait 177 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 330 +rtas +s390_guarded_storage 378 +s390_pci_mmio_read 353 +s390_pci_mmio_write 352 +s390_runtime_instr 342 +s390_sthyi 380 +sched_get_affinity +sched_get_priority_max 159 +sched_get_priority_min 160 +sched_getaffinity 240 +sched_getattr 346 +sched_getparam 155 +sched_getscheduler 157 +sched_rr_get_interval 161 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 239 +sched_setattr 345 +sched_setparam 154 +sched_setscheduler 156 +sched_yield 158 +seccomp 348 +select 142 +semctl 394 +semget 393 +semop +semtimedop 392 +semtimedop_time64 +send +sendfile 187 +sendfile64 +sendmmsg 358 +sendmsg 370 +sendto 369 +set_mempolicy 270 +set_mempolicy_home_node 450 +set_robust_list 304 +set_thread_area +set_tid_address 252 +setdomainname 121 +setfsgid 216 +setfsgid32 +setfsuid 215 +setfsuid32 +setgid 214 +setgid32 +setgroups 206 +setgroups32 +sethae +sethostname 74 +setitimer 104 +setns 339 +setpgid 57 +setpgrp +setpriority 97 +setregid 204 +setregid32 +setresgid 210 +setresgid32 +setresuid 208 +setresuid32 +setreuid 203 +setreuid32 +setrlimit 75 +setsid 66 +setsockopt 366 +settimeofday 79 +setuid 213 +setuid32 +setxattr 224 +sgetmask +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 373 +sigaction 67 +sigaltstack 186 +signal 48 +signalfd 316 +signalfd4 322 +sigpending 73 +sigprocmask 126 +sigreturn 119 +sigsuspend 72 +socket 359 +socketcall 102 +socketpair 360 +splice 306 +spu_create +spu_run +ssetmask +stat 106 +stat64 +statfs 99 +statfs64 265 +statx 379 +stime +subpage_prot +swapcontext +swapoff 115 +swapon 87 
+switch_endian +symlink 83 +symlinkat 297 +sync 36 +sync_file_range 307 +sync_file_range2 +syncfs 338 +sys_debug_setcontext +syscall +sysfs 135 +sysinfo 116 +syslog 103 +sysmips +tee 308 +tgkill 241 +time +timer_create 254 +timer_delete 258 +timer_getoverrun 257 +timer_gettime 256 +timer_gettime64 +timer_settime 255 +timer_settime64 +timerfd 317 +timerfd_create 319 +timerfd_gettime 321 +timerfd_gettime64 +timerfd_settime 320 +timerfd_settime64 +times 43 +tkill 237 +truncate 92 +truncate64 +ugetrlimit +umask 60 +umount 22 +umount2 52 +uname 122 +unlink 10 +unlinkat 294 +unshare 303 +userfaultfd 355 +ustat 62 +utime 30 +utimensat 315 +utimensat_time64 +utimes 313 +utrap_install +vfork 190 +vhangup 111 +vm86 +vm86old +vmsplice 309 +wait4 114 +waitid 281 +waitpid +write 4 +writev 146 diff --git a/src/basic/syscalls-sparc.txt b/src/basic/syscalls-sparc.txt new file mode 100644 index 0000000..e631d30 --- /dev/null +++ b/src/basic/syscalls-sparc.txt @@ -0,0 +1,515 @@ +_llseek 236 +_newselect 230 +accept 99 +accept4 323 +access 33 +acct 51 +add_key 281 +adjtimex 219 +alarm 27 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 353 +bpf 349 +brk 17 +cachectl +cacheflush +cachestat 451 +capget 21 +capset 22 +chdir 12 +chmod 15 +chown 13 +chown32 35 +chroot 61 +clock_adjtime 334 +clock_adjtime64 405 +clock_getres 258 +clock_getres_time64 406 +clock_gettime 257 +clock_gettime64 403 +clock_nanosleep 259 +clock_nanosleep_time64 407 +clock_settime 256 +clock_settime64 404 +clone 217 +clone3 +close 6 +close_range 436 +connect 98 +copy_file_range 357 +creat 8 +delete_module 222 +dipc +dup 41 +dup2 90 +dup3 320 +epoll_create 193 +epoll_create1 319 +epoll_ctl 194 +epoll_ctl_old +epoll_pwait 309 +epoll_pwait2 441 +epoll_wait 195 +epoll_wait_old +eventfd 313 +eventfd2 318 +exec_with_loader +execv 11 +execve 59 +execveat 350 +exit 1 +exit_group 188 +faccessat 296 +faccessat2 439 +fadvise64 209 +fadvise64_64 210 +fallocate 314 
+fanotify_init 329 +fanotify_mark 330 +fchdir 176 +fchmod 124 +fchmodat 295 +fchmodat2 452 +fchown 123 +fchown32 32 +fchownat 287 +fcntl 92 +fcntl64 155 +fdatasync 253 +fgetxattr 177 +finit_module 342 +flistxattr 180 +flock 131 +fork 2 +fremovexattr 186 +fsconfig 431 +fsetxattr 171 +fsmount 432 +fsopen 430 +fspick 433 +fstat 62 +fstat64 63 +fstatat64 289 +fstatfs 158 +fstatfs64 235 +fsync 95 +ftruncate 130 +ftruncate64 84 +futex 142 +futex_requeue 456 +futex_time64 422 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 288 +get_mempolicy 304 +get_robust_list 301 +get_thread_area +getcpu 308 +getcwd 119 +getdents 174 +getdents64 154 +getdomainname 162 +getdtablesize +getegid 50 +getegid32 70 +geteuid 49 +geteuid32 69 +getgid 47 +getgid32 53 +getgroups 79 +getgroups32 115 +gethostname +getitimer 86 +getpagesize 64 +getpeername 141 +getpgid 224 +getpgrp 81 +getpid 20 +getppid 197 +getpriority 100 +getrandom 347 +getresgid +getresgid32 111 +getresuid +getresuid32 109 +getrlimit 144 +getrusage 117 +getsid 252 +getsockname 150 +getsockopt 118 +gettid 143 +gettimeofday 116 +getuid 24 +getuid32 44 +getxattr 172 +getxgid +getxpid +getxuid +init_module 190 +inotify_add_watch 152 +inotify_init 151 +inotify_init1 322 +inotify_rm_watch 156 +io_cancel 271 +io_destroy 269 +io_getevents 272 +io_pgetevents 361 +io_pgetevents_time64 416 +io_setup 268 +io_submit 270 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 54 +ioperm +iopl +ioprio_get 218 +ioprio_set 196 +ipc 215 +kcmp 341 +kern_features 340 +kexec_file_load +kexec_load 306 +keyctl 283 +kill 37 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 16 +lchown32 31 +lgetxattr 173 +link 9 +linkat 292 +listen 354 +listxattr 178 +llistxattr 179 +lookup_dcookie 208 +lremovexattr 182 +lseek 19 +lsetxattr 170 +lstat 40 +lstat64 132 +madvise 75 +map_shadow_stack 453 +mbind 303 +membarrier 351 +memfd_create 348 +memfd_secret +memory_ordering +migrate_pages 302 +mincore 78 
+mkdir 136 +mkdirat 285 +mknod 14 +mknodat 286 +mlock 237 +mlock2 356 +mlockall 239 +mmap 71 +mmap2 56 +modify_ldt +mount 167 +mount_setattr 442 +move_mount 429 +move_pages 307 +mprotect 74 +mq_getsetattr 278 +mq_notify 277 +mq_open 273 +mq_timedreceive 276 +mq_timedreceive_time64 419 +mq_timedsend 275 +mq_timedsend_time64 418 +mq_unlink 274 +mremap 250 +msgctl 402 +msgget 399 +msgrcv 401 +msgsnd 400 +msync 65 +multiplexer +munlock 238 +munlockall 240 +munmap 73 +name_to_handle_at 332 +nanosleep 249 +newfstatat +nice 34 +old_adjtimex +oldfstat +oldlstat 202 +oldolduname +oldstat +oldumount +olduname +open 5 +open_by_handle_at 333 +open_tree 428 +openat 284 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 29 +pciconfig_iobase +pciconfig_read 148 +pciconfig_write 149 +perf_event_open 327 +perfctr 18 +personality 191 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 42 +pipe2 321 +pivot_root 146 +pkey_alloc 363 +pkey_free 364 +pkey_mprotect 362 +poll 153 +ppoll 298 +ppoll_time64 414 +prctl 147 +pread64 67 +preadv 324 +preadv2 358 +prlimit64 331 +process_madvise 440 +process_mrelease 448 +process_vm_readv 338 +process_vm_writev 339 +pselect6 297 +pselect6_time64 413 +ptrace 26 +pwrite64 68 +pwritev 325 +pwritev2 359 +quotactl 165 +quotactl_fd 443 +read 3 +readahead 205 +readdir 204 +readlink 58 +readlinkat 294 +readv 120 +reboot 55 +recv +recvfrom 125 +recvmmsg 328 +recvmmsg_time64 417 +recvmsg 113 +remap_file_pages 192 +removexattr 181 +rename 128 +renameat 291 +renameat2 345 +request_key 282 +restart_syscall 0 +riscv_flush_icache 
+riscv_hwprobe +rmdir 137 +rseq 365 +rt_sigaction 102 +rt_sigpending 104 +rt_sigprocmask 103 +rt_sigqueueinfo 106 +rt_sigreturn 101 +rt_sigsuspend 107 +rt_sigtimedwait 105 +rt_sigtimedwait_time64 421 +rt_tgsigqueueinfo 326 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write +s390_runtime_instr +s390_sthyi +sched_get_affinity 161 +sched_get_priority_max 246 +sched_get_priority_min 247 +sched_getaffinity 260 +sched_getattr 344 +sched_getparam 242 +sched_getscheduler 244 +sched_rr_get_interval 248 +sched_rr_get_interval_time64 423 +sched_set_affinity 160 +sched_setaffinity 261 +sched_setattr 343 +sched_setparam 241 +sched_setscheduler 243 +sched_yield 245 +seccomp 346 +select 93 +semctl 394 +semget 393 +semop +semtimedop +semtimedop_time64 420 +send +sendfile 39 +sendfile64 140 +sendmmsg 336 +sendmsg 114 +sendto 133 +set_mempolicy 305 +set_mempolicy_home_node 450 +set_robust_list 300 +set_thread_area +set_tid_address 166 +setdomainname 163 +setfsgid 229 +setfsgid32 94 +setfsuid 228 +setfsuid32 91 +setgid 46 +setgid32 89 +setgroups 80 +setgroups32 82 +sethae +sethostname 88 +setitimer 83 +setns 337 +setpgid 185 +setpgrp +setpriority 96 +setregid 127 +setregid32 112 +setresgid +setresgid32 110 +setresuid +setresuid32 108 +setreuid 126 +setreuid32 72 +setrlimit 145 +setsid 175 +setsockopt 355 +settimeofday 122 +setuid 23 +setuid32 87 +setxattr 169 +sgetmask 199 +shmat 397 +shmctl 396 +shmdt 398 +shmget 395 +shutdown 134 +sigaction 198 +sigaltstack 28 +signal 48 +signalfd 311 +signalfd4 317 +sigpending 183 +sigprocmask 220 +sigreturn 216 +sigsuspend 201 +socket 97 +socketcall 206 +socketpair 135 +splice 232 +spu_create +spu_run +ssetmask 200 +stat 38 +stat64 139 +statfs 157 +statfs64 234 +statx 360 +stime 233 +subpage_prot +swapcontext +swapoff 213 +swapon 85 +switch_endian +symlink 57 +symlinkat 293 +sync 36 +sync_file_range 255 +sync_file_range2 +syncfs 335 +sys_debug_setcontext +syscall +sysfs 226 +sysinfo 214 +syslog 207 +sysmips +tee 280 +tgkill 211 
+time 231 +timer_create 266 +timer_delete 265 +timer_getoverrun 264 +timer_gettime 263 +timer_gettime64 408 +timer_settime 262 +timer_settime64 409 +timerfd +timerfd_create 312 +timerfd_gettime 316 +timerfd_gettime64 410 +timerfd_settime 315 +timerfd_settime64 411 +times 43 +tkill 187 +truncate 129 +truncate64 77 +ugetrlimit +umask 60 +umount 159 +umount2 45 +uname 189 +unlink 10 +unlinkat 290 +unshare 299 +userfaultfd 352 +ustat 168 +utime 30 +utimensat 310 +utimensat_time64 412 +utimes 138 +utrap_install +vfork 66 +vhangup 76 +vm86 +vm86old +vmsplice 25 +wait4 7 +waitid 279 +waitpid 212 +write 4 +writev 121 diff --git a/src/basic/syscalls-x86_64.txt b/src/basic/syscalls-x86_64.txt new file mode 100644 index 0000000..52d6176 --- /dev/null +++ b/src/basic/syscalls-x86_64.txt @@ -0,0 +1,515 @@ +_llseek +_newselect +accept 43 +accept4 288 +access 21 +acct 163 +add_key 248 +adjtimex 159 +alarm 37 +arc_gettls +arc_settls +arc_usr_cmpxchg +arch_prctl 158 +arm_fadvise64_64 +atomic_barrier +atomic_cmpxchg_32 +bind 49 +bpf 321 +brk 12 +cachectl +cacheflush +cachestat 451 +capget 125 +capset 126 +chdir 80 +chmod 90 +chown 92 +chown32 +chroot 161 +clock_adjtime 305 +clock_adjtime64 +clock_getres 229 +clock_getres_time64 +clock_gettime 228 +clock_gettime64 +clock_nanosleep 230 +clock_nanosleep_time64 +clock_settime 227 +clock_settime64 +clone 56 +clone3 435 +close 3 +close_range 436 +connect 42 +copy_file_range 326 +creat 85 +delete_module 176 +dipc +dup 32 +dup2 33 +dup3 292 +epoll_create 213 +epoll_create1 291 +epoll_ctl 233 +epoll_ctl_old 214 +epoll_pwait 281 +epoll_pwait2 441 +epoll_wait 232 +epoll_wait_old 215 +eventfd 284 +eventfd2 290 +exec_with_loader +execv +execve 59 +execveat 322 +exit 60 +exit_group 231 +faccessat 269 +faccessat2 439 +fadvise64 221 +fadvise64_64 +fallocate 285 +fanotify_init 300 +fanotify_mark 301 +fchdir 81 +fchmod 91 +fchmodat 268 +fchmodat2 452 +fchown 93 +fchown32 +fchownat 260 +fcntl 72 +fcntl64 +fdatasync 75 +fgetxattr 193 +finit_module 313 
+flistxattr 196 +flock 73 +fork 57 +fremovexattr 199 +fsconfig 431 +fsetxattr 190 +fsmount 432 +fsopen 430 +fspick 433 +fstat 5 +fstat64 +fstatat64 +fstatfs 138 +fstatfs64 +fsync 74 +ftruncate 77 +ftruncate64 +futex 202 +futex_requeue 456 +futex_time64 +futex_wait 455 +futex_waitv 449 +futex_wake 454 +futimesat 261 +get_mempolicy 239 +get_robust_list 274 +get_thread_area 211 +getcpu 309 +getcwd 79 +getdents 78 +getdents64 217 +getdomainname +getdtablesize +getegid 108 +getegid32 +geteuid 107 +geteuid32 +getgid 104 +getgid32 +getgroups 115 +getgroups32 +gethostname +getitimer 36 +getpagesize +getpeername 52 +getpgid 121 +getpgrp 111 +getpid 39 +getppid 110 +getpriority 140 +getrandom 318 +getresgid 120 +getresgid32 +getresuid 118 +getresuid32 +getrlimit 97 +getrusage 98 +getsid 124 +getsockname 51 +getsockopt 55 +gettid 186 +gettimeofday 96 +getuid 102 +getuid32 +getxattr 191 +getxgid +getxpid +getxuid +init_module 175 +inotify_add_watch 254 +inotify_init 253 +inotify_init1 294 +inotify_rm_watch 255 +io_cancel 210 +io_destroy 207 +io_getevents 208 +io_pgetevents 333 +io_pgetevents_time64 +io_setup 206 +io_submit 209 +io_uring_enter 426 +io_uring_register 427 +io_uring_setup 425 +ioctl 16 +ioperm 173 +iopl 172 +ioprio_get 252 +ioprio_set 251 +ipc +kcmp 312 +kern_features +kexec_file_load 320 +kexec_load 246 +keyctl 250 +kill 62 +landlock_add_rule 445 +landlock_create_ruleset 444 +landlock_restrict_self 446 +lchown 94 +lchown32 +lgetxattr 192 +link 86 +linkat 265 +listen 50 +listxattr 194 +llistxattr 195 +lookup_dcookie 212 +lremovexattr 198 +lseek 8 +lsetxattr 189 +lstat 6 +lstat64 +madvise 28 +map_shadow_stack 453 +mbind 237 +membarrier 324 +memfd_create 319 +memfd_secret 447 +memory_ordering +migrate_pages 256 +mincore 27 +mkdir 83 +mkdirat 258 +mknod 133 +mknodat 259 +mlock 149 +mlock2 325 +mlockall 151 +mmap 9 +mmap2 +modify_ldt 154 +mount 165 +mount_setattr 442 +move_mount 429 +move_pages 279 +mprotect 10 +mq_getsetattr 245 +mq_notify 244 +mq_open 240 
+mq_timedreceive 243 +mq_timedreceive_time64 +mq_timedsend 242 +mq_timedsend_time64 +mq_unlink 241 +mremap 25 +msgctl 71 +msgget 68 +msgrcv 70 +msgsnd 69 +msync 26 +multiplexer +munlock 150 +munlockall 152 +munmap 11 +name_to_handle_at 303 +nanosleep 35 +newfstatat 262 +nice +old_adjtimex +oldfstat +oldlstat +oldolduname +oldstat +oldumount +olduname +open 2 +open_by_handle_at 304 +open_tree 428 +openat 257 +openat2 437 +or1k_atomic +osf_fstat +osf_fstatfs +osf_fstatfs64 +osf_getdirentries +osf_getdomainname +osf_getitimer +osf_getrusage +osf_getsysinfo +osf_gettimeofday +osf_lstat +osf_mount +osf_proplist_syscall +osf_select +osf_set_program_attributes +osf_setitimer +osf_setsysinfo +osf_settimeofday +osf_shmat +osf_sigprocmask +osf_sigstack +osf_stat +osf_statfs +osf_statfs64 +osf_swapon +osf_syscall +osf_sysinfo +osf_usleep_thread +osf_utimes +osf_utsname +osf_wait4 +pause 34 +pciconfig_iobase +pciconfig_read +pciconfig_write +perf_event_open 298 +perfctr +personality 135 +pidfd_getfd 438 +pidfd_open 434 +pidfd_send_signal 424 +pipe 22 +pipe2 293 +pivot_root 155 +pkey_alloc 330 +pkey_free 331 +pkey_mprotect 329 +poll 7 +ppoll 271 +ppoll_time64 +prctl 157 +pread64 17 +preadv 295 +preadv2 327 +prlimit64 302 +process_madvise 440 +process_mrelease 448 +process_vm_readv 310 +process_vm_writev 311 +pselect6 270 +pselect6_time64 +ptrace 101 +pwrite64 18 +pwritev 296 +pwritev2 328 +quotactl 179 +quotactl_fd 443 +read 0 +readahead 187 +readdir +readlink 89 +readlinkat 267 +readv 19 +reboot 169 +recv +recvfrom 45 +recvmmsg 299 +recvmmsg_time64 +recvmsg 47 +remap_file_pages 216 +removexattr 197 +rename 82 +renameat 264 +renameat2 316 +request_key 249 +restart_syscall 219 +riscv_flush_icache +riscv_hwprobe +rmdir 84 +rseq 334 +rt_sigaction 13 +rt_sigpending 127 +rt_sigprocmask 14 +rt_sigqueueinfo 129 +rt_sigreturn 15 +rt_sigsuspend 130 +rt_sigtimedwait 128 +rt_sigtimedwait_time64 +rt_tgsigqueueinfo 297 +rtas +s390_guarded_storage +s390_pci_mmio_read +s390_pci_mmio_write 
+s390_runtime_instr +s390_sthyi +sched_get_affinity +sched_get_priority_max 146 +sched_get_priority_min 147 +sched_getaffinity 204 +sched_getattr 315 +sched_getparam 143 +sched_getscheduler 145 +sched_rr_get_interval 148 +sched_rr_get_interval_time64 +sched_set_affinity +sched_setaffinity 203 +sched_setattr 314 +sched_setparam 142 +sched_setscheduler 144 +sched_yield 24 +seccomp 317 +select 23 +semctl 66 +semget 64 +semop 65 +semtimedop 220 +semtimedop_time64 +send +sendfile 40 +sendfile64 +sendmmsg 307 +sendmsg 46 +sendto 44 +set_mempolicy 238 +set_mempolicy_home_node 450 +set_robust_list 273 +set_thread_area 205 +set_tid_address 218 +setdomainname 171 +setfsgid 123 +setfsgid32 +setfsuid 122 +setfsuid32 +setgid 106 +setgid32 +setgroups 116 +setgroups32 +sethae +sethostname 170 +setitimer 38 +setns 308 +setpgid 109 +setpgrp +setpriority 141 +setregid 114 +setregid32 +setresgid 119 +setresgid32 +setresuid 117 +setresuid32 +setreuid 113 +setreuid32 +setrlimit 160 +setsid 112 +setsockopt 54 +settimeofday 164 +setuid 105 +setuid32 +setxattr 188 +sgetmask +shmat 30 +shmctl 31 +shmdt 67 +shmget 29 +shutdown 48 +sigaction +sigaltstack 131 +signal +signalfd 282 +signalfd4 289 +sigpending +sigprocmask +sigreturn +sigsuspend +socket 41 +socketcall +socketpair 53 +splice 275 +spu_create +spu_run +ssetmask +stat 4 +stat64 +statfs 137 +statfs64 +statx 332 +stime +subpage_prot +swapcontext +swapoff 168 +swapon 167 +switch_endian +symlink 88 +symlinkat 266 +sync 162 +sync_file_range 277 +sync_file_range2 +syncfs 306 +sys_debug_setcontext +syscall +sysfs 139 +sysinfo 99 +syslog 103 +sysmips +tee 276 +tgkill 234 +time 201 +timer_create 222 +timer_delete 226 +timer_getoverrun 225 +timer_gettime 224 +timer_gettime64 +timer_settime 223 +timer_settime64 +timerfd +timerfd_create 283 +timerfd_gettime 287 +timerfd_gettime64 +timerfd_settime 286 +timerfd_settime64 +times 100 +tkill 200 +truncate 76 +truncate64 +ugetrlimit +umask 95 +umount +umount2 166 +uname 63 +unlink 87 +unlinkat 263 
+unshare 272 +userfaultfd 323 +ustat 136 +utime 132 +utimensat 280 +utimensat_time64 +utimes 235 +utrap_install +vfork 58 +vhangup 153 +vm86 +vm86old +vmsplice 278 +wait4 61 +waitid 247 +waitpid +write 1 +writev 20 diff --git a/src/basic/sysctl-util.c b/src/basic/sysctl-util.c new file mode 100644 index 0000000..b66a662 --- /dev/null +++ b/src/basic/sysctl-util.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "af-list.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "sysctl-util.h" + +char *sysctl_normalize(char *s) { + char *n; + + n = strpbrk(s, "/."); + + /* If the first separator is a slash, the path is + * assumed to be normalized and slashes remain slashes + * and dots remains dots. */ + + if (n && *n == '.') + /* Dots become slashes and slashes become dots. Fun. */ + do { + if (*n == '.') + *n = '/'; + else + *n = '.'; + + n = strpbrk(n + 1, "/."); + } while (n); + + path_simplify(s); + + /* Kill the leading slash, but keep the first character of the string in the same place. */ + if (s[0] == '/' && s[1] != 0) + memmove(s, s+1, strlen(s)); + + return s; +} + +int sysctl_write(const char *property, const char *value) { + char *p; + + assert(property); + assert(value); + + p = strjoina("/proc/sys/", property); + + path_simplify(p); + if (!path_is_normalized(p)) + return -EINVAL; + + log_debug("Setting '%s' to '%s'", p, value); + + return write_string_file(p, value, WRITE_STRING_FILE_VERIFY_ON_FAILURE | WRITE_STRING_FILE_DISABLE_BUFFER | WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL); +} + +int sysctl_writef(const char *property, const char *format, ...) 
{ + _cleanup_free_ char *v = NULL; + va_list ap; + int r; + + va_start(ap, format); + r = vasprintf(&v, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return sysctl_write(property, v); +} + +int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value) { + const char *p; + + assert(property); + assert(value); + + if (!IN_SET(af, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + if (ifname) { + if (!ifname_valid_full(ifname, IFNAME_VALID_SPECIAL)) + return -EINVAL; + + p = strjoina("net/", af_to_ipv4_ipv6(af), "/conf/", ifname, "/", property); + } else + p = strjoina("net/", af_to_ipv4_ipv6(af), "/", property); + + return sysctl_write(p, value); +} + +int sysctl_read(const char *property, char **ret) { + char *p; + int r; + + assert(property); + + p = strjoina("/proc/sys/", property); + + path_simplify(p); + if (!path_is_normalized(p)) /* Filter out attempts to read from /proc/sys/../../…, just in case */ + return -EINVAL; + + r = read_full_virtual_file(p, ret, NULL); + if (r < 0) + return r; + if (ret) + delete_trailing_chars(*ret, NEWLINE); + + return r; +} + +int sysctl_read_ip_property(int af, const char *ifname, const char *property, char **ret) { + const char *p; + + assert(property); + + if (!IN_SET(af, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + if (ifname) { + if (!ifname_valid_full(ifname, IFNAME_VALID_SPECIAL)) + return -EINVAL; + + p = strjoina("net/", af_to_ipv4_ipv6(af), "/conf/", ifname, "/", property); + } else + p = strjoina("net/", af_to_ipv4_ipv6(af), "/", property); + + return sysctl_read(p, ret); +} diff --git a/src/basic/sysctl-util.h b/src/basic/sysctl-util.h new file mode 100644 index 0000000..3236419 --- /dev/null +++ b/src/basic/sysctl-util.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" +#include "stdio-util.h" +#include "string-util.h" + +char *sysctl_normalize(char *s); +int sysctl_read(const char 
*property, char **value); +int sysctl_write(const char *property, const char *value); +int sysctl_writef(const char *property, const char *format, ...) _printf_(2, 3); + +int sysctl_read_ip_property(int af, const char *ifname, const char *property, char **ret); +int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value); +static inline int sysctl_write_ip_property_boolean(int af, const char *ifname, const char *property, bool value) { + return sysctl_write_ip_property(af, ifname, property, one_zero(value)); +} + +#define DEFINE_SYSCTL_WRITE_IP_PROPERTY(name, type, format) \ + static inline int sysctl_write_ip_property_##name(int af, const char *ifname, const char *property, type value) { \ + char buf[DECIMAL_STR_MAX(type)]; \ + xsprintf(buf, format, value); \ + return sysctl_write_ip_property(af, ifname, property, buf); \ + } + +DEFINE_SYSCTL_WRITE_IP_PROPERTY(int, int, "%i"); +DEFINE_SYSCTL_WRITE_IP_PROPERTY(uint32, uint32_t, "%" PRIu32); diff --git a/src/basic/syslog-util.c b/src/basic/syslog-util.c new file mode 100644 index 0000000..0371922 --- /dev/null +++ b/src/basic/syslog-util.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-id128.h" + +#include "glob-util.h" +#include "hexdecoct.h" +#include "macro.h" +#include "path-util.h" +#include "string-table.h" +#include "syslog-util.h" +#include "unit-name.h" + +int syslog_parse_priority(const char **p, int *priority, bool with_facility) { + int a = 0, b = 0, c = 0; + const char *end; + size_t k; + + assert(p); + assert(*p); + assert(priority); + + if ((*p)[0] != '<') + return 0; + + end = strchr(*p, '>'); + if (!end) + return 0; + + k = end - *p; + assert(k > 0); + + if (k == 2) + c = undecchar((*p)[1]); + else if (k == 3) { + b = undecchar((*p)[1]); + c = undecchar((*p)[2]); + } else if (k == 4) { + a = undecchar((*p)[1]); + b = undecchar((*p)[2]); + c = undecchar((*p)[3]); + } else + return 0; + + if (a < 0 || b < 0 || c < 0 
|| + (!with_facility && (a || b || c > 7))) + return 0; + + if (with_facility) + *priority = a*100 + b*10 + c; + else + *priority = (*priority & LOG_FACMASK) | c; + + *p += k + 1; + return 1; +} + +static const char *const log_facility_unshifted_table[LOG_NFACILITIES] = { + [LOG_FAC(LOG_KERN)] = "kern", + [LOG_FAC(LOG_USER)] = "user", + [LOG_FAC(LOG_MAIL)] = "mail", + [LOG_FAC(LOG_DAEMON)] = "daemon", + [LOG_FAC(LOG_AUTH)] = "auth", + [LOG_FAC(LOG_SYSLOG)] = "syslog", + [LOG_FAC(LOG_LPR)] = "lpr", + [LOG_FAC(LOG_NEWS)] = "news", + [LOG_FAC(LOG_UUCP)] = "uucp", + [LOG_FAC(LOG_CRON)] = "cron", + [LOG_FAC(LOG_AUTHPRIV)] = "authpriv", + [LOG_FAC(LOG_FTP)] = "ftp", + [LOG_FAC(LOG_LOCAL0)] = "local0", + [LOG_FAC(LOG_LOCAL1)] = "local1", + [LOG_FAC(LOG_LOCAL2)] = "local2", + [LOG_FAC(LOG_LOCAL3)] = "local3", + [LOG_FAC(LOG_LOCAL4)] = "local4", + [LOG_FAC(LOG_LOCAL5)] = "local5", + [LOG_FAC(LOG_LOCAL6)] = "local6", + [LOG_FAC(LOG_LOCAL7)] = "local7", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(log_facility_unshifted, int, LOG_FAC(~0)); + +bool log_facility_unshifted_is_valid(int facility) { + return facility >= 0 && facility <= LOG_FAC(~0); +} + +static const char *const log_level_table[] = { + [LOG_EMERG] = "emerg", + [LOG_ALERT] = "alert", + [LOG_CRIT] = "crit", + [LOG_ERR] = "err", + [LOG_WARNING] = "warning", + [LOG_NOTICE] = "notice", + [LOG_INFO] = "info", + [LOG_DEBUG] = "debug", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(log_level, int, LOG_DEBUG); + +bool log_level_is_valid(int level) { + return level >= 0 && level <= LOG_DEBUG; +} + +/* The maximum size for a log namespace length. This is the file name size limit 255 minus the size of a + * formatted machine ID minus a separator char */ +#define LOG_NAMESPACE_MAX (NAME_MAX - (SD_ID128_STRING_MAX - 1) - 1) + +bool log_namespace_name_valid(const char *s) { + /* Let's make sure the namespace fits in a filename that is prefixed with the machine ID and a dot + * (so that /var/log/journal/. 
can be created based on it). Also make sure it + * is suitable as unit instance name, and does not contain fishy characters. */ + + if (!filename_is_valid(s)) + return false; + + if (strlen(s) > LOG_NAMESPACE_MAX) + return false; + + if (!unit_instance_is_valid(s)) + return false; + + if (!string_is_safe(s)) + return false; + + /* Let's avoid globbing for now */ + if (string_is_glob(s)) + return false; + + return true; +} diff --git a/src/basic/syslog-util.h b/src/basic/syslog-util.h new file mode 100644 index 0000000..d7aa97f --- /dev/null +++ b/src/basic/syslog-util.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int log_facility_unshifted_to_string_alloc(int i, char **s); +int log_facility_unshifted_from_string(const char *s); +bool log_facility_unshifted_is_valid(int facility); + +int log_level_to_string_alloc(int i, char **s); +int log_level_from_string(const char *s); +bool log_level_is_valid(int level); + +int syslog_parse_priority(const char **p, int *priority, bool with_facility); + +bool log_namespace_name_valid(const char *s); diff --git a/src/basic/terminal-util.c b/src/basic/terminal-util.c new file mode 100644 index 0000000..3355b74 --- /dev/null +++ b/src/basic/terminal-util.c @@ -0,0 +1,1553 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "constants.h" +#include "devnum-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "inotify-util.h" +#include "io-util.h" +#include "log.h" +#include "macro.h" +#include "namespace-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include 
"string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "time-util.h" +#include "user-util.h" + +static volatile unsigned cached_columns = 0; +static volatile unsigned cached_lines = 0; + +static volatile int cached_on_tty = -1; +static volatile int cached_on_dev_null = -1; +static volatile int cached_color_mode = _COLOR_INVALID; +static volatile int cached_underline_enabled = -1; + +int chvt(int vt) { + _cleanup_close_ int fd = -EBADF; + + /* Switch to the specified vt number. If the VT is specified <= 0 switch to the VT the kernel log messages go, + * if that's configured. */ + + fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return fd; /* open_terminal() already returns a negative errno-style error; errno itself may be stale here */ + + if (vt <= 0) { + int tiocl[2] = { + TIOCL_GETKMSGREDIRECT, + 0 + }; + + if (ioctl(fd, TIOCLINUX, tiocl) < 0) + return -errno; + + vt = tiocl[0] <= 0 ? 1 : tiocl[0]; + } + + return RET_NERRNO(ioctl(fd, VT_ACTIVATE, vt)); +} + +int read_one_char(FILE *f, char *ret, usec_t t, bool *need_nl) { + _cleanup_free_ char *line = NULL; + struct termios old_termios; + int r, fd; + + assert(f); + assert(ret); + + /* If this is a terminal, then switch canonical mode off, so that we can read a single + * character. (Note that fmemopen() streams do not have an fd associated with them, let's handle that + * nicely.) 
*/ + fd = fileno(f); + if (fd >= 0 && tcgetattr(fd, &old_termios) >= 0) { + struct termios new_termios = old_termios; + + new_termios.c_lflag &= ~ICANON; + new_termios.c_cc[VMIN] = 1; + new_termios.c_cc[VTIME] = 0; + + if (tcsetattr(fd, TCSADRAIN, &new_termios) >= 0) { + char c; + + if (t != USEC_INFINITY) { + if (fd_wait_for_event(fd, POLLIN, t) <= 0) { + (void) tcsetattr(fd, TCSADRAIN, &old_termios); + return -ETIMEDOUT; + } + } + + r = safe_fgetc(f, &c); + (void) tcsetattr(fd, TCSADRAIN, &old_termios); + if (r < 0) + return r; + if (r == 0) + return -EIO; + + if (need_nl) + *need_nl = c != '\n'; + + *ret = c; + return 0; + } + } + + if (t != USEC_INFINITY && fd >= 0) { + /* Let's wait the specified amount of time for input. When we have no fd we skip this, under + * the assumption that this is an fmemopen() stream or so where waiting doesn't make sense + * anyway, as the data is either already in the stream or cannot possibly be placed there + * while we access the stream */ + + if (fd_wait_for_event(fd, POLLIN, t) <= 0) + return -ETIMEDOUT; + } + + /* If this is not a terminal, then read a full line instead */ + + r = read_line(f, 16, &line); /* longer than necessary, to eat up UTF-8 chars/vt100 key sequences */ + if (r < 0) + return r; + if (r == 0) + return -EIO; + + if (strlen(line) != 1) + return -EBADMSG; + + if (need_nl) + *need_nl = false; + + *ret = line[0]; + return 0; +} + +#define DEFAULT_ASK_REFRESH_USEC (2*USEC_PER_SEC) + +int ask_char(char *ret, const char *replies, const char *fmt, ...) 
{ + int r; + + assert(ret); + assert(replies); + assert(fmt); + + for (;;) { + va_list ap; + char c; + bool need_nl = true; + + fputs(ansi_highlight(), stdout); + + putchar('\r'); + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + + fputs(ansi_normal(), stdout); + + fflush(stdout); + + r = read_one_char(stdin, &c, DEFAULT_ASK_REFRESH_USEC, &need_nl); + if (r < 0) { + + if (r == -ETIMEDOUT) + continue; + + if (r == -EBADMSG) { + puts("Bad input, please try again."); + continue; + } + + putchar('\n'); + return r; + } + + if (need_nl) + putchar('\n'); + + if (strchr(replies, c)) { + *ret = c; + return 0; + } + + puts("Read unexpected character, please try again."); + } +} + +int ask_string(char **ret, const char *text, ...) { + _cleanup_free_ char *line = NULL; + va_list ap; + int r; + + assert(ret); + assert(text); + + fputs(ansi_highlight(), stdout); + + va_start(ap, text); + vprintf(text, ap); + va_end(ap); + + fputs(ansi_normal(), stdout); + + fflush(stdout); + + r = read_line(stdin, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + return -EIO; + + *ret = TAKE_PTR(line); + return 0; +} + +int reset_terminal_fd(int fd, bool switch_to_text) { + struct termios termios; + int r; + + /* Set terminal to some sane defaults */ + + assert(fd >= 0); + + if (isatty(fd) < 1) + return log_debug_errno(errno, "Asked to reset a terminal that actually isn't a terminal: %m"); + + /* We leave locked terminal attributes untouched, so that Plymouth may set whatever it wants to set, + * and we don't interfere with that. 
*/ + + /* Disable exclusive mode (TIOCNXCL), just in case; a failure is only logged at debug level */ + if (ioctl(fd, TIOCNXCL) < 0) + log_debug_errno(errno, "TIOCNXCL ioctl failed on TTY, ignoring: %m"); + + /* Switch to text mode, but only if the caller asked for it; a failure is only logged */ + if (switch_to_text) + if (ioctl(fd, KDSETMODE, KD_TEXT) < 0) + log_debug_errno(errno, "KDSETMODE ioctl for switching to text mode failed on TTY, ignoring: %m"); + + + /* Set default keyboard mode; a failure here is not fatal either */ + r = vt_reset_keyboard(fd); + if (r < 0) + log_debug_errno(r, "Failed to reset VT keyboard, ignoring: %m"); + + if (tcgetattr(fd, &termios) < 0) { + r = log_debug_errno(errno, "Failed to get terminal parameters: %m"); + goto finish; + } + + /* We only reset the stuff that matters to the software. How + * hardware is set up we don't touch assuming that somebody + * else will do that for us. Note that c_lflag is assigned as a whole below, while c_iflag, c_oflag and c_cflag only get individual bits cleared or set. */ + + termios.c_iflag &= ~(IGNBRK | BRKINT | ISTRIP | INLCR | IGNCR | IUCLC); + termios.c_iflag |= ICRNL | IMAXBEL | IUTF8; + termios.c_oflag |= ONLCR | OPOST; + termios.c_cflag |= CREAD; + termios.c_lflag = ISIG | ICANON | IEXTEN | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE; + + termios.c_cc[VINTR] = 03; /* ^C */ + termios.c_cc[VQUIT] = 034; /* ^\ */ + termios.c_cc[VERASE] = 0177; /* DEL */ + termios.c_cc[VKILL] = 025; /* ^U */ + termios.c_cc[VEOF] = 04; /* ^D */ + termios.c_cc[VSTART] = 021; /* ^Q */ + termios.c_cc[VSTOP] = 023; /* ^S */ + termios.c_cc[VSUSP] = 032; /* ^Z */ + termios.c_cc[VLNEXT] = 026; /* ^V */ + termios.c_cc[VWERASE] = 027; /* ^W */ + termios.c_cc[VREPRINT] = 022; /* ^R */ + termios.c_cc[VEOL] = 0; + termios.c_cc[VEOL2] = 0; + + termios.c_cc[VTIME] = 0; + termios.c_cc[VMIN] = 1; + + r = RET_NERRNO(tcsetattr(fd, TCSANOW, &termios)); + +finish: + /* Just in case, flush all pending input and output (TCIOFLUSH). This is also reached via 'goto finish' when tcgetattr() failed, in which case r carries that error. */ + (void) tcflush(fd, TCIOFLUSH); + + return r; +} + +int reset_terminal(const char *name) { + _cleanup_close_ int fd = -EBADF; + + /* We open the terminal with O_NONBLOCK here, to ensure we + * don't block on carrier if this is a terminal with carrier + * configured.
*/ + + fd = open_terminal(name, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return fd; + + return reset_terminal_fd(fd, true); +} + +int open_terminal(const char *name, int mode) { + _cleanup_close_ int fd = -EBADF; + unsigned c = 0; + + /* + * If a TTY is in the process of being closed opening it might cause EIO. This is horribly awful, but + * unlikely to be changed in the kernel. Hence we work around this problem by retrying a couple of + * times. + * + * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/554172/comments/245 + */ + + if (mode & O_CREAT) + return -EINVAL; + + for (;;) { + fd = open(name, mode, 0); + if (fd >= 0) + break; + + if (errno != EIO) + return -errno; + + /* Max 1s in total */ + if (c >= 20) + return -errno; + + (void) usleep_safe(50 * USEC_PER_MSEC); + c++; + } + + if (isatty(fd) < 1) + return negative_errno(); + + return TAKE_FD(fd); +} + +int acquire_terminal( + const char *name, + AcquireTerminalFlags flags, + usec_t timeout) { + + _cleanup_close_ int notify = -EBADF, fd = -EBADF; + usec_t ts = USEC_INFINITY; + int r, wd = -1; + + assert(name); + assert(IN_SET(flags & ~ACQUIRE_TERMINAL_PERMISSIVE, ACQUIRE_TERMINAL_TRY, ACQUIRE_TERMINAL_FORCE, ACQUIRE_TERMINAL_WAIT)); + + /* We use inotify to be notified when the tty is closed. We create the watch before checking if we can actually + * acquire it, so that we don't lose any event. + * + * Note: strictly speaking this actually watches for the device being closed, it does *not* really watch + * whether a tty loses its controlling process. However, unless some rogue process uses TIOCNOTTY on /dev/tty + * *after* closing its tty otherwise this will not become a problem. As long as the administrator makes sure to + * not configure any service on the same tty as an untrusted user this should not be a problem. (Which they + * probably should not do anyway.) 
*/ + + if ((flags & ~ACQUIRE_TERMINAL_PERMISSIVE) == ACQUIRE_TERMINAL_WAIT) { + notify = inotify_init1(IN_CLOEXEC | (timeout != USEC_INFINITY ? IN_NONBLOCK : 0)); + if (notify < 0) + return -errno; + + wd = inotify_add_watch(notify, name, IN_CLOSE); + if (wd < 0) + return -errno; + + if (timeout != USEC_INFINITY) + ts = now(CLOCK_MONOTONIC); + } + + for (;;) { + struct sigaction sa_old, sa_new = { + .sa_handler = SIG_IGN, + .sa_flags = SA_RESTART, + }; + + if (notify >= 0) { + r = flush_fd(notify); + if (r < 0) + return r; + } + + /* We pass here O_NOCTTY only so that we can check the return value TIOCSCTTY and have a reliable way + * to figure out if we successfully became the controlling process of the tty */ + fd = open_terminal(name, O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return fd; + + /* Temporarily ignore SIGHUP, so that we don't get SIGHUP'ed if we already own the tty. */ + assert_se(sigaction(SIGHUP, &sa_new, &sa_old) == 0); + + /* First, try to get the tty */ + r = RET_NERRNO(ioctl(fd, TIOCSCTTY, (flags & ~ACQUIRE_TERMINAL_PERMISSIVE) == ACQUIRE_TERMINAL_FORCE)); + + /* Reset signal handler to old value */ + assert_se(sigaction(SIGHUP, &sa_old, NULL) == 0); + + /* Success? Exit the loop now! */ + if (r >= 0) + break; + + /* Any failure besides -EPERM? Fail, regardless of the mode. */ + if (r != -EPERM) + return r; + + if (flags & ACQUIRE_TERMINAL_PERMISSIVE) /* If we are in permissive mode, then EPERM is fine, turn this + * into a success. Note that EPERM is also returned if we + * already are the owner of the TTY. 
*/ + break; + + if (flags != ACQUIRE_TERMINAL_WAIT) /* If we are in TRY or FORCE mode, then propagate EPERM as EPERM */ + return r; + + assert(notify >= 0); + assert(wd >= 0); + + for (;;) { + union inotify_event_buffer buffer; + ssize_t l; + + if (timeout != USEC_INFINITY) { + usec_t n; + + assert(ts != USEC_INFINITY); + + n = usec_sub_unsigned(now(CLOCK_MONOTONIC), ts); + if (n >= timeout) + return -ETIMEDOUT; + + r = fd_wait_for_event(notify, POLLIN, usec_sub_unsigned(timeout, n)); + if (r < 0) + return r; + if (r == 0) + return -ETIMEDOUT; + } + + l = read(notify, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + continue; + + return -errno; + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + if (e->mask & IN_Q_OVERFLOW) /* If we hit an inotify queue overflow, simply check if the terminal is up for grabs now. */ + break; + + if (e->wd != wd || !(e->mask & IN_CLOSE)) /* Safety checks */ + return -EIO; + } + + break; + } + + /* We close the tty fd here since if the old session ended our handle will be dead. It's important that + * we do this after sleeping, so that we don't enter an endless loop. 
*/ + fd = safe_close(fd); + } + + return TAKE_FD(fd); +} + +int release_terminal(void) { + static const struct sigaction sa_new = { + .sa_handler = SIG_IGN, + .sa_flags = SA_RESTART, + }; + + _cleanup_close_ int fd = -EBADF; + struct sigaction sa_old; + int r; + + fd = open("/dev/tty", O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return -errno; + + /* Temporarily ignore SIGHUP, so that we don't get SIGHUP'ed + * by our own TIOCNOTTY */ + assert_se(sigaction(SIGHUP, &sa_new, &sa_old) == 0); + + r = RET_NERRNO(ioctl(fd, TIOCNOTTY)); + + assert_se(sigaction(SIGHUP, &sa_old, NULL) == 0); + + return r; +} + +int terminal_vhangup_fd(int fd) { + assert(fd >= 0); + return RET_NERRNO(ioctl(fd, TIOCVHANGUP)); +} + +int terminal_vhangup(const char *name) { + _cleanup_close_ int fd = -EBADF; + + fd = open_terminal(name, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return fd; + + return terminal_vhangup_fd(fd); +} + +int vt_disallocate(const char *name) { + const char *e; + int r; + + /* Deallocate the VT if possible. If not possible + * (i.e. because it is the active one), at least clear it + * entirely (including the scrollback buffer). */ + + e = path_startswith(name, "/dev/"); + if (!e) + return -EINVAL; + + if (tty_is_vc(name)) { + _cleanup_close_ int fd = -EBADF; + unsigned u; + const char *n; + + n = startswith(e, "tty"); + if (!n) + return -EINVAL; + + r = safe_atou(n, &u); + if (r < 0) + return r; + + if (u <= 0) + return -EINVAL; + + /* Try to deallocate */ + fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return fd; + + r = ioctl(fd, VT_DISALLOCATE, u); + if (r >= 0) + return 0; + if (errno != EBUSY) + return -errno; + } + + /* So this is not a VT (in which case we cannot deallocate it), + * or we failed to deallocate. Let's at least clear the screen. 
*/ + + _cleanup_close_ int fd2 = open_terminal(name, O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd2 < 0) + return fd2; + + (void) loop_write(fd2, + "\033[r" /* clear scrolling region */ + "\033[H" /* move home */ + "\033[3J", /* clear screen including scrollback, requires Linux 2.6.40 */ + 10); + return 0; +} + +int make_console_stdio(void) { + int fd, r; + + /* Make /dev/console the controlling terminal and stdin/stdout/stderr, if we can. If we can't use + * /dev/null instead. This is particularly useful if /dev/console is turned off, e.g. if console=null + * is specified on the kernel command line. */ + + fd = acquire_terminal("/dev/console", ACQUIRE_TERMINAL_FORCE|ACQUIRE_TERMINAL_PERMISSIVE, USEC_INFINITY); + if (fd < 0) { + log_warning_errno(fd, "Failed to acquire terminal, using /dev/null stdin/stdout/stderr instead: %m"); + + r = make_null_stdio(); + if (r < 0) + return log_error_errno(r, "Failed to make /dev/null stdin/stdout/stderr: %m"); + + } else { + unsigned rows, cols; + + r = reset_terminal_fd(fd, /* switch_to_text= */ true); + if (r < 0) + log_warning_errno(r, "Failed to reset terminal, ignoring: %m"); + + r = proc_cmdline_tty_size("/dev/console", &rows, &cols); + if (r < 0) + log_warning_errno(r, "Failed to get terminal size, ignoring: %m"); + else { + r = terminal_set_size_fd(fd, NULL, rows, cols); + if (r < 0) + log_warning_errno(r, "Failed to set terminal size, ignoring: %m"); + } + + r = rearrange_stdio(fd, fd, fd); /* This invalidates 'fd' both on success and on failure. 
*/ + if (r < 0) + return log_error_errno(r, "Failed to make terminal stdin/stdout/stderr: %m"); + } + + reset_terminal_feature_caches(); + return 0; +} + +bool tty_is_vc(const char *tty) { + assert(tty); + + return vtnr_from_tty(tty) >= 0; +} + +bool tty_is_console(const char *tty) { + assert(tty); + + return streq(skip_dev_prefix(tty), "console"); +} + +int vtnr_from_tty(const char *tty) { + int i, r; + + assert(tty); + + tty = skip_dev_prefix(tty); + + if (!startswith(tty, "tty") ) + return -EINVAL; + + if (!ascii_isdigit(tty[3])) + return -EINVAL; + + r = safe_atoi(tty+3, &i); + if (r < 0) + return r; + + if (i < 0 || i > 63) + return -EINVAL; + + return i; +} + + int resolve_dev_console(char **ret) { + _cleanup_free_ char *active = NULL; + char *tty; + int r; + + assert(ret); + + /* Resolve where /dev/console is pointing to, if /sys is actually ours (i.e. not read-only-mounted which is a + * sign for container setups) */ + + if (path_is_read_only_fs("/sys") > 0) + return -ENOMEDIUM; + + r = read_one_line_file("/sys/class/tty/console/active", &active); + if (r < 0) + return r; + + /* If multiple log outputs are configured the last one is what /dev/console points to */ + tty = strrchr(active, ' '); + if (tty) + tty++; + else + tty = active; + + if (streq(tty, "tty0")) { + active = mfree(active); + + /* Get the active VC (e.g. tty1) */ + r = read_one_line_file("/sys/class/tty/tty0/active", &active); + if (r < 0) + return r; + + tty = active; + } + + if (tty == active) + *ret = TAKE_PTR(active); + else { + char *tmp; + + tmp = strdup(tty); + if (!tmp) + return -ENOMEM; + + *ret = tmp; + } + + return 0; +} + +int get_kernel_consoles(char ***ret) { + _cleanup_strv_free_ char **l = NULL; + _cleanup_free_ char *line = NULL; + const char *p; + int r; + + assert(ret); + + /* If /sys is mounted read-only this means we are running in some kind of container environment. In that + * case /sys would reflect the host system, not us, hence ignore the data we can read from it. 
*/ + if (path_is_read_only_fs("/sys") > 0) + goto fallback; + + r = read_one_line_file("/sys/class/tty/console/active", &line); + if (r < 0) + return r; + + p = line; + for (;;) { + _cleanup_free_ char *tty = NULL, *path = NULL; + + r = extract_first_word(&p, &tty, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + if (streq(tty, "tty0")) { + tty = mfree(tty); + r = read_one_line_file("/sys/class/tty/tty0/active", &tty); + if (r < 0) + return r; + } + + path = path_join("/dev", tty); + if (!path) + return -ENOMEM; + + if (access(path, F_OK) < 0) { + log_debug_errno(errno, "Console device %s is not accessible, skipping: %m", path); + continue; + } + + r = strv_consume(&l, TAKE_PTR(path)); + if (r < 0) + return r; + } + + if (strv_isempty(l)) { + log_debug("No devices found for system console"); + goto fallback; + } + + *ret = TAKE_PTR(l); + + return 0; + +fallback: + r = strv_extend(&l, "/dev/console"); + if (r < 0) + return r; + + *ret = TAKE_PTR(l); + + return 0; +} + +bool tty_is_vc_resolve(const char *tty) { + _cleanup_free_ char *resolved = NULL; + + assert(tty); + + tty = skip_dev_prefix(tty); + + if (streq(tty, "console")) { + if (resolve_dev_console(&resolved) < 0) + return false; + + tty = resolved; + } + + return tty_is_vc(tty); +} + +const char *default_term_for_tty(const char *tty) { + return tty && tty_is_vc_resolve(tty) ? 
"linux" : "vt220"; +} + +int fd_columns(int fd) { + struct winsize ws = {}; + + if (fd < 0) + return -EBADF; + + if (ioctl(fd, TIOCGWINSZ, &ws) < 0) + return -errno; + + if (ws.ws_col <= 0) + return -EIO; + + return ws.ws_col; +} + +unsigned columns(void) { + const char *e; + int c; + + if (cached_columns > 0) + return cached_columns; + + c = 0; + e = getenv("COLUMNS"); + if (e) + (void) safe_atoi(e, &c); + + if (c <= 0 || c > USHRT_MAX) { + c = fd_columns(STDOUT_FILENO); + if (c <= 0) + c = 80; + } + + cached_columns = c; + return cached_columns; +} + +int fd_lines(int fd) { + struct winsize ws = {}; + + if (fd < 0) + return -EBADF; + + if (ioctl(fd, TIOCGWINSZ, &ws) < 0) + return -errno; + + if (ws.ws_row <= 0) + return -EIO; + + return ws.ws_row; +} + +unsigned lines(void) { + const char *e; + int l; + + if (cached_lines > 0) + return cached_lines; + + l = 0; + e = getenv("LINES"); + if (e) + (void) safe_atoi(e, &l); + + if (l <= 0 || l > USHRT_MAX) { + l = fd_lines(STDOUT_FILENO); + if (l <= 0) + l = 24; + } + + cached_lines = l; + return cached_lines; +} + +int terminal_set_size_fd(int fd, const char *ident, unsigned rows, unsigned cols) { + struct winsize ws; + + if (rows == UINT_MAX && cols == UINT_MAX) + return 0; + + if (ioctl(fd, TIOCGWINSZ, &ws) < 0) + return log_debug_errno(errno, + "TIOCGWINSZ ioctl for getting %s size failed, not setting terminal size: %m", + ident ?: "TTY"); + + if (rows == UINT_MAX) + rows = ws.ws_row; + else if (rows > USHRT_MAX) + rows = USHRT_MAX; + + if (cols == UINT_MAX) + cols = ws.ws_col; + else if (cols > USHRT_MAX) + cols = USHRT_MAX; + + if (rows == ws.ws_row && cols == ws.ws_col) + return 0; + + ws.ws_row = rows; + ws.ws_col = cols; + + if (ioctl(fd, TIOCSWINSZ, &ws) < 0) + return log_debug_errno(errno, "TIOCSWINSZ ioctl for setting %s size failed: %m", ident ?: "TTY"); + + return 0; +} + +int proc_cmdline_tty_size(const char *tty, unsigned *ret_rows, unsigned *ret_cols) { + _cleanup_free_ char *rowskey = NULL, *rowsvalue 
= NULL, *colskey = NULL, *colsvalue = NULL; + unsigned rows = UINT_MAX, cols = UINT_MAX; /* UINT_MAX is what gets returned for a dimension that is not set on the kernel command line */ + int r; + + assert(tty); + + if (!ret_rows && !ret_cols) + return 0; + + tty = skip_dev_prefix(tty); + if (!in_charset(tty, ALPHANUMERICAL)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "%s contains non-alphanumeric characters", tty); + + rowskey = strjoin("systemd.tty.rows.", tty); + if (!rowskey) + return -ENOMEM; + + colskey = strjoin("systemd.tty.columns.", tty); + if (!colskey) + return -ENOMEM; + + r = proc_cmdline_get_key_many(/* flags = */ 0, + rowskey, &rowsvalue, + colskey, &colsvalue); + if (r < 0) + return log_debug_errno(r, "Failed to read TTY size of %s from kernel cmdline: %m", tty); + + if (rowsvalue) { + r = safe_atou(rowsvalue, &rows); + if (r < 0) + return log_debug_errno(r, "Failed to parse %s=%s: %m", rowskey, rowsvalue); + } + + if (colsvalue) { + r = safe_atou(colsvalue, &cols); + if (r < 0) + return log_debug_errno(r, "Failed to parse %s=%s: %m", colskey, colsvalue); + } + + if (ret_rows) + *ret_rows = rows; + if (ret_cols) + *ret_cols = cols; + + return 0; +} + +/* Intended to be used as a SIGWINCH sighandler: invalidates the cached column and line counts. The signum argument is not used. */ +void columns_lines_cache_reset(int signum) { + cached_columns = 0; + cached_lines = 0; +} + +void reset_terminal_feature_caches(void) { + cached_columns = 0; + cached_lines = 0; + + cached_color_mode = _COLOR_INVALID; + cached_underline_enabled = -1; + cached_on_tty = -1; + cached_on_dev_null = -1; +} + +bool on_tty(void) { + + /* We check both stdout and stderr, so that situations where pipes on the shell are used are reliably + * recognized, regardless if only the output or the errors are piped to some place. Since on_tty() is generally + * used to default to a safer, non-interactive, non-color mode of operation it's probably good to be defensive + * here, and check for both. Note that we don't check for STDIN_FILENO, because it should be fine to use fancy + * terminal functionality when outputting stuff, even if the input is piped to us. The result is cached in cached_on_tty (a negative value means it has not been determined yet).
*/ + + if (cached_on_tty < 0) + cached_on_tty = + isatty(STDOUT_FILENO) > 0 && + isatty(STDERR_FILENO) > 0; + + return cached_on_tty; +} + +int getttyname_malloc(int fd, char **ret) { + char path[PATH_MAX], *c; /* PATH_MAX is counted *with* the trailing NUL byte */ + int r; + + assert(fd >= 0); + assert(ret); + + r = ttyname_r(fd, path, sizeof path); /* returns 0 on success, a positive errno-style value on failure */ + assert(r >= 0); + if (r == ERANGE) + return -ENAMETOOLONG; + if (r > 0) + return -r; + + c = strdup(skip_dev_prefix(path)); + if (!c) + return -ENOMEM; + + *ret = c; + return 0; +} + +int getttyname_harder(int fd, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + r = getttyname_malloc(fd, &s); + if (r < 0) + return r; + + if (streq(s, "tty")) /* only the generic name "tty" came back: ask get_ctty() (with pid 0) for the actual device name instead */ + return get_ctty(0, NULL, ret); + + *ret = TAKE_PTR(s); + return 0; +} + +int get_ctty_devnr(pid_t pid, dev_t *d) { + int r; + _cleanup_free_ char *line = NULL; + const char *p; + unsigned long ttynr; + + assert(pid >= 0); + + p = procfs_file_alloca(pid, "stat"); + r = read_one_line_file(p, &line); + if (r < 0) + return r; + + p = strrchr(line, ')'); + if (!p) + return -EIO; + + p++; + + if (sscanf(p, " " + "%*c " /* state */ + "%*d " /* ppid */ + "%*d " /* pgrp */ + "%*d " /* session */ + "%lu ", /* ttynr */ + &ttynr) != 1) + return -EIO; + + if (devnum_is_zero(ttynr)) + return -ENXIO; + + if (d) + *d = (dev_t) ttynr; + + return 0; +} + +int get_ctty(pid_t pid, dev_t *ret_devnr, char **ret) { + char pty[STRLEN("/dev/pts/") + DECIMAL_STR_MAX(dev_t) + 1]; + _cleanup_free_ char *buf = NULL; + const char *fn = NULL, *w; + dev_t devnr; + int r; + + r = get_ctty_devnr(pid, &devnr); + if (r < 0) + return r; + + r = device_path_make_canonical(S_IFCHR, devnr, &buf); + if (r < 0) { + struct stat st; + + if (r != -ENOENT) /* No symlink for this in /dev/char/? */ + return r; + + /* Maybe this is PTY? PTY devices are not listed in /dev/char/, as they don't follow the + * Linux device model and hence device_path_make_canonical() doesn't work for them.
Let's + * assume this is a PTY for a moment, and check if the device node this would then map to in + * /dev/pts/ matches the one we are looking for. This way we don't have to hardcode the major + * number (which is 136 btw), but we still rely on the fact that PTY numbers map directly to + * the minor number of the pty. */ + xsprintf(pty, "/dev/pts/%u", minor(devnr)); + + if (stat(pty, &st) < 0) { + if (errno != ENOENT) + return -errno; + + } else if (S_ISCHR(st.st_mode) && devnr == st.st_rdev) /* Bingo! */ + fn = pty; + + if (!fn) { + /* Doesn't exist, or not a PTY? Probably something similar to the PTYs which have no + * symlink in /dev/char/. Let's return something vaguely useful. */ + r = device_path_make_major_minor(S_IFCHR, devnr, &buf); + if (r < 0) + return r; + + fn = buf; + } + } else + fn = buf; + + w = path_startswith(fn, "/dev/"); + if (!w) + return -EINVAL; + + if (ret) { + _cleanup_free_ char *b = NULL; + + b = strdup(w); + if (!b) + return -ENOMEM; + + *ret = TAKE_PTR(b); + } + + if (ret_devnr) + *ret_devnr = devnr; + + return 0; +} + +int ptsname_malloc(int fd, char **ret) { + size_t l = 100; + + assert(fd >= 0); + assert(ret); + + for (;;) { + char *c; + + c = new(char, l); + if (!c) + return -ENOMEM; + + if (ptsname_r(fd, c, l) == 0) { + *ret = c; + return 0; + } + if (errno != ERANGE) { + free(c); + return -errno; + } + + free(c); + + if (l > SIZE_MAX / 2) + return -ENOMEM; + + l *= 2; + } +} + +int openpt_allocate(int flags, char **ret_slave) { + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + fd = posix_openpt(flags|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return -errno; + + if (ret_slave) { + r = ptsname_malloc(fd, &p); + if (r < 0) + return r; + + if (!path_startswith(p, "/dev/pts/")) + return -EINVAL; + } + + if (unlockpt(fd) < 0) + return -errno; + + if (ret_slave) + *ret_slave = TAKE_PTR(p); + + return TAKE_FD(fd); +} + +static int ptsname_namespace(int pty, char **ret) { + int no = -1, r; + + /* Like ptsname(), 
but doesn't assume that the path is + * accessible in the local namespace. */ + + r = ioctl(pty, TIOCGPTN, &no); + if (r < 0) + return -errno; + + if (no < 0) + return -EIO; + + if (asprintf(ret, "/dev/pts/%i", no) < 0) + return -ENOMEM; + + return 0; +} + +int openpt_allocate_in_namespace(pid_t pid, int flags, char **ret_slave) { + _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF, fd = -EBADF; + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + pid_t child; + int r; + + assert(pid > 0); + + r = namespace_open(pid, &pidnsfd, &mntnsfd, NULL, &usernsfd, &rootfd); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0) + return -errno; + + r = namespace_fork("(sd-openptns)", "(sd-openpt)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + pidnsfd, mntnsfd, -1, usernsfd, rootfd, &child); + if (r < 0) + return r; + if (r == 0) { + pair[0] = safe_close(pair[0]); + + fd = openpt_allocate(flags, NULL); + if (fd < 0) + _exit(EXIT_FAILURE); + + if (send_one_fd(pair[1], fd, 0) < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + r = wait_for_terminate_and_check("(sd-openptns)", child, 0); + if (r < 0) + return r; + if (r != EXIT_SUCCESS) + return -EIO; + + fd = receive_one_fd(pair[0], 0); + if (fd < 0) + return fd; + + if (ret_slave) { + r = ptsname_namespace(fd, ret_slave); + if (r < 0) + return r; + } + + return TAKE_FD(fd); +} + +int open_terminal_in_namespace(pid_t pid, const char *name, int mode) { + _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF; + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + pid_t child; + int r; + + r = namespace_open(pid, &pidnsfd, &mntnsfd, NULL, &usernsfd, &rootfd); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0) + return -errno; + + r = namespace_fork("(sd-terminalns)", "(sd-terminal)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + pidnsfd, mntnsfd, -1, 
usernsfd, rootfd, &child); + if (r < 0) + return r; + if (r == 0) { + int master; + + pair[0] = safe_close(pair[0]); + + master = open_terminal(name, mode|O_NOCTTY|O_CLOEXEC); + if (master < 0) + _exit(EXIT_FAILURE); + + if (send_one_fd(pair[1], master, 0) < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + r = wait_for_terminate_and_check("(sd-terminalns)", child, 0); + if (r < 0) + return r; + if (r != EXIT_SUCCESS) + return -EIO; + + return receive_one_fd(pair[0], 0); +} + +static bool on_dev_null(void) { + struct stat dst, ost, est; + + if (cached_on_dev_null >= 0) + return cached_on_dev_null; + + if (stat("/dev/null", &dst) < 0 || fstat(STDOUT_FILENO, &ost) < 0 || fstat(STDERR_FILENO, &est) < 0) + cached_on_dev_null = false; + else + cached_on_dev_null = stat_inode_same(&dst, &ost) && stat_inode_same(&dst, &est); + + return cached_on_dev_null; +} + +static bool getenv_terminal_is_dumb(void) { + const char *e; + + e = getenv("TERM"); + if (!e) + return true; + + return streq(e, "dumb"); +} + +bool terminal_is_dumb(void) { + if (!on_tty() && !on_dev_null()) + return true; + + return getenv_terminal_is_dumb(); +} + +static ColorMode parse_systemd_colors(void) { + const char *e; + int r; + + e = getenv("SYSTEMD_COLORS"); + if (!e) + return _COLOR_INVALID; + if (streq(e, "16")) + return COLOR_16; + if (streq(e, "256")) + return COLOR_256; + r = parse_boolean(e); + if (r >= 0) + return r > 0 ? COLOR_ON : COLOR_OFF; + return _COLOR_INVALID; +} + +ColorMode get_color_mode(void) { + + /* Returns the mode used to choose output colors. The possible modes are COLOR_OFF for no colors, + * COLOR_16 for only the base 16 ANSI colors, COLOR_256 for more colors and COLOR_ON for unrestricted + * color output. For that we check $SYSTEMD_COLORS first (which is the explicit way to + * change the mode). If that didn't work we turn colors off unless we are on a TTY. And if we are on a TTY + * we turn it off if $TERM is set to "dumb". 
There's one special tweak though: if we are PID 1 then we do not + * check whether we are connected to a TTY, because we don't keep /dev/console open continuously due to fear + * of SAK, and hence things are a bit weird. */ + ColorMode m; + + if (cached_color_mode < 0) { + m = parse_systemd_colors(); + if (m >= 0) + cached_color_mode = m; + else if (getenv("NO_COLOR")) + /* We only check for the presence of the variable; value is ignored. */ + cached_color_mode = COLOR_OFF; + + else if (getpid_cached() == 1) { + /* PID1 outputs to the console without holding it open all the time. + * + * Note that the Linux console can only display 16 colors. We still enable 256 color + * mode even for PID1 output though (which typically goes to the Linux console), + * since the Linux console is able to parse the 256 color sequences and automatically + * map them to the closest color in the 16 color palette (since kernel 3.16). Doing + * 256 colors is nice for people who invoke systemd in a container or via a serial + * link or such, and use a true 256 color terminal to do so. */ + if (getenv_terminal_is_dumb()) + cached_color_mode = COLOR_OFF; + } else { + if (terminal_is_dumb()) + cached_color_mode = COLOR_OFF; + } + + if (cached_color_mode < 0) { + /* We failed to figure out any reason to *disable* colors. + * Let's see how many colors we shall use. */ + if (STRPTR_IN_SET(getenv("COLORTERM"), + "truecolor", + "24bit")) + cached_color_mode = COLOR_24BIT; + else + cached_color_mode = COLOR_256; + } + } + + return cached_color_mode; +} + +bool dev_console_colors_enabled(void) { + _cleanup_free_ char *s = NULL; + ColorMode m; + + /* Returns true if we assume that color is supported on /dev/console. + * + * For that we first check if we explicitly got told to use colors or not, by checking $SYSTEMD_COLORS. If that + * isn't set we check whether PID 1 has $TERM set, and if not, whether TERM is set on the kernel command + * line. 
If we find $TERM set we assume color if it's not set to "dumb", similarly to how regular + * colors_enabled() operates. */ + + m = parse_systemd_colors(); + if (m >= 0) + return m; + + if (getenv("NO_COLOR")) + return false; + + if (getenv_for_pid(1, "TERM", &s) <= 0) + (void) proc_cmdline_get_key("TERM", 0, &s); + + return !streq_ptr(s, "dumb"); +} + +bool underline_enabled(void) { + + if (cached_underline_enabled < 0) { + + /* The Linux console doesn't support underlining, turn it off, but only there. */ + + if (colors_enabled()) + cached_underline_enabled = !streq_ptr(getenv("TERM"), "linux"); + else + cached_underline_enabled = false; + } + + return cached_underline_enabled; +} + +int vt_default_utf8(void) { + _cleanup_free_ char *b = NULL; + int r; + + /* Read the default VT UTF8 setting from the kernel */ + + r = read_one_line_file("/sys/module/vt/parameters/default_utf8", &b); + if (r < 0) + return r; + + return parse_boolean(b); +} + +int vt_reset_keyboard(int fd) { + int kb; + + /* If we can't read the default, then default to unicode. It's 2017 after all. */ + kb = vt_default_utf8() != 0 ? 
K_UNICODE : K_XLATE; + + return RET_NERRNO(ioctl(fd, KDSKBMODE, kb)); +} + +int vt_restore(int fd) { + static const struct vt_mode mode = { + .mode = VT_AUTO, + }; + int r, q = 0; + + if (isatty(fd) < 1) + return log_debug_errno(errno, "Asked to restore the VT for an fd that does not refer to a terminal: %m"); + + if (ioctl(fd, KDSETMODE, KD_TEXT) < 0) + q = log_debug_errno(errno, "Failed to set VT in text mode, ignoring: %m"); + + r = vt_reset_keyboard(fd); + if (r < 0) { + log_debug_errno(r, "Failed to reset keyboard mode, ignoring: %m"); + if (q >= 0) + q = r; + } + + if (ioctl(fd, VT_SETMODE, &mode) < 0) { + log_debug_errno(errno, "Failed to set VT_AUTO mode, ignoring: %m"); + if (q >= 0) + q = -errno; + } + + r = fchmod_and_chown(fd, TTY_MODE, 0, GID_INVALID); + if (r < 0) { + log_debug_errno(r, "Failed to chmod()/chown() VT, ignoring: %m"); + if (q >= 0) + q = r; + } + + return q; +} + +int vt_release(int fd, bool restore) { + assert(fd >= 0); + + /* This function releases the VT by acknowledging the VT-switch signal + * sent by the kernel and optionally reset the VT in text and auto + * VT-switching modes. */ + + if (isatty(fd) < 1) + return log_debug_errno(errno, "Asked to release the VT for an fd that does not refer to a terminal: %m"); + + if (ioctl(fd, VT_RELDISP, 1) < 0) + return -errno; + + if (restore) + return vt_restore(fd); + + return 0; +} + +void get_log_colors(int priority, const char **on, const char **off, const char **highlight) { + /* Note that this will initialize output variables only when there's something to output. + * The caller must pre-initialize to "" or NULL as appropriate. 
*/ + + if (priority <= LOG_ERR) { + if (on) + *on = ansi_highlight_red(); + if (off) + *off = ansi_normal(); + if (highlight) + *highlight = ansi_highlight(); + + } else if (priority <= LOG_WARNING) { + if (on) + *on = ansi_highlight_yellow(); + if (off) + *off = ansi_normal(); + if (highlight) + *highlight = ansi_highlight(); + + } else if (priority <= LOG_NOTICE) { + if (on) + *on = ansi_highlight(); + if (off) + *off = ansi_normal(); + if (highlight) + *highlight = ansi_highlight_red(); + + } else if (priority >= LOG_DEBUG) { + if (on) + *on = ansi_grey(); + if (off) + *off = ansi_normal(); + if (highlight) + *highlight = ansi_highlight_red(); + } +} + +int set_terminal_cursor_position(int fd, unsigned int row, unsigned int column) { + int r; + char cursor_position[STRLEN("\x1B[") + DECIMAL_STR_MAX(int) * 2 + STRLEN(";H") + 1]; + + assert(fd >= 0); + + xsprintf(cursor_position, "\x1B[%u;%uH", row, column); + + r = loop_write(fd, cursor_position, SIZE_MAX); + if (r < 0) + return log_warning_errno(r, "Failed to set cursor position, ignoring: %m"); + + return 0; +} diff --git a/src/basic/terminal-util.h b/src/basic/terminal-util.h new file mode 100644 index 0000000..2a7d48b --- /dev/null +++ b/src/basic/terminal-util.h @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "macro.h" +#include "time-util.h" + +/* Regular colors */ +#define ANSI_BLACK "\x1B[0;30m" /* Some type of grey usually. */ +#define ANSI_RED "\x1B[0;31m" +#define ANSI_GREEN "\x1B[0;32m" +#define ANSI_YELLOW "\x1B[0;33m" +#define ANSI_BLUE "\x1B[0;34m" +#define ANSI_MAGENTA "\x1B[0;35m" +#define ANSI_CYAN "\x1B[0;36m" +#define ANSI_WHITE "\x1B[0;37m" /* This is actually rendered as light grey, legible even on a white + * background. See ANSI_HIGHLIGHT_WHITE for real white. 
*/ + +#define ANSI_BRIGHT_BLACK "\x1B[0;90m" +#define ANSI_BRIGHT_RED "\x1B[0;91m" +#define ANSI_BRIGHT_GREEN "\x1B[0;92m" +#define ANSI_BRIGHT_YELLOW "\x1B[0;93m" +#define ANSI_BRIGHT_BLUE "\x1B[0;94m" +#define ANSI_BRIGHT_MAGENTA "\x1B[0;95m" +#define ANSI_BRIGHT_CYAN "\x1B[0;96m" +#define ANSI_BRIGHT_WHITE "\x1B[0;97m" + +#define ANSI_GREY "\x1B[0;38;5;245m" + +/* Bold/highlighted */ +#define ANSI_HIGHLIGHT_BLACK "\x1B[0;1;30m" +#define ANSI_HIGHLIGHT_RED "\x1B[0;1;31m" +#define ANSI_HIGHLIGHT_GREEN "\x1B[0;1;32m" +#define _ANSI_HIGHLIGHT_YELLOW "\x1B[0;1;33m" /* This yellow is currently not displayed well by some terminals */ +#define ANSI_HIGHLIGHT_BLUE "\x1B[0;1;34m" +#define ANSI_HIGHLIGHT_MAGENTA "\x1B[0;1;35m" +#define ANSI_HIGHLIGHT_CYAN "\x1B[0;1;36m" +#define ANSI_HIGHLIGHT_WHITE "\x1B[0;1;37m" +#define ANSI_HIGHLIGHT_YELLOW4 "\x1B[0;1;38;5;100m" +#define ANSI_HIGHLIGHT_KHAKI3 "\x1B[0;1;38;5;185m" +#define ANSI_HIGHLIGHT_GREY "\x1B[0;1;38;5;245m" + +#define ANSI_HIGHLIGHT_YELLOW ANSI_HIGHLIGHT_KHAKI3 /* Replacement yellow that is more legible */ + +/* Underlined */ +#define ANSI_GREY_UNDERLINE "\x1B[0;4;38;5;245m" +#define ANSI_BRIGHT_BLACK_UNDERLINE "\x1B[0;4;90m" +#define ANSI_HIGHLIGHT_RED_UNDERLINE "\x1B[0;1;4;31m" +#define ANSI_HIGHLIGHT_GREEN_UNDERLINE "\x1B[0;1;4;32m" +#define ANSI_HIGHLIGHT_YELLOW_UNDERLINE "\x1B[0;1;4;38;5;185m" +#define ANSI_HIGHLIGHT_BLUE_UNDERLINE "\x1B[0;1;4;34m" +#define ANSI_HIGHLIGHT_MAGENTA_UNDERLINE "\x1B[0;1;4;35m" +#define ANSI_HIGHLIGHT_GREY_UNDERLINE "\x1B[0;1;4;38;5;245m" + +/* Other ANSI codes */ +#define ANSI_UNDERLINE "\x1B[0;4m" +#define ANSI_HIGHLIGHT "\x1B[0;1;39m" +#define ANSI_HIGHLIGHT_UNDERLINE "\x1B[0;1;4m" + +/* Fallback colors: 256 -> 16 */ +#define ANSI_HIGHLIGHT_GREY_FALLBACK "\x1B[0;1;90m" +#define ANSI_HIGHLIGHT_GREY_FALLBACK_UNDERLINE "\x1B[0;1;4;90m" +#define ANSI_HIGHLIGHT_YELLOW_FALLBACK "\x1B[0;1;33m" +#define ANSI_HIGHLIGHT_YELLOW_FALLBACK_UNDERLINE "\x1B[0;1;4;33m" + +/* Background colors 
*/ +#define ANSI_BACKGROUND_BLUE "\x1B[44m" + +/* Reset/clear ANSI styles */ +#define ANSI_NORMAL "\x1B[0m" + +/* Erase characters until the end of the line */ +#define ANSI_ERASE_TO_END_OF_LINE "\x1B[K" + +/* Move cursor up one line */ +#define ANSI_REVERSE_LINEFEED "\x1BM" + +/* Set cursor to top left corner and clear screen */ +#define ANSI_HOME_CLEAR "\x1B[H\x1B[2J" + +int reset_terminal_fd(int fd, bool switch_to_text); +int reset_terminal(const char *name); +int set_terminal_cursor_position(int fd, unsigned int row, unsigned int column); + +int open_terminal(const char *name, int mode); + +/* Flags for tweaking the way we become the controlling process of a terminal. */ +typedef enum AcquireTerminalFlags { + /* Try to become the controlling process of the TTY. If we can't return -EPERM. */ + ACQUIRE_TERMINAL_TRY = 0, + + /* Tell the kernel to forcibly make us the controlling process of the TTY. Returns -EPERM if the kernel doesn't allow that. */ + ACQUIRE_TERMINAL_FORCE = 1, + + /* If we can't become the controlling process of the TTY right-away, then wait until we can. */ + ACQUIRE_TERMINAL_WAIT = 2, + + /* Pick one of the above, and then OR this flag in, in order to request permissive behaviour, if we can't become controlling process then don't mind */ + ACQUIRE_TERMINAL_PERMISSIVE = 1 << 2, +} AcquireTerminalFlags; + +/* Limits the use of ANSI colors to a subset. */ +typedef enum ColorMode { + /* No colors, monochrome output. */ + COLOR_OFF, + + /* All colors, no restrictions. */ + COLOR_ON, + + /* Only the base 16 colors. */ + COLOR_16, + + /* Only 256 colors. */ + COLOR_256, + + /* For truecolor or 24bit color support. 
*/ + COLOR_24BIT, + + _COLOR_INVALID = -EINVAL, +} ColorMode; + +int acquire_terminal(const char *name, AcquireTerminalFlags flags, usec_t timeout); +int release_terminal(void); + +int terminal_vhangup_fd(int fd); +int terminal_vhangup(const char *name); + +int terminal_set_size_fd(int fd, const char *ident, unsigned rows, unsigned cols); +int proc_cmdline_tty_size(const char *tty, unsigned *ret_rows, unsigned *ret_cols); + +int chvt(int vt); + +int read_one_char(FILE *f, char *ret, usec_t timeout, bool *need_nl); +int ask_char(char *ret, const char *replies, const char *text, ...) _printf_(3, 4); +int ask_string(char **ret, const char *text, ...) _printf_(2, 3); + +int vt_disallocate(const char *name); + +int resolve_dev_console(char **ret); +int get_kernel_consoles(char ***ret); +bool tty_is_vc(const char *tty); +bool tty_is_vc_resolve(const char *tty); +bool tty_is_console(const char *tty) _pure_; +int vtnr_from_tty(const char *tty); +const char *default_term_for_tty(const char *tty); + +int make_console_stdio(void); + +int fd_columns(int fd); +unsigned columns(void); +int fd_lines(int fd); +unsigned lines(void); + +void columns_lines_cache_reset(int _unused_ signum); +void reset_terminal_feature_caches(void); + +bool on_tty(void); +bool terminal_is_dumb(void); +ColorMode get_color_mode(void); +bool underline_enabled(void); +bool dev_console_colors_enabled(void); + +static inline bool colors_enabled(void) { + + /* Returns true if colors are considered supported on our stdout. */ + return get_color_mode() != COLOR_OFF; +} + +#define DEFINE_ANSI_FUNC(name, NAME) \ + static inline const char *ansi_##name(void) { \ + return colors_enabled() ? 
ANSI_##NAME : ""; \ + } + +#define DEFINE_ANSI_FUNC_256(name, NAME, FALLBACK) \ + static inline const char *ansi_##name(void) { \ + switch (get_color_mode()) { \ + case COLOR_OFF: return ""; \ + case COLOR_16: return ANSI_##FALLBACK; \ + default : return ANSI_##NAME; \ + } \ + } + +static inline const char *ansi_underline(void) { + return underline_enabled() ? ANSI_UNDERLINE : ANSI_NORMAL; +} + +#define DEFINE_ANSI_FUNC_UNDERLINE(name, NAME) \ + static inline const char *ansi_##name(void) { \ + return underline_enabled() ? ANSI_##NAME##_UNDERLINE : \ + colors_enabled() ? ANSI_##NAME : ""; \ + } + + +#define DEFINE_ANSI_FUNC_UNDERLINE_256(name, NAME, FALLBACK) \ + static inline const char *ansi_##name(void) { \ + switch (get_color_mode()) { \ + case COLOR_OFF: return ""; \ + case COLOR_16: return underline_enabled() ? ANSI_##FALLBACK##_UNDERLINE : ANSI_##FALLBACK; \ + default : return underline_enabled() ? ANSI_##NAME##_UNDERLINE: ANSI_##NAME; \ + } \ + } + +DEFINE_ANSI_FUNC(normal, NORMAL); +DEFINE_ANSI_FUNC(highlight, HIGHLIGHT); +DEFINE_ANSI_FUNC(black, BLACK); +DEFINE_ANSI_FUNC(red, RED); +DEFINE_ANSI_FUNC(green, GREEN); +DEFINE_ANSI_FUNC(yellow, YELLOW); +DEFINE_ANSI_FUNC(blue, BLUE); +DEFINE_ANSI_FUNC(magenta, MAGENTA); +DEFINE_ANSI_FUNC(cyan, CYAN); +DEFINE_ANSI_FUNC(white, WHITE); +DEFINE_ANSI_FUNC_256(grey, GREY, BRIGHT_BLACK); + +DEFINE_ANSI_FUNC(bright_black, BRIGHT_BLACK); +DEFINE_ANSI_FUNC(bright_red, BRIGHT_RED); +DEFINE_ANSI_FUNC(bright_green, BRIGHT_GREEN); +DEFINE_ANSI_FUNC(bright_yellow, BRIGHT_YELLOW); +DEFINE_ANSI_FUNC(bright_blue, BRIGHT_BLUE); +DEFINE_ANSI_FUNC(bright_magenta, BRIGHT_MAGENTA); +DEFINE_ANSI_FUNC(bright_cyan, BRIGHT_CYAN); +DEFINE_ANSI_FUNC(bright_white, BRIGHT_WHITE); + +DEFINE_ANSI_FUNC(highlight_black, HIGHLIGHT_BLACK); +DEFINE_ANSI_FUNC(highlight_red, HIGHLIGHT_RED); +DEFINE_ANSI_FUNC(highlight_green, HIGHLIGHT_GREEN); +DEFINE_ANSI_FUNC_256(highlight_yellow, HIGHLIGHT_YELLOW, HIGHLIGHT_YELLOW_FALLBACK); 
+DEFINE_ANSI_FUNC_256(highlight_yellow4, HIGHLIGHT_YELLOW4, HIGHLIGHT_YELLOW_FALLBACK); +DEFINE_ANSI_FUNC(highlight_blue, HIGHLIGHT_BLUE); +DEFINE_ANSI_FUNC(highlight_magenta, HIGHLIGHT_MAGENTA); +DEFINE_ANSI_FUNC(highlight_cyan, HIGHLIGHT_CYAN); +DEFINE_ANSI_FUNC_256(highlight_grey, HIGHLIGHT_GREY, HIGHLIGHT_GREY_FALLBACK); +DEFINE_ANSI_FUNC(highlight_white, HIGHLIGHT_WHITE); + +static inline const char* _ansi_highlight_yellow(void) { + return colors_enabled() ? _ANSI_HIGHLIGHT_YELLOW : ""; +} + +DEFINE_ANSI_FUNC_UNDERLINE(highlight_underline, HIGHLIGHT); +DEFINE_ANSI_FUNC_UNDERLINE_256(grey_underline, GREY, BRIGHT_BLACK); +DEFINE_ANSI_FUNC_UNDERLINE(highlight_red_underline, HIGHLIGHT_RED); +DEFINE_ANSI_FUNC_UNDERLINE(highlight_green_underline, HIGHLIGHT_GREEN); +DEFINE_ANSI_FUNC_UNDERLINE_256(highlight_yellow_underline, HIGHLIGHT_YELLOW, HIGHLIGHT_YELLOW_FALLBACK); +DEFINE_ANSI_FUNC_UNDERLINE(highlight_blue_underline, HIGHLIGHT_BLUE); +DEFINE_ANSI_FUNC_UNDERLINE(highlight_magenta_underline, HIGHLIGHT_MAGENTA); +DEFINE_ANSI_FUNC_UNDERLINE_256(highlight_grey_underline, HIGHLIGHT_GREY, HIGHLIGHT_GREY_FALLBACK); + +int get_ctty_devnr(pid_t pid, dev_t *d); +int get_ctty(pid_t, dev_t *_devnr, char **r); + +int getttyname_malloc(int fd, char **r); +int getttyname_harder(int fd, char **r); + +int ptsname_malloc(int fd, char **ret); + +int openpt_allocate(int flags, char **ret_slave); +int openpt_allocate_in_namespace(pid_t pid, int flags, char **ret_slave); +int open_terminal_in_namespace(pid_t pid, const char *name, int mode); + +int vt_default_utf8(void); +int vt_reset_keyboard(int fd); +int vt_restore(int fd); +int vt_release(int fd, bool restore_vt); + +void get_log_colors(int priority, const char **on, const char **off, const char **highlight); + +static inline const char* ansi_highlight_green_red(bool b) { + return b ? 
ansi_highlight_green() : ansi_highlight_red(); +} + +/* This assumes there is a 'tty' group */ +#define TTY_MODE 0620 diff --git a/src/basic/time-util.c b/src/basic/time-util.c new file mode 100644 index 0000000..f9014dc --- /dev/null +++ b/src/basic/time-util.c @@ -0,0 +1,1773 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "log.h" +#include "macro.h" +#include "missing_threads.h" +#include "missing_timerfd.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +static clockid_t map_clock_id(clockid_t c) { + + /* Some more exotic archs (s390, ppc, …) lack the "ALARM" flavour of the clocks. Thus, + * clock_gettime() will fail for them. Since they are essentially the same as their non-ALARM + * counterparts (their only difference is when timers are set on them), let's just map them + * accordingly. This way, we can get the correct time even on those archs. 
*/ + + switch (c) { + + case CLOCK_BOOTTIME_ALARM: + return CLOCK_BOOTTIME; + + case CLOCK_REALTIME_ALARM: + return CLOCK_REALTIME; + + default: + return c; + } +} + +usec_t now(clockid_t clock_id) { /* Returns the current time of the given clock in µs. The clock ID is first passed through map_clock_id(), and clock_gettime() is required to succeed (assert_se). */ + struct timespec ts; + + assert_se(clock_gettime(map_clock_id(clock_id), &ts) == 0); + + return timespec_load(&ts); +} + +nsec_t now_nsec(clockid_t clock_id) { /* Same as now(), but converted with timespec_load_nsec(), i.e. in ns. */ + struct timespec ts; + + assert_se(clock_gettime(map_clock_id(clock_id), &ts) == 0); + + return timespec_load_nsec(&ts); +} + +dual_timestamp* dual_timestamp_now(dual_timestamp *ts) { /* Fills in the realtime and monotonic fields from the current clocks (sampled one after the other) and returns ts. */ + assert(ts); + + ts->realtime = now(CLOCK_REALTIME); + ts->monotonic = now(CLOCK_MONOTONIC); + + return ts; +} + +triple_timestamp* triple_timestamp_now(triple_timestamp *ts) { /* Like dual_timestamp_now(), additionally sampling CLOCK_BOOTTIME into the boottime field. */ + assert(ts); + + ts->realtime = now(CLOCK_REALTIME); + ts->monotonic = now(CLOCK_MONOTONIC); + ts->boottime = now(CLOCK_BOOTTIME); + + return ts; +} + +static usec_t map_clock_usec_internal(usec_t from, usec_t from_base, usec_t to_base) { + + /* Maps the time 'from' between two clocks, based on a common reference point where the first clock + * is at 'from_base' and the second clock at 'to_base'. Basically calculates: + * + * from - from_base + to_base + * + * But takes care of overflows/underflows and avoids signed operations. */ + + if (from >= from_base) { /* In the future */ + usec_t delta = from - from_base; + + if (to_base >= USEC_INFINITY - delta) /* overflow? */ + return USEC_INFINITY; + + return to_base + delta; + + } else { /* In the past */ + usec_t delta = from_base - from; + + if (to_base <= delta) /* underflow? 
*/ + return 0; + + return to_base - delta; + } +} + +usec_t map_clock_usec(usec_t from, clockid_t from_clock, clockid_t to_clock) { + + /* Try to avoid any inaccuracy needlessly added in case we convert from effectively the same clock + * onto itself */ + if (map_clock_id(from_clock) == map_clock_id(to_clock)) + return from; + + /* Keep infinity as is */ + if (from == USEC_INFINITY) + return from; + + return map_clock_usec_internal(from, now(from_clock), now(to_clock)); +} + +dual_timestamp* dual_timestamp_from_realtime(dual_timestamp *ts, usec_t u) { + assert(ts); + + if (!timestamp_is_set(u)) { + ts->realtime = ts->monotonic = u; + return ts; + } + + ts->realtime = u; + ts->monotonic = map_clock_usec(u, CLOCK_REALTIME, CLOCK_MONOTONIC); + return ts; +} + +triple_timestamp* triple_timestamp_from_realtime(triple_timestamp *ts, usec_t u) { + usec_t nowr; + + assert(ts); + + if (!timestamp_is_set(u)) { + ts->realtime = ts->monotonic = ts->boottime = u; + return ts; + } + + nowr = now(CLOCK_REALTIME); + + ts->realtime = u; + ts->monotonic = map_clock_usec_internal(u, nowr, now(CLOCK_MONOTONIC)); + ts->boottime = map_clock_usec_internal(u, nowr, now(CLOCK_BOOTTIME)); + + return ts; +} + +triple_timestamp* triple_timestamp_from_boottime(triple_timestamp *ts, usec_t u) { + usec_t nowb; + + assert(ts); + + if (u == USEC_INFINITY) { + ts->realtime = ts->monotonic = ts->boottime = u; + return ts; + } + + nowb = now(CLOCK_BOOTTIME); + + ts->boottime = u; + ts->monotonic = map_clock_usec_internal(u, nowb, now(CLOCK_MONOTONIC)); + ts->realtime = map_clock_usec_internal(u, nowb, now(CLOCK_REALTIME)); + + return ts; +} + +dual_timestamp* dual_timestamp_from_monotonic(dual_timestamp *ts, usec_t u) { + assert(ts); + + if (u == USEC_INFINITY) { + ts->realtime = ts->monotonic = USEC_INFINITY; + return ts; + } + + ts->monotonic = u; + ts->realtime = map_clock_usec(u, CLOCK_MONOTONIC, CLOCK_REALTIME); + return ts; +} + +dual_timestamp* dual_timestamp_from_boottime(dual_timestamp *ts, 
usec_t u) { + usec_t nowm; + + assert(ts); + + if (u == USEC_INFINITY) { + ts->realtime = ts->monotonic = USEC_INFINITY; + return ts; + } + + nowm = now(CLOCK_BOOTTIME); + ts->monotonic = map_clock_usec_internal(u, nowm, now(CLOCK_MONOTONIC)); + ts->realtime = map_clock_usec_internal(u, nowm, now(CLOCK_REALTIME)); + return ts; +} + +usec_t triple_timestamp_by_clock(triple_timestamp *ts, clockid_t clock) { + assert(ts); + + switch (clock) { + + case CLOCK_REALTIME: + case CLOCK_REALTIME_ALARM: + return ts->realtime; + + case CLOCK_MONOTONIC: + return ts->monotonic; + + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + return ts->boottime; + + default: + return USEC_INFINITY; + } +} + +usec_t timespec_load(const struct timespec *ts) { + assert(ts); + + if (ts->tv_sec < 0 || ts->tv_nsec < 0) + return USEC_INFINITY; + + if ((usec_t) ts->tv_sec > (UINT64_MAX - (ts->tv_nsec / NSEC_PER_USEC)) / USEC_PER_SEC) + return USEC_INFINITY; + + return + (usec_t) ts->tv_sec * USEC_PER_SEC + + (usec_t) ts->tv_nsec / NSEC_PER_USEC; +} + +nsec_t timespec_load_nsec(const struct timespec *ts) { + assert(ts); + + if (ts->tv_sec < 0 || ts->tv_nsec < 0) + return NSEC_INFINITY; + + if ((nsec_t) ts->tv_sec >= (UINT64_MAX - ts->tv_nsec) / NSEC_PER_SEC) + return NSEC_INFINITY; + + return (nsec_t) ts->tv_sec * NSEC_PER_SEC + (nsec_t) ts->tv_nsec; +} + +struct timespec *timespec_store(struct timespec *ts, usec_t u) { + assert(ts); + + if (u == USEC_INFINITY || + u / USEC_PER_SEC >= TIME_T_MAX) { + ts->tv_sec = (time_t) -1; + ts->tv_nsec = -1L; + return ts; + } + + ts->tv_sec = (time_t) (u / USEC_PER_SEC); + ts->tv_nsec = (long) ((u % USEC_PER_SEC) * NSEC_PER_USEC); + + return ts; +} + +struct timespec *timespec_store_nsec(struct timespec *ts, nsec_t n) { + assert(ts); + + if (n == NSEC_INFINITY || + n / NSEC_PER_SEC >= TIME_T_MAX) { + ts->tv_sec = (time_t) -1; + ts->tv_nsec = -1L; + return ts; + } + + ts->tv_sec = (time_t) (n / NSEC_PER_SEC); + ts->tv_nsec = (long) (n % NSEC_PER_SEC); + + 
return ts; +} + +usec_t timeval_load(const struct timeval *tv) { + assert(tv); + + if (tv->tv_sec < 0 || tv->tv_usec < 0) + return USEC_INFINITY; + + if ((usec_t) tv->tv_sec > (UINT64_MAX - tv->tv_usec) / USEC_PER_SEC) + return USEC_INFINITY; + + return + (usec_t) tv->tv_sec * USEC_PER_SEC + + (usec_t) tv->tv_usec; +} + +struct timeval *timeval_store(struct timeval *tv, usec_t u) { + assert(tv); + + if (u == USEC_INFINITY || + u / USEC_PER_SEC > TIME_T_MAX) { + tv->tv_sec = (time_t) -1; + tv->tv_usec = (suseconds_t) -1; + } else { + tv->tv_sec = (time_t) (u / USEC_PER_SEC); + tv->tv_usec = (suseconds_t) (u % USEC_PER_SEC); + } + + return tv; +} + +char *format_timestamp_style( + char *buf, + size_t l, + usec_t t, + TimestampStyle style) { /* Formats the timestamp t (in µs) into buf, which has room for l bytes, according to style. Returns NULL, among other failures, if the timestamp is unset or the buffer is too small for the requested style. */ + + /* The weekdays in non-localized (English) form. We use this instead of the localized form, so that + * our generated timestamps may be parsed with parse_timestamp(), and always read the same. */ + static const char * const weekdays[] = { + [0] = "Sun", + [1] = "Mon", + [2] = "Tue", + [3] = "Wed", + [4] = "Thu", + [5] = "Fri", + [6] = "Sat", + }; + + struct tm tm; + bool utc, us; + time_t sec; + size_t n; + + assert(buf); + assert(style >= 0); + assert(style < _TIMESTAMP_STYLE_MAX); + + if (!timestamp_is_set(t)) + return NULL; /* Timestamp is unset */ + + if (style == TIMESTAMP_UNIX) { + if (l < (size_t) (1 + 1 + 1)) + return NULL; /* not enough space for even the shortest of forms */ + + return snprintf_ok(buf, l, "@" USEC_FMT, t / USEC_PER_SEC); /* round down μs → s */ + } + + utc = IN_SET(style, TIMESTAMP_UTC, TIMESTAMP_US_UTC, TIMESTAMP_DATE); + us = IN_SET(style, TIMESTAMP_US, TIMESTAMP_US_UTC); + + if (l < (size_t) (3 + /* week day */ + 1 + 10 + /* space and date */ + (style == TIMESTAMP_DATE ? 0 : /* date-only style carries no time or zone; the conditional is parenthesized because + and == bind tighter than ?: */ + (1 + 8 + /* space and time */ + (us ? 1 + 6 : 0) + /* "." and microsecond part */ + 1 + (utc ? 3 : 1))) + /* space and shortest possible zone */ + 1)) + return NULL; /* Not enough space even for the shortest form. 
*/ + + /* Let's not format times with years > 9999 */ + if (t > USEC_TIMESTAMP_FORMATTABLE_MAX) { + static const char* const xxx[_TIMESTAMP_STYLE_MAX] = { + [TIMESTAMP_PRETTY] = "--- XXXX-XX-XX XX:XX:XX", + [TIMESTAMP_US] = "--- XXXX-XX-XX XX:XX:XX.XXXXXX", + [TIMESTAMP_UTC] = "--- XXXX-XX-XX XX:XX:XX UTC", + [TIMESTAMP_US_UTC] = "--- XXXX-XX-XX XX:XX:XX.XXXXXX UTC", + [TIMESTAMP_DATE] = "--- XXXX-XX-XX", + }; + + assert(l >= strlen(xxx[style]) + 1); + return strcpy(buf, xxx[style]); + } + + sec = (time_t) (t / USEC_PER_SEC); /* Round down */ + + if (!localtime_or_gmtime_r(&sec, &tm, utc)) + return NULL; + + /* Start with the week day */ + assert((size_t) tm.tm_wday < ELEMENTSOF(weekdays)); + memcpy(buf, weekdays[tm.tm_wday], 4); + + if (style == TIMESTAMP_DATE) { + /* Special format string if only date should be shown. */ + if (strftime(buf + 3, l - 3, " %Y-%m-%d", &tm) <= 0) + return NULL; /* Doesn't fit */ + + return buf; + } + + /* Add the main components */ + if (strftime(buf + 3, l - 3, " %Y-%m-%d %H:%M:%S", &tm) <= 0) + return NULL; /* Doesn't fit */ + + /* Append the microseconds part, if that's requested */ + if (us) { + n = strlen(buf); + if (n + 8 > l) + return NULL; /* Microseconds part doesn't fit. */ + + sprintf(buf + n, ".%06"PRI_USEC, t % USEC_PER_SEC); + } + + /* Append the timezone */ + n = strlen(buf); + if (utc) { + /* If this is UTC then let's explicitly use the "UTC" string here, because gmtime_r() + * normally uses the obsolete "GMT" instead. */ + if (n + 5 > l) + return NULL; /* "UTC" doesn't fit. */ + + strcpy(buf + n, " UTC"); + + } else if (!isempty(tm.tm_zone)) { + size_t tn; + + /* An explicit timezone is specified, let's use it, if it fits */ + tn = strlen(tm.tm_zone); + if (n + 1 + tn + 1 > l) { + /* The full time zone does not fit in. Yuck. */ + + if (n + 1 + _POSIX_TZNAME_MAX + 1 > l) + return NULL; /* Not even enough space for the POSIX minimum (of 6)? In that + * case, complain that it doesn't fit. 
*/ + + /* So the time zone doesn't fit in fully, but the caller passed enough space for the + * POSIX minimum time zone length. In this case suppress the timezone entirely, in + * order not to dump an overly long, hard to read string on the user. This should be + * safe, because the user will assume the local timezone anyway if none is shown. And + * so does parse_timestamp(). */ + } else { + buf[n++] = ' '; + strcpy(buf + n, tm.tm_zone); + } + } + + return buf; +} + +char* format_timestamp_relative_full(char *buf, size_t l, usec_t t, clockid_t clock, bool implicit_left) { + const char *s; + usec_t n, d; + + assert(buf); + + if (!timestamp_is_set(t)) + return NULL; + + n = now(clock); + if (n > t) { + d = n - t; + s = " ago"; + } else { + d = t - n; + s = implicit_left ? "" : " left"; + } + + if (d >= USEC_PER_YEAR) { + usec_t years = d / USEC_PER_YEAR; + usec_t months = (d % USEC_PER_YEAR) / USEC_PER_MONTH; + + (void) snprintf(buf, l, USEC_FMT " %s " USEC_FMT " %s%s", + years, + years == 1 ? "year" : "years", + months, + months == 1 ? "month" : "months", + s); + } else if (d >= USEC_PER_MONTH) { + usec_t months = d / USEC_PER_MONTH; + usec_t days = (d % USEC_PER_MONTH) / USEC_PER_DAY; + + (void) snprintf(buf, l, USEC_FMT " %s " USEC_FMT " %s%s", + months, + months == 1 ? "month" : "months", + days, + days == 1 ? "day" : "days", + s); + } else if (d >= USEC_PER_WEEK) { + usec_t weeks = d / USEC_PER_WEEK; + usec_t days = (d % USEC_PER_WEEK) / USEC_PER_DAY; + + (void) snprintf(buf, l, USEC_FMT " %s " USEC_FMT " %s%s", + weeks, + weeks == 1 ? "week" : "weeks", + days, + days == 1 ? 
"day" : "days", + s); + } else if (d >= 2*USEC_PER_DAY) + (void) snprintf(buf, l, USEC_FMT " days%s", d / USEC_PER_DAY,s); + else if (d >= 25*USEC_PER_HOUR) + (void) snprintf(buf, l, "1 day " USEC_FMT "h%s", + (d - USEC_PER_DAY) / USEC_PER_HOUR, s); + else if (d >= 6*USEC_PER_HOUR) + (void) snprintf(buf, l, USEC_FMT "h%s", + d / USEC_PER_HOUR, s); + else if (d >= USEC_PER_HOUR) + (void) snprintf(buf, l, USEC_FMT "h " USEC_FMT "min%s", + d / USEC_PER_HOUR, + (d % USEC_PER_HOUR) / USEC_PER_MINUTE, s); + else if (d >= 5*USEC_PER_MINUTE) + (void) snprintf(buf, l, USEC_FMT "min%s", + d / USEC_PER_MINUTE, s); + else if (d >= USEC_PER_MINUTE) + (void) snprintf(buf, l, USEC_FMT "min " USEC_FMT "s%s", + d / USEC_PER_MINUTE, + (d % USEC_PER_MINUTE) / USEC_PER_SEC, s); + else if (d >= USEC_PER_SEC) + (void) snprintf(buf, l, USEC_FMT "s%s", + d / USEC_PER_SEC, s); + else if (d >= USEC_PER_MSEC) + (void) snprintf(buf, l, USEC_FMT "ms%s", + d / USEC_PER_MSEC, s); + else if (d > 0) + (void) snprintf(buf, l, USEC_FMT"us%s", + d, s); + else + (void) snprintf(buf, l, "now"); + + buf[l-1] = 0; + return buf; +} + +char* format_timespan(char *buf, size_t l, usec_t t, usec_t accuracy) { + static const struct { + const char *suffix; + usec_t usec; + } table[] = { + { "y", USEC_PER_YEAR }, + { "month", USEC_PER_MONTH }, + { "w", USEC_PER_WEEK }, + { "d", USEC_PER_DAY }, + { "h", USEC_PER_HOUR }, + { "min", USEC_PER_MINUTE }, + { "s", USEC_PER_SEC }, + { "ms", USEC_PER_MSEC }, + { "us", 1 }, + }; + + char *p = ASSERT_PTR(buf); + bool something = false; + + assert(l > 0); + + if (t == USEC_INFINITY) { + strncpy(p, "infinity", l-1); + p[l-1] = 0; + return p; + } + + if (t <= 0) { + strncpy(p, "0", l-1); + p[l-1] = 0; + return p; + } + + /* The result of this function can be parsed with parse_sec */ + + for (size_t i = 0; i < ELEMENTSOF(table); i++) { + int k = 0; + size_t n; + bool done = false; + usec_t a, b; + + if (t <= 0) + break; + + if (t < accuracy && something) + break; + + if (t < 
table[i].usec) + continue; + + if (l <= 1) + break; + + a = t / table[i].usec; + b = t % table[i].usec; + + /* Let's see if we should show this in dot notation */ + if (t < USEC_PER_MINUTE && b > 0) { + signed char j = 0; + + for (usec_t cc = table[i].usec; cc > 1; cc /= 10) + j++; + + for (usec_t cc = accuracy; cc > 1; cc /= 10) { + b /= 10; + j--; + } + + if (j > 0) { + k = snprintf(p, l, + "%s"USEC_FMT".%0*"PRI_USEC"%s", + p > buf ? " " : "", + a, + j, + b, + table[i].suffix); + + t = 0; + done = true; + } + } + + /* No? Then let's show it normally */ + if (!done) { + k = snprintf(p, l, + "%s"USEC_FMT"%s", + p > buf ? " " : "", + a, + table[i].suffix); + + t = b; + } + + n = MIN((size_t) k, l-1); /* snprintf() reports the untruncated length, hence clamp to what actually fit */ + + l -= n; + p += n; + + something = true; + } + + *p = 0; + + return buf; +} + +static int parse_timestamp_impl( + const char *t, + size_t max_len, + bool utc, + int isdst, + long gmtoff, + usec_t *ret) { + + static const struct { + const char *name; + const int nr; + } day_nr[] = { + { "Sunday", 0 }, + { "Sun", 0 }, + { "Monday", 1 }, + { "Mon", 1 }, + { "Tuesday", 2 }, + { "Tue", 2 }, + { "Wednesday", 3 }, + { "Wed", 3 }, + { "Thursday", 4 }, + { "Thu", 4 }, + { "Friday", 5 }, + { "Fri", 5 }, + { "Saturday", 6 }, + { "Sat", 6 }, + }; + + _cleanup_free_ char *t_alloc = NULL; + usec_t usec, plus = 0, minus = 0; + bool with_tz = false; + int r, weekday = -1; + unsigned fractional = 0; + const char *k; + struct tm tm, copy; + time_t sec; + + /* Allowed syntaxes: + * + * 2012-09-22 16:34:22.1[2[3[4[5[6]]]]] + * 2012-09-22 16:34:22 (µsec will be set to 0) + * 2012-09-22 16:34 (seconds will be set to 0) + * 2012-09-22T16:34:22.1[2[3[4[5[6]]]]] + * 2012-09-22T16:34:22 (µsec will be set to 0) + * 2012-09-22T16:34 (seconds will be set to 0) + * 2012-09-22 (time will be set to 00:00:00) + * 16:34:22 (date will be set to today) + * 16:34 (date will be set to today, seconds to 0) + * now + * yesterday (time is set to 00:00:00) + * today (time is set to 00:00:00) + * tomorrow 
(time is set to 00:00:00) + * +5min + * -5days + * @2147483647 (seconds since epoch) + * + * Note, on DST change, 00:00:00 may not exist and in that case the time part may be shifted. + * E.g. "Sun 2023-03-13 America/Havana" is parsed as "Sun 2023-03-13 01:00:00 CDT". + * + * A simplified strptime-spelled RFC3339 ABNF looks like + * "%Y-%m-%d" "T" "%H" ":" "%M" ":" "%S" [".%N"] ("Z" / (("+" / "-") "%H:%M")) + * We additionally allow no seconds and inherited timezone + * for symmetry with our other syntaxes and improved interactive usability: + * "%Y-%m-%d" "T" "%H" ":" "%M" [":" "%S" [".%N"]] ["Z" / (("+" / "-") "%H:%M")] + * RFC3339 defines time-secfrac as "." 1*DIGIT, but we limit to 6 digits, + * since we're limited to 1µs resolution. + * We also accept "Sat 2012-09-22T16:34:22", RFC3339 warns against it. + */ + + assert(t); + + if (max_len != SIZE_MAX) { + /* If the input string contains timezone, then cut it here. */ + + if (max_len == 0) /* Can't be the only field */ + return -EINVAL; + + t_alloc = strndup(t, max_len); + if (!t_alloc) + return -ENOMEM; + + t = t_alloc; + with_tz = true; + } + + if (utc) { + /* glibc accepts gmtoff more than 24 hours, but we refuse it. 
*/ + if ((usec_t) labs(gmtoff) * USEC_PER_SEC > USEC_PER_DAY) + return -EINVAL; + } else { + if (gmtoff != 0) + return -EINVAL; + } + + if (t[0] == '@' && !with_tz) + return parse_sec(t + 1, ret); + + usec = now(CLOCK_REALTIME); + + if (!with_tz) { + if (streq(t, "now")) + goto finish; + + if (t[0] == '+') { + r = parse_sec(t+1, &plus); + if (r < 0) + return r; + + goto finish; + } + + if (t[0] == '-') { + r = parse_sec(t+1, &minus); + if (r < 0) + return r; + + goto finish; + } + + if ((k = endswith(t, " ago"))) { + _cleanup_free_ char *buf = NULL; + + buf = strndup(t, k - t); + if (!buf) + return -ENOMEM; + + r = parse_sec(buf, &minus); + if (r < 0) + return r; + + goto finish; + } + + if ((k = endswith(t, " left"))) { + _cleanup_free_ char *buf = NULL; + + buf = strndup(t, k - t); + if (!buf) + return -ENOMEM; + + r = parse_sec(buf, &plus); + if (r < 0) + return r; + + goto finish; + } + } + + sec = (time_t) (usec / USEC_PER_SEC); + + if (!localtime_or_gmtime_r(&sec, &tm, utc)) + return -EINVAL; + + tm.tm_isdst = isdst; + + if (streq(t, "today")) { + tm.tm_sec = tm.tm_min = tm.tm_hour = 0; + goto from_tm; + + } else if (streq(t, "yesterday")) { + tm.tm_mday--; + tm.tm_sec = tm.tm_min = tm.tm_hour = 0; + goto from_tm; + + } else if (streq(t, "tomorrow")) { + tm.tm_mday++; + tm.tm_sec = tm.tm_min = tm.tm_hour = 0; + goto from_tm; + } + + for (size_t i = 0; i < ELEMENTSOF(day_nr); i++) { + k = startswith_no_case(t, day_nr[i].name); + if (!k || *k != ' ') + continue; + + weekday = day_nr[i].nr; + t = k + 1; + break; + } + + copy = tm; + k = strptime(t, "%y-%m-%d %H:%M:%S", &tm); + if (k) { + if (*k == '.') + goto parse_usec; + else if (*k == 0) + goto from_tm; + } + + /* Our "canonical" RFC3339 syntax variant */ + tm = copy; + k = strptime(t, "%Y-%m-%d %H:%M:%S", &tm); + if (k) { + if (*k == '.') + goto parse_usec; + else if (*k == 0) + goto from_tm; + } + + /* RFC3339 syntax */ + tm = copy; + k = strptime(t, "%Y-%m-%dT%H:%M:%S", &tm); + if (k) { + if (*k == '.') + 
goto parse_usec; + else if (*k == 0) + goto from_tm; + } + + /* Support OUTPUT_SHORT and OUTPUT_SHORT_PRECISE formats */ + tm = copy; + k = strptime(t, "%b %d %H:%M:%S", &tm); + if (k) { + if (*k == '.') + goto parse_usec; + else if (*k == 0) + goto from_tm; + } + + tm = copy; + k = strptime(t, "%y-%m-%d %H:%M", &tm); + if (k && *k == 0) { + tm.tm_sec = 0; + goto from_tm; + } + + /* Our "canonical" RFC3339 syntax variant without seconds */ + tm = copy; + k = strptime(t, "%Y-%m-%d %H:%M", &tm); + if (k && *k == 0) { + tm.tm_sec = 0; + goto from_tm; + } + + /* RFC3339 syntax without seconds */ + tm = copy; + k = strptime(t, "%Y-%m-%dT%H:%M", &tm); + if (k && *k == 0) { + tm.tm_sec = 0; + goto from_tm; + } + + tm = copy; + k = strptime(t, "%y-%m-%d", &tm); + if (k && *k == 0) { + tm.tm_sec = tm.tm_min = tm.tm_hour = 0; + goto from_tm; + } + + tm = copy; + k = strptime(t, "%Y-%m-%d", &tm); + if (k && *k == 0) { + tm.tm_sec = tm.tm_min = tm.tm_hour = 0; + goto from_tm; + } + + tm = copy; + k = strptime(t, "%H:%M:%S", &tm); + if (k) { + if (*k == '.') + goto parse_usec; + else if (*k == 0) + goto from_tm; + } + + tm = copy; + k = strptime(t, "%H:%M", &tm); + if (k && *k == 0) { + tm.tm_sec = 0; + goto from_tm; + } + + return -EINVAL; + +parse_usec: + k++; + r = parse_fractional_part_u(&k, 6, &fractional); + if (r < 0) + return -EINVAL; + if (*k != '\0') + return -EINVAL; + +from_tm: + assert(plus == 0); + assert(minus == 0); + + if (weekday >= 0 && tm.tm_wday != weekday) + return -EINVAL; + + if (gmtoff < 0) { + plus = -gmtoff * USEC_PER_SEC; + + /* If gmtoff is negative, the string may be too old to be parsed as UTC. + * E.g. 1969-12-31 23:00:00 -06 == 1970-01-01 05:00:00 UTC + * We assumed that gmtoff is in the range of -24:00…+24:00, hence the only date we need to + * handle here is 1969-12-31. So, let's shift the date with one day, then subtract the shift + * later. 
*/ + if (tm.tm_year == 69 && tm.tm_mon == 11 && tm.tm_mday == 31) { + /* Thu 1970-01-01-00:00:00 */ + tm.tm_year = 70; + tm.tm_mon = 0; + tm.tm_mday = 1; + tm.tm_wday = 4; + tm.tm_yday = 0; + minus = USEC_PER_DAY; + } + } else + minus = gmtoff * USEC_PER_SEC; + + sec = mktime_or_timegm(&tm, utc); + if (sec < 0) + return -EINVAL; + + usec = usec_add(sec * USEC_PER_SEC, fractional); + +finish: + usec = usec_add(usec, plus); + + if (usec < minus) + return -EINVAL; + + usec = usec_sub_unsigned(usec, minus); + + if (usec > USEC_TIMESTAMP_FORMATTABLE_MAX) + return -EINVAL; + + if (ret) + *ret = usec; + return 0; +} + +static int parse_timestamp_maybe_with_tz(const char *t, size_t tz_offset, bool valid_tz, usec_t *ret) { + assert(t); + + tzset(); + + for (int j = 0; j <= 1; j++) { + if (isempty(tzname[j])) + continue; + + if (!streq(t + tz_offset, tzname[j])) + continue; + + /* The specified timezone matches tzname[] of the local timezone. */ + return parse_timestamp_impl(t, tz_offset - 1, /* utc = */ false, /* isdst = */ j, /* gmtoff = */ 0, ret); + } + + /* If we know that the last word is a valid timezone (e.g. Asia/Tokyo), then simply drop the timezone + * and parse the remaining string as a local time. If we know that the last word is not a timezone, + * then assume that it is a part of the time and try to parse the whole string as a local time. */ + return parse_timestamp_impl(t, valid_tz ? 
tz_offset - 1 : SIZE_MAX, + /* utc = */ false, /* isdst = */ -1, /* gmtoff = */ 0, ret); +} + +typedef struct ParseTimestampResult { + usec_t usec; + int return_value; +} ParseTimestampResult; + +int parse_timestamp(const char *t, usec_t *ret) { + ParseTimestampResult *shared, tmp; + const char *k, *tz, *current_tz; + size_t max_len, t_len; + struct tm tm; + int r; + + assert(t); + + t_len = strlen(t); + if (t_len > 2 && t[t_len - 1] == 'Z' && t[t_len - 2] != ' ') /* RFC3339-style welded UTC: "1985-04-12T23:20:50.52Z" */ + return parse_timestamp_impl(t, t_len - 1, /* utc = */ true, /* isdst = */ -1, /* gmtoff = */ 0, ret); + + if (t_len > 7 && IN_SET(t[t_len - 6], '+', '-') && t[t_len - 7] != ' ') { /* RFC3339-style welded offset: "1990-12-31T15:59:60-08:00" */ + k = strptime(&t[t_len - 6], "%z", &tm); + if (k && *k == '\0') + return parse_timestamp_impl(t, t_len - 6, /* utc = */ true, /* isdst = */ -1, /* gmtoff = */ tm.tm_gmtoff, ret); + } + + tz = strrchr(t, ' '); + if (!tz) + return parse_timestamp_impl(t, /* max_len = */ SIZE_MAX, /* utc = */ false, /* isdst = */ -1, /* gmtoff = */ 0, ret); + + max_len = tz - t; + tz++; + + /* Shortcut, parse the string as UTC. */ + if (streq(tz, "UTC")) + return parse_timestamp_impl(t, max_len, /* utc = */ true, /* isdst = */ -1, /* gmtoff = */ 0, ret); + + /* If the timezone is compatible with RFC-822/ISO 8601 (e.g. +06, or -03:00) then parse the string as + * UTC and shift the result. Note, this must be earlier than the timezone check with tzname[], as + * tzname[] may be in the same format. */ + k = strptime(tz, "%z", &tm); + if (k && *k == '\0') + return parse_timestamp_impl(t, max_len, /* utc = */ true, /* isdst = */ -1, /* gmtoff = */ tm.tm_gmtoff, ret); + + /* If the last word is not a timezone file (e.g. Asia/Tokyo), then let's check if it matches + * tzname[] of the local timezone, e.g. JST or CEST. 
*/ + if (!timezone_is_valid(tz, LOG_DEBUG)) + return parse_timestamp_maybe_with_tz(t, tz - t, /* valid_tz = */ false, ret); + + /* Shortcut. If the current $TZ is equivalent to the specified timezone, it is not necessary to fork + * the process. */ + current_tz = getenv("TZ"); + if (current_tz && *current_tz == ':' && streq(current_tz + 1, tz)) + return parse_timestamp_maybe_with_tz(t, tz - t, /* valid_tz = */ true, ret); + + /* Otherwise, to avoid polluting the current environment variables, let's fork the process and set + * the specified timezone in the child process. */ + + shared = mmap(NULL, sizeof *shared, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) + return negative_errno(); + + r = safe_fork("(sd-timestamp)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_WAIT, NULL); + if (r < 0) { + (void) munmap(shared, sizeof *shared); + return r; + } + if (r == 0) { + const char *colon_tz; + + /* tzset(3) says $TZ should be prefixed with ":" if we reference timezone files */ + colon_tz = strjoina(":", tz); + + if (setenv("TZ", colon_tz, 1) != 0) { + shared->return_value = negative_errno(); + _exit(EXIT_FAILURE); + } + + shared->return_value = parse_timestamp_maybe_with_tz(t, tz - t, /* valid_tz = */ true, &shared->usec); + + _exit(EXIT_SUCCESS); + } + + tmp = *shared; + if (munmap(shared, sizeof *shared) != 0) + return negative_errno(); + + if (tmp.return_value == 0 && ret) + *ret = tmp.usec; + + return tmp.return_value; +} + +static const char* extract_multiplier(const char *p, usec_t *ret) { + static const struct { + const char *suffix; + usec_t usec; + } table[] = { + { "seconds", USEC_PER_SEC }, + { "second", USEC_PER_SEC }, + { "sec", USEC_PER_SEC }, + { "s", USEC_PER_SEC }, + { "minutes", USEC_PER_MINUTE }, + { "minute", USEC_PER_MINUTE }, + { "min", USEC_PER_MINUTE }, + { "months", USEC_PER_MONTH }, + { "month", USEC_PER_MONTH }, + { "M", USEC_PER_MONTH }, + { "msec", USEC_PER_MSEC }, + { "ms", 
USEC_PER_MSEC }, + { "m", USEC_PER_MINUTE }, + { "hours", USEC_PER_HOUR }, + { "hour", USEC_PER_HOUR }, + { "hr", USEC_PER_HOUR }, + { "h", USEC_PER_HOUR }, + { "days", USEC_PER_DAY }, + { "day", USEC_PER_DAY }, + { "d", USEC_PER_DAY }, + { "weeks", USEC_PER_WEEK }, + { "week", USEC_PER_WEEK }, + { "w", USEC_PER_WEEK }, + { "years", USEC_PER_YEAR }, + { "year", USEC_PER_YEAR }, + { "y", USEC_PER_YEAR }, + { "usec", 1ULL }, + { "us", 1ULL }, + { "μs", 1ULL }, /* U+03bc (aka GREEK SMALL LETTER MU) */ + { "µs", 1ULL }, /* U+b5 (aka MICRO SIGN) */ + }; + + assert(p); + assert(ret); + + for (size_t i = 0; i < ELEMENTSOF(table); i++) { + char *e; + + e = startswith(p, table[i].suffix); + if (e) { + *ret = table[i].usec; + return e; + } + } + + return p; +} + +int parse_time(const char *t, usec_t *ret, usec_t default_unit) { + const char *p, *s; + usec_t usec = 0; + bool something = false; + + assert(t); + assert(default_unit > 0); + + p = t; + + p += strspn(p, WHITESPACE); + s = startswith(p, "infinity"); + if (s) { + s += strspn(s, WHITESPACE); + if (*s != 0) + return -EINVAL; + + if (ret) + *ret = USEC_INFINITY; + return 0; + } + + for (;;) { + usec_t multiplier = default_unit, k; + long long l; + char *e; + + p += strspn(p, WHITESPACE); + + if (*p == 0) { + if (!something) + return -EINVAL; + + break; + } + + if (*p == '-') /* Don't allow "-0" */ + return -ERANGE; + + errno = 0; + l = strtoll(p, &e, 10); + if (errno > 0) + return -errno; + if (l < 0) + return -ERANGE; + + if (*e == '.') { + p = e + 1; + p += strspn(p, DIGITS); + } else if (e == p) + return -EINVAL; + else + p = e; + + s = extract_multiplier(p + strspn(p, WHITESPACE), &multiplier); + if (s == p && *s != '\0') + /* Don't allow '12.34.56', but accept '12.34 .56' or '12.34s.56' */ + return -EINVAL; + + p = s; + + if ((usec_t) l >= USEC_INFINITY / multiplier) + return -ERANGE; + + k = (usec_t) l * multiplier; + if (k >= USEC_INFINITY - usec) + return -ERANGE; + + usec += k; + + something = true; + + if (*e 
== '.') { + usec_t m = multiplier / 10; + const char *b; + + for (b = e + 1; *b >= '0' && *b <= '9'; b++, m /= 10) { + k = (usec_t) (*b - '0') * m; + if (k >= USEC_INFINITY - usec) + return -ERANGE; + + usec += k; + } + + /* Don't allow "0.-0", "3.+1", "3. 1", "3.sec" or "3.hoge" */ + if (b == e + 1) + return -EINVAL; + } + } + + if (ret) + *ret = usec; + return 0; +} + +int parse_sec(const char *t, usec_t *ret) { + return parse_time(t, ret, USEC_PER_SEC); +} + +int parse_sec_fix_0(const char *t, usec_t *ret) { + usec_t k; + int r; + + assert(t); + assert(ret); + + r = parse_sec(t, &k); + if (r < 0) + return r; + + *ret = k == 0 ? USEC_INFINITY : k; + return r; +} + +int parse_sec_def_infinity(const char *t, usec_t *ret) { + assert(t); + assert(ret); + + t += strspn(t, WHITESPACE); + if (isempty(t)) { + *ret = USEC_INFINITY; + return 0; + } + return parse_sec(t, ret); +} + +static const char* extract_nsec_multiplier(const char *p, nsec_t *ret) { + static const struct { + const char *suffix; + nsec_t nsec; + } table[] = { + { "seconds", NSEC_PER_SEC }, + { "second", NSEC_PER_SEC }, + { "sec", NSEC_PER_SEC }, + { "s", NSEC_PER_SEC }, + { "minutes", NSEC_PER_MINUTE }, + { "minute", NSEC_PER_MINUTE }, + { "min", NSEC_PER_MINUTE }, + { "months", NSEC_PER_MONTH }, + { "month", NSEC_PER_MONTH }, + { "M", NSEC_PER_MONTH }, + { "msec", NSEC_PER_MSEC }, + { "ms", NSEC_PER_MSEC }, + { "m", NSEC_PER_MINUTE }, + { "hours", NSEC_PER_HOUR }, + { "hour", NSEC_PER_HOUR }, + { "hr", NSEC_PER_HOUR }, + { "h", NSEC_PER_HOUR }, + { "days", NSEC_PER_DAY }, + { "day", NSEC_PER_DAY }, + { "d", NSEC_PER_DAY }, + { "weeks", NSEC_PER_WEEK }, + { "week", NSEC_PER_WEEK }, + { "w", NSEC_PER_WEEK }, + { "years", NSEC_PER_YEAR }, + { "year", NSEC_PER_YEAR }, + { "y", NSEC_PER_YEAR }, + { "usec", NSEC_PER_USEC }, + { "us", NSEC_PER_USEC }, + { "μs", NSEC_PER_USEC }, /* U+03bc (aka GREEK LETTER MU) */ + { "µs", NSEC_PER_USEC }, /* U+b5 (aka MICRO SIGN) */ + { "nsec", 1ULL }, + { "ns", 1ULL }, + { 
"", 1ULL }, /* default is nsec */ + }; + size_t i; + + assert(p); + assert(ret); + + for (i = 0; i < ELEMENTSOF(table); i++) { + char *e; + + e = startswith(p, table[i].suffix); + if (e) { + *ret = table[i].nsec; + return e; + } + } + + return p; +} + +int parse_nsec(const char *t, nsec_t *ret) { + const char *p, *s; + nsec_t nsec = 0; + bool something = false; + + assert(t); + assert(ret); + + p = t; + + p += strspn(p, WHITESPACE); + s = startswith(p, "infinity"); + if (s) { + s += strspn(s, WHITESPACE); + if (*s != 0) + return -EINVAL; + + *ret = NSEC_INFINITY; + return 0; + } + + for (;;) { + nsec_t multiplier = 1, k; + long long l; + char *e; + + p += strspn(p, WHITESPACE); + + if (*p == 0) { + if (!something) + return -EINVAL; + + break; + } + + if (*p == '-') /* Don't allow "-0" */ + return -ERANGE; + + errno = 0; + l = strtoll(p, &e, 10); + if (errno > 0) + return -errno; + if (l < 0) + return -ERANGE; + + if (*e == '.') { + p = e + 1; + p += strspn(p, DIGITS); + } else if (e == p) + return -EINVAL; + else + p = e; + + s = extract_nsec_multiplier(p + strspn(p, WHITESPACE), &multiplier); + if (s == p && *s != '\0') + /* Don't allow '12.34.56', but accept '12.34 .56' or '12.34s.56' */ + return -EINVAL; + + p = s; + + if ((nsec_t) l >= NSEC_INFINITY / multiplier) + return -ERANGE; + + k = (nsec_t) l * multiplier; + if (k >= NSEC_INFINITY - nsec) + return -ERANGE; + + nsec += k; + + something = true; + + if (*e == '.') { + nsec_t m = multiplier / 10; + const char *b; + + for (b = e + 1; *b >= '0' && *b <= '9'; b++, m /= 10) { + k = (nsec_t) (*b - '0') * m; + if (k >= NSEC_INFINITY - nsec) + return -ERANGE; + + nsec += k; + } + + /* Don't allow "0.-0", "3.+1", "3. 
1", "3.sec" or "3.hoge" */ + if (b == e + 1) + return -EINVAL; + } + } + + *ret = nsec; + + return 0; +} + +static int get_timezones_from_zone1970_tab(char ***ret) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **zones = NULL; + int r; + + assert(ret); + + f = fopen("/usr/share/zoneinfo/zone1970.tab", "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL, *cc = NULL, *co = NULL, *tz = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + const char *p = line; + + /* Line format is: + * 'country codes' 'coordinates' 'timezone' 'comments' */ + r = extract_many_words(&p, NULL, 0, &cc, &co, &tz, NULL); + if (r < 0) + continue; + + /* Lines that start with # are comments. */ + if (*cc == '#') + continue; + + r = strv_extend(&zones, tz); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(zones); + return 0; +} + +static int get_timezones_from_tzdata_zi(char ***ret) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **zones = NULL; + int r; + + assert(ret); + + f = fopen("/usr/share/zoneinfo/tzdata.zi", "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL, *type = NULL, *f1 = NULL, *f2 = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + const char *p = line; + + /* The only lines we care about are Zone and Link lines. + * Zone line format is: + * 'Zone' 'timezone' ... + * Link line format is: + * 'Link' 'target' 'alias' + * See 'man zic' for more detail. */ + r = extract_many_words(&p, NULL, 0, &type, &f1, &f2, NULL); + if (r < 0) + continue; + + char *tz; + if (IN_SET(*type, 'Z', 'z')) + /* Zone lines have timezone in field 1. */ + tz = f1; + else if (IN_SET(*type, 'L', 'l')) + /* Link lines have timezone in field 2. */ + tz = f2; + else + /* Not a line we care about. 
*/ + continue; + + r = strv_extend(&zones, tz); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(zones); + return 0; +} + +int get_timezones(char ***ret) { + _cleanup_strv_free_ char **zones = NULL; + int r; + + assert(ret); + + r = get_timezones_from_tzdata_zi(&zones); + if (r == -ENOENT) { + log_debug_errno(r, "Could not get timezone data from tzdata.zi, using zone1970.tab: %m"); + r = get_timezones_from_zone1970_tab(&zones); + if (r == -ENOENT) + log_debug_errno(r, "Could not get timezone data from zone1970.tab, using UTC: %m"); + } + if (r < 0 && r != -ENOENT) + return r; + + /* Always include UTC */ + r = strv_extend(&zones, "UTC"); + if (r < 0) + return -ENOMEM; + + strv_sort(zones); + strv_uniq(zones); + + *ret = TAKE_PTR(zones); + return 0; +} + +int verify_timezone(const char *name, int log_level) { + bool slash = false; + const char *p, *t; + _cleanup_close_ int fd = -EBADF; + char buf[4]; + int r; + + if (isempty(name)) + return -EINVAL; + + /* Always accept "UTC" as valid timezone, since it's the fallback, even if user has no timezones installed. 
*/ + if (streq(name, "UTC")) + return 0; + + if (name[0] == '/') + return -EINVAL; + + for (p = name; *p; p++) { + if (!ascii_isdigit(*p) && + !ascii_isalpha(*p) && + !IN_SET(*p, '-', '_', '+', '/')) + return -EINVAL; + + if (*p == '/') { + + if (slash) + return -EINVAL; + + slash = true; + } else + slash = false; + } + + if (slash) + return -EINVAL; + + if (p - name >= PATH_MAX) + return -ENAMETOOLONG; + + t = strjoina("/usr/share/zoneinfo/", name); + + fd = open(t, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_full_errno(log_level, errno, "Failed to open timezone file '%s': %m", t); + + r = fd_verify_regular(fd); + if (r < 0) + return log_full_errno(log_level, r, "Timezone file '%s' is not a regular file: %m", t); + + r = loop_read_exact(fd, buf, 4, false); + if (r < 0) + return log_full_errno(log_level, r, "Failed to read from timezone file '%s': %m", t); + + /* Magic from tzfile(5) */ + if (memcmp(buf, "TZif", 4) != 0) + return log_full_errno(log_level, SYNTHETIC_ERRNO(EBADMSG), + "Timezone file '%s' has wrong magic bytes", t); + + return 0; +} + +bool clock_supported(clockid_t clock) { + struct timespec ts; + + switch (clock) { + + case CLOCK_MONOTONIC: + case CLOCK_REALTIME: + case CLOCK_BOOTTIME: + /* These three are always available in our baseline, and work in timerfd, as of kernel 3.15 */ + return true; + + default: + /* For everything else, check properly */ + return clock_gettime(clock, &ts) >= 0; + } +} + +int get_timezone(char **ret) { + _cleanup_free_ char *t = NULL; + const char *e; + char *z; + int r; + + assert(ret); + + r = readlink_malloc("/etc/localtime", &t); + if (r == -ENOENT) { + /* If the symlink does not exist, assume "UTC", like glibc does */ + z = strdup("UTC"); + if (!z) + return -ENOMEM; + + *ret = z; + return 0; + } + if (r < 0) + return r; /* returns EINVAL if not a symlink */ + + e = PATH_STARTSWITH_SET(t, "/usr/share/zoneinfo/", "../usr/share/zoneinfo/"); + if (!e) + return -EINVAL; + + if (!timezone_is_valid(e, LOG_DEBUG)) + 
return -EINVAL; + + z = strdup(e); + if (!z) + return -ENOMEM; + + *ret = z; + return 0; +} + +time_t mktime_or_timegm(struct tm *tm, bool utc) { + assert(tm); + + return utc ? timegm(tm) : mktime(tm); +} + +struct tm *localtime_or_gmtime_r(const time_t *t, struct tm *tm, bool utc) { + assert(t); + assert(tm); + + return utc ? gmtime_r(t, tm) : localtime_r(t, tm); +} + +static uint32_t sysconf_clock_ticks_cached(void) { + static thread_local uint32_t hz = 0; + long r; + + if (hz == 0) { + r = sysconf(_SC_CLK_TCK); + + assert(r > 0); + hz = r; + } + + return hz; +} + +uint32_t usec_to_jiffies(usec_t u) { + uint32_t hz = sysconf_clock_ticks_cached(); + return DIV_ROUND_UP(u, USEC_PER_SEC / hz); +} + +usec_t jiffies_to_usec(uint32_t j) { + uint32_t hz = sysconf_clock_ticks_cached(); + return DIV_ROUND_UP(j * USEC_PER_SEC, hz); +} + +usec_t usec_shift_clock(usec_t x, clockid_t from, clockid_t to) { + usec_t a, b; + + if (x == USEC_INFINITY) + return USEC_INFINITY; + if (map_clock_id(from) == map_clock_id(to)) + return x; + + a = now(from); + b = now(to); + + if (x > a) + /* x lies in the future */ + return usec_add(b, usec_sub_unsigned(x, a)); + else + /* x lies in the past */ + return usec_sub_unsigned(b, usec_sub_unsigned(a, x)); +} + +bool in_utc_timezone(void) { + tzset(); + + return timezone == 0 && daylight == 0; +} + +int time_change_fd(void) { + + /* We only care for the cancellation event, hence we set the timeout to the latest possible value. */ + static const struct itimerspec its = { + .it_value.tv_sec = TIME_T_MAX, + }; + + _cleanup_close_ int fd = -EBADF; + + assert_cc(sizeof(time_t) == sizeof(TIME_T_MAX)); + + /* Uses TFD_TIMER_CANCEL_ON_SET to get notifications whenever CLOCK_REALTIME makes a jump relative to + * CLOCK_MONOTONIC. 
*/ + + fd = timerfd_create(CLOCK_REALTIME, TFD_NONBLOCK|TFD_CLOEXEC); + if (fd < 0) + return -errno; + + if (timerfd_settime(fd, TFD_TIMER_ABSTIME|TFD_TIMER_CANCEL_ON_SET, &its, NULL) >= 0) + return TAKE_FD(fd); + + /* So apparently there are systems where time_t is 64-bit, but the kernel actually doesn't support + * 64-bit time_t. In that case configuring a timer to TIME_T_MAX will fail with EOPNOTSUPP or a + * similar error. If that's the case let's try with INT32_MAX instead, maybe that works. It's a bit + * of a black magic thing though, but what can we do? + * + * We don't want this code on x86-64, hence let's conditionalize this for systems with 64-bit time_t + * but where "long" is shorter than 64-bit, i.e. 32-bit archs. + * + * See: https://github.com/systemd/systemd/issues/14362 */ + +#if SIZEOF_TIME_T == 8 && ULONG_MAX < UINT64_MAX + if (ERRNO_IS_NOT_SUPPORTED(errno) || errno == EOVERFLOW) { + static const struct itimerspec its32 = { + .it_value.tv_sec = INT32_MAX, + }; + + if (timerfd_settime(fd, TFD_TIMER_ABSTIME|TFD_TIMER_CANCEL_ON_SET, &its32, NULL) >= 0) + return TAKE_FD(fd); + } +#endif + + return -errno; +} + +static const char* const timestamp_style_table[_TIMESTAMP_STYLE_MAX] = { + [TIMESTAMP_PRETTY] = "pretty", + [TIMESTAMP_US] = "us", + [TIMESTAMP_UTC] = "utc", + [TIMESTAMP_US_UTC] = "us+utc", + [TIMESTAMP_UNIX] = "unix", +}; + +/* Use the macro for enum → string to allow for aliases */ +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(timestamp_style, TimestampStyle); + +/* For the string → enum mapping we use the generic implementation, but also support two aliases */ +TimestampStyle timestamp_style_from_string(const char *s) { + TimestampStyle t; + + t = (TimestampStyle) string_table_lookup(timestamp_style_table, ELEMENTSOF(timestamp_style_table), s); + if (t >= 0) + return t; + if (STRPTR_IN_SET(s, "µs", "μs")) /* accept both µ symbols in unicode, i.e. micro symbol + Greek small letter mu. 
*/ + return TIMESTAMP_US; + if (STRPTR_IN_SET(s, "µs+utc", "μs+utc")) + return TIMESTAMP_US_UTC; + return t; +} diff --git a/src/basic/time-util.h b/src/basic/time-util.h new file mode 100644 index 0000000..ed4c1aa --- /dev/null +++ b/src/basic/time-util.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include + +typedef uint64_t usec_t; +typedef uint64_t nsec_t; + +#define PRI_NSEC PRIu64 +#define PRI_USEC PRIu64 +#define NSEC_FMT "%" PRI_NSEC +#define USEC_FMT "%" PRI_USEC + +#include "macro.h" + +typedef struct dual_timestamp { + usec_t realtime; + usec_t monotonic; +} dual_timestamp; + +typedef struct triple_timestamp { + usec_t realtime; + usec_t monotonic; + usec_t boottime; +} triple_timestamp; + +typedef enum TimestampStyle { + TIMESTAMP_PRETTY, + TIMESTAMP_US, + TIMESTAMP_UTC, + TIMESTAMP_US_UTC, + TIMESTAMP_UNIX, + TIMESTAMP_DATE, + _TIMESTAMP_STYLE_MAX, + _TIMESTAMP_STYLE_INVALID = -EINVAL, +} TimestampStyle; + +#define USEC_INFINITY ((usec_t) UINT64_MAX) +#define NSEC_INFINITY ((nsec_t) UINT64_MAX) + +#define MSEC_PER_SEC 1000ULL +#define USEC_PER_SEC ((usec_t) 1000000ULL) +#define USEC_PER_MSEC ((usec_t) 1000ULL) +#define NSEC_PER_SEC ((nsec_t) 1000000000ULL) +#define NSEC_PER_MSEC ((nsec_t) 1000000ULL) +#define NSEC_PER_USEC ((nsec_t) 1000ULL) + +#define USEC_PER_MINUTE ((usec_t) (60ULL*USEC_PER_SEC)) +#define NSEC_PER_MINUTE ((nsec_t) (60ULL*NSEC_PER_SEC)) +#define USEC_PER_HOUR ((usec_t) (60ULL*USEC_PER_MINUTE)) +#define NSEC_PER_HOUR ((nsec_t) (60ULL*NSEC_PER_MINUTE)) +#define USEC_PER_DAY ((usec_t) (24ULL*USEC_PER_HOUR)) +#define NSEC_PER_DAY ((nsec_t) (24ULL*NSEC_PER_HOUR)) +#define USEC_PER_WEEK ((usec_t) (7ULL*USEC_PER_DAY)) +#define NSEC_PER_WEEK ((nsec_t) (7ULL*NSEC_PER_DAY)) +#define USEC_PER_MONTH ((usec_t) (2629800ULL*USEC_PER_SEC)) +#define NSEC_PER_MONTH ((nsec_t) (2629800ULL*NSEC_PER_SEC)) +#define USEC_PER_YEAR ((usec_t) (31557600ULL*USEC_PER_SEC)) 
+#define NSEC_PER_YEAR ((nsec_t) (31557600ULL*NSEC_PER_SEC)) + +/* We assume a maximum timezone length of 6. TZNAME_MAX is not defined on Linux, but glibc internally initializes this + * to 6. Let's rely on that. */ +#define FORMAT_TIMESTAMP_MAX (3U+1U+10U+1U+8U+1U+6U+1U+6U+1U) +#define FORMAT_TIMESTAMP_RELATIVE_MAX 256U +#define FORMAT_TIMESPAN_MAX 64U + +#define TIME_T_MAX (time_t)((UINTMAX_C(1) << ((sizeof(time_t) << 3) - 1)) - 1) + +#define DUAL_TIMESTAMP_NULL ((struct dual_timestamp) {}) +#define TRIPLE_TIMESTAMP_NULL ((struct triple_timestamp) {}) + +usec_t now(clockid_t clock); +nsec_t now_nsec(clockid_t clock); + +usec_t map_clock_usec(usec_t from, clockid_t from_clock, clockid_t to_clock); + +dual_timestamp* dual_timestamp_now(dual_timestamp *ts); +dual_timestamp* dual_timestamp_from_realtime(dual_timestamp *ts, usec_t u); +dual_timestamp* dual_timestamp_from_monotonic(dual_timestamp *ts, usec_t u); +dual_timestamp* dual_timestamp_from_boottime(dual_timestamp *ts, usec_t u); + +triple_timestamp* triple_timestamp_now(triple_timestamp *ts); +triple_timestamp* triple_timestamp_from_realtime(triple_timestamp *ts, usec_t u); +triple_timestamp* triple_timestamp_from_boottime(triple_timestamp *ts, usec_t u); + +#define DUAL_TIMESTAMP_HAS_CLOCK(clock) \ + IN_SET(clock, CLOCK_REALTIME, CLOCK_REALTIME_ALARM, CLOCK_MONOTONIC) + +#define TRIPLE_TIMESTAMP_HAS_CLOCK(clock) \ + IN_SET(clock, CLOCK_REALTIME, CLOCK_REALTIME_ALARM, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) + +static inline bool timestamp_is_set(usec_t timestamp) { + return timestamp > 0 && timestamp != USEC_INFINITY; +} + +static inline bool dual_timestamp_is_set(const dual_timestamp *ts) { + return timestamp_is_set(ts->realtime) || + timestamp_is_set(ts->monotonic); +} + +static inline bool triple_timestamp_is_set(const triple_timestamp *ts) { + return timestamp_is_set(ts->realtime) || + timestamp_is_set(ts->monotonic) || + timestamp_is_set(ts->boottime); +} + +usec_t 
triple_timestamp_by_clock(triple_timestamp *ts, clockid_t clock); + +usec_t timespec_load(const struct timespec *ts) _pure_; +nsec_t timespec_load_nsec(const struct timespec *ts) _pure_; +struct timespec* timespec_store(struct timespec *ts, usec_t u); +struct timespec* timespec_store_nsec(struct timespec *ts, nsec_t n); + +#define TIMESPEC_STORE(u) timespec_store(&(struct timespec) {}, (u)) + +usec_t timeval_load(const struct timeval *tv) _pure_; +struct timeval* timeval_store(struct timeval *tv, usec_t u); + +#define TIMEVAL_STORE(u) timeval_store(&(struct timeval) {}, (u)) + +char* format_timestamp_style(char *buf, size_t l, usec_t t, TimestampStyle style) _warn_unused_result_; +char* format_timestamp_relative_full(char *buf, size_t l, usec_t t, clockid_t clock, bool implicit_left) _warn_unused_result_; +char* format_timespan(char *buf, size_t l, usec_t t, usec_t accuracy) _warn_unused_result_; + +_warn_unused_result_ +static inline char* format_timestamp_relative(char *buf, size_t l, usec_t t) { + return format_timestamp_relative_full(buf, l, t, CLOCK_REALTIME, /* implicit_left = */ false); +} +_warn_unused_result_ +static inline char* format_timestamp_relative_monotonic(char *buf, size_t l, usec_t t) { + return format_timestamp_relative_full(buf, l, t, CLOCK_MONOTONIC, /* implicit_left = */ false); +} + +_warn_unused_result_ +static inline char* format_timestamp(char *buf, size_t l, usec_t t) { + return format_timestamp_style(buf, l, t, TIMESTAMP_PRETTY); +} + +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define FORMAT_TIMESTAMP(t) format_timestamp((char[FORMAT_TIMESTAMP_MAX]){}, FORMAT_TIMESTAMP_MAX, t) +#define FORMAT_TIMESTAMP_RELATIVE(t) \ + format_timestamp_relative((char[FORMAT_TIMESTAMP_RELATIVE_MAX]){}, FORMAT_TIMESTAMP_RELATIVE_MAX, t) +#define FORMAT_TIMESTAMP_RELATIVE_MONOTONIC(t) \ + 
format_timestamp_relative_monotonic((char[FORMAT_TIMESTAMP_RELATIVE_MAX]){}, FORMAT_TIMESTAMP_RELATIVE_MAX, t) +#define FORMAT_TIMESPAN(t, accuracy) format_timespan((char[FORMAT_TIMESPAN_MAX]){}, FORMAT_TIMESPAN_MAX, t, accuracy) +#define FORMAT_TIMESTAMP_STYLE(t, style) \ + format_timestamp_style((char[FORMAT_TIMESTAMP_MAX]){}, FORMAT_TIMESTAMP_MAX, t, style) + +int parse_timestamp(const char *t, usec_t *ret); + +int parse_sec(const char *t, usec_t *ret); +int parse_sec_fix_0(const char *t, usec_t *ret); +int parse_sec_def_infinity(const char *t, usec_t *ret); +int parse_time(const char *t, usec_t *ret, usec_t default_unit); +int parse_nsec(const char *t, nsec_t *ret); + +int get_timezones(char ***ret); +int verify_timezone(const char *name, int log_level); +static inline bool timezone_is_valid(const char *name, int log_level) { + return verify_timezone(name, log_level) >= 0; +} + +bool clock_supported(clockid_t clock); + +usec_t usec_shift_clock(usec_t, clockid_t from, clockid_t to); + +int get_timezone(char **ret); + +time_t mktime_or_timegm(struct tm *tm, bool utc); +struct tm *localtime_or_gmtime_r(const time_t *t, struct tm *tm, bool utc); + +uint32_t usec_to_jiffies(usec_t usec); +usec_t jiffies_to_usec(uint32_t jiffies); + +bool in_utc_timezone(void); + +static inline usec_t usec_add(usec_t a, usec_t b) { + /* Adds two time values, and makes sure USEC_INFINITY as input results as USEC_INFINITY in output, + * and doesn't overflow. 
*/ + + if (a > USEC_INFINITY - b) /* overflow check */ + return USEC_INFINITY; + + return a + b; +} + +static inline usec_t usec_sub_unsigned(usec_t timestamp, usec_t delta) { + if (timestamp == USEC_INFINITY) /* Make sure infinity doesn't degrade */ + return USEC_INFINITY; + if (timestamp < delta) + return 0; + + return timestamp - delta; +} + +static inline usec_t usec_sub_signed(usec_t timestamp, int64_t delta) { + if (delta == INT64_MIN) { /* prevent overflow */ + assert_cc(-(INT64_MIN + 1) == INT64_MAX); + assert_cc(USEC_INFINITY > INT64_MAX); + return usec_add(timestamp, (usec_t) INT64_MAX + 1); + } + if (delta < 0) + return usec_add(timestamp, (usec_t) (-delta)); + + return usec_sub_unsigned(timestamp, (usec_t) delta); +} + +static inline int usleep_safe(usec_t usec) { + /* usleep() takes useconds_t that is (typically?) uint32_t. Also, usleep() may only support the + * range [0, 1000000]. See usleep(3). Let's override usleep() with clock_nanosleep(). + * + * ⚠️ Note we are not using plain nanosleep() here, since that operates on CLOCK_REALTIME, not + * CLOCK_MONOTONIC! */ + + // FIXME: use RET_NERRNO() macro here. Currently, this header cannot include errno-util.h. + return clock_nanosleep(CLOCK_MONOTONIC, 0, TIMESPEC_STORE(usec), NULL) < 0 ? -errno : 0; +} + +/* The last second we can format is 31. Dec 9999, 1s before midnight, because otherwise we'd enter 5 digit + * year territory. However, since we want to stay away from this in all timezones we take one day off. */ +#define USEC_TIMESTAMP_FORMATTABLE_MAX_64BIT ((usec_t) 253402214399000000) /* Thu 9999-12-30 23:59:59 UTC */ +/* With a 32-bit time_t we can't go beyond 2038... + * We parse timestamp with RFC-822/ISO 8601 (e.g. +06, or -03:00) as UTC, hence the upper bound must be off + * by USEC_PER_DAY. See parse_timestamp() for more details. 
*/ +#define USEC_TIMESTAMP_FORMATTABLE_MAX_32BIT (((usec_t) INT32_MAX) * USEC_PER_SEC - USEC_PER_DAY) +#if SIZEOF_TIME_T == 8 +# define USEC_TIMESTAMP_FORMATTABLE_MAX USEC_TIMESTAMP_FORMATTABLE_MAX_64BIT +#elif SIZEOF_TIME_T == 4 +# define USEC_TIMESTAMP_FORMATTABLE_MAX USEC_TIMESTAMP_FORMATTABLE_MAX_32BIT +#else +# error "Yuck, time_t is neither 4 nor 8 bytes wide?" +#endif + +int time_change_fd(void); + +const char* timestamp_style_to_string(TimestampStyle t) _const_; +TimestampStyle timestamp_style_from_string(const char *s) _pure_; diff --git a/src/basic/tmpfile-util.c b/src/basic/tmpfile-util.c new file mode 100644 index 0000000..e77ca94 --- /dev/null +++ b/src/basic/tmpfile-util.c @@ -0,0 +1,472 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "macro.h" +#include "memfd-util.h" +#include "missing_fcntl.h" +#include "missing_syscall.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "sync-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" + +static int fopen_temporary_internal(int dir_fd, const char *path, FILE **ret_file) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + fd = openat(dir_fd, path, O_CLOEXEC|O_NOCTTY|O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + return -errno; + + /* This assumes that returned FILE object is short-lived and used within the same single-threaded + * context and never shared externally, hence locking is not necessary. 
*/ + + r = take_fdopen_unlocked(&fd, "w", &f); + if (r < 0) { + (void) unlinkat(dir_fd, path, 0); + return r; + } + + if (ret_file) + *ret_file = TAKE_PTR(f); + + return 0; +} + +int fopen_temporary_at(int dir_fd, const char *path, FILE **ret_file, char **ret_path) { + _cleanup_free_ char *t = NULL; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + r = tempfn_random(path, NULL, &t); + if (r < 0) + return r; + + r = fopen_temporary_internal(dir_fd, t, ret_file); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(t); + + return 0; +} + +int fopen_temporary_child_at(int dir_fd, const char *path, FILE **ret_file, char **ret_path) { + _cleanup_free_ char *t = NULL; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + if (!path) { + r = tmp_dir(&path); + if (r < 0) + return r; + } + + r = tempfn_random_child(path, NULL, &t); + if (r < 0) + return r; + + r = fopen_temporary_internal(dir_fd, t, ret_file); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(t); + + return 0; +} + +/* This is much like mkostemp() but is subject to umask(). */ +int mkostemp_safe(char *pattern) { + assert(pattern); + BLOCK_WITH_UMASK(0077); + return RET_NERRNO(mkostemp(pattern, O_CLOEXEC)); +} + +int fmkostemp_safe(char *pattern, const char *mode, FILE **ret_f) { + _cleanup_close_ int fd = -EBADF; + FILE *f; + + fd = mkostemp_safe(pattern); + if (fd < 0) + return fd; + + f = take_fdopen(&fd, mode); + if (!f) + return -errno; + + *ret_f = f; + return 0; +} + +static int tempfn_build(const char *p, const char *pre, const char *post, bool child, char **ret) { + _cleanup_free_ char *d = NULL, *fn = NULL, *nf = NULL, *result = NULL; + size_t len_pre, len_post, len_add; + int r; + + assert(p); + assert(ret); + + /* + * Turns this: + * /foo/bar/waldo + * + * Into this : + * /foo/bar/waldo/.#
<pre><post> (child == true)
+         *         /foo/bar/.#<pre>waldo<post> (child == false)
+         */
+
+        if (pre && strchr(pre, '/'))
+                return -EINVAL;
+
+        if (post && strchr(post, '/'))
+                return -EINVAL;
+
+        len_pre = strlen_ptr(pre);
+        len_post = strlen_ptr(post);
+        /* NAME_MAX is counted *without* the trailing NUL byte. */
+        if (len_pre > NAME_MAX - STRLEN(".#") ||
+            len_post > NAME_MAX - STRLEN(".#") - len_pre)
+                return -EINVAL;
+
+        len_add = len_pre + len_post + STRLEN(".#");
+
+        if (child) {
+                d = strdup(p);
+                if (!d)
+                        return -ENOMEM;
+        } else {
+                r = path_extract_directory(p, &d);
+                if (r < 0 && r != -EDESTADDRREQ) /* EDESTADDRREQ → No directory specified, just a filename */
+                        return r;
+
+                r = path_extract_filename(p, &fn);
+                if (r < 0)
+                        return r;
+
+                if (strlen(fn) > NAME_MAX - len_add)
+                        /* We cannot simply prepend and append strings to the filename. Let's truncate the filename. */
+                        fn[NAME_MAX - len_add] = '\0';
+        }
+
+        nf = strjoin(".#", strempty(pre), strempty(fn), strempty(post));
+        if (!nf)
+                return -ENOMEM;
+
+        if (d) {
+                if (!path_extend(&d, nf))
+                        return -ENOMEM;
+
+                result = path_simplify(TAKE_PTR(d));
+        } else
+                result = TAKE_PTR(nf);
+
+        if (!path_is_valid(result)) /* New path is not valid? (Maybe because too long?) Refuse. */
+                return -EINVAL;
+
+        *ret = TAKE_PTR(result);
+        return 0;
+}
+
+/* Derives the name of a hidden sibling of "p": the file name gets ".#" (plus "extra", if given) put in
+ * front of it and six literal 'X' characters appended, i.e. the trailing placeholder that mkostemp(3)
+ * templates require:
+ *
+ *         /foo/bar/waldo  →  /foo/bar/.#waldoXXXXXX
+ *
+ * The newly allocated string is stored in *ret. The return value of tempfn_build() is passed through
+ * unchanged (0 on success, negative errno-style code on failure). */
+int tempfn_xxxxxx(const char *p, const char *extra, char **ret) {
+        static const char placeholder[] = "XXXXXX";
+        int r;
+
+        r = tempfn_build(p, extra, placeholder, /* child = */ false, ret);
+        return r;
+}
+
+/* Derives the name of a hidden, randomly named sibling of "p". The file name gets ".#" (plus "extra",
+ * if given) put in front of it and 16 zero-padded lowercase hex digits from random_u64() appended:
+ *
+ *         /foo/bar/waldo  →  /foo/bar/.#waldobaa2a261115984a9
+ *
+ * Returns 0 and stores a newly allocated string in *ret on success, -ENOMEM if the suffix cannot be
+ * formatted, or the error reported by tempfn_build(). */
+int tempfn_random(const char *p, const char *extra, char **ret) {
+        _cleanup_free_ char *suffix = NULL;
+        uint64_t token;
+        int n;
+
+        assert(p);
+        assert(ret);
+
+        token = random_u64();
+
+        n = asprintf(&suffix, "%016" PRIx64, token);
+        if (n < 0)
+                return -ENOMEM;
+
+        return tempfn_build(p, extra, suffix, /* child = */ false, ret);
+}
+
+/* Derives the name of a hidden, randomly named entry located *inside* directory "p":
+ *
+ *         /foo/bar/waldo  →  /foo/bar/waldo/.#3c2b6219aa75d7d0
+ *
+ * If "p" is NULL, the directory is obtained from tmp_dir() instead. "extra", if given, is placed between
+ * ".#" and the 16 hex digits taken from random_u64().
+ *
+ * Returns 0 and stores a newly allocated string in *ret on success, the error of tmp_dir() or
+ * tempfn_build() on failure, or -ENOMEM if the random suffix cannot be formatted. */
+int tempfn_random_child(const char *p, const char *extra, char **ret) {
+        _cleanup_free_ char *suffix = NULL;
+        const char *dir = p;
+        int r;
+
+        assert(ret);
+
+        if (!dir) {
+                /* No directory given by the caller, let tmp_dir() pick one. */
+                r = tmp_dir(&dir);
+                if (r < 0)
+                        return r;
+        }
+
+        r = asprintf(&suffix, "%016" PRIx64, random_u64());
+        if (r < 0)
+                return -ENOMEM;
+
+        return tempfn_build(dir, extra, suffix, /* child = */ true, ret);
+}
+
+/* Returns an fd to a temporary file in "directory" that has no name in the file system and cannot be
+ * linked into it anymore. If "directory" is NULL, the directory is obtained from tmp_dir(); an empty
+ * string is refused with -EINVAL.
+ *
+ * "flags" are passed to open() for the O_TMPFILE attempt only; the mkostemp_safe() fallback does not
+ * take them. Returns the fd on success, a negative errno-style value on failure. */
+int open_tmpfile_unlinkable(const char *directory, int flags) {
+        char *pattern;
+        int fd;
+
+        if (directory) {
+                if (isempty(directory))
+                        return -EINVAL;
+        } else {
+                int r;
+
+                r = tmp_dir(&directory);
+                if (r < 0)
+                        return r;
+        }
+
+        /* First choice: O_TMPFILE, where supported. The file never gets a name in the first place. */
+        fd = open(directory, flags|O_TMPFILE|O_EXCL, S_IRUSR|S_IWUSR);
+        if (fd >= 0)
+                return fd;
+
+        /* Whatever the reason for the failure: create a file with an unguessable name and remove the
+         * name again right away. */
+        pattern = strjoina(directory, "/systemd-tmp-XXXXXX");
+
+        fd = mkostemp_safe(pattern);
+        if (fd >= 0)
+                (void) unlink(pattern);
+
+        return fd;
+}
+
+/* Creates a temporary file that is supposed to end up at "target" (interpreted relative to "dir_fd")
+ * once it has been written in full; link_tmpfile() is the counterpart that moves it into place.
+ *
+ * O_TMPFILE is tried first, via open_parent_at(). If that works the file has no name yet and *ret_path
+ * is set to NULL. Otherwise a randomly named file next to "target" is created and its path is returned
+ * in *ret_path (to be freed by the caller).
+ *
+ * "flags" must not contain O_EXCL, since that has a special meaning in combination with O_TMPFILE.
+ * Returns the fd on success, a negative errno-style value on failure. */
+int open_tmpfile_linkable_at(int dir_fd, const char *target, int flags, char **ret_path) {
+        _cleanup_free_ char *fallback_path = NULL;
+        int fd;
+
+        assert(target);
+        assert(ret_path);
+        assert((flags & O_EXCL) == 0);
+
+        fd = open_parent_at(dir_fd, target, O_TMPFILE|flags, 0640);
+        if (fd < 0) {
+                int r;
+
+                log_debug_errno(fd, "Failed to use O_TMPFILE for %s: %m", target);
+
+                /* No O_TMPFILE for us, use a regular file with a random name instead. */
+                r = tempfn_random(target, NULL, &fallback_path);
+                if (r < 0)
+                        return r;
+
+                fd = openat(dir_fd, fallback_path, O_CREAT|O_EXCL|O_NOFOLLOW|O_NOCTTY|flags, 0640);
+                if (fd < 0)
+                        return -errno;
+        }
+
+        /* Still NULL if O_TMPFILE worked, which is how the caller can tell the two cases apart. */
+        *ret_path = TAKE_PTR(fallback_path);
+        return fd;
+}
+
+int fopen_tmpfile_linkable(const char *target, int flags, char **ret_path, FILE **ret_file) {
+        _cleanup_free_ char *tmp_path = NULL;
+        _cleanup_fclose_ FILE *stream = NULL;
+        _cleanup_close_ int tmp_fd = -EBADF;
+
+        /* FILE* flavour of open_tmpfile_linkable(): hands back a stream opened in "w" mode. */
+        assert(target);
+        assert(ret_file);
+        assert(ret_path);
+        tmp_fd = open_tmpfile_linkable(target, flags, &tmp_path);
+        if (tmp_fd < 0)
+                return tmp_fd;
+        /* take_fdopen() is handed a pointer to the fd; if it fails, -ENOMEM is reported */
+        stream = take_fdopen(&tmp_fd, "w");
+        if (stream == NULL)
+                return -ENOMEM;
+
+        *ret_path = TAKE_PTR(tmp_path);
+        *ret_file = TAKE_PTR(stream);
+        return 0;
+}
+
+static int link_fd(int fd, int newdirfd, const char *newpath) {
+        int r;
+
+        assert(fd >= 0);
+        assert(newdirfd >= 0 || newdirfd == AT_FDCWD);
+        assert(newpath);
+
+        /* Links the inode behind "fd" to newdirfd/newpath. First attempt: go via /proc/fd/. */
+        r = RET_NERRNO(linkat(AT_FDCWD, FORMAT_PROC_FD_PATH(fd), newdirfd, newpath, AT_SYMLINK_FOLLOW));
+
+        /* Only if that failed with ENOENT and /proc/ turns out not to be mounted, retry via
+         * AT_EMPTY_PATH (this requires CAP_DAC_READ_SEARCH and a more recent kernel, but does not
+         * require /proc/ mounted) */
+        if (r == -ENOENT && proc_mounted() == 0)
+                return RET_NERRNO(linkat(fd, "", newdirfd, newpath, AT_EMPTY_PATH));
+
+        /* Everything else, success included, is passed on as is */
+        return r;
+}
+
+int link_tmpfile_at(int fd, int dir_fd, const char *path, const char *target, LinkTmpfileFlags flags) {
+        _cleanup_free_ char *tmp = NULL;
+        int r;
+
+        assert(fd >= 0);
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(target);
+
+        /* Moves a temporary file created with open_tmpfile_linkable_at() above into its final place. If
+         * "path" is NULL an fd created with O_TMPFILE is assumed, and linkat() is used. Otherwise it is
+         * assumed O_TMPFILE is not supported on the directory, and the file is renamed instead. Returns 0
+         * on success, negative errno on failure. LINK_TMPFILE_SYNC syncs before and after the move. */
+        if (FLAGS_SET(flags, LINK_TMPFILE_SYNC) && fsync(fd) < 0)
+                return -errno;
+
+        if (path) {
+                if (FLAGS_SET(flags, LINK_TMPFILE_REPLACE))
+                        r = RET_NERRNO(renameat(dir_fd, path, dir_fd, target));
+                else
+                        r = rename_noreplace(dir_fd, path, dir_fd, target);
+                if (r < 0)
+                        return r;
+        } else {
+                r = link_fd(fd, dir_fd, target);
+                if (r == -EEXIST && FLAGS_SET(flags, LINK_TMPFILE_REPLACE)) {
+                        /* So the target already exists and we were asked to replace it. That sucks a bit,
+                         * since the kernel's linkat() logic does not allow that. We work-around this by
+                         * linking the file to a random name first, and then renaming that to the final
+                         * name. This reintroduces the race O_TMPFILE kinda is trying to fix, but at least
+                         * the vulnerability window (i.e. where the file is linked into the file system
+                         * under a temporary name) is very short. */
+
+                        r = tempfn_random(target, NULL, &tmp);
+                        if (r < 0)
+                                return r;
+
+                        if (link_fd(fd, dir_fd, tmp) < 0)
+                                return -EEXIST; /* propagate original error */
+
+                        r = RET_NERRNO(renameat(dir_fd, tmp, dir_fd, target));
+                        if (r < 0)
+                                (void) unlinkat(dir_fd, tmp, 0);
+                }
+                /* On success fall through, so that LINK_TMPFILE_SYNC below applies to this branch too */
+                if (r < 0)
+                        return r;
+        }
+
+        if (FLAGS_SET(flags, LINK_TMPFILE_SYNC)) {
+                r = fsync_full(fd);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int flink_tmpfile(FILE *f, const char *path, const char *target, LinkTmpfileFlags flags) {
+        int stream_fd, r;
+
+        /* FILE* flavour of link_tmpfile(): flushes the stream, then moves its fd into place. */
+        assert(f);
+        assert(target);
+
+        stream_fd = fileno(f);
+        if (stream_fd < 0) /* Not all FILE* objects encapsulate fds */
+                return -EBADF;
+
+        r = fflush_and_check(f);
+        if (r >= 0)
+                r = link_tmpfile(stream_fd, path, target, flags);
+        return r;
+}
+
+int mkdtemp_malloc(const char *template, char **ret) {
+        _cleanup_free_ char *pattern = NULL;
+        const char *base;
+        int r;
+
+        assert(ret);
+
+        /* Creates a directory with mkdtemp() and returns its path as a newly allocated string. If no
+         * template is given, "XXXXXX" below the directory reported by tmp_dir() is used. */
+        if (!template) {
+                r = tmp_dir(&base);
+                if (r < 0)
+                        return r;
+
+                pattern = path_join(base, "XXXXXX");
+        } else
+                pattern = strdup(template);
+        if (!pattern)
+                return -ENOMEM;
+
+        if (mkdtemp(pattern) == NULL)
+                return -errno;
+
+        *ret = TAKE_PTR(pattern);
+        return 0;
+}
+
+int mkdtemp_open(const char *template, int flags, char **ret) {
+        _cleanup_free_ char *dir_path = NULL;
+        int dir_fd, r;
+
+        /* Like mkdtemp_malloc(), but also opens the new directory; the path output is optional. */
+        r = mkdtemp_malloc(template, &dir_path);
+        if (r < 0)
+                return r;
+
+        dir_fd = RET_NERRNO(open(dir_path, O_DIRECTORY|O_CLOEXEC|flags));
+        if (dir_fd >= 0) {
+                if (ret)
+                        *ret = TAKE_PTR(dir_path);
+                return dir_fd;
+        }
+
+        (void) rmdir(dir_path); /* opening failed, do not leave the directory behind */
+        return dir_fd;
+}
diff --git a/src/basic/tmpfile-util.h b/src/basic/tmpfile-util.h
new file mode 100644
index 0000000..8c917c0
--- /dev/null
+++ b/src/basic/tmpfile-util.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+/* fopen_temporary() is fopen_temporary_at() with AT_FDCWD passed as directory fd. */
+int fopen_temporary_at(int dir_fd, const char *path, FILE **ret_file, char **ret_path);
+static inline int fopen_temporary(const char *path, FILE **ret_file, char **ret_path) {
+        return fopen_temporary_at(AT_FDCWD, path, ret_file, ret_path);
+}
+/* fopen_temporary_child() is fopen_temporary_child_at() with AT_FDCWD passed as directory fd. */
+int fopen_temporary_child_at(int dir_fd, const char *path, FILE **ret_file, char **ret_path);
+static inline int fopen_temporary_child(const char *path, FILE **ret_file, char **ret_path) {
+        return fopen_temporary_child_at(AT_FDCWD, path, ret_file, ret_path);
+}
+/* mkostemp_safe() is what open_tmpfile_unlinkable() uses to create its fallback file from an XXXXXX pattern. */
+int mkostemp_safe(char *pattern);
+int fmkostemp_safe(char *pattern, const char *mode, FILE**_f);
+/* Temporary name generators based on "p"; tempfn_random() serves the fallback paths in tmpfile-util.c. */
+int tempfn_xxxxxx(const char *p, const char *extra, char **ret);
+int tempfn_random(const char *p, const char *extra, char **ret);
+int tempfn_random_child(const char *p, const char *extra, char **ret);
+/* open_tmpfile_linkable() is open_tmpfile_linkable_at() with AT_FDCWD passed as directory fd. */
+int open_tmpfile_unlinkable(const char *directory, int flags);
+int open_tmpfile_linkable_at(int dir_fd, const char *target, int flags, char **ret_path);
+static inline int open_tmpfile_linkable(const char *target, int flags, char **ret_path) {
+        return open_tmpfile_linkable_at(AT_FDCWD, target, flags, ret_path);
+}
+int fopen_tmpfile_linkable(const char *target, int flags, char **ret_path, FILE **ret_file);
+/* Flags taken by link_tmpfile_at(), link_tmpfile() and flink_tmpfile(). */
+typedef enum LinkTmpfileFlags {
+        LINK_TMPFILE_REPLACE = 1 << 0, /* an already existing target shall be replaced */
+        LINK_TMPFILE_SYNC    = 1 << 1, /* fsync() before the move, fsync_full() after it */
+} LinkTmpfileFlags;
+/* link_tmpfile() is link_tmpfile_at() with AT_FDCWD passed as directory fd. */
+int link_tmpfile_at(int fd, int dir_fd, const char *path, const char *target, LinkTmpfileFlags flags);
+static inline int link_tmpfile(int fd, const char *path, const char *target, LinkTmpfileFlags flags) {
+        return link_tmpfile_at(fd, AT_FDCWD, path, target, flags);
+}
+int flink_tmpfile(FILE *f, const char *path, const char *target, LinkTmpfileFlags flags);
+/* mkdtemp_malloc() returns the new directory's path in *ret; mkdtemp_open() also returns an fd to it. */
+int mkdtemp_malloc(const char *template, char **ret);
+int mkdtemp_open(const char *template, int flags, char **ret);
diff --git a/src/basic/uid-alloc-range.c b/src/basic/uid-alloc-range.c
new file mode 100644
index 0000000..669cb6d
--- /dev/null
+++ b/src/basic/uid-alloc-range.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "chase.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "missing_threads.h"
+#include "string-util.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+/* Compiled-in bounds: read_login_defs() starts out from these and returns them if there is no file. */
+static const UGIDAllocationRange default_ugid_allocation_range = {
+        .system_alloc_uid_min = SYSTEM_ALLOC_UID_MIN,
+        .system_uid_max = SYSTEM_UID_MAX,
+        .system_alloc_gid_min = SYSTEM_ALLOC_GID_MIN,
+        .system_gid_max = SYSTEM_GID_MAX,
+};
+
+#if ENABLE_COMPAT_MUTABLE_UID_BOUNDARIES
+static int parse_alloc_uid(const char *path, const char *name, const char *t, uid_t *ret_uid) {
+        uid_t parsed;
+        int r;
+
+        /* Parses "t" as the UID value of setting "name" from file "path"; 0 is bumped to 1. */
+        r = parse_uid(t, &parsed);
+        if (r < 0)
+                return log_debug_errno(r, "%s: failed to parse %s %s, ignoring: %m", path, name, t);
+
+        *ret_uid = parsed == 0 ? 1 : parsed;
+
+        return 0;
+}
+#endif
+
+int read_login_defs(UGIDAllocationRange *ret_defs, const char *path, const char *root) {
+#if ENABLE_COMPAT_MUTABLE_UID_BOUNDARIES
+        _cleanup_fclose_ FILE *f = NULL;
+        UGIDAllocationRange defs;
+        int r;
+        /* Loads the SYS_UID_MIN/MAX and SYS_GID_MIN/MAX settings from login.defs into *ret_defs. */
+        if (!path)
+                path = "/etc/login.defs";
+        /* Returns 1 if the file was read, 0 if the defaults were returned, < 0 on failure. */
+        r = chase_and_fopen_unlocked(path, root, CHASE_PREFIX_ROOT, "re", NULL, &f);
+        if (r == -ENOENT)
+                goto defaults;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to open %s: %m", path);
+
+        defs = default_ugid_allocation_range;
+        /* Settings that do not show up in the file keep the default assigned above. */
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                char *t;
+
+                r = read_line(f, LINE_MAX, &line);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to read %s: %m", path);
+                if (r == 0)
+                        break;
+                /* A value that fails to parse is logged by parse_alloc_uid() and leaves the field untouched. */
+                if ((t = first_word(line, "SYS_UID_MIN")))
+                        (void) parse_alloc_uid(path, "SYS_UID_MIN", t, &defs.system_alloc_uid_min);
+                else if ((t = first_word(line, "SYS_UID_MAX")))
+                        (void) parse_alloc_uid(path, "SYS_UID_MAX", t, &defs.system_uid_max);
+                else if ((t = first_word(line, "SYS_GID_MIN")))
+                        (void) parse_alloc_uid(path, "SYS_GID_MIN", t, &defs.system_alloc_gid_min);
+                else if ((t = first_word(line, "SYS_GID_MAX")))
+                        (void) parse_alloc_uid(path, "SYS_GID_MAX", t, &defs.system_gid_max);
+        }
+
+        if (defs.system_alloc_uid_min > defs.system_uid_max) {
+                log_debug("%s: SYS_UID_MIN > SYS_UID_MAX, resetting.", path);
+                defs.system_alloc_uid_min = MIN(defs.system_uid_max - 1, (uid_t) SYSTEM_ALLOC_UID_MIN);
+                /* Look at sys_uid_max to make sure sys_uid_min..sys_uid_max remains a valid range. */
+        }
+        if (defs.system_alloc_gid_min > defs.system_gid_max) {
+                log_debug("%s: SYS_GID_MIN > SYS_GID_MAX, resetting.", path);
+                defs.system_alloc_gid_min = MIN(defs.system_gid_max - 1, (gid_t) SYSTEM_ALLOC_GID_MIN);
+                /* Look at sys_gid_max to make sure sys_gid_min..sys_gid_max remains a valid range. */
+        }
+
+        *ret_defs = defs;
+        return 1;
+defaults:
+#endif
+        *ret_defs = default_ugid_allocation_range;
+        return 0;
+}
+
+const UGIDAllocationRange *acquire_ugid_allocation_range(void) {
+#if ENABLE_COMPAT_MUTABLE_UID_BOUNDARIES
+        static thread_local UGIDAllocationRange defs;
+        static thread_local int initialized = 0; /* == 0 → not initialized yet
+                                                  * < 0 → failure
+                                                  * > 0 → success */
+
+        /* This function will ignore failure to read the file, so it should only be called from places where
+         * we don't crucially depend on the answer. In other words, it's appropriate for journald, but
+         * probably not for sysusers. */
+        /* The outcome is remembered in thread_local state: read_login_defs() runs once per thread. */
+        if (initialized == 0)
+                initialized = read_login_defs(&defs, NULL, NULL) < 0 ? -1 : 1;
+        if (initialized < 0)
+                return &default_ugid_allocation_range;
+
+        return &defs;
+        /* The return after the #endif is only reached if the #if branch above is compiled out. */
+#endif
+        return &default_ugid_allocation_range;
+}
+
+bool uid_is_system(uid_t uid) {
+        const UGIDAllocationRange *defs;
+        /* Every UID up to and including the system UID maximum in effect counts as system UID. */
+        assert_se(defs = acquire_ugid_allocation_range());
+        return !(uid > defs->system_uid_max);
+}
+
+bool gid_is_system(gid_t gid) {
+        const UGIDAllocationRange *defs;
+        /* Every GID up to and including the system GID maximum in effect counts as system GID. */
+        assert_se(defs = acquire_ugid_allocation_range());
+        return !(gid > defs->system_gid_max);
+}
+
+bool uid_for_system_journal(uid_t uid) {
+        /* Returns true if the specified UID shall get its data stored in the system journal. */
+        if (uid_is_system(uid) || uid_is_dynamic(uid))
+                return true;
+        return uid == UID_NOBODY || uid_is_container(uid);
+}
diff --git a/src/basic/uid-alloc-range.h b/src/basic/uid-alloc-range.h
new file mode 100644
index 0000000..5badde1
--- /dev/null
+++ b/src/basic/uid-alloc-range.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+/* Both compare against the maxima of the range returned by acquire_ugid_allocation_range(). */
+bool uid_is_system(uid_t uid);
+bool gid_is_system(gid_t gid);
+/* True for UIDs from DYNAMIC_UID_MIN up to DYNAMIC_UID_MAX, both bounds included. */
+static inline bool uid_is_dynamic(uid_t uid) {
+        return uid >= DYNAMIC_UID_MIN && uid <= DYNAMIC_UID_MAX;
+}
+/* GIDs are checked against the very same bounds as UIDs. */
+static inline bool gid_is_dynamic(gid_t gid) {
+        return uid_is_dynamic((uid_t) gid);
+}
+/* True for UIDs from CONTAINER_UID_BASE_MIN up to CONTAINER_UID_BASE_MAX, both bounds included. */
+static inline bool uid_is_container(uid_t uid) {
+        return uid >= CONTAINER_UID_BASE_MIN && uid <= CONTAINER_UID_BASE_MAX;
+}
+/* GIDs are checked against the very same bounds as UIDs. */
+static inline bool gid_is_container(gid_t gid) {
+        return uid_is_container((uid_t) gid);
+}
+/* System UID/GID bounds; read_login_defs() fills the fields from the login.defs keys named below. */
+typedef struct UGIDAllocationRange {
+        uid_t system_alloc_uid_min; /* SYS_UID_MIN */
+        uid_t system_uid_max;       /* SYS_UID_MAX */
+        gid_t system_alloc_gid_min; /* SYS_GID_MIN */
+        gid_t system_gid_max;       /* SYS_GID_MAX */
+} UGIDAllocationRange;
+/* read_login_defs() reports errors; acquire_ugid_allocation_range() falls back to the defaults instead. */
+int read_login_defs(UGIDAllocationRange *ret_defs, const char *path, const char *root);
+const UGIDAllocationRange *acquire_ugid_allocation_range(void);
+/* True for system, dynamic and container UIDs, and for UID_NOBODY. */
+bool uid_for_system_journal(uid_t uid);
diff --git a/src/basic/uid-range.c b/src/basic/uid-range.c
new file mode 100644
index 0000000..8463599
--- /dev/null
+++ b/src/basic/uid-range.c
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "macro.h"
+#include "path-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "uid-range.h"
+#include "user-util.h"
+
+UidRange *uid_range_free(UidRange *range) {
+        if (range) { /* frees the entries array, then the object itself via mfree() */
+                free(range->entries);
+                return mfree(range);
+        }
+        return NULL;
+}
+
+static bool uid_range_entry_intersect(const UidRangeEntry *a, const UidRangeEntry *b) {
+        assert(a);
+        assert(b);
+        /* Inclusive on both sides: entries that merely touch end-to-start count as intersecting too. */
+        return !(a->start > b->start + b->nr || b->start > a->start + a->nr);
+}
+
+static int uid_range_entry_compare(const UidRangeEntry *a, const UidRangeEntry *b) {
+        int by_start;
+
+        assert(a);
+        assert(b);
+
+        /* Order by start; entries with the same start are ordered by their number of UIDs. */
+        by_start = CMP(a->start, b->start);
+        if (by_start == 0)
+                return CMP(a->nr, b->nr);
+        return by_start;
+}
+
+static void uid_range_coalesce(UidRange *range) {
+        assert(range);
+        /* Sorts the entries, then merges each entry with the ones following it while they intersect. */
+        if (range->n_entries <= 0)
+                return;
+
+        typesafe_qsort(range->entries, range->n_entries, uid_range_entry_compare);
+
+        for (size_t i = 0; i < range->n_entries; i++) {
+                UidRangeEntry *x = range->entries + i;
+
+                for (size_t j = i + 1; j < range->n_entries; j++) {
+                        UidRangeEntry *y = range->entries + j;
+                        uid_t begin, end;
+
+                        if (!uid_range_entry_intersect(x, y))
+                                break;
+                        /* x and y intersect: widen x so that it spans both of them */
+                        begin = MIN(x->start, y->start);
+                        end = MAX(x->start + x->nr, y->start + y->nr);
+
+                        x->start = begin;
+                        x->nr = end - begin;
+                        /* y is merged now; move the entries following it one slot down */
+                        if (range->n_entries > j + 1)
+                                memmove(y, y + 1, sizeof(UidRangeEntry) * (range->n_entries - j - 1));
+
+                        range->n_entries--;
+                        j--; /* compensates the loop's j++, so that the same index is examined again */
+                }
+        }
+}
+
+int uid_range_add_internal(UidRange **range, uid_t start, uid_t nr, bool coalesce) {
+        _cleanup_(uid_range_freep) UidRange *fresh = NULL;
+        UidRange *dst;
+
+        assert(range);
+
+        /* Appends "nr" UIDs beginning at "start" to *range, allocating *range first if it is NULL. */
+        if (nr <= 0) /* nothing to add */
+                return 0;
+
+        if (start > UINT32_MAX - nr) /* overflow check */
+                return -ERANGE;
+
+        dst = *range;
+        if (!dst) {
+                fresh = new0(UidRange, 1);
+                if (!fresh)
+                        return -ENOMEM;
+
+                dst = fresh;
+        }
+
+        if (!GREEDY_REALLOC(dst->entries, dst->n_entries + 1))
+                return -ENOMEM;
+
+        dst->entries[dst->n_entries].start = start;
+        dst->entries[dst->n_entries].nr = nr;
+        dst->n_entries++;
+
+        if (coalesce)
+                uid_range_coalesce(dst);
+
+        /* From here on the caller's pointer owns the object, hence disarm the cleanup handler */
+        TAKE_PTR(fresh);
+        *range = dst;
+
+        return 0;
+}
+
+int uid_range_add_str(UidRange **range, const char *s) {
+        uid_t first, last;
+        int r;
+
+        assert(range);
+        assert(s);
+
+        /* Both ends of the parsed range are counted, hence the number of UIDs is last - first + 1 */
+        r = parse_uid_range(s, &first, &last);
+        if (r >= 0)
+                r = uid_range_add_internal(range, first, last - first + 1, /* coalesce = */ true);
+        return r;
+}
+
+int uid_range_next_lower(const UidRange *range, uid_t *uid) {
+        uid_t best = UID_INVALID, wanted;
+
+        assert(range);
+        assert(uid);
+
+        /* Steps *uid down by one if the range contains that UID. Otherwise *uid is set to the end of
+         * the last entry that ends below it. Returns 1 then, and -EBUSY if no lower UID is found. */
+        if (*uid == 0)
+                return -EBUSY;
+
+        wanted = *uid - 1;
+
+        for (size_t i = 0; i < range->n_entries; i++) {
+                const UidRangeEntry *e = range->entries + i;
+                uid_t first = e->start, last = e->start + e->nr - 1;
+
+                if (first <= wanted && wanted <= last) {
+                        *uid = wanted;
+                        return 1;
+                }
+
+                if (last < wanted)
+                        best = last;
+        }
+
+        if (best == UID_INVALID)
+                return -EBUSY;
+
+        *uid = best;
+        return 1;
+}
+
+bool uid_range_covers(const UidRange *range, uid_t start, uid_t nr) {
+        /* Tells whether the "nr" UIDs beginning at "start" all lie within one single entry of "range". */
+        if (nr == 0) /* empty range? always covered... */
+                return true;
+
+        if (!range || start > UINT32_MAX - nr) /* nothing to match against, or the range overflows */
+                return false;
+
+        for (size_t i = 0; i < range->n_entries; i++) {
+                const UidRangeEntry *e = range->entries + i;
+
+                if (e->start <= start && start + nr <= e->start + e->nr)
+                        return true;
+        }
+
+        return false;
+}
+
+int uid_range_load_userns(UidRange **ret, const char *path) {
+        _cleanup_(uid_range_freep) UidRange *range = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        /* If 'path' is NULL loads the UID range of the userns namespace we run. Otherwise load the data from
+         * the specified file (which can be either uid_map or gid_map, in case caller needs to deal with GID
+         * maps).
+         *
+         * The result is collected in a local object first; *ret is only written on success. */
+
+        assert(ret);
+
+        if (!path)
+                path = "/proc/self/uid_map";
+
+        f = fopen(path, "re");
+        if (!f) {
+                r = -errno;
+                /* A missing file below /proc/: -EOPNOTSUPP if proc_mounted() is positive, else -ENOSYS. */
+                if (r == -ENOENT && path_startswith(path, "/proc/"))
+                        return proc_mounted() > 0 ? -EOPNOTSUPP : -ENOSYS;
+
+                return r;
+        }
+
+        range = new0(UidRange, 1);
+        if (!range)
+                return -ENOMEM;
+
+        for (;;) {
+                uid_t uid_base, uid_shift, uid_range;
+                int k;
+                /* Each line carries three numbers; uid_shift is parsed but not used any further. */
+                errno = 0;
+                k = fscanf(f, UID_FMT " " UID_FMT " " UID_FMT "\n", &uid_base, &uid_shift, &uid_range);
+                if (k == EOF) {
+                        if (ferror(f))
+                                return errno_or_else(EIO);
+
+                        break;
+                }
+                if (k != 3)
+                        return -EBADMSG;
+                /* Coalescing is deferred to a single pass after the loop. */
+                r = uid_range_add_internal(&range, uid_base, uid_range, /* coalesce = */ false);
+                if (r < 0)
+                        return r;
+        }
+
+        uid_range_coalesce(range);
+
+        *ret = TAKE_PTR(range);
+        return 0;
+}
diff --git a/src/basic/uid-range.h b/src/basic/uid-range.h
new file mode 100644
index 0000000..461a511
--- /dev/null
+++ b/src/basic/uid-range.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+
+#include "macro.h"
+/* One contiguous run of UIDs: "nr" IDs beginning at "start". */
+typedef struct UidRangeEntry {
+        uid_t start, nr;
+} UidRangeEntry;
+/* A set of UID runs, kept as an array of n_entries UidRangeEntry items. */
+typedef struct UidRange {
+        UidRangeEntry *entries;
+        size_t n_entries;
+} UidRange;
+/* uid_range_freep, as used with _cleanup_() in uid-range.c, presumably comes from the macro below — confirm in macro.h. */
+UidRange *uid_range_free(UidRange *range);
+DEFINE_TRIVIAL_CLEANUP_FUNC(UidRange*, uid_range_free);
+/* uid_range_add() is uid_range_add_internal() with coalescing enabled. */
+int uid_range_add_internal(UidRange **range, uid_t start, uid_t nr, bool coalesce);
+static inline int uid_range_add(UidRange **range, uid_t start, uid_t nr) {
+        return uid_range_add_internal(range, start, nr, true);
+}
+int uid_range_add_str(UidRange **range, const char *s);
+/* Updates *uid in place; returns 1 if a lower UID was found, -EBUSY otherwise. */
+int uid_range_next_lower(const UidRange *range, uid_t *uid);
+/* uid_range_contains() is uid_range_covers() for a run of exactly one UID. */
+bool uid_range_covers(const UidRange *range, uid_t start, uid_t nr);
+static inline bool uid_range_contains(const UidRange *range, uid_t uid) {
+        return uid_range_covers(range, uid, 1);
+}
+/* A NULL path selects /proc/self/uid_map. */
+int uid_range_load_userns(UidRange **ret, const char *path);
diff --git a/src/basic/umask-util.h b/src/basic/umask-util.h
new file mode 100644
index 0000000..00417fa
--- /dev/null
+++ b/src/basic/umask-util.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include "macro.h"
+/* Cleanup helper: passes the mode_t that "u" points to on to umask(). */
+static inline void umaskp(mode_t *u) {
+        umask(*u);
+}
+
+#define _cleanup_umask_ _cleanup_(umaskp)
+
+/* We make use of the fact here that the umask() syscall uses only the lower 9 bits of mode_t, although
+ * mode_t has space for the file type in the bits further up. We simply OR in the file type mask S_IFMT to
+ * distinguish the first and the second iteration of the WITH_UMASK() loop, so that we can run the first one,
+ * and exit on the second. */
+
+assert_cc((S_IFMT & 0777) == 0);
+/* WITH_UMASK(mask) is followed by a statement/block; _saved_umask_ holds what umask(mask) returned. */
+#define WITH_UMASK(mask)                                            \
+        for (_cleanup_umask_ mode_t _saved_umask_ = umask(mask) | S_IFMT; \
+             FLAGS_SET(_saved_umask_, S_IFMT);                          \
+             _saved_umask_ &= 0777)
+/* Declares _saved_umask_ in the current scope instead of opening a loop; the expansion ends in ';'. */
+#define BLOCK_WITH_UMASK(mask) \
+        _unused_ _cleanup_umask_ mode_t _saved_umask_ = umask(mask);
diff --git a/src/basic/unaligned.h b/src/basic/unaligned.h
new file mode 100644
index 0000000..04580cf
--- /dev/null
+++ b/src/basic/unaligned.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+
+#include "unaligned-fundamental.h"
+
+/* BE */
+
+static inline uint16_t unaligned_read_be16(const void *_u) {
+        typedef struct __attribute__((__packed__, __may_alias__)) { uint16_t x; } raw16;
+        /* load through the packed type, then convert from big endian to host byte order */
+        return be16toh(((const raw16 *) _u)->x);
+}
+
+static inline uint32_t unaligned_read_be32(const void *_u) {
+        typedef struct __attribute__((__packed__, __may_alias__)) { uint32_t x; } raw32;
+        /* load through the packed type, then convert from big endian to host byte order */
+        return be32toh(((const raw32 *) _u)->x);
+}
+
+static inline uint64_t unaligned_read_be64(const void *_u) {
+        typedef struct __attribute__((__packed__, __may_alias__)) { uint64_t x; } raw64;
+        /* load through the packed type, then convert from big endian to host byte order */
+        return be64toh(((const raw64 *) _u)->x);
+}
+
+static inline void unaligned_write_be16(void *_u, uint16_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint16_t x; } *u = _u;
+        /* Stores "a" in big-endian byte order at _u, which may be unaligned (host-to-BE conversion). */
+        u->x = htobe16(a);
+}
+
+static inline void unaligned_write_be32(void *_u, uint32_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint32_t x; } *u = _u;
+        /* Stores "a" in big-endian byte order at _u, which may be unaligned (host-to-BE conversion). */
+        u->x = htobe32(a);
+}
+
+static inline void unaligned_write_be64(void *_u, uint64_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint64_t x; } *u = _u;
+        /* Stores "a" in big-endian byte order at _u, which may be unaligned (host-to-BE conversion). */
+        u->x = htobe64(a);
+}
+
+/* LE */
+
+static inline uint16_t unaligned_read_le16(const void *_u) {
+        typedef struct __attribute__((__packed__, __may_alias__)) { uint16_t x; } raw16;
+        /* load through the packed type, then convert from little endian to host byte order */
+        return le16toh(((const raw16 *) _u)->x);
+}
+
+static inline uint32_t unaligned_read_le32(const void *_u) {
+        typedef struct __attribute__((__packed__, __may_alias__)) { uint32_t x; } raw32;
+        /* load through the packed type, then convert from little endian to host byte order */
+        return le32toh(((const raw32 *) _u)->x);
+}
+
+static inline uint64_t unaligned_read_le64(const void *_u) {
+        typedef struct __attribute__((__packed__, __may_alias__)) { uint64_t x; } raw64;
+        /* load through the packed type, then convert from little endian to host byte order */
+        return le64toh(((const raw64 *) _u)->x);
+}
+
+static inline void unaligned_write_le16(void *_u, uint16_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint16_t x; } *u = _u;
+        /* Stores "a" in little-endian byte order at _u, which may be unaligned (host-to-LE conversion). */
+        u->x = htole16(a);
+}
+
+static inline void unaligned_write_le32(void *_u, uint32_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint32_t x; } *u = _u;
+        /* Stores "a" in little-endian byte order at _u, which may be unaligned (host-to-LE conversion). */
+        u->x = htole32(a);
+}
+
+static inline void unaligned_write_le64(void *_u, uint64_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint64_t x; } *u = _u;
+        /* Stores "a" in little-endian byte order at _u, which may be unaligned (host-to-LE conversion). */
+        u->x = htole64(a);
+}
diff --git a/src/basic/unit-def.c b/src/basic/unit-def.c
new file mode 100644
index 0000000..908c0cd
--- /dev/null
+++ b/src/basic/unit-def.c
@@ -0,0 +1,338 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-label.h"
+#include "string-table.h"
+#include "unit-def.h"
+#include "unit-name.h"
+
+char *unit_dbus_path_from_name(const char *name) {
+        _cleanup_free_ char *escaped = NULL;
+
+        assert(name);
+
+        /* Object path of a unit: the fixed prefix followed by what bus_label_escape() makes of the name */
+        escaped = bus_label_escape(name);
+        if (escaped)
+                return strjoin("/org/freedesktop/systemd1/unit/", escaped);
+        return NULL;
+}
+
+int unit_name_from_dbus_path(const char *path, char **name) {
+        const char *label;
+        char *unescaped;
+
+        /* Counterpart of unit_dbus_path_from_name(): cuts off the prefix, unescapes the remainder */
+        label = startswith(path, "/org/freedesktop/systemd1/unit/");
+        if (!label)
+                return -EINVAL;
+        unescaped = bus_label_unescape(label);
+        if (!unescaped)
+                return -ENOMEM;
+
+        *name = unescaped;
+        return 0;
+}
+
+const char* unit_dbus_interface_from_type(UnitType t) {
+        /* Maps a unit type to the name of its type-specific D-Bus interface */
+        static const char *const interfaces[_UNIT_TYPE_MAX] = {
+                [UNIT_SERVICE]   = "org.freedesktop.systemd1.Service",
+                [UNIT_SOCKET]    = "org.freedesktop.systemd1.Socket",
+                [UNIT_TARGET]    = "org.freedesktop.systemd1.Target",
+                [UNIT_DEVICE]    = "org.freedesktop.systemd1.Device",
+                [UNIT_MOUNT]     = "org.freedesktop.systemd1.Mount",
+                [UNIT_AUTOMOUNT] = "org.freedesktop.systemd1.Automount",
+                [UNIT_SWAP]      = "org.freedesktop.systemd1.Swap",
+                [UNIT_TIMER]     = "org.freedesktop.systemd1.Timer",
+                [UNIT_PATH]      = "org.freedesktop.systemd1.Path",
+                [UNIT_SLICE]     = "org.freedesktop.systemd1.Slice",
+                [UNIT_SCOPE]     = "org.freedesktop.systemd1.Scope",
+        };
+
+        /* Anything outside of the table's index range has no interface name */
+        if (t < 0 || t >= _UNIT_TYPE_MAX)
+                return NULL;
+
+        /* Slots the initializer above leaves out are NULL */
+        return interfaces[t];
+}
+
+const char *unit_dbus_interface_from_name(const char *name) {
+        UnitType type = unit_name_to_type(name);
+
+        /* A negative type is treated as failure and yields NULL */
+        if (type < 0)
+                return NULL;
+
+        return unit_dbus_interface_from_type(type);
+}
+
+const char* unit_type_to_capitalized_string(UnitType t) {
+        const char *iface = unit_dbus_interface_from_type(t);
+        /* The result is the interface name with the common prefix cut off, e.g. "Service" */
+        if (iface == NULL)
+                return NULL;
+        return ASSERT_PTR(startswith(iface, "org.freedesktop.systemd1."));
+}
+/* Names of the UnitType values. NOTE(review): the macro below presumably generates the to/from-string helpers — confirm in string-table.h. */
+static const char* const unit_type_table[_UNIT_TYPE_MAX] = {
+        [UNIT_SERVICE]   = "service",
+        [UNIT_SOCKET]    = "socket",
+        [UNIT_TARGET]    = "target",
+        [UNIT_DEVICE]    = "device",
+        [UNIT_MOUNT]     = "mount",
+        [UNIT_AUTOMOUNT] = "automount",
+        [UNIT_SWAP]      = "swap",
+        [UNIT_TIMER]     = "timer",
+        [UNIT_PATH]      = "path",
+        [UNIT_SLICE]     = "slice",
+        [UNIT_SCOPE]     = "scope",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_type, UnitType);
+/* Names of the UnitLoadState values, indexed by enum value. */
+static const char* const unit_load_state_table[_UNIT_LOAD_STATE_MAX] = {
+        [UNIT_STUB]        = "stub",
+        [UNIT_LOADED]      = "loaded",
+        [UNIT_NOT_FOUND]   = "not-found",
+        [UNIT_BAD_SETTING] = "bad-setting",
+        [UNIT_ERROR]       = "error",
+        [UNIT_MERGED]      = "merged",
+        [UNIT_MASKED]      = "masked"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_load_state, UnitLoadState);
+/* Names of the UnitActiveState values, indexed by enum value. */
+static const char* const unit_active_state_table[_UNIT_ACTIVE_STATE_MAX] = {
+        [UNIT_ACTIVE]       = "active",
+        [UNIT_RELOADING]    = "reloading",
+        [UNIT_INACTIVE]     = "inactive",
+        [UNIT_FAILED]       = "failed",
+        [UNIT_ACTIVATING]   = "activating",
+        [UNIT_DEACTIVATING] = "deactivating",
+        [UNIT_MAINTENANCE]  = "maintenance",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_active_state, UnitActiveState);
+/* Names of the FreezerState values, indexed by enum value. */
+static const char* const freezer_state_table[_FREEZER_STATE_MAX] = {
+        [FREEZER_RUNNING]  = "running",
+        [FREEZER_FREEZING] = "freezing",
+        [FREEZER_FROZEN]   = "frozen",
+        [FREEZER_THAWING]  = "thawing",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(freezer_state, FreezerState);
+/* Names of the UnitMarker values, indexed by enum value. */
+static const char* const unit_marker_table[_UNIT_MARKER_MAX] = {
+        [UNIT_MARKER_NEEDS_RELOAD]  = "needs-reload",
+        [UNIT_MARKER_NEEDS_RESTART] = "needs-restart",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_marker, UnitMarker);
+/* Names of the AutomountState values, indexed by enum value. */
+static const char* const automount_state_table[_AUTOMOUNT_STATE_MAX] = {
+        [AUTOMOUNT_DEAD]    = "dead",
+        [AUTOMOUNT_WAITING] = "waiting",
+        [AUTOMOUNT_RUNNING] = "running",
+        [AUTOMOUNT_FAILED]  = "failed"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(automount_state, AutomountState);
+/* Names of the DeviceState values, indexed by enum value. */
+static const char* const device_state_table[_DEVICE_STATE_MAX] = {
+        [DEVICE_DEAD]      = "dead",
+        [DEVICE_TENTATIVE] = "tentative",
+        [DEVICE_PLUGGED]   = "plugged",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(device_state, DeviceState);
+/* Names of the MountState values, indexed by enum value. */
+static const char* const mount_state_table[_MOUNT_STATE_MAX] = {
+        [MOUNT_DEAD]               = "dead",
+        [MOUNT_MOUNTING]           = "mounting",
+        [MOUNT_MOUNTING_DONE]      = "mounting-done",
+        [MOUNT_MOUNTED]            = "mounted",
+        [MOUNT_REMOUNTING]         = "remounting",
+        [MOUNT_UNMOUNTING]         = "unmounting",
+        [MOUNT_REMOUNTING_SIGTERM] = "remounting-sigterm",
+        [MOUNT_REMOUNTING_SIGKILL] = "remounting-sigkill",
+        [MOUNT_UNMOUNTING_SIGTERM] = "unmounting-sigterm",
+        [MOUNT_UNMOUNTING_SIGKILL] = "unmounting-sigkill",
+        [MOUNT_FAILED]             = "failed",
+        [MOUNT_CLEANING]           = "cleaning",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mount_state, MountState);
+/* Names of the PathState values, indexed by enum value. */
+static const char* const path_state_table[_PATH_STATE_MAX] = {
+        [PATH_DEAD]    = "dead",
+        [PATH_WAITING] = "waiting",
+        [PATH_RUNNING] = "running",
+        [PATH_FAILED]  = "failed"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(path_state, PathState);
+/* Names of the ScopeState values, indexed by enum value. */
+static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
+        [SCOPE_DEAD]         = "dead",
+        [SCOPE_START_CHOWN]  = "start-chown",
+        [SCOPE_RUNNING]      = "running",
+        [SCOPE_ABANDONED]    = "abandoned",
+        [SCOPE_STOP_SIGTERM] = "stop-sigterm",
+        [SCOPE_STOP_SIGKILL] = "stop-sigkill",
+        [SCOPE_FAILED]       = "failed",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(scope_state, ScopeState);
+
+static const char* const service_state_table[_SERVICE_STATE_MAX] = {
+        [SERVICE_DEAD]                       = "dead",
+        [SERVICE_CONDITION]                  = "condition",
+        [SERVICE_START_PRE]                  = "start-pre",
+        [SERVICE_START]                      = "start",
+        [SERVICE_START_POST]                 = "start-post",
+        [SERVICE_RUNNING]                    = "running",
+        [SERVICE_EXITED]                     = "exited",
+        [SERVICE_RELOAD]                     = "reload",
+        [SERVICE_RELOAD_SIGNAL]              = "reload-signal",
+        [SERVICE_RELOAD_NOTIFY]              = "reload-notify",
+        [SERVICE_STOP]                       = "stop",
+        [SERVICE_STOP_WATCHDOG]              = "stop-watchdog",
+        [SERVICE_STOP_SIGTERM]               = "stop-sigterm",
+        [SERVICE_STOP_SIGKILL]               = "stop-sigkill",
+        [SERVICE_STOP_POST]                  = "stop-post",
+        [SERVICE_FINAL_WATCHDOG]             = "final-watchdog",
+        [SERVICE_FINAL_SIGTERM]              = "final-sigterm",
+        [SERVICE_FINAL_SIGKILL]              = "final-sigkill",
+        [SERVICE_FAILED]                     = "failed",
+        [SERVICE_DEAD_BEFORE_AUTO_RESTART]   = "dead-before-auto-restart",
+        [SERVICE_FAILED_BEFORE_AUTO_RESTART] = "failed-before-auto-restart",
+        [SERVICE_DEAD_RESOURCES_PINNED]      = "dead-resources-pinned",
+        [SERVICE_AUTO_RESTART]               = "auto-restart",
+        [SERVICE_AUTO_RESTART_QUEUED]        = "auto-restart-queued",
+        [SERVICE_CLEANING]                   = "cleaning",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_state, ServiceState);
+
+static const char* const slice_state_table[_SLICE_STATE_MAX] = {
+        [SLICE_DEAD]   = "dead",
+        [SLICE_ACTIVE] = "active"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(slice_state, SliceState);
+
+static const char* const socket_state_table[_SOCKET_STATE_MAX] = {
+        [SOCKET_DEAD]             = "dead",
+        [SOCKET_START_PRE]        = "start-pre",
+        [SOCKET_START_CHOWN]      = "start-chown",
+        [SOCKET_START_POST]       = "start-post",
+        [SOCKET_LISTENING]        = "listening",
+        [SOCKET_RUNNING]          = "running",
+        [SOCKET_STOP_PRE]         = "stop-pre",
+        [SOCKET_STOP_PRE_SIGTERM] = "stop-pre-sigterm",
+        [SOCKET_STOP_PRE_SIGKILL] = "stop-pre-sigkill",
+        [SOCKET_STOP_POST]        = "stop-post",
+        [SOCKET_FINAL_SIGTERM]    = "final-sigterm",
+        [SOCKET_FINAL_SIGKILL]    = "final-sigkill",
+        [SOCKET_FAILED]           = "failed",
+        [SOCKET_CLEANING]         = "cleaning",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_state, SocketState);
+
+static const char* const swap_state_table[_SWAP_STATE_MAX] = {
+        [SWAP_DEAD]                 = "dead",
+        [SWAP_ACTIVATING]           = "activating",
+        [SWAP_ACTIVATING_DONE]      = "activating-done",
+        [SWAP_ACTIVE]               = "active",
+        [SWAP_DEACTIVATING]         = "deactivating",
+        [SWAP_DEACTIVATING_SIGTERM] = "deactivating-sigterm",
+        [SWAP_DEACTIVATING_SIGKILL] = "deactivating-sigkill",
+        [SWAP_FAILED]               = "failed",
+        [SWAP_CLEANING]             = "cleaning",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(swap_state, SwapState);
+
+static const char* const target_state_table[_TARGET_STATE_MAX] = {
+        [TARGET_DEAD]   = "dead",
+        [TARGET_ACTIVE] = "active"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(target_state, TargetState);
+
+static const char* const timer_state_table[_TIMER_STATE_MAX] = {
+        [TIMER_DEAD]    = "dead",
+        [TIMER_WAITING] = "waiting",
+        [TIMER_RUNNING] = "running",
+        [TIMER_ELAPSED] = "elapsed",
+        [TIMER_FAILED]  = "failed"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(timer_state, TimerState);
+
+static const char* const unit_dependency_table[_UNIT_DEPENDENCY_MAX] = {
+        [UNIT_REQUIRES]               = "Requires",
+        [UNIT_REQUISITE]              = "Requisite",
+        [UNIT_WANTS]                  = "Wants",
+        [UNIT_BINDS_TO]               = "BindsTo",
+        [UNIT_PART_OF]                = "PartOf",
+        [UNIT_UPHOLDS]                = "Upholds",
+        [UNIT_REQUIRED_BY]            = "RequiredBy",
+        [UNIT_REQUISITE_OF]           = "RequisiteOf",
+        [UNIT_WANTED_BY]              = "WantedBy",
+        [UNIT_BOUND_BY]               = "BoundBy",
+        [UNIT_UPHELD_BY]              = "UpheldBy",
+        [UNIT_CONSISTS_OF]            = "ConsistsOf",
+        [UNIT_CONFLICTS]              = "Conflicts",
+        [UNIT_CONFLICTED_BY]          = "ConflictedBy",
+        [UNIT_BEFORE]                 = "Before",
+        [UNIT_AFTER]                  = "After",
+        [UNIT_ON_SUCCESS]             = "OnSuccess",
+        [UNIT_ON_SUCCESS_OF]          = "OnSuccessOf",
+        [UNIT_ON_FAILURE]             = "OnFailure",
+        [UNIT_ON_FAILURE_OF]          = "OnFailureOf",
+        [UNIT_TRIGGERS]               = "Triggers",
+        [UNIT_TRIGGERED_BY]           = "TriggeredBy",
+        [UNIT_PROPAGATES_RELOAD_TO]   = "PropagatesReloadTo",
+        [UNIT_RELOAD_PROPAGATED_FROM] = "ReloadPropagatedFrom",
+        [UNIT_PROPAGATES_STOP_TO]     = "PropagatesStopTo",
+        [UNIT_STOP_PROPAGATED_FROM]   = "StopPropagatedFrom",
+        [UNIT_JOINS_NAMESPACE_OF]     = "JoinsNamespaceOf",
+        [UNIT_REFERENCES]             = "References",
+        [UNIT_REFERENCED_BY]          = "ReferencedBy",
+        [UNIT_IN_SLICE]               = "InSlice",
+        [UNIT_SLICE_OF]               = "SliceOf",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_dependency, UnitDependency);
+
+static const char* const notify_access_table[_NOTIFY_ACCESS_MAX] = {
+        [NOTIFY_NONE] = "none",
+        [NOTIFY_MAIN] = "main",
+        [NOTIFY_EXEC] = "exec",
+        [NOTIFY_ALL]  = "all"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(notify_access, NotifyAccess);
+
+SpecialGlyph unit_active_state_to_glyph(UnitActiveState state) {
+        /* Glyph used to represent each active state, indexed by the UnitActiveState value. */
+        static const SpecialGlyph glyph_by_state[_UNIT_ACTIVE_STATE_MAX] = {
+                [UNIT_ACTIVE]       = SPECIAL_GLYPH_BLACK_CIRCLE,
+                [UNIT_RELOADING]    = SPECIAL_GLYPH_CIRCLE_ARROW,
+                [UNIT_INACTIVE]     = SPECIAL_GLYPH_WHITE_CIRCLE,
+                [UNIT_FAILED]       = SPECIAL_GLYPH_MULTIPLICATION_SIGN,
+                [UNIT_ACTIVATING]   = SPECIAL_GLYPH_BLACK_CIRCLE,
+                [UNIT_DEACTIVATING] = SPECIAL_GLYPH_BLACK_CIRCLE,
+                [UNIT_MAINTENANCE]  = SPECIAL_GLYPH_WHITE_CIRCLE,
+        };
+
+        /* Negative values (such as _UNIT_ACTIVE_STATE_INVALID) have no glyph; they are answered
+         * with _SPECIAL_GLYPH_INVALID instead of being used as an index into the table. */
+        assert(state < _UNIT_ACTIVE_STATE_MAX);
+        return state < 0 ? _SPECIAL_GLYPH_INVALID : glyph_by_state[state];
+}
diff --git a/src/basic/unit-def.h b/src/basic/unit-def.h
new file mode 100644
index 0000000..6627da5
--- /dev/null
+++ b/src/basic/unit-def.h
@@ -0,0 +1,343 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "errno-list.h"
+#include "glyph-util.h"
+#include "macro.h"
+
+/* The enum order is used to order unit jobs in the job queue
+ * when other criteria (cpu weight, nice level) are identical.
+ * In this case service units have the highest priority. */
+typedef enum UnitType {
+        UNIT_SERVICE,
+        UNIT_MOUNT,
+        UNIT_SWAP,
+        UNIT_SOCKET,
+        UNIT_TARGET,
+        UNIT_DEVICE,
+        UNIT_AUTOMOUNT,
+        UNIT_TIMER,
+        UNIT_PATH,
+        UNIT_SLICE,
+        UNIT_SCOPE,
+        _UNIT_TYPE_MAX,
+        _UNIT_TYPE_INVALID = -EINVAL,
+        _UNIT_TYPE_ERRNO_MAX = -ERRNO_MAX, /* Ensure the whole errno range fits into this enum */
+} UnitType;
+
+typedef enum UnitLoadState {
+        UNIT_STUB,
+        UNIT_LOADED,
+        UNIT_NOT_FOUND,    /* error condition #1: unit file not found */
+        UNIT_BAD_SETTING,  /* error condition #2: we couldn't parse some essential unit file setting */
+        UNIT_ERROR,        /* error condition #3: other "system" error, catchall for the rest */
+        UNIT_MERGED,
+        UNIT_MASKED,
+        _UNIT_LOAD_STATE_MAX,
+        _UNIT_LOAD_STATE_INVALID = -EINVAL,
+} UnitLoadState;
+
+typedef enum UnitActiveState {
+        UNIT_ACTIVE,
+        UNIT_RELOADING,
+        UNIT_INACTIVE,
+        UNIT_FAILED,
+        UNIT_ACTIVATING,
+        UNIT_DEACTIVATING,
+        UNIT_MAINTENANCE,
+        _UNIT_ACTIVE_STATE_MAX,
+        _UNIT_ACTIVE_STATE_INVALID = -EINVAL,
+} UnitActiveState;
+
+typedef enum FreezerState {
+        FREEZER_RUNNING,
+        FREEZER_FREEZING,
+        FREEZER_FROZEN,
+        FREEZER_THAWING,
+        _FREEZER_STATE_MAX,
+        _FREEZER_STATE_INVALID = -EINVAL,
+} FreezerState;
+
+typedef enum UnitMarker {
+        UNIT_MARKER_NEEDS_RELOAD,
+        UNIT_MARKER_NEEDS_RESTART,
+        _UNIT_MARKER_MAX,
+        _UNIT_MARKER_INVALID = -EINVAL,
+} UnitMarker;
+
+typedef enum AutomountState {
+        AUTOMOUNT_DEAD,
+        AUTOMOUNT_WAITING,
+        AUTOMOUNT_RUNNING,
+        AUTOMOUNT_FAILED,
+        _AUTOMOUNT_STATE_MAX,
+        _AUTOMOUNT_STATE_INVALID = -EINVAL,
+} AutomountState;
+
+/* We simply watch devices, we cannot plug/unplug them. That
+ * simplifies the state engine greatly */
+typedef enum DeviceState {
+        DEVICE_DEAD,
+        DEVICE_TENTATIVE, /* mounted or swapped, but not (yet) announced by udev */
+        DEVICE_PLUGGED,   /* announced by udev */
+        _DEVICE_STATE_MAX,
+        _DEVICE_STATE_INVALID = -EINVAL,
+} DeviceState;
+
+typedef enum MountState {
+        MOUNT_DEAD,
+        MOUNT_MOUNTING,               /* /usr/bin/mount is running, but the mount is not done yet. */
+        MOUNT_MOUNTING_DONE,          /* /usr/bin/mount is running, and the mount is done. */
+        MOUNT_MOUNTED,
+        MOUNT_REMOUNTING,
+        MOUNT_UNMOUNTING,
+        MOUNT_REMOUNTING_SIGTERM,
+        MOUNT_REMOUNTING_SIGKILL,
+        MOUNT_UNMOUNTING_SIGTERM,
+        MOUNT_UNMOUNTING_SIGKILL,
+        MOUNT_FAILED,
+        MOUNT_CLEANING,
+        _MOUNT_STATE_MAX,
+        _MOUNT_STATE_INVALID = -EINVAL,
+} MountState;
+
+typedef enum PathState {
+        PATH_DEAD,
+        PATH_WAITING,
+        PATH_RUNNING,
+        PATH_FAILED,
+        _PATH_STATE_MAX,
+        _PATH_STATE_INVALID = -EINVAL,
+} PathState;
+
+typedef enum ScopeState {
+        SCOPE_DEAD,
+        SCOPE_START_CHOWN,
+        SCOPE_RUNNING,
+        SCOPE_ABANDONED,
+        SCOPE_STOP_SIGTERM,
+        SCOPE_STOP_SIGKILL,
+        SCOPE_FAILED,
+        _SCOPE_STATE_MAX,
+        _SCOPE_STATE_INVALID = -EINVAL,
+} ScopeState;
+
+typedef enum ServiceState {
+        SERVICE_DEAD,
+        SERVICE_CONDITION,
+        SERVICE_START_PRE,
+        SERVICE_START,
+        SERVICE_START_POST,
+        SERVICE_RUNNING,
+        SERVICE_EXITED,            /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */
+        SERVICE_RELOAD,            /* Reloading via ExecReload= */
+        SERVICE_RELOAD_SIGNAL,     /* Reloading via SIGHUP requested */
+        SERVICE_RELOAD_NOTIFY,     /* Waiting for READY=1 after RELOADING=1 notify */
+        SERVICE_STOP,              /* No STOP_PRE state, instead just register multiple STOP executables */
+        SERVICE_STOP_WATCHDOG,
+        SERVICE_STOP_SIGTERM,
+        SERVICE_STOP_SIGKILL,
+        SERVICE_STOP_POST,
+        SERVICE_FINAL_WATCHDOG,    /* In case the STOP_POST executable needs to be aborted. */
+        SERVICE_FINAL_SIGTERM,     /* In case the STOP_POST executable hangs, we shoot that down, too */
+        SERVICE_FINAL_SIGKILL,
+        SERVICE_FAILED,
+        SERVICE_DEAD_BEFORE_AUTO_RESTART,
+        SERVICE_FAILED_BEFORE_AUTO_RESTART,
+        SERVICE_DEAD_RESOURCES_PINNED,  /* Like SERVICE_DEAD, but with pinned resources */
+        SERVICE_AUTO_RESTART,
+        SERVICE_AUTO_RESTART_QUEUED,
+        SERVICE_CLEANING,
+        _SERVICE_STATE_MAX,
+        _SERVICE_STATE_INVALID = -EINVAL,
+} ServiceState;
+
+typedef enum SliceState {
+        SLICE_DEAD,
+        SLICE_ACTIVE,
+        _SLICE_STATE_MAX,
+        _SLICE_STATE_INVALID = -EINVAL,
+} SliceState;
+
+typedef enum SocketState {
+        SOCKET_DEAD,
+        SOCKET_START_PRE,
+        SOCKET_START_CHOWN,
+        SOCKET_START_POST,
+        SOCKET_LISTENING,
+        SOCKET_RUNNING,
+        SOCKET_STOP_PRE,
+        SOCKET_STOP_PRE_SIGTERM,
+        SOCKET_STOP_PRE_SIGKILL,
+        SOCKET_STOP_POST,
+        SOCKET_FINAL_SIGTERM,
+        SOCKET_FINAL_SIGKILL,
+        SOCKET_FAILED,
+        SOCKET_CLEANING,
+        _SOCKET_STATE_MAX,
+        _SOCKET_STATE_INVALID = -EINVAL,
+} SocketState;
+
+typedef enum SwapState {
+        SWAP_DEAD,
+        SWAP_ACTIVATING,               /* /sbin/swapon is running, but the swap not yet enabled. */
+        SWAP_ACTIVATING_DONE,          /* /sbin/swapon is running, and the swap is done. */
+        SWAP_ACTIVE,
+        SWAP_DEACTIVATING,
+        SWAP_DEACTIVATING_SIGTERM,
+        SWAP_DEACTIVATING_SIGKILL,
+        SWAP_FAILED,
+        SWAP_CLEANING,
+        _SWAP_STATE_MAX,
+        _SWAP_STATE_INVALID = -EINVAL,
+} SwapState;
+
+typedef enum TargetState {
+        TARGET_DEAD,
+        TARGET_ACTIVE,
+        _TARGET_STATE_MAX,
+        _TARGET_STATE_INVALID = -EINVAL,
+} TargetState;
+
+typedef enum TimerState {
+        TIMER_DEAD,
+        TIMER_WAITING,
+        TIMER_RUNNING,
+        TIMER_ELAPSED,
+        TIMER_FAILED,
+        _TIMER_STATE_MAX,
+        _TIMER_STATE_INVALID = -EINVAL,
+} TimerState;
+
+typedef enum UnitDependency {
+        /* Positive dependencies */
+        UNIT_REQUIRES,
+        UNIT_REQUISITE,
+        UNIT_WANTS,
+        UNIT_BINDS_TO,
+        UNIT_PART_OF,
+        UNIT_UPHOLDS,
+
+        /* Inverse of the above */
+        UNIT_REQUIRED_BY,             /* inverse of 'requires' is 'required_by' */
+        UNIT_REQUISITE_OF,            /* inverse of 'requisite' is 'requisite_of' */
+        UNIT_WANTED_BY,               /* inverse of 'wants' */
+        UNIT_BOUND_BY,                /* inverse of 'binds_to' */
+        UNIT_CONSISTS_OF,             /* inverse of 'part_of' */
+        UNIT_UPHELD_BY,               /* inverse of 'upholds' */
+
+        /* Negative dependencies */
+        UNIT_CONFLICTS,               /* inverse of 'conflicts' is 'conflicted_by' */
+        UNIT_CONFLICTED_BY,
+
+        /* Order */
+        UNIT_BEFORE,                  /* inverse of 'before' is 'after' and vice versa */
+        UNIT_AFTER,
+
+        /* OnSuccess= + OnFailure= */
+        UNIT_ON_SUCCESS,
+        UNIT_ON_SUCCESS_OF,
+        UNIT_ON_FAILURE,
+        UNIT_ON_FAILURE_OF,
+
+        /* Triggers (i.e. a socket triggers a service) */
+        UNIT_TRIGGERS,
+        UNIT_TRIGGERED_BY,
+
+        /* Propagate reloads */
+        UNIT_PROPAGATES_RELOAD_TO,
+        UNIT_RELOAD_PROPAGATED_FROM,
+
+        /* Propagate stops */
+        UNIT_PROPAGATES_STOP_TO,
+        UNIT_STOP_PROPAGATED_FROM,
+
+        /* Joins namespace of */
+        UNIT_JOINS_NAMESPACE_OF,
+
+        /* Reference information for GC logic */
+        UNIT_REFERENCES,              /* Inverse of 'references' is 'referenced_by' */
+        UNIT_REFERENCED_BY,
+
+        /* Slice= */
+        UNIT_IN_SLICE,
+        UNIT_SLICE_OF,
+
+        _UNIT_DEPENDENCY_MAX,
+        _UNIT_DEPENDENCY_INVALID = -EINVAL,
+} UnitDependency;
+
+typedef enum NotifyAccess {
+        NOTIFY_NONE,
+        NOTIFY_ALL,
+        NOTIFY_MAIN,
+        NOTIFY_EXEC,
+        _NOTIFY_ACCESS_MAX,
+        _NOTIFY_ACCESS_INVALID = -EINVAL,
+} NotifyAccess;
+
+char *unit_dbus_path_from_name(const char *name);
+int unit_name_from_dbus_path(const char *path, char **name);
+
+const char* unit_dbus_interface_from_type(UnitType t);
+const char *unit_dbus_interface_from_name(const char *name);
+
+const char *unit_type_to_string(UnitType i) _const_;
+UnitType unit_type_from_string(const char *s) _pure_;
+
+const char* unit_type_to_capitalized_string(UnitType t);
+
+const char *unit_load_state_to_string(UnitLoadState i) _const_;
+UnitLoadState unit_load_state_from_string(const char *s) _pure_;
+
+const char *unit_active_state_to_string(UnitActiveState i) _const_;
+UnitActiveState unit_active_state_from_string(const char *s) _pure_;
+
+const char *freezer_state_to_string(FreezerState i) _const_;
+FreezerState freezer_state_from_string(const char *s) _pure_;
+
+const char *unit_marker_to_string(UnitMarker m) _const_;
+UnitMarker unit_marker_from_string(const char *s) _pure_;
+
+const char* automount_state_to_string(AutomountState i) _const_;
+AutomountState automount_state_from_string(const char *s) _pure_;
+
+const char* device_state_to_string(DeviceState i) _const_;
+DeviceState device_state_from_string(const char *s) _pure_;
+
+const char* mount_state_to_string(MountState i) _const_;
+MountState mount_state_from_string(const char *s) _pure_;
+
+const char* path_state_to_string(PathState i) _const_;
+PathState path_state_from_string(const char *s) _pure_;
+
+const char* scope_state_to_string(ScopeState i) _const_;
+ScopeState scope_state_from_string(const char *s) _pure_;
+
+const char* service_state_to_string(ServiceState i) _const_;
+ServiceState service_state_from_string(const char *s) _pure_;
+
+const char* slice_state_to_string(SliceState i) _const_;
+SliceState slice_state_from_string(const char *s) _pure_;
+
+const char* socket_state_to_string(SocketState i) _const_;
+SocketState socket_state_from_string(const char *s) _pure_;
+
+const char* swap_state_to_string(SwapState i) _const_;
+SwapState swap_state_from_string(const char *s) _pure_;
+
+const char* target_state_to_string(TargetState i) _const_;
+TargetState target_state_from_string(const char *s) _pure_;
+
+const char *timer_state_to_string(TimerState i) _const_;
+TimerState timer_state_from_string(const char *s) _pure_;
+
+const char *unit_dependency_to_string(UnitDependency i) _const_;
+UnitDependency unit_dependency_from_string(const char *s) _pure_;
+
+const char* notify_access_to_string(NotifyAccess i) _const_;
+NotifyAccess notify_access_from_string(const char *s) _pure_;
+
+SpecialGlyph unit_active_state_to_glyph(UnitActiveState state);
diff --git a/src/basic/unit-file.c b/src/basic/unit-file.c
new file mode 100644
index 0000000..54f2137
--- /dev/null
+++ b/src/basic/unit-file.c
@@ -0,0 +1,833 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-id128.h"
+
+#include "chase.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "initrd-util.h"
+#include "macro.h"
+#include "path-lookup.h"
+#include "set.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-file.h"
+
+bool unit_type_may_alias(UnitType type) {
+        switch (type) { /* unit types for which alias symlinks are accepted */
+        case UNIT_SERVICE: case UNIT_SOCKET: case UNIT_TARGET:
+        case UNIT_DEVICE:  case UNIT_TIMER:  case UNIT_PATH:
+                return true;
+        default:
+                return false;
+        }
+}
+
+bool unit_type_may_template(UnitType type) {
+        switch (type) { /* unit types for which template/instance names are accepted */
+        case UNIT_SERVICE: case UNIT_SOCKET: case UNIT_TARGET: case UNIT_TIMER: case UNIT_PATH:
+                return true;
+        default:
+                return false;
+        }
+}
+
+int unit_symlink_name_compatible(const char *symlink, const char *target, bool instance_propagation) {
+        _cleanup_free_ char *symlink_template = NULL;
+        int r, symlink_kind, target_kind;
+
+        /* Returns 1 if a symlink named "symlink" is compatible with a unit named "target", 0 if it
+         * is not, and a negative value if deriving the template name of "symlink" fails with
+         * anything but -EINVAL. */
+
+        symlink_kind = unit_name_classify(symlink);
+
+        /* The straightforward case: the symlink name matches the target and we have a valid unit */
+        if ((symlink_kind & (UNIT_NAME_PLAIN | UNIT_NAME_INSTANCE)) && streq(symlink, target))
+                return 1;
+
+        r = unit_name_template(symlink, &symlink_template);
+        if (r < 0)
+                return r == -EINVAL ? 0 : r; /* -EINVAL: not a template */
+
+        target_kind = unit_name_classify(target);
+
+        /* Both remaining cases need the target to be exactly the template of the symlink name */
+        if (target_kind != UNIT_NAME_TEMPLATE || !streq(symlink_template, target))
+                return 0;
+
+        /* An instance name points to a target that is just the template name */
+        if (symlink_kind == UNIT_NAME_INSTANCE)
+                return 1;
+
+        /* foo@.target.requires/bar@.service: instance will be propagated */
+        if (symlink_kind == UNIT_NAME_TEMPLATE && instance_propagation)
+                return 1;
+
+        return 0;
+}
+
+int unit_validate_alias_symlink_or_warn(int log_level, const char *filename, const char *target) {
+        _cleanup_free_ char *src = NULL, *dst = NULL;
+        _cleanup_free_ char *src_instance = NULL, *dst_instance = NULL;
+        UnitType src_unit_type, dst_unit_type;
+        UnitNameFlags src_name_type, dst_name_type;
+        int r;
+
+        /* Check if the *alias* symlink is valid (0 is returned if it is). This applies to symlinks like
+         * /etc/systemd/system/dbus.service → dbus-broker.service, but not to .wants or .requires symlinks
+         * and such. Neither does this apply to symlinks which *link* units, i.e. symlinks to outside of the
+         * unit lookup path.
+         *
+         * -EINVAL is returned if something is wrong with the source filename or the source unit type is
+         *         not allowed to symlink,
+         * -EXDEV if the target filename is not a valid unit name or doesn't match the source,
+         * -ELOOP for an alias to self (this case is logged at debug level, whatever log_level says).
+         */
+
+        r = path_extract_filename(filename, &src);
+        if (r < 0)
+                return r;
+
+        r = path_extract_filename(target, &dst);
+        if (r < 0)
+                return r;
+
+        /* src checks: a valid unit name, of a type that may be aliased (and templated, if not plain) */
+
+        src_name_type = unit_name_to_instance(src, &src_instance);
+        if (src_name_type < 0)
+                return log_full_errno(log_level, src_name_type,
+                                      "%s: not a valid unit name \"%s\": %m", filename, src);
+
+        src_unit_type = unit_name_to_type(src);
+        assert(src_unit_type >= 0); /* unit_name_to_instance() checked the suffix already */
+
+        if (!unit_type_may_alias(src_unit_type))
+                return log_full_errno(log_level, SYNTHETIC_ERRNO(EINVAL),
+                                      "%s: symlinks are not allowed for units of this type, rejecting.",
+                                      filename);
+
+        if (src_name_type != UNIT_NAME_PLAIN &&
+            !unit_type_may_template(src_unit_type))
+                return log_full_errno(log_level, SYNTHETIC_ERRNO(EINVAL),
+                                      "%s: templates not allowed for %s units, rejecting.",
+                                      filename, unit_type_to_string(src_unit_type));
+
+        /* dst checks: name type, instance string and unit type all have to be compatible with src */
+
+        if (streq(src, dst))
+                return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+                                       "%s: unit self-alias: %s → %s, ignoring.",
+                                       filename, src, dst);
+
+        dst_name_type = unit_name_to_instance(dst, &dst_instance);
+        if (dst_name_type < 0)
+                return log_full_errno(log_level, dst_name_type == -EINVAL ? SYNTHETIC_ERRNO(EXDEV) : dst_name_type,
+                                      "%s points to \"%s\" which is not a valid unit name: %m",
+                                      filename, dst);
+
+        if (!(dst_name_type == src_name_type ||
+              (src_name_type == UNIT_NAME_INSTANCE && dst_name_type == UNIT_NAME_TEMPLATE)))
+                return log_full_errno(log_level, SYNTHETIC_ERRNO(EXDEV),
+                                      "%s: symlink target name type \"%s\" does not match source, rejecting.",
+                                      filename, dst);
+
+        if (dst_name_type == UNIT_NAME_INSTANCE) {
+                assert(src_instance);
+                assert(dst_instance);
+                if (!streq(src_instance, dst_instance))
+                        return log_full_errno(log_level, SYNTHETIC_ERRNO(EXDEV),
+                                              "%s: unit symlink target \"%s\" instance name doesn't match, rejecting.",
+                                              filename, dst);
+        }
+
+        dst_unit_type = unit_name_to_type(dst);
+        if (dst_unit_type != src_unit_type)
+                return log_full_errno(log_level, SYNTHETIC_ERRNO(EXDEV),
+                                      "%s: symlink target \"%s\" has incompatible suffix, rejecting.",
+                                      filename, dst);
+
+        return 0;
+}
+
+#define FOLLOW_MAX 8
+
+static int unit_ids_map_get(
+                Hashmap *unit_ids_map,
+                const char *unit_name,
+                const char **ret_fragment_path) {
+
+        /* Resolve recursively until we hit an absolute path, i.e. a non-aliased unit. On success 0 is
+         * returned and, if requested, *ret_fragment_path points at the path string stored in the map.
+         *
+         * We distinguish the case where unit_name was not found in the hashmap at all (-ENOENT), and the
+         * case where some symlink was broken (-ENXIO). Too many resolution steps yield -ELOOP.
+         * If a symlink target points to an instance name, then we also check for the template. */
+
+        const char *id = NULL;
+        int r;
+
+        for (unsigned n = 0; n < FOLLOW_MAX; n++) {
+                const char *t = hashmap_get(unit_ids_map, id ?: unit_name);
+                if (!t) {
+                        _cleanup_free_ char *template = NULL;
+
+                        if (!id)
+                                return -ENOENT; /* unit_name itself is not in the map */
+
+                        r = unit_name_template(id, &template);
+                        if (r == -EINVAL)
+                                return -ENXIO; /* we failed to find the symlink target */
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to determine template name for %s: %m", id);
+
+                        t = hashmap_get(unit_ids_map, template);
+                        if (!t)
+                                return -ENXIO; /* the template is not in the map either */
+
+                        /* We successfully switched from instanced name to a template, let's continue */
+                }
+
+                if (path_is_absolute(t)) {
+                        if (ret_fragment_path)
+                                *ret_fragment_path = t;
+                        return 0;
+                }
+
+                id = t; /* not an absolute path: look this name up in the next iteration */
+        }
+
+        return -ELOOP; /* no absolute path found within FOLLOW_MAX iterations */
+}
+
+static bool lookup_paths_mtime_exclude(const LookupPaths *lp, const char *path) {
+        /* Paths that are under our exclusive control. Users shall not alter those directly. */
+        const char *const ours[] = { lp->generator, lp->generator_early, lp->generator_late,
+                                     lp->transient, lp->persistent_control, lp->runtime_control };
+
+        for (size_t i = 0; i < sizeof(ours) / sizeof(ours[0]); i++)
+                if (streq_ptr(path, ours[i]))
+                        return true;
+        return false;
+}
+
+#define HASH_KEY SD_ID128_MAKE(4e,86,1b,e3,39,b3,40,46,98,5d,b8,11,34,8f,c3,c1)
+
+bool lookup_paths_timestamp_hash_same(const LookupPaths *lp, uint64_t timestamp_hash, uint64_t *ret_new) {
+        struct siphash hasher;
+
+        /* Hash the modification times of all search path directories that are not under our exclusive
+         * control, and report whether the result still equals timestamp_hash. */
+        siphash24_init(&hasher, HASH_KEY.bytes);
+
+        STRV_FOREACH(dir, lp->search_path) {
+                struct stat st;
+
+                if (lookup_paths_mtime_exclude(lp, *dir))
+                        continue;
+
+                if (stat(*dir, &st) < 0) {
+                        if (errno != ENOENT) /* missing directories are skipped without logging */
+                                log_debug_errno(errno, "Failed to stat %s, ignoring: %m", *dir);
+                        continue;
+                }
+
+                siphash24_compress_usec_t(timespec_load(&st.st_mtim), &hasher);
+        }
+
+        uint64_t fresh = siphash24_finalize(&hasher);
+        bool unchanged = fresh == timestamp_hash;
+        if (ret_new) /* hand the freshly computed hash back, if the caller asked for it */
+                *ret_new = fresh;
+        if (!unchanged)
+                log_debug("Modification times have changed, need to update cache.");
+        return unchanged;
+}
+
+static int directory_name_is_valid(const char *name) {
+
+        /* Accept a directory whose name ends in .wants/, .requires/, .upholds/ or .d/ and whose
+         * remaining part is either a valid unit name or the name of a unit type. Returns true or
+         * false, or the result of log_oom() if the name cannot be duplicated. */
+
+        FOREACH_STRING(suffix, ".wants", ".requires", ".upholds", ".d") {
+                const char *tail = endswith(name, suffix);
+                if (!tail)
+                        continue;
+
+                /* Everything in front of the matched suffix */
+                _cleanup_free_ char *stem = strndup(name, tail - name);
+                if (!stem)
+                        return log_oom();
+
+                if (unit_name_is_valid(stem, UNIT_NAME_ANY))
+                        return true;
+                if (unit_type_from_string(stem) >= 0)
+                        return true;
+        }
+
+        return false;
+}
+
+int unit_file_resolve_symlink(
+                const char *root_dir,
+                char **search_path,
+                const char *dir,
+                int dirfd,
+                const char *filename,
+                bool resolve_destination_target,
+                char **ret_destination) {
+
+        _cleanup_free_ char *target = NULL, *simplified = NULL, *dst = NULL, *_dir = NULL, *_filename = NULL;
+        int r;
+
+        /* This can be called with either dir+dirfd valid and filename just a name,
+         * or !dir && dirfd==AT_FDCWD, and filename being a full path.
+         *
+         * If resolve_destination_target is true, an absolute path will be returned.
+         * If not, an absolute path is returned for linked unit files, and a relative
+         * path otherwise. The string stored in *ret_destination is owned by the caller.
+         *
+         * Returns an error, false if this is an alias, true if it's a linked unit file. */
+
+        assert(filename);
+        assert(ret_destination);
+        assert(dir || path_is_absolute(filename));
+        assert(dirfd >= 0 || dirfd == AT_FDCWD);
+
+        r = readlinkat_malloc(dirfd, filename, &target);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to read symlink %s%s%s: %m",
+                                         strempty(dir), dir ? "/" : "", filename); /* dir may be NULL here */
+
+        if (!dir) {
+                r = path_extract_directory(filename, &_dir);
+                if (r < 0)
+                        return r;
+                dir = _dir;
+
+                r = path_extract_filename(filename, &_filename);
+                if (r < 0)
+                        return r;
+                if (r == O_DIRECTORY)
+                        return log_warning_errno(SYNTHETIC_ERRNO(EISDIR),
+                                                 "Unexpected path to a directory \"%s\", refusing.", filename);
+                filename = _filename;
+        }
+
+        bool is_abs = path_is_absolute(target);
+        if (root_dir || !is_abs) {
+                char *target_abs = path_join(is_abs ? root_dir : dir, target);
+                if (!target_abs)
+                        return log_oom();
+
+                free_and_replace(target, target_abs);
+        }
+
+        /* Get rid of "." and ".." components in target path */
+        r = chase(target, root_dir, CHASE_NOFOLLOW | CHASE_NONEXISTENT, &simplified, NULL);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to resolve symlink %s/%s pointing to %s: %m",
+                                         dir, filename, target);
+
+        assert(path_is_absolute(simplified));
+
+        /* Check if the symlink remains inside of our search path.
+         * If yes, it is an alias. Verify that it is valid.
+         *
+         * If no, then this is a linked unit file or mask, and we don't care about the target name
+         * when loading units, and we return the link *source* (resolve_destination_target == false);
+         * When this is called for installation purposes, we want the final destination,
+         * so we return the *target*.
+         */
+        const char *tail = path_startswith_strv(simplified, search_path);
+        if (tail) {  /* An alias */
+                _cleanup_free_ char *target_name = NULL;
+
+                r = path_extract_filename(simplified, &target_name);
+                if (r < 0)
+                        return r;
+
+                r = unit_validate_alias_symlink_or_warn(LOG_NOTICE, filename, simplified);
+                if (r < 0)
+                        return r;
+                if (is_path(tail))
+                        log_warning("Suspicious symlink %s/%s %s %s, treating as alias.",
+                                    dir, filename, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), simplified);
+
+                dst = resolve_destination_target ? TAKE_PTR(simplified) : TAKE_PTR(target_name);
+
+        } else {
+                log_debug("Linked unit file: %s/%s %s %s", dir, filename, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), simplified);
+
+                if (resolve_destination_target)
+                        dst = TAKE_PTR(simplified);
+                else {
+                        dst = path_join(dir, filename);
+                        if (!dst)
+                                return log_oom();
+                }
+        }
+
+        *ret_destination = TAKE_PTR(dst);
+        return !tail;  /* true if linked unit file */
+}
+
+int unit_file_build_name_map(
+                const LookupPaths *lp,
+                uint64_t *cache_timestamp_hash,
+                Hashmap **unit_ids_map,
+                Hashmap **unit_names_map,
+                Set **path_cache) {
+
+        /* Build two mappings: any name → main unit (i.e. the end result of symlink resolution), unit name →
+         * all aliases (i.e. the entry for a given key is a list of all names which point to this key). The
+         * key is included in the value iff we saw a file or symlink with that name. In other words, if we
+         * have a key, but it is not present in the value for itself, there was an alias pointing to it, but
+         * the unit itself is not loadable.
+         *
+         * At the same time, build a cache of paths where to find units. The non-const parameters are for
+         * input and output. Existing contents will be freed before the new contents are stored.
+         *
+         * cache_timestamp_hash and path_cache are optional (may be NULL): without the former the maps are
+         * always rebuilt, without the latter directories are skipped and no path cache is produced.
+         *
+         * Returns 0 if the passed timestamp hash is still valid (nothing is touched), 1 if the maps were
+         * rebuilt and stored, and a negative errno-style value on failure (outputs are left unmodified).
+         */
+
+        _cleanup_hashmap_free_ Hashmap *ids = NULL, *names = NULL;
+        _cleanup_set_free_free_ Set *paths = NULL;
+        _cleanup_strv_free_ char **expanded_search_path = NULL;
+        uint64_t timestamp_hash;
+        int r;
+
+        /* Before doing anything, check if the timestamp hash that was passed is still valid.
+         * If yes, do nothing. */
+        if (cache_timestamp_hash &&
+            lookup_paths_timestamp_hash_same(lp, *cache_timestamp_hash, &timestamp_hash))
+                return 0;
+
+        /* The timestamp hash is now set based on the mtimes from before when we start reading files.
+         * If anything is modified concurrently, we'll consider the cache outdated. */
+
+        if (path_cache) {
+                paths = set_new(&path_hash_ops_free);
+                if (!paths)
+                        return log_oom();
+        }
+
+        /* Go over all our search paths, chase their symlinks and store the result in the
+         * expanded_search_path list.
+         *
+         * This is important for cases where any of the unit directories itself are symlinks into other
+         * directories and would therefore cause all of the unit files to be recognized as linked units.
+         *
+         * This is important for distributions such as NixOS where most paths in /etc/ are symlinks to some
+         * other location on the filesystem (e.g.  into /nix/store/).
+         *
+         * Search paths are ordered by priority (highest first), and we need to maintain this order.
+         * If a resolved path is already in the list, we don't need to include it again.
+         *
+         * Note that we build a list that contains both the original paths and the resolved symlinks:
+         * we need the latter for the case where the directory is symlinked, as described above, and
+         * the former for the case where some unit file alias is a dangling symlink that points to one
+         * of the "original" directories (and can't be followed).
+         */
+        STRV_FOREACH(dir, lp->search_path) {
+                _cleanup_free_ char *resolved_dir = NULL;
+
+                r = strv_extend(&expanded_search_path, *dir);
+                if (r < 0)
+                        return log_oom();
+
+                r = chase(*dir, NULL, 0, &resolved_dir, NULL);
+                if (r < 0) {
+                        if (r != -ENOENT)
+                                log_warning_errno(r, "Failed to resolve symlink %s, ignoring: %m", *dir);
+                        continue;
+                }
+
+                if (strv_contains(expanded_search_path, resolved_dir))
+                        continue;
+
+                if (strv_consume(&expanded_search_path, TAKE_PTR(resolved_dir)) < 0)
+                        return log_oom();
+        }
+
+        STRV_FOREACH(dir, lp->search_path) {
+                _cleanup_closedir_ DIR *d = NULL;
+
+                d = opendir(*dir);
+                if (!d) {
+                        if (errno != ENOENT)
+                                log_warning_errno(errno, "Failed to open \"%s\", ignoring: %m", *dir);
+                        continue;
+                }
+
+                FOREACH_DIRENT_ALL(de, d, log_warning_errno(errno, "Failed to read \"%s\", ignoring: %m", *dir)) {
+                        _unused_ _cleanup_free_ char *_filename_free = NULL;
+                        char *filename;
+                        _cleanup_free_ char *dst = NULL;
+                        bool symlink_to_dir = false;
+
+                        /* We only care about valid units and dirs with certain suffixes, let's ignore the
+                         * rest. */
+
+                        if (de->d_type == DT_REG) {
+
+                                /* Accept a regular file whose name is a valid unit file name. */
+                                if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY))
+                                        continue;
+
+                        } else if (de->d_type == DT_DIR) {
+
+                                if (!paths) /* Skip directories early unless path_cache is requested */
+                                        continue;
+
+                                r = directory_name_is_valid(de->d_name);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        continue;
+
+                        } else if (de->d_type == DT_LNK) {
+
+                                /* Accept a symlink file whose name is a valid unit file name or
+                                 * ending in .wants/, .requires/ or .d/. */
+
+                                if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY)) {
+                                        _cleanup_free_ char *target = NULL;
+
+                                        if (!paths) /* Skip symlink to a directory early unless path_cache is requested */
+                                                continue;
+
+                                        r = directory_name_is_valid(de->d_name);
+                                        if (r < 0)
+                                                return r;
+                                        if (r == 0)
+                                                continue;
+
+                                        r = readlinkat_malloc(dirfd(d), de->d_name, &target);
+                                        if (r < 0) {
+                                                log_warning_errno(r, "Failed to read symlink %s/%s, ignoring: %m",
+                                                                  *dir, de->d_name);
+                                                continue;
+                                        }
+
+                                        /* NOTE(review): the link target is passed on exactly as read; a
+                                         * relative target is presumably evaluated against the current
+                                         * working directory rather than *dir — confirm. */
+                                        r = is_dir(target, /* follow = */ true);
+                                        if (r <= 0)
+                                                continue;
+
+                                        symlink_to_dir = true;
+                                }
+
+                        } else
+                                continue;
+
+                        filename = path_join(*dir, de->d_name);
+                        if (!filename)
+                                return log_oom();
+
+                        /* Ownership of 'filename': if it ends up stored in 'paths' the set owns it,
+                         * otherwise _filename_free makes sure it is released at the end of the iteration. */
+                        if (paths) {
+                                r = set_put(paths, filename);
+                                if (r < 0)
+                                        return log_oom();
+                                if (r == 0)
+                                        _filename_free = filename; /* Make sure we free the filename. */
+                        } else
+                                _filename_free = filename; /* Make sure we free the filename. */
+
+                        if (de->d_type == DT_DIR || (de->d_type == DT_LNK && symlink_to_dir))
+                                continue;
+
+                        assert(IN_SET(de->d_type, DT_REG, DT_LNK));
+
+                        /* search_path is ordered by priority (highest first). If the name is already mapped
+                         * to something (incl. itself), it means that we have already seen it, and we should
+                         * ignore it here. */
+                        if (hashmap_contains(ids, de->d_name))
+                                continue;
+
+                        if (de->d_type == DT_LNK) {
+                                /* We don't explicitly check for alias loops here. unit_ids_map_get() which
+                                 * limits the number of hops should be used to access the map. */
+
+                                r = unit_file_resolve_symlink(lp->root_dir, expanded_search_path,
+                                                              *dir, dirfd(d), de->d_name,
+                                                              /* resolve_destination_target= */ false,
+                                                              &dst);
+                                if (r == -ENOMEM)
+                                        return r;
+                                if (r < 0)  /* we ignore other errors here */
+                                        continue;
+
+                        } else {
+                                dst = TAKE_PTR(_filename_free); /* Grab the copy we made previously, if available. */
+                                if (!dst) {
+                                        dst = strdup(filename);
+                                        if (!dst)
+                                                return log_oom();
+                                }
+
+                                log_debug("%s: normal unit file: %s", __func__, dst);
+                        }
+
+                        _cleanup_free_ char *key = strdup(de->d_name);
+                        if (!key)
+                                return log_oom();
+
+                        r = hashmap_ensure_put(&ids, &string_hash_ops_free_free, key, dst);
+                        if (r < 0)
+                                return log_warning_errno(r, "Failed to add entry to hashmap (%s%s%s): %m",
+                                                         de->d_name, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), dst);
+                        key = dst = NULL; /* Both strings are now referenced by the hashmap, disarm cleanup. */
+                }
+        }
+
+        /* Let's also put the names in the reverse db. */
+        const char *dummy, *src;
+        HASHMAP_FOREACH_KEY(dummy, src, ids) {
+                _cleanup_free_ char *inst = NULL, *dst_inst = NULL;
+                const char *dst;
+
+                r = unit_ids_map_get(ids, src, &dst);
+                if (r < 0)
+                        continue;
+
+                if (null_or_empty_path(dst) != 0)
+                        continue;
+
+                dst = basename(dst);
+
+                /* If we have a symlink from an instance name to a template name, it is an alias just for
+                 * this specific instance, foo@id.service ↔ template@id.service. */
+                if (unit_name_is_valid(dst, UNIT_NAME_TEMPLATE)) {
+                        UnitNameFlags t = unit_name_to_instance(src, &inst);
+                        if (t < 0)
+                                return log_error_errno(t, "Failed to extract instance part from %s: %m", src);
+                        if (t == UNIT_NAME_INSTANCE) {
+                                r = unit_name_replace_instance(dst, inst, &dst_inst);
+                                if (r < 0) {
+                                        /* This might happen e.g. if the combined length is too large.
+                                         * Let's not make too much of a fuss. */
+                                        log_debug_errno(r, "Failed to build alias name (%s + %s), ignoring: %m",
+                                                        dst, inst);
+                                        continue;
+                                }
+
+                                dst = dst_inst;
+                        }
+                }
+
+                r = string_strv_hashmap_put(&names, dst, src);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to add entry to hashmap (%s%s%s): %m",
+                                                 dst, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), src);
+        }
+
+        /* Everything succeeded: only now publish the new hash and hand the freshly built containers
+         * over to the caller, replacing (and freeing) whatever was stored there before. */
+        if (cache_timestamp_hash)
+                *cache_timestamp_hash = timestamp_hash;
+
+        hashmap_free_and_replace(*unit_ids_map, ids);
+        hashmap_free_and_replace(*unit_names_map, names);
+        if (path_cache)
+                set_free_and_replace(*path_cache, paths);
+
+        return 1;
+}
+
+/* Inserts a copy of 'name' into the set '*names' via set_put_strdup() and passes that call's return
+ * value through to the caller. A debug message is logged only when the call returned a positive value
+ * (presumably: the entry was newly inserted — confirm against set_put_strdup()) and 'name' differs
+ * from 'unit_name', i.e. when an actual alias was recorded. */
+static int add_name(
+                const char *unit_name,
+                Set **names,
+                const char *name) {
+
+        int added;
+
+        assert(names);
+        assert(name);
+
+        added = set_put_strdup(names, name);
+        if (added <= 0)
+                return added; /* Failure, or nothing worth announcing. */
+
+        if (!streq(unit_name, name))
+                log_debug("Unit %s has alias %s.", unit_name, name);
+
+        return added;
+}
+
+/* Collects into '*names' the names that belong to 'unit_name', starting from 'name':
+ *  - 'name' itself, unless we are dealing with a template,
+ *  - every alias registered for 'name' in 'unit_name_map'; for an instantiated unit, template aliases
+ *    are instantiated with 'instance' first.
+ *
+ * 'fragment_basename' is only set when names are added based on the fragment path; in that case
+ * instantiated aliases whose own fragment has a different basename are left out.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int add_names(
+                Hashmap *unit_ids_map,
+                Hashmap *unit_name_map,
+                const char *unit_name,
+                const char *fragment_basename,  /* Only set when adding additional names based on fragment path */
+                UnitNameFlags name_type,
+                const char *instance,
+                Set **names,
+                const char *name) {
+
+        char **aliases;
+        int r;
+
+        assert(name_type == UNIT_NAME_PLAIN || instance);
+
+        /* A non-template unit always carries its own name. When looking at a fragment, the fragment name
+         * (possibly with the instance inserted) is one of the unit names too. */
+        if (name_type != UNIT_NAME_TEMPLATE) {
+                r = add_name(unit_name, names, name);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Now pull in the aliases. There is no need to know which fragment will be used eventually:
+         * unit_name_map is expected to yield the same set of names for any of the aliases. */
+        aliases = hashmap_get(unit_name_map, name);
+        STRV_FOREACH(alias, aliases) {
+                _cleanup_free_ char *inst = NULL;
+                const char *inst_fragment = NULL;
+
+                if (name_type != UNIT_NAME_INSTANCE || !unit_name_is_valid(*alias, UNIT_NAME_TEMPLATE)) {
+                        /* The simple case: take the alias verbatim. */
+                        r = add_name(unit_name, names, *alias);
+                        if (r < 0)
+                                return r;
+
+                        continue;
+                }
+
+                /* An instance meets a template alias: instantiate the alias before recording it. */
+                r = unit_name_replace_instance(*alias, instance, &inst);
+                if (r < 0)
+                        return log_debug_errno(r, "Cannot build instance name %s + %s: %m",
+                                               *alias, instance);
+
+                /* Exclude any aliases that point in some other direction.
+                 *
+                 * See https://github.com/systemd/systemd/pull/13119#discussion_r308145418. */
+                r = unit_ids_map_get(unit_ids_map, inst, &inst_fragment);
+                if (r < 0 && r != -ENOENT && r != -ENXIO)
+                        return log_debug_errno(r, "Cannot find instance fragment %s: %m", inst);
+
+                if (inst_fragment &&
+                    fragment_basename &&
+                    !streq(basename(inst_fragment), fragment_basename)) {
+                        log_debug("Instance %s has fragment %s and is not an alias of %s.",
+                                  inst, inst_fragment, unit_name);
+                        continue;
+                }
+
+                r = add_name(unit_name, names, inst);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int unit_file_find_fragment(
+                Hashmap *unit_ids_map,
+                Hashmap *unit_name_map,
+                const char *unit_name,
+                const char **ret_fragment_path,
+                Set **ret_names) {
+
+        _cleanup_free_ char *tmpl = NULL, *instance = NULL;
+        _cleanup_set_free_ Set *names = NULL;
+        const char *fragment = NULL;
+        UnitNameFlags name_type;
+        int r;
+
+        /* Looks up the fragment path for a unit name and, if ret_names is set, also collects the set of
+         * names of the unit. Given …/foo.service and …/foo-alias.service→foo.service,
+         * and …/foo@.service and …/foo-alias@.service→foo@.service,
+         * and …/foo@inst.service,
+         * the results are:
+         * foo.service → …/foo.service, {foo.service, foo-alias.service},
+         * foo-alias.service → …/foo.service, {foo.service, foo-alias.service},
+         * foo@.service → …/foo@.service, {foo@.service, foo-alias@.service},
+         * foo-alias@.service → …/foo@.service, {foo@.service, foo-alias@.service},
+         * foo@bar.service → …/foo@.service, {foo@bar.service, foo-alias@bar.service},
+         * foo-alias@bar.service → …/foo@.service, {foo@bar.service, foo-alias@bar.service},
+         * foo-alias@inst.service → …/foo@inst.service, {foo@inst.service, foo-alias@inst.service}.
+         *
+         * Returns 0 on success (with *ret_fragment_path possibly NULL if nothing was found), negative
+         * errno-style value on failure.
+         */
+
+        name_type = unit_name_to_instance(unit_name, &instance);
+        if (name_type < 0)
+                return name_type;
+
+        if (ret_names) {
+                /* Start out with the names derived from the requested name itself. */
+                r = add_names(unit_ids_map, unit_name_map, unit_name, NULL, name_type, instance, &names, unit_name);
+                if (r < 0)
+                        return r;
+        }
+
+        /* The original name gets the first shot at providing a fragment. "Not found" style errors are
+         * not fatal here. */
+        r = unit_ids_map_get(unit_ids_map, unit_name, &fragment);
+        if (r < 0 && r != -ENOENT && r != -ENXIO)
+                return log_debug_errno(r, "Cannot load unit %s: %m", unit_name);
+
+        if (!fragment && name_type == UNIT_NAME_INSTANCE) {
+                /* Nothing for the instance itself, so fall back to the template. */
+
+                r = unit_name_template(unit_name, &tmpl);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to determine template name: %m");
+
+                r = unit_ids_map_get(unit_ids_map, tmpl, &fragment);
+                if (r < 0 && r != -ENOENT && r != -ENXIO)
+                        return log_debug_errno(r, "Cannot load template %s: %m", tmpl);
+        }
+
+        if (ret_names && fragment) {
+                _cleanup_free_ char *fragment_basename = NULL;
+
+                r = path_extract_filename(fragment, &fragment_basename);
+                if (r < 0)
+                        return r;
+
+                /* If the fragment is named differently than the unit, the names derived from the
+                 * fragment name belong to the set as well. */
+                if (!streq(fragment_basename, unit_name)) {
+                        r = add_names(unit_ids_map, unit_name_map, unit_name, fragment_basename, name_type, instance, &names, fragment_basename);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        *ret_fragment_path = fragment;
+        if (ret_names)
+                *ret_names = TAKE_PTR(names);
+
+        return 0;
+}
+
+/* Lookup table used by runlevel_to_target(): a flat, NULL-terminated list of pairs, where each
+ * even index holds the word to match and the following odd index the target it maps to. */
+static const char * const rlmap[] = {
+        "emergency", SPECIAL_EMERGENCY_TARGET,
+        "-b",        SPECIAL_EMERGENCY_TARGET,
+        "rescue",    SPECIAL_RESCUE_TARGET,
+        "single",    SPECIAL_RESCUE_TARGET,
+        "-s",        SPECIAL_RESCUE_TARGET,
+        "s",         SPECIAL_RESCUE_TARGET,
+        "S",         SPECIAL_RESCUE_TARGET,
+        "1",         SPECIAL_RESCUE_TARGET,
+        "2",         SPECIAL_MULTI_USER_TARGET,
+        "3",         SPECIAL_MULTI_USER_TARGET,
+        "4",         SPECIAL_MULTI_USER_TARGET,
+        "5",         SPECIAL_GRAPHICAL_TARGET,
+        NULL
+};
+
+/* Same pair layout as rlmap, but this is the table runlevel_to_target() consults when in_initrd()
+ * is true; the words are matched after the "rd." prefix has been stripped. */
+static const char * const rlmap_initrd[] = {
+        "emergency", SPECIAL_EMERGENCY_TARGET,
+        "rescue",    SPECIAL_RESCUE_TARGET,
+        NULL
+};
+
+/* Translates a runlevel-style word into the name of the corresponding target unit. In the initrd only
+ * words carrying an "rd." prefix are considered, and the reduced initrd table is used for the lookup.
+ * Returns NULL if 'word' is NULL or does not match any table entry. */
+const char* runlevel_to_target(const char *word) {
+        const char * const *entry;
+
+        if (!word)
+                return NULL;
+
+        if (in_initrd()) {
+                /* Strip the mandatory prefix; anything without it is not meant for us. */
+                word = startswith(word, "rd.");
+                if (!word)
+                        return NULL;
+        }
+
+        /* The tables are flat lists of (word, target) pairs, hence we advance two slots at a time. */
+        for (entry = in_initrd() ? rlmap_initrd : rlmap; entry[0]; entry += 2)
+                if (streq(word, entry[0]))
+                        return entry[1];
+
+        return NULL;
+}
diff --git a/src/basic/unit-file.h b/src/basic/unit-file.h
new file mode 100644
index 0000000..1c43861
--- /dev/null
+++ b/src/basic/unit-file.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "hashmap.h"
+#include "path-lookup.h"
+#include "time-util.h"
+#include "unit-name.h"
+
+typedef enum UnitFileState UnitFileState;
+
+/* The states a unit file can be in. The regular values are numbered consecutively from 0, so that
+ * _UNIT_FILE_STATE_MAX equals the number of regular states. _UNIT_FILE_STATE_INVALID is the
+ * negative value -EINVAL. */
+enum UnitFileState {
+        UNIT_FILE_ENABLED,
+        UNIT_FILE_ENABLED_RUNTIME,
+        UNIT_FILE_LINKED,
+        UNIT_FILE_LINKED_RUNTIME,
+        UNIT_FILE_ALIAS,
+        UNIT_FILE_MASKED,
+        UNIT_FILE_MASKED_RUNTIME,
+        UNIT_FILE_STATIC,
+        UNIT_FILE_DISABLED,
+        UNIT_FILE_INDIRECT,
+        UNIT_FILE_GENERATED,
+        UNIT_FILE_TRANSIENT,
+        UNIT_FILE_BAD,
+        _UNIT_FILE_STATE_MAX,
+        _UNIT_FILE_STATE_INVALID = -EINVAL,
+};
+
+/* Predicates on unit types. NOTE(review): presumably they report whether units of the given type may
+ * have aliases / may be templates — the definitions are not part of this excerpt, confirm there. */
+bool unit_type_may_alias(UnitType type) _const_;
+bool unit_type_may_template(UnitType type) _const_;
+
+/* Checks on alias symlinks. unit_file_resolve_symlink() calls the _or_warn() variant with the symlink
+ * name and the resolved target and treats a negative return value as failure. */
+int unit_symlink_name_compatible(const char *symlink, const char *target, bool instance_propagation);
+int unit_validate_alias_symlink_or_warn(int log_level, const char *filename, const char *target);
+
+/* Used by unit_file_build_name_map() to decide whether a cached name map is still current: a true
+ * return value means nothing needs to be rebuilt. The value stored in *ret_new is what the caller
+ * saves as its new cache hash after a rebuild. */
+bool lookup_paths_timestamp_hash_same(const LookupPaths *lp, uint64_t timestamp_hash, uint64_t *ret_new);
+
+/* Resolves the symlink 'filename' (looked up relative to 'dirfd'). On success a newly allocated
+ * string is stored in *ret_destination, and the return value is > 0 if the symlink points outside
+ * of 'search_path' (a linked unit file) and 0 if it is an alias inside of it. Returns a negative
+ * errno-style value on failure. */
+int unit_file_resolve_symlink(
+                const char *root_dir,
+                char **search_path,
+                const char *dir,
+                int dirfd,
+                const char *filename,
+                bool resolve_destination_target,
+                char **ret_destination);
+
+/* (Re)builds the name → fragment map, the name → aliases map and, optionally, the path cache for the
+ * given lookup paths. Returns 0 if the cache identified by *cache_timestamp_hash is still valid,
+ * 1 if the maps were rebuilt, negative errno-style value on failure. */
+int unit_file_build_name_map(
+                const LookupPaths *lp,
+                uint64_t *cache_timestamp_hash,
+                Hashmap **unit_ids_map,
+                Hashmap **unit_names_map,
+                Set **path_cache);
+
+/* Looks up the fragment path for 'unit_name' in the maps built by unit_file_build_name_map() and,
+ * if ret_names is non-NULL, also returns the set of names of the unit. Returns 0 on success (the
+ * fragment path may be NULL if none was found), negative errno-style value on failure. */
+int unit_file_find_fragment(
+                Hashmap *unit_ids_map,
+                Hashmap *unit_name_map,
+                const char *unit_name,
+                const char **ret_fragment_path,
+                Set **ret_names);
+
+/* Maps a runlevel-style word (e.g. "rescue", "3") to the matching target unit name, or returns NULL
+ * if there is no match. */
+const char* runlevel_to_target(const char *rl);
diff --git a/src/basic/unit-name.c b/src/basic/unit-name.c
new file mode 100644
index 0000000..8bf28ba
--- /dev/null
+++ b/src/basic/unit-name.c
@@ -0,0 +1,916 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "glob-util.h"
+#include "hexdecoct.h"
+#include "memory-util.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "sparse-endian.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+
+/* Characters valid in a unit name: everything in DIGITS and LETTERS (both defined elsewhere), plus
+ * ':', '-', '_', '.' and the backslash. */
+#define VALID_CHARS                             \
+        DIGITS                                  \
+        LETTERS                                 \
+        ":-_.\\"
+
+/* The same, but also permits the @ character that separates prefix and instance. */
+#define VALID_CHARS_WITH_AT                     \
+        "@"                                     \
+        VALID_CHARS
+
+/* All chars valid in a unit name glob: the above plus the glob special characters. */
+#define VALID_CHARS_GLOB                        \
+        VALID_CHARS_WITH_AT                     \
+        "[]!-*?"
+
+/* NOTE(review): the two constants below are not referenced in this part of the file; judging by the
+ * names they are presumably the hash key and the length (in characters) of the hash used when
+ * shortening overlong unit names — confirm at the place of use. */
+#define LONG_UNIT_NAME_HASH_KEY SD_ID128_MAKE(ec,f2,37,fb,58,32,4a,32,84,9f,06,9b,0d,21,eb,9a)
+#define UNIT_NAME_HASH_LENGTH_CHARS 16
+
+/* Checks whether 'n' is a syntactically valid unit name of one of the kinds selected in 'flags'
+ * (any combination of UNIT_NAME_PLAIN, UNIT_NAME_INSTANCE, UNIT_NAME_TEMPLATE).
+ *
+ * A valid name is non-empty, shorter than UNIT_NAME_MAX, ends in "." followed by a known unit type,
+ * has a non-empty part before that last dot which consists of valid characters only, and does not
+ * begin with "@". Which kind it is depends on the first "@": none → plain, "@" directly in front of
+ * the last dot → template, otherwise → instance. */
+bool unit_name_is_valid(const char *n, UnitNameFlags flags) {
+        const char *dot, *first_at = NULL;
+
+        assert((flags & ~(UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE)) == 0);
+
+        if (_unlikely_(flags == 0))
+                return false;
+
+        if (isempty(n))
+                return false;
+
+        if (strlen(n) >= UNIT_NAME_MAX)
+                return false;
+
+        /* The suffix starts at the last dot, and there must be something in front of it. */
+        dot = strrchr(n, '.');
+        if (!dot || dot == n)
+                return false;
+
+        if (unit_type_from_string(dot + 1) < 0)
+                return false;
+
+        /* Validate everything up to the suffix, remembering where the first @ is. */
+        for (const char *c = n; c < dot; c++) {
+                if (!strchr(VALID_CHARS_WITH_AT, *c))
+                        return false;
+
+                if (*c == '@' && !first_at)
+                        first_at = c;
+        }
+
+        /* An empty prefix is never acceptable. */
+        if (first_at == n)
+                return false;
+
+        if (!first_at)
+                return (flags & UNIT_NAME_PLAIN) != 0;
+
+        if (dot > first_at + 1)
+                return (flags & UNIT_NAME_INSTANCE) != 0;
+
+        /* The @ sits right in front of the suffix, i.e. the instance is empty. */
+        return (flags & UNIT_NAME_TEMPLATE) != 0;
+}
+
+/* A prefix is valid if it is non-empty and made up of VALID_CHARS only. Note that this excludes
+ * "@", which must not show up in the prefix string. */
+bool unit_prefix_is_valid(const char *p) {
+        return !isempty(p) && in_charset(p, VALID_CHARS);
+}
+
+/* An instance string is valid if it is non-empty and made up of valid unit name characters, where
+ * — unlike in the prefix — additional "@" characters are fine. The maximum length depends on the
+ * rest of the name, hence it is not checked here. */
+bool unit_instance_is_valid(const char *i) {
+        return !isempty(i) && in_charset(i, VALID_CHARS_WITH_AT);
+}
+
+/* A suffix is valid if it consists of a leading "." followed by the name of a known unit type. */
+bool unit_suffix_is_valid(const char *s) {
+        return !isempty(s) &&
+                s[0] == '.' &&
+                unit_type_from_string(s + 1) >= 0;
+}
+
+/* Extracts the prefix of a unit name, i.e. everything in front of the first "@", or — if there is
+ * none — in front of the last ".". On success stores a newly allocated string in *ret and returns 0;
+ * returns -EINVAL for invalid unit names and -ENOMEM if memory is short. */
+int unit_name_to_prefix(const char *n, char **ret) {
+        const char *at, *end;
+        char *prefix;
+
+        assert(n);
+        assert(ret);
+
+        if (!unit_name_is_valid(n, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        at = strchr(n, '@');
+        end = at ? at : strrchr(n, '.');
+
+        /* A valid unit name always has a suffix, hence we must have found something. */
+        assert_se(end);
+
+        prefix = strndup(n, end - n);
+        if (!prefix)
+                return -ENOMEM;
+
+        *ret = prefix;
+        return 0;
+}
+
+/* Determines the kind of the unit name 'n' and optionally extracts its instance string.
+ *
+ * Returns UNIT_NAME_PLAIN (and sets *ret to NULL) if there is no "@" in the name, UNIT_NAME_TEMPLATE
+ * if the instance is empty (*ret is then an empty string), and UNIT_NAME_INSTANCE otherwise.
+ * Returns -EINVAL for invalid names and -ENOMEM on allocation failure. 'ret' may be NULL if the
+ * caller only cares about the kind. */
+UnitNameFlags unit_name_to_instance(const char *n, char **ret) {
+        const char *at, *start, *dot;
+
+        assert(n);
+
+        if (!unit_name_is_valid(n, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        at = strchr(n, '@');
+        if (!at) {
+                if (ret)
+                        *ret = NULL;
+                return UNIT_NAME_PLAIN;
+        }
+
+        /* The instance is whatever lies between the first @ and the last dot. */
+        start = at + 1;
+        dot = strrchr(start, '.');
+        if (!dot)
+                return -EINVAL;
+
+        if (ret) {
+                char *copy = strndup(start, dot - start);
+                if (!copy)
+                        return -ENOMEM;
+
+                *ret = copy;
+        }
+
+        if (dot > start)
+                return UNIT_NAME_INSTANCE;
+
+        return UNIT_NAME_TEMPLATE;
+}
+
+/* Strips the suffix off a unit name: stores a newly allocated copy of everything in front of the
+ * last "." in *ret and returns 0. Returns -EINVAL for invalid names, -ENOMEM if out of memory. */
+int unit_name_to_prefix_and_instance(const char *n, char **ret) {
+        const char *dot;
+        char *stem;
+
+        assert(n);
+        assert(ret);
+
+        if (!unit_name_is_valid(n, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        dot = strrchr(n, '.');
+        if (!dot)
+                return -EINVAL;
+
+        stem = strndup(n, dot - n);
+        if (!stem)
+                return -ENOMEM;
+
+        *ret = stem;
+        return 0;
+}
+
+/* Returns the unit type encoded in the suffix of the unit name 'n', or _UNIT_TYPE_INVALID if 'n'
+ * is not a valid unit name. */
+UnitType unit_name_to_type(const char *n) {
+        const char *dot;
+
+        assert(n);
+
+        if (!unit_name_is_valid(n, UNIT_NAME_ANY))
+                return _UNIT_TYPE_INVALID;
+
+        /* Validation succeeded, so there has to be a dot. */
+        dot = strrchr(n, '.');
+        assert_se(dot);
+
+        return unit_type_from_string(dot + 1);
+}
+
+/* Replaces the suffix of the unit name 'n' by 'suffix' (which has to include the leading dot). On
+ * success the new name is stored in *ret as a newly allocated string and 0 is returned. Returns
+ * -EINVAL if the name, the suffix or the resulting name is invalid, -ENOMEM if out of memory. */
+int unit_name_change_suffix(const char *n, const char *suffix, char **ret) {
+        _cleanup_free_ char *renamed = NULL;
+        size_t stem_len, suffix_len;
+        const char *dot;
+        char *tail;
+
+        assert(n);
+        assert(suffix);
+        assert(ret);
+
+        if (!unit_name_is_valid(n, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        if (!unit_suffix_is_valid(suffix))
+                return -EINVAL;
+
+        dot = strrchr(n, '.');
+        assert_se(dot);
+
+        stem_len = dot - n;
+        suffix_len = strlen(suffix);
+
+        renamed = new(char, stem_len + suffix_len + 1);
+        if (!renamed)
+                return -ENOMEM;
+
+        /* First the part in front of the old suffix, then the new suffix including its NUL byte. */
+        tail = mempcpy(renamed, n, stem_len);
+        strcpy(tail, suffix);
+
+        /* The new suffix might be longer than the old one, so make sure the result is still a valid
+         * name. */
+        if (!unit_name_is_valid(renamed, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        *ret = TAKE_PTR(renamed);
+        return 0;
+}
+
+/* Builds a unit name from prefix, optional instance and a suffix given as string (with leading dot).
+ * Returns -EINVAL if the suffix does not begin with a dot, the negative value returned by
+ * unit_type_from_string() if the type is unknown, and otherwise whatever
+ * unit_name_build_from_type() returns. */
+int unit_name_build(const char *prefix, const char *instance, const char *suffix, char **ret) {
+        UnitType t;
+
+        assert(prefix);
+        assert(suffix);
+        assert(ret);
+
+        if (suffix[0] != '.')
+                return -EINVAL;
+
+        /* Translate the textual suffix into a type and let the typed variant do the actual work. */
+        t = unit_type_from_string(suffix + 1);
+        if (t < 0)
+                return t;
+
+        return unit_name_build_from_type(prefix, instance, t, ret);
+}
+
+/* Builds "prefix.type" or, if 'instance' is non-NULL, "prefix@instance.type". On success the newly
+ * allocated name is stored in *ret and 0 is returned. Returns -EINVAL if the prefix, the instance or
+ * the resulting name is invalid, and -ENOMEM if out of memory. */
+int unit_name_build_from_type(const char *prefix, const char *instance, UnitType type, char **ret) {
+        _cleanup_free_ char *name = NULL;
+        const char *type_name;
+
+        assert(prefix);
+        assert(type >= 0);
+        assert(type < _UNIT_TYPE_MAX);
+        assert(ret);
+
+        if (!unit_prefix_is_valid(prefix))
+                return -EINVAL;
+
+        type_name = unit_type_to_string(type);
+
+        if (instance && !unit_instance_is_valid(instance))
+                return -EINVAL;
+
+        name = instance ?
+                strjoin(prefix, "@", instance, ".", type_name) :
+                strjoin(prefix, ".", type_name);
+        if (!name)
+                return -ENOMEM;
+
+        /* The pieces were fine individually, but the combination might have grown too large (or be
+         * invalid for some other reason). */
+        if (!unit_name_is_valid(name, instance ? UNIT_NAME_INSTANCE : UNIT_NAME_PLAIN))
+                return -EINVAL;
+
+        *ret = TAKE_PTR(name);
+        return 0;
+}
+
+/* Writes the four character escape sequence "\xNN" for 'c' to 't', where NN are the two characters
+ * hexchar() returns for c >> 4 and for c. Returns the position right behind what was written; no
+ * NUL byte is appended. */
+static char *do_escape_char(char c, char *t) {
+        assert(t);
+
+        t[0] = '\\';
+        t[1] = 'x';
+        t[2] = hexchar(c >> 4);
+        t[3] = hexchar(c);
+
+        return t + 4;
+}
+
+/* Escapes f into the buffer t and returns the position behind the last byte written; the output is
+ * not NUL terminated here. '/' turns into '-'; '-', '\\' and everything outside of VALID_CHARS turn
+ * into "\xXX"; a '.' in the very first position is escaped as well. */
+static char *do_escape(const char *f, char *t) {
+        const char *p;
+
+        assert(f);
+        assert(t);
+
+        p = f;
+
+        /* Never generate unit names with a leading '.', as would happen for "/.dotdir" mount points */
+        if (*p == '.')
+                t = do_escape_char(*(p++), t);
+
+        while (*p) {
+                char c = *(p++);
+
+                if (c == '/')
+                        *(t++) = '-';
+                else if (c == '-' || c == '\\' || !strchr(VALID_CHARS, c))
+                        t = do_escape_char(c, t);
+                else
+                        *(t++) = c;
+        }
+
+        return t;
+}
+
+/* Returns a newly allocated, escaped copy of f (see do_escape() for the rules), or NULL if memory
+ * allocation fails. */
+char *unit_name_escape(const char *f) {
+        char *escaped;
+
+        assert(f);
+
+        /* In the worst case every input byte expands to a four byte "\xXX" sequence */
+        escaped = new(char, strlen(f)*4+1);
+        if (!escaped)
+                return NULL;
+
+        *do_escape(f, escaped) = 0;
+
+        return escaped;
+}
+
+/* Reverses unit_name_escape(): every '-' becomes '/', every "\xXX" sequence becomes the byte with that
+ * value, and all other characters are copied verbatim.
+ *
+ * On success stores a newly allocated string in *ret (ownership passes to the caller) and returns 0.
+ * Returns -ENOMEM if allocation fails, and -EINVAL if a backslash is not followed by 'x' and two
+ * characters that unhexchar() accepts. */
+int unit_name_unescape(const char *f, char **ret) {
+        _cleanup_free_ char *r = NULL;
+        char *t;
+
+        assert(f);
+        assert(ret); /* *ret is written unconditionally on success, hence insist on it like the sibling calls do */
+
+        /* No rule below emits more bytes than it consumes, hence a copy of the input is a large enough
+         * output buffer. */
+        r = strdup(f);
+        if (!r)
+                return -ENOMEM;
+
+        for (t = r; *f; f++) {
+                if (*f == '-')
+                        *(t++) = '/';
+                else if (*f == '\\') {
+                        int a, b;
+
+                        if (f[1] != 'x')
+                                return -EINVAL;
+
+                        /* A premature end of string shows up as NUL here, which unhexchar() refuses, so
+                         * we never read beyond the terminator. */
+                        a = unhexchar(f[2]);
+                        if (a < 0)
+                                return -EINVAL;
+
+                        b = unhexchar(f[3]);
+                        if (b < 0)
+                                return -EINVAL;
+
+                        *(t++) = (char) (((uint8_t) a << 4U) | (uint8_t) b);
+                        f += 3; /* together with the loop increment this skips all four bytes of "\xXX" */
+                } else
+                        *(t++) = *f;
+        }
+
+        *t = 0;
+
+        *ret = TAKE_PTR(r);
+
+        return 0;
+}
+
+/* Escapes a file system path for use in a unit name. The path is simplified first; an empty or root
+ * path maps to "-", any other path must be normalized and is escaped with leading and trailing
+ * slashes removed. Returns 0 and a newly allocated string in *ret, or a negative errno-style error. */
+int unit_name_path_escape(const char *f, char **ret) {
+        _cleanup_free_ char *simplified = NULL;
+        char *escaped;
+        int r;
+
+        assert(f);
+        assert(ret);
+
+        r = path_simplify_alloc(f, &simplified);
+        if (r < 0)
+                return r;
+
+        if (empty_or_root(simplified))
+                escaped = strdup("-");
+        else if (!path_is_normalized(simplified))
+                return -EINVAL;
+        else {
+                /* Drop the slashes at both ends before escaping */
+                delete_trailing_chars(simplified, "/");
+                escaped = unit_name_escape(skip_leading_chars(simplified, "/"));
+        }
+        if (!escaped)
+                return -ENOMEM;
+
+        *ret = escaped;
+        return 0;
+}
+
+/* Turns an escaped path (as generated by unit_name_path_escape()) back into an absolute path. "-"
+ * maps to "/"; anything else is unescaped, must neither start nor end with a slash, gets a slash
+ * prefixed and must then be a normalized path. ret may be NULL, in which case the input is only
+ * checked. Returns 0 on success, -EINVAL on malformed input, -ENOMEM on allocation failure. */
+int unit_name_path_unescape(const char *f, char **ret) {
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        assert(f);
+
+        if (isempty(f))
+                return -EINVAL;
+
+        if (streq(f, "-"))
+                path = strdup("/");
+        else {
+                _cleanup_free_ char *unescaped = NULL;
+
+                r = unit_name_unescape(f, &unescaped);
+                if (r < 0)
+                        return r;
+
+                /* Slashes at either end are refused */
+                if (startswith(unescaped, "/") || endswith(unescaped, "/"))
+                        return -EINVAL;
+
+                /* Put the leading slash back */
+                path = strjoin("/", unescaped);
+                if (path && !path_is_normalized(path))
+                        return -EINVAL;
+        }
+        if (!path)
+                return -ENOMEM;
+
+        if (ret)
+                *ret = TAKE_PTR(path);
+
+        return 0;
+}
+
+/* Substitutes the instance part of a template or instance unit name: everything of f up to and
+ * including the first '@' is kept, i is inserted, and the remainder of f starting at its last '.' is
+ * appended. Returns -EINVAL if f is neither a template nor an instance name, if i is not a valid
+ * instance string, or if the resulting name is not a valid instance name. */
+int unit_name_replace_instance(const char *f, const char *i, char **ret) {
+        _cleanup_free_ char *name = NULL;
+        const char *at, *dot;
+        size_t head_len, instance_len;
+        char *w;
+
+        assert(f);
+        assert(i);
+        assert(ret);
+
+        if (!unit_name_is_valid(f, UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE) ||
+            !unit_instance_is_valid(i))
+                return -EINVAL;
+
+        at = strchr(f, '@');
+        assert_se(at);
+        dot = strrchr(f, '.');
+        assert_se(dot);
+
+        head_len = (size_t) (at - f) + 1; /* the prefix plus the '@' itself */
+        instance_len = strlen(i);
+
+        name = new(char, head_len + instance_len + strlen(dot) + 1);
+        if (!name)
+                return -ENOMEM;
+
+        w = mempcpy(name, f, head_len);
+        w = mempcpy(w, i, instance_len);
+        strcpy(w, dot);
+
+        /* A longer instance string may have pushed the name over the size limit */
+        if (!unit_name_is_valid(name, UNIT_NAME_INSTANCE))
+                return -EINVAL;
+
+        *ret = TAKE_PTR(name);
+        return 0;
+}
+
+/* Derives the template name from an instance (or template) unit name by dropping whatever sits
+ * between the first '@' and the last '.', e.g. "foo@bar.service" becomes "foo@.service". Returns
+ * -EINVAL if f is neither an instance nor a template name, -ENOMEM on allocation failure. */
+int unit_name_template(const char *f, char **ret) {
+        const char *at, *dot;
+        size_t head_len;
+        char *name;
+
+        assert(f);
+        assert(ret);
+
+        if (!unit_name_is_valid(f, UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE))
+                return -EINVAL;
+
+        at = strchr(f, '@');
+        assert_se(at);
+        dot = strrchr(f, '.');
+        assert_se(dot);
+
+        head_len = (size_t) (at - f) + 1; /* the prefix plus the '@' itself */
+
+        name = new(char, head_len + strlen(dot) + 1);
+        if (!name)
+                return -ENOMEM;
+
+        strcpy(mempcpy(name, f, head_len), dot);
+
+        *ret = name;
+        return 0;
+}
+
+/* Checks whether name looks like the output of unit_name_hash_long(): a valid plain unit name whose
+ * last '.' is directly preceded by '_' plus UNIT_NAME_HASH_LENGTH_CHARS lowercase hex digits. */
+bool unit_name_is_hashed(const char *name) {
+        const char *dot, *hash;
+
+        if (!unit_name_is_valid(name, UNIT_NAME_PLAIN))
+                return false;
+
+        dot = strrchr(name, '.');
+        assert_se(dot);
+
+        /* There has to be room for the hash and the underscore in front of it */
+        if (dot - name < UNIT_NAME_HASH_LENGTH_CHARS + 1)
+                return false;
+
+        hash = dot - UNIT_NAME_HASH_LENGTH_CHARS;
+        if (hash[-1] != '_')
+                return false;
+
+        for (const char *c = hash; c < dot; c++)
+                if (!strchr(LOWERCASE_HEXDIGITS, *c))
+                        return false;
+
+        return true;
+}
+
+/* Shortens an overlong unit name so that it fits into UNIT_NAME_MAX: the name is truncated and "_",
+ * the hex encoded 64-bit siphash of the complete original name, and the original type suffix are
+ * appended.
+ *
+ * Returns 0 and a newly allocated name in *ret on success. Returns -EMSGSIZE if the name is shorter
+ * than UNIT_NAME_MAX (i.e. hashing is not necessary), -EINVAL if the name carries no '.' or the part
+ * after the last '.' is not a known unit type, and -ENOMEM on allocation failure. */
+int unit_name_hash_long(const char *name, char **ret) {
+        _cleanup_free_ char *n = NULL, *hash = NULL;
+        const char *suffix; /* points into name, hence must not lose the const qualifier */
+        le64_t h;
+        size_t len;
+
+        /* Both pointers are dereferenced unconditionally below, hence insist on them like the other
+         * calls in this file do. */
+        assert(name);
+        assert(ret);
+
+        if (strlen(name) < UNIT_NAME_MAX)
+                return -EMSGSIZE;
+
+        suffix = strrchr(name, '.');
+        if (!suffix)
+                return -EINVAL;
+
+        if (unit_type_from_string(suffix+1) < 0)
+                return -EINVAL;
+
+        /* Convert to little endian so that the hex string does not depend on the host byte order */
+        h = htole64(siphash24_string(name, LONG_UNIT_NAME_HASH_KEY.bytes));
+
+        hash = hexmem(&h, sizeof(h));
+        if (!hash)
+                return -ENOMEM;
+
+        assert_se(strlen(hash) == UNIT_NAME_HASH_LENGTH_CHARS);
+
+        /* Number of leading characters of the original name to keep, so that together with "_", the
+         * hash and the suffix the result stays below UNIT_NAME_MAX. */
+        len = UNIT_NAME_MAX - 1 - strlen(suffix+1) - UNIT_NAME_HASH_LENGTH_CHARS - 2;
+        assert(len > 0 && len < UNIT_NAME_MAX);
+
+        n = strndup(name, len);
+        if (!n)
+                return -ENOMEM;
+
+        if (!strextend(&n, "_", hash, suffix))
+                return -ENOMEM;
+        assert_se(unit_name_is_valid(n, UNIT_NAME_PLAIN));
+
+        *ret = TAKE_PTR(n);
+
+        return 0;
+}
+
+/* Converts a file system path into a plain unit name with the given suffix (e.g. ".mount"). If the
+ * escaped result is too long it is replaced by its hashed form, see unit_name_hash_long(). Returns
+ * 0 and a newly allocated name in *ret, or a negative errno-style error. */
+int unit_name_from_path(const char *path, const char *suffix, char **ret) {
+        _cleanup_free_ char *escaped = NULL, *name = NULL;
+        int r;
+
+        assert(path);
+        assert(suffix);
+        assert(ret);
+
+        if (!unit_suffix_is_valid(suffix))
+                return -EINVAL;
+
+        r = unit_name_path_escape(path, &escaped);
+        if (r < 0)
+                return r;
+
+        name = strjoin(escaped, suffix);
+        if (!name)
+                return -ENOMEM;
+
+        if (strlen(name) >= UNIT_NAME_MAX) {
+                _cleanup_free_ char *hashed = NULL;
+
+                log_debug("Unit name \"%s\" too long, falling back to hashed unit name.", name);
+
+                r = unit_name_hash_long(name, &hashed);
+                if (r < 0)
+                        return r;
+
+                free_and_replace(name, hashed);
+        }
+
+        /* The length is fine at this point, but the name may still be invalid for other reasons */
+        if (!unit_name_is_valid(name, UNIT_NAME_PLAIN))
+                return -EINVAL;
+
+        *ret = TAKE_PTR(name);
+        return 0;
+}
+
+/* Builds the instance unit name "prefix@<escaped path>suffix". Unlike unit_name_from_path() an
+ * overlong result is not hashed but refused with -ENAMETOOLONG. Returns 0 and a newly allocated
+ * name in *ret on success. */
+int unit_name_from_path_instance(const char *prefix, const char *path, const char *suffix, char **ret) {
+        _cleanup_free_ char *escaped = NULL, *name = NULL;
+        int r;
+
+        assert(prefix);
+        assert(path);
+        assert(suffix);
+        assert(ret);
+
+        if (!unit_prefix_is_valid(prefix) || !unit_suffix_is_valid(suffix))
+                return -EINVAL;
+
+        r = unit_name_path_escape(path, &escaped);
+        if (r < 0)
+                return r;
+
+        name = strjoin(prefix, "@", escaped, suffix);
+        if (!name)
+                return -ENOMEM;
+
+        /* Report an overlong name with a more specific error than plain -EINVAL */
+        if (strlen(name) >= UNIT_NAME_MAX)
+                return -ENAMETOOLONG;
+
+        /* Anything else that makes the name invalid is a generic error */
+        if (!unit_name_is_valid(name, UNIT_NAME_INSTANCE))
+                return -EINVAL;
+
+        *ret = TAKE_PTR(name);
+        return 0;
+}
+
+/* Converts the prefix of a unit name back into a path via unit_name_path_unescape(). Hashed names
+ * (see unit_name_is_hashed()) are refused with -ENAMETOOLONG. */
+int unit_name_to_path(const char *name, char **ret) {
+        _cleanup_free_ char *escaped = NULL;
+        int r;
+
+        assert(name);
+
+        r = unit_name_to_prefix(name, &escaped);
+        if (r < 0)
+                return r;
+
+        return unit_name_is_hashed(name) ? -ENAMETOOLONG : unit_name_path_unescape(escaped, ret);
+}
+
+/* Copies f to t (NUL terminated), replacing '/' by '-' and escaping every character that is not in
+ * the permitted set as "\xXX". The permitted set is VALID_CHARS_GLOB if allow_globs is set and
+ * VALID_CHARS_WITH_AT otherwise. Only the obvious characters are escaped here, to play safe.
+ *
+ * Returns true if at least one character was altered, false if the copy is verbatim. */
+static bool do_escape_mangle(const char *f, bool allow_globs, char *t) {
+        const char *allowed = allow_globs ? VALID_CHARS_GLOB : VALID_CHARS_WITH_AT;
+        bool changed = false;
+
+        assert(f);
+        assert(t);
+
+        for (const char *p = f; *p; p++) {
+                if (*p != '/' && strchr(allowed, *p)) {
+                        /* Nothing to do for this one */
+                        *(t++) = *p;
+                        continue;
+                }
+
+                changed = true;
+
+                if (*p == '/')
+                        *(t++) = '-';
+                else
+                        t = do_escape_char(*p, t);
+        }
+
+        *t = 0;
+
+        return changed;
+}
+
+/**
+ *  Convert a string to a unit name. /dev/blah is converted to dev-blah.device,
+ *  /blah/blah is converted to blah-blah.mount, anything else is left alone,
+ *  except that @suffix is appended if a valid unit suffix is not present.
+ *
+ *  If UNIT_NAME_MANGLE_GLOB is set in @flags, glob characters are preserved. Otherwise, they are
+ *  escaped. If UNIT_NAME_MANGLE_WARN is set, the messages below are logged at LOG_NOTICE instead of
+ *  LOG_DEBUG. @operation is optional and only used in the glob warning message.
+ *
+ *  Returns 0 if @name was taken over unmodified (it already was a valid unit name, or an acceptable
+ *  glob), 1 if a new name was generated, -EINVAL if @name is empty, @suffix is invalid or the result
+ *  is not a valid unit name, -ENOMEM on allocation failure, or the error of a helper. On success
+ *  *ret is a newly allocated string.
+ */
+int unit_name_mangle_with_suffix(
+                const char *name,
+                const char *operation,
+                UnitNameMangle flags,
+                const char *suffix,
+                char **ret) {
+
+        _cleanup_free_ char *s = NULL;
+        bool mangled, suggest_escape = true, warn = flags & UNIT_NAME_MANGLE_WARN;
+        int r;
+
+        assert(name);
+        assert(suffix);
+        assert(ret);
+
+        if (isempty(name)) /* We cannot mangle empty unit names to become valid, sorry. */
+                return -EINVAL;
+
+        if (!unit_suffix_is_valid(suffix))
+                return -EINVAL;
+
+        /* Already a fully valid unit name? If so, no mangling is necessary... */
+        if (unit_name_is_valid(name, UNIT_NAME_ANY))
+                goto good;
+
+        /* Already a fully valid globbing expression? If so, no mangling is necessary either... */
+        if (string_is_glob(name) && in_charset(name, VALID_CHARS_GLOB)) {
+                if (flags & UNIT_NAME_MANGLE_GLOB)
+                        goto good;
+                log_full(warn ? LOG_NOTICE : LOG_DEBUG,
+                         "Glob pattern passed%s%s, but globs are not supported for this.",
+                         operation ? " " : "", strempty(operation));
+                /* The user meant a glob, so recommending systemd-escape in the message below would be misleading */
+                suggest_escape = false;
+        }
+
+        if (path_is_absolute(name)) {
+                _cleanup_free_ char *n = NULL;
+
+                r = path_simplify_alloc(name, &n);
+                if (r < 0)
+                        return r;
+
+                /* Try a .device name first for device paths; on -EINVAL fall through to .mount, and
+                 * if that yields -EINVAL too, to the generic escaping below. */
+                if (is_device_path(n)) {
+                        r = unit_name_from_path(n, ".device", ret);
+                        if (r >= 0)
+                                return 1;
+                        if (r != -EINVAL)
+                                return r;
+                }
+
+                r = unit_name_from_path(n, ".mount", ret);
+                if (r >= 0)
+                        return 1;
+                if (r != -EINVAL)
+                        return r;
+        }
+
+        /* Worst case: every input byte is escaped as four bytes, plus the suffix and the trailing NUL */
+        s = new(char, strlen(name) * 4 + strlen(suffix) + 1);
+        if (!s)
+                return -ENOMEM;
+
+        mangled = do_escape_mangle(name, flags & UNIT_NAME_MANGLE_GLOB, s);
+        if (mangled)
+                log_full(warn ? LOG_NOTICE : LOG_DEBUG,
+                         "Invalid unit name \"%s\" escaped as \"%s\"%s.",
+                         name, s,
+                         suggest_escape ? " (maybe you should use systemd-escape?)" : "");
+
+        /* Append a suffix if it doesn't have any, but only if this is not a glob, so that we can allow
+         * "foo.*" as a valid glob. */
+        if ((!(flags & UNIT_NAME_MANGLE_GLOB) || !string_is_glob(s)) && unit_name_to_type(s) < 0)
+                strcat(s, suffix);
+
+        /* Make sure mangling didn't grow this too large (but don't do this check if globbing is allowed,
+         * since globs generally do not qualify as valid unit names) */
+        if (!FLAGS_SET(flags, UNIT_NAME_MANGLE_GLOB) && !unit_name_is_valid(s, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        *ret = TAKE_PTR(s);
+        return 1;
+
+good:
+        /* Nothing to mangle: hand out a verbatim copy and report that via the return value 0 */
+        s = strdup(name);
+        if (!s)
+                return -ENOMEM;
+
+        *ret = TAKE_PTR(s);
+        return 0;
+}
+
+/* Determines the parent of a slice unit. For the root slice there is no parent: *ret is set to NULL
+ * and 0 is returned. Otherwise the part after the last '-' is cut off (a slice without any dash has
+ * the root slice as parent), *ret receives the newly allocated parent name and 1 is returned.
+ * Returns -EINVAL for invalid slice names, -ENOMEM on allocation failure. */
+int slice_build_parent_slice(const char *slice, char **ret) {
+        _cleanup_free_ char *parent = NULL;
+        char *last_dash;
+        int r;
+
+        assert(slice);
+        assert(ret);
+
+        if (!slice_name_is_valid(slice))
+                return -EINVAL;
+
+        if (streq(slice, SPECIAL_ROOT_SLICE)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        parent = strdup(slice);
+        if (!parent)
+                return -ENOMEM;
+
+        last_dash = strrchr(parent, '-');
+        if (!last_dash) {
+                /* A top-level slice: its parent is the root slice */
+                r = free_and_strdup(&parent, SPECIAL_ROOT_SLICE);
+                if (r < 0)
+                        return r;
+        } else
+                /* Overwrite the last component, including its dash, with the suffix */
+                strcpy(last_dash, ".slice");
+
+        *ret = TAKE_PTR(parent);
+        return 1;
+}
+
+/* Builds the name of the child slice "name" below the slice "slice": for the root slice this is
+ * "<name>.slice", for any other slice "<slice without .slice>-<name>.slice".
+ *
+ * Returns 0 and stores the newly allocated name in *ret on success (the caller owns it). Returns
+ * -EINVAL if slice is not a valid slice name or name is not a valid unit prefix, and -ENOMEM if
+ * memory allocation fails. */
+int slice_build_subslice(const char *slice, const char *name, char **ret) {
+        char *subslice;
+
+        assert(slice);
+        assert(name);
+        assert(ret);
+
+        if (!slice_name_is_valid(slice))
+                return -EINVAL;
+
+        if (!unit_prefix_is_valid(name))
+                return -EINVAL;
+
+        if (streq(slice, SPECIAL_ROOT_SLICE)) {
+                subslice = strjoin(name, ".slice");
+                /* Check for OOM in this branch too, otherwise we would report success with *ret == NULL */
+                if (!subslice)
+                        return -ENOMEM;
+        } else {
+                char *e;
+
+                assert_se(e = endswith(slice, ".slice"));
+
+                /* parent without suffix + "-" + name + ".slice" (6 characters) + NUL */
+                subslice = new(char, (e - slice) + 1 + strlen(name) + 6 + 1);
+                if (!subslice)
+                        return -ENOMEM;
+
+                stpcpy(stpcpy(stpcpy(mempcpy(subslice, slice, e - slice), "-"), name), ".slice");
+        }
+
+        *ret = subslice;
+        return 0;
+}
+
+/* Checks whether name is a valid slice unit name: a valid plain unit name that is either the root
+ * slice itself or ends in ".slice", and whose part before the suffix neither starts nor ends with a
+ * dash nor contains two dashes in a row. */
+bool slice_name_is_valid(const char *name) {
+        const char *suffix;
+        bool prev_dash = false;
+
+        if (!unit_name_is_valid(name, UNIT_NAME_PLAIN))
+                return false;
+
+        if (streq(name, SPECIAL_ROOT_SLICE))
+                return true;
+
+        suffix = endswith(name, ".slice");
+        if (!suffix)
+                return false;
+
+        for (const char *p = name; p < suffix; p++) {
+                bool is_dash = *p == '-';
+
+                /* Neither an initial dash nor consecutive dashes are allowed */
+                if (is_dash && (p == name || prev_dash))
+                        return false;
+
+                prev_dash = is_dash;
+        }
+
+        /* A trailing dash is not allowed either */
+        return !prev_dash;
+}
+
+/* Compares the prefixes of two unit names, i.e. the part in front of the first '@', or, if there is
+ * none, in front of the last '.'. Returns false if either name is not a valid unit name. */
+bool unit_name_prefix_equal(const char *a, const char *b) {
+        const char *end_a, *end_b;
+
+        assert(a);
+        assert(b);
+
+        if (!(unit_name_is_valid(a, UNIT_NAME_ANY) && unit_name_is_valid(b, UNIT_NAME_ANY)))
+                return false;
+
+        end_a = strchr(a, '@');
+        if (!end_a)
+                end_a = strrchr(a, '.');
+        assert(end_a);
+
+        end_b = strchr(b, '@');
+        if (!end_b)
+                end_b = strrchr(b, '.');
+        assert(end_b);
+
+        return memcmp_nn(a, end_a - a, b, end_b - b) == 0;
+}
diff --git a/src/basic/unit-name.h b/src/basic/unit-name.h
new file mode 100644
index 0000000..eaa701e
--- /dev/null
+++ b/src/basic/unit-name.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+#include "unit-def.h"
+
+/* Size limit for unit names: unit_name_from_path() and unit_name_from_path_instance() treat names
+ * whose strlen() is >= this value as too long. */
+#define UNIT_NAME_MAX 256
+
+/* Kinds of unit names; used as a mask of accepted kinds for unit_name_is_valid() and as the return
+ * type of unit_name_to_instance() and unit_name_classify(). */
+typedef enum UnitNameFlags {
+        UNIT_NAME_PLAIN    = 1 << 0, /* Allow foo.service */
+        UNIT_NAME_TEMPLATE = 1 << 1, /* Allow foo@.service */
+        UNIT_NAME_INSTANCE = 1 << 2, /* Allow foo@bar.service */
+        UNIT_NAME_ANY = UNIT_NAME_PLAIN|UNIT_NAME_TEMPLATE|UNIT_NAME_INSTANCE,
+        _UNIT_NAME_INVALID = -EINVAL,
+} UnitNameFlags;
+
+/* Validity checks for complete unit names and for their individual components */
+bool unit_name_is_valid(const char *n, UnitNameFlags flags) _pure_;
+bool unit_prefix_is_valid(const char *p) _pure_;
+bool unit_instance_is_valid(const char *i) _pure_;
+bool unit_suffix_is_valid(const char *s) _pure_;
+
+int unit_name_to_prefix(const char *n, char **ret);
+UnitNameFlags unit_name_to_instance(const char *n, char **ret);
+/* Same as unit_name_to_instance(), but without requesting the instance string */
+static inline UnitNameFlags unit_name_classify(const char *n) {
+        return unit_name_to_instance(n, NULL);
+}
+int unit_name_to_prefix_and_instance(const char *n, char **ret);
+
+UnitType unit_name_to_type(const char *n) _pure_;
+
+int unit_name_change_suffix(const char *n, const char *suffix, char **ret);
+
+/* Assemble "prefix[@instance].type"; the first variant takes the type as dotted suffix string */
+int unit_name_build(const char *prefix, const char *instance, const char *suffix, char **ret);
+int unit_name_build_from_type(const char *prefix, const char *instance, UnitType, char **ret);
+
+/* Escaping of arbitrary strings and of file system paths. unit_name_escape() returns a newly
+ * allocated string, or NULL on allocation failure; the others return 0 or a negative error. */
+char *unit_name_escape(const char *f);
+int unit_name_unescape(const char *f, char **ret);
+int unit_name_path_escape(const char *f, char **ret);
+int unit_name_path_unescape(const char *f, char **ret);
+
+int unit_name_replace_instance(const char *f, const char *i, char **ret);
+
+int unit_name_template(const char *f, char **ret);
+
+/* Overlong names can be shortened by replacing their tail with a hash of the full name */
+int unit_name_hash_long(const char *name, char **ret);
+bool unit_name_is_hashed(const char *name);
+
+int unit_name_from_path(const char *path, const char *suffix, char **ret);
+int unit_name_from_path_instance(const char *prefix, const char *path, const char *suffix, char **ret);
+int unit_name_to_path(const char *name, char **ret);
+
+/* Flags for unit_name_mangle_with_suffix() */
+typedef enum UnitNameMangle {
+        UNIT_NAME_MANGLE_GLOB = 1 << 0, /* Accept glob patterns and keep glob characters unescaped */
+        UNIT_NAME_MANGLE_WARN = 1 << 1, /* Log at LOG_NOTICE rather than LOG_DEBUG when mangling */
+} UnitNameMangle;
+
+/* Returns 0 if the name was taken over unmodified, 1 if a new name was generated, negative on error */
+int unit_name_mangle_with_suffix(const char *name, const char *operation, UnitNameMangle flags, const char *suffix, char **ret);
+
+/* Shortcut for unit_name_mangle_with_suffix() with no operation string and ".service" as suffix */
+static inline int unit_name_mangle(const char *name, UnitNameMangle flags, char **ret) {
+        return unit_name_mangle_with_suffix(name, NULL, flags, ".service", ret);
+}
+
+/* Helpers for the dash-separated slice hierarchy */
+int slice_build_parent_slice(const char *slice, char **ret);
+int slice_build_subslice(const char *slice, const char *name, char **subslice);
+bool slice_name_is_valid(const char *name);
+
+bool unit_name_prefix_equal(const char *a, const char *b);
diff --git a/src/basic/user-util.c b/src/basic/user-util.c
new file mode 100644
index 0000000..9e6926b
--- /dev/null
+++ b/src/basic/user-util.c
@@ -0,0 +1,1060 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
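+#include <errno.h>
+#include <fcntl.h>
+#include <stdalign.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <utmp.h>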
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "lock-util.h"
+#include "macro.h"
+#include "mkdir.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "utf8.h"
+
+/* Returns false for the two UID values that are reserved as "invalid" markers, true for everything
+ * else. Also see POSIX IEEE Std 1003.1-2008, 2016 Edition, 3.436. */
+bool uid_is_valid(uid_t uid) {
+
+        /* 0xFFFFFFFF: some libc APIs use UID_INVALID as special placeholder.
+         * 0xFFFF:     a long time ago UIDs were 16 bit, hence explicitly avoid the 16-bit -1 too. */
+        return uid != (uid_t) UINT32_C(0xFFFFFFFF) &&
+               uid != (uid_t) UINT32_C(0xFFFF);
+}
+
+/* Parses a decimal UID. ret may be NULL if only validation is desired. Returns 0 on success, the
+ * error of safe_atou32_full() if the string does not parse, and -ENXIO if it parses but denotes one
+ * of the UIDs refused by uid_is_valid(). */
+int parse_uid(const char *s, uid_t *ret) {
+        uint32_t parsed = 0;
+        int r;
+
+        assert(s);
+
+        assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+
+        /* Parsing is deliberately strict: no +/- prefix, no leading zero, no leading whitespace. This
+         * call is often used where a string is first tried as UID and then, if that fails, looked up
+         * via NSS, hence only things that really really look like UIDs shall be accepted as such. */
+        r = safe_atou32_full(s, 10
+                             | SAFE_ATO_REFUSE_PLUS_MINUS
+                             | SAFE_ATO_REFUSE_LEADING_ZERO
+                             | SAFE_ATO_REFUSE_LEADING_WHITESPACE, &parsed);
+        if (r < 0)
+                return r;
+
+        /* ENXIO rather than EINVAL, so that callers can tell invalid numeric UIDs from invalid strings */
+        if (!uid_is_valid(parsed))
+                return -ENXIO;
+
+        if (ret)
+                *ret = parsed;
+
+        return 0;
+}
+
+/* Parses either a single UID ("N") or a range ("N-M", with N <= M). For a single UID both bounds
+ * are set to the same value. Returns 0 on success, -EINVAL for an empty string, a trailing dash or
+ * an inverted range, and otherwise the error of extract_first_word() or parse_uid(). */
+int parse_uid_range(const char *s, uid_t *ret_lower, uid_t *ret_upper) {
+        _cleanup_free_ char *first = NULL;
+        uid_t lower, upper;
+        int r;
+
+        assert(s);
+        assert(ret_lower);
+        assert(ret_upper);
+
+        r = extract_first_word(&s, &first, "-", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        r = parse_uid(first, &lower);
+        if (r < 0)
+                return r;
+
+        if (s) {
+                /* Something must follow the dash */
+                if (!*s)
+                        return -EINVAL;
+
+                r = parse_uid(s, &upper);
+                if (r < 0)
+                        return r;
+
+                if (lower > upper)
+                        return -EINVAL;
+        } else
+                /* No dash at all: a range consisting of a single UID */
+                upper = lower;
+
+        *ret_lower = lower;
+        *ret_upper = upper;
+        return 0;
+}
+
+/* Returns the result of uid_to_name() for the owner of standard input if that is a TTY we can
+ * stat, and for our own UID otherwise. */
+char* getlogname_malloc(void) {
+        struct stat st;
+
+        if (isatty(STDIN_FILENO) && fstat(STDIN_FILENO, &st) >= 0)
+                return uid_to_name(st.st_uid);
+
+        return uid_to_name(getuid());
+}
+
+/* Returns a copy of $USER (as seen by secure_getenv()) if it is set, and the result of
+ * uid_to_name() for our own UID otherwise. */
+char* getusername_malloc(void) {
+        const char *user = secure_getenv("USER");
+
+        return user ? strdup(user) : uid_to_name(getuid());
+}
+
+/* Returns true if the specified shell is one of the well-known binaries listed below that are
+ * commonly configured as login shell in order to disable logins for an account. The comparison is
+ * done by PATH_IN_SET() (presumably a path-aware rather than a plain string comparison, see its
+ * definition). */
+bool is_nologin_shell(const char *shell) {
+        return PATH_IN_SET(shell,
+                           /* 'nologin' is the friendliest way to disable logins for a user account. It prints a nice
+                            * message and exits. Different distributions place the binary at different places though,
+                            * hence let's list them all. */
+                           "/bin/nologin",
+                           "/sbin/nologin",
+                           "/usr/bin/nologin",
+                           "/usr/sbin/nologin",
+                           /* 'true' and 'false' work too for the same purpose, but are less friendly as they don't do
+                            * any message printing. Different distributions place the binary at various places but at
+                            * least not in the 'sbin' directory. */
+                           "/bin/false",
+                           "/usr/bin/false",
+                           "/bin/true",
+                           "/usr/bin/true");
+}
+
+/* Picks the shell to use for root below the directory referenced by rfd (or AT_FDCWD): the
+ * preferred shell DEFAULT_USER_SHELL (usually /bin/bash) if chaseat() finds it, and "/bin/sh" if it
+ * is missing or cannot be looked up. Always returns a constant string. */
+const char* default_root_shell_at(int rfd) {
+        int r;
+
+        assert(rfd >= 0 || rfd == AT_FDCWD);
+
+        r = chaseat(rfd, DEFAULT_USER_SHELL, CHASE_AT_RESOLVE_IN_ROOT, NULL, NULL);
+        if (r > 0)
+                return DEFAULT_USER_SHELL;
+
+        /* A missing shell is expected and not worth a message, everything else is logged */
+        if (r < 0 && r != -ENOENT)
+                log_debug_errno(r, "Failed to look up shell '%s': %m", DEFAULT_USER_SHELL);
+
+        return "/bin/sh";
+}
+
+/* Same as default_root_shell_at(), but takes the root directory as a path (NULL or empty means
+ * "/"). If the directory cannot be opened "/bin/sh" is returned. */
+const char* default_root_shell(const char *root) {
+        _cleanup_close_ int rfd = open(empty_to_root(root), O_CLOEXEC | O_DIRECTORY | O_PATH);
+
+        return rfd < 0 ? "/bin/sh" : default_root_shell_at(rfd);
+}
+
+/* Provides hardcoded user records for root (uid=0) and, if synthesize_nobody() agrees, for the
+ * nobody user (uid=65534), so that no NSS lookup is needed for them. The user may be specified by
+ * name or by numeric string; *username is updated to the canonical name. All other output
+ * parameters are optional. Returns 0 if the record was synthesized, -ENOMEDIUM if the user is not
+ * one we synthesize. */
+static int synthesize_user_creds(
+                const char **username,
+                uid_t *uid, gid_t *gid,
+                const char **home,
+                const char **shell,
+                UserCredsFlags flags) {
+
+        bool clean = FLAGS_SET(flags, USER_CREDS_CLEAN);
+
+        if (STR_IN_SET(*username, "root", "0")) {
+                *username = "root";
+
+                if (uid)
+                        *uid = 0;
+                if (gid)
+                        *gid = 0;
+                if (home)
+                        *home = "/root";
+                if (shell)
+                        *shell = default_root_shell(NULL);
+
+                return 0;
+        }
+
+        if (!STR_IN_SET(*username, NOBODY_USER_NAME, "65534") ||
+            !synthesize_nobody())
+                return -ENOMEDIUM;
+
+        *username = NOBODY_USER_NAME;
+
+        if (uid)
+                *uid = UID_NOBODY;
+        if (gid)
+                *gid = GID_NOBODY;
+
+        /* With USER_CREDS_CLEAN neither "/" as home directory nor the nologin shell are reported */
+        if (home)
+                *home = clean ? NULL : "/";
+        if (shell)
+                *shell = clean ? NULL : NOLOGIN;
+
+        return 0;
+}
+
+/* Resolves a user, specified in *username either by name or as numeric UID string, to its UID, GID,
+ * home directory and shell. uid, gid, home and shell are optional and may be NULL. On success
+ * *username is updated to the canonical user name (except in the USER_CREDS_ALLOW_MISSING case
+ * below). The strings returned are not allocated here: they are either constants (for synthesized
+ * records) or point into the struct passwd returned by getpwuid()/getpwnam().
+ *
+ * Flags:
+ *   USER_CREDS_PREFER_NSS    — consult the user database first and use the synthesized root/nobody
+ *                              records only as fallback (only honoured if home or shell is requested)
+ *   USER_CREDS_ALLOW_MISSING — accept a numeric UID that is not in the user database, if nothing
+ *                              but the UID is requested
+ *   USER_CREDS_CLEAN         — return NULL for home directories/shells that are empty, invalid,
+ *                              not absolute, the root directory or a nologin shell
+ *
+ * Returns 0 on success, -ESRCH if the user does not exist, -EBADMSG if the record carries an invalid
+ * UID or GID, or another negative errno-style error. */
+int get_user_creds(
+                const char **username,
+                uid_t *uid, gid_t *gid,
+                const char **home,
+                const char **shell,
+                UserCredsFlags flags) {
+
+        uid_t u = UID_INVALID;
+        struct passwd *p;
+        int r;
+
+        assert(username);
+        assert(*username);
+
+        if (!FLAGS_SET(flags, USER_CREDS_PREFER_NSS) ||
+            (!home && !shell)) {
+
+                /* So here's the deal: normally, we'll try to synthesize all records we can synthesize, and override
+                 * the user database with that. However, if the user specifies USER_CREDS_PREFER_NSS then the
+                 * user database will override the synthetic records instead — except if the user is only interested in
+                 * the UID and/or GID (but not the home directory, or the shell), in which case we'll always override
+                 * the user database (i.e. the USER_CREDS_PREFER_NSS flag has no effect in this case). Why?
+                 * Simply because there are valid usecase where the user might change the home directory or the shell
+                 * of the relevant users, but changing the UID/GID mappings for them is something we explicitly don't
+                 * support. */
+
+                r = synthesize_user_creds(username, uid, gid, home, shell, flags);
+                if (r >= 0)
+                        return 0;
+                if (r != -ENOMEDIUM) /* not a username we can synthesize */
+                        return r;
+        }
+
+        if (parse_uid(*username, &u) >= 0) {
+                /* Reset errno, so that we can tell "not found" from a real error below */
+                errno = 0;
+                p = getpwuid(u);
+
+                /* If there are multiple users with the same id, make sure to leave $USER to the configured value
+                 * instead of the first occurrence in the database. However if the uid was configured by a numeric uid,
+                 * then let's pick the real username from /etc/passwd. */
+                if (p)
+                        *username = p->pw_name;
+                else if (FLAGS_SET(flags, USER_CREDS_ALLOW_MISSING) && !gid && !home && !shell) {
+
+                        /* If the specified user is a numeric UID and it isn't in the user database, and the caller
+                         * passed USER_CREDS_ALLOW_MISSING and was only interested in the UID, then just return that
+                         * and don't complain. */
+
+                        if (uid)
+                                *uid = u;
+
+                        return 0;
+                }
+        } else {
+                errno = 0;
+                p = getpwnam(*username);
+        }
+        if (!p) {
+                /* getpwnam() may fail with ENOENT if /etc/passwd is missing.
+                 * For us that is equivalent to the name not being defined. */
+                r = IN_SET(errno, 0, ENOENT) ? -ESRCH : -errno;
+
+                /* If the user requested that we only synthesize as fallback, do so now */
+                if (FLAGS_SET(flags, USER_CREDS_PREFER_NSS)) {
+                        if (synthesize_user_creds(username, uid, gid, home, shell, flags) >= 0)
+                                return 0;
+                }
+
+                return r;
+        }
+
+        if (uid) {
+                if (!uid_is_valid(p->pw_uid))
+                        return -EBADMSG;
+
+                *uid = p->pw_uid;
+        }
+
+        if (gid) {
+                if (!gid_is_valid(p->pw_gid))
+                        return -EBADMSG;
+
+                *gid = p->pw_gid;
+        }
+
+        if (home) {
+                if (FLAGS_SET(flags, USER_CREDS_CLEAN) &&
+                    (empty_or_root(p->pw_dir) ||
+                     !path_is_valid(p->pw_dir) ||
+                     !path_is_absolute(p->pw_dir)))
+                        *home = NULL; /* Note: we don't insist on normalized paths, since there are setups that have /./ in the path */
+                else
+                        *home = p->pw_dir;
+        }
+
+        if (shell) {
+                if (FLAGS_SET(flags, USER_CREDS_CLEAN) &&
+                    (isempty(p->pw_shell) ||
+                     !path_is_valid(p->pw_shell) ||
+                     !path_is_absolute(p->pw_shell) ||
+                     is_nologin_shell(p->pw_shell)))
+                        *shell = NULL;
+                else
+                        *shell = p->pw_shell;
+        }
+
+        return 0;
+}
+
+/* Resolves a group, specified in *groupname either by name or as numeric GID string, to its GID.
+ * gid is optional. The root group and, if synthesize_nobody() agrees, the nobody group are answered
+ * from hardcoded data without consulting NSS. With USER_CREDS_ALLOW_MISSING a numeric GID that is
+ * not in the group database is accepted as is. On success *groupname is updated to the canonical
+ * group name (except in the USER_CREDS_ALLOW_MISSING case).
+ *
+ * Returns 0 on success, -ESRCH if the group does not exist, -EBADMSG if the record carries an
+ * invalid GID, or another negative errno. */
+int get_group_creds(const char **groupname, gid_t *gid, UserCredsFlags flags) {
+        struct group *entry;
+        gid_t numeric;
+
+        assert(groupname);
+
+        /* gid=0 is special: its data is hardcoded in order to avoid NSS lookups for root */
+        if (STR_IN_SET(*groupname, "root", "0")) {
+                *groupname = "root";
+
+                if (gid)
+                        *gid = 0;
+
+                return 0;
+        }
+
+        if (STR_IN_SET(*groupname, NOBODY_GROUP_NAME, "65534") &&
+            synthesize_nobody()) {
+                *groupname = NOBODY_GROUP_NAME;
+
+                if (gid)
+                        *gid = GID_NOBODY;
+
+                return 0;
+        }
+
+        if (parse_gid(*groupname, &numeric) < 0) {
+                /* Not a number, hence look it up by name */
+                errno = 0;
+                entry = getgrnam(*groupname);
+        } else {
+                errno = 0;
+                entry = getgrgid(numeric);
+
+                if (entry)
+                        *groupname = entry->gr_name;
+                else if (FLAGS_SET(flags, USER_CREDS_ALLOW_MISSING)) {
+                        if (gid)
+                                *gid = numeric;
+
+                        return 0;
+                }
+        }
+
+        /* getgrnam() may fail with ENOENT if /etc/group is missing, which for us is the same as the
+         * name not being defined. */
+        if (!entry)
+                return IN_SET(errno, 0, ENOENT) ? -ESRCH : -errno;
+
+        if (!gid)
+                return 0;
+
+        if (!gid_is_valid(entry->gr_gid))
+                return -EBADMSG;
+
+        *gid = entry->gr_gid;
+        return 0;
+}
+
+char* uid_to_name(uid_t uid) {
+        /* Returns a newly allocated user name for 'uid'. If no passwd record is obtained, the UID formatted
+         * as a number is returned instead. NULL means an allocation (or growing the buffer) failed. */
+        char *formatted;
+
+        /* root and (if synthesized) nobody are answered right here, without any NSS lookup */
+        if (uid == 0)
+                return strdup("root");
+        if (uid == UID_NOBODY && synthesize_nobody())
+                return strdup(NOBODY_USER_NAME);
+
+        if (uid_is_valid(uid)) {
+                long sz = sysconf(_SC_GETPW_R_SIZE_MAX);
+
+                if (sz <= 0)
+                        sz = 4096;
+
+                /* Keep doubling the scratch buffer for as long as getpwuid_r() asks for more (ERANGE) */
+                for (;; sz *= 2) {
+                        _cleanup_free_ char *scratch = malloc(sz);
+                        struct passwd entry, *found = NULL;
+                        int k;
+
+                        if (!scratch)
+                                return NULL;
+
+                        k = getpwuid_r(uid, &entry, scratch, (size_t) sz, &found);
+                        if (k == 0 && found)
+                                return strdup(found->pw_name);
+                        if (k != ERANGE)
+                                break;
+
+                        if (sz > LONG_MAX/2) /* the next doubling would overflow */
+                                return NULL;
+                }
+        }
+
+        /* No name available: fall back to the plain number */
+        if (asprintf(&formatted, UID_FMT, uid) < 0)
+                return NULL;
+
+        return formatted;
+}
+
+char* gid_to_name(gid_t gid) {
+        /* Returns a newly allocated group name for 'gid'. If no group record is obtained, the GID formatted
+         * as a number is returned instead. NULL means an allocation (or growing the buffer) failed. */
+        char *formatted;
+
+        /* root and (if synthesized) nobody are answered right here, without any NSS lookup */
+        if (gid == 0)
+                return strdup("root");
+        if (gid == GID_NOBODY && synthesize_nobody())
+                return strdup(NOBODY_GROUP_NAME);
+
+        if (gid_is_valid(gid)) {
+                long sz = sysconf(_SC_GETGR_R_SIZE_MAX);
+
+                if (sz <= 0)
+                        sz = 4096;
+
+                /* Keep doubling the scratch buffer for as long as getgrgid_r() asks for more (ERANGE) */
+                for (;; sz *= 2) {
+                        _cleanup_free_ char *scratch = malloc(sz);
+                        struct group entry, *found = NULL;
+                        int k;
+
+                        if (!scratch)
+                                return NULL;
+
+                        k = getgrgid_r(gid, &entry, scratch, (size_t) sz, &found);
+                        if (k == 0 && found)
+                                return strdup(found->gr_name);
+                        if (k != ERANGE)
+                                break;
+
+                        if (sz > LONG_MAX/2) /* the next doubling would overflow */
+                                return NULL;
+                }
+        }
+
+        if (asprintf(&formatted, GID_FMT, gid) < 0)
+                return NULL;
+
+        return formatted;
+}
+
+static bool gid_list_has(const gid_t *list, size_t size, gid_t val) {
+        size_t left = size; /* scanned back to front; the order is irrelevant for a membership test */
+        while (left-- > 0)
+                if (list[left] == val)
+                        return true;
+        return false;
+}
+
+int in_gid(gid_t gid) {
+        /* Returns 1 if 'gid' is our real GID, our effective GID or one of our supplementary GIDs, 0 if it
+         * is none of these, and a negative errno-style value on failure. */
+        _cleanup_free_ gid_t *supplementary = NULL;
+        int n;
+
+        if (gid == getgid() || gid == getegid())
+                return 1;
+
+        if (!gid_is_valid(gid))
+                return -EINVAL;
+
+        n = getgroups_alloc(&supplementary);
+        if (n < 0)
+                return n;
+
+        /* The bool result converts to exactly the 0/1 promised above */
+        return gid_list_has(supplementary, n, gid);
+}
+
+int in_group(const char *name) {
+        /* Resolves 'name' to a GID via get_group_creds() and reports via in_gid() whether we are in it */
+        gid_t resolved;
+        int k = get_group_creds(&name, &resolved, 0);
+
+        if (k < 0)
+                return k;
+
+        return in_gid(resolved);
+}
+
+int merge_gid_lists(const gid_t *list1, size_t size1, const gid_t *list2, size_t size2, gid_t **ret) {
+        /* Stores the duplicate-free union of both lists in *ret (caller frees) and returns its length */
+        size_t nresult = 0;
+        assert(ret);
+
+        if (size1 > INT_MAX || size2 > INT_MAX - size1) /* unsigned math: rule out size1 > INT_MAX first */
+                return -ENOBUFS;
+
+        gid_t *buf = new(gid_t, size1 + size2);
+        if (!buf)
+                return -ENOMEM;
+        /* Duplicates need to be skipped on merging, otherwise they'll be passed on and stored in the kernel. */
+        for (size_t i = 0; i < size1; i++)
+                if (!gid_list_has(buf, nresult, list1[i]))
+                        buf[nresult++] = list1[i];
+        for (size_t i = 0; i < size2; i++)
+                if (!gid_list_has(buf, nresult, list2[i]))
+                        buf[nresult++] = list2[i];
+        *ret = buf;
+        return (int)nresult;
+}
+
+int getgroups_alloc(gid_t** gids) {
+        /* Stores a newly allocated array of our supplementary GIDs in *gids and returns their number */
+        _cleanup_free_ gid_t *p = NULL;
+        int ngroups = 8;
+        unsigned attempt = 0;
+
+        assert(gids);
+
+        p = new(gid_t, ngroups);
+        if (!p)
+                return -ENOMEM;
+
+        for (;;) {
+                ngroups = getgroups(ngroups, p);
+                if (ngroups >= 0)
+                        break;
+                if (errno != EINVAL)
+                        return -errno;
+
+                /* Give up eventually */
+                if (attempt++ > 10)
+                        return -EINVAL;
+
+                /* Get actual size needed, and size the array explicitly. Note that this is potentially racy
+                 * to use (in multi-threaded programs), hence let's call this in a loop. */
+                ngroups = getgroups(0, NULL);
+                if (ngroups < 0)
+                        return -errno;
+                if (ngroups == 0)
+                        break; /* nothing to fetch: leave via the common exit so that *gids is always set */
+
+                free(p); /* p is the only owner of the buffer, and it is re-pointed on the very next line */
+                p = new(gid_t, ngroups);
+                if (!p)
+                        return -ENOMEM;
+        }
+
+        *gids = TAKE_PTR(p);
+        return ngroups;
+}
+
+int get_home_dir(char **ret) {
+        /* Determines the home directory of the calling user and hands it to path_simplify_alloc() to
+         * fill in *ret. Sources, in order of preference: a valid absolute $HOME, fixed values for
+         * root and (if synthesized) nobody, and finally the user database. Returns what
+         * path_simplify_alloc() returns, or a negative errno-style value if the lookup fails. */
+        struct passwd *pw;
+        const char *dir;
+        uid_t me;
+
+        assert(ret);
+
+        /* Whatever the user configured wins, as long as it is usable */
+        dir = secure_getenv("HOME");
+        if (dir && path_is_valid(dir) && path_is_absolute(dir))
+                return path_simplify_alloc(dir, ret);
+
+        /* Fixed answers for root and nobody, so that NSS is not needed for them */
+        me = getuid();
+        if (me == 0)
+                return path_simplify_alloc("/root", ret);
+
+        if (me == UID_NOBODY && synthesize_nobody())
+                return path_simplify_alloc("/", ret);
+
+        /* Everybody else is looked up in the user database */
+        errno = 0;
+        pw = getpwuid(me);
+        if (!pw)
+                return errno_or_else(ESRCH);
+
+        /* Do not hand out database entries that aren't usable absolute paths */
+        dir = pw->pw_dir;
+        if (!path_is_valid(dir) || !path_is_absolute(dir))
+                return -EINVAL;
+
+        return path_simplify_alloc(dir, ret);
+}
+
+int get_shell(char **ret) {
+        /* Determines the shell of the calling user and hands it to path_simplify_alloc() to fill in
+         * *ret. Sources, in order of preference: a valid absolute $SHELL, fixed values for root and
+         * (if synthesized) nobody, and finally the user database. */
+        struct passwd *pw;
+        const char *sh;
+        uid_t me;
+
+        assert(ret);
+
+        /* Whatever the user configured wins, as long as it is usable */
+        sh = secure_getenv("SHELL");
+        if (sh && path_is_valid(sh) && path_is_absolute(sh))
+                return path_simplify_alloc(sh, ret);
+
+        /* Fixed answers for root and nobody, so that NSS is not needed for them */
+        me = getuid();
+        if (me == 0)
+                return path_simplify_alloc(default_root_shell(NULL), ret);
+
+        if (me == UID_NOBODY && synthesize_nobody())
+                return path_simplify_alloc(NOLOGIN, ret);
+
+        /* Everybody else is looked up in the user database */
+        errno = 0;
+        pw = getpwuid(me);
+        if (!pw)
+                return errno_or_else(ESRCH);
+
+        /* Do not hand out database entries that aren't usable absolute paths */
+        sh = pw->pw_shell;
+        if (!path_is_valid(sh) || !path_is_absolute(sh))
+                return -EINVAL;
+
+        return path_simplify_alloc(sh, ret);
+}
+
+int reset_uid_gid(void) {
+        /* Drops all auxiliary groups (as far as maybe_setgroups() goes along), then sets real, effective
+         * and saved GID to 0, and after that the same three UIDs. Negative return values mean failure. */
+        int k = maybe_setgroups(0, NULL);
+        if (k < 0)
+                return k;
+
+        if (setresgid(0, 0, 0) < 0)
+                return -errno;
+
+        return RET_NERRNO(setresuid(0, 0, 0));
+}
+
+int take_etc_passwd_lock(const char *root) {
+        int r;
+        /* Takes an exclusive lock on /etc/.pwd.lock below 'root'; returns the locked fd or a negative error */
+        /* This is roughly the same as lckpwdf(), but not as awful. We don't want to use alarm() and signals,
+         * hence we implement our own trivial version of this.
+         *
+         * Note that shadow-utils also takes per-database locks in addition to lckpwdf(). However, we don't,
+         * given that they are redundant: they invoke lckpwdf() first and keep it during everything they do.
+         * The per-database locks are awfully racy, and thus we just won't do them. */
+
+        _cleanup_free_ char *path = path_join(root, ETC_PASSWD_LOCK_PATH);
+        if (!path)
+                return log_oom_debug();
+
+        (void) mkdir_parents(path, 0755); /* result ignored on purpose: if this matters, open() below fails */
+
+        _cleanup_close_ int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0600);
+        if (fd < 0)
+                return log_debug_errno(errno, "Cannot open %s: %m", path);
+
+        r = unposix_lock(fd, LOCK_EX);
+        if (r < 0)
+                return log_debug_errno(r, "Locking %s failed: %m", path);
+
+        return TAKE_FD(fd); /* the fd is handed to the caller instead of being closed by the cleanup handler */
+}
+
+bool valid_user_group_name(const char *u, ValidUserFlags flags) {
+        const char *i;
+
+        /* Checks if the specified name is a valid user/group name. There are two flavours of this call:
+         * strict mode is the default which is POSIX plus some extra rules; and relaxed mode where we accept
+         * pretty much everything except the really worst offending names.
+         *
+         * Whenever we synthesize users ourselves we should use the strict mode. But when we process users
+         * created by other stuff, let's be more liberal. Returns true if the name is acceptable. */
+
+        if (isempty(u)) /* An empty user name is never valid */
+                return false;
+
+        if (parse_uid(u, NULL) >= 0) /* Something that parses as numeric UID string is valid exactly when the
+                                      * flag for it is set */
+                return FLAGS_SET(flags, VALID_USER_ALLOW_NUMERIC);
+
+        if (FLAGS_SET(flags, VALID_USER_RELAX)) {
+
+                /* In relaxed mode we just check very superficially. Apparently SSSD and other stuff is
+                 * extremely liberal (way too liberal if you ask me, even inserting "@" in user names, which
+                 * is bound to cause problems for example when used with an MTA), hence only filter the most
+                 * obvious cases, or where things would result in an invalid entry if such a user name would
+                 * show up in /etc/passwd (or equivalent getent output).
+                 *
+                 * Note that we stepped far out of POSIX territory here. It's not our fault though, but
+                 * SSSD's, Samba's and everybody else who ignored POSIX on this. (I mean, I am happy to step
+                 * outside of POSIX' bounds any day, but I must say in this case I probably wouldn't
+                 * have...) */
+
+                if (startswith(u, " ") || endswith(u, " ")) /* At least expect whitespace padding is removed
+                                                             * at front and back (accept in the middle, since
+                                                             * that's apparently a thing on Windows). Note
+                                                             * that this also blocks usernames consisting of
+                                                             * whitespace only. */
+                        return false;
+
+                if (!utf8_is_valid(u)) /* We want to synthesize JSON from this, hence insist on UTF-8 */
+                        return false;
+
+                if (string_has_cc(u, NULL)) /* CC characters are just dangerous (and \n in particular is the
+                                             * record separator in /etc/passwd), so we can't allow that. */
+                        return false;
+
+                if (strpbrk(u, ":/")) /* Colons are the field separator in /etc/passwd, we can't allow
+                                       * that. Slashes are special to file systems paths and user names
+                                       * typically show up in the file system as home directories, hence
+                                       * don't allow slashes. */
+                        return false;
+
+                if (in_charset(u, "0123456789")) /* Don't allow fully numeric strings, they might be confused
+                                                  * with UIDs (note that this test is more broad than
+                                                  * the parse_uid() test above, as it will cover more than
+                                                  * the 32-bit range, and it will detect 65535 (which is an
+                                                  * invalid UID, even though in the unsigned 32 bit range)) */
+                        return false;
+
+                if (u[0] == '-' && in_charset(u + 1, "0123456789")) /* Don't allow negative fully numeric
+                                                                     * strings either. After all some people
+                                                                     * write 65535 as -1 (even though that's
+                                                                     * not even true on 32-bit uid_t
+                                                                     * anyway) */
+                        return false;
+
+                if (dot_or_dot_dot(u)) /* User names typically become home directory names, and these two are
+                                        * special in that context, don't allow that. */
+                        return false;
+
+                /* Compare with strict result (flags == 0 selects strict mode) and warn if result doesn't match */
+                if (FLAGS_SET(flags, VALID_USER_WARN) && !valid_user_group_name(u, 0))
+                        log_struct(LOG_NOTICE,
+                                   LOG_MESSAGE("Accepting user/group name '%s', which does not match strict user/group name rules.", u),
+                                   "USER_GROUP_NAME=%s", u,
+                                   "MESSAGE_ID=" SD_MESSAGE_UNSAFE_USER_NAME_STR);
+
+                /* Note that we make no restrictions on the length in relaxed mode! */
+        } else {
+                long sz;
+                size_t l;
+
+                /* Also see POSIX IEEE Std 1003.1-2008, 2016 Edition, 3.437. We are a bit stricter here
+                 * however. Specifically we deviate from POSIX rules:
+                 *
+                 * - We don't allow empty user names (see above)
+                 * - We require that names fit into the appropriate utmp field
+                 * - We don't allow any dots (this conflicts with chown syntax which permits dots as user/group name separator)
+                 * - We don't allow dashes or digits as the first character
+                 *
+                 * Note that other systems are even more restrictive, and don't permit underscores or uppercase characters.
+                 */
+
+                if (!ascii_isalpha(u[0]) &&
+                    u[0] != '_')
+                        return false;
+
+                for (i = u+1; *i; i++)
+                        if (!ascii_isalpha(*i) &&
+                            !ascii_isdigit(*i) &&
+                            !IN_SET(*i, '_', '-'))
+                                return false;
+
+                l = i - u; /* the loop ran up to the terminating NUL, hence this is the length of the name */
+
+                sz = sysconf(_SC_LOGIN_NAME_MAX);
+                assert_se(sz > 0);
+
+                if (l > (size_t) sz)
+                        return false;
+                if (l > NAME_MAX) /* must fit in a filename */
+                        return false;
+                if (l > UT_NAMESIZE - 1)
+                        return false;
+        }
+
+        return true;
+}
+
+bool valid_gecos(const char *d) {
+        /* Tells whether 'd' may be used verbatim as a GECOS field. NULL is refused, as is anything that
+         * fails one of the tests below. */
+
+        if (d == NULL)
+                return false;
+
+        /* Has to be proper UTF-8 without any control characters in it */
+        if (!utf8_is_valid(d) || string_has_cc(d, NULL))
+                return false;
+
+        /* A colon would end the field early, as it is the field separator */
+        if (strchr(d, ':') != NULL)
+                return false;
+
+        return true;
+}
+
+char* mangle_gecos(const char *d) {
+        char *mangled;
+
+        /* Returns a newly allocated copy of the provided string that is valid as a GECOS field, with all
+         * bad chars replaced by spaces. glibc's putpwent() only changes \n and : to spaces. We do more:
+         * replace all CC too (DEL included, which valid_gecos() is expected to refuse) and invalid UTF-8 */
+
+        mangled = strdup(d);
+        if (!mangled)
+                return NULL;
+
+        for (char *i = mangled; *i; i++) {
+                int len;
+
+                if ((uint8_t) *i < (uint8_t) ' ' || *i == 0x7F || *i == ':') {
+                        *i = ' ';
+                        continue;
+                }
+
+                len = utf8_encoded_valid_unichar(i, SIZE_MAX);
+                if (len < 0) {
+                        *i = ' ';
+                        continue;
+                }
+
+                i += len - 1;
+        }
+
+        return mangled;
+}
+
+bool valid_home(const char *p) {
+        /* Decides whether 'p' is acceptable as a home directory. valid_shell() simply forwards to this
+         * function, so whatever is changed here changes the rules for shells too. */
+
+        /* Nothing to check if there is no string to speak of */
+        if (isempty(p))
+                return false;
+
+        /* Text sanity: proper UTF-8 and no control characters */
+        if (!utf8_is_valid(p) || string_has_cc(p, NULL))
+                return false;
+
+        /* Path sanity: absolute and already in normalized form */
+        if (!path_is_absolute(p))
+                return false;
+
+        if (!path_is_normalized(p))
+                return false;
+
+        /* A colon would end the field early, as it is the field separator */
+        if (strchr(p, ':') != NULL)
+                return false;
+
+        return true;
+}
+
+int maybe_setgroups(size_t size, const gid_t *list) {
+        /* setgroups() wrapper. A request to drop all auxiliary groups (size == 0) is first checked
+         * against /proc/self/setgroups and becomes a successful NOP if that file can be read and does
+         * not say "allow". Any other request goes to setgroups() unchecked. */
+        _cleanup_free_ char *policy = NULL;
+        int k;
+
+        /* Only the drop-everything case needs the check */
+        if (size > 0)
+                return RET_NERRNO(setgroups(size, list));
+
+        /* Failing to read the file for any reason but its absence is passed on as error */
+        k = read_one_line_file("/proc/self/setgroups", &policy);
+        if (k < 0 && k != -ENOENT)
+                return k;
+
+        /* Old kernels don't have /proc/self/setgroups, so assume we can use setgroups() if it is
+         * missing. If it is there, nothing but "allow" lets us proceed. */
+        if (k >= 0 && !streq(policy, "allow")) {
+                log_debug("Skipping setgroups(), /proc/self/setgroups is set to 'deny'");
+                return 0;
+        }
+
+        return RET_NERRNO(setgroups(size, list));
+}
+
+bool synthesize_nobody(void) {
+        /* Tells whether the "nobody" user/group shall be synthesized, which is the default. Creating the
+         * flag file /etc/systemd/dont-synthesize-nobody turns this off, for the benefit of legacy systems
+         * where the "nobody" names are assigned to IDs other than 65534.
+         *
+         * The answer is cached in a plain static variable, without any locking. Threads racing here may at
+         * worst both run the check, and both are expected to arrive at the same answer. */
+        static int cached = -1;
+
+        if (cached >= 0)
+                return cached;
+        /* access() failing, for whatever reason, is taken as "flag file not there" */
+        cached = access("/etc/systemd/dont-synthesize-nobody", F_OK) < 0;
+        return cached;
+}
+
+int putpwent_sane(const struct passwd *pw, FILE *stream) {
+        /* putpwent() wrapper: 0 on success, otherwise the value of errno_or_else(EIO) */
+        assert(pw);
+        assert(stream);
+
+        errno = 0; /* reset first, so that a stale value is never mistaken for the cause of a failure */
+        if (putpwent(pw, stream) == 0)
+                return 0;
+        return errno_or_else(EIO);
+}
+
+int putspent_sane(const struct spwd *sp, FILE *stream) {
+        /* putspent() wrapper: 0 on success, otherwise the value of errno_or_else(EIO) */
+        assert(sp);
+        assert(stream);
+
+        errno = 0; /* reset first, so that a stale value is never mistaken for the cause of a failure */
+        if (putspent(sp, stream) == 0)
+                return 0;
+        return errno_or_else(EIO);
+}
+
+int putgrent_sane(const struct group *gr, FILE *stream) {
+        /* putgrent() wrapper: 0 on success, otherwise the value of errno_or_else(EIO) */
+        assert(gr);
+        assert(stream);
+
+        errno = 0; /* reset first, so that a stale value is never mistaken for the cause of a failure */
+        if (putgrent(gr, stream) == 0)
+                return 0;
+        return errno_or_else(EIO);
+}
+
+#if ENABLE_GSHADOW
+int putsgent_sane(const struct sgrp *sg, FILE *stream) {
+        /* putsgent() wrapper: 0 on success, otherwise the value of errno_or_else(EIO) */
+        assert(sg);
+        assert(stream);
+
+        errno = 0; /* reset first, so that a stale value is never mistaken for the cause of a failure */
+        if (putsgent(sg, stream) == 0)
+                return 0;
+        return errno_or_else(EIO);
+}
+#endif
+
+int fgetpwent_sane(FILE *stream, struct passwd **pw) {
+        assert(stream);
+        assert(pw);
+
+        errno = 0; /* reset, since errno is what tells the ENOENT case from real errors below */
+        struct passwd *entry = fgetpwent(stream);
+        if (entry || errno == ENOENT) {
+                *pw = entry; /* NULL in the ENOENT case, which is passed on as "no entry", not as error */
+                return entry != NULL; /* 1: got an entry, 0: got none */
+        }
+        return errno_or_else(EIO);
+}
+
+int fgetspent_sane(FILE *stream, struct spwd **sp) {
+        assert(stream);
+        assert(sp);
+
+        errno = 0; /* reset, since errno is what tells the ENOENT case from real errors below */
+        struct spwd *entry = fgetspent(stream);
+        if (entry || errno == ENOENT) {
+                *sp = entry; /* NULL in the ENOENT case, which is passed on as "no entry", not as error */
+                return entry != NULL; /* 1: got an entry, 0: got none */
+        }
+        return errno_or_else(EIO);
+}
+
+int fgetgrent_sane(FILE *stream, struct group **gr) {
+        assert(stream);
+        assert(gr);
+
+        errno = 0; /* reset, since errno is what tells the ENOENT case from real errors below */
+        struct group *entry = fgetgrent(stream);
+        if (entry || errno == ENOENT) {
+                *gr = entry; /* NULL in the ENOENT case, which is passed on as "no entry", not as error */
+                return entry != NULL; /* 1: got an entry, 0: got none */
+        }
+        return errno_or_else(EIO);
+}
+
+#if ENABLE_GSHADOW
+int fgetsgent_sane(FILE *stream, struct sgrp **sg) {
+        assert(stream);
+        assert(sg);
+
+        errno = 0; /* reset, since errno is what tells the ENOENT case from real errors below */
+        struct sgrp *entry = fgetsgent(stream);
+        if (entry || errno == ENOENT) {
+                *sg = entry; /* NULL in the ENOENT case, which is passed on as "no entry", not as error */
+                return entry != NULL; /* 1: got an entry, 0: got none */
+        }
+        return errno_or_else(EIO);
+}
+#endif
+
+int is_this_me(const char *username) {
+        /* Tells whether 'username' (a user name or a numeric UID string) resolves to the real UID we
+         * are running as: 1 if so, 0 if not, negative if resolving it fails. */
+        uid_t resolved;
+        int k;
+
+        k = get_user_creds(&username, &resolved, NULL, NULL, NULL, USER_CREDS_ALLOW_MISSING);
+        if (k < 0)
+                return k;
+
+        return getuid() == resolved;
+}
+
+const char* get_home_root(void) {
+        /* Returns where we look for home directories: $SYSTEMD_HOME_ROOT if that is set to an absolute,
+         * normalized path (an override meant for debugging), "/home" otherwise. Nothing is allocated. */
+        const char *env = secure_getenv("SYSTEMD_HOME_ROOT");
+
+        if (!env || !path_is_absolute(env) || !path_is_normalized(env))
+                return "/home";
+
+        return env;
+}
diff --git a/src/basic/user-util.h b/src/basic/user-util.h
new file mode 100644
index 0000000..f394f62
--- /dev/null
+++ b/src/basic/user-util.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#if ENABLE_GSHADOW
+#  include 
+#endif
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Users managed by systemd-homed. See https://systemd.io/UIDS-GIDS for details how this range fits into the rest of the world */
+#define HOME_UID_MIN ((uid_t) 60001)
+#define HOME_UID_MAX ((uid_t) 60513)
+
+/* Users mapped from host into a container */
+#define MAP_UID_MIN ((uid_t) 60514)
+#define MAP_UID_MAX ((uid_t) 60577)
+
+bool uid_is_valid(uid_t uid);
+
+static inline bool gid_is_valid(gid_t gid) {
+        return uid_is_valid((uid_t) gid); /* GIDs are validated by the very same rules as UIDs */
+}
+
+int parse_uid(const char *s, uid_t* ret_uid);
+int parse_uid_range(const char *s, uid_t *ret_lower, uid_t *ret_upper);
+
+static inline int parse_gid(const char *s, gid_t *ret_gid) {
+        return parse_uid(s, (uid_t*) ret_gid); /* reuses the UID parser; assumes gid_t and uid_t share a representation */
+}
+
+char* getlogname_malloc(void);
+char* getusername_malloc(void);
+
+typedef enum UserCredsFlags {
+        USER_CREDS_PREFER_NSS    = 1 << 0,  /* if set, only synthesize user records if database lacks them. Normally we bypass the userdb entirely for the records we can synthesize */
+        USER_CREDS_ALLOW_MISSING = 1 << 1,  /* if a numeric UID or GID string is resolved, be OK if there's no record for it */
+        USER_CREDS_CLEAN         = 1 << 2,  /* return NULL instead of home/shell fields that are empty, invalid or not absolute paths (also for root-dir homes and nologin shells) */
+} UserCredsFlags;
+
+int get_user_creds(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell, UserCredsFlags flags);
+int get_group_creds(const char **groupname, gid_t *gid, UserCredsFlags flags);
+
+char* uid_to_name(uid_t uid);
+char* gid_to_name(gid_t gid);
+
+int in_gid(gid_t gid);
+int in_group(const char *name);
+
+int merge_gid_lists(const gid_t *list1, size_t size1, const gid_t *list2, size_t size2, gid_t **result);
+int getgroups_alloc(gid_t** gids);
+
+int get_home_dir(char **ret);
+int get_shell(char **ret);
+
+int reset_uid_gid(void);
+
+int take_etc_passwd_lock(const char *root);
+
+#define UID_INVALID ((uid_t) -1)
+#define GID_INVALID ((gid_t) -1)
+
+#define UID_NOBODY ((uid_t) 65534U)
+#define GID_NOBODY ((gid_t) 65534U)
+
+/* If REMOUNT_IDMAPPING_HOST_ROOT is set for remount_idmap() we'll include a mapping here that maps the host
+ * root user accessing the idmapped mount to this user ID on the backing fs. This is the last valid UID in
+ * the *signed* 32-bit range. You might wonder why precisely use this specific UID for this purpose? Well, we
+ * definitely cannot use the first 0…65536 UIDs for that, since in most cases that's precisely the file range
+ * we intend to map to some high UID range, and since UID mappings have to be bijective we thus cannot use
+ * them at all. Furthermore the UID range beyond INT32_MAX (i.e. the range above the signed 32-bit range) is
+ * icky, since many APIs cannot use it (example: setfsuid() returns the old UID as signed integer). Following
+ * our usual logic of assigning a 16-bit UID range to each container, so that the upper 16-bit of a 32-bit UID
+ * value indicate kind of a "container ID" and the lower 16-bit map directly to the intended user you can read
+ * this specific UID as the "nobody" user of the container with ID 0x7FFF, which is kinda nice. */
+#define UID_MAPPED_ROOT ((uid_t) (INT32_MAX-1))
+#define GID_MAPPED_ROOT ((gid_t) (INT32_MAX-1))
+
+#define ETC_PASSWD_LOCK_FILENAME ".pwd.lock"
+#define ETC_PASSWD_LOCK_PATH "/etc/" ETC_PASSWD_LOCK_FILENAME
+
+/* The following macros add 1 when converting things, since UID 0 is a valid UID, while the pointer
+ * NULL is special */
+#define PTR_TO_UID(p) ((uid_t) (((uintptr_t) (p))-1))
+#define UID_TO_PTR(u) ((void*) (((uintptr_t) (u))+1))
+
+#define PTR_TO_GID(p) ((gid_t) (((uintptr_t) (p))-1))
+#define GID_TO_PTR(u) ((void*) (((uintptr_t) (u))+1))
+
+static inline bool userns_supported(void) {
+        return access("/proc/self/uid_map", F_OK) >= 0; /* the mere existence of the file is the test; checked anew on every call */
+}
+
+typedef enum ValidUserFlags {
+        VALID_USER_RELAX         = 1 << 0,  /* apply the relaxed checks instead of the strict ones that are the default */
+        VALID_USER_WARN          = 1 << 1,  /* in relaxed mode, log a notice if an accepted name fails the strict checks */
+        VALID_USER_ALLOW_NUMERIC = 1 << 2,  /* accept strings that parse as a numeric UID (refused otherwise) */
+} ValidUserFlags;
+
+bool valid_user_group_name(const char *u, ValidUserFlags flags);
+bool valid_gecos(const char *d);
+char* mangle_gecos(const char *d);
+bool valid_home(const char *p);
+
+static inline bool valid_shell(const char *p) {
+        /* We have the same requirements as for home directories, so just piggy-back on valid_home().
+         *
+         * Let's ignore /etc/shells because this is only applicable to real and
+         * not system users. It is also incompatible with the idea of empty /etc.
+         */
+        return valid_home(p);
+}
+
+int maybe_setgroups(size_t size, const gid_t *list);
+
+bool synthesize_nobody(void);
+
+int fgetpwent_sane(FILE *stream, struct passwd **pw);
+int fgetspent_sane(FILE *stream, struct spwd **sp);
+int fgetgrent_sane(FILE *stream, struct group **gr);
+int putpwent_sane(const struct passwd *pw, FILE *stream);
+int putspent_sane(const struct spwd *sp, FILE *stream);
+int putgrent_sane(const struct group *gr, FILE *stream);
+#if ENABLE_GSHADOW
+int fgetsgent_sane(FILE *stream, struct sgrp **sg);
+int putsgent_sane(const struct sgrp *sg, FILE *stream);
+#endif
+
+bool is_nologin_shell(const char *shell);
+const char* default_root_shell_at(int rfd);
+const char* default_root_shell(const char *root);
+
+int is_this_me(const char *username);
+
+const char* get_home_root(void);
+
+static inline bool hashed_password_is_locked_or_invalid(const char *password) {
+        return password && password[0] != '$'; /* NULL yields false; any string not starting with '$' (the empty one included) yields true */
+}
+
+/* A locked *and* invalid password for "struct spwd"'s .sp_pwdp and "struct passwd"'s .pw_passwd field */
+#define PASSWORD_LOCKED_AND_INVALID "!*"
+
+/* A password indicating "look in shadow file, please!" for "struct passwd"'s .pw_passwd */
+#define PASSWORD_SEE_SHADOW "x"
+
+/* A password indicating "hey, no password required for login" */
+#define PASSWORD_NONE ""
+
+/* Used by sysusers to indicate that the password should be filled in by firstboot.
+ * Also see https://github.com/systemd/systemd/pull/24680#pullrequestreview-1439464325.
+ */
+#define PASSWORD_UNPROVISIONED "!unprovisioned"
diff --git a/src/basic/utf8.c b/src/basic/utf8.c
new file mode 100644
index 0000000..36e1e0f
--- /dev/null
+++ b/src/basic/utf8.c
@@ -0,0 +1,630 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* Parts of this file are based on the GLIB utf8 validation functions. The
+ * original license text follows. */
+
+/* gutf8.c - Operations on UTF-8 strings.
+ *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "gunicode.h"
+#include "hexdecoct.h"
+#include "macro.h"
+#include "string-util.h"
+#include "utf8.h"
+
+bool unichar_is_valid(char32_t ch) {
+        /* Tells whether 'ch' is a code point we accept. Refused are values past the end of the Unicode
+         * space, the area reserved for UTF-16, the reserved block 0xFDD0…0xFDEF, and every value whose
+         * low 16 bits read 0xFFFE or 0xFFFF. */
+
+        if (ch >= 0x110000 || (ch & 0xFFFFF800) == 0xD800) /* Out of range, or reserved for UTF-16 */
+                return false;
+
+        if ((ch >= 0xFDD0 && ch <= 0xFDEF) || (ch & 0xFFFE) == 0xFFFE) /* Reserved values */
+                return false;
+
+        return true;
+}
+
+static bool unichar_is_control(char32_t ch) {
+
+        /*
+          Everything below ' ' is in the C0 range and counts as control character, with the exception
+          of '\t' and '\n', which are let through. DEL (0x7F) and the C1 range that follows it (up to
+          and including 0x9F) count as control characters without exception.
+        */
+        if (ch < ' ')
+                return !IN_SET(ch, '\t', '\n');
+        return ch >= 0x7F && ch <= 0x9F;
+}
+
+/* number of bytes used to encode one unicode char, judging by its first byte (0 if that is no valid first byte) */
+static size_t utf8_encoded_expected_len(uint8_t c) {
+        /* One row per sequence length: 'c' is a lead byte of that length if its masked bits match */
+        static const struct { uint8_t mask, lead; } table[] = {
+                { 0x80, 0x00 }, /* 0xxxxxxx: 1 byte */
+                { 0xe0, 0xc0 }, /* 110xxxxx: 2 bytes */
+                { 0xf0, 0xe0 }, /* 1110xxxx: 3 bytes */
+                { 0xf8, 0xf0 }, /* 11110xxx: 4 bytes */
+                { 0xfc, 0xf8 }, /* 111110xx: 5 bytes */
+                { 0xfe, 0xfc }, /* 1111110x: 6 bytes */
+        };
+
+        for (size_t n = 0; n < sizeof(table) / sizeof(table[0]); n++)
+                if ((c & table[n].mask) == table[n].lead)
+                        return n + 1;
+        return 0; /* 10xxxxxx, 0xfe or 0xff: cannot start a sequence */
+}
+
+/* Decode one unicode char: stores the value in *ret_unichar and returns the number of bytes consumed, or -EINVAL. */
+int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) {
+        char32_t unichar;
+        size_t len;
+
+        assert(str);
+
+        len = utf8_encoded_expected_len(str[0]); /* sequence length implied by the lead byte, 0 if invalid */
+
+        switch (len) { /* extract the payload bits of the lead byte; longer forms carry fewer of them */
+        case 1:
+                *ret_unichar = (char32_t)str[0];
+                return 1;
+        case 2:
+                unichar = str[0] & 0x1f;
+                break;
+        case 3:
+                unichar = (char32_t)str[0] & 0x0f;
+                break;
+        case 4:
+                unichar = (char32_t)str[0] & 0x07;
+                break;
+        case 5:
+                unichar = (char32_t)str[0] & 0x03;
+                break;
+        case 6:
+                unichar = (char32_t)str[0] & 0x01;
+                break;
+        default:
+                return -EINVAL; /* str[0] cannot start a sequence */
+        }
+
+        for (size_t i = 1; i < len; i++) {
+                if (((char32_t)str[i] & 0xc0) != 0x80) /* every continuation byte must look like 10xxxxxx */
+                        return -EINVAL;
+
+                unichar <<= 6; /* make room for the next six payload bits */
+                unichar |= (char32_t)str[i] & 0x3f;
+        }
+        /* Note: overlong forms and out-of-range values are not rejected here; utf8_encoded_valid_unichar() does that. */
+        *ret_unichar = unichar;
+        return len;
+}
+
+bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newline) {
+        assert(str);
+        /* True iff the first length bytes are valid UTF-8 free of control characters (and of '\n' unless allow_newline). */
+        for (const char *p = str; length > 0;) {
+                int encoded_len;
+                char32_t val;
+
+                encoded_len = utf8_encoded_valid_unichar(p, length);
+                if (encoded_len < 0)
+                        return false;
+                assert(encoded_len > 0 && (size_t) encoded_len <= length);
+
+                if (utf8_encoded_to_unichar(p, &val) < 0 ||
+                    unichar_is_control(val) ||
+                    (!allow_newline && val == '\n'))
+                        return false;
+
+                length -= encoded_len; /* advance past the character just checked */
+                p += encoded_len;
+        }
+
+        return true;
+}
+
+char *utf8_is_valid_n(const char *str, size_t len_bytes) {
+        /* Check if the string is composed of valid utf8 characters. If length len_bytes is given (i.e. is
+         * not SIZE_MAX), stop after len_bytes. Otherwise, stop at NUL. Returns str if valid, NULL if not. */
+
+        assert(str);
+
+        for (const char *p = str; len_bytes != SIZE_MAX ? (size_t) (p - str) < len_bytes : *p != '\0'; ) {
+                int len;
+
+                if (_unlikely_(*p == '\0') && len_bytes != SIZE_MAX)
+                        return NULL; /* embedded NUL */
+
+                len = utf8_encoded_valid_unichar(p, /* second argument: bytes remaining, if bounded */
+                                                 len_bytes != SIZE_MAX ? len_bytes - (p - str) : SIZE_MAX);
+                if (_unlikely_(len < 0))
+                        return NULL; /* invalid character */
+
+                p += len;
+        }
+
+        return (char*) str;
+}
+
+char *utf8_escape_invalid(const char *str) {
+        char *p, *s;
+        /* Copies str into a new buffer, replacing each byte that does not start a valid UTF-8 character by UTF8_REPLACEMENT_CHARACTER. NULL if malloc() fails. */
+        assert(str);
+
+        p = s = malloc(strlen(str) * 4 + 1); /* generous bound: an input byte expands to at most the 3-byte replacement */
+        if (!p)
+                return NULL;
+
+        while (*str) {
+                int len;
+
+                len = utf8_encoded_valid_unichar(str, SIZE_MAX);
+                if (len > 0) {
+                        s = mempcpy(s, str, len); /* valid character: copy verbatim */
+                        str += len;
+                } else {
+                        s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
+                        str += 1; /* skip just the offending byte, then retry at the next one */
+                }
+        }
+
+        *s = '\0';
+        return str_realloc(p); /* NOTE(review): presumably shrinks the allocation to the string length — confirm in str_realloc() */
+}
+
+static int utf8_char_console_width(const char *str) {
+        char32_t c;
+        int r;
+        /* Width in console cells (1 or 2) of the character at str, or the negative error from decoding it. */
+        r = utf8_encoded_to_unichar(str, &c);
+        if (r < 0)
+                return r;
+
+        /* TODO: we should detect combining characters */
+
+        return unichar_iswide(c) ? 2 : 1;
+}
+
+char *utf8_escape_non_printable_full(const char *str, size_t console_width, bool force_ellipsis) {
+        char *p, *s, *prev_s;
+        size_t n = 0; /* estimated print width */
+        /* Copies str within console_width cells: printable chars are kept, other valid ones become "\xNN" per byte, invalid bytes the replacement char. */
+        assert(str);
+        /* An ellipsis is appended if the output had to be cut short, or always if force_ellipsis is set. Returns NULL if allocation fails. */
+        if (console_width == 0)
+                return strdup("");
+        /* Worst case is four output bytes per input byte, plus the ellipsis force_ellipsis appends after a fully copied string. */
+        p = s = prev_s = malloc(strlen(str) * 4 + strlen("…") + 1);
+        if (!p)
+                return NULL;
+
+        for (;;) {
+                int len;
+                char *saved_s = s;
+
+                if (!*str) { /* done! */
+                        if (force_ellipsis)
+                                goto truncation;
+                        else
+                                goto finish;
+                }
+
+                len = utf8_encoded_valid_unichar(str, SIZE_MAX);
+                if (len > 0) {
+                        if (utf8_is_printable(str, len)) {
+                                int w;
+
+                                w = utf8_char_console_width(str);
+                                assert(w >= 0);
+                                if (n + w > console_width)
+                                        goto truncation;
+
+                                s = mempcpy(s, str, len);
+                                str += len;
+                                n += w;
+
+                        } else {
+                                for (; len > 0; len--) { /* valid but unprintable: hex-escape each of its bytes */
+                                        if (n + 4 > console_width)
+                                                goto truncation;
+
+                                        *(s++) = '\\';
+                                        *(s++) = 'x';
+                                        *(s++) = hexchar((int) *str >> 4);
+                                        *(s++) = hexchar((int) *str);
+
+                                        str += 1;
+                                        n += 4;
+                                }
+                        }
+                } else {
+                        if (n + 1 > console_width)
+                                goto truncation;
+
+                        s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER));
+                        str += 1;
+                        n += 1;
+                }
+
+                prev_s = saved_s; /* start of the item just written, so that truncation can drop it again */
+        }
+
+ truncation:
+        /* Try to go back one if we don't have enough space for the ellipsis */
+        if (n + 1 > console_width)
+                s = prev_s;
+
+        s = mempcpy(s, "…", strlen("…"));
+
+ finish:
+        *s = '\0';
+        return str_realloc(p);
+}
+
+char *ascii_is_valid(const char *str) {
+        /* Returns str itself if every byte up to the terminating NUL is 7-bit ASCII
+         * (0…127), and NULL as soon as a byte with the high bit set is found. */
+        const unsigned char *cursor;
+
+        assert(str);
+
+        for (cursor = (const unsigned char*) str; *cursor != 0; cursor++)
+                if (*cursor > 127)
+                        return NULL;
+        return (char*) str;
+}
+
+char *ascii_is_valid_n(const char *str, size_t len) {
+        /* Like ascii_is_valid(), but inspects exactly len bytes and additionally treats a NUL
+         * byte inside that range as invalid. Returns str on success, NULL otherwise. */
+        const unsigned char *bytes = (const unsigned char*) str;
+
+        assert(str);
+
+        for (; len > 0; len--, bytes++)
+                if (*bytes == 0 || *bytes > 127)
+                        return NULL;
+        return (char*) str;
+}
+
+int utf8_to_ascii(const char *str, char replacement_char, char **ret) {
+        /* Convert to a string that has only ASCII chars, replacing anything that is not ASCII
+         * by replacement_char. Returns 0 with the new string in *ret, or a negative errno-style error. */
+
+        _cleanup_free_ char *ans = new(char, strlen(str) + 1);
+        if (!ans)
+                return -ENOMEM;
+
+        char *q = ans;
+
+        for (const char *p = str; *p; q++) { /* exactly one output byte per input character */
+                int l;
+
+                l = utf8_encoded_valid_unichar(p, SIZE_MAX);
+                if (l < 0)  /* Non-UTF-8, let's not even try to propagate the garbage */
+                        return l;
+
+                if (l == 1)
+                        *q = *p;
+                else
+                        /* non-ASCII, we need to replace it */
+                        *q = replacement_char;
+
+                p += l;
+        }
+        *q = '\0';
+
+        *ret = TAKE_PTR(ans);
+        return 0;
+}
+
+/**
+ * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
+ * @out_utf8: output buffer of at least 4 bytes or NULL
+ * @g: UCS-4 character to encode
+ *
+ * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
+ * The length of the character is returned. It is not zero-terminated! If the
+ * output buffer is NULL, only the length is returned.
+ *
+ * Returns: The length in bytes that the UTF-8 representation does or would
+ *          occupy, or 0 (nothing written) if @g is 1 << 21 or larger.
+ */
+size_t utf8_encode_unichar(char *out_utf8, char32_t g) {
+        /* No validity check is applied to g here: any value below 1 << 21 gets encoded. */
+        if (g < (1 << 7)) {
+                if (out_utf8)
+                        out_utf8[0] = g & 0x7f;
+                return 1;
+        } else if (g < (1 << 11)) {
+                if (out_utf8) {
+                        out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
+                        out_utf8[1] = 0x80 | (g & 0x3f);
+                }
+                return 2;
+        } else if (g < (1 << 16)) {
+                if (out_utf8) {
+                        out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
+                        out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
+                        out_utf8[2] = 0x80 | (g & 0x3f);
+                }
+                return 3;
+        } else if (g < (1 << 21)) {
+                if (out_utf8) {
+                        out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
+                        out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
+                        out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
+                        out_utf8[3] = 0x80 | (g & 0x3f);
+                }
+                return 4;
+        }
+
+        return 0;
+}
+
+char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! */) {
+        const uint8_t *f;
+        char *r, *t;
+        /* Converts little-endian UTF-16 to a newly allocated UTF-8 string. length == SIZE_MAX means s is zero-terminated. NULL on overflow or allocation failure. */
+        if (length == 0)
+                return new0(char, 1);
+
+        assert(s);
+
+        if (length == SIZE_MAX) {
+                length = char16_strlen(s); /* counted in 16-bit words, converted to bytes below */
+
+                if (length > SIZE_MAX/2)
+                        return NULL; /* overflow */
+
+                length *= 2;
+        }
+
+        /* Input length is in bytes, i.e. the shortest possible character takes 2 bytes. Each unicode character may
+         * take up to 4 bytes in UTF-8. Let's also account for a trailing NUL byte. */
+        if (length > (SIZE_MAX - 1) / 2)
+                return NULL; /* overflow */
+
+        r = new(char, length * 2 + 1);
+        if (!r)
+                return NULL;
+
+        f = (const uint8_t*) s;
+        t = r;
+
+        while (f + 1 < (const uint8_t*) s + length) { /* needs two more bytes; a trailing odd byte is ignored */
+                char16_t w1, w2;
+
+                /* see RFC 2781 section 2.2 */
+
+                w1 = f[1] << 8 | f[0]; /* assemble the word byte-wise, low byte first */
+                f += 2;
+
+                if (!utf16_is_surrogate(w1)) {
+                        t += utf8_encode_unichar(t, w1);
+                        continue;
+                }
+
+                if (utf16_is_trailing_surrogate(w1))
+                        continue; /* spurious trailing surrogate, ignore */
+
+                if (f + 1 >= (const uint8_t*) s + length)
+                        break;
+
+                w2 = f[1] << 8 | f[0];
+                f += 2;
+
+                if (!utf16_is_trailing_surrogate(w2)) {
+                        f -= 2; /* put w2 back, it is examined again as a character of its own */
+                        continue; /* surrogate missing its trailing surrogate, ignore */
+                }
+
+                t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
+        }
+
+        *t = 0;
+        return r;
+}
+
+size_t utf16_encode_unichar(char16_t *out, char32_t c) {
+        /* Stores c at out as one or two little-endian UTF-16 code units and returns how many
+         * units were written. Nothing is written and 0 is returned for values that fall into
+         * the surrogate range or lie above 0x10ffff. */
+
+        if (c >= 0xd800U && c <= 0xdfffU)
+                return 0; /* a surrogate (invalid) */
+
+        if (c <= 0xffffU) {
+                out[0] = htole16(c);
+                return 1;
+        }
+
+        if (c > 0x10ffffU)
+                return 0; /* beyond what a surrogate pair can express */
+
+        c -= 0x10000U;
+        out[0] = htole16((c >> 10) + 0xd800U);
+        out[1] = htole16((c & 0x3ffU) + 0xdc00U);
+        return 2;
+}
+
+char16_t *utf8_to_utf16(const char *s, size_t length) {
+        char16_t *n, *p;
+        int r;
+        /* Converts length bytes of UTF-8 (SIZE_MAX: up to NUL) to a new zero-terminated little-endian UTF-16 string. NULL on overflow or allocation failure. */
+        if (length == 0)
+                return new0(char16_t, 1);
+
+        assert(s);
+
+        if (length == SIZE_MAX)
+                length = strlen(s);
+
+        if (length > SIZE_MAX - 1)
+                return NULL; /* overflow */
+
+        n = new(char16_t, length + 1);
+        if (!n)
+                return NULL;
+
+        p = n;
+
+        for (size_t i = 0; i < length;) {
+                char32_t unichar;
+                size_t e;
+
+                e = utf8_encoded_expected_len(s[i]);
+                if (e <= 1) /* Invalid and single byte characters are copied as they are */
+                        goto copy;
+
+                if (i + e > length) /* sequence longer than input buffer, then copy as-is */
+                        goto copy;
+
+                r = utf8_encoded_to_unichar(s + i, &unichar);
+                if (r < 0) /* sequence invalid, then copy as-is */
+                        goto copy;
+
+                p += utf16_encode_unichar(p, unichar); /* writes nothing if unichar is not representable */
+                i += e;
+                continue;
+
+        copy: /* go through uint8_t, so that bytes >= 0x80 are not sign-extended where char is signed */
+                *(p++) = htole16((uint8_t) s[i++]);
+        }
+
+        *p = 0;
+        return n;
+}
+
+size_t char16_strlen(const char16_t *s) {
+        const char16_t *end;
+
+        assert(s);
+
+        /* Walk forward to the terminating zero word; the distance is the length in 16-bit words. */
+        for (end = s; *end != 0; end++)
+                ;
+        return (size_t) (end - s);
+}
+
+/* Number of bytes the shortest UTF-8 form (1…6 byte scheme) of unichar occupies */
+static int utf8_unichar_to_encoded_len(char32_t unichar) {
+        /* Exclusive upper bounds of the values that fit into 1, 2, 3, 4 and 5 bytes,
+         * in that order. */
+        static const char32_t limit[] = {
+                0x80, 0x800, 0x10000, 0x200000, 0x4000000,
+        };
+        int n_bytes = 1;
+
+        for (size_t i = 0; i < sizeof(limit) / sizeof(limit[0]); i++, n_bytes++)
+                if (unichar < limit[i])
+                        break;
+
+        /* Everything at or above the last bound needs the 6-byte form. */
+        return n_bytes;
+}
+
+/* validate one encoded unicode char and return its length in bytes, or a negative errno-style error if invalid */
+int utf8_encoded_valid_unichar(const char *str, size_t length /* bytes */) {
+        char32_t unichar;
+        size_t len;
+        int r;
+
+        assert(str);
+        assert(length > 0);
+
+        /* We read until NUL, at most length bytes. SIZE_MAX may be used to disable the length check. */
+
+        len = utf8_encoded_expected_len(str[0]);
+        if (len == 0)
+                return -EINVAL;
+
+        /* Do we have a truncated multi-byte character? */
+        if (len > length)
+                return -EINVAL;
+
+        /* ascii is valid */
+        if (len == 1)
+                return 1;
+
+        /* check if expected encoded chars are available */
+        for (size_t i = 0; i < len; i++)
+                if ((str[i] & 0x80) != 0x80) /* also catches a NUL that cuts the sequence short */
+                        return -EINVAL;
+
+        r = utf8_encoded_to_unichar(str, &unichar);
+        if (r < 0)
+                return r;
+
+        /* check if encoded length matches encoded value */
+        if (utf8_unichar_to_encoded_len(unichar) != (int) len) /* mismatch means an overlong encoding */
+                return -EINVAL;
+
+        /* check if value has valid range */
+        if (!unichar_is_valid(unichar))
+                return -EINVAL;
+
+        return (int) len;
+}
+
+size_t utf8_n_codepoints(const char *str) {
+        size_t count;
+
+        /* Counts the code points of a NUL-terminated string by stepping over one validated
+         * UTF-8 sequence at a time. Yields SIZE_MAX as soon as an invalid sequence is
+         * encountered. */
+
+        for (count = 0; *str != 0; count++) {
+                int step = utf8_encoded_valid_unichar(str, SIZE_MAX);
+
+                if (step < 0)
+                        return SIZE_MAX;
+
+                str += step;
+        }
+
+        return count;
+}
+
+size_t utf8_console_width(const char *str) {
+        size_t n = 0;
+
+        /* Returns the approximate width a string will take on screen when printed on a character cell
+         * terminal/console, or SIZE_MAX if one of its characters cannot be decoded. */
+
+        while (*str) {
+                int w;
+
+                w = utf8_char_console_width(str); /* 1 or 2 cells, negative on decoding error */
+                if (w < 0)
+                        return SIZE_MAX;
+
+                n += w;
+                str = utf8_next_char(str);
+        }
+
+        return n;
+}
diff --git a/src/basic/utf8.h b/src/basic/utf8.h
new file mode 100644
index 0000000..962312c
--- /dev/null
+++ b/src/basic/utf8.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include "macro.h"
+#include "missing_type.h"
+
+#define UTF8_REPLACEMENT_CHARACTER "\xef\xbf\xbd"
+#define UTF8_BYTE_ORDER_MARK "\xef\xbb\xbf"
+
+bool unichar_is_valid(char32_t c);
+
+char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_;
+static inline char *utf8_is_valid(const char *s) {
+        return utf8_is_valid_n(s, SIZE_MAX); /* SIZE_MAX: no length limit, validate up to the terminating NUL */
+}
+char *ascii_is_valid(const char *s) _pure_;
+char *ascii_is_valid_n(const char *str, size_t len);
+
+int utf8_to_ascii(const char *str, char replacement_char, char **ret);
+
+bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newline) _pure_;
+#define utf8_is_printable(str, length) utf8_is_printable_newline(str, length, true)
+
+char *utf8_escape_invalid(const char *s);
+char *utf8_escape_non_printable_full(const char *str, size_t console_width, bool force_ellipsis);
+static inline char *utf8_escape_non_printable(const char *str) {
+        return utf8_escape_non_printable_full(str, SIZE_MAX, false); /* no width limit, no forced ellipsis */
+}
+
+size_t utf8_encode_unichar(char *out_utf8, char32_t g);
+size_t utf16_encode_unichar(char16_t *out, char32_t c);
+
+char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! */);
+char16_t *utf8_to_utf16(const char *s, size_t length);
+
+size_t char16_strlen(const char16_t *s); /* returns the number of 16-bit words in the string (not bytes!) */
+
+int utf8_encoded_valid_unichar(const char *str, size_t length);
+int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar);
+
+static inline bool utf16_is_surrogate(char16_t c) {
+        return (c & 0xf800U) == 0xd800U; /* 0xd800…0xdfff: any surrogate, leading or trailing */
+}
+
+static inline bool utf16_is_trailing_surrogate(char16_t c) {
+        return (c & 0xfc00U) == 0xdc00U; /* 0xdc00…0xdfff: second half of a surrogate pair */
+}
+
+static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail) { /* no check that the inputs are surrogates */
+        return ((((char32_t) lead - 0xd800U) << 10) + ((char32_t) trail - 0xdc00U) + 0x10000U); /* lead offset is shifted up by 10 bits */
+}
+
+size_t utf8_n_codepoints(const char *str);
+size_t utf8_console_width(const char *str);
diff --git a/src/basic/virt.c b/src/basic/virt.c
new file mode 100644
index 0000000..93ccfaa
--- /dev/null
+++ b/src/basic/virt.c
@@ -0,0 +1,1071 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if defined(__i386__) || defined(__x86_64__)
+#include 
+#endif
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "cgroup-util.h"
+#include "dirent-util.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "macro.h"
+#include "missing_threads.h"
+#include "process-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "virt.h"
+
+enum { /* results of detect_vm_smbios() */
+      SMBIOS_VM_BIT_SET,     /* the SMBIOS BIOS characteristics flag a virtual machine */
+      SMBIOS_VM_BIT_UNSET,   /* the table was read and the flag is clear */
+      SMBIOS_VM_BIT_UNKNOWN, /* the table could not be read or is too short to contain the flag */
+};
+
+static Virtualization detect_vm_cpuid(void) {
+        /* Identifies the hypervisor via CPUID: a known id from vm_table[], VIRTUALIZATION_VM_OTHER if unrecognized, VIRTUALIZATION_NONE otherwise. */
+        /* CPUID is an x86 specific interface. */
+#if defined(__i386__) || defined(__x86_64__)
+
+        static const struct {
+                const char sig[13];
+                Virtualization id;
+        } vm_table[] = {
+                { "XenVMMXenVMM", VIRTUALIZATION_XEN       },
+                { "KVMKVMKVM",    VIRTUALIZATION_KVM       }, /* qemu with KVM */
+                { "Linux KVM Hv", VIRTUALIZATION_KVM       }, /* qemu with KVM + HyperV Enlightenments */
+                { "TCGTCGTCGTCG", VIRTUALIZATION_QEMU      }, /* qemu without KVM */
+                /* http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1009458 */
+                { "VMwareVMware", VIRTUALIZATION_VMWARE    },
+                /* https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs */
+                { "Microsoft Hv", VIRTUALIZATION_MICROSOFT },
+                /* https://wiki.freebsd.org/bhyve */
+                { "bhyve bhyve ", VIRTUALIZATION_BHYVE     },
+                { "QNXQVMBSQG",   VIRTUALIZATION_QNX       },
+                /* https://projectacrn.org */
+                { "ACRNACRNACRN", VIRTUALIZATION_ACRN      },
+                /* https://www.lockheedmartin.com/en-us/products/Hardened-Security-for-Intel-Processors.html */
+                { "SRESRESRESRE", VIRTUALIZATION_SRE       },
+                { "Apple VZ",     VIRTUALIZATION_APPLE     },
+        };
+
+        uint32_t eax, ebx, ecx, edx;
+        bool hypervisor;
+
+        /* http://lwn.net/Articles/301888/ */
+
+        /* First detect whether there is a hypervisor */
+        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
+                return VIRTUALIZATION_NONE;
+
+        hypervisor = ecx & 0x80000000U; /* top bit of ECX of CPUID leaf 1 */
+
+        if (hypervisor) {
+                union {
+                        uint32_t sig32[3];
+                        char text[13]; /* twelve signature bytes; the 13th stays NUL thanks to the zero-initialization */
+                } sig = {};
+
+                /* There is a hypervisor, see what it is */
+                __cpuid(0x40000000U, eax, ebx, ecx, edx);
+
+                sig.sig32[0] = ebx;
+                sig.sig32[1] = ecx;
+                sig.sig32[2] = edx;
+
+                log_debug("Virtualization found, CPUID=%s", sig.text);
+
+                for (size_t i = 0; i < ELEMENTSOF(vm_table); i++)
+                        if (memcmp_nn(sig.text, sizeof(sig.text), /* compares all 13 bytes of both buffers */
+                                      vm_table[i].sig, sizeof(vm_table[i].sig)) == 0)
+                                return vm_table[i].id;
+
+                log_debug("Unknown virtualization with CPUID=%s. Add to vm_table[]?", sig.text);
+                return VIRTUALIZATION_VM_OTHER;
+        }
+#endif
+        log_debug("No virtualization found in CPUID");
+
+        return VIRTUALIZATION_NONE;
+}
+
+static Virtualization detect_vm_device_tree(void) {
+#if defined(__arm__) || defined(__aarch64__) || defined(__powerpc__) || defined(__powerpc64__) || defined(__riscv)
+        _cleanup_free_ char *hvtype = NULL;
+        int r;
+        /* Looks for hypervisor hints below /proc/device-tree. Returns a Virtualization id, VIRTUALIZATION_NONE, or a negative errno-style error. */
+        r = read_one_line_file("/proc/device-tree/hypervisor/compatible", &hvtype);
+        if (r == -ENOENT) { /* no hypervisor node: fall back to indirect indicators */
+                _cleanup_closedir_ DIR *dir = NULL;
+                _cleanup_free_ char *compat = NULL;
+
+                if (access("/proc/device-tree/ibm,partition-name", F_OK) == 0 &&
+                    access("/proc/device-tree/hmc-managed?", F_OK) == 0 &&
+                    access("/proc/device-tree/chosen/qemu,graphic-width", F_OK) != 0)
+                        return VIRTUALIZATION_POWERVM;
+
+                dir = opendir("/proc/device-tree");
+                if (!dir) {
+                        if (errno == ENOENT) {
+                                log_debug_errno(errno, "/proc/device-tree: %m");
+                                return VIRTUALIZATION_NONE;
+                        }
+                        return -errno;
+                }
+
+                FOREACH_DIRENT(de, dir, return -errno)
+                        if (strstr(de->d_name, "fw-cfg")) { /* any top-level entry with "fw-cfg" in its name counts as QEMU */
+                                log_debug("Virtualization QEMU: \"fw-cfg\" present in /proc/device-tree/%s", de->d_name);
+                                return VIRTUALIZATION_QEMU;
+                        }
+
+                r = read_one_line_file("/proc/device-tree/compatible", &compat);
+                if (r < 0 && r != -ENOENT)
+                        return r;
+                if (r >= 0 && streq(compat, "qemu,pseries")) {
+                        log_debug("Virtualization %s found in /proc/device-tree/compatible", compat);
+                        return VIRTUALIZATION_QEMU;
+                }
+
+                log_debug("No virtualization found in /proc/device-tree/*");
+                return VIRTUALIZATION_NONE;
+        } else if (r < 0)
+                return r;
+
+        log_debug("Virtualization %s found in /proc/device-tree/hypervisor/compatible", hvtype);
+        if (streq(hvtype, "linux,kvm")) /* exact match for KVM, substring match for the others */
+                return VIRTUALIZATION_KVM;
+        else if (strstr(hvtype, "xen"))
+                return VIRTUALIZATION_XEN;
+        else if (strstr(hvtype, "vmware"))
+                return VIRTUALIZATION_VMWARE;
+        else
+                return VIRTUALIZATION_VM_OTHER;
+#else
+        log_debug("This platform does not support /proc/device-tree");
+        return VIRTUALIZATION_NONE;
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || defined(__riscv)
+static Virtualization detect_vm_dmi_vendor(void) {
+        static const char* const dmi_vendors[] = { /* sysfs DMI attributes to inspect, in this order */
+                "/sys/class/dmi/id/product_name", /* Test this before sys_vendor to detect KVM over QEMU */
+                "/sys/class/dmi/id/sys_vendor",
+                "/sys/class/dmi/id/board_vendor",
+                "/sys/class/dmi/id/bios_vendor",
+                "/sys/class/dmi/id/product_version", /* For Hyper-V VMs test */
+                NULL
+        };
+        /* Prefixes of DMI attribute values and the virtualization each one indicates; earlier entries win. */
+        static const struct {
+                const char *vendor;
+                Virtualization id;
+        } dmi_vendor_table[] = {
+                { "KVM",                   VIRTUALIZATION_KVM       },
+                { "OpenStack",             VIRTUALIZATION_KVM       }, /* Detect OpenStack instance as KVM in non x86 architecture */
+                { "KubeVirt",              VIRTUALIZATION_KVM       }, /* Detect KubeVirt instance as KVM in non x86 architecture */
+                { "Amazon EC2",            VIRTUALIZATION_AMAZON    },
+                { "QEMU",                  VIRTUALIZATION_QEMU      },
+                { "VMware",                VIRTUALIZATION_VMWARE    }, /* https://kb.vmware.com/s/article/1009458 */
+                { "VMW",                   VIRTUALIZATION_VMWARE    },
+                { "innotek GmbH",          VIRTUALIZATION_ORACLE    },
+                { "VirtualBox",            VIRTUALIZATION_ORACLE    },
+                { "Xen",                   VIRTUALIZATION_XEN       },
+                { "Bochs",                 VIRTUALIZATION_BOCHS     },
+                { "Parallels",             VIRTUALIZATION_PARALLELS },
+                /* https://wiki.freebsd.org/bhyve */
+                { "BHYVE",                 VIRTUALIZATION_BHYVE     },
+                { "Hyper-V",               VIRTUALIZATION_MICROSOFT },
+                { "Apple Virtualization",  VIRTUALIZATION_APPLE     },
+                { "Google Compute Engine", VIRTUALIZATION_GOOGLE    }, /* https://cloud.google.com/run/docs/container-contract#sandbox */
+        };
+        int r;
+        /* Returns the id of the first prefix match, VIRTUALIZATION_NONE if nothing matches, or a negative error if reading a file fails. */
+        STRV_FOREACH(vendor, dmi_vendors) {
+                _cleanup_free_ char *s = NULL;
+
+                r = read_one_line_file(*vendor, &s);
+                if (r < 0) {
+                        if (r == -ENOENT) /* a missing attribute is not an error, try the next one */
+                                continue;
+
+                        return r;
+                }
+
+                for (size_t i = 0; i < ELEMENTSOF(dmi_vendor_table); i++)
+                        if (startswith(s, dmi_vendor_table[i].vendor)) {
+                                log_debug("Virtualization %s found in DMI (%s)", s, *vendor);
+                                return dmi_vendor_table[i].id;
+                        }
+        }
+        log_debug("No virtualization found in DMI vendor table.");
+        return VIRTUALIZATION_NONE;
+}
+
+static int detect_vm_smbios(void) { /* returns one of the SMBIOS_VM_BIT_* values, never an error */
+        /* The SMBIOS BIOS Characteristics Extension Byte 2 (Section 2.1.2.2 of
+         * https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.4.0.pdf), specifies that
+         * the 4th bit being set indicates a VM. The BIOS Characteristics table is exposed via the kernel in
+         * /sys/firmware/dmi/entries/0-0. Note that in the general case, this bit being unset should not
+         * imply that the system is running on bare-metal.  For example, QEMU 3.1.0 (with or without KVM)
+         * with SeaBIOS does not set this bit. */
+        _cleanup_free_ char *s = NULL;
+        size_t readsize;
+        int r;
+
+        r = read_full_virtual_file("/sys/firmware/dmi/entries/0-0/raw", &s, &readsize);
+        if (r < 0) {
+                log_debug_errno(r, "Unable to read /sys/firmware/dmi/entries/0-0/raw, "
+                                "using the virtualization information found in DMI vendor table, ignoring: %m");
+                return SMBIOS_VM_BIT_UNKNOWN;
+        }
+        if (readsize < 20 || (uint8_t) s[1] < 20) {
+                /* The spec indicates that byte 1 contains the size of the table, 0x12 + the number of
+                 * extension bytes. The data we're interested in is in extension byte 2, which would be at
+                 * 0x13. If we didn't read that much data, or if the BIOS indicates that we don't have that
+                 * much data, we don't infer anything from the SMBIOS. The size byte is compared as unsigned. */
+                log_debug("Only read %zu bytes from /sys/firmware/dmi/entries/0-0/raw (expected 20). "
+                          "Using the virtualization information found in DMI vendor table.", readsize);
+                return SMBIOS_VM_BIT_UNKNOWN;
+        }
+
+        uint8_t byte = (uint8_t) s[19];
+        if (byte & (1U<<4)) {
+                log_debug("DMI BIOS Extension table indicates virtualization.");
+                return SMBIOS_VM_BIT_SET;
+        }
+        log_debug("DMI BIOS Extension table does not indicate virtualization.");
+        return SMBIOS_VM_BIT_UNSET;
+}
+#endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || defined(__riscv) */
+
+static Virtualization detect_vm_dmi(void) {
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || defined(__riscv)
+        /* Combines the DMI vendor strings with the SMBIOS VM bit. May return a negative error from detect_vm_dmi_vendor(). */
+        int r;
+        r = detect_vm_dmi_vendor();
+
+        /* The DMI vendor tables in /sys/class/dmi/id don't help us distinguish between Amazon EC2
+         * virtual machines and bare-metal instances, so we need to look at SMBIOS. */
+        if (r == VIRTUALIZATION_AMAZON) {
+                switch (detect_vm_smbios()) {
+                case SMBIOS_VM_BIT_SET:
+                        return VIRTUALIZATION_AMAZON;
+                case SMBIOS_VM_BIT_UNSET:
+                        return VIRTUALIZATION_NONE;
+                case SMBIOS_VM_BIT_UNKNOWN: {
+                        /* The DMI information we are after is only accessible to the root user,
+                         * so we fallback to using the product name which is less restricted
+                         * to distinguish metal systems from virtualized instances */
+                        _cleanup_free_ char *s = NULL;
+                        const char *e;
+
+                        r = read_full_virtual_file("/sys/class/dmi/id/product_name", &s, NULL);
+                        /* In EC2, virtualized is much more common than metal, so if for some reason
+                         * we fail to read the DMI data, assume we are virtualized. */
+                        if (r < 0) {
+                                log_debug_errno(r, "Can't read /sys/class/dmi/id/product_name,"
+                                                " assuming virtualized: %m");
+                                return VIRTUALIZATION_AMAZON;
+                        }
+                        e = strstrafter(truncate_nl(s), ".metal");
+                        if (e && IN_SET(*e, 0, '-')) { /* ".metal" must end the name or be followed by '-' */
+                                log_debug("DMI product name has '.metal', assuming no virtualization");
+                                return VIRTUALIZATION_NONE;
+                        } else
+                                return VIRTUALIZATION_AMAZON;
+                }
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        /* If we haven't identified a VM, but the firmware indicates that there is one, indicate as much. We
+         * have no further information about what it is. */
+        if (r == VIRTUALIZATION_NONE && detect_vm_smbios() == SMBIOS_VM_BIT_SET)
+                return VIRTUALIZATION_VM_OTHER;
+        return r;
+#else
+        return VIRTUALIZATION_NONE;
+#endif
+}
+
+#define XENFEAT_dom0 11 /* xen/include/public/features.h */
+#define PATH_FEATURES "/sys/hypervisor/properties/features"
+/* Returns -errno, or 0 for domU, or 1 for dom0. PATH_FEATURES is tried first, /proc/xen/capabilities second. */
+static int detect_vm_xen_dom0(void) {
+        _cleanup_free_ char *domcap = NULL;
+        int r;
+
+        r = read_one_line_file(PATH_FEATURES, &domcap);
+        if (r < 0 && r != -ENOENT)
+                return r;
+        if (r >= 0) {
+                unsigned long features;
+
+                /* Here, we need to use sscanf() instead of safe_atoul() as the string lacks the leading "0x". */
+                r = sscanf(domcap, "%lx", &features);
+                if (r == 1) {
+                        r = !!(features & (1U << XENFEAT_dom0));
+                        log_debug("Virtualization XEN, found %s with value %08lx, "
+                                  "XENFEAT_dom0 (indicating the 'hardware domain') is%s set.",
+                                  PATH_FEATURES, features, r ? "" : " not");
+                        return r;
+                }
+                log_debug("Virtualization XEN, found %s, unhandled content '%s'", PATH_FEATURES, domcap);
+                /* The fallback below reuses domcap: free the unparsable line first, or it would be leaked. */
+                domcap = mfree(domcap);
+        }
+
+        r = read_one_line_file("/proc/xen/capabilities", &domcap);
+        if (r == -ENOENT) {
+                log_debug("Virtualization XEN because /proc/xen/capabilities does not exist");
+                return 0;
+        }
+        if (r < 0)
+                return r;
+
+        for (const char *i = domcap;;) { /* walk the comma-separated capability list */
+                _cleanup_free_ char *cap = NULL;
+
+                r = extract_first_word(&i, &cap, ",", 0);
+                if (r < 0)
+                        return r;
+                if (r == 0) { /* list exhausted without seeing "control_d" */
+                        log_debug("Virtualization XEN DomU found (/proc/xen/capabilities)");
+                        return 0;
+                }
+
+                if (streq(cap, "control_d")) {
+                        log_debug("Virtualization XEN Dom0 ignored (/proc/xen/capabilities)");
+                        return 1;
+                }
+        }
+}
+
+static Virtualization detect_vm_xen(void) {
+        /* The presence of /proc/xen indicates some form of a Xen domain; whether it is Dom0 is checked
+         * outside of this function. Any access() failure, not just ENOENT, is taken as "no Xen". */
+        if (access("/proc/xen", F_OK) >= 0) {
+                log_debug("Virtualization XEN found (/proc/xen exists)");
+                return VIRTUALIZATION_XEN;
+        }
+        log_debug("Virtualization XEN not found, /proc/xen does not exist");
+        return VIRTUALIZATION_NONE;
+}
+
+static Virtualization detect_vm_hypervisor(void) {
+        _cleanup_free_ char *type = NULL;
+        int k;
+
+        /* High-level check based on the hypervisor name in /sys/hypervisor/type. Only "xen" is mapped
+         * to a specific ID here, every other name is reported as "vm-other". */
+
+        k = read_one_line_file("/sys/hypervisor/type", &type);
+        if (k < 0) {
+                /* -ENOENT is not a failure, it just means nothing was detected this way. */
+                return k == -ENOENT ? VIRTUALIZATION_NONE : k;
+        }
+
+        log_debug("Virtualization %s found in /sys/hypervisor/type", type);
+
+        return streq(type, "xen") ? VIRTUALIZATION_XEN : VIRTUALIZATION_VM_OTHER;
+}
+
+static Virtualization detect_vm_uml(void) {
+        _cleanup_fclose_ FILE *cpuinfo = NULL;
+
+        /* Detect User-Mode Linux by looking at the "vendor_id" field of /proc/cpuinfo */
+        cpuinfo = fopen("/proc/cpuinfo", "re");
+        if (!cpuinfo) {
+                if (errno != ENOENT)
+                        return -errno;
+                log_debug("/proc/cpuinfo not found, assuming no UML virtualization.");
+                return VIRTUALIZATION_NONE;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *vendor;
+                int k;
+
+                k = read_line(cpuinfo, LONG_LINE_MAX, &line);
+                if (k < 0)
+                        return k;
+                if (k == 0)
+                        break;
+
+                vendor = startswith(line, "vendor_id\t: ");
+                if (!vendor)
+                        continue;
+
+                /* Only the first "vendor_id" line is evaluated, whatever it contains. */
+                if (!startswith(vendor, "User Mode Linux"))
+                        break;
+
+                log_debug("UML virtualization found in /proc/cpuinfo");
+                return VIRTUALIZATION_UML;
+        }
+
+        log_debug("UML virtualization not found in /proc/cpuinfo.");
+        return VIRTUALIZATION_NONE;
+}
+
+static Virtualization detect_vm_zvm(void) {
+        /* Only done on s390: the "VM00 Control Program" field of /proc/sysinfo is read, and "z/VM"
+         * is mapped to zvm while any other value found there is reported as kvm. On all other
+         * architectures nothing is ever detected here. */
+#if defined(__s390__)
+        _cleanup_free_ char *program = NULL;
+        int k;
+
+        k = get_proc_field("/proc/sysinfo", "VM00 Control Program", WHITESPACE, &program);
+        if (k < 0) {
+                /* -ENOENT is not a failure, it just means nothing was detected this way. */
+                return k == -ENOENT ? VIRTUALIZATION_NONE : k;
+        }
+
+        log_debug("Virtualization %s found in /proc/sysinfo", program);
+
+        return streq(program, "z/VM") ? VIRTUALIZATION_ZVM : VIRTUALIZATION_KVM;
+#else
+        log_debug("This platform does not support /proc/sysinfo");
+        return VIRTUALIZATION_NONE;
+#endif
+}
+
+/* Returns a short identifier for the various VM implementations, or a negative errno-style value on failure. */
+Virtualization detect_vm(void) {
+        static thread_local Virtualization cached_found = _VIRTUALIZATION_INVALID; /* per-thread cache, only written at "finish" */
+        bool other = false; /* set when some check saw a VM but could not tell which one */
+        int xen_dom0 = 0; /* > 0 once /proc/xen exists and we turned out to be Dom0 */
+        Virtualization v, dmi;
+
+        if (cached_found >= 0)
+                return cached_found;
+
+        /* We have to use the correct order here:
+         *
+         * → First, try to detect Oracle Virtualbox, Amazon EC2 Nitro, Parallels, and Google Compute Engine, even if they use KVM,
+         *   as well as Xen even if it cloaks as Microsoft Hyper-V. Attempt to detect uml at this stage also
+         *   since it runs as a user-process nested inside other VMs. Also check for Xen now, because Xen PV
+         *   mode does not override CPUID when nested inside another hypervisor.
+         *
+         * → Second, try to detect from CPUID, this will report KVM for whatever software is used even if
+         *   info in DMI is overwritten.
+         *
+         * → Third, try to detect from DMI. */
+
+        dmi = detect_vm_dmi(); /* kept around: errors and other results are only evaluated after the CPUID check */
+        if (IN_SET(dmi,
+                   VIRTUALIZATION_ORACLE,
+                   VIRTUALIZATION_XEN,
+                   VIRTUALIZATION_AMAZON,
+                   VIRTUALIZATION_PARALLELS,
+                   VIRTUALIZATION_GOOGLE)) {
+                v = dmi;
+                goto finish;
+        }
+
+        /* Detect UML */
+        v = detect_vm_uml();
+        if (v < 0) /* errors are returned right away, without being cached */
+                return v;
+        if (v != VIRTUALIZATION_NONE)
+                goto finish;
+
+        /* Detect Xen */
+        v = detect_vm_xen();
+        if (v < 0)
+                return v;
+        if (v == VIRTUALIZATION_XEN) {
+                 /* If we are Dom0, then we expect to not report as a VM. However, as we might be nested
+                  * inside another hypervisor which can be detected via the CPUID check, wait to report this
+                  * until after the CPUID check. */
+                xen_dom0 = detect_vm_xen_dom0();
+                if (xen_dom0 < 0)
+                        return xen_dom0;
+                if (xen_dom0 == 0) /* DomU: finish with v still set to VIRTUALIZATION_XEN */
+                        goto finish;
+        } else if (v != VIRTUALIZATION_NONE)
+                assert_not_reached();
+
+        /* Detect from CPUID */
+        v = detect_vm_cpuid();
+        if (v < 0)
+                return v;
+        if (v == VIRTUALIZATION_VM_OTHER)
+                other = true;
+        else if (v != VIRTUALIZATION_NONE)
+                goto finish;
+
+        /* If we are in Dom0 and have not yet finished, finish with the result of detect_vm_cpuid */
+        if (xen_dom0 > 0)
+                goto finish;
+
+        /* Now, let's get back to DMI */
+        if (dmi < 0)
+                return dmi;
+        if (dmi == VIRTUALIZATION_VM_OTHER)
+                other = true;
+        else if (dmi != VIRTUALIZATION_NONE) {
+                v = dmi;
+                goto finish;
+        }
+
+        /* Check high-level hypervisor sysfs file */
+        v = detect_vm_hypervisor();
+        if (v < 0)
+                return v;
+        if (v == VIRTUALIZATION_VM_OTHER)
+                other = true;
+        else if (v != VIRTUALIZATION_NONE)
+                goto finish;
+
+        v = detect_vm_device_tree(); /* same handling as for the checks above */
+        if (v < 0)
+                return v;
+        if (v == VIRTUALIZATION_VM_OTHER)
+                other = true;
+        else if (v != VIRTUALIZATION_NONE)
+                goto finish;
+
+        v = detect_vm_zvm(); /* last check: its result is used as-is */
+        if (v < 0)
+                return v;
+
+finish:
+        if (v == VIRTUALIZATION_NONE && other) /* nothing specific found, but some check did see a VM */
+                v = VIRTUALIZATION_VM_OTHER;
+
+        cached_found = v;
+        log_debug("Found VM virtualization %s", virtualization_to_string(v));
+        return v;
+}
+
+static const char *const container_table[_VIRTUALIZATION_MAX] = { /* container manager names, indexed by Virtualization ID */
+        [VIRTUALIZATION_LXC]            = "lxc",
+        [VIRTUALIZATION_LXC_LIBVIRT]    = "lxc-libvirt",
+        [VIRTUALIZATION_SYSTEMD_NSPAWN] = "systemd-nspawn",
+        [VIRTUALIZATION_DOCKER]         = "docker",
+        [VIRTUALIZATION_PODMAN]         = "podman",
+        [VIRTUALIZATION_RKT]            = "rkt",
+        [VIRTUALIZATION_WSL]            = "wsl",
+        [VIRTUALIZATION_PROOT]          = "proot",
+        [VIRTUALIZATION_POUCH]          = "pouch",
+}; /* note: no entries for VIRTUALIZATION_OPENVZ and VIRTUALIZATION_CONTAINER_OTHER */
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(container, int); /* presumably provides container_from_string() as called by detect_container() — confirm in string-table.h */
+
+static int running_in_cgroupns(void) {
+        int unified;
+
+        /* Returns > 0 if the mounted cgroup tree looks like the one seen from inside a cgroup namespace,
+         * 0 if it does not, and a negative errno-style value if the checks themselves fail. */
+
+        if (!cg_ns_supported())
+                return false;
+
+        unified = cg_all_unified();
+        if (unified < 0)
+                return unified;
+
+        if (unified == 0) {
+                /* cgroup v1 */
+
+                /* If systemd controller is not mounted, do not even bother. */
+                if (access("/sys/fs/cgroup/systemd", F_OK) < 0) {
+                        if (errno == ENOENT)
+                                return false;
+                        return -errno;
+                }
+
+                /* release_agent only exists in the root cgroup, hence a tree lacking it is a
+                 * namespaced one. */
+                if (access("/sys/fs/cgroup/systemd/release_agent", F_OK) < 0) {
+                        if (errno == ENOENT)
+                                return true;
+                        return -errno;
+                }
+
+                return false;
+        }
+
+        /* cgroup v2 */
+
+        /* All kernel versions have cgroup.events in nested cgroups, so if it is missing we are
+         * looking at the root cgroup. */
+        if (access("/sys/fs/cgroup/cgroup.events", F_OK) < 0) {
+                if (errno == ENOENT)
+                        return false;
+                return -errno;
+        }
+
+        /* There's no cgroup.type in the root cgroup, and future kernel versions are unlikely to add it
+         * since cgroup.type is something that makes no sense whatsoever in the root cgroup. */
+        if (access("/sys/fs/cgroup/cgroup.type", F_OK) >= 0)
+                return true;
+        if (errno != ENOENT)
+                return -errno;
+
+        /* On older kernel versions, there's no cgroup.type */
+        if (access("/sys/kernel/cgroup/features", F_OK) < 0) {
+                if (errno == ENOENT) {
+                        /* This is an old kernel that we know for sure has cgroup.events only in
+                         * nested cgroups. */
+                        return true;
+                }
+                return -errno;
+        }
+
+        /* This is a recent kernel, and cgroup.type doesn't exist, so we must be in the root
+         * cgroup. */
+        return false;
+}
+
+static Virtualization detect_container_files(void) {
+        static const struct {
+                const char *path;
+                Virtualization manager;
+        } markers[] = {
+                /* https://github.com/containers/podman/issues/6192 */
+                /* https://github.com/containers/podman/issues/3586#issuecomment-661918679 */
+                { "/run/.containerenv", VIRTUALIZATION_PODMAN },
+                /* https://github.com/moby/moby/issues/18355 */
+                /* Docker must stay the last entry: /.dockerenv may inadvertently end up in any image. */
+                { "/.dockerenv",        VIRTUALIZATION_DOCKER },
+        };
+
+        for (size_t k = 0; k < ELEMENTSOF(markers); k++) { /* the first marker file that exists wins */
+                if (access(markers[k].path, F_OK) < 0) {
+                        if (errno != ENOENT)
+                                log_debug_errno(errno, "Checking if %s exists failed, ignoring: %m",
+                                                markers[k].path);
+                        continue;
+                }
+                return markers[k].manager;
+        }
+
+        return VIRTUALIZATION_NONE;
+}
+
+Virtualization detect_container(void) { /* returns the container manager ID, VIRTUALIZATION_NONE, or a negative errno-style value */
+        static thread_local Virtualization cached_found = _VIRTUALIZATION_INVALID; /* per-thread cache, only written at "finish" */
+        _cleanup_free_ char *m = NULL, *o = NULL, *p = NULL;
+        const char *e = NULL; /* manager name handed to "translate_name"; points into m or the environment */
+        Virtualization v;
+        int r;
+
+        if (cached_found >= 0)
+                return cached_found;
+
+        /* /proc/vz exists in container and outside of the container, /proc/bc only outside of the container. */
+        if (access("/proc/vz", F_OK) < 0) {
+                if (errno != ENOENT)
+                        log_debug_errno(errno, "Failed to check if /proc/vz exists, ignoring: %m");
+        } else if (access("/proc/bc", F_OK) < 0) {
+                if (errno == ENOENT) {
+                        v = VIRTUALIZATION_OPENVZ;
+                        goto finish;
+                }
+
+                log_debug_errno(errno, "Failed to check if /proc/bc exists, ignoring: %m");
+        }
+
+        /* "Official" way of detecting WSL https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 */
+        r = read_one_line_file("/proc/sys/kernel/osrelease", &o);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read /proc/sys/kernel/osrelease, ignoring: %m");
+        else if (strstr(o, "Microsoft") || strstr(o, "WSL")) {
+                v = VIRTUALIZATION_WSL;
+                goto finish;
+        }
+
+        /* proot doesn't use PID namespacing, so we can just check if we have a matching tracer for this
+         * invocation without worrying about it being elsewhere.
+         */
+        r = get_proc_field("/proc/self/status", "TracerPid", WHITESPACE, &p);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read our own trace PID, ignoring: %m");
+        else if (!streq(p, "0")) { /* a non-zero TracerPid: look at the name of the tracing process */
+                pid_t ptrace_pid;
+
+                r = parse_pid(p, &ptrace_pid);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse our own tracer PID, ignoring: %m");
+                else {
+                        _cleanup_free_ char *ptrace_comm = NULL;
+                        const char *pf;
+
+                        pf = procfs_file_alloca(ptrace_pid, "comm");
+                        r = read_one_line_file(pf, &ptrace_comm);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to read %s, ignoring: %m", pf);
+                        else if (startswith(ptrace_comm, "proot")) {
+                                v = VIRTUALIZATION_PROOT;
+                                goto finish;
+                        }
+                }
+        }
+
+        /* The container manager might have placed this in the /run/host/ hierarchy for us, which is best
+         * because we can be consumed just like that, without special privileges. */
+        r = read_one_line_file("/run/host/container-manager", &m);
+        if (r > 0) {
+                e = m;
+                goto translate_name;
+        }
+        if (!IN_SET(r, -ENOENT, 0)) /* unlike the checks above, a read failure here is fatal (and not cached) */
+                return log_debug_errno(r, "Failed to read /run/host/container-manager: %m");
+
+        if (getpid_cached() == 1) {
+                /* If we are PID 1 we can just check our own environment variable, and that's authoritative.
+                 * We distinguish three cases:
+                 * - the variable is not defined → we jump to other checks
+                 * - the variable is defined to an empty value → we are not in a container
+                 * - anything else → some container, either one of the known ones or "container-other"
+                 */
+                e = getenv("container");
+                if (!e)
+                        goto check_files;
+                if (isempty(e)) {
+                        v = VIRTUALIZATION_NONE;
+                        goto finish;
+                }
+
+                goto translate_name;
+        }
+
+        /* Otherwise, PID 1 might have dropped this information into a file in /run. This is better than accessing
+         * /proc/1/environ, since we don't need CAP_SYS_PTRACE for that. NOTE(review): m is reused, confirm that a 0 result above left it unset. */
+        r = read_one_line_file("/run/systemd/container", &m);
+        if (r > 0) {
+                e = m;
+                goto translate_name;
+        }
+        if (!IN_SET(r, -ENOENT, 0))
+                return log_debug_errno(r, "Failed to read /run/systemd/container: %m");
+
+        /* Fallback for cases where PID 1 was not systemd (for example, cases where init=/bin/sh is used). */
+        r = getenv_for_pid(1, "container", &m);
+        if (r > 0) {
+                e = m;
+                goto translate_name;
+        }
+        if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */
+                log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m");
+
+check_files:
+        /* Check for existence of some well-known files. We only do this after checking
+         * for other specific container managers, otherwise we risk mistaking another
+         * container manager for Docker: the /.dockerenv file could inadvertently end up
+         * in a file system image. */
+        v = detect_container_files();
+        if (v < 0)
+                return v;
+        if (v != VIRTUALIZATION_NONE)
+                goto finish;
+
+        r = running_in_cgroupns();
+        if (r > 0) {
+                v = VIRTUALIZATION_CONTAINER_OTHER;
+                goto finish;
+        }
+        if (r < 0) /* logged only, then treated like "not in a cgroup namespace" */
+                log_debug_errno(r, "Failed to detect cgroup namespace: %m");
+
+        /* If none of that worked, give up, assume no container manager. */
+        v = VIRTUALIZATION_NONE;
+        goto finish;
+
+translate_name:
+        if (streq(e, "oci")) {
+                /* Some images hardcode container=oci, but OCI is not a specific container manager.
+                 * Try to detect one based on well-known files. */
+                v = detect_container_files();
+                if (v == VIRTUALIZATION_NONE)
+                        v = VIRTUALIZATION_CONTAINER_OTHER;
+                goto finish;
+        }
+        v = container_from_string(e);
+        if (v < 0) /* a name we have no ID for still means: some container */
+                v = VIRTUALIZATION_CONTAINER_OTHER;
+
+finish:
+        log_debug("Found container virtualization %s.", virtualization_to_string(v));
+        cached_found = v;
+        return v;
+}
+
+Virtualization detect_virtualization(void) {
+        Virtualization found;
+
+        /* Containers take precedence: a VM is only looked for if detect_container() returns "none". */
+        found = detect_container();
+        if (found == VIRTUALIZATION_NONE)
+                return detect_vm();
+        return found;
+}
+
+static int userns_has_mapping(const char *name) {
+        _cleanup_fclose_ FILE *map = NULL;
+        uid_t inside, outside, length;
+        int n_fields;
+
+        map = fopen(name, "re");
+        if (!map) {
+                log_debug_errno(errno, "Failed to open %s: %m", name);
+                return errno == ENOENT ? false : -errno;
+        }
+
+        errno = 0;
+        n_fields = fscanf(map, UID_FMT " " UID_FMT " " UID_FMT "\n", &inside, &outside, &length);
+        if (n_fields == EOF && ferror(map))
+                return log_debug_errno(errno_or_else(EIO), "Failed to read %s: %m", name);
+        if (n_fields == EOF) {
+                log_debug("%s is empty, we're in an uninitialized user namespace", name);
+                return true;
+        }
+        if (n_fields != 3)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to parse %s: %m", name);
+
+        /* Only this first line is looked at: the kernel calls mappings_overlap() and does not allow
+         * overlaps. */
+        if (inside != 0 || outside != 0 || length != UINT32_MAX) {
+                /* Anything but the full 1:1 mapping implies that we are in a user namespace */
+                log_debug("Mapping found in %s, we're in a user namespace", name);
+                return true;
+        }
+
+        log_debug("%s has a full 1:1 mapping", name);
+        return false;
+}
+
+int running_in_userns(void) {
+        _cleanup_free_ char *setgroups = NULL;
+        int r;
+
+        /* A mapping found in (or a failure to check) either ID map settles the question right away. */
+        r = userns_has_mapping("/proc/self/uid_map");
+        if (r == 0)
+                r = userns_has_mapping("/proc/self/gid_map");
+        if (r != 0)
+                return r;
+
+        /* "setgroups" file was added in kernel v3.18-rc6-15-g9cc46516dd. It is also possible to compile a
+         * kernel without CONFIG_USER_NS, in which case "setgroups" also does not exist. The two cases
+         * cannot be told apart, so a stripped-down recent kernel is assumed rather than an old one,
+         * and false is returned if the file is not found. */
+        r = read_virtual_file("/proc/self/setgroups", SIZE_MAX, &setgroups, NULL);
+        if (r < 0) {
+                log_debug_errno(r, "/proc/self/setgroups: %m");
+                if (r == -ENOENT)
+                        return false;
+                return r;
+        }
+
+        strstrip(setgroups); /* remove trailing newline */
+        /* See user_namespaces(7) for a description of this "setgroups" contents. */
+        r = streq(setgroups, "deny");
+        log_debug("/proc/self/setgroups contains \"%s\", %s user namespace", setgroups, r ? "in" : "not in");
+        return r;
+}
+
+int running_in_chroot(void) {
+        int same, mounted;
+
+        /* If we're PID1, /proc may not be mounted (and most likely we're not in a chroot). But PID1 will
+         * mount /proc, so all other programs can assume that if /proc is *not* available, we're in some
+         * chroot. */
+
+        if (getenv_bool("SYSTEMD_IGNORE_CHROOT") > 0)
+                return 0;
+
+        same = inode_same("/proc/1/root", "/", 0);
+        if (same >= 0)
+                return same == 0; /* a 0 result of inode_same() is what gets reported as "in a chroot" */
+        if (same != -ENOENT)
+                return same;
+
+        mounted = proc_mounted();
+        if (mounted < 0)
+                return mounted;
+        if (mounted > 0) /* If we have fake /proc/, we can't do the check properly. */
+                return -ENOSYS;
+
+        if (getpid_cached() == 1)
+                return false; /* We will mount /proc, assuming we're not in a chroot. */
+
+        log_debug("/proc is not mounted, assuming we're in a chroot.");
+        return true;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+struct cpuid_table_entry { /* maps one bit of a CPUID output register to a flag name */
+        uint32_t flag_bit; /* bit index within the 32-bit register value */
+        const char *name; /* flag name that callers ask for */
+};
+
+static const struct cpuid_table_entry leaf1_edx[] = { /* CPUID leaf 1, EDX */
+        {  0, "fpu"     },
+        {  1, "vme"     },
+        {  2, "de"      },
+        {  3, "pse"     },
+        {  4, "tsc"     },
+        {  5, "msr"     },
+        {  6, "pae"     },
+        {  7, "mce"     },
+        {  8, "cx8"     },
+        {  9, "apic"    },
+        { 11, "sep"     },
+        { 12, "mtrr"    },
+        { 13, "pge"     },
+        { 14, "mca"     },
+        { 15, "cmov"    },
+        { 16, "pat"     },
+        { 17, "pse36"   },
+        { 19, "clflush" },
+        { 23, "mmx"     },
+        { 24, "fxsr"    },
+        { 25, "sse"     },
+        { 26, "sse2"    },
+        { 28, "ht"      },
+};
+
+static const struct cpuid_table_entry leaf1_ecx[] = { /* CPUID leaf 1, ECX */
+        {  0, "pni"     },
+        {  1, "pclmul"  },
+        {  3, "monitor" },
+        {  9, "ssse3"   },
+        { 12, "fma3"    },
+        { 13, "cx16"    },
+        { 19, "sse4_1"  },
+        { 20, "sse4_2"  },
+        { 22, "movbe"   },
+        { 23, "popcnt"  },
+        { 25, "aes"     },
+        { 26, "xsave"   },
+        { 27, "osxsave" },
+        { 28, "avx"     },
+        { 29, "f16c"    },
+        { 30, "rdrand"  },
+};
+
+static const struct cpuid_table_entry leaf7_ebx[] = { /* CPUID leaf 7 (sub-leaf 0), EBX */
+        {  3, "bmi1"   },
+        {  5, "avx2"   },
+        {  8, "bmi2"   },
+        { 18, "rdseed" },
+        { 19, "adx"    },
+        { 29, "sha_ni" },
+};
+
+static const struct cpuid_table_entry leaf81_edx[] = { /* CPUID leaf 0x80000001, EDX */
+        { 11, "syscall" },
+        { 27, "rdtscp"  },
+        { 29, "lm"      },
+};
+
+static const struct cpuid_table_entry leaf81_ecx[] = { /* CPUID leaf 0x80000001, ECX */
+        {  0, "lahf_lm" },
+        {  5, "abm"     },
+};
+
+static const struct cpuid_table_entry leaf87_edx[] = { /* CPUID leaf 0x80000007, EDX */
+        {  8, "constant_tsc" },
+};
+
+static bool given_flag_in_set(const char *flag, const struct cpuid_table_entry *set, size_t set_size, uint32_t val) {
+        for (size_t k = 0; k < set_size; k++) { /* a match needs both: the bit is set in "val" and the name equals "flag" */
+                const struct cpuid_table_entry *entry = set + k;
+                if (((val >> entry->flag_bit) & UINT32_C(1)) && streq(flag, entry->name))
+                        return true;
+        }
+        return false;
+}
+
+static bool real_has_cpu_with_flag(const char *flag) {
+        uint32_t eax, ebx, ecx, edx;
+
+        /* Goes through all CPUID leaves we have name tables for and stops at the first match. A leaf
+         * for which the __get_cpuid*() call returns zero is skipped. */
+
+        /* Leaf 1: ECX first, then EDX */
+        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) &&
+            (given_flag_in_set(flag, leaf1_ecx, ELEMENTSOF(leaf1_ecx), ecx) ||
+             given_flag_in_set(flag, leaf1_edx, ELEMENTSOF(leaf1_edx), edx)))
+                return true;
+
+        /* Leaf 7, sub-leaf 0: EBX */
+        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
+            given_flag_in_set(flag, leaf7_ebx, ELEMENTSOF(leaf7_ebx), ebx))
+                return true;
+
+        /* Leaf 0x80000001: ECX first, then EDX */
+        if (__get_cpuid(0x80000001U, &eax, &ebx, &ecx, &edx) &&
+            (given_flag_in_set(flag, leaf81_ecx, ELEMENTSOF(leaf81_ecx), ecx) ||
+             given_flag_in_set(flag, leaf81_edx, ELEMENTSOF(leaf81_edx), edx)))
+                return true;
+
+        /* Leaf 0x80000007: EDX */
+        if (__get_cpuid(0x80000007U, &eax, &ebx, &ecx, &edx) &&
+            given_flag_in_set(flag, leaf87_edx, ELEMENTSOF(leaf87_edx), edx))
+                return true;
+
+        return false;
+}
+#endif
+
+bool has_cpu_with_flag(const char *flag) {
+#if defined(__x86_64__) || defined(__i386__)
+        /* The flags are looked up via CPUID, which is an x86 specific interface. */
+        return real_has_cpu_with_flag(flag);
+#else
+        return false; /* No CPUID here, hence assume that no CPUs have those flags. */
+#endif
+}
+
+static const char *const virtualization_table[_VIRTUALIZATION_MAX] = { /* display name of every Virtualization ID */
+        [VIRTUALIZATION_NONE]            = "none",
+        [VIRTUALIZATION_KVM]             = "kvm",
+        [VIRTUALIZATION_AMAZON]          = "amazon",
+        [VIRTUALIZATION_QEMU]            = "qemu",
+        [VIRTUALIZATION_BOCHS]           = "bochs",
+        [VIRTUALIZATION_XEN]             = "xen",
+        [VIRTUALIZATION_UML]             = "uml",
+        [VIRTUALIZATION_VMWARE]          = "vmware",
+        [VIRTUALIZATION_ORACLE]          = "oracle",
+        [VIRTUALIZATION_MICROSOFT]       = "microsoft",
+        [VIRTUALIZATION_ZVM]             = "zvm",
+        [VIRTUALIZATION_PARALLELS]       = "parallels",
+        [VIRTUALIZATION_BHYVE]           = "bhyve",
+        [VIRTUALIZATION_QNX]             = "qnx",
+        [VIRTUALIZATION_ACRN]            = "acrn",
+        [VIRTUALIZATION_POWERVM]         = "powervm",
+        [VIRTUALIZATION_APPLE]           = "apple",
+        [VIRTUALIZATION_SRE]             = "sre",
+        [VIRTUALIZATION_GOOGLE]          = "google",
+        [VIRTUALIZATION_VM_OTHER]        = "vm-other",
+        /* Container managers follow; the names used by container_table[] must match these. */
+        [VIRTUALIZATION_SYSTEMD_NSPAWN]  = "systemd-nspawn",
+        [VIRTUALIZATION_LXC_LIBVIRT]     = "lxc-libvirt",
+        [VIRTUALIZATION_LXC]             = "lxc",
+        [VIRTUALIZATION_OPENVZ]          = "openvz",
+        [VIRTUALIZATION_DOCKER]          = "docker",
+        [VIRTUALIZATION_PODMAN]          = "podman",
+        [VIRTUALIZATION_RKT]             = "rkt",
+        [VIRTUALIZATION_WSL]             = "wsl",
+        [VIRTUALIZATION_PROOT]           = "proot",
+        [VIRTUALIZATION_POUCH]           = "pouch",
+        [VIRTUALIZATION_CONTAINER_OTHER] = "container-other",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(virtualization, Virtualization); /* presumably defines virtualization_to_string()/virtualization_from_string() declared in virt.h — confirm in string-table.h */
diff --git a/src/basic/virt.h b/src/basic/virt.h
new file mode 100644
index 0000000..dea39e4
--- /dev/null
+++ b/src/basic/virt.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "errno-list.h"
+#include "macro.h"
+
+typedef enum Virtualization { /* IDs >= 0 name a virtualization technology, negative values carry errors */
+        VIRTUALIZATION_NONE = 0,
+
+        VIRTUALIZATION_VM_FIRST, /* VM range: VIRTUALIZATION_VM_FIRST…VIRTUALIZATION_VM_LAST, see VIRTUALIZATION_IS_VM() */
+        VIRTUALIZATION_KVM = VIRTUALIZATION_VM_FIRST,
+        VIRTUALIZATION_AMAZON,
+        VIRTUALIZATION_QEMU,
+        VIRTUALIZATION_BOCHS,
+        VIRTUALIZATION_XEN,
+        VIRTUALIZATION_UML,
+        VIRTUALIZATION_VMWARE,
+        VIRTUALIZATION_ORACLE,
+        VIRTUALIZATION_MICROSOFT,
+        VIRTUALIZATION_ZVM,
+        VIRTUALIZATION_PARALLELS,
+        VIRTUALIZATION_BHYVE,
+        VIRTUALIZATION_QNX,
+        VIRTUALIZATION_ACRN,
+        VIRTUALIZATION_POWERVM,
+        VIRTUALIZATION_APPLE,
+        VIRTUALIZATION_SRE,
+        VIRTUALIZATION_GOOGLE,
+        VIRTUALIZATION_VM_OTHER, /* some VM, but none of the specific ones above */
+        VIRTUALIZATION_VM_LAST = VIRTUALIZATION_VM_OTHER,
+
+        VIRTUALIZATION_CONTAINER_FIRST, /* container range: …_CONTAINER_FIRST…_CONTAINER_LAST, see VIRTUALIZATION_IS_CONTAINER() */
+        VIRTUALIZATION_SYSTEMD_NSPAWN = VIRTUALIZATION_CONTAINER_FIRST,
+        VIRTUALIZATION_LXC_LIBVIRT,
+        VIRTUALIZATION_LXC,
+        VIRTUALIZATION_OPENVZ,
+        VIRTUALIZATION_DOCKER,
+        VIRTUALIZATION_PODMAN,
+        VIRTUALIZATION_RKT,
+        VIRTUALIZATION_WSL,
+        VIRTUALIZATION_PROOT,
+        VIRTUALIZATION_POUCH,
+        VIRTUALIZATION_CONTAINER_OTHER, /* some container manager, but none of the specific ones above */
+        VIRTUALIZATION_CONTAINER_LAST = VIRTUALIZATION_CONTAINER_OTHER,
+
+        _VIRTUALIZATION_MAX, /* one past the last valid ID */
+        _VIRTUALIZATION_INVALID = -EINVAL,
+        _VIRTUALIZATION_ERRNO_MAX = -ERRNO_MAX, /* ensure full range of errno fits into this enum */
+} Virtualization;
+
+static inline bool VIRTUALIZATION_IS_VM(Virtualization x) { /* true for IDs within the VM range of the enum */
+        return !(x < VIRTUALIZATION_VM_FIRST || x > VIRTUALIZATION_VM_LAST);
+}
+
+static inline bool VIRTUALIZATION_IS_CONTAINER(Virtualization x) { /* true for IDs within the container range of the enum */
+        return !(x < VIRTUALIZATION_CONTAINER_FIRST || x > VIRTUALIZATION_CONTAINER_LAST);
+}
+
+Virtualization detect_vm(void);
+Virtualization detect_container(void);
+Virtualization detect_virtualization(void);
+
+int running_in_userns(void);
+int running_in_chroot(void);
+
+const char *virtualization_to_string(Virtualization v) _const_;
+Virtualization virtualization_from_string(const char *s) _pure_;
+bool has_cpu_with_flag(const char *flag);
diff --git a/src/basic/xattr-util.c b/src/basic/xattr-util.c
new file mode 100644
index 0000000..d2daf87
--- /dev/null
+++ b/src/basic/xattr-util.c
@@ -0,0 +1,379 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "macro.h"
+#include "missing_syscall.h"
+#include "parse-util.h"
+#include "sparse-endian.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "time-util.h"
+#include "xattr-util.h"
+
+int getxattr_at_malloc(
+                int fd,
+                const char *path,
+                const char *name,
+                int flags,
+                char **ret) {
+
+        _cleanup_close_ int opened_fd = -EBADF;
+        unsigned n_attempts = 7; /* upper bound on iterations of the sizing loop below */
+        bool by_procfs = false; /* if true, access the fd through its /proc/ link instead of fgetxattr() */
+        size_t l = 100; /* first guess for the value size; replaced by the reported size after ERANGE */
+
+        assert(fd >= 0 || fd == AT_FDCWD);
+        assert(name);
+        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
+        assert(ret);
+
+        /* So, this is a single function that does what getxattr()/lgetxattr()/fgetxattr() do, but in one go,
+         * and with additional bells and whistles. On success the value is stored in *ret and its size (not
+         * counting the extra NUL) is returned; on failure a negative errno-style value. Specifically:
+         * 1. This works on O_PATH fds (which fgetxattr() does not)
+         * 2. Provides full openat()-style semantics, i.e. by-fd, by-path and combination thereof
+         * 3. As extension to openat()-style semantics implies AT_EMPTY_PATH if path is NULL.
+         * 4. Does a malloc() loop, automatically sizing the allocation
+         * 5. NUL-terminates the returned buffer (for safety)
+         */
+
+        if (!path) /* If path is NULL, imply AT_EMPTY_PATH. – But if it's "", don't — for safety reasons. */
+                flags |= AT_EMPTY_PATH;
+
+        if (isempty(path)) {
+                if (!FLAGS_SET(flags, AT_EMPTY_PATH))
+                        return -EINVAL;
+
+                if (fd == AT_FDCWD) /* Both unspecified? Then operate on current working directory */
+                        path = ".";
+                else
+                        path = NULL;
+
+        } else if (fd != AT_FDCWD) {
+
+                /* If both have been specified, then we go via O_PATH */
+                opened_fd = openat(fd, path, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : O_NOFOLLOW));
+                if (opened_fd < 0)
+                        return -errno;
+
+                fd = opened_fd;
+                path = NULL;
+                by_procfs = true; /* fgetxattr() is not going to work, go via /proc/ link right-away */
+        }
+
+        for (;;) {
+                _cleanup_free_ char *v = NULL;
+                ssize_t n;
+
+                if (n_attempts == 0) /* If someone is racing against us, give up eventually */
+                        return -EBUSY;
+                n_attempts--;
+
+                v = new0(char, l+1);
+                if (!v)
+                        return -ENOMEM;
+
+                l = MALLOC_ELEMENTSOF(v) - 1; /* presumably the usable size of the allocation, minus the byte kept for the NUL */
+
+                if (path)
+                        n = FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? getxattr(path, name, v, l) : lgetxattr(path, name, v, l);
+                else
+                        n = by_procfs ? getxattr(FORMAT_PROC_FD_PATH(fd), name, v, l) : fgetxattr(fd, name, v, l);
+                if (n < 0) {
+                        if (errno == EBADF) {
+                                if (by_procfs || path)
+                                        return -EBADF;
+
+                                by_procfs = true; /* Might be an O_PATH fd, try again via /proc/ link */
+                                continue;
+                        }
+
+                        if (errno != ERANGE) /* anything but "buffer too small" is returned to the caller */
+                                return -errno;
+                } else {
+                        v[n] = 0; /* NUL terminate */
+                        *ret = TAKE_PTR(v);
+                        return (int) n;
+                }
+
+                if (path) /* ERANGE: ask for the size currently needed (NULL buffer, size 0), then loop again */
+                        n = FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? getxattr(path, name, NULL, 0) : lgetxattr(path, name, NULL, 0);
+                else
+                        n = by_procfs ? getxattr(FORMAT_PROC_FD_PATH(fd), name, NULL, 0) : fgetxattr(fd, name, NULL, 0);
+                if (n < 0)
+                        return -errno;
+                if (n > INT_MAX) /* We couldn't return this as 'int' anymore */
+                        return -E2BIG;
+
+                l = (size_t) n;
+        }
+}
+
+/* Reads extended attribute 'name' via getxattr_at_malloc() and parses it with parse_boolean(). Read errors
+ * are passed through; a value containing an embedded NUL byte is refused with -EINVAL. */
+int getxattr_at_bool(int fd, const char *path, const char *name, int flags) {
+        _cleanup_free_ char *value = NULL;
+        int len;
+
+        len = getxattr_at_malloc(fd, path, name, flags, &value);
+        if (len < 0)
+                return len;
+        if (memchr(value, 0, len)) /* Refuse embedded NUL byte */
+                return -EINVAL;
+        return parse_boolean(value);
+}
+
+/* Converts a little-endian 64-bit creation time into *usec; the values 0 and UINT64_MAX are refused with -EIO. */
+static int parse_crtime(le64_t le, usec_t *usec) {
+        uint64_t host;
+
+        assert(usec);
+
+        host = le64toh(le);
+        if (host == 0 || host == UINT64_MAX)
+                return -EIO;
+        *usec = (usec_t) host;
+        return 0;
+}
+
+int fd_getcrtime_at(
+                int fd,
+                const char *path,
+                int flags,
+                usec_t *ret) {
+
+        _cleanup_free_ le64_t *le = NULL; /* raw "user.crtime_usec" xattr value, allocated by getxattr_at_malloc() */
+        STRUCT_STATX_DEFINE(sx);
+        usec_t a, b; /* a: birth time from statx(), b: creation time from the xattr */
+        int r;
+
+        assert(fd >= 0 || fd == AT_FDCWD);
+        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
+        assert(ret);
+
+        if (!path) /* as in getxattr_at_malloc(): a NULL path implies AT_EMPTY_PATH */
+                flags |= AT_EMPTY_PATH;
+
+        /* So here's the deal: the creation/birth time (crtime/btime) of a file is a relatively newly supported concept
+         * on Linux (or more strictly speaking: a concept that only recently got supported in the API, it was
+         * implemented on various file systems on the lower level since a while, but never was accessible). However, we
+         * needed a concept like that for vacuuming algorithms and such, hence we emulated it via a user xattr for a
+         * long time. Starting with Linux 4.11 there's statx() which exposes the timestamp to userspace for the first
+         * time, where it is available. This function will read it, but it tries to keep some compatibility with older
+         * systems: we try to read both the crtime/btime and the xattr, and then use whatever is older. After all the
+         * concept is useful for determining how "old" a file really is, and hence using the older of the two makes
+         * most sense. */
+
+        if (statx(fd, strempty(path),
+                  (flags & ~AT_SYMLINK_FOLLOW)|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : AT_SYMLINK_NOFOLLOW)|AT_STATX_DONT_SYNC,
+                  STATX_BTIME,
+                  &sx) >= 0 &&
+            (sx.stx_mask & STATX_BTIME) &&
+            sx.stx_btime.tv_sec != 0)
+                a = (usec_t) sx.stx_btime.tv_sec * USEC_PER_SEC +
+                        (usec_t) sx.stx_btime.tv_nsec / NSEC_PER_USEC;
+        else
+                a = USEC_INFINITY; /* marks "no usable birth time from statx()" */
+
+        r = getxattr_at_malloc(fd, path, "user.crtime_usec", flags, (char**) &le);
+        if (r >= 0) {
+                if (r != sizeof(*le)) /* the xattr must hold exactly one 64-bit value */
+                        r = -EIO;
+                else
+                        r = parse_crtime(*le, &b);
+        }
+        if (r < 0) { /* xattr missing or unusable: fall back to the statx() value, if there is one */
+                if (a != USEC_INFINITY) {
+                        *ret = a;
+                        return 0;
+                }
+
+                return r;
+        }
+
+        if (a != USEC_INFINITY)
+                *ret = MIN(a, b); /* both sources available: the older timestamp wins */
+        else
+                *ret = b;
+
+        return 0;
+}
+
+/* Stores a creation time as little-endian 64-bit value in the "user.crtime_usec" xattr of fd. If
+ * timestamp_is_set() rejects 'usec', the current CLOCK_REALTIME time is stored instead. */
+int fd_setcrtime(int fd, usec_t usec) {
+        le64_t encoded;
+
+        assert(fd >= 0);
+
+        usec_t stamp = timestamp_is_set(usec) ? usec : now(CLOCK_REALTIME);
+        encoded = htole64((uint64_t) stamp);
+        return RET_NERRNO(fsetxattr(fd, "user.crtime_usec", &encoded, sizeof(encoded), 0));
+}
+
+int listxattr_at_malloc(
+                int fd,
+                const char *path,
+                int flags,
+                char **ret) {
+
+        _cleanup_close_ int opened_fd = -EBADF;
+        bool by_procfs = false; /* if true, access the fd through its /proc/ link instead of flistxattr() */
+        unsigned n_attempts = 7; /* upper bound on iterations of the sizing loop below */
+        size_t l = 100; /* first guess for the list size; replaced by the reported size after ERANGE */
+
+        assert(fd >= 0 || fd == AT_FDCWD);
+        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
+        assert(ret);
+
+        /* This is to listxattr()/llistxattr()/flistxattr() what getxattr_at_malloc() is to getxattr()/… */
+
+        if (!path) /* If path is NULL, imply AT_EMPTY_PATH. – But if it's "", don't. */
+                flags |= AT_EMPTY_PATH;
+
+        if (isempty(path)) {
+                if (!FLAGS_SET(flags, AT_EMPTY_PATH))
+                        return -EINVAL;
+
+                if (fd == AT_FDCWD) /* Both unspecified? Then operate on current working directory */
+                        path = ".";
+                else
+                        path = NULL;
+
+        } else if (fd != AT_FDCWD) {
+                /* If both have been specified, then we go via O_PATH */
+                opened_fd = openat(fd, path, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : O_NOFOLLOW));
+                if (opened_fd < 0)
+                        return -errno;
+
+                fd = opened_fd;
+                path = NULL;
+                by_procfs = true; /* we just opened an O_PATH fd, hence go via the /proc/ link right away */
+        }
+
+        for (;;) {
+                _cleanup_free_ char *v = NULL;
+                ssize_t n;
+
+                if (n_attempts == 0) /* If someone is racing against us, give up eventually */
+                        return -EBUSY;
+                n_attempts--;
+
+                v = new(char, l+1);
+                if (!v)
+                        return -ENOMEM;
+
+                l = MALLOC_ELEMENTSOF(v) - 1; /* presumably the usable size of the allocation, minus the byte kept for the NUL */
+
+                if (path)
+                        n = FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? listxattr(path, v, l) : llistxattr(path, v, l);
+                else
+                        n = by_procfs ? listxattr(FORMAT_PROC_FD_PATH(fd), v, l) : flistxattr(fd, v, l);
+                if (n < 0) {
+                        if (errno == EBADF) {
+                                if (by_procfs || path)
+                                        return -EBADF;
+
+                                by_procfs = true; /* Might be an O_PATH fd, try again via /proc/ link */
+                                continue;
+                        }
+
+                        if (errno != ERANGE) /* anything but "buffer too small" is returned to the caller */
+                                return -errno;
+                } else {
+                        v[n] = 0; /* NUL terminate */
+                        *ret = TAKE_PTR(v);
+                        return (int) n; /* size of the returned name list in bytes, not counting the extra NUL */
+                }
+
+                if (path) /* ERANGE: ask for the size currently needed (NULL buffer, size 0), then loop again */
+                        n = FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? listxattr(path, NULL, 0) : llistxattr(path, NULL, 0);
+                else
+                        n = by_procfs ? listxattr(FORMAT_PROC_FD_PATH(fd), NULL, 0) : flistxattr(fd, NULL, 0);
+                if (n < 0)
+                        return -errno;
+                if (n > INT_MAX) /* We couldn't return this as 'int' anymore */
+                        return -E2BIG;
+
+                l = (size_t) n;
+        }
+}
+
+int xsetxattr(int fd,
+              const char *path,
+              const char *name,
+              const char *value,
+              size_t size,
+              int flags) {
+
+        _cleanup_close_ int opened_fd = -EBADF;
+        bool by_procfs = false; /* if true, access the fd through its /proc/ link instead of fsetxattr() */
+        int r;
+
+        assert(fd >= 0 || fd == AT_FDCWD);
+        assert(name);
+        assert(value);
+        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
+
+        /* So, this is a single function that does what setxattr()/lsetxattr()/fsetxattr() do, but in one go,
+         * and with additional bells and whistles. Returns 0 on success, negative errno-style value on failure:
+         *
+         * 1. This works on O_PATH fds (which fsetxattr() does not)
+         * 2. Provides full openat()-style semantics, i.e. by-fd, by-path and combination thereof
+         * 3. As extension to openat()-style semantics implies AT_EMPTY_PATH if path is NULL.
+         */
+
+        if (!path) /* If path is NULL, imply AT_EMPTY_PATH. – But if it's "", don't — for safety reasons. */
+                flags |= AT_EMPTY_PATH;
+
+        if (size == SIZE_MAX) /* SIZE_MAX means: value is a NUL-terminated string, measure it here */
+                size = strlen(value);
+
+        if (isempty(path)) {
+                if (!FLAGS_SET(flags, AT_EMPTY_PATH))
+                        return -EINVAL;
+
+                if (fd == AT_FDCWD) /* Both unspecified? Then operate on current working directory */
+                        path = ".";
+                else {
+                        r = fd_is_opath(fd);
+                        if (r < 0)
+                                return r;
+
+                        by_procfs = r; /* non-zero result: use the /proc/ link rather than fsetxattr() */
+                        path = NULL;
+                }
+
+        } else if (fd != AT_FDCWD) {
+
+                /* If both have been specified, then we go via O_PATH */
+                opened_fd = openat(fd, path, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : O_NOFOLLOW));
+                if (opened_fd < 0)
+                        return -errno;
+
+                fd = opened_fd;
+                path = NULL;
+                by_procfs = true; /* fsetxattr() is not going to work, go via /proc/ link right-away */
+        }
+
+        if (path) /* by path: pick the symlink-following or non-following variant according to flags */
+                r = FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? setxattr(path, name, value, size, 0)
+                                                        : lsetxattr(path, name, value, size, 0);
+        else
+                r = by_procfs ? setxattr(FORMAT_PROC_FD_PATH(fd), name, value, size, 0)
+                              : fsetxattr(fd, name, value, size, 0);
+        if (r < 0)
+                return -errno;
+
+        return 0;
+}
diff --git a/src/basic/xattr-util.h b/src/basic/xattr-util.h
new file mode 100644
index 0000000..19ee3e1
--- /dev/null
+++ b/src/basic/xattr-util.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include "time-util.h"
+
+int getxattr_at_malloc(int fd, const char *path, const char *name, int flags, char **ret); /* the wrappers below are fixed-argument shortcuts for this */
+static inline int getxattr_malloc(const char *path, const char *name, char **ret) { /* by path, with AT_SYMLINK_FOLLOW */
+        return getxattr_at_malloc(AT_FDCWD, path, name, AT_SYMLINK_FOLLOW, ret);
+}
+static inline int lgetxattr_malloc(const char *path, const char *name, char **ret) { /* by path, without AT_SYMLINK_FOLLOW */
+        return getxattr_at_malloc(AT_FDCWD, path, name, 0, ret);
+}
+static inline int fgetxattr_malloc(int fd, const char *name, char **ret) { /* on the object fd refers to (NULL path + AT_EMPTY_PATH) */
+        return getxattr_at_malloc(fd, NULL, name, AT_EMPTY_PATH, ret);
+}
+
+int getxattr_at_bool(int fd, const char *path, const char *name, int flags);
+
+int fd_setcrtime(int fd, usec_t usec);
+
+int fd_getcrtime_at(int fd, const char *path, int flags, usec_t *ret); /* second parameter is a path (as in the definition), not an xattr name */
+static inline int fd_getcrtime(int fd, usec_t *ret) { /* by-fd shortcut: NULL path and no flags */
+        return fd_getcrtime_at(fd, NULL, 0, ret);
+}
+
+
+int listxattr_at_malloc(int fd, const char *path, int flags, char **ret); /* the wrappers below are fixed-argument shortcuts for this */
+static inline int listxattr_malloc(const char *path, char **ret) { /* by path, with AT_SYMLINK_FOLLOW */
+        return listxattr_at_malloc(AT_FDCWD, path, AT_SYMLINK_FOLLOW, ret);
+}
+static inline int llistxattr_malloc(const char *path, char **ret) { /* by path, without AT_SYMLINK_FOLLOW */
+        return listxattr_at_malloc(AT_FDCWD, path, 0, ret);
+}
+static inline int flistxattr_malloc(int fd, char **ret) { /* on the object fd refers to (NULL path + AT_EMPTY_PATH) */
+        return listxattr_at_malloc(fd, NULL, AT_EMPTY_PATH, ret);
+}
+
+int xsetxattr(int fd, const char *path, const char *name, const char *value, size_t size, int flags);
diff --git a/src/battery-check/battery-check.c b/src/battery-check/battery-check.c
new file mode 100644
index 0000000..03628c8
--- /dev/null
+++ b/src/battery-check/battery-check.c
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-messages.h"
+
+#include "battery-util.h"
+#include "build.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "glyph-util.h"
+#include "io-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "parse-util.h"
+#include "plymouth-util.h"
+#include "pretty-print.h"
+#include "proc-cmdline.h"
+#include "socket-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+
+#define BATTERY_LOW_MESSAGE \
+        "Battery level critically low. Please connect your charger or the system will power off in 10 seconds." /* "10 seconds" mirrors the usleep_safe() delay in run() */
+#define BATTERY_RESTORED_MESSAGE \
+        "A.C. power restored, continuing." /* logged, printed to the console and sent to plymouth by run() */
+
+static bool arg_doit = true; /* filled in from systemd.battery-check= on the kernel command line, see run() */
+
+/* Prints the usage text of systemd-battery-check to stdout. Returns 0, or the result of log_oom() if the
+ * man page link could not be generated. */
+static int help(void) {
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-battery-check", "8", &man_link) < 0)
+                return log_oom();
+
+        printf("%s\n\n"
+               "%sCheck battery level to see whether there's enough charge.%s\n\n"
+               "   -h --help            Show this help\n"
+               "      --version         Show package version\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               man_link);
+
+        return 0;
+}
+
+/* Formats a packet made of a 'C' record carrying mode and an 'M' record carrying message (each with a length
+ * byte that counts the trailing NUL) and passes it to plymouth_send_raw(). Returns 0 on success. */
+static int plymouth_send_message(const char *mode, const char *message) {
+        _cleanup_free_ char *packet = NULL;
+        int packet_size, r;
+
+        assert(mode);
+        assert(message);
+
+        packet_size = asprintf(&packet,
+                               "C\x02%c%s%c"
+                               "M\x02%c%s%c",
+                               (int) strlen(mode) + 1, mode, '\x00',
+                               (int) strlen(message) + 1, message, '\x00');
+        if (packet_size < 0)
+                return log_oom();
+
+        r = plymouth_send_raw(packet, packet_size, SOCK_NONBLOCK); /* SOCK_NONBLOCK: rather drop the message than wait for plymouth */
+        if (r >= 0)
+                return 0;
+        return log_full_errno(ERRNO_IS_NO_PLYMOUTH(r) ? LOG_DEBUG : LOG_WARNING, r,
+                              "Failed to communicate with plymouth: %m");
+}
+
+static int parse_argv(int argc, char * argv[]) { /* parses the command line; a positive return tells run() to proceed */
+
+        enum {
+                ARG_VERSION = 0x100, /* identifier for the long-only --version option */
+        };
+
+        static const struct option options[] = {
+                { "help",    no_argument, NULL, 'h'         },
+                { "version", no_argument, NULL, ARG_VERSION },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case '?': /* getopt_long() reports an unrecognized option this way */
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (optind < argc) /* positional arguments are left over */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "%s takes no argument.",
+                                       program_invocation_short_name);
+        return 1;
+}
+
+static int run(int argc, char *argv[]) { /* warns about a low battery, waits, re-checks, and returns non-zero if it is still low */
+        _cleanup_free_ char *plymouth_message = NULL;
+        _cleanup_close_ int fd = -EBADF; /* console fd; remains negative if the console cannot be opened */
+        int r;
+
+        log_setup();
+
+        r = proc_cmdline_get_bool("systemd.battery-check", PROC_CMDLINE_STRIP_RD_PREFIX|PROC_CMDLINE_TRUE_WHEN_MISSING, &arg_doit);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse systemd.battery-check= kernel command line option, ignoring: %m");
+
+        r = parse_argv(argc, argv);
+        if (r <= 0) /* error, or nothing left to do (e.g. the help text was shown) */
+                return r;
+
+        if (!arg_doit) {
+                log_info("Checking battery status and AC power existence is disabled by the kernel command line, skipping execution.");
+                return 0;
+        }
+
+        r = battery_is_discharging_and_low();
+        if (r < 0) {
+                log_warning_errno(r, "Failed to check battery status, ignoring: %m");
+                return 0;
+        }
+        if (r == 0) /* going by the helper's name: battery is not both discharging and low */
+                return 0;
+        log_struct(LOG_EMERG,
+                   LOG_MESSAGE("%s " BATTERY_LOW_MESSAGE, special_glyph(SPECIAL_GLYPH_LOW_BATTERY)),
+                   "MESSAGE_ID=" SD_MESSAGE_BATTERY_LOW_WARNING_STR);
+
+        fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+        if (fd < 0)
+                log_warning_errno(fd, "Failed to open console, ignoring: %m");
+        else
+                dprintf(fd, ANSI_HIGHLIGHT_RED "%s " BATTERY_LOW_MESSAGE ANSI_NORMAL "\n",
+                        special_glyph_full(SPECIAL_GLYPH_LOW_BATTERY, /* force_utf = */ false));
+
+        if (asprintf(&plymouth_message, "%s " BATTERY_LOW_MESSAGE,
+                     special_glyph_full(SPECIAL_GLYPH_LOW_BATTERY, /* force_utf = */ true)) < 0)
+                return log_oom();
+
+        (void) plymouth_send_message("shutdown", plymouth_message);
+
+        usleep_safe(10 * USEC_PER_SEC); /* the grace period announced in BATTERY_LOW_MESSAGE */
+
+        r = battery_is_discharging_and_low();
+        if (r < 0)
+                return log_warning_errno(r, "Failed to check battery status, assuming not charged yet, powering off: %m");
+        if (r > 0) {
+                log_struct(LOG_EMERG,
+                           LOG_MESSAGE("Battery level critically low, powering off."),
+                           "MESSAGE_ID=" SD_MESSAGE_BATTERY_LOW_POWEROFF_STR);
+                return r; /* positive; presumably turned into a failure exit status by the main-function macro — confirm */
+        }
+
+        log_info(BATTERY_RESTORED_MESSAGE);
+        if (fd >= 0)
+                dprintf(fd, BATTERY_RESTORED_MESSAGE "\n");
+        (void) plymouth_send_message("boot-up", BATTERY_RESTORED_MESSAGE);
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/battery-check/meson.build b/src/battery-check/meson.build
new file mode 100644
index 0000000..370d4d4
--- /dev/null
+++ b/src/battery-check/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [ # adds systemd-battery-check, built from battery-check.c, on top of libexec_template
+        libexec_template + {
+                'name' : 'systemd-battery-check',
+                'public' : true,
+                'sources' : files('battery-check.c'),
+        },
+]
diff --git a/src/binfmt/binfmt.c b/src/binfmt/binfmt.c
new file mode 100644
index 0000000..d21f3f7
--- /dev/null
+++ b/src/binfmt/binfmt.c
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "binfmt-util.h"
+#include "build.h"
+#include "conf-files.h"
+#include "constants.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "main-func.h"
+#include "pager.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "string-util.h"
+#include "strv.h"
+
+static CatFlags arg_cat_flags = CAT_CONFIG_OFF; /* changed by --cat-config and --tldr, see parse_argv() */
+static PagerFlags arg_pager_flags = 0; /* --no-pager adds PAGER_DISABLE */
+static bool arg_unregister = false; /* set by --unregister */
+
+static int delete_rule(const char *rulename) { /* removes one rule by writing "-1" to its file below /proc/sys/fs/binfmt_misc/ */
+        const char *fn = strjoina("/proc/sys/fs/binfmt_misc/", rulename);
+        return write_string_file(fn, "-1", WRITE_STRING_FILE_DISABLE_BUFFER);
+}
+
+static int apply_rule(const char *filename, unsigned line, const char *rule) { /* registers one rule line; filename/line are used for log messages only */
+        assert(filename);
+        assert(line > 0);
+        assert(rule);
+        assert(rule[0]);
+
+        _cleanup_free_ char *rulename = NULL;
+        int r;
+
+        rulename = strdupcspn(rule + 1, CHAR_TO_STR(rule[0])); /* rule[0] is the delimiter; presumably copies the text up to its next occurrence */
+        if (!rulename)
+                return log_oom();
+
+        if (!filename_is_valid(rulename) ||
+            STR_IN_SET(rulename, "register", "status")) /* these two names are the control files written by this program */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "%s:%u: Rule name '%s' is not valid, refusing.",
+                                       filename, line, rulename);
+        r = delete_rule(rulename); /* drop an already registered rule of the same name first */
+        if (r < 0 && r != -ENOENT) /* -ENOENT is not worth a warning: presumably no such rule existed */
+                log_warning_errno(r, "%s:%u: Failed to delete rule '%s', ignoring: %m",
+                                  filename, line, rulename);
+        if (r >= 0)
+                log_debug("%s:%u: Rule '%s' deleted.", filename, line, rulename);
+
+        r = write_string_file("/proc/sys/fs/binfmt_misc/register", rule, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                return log_error_errno(r, "%s:%u: Failed to add binary format '%s': %m",
+                                       filename, line, rulename);
+
+        log_debug("%s:%u: Binary format '%s' registered.", filename, line, rulename);
+        return 0;
+}
+
+static int apply_file(const char *filename, bool ignore_enoent) { /* applies every rule line of one binfmt.d file */
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *pp = NULL; /* path reported back by search_and_fopen(), used in log messages */
+        int r;
+
+        assert(filename);
+
+        r = search_and_fopen(filename, "re", NULL, (const char**) CONF_PATHS_STRV("binfmt.d"), &f, &pp);
+        if (r < 0) {
+                if (ignore_enoent && r == -ENOENT) /* caller asked to treat a missing file as success */
+                        return 0;
+
+                return log_error_errno(r, "Failed to open file '%s': %m", filename);
+        }
+
+        log_debug("Applying %s%s", pp, special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+        for (unsigned line = 1;; line++) { /* line is the 1-based line number passed on for log messages */
+                _cleanup_free_ char *text = NULL;
+                int k;
+
+                k = read_stripped_line(f, LONG_LINE_MAX, &text);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to read file '%s': %m", pp);
+                if (k == 0) /* presumably end of file */
+                        break;
+
+                if (isempty(text))
+                        continue;
+                if (strchr(COMMENTS, text[0])) /* skip lines starting with one of the COMMENTS characters */
+                        continue;
+
+                RET_GATHER(r, apply_rule(filename, line, text)); /* keep going after a failed rule; presumably the first error is kept in r */
+        }
+
+        return r;
+}
+
+static int cat_config(char **files) { /* opens the pager with arg_pager_flags, then shows the files via cat_files() */
+        pager_open(arg_pager_flags);
+
+        return cat_files(NULL, files, arg_cat_flags);
+}
+
+/* Prints the usage text of systemd-binfmt to stdout. Returns 0, or the result of log_oom() if the man
+ * page link could not be generated. */
+static int help(void) {
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-binfmt.service", "8", &man_link) < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] [CONFIGURATION FILE...]\n\n"
+               "Registers binary formats with the kernel.\n\n"
+               "  -h --help             Show this help\n"
+               "     --version          Show package version\n"
+               "     --cat-config       Show configuration files\n"
+               "     --tldr             Show non-comment parts of configuration\n"
+               "     --no-pager         Do not pipe output into a pager\n"
+               "     --unregister       Unregister all existing entries\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               man_link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) { /* parses the command line into the arg_* globals; a positive return tells run() to proceed */
+        enum {
+                ARG_VERSION = 0x100, /* identifiers for long-only options start here */
+                ARG_CAT_CONFIG,
+                ARG_TLDR,
+                ARG_NO_PAGER,
+                ARG_UNREGISTER,
+        };
+
+        static const struct option options[] = {
+                { "help",       no_argument, NULL, 'h'            },
+                { "version",    no_argument, NULL, ARG_VERSION    },
+                { "cat-config", no_argument, NULL, ARG_CAT_CONFIG },
+                { "tldr",       no_argument, NULL, ARG_TLDR       },
+                { "no-pager",   no_argument, NULL, ARG_NO_PAGER   },
+                { "unregister", no_argument, NULL, ARG_UNREGISTER },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_CAT_CONFIG:
+                        arg_cat_flags = CAT_CONFIG_ON;
+                        break;
+
+                case ARG_TLDR:
+                        arg_cat_flags = CAT_TLDR;
+                        break;
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_UNREGISTER:
+                        arg_unregister = true;
+                        break;
+
+                case '?': /* getopt_long() reports an unrecognized option this way */
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if ((arg_unregister || arg_cat_flags != CAT_CONFIG_OFF) && argc > optind) /* these modes accept no file arguments */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Positional arguments are not allowed with --cat-config/--tldr or --unregister.");
+
+        return 1;
+}
+
+static int binfmt_mounted_warn(void) {
+        int r;
+
+        r = binfmt_mounted();
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if /proc/sys/fs/binfmt_misc is mounted: %m");
+        if (r == 0)
+                log_debug("/proc/sys/fs/binfmt_misc is not mounted in read-write mode, skipping.");
+
+        return r;
+}
+
+/* Program logic behind main(): parses the command line, then either unregisters all entries, applies the
+ * files given as arguments, or flushes the kernel's rules and applies all binfmt.d .conf files. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        log_setup();
+        umask(0022);
+
+        if (arg_unregister)
+                return disable_binfmt();
+
+        if (argc > optind) {
+                r = binfmt_mounted_warn();
+                if (r <= 0)
+                        return r;
+
+                for (int i = optind; i < argc; i++)
+                        RET_GATHER(r, apply_file(argv[i], false));
+
+        } else {
+                _cleanup_strv_free_ char **files = NULL;
+
+                r = conf_files_list_strv(&files, ".conf", NULL, 0, (const char**) CONF_PATHS_STRV("binfmt.d"));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to enumerate binfmt.d files: %m");
+
+                if (arg_cat_flags != CAT_CONFIG_OFF)
+                        return cat_config(files);
+
+                r = binfmt_mounted_warn();
+                if (r <= 0)
+                        return r;
+
+                /* Flush out all rules */
+                r = write_string_file("/proc/sys/fs/binfmt_misc/status", "-1", WRITE_STRING_FILE_DISABLE_BUFFER);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to flush binfmt_misc rules, ignoring: %m");
+                        r = 0; /* really ignore it: this error must not become the result of run() */
+                } else
+                        log_debug("Flushed all binfmt_misc rules.");
+
+                STRV_FOREACH(f, files)
+                        RET_GATHER(r, apply_file(*f, true));
+        }
+
+        return r;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/binfmt/meson.build b/src/binfmt/meson.build
new file mode 100644
index 0000000..4496340
--- /dev/null
+++ b/src/binfmt/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [ # adds systemd-binfmt, built from binfmt.c, on top of libexec_template
+        libexec_template + {
+                'name' : 'systemd-binfmt',
+                'public' : true,
+                'conditions' : ['ENABLE_BINFMT'],
+                'sources' : files('binfmt.c'),
+        },
+]
+
+if conf.get('ENABLE_BINFMT') == 1 # install the empty binfmt.d directories only when ENABLE_BINFMT is 1
+        install_emptydir(binfmtdir)
+        if install_sysconfdir
+                install_emptydir(sysconfdir / 'binfmt.d')
+        endif
+endif
diff --git a/src/boot/bless-boot-generator.c b/src/boot/bless-boot-generator.c
new file mode 100644
index 0000000..38b2c3a
--- /dev/null
+++ b/src/boot/bless-boot-generator.c
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "efi-loader.h"
+#include "generator.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "mkdir.h"
+#include "special.h"
+#include "string-util.h"
+#include "virt.h"
+
+/* This generator pulls systemd-bless-boot.service into the initial transaction if the "LoaderBootCountPath"
+ * EFI variable is set, i.e. the system boots up with boot counting in effect, which means we should mark the
+ * boot as "good" if we manage to boot up far enough. */
+
+/* Generator entry point; only dest_early is used. Returns 0 both when the unit was hooked in and when skipped. */
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        if (in_initrd()) {
+                log_debug("Skipping generator, running in the initrd.");
+                return 0;
+        }
+
+        if (detect_container() > 0) {
+                log_debug("Skipping generator, running in a container.");
+                return 0;
+        }
+
+        if (!is_efi_boot()) {
+                log_debug("Skipping generator, not an EFI boot.");
+                return 0;
+        }
+
+        if (access(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderBootCountPath)), F_OK) < 0) {
+                /* A missing variable is the normal "no boot counting" case; only other errors are reported. */
+                if (errno == ENOENT) {
+                        log_debug_errno(errno, "Skipping generator, not booted with boot counting in effect.");
+                        return 0;
+                }
+
+                return log_error_errno(errno, "Failed to check if LoaderBootCountPath EFI variable exists: %m");
+        }
+
+        /* We pull this in from basic.target so that it ends up in all "regular" boot ups, but not in
+         * rescue.target or even emergency.target. */
+        const char *p = strjoina(dest_early, "/" SPECIAL_BASIC_TARGET ".wants/systemd-bless-boot.service");
+        (void) mkdir_parents(p, 0755);
+        if (symlink(SYSTEM_DATA_UNIT_DIR "/systemd-bless-boot.service", p) < 0)
+                return log_error_errno(errno, "Failed to create symlink '%s': %m", p);
+
+        return 0;
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run);
diff --git a/src/boot/bless-boot.c b/src/boot/bless-boot.c
new file mode 100644
index 0000000..0c0b4f2
--- /dev/null
+++ b/src/boot/bless-boot.c
@@ -0,0 +1,527 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "bootspec.h"
+#include "build.h"
+#include "devnum-util.h"
+#include "efi-api.h"
+#include "efi-loader.h"
+#include "efivars.h"
+#include "fd-util.h"
+#include "find-esp.h"
+#include "fs-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "sync-util.h"
+#include "terminal-util.h"
+#include "verbs.h"
+#include "virt.h"
+
+static char **arg_path = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_path, strv_freep);
+
+static int help(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text; also registered as the "help" verb, its arguments are not used. */
+        r = terminal_urlify_man("systemd-bless-boot.service", "8", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] COMMAND\n"
+               "\n%sMark the boot process as good or bad.%s\n"
+               "\nCommands:\n"
+               "     status          Show status of current boot loader entry\n"
+               "     good            Mark this boot as good\n"
+               "     bad             Mark this boot as bad\n"
+               "     indeterminate   Undo any marking as good or bad\n"
+               "\nOptions:\n"
+               "  -h --help          Show this help\n"
+               "     --version       Print version\n"
+               "     --path=PATH     Path to the $BOOT partition (may be used multiple times)\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_PATH = 0x100,
+                ARG_VERSION,
+        };
+
+        static const struct option options[] = {
+                { "help",         no_argument,       NULL, 'h'              },
+                { "version",      no_argument,       NULL, ARG_VERSION      },
+                { "path",         required_argument, NULL, ARG_PATH         },
+                {}
+        };
+
+        int c, r;
+        /* Parses the options; every --path= value is appended to arg_path. */
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+                switch (c) {
+
+                case 'h':
+                        help(0, NULL, NULL);
+                        return 0; /* help was shown; run() returns right away on any result <= 0 */
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_PATH:
+                        r = strv_extend(&arg_path, optarg);
+                        if (r < 0)
+                                return log_oom();
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1; /* positive: all options consumed, run() goes on to dispatch the verb */
+}
+
+static int acquire_path(void) {
+        _cleanup_free_ char *esp_path = NULL, *xbootldr_path = NULL;
+        dev_t esp_devid = 0, xbootldr_devid = 0;
+        char **a;
+        int r;
+        /* Fills arg_path with the ESP and/or XBOOTLDR paths, unless --path= already put entries there. */
+        if (!strv_isempty(arg_path))
+                return 0;
+
+        r = find_esp_and_warn(NULL, NULL, /* unprivileged_mode= */ false, &esp_path, NULL, NULL, NULL, NULL, &esp_devid);
+        if (r < 0 && r != -ENOKEY) /* ENOKEY means not found, and is the only error the function won't log about on its own */
+                return r;
+
+        r = find_xbootldr_and_warn(NULL, NULL, /* unprivileged_mode= */ false, &xbootldr_path, NULL, &xbootldr_devid);
+        if (r < 0 && r != -ENOKEY)
+                return r;
+
+        if (!esp_path && !xbootldr_path)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Couldn't find $BOOT partition. It is recommended to mount it to /boot.\n"
+                                       "Alternatively, use --path= to specify path to mount point.");
+
+        if (esp_path && xbootldr_path && !devnum_set_and_equal(esp_devid, xbootldr_devid)) /* in case the two paths refer to the same inode, suppress one */
+                a = strv_new(esp_path, xbootldr_path);
+        else if (esp_path)
+                a = strv_new(esp_path);
+        else
+                a = strv_new(xbootldr_path);
+        if (!a)
+                return log_oom();
+
+        strv_free_and_replace(arg_path, a);
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *j = NULL;
+                /* Joining may fail and leave j NULL, hence strna() when printing */
+                j = strv_join(arg_path, ":");
+                log_debug("Using %s as boot loader drop-in search path.", strna(j));
+        }
+
+        return 0;
+}
+
+static int parse_counter(
+                const char *path,
+                const char **p,
+                uint64_t *ret_left,
+                uint64_t *ret_done) {
+
+        uint64_t left, done;
+        const char *z, *e;
+        size_t k;
+        int r;
+        /* Parses "+LEFT[-DONE]" at *p (which must point to the '+'); 'path' is only used in log messages. */
+        assert(path);
+        assert(p);
+
+        e = *p;
+        assert(e);
+        assert(*e == '+');
+
+        e++;
+
+        k = strspn(e, DIGITS);
+        if (k == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Can't parse empty 'tries left' counter from LoaderBootCountPath: %s",
+                                       path);
+
+        z = strndupa_safe(e, k);
+        r = safe_atou64(z, &left);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse 'tries left' counter from LoaderBootCountPath: %s", path);
+
+        e += k;
+
+        if (*e == '-') {
+                e++;
+
+                k = strspn(e, DIGITS);
+                if (k == 0) /* If there's a "-" there also needs to be at least one digit */
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Can't parse empty 'tries done' counter from LoaderBootCountPath: %s",
+                                               path);
+
+                z = strndupa_safe(e, k);
+                r = safe_atou64(z, &done);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse 'tries done' counter from LoaderBootCountPath: %s", path);
+
+                e += k;
+        } else
+                done = 0;
+
+        if (done == 0)
+                log_warning("The 'tries done' counter is currently at zero. This can't really be, after all we are running, and this boot must hence count as one. Proceeding anyway.");
+        /* Only on success is the caller's cursor moved, to the first character after the counter */
+        *p = e;
+
+        if (ret_left)
+                *ret_left = left;
+
+        if (ret_done)
+                *ret_done = done;
+
+        return 0;
+}
+
+static int acquire_boot_count_path(
+                char **ret_path,
+                char **ret_prefix,
+                uint64_t *ret_left,
+                uint64_t *ret_done,
+                char **ret_suffix) {
+
+        _cleanup_free_ char *path = NULL, *prefix = NULL, *suffix = NULL;
+        uint64_t left, done;
+        const char *counter;
+        int r;
+        /* Splits LoaderBootCountPath into prefix, the two counters and suffix; each output may be NULL. */
+
+        r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderBootCountPath), &path);
+        if (r == -ENOENT)
+                return -EUNATCH; /* in this case, let the caller print a message */
+        if (r < 0)
+                return log_error_errno(r, "Failed to read LoaderBootCountPath EFI variable: %m");
+
+        efi_tilt_backslashes(path);
+
+        if (!path_is_normalized(path))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Path read from LoaderBootCountPath is not normalized, refusing: %s",
+                                       path);
+
+        if (!path_is_absolute(path))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Path read from LoaderBootCountPath is not absolute, refusing: %s",
+                                       path);
+
+        counter = strrchr(last_path_component(path), '+'); /* last '+' of the final path component */
+        if (!counter)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Path read from LoaderBootCountPath does not contain a counter, refusing: %s",
+                                       path);
+
+        if (ret_prefix) {
+                prefix = strndup(path, counter - path);
+                if (!prefix)
+                        return log_oom();
+        }
+
+        r = parse_counter(path, &counter, &left, &done);
+        if (r < 0)
+                return r;
+
+        if (ret_suffix) {
+                suffix = strdup(counter); /* parse_counter() advanced this past the counter */
+                if (!suffix)
+                        return log_oom();
+        }
+
+        if (ret_path)
+                *ret_path = TAKE_PTR(path);
+        if (ret_prefix)
+                *ret_prefix = TAKE_PTR(prefix);
+        if (ret_suffix)
+                *ret_suffix = TAKE_PTR(suffix);
+        if (ret_left)
+                *ret_left = left;
+        if (ret_done)
+                *ret_done = done;
+
+        return 0;
+}
+
+static int make_good(const char *prefix, const char *suffix, char **ret) {
+        char *joined;
+
+        assert(prefix);
+        assert(suffix);
+        assert(ret);
+
+        /* The name for a boot marked as good is simply the original one with the counter pair removed, i.e.
+         * prefix and suffix glued together. Once a boot is known to be fine the counters are no longer needed,
+         * and the logs still tell how many tries it took. */
+
+        joined = strjoin(prefix, suffix);
+        if (!joined)
+                return -ENOMEM;
+
+        *ret = joined;
+        return 0;
+}
+
+static int make_bad(const char *prefix, uint64_t done, const char *suffix, char **ret) {
+        _cleanup_free_ char *renamed = NULL;
+
+        assert(prefix);
+        assert(suffix);
+        assert(ret);
+
+        /* The name for a boot marked as bad has its 'left' counter forced to zero. The 'done' counter is
+         * kept (unless it is zero, then it is left out), since boot loaders might want to look at it. */
+
+        if (done != 0) {
+                if (asprintf(&renamed, "%s+0-%" PRIu64 "%s", prefix, done, suffix) < 0)
+                        return -ENOMEM;
+        } else {
+                renamed = strjoin(prefix, "+0", suffix);
+                if (!renamed)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(renamed);
+        return 0;
+}
+
+static const char *skip_slash(const char *path) {
+        /* Drops the leading slash, so that the path can be used relative to a directory fd. */
+        assert(path);
+        assert(*path == '/');
+        return &path[1];
+}
+
+static int verb_status(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *path = NULL, *prefix = NULL, *suffix = NULL, *good = NULL, *bad = NULL;
+        uint64_t left, done;
+        int r;
+        /* Prints the boot state: "clean" without boot counting, else whichever of the three name variants exists. */
+        r = acquire_boot_count_path(&path, &prefix, &left, &done, &suffix);
+        if (r == -EUNATCH) { /* No boot count in place, then let's consider this a "clean" boot, as "good", "bad" or "indeterminate" don't apply. */
+                puts("clean");
+                return 0;
+        }
+        if (r < 0)
+                return r;
+
+        r = acquire_path();
+        if (r < 0)
+                return r;
+
+        r = make_good(prefix, suffix, &good);
+        if (r < 0)
+                return log_oom();
+
+        r = make_bad(prefix, done, suffix, &bad);
+        if (r < 0)
+                return log_oom();
+
+        log_debug("Booted file: %s\n"
+                  "The same modified for 'good': %s\n"
+                  "The same modified for 'bad':  %s\n",
+                  path,
+                  good,
+                  bad);
+
+        log_debug("Tries left: %" PRIu64"\n"
+                  "Tries done: %" PRIu64"\n",
+                  left, done);
+
+        STRV_FOREACH(p, arg_path) {
+                _cleanup_close_ int fd = -EBADF;
+                /* A search path entry that does not exist is skipped, any other open() failure is fatal */
+                fd = open(*p, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+                if (fd < 0) {
+                        if (errno == ENOENT)
+                                continue;
+
+                        return log_error_errno(errno, "Failed to open $BOOT partition '%s': %m", *p);
+                }
+
+                if (faccessat(fd, skip_slash(path), F_OK, 0) >= 0) {
+                        puts("indeterminate");
+                        return 0;
+                }
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to check if '%s' exists: %m", path);
+
+                if (faccessat(fd, skip_slash(good), F_OK, 0) >= 0) {
+                        puts("good");
+                        return 0;
+                }
+
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to check if '%s' exists: %m", good);
+
+                if (faccessat(fd, skip_slash(bad), F_OK, 0) >= 0) {
+                        puts("bad");
+                        return 0;
+                }
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to check if '%s' exists: %m", bad);
+
+                /* We didn't find any of the three? If so, let's try the next directory, before we give up. */
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Couldn't determine boot state: %m");
+}
+
+static int verb_set(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *path = NULL, *prefix = NULL, *suffix = NULL, *good = NULL, *bad = NULL;
+        const char *target, *source1, *source2;
+        uint64_t done;
+        int r;
+        /* Shared by the "good", "bad" and "indeterminate" verbs (argv[0]): renames the boot entry file accordingly. */
+        r = acquire_boot_count_path(&path, &prefix, NULL, &done, &suffix);
+        if (r == -EUNATCH) /* acquire_boot_count_path() won't log on its own for this specific error */
+                return log_error_errno(r, "Not booted with boot counting in effect.");
+        if (r < 0)
+                return r;
+
+        r = acquire_path();
+        if (r < 0)
+                return r;
+
+        r = make_good(prefix, suffix, &good);
+        if (r < 0)
+                return log_oom();
+
+        r = make_bad(prefix, done, suffix, &bad);
+        if (r < 0)
+                return log_oom();
+
+        /* Figure out what rename to what */
+        if (streq(argv[0], "good")) {
+                target = good;
+                source1 = path;
+                source2 = bad;      /* Maybe this boot was previously marked as 'bad'? */
+        } else if (streq(argv[0], "bad")) {
+                target = bad;
+                source1 = path;
+                source2 = good;     /* Maybe this boot was previously marked as 'good'? */
+        } else {
+                assert(streq(argv[0], "indeterminate"));
+                target = path;
+                source1 = good;
+                source2 = bad;
+        }
+
+        STRV_FOREACH(p, arg_path) {
+                _cleanup_close_ int fd = -EBADF;
+
+                fd = open(*p, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open $BOOT partition '%s': %m", *p);
+
+                r = rename_noreplace(fd, skip_slash(source1), fd, skip_slash(target));
+                if (r == -EEXIST)
+                        goto exists;
+                if (r == -ENOENT) {
+
+                        r = rename_noreplace(fd, skip_slash(source2), fd, skip_slash(target));
+                        if (r == -EEXIST)
+                                goto exists;
+                        if (r == -ENOENT) {
+
+                                if (faccessat(fd, skip_slash(target), F_OK, 0) >= 0) /* Hmm, if we can't find either source file, maybe the destination already exists? */
+                                        goto exists;
+
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno, "Failed to determine if %s already exists: %m", target);
+
+                                /* We found none of the snippets here, try the next directory */
+                                continue;
+                        }
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to rename '%s' to '%s': %m", source2, target);
+
+                        log_debug("Successfully renamed '%s' to '%s'.", source2, target);
+                } else if (r < 0)
+                        return log_error_errno(r, "Failed to rename '%s' to '%s': %m", source1, target);
+                else
+                        log_debug("Successfully renamed '%s' to '%s'.", source1, target);
+
+                /* First, fsync() the directory these files are located in */
+                r = fsync_parent_at(fd, skip_slash(target));
+                if (r < 0) /* the error is carried in r here; errno may hold something unrelated */
+                        log_debug_errno(r, "Failed to synchronize image directory, ignoring: %m");
+
+                /* Secondly, syncfs() the whole file system these files are located in */
+                if (syncfs(fd) < 0)
+                        log_debug_errno(errno, "Failed to synchronize $BOOT partition, ignoring: %m");
+
+                log_info("Marked boot as '%s'. (Boot attempt counter is at %" PRIu64".)", argv[0], done);
+                return 0;
+        }
+
+        log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Can't find boot counter source file for '%s': %m", target);
+        return 1; /* NOTE(review): positive rather than a negative errno — confirm the resulting exit status is intended */
+
+exists:
+        log_debug("Operation already executed before, not doing anything.");
+        return 0;
+}
+
+static int run(int argc, char *argv[]) {
+        static const Verb verbs[] = {
+                { "help",          VERB_ANY, VERB_ANY, 0,            help        },
+                { "status",        VERB_ANY, 1,        VERB_DEFAULT, verb_status },
+                { "good",          VERB_ANY, 1,        0,            verb_set    },
+                { "bad",           VERB_ANY, 1,        0,            verb_set    },
+                { "indeterminate", VERB_ANY, 1,        0,            verb_set    },
+                {}
+        };
+
+        int r;
+        /* Parses the command line, refuses containers and non-EFI boots, then dispatches the verb. */
+        log_parse_environment();
+        log_open();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0) /* error, or --help/--version already handled */
+                return r;
+
+        if (detect_container() > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Marking a boot is not supported in containers.");
+
+        if (!is_efi_boot())
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Marking a boot is only supported on EFI systems.");
+
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/boot/boot-check-no-failures.c b/src/boot/boot-check-no-failures.c
new file mode 100644
index 0000000..4ff91cb
--- /dev/null
+++ b/src/boot/boot-check-no-failures.c
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "bus-error.h"
+#include "log.h"
+#include "main-func.h"
+#include "pretty-print.h"
+#include "terminal-util.h"
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text; 'link' is the man page reference shown in its last line. */
+        r = terminal_urlify_man("systemd-boot-check-no-failures.service", "8", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n"
+               "\n%sVerify system operational state.%s\n\n"
+               "  -h --help          Show this help\n"
+               "     --version       Print version\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Handles -h/--help and --version. Returns 1 if nothing but valid options was seen; run() stops on <= 0. */
+        enum {
+                ARG_VERSION = 0x100,
+        };
+
+        static const struct option options[] = {
+                { "help",         no_argument,       NULL, 'h'              },
+                { "version",      no_argument,       NULL, ARG_VERSION      },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+                switch (c) {
+
+                case 'h':
+                        help();
+                        return 0;
+
+                case ARG_VERSION:
+                        return version();
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        uint32_t n_failed;
+        int r;
+        /* Returns 1 if the manager reports failed units, 0 if not, or what the failing step returned. */
+        log_parse_environment();
+        log_open();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        r = sd_bus_open_system(&bus);
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect to system bus: %m");
+
+        r = sd_bus_get_property_trivial(
+                        bus,
+                        "org.freedesktop.systemd1",
+                        "/org/freedesktop/systemd1",
+                        "org.freedesktop.systemd1.Manager",
+                        "NFailedUnits",
+                        &error,
+                        'u',
+                        &n_failed);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get failed units counter: %s", bus_error_message(&error, r));
+
+        if (n_failed == 0)
+                log_info("Health check: no failed units.");
+        else
+                log_notice("Health check: %" PRIu32 " units have failed.", n_failed);
+
+        return n_failed == 0 ? 0 : 1;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/boot/bootctl-install.c b/src/boot/bootctl-install.c
new file mode 100644
index 0000000..bacbbb2
--- /dev/null
+++ b/src/boot/bootctl-install.c
@@ -0,0 +1,1101 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bootctl.h"
+#include "bootctl-install.h"
+#include "bootctl-random-seed.h"
+#include "bootctl-util.h"
+#include "chase.h"
+#include "copy.h"
+#include "dirent-util.h"
+#include "efi-api.h"
+#include "env-file.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "id128-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "stat-util.h"
+#include "sync-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "utf8.h"
+
+static int load_etc_machine_id(void) {
+        int r;
+
+        r = sd_id128_get_machine(&arg_machine_id);
+        if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) /* Not set or empty */
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to get machine-id: %m");
+
+        log_debug("Loaded machine ID %s from /etc/machine-id.", SD_ID128_TO_STRING(arg_machine_id));
+        return 0;
+}
+
+static int load_etc_machine_info(void) {
+        /* Since systemd v250 /etc/machine-info may carry the kernel-install layout and the machine ID to
+         * use when setting up the ESP. That is deprecated by now: /etc/kernel/entry-token and the layout=
+         * field of /etc/kernel/install.conf are the preferred places. The two fields are still honoured
+         * here for compatibility, with a notice asking to move them (unless arg_quiet is set). */
+        _cleanup_free_ char *info_path = NULL, *machine_id = NULL, *layout = NULL;
+        int r;
+
+        info_path = path_join(arg_root, "etc/machine-info");
+        if (!info_path)
+                return log_oom();
+
+        r = parse_env_file(NULL, info_path,
+                           "KERNEL_INSTALL_LAYOUT", &layout,
+                           "KERNEL_INSTALL_MACHINE_ID", &machine_id);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse /etc/machine-info: %m");
+
+        if (!isempty(machine_id)) {
+                if (!arg_quiet)
+                        log_notice("Read $KERNEL_INSTALL_MACHINE_ID from /etc/machine-info. "
+                                   "Please move it to /etc/kernel/entry-token.");
+
+                r = sd_id128_from_string(machine_id, &arg_machine_id);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse KERNEL_INSTALL_MACHINE_ID=%s in /etc/machine-info: %m", machine_id);
+
+                log_debug("Loaded KERNEL_INSTALL_MACHINE_ID=%s from /etc/machine-info.",
+                          SD_ID128_TO_STRING(arg_machine_id));
+        }
+
+        if (isempty(layout))
+                return 0;
+
+        if (!arg_quiet)
+                log_notice("Read $KERNEL_INSTALL_LAYOUT from /etc/machine-info. "
+                           "Please move it to the layout= setting of /etc/kernel/install.conf.");
+
+        log_debug("KERNEL_INSTALL_LAYOUT=%s is specified in /etc/machine-info.", layout);
+        free_and_replace(arg_install_layout, layout);
+        return 0;
+}
+
+static int load_etc_kernel_install_conf(void) {
+        _cleanup_free_ char *value = NULL, *conf_path = NULL;
+        int r;
+
+        /* Picks up the layout= setting from install.conf in the etc_kernel() directory below arg_root. */
+        conf_path = path_join(arg_root, etc_kernel(), "install.conf");
+        if (!conf_path)
+                return log_oom();
+
+        r = parse_env_file(NULL, conf_path, "layout", &value);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse %s: %m", conf_path);
+
+        if (isempty(value))
+                return 0;
+        log_debug("layout=%s is specified in %s.", value, conf_path);
+        free_and_replace(arg_install_layout, value);
+        return 0;
+}
+
+static bool use_boot_loader_spec_type1(void) {
+        /* Boot Loader Specification Type #1 is assumed as the format of our boot loader entries both when
+         * no layout was configured at all and when the layout was explicitly set to "bls". */
+        return arg_install_layout ? streq(arg_install_layout, "bls") : true;
+}
+
+static int settle_make_entry_directory(void) {
+        int r;
+        /* Loads machine ID, layout and entry token settings, then settles arg_make_entry_directory if it is < 0. */
+        r = load_etc_machine_id();
+        if (r < 0)
+                return r;
+
+        r = load_etc_machine_info();
+        if (r < 0)
+                return r;
+
+        r = load_etc_kernel_install_conf();
+        if (r < 0)
+                return r;
+
+        r = settle_entry_token();
+        if (r < 0)
+                return r;
+
+        bool type1 = use_boot_loader_spec_type1();
+        if (arg_make_entry_directory < 0) { /* Automatic mode */
+                if (!type1)
+                        arg_make_entry_directory = false;
+                else if (arg_entry_token_type != BOOT_ENTRY_TOKEN_MACHINE_ID)
+                        arg_make_entry_directory = true;
+                else {
+                        r = path_is_temporary_fs("/etc/machine-id");
+                        if (r < 0)
+                                return log_debug_errno(r, "Couldn't determine whether /etc/machine-id is on a temporary file system: %m");
+
+                        arg_make_entry_directory = r == 0;
+                }
+        }
+
+        if (arg_make_entry_directory > 0 && !type1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "KERNEL_INSTALL_LAYOUT=%s is configured, but Boot Loader Specification Type #1 entry directory creation was requested.",
+                                       arg_install_layout);
+
+        return 0;
+}
+
+static int compare_product(const char *a, const char *b) {
+        size_t x, y;
+
+        assert(a);
+        assert(b);
+
+        /* Compares only the first word of each string: by length first, then bytewise via strncmp(). */
+        x = strcspn(a, " ");
+        y = strcspn(b, " ");
+        if (x != y)
+                return x < y ? -1 : 1; /* the lengths differ, so "equal" cannot happen here */
+
+        return strncmp(a, b, x);
+}
+
+static int compare_version(const char *a, const char *b) {
+        assert(a);
+        assert(b);
+
+        /* On both strings skip the first space-delimited word and the run of spaces following it. */
+        size_t skip_a = strcspn(a, " "), skip_b = strcspn(b, " ");
+        skip_a += strspn(a + skip_a, " ");
+        skip_b += strspn(b + skip_b, " ");
+
+        return strverscmp_improved(a + skip_a, b + skip_b);
+}
+
+static int version_check(int fd_from, const char *from, int fd_to, const char *to) {
+        _cleanup_free_ char *a = NULL, *b = NULL;
+        int r;
+        /* Succeeds (0) only if both files are from the same product and "from" carries the newer version. */
+        assert(fd_from >= 0);
+        assert(from);
+        assert(fd_to >= 0);
+        assert(to);
+
+        r = get_file_version(fd_from, &a);
+        if (r == -ESRCH)
+                return log_notice_errno(r, "Source file \"%s\" does not carry version information!", from);
+        if (r < 0)
+                return r;
+
+        r = get_file_version(fd_to, &b);
+        if (r == -ESRCH)
+                return log_notice_errno(r, "Skipping \"%s\", it's owned by another boot loader (no version info found).",
+                                        to);
+        if (r < 0)
+                return r;
+        if (compare_product(a, b) != 0)
+                return log_notice_errno(SYNTHETIC_ERRNO(ESRCH),
+                                        "Skipping \"%s\", it's owned by another boot loader.", to);
+
+        r = compare_version(a, b);
+        log_debug("Comparing versions: \"%s\" %s \"%s\"", a, comparison_operator(r), b);
+        if (r < 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(ESTALE),
+                                         "Skipping \"%s\", newer boot loader version in place already.", to);
+        if (r == 0)
+                return log_info_errno(SYNTHETIC_ERRNO(ESTALE),
+                                      "Skipping \"%s\", same boot loader version in place already.", to);
+
+        return 0;
+}
+
+static int copy_file_with_version_check(const char *from, const char *to, bool force) {
+        _cleanup_close_ int fd_from = -EBADF, fd_to = -EBADF;
+        _cleanup_free_ char *t = NULL;
+        int r;
+        /* Replaces 'to' with a copy of 'from': the data is written to a temporary file (named by
+         * tempfn_random() based on 'to'), synced, and then renamed over 'to'. Unless 'force' is set, an
+         * already existing 'to' is only replaced if version_check() approves; otherwise its negative
+         * result (-ESRCH, -ESTALE, ...) is returned as is. Returns 0 once the copy is in place. */
+        fd_from = open(from, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (fd_from < 0)
+                return log_error_errno(errno, "Failed to open \"%s\" for reading: %m", from);
+
+        if (!force) {
+                fd_to = open(to, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+                if (fd_to < 0) {
+                        if (errno != ENOENT) /* A missing destination is fine, there's nothing to compare against */
+                                return log_error_errno(errno, "Failed to open \"%s\" for reading: %m", to);
+                } else {
+                        r = version_check(fd_from, from, fd_to, to);
+                        if (r < 0)
+                                return r;
+                        /* NOTE(review): presumably version_check() advanced the read offset of fd_from,
+                         * hence the rewind before the data is copied below - confirm */
+                        if (lseek(fd_from, 0, SEEK_SET) < 0)
+                                return log_error_errno(errno, "Failed to seek in \"%s\": %m", from);
+
+                        fd_to = safe_close(fd_to);
+                }
+        }
+
+        r = tempfn_random(to, NULL, &t);
+        if (r < 0)
+                return log_oom();
+        /* O_CREAT|O_EXCL makes this fail if the temporary name exists already. NOTE(review): WITH_UMASK(0000)
+         * presumably clears the umask for the block, so that the file gets exactly mode 0644 - confirm */
+        WITH_UMASK(0000) {
+                fd_to = open(t, O_WRONLY|O_CREAT|O_CLOEXEC|O_EXCL|O_NOFOLLOW, 0644);
+                if (fd_to < 0)
+                        return log_error_errno(errno, "Failed to open \"%s\" for writing: %m", t);
+        }
+
+        r = copy_bytes(fd_from, fd_to, UINT64_MAX, COPY_REFLINK);
+        if (r < 0) {
+                (void) unlink(t);
+                return log_error_errno(r, "Failed to copy data from \"%s\" to \"%s\": %m", from, t);
+        }
+
+        (void) copy_times(fd_from, fd_to, 0); /* The result is ignored: a failure here doesn't fail the copy */
+
+        r = fsync_full(fd_to);
+        if (r < 0) {
+                (void) unlink(t);
+                return log_error_errno(r, "Failed to copy data from \"%s\" to \"%s\": %m", from, t);
+        }
+
+        r = RET_NERRNO(renameat(AT_FDCWD, t, AT_FDCWD, to));
+        if (r < 0) {
+                (void) unlink(t);
+                return log_error_errno(r, "Failed to rename \"%s\" to \"%s\": %m", t, to);
+        }
+
+        log_info("Copied \"%s\" to \"%s\".", from, to);
+
+        return 0;
+}
+
+static int mkdir_one(const char *prefix, const char *suffix) {
+        _cleanup_free_ char *p = NULL;
+
+        /* Creates the directory 'suffix' below 'prefix' with mode 0700 (subject to the umask). EEXIST is
+         * not treated as an error. Returns 0 on success, a negative errno-style value otherwise. */
+
+        p = path_join(prefix, suffix);
+        if (!p) /* Don't hand NULL to mkdir() if the path could not be allocated */
+                return log_oom();
+
+        if (mkdir(p, 0700) < 0) {
+                if (errno != EEXIST)
+                        return log_error_errno(errno, "Failed to create \"%s\": %m", p);
+        } else
+                log_info("Created \"%s\".", p);
+
+        return 0;
+}
+
+static const char *const esp_subdirs[] = {
+        /* The directories to place in the ESP. Parents are listed before their children: create_subdirs()
+         * walks the list front to back, remove_subdirs() removes in reverse order. */
+        "EFI",
+        "EFI/systemd",     /* copy_one_file() installs the boot loader binaries here */
+        "EFI/BOOT",        /* ... and the copy under the default boot loader name here */
+        "loader",          /* install_loader_config() puts loader.conf here */
+        NULL
+};
+
+static const char *const dollar_boot_subdirs[] = {
+        /* The directories to place in the XBOOTLDR partition or the ESP, depending what exists. Parents
+         * are listed before their children. */
+        "loader",          /* install_loader_specification() puts entries.srel here */
+        "loader/entries",  /* Type #1 entries */
+        "EFI",
+        "EFI/Linux",       /* Type #2 entries */
+        NULL
+};
+
+static int create_subdirs(const char *root, const char * const *subdirs) {
+        /* Creates each directory of the NULL-terminated list 'subdirs' below 'root' via mkdir_one(),
+         * stopping at the first failure and returning its error. Returns 0 if none of the calls
+         * failed. */
+
+        STRV_FOREACH(dir, subdirs) {
+                int k = mkdir_one(root, *dir);
+                if (k < 0)
+                        return k;
+        }
+
+        return 0;
+}
+
+
+static int copy_one_file(const char *esp_path, const char *name, bool force) {
+        char *root = IN_SET(arg_install_source, ARG_INSTALL_SOURCE_AUTO, ARG_INSTALL_SOURCE_IMAGE) ? arg_root : NULL;
+        _cleanup_free_ char *source_path = NULL, *dest_path = NULL, *p = NULL, *q = NULL;
+        const char *e;
+        char *dest_name, *s;
+        int r, ret;
+        /* Installs the file 'name' from BOOTLIBDIR as /EFI/systemd/<name> below 'esp_path', with a trailing
+         * ".signed" dropped from the destination name. If the destination name starts with "systemd-boot"
+         * the file is additionally installed as /EFI/BOOT/BOOT<rest of the name>, file name upper-cased.
+         * 'force' is handed to copy_file_with_version_check(). Returns the first copy's error if it failed,
+         * otherwise the result of the second copy (or 0 if there was none); failures to resolve a path
+         * are returned right away. */
+        dest_name = strdupa_safe(name);
+        s = endswith_no_case(dest_name, ".signed");
+        if (s)
+                *s = 0;
+        /* The source is looked up under its full name, i.e. including any ".signed" suffix */
+        p = path_join(BOOTLIBDIR, name);
+        if (!p)
+                return log_oom();
+
+        r = chase(p, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &source_path, NULL);
+        /* If we had a root directory to try, we didn't find it and we are in auto mode, retry on the host */
+        if (r == -ENOENT && root && arg_install_source == ARG_INSTALL_SOURCE_AUTO)
+                r = chase(p, NULL, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &source_path, NULL);
+        if (r < 0)
+                return log_error_errno(r,
+                                       "Failed to resolve path %s%s%s: %m",
+                                       p,
+                                       root ? " under directory " : "",
+                                       strempty(root));
+
+        q = path_join("/EFI/systemd/", dest_name);
+        if (!q)
+                return log_oom();
+
+        r = chase(q, esp_path, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_NONEXISTENT, &dest_path, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to resolve path %s under directory %s: %m", q, esp_path);
+
+        /* Note that if this fails we do the second copy anyway, but return this error code,
+         * so we stash it away in a separate variable. */
+        ret = copy_file_with_version_check(source_path, dest_path, force);
+        /* NOTE(review): startswith() presumably returns the remainder of the string after the prefix, and
+         * NULL if the prefix doesn't match - confirm */
+        e = startswith(dest_name, "systemd-boot");
+        if (e) {
+                _cleanup_free_ char *default_dest_path = NULL;
+                char *v;
+
+                /* Create the EFI default boot loader name (specified for removable devices) */
+                v = strjoina("/EFI/BOOT/BOOT", e);
+                ascii_strupper(strrchr(v, '/') + 1); /* Only the file name is upper-cased, not the directories */
+
+                r = chase(v, esp_path, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_NONEXISTENT, &default_dest_path, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to resolve path %s under directory %s: %m", v, esp_path);
+
+                r = copy_file_with_version_check(source_path, default_dest_path, force);
+                if (r < 0 && ret == 0)
+                        ret = r;
+        }
+
+        return ret;
+}
+
+static int install_binaries(const char *esp_path, const char *arch, bool force) {
+        char *root = IN_SET(arg_install_source, ARG_INSTALL_SOURCE_AUTO, ARG_INSTALL_SOURCE_IMAGE) ? arg_root : NULL;
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_free_ char *path = NULL;
+        int r;
+        /* Copies every file in BOOTLIBDIR whose name ends (case-insensitively) in <arch>.efi or
+         * <arch>.efi.signed into the ESP via copy_one_file(); a plain .efi file is skipped if a file of
+         * the same name plus ".signed" exists next to it. With arg_graceful set, a missing source
+         * directory and -ESTALE/-ESRCH results of individual copies are not treated as errors. The
+         * results of the individual copies are collected with RET_GATHER(), the loop carries on after a
+         * failed copy. */
+        r = chase_and_opendir(BOOTLIBDIR, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &path, &d);
+        /* If we had a root directory to try, we didn't find it and we are in auto mode, retry on the host */
+        if (r == -ENOENT && root && arg_install_source == ARG_INSTALL_SOURCE_AUTO)
+                r = chase_and_opendir(BOOTLIBDIR, NULL, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &path, &d);
+        if (r == -ENOENT && arg_graceful) {
+                log_debug("Source directory does not exist, ignoring.");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to open boot loader directory %s%s: %m", strempty(root), BOOTLIBDIR);
+        /* With an empty 'arch' (see verb_install()) these match any name ending in ".efi" or ".efi.signed" */
+        const char *suffix = strjoina(arch, ".efi");
+        const char *suffix_signed = strjoina(arch, ".efi.signed");
+
+        FOREACH_DIRENT(de, d, return log_error_errno(errno, "Failed to read \"%s\": %m", path)) {
+                int k;
+
+                if (!endswith_no_case(de->d_name, suffix) && !endswith_no_case(de->d_name, suffix_signed))
+                        continue;
+
+                /* skip the .efi file, if there's a .signed version of it */
+                if (endswith_no_case(de->d_name, ".efi")) {
+                        _cleanup_free_ const char *s = strjoin(de->d_name, ".signed");
+                        if (!s)
+                                return log_oom();
+                        if (faccessat(dirfd(d), s, F_OK, 0) >= 0)
+                                continue;
+                }
+
+                k = copy_one_file(esp_path, de->d_name, force);
+                /* Don't propagate an error code if no update necessary, installed version already equal or
+                 * newer version, or other boot loader in place. */
+                if (arg_graceful && IN_SET(k, -ESTALE, -ESRCH))
+                        continue;
+                RET_GATHER(r, k);
+        }
+
+        return r;
+}
+
+static int install_loader_config(const char *esp_path) {
+        _cleanup_(unlink_and_freep) char *t = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *p = NULL;
+        int r;
+        /* Creates <esp_path>/loader/loader.conf with default contents, unless the file exists already.
+         * Returns 1 if the file was created, 0 if it was there already, and a negative errno-style value
+         * on failure. */
+        assert(arg_make_entry_directory >= 0);
+
+        p = path_join(esp_path, "/loader/loader.conf");
+        if (!p)
+                return log_oom();
+        if (access(p, F_OK) >= 0) /* Silently skip creation if the file already exists (early check) */
+                return 0;
+
+        r = fopen_tmpfile_linkable(p, O_WRONLY|O_CLOEXEC, &t, &f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open \"%s\" for writing: %m", p);
+
+        fprintf(f, "#timeout 3\n"
+                   "#console-mode keep\n");
+        /* A "default <token>-*" line is only written if the entry token directory is created, too */
+        if (arg_make_entry_directory) {
+                assert(arg_entry_token);
+                fprintf(f, "default %s-*\n", arg_entry_token);
+        }
+
+        r = flink_tmpfile(f, t, p, LINK_TMPFILE_SYNC);
+        if (r == -EEXIST)
+                return 0; /* Silently skip creation if the file exists now (recheck) */
+        if (r < 0)
+                return log_error_errno(r, "Failed to move \"%s\" into place: %m", p);
+        /* NOTE(review): presumably this disarms the unlink_and_freep cleanup of 't' now that the file is in
+         * place - confirm */
+        t = mfree(t);
+        return 1;
+}
+
+static int install_loader_specification(const char *root) {
+        _cleanup_(unlink_and_freep) char *t = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *p = NULL;
+        int r;
+        /* Creates <root>/loader/entries.srel consisting of the single line "type1", unless the file exists
+         * already. Returns 1 if the file was created, 0 if it was there already, and a negative
+         * errno-style value on failure. */
+        p = path_join(root, "/loader/entries.srel");
+        if (!p)
+                return log_oom();
+
+        if (access(p, F_OK) >= 0) /* Silently skip creation if the file already exists (early check) */
+                return 0;
+
+        r = fopen_tmpfile_linkable(p, O_WRONLY|O_CLOEXEC, &t, &f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open \"%s\" for writing: %m", p);
+
+        fprintf(f, "type1\n");
+
+        r = flink_tmpfile(f, t, p, LINK_TMPFILE_SYNC);
+        if (r == -EEXIST)
+                return 0; /* Silently skip creation if the file exists now (recheck) */
+        if (r < 0)
+                return log_error_errno(r, "Failed to move \"%s\" into place: %m", p);
+        /* NOTE(review): presumably this disarms the unlink_and_freep cleanup of 't' now that the file is in
+         * place - confirm */
+        t = mfree(t);
+        return 1;
+}
+
+static int install_entry_directory(const char *root) {
+        /* Creates the directory named after the entry token below 'root', but only if
+         * arg_make_entry_directory (which must have been settled to a non-negative value by now) asks for
+         * it. Returns 0 if there is nothing to do, the result of mkdir_one() otherwise. */
+
+        assert(root);
+        assert(arg_make_entry_directory >= 0);
+
+        if (arg_make_entry_directory) {
+                assert(arg_entry_token);
+                return mkdir_one(root, arg_entry_token);
+        }
+
+        return 0;
+}
+
+static int install_entry_token(void) {
+        _cleanup_free_ char *token_file = NULL;
+        bool record;
+        int r;
+
+        assert(arg_make_entry_directory >= 0);
+        assert(arg_entry_token);
+
+        /* The entry token in use is recorded in the "entry-token" file below etc_kernel() (prefixed with
+         * arg_root) whenever the entry directory gets created, and also whenever the token is anything
+         * but the machine ID. Only a machine ID token without an entry directory is not recorded. */
+
+        record = arg_make_entry_directory || arg_entry_token_type != BOOT_ENTRY_TOKEN_MACHINE_ID;
+        if (!record)
+                return 0;
+
+        token_file = path_join(arg_root, etc_kernel(), "entry-token");
+        if (!token_file)
+                return log_oom();
+
+        r = write_string_file(
+                        token_file,
+                        arg_entry_token,
+                        WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_MKDIR_0755);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write entry token '%s' to %s: %m", arg_entry_token, token_file);
+
+        return 0;
+}
+
+static bool same_entry(uint16_t id, sd_id128_t uuid, const char *path) {
+        _cleanup_free_ char *entry_path = NULL;
+        sd_id128_t entry_uuid;
+
+        /* Tells whether the EFI boot option 'id' refers to 'path' on the partition 'uuid'. A boot option
+         * that cannot be read counts as "not the same". */
+
+        if (efi_get_boot_option(id, NULL, &entry_uuid, &entry_path, NULL) < 0)
+                return false;
+
+        /* The paths are compared without regard to case, since some motherboards convert the path to
+         * uppercase under certain circumstances (e.g. after booting into the Boot Menu in the ASUS ROG
+         * STRIX B350-F GAMING). */
+        return sd_id128_equal(uuid, entry_uuid) && strcaseeq_ptr(path, entry_path);
+}
+
+static int find_slot(sd_id128_t uuid, const char *path, uint16_t *id) {
+        _cleanup_free_ uint16_t *options = NULL;
+        int n_options;
+
+        /* Picks the boot option number to use for the loader at 'path' on the partition 'uuid' and stores
+         * it in '*id'. Returns 1 if an existing boot option refers to it already, 0 if an unused number
+         * was picked, -ENOSPC if the list holds 0xffff entries without a gap, and the error of
+         * efi_get_boot_options() if the list cannot be read. */
+
+        n_options = efi_get_boot_options(&options);
+        if (n_options < 0)
+                return n_options;
+
+        /* First choice: a boot option that points to us already */
+        for (int i = 0; i < n_options; i++) {
+                if (!same_entry(options[i], uuid, path))
+                        continue;
+
+                *id = options[i];
+                return 1;
+        }
+
+        /* Second choice: the lowest unused number. The BootXXXX list is sorted, hence the first index
+         * that differs from the entry stored at it marks a gap. */
+        for (int i = 0; i < n_options; i++) {
+                if (options[i] == i)
+                        continue;
+
+                *id = i;
+                return 0;
+        }
+
+        /* No gap: use the number following the last entry, unless we ran out of numbers */
+        if (n_options == 0xffff)
+                return -ENOSPC;
+
+        *id = n_options;
+        return 0;
+}
+
+static int insert_into_order(uint16_t slot, bool first) {
+        _cleanup_free_ uint16_t *order = NULL;
+        uint16_t *t;
+        int n;
+        /* Makes sure 'slot' is listed in the EFI boot order. With 'first' set it ends up at the front of
+         * the list; otherwise an existing position is kept, and a missing slot is appended at the end. If
+         * the current order is empty or cannot be read (n <= 0) it is replaced by a list consisting of
+         * 'slot' only. Returns 0 if nothing had to change, -ENOMEM if growing the list failed, and the
+         * result of efi_set_boot_order() otherwise. */
+        n = efi_get_boot_order(&order);
+        if (n <= 0)
+                /* no entry, add us */
+                return efi_set_boot_order(&slot, 1);
+
+        /* are we the first and only one? */
+        if (n == 1 && order[0] == slot)
+                return 0;
+
+        /* are we already in the boot order? */
+        for (int i = 0; i < n; i++) {
+                if (order[i] != slot)
+                        continue;
+
+                /* we do not require to be the first one, all is fine */
+                if (!first)
+                        return 0;
+
+                /* move us to the first slot */
+                memmove(order + 1, order, i * sizeof(uint16_t)); /* Shifts the i entries in front of us back by one */
+                order[0] = slot;
+                return efi_set_boot_order(order, n);
+        }
+
+        /* extend array */
+        t = reallocarray(order, n + 1, sizeof(uint16_t));
+        if (!t)
+                return -ENOMEM;
+        order = t;
+
+        /* add us to the top or end of the list */
+        if (first) {
+                memmove(order + 1, order, n * sizeof(uint16_t));
+                order[0] = slot;
+        } else
+                order[n] = slot;
+
+        return efi_set_boot_order(order, n + 1);
+}
+
+static int remove_from_order(uint16_t slot) {
+        _cleanup_free_ uint16_t *order = NULL;
+        int n;
+
+        /* Drops the first occurrence of 'slot' from the EFI boot order. Returns 0 if the boot order is
+         * empty or does not list the slot, the error of efi_get_boot_order() if the order cannot be read,
+         * and the result of efi_set_boot_order() if the list was rewritten. */
+
+        n = efi_get_boot_order(&order);
+        if (n <= 0)
+                return n;
+
+        for (int i = 0; i < n; i++) {
+                if (order[i] != slot)
+                        continue;
+
+                /* Close the gap. Exactly n - i - 1 entries follow index i; moving more than that would
+                 * read beyond the n entries of the list. */
+                if (i + 1 < n)
+                        memmove(order + i, order + i + 1, (size_t) (n - i - 1) * sizeof(uint16_t));
+                return efi_set_boot_order(order, n - 1);
+        }
+
+        return 0;
+}
+
+static const char *pick_efi_boot_option_description(void) {
+        /* The description to use for the EFI boot option: arg_efi_boot_option_description if it is set,
+         * and a fixed default otherwise. */
+
+        if (arg_efi_boot_option_description)
+                return arg_efi_boot_option_description;
+
+        return "Linux Boot Manager";
+}
+
+static int install_variables(
+                const char *esp_path,
+                uint32_t part,
+                uint64_t pstart,
+                uint64_t psize,
+                sd_id128_t uuid,
+                const char *path,
+                bool first,
+                bool graceful) {
+
+        uint16_t slot;
+        int r;
+        /* Registers the loader at 'path' (relative to the ESP at 'esp_path') as EFI boot option and adds
+         * it to the boot order. 'part', 'pstart', 'psize' and 'uuid' are handed to efi_add_boot_option()
+         * unchanged. With 'first' set the boot option is written even if a matching one exists, and it is
+         * put at the front of the boot order. With 'graceful' set, failing to find a slot or to create the
+         * boot option is logged at a lower level and reported as success. Returns 0 without doing
+         * anything if arg_root is set, the system wasn't booted with EFI, or 'path' does not exist. */
+        if (arg_root) {
+                log_info("Acting on %s, skipping EFI variable setup.",
+                         arg_image ? "image" : "root directory");
+                return 0;
+        }
+
+        if (!is_efi_boot()) {
+                log_warning("Not booted with EFI, skipping EFI variable setup.");
+                return 0;
+        }
+        /* Don't register a boot option for a binary that isn't there */
+        r = chase_and_access(path, esp_path, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, F_OK, NULL);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Cannot access \"%s/%s\": %m", esp_path, path);
+
+        r = find_slot(uuid, path, &slot);
+        if (r < 0) {
+                int level = graceful ? arg_quiet ? LOG_DEBUG : LOG_INFO : LOG_ERR;
+                const char *skip = graceful ? ", skipping" : "";
+
+                log_full_errno(level, r,
+                               r == -ENOENT ?
+                               "Failed to access EFI variables%s. Is the \"efivarfs\" filesystem mounted?" :
+                               "Failed to determine current boot order%s: %m", skip);
+
+                return graceful ? 0 : r;
+        }
+        /* r == 0 means find_slot() found no existing boot option for us and picked an unused slot instead */
+        if (first || r == 0) {
+                r = efi_add_boot_option(slot, pick_efi_boot_option_description(),
+                                        part, pstart, psize,
+                                        uuid, path);
+                if (r < 0) {
+                        int level = graceful ? arg_quiet ? LOG_DEBUG : LOG_INFO : LOG_ERR;
+                        const char *skip = graceful ? ", skipping" : "";
+
+                        log_full_errno(level, r, "Failed to create EFI Boot variable entry%s: %m", skip);
+
+                        return graceful ? 0 : r;
+                }
+
+                log_info("Created EFI boot entry \"%s\".", pick_efi_boot_option_description());
+        }
+
+        return insert_into_order(slot, first);
+}
+
+static int are_we_installed(const char *esp_path) {
+        _cleanup_free_ char *systemd_dir = NULL;
+        int r;
+
+        /* Reports whether systemd-boot is installed in the ESP: > 0 if so, 0 if not, negative on error.
+         *
+         * "Installed" is taken to mean that /EFI/systemd/ exists and is not empty. Alternatives would have
+         * been to look at the EFI variables, at what /EFI/BOOT/BOOT*.EFI is, or at the loader entries
+         * directory, but this minimal test has a number of things going for it:
+         *
+         *  → It is independent of the architecture: any systemd-boot binary counts, not a specific one.
+         *
+         *  → It doesn't require that we are the only boot loader around, i.e. that the fallback binary
+         *    /EFI/BOOT/BOOT*.EFI is ours.
+         *
+         *  → It is specific to systemd-boot, unlike a check for /boot/loader/entries, which other boot
+         *    loaders make use of, too. */
+
+        systemd_dir = path_join(esp_path, "/EFI/systemd/");
+        if (!systemd_dir)
+                return log_oom();
+
+        log_debug("Checking whether %s contains any files%s", systemd_dir, special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+        r = dir_is_empty(systemd_dir, /* ignore_hidden_or_backup= */ false);
+        if (r == -ENOENT) /* No directory, hence not installed */
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to check whether %s contains any files: %m", systemd_dir);
+
+        return r == 0;
+}
+
+int verb_install(int argc, char *argv[], void *userdata) {
+        sd_id128_t uuid = SD_ID128_NULL;
+        uint64_t pstart = 0, psize = 0;
+        uint32_t part = 0;
+        bool install, graceful;
+        int r;
+        /* Installs or updates the boot loader in the ESP: copies the binaries and writes entries.srel, and
+         * - for "install" only - also creates the directory layout, loader.conf, the entry token
+         * directory and file, and the random seed. Afterwards the EFI variables are set up, unless
+         * arg_touch_variables is off or arg_arch_all is set. Only argv[0] is looked at; argc and userdata
+         * are unused. Returns a negative errno-style value if a step fails, 0 if the run ends before the
+         * EFI variable step, and the result of install_variables() otherwise. */
+        /* Invoked for both "update" and "install" */
+
+        install = streq(argv[0], "install");
+        graceful = !install && arg_graceful; /* support graceful mode for updates */
+
+        r = acquire_esp(/* unprivileged_mode= */ false, graceful, &part, &pstart, &psize, &uuid, NULL);
+        if (graceful && r == -ENOKEY)
+                return 0; /* If --graceful is specified and we can't find an ESP, handle this cleanly */
+        if (r < 0)
+                return r;
+
+        if (!install) {
+                /* If we are updating, don't do anything if sd-boot wasn't actually installed. */
+                r = are_we_installed(arg_esp_path);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        log_debug("Skipping update because sd-boot is not installed in the ESP.");
+                        return 0;
+                }
+        }
+
+        r = acquire_xbootldr(/* unprivileged_mode= */ false, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        r = settle_make_entry_directory();
+        if (r < 0)
+                return r;
+        /* An empty architecture string makes install_binaries() match the binaries of all architectures */
+        const char *arch = arg_arch_all ? "" : get_efi_arch();
+
+        WITH_UMASK(0002) {
+                if (install) {
+                        /* Don't create any of these directories when we are just updating. When we update
+                         * we'll drop-in our files (unless there are newer ones already), but we won't create
+                         * the directories for them in the first place. */
+                        r = create_subdirs(arg_esp_path, esp_subdirs);
+                        if (r < 0)
+                                return r;
+
+                        r = create_subdirs(arg_dollar_boot_path(), dollar_boot_subdirs);
+                        if (r < 0)
+                                return r;
+                }
+                /* 'install' doubles as the 'force' flag: a fresh installation overwrites files without a
+                 * version check, an update doesn't */
+                r = install_binaries(arg_esp_path, arch, install);
+                if (r < 0)
+                        return r;
+
+                if (install) {
+                        r = install_loader_config(arg_esp_path);
+                        if (r < 0)
+                                return r;
+
+                        r = install_entry_directory(arg_dollar_boot_path());
+                        if (r < 0)
+                                return r;
+
+                        r = install_entry_token();
+                        if (r < 0)
+                                return r;
+
+                        r = install_random_seed(arg_esp_path);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = install_loader_specification(arg_dollar_boot_path());
+                if (r < 0)
+                        return r;
+        }
+
+        (void) sync_everything();
+
+        if (!arg_touch_variables)
+                return 0;
+
+        if (arg_arch_all) {
+                log_info("Not changing EFI variables with --all-architectures.");
+                return 0;
+        }
+        /* The path of the loader binary, relative to the ESP, that the boot option shall refer to */
+        char *path = strjoina("/EFI/systemd/systemd-boot", arch, ".efi");
+        return install_variables(arg_esp_path, part, pstart, psize, uuid, path, install, graceful);
+}
+
+static int remove_boot_efi(const char *esp_path) {
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_free_ char *p = NULL;
+        int r, c = 0;
+        /* Removes those files in /EFI/BOOT below 'esp_path' whose name starts with "boot" and ends in
+         * ".efi" (both case-insensitively) and whose version string starts with "systemd-boot ". Files
+         * without version information are left alone. Returns the number of candidate files that carried
+         * version information, 0 if the directory does not exist, and a negative value on failure. */
+        r = chase_and_opendir("/EFI/BOOT", esp_path, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &p, &d);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to open directory \"%s/EFI/BOOT\": %m", esp_path);
+        /* Unlike in install_binaries(), a failure to read the directory just ends the loop here */
+        FOREACH_DIRENT(de, d, break) {
+                _cleanup_close_ int fd = -EBADF;
+                _cleanup_free_ char *v = NULL;
+
+                if (!endswith_no_case(de->d_name, ".efi"))
+                        continue;
+
+                if (!startswith_no_case(de->d_name, "boot"))
+                        continue;
+
+                fd = openat(dirfd(d), de->d_name, O_RDONLY|O_CLOEXEC);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open \"%s/%s\" for reading: %m", p, de->d_name);
+
+                r = get_file_version(fd, &v);
+                if (r == -ESRCH)
+                        continue;  /* No version information */
+                if (r < 0)
+                        return r;
+                if (startswith(v, "systemd-boot ")) {
+                        r = unlinkat(dirfd(d), de->d_name, 0);
+                        if (r < 0)
+                                return log_error_errno(errno, "Failed to remove \"%s/%s\": %m", p, de->d_name);
+
+                        log_info("Removed \"%s/%s\".", p, de->d_name);
+                }
+                /* Counts every candidate that carried version information, not just the ones removed */
+                c++;
+        }
+
+        return c;
+}
+
+static int rmdir_one(const char *prefix, const char *suffix) {
+        const char *p;
+
+        /* Removes the directory 'suffix' below 'prefix'. A directory that is missing or not empty is
+         * only logged at debug level and not treated as an error. Returns 0 in those cases and on
+         * success, a negative errno-style value otherwise. */
+
+        p = prefix_roota(prefix, suffix);
+        if (rmdir(p) < 0) {
+                /* Pin the error right away, so that what we return cannot be affected by anything the
+                 * logging call might do to errno. */
+                int e = errno;
+                bool ignore = IN_SET(e, ENOENT, ENOTEMPTY);
+
+                log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, e,
+                               "Failed to remove directory \"%s\": %m", p);
+                if (!ignore)
+                        return -e;
+        } else
+                log_info("Removed \"%s\".", p);
+
+        return 0;
+}
+
+static int remove_subdirs(const char *root, const char *const *subdirs) {
+        size_t n = 0;
+        int ret = 0;
+
+        /* Removes the directories of the NULL-terminated list 'subdirs' below 'root', walking the list
+         * back to front so that the entries listed last are destroyed first. Every entry is attempted even
+         * if an earlier removal failed; the error returned is the one of the first removal that failed,
+         * i.e. of the failing entry closest to the end of the list. */
+
+        while (subdirs[n])
+                n++;
+
+        while (n > 0) {
+                int q = rmdir_one(root, subdirs[--n]);
+
+                if (ret >= 0)
+                        ret = q;
+        }
+
+        return ret;
+}
+
+static int remove_entry_directory(const char *root) {
+        /* Removes the directory named after the entry token below 'root', provided that creating the
+         * entry directory is enabled and an entry token is known. Returns 0 if there is nothing to do,
+         * the result of rmdir_one() otherwise. */
+
+        assert(root);
+        assert(arg_make_entry_directory >= 0);
+
+        if (arg_make_entry_directory && arg_entry_token)
+                return rmdir_one(root, arg_entry_token);
+
+        return 0;
+}
+
+static int remove_binaries(const char *esp_path) {
+        const char *systemd_dir = prefix_roota(esp_path, "/EFI/systemd");
+        int ret, k;
+
+        /* Deletes /EFI/systemd below 'esp_path' recursively, and then our binaries in /EFI/BOOT. Both
+         * steps are always attempted. Returns the result of rm_rf() unless that is zero and
+         * remove_boot_efi() failed, in which case the latter's error is returned. */
+
+        ret = rm_rf(systemd_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
+
+        k = remove_boot_efi(esp_path);
+        if (ret != 0 || k >= 0)
+                return ret;
+
+        return k;
+}
+
+static int remove_file(const char *root, const char *file) {
+        const char *p;
+
+        /* Unlinks 'file' below 'root'. Returns 1 if the file was removed, 0 if it did not exist (which is
+         * only logged at debug level), and a negative errno-style value on any other failure. */
+
+        assert(root);
+        assert(file);
+
+        p = prefix_roota(root, file);
+        if (unlink(p) < 0) {
+                /* Pin the error right away, so that what we return cannot be affected by anything the
+                 * logging call might do to errno. */
+                int e = errno;
+
+                log_full_errno(e == ENOENT ? LOG_DEBUG : LOG_ERR, e,
+                               "Failed to unlink file \"%s\": %m", p);
+
+                return e == ENOENT ? 0 : -e;
+        }
+
+        log_info("Removed \"%s\".", p);
+        return 1;
+}
+
+static int remove_variables(sd_id128_t uuid, const char *path, bool in_order) {
+        uint16_t slot;
+        int r;
+
+        /* Removes the EFI boot option that refers to 'path' on the partition 'uuid', and with 'in_order'
+         * set also takes it out of the boot order. Does nothing and returns 0 if arg_root is set, the
+         * system wasn't booted with EFI, or find_slot() doesn't report an existing boot option. */
+
+        if (arg_root || !is_efi_boot())
+                return 0;
+
+        /* Only a return value of 1 means that 'slot' is an existing boot option of ours */
+        if (find_slot(uuid, path, &slot) != 1)
+                return 0;
+
+        r = efi_remove_boot_option(slot);
+        if (r < 0)
+                return r;
+
+        return in_order ? remove_from_order(slot) : 0;
+}
+
+static int remove_loader_variables(void) {
+        int ret = 0;
+
+        /* Removes all persistent loader variables we define, by setting each of them to an empty value.
+         * Variables that do not exist are skipped silently. A failure is logged as warning and does not
+         * stop the loop; the first such error is what gets returned, 0 if there was none. */
+
+        FOREACH_STRING(var,
+                       EFI_LOADER_VARIABLE(LoaderConfigConsoleMode),
+                       EFI_LOADER_VARIABLE(LoaderConfigTimeout),
+                       EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot),
+                       EFI_LOADER_VARIABLE(LoaderEntryDefault),
+                       EFI_LOADER_VARIABLE(LoaderEntryLastBooted),
+                       EFI_LOADER_VARIABLE(LoaderEntryOneShot),
+                       EFI_LOADER_VARIABLE(LoaderSystemToken)) {
+
+                int q = efi_set_variable(var, NULL, 0);
+
+                if (q == -ENOENT)
+                        continue;
+
+                if (q >= 0) {
+                        log_info("Removed EFI variable %s.", var);
+                        continue;
+                }
+
+                log_warning_errno(q, "Failed to remove EFI variable %s: %m", var);
+                if (ret >= 0)
+                        ret = q;
+        }
+
+        return ret;
+}
+
+int verb_remove(int argc, char *argv[], void *userdata) {
+        sd_id128_t uuid = SD_ID128_NULL;
+        int r;
+
+        /* Implements the "remove" verb: deletes our binaries, configuration files and directories from
+         * the ESP (and a subset of them from the XBOOTLDR partition, if there is one), and then our EFI
+         * variables. Locating the partitions and settling the entry directory setting must succeed; after
+         * that every step is attempted regardless of earlier failures, and the first error seen is
+         * returned in the end. */
+
+        r = acquire_esp(/* unprivileged_mode= */ false, /* graceful= */ false, NULL, NULL, NULL, &uuid, NULL);
+        if (r < 0)
+                return r;
+
+        r = acquire_xbootldr(/* unprivileged_mode= */ false, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        r = settle_make_entry_directory();
+        if (r < 0)
+                return r;
+
+        r = remove_binaries(arg_esp_path);
+        RET_GATHER(r, remove_file(arg_esp_path, "/loader/loader.conf"));
+        RET_GATHER(r, remove_file(arg_esp_path, "/loader/random-seed"));
+        RET_GATHER(r, remove_file(arg_esp_path, "/loader/entries.srel"));
+        RET_GATHER(r, remove_subdirs(arg_esp_path, esp_subdirs));
+        RET_GATHER(r, remove_subdirs(arg_esp_path, dollar_boot_subdirs));
+        RET_GATHER(r, remove_entry_directory(arg_esp_path));
+
+        if (arg_xbootldr_path) {
+                /* Remove a subset of these also from the XBOOTLDR partition if it exists */
+                RET_GATHER(r, remove_file(arg_xbootldr_path, "/loader/entries.srel"));
+                RET_GATHER(r, remove_subdirs(arg_xbootldr_path, dollar_boot_subdirs));
+                RET_GATHER(r, remove_entry_directory(arg_xbootldr_path));
+        }
+
+        (void) sync_everything();
+
+        if (!arg_touch_variables)
+                return r;
+
+        if (arg_arch_all) {
+                log_info("Not changing EFI variables with --all-architectures.");
+                return r;
+        }
+
+        char *path = strjoina("/EFI/systemd/systemd-boot", get_efi_arch(), ".efi");
+        RET_GATHER(r, remove_variables(uuid, path, true));
+        RET_GATHER(r, remove_loader_variables());
+
+        return r;
+}
+
+int verb_is_installed(int argc, char *argv[], void *userdata) {
+        bool installed;
+        int r;
+
+        /* Implements the "is-installed" verb: prints "yes" or "no" (unless arg_quiet is set) and returns
+         * EXIT_SUCCESS or EXIT_FAILURE accordingly. If the ESP cannot be located or checked, the negative
+         * error is returned instead. */
+
+        r = acquire_esp(/* unprivileged_mode= */ false,
+                        /* graceful= */ arg_graceful,
+                        NULL, NULL, NULL, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        r = are_we_installed(arg_esp_path);
+        if (r < 0)
+                return r;
+
+        installed = r > 0;
+
+        if (!arg_quiet)
+                puts(installed ? "yes" : "no");
+
+        return installed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/src/boot/bootctl-install.h b/src/boot/bootctl-install.h
new file mode 100644
index 0000000..cd4b725
--- /dev/null
+++ b/src/boot/bootctl-install.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_install(int argc, char *argv[], void *userdata);
+int verb_remove(int argc, char *argv[], void *userdata);
+int verb_is_installed(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-random-seed.c b/src/boot/bootctl-random-seed.c
new file mode 100644
index 0000000..cfe10c4
--- /dev/null
+++ b/src/boot/bootctl-random-seed.c
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bootctl.h"
+#include "bootctl-random-seed.h"
+#include "bootctl-util.h"
+#include "efi-api.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "find-esp.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "io-util.h"
+#include "mkdir.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "sha256.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+
+static int random_seed_verify_permissions(int fd, mode_t expected_type) {
+        _cleanup_free_ char *where = NULL;
+        const char *sign;
+        struct stat info;
+        int r;
+
+        /* Returns 1 if a warning about world-accessible mode bits was logged for the inode behind 'fd', 0
+         * if no such bits are set, and a negative errno-style value if the inode could not be checked. */
+        assert(fd >= 0);
+
+        r = fd_get_path(fd, &where);
+        if (r < 0)
+                return log_error_errno(r, "Unable to determine full path of random seed fd: %m");
+
+        if (fstat(fd, &info) < 0)
+                return log_error_errno(errno, "Unable to stat %s: %m", where);
+
+        if ((info.st_mode & S_IFMT) != (expected_type & S_IFMT)) /* not the inode type the caller asked for */
+                return log_error_errno(SYNTHETIC_ERRNO(EBADF),
+                                       "Unexpected inode type when validating random seed access mode on %s: %m", where);
+        if ((info.st_mode & 0007) == 0) /* no world bits set, nothing to complain about */
+                return 0;
+
+        sign = special_glyph(SPECIAL_GLYPH_WARNING_SIGN);
+        if (!S_ISREG(expected_type)) {
+                assert(S_ISDIR(expected_type));
+                log_warning("%s Mount point '%s' which backs the random seed file is world accessible, which is a security hole! %s", sign, where, sign);
+        } else
+                log_warning("%s Random seed file '%s' is world accessible, which is a security hole! %s", sign, where, sign);
+
+        return 1;
+}
+
+static int set_system_token(void) {
+        uint8_t buffer[RANDOM_EFI_SEED_SIZE];
+        size_t token_size;
+        int r;
+        /* Writes fresh random bytes to the LoaderSystemToken EFI variable unless a big enough token is already stored. */
+        if (!arg_touch_variables)
+                return 0;
+
+        if (arg_root) {
+                log_warning("Acting on %s, skipping EFI variable setup.",
+                             arg_image ? "image" : "root directory");
+                return 0;
+        }
+
+        if (!is_efi_boot()) {
+                log_notice("Not booted with EFI, skipping EFI variable setup.");
+                return 0;
+        }
+
+        r = getenv_bool("SYSTEMD_WRITE_SYSTEM_TOKEN");
+        if (r < 0) {
+                if (r != -ENXIO) /* NOTE(review): presumably -ENXIO means the variable is unset, confirm against getenv_bool() */
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_WRITE_SYSTEM_TOKEN, ignoring.");
+        } else if (r == 0) {
+                log_notice("Not writing system token, because $SYSTEMD_WRITE_SYSTEM_TOKEN is set to false.");
+                return 0;
+        }
+        /* Only the size of any existing token is queried here (the value pointer passed below is NULL). */
+        r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderSystemToken), NULL, NULL, &token_size);
+        if (r == -ENODATA)
+                log_debug_errno(r, "LoaderSystemToken EFI variable is invalid (too short?), replacing.");
+        else if (r < 0) {
+                if (r != -ENOENT) /* -ENOENT falls through: a new token is written below */
+                        return log_error_errno(r, "Failed to test system token validity: %m");
+        } else {
+                if (token_size >= sizeof(buffer)) {
+                        /* Let's avoid writes if we can, and initialize this only once. */
+                        log_debug("System token already written, not updating.");
+                        return 0;
+                }
+
+                log_debug("Existing system token size (%zu) does not match our expectations (%zu), replacing.", token_size, sizeof(buffer));
+        }
+
+        r = crypto_random_bytes(buffer, sizeof(buffer));
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire random seed: %m");
+
+        /* Let's write this variable with an umask in effect, so that unprivileged users can't see the token
+         * and possibly get identification information or too much insight into the kernel's entropy pool
+         * state. */
+        WITH_UMASK(0077) {
+                r = efi_set_variable(EFI_LOADER_VARIABLE(LoaderSystemToken), buffer, sizeof(buffer));
+                if (r < 0) {
+                        if (!arg_graceful)
+                                return log_error_errno(r, "Failed to write 'LoaderSystemToken' EFI variable: %m");
+                        /* With arg_graceful set a failed write is only logged at notice level, and 0 is returned below. */
+                        if (r == -EINVAL)
+                                log_notice_errno(r, "Unable to write 'LoaderSystemToken' EFI variable (firmware problem?), ignoring: %m");
+                        else
+                                log_notice_errno(r, "Unable to write 'LoaderSystemToken' EFI variable, ignoring: %m");
+                } else
+                        log_info("Successfully initialized system token in EFI variable with %zu bytes.", sizeof(buffer));
+        }
+
+        return 0;
+}
+
+int install_random_seed(const char *esp) {
+        _cleanup_close_ int esp_fd = -EBADF, loader_dir_fd = -EBADF, fd = -EBADF;
+        _cleanup_free_ char *tmp = NULL;
+        uint8_t buffer[RANDOM_EFI_SEED_SIZE];
+        struct sha256_ctx hash_state;
+        bool refreshed, warned = false;
+        int r;
+        /* Atomically (re)writes <esp>/loader/random-seed, then calls set_system_token(). Returns 0 or a negative errno-style value. */
+        assert(esp);
+
+        assert_cc(RANDOM_EFI_SEED_SIZE == SHA256_DIGEST_SIZE);
+
+        esp_fd = open(esp, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+        if (esp_fd < 0)
+                return log_error_errno(errno, "Failed to open ESP directory '%s': %m", esp);
+
+        (void) random_seed_verify_permissions(esp_fd, S_IFDIR);
+
+        loader_dir_fd = open_mkdir_at(esp_fd, "loader", O_DIRECTORY|O_RDONLY|O_CLOEXEC|O_NOFOLLOW, 0775);
+        if (loader_dir_fd < 0)
+                return log_error_errno(loader_dir_fd, "Failed to open loader directory '%s/loader': %m", esp);
+
+        r = crypto_random_bytes(buffer, sizeof(buffer));
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire random seed: %m");
+        /* The new seed is the SHA256 over the fresh random bytes followed by the old seed file's contents, if any. */
+        sha256_init_ctx(&hash_state);
+        sha256_process_bytes_and_size(buffer, sizeof(buffer), &hash_state);
+
+        fd = openat(loader_dir_fd, "random-seed", O_NOFOLLOW|O_CLOEXEC|O_RDONLY|O_NOCTTY);
+        if (fd < 0) {
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to open old random seed file: %m");
+
+                sha256_process_bytes(&(const ssize_t) { 0 }, sizeof(ssize_t), &hash_state);
+                refreshed = false;
+        } else {
+                ssize_t n;
+
+                warned = random_seed_verify_permissions(fd, S_IFREG) > 0;
+
+                /* Hash the old seed in so that we never regress in entropy. */
+
+                n = read(fd, buffer, sizeof(buffer));
+                if (n < 0)
+                        return log_error_errno(errno, "Failed to read old random seed file: %m");
+
+                sha256_process_bytes_and_size(buffer, n, &hash_state);
+
+                fd = safe_close(fd);
+                refreshed = n > 0;
+        }
+
+        sha256_finish_ctx(&hash_state, buffer);
+
+        if (tempfn_random("random-seed", "bootctl", &tmp) < 0)
+                return log_oom();
+        /* openat() signals failure with -1 and errno, not with a negative errno-style return value. */
+        fd = openat(loader_dir_fd, tmp, O_CREAT|O_EXCL|O_NOFOLLOW|O_NOCTTY|O_WRONLY|O_CLOEXEC, 0600);
+        if (fd < 0)
+                return log_error_errno(errno, "Failed to open random seed file for writing: %m");
+
+        if (!warned) /* only warn once per seed file */
+                (void) random_seed_verify_permissions(fd, S_IFREG);
+
+        r = loop_write(fd, buffer, sizeof(buffer));
+        if (r < 0) {
+                log_error_errno(r, "Failed to write random seed file: %m");
+                goto fail;
+        }
+
+        if (fsync(fd) < 0 || fsync(loader_dir_fd) < 0) {
+                r = log_error_errno(errno, "Failed to sync random seed file: %m");
+                goto fail;
+        }
+
+        if (renameat(loader_dir_fd, tmp, loader_dir_fd, "random-seed") < 0) {
+                r = log_error_errno(errno, "Failed to move random seed file into place: %m");
+                goto fail;
+        }
+        /* Renamed into place: the temporary name no longer exists, so drop it. */
+        tmp = mfree(tmp);
+
+        if (syncfs(fd) < 0)
+                return log_error_errno(errno, "Failed to sync ESP file system: %m");
+
+        log_info("Random seed file %s/loader/random-seed successfully %s (%zu bytes).", esp, refreshed ? "refreshed" : "written", sizeof(buffer));
+
+        return set_system_token();
+
+fail:
+        assert(tmp);
+        (void) unlinkat(loader_dir_fd, tmp, 0);
+
+        return r;
+}
+
+int verb_random_seed(int argc, char *argv[], void *userdata) {
+        int r;
+
+        /* Locates the ESP via find_esp_and_warn() and hands it to install_random_seed(). */
+
+        r = find_esp_and_warn(arg_root, arg_esp_path, false, &arg_esp_path, NULL, NULL, NULL, NULL, NULL);
+        if (r < 0 && r != -ENOKEY)
+                return r;
+        if (r == -ENOKEY) {
+                /* find_esp_and_warn() stays silent on ENOKEY, hence report it here */
+                if (arg_graceful) {
+                        log_notice("No ESP found, not initializing random seed.");
+                        return 0;
+                }
+
+                return log_error_errno(r, "Unable to find ESP.");
+        }
+
+        r = install_random_seed(arg_esp_path);
+        return r < 0 ? r : 0;
+}
diff --git a/src/boot/bootctl-random-seed.h b/src/boot/bootctl-random-seed.h
new file mode 100644
index 0000000..91596d3
--- /dev/null
+++ b/src/boot/bootctl-random-seed.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int install_random_seed(const char *esp);
+
+int verb_random_seed(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-reboot-to-firmware.c b/src/boot/bootctl-reboot-to-firmware.c
new file mode 100644
index 0000000..91f2597
--- /dev/null
+++ b/src/boot/bootctl-reboot-to-firmware.c
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bootctl-reboot-to-firmware.h"
+#include "efi-api.h"
+#include "parse-util.h"
+
+int verb_reboot_to_firmware(int argc, char *argv[], void *userdata) {
+        int r;
+
+        /* With an argument: parse it as boolean and set the reboot-to-firmware flag. Without one: print
+         * the current state and encode it in the return value (0 active, 1 supported, 2 unsupported, 3 error). */
+
+        if (argc >= 2) {
+                int b = parse_boolean(argv[1]);
+
+                if (b < 0)
+                        return log_error_errno(b, "Failed to parse argument: %s", argv[1]);
+
+                r = efi_set_reboot_to_firmware(b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set reboot-to-firmware option: %m");
+
+                return 0;
+        }
+
+        r = efi_get_reboot_to_firmware();
+        if (r == -EOPNOTSUPP) {
+                puts("not supported");
+                return 2; /* recognizable error #2 */
+        }
+        if (r < 0) {
+                log_error_errno(r, "Failed to query reboot-to-firmware state: %m");
+                return 3; /* other kind of error */
+        }
+
+        puts(r > 0 ? "active" : "supported");
+        return r > 0 ? 0 : 1; /* success, or recognizable error #1 */
+}
diff --git a/src/boot/bootctl-reboot-to-firmware.h b/src/boot/bootctl-reboot-to-firmware.h
new file mode 100644
index 0000000..0ca4b2c
--- /dev/null
+++ b/src/boot/bootctl-reboot-to-firmware.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+int verb_reboot_to_firmware(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-set-efivar.c b/src/boot/bootctl-set-efivar.c
new file mode 100644
index 0000000..cb2ed0d
--- /dev/null
+++ b/src/boot/bootctl-set-efivar.c
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "bootctl.h"
+#include "bootctl-set-efivar.h"
+#include "efivars.h"
+#include "efi-loader.h"
+#include "stdio-util.h"
+#include "utf8.h"
+#include "virt.h"
+
+static int parse_timeout(const char *arg1, char16_t **ret_timeout, size_t *ret_timeout_size) {
+        char utf8[DECIMAL_STR_MAX(usec_t)];
+        char16_t *encoded;
+        usec_t timeout;
+        bool menu_disabled = false;
+        int r;
+        /* Turns 'arg1' (a time span, "menu-force", "menu-hidden" or "menu-disabled") into a newly allocated UTF-16 string. */
+        assert(arg1);
+        assert(ret_timeout);
+        assert(ret_timeout_size);
+
+        assert_cc(STRLEN("menu-disabled") < ELEMENTSOF(utf8));
+
+        /* Note: Since there is no way to query if the bootloader supports the string tokens, we explicitly
+         * set their numerical value(s) instead. This means that some of the sd-boot internal ABI has leaked
+         * although the ship has sailed and the side-effects are self-contained.
+         */
+        if (streq(arg1, "menu-force"))
+                timeout = USEC_INFINITY;
+        else if (streq(arg1, "menu-hidden"))
+                timeout = 0;
+        else if (streq(arg1, "menu-disabled")) {
+                uint64_t loader_features = 0;
+
+                (void) efi_loader_get_features(&loader_features); /* on failure the features stay 0, i.e. "not supported" */
+                if (!(loader_features & EFI_LOADER_FEATURE_MENU_DISABLE)) {
+                        if (!arg_graceful)
+                                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Loader does not support 'menu-disabled': %m");
+
+                        log_warning("Loader does not support 'menu-disabled', setting anyway.");
+                }
+                menu_disabled = true; /* 'timeout' stays unset on this path and is not read below */
+        } else {
+                r = parse_time(arg1, &timeout, USEC_PER_SEC);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse timeout '%s': %m", arg1);
+                if (timeout != USEC_INFINITY && timeout > UINT32_MAX * USEC_PER_SEC)
+                        log_warning("Timeout is too long and will be treated as 'menu-force' instead.");
+        }
+        /* Everything except "menu-disabled" is stored as a decimal number of seconds, capped at UINT32_MAX. */
+        if (menu_disabled)
+                xsprintf(utf8, "menu-disabled");
+        else
+                xsprintf(utf8, USEC_FMT, MIN(timeout / USEC_PER_SEC, UINT32_MAX));
+
+        encoded = utf8_to_utf16(utf8, SIZE_MAX);
+        if (!encoded)
+                return log_oom();
+
+        *ret_timeout = encoded;
+        *ret_timeout_size = char16_strlen(encoded) * 2 + 2; /* NOTE(review): presumably bytes incl. a 2-byte NUL terminator, confirm */
+        return 0;
+}
+
+static int parse_loader_entry_target_arg(const char *arg1, char16_t **ret_target, size_t *ret_target_size) {
+        char16_t *utf16;
+        int r;
+
+        /* "@current", "@oneshot" and "@default" are resolved from EFI variables, "@saved" and plain names are converted to UTF-16. */
+        assert(arg1);
+        assert(ret_target);
+        assert(ret_target_size);
+
+        if (streq(arg1, "@current")) {
+                r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderEntrySelected), NULL, (void *) ret_target, ret_target_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get EFI variable 'LoaderEntrySelected': %m");
+                return 0;
+        }
+        if (streq(arg1, "@oneshot")) {
+                r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderEntryOneShot), NULL, (void *) ret_target, ret_target_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get EFI variable 'LoaderEntryOneShot': %m");
+                return 0;
+        }
+        if (streq(arg1, "@default")) {
+                r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderEntryDefault), NULL, (void *) ret_target, ret_target_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get EFI variable 'LoaderEntryDefault': %m");
+                return 0;
+        }
+        if (arg1[0] == '@' && !streq(arg1, "@saved"))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unsupported special entry identifier: %s", arg1);
+        utf16 = utf8_to_utf16(arg1, SIZE_MAX);
+        if (!utf16)
+                return log_oom();
+        *ret_target = utf16;
+        *ret_target_size = char16_strlen(utf16) * 2 + 2;
+        return 0;
+}
+
+int verb_set_efivar(int argc, char *argv[], void *userdata) {
+        int r;
+        /* Shared handler for set-default, set-oneshot, set-timeout and set-timeout-oneshot; argv[0] selects the EFI variable. */
+        if (arg_root)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Acting on %s, skipping EFI variable setup.",
+                                       arg_image ? "image" : "root directory");
+
+        if (!is_efi_boot())
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Not booted with UEFI.");
+
+        if (access(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderInfo)), F_OK) < 0) {
+                if (errno == ENOENT) {
+                        log_error_errno(errno, "Not booted with a supported boot loader.");
+                        return -EOPNOTSUPP; /* logged with ENOENT above, but reported to the caller as EOPNOTSUPP */
+                }
+
+                return log_error_errno(errno, "Failed to detect whether boot loader supports '%s' operation: %m", argv[0]);
+        }
+
+        if (detect_container() > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "'%s' operation not supported in a container.",
+                                       argv[0]);
+
+        if (!arg_touch_variables)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "'%s' operation cannot be combined with --no-variables.",
+                                       argv[0]);
+
+        const char *variable;
+        int (* arg_parser)(const char *, char16_t **, size_t *);
+        /* Pick the target variable and the parser that turns argv[1] into the UTF-16 value to store. */
+        if (streq(argv[0], "set-default")) {
+                variable = EFI_LOADER_VARIABLE(LoaderEntryDefault);
+                arg_parser = parse_loader_entry_target_arg;
+        } else if (streq(argv[0], "set-oneshot")) {
+                variable = EFI_LOADER_VARIABLE(LoaderEntryOneShot);
+                arg_parser = parse_loader_entry_target_arg;
+        } else if (streq(argv[0], "set-timeout")) {
+                variable = EFI_LOADER_VARIABLE(LoaderConfigTimeout);
+                arg_parser = parse_timeout;
+        } else if (streq(argv[0], "set-timeout-oneshot")) {
+                variable = EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot);
+                arg_parser = parse_timeout;
+        } else
+                assert_not_reached();
+        /* An empty argument removes the variable; -ENOENT from that removal is not treated as an error. */
+        if (isempty(argv[1])) {
+                r = efi_set_variable(variable, NULL, 0);
+                if (r < 0 && r != -ENOENT)
+                        return log_error_errno(r, "Failed to remove EFI variable '%s': %m", variable);
+        } else {
+                _cleanup_free_ char16_t *value = NULL;
+                size_t value_size = 0;
+
+                r = arg_parser(argv[1], &value, &value_size);
+                if (r < 0)
+                        return r; /* returned as is, no additional message is logged here */
+                r = efi_set_variable(variable, value, value_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to update EFI variable '%s': %m", variable);
+        }
+
+        return 0;
+}
diff --git a/src/boot/bootctl-set-efivar.h b/src/boot/bootctl-set-efivar.h
new file mode 100644
index 0000000..6441681
--- /dev/null
+++ b/src/boot/bootctl-set-efivar.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_set_efivar(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-status.c b/src/boot/bootctl-status.c
new file mode 100644
index 0000000..d171512
--- /dev/null
+++ b/src/boot/bootctl-status.c
@@ -0,0 +1,829 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "bootctl.h"
+#include "bootctl-status.h"
+#include "bootctl-util.h"
+#include "bootspec.h"
+#include "chase.h"
+#include "devnum-util.h"
+#include "dirent-util.h"
+#include "efi-api.h"
+#include "efi-loader.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "find-esp.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "recurse-dir.h"
+#include "terminal-util.h"
+#include "tpm2-util.h"
+
+static int boot_config_load_and_select(
+                BootConfig *config,
+                const char *esp_path,
+                dev_t esp_devid,
+                const char *xbootldr_path,
+                dev_t xbootldr_devid) {
+
+        const char *effective_xbootldr;
+        int r;
+
+        /* Loads the entries, merges in those reported by the boot loader and selects the special entries.
+         * XBOOTLDR is dropped when it sits on the same block device as the ESP, to avoid listing entries twice. */
+        effective_xbootldr = (esp_path && xbootldr_path && devnum_set_and_equal(esp_devid, xbootldr_devid)) ? NULL : xbootldr_path;
+
+        r = boot_config_load(config, esp_path, effective_xbootldr);
+        if (r < 0)
+                return r;
+
+        if (!arg_root) {
+                _cleanup_strv_free_ char **loader_entries = NULL;
+                r = efi_loader_get_entries(&loader_entries);
+                if (r >= 0)
+                        (void) boot_config_augment_from_loader(config, loader_entries, /* only_auto= */ false);
+                else if (r == -ENOENT || ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        log_debug_errno(r, "Boot loader reported no entries.");
+                else
+                        log_warning_errno(r, "Failed to determine entries reported by boot loader, ignoring: %m");
+        }
+
+        return boot_config_select_special_entries(config, /* skip_efivars= */ !!arg_root);
+}
+
+static int status_entries(
+                const BootConfig *config,
+                const char *esp_path,
+                sd_id128_t esp_partition_uuid,
+                const char *xbootldr_path,
+                sd_id128_t xbootldr_partition_uuid) {
+
+        sd_id128_t dollar_boot_partition_uuid;
+        const char *dollar_boot_path;
+        int r;
+        /* Prints the "Boot Loader Entries" section: $BOOT location, entry token and the default entry. Always returns 0. */
+        assert(config);
+        assert(esp_path || xbootldr_path);
+        /* $BOOT is the XBOOTLDR partition if one was passed in, otherwise the ESP. */
+        if (xbootldr_path) {
+                dollar_boot_path = xbootldr_path;
+                dollar_boot_partition_uuid = xbootldr_partition_uuid;
+        } else {
+                dollar_boot_path = esp_path;
+                dollar_boot_partition_uuid = esp_partition_uuid;
+        }
+
+        printf("%sBoot Loader Entries:%s\n"
+               "        $BOOT: %s", ansi_underline(), ansi_normal(), dollar_boot_path);
+        if (!sd_id128_is_null(dollar_boot_partition_uuid))
+                printf(" (/dev/disk/by-partuuid/" SD_ID128_UUID_FORMAT_STR ")",
+                       SD_ID128_FORMAT_VAL(dollar_boot_partition_uuid));
+        if (settle_entry_token() >= 0) /* the token line is simply omitted if this fails */
+                printf("\n        token: %s", arg_entry_token);
+        printf("\n\n");
+
+        if (config->default_entry < 0)
+                printf("%zu entries, no entry could be determined as default.\n", config->n_entries);
+        else {
+                printf("%sDefault Boot Loader Entry:%s\n", ansi_underline(), ansi_normal());
+
+                r = show_boot_entry(
+                                boot_config_default_entry(config),
+                                /* show_as_default= */ false,
+                                /* show_as_selected= */ false,
+                                /* show_discovered= */ false);
+                if (r > 0)
+                        /* < 0 is already logged by the function itself, let's just emit an extra warning if
+                           the default entry is broken */
+                        printf("\nWARNING: default boot entry is broken\n");
+        }
+
+        return 0;
+}
+
+static int print_efi_option(uint16_t id, int *n_printed, bool in_order) {
+        _cleanup_free_ char *title = NULL;
+        _cleanup_free_ char *path = NULL;
+        sd_id128_t partition;
+        bool active;
+        int r;
+        /* Prints EFI boot option 'id'. Returns 1 if printed, 0 if skipped, negative errno-style value if reading it failed. */
+        assert(n_printed);
+
+        r = efi_get_boot_option(id, &title, &partition, &path, &active);
+        if (r == -ENOENT) {
+                log_debug_errno(r, "Boot option 0x%04X referenced but missing, ignoring: %m", id);
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to read boot option 0x%04X: %m", id);
+
+        /* print only configured entries with partition information */
+        if (!path || sd_id128_is_null(partition)) {
+                log_debug("Ignoring boot entry 0x%04X without partition information.", id);
+                return 0;
+        }
+
+        efi_tilt_backslashes(path);
+
+        if (*n_printed == 0) /* Print section title before first entry */
+                printf("%sBoot Loaders Listed in EFI Variables:%s\n", ansi_underline(), ansi_normal());
+
+        printf("        Title: %s%s%s\n", ansi_highlight(), strna(title), ansi_normal());
+        printf("           ID: 0x%04X\n", id);
+        printf("       Status: %sactive%s\n", active ? "" : "in", in_order ? ", boot-order" : ""); /* "active" or "inactive" */
+        printf("    Partition: /dev/disk/by-partuuid/" SD_ID128_UUID_FORMAT_STR "\n",
+               SD_ID128_FORMAT_VAL(partition));
+        printf("         File: %s%s\n", special_glyph(SPECIAL_GLYPH_TREE_RIGHT), path);
+        printf("\n");
+
+        (*n_printed)++; /* counts printed entries only, skipped ones are not included */
+        return 1;
+}
+
+static int status_variables(void) {
+        _cleanup_free_ uint16_t *options = NULL, *order = NULL;
+        int n_options, n_order, n_printed = 0;
+
+        /* Lists the boot options found in EFI variables: first those referenced by the boot order (in that
+         * order), then all remaining ones. Failures to print an individual option are ignored. */
+
+        n_options = efi_get_boot_options(&options);
+        if (n_options == -ENOENT)
+                return log_error_errno(n_options,
+                                       "Failed to access EFI variables, efivarfs"
+                                       " needs to be available at /sys/firmware/efi/efivars/.");
+        if (n_options < 0)
+                return log_error_errno(n_options, "Failed to read EFI boot entries: %m");
+
+        n_order = efi_get_boot_order(&order);
+        if (n_order < 0 && n_order != -ENOENT)
+                return log_error_errno(n_order, "Failed to read EFI boot order: %m");
+        if (n_order < 0) /* -ENOENT: handled like an empty boot order */
+                n_order = 0;
+
+        for (int i = 0; i < n_order; i++) /* options referenced by the boot order come first */
+                (void) print_efi_option(order[i], &n_printed, /* in_order= */ true);
+
+        for (int i = 0; i < n_options; i++) { /* then whatever was not covered above */
+                bool listed = false;
+
+                for (int j = 0; j < n_order && !listed; j++)
+                        listed = options[i] == order[j];
+
+                if (!listed)
+                        (void) print_efi_option(options[i], &n_printed, /* in_order= */ false);
+        }
+
+        if (n_printed == 0)
+                printf("No boot loaders listed in EFI Variables.\n\n");
+
+        return 0;
+}
+
+static int enumerate_binaries(
+                const char *esp_path,
+                const char *path,
+                const char *prefix,
+                char **previous,
+                bool *is_first) {
+
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_free_ char *p = NULL;
+        int c = 0, r;
+        /* Lists the *.efi files in 'path' below 'esp_path' (only those starting with 'prefix', if set); returns their count. */
+        assert(esp_path);
+        assert(path);
+        assert(previous);
+        assert(is_first);
+
+        r = chase_and_opendir(path, esp_path, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &p, &d);
+        if (r == -ENOENT) /* a missing directory counts as "no binaries found" */
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to read \"%s/%s\": %m", esp_path, path);
+
+        FOREACH_DIRENT(de, d, break) {
+                _cleanup_free_ char *v = NULL, *filename = NULL;
+                _cleanup_close_ int fd = -EBADF;
+
+                if (!endswith_no_case(de->d_name, ".efi"))
+                        continue;
+
+                if (prefix && !startswith_no_case(de->d_name, prefix))
+                        continue;
+
+                filename = path_join(p, de->d_name);
+                if (!filename)
+                        return log_oom();
+                LOG_SET_PREFIX(filename);
+
+                fd = openat(dirfd(d), de->d_name, O_RDONLY|O_CLOEXEC);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open file for reading: %m");
+
+                r = get_file_version(fd, &v);
+                /* -ESRCH is tolerated here and handled further down, any other failure aborts the enumeration. */
+                if (r < 0 && r != -ESRCH)
+                        return r;
+
+                if (*previous) { /* Let's output the previous entry now, since now we know that there will be
+                                  * one more, and can draw the tree glyph properly. */
+                        printf("         %s %s%s\n",
+                               *is_first ? "File:" : "     ",
+                               special_glyph(SPECIAL_GLYPH_TREE_BRANCH), *previous);
+                        *is_first = false;
+                        *previous = mfree(*previous);
+                }
+
+                /* Do not output this entry immediately, but store what should be printed in a state
+                 * variable, because we only will know the tree glyph to print (branch or final edge) once we
+                 * read one more entry */
+                if (r == -ESRCH) /* No systemd-owned file but still interesting to print */
+                        r = asprintf(previous, "/%s/%s", path, de->d_name);
+                else /* if (r >= 0) */
+                        r = asprintf(previous, "/%s/%s (%s%s%s)", path, de->d_name, ansi_highlight(), v, ansi_normal());
+                if (r < 0)
+                        return log_oom();
+
+                c++; /* the entry counted here is still pending in *previous and gets printed later */
+        }
+
+        return c;
+}
+
+static int status_binaries(const char *esp_path, sd_id128_t partition) {
+        _cleanup_free_ char *last = NULL;
+        bool is_first = true;
+        int r, k;
+        /* Prints the "Available Boot Loaders on ESP" section for the EFI binaries below EFI/systemd and EFI/BOOT. */
+        printf("%sAvailable Boot Loaders on ESP:%s\n", ansi_underline(), ansi_normal());
+
+        if (!esp_path) {
+                printf("          ESP: Cannot find or access mount point of ESP.\n\n");
+                return -ENOENT;
+        }
+
+        printf("          ESP: %s", esp_path);
+        if (!sd_id128_is_null(partition))
+                printf(" (/dev/disk/by-partuuid/" SD_ID128_UUID_FORMAT_STR ")", SD_ID128_FORMAT_VAL(partition));
+        printf("\n");
+        /* Both calls share 'last' and 'is_first', so the entry still pending after the first directory is flushed by the second. */
+        r = enumerate_binaries(esp_path, "EFI/systemd", NULL, &last, &is_first);
+        if (r < 0)
+                goto fail;
+
+        k = enumerate_binaries(esp_path, "EFI/BOOT", "boot", &last, &is_first);
+        if (k < 0) {
+                r = k;
+                goto fail;
+        }
+
+        if (last) /* let's output the last entry now, since now we know that there will be no more, and can draw the tree glyph properly */
+                printf("         %s %s%s\n",
+                       is_first ? "File:" : "     ",
+                       special_glyph(SPECIAL_GLYPH_TREE_RIGHT), last);
+
+        if (r == 0 && !arg_quiet) /* r is the number of binaries found below EFI/systemd */
+                log_info("systemd-boot not installed in ESP.");
+        if (k == 0 && !arg_quiet) /* k is the number of matching binaries found below EFI/BOOT */
+                log_info("No default/fallback boot loader installed in ESP.");
+
+        printf("\n");
+        return 0;
+
+fail:
+        errno = -r; /* so that %m in the printf() below expands to the error held in r */
+        printf("         File: (can't access %s: %m)\n\n", esp_path);
+        return r;
+}
+
+static void read_efi_var(const char *variable, char **ret) {
+        int r = efi_get_variable_string(variable, ret);
+        /* Best effort: a missing variable (-ENOENT) is accepted silently, any other failure is only warned about */
+        if (r >= 0 || r == -ENOENT)
+                return;
+        log_warning_errno(r, "Failed to read EFI variable %s: %m", variable);
+}
+
+static void print_yes_no_line(bool first, bool good, const char *name) {
+        /* Only the first line carries the "Features:" label, later ones get blanks of the same width */
+        const char *lead = first ? "     Features: " : "               ";
+
+        printf("%s%s %s\n", lead, COLOR_MARK_BOOL(good), name);
+}
+
+/* Implements the "status" verb.
+ *
+ * If arg_print_esp_path or arg_print_dollar_boot_path is set, only the respective path is printed and
+ * the function returns right away. Otherwise a report is written to stdout (through the pager):
+ * firmware, boot loader, stub and random seed information (only when arg_root is unset and the system
+ * was booted via EFI), followed by the binaries found in the ESP, the EFI variables and the boot
+ * entries.
+ *
+ * Returns 0 on success or a negative errno-style value; errors of the individual report sections are
+ * accumulated into the return value via RET_GATHER() while the remaining sections are still shown. */
+int verb_status(int argc, char *argv[], void *userdata) {
+        sd_id128_t esp_uuid = SD_ID128_NULL, xbootldr_uuid = SD_ID128_NULL;
+        dev_t esp_devid = 0, xbootldr_devid = 0;
+        int r, k;
+
+        r = acquire_esp(/* unprivileged_mode= */ -1, /* graceful= */ false, NULL, NULL, NULL, &esp_uuid, &esp_devid);
+        if (arg_print_esp_path) {
+                if (r == -EACCES) /* If we couldn't acquire the ESP path, log about access errors (which is the only
+                                   * error the find_esp_and_warn() won't log on its own) */
+                        return log_error_errno(r, "Failed to determine ESP location: %m");
+                if (r < 0)
+                        return r;
+
+                puts(arg_esp_path);
+                return 0;
+        }
+
+        r = acquire_xbootldr(/* unprivileged_mode= */ -1, &xbootldr_uuid, &xbootldr_devid);
+        if (arg_print_dollar_boot_path) {
+                if (r == -EACCES)
+                        return log_error_errno(r, "Failed to determine XBOOTLDR partition: %m");
+                if (r < 0)
+                        return r;
+
+                const char *path = arg_dollar_boot_path();
+                if (!path)
+                        return log_error_errno(SYNTHETIC_ERRNO(EACCES), "Failed to determine XBOOTLDR location: %m");
+
+                puts(path);
+                return 0;
+        }
+
+        r = 0; /* If we couldn't determine the path, then don't consider that a problem from here on, just
+                * show what we can show */
+
+        pager_open(arg_pager_flags);
+
+        if (!arg_root && is_efi_boot()) {
+                /* Boot loader feature bits and the label printed for each of them */
+                static const struct {
+                        uint64_t flag;
+                        const char *name;
+                } loader_flags[] = {
+                        { EFI_LOADER_FEATURE_BOOT_COUNTING,           "Boot counting"                         },
+                        { EFI_LOADER_FEATURE_CONFIG_TIMEOUT,          "Menu timeout control"                  },
+                        { EFI_LOADER_FEATURE_CONFIG_TIMEOUT_ONE_SHOT, "One-shot menu timeout control"         },
+                        { EFI_LOADER_FEATURE_ENTRY_DEFAULT,           "Default entry control"                 },
+                        { EFI_LOADER_FEATURE_ENTRY_ONESHOT,           "One-shot entry control"                },
+                        { EFI_LOADER_FEATURE_XBOOTLDR,                "Support for XBOOTLDR partition"        },
+                        { EFI_LOADER_FEATURE_RANDOM_SEED,             "Support for passing random seed to OS" },
+                        { EFI_LOADER_FEATURE_LOAD_DRIVER,             "Load drop-in drivers"                  },
+                        { EFI_LOADER_FEATURE_SORT_KEY,                "Support Type #1 sort-key field"        },
+                        { EFI_LOADER_FEATURE_SAVED_ENTRY,             "Support @saved pseudo-entry"           },
+                        { EFI_LOADER_FEATURE_DEVICETREE,              "Support Type #1 devicetree field"      },
+                        { EFI_LOADER_FEATURE_SECUREBOOT_ENROLL,       "Enroll SecureBoot keys"                },
+                        { EFI_LOADER_FEATURE_RETAIN_SHIM,             "Retain SHIM protocols"                 },
+                        { EFI_LOADER_FEATURE_MENU_DISABLE,            "Menu can be disabled"                  },
+                };
+                /* Stub feature bits and the label printed for each of them */
+                static const struct {
+                        uint64_t flag;
+                        const char *name;
+                } stub_flags[] = {
+                        { EFI_STUB_FEATURE_REPORT_BOOT_PARTITION,     "Stub sets ESP information"                            },
+                        { EFI_STUB_FEATURE_PICK_UP_CREDENTIALS,       "Picks up credentials from boot partition"             },
+                        { EFI_STUB_FEATURE_PICK_UP_SYSEXTS,           "Picks up system extension images from boot partition" },
+                        { EFI_STUB_FEATURE_THREE_PCRS,                "Measures kernel+command line+sysexts"                 },
+                        { EFI_STUB_FEATURE_RANDOM_SEED,               "Support for passing random seed to OS"                },
+                        { EFI_STUB_FEATURE_CMDLINE_ADDONS,            "Pick up .cmdline from addons"                         },
+                        { EFI_STUB_FEATURE_CMDLINE_SMBIOS,            "Pick up .cmdline from SMBIOS Type 11"                 },
+                        { EFI_STUB_FEATURE_DEVICETREE_ADDONS,         "Pick up .dtb from addons"                             },
+                };
+                _cleanup_free_ char *fw_type = NULL, *fw_info = NULL, *loader = NULL, *loader_path = NULL, *stub = NULL;
+                sd_id128_t loader_part_uuid = SD_ID128_NULL;
+                uint64_t loader_features = 0, stub_features = 0;
+                Tpm2Support s;
+                int have;
+
+                /* All of these are best-effort: a variable that cannot be read simply stays NULL (or 0 for
+                 * the feature masks) and is shown as such below */
+                read_efi_var(EFI_LOADER_VARIABLE(LoaderFirmwareType), &fw_type);
+                read_efi_var(EFI_LOADER_VARIABLE(LoaderFirmwareInfo), &fw_info);
+                read_efi_var(EFI_LOADER_VARIABLE(LoaderInfo), &loader);
+                read_efi_var(EFI_LOADER_VARIABLE(StubInfo), &stub);
+                read_efi_var(EFI_LOADER_VARIABLE(LoaderImageIdentifier), &loader_path);
+                (void) efi_loader_get_features(&loader_features);
+                (void) efi_stub_get_features(&stub_features);
+
+                if (loader_path)
+                        efi_tilt_backslashes(loader_path);
+
+                /* A missing variable is fine; any other failure is remembered in r, but the report goes on */
+                k = efi_loader_get_device_part_uuid(&loader_part_uuid);
+                if (k < 0 && k != -ENOENT)
+                        r = log_warning_errno(k, "Failed to read EFI variable LoaderDevicePartUUID: %m");
+
+                SecureBootMode secure = efi_get_secure_boot_mode();
+                printf("%sSystem:%s\n", ansi_underline(), ansi_normal());
+                printf("      Firmware: %s%s (%s)%s\n", ansi_highlight(), strna(fw_type), strna(fw_info), ansi_normal());
+                printf(" Firmware Arch: %s\n", get_efi_arch());
+                printf("   Secure Boot: %s%s%s",
+                       IN_SET(secure, SECURE_BOOT_USER, SECURE_BOOT_DEPLOYED) ? ansi_highlight_green() : ansi_normal(),
+                       enabled_disabled(IN_SET(secure, SECURE_BOOT_USER, SECURE_BOOT_DEPLOYED)),
+                       ansi_normal());
+
+                /* The line above was left unterminated so that the mode can be appended here */
+                if (secure != SECURE_BOOT_DISABLED)
+                        printf(" (%s)\n", secure_boot_mode_to_string(secure));
+                else
+                        printf("\n");
+
+                /* Green "yes" only if both firmware and driver support are present; red if just one of the
+                 * two is; yellow "no" if neither is */
+                s = tpm2_support();
+                printf("  TPM2 Support: %s%s%s\n",
+                       FLAGS_SET(s, TPM2_SUPPORT_FIRMWARE|TPM2_SUPPORT_DRIVER) ? ansi_highlight_green() :
+                       (s & (TPM2_SUPPORT_FIRMWARE|TPM2_SUPPORT_DRIVER)) != 0 ? ansi_highlight_red() : ansi_highlight_yellow(),
+                       FLAGS_SET(s, TPM2_SUPPORT_FIRMWARE|TPM2_SUPPORT_DRIVER) ? "yes" :
+                       (s & TPM2_SUPPORT_FIRMWARE) ? "firmware only, driver unavailable" :
+                       (s & TPM2_SUPPORT_DRIVER) ? "driver only, firmware unavailable" : "no",
+                       ansi_normal());
+
+                k = efi_measured_uki(LOG_DEBUG);
+                if (k > 0)
+                        printf("  Measured UKI: %syes%s\n", ansi_highlight_green(), ansi_normal());
+                else if (k == 0)
+                        printf("  Measured UKI: no\n");
+                else {
+                        errno = -k; /* so that %m below expands to this error */
+                        printf("  Measured UKI: %sfailed%s (%m)\n", ansi_highlight_red(), ansi_normal());
+                }
+
+                k = efi_get_reboot_to_firmware();
+                if (k > 0)
+                        printf("  Boot into FW: %sactive%s\n", ansi_highlight_yellow(), ansi_normal());
+                else if (k == 0)
+                        printf("  Boot into FW: supported\n");
+                else if (k == -EOPNOTSUPP)
+                        printf("  Boot into FW: not supported\n");
+                else {
+                        errno = -k; /* so that %m below expands to this error */
+                        printf("  Boot into FW: %sfailed%s (%m)\n", ansi_highlight_red(), ansi_normal());
+                }
+                printf("\n");
+
+                printf("%sCurrent Boot Loader:%s\n", ansi_underline(), ansi_normal());
+                printf("      Product: %s%s%s\n", ansi_highlight(), strna(loader), ansi_normal());
+
+                for (size_t i = 0; i < ELEMENTSOF(loader_flags); i++)
+                        print_yes_no_line(i == 0, FLAGS_SET(loader_features, loader_flags[i].flag), loader_flags[i].name);
+
+                /* NOTE(review): the same partition UUID was already queried into loader_part_uuid above;
+                 * this second lookup looks redundant — confirm before merging the two. */
+                sd_id128_t bootloader_esp_uuid;
+                bool have_bootloader_esp_uuid = efi_loader_get_device_part_uuid(&bootloader_esp_uuid) >= 0;
+
+                print_yes_no_line(false, have_bootloader_esp_uuid, "Boot loader sets ESP information");
+                if (have_bootloader_esp_uuid && !sd_id128_is_null(esp_uuid) &&
+                    !sd_id128_equal(esp_uuid, bootloader_esp_uuid))
+                        printf("WARNING: The boot loader reports a different ESP UUID than detected ("SD_ID128_UUID_FORMAT_STR" vs. "SD_ID128_UUID_FORMAT_STR")!\n",
+                               SD_ID128_FORMAT_VAL(bootloader_esp_uuid),
+                               SD_ID128_FORMAT_VAL(esp_uuid));
+
+                /* The stub section is only shown if the StubInfo variable could be read */
+                if (stub) {
+                        printf("         Stub: %s\n", stub);
+                        for (size_t i = 0; i < ELEMENTSOF(stub_flags); i++)
+                                print_yes_no_line(i == 0, FLAGS_SET(stub_features, stub_flags[i].flag), stub_flags[i].name);
+                }
+                if (!sd_id128_is_null(loader_part_uuid))
+                        printf("          ESP: /dev/disk/by-partuuid/" SD_ID128_UUID_FORMAT_STR "\n",
+                               SD_ID128_FORMAT_VAL(loader_part_uuid));
+                else
+                        printf("          ESP: n/a\n");
+                printf("         File: %s%s\n", special_glyph(SPECIAL_GLYPH_TREE_RIGHT), strna(loader_path));
+                printf("\n");
+
+                printf("%sRandom Seed:%s\n", ansi_underline(), ansi_normal());
+                have = access(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderSystemToken)), F_OK) >= 0;
+                printf(" System Token: %s\n", have ? "set" : "not set");
+
+                if (arg_esp_path) {
+                        _cleanup_free_ char *p = NULL;
+
+                        p = path_join(arg_esp_path, "/loader/random-seed");
+                        if (!p)
+                                return log_oom();
+
+                        have = access(p, F_OK) >= 0;
+                        printf("       Exists: %s\n", yes_no(have));
+                }
+
+                printf("\n");
+        } else
+                printf("%sSystem:%s\n"
+                       "Not booted with EFI\n\n",
+                       ansi_underline(), ansi_normal());
+
+        /* The remaining sections each contribute their error (if any) to r via RET_GATHER() */
+        if (arg_esp_path)
+                RET_GATHER(r, status_binaries(arg_esp_path, esp_uuid));
+
+        if (!arg_root && is_efi_boot())
+                RET_GATHER(r, status_variables());
+
+        if (arg_esp_path || arg_xbootldr_path) {
+                _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL;
+
+                k = boot_config_load_and_select(&config,
+                                                arg_esp_path, esp_devid,
+                                                arg_xbootldr_path, xbootldr_devid);
+                RET_GATHER(r, k);
+
+                /* Entries can only be shown if the configuration was loaded successfully */
+                if (k >= 0)
+                        RET_GATHER(r,
+                                   status_entries(&config,
+                                                  arg_esp_path, esp_uuid,
+                                                  arg_xbootldr_path, xbootldr_uuid));
+        }
+
+        return r;
+}
+
+/* Adjusts the reference counter stored for 'fn' in 'known_files' by 'increment'. Keys are heap copies
+ * owned by the hashmap: one is created when a file is first referenced and released again once its
+ * counter drops to zero. A NULL 'fn' is accepted and treated as a no-op.
+ *
+ * Returns the new counter value (>= 0), or a negative errno-style value on failure. */
+static int ref_file(Hashmap *known_files, const char *fn, int increment) {
+        char *stored_key = NULL;
+        int count, r;
+
+        assert(known_files);
+
+        /* Callers pass optional fields without checking them first, hence quietly accept NULL */
+        if (!fn)
+                return 0;
+
+        count = PTR_TO_INT(hashmap_get2(known_files, fn, (void**) &stored_key)) + increment;
+        assert(count >= 0);
+
+        if (count == 0) {
+                /* Last reference is gone: drop the entry together with the key we allocated for it */
+                (void) hashmap_remove(known_files, fn);
+                free(stored_key);
+                return 0;
+        }
+
+        if (stored_key) {
+                /* Already known: just store the new counter */
+                r = hashmap_update(known_files, fn, INT_TO_PTR(count));
+                if (r < 0)
+                        return r;
+
+                return count;
+        }
+
+        /* First reference: the hashmap needs its own copy of the name */
+        _cleanup_free_ char *copy = strdup(fn);
+        if (!copy)
+                return -ENOMEM;
+
+        r = hashmap_put(known_files, copy, INT_TO_PTR(count));
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(copy);
+        return count;
+}
+
+/* Drops one reference on 'fn' in 'known_files'. Once no reference remains the file is removed below
+ * 'root' (in dry-run mode it is only reported), and afterwards removal of the directory that contained
+ * it is attempted. Every failure is logged and otherwise ignored. */
+static void deref_unlink_file(Hashmap *known_files, const char *fn, const char *root) {
+        _cleanup_free_ char *resolved = NULL, *parent = NULL;
+        int r;
+
+        assert(known_files);
+
+        /* Silently accept missing arguments, so that callers need not check whether the respective
+         * boot loader entry field is set at all */
+        if (!root || !fn)
+                return;
+
+        r = ref_file(known_files, fn, -1);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to deref \"%s\", ignoring: %m", fn);
+                return;
+        }
+        if (r > 0) /* references remain, keep the file */
+                return;
+
+        if (arg_dry_run) {
+                r = chase_and_access(fn, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, F_OK, &resolved);
+                if (r >= 0)
+                        log_info("Would remove \"%s\"", resolved);
+                else
+                        log_info_errno(r, "Unable to determine whether \"%s\" exists, ignoring: %m", fn);
+                return;
+        }
+
+        r = chase_and_unlink(fn, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, 0, &resolved);
+        if (r < 0 && r != -ENOENT) {
+                log_warning_errno(r, "Failed to remove \"%s\", ignoring: %m", fn);
+                return;
+        }
+        if (r >= 0)
+                log_info("Removed \"%s\"", resolved);
+
+        /* Also try to get rid of the containing directory, unless that is the top-level one */
+        if (path_extract_directory(fn, &parent) < 0 || path_equal(parent, "/"))
+                return;
+
+        r = chase_and_unlink(parent, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, AT_REMOVEDIR, NULL);
+        if (r < 0 && r != -ENOTEMPTY && r != -ENOENT)
+                log_warning_errno(r, "Failed to remove directory \"%s\", ignoring: %m", parent);
+}
+
+/* Builds a map from file name to reference count covering every file (kernel, EFI binary, initrds,
+ * device tree and device tree overlays) referenced by the entries of 'config' whose root equals
+ * 'root'.
+ *
+ * On success returns 0 and hands the map over in *ret_known_files; on failure returns a negative
+ * errno-style value and leaves *ret_known_files untouched. */
+static int count_known_files(const BootConfig *config, const char* root, Hashmap **ret_known_files) {
+        _cleanup_(hashmap_free_free_keyp) Hashmap *files = NULL;
+        int r;
+
+        assert(config);
+        assert(ret_known_files);
+
+        files = hashmap_new(&path_hash_ops);
+        if (!files)
+                return -ENOMEM;
+
+        for (size_t idx = 0; idx < config->n_entries; idx++) {
+                const BootEntry *entry = &config->entries[idx];
+
+                /* Only entries living on the requested partition are of interest */
+                if (path_equal(entry->root, root)) {
+                        r = ref_file(files, entry->kernel, +1);
+                        if (r >= 0)
+                                r = ref_file(files, entry->efi, +1);
+                        if (r < 0)
+                                return r;
+
+                        STRV_FOREACH(initrd, entry->initrd) {
+                                r = ref_file(files, *initrd, +1);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        r = ref_file(files, entry->device_tree, +1);
+                        if (r < 0)
+                                return r;
+
+                        STRV_FOREACH(overlay, entry->device_tree_overlay) {
+                                r = ref_file(files, *overlay, +1);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        }
+
+        *ret_known_files = TAKE_PTR(files);
+        return 0;
+}
+
+/* Looks up the first entry of 'config' that lives below 'root' and whose ID matches 'id'. 'id' is used
+ * as an fnmatch() pattern and compared case-insensitively (FNM_CASEFOLD).
+ *
+ * Returns the index of the entry, or -ENOENT if nothing matches or 'root'/'id' is NULL. */
+static int boot_config_find_in(const BootConfig *config, const char *root, const char *id) {
+        assert(config);
+
+        if (!id || !root)
+                return -ENOENT;
+
+        for (size_t n = 0; n < config->n_entries; n++) {
+                const BootEntry *candidate = config->entries + n;
+
+                if (!path_equal(candidate->root, root))
+                        continue;
+                if (fnmatch(id, candidate->id, FNM_CASEFOLD) != 0)
+                        continue;
+
+                return (int) n;
+        }
+
+        return -ENOENT;
+}
+
+/* Removes the boot entry matching 'id' below 'root': first every file the entry references that no
+ * other entry on the same root references as well, then the entry file itself. In dry-run mode
+ * nothing is deleted and the intended removals are only logged.
+ *
+ * Returns 0 on success, -ENOENT (without logging) if no such entry exists, or another negative
+ * errno-style value on failure. */
+static int unlink_entry(const BootConfig *config, const char *root, const char *id) {
+        _cleanup_(hashmap_free_free_keyp) Hashmap *refs = NULL;
+        const BootEntry *entry;
+        int idx, r;
+
+        assert(config);
+
+        r = count_known_files(config, root, &refs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to count files in %s: %m", root);
+
+        idx = boot_config_find_in(config, root, id);
+        if (idx < 0)
+                return idx;
+
+        /* Removing these is permitted, but deserves a warning */
+        if (config->default_entry == idx)
+                log_warning("%s is the default boot entry", id);
+        if (config->selected_entry == idx)
+                log_warning("%s is the selected boot entry", id);
+
+        entry = config->entries + idx;
+
+        deref_unlink_file(refs, entry->kernel, entry->root);
+        deref_unlink_file(refs, entry->efi, entry->root);
+        STRV_FOREACH(initrd, entry->initrd)
+                deref_unlink_file(refs, *initrd, entry->root);
+        deref_unlink_file(refs, entry->device_tree, entry->root);
+        STRV_FOREACH(overlay, entry->device_tree_overlay)
+                deref_unlink_file(refs, *overlay, entry->root);
+
+        if (!arg_dry_run) {
+                r = chase_and_unlink(entry->path, root, CHASE_PROHIBIT_SYMLINKS, 0, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to remove \"%s\": %m", entry->path);
+
+                log_info("Removed %s", entry->path);
+        } else
+                log_info("Would remove \"%s\"", entry->path);
+
+        return 0;
+}
+
+/* recurse_dir() callback: removes (or, in dry-run mode, merely reports) every directory entry whose
+ * path is not contained in the hashmap of referenced files passed as 'userdata'. Removal failures are
+ * logged and ignored. Always returns RECURSE_DIR_CONTINUE. */
+static int list_remove_orphaned_file(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        Hashmap *referenced = userdata;
+
+        assert(path);
+        assert(referenced);
+
+        /* Act on entry events only, and only on files no boot entry refers to */
+        if (event == RECURSE_DIR_ENTRY && !hashmap_get(referenced, path)) {
+                if (arg_dry_run)
+                        log_info("Would remove %s", path);
+                else if (unlinkat(dir_fd, de->d_name, 0) >= 0)
+                        log_info("Removed %s", path);
+                else
+                        log_warning_errno(errno, "Failed to remove \"%s\", ignoring: %m", path);
+        }
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+/* Walks the entry token directory (arg_entry_token) below 'root' and removes every file in it that is
+ * not referenced by any boot entry of 'config' located on that root. A missing entry token directory
+ * is not an error.
+ *
+ * Returns a negative errno-style value on failure, otherwise the (non-negative) result of
+ * recurse_dir(), or 0 if there was nothing to walk. */
+static int cleanup_orphaned_files(
+                const BootConfig *config,
+                const char *root) {
+
+        _cleanup_(hashmap_free_free_keyp) Hashmap *refs = NULL;
+        _cleanup_free_ char *resolved = NULL, *prefix = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        assert(config);
+        assert(root);
+
+        log_info("Cleaning %s", root);
+
+        r = settle_entry_token();
+        if (r < 0)
+                return r;
+
+        r = count_known_files(config, root, &refs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to count files in %s: %m", root);
+
+        fd = chase_and_open(arg_entry_token, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS,
+                            O_DIRECTORY|O_CLOEXEC, &resolved);
+        if (fd < 0) {
+                if (fd == -ENOENT) /* no such directory, hence nothing to clean */
+                        return 0;
+
+                return log_error_errno(fd, "Failed to open '%s/%s': %m", root, arg_entry_token);
+        }
+
+        /* The callback looks paths up in 'refs', hence hand recurse_dir() a prefix relative to the
+         * root of the partition */
+        prefix = path_join("/", arg_entry_token);
+        if (!prefix)
+                return log_oom();
+
+        r = recurse_dir(fd, prefix, 0, UINT_MAX, RECURSE_DIR_SORT, list_remove_orphaned_file, refs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to cleanup %s: %m", resolved);
+
+        return r;
+}
+
+/* Shared implementation of the "list", "cleanup" and "unlink" verbs; argv[0] selects which one runs.
+ *
+ * Locates the ESP and the XBOOTLDR partition, loads the boot loader configuration, and then either
+ * shows the boot entries ("list"), removes files below the entry token directory that no entry
+ * references ("cleanup"), or removes the entry named by argv[1] ("unlink", XBOOTLDR is tried first,
+ * then the ESP).
+ *
+ * Returns 0 on success or a negative errno-style value on failure. */
+int verb_list(int argc, char *argv[], void *userdata) {
+        _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL;
+        dev_t esp_devid = 0, xbootldr_devid = 0;
+        int r;
+
+        /* If we lack privileges we invoke find_esp_and_warn() in "unprivileged mode" here, which does two
+         * things: turn off logging about access errors and turn off potentially privileged device probing.
+         * Here we're interested in the latter but not the former, hence request the mode, and log about
+         * EACCES. */
+
+        r = acquire_esp(/* unprivileged_mode= */ -1, /* graceful= */ false, NULL, NULL, NULL, NULL, &esp_devid);
+        if (r == -EACCES) /* We really need the ESP path for this call, hence also log about access errors */
+                return log_error_errno(r, "Failed to determine ESP location: %m");
+        if (r < 0)
+                return r;
+
+        r = acquire_xbootldr(/* unprivileged_mode= */ -1, NULL, &xbootldr_devid);
+        if (r == -EACCES)
+                return log_error_errno(r, "Failed to determine XBOOTLDR partition: %m");
+        if (r < 0)
+                return r;
+
+        r = boot_config_load_and_select(&config, arg_esp_path, esp_devid, arg_xbootldr_path, xbootldr_devid);
+        if (r < 0)
+                return r;
+
+        if (config.n_entries == 0 && FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                log_info("No boot loader entries found.");
+                return 0;
+        }
+
+        if (streq(argv[0], "list")) {
+                pager_open(arg_pager_flags);
+                return show_boot_entries(&config, arg_json_format_flags);
+        }
+
+        if (streq(argv[0], "cleanup")) {
+                /* Always process both partitions, but do not lose a failure on the XBOOTLDR partition:
+                 * gather the results of both runs so that any error is reflected in our return value. */
+                r = 0;
+                if (arg_xbootldr_path && xbootldr_devid != esp_devid)
+                        RET_GATHER(r, cleanup_orphaned_files(&config, arg_xbootldr_path));
+                RET_GATHER(r, cleanup_orphaned_files(&config, arg_esp_path));
+                return r;
+        }
+
+        assert(streq(argv[0], "unlink"));
+
+        /* Look on the XBOOTLDR partition first; only if the entry does not exist there fall back to
+         * the ESP. Success and any other error are final. */
+        if (arg_xbootldr_path && xbootldr_devid != esp_devid) {
+                r = unlink_entry(&config, arg_xbootldr_path, argv[1]);
+                if (r != -ENOENT)
+                        return r;
+        }
+
+        return unlink_entry(&config, arg_esp_path, argv[1]);
+}
+
+/* Entry point of the "unlink" verb. The actual work is done by verb_list(), which dispatches on
+ * argv[0]. */
+int verb_unlink(int argc, char *argv[], void *userdata) {
+        int r = verb_list(argc, argv, userdata);
+
+        return r;
+}
diff --git a/src/boot/bootctl-status.h b/src/boot/bootctl-status.h
new file mode 100644
index 0000000..f7998a3
--- /dev/null
+++ b/src/boot/bootctl-status.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+int verb_status(int argc, char *argv[], void *userdata);
+int verb_list(int argc, char *argv[], void *userdata);
+int verb_unlink(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-systemd-efi-options.c b/src/boot/bootctl-systemd-efi-options.c
new file mode 100644
index 0000000..7f8308f
--- /dev/null
+++ b/src/boot/bootctl-systemd-efi-options.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bootctl.h"
+#include "bootctl-systemd-efi-options.h"
+#include "efi-loader.h"
+
+/* Implements the (obsolete) verb for the SystemdOptions EFI variable. Without an argument the cached
+ * value is printed and a note is logged if the variable in efivarfs has been removed or modified since
+ * boot; with an argument the variable is set to argv[1].
+ *
+ * Returns 0 on success or a negative errno-style value on failure. */
+int verb_systemd_efi_options(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *cached = NULL, *current = NULL;
+        int r;
+
+        /* This is obsolete and subject to removal */
+
+        if (!arg_quiet)
+                log_notice("Use of the SystemdOptions EFI variable is deprecated.");
+
+        if (argc != 1) {
+                /* Write mode */
+                r = efi_set_variable_string(EFI_SYSTEMD_VARIABLE(SystemdOptions), argv[1]);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set SystemdOptions EFI variable: %m");
+
+                return 0;
+        }
+
+        /* Read mode: first show the cached value, if there is one */
+        r = systemd_efi_options_variable(&cached);
+        if (r < 0 && r != -ENODATA)
+                return log_error_errno(r, "Failed to read SystemdOptions EFI variable from cache: %m");
+        if (r == -ENODATA)
+                log_debug("No SystemdOptions EFI variable present in cache.");
+        else
+                puts(cached);
+
+        /* Then compare against what efivarfs holds now; problems here are not fatal */
+        r = systemd_efi_options_efivarfs_if_newer(&current);
+        if (r == -ENODATA) {
+                if (cached)
+                        log_notice("Note: SystemdOptions EFI variable has been removed since boot.");
+        } else if (r < 0)
+                log_warning_errno(r, "Failed to check SystemdOptions EFI variable in efivarfs, ignoring: %m");
+        else if (current && !streq_ptr(cached, current))
+                log_notice("Note: SystemdOptions EFI variable has been modified since boot. New value: %s",
+                           current);
+
+        return 0;
+}
diff --git a/src/boot/bootctl-systemd-efi-options.h b/src/boot/bootctl-systemd-efi-options.h
new file mode 100644
index 0000000..d0243eb
--- /dev/null
+++ b/src/boot/bootctl-systemd-efi-options.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_systemd_efi_options(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-uki.c b/src/boot/bootctl-uki.c
new file mode 100644
index 0000000..8808c30
--- /dev/null
+++ b/src/boot/bootctl-uki.c
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "bootctl-uki.h"
+#include "kernel-image.h"
+
+/* Prints the type of the kernel image named by argv[1] to stdout. Returns 0 on success, or the
+ * negative value returned by inspect_kernel() on failure. */
+int verb_kernel_identify(int argc, char *argv[], void *userdata) {
+        KernelImageType type;
+        int r = inspect_kernel(AT_FDCWD, argv[1], &type, NULL, NULL, NULL);
+
+        if (r < 0)
+                return r;
+
+        puts(kernel_image_type_to_string(type));
+        return 0;
+}
+
+/* Prints the type of the kernel image named by argv[1], plus whichever of the command line, version
+ * and OS name inspect_kernel() provided. Returns 0 on success, or the negative value returned by
+ * inspect_kernel() on failure. */
+int verb_kernel_inspect(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *cmdline = NULL, *version = NULL, *os_name = NULL;
+        KernelImageType type;
+        int r = inspect_kernel(AT_FDCWD, argv[1], &type, &cmdline, &version, &os_name);
+
+        if (r < 0)
+                return r;
+
+        printf("Kernel Type: %s\n", kernel_image_type_to_string(type));
+
+        /* Each of the remaining fields is optional */
+        if (cmdline)
+                printf("    Cmdline: %s\n", cmdline);
+        if (version)
+                printf("    Version: %s\n", version);
+        if (os_name)
+                printf("         OS: %s\n", os_name);
+
+        return 0;
+}
diff --git a/src/boot/bootctl-uki.h b/src/boot/bootctl-uki.h
new file mode 100644
index 0000000..effb984
--- /dev/null
+++ b/src/boot/bootctl-uki.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+int verb_kernel_identify(int argc, char *argv[], void *userdata);
+int verb_kernel_inspect(int argc, char *argv[], void *userdata);
diff --git a/src/boot/bootctl-util.c b/src/boot/bootctl-util.c
new file mode 100644
index 0000000..3cab875
--- /dev/null
+++ b/src/boot/bootctl-util.c
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "bootctl.h"
+#include "bootctl-util.h"
+#include "errno-util.h"
+#include "fileio.h"
+#include "stat-util.h"
+#include "sync-util.h"
+
+/* Synchronizes the file systems of the ESP and of $BOOT, as far as their paths are known. Both are
+ * always attempted; failures are logged and gathered into the return value via RET_GATHER().
+ *
+ * Returns 0 if nothing failed, a negative errno-style value otherwise. */
+int sync_everything(void) {
+        int ret = 0;
+
+        if (arg_esp_path) {
+                int q = syncfs_path(AT_FDCWD, arg_esp_path);
+
+                if (q < 0)
+                        RET_GATHER(ret, log_error_errno(q, "Failed to synchronize the ESP '%s': %m", arg_esp_path));
+        }
+
+        if (arg_xbootldr_path) {
+                int q = syncfs_path(AT_FDCWD, arg_xbootldr_path);
+
+                if (q < 0)
+                        RET_GATHER(ret, log_error_errno(q, "Failed to synchronize $BOOT '%s': %m", arg_xbootldr_path));
+        }
+
+        return ret;
+}
+
+/* Returns the name of the EFI firmware architecture of the running system. On x86-64 builds the
+ * firmware word size is read from /sys/firmware/efi/fw_platform_size, so that 32-bit firmware below a
+ * 64-bit kernel ("mixed mode") is reported as "ia32". In all other cases, including when the word
+ * size cannot be determined, EFI_MACHINE_TYPE_NAME is returned. */
+const char *get_efi_arch(void) {
+
+#ifdef __x86_64__
+        _cleanup_free_ char *word_size = NULL;
+        int r;
+
+        r = read_one_line_file("/sys/firmware/efi/fw_platform_size", &word_size);
+        if (r < 0) {
+                /* A missing file is expected and not worth a message, anything else is */
+                if (r != -ENOENT)
+                        log_warning_errno(r,
+                                "Error reading EFI firmware word size, assuming machine type '%s': %m",
+                                EFI_MACHINE_TYPE_NAME);
+
+                return EFI_MACHINE_TYPE_NAME;
+        }
+
+        if (streq(word_size, "32"))
+                return "ia32";
+
+        if (!streq(word_size, "64"))
+                log_warning(
+                        "Unknown EFI firmware word size '%s', using machine type '%s'.",
+                        word_size,
+                        EFI_MACHINE_TYPE_NAME);
+#endif
+
+        return EFI_MACHINE_TYPE_NAME;
+}
+
+/* Searches the EFI binary open on 'fd' for a "#### LoaderInfo: systemd-boot 218 ####" style marker
+ * and returns the text between the "#### LoaderInfo: " prefix and the " ####" suffix as a newly
+ * allocated string in *ret.
+ *
+ * Returns 0 on success, -ESRCH if the file is not a regular file, has an unsuitable size or carries
+ * no marker, -EINVAL if the marker is malformed, or another negative errno-style value on failure.
+ * *ret is only written on success. */
+int get_file_version(int fd, char **ret) {
+        const char *begin, *end;
+        char *version = NULL;
+        struct stat st;
+        char *map;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat EFI binary: %m");
+
+        r = stat_verify_regular(&st);
+        if (r < 0) {
+                log_debug_errno(r, "EFI binary is not a regular file, assuming no version information: %m");
+                return -ESRCH;
+        }
+
+        if (st.st_size < 27 || file_offset_beyond_memory_size(st.st_size))
+                return log_debug_errno(SYNTHETIC_ERRNO(ESRCH),
+                                       "EFI binary size too %s: %"PRIi64,
+                                       st.st_size < 27 ? "small" : "large", st.st_size);
+
+        map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+        if (map == MAP_FAILED)
+                return log_error_errno(errno, "Failed to mmap EFI binary: %m");
+
+        /* 'begin' points right behind the prefix; the suffix is then searched in the remainder */
+        begin = mempmem_safe(map, st.st_size - 8, "#### LoaderInfo: ", 17);
+        end = begin ? memmem_safe(begin, st.st_size - (begin - map), " ####", 5) : NULL;
+
+        if (!begin)
+                r = log_debug_errno(SYNTHETIC_ERRNO(ESRCH), "EFI binary has no LoaderInfo marker.");
+        else if (!end || end - begin < 3)
+                r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), "EFI binary has malformed LoaderInfo marker.");
+        else {
+                version = strndup(begin, end - begin);
+                if (!version)
+                        r = log_oom();
+                else {
+                        log_debug("EFI binary LoaderInfo marker: \"%s\"", version);
+                        r = 0;
+                        *ret = version;
+                }
+        }
+
+        /* Every path that got past mmap() ends up here to release the mapping again */
+        (void) munmap(map, st.st_size);
+        return r;
+}
+
+/* Makes sure arg_entry_token and arg_entry_token_type are determined by calling
+ * boot_entry_token_ensure() with the configured root, kernel configuration directory and machine ID.
+ *
+ * Returns 0 on success, or the negative value returned by boot_entry_token_ensure() on failure. */
+int settle_entry_token(void) {
+        int r = boot_entry_token_ensure(
+                        arg_root,
+                        etc_kernel(),
+                        arg_machine_id,
+                        /* machine_id_is_random = */ false,
+                        &arg_entry_token_type,
+                        &arg_entry_token);
+
+        if (r >= 0) {
+                log_debug("Using entry token: %s", arg_entry_token);
+                r = 0;
+        }
+
+        return r;
+}
diff --git a/src/boot/bootctl-util.h b/src/boot/bootctl-util.h
new file mode 100644
index 0000000..147455e
--- /dev/null
+++ b/src/boot/bootctl-util.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int sync_everything(void);
+
+const char *get_efi_arch(void);
+
+int get_file_version(int fd, char **ret);
+
+int settle_entry_token(void);
+
+/* Returns the value of $KERNEL_INSTALL_CONF_ROOT if that environment variable is set (even if it is
+ * empty), and "/etc/kernel/" otherwise. */
+static inline const char* etc_kernel(void) {
+        const char *override = getenv("KERNEL_INSTALL_CONF_ROOT");
+
+        return override ? override : "/etc/kernel/";
+}
diff --git a/src/boot/bootctl.c b/src/boot/bootctl.c
new file mode 100644
index 0000000..4614ca1
--- /dev/null
+++ b/src/boot/bootctl.c
@@ -0,0 +1,516 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "blockdev-util.h"
+#include "bootctl.h"
+#include "bootctl-install.h"
+#include "bootctl-random-seed.h"
+#include "bootctl-reboot-to-firmware.h"
+#include "bootctl-set-efivar.h"
+#include "bootctl-status.h"
+#include "bootctl-systemd-efi-options.h"
+#include "bootctl-uki.h"
+#include "build.h"
+#include "devnum-util.h"
+#include "dissect-image.h"
+#include "escape.h"
+#include "find-esp.h"
+#include "main-func.h"
+#include "mount-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "pretty-print.h"
+#include "utf8.h"
+#include "verbs.h"
+#include "virt.h"
+
+/* EFI_BOOT_OPTION_DESCRIPTION_MAX sets the maximum length for the boot option description
+ * stored in NVRAM. The UEFI spec does not specify a minimum or maximum length for this
+ * string, but we limit the length to something reasonable to spare the firmware from
+ * having to deal with a potentially too long string. */
+#define EFI_BOOT_OPTION_DESCRIPTION_MAX ((size_t) 255)
+
+char *arg_esp_path = NULL; /* --esp-path=; replaced by acquire_esp() with the path it found */
+char *arg_xbootldr_path = NULL; /* --boot-path=; replaced or cleared by acquire_xbootldr() */
+bool arg_print_esp_path = false; /* -p/--print-esp-path */
+bool arg_print_dollar_boot_path = false; /* -x/--print-boot-path */
+unsigned arg_print_root_device = 0; /* incremented once for each -R on the command line */
+bool arg_touch_variables = true; /* cleared by --no-variables, and by run() if detect_container() > 0 */
+PagerFlags arg_pager_flags = 0; /* --no-pager adds PAGER_DISABLE */
+bool arg_graceful = false; /* --graceful */
+bool arg_quiet = false; /* -q/--quiet */
+int arg_make_entry_directory = false; /* tri-state: < 0 for automatic logic */
+sd_id128_t arg_machine_id = SD_ID128_NULL;
+char *arg_install_layout = NULL;
+BootEntryTokenType arg_entry_token_type = BOOT_ENTRY_TOKEN_AUTO; /* --entry-token= */
+char *arg_entry_token = NULL; /* --entry-token= */
+JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; /* --json= */
+bool arg_arch_all = false; /* --all-architectures */
+char *arg_root = NULL; /* --root=; with --image= run() sets it to the directory the image is mounted on */
+char *arg_image = NULL; /* --image= */
+InstallSource arg_install_source = ARG_INSTALL_SOURCE_AUTO; /* --install-source= */
+char *arg_efi_boot_option_description = NULL; /* --efi-boot-option-description= */
+bool arg_dry_run = false; /* --dry-run */
+ImagePolicy *arg_image_policy = NULL; /* --image-policy= */
+
+STATIC_DESTRUCTOR_REGISTER(arg_esp_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_xbootldr_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_install_layout, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_entry_token, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_efi_boot_option_description, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
+
+int acquire_esp(
+                int unprivileged_mode,
+                bool graceful,
+                uint32_t *ret_part,
+                uint64_t *ret_pstart,
+                uint64_t *ret_psize,
+                sd_id128_t *ret_uuid,
+                dev_t *ret_devid) {
+
+        char *esp;
+        int k;
+
+        /* Locate the ESP. find_esp_and_warn() logs about all failures on its own, with two exceptions:
+         * ENOKEY, where we want to show our own message suggesting use of --esp-path=, and EACCES (only
+         * if unprivileged mode was requested), which we simply pass on without logging, so that --list
+         * and --status work too, without noise about this. On success arg_esp_path is replaced by the
+         * path that was found. */
+
+        k = find_esp_and_warn(arg_root, arg_esp_path, unprivileged_mode, &esp, ret_part, ret_pstart, ret_psize, ret_uuid, ret_devid);
+        if (k >= 0) {
+                free_and_replace(arg_esp_path, esp);
+                log_debug("Using EFI System Partition at %s.", arg_esp_path);
+                return 0;
+        }
+
+        if (k != -ENOKEY)
+                return k;
+
+        if (!graceful)
+                return log_error_errno(k,
+                                       "Couldn't find EFI system partition. It is recommended to mount it to /boot or /efi.\n"
+                                       "Alternatively, use --esp-path= to specify path to mount point.");
+
+        return log_full_errno(arg_quiet ? LOG_DEBUG : LOG_INFO, k,
+                              "Couldn't find EFI system partition, skipping.");
+}
+
+/* Returns 1 if an XBOOTLDR partition was found (arg_xbootldr_path is replaced by its path), 0 if there is
+ * none (arg_xbootldr_path is cleared via mfree()), and the negative error on any other failure. */
+int acquire_xbootldr(
+                int unprivileged_mode,
+                sd_id128_t *ret_uuid,
+                dev_t *ret_devid) {
+
+        char *xbootldr;
+        int k;
+
+        k = find_xbootldr_and_warn(arg_root, arg_xbootldr_path, unprivileged_mode, &xbootldr, ret_uuid, ret_devid);
+        if (k >= 0) {
+                free_and_replace(arg_xbootldr_path, xbootldr);
+                log_debug("Using XBOOTLDR partition at %s as $BOOT.", arg_xbootldr_path);
+                return 1;
+        }
+        if (k != -ENOKEY)
+                return k;
+
+        log_debug_errno(k, "Didn't find an XBOOTLDR partition, using the ESP as $BOOT.");
+        arg_xbootldr_path = mfree(arg_xbootldr_path);
+        if (ret_uuid)
+                *ret_uuid = SD_ID128_NULL;
+        if (ret_devid)
+                *ret_devid = 0;
+        return 0;
+}
+
+/* Prints the usage text to stdout after calling pager_open(). Also registered as handler of the "help" verb. */
+static int help(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *man_link = NULL;
+
+        pager_open(arg_pager_flags);
+
+        /* Any failure to format the man page link is reported as OOM. */
+        if (terminal_urlify_man("bootctl", "1", &man_link) < 0)
+                return log_oom();
+
+        printf("%1$s [OPTIONS...] COMMAND ...\n"
+               "\n%5$sControl EFI firmware boot settings and manage boot loader.%6$s\n"
+               "\n%3$sGeneric EFI Firmware/Boot Loader Commands:%4$s\n"
+               "  status               Show status of installed boot loader and EFI variables\n"
+               "  reboot-to-firmware [BOOL]\n"
+               "                       Query or set reboot-to-firmware EFI flag\n"
+               "\n%3$sBoot Loader Specification Commands:%4$s\n"
+               "  list                 List boot loader entries\n"
+               "  unlink ID            Remove boot loader entry\n"
+               "  cleanup              Remove files in ESP not referenced in any boot entry\n"
+               "\n%3$sBoot Loader Interface Commands:%4$s\n"
+               "  set-default ID       Set default boot loader entry\n"
+               "  set-oneshot ID       Set default boot loader entry, for next boot only\n"
+               "  set-timeout SECONDS  Set the menu timeout\n"
+               "  set-timeout-oneshot SECONDS\n"
+               "                       Set the menu timeout for the next boot only\n"
+               "\n%3$ssystemd-boot Commands:%4$s\n"
+               "  install              Install systemd-boot to the ESP and EFI variables\n"
+               "  update               Update systemd-boot in the ESP and EFI variables\n"
+               "  remove               Remove systemd-boot from the ESP and EFI variables\n"
+               "  is-installed         Test whether systemd-boot is installed in the ESP\n"
+               "  random-seed          Initialize or refresh random seed in ESP and EFI\n"
+               "                       variables\n"
+               "\n%3$sKernel Image Commands:%4$s\n"
+               "  kernel-identify      Identify kernel image type\n"
+               "  kernel-inspect       Prints details about the kernel image\n"
+               "\n%3$sOptions:%4$s\n"
+               "  -h --help            Show this help\n"
+               "     --version         Print version\n"
+               "     --esp-path=PATH   Path to the EFI System Partition (ESP)\n"
+               "     --boot-path=PATH  Path to the $BOOT partition\n"
+               "     --root=PATH       Operate on an alternate filesystem root\n"
+               "     --image=PATH      Operate on disk image as filesystem root\n"
+               "     --image-policy=POLICY\n"
+               "                       Specify disk image dissection policy\n"
+               "     --install-source=auto|image|host\n"
+               "                       Where to pick files when using --root=/--image=\n"
+               "  -p --print-esp-path  Print path to the EFI System Partition mount point\n"
+               "  -x --print-boot-path Print path to the $BOOT partition mount point\n"
+               "  -R --print-root-device\n"
+               "                       Print path to the block device node backing the\n"
+               "                       root file system (returns e.g. /dev/nvme0n1p5)\n"
+               "  -RR                  Print path to the whole disk block device node\n"
+               "                       backing the root FS (returns e.g. /dev/nvme0n1)\n"
+               "     --no-variables    Don't touch EFI variables\n"
+               "     --no-pager        Do not pipe output into a pager\n"
+               "     --graceful        Don't fail when the ESP cannot be found or EFI\n"
+               "                       variables cannot be written\n"
+               "  -q --quiet           Suppress output\n"
+               "     --make-entry-directory=yes|no|auto\n"
+               "                       Create $BOOT/ENTRY-TOKEN/ directory\n"
+               "     --entry-token=machine-id|os-id|os-image-id|auto|literal:…\n"
+               "                       Entry token to use for this installation\n"
+               "     --json=pretty|short|off\n"
+               "                       Generate JSON output\n"
+               "     --all-architectures\n"
+               "                       Install all supported EFI architectures\n"
+               "     --efi-boot-option-description=DESCRIPTION\n"
+               "                       Description of the entry in the boot option list\n"
+               "     --dry-run         Dry run (unlink and cleanup)\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               man_link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+/* Parses the command line into the arg_* globals. Returns 1 if the caller shall continue and dispatch a
+ * verb, 0 if the request was fully handled here already (e.g. --help), and a negative errno-style value
+ * if the command line is invalid or memory runs out. */
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_ESP_PATH = 0x100,
+                ARG_BOOT_PATH,
+                ARG_ROOT,
+                ARG_IMAGE,
+                ARG_IMAGE_POLICY,
+                ARG_INSTALL_SOURCE,
+                ARG_VERSION,
+                ARG_NO_VARIABLES,
+                ARG_NO_PAGER,
+                ARG_GRACEFUL,
+                ARG_MAKE_ENTRY_DIRECTORY,
+                ARG_ENTRY_TOKEN,
+                ARG_JSON,
+                ARG_ARCH_ALL,
+                ARG_EFI_BOOT_OPTION_DESCRIPTION,
+                ARG_DRY_RUN,
+        };
+
+        static const struct option options[] = {
+                { "help",                        no_argument,       NULL, 'h'                             },
+                { "version",                     no_argument,       NULL, ARG_VERSION                     },
+                { "esp-path",                    required_argument, NULL, ARG_ESP_PATH                    },
+                { "path",                        required_argument, NULL, ARG_ESP_PATH                    }, /* Compatibility alias */
+                { "boot-path",                   required_argument, NULL, ARG_BOOT_PATH                   },
+                { "root",                        required_argument, NULL, ARG_ROOT                        },
+                { "image",                       required_argument, NULL, ARG_IMAGE                       },
+                { "image-policy",                required_argument, NULL, ARG_IMAGE_POLICY                },
+                { "install-source",              required_argument, NULL, ARG_INSTALL_SOURCE              },
+                { "print-esp-path",              no_argument,       NULL, 'p'                             },
+                { "print-path",                  no_argument,       NULL, 'p'                             }, /* Compatibility alias */
+                { "print-boot-path",             no_argument,       NULL, 'x'                             },
+                { "print-root-device",           no_argument,       NULL, 'R'                             },
+                { "no-variables",                no_argument,       NULL, ARG_NO_VARIABLES                },
+                { "no-pager",                    no_argument,       NULL, ARG_NO_PAGER                    },
+                { "graceful",                    no_argument,       NULL, ARG_GRACEFUL                    },
+                { "quiet",                       no_argument,       NULL, 'q'                             },
+                { "make-entry-directory",        required_argument, NULL, ARG_MAKE_ENTRY_DIRECTORY        },
+                { "make-machine-id-directory",   required_argument, NULL, ARG_MAKE_ENTRY_DIRECTORY        }, /* Compatibility alias */
+                { "entry-token",                 required_argument, NULL, ARG_ENTRY_TOKEN                 },
+                { "json",                        required_argument, NULL, ARG_JSON                        },
+                { "all-architectures",           no_argument,       NULL, ARG_ARCH_ALL                    },
+                { "efi-boot-option-description", required_argument, NULL, ARG_EFI_BOOT_OPTION_DESCRIPTION },
+                { "dry-run",                     no_argument,       NULL, ARG_DRY_RUN                     },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hpxRq", options, NULL)) >= 0)
+                switch (c) {
+
+                case 'h':
+                        help(0, NULL, NULL);
+                        return 0;
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_ESP_PATH:
+                        r = free_and_strdup(&arg_esp_path, optarg);
+                        if (r < 0)
+                                return log_oom();
+                        break;
+
+                case ARG_BOOT_PATH:
+                        r = free_and_strdup(&arg_xbootldr_path, optarg);
+                        if (r < 0)
+                                return log_oom();
+                        break;
+
+                case ARG_ROOT:
+                        r = parse_path_argument(optarg, /* suppress_root= */ true, &arg_root);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE_POLICY:
+                        r = parse_image_policy_argument(optarg, &arg_image_policy);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_INSTALL_SOURCE:
+                        if (streq(optarg, "auto"))
+                                arg_install_source = ARG_INSTALL_SOURCE_AUTO;
+                        else if (streq(optarg, "image"))
+                                arg_install_source = ARG_INSTALL_SOURCE_IMAGE;
+                        else if (streq(optarg, "host"))
+                                arg_install_source = ARG_INSTALL_SOURCE_HOST;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected parameter for --install-source=: %s", optarg);
+                        break;
+
+                case 'p':
+                        arg_print_esp_path = true;
+                        break;
+
+                case 'x':
+                        arg_print_dollar_boot_path = true;
+                        break;
+
+                case 'R':
+                        arg_print_root_device++; /* -R: partition device node, -RR: whole disk, see run() */
+                        break;
+
+                case ARG_NO_VARIABLES:
+                        arg_touch_variables = false;
+                        break;
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_GRACEFUL:
+                        arg_graceful = true;
+                        break;
+
+                case 'q':
+                        arg_quiet = true;
+                        break;
+
+                case ARG_ENTRY_TOKEN:
+                        r = parse_boot_entry_token_type(optarg, &arg_entry_token_type, &arg_entry_token);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_MAKE_ENTRY_DIRECTORY:
+                        if (streq(optarg, "auto"))  /* retained for backwards compatibility */
+                                arg_make_entry_directory = -1; /* yes if machine-id is permanent */
+                        else {
+                                r = parse_boolean_argument("--make-entry-directory=", optarg, NULL);
+                                if (r < 0)
+                                        return r;
+
+                                arg_make_entry_directory = r;
+                        }
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+                        break;
+
+                case ARG_ARCH_ALL:
+                        arg_arch_all = true;
+                        break;
+
+                case ARG_EFI_BOOT_OPTION_DESCRIPTION:
+                        if (isempty(optarg) || !(string_is_safe(optarg) && utf8_is_valid(optarg))) {
+                                _cleanup_free_ char *escaped = cescape(optarg);
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Invalid --efi-boot-option-description=: %s", strna(escaped));
+                        }
+                        if (strlen(optarg) > EFI_BOOT_OPTION_DESCRIPTION_MAX)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "--efi-boot-option-description= too long: %zu > %zu",
+                                                       strlen(optarg), EFI_BOOT_OPTION_DESCRIPTION_MAX);
+                        r = free_and_strdup_warn(&arg_efi_boot_option_description, optarg);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_DRY_RUN:
+                        arg_dry_run = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (!!arg_print_esp_path + !!arg_print_dollar_boot_path + (arg_print_root_device > 0) > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--print-esp-path/-p, --print-boot-path/-x, --print-root-device/-R cannot be combined.");
+
+        if ((arg_root || arg_image) && argv[optind] && !STR_IN_SET(argv[optind], "status", "list",
+                        "install", "update", "remove", "is-installed", "random-seed", "unlink", "cleanup"))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Options --root= and --image= are not supported with verb %s.",
+                                       argv[optind]);
+
+        if (arg_root && arg_image)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported.");
+
+        if (arg_install_source != ARG_INSTALL_SOURCE_AUTO && !arg_root && !arg_image)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--install-source= is only supported with --root= or --image=.");
+
+        if (arg_dry_run && argv[optind] && !STR_IN_SET(argv[optind], "unlink", "cleanup"))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--dry-run is only supported with the unlink and cleanup verbs.");
+
+        return 1;
+}
+
+static int bootctl_main(int argc, char *argv[]) {
+        static const Verb dispatch_table[] = {
+                { "help",                VERB_ANY, VERB_ANY, 0,            help                     },
+                { "status",              VERB_ANY, 1,        VERB_DEFAULT, verb_status              },
+                { "install",             VERB_ANY, 1,        0,            verb_install             },
+                { "update",              VERB_ANY, 1,        0,            verb_install             }, /* shares the handler of "install" */
+                { "remove",              VERB_ANY, 1,        0,            verb_remove              },
+                { "is-installed",        VERB_ANY, 1,        0,            verb_is_installed        },
+                { "kernel-identify",     2,        2,        0,            verb_kernel_identify     },
+                { "kernel-inspect",      2,        2,        0,            verb_kernel_inspect      },
+                { "list",                VERB_ANY, 1,        0,            verb_list                },
+                { "unlink",              2,        2,        0,            verb_unlink              },
+                { "cleanup",             VERB_ANY, 1,        0,            verb_list                }, /* shares the handler of "list" */
+                { "set-default",         2,        2,        0,            verb_set_efivar          },
+                { "set-oneshot",         2,        2,        0,            verb_set_efivar          },
+                { "set-timeout",         2,        2,        0,            verb_set_efivar          },
+                { "set-timeout-oneshot", 2,        2,        0,            verb_set_efivar          },
+                { "random-seed",         VERB_ANY, 1,        0,            verb_random_seed         },
+                { "systemd-efi-options", VERB_ANY, 2,        0,            verb_systemd_efi_options },
+                { "reboot-to-firmware",  VERB_ANY, 2,        0,            verb_reboot_to_firmware  },
+                {}
+        };
+
+        return dispatch_verb(argc, argv, dispatch_table, NULL);
+}
+
+/* Implements -R/-RR: prints the block device node backing the root file system (-RR: its whole disk). */
+static int print_root_device(void) {
+        _cleanup_free_ char *node = NULL;
+        dev_t devnum;
+        int k;
+
+        k = blockdev_get_root(LOG_ERR, &devnum);
+        if (k < 0)
+                return k;
+        if (k == 0) {
+                log_error("Root file system not backed by a (single) whole block device.");
+                return 80; /* some recognizable error code */
+        }
+
+        if (arg_print_root_device > 1) {
+                k = block_get_whole_disk(devnum, &devnum);
+                if (k < 0)
+                        log_debug_errno(k, "Unable to find whole block device for root block device, ignoring: %m");
+        }
+
+        k = device_path_make_canonical(S_IFBLK, devnum, &node);
+        if (k < 0)
+                return log_error_errno(k,
+                                       "Failed to format canonical device path for devno '" DEVNUM_FORMAT_STR "': %m",
+                                       DEVNUM_FORMAT_VAL(devnum));
+
+        puts(node);
+        return 0;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+        _cleanup_(umount_and_freep) char *mounted_dir = NULL;
+        int r;
+
+        log_setup();
+
+        /* Inside a container EFI variables are left alone by default */
+        if (detect_container() > 0)
+                arg_touch_variables = false;
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        if (arg_print_root_device > 0)
+                return print_root_device();
+
+        if (!arg_image)
+                return bootctl_main(argc, argv);
+
+        assert(!arg_root); /* with --image= the mounted image becomes the root to operate on */
+        r = mount_image_privately_interactively(
+                        arg_image, arg_image_policy,
+                        DISSECT_IMAGE_GENERIC_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK,
+                        &mounted_dir, /* ret_dir_fd= */ NULL, &loop_device);
+        if (r < 0)
+                return r;
+
+        arg_root = strdup(mounted_dir);
+        if (!arg_root)
+                return log_oom();
+
+        return bootctl_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/boot/bootctl.h b/src/boot/bootctl.h
new file mode 100644
index 0000000..e395b33
--- /dev/null
+++ b/src/boot/bootctl.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-id128.h"
+
+#include "boot-entry.h"
+#include "image-policy.h"
+#include "json.h"
+#include "pager.h"
+
+typedef enum InstallSource {
+        ARG_INSTALL_SOURCE_IMAGE, /* --install-source=image */
+        ARG_INSTALL_SOURCE_HOST,  /* --install-source=host */
+        ARG_INSTALL_SOURCE_AUTO,  /* --install-source=auto, also the initial value of arg_install_source */
+} InstallSource;
+
+extern char *arg_esp_path;
+extern char *arg_xbootldr_path;
+extern bool arg_print_esp_path;
+extern bool arg_print_dollar_boot_path;
+extern unsigned arg_print_root_device;
+extern bool arg_touch_variables;
+extern PagerFlags arg_pager_flags;
+extern bool arg_graceful;
+extern bool arg_quiet;
+extern int arg_make_entry_directory; /* tri-state: < 0 for automatic logic */
+extern sd_id128_t arg_machine_id;
+extern char *arg_install_layout;
+extern BootEntryTokenType arg_entry_token_type;
+extern char *arg_entry_token;
+extern JsonFormatFlags arg_json_format_flags;
+extern bool arg_arch_all;
+extern char *arg_root;
+extern char *arg_image;
+extern InstallSource arg_install_source;
+extern char *arg_efi_boot_option_description;
+extern bool arg_dry_run;
+extern ImagePolicy *arg_image_policy;
+
+static inline const char *arg_dollar_boot_path(void) {
+        /* The XBOOTLDR partition takes the role of $BOOT whenever its path is set, the ESP is the fallback */
+        return arg_xbootldr_path ? arg_xbootldr_path : arg_esp_path;
+}
+
+int acquire_esp(int unprivileged_mode, bool graceful, uint32_t *ret_part, uint64_t *ret_pstart, uint64_t *ret_psize, sd_id128_t *ret_uuid, dev_t *ret_devid);
+int acquire_xbootldr(int unprivileged_mode, sd_id128_t *ret_uuid, dev_t *ret_devid);
diff --git a/src/boot/efi/UEFI_SECURITY.md b/src/boot/efi/UEFI_SECURITY.md
new file mode 100644
index 0000000..9f750d8
--- /dev/null
+++ b/src/boot/efi/UEFI_SECURITY.md
@@ -0,0 +1,122 @@
+# UEFI Components Security Posture
+The systemd project provides a UEFI boot menu, `systemd-boot`, and a stub that can wrap a Linux kernel in a
+PE binary, adding various features, `systemd-stub`. These components fully support UEFI SecureBoot, and
+this document will describe their security posture and how they comply with industry-standard expectations
+for UEFI SecureBoot workflows.
+
+Note that `systemd-stub` is not the same as, or an alternative to, the Linux kernel's own EFI stub. The kernel
+stub's role is that of the fundamental entrypoint to kernel execution from UEFI mode, implementing the
+modern Linux boot protocol. `systemd-stub` on the other hand loads various resources, including the kernel
+image, via the EFI LoadImage/StartImage protocol (although it does support the legacy Linux boot protocol,
+as a fallback for older kernels on x86). The purpose of `systemd-stub` is to provide additional features and
+functionality for either or both `systemd-boot` and `systemd` (userspace).
+
+## Fundamental Security Design Goals
+The fundamental security design goals for these components are separation of security policy logic from the
+rest of the functionality, achieved by offloading security-critical tasks to the firmware or earlier stages
+of the boot process (e.g.: `Shim`).
+
+When SecureBoot is enabled, these components are designed to avoid executing, loading or using
+unauthenticated payloads that could compromise the boot process, with special care taken for anything that
+could affect the system before `ExitBootServices()` has been called. For example, when additional resources
+are loaded, if running with SecureBoot enabled, they will be validated before use. The only exceptions are
+the bootloader's own textual configuration files, and parsing metadata out of images for displaying purposes
+only. There are no build time or runtime configuration options that can be set to weaken the security model
+of these components when SecureBoot is enabled.
+
+The role of `systemd-boot` is to discover next stage components in the ESP (and XBOOTLDR if present), via
+filesystem enumeration or explicit configuration files, and present a menu to the user, to choose the next
+step. This auto discovery mechanism is described in detail in the [BLS (Boot Loader
+Specification)](https://uapi-group.org/specifications/specs/boot_loader_specification/).
+
+The role of `systemd-stub` is to load and measure in the TPM the post-bootloader stages, such as the kernel,
+initrd and kernel command line, and implement optional features such as augmenting the initrd with
+additional content such as configuration or optional services. [Unified Kernel
+Images](https://uapi-group.org/specifications/specs/unified_kernel_image/) embed `systemd-stub`, a kernel
+and other optional components as sections in a PE signed binary, that can thus be executed in UEFI
+environments.
+
+Since it is embedded in a PE signed binary, `systemd-stub` will temporarily disable the UEFI authentication
+protocol while loading the payload kernel it wraps, in order to avoid redundant duplicate authentication of
+the image, given that the payload kernel was already authenticated and verified as part of the whole image.
+SecureBoot authentication is re-enabled immediately after the kernel image has been loaded.
+
+Various EFI variables, under the vendor UUID `4a67b082-0a4c-41cf-b6c7-440b29bb8c4f`, are set and read by
+these components, to pass metadata and configuration between different stages of the boot process, as
+defined in the [Boot Loader Interface](https://systemd.io/BOOT_LOADER_INTERFACE/).
+
+## Dependencies
+Neither of these components implements cryptographic primitives, cryptographic checks or drivers. File
+access to the ESP is implemented solely via the appropriate UEFI file protocols. Verification of next stage
+payloads is implemented solely via the appropriate UEFI image load protocols, which means authenticode
+signature checks are again done by the firmware or `Shim`. As a consequence, no external security-critical
+libraries (such as OpenSSL or gnu-efi) are used, linked or embedded.
+
+## Additional Resources
+BLS Type #1 entries allow the user to load two types of additional resources that can affect the system
+before `ExitBootServices()` has been called, kernel command line arguments and Devicetree blobs, that are
+not validated before use, as they do not carry signatures. For this reason, when SecureBoot is enabled,
+loading these resources is automatically disabled. There is no override for this security mechanism, neither
+at build time nor at runtime. Note that initrds are also not verified in BLS Type #1 configurations, for
+compatibility with how SecureBoot has been traditionally handled on Linux-based OSes, as the kernel will
+only load them after `ExitBootServices()` has been called.
+
+Another mechanism is supported by `systemd-boot` and `systemd-stub` to add additional payloads to the boot
+process: `addons`. Addons are PE signed binaries that can carry kernel command line arguments or Devicetree
+blobs (more might be added in the future). In contrast to the user-specified additions in the Type #1 case
+described above, these addons are loaded through the UEFI image loading protocol, and thus are subject to
+signature validation, and will be rejected if not signed or if the signature is invalid, following the
+standard SecureBoot model. They are also measured in the TPM.
+
+`systemd-boot` will also load file system drivers that are stored in the ESP, to allow enhancing the
+firmware's capabilities. These are again PE signed binaries and will be verified using the appropriate
+UEFI protocol.
+
+A random seed will be loaded and passed to the kernel for early-boot entropy pool filling if found in the
+ESP. It is mixed with various other sources of entropy available in the UEFI environment, such as the RNG
+protocol, the boot counter and the clock. Moreover, the seed is updated before the kernel is invoked, as
+well as after the kernel is invoked (from userspace), with a new seed derived from the Linux kernel entropy
+pool.
+
+When operating as a virtual machine payload, the loaded payloads can be customized via `SMBIOS Type 11
+Strings`, if the hypervisor specifies them. This is automatically disabled if running inside a confidential
+computing VM.
+
+## Certificates Enrollment
+When SecureBoot is supported but in `setup` mode, `systemd-boot` can enroll user certificates if a set of
+`PK`, `KEK` and `db` certificates is found in the ESP, after which SecureBoot is enabled and a firmware
+reset is performed. When running on bare metal, the certificate(s) will be shown to the user on the console,
+and manual confirmation will be asked before proceeding. When running as a virtual machine payload,
+enrollment is fully automated, without user interaction, unless disabled via a configuration file in the
+ESP. The configuration file can also be used to disable enrollment completely.
+
+## Compiler Hardening
+The PE binaries are built with `-fstack-protector-strong`, and the stack canary is seeded with random data if
+the UEFI RNG protocol is available.
+
+The binaries also are linked with `-z relro` and ship with native PE relocations, with the conversion from
+ELF performed at build time, instead of containing ELF dynamic relocations, so the image loaded by
+firmware/Shim requires fewer writable pages.
+
+The binaries are linked by default with full LTO support, so no code will be shipped unless it's reachable.
+
+Finally, the binaries ship with the `NX_COMPAT` bit set.
+
+The CI infrastructure also employs fuzz testing on various components, including string functions and the
+BCD parser.
+
+## SBAT
+`systemd-boot` and `systemd-stub` are built with an `SBAT` section by default. There are build options to
+allow customizations of the metadata included in the section, that can be used by downstream distributors.
+The `systemd` project will participate in the coordinated `SBAT` disclosure and metadata revision process as
+deemed necessary, in coordination with the Shim Review group.
+
+The upstream project name used to be unified (`systemd`) for both components, but since version v255 has
+been split into separate `systemd-boot` and `systemd-stub` project names, so that each component can be
+revisioned independently. Most of the code tends to be shared between these two components, but there is no
+complete overlap, so it is possible for a vulnerability to affect only one component but not the other.
+
+## Known Vulnerabilities
+There is currently one known (and fixed) security vulnerability affecting `systemd-boot` on arm64 and
+riscv64 systems. For details of the affected and fixed versions, please see the [published security
+advisory](https://github.com/systemd/systemd/security/advisories/GHSA-6m6p-rjcq-334c).
diff --git a/src/boot/efi/addon.c b/src/boot/efi/addon.c
new file mode 100644
index 0000000..95b29da
--- /dev/null
+++ b/src/boot/efi/addon.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "efi.h"
+#include "version.h"
+
+/* Magic string for recognizing our own binaries */
+DECLARE_NOALLOC_SECTION(".sdmagic", "#### LoaderInfo: systemd-addon " GIT_VERSION " ####");
+
+/* This is intended to carry data, not to be executed: if the image is started anyway, the entry
+ * point does no work and simply reports EFI_UNSUPPORTED. */
+EFIAPI EFI_STATUS efi_main(EFI_HANDLE image, EFI_SYSTEM_TABLE *system_table);
+EFIAPI EFI_STATUS efi_main(EFI_HANDLE image, EFI_SYSTEM_TABLE *system_table) {
+        return EFI_UNSUPPORTED;
+}
diff --git a/src/boot/efi/bcd.c b/src/boot/efi/bcd.c
new file mode 100644
index 0000000..4533d47
--- /dev/null
+++ b/src/boot/efi/bcd.c
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "bcd.h"
+#include "efi-string.h"
+
+enum {
+        SIG_BASE_BLOCK  = 1718052210, /* regf */
+        SIG_KEY         = 27502,      /* nk */
+        SIG_SUBKEY_FAST = 26220,      /* lf */
+        SIG_KEY_VALUE   = 27510,      /* vk */
+};
+
+enum {
+        REG_SZ       = 1,
+        REG_MULTI_SZ = 7,
+};
+
+/* These structs contain a lot more members than we care for. They have all
+ * been squashed into _padN for our convenience. */
+
+typedef struct {
+        uint32_t sig;
+        uint32_t primary_seqnum;
+        uint32_t secondary_seqnum;
+        uint64_t _pad1;
+        uint32_t version_major;
+        uint32_t version_minor;
+        uint32_t type;
+        uint32_t _pad2;
+        uint32_t root_cell_offset;
+        uint64_t _pad3[507];
+} _packed_ BaseBlock;
+assert_cc(sizeof(BaseBlock) == 4096);
+assert_cc(offsetof(BaseBlock, sig) == 0);
+assert_cc(offsetof(BaseBlock, primary_seqnum) == 4);
+assert_cc(offsetof(BaseBlock, secondary_seqnum) == 8);
+assert_cc(offsetof(BaseBlock, version_major) == 20);
+assert_cc(offsetof(BaseBlock, version_minor) == 24);
+assert_cc(offsetof(BaseBlock, type) == 28);
+assert_cc(offsetof(BaseBlock, root_cell_offset) == 36);
+
+/* All offsets are relative to the base block and technically point to a hive
+ * cell struct. But for our use case we don't need to bother about that one,
+ * so skip over the cell_size uint32_t. */
+#define HIVE_CELL_OFFSET (sizeof(BaseBlock) + 4)
+
+typedef struct {
+        uint16_t sig;
+        uint16_t _pad1[13];
+        uint32_t subkeys_offset;
+        uint32_t _pad2;
+        uint32_t n_key_values;
+        uint32_t key_values_offset;
+        uint32_t _pad3[7];
+        uint16_t key_name_len;
+        uint16_t _pad4;
+        char key_name[];
+} _packed_ Key;
+assert_cc(offsetof(Key, sig) == 0);
+assert_cc(offsetof(Key, subkeys_offset) == 28);
+assert_cc(offsetof(Key, n_key_values) == 36);
+assert_cc(offsetof(Key, key_values_offset) == 40);
+assert_cc(offsetof(Key, key_name_len) == 72);
+assert_cc(offsetof(Key, key_name) == 76);
+
+typedef struct {
+        uint16_t sig;
+        uint16_t n_entries;
+        struct SubkeyFastEntry {
+                uint32_t key_offset;
+                char name_hint[4];
+        } _packed_ entries[];
+} _packed_ SubkeyFast;
+assert_cc(offsetof(SubkeyFast, sig) == 0);
+assert_cc(offsetof(SubkeyFast, n_entries) == 2);
+assert_cc(offsetof(SubkeyFast, entries) == 4);
+
+typedef struct {
+        uint16_t sig;
+        uint16_t name_len;
+        uint32_t data_size;
+        uint32_t data_offset;
+        uint32_t data_type;
+        uint32_t _pad;
+        char name[];
+} _packed_ KeyValue;
+assert_cc(offsetof(KeyValue, sig) == 0);
+assert_cc(offsetof(KeyValue, name_len) == 2);
+assert_cc(offsetof(KeyValue, data_size) == 4);
+assert_cc(offsetof(KeyValue, data_offset) == 8);
+assert_cc(offsetof(KeyValue, data_type) == 12);
+assert_cc(offsetof(KeyValue, name) == 20);
+
+#define BAD_OFFSET(offset, len, max) \
+        ((uint64_t) (offset) + (len) >= (max))
+
+#define BAD_STRUCT(type, offset, max) \
+        ((uint64_t) (offset) + sizeof(type) >= (max))
+
+#define BAD_ARRAY(type, array, offset, array_len, max) \
+        ((uint64_t) (offset) + offsetof(type, array) + \
+         sizeof((type){}.array[0]) * (uint64_t) (array_len) >= (max))
+
+static const Key *get_key(const uint8_t *bcd, uint32_t bcd_len, uint32_t offset, const char *name);
+
+static const Key *get_subkey(const uint8_t *bcd, uint32_t bcd_len, uint32_t offset, const char *name) {
+        assert(bcd);
+        assert(name);
+
+        /* The fixed-size header must be in bounds before its signature and entry count are read. */
+        if (BAD_STRUCT(SubkeyFast, offset, bcd_len))
+                return NULL;
+
+        const SubkeyFast *list = (const SubkeyFast *) (bcd + offset);
+        if (list->sig != SIG_SUBKEY_FAST ||
+            BAD_ARRAY(SubkeyFast, entries, offset, list->n_entries, bcd_len))
+                return NULL;
+
+        /* Only up to sizeof(name_hint) characters are compared here, so a match merely marks a
+         * candidate; get_key() then checks the complete key name and continues the lookup. */
+        for (uint16_t n = 0; n < list->n_entries; n++) {
+                if (strncaseeq8(name, list->entries[n].name_hint, sizeof(list->entries[n].name_hint))) {
+                        const Key *found = get_key(bcd, bcd_len, list->entries[n].key_offset, name);
+                        if (found)
+                                return found;
+                }
+        }
+
+        return NULL;
+}
+
+/* Registry paths are passed with NUL as the component separator, for convenience. A leading NUL
+ * starts the lookup at the root, and the path must end with two NULs. The lookup depth is not
+ * restricted, so name must be properly validated before calling get_key(). */
+static const Key *get_key(const uint8_t *bcd, uint32_t bcd_len, uint32_t offset, const char *name) {
+        assert(bcd);
+        assert(name);
+
+        if (BAD_STRUCT(Key, offset, bcd_len))
+                return NULL;
+
+        const Key *node = (const Key *) (bcd + offset);
+        if (node->sig != SIG_KEY ||
+            BAD_ARRAY(Key, key_name, offset, node->key_name_len, bcd_len))
+                return NULL;
+
+        /* An empty path component is accepted for any key; a non-empty one has to match this key's
+         * name via strncaseeq8() and have exactly the same length. */
+        if (*name) {
+                if (!strncaseeq8(name, node->key_name, node->key_name_len) || strlen8(name) != node->key_name_len)
+                        return NULL;
+                name += node->key_name_len;
+        }
+
+        /* Step over the separator: an empty remainder means this key is the result. */
+        name++;
+        return *name ? get_subkey(bcd, bcd_len, node->subkeys_offset, name) : node;
+}
+
+static const KeyValue *get_key_value(const uint8_t *bcd, uint32_t bcd_len, const Key *key, const char *name) {
+        assert(bcd);
+        assert(key);
+        assert(name);
+        /* Look up the value called name among the values of key; NULL if none validates and matches. */
+        if (key->n_key_values == 0)
+                return NULL;
+        /* The list of value offsets must lie within bcd and be aligned so it can be read as uint32_t. */
+        if (BAD_OFFSET(key->key_values_offset, sizeof(uint32_t) * (uint64_t) key->n_key_values, bcd_len) ||
+            (uintptr_t) (bcd + key->key_values_offset) % alignof(uint32_t) != 0)
+                return NULL;
+
+        const uint32_t *key_value_list = (const uint32_t *) (bcd + key->key_values_offset);
+        for (uint32_t i = 0; i < key->n_key_values; i++) {
+                uint32_t offset = *(key_value_list + i);
+                /* Entries that fail any check below are skipped instead of aborting the whole lookup. */
+                if (BAD_STRUCT(KeyValue, offset, bcd_len))
+                        continue;
+
+                const KeyValue *kv = (const KeyValue *) (bcd + offset);
+                if (kv->sig != SIG_KEY_VALUE)
+                        continue;
+
+                if (BAD_ARRAY(KeyValue, name, offset, kv->name_len, bcd_len))
+                        continue;
+
+                /* If most significant bit is set, data is stored in data_offset itself, but
+                 * we are only interested in UTF16 strings. The only strings that could fit
+                 * would have just one char in it, so let's not bother with this. */
+                if (FLAGS_SET(kv->data_size, UINT32_C(1) << 31))
+                        continue;
+                /* Bounds-check the data here; get_bcd_title() reads it without checking bounds again. */
+                if (BAD_OFFSET(kv->data_offset, kv->data_size, bcd_len))
+                        continue;
+
+                if (strncaseeq8(name, kv->name, kv->name_len) && strlen8(name) == kv->name_len)
+                        return kv;
+        }
+
+        return NULL;
+}
+
+/* The BCD store is really just a regular windows registry hive with a rather cryptic internal
+ * key structure. On a running system it gets mounted to HKEY_LOCAL_MACHINE\BCD00000000.
+ *
+ * Of interest to us are these two keys:
+ * - \Objects\{bootmgr}\Elements\24000001
+ *   This key is the "displayorder" property and contains a value of type REG_MULTI_SZ
+ *   with the name "Element" that holds a {GUID} list (UTF16, NUL-separated).
+ * - \Objects\{GUID}\Elements\12000004
+ *   This key is the "description" property and contains a value of type REG_SZ with the
+ *   name "Element" that holds a NUL-terminated UTF16 string.
+ *
+ * The GUIDs and properties are as reported by "bcdedit.exe /v".
+ *
+ * To get a title for the BCD store we first look at the displayorder property of {bootmgr}
+ * (it always has the GUID 9dea862c-5cdd-4e70-acc1-f32b344d4795). If it contains more than
+ * one GUID, the BCD is multi-boot and we stop looking. Otherwise we take that GUID, look it
+ * up, and return its description property. */
+char16_t *get_bcd_title(uint8_t *bcd, size_t bcd_len) {
+        assert(bcd);
+
+        if (HIVE_CELL_OFFSET >= bcd_len)
+                return NULL;
+
+        BaseBlock *base_block = (BaseBlock *) bcd;
+        if (base_block->sig != SIG_BASE_BLOCK ||
+            base_block->version_major != 1 ||
+            base_block->version_minor != 3 ||
+            base_block->type != 0 ||
+            base_block->primary_seqnum != base_block->secondary_seqnum)
+                return NULL;
+
+        bcd += HIVE_CELL_OFFSET;
+        bcd_len -= HIVE_CELL_OFFSET;
+
+        const Key *objects_key = get_key(bcd, bcd_len, base_block->root_cell_offset, "\0Objects\0");
+        if (!objects_key)
+                return NULL;
+
+        const Key *displayorder_key = get_subkey(
+                        bcd,
+                        bcd_len,
+                        objects_key->subkeys_offset,
+                        "{9dea862c-5cdd-4e70-acc1-f32b344d4795}\0Elements\00024000001\0");
+        if (!displayorder_key)
+                return NULL;
+
+        const KeyValue *displayorder_value = get_key_value(bcd, bcd_len, displayorder_key, "Element");
+        if (!displayorder_value)
+                return NULL;
+
+        char order_guid[sizeof("{00000000-0000-0000-0000-000000000000}\0")];
+        if (displayorder_value->data_type != REG_MULTI_SZ ||
+            displayorder_value->data_size != sizeof(char16_t[sizeof(order_guid)]) ||
+            (uintptr_t) (bcd + displayorder_value->data_offset) % alignof(char16_t) != 0)
+                /* BCD is multi-boot. */
+                return NULL;
+
+        /* Keys are stored as ASCII in registry hives if the data fits (and GUIDS always should). */
+        char16_t *order_guid_utf16 = (char16_t *) (bcd + displayorder_value->data_offset);
+        for (size_t i = 0; i < sizeof(order_guid) - 2; i++) {
+                char16_t c = order_guid_utf16[i];
+                switch (c) {
+                case '-':
+                case '{':
+                case '}':
+                case '0' ... '9':
+                case 'a' ... 'f':
+                case 'A' ... 'F':
+                        order_guid[i] = c;
+                        break;
+                default:
+                        /* Not a valid GUID. */
+                        return NULL;
+                }
+        }
+        /* Our functions expect the lookup key to be terminated by two NULs. */
+        order_guid[sizeof(order_guid) - 2] = '\0';
+        order_guid[sizeof(order_guid) - 1] = '\0';
+
+        const Key *default_key = get_subkey(bcd, bcd_len, objects_key->subkeys_offset, order_guid);
+        if (!default_key)
+                return NULL;
+
+        const Key *description_key = get_subkey(
+                        bcd, bcd_len, default_key->subkeys_offset, "Elements\00012000004\0");
+        if (!description_key)
+                return NULL;
+
+        const KeyValue *description_value = get_key_value(bcd, bcd_len, description_key, "Element");
+        if (!description_value)
+                return NULL;
+
+        if (description_value->data_type != REG_SZ ||
+            description_value->data_size < sizeof(char16_t) ||
+            description_value->data_size % sizeof(char16_t) != 0 ||
+            (uintptr_t) (bcd + description_value->data_offset) % alignof(char16_t))
+                return NULL;
+
+        /* The data should already be NUL-terminated. */
+        char16_t *title = (char16_t *) (bcd + description_value->data_offset);
+        title[description_value->data_size / sizeof(char16_t) - 1] = '\0';
+        return title;
+}
diff --git a/src/boot/efi/bcd.h b/src/boot/efi/bcd.h
new file mode 100644
index 0000000..bb12d89
--- /dev/null
+++ b/src/boot/efi/bcd.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+char16_t *get_bcd_title(uint8_t *bcd, size_t bcd_len);
diff --git a/src/boot/efi/boot.c b/src/boot/efi/boot.c
new file mode 100644
index 0000000..5c0f0ab
--- /dev/null
+++ b/src/boot/efi/boot.c
@@ -0,0 +1,2748 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bcd.h"
+#include "bootspec-fundamental.h"
+#include "console.h"
+#include "device-path-util.h"
+#include "devicetree.h"
+#include "drivers.h"
+#include "efivars-fundamental.h"
+#include "graphics.h"
+#include "initrd.h"
+#include "linux.h"
+#include "measure.h"
+#include "part-discovery.h"
+#include "pe.h"
+#include "proto/block-io.h"
+#include "proto/device-path.h"
+#include "proto/simple-text-io.h"
+#include "random-seed.h"
+#include "sbat.h"
+#include "secure-boot.h"
+#include "shim.h"
+#include "ticks.h"
+#include "tpm2-pcr.h"
+#include "util.h"
+#include "version.h"
+#include "vmm.h"
+
+/* Magic string for recognizing our own binaries */
+#define SD_MAGIC "#### LoaderInfo: systemd-boot " GIT_VERSION " ####"
+DECLARE_NOALLOC_SECTION(".sdmagic", SD_MAGIC);
+
+/* Makes systemd-boot available from \EFI\Linux\ for testing purposes. */
+DECLARE_NOALLOC_SECTION(
+                ".osrel",
+                "ID=systemd-boot\n"
+                "VERSION=\"" GIT_VERSION "\"\n"
+                "NAME=\"systemd-boot " GIT_VERSION "\"\n");
+
+DECLARE_SBAT(SBAT_BOOT_SECTION_TEXT);
+
+typedef enum LoaderType {
+        LOADER_UNDEFINED,
+        LOADER_AUTO,
+        LOADER_EFI,
+        LOADER_LINUX,         /* Boot loader spec type #1 entries */
+        LOADER_UNIFIED_LINUX, /* Boot loader spec type #2 entries */
+        LOADER_SECURE_BOOT_KEYS,
+        _LOADER_TYPE_MAX,
+} LoaderType;
+
+typedef struct {
+        char16_t *id;         /* The unique identifier for this entry (typically the filename of the file defining the entry) */
+        char16_t *title_show; /* The string to actually display (this is made unique before showing) */
+        char16_t *title;      /* The raw (human readable) title string of the entry (not necessarily unique) */
+        char16_t *sort_key;   /* The string to use as primary sort key, usually ID= from os-release, possibly suffixed */
+        char16_t *version;    /* The raw (human readable) version string of the entry */
+        char16_t *machine_id;
+        EFI_HANDLE *device;
+        LoaderType type;
+        char16_t *loader;
+        char16_t *devicetree;
+        char16_t *options;
+        bool options_implied; /* If true, these options are implied if we invoke the PE binary without any parameters (as in: UKI). If false we must specify these options explicitly. */
+        char16_t **initrd;
+        char16_t key;
+        EFI_STATUS (*call)(void);
+        int tries_done;
+        int tries_left;
+        char16_t *path;
+        char16_t *current_name;
+        char16_t *next_name;
+} BootEntry;
+
+typedef struct {
+        BootEntry **entries;
+        size_t n_entries;
+        size_t idx_default;
+        size_t idx_default_efivar;
+        uint64_t timeout_sec; /* Actual timeout used (efi_main() override > efivar > config). */
+        uint64_t timeout_sec_config;
+        uint64_t timeout_sec_efivar;
+        char16_t *entry_default_config;
+        char16_t *entry_default_efivar;
+        char16_t *entry_oneshot;
+        char16_t *entry_saved;
+        bool editor;
+        bool auto_entries;
+        bool auto_firmware;
+        bool auto_poweroff;
+        bool auto_reboot;
+        bool reboot_for_bitlocker;
+        secure_boot_enroll secure_boot_enroll;
+        bool force_menu;
+        bool use_saved_entry;
+        bool use_saved_entry_efivar;
+        bool beep;
+        int64_t console_mode;
+        int64_t console_mode_efivar;
+} Config;
+
+/* These values have been chosen so that the transitions the user sees could employ unsigned over-/underflow
+ * like this:
+ * efivar unset ↔ force menu ↔ no timeout/skip menu ↔ 1 s ↔ 2 s ↔ …
+ *
+ * Note: all the values below are ABI, so they are not allowed to change. The bootctl tool sets the numerical
+ * value of TIMEOUT_MENU_FORCE and TIMEOUT_MENU_HIDDEN, instead of the string for compatibility reasons.
+ *
+ * The other values may be set by systemd-boot itself and changing those will lead to functional regression
+ * when new version of systemd-boot is installed.
+ *
+ * All the 64bit values are not ABI and will never be written to an efi variable.
+ */
+enum {
+        TIMEOUT_MIN           = 1,
+        TIMEOUT_MAX           = UINT32_MAX - 2U,
+        TIMEOUT_UNSET         = UINT32_MAX - 1U,
+        TIMEOUT_MENU_FORCE    = UINT32_MAX,
+        TIMEOUT_MENU_HIDDEN   = 0,
+        TIMEOUT_TYPE_MAX      = UINT32_MAX,
+        TIMEOUT_MENU_DISABLED = (uint64_t)UINT32_MAX + 1U,
+        TIMEOUT_TYPE_MAX64    = UINT64_MAX,
+};
+
+enum {
+        IDX_MAX = INT16_MAX,
+        IDX_INVALID,
+};
+
+static void cursor_left(size_t *cursor, size_t *first) {
+        assert(cursor);
+        assert(first);
+
+        /* Step left inside the visible window; once the cursor sits at column 0, scroll the window. */
+        size_t *pos = *cursor > 0 ? cursor : first;
+        if (*pos > 0)
+                --*pos;
+}
+
+static void cursor_right(size_t *cursor, size_t *first, size_t x_max, size_t len) {
+        assert(cursor);
+        assert(first);
+
+        /* Step right inside the visible window while there is room, otherwise scroll the window. */
+        bool room = *cursor + 1 < x_max;
+        if (room || *first + *cursor < len)
+                ++*(room ? cursor : first);
+}
+
+static bool line_edit(char16_t **line_in, size_t x_max, size_t y_pos) {
+        _cleanup_free_ char16_t *line = NULL, *print = NULL;
+        size_t size, len, first = 0, cursor = 0, clear = 0;
+
+        /* Edit the line and return true if it should be executed, false if not. */
+
+        assert(line_in);
+
+        len = strlen16(*line_in);
+        size = len + 1024;
+        line = xnew(char16_t, size);
+        print = xnew(char16_t, x_max + 1);
+        strcpy16(line, strempty(*line_in));
+
+        for (;;) {
+                EFI_STATUS err;
+                uint64_t key;
+                size_t j, cursor_color = EFI_TEXT_ATTR_SWAP(COLOR_EDIT);
+
+                j = MIN(len - first, x_max);
+                memcpy(print, line + first, j * sizeof(char16_t));
+                while (clear > 0 && j < x_max) {
+                        clear--;
+                        print[j++] = ' ';
+                }
+                print[j] = '\0';
+
+                /* See comment at line_edit() call site for why we start at 1. */
+                print_at(1, y_pos, COLOR_EDIT, print);
+
+                if (!print[cursor])
+                        print[cursor] = ' ';
+                print[cursor+1] = '\0';
+                do {
+                        print_at(cursor + 1, y_pos, cursor_color, print + cursor);
+                        cursor_color = EFI_TEXT_ATTR_SWAP(cursor_color);
+
+                        err = console_key_read(&key, 750 * 1000);
+                        if (!IN_SET(err, EFI_SUCCESS, EFI_TIMEOUT, EFI_NOT_READY))
+                                return false;
+
+                        print_at(cursor + 1, y_pos, COLOR_EDIT, print + cursor);
+                } while (err != EFI_SUCCESS);
+
+                switch (key) {
+                case KEYPRESS(0, SCAN_ESC, 0):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'c'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'g'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('c')):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('g')):
+                        return false;
+
+                case KEYPRESS(0, SCAN_HOME, 0):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'a'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('a')):
+                        /* beginning-of-line */
+                        cursor = 0;
+                        first = 0;
+                        continue;
+
+                case KEYPRESS(0, SCAN_END, 0):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'e'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('e')):
+                        /* end-of-line */
+                        cursor = len - first;
+                        if (cursor+1 >= x_max) {
+                                cursor = x_max-1;
+                                first = len - (x_max-1);
+                        }
+                        continue;
+
+                case KEYPRESS(0, SCAN_DOWN, 0):
+                case KEYPRESS(EFI_ALT_PRESSED, 0, 'f'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, SCAN_RIGHT, 0):
+                        /* forward-word */
+                        while (line[first + cursor] == ' ')
+                                cursor_right(&cursor, &first, x_max, len);
+                        while (line[first + cursor] && line[first + cursor] != ' ')
+                                cursor_right(&cursor, &first, x_max, len);
+                        continue;
+
+                case KEYPRESS(0, SCAN_UP, 0):
+                case KEYPRESS(EFI_ALT_PRESSED, 0, 'b'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, SCAN_LEFT, 0):
+                        /* backward-word */
+                        if ((first + cursor) > 0 && line[first + cursor-1] == ' ') {
+                                cursor_left(&cursor, &first);
+                                while ((first + cursor) > 0 && line[first + cursor] == ' ')
+                                        cursor_left(&cursor, &first);
+                        }
+                        while ((first + cursor) > 0 && line[first + cursor-1] != ' ')
+                                cursor_left(&cursor, &first);
+                        continue;
+
+                case KEYPRESS(0, SCAN_RIGHT, 0):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'f'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('f')):
+                        /* forward-char */
+                        if (first + cursor == len)
+                                continue;
+                        cursor_right(&cursor, &first, x_max, len);
+                        continue;
+
+                case KEYPRESS(0, SCAN_LEFT, 0):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'b'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('b')):
+                        /* backward-char */
+                        cursor_left(&cursor, &first);
+                        continue;
+
+                case KEYPRESS(EFI_CONTROL_PRESSED, SCAN_DELETE, 0):
+                case KEYPRESS(EFI_ALT_PRESSED, 0, 'd'):
+                        /* kill-word */
+                        clear = 0;
+
+                        size_t k;
+                        for (k = first + cursor; k < len && line[k] == ' '; k++)
+                                clear++;
+                        for (; k < len && line[k] != ' '; k++)
+                                clear++;
+
+                        for (size_t i = first + cursor; i + clear < len; i++)
+                                line[i] = line[i + clear];
+                        len -= clear;
+                        line[len] = '\0';
+                        continue;
+
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'w'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('w')):
+                case KEYPRESS(EFI_ALT_PRESSED, 0, '\b'):
+                        /* backward-kill-word */
+                        clear = 0;
+                        if ((first + cursor) > 0 && line[first + cursor-1] == ' ') {
+                                cursor_left(&cursor, &first);
+                                clear++;
+                                while ((first + cursor) > 0 && line[first + cursor] == ' ') {
+                                        cursor_left(&cursor, &first);
+                                        clear++;
+                                }
+                        }
+                        while ((first + cursor) > 0 && line[first + cursor-1] != ' ') {
+                                cursor_left(&cursor, &first);
+                                clear++;
+                        }
+
+                        for (size_t i = first + cursor; i + clear < len; i++)
+                                line[i] = line[i + clear];
+                        len -= clear;
+                        line[len] = '\0';
+                        continue;
+
+                case KEYPRESS(0, SCAN_DELETE, 0):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'd'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('d')):
+                        if (len == 0)
+                                continue;
+                        if (first + cursor == len)
+                                continue;
+                        for (size_t i = first + cursor; i < len; i++)
+                                line[i] = line[i+1];
+                        clear = 1;
+                        len--;
+                        continue;
+
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'k'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('k')):
+                        /* kill-line */
+                        line[first + cursor] = '\0';
+                        clear = len - (first + cursor);
+                        len = first + cursor;
+                        continue;
+
+                case KEYPRESS(0, 0, '\n'):
+                case KEYPRESS(0, 0, '\r'):
+                case KEYPRESS(0, SCAN_F3, 0): /* EZpad Mini 4s firmware sends malformed events */
+                case KEYPRESS(0, SCAN_F3, '\r'): /* Teclast X98+ II firmware sends malformed events */
+                        if (!streq16(line, *line_in)) {
+                                free(*line_in);
+                                *line_in = TAKE_PTR(line);
+                        }
+                        return true;
+
+                case KEYPRESS(0, 0, '\b'):
+                        if (len == 0)
+                                continue;
+                        if (first == 0 && cursor == 0)
+                                continue;
+                        for (size_t i = first + cursor-1; i < len; i++)
+                                line[i] = line[i+1];
+                        clear = 1;
+                        len--;
+                        if (cursor > 0)
+                                cursor--;
+                        if (cursor > 0 || first == 0)
+                                continue;
+                        /* show full line if it fits */
+                        if (len < x_max) {
+                                cursor = first;
+                                first = 0;
+                                continue;
+                        }
+                        /* jump left to see what we delete */
+                        if (first > 10) {
+                                first -= 10;
+                                cursor = 10;
+                        } else {
+                                cursor = first;
+                                first = 0;
+                        }
+                        continue;
+
+                case KEYPRESS(0, 0, ' ') ... KEYPRESS(0, 0, '~'):
+                case KEYPRESS(0, 0, 0x80) ... KEYPRESS(0, 0, 0xffff):
+                        if (len+1 == size)
+                                continue;
+                        for (size_t i = len; i > first + cursor; i--)
+                                line[i] = line[i-1];
+                        line[first + cursor] = KEYCHAR(key);
+                        len++;
+                        line[len] = '\0';
+                        if (cursor+1 < x_max)
+                                cursor++;
+                        else if (first + cursor < len)
+                                first++;
+                        continue;
+                }
+        }
+}
+
+/* Translate a key press into an entry index: '1'..'9' pick an entry by position (clamped to the last
+ * one), any other character is compared against the entries' shortcut keys, first from 'start' to the
+ * end and then from the top up to 'start'. Returns IDX_INVALID if nothing can be selected. */
+static size_t entry_lookup_key(Config *config, size_t start, char16_t key) {
+        assert(config);
+
+        /* An empty list cannot yield an index; bail out before the "- 1" below wraps around. */
+        if (key == 0 || config->n_entries == 0)
+                return IDX_INVALID;
+
+        /* select entry by number key */
+        if (key >= '1' && key <= '9')
+                return MIN((size_t) (key - '0'), config->n_entries) - 1;
+
+        /* find matching key in boot entries */
+        for (size_t i = start; i < config->n_entries; i++)
+                if (config->entries[i]->key == key)
+                        return i;
+
+        for (size_t i = 0; i < start; i++)
+                if (config->entries[i]->key == key)
+                        return i;
+
+        return IDX_INVALID;
+}
+
+/* Move the menu timeout one step up (inc) or down, copy the result to timeout_sec_efivar and return a
+ * freshly allocated status line that describes the new setting. */
+static char16_t* update_timeout_efivar(Config *config, bool inc) {
+        assert(config);
+
+        switch (config->timeout_sec) {
+        case TIMEOUT_MAX: /* incrementing has no effect here */
+                if (!inc)
+                        config->timeout_sec--;
+                break;
+        case TIMEOUT_UNSET: /* decrementing leaves the timeout unset */
+                if (inc)
+                        config->timeout_sec = TIMEOUT_MENU_FORCE;
+                break;
+        case TIMEOUT_MENU_FORCE: /* decrementing keeps the menu forced */
+                if (inc)
+                        config->timeout_sec = TIMEOUT_MENU_HIDDEN;
+                break;
+        case TIMEOUT_MENU_DISABLED:
+        case TIMEOUT_MENU_HIDDEN:
+                config->timeout_sec = inc ? TIMEOUT_MIN : TIMEOUT_MENU_FORCE;
+                break;
+        default:
+                config->timeout_sec += inc ? 1 : -1;
+        }
+
+        config->timeout_sec_efivar = config->timeout_sec;
+
+        if (config->timeout_sec == TIMEOUT_UNSET)
+                return xstrdup16(u"Menu timeout defined by configuration file.");
+        if (config->timeout_sec == TIMEOUT_MENU_FORCE)
+                return xstrdup16(u"Timeout disabled, menu will always be shown.");
+        if (config->timeout_sec == TIMEOUT_MENU_HIDDEN)
+                return xstrdup16(u"Menu hidden. Hold down key at bootup to show menu.");
+        if (config->timeout_sec == TIMEOUT_MENU_DISABLED)
+                assert_not_reached();
+        return xasprintf("Menu timeout set to %u s.", (uint32_t)config->timeout_sec_efivar);
+}
+
+/* Whether the console can print box drawing characters; the firmware is asked once, then cached. */
+static bool unicode_supported(void) {
+        static int supported = -1; /* negative until the first query has been made */
+
+        if (supported >= 0)
+                return supported;
+        /* The spec mandates basic unicode box drawing support, but making sure it works does not hurt. */
+        supported = ST->ConOut->TestString(ST->ConOut, (char16_t *) u"─") == EFI_SUCCESS;
+        return supported;
+}
+
+/* Print the pager prompt and wait for a key; false means stop paging (ESC, q, Q or a failed read). */
+static bool ps_continue(void) {
+        uint64_t pressed;
+        const char16_t *rule = unicode_supported() ? u"───" : u"---";
+        printf("\n%ls Press any key to continue, ESC or q to quit. %ls\n\n", rule, rule);
+        bool ok = console_key_read(&pressed, UINT64_MAX) == EFI_SUCCESS;
+        return ok && !IN_SET(pressed, KEYPRESS(0, SCAN_ESC, 0), KEYPRESS(0, 0, 'q'), KEYPRESS(0, 0, 'Q'));
+}
+
+/* Print "<label>: <timeout>", naming the special timeout values; an unset timeout prints nothing. */
+static void print_timeout_status(const char *label, uint64_t t) {
+        if (t == TIMEOUT_UNSET)
+                return;
+
+        if (t == TIMEOUT_MENU_DISABLED)
+                printf("%s: menu-disabled\n", label);
+        else if (t == TIMEOUT_MENU_FORCE)
+                printf("%s: menu-force\n", label);
+        else if (t == TIMEOUT_MENU_HIDDEN)
+                printf("%s: menu-hidden\n", label);
+        else
+                printf("%s: %u s\n", label, (uint32_t) t);
+}
+
+static void print_status(Config *config, char16_t *loaded_image_path) {
+        size_t x_max, y_max;
+        uint32_t screen_width = 0, screen_height = 0;
+        SecureBootMode secure;
+        _cleanup_free_ char16_t *device_part_uuid = NULL;
+        /* Paged status dump: loader/firmware facts, then the configuration, then one page per boot entry. */
+        assert(config);
+
+        clear_screen(COLOR_NORMAL);
+        console_query_mode(&x_max, &y_max);
+        query_screen_resolution(&screen_width, &screen_height);
+        /* The LoaderDevicePartUUID read is best effort: without a value its line is simply not printed. */
+        secure = secure_boot_mode();
+        (void) efivar_get(MAKE_GUID_PTR(LOADER), u"LoaderDevicePartUUID", &device_part_uuid);
+        /* Page 1: loader, firmware and console details. */
+        printf("  systemd-boot version: " GIT_VERSION "\n");
+        if (loaded_image_path)
+                printf("          loaded image: %ls\n", loaded_image_path);
+        if (device_part_uuid)
+                printf(" loader partition UUID: %ls\n", device_part_uuid);
+        printf("          architecture: " EFI_MACHINE_TYPE_NAME "\n");
+        printf("    UEFI specification: %u.%02u\n", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff);
+        printf("       firmware vendor: %ls\n", ST->FirmwareVendor);
+        printf("      firmware version: %u.%02u\n", ST->FirmwareRevision >> 16, ST->FirmwareRevision & 0xffff);
+        printf("        OS indications: %#" PRIx64 "\n", get_os_indications_supported());
+        printf("           secure boot: %ls (%ls)\n",
+                        yes_no(IN_SET(secure, SECURE_BOOT_USER, SECURE_BOOT_DEPLOYED)),
+                        secure_boot_mode_to_string(secure));
+        printf("                  shim: %ls\n", yes_no(shim_loaded()));
+        printf("                   TPM: %ls\n", yes_no(tpm_present()));
+        printf("          console mode: %i/%" PRIi64 " (%zux%zu @%ux%u)\n",
+                        ST->ConOut->Mode->Mode, ST->ConOut->Mode->MaxMode - INT64_C(1),
+                        x_max, y_max, screen_width, screen_height);
+
+        if (!ps_continue())
+                return;
+        /* Page 2: settings from the configuration file and from EFI variables. */
+        print_timeout_status("      timeout (config)", config->timeout_sec_config);
+        print_timeout_status("     timeout (EFI var)", config->timeout_sec_efivar);
+
+        if (config->entry_default_config)
+                printf("      default (config): %ls\n", config->entry_default_config);
+        if (config->entry_default_efivar)
+                printf("     default (EFI var): %ls\n", config->entry_default_efivar);
+        if (config->entry_oneshot)
+                printf("    default (one-shot): %ls\n", config->entry_oneshot);
+        if (config->entry_saved)
+                printf("           saved entry: %ls\n", config->entry_saved);
+        printf("                editor: %ls\n", yes_no(config->editor));
+        printf("          auto-entries: %ls\n", yes_no(config->auto_entries));
+        printf("         auto-firmware: %ls\n", yes_no(config->auto_firmware));
+        printf("         auto-poweroff: %ls\n", yes_no(config->auto_poweroff));
+        printf("           auto-reboot: %ls\n", yes_no(config->auto_reboot));
+        printf("                  beep: %ls\n", yes_no(config->beep));
+        printf("  reboot-for-bitlocker: %ls\n", yes_no(config->reboot_for_bitlocker));
+
+        switch (config->secure_boot_enroll) {
+        case ENROLL_OFF:
+                printf("    secure-boot-enroll: off\n");
+                break;
+        case ENROLL_MANUAL:
+                printf("    secure-boot-enroll: manual\n");
+                break;
+        case ENROLL_IF_SAFE:
+                printf("    secure-boot-enroll: if-safe\n");
+                break;
+        case ENROLL_FORCE:
+                printf("    secure-boot-enroll: force\n");
+                break;
+        default:
+                assert_not_reached();
+        }
+
+        switch (config->console_mode) {
+        case CONSOLE_MODE_AUTO:
+                printf(" console-mode (config): auto\n");
+                break;
+        case CONSOLE_MODE_KEEP:
+                printf(" console-mode (config): keep\n");
+                break;
+        case CONSOLE_MODE_FIRMWARE_MAX:
+                printf(" console-mode (config): max\n");
+                break;
+        default:
+                printf(" console-mode (config): %" PRIi64 "\n", config->console_mode);
+                break;
+        }
+
+        /* EFI var console mode is always a concrete value or unset. */
+        if (config->console_mode_efivar != CONSOLE_MODE_KEEP)
+                printf("console-mode (EFI var): %" PRIi64 "\n", config->console_mode_efivar);
+
+        if (!ps_continue())
+                return;
+        /* Remaining pages: one per boot entry; ps_continue() returning false ends the dump early. */
+        for (size_t i = 0; i < config->n_entries; i++) {
+                BootEntry *entry = config->entries[i];
+                EFI_DEVICE_PATH *dp = NULL;
+                _cleanup_free_ char16_t *dp_str = NULL;
+                /* The "device" line is only printed if a device path string could be produced here. */
+                if (entry->device &&
+                    BS->HandleProtocol(entry->device, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL), (void **) &dp) ==
+                                    EFI_SUCCESS)
+                        (void) device_path_to_str(dp, &dp_str);
+
+                printf("    boot entry: %zu/%zu\n", i + 1, config->n_entries);
+                printf("            id: %ls\n", entry->id);
+                if (entry->title)
+                        printf("         title: %ls\n", entry->title);
+                if (entry->title_show && !streq16(entry->title, entry->title_show))
+                        printf("    title show: %ls\n", entry->title_show);
+                if (entry->sort_key)
+                        printf("      sort key: %ls\n", entry->sort_key);
+                if (entry->version)
+                        printf("       version: %ls\n", entry->version);
+                if (entry->machine_id)
+                        printf("    machine-id: %ls\n", entry->machine_id);
+                if (dp_str)
+                        printf("        device: %ls\n", dp_str);
+                if (entry->loader)
+                        printf("        loader: %ls\n", entry->loader);
+                STRV_FOREACH(initrd, entry->initrd)
+                        printf("        initrd: %ls\n", *initrd);
+                if (entry->devicetree)
+                        printf("    devicetree: %ls\n", entry->devicetree);
+                if (entry->options)
+                        printf("       options: %ls\n", entry->options);
+                printf(" internal call: %ls\n", yes_no(!!entry->call));
+                /* A non-negative tries_left is what this dump reports as "counting boots: yes". */
+                printf("counting boots: %ls\n", yes_no(entry->tries_left >= 0));
+                if (entry->tries_left >= 0) {
+                        printf("         tries: %i left, %i done\n", entry->tries_left, entry->tries_done);
+                        printf("  current path: %ls\\%ls\n", entry->path, entry->current_name);
+                        printf("     next path: %ls\\%ls\n", entry->path, entry->next_name);
+                }
+
+                if (!ps_continue())
+                        return;
+        }
+}
+
+/* Add the "boot to firmware UI" flag to the OsIndications EFI variable and log if writing it fails. */
+static EFI_STATUS set_reboot_into_firmware(void) {
+        uint64_t indications = 0;
+
+        /* A failed read is ignored on purpose; 'indications' was preset to 0 for that case. */
+        (void) efivar_get_uint64_le(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"OsIndications", &indications);
+        EFI_STATUS r = efivar_set_uint64_le(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"OsIndications",
+                        indications | EFI_OS_INDICATIONS_BOOT_TO_FW_UI, EFI_VARIABLE_NON_VOLATILE);
+        if (r != EFI_SUCCESS)
+                log_error_status(r, "Error setting OsIndications: %m");
+        return r;
+}
+
+_noreturn_ static EFI_STATUS poweroff_system(void) { /* ask the firmware to shut the machine down */
+        RT->ResetSystem(EfiResetShutdown, EFI_SUCCESS, 0, NULL);
+        assert_not_reached(); /* ResetSystem() is not expected to return */
+}
+
+_noreturn_ static EFI_STATUS reboot_system(void) { /* ask the firmware for a cold reset */
+        RT->ResetSystem(EfiResetCold, EFI_SUCCESS, 0, NULL);
+        assert_not_reached(); /* ResetSystem() is not expected to return */
+}
+
+/* Flag the firmware setup UI for the next boot, then reboot; this only returns if flagging failed. */
+static EFI_STATUS reboot_into_firmware(void) {
+        EFI_STATUS r = set_reboot_into_firmware();
+
+        /* Do not reboot if the flag could not be set; hand the error to the caller instead. */
+        if (r != EFI_SUCCESS)
+                return r;
+        return reboot_system();
+}
+
+static bool menu_run(
+                Config *config,
+                BootEntry **chosen_entry,
+                char16_t *loaded_image_path) {
+
+        assert(config);
+        assert(chosen_entry);
+
+        EFI_STATUS err;
+        size_t visible_max = 0;
+        size_t idx_highlight = config->idx_default, idx_highlight_prev = 0;
+        size_t idx, idx_first = 0, idx_last = 0;
+        bool new_mode = true, clear = true;
+        bool refresh = true, highlight = false;
+        size_t x_start = 0, y_start = 0, y_status = 0, x_max, y_max;
+        _cleanup_(strv_freep) char16_t **lines = NULL;
+        _cleanup_free_ char16_t *clearline = NULL, *separator = NULL, *status = NULL;
+        uint64_t timeout_efivar_saved = config->timeout_sec_efivar;
+        uint32_t timeout_remain = config->timeout_sec == TIMEOUT_MENU_FORCE ? 0 : config->timeout_sec;
+        int64_t console_mode_initial = ST->ConOut->Mode->Mode, console_mode_efivar_saved = config->console_mode_efivar;
+        size_t default_efivar_saved = config->idx_default_efivar;
+
+        enum {
+                ACTION_CONTINUE,        /* Continue with loop over user input */
+                ACTION_FIRMWARE_SETUP,  /* Ask for confirmation and reboot into firmware setup */
+                ACTION_POWEROFF,        /* Power off the machine */
+                ACTION_REBOOT,          /* Reboot the machine */
+                ACTION_RUN,             /* Execute a boot entry */
+                ACTION_QUIT,            /* Return to the firmware */
+        } action = ACTION_CONTINUE;
+
+        graphics_mode(false);
+        ST->ConIn->Reset(ST->ConIn, false);
+        ST->ConOut->EnableCursor(ST->ConOut, false);
+
+        /* draw a single character to make ClearScreen work on some firmware */
+        ST->ConOut->OutputString(ST->ConOut, (char16_t *) u" ");
+
+        err = console_set_mode(config->console_mode_efivar != CONSOLE_MODE_KEEP ?
+                               config->console_mode_efivar : config->console_mode);
+        if (err != EFI_SUCCESS) {
+                clear_screen(COLOR_NORMAL);
+                log_error_status(err, "Error switching console mode: %m");
+        }
+
+        size_t line_width = 0, entry_padding = 3;
+        while (IN_SET(action, ACTION_CONTINUE, ACTION_FIRMWARE_SETUP)) {
+                uint64_t key;
+
+                if (new_mode) {
+                        console_query_mode(&x_max, &y_max);
+
+                        /* account for padding+status */
+                        visible_max = y_max - 2;
+
+                        /* Drawing entries starts at idx_first until idx_last. We want to make
+                        * sure that idx_highlight is centered, but not if we are close to the
+                        * beginning/end of the entry list. Otherwise we would have a half-empty
+                        * screen. */
+                        if (config->n_entries <= visible_max || idx_highlight <= visible_max / 2)
+                                idx_first = 0;
+                        else if (idx_highlight >= config->n_entries - (visible_max / 2))
+                                idx_first = config->n_entries - visible_max;
+                        else
+                                idx_first = idx_highlight - (visible_max / 2);
+                        idx_last = idx_first + visible_max - 1;
+
+                        /* length of the longest entry */
+                        line_width = 0;
+                        for (size_t i = 0; i < config->n_entries; i++)
+                                line_width = MAX(line_width, strlen16(config->entries[i]->title_show));
+                        line_width = MIN(line_width + 2 * entry_padding, x_max);
+
+                        /* offsets to center the entries on the screen */
+                        x_start = (x_max - (line_width)) / 2;
+                        if (config->n_entries < visible_max)
+                                y_start = ((visible_max - config->n_entries) / 2) + 1;
+                        else
+                                y_start = 0;
+
+                        /* Put status line after the entry list, but give it some breathing room. */
+                        y_status = MIN(y_start + MIN(visible_max, config->n_entries) + 1, y_max - 1);
+
+                        lines = strv_free(lines);
+                        clearline = mfree(clearline);
+                        separator = mfree(separator);
+
+                        /* menu entries title lines */
+                        lines = xnew(char16_t *, config->n_entries + 1);
+
+                        for (size_t i = 0; i < config->n_entries; i++) {
+                                size_t j, padding;
+
+                                lines[i] = xnew(char16_t, line_width + 1);
+                                padding = (line_width - MIN(strlen16(config->entries[i]->title_show), line_width)) / 2;
+
+                                for (j = 0; j < padding; j++)
+                                        lines[i][j] = ' ';
+
+                                for (size_t k = 0; config->entries[i]->title_show[k] != '\0' && j < line_width; j++, k++)
+                                        lines[i][j] = config->entries[i]->title_show[k];
+
+                                for (; j < line_width; j++)
+                                        lines[i][j] = ' ';
+                                lines[i][line_width] = '\0';
+                        }
+                        lines[config->n_entries] = NULL;
+
+                        clearline = xnew(char16_t, x_max + 1);
+                        separator = xnew(char16_t, x_max + 1);
+                        for (size_t i = 0; i < x_max; i++) {
+                                clearline[i] = ' ';
+                                separator[i] = unicode_supported() ? L'─' : L'-';
+                        }
+                        clearline[x_max] = 0;
+                        separator[x_max] = 0;
+
+                        new_mode = false;
+                        clear = true;
+                }
+
+                if (clear) {
+                        clear_screen(COLOR_NORMAL);
+                        clear = false;
+                        refresh = true;
+                }
+
+                if (refresh) {
+                        for (size_t i = idx_first; i <= idx_last && i < config->n_entries; i++) {
+                                print_at(x_start, y_start + i - idx_first,
+                                         i == idx_highlight ? COLOR_HIGHLIGHT : COLOR_ENTRY,
+                                         lines[i]);
+                                if (i == config->idx_default_efivar)
+                                        print_at(x_start,
+                                                 y_start + i - idx_first,
+                                                 i == idx_highlight ? COLOR_HIGHLIGHT : COLOR_ENTRY,
+                                                 unicode_supported() ? u" ►" : u"=>");
+                        }
+                        refresh = false;
+                } else if (highlight) {
+                        print_at(x_start, y_start + idx_highlight_prev - idx_first, COLOR_ENTRY, lines[idx_highlight_prev]);
+                        print_at(x_start, y_start + idx_highlight - idx_first, COLOR_HIGHLIGHT, lines[idx_highlight]);
+                        if (idx_highlight_prev == config->idx_default_efivar)
+                                print_at(x_start,
+                                         y_start + idx_highlight_prev - idx_first,
+                                         COLOR_ENTRY,
+                                         unicode_supported() ? u" ►" : u"=>");
+                        if (idx_highlight == config->idx_default_efivar)
+                                print_at(x_start,
+                                         y_start + idx_highlight - idx_first,
+                                         COLOR_HIGHLIGHT,
+                                         unicode_supported() ? u" ►" : u"=>");
+                        highlight = false;
+                }
+
+                if (timeout_remain > 0) {
+                        free(status);
+                        status = xasprintf("Boot in %u s.", timeout_remain);
+                }
+
+                if (status) {
+                        /* If we draw the last char of the last line, the screen will scroll and break our
+                         * input. Therefore, draw one less character then we could for the status message.
+                         * Note that the same does not apply for the separator line as it will never be drawn
+                         * on the last line. */
+                        size_t len = strnlen16(status, x_max - 1);
+                        size_t x = (x_max - len) / 2;
+                        status[len] = '\0';
+                        print_at(0, y_status, COLOR_NORMAL, clearline + x_max - x);
+                        ST->ConOut->OutputString(ST->ConOut, status);
+                        ST->ConOut->OutputString(ST->ConOut, clearline + 1 + x + len);
+
+                        len = MIN(MAX(len, line_width) + 2 * entry_padding, x_max);
+                        x = (x_max - len) / 2;
+                        print_at(x, y_status - 1, COLOR_NORMAL, separator + x_max - len);
+                } else {
+                        print_at(0, y_status - 1, COLOR_NORMAL, clearline);
+                        print_at(0, y_status, COLOR_NORMAL, clearline + 1); /* See comment above. */
+                }
+
+                /* Beep several times so that the selected entry can be distinguished. */
+                if (config->beep)
+                        beep(idx_highlight + 1);
+
+                err = console_key_read(&key, timeout_remain > 0 ? 1000 * 1000 : UINT64_MAX);
+                if (err == EFI_NOT_READY)
+                        /* No input device returned a key, try again. This
+                         * normally should not happen. */
+                        continue;
+                if (err == EFI_TIMEOUT) {
+                        assert(timeout_remain > 0);
+                        timeout_remain--;
+                        if (timeout_remain == 0) {
+                                action = ACTION_RUN;
+                                break;
+                        }
+
+                        /* update status */
+                        continue;
+                }
+                if (err != EFI_SUCCESS) {
+                        action = ACTION_RUN;
+                        break;
+                }
+
+                timeout_remain = 0;
+
+                /* clear status after keystroke */
+                status = mfree(status);
+
+                idx_highlight_prev = idx_highlight;
+
+                if (action == ACTION_FIRMWARE_SETUP) {
+                        if (IN_SET(key, KEYPRESS(0, 0, '\r'), KEYPRESS(0, 0, '\n')) &&
+                            set_reboot_into_firmware() == EFI_SUCCESS)
+                                break;
+
+                        /* Any key other than newline or a failed attempt cancel the request. */
+                        action = ACTION_CONTINUE;
+                        continue;
+                }
+
+                switch (key) {
+                case KEYPRESS(0, SCAN_UP, 0):
+                case KEYPRESS(0, 0, 'k'):
+                case KEYPRESS(0, 0, 'K'):
+                        if (idx_highlight > 0)
+                                idx_highlight--;
+                        break;
+
+                case KEYPRESS(0, SCAN_DOWN, 0):
+                case KEYPRESS(0, 0, 'j'):
+                case KEYPRESS(0, 0, 'J'):
+                        if (idx_highlight < config->n_entries-1)
+                                idx_highlight++;
+                        break;
+
+                case KEYPRESS(0, SCAN_HOME, 0):
+                case KEYPRESS(EFI_ALT_PRESSED, 0, '<'):
+                        if (idx_highlight > 0) {
+                                refresh = true;
+                                idx_highlight = 0;
+                        }
+                        break;
+
+                case KEYPRESS(0, SCAN_END, 0):
+                case KEYPRESS(EFI_ALT_PRESSED, 0, '>'):
+                        if (idx_highlight < config->n_entries-1) {
+                                refresh = true;
+                                idx_highlight = config->n_entries-1;
+                        }
+                        break;
+
+                case KEYPRESS(0, SCAN_PAGE_UP, 0):
+                        if (idx_highlight > visible_max)
+                                idx_highlight -= visible_max;
+                        else
+                                idx_highlight = 0;
+                        break;
+
+                case KEYPRESS(0, SCAN_PAGE_DOWN, 0):
+                        idx_highlight += visible_max;
+                        if (idx_highlight > config->n_entries-1)
+                                idx_highlight = config->n_entries-1;
+                        break;
+
+                case KEYPRESS(0, 0, '\n'):
+                case KEYPRESS(0, 0, '\r'):
+                case KEYPRESS(0, SCAN_F3, 0): /* EZpad Mini 4s firmware sends malformed events */
+                case KEYPRESS(0, SCAN_F3, '\r'): /* Teclast X98+ II firmware sends malformed events */
+                case KEYPRESS(0, SCAN_RIGHT, 0):
+                        action = ACTION_RUN;
+                        break;
+
+                case KEYPRESS(0, SCAN_F1, 0):
+                case KEYPRESS(0, 0, 'h'):
+                case KEYPRESS(0, 0, 'H'):
+                case KEYPRESS(0, 0, '?'):
+                        /* This must stay below 80 characters! Q/v/Ctrl+l/f deliberately not advertised. */
+                        status = xasprintf("(d)efault (t/T)imeout (e)dit (r/R)esolution (p)rint %s%s(h)elp",
+                                           config->auto_poweroff ? "" : "(O)ff ",
+                                           config->auto_reboot ? "" : "re(B)oot ");
+                        break;
+
+                case KEYPRESS(0, 0, 'Q'):
+                        action = ACTION_QUIT;
+                        break;
+
+                case KEYPRESS(0, 0, 'd'):
+                case KEYPRESS(0, 0, 'D'):
+                        if (config->idx_default_efivar != idx_highlight) {
+                                free(config->entry_default_efivar);
+                                config->entry_default_efivar = xstrdup16(config->entries[idx_highlight]->id);
+                                config->idx_default_efivar = idx_highlight;
+                                status = xstrdup16(u"Default boot entry selected.");
+                        } else {
+                                config->entry_default_efivar = mfree(config->entry_default_efivar);
+                                config->idx_default_efivar = IDX_INVALID;
+                                status = xstrdup16(u"Default boot entry cleared.");
+                        }
+                        config->use_saved_entry_efivar = false;
+                        refresh = true;
+                        break;
+
+                case KEYPRESS(0, 0, '-'):
+                case KEYPRESS(0, 0, 'T'):
+                        status = update_timeout_efivar(config, false);
+                        break;
+
+                case KEYPRESS(0, 0, '+'):
+                case KEYPRESS(0, 0, 't'):
+                        status = update_timeout_efivar(config, true);
+                        break;
+
+                case KEYPRESS(0, 0, 'e'):
+                case KEYPRESS(0, 0, 'E'):
+                        /* only the options of configured entries can be edited */
+                        if (!config->editor ||
+                            !IN_SET(config->entries[idx_highlight]->type, LOADER_EFI, LOADER_LINUX, LOADER_UNIFIED_LINUX)) {
+                                status = xstrdup16(u"Entry does not support editing the command line.");
+                                break;
+                        }
+
+                        /* Unified kernels that are signed as a whole will not accept command line options
+                         * when secure boot is enabled unless there is none embedded in the image. Do not try
+                         * to pretend we can edit it to only have it be ignored. */
+                        if (config->entries[idx_highlight]->type == LOADER_UNIFIED_LINUX &&
+                            secure_boot_enabled() &&
+                            config->entries[idx_highlight]->options) {
+                                status = xstrdup16(u"Entry not editable in SecureBoot mode.");
+                                break;
+                        }
+
+                        /* The edit line may end up on the last line of the screen. And even though we're
+                         * not telling the firmware to advance the line, it still does in this one case,
+                         * causing a scroll to happen that screws with our beautiful boot loader output.
+                         * Since we cannot paint the last character of the edit line, we simply start
+                         * at x-offset 1 for symmetry. */
+                        print_at(1, y_status, COLOR_EDIT, clearline + 2);
+                        if (line_edit(&config->entries[idx_highlight]->options, x_max - 2, y_status))
+                                action = ACTION_RUN;
+                        print_at(1, y_status, COLOR_NORMAL, clearline + 2);
+
+                        /* The options string was now edited, hence we have to pass it to the invoked
+                         * binary. */
+                        config->entries[idx_highlight]->options_implied = false;
+                        break;
+
+                case KEYPRESS(0, 0, 'v'):
+                        status = xasprintf(
+                                        "systemd-boot " GIT_VERSION " (" EFI_MACHINE_TYPE_NAME "), "
+                                        "UEFI Specification %u.%02u, Vendor %ls %u.%02u",
+                                        ST->Hdr.Revision >> 16,
+                                        ST->Hdr.Revision & 0xffff,
+                                        ST->FirmwareVendor,
+                                        ST->FirmwareRevision >> 16,
+                                        ST->FirmwareRevision & 0xffff);
+                        break;
+
+                case KEYPRESS(0, 0, 'p'):
+                case KEYPRESS(0, 0, 'P'):
+                        print_status(config, loaded_image_path);
+                        clear = true;
+                        break;
+
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, 'l'):
+                case KEYPRESS(EFI_CONTROL_PRESSED, 0, CHAR_CTRL('l')):
+                case 'L': /* only uppercase, do not conflict with lower-case 'l' which picks first Linux entry */
+                        clear = true;
+                        break;
+
+                case KEYPRESS(0, 0, 'r'):
+                        err = console_set_mode(CONSOLE_MODE_NEXT);
+                        if (err != EFI_SUCCESS)
+                                status = xasprintf_status(err, "Error changing console mode: %m");
+                        else {
+                                config->console_mode_efivar = ST->ConOut->Mode->Mode;
+                                status = xasprintf(
+                                                "Console mode changed to %" PRIi64 ".",
+                                                config->console_mode_efivar);
+                        }
+                        new_mode = true;
+                        break;
+
+                case KEYPRESS(0, 0, 'R'):
+                        config->console_mode_efivar = CONSOLE_MODE_KEEP;
+                        err = console_set_mode(config->console_mode == CONSOLE_MODE_KEEP ?
+                                               console_mode_initial : config->console_mode);
+                        if (err != EFI_SUCCESS)
+                                status = xasprintf_status(err, "Error resetting console mode: %m");
+                        else
+                                status = xasprintf(
+                                                "Console mode reset to %s default.",
+                                                config->console_mode == CONSOLE_MODE_KEEP ?
+                                                                "firmware" :
+                                                                "configuration file");
+                        new_mode = true;
+                        break;
+
+                case KEYPRESS(0, 0, 'f'):
+                case KEYPRESS(0, 0, 'F'):
+                case KEYPRESS(0, SCAN_F2, 0):     /* Most vendors. */
+                case KEYPRESS(0, SCAN_F10, 0):    /* HP and Lenovo. */
+                case KEYPRESS(0, SCAN_DELETE, 0): /* Same as F2. */
+                case KEYPRESS(0, SCAN_ESC, 0):    /* HP. */
+                        if (FLAGS_SET(get_os_indications_supported(), EFI_OS_INDICATIONS_BOOT_TO_FW_UI)) {
+                                action = ACTION_FIRMWARE_SETUP;
+                                /* Let's make sure the user really wants to do this. */
+                                status = xstrdup16(u"Press Enter to reboot into firmware interface.");
+                        } else
+                                status = xstrdup16(u"Reboot into firmware interface not supported.");
+                        break;
+
+                case KEYPRESS(0, 0, 'O'): /* Only uppercase, so that it can't be hit so easily fat-fingered,
+                                           * but still works safely over serial. */
+                        action = ACTION_POWEROFF;
+                        break;
+
+                case KEYPRESS(0, 0, 'B'): /* ditto */
+                        action = ACTION_REBOOT;
+                        break;
+
+                default:
+                        /* jump with a hotkey directly to a matching entry */
+                        idx = entry_lookup_key(config, idx_highlight+1, KEYCHAR(key));
+                        if (idx == IDX_INVALID)
+                                break;
+                        idx_highlight = idx;
+                        refresh = true;
+                }
+
+                if (idx_highlight > idx_last) {
+                        idx_last = idx_highlight;
+                        idx_first = 1 + idx_highlight - visible_max;
+                        refresh = true;
+                } else if (idx_highlight < idx_first) {
+                        idx_first = idx_highlight;
+                        idx_last = idx_highlight + visible_max-1;
+                        refresh = true;
+                }
+
+                if (!refresh && idx_highlight != idx_highlight_prev)
+                        highlight = true;
+        }
+
+        /* Update EFI vars after we left the menu to reduce NVRAM writes. */
+
+        if (default_efivar_saved != config->idx_default_efivar)
+                efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderEntryDefault", config->entry_default_efivar, EFI_VARIABLE_NON_VOLATILE);
+
+        if (console_mode_efivar_saved != config->console_mode_efivar) {
+                if (config->console_mode_efivar == CONSOLE_MODE_KEEP)
+                        efivar_unset(MAKE_GUID_PTR(LOADER), u"LoaderConfigConsoleMode", EFI_VARIABLE_NON_VOLATILE);
+                else
+                        efivar_set_uint_string(MAKE_GUID_PTR(LOADER), u"LoaderConfigConsoleMode",
+                                               config->console_mode_efivar, EFI_VARIABLE_NON_VOLATILE);
+        }
+
+        if (timeout_efivar_saved != config->timeout_sec_efivar) {
+                switch (config->timeout_sec_efivar) {
+                case TIMEOUT_UNSET:
+                        efivar_unset(MAKE_GUID_PTR(LOADER), u"LoaderConfigTimeout", EFI_VARIABLE_NON_VOLATILE);
+                        break;
+                case TIMEOUT_MENU_DISABLED:
+                        assert_not_reached();
+                case TIMEOUT_MENU_FORCE:
+                        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderConfigTimeout", u"menu-force", EFI_VARIABLE_NON_VOLATILE);
+                        break;
+                case TIMEOUT_MENU_HIDDEN:
+                        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderConfigTimeout", u"menu-hidden", EFI_VARIABLE_NON_VOLATILE);
+                        break;
+                default:
+                        assert(config->timeout_sec_efivar < UINT32_MAX);
+                        efivar_set_uint_string(MAKE_GUID_PTR(LOADER), u"LoaderConfigTimeout",
+                                               config->timeout_sec_efivar, EFI_VARIABLE_NON_VOLATILE);
+                }
+        }
+
+        switch (action) {
+        case ACTION_CONTINUE:
+                assert_not_reached();
+        case ACTION_POWEROFF:
+                poweroff_system();
+        case ACTION_REBOOT:
+        case ACTION_FIRMWARE_SETUP:
+                reboot_system();
+        case ACTION_RUN:
+        case ACTION_QUIT:
+                break;
+        }
+
+        *chosen_entry = config->entries[idx_highlight];
+        clear_screen(COLOR_NORMAL);
+        return action == ACTION_RUN;
+}
+
+static void config_add_entry(Config *config, BootEntry *entry) {
+        assert(config);
+        assert(entry);
+
+        /* Pure paranoia: never let the entry count reach the index limit. */
+        assert(config->n_entries < IDX_MAX);
+
+        /* The pointer array grows 16 slots at a time, i.e. whenever the count is a multiple of 16. */
+        if ((config->n_entries & 15) == 0) {
+                size_t old_size = sizeof(void *) * config->n_entries;
+                size_t new_size = sizeof(void *) * (config->n_entries + 16);
+                config->entries = xrealloc(config->entries, old_size, new_size);
+        }
+        config->entries[config->n_entries++] = entry;
+}
+
+static BootEntry* boot_entry_free(BootEntry *entry) {
+        if (!entry)
+                return NULL;
+
+        /* All members that are released with a plain free(), gathered in one table. */
+        void *owned[] = {
+                entry->id, entry->title_show, entry->title,
+                entry->sort_key, entry->version, entry->machine_id,
+                entry->loader, entry->devicetree, entry->options,
+                entry->path, entry->current_name, entry->next_name,
+        };
+
+        for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++)
+                free(owned[i]);
+
+        /* The initrd list is a NULL-terminated array of strings and goes through strv_free(). */
+        strv_free(entry->initrd);
+        /* Finally hand the entry itself to mfree() and pass its result back to the caller. */
+        return mfree(entry);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(BootEntry *, boot_entry_free);
+
+static void config_defaults_load_from_file(Config *config, char *content) {
+        char *line;
+        size_t pos = 0;
+        char *key, *value;
+
+        assert(config);
+        assert(content);
+        /* Apply every key/value pair line_get_key_value() yields from content; unrecognized keys are skipped. */
+        while ((line = line_get_key_value(content, " \t", &pos, &key, &value)))
+                if (streq8(key, "timeout")) {
+                        if (streq8(value, "menu-disabled"))
+                                config->timeout_sec_config = TIMEOUT_MENU_DISABLED;
+                        else if (streq8(value, "menu-force"))
+                                config->timeout_sec_config = TIMEOUT_MENU_FORCE;
+                        else if (streq8(value, "menu-hidden"))
+                                config->timeout_sec_config = TIMEOUT_MENU_HIDDEN;
+                        else {
+                                uint64_t u;
+                                if (!parse_number8(value, &u, NULL) || u > TIMEOUT_TYPE_MAX) {
+                                        log_error("Error parsing 'timeout' config option, ignoring: %s",
+                                                  value);
+                                        continue; /* both timeout fields keep their previous values */
+                                }
+                                config->timeout_sec_config = u;
+                        }
+                        config->timeout_sec = config->timeout_sec_config; /* mirror the parsed value */
+
+                } else if (streq8(key, "default")) {
+                        if (value[0] == '@' && !strcaseeq8(value, "@saved")) { /* only "@saved" is accepted */
+                                log_error("Unsupported special entry identifier, ignoring: %s", value);
+                                continue;
+                        }
+                        free(config->entry_default_config); /* a later "default" line replaces an earlier one */
+                        config->entry_default_config = xstr8_to_16(value);
+
+                } else if (streq8(key, "editor")) {
+                        if (!parse_boolean(value, &config->editor))
+                                log_error("Error parsing 'editor' config option, ignoring: %s", value);
+
+                } else if (streq8(key, "auto-entries")) {
+                        if (!parse_boolean(value, &config->auto_entries))
+                                log_error("Error parsing 'auto-entries' config option, ignoring: %s", value);
+
+                } else if (streq8(key, "auto-firmware")) {
+                        if (!parse_boolean(value, &config->auto_firmware))
+                                log_error("Error parsing 'auto-firmware' config option, ignoring: %s", value);
+
+                } else if (streq8(key, "auto-poweroff")) {
+                        if (!parse_boolean(value, &config->auto_poweroff))
+                                log_error("Error parsing 'auto-poweroff' config option, ignoring: %s", value);
+
+                } else if (streq8(key, "auto-reboot")) {
+                        if (!parse_boolean(value, &config->auto_reboot))
+                                log_error("Error parsing 'auto-reboot' config option, ignoring: %s", value);
+
+                } else if (streq8(key, "beep")) {
+                        if (!parse_boolean(value, &config->beep))
+                                log_error("Error parsing 'beep' config option, ignoring: %s", value);
+
+                } else if (streq8(key, "reboot-for-bitlocker")) {
+                        if (!parse_boolean(value, &config->reboot_for_bitlocker))
+                                log_error("Error parsing 'reboot-for-bitlocker' config option, ignoring: %s",
+                                          value);
+
+                } else if (streq8(key, "secure-boot-enroll")) {
+                        if (streq8(value, "manual"))
+                                config->secure_boot_enroll = ENROLL_MANUAL;
+                        else if (streq8(value, "force"))
+                                config->secure_boot_enroll = ENROLL_FORCE;
+                        else if (streq8(value, "if-safe"))
+                                config->secure_boot_enroll = ENROLL_IF_SAFE;
+                        else if (streq8(value, "off"))
+                                config->secure_boot_enroll = ENROLL_OFF;
+                        else
+                                log_error("Error parsing 'secure-boot-enroll' config option, ignoring: %s",
+                                          value);
+
+                } else if (streq8(key, "console-mode")) {
+                        if (streq8(value, "auto"))
+                                config->console_mode = CONSOLE_MODE_AUTO;
+                        else if (streq8(value, "max"))
+                                config->console_mode = CONSOLE_MODE_FIRMWARE_MAX;
+                        else if (streq8(value, "keep"))
+                                config->console_mode = CONSOLE_MODE_KEEP;
+                        else {
+                                uint64_t u;
+                                if (!parse_number8(value, &u, NULL) || u > CONSOLE_MODE_RANGE_MAX) {
+                                        log_error("Error parsing 'console-mode' config option, ignoring: %s",
+                                                  value);
+                                        continue; /* console_mode keeps its previous value */
+                                }
+                                config->console_mode = u;
+                        }
+                }
+}
+
+static void boot_entry_parse_tries(
+                BootEntry *entry,
+                const char16_t *path,
+                const char16_t *file,
+                const char16_t *suffix) {
+
+        assert(entry);
+        assert(path);
+        assert(file);
+        assert(suffix);
+
+        /*
+         * Parses a suffix of two counters (one going down, one going up) in the form "+LEFT-DONE" from the end of the
+         * filename (but before the .efi/.conf suffix), where the "-DONE" part is optional and may be left out (in
+         * which case that counter is assumed to be zero, i.e. the missing part is synonymous to "-0").
+         *
+         * Names we grok, and the series they result in:
+         *
+         * foobar+3.efi   → foobar+2-1.efi → foobar+1-2.efi → foobar+0-3.efi → STOP!
+         * foobar+4-0.efi → foobar+3-1.efi → foobar+2-2.efi → foobar+1-3.efi → foobar+0-4.efi → STOP!
+         */
+
+        const char16_t *counter = NULL;
+        for (;;) {
+                char16_t *plus = strchr16(counter ?: file, '+'); /* resume the search after the previous hit */
+                if (plus) {
+                        /* We want the last "+". */
+                        counter = plus + 1;
+                        continue;
+                }
+                if (counter)
+                        break;
+
+                /* No boot counter found. */
+                return;
+        }
+
+        uint64_t tries_left, tries_done = 0;
+        size_t prefix_len = counter - file; /* length of the name up to and including the last '+' */
+
+        if (!parse_number16(counter, &tries_left, &counter) || tries_left > INT_MAX)
+                return;
+
+        /* Parse done counter only if present. */
+        if (*counter == '-' && (!parse_number16(counter + 1, &tries_done, &counter) || tries_done > INT_MAX))
+                return;
+
+        /* Boot counter in the middle of the name? */
+        if (!streq16(counter, suffix))
+                return;
+
+        entry->tries_left = tries_left;
+        entry->tries_done = tries_done;
+        entry->path = xstrdup16(path);
+        entry->current_name = xstrdup16(file);
+        entry->next_name = xasprintf(
+                        "%.*ls%" PRIu64 "-%" PRIu64 "%ls",
+                        (int) prefix_len,
+                        file,
+                        LESS_BY(tries_left, 1u),
+                        MIN(tries_done + 1, (uint64_t) INT_MAX), /* done counter goes up by one, capped at INT_MAX */
+                        suffix);
+}
+
+static EFI_STATUS boot_entry_bump_counters(BootEntry *entry) {
+        _cleanup_free_ char16_t* old_path = NULL, *new_path = NULL;
+        _cleanup_(file_closep) EFI_FILE *handle = NULL;
+        _cleanup_free_ EFI_FILE_INFO *file_info = NULL;
+        size_t file_info_size;
+        EFI_STATUS err;
+
+        assert(entry);
+        /* Renames the entry's file from current_name to next_name and records the new path in LoaderBootCountPath. */
+        if (entry->tries_left < 0) /* negative tries_left: no boot counter in use, nothing to do */
+                return EFI_SUCCESS;
+
+        if (!entry->path || !entry->current_name || !entry->next_name)
+                return EFI_SUCCESS;
+
+        _cleanup_(file_closep) EFI_FILE *root = NULL;
+        err = open_volume(entry->device, &root);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error opening entry root path: %m");
+
+        old_path = xasprintf("%ls\\%ls", entry->path, entry->current_name);
+
+        err = root->Open(root, &handle, old_path, EFI_FILE_MODE_READ|EFI_FILE_MODE_WRITE, 0ULL);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error opening boot entry: %m");
+
+        err = get_file_info(handle, &file_info, &file_info_size);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error getting boot entry file info: %m");
+
+        /* And rename the file */
+        strcpy16(file_info->FileName, entry->next_name); /* NOTE(review): assumes the buffer has room for the new name; confirm in get_file_info() */
+        err = handle->SetInfo(handle, MAKE_GUID_PTR(EFI_FILE_INFO), file_info_size, file_info);
+        if (err != EFI_SUCCESS)
+                return log_error_status(
+                                err, "Failed to rename '%ls' to '%ls', ignoring: %m", old_path, entry->next_name);
+
+        /* Flush everything to disk, just in case… */
+        err = handle->Flush(handle);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error flushing boot entry file info: %m");
+
+        /* Let's tell the OS that we renamed this file, so that it knows what to rename to the counter-less name on
+         * success */
+        new_path = xasprintf("%ls\\%ls", entry->path, entry->next_name);
+        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderBootCountPath", new_path, 0); /* flags are 0 here, i.e. EFI_VARIABLE_NON_VOLATILE is not requested */
+
+        /* If the file we just renamed is the loader path, then let's update that. */
+        if (streq16(entry->loader, old_path)) {
+                free(entry->loader);
+                entry->loader = TAKE_PTR(new_path);
+        }
+
+        return EFI_SUCCESS;
+}
+
+static void boot_entry_add_type1(
+                Config *config,
+                EFI_HANDLE *device,
+                EFI_FILE *root_dir,
+                const char16_t *path,
+                const char16_t *file,
+                char *content,
+                const char16_t *loaded_image_path) {
+
+        _cleanup_(boot_entry_freep) BootEntry *entry = NULL;
+        char *line;
+        size_t pos = 0, n_initrd = 0;
+        char *key, *value;
+        EFI_STATUS err;
+
+        assert(config);
+        assert(device);
+        assert(root_dir);
+        assert(path);
+        assert(file);
+        assert(content);
+        /* Builds a BootEntry from the key/value pairs in content and adds it to config if its loader can be opened. */
+        entry = xnew(BootEntry, 1);
+        *entry = (BootEntry) {
+                .tries_done = -1,
+                .tries_left = -1, /* stays negative unless boot_entry_parse_tries() finds a counter */
+        };
+
+        while ((line = line_get_key_value(content, " \t", &pos, &key, &value)))
+                if (streq8(key, "title")) {
+                        free(entry->title);
+                        entry->title = xstr8_to_16(value);
+
+                } else if (streq8(key, "sort-key")) {
+                        free(entry->sort_key);
+                        entry->sort_key = xstr8_to_16(value);
+
+                } else if (streq8(key, "version")) {
+                        free(entry->version);
+                        entry->version = xstr8_to_16(value);
+
+                } else if (streq8(key, "machine-id")) {
+                        free(entry->machine_id);
+                        entry->machine_id = xstr8_to_16(value);
+
+                } else if (streq8(key, "linux")) {
+                        free(entry->loader);
+                        entry->type = LOADER_LINUX;
+                        entry->loader = xstr8_to_path(value);
+                        entry->key = 'l'; /* menu hotkey: lower-case 'l' jumps to a Linux entry */
+
+                } else if (streq8(key, "efi")) {
+                        entry->type = LOADER_EFI;
+                        free(entry->loader);
+                        entry->loader = xstr8_to_path(value);
+
+                        /* do not add an entry for ourselves */
+                        if (strcaseeq16(entry->loader, loaded_image_path)) {
+                                entry->type = LOADER_UNDEFINED;
+                                break; /* stop parsing; the entry is dropped right after the loop */
+                        }
+
+                } else if (streq8(key, "architecture")) {
+                        /* do not add an entry for an EFI image of architecture not matching with that of the image */
+                        if (!streq8(value, EFI_MACHINE_TYPE_NAME)) {
+                                entry->type = LOADER_UNDEFINED;
+                                break;
+                        }
+
+                } else if (streq8(key, "devicetree")) {
+                        free(entry->devicetree);
+                        entry->devicetree = xstr8_to_path(value);
+
+                } else if (streq8(key, "initrd")) {
+                        entry->initrd = xrealloc(
+                                entry->initrd,
+                                n_initrd == 0 ? 0 : (n_initrd + 1) * sizeof(uint16_t *), /* old size: strings plus NULL terminator */
+                                (n_initrd + 2) * sizeof(uint16_t *));
+                        entry->initrd[n_initrd++] = xstr8_to_path(value);
+                        entry->initrd[n_initrd] = NULL;
+
+                } else if (streq8(key, "options")) {
+                        _cleanup_free_ char16_t *new = NULL;
+
+                        new = xstr8_to_16(value);
+                        if (entry->options) { /* repeated "options" lines are joined with a single space */
+                                char16_t *s = xasprintf("%ls %ls", entry->options, new);
+                                free(entry->options);
+                                entry->options = s;
+                        } else
+                                entry->options = TAKE_PTR(new);
+                }
+
+        if (entry->type == LOADER_UNDEFINED)
+                return;
+
+        /* check existence */
+        _cleanup_(file_closep) EFI_FILE *handle = NULL;
+        err = root_dir->Open(root_dir, &handle, entry->loader, EFI_FILE_MODE_READ, 0ULL);
+        if (err != EFI_SUCCESS)
+                return;
+
+        entry->device = device;
+        entry->id = xstrdup16(file);
+        strtolower16(entry->id); /* IDs are stored case folded, as config_find_entry() expects */
+
+        config_add_entry(config, entry);
+
+        boot_entry_parse_tries(entry, path, file, u".conf");
+        TAKE_PTR(entry); /* config->entries holds the pointer now; disarm the cleanup handler */
+}
+
+static EFI_STATUS efivar_get_timeout(const char16_t *var, uint64_t *ret_value) {
+        _cleanup_free_ char16_t *text = NULL;
+        uint64_t parsed;
+        EFI_STATUS err;
+
+        assert(var);
+        assert(ret_value);
+
+        err = efivar_get(MAKE_GUID_PTR(LOADER), var, &text);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* The symbolic spellings are tried first; anything else has to parse as a number. */
+        if (streq16(text, u"menu-disabled"))
+                parsed = TIMEOUT_MENU_DISABLED;
+        else if (streq16(text, u"menu-force"))
+                parsed = TIMEOUT_MENU_FORCE;
+        else if (streq16(text, u"menu-hidden"))
+                parsed = TIMEOUT_MENU_HIDDEN;
+        else {
+                if (!parse_number16(text, &parsed, NULL))
+                        return EFI_INVALID_PARAMETER;
+
+                /* Numbers above TIMEOUT_TYPE_MAX are clamped to it rather than rejected. */
+                parsed = MIN(parsed, TIMEOUT_TYPE_MAX);
+        }
+
+        /* The caller's variable is written on success only. */
+        *ret_value = parsed;
+        return EFI_SUCCESS;
+}
+
+static void config_load_defaults(Config *config, EFI_FILE *root_dir) {
+        _cleanup_free_ char *content = NULL;
+        size_t content_size, value = 0;  /* avoid false maybe-uninitialized warning */
+        EFI_STATUS err;
+        /* Resets *config to built-in defaults, then applies \loader\loader.conf and the Loader* EFI variables. */
+        assert(root_dir);
+
+        *config = (Config) {
+                .editor = true,
+                .auto_entries = true,
+                .auto_firmware = true,
+                .secure_boot_enroll = ENROLL_IF_SAFE,
+                .idx_default_efivar = IDX_INVALID,
+                .console_mode = CONSOLE_MODE_KEEP,
+                .console_mode_efivar = CONSOLE_MODE_KEEP,
+                .timeout_sec_config = TIMEOUT_UNSET,
+                .timeout_sec_efivar = TIMEOUT_UNSET,
+        };
+
+        err = file_read(root_dir, u"\\loader\\loader.conf", 0, 0, &content, &content_size);
+        if (err == EFI_SUCCESS) {
+                /* First, measure. */
+                err = tpm_log_tagged_event(
+                                TPM2_PCR_BOOT_LOADER_CONFIG,
+                                POINTER_TO_PHYSICAL_ADDRESS(content),
+                                content_size,
+                                LOADER_CONF_EVENT_TAG_ID,
+                                u"loader.conf",
+                                /* ret_measured= */ NULL);
+                if (err != EFI_SUCCESS)
+                        log_error_status(err, "Error measuring loader.conf into TPM: %m");
+
+                /* Then: parse */
+                config_defaults_load_from_file(config, content);
+        }
+
+        err = efivar_get_timeout(u"LoaderConfigTimeout", &config->timeout_sec_efivar);
+        if (err == EFI_SUCCESS)
+                config->timeout_sec = config->timeout_sec_efivar; /* the EFI variable overrides loader.conf */
+        else if (err != EFI_NOT_FOUND)
+                log_error_status(err, "Error reading LoaderConfigTimeout EFI variable: %m");
+
+        err = efivar_get_timeout(u"LoaderConfigTimeoutOneShot", &config->timeout_sec);
+        if (err == EFI_SUCCESS) {
+                /* Unset variable now, after all it's "one shot". */
+                (void) efivar_unset(MAKE_GUID_PTR(LOADER), u"LoaderConfigTimeoutOneShot", EFI_VARIABLE_NON_VOLATILE);
+
+                config->force_menu = true; /* force the menu when this is set */
+        } else if (err != EFI_NOT_FOUND)
+                log_error_status(err, "Error reading LoaderConfigTimeoutOneShot EFI variable: %m");
+
+        err = efivar_get_uint_string(MAKE_GUID_PTR(LOADER), u"LoaderConfigConsoleMode", &value);
+        if (err == EFI_SUCCESS)
+                config->console_mode_efivar = value;
+
+        err = efivar_get(MAKE_GUID_PTR(LOADER), u"LoaderEntryOneShot", &config->entry_oneshot);
+        if (err == EFI_SUCCESS)
+                /* Unset variable now, after all it's "one shot". */
+                (void) efivar_unset(MAKE_GUID_PTR(LOADER), u"LoaderEntryOneShot", EFI_VARIABLE_NON_VOLATILE);
+
+        (void) efivar_get(MAKE_GUID_PTR(LOADER), u"LoaderEntryDefault", &config->entry_default_efivar);
+
+        strtolower16(config->entry_default_config); /* any of these may still be unset here */
+        strtolower16(config->entry_default_efivar);
+        strtolower16(config->entry_oneshot);
+
+        config->use_saved_entry = streq16(config->entry_default_config, u"@saved");
+        config->use_saved_entry_efivar = streq16(config->entry_default_efivar, u"@saved");
+        if (config->use_saved_entry || config->use_saved_entry_efivar)
+                (void) efivar_get(MAKE_GUID_PTR(LOADER), u"LoaderEntryLastBooted", &config->entry_saved);
+        strtolower16(config->entry_saved); /* fold only after reading it; folding earlier acted on an unset pointer */
+}
+
+static void config_load_type1_entries(
+                Config *config,
+                EFI_HANDLE *device,
+                EFI_FILE *root_dir,
+                const char16_t *loaded_image_path) {
+
+        _cleanup_(file_closep) EFI_FILE *entries_dir = NULL;
+        _cleanup_free_ EFI_FILE_INFO *f = NULL;
+        size_t f_size = 0;
+        EFI_STATUS err;
+
+        assert(config);
+        assert(device);
+        assert(root_dir);
+
+        /* Adds Boot Loader Type #1 entries (i.e. /loader/entries/….conf) */
+
+        err = open_directory(root_dir, u"\\loader\\entries", &entries_dir);
+        if (err != EFI_SUCCESS)
+                return; /* directory cannot be opened: nothing to add, and nothing is logged */
+
+        for (;;) {
+                _cleanup_free_ char *content = NULL;
+
+                err = readdir(entries_dir, &f, &f_size); /* f and f_size are passed again on every iteration */
+                if (err != EFI_SUCCESS || !f)
+                        break; /* stop on error or when readdir() hands back no entry */
+
+                if (f->FileName[0] == '.')
+                        continue;
+                if (FLAGS_SET(f->Attribute, EFI_FILE_DIRECTORY))
+                        continue;
+
+                if (!endswith_no_case(f->FileName, u".conf"))
+                        continue;
+                if (startswith(f->FileName, u"auto-"))
+                        continue; /* NOTE(review): presumably reserved for auto-generated entry IDs; confirm */
+
+                err = file_read(entries_dir, f->FileName, 0, 0, &content, NULL);
+                if (err == EFI_SUCCESS) /* files that cannot be read are skipped silently */
+                        boot_entry_add_type1(config, device, root_dir, u"\\loader\\entries", f->FileName, content, loaded_image_path);
+        }
+}
+
+static int boot_entry_compare(const BootEntry *a, const BootEntry *b) {
+        int r;
+
+        assert(a);
+        assert(b);
+
+        /* Order entries that have no tries left to the end of the list */
+        r = CMP(a->tries_left == 0, b->tries_left == 0);
+        if (r != 0)
+                return r;
+
+        /* If there's a sort key defined for *both* entries, then we do new-style ordering, i.e. by
+         * sort-key/machine-id/version, with a final fallback to id. If there's no sort key for either, we do
+         * old-style ordering, i.e. by id only. If one has sort key and the other does not, we put new-style
+         * before old-style. */
+        r = CMP(!a->sort_key, !b->sort_key);
+        if (r != 0) /* one is old-style, one new-style */
+                return r;
+
+        if (a->sort_key && b->sort_key) {
+                r = strcmp16(a->sort_key, b->sort_key);
+                if (r != 0)
+                        return r;
+
+                /* If multiple installations of the same OS are around, group by machine ID */
+                r = strcmp16(a->machine_id, b->machine_id);
+                if (r != 0)
+                        return r;
+
+                /* If the sort key was defined, then order by version now (downwards, putting the newest first) */
+                r = -strverscmp_improved(a->version, b->version);
+                if (r != 0)
+                        return r;
+        }
+
+        /* Now order by ID. The version is likely part of the ID, thus note that this will generally put
+         * the newer versions earlier. Specifying a sort key explicitly is preferable, because it gives an
+         * explicit sort order. */
+        r = -strverscmp_improved(a->id, b->id);
+        if (r != 0)
+                return r;
+
+        if (a->tries_left < 0 || b->tries_left < 0) /* at least one side has no boot counter: report a tie */
+                return 0;
+
+        /* If both items have boot counting, and otherwise are identical, put the entry with more tries left first */
+        r = -CMP(a->tries_left, b->tries_left);
+        if (r != 0)
+                return r;
+
+        /* If they have the same number of tries left, then let the one win which was tried fewer times so far */
+        return CMP(a->tries_done, b->tries_done);
+}
+
+static size_t config_find_entry(Config *config, const char16_t *pattern) {
+        assert(config);
+
+        /* Neither side is case folded here: pattern and entry IDs must already be folded by the caller. */
+
+        if (!pattern)
+                return IDX_INVALID;
+
+        size_t idx = 0;
+        while (idx < config->n_entries && !efi_fnmatch(pattern, config->entries[idx]->id))
+                idx++;
+
+        return idx < config->n_entries ? idx : IDX_INVALID;
+}
+
+static void config_select_default_entry(Config *config) {
+        size_t i;
+
+        assert(config);
+        /* Priority: one-shot entry, then the EFI variable default, then the loader.conf default, then the first regular entry. */
+        i = config_find_entry(config, config->entry_oneshot);
+        if (i != IDX_INVALID) {
+                config->idx_default = i;
+                return;
+        }
+
+        i = config_find_entry(config, config->use_saved_entry_efivar ? config->entry_saved : config->entry_default_efivar);
+        if (i != IDX_INVALID) {
+                config->idx_default = i;
+                config->idx_default_efivar = i; /* only set when the choice came from the EFI variable */
+                return;
+        }
+
+        if (config->use_saved_entry)
+                /* No need to do the same thing twice. */
+                i = config->use_saved_entry_efivar ? IDX_INVALID : config_find_entry(config, config->entry_saved);
+        else
+                i = config_find_entry(config, config->entry_default_config);
+        if (i != IDX_INVALID) {
+                config->idx_default = i;
+                return;
+        }
+
+        /* select the first suitable entry */
+        for (i = 0; i < config->n_entries; i++)
+                if (config->entries[i]->type != LOADER_AUTO && !config->entries[i]->call) {
+                        config->idx_default = i;
+                        return;
+                }
+
+        /* If no configured entry to select from was found, enable the menu. */
+        config->idx_default = 0;
+        if (config->timeout_sec == 0) /* a zero timeout is raised to 10 so that the menu is shown */
+                config->timeout_sec = 10;
+}
+
+static bool entries_unique(BootEntry **entries, bool *unique, size_t n_entries) {
+        bool all_distinct = true;
+        assert(entries);
+        assert(unique);
+
+        /* Look at every unordered pair once; both members of a pair with equal titles get flagged. */
+        for (size_t a = 0; a < n_entries; a++)
+                for (size_t b = a + 1; b < n_entries; b++)
+                        if (streq16(entries[a]->title_show, entries[b]->title_show)) {
+                                unique[b] = false;
+                                unique[a] = false;
+                                all_distinct = false;
+                        }
+
+        return all_distinct;
+}
+
+/* generate unique titles, avoiding non-distinguishable menu entries */
+/* Fills in title_show for every entry and then disambiguates clashing titles in up to three
+ * rounds: first the version is appended, then the first 8 characters of the machine ID, and
+ * finally the entry ID. After each of the first two rounds uniqueness is re-evaluated and the
+ * function returns as soon as all titles differ. */
+static void generate_boot_entry_titles(Config *config) {
+        assert(config);
+
+        bool unique[config->n_entries];
+
+        /* Start out with the plain title, falling back to the ID. */
+        for (size_t n = 0; n < config->n_entries; n++) {
+                BootEntry *e = config->entries[n];
+
+                assert(!e->title_show);
+                unique[n] = true;
+                e->title_show = xstrdup16(e->title ?: e->id);
+        }
+
+        if (entries_unique(config->entries, unique, config->n_entries))
+                return;
+
+        /* Round 1: suffix the version onto clashing titles. */
+        for (size_t n = 0; n < config->n_entries; n++) {
+                BootEntry *e = config->entries[n];
+
+                if (unique[n])
+                        continue;
+
+                /* Reset the flag; the next uniqueness check clears it again if still needed. */
+                unique[n] = true;
+
+                if (e->version) {
+                        _cleanup_free_ char16_t *previous = e->title_show;
+                        e->title_show = xasprintf("%ls (%ls)", previous, e->version);
+                }
+        }
+
+        if (entries_unique(config->entries, unique, config->n_entries))
+                return;
+
+        /* Round 2: suffix the (shortened) machine ID onto clashing titles. */
+        for (size_t n = 0; n < config->n_entries; n++) {
+                BootEntry *e = config->entries[n];
+
+                if (unique[n])
+                        continue;
+
+                unique[n] = true;
+
+                if (e->machine_id) {
+                        _cleanup_free_ char16_t *previous = e->title_show;
+                        e->title_show = xasprintf("%ls (%.8ls)", previous, e->machine_id);
+                }
+        }
+
+        if (entries_unique(config->entries, unique, config->n_entries))
+                return;
+
+        /* Round 3: suffix the entry ID onto whatever still clashes. */
+        for (size_t n = 0; n < config->n_entries; n++) {
+                BootEntry *e = config->entries[n];
+
+                if (unique[n])
+                        continue;
+
+                _cleanup_free_ char16_t *previous = e->title_show;
+                e->title_show = xasprintf("%ls (%ls)", previous, e->id);
+        }
+}
+
+/* Returns true if the PE binary at loader_path (relative to root_dir) has a ".sdmagic" section
+ * whose size and contents are exactly those of SD_MAGIC. Any lookup or read failure yields false. */
+static bool is_sd_boot(EFI_FILE *root_dir, const char16_t *loader_path) {
+        static const char * const magic_section[] = {
+                ".sdmagic",
+                NULL
+        };
+
+        assert(root_dir);
+        assert(loader_path);
+
+        size_t section_offset = 0, section_size = 0;
+        if (pe_file_locate_sections(root_dir, loader_path, magic_section, &section_offset, &section_size) != EFI_SUCCESS)
+                return false;
+        if (section_size != sizeof(SD_MAGIC))
+                return false;
+
+        _cleanup_free_ char *data = NULL;
+        size_t n_read;
+        if (file_read(root_dir, loader_path, section_offset, section_size, &data, &n_read) != EFI_SUCCESS)
+                return false;
+        if (n_read != section_size)
+                return false;
+
+        return memcmp(data, SD_MAGIC, sizeof(SD_MAGIC)) == 0;
+}
+
+/* Adds an automatically detected loader entry (type LOADER_AUTO) to the configuration if automatic
+ * entries are enabled and the loader file can be opened on root_dir.
+ *
+ * If 'loader' is NULL the architecture's default loader path is used, unless that path is the
+ * image we were loaded from, or it (or the GRUB binary next to it) carries the sd-boot magic.
+ *
+ * Returns the newly added entry, or NULL if nothing was added. */
+static BootEntry* config_add_entry_loader_auto(
+                Config *config,
+                EFI_HANDLE *device,
+                EFI_FILE *root_dir,
+                const char16_t *loaded_image_path,
+                const char16_t *id,
+                char16_t key,
+                const char16_t *title,
+                const char16_t *loader) {
+
+        assert(config);
+        assert(device);
+        assert(root_dir);
+        assert(id);
+        assert(title);
+
+        if (!config->auto_entries)
+                return NULL;
+
+        if (!loader) {
+                loader = u"\\EFI\\BOOT\\BOOT" EFI_MACHINE_TYPE_NAME ".efi";
+
+                /* This is the default EFI loader. Do not offer it if it is us. */
+                if (strcaseeq16(loader, loaded_image_path))
+                        return NULL;
+                if (is_sd_boot(root_dir, loader))
+                        return NULL;
+
+                /* The default loader might be shim, which would chainload GRUBX64.EFI, and that
+                 * in turn might be us. */
+                if (is_sd_boot(root_dir, u"\\EFI\\BOOT\\GRUB" EFI_MACHINE_TYPE_NAME u".EFI"))
+                        return NULL;
+        }
+
+        /* Only add the entry if the loader file can actually be opened. */
+        _cleanup_(file_closep) EFI_FILE *probe = NULL;
+        if (root_dir->Open(root_dir, &probe, (char16_t *) loader, EFI_FILE_MODE_READ, 0ULL) != EFI_SUCCESS)
+                return NULL;
+
+        BootEntry *added = xnew(BootEntry, 1);
+        *added = (BootEntry) {
+                .id = xstrdup16(id),
+                .type = LOADER_AUTO,
+                .title = xstrdup16(title),
+                .device = device,
+                .loader = xstrdup16(loader),
+                .key = key,
+                .tries_done = -1,
+                .tries_left = -1,
+        };
+
+        config_add_entry(config, added);
+        return added;
+}
+
+/* Looks through all handles that expose a simple file system and adds a single "macOS" entry for
+ * the first volume on which the macOS boot.efi loader can be opened. Does nothing if automatic
+ * entries are disabled. */
+static void config_add_entry_osx(Config *config) {
+        assert(config);
+
+        if (!config->auto_entries)
+                return;
+
+        _cleanup_free_ EFI_HANDLE *fs_handles = NULL;
+        size_t n_fs = 0;
+        EFI_STATUS err = BS->LocateHandleBuffer(
+                        ByProtocol, MAKE_GUID_PTR(EFI_SIMPLE_FILE_SYSTEM_PROTOCOL), NULL, &n_fs, &fs_handles);
+        if (err != EFI_SUCCESS)
+                return;
+
+        for (size_t n = 0; n < n_fs; n++) {
+                _cleanup_(file_closep) EFI_FILE *volume = NULL;
+
+                if (open_volume(fs_handles[n], &volume) != EFI_SUCCESS)
+                        continue;
+
+                BootEntry *added = config_add_entry_loader_auto(
+                                config,
+                                fs_handles[n],
+                                volume,
+                                NULL,
+                                u"auto-osx",
+                                'a',
+                                u"macOS",
+                                u"\\System\\Library\\CoreServices\\boot.efi");
+
+                /* Stop at the first volume that yielded an entry. */
+                if (added)
+                        break;
+        }
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+/* Special boot path for Windows with BitLocker: if a TPM is present and some block device starts
+ * with the BitLocker signature, look up the "Windows Boot Manager" entry in BootOrder, point
+ * BootNext at it and warm-reset the machine. On that path the function does not return.
+ *
+ * Returns EFI_NOT_FOUND if there is no TPM, no BitLocker volume, a malformed BootOrder, or no
+ * matching Boot#### entry; other errors are passed through from the firmware calls. */
+static EFI_STATUS boot_windows_bitlocker(void) {
+        _cleanup_free_ EFI_HANDLE *handles = NULL;
+        size_t n_handles;
+        EFI_STATUS err;
+
+        // FIXME: Experimental for now. Should be generalized, and become a per-entry option that can be
+        // enabled independently of BitLocker, and without a BootXXXX entry pre-existing.
+
+        /* BitLocker key cannot be sealed without a TPM present. */
+        if (!tpm_present())
+                return EFI_NOT_FOUND;
+
+        err = BS->LocateHandleBuffer(
+                        ByProtocol, MAKE_GUID_PTR(EFI_BLOCK_IO_PROTOCOL), NULL, &n_handles, &handles);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Look for BitLocker magic string on all block drives. */
+        bool found = false;
+        for (size_t i = 0; i < n_handles; i++) {
+                EFI_BLOCK_IO_PROTOCOL *block_io;
+                err = BS->HandleProtocol(handles[i], MAKE_GUID_PTR(EFI_BLOCK_IO_PROTOCOL), (void **) &block_io);
+                if (err != EFI_SUCCESS || block_io->Media->BlockSize < 512 || block_io->Media->BlockSize > 4096)
+                        continue;
+
+                char buf[4096];
+                err = block_io->ReadBlocks(block_io, block_io->Media->MediaId, 0, sizeof(buf), buf);
+                if (err != EFI_SUCCESS)
+                        continue;
+
+                if (memcmp(buf + 3, "-FVE-FS-", STRLEN("-FVE-FS-")) == 0) {
+                        found = true;
+                        break;
+                }
+        }
+
+        /* If no BitLocker drive was found, we can just chainload bootmgfw.efi directly. */
+        if (!found)
+                return EFI_NOT_FOUND;
+
+        _cleanup_free_ uint16_t *boot_order = NULL;
+        size_t boot_order_size;
+
+        /* There can be gaps in Boot#### entries. Instead of iterating over the full
+         * EFI var list or uint16_t namespace, just look for "Windows Boot Manager" in BootOrder. */
+        err = efivar_get_raw(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"BootOrder", (char **) &boot_order, &boot_order_size);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* BootOrder must be an array of uint16_t. If it is not, report failure explicitly instead
+         * of leaking the EFI_SUCCESS from the read above to the caller. */
+        if (boot_order_size % sizeof(uint16_t) != 0)
+                return EFI_NOT_FOUND;
+
+        static const char16_t windows_description[] = u"Windows Boot Manager";
+
+        for (size_t i = 0; i < boot_order_size / sizeof(uint16_t); i++) {
+                _cleanup_free_ char *buf = NULL;
+                size_t buf_size;
+
+                _cleanup_free_ char16_t *name = xasprintf("Boot%04x", boot_order[i]);
+                err = efivar_get_raw(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), name, &buf, &buf_size);
+                if (err != EFI_SUCCESS)
+                        continue;
+
+                /* Boot#### are EFI_LOAD_OPTION. But we really are only interested
+                 * for the description, which is at this offset. */
+                size_t offset = sizeof(uint32_t) + sizeof(uint16_t);
+
+                /* The variable must be large enough to hold the full description including its
+                 * terminating NUL, otherwise the comparison below would read past the buffer. */
+                if (buf_size < offset + sizeof(windows_description))
+                        continue;
+
+                /* Comparing including the terminator makes this an exact string match. */
+                if (memcmp(buf + offset, windows_description, sizeof(windows_description)) == 0) {
+                        err = efivar_set_raw(
+                                MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE),
+                                u"BootNext",
+                                boot_order + i,
+                                sizeof(boot_order[i]),
+                                EFI_VARIABLE_NON_VOLATILE);
+                        if (err != EFI_SUCCESS)
+                                return err;
+                        RT->ResetSystem(EfiResetWarm, EFI_SUCCESS, 0, NULL);
+                        assert_not_reached();
+                }
+        }
+
+        return EFI_NOT_FOUND;
+}
+#endif
+
+/* Adds an automatic entry for the Windows Boot Manager (bootmgfw.efi) found on root_dir, if
+ * automatic entries are enabled. The title is taken from the BCD store when one can be read and
+ * parsed, otherwise a generic title is used. If reboot_for_bitlocker is configured, the entry gets
+ * the BitLocker-aware boot hook attached. Compiled to a no-op on other architectures. */
+static void config_add_entry_windows(Config *config, EFI_HANDLE *device, EFI_FILE *root_dir) {
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+        _cleanup_free_ char *bcd = NULL;
+        char16_t *title = NULL;
+        EFI_STATUS err;
+        size_t len;
+
+        assert(config);
+        assert(device);
+        assert(root_dir);
+
+        if (!config->auto_entries)
+                return;
+
+        /* Try to find a better title. */
+        err = file_read(root_dir, u"\\EFI\\Microsoft\\Boot\\BCD", 0, 100*1024, &bcd, &len);
+        if (err == EFI_SUCCESS)
+                title = get_bcd_title((uint8_t *) bcd, len);
+
+        BootEntry *e = config_add_entry_loader_auto(config, device, root_dir, NULL,
+                                                    u"auto-windows", 'w', title ?: u"Windows Boot Manager",
+                                                    u"\\EFI\\Microsoft\\Boot\\bootmgfw.efi");
+
+        /* No entry is created when bootmgfw.efi is absent, so only attach the hook if we got one. */
+        if (e && config->reboot_for_bitlocker)
+                e->call = boot_windows_bitlocker;
+#endif
+}
+
+/* Scans the \EFI\Linux directory below root_dir for .efi binaries and adds one
+ * LOADER_UNIFIED_LINUX entry per binary that embeds a usable ".osrel" section.
+ *
+ * config   - configuration the new entries are appended to
+ * device   - handle stored in each new entry as the device to load the binary from
+ * root_dir - root directory of the volume to scan
+ *
+ * Binaries that cannot be parsed, or for which no name/version/sort key can be picked, are
+ * silently skipped. If the directory cannot be opened nothing is added. */
+static void config_load_type2_entries(
+                Config *config,
+                EFI_HANDLE *device,
+                EFI_FILE *root_dir) {
+
+        _cleanup_(file_closep) EFI_FILE *linux_dir = NULL;
+        /* Declared outside the loop and handed to every readdir() call. */
+        _cleanup_free_ EFI_FILE_INFO *f = NULL;
+        size_t f_size = 0;
+        EFI_STATUS err;
+
+        /* Adds Boot Loader Type #2 entries (i.e. /EFI/Linux/….efi) */
+
+        assert(config);
+        assert(device);
+        assert(root_dir);
+
+        err = open_directory(root_dir, u"\\EFI\\Linux", &linux_dir);
+        if (err != EFI_SUCCESS)
+                return;
+
+        for (;;) {
+                /* Indexes into sections[], offs[] and szs[] below. */
+                enum {
+                        SECTION_CMDLINE,
+                        SECTION_OSREL,
+                        _SECTION_MAX,
+                };
+
+                static const char * const sections[_SECTION_MAX + 1] = {
+                        [SECTION_CMDLINE] = ".cmdline",
+                        [SECTION_OSREL]   = ".osrel",
+                        NULL,
+                };
+
+                _cleanup_free_ char16_t *os_pretty_name = NULL, *os_image_id = NULL, *os_name = NULL, *os_id = NULL,
+                        *os_image_version = NULL, *os_version = NULL, *os_version_id = NULL, *os_build_id = NULL;
+                const char16_t *good_name, *good_version, *good_sort_key;
+                _cleanup_free_ char *content = NULL;
+                size_t offs[_SECTION_MAX] = {}, szs[_SECTION_MAX] = {}, pos = 0;
+                char *line, *key, *value;
+
+                /* Stop on error or when readdir() hands back no further entry. */
+                err = readdir(linux_dir, &f, &f_size);
+                if (err != EFI_SUCCESS || !f)
+                        break;
+
+                /* Skip dot files, directories, anything not named *.efi, and "auto-" prefixed names. */
+                if (f->FileName[0] == '.')
+                        continue;
+                if (FLAGS_SET(f->Attribute, EFI_FILE_DIRECTORY))
+                        continue;
+                if (!endswith_no_case(f->FileName, u".efi"))
+                        continue;
+                if (startswith(f->FileName, u"auto-"))
+                        continue;
+
+                /* look for .osrel and .cmdline sections in the .efi binary */
+                err = pe_file_locate_sections(linux_dir, f->FileName, sections, offs, szs);
+                if (err != EFI_SUCCESS || szs[SECTION_OSREL] == 0)
+                        continue;
+
+                err = file_read(linux_dir, f->FileName, offs[SECTION_OSREL], szs[SECTION_OSREL], &content, NULL);
+                if (err != EFI_SUCCESS)
+                        continue;
+
+                /* read properties from the embedded os-release file */
+                /* If a key occurs more than once, the last occurrence wins: the earlier value is freed. */
+                while ((line = line_get_key_value(content, "=", &pos, &key, &value)))
+                        if (streq8(key, "PRETTY_NAME")) {
+                                free(os_pretty_name);
+                                os_pretty_name = xstr8_to_16(value);
+
+                        } else if (streq8(key, "IMAGE_ID")) {
+                                free(os_image_id);
+                                os_image_id = xstr8_to_16(value);
+
+                        } else if (streq8(key, "NAME")) {
+                                free(os_name);
+                                os_name = xstr8_to_16(value);
+
+                        } else if (streq8(key, "ID")) {
+                                free(os_id);
+                                os_id = xstr8_to_16(value);
+
+                        } else if (streq8(key, "IMAGE_VERSION")) {
+                                free(os_image_version);
+                                os_image_version = xstr8_to_16(value);
+
+                        } else if (streq8(key, "VERSION")) {
+                                free(os_version);
+                                os_version = xstr8_to_16(value);
+
+                        } else if (streq8(key, "VERSION_ID")) {
+                                free(os_version_id);
+                                os_version_id = xstr8_to_16(value);
+
+                        } else if (streq8(key, "BUILD_ID")) {
+                                free(os_build_id);
+                                os_build_id = xstr8_to_16(value);
+                        }
+
+                /* Skip the binary if no name/version/sort key could be picked from the fields. */
+                if (!bootspec_pick_name_version_sort_key(
+                                    os_pretty_name,
+                                    os_image_id,
+                                    os_name,
+                                    os_id,
+                                    os_image_version,
+                                    os_version,
+                                    os_version_id,
+                                    os_build_id,
+                                    &good_name,
+                                    &good_version,
+                                    &good_sort_key))
+                        continue;
+
+                /* The picked strings are duplicated, so the entry does not depend on the os_* buffers
+                 * that are freed at the end of this iteration. */
+                BootEntry *entry = xnew(BootEntry, 1);
+                *entry = (BootEntry) {
+                        .id = xstrdup16(f->FileName),
+                        .type = LOADER_UNIFIED_LINUX,
+                        .title = xstrdup16(good_name),
+                        .version = xstrdup16(good_version),
+                        .device = device,
+                        .loader = xasprintf("\\EFI\\Linux\\%ls", f->FileName),
+                        .sort_key = xstrdup16(good_sort_key),
+                        .key = 'l',
+                        .tries_done = -1,
+                        .tries_left = -1,
+                };
+
+                /* The entry ID is the lower-cased file name. */
+                strtolower16(entry->id);
+                config_add_entry(config, entry);
+                boot_entry_parse_tries(entry, u"\\EFI\\Linux", f->FileName, u".efi");
+
+                /* Without an embedded command line there is nothing more to record. */
+                if (szs[SECTION_CMDLINE] == 0)
+                        continue;
+
+                /* Release the os-release buffer so 'content' can be reused for the command line. */
+                content = mfree(content);
+
+                /* read the embedded cmdline file */
+                size_t cmdline_len;
+                err = file_read(linux_dir, f->FileName, offs[SECTION_CMDLINE], szs[SECTION_CMDLINE], &content, &cmdline_len);
+                if (err == EFI_SUCCESS) {
+                        entry->options = xstrn8_to_16(content, cmdline_len);
+                        mangle_stub_cmdline(entry->options);
+                        /* Mark the options as coming from the binary itself rather than from configuration. */
+                        entry->options_implied = true;
+                }
+        }
+}
+
+/* Tries to open the XBOOTLDR partition relative to 'device' and, if that works, loads both
+ * Type #2 and Type #1 entries from it. Silently does nothing if the partition cannot be opened. */
+static void config_load_xbootldr(
+                Config *config,
+                EFI_HANDLE *device) {
+
+        assert(config);
+        assert(device);
+
+        _cleanup_(file_closep) EFI_FILE *xbootldr_root = NULL;
+        EFI_HANDLE xbootldr_device = NULL;  /* avoid false maybe-uninitialized warning */
+
+        if (partition_open(MAKE_GUID_PTR(XBOOTLDR), device, &xbootldr_device, &xbootldr_root) != EFI_SUCCESS)
+                return;
+
+        config_load_type2_entries(config, xbootldr_device, xbootldr_root);
+        config_load_type1_entries(config, xbootldr_device, xbootldr_root, NULL);
+}
+
+/* Reads all initrd files listed in the entry into one contiguous buffer and builds a matching
+ * kernel command line.
+ *
+ * root            - volume root the initrd paths are opened on
+ * entry           - boot entry; only LOADER_LINUX entries with an initrd list are processed
+ * ret_options     - receives a newly allocated command line consisting of one "initrd=" argument
+ *                   per initrd followed by entry->options (if set), or NULL if nothing to do
+ * ret_initrd      - receives the newly allocated concatenated initrd data, or NULL
+ * ret_initrd_size - receives the size of that data in bytes, or 0
+ *
+ * On success ownership of the returned buffers passes to the caller. On failure the error of the
+ * failing operation is returned and the output parameters are left untouched. */
+static EFI_STATUS initrd_prepare(
+                EFI_FILE *root,
+                const BootEntry *entry,
+                char16_t **ret_options,
+                void **ret_initrd,
+                size_t *ret_initrd_size) {
+
+        assert(root);
+        assert(entry);
+        assert(ret_options);
+        assert(ret_initrd);
+        assert(ret_initrd_size);
+
+        if (entry->type != LOADER_LINUX || !entry->initrd) {
+                /* Write through the output pointers. Assigning to the parameters themselves would
+                 * leave the caller's variables unset. */
+                *ret_options = NULL;
+                *ret_initrd = NULL;
+                *ret_initrd_size = 0;
+                return EFI_SUCCESS;
+        }
+
+        /* Note that order of initrds matters. The kernel will only look for microcode updates in the very
+         * first one it sees. */
+
+        /* Add initrd= to options for older kernels that do not support LINUX_INITRD_MEDIA. Should be dropped
+         * if linux_x86.c is dropped. */
+        _cleanup_free_ char16_t *options = NULL;
+
+        EFI_STATUS err;
+        size_t size = 0;
+        _cleanup_free_ uint8_t *initrd = NULL;
+
+        STRV_FOREACH(i, entry->initrd) {
+                /* Take over the previous string so it is freed once the extended one is built. */
+                _cleanup_free_ char16_t *o = options;
+                if (o)
+                        options = xasprintf("%ls initrd=%ls", o, *i);
+                else
+                        options = xasprintf("initrd=%ls", *i);
+
+                _cleanup_(file_closep) EFI_FILE *handle = NULL;
+                err = root->Open(root, &handle, *i, EFI_FILE_MODE_READ, 0);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                _cleanup_free_ EFI_FILE_INFO *info = NULL;
+                err = get_file_info(handle, &info, NULL);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                if (info->FileSize == 0) /* Automatically skip over empty files */
+                        continue;
+
+                /* Grow the buffer by the file size, guarding against size_t overflow. */
+                size_t new_size, read_size = info->FileSize;
+                if (__builtin_add_overflow(size, read_size, &new_size))
+                        return EFI_OUT_OF_RESOURCES;
+                initrd = xrealloc(initrd, size, new_size);
+
+                /* Append this file directly behind the data read so far. */
+                err = chunked_read(handle, &read_size, initrd + size);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                /* Make sure the actual read size is what we expected. */
+                assert(size + read_size == new_size);
+                size = new_size;
+        }
+
+        /* The entry's own options go after the generated initrd= arguments. */
+        if (entry->options) {
+                _cleanup_free_ char16_t *o = options;
+                options = xasprintf("%ls %ls", o, entry->options);
+        }
+
+        *ret_options = TAKE_PTR(options);
+        *ret_initrd = TAKE_PTR(initrd);
+        *ret_initrd_size = size;
+        return EFI_SUCCESS;
+}
+
+/* Boots the given entry.
+ *
+ * First invokes the entry's special call hook, if it has one. If that returns, the loader binary
+ * is loaded via shim_load_image(), the initrd is prepared and registered, an optional devicetree
+ * is installed (only without secure boot), the load options are set, and the image is started.
+ * For LOADER_LINUX entries whose start fails with EFI_UNSUPPORTED, the kernel's compat entry
+ * point is tried as a fallback.
+ *
+ * parent_image - image handle passed on to shim_load_image() as the parent
+ * entry        - entry to boot
+ *
+ * Returns EFI_SUCCESS if the started image returned successfully, otherwise the (logged) error. */
+static EFI_STATUS image_start(
+                EFI_HANDLE parent_image,
+                const BootEntry *entry) {
+
+        _cleanup_(devicetree_cleanup) struct devicetree_state dtstate = {};
+        _cleanup_(unload_imagep) EFI_HANDLE image = NULL;
+        _cleanup_free_ EFI_DEVICE_PATH *path = NULL;
+        EFI_STATUS err;
+
+        assert(entry);
+
+        /* If this loader entry has a special way to boot, try that first. */
+        /* The hook's result is deliberately ignored: if it returns, we go on with the regular path. */
+        if (entry->call)
+                (void) entry->call();
+
+        _cleanup_(file_closep) EFI_FILE *image_root = NULL;
+        err = open_volume(entry->device, &image_root);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error opening root path: %m");
+
+        err = make_file_device_path(entry->device, entry->loader, &path);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error making file device path: %m");
+
+        /* Initialized here because these are only relied upon to be set by initrd_prepare() when it
+         * actually produces data. */
+        size_t initrd_size = 0;
+        _cleanup_free_ void *initrd = NULL;
+        _cleanup_free_ char16_t *options_initrd = NULL;
+        err = initrd_prepare(image_root, entry, &options_initrd, &initrd, &initrd_size);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error preparing initrd: %m");
+
+        err = shim_load_image(parent_image, path, &image);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error loading %ls: %m", entry->loader);
+
+        /* DTBs are loaded by the kernel before ExitBootServices, and they can be used to map and assign
+         * arbitrary memory ranges, so skip them when secure boot is enabled as the DTB here is unverified.
+         */
+        if (entry->devicetree && !secure_boot_enabled()) {
+                err = devicetree_install(&dtstate, image_root, entry->devicetree);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Error loading %ls: %m", entry->devicetree);
+        }
+
+        _cleanup_(cleanup_initrd) EFI_HANDLE initrd_handle = NULL;
+        err = initrd_register(initrd, initrd_size, &initrd_handle);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error registering initrd: %m");
+
+        EFI_LOADED_IMAGE_PROTOCOL *loaded_image;
+        err = BS->HandleProtocol(image, MAKE_GUID_PTR(EFI_LOADED_IMAGE_PROTOCOL), (void **) &loaded_image);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error getting LoadedImageProtocol handle: %m");
+
+        /* If we had to append an initrd= entry to the command line, we have to pass it, and measure it.
+         * Otherwise, only pass/measure it if it is not implicit anyway (i.e. embedded into the UKI or
+         * so). */
+        char16_t *options = options_initrd ?: entry->options_implied ? NULL : entry->options;
+        if (options) {
+                loaded_image->LoadOptions = options;
+                loaded_image->LoadOptionsSize = strsize16(options);
+
+                /* Try to log any options to the TPM, especially to catch manually edited options */
+                (void) tpm_log_load_options(options, NULL);
+        }
+
+        /* Record the hand-over time right before starting the image. */
+        efivar_set_time_usec(MAKE_GUID_PTR(LOADER), u"LoaderTimeExecUSec", 0);
+        err = BS->StartImage(image, NULL, NULL);
+        /* We only get here if the started image returned to us. */
+        graphics_mode(false);
+        if (err == EFI_SUCCESS)
+                return EFI_SUCCESS;
+
+        /* Try calling the kernel compat entry point if one exists. */
+        if (err == EFI_UNSUPPORTED && entry->type == LOADER_LINUX) {
+                uint32_t compat_address;
+
+                err = pe_kernel_info(loaded_image->ImageBase, &compat_address);
+                if (err != EFI_SUCCESS) {
+                        /* EFI_UNSUPPORTED here falls through to the generic failure message below. */
+                        if (err != EFI_UNSUPPORTED)
+                                return log_error_status(err, "Error finding kernel compat entry address: %m");
+                } else if (compat_address > 0) {
+                        /* The compat entry point is given as an offset from the image base. */
+                        EFI_IMAGE_ENTRY_POINT kernel_entry =
+                                (EFI_IMAGE_ENTRY_POINT) ((uint8_t *) loaded_image->ImageBase + compat_address);
+
+                        err = kernel_entry(image, ST);
+                        graphics_mode(false);
+                        if (err == EFI_SUCCESS)
+                                return EFI_SUCCESS;
+                } else
+                        err = EFI_UNSUPPORTED;
+        }
+
+        return log_error_status(err, "Failed to execute %ls (%ls): %m", entry->title_show, entry->loader);
+}
+
+/* Releases every boot entry, the entry array itself, and the entry-selection strings owned by the
+ * configuration. The Config structure itself is not freed. */
+static void config_free(Config *config) {
+        assert(config);
+
+        BootEntry **list = config->entries;
+        for (size_t n = 0; n < config->n_entries; n++)
+                boot_entry_free(list[n]);
+        free(list);
+
+        free(config->entry_default_config);
+        free(config->entry_default_efivar);
+        free(config->entry_oneshot);
+        free(config->entry_saved);
+}
+
+/* Publishes the IDs of all discovered entries in the "LoaderEntries" EFI variable, as a sequence
+ * of NUL-terminated UTF-16 strings packed back to back. Failure to set the variable is ignored. */
+static void config_write_entries_to_variable(Config *config) {
+        assert(config);
+
+        /* First pass: work out how many bytes all IDs (with terminators) need. */
+        size_t total = 0;
+        for (size_t n = 0; n < config->n_entries; n++)
+                total += strsize16(config->entries[n]->id);
+
+        _cleanup_free_ char *blob = xmalloc(total);
+        char *cursor = blob;
+
+        /* Second pass: copy the IDs one after the other. */
+        for (size_t n = 0; n < config->n_entries; n++) {
+                const char16_t *id = config->entries[n]->id;
+
+                cursor = mempcpy(cursor, id, strsize16(id));
+        }
+
+        assert(cursor == blob + total);
+
+        /* Store the full list of discovered entries. */
+        (void) efivar_set_raw(MAKE_GUID_PTR(LOADER), u"LoaderEntries", blob, total, 0);
+}
+
+/* Exports the selected entry's ID in the volatile "LoaderEntrySelected" variable and keeps the
+ * non-volatile "LoaderEntryLastBooted" variable in sync with the saved-entry configuration:
+ * it is written when saved entries are in use and the ID changed, and removed when they are not.
+ * One-shot boots leave the non-volatile variable untouched. */
+static void save_selected_entry(const Config *config, const BootEntry *entry) {
+        assert(config);
+        assert(entry);
+        assert(entry->loader || !entry->call);
+
+        /* Always export the selected boot entry to the system in a volatile var. */
+        (void) efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderEntrySelected", entry->id, 0);
+
+        /* Do not save or delete if this was a oneshot boot. */
+        if (streq16(config->entry_oneshot, entry->id))
+                return;
+
+        bool remember =
+                config->use_saved_entry_efivar ||
+                (config->use_saved_entry && !config->entry_default_efivar);
+
+        if (!remember) {
+                /* Delete the non-volatile var if not needed. */
+                (void) efivar_unset(MAKE_GUID_PTR(LOADER), u"LoaderEntryLastBooted", EFI_VARIABLE_NON_VOLATILE);
+                return;
+        }
+
+        /* Avoid unnecessary NVRAM writes. */
+        if (!streq16(config->entry_saved, entry->id))
+                (void) efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderEntryLastBooted", entry->id, EFI_VARIABLE_NON_VOLATILE);
+}
+
+/* While the firmware is in secure boot setup mode, adds one "Enroll Secure Boot keys" entry for
+ * every subdirectory of \loader\keys. If enrollment is configured as if-safe or forced and a
+ * subdirectory named "auto" exists, enrollment from it is attempted right away.
+ *
+ * Returns EFI_SUCCESS if not in setup mode, if the keys directory does not exist, or once the
+ * directory has been fully enumerated; otherwise the error from opening or reading it. */
+static EFI_STATUS secure_boot_discover_keys(Config *config, EFI_FILE *root_dir) {
+        if (secure_boot_mode() != SECURE_BOOT_SETUP)
+                return EFI_SUCCESS;
+
+        /* the lack of a 'keys' directory is not fatal and is silently ignored */
+        _cleanup_(file_closep) EFI_FILE *keys_dir = NULL;
+        EFI_STATUS err = open_directory(root_dir, u"\\loader\\keys", &keys_dir);
+        if (err == EFI_NOT_FOUND)
+                return EFI_SUCCESS;
+        if (err != EFI_SUCCESS)
+                return err;
+
+        for (;;) {
+                _cleanup_free_ EFI_FILE_INFO *item = NULL;
+                size_t item_size = 0;
+
+                err = readdir(keys_dir, &item, &item_size);
+                if (err != EFI_SUCCESS)
+                        return err;
+                if (!item)
+                        /* End of directory. */
+                        return EFI_SUCCESS;
+
+                /* Only non-hidden subdirectories are key sets. */
+                if (item->FileName[0] == '.' || !FLAGS_SET(item->Attribute, EFI_FILE_DIRECTORY))
+                        continue;
+
+                BootEntry *enroll = xnew(BootEntry, 1);
+                *enroll = (BootEntry) {
+                        .id = xasprintf("secure-boot-keys-%ls", item->FileName),
+                        .title = xasprintf("Enroll Secure Boot keys: %ls", item->FileName),
+                        .path = xasprintf("\\loader\\keys\\%ls", item->FileName),
+                        .type = LOADER_SECURE_BOOT_KEYS,
+                        .tries_done = -1,
+                        .tries_left = -1,
+                };
+                config_add_entry(config, enroll);
+
+                if (!strcaseeq16(item->FileName, u"auto"))
+                        continue;
+
+                if (IN_SET(config->secure_boot_enroll, ENROLL_IF_SAFE, ENROLL_FORCE))
+                        /* If we auto enroll successfully this call does not return.
+                         * If it fails we still want to add other potential entries to the menu. */
+                        secure_boot_enroll_at(root_dir, enroll->path, config->secure_boot_enroll == ENROLL_FORCE);
+        }
+
+        return EFI_SUCCESS;
+}
+
+/* Publishes information about this boot loader and the firmware in EFI variables under the
+ * LOADER GUID: init timestamp, loader name/version, firmware vendor and revision, UEFI revision,
+ * supported feature flags, the path of the loaded image, and (when it can be determined) the
+ * UUID of the partition the image was started from. */
+static void export_variables(
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image,
+                const char16_t *loaded_image_path,
+                uint64_t init_usec) {
+
+        static const uint64_t loader_features =
+                EFI_LOADER_FEATURE_CONFIG_TIMEOUT |
+                EFI_LOADER_FEATURE_CONFIG_TIMEOUT_ONE_SHOT |
+                EFI_LOADER_FEATURE_ENTRY_DEFAULT |
+                EFI_LOADER_FEATURE_ENTRY_ONESHOT |
+                EFI_LOADER_FEATURE_BOOT_COUNTING |
+                EFI_LOADER_FEATURE_XBOOTLDR |
+                EFI_LOADER_FEATURE_RANDOM_SEED |
+                EFI_LOADER_FEATURE_LOAD_DRIVER |
+                EFI_LOADER_FEATURE_SORT_KEY |
+                EFI_LOADER_FEATURE_SAVED_ENTRY |
+                EFI_LOADER_FEATURE_DEVICETREE |
+                EFI_LOADER_FEATURE_SECUREBOOT_ENROLL |
+                EFI_LOADER_FEATURE_RETAIN_SHIM |
+                EFI_LOADER_FEATURE_MENU_DISABLE |
+                0;
+
+        assert(loaded_image);
+
+        efivar_set_time_usec(MAKE_GUID_PTR(LOADER), u"LoaderTimeInitUSec", init_usec);
+        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderInfo", u"systemd-boot " GIT_VERSION, 0);
+
+        /* Firmware vendor plus revision, the latter split into its upper and lower 16 bits. */
+        _cleanup_free_ char16_t *firmware_info =
+                xasprintf("%ls %u.%02u", ST->FirmwareVendor, ST->FirmwareRevision >> 16, ST->FirmwareRevision & 0xffff);
+        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderFirmwareInfo", firmware_info, 0);
+
+        /* UEFI revision from the system table header, split the same way. */
+        _cleanup_free_ char16_t *firmware_type =
+                xasprintf("UEFI %u.%02u", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff);
+        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderFirmwareType", firmware_type, 0);
+
+        (void) efivar_set_uint64_le(MAKE_GUID_PTR(LOADER), u"LoaderFeatures", loader_features, 0);
+
+        /* the filesystem path to this image, to prevent adding ourselves to the menu */
+        efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderImageIdentifier", loaded_image_path, 0);
+
+        /* export the device path this image is started from */
+        _cleanup_free_ char16_t *part_uuid = disk_get_part_uuid(loaded_image->DeviceHandle);
+        if (part_uuid)
+                efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderDevicePartUUID", part_uuid, 0);
+}
+
+static void config_load_all_entries(
+                Config *config,
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image,
+                const char16_t *loaded_image_path,
+                EFI_FILE *root_dir) {
+        /* Fills 'config' with the defaults and every boot entry we can find, then publishes and titles them. */
+        assert(config);
+        assert(loaded_image);
+        assert(root_dir);
+
+        config_load_defaults(config, root_dir);
+
+        /* Scan /EFI/Linux/ directory */
+        config_load_type2_entries(config, loaded_image->DeviceHandle, root_dir);
+
+        /* Scan /loader/entries/\*.conf files */
+        config_load_type1_entries(config, loaded_image->DeviceHandle, root_dir, loaded_image_path);
+
+        /* Similar, but on any XBOOTLDR partition */
+        config_load_xbootldr(config, loaded_image->DeviceHandle);
+
+        /* Sort entries after version number. Everything added from here on is appended unsorted. */
+        sort_pointer_array((void **) config->entries, config->n_entries, (compare_pointer_func_t) boot_entry_compare);
+
+        /* If we find some well-known loaders, add them to the end of the list */
+        config_add_entry_osx(config);
+        config_add_entry_windows(config, loaded_image->DeviceHandle, root_dir);
+        config_add_entry_loader_auto(config, loaded_image->DeviceHandle, root_dir, NULL,
+                                     u"auto-efi-shell", 's', u"EFI Shell", u"\\shell" EFI_MACHINE_TYPE_NAME ".efi");
+        config_add_entry_loader_auto(config, loaded_image->DeviceHandle, root_dir, loaded_image_path,
+                                     u"auto-efi-default", '\0', u"EFI Default Loader", NULL);
+
+        if (config->auto_firmware && FLAGS_SET(get_os_indications_supported(), EFI_OS_INDICATIONS_BOOT_TO_FW_UI)) {
+                BootEntry *entry = xnew(BootEntry, 1);
+                *entry = (BootEntry) {
+                        .id = xstrdup16(u"auto-reboot-to-firmware-setup"),
+                        .title = xstrdup16(u"Reboot Into Firmware Interface"),
+                        .call = reboot_into_firmware, /* no .loader is set: run() invokes this callback directly */
+                        .tries_done = -1,
+                        .tries_left = -1,
+                };
+                config_add_entry(config, entry);
+        }
+
+        if (config->auto_poweroff) {
+                BootEntry *entry = xnew(BootEntry, 1);
+                *entry = (BootEntry) {
+                        .id = xstrdup16(u"auto-poweroff"),
+                        .title = xstrdup16(u"Power Off The System"),
+                        .call = poweroff_system,
+                        .tries_done = -1,
+                        .tries_left = -1,
+                };
+                config_add_entry(config, entry);
+        }
+
+        if (config->auto_reboot) {
+                BootEntry *entry = xnew(BootEntry, 1);
+                *entry = (BootEntry) {
+                        .id = xstrdup16(u"auto-reboot"),
+                        .title = xstrdup16(u"Reboot The System"),
+                        .call = reboot_system,
+                        .tries_done = -1,
+                        .tries_left = -1,
+                };
+                config_add_entry(config, entry);
+        }
+
+        /* Find secure boot signing keys and autoload them if configured.
+         * Otherwise, create menu entries so that the user can load them manually.
+         * If the secure-boot-enroll variable is set to no (the default), we do not
+         * even search for keys on the ESP */
+        if (config->secure_boot_enroll != ENROLL_OFF)
+                secure_boot_discover_keys(config, root_dir);
+
+        if (config->n_entries == 0) /* nothing to publish, title or select */
+                return;
+
+        config_write_entries_to_variable(config);
+
+        generate_boot_entry_titles(config);
+
+        /* Select entry by configured pattern or EFI LoaderDefaultEntry= variable */
+        config_select_default_entry(config);
+}
+
+static EFI_STATUS discover_root_dir(EFI_LOADED_IMAGE_PROTOCOL *loaded_image, EFI_FILE **ret_dir) {
+        /* Opens the root dir: via vmm_open() if is_direct_boot() says so, else the volume of our device. */
+        if (!is_direct_boot(loaded_image->DeviceHandle))
+                return open_volume(loaded_image->DeviceHandle, ret_dir);
+        return vmm_open(&loaded_image->DeviceHandle, ret_dir);
+}
+
+static EFI_STATUS run(EFI_HANDLE image) {
+        EFI_LOADED_IMAGE_PROTOCOL *loaded_image;
+        _cleanup_(file_closep) EFI_FILE *root_dir = NULL;
+        _cleanup_(config_free) Config config = {};
+        _cleanup_free_ char16_t *loaded_image_path = NULL;
+        EFI_STATUS err;
+        uint64_t init_usec;
+        bool menu = false;
+        /* Boot loader main logic: export our state, collect all entries, then show the menu and/or boot. */
+        init_usec = time_usec(); /* handed to export_variables() below, which stores it as LoaderTimeInitUSec */
+
+        /* Ask Shim to leave its protocol around, so that the stub can use it to validate PEs.
+         * By default, Shim uninstalls its protocol when calling StartImage(). */
+        shim_retain_protocol();
+
+        err = BS->HandleProtocol(image, MAKE_GUID_PTR(EFI_LOADED_IMAGE_PROTOCOL), (void **) &loaded_image);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error getting a LoadedImageProtocol handle: %m");
+
+        (void) device_path_to_str(loaded_image->FilePath, &loaded_image_path); /* error ignored; presumably leaves the path NULL then (confirm) */
+
+        export_variables(loaded_image, loaded_image_path, init_usec);
+
+        err = discover_root_dir(loaded_image, &root_dir);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Unable to open root directory: %m");
+
+        (void) load_drivers(image, loaded_image, root_dir);
+
+        config_load_all_entries(&config, loaded_image, loaded_image_path, root_dir);
+
+        if (config.n_entries == 0)
+                return log_error_status(
+                                EFI_NOT_FOUND,
+                                "No loader found. Configuration files in \\loader\\entries\\*.conf are needed.");
+
+        /* select entry or show menu when key is pressed or timeout is set */
+        if (config.force_menu || !IN_SET(config.timeout_sec, TIMEOUT_MENU_HIDDEN, TIMEOUT_MENU_DISABLED))
+                menu = true;
+        else if (config.timeout_sec != TIMEOUT_MENU_DISABLED) { /* i.e. the timeout is TIMEOUT_MENU_HIDDEN */
+                uint64_t key;
+
+                /* Block up to 100ms to give firmware time to get input working. */
+                err = console_key_read(&key, 100 * 1000);
+                if (err == EFI_SUCCESS) {
+                        /* find matching key in boot entries */
+                        size_t idx = entry_lookup_key(&config, config.idx_default, KEYCHAR(key));
+                        if (idx != IDX_INVALID)
+                                config.idx_default = idx;
+                        else
+                                menu = true; /* a key was pressed, but it selects no entry: show the menu */
+                }
+        }
+
+        for (;;) {
+                BootEntry *entry;
+
+                entry = config.entries[config.idx_default];
+                if (menu) {
+                        efivar_set_time_usec(MAKE_GUID_PTR(LOADER), u"LoaderTimeMenuUSec", 0);
+                        if (!menu_run(&config, &entry, loaded_image_path))
+                                return EFI_SUCCESS; /* menu_run() returned false: leave without booting anything */
+                }
+
+                /* if auto enrollment is activated, we try to load keys for the given entry. */
+                if (entry->type == LOADER_SECURE_BOOT_KEYS && config.secure_boot_enroll != ENROLL_OFF) {
+                        err = secure_boot_enroll_at(root_dir, entry->path, /*force=*/ true);
+                        if (err != EFI_SUCCESS)
+                                return err;
+                        continue; /* enrollment returned successfully: start the next round of the loop */
+                }
+
+                /* Run special entry like "reboot" now. Those that have a loader
+                 * will be handled by image_start() instead. */
+                if (entry->call && !entry->loader) {
+                        entry->call();
+                        continue; /* only reached if the callback returned */
+                }
+
+                (void) boot_entry_bump_counters(entry);
+                save_selected_entry(&config, entry);
+
+                /* Optionally, read a random seed off the ESP and pass it to the OS */
+                (void) process_random_seed(root_dir);
+
+                err = image_start(image, entry);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                menu = true; /* image_start() came back with success: show the menu from now on */
+                config.timeout_sec = 0; /* NOTE(review): presumably turns off the countdown; confirm against menu_run() */
+        }
+}
+
+DEFINE_EFI_MAIN_FUNCTION(run, "systemd-boot", /*wait_for_debugger=*/false);
diff --git a/src/boot/efi/console.c b/src/boot/efi/console.c
new file mode 100644
index 0000000..067ee7c
--- /dev/null
+++ b/src/boot/efi/console.c
@@ -0,0 +1,312 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "console.h"
+#include "proto/graphics-output.h"
+#include "util.h"
+
+#define SYSTEM_FONT_WIDTH 8 /* width and height assumed for one text cell when get_auto_mode() estimates the text area */
+#define SYSTEM_FONT_HEIGHT 19
+#define HORIZONTAL_MAX_OK 1920 /* up to this resolution (Full HD) get_auto_mode() keeps the current mode as-is */
+#define VERTICAL_MAX_OK 1080
+#define VIEWPORT_RATIO 10 /* on larger screens, screen area / text area must stay below this to keep the mode */
+
+static void event_closep(EFI_EVENT *event) {
+        /* Destructor for use with _cleanup_(): closes the event, unless none was ever stored in
+         * the variable. */
+        if (*event)
+                BS->CloseEvent(*event);
+}
+
+/*
+ * Reading input from the console sounds like an easy task to do, but thanks to broken
+ * firmware it is actually a nightmare.
+ *
+ * There is a SimpleTextInput and SimpleTextInputEx API for this. Ideally we want to use
+ * TextInputEx, because that gives us Ctrl/Alt/Shift key state information. Unfortunately,
+ * it is not always available and sometimes just non-functional.
+ *
+ * On some firmware, calling ReadKeyStroke or ReadKeyStrokeEx on the default console input
+ * device will just freeze no matter what (even though it *reported* being ready).
+ * Also, multiple input protocols can be backed by the same device, but they can be out of
+ * sync. Falling back on a different protocol can end up with double input.
+ *
+ * Therefore, we will preferably use TextInputEx for ConIn if that is available. Additionally,
+ * we look for the first TextInputEx device the firmware gives us as a fallback option. It
+ * will replace ConInEx permanently if it ever reports a key press.
+ * Lastly, a timer event allows us to provide an input timeout without having to call into
+ * any input functions that can freeze on us or using a busy/stall loop. */
+EFI_STATUS console_key_read(uint64_t *key, uint64_t timeout_usec) {
+        static EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL *conInEx = NULL, *extraInEx = NULL; /* remembered across calls */
+        static bool checked = false; /* the protocol lookup below is done on the first call only */
+        size_t index; /* filled in by WaitForEvent(): position in events[] of the event that fired */
+        EFI_STATUS err;
+        _cleanup_(event_closep) EFI_EVENT timer = NULL;
+        /* On success *key is a KEYPRESS() value. Returns EFI_TIMEOUT if timeout_usec passed without input. */
+        assert(key);
+
+        if (!checked) {
+                /* Get the *first* TextInputEx device. */
+                err = BS->LocateProtocol(
+                                MAKE_GUID_PTR(EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL), NULL, (void **) &extraInEx);
+                if (err != EFI_SUCCESS || BS->CheckEvent(extraInEx->WaitForKeyEx) == EFI_INVALID_PARAMETER)
+                        /* If WaitForKeyEx fails here, the firmware pretends it talks this
+                         * protocol, but it really doesn't. */
+                        extraInEx = NULL;
+
+                /* Get the TextInputEx version of ST->ConIn. */
+                err = BS->HandleProtocol(
+                                ST->ConsoleInHandle,
+                                MAKE_GUID_PTR(EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL),
+                                (void **) &conInEx);
+                if (err != EFI_SUCCESS || BS->CheckEvent(conInEx->WaitForKeyEx) == EFI_INVALID_PARAMETER)
+                        conInEx = NULL;
+
+                if (conInEx == extraInEx) /* same device (or both unavailable): no separate fallback to watch */
+                        extraInEx = NULL;
+
+                checked = true;
+        }
+
+        err = BS->CreateEvent(EVT_TIMER, 0, NULL, NULL, &timer);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error creating timer event: %m");
+
+        EFI_EVENT events[] = {
+                timer,
+                conInEx ? conInEx->WaitForKeyEx : ST->ConIn->WaitForKey,
+                extraInEx ? extraInEx->WaitForKeyEx : NULL,
+        };
+        size_t n_events = extraInEx ? 3 : 2; /* leaves the trailing NULL slot out if there is no extra device */
+
+        /* Watchdog rearming loop in case the user never provides us with input or some
+         * broken firmware never returns from WaitForEvent. */
+        for (;;) {
+                uint64_t watchdog_timeout_sec = 5 * 60,
+                       watchdog_ping_usec = watchdog_timeout_sec / 2 * 1000 * 1000;
+
+                /* SetTimer expects 100ns units for some reason. */
+                err = BS->SetTimer(
+                                timer,
+                                TimerRelative,
+                                MIN(timeout_usec, watchdog_ping_usec) * 10);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Error arming timer event: %m");
+
+                (void) BS->SetWatchdogTimer(watchdog_timeout_sec, 0x10000, 0, NULL);
+                err = BS->WaitForEvent(n_events, events, &index);
+                (void) BS->SetWatchdogTimer(watchdog_timeout_sec, 0x10000, 0, NULL); /* rearm once more after the wait */
+
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Error waiting for events: %m");
+
+                /* We have keyboard input, process it after this loop. */
+                if (timer != events[index])
+                        break;
+
+                /* The EFI timer fired instead. If this was a watchdog timeout, loop again. */
+                if (timeout_usec == UINT64_MAX) /* UINT64_MAX means: never time out */
+                        continue;
+                else if (timeout_usec > watchdog_ping_usec) {
+                        timeout_usec -= watchdog_ping_usec; /* account for the slice we just waited */
+                        continue;
+                }
+
+                /* The caller requested a timeout? They shall have one! */
+                return EFI_TIMEOUT;
+        }
+
+        /* If the extra input device we found returns something, always use that instead
+         * to work around broken firmware freezing on ConIn/ConInEx. */
+        if (extraInEx && BS->CheckEvent(extraInEx->WaitForKeyEx) == EFI_SUCCESS) {
+                conInEx = extraInEx;
+                extraInEx = NULL;
+        }
+
+        /* Do not fall back to ConIn if we have a ConIn that supports TextInputEx.
+         * The two may be out of sync on some firmware, giving us double input. */
+        if (conInEx) {
+                EFI_KEY_DATA keydata;
+                uint32_t shift = 0;
+
+                err = conInEx->ReadKeyStrokeEx(conInEx, &keydata);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                if (FLAGS_SET(keydata.KeyState.KeyShiftState, EFI_SHIFT_STATE_VALID)) {
+                        /* Do not distinguish between left and right keys (set both flags). */
+                        if (keydata.KeyState.KeyShiftState & EFI_CONTROL_PRESSED)
+                                shift |= EFI_CONTROL_PRESSED;
+                        if (keydata.KeyState.KeyShiftState & EFI_ALT_PRESSED)
+                                shift |= EFI_ALT_PRESSED;
+                        if (keydata.KeyState.KeyShiftState & EFI_LOGO_PRESSED)
+                                shift |= EFI_LOGO_PRESSED;
+
+                        /* Shift is not supposed to be reported for keys that can be represented as uppercase
+                         * unicode chars (Shift+f is reported as F instead). Some firmware does it anyway, so
+                         * filter those out. */
+                        if ((keydata.KeyState.KeyShiftState & EFI_SHIFT_PRESSED) &&
+                            keydata.Key.UnicodeChar == 0)
+                                shift |= EFI_SHIFT_PRESSED;
+                }
+
+                /* 32 bit modifier keys + 16 bit scan code + 16 bit unicode */
+                *key = KEYPRESS(shift, keydata.Key.ScanCode, keydata.Key.UnicodeChar);
+                return EFI_SUCCESS;
+        } else if (BS->CheckEvent(ST->ConIn->WaitForKey) == EFI_SUCCESS) {
+                EFI_INPUT_KEY k;
+
+                err = ST->ConIn->ReadKeyStroke(ST->ConIn, &k);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                *key = KEYPRESS(0, k.ScanCode, k.UnicodeChar); /* plain ConIn gives us no modifier state */
+                return EFI_SUCCESS;
+        }
+
+        return EFI_NOT_READY; /* we were woken up, but ConIn has no key for us */
+}
+
+static EFI_STATUS change_mode(int64_t mode) {
+        EFI_STATUS status;
+        int32_t previous;
+
+        /* Switches the console to 'mode'. If that fails, tries to get back to a working mode and
+         * returns the error of the failed switch. */
+
+        /* SetMode expects a size_t, so squeeze both values into the range of valid mode numbers. */
+        mode = CLAMP(mode, CONSOLE_MODE_RANGE_MIN, CONSOLE_MODE_RANGE_MAX);
+        previous = MAX(CONSOLE_MODE_RANGE_MIN, ST->ConOut->Mode->Mode);
+
+        log_wait();
+        status = ST->ConOut->SetMode(ST->ConOut, mode);
+        if (status != EFI_SUCCESS &&
+            ST->ConOut->SetMode(ST->ConOut, previous) != EFI_SUCCESS) {
+                /* Not even the previous mode works anymore: reset the device, use the lowest mode. */
+                ST->ConOut->Reset(ST->ConOut, true);
+                ST->ConOut->SetMode(ST->ConOut, CONSOLE_MODE_RANGE_MIN);
+        }
+
+        return status;
+}
+
+EFI_STATUS query_screen_resolution(uint32_t *ret_w, uint32_t *ret_h) {
+        EFI_GRAPHICS_OUTPUT_PROTOCOL *gop;
+        EFI_STATUS err;
+        /* Reports the resolution stored in Mode->Info of the graphics output protocol the firmware hands us. */
+        err = BS->LocateProtocol(MAKE_GUID_PTR(EFI_GRAPHICS_OUTPUT_PROTOCOL), NULL, (void **) &gop);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        if (!(gop->Mode && gop->Mode->Info))
+                return EFI_DEVICE_ERROR;
+
+        *ret_w = gop->Mode->Info->HorizontalResolution;
+        *ret_h = gop->Mode->Info->VerticalResolution;
+        return EFI_SUCCESS;
+}
+
+static int64_t get_auto_mode(void) {
+        uint32_t width, height;
+
+        /* Chooses the mode for CONSOLE_MODE_AUTO: the current mode if it looks usable on this screen,
+         * otherwise a guess at a better one. */
+
+        if (query_screen_resolution(&width, &height) == EFI_SUCCESS) {
+                bool readable;
+
+                /* Anything up to Full HD (1920x1080) is assumed to be a good mode already, which we
+                 * should not try to change. */
+                if (width <= HORIZONTAL_MAX_OK && height <= VERTICAL_MAX_OK)
+                        readable = true;
+                else {
+                        /* On larger screens compare the total screen area with the area covered by
+                         * the text viewport. As long as the screen is less than 10 times bigger,
+                         * assume the text is readable and keep the text mode. */
+                        size_t columns, rows;
+                        uint64_t pixels, text_pixels;
+
+                        console_query_mode(&columns, &rows);
+                        pixels = (uint64_t) width * (uint64_t) height;
+                        text_pixels = SYSTEM_FONT_WIDTH * SYSTEM_FONT_HEIGHT * (uint64_t) columns * (uint64_t) rows;
+
+                        readable = text_pixels != 0 && pixels / text_pixels < VIEWPORT_RATIO;
+                }
+
+                if (readable)
+                        return ST->ConOut->Mode->Mode;
+        }
+
+        /* Getting here means the resolution is unknown, or we have a high resolution screen on which
+         * the text viewport covers less than 10% of the area, i.e. the firmware developer screwed
+         * up. Try to switch to a better mode. Mode number 2 is the first non-standard mode, which is
+         * provided by the device manufacturer, so it should be a good mode.
+         * Note: MaxMode is the number of modes, not the last mode. */
+        if (ST->ConOut->Mode->MaxMode > CONSOLE_MODE_FIRMWARE_FIRST)
+                return CONSOLE_MODE_FIRMWARE_FIRST;
+
+        /* Otherwise prefer a mode other than zero (assume the user requests auto mode due to some
+         * problem with mode zero), and only then settle for the mandatory one. */
+        if (ST->ConOut->Mode->MaxMode > CONSOLE_MODE_80_50)
+                return CONSOLE_MODE_80_50;
+
+        return CONSOLE_MODE_80_25;
+}
+
+EFI_STATUS console_set_mode(int64_t mode) {
+        /* Resolves our symbolic mode requests to concrete firmware modes; any other value is handed
+         * to change_mode() as-is. */
+        /* Keep the current mode, unless the firmware indicates it is invalid: then change it anyway. */
+        if (mode == CONSOLE_MODE_KEEP)
+                return ST->ConOut->Mode->Mode < CONSOLE_MODE_RANGE_MIN ? change_mode(CONSOLE_MODE_RANGE_MIN) : EFI_SUCCESS;
+
+        if (mode == CONSOLE_MODE_AUTO)
+                return change_mode(get_auto_mode());
+
+        /* Note: MaxMode is the number of modes, not the last mode. */
+        if (mode == CONSOLE_MODE_FIRMWARE_MAX)
+                return change_mode(ST->ConOut->Mode->MaxMode - 1LL);
+
+        if (mode != CONSOLE_MODE_NEXT)
+                return change_mode(mode);
+
+        if (ST->ConOut->Mode->MaxMode <= CONSOLE_MODE_RANGE_MIN)
+                return EFI_UNSUPPORTED;
+
+        /* Cycle forward from the current mode, skipping the ones that are broken/unsupported. */
+        mode = MAX(CONSOLE_MODE_RANGE_MIN, ST->ConOut->Mode->Mode);
+        for (;;) {
+                mode = (mode + 1) % ST->ConOut->Mode->MaxMode;
+                if (change_mode(mode) == EFI_SUCCESS)
+                        break;
+
+                /* If mode is 0 again, we wrapped around and should stop. */
+                if (mode <= CONSOLE_MODE_RANGE_MIN)
+                        break;
+        }
+
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS console_query_mode(size_t *x_max, size_t *y_max) {
+        EFI_STATUS err;
+
+        assert(x_max);
+        assert(y_max);
+
+        /* Returns the columns/rows of the current text mode. If the firmware refuses to tell, the
+         * QueryMode() error is still returned, but the outputs are filled with the dimensions the
+         * UEFI spec mandates for the standard modes. */
+
+        err = ST->ConOut->QueryMode(ST->ConOut, ST->ConOut->Mode->Mode, x_max, y_max);
+        if (err == EFI_SUCCESS)
+                return EFI_SUCCESS;
+
+        /* Only mode 1 is 80x50; everything else is treated like the mandatory 80x25 mode. */
+        *x_max = 80;
+        if (ST->ConOut->Mode->Mode == CONSOLE_MODE_80_50)
+                *y_max = 50;
+        else
+                *y_max = 25;
+
+        return err;
+}
diff --git a/src/boot/efi/console.h b/src/boot/efi/console.h
new file mode 100644
index 0000000..c4d821a
--- /dev/null
+++ b/src/boot/efi/console.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+#include "proto/simple-text-io.h"
+
+enum { /* Each mask combines the left and the right variant of a modifier key. */
+        EFI_SHIFT_PRESSED   = EFI_RIGHT_SHIFT_PRESSED|EFI_LEFT_SHIFT_PRESSED,
+        EFI_CONTROL_PRESSED = EFI_RIGHT_CONTROL_PRESSED|EFI_LEFT_CONTROL_PRESSED,
+        EFI_ALT_PRESSED     = EFI_RIGHT_ALT_PRESSED|EFI_LEFT_ALT_PRESSED,
+        EFI_LOGO_PRESSED    = EFI_RIGHT_LOGO_PRESSED|EFI_LEFT_LOGO_PRESSED,
+};
+
+#define KEYPRESS(keys, scan, uni) ((((uint64_t) (keys)) << 32) | (((uint64_t) (scan)) << 16) | (uni)) /* 32 bit modifiers | 16 bit scan code | 16 bit unicode; arguments are parenthesized so the casts apply to whole expressions */
+#define KEYCHAR(k) ((char16_t)(k)) /* the unicode character (lowest 16 bits) of a KEYPRESS() value */
+#define CHAR_CTRL(c) ((c) - 'a' + 1) /* control character belonging to a lowercase letter, e.g. 'a' -> 1 */
+
+enum {
+        /* Console mode is an int32_t in EFI. We use int64_t to make room for our special values. */
+        CONSOLE_MODE_RANGE_MIN = 0,
+        CONSOLE_MODE_RANGE_MAX = INT32_MAX, /* This is just the theoretical limit. */
+        CONSOLE_MODE_INVALID = -1,          /* UEFI uses -1 if the device is not in a valid text mode. */
+
+        CONSOLE_MODE_80_25 = 0,             /* 80x25 is required by UEFI spec. */
+        CONSOLE_MODE_80_50 = 1,             /* 80x50 may be supported. */
+        CONSOLE_MODE_FIRMWARE_FIRST = 2,    /* First custom mode, if supported. */
+
+        /* These are our own mode values that map to concrete values at runtime. */
+        CONSOLE_MODE_KEEP = CONSOLE_MODE_RANGE_MAX + 1LL, /* Stay in the current mode, unless it is invalid. */
+        CONSOLE_MODE_NEXT,         /* Cycle to the next mode that can be set. */
+        CONSOLE_MODE_AUTO,         /* Let console_set_mode() pick a mode based on the screen resolution. */
+        CONSOLE_MODE_FIRMWARE_MAX, /* 'max' in config. */
+};
+
+EFI_STATUS console_key_read(uint64_t *key, uint64_t timeout_usec); /* timeout_usec == UINT64_MAX: wait without timeout */
+EFI_STATUS console_set_mode(int64_t mode); /* takes concrete mode numbers as well as the special CONSOLE_MODE_* values */
+EFI_STATUS console_query_mode(size_t *x_max, size_t *y_max); /* on error the outputs are still set, to 80x25 or 80x50 */
+EFI_STATUS query_screen_resolution(uint32_t *ret_width, uint32_t *ret_height); /* outputs are only set on success */
diff --git a/src/boot/efi/cpio.c b/src/boot/efi/cpio.c
new file mode 100644
index 0000000..5b90e17
--- /dev/null
+++ b/src/boot/efi/cpio.c
@@ -0,0 +1,512 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cpio.h"
+#include "device-path-util.h"
+#include "measure.h"
+#include "proto/device-path.h"
+#include "util.h"
+
+static char *write_cpio_word(char *p, uint32_t v) {
+        static const char digits[] = "0123456789abcdef";
+
+        assert(p);
+
+        /* Writes a CPIO header field: 'v' as 8 lowercase hex characters, highest nibble first, no NUL */
+
+        for (int shift = 28; shift >= 0; shift -= 4)
+                *(p++) = digits[(v >> shift) & 0xF];
+
+        return p;
+}
+
+static char *mangle_filename(char *p, const char16_t *f) {
+        assert(p);
+        assert(f);
+
+        /* Narrows the UTF-16 file name to plain ASCII while copying it to 'p', and terminates it with a
+         * NUL byte. Non-ASCII file names were filtered out beforehand, hence this operation is always
+         * safe. Returns the position right behind the NUL byte. */
+
+        while (*f != 0) {
+                assert(*f <= 0x7fu);
+                *p = *f;
+                p++;
+                f++;
+        }
+
+        *p = 0;
+        return p + 1;
+}
+
+static char *pad4(char *p, const char *start) {
+        assert(p);
+        assert(start);
+        assert(p >= start);
+
+        /* Zero-fills from 'p' on until its offset from 'start' is divisible by 4; returns the new end */
+
+        for (size_t missing = (4 - (size_t) (p - start) % 4) % 4; missing > 0; missing--)
+                *(p++) = 0;
+
+        return p;
+}
+
+static EFI_STATUS pack_cpio_one(
+                const char16_t *fname,
+                const void *contents,
+                size_t contents_size,
+                const char *target_dir_prefix,
+                uint32_t access_mode,
+                uint32_t *inode_counter,
+                void **cpio_buffer,
+                size_t *cpio_buffer_size) {
+
+        size_t l, target_dir_prefix_size, fname_size, q;
+        char *a;
+
+        assert(fname);
+        assert(contents || contents_size == 0); /* a NULL buffer is only acceptable for empty files */
+        assert(target_dir_prefix);
+        assert(inode_counter);
+        assert(cpio_buffer);
+        assert(cpio_buffer_size);
+
+        /* Appends the regular file "<target_dir_prefix>/<fname>" to *cpio_buffer in the cpio format
+         * understood by the kernel initrd logic, advancing *cpio_buffer_size and *inode_counter.
+         * See: https://docs.kernel.org/driver-api/early-userspace/buffer-format.html */
+
+        if (contents_size > UINT32_MAX) /* cpio cannot deal with > 32-bit file sizes */
+                return EFI_LOAD_ERROR;
+
+        if (*inode_counter == UINT32_MAX) /* more than 2^32-1 inodes? yikes. cpio doesn't support that either */
+                return EFI_OUT_OF_RESOURCES;
+
+        l = 6 + 13*8 + 1 + 1; /* Fixed CPIO header size, slash separator, and NUL byte after the file name */
+
+        target_dir_prefix_size = strlen8(target_dir_prefix);
+        if (l > SIZE_MAX - target_dir_prefix_size)
+                return EFI_OUT_OF_RESOURCES;
+        l += target_dir_prefix_size;
+
+        fname_size = strlen16(fname);
+        if (l > SIZE_MAX - fname_size)
+                return EFI_OUT_OF_RESOURCES;
+        l += fname_size; /* append space for file name */
+
+        /* CPIO can't deal with fnames longer than 2^32-1, counting the slash and NUL byte we add below */
+        if (target_dir_prefix_size + fname_size > UINT32_MAX - 2)
+                return EFI_OUT_OF_RESOURCES;
+
+        /* Align the whole header to 4 byte size */
+        l = ALIGN4(l);
+        if (l == SIZE_MAX) /* overflow check */
+                return EFI_OUT_OF_RESOURCES;
+
+        /* Align the contents to 4 byte size */
+        q = ALIGN4(contents_size);
+        if (q == SIZE_MAX) /* overflow check */
+                return EFI_OUT_OF_RESOURCES;
+
+        if (l > SIZE_MAX - q) /* overflow check */
+                return EFI_OUT_OF_RESOURCES;
+        l += q; /* Add contents to header */
+
+        if (*cpio_buffer_size > SIZE_MAX - l) /* overflow check */
+                return EFI_OUT_OF_RESOURCES;
+        a = xrealloc(*cpio_buffer, *cpio_buffer_size, *cpio_buffer_size + l);
+
+        *cpio_buffer = a;
+        a = (char *) *cpio_buffer + *cpio_buffer_size; /* write behind what is in the buffer already */
+
+        a = mempcpy(a, "070701", 6); /* magic ID */
+
+        a = write_cpio_word(a, (*inode_counter)++);                         /* inode */
+        a = write_cpio_word(a, access_mode | 0100000 /* = S_IFREG */);      /* mode */
+        a = write_cpio_word(a, 0);                                          /* uid */
+        a = write_cpio_word(a, 0);                                          /* gid */
+        a = write_cpio_word(a, 1);                                          /* nlink */
+
+        /* Note: we don't make any attempt to propagate the mtime here, for two reasons: it's a mess given
+         * that FAT usually is assumed to operate with timezoned timestamps, while UNIX does not. More
+         * importantly though: the modifications times would hamper our goals of providing stable
+         * measurements for the same boots. After all we extend the initrds we generate here into TPM2
+         * PCRs. */
+        a = write_cpio_word(a, 0);                                          /* mtime */
+        a = write_cpio_word(a, contents_size);                              /* size */
+        a = write_cpio_word(a, 0);                                          /* major(dev) */
+        a = write_cpio_word(a, 0);                                          /* minor(dev) */
+        a = write_cpio_word(a, 0);                                          /* major(rdev) */
+        a = write_cpio_word(a, 0);                                          /* minor(rdev) */
+        a = write_cpio_word(a, target_dir_prefix_size + fname_size + 2);    /* fname size */
+        a = write_cpio_word(a, 0);                                          /* "crc" */
+
+        a = mempcpy(a, target_dir_prefix, target_dir_prefix_size);
+        *(a++) = '/';
+        a = mangle_filename(a, fname);
+
+        /* Pad to next multiple of 4 */
+        a = pad4(a, *cpio_buffer);
+
+        a = mempcpy(a, contents, contents_size);
+
+        /* Pad to next multiple of 4 */
+        a = pad4(a, *cpio_buffer);
+
+        assert(a == (char *) *cpio_buffer + *cpio_buffer_size + l); /* we must have filled exactly what we reserved */
+        *cpio_buffer_size += l;
+
+        return EFI_SUCCESS;
+}
+
+static EFI_STATUS pack_cpio_dir(
+                const char *path,
+                uint32_t access_mode,
+                uint32_t *inode_counter,
+                void **cpio_buffer,
+                size_t *cpio_buffer_size) {
+
+        size_t l, path_size;
+        char *a;
+
+        assert(path);
+        assert(inode_counter);
+        assert(cpio_buffer);
+        assert(cpio_buffer_size);
+
+        /* Serializes one directory inode in cpio format. Note that cpio archives must first create the dirs
+         * they want to place files in. Appends to *cpio_buffer, advancing the size and inode counter. */
+
+        if (*inode_counter == UINT32_MAX) /* no inode number left that fits the 32-bit header field */
+                return EFI_OUT_OF_RESOURCES;
+
+        l = 6 + 13*8 + 1; /* Fixed CPIO header size, and NUL byte after the file name */
+
+        path_size = strlen8(path);
+        if (l > SIZE_MAX - path_size)
+                return EFI_OUT_OF_RESOURCES;
+        l += path_size;
+
+        /* Align the whole header to 4 byte size */
+        l = ALIGN4(l);
+        if (l == SIZE_MAX) /* overflow check */
+                return EFI_OUT_OF_RESOURCES;
+
+        if (*cpio_buffer_size > SIZE_MAX - l) /* overflow check */
+                return EFI_OUT_OF_RESOURCES;
+
+        *cpio_buffer = a = xrealloc(*cpio_buffer, *cpio_buffer_size, *cpio_buffer_size + l);
+        a = (char *) *cpio_buffer + *cpio_buffer_size; /* write behind what is in the buffer already */
+
+        a = mempcpy(a, "070701", 6); /* magic ID */
+
+        a = write_cpio_word(a, (*inode_counter)++);                         /* inode */
+        a = write_cpio_word(a, access_mode | 0040000 /* = S_IFDIR */);      /* mode */
+        a = write_cpio_word(a, 0);                                          /* uid */
+        a = write_cpio_word(a, 0);                                          /* gid */
+        a = write_cpio_word(a, 1);                                          /* nlink */
+        a = write_cpio_word(a, 0);                                          /* mtime */
+        a = write_cpio_word(a, 0);                                          /* size */
+        a = write_cpio_word(a, 0);                                          /* major(dev) */
+        a = write_cpio_word(a, 0);                                          /* minor(dev) */
+        a = write_cpio_word(a, 0);                                          /* major(rdev) */
+        a = write_cpio_word(a, 0);                                          /* minor(rdev) */
+        a = write_cpio_word(a, path_size + 1);                              /* fname size */
+        a = write_cpio_word(a, 0);                                          /* "crc" */
+
+        a = mempcpy(a, path, path_size + 1); /* the path including its terminating NUL byte */
+
+        /* Pad to next multiple of 4 */
+        a = pad4(a, *cpio_buffer);
+
+        assert(a == (char *) *cpio_buffer + *cpio_buffer_size + l); /* we must have filled exactly what we reserved */
+
+        *cpio_buffer_size += l;
+        return EFI_SUCCESS;
+}
+
+static EFI_STATUS pack_cpio_prefix(
+                const char *path,
+                uint32_t dir_mode,
+                uint32_t *inode_counter,
+                void **cpio_buffer,
+                size_t *cpio_buffer_size) {
+
+        const char *cursor, *slash;
+
+        assert(path);
+        assert(inode_counter);
+        assert(cpio_buffer);
+        assert(cpio_buffer_size);
+
+        /* Appends one cpio directory record per leading component of 'path', followed by a record for
+         * 'path' itself. Like "mkdir -p", every intermediate directory gets access mode 0555; only the
+         * final record uses the caller-supplied dir_mode. Returns the first pack_cpio_dir() failure, or
+         * EFI_OUT_OF_RESOURCES if duplicating a prefix yields NULL. */
+
+        for (cursor = path; (slash = strchr8(cursor, '/')); cursor = slash + 1) {
+                _cleanup_free_ char *parent = NULL;
+                EFI_STATUS err;
+
+                /* A slash right at the scan position (leading or doubled '/') delimits an empty
+                 * component: there is nothing new to emit for it. */
+                if (slash == cursor)
+                        continue;
+
+                /* Prefixes are always cut from the start of 'path', not from the scan position, so each
+                 * record carries the full path of the directory. */
+                parent = xstrndup8(path, slash - path);
+                if (!parent)
+                        return EFI_OUT_OF_RESOURCES;
+
+                err = pack_cpio_dir(parent, 0555, inode_counter, cpio_buffer, cpio_buffer_size);
+                if (err != EFI_SUCCESS)
+                        return err;
+        }
+
+        /* Finally the target directory itself, with the requested mode */
+        return pack_cpio_dir(path, dir_mode, inode_counter, cpio_buffer, cpio_buffer_size);
+}
+
+static EFI_STATUS pack_cpio_trailer(
+                void **cpio_buffer,
+                size_t *cpio_buffer_size) {
+
+        static const char trailer[] =
+                "070701"   /* magic ID */
+                "00000000" /* inode */
+                "00000000" /* mode */
+                "00000000" /* uid */
+                "00000000" /* gid */
+                "00000001" /* nlink */
+                "00000000" /* mtime */
+                "00000000" /* size */
+                "00000000" /* major(dev) */
+                "00000000" /* minor(dev) */
+                "00000000" /* major(rdev) */
+                "00000000" /* minor(rdev) */
+                "0000000B" /* fname size */
+                "00000000" /* "crc" */
+                "TRAILER!!!\0\0\0"; /* The implicit NUL that ends the string literal is the fourth NUL byte */
+        size_t offset;
+
+        assert(cpio_buffer);
+        assert(cpio_buffer_size);
+        assert_cc(sizeof(trailer) % 4 == 0);
+
+        /* Appends the trailer record that marks the end of our initrd cpio archive to the buffer */
+        offset = *cpio_buffer_size;
+        *cpio_buffer = xrealloc(*cpio_buffer, offset, offset + sizeof(trailer));
+        memcpy((uint8_t*) *cpio_buffer + offset, trailer, sizeof(trailer));
+        *cpio_buffer_size = offset + sizeof(trailer);
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS pack_cpio(
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image, /* image whose DeviceHandle is opened as volume */
+                const char16_t *dropin_dir,              /* directory to scan; NULL: derived via get_extra_dir() */
+                const char16_t *match_suffix,            /* if non-NULL, only names ending in it (case-insensitive) */
+                const char *target_dir_prefix,           /* archive directory given to pack_cpio_prefix()/pack_cpio_one() */
+                uint32_t dir_mode,                       /* access mode of the final prefix directory */
+                uint32_t access_mode,                    /* access mode given to pack_cpio_one() for each file */
+                uint32_t tpm_pcr,                        /* passed to tpm_log_event() */
+                const char16_t *tpm_description,         /* passed to tpm_log_event() */
+                void **ret_buffer,                       /* the archive, or NULL if there was nothing to pack */
+                size_t *ret_buffer_size,                 /* size of the archive, or 0 */
+                bool *ret_measured) {                    /* passed to tpm_log_event(); false if nothing to pack */
+
+        _cleanup_(file_closep) EFI_FILE *root = NULL, *extra_dir = NULL;
+        size_t dirent_size = 0, buffer_size = 0, n_items = 0, n_allocated = 0;
+        _cleanup_free_ char16_t *rel_dropin_dir = NULL;
+        _cleanup_free_ EFI_FILE_INFO *dirent = NULL;
+        _cleanup_(strv_freep) char16_t **items = NULL;
+        _cleanup_free_ void *buffer = NULL;
+        uint32_t inode = 1; /* inode counter, so that each item gets a new inode */
+        EFI_STATUS err;
+
+        assert(loaded_image);
+        assert(target_dir_prefix);
+        assert(ret_buffer);
+        assert(ret_buffer_size);
+
+        /* Collects the regular files of the drop-in directory into one cpio archive below
+         * target_dir_prefix, then measures the archive. Having nothing to pack counts as success. */
+        if (!loaded_image->DeviceHandle)
+                goto nothing;
+
+        err = open_volume(loaded_image->DeviceHandle, &root);
+        if (err == EFI_UNSUPPORTED)
+                /* Error will be unsupported if the bootloader doesn't implement the file system protocol on
+                 * its file handles. */
+                goto nothing;
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Unable to open root directory: %m");
+
+        if (!dropin_dir) {
+                dropin_dir = rel_dropin_dir = get_extra_dir(loaded_image->FilePath);
+                if (!dropin_dir)
+                        goto nothing;
+        }
+
+        err = open_directory(root, dropin_dir, &extra_dir);
+        if (err == EFI_NOT_FOUND)
+                /* No extra subdir, that's totally OK */
+                goto nothing;
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to open extra directory of loaded image: %m");
+
+        for (;;) {
+                _cleanup_free_ char16_t *d = NULL;
+
+                err = readdir(extra_dir, &dirent, &dirent_size);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to read extra directory of loaded image: %m");
+                if (!dirent) /* End of directory */
+                        break;
+
+                if (dirent->FileName[0] == '.')
+                        continue;
+                if (FLAGS_SET(dirent->Attribute, EFI_FILE_DIRECTORY))
+                        continue;
+                if (match_suffix && !endswith_no_case(dirent->FileName, match_suffix))
+                        continue;
+                if (!is_ascii(dirent->FileName))
+                        continue;
+                if (strlen16(dirent->FileName) > 255) /* Max filename size on Linux */
+                        continue;
+
+                d = xstrdup16(dirent->FileName);
+
+                if (n_items+2 > n_allocated) {
+                        /* We allocate 16 entries at a time, as a matter of optimization */
+                        if (n_items > (SIZE_MAX / sizeof(char16_t *)) - 16) /* Overflow check, in units of the array element */
+                                return log_oom();
+
+                        size_t m = n_items + 16;
+                        items = xrealloc(items, n_allocated * sizeof(char16_t *), m * sizeof(char16_t *));
+                        n_allocated = m;
+                }
+
+                items[n_items++] = TAKE_PTR(d);
+                items[n_items] = NULL; /* Let's always NUL terminate, to make freeing via strv_free() easy */
+        }
+
+        if (n_items == 0)
+                /* Empty directory */
+                goto nothing;
+
+        /* Now, sort the files we found, to make this uniform and stable (and to ensure the TPM measurements
+         * are not dependent on read order) */
+        sort_pointer_array((void**) items, n_items, (compare_pointer_func_t) strcmp16);
+
+        /* Generate the leading directory inodes right before adding the first files to the archive.
+         * Otherwise the cpio archive cannot be unpacked, since the leading dirs won't exist. */
+        err = pack_cpio_prefix(target_dir_prefix, dir_mode, &inode, &buffer, &buffer_size);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to pack cpio prefix: %m");
+
+        for (size_t i = 0; i < n_items; i++) {
+                _cleanup_free_ char *content = NULL;
+                size_t contentsize = 0;  /* avoid false maybe-uninitialized warning */
+
+                err = file_read(extra_dir, items[i], 0, 0, &content, &contentsize);
+                if (err != EFI_SUCCESS) {
+                        log_error_status(err, "Failed to read %ls, ignoring: %m", items[i]);
+                        continue;
+                }
+
+                err = pack_cpio_one(
+                                items[i],
+                                content, contentsize,
+                                target_dir_prefix,
+                                access_mode,
+                                &inode,
+                                &buffer, &buffer_size);
+                if (err != EFI_SUCCESS) /* 'dirent' is NULL once the directory scan has ended, so name the item */
+                        return log_error_status(err, "Failed to pack cpio file %ls: %m", items[i]);
+        }
+
+        err = pack_cpio_trailer(&buffer, &buffer_size);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to pack cpio trailer: %m");
+
+        err = tpm_log_event(
+                        tpm_pcr, POINTER_TO_PHYSICAL_ADDRESS(buffer), buffer_size, tpm_description, ret_measured);
+        if (err != EFI_SUCCESS) /* NOTE(review): the message says "ignoring", yet the error is returned */
+                return log_error_status(
+                                err, "Unable to add cpio TPM measurement for PCR %u (%ls), ignoring: %m",
+                                tpm_pcr, tpm_description);
+
+        *ret_buffer = TAKE_PTR(buffer);
+        *ret_buffer_size = buffer_size;
+
+        return EFI_SUCCESS;
+
+nothing:
+        *ret_buffer = NULL;
+        *ret_buffer_size = 0;
+
+        if (ret_measured)
+                *ret_measured = false;
+
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS pack_cpio_literal(
+                const void *data,
+                size_t data_size,
+                const char *target_dir_prefix,
+                const char16_t *target_filename,
+                uint32_t dir_mode,
+                uint32_t access_mode,
+                uint32_t tpm_pcr,
+                const char16_t *tpm_description,
+                void **ret_buffer,
+                size_t *ret_buffer_size,
+                bool *ret_measured) {
+
+        _cleanup_free_ void *archive = NULL;
+        uint32_t next_inode = 1; /* handed on to the pack helpers, so that each item gets a new inode */
+        size_t archive_size = 0;
+        EFI_STATUS status;
+
+        assert(data || data_size == 0);
+        assert(target_dir_prefix);
+        assert(target_filename);
+        assert(ret_buffer);
+        assert(ret_buffer_size);
+
+        /* Builds a cpio archive holding a single file with the given in-memory contents, passes it to
+         * tpm_log_event() and hands it to the caller. The archive is assembled in three steps:
+         * directory records, the file, the trailer. */
+
+        /* The leading directories have to precede the file, otherwise the archive cannot be unpacked
+         * because the directories would not exist yet. */
+        status = pack_cpio_prefix(target_dir_prefix, dir_mode, &next_inode, &archive, &archive_size);
+        if (status != EFI_SUCCESS)
+                return log_error_status(status, "Failed to pack cpio prefix: %m");
+
+        status = pack_cpio_one(
+                        target_filename, data, data_size, target_dir_prefix, access_mode,
+                        &next_inode, &archive, &archive_size);
+        if (status != EFI_SUCCESS)
+                return log_error_status(status, "Failed to pack cpio file %ls: %m", target_filename);
+
+        status = pack_cpio_trailer(&archive, &archive_size);
+        if (status != EFI_SUCCESS)
+                return log_error_status(status, "Failed to pack cpio trailer: %m");
+
+        /* Measure the finished archive; a failure here fails the whole call. */
+        status = tpm_log_event(
+                        tpm_pcr, POINTER_TO_PHYSICAL_ADDRESS(archive), archive_size, tpm_description, ret_measured);
+        if (status != EFI_SUCCESS)
+                return log_error_status(
+                                status,
+                                "Unable to add cpio TPM measurement for PCR %u (%ls), ignoring: %m",
+                                tpm_pcr,
+                                tpm_description);
+
+        /* Success: the archive is no longer freed on return, the caller gets it */
+        *ret_buffer = TAKE_PTR(archive);
+        *ret_buffer_size = archive_size;
+        return EFI_SUCCESS;
+}
diff --git a/src/boot/efi/cpio.h b/src/boot/efi/cpio.h
new file mode 100644
index 0000000..26851e3
--- /dev/null
+++ b/src/boot/efi/cpio.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+#include "proto/loaded-image.h"
+
+EFI_STATUS pack_cpio( /* packs the files of a drop-in directory into a cpio archive and measures it */
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image, /* image whose DeviceHandle is opened as volume */
+                const char16_t *dropin_dir,              /* directory to scan; NULL: derived from loaded_image->FilePath */
+                const char16_t *match_suffix,            /* if non-NULL, only names ending in it (case-insensitive) */
+                const char *target_dir_prefix,           /* directory records for this path precede the files */
+                uint32_t dir_mode,                       /* access mode of the final prefix directory */
+                uint32_t access_mode,                    /* access mode used for each packed file */
+                uint32_t tpm_pcr,                        /* passed to tpm_log_event() */
+                const char16_t *tpm_description,         /* passed to tpm_log_event() */
+                void **ret_buffer,                       /* the archive, or NULL if there was nothing to pack */
+                size_t *ret_buffer_size,                 /* size of the archive, or 0 */
+                bool *ret_measured);                     /* passed to tpm_log_event(); false if nothing to pack */
+
+EFI_STATUS pack_cpio_literal( /* same, for a single file whose contents are given in memory */
+                const void *data,                        /* file contents; may be NULL only if data_size is 0 */
+                size_t data_size,
+                const char *target_dir_prefix,           /* directory records for this path precede the file */
+                const char16_t *target_filename,         /* name handed to pack_cpio_one() */
+                uint32_t dir_mode,                       /* access mode of the final prefix directory */
+                uint32_t access_mode,                    /* access mode used for the file */
+                uint32_t tpm_pcr,                        /* passed to tpm_log_event() */
+                const char16_t *tpm_description,         /* passed to tpm_log_event() */
+                void **ret_buffer,                       /* the archive */
+                size_t *ret_buffer_size,                 /* size of the archive */
+                bool *ret_measured);                     /* passed to tpm_log_event() */
diff --git a/src/boot/efi/device-path-util.c b/src/boot/efi/device-path-util.c
new file mode 100644
index 0000000..2a85e8b
--- /dev/null
+++ b/src/boot/efi/device-path-util.c
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "device-path-util.h"
+#include "util.h"
+
+EFI_STATUS make_file_device_path(EFI_HANDLE device, const char16_t *file, EFI_DEVICE_PATH **ret_dp) {
+        EFI_DEVICE_PATH *device_dp, *terminator;
+        FILEPATH_DEVICE_PATH *file_node;
+        size_t prefix_size, name_size;
+        EFI_STATUS err;
+
+        assert(file);
+        assert(ret_dp);
+
+        /* Returns in *ret_dp a newly allocated device path: the nodes of the device path installed on
+         * 'device', then a file path node naming 'file', then an end node. If HandleProtocol() fails
+         * its error is returned and *ret_dp is left untouched. */
+        err = BS->HandleProtocol(device, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL), (void **) &device_dp);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Find the end node; only the nodes in front of it are copied. */
+        for (terminator = device_dp; !device_path_is_end(terminator);)
+                terminator = device_path_next_node(terminator);
+        prefix_size = (uint8_t *) terminator - (uint8_t *) device_dp;
+        name_size = strsize16(file);
+
+        *ret_dp = xmalloc(prefix_size + name_size + sizeof(FILEPATH_DEVICE_PATH) + sizeof(EFI_DEVICE_PATH));
+        file_node = mempcpy(*ret_dp, device_dp, prefix_size);
+        file_node->Header = (EFI_DEVICE_PATH) {
+                .Type = MEDIA_DEVICE_PATH,
+                .SubType = MEDIA_FILEPATH_DP,
+                .Length = sizeof(FILEPATH_DEVICE_PATH) + name_size,
+        };
+        memcpy(file_node->PathName, file, name_size);
+
+        *device_path_next_node((EFI_DEVICE_PATH *) file_node) = DEVICE_PATH_END_NODE;
+        return EFI_SUCCESS;
+}
+
+static char16_t *device_path_to_str_internal(const EFI_DEVICE_PATH *dp) {
+        const EFI_DEVICE_PATH *cur = dp;
+        char16_t *text = NULL;
+
+        /* Best-effort text rendering for firmware that lacks EFI_DEVICE_PATH_TO_TEXT_PROTOCOL. The string
+         * is rebuilt once per node: the previous version is prepended and then released. Yields NULL
+         * when the path consists of nothing but the end node. */
+
+        while (!device_path_is_end(cur)) {
+                _cleanup_free_ char16_t *prev = text; /* freed when this iteration is over */
+
+                if (cur->Type == END_DEVICE_PATH_TYPE && cur->SubType == END_INSTANCE_DEVICE_PATH_SUBTYPE)
+                        /* End-of-instance node: close the text so far with a ',' */
+                        text = xasprintf("%ls%s,", strempty(prev), prev ? "\\" : "");
+
+                else if (cur->Type == MEDIA_DEVICE_PATH && cur->SubType == MEDIA_FILEPATH_DP)
+                        /* File path nodes are printed as-is so that FilePath-only device path strings look
+                         * and behave nicely. */
+                        text = xasprintf("%ls%s%ls",
+                                         strempty(prev),
+                                         prev ? "\\" : "",
+                                         ((FILEPATH_DEVICE_PATH *) cur)->PathName);
+
+                else {
+                        /* Every other node uses the generic Path(type,subtype,hex-payload) form rather
+                         * than a dedicated syntax per type and sub-type. */
+                        size_t payload = cur->Length - sizeof(EFI_DEVICE_PATH);
+                        _cleanup_free_ char16_t *hex = hexdump((uint8_t *) cur + sizeof(EFI_DEVICE_PATH), payload);
+                        text = xasprintf("%ls%sPath(%u,%u%s%ls)", strempty(prev), prev ? "/" : "",
+                                         cur->Type, cur->SubType, payload == 0 ? "" : ",", hex);
+                }
+
+                cur = device_path_next_node(cur);
+        }
+
+        return text;
+}
+
+EFI_STATUS device_path_to_str(const EFI_DEVICE_PATH *dp, char16_t **ret) {
+        EFI_DEVICE_PATH_TO_TEXT_PROTOCOL *converter;
+        char16_t *text;
+        EFI_STATUS err;
+
+        assert(dp);
+        assert(ret);
+
+        /* Converts 'dp' to text, preferring the firmware's converter protocol and falling back to the
+         * built-in formatter when that protocol cannot be located. */
+        err = BS->LocateProtocol(MAKE_GUID_PTR(EFI_DEVICE_PATH_TO_TEXT_PROTOCOL), NULL, (void **) &converter);
+        if (err == EFI_SUCCESS) {
+                text = converter->ConvertDevicePathToText(dp, false, false);
+                if (!text)
+                        return EFI_OUT_OF_RESOURCES;
+        } else
+                text = device_path_to_str_internal(dp);
+
+        *ret = text;
+        return EFI_SUCCESS;
+}
+
+bool device_path_startswith(const EFI_DEVICE_PATH *dp, const EFI_DEVICE_PATH *start) {
+        /* True if the node sequence 'start' is a prefix of 'dp', comparing node by node (same length,
+         * same bytes). A NULL 'start' matches anything, a NULL 'dp' only a NULL 'start'. */
+        if (!start)
+                return true;
+        if (!dp)
+                return false;
+
+        while (!device_path_is_end(start)) {
+                if (device_path_is_end(dp) || start->Length != dp->Length ||
+                    memcmp(dp, start, start->Length) != 0)
+                        return false;
+                start = device_path_next_node(start);
+                dp = device_path_next_node(dp);
+        }
+
+        return true;
+}
+
+EFI_DEVICE_PATH *device_path_replace_node(
+                const EFI_DEVICE_PATH *path, const EFI_DEVICE_PATH *node, const EFI_DEVICE_PATH *new_node) {
+
+        size_t kept_size, added_size;
+        EFI_DEVICE_PATH *copy, *tail;
+
+        assert(path);
+        assert(node);
+
+        /* New path = the nodes of 'path' located before 'node' + 'new_node' (if given) + an end node. */
+        kept_size = (uint8_t *) node - (uint8_t *) path;
+        added_size = new_node ? new_node->Length : 0;
+        copy = xmalloc(kept_size + added_size + sizeof(EFI_DEVICE_PATH));
+        tail = mempcpy(copy, path, kept_size);
+        if (new_node)
+                tail = mempcpy(tail, new_node, added_size);
+        *tail = DEVICE_PATH_END_NODE;
+        return copy;
+}
diff --git a/src/boot/efi/device-path-util.h b/src/boot/efi/device-path-util.h
new file mode 100644
index 0000000..08f1a9c
--- /dev/null
+++ b/src/boot/efi/device-path-util.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "proto/device-path.h"
+
+EFI_STATUS make_file_device_path(EFI_HANDLE device, const char16_t *file, EFI_DEVICE_PATH **ret_dp); /* *ret_dp is newly allocated */
+EFI_STATUS device_path_to_str(const EFI_DEVICE_PATH *dp, char16_t **ret); /* on success *ret holds the text form */
+bool device_path_startswith(const EFI_DEVICE_PATH *dp, const EFI_DEVICE_PATH *start); /* NULL 'start' matches anything */
+EFI_DEVICE_PATH *device_path_replace_node( /* returns a newly allocated path; 'new_node' may be NULL */
+                const EFI_DEVICE_PATH *path, const EFI_DEVICE_PATH *node, const EFI_DEVICE_PATH *new_node);
+
+static inline EFI_DEVICE_PATH *device_path_next_node(const EFI_DEVICE_PATH *dp) {
+        assert(dp);
+        return (EFI_DEVICE_PATH *) &((uint8_t *) dp)[dp->Length]; /* skip dp->Length bytes from the node start */
+}
+
+static inline bool device_path_is_end(const EFI_DEVICE_PATH *dp) {
+        assert(dp); /* only the "end of entire path" sub-type counts, not other end-type nodes */
+        return dp->Type == END_DEVICE_PATH_TYPE ? dp->SubType == END_ENTIRE_DEVICE_PATH_SUBTYPE : false;
+}
+
+#define DEVICE_PATH_END_NODE /* node value for which device_path_is_end() is true */ \
+        (EFI_DEVICE_PATH) {                                \
+                .Type = END_DEVICE_PATH_TYPE,              \
+                .SubType = END_ENTIRE_DEVICE_PATH_SUBTYPE, \
+                .Length = sizeof(EFI_DEVICE_PATH)          \
+        }
diff --git a/src/boot/efi/devicetree.c b/src/boot/efi/devicetree.c
new file mode 100644
index 0000000..61a43cd
--- /dev/null
+++ b/src/boot/efi/devicetree.c
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "devicetree.h"
+#include "proto/dt-fixup.h"
+#include "util.h"
+
+#define FDT_V1_SIZE (7*4)
+
+static EFI_STATUS devicetree_allocate(struct devicetree_state *state, size_t size) {
+        EFI_STATUS err;
+        size_t n_pages;
+
+        assert(state);
+
+        /* Allocates enough whole EFI pages for 'size' bytes and records them in 'state' */
+        n_pages = DIV_ROUND_UP(size, EFI_PAGE_SIZE);
+        err = BS->AllocatePages(AllocateAnyPages, EfiACPIReclaimMemory, n_pages, &state->addr);
+        if (err == EFI_SUCCESS)
+                state->pages = n_pages; /* the page count is only recorded once the allocation worked */
+        return err;
+}
+
+static size_t devicetree_allocated(const struct devicetree_state *state) {
+        assert(state);
+        return EFI_PAGE_SIZE * state->pages; /* size in bytes of the pages recorded in 'state' */
+}
+
+static EFI_STATUS devicetree_fixup(struct devicetree_state *state, size_t len) {
+        EFI_PHYSICAL_ADDRESS prev_addr;
+        EFI_DT_FIXUP_PROTOCOL *fixup;
+        size_t prev_pages, needed;
+        EFI_STATUS err;
+
+        assert(state);
+
+        /* Lets the firmware's devicetree fixup protocol process the blob held by 'state'. 'len' is the
+         * number of blob bytes carried over if a bigger buffer is needed. No protocol: successful no-op. */
+        if (BS->LocateProtocol(MAKE_GUID_PTR(EFI_DT_FIXUP_PROTOCOL), NULL, (void **) &fixup) != EFI_SUCCESS)
+                return EFI_SUCCESS;
+
+        needed = devicetree_allocated(state);
+        err = fixup->Fixup(fixup, PHYSICAL_ADDRESS_TO_POINTER(state->addr), &needed,
+                           EFI_DT_APPLY_FIXUPS | EFI_DT_RESERVE_MEMORY);
+        if (err != EFI_BUFFER_TOO_SMALL)
+                return err;
+
+        /* Buffer too small: the value left in 'needed' becomes the new allocation size. Move the blob
+         * over, release the old pages and try exactly once more. */
+        prev_addr = state->addr;
+        prev_pages = state->pages;
+        err = devicetree_allocate(state, needed);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        memcpy(PHYSICAL_ADDRESS_TO_POINTER(state->addr), PHYSICAL_ADDRESS_TO_POINTER(prev_addr), len);
+        err = BS->FreePages(prev_addr, prev_pages);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        needed = devicetree_allocated(state);
+        return fixup->Fixup(fixup, PHYSICAL_ADDRESS_TO_POINTER(state->addr), &needed,
+                            EFI_DT_APPLY_FIXUPS | EFI_DT_RESERVE_MEMORY);
+}
+
+EFI_STATUS devicetree_install(struct devicetree_state *state, EFI_FILE *root_dir, char16_t *name) {
+        _cleanup_(file_closep) EFI_FILE *dtb_file = NULL;
+        _cleanup_free_ EFI_FILE_INFO *dtb_info = NULL;
+        size_t dtb_size;
+        EFI_STATUS err;
+
+        assert(state);
+        assert(root_dir);
+        assert(name);
+
+        /* Loads the file 'name' into freshly allocated pages, runs the firmware fixups on it and installs
+         * it as devicetree table. The table present before is remembered in state->orig first; NULL
+         * there is no error, it merely means the firmware had no devicetree initially. */
+        state->orig = find_configuration_table(MAKE_GUID_PTR(EFI_DTB_TABLE));
+
+        err = root_dir->Open(root_dir, &dtb_file, name, EFI_FILE_MODE_READ, EFI_FILE_READ_ONLY);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = get_file_info(dtb_file, &dtb_info, NULL);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (dtb_info->FileSize < FDT_V1_SIZE)
+                return EFI_INVALID_PARAMETER; /* smaller than FDT_V1_SIZE */
+        if (dtb_info->FileSize > 32 * 1024 * 1024)
+                return EFI_INVALID_PARAMETER; /* 32MB device tree blob doesn't seem right */
+        dtb_size = dtb_info->FileSize;
+
+        err = devicetree_allocate(state, dtb_size);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = dtb_file->Read(dtb_file, &dtb_size, PHYSICAL_ADDRESS_TO_POINTER(state->addr));
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = devicetree_fixup(state, dtb_size);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        return BS->InstallConfigurationTable(
+                        MAKE_GUID_PTR(EFI_DTB_TABLE), PHYSICAL_ADDRESS_TO_POINTER(state->addr));
+}
+
+EFI_STATUS devicetree_install_from_memory(
+                struct devicetree_state *state, const void *dtb_buffer, size_t dtb_length) {
+
+        EFI_STATUS status;
+
+        assert(state);
+        assert(dtb_buffer && dtb_length > 0);
+
+        /* Installs the devicetree blob found at dtb_buffer. The table present before is remembered in
+         * state->orig first; NULL there just means the firmware had no devicetree initially. */
+        state->orig = find_configuration_table(MAKE_GUID_PTR(EFI_DTB_TABLE));
+
+        status = devicetree_allocate(state, dtb_length);
+        if (status != EFI_SUCCESS)
+                return status;
+
+        /* Fixups and installation operate on our own copy of the caller's blob */
+        memcpy(PHYSICAL_ADDRESS_TO_POINTER(state->addr), dtb_buffer, dtb_length);
+
+        status = devicetree_fixup(state, dtb_length);
+        if (status != EFI_SUCCESS)
+                return status;
+
+        return BS->InstallConfigurationTable(
+                        MAKE_GUID_PTR(EFI_DTB_TABLE), PHYSICAL_ADDRESS_TO_POINTER(state->addr));
+}
+
+void devicetree_cleanup(struct devicetree_state *state) {
+        /* Undoes a devicetree installation: puts back the table saved in state->orig and releases our
+         * pages. Does nothing if 'state' holds no allocation. */
+        if (state->pages == 0)
+                return;
+
+        /* don't free the current device tree if we can't reinstate the old one */
+        if (BS->InstallConfigurationTable(MAKE_GUID_PTR(EFI_DTB_TABLE), state->orig) != EFI_SUCCESS)
+                return;
+
+        BS->FreePages(state->addr, state->pages);
+        /* with the page count at zero, a further call returns right at the top */
+        state->pages = 0;
+}
diff --git a/src/boot/efi/devicetree.h b/src/boot/efi/devicetree.h
new file mode 100644
index 0000000..33eaa22
--- /dev/null
+++ b/src/boot/efi/devicetree.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+struct devicetree_state {
+        EFI_PHYSICAL_ADDRESS addr; /* start of the pages holding our copy of the devicetree blob */
+        size_t pages;              /* number of pages allocated at addr; 0 means nothing to clean up */
+        void *orig;                /* devicetree table found before ours was installed (may be NULL) */
+};
+
+EFI_STATUS devicetree_install(struct devicetree_state *state, EFI_FILE *root_dir, char16_t *name); /* blob read from a file */
+EFI_STATUS devicetree_install_from_memory( /* blob copied from a memory buffer */
+                struct devicetree_state *state, const void *dtb_buffer, size_t dtb_length);
+void devicetree_cleanup(struct devicetree_state *state); /* reinstates 'orig', then frees the pages */
diff --git a/src/boot/efi/drivers.c b/src/boot/efi/drivers.c
new file mode 100644
index 0000000..0674557
--- /dev/null
+++ b/src/boot/efi/drivers.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "device-path-util.h"
+#include "drivers.h"
+#include "util.h"
+
+static EFI_STATUS load_one_driver(
+                EFI_HANDLE parent_image,
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image,
+                const char16_t *fname) {
+
+        _cleanup_(unload_imagep) EFI_HANDLE driver = NULL;
+        _cleanup_free_ EFI_DEVICE_PATH *driver_dp = NULL;
+        _cleanup_free_ char16_t *driver_path = NULL;
+        EFI_LOADED_IMAGE_PROTOCOL *driver_info;
+        EFI_STATUS err;
+
+        assert(parent_image);
+        assert(loaded_image);
+        assert(fname);
+
+        /* Loads and starts the driver 'fname' from the drivers directory on loaded_image's device. */
+        driver_path = xasprintf("\\EFI\\systemd\\drivers\\%ls", fname);
+        err = make_file_device_path(loaded_image->DeviceHandle, driver_path, &driver_dp);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error making file device path: %m");
+
+        err = BS->LoadImage(false, parent_image, driver_dp, NULL, 0, &driver);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to load image %ls: %m", fname);
+
+        err = BS->HandleProtocol(driver, MAKE_GUID_PTR(EFI_LOADED_IMAGE_PROTOCOL), (void **) &driver_info);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to find protocol in driver image %ls: %m", fname);
+
+        if (driver_info->ImageCodeType != EfiBootServicesCode &&
+            driver_info->ImageCodeType != EfiRuntimeServicesCode)
+                return log_error("Image %ls is not a driver, refusing.", fname);
+
+        err = BS->StartImage(driver, NULL, NULL);
+        if (err == EFI_SUCCESS) {
+                TAKE_PTR(driver); /* started fine: exempt the image from the unload_imagep cleanup */
+                return EFI_SUCCESS;
+        }
+        /* EFI_ABORTED signals an initializing driver: it reports success this way so that it is unloaded */
+        if (err != EFI_ABORTED)
+                log_error_status(err, "Failed to start image %ls: %m", fname);
+        return err;
+}
+
+EFI_STATUS reconnect_all_drivers(void) {
+        _cleanup_free_ EFI_HANDLE *all = NULL;
+        size_t count = 0;
+        EFI_STATUS err;
+
+        /* Calls ConnectController() on every handle the firmware lists, so that any loaded drivers can
+         * take effect. Only a failure to obtain the list of handles is reported. */
+        err = BS->LocateHandleBuffer(AllHandles, NULL, NULL, &count, &all);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to get list of handles: %m");
+
+        /* Per-handle failures are deliberately ignored: firmware may hand out bogus handles (or they may
+         * go bad while everything is being reconnected), and security policy may refuse the request.
+         * There is nothing we could realistically do about either. */
+        for (size_t idx = 0; idx < count; idx++)
+                (void) BS->ConnectController(all[idx], NULL, NULL, true);
+
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS load_drivers(
+                EFI_HANDLE parent_image,                 /* handed to LoadImage() as the parent image */
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image, /* its DeviceHandle is the device drivers are loaded from */
+                EFI_FILE *root_dir) {                    /* directory in which the drivers directory is opened */
+
+        _cleanup_(file_closep) EFI_FILE *drivers_dir = NULL;
+        _cleanup_free_ EFI_FILE_INFO *dirent = NULL;
+        size_t dirent_size = 0, n_succeeded = 0;
+        EFI_STATUS err;
+
+        /* Tries to load every file in the drivers directory whose name ends in EFI_MACHINE_TYPE_NAME
+         * ".efi". A missing directory and drivers that fail to load are not errors; failing to open or
+         * read the directory is. If any driver was started, all handles are reconnected afterwards. */
+        err = open_directory(root_dir, u"\\EFI\\systemd\\drivers", &drivers_dir);
+        if (err == EFI_NOT_FOUND)
+                return EFI_SUCCESS;
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to open \\EFI\\systemd\\drivers: %m");
+
+        for (;;) {
+                err = readdir(drivers_dir, &dirent, &dirent_size);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to read \\EFI\\systemd\\drivers: %m");
+                if (!dirent) /* End of directory */
+                        break;
+
+                if (dirent->FileName[0] == '.')
+                        continue;
+                if (FLAGS_SET(dirent->Attribute, EFI_FILE_DIRECTORY))
+                        continue;
+                if (!endswith_no_case(dirent->FileName, EFI_MACHINE_TYPE_NAME u".efi"))
+                        continue;
+
+                err = load_one_driver(parent_image, loaded_image, dirent->FileName);
+                if (err != EFI_SUCCESS) /* load_one_driver() does its own logging; go on with the next file */
+                        continue;
+
+                n_succeeded++;
+        }
+
+        if (n_succeeded > 0)
+                (void) reconnect_all_drivers();
+
+        return EFI_SUCCESS;
+}
diff --git a/src/boot/efi/drivers.h b/src/boot/efi/drivers.h
new file mode 100644
index 0000000..ecd0b4e
--- /dev/null
+++ b/src/boot/efi/drivers.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+#include "proto/loaded-image.h"
+
+EFI_STATUS reconnect_all_drivers(void); /* ConnectController() on every handle; per-handle errors are ignored */
+EFI_STATUS load_drivers( /* loads the matching .efi files from the drivers directory; a missing directory is fine */
+                EFI_HANDLE parent_image,
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image,
+                EFI_FILE *root_dir);
diff --git a/src/boot/efi/efi-string.c b/src/boot/efi/efi-string.c
new file mode 100644
index 0000000..4144c0d
--- /dev/null
+++ b/src/boot/efi/efi-string.c
@@ -0,0 +1,1084 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "efi-string.h"
+
+#if SD_BOOT
+#  include "proto/simple-text-io.h"
+#  include "util.h"
+#else
+#  include <stdlib.h>
+#  include "alloc-util.h"
+#  define xnew(t, n) ASSERT_SE_PTR(new(t, n))
+#  define xmalloc(n) ASSERT_SE_PTR(malloc(n))
+#endif
+
+/* String functions for both char and char16_t that should behave the same way as their respective
+ * counterpart in userspace. Where it makes sense, these accept NULL and do something sensible whereas
+ * userspace does not allow for this (strlen8(NULL) returns 0 like strlen_ptr(NULL) for example). To make it
+ * easier to tell in code which kind of string they work on, we use 8/16 suffixes. This also makes it easier
+ * to unit test them. */
+
+/* Generates a bounded length function: it counts the characters of s up to the first NUL, but never
+ * looks at more than n of them. A NULL string has length 0. */
+#define DEFINE_STRNLEN(type, name)                    \
+        size_t name(const type *s, size_t n) {        \
+                size_t count = 0;                     \
+                                                      \
+                if (s) {                              \
+                        while (count < n && s[count]) \
+                                count++;              \
+                }                                     \
+                                                      \
+                return count;                         \
+        }
+
+DEFINE_STRNLEN(char, strnlen8);
+DEFINE_STRNLEN(char16_t, strnlen16);
+
+#define TOLOWER(c) /* ASCII-only lowercasing; c is evaluated exactly once. */ \
+        ({                                                                    \
+                typeof(c) _ch = (c);                                          \
+                (_ch < 'A' || _ch > 'Z') ? _ch : _ch + ('a' - 'A');           \
+        })
+
+/* Generates name(), which lowercases the ASCII letters of s in place. A NULL string is simply
+ * left alone. */
+#define DEFINE_STRTOLOWER(type, name)           \
+        void name(type *s) {                    \
+                for (type *p = s; p && *p; p++) \
+                        *p = TOLOWER(*p);       \
+        }
+
+DEFINE_STRTOLOWER(char, strtolower8);
+DEFINE_STRTOLOWER(char16_t, strtolower16);
+
+/* Generates name(): compares at most n characters of s1 and s2, folding ASCII case first when tolower
+ * is true. If either pointer is NULL the result is CMP() of the two pointers. */
+#define DEFINE_STRNCASECMP(type, name, tolower)              \
+        int name(const type *s1, const type *s2, size_t n) { \
+                if (!s1 || !s2)                              \
+                        return CMP(s1, s2);                  \
+                                                             \
+                for (size_t i = 0; i < n; i++) {             \
+                        type ch1 = s1[i], ch2 = s2[i];       \
+                        if (tolower) {                       \
+                                ch1 = TOLOWER(ch1);          \
+                                ch2 = TOLOWER(ch2);          \
+                        }                                    \
+                                                             \
+                        /* Mismatch, or NUL reached. */      \
+                        if (ch1 != ch2 || !ch1)              \
+                                return CMP(ch1, ch2);        \
+                }                                            \
+                                                             \
+                return 0;                                    \
+        }
+
+DEFINE_STRNCASECMP(char, strncmp8, false);
+DEFINE_STRNCASECMP(char16_t, strncmp16, false);
+DEFINE_STRNCASECMP(char, strncasecmp8, true);
+DEFINE_STRNCASECMP(char16_t, strncasecmp16, true);
+
+/* Generates name(): copies src, including its terminator, to dest and returns dest. A NULL src is
+ * copied as if it were the empty string. dest must not be NULL (asserted) and must be large enough;
+ * no bounds are checked here. */
+#define DEFINE_STRCPY(type, name)                                     \
+        type *name(type * restrict dest, const type * restrict src) { \
+                type *out = ASSERT_PTR(dest);                         \
+                size_t i = 0;                                         \
+                                                                      \
+                /* A NULL src contributes no characters. */           \
+                if (src) {                                            \
+                        for (; src[i]; i++)                           \
+                                out[i] = src[i];                      \
+                }                                                     \
+                                                                      \
+                /* Always terminate the copy. */                      \
+                out[i] = '\0';                                        \
+                return out;                                           \
+        }
+
+DEFINE_STRCPY(char, strcpy8);
+DEFINE_STRCPY(char16_t, strcpy16);
+
+/* Generates name(): returns a pointer to the first occurrence of c in s, or NULL if there is none or
+ * s is NULL. Looking for NUL returns a pointer to the terminator. */
+#define DEFINE_STRCHR(type, name)                  \
+        type *name(const type *s, type c) {        \
+                if (!s)                            \
+                        return NULL;               \
+                for (;; s++) {                     \
+                        if (*s == c)               \
+                                return (type *) s; \
+                        if (!*s)                   \
+                                return NULL;       \
+                }                                  \
+        }
+
+DEFINE_STRCHR(char, strchr8);
+DEFINE_STRCHR(char16_t, strchr16);
+
+/* Generates name(): duplicates at most n characters of s into newly allocated memory and always
+ * NUL-terminates the copy. A NULL s yields NULL. */
+#define DEFINE_STRNDUP(type, name, len_func)                  \
+        type *name(const type *s, size_t n) {                 \
+                if (!s)                                       \
+                        return NULL;                          \
+                                                              \
+                size_t n_chars = len_func(s, n);              \
+                size_t n_bytes = n_chars * sizeof(type);      \
+                type *copy = xmalloc(n_bytes + sizeof(type)); \
+                if (n_bytes != 0)                             \
+                        memcpy(copy, s, n_bytes);             \
+                copy[n_chars] = '\0';                         \
+                return copy;                                  \
+        }
+
+DEFINE_STRNDUP(char, xstrndup8, strnlen8);
+DEFINE_STRNDUP(char16_t, xstrndup16, strnlen16);
+
+/* Decodes the UTF-8 sequence at the start of utf8, of which n bytes may be read, into *c and returns the
+ * sequence length its first byte announces. *c is UINT32_MAX if the first byte is invalid (1 is returned
+ * in that case), if the sequence is longer than n or if a continuation byte is malformed. */
+static unsigned utf8_to_unichar(const char *utf8, size_t n, char32_t *c) {
+        assert(utf8);
+        assert(c);
+
+        uint8_t lead = (uint8_t) utf8[0];
+        unsigned seq_len;
+        char32_t value;
+
+        /* The first byte determines the sequence length and carries the topmost payload bits. */
+        switch (lead) {
+        case 0x00 ... 0x7f:
+                /* Plain ASCII stands for itself. */
+                *c = lead;
+                return 1;
+        /* Multi-byte forms: length of the sequence, then the payload bits of the first byte. */
+        case 0xc0 ... 0xdf: seq_len = 2; value = lead & 0x1f; break;
+        case 0xe0 ... 0xef: seq_len = 3; value = lead & 0x0f; break;
+        case 0xf0 ... 0xf7: seq_len = 4; value = lead & 0x07; break;
+        case 0xf8 ... 0xfb: seq_len = 5; value = lead & 0x03; break;
+        case 0xfc ... 0xfd: seq_len = 6; value = lead & 0x01; break;
+        default:
+                /* A continuation byte in first position, or 0xfe/0xff. */
+                *c = UINT32_MAX;
+                return 1;
+        }
+
+        /* The input ends before the sequence does. */
+        if (seq_len > n) {
+                *c = UINT32_MAX;
+                return seq_len;
+        }
+
+        for (const char *p = utf8 + 1; p < utf8 + seq_len; p++) {
+                /* Every further byte has to look like 10xxxxxx and contributes six bits. */
+                if ((*p & 0xc0) != 0x80) {
+                        *c = UINT32_MAX;
+                        return seq_len;
+                }
+                value = (value << 6) | (*p & 0x3f);
+        }
+
+        *c = value;
+        return seq_len;
+}
+
+/* Converts at most n bytes of UTF-8 into a newly allocated, NUL-terminated UCS-2 string. Invalid or
+ * cut-off sequences and code points outside of UCS-2 (surrogates, anything above 0xffff) are skipped.
+ * Returns NULL if str8 is NULL or n is 0. */
+char16_t *xstrn8_to_16(const char *str8, size_t n) {
+        if (!str8 || n == 0)
+                return NULL;
+
+        /* Each input byte yields at most one output character, so n + 1 slots always suffice. */
+        char16_t *out = xnew(char16_t, n + 1);
+        size_t n_out = 0;
+
+        for (size_t left = n; left > 0 && *str8 != '\0';) {
+                char32_t cp;
+                size_t used = utf8_to_unichar(str8, left, &cp);
+
+                str8 += used;
+                left = LESS_BY(left, used);
+
+                /* UINT32_MAX (decoding failure) is filtered out here as well. */
+                if (cp <= 0xd7ffU || (cp >= 0xe000U && cp <= 0xffffU))
+                        out[n_out++] = cp;
+        }
+
+        out[n_out] = '\0';
+        return out;
+}
+
+/* Returns the remainder of s behind prefix, or NULL if s is NULL or does not begin with prefix. */
+char *startswith8(const char *s, const char *prefix) {
+        assert(prefix);
+
+        if (!s)
+                return NULL;
+
+        size_t prefix_len = strlen8(prefix);
+        /* Only the first prefix_len characters of s take part in the comparison. */
+        if (strneq8(s, prefix, prefix_len))
+                return (char*) s + prefix_len;
+
+        return NULL;
+}
+
+static bool efi_fnmatch_prefix(const char16_t *p, const char16_t *h, const char16_t **ret_p, const char16_t **ret_h) {
+        assert(p);
+        assert(h);
+        assert(ret_p);
+        assert(ret_h);
+        /* Matches h against p up to the first '*' in p; *ret_p and *ret_h are only written in that case. */
+        for (;; p++, h++)
+                switch (*p) {
+                case '\0':
+                        /* End of pattern. Check that haystack is now empty. */
+                        return *h == '\0';
+
+                case '\\':
+                        p++; /* Skip the backslash, the next pattern char is taken literally. */
+                        if (*p == '\0' || *p != *h)
+                                /* Trailing escape or no match. */
+                                return false;
+                        break;
+
+                case '?':
+                        if (*h == '\0')
+                                /* Early end of haystack. */
+                                return false;
+                        break;
+
+                case '*':
+                        /* Point ret_p at the remainder of the pattern. */
+                        while (*p == '*')
+                                p++;
+                        *ret_p = p;
+                        *ret_h = h;
+                        return true;
+
+                case '[':
+                        if (*h == '\0')
+                                /* Early end of haystack. */
+                                return false;
+                        /* first: no set member was looked at yet; match: some member or range covers *h. */
+                        bool first = true, can_range = true, match = false;
+                        for (;; first = false) {
+                                p++;
+                                if (*p == '\0')
+                                        return false;
+
+                                if (*p == '\\') {
+                                        p++;
+                                        if (*p == '\0')
+                                                return false;
+                                        if (*p == *h)
+                                                match = true;
+                                        can_range = true;
+                                        continue;
+                                }
+
+                                /* End of set unless it's the first char. */
+                                if (*p == ']' && !first)
+                                        break;
+
+                                /* Range pattern if '-' is not first or last in set. */
+                                if (*p == '-' && can_range && !first && *(p + 1) != ']') {
+                                        char16_t low = *(p - 1);
+                                        p++;
+                                        if (*p == '\\')
+                                                p++;
+                                        if (*p == '\0')
+                                                return false;
+
+                                        if (low <= *h && *h <= *p)
+                                                match = true;
+
+                                        /* Ranges cannot be chained: [a-c-f] == [-abcf] */
+                                        can_range = false;
+                                        continue;
+                                }
+
+                                if (*p == *h)
+                                        match = true;
+                                can_range = true;
+                        }
+                        /* p is left on the closing ']', which the outer loop steps over. */
+                        if (!match)
+                                return false;
+                        break;
+
+                default:
+                        if (*p != *h)
+                                /* Single char mismatch. */
+                                return false;
+                }
+}
+
+/* Patterns are fnmatch-compatible (with reduced feature support). */
+bool efi_fnmatch(const char16_t *pattern, const char16_t *haystack) {
+        /* Patterns can be considered as simple patterns (without '*') concatenated by '*'. By doing so we
+         * simply have to make sure the very first simple pattern matches the start of haystack. Then we just
+         * look for the remaining simple patterns *somewhere* within the haystack (in order) as any extra
+         * characters in between would be matched by the '*'. We then only have to ensure that the very last
+         * simple pattern matches at the actual end of the haystack.
+         *
+         * This means we do not need to use backtracking which could have catastrophic runtimes with the
+         * right input data. */
+
+        const char16_t *rest_p = NULL, *rest_h = NULL;
+
+        /* The initial simple pattern is anchored: it has to match right at the start. */
+        if (!efi_fnmatch_prefix(pattern, haystack, &rest_p, &rest_h))
+                return false;
+        if (!rest_p)
+                /* No '*' was in pattern, so the whole haystack was matched already. */
+                return true;
+
+        pattern = rest_p;
+        haystack = rest_h;
+
+        for (;;) {
+                rest_p = rest_h = NULL;
+                bool match = efi_fnmatch_prefix(pattern, haystack, &rest_p, &rest_h);
+                if (rest_p) {
+                        /* Ran into the next '*': carry on with what follows it. */
+                        assert(match);
+                        pattern = rest_p;
+                        haystack = rest_h;
+                } else if (match || *haystack == '\0')
+                        /* A match here is a match of the whole haystack (the prefix matcher compares
+                         * the terminating NULs); otherwise we lose once haystack is used up. */
+                        return match;
+                else
+                        /* Let the '*' swallow one more character and retry. */
+                        haystack++;
+        }
+}
+
+/* Generates name(): parses the unsigned decimal number at the start of s into *ret_u. Returns false,
+ * without touching the outputs, if s is NULL, does not start with a digit or the value overflows
+ * uint64_t. If ret_tail is NULL the number must span the whole string. */
+#define DEFINE_PARSE_NUMBER(type, name)                                    \
+        bool name(const type *s, uint64_t *ret_u, const type **ret_tail) { \
+                assert(ret_u);                                             \
+                                                                           \
+                /* Need at least one digit. */                             \
+                if (!s || *s < '0' || *s > '9')                            \
+                        return false;                                      \
+                                                                           \
+                uint64_t acc = 0;                                          \
+                for (; *s >= '0' && *s <= '9'; s++) {                      \
+                        /* Shift in the digit, minding overflow. */        \
+                        if (__builtin_mul_overflow(acc, 10, &acc) ||       \
+                            __builtin_add_overflow(acc, *s - '0', &acc))   \
+                                return false;                              \
+                }                                                          \
+                                                                           \
+                /* Without ret_tail the string must end here. */           \
+                if (ret_tail)                                              \
+                        *ret_tail = s;                                     \
+                else if (*s != '\0')                                       \
+                        return false;                                      \
+                                                                           \
+                *ret_u = acc;                                              \
+                return true;                                               \
+        }
+
+DEFINE_PARSE_NUMBER(char, parse_number8);
+DEFINE_PARSE_NUMBER(char16_t, parse_number16);
+
+/* Stores the boolean spelled by v in *ret; returns false, with *ret untouched, for NULL or unknown text. */
+bool parse_boolean(const char *v, bool *ret) {
+        /* Alternating list: entries at even positions mean true, the ones at odd positions false. */
+        static const char * const words[] = {
+                "1", "0", "yes", "no", "y", "n", "true", "false", "t", "f", "on", "off",
+        };
+
+        assert(ret);
+
+        if (!v)
+                return false;
+
+        for (size_t i = 0; i < ELEMENTSOF(words); i++)
+                if (streq8(v, words[i])) {
+                        *ret = i % 2 == 0;
+                        return true;
+                }
+
+        return false;
+}
+
+char *line_get_key_value(char *s, const char *sep, size_t *pos, char **ret_key, char **ret_value) {
+        char *line, *value;
+        size_t linelen;
+        /* Finds the next "key<sep>value" line in s, starting at offset *pos, and splits it in place. */
+        assert(s);
+        assert(sep);
+        assert(pos);
+        assert(ret_key);
+        assert(ret_value);
+        /* Returns NULL at the end of s; empty lines, '#' comments and lines without sep are skipped. */
+        for (;;) {
+                line = s + *pos;
+                if (*line == '\0')
+                        return NULL;
+
+                linelen = 0;
+                while (line[linelen] && !strchr8("\n\r", line[linelen]))
+                        linelen++;
+
+                /* move pos to next line */
+                *pos += linelen;
+                if (s[*pos])
+                        (*pos)++;
+
+                /* empty line */
+                if (linelen == 0)
+                        continue;
+
+                /* terminate line */
+                line[linelen] = '\0';
+
+                /* remove leading whitespace */
+                while (linelen > 0 && strchr8(" \t", *line)) {
+                        line++;
+                        linelen--;
+                }
+
+                /* remove trailing whitespace */
+                while (linelen > 0 && strchr8(" \t", line[linelen - 1]))
+                        linelen--;
+                line[linelen] = '\0';
+
+                if (*line == '#')
+                        continue;
+
+                /* split key/value */
+                value = line;
+                while (*value && !strchr8(sep, *value))
+                        value++;
+                if (*value == '\0')
+                        continue;
+                *value = '\0';
+                value++;
+                while (*value && strchr8(sep, *value))
+                        value++;
+
+                /* unquote */
+                if (value[0] == '"' && line[linelen - 1] == '"') {
+                        value++;
+                        line[linelen - 1] = '\0';
+                }
+
+                *ret_key = line;
+                *ret_value = value;
+                return line; /* Same pointer as *ret_key. */
+        }
+}
+
+/* Returns a newly allocated, NUL-terminated string with two lowercase hex digits per byte of data. */
+char16_t *hexdump(const void *data, size_t size) {
+        static const char nibbles[16] = "0123456789abcdef";
+
+        assert(data || size == 0);
+
+        char16_t *out = xnew(char16_t, size * 2 + 1), *w = out;
+        for (size_t i = 0; i < size; i++) {
+                uint8_t byte = ((const uint8_t *) data)[i];
+                *w++ = nibbles[byte >> 4];
+                *w++ = nibbles[byte & 0x0F];
+        }
+
+        *w = 0;
+        return out;
+}
+
+static const char * const warn_table[] = { /* Indexed directly by status code (success and warnings). */
+        [EFI_SUCCESS]               = "Success",
+        [EFI_WARN_UNKNOWN_GLYPH]    = "Unknown glyph",
+        [EFI_WARN_DELETE_FAILURE]   = "Delete failure",
+        [EFI_WARN_WRITE_FAILURE]    = "Write failure",
+        [EFI_WARN_BUFFER_TOO_SMALL] = "Buffer too small",
+        [EFI_WARN_STALE_DATA]       = "Stale data",
+        [EFI_WARN_FILE_SYSTEM]      = "File system",
+        [EFI_WARN_RESET_REQUIRED]   = "Reset required",
+};
+
+/* Errors have MSB set, remove it to keep the table compact. */
+#define NOERR(err) ((err) & ~EFI_ERROR_MASK)
+
+static const char * const err_table[] = { /* Indexed by NOERR(status), see status_to_string(). */
+        [NOERR(EFI_ERROR_MASK)]           = "Error",
+        [NOERR(EFI_LOAD_ERROR)]           = "Load error",
+        [NOERR(EFI_INVALID_PARAMETER)]    = "Invalid parameter",
+        [NOERR(EFI_UNSUPPORTED)]          = "Unsupported",
+        [NOERR(EFI_BAD_BUFFER_SIZE)]      = "Bad buffer size",
+        [NOERR(EFI_BUFFER_TOO_SMALL)]     = "Buffer too small",
+        [NOERR(EFI_NOT_READY)]            = "Not ready",
+        [NOERR(EFI_DEVICE_ERROR)]         = "Device error",
+        [NOERR(EFI_WRITE_PROTECTED)]      = "Write protected",
+        [NOERR(EFI_OUT_OF_RESOURCES)]     = "Out of resources",
+        [NOERR(EFI_VOLUME_CORRUPTED)]     = "Volume corrupt",
+        [NOERR(EFI_VOLUME_FULL)]          = "Volume full",
+        [NOERR(EFI_NO_MEDIA)]             = "No media",
+        [NOERR(EFI_MEDIA_CHANGED)]        = "Media changed",
+        [NOERR(EFI_NOT_FOUND)]            = "Not found",
+        [NOERR(EFI_ACCESS_DENIED)]        = "Access denied",
+        [NOERR(EFI_NO_RESPONSE)]          = "No response",
+        [NOERR(EFI_NO_MAPPING)]           = "No mapping",
+        [NOERR(EFI_TIMEOUT)]              = "Time out",
+        [NOERR(EFI_NOT_STARTED)]          = "Not started",
+        [NOERR(EFI_ALREADY_STARTED)]      = "Already started",
+        [NOERR(EFI_ABORTED)]              = "Aborted",
+        [NOERR(EFI_ICMP_ERROR)]           = "ICMP error",
+        [NOERR(EFI_TFTP_ERROR)]           = "TFTP error",
+        [NOERR(EFI_PROTOCOL_ERROR)]       = "Protocol error",
+        [NOERR(EFI_INCOMPATIBLE_VERSION)] = "Incompatible version",
+        [NOERR(EFI_SECURITY_VIOLATION)]   = "Security violation",
+        [NOERR(EFI_CRC_ERROR)]            = "CRC error",
+        [NOERR(EFI_END_OF_MEDIA)]         = "End of media",
+        [NOERR(EFI_ERROR_RESERVED_29)]    = "Reserved (29)",
+        [NOERR(EFI_ERROR_RESERVED_30)]    = "Reserved (30)",
+        [NOERR(EFI_END_OF_FILE)]          = "End of file",
+        [NOERR(EFI_INVALID_LANGUAGE)]     = "Invalid language",
+        [NOERR(EFI_COMPROMISED_DATA)]     = "Compromised data",
+        [NOERR(EFI_IP_ADDRESS_CONFLICT)]  = "IP address conflict",
+        [NOERR(EFI_HTTP_ERROR)]           = "HTTP error",
+};
+
+static const char *status_to_string(EFI_STATUS status) {
+        /* Success and warnings index warn_table as they are, errors go without their error bit. */
+        if (status < ELEMENTSOF(warn_table))
+                return warn_table[status];
+        bool known = status >= EFI_ERROR_MASK && status <= ((ELEMENTSOF(err_table) - 1) | EFI_ERROR_MASK);
+        return known ? err_table[NOERR(status)] : NULL;
+}
+
+typedef struct {
+        size_t padded_len; /* Field width in printf. */
+        size_t len;        /* Precision in printf. */
+        bool pad_zero;         /* '0' flag. */
+        bool align_left;       /* '-' flag, or a negative field width. */
+        bool alternative_form; /* '#' flag. */
+        bool long_arg;         /* 'l' length modifier (or an equally sized 'z', 'j' or 't'). */
+        bool longlong_arg;     /* 'll' length modifier (or an equally sized 'z', 'j' or 't'). */
+        bool have_field_width; /* Set when '.' is seen: numbers parsed afterwards are the precision. */
+
+        const char *str;     /* Narrow string to print, if any. */
+        const wchar_t *wstr; /* Wide string to print, if any. */
+
+        /* For numbers. */
+        bool is_signed;
+        bool lowercase; /* Use lowercase hex digits (and "0x" rather than "0X"). */
+        int8_t base;    /* 10 or 16. */
+        char sign_pad; /* For + and (space) flags. */
+} SpecifierContext;
+
+typedef struct {
+        char16_t stack_buf[128]; /* We use stack_buf first to avoid allocations in most cases. */
+        char16_t *dyn_buf;       /* Allocated buf or NULL if stack_buf is used. */
+        char16_t *buf;           /* Points to the current active buf. */
+        size_t n_buf;            /* Len of buf (in char16_t's, not bytes!). */
+        size_t n;                /* Used len of buf (in char16_t's). This is always <n_buf. */
+
+        /* NOTE(review): the text from "<n_buf" up to the first statement of grow_buf() had been lost
+         * (a "<...>" span was stripped from this patch). It is restored here to match upstream;
+         * `status` in particular is not referenced in the nearby code, please confirm. */
+        EFI_STATUS status;
+        const char *format;
+        va_list ap;
+} FormatContext;
+
+/* Makes sure that `need` more characters can be appended to ctx->buf: nothing happens while
+ * ctx->n + need stays below ctx->n_buf, otherwise the contents move to a new heap buffer of twice the
+ * required size (or exactly the required size should doubling overflow). */
+static void grow_buf(FormatContext *ctx, size_t need) {
+        assert(ctx);
+
+        assert_se(!__builtin_add_overflow(ctx->n, need, &need));
+
+        if (need < ctx->n_buf)
+                return;
+
+        /* Greedily allocate if we can. */
+        if (__builtin_mul_overflow(need, 2, &ctx->n_buf))
+                ctx->n_buf = need;
+
+        /* We cannot use realloc here as ctx->buf may be ctx->stack_buf, which we cannot free. */
+        char16_t *new_buf = xnew(char16_t, ctx->n_buf);
+        memcpy(new_buf, ctx->buf, ctx->n * sizeof(*ctx->buf));
+
+        free(ctx->dyn_buf);
+        ctx->buf = ctx->dyn_buf = new_buf;
+}
+
+/* Appends len copies of pad. No bounds are checked here, the callers grow the buffer beforehand. */
+static void push_padding(FormatContext *ctx, char pad, size_t len) {
+        assert(ctx);
+
+        for (size_t i = 0; i < len; i++)
+                ctx->buf[ctx->n++] = pad;
+}
+
+/* Emits sp->len characters of sp->str or sp->wstr, space-padded up to the field width on the left, or
+ * on the right if sp->align_left is set. Always returns true. */
+static bool push_str(FormatContext *ctx, SpecifierContext *sp) {
+        assert(ctx);
+        assert(sp);
+
+        /* From now on padded_len is only the amount of padding. */
+        sp->padded_len = LESS_BY(sp->padded_len, sp->len);
+        grow_buf(ctx, sp->padded_len + sp->len);
+
+        push_padding(ctx, ' ', sp->align_left ? 0 : sp->padded_len);
+
+        if (!sp->wstr || sizeof(wchar_t) != sizeof(char16_t)) {
+                /* In userspace unit tests we cannot just memcpy() the wide string. */
+                assert(sp->str || sp->wstr);
+                for (size_t i = 0; i < sp->len; i++)
+                        ctx->buf[ctx->n++] = sp->str ? sp->str[i] : sp->wstr[i];
+        } else {
+                memcpy(ctx->buf + ctx->n, sp->wstr, sp->len * sizeof(*sp->wstr));
+                ctx->n += sp->len;
+        }
+
+        push_padding(ctx, ' ', sp->align_left ? sp->padded_len : 0);
+
+        assert(ctx->n < ctx->n_buf);
+        return true;
+}
+
+static bool push_num(FormatContext *ctx, SpecifierContext *sp, uint64_t u) {
+        const char *digits = sp->lowercase ? "0123456789abcdef" : "0123456789ABCDEF";
+        char16_t tmp[32]; /* Digits, least significant first. */
+        size_t n = 0;
+        /* Appends u to the output as a base 10/16 number with sign, "0x" prefix, precision and padding. */
+        assert(ctx);
+        assert(sp);
+        assert(IN_SET(sp->base, 10, 16));
+
+        /* "%.0u" prints nothing if value is 0. */
+        if (u == 0 && sp->len == 0)
+                return true;
+
+        if (sp->is_signed && (int64_t) u < 0) {
+                /* We cannot just do "u = -(int64_t)u" here because -INT64_MIN overflows. */
+                /* So split off the last digit first; the remaining quotient is non-negative. */
+                uint64_t rem = -((int64_t) u % sp->base);
+                u = (int64_t) u / -sp->base;
+                tmp[n++] = digits[rem];
+                sp->sign_pad = '-';
+        }
+
+        while (u > 0 || n == 0) {
+                uint64_t rem = u % sp->base;
+                u /= sp->base;
+                tmp[n++] = digits[rem];
+        }
+
+        /* Note that numbers never get truncated! */
+        size_t prefix = (sp->sign_pad != 0 ? 1 : 0) + (sp->alternative_form ? 2 : 0);
+        size_t number_len = prefix + MAX(n, sp->len);
+        grow_buf(ctx, MAX(sp->padded_len, number_len));
+
+        size_t padding = 0;
+        if (sp->pad_zero)
+                /* Leading zeroes go after the sign or 0x prefix. */
+                number_len = MAX(number_len, sp->padded_len);
+        else
+                padding = LESS_BY(sp->padded_len, number_len);
+
+        if (!sp->align_left)
+                push_padding(ctx, ' ', padding);
+
+        if (sp->sign_pad != 0)
+                ctx->buf[ctx->n++] = sp->sign_pad;
+        if (sp->alternative_form) {
+                ctx->buf[ctx->n++] = '0';
+                ctx->buf[ctx->n++] = sp->lowercase ? 'x' : 'X';
+        }
+
+        push_padding(ctx, '0', LESS_BY(number_len, n + prefix));
+        /* tmp holds the digits in reverse, emit them back to front. */
+        while (n > 0)
+                ctx->buf[ctx->n++] = tmp[--n];
+
+        if (sp->align_left)
+                push_padding(ctx, ' ', padding);
+
+        assert(ctx->n < ctx->n_buf);
+        return true;
+}
+
+/* This helps unit testing. NULLSTR is what %p prints for a NULL pointer. */
+#if SD_BOOT
+#  define NULLSTR "(null)"
+#  define wcsnlen strnlen16 /* Used for %ls below. */
+#else
+#  define NULLSTR "(nil)"
+#endif
+
+static bool handle_format_specifier(FormatContext *ctx, SpecifierContext *sp) {
+        /* Parses one item from the format specifier in ctx and put the info into sp. If we are done with
+         * this specifier returns true, otherwise this function should be called again. */
+
+        /* This implementation assumes 32-bit ints. Also note that all types smaller than int are promoted to
+         * int in vararg functions, which is why we fetch only ints for any such types. The compiler would
+         * otherwise warn about fetching smaller types. */
+        assert_cc(sizeof(int) == 4);
+        assert_cc(sizeof(wchar_t) <= sizeof(int));
+        assert_cc(sizeof(intmax_t) <= sizeof(long long));
+
+        assert(ctx);
+        assert(sp);
+
+        switch (*ctx->format) {
+        case '#':
+                sp->alternative_form = true;
+                return false;
+        case '.':
+                sp->have_field_width = true;
+                return false;
+        case '-':
+                sp->align_left = true;
+                return false;
+        case '+':
+        case ' ':
+                sp->sign_pad = *ctx->format;
+                return false;
+
+        case '0':
+                if (!sp->have_field_width) {
+                        sp->pad_zero = true;
+                        return false;
+                }
+
+                /* If field width has already been provided then 0 is part of precision (%.0s). */
+                _fallthrough_;
+
+        case '*':
+        case '1' ... '9': {
+                int64_t i;
+
+                if (*ctx->format == '*')
+                        i = va_arg(ctx->ap, int);
+                else {
+                        uint64_t u;
+                        if (!parse_number8(ctx->format, &u, &ctx->format) || u > INT_MAX)
+                                assert_not_reached();
+                        ctx->format--; /* Point it back to the last digit. */
+                        i = u;
+                }
+
+                if (sp->have_field_width) {
+                        /* Negative precision is ignored. */
+                        if (i >= 0)
+                                sp->len = (size_t) i;
+                } else {
+                        /* Negative field width is treated as positive field width with '-' flag. */
+                        if (i < 0) {
+                                i *= -1;
+                                sp->align_left = true;
+                        }
+                        sp->padded_len = i;
+                }
+
+                return false;
+        }
+
+        case 'h':
+                if (*(ctx->format + 1) == 'h')
+                        ctx->format++;
+                /* char/short gets promoted to int, nothing to do here. */
+                return false;
+
+        case 'l':
+                if (*(ctx->format + 1) == 'l') {
+                        ctx->format++;
+                        sp->longlong_arg = true;
+                } else
+                        sp->long_arg = true;
+                return false;
+
+        case 'z':
+                sp->long_arg = sizeof(size_t) == sizeof(long);
+                sp->longlong_arg = !sp->long_arg && sizeof(size_t) == sizeof(long long);
+                return false;
+
+        case 'j':
+                sp->long_arg = sizeof(intmax_t) == sizeof(long);
+                sp->longlong_arg = !sp->long_arg && sizeof(intmax_t) == sizeof(long long);
+                return false;
+
+        case 't':
+                sp->long_arg = sizeof(ptrdiff_t) == sizeof(long);
+                sp->longlong_arg = !sp->long_arg && sizeof(ptrdiff_t) == sizeof(long long);
+                return false;
+
+        case '%':
+                sp->str = "%";
+                sp->len = 1;
+                return push_str(ctx, sp);
+
+        case 'c':
+                sp->wstr = &(wchar_t){ va_arg(ctx->ap, int) };
+                sp->len = 1;
+                return push_str(ctx, sp);
+
+        case 's':
+                if (sp->long_arg) {
+                        sp->wstr = va_arg(ctx->ap, const wchar_t *) ?: L"(null)";
+                        sp->len = wcsnlen(sp->wstr, sp->len);
+                } else {
+                        sp->str = va_arg(ctx->ap, const char *) ?: "(null)";
+                        sp->len = strnlen8(sp->str, sp->len);
+                }
+                return push_str(ctx, sp);
+
+        case 'd':
+        case 'i':
+        case 'u':
+        case 'x':
+        case 'X':
+                sp->lowercase = *ctx->format == 'x';
+                sp->is_signed = IN_SET(*ctx->format, 'd', 'i');
+                sp->base = IN_SET(*ctx->format, 'x', 'X') ? 16 : 10;
+                if (sp->len == SIZE_MAX)
+                        sp->len = 1;
+
+                uint64_t v;
+                if (sp->longlong_arg)
+                        v = sp->is_signed ? (uint64_t) va_arg(ctx->ap, long long) :
+                                            va_arg(ctx->ap, unsigned long long);
+                else if (sp->long_arg)
+                        v = sp->is_signed ? (uint64_t) va_arg(ctx->ap, long) : va_arg(ctx->ap, unsigned long);
+                else
+                        v = sp->is_signed ? (uint64_t) va_arg(ctx->ap, int) : va_arg(ctx->ap, unsigned);
+
+                return push_num(ctx, sp, v);
+
+        case 'p': {
+                const void *ptr = va_arg(ctx->ap, const void *);
+                if (!ptr) {
+                        sp->str = NULLSTR;
+                        sp->len = STRLEN(NULLSTR);
+                        return push_str(ctx, sp);
+                }
+
+                sp->base = 16;
+                sp->lowercase = true;
+                sp->alternative_form = true;
+                sp->len = 0; /* Precision is ignored for %p. */
+                return push_num(ctx, sp, (uintptr_t) ptr);
+        }
+
+        case 'm': {
+                sp->str = status_to_string(ctx->status);
+                if (sp->str) {
+                        sp->len = strlen8(sp->str);
+                        return push_str(ctx, sp);
+                }
+
+                sp->base = 16;
+                sp->lowercase = true;
+                sp->alternative_form = true;
+                sp->len = 0;
+                return push_num(ctx, sp, ctx->status);
+        }
+
+        default:
+                assert_not_reached();
+        }
+}
+
+/* printf_internal is largely compatible with userspace vasprintf. Any omitted features should trigger asserts.
+ *
+ * Supported:
+ *  - Flags: #, 0, +, -, space
+ *  - Lengths: h, hh, l, ll, z, j, t
+ *  - Specifiers: %, c, s, u, i, d, x, X, p, m
+ *  - Precision and width (inline or as int arg using *)
+ *
+ * Notable differences:
+ *  - Passing NULL to %s is permitted and will print "(null)"
+ *  - %p will also use "(null)"
+ *  - The provided EFI_STATUS is used for %m instead of errno
+ *  - "\n" is translated to "\r\n" */
+_printf_(2, 0) static char16_t *printf_internal(EFI_STATUS status, const char *format, va_list ap, bool ret) {
+        assert(format);
+
+        FormatContext ctx = {
+                .buf = ctx.stack_buf, /* Output starts out in the context's own stack_buf. */
+                .n_buf = ELEMENTSOF(ctx.stack_buf),
+                .format = format,
+                .status = status, /* Consumed by %m. */
+        };
+
+        /* We cannot put this into the struct without making a copy. */
+        va_copy(ctx.ap, ap);
+
+        while (*ctx.format != '\0') {
+                SpecifierContext sp = { .len = SIZE_MAX }; /* SIZE_MAX: no precision given (yet). */
+
+                switch (*ctx.format) {
+                case '%':
+                        ctx.format++; /* Skip the '%' itself. */
+                        while (!handle_format_specifier(&ctx, &sp)) /* false: more of this specifier follows */
+                                ctx.format++;
+                        ctx.format++; /* Step past the character that completed the specifier. */
+                        break;
+                case '\n':
+                        ctx.format++;
+                        sp.str = "\r\n"; /* A bare newline is emitted as CRLF. */
+                        sp.len = 2;
+                        push_str(&ctx, &sp);
+                        break;
+                default:
+                        sp.str = ctx.format++; /* Literal run: extends up to the next '%', '\n' or NUL. */
+                        while (!IN_SET(*ctx.format, '%', '\n', '\0'))
+                                ctx.format++;
+                        sp.len = ctx.format - sp.str;
+                        push_str(&ctx, &sp);
+                }
+        }
+
+        va_end(ctx.ap);
+
+        assert(ctx.n < ctx.n_buf);
+        ctx.buf[ctx.n++] = '\0'; /* From here on ctx.n counts the terminating NUL as well. */
+
+        if (ret) {
+                if (ctx.dyn_buf)
+                        return TAKE_PTR(ctx.dyn_buf); /* Pass the dynamic buffer on to the caller. */
+
+                char16_t *ret_buf = xnew(char16_t, ctx.n); /* No dynamic buffer in use: return a copy. */
+                memcpy(ret_buf, ctx.buf, ctx.n * sizeof(*ctx.buf));
+                return ret_buf;
+        }
+
+#if SD_BOOT
+        ST->ConOut->OutputString(ST->ConOut, ctx.buf); /* Console output exists in SD_BOOT builds only. */
+#endif
+
+        return mfree(ctx.dyn_buf);
+}
+
+void printf_status(EFI_STATUS status, const char *format, ...) {
+        va_list args; /* The variadic arguments that follow format. */
+        va_start(args, format);
+        (void) printf_internal(status, format, args, /* ret= */ false);
+        va_end(args);
+}
+
+void vprintf_status(EFI_STATUS status, const char *format, va_list ap) {
+        printf_internal(status, format, ap, false); /* ret=false: output the text instead of returning it. */
+}
+
+char16_t *xasprintf_status(EFI_STATUS status, const char *format, ...) {
+        va_list args;
+        va_start(args, format);
+        char16_t *formatted = printf_internal(status, format, args, /* ret= */ true);
+        va_end(args);
+        return formatted;
+}
+
+char16_t *xvasprintf_status(EFI_STATUS status, const char *format, va_list ap) {
+        return printf_internal(status, format, ap, true); /* ret=true: the formatted string is returned. */
+}
+
+#if SD_BOOT
+/* To provide the actual implementation for these we need to remove the redirection to the builtins. */
+#  undef memchr
+#  undef memcmp
+#  undef memcpy
+#  undef memset
+_used_ void *memchr(const void *p, int c, size_t n);
+_used_ int memcmp(const void *p1, const void *p2, size_t n);
+_used_ void *memcpy(void * restrict dest, const void * restrict src, size_t n);
+_used_ void *memset(void *p, int c, size_t n);
+#else
+/* And for userspace unit testing we need to give them an efi_ prefix. */
+#  define memchr efi_memchr
+#  define memcmp efi_memcmp
+#  define memcpy efi_memcpy
+#  define memset efi_memset
+#endif
+
+void *memchr(const void *p, int c, size_t n) {
+        if (!p) /* A NULL buffer never matches; n == 0 simply skips the loop below. */
+                return NULL;
+
+        const uint8_t needle = (unsigned char) c;
+        for (const uint8_t *cur = p; n > 0; cur++, n--)
+                if (*cur == needle)
+                        return (void *) cur;
+
+        return NULL;
+}
+
+int memcmp(const void *p1, const void *p2, size_t n) {
+        /* Never dereference a NULL buffer; in that case the two pointers themselves are compared. */
+        if (!p1 || !p2)
+                return CMP(p1, p2);
+
+        const uint8_t *lhs = p1;
+        const uint8_t *rhs = p2;
+
+        /* Walk both buffers byte by byte and stop at the first position where they differ. */
+        for (size_t i = 0; i < n; i++) {
+                int order = CMP(lhs[i], rhs[i]);
+
+                if (order != 0)
+                        return order;
+        }
+
+        /* All n bytes compared equal (trivially so for n == 0). */
+        return 0;
+}
+
+void *memcpy(void * restrict dest, const void * restrict src, size_t n) {
+        /* NULL buffers and zero-length copies are accepted and treated as no-ops; dest is returned
+         * unchanged either way. */
+        if (n == 0 || !src || !dest)
+                return dest;
+
+#if SD_BOOT
+        /* Prefer the firmware's CopyMem(), which is likely optimized and which the UEFI spec guarantees
+         * to exist. The boot services pointer is still checked first, in case the compiler emits a
+         * memcpy call before that pointer is available. */
+        if (_likely_(BS)) {
+                BS->CopyMem(dest, (void *) src, n);
+                return dest;
+        }
+#endif
+
+        /* Plain byte-wise fallback, taken in builds without SD_BOOT and whenever the boot services
+         * pointer is not set. */
+        const uint8_t *in = src;
+        uint8_t *out = dest;
+
+        for (; n > 0; n--)
+                *out++ = *in++;
+
+        return dest;
+}
+
+void *memset(void *p, int c, size_t n) {
+        /* A NULL buffer or a zero length makes this a no-op. */
+        if (n == 0 || !p)
+                return p;
+
+#if SD_BOOT
+        /* Same reasoning as for memcpy: prefer the firmware routine whenever boot services are set.
+         * Beware that SetMem() takes the size before the fill value! */
+        if (_likely_(BS)) {
+                BS->SetMem(p, n, c);
+                return p;
+        }
+#endif
+
+        /* Byte-wise fallback; c is truncated to a single byte on assignment. */
+        uint8_t *cur = p;
+        for (; n > 0; n--)
+                *cur++ = c;
+
+        return p;
+}
diff --git a/src/boot/efi/efi-string.h b/src/boot/efi/efi-string.h
new file mode 100644
index 0000000..9ac919f
--- /dev/null
+++ b/src/boot/efi/efi-string.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+#include "macro-fundamental.h"
+
+size_t strnlen8(const char *s, size_t n);
+size_t strnlen16(const char16_t *s, size_t n);
+
+static inline size_t strlen8(const char *s) {
+        return strnlen8(s, SIZE_MAX); /* SIZE_MAX: effectively no upper bound on the length */
+}
+
+static inline size_t strlen16(const char16_t *s) {
+        return strnlen16(s, SIZE_MAX); /* SIZE_MAX: effectively no upper bound on the length */
+}
+
+static inline size_t strsize8(const char *s) {
+        return s ? (strlen8(s) + 1) * sizeof(*s) : 0; /* bytes incl. one terminator element; 0 for NULL */
+}
+
+static inline size_t strsize16(const char16_t *s) {
+        return s ? (strlen16(s) + 1) * sizeof(*s) : 0; /* bytes incl. one terminator element; 0 for NULL */
+}
+
+void strtolower8(char *s);
+void strtolower16(char16_t *s);
+
+int strncmp8(const char *s1, const char *s2, size_t n);
+int strncmp16(const char16_t *s1, const char16_t *s2, size_t n);
+int strncasecmp8(const char *s1, const char *s2, size_t n);
+int strncasecmp16(const char16_t *s1, const char16_t *s2, size_t n);
+
+static inline int strcmp8(const char *s1, const char *s2) {
+        return strncmp8(s1, s2, SIZE_MAX); /* strncmp8() without a practical length limit */
+}
+
+static inline int strcmp16(const char16_t *s1, const char16_t *s2) {
+        return strncmp16(s1, s2, SIZE_MAX); /* strncmp16() without a practical length limit */
+}
+
+static inline int strcasecmp8(const char *s1, const char *s2) {
+        return strncasecmp8(s1, s2, SIZE_MAX); /* strncasecmp8() without a practical length limit */
+}
+
+static inline int strcasecmp16(const char16_t *s1, const char16_t *s2) {
+        return strncasecmp16(s1, s2, SIZE_MAX); /* strncasecmp16() without a practical length limit */
+}
+
+static inline bool strneq8(const char *s1, const char *s2, size_t n) {
+        return strncmp8(s1, s2, n) == 0; /* true iff strncmp8() finds no difference within n */
+}
+
+static inline bool strneq16(const char16_t *s1, const char16_t *s2, size_t n) {
+        return strncmp16(s1, s2, n) == 0; /* true iff strncmp16() finds no difference within n */
+}
+
+static inline bool streq8(const char *s1, const char *s2) {
+        return strcmp8(s1, s2) == 0; /* true iff strcmp8() reports the strings as equal */
+}
+
+static inline bool streq16(const char16_t *s1, const char16_t *s2) {
+        return strcmp16(s1, s2) == 0; /* true iff strcmp16() reports the strings as equal */
+}
+
+static inline bool strncaseeq8(const char *s1, const char *s2, size_t n) {
+        return strncasecmp8(s1, s2, n) == 0; /* true iff strncasecmp8() finds no difference within n */
+}
+
+static inline bool strncaseeq16(const char16_t *s1, const char16_t *s2, size_t n) {
+        return strncasecmp16(s1, s2, n) == 0; /* true iff strncasecmp16() finds no difference within n */
+}
+
+static inline bool strcaseeq8(const char *s1, const char *s2) {
+        return strcasecmp8(s1, s2) == 0; /* true iff strcasecmp8() reports the strings as equal */
+}
+
+static inline bool strcaseeq16(const char16_t *s1, const char16_t *s2) {
+        return strcasecmp16(s1, s2) == 0; /* true iff strcasecmp16() reports the strings as equal */
+}
+
+char *strcpy8(char * restrict dest, const char * restrict src);
+char16_t *strcpy16(char16_t * restrict dest, const char16_t * restrict src);
+
+char *strchr8(const char *s, char c);
+char16_t *strchr16(const char16_t *s, char16_t c);
+
+char *xstrndup8(const char *s, size_t n);
+char16_t *xstrndup16(const char16_t *s, size_t n);
+
+static inline char *xstrdup8(const char *s) {
+        return xstrndup8(s, SIZE_MAX); /* xstrndup8() without a practical length cap */
+}
+
+static inline char16_t *xstrdup16(const char16_t *s) {
+        return xstrndup16(s, SIZE_MAX); /* xstrndup16() without a practical length cap */
+}
+
+char16_t *xstrn8_to_16(const char *str8, size_t n);
+static inline char16_t *xstr8_to_16(const char *str8) {
+        return xstrn8_to_16(str8, strlen8(str8)); /* convert the string over its full strlen8() length */
+}
+
+char *startswith8(const char *s, const char *prefix);
+
+bool efi_fnmatch(const char16_t *pattern, const char16_t *haystack);
+
+bool parse_number8(const char *s, uint64_t *ret_u, const char **ret_tail);
+bool parse_number16(const char16_t *s, uint64_t *ret_u, const char16_t **ret_tail);
+
+bool parse_boolean(const char *v, bool *ret);
+
+char *line_get_key_value(char *s, const char *sep, size_t *pos, char **ret_key, char **ret_value);
+
+char16_t *hexdump(const void *data, size_t size);
+
+#ifdef __clang__
+#  define _gnu_printf_(a, b) _printf_(a, b)
+#else
+#  define _gnu_printf_(a, b) __attribute__((format(gnu_printf, a, b)))
+#endif
+
+_gnu_printf_(2, 3) void printf_status(EFI_STATUS status, const char *format, ...);
+_gnu_printf_(2, 0) void vprintf_status(EFI_STATUS status, const char *format, va_list ap);
+_gnu_printf_(2, 3) _warn_unused_result_ char16_t *xasprintf_status(EFI_STATUS status, const char *format, ...);
+_gnu_printf_(2, 0) _warn_unused_result_ char16_t *xvasprintf_status(EFI_STATUS status, const char *format, va_list ap);
+
+#if SD_BOOT
+#  define printf(...) printf_status(EFI_SUCCESS, __VA_ARGS__)
+#  define xasprintf(...) xasprintf_status(EFI_SUCCESS, __VA_ARGS__)
+
+/* inttypes.h is provided by libc instead of the compiler and is not supposed to be used in freestanding
+ * environments. We could use clang __*_FMT*__ constants for this, but gcc does not have them. :( */
+
+#  if defined(__ILP32__) || defined(__arm__) || defined(__i386__)
+#    define PRI64_PREFIX "ll"
+#  elif defined(__LP64__)
+#    define PRI64_PREFIX "l"
+#  elif defined(__LLP64__) || (__SIZEOF_LONG__ == 4 && __SIZEOF_POINTER__ == 8)
+#    define PRI64_PREFIX "ll"
+#  else
+#    error Unknown 64-bit data model
+#  endif
+
+#  define PRIi32 "i"
+#  define PRIu32 "u"
+#  define PRIx32 "x"
+#  define PRIX32 "X"
+#  define PRIiPTR "zi"
+#  define PRIuPTR "zu"
+#  define PRIxPTR "zx"
+#  define PRIXPTR "zX"
+#  define PRIi64 PRI64_PREFIX "i"
+#  define PRIu64 PRI64_PREFIX "u"
+#  define PRIx64 PRI64_PREFIX "x"
+#  define PRIX64 PRI64_PREFIX "X"
+
+/* The compiler normally has knowledge about standard functions such as memcmp, but this is not the case when
+ * compiling with -ffreestanding. By referring to builtins, the compiler can check arguments and do
+ * optimizations again. Note that we still need to provide implementations as the compiler is free to not
+ * inline its own implementation and instead issue a library call. */
+#  define memchr __builtin_memchr
+#  define memcmp __builtin_memcmp
+#  define memcpy __builtin_memcpy
+#  define memset __builtin_memset
+
+static inline void *mempcpy(void * restrict dest, const void * restrict src, size_t n) {
+        bool skip = n == 0 || !src || !dest; /* Nothing to copy: dest is handed back unadvanced. */
+        if (!skip)
+                memcpy(dest, src, n);
+        return skip ? dest : (uint8_t *) dest + n;
+}
+
+#else
+/* For unit testing. */
+void *efi_memchr(const void *p, int c, size_t n);
+int efi_memcmp(const void *p1, const void *p2, size_t n);
+void *efi_memcpy(void * restrict dest, const void * restrict src, size_t n);
+void *efi_memset(void *p, int c, size_t n);
+#endif
diff --git a/src/boot/efi/efi.h b/src/boot/efi/efi.h
new file mode 100644
index 0000000..fbc5d10
--- /dev/null
+++ b/src/boot/efi/efi.h
@@ -0,0 +1,459 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include "macro-fundamental.h"
+
+#if SD_BOOT
+/* uchar.h/wchar.h are not suitable for freestanding environments. */
+typedef __WCHAR_TYPE__ wchar_t;
+typedef __CHAR16_TYPE__ char16_t;
+typedef __CHAR32_TYPE__ char32_t;
+
+/* Let's be paranoid and do some sanity checks. */
+assert_cc(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__);
+assert_cc(__STDC_HOSTED__ == 0);
+assert_cc(sizeof(bool) == 1);
+assert_cc(sizeof(uint8_t) == 1);
+assert_cc(sizeof(uint16_t) == 2);
+assert_cc(sizeof(uint32_t) == 4);
+assert_cc(sizeof(uint64_t) == 8);
+assert_cc(sizeof(wchar_t) == 2);
+assert_cc(sizeof(char16_t) == 2);
+assert_cc(sizeof(char32_t) == 4);
+assert_cc(sizeof(size_t) == sizeof(void *));
+assert_cc(sizeof(size_t) == sizeof(uintptr_t));
+assert_cc(alignof(bool) == 1);
+assert_cc(alignof(uint8_t) == 1);
+assert_cc(alignof(uint16_t) == 2);
+assert_cc(alignof(uint32_t) == 4);
+assert_cc(alignof(uint64_t) == 8);
+assert_cc(alignof(wchar_t) == 2);
+assert_cc(alignof(char16_t) == 2);
+assert_cc(alignof(char32_t) == 4);
+
+#  if defined(__x86_64__) && defined(__ILP32__)
+#    error Building for x64 requires -m64 on x32 ABI.
+#  endif
+#else
+#  include 
+#  include 
+#endif
+
+/* We use size_t/ssize_t to represent UEFI UINTN/INTN. */
+typedef size_t EFI_STATUS;
+typedef intptr_t ssize_t;
+
+typedef void* EFI_HANDLE;
+typedef void* EFI_EVENT;
+typedef size_t EFI_TPL;
+typedef uint64_t EFI_LBA;
+typedef uint64_t EFI_PHYSICAL_ADDRESS;
+
+#if defined(__x86_64__) && !defined(__ILP32__)
+#  define EFIAPI __attribute__((ms_abi))
+#else
+#  define EFIAPI
+#endif
+
+#if __SIZEOF_POINTER__ == 8
+#  define EFI_ERROR_MASK 0x8000000000000000ULL
+#elif __SIZEOF_POINTER__ == 4
+#  define EFI_ERROR_MASK 0x80000000ULL
+#else
+#  error Unsupported pointer size
+#endif
+
+#define EFIWARN(s) ((EFI_STATUS) (s)) /* Warning codes: EFI_ERROR_MASK bit left clear. */
+#define EFIERR(s) ((EFI_STATUS) ((s) | EFI_ERROR_MASK)) /* Error codes: EFI_ERROR_MASK bit set. */
+
+#define EFI_SUCCESS               EFIWARN(0)
+#define EFI_WARN_UNKNOWN_GLYPH    EFIWARN(1)
+#define EFI_WARN_DELETE_FAILURE   EFIWARN(2)
+#define EFI_WARN_WRITE_FAILURE    EFIWARN(3)
+#define EFI_WARN_BUFFER_TOO_SMALL EFIWARN(4)
+#define EFI_WARN_STALE_DATA       EFIWARN(5)
+#define EFI_WARN_FILE_SYSTEM      EFIWARN(6)
+#define EFI_WARN_RESET_REQUIRED   EFIWARN(7)
+
+#define EFI_LOAD_ERROR           EFIERR(1)
+#define EFI_INVALID_PARAMETER    EFIERR(2)
+#define EFI_UNSUPPORTED          EFIERR(3)
+#define EFI_BAD_BUFFER_SIZE      EFIERR(4)
+#define EFI_BUFFER_TOO_SMALL     EFIERR(5)
+#define EFI_NOT_READY            EFIERR(6)
+#define EFI_DEVICE_ERROR         EFIERR(7)
+#define EFI_WRITE_PROTECTED      EFIERR(8)
+#define EFI_OUT_OF_RESOURCES     EFIERR(9)
+#define EFI_VOLUME_CORRUPTED     EFIERR(10)
+#define EFI_VOLUME_FULL          EFIERR(11)
+#define EFI_NO_MEDIA             EFIERR(12)
+#define EFI_MEDIA_CHANGED        EFIERR(13)
+#define EFI_NOT_FOUND            EFIERR(14)
+#define EFI_ACCESS_DENIED        EFIERR(15)
+#define EFI_NO_RESPONSE          EFIERR(16)
+#define EFI_NO_MAPPING           EFIERR(17)
+#define EFI_TIMEOUT              EFIERR(18)
+#define EFI_NOT_STARTED          EFIERR(19)
+#define EFI_ALREADY_STARTED      EFIERR(20)
+#define EFI_ABORTED              EFIERR(21)
+#define EFI_ICMP_ERROR           EFIERR(22)
+#define EFI_TFTP_ERROR           EFIERR(23)
+#define EFI_PROTOCOL_ERROR       EFIERR(24)
+#define EFI_INCOMPATIBLE_VERSION EFIERR(25)
+#define EFI_SECURITY_VIOLATION   EFIERR(26)
+#define EFI_CRC_ERROR            EFIERR(27)
+#define EFI_END_OF_MEDIA         EFIERR(28)
+#define EFI_ERROR_RESERVED_29    EFIERR(29)
+#define EFI_ERROR_RESERVED_30    EFIERR(30)
+#define EFI_END_OF_FILE          EFIERR(31)
+#define EFI_INVALID_LANGUAGE     EFIERR(32)
+#define EFI_COMPROMISED_DATA     EFIERR(33)
+#define EFI_IP_ADDRESS_CONFLICT  EFIERR(34)
+#define EFI_HTTP_ERROR           EFIERR(35)
+
+typedef struct {
+        uint32_t Data1;
+        uint16_t Data2;
+        uint16_t Data3;
+        uint8_t Data4[8];
+} EFI_GUID;
+
+#define GUID_DEF(d1, d2, d3, d4_1, d4_2, d4_3, d4_4, d4_5, d4_6, d4_7, d4_8) \
+    { d1, d2, d3, { d4_1, d4_2, d4_3, d4_4, d4_5, d4_6, d4_7, d4_8 } }
+
+/* Creates a EFI_GUID pointer suitable for EFI APIs. Use of const allows the compiler to merge multiple
+ * uses (although, currently compilers do that regardless). Most EFI APIs declare their EFI_GUID input
+ * as non-const, but almost all of them are in fact const. */
+#define MAKE_GUID_PTR(name) ((EFI_GUID *) &(const EFI_GUID) name##_GUID)
+
+/* These allow MAKE_GUID_PTR() to work without requiring an extra _GUID in the passed name. We want to
+ * keep the GUID definitions in line with the UEFI spec. */
+#define EFI_GLOBAL_VARIABLE_GUID EFI_GLOBAL_VARIABLE
+#define EFI_FILE_INFO_GUID EFI_FILE_INFO_ID
+
+#define EFI_GLOBAL_VARIABLE \
+        GUID_DEF(0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c)
+#define EFI_IMAGE_SECURITY_DATABASE_GUID \
+        GUID_DEF(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
+
+#define EVT_TIMER                         0x80000000U
+#define EVT_RUNTIME                       0x40000000U
+#define EVT_NOTIFY_WAIT                   0x00000100U
+#define EVT_NOTIFY_SIGNAL                 0x00000200U
+#define EVT_SIGNAL_EXIT_BOOT_SERVICES     0x00000201U
+#define EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE 0x60000202U
+
+#define EFI_OPEN_PROTOCOL_BY_HANDLE_PROTOCOL  0x01U
+#define EFI_OPEN_PROTOCOL_GET_PROTOCOL        0x02U
+#define EFI_OPEN_PROTOCOL_TEST_PROTOCOL       0x04U
+#define EFI_OPEN_PROTOCOL_BY_CHILD_CONTROLLER 0x08U
+#define EFI_OPEN_PROTOCOL_BY_DRIVER           0x10U
+#define EFI_OPEN_PROTOCOL_EXCLUSIVE           0x20U
+
+#define EFI_VARIABLE_NON_VOLATILE                          0x01U
+#define EFI_VARIABLE_BOOTSERVICE_ACCESS                    0x02U
+#define EFI_VARIABLE_RUNTIME_ACCESS                        0x04U
+#define EFI_VARIABLE_HARDWARE_ERROR_RECORD                 0x08U
+#define EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS            0x10U
+#define EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS 0x20U
+#define EFI_VARIABLE_APPEND_WRITE                          0x40U
+#define EFI_VARIABLE_ENHANCED_AUTHENTICATED_ACCESS         0x80U
+
+#define EFI_TIME_ADJUST_DAYLIGHT 0x001U
+#define EFI_TIME_IN_DAYLIGHT     0x002U
+#define EFI_UNSPECIFIED_TIMEZONE 0x7FFU
+
+#define EFI_OS_INDICATIONS_BOOT_TO_FW_UI                   0x01U
+#define EFI_OS_INDICATIONS_TIMESTAMP_REVOCATION            0x02U
+#define EFI_OS_INDICATIONS_FILE_CAPSULE_DELIVERY_SUPPORTED 0x04U
+#define EFI_OS_INDICATIONS_FMP_CAPSULE_SUPPORTED           0x08U
+#define EFI_OS_INDICATIONS_CAPSULE_RESULT_VAR_SUPPORTED    0x10U
+#define EFI_OS_INDICATIONS_START_OS_RECOVERY               0x20U
+#define EFI_OS_INDICATIONS_START_PLATFORM_RECOVERY         0x40U
+#define EFI_OS_INDICATIONS_JSON_CONFIG_DATA_REFRESH        0x80U
+
+#define EFI_PAGE_SIZE 4096U
+#define EFI_SIZE_TO_PAGES(s) (((s) + 0xFFFU) >> 12U)
+
+/* These are common enough to warrant forward declaration. We also give them a
+ * shorter name for convenience. */
+typedef struct EFI_FILE_PROTOCOL EFI_FILE;
+typedef struct EFI_DEVICE_PATH_PROTOCOL EFI_DEVICE_PATH;
+
+typedef struct EFI_SIMPLE_TEXT_INPUT_PROTOCOL EFI_SIMPLE_TEXT_INPUT_PROTOCOL;
+typedef struct EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL;
+
+typedef enum {
+        TimerCancel,
+        TimerPeriodic,
+        TimerRelative,
+} EFI_TIMER_DELAY;
+
+typedef enum {
+        AllocateAnyPages,
+        AllocateMaxAddress,
+        AllocateAddress,
+        MaxAllocateType,
+} EFI_ALLOCATE_TYPE;
+
+typedef enum {
+        EfiReservedMemoryType,
+        EfiLoaderCode,
+        EfiLoaderData,
+        EfiBootServicesCode,
+        EfiBootServicesData,
+        EfiRuntimeServicesCode,
+        EfiRuntimeServicesData,
+        EfiConventionalMemory,
+        EfiUnusableMemory,
+        EfiACPIReclaimMemory,
+        EfiACPIMemoryNVS,
+        EfiMemoryMappedIO,
+        EfiMemoryMappedIOPortSpace,
+        EfiPalCode,
+        EfiPersistentMemory,
+        EfiUnacceptedMemoryType,
+        EfiMaxMemoryType,
+} EFI_MEMORY_TYPE;
+
+typedef enum {
+        AllHandles,
+        ByRegisterNotify,
+        ByProtocol,
+} EFI_LOCATE_SEARCH_TYPE;
+
+typedef enum {
+        EfiResetCold,
+        EfiResetWarm,
+        EfiResetShutdown,
+        EfiResetPlatformSpecific,
+} EFI_RESET_TYPE;
+
+typedef struct {
+        uint16_t Year;
+        uint8_t Month;
+        uint8_t Day;
+        uint8_t Hour;
+        uint8_t Minute;
+        uint8_t Second;
+        uint8_t Pad1;
+        uint32_t Nanosecond;
+        int16_t TimeZone;
+        uint8_t Daylight;
+        uint8_t Pad2;
+} EFI_TIME;
+
+typedef struct {
+        uint32_t Resolution;
+        uint32_t Accuracy;
+        bool SetsToZero;
+} EFI_TIME_CAPABILITIES;
+
+typedef struct {
+        uint64_t Signature;
+        uint32_t Revision;
+        uint32_t HeaderSize;
+        uint32_t CRC32;
+        uint32_t Reserved;
+} EFI_TABLE_HEADER;
+
+typedef struct {
+        EFI_TABLE_HEADER Hdr;
+        void *RaiseTPL;
+        void *RestoreTPL;
+        EFI_STATUS (EFIAPI *AllocatePages)(
+                        EFI_ALLOCATE_TYPE Type,
+                        EFI_MEMORY_TYPE MemoryType,
+                        size_t Pages,
+                        EFI_PHYSICAL_ADDRESS *Memory);
+        EFI_STATUS (EFIAPI *FreePages)(
+                        EFI_PHYSICAL_ADDRESS Memory,
+                        size_t Pages);
+        void *GetMemoryMap;
+        EFI_STATUS (EFIAPI *AllocatePool)(
+                        EFI_MEMORY_TYPE PoolType,
+                        size_t Size,
+                        void **Buffer);
+        EFI_STATUS (EFIAPI *FreePool)(void *Buffer);
+        EFI_STATUS (EFIAPI *CreateEvent)(
+                        uint32_t Type,
+                        EFI_TPL NotifyTpl,
+                        void *NotifyFunction,
+                        void *NotifyContext,
+                        EFI_EVENT *Event);
+        EFI_STATUS (EFIAPI *SetTimer)(
+                        EFI_EVENT Event,
+                        EFI_TIMER_DELAY Type,
+                        uint64_t TriggerTime);
+        EFI_STATUS (EFIAPI *WaitForEvent)(
+                        size_t NumberOfEvents,
+                        EFI_EVENT *Event,
+                        size_t *Index);
+        void *SignalEvent;
+        EFI_STATUS (EFIAPI *CloseEvent)(EFI_EVENT Event);
+        EFI_STATUS (EFIAPI *CheckEvent)(EFI_EVENT Event);
+        void *InstallProtocolInterface;
+        EFI_STATUS (EFIAPI *ReinstallProtocolInterface)(
+                        EFI_HANDLE Handle,
+                        EFI_GUID *Protocol,
+                        void *OldInterface,
+                        void *NewInterface);
+        void *UninstallProtocolInterface;
+        EFI_STATUS (EFIAPI *HandleProtocol)(
+                        EFI_HANDLE Handle,
+                        EFI_GUID *Protocol,
+                        void **Interface);
+        void *Reserved;
+        void *RegisterProtocolNotify;
+        EFI_STATUS (EFIAPI *LocateHandle)(
+                        EFI_LOCATE_SEARCH_TYPE SearchType,
+                        EFI_GUID *Protocol,
+                        void *SearchKey,
+                        size_t *BufferSize,
+                        EFI_HANDLE *Buffer);
+        EFI_STATUS (EFIAPI *LocateDevicePath)(
+                        EFI_GUID *Protocol,
+                        EFI_DEVICE_PATH **DevicePath,
+                        EFI_HANDLE *Device);
+        EFI_STATUS (EFIAPI *InstallConfigurationTable)(
+                        EFI_GUID *Guid,
+                        void *Table);
+        EFI_STATUS (EFIAPI *LoadImage)(
+                        bool BootPolicy,
+                        EFI_HANDLE ParentImageHandle,
+                        EFI_DEVICE_PATH *DevicePath,
+                        void *SourceBuffer,
+                        size_t SourceSize,
+                        EFI_HANDLE *ImageHandle);
+        EFI_STATUS (EFIAPI *StartImage)(
+                        EFI_HANDLE ImageHandle,
+                        size_t *ExitDataSize,
+                        char16_t **ExitData);
+        EFI_STATUS (EFIAPI *Exit)(
+                        EFI_HANDLE ImageHandle,
+                        EFI_STATUS ExitStatus,
+                        size_t ExitDataSize,
+                        char16_t *ExitData);
+        EFI_STATUS (EFIAPI *UnloadImage)(EFI_HANDLE ImageHandle);
+        void *ExitBootServices;
+        EFI_STATUS (EFIAPI *GetNextMonotonicCount)(uint64_t *Count);
+        EFI_STATUS (EFIAPI *Stall)(size_t Microseconds);
+        EFI_STATUS (EFIAPI *SetWatchdogTimer)(
+                        size_t Timeout,
+                        uint64_t WatchdogCode,
+                        size_t DataSize,
+                        char16_t *WatchdogData);
+        EFI_STATUS (EFIAPI *ConnectController)(
+                        EFI_HANDLE ControllerHandle,
+                        EFI_HANDLE *DriverImageHandle,
+                        EFI_DEVICE_PATH *RemainingDevicePath,
+                        bool Recursive);
+        EFI_STATUS (EFIAPI *DisconnectController)(
+                        EFI_HANDLE ControllerHandle,
+                        EFI_HANDLE DriverImageHandle,
+                        EFI_HANDLE ChildHandle);
+        EFI_STATUS (EFIAPI *OpenProtocol)(
+                        EFI_HANDLE Handle,
+                        EFI_GUID *Protocol,
+                        void **Interface,
+                        EFI_HANDLE AgentHandle,
+                        EFI_HANDLE ControllerHandle,
+                        uint32_t Attributes);
+        EFI_STATUS (EFIAPI *CloseProtocol)(
+                        EFI_HANDLE Handle,
+                        EFI_GUID *Protocol,
+                        EFI_HANDLE AgentHandle,
+                        EFI_HANDLE ControllerHandle);
+        void *OpenProtocolInformation;
+        EFI_STATUS (EFIAPI *ProtocolsPerHandle)(
+                        EFI_HANDLE Handle,
+                        EFI_GUID ***ProtocolBuffer,
+                        size_t *ProtocolBufferCount);
+        EFI_STATUS (EFIAPI *LocateHandleBuffer)(
+                        EFI_LOCATE_SEARCH_TYPE SearchType,
+                        EFI_GUID *Protocol,
+                        void *SearchKey,
+                        size_t *NoHandles,
+                        EFI_HANDLE **Buffer);
+        EFI_STATUS (EFIAPI *LocateProtocol)(
+                        EFI_GUID *Protocol,
+                        void *Registration,
+                        void **Interface);
+        EFI_STATUS (EFIAPI *InstallMultipleProtocolInterfaces)(EFI_HANDLE *Handle, ...);
+        EFI_STATUS (EFIAPI *UninstallMultipleProtocolInterfaces)(EFI_HANDLE Handle, ...);
+        EFI_STATUS (EFIAPI *CalculateCrc32)(
+                        void *Data,
+                        size_t DataSize,
+                        uint32_t *Crc32);
+        void (EFIAPI *CopyMem)(
+                        void *Destination,
+                        void *Source,
+                        size_t Length);
+        void (EFIAPI *SetMem)(
+                        void *Buffer,
+                        size_t Size,
+                        uint8_t Value);
+        void *CreateEventEx;
+} EFI_BOOT_SERVICES;
+
+typedef struct {
+        EFI_TABLE_HEADER Hdr;
+        EFI_STATUS (EFIAPI *GetTime)(
+                        EFI_TIME *Time,
+                        EFI_TIME_CAPABILITIES *Capabilities);
+        EFI_STATUS (EFIAPI *SetTime)(EFI_TIME *Time);
+        void *GetWakeupTime;
+        void *SetWakeupTime;
+        void *SetVirtualAddressMap;
+        void *ConvertPointer;
+        EFI_STATUS (EFIAPI *GetVariable)(
+                        char16_t *VariableName,
+                        EFI_GUID *VendorGuid,
+                        uint32_t *Attributes,
+                        size_t *DataSize,
+                        void *Data);
+        void *GetNextVariableName;
+        EFI_STATUS (EFIAPI *SetVariable)(
+                        char16_t *VariableName,
+                        EFI_GUID *VendorGuid,
+                        uint32_t Attributes,
+                        size_t  DataSize,
+                        void *Data);
+        EFI_STATUS (EFIAPI *GetNextHighMonotonicCount)(uint32_t *HighCount);
+        void (EFIAPI *ResetSystem)(
+                        EFI_RESET_TYPE ResetType,
+                        EFI_STATUS ResetStatus,
+                        size_t DataSize,
+                        void *ResetData);
+        void *UpdateCapsule;
+        void *QueryCapsuleCapabilities;
+        void *QueryVariableInfo;
+} EFI_RUNTIME_SERVICES;
+
+typedef struct {
+        EFI_TABLE_HEADER Hdr;
+        char16_t *FirmwareVendor;
+        uint32_t FirmwareRevision;
+        EFI_HANDLE ConsoleInHandle;
+        EFI_SIMPLE_TEXT_INPUT_PROTOCOL *ConIn;
+        EFI_HANDLE ConsoleOutHandle;
+        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *ConOut;
+        EFI_HANDLE StandardErrorHandle;
+        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *StdErr;
+        EFI_RUNTIME_SERVICES *RuntimeServices;
+        EFI_BOOT_SERVICES *BootServices;
+        size_t NumberOfTableEntries;
+        struct {
+                EFI_GUID VendorGuid;
+                void *VendorTable;
+        } *ConfigurationTable;
+} EFI_SYSTEM_TABLE;
+
+extern EFI_SYSTEM_TABLE *ST;
+extern EFI_BOOT_SERVICES *BS;
+extern EFI_RUNTIME_SERVICES *RT;
diff --git a/src/boot/efi/fuzz-bcd.c b/src/boot/efi/fuzz-bcd.c
new file mode 100644
index 0000000..cb5be7a
--- /dev/null
+++ b/src/boot/efi/fuzz-bcd.c
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bcd.h"
+#include "fuzz.h"
+#include "utf8.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        /* Runs get_bcd_title() on a duplicate of the fuzzer input and walks whatever title it returns. */
+
+        /* This limit was borrowed from src/boot/efi/boot.c */
+        if (outside_size_range(size, 0, 100*1024))
+                return 0;
+
+        fuzz_setup_logging();
+
+        _cleanup_free_ void *copy = memdup(data, size);
+        assert_se(copy);
+
+        /* If we get something, it must be NUL-terminated, but an empty string is still valid! */
+        char16_t *bcd_title = get_bcd_title(copy, size);
+        DO_NOT_OPTIMIZE(bcd_title && char16_strlen(bcd_title));
+        return 0;
+}
diff --git a/src/boot/efi/fuzz-efi-osrel.c b/src/boot/efi/fuzz-efi-osrel.c
new file mode 100644
index 0000000..1a5a9bc
--- /dev/null
+++ b/src/boot/efi/fuzz-efi-osrel.c
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "efi-string.h"
+#include "fuzz.h"
+
+#define SEP_LEN 4
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        /* data[SEP_LEN] must be NUL so that data itself is a C string usable as the second argument below. */
+        if (outside_size_range(size, SEP_LEN + 1, 64 * 1024) || data[SEP_LEN] != '\0')
+                return 0;
+
+        fuzz_setup_logging();
+
+        _cleanup_free_ char *text = memdup_suffix0(data + SEP_LEN + 1, size - SEP_LEN - 1);
+        assert_se(text);
+
+        /* Whenever line_get_key_value() reports success, both output pointers have to be set. */
+        size_t offset = 0;
+        char *key, *value;
+        while (line_get_key_value(text, (const char *) data, &offset, &key, &value)) {
+                assert_se(key);
+                assert_se(value);
+        }
+
+        return 0;
+}
diff --git a/src/boot/efi/fuzz-efi-printf.c b/src/boot/efi/fuzz-efi-printf.c
new file mode 100644
index 0000000..6dee830
--- /dev/null
+++ b/src/boot/efi/fuzz-efi-printf.c
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "efi-string.h"
+#include "fuzz.h"
+#include "utf8.h"
+
+typedef struct { /* the fuzzer input is reinterpreted as this record by LLVMFuzzerTestOneInput() */
+        EFI_STATUS status; /* first argument of every PRINTF_ONE() call */
+        int16_t field_width; /* supplied for each '*' field width */
+        int16_t precision; /* supplied for the '.*' precision of the numeric and %m conversions */
+        const void *ptr;
+        char c;
+        unsigned char uchar;
+        signed char schar;
+        unsigned short ushort;
+        signed short sshort;
+        unsigned int uint;
+        signed int sint;
+        unsigned long ulong;
+        signed long slong;
+        unsigned long long ulonglong;
+        signed long long slonglong;
+        size_t size;
+        ssize_t ssize;
+        intmax_t intmax;
+        uintmax_t uintmax;
+        ptrdiff_t ptrdiff;
+        char str[]; /* everything after the fixed-size part; used as the %s / %ls string data */
+} Input;
+
+#define PRINTF_ONE(...)                                                        \
+        ({                                                                     \
+                _cleanup_free_ char16_t *_ret = xasprintf_status(__VA_ARGS__); \
+                DO_NOT_OPTIMIZE(_ret);                                         \
+        })
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        if (outside_size_range(size, sizeof(Input), 1024 * 1024))
+                return 0;
+
+        const Input *i = (const Input *) data; /* the size check above guarantees a full Input is present */
+        size_t len = size - offsetof(Input, str); /* number of input bytes from the str member onwards */
+
+        fuzz_setup_logging();
+
+        PRINTF_ONE(i->status, "%*.*s", i->field_width, (int) len, i->str); /* precision caps the read at len bytes */
+        PRINTF_ONE(i->status, "%*.*ls", i->field_width, (int) (len / sizeof(wchar_t)), (const wchar_t *) i->str); /* same bytes, counted in wchar_t units */
+
+        PRINTF_ONE(i->status, "%% %*.*m", i->field_width, i->precision);
+        PRINTF_ONE(i->status, "%*p", i->field_width, i->ptr);
+        PRINTF_ONE(i->status, "%*c %12340c %56789c", i->field_width, i->c, i->c, i->c);
+
+        PRINTF_ONE(i->status, "%*.*hhu", i->field_width, i->precision, i->uchar);
+        PRINTF_ONE(i->status, "%*.*hhi", i->field_width, i->precision, i->schar);
+        PRINTF_ONE(i->status, "%*.*hu", i->field_width, i->precision, i->ushort);
+        PRINTF_ONE(i->status, "%*.*hi", i->field_width, i->precision, i->sshort);
+        PRINTF_ONE(i->status, "%*.*u", i->field_width, i->precision, i->uint);
+        PRINTF_ONE(i->status, "%*.*i", i->field_width, i->precision, i->sint);
+        PRINTF_ONE(i->status, "%*.*lu", i->field_width, i->precision, i->ulong);
+        PRINTF_ONE(i->status, "%*.*li", i->field_width, i->precision, i->slong);
+        PRINTF_ONE(i->status, "%*.*llu", i->field_width, i->precision, i->ulonglong);
+        PRINTF_ONE(i->status, "%*.*lli", i->field_width, i->precision, i->slonglong);
+
+        PRINTF_ONE(i->status, "%+*.*hhi", i->field_width, i->precision, i->schar);
+        PRINTF_ONE(i->status, "%-*.*hi", i->field_width, i->precision, i->sshort);
+        PRINTF_ONE(i->status, "% *.*i", i->field_width, i->precision, i->sint);
+        PRINTF_ONE(i->status, "%0*li", i->field_width, i->slong);
+        PRINTF_ONE(i->status, "%#*.*llx", i->field_width, i->precision, i->ulonglong);
+
+        PRINTF_ONE(i->status, "%-*.*zx", i->field_width, i->precision, i->size);
+        PRINTF_ONE(i->status, "% *.*zi", i->field_width, i->precision, i->ssize);
+        PRINTF_ONE(i->status, "%0*ji", i->field_width, i->intmax);
+        PRINTF_ONE(i->status, "%#0*jX", i->field_width, i->uintmax);
+        PRINTF_ONE(i->status, "%*.*ti", i->field_width, i->precision, i->ptrdiff);
+
+        return 0;
+}
diff --git a/src/boot/efi/fuzz-efi-string.c b/src/boot/efi/fuzz-efi-string.c
new file mode 100644
index 0000000..36ecaf9
--- /dev/null
+++ b/src/boot/efi/fuzz-efi-string.c
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "efi-string.h"
+#include "fuzz.h"
+#include "utf8.h"
+
+static char16_t *memdup_str16(const uint8_t *data, size_t size) {
+        assert_se(size >= sizeof(char16_t)); /* otherwise the terminator index below would wrap around */
+        char16_t *ret = ASSERT_SE_PTR(memdup(data, size)); /* duplicate of the input; must not be NULL */
+        ret[size / sizeof(char16_t) - 1] = '\0'; /* force a NUL into the last complete char16_t unit */
+        return ret;
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        if (outside_size_range(size, sizeof(size_t), 64 * 1024))
+                return 0;
+
+        fuzz_setup_logging();
+
+        /* The first sizeof(size_t) bytes give the split point; the rest is the payload under test. */
+        size_t len;
+        memcpy(&len, data, sizeof(len));
+        data += sizeof(len);
+        size -= sizeof(len);
+        if (len > size || len < sizeof(char16_t) || size - len < sizeof(char16_t))
+                return 0;
+        size_t len2 = size - len;
+
+        const char *rest8 = NULL;
+        _cleanup_free_ char *str8 = ASSERT_SE_PTR(memdup_suffix0(data, size));
+        DO_NOT_OPTIMIZE(parse_number8(str8, &(uint64_t){ 0 }, size % 2 == 0 ? NULL : &rest8));
+
+        const char16_t *rest16 = NULL;
+        _cleanup_free_ char16_t *str16 = memdup_str16(data, size);
+        DO_NOT_OPTIMIZE(parse_number16(str16, &(uint64_t){ 0 }, size % 2 == 0 ? NULL : &rest16));
+
+        _cleanup_free_ char16_t *pattern = memdup_str16(data, len);
+        _cleanup_free_ char16_t *haystack = memdup_str16(data + len, len2);
+        DO_NOT_OPTIMIZE(efi_fnmatch(pattern, haystack));
+        return 0;
+}
diff --git a/src/boot/efi/graphics.c b/src/boot/efi/graphics.c
new file mode 100644
index 0000000..496fc69
--- /dev/null
+++ b/src/boot/efi/graphics.c
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Copyright © 2013 Intel Corporation
+ *   Authored by Joonas Lahtinen 
+ */
+
+#include "graphics.h"
+#include "proto/console-control.h"
+#include "proto/simple-text-io.h"
+#include "util.h"
+
+EFI_STATUS graphics_mode(bool on) {
+        /* Switch the console to graphics (on) or text (!on) mode via the console control protocol. */
+        EFI_CONSOLE_CONTROL_PROTOCOL *ConsoleControl = NULL;
+        EFI_CONSOLE_CONTROL_SCREEN_MODE new, current;
+        bool uga_exists, stdin_locked;
+        EFI_STATUS err;
+
+        err = BS->LocateProtocol(MAKE_GUID_PTR(EFI_CONSOLE_CONTROL_PROTOCOL), NULL, (void **) &ConsoleControl);
+        if (err != EFI_SUCCESS)
+                /* console control protocol is nonstandard and might not exist. */
+                return err == EFI_NOT_FOUND ? EFI_SUCCESS : err;
+
+        /* check current mode */
+        err = ConsoleControl->GetMode(ConsoleControl, &current, &uga_exists, &stdin_locked);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* do not touch the mode if it already is the requested one */
+        new = on ? EfiConsoleControlScreenGraphics : EfiConsoleControlScreenText;
+        if (new == current)
+                return EFI_SUCCESS;
+
+        log_wait();
+        err = ConsoleControl->SetMode(ConsoleControl, new);
+
+        /* some firmware enables the cursor when switching modes */
+        ST->ConOut->EnableCursor(ST->ConOut, false);
+
+        return err;
+}
diff --git a/src/boot/efi/graphics.h b/src/boot/efi/graphics.h
new file mode 100644
index 0000000..33ab7f8
--- /dev/null
+++ b/src/boot/efi/graphics.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Copyright © 2013 Intel Corporation
+ *   Authored by Joonas Lahtinen 
+ */
+#pragma once
+
+#include "efi.h"
+
+EFI_STATUS graphics_mode(bool on);
diff --git a/src/boot/efi/initrd.c b/src/boot/efi/initrd.c
new file mode 100644
index 0000000..527b05f
--- /dev/null
+++ b/src/boot/efi/initrd.c
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "initrd.h"
+#include "macro-fundamental.h"
+#include "proto/device-path.h"
+#include "proto/load-file.h"
+#include "util.h"
+
+#define LINUX_INITRD_MEDIA_GUID \
+        GUID_DEF(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68)
+
+/* extend LoadFileProtocol: load_file is the first member, and initrd_load_file() casts the protocol pointer back */
+struct initrd_loader {
+        EFI_LOAD_FILE_PROTOCOL load_file;
+        const void *address; /* start of the initrd data that initrd_load_file() copies out */
+        size_t length; /* size of that data in bytes */
+};
+
+/* static structure for LINUX_INITRD_MEDIA device path: one vendor media node followed by the end node
+   see https://github.com/torvalds/linux/blob/v5.13/drivers/firmware/efi/libstub/efi-stub-helper.c
+ */
+static const struct {
+        VENDOR_DEVICE_PATH vendor;
+        EFI_DEVICE_PATH end;
+} _packed_ efi_initrd_device_path = {
+        .vendor = {
+                .Header = {
+                        .Type = MEDIA_DEVICE_PATH,
+                        .SubType = MEDIA_VENDOR_DP,
+                        .Length = sizeof(efi_initrd_device_path.vendor),
+                },
+                .Guid = LINUX_INITRD_MEDIA_GUID
+        },
+        .end = {
+                .Type = END_DEVICE_PATH_TYPE,
+                .SubType = END_ENTIRE_DEVICE_PATH_SUBTYPE,
+                .Length = sizeof(efi_initrd_device_path.end),
+        }
+};
+
+static EFIAPI EFI_STATUS initrd_load_file(
+                EFI_LOAD_FILE_PROTOCOL *this,
+                EFI_DEVICE_PATH *file_path,
+                bool boot_policy,
+                size_t *buffer_size,
+                void *buffer) {
+
+        /* LoadFile callback of struct initrd_loader: reports the initrd size and copies it out if it fits. */
+        if (!this || !buffer_size || !file_path)
+                return EFI_INVALID_PARAMETER;
+        if (boot_policy)
+                return EFI_UNSUPPORTED;
+
+        /* load_file is the first member of struct initrd_loader, so the protocol pointer is the loader. */
+        const struct initrd_loader *self = (const struct initrd_loader *) this;
+        size_t required = self->length;
+        if (!self->address || required == 0)
+                return EFI_NOT_FOUND;
+
+        if (buffer && *buffer_size >= required) {
+                memcpy(buffer, self->address, required);
+                *buffer_size = required;
+                return EFI_SUCCESS;
+        }
+
+        *buffer_size = required;
+        return EFI_BUFFER_TOO_SMALL;
+}
+
+EFI_STATUS initrd_register(
+                const void *initrd_address,
+                size_t initrd_length,
+                EFI_HANDLE *ret_initrd_handle) {
+        /* Publishes the given initrd buffer through a LoadFile2 protocol; does nothing when there is no initrd. */
+        EFI_STATUS err;
+        EFI_DEVICE_PATH *dp = (EFI_DEVICE_PATH *) &efi_initrd_device_path;
+        EFI_HANDLE handle;
+        struct initrd_loader *loader;
+
+        assert(ret_initrd_handle);
+
+        if (!initrd_address || initrd_length == 0)
+                return EFI_SUCCESS; /* nothing to register; *ret_initrd_handle is left untouched */
+
+        /* check if a LINUX_INITRD_MEDIA_GUID DevicePath is already registered.
+           LocateDevicePath checks for the "closest DevicePath" and returns its handle,
+           whereas InstallMultipleProtocolInterfaces only matches identical DevicePaths.
+         */
+        err = BS->LocateDevicePath(MAKE_GUID_PTR(EFI_LOAD_FILE2_PROTOCOL), &dp, &handle);
+        if (err != EFI_NOT_FOUND) /* any other result is treated as: InitrdMedia is already registered */
+                return EFI_ALREADY_STARTED;
+
+        loader = xnew(struct initrd_loader, 1);
+        *loader = (struct initrd_loader) {
+                .load_file.LoadFile = initrd_load_file,
+                .address = initrd_address,
+                .length = initrd_length
+        };
+
+        /* create a new handle and register the LoadFile2 protocol with the InitrdMediaPath on it */
+        err = BS->InstallMultipleProtocolInterfaces(
+                        ret_initrd_handle, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL),
+                        &efi_initrd_device_path, MAKE_GUID_PTR(EFI_LOAD_FILE2_PROTOCOL),
+                        loader,
+                        NULL);
+        if (err != EFI_SUCCESS)
+                free(loader); /* not installed, so the loader is still ours to release */
+
+        return err;
+}
+
+EFI_STATUS initrd_unregister(EFI_HANDLE initrd_handle) {
+        EFI_STATUS err;
+        struct initrd_loader *loader;
+
+        /* Tears down what initrd_register() installed; a NULL handle is accepted and treated as nothing to do. */
+        if (!initrd_handle)
+                return EFI_SUCCESS;
+
+        /* get the LoadFile2 protocol that we allocated earlier */
+        err = BS->HandleProtocol(initrd_handle, MAKE_GUID_PTR(EFI_LOAD_FILE2_PROTOCOL), (void **) &loader);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* uninstall all protocols thus destroying the handle */
+        err = BS->UninstallMultipleProtocolInterfaces(
+                        initrd_handle, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL),
+                        &efi_initrd_device_path, MAKE_GUID_PTR(EFI_LOAD_FILE2_PROTOCOL),
+                        loader,
+                        NULL);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        free(loader);
+        return EFI_SUCCESS;
+}
diff --git a/src/boot/efi/initrd.h b/src/boot/efi/initrd.h
new file mode 100644
index 0000000..e7685ae
--- /dev/null
+++ b/src/boot/efi/initrd.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+EFI_STATUS initrd_register(
+                const void *initrd_address,
+                size_t initrd_length,
+                EFI_HANDLE *ret_initrd_handle);
+
+EFI_STATUS initrd_unregister(EFI_HANDLE initrd_handle);
+
+static inline void cleanup_initrd(EFI_HANDLE *initrd_handle) { /* used as a _cleanup_() handler in linux.c */
+        (void) initrd_unregister(*initrd_handle); /* best effort: the returned status is dropped */
+        *initrd_handle = NULL;
+}
diff --git a/src/boot/efi/linux.c b/src/boot/efi/linux.c
new file mode 100644
index 0000000..65bc176
--- /dev/null
+++ b/src/boot/efi/linux.c
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/*
+ * Generic Linux boot protocol using the EFI/PE entry point of the kernel. Passes
+ * initrd with the LINUX_INITRD_MEDIA_GUID DevicePath and cmdline with
+ * EFI LoadedImageProtocol.
+ *
+ * This method works for Linux 5.8 and newer on ARM/Aarch64, x86/x86_64 and RISC-V.
+ */
+
+#include "initrd.h"
+#include "linux.h"
+#include "pe.h"
+#include "proto/device-path.h"
+#include "proto/loaded-image.h"
+#include "secure-boot.h"
+#include "util.h"
+
+#define STUB_PAYLOAD_GUID \
+        { 0x55c5d1f8, 0x04cd, 0x46b5, { 0x8a, 0x20, 0xe5, 0x6c, 0xbb, 0x30, 0x52, 0xd0 } }
+
+typedef struct {
+        const void *addr;
+        size_t len;
+        const EFI_DEVICE_PATH *device_path;
+} ValidationContext;
+
+static bool validate_payload(
+                const void *ctx, const EFI_DEVICE_PATH *device_path, const void *file_buffer, size_t file_size) {
+
+        /* Accepts an image only when it matches the payload recorded in the ValidationContext behind ctx. */
+        const ValidationContext *expected = ASSERT_PTR(ctx);
+
+        if (device_path != expected->device_path)
+                return false;
+
+        /* Security arch (1) protocol does not provide a file buffer. Instead we are supposed to fetch the payload
+         * ourselves, which is not needed as we already have everything in memory and the device paths match. */
+        if (!file_buffer)
+                return true;
+        return file_buffer == expected->addr && file_size == expected->len;
+}
+
+static EFI_STATUS load_image(EFI_HANDLE parent, const void *source, size_t len, EFI_HANDLE *ret_image) {
+        assert(parent);
+        assert(source);
+        assert(ret_image);
+
+        /* We could pass a NULL device path, but it's nicer to provide something and it allows us to identify
+         * the loaded image from within the security hooks. */
+        struct {
+                VENDOR_DEVICE_PATH payload;
+                EFI_DEVICE_PATH end;
+        } _packed_ payload_device_path = {
+                .payload = {
+                        .Header = {
+                                .Type = MEDIA_DEVICE_PATH,
+                                .SubType = MEDIA_VENDOR_DP,
+                                .Length = sizeof(payload_device_path.payload),
+                        },
+                        .Guid = STUB_PAYLOAD_GUID,
+                },
+                .end = {
+                        .Type = END_DEVICE_PATH_TYPE,
+                        .SubType = END_ENTIRE_DEVICE_PATH_SUBTYPE,
+                        .Length = sizeof(payload_device_path.end),
+                },
+        };
+
+        /* We want to support unsigned kernel images as payload, which is safe to do under secure boot
+         * because it is embedded in this stub loader (and since it is already running it must be trusted). */
+        install_security_override(
+                        validate_payload,
+                        &(ValidationContext) { /* the values validate_payload() compares against */
+                                .addr = source,
+                                .len = len,
+                                .device_path = &payload_device_path.payload.Header,
+                        });
+
+        EFI_STATUS ret = BS->LoadImage(
+                        /*BootPolicy=*/false,
+                        parent,
+                        &payload_device_path.payload.Header, /* same pointer as stored in the context above */
+                        (void *) source, /* cast drops const for the firmware prototype */
+                        len,
+                        ret_image);
+
+        uninstall_security_override(); /* removed again regardless of the LoadImage result */
+
+        return ret;
+}
+
+EFI_STATUS linux_exec(
+                EFI_HANDLE parent,
+                const char16_t *cmdline,
+                const void *linux_buffer,
+                size_t linux_length,
+                const void *initrd_buffer,
+                size_t initrd_length) {
+        /* Loads and starts the kernel image via LoadImage/StartImage, handing it the cmdline and the initrd. */
+        uint32_t compat_address;
+        EFI_STATUS err;
+
+        assert(parent);
+        assert(linux_buffer && linux_length > 0);
+        assert(initrd_buffer || initrd_length == 0);
+
+        err = pe_kernel_info(linux_buffer, &compat_address);
+#if defined(__i386__) || defined(__x86_64__)
+        if (err == EFI_UNSUPPORTED)
+                /* Kernel is too old to support LINUX_INITRD_MEDIA_GUID, try the deprecated EFI handover
+                 * protocol. */
+                return linux_exec_efi_handover(
+                                parent,
+                                cmdline,
+                                linux_buffer,
+                                linux_length,
+                                initrd_buffer,
+                                initrd_length);
+#endif
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Bad kernel image: %m");
+
+        _cleanup_(unload_imagep) EFI_HANDLE kernel_image = NULL;
+        err = load_image(parent, linux_buffer, linux_length, &kernel_image);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error loading kernel image: %m");
+
+        EFI_LOADED_IMAGE_PROTOCOL *loaded_image;
+        err = BS->HandleProtocol(
+                        kernel_image, MAKE_GUID_PTR(EFI_LOADED_IMAGE_PROTOCOL), (void **) &loaded_image);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error getting kernel loaded image protocol: %m");
+
+        if (cmdline) { /* the command line is handed over as the image's load options */
+                loaded_image->LoadOptions = (void *) cmdline;
+                loaded_image->LoadOptionsSize = strsize16(loaded_image->LoadOptions);
+        }
+
+        _cleanup_(cleanup_initrd) EFI_HANDLE initrd_handle = NULL; /* unregistered again when this scope is left */
+        err = initrd_register(initrd_buffer, initrd_length, &initrd_handle);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error registering initrd: %m");
+
+        log_wait();
+        err = BS->StartImage(kernel_image, NULL, NULL);
+
+        /* Try calling the kernel compat entry point if one exists. */
+        if (err == EFI_UNSUPPORTED && compat_address > 0) {
+                EFI_IMAGE_ENTRY_POINT compat_entry =
+                                (EFI_IMAGE_ENTRY_POINT) ((uint8_t *) loaded_image->ImageBase + compat_address);
+                err = compat_entry(kernel_image, ST);
+        }
+
+        return log_error_status(err, "Error starting kernel image: %m"); /* reached only once the image returned */
+}
diff --git a/src/boot/efi/linux.h b/src/boot/efi/linux.h
new file mode 100644
index 0000000..46b5f4f
--- /dev/null
+++ b/src/boot/efi/linux.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+EFI_STATUS linux_exec(
+                EFI_HANDLE parent,
+                const char16_t *cmdline,
+                const void *linux_buffer,
+                size_t linux_length,
+                const void *initrd_buffer,
+                size_t initrd_length);
+EFI_STATUS linux_exec_efi_handover(
+                EFI_HANDLE parent,
+                const char16_t *cmdline,
+                const void *linux_buffer,
+                size_t linux_length,
+                const void *initrd_buffer,
+                size_t initrd_length);
diff --git a/src/boot/efi/linux_x86.c b/src/boot/efi/linux_x86.c
new file mode 100644
index 0000000..757902d
--- /dev/null
+++ b/src/boot/efi/linux_x86.c
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/*
+ * x86 specific code for the EFI handover boot protocol.
+ * Linux kernels version 5.8 and newer support providing the initrd by
+ * LINUX_INITRD_MEDIA_GUID DevicePath. In order to support older kernels too,
+ * this x86 specific linux_exec function passes the initrd by setting the
+ * corresponding fields in the setup_header struct.
+ *
+ * see https://docs.kernel.org/x86/boot.html
+ */
+
+#include "initrd.h"
+#include "linux.h"
+#include "macro-fundamental.h"
+#include "util.h"
+
+#define KERNEL_SECTOR_SIZE 512u
+#define BOOT_FLAG_MAGIC    0xAA55u
+#define SETUP_MAGIC        0x53726448u /* "HdrS" */
+#define SETUP_VERSION_2_11 0x20bu
+#define SETUP_VERSION_2_12 0x20cu
+#define SETUP_VERSION_2_15 0x20fu
+#define CMDLINE_PTR_MAX    0xA0000u
+
+enum {
+        XLF_KERNEL_64              = 1 << 0,
+        XLF_CAN_BE_LOADED_ABOVE_4G = 1 << 1,
+        XLF_EFI_HANDOVER_32        = 1 << 2,
+        XLF_EFI_HANDOVER_64        = 1 << 3,
+#ifdef __x86_64__
+        XLF_EFI_HANDOVER           = XLF_EFI_HANDOVER_64,
+#else
+        XLF_EFI_HANDOVER           = XLF_EFI_HANDOVER_32,
+#endif
+};
+
+typedef struct {
+        uint8_t  setup_sects;
+        uint16_t root_flags;
+        uint32_t syssize;
+        uint16_t ram_size;
+        uint16_t vid_mode;
+        uint16_t root_dev;
+        uint16_t boot_flag;
+        uint8_t  jump; /* We split the 2-byte jump field from the spec in two for convenience. */
+        uint8_t  setup_size;
+        uint32_t header;
+        uint16_t version;
+        uint32_t realmode_swtch;
+        uint16_t start_sys_seg;
+        uint16_t kernel_version;
+        uint8_t  type_of_loader;
+        uint8_t  loadflags;
+        uint16_t setup_move_size;
+        uint32_t code32_start;
+        uint32_t ramdisk_image;
+        uint32_t ramdisk_size;
+        uint32_t bootsect_kludge;
+        uint16_t heap_end_ptr;
+        uint8_t  ext_loader_ver;
+        uint8_t  ext_loader_type;
+        uint32_t cmd_line_ptr;
+        uint32_t initrd_addr_max;
+        uint32_t kernel_alignment;
+        uint8_t  relocatable_kernel;
+        uint8_t  min_alignment;
+        uint16_t xloadflags;
+        uint32_t cmdline_size;
+        uint32_t hardware_subarch;
+        uint64_t hardware_subarch_data;
+        uint32_t payload_offset;
+        uint32_t payload_length;
+        uint64_t setup_data;
+        uint64_t pref_address;
+        uint32_t init_size;
+        uint32_t handover_offset;
+} _packed_ SetupHeader;
+
+/* We really only care about a few fields, but we still have to provide a full (4096 byte) page. */
+typedef struct {
+        uint8_t pad[192];
+        uint32_t ext_ramdisk_image; /* offset 0x0C0, checked by the assert_cc() below */
+        uint32_t ext_ramdisk_size;
+        uint32_t ext_cmd_line_ptr;
+        uint8_t pad2[293];
+        SetupHeader hdr; /* starts at offset 0x1F1 (192 + 3 * 4 + 293) since the struct is packed */
+        uint8_t pad3[3480]; /* pads the struct up to the 4096 bytes asserted below */
+} _packed_ BootParams;
+assert_cc(offsetof(BootParams, ext_ramdisk_image) == 0x0C0);
+assert_cc(sizeof(BootParams) == 4096);
+
+#ifdef __i386__
+#  define __regparm0__ __attribute__((regparm(0)))
+#else
+#  define __regparm0__
+#endif
+
+typedef void (*handover_f)(void *parent, EFI_SYSTEM_TABLE *table, BootParams *params) __regparm0__
+                __attribute__((sysv_abi));
+
+static void linux_efi_handover(EFI_HANDLE parent, uintptr_t kernel, BootParams *params) {
+        assert(params);
+
+        kernel += (params->hdr.setup_sects + 1) * KERNEL_SECTOR_SIZE; /* 32-bit entry address. */
+
+        /* Old kernels need this set, while newer ones seem to ignore this. */
+        params->hdr.code32_start = kernel; /* must be stored before the handover offsets are added below */
+
+#ifdef __x86_64__
+        kernel += KERNEL_SECTOR_SIZE; /* 64-bit entry address. */
+#endif
+
+        kernel += params->hdr.handover_offset; /* 32/64-bit EFI handover address. */
+
+        /* Note in EFI mixed mode this now points to the correct 32-bit handover entry point, allowing a 64-bit
+         * kernel to be booted from a 32-bit sd-stub. */
+
+        handover_f handover = (handover_f) kernel;
+        handover(parent, ST, params);
+}
+
+EFI_STATUS linux_exec_efi_handover(
+                EFI_HANDLE parent,
+                const char16_t *cmdline,
+                const void *linux_buffer,
+                size_t linux_length,
+                const void *initrd_buffer,
+                size_t initrd_length) {
+        /* Boots the kernel through the x86 EFI handover entry point; every return from here is an error. */
+        assert(parent);
+        assert(linux_buffer);
+        assert(initrd_buffer || initrd_length == 0);
+
+        if (linux_length < sizeof(BootParams))
+                return EFI_LOAD_ERROR;
+
+        const BootParams *image_params = (const BootParams *) linux_buffer;
+        if (image_params->hdr.header != SETUP_MAGIC || image_params->hdr.boot_flag != BOOT_FLAG_MAGIC)
+                return log_error_status(EFI_UNSUPPORTED, "Unsupported kernel image.");
+        if (image_params->hdr.version < SETUP_VERSION_2_11)
+                return log_error_status(EFI_UNSUPPORTED, "Kernel too old.");
+        if (!image_params->hdr.relocatable_kernel)
+                return log_error_status(EFI_UNSUPPORTED, "Kernel is not relocatable.");
+
+        /* The xloadflags were added in version 2.12+ of the boot protocol but the handover support predates
+         * that, so we cannot safety-check this for 2.11. */
+        if (image_params->hdr.version >= SETUP_VERSION_2_12 &&
+            !FLAGS_SET(image_params->hdr.xloadflags, XLF_EFI_HANDOVER))
+                return log_error_status(EFI_UNSUPPORTED, "Kernel does not support EFI handover protocol.");
+
+        bool can_4g = image_params->hdr.version >= SETUP_VERSION_2_12 &&
+                        FLAGS_SET(image_params->hdr.xloadflags, XLF_CAN_BE_LOADED_ABOVE_4G);
+
+        /* There is no way to pass the high bits of code32_start. Newer kernels seem to handle this
+         * just fine, but older kernels will fail even if they otherwise have above 4G boot support. */
+        _cleanup_pages_ Pages linux_relocated = {};
+        if (POINTER_TO_PHYSICAL_ADDRESS(linux_buffer) + linux_length > UINT32_MAX) {
+                linux_relocated = xmalloc_pages(
+                                AllocateMaxAddress, EfiLoaderCode, EFI_SIZE_TO_PAGES(linux_length), UINT32_MAX);
+                linux_buffer = memcpy(
+                                PHYSICAL_ADDRESS_TO_POINTER(linux_relocated.addr), linux_buffer, linux_length);
+        }
+
+        _cleanup_pages_ Pages initrd_relocated = {};
+        if (!can_4g && POINTER_TO_PHYSICAL_ADDRESS(initrd_buffer) + initrd_length > UINT32_MAX) {
+                initrd_relocated = xmalloc_pages(
+                                AllocateMaxAddress, EfiLoaderData, EFI_SIZE_TO_PAGES(initrd_length), UINT32_MAX);
+                initrd_buffer = memcpy(
+                                PHYSICAL_ADDRESS_TO_POINTER(initrd_relocated.addr),
+                                initrd_buffer,
+                                initrd_length);
+        }
+
+        _cleanup_pages_ Pages boot_params_page = xmalloc_pages(
+                        can_4g ? AllocateAnyPages : AllocateMaxAddress,
+                        EfiLoaderData,
+                        EFI_SIZE_TO_PAGES(sizeof(BootParams)),
+                        UINT32_MAX /* Below the 4G boundary */);
+        BootParams *boot_params = PHYSICAL_ADDRESS_TO_POINTER(boot_params_page.addr);
+        *boot_params = (BootParams){}; /* start from an all-zero page */
+
+        /* Setup size is determined by offset 0x0202 + byte value at offset 0x0201, which is the same as
+         * offset of the header field and the target from the jump field (which we split for this reason). */
+        memcpy(&boot_params->hdr,
+               &image_params->hdr,
+               offsetof(SetupHeader, header) + image_params->hdr.setup_size);
+
+        boot_params->hdr.type_of_loader = 0xff; /* presumably the boot protocol's "undefined" loader ID; confirm */
+
+        /* Spec says: For backwards compatibility, if the setup_sects field contains 0, the real value is 4. */
+        if (boot_params->hdr.setup_sects == 0)
+                boot_params->hdr.setup_sects = 4;
+
+        _cleanup_pages_ Pages cmdline_pages = {};
+        if (cmdline) {
+                size_t len = MIN(strlen16(cmdline), image_params->hdr.cmdline_size); /* truncate to the kernel's limit */
+
+                cmdline_pages = xmalloc_pages(
+                                can_4g ? AllocateAnyPages : AllocateMaxAddress,
+                                EfiLoaderData,
+                                EFI_SIZE_TO_PAGES(len + 1),
+                                CMDLINE_PTR_MAX);
+
+                /* Convert cmdline to ASCII. */
+                char *cmdline8 = PHYSICAL_ADDRESS_TO_POINTER(cmdline_pages.addr);
+                for (size_t i = 0; i < len; i++)
+                        cmdline8[i] = cmdline[i] <= 0x7E ? cmdline[i] : ' '; /* anything above 0x7E becomes a space */
+                cmdline8[len] = '\0';
+
+                boot_params->hdr.cmd_line_ptr = (uint32_t) cmdline_pages.addr; /* low 32 bits */
+                boot_params->ext_cmd_line_ptr = cmdline_pages.addr >> 32; /* high 32 bits */
+                assert(can_4g || cmdline_pages.addr <= CMDLINE_PTR_MAX);
+        }
+
+        boot_params->hdr.ramdisk_image = (uintptr_t) initrd_buffer;
+        boot_params->ext_ramdisk_image = POINTER_TO_PHYSICAL_ADDRESS(initrd_buffer) >> 32;
+        boot_params->hdr.ramdisk_size = initrd_length;
+        boot_params->ext_ramdisk_size = ((uint64_t) initrd_length) >> 32;
+
+        log_wait();
+        linux_efi_handover(parent, (uintptr_t) linux_buffer, boot_params);
+        return EFI_LOAD_ERROR; /* only reached if the handover entry point returned */
+}
diff --git a/src/boot/efi/log.c b/src/boot/efi/log.c
new file mode 100644
index 0000000..364471e
--- /dev/null
+++ b/src/boot/efi/log.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "log.h"
+#include "proto/rng.h"
+#include "proto/simple-text-io.h"
+#include "util.h"
+
+static unsigned log_count = 0;
+
+void freeze(void) {
+        while (true) /* Halt for good: stall in one-minute (60 * 1000 * 1000 us) slices and never return. */
+                BS->Stall(60 * 1000 * 1000);
+}
+
+_noreturn_ static void panic(const char16_t *message) { /* Print message in light red on the console, then hang. */
+        if (ST->ConOut->Mode->CursorColumn > 0) /* cursor is mid-line: begin a fresh line first */
+                ST->ConOut->OutputString(ST->ConOut, (char16_t *) u"\r\n");
+        ST->ConOut->SetAttribute(ST->ConOut, EFI_TEXT_ATTR(EFI_LIGHTRED, EFI_BLACK));
+        ST->ConOut->OutputString(ST->ConOut, (char16_t *) message); /* written directly, without log_internal() */
+        freeze(); /* never returns */
+}
+
+void efi_assert(const char *expr, const char *file, unsigned line, const char *function) {
+        /* Latched once a failure report has begun, so that a re-entry can be detected. */
+        static bool reporting = false;
+        /* A nested failure skips log_error() and halts through the simpler panic() path. */
+        if (reporting)
+                panic(u"systemd-boot: Nested assertion failure, halting.");
+        reporting = true;
+
+        log_error("systemd-boot: Assertion '%s' failed at %s:%u@%s, halting.", expr, file, line, function);
+        freeze();
+}
+
+EFI_STATUS log_internal(EFI_STATUS status, const char *format, ...) {
+        va_list args;
+        assert(format);
+
+        /* Remember the current text attribute so it can be put back after the red message. */
+        int32_t saved_attr = ST->ConOut->Mode->Attribute;
+        if (ST->ConOut->Mode->CursorColumn > 0)
+                ST->ConOut->OutputString(ST->ConOut, (char16_t *) u"\r\n");
+        ST->ConOut->SetAttribute(ST->ConOut, EFI_TEXT_ATTR(EFI_LIGHTRED, EFI_BLACK));
+
+        va_start(args, format);
+        vprintf_status(status, format, args);
+        va_end(args);
+
+        ST->ConOut->OutputString(ST->ConOut, (char16_t *) u"\r\n");
+        ST->ConOut->SetAttribute(ST->ConOut, saved_attr);
+
+        log_count += 1; /* read by log_wait() to size its pause */
+        return status;
+}
+
+#ifdef EFI_DEBUG
+void log_hexdump(const char16_t *prefix, const void *data, size_t size) {
+        /* Debug-only helper (EFI_DEBUG builds); intentionally kept even while nothing calls it. */
+        _cleanup_free_ char16_t *dump = hexdump(data, size);
+
+        log_internal(EFI_SUCCESS, "%ls[%zu]: %ls", prefix, size, dump);
+}
+#endif
+
+void log_wait(void) {
+        /* Give the user time to read: 2.5 s (Stall() takes microseconds) per logged message, capped at 4 messages. */
+        if (log_count > 0) {
+                BS->Stall(MIN(4u, log_count) * 2500 * 1000);
+                log_count = 0;
+        }
+}
+
+_used_ intptr_t __stack_chk_guard = (intptr_t) 0x70f6967de78acae3;
+
+/* We can only set a random stack canary if this function attribute is available,
+ * otherwise this may create a stack check fail. */
+#if STACK_PROTECTOR_RANDOM
+void __stack_chk_guard_init(void) {
+        EFI_RNG_PROTOCOL *rng; /* Preferred canary source: the firmware RNG writes straight into the guard. */
+        if (BS->LocateProtocol(MAKE_GUID_PTR(EFI_RNG_PROTOCOL), NULL, (void **) &rng) == EFI_SUCCESS &&
+            rng->GetRNG(rng, NULL, sizeof(__stack_chk_guard), (void *) &__stack_chk_guard) == EFI_SUCCESS)
+                return;
+        /* No RNG protocol, or GetRNG() failed: mixing in __executable_start is better than no extra entropy. */
+        __stack_chk_guard ^= (intptr_t) __executable_start;
+}
+#endif
+
+_used_ _noreturn_ void __stack_chk_fail(void);
+_used_ _noreturn_ void __stack_chk_fail_local(void);
+void __stack_chk_fail(void) { /* handler for a failed stack-protector check: report and halt */
+        panic(u"systemd-boot: Stack check failed, halting.");
+}
+void __stack_chk_fail_local(void) { /* second entry point with the identical outcome */
+        panic(u"systemd-boot: Stack check failed, halting.");
+}
+
+/* libgcc calls abort() on some fatal errors (e.g. integer overflow trapped by -ftrapv); halt via panic(). */
+_used_ _noreturn_ void abort(void);
+void abort(void) {
+        panic(u"systemd-boot: Unknown error, halting.");
+}
+
+#if defined(__ARM_EABI__)
+/* These override the (weak) div0 handlers from libgcc, which would otherwise call raise(); both halt via panic(). */
+_used_ _noreturn_ int __aeabi_idiv0(int return_value);
+_used_ _noreturn_ long long __aeabi_ldiv0(long long return_value);
+
+int __aeabi_idiv0(int return_value) { /* return_value is unused: panic() never returns */
+        panic(u"systemd-boot: Division by zero, halting.");
+}
+
+long long __aeabi_ldiv0(long long return_value) { /* long long counterpart of __aeabi_idiv0() */
+        panic(u"systemd-boot: Division by zero, halting.");
+}
+#endif
diff --git a/src/boot/efi/log.h b/src/boot/efi/log.h
new file mode 100644
index 0000000..13f3887
--- /dev/null
+++ b/src/boot/efi/log.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi-string.h"
+
+#if defined __has_attribute
+#  if __has_attribute(no_stack_protector)
+#    define HAVE_NO_STACK_PROTECTOR_ATTRIBUTE /* compiler can exempt single functions from stack protection */
+#  endif
+#endif
+
+#if defined(HAVE_NO_STACK_PROTECTOR_ATTRIBUTE) && \
+    (defined(__SSP__) || defined(__SSP_ALL__) || \
+    defined(__SSP_STRONG__) || defined(__SSP_EXPLICIT__))
+#  define STACK_PROTECTOR_RANDOM 1 /* stack protector active and the init function can be exempted from it */
+__attribute__((no_stack_protector, noinline)) void __stack_chk_guard_init(void);
+#else /* no stack protector in use, or the attribute is unavailable */
+#  define STACK_PROTECTOR_RANDOM 0
+#  define __stack_chk_guard_init() /* expands to nothing, so callers need no #if */
+#endif
+
+_noreturn_ void freeze(void); /* stalls forever */
+void log_wait(void); /* pauses if anything was logged since the last call, so the messages can be read */
+_gnu_printf_(2, 3) EFI_STATUS log_internal(EFI_STATUS status, const char *format, ...); /* prints the message, returns status unchanged */
+#define log_error_status(status, ...) log_internal(status, __VA_ARGS__)
+#define log_error(...) log_internal(EFI_INVALID_PARAMETER, __VA_ARGS__) /* evaluates to EFI_INVALID_PARAMETER */
+#define log_oom() log_internal(EFI_OUT_OF_RESOURCES, "Out of memory.") /* evaluates to EFI_OUT_OF_RESOURCES */
+#define log_trace() log_internal(EFI_SUCCESS, "%s:%i@%s", __FILE__, __LINE__, __func__) /* logs the current source position */
+
+#ifdef EFI_DEBUG
+void log_hexdump(const char16_t *prefix, const void *data, size_t size);
+#endif
diff --git a/src/boot/efi/measure.c b/src/boot/efi/measure.c
new file mode 100644
index 0000000..01c97c8
--- /dev/null
+++ b/src/boot/efi/measure.c
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if ENABLE_TPM
+
+#include "macro-fundamental.h"
+#include "measure.h"
+#include "memory-util-fundamental.h"
+#include "proto/tcg.h"
+#include "tpm2-pcr.h"
+#include "util.h"
+
+static EFI_STATUS tpm1_measure_to_pcr_and_event_log(
+                const EFI_TCG_PROTOCOL *tcg,
+                uint32_t pcrindex,
+                EFI_PHYSICAL_ADDRESS buffer,
+                size_t buffer_size,
+                const char16_t *description) {
+
+        assert(tcg);
+        assert(description);
+
+        /* The log entry is a TCG_PCR_EVENT header followed by the description, sized by strsize16(). */
+        size_t payload_size = strsize16(description);
+        _cleanup_free_ TCG_PCR_EVENT *entry = xmalloc(offsetof(TCG_PCR_EVENT, Event) + payload_size);
+        *entry = (TCG_PCR_EVENT) {
+                .PCRIndex = pcrindex,
+                .EventType = EV_IPL,
+                .EventSize = payload_size,
+        };
+        memcpy(entry->Event, description, payload_size);
+
+        /* Both are handed to the firmware by address; this function does not read them back. */
+        uint32_t event_number = 1;
+        EFI_PHYSICAL_ADDRESS event_log_last;
+
+        return tcg->HashLogExtendEvent(
+                        (EFI_TCG_PROTOCOL *) tcg, /* cast drops const for the call */
+                        buffer, buffer_size,
+                        TCG_ALG_SHA,
+                        entry,
+                        &event_number,
+                        &event_log_last);
+}
+
+static EFI_STATUS tpm2_measure_to_pcr_and_tagged_event_log(
+                EFI_TCG2_PROTOCOL *tcg,
+                uint32_t pcrindex,
+                EFI_PHYSICAL_ADDRESS buffer,
+                uint64_t buffer_size,
+                uint32_t event_id,
+                const char16_t *description) {
+
+        /* An EFI_TCG2_EVENT header directly followed (packed) by the tagged-event payload. */
+        _cleanup_free_ struct tagged_entry {
+                EFI_TCG2_EVENT tcg_event;
+                EFI_TCG2_TAGGED_EVENT tcg_tagged_event;
+        } _packed_ *entry = NULL;
+
+        assert(tcg);
+        assert(description);
+
+        size_t payload_size = strsize16(description);
+        size_t total_size = offsetof(EFI_TCG2_EVENT, Event) + offsetof(EFI_TCG2_TAGGED_EVENT, Event) + payload_size;
+
+        entry = xmalloc(total_size);
+        *entry = (struct tagged_entry) {
+                .tcg_event = (EFI_TCG2_EVENT) {
+                        .Size = total_size,
+                        .Header.HeaderSize = sizeof(EFI_TCG2_EVENT_HEADER),
+                        .Header.HeaderVersion = EFI_TCG2_EVENT_HEADER_VERSION,
+                        .Header.PCRIndex = pcrindex,
+                        .Header.EventType = EV_EVENT_TAG,
+                },
+                .tcg_tagged_event = {
+                        .EventId = event_id,
+                        .EventSize = payload_size,
+                },
+        };
+        memcpy(entry->tcg_tagged_event.Event, description, payload_size);
+
+        return tcg->HashLogExtendEvent(
+                        tcg,
+                        0, /* flags */
+                        buffer, buffer_size,
+                        &entry->tcg_event);
+}
+
+static EFI_STATUS tpm2_measure_to_pcr_and_event_log(
+                EFI_TCG2_PROTOCOL *tcg,
+                uint32_t pcrindex,
+                EFI_PHYSICAL_ADDRESS buffer,
+                uint64_t buffer_size,
+                const char16_t *description) {
+
+        assert(tcg);
+        assert(description);
+
+        /* NB: Everything measured through here is logged as EV_IPL. That is unfortunate, since it makes
+         * our events hard to tell apart from others in the event log. Measurement logs are effectively
+         * API though, so established events cannot easily be changed. New kinds of events should use
+         * EV_EVENT_TAG with a tag of our own choosing instead, which states precisely what is being
+         * measured. */
+
+        size_t payload_size = strsize16(description);
+        size_t total_size = offsetof(EFI_TCG2_EVENT, Event) + payload_size;
+
+        _cleanup_free_ EFI_TCG2_EVENT *entry = xmalloc(total_size);
+        *entry = (EFI_TCG2_EVENT) {
+                .Size = total_size,
+                .Header.HeaderSize = sizeof(EFI_TCG2_EVENT_HEADER),
+                .Header.HeaderVersion = EFI_TCG2_EVENT_HEADER_VERSION,
+                .Header.PCRIndex = pcrindex,
+                .Header.EventType = EV_IPL,
+        };
+
+        memcpy(entry->Event, description, payload_size);
+
+        /* The description travels in the event entry; buffer/buffer_size is the data to be hashed. */
+        return tcg->HashLogExtendEvent(
+                        tcg,
+                        0, /* flags */
+                        buffer, buffer_size,
+                        entry);
+}
+
+static EFI_TCG_PROTOCOL *tcg1_interface_check(void) {
+        /* Returns the TCG protocol instance if it reports a usable TPM, NULL otherwise. */
+
+        EFI_TCG_BOOT_SERVICE_CAPABILITY capability = {
+                .Size = sizeof(capability),
+        };
+        /* Passed by address to StatusCheck() below; their values are not inspected here. */
+        EFI_PHYSICAL_ADDRESS event_log_location, event_log_last_entry;
+        uint32_t features;
+        EFI_TCG_PROTOCOL *tcg;
+
+        /* The firmware has to expose the protocol in the first place. */
+        if (BS->LocateProtocol(MAKE_GUID_PTR(EFI_TCG_PROTOCOL), NULL, (void **) &tcg) != EFI_SUCCESS)
+                return NULL;
+
+        /* Ask it for the capability block. */
+        if (tcg->StatusCheck(
+                        tcg,
+                        &capability,
+                        &features,
+                        &event_log_location,
+                        &event_log_last_entry) != EFI_SUCCESS)
+                return NULL;
+
+        /* The protocol alone is not enough: the TPM must be present and must not be deactivated. */
+        if (capability.TPMDeactivatedFlag || !capability.TPMPresentFlag)
+                return NULL;
+
+        return tcg;
+}
+
+static EFI_TCG2_PROTOCOL *tcg2_interface_check(void) {
+        /* Returns the TCG2 protocol instance if it reports a TPM as present, NULL otherwise. */
+
+        EFI_TCG2_BOOT_SERVICE_CAPABILITY capability = {
+                .Size = sizeof(capability),
+        };
+        EFI_TCG2_PROTOCOL *tcg;
+
+        if (BS->LocateProtocol(MAKE_GUID_PTR(EFI_TCG2_PROTOCOL), NULL, (void **) &tcg) != EFI_SUCCESS)
+                return NULL;
+
+        if (tcg->GetCapability(tcg, &capability) != EFI_SUCCESS)
+                return NULL;
+
+        /* With structure version 1.0 the buffer is additionally read through the
+         * EFI_TCG_BOOT_SERVICE_CAPABILITY layout (presumably what such firmware fills in), and a
+         * presence flag found there is sufficient. */
+        bool is_1_0 = capability.StructureVersion.Major == 1 && capability.StructureVersion.Minor == 0;
+        if (is_1_0) {
+                const EFI_TCG_BOOT_SERVICE_CAPABILITY *legacy =
+                        (const EFI_TCG_BOOT_SERVICE_CAPABILITY*) &capability;
+                if (legacy->TPMPresentFlag)
+                        return tcg;
+        }
+
+        /* In every other case, including a 1.0 view without the flag, the regular field decides. */
+        return capability.TPMPresentFlag ? tcg : NULL;
+}
+
+bool tpm_present(void) {
+        return tcg2_interface_check() != NULL || tcg1_interface_check() != NULL; /* TCG2 is probed first */
+}
+
+EFI_STATUS tpm_log_event(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, const char16_t *description, bool *ret_measured) {
+        EFI_STATUS err;
+
+        assert(description || pcrindex == UINT32_MAX);
+
+        /* On EFI_SUCCESS, *ret_measured (when non-NULL) is set to true if something was really
+         * measured, and to false if measuring is disabled or no usable TPM was found. On failure it is
+         * left untouched. */
+
+        if (pcrindex == UINT32_MAX) { /* PCR disabled? */
+                if (ret_measured)
+                        *ret_measured = false;
+
+                return EFI_SUCCESS;
+        }
+
+        /* The TCG2 protocol is tried first, the older TCG protocol only as a fallback. */
+        EFI_TCG2_PROTOCOL *tpm2 = tcg2_interface_check();
+        if (tpm2)
+                err = tpm2_measure_to_pcr_and_event_log(tpm2, pcrindex, buffer, buffer_size, description);
+        else {
+                EFI_TCG_PROTOCOL *tpm1 = tcg1_interface_check();
+
+                if (!tpm1) {
+                        /* No active TPM found, so don't return an error */
+                        if (ret_measured)
+                                *ret_measured = false;
+
+                        return EFI_SUCCESS;
+                }
+
+                err = tpm1_measure_to_pcr_and_event_log(tpm1, pcrindex, buffer, buffer_size, description);
+        }
+
+        /* Success is only reported to callers that passed ret_measured. */
+        if (err == EFI_SUCCESS && ret_measured)
+                *ret_measured = true;
+
+        return err;
+}
+
+EFI_STATUS tpm_log_tagged_event(
+                uint32_t pcrindex,
+                EFI_PHYSICAL_ADDRESS buffer,
+                size_t buffer_size,
+                uint32_t event_id,
+                const char16_t *description,
+                bool *ret_measured) {
+
+        EFI_TCG2_PROTOCOL *tpm2 = NULL;
+        EFI_STATUS err;
+
+        assert(description || pcrindex == UINT32_MAX);
+        assert(event_id > 0);
+
+        /* If EFI_SUCCESS is returned, will initialize ret_measured to true if we actually measured
+         * something, or false if measurement was turned off or no usable TPM2 was found. */
+        /* Like tpm_log_event(): with the PCR disabled, do not probe the firmware at all. */
+        if (pcrindex != UINT32_MAX)
+                tpm2 = tcg2_interface_check();
+        if (!tpm2) { /* PCR disabled, or no usable TPM2 (there is no TCG 1.x path for tagged events here) */
+                if (ret_measured)
+                        *ret_measured = false;
+
+                return EFI_SUCCESS;
+        }
+
+        err = tpm2_measure_to_pcr_and_tagged_event_log(tpm2, pcrindex, buffer, buffer_size, event_id, description);
+        if (err == EFI_SUCCESS && ret_measured)
+                *ret_measured = true;
+
+        return err;
+}
+
+EFI_STATUS tpm_log_event_ascii(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, const char *description, bool *ret_measured) {
+        /* Variant of tpm_log_event() taking an 8-bit description string, which is converted with
+         * xstr8_to_16() and released again (via _cleanup_free_) on return. A NULL description is
+         * handed on as NULL. */
+        _cleanup_free_ char16_t *wide = description ? xstr8_to_16(description) : NULL;
+
+        return tpm_log_event(pcrindex, buffer, buffer_size, wide, ret_measured);
+}
+
+EFI_STATUS tpm_log_load_options(const char16_t *load_options, bool *ret_measured) {
+        /* Measures a load options string (i.e. the kernel command line) into the TPM. The string
+         * serves both as the data to hash and as the description stored with the event. */
+
+        bool did_measure = false;
+        EFI_STATUS status = tpm_log_event(
+                        TPM2_PCR_KERNEL_CONFIG,
+                        POINTER_TO_PHYSICAL_ADDRESS(load_options),
+                        strsize16(load_options),
+                        load_options,
+                        &did_measure);
+        if (status != EFI_SUCCESS)
+                return log_error_status(
+                                status,
+                                "Unable to add load options (i.e. kernel command) line measurement to PCR %i: %m",
+                                TPM2_PCR_KERNEL_CONFIG);
+
+        /* The outcome is only handed back to callers that asked for it. */
+        if (ret_measured)
+                *ret_measured = did_measure;
+
+        return EFI_SUCCESS;
+}
+
+#endif
diff --git a/src/boot/efi/measure.h b/src/boot/efi/measure.h
new file mode 100644
index 0000000..c3c4e0a
--- /dev/null
+++ b/src/boot/efi/measure.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#if ENABLE_TPM
+
+bool tpm_present(void);
+EFI_STATUS tpm_log_event(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, const char16_t *description, bool *ret_measured);
+EFI_STATUS tpm_log_event_ascii(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, const char *description, bool *ret_measured);
+EFI_STATUS tpm_log_tagged_event(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, uint32_t event_id, const char16_t *description, bool *ret_measured);
+EFI_STATUS tpm_log_load_options(const char16_t *cmdline, bool *ret_measured);
+
+#else
+
+static inline bool tpm_present(void) {
+        return false; /* stub for builds with ENABLE_TPM off: never reports a TPM */
+}
+
+static inline EFI_STATUS tpm_log_event(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, const char16_t *description, bool *ret_measured) {
+        if (ret_measured) /* out-parameter is optional */
+                *ret_measured = false; /* stub for builds with ENABLE_TPM off: nothing is measured */
+        return EFI_SUCCESS; /* and that is not reported as an error */
+}
+
+static inline EFI_STATUS tpm_log_event_ascii(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, const char *description, bool *ret_measured) {
+        if (ret_measured) /* out-parameter is optional */
+                *ret_measured = false; /* stub for builds with ENABLE_TPM off: nothing is measured */
+        return EFI_SUCCESS; /* and that is not reported as an error */
+}
+
+static inline EFI_STATUS tpm_log_tagged_event(uint32_t pcrindex, EFI_PHYSICAL_ADDRESS buffer, size_t buffer_size, uint32_t event_id, const char16_t *description, bool *ret_measured) {
+        if (ret_measured) /* out-parameter is optional */
+                *ret_measured = false; /* stub for builds with ENABLE_TPM off: nothing is measured */
+        return EFI_SUCCESS; /* and that is not reported as an error */
+}
+
+static inline EFI_STATUS tpm_log_load_options(const char16_t *cmdline, bool *ret_measured) {
+        if (ret_measured) /* out-parameter is optional */
+                *ret_measured = false; /* stub for builds with ENABLE_TPM off: nothing is measured */
+        return EFI_SUCCESS; /* and that is not reported as an error */
+}
+
+#endif
diff --git a/src/boot/efi/meson.build b/src/boot/efi/meson.build
new file mode 100644
index 0000000..c95132e
--- /dev/null
+++ b/src/boot/efi/meson.build
@@ -0,0 +1,409 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+efi_config_h_dir = meson.current_build_dir()
+efi_addon = ''  # replaced by the addon stub's full path at the end of this file
+
+libefitest = static_library(  # bcd.c and efi-string.c built against the 'userspace' dependency, for tests/fuzzers
+        'efitest',
+        files(
+                'bcd.c',
+                'efi-string.c',
+        ),
+        build_by_default : false,
+        include_directories : [
+                basic_includes,
+                include_directories('.'),
+        ],
+        dependencies : userspace)
+
+efitest_base = {
+        'link_with' : [
+                libefitest,
+                libshared,
+        ],
+}
+efi_test_template = test_template + efitest_base
+efi_fuzz_template = fuzz_template + efitest_base
+
+executables += [  # declared before the ENABLE_BOOTLOADER early exit below
+        efi_test_template + {
+                'sources' : files('test-bcd.c'),
+                'dependencies' : libzstd,
+                'conditions' : ['ENABLE_BOOTLOADER', 'HAVE_ZSTD'],
+        },
+        efi_test_template + {
+                'sources' : files('test-efi-string.c'),
+                'conditions' : ['ENABLE_BOOTLOADER'],
+        },
+        efi_fuzz_template + {
+                'sources' : files('fuzz-bcd.c'),
+        },
+        efi_fuzz_template + {
+                'sources' : files('fuzz-efi-string.c'),
+        },
+        efi_fuzz_template + {
+                'sources' : files('fuzz-efi-osrel.c'),
+        },
+        efi_fuzz_template + {
+                'sources' : files('fuzz-efi-printf.c'),
+        },
+]
+
+if conf.get('ENABLE_BOOTLOADER') != 1  # nothing below is evaluated unless the bootloader is enabled
+        subdir_done()
+endif
+
+efi_conf = configuration_data()  # written out as efi_config.h further down
+efi_conf.set10('ENABLE_TPM', get_option('tpm'))
+
+foreach ctype : ['color-normal', 'color-entry', 'color-highlight', 'color-edit']
+        c = get_option('efi-' + ctype).split(',')  # option value is two comma-separated color names
+        efi_conf.set(ctype.underscorify().to_upper(), 'EFI_TEXT_ATTR(@0@, @1@)'.format(
+                'EFI_' + c[0].strip().underscorify().to_upper(),
+                'EFI_' + c[1].strip().underscorify().to_upper()))
+endforeach
+
+if meson.is_cross_build() and get_option('sbat-distro') == 'auto'
+        warning('Auto detection of SBAT information not supported when cross-building, disabling SBAT.')
+elif get_option('sbat-distro') != ''
+        efi_conf.set_quoted('SBAT_PROJECT', meson.project_name())
+        efi_conf.set_quoted('PROJECT_VERSION', meson.project_version())
+        efi_conf.set('PROJECT_URL', conf.get('PROJECT_URL'))
+        if get_option('sbat-distro-generation') < 1
+                error('SBAT Distro Generation must be a positive integer')
+        endif
+        efi_conf.set('SBAT_DISTRO_GENERATION', get_option('sbat-distro-generation'))
+        foreach sbatvar : [['sbat-distro', 'ID'],
+                           ['sbat-distro-summary', 'NAME'],
+                           ['sbat-distro-url', 'BUG_REPORT_URL']]
+                value = get_option(sbatvar[0])
+                if (value == '' or value == 'auto') and not meson.is_cross_build()
+                        cmd = 'if [ -e /etc/os-release ]; then . /etc/os-release; else . /usr/lib/os-release; fi; echo $@0@'.format(sbatvar[1])
+                        value = run_command(sh, '-c', cmd, check: true).stdout().strip()
+                endif
+                if value == ''
+                        error('Required @0@ option not set and autodetection failed'.format(sbatvar[0]))
+                endif
+                efi_conf.set_quoted(sbatvar[0].underscorify().to_upper(), value)  # e.g. 'sbat-distro' -> SBAT_DISTRO
+        endforeach
+
+        pkgname = get_option('sbat-distro-pkgname')
+        if pkgname == ''
+                pkgname = meson.project_name()
+        endif
+        efi_conf.set_quoted('SBAT_DISTRO_PKGNAME', pkgname)
+
+        pkgver = get_option('sbat-distro-version')
+        if pkgver == ''
+                # This is determined during build, not configuration, so we can't display it yet.
+                efi_conf.set('SBAT_DISTRO_VERSION', 'GIT_VERSION')  # unquoted: emitted as the bare token GIT_VERSION
+        else
+                efi_conf.set_quoted('SBAT_DISTRO_VERSION', pkgver)
+        endif
+endif
+
+summary({'UEFI architectures' : efi_arch + (efi_arch_alt == '' ? '' : ', ' + efi_arch_alt)},
+        section : 'UEFI')
+
+if efi_conf.get('SBAT_DISTRO', '') != ''  # only set when the SBAT branch above ran
+        summary({
+                'SBAT distro':              efi_conf.get('SBAT_DISTRO'),
+                'SBAT distro generation':   efi_conf.get('SBAT_DISTRO_GENERATION'),
+                'SBAT distro version':      efi_conf.get('SBAT_DISTRO_VERSION'),
+                'SBAT distro summary':      efi_conf.get('SBAT_DISTRO_SUMMARY'),
+                'SBAT distro URL':          efi_conf.get('SBAT_DISTRO_URL')},
+                section : 'UEFI')
+endif
+
+configure_file(
+        output : 'efi_config.h',
+        configuration : efi_conf)
+
+############################################################
+
+efi_includes = [
+        build_dir_include,
+        fundamental_include,
+        include_directories('.'),
+]
+
+efi_c_args = [
+        '-DSD_BOOT=1',
+        '-ffreestanding',
+        '-fno-strict-aliasing',
+        '-fshort-wchar',
+        '-include', 'efi_config.h',  # force-include the header generated by configure_file() above
+]
+
+efi_c_args += cc.get_supported_arguments(  # added only if the compiler accepts them
+        '-fwide-exec-charset=UCS2',
+        # gcc docs says this is required for ms_abi to work correctly.
+        '-maccumulate-outgoing-args',
+        '-mstack-protector-guard=global',
+)
+
+# Debug information has little value in release builds as no normal human being knows
+# how to attach a debugger to EFI binaries running on real hardware. Anyone who does
+# certainly has the means to do their own dev build.
+if get_option('mode') == 'developer' and get_option('debug')
+        efi_c_args += '-DEFI_DEBUG'
+endif
+
+efi_c_ld_args = [
+        '-lgcc',
+        '-nostdlib',
+        '-static-pie',
+        '-Wl,--entry=efi_main',
+        '-Wl,--fatal-warnings',
+
+        # These flags should be passed by -static-pie, but for whatever reason the flag translation
+        # is not enabled on all architectures. Not passing `-static` would just allow the linker to
+        # use dynamic libraries, (which we can't/don't use anyway). But if `-pie` is missing and the
+        # gcc build does not default to `-pie` we get a regular (no-pie) binary that will be
+        # rightfully rejected by elf2efi. Note that meson also passes `-pie` to the linker driver,
+        # but it is overridden by our `-static-pie`. We also need to pass these directly to the
+        # linker as `-static`+`-pie` seem to get translated differently.
+        '-Wl,-static,-pie,--no-dynamic-linker,-z,text',
+
+        # EFI has 4KiB pages.
+        '-z', 'common-page-size=4096',
+        '-z', 'max-page-size=4096',
+
+        '-z', 'noexecstack',
+        '-z', 'relro',
+        '-z', 'separate-code',
+]
+
+efi_c_ld_args += cc.get_supported_link_arguments(
+        # binutils >= 2.38
+        '-Wl,-z,nopack-relative-relocs',
+)
+
+# efi_c_args is explicitly passed to targets so that they can override distro-provided flags
+# that should not be used for EFI binaries.
+efi_disabled_c_args = cc.get_supported_arguments(
+        '-fcf-protection=none',
+        '-fno-asynchronous-unwind-tables',
+        '-fno-exceptions',
+        '-fno-unwind-tables',
+)
+efi_override_options = [
+        'b_coverage=false',
+        'b_pgo=off',
+]
+
+if get_option('b_sanitize') == 'undefined'  # only this sanitizer is kept for EFI builds (see ubsan.c below)
+        efi_disabled_c_args += cc.get_supported_arguments('-fno-sanitize-link-runtime')
+else
+        efi_disabled_c_args += cc.get_supported_arguments('-fno-sanitize=all')
+        efi_override_options += 'b_sanitize=none'
+endif
+
+efi_c_args += efi_disabled_c_args
+efi_c_ld_args += efi_disabled_c_args  # the same disabling flags are passed at link time as well
+
+if cc.get_id() == 'clang'
+        # clang is too picky sometimes.
+        efi_c_args += '-Wno-unused-command-line-argument'
+        efi_c_ld_args += '-Wno-unused-command-line-argument'
+endif
+
+efi_arch_c_args = {  # keyed by CPU family; looked up with .get(..., []) further down
+        'aarch64' : ['-mgeneral-regs-only'],
+        'arm'     : ['-mgeneral-regs-only'],
+        # Pass -m64/32 explicitly to make building on x32 work.
+        'x86_64'  : ['-m64', '-march=x86-64', '-mno-red-zone', '-mgeneral-regs-only'],
+        'x86'     : ['-m32', '-march=i686', '-mgeneral-regs-only', '-malign-double'],
+}
+efi_arch_c_ld_args = {
+        # libgcc is not compiled with -fshort-wchar, but it does not use it anyways,
+        # so it's fine to link against it.
+        'arm'    : cc.get_supported_link_arguments('-Wl,--no-wchar-size-warning'),
+        'x86_64' : ['-m64'],
+        'x86'    : ['-m32'],
+}
+
+linker_sanity_code = 'void a(void) {}; void _start(void) { a(); }'  # minimal program for the link checks below
+linker_sanity_args = ['-nostdlib', '-Wl,--fatal-warnings']
+if not cc.links(linker_sanity_code,
+                name : 'linker supports -static-pie',
+                args : [linker_sanity_args, '-static-pie'])
+        error('Linker does not support -static-pie.')
+endif
+
+# https://github.com/llvm/llvm-project/issues/67152
+if not cc.links(linker_sanity_code,
+                name : 'linker supports LTO with -nostdlib',
+                args : [linker_sanity_args, '-flto'])
+        efi_c_args += '-fno-lto'
+        efi_c_ld_args += '-fno-lto'
+endif
+
+# https://github.com/llvm/llvm-project/issues/61101
+if efi_cpu_family_alt == 'x86' and not cc.links(linker_sanity_code,
+                name : 'linker supports LTO with -nostdlib (x86)',
+                args : [linker_sanity_args, '-flto', '-m32'])
+        efi_arch_c_args += { 'x86' : efi_arch_c_args['x86'] + '-fno-lto' }
+        efi_arch_c_ld_args += { 'x86' : efi_arch_c_ld_args['x86'] + '-fno-lto' }
+endif
+
+############################################################
+
+libefi_sources = files(  # compiled into the per-architecture static library 'efi<arch>' below
+        'console.c',
+        'device-path-util.c',
+        'devicetree.c',
+        'drivers.c',
+        'efi-string.c',
+        'graphics.c',
+        'initrd.c',
+        'log.c',
+        'measure.c',
+        'part-discovery.c',
+        'pe.c',
+        'random-seed.c',
+        'secure-boot.c',
+        'shim.c',
+        'ticks.c',
+        'util.c',
+        'vmm.c',
+)
+
+systemd_boot_sources = files(
+        'boot.c',
+)
+
+stub_sources = files(
+        'cpio.c',
+        'linux.c',
+        'splash.c',
+        'stub.c',
+)
+
+addon_sources = files(
+        'addon.c',
+)
+
+if get_option('b_sanitize') == 'undefined'  # ubsan.c is only compiled for -Db_sanitize=undefined builds
+        libefi_sources += files('ubsan.c')
+endif
+
+if host_machine.cpu_family() in ['x86', 'x86_64']  # linux_x86.c is added to the stub on x86 families only
+        stub_sources += files('linux_x86.c')
+endif
+
+# BCD parser only makes sense on arches that Windows supports.
+if host_machine.cpu_family() in ['aarch64', 'arm', 'x86_64', 'x86']
+        systemd_boot_sources += files('bcd.c')
+endif
+
+boot_targets = []
+efi_elf_binaries = []
+efi_archspecs = [  # primary architecture first; the alternative one (if any) is appended below
+        {
+                'arch' : efi_arch,
+                'c_args' : [
+                        efi_c_args,
+                        '-DEFI_MACHINE_TYPE_NAME="' + efi_arch + '"',
+                        efi_arch_c_args.get(host_machine.cpu_family(), []),
+                ],
+                'link_args' : [
+                        efi_c_ld_args,
+                        efi_arch_c_ld_args.get(host_machine.cpu_family(), []),
+                ],
+        },
+]
+if efi_arch_alt != ''
+        efi_archspecs += {
+                'arch' : efi_arch_alt,
+                'c_args' : [
+                        efi_c_args,
+                        '-DEFI_MACHINE_TYPE_NAME="' + efi_arch_alt + '"',
+                        efi_arch_c_args.get(efi_cpu_family_alt, []),
+                ],
+                'link_args' : [
+                        efi_c_ld_args,
+                        efi_arch_c_ld_args.get(efi_cpu_family_alt, []),
+                ],
+        }
+endif
+
+foreach archspec : efi_archspecs  # one static library and three ELF executables per architecture
+        libefi = static_library(
+                'efi' + archspec['arch'],
+                fundamental_sources,
+                libefi_sources,
+                version_h,
+                include_directories : efi_includes,
+                c_args : archspec['c_args'],
+                gnu_symbol_visibility : 'hidden',
+                override_options : efi_override_options,
+                pic : true)
+
+        kwargs = {  # settings shared by the three executables below
+                'include_directories' : efi_includes,
+                'c_args' : archspec['c_args'],
+                'link_args' : archspec['link_args'],
+                'gnu_symbol_visibility' : 'hidden',
+                'override_options' : efi_override_options,
+                'pie' : true,
+        }
+
+        efi_elf_binaries += executable(
+                'systemd-boot' + archspec['arch'],
+                sources : [systemd_boot_sources, version_h],
+                link_with : libefi,
+                name_suffix : 'elf',
+                kwargs : kwargs)
+
+        efi_elf_binaries += executable(
+                'linux' + archspec['arch'],
+                sources : [stub_sources, version_h],
+                link_with : libefi,
+                name_suffix : 'elf.stub',
+                kwargs : kwargs)
+
+        efi_elf_binaries += executable(  # unlike the two targets above, the addon is not linked with libefi
+                'addon' + archspec['arch'],
+                sources : [addon_sources, version_h],
+                name_suffix : 'elf.stub',
+                kwargs : kwargs)
+endforeach
+
+foreach efi_elf_binary : efi_elf_binaries  # convert each ELF binary into its installable .efi / .efi.stub file
+        name = efi_elf_binary.name()
+        name += name.startswith('systemd-boot') ? '.efi' : '.efi.stub'
+        # For the addon, given it's empty, we need to explicitly reserve space in the header to account for
+        # the sections that ukify will add. (The check below matches every *.efi.stub output, linux stub included.)
+        minimum_sections = name.endswith('.stub') ? '15' : '0'
+        exe = custom_target(
+                name,
+                output : name,
+                input : efi_elf_binary,
+                install : true,
+                install_dir : bootlibdir,
+                install_tag : 'systemd-boot',
+                command : [
+                        elf2efi_py,
+                        '--version-major=' + meson.project_version(),
+                        '--version-minor=0',
+                        '--efi-major=1',
+                        '--efi-minor=1',
+                        '--subsystem=10',
+                        '--minimum-sections=' + minimum_sections,
+                        '--copy-sections=.sbat,.sdmagic,.osrel',
+                        '@INPUT@',
+                        '@OUTPUT@',
+                ])
+        boot_targets += exe
+        if name.startswith('linux')  # the linux<arch>.efi.stub outputs are also collected in boot_stubs
+                boot_stubs += exe
+        endif
+
+        # This is supposed to match exactly one time
+        if name == 'addon@0@.efi.stub'.format(efi_arch)
+                efi_addon = exe.full_path()
+        endif
+endforeach
+
+alias_target('systemd-boot', boot_targets)
diff --git a/src/boot/efi/part-discovery.c b/src/boot/efi/part-discovery.c
new file mode 100644
index 0000000..f5b1573
--- /dev/null
+++ b/src/boot/efi/part-discovery.c
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "device-path-util.h"
+#include "part-discovery.h"
+#include "proto/block-io.h"
+#include "proto/device-path.h"
+#include "util.h"
+
+typedef struct { /* One element of the GPT partition entry array that try_gpt() walks. */
+        EFI_GUID PartitionTypeGUID;   /* compared against the partition type being searched for */
+        EFI_GUID UniquePartitionGUID; /* copied into the signature of the resulting device path node */
+        EFI_LBA StartingLBA;
+        EFI_LBA EndingLBA;            /* inclusive; entries with EndingLBA < StartingLBA are ignored */
+        uint64_t Attributes;
+        char16_t PartitionName[36];
+} EFI_PARTITION_ENTRY;
+
+typedef struct { /* GPT header, padded to the 512 bytes that try_gpt() reads in one go. */
+        EFI_TABLE_HEADER Header; /* signature, revision, size and CRC32 are checked by verify_gpt() */
+        EFI_LBA MyLBA;           /* must match the LBA the header was read from */
+        EFI_LBA AlternateLBA;    /* handed back by try_gpt() as the location of the backup header */
+        EFI_LBA FirstUsableLBA;
+        EFI_LBA LastUsableLBA;
+        EFI_GUID DiskGUID;
+        EFI_LBA PartitionEntryLBA;         /* LBA at which the partition entry array is read */
+        uint32_t NumberOfPartitionEntries; /* verify_gpt() accepts 1..1024 */
+        uint32_t SizeOfPartitionEntry;     /* must be a multiple of sizeof(EFI_PARTITION_ENTRY) */
+        uint32_t PartitionEntryArrayCRC32; /* compared with the CRC32 computed over the entry array */
+        uint8_t _pad[420];                 /* fills the struct up to the size asserted below */
+} _packed_ GptHeader;
+assert_cc(sizeof(GptHeader) == 512);
+
+/* Superficially validates a GPT header read from LBA lba_expected: signature, header size, revision,
+ * header CRC32, self-reference and the geometry of the partition entry array. The CRC32 field is
+ * zeroed while the checksum is calculated and restored afterwards, hence h cannot be const. */
+static bool verify_gpt(/*const*/ GptHeader *h, EFI_LBA lba_expected) {
+        uint32_t crc32, crc32_saved;
+        EFI_STATUS err;
+
+        assert(h);
+
+        if (memcmp(&h->Header.Signature, "EFI PART", sizeof(h->Header.Signature)) != 0)
+                return false;
+
+        if (h->Header.HeaderSize < 92 || h->Header.HeaderSize > 512)
+                return false;
+
+        if (h->Header.Revision != 0x00010000U)
+                return false;
+
+        /* Calculate CRC check: the checksum covers the header with its own CRC32 field set to zero. */
+        crc32_saved = h->Header.CRC32;
+        h->Header.CRC32 = 0;
+        err = BS->CalculateCrc32(h, h->Header.HeaderSize, &crc32);
+        h->Header.CRC32 = crc32_saved;
+        if (err != EFI_SUCCESS || crc32 != crc32_saved)
+                return false;
+
+        if (h->MyLBA != lba_expected)
+                return false;
+
+        /* A zero entry size passes the modulo test, yet would place every entry outside an empty array. */
+        if (h->SizeOfPartitionEntry == 0 || (h->SizeOfPartitionEntry % sizeof(EFI_PARTITION_ENTRY)) != 0)
+                return false;
+
+        if (h->NumberOfPartitionEntries == 0 || h->NumberOfPartitionEntries > 1024)
+                return false;
+
+        /* overflow check */
+        return h->SizeOfPartitionEntry <= SIZE_MAX / h->NumberOfPartitionEntries;
+}
+
+/* Reads the GPT header at lba, validates it along with its partition entry array and, if a partition of
+ * the given type exists, describes the first such partition in *ret_hd. Returns EFI_NOT_FOUND only if
+ * this GPT is fully valid but holds no matching partition; any other error means that this copy of the
+ * GPT could not be read or verified, i.e. that another copy is worth a try. */
+static EFI_STATUS try_gpt(
+                const EFI_GUID *type,
+                EFI_BLOCK_IO_PROTOCOL *block_io,
+                EFI_LBA lba,
+                EFI_LBA *ret_backup_lba, /* May be changed even on error! */
+                HARDDRIVE_DEVICE_PATH *ret_hd) {
+
+        _cleanup_free_ EFI_PARTITION_ENTRY *entries = NULL;
+        GptHeader gpt;
+        EFI_STATUS err;
+        uint32_t crc32;
+        size_t entries_size, size;
+
+        assert(block_io);
+        assert(ret_hd);
+
+        /* Read the GPT header */
+        err = block_io->ReadBlocks(block_io, block_io->Media->MediaId, lba, sizeof(gpt), &gpt);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Indicate the location of backup LBA even if the rest of the header is corrupt. */
+        if (ret_backup_lba)
+                *ret_backup_lba = gpt.AlternateLBA;
+
+        /* A damaged header must not be reported as EFI_NOT_FOUND: the caller takes that to mean "valid
+         * GPT without a matching partition" and would never look at the remaining copies. */
+        if (!verify_gpt(&gpt, lba))
+                return EFI_CRC_ERROR;
+
+        /* Now load the GPT entry table. Whole 512 byte units are read, but only the bytes that really
+         * belong to the entry array are covered by its checksum. */
+        entries_size = (size_t) gpt.SizeOfPartitionEntry * (size_t) gpt.NumberOfPartitionEntries;
+        size = ALIGN_TO(entries_size, 512);
+        entries = xmalloc(size);
+
+        err = block_io->ReadBlocks(block_io, block_io->Media->MediaId, gpt.PartitionEntryLBA, size, entries);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Calculate CRC of entries array, too */
+        err = BS->CalculateCrc32(entries, entries_size, &crc32);
+        if (err != EFI_SUCCESS || crc32 != gpt.PartitionEntryArrayCRC32)
+                return EFI_CRC_ERROR;
+
+        /* Now we can finally look for xbootloader partitions. */
+        for (size_t i = 0; i < gpt.NumberOfPartitionEntries; i++) {
+                EFI_PARTITION_ENTRY *entry =
+                                (EFI_PARTITION_ENTRY *) ((uint8_t *) entries + gpt.SizeOfPartitionEntry * i);
+
+                if (!efi_guid_equal(&entry->PartitionTypeGUID, type))
+                        continue;
+
+                if (entry->EndingLBA < entry->StartingLBA) /* Bogus? */
+                        continue;
+
+                *ret_hd = (HARDDRIVE_DEVICE_PATH) {
+                        .Header = {
+                                .Type = MEDIA_DEVICE_PATH,
+                                .SubType = MEDIA_HARDDRIVE_DP,
+                                .Length = sizeof(HARDDRIVE_DEVICE_PATH),
+                        },
+                        .PartitionNumber = i + 1,
+                        .PartitionStart = entry->StartingLBA,
+                        .PartitionSize = entry->EndingLBA - entry->StartingLBA + 1,
+                        .MBRType = MBR_TYPE_EFI_PARTITION_TABLE_HEADER,
+                        .SignatureType = SIGNATURE_TYPE_GUID,
+                };
+                memcpy(ret_hd->Signature, &entry->UniquePartitionGUID, sizeof(ret_hd->Signature));
+
+                return EFI_SUCCESS;
+        }
+
+        /* This GPT was fully valid, but we didn't find what we are looking for. This
+         * means there's no reason to check the second copy of the GPT header */
+        return EFI_NOT_FOUND;
+}
+
+/* Determines the disk that the partition behind device sits on, and searches the GPT of that disk for
+ * the first partition of the given type. On success a newly allocated device path, which is the path
+ * of device with its partition node replaced by one describing the partition found, is returned in
+ * *ret_device_path. EFI_NOT_FOUND is returned if there is no such partition. */
+static EFI_STATUS find_device(const EFI_GUID *type, EFI_HANDLE *device, EFI_DEVICE_PATH **ret_device_path) {
+        EFI_DEVICE_PATH *partition_path, *part_node = NULL;
+        EFI_BLOCK_IO_PROTOCOL *block_io;
+        EFI_HANDLE disk_handle;
+        EFI_STATUS err;
+
+        assert(device);
+        assert(ret_device_path);
+
+        /* Device path of the handle we were given. */
+        err = BS->HandleProtocol(device, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL), (void **) &partition_path);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Find the (last) partition node itself. */
+        for (EFI_DEVICE_PATH *n = partition_path; !device_path_is_end(n); n = device_path_next_node(n))
+                if (n->Type == MEDIA_DEVICE_PATH && n->SubType == MEDIA_HARDDRIVE_DP)
+                        part_node = n;
+
+        if (!part_node)
+                return EFI_NOT_FOUND;
+
+        /* Chop off the partition part, leaving us with the full path to the disk itself. */
+        _cleanup_free_ EFI_DEVICE_PATH *disk_path = device_path_replace_node(partition_path, part_node, NULL);
+        EFI_DEVICE_PATH *p = disk_path;
+
+        err = BS->LocateDevicePath(MAKE_GUID_PTR(EFI_BLOCK_IO_PROTOCOL), &p, &disk_handle);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* The drivers for other partitions on this drive may not be initialized on fastboot firmware, so we
+         * have to ask the firmware to do just that. */
+        (void) BS->ConnectController(disk_handle, NULL, NULL, true);
+
+        err = BS->HandleProtocol(disk_handle, MAKE_GUID_PTR(EFI_BLOCK_IO_PROTOCOL), (void **) &block_io);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Filter out some block devices early. (We only care about block devices that aren't
+         * partitions themselves, since we look for GPT partition tables to parse after all, and only
+         * about those which contain a medium and have at least 2 blocks.) */
+        bool is_partition = block_io->Media->LogicalPartition;
+        if (is_partition || !block_io->Media->MediaPresent || block_io->Media->LastBlock <= 1)
+                return EFI_NOT_FOUND;
+
+        /* Try several copies of the GPT header, in case one is corrupted: first the one at LBA 1, then
+         * the backup the first header points to, and as a last resort the very last LBA of the device. */
+        EFI_LBA backup_lba = 0;
+        for (size_t attempt = 0; attempt < 3; attempt++) {
+                HARDDRIVE_DEVICE_PATH hd;
+                EFI_LBA lba;
+
+                switch (attempt) {
+                case 0:
+                        lba = 1;
+                        break;
+                case 1:
+                        if (backup_lba == 0)
+                                continue; /* no backup location is known */
+                        lba = backup_lba;
+                        break;
+                default:
+                        if (backup_lba == block_io->Media->LastBlock)
+                                continue; /* the last LBA is the backup location, which had its turn already */
+                        lba = block_io->Media->LastBlock;
+                        break;
+                }
+
+                /* Only get backup LBA location from first GPT header. */
+                err = try_gpt(type, block_io, lba, attempt == 0 ? &backup_lba : NULL, &hd);
+                if (err == EFI_NOT_FOUND)
+                        break; /* GPT was valid but no XBOOT loader partition found. */
+                if (err != EFI_SUCCESS)
+                        continue; /* Bad GPT, try next one. */
+
+                /* Patch in the data we found */
+                *ret_device_path = device_path_replace_node(partition_path, part_node, (EFI_DEVICE_PATH *) &hd);
+                return EFI_SUCCESS;
+        }
+
+        /* No xbootloader partition found */
+        return EFI_NOT_FOUND;
+}
+
+/* Looks up the partition of the given type via find_device() and opens it with open_volume(). On success
+ * *ret_root_dir and, if ret_device is non-NULL, *ret_device are set; on failure both are left untouched. */
+EFI_STATUS partition_open(const EFI_GUID *type, EFI_HANDLE *device, EFI_HANDLE *ret_device,
+                          EFI_FILE **ret_root_dir) {
+        _cleanup_free_ EFI_DEVICE_PATH *found_path = NULL;
+        EFI_HANDLE part_handle;
+        EFI_FILE *volume_root;
+        EFI_STATUS err;
+
+        assert(type);
+        assert(device);
+        assert(ret_root_dir);
+
+        err = find_device(type, device, &found_path);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        EFI_DEVICE_PATH *cursor = found_path; /* scratch pointer for LocateDevicePath(); found_path stays intact */
+        err = BS->LocateDevicePath(MAKE_GUID_PTR(EFI_BLOCK_IO_PROTOCOL), &cursor, &part_handle);
+        if (err == EFI_SUCCESS)
+                err = open_volume(part_handle, &volume_root);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        *ret_root_dir = volume_root;
+        if (ret_device)
+                *ret_device = part_handle;
+        return EFI_SUCCESS;
+}
+
+/* Walks the device path of handle and, for the first hard drive node that carries a GUID signature,
+ * returns that GUID as a string built by xasprintf(). Returns NULL if handle is NULL or no such node exists. */
+char16_t *disk_get_part_uuid(EFI_HANDLE *handle) {
+        EFI_DEVICE_PATH *node;
+
+        /* export the device path this image is started from */
+
+        if (!handle)
+                return NULL;
+
+        if (BS->HandleProtocol(handle, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL), (void **) &node) != EFI_SUCCESS)
+                return NULL;
+
+        while (!device_path_is_end(node)) {
+                const HARDDRIVE_DEVICE_PATH *hd = (const HARDDRIVE_DEVICE_PATH *) node;
+
+                /* Only hard drive nodes carrying a GUID signature qualify. */
+                if (node->Type == MEDIA_DEVICE_PATH && node->SubType == MEDIA_HARDDRIVE_DP &&
+                    hd->SignatureType == SIGNATURE_TYPE_GUID)
+                        return xasprintf(GUID_FORMAT_STR, GUID_FORMAT_VAL(hd->SignatureGuid));
+
+                node = device_path_next_node(node);
+        }
+
+        return NULL;
+}
diff --git a/src/boot/efi/part-discovery.h b/src/boot/efi/part-discovery.h
new file mode 100644
index 0000000..bbc87ff
--- /dev/null
+++ b/src/boot/efi/part-discovery.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define XBOOTLDR_GUID \
+        { 0xbc13c2ff, 0x59e6, 0x4262, { 0xa3, 0x52, 0xb2, 0x75, 0xfd, 0x6f, 0x71, 0x72 } } /* bc13c2ff-59e6-4262-a352-b275fd6f7172 */
+#define ESP_GUID \
+        { 0xc12a7328, 0xf81f, 0x11d2, { 0xba, 0x4b, 0x00, 0xa0, 0xc9, 0x3e, 0xc9, 0x3b } } /* c12a7328-f81f-11d2-ba4b-00a0c93ec93b */
+/* Both functions are implemented in part-discovery.c; ret_device of partition_open() may be NULL. */
+EFI_STATUS partition_open(const EFI_GUID *type, EFI_HANDLE *device, EFI_HANDLE *ret_device, EFI_FILE **ret_root_dir);
+char16_t *disk_get_part_uuid(EFI_HANDLE *handle);
diff --git a/src/boot/efi/pe.c b/src/boot/efi/pe.c
new file mode 100644
index 0000000..829266b
--- /dev/null
+++ b/src/boot/efi/pe.c
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "pe.h"
+#include "util.h"
+
+#define DOS_FILE_MAGIC "MZ"
+#define PE_FILE_MAGIC  "PE\0\0"
+#define MAX_SECTIONS 96 /* verify_pe() rejects images that declare more sections than this */
+
+#if defined(__i386__)
+#  define TARGET_MACHINE_TYPE 0x014CU
+#  define TARGET_MACHINE_TYPE_COMPATIBILITY 0x8664U /* the x86-64 value; honoured by pe_kernel_info() only */
+#elif defined(__x86_64__)
+#  define TARGET_MACHINE_TYPE 0x8664U
+#elif defined(__aarch64__)
+#  define TARGET_MACHINE_TYPE 0xAA64U
+#elif defined(__arm__)
+#  define TARGET_MACHINE_TYPE 0x01C2U
+#elif defined(__riscv) && __riscv_xlen == 32
+#  define TARGET_MACHINE_TYPE 0x5032U
+#elif defined(__riscv) && __riscv_xlen == 64
+#  define TARGET_MACHINE_TYPE 0x5064U
+#elif defined(__loongarch__) && __loongarch_grlen == 32
+#  define TARGET_MACHINE_TYPE 0x6232U
+#elif defined(__loongarch__) && __loongarch_grlen == 64
+#  define TARGET_MACHINE_TYPE 0x6264U
+#else
+#  error Unknown EFI arch
+#endif
+
+#ifndef TARGET_MACHINE_TYPE_COMPATIBILITY
+#  define TARGET_MACHINE_TYPE_COMPATIBILITY 0 /* fallback for architectures that define no compatibility type */
+#endif
+
+typedef struct DosFileHeader { /* header expected at the very start of an image, see verify_dos() */
+        uint8_t  Magic[2]; /* compared against DOS_FILE_MAGIC */
+        uint16_t LastSize;
+        uint16_t nBlocks;
+        uint16_t nReloc;
+        uint16_t HdrSize;
+        uint16_t MinAlloc;
+        uint16_t MaxAlloc;
+        uint16_t ss;
+        uint16_t sp;
+        uint16_t Checksum;
+        uint16_t ip;
+        uint16_t cs;
+        uint16_t RelocPos;
+        uint16_t nOverlay;
+        uint16_t reserved[4];
+        uint16_t OEMId;
+        uint16_t OEMInfo;
+        uint16_t reserved2[10];
+        uint32_t ExeHeader; /* offset of the PeFileHeader, counted from the start of the image */
+} _packed_ DosFileHeader;
+
+typedef struct CoffFileHeader {
+        uint16_t Machine;              /* compared against TARGET_MACHINE_TYPE(_COMPATIBILITY) */
+        uint16_t NumberOfSections;     /* verify_pe() accepts 1..MAX_SECTIONS */
+        uint32_t TimeDateStamp;
+        uint32_t PointerToSymbolTable;
+        uint32_t NumberOfSymbols;
+        uint16_t SizeOfOptionalHeader; /* needed to find the section table, see section_table_offset() */
+        uint16_t Characteristics;
+} _packed_ CoffFileHeader;
+
+#define OPTHDR32_MAGIC 0x10B /* PE32  OptionalHeader */
+#define OPTHDR64_MAGIC 0x20B /* PE32+ OptionalHeader */
+
+typedef struct PeOptionalHeader {
+        uint16_t Magic; /* OPTHDR32_MAGIC or OPTHDR64_MAGIC, anything else is rejected by verify_pe() */
+        uint8_t  LinkerMajor;
+        uint8_t  LinkerMinor;
+        uint32_t SizeOfCode;
+        uint32_t SizeOfInitializedData;
+        uint32_t SizeOfUninitializeData;
+        uint32_t AddressOfEntryPoint;
+        uint32_t BaseOfCode;
+        union {
+                struct { /* PE32 */
+                        uint32_t BaseOfData;
+                        uint32_t ImageBase32;
+                };
+                uint64_t ImageBase64; /* PE32+ */
+        };
+        uint32_t SectionAlignment;
+        uint32_t FileAlignment;
+        uint16_t MajorOperatingSystemVersion;
+        uint16_t MinorOperatingSystemVersion;
+        uint16_t MajorImageVersion; /* pe_kernel_info() requires at least 1 */
+        uint16_t MinorImageVersion;
+        uint16_t MajorSubsystemVersion;
+        uint16_t MinorSubsystemVersion;
+        uint32_t Win32VersionValue;
+        uint32_t SizeOfImage;
+        uint32_t SizeOfHeaders;
+        uint32_t CheckSum;
+        uint16_t Subsystem;
+        uint16_t DllCharacteristics;
+        /* fields with different sizes for 32/64 omitted */
+} _packed_ PeOptionalHeader;
+
+typedef struct PeFileHeader { /* located at DosFileHeader.ExeHeader within the image */
+        uint8_t   Magic[4]; /* compared against PE_FILE_MAGIC by verify_pe() */
+        CoffFileHeader FileHeader;
+        PeOptionalHeader OptionalHeader; /* only its leading, size-independent fields are declared */
+} _packed_ PeFileHeader;
+
+typedef struct PeSectionHeader { /* one element of the section table searched by locate_sections() */
+        uint8_t  Name[8];
+        uint32_t VirtualSize;      /* reported as the size of the section */
+        uint32_t VirtualAddress;   /* reported as the section location if in_memory is true */
+        uint32_t SizeOfRawData;
+        uint32_t PointerToRawData; /* reported as the section location if in_memory is false */
+        uint32_t PointerToRelocations;
+        uint32_t PointerToLinenumbers;
+        uint16_t NumberOfRelocations;
+        uint16_t NumberOfLinenumbers;
+        uint32_t Characteristics;
+} _packed_ PeSectionHeader;
+
+static bool verify_dos(const DosFileHeader *dos) {
+        assert(dos); /* A DOS header is recognised solely by its leading DOS_FILE_MAGIC bytes. */
+        return !memcmp(DOS_FILE_MAGIC, dos->Magic, STRLEN(DOS_FILE_MAGIC));
+}
+
+static bool verify_pe(const PeFileHeader *pe, bool allow_compatibility) {
+        assert(pe); /* Accept only PE images for this (or, if allowed, the compatibility) machine type. */
+        uint16_t machine = pe->FileHeader.Machine, n_sections = pe->FileHeader.NumberOfSections;
+        if (memcmp(pe->Magic, PE_FILE_MAGIC, STRLEN(PE_FILE_MAGIC)) != 0)
+                return false;
+        if (machine != TARGET_MACHINE_TYPE && !(allow_compatibility && machine == TARGET_MACHINE_TYPE_COMPATIBILITY))
+                return false;
+        return n_sections > 0 && n_sections <= MAX_SECTIONS && IN_SET(pe->OptionalHeader.Magic, OPTHDR32_MAGIC, OPTHDR64_MAGIC);
+}
+
+static size_t section_table_offset(const DosFileHeader *dos, const PeFileHeader *pe) {
+        assert(dos); /* Offset of the section table: it starts right behind the optional header. */
+        assert(pe);
+        return offsetof(PeFileHeader, OptionalHeader) + pe->FileHeader.SizeOfOptionalHeader + dos->ExeHeader;
+}
+
+/* Stores location and size of every section named in the NULL-terminated sections[] list at the
+ * matching index of offsets[]/sizes[]; entries of absent sections are left untouched. Names are
+ * compared over the at most eight bytes a section header holds, so a mere prefix is not a match. */
+static void locate_sections(
+                const PeSectionHeader section_table[],
+                size_t n_table,
+                const char * const sections[],
+                size_t *offsets,
+                size_t *sizes,
+                bool in_memory) {
+        assert(section_table);
+        assert(sections);
+        assert(offsets);
+        assert(sizes);
+        for (size_t i = 0; i < n_table; i++)
+                for (size_t j = 0; sections[j]; j++) {
+                        const PeSectionHeader *sect = section_table + i;
+                        size_t n = strlen8(sections[j]);
+                        n = n > sizeof(sect->Name) ? sizeof(sect->Name) : n; /* never read past Name[] */
+                        if (memcmp(sect->Name, sections[j], n) != 0 || (n < sizeof(sect->Name) && sect->Name[n] != '\0'))
+                                continue; /* different name, or the wanted name is only a prefix of it */
+                        offsets[j] = in_memory ? sect->VirtualAddress : sect->PointerToRawData;
+                        sizes[j] = sect->VirtualSize;
+                }
+}
+
+/* Returns the entry point that the ".compat" section of the image lists for TARGET_MACHINE_TYPE, or 0
+ * if there is no such section or it holds no suitable record. */
+static uint32_t get_compatibility_entry_address(const DosFileHeader *dos, const PeFileHeader *pe) {
+        static const char *sections[] = { ".compat", NULL };
+        size_t pos = 0, remaining = 0;
+
+        /* The kernel may provide alternative PE entry points for different PE architectures. This allows
+         * booting a 64-bit kernel on 32-bit EFI that is otherwise running on a 64-bit CPU. The locations of any
+         * such compat entry points are located in a special PE section. */
+        const PeSectionHeader *table =
+                        (const PeSectionHeader *) ((const uint8_t *) dos + section_table_offset(dos, pe));
+        locate_sections(table, pe->FileHeader.NumberOfSections, sections, &pos, &remaining, /*in_memory=*/true);
+        if (remaining == 0)
+                return 0;
+
+        typedef struct {
+                uint8_t type;
+                uint8_t size;
+                uint16_t machine_type;
+                uint32_t entry_point;
+        } _packed_ LinuxPeCompat1;
+
+        /* Walk the records of the section; each one states its own length. */
+        for (;;) {
+                if (remaining < sizeof(LinuxPeCompat1) || pos % alignof(LinuxPeCompat1) != 0)
+                        break;
+
+                LinuxPeCompat1 *rec = (LinuxPeCompat1 *) ((uint8_t *) dos + pos);
+                /* Stop at a record with zero type or length, or at one that overruns the section. */
+                if (rec->type == 0 || rec->size == 0 || rec->size > remaining)
+                        break;
+
+                bool usable = rec->type == 1 && rec->size >= sizeof(LinuxPeCompat1);
+                if (usable && rec->machine_type == TARGET_MACHINE_TYPE)
+                        return rec->entry_point;
+
+                pos += rec->size;
+                remaining -= rec->size;
+        }
+
+        return 0;
+}
+
+/* Validates the PE image at base. On success *ret_compat_address is 0 for an image of the native machine
+ * type, and otherwise the entry point found by get_compatibility_entry_address(). */
+EFI_STATUS pe_kernel_info(const void *base, uint32_t *ret_compat_address) {
+        assert(base);
+        assert(ret_compat_address);
+
+        const DosFileHeader *dos = base;
+        if (!verify_dos(dos))
+                return EFI_LOAD_ERROR;
+
+        const PeFileHeader *pe = (const PeFileHeader *) ((const uint8_t *) dos + dos->ExeHeader);
+        if (!verify_pe(pe, /* allow_compatibility= */ true))
+                return EFI_LOAD_ERROR;
+
+        /* Support for LINUX_INITRD_MEDIA_GUID was added in kernel stub 1.0. */
+        if (pe->OptionalHeader.MajorImageVersion < 1)
+                return EFI_UNSUPPORTED;
+
+        /* A native image needs no compat entry point (reported as 0); anything else must provide one. */
+        uint32_t entry = 0;
+        if (pe->FileHeader.Machine != TARGET_MACHINE_TYPE) {
+                entry = get_compatibility_entry_address(dos, pe);
+                if (entry == 0)
+                        return EFI_UNSUPPORTED; /* Image type not supported and no compat entry found. */
+        }
+
+        *ret_compat_address = entry;
+        return EFI_SUCCESS;
+}
+
+/* Looks up the given NULL-terminated list of section names in the PE image loaded in memory at base.
+ * For every section found, its VirtualAddress and VirtualSize are stored at the matching index of
+ * addrs[] and sizes[]. Returns EFI_LOAD_ERROR if base does not hold a PE image for the native machine
+ * type, EFI_SUCCESS otherwise (even if none of the sections exists). */
+EFI_STATUS pe_memory_locate_sections(const void *base, const char * const sections[], size_t *addrs, size_t *sizes) {
+        assert(base);
+        assert(sections);
+        assert(addrs);
+        assert(sizes);
+
+        const uint8_t *image = base;
+
+        const DosFileHeader *dos = (const DosFileHeader *) image;
+        if (!verify_dos(dos))
+                return EFI_LOAD_ERROR;
+
+        const PeFileHeader *pe = (const PeFileHeader *) (image + dos->ExeHeader);
+        if (!verify_pe(pe, /* allow_compatibility= */ false))
+                return EFI_LOAD_ERROR;
+
+        locate_sections(
+                        (const PeSectionHeader *) (image + section_table_offset(dos, pe)),
+                        pe->FileHeader.NumberOfSections,
+                        sections, addrs, sizes,
+                        /*in_memory=*/true);
+
+        return EFI_SUCCESS;
+}
+
+/* Reads the DOS and PE headers plus the section table of the file at path (opened via dir) and stores the
+ * PointerToRawData and VirtualSize of each requested section at the matching index of offsets[]/sizes[]. */
+EFI_STATUS pe_file_locate_sections(
+                EFI_FILE *dir,
+                const char16_t *path,
+                const char * const sections[],
+                size_t *offsets,
+                size_t *sizes) {
+        _cleanup_free_ PeSectionHeader *section_table = NULL;
+        _cleanup_(file_closep) EFI_FILE *handle = NULL;
+        DosFileHeader dos;
+        PeFileHeader pe;
+        size_t len, section_table_len;
+        EFI_STATUS err;
+
+        assert(dir);
+        assert(path);
+        assert(sections);
+        assert(offsets);
+        assert(sizes);
+
+        err = dir->Open(dir, &handle, (char16_t *) path, EFI_FILE_MODE_READ, 0ULL);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        len = sizeof(dos);
+        err = handle->Read(handle, &len, &dos);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (len != sizeof(dos) || !verify_dos(&dos))
+                return EFI_LOAD_ERROR;
+
+        err = handle->SetPosition(handle, dos.ExeHeader);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        len = sizeof(pe);
+        err = handle->Read(handle, &len, &pe);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (len != sizeof(pe) || !verify_pe(&pe, /* allow_compatibility= */ false))
+                return EFI_LOAD_ERROR;
+
+        /* verify_pe() bounds NumberOfSections; xmalloc() aborts rather than return NULL, so no check here. */
+        section_table_len = pe.FileHeader.NumberOfSections * sizeof(PeSectionHeader);
+        section_table = xmalloc(section_table_len);
+
+        err = handle->SetPosition(handle, section_table_offset(&dos, &pe));
+        if (err != EFI_SUCCESS)
+                return err;
+
+        len = section_table_len;
+        err = handle->Read(handle, &len, section_table);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (len != section_table_len)
+                return EFI_LOAD_ERROR;
+
+        locate_sections(section_table, pe.FileHeader.NumberOfSections, sections, offsets, sizes, /*in_memory=*/false);
+
+        return EFI_SUCCESS;
+}
diff --git a/src/boot/efi/pe.h b/src/boot/efi/pe.h
new file mode 100644
index 0000000..7e2258f
--- /dev/null
+++ b/src/boot/efi/pe.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+EFI_STATUS pe_memory_locate_sections( /* section lookup in a PE image that is loaded in memory */
+                const void *base,
+                const char * const sections[], /* NULL-terminated list of section names */
+                size_t *addrs,                 /* per section name found: its VirtualAddress */
+                size_t *sizes);                /* per section name found: its VirtualSize */
+
+EFI_STATUS pe_file_locate_sections( /* section lookup in the PE file at path, opened via dir */
+                EFI_FILE *dir,
+                const char16_t *path,
+                const char * const sections[], /* NULL-terminated list of section names */
+                size_t *offsets,               /* per section name found: its PointerToRawData */
+                size_t *sizes);                /* per section name found: its VirtualSize */
+
+EFI_STATUS pe_kernel_info(const void *base, uint32_t *ret_compat_address); /* 0 is stored for native images */
diff --git a/src/boot/efi/proto/block-io.h b/src/boot/efi/proto/block-io.h
new file mode 100644
index 0000000..e977f70
--- /dev/null
+++ b/src/boot/efi/proto/block-io.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_BLOCK_IO_PROTOCOL_GUID \
+        GUID_DEF(0x0964e5b21, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+
+typedef struct EFI_BLOCK_IO_PROTOCOL EFI_BLOCK_IO_PROTOCOL;
+struct EFI_BLOCK_IO_PROTOCOL {
+        uint64_t Revision;
+        struct {
+                uint32_t MediaId; /* handed back to ReadBlocks() by the GPT parser in part-discovery.c */
+                bool RemovableMedia;
+                bool MediaPresent; /* part-discovery.c skips devices where this is false */
+                bool LogicalPartition; /* part-discovery.c skips devices where this is true */
+                bool ReadOnly;
+                bool WriteCaching;
+                uint32_t BlockSize;
+                uint32_t IoAlign;
+                EFI_LBA LastBlock; /* part-discovery.c requires > 1 and probes this LBA for a backup GPT header */
+                EFI_LBA LowestAlignedLba;
+                uint32_t LogicalBlocksPerPhysicalBlock;
+                uint32_t OptimalTransferLengthGranularity;
+        } *Media;
+
+        EFI_STATUS (EFIAPI *Reset)(
+                        EFI_BLOCK_IO_PROTOCOL *This,
+                        bool ExtendedVerification);
+        EFI_STATUS (EFIAPI *ReadBlocks)(
+                        EFI_BLOCK_IO_PROTOCOL *This,
+                        uint32_t MediaId,
+                        EFI_LBA LBA,
+                        size_t BufferSize, /* size of Buffer in bytes */
+                        void *Buffer);
+        EFI_STATUS (EFIAPI *WriteBlocks)(
+                        EFI_BLOCK_IO_PROTOCOL *This,
+                        uint32_t MediaId,
+                        EFI_LBA LBA,
+                        size_t BufferSize,
+                        void *Buffer);
+        EFI_STATUS (EFIAPI *FlushBlocks)(EFI_BLOCK_IO_PROTOCOL *This);
+};
diff --git a/src/boot/efi/proto/console-control.h b/src/boot/efi/proto/console-control.h
new file mode 100644
index 0000000..d3a92ea
--- /dev/null
+++ b/src/boot/efi/proto/console-control.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_CONSOLE_CONTROL_PROTOCOL_GUID \
+        GUID_DEF(0xf42f7782, 0x12e, 0x4c12, 0x99, 0x56, 0x49, 0xf9, 0x43, 0x4, 0xf7, 0x21)
+
+typedef enum {
+        EfiConsoleControlScreenText,
+        EfiConsoleControlScreenGraphics,
+        EfiConsoleControlScreenMaxValue,
+} EFI_CONSOLE_CONTROL_SCREEN_MODE;
+
+typedef struct EFI_CONSOLE_CONTROL_PROTOCOL EFI_CONSOLE_CONTROL_PROTOCOL;
+struct EFI_CONSOLE_CONTROL_PROTOCOL {
+        EFI_STATUS (EFIAPI *GetMode)(
+                        EFI_CONSOLE_CONTROL_PROTOCOL *This,
+                        EFI_CONSOLE_CONTROL_SCREEN_MODE *Mode,
+                        bool *UgaExists,
+                        bool *StdInLocked);
+        EFI_STATUS (EFIAPI *SetMode)(
+                        EFI_CONSOLE_CONTROL_PROTOCOL *This,
+                        EFI_CONSOLE_CONTROL_SCREEN_MODE Mode);
+        EFI_STATUS(EFIAPI *LockStdIn)(
+                        EFI_CONSOLE_CONTROL_PROTOCOL *This,
+                        char16_t *Password);
+};
diff --git a/src/boot/efi/proto/device-path.h b/src/boot/efi/proto/device-path.h
new file mode 100644
index 0000000..0fabae1
--- /dev/null
+++ b/src/boot/efi/proto/device-path.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_DEVICE_PATH_PROTOCOL_GUID \
+        GUID_DEF(0x09576e91, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+#define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID \
+        GUID_DEF(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c)
+#define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID \
+        GUID_DEF(0x05c99a21, 0xc70f, 0x4ad2, 0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e)
+
+/* Device path types. */
+enum {
+        HARDWARE_DEVICE_PATH  = 0x01,
+        ACPI_DEVICE_PATH      = 0x02,
+        MESSAGING_DEVICE_PATH = 0x03,
+        MEDIA_DEVICE_PATH     = 0x04,
+        BBS_DEVICE_PATH       = 0x05,
+        END_DEVICE_PATH_TYPE  = 0x7f,
+};
+
+/* Device path sub-types. */
+enum {
+        END_INSTANCE_DEVICE_PATH_SUBTYPE = 0x01,
+        END_ENTIRE_DEVICE_PATH_SUBTYPE   = 0xff,
+
+        MEDIA_HARDDRIVE_DP               = 0x01,
+        MEDIA_VENDOR_DP                  = 0x03,
+        MEDIA_FILEPATH_DP                = 0x04,
+        MEDIA_PIWG_FW_FILE_DP            = 0x06,
+        MEDIA_PIWG_FW_VOL_DP             = 0x07,
+};
+
+struct _packed_ EFI_DEVICE_PATH_PROTOCOL {
+        uint8_t Type;
+        uint8_t SubType;
+        uint16_t Length;
+};
+
+typedef struct {
+        EFI_DEVICE_PATH Header;
+        EFI_GUID Guid;
+} _packed_ VENDOR_DEVICE_PATH;
+
+#define MBR_TYPE_PCAT                        0x01U
+#define MBR_TYPE_EFI_PARTITION_TABLE_HEADER  0x02U
+#define NO_DISK_SIGNATURE    0x00U
+#define SIGNATURE_TYPE_MBR   0x01U
+#define SIGNATURE_TYPE_GUID  0x02U
+
+typedef struct { /* device path node describing one partition (MEDIA_DEVICE_PATH / MEDIA_HARDDRIVE_DP) */
+        EFI_DEVICE_PATH Header;
+        uint32_t PartitionNumber; /* part-discovery.c stores the GPT entry index plus one here */
+        uint64_t PartitionStart;
+        uint64_t PartitionSize;
+        union {
+                uint8_t Signature[16];
+                EFI_GUID SignatureGuid; /* the same 16 bytes as Signature[], typed as a GUID */
+        };
+        uint8_t MBRType;       /* takes the MBR_TYPE_* values defined above */
+        uint8_t SignatureType; /* takes NO_DISK_SIGNATURE or the SIGNATURE_TYPE_* values defined above */
+} _packed_ HARDDRIVE_DEVICE_PATH;
+
+typedef struct {
+        EFI_DEVICE_PATH Header;
+        char16_t PathName[];
+} _packed_ FILEPATH_DEVICE_PATH;
+
+typedef struct {
+        char16_t* (EFIAPI *ConvertDeviceNodeToText)(
+                        const EFI_DEVICE_PATH *DeviceNode,
+                        bool DisplayOnly,
+                        bool AllowShortcuts);
+        char16_t* (EFIAPI *ConvertDevicePathToText)(
+                        const EFI_DEVICE_PATH *DevicePath,
+                        bool DisplayOnly,
+                        bool AllowShortcuts);
+} EFI_DEVICE_PATH_TO_TEXT_PROTOCOL;
+
+typedef struct { /* NOTE: the misspelt "Devic" member names are kept, code outside this header may use them. */
+        EFI_DEVICE_PATH* (EFIAPI *ConvertTextToDevicNode)(
+                        const char16_t *TextDeviceNode);
+        EFI_DEVICE_PATH* (EFIAPI *ConvertTextToDevicPath)(
+                        const char16_t *TextDevicePath);
+} EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL;
diff --git a/src/boot/efi/proto/dt-fixup.h b/src/boot/efi/proto/dt-fixup.h
new file mode 100644
index 0000000..6edbef5
--- /dev/null
+++ b/src/boot/efi/proto/dt-fixup.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_DTB_TABLE_GUID \
+        GUID_DEF(0xb1b621d5, 0xf19c, 0x41a5, 0x83, 0x0b, 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0)
+#define EFI_DT_FIXUP_PROTOCOL_GUID \
+        GUID_DEF(0xe617d64c, 0xfe08, 0x46da, 0xf4, 0xdc, 0xbb, 0xd5, 0x87, 0x0c, 0x73, 0x00)
+
+#define EFI_DT_FIXUP_PROTOCOL_REVISION 0x00010000
+
+/* Add nodes and update properties */
+#define EFI_DT_APPLY_FIXUPS 0x00000001
+
+/*
+ * Reserve memory according to the /reserved-memory node
+ * and the memory reservation block
+ */
+#define EFI_DT_RESERVE_MEMORY 0x00000002
+
+typedef struct EFI_DT_FIXUP_PROTOCOL EFI_DT_FIXUP_PROTOCOL;
+struct EFI_DT_FIXUP_PROTOCOL {
+        uint64_t Revision; /* NOTE(review): presumably matched against EFI_DT_FIXUP_PROTOCOL_REVISION - confirm */
+        EFI_STATUS (EFIAPI *Fixup)(
+                EFI_DT_FIXUP_PROTOCOL *This,
+                void *Fdt,
+                size_t *BufferSize,
+                uint32_t Flags); /* presumably a combination of the EFI_DT_* flags defined above */
+};
diff --git a/src/boot/efi/proto/file-io.h b/src/boot/efi/proto/file-io.h
new file mode 100644
index 0000000..001ad48
--- /dev/null
+++ b/src/boot/efi/proto/file-io.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_SIMPLE_FILE_SYSTEM_PROTOCOL_GUID \
+        GUID_DEF(0x0964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+#define EFI_FILE_INFO_ID \
+        GUID_DEF(0x009576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+
+#define EFI_FILE_MODE_READ   0x0000000000000001U
+#define EFI_FILE_MODE_WRITE  0x0000000000000002U
+#define EFI_FILE_MODE_CREATE 0x8000000000000000U
+
+#define EFI_FILE_READ_ONLY  0x01U
+#define EFI_FILE_HIDDEN     0x02U
+#define EFI_FILE_SYSTEM     0x04U
+#define EFI_FILE_RESERVED   0x08U
+#define EFI_FILE_DIRECTORY  0x10U
+#define EFI_FILE_ARCHIVE    0x20U
+#define EFI_FILE_VALID_ATTR 0x37U
+
+typedef struct { /* File metadata record with a variable-length, trailing file name. */
+        uint64_t Size; /* NOTE(review): presumably the size of this whole record including FileName - confirm */
+        uint64_t FileSize;
+        uint64_t PhysicalSize;
+        EFI_TIME CreateTime;
+        EFI_TIME LastAccessTime;
+        EFI_TIME ModificationTime;
+        uint64_t Attribute; /* presumably a mask of the EFI_FILE_* attribute bits defined above - confirm */
+        char16_t FileName[]; /* flexible array member; EFI_FILE_INFO_MIN_SIZE budgets 256 char16_t for it */
+} EFI_FILE_INFO;
+
+/* Some broken firmware violates the EFI spec by still advancing the readdir
+ * position when returning EFI_BUFFER_TOO_SMALL, effectively skipping over any files when
+ * the buffer was too small. Therefore, we always start with a buffer that should handle FAT32
+ * max file name length. */
+#define EFI_FILE_INFO_MIN_SIZE (offsetof(EFI_FILE_INFO, FileName) + 256U * sizeof(char16_t))
+
+typedef struct EFI_SIMPLE_FILE_SYSTEM_PROTOCOL EFI_SIMPLE_FILE_SYSTEM_PROTOCOL;
+struct EFI_SIMPLE_FILE_SYSTEM_PROTOCOL {
+        uint64_t Revision;
+        EFI_STATUS (EFIAPI *OpenVolume)(
+                        EFI_SIMPLE_FILE_SYSTEM_PROTOCOL *This,
+                        EFI_FILE **Root);
+};
+
+struct EFI_FILE_PROTOCOL {
+        uint64_t Revision;
+        EFI_STATUS (EFIAPI *Open)(
+                        EFI_FILE *This,
+                        EFI_FILE **NewHandle,
+                        char16_t *FileName,
+                        uint64_t OpenMode,
+                        uint64_t Attributes);
+        EFI_STATUS (EFIAPI *Close)(EFI_FILE *This);
+        EFI_STATUS (EFIAPI *Delete)(EFI_FILE *This);
+        EFI_STATUS (EFIAPI *Read)(
+                        EFI_FILE *This,
+                        size_t *BufferSize,
+                        void *Buffer);
+        EFI_STATUS (EFIAPI *Write)(
+                        EFI_FILE *This,
+                        size_t *BufferSize,
+                        void *Buffer);
+        EFI_STATUS (EFIAPI *GetPosition)(EFI_FILE *This, uint64_t *Position);
+        EFI_STATUS (EFIAPI *SetPosition)(EFI_FILE *This, uint64_t Position);
+        EFI_STATUS (EFIAPI *GetInfo)(
+                        EFI_FILE *This,
+                        EFI_GUID *InformationType,
+                        size_t *BufferSize,
+                        void *Buffer);
+        EFI_STATUS (EFIAPI *SetInfo)(
+                        EFI_FILE *This,
+                        EFI_GUID *InformationType,
+                        size_t BufferSize,
+                        void *Buffer);
+        EFI_STATUS (EFIAPI *Flush)(EFI_FILE *This);
+        void *OpenEx;
+        void *ReadEx;
+        void *WriteEx;
+        void *FlushEx;
+};
diff --git a/src/boot/efi/proto/graphics-output.h b/src/boot/efi/proto/graphics-output.h
new file mode 100644
index 0000000..f49e580
--- /dev/null
+++ b/src/boot/efi/proto/graphics-output.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID \
+        GUID_DEF(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a)
+
+typedef enum {
+        PixelRedGreenBlueReserved8BitPerColor,
+        PixelBlueGreenRedReserved8BitPerColor,
+        PixelBitMask,
+        PixelBltOnly,
+        PixelFormatMax,
+} EFI_GRAPHICS_PIXEL_FORMAT;
+
+typedef enum {
+        EfiBltVideoFill,
+        EfiBltVideoToBltBuffer,
+        EfiBltBufferToVideo,
+        EfiBltVideoToVideo,
+        EfiGraphicsOutputBltOperationMax,
+} EFI_GRAPHICS_OUTPUT_BLT_OPERATION;
+
+typedef struct {
+        uint32_t RedMask;
+        uint32_t GreenMask;
+        uint32_t BlueMask;
+        uint32_t ReservedMask;
+} EFI_PIXEL_BITMASK;
+
+typedef struct {
+        uint8_t Blue;
+        uint8_t Green;
+        uint8_t Red;
+        uint8_t Reserved;
+} EFI_GRAPHICS_OUTPUT_BLT_PIXEL;
+
+typedef struct {
+        uint32_t Version;
+        uint32_t HorizontalResolution;
+        uint32_t VerticalResolution;
+        EFI_GRAPHICS_PIXEL_FORMAT PixelFormat;
+        EFI_PIXEL_BITMASK PixelInformation;
+        uint32_t PixelsPerScanLine;
+} EFI_GRAPHICS_OUTPUT_MODE_INFORMATION;
+
+typedef struct EFI_GRAPHICS_OUTPUT_PROTOCOL EFI_GRAPHICS_OUTPUT_PROTOCOL; /* forward typedef for the self-referential This pointers */
+struct EFI_GRAPHICS_OUTPUT_PROTOCOL {
+        EFI_STATUS (EFIAPI *QueryMode)(
+                        EFI_GRAPHICS_OUTPUT_PROTOCOL *This,
+                        uint32_t ModeNumber,
+                        size_t *SizeOfInfo,
+                        EFI_GRAPHICS_OUTPUT_MODE_INFORMATION **Info); /* out: pointer to a mode information record */
+        EFI_STATUS (EFIAPI *SetMode)(
+                        EFI_GRAPHICS_OUTPUT_PROTOCOL *This,
+                        uint32_t ModeNumber);
+        EFI_STATUS (EFIAPI *Blt)(
+                        EFI_GRAPHICS_OUTPUT_PROTOCOL *This,
+                        EFI_GRAPHICS_OUTPUT_BLT_PIXEL *BltBuffer,
+                        EFI_GRAPHICS_OUTPUT_BLT_OPERATION BltOperation, /* one of the EfiBlt* enumerators */
+                        size_t SourceX,
+                        size_t SourceY,
+                        size_t DestinationX,
+                        size_t DestinationY,
+                        size_t Width,
+                        size_t Height,
+                        size_t Delta);
+
+        struct { /* mode state record, declared inline and reached through the Mode pointer below */
+                uint32_t MaxMode;
+                uint32_t Mode;
+                EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *Info;
+                size_t SizeOfInfo;
+                EFI_PHYSICAL_ADDRESS FrameBufferBase;
+                size_t FrameBufferSize;
+        } *Mode;
+};
diff --git a/src/boot/efi/proto/load-file.h b/src/boot/efi/proto/load-file.h
new file mode 100644
index 0000000..2e01ce5
--- /dev/null
+++ b/src/boot/efi/proto/load-file.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_LOAD_FILE_PROTOCOL_GUID \
+        GUID_DEF(0x56EC3091, 0x954C, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+#define EFI_LOAD_FILE2_PROTOCOL_GUID \
+        GUID_DEF(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d)
+
+typedef struct EFI_LOAD_FILE_PROTOCOL EFI_LOAD_FILE_PROTOCOL;
+typedef EFI_LOAD_FILE_PROTOCOL EFI_LOAD_FILE2_PROTOCOL;
+
+struct EFI_LOAD_FILE_PROTOCOL {
+        EFI_STATUS (EFIAPI *LoadFile)(
+                        EFI_LOAD_FILE_PROTOCOL *This,
+                        EFI_DEVICE_PATH *FilePath,
+                        bool BootPolicy,
+                        size_t *BufferSize,
+                        void *Buffer);
+};
diff --git a/src/boot/efi/proto/loaded-image.h b/src/boot/efi/proto/loaded-image.h
new file mode 100644
index 0000000..46371e7
--- /dev/null
+++ b/src/boot/efi/proto/loaded-image.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_LOADED_IMAGE_PROTOCOL_GUID \
+        GUID_DEF(0x5B1B31A1, 0x9562, 0x11d2, 0x8E, 0x3F, 0x00, 0xA0, 0xC9, 0x69, 0x72, 0x3B)
+#define EFI_LOADED_IMAGE_DEVICE_PATH_PROTOCOL_GUID \
+        GUID_DEF(0xbc62157e, 0x3e33, 0x4fec, 0x99, 0x20, 0x2d, 0x3b, 0x36, 0xd7, 0x50, 0xdf)
+
+typedef EFI_STATUS (EFIAPI *EFI_IMAGE_ENTRY_POINT)(
+        EFI_HANDLE ImageHandle,
+        EFI_SYSTEM_TABLE *SystemTable);
+
+typedef struct {
+        uint32_t Revision;
+        EFI_HANDLE ParentHandle;
+        EFI_SYSTEM_TABLE *SystemTable;
+        EFI_HANDLE DeviceHandle;
+        EFI_DEVICE_PATH *FilePath;
+        void *Reserved;
+        uint32_t LoadOptionsSize;
+        void *LoadOptions;
+        void *ImageBase;
+        uint64_t ImageSize;
+        EFI_MEMORY_TYPE ImageCodeType;
+        EFI_MEMORY_TYPE ImageDataType;
+        EFI_STATUS (EFIAPI *Unload)(EFI_HANDLE ImageHandle);
+} EFI_LOADED_IMAGE_PROTOCOL;
diff --git a/src/boot/efi/proto/rng.h b/src/boot/efi/proto/rng.h
new file mode 100644
index 0000000..8ed1fd4
--- /dev/null
+++ b/src/boot/efi/proto/rng.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_RNG_PROTOCOL_GUID \
+        GUID_DEF(0x3152bca5, 0xeade, 0x433d, 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44)
+
+typedef struct EFI_RNG_PROTOCOL EFI_RNG_PROTOCOL;
+struct EFI_RNG_PROTOCOL {
+        EFI_STATUS (EFIAPI *GetInfo)(
+                        EFI_RNG_PROTOCOL *This,
+                        size_t *RNGAlgorithmListSize,
+                        EFI_GUID *RNGAlgorithmList);
+        EFI_STATUS (EFIAPI *GetRNG)(
+                        EFI_RNG_PROTOCOL *This,
+                        EFI_GUID *RNGAlgorithm,
+                        size_t RNGValueLength,
+                        uint8_t *RNGValue);
+};
diff --git a/src/boot/efi/proto/security-arch.h b/src/boot/efi/proto/security-arch.h
new file mode 100644
index 0000000..2675c61
--- /dev/null
+++ b/src/boot/efi/proto/security-arch.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_SECURITY_ARCH_PROTOCOL_GUID \
+        GUID_DEF(0xA46423E3, 0x4617, 0x49f1, 0xB9, 0xFF, 0xD1, 0xBF, 0xA9, 0x11, 0x58, 0x39)
+#define EFI_SECURITY2_ARCH_PROTOCOL_GUID \
+        GUID_DEF(0x94ab2f58, 0x1438, 0x4ef1, 0x91, 0x52, 0x18, 0x94, 0x1a, 0x3a, 0x0e, 0x68)
+
+typedef struct EFI_SECURITY_ARCH_PROTOCOL EFI_SECURITY_ARCH_PROTOCOL;
+typedef struct EFI_SECURITY2_ARCH_PROTOCOL EFI_SECURITY2_ARCH_PROTOCOL;
+
+typedef EFI_STATUS (EFIAPI *EFI_SECURITY_FILE_AUTHENTICATION_STATE)(
+                const EFI_SECURITY_ARCH_PROTOCOL *This,
+                uint32_t AuthenticationStatus,
+                const EFI_DEVICE_PATH *File);
+
+typedef EFI_STATUS (EFIAPI *EFI_SECURITY2_FILE_AUTHENTICATION)(
+                const EFI_SECURITY2_ARCH_PROTOCOL *This,
+                const EFI_DEVICE_PATH *DevicePath,
+                void *FileBuffer,
+                size_t FileSize,
+                bool BootPolicy);
+
+struct EFI_SECURITY_ARCH_PROTOCOL {
+        EFI_SECURITY_FILE_AUTHENTICATION_STATE FileAuthenticationState;
+};
+
+struct EFI_SECURITY2_ARCH_PROTOCOL {
+        EFI_SECURITY2_FILE_AUTHENTICATION FileAuthentication;
+};
diff --git a/src/boot/efi/proto/shell-parameters.h b/src/boot/efi/proto/shell-parameters.h
new file mode 100644
index 0000000..8080922
--- /dev/null
+++ b/src/boot/efi/proto/shell-parameters.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_SHELL_PARAMETERS_PROTOCOL_GUID \
+        GUID_DEF(0x752f3136, 0x4e16, 0x4fdc, 0xa2, 0x2a, 0xe5, 0xf4, 0x68, 0x12, 0xf4, 0xca)
+
+typedef struct {
+        char16_t **Argv;
+        size_t Argc;
+        void *StdIn;
+        void *StdOut;
+        void *StdErr;
+} EFI_SHELL_PARAMETERS_PROTOCOL;
diff --git a/src/boot/efi/proto/simple-text-io.h b/src/boot/efi/proto/simple-text-io.h
new file mode 100644
index 0000000..95016d3
--- /dev/null
+++ b/src/boot/efi/proto/simple-text-io.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_SIMPLE_TEXT_INPUT_PROTOCOL_GUID \
+        GUID_DEF(0x387477c1, 0x69c7, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+#define EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL_GUID \
+        GUID_DEF(0xdd9e7534, 0x7762, 0x4698, 0x8c, 0x14, 0xf5, 0x85, 0x17, 0xa6, 0x25, 0xaa)
+#define EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL_GUID \
+        GUID_DEF(0x387477c2, 0x69c7, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
+
+#define EFI_SHIFT_STATE_VALID     0x80000000U
+#define EFI_RIGHT_SHIFT_PRESSED   0x00000001U
+#define EFI_LEFT_SHIFT_PRESSED    0x00000002U
+#define EFI_RIGHT_CONTROL_PRESSED 0x00000004U
+#define EFI_LEFT_CONTROL_PRESSED  0x00000008U
+#define EFI_RIGHT_ALT_PRESSED     0x00000010U
+#define EFI_LEFT_ALT_PRESSED      0x00000020U
+#define EFI_RIGHT_LOGO_PRESSED    0x00000040U
+#define EFI_LEFT_LOGO_PRESSED     0x00000080U
+#define EFI_MENU_KEY_PRESSED      0x00000100U
+#define EFI_SYS_REQ_PRESSED       0x00000200U
+
+#define EFI_TOGGLE_STATE_VALID 0x80U
+#define EFI_KEY_STATE_EXPOSED  0x40U
+#define EFI_SCROLL_LOCK_ACTIVE 0x01U
+#define EFI_NUM_LOCK_ACTIVE    0x02U
+#define EFI_CAPS_LOCK_ACTIVE   0x04U
+
+enum {
+        EFI_BLACK        = 0x00,
+        EFI_BLUE         = 0x01,
+        EFI_GREEN        = 0x02,
+        EFI_CYAN         = EFI_BLUE | EFI_GREEN,
+        EFI_RED          = 0x04,
+        EFI_MAGENTA      = EFI_BLUE | EFI_RED,
+        EFI_BROWN        = EFI_GREEN | EFI_RED,
+        EFI_LIGHTGRAY    = EFI_BLUE | EFI_GREEN | EFI_RED,
+        EFI_BRIGHT       = 0x08,
+        EFI_DARKGRAY     = EFI_BLACK | EFI_BRIGHT,
+        EFI_LIGHTBLUE    = EFI_BLUE | EFI_BRIGHT,
+        EFI_LIGHTGREEN   = EFI_GREEN | EFI_BRIGHT,
+        EFI_LIGHTCYAN    = EFI_CYAN | EFI_BRIGHT,
+        EFI_LIGHTRED     = EFI_RED | EFI_BRIGHT,
+        EFI_LIGHTMAGENTA = EFI_MAGENTA | EFI_BRIGHT,
+        EFI_YELLOW       = EFI_BROWN | EFI_BRIGHT,
+        EFI_WHITE        = EFI_BLUE | EFI_GREEN | EFI_RED | EFI_BRIGHT,
+};
+
+#define EFI_TEXT_ATTR(fg, bg) ((fg) | ((bg) << 4))
+#define EFI_TEXT_ATTR_SWAP(c) EFI_TEXT_ATTR(((c) & 0xF0U) >> 4, (c) & 0xFU)
+
+enum {
+        SCAN_NULL            = 0x000,
+        SCAN_UP              = 0x001,
+        SCAN_DOWN            = 0x002,
+        SCAN_RIGHT           = 0x003,
+        SCAN_LEFT            = 0x004,
+        SCAN_HOME            = 0x005,
+        SCAN_END             = 0x006,
+        SCAN_INSERT          = 0x007,
+        SCAN_DELETE          = 0x008,
+        SCAN_PAGE_UP         = 0x009,
+        SCAN_PAGE_DOWN       = 0x00A,
+        SCAN_F1              = 0x00B,
+        SCAN_F2              = 0x00C,
+        SCAN_F3              = 0x00D,
+        SCAN_F4              = 0x00E,
+        SCAN_F5              = 0x00F,
+        SCAN_F6              = 0x010,
+        SCAN_F7              = 0x011,
+        SCAN_F8              = 0x012,
+        SCAN_F9              = 0x013,
+        SCAN_F10             = 0x014,
+        SCAN_F11             = 0x015,
+        SCAN_F12             = 0x016,
+        SCAN_ESC             = 0x017,
+        SCAN_PAUSE           = 0x048,
+        SCAN_F13             = 0x068,
+        SCAN_F14             = 0x069,
+        SCAN_F15             = 0x06A,
+        SCAN_F16             = 0x06B,
+        SCAN_F17             = 0x06C,
+        SCAN_F18             = 0x06D,
+        SCAN_F19             = 0x06E,
+        SCAN_F20             = 0x06F,
+        SCAN_F21             = 0x070,
+        SCAN_F22             = 0x071,
+        SCAN_F23             = 0x072,
+        SCAN_F24             = 0x073,
+        SCAN_MUTE            = 0x07F,
+        SCAN_VOLUME_UP       = 0x080,
+        SCAN_VOLUME_DOWN     = 0x081,
+        SCAN_BRIGHTNESS_UP   = 0x100,
+        SCAN_BRIGHTNESS_DOWN = 0x101,
+        SCAN_SUSPEND         = 0x102,
+        SCAN_HIBERNATE       = 0x103,
+        SCAN_TOGGLE_DISPLAY  = 0x104,
+        SCAN_RECOVERY        = 0x105,
+        SCAN_EJECT           = 0x106,
+};
+
+typedef struct {
+        uint16_t ScanCode;
+        char16_t UnicodeChar;
+} EFI_INPUT_KEY;
+
+typedef struct {
+        uint32_t KeyShiftState;
+        uint8_t KeyToggleState;
+} EFI_KEY_STATE;
+
+typedef struct {
+        EFI_INPUT_KEY Key;
+        EFI_KEY_STATE KeyState;
+} EFI_KEY_DATA;
+
+struct EFI_SIMPLE_TEXT_INPUT_PROTOCOL {
+        EFI_STATUS (EFIAPI *Reset)(
+                        EFI_SIMPLE_TEXT_INPUT_PROTOCOL *This,
+                        bool ExtendedVerification);
+        EFI_STATUS (EFIAPI *ReadKeyStroke)(
+                        EFI_SIMPLE_TEXT_INPUT_PROTOCOL *This,
+                        EFI_INPUT_KEY *Key);
+        EFI_EVENT WaitForKey;
+};
+
+typedef struct EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL;
+struct EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL {
+        EFI_STATUS (EFIAPI *Reset)(
+                        EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL *This,
+                        bool ExtendedVerification);
+        EFI_STATUS (EFIAPI *ReadKeyStrokeEx)(
+                        EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL *This,
+                        EFI_KEY_DATA *KeyData);
+        EFI_EVENT WaitForKeyEx;
+        void *SetState;
+        void *RegisterKeyNotify;
+        void *UnregisterKeyNotify;
+};
+
+typedef struct EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL;
+struct EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL {
+        EFI_STATUS (EFIAPI *Reset)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        bool ExtendedVerification);
+        EFI_STATUS (EFIAPI *OutputString)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        char16_t *String);
+        EFI_STATUS (EFIAPI *TestString)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        char16_t *String);
+        EFI_STATUS (EFIAPI *QueryMode)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        size_t ModeNumber,
+                        size_t *Columns,
+                        size_t *Rows);
+        EFI_STATUS (EFIAPI *SetMode)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        size_t ModeNumber);
+        EFI_STATUS (EFIAPI *SetAttribute)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        size_t Attribute);
+        EFI_STATUS (EFIAPI *ClearScreen)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This);
+        EFI_STATUS (EFIAPI *SetCursorPosition)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        size_t Column,
+                        size_t Row);
+        EFI_STATUS (EFIAPI *EnableCursor)(
+                        EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL *This,
+                        bool Visible);
+        struct {
+                int32_t MaxMode;
+                int32_t Mode;
+                int32_t Attribute;
+                int32_t CursorColumn;
+                int32_t CursorRow;
+                bool CursorVisible;
+        } *Mode;
+};
diff --git a/src/boot/efi/proto/tcg.h b/src/boot/efi/proto/tcg.h
new file mode 100644
index 0000000..b4b8296
--- /dev/null
+++ b/src/boot/efi/proto/tcg.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+#define EFI_TCG_PROTOCOL_GUID \
+        GUID_DEF(0xf541796d, 0xa62e, 0x4954, 0xa7, 0x75, 0x95, 0x84, 0xf6, 0x1b, 0x9c, 0xdd)
+#define EFI_TCG2_PROTOCOL_GUID \
+        GUID_DEF(0x607f766c, 0x7455, 0x42be, 0x93, 0x0b, 0xe4, 0xd7, 0x6d, 0xb2, 0x72, 0x0f)
+
+#define TCG_ALG_SHA 0x4
+#define EFI_TCG2_EVENT_HEADER_VERSION 1
+#define EV_IPL 13
+#define EV_EVENT_TAG UINT32_C(6)
+
+typedef struct {
+        uint8_t Major;
+        uint8_t Minor;
+        uint8_t RevMajor;
+        uint8_t RevMinor;
+} TCG_VERSION;
+
+typedef struct {
+        uint8_t Major;
+        uint8_t Minor;
+} EFI_TCG2_VERSION;
+
+typedef struct {
+        uint8_t Size;
+        TCG_VERSION StructureVersion;
+        TCG_VERSION ProtocolSpecVersion;
+        uint8_t HashAlgorithmBitmap;
+        bool TPMPresentFlag;
+        bool TPMDeactivatedFlag;
+} EFI_TCG_BOOT_SERVICE_CAPABILITY;
+
+typedef struct {
+        uint8_t Size;
+        EFI_TCG2_VERSION StructureVersion;
+        EFI_TCG2_VERSION ProtocolVersion;
+        uint32_t HashAlgorithmBitmap;
+        uint32_t SupportedEventLogs;
+        bool TPMPresentFlag;
+        uint16_t MaxCommandSize;
+        uint16_t MaxResponseSize;
+        uint32_t ManufacturerID;
+        uint32_t NumberOfPCRBanks;
+        uint32_t ActivePcrBanks;
+} EFI_TCG2_BOOT_SERVICE_CAPABILITY;
+
+typedef struct {
+        uint32_t PCRIndex;
+        uint32_t EventType;
+        struct {
+                uint8_t Digest[20];
+        } Digest;
+        uint32_t EventSize;
+        uint8_t Event[];
+} _packed_ TCG_PCR_EVENT;
+
+typedef struct {
+        uint32_t HeaderSize;
+        uint16_t HeaderVersion;
+        uint32_t PCRIndex;
+        uint32_t EventType;
+} _packed_ EFI_TCG2_EVENT_HEADER;
+
+typedef struct {
+        uint32_t Size;
+        EFI_TCG2_EVENT_HEADER Header;
+        uint8_t Event[];
+} _packed_ EFI_TCG2_EVENT;
+
+typedef struct {
+        uint32_t EventId;
+        uint32_t EventSize;
+        uint8_t Event[];
+} _packed_ EFI_TCG2_TAGGED_EVENT;
+
+typedef struct EFI_TCG_PROTOCOL EFI_TCG_PROTOCOL;
+struct EFI_TCG_PROTOCOL {
+        EFI_STATUS (EFIAPI *StatusCheck)(
+                        EFI_TCG_PROTOCOL *This,
+                        EFI_TCG_BOOT_SERVICE_CAPABILITY *ProtocolCapability,
+                        uint32_t *TCGFeatureFlags,
+                        EFI_PHYSICAL_ADDRESS *EventLogLocation,
+                        EFI_PHYSICAL_ADDRESS *EventLogLastEntry);
+        void *HashAll;
+        void *LogEvent;
+        void *PassThroughToTpm;
+        EFI_STATUS (EFIAPI *HashLogExtendEvent)(
+                        EFI_TCG_PROTOCOL *This,
+                        EFI_PHYSICAL_ADDRESS HashData,
+                        uint64_t HashDataLen,
+                        uint32_t AlgorithmId,
+                        TCG_PCR_EVENT *TCGLogData,
+                        uint32_t *EventNumber,
+                        EFI_PHYSICAL_ADDRESS *EventLogLastEntry);
+};
+
+typedef struct EFI_TCG2_PROTOCOL EFI_TCG2_PROTOCOL;
+struct EFI_TCG2_PROTOCOL {
+        EFI_STATUS (EFIAPI *GetCapability)(
+                        EFI_TCG2_PROTOCOL *This,
+                        EFI_TCG2_BOOT_SERVICE_CAPABILITY *ProtocolCapability);
+        void *GetEventLog;
+        EFI_STATUS (EFIAPI *HashLogExtendEvent)(
+                        EFI_TCG2_PROTOCOL *This,
+                        uint64_t Flags,
+                        EFI_PHYSICAL_ADDRESS DataToHash,
+                        uint64_t DataToHashLen,
+                        EFI_TCG2_EVENT *EfiTcgEvent);
+        void *SubmitCommand;
+        void *GetActivePcrBanks;
+        void *SetActivePcrBanks;
+        void *GetResultOfSetActivePcrBanks;
+};
diff --git a/src/boot/efi/random-seed.c b/src/boot/efi/random-seed.c
new file mode 100644
index 0000000..8147e54
--- /dev/null
+++ b/src/boot/efi/random-seed.c
@@ -0,0 +1,325 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "memory-util-fundamental.h"
+#include "proto/rng.h"
+#include "random-seed.h"
+#include "secure-boot.h"
+#include "sha256.h"
+#include "util.h"
+
+#define RANDOM_MAX_SIZE_MIN (32U)
+#define RANDOM_MAX_SIZE_MAX (32U*1024U)
+
+struct linux_efi_random_seed { /* layout of the table looked up via LINUX_EFI_RANDOM_SEED_TABLE_GUID */
+        uint32_t size; /* number of bytes in seed[]; process_random_seed() hashes exactly this many */
+        uint8_t seed[]; /* flexible array member carrying the seed bytes */
+};
+
+#define LINUX_EFI_RANDOM_SEED_TABLE_GUID \
+        { 0x1ce1e5bc, 0x7ceb, 0x42f2, { 0x81, 0xe5, 0x8a, 0xad, 0xf1, 0x80, 0xf5, 0x7b } }
+
+/* SHA256 gives us 256/8=32 bytes */
+#define HASH_VALUE_SIZE 32
+
+/* Linux's RNG is 256 bits, so let's provide this much */
+#define DESIRED_SEED_SIZE 32
+
+/* Some basic domain separation in case somebody uses this data elsewhere */
+#define HASH_LABEL "systemd-boot random seed label v1"
+
+static EFI_STATUS acquire_rng(void *ret, size_t size) {
+        EFI_RNG_PROTOCOL *rng_proto;
+        EFI_STATUS status;
+
+        assert(ret);
+
+        /* Fill the caller's buffer with 'size' bytes obtained through the firmware's EFI_RNG_PROTOCOL. */
+
+        status = BS->LocateProtocol(MAKE_GUID_PTR(EFI_RNG_PROTOCOL), NULL, (void **) &rng_proto);
+        if (status != EFI_SUCCESS)
+                return status;
+        if (!rng_proto)
+                return EFI_UNSUPPORTED;
+
+        status = rng_proto->GetRNG(rng_proto, NULL, size, ret);
+        if (status == EFI_SUCCESS)
+                return EFI_SUCCESS;
+        return log_error_status(status, "Failed to acquire RNG data: %m");
+}
+
+static EFI_STATUS acquire_system_token(void **ret, size_t *ret_size) {
+        _cleanup_free_ char *token = NULL;
+        size_t token_size;
+        EFI_STATUS status;
+
+        assert(ret);
+        assert(ret_size);
+
+        status = efivar_get_raw(MAKE_GUID_PTR(LOADER), u"LoaderSystemToken", &token, &token_size);
+        if (status == EFI_NOT_FOUND)
+                return status; /* The not-found case is passed up without logging anything. */
+        if (status != EFI_SUCCESS)
+                return log_error_status(status, "Failed to read LoaderSystemToken EFI variable: %m");
+
+        /* The size is unsigned, so an empty variable is the only "too short" case rejected here. */
+        if (token_size == 0)
+                return log_error_status(EFI_NOT_FOUND, "System token too short, ignoring.");
+
+        *ret_size = token_size;
+        *ret = TAKE_PTR(token);
+
+        return EFI_SUCCESS;
+}
+
+static void validate_sha256(void) {
+
+#ifdef EFI_DEBUG
+        /* Sanity-check our SHA256 implementation (taken from glibc and converted to UEFI style) against
+         * the simpler test vectors from the SHA spec. This is compiled in only when EFI_DEBUG is defined,
+         * i.e. it is stripped from optimized builds. */
+
+        static const struct {
+                const char *input;
+                uint8_t digest[HASH_VALUE_SIZE];
+        } vectors[] = {
+                { "abc",
+                  { 0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
+                    0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
+                    0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
+                    0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad }},
+
+                { "",
+                  { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14,
+                    0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24,
+                    0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c,
+                    0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55 }},
+
+                { "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+                  { 0x24, 0x8d, 0x6a, 0x61, 0xd2, 0x06, 0x38, 0xb8,
+                    0xe5, 0xc0, 0x26, 0x93, 0x0c, 0x3e, 0x60, 0x39,
+                    0xa3, 0x3c, 0xe4, 0x59, 0x64, 0xff, 0x21, 0x67,
+                    0xf6, 0xec, 0xed, 0xd4, 0x19, 0xdb, 0x06, 0xc1 }},
+
+                { "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
+                  { 0xcf, 0x5b, 0x16, 0xa7, 0x78, 0xaf, 0x83, 0x80,
+                    0x03, 0x6c, 0xe5, 0x9e, 0x7b, 0x04, 0x92, 0x37,
+                    0x0b, 0x24, 0x9b, 0x11, 0xe8, 0xf0, 0x7a, 0x51,
+                    0xaf, 0xac, 0x45, 0x03, 0x7a, 0xfe, 0xe9, 0xd1 }},
+        };
+
+        for (size_t k = 0; k < ELEMENTSOF(vectors); k++)
+                assert(memcmp(SHA256_DIRECT(vectors[k].input, strlen8(vectors[k].input)), vectors[k].digest, HASH_VALUE_SIZE) == 0);
+#endif
+}
+
+EFI_STATUS process_random_seed(EFI_FILE *root_dir) {
+        uint8_t random_bytes[DESIRED_SEED_SIZE], hash_key[HASH_VALUE_SIZE];
+        _cleanup_free_ struct linux_efi_random_seed *new_seed_table = NULL;
+        struct linux_efi_random_seed *previous_seed_table = NULL;
+        _cleanup_free_ void *seed = NULL, *system_token = NULL;
+        _cleanup_(file_closep) EFI_FILE *handle = NULL;
+        _cleanup_free_ EFI_FILE_INFO *info = NULL;
+        struct sha256_ctx hash;
+        uint64_t uefi_monotonic_counter = 0;
+        size_t size, rsize, wsize;
+        bool seeded_by_efi = false;
+        EFI_STATUS err;
+        EFI_TIME now;
+
+        CLEANUP_ERASE(random_bytes);
+        CLEANUP_ERASE(hash_key);
+        CLEANUP_ERASE(hash);
+
+        assert(root_dir);
+        assert_cc(DESIRED_SEED_SIZE == HASH_VALUE_SIZE);
+
+        validate_sha256();
+
+        /* hash = LABEL || sizeof(input1) || input1 || ... || sizeof(inputN) || inputN */
+        sha256_init_ctx(&hash);
+
+        /* Some basic domain separation in case somebody uses this data elsewhere */
+        sha256_process_bytes(HASH_LABEL, sizeof(HASH_LABEL) - 1, &hash);
+
+        previous_seed_table = find_configuration_table(MAKE_GUID_PTR(LINUX_EFI_RANDOM_SEED_TABLE));
+        if (!previous_seed_table) {
+                size = 0;
+                sha256_process_bytes(&size, sizeof(size), &hash);
+        } else {
+                size = previous_seed_table->size;
+                seeded_by_efi = size >= DESIRED_SEED_SIZE;
+                sha256_process_bytes(&size, sizeof(size), &hash);
+                sha256_process_bytes(previous_seed_table->seed, size, &hash);
+
+                /* Zero and free the previous seed table only at the end after we've managed to install a new
+                 * one, so that in case this function fails or aborts, Linux still receives whatever the
+                 * previous bootloader chain set. So, the next line of this block is not an explicit_bzero()
+                 * call. */
+        }
+
+        /* Request some random data from the UEFI RNG. We don't need this to work safely, but it's a good
+         * idea to use it because it helps us for cases where users mistakenly include a random seed in
+         * golden master images that are replicated many times. */
+        err = acquire_rng(random_bytes, sizeof(random_bytes));
+        if (err != EFI_SUCCESS) {
+                size = 0;
+                /* If we can't get any randomness from EFI itself, then we'll only be relying on what's in
+                 * ESP. But ESP is mutable, so if secure boot is enabled, we probably shouldn't trust that
+                 * alone, in which case we bail out early. */
+                if (!seeded_by_efi && secure_boot_enabled())
+                        return EFI_NOT_FOUND;
+        } else {
+                seeded_by_efi = true;
+                size = sizeof(random_bytes);
+        }
+        sha256_process_bytes(&size, sizeof(size), &hash);
+        sha256_process_bytes(random_bytes, size, &hash);
+
+        /* Get some system specific seed that the installer might have placed in an EFI variable. We include
+         * it in our hash. This is protection against golden master image sloppiness, and it remains on the
+         * system, even when disk images are duplicated or swapped out. */
+        size = 0;
+        err = acquire_system_token(&system_token, &size);
+        if ((err != EFI_SUCCESS || size < DESIRED_SEED_SIZE) && !seeded_by_efi)
+                return err;
+        sha256_process_bytes(&size, sizeof(size), &hash);
+        if (system_token) {
+                sha256_process_bytes(system_token, size, &hash);
+                explicit_bzero_safe(system_token, size);
+        }
+
+        err = root_dir->Open(
+                        root_dir,
+                        &handle,
+                        (char16_t *) u"\\loader\\random-seed",
+                        EFI_FILE_MODE_READ | EFI_FILE_MODE_WRITE,
+                        0);
+        if (err != EFI_SUCCESS) {
+                if (err != EFI_NOT_FOUND && err != EFI_WRITE_PROTECTED)
+                        log_error_status(err, "Failed to open random seed file: %m");
+                return err;
+        }
+
+        err = get_file_info(handle, &info, NULL);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to get file info for random seed: %m");
+
+        size = info->FileSize;
+        if (size < RANDOM_MAX_SIZE_MIN)
+                return log_error("Random seed file is too short.");
+
+        if (size > RANDOM_MAX_SIZE_MAX)
+                return log_error("Random seed file is too large.");
+
+        seed = xmalloc(size);
+        rsize = size;
+        err = handle->Read(handle, &rsize, seed);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to read random seed file: %m");
+        if (rsize != size) {
+                explicit_bzero_safe(seed, rsize);
+                return log_error_status(EFI_PROTOCOL_ERROR, "Short read on random seed file.");
+        }
+
+        sha256_process_bytes(&size, sizeof(size), &hash);
+        sha256_process_bytes(seed, size, &hash);
+        explicit_bzero_safe(seed, size);
+
+        err = handle->SetPosition(handle, 0);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to seek to beginning of random seed file: %m");
+
+        /* Let's also include the UEFI monotonic counter (which is supposedly increasing on every single
+         * boot) in the hash, so that even if the changes to the ESP for some reason should not be
+         * persistent, the random seed we generate will still be different on every single boot. */
+        err = BS->GetNextMonotonicCount(&uefi_monotonic_counter);
+        if (err != EFI_SUCCESS && !seeded_by_efi)
+                return log_error_status(err, "Failed to acquire UEFI monotonic counter: %m");
+        size = sizeof(uefi_monotonic_counter);
+        sha256_process_bytes(&size, sizeof(size), &hash);
+        sha256_process_bytes(&uefi_monotonic_counter, size, &hash);
+
+        err = RT->GetTime(&now, NULL);
+        size = err == EFI_SUCCESS ? sizeof(now) : 0; /* Known to be flaky, so don't bark on error. */
+        sha256_process_bytes(&size, sizeof(size), &hash);
+        sha256_process_bytes(&now, size, &hash);
+
+        /* hash_key = HASH(hash) */
+        sha256_finish_ctx(&hash, hash_key);
+
+        /* hash = hash_key || 0 */
+        sha256_init_ctx(&hash);
+        sha256_process_bytes(hash_key, sizeof(hash_key), &hash);
+        sha256_process_bytes(&(const uint8_t){ 0 }, sizeof(uint8_t), &hash);
+        /* random_bytes = HASH(hash) */
+        sha256_finish_ctx(&hash, random_bytes);
+
+        size = sizeof(random_bytes);
+        /* If the file size is too large, zero out the remaining bytes on disk. */
+        if (size < info->FileSize) {
+                err = handle->SetPosition(handle, size);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to seek to offset of random seed file: %m");
+                wsize = info->FileSize - size;
+                err = handle->Write(handle, &wsize, seed /* All zeros now */);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to write random seed file: %m");
+                if (wsize != info->FileSize - size)
+                        return log_error_status(EFI_PROTOCOL_ERROR, "Short write on random seed file.");
+                err = handle->Flush(handle);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to flush random seed file: %m");
+                err = handle->SetPosition(handle, 0);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to seek to beginning of random seed file: %m");
+
+                /* We could truncate the file here with something like:
+                 *
+                 *     info->FileSize = size;
+                 *     err = handle->SetInfo(handle, &GenericFileInfo, info->Size, info);
+                 *     if (err != EFI_SUCCESS)
+                 *             return log_error_status(err, "Failed to truncate random seed file: %u");
+                 *
+                 * But this is considered slightly risky, because EFI filesystem drivers are a little bit
+                 * flimsy. So instead we rely on userspace eventually truncating this when it writes a new
+                 * seed. For now the best we do is zero it. */
+        }
+        /* Update the random seed on disk before we use it */
+        wsize = size;
+        err = handle->Write(handle, &wsize, random_bytes);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to write random seed file: %m");
+        if (wsize != size)
+                return log_error_status(EFI_PROTOCOL_ERROR, "Short write on random seed file.");
+        err = handle->Flush(handle);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to flush random seed file: %m");
+
+        err = BS->AllocatePool(EfiACPIReclaimMemory,
+                               offsetof(struct linux_efi_random_seed, seed) + DESIRED_SEED_SIZE,
+                               (void **) &new_seed_table);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to allocate EFI table for random seed: %m");
+        new_seed_table->size = DESIRED_SEED_SIZE;
+
+        /* hash = hash_key || 1 */
+        sha256_init_ctx(&hash);
+        sha256_process_bytes(hash_key, sizeof(hash_key), &hash);
+        sha256_process_bytes(&(const uint8_t){ 1 }, sizeof(uint8_t), &hash);
+        /* new_seed_table->seed = HASH(hash) */
+        sha256_finish_ctx(&hash, new_seed_table->seed);
+
+        err = BS->InstallConfigurationTable(MAKE_GUID_PTR(LINUX_EFI_RANDOM_SEED_TABLE), new_seed_table);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to install EFI table for random seed: %m");
+        TAKE_PTR(new_seed_table);
+
+        if (previous_seed_table) {
+                /* Now that we've succeeded in installing the new table, we can safely nuke the old one. */
+                explicit_bzero_safe(previous_seed_table->seed, previous_seed_table->size);
+                explicit_bzero_safe(previous_seed_table, sizeof(*previous_seed_table));
+                free(previous_seed_table);
+        }
+
+        return EFI_SUCCESS;
+}
diff --git a/src/boot/efi/random-seed.h b/src/boot/efi/random-seed.h
new file mode 100644
index 0000000..67f005d
--- /dev/null
+++ b/src/boot/efi/random-seed.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+EFI_STATUS process_random_seed(EFI_FILE *root_dir);
diff --git a/src/boot/efi/secure-boot.c b/src/boot/efi/secure-boot.c
new file mode 100644
index 0000000..f76d2f4
--- /dev/null
+++ b/src/boot/efi/secure-boot.c
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "console.h"
+#include "proto/security-arch.h"
+#include "secure-boot.h"
+#include "util.h"
+#include "vmm.h"
+
+/* Tells whether Secure Boot is active: true only if the global "SecureBoot" variable could be read and
+ * was reported as set. A failure to read the variable counts as "not enabled". */
+bool secure_boot_enabled(void) {
+        bool enabled = false;  /* pre-set to avoid a false maybe-uninitialized warning */
+
+        if (efivar_get_boolean_u8(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"SecureBoot", &enabled) != EFI_SUCCESS)
+                return false;
+
+        return enabled;
+}
+
+/* Determines the Secure Boot mode from the firmware's global variables. If "SecureBoot" itself cannot be
+ * read SECURE_BOOT_UNSUPPORTED is returned, otherwise the four flags are passed on to
+ * decode_secure_boot_mode() and its verdict is returned. */
+SecureBootMode secure_boot_mode(void) {
+        bool secure;
+
+        if (efivar_get_boolean_u8(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"SecureBoot", &secure) != EFI_SUCCESS)
+                return SECURE_BOOT_UNSUPPORTED;
+
+        /* The remaining variables are treated as false if they are absent (AuditMode and DeployedMode
+         * may not exist on older firmware), hence the flags start out false and read errors are
+         * deliberately ignored. */
+        bool audit = false, deployed = false, setup = false;
+
+        (void) efivar_get_boolean_u8(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"AuditMode", &audit);
+        (void) efivar_get_boolean_u8(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"DeployedMode", &deployed);
+        (void) efivar_get_boolean_u8(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"SetupMode", &setup);
+
+        return decode_secure_boot_mode(secure, audit, deployed, setup);
+}
+
+/* Enrolls Secure Boot keys found in the directory 'path' (opened relative to 'root_dir').
+ *
+ * The files db.auth, KEK.auth and PK.auth are all read first and then written, in that order, to the "db",
+ * "KEK" and "PK" variables as time-based authenticated writes. Once all three writes succeeded the machine
+ * is cold-reset, i.e. on success this function does not return.
+ *
+ * Unless we run inside a hypervisor, enrollment only takes place if 'force' is true, and only after a
+ * countdown during which any key press aborts.
+ *
+ * Returns EFI_SUCCESS if nothing was enrolled because it is not considered safe (and not forced) or because
+ * the user aborted; otherwise the status of the step that failed. Note that the screen is cleared in
+ * either case. */
+EFI_STATUS secure_boot_enroll_at(EFI_FILE *root_dir, const char16_t *path, bool force) {
+        assert(root_dir);
+        assert(path);
+
+        EFI_STATUS err;
+
+        clear_screen(COLOR_NORMAL);
+
+        /* Enrolling secure boot keys is safe to do in virtualized environments as there is nothing
+         * we can brick there. */
+        bool is_safe = in_hypervisor();
+
+        if (!is_safe && !force)
+                return EFI_SUCCESS;
+
+        printf("Enrolling secure boot keys from directory: %ls\n", path);
+
+        if (!is_safe) {
+                printf("Warning: Enrolling custom Secure Boot keys might soft-brick your machine!\n");
+
+                /* Countdown: the remaining time is decremented on every EFI_TIMEOUT reported by
+                 * console_key_read(); enrollment proceeds once a timeout is seen with the counter at zero.
+                 * NOTE(review): the 1000 * 1000 timeout is presumably one second in microseconds, matching
+                 * the "s" printed below; confirm against console_key_read(). */
+                unsigned timeout_sec = 15;
+                for (;;) {
+                        printf("\rEnrolling in %2u s, press any key to abort.", timeout_sec);
+
+                        uint64_t key;
+                        err = console_key_read(&key, 1000 * 1000);
+                        /* Not a timeout and not a key press: retry without touching the countdown. */
+                        if (err == EFI_NOT_READY)
+                                continue;
+                        if (err == EFI_TIMEOUT) {
+                                if (timeout_sec == 0) /* continue enrolling keys */
+                                        break;
+                                timeout_sec--;
+                                continue;
+                        }
+                        if (err != EFI_SUCCESS)
+                                return log_error_status(
+                                                err,
+                                                "Error waiting for user input to enroll Secure Boot keys: %m");
+
+                        /* user aborted, returning EFI_SUCCESS here allows the user to go back to the menu */
+                        return EFI_SUCCESS;
+                }
+
+                printf("\n");
+        }
+
+        _cleanup_(file_closep) EFI_FILE *dir = NULL;
+
+        err = open_directory(root_dir, path, &dir);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed opening keys directory %ls: %m", path);
+
+        /* One entry per variable to enroll: variable name, file to load it from, vendor GUID of the
+         * variable, and the loaded file contents (filled in below). The array order is the order in which
+         * the variables are written.
+         * NOTE(review): PK comes last, presumably because writing PK is what takes the firmware out of
+         * setup mode; confirm before reordering. */
+        struct {
+                const char16_t *name;
+                const char16_t *filename;
+                const EFI_GUID vendor;
+                char *buffer;
+                size_t size;
+        } sb_vars[] = {
+                { u"db",  u"db.auth",  EFI_IMAGE_SECURITY_DATABASE_GUID, NULL, 0 },
+                { u"KEK", u"KEK.auth", EFI_GLOBAL_VARIABLE, NULL, 0 },
+                { u"PK",  u"PK.auth",  EFI_GLOBAL_VARIABLE, NULL, 0 },
+        };
+
+        /* Make sure all key files exist before we start enrolling them by loading them from the disk first. */
+        for (size_t i = 0; i < ELEMENTSOF(sb_vars); i++) {
+                err = file_read(dir, sb_vars[i].filename, 0, 0, &sb_vars[i].buffer, &sb_vars[i].size);
+                if (err != EFI_SUCCESS) {
+                        log_error_status(err, "Failed reading file %ls\\%ls: %m", path, sb_vars[i].filename);
+                        goto out_deallocate;
+                }
+        }
+
+        /* Second pass: write the loaded files to their variables. If one write fails, the variables
+         * written before it are not rolled back here. */
+        for (size_t i = 0; i < ELEMENTSOF(sb_vars); i++) {
+                uint32_t sb_vars_opts =
+                        EFI_VARIABLE_NON_VOLATILE |
+                        EFI_VARIABLE_BOOTSERVICE_ACCESS |
+                        EFI_VARIABLE_RUNTIME_ACCESS |
+                        EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS;
+
+                err = efivar_set_raw(&sb_vars[i].vendor, sb_vars[i].name, sb_vars[i].buffer, sb_vars[i].size, sb_vars_opts);
+                if (err != EFI_SUCCESS) {
+                        log_error_status(err, "Failed to write %ls secure boot variable: %m", sb_vars[i].name);
+                        goto out_deallocate;
+                }
+        }
+
+        printf("Custom Secure Boot keys successfully enrolled, rebooting the system now!\n");
+        /* The system should be in secure boot mode now and we could continue a regular boot. But at least
+         * TPM PCR7 measurements should change on next boot. Reboot now so that any OS we load does not end
+         * up relying on the old PCR state. */
+        RT->ResetSystem(EfiResetCold, EFI_SUCCESS, 0, NULL);
+        assert_not_reached();
+
+        /* Error path only: release the buffers loaded so far. Entries that were never read are still
+         * NULL. */
+out_deallocate:
+        for (size_t i = 0; i < ELEMENTSOF(sb_vars); i++)
+                free(sb_vars[i].buffer);
+
+        return err;
+}
+
+/* State of the security arch protocol override set up by install_security_override(). Being a static
+ * object it starts out all-zero, which uninstall_security_override() treats as "nothing to restore". */
+static struct SecurityOverride {
+        /* Firmware protocol instances whose hooks were replaced (NULL if the protocol was not located). */
+        EFI_SECURITY_ARCH_PROTOCOL *security;
+        EFI_SECURITY2_ARCH_PROTOCOL *security2;
+        /* The firmware's own hooks, kept so that they can be chained to and put back later. */
+        EFI_SECURITY_FILE_AUTHENTICATION_STATE original_hook;
+        EFI_SECURITY2_FILE_AUTHENTICATION original_hook2;
+
+        /* Caller-provided callback consulted first by our hooks, plus the opaque pointer handed to it. */
+        security_validator_t validator;
+        const void *validator_ctx;
+} security_override;
+
+/* Our replacement for EFI_SECURITY_ARCH_PROTOCOL.FileAuthenticationState. A file accepted by the installed
+ * validator is reported as trusted right away; for everything else the decision is left to the
+ * firmware's original hook, which is called on the saved protocol instance. */
+static EFIAPI EFI_STATUS security_hook(
+                const EFI_SECURITY_ARCH_PROTOCOL *this,
+                uint32_t authentication_status,
+                const EFI_DEVICE_PATH *file) {
+
+        assert(security_override.validator);
+        assert(security_override.security);
+        assert(security_override.original_hook);
+
+        const struct SecurityOverride *ovr = &security_override;
+
+        /* This protocol variant only supplies a device path, so the validator gets no file buffer. */
+        bool trusted = ovr->validator(ovr->validator_ctx, file, NULL, 0);
+        if (!trusted)
+                return ovr->original_hook(ovr->security, authentication_status, file);
+
+        return EFI_SUCCESS;
+}
+
+/* Our replacement for EFI_SECURITY2_ARCH_PROTOCOL.FileAuthentication. The installed validator is asked
+ * first (with the device path and the file buffer as received); only if it declines is the firmware's
+ * original hook invoked on the saved protocol instance. */
+static EFIAPI EFI_STATUS security2_hook(
+                const EFI_SECURITY2_ARCH_PROTOCOL *this,
+                const EFI_DEVICE_PATH *device_path,
+                void *file_buffer,
+                size_t file_size,
+                bool boot_policy) {
+
+        assert(security_override.validator);
+        assert(security_override.security2);
+        assert(security_override.original_hook2);
+
+        const struct SecurityOverride *ovr = &security_override;
+
+        bool trusted = ovr->validator(ovr->validator_ctx, device_path, file_buffer, file_size);
+        if (!trusted)
+                return ovr->original_hook2(ovr->security2, device_path, file_buffer, file_size, boot_policy);
+
+        return EFI_SUCCESS;
+}
+
+/* Swaps the hooks of the platform provided security arch protocols (defined in the UEFI Platform
+ * Initialization Specification) for our own, which consult the given validator to decide whether an image
+ * is to be trusted. Nothing happens if secure boot is not enabled, and a protocol that cannot be located
+ * is simply left alone. Once LoadImage() has been called the override must be removed again with
+ * uninstall_security_override().
+ *
+ * This is a hack: we do not own the security protocol instances, and modifying them is not an official
+ * part of their spec. But short of implementing our own PE loader there is little else we can do to
+ * circumvent secure boot. Replacing the firmware instances with our own via ReinstallProtocolInterface()
+ * would be an alternative, but some firmware will still use the old ones. */
+void install_security_override(security_validator_t validator, const void *validator_ctx) {
+        EFI_SECURITY_ARCH_PROTOCOL *security = NULL;
+        EFI_SECURITY2_ARCH_PROTOCOL *security2 = NULL;
+
+        assert(validator);
+
+        if (!secure_boot_enabled())
+                return;
+
+        /* Reset the state; the protocol pointers and saved hooks stay NULL unless set below. */
+        security_override = (struct SecurityOverride) {
+                .validator = validator,
+                .validator_ctx = validator_ctx,
+        };
+
+        if (BS->LocateProtocol(MAKE_GUID_PTR(EFI_SECURITY_ARCH_PROTOCOL), NULL, (void **) &security) == EFI_SUCCESS) {
+                security_override.security = security;
+                security_override.original_hook = security->FileAuthenticationState;
+                security->FileAuthenticationState = security_hook;
+        }
+
+        if (BS->LocateProtocol(MAKE_GUID_PTR(EFI_SECURITY2_ARCH_PROTOCOL), NULL, (void **) &security2) == EFI_SUCCESS) {
+                security_override.security2 = security2;
+                security_override.original_hook2 = security2->FileAuthentication;
+                security2->FileAuthentication = security2_hook;
+        }
+}
+
+/* Puts the firmware's original hooks back in place. A hook for which no original was saved (its saved
+ * pointer is NULL) is left untouched. */
+void uninstall_security_override(void) {
+        const struct SecurityOverride *ovr = &security_override;
+
+        if (ovr->original_hook)
+                ovr->security->FileAuthenticationState = ovr->original_hook;
+
+        if (ovr->original_hook2)
+                ovr->security2->FileAuthentication = ovr->original_hook2;
+}
diff --git a/src/boot/efi/secure-boot.h b/src/boot/efi/secure-boot.h
new file mode 100644
index 0000000..3471131
--- /dev/null
+++ b/src/boot/efi/secure-boot.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+#include "efivars-fundamental.h"
+
+/* Policy controlling whether and how Secure Boot keys get enrolled. */
+typedef enum {
+        ENROLL_OFF,         /* no Secure Boot key enrollment whatsoever, even manual entries are not generated */
+        ENROLL_MANUAL,      /* Secure Boot key enrollment is strictly manual: manual entries are generated and need to be selected by the user */
+        ENROLL_IF_SAFE,     /* Automatically enroll if it is safe (if we are running inside a VM, for example). */
+        ENROLL_FORCE,       /* Secure Boot key enrollment may be automatic if it is available but might not be safe */
+} secure_boot_enroll;
+
+bool secure_boot_enabled(void);
+SecureBootMode secure_boot_mode(void);
+
+EFI_STATUS secure_boot_enroll_at(EFI_FILE *root_dir, const char16_t *path, bool force);
+
+/* Callback type for install_security_override(): returns true if the image is to be trusted. 'ctx' is the
+ * validator_ctx pointer given at installation time. 'file_buffer' may be NULL (with 'file_size' 0) when
+ * the firmware supplied only a device path. */
+typedef bool (*security_validator_t)(
+                const void *ctx,
+                const EFI_DEVICE_PATH *device_path,
+                const void *file_buffer,
+                size_t file_size);
+
+void install_security_override(security_validator_t validator, const void *validator_ctx);
+void uninstall_security_override(void);
diff --git a/src/boot/efi/shim.c b/src/boot/efi/shim.c
new file mode 100644
index 0000000..df136ed
--- /dev/null
+++ b/src/boot/efi/shim.c
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Port to systemd-boot
+ * Copyright © 2017 Max Resch 
+ *
+ * Security Policy Handling
+ * Copyright © 2012 
+ * https://github.com/mjg59/efitools
+ */
+
+#include "device-path-util.h"
+#include "secure-boot.h"
+#include "shim.h"
+#include "util.h"
+
+/* Calling convention for the function pointers of struct ShimLock: sysv_abi on x86, the compiler's default
+ * elsewhere.
+ * NOTE(review): presumably because that is how shim itself builds these entry points on x86; confirm
+ * against the shim sources. */
+#if defined(__x86_64__) || defined(__i386__)
+#define __sysv_abi__ __attribute__((sysv_abi))
+#else
+#define __sysv_abi__
+#endif
+
+/* Layout of the protocol interface published by shim. Only shim_verify is called in this file. */
+struct ShimLock {
+        EFI_STATUS __sysv_abi__ (*shim_verify) (const void *buffer, uint32_t size);
+
+        /* context is actually a struct for the PE header, but it isn't needed so void is sufficient just to define the interface
+         * see shim.c/shim.h and PeHeader.h in the github shim repo */
+        EFI_STATUS __sysv_abi__ (*generate_hash) (void *data, uint32_t datasize, void *context, uint8_t *sha256hash, uint8_t *sha1hash);
+
+        EFI_STATUS __sysv_abi__ (*read_header) (void *data, uint32_t datasize, void *context);
+};
+
+/* GUID of the ShimLock protocol; referenced in this file as MAKE_GUID_PTR(SHIM_LOCK). */
+#define SHIM_LOCK_GUID \
+        { 0x605dab50, 0xe046, 0x4300, { 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23 } }
+
+/* Tells whether shim is around, i.e. whether an instance of its ShimLock protocol can be located. */
+bool shim_loaded(void) {
+        struct ShimLock *unused = NULL;
+        EFI_STATUS err;
+
+        err = BS->LocateProtocol(MAKE_GUID_PTR(SHIM_LOCK), NULL, (void **) &unused);
+        return err == EFI_SUCCESS;
+}
+
+/* Validator handed to install_security_override(): an image is trusted if shim's shim_verify() accepts
+ * its contents. If no file buffer is passed in, the file is read from the file system that 'device_path'
+ * points into. Every failure along the way means "not trusted". */
+static bool shim_validate(
+                const void *ctx, const EFI_DEVICE_PATH *device_path, const void *file_buffer, size_t file_size) {
+
+        _cleanup_free_ char *loaded = NULL;
+
+        if (!file_buffer) {
+                /* No image data was handed to us, all we can work with is the device path. */
+                if (!device_path)
+                        return false;
+
+                /* Find the file system the path lives on; LocateDevicePath() updates 'remaining' to
+                 * the rest of the path, which is used as the file name below. */
+                EFI_HANDLE fs_handle;
+                EFI_DEVICE_PATH *remaining = (EFI_DEVICE_PATH *) device_path;
+                if (BS->LocateDevicePath(
+                                    MAKE_GUID_PTR(EFI_SIMPLE_FILE_SYSTEM_PROTOCOL),
+                                    &remaining,
+                                    &fs_handle) != EFI_SUCCESS)
+                        return false;
+
+                _cleanup_(file_closep) EFI_FILE *volume = NULL;
+                if (open_volume(fs_handle, &volume) != EFI_SUCCESS)
+                        return false;
+
+                _cleanup_free_ char16_t *file_path = NULL;
+                if (device_path_to_str(remaining, &file_path) != EFI_SUCCESS)
+                        return false;
+
+                if (file_read(volume, file_path, 0, 0, &loaded, &file_size) != EFI_SUCCESS)
+                        return false;
+
+                file_buffer = loaded;
+        }
+
+        struct ShimLock *shim_lock;
+        if (BS->LocateProtocol(MAKE_GUID_PTR(SHIM_LOCK), NULL, (void **) &shim_lock) != EFI_SUCCESS)
+                return false;
+
+        return shim_lock->shim_verify(file_buffer, file_size) == EFI_SUCCESS;
+}
+
+/* LoadImage() wrapper: if shim is present, the security override with shim_validate() as validator is
+ * installed for the duration of the LoadImage() call. Returns whatever LoadImage() returned; the image
+ * handle is stored in *ret_image by LoadImage(). */
+EFI_STATUS shim_load_image(EFI_HANDLE parent, const EFI_DEVICE_PATH *device_path, EFI_HANDLE *ret_image) {
+        assert(device_path);
+        assert(ret_image);
+
+        /* Without shim there is nothing to override, go to the firmware directly. */
+        if (!shim_loaded())
+                return BS->LoadImage(
+                                /*BootPolicy=*/false, parent, (EFI_DEVICE_PATH *) device_path, NULL, 0, ret_image);
+
+        install_security_override(shim_validate, NULL);
+
+        EFI_STATUS err = BS->LoadImage(
+                        /*BootPolicy=*/false, parent, (EFI_DEVICE_PATH *) device_path, NULL, 0, ret_image);
+
+        uninstall_security_override();
+
+        return err;
+}
+
+/* Sets the "ShimRetainProtocol" variable (a single byte of value 1, no attribute flags) under the
+ * ShimLock GUID. Failure to set it is ignored. */
+void shim_retain_protocol(void) {
+        /* By default, Shim uninstalls its security protocol when calling StartImage(). Ask it not to,
+         * so that we can use the protocol from sd-stub to validate PE addons. Requires Shim 15.8. */
+        uint8_t retain = 1;
+
+        (void) efivar_set_raw(MAKE_GUID_PTR(SHIM_LOCK), u"ShimRetainProtocol", &retain, sizeof(retain), 0);
+}
diff --git a/src/boot/efi/shim.h b/src/boot/efi/shim.h
new file mode 100644
index 0000000..e0cb39f
--- /dev/null
+++ b/src/boot/efi/shim.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Port to systemd-boot
+ * Copyright © 2017 Max Resch 
+ *
+ * Security Policy Handling
+ * Copyright © 2012 
+ * https://github.com/mjg59/efitools
+ */
+#pragma once
+
+#include "efi.h"
+
+bool shim_loaded(void);
+EFI_STATUS shim_load_image(EFI_HANDLE parent, const EFI_DEVICE_PATH *device_path, EFI_HANDLE *ret_image);
+void shim_retain_protocol(void);
diff --git a/src/boot/efi/splash.c b/src/boot/efi/splash.c
new file mode 100644
index 0000000..8daeb71
--- /dev/null
+++ b/src/boot/efi/splash.c
@@ -0,0 +1,334 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "graphics.h"
+#include "logarithm.h"
+#include "proto/graphics-output.h"
+#include "splash.h"
+#include "unaligned-fundamental.h"
+#include "util.h"
+
+/* BMP file header, located at the very start of the file (packed, accessed in place). */
+struct bmp_file {
+        /* Must be "BM". */
+        char signature[2];
+        /* Size of the whole file in bytes; the parser requires it to match the buffer size. */
+        uint32_t size;
+        uint16_t reserved[2];
+        /* Offset of the pixel data from the start of the file. */
+        uint32_t offset;
+} _packed_;
+
+/* Device-independent bitmap (DIB) header, directly following struct bmp_file in the file.
+ * We require at least BITMAPINFOHEADER, later versions are
+   accepted, but their features ignored */
+struct bmp_dib {
+        /* Size of this header as stored in the file; decides which of the trailing fields exist. */
+        uint32_t size;
+        /* Width (x) and height (y) of the image in pixels. */
+        uint32_t x;
+        uint32_t y;
+        uint16_t planes;
+        /* Bits per pixel; 1, 4, 8, 16, 24 and 32 are handled. */
+        uint16_t depth;
+        /* 0 is accepted for every depth, 3 additionally for 16 and 32 bit images. */
+        uint32_t compression;
+        uint32_t image_size;
+        int32_t x_pixel_meter;
+        int32_t y_pixel_meter;
+        /* Number of color table entries if non-zero; otherwise 1 << depth entries are assumed for
+         * depths 1, 4 and 8. */
+        uint32_t colors_used;
+        uint32_t colors_important;
+        /* Per-channel bit masks for 16 and 32 bit pixels; only present if 'size' is large enough, see
+         * the SIZEOF_BMP_DIB_* constants below. */
+        uint32_t channel_mask_r;
+        uint32_t channel_mask_g;
+        uint32_t channel_mask_b;
+        uint32_t channel_mask_a;
+} _packed_;
+
+/* Header sizes: without any channel masks, including the R/G/B masks, and including the alpha mask too. */
+#define SIZEOF_BMP_DIB offsetof(struct bmp_dib, channel_mask_r)
+#define SIZEOF_BMP_DIB_RGB offsetof(struct bmp_dib, channel_mask_a)
+#define SIZEOF_BMP_DIB_RGBA sizeof(struct bmp_dib)
+
+/* One entry of the color table used by 1, 4 and 8 bit images; 'reserved' is not evaluated. */
+struct bmp_map {
+        uint8_t blue;
+        uint8_t green;
+        uint8_t red;
+        uint8_t reserved;
+} _packed_;
+
+/* Validates the headers of the BMP image held in bmp[0..size) and, on success, returns pointers into that
+ * buffer: the DIB header in *ret_dib, the position right behind the DIB header (where the color table, if
+ * there is one, is located) in *ret_map, and the start of the pixel data in *pixmap.
+ *
+ * Returns EFI_INVALID_PARAMETER if the file is truncated or its headers are inconsistent, and
+ * EFI_UNSUPPORTED if the DIB header is smaller than SIZEOF_BMP_DIB or the combination of bit depth and
+ * compression is not handled. The output parameters are only written on success.
+ *
+ * NOTE(review): for 1/4/8 bit images nothing here guarantees that the color table has an entry for every
+ * possible pixel value (it may even be missing entirely); verify how the consumer copes with that. */
+static EFI_STATUS bmp_parse_header(
+                const uint8_t *bmp,
+                size_t size,
+                struct bmp_dib **ret_dib,
+                struct bmp_map **ret_map,
+                const uint8_t **pixmap) {
+
+        assert(bmp);
+        assert(ret_dib);
+        assert(ret_map);
+        assert(pixmap);
+
+        if (size < sizeof(struct bmp_file) + SIZEOF_BMP_DIB)
+                return EFI_INVALID_PARAMETER;
+
+        /* check file header */
+        struct bmp_file *file = (struct bmp_file *) bmp;
+        if (file->signature[0] != 'B' || file->signature[1] != 'M')
+                return EFI_INVALID_PARAMETER;
+        if (file->size != size)
+                return EFI_INVALID_PARAMETER;
+        if (file->size < file->offset)
+                return EFI_INVALID_PARAMETER;
+
+        /*  check device-independent bitmap */
+        struct bmp_dib *dib = (struct bmp_dib *) (bmp + sizeof(struct bmp_file));
+        if (dib->size < SIZEOF_BMP_DIB)
+                return EFI_UNSUPPORTED;
+
+        switch (dib->depth) {
+        case 1:
+        case 4:
+        case 8:
+        case 24:
+                if (dib->compression != 0)
+                        return EFI_UNSUPPORTED;
+
+                break;
+
+        case 16:
+        case 32:
+                if (dib->compression != 0 && dib->compression != 3)
+                        return EFI_UNSUPPORTED;
+
+                break;
+
+        default:
+                return EFI_UNSUPPORTED;
+        }
+
+        /* Size of a single row in bytes; rows are padded to a multiple of 4 bytes. Calculated in 64 bit so
+         * that depth * x cannot wrap around even where size_t is only 32 bit wide. */
+        uint64_t row_size = ((uint64_t) dib->depth * dib->x + 31) / 32 * 4;
+
+        /* Refuse images with more than 64 MiB of pixel data. The limit is tested by division: for bogus
+         * dimensions row_size * y does not even fit into 64 bit, and a wrapped-around product would slip
+         * through both this check and the one against the file size below. */
+        const uint64_t pixel_data_max = 64U * 1024U * 1024U;
+        if (dib->y != 0 && row_size > pixel_data_max / dib->y)
+                return EFI_INVALID_PARAMETER;
+
+        /* From here on row_size * y is known to be at most 64 MiB. The pixel data must fit in the file. */
+        if (file->size - file->offset < row_size * dib->y)
+                return EFI_INVALID_PARAMETER;
+
+        /* check color table: the pixel data must not start inside the headers ... */
+        if (file->offset < sizeof(struct bmp_file) + dib->size)
+                return EFI_INVALID_PARAMETER;
+        struct bmp_map *map = (struct bmp_map *) (bmp + sizeof(struct bmp_file) + dib->size);
+
+        /* ... and if there is room between DIB header and pixel data it must be exactly a color table. */
+        if (file->offset > sizeof(struct bmp_file) + dib->size) {
+                uint32_t map_count = 0;
+
+                if (dib->colors_used)
+                        map_count = dib->colors_used;
+                else if (IN_SET(dib->depth, 1, 4, 8))
+                        map_count = 1 << dib->depth;
+
+                size_t map_size = file->offset - (sizeof(struct bmp_file) + dib->size);
+                if (map_size != sizeof(struct bmp_map) * map_count)
+                        return EFI_INVALID_PARAMETER;
+        }
+
+        *ret_map = map;
+        *ret_dib = dib;
+        *pixmap = bmp + file->offset;
+
+        return EFI_SUCCESS;
+}
+
+enum Channels { R, G, B, A, _CHANNELS_MAX };
+
+/* Derives, for one color channel given by its bit mask, the right-shift that moves the channel value down
+ * to bit 0 and the factor that scales it up to the 0..0xff range.
+ *
+ * A zero mask (which a file is free to contain) yields shift 0 and scale 0, i.e. the channel always
+ * evaluates to 0: __builtin_ctz() is undefined for 0 and the scale computation would divide by zero. */
+static void channel_from_mask(uint32_t mask, uint32_t *ret_mask, uint8_t *ret_shift, uint8_t *ret_scale) {
+        assert(ret_mask);
+        assert(ret_shift);
+        assert(ret_scale);
+
+        *ret_mask = mask;
+
+        if (mask == 0) {
+                *ret_shift = 0;
+                *ret_scale = 0;
+                return;
+        }
+
+        *ret_shift = __builtin_ctz(mask);
+        /* Shift in 64 bit, so that masks with 31 or 32 bits set don't overflow. Channels wider than
+         * 8 bit end up with a scale of 0, as before. */
+        *ret_scale = 0xff / (((uint64_t) 1 << popcount(mask)) - 1);
+}
+
+/* Fills in mask, shift and scale factor of each color channel for 16 and 32 bit pixels: taken from the
+ * DIB header if it is large enough to carry channel masks, fixed defaults otherwise (5 bit per channel for
+ * 16 bit images, 8 bit per channel for everything else; no alpha channel in either case). A channel whose
+ * mask is zero gets shift 0 and scale 0. */
+static void read_channel_maks(
+                const struct bmp_dib *dib,
+                uint32_t channel_mask[static _CHANNELS_MAX],
+                uint8_t channel_shift[static _CHANNELS_MAX],
+                uint8_t channel_scale[static _CHANNELS_MAX]) {
+
+        assert(dib);
+
+        if (IN_SET(dib->depth, 16, 32) && dib->size >= SIZEOF_BMP_DIB_RGB) {
+                channel_from_mask(dib->channel_mask_r, &channel_mask[R], &channel_shift[R], &channel_scale[R]);
+                channel_from_mask(dib->channel_mask_g, &channel_mask[G], &channel_shift[G], &channel_scale[G]);
+                channel_from_mask(dib->channel_mask_b, &channel_mask[B], &channel_shift[B], &channel_scale[B]);
+
+                /* The alpha mask only exists in the largest header variant; otherwise treat it as unset. */
+                uint32_t alpha_mask = dib->size >= SIZEOF_BMP_DIB_RGBA ? dib->channel_mask_a : 0;
+                channel_from_mask(alpha_mask, &channel_mask[A], &channel_shift[A], &channel_scale[A]);
+        } else {
+                bool bpp16 = dib->depth == 16;
+                channel_mask[R] = bpp16 ? 0x7C00 : 0xFF0000;
+                channel_mask[G] = bpp16 ? 0x03E0 : 0x00FF00;
+                channel_mask[B] = bpp16 ? 0x001F : 0x0000FF;
+                channel_mask[A] = bpp16 ? 0x0000 : 0x000000;
+                channel_shift[R] = bpp16 ? 0xA : 0x10;
+                channel_shift[G] = bpp16 ? 0x5 : 0x08;
+                channel_shift[B] = bpp16 ? 0x0 : 0x00;
+                channel_shift[A] = bpp16 ? 0x0 : 0x00;
+                channel_scale[R] = bpp16 ? 0x08 : 0x1;
+                channel_scale[G] = bpp16 ? 0x08 : 0x1;
+                channel_scale[B] = bpp16 ? 0x08 : 0x1;
+                channel_scale[A] = bpp16 ? 0x00 : 0x0;
+        }
+}
+
+/* Converts the pixel data at 'pixmap' into BLT pixels in 'buf', as described by 'dib'; 'map' is the color
+ * table used for 1, 4 and 8 bit images. 'buf' is indexed as dib->x * dib->y pixels. The first row read
+ * from 'pixmap' is written to the last row of 'buf', i.e. the row order is flipped.
+ *
+ * For 16 and 32 bit images the pixels are alpha-blended with what 'buf' already contains, so the caller
+ * has to fill it beforehand. Always returns EFI_SUCCESS.
+ *
+ * NOTE(review): color table indexes taken from the pixel data are not range-checked here; verify that the
+ * caller guarantees the table is large enough. */
+static EFI_STATUS bmp_to_blt(
+                EFI_GRAPHICS_OUTPUT_BLT_PIXEL *buf,
+                struct bmp_dib *dib,
+                struct bmp_map *map,
+                const uint8_t *pixmap) {
+
+        const uint8_t *in;
+
+        assert(buf);
+        assert(dib);
+        assert(map);
+        assert(pixmap);
+
+        uint32_t channel_mask[_CHANNELS_MAX];
+        uint8_t channel_shift[_CHANNELS_MAX], channel_scale[_CHANNELS_MAX];
+        read_channel_maks(dib, channel_mask, channel_shift, channel_scale);
+
+        /* transform and copy pixels */
+        in = pixmap;
+        for (uint32_t y = 0; y < dib->y; y++) {
+                /* Input row y goes to output row (dib->y - y - 1). */
+                EFI_GRAPHICS_OUTPUT_BLT_PIXEL *out = &buf[(dib->y - y - 1) * dib->x];
+
+                /* The loop header advances input by one byte and output by one pixel per iteration; the
+                 * cases below only account for whatever they consume or produce beyond that. */
+                for (uint32_t x = 0; x < dib->x; x++, in++, out++) {
+                        switch (dib->depth) {
+                        case 1: {
+                                /* One input byte holds up to 8 pixels, most significant bit first. */
+                                for (unsigned i = 0; i < 8 && x < dib->x; i++) {
+                                        out->Red = map[((*in) >> (7 - i)) & 1].red;
+                                        out->Green = map[((*in) >> (7 - i)) & 1].green;
+                                        out->Blue = map[((*in) >> (7 - i)) & 1].blue;
+                                        out++;
+                                        x++;
+                                }
+                                /* Undo one step, the enclosing loop header advances once more. */
+                                out--;
+                                x--;
+                                break;
+                        }
+
+                        case 4: {
+                                /* Two pixels per byte: high nibble first, then (unless this was the last
+                                 * pixel of the row) the low nibble. */
+                                unsigned i = (*in) >> 4;
+                                out->Red = map[i].red;
+                                out->Green = map[i].green;
+                                out->Blue = map[i].blue;
+                                if (x < (dib->x - 1)) {
+                                        out++;
+                                        x++;
+                                        i = (*in) & 0x0f;
+                                        out->Red = map[i].red;
+                                        out->Green = map[i].green;
+                                        out->Blue = map[i].blue;
+                                }
+                                break;
+                        }
+
+                        case 8:
+                                out->Red = map[*in].red;
+                                out->Green = map[*in].green;
+                                out->Blue = map[*in].blue;
+                                break;
+
+                        case 24:
+                                /* Three bytes per pixel, stored as blue, green, red. */
+                                out->Red = in[2];
+                                out->Green = in[1];
+                                out->Blue = in[0];
+                                in += 2;
+                                break;
+
+                        case 16:
+                        case 32: {
+                                uint32_t i = dib->depth == 16 ? unaligned_read_ne16(in) :
+                                                                unaligned_read_ne32(in);
+
+                                /* Extract each channel via mask and shift and scale it up; without an
+                                 * alpha mask the pixel counts as fully opaque. */
+                                uint8_t r = ((i & channel_mask[R]) >> channel_shift[R]) * channel_scale[R],
+                                        g = ((i & channel_mask[G]) >> channel_shift[G]) * channel_scale[G],
+                                        b = ((i & channel_mask[B]) >> channel_shift[B]) * channel_scale[B],
+                                        a = 0xFFu;
+                                if (channel_mask[A] != 0)
+                                        a = ((i & channel_mask[A]) >> channel_shift[A]) * channel_scale[A];
+
+                                /* Blend with the pixel already present in the output buffer. */
+                                out->Red = (out->Red * (0xFFu - a) + r * a) >> 8;
+                                out->Green = (out->Green * (0xFFu - a) + g * a) >> 8;
+                                out->Blue = (out->Blue * (0xFFu - a) + b * a) >> 8;
+
+                                in += dib->depth == 16 ? 1 : 3;
+                                break;
+                        }
+                        }
+                }
+
+                /* add row padding; new lines always start at 32 bit boundary */
+                /* (row_size here is the number of input bytes consumed so far, counted from 'pixmap') */
+                size_t row_size = in - pixmap;
+                in += ((row_size + 3) & ~3) - row_size;
+        }
+
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS graphics_splash(const uint8_t *content, size_t len) {
+        /* Paints the BMP held in content[0..len) centered on a freshly filled screen; len == 0 is a successful no-op. */
+        EFI_GRAPHICS_OUTPUT_PROTOCOL *gop = NULL;
+        EFI_GRAPHICS_OUTPUT_BLT_PIXEL fill = {};
+        const uint8_t *pixels;
+        struct bmp_map *palette;
+        struct bmp_dib *hdr;
+        EFI_STATUS err;
+
+        if (len == 0)
+                return EFI_SUCCESS;
+
+        assert(content);
+
+        /* Firmware identifying as "Apple" gets a 0xc0 grey backdrop instead of the zero-initialized pixel. */
+        if (strcaseeq16(ST->FirmwareVendor, u"Apple"))
+                fill.Red = fill.Green = fill.Blue = 0xc0;
+
+        err = BS->LocateProtocol(MAKE_GUID_PTR(EFI_GRAPHICS_OUTPUT_PROTOCOL), NULL, (void **) &gop);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = bmp_parse_header(content, len, &hdr, &palette, &pixels);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Center the image; one that is at least as large as the screen is anchored at the origin. */
+        size_t x_pos = hdr->x < gop->Mode->Info->HorizontalResolution ?
+                        (gop->Mode->Info->HorizontalResolution - hdr->x) / 2 : 0;
+        size_t y_pos = hdr->y < gop->Mode->Info->VerticalResolution ?
+                        (gop->Mode->Info->VerticalResolution - hdr->y) / 2 : 0;
+
+        err = gop->Blt(
+                        gop, &fill,
+                        EfiBltVideoFill, 0, 0, 0, 0,
+                        gop->Mode->Info->HorizontalResolution,
+                        gop->Mode->Info->VerticalResolution, 0);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Fetch what is on screen now, so that bmp_to_blt() can alpha-blend the image onto it. */
+        _cleanup_free_ EFI_GRAPHICS_OUTPUT_BLT_PIXEL *canvas = xnew(
+                        EFI_GRAPHICS_OUTPUT_BLT_PIXEL, hdr->x * hdr->y);
+        err = gop->Blt(
+                        gop, canvas,
+                        EfiBltVideoToBltBuffer, x_pos, y_pos, 0, 0,
+                        hdr->x, hdr->y, 0);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = bmp_to_blt(canvas, hdr, palette, pixels);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = graphics_mode(true);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Finally push the composed pixels back to the very same screen rectangle. */
+        return gop->Blt(
+                        gop, canvas,
+                        EfiBltBufferToVideo, 0, 0, x_pos, y_pos,
+                        hdr->x, hdr->y, 0);
+}
diff --git a/src/boot/efi/splash.h b/src/boot/efi/splash.h
new file mode 100644
index 0000000..a66eb24
--- /dev/null
+++ b/src/boot/efi/splash.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+/* Draws the BMP image in content[0..len) as splash screen; a zero len is a successful no-op. */
+EFI_STATUS graphics_splash(const uint8_t *content, size_t len);
diff --git a/src/boot/efi/stub.c b/src/boot/efi/stub.c
new file mode 100644
index 0000000..7ef3e76
--- /dev/null
+++ b/src/boot/efi/stub.c
@@ -0,0 +1,816 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cpio.h"
+#include "device-path-util.h"
+#include "devicetree.h"
+#include "graphics.h"
+#include "linux.h"
+#include "measure.h"
+#include "memory-util-fundamental.h"
+#include "part-discovery.h"
+#include "pe.h"
+#include "proto/shell-parameters.h"
+#include "random-seed.h"
+#include "sbat.h"
+#include "secure-boot.h"
+#include "shim.h"
+#include "splash.h"
+#include "tpm2-pcr.h"
+#include "uki.h"
+#include "util.h"
+#include "version.h"
+#include "vmm.h"
+
+/* magic string to find in the binary image */
+DECLARE_NOALLOC_SECTION(".sdmagic", "#### LoaderInfo: systemd-stub " GIT_VERSION " ####");
+/* NOTE(review): presumably embeds the SBAT metadata text for the stub - confirm against sbat.h */
+DECLARE_SBAT(SBAT_STUB_SECTION_TEXT);
+
+static EFI_STATUS combine_initrd(
+                EFI_PHYSICAL_ADDRESS initrd_base, size_t initrd_size,
+                const void * const extra_initrds[], const size_t extra_initrd_sizes[], size_t n_extra_initrds,
+                Pages *ret_initr_pages, size_t *ret_initrd_size) {
+
+        /* Concatenates the main initrd (if initrd_base is non-zero) and every non-NULL extra initrd into
+         * one freshly allocated region. The result and its byte size are returned via the ret_* pointers. */
+
+        assert(ret_initr_pages);
+        assert(ret_initrd_size);
+
+        /* The main initrd might not be padded yet, hence account for it rounded up to 4 bytes. */
+        size_t total = ALIGN4(initrd_size);
+
+        /* First pass: sum up the sizes, refusing anything that would wrap around size_t. */
+        for (size_t i = 0; i < n_extra_initrds; i++)
+                if (extra_initrds[i]) {
+                        if (total > SIZE_MAX - extra_initrd_sizes[i])
+                                return EFI_OUT_OF_RESOURCES;
+
+                        total += extra_initrd_sizes[i];
+                }
+
+        _cleanup_pages_ Pages pages = xmalloc_pages(
+                        AllocateMaxAddress,
+                        EfiLoaderData,
+                        EFI_SIZE_TO_PAGES(total),
+                        UINT32_MAX /* Below 4G boundary. */);
+        uint8_t *cursor = PHYSICAL_ADDRESS_TO_POINTER(pages.addr);
+
+        if (initrd_base != 0) {
+                /* Order matters, the real initrd must come first, since it might include microcode updates
+                 * which the kernel only looks for in the first cpio archive */
+                cursor = mempcpy(cursor, PHYSICAL_ADDRESS_TO_POINTER(initrd_base), initrd_size);
+
+                /* Zero-fill up to the 4 byte boundary that was reserved above. */
+                size_t pad = ALIGN4(initrd_size) - initrd_size;
+                if (pad > 0)  {
+                        memzero(cursor, pad);
+                        cursor += pad;
+                }
+        }
+
+        /* Second pass: append the extra initrds back to back, in array order. */
+        for (size_t i = 0; i < n_extra_initrds; i++)
+                if (extra_initrds[i])
+                        cursor = mempcpy(cursor, extra_initrds[i], extra_initrd_sizes[i]);
+
+        /* Both passes must agree on the total. */
+        assert(PHYSICAL_ADDRESS_TO_POINTER(pages.addr + total) == cursor);
+
+        /* Hand the allocation over to the caller. n_pages is zeroed afterwards, presumably so that the
+         * _cleanup_pages_ handler leaves the memory alone - TODO confirm. */
+        *ret_initr_pages = pages;
+        *ret_initrd_size = total;
+        pages.n_pages = 0;
+
+        return EFI_SUCCESS;
+}
+
+static void export_variables(EFI_LOADED_IMAGE_PROTOCOL *loaded_image) {
+        static const uint64_t features =
+                EFI_STUB_FEATURE_REPORT_BOOT_PARTITION |    /* We set LoaderDevicePartUUID */
+                EFI_STUB_FEATURE_PICK_UP_CREDENTIALS |      /* We pick up credentials from the boot partition */
+                EFI_STUB_FEATURE_PICK_UP_SYSEXTS |          /* We pick up system extensions from the boot partition */
+                EFI_STUB_FEATURE_THREE_PCRS |               /* We can measure kernel image, parameters and sysext */
+                EFI_STUB_FEATURE_RANDOM_SEED |              /* We pass a random seed to the kernel */
+                EFI_STUB_FEATURE_CMDLINE_ADDONS |           /* We pick up .cmdline addons */
+                EFI_STUB_FEATURE_CMDLINE_SMBIOS |           /* We support extending kernel cmdline from SMBIOS Type #11 */
+                EFI_STUB_FEATURE_DEVICETREE_ADDONS |        /* We pick up .dtb addons */
+                0;
+
+        assert(loaded_image);
+
+        /* Export the UUID of the partition this image was started from, unless the variable can already be read */
+        if (efivar_get_raw(MAKE_GUID_PTR(LOADER), u"LoaderDevicePartUUID", NULL, NULL) != EFI_SUCCESS) {
+                _cleanup_free_ char16_t *part_uuid = disk_get_part_uuid(loaded_image->DeviceHandle);
+                if (part_uuid)
+                        (void) efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderDevicePartUUID", part_uuid, 0);
+        }
+
+        /* If LoaderImageIdentifier is not set, assume the image with this stub was loaded directly from the
+         * UEFI firmware without any boot loader, and hence set the LoaderImageIdentifier ourselves. Note
+         * that some boot chain loaders neither set LoaderImageIdentifier nor make FilePath available to us,
+         * in which case there's simply nothing to set for us. (The UEFI spec doesn't really say who's wrong
+         * here, i.e. whether FilePath may be NULL or not, hence handle this gracefully and check if FilePath
+         * is non-NULL explicitly.) */
+        if (efivar_get_raw(MAKE_GUID_PTR(LOADER), u"LoaderImageIdentifier", NULL, NULL) != EFI_SUCCESS &&
+            loaded_image->FilePath) {
+                _cleanup_free_ char16_t *path = NULL;
+                if (device_path_to_str(loaded_image->FilePath, &path) == EFI_SUCCESS)
+                        (void) efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderImageIdentifier", path, 0);
+        }
+
+        /* Likewise for LoaderFirmwareInfo: vendor string plus the two 16 bit halves of the firmware revision */
+        if (efivar_get_raw(MAKE_GUID_PTR(LOADER), u"LoaderFirmwareInfo", NULL, NULL) != EFI_SUCCESS) {
+                _cleanup_free_ char16_t *info = xasprintf(
+                                "%ls %u.%02u", ST->FirmwareVendor, ST->FirmwareRevision >> 16, ST->FirmwareRevision & 0xffff);
+                (void) efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderFirmwareInfo", info, 0);
+        }
+
+        /* Ditto for LoaderFirmwareType, derived from the revision field of the system table header */
+        if (efivar_get_raw(MAKE_GUID_PTR(LOADER), u"LoaderFirmwareType", NULL, NULL) != EFI_SUCCESS) {
+                _cleanup_free_ char16_t *type = xasprintf(
+                                "UEFI %u.%02u", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff);
+                (void) efivar_set(MAKE_GUID_PTR(LOADER), u"LoaderFirmwareType", type, 0);
+        }
+
+        /* Add StubInfo (this one is owned by the stub, hence we unconditionally override it with our
+         * own data) */
+        (void) efivar_set(MAKE_GUID_PTR(LOADER), u"StubInfo", u"systemd-stub " GIT_VERSION, 0);
+
+        /* StubFeatures is written unconditionally as well, carrying the feature mask from above */
+        (void) efivar_set_uint64_le(MAKE_GUID_PTR(LOADER), u"StubFeatures", features, 0);
+}
+
+static bool use_load_options(
+                EFI_HANDLE stub_image,
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image,
+                bool have_cmdline,
+                char16_t **ret) {
+
+        assert(stub_image);
+        assert(loaded_image);
+        assert(ret);
+
+        /* Custom command lines are refused under secure boot whenever a cmdline is baked into the stub
+         * image or we run inside a confidential VM. On refusal false is returned and *ret is left
+         * untouched; on success *ret receives a newly allocated, mangled command line. */
+        if (secure_boot_enabled() && (have_cmdline || is_confidential_vm()))
+                return false;
+
+        /* Superficial sanity check: there must be at least one character and it must be printable
+         * (for compat with some Dell systems which fill in garbage?). */
+        if (loaded_image->LoadOptionsSize < sizeof(char16_t))
+                return false;
+        if (((char16_t *) loaded_image->LoadOptions)[0] <= 0x1F)
+                return false;
+
+        /* The UEFI shell registers EFI_SHELL_PARAMETERS_PROTOCOL onto images it runs. This lets us know that
+         * LoadOptions starts with the stub binary path which we want to strip off. */
+        EFI_SHELL_PARAMETERS_PROTOCOL *shell;
+        char16_t *options;
+        if (BS->HandleProtocol(stub_image, MAKE_GUID_PTR(EFI_SHELL_PARAMETERS_PROTOCOL), (void **) &shell)
+            != EFI_SUCCESS)
+                /* Not running from EFI shell, use entire LoadOptions. Note that LoadOptions is a void*, so
+                 * it could be anything! */
+                options = xstrndup16(loaded_image->LoadOptions, loaded_image->LoadOptionsSize / sizeof(char16_t));
+        else {
+                /* No arguments were provided? Then we fall back to built-in cmdline. */
+                if (shell->Argc < 2)
+                        return false;
+                /* Assemble the command line ourselves without our stub path. */
+                options = xstrdup16(shell->Argv[1]);
+                for (size_t i = 2; i < shell->Argc; i++) {
+                        _cleanup_free_ char16_t *head = options;
+                        options = xasprintf("%ls %ls", head, shell->Argv[i]);
+                }
+        }
+        mangle_stub_cmdline(options);
+        *ret = options;
+        return true;
+}
+
+static EFI_STATUS load_addons_from_dir(
+                EFI_FILE *root,
+                const char16_t *prefix,
+                char16_t ***items,
+                size_t *n_items,
+                size_t *n_allocated) {
+        /* Appends a copy of the name of every non-directory entry matching "*.addon.efi" in directory
+         * 'prefix' below 'root' to the NULL-terminated array *items. *n_items counts the names stored,
+         * *n_allocated the slots allocated. A missing directory is not an error. */
+
+        _cleanup_(file_closep) EFI_FILE *extra_dir = NULL;
+        _cleanup_free_ EFI_FILE_INFO *dirent = NULL;
+        size_t dirent_size = 0;
+        EFI_STATUS err;
+
+        assert(root);
+        assert(prefix);
+        assert(items);
+        assert(n_items);
+        assert(n_allocated);
+
+        err = open_directory(root, prefix, &extra_dir);
+        if (err == EFI_NOT_FOUND)
+                /* No extra subdir, that's totally OK */
+                return EFI_SUCCESS;
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Failed to open addons directory '%ls': %m", prefix);
+
+        for (;;) {
+                err = readdir(extra_dir, &dirent, &dirent_size);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to read addons directory of loaded image: %m");
+                if (!dirent) /* End of directory */
+                        break;
+
+                if (dirent->FileName[0] == '.')
+                        continue;
+                if (FLAGS_SET(dirent->Attribute, EFI_FILE_DIRECTORY))
+                        continue;
+                if (!is_ascii(dirent->FileName))
+                        continue;
+                if (strlen16(dirent->FileName) > 255) /* Max filename size on Linux */
+                        continue;
+                if (!endswith_no_case(dirent->FileName, u".addon.efi"))
+                        continue;
+
+                _cleanup_free_ char16_t *d = xstrdup16(dirent->FileName);
+
+                if (*n_items + 2 > *n_allocated) { /* Need room for the new entry plus the trailing NULL */
+                        /* We grow by 16 entries at a time. The overflow check is done in units of the pointer-sized elements. */
+                        if (*n_items > (SIZE_MAX / sizeof(char16_t *)) - 16)
+                                return log_oom();
+                        size_t m = *n_items + 16;
+                        *items = xrealloc(*items, *n_allocated * sizeof(char16_t *), m * sizeof(char16_t *));
+                        *n_allocated = m;
+                }
+
+                (*items)[(*n_items)++] = TAKE_PTR(d);
+                (*items)[*n_items] = NULL; /* Let's always NUL terminate, to make freeing via strv_free() easy */
+        }
+
+        return EFI_SUCCESS;
+}
+
+static void cmdline_append_and_measure_addons(
+                char16_t *cmdline_global,
+                char16_t *cmdline_uki,
+                char16_t **cmdline_append,
+                bool *ret_parameters_measured) {
+
+        /* Joins the global and per-UKI addon command lines, measures the result and appends it to
+         * *cmdline_append. If there is nothing to add, both output parameters are left unmodified. */
+
+        assert(cmdline_append);
+        assert(ret_parameters_measured);
+
+        if (isempty(cmdline_global) && isempty(cmdline_uki))
+                return;
+
+        /* A separating space is only needed if both parts are present */
+        const char16_t *sep = isempty(cmdline_global) || isempty(cmdline_uki) ? u"" : u" ";
+        _cleanup_free_ char16_t *addons = xasprintf(
+                        "%ls%ls%ls", strempty(cmdline_global), sep, strempty(cmdline_uki));
+
+        mangle_stub_cmdline(addons);
+        if (isempty(addons))
+                return;
+
+        bool measured = false;
+        (void) tpm_log_load_options(addons, &measured);
+        *ret_parameters_measured = measured;
+
+        _cleanup_free_ char16_t *previous = TAKE_PTR(*cmdline_append);
+        *cmdline_append = xasprintf("%ls%ls%ls", strempty(previous), isempty(previous) ? u"" : u" ", addons);
+}
+
+static void dtb_install_addons(
+                struct devicetree_state *dt_state,
+                void **dt_bases,
+                size_t *dt_sizes,
+                char16_t **dt_filenames,
+                size_t n_dts,
+                bool *ret_parameters_measured) {
+
+        /* Tri-state: negative until the first measurement happened, afterwards the AND of all results */
+        int all_measured = -1;
+
+        assert(dt_state);
+        assert(n_dts == 0 || (dt_bases && dt_sizes && dt_filenames));
+        assert(ret_parameters_measured);
+
+        for (size_t i = 0; i < n_dts; ++i) {
+                EFI_STATUS err;
+                bool m = false;
+
+                err = devicetree_install_from_memory(dt_state, dt_bases[i], dt_sizes[i]);
+                if (err != EFI_SUCCESS) {
+                        log_error_status(err, "Error loading addon devicetree, ignoring: %m");
+                        continue;
+                }
+
+                /* Only devicetrees that were installed successfully get measured */
+                err = tpm_log_tagged_event(
+                                TPM2_PCR_KERNEL_CONFIG, POINTER_TO_PHYSICAL_ADDRESS(dt_bases[i]), dt_sizes[i],
+                                DEVICETREE_ADDON_EVENT_TAG_ID, dt_filenames[i], &m);
+                if (err != EFI_SUCCESS) {
+                        /* NB: this bails out without touching *ret_parameters_measured */
+                        log_error_status(err, "Unable to add measurement of DTB addon #%zu to PCR %i: %m",
+                                         i, TPM2_PCR_KERNEL_CONFIG);
+                        return;
+                }
+
+                all_measured = all_measured < 0 ? m : (all_measured && m);
+        }
+
+        /* NB: int to bool conversion, i.e. the "nothing measured" state (-1) is reported as true */
+        *ret_parameters_measured = all_measured != 0;
+}
+
+static void dt_bases_free(void **dt_bases, size_t n_dt) {
+        /* Releases the n_dt buffers referenced by dt_bases, then the pointer array itself. */
+        assert(dt_bases || n_dt == 0);
+
+        for (void **entry = dt_bases; n_dt > 0; n_dt--, entry++)
+                free(*entry);
+        free(dt_bases);
+}
+
+static void dt_filenames_free(char16_t **dt_filenames, size_t n_dt) {
+        /* Releases the n_dt strings referenced by dt_filenames, then the pointer array itself. */
+        assert(dt_filenames || n_dt == 0);
+
+        for (char16_t **entry = dt_filenames; n_dt > 0; n_dt--, entry++)
+                free(*entry);
+        free(dt_filenames);
+}
+
+static EFI_STATUS load_addons(
+                EFI_HANDLE stub_image,
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_image,
+                const char16_t *prefix,
+                const char *uname,
+                char16_t **ret_cmdline,
+                void ***ret_dt_bases,
+                size_t **ret_dt_sizes,
+                char16_t ***ret_dt_filenames,
+                size_t *ret_n_dt) {
+        /* Loads every "*.addon.efi" in 'prefix' on the volume of loaded_image->DeviceHandle and collects their .cmdline/.dtb payloads. */
+        _cleanup_free_ size_t *dt_sizes = NULL;
+        _cleanup_(strv_freep) char16_t **items = NULL;
+        _cleanup_(file_closep) EFI_FILE *root = NULL;
+        _cleanup_free_ char16_t *cmdline = NULL;
+        size_t n_items = 0, n_allocated = 0, n_dt = 0;
+        char16_t **dt_filenames = NULL;
+        void **dt_bases = NULL;
+        EFI_STATUS err;
+
+        assert(stub_image);
+        assert(loaded_image);
+        assert(prefix);
+        assert(!!ret_dt_bases == !!ret_dt_sizes);
+        assert(!!ret_dt_bases == !!ret_n_dt);
+        assert(!!ret_dt_filenames == !!ret_n_dt);
+        /* Without a device handle there is no volume to search, which is not treated as an error. */
+        if (!loaded_image->DeviceHandle)
+                return EFI_SUCCESS;
+        /* NOTE(review): presumably arranges for both arrays to be released via the given helpers at scope exit - confirm against CLEANUP_ARRAY. */
+        CLEANUP_ARRAY(dt_bases, n_dt, dt_bases_free);
+        CLEANUP_ARRAY(dt_filenames, n_dt, dt_filenames_free);
+
+        err = open_volume(loaded_image->DeviceHandle, &root);
+        if (err == EFI_UNSUPPORTED)
+                /* Error will be unsupported if the bootloader doesn't implement the file system protocol on
+                 * its file handles. */
+                return EFI_SUCCESS;
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Unable to open root directory: %m");
+
+        err = load_addons_from_dir(root, prefix, &items, &n_items, &n_allocated);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        if (n_items == 0)
+                return EFI_SUCCESS; /* Empty directory */
+
+        /* Now, sort the files we found, to make this uniform and stable (and to ensure the TPM measurements
+         * are not dependent on read order) */
+        sort_pointer_array((void**) items, n_items, (compare_pointer_func_t) strcmp16);
+
+        for (size_t i = 0; i < n_items; i++) {
+                size_t addrs[_UNIFIED_SECTION_MAX] = {}, szs[_UNIFIED_SECTION_MAX] = {};
+                _cleanup_free_ EFI_DEVICE_PATH *addon_path = NULL;
+                _cleanup_(unload_imagep) EFI_HANDLE addon = NULL;
+                EFI_LOADED_IMAGE_PROTOCOL *loaded_addon = NULL;
+                _cleanup_free_ char16_t *addon_spath = NULL;
+
+                addon_spath = xasprintf("%ls\\%ls", prefix, items[i]);
+                err = make_file_device_path(loaded_image->DeviceHandle, addon_spath, &addon_path);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Error making device path for %ls: %m", addon_spath);
+
+                /* By using shim_load_image, we cover both the case where the PE files are signed with MoK
+                 * and with DB, and running with or without shim. */
+                err = shim_load_image(stub_image, addon_path, &addon);
+                if (err != EFI_SUCCESS) {
+                        log_error_status(err,
+                                         "Failed to read '%ls' from '%ls', ignoring: %m",
+                                         items[i],
+                                         addon_spath);
+                        continue;
+                }
+                /* Unlike the per-addon problems that are skipped, a failing protocol lookup aborts the whole scan. */
+                err = BS->HandleProtocol(addon,
+                                         MAKE_GUID_PTR(EFI_LOADED_IMAGE_PROTOCOL),
+                                         (void **) &loaded_addon);
+                if (err != EFI_SUCCESS)
+                        return log_error_status(err, "Failed to find protocol in %ls: %m", items[i]);
+
+                err = pe_memory_locate_sections(loaded_addon->ImageBase, unified_sections, addrs, szs);
+                if (err != EFI_SUCCESS ||
+                    (szs[UNIFIED_SECTION_CMDLINE] == 0 && szs[UNIFIED_SECTION_DTB] == 0)) {
+                        if (err == EFI_SUCCESS)
+                                err = EFI_NOT_FOUND;
+                        log_error_status(err,
+                                         "Unable to locate embedded .cmdline/.dtb sections in %ls, ignoring: %m",
+                                         items[i]);
+                        continue;
+                }
+
+                /* We want to enforce that addons are not UKIs, i.e.: they must not embed a kernel. */
+                if (szs[UNIFIED_SECTION_LINUX] > 0) {
+                        log_error_status(EFI_INVALID_PARAMETER, "%ls is a UKI, not an addon, ignoring: %m", items[i]);
+                        continue;
+                }
+
+                /* Also enforce that, in case it is specified, .uname matches as a quick way to allow
+                 * enforcing compatibility with a specific UKI only */
+                if (uname && szs[UNIFIED_SECTION_UNAME] > 0 &&
+                                !strneq8(uname,
+                                         (char *)loaded_addon->ImageBase + addrs[UNIFIED_SECTION_UNAME],
+                                         szs[UNIFIED_SECTION_UNAME])) {
+                        log_error(".uname mismatch between %ls and UKI, ignoring", items[i]);
+                        continue;
+                }
+                /* Join the command lines of all accepted addons with single spaces, in sort order. */
+                if (ret_cmdline && szs[UNIFIED_SECTION_CMDLINE] > 0) {
+                        _cleanup_free_ char16_t *tmp = TAKE_PTR(cmdline),
+                                                *extra16 = xstrn8_to_16((char *)loaded_addon->ImageBase + addrs[UNIFIED_SECTION_CMDLINE],
+                                                                        szs[UNIFIED_SECTION_CMDLINE]);
+                        cmdline = xasprintf("%ls%ls%ls", strempty(tmp), isempty(tmp) ? u"" : u" ", extra16);
+                }
+                /* The DTB payload is duplicated into its own allocation; its size and the addon file name are stored at the same index. */
+                if (ret_dt_bases && szs[UNIFIED_SECTION_DTB] > 0) {
+                        dt_sizes = xrealloc(dt_sizes,
+                                            n_dt * sizeof(size_t),
+                                            (n_dt + 1)  * sizeof(size_t));
+                        dt_sizes[n_dt] = szs[UNIFIED_SECTION_DTB];
+
+                        dt_bases = xrealloc(dt_bases,
+                                            n_dt * sizeof(void *),
+                                            (n_dt + 1) * sizeof(void *));
+                        dt_bases[n_dt] = xmemdup((uint8_t*)loaded_addon->ImageBase + addrs[UNIFIED_SECTION_DTB],
+                                                 dt_sizes[n_dt]);
+
+                        dt_filenames = xrealloc(dt_filenames,
+                                                n_dt * sizeof(char16_t *),
+                                                (n_dt + 1) * sizeof(char16_t *));
+                        dt_filenames[n_dt] = xstrdup16(items[i]);
+
+                        ++n_dt;
+                }
+        }
+        /* Output parameters are only written if there is something to hand over; the pointers are moved out of the locals. */
+        if (ret_cmdline && !isempty(cmdline))
+                *ret_cmdline = TAKE_PTR(cmdline);
+
+        if (ret_n_dt && n_dt > 0) {
+                *ret_dt_filenames = TAKE_PTR(dt_filenames);
+                *ret_dt_bases = TAKE_PTR(dt_bases);
+                *ret_dt_sizes = TAKE_PTR(dt_sizes);
+                *ret_n_dt = n_dt;
+        }
+
+        return EFI_SUCCESS;
+}
+
+static EFI_STATUS run(EFI_HANDLE image) {
+        _cleanup_free_ void *credential_initrd = NULL, *global_credential_initrd = NULL, *sysext_initrd = NULL, *pcrsig_initrd = NULL, *pcrpkey_initrd = NULL;
+        size_t credential_initrd_size = 0, global_credential_initrd_size = 0, sysext_initrd_size = 0, pcrsig_initrd_size = 0, pcrpkey_initrd_size = 0;
+        void **dt_bases_addons_global = NULL, **dt_bases_addons_uki = NULL;
+        char16_t **dt_filenames_addons_global = NULL, **dt_filenames_addons_uki = NULL;
+        _cleanup_free_ size_t *dt_sizes_addons_global = NULL, *dt_sizes_addons_uki = NULL;
+        size_t linux_size, initrd_size, dt_size, n_dts_addons_global = 0, n_dts_addons_uki = 0;
+        EFI_PHYSICAL_ADDRESS linux_base, initrd_base, dt_base;
+        _cleanup_(devicetree_cleanup) struct devicetree_state dt_state = {};
+        EFI_LOADED_IMAGE_PROTOCOL *loaded_image;
+        size_t addrs[_UNIFIED_SECTION_MAX] = {}, szs[_UNIFIED_SECTION_MAX] = {};
+        _cleanup_free_ char16_t *cmdline = NULL, *cmdline_addons_global = NULL, *cmdline_addons_uki = NULL;
+        int sections_measured = -1, parameters_measured = -1;
+        _cleanup_free_ char *uname = NULL;
+        bool sysext_measured = false, m;
+        uint64_t loader_features = 0;
+        EFI_STATUS err;
+
+        err = BS->HandleProtocol(image, MAKE_GUID_PTR(EFI_LOADED_IMAGE_PROTOCOL), (void **) &loaded_image);
+        if (err != EFI_SUCCESS)
+                return log_error_status(err, "Error getting a LoadedImageProtocol handle: %m");
+
+        if (loaded_image->DeviceHandle && /* Handle case, where bootloader doesn't support DeviceHandle. */
+            (efivar_get_uint64_le(MAKE_GUID_PTR(LOADER), u"LoaderFeatures", &loader_features) != EFI_SUCCESS ||
+            !FLAGS_SET(loader_features, EFI_LOADER_FEATURE_RANDOM_SEED))) {
+                _cleanup_(file_closep) EFI_FILE *esp_dir = NULL;
+
+                err = partition_open(MAKE_GUID_PTR(ESP), loaded_image->DeviceHandle, NULL, &esp_dir);
+                if (err == EFI_SUCCESS) /* Non-fatal on failure, so that we still boot without it. */
+                        (void) process_random_seed(esp_dir);
+        }
+
+        err = pe_memory_locate_sections(loaded_image->ImageBase, unified_sections, addrs, szs);
+        if (err != EFI_SUCCESS || szs[UNIFIED_SECTION_LINUX] == 0) {
+                if (err == EFI_SUCCESS)
+                        err = EFI_NOT_FOUND;
+                return log_error_status(err, "Unable to locate embedded .linux section: %m");
+        }
+
+        CLEANUP_ARRAY(dt_bases_addons_global, n_dts_addons_global, dt_bases_free);
+        CLEANUP_ARRAY(dt_bases_addons_uki, n_dts_addons_uki, dt_bases_free);
+        CLEANUP_ARRAY(dt_filenames_addons_global, n_dts_addons_global, dt_filenames_free);
+        CLEANUP_ARRAY(dt_filenames_addons_uki, n_dts_addons_uki, dt_filenames_free);
+
+        /* Now that we have the UKI sections loaded, also load global first and then local (per-UKI)
+         * addons. The data is loaded at once, and then used later. */
+        err = load_addons(
+                        image,
+                        loaded_image,
+                        u"\\loader\\addons",
+                        uname,
+                        &cmdline_addons_global,
+                        &dt_bases_addons_global,
+                        &dt_sizes_addons_global,
+                        &dt_filenames_addons_global,
+                        &n_dts_addons_global);
+        if (err != EFI_SUCCESS)
+                log_error_status(err, "Error loading global addons, ignoring: %m");
+
+        /* Some bootloaders always pass NULL in FilePath, so we need to check for it here. */
+        _cleanup_free_ char16_t *dropin_dir = get_extra_dir(loaded_image->FilePath);
+        if (dropin_dir) {
+                err = load_addons(
+                                image,
+                                loaded_image,
+                                dropin_dir,
+                                uname,
+                                &cmdline_addons_uki,
+                                &dt_bases_addons_uki,
+                                &dt_sizes_addons_uki,
+                                &dt_filenames_addons_uki,
+                                &n_dts_addons_uki);
+                if (err != EFI_SUCCESS)
+                        log_error_status(err, "Error loading UKI-specific addons, ignoring: %m");
+        }
+
+        /* Measure all "payload" of this PE image into a separate PCR (i.e. where nothing else is written
+         * into so far), so that we have one PCR that we can nicely write policies against because it
+         * contains all static data of this image, and thus can be easily be pre-calculated. */
+        for (UnifiedSection section = 0; section < _UNIFIED_SECTION_MAX; section++) {
+
+                if (!unified_section_measure(section)) /* shall not measure? */
+                        continue;
+
+                if (szs[section] == 0) /* not found */
+                        continue;
+
+                m = false;
+
+                /* First measure the name of the section */
+                (void) tpm_log_event_ascii(
+                                TPM2_PCR_KERNEL_BOOT,
+                                POINTER_TO_PHYSICAL_ADDRESS(unified_sections[section]),
+                                strsize8(unified_sections[section]), /* including NUL byte */
+                                unified_sections[section],
+                                &m);
+
+                sections_measured = sections_measured < 0 ? m : (sections_measured && m);
+
+                /* Then measure the data of the section */
+                (void) tpm_log_event_ascii(
+                                TPM2_PCR_KERNEL_BOOT,
+                                POINTER_TO_PHYSICAL_ADDRESS(loaded_image->ImageBase) + addrs[section],
+                                szs[section],
+                                unified_sections[section],
+                                &m);
+
+                sections_measured = sections_measured < 0 ? m : (sections_measured && m);
+        }
+
+        /* After we are done, set an EFI variable that tells userspace this was done successfully, and encode
+         * in it which PCR was used. */
+        if (sections_measured > 0)
+                (void) efivar_set_uint_string(MAKE_GUID_PTR(LOADER), u"StubPcrKernelImage", TPM2_PCR_KERNEL_BOOT, 0);
+
+        /* Show splash screen as early as possible */
+        graphics_splash((const uint8_t*) loaded_image->ImageBase + addrs[UNIFIED_SECTION_SPLASH], szs[UNIFIED_SECTION_SPLASH]);
+
+        if (szs[UNIFIED_SECTION_UNAME] > 0)
+                uname = xstrndup8((char *)loaded_image->ImageBase + addrs[UNIFIED_SECTION_UNAME],
+                                  szs[UNIFIED_SECTION_UNAME]);
+
+        if (use_load_options(image, loaded_image, szs[UNIFIED_SECTION_CMDLINE] > 0, &cmdline)) {
+                /* Let's measure the passed kernel command line into the TPM. Note that this possibly
+                 * duplicates what we already did in the boot menu, if that was already used. However, since
+                 * we want the boot menu to support an EFI binary, and want this stub to be usable from
+                 * any boot menu, let's measure things anyway. */
+                m = false;
+                (void) tpm_log_load_options(cmdline, &m);
+                parameters_measured = m;
+        } else if (szs[UNIFIED_SECTION_CMDLINE] > 0) {
+                cmdline = xstrn8_to_16(
+                                (char *) loaded_image->ImageBase + addrs[UNIFIED_SECTION_CMDLINE],
+                                szs[UNIFIED_SECTION_CMDLINE]);
+                mangle_stub_cmdline(cmdline);
+        }
+
+        /* If we have any extra command line to add via PE addons, load them now and append, and
+         * measure the additions together, after the embedded options, but before the smbios ones,
+         * so that the order is reversed from "most hardcoded" to "most dynamic". The global addons are
+         * loaded first, and the image-specific ones later, for the same reason. */
+        cmdline_append_and_measure_addons(cmdline_addons_global, cmdline_addons_uki, &cmdline, &m);
+        parameters_measured = parameters_measured < 0 ? m : (parameters_measured && m);
+
+        /* SMBIOS OEM Strings data is controlled by the host admin and not covered
+         * by the VM attestation, so MUST NOT be trusted when in a confidential VM */
+        if (!is_confidential_vm()) {
+                const char *extra = smbios_find_oem_string("io.systemd.stub.kernel-cmdline-extra");
+                if (extra) {
+                        _cleanup_free_ char16_t *tmp = TAKE_PTR(cmdline), *extra16 = xstr8_to_16(extra);
+                        cmdline = xasprintf("%ls %ls", tmp, extra16);
+
+                        /* SMBIOS strings are measured in PCR1, but we also want to measure them in our specific
+                         * PCR12, as firmware-owned PCRs are very difficult to use as they'll contain unpredictable
+                         * measurements that are not under control of the machine owner. */
+                        m = false;
+                        (void) tpm_log_load_options(extra16, &m);
+                        parameters_measured = parameters_measured < 0 ? m : (parameters_measured && m);
+                }
+        }
+
+        export_variables(loaded_image);
+
+        if (pack_cpio(loaded_image,
+                      NULL,
+                      u".cred",
+                      ".extra/credentials",
+                      /* dir_mode= */ 0500,
+                      /* access_mode= */ 0400,
+                      /* tpm_pcr= */ TPM2_PCR_KERNEL_CONFIG,
+                      u"Credentials initrd",
+                      &credential_initrd,
+                      &credential_initrd_size,
+                      &m) == EFI_SUCCESS)
+                parameters_measured = parameters_measured < 0 ? m : (parameters_measured && m);
+
+        if (pack_cpio(loaded_image,
+                      u"\\loader\\credentials",
+                      u".cred",
+                      ".extra/global_credentials",
+                      /* dir_mode= */ 0500,
+                      /* access_mode= */ 0400,
+                      /* tpm_pcr= */ TPM2_PCR_KERNEL_CONFIG,
+                      u"Global credentials initrd",
+                      &global_credential_initrd,
+                      &global_credential_initrd_size,
+                      &m) == EFI_SUCCESS)
+                parameters_measured = parameters_measured < 0 ? m : (parameters_measured && m);
+
+        if (pack_cpio(loaded_image,
+                      NULL,
+                      u".raw",
+                      ".extra/sysext",
+                      /* dir_mode= */ 0555,
+                      /* access_mode= */ 0444,
+                      /* tpm_pcr= */ TPM2_PCR_SYSEXTS,
+                      u"System extension initrd",
+                      &sysext_initrd,
+                      &sysext_initrd_size,
+                      &m) == EFI_SUCCESS)
+                sysext_measured = m;
+
+        dt_size = szs[UNIFIED_SECTION_DTB];
+        dt_base = dt_size != 0 ? POINTER_TO_PHYSICAL_ADDRESS(loaded_image->ImageBase) + addrs[UNIFIED_SECTION_DTB] : 0;
+
+        /* First load the base device tree, then fix it up using addons - global first, then per-UKI. */
+        if (dt_size > 0) {
+                err = devicetree_install_from_memory(
+                                &dt_state, PHYSICAL_ADDRESS_TO_POINTER(dt_base), dt_size);
+                if (err != EFI_SUCCESS)
+                        log_error_status(err, "Error loading embedded devicetree: %m");
+        }
+
+        dtb_install_addons(&dt_state,
+                           dt_bases_addons_global,
+                           dt_sizes_addons_global,
+                           dt_filenames_addons_global,
+                           n_dts_addons_global,
+                           &m);
+        parameters_measured = parameters_measured < 0 ? m : (parameters_measured && m);
+        dtb_install_addons(&dt_state,
+                           dt_bases_addons_uki,
+                           dt_sizes_addons_uki,
+                           dt_filenames_addons_uki,
+                           n_dts_addons_uki,
+                           &m);
+        parameters_measured = parameters_measured < 0 ? m : (parameters_measured && m);
+
+        if (parameters_measured > 0)
+                (void) efivar_set_uint_string(MAKE_GUID_PTR(LOADER), u"StubPcrKernelParameters", TPM2_PCR_KERNEL_CONFIG, 0);
+        if (sysext_measured)
+                (void) efivar_set_uint_string(MAKE_GUID_PTR(LOADER), u"StubPcrInitRDSysExts", TPM2_PCR_SYSEXTS, 0);
+
+        /* If the PCR signature was embedded in the PE image, then let's wrap it in a cpio and also pass it
+         * to the kernel, so that it can be read from /.extra/tpm2-pcr-signature.json. Note that this section
+         * is not measured, neither as raw section (see above), nor as cpio (here), because it is the
+         * signature of expected PCR values, i.e. its input are PCR measurements, and hence it shouldn't
+         * itself be input for PCR measurements. */
+        if (szs[UNIFIED_SECTION_PCRSIG] > 0)
+                (void) pack_cpio_literal(
+                                (uint8_t*) loaded_image->ImageBase + addrs[UNIFIED_SECTION_PCRSIG],
+                                szs[UNIFIED_SECTION_PCRSIG],
+                                ".extra",
+                                u"tpm2-pcr-signature.json",
+                                /* dir_mode= */ 0555,
+                                /* access_mode= */ 0444,
+                                /* tpm_pcr= */ UINT32_MAX,
+                                /* tpm_description= */ NULL,
+                                &pcrsig_initrd,
+                                &pcrsig_initrd_size,
+                                /* ret_measured= */ NULL);
+
+        /* If the public key used for the PCR signatures was embedded in the PE image, then let's wrap it in
+         * a cpio and also pass it to the kernel, so that it can be read from
+         * /.extra/tpm2-pcr-public-key.pem. This section is already measured above, hence we won't measure the
+         * cpio. */
+        if (szs[UNIFIED_SECTION_PCRPKEY] > 0)
+                (void) pack_cpio_literal(
+                                (uint8_t*) loaded_image->ImageBase + addrs[UNIFIED_SECTION_PCRPKEY],
+                                szs[UNIFIED_SECTION_PCRPKEY],
+                                ".extra",
+                                u"tpm2-pcr-public-key.pem",
+                                /* dir_mode= */ 0555,
+                                /* access_mode= */ 0444,
+                                /* tpm_pcr= */ UINT32_MAX,
+                                /* tpm_description= */ NULL,
+                                &pcrpkey_initrd,
+                                &pcrpkey_initrd_size,
+                                /* ret_measured= */ NULL);
+
+        linux_size = szs[UNIFIED_SECTION_LINUX];
+        linux_base = POINTER_TO_PHYSICAL_ADDRESS(loaded_image->ImageBase) + addrs[UNIFIED_SECTION_LINUX];
+
+        initrd_size = szs[UNIFIED_SECTION_INITRD];
+        initrd_base = initrd_size != 0 ? POINTER_TO_PHYSICAL_ADDRESS(loaded_image->ImageBase) + addrs[UNIFIED_SECTION_INITRD] : 0;
+
+        _cleanup_pages_ Pages initrd_pages = {};
+        if (credential_initrd || global_credential_initrd || sysext_initrd || pcrsig_initrd || pcrpkey_initrd) {
+                /* If we have generated initrds dynamically, let's combine them with the built-in initrd. */
+                err = combine_initrd(
+                                initrd_base, initrd_size,
+                                (const void*const[]) {
+                                        credential_initrd,
+                                        global_credential_initrd,
+                                        sysext_initrd,
+                                        pcrsig_initrd,
+                                        pcrpkey_initrd,
+                                },
+                                (const size_t[]) {
+                                        credential_initrd_size,
+                                        global_credential_initrd_size,
+                                        sysext_initrd_size,
+                                        pcrsig_initrd_size,
+                                        pcrpkey_initrd_size,
+                                },
+                                5,
+                                &initrd_pages, &initrd_size);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                initrd_base = initrd_pages.addr;
+
+                /* Given these might be large let's free them explicitly, quickly. */
+                credential_initrd = mfree(credential_initrd);
+                global_credential_initrd = mfree(global_credential_initrd);
+                sysext_initrd = mfree(sysext_initrd);
+                pcrsig_initrd = mfree(pcrsig_initrd);
+                pcrpkey_initrd = mfree(pcrpkey_initrd);
+        }
+
+        err = linux_exec(image, cmdline,
+                         PHYSICAL_ADDRESS_TO_POINTER(linux_base), linux_size,
+                         PHYSICAL_ADDRESS_TO_POINTER(initrd_base), initrd_size);
+        graphics_mode(false);
+        return err;
+}
+
+DEFINE_EFI_MAIN_FUNCTION(run, "systemd-stub", /*wait_for_debugger=*/false); /* NOTE(review): presumably emits the EFI entry point wrapping run() — confirm at the macro definition */
diff --git a/src/boot/efi/test-bcd.c b/src/boot/efi/test-bcd.c
new file mode 100644
index 0000000..3f93ca0
--- /dev/null
+++ b/src/boot/efi/test-bcd.c
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bcd.h"
+#include "compress.h"
+#include "fileio.h"
+#include "tests.h"
+#include "utf8.h"
+
+/* Include the implementation directly, so we can poke at some internals. */
+#include "bcd.c"
+
+static void load_bcd(const char *path, void **ret_bcd, size_t *ret_bcd_len) {
+        /* Look the fixture up via get_testdata_dir(), read it whole, zstd-decompress it into *ret_bcd. */
+        _cleanup_free_ char *resolved = NULL, *zblob = NULL;
+        size_t zblob_size;
+        assert_se(get_testdata_dir(path, &resolved) >= 0);
+        assert_se(read_full_file_full(AT_FDCWD, resolved, UINT64_MAX, SIZE_MAX, 0, NULL, &zblob, &zblob_size) >= 0);
+        assert_se(decompress_blob_zstd(zblob, zblob_size, ret_bcd, ret_bcd_len, SIZE_MAX) >= 0);
+}
+
+static void test_get_bcd_title_one(
+                const char *path,
+                const char16_t *title_expect,
+                size_t title_len_expect) {
+
+        _cleanup_free_ void *hive = NULL;
+        size_t hive_size;
+
+        log_info("/* %s(%s) */", __func__, path);
+        load_bcd(path, &hive, &hive_size);
+        /* NULL expectation: no title may come back. Otherwise its first title_len_expect bytes must match. */
+        char16_t *found = get_bcd_title(hive, hive_size);
+        if (!title_expect) {
+                assert_se(!found);
+                return;
+        }
+        assert_se(found);
+        assert_se(memcmp(found, title_expect, title_len_expect) == 0);
+}
+
+TEST(get_bcd_title) {
+        static const char *const untitled[] = { /* fixtures for which no title is expected */
+                "test-bcd/description-bad-type.bcd.zst",  "test-bcd/description-empty.bcd.zst",
+                "test-bcd/description-missing.bcd.zst",   "test-bcd/description-too-small.bcd.zst",
+                "test-bcd/displayorder-bad-name.bcd.zst", "test-bcd/displayorder-bad-size.bcd.zst",
+                "test-bcd/displayorder-bad-type.bcd.zst", "test-bcd/empty.bcd.zst",
+        };
+
+        test_get_bcd_title_one("test-bcd/win10.bcd.zst", u"Windows 10", sizeof(u"Windows 10"));
+        for (size_t i = 0; i < sizeof(untitled) / sizeof(untitled[0]); i++)
+                test_get_bcd_title_one(untitled[i], NULL, 0);
+}
+
+TEST(base_block) { /* get_bcd_title() against damaged BaseBlock header fields of the win10 fixture */
+        size_t len;
+        BaseBlock backup;
+        uint8_t *bcd_base;
+        _cleanup_free_ BaseBlock *bcd = NULL;
+
+        load_bcd("test-bcd/win10.bcd.zst", (void **) &bcd, &len);
+        backup = *bcd;
+        bcd_base = (uint8_t *) bcd;
+
+        assert_se(get_bcd_title(bcd_base, len));
+
+        /* Try various "corruptions" of the base block: each must make get_bcd_title() return NULL;
+         * fields that were modified are restored from 'backup' before the next attempt. */
+        assert_se(!get_bcd_title(bcd_base, sizeof(BaseBlock) - 1));
+
+        bcd->sig = 0;
+        assert_se(!get_bcd_title(bcd_base, len));
+        *bcd = backup;
+
+        bcd->version_minor = 2;
+        assert_se(!get_bcd_title(bcd_base, len));
+        *bcd = backup;
+
+        bcd->version_major = 4;
+        assert_se(!get_bcd_title(bcd_base, len));
+        *bcd = backup;
+
+        bcd->type = 1;
+        assert_se(!get_bcd_title(bcd_base, len));
+        *bcd = backup;
+
+        bcd->primary_seqnum++;
+        assert_se(!get_bcd_title(bcd_base, len));
+        *bcd = backup;
+}
+
+TEST(bad_bcd) { /* get_key()/get_key_value() must reject the damaged entries of the corrupt.bcd fixture */
+        size_t len;
+        uint8_t *hbins;
+        uint32_t offset;
+        _cleanup_free_ void *bcd = NULL;
+
+        /* This BCD hive has been manipulated to have bad offsets/sizes at various places. */
+        load_bcd("test-bcd/corrupt.bcd.zst", &bcd, &len);
+
+        assert_se(len >= HIVE_CELL_OFFSET);
+        hbins = (uint8_t *) bcd + HIVE_CELL_OFFSET;
+        len -= HIVE_CELL_OFFSET; /* from here on 'len' counts the bytes starting at hbins */
+        offset = ((BaseBlock *) bcd)->root_cell_offset;
+        /* NOTE(review): key names look like NUL-separated path components, a lone NUL selecting the root — confirm in bcd.c */
+        const Key *root = get_key(hbins, len, offset, "\0");
+        assert_se(root);
+        assert_se(!get_key(hbins, sizeof(Key) - 1, offset, "\0"));
+
+        assert_se(!get_key(hbins, len, offset, "\0BadOffset\0"));
+        assert_se(!get_key(hbins, len, offset, "\0BadSig\0"));
+        assert_se(!get_key(hbins, len, offset, "\0BadKeyNameLen\0"));
+        assert_se(!get_key(hbins, len, offset, "\0SubkeyBadOffset\0Dummy\0"));
+        assert_se(!get_key(hbins, len, offset, "\0SubkeyBadSig\0Dummy\0"));
+        assert_se(!get_key(hbins, len, offset, "\0SubkeyBadNEntries\0Dummy\0"));
+
+        assert_se(!get_key_value(hbins, len, root, "Dummy"));
+
+        const Key *kv_bad_offset = get_key(hbins, len, offset, "\0KeyValuesBadOffset\0");
+        assert_se(kv_bad_offset);
+        assert_se(!get_key_value(hbins, len, kv_bad_offset, "Dummy"));
+
+        const Key *kv_bad_n_key_values = get_key(hbins, len, offset, "\0KeyValuesBadNKeyValues\0");
+        assert_se(kv_bad_n_key_values);
+        assert_se(!get_key_value(hbins, len, kv_bad_n_key_values, "Dummy"));
+
+        const Key *kv = get_key(hbins, len, offset, "\0KeyValues\0");
+        assert_se(kv);
+
+        assert_se(!get_key_value(hbins, len, kv, "BadOffset"));
+        assert_se(!get_key_value(hbins, len, kv, "BadSig"));
+        assert_se(!get_key_value(hbins, len, kv, "BadNameLen"));
+        assert_se(!get_key_value(hbins, len, kv, "InlineData"));
+        assert_se(!get_key_value(hbins, len, kv, "BadDataOffset"));
+        assert_se(!get_key_value(hbins, len, kv, "BadDataSize"));
+}
+
+TEST(argv_bcds) {
+        /* Every extra command line argument is read as an (uncompressed) BCD file whose title, or the
+         * lack of one, is merely logged; only a failing read trips an assertion. */
+        for (int arg = 1; arg < saved_argc; arg++) {
+                const char *path = saved_argv[arg];
+                _cleanup_free_ char *title_utf8 = NULL;
+                _cleanup_free_ void *hive = NULL;
+                size_t hive_size;
+
+                assert_se(read_full_file_full(
+                                AT_FDCWD, path, UINT64_MAX, SIZE_MAX, 0, NULL,
+                                (char **) &hive, &hive_size) >= 0);
+
+                char16_t *title = get_bcd_title(hive, hive_size);
+                if (!title) {
+                        log_info("%s: Bad BCD", path);
+                        continue;
+                }
+
+                title_utf8 = utf16_to_utf8(title, SIZE_MAX);
+                log_info("%s: \"%s\"", path, title_utf8);
+        }
+}
+
+DEFINE_TEST_MAIN(LOG_INFO); /* NOTE(review): presumably emits main() running the TEST() cases at LOG_INFO — confirm in tests.h */
diff --git a/src/boot/efi/test-efi-string.c b/src/boot/efi/test-efi-string.c
new file mode 100644
index 0000000..b71a0c3
--- /dev/null
+++ b/src/boot/efi/test-efi-string.c
@@ -0,0 +1,794 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fnmatch.h>
+
+#include "efi-string.h"
+#include "fileio.h"
+#include "tests.h"
+
+TEST(strlen8) { /* char count up to the first NUL; NULL counts as length 0 */
+        assert_se(strlen8(NULL) == 0);
+        assert_se(strlen8("") == 0);
+        assert_se(strlen8("1") == 1);
+        assert_se(strlen8("11") == 2);
+        assert_se(strlen8("123456789") == 9);
+        assert_se(strlen8("12\0004") == 2);
+}
+
+TEST(strlen16) { /* char16_t count up to the first NUL; NULL counts as length 0 */
+        assert_se(strlen16(NULL) == 0);
+        assert_se(strlen16(u"") == 0);
+        assert_se(strlen16(u"1") == 1);
+        assert_se(strlen16(u"11") == 2);
+        assert_se(strlen16(u"123456789") == 9);
+        assert_se(strlen16(u"12\0004") == 2);
+}
+
+TEST(strnlen8) { /* like strlen8() but never more than the given maximum; NULL gives 0 */
+        assert_se(strnlen8(NULL, 0) == 0);
+        assert_se(strnlen8(NULL, 10) == 0);
+        assert_se(strnlen8("", 10) == 0);
+        assert_se(strnlen8("1", 10) == 1);
+        assert_se(strnlen8("11", 1) == 1);
+        assert_se(strnlen8("123456789", 7) == 7);
+        assert_se(strnlen8("12\0004", 5) == 2);
+}
+
+TEST(strnlen16) { /* like strlen16() but never more than the given maximum; NULL gives 0 */
+        assert_se(strnlen16(NULL, 0) == 0);
+        assert_se(strnlen16(NULL, 10) == 0);
+        assert_se(strnlen16(u"", 10) == 0);
+        assert_se(strnlen16(u"1", 10) == 1);
+        assert_se(strnlen16(u"11", 1) == 1);
+        assert_se(strnlen16(u"123456789", 7) == 7);
+        assert_se(strnlen16(u"12\0004", 5) == 2);
+}
+
+TEST(strsize8) { /* size in bytes including the terminating NUL; 0 for NULL */
+        assert_se(strsize8(NULL) == 0);
+        assert_se(strsize8("") == 1);
+        assert_se(strsize8("1") == 2);
+        assert_se(strsize8("11") == 3);
+        assert_se(strsize8("123456789") == 10);
+        assert_se(strsize8("12\0004") == 3);
+}
+
+TEST(strsize16) { /* size in bytes, two per char16_t, including the terminating NUL; 0 for NULL */
+        assert_se(strsize16(NULL) == 0);
+        assert_se(strsize16(u"") == 2);
+        assert_se(strsize16(u"1") == 4);
+        assert_se(strsize16(u"11") == 6);
+        assert_se(strsize16(u"123456789") == 20);
+        assert_se(strsize16(u"12\0004") == 6);
+}
+
+TEST(strtolower8) { /* lowercases in place up to the first NUL only; a NULL argument is tolerated */
+        char s[] = "\0001234abcDEF!\0zZ";
+
+        strtolower8(NULL);
+
+        strtolower8(s);
+        assert_se(memcmp(s, "\0001234abcDEF!\0zZ", sizeof(s)) == 0);
+
+        s[0] = '#';
+        strtolower8(s);
+        assert_se(memcmp(s, "#1234abcdef!\0zZ", sizeof(s)) == 0);
+}
+
+TEST(strtolower16) { /* lowercases in place up to the first NUL only; a NULL argument is tolerated */
+        char16_t s[] = u"\0001234abcDEF!\0zZ";
+
+        strtolower16(NULL);
+
+        strtolower16(s);
+        assert_se(memcmp(s, u"\0001234abcDEF!\0zZ", sizeof(s)) == 0);
+
+        s[0] = '#';
+        strtolower16(s);
+        assert_se(memcmp(s, u"#1234abcdef!\0zZ", sizeof(s)) == 0);
+}
+
+TEST(strncmp8) { /* NULL sorts before "", at most n chars are compared, and a NUL ends the comparison */
+        assert_se(strncmp8(NULL, "", 10) < 0);
+        assert_se(strncmp8("", NULL, 10) > 0);
+        assert_se(strncmp8(NULL, NULL, 0) == 0);
+        assert_se(strncmp8(NULL, NULL, 10) == 0);
+        assert_se(strncmp8("", "", 10) == 0);
+        assert_se(strncmp8("abc", "abc", 2) == 0);
+        assert_se(strncmp8("aBc", "aBc", 3) == 0);
+        assert_se(strncmp8("aBC", "aBC", 4) == 0);
+        assert_se(strncmp8("", "a", 0) == 0);
+        assert_se(strncmp8("b", "a", 0) == 0);
+        assert_se(strncmp8("", "a", 3) < 0);
+        assert_se(strncmp8("=", "=", 1) == 0);
+        assert_se(strncmp8("A", "a", 1) < 0);
+        assert_se(strncmp8("a", "A", 2) > 0);
+        assert_se(strncmp8("a", "Aa", 2) > 0);
+        assert_se(strncmp8("12\00034", "12345", 4) < 0);
+        assert_se(strncmp8("12\00034", "12345", SIZE_MAX) < 0);
+        assert_se(strncmp8("abc\0def", "abc", SIZE_MAX) == 0);
+        assert_se(strncmp8("abc\0def", "abcdef", SIZE_MAX) < 0);
+
+        assert_se(strncmp8((char[]){ CHAR_MIN }, (char[]){ CHAR_MIN }, 1) == 0);
+        assert_se(strncmp8((char[]){ CHAR_MAX }, (char[]){ CHAR_MAX }, 1) == 0);
+        assert_se(strncmp8((char[]){ CHAR_MIN }, (char[]){ CHAR_MAX }, 1) < 0);
+        assert_se(strncmp8((char[]){ CHAR_MAX }, (char[]){ CHAR_MIN }, 1) > 0);
+}
+
+TEST(strncmp16) { /* NULL sorts before u"", at most n units are compared, and a NUL ends the comparison */
+        assert_se(strncmp16(NULL, u"", 10) < 0);
+        assert_se(strncmp16(u"", NULL, 10) > 0);
+        assert_se(strncmp16(NULL, NULL, 0) == 0);
+        assert_se(strncmp16(NULL, NULL, 10) == 0);
+        assert_se(strncmp16(u"", u"", 0) == 0);
+        assert_se(strncmp16(u"", u"", 10) == 0);
+        assert_se(strncmp16(u"abc", u"abc", 2) == 0);
+        assert_se(strncmp16(u"aBc", u"aBc", 3) == 0);
+        assert_se(strncmp16(u"aBC", u"aBC", 4) == 0);
+        assert_se(strncmp16(u"", u"a", 0) == 0);
+        assert_se(strncmp16(u"b", u"a", 0) == 0);
+        assert_se(strncmp16(u"", u"a", 3) < 0);
+        assert_se(strncmp16(u"=", u"=", 1) == 0);
+        assert_se(strncmp16(u"A", u"a", 1) < 0);
+        assert_se(strncmp16(u"a", u"A", 2) > 0);
+        assert_se(strncmp16(u"a", u"Aa", 2) > 0);
+        assert_se(strncmp16(u"12\00034", u"12345", 4) < 0);
+        assert_se(strncmp16(u"12\00034", u"12345", SIZE_MAX) < 0);
+        assert_se(strncmp16(u"abc\0def", u"abc", SIZE_MAX) == 0);
+        assert_se(strncmp16(u"abc\0def", u"abcdef", SIZE_MAX) < 0);
+
+        assert_se(strncmp16((char16_t[]){ UINT16_MAX }, (char16_t[]){ UINT16_MAX }, 1) == 0);
+        assert_se(strncmp16((char16_t[]){ 0 }, (char16_t[]){ UINT16_MAX }, 1) < 0);
+        assert_se(strncmp16((char16_t[]){ UINT16_MAX }, (char16_t[]){ 0 }, 1) > 0);
+}
+
+TEST(strncasecmp8) { /* same cases as strncmp8, but strings differing only in letter case compare equal */
+        assert_se(strncasecmp8(NULL, "", 10) < 0);
+        assert_se(strncasecmp8("", NULL, 10) > 0);
+        assert_se(strncasecmp8(NULL, NULL, 0) == 0);
+        assert_se(strncasecmp8(NULL, NULL, 10) == 0);
+        assert_se(strncasecmp8("", "", 10) == 0);
+        assert_se(strncasecmp8("abc", "abc", 2) == 0);
+        assert_se(strncasecmp8("aBc", "AbC", 3) == 0);
+        assert_se(strncasecmp8("aBC", "Abc", 4) == 0);
+        assert_se(strncasecmp8("", "a", 0) == 0);
+        assert_se(strncasecmp8("b", "a", 0) == 0);
+        assert_se(strncasecmp8("", "a", 3) < 0);
+        assert_se(strncasecmp8("=", "=", 1) == 0);
+        assert_se(strncasecmp8("A", "a", 1) == 0);
+        assert_se(strncasecmp8("a", "A", 2) == 0);
+        assert_se(strncasecmp8("a", "Aa", 2) < 0);
+        assert_se(strncasecmp8("12\00034", "12345", 4) < 0);
+        assert_se(strncasecmp8("12\00034", "12345", SIZE_MAX) < 0);
+        assert_se(strncasecmp8("abc\0def", "ABC", SIZE_MAX) == 0);
+        assert_se(strncasecmp8("abc\0def", "ABCDEF", SIZE_MAX) < 0);
+
+        assert_se(strncasecmp8((char[]){ CHAR_MIN }, (char[]){ CHAR_MIN }, 1) == 0);
+        assert_se(strncasecmp8((char[]){ CHAR_MAX }, (char[]){ CHAR_MAX }, 1) == 0);
+        assert_se(strncasecmp8((char[]){ CHAR_MIN }, (char[]){ CHAR_MAX }, 1) < 0);
+        assert_se(strncasecmp8((char[]){ CHAR_MAX }, (char[]){ CHAR_MIN }, 1) > 0);
+}
+
+TEST(strncasecmp16) { /* same cases as strncmp16, but strings differing only in letter case compare equal */
+        assert_se(strncasecmp16(NULL, u"", 10) < 0);
+        assert_se(strncasecmp16(u"", NULL, 10) > 0);
+        assert_se(strncasecmp16(NULL, NULL, 0) == 0);
+        assert_se(strncasecmp16(NULL, NULL, 10) == 0);
+        assert_se(strncasecmp16(u"", u"", 10) == 0);
+        assert_se(strncasecmp16(u"abc", u"abc", 2) == 0);
+        assert_se(strncasecmp16(u"aBc", u"AbC", 3) == 0);
+        assert_se(strncasecmp16(u"aBC", u"Abc", 4) == 0);
+        assert_se(strncasecmp16(u"", u"a", 0) == 0);
+        assert_se(strncasecmp16(u"b", u"a", 0) == 0);
+        assert_se(strncasecmp16(u"", u"a", 3) < 0);
+        assert_se(strncasecmp16(u"=", u"=", 1) == 0);
+        assert_se(strncasecmp16(u"A", u"a", 1) == 0);
+        assert_se(strncasecmp16(u"a", u"A", 2) == 0);
+        assert_se(strncasecmp16(u"a", u"Aa", 2) < 0);
+        assert_se(strncasecmp16(u"12\00034", u"12345", 4) < 0);
+        assert_se(strncasecmp16(u"12\00034", u"12345", SIZE_MAX) < 0);
+        assert_se(strncasecmp16(u"abc\0def", u"ABC", SIZE_MAX) == 0);
+        assert_se(strncasecmp16(u"abc\0def", u"ABCDEF", SIZE_MAX) < 0);
+
+        assert_se(strncasecmp16((char16_t[]){ UINT16_MAX }, (char16_t[]){ UINT16_MAX }, 1) == 0);
+        assert_se(strncasecmp16((char16_t[]){ 0 }, (char16_t[]){ UINT16_MAX }, 1) < 0);
+        assert_se(strncasecmp16((char16_t[]){ UINT16_MAX }, (char16_t[]){ 0 }, 1) > 0);
+}
+
+TEST(strcpy8) { /* returns the destination; a NULL source leaves an empty string in it */
+        char buf[128];
+
+        assert_se(strcpy8(buf, "123") == buf);
+        assert_se(streq8(buf, "123"));
+        assert_se(strcpy8(buf, "") == buf);
+        assert_se(streq8(buf, ""));
+        assert_se(strcpy8(buf, "A") == buf);
+        assert_se(streq8(buf, "A"));
+        assert_se(strcpy8(buf, NULL) == buf);
+        assert_se(streq8(buf, ""));
+}
+
+TEST(strcpy16) { /* returns the destination; a NULL source leaves an empty string in it */
+        char16_t buf[128];
+
+        assert_se(strcpy16(buf, u"123") == buf);
+        assert_se(streq16(buf, u"123"));
+        assert_se(strcpy16(buf, u"") == buf);
+        assert_se(streq16(buf, u""));
+        assert_se(strcpy16(buf, u"A") == buf);
+        assert_se(streq16(buf, u"A"));
+        assert_se(strcpy16(buf, NULL) == buf);
+        assert_se(streq16(buf, u""));
+}
+
+TEST(strchr8) { /* first occurrence, case-sensitive; NULL haystack gives NULL; searching 0 finds the terminator */
+        assert_se(!strchr8(NULL, 'a'));
+        assert_se(!strchr8("", 'a'));
+        assert_se(!strchr8("123", 'a'));
+
+        const char str[] = "abcaBc";
+        assert_se(strchr8(str, 'a') == &str[0]);
+        assert_se(strchr8(str, 'c') == &str[2]);
+        assert_se(strchr8(str, 'B') == &str[4]);
+
+        assert_se(strchr8(str, 0) == str + strlen8(str));
+}
+
+TEST(strchr16) { /* first occurrence, case-sensitive; NULL haystack gives NULL; searching 0 finds the terminator */
+        assert_se(!strchr16(NULL, 'a'));
+        assert_se(!strchr16(u"", 'a'));
+        assert_se(!strchr16(u"123", 'a'));
+
+        const char16_t str[] = u"abcaBc";
+        assert_se(strchr16(str, 'a') == &str[0]);
+        assert_se(strchr16(str, 'c') == &str[2]);
+        assert_se(strchr16(str, 'B') == &str[4]);
+
+        assert_se(strchr16(str, 0) == str + strlen16(str));
+}
+
+TEST(xstrndup8) { /* NULL stays NULL; otherwise a free()-able copy of at most n chars, possibly empty */
+        char *s = NULL;
+
+        assert_se(xstrndup8(NULL, 0) == NULL);
+        assert_se(xstrndup8(NULL, 10) == NULL);
+
+        assert_se(s = xstrndup8("", 10));
+        assert_se(streq8(s, ""));
+        free(s);
+
+        assert_se(s = xstrndup8("abc", 0));
+        assert_se(streq8(s, ""));
+        free(s);
+
+        assert_se(s = xstrndup8("ABC", 3));
+        assert_se(streq8(s, "ABC"));
+        free(s);
+
+        assert_se(s = xstrndup8("123abcDEF", 5));
+        assert_se(streq8(s, "123ab"));
+        free(s);
+}
+
+TEST(xstrdup8) { /* NULL stays NULL; otherwise a free()-able copy equal to the input */
+        char *s = NULL;
+
+        assert_se(xstrdup8(NULL) == NULL);
+
+        assert_se(s = xstrdup8(""));
+        assert_se(streq8(s, ""));
+        free(s);
+
+        assert_se(s = xstrdup8("1"));
+        assert_se(streq8(s, "1"));
+        free(s);
+
+        assert_se(s = xstrdup8("123abcDEF"));
+        assert_se(streq8(s, "123abcDEF"));
+        free(s);
+}
+
+TEST(xstrndup16) { /* NULL stays NULL; otherwise a free()-able copy of at most n units, possibly empty */
+        char16_t *s = NULL;
+
+        assert_se(xstrndup16(NULL, 0) == NULL);
+        assert_se(xstrndup16(NULL, 10) == NULL);
+
+        assert_se(s = xstrndup16(u"", 10));
+        assert_se(streq16(s, u""));
+        free(s);
+
+        assert_se(s = xstrndup16(u"abc", 0));
+        assert_se(streq16(s, u""));
+        free(s);
+
+        assert_se(s = xstrndup16(u"ABC", 3));
+        assert_se(streq16(s, u"ABC"));
+        free(s);
+
+        assert_se(s = xstrndup16(u"123abcDEF", 5));
+        assert_se(streq16(s, u"123ab"));
+        free(s);
+}
+
+TEST(xstrdup16) { /* NULL stays NULL; otherwise a free()-able copy equal to the input */
+        char16_t *s = NULL;
+
+        assert_se(xstrdup16(NULL) == NULL);
+
+        assert_se(s = xstrdup16(u""));
+        assert_se(streq16(s, u""));
+        free(s);
+
+        assert_se(s = xstrdup16(u"1"));
+        assert_se(streq16(s, u"1"));
+        free(s);
+
+        assert_se(s = xstrdup16(u"123abcDEF"));
+        assert_se(streq16(s, u"123abcDEF"));
+        free(s);
+}
+
+TEST(xstrn8_to_16) { /* 8-bit to char16_t conversion; NULL input or n == 0 gives NULL, see notes on the last two cases */
+        char16_t *s = NULL;
+
+        assert_se(xstrn8_to_16(NULL, 1) == NULL);
+        assert_se(xstrn8_to_16("a", 0) == NULL);
+
+        assert_se(s = xstrn8_to_16("", 1));
+        assert_se(streq16(s, u""));
+        free(s);
+
+        assert_se(s = xstrn8_to_16("1", 1));
+        assert_se(streq16(s, u"1"));
+        free(s);
+
+        assert_se(s = xstr8_to_16("abcxyzABCXYZ09 .,-_#*!\"§$%&/()=?`~"));
+        assert_se(streq16(s, u"abcxyzABCXYZ09 .,-_#*!\"§$%&/()=?`~"));
+        free(s);
+
+        assert_se(s = xstr8_to_16("ÿⱿ𝇉 😺")); /* the two characters outside the BMP are absent from the expected result */
+        assert_se(streq16(s, u"ÿⱿ "));
+        free(s);
+
+        assert_se(s = xstrn8_to_16("¶¶", 3)); /* 3 bytes cover only the first of the two characters */
+        assert_se(streq16(s, u"¶"));
+        free(s);
+}
+
+TEST(startswith8) { /* yields the remainder after the prefix; NULL on mismatch or NULL input */
+        assert_se(streq8(startswith8("", ""), ""));
+        assert_se(streq8(startswith8("x", ""), "x"));
+        assert_se(!startswith8("", "x"));
+        assert_se(!startswith8("", "xxxxxxxx"));
+        assert_se(streq8(startswith8("xxx", "x"), "xx"));
+        assert_se(streq8(startswith8("xxx", "xx"), "x"));
+        assert_se(streq8(startswith8("xxx", "xxx"), ""));
+        assert_se(!startswith8("xxx", "xxxx"));
+        assert_se(!startswith8(NULL, ""));
+}
+
+#define TEST_FNMATCH_ONE(pattern, haystack, expect)                                     \
+        ({ /* POSIX fnmatch() and efi_fnmatch() must both agree with 'expect' */        \
+                assert_se(fnmatch(pattern, haystack, 0) == (expect ? 0 : FNM_NOMATCH)); \
+                assert_se(efi_fnmatch(u##pattern, u##haystack) == expect);              \
+        })
+
+TEST(efi_fnmatch) { /* covers literals, '?', '*', bracket sets and ranges, and backslash escapes */
+        TEST_FNMATCH_ONE("", "", true);
+        TEST_FNMATCH_ONE("abc", "abc", true);
+        TEST_FNMATCH_ONE("aBc", "abc", false);
+        TEST_FNMATCH_ONE("b", "a", false);
+        TEST_FNMATCH_ONE("b", "", false);
+        TEST_FNMATCH_ONE("abc", "a", false);
+        TEST_FNMATCH_ONE("a?c", "azc", true);
+        TEST_FNMATCH_ONE("???", "?.9", true);
+        TEST_FNMATCH_ONE("1?", "1", false);
+        TEST_FNMATCH_ONE("***", "", true);
+        TEST_FNMATCH_ONE("*", "123", true);
+        TEST_FNMATCH_ONE("**", "abcd", true);
+        TEST_FNMATCH_ONE("*b*", "abcd", true);
+        TEST_FNMATCH_ONE("abc*d", "abc", false);
+        TEST_FNMATCH_ONE("start*end", "startend", true);
+        TEST_FNMATCH_ONE("start*end", "startendend", true);
+        TEST_FNMATCH_ONE("start*end", "startenddne", false);
+        TEST_FNMATCH_ONE("start*end", "startendstartend", true);
+        TEST_FNMATCH_ONE("start*end", "starten", false);
+        TEST_FNMATCH_ONE("*.conf", "arch.conf", true);
+        TEST_FNMATCH_ONE("debian-*.conf", "debian-wheezy.conf", true);
+        TEST_FNMATCH_ONE("debian-*.*", "debian-wheezy.efi", true);
+        TEST_FNMATCH_ONE("ab*cde", "abzcd", false);
+        TEST_FNMATCH_ONE("\\*\\a\\[", "*a[", true);
+        TEST_FNMATCH_ONE("[abc] [abc] [abc]", "a b c", true);
+        TEST_FNMATCH_ONE("abc]", "abc]", true);
+        TEST_FNMATCH_ONE("[abc]", "z", false);
+        TEST_FNMATCH_ONE("[abc", "a", false);
+        TEST_FNMATCH_ONE("[][!] [][!] [][!]", "[ ] !", true);
+        TEST_FNMATCH_ONE("[]-] []-]", "] -", true);
+        TEST_FNMATCH_ONE("[1\\]] [1\\]]", "1 ]", true);
+        TEST_FNMATCH_ONE("[$-\\+]", "&", true);
+        TEST_FNMATCH_ONE("[1-3A-C] [1-3A-C]", "2 B", true);
+        TEST_FNMATCH_ONE("[3-5] [3-5] [3-5]", "3 4 5", true);
+        TEST_FNMATCH_ONE("[f-h] [f-h] [f-h]", "f g h", true);
+        TEST_FNMATCH_ONE("[a-c-f] [a-c-f] [a-c-f] [a-c-f] [a-c-f]", "a b c - f", true);
+        TEST_FNMATCH_ONE("[a-c-f]", "e", false);
+        TEST_FNMATCH_ONE("[--0] [--0] [--0]", "- . 0", true);
+        TEST_FNMATCH_ONE("[+--] [+--] [+--]", "+ , -", true);
+        TEST_FNMATCH_ONE("[f-l]", "m", false);
+        TEST_FNMATCH_ONE("[b]", "z-a", false);
+        TEST_FNMATCH_ONE("[a\\-z]", "b", false);
+        TEST_FNMATCH_ONE("?a*b[.-0]c", "/a/b/c", true);
+        TEST_FNMATCH_ONE("debian-*-*-*.*", "debian-jessie-2018-06-17-kernel-image-5.10.0-16-amd64.efi", true);
+
+        /* These would take forever with a backtracking implementation. */
+        TEST_FNMATCH_ONE(
+                        "a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*q*r*s*t*u*v*w*x*y*z*",
+                        "aaaabbbbccccddddeeeeffffgggghhhhiiiijjjjkkkkllllmmmmnnnnooooppppqqqqrrrrssssttttuuuuvvvvwwwwxxxxyyyy",
+                        false);
+        TEST_FNMATCH_ONE(
+                        "a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*q*r*s*t*u*v*w*x*y*z*",
+                        "aaaabbbbccccddddeeeeffffgggghhhhiiiijjjjkkkkllllmmmmnnnnooooppppqqqqrrrrssssttttuuuuvvvvwwwwxxxxyyyyzzzz!!!!",
+                        true);
+}
+
+TEST(parse_number8) {
+        uint64_t u;
+        const char *tail;
+        /* Rejected: NULL, empty, non-digits (trailing ones too when no tail is requested), sign, overflow. */
+        assert_se(!parse_number8(NULL, &u, NULL));
+        assert_se(!parse_number8("", &u, NULL));
+        assert_se(!parse_number8("a1", &u, NULL));
+        assert_se(!parse_number8("1a", &u, NULL));
+        assert_se(!parse_number8("-42", &u, NULL));
+        assert_se(!parse_number8("18446744073709551616", &u, NULL));
+        /* Accepted: plain decimal up to UINT64_MAX; with a tail pointer the unparsed rest is handed back. */
+        assert_se(parse_number8("0", &u, NULL));
+        assert_se(u == 0);
+        assert_se(parse_number8("1", &u, NULL));
+        assert_se(u == 1);
+        assert_se(parse_number8("999", &u, NULL));
+        assert_se(u == 999);
+        assert_se(parse_number8("18446744073709551615", &u, NULL));
+        assert_se(u == UINT64_MAX);
+        assert_se(parse_number8("42", &u, &tail));
+        assert_se(u == 42);
+        assert_se(streq8(tail, ""));
+        assert_se(parse_number8("54321rest", &u, &tail));
+        assert_se(u == 54321);
+        assert_se(streq8(tail, "rest"));
+}
+
+TEST(parse_number16) {
+        uint64_t u;
+        const char16_t *tail;
+        /* Rejected: NULL, empty, non-digits (trailing ones too when no tail is requested), sign, overflow. */
+        assert_se(!parse_number16(NULL, &u, NULL));
+        assert_se(!parse_number16(u"", &u, NULL));
+        assert_se(!parse_number16(u"a1", &u, NULL));
+        assert_se(!parse_number16(u"1a", &u, NULL));
+        assert_se(!parse_number16(u"-42", &u, NULL));
+        assert_se(!parse_number16(u"18446744073709551616", &u, NULL));
+        /* Accepted: plain decimal up to UINT64_MAX; with a tail pointer the unparsed rest is handed back. */
+        assert_se(parse_number16(u"0", &u, NULL));
+        assert_se(u == 0);
+        assert_se(parse_number16(u"1", &u, NULL));
+        assert_se(u == 1);
+        assert_se(parse_number16(u"999", &u, NULL));
+        assert_se(u == 999);
+        assert_se(parse_number16(u"18446744073709551615", &u, NULL));
+        assert_se(u == UINT64_MAX);
+        assert_se(parse_number16(u"42", &u, &tail));
+        assert_se(u == 42);
+        assert_se(streq16(tail, u""));
+        assert_se(parse_number16(u"54321rest", &u, &tail));
+        assert_se(u == 54321);
+        assert_se(streq16(tail, u"rest"));
+}
+
+TEST(parse_boolean) {
+        /* Table-driven check of parse_boolean(): every spelling listed below must either be rejected,
+         * be parsed as true, or be parsed as false. */
+        static const char * const rejected[] = { NULL, "", "ja" };
+        static const char * const truthy[] = { "1", "y", "yes", "t", "true", "on" };
+        static const char * const falsy[] = { "0", "n", "no", "f", "false", "off" };
+        bool parsed;
+
+        /* The three groups are exercised in this order: invalid input first, then the true
+         * spellings, then the false ones. */
+        for (size_t i = 0; i < sizeof(rejected) / sizeof(rejected[0]); i++)
+                assert_se(!parse_boolean(rejected[i], &parsed));
+
+        for (size_t i = 0; i < sizeof(truthy) / sizeof(truthy[0]); i++)
+                assert_se(parse_boolean(truthy[i], &parsed) && parsed == true);
+
+        for (size_t i = 0; i < sizeof(falsy) / sizeof(falsy[0]); i++)
+                assert_se(parse_boolean(falsy[i], &parsed) && parsed == false);
+}
+
+TEST(line_get_key_value) { /* Walks two multi-line buffers pair by pair, with "=" and with blank separators. */
+        char s1[] = "key=value\n"
+                    " \t  # comment line \n"
+                    "k-e-y=\"quoted value\"\n\r"
+                    "  wrong= 'quotes' \n"
+                    "odd= stripping  # with comments  ";
+        char s2[] = "this parser\n"
+                    "\t\t\t# is\t\r"
+                    "  also\tused  \r\n"
+                    "for \"the conf\"\n"
+                    "format\t !!";
+        size_t pos = 0;
+        char *key, *value;
+        /* pos is the offset into the buffer; it is reset to 0 before each buffer is walked. */
+        assert_se(!line_get_key_value((char[]){ "" }, "=", &pos, &key, &value));
+        assert_se(!line_get_key_value((char[]){ "\t" }, " \t", &pos, &key, &value));
+
+        pos = 0;
+        assert_se(line_get_key_value(s1, "=", &pos, &key, &value));
+        assert_se(streq8(key, "key"));
+        assert_se(streq8(value, "value"));
+        assert_se(line_get_key_value(s1, "=", &pos, &key, &value));
+        assert_se(streq8(key, "k-e-y"));
+        assert_se(streq8(value, "quoted value"));
+        assert_se(line_get_key_value(s1, "=", &pos, &key, &value));
+        assert_se(streq8(key, "wrong"));
+        assert_se(streq8(value, " 'quotes'"));
+        assert_se(line_get_key_value(s1, "=", &pos, &key, &value));
+        assert_se(streq8(key, "odd"));
+        assert_se(streq8(value, " stripping  # with comments"));
+        assert_se(!line_get_key_value(s1, "=", &pos, &key, &value));
+
+        pos = 0;
+        assert_se(line_get_key_value(s2, " \t", &pos, &key, &value));
+        assert_se(streq8(key, "this"));
+        assert_se(streq8(value, "parser"));
+        assert_se(line_get_key_value(s2, " \t", &pos, &key, &value));
+        assert_se(streq8(key, "also"));
+        assert_se(streq8(value, "used"));
+        assert_se(line_get_key_value(s2, " \t", &pos, &key, &value));
+        assert_se(streq8(key, "for"));
+        assert_se(streq8(value, "the conf"));
+        assert_se(line_get_key_value(s2, " \t", &pos, &key, &value));
+        assert_se(streq8(key, "format"));
+        assert_se(streq8(value, "!!"));
+        assert_se(!line_get_key_value(s2, " \t", &pos, &key, &value));
+
+        /* Let's make sure we don't fail on real os-release data. */
+        _cleanup_free_ char *osrel = NULL;
+        if (read_full_file("/usr/lib/os-release", &osrel, NULL) >= 0) {
+                pos = 0;
+                while (line_get_key_value(osrel, "=", &pos, &key, &value)) {
+                        assert_se(key);
+                        assert_se(value);
+                        printf("%s = %s\n", key, value);
+                }
+        }
+}
+
+TEST(hexdump) {
+        char16_t *hex;
+        /* Each input byte becomes two lowercase hex digits; checked with assert_se() like every other test here. */
+        hex = hexdump(NULL, 0);
+        assert_se(streq16(hex, u""));
+        free(hex);
+
+        hex = hexdump("1", 2);
+        assert_se(streq16(hex, u"3100"));
+        free(hex);
+
+        hex = hexdump("abc", 4);
+        assert_se(streq16(hex, u"61626300"));
+        free(hex);
+
+        hex = hexdump((uint8_t[]){ 0x0, 0x42, 0xFF, 0xF1, 0x1F }, 5);
+        assert_se(streq16(hex, u"0042fff11f"));
+        free(hex);
+}
+
+_printf_(1, 2) static void test_printf_one(const char *format, ...) {
+        /* Formats the same arguments with libc vasprintf() and with xvasprintf_status(), then asserts
+         * that both outputs agree element by element, including the terminating NUL. */
+        va_list args_libc, args_efi;
+        va_start(args_libc, format);
+        va_copy(args_efi, args_libc);
+
+        _cleanup_free_ char *expected = NULL;
+        int len = vasprintf(&expected, format, args_libc);
+        assert_se(len >= 0);
+        log_info("/* %s(%s) -> \"%.100s\" */", __func__, format, expected);
+
+        _cleanup_free_ char16_t *actual = xvasprintf_status(0, format, args_efi);
+        /* No early exit: the EFI output is copied over the libc buffer so it can be logged below. */
+        bool same = true;
+        for (size_t pos = 0; pos < (size_t) len + 1; pos++) {
+                same = same && expected[pos] == actual[pos];
+                expected[pos] = actual[pos];
+        }
+
+        log_info("%.100s", expected);
+        assert_se(same);
+        va_end(args_libc);
+        va_end(args_efi);
+}
+
+TEST(xvasprintf_status) { /* First compares against libc printf, then checks the non-printf behavior. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-zero-length"
+        test_printf_one("");
+#pragma GCC diagnostic pop
+        test_printf_one("string");
+        test_printf_one("%%-%%%%");
+
+        test_printf_one("%p %p %32p %*p %*p", NULL, (void *) 0xF, &errno, 0, &saved_argc, 20, &saved_argv);
+        test_printf_one("%-10p %-32p %-*p %-*p", NULL, &errno, 0, &saved_argc, 20, &saved_argv);
+
+        test_printf_one("%c %3c %*c %*c %-8c", '1', '!', 0, 'a', 9, '_', '>');
+
+        test_printf_one("%s %s %s", "012345", "6789", "ab");
+        test_printf_one("%.4s %.4s %.4s %.0s", "cdefgh", "ijkl", "mn", "@");
+        test_printf_one("%8s %8s %8s", "opqrst", "uvwx", "yz");
+        test_printf_one("%8.4s %8.4s %8.4s %8.0s", "ABCDEF", "GHIJ", "KL", "$");
+        test_printf_one("%4.8s %4.8s %4.8s", "ABCDEFGHIJ", "ABCDEFGH", "ABCD");
+
+        test_printf_one("%.*s %.*s %.*s %.*s", 4, "012345", 4, "6789", 4, "ab", 0, "&");
+        test_printf_one("%*s %*s %*s", 8, "cdefgh", 8, "ijkl", 8, "mn");
+        test_printf_one("%*.*s %*.*s %*.*s %*.*s", 8, 4, "opqrst", 8, 4, "uvwx", 8, 4, "yz", 8, 0, "#");
+        test_printf_one("%*.*s %*.*s %*.*s", 3, 8, "OPQRST", 3, 8, "UVWX", 3, 8, "YZ");
+
+        test_printf_one("%ls %ls %ls", L"012345", L"6789", L"ab");
+        test_printf_one("%.4ls %.4ls %.4ls %.0ls", L"cdefgh", L"ijkl", L"mn", L"@");
+        test_printf_one("%8ls %8ls %8ls", L"opqrst", L"uvwx", L"yz");
+        test_printf_one("%8.4ls %8.4ls %8.4ls %8.0ls", L"ABCDEF", L"GHIJ", L"KL", L"$");
+        test_printf_one("%4.8ls %4.8ls %4.8ls", L"ABCDEFGHIJ", L"ABCDEFGH", L"ABCD");
+
+        test_printf_one("%.*ls %.*ls %.*ls %.*ls", 4, L"012345", 4, L"6789", 4, L"ab", 0, L"&");
+        test_printf_one("%*ls %*ls %*ls", 8, L"cdefgh", 8, L"ijkl", 8, L"mn");
+        test_printf_one("%*.*ls %*.*ls %*.*ls %*.*ls", 8, 4, L"opqrst", 8, 4, L"uvwx", 8, 4, L"yz", 8, 0, L"#");
+        test_printf_one("%*.*ls %*.*ls %*.*ls", 3, 8, L"OPQRST", 3, 8, L"UVWX", 3, 8, L"YZ");
+
+        test_printf_one("%-14s %-10.0s %-10.3s", "left", "", "chopped");
+        test_printf_one("%-14ls %-10.0ls %-10.3ls", L"left", L"", L"chopped");
+
+        test_printf_one("%.6s", (char[]){ 'n', 'o', ' ', 'n', 'u', 'l' });
+        test_printf_one("%.6ls", (wchar_t[]){ 'n', 'o', ' ', 'n', 'u', 'l' });
+
+        test_printf_one("%u %u %u", 0U, 42U, 1234567890U);
+        test_printf_one("%i %i %i", 0, -42, -1234567890);
+        test_printf_one("%x %x %x", 0x0U, 0x42U, 0x123ABCU);
+        test_printf_one("%X %X %X", 0x1U, 0x43U, 0x234BCDU);
+        test_printf_one("%#x %#x %#x", 0x2U, 0x44U, 0x345CDEU);
+        test_printf_one("%#X %#X %#X", 0x3U, 0x45U, 0x456FEDU);
+
+        test_printf_one("%u %lu %llu %zu", UINT_MAX, ULONG_MAX, ULLONG_MAX, SIZE_MAX);
+        test_printf_one("%i %i %zi", INT_MIN, INT_MAX, SSIZE_MAX);
+        test_printf_one("%li %li %lli %lli", LONG_MIN, LONG_MAX, LLONG_MIN, LLONG_MAX);
+        test_printf_one("%x %#lx %llx %#zx", UINT_MAX, ULONG_MAX, ULLONG_MAX, SIZE_MAX);
+        test_printf_one("%X %#lX %llX %#zX", UINT_MAX, ULONG_MAX, ULLONG_MAX, SIZE_MAX);
+        test_printf_one("%ju %ji %ji", UINTMAX_MAX, INTMAX_MIN, INTMAX_MAX);
+        test_printf_one("%ti %ti", PTRDIFF_MIN, PTRDIFF_MAX);
+
+        test_printf_one("%" PRIu32 " %" PRIi32 " %" PRIi32, UINT32_MAX, INT32_MIN, INT32_MAX);
+        test_printf_one("%" PRIx32 " %" PRIX32, UINT32_MAX, UINT32_MAX);
+        test_printf_one("%#" PRIx32 " %#" PRIX32, UINT32_MAX, UINT32_MAX);
+
+        test_printf_one("%" PRIu64 " %" PRIi64 " %" PRIi64, UINT64_MAX, INT64_MIN, INT64_MAX);
+        test_printf_one("%" PRIx64 " %" PRIX64, UINT64_MAX, UINT64_MAX);
+        test_printf_one("%#" PRIx64 " %#" PRIX64, UINT64_MAX, UINT64_MAX);
+
+        test_printf_one("%.11u %.11i %.11x %.11X %#.11x %#.11X", 1U, -2, 3U, 0xA1U, 0xB2U, 0xC3U);
+        test_printf_one("%13u %13i %13x %13X %#13x %#13X", 4U, -5, 6U, 0xD4U, 0xE5U, 0xF6U);
+        test_printf_one("%9.5u %9.5i %9.5x %9.5X %#9.5x %#9.5X", 7U, -8, 9U, 0xA9U, 0xB8U, 0xC7U);
+        test_printf_one("%09u %09i %09x %09X %#09x %#09X", 4U, -5, 6U, 0xD6U, 0xE5U, 0xF4U);
+
+        test_printf_one("%*u %.*u %*i %.*i", 15, 42U, 15, 43U, 15, -42, 15, -43);
+        test_printf_one("%*.*u %*.*i", 14, 10, 13U, 14, 10, -14);
+        test_printf_one("%*x %*X %.*x %.*X", 15, 0x1AU, 15, 0x2BU, 15, 0x3CU, 15, 0x4DU);
+        test_printf_one("%#*x %#*X %#.*x %#.*X", 15, 0xA1U, 15, 0xB2U, 15, 0xC3U, 15, 0xD4U);
+        test_printf_one("%*.*x %*.*X", 14, 10, 0x1AU, 14, 10, 0x2BU);
+        test_printf_one("%#*.*x %#*.*X", 14, 10, 0x3CU, 14, 10, 0x4DU);
+
+        test_printf_one("%+.5i %+.5i % .7i % .7i", -15, 51, -15, 51);
+        test_printf_one("%+5.i %+5.i % 7.i % 7.i", -15, 51, -15, 51);
+
+        test_printf_one("%-10u %-10i %-10x %#-10X %- 10i", 1u, -2, 0xA2D2u, 0XB3F4u, -512);
+        test_printf_one("%-10.6u %-10.6i %-10.6x %#-10.6X %- 10.6i", 1u, -2, 0xA2D2u, 0XB3F4u, -512);
+        test_printf_one("%-6.10u %-6.10i %-6.10x %#-6.10X %- 6.10i", 3u, -4, 0x2A2Du, 0X3B4Fu, -215);
+        test_printf_one("%*.u %.*i %.*i", -4, 9u, -4, 8, -4, -6);
+
+        test_printf_one("%.0u %.0i %.0x %.0X", 0u, 0, 0u, 0u);
+        test_printf_one("%.*u %.*i %.*x %.*X", 0, 0u, 0, 0, 0, 0u, 0, 0u);
+        test_printf_one("%*u %*i %*x %*X", -1, 0u, -1, 0, -1, 0u, -1, 0u);
+
+        test_printf_one("%*s%*s%*s", 256, "", 256, "", 4096, ""); /* Test buf growing. */
+        test_printf_one("%0*i%0*i%0*i", 256, 0, 256, 0, 4096, 0); /* Test buf growing. */
+        test_printf_one("%0*i", INT16_MAX, 0); /* Poor programmer's memzero. */
+
+        /* Non printf-compatible behavior tests below. */
+        char16_t *s;
+
+        assert_se(s = xasprintf_status(0, "\n \r \r\n"));
+        assert_se(streq16(s, u"\r\n \r \r\r\n"));
+        s = mfree(s);
+
+        assert_se(s = xasprintf_status(EFI_SUCCESS, "%m"));
+        assert_se(streq16(s, u"Success"));
+        s = mfree(s);
+
+        assert_se(s = xasprintf_status(EFI_SUCCESS, "%15m"));
+        assert_se(streq16(s, u"        Success"));
+        s = mfree(s);
+
+        assert_se(s = xasprintf_status(EFI_LOAD_ERROR, "%m"));
+        assert_se(streq16(s, u"Load error"));
+        s = mfree(s);
+
+        assert_se(s = xasprintf_status(0x42, "%m"));
+        assert_se(streq16(s, u"0x42"));
+        s = mfree(s);
+}
+
+TEST(efi_memchr) { /* Hits, misses, the length limit, and a NULL buffer with length 0. */
+        assert_se(streq8(efi_memchr("abcde", 'c', 5), "cde"));
+        assert_se(streq8(efi_memchr("abcde", 'c', 3), "cde"));
+        assert_se(streq8(efi_memchr("abcde", 'c', 2), NULL));
+        assert_se(streq8(efi_memchr("abcde", 'c', 7), "cde"));
+        assert_se(streq8(efi_memchr("abcde", 'q', 5), NULL));
+        assert_se(streq8(efi_memchr("abcde", 'q', 0), NULL));
+        /* Test that the character is interpreted as unsigned char. */
+        assert_se(streq8(efi_memchr("abcde", 'a', 6), efi_memchr("abcde", 'a' + 0x100, 6)));
+        assert_se(streq8(efi_memchr("abcde", 0, 6), ""));
+        assert_se(efi_memchr(NULL, 0, 0) == NULL);
+}
+
+TEST(efi_memcmp) { /* Expects NULL to sort before a buffer, and bytes after an embedded NUL to still count. */
+        assert_se(efi_memcmp(NULL, NULL, 0) == 0);
+        assert_se(efi_memcmp(NULL, NULL, 1) == 0);
+        assert_se(efi_memcmp(NULL, "", 1) < 0);
+        assert_se(efi_memcmp("", NULL, 1) > 0);
+        assert_se(efi_memcmp("", "", 0) == 0);
+        assert_se(efi_memcmp("", "", 1) == 0);
+        assert_se(efi_memcmp("1", "1", 1) == 0);
+        assert_se(efi_memcmp("1", "2", 1) < 0);
+        assert_se(efi_memcmp("A", "a", 1) < 0);
+        assert_se(efi_memcmp("a", "A", 1) > 0);
+        assert_se(efi_memcmp("abc", "ab", 2) == 0);
+        assert_se(efi_memcmp("ab", "abc", 3) < 0);
+        assert_se(efi_memcmp("abc", "ab", 3) > 0);
+        assert_se(efi_memcmp("ab\000bd", "ab\000bd", 6) == 0);
+        assert_se(efi_memcmp("ab\000b\0", "ab\000bd", 6) < 0);
+}
+
+TEST(efi_memcpy) {
+        char buf[10];
+        /* A NULL destination yields NULL; a NULL source or a zero length still returns the destination. */
+        assert_se(!efi_memcpy(NULL, NULL, 0));
+        assert_se(!efi_memcpy(NULL, "", 1));
+        assert_se(efi_memcpy(buf, NULL, 0) == buf);
+        assert_se(efi_memcpy(buf, NULL, 1) == buf);
+        assert_se(efi_memcpy(buf, "a", 0) == buf);
+        /* Real copies, verified with libc memcmp(); the last one contains embedded NULs. */
+        assert_se(efi_memcpy(buf, "", 1) == buf);
+        assert_se(memcmp(buf, "", 1) == 0);
+        assert_se(efi_memcpy(buf, "1", 1) == buf);
+        assert_se(memcmp(buf, "1", 1) == 0);
+        assert_se(efi_memcpy(buf, "23", 3) == buf);
+        assert_se(memcmp(buf, "23", 3) == 0);
+        assert_se(efi_memcpy(buf, "45\0ab\0\0\0c", 9) == buf);
+        assert_se(memcmp(buf, "45\0ab\0\0\0c", 9) == 0);
+}
+
+TEST(efi_memset) {
+        char buf[10];
+        /* A NULL destination yields NULL; a zero length returns the destination. */
+        assert_se(!efi_memset(NULL, '1', 0));
+        assert_se(!efi_memset(NULL, '1', 1));
+        assert_se(efi_memset(buf, '1', 0) == buf);
+        /* Fills of growing length, up to the whole buffer, verified with libc memcmp(). */
+        assert_se(efi_memset(buf, '2', 1) == buf);
+        assert_se(memcmp(buf, "2", 1) == 0);
+        assert_se(efi_memset(buf, '4', 4) == buf);
+        assert_se(memcmp(buf, "4444", 4) == 0);
+        assert_se(efi_memset(buf, 'a', 10) == buf);
+        assert_se(memcmp(buf, "aaaaaaaaaa", 10) == 0);
+}
+
+DEFINE_TEST_MAIN(LOG_INFO); /* presumably emits main() running the TEST() cases at LOG_INFO level; confirm in tests.h */
diff --git a/src/boot/efi/ticks.c b/src/boot/efi/ticks.c
new file mode 100644
index 0000000..873b9fe
--- /dev/null
+++ b/src/boot/efi/ticks.c
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "ticks.h"
+#include "util.h"
+#include "vmm.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#  include 
+
+static uint64_t ticks_read_arch(void) {
+        /* Returns the current TSC value, or 0 when running under a hypervisor.
+         *
+         * Depending on hypervisor and CPU functionality the TSC might or might not be virtualized in
+         * VMs. When it isn't, it is neither accurate nor guaranteed to start at zero at boot, so it is
+         * of no use for keeping time and we report "no ticks" instead. */
+
+        return in_hypervisor() ? 0 : __builtin_ia32_rdtsc();
+}
+
+static uint64_t ticks_freq_arch(void) {
+        /* Detect TSC frequency from CPUID information if available. Returns the frequency in Hz, or 0
+         * if CPUID cannot provide it (ticks_freq() then falls back to measuring). */
+        unsigned max_leaf, ebx, ecx, edx;
+        if (__get_cpuid(0, &max_leaf, &ebx, &ecx, &edx) == 0)
+                return 0;
+
+        /* Leaf 0x15 is Intel only. */
+        if (max_leaf < 0x15 || ebx != signature_INTEL_ebx || ecx != signature_INTEL_ecx ||
+            edx != signature_INTEL_edx)
+                return 0;
+        /* Leaf 0x15: EAX/EBX hold denominator/numerator of the TSC-to-crystal ratio, ECX the crystal clock in Hz. */
+        unsigned denominator, numerator, crystal_hz;
+        __cpuid(0x15, denominator, numerator, crystal_hz, edx);
+        if (denominator == 0 || numerator == 0)
+                return 0;
+
+        uint64_t freq = crystal_hz;
+        if (crystal_hz == 0) {
+                /* If the crystal frequency is not available, try to deduce it from
+                 * the processor frequency leaf if available. */
+                if (max_leaf < 0x16)
+                        return 0;
+                /* Leaf 0x16: EAX is read as a core frequency in MHz; the ratio is inverted to get the crystal clock. */
+                unsigned core_mhz;
+                __cpuid(0x16, core_mhz, ebx, ecx, edx);
+                freq = core_mhz * 1000ULL * 1000ULL * denominator / numerator;
+        }
+        /* TSC frequency = crystal clock * numerator / denominator. */
+        return freq * numerator / denominator;
+}
+
+#elif defined(__aarch64__)
+
+static uint64_t ticks_read_arch(void) { /* Reads the current value of the CNTVCT_EL0 system register. */
+        uint64_t val;
+        asm volatile("mrs %0, cntvct_el0" : "=r"(val));
+        return val;
+}
+
+static uint64_t ticks_freq_arch(void) { /* Reads the CNTFRQ_EL0 system register, used as the tick frequency. */
+        uint64_t freq;
+        asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
+        return freq;
+}
+
+#else
+
+static uint64_t ticks_read_arch(void) { /* No tick counter on other architectures. */
+        return 0; /* time_usec() treats 0 as "no time available" */
+}
+
+static uint64_t ticks_freq_arch(void) { /* No known tick frequency on other architectures. */
+        return 0; /* ticks_freq() treats 0 as "unknown" and tries to measure instead */
+}
+
+#endif
+
+static uint64_t ticks_freq(void) {
+        /* Returns the tick counter frequency in Hz, or 0 if it cannot be determined. A non-zero result
+         * is remembered, so detection and calibration are repeated only until they first succeed. */
+        static uint64_t known_hz = 0;
+
+        if (known_hz == 0)
+                known_hz = ticks_freq_arch();
+        if (known_hz != 0)
+                return known_hz;
+
+        /* As a fallback, count ticks during a millisecond delay. */
+        uint64_t before = ticks_read_arch();
+        BS->Stall(1000);
+        uint64_t after = ticks_read_arch();
+
+        /* Counter wrapped (not unlikely where the value is only 32-bit): give up without caching. */
+        if (after < before)
+                return 0;
+
+        known_hz = (after - before) * 1000UL;
+        return known_hz;
+}
+
+uint64_t time_usec(void) {
+        /* Current tick counter converted to microseconds; 0 if no counter or frequency is available. */
+        uint64_t ticks = ticks_read_arch();
+        if (ticks == 0)
+                return 0;
+        uint64_t freq = ticks_freq();
+        if (freq == 0)
+                return 0;
+        /* Whole seconds + remainder: same result as 1000000 * ticks / freq, but cannot wrap at ~1.8e13 ticks. */
+        return ticks / freq * 1000000U + ticks % freq * 1000000U / freq;
+}
diff --git a/src/boot/efi/ticks.h b/src/boot/efi/ticks.h
new file mode 100644
index 0000000..fec3764
--- /dev/null
+++ b/src/boot/efi/ticks.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+uint64_t time_usec(void); /* Tick counter converted to microseconds; 0 if no usable counter or frequency. */
diff --git a/src/boot/efi/ubsan.c b/src/boot/efi/ubsan.c
new file mode 100644
index 0000000..9512046
--- /dev/null
+++ b/src/boot/efi/ubsan.c
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "log.h"
+
+typedef struct { /* Source position that the handlers defined via HANDLER() print. */
+        const char *filename; /* logged with %s */
+        uint32_t line;        /* logged with %u */
+        uint32_t column;      /* logged with %u */
+} SourceLocation;
+
+/* Note that all ubsan handlers have a pointer to a type-specific struct passed as first argument.
+ * Since we do not inspect the extra data in it we can just treat it as a SourceLocation struct
+ * directly to keep things simple. */
+
+#define HANDLER(name, ...) /* Declares and defines __ubsan_handle_<name>; needs a parameter called location. */ \
+        _used_ _noreturn_ void __ubsan_handle_##name(__VA_ARGS__); \
+        void __ubsan_handle_##name(__VA_ARGS__) {                  \
+                log_error("systemd-boot: %s in %s@%u:%u",          \
+                          __func__,                                \
+                          location->filename,                      \
+                          location->line,                          \
+                          location->column);                       \
+                freeze(); /* declared _noreturn_ above */          \
+        }
+
+#define UNARY_HANDLER(name) HANDLER(name, SourceLocation *location, uintptr_t v) /* one extra argument, not inspected */
+#define BINARY_HANDLER(name) HANDLER(name, SourceLocation *location, uintptr_t v1, uintptr_t v2) /* two, not inspected */
+
+UNARY_HANDLER(load_invalid_value); /* each line in this file-scope list defines one __ubsan_handle_* function */
+UNARY_HANDLER(negate_overflow);
+UNARY_HANDLER(out_of_bounds);
+UNARY_HANDLER(type_mismatch_v1);
+UNARY_HANDLER(vla_bound_not_positive);
+
+BINARY_HANDLER(add_overflow);
+BINARY_HANDLER(divrem_overflow);
+BINARY_HANDLER(implicit_conversion);
+BINARY_HANDLER(mul_overflow);
+BINARY_HANDLER(pointer_overflow);
+BINARY_HANDLER(shift_out_of_bounds);
+BINARY_HANDLER(sub_overflow);
+
+HANDLER(builtin_unreachable, SourceLocation *location); /* these take the location only */
+HANDLER(invalid_builtin, SourceLocation *location);
+HANDLER(nonnull_arg, SourceLocation *location);
+HANDLER(nonnull_return_v1, SourceLocation *attr_location, SourceLocation *location); /* logs the second argument */
diff --git a/src/boot/efi/util.c b/src/boot/efi/util.c
new file mode 100644
index 0000000..e56ccfd
--- /dev/null
+++ b/src/boot/efi/util.c
@@ -0,0 +1,705 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "device-path-util.h"
+#include "memory-util-fundamental.h"
+#include "proto/device-path.h"
+#include "proto/simple-text-io.h"
+#include "ticks.h"
+#include "util.h"
+#include "version.h"
+
+EFI_STATUS efivar_set_raw(const EFI_GUID *vendor, const char16_t *name, const void *buf, size_t size, uint32_t flags) {
+        /* Writes size bytes from buf to the variable; boot-service and runtime access are always set. */
+        assert(vendor);
+        assert(name);
+        assert(buf || size == 0);
+        uint32_t attrs = flags | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS;
+        return RT->SetVariable((char16_t *) name, (EFI_GUID *) vendor, attrs, size, (void *) buf);
+}
+
+EFI_STATUS efivar_set(const EFI_GUID *vendor, const char16_t *name, const char16_t *value, uint32_t flags) {
+        assert(vendor);
+        assert(name);
+        size_t n_bytes = value ? strsize16(value) : 0; /* a NULL value is written as zero bytes */
+        return efivar_set_raw(vendor, name, value, n_bytes, flags);
+}
+
+EFI_STATUS efivar_set_uint_string(const EFI_GUID *vendor, const char16_t *name, size_t i, uint32_t flags) {
+        assert(vendor);
+        assert(name);
+        /* The number is stored as text, formatted with "%zu", via efivar_set(). */
+        _cleanup_free_ char16_t *str = xasprintf("%zu", i);
+        return efivar_set(vendor, name, str, flags);
+}
+
+EFI_STATUS efivar_set_uint32_le(const EFI_GUID *vendor, const char16_t *name, uint32_t value, uint32_t flags) {
+        /* Stores value as exactly four bytes, least significant byte first, independent of the byte
+         * order of the machine we run on. */
+        uint8_t le[sizeof(uint32_t)];
+
+        assert(vendor);
+        assert(name);
+
+        for (size_t i = 0; i < sizeof(le); i++)
+                le[i] = (uint8_t) ((value >> (8U * i)) & 0xFF);
+
+        return efivar_set_raw(vendor, name, le, sizeof(le), flags);
+}
+
+EFI_STATUS efivar_set_uint64_le(const EFI_GUID *vendor, const char16_t *name, uint64_t value, uint32_t flags) {
+        /* Stores value as exactly eight bytes, least significant byte first, independent of the byte
+         * order of the machine we run on. The attribute flags are handed on to efivar_set_raw()
+         * unchanged. */
+        uint8_t le[sizeof(uint64_t)];
+
+        assert(vendor);
+        assert(name);
+
+        /* Byte i receives bits 8*i .. 8*i+7 of the value. The shift count stays below 64, so every
+         * shift is well defined. */
+        for (size_t i = 0; i < sizeof(le); i++)
+                le[i] = (uint8_t) ((value >> (8U * i)) & 0xFF);
+
+        return efivar_set_raw(vendor, name, le, sizeof(le), flags);
+}
+
+
+EFI_STATUS efivar_unset(const EFI_GUID *vendor, const char16_t *name, uint32_t flags) {
+        assert(vendor);
+        assert(name);
+
+        /* Clears the variable by writing an empty value, but only if it can currently be read.
+         *
+         * We could be wiping a non-volatile variable here and the spec makes no guarantees that this
+         * won't incur an extra write (and thus wear out). So check first and clear only if needed. */
+        EFI_STATUS err = efivar_get_raw(vendor, name, NULL, NULL);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        return efivar_set_raw(vendor, name, NULL, 0, flags);
+}
+
+EFI_STATUS efivar_get(const EFI_GUID *vendor, const char16_t *name, char16_t **ret) {
+        _cleanup_free_ char16_t *buf = NULL;
+        EFI_STATUS err;
+        char16_t *val;
+        size_t size;
+        /* Reads a variable as a char16_t string; the string stored in *ret is always NUL-terminated. */
+        assert(vendor);
+        assert(name);
+
+        err = efivar_get_raw(vendor, name, (char **) &buf, &size);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Make sure there are no incomplete characters in the buffer */
+        if ((size % sizeof(char16_t)) != 0)
+                return EFI_INVALID_PARAMETER;
+
+        if (!ret)
+                return EFI_SUCCESS;
+
+        /* Return buffer directly if it happens to be NUL terminated already */
+        if (size >= sizeof(char16_t) && buf[size / sizeof(char16_t) - 1] == 0) {
+                *ret = TAKE_PTR(buf);
+                return EFI_SUCCESS;
+        }
+
+        /* Make sure a terminating NUL is available at the end */
+        val = xmalloc(size + sizeof(char16_t));
+        /* The NUL goes into the extra element after the copied data, so the last character is kept. */
+        memcpy(val, buf, size);
+        val[size / sizeof(char16_t)] = 0; /* NUL terminate */
+
+        *ret = val;
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS efivar_get_uint_string(const EFI_GUID *vendor, const char16_t *name, size_t *ret) {
+        /* Parses the variable's string content as an unsigned number that has to fit into size_t. */
+        assert(vendor);
+        assert(name);
+
+        _cleanup_free_ char16_t *text = NULL;
+        EFI_STATUS err = efivar_get(vendor, name, &text);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        uint64_t parsed;
+        if (!parse_number16(text, &parsed, NULL))
+                return EFI_INVALID_PARAMETER;
+        if (parsed > SIZE_MAX)
+                return EFI_INVALID_PARAMETER;
+        if (ret)
+                *ret = parsed;
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS efivar_get_uint32_le(const EFI_GUID *vendor, const char16_t *name, uint32_t *ret) {
+        /* Reads a 4-byte variable as little-endian uint32_t; any other size yields EFI_BUFFER_TOO_SMALL. */
+        _cleanup_free_ char *buf = NULL;
+        size_t size;
+        EFI_STATUS err;
+
+        assert(vendor);
+        assert(name);
+
+        err = efivar_get_raw(vendor, name, &buf, &size);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (size != sizeof(uint32_t))
+                return EFI_BUFFER_TOO_SMALL;
+        /* Assemble from unsigned bytes: plain char may be signed, and sign extension would set high bits. */
+        const uint8_t *b = (const uint8_t *) buf;
+        if (ret)
+                *ret = (uint32_t) b[0] << 0U | (uint32_t) b[1] << 8U | (uint32_t) b[2] << 16U |
+                        (uint32_t) b[3] << 24U;
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS efivar_get_uint64_le(const EFI_GUID *vendor, const char16_t *name, uint64_t *ret) {
+        /* Reads an 8-byte variable as little-endian uint64_t; any other size yields EFI_BUFFER_TOO_SMALL. */
+        _cleanup_free_ char *buf = NULL;
+        size_t size;
+        EFI_STATUS err;
+
+        assert(vendor);
+        assert(name);
+
+        err = efivar_get_raw(vendor, name, &buf, &size);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (size != sizeof(uint64_t))
+                return EFI_BUFFER_TOO_SMALL;
+        /* Assemble from unsigned bytes: plain char may be signed, and sign extension would set high bits. */
+        const uint8_t *b = (const uint8_t *) buf;
+        if (ret)
+                *ret = (uint64_t) b[0] << 0U | (uint64_t) b[1] << 8U | (uint64_t) b[2] << 16U |
+                        (uint64_t) b[3] << 24U | (uint64_t) b[4] << 32U | (uint64_t) b[5] << 40U |
+                        (uint64_t) b[6] << 48U | (uint64_t) b[7] << 56U;
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS efivar_get_raw(const EFI_GUID *vendor, const char16_t *name, char **ret, size_t *ret_size) {
+        assert(vendor);
+        assert(name);
+
+        /* First call with an empty buffer: we only continue if it answers EFI_BUFFER_TOO_SMALL (and
+         * thereby reports the needed size). Any other status is passed on to the caller unchanged. */
+        size_t n_bytes = 0;
+        EFI_STATUS err = RT->GetVariable((char16_t *) name, (EFI_GUID *) vendor, NULL, &n_bytes, NULL);
+        if (err != EFI_BUFFER_TOO_SMALL)
+                return err;
+
+        _cleanup_free_ void *data = xmalloc(n_bytes);
+        err = RT->GetVariable((char16_t *) name, (EFI_GUID *) vendor, NULL, &n_bytes, data);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Both outputs are optional; the buffer is only handed out if the caller asked for it. */
+        if (ret_size)
+                *ret_size = n_bytes;
+        if (ret)
+                *ret = TAKE_PTR(data);
+        return EFI_SUCCESS;
+}
+
+EFI_STATUS efivar_get_boolean_u8(const EFI_GUID *vendor, const char16_t *name, bool *ret) {
+        /* Reads the first byte of a variable as a boolean: zero is false, anything else is true. */
+        _cleanup_free_ char *b = NULL;
+        size_t size;
+        EFI_STATUS err;
+
+        assert(vendor);
+        assert(name);
+
+        err = efivar_get_raw(vendor, name, &b, &size);
+        if (err != EFI_SUCCESS)
+                return err;
+        /* Compare as unsigned: where plain char is signed, bytes 0x80..0xFF would otherwise read as false. */
+        if (ret)
+                *ret = (uint8_t) *b != 0;
+        return EFI_SUCCESS;
+}
+
+void efivar_set_time_usec(const EFI_GUID *vendor, const char16_t *name, uint64_t usec) {
+        assert(vendor);
+        assert(name);
+
+        /* usec == 0 means "use time_usec()"; if that yields 0 as well, nothing is written. */
+        uint64_t stamp = usec != 0 ? usec : time_usec();
+        if (stamp == 0)
+                return;
+        /* The value is stored as a decimal string; the status of efivar_set() is not propagated. */
+        _cleanup_free_ char16_t *text = xasprintf("%" PRIu64, stamp);
+        efivar_set(vendor, name, text, 0);
+}
+
+void convert_efi_path(char16_t *path) {
+        /* Rewrites path in place: every '/' becomes '\', and runs of '\' collapse into a single one.
+         * The write index never overtakes the read index, so the result fits into the same buffer. */
+        assert(path);
+
+        size_t out = 0;
+        for (size_t in = 0;; in++) {
+                char16_t c = path[in] == '/' ? '\\' : path[in];
+                /* Double '\' is not allowed in EFI file paths: drop a separator that follows one. */
+                if (c == '\\' && out > 0 && path[out - 1] == '\\')
+                        continue;
+                path[out] = c;
+                if (c == '\0')
+                        break;
+                out++;
+        }
+}
+
+char16_t *xstr8_to_path(const char *str8) {
+        assert(str8);
+        char16_t *efi_path = xstr8_to_16(str8); /* char16_t string produced from the 8-bit input */
+        convert_efi_path(efi_path);             /* then normalize the separators in place */
+        return efi_path;
+}
+
+static bool shall_be_whitespace(char16_t c) {
+        return c <= 0x20U || c == 0x7FU; /* All control characters + space (note: this includes NUL) */
+}
+
+char16_t* mangle_stub_cmdline(char16_t *cmdline) {
+        /* Normalizes a command line in place: leading and trailing whitespace/control characters are
+         * dropped, inner ones become plain spaces. Returns cmdline itself; NULL is passed through. */
+        char16_t *p, *q, *e;
+        if (!cmdline)
+                return cmdline;
+
+        p = q = cmdline;
+
+        /* Skip initial whitespace. NUL counts as whitespace above, so stop at the terminator explicitly
+         * or an empty/all-whitespace string would make us run past the end of the buffer. */
+        while (*p != 0 && shall_be_whitespace(*p))
+                p++;
+
+        /* Turn inner control characters into proper spaces. e starts at the write position so that
+         * input without any visible character ends up as the empty string. */
+        for (e = q; *p != 0; p++) {
+                if (shall_be_whitespace(*p)) {
+                        *(q++) = ' ';
+                        continue;
+                }
+                *(q++) = *p;
+                e = q; /* remember last non-whitespace char */
+        }
+        *e = 0; /* Chop off trailing whitespace */
+        return cmdline;
+}
+
+/* Reads up to *size bytes from 'file' into 'buf', in pieces of at most 1 MiB. On success *size is
+ * updated to the number of bytes actually read, which is smaller than requested if the file ends
+ * early. On failure the error of the underlying Read() call is returned and *size is left untouched. */
+EFI_STATUS chunked_read(EFI_FILE *file, size_t *size, void *buf) {
+        assert(file);
+        assert(size);
+        assert(buf);
+
+        /* This is a drop-in replacement for EFI_FILE->Read() with the same API behavior.
+         * Some broken firmwares cannot handle large file reads and will instead return
+         * an error. As a workaround, read such files in small chunks.
+         * Note that we cannot just try reading the whole file first on such firmware as
+         * that will permanently break the handle even if it is re-opened.
+         *
+         * https://github.com/systemd/systemd/issues/25911 */
+
+        if (*size == 0)
+                return EFI_SUCCESS;
+
+        size_t wanted = *size, done = 0;
+
+        while (done < wanted) {
+                size_t n = MIN(1024U * 1024U, wanted - done);
+
+                EFI_STATUS err = file->Read(file, &n, (uint8_t *) buf + done);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                /* Caller requested more bytes than are in file. */
+                if (n == 0)
+                        break;
+
+                assert(n <= wanted - done);
+                done += n;
+        }
+
+        *size = done;
+        return EFI_SUCCESS;
+}
+
+/* Opens file 'name' relative to 'dir' and reads up to 'size' bytes starting at offset 'off' into a newly
+ * allocated buffer. If 'size' is zero the file size reported by get_file_info() is used instead. The
+ * buffer is returned in *ret (caller frees) and is always followed by enough zero bytes to be usable as
+ * NUL-terminated char or char16_t string. If 'ret_size' is non-NULL it receives the number of bytes
+ * actually read, not counting the terminator. Returns EFI_BAD_BUFFER_SIZE if the requested amount
+ * cannot be represented in size_t, otherwise the error of the failing firmware call. */
+EFI_STATUS file_read(EFI_FILE *dir, const char16_t *name, size_t off, size_t size, char **ret, size_t *ret_size) {
+        _cleanup_(file_closep) EFI_FILE *handle = NULL;
+        _cleanup_free_ char *buf = NULL;
+        EFI_STATUS err;
+
+        assert(dir);
+        assert(name);
+        assert(ret);
+
+        err = dir->Open(dir, &handle, (char16_t*) name, EFI_FILE_MODE_READ, 0ULL);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        if (size == 0) {
+                _cleanup_free_ EFI_FILE_INFO *info = NULL;
+
+                err = get_file_info(handle, &info, NULL);
+                if (err != EFI_SUCCESS)
+                        return err;
+
+                /* FileSize is assigned to a size_t below. Where size_t is narrower than FileSize,
+                 * refuse files that do not fit rather than silently reading a truncated amount. */
+                if (info->FileSize > SIZE_MAX)
+                        return EFI_BAD_BUFFER_SIZE;
+
+                size = info->FileSize;
+        }
+
+        if (off > 0) {
+                err = handle->SetPosition(handle, off);
+                if (err != EFI_SUCCESS)
+                        return err;
+        }
+
+        /* Allocate some extra bytes to guarantee the result is NUL-terminated for char and char16_t strings. */
+        size_t extra = size % sizeof(char16_t) + sizeof(char16_t);
+
+        /* Make sure the allocation size below cannot wrap around. */
+        if (size > SIZE_MAX - extra)
+                return EFI_BAD_BUFFER_SIZE;
+
+        buf = xmalloc(size + extra);
+        err = chunked_read(handle, &size, buf);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* Note that chunked_read() changes size to reflect the actual bytes read. */
+        memzero(buf + size, extra);
+
+        *ret = TAKE_PTR(buf);
+        if (ret_size)
+                *ret_size = size;
+
+        return EFI_SUCCESS;
+}
+
+/* Moves the console cursor to column 'x', row 'y', switches to text attribute 'attr' and then outputs
+ * 'str' through ST->ConOut. The attribute stays in effect afterwards; return values of the console
+ * calls are ignored. */
+void print_at(size_t x, size_t y, size_t attr, const char16_t *str) {
+        assert(str);
+        ST->ConOut->SetCursorPosition(ST->ConOut, x, y);
+        ST->ConOut->SetAttribute(ST->ConOut, attr);
+        /* OutputString() takes a non-const pointer, hence the cast. */
+        ST->ConOut->OutputString(ST->ConOut, (char16_t *) str);
+}
+
+/* Sets the console text attribute to 'attr' and clears the screen. log_wait() is called first, before
+ * anything on screen is wiped. */
+void clear_screen(size_t attr) {
+        log_wait();
+        ST->ConOut->SetAttribute(ST->ConOut, attr);
+        ST->ConOut->ClearScreen(ST->ConOut);
+}
+
+/* Sorts the 'n_members' pointers in 'array' in place into ascending order according to 'compare',
+ * using insertion sort. Elements that compare as equal keep their relative order. */
+void sort_pointer_array(
+                void **array,
+                size_t n_members,
+                compare_pointer_func_t compare) {
+
+        assert(array || n_members == 0);
+        assert(compare);
+
+        if (n_members <= 1)
+                return;
+
+        for (size_t i = 1; i < n_members; i++) {
+                void *pending = array[i];
+                size_t slot = i;
+
+                /* Shift every larger element of the already sorted prefix one position up, until
+                 * we find the place 'pending' belongs to. */
+                while (slot > 0 && compare(array[slot - 1], pending) > 0) {
+                        array[slot] = array[slot - 1];
+                        slot--;
+                }
+
+                array[slot] = pending;
+        }
+}
+
+/* Queries the EFI_FILE_INFO of 'handle' and returns it in a newly allocated buffer in *ret (caller
+ * frees). If 'ret_size' is non-NULL it receives the size GetInfo() reported for the structure. */
+EFI_STATUS get_file_info(EFI_FILE *handle, EFI_FILE_INFO **ret, size_t *ret_size) {
+        assert(handle);
+        assert(ret);
+
+        /* Start out with the minimal size, which suffices for short file names. */
+        size_t info_size = EFI_FILE_INFO_MIN_SIZE;
+        _cleanup_free_ EFI_FILE_INFO *info = xmalloc(info_size);
+
+        EFI_STATUS err = handle->GetInfo(handle, MAKE_GUID_PTR(EFI_FILE_INFO), &info_size, info);
+        if (err == EFI_BUFFER_TOO_SMALL) {
+                /* GetInfo tells us the required size, let's retry with a buffer of that size. */
+                free(info);
+                info = xmalloc(info_size);
+                err = handle->GetInfo(handle, MAKE_GUID_PTR(EFI_FILE_INFO), &info_size, info);
+        }
+        if (err != EFI_SUCCESS)
+                return err;
+
+        if (ret_size)
+                *ret_size = info_size;
+
+        *ret = TAKE_PTR(info);
+        return EFI_SUCCESS;
+}
+
+/* Reads the next entry of the directory 'handle' into *buffer, (re)allocating it as necessary. At the
+ * end of the directory the buffer is freed, *buffer is set to NULL and *buffer_size to 0, and
+ * EFI_SUCCESS is returned. */
+EFI_STATUS readdir(
+                EFI_FILE *handle,
+                EFI_FILE_INFO **buffer,
+                size_t *buffer_size) {
+
+        assert(handle);
+        assert(buffer);
+        assert(buffer_size);
+
+        /* buffer/buffer_size are both in and output parameters. Should be zero-initialized initially, and
+         * the specified buffer needs to be freed by caller, after final use. */
+
+        if (!*buffer) {
+                /* First call: start out with a buffer of the minimal size. */
+                *buffer_size = EFI_FILE_INFO_MIN_SIZE;
+                *buffer = xmalloc(*buffer_size);
+        }
+
+        size_t entry_size = *buffer_size;
+        EFI_STATUS err = handle->Read(handle, &entry_size, *buffer);
+        if (err == EFI_BUFFER_TOO_SMALL) {
+                /* Read() told us how much room this entry needs, retry with a buffer of that size. */
+                free(*buffer);
+                *buffer = xmalloc(entry_size);
+                *buffer_size = entry_size;
+                err = handle->Read(handle, &entry_size, *buffer);
+        }
+        if (err != EFI_SUCCESS)
+                return err;
+
+        if (entry_size > 0)
+                return EFI_SUCCESS;
+
+        /* End of directory */
+        free(*buffer);
+        *buffer = NULL;
+        *buffer_size = 0;
+
+        return EFI_SUCCESS;
+}
+
+/* Returns true if every character of the NUL-terminated string 'f' is in the range 0..127. A NULL
+ * pointer is not considered ASCII, an empty string is. */
+bool is_ascii(const char16_t *f) {
+        if (!f)
+                return false;
+
+        while (*f != 0) {
+                if (*f > 127)
+                        return false;
+
+                f++;
+        }
+
+        return true;
+}
+
+/* Frees a NULL-terminated array of strings: first every string, then the array itself. NULL is
+ * accepted. Always returns NULL, so that the result can be assigned back to the freed variable. */
+char16_t **strv_free(char16_t **v) {
+        if (v) {
+                char16_t **cursor = v;
+
+                while (*cursor) {
+                        free(*cursor);
+                        cursor++;
+                }
+
+                free(v);
+        }
+
+        return NULL;
+}
+
+/* Opens 'path' relative to 'root' for reading and verifies that it refers to a directory. On success
+ * the open handle is returned in *ret (the caller closes it). Returns EFI_LOAD_ERROR if the path exists
+ * but is not a directory, otherwise the error of the failing firmware call. */
+EFI_STATUS open_directory(
+                EFI_FILE *root,
+                const char16_t *path,
+                EFI_FILE **ret) {
+
+        _cleanup_(file_closep) EFI_FILE *dir = NULL;
+        _cleanup_free_ EFI_FILE_INFO *file_info = NULL;
+        EFI_STATUS err;
+
+        assert(root);
+        assert(ret); /* written unconditionally on success */
+
+        /* Opens a file, and then verifies it is actually a directory */
+
+        err = root->Open(root, &dir, (char16_t *) path, EFI_FILE_MODE_READ, 0);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        err = get_file_info(dir, &file_info, NULL);
+        if (err != EFI_SUCCESS)
+                return err;
+        if (!FLAGS_SET(file_info->Attribute, EFI_FILE_DIRECTORY))
+                return EFI_LOAD_ERROR;
+
+        /* Hand the handle over to the caller, so that the cleanup handler does not close it. */
+        *ret = TAKE_PTR(dir);
+        return EFI_SUCCESS;
+}
+
+/* Returns the value of the global "OsIndicationsSupported" EFI variable, i.e. the mask of supported OS
+ * indications. If the variable cannot be read a zeroed out mask is returned, i.e. no supported
+ * features. */
+uint64_t get_os_indications_supported(void) {
+        uint64_t supported;
+
+        if (efivar_get_uint64_le(MAKE_GUID_PTR(EFI_GLOBAL_VARIABLE), u"OsIndicationsSupported", &supported) != EFI_SUCCESS)
+                return 0;
+
+        return supported;
+}
+
+/* Debugging aid. When built with EFI_DEBUG this prints 'identity', the address of __executable_start
+ * and GIT_VERSION, and, if 'wait' is true, spins until 'wait' is cleared from the outside (e.g. by an
+ * attached debugger). Without EFI_DEBUG the function body is empty. 'wait' is volatile so that each
+ * iteration of the loop re-reads it. */
+__attribute__((noinline)) void notify_debugger(const char *identity, volatile bool wait) {
+#ifdef EFI_DEBUG
+        printf("%s@%p %s\n", identity, __executable_start, GIT_VERSION);
+        if (wait)
+                printf("Waiting for debugger to attach...\n");
+
+        /* This is a poor programmer's breakpoint to wait until a debugger
+         * has attached to us. Just "set variable wait = 0" or "return" to continue. */
+        while (wait)
+                /* Prefer asm based stalling so that gdb has a source location to present. */
+#  if defined(__i386__) || defined(__x86_64__)
+                asm volatile("pause");
+#  elif defined(__aarch64__)
+                asm volatile("wfi");
+#  else
+                /* No suitable instruction known for this architecture, stall via boot services. */
+                BS->Stall(5000);
+#  endif
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+/* Reads one byte from x86 I/O port 'port' using the 'inb' instruction. */
+static uint8_t inb(uint16_t port) {
+        uint8_t value;
+        asm volatile("inb %1, %0" : "=a"(value) : "Nd"(port));
+        return value;
+}
+
+/* Writes the byte 'value' to x86 I/O port 'port' using the 'outb' instruction. */
+static void outb(uint16_t port, uint8_t value) {
+        asm volatile("outb %0, %1" : : "a"(value), "Nd"(port));
+}
+
+/* Emits 'beep_count' beeps by programming the timer and speaker control I/O ports directly (x86 only).
+ * Each beep lasts BEEP_DURATION_USEC, with a pause of WAIT_DURATION_USEC between two beeps. The call
+ * blocks (via BS->Stall()) until all beeps are done; beep_count == 0 only programs the timer. */
+void beep(unsigned beep_count) {
+        enum {
+                PITCH                = 500,
+                BEEP_DURATION_USEC   = 100 * 1000,
+                WAIT_DURATION_USEC   = 400 * 1000,
+
+                PIT_FREQUENCY        = 0x1234dd,
+                SPEAKER_CONTROL_PORT = 0x61,
+                SPEAKER_ON_MASK      = 0x03,
+                TIMER_PORT_MAGIC     = 0xB6,
+                TIMER_CONTROL_PORT   = 0x43,
+                TIMER_CONTROL2_PORT  = 0x42,
+        };
+
+        /* Set frequency. */
+        uint32_t counter = PIT_FREQUENCY / PITCH;
+        outb(TIMER_CONTROL_PORT, TIMER_PORT_MAGIC);
+        /* The divisor is written as two separate bytes, low byte first. */
+        outb(TIMER_CONTROL2_PORT, counter & 0xFF);
+        outb(TIMER_CONTROL2_PORT, (counter >> 8) & 0xFF);
+
+        /* Read the current port value once, so that only the speaker bits are modified below. */
+        uint8_t value = inb(SPEAKER_CONTROL_PORT);
+
+        while (beep_count > 0) {
+                /* Turn speaker on. */
+                value |= SPEAKER_ON_MASK;
+                outb(SPEAKER_CONTROL_PORT, value);
+
+                BS->Stall(BEEP_DURATION_USEC);
+
+                /* Turn speaker off. */
+                value &= ~SPEAKER_ON_MASK;
+                outb(SPEAKER_CONTROL_PORT, value);
+
+                beep_count--;
+                /* No pause after the final beep. */
+                if (beep_count > 0)
+                        BS->Stall(WAIT_DURATION_USEC);
+        }
+}
+#endif
+
+/* Opens the root directory of the file system on 'device' and returns it in *ret_file (the caller
+ * closes it). Fails with the firmware's error if the handle does not carry the simple file system
+ * protocol or the volume cannot be opened; *ret_file is only written on success. */
+EFI_STATUS open_volume(EFI_HANDLE device, EFI_FILE **ret_file) {
+        assert(ret_file);
+
+        EFI_SIMPLE_FILE_SYSTEM_PROTOCOL *fs;
+        EFI_STATUS err = BS->HandleProtocol(device, MAKE_GUID_PTR(EFI_SIMPLE_FILE_SYSTEM_PROTOCOL), (void **) &fs);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        EFI_FILE *root;
+        err = fs->OpenVolume(fs, &root);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        *ret_file = root;
+        return EFI_SUCCESS;
+}
+
+/* Looks up the entry with vendor GUID 'guid' in the configuration table of the EFI system table and
+ * returns its VendorTable pointer, or NULL if there is no such entry. */
+void *find_configuration_table(const EFI_GUID *guid) {
+        size_t idx = 0;
+
+        while (idx < ST->NumberOfTableEntries) {
+                if (efi_guid_equal(&ST->ConfigurationTable[idx].VendorGuid, guid))
+                        return ST->ConfigurationTable[idx].VendorTable;
+
+                idx++;
+        }
+
+        return NULL;
+}
+
+/* Strips a boot counter suffix of the form "+N" or "+N-M" (N, M being numbers accepted by
+ * parse_number16()) from 'path' in place, provided it is followed by '.' or the end of the string.
+ * Only the first '+' in the path is considered; if the text after it does not have that form the
+ * path is left untouched. */
+static void remove_boot_count(char16_t *path) {
+        assert(path);
+
+        char16_t *plus = strchr16(path, '+');
+        if (!plus)
+                return;
+
+        /* 'rest' is advanced past each number by parse_number16(). */
+        const char16_t *rest = plus + 1;
+        uint64_t unused;
+
+        if (!parse_number16(rest, &unused, &rest))
+                return;
+
+        if (*rest == '-') {
+                rest++;
+                if (!parse_number16(rest, &unused, &rest))
+                        return;
+        }
+
+        if (*rest != '\0' && *rest != '.')
+                return;
+
+        /* Close the gap: move whatever follows the counter to where the '+' was. */
+        strcpy16(plus, rest);
+}
+
+/* Derives the name of the ".extra.d" directory belonging to the file that 'file_path' refers to: the
+ * device path is converted to a string, normalized with convert_efi_path(), stripped of a boot counter
+ * and suffixed with ".extra.d". Returns a newly allocated string, or NULL if 'file_path' is NULL,
+ * contains anything but file path nodes, or cannot be converted to text. */
+char16_t *get_extra_dir(const EFI_DEVICE_PATH *file_path) {
+        if (!file_path)
+                return NULL;
+
+        /* A device path is allowed to have more than one file path node. If that is the case they are
+         * supposed to be concatenated. Unfortunately, the device path to text protocol simply converts the
+         * nodes individually and then combines those with the usual '/' for device path nodes. But this does
+         * not create a legal EFI file path that the file protocol can use. */
+
+        /* Make sure we really only got file paths. */
+        const EFI_DEVICE_PATH *node = file_path;
+        while (!device_path_is_end(node)) {
+                if (node->Type != MEDIA_DEVICE_PATH || node->SubType != MEDIA_FILEPATH_DP)
+                        return NULL;
+
+                node = device_path_next_node(node);
+        }
+
+        _cleanup_free_ char16_t *text = NULL;
+        if (device_path_to_str(file_path, &text) != EFI_SUCCESS)
+                return NULL;
+
+        convert_efi_path(text);
+        remove_boot_count(text);
+
+        return xasprintf("%ls.extra.d", text);
+}
+
+/* Allocates 'size' bytes of EfiLoaderData pool memory via BS->AllocatePool(). The allocation is wrapped
+ * in assert_se(), i.e. failure is not reported to the caller. */
+void *xmalloc(size_t size) {
+        void *p = NULL;
+        assert_se(BS->AllocatePool(EfiLoaderData, size, &p) == EFI_SUCCESS);
+        return p;
+}
diff --git a/src/boot/efi/util.h b/src/boot/efi/util.h
new file mode 100644
index 0000000..0306e32
--- /dev/null
+++ b/src/boot/efi/util.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+#include "log.h"
+#include "proto/file-io.h"
+#include "string-util-fundamental.h"
+
+/* This is provided by the linker. */
+extern uint8_t __executable_start[];
+
+/* Releases memory via BS->FreePool(). NULL is accepted and ignored. A failing FreePool() trips an
+ * assertion in EFI_DEBUG builds and is silently ignored otherwise. */
+static inline void free(void *p) {
+        if (!p)
+                return;
+
+        /* Debugging an invalid free requires trace logging to find the call site or a debugger attached. For
+         * release builds it is not worth the bother to even warn when we cannot even print a call stack. */
+#ifdef EFI_DEBUG
+        assert_se(BS->FreePool(p) == EFI_SUCCESS);
+#else
+        (void) BS->FreePool(p);
+#endif
+}
+
+/* Cleanup handler for _cleanup_free_: 'p' is the address of a pointer variable, and the pointer stored
+ * there is passed to free(). */
+static inline void freep(void *p) {
+        free(*(void **) p);
+}
+
+/* Marks a pointer variable whose target is released with free() when the variable goes out of scope. */
+#define _cleanup_free_ _cleanup_(freep)
+
+_malloc_ _alloc_(1) _returns_nonnull_ _warn_unused_result_
+void *xmalloc(size_t size);
+
+/* Allocates room for 'n' elements of 'size' bytes each via xmalloc(). The multiplication is checked
+ * for overflow with assert_se(). */
+_malloc_ _alloc_(1, 2) _returns_nonnull_ _warn_unused_result_
+static inline void *xmalloc_multiply(size_t n, size_t size) {
+        assert_se(!__builtin_mul_overflow(size, n, &size));
+        return xmalloc(size);
+}
+
+/* Allocates a fresh buffer of 'new_size' bytes, copies the first MIN(old_size, new_size) bytes of 'p'
+ * into it and frees 'p'. Since there is no in-place resize the caller has to pass the old size.
+ *
+ * Use malloc attribute as this never returns p like userspace realloc. */
+_malloc_ _alloc_(3) _returns_nonnull_ _warn_unused_result_
+static inline void *xrealloc(void *p, size_t old_size, size_t new_size) {
+        void *t = xmalloc(new_size);
+        /* From here on new_size is the number of bytes to carry over. */
+        new_size = MIN(old_size, new_size);
+        if (new_size > 0)
+                memcpy(t, p, new_size);
+        free(p);
+        return t;
+}
+
+/* Returns a newly allocated copy of the 'l' bytes at 'p'. */
+_malloc_ _alloc_(2) _returns_nonnull_ _warn_unused_result_
+static inline void* xmemdup(const void *p, size_t l) {
+        return memcpy(xmalloc(l), p, l);
+}
+
+/* Allocates an array of 'n' elements of 'type' via xmalloc_multiply() and casts the result to 'type *'. */
+#define xnew(type, n) ((type *) xmalloc_multiply((n), sizeof(type)))
+
+/* A page allocation as returned by xmalloc_pages() and released by cleanup_pages(). */
+typedef struct {
+        EFI_PHYSICAL_ADDRESS addr; /* address of the allocation, as passed to BS->FreePages() */
+        size_t n_pages;            /* number of pages; 0 means there is nothing to free */
+} Pages;
+
+/* Cleanup handler for _cleanup_pages_: returns the pages described by 'p' via BS->FreePages(). A Pages
+ * object with n_pages == 0 is skipped. A failing FreePages() trips an assertion in EFI_DEBUG builds and
+ * is silently ignored otherwise. */
+static inline void cleanup_pages(Pages *p) {
+        if (p->n_pages == 0)
+                return;
+#ifdef EFI_DEBUG
+        assert_se(BS->FreePages(p->addr, p->n_pages) == EFI_SUCCESS);
+#else
+        (void) BS->FreePages(p->addr, p->n_pages);
+#endif
+}
+
+/* Marks a Pages variable whose pages are released when the variable goes out of scope. */
+#define _cleanup_pages_ _cleanup_(cleanup_pages)
+
+/* Allocates 'n_pages' pages of 'memory_type' via BS->AllocatePages(), passing 'type' and 'addr' through
+ * to it. The allocation is wrapped in assert_se(), i.e. failure is not reported to the caller. The
+ * address AllocatePages() stored back into 'addr' is returned in the Pages object. */
+static inline Pages xmalloc_pages(
+                EFI_ALLOCATE_TYPE type, EFI_MEMORY_TYPE memory_type, size_t n_pages, EFI_PHYSICAL_ADDRESS addr) {
+        assert_se(BS->AllocatePages(type, memory_type, n_pages, &addr) == EFI_SUCCESS);
+        return (Pages) {
+                .addr = addr,
+                .n_pages = n_pages,
+        };
+}
+
+EFI_STATUS efivar_set(const EFI_GUID *vendor, const char16_t *name, const char16_t *value, uint32_t flags);
+EFI_STATUS efivar_set_raw(const EFI_GUID *vendor, const char16_t *name, const void *buf, size_t size, uint32_t flags);
+EFI_STATUS efivar_set_uint_string(const EFI_GUID *vendor, const char16_t *name, size_t i, uint32_t flags);
+EFI_STATUS efivar_set_uint32_le(const EFI_GUID *vendor, const char16_t *NAME, uint32_t value, uint32_t flags);
+EFI_STATUS efivar_set_uint64_le(const EFI_GUID *vendor, const char16_t *name, uint64_t value, uint32_t flags);
+void efivar_set_time_usec(const EFI_GUID *vendor, const char16_t *name, uint64_t usec);
+
+EFI_STATUS efivar_unset(const EFI_GUID *vendor, const char16_t *name, uint32_t flags);
+
+EFI_STATUS efivar_get(const EFI_GUID *vendor, const char16_t *name, char16_t **ret);
+EFI_STATUS efivar_get_raw(const EFI_GUID *vendor, const char16_t *name, char **ret, size_t *ret_size);
+EFI_STATUS efivar_get_uint_string(const EFI_GUID *vendor, const char16_t *name, size_t *ret);
+EFI_STATUS efivar_get_uint32_le(const EFI_GUID *vendor, const char16_t *name, uint32_t *ret);
+EFI_STATUS efivar_get_uint64_le(const EFI_GUID *vendor, const char16_t *name, uint64_t *ret);
+EFI_STATUS efivar_get_boolean_u8(const EFI_GUID *vendor, const char16_t *name, bool *ret);
+
+void convert_efi_path(char16_t *path);
+char16_t *xstr8_to_path(const char *stra);
+char16_t *mangle_stub_cmdline(char16_t *cmdline);
+
+EFI_STATUS chunked_read(EFI_FILE *file, size_t *size, void *buf);
+EFI_STATUS file_read(EFI_FILE *dir, const char16_t *name, size_t off, size_t size, char **content, size_t *content_size);
+
+/* Cleanup handler for use with _cleanup_(): closes the EFI_FILE that '*handle' points to, unless it is
+ * NULL. The result of Close() is ignored. */
+static inline void file_closep(EFI_FILE **handle) {
+        if (!*handle)
+                return;
+
+        (*handle)->Close(*handle);
+}
+
+/* Cleanup handler for use with _cleanup_(): unloads the image '*image' via BS->UnloadImage(), unless
+ * the handle is NULL. The result is ignored. */
+static inline void unload_imagep(EFI_HANDLE *image) {
+        if (*image)
+                (void) BS->UnloadImage(*image);
+}
+
+/*
+ * Allocated random UUID, intended to be shared across tools that implement
+ * the (ESP)\loader\entries\<vendor>-<revision>.conf convention and the
+ * associated EFI variables.
+ */
+#define LOADER_GUID \
+        { 0x4a67b082, 0x0a4c, 0x41cf, { 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f } }
+
+/* printf-style format string and matching argument list for printing an EFI_GUID 'g' in its textual
+ * form, to be used together as in: printf(GUID_FORMAT_STR, GUID_FORMAT_VAL(guid)).
+ *
+ * Note that GUID is evaluated multiple times! */
+#define GUID_FORMAT_STR "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X"
+#define GUID_FORMAT_VAL(g) (g).Data1, (g).Data2, (g).Data3, (g).Data4[0], (g).Data4[1], \
+        (g).Data4[2], (g).Data4[3], (g).Data4[4], (g).Data4[5], (g).Data4[6], (g).Data4[7]
+
+void print_at(size_t x, size_t y, size_t attr, const char16_t *str);
+void clear_screen(size_t attr);
+
+typedef int (*compare_pointer_func_t)(const void *a, const void *b);
+void sort_pointer_array(void **array, size_t n_members, compare_pointer_func_t compare);
+
+EFI_STATUS get_file_info(EFI_FILE *handle, EFI_FILE_INFO **ret, size_t *ret_size);
+EFI_STATUS readdir(EFI_FILE *handle, EFI_FILE_INFO **buffer, size_t *buffer_size);
+
+bool is_ascii(const char16_t *f);
+
+char16_t **strv_free(char16_t **l);
+
+/* Cleanup handler for use with _cleanup_(): passes the string array '*p' to strv_free(). */
+static inline void strv_freep(char16_t ***p) {
+        strv_free(*p);
+}
+
+EFI_STATUS open_directory(EFI_FILE *root_dir, const char16_t *path, EFI_FILE **ret);
+
+/* Conversion between EFI_PHYSICAL_ADDRESS and pointers is not obvious. The former is always 64-bit, even on
+ * 32-bit archs. And gcc complains if we cast a pointer to an integer of a different size. Hence let's do the
+ * conversion indirectly: first into uintptr_t and then extended to EFI_PHYSICAL_ADDRESS.
+ *
+ * This direction cannot fail, as every pointer value fits into EFI_PHYSICAL_ADDRESS. */
+static inline EFI_PHYSICAL_ADDRESS POINTER_TO_PHYSICAL_ADDRESS(const void *p) {
+        return (EFI_PHYSICAL_ADDRESS) (uintptr_t) p;
+}
+
+/* Converts an EFI_PHYSICAL_ADDRESS into a pointer. The address must be representable as a pointer on
+ * this architecture, which is enforced with an assertion. */
+static inline void *PHYSICAL_ADDRESS_TO_POINTER(EFI_PHYSICAL_ADDRESS addr) {
+        /* On 32-bit systems the address might not be convertible (as pointers are 32-bit but
+         * EFI_PHYSICAL_ADDRESS 64-bit) */
+        assert(addr <= UINTPTR_MAX);
+        return (void *) (uintptr_t) addr;
+}
+
+uint64_t get_os_indications_supported(void);
+
+/* If EFI_DEBUG, print our name and version and also report the address of the image base so a debugger can
+ * be attached. See debug-sd-boot.sh for how this can be done. */
+void notify_debugger(const char *identity, bool wait);
+
+/* On x86 the compiler assumes a different incoming stack alignment than what we get.
+ * This will cause long long variables to be misaligned when building with
+ * '-mlong-double' (for correct struct layouts). Normally, the compiler realigns the
+ * stack itself on entry, but we have to do this ourselves here as the compiler does
+ * not know that this is our entry point. */
+#ifdef __i386__
+#  define _realign_stack_ __attribute__((force_align_arg_pointer))
+#else
+/* Only i386 gets the attribute; everywhere else the macro expands to nothing. */
+#  define _realign_stack_
+#endif
+
+/* Defines the global ST/BS/RT pointers and the efi_main() entry point of an EFI binary. efi_main()
+ * initializes the three globals from the system table, sets up the stack protector guard, calls
+ * notify_debugger() with 'identity' and 'wait_for_debugger', and then runs 'func' with the image
+ * handle. After log_wait() the status returned by 'func' becomes the status of the image. To be used
+ * exactly once per binary, as it defines global symbols. */
+#define DEFINE_EFI_MAIN_FUNCTION(func, identity, wait_for_debugger)                    \
+        EFI_SYSTEM_TABLE *ST;                                                          \
+        EFI_BOOT_SERVICES *BS;                                                         \
+        EFI_RUNTIME_SERVICES *RT;                                                      \
+        _realign_stack_                                                                \
+        EFIAPI EFI_STATUS efi_main(EFI_HANDLE image, EFI_SYSTEM_TABLE *system_table);  \
+        EFIAPI EFI_STATUS efi_main(EFI_HANDLE image, EFI_SYSTEM_TABLE *system_table) { \
+                ST = system_table;                                                     \
+                BS = system_table->BootServices;                                       \
+                RT = system_table->RuntimeServices;                                    \
+                __stack_chk_guard_init();                                              \
+                notify_debugger((identity), (wait_for_debugger));                      \
+                EFI_STATUS err = func(image);                                          \
+                log_wait();                                                            \
+                return err;                                                            \
+        }
+
+/* beep() is only implemented for x86; on all other architectures it is a no-op. */
+#if defined(__i386__) || defined(__x86_64__)
+void beep(unsigned beep_count);
+#else
+static inline void beep(unsigned beep_count) {}
+#endif
+
+EFI_STATUS open_volume(EFI_HANDLE device, EFI_FILE **ret_file);
+
+/* Returns true if the two GUIDs are bytewise identical. Both pointers must be valid. */
+static inline bool efi_guid_equal(const EFI_GUID *a, const EFI_GUID *b) {
+        return memcmp(a, b, sizeof(EFI_GUID)) == 0;
+}
+
+void *find_configuration_table(const EFI_GUID *guid);
+
+char16_t *get_extra_dir(const EFI_DEVICE_PATH *file_path);
diff --git a/src/boot/efi/vmm.c b/src/boot/efi/vmm.c
new file mode 100644
index 0000000..60e216d
--- /dev/null
+++ b/src/boot/efi/vmm.c
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if defined(__i386__) || defined(__x86_64__)
+#  include <cpuid.h>
+#endif
+
+#include "confidential-virt-fundamental.h"
+#include "device-path-util.h"
+#include "drivers.h"
+#include "efi-string.h"
+#include "proto/device-path.h"
+#include "string-util-fundamental.h"
+#include "util.h"
+#include "vmm.h"
+
+/* GUID of the vendor media device path node that is_direct_boot() takes as the sign of having been
+ * started via 'qemu -kernel'. */
+#define QEMU_KERNEL_LOADER_FS_MEDIA_GUID \
+        { 0x1428f772, 0xb64a, 0x441e, { 0xb8, 0xc3, 0x9e, 0xbd, 0xd7, 0xf8, 0x93, 0xc7 } }
+
+/* Vendor GUID of the VMMBootOrderNNNN EFI variables evaluated by vmm_open(). */
+#define VMM_BOOT_ORDER_GUID \
+        { 0x668f4529, 0x63d0, 0x4bb5, { 0xb6, 0x5d, 0x6f, 0xbb, 0x9d, 0x36, 0xa4, 0x4a } }
+
+/* Detects "direct boot", i.e. whether the device path of 'device' begins with a media node that
+ * indicates we were not loaded from a regular file system: either the QEMU kernel loader vendor node
+ * or a firmware volume node. Returns false if the handle has no device path at all. */
+bool is_direct_boot(EFI_HANDLE device) {
+        VENDOR_DEVICE_PATH *dp;
+
+        if (BS->HandleProtocol(device, MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL), (void **) &dp) != EFI_SUCCESS)
+                return false;
+
+        /* Both cases we are interested in are media device path nodes. */
+        if (dp->Header.Type != MEDIA_DEVICE_PATH)
+                return false;
+
+        /* loaded from firmware volume (sd-boot added to ovmf) */
+        if (dp->Header.SubType == MEDIA_PIWG_FW_VOL_DP)
+                return true;
+
+        /* 'qemu -kernel systemd-bootx64.efi' */
+        return dp->Header.SubType == MEDIA_VENDOR_DP &&
+               memcmp(&dp->Guid, MAKE_GUID_PTR(QEMU_KERNEL_LOADER_FS_MEDIA), sizeof(EFI_GUID)) == 0;
+}
+
+/*
+ * Try to find the ESP when not loaded from the ESP.
+ *
+ * Inspect all filesystems known to the firmware and try to find the ESP. In case VMMBootOrderNNNN variables
+ * are present they are used to inspect the filesystems in the specified order. When nothing was found or
+ * the variables are not present the function will do one final search pass over all filesystems.
+ *
+ * Recent OVMF builds store the qemu boot order (as specified using the bootindex property on the qemu
+ * command line) in VMMBootOrderNNNN.  The variables contain a device path.
+ *
+ * Example qemu command line:
+ *     qemu -virtio-scsi-pci,addr=14.0 -device scsi-cd,scsi-id=4,bootindex=1
+ *
+ * Resulting variable:
+ *     VMMBootOrder0000 = PciRoot(0x0)/Pci(0x14,0x0)/Scsi(0x4,0x0)
+ */
+EFI_STATUS vmm_open(EFI_HANDLE *ret_vmm_dev, EFI_FILE **ret_vmm_dir) {
+        _cleanup_free_ EFI_HANDLE *handles = NULL;
+        size_t n_handles;
+        EFI_STATUS err, dp_err;
+
+        assert(ret_vmm_dev);
+        assert(ret_vmm_dir);
+
+        /* Make sure all file systems have been initialized. Only do this in VMs as this is slow
+         * on some real firmwares. */
+        (void) reconnect_all_drivers();
+
+        /* find all file system handles */
+        err = BS->LocateHandleBuffer(
+                        ByProtocol, MAKE_GUID_PTR(EFI_SIMPLE_FILE_SYSTEM_PROTOCOL), NULL, &n_handles, &handles);
+        if (err != EFI_SUCCESS)
+                return err;
+
+        /* One pass over all file systems per VMMBootOrderNNNN variable, in ascending order. The pass in
+         * which the variable cannot be read anymore is the final one: it is not filtered by device path
+         * and ends the search. */
+        for (size_t order = 0;; order++) {
+                _cleanup_free_ EFI_DEVICE_PATH *dp = NULL;
+
+                _cleanup_free_ char16_t *order_str = xasprintf("VMMBootOrder%04zx", order);
+                dp_err = efivar_get_raw(MAKE_GUID_PTR(VMM_BOOT_ORDER), order_str, (char **) &dp, NULL);
+
+                for (size_t i = 0; i < n_handles; i++) {
+                        /* Closed automatically at the end of each iteration, unless ownership is taken
+                         * away with TAKE_PTR() below. */
+                        _cleanup_(file_closep) EFI_FILE *root_dir = NULL, *efi_dir = NULL;
+                        EFI_DEVICE_PATH *fs;
+
+                        err = BS->HandleProtocol(
+                                        handles[i], MAKE_GUID_PTR(EFI_DEVICE_PATH_PROTOCOL), (void **) &fs);
+                        if (err != EFI_SUCCESS)
+                                return err;
+
+                        /* check against VMMBootOrderNNNN (if set) */
+                        if (dp_err == EFI_SUCCESS && !device_path_startswith(fs, dp))
+                                continue;
+
+                        /* A file system that cannot be opened is skipped rather than treated as fatal. */
+                        err = open_volume(handles[i], &root_dir);
+                        if (err != EFI_SUCCESS)
+                                continue;
+
+                        /* simple ESP check */
+                        err = root_dir->Open(root_dir, &efi_dir, (char16_t*) u"\\EFI",
+                                             EFI_FILE_MODE_READ,
+                                             EFI_FILE_READ_ONLY | EFI_FILE_DIRECTORY);
+                        if (err != EFI_SUCCESS)
+                                continue;
+
+                        /* Found it: hand the handle and the open root directory over to the caller. */
+                        *ret_vmm_dev = handles[i];
+                        *ret_vmm_dir = TAKE_PTR(root_dir);
+                        return EFI_SUCCESS;
+                }
+
+                /* This was the unfiltered pass and it found nothing either, give up. */
+                if (dp_err != EFI_SUCCESS)
+                        return EFI_NOT_FOUND;
+        }
+        assert_not_reached();
+}
+
+/* Returns true if bit 31 of ECX of CPUID leaf 1 is set, which is taken as the sign of running under a
+ * hypervisor. Always returns false on non-x86 architectures or if the leaf cannot be queried. */
+static bool cpuid_in_hypervisor(void) {
+#if defined(__i386__) || defined(__x86_64__)
+        unsigned eax, ebx, ecx, edx;
+
+        /* This is a dumbed down version of src/basic/virt.c's detect_vm() that safely works in the UEFI
+         * environment. */
+
+        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0 && FLAGS_SET(ecx, 0x80000000U))
+                return true;
+#endif
+
+        return false;
+}
+
+/* GUIDs of the EFI configuration tables that find_smbios_configuration_table() looks up to locate the
+ * SMBIOS entry point ("_SM_" anchor) and the SMBIOS 3 entry point ("_SM3_" anchor) respectively. */
+#define SMBIOS_TABLE_GUID \
+        GUID_DEF(0xeb9d2d31, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
+#define SMBIOS3_TABLE_GUID \
+        GUID_DEF(0xf2fd1544, 0x9794, 0x4a2c, 0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94)
+
+/* Layout of the SMBIOS entry point structure found via SMBIOS_TABLE_GUID. It is recognized by its
+ * "_SM_" anchor and carries a 32-bit table address. Packed, as it mirrors firmware-provided data. */
+typedef struct {
+        uint8_t anchor_string[4]; /* compared against "_SM_" */
+        uint8_t entry_point_structure_checksum;
+        uint8_t entry_point_length; /* sanity-checked against sizeof() of this struct */
+        uint8_t major_version;
+        uint8_t minor_version;
+        uint16_t max_structure_size;
+        uint8_t entry_point_revision;
+        uint8_t formatted_area[5];
+        uint8_t intermediate_anchor_string[5];
+        uint8_t intermediate_checksum;
+        uint16_t table_length; /* used as the size of the structure table */
+        uint32_t table_address; /* used as the address of the structure table */
+        uint16_t number_of_smbios_structures;
+        uint8_t smbios_bcd_revision;
+} _packed_ SmbiosEntryPoint;
+
+/* Layout of the SMBIOS 3 entry point structure found via SMBIOS3_TABLE_GUID. It is recognized by its
+ * "_SM3_" anchor and carries a 64-bit table address. Packed, as it mirrors firmware-provided data. */
+typedef struct {
+        uint8_t anchor_string[5]; /* compared against "_SM3_" */
+        uint8_t entry_point_structure_checksum;
+        uint8_t entry_point_length; /* sanity-checked against sizeof() of this struct */
+        uint8_t major_version;
+        uint8_t minor_version;
+        uint8_t docrev;
+        uint8_t entry_point_revision;
+        uint8_t reserved;
+        uint32_t table_maximum_size; /* used as the size of the structure table */
+        uint64_t table_address; /* used as the address of the structure table */
+} _packed_ Smbios3EntryPoint;
+
+/* Common header every structure in the SMBIOS structure table starts with. */
+typedef struct {
+        uint8_t type; /* structure type; 127 marks the end of the table */
+        uint8_t length; /* number of bytes get_smbios_table() skips to get from the header to the strings */
+        uint8_t handle[2];
+} _packed_ SmbiosHeader;
+
+/* SMBIOS structure of type 0 (BIOS Information). Only bios_characteristics_ext is evaluated in this
+ * file, see smbios_in_hypervisor(). */
+typedef struct {
+        SmbiosHeader header;
+        uint8_t vendor;
+        uint8_t bios_version;
+        uint16_t bios_segment;
+        uint8_t bios_release_date;
+        uint8_t bios_size;
+        uint64_t bios_characteristics;
+        uint8_t bios_characteristics_ext[2]; /* bit 4 of the 2nd byte indicates virtualization */
+} _packed_ SmbiosTableType0;
+
+/* SMBIOS structure of type 11, which carries the OEM strings searched by smbios_find_oem_string(). */
+typedef struct {
+        SmbiosHeader header;
+        uint8_t count;
+        char contents[]; /* the NUL-terminated strings, closed by an empty string */
+} _packed_ SmbiosTableType11;
+
+/* Locates the SMBIOS structure table through the EFI configuration tables. The SMBIOS 3 entry point is
+ * tried first, then the older one. Returns the address of the table and stores its size as taken from
+ * the entry point in *ret_size. Returns NULL (leaving *ret_size untouched) if no valid entry point is
+ * found. */
+static const void *find_smbios_configuration_table(uint64_t *ret_size) {
+        assert(ret_size);
+
+        const Smbios3EntryPoint *ep3 = find_configuration_table(MAKE_GUID_PTR(SMBIOS3_TABLE));
+        if (ep3) {
+                bool usable = memcmp(ep3->anchor_string, "_SM3_", 5) == 0 &&
+                              ep3->entry_point_length <= sizeof(*ep3);
+
+                if (usable) {
+                        *ret_size = ep3->table_maximum_size;
+                        return PHYSICAL_ADDRESS_TO_POINTER(ep3->table_address);
+                }
+        }
+
+        /* No usable SMBIOS 3 entry point, fall back to the older one. */
+        const SmbiosEntryPoint *ep = find_configuration_table(MAKE_GUID_PTR(SMBIOS_TABLE));
+        if (ep) {
+                bool usable = memcmp(ep->anchor_string, "_SM_", 4) == 0 &&
+                              ep->entry_point_length <= sizeof(*ep);
+
+                if (usable) {
+                        *ret_size = ep->table_length;
+                        return PHYSICAL_ADDRESS_TO_POINTER(ep->table_address);
+                }
+        }
+
+        return NULL;
+}
+
+/* Walks the SMBIOS structure table and returns the first structure of the given 'type', or NULL if
+ * there is none, if the table cannot be located, or if the table turns out to be malformed. If
+ * 'ret_size_left' is non-NULL it receives the number of bytes between the start of the returned
+ * structure and the end of the table, which is guaranteed to be at least the structure's length. */
+static const SmbiosHeader *get_smbios_table(uint8_t type, uint64_t *ret_size_left) {
+        uint64_t size = 0;
+        const uint8_t *p = find_smbios_configuration_table(&size);
+        if (!p)
+                return NULL;
+
+        for (;;) {
+                if (size < sizeof(SmbiosHeader))
+                        return NULL;
+
+                const SmbiosHeader *header = (const SmbiosHeader *) p;
+
+                /* End of table. */
+                if (header->type == 127)
+                        return NULL;
+
+                /* The length covers the header itself. Anything shorter than the header, or longer
+                 * than what is left of the table, means the table is malformed. */
+                if (header->length < sizeof(SmbiosHeader) || size < header->length)
+                        return NULL;
+
+                if (header->type == type) {
+                        if (ret_size_left)
+                                *ret_size_left = size;
+                        return header; /* Yay! */
+                }
+
+                /* Skip over formatted area. */
+                size -= header->length;
+                p += header->length;
+
+                /* Skip over string table. */
+                for (;;) {
+                        const uint8_t *e = memchr(p, 0, size);
+                        if (!e)
+                                return NULL;
+
+                        if (e == p) {/* Double NUL byte means we've reached the end of the string table. */
+                                p++;
+                                size--;
+                                break;
+                        }
+
+                        size -= e + 1 - p;
+                        p = e + 1;
+                }
+        }
+}
+
+/* Consults the SMBIOS BIOS Information structure to find out whether the firmware declares that we are
+ * running inside a virtual machine. Returns false if the structure is missing or too short. */
+static bool smbios_in_hypervisor(void) {
+        /* BIOS Information is structure type 0. */
+        const SmbiosTableType0 *bios_info = (const SmbiosTableType0 *) get_smbios_table(0, NULL);
+        if (!bios_info)
+                return false;
+
+        /* Refuse structures that are too short to be accessed through SmbiosTableType0. */
+        if (bios_info->header.length < sizeof(SmbiosTableType0))
+                return false;
+
+        /* Virtualization is indicated by bit 4 of the second BIOS characteristics extension byte. */
+        return FLAGS_SET(bios_info->bios_characteristics_ext[1], 1 << 4);
+}
+
+/* Returns true if either the CPUID-based or the SMBIOS-based check reports a hypervisor. The result is
+ * determined on the first call only and served from a static cache afterwards. */
+bool in_hypervisor(void) {
+        static int cached_result = -1; /* negative: not determined yet */
+
+        if (cached_result < 0)
+                cached_result = cpuid_in_hypervisor() || smbios_in_hypervisor();
+
+        return cached_result;
+}
+
+/* Looks through the strings attached to the first SMBIOS structure of type 11 (OEM strings) for one of
+ * the form NAME=VALUE, where NAME is the specified name. Returns a pointer to VALUE, pointing directly
+ * into the SMBIOS table (no copy is made), or NULL if there is no such structure or string. */
+const char* smbios_find_oem_string(const char *name) {
+        uint64_t remaining;
+
+        assert(name);
+
+        const SmbiosTableType11 *oem = (const SmbiosTableType11 *) get_smbios_table(11, &remaining);
+        if (!oem)
+                return NULL;
+        if (oem->header.length < sizeof(SmbiosTableType11))
+                return NULL;
+
+        /* Only the bytes following the formatted area may be searched for strings. */
+        assert(remaining >= oem->header.length);
+        remaining -= oem->header.length;
+
+        const char *cursor = oem->contents;
+        const char *end = cursor + remaining;
+
+        while (cursor < end) {
+                const char *nul = memchr(cursor, 0, end - cursor);
+
+                /* An unterminated string or an empty one (i.e. a double NUL byte) ends the OEM strings. */
+                if (!nul || nul == cursor)
+                        break;
+
+                const char *rest = startswith8(cursor, name);
+                if (rest && *rest == '=')
+                        return rest + 1;
+
+                cursor = nul + 1;
+        }
+
+        return NULL;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+/* Queries the specified CPUID leaf (subleaf 0) and stores the 12 signature bytes found in EBX, ECX and
+ * EDX as NUL-terminated string in ret_sig. If 'swapped' is true the bytes are stored in the order EBX,
+ * EDX, ECX instead of EBX, ECX, EDX. Returns the EAX value reported by the CPU. */
+static uint32_t cpuid_leaf(uint32_t eax, char ret_sig[static 13], bool swapped) {
+        /* zero-init as some queries explicitly require subleaf == 0 */
+        uint32_t b = 0, c = 0, d = 0;
+
+        __cpuid_count(eax, 0, eax, b, c, d);
+
+        const uint32_t words[3] = {
+                b,
+                swapped ? d : c,
+                swapped ? c : d,
+        };
+
+        memcpy(ret_sig, words, sizeof(words));
+        ret_sig[sizeof(words)] = 0; /* \0-terminate the string to make string comparison possible */
+
+        return eax;
+}
+
+/* Reads the model specific register with the given index via the RDMSR instruction and returns its
+ * 64-bit value. */
+static uint64_t msr(uint32_t index) {
+#ifdef __x86_64__
+        /* On 64-bit the two halves come back in separate registers and need to be glued together. */
+        uint32_t lo, hi;
+        asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(index) : "memory");
+        return ((uint64_t) hi << 32) | lo;
+#else
+        /* On 32-bit the "A" constraint covers the EDX:EAX register pair in one go. */
+        uint64_t combined;
+        asm volatile ("rdmsr" : "=A"(combined) : "c"(index) : "memory");
+        return combined;
+#endif
+}
+
+/* Hyper-V specific fallback for SEV detection: returns true if the Hyper-V CPUID leaves report an
+ * isolated partition (without CPU management privilege) whose isolation type is SNP. */
+static bool detect_hyperv_sev(void) {
+        uint32_t eax, ebx, ecx, edx;
+        char vendor[13] = {};
+
+        /* The vendor leaf must advertise a sane range of functions and carry the Hyper-V signature. */
+        uint32_t max_function = cpuid_leaf(CPUID_HYPERV_VENDOR_AND_MAX_FUNCTIONS, vendor, false);
+        if (max_function < CPUID_HYPERV_MIN || max_function > CPUID_HYPERV_MAX)
+                return false;
+
+        if (memcmp(vendor, CPUID_SIG_HYPERV, sizeof(vendor)) != 0)
+                return false;
+
+        __cpuid(CPUID_HYPERV_FEATURES, eax, ebx, ecx, edx);
+
+        /* Only look at the isolation configuration if isolation is on and CPU management is off. */
+        if (!(ebx & CPUID_HYPERV_ISOLATION) || (ebx & CPUID_HYPERV_CPU_MANAGEMENT))
+                return false;
+
+        __cpuid(CPUID_HYPERV_ISOLATION_CONFIG, eax, ebx, ecx, edx);
+
+        return (ebx & CPUID_HYPERV_ISOLATION_TYPE_MASK) == CPUID_HYPERV_ISOLATION_TYPE_SNP;
+}
+
+/* Returns true if AMD SEV (in any of its SEV, SEV-ES or SEV-SNP flavours) is active according to the
+ * SEV status MSR, or if the Hyper-V specific fallback check succeeds. */
+static bool detect_sev(void) {
+        uint32_t eax, ebx, ecx, edx;
+
+        /* Is the encrypted memory capabilities leaf available at all? */
+        __cpuid(CPUID_GET_HIGHEST_FUNCTION, eax, ebx, ecx, edx);
+        if (eax < CPUID_AMD_GET_ENCRYPTED_MEMORY_CAPABILITIES)
+                return false;
+
+        __cpuid(CPUID_AMD_GET_ENCRYPTED_MEMORY_CAPABILITIES, eax, ebx, ecx, edx);
+
+        /* bit 1 == CPU supports SEV feature
+         *
+         * Note, Azure blocks this CPUID leaf from its SEV-SNP
+         * guests, so we must fallback to trying some HyperV
+         * specific CPUID checks.
+         */
+        if (!(eax & EAX_SEV))
+                return detect_hyperv_sev();
+
+        /* The CPU can do SEV, now check whether any flavour of it is actually enabled. */
+        uint64_t status = msr(MSR_AMD64_SEV);
+
+        return (status & (MSR_SEV_SNP | MSR_SEV_ES | MSR_SEV)) != 0;
+}
+
+/* Returns true if the TDX enumeration CPUID leaf exists and carries the Intel TDX signature. */
+static bool detect_tdx(void) {
+        uint32_t eax, ebx, ecx, edx;
+        char signature[13] = {};
+
+        /* Is the TDX enumeration leaf available at all? */
+        __cpuid(CPUID_GET_HIGHEST_FUNCTION, eax, ebx, ecx, edx);
+        if (eax < CPUID_INTEL_TDX_ENUMERATION)
+                return false;
+
+        cpuid_leaf(CPUID_INTEL_TDX_ENUMERATION, signature, true);
+
+        return memcmp(signature, CPUID_SIG_INTEL_TDX, sizeof(signature)) == 0;
+}
+#endif /* __i386__ || __x86_64__ */
+
+/* Returns true if we are running inside a confidential VM. On x86 this requires a hypervisor to be
+ * detected via CPUID, and then dispatches on the CPU vendor signature: SEV detection for AMD, TDX
+ * detection for Intel. On all other architectures, and for all other vendors, this returns false. */
+bool is_confidential_vm(void) {
+#if defined(__i386__) || defined(__x86_64__)
+        if (cpuid_in_hypervisor()) {
+                char vendor[13] = {};
+
+                cpuid_leaf(0, vendor, true);
+
+                if (memcmp(vendor, CPUID_SIG_AMD, sizeof(vendor)) == 0)
+                        return detect_sev();
+                if (memcmp(vendor, CPUID_SIG_INTEL, sizeof(vendor)) == 0)
+                        return detect_tdx();
+        }
+#endif /* __i386__ || __x86_64__ */
+
+        return false;
+}
diff --git a/src/boot/efi/vmm.h b/src/boot/efi/vmm.h
new file mode 100644
index 0000000..df48af3
--- /dev/null
+++ b/src/boot/efi/vmm.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efi.h"
+
+bool is_direct_boot(EFI_HANDLE device);
+EFI_STATUS vmm_open(EFI_HANDLE *ret_qemu_dev, EFI_FILE **ret_qemu_dir);
+
+/* Returns true if either the CPUID-based or the SMBIOS-based check reports that we are running under a
+ * hypervisor. The result is determined once and cached for subsequent calls. */
+bool in_hypervisor(void);
+
+/* Returns true if we are running inside a confidential VM, i.e. on x86 under a hypervisor with AMD SEV
+ * or Intel TDX detected. Always returns false on other architectures. */
+bool is_confidential_vm(void);
+
+/* Searches the SMBIOS OEM strings (structure type 11) for a string of the form NAME=VALUE and returns a
+ * pointer to VALUE, or NULL if there is none. The returned pointer refers to the SMBIOS table itself,
+ * it is not a copy. */
+const char* smbios_find_oem_string(const char *name);
diff --git a/src/boot/measure.c b/src/boot/measure.c
new file mode 100644
index 0000000..5c5071e
--- /dev/null
+++ b/src/boot/measure.c
@@ -0,0 +1,1085 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "build.h"
+#include "efi-loader.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "main-func.h"
+#include "memstream-util.h"
+#include "openssl-util.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "sha256.h"
+#include "terminal-util.h"
+#include "tpm2-pcr.h"
+#include "tpm2-util.h"
+#include "uki.h"
+#include "verbs.h"
+
+/* Tool for pre-calculating expected TPM PCR values based on measured resources. This is intended to be used
+ * to pre-calculate suitable values for PCR 11, the way sd-stub measures into it. */
+
+static char *arg_sections[_UNIFIED_SECTION_MAX] = {}; /* Input file path per UKI section (--linux=, --osrel=, ...), indexed by UnifiedSection */
+static char **arg_banks = NULL; /* Digest names of the PCR banks to calculate for (--bank=) */
+static char *arg_tpm2_device = NULL; /* --tpm2-device=, left NULL for "auto" */
+static char *arg_private_key = NULL; /* Path passed via --private-key= */
+static char *arg_public_key = NULL; /* Path passed via --public-key= */
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO|JSON_FORMAT_OFF; /* JSON output is off by default, see --json= and -j */
+static PagerFlags arg_pager_flags = 0; /* PAGER_DISABLE is set by --no-pager */
+static bool arg_current = false; /* -c/--current: use the current PCR values instead of measuring files */
+static char **arg_phase = NULL; /* Normalized boot phase expressions (--phase=) */
+static char *arg_append = NULL; /* Path passed via --append= */
+
+STATIC_DESTRUCTOR_REGISTER(arg_banks, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_private_key, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_public_key, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_phase, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_append, freep);
+
+/* Destructor for the section path array: releases every path stored in it. */
+static void free_sections(char*(*sections)[_UNIFIED_SECTION_MAX]) {
+        char **paths = *sections;
+
+        for (UnifiedSection idx = 0; idx < _UNIFIED_SECTION_MAX; idx++)
+                free(paths[idx]);
+}
+
+STATIC_DESTRUCTOR_REGISTER(arg_sections, free_sections);
+
+/* Prints the usage text of the tool to standard output. The arguments are not used. Returns 0 on
+ * success, or the result of log_oom() if the man page link cannot be generated. */
+static int help(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-measure", "1", &man_link) < 0)
+                return log_oom();
+
+        /* Positional arguments: 1=program name, 2=man page link, 3/4=underline on/off,
+         * 5/6=highlight on/off, 7=arrow glyph */
+        printf("%1$s  [OPTIONS...] COMMAND ...\n"
+               "\n%5$sPre-calculate and sign PCR hash for a unified kernel image (UKI).%6$s\n"
+               "\n%3$sCommands:%4$s\n"
+               "  status                 Show current PCR values\n"
+               "  calculate              Calculate expected PCR values\n"
+               "  sign                   Calculate and sign expected PCR values\n"
+               "\n%3$sOptions:%4$s\n"
+               "  -h --help              Show this help\n"
+               "     --version           Print version\n"
+               "     --no-pager          Do not pipe output into a pager\n"
+               "  -c --current           Use current PCR values\n"
+               "     --phase=PHASE       Specify a boot phase to sign for\n"
+               "     --bank=DIGEST       Select TPM bank (SHA1, SHA256, SHA384, SHA512)\n"
+               "     --tpm2-device=PATH  Use specified TPM2 device\n"
+               "     --private-key=KEY   Private key (PEM) to sign with\n"
+               "     --public-key=KEY    Public key (PEM) to validate against\n"
+               "     --json=MODE         Output as JSON\n"
+               "  -j                     Same as --json=pretty on tty, --json=short otherwise\n"
+               "     --append=PATH       Load specified JSON signature, and append new signature to it\n"
+               "\n%3$sUKI PE Section Options:%4$s                                         %3$sUKI PE Section%4$s\n"
+               "     --linux=PATH        Path to Linux kernel image file        %7$s .linux\n"
+               "     --osrel=PATH        Path to os-release file                %7$s .osrel\n"
+               "     --cmdline=PATH      Path to file with kernel command line  %7$s .cmdline\n"
+               "     --initrd=PATH       Path to initrd image file              %7$s .initrd\n"
+               "     --splash=PATH       Path to splash bitmap file             %7$s .splash\n"
+               "     --dtb=PATH          Path to Devicetree file                %7$s .dtb\n"
+               "     --uname=PATH        Path to 'uname -r' file                %7$s .uname\n"
+               "     --sbat=PATH         Path to SBAT file                      %7$s .sbat\n"
+               "     --pcrpkey=PATH      Path to public key for PCR signatures  %7$s .pcrpkey\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               man_link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal(),
+               special_glyph(SPECIAL_GLYPH_ARROW_RIGHT));
+
+        return 0;
+}
+
+/* Brings a phase expression into normalized form: the colon-separated words are split up, the empty
+ * ones are dropped, and the remaining ones are joined with single colons again. This removes leading,
+ * trailing and duplicate ":". Returns a newly allocated string, or NULL on allocation failure. */
+static char *normalize_phase(const char *s) {
+        _cleanup_strv_free_ char **words = strv_split(s, ":");
+
+        if (!words)
+                return NULL;
+
+        /* Drop the empty words in place, then glue the rest back together. */
+        return strv_join(strv_remove(words, ""), ":");
+}
+
+/* Parses the command line into the arg_* globals and fills in defaults for banks and phases.
+ *
+ * Returns 1 if parsing succeeded and regular operation shall continue, 0 after --help was handled, the
+ * result of the respective helper for --version, --tpm2-device=list and --json=help style early exits,
+ * and a negative errno-style value on failure. */
+static int parse_argv(int argc, char *argv[]) {
+        /* The section options are laid out in the same order as the UnifiedSection enum, so that the
+         * section index can be derived by subtracting _ARG_SECTION_FIRST from the option value. */
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                _ARG_SECTION_FIRST,
+                ARG_LINUX = _ARG_SECTION_FIRST,
+                ARG_OSREL,
+                ARG_CMDLINE,
+                ARG_INITRD,
+                ARG_SPLASH,
+                ARG_DTB,
+                ARG_UNAME,
+                ARG_SBAT,
+                _ARG_PCRSIG, /* the .pcrsig section is not input for signing, hence not actually an argument here */
+                _ARG_SECTION_LAST,
+                ARG_PCRPKEY = _ARG_SECTION_LAST,
+                ARG_BANK,
+                ARG_PRIVATE_KEY,
+                ARG_PUBLIC_KEY,
+                ARG_TPM2_DEVICE,
+                ARG_JSON,
+                ARG_PHASE,
+                ARG_APPEND,
+        };
+
+        static const struct option options[] = {
+                { "help",        no_argument,       NULL, 'h'             },
+                { "no-pager",    no_argument,       NULL, ARG_NO_PAGER    },
+                { "version",     no_argument,       NULL, ARG_VERSION     },
+                { "linux",       required_argument, NULL, ARG_LINUX       },
+                { "osrel",       required_argument, NULL, ARG_OSREL       },
+                { "cmdline",     required_argument, NULL, ARG_CMDLINE     },
+                { "initrd",      required_argument, NULL, ARG_INITRD      },
+                { "splash",      required_argument, NULL, ARG_SPLASH      },
+                { "dtb",         required_argument, NULL, ARG_DTB         },
+                { "uname",       required_argument, NULL, ARG_UNAME       },
+                { "sbat",        required_argument, NULL, ARG_SBAT        },
+                { "pcrpkey",     required_argument, NULL, ARG_PCRPKEY     },
+                { "current",     no_argument,       NULL, 'c'             },
+                { "bank",        required_argument, NULL, ARG_BANK        },
+                { "tpm2-device", required_argument, NULL, ARG_TPM2_DEVICE },
+                { "private-key", required_argument, NULL, ARG_PRIVATE_KEY },
+                { "public-key",  required_argument, NULL, ARG_PUBLIC_KEY  },
+                { "json",        required_argument, NULL, ARG_JSON        },
+                { "phase",       required_argument, NULL, ARG_PHASE       },
+                { "append",      required_argument, NULL, ARG_APPEND      },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        /* Make sure the arguments list and the section list, stays in sync */
+        assert_cc(_ARG_SECTION_FIRST + _UNIFIED_SECTION_MAX == _ARG_SECTION_LAST + 1);
+
+        while ((c = getopt_long(argc, argv, "hjc", options, NULL)) >= 0)
+                switch (c) {
+
+                case 'h':
+                        help(0, NULL, NULL);
+                        return 0;
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case _ARG_SECTION_FIRST..._ARG_SECTION_LAST: {
+                        /* One of the per-section path options: store the path in the matching slot */
+                        UnifiedSection section = c - _ARG_SECTION_FIRST;
+
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, arg_sections + section);
+                        if (r < 0)
+                                return r;
+                        break;
+                }
+
+                case 'c':
+                        arg_current = true;
+                        break;
+
+                case ARG_BANK: {
+                        const EVP_MD *implementation;
+
+                        /* Only accept digests OpenSSL knows, and store them under OpenSSL's own name for them */
+                        implementation = EVP_get_digestbyname(optarg);
+                        if (!implementation)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown bank '%s', refusing.", optarg);
+
+                        if (strv_extend(&arg_banks, EVP_MD_name(implementation)) < 0)
+                                return log_oom();
+
+                        break;
+                }
+
+                case ARG_PRIVATE_KEY:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_private_key);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_PUBLIC_KEY:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_public_key);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_DEVICE: {
+                        _cleanup_free_ char *device = NULL;
+
+                        if (streq(optarg, "list"))
+                                return tpm2_list_devices();
+
+                        /* "auto" resets the setting to NULL, anything else is taken as device specification */
+                        if (!streq(optarg, "auto")) {
+                                device = strdup(optarg);
+                                if (!device)
+                                        return log_oom();
+                        }
+
+                        free_and_replace(arg_tpm2_device, device);
+                        break;
+                }
+
+                case 'j':
+                        arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+
+                        break;
+
+                case ARG_PHASE: {
+                        char *n;
+
+                        n = normalize_phase(optarg);
+                        if (!n)
+                                return log_oom();
+
+                        r = strv_consume(&arg_phase, TAKE_PTR(n));
+                        if (r < 0)
+                                return r;
+
+                        break;
+                }
+
+                case ARG_APPEND:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_append);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (strv_isempty(arg_banks)) {
+                /* If no banks are specifically selected, pick all known banks */
+                arg_banks = strv_new("SHA1", "SHA256", "SHA384", "SHA512");
+                if (!arg_banks)
+                        return log_oom();
+        }
+
+        strv_sort(arg_banks);
+        strv_uniq(arg_banks);
+
+        /* --current reads the PCR values from the TPM, which conflicts with measuring section files */
+        if (arg_current)
+                for (UnifiedSection us = 0; us < _UNIFIED_SECTION_MAX; us++)
+                        if (arg_sections[us])
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "The --current switch cannot be used in combination with --linux= and related switches.");
+
+        if (strv_isempty(arg_phase)) {
+                /* If no phases are specifically selected, pick everything from the beginning of the initrd
+                 * to the beginning of shutdown. */
+                if (strv_extend_strv(&arg_phase,
+                                     STRV_MAKE("enter-initrd",
+                                               "enter-initrd:leave-initrd",
+                                               "enter-initrd:leave-initrd:sysinit",
+                                               "enter-initrd:leave-initrd:sysinit:ready"),
+                                     /* filter_duplicates= */ false) < 0)
+                        return log_oom();
+        } else {
+                strv_sort(arg_phase);
+                strv_uniq(arg_phase);
+        }
+
+        _cleanup_free_ char *j = NULL;
+        j = strv_join(arg_phase, ", ");
+        if (!j)
+                return log_oom();
+
+        log_debug("Measuring boot phases: %s", j);
+        return 1;
+}
+
+/* The PCR 11 state for one specific bank */
+typedef struct PcrState {
+        char *bank; /* Lower-cased digest name of the bank, owned by this structure */
+        const EVP_MD *md; /* Digest implementation used for this bank */
+        void *value; /* Current (virtual) PCR value, owned by this structure; NULL marks the end of a PcrState array */
+        size_t value_size; /* Size of the value buffer in bytes, i.e. the digest size of the bank */
+        void *saved_value; /* A copy of the original value we calculated, used by pcr_states_save()/pcr_states_restore() to come later back to */
+} PcrState;
+
+/* Cleanup handler for an array of PcrState entries: releases the buffers of every entry up to the
+ * first one without a value, then the array itself, and resets the caller's pointer to NULL. */
+static void pcr_state_free_all(PcrState **pcr_state) {
+        assert(pcr_state);
+
+        PcrState *states = *pcr_state;
+        if (!states)
+                return;
+
+        /* The array is terminated by an entry whose value is NULL. */
+        for (PcrState *s = states; s->value; s++) {
+                free(s->bank);
+                free(s->value);
+                free(s->saved_value);
+        }
+
+        *pcr_state = mfree(states);
+}
+
+/* Cleanup handler for a NULL-terminated array of digest contexts: frees every context, then the array
+ * itself, and resets the caller's pointer to NULL. */
+static void evp_md_ctx_free_all(EVP_MD_CTX **md[]) {
+        assert(md);
+
+        EVP_MD_CTX **contexts = *md;
+        if (!contexts)
+                return;
+
+        for (EVP_MD_CTX **c = contexts; *c; c++)
+                EVP_MD_CTX_free(*c);
+
+        *md = mfree(contexts);
+}
+
+/* Extends a (virtual) PCR by the given data: the stored value is replaced by the digest of the old
+ * value followed by the data, using the digest of the bank. Returns 0 on success, a negative
+ * errno-style value on failure. */
+static int pcr_state_extend(PcrState *pcr_state, const void *data, size_t sz) {
+        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = NULL;
+        unsigned digest_len;
+
+        assert(pcr_state);
+        assert(data || sz == 0);
+        assert(pcr_state->md);
+        assert(pcr_state->value);
+        assert(pcr_state->value_size > 0);
+
+        ctx = EVP_MD_CTX_new();
+        if (!ctx)
+                return log_oom();
+
+        if (EVP_DigestInit_ex(ctx, pcr_state->md, NULL) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize %s context.", pcr_state->bank);
+
+        /* Feed in the old PCR value first, and the new data right after it */
+        if (EVP_DigestUpdate(ctx, pcr_state->value, pcr_state->value_size) != 1 ||
+            EVP_DigestUpdate(ctx, data, sz) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to run digest.");
+
+        /* The resulting digest becomes the new PCR value, written over the old one */
+        if (EVP_DigestFinal_ex(ctx, pcr_state->value, &digest_len) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize hash context.");
+
+        assert(digest_len == pcr_state->value_size);
+        return 0;
+}
+
+#define BUFFER_SIZE (16U * 1024U)
+
+/* Virtually measures the components of a unified kernel image into PCR 11, for each of the n banks in
+ * pcr_states.
+ *
+ * If --current was specified the PCR values are read from the TPM's sysfs files instead. Otherwise
+ * every section file specified on the command line is hashed, and for each non-empty one first the
+ * digest of the section name (including its trailing NUL byte) and then the digest of the file
+ * contents is extended into each bank.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int measure_kernel(PcrState *pcr_states, size_t n) {
+        _cleanup_free_ void *buffer = NULL;
+        int r;
+
+        assert(n > 0);
+        assert(pcr_states);
+
+        if (arg_current) {
+                /* Shortcut things, if we should just use the current PCR value */
+
+                for (size_t i = 0; i < n; i++) {
+                        _cleanup_free_ char *p = NULL, *s = NULL;
+                        _cleanup_free_ void *v = NULL;
+                        size_t sz;
+
+                        /* NOTE(review): this always looks at tpm0, regardless of --tpm2-device= */
+                        if (asprintf(&p, "/sys/class/tpm/tpm0/pcr-%s/%i", pcr_states[i].bank, TPM2_PCR_KERNEL_BOOT) < 0)
+                                return log_oom();
+
+                        r = read_virtual_file(p, 4096, &s, NULL);
+                        if (r == -ENOENT && access("/sys/class/tpm/tpm0/", F_OK) >= 0)
+                                return log_error_errno(r, "TPM device exists, but cannot open '%s'; either the kernel is too old, or selected PCR bank is not supported: %m", p);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to read '%s': %m", p);
+
+                        r = unhexmem(strstrip(s), SIZE_MAX, &v, &sz);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to decode PCR value '%s': %m", s);
+
+                        /* The value originates from outside of this program, hence validate its size
+                         * with a proper error instead of an assertion before copying it into our
+                         * fixed-size buffer. */
+                        if (sz != pcr_states[i].value_size)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "PCR value read from '%s' has unexpected size %zu (expected %zu), refusing.",
+                                                       p, sz, pcr_states[i].value_size);
+
+                        memcpy(pcr_states[i].value, v, sz);
+                }
+
+                return 0;
+        }
+
+        buffer = malloc(BUFFER_SIZE);
+        if (!buffer)
+                return log_oom();
+
+        for (UnifiedSection c = 0; c < _UNIFIED_SECTION_MAX; c++) {
+                _cleanup_(evp_md_ctx_free_all) EVP_MD_CTX **mdctx = NULL;
+                _cleanup_close_ int fd = -EBADF;
+                uint64_t m = 0; /* number of bytes read from the section file */
+
+                if (!arg_sections[c])
+                        continue;
+
+                fd = open(arg_sections[c], O_RDONLY|O_CLOEXEC);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open '%s': %m", arg_sections[c]);
+
+                /* Allocate one message digest context per bank (NULL terminated) */
+                mdctx = new0(EVP_MD_CTX*, n + 1);
+                if (!mdctx)
+                        return log_oom();
+
+                for (size_t i = 0; i < n; i++) {
+                        mdctx[i] = EVP_MD_CTX_new();
+                        if (!mdctx[i])
+                                return log_oom();
+
+                        if (EVP_DigestInit_ex(mdctx[i], pcr_states[i].md, NULL) != 1)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to initialize data %s context.", pcr_states[i].bank);
+                }
+
+                /* Read the file once, feeding each chunk into the digest contexts of all banks */
+                for (;;) {
+                        ssize_t sz;
+
+                        sz = read(fd, buffer, BUFFER_SIZE);
+                        if (sz < 0)
+                                return log_error_errno(errno, "Failed to read '%s': %m", arg_sections[c]);
+                        if (sz == 0) /* EOF */
+                                break;
+
+                        for (size_t i = 0; i < n; i++)
+                                if (EVP_DigestUpdate(mdctx[i], buffer, sz) != 1)
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to run digest.");
+
+                        m += sz;
+                }
+
+                fd = safe_close(fd);
+
+                if (m == 0) /* We skip over empty files, the stub does so too */
+                        continue;
+
+                for (size_t i = 0; i < n; i++) {
+                        _cleanup_free_ void *data_hash = NULL;
+                        unsigned data_hash_size;
+
+                        data_hash = malloc(pcr_states[i].value_size);
+                        if (!data_hash)
+                                return log_oom();
+
+                        /* Measure name of section */
+                        if (EVP_Digest(unified_sections[c], strlen(unified_sections[c]) + 1, data_hash, &data_hash_size, pcr_states[i].md, NULL) != 1)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to hash section name with %s.", pcr_states[i].bank);
+
+                        assert(data_hash_size == (unsigned) pcr_states[i].value_size);
+
+                        r = pcr_state_extend(pcr_states + i, data_hash, data_hash_size);
+                        if (r < 0)
+                                return r;
+
+                        /* Retrieve hash of data and measure it */
+                        if (EVP_DigestFinal_ex(mdctx[i], data_hash, &data_hash_size) != 1)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize hash context.");
+
+                        assert(data_hash_size == (unsigned) pcr_states[i].value_size);
+
+                        r = pcr_state_extend(pcr_states + i, data_hash, data_hash_size);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+/* Measures a phase expression into PCR 11: the expression is split at colons, and for every non-empty
+ * word the digest of the word is extended into each of the n banks, modelling how one word per boot
+ * phase is measured during boot. Returns 0 on success, a negative errno-style value on failure. */
+static int measure_phase(PcrState *pcr_states, size_t n, const char *phase) {
+        _cleanup_strv_free_ char **words = NULL;
+        int r;
+
+        assert(pcr_states);
+        assert(n > 0);
+
+        words = strv_split(phase, ":");
+        if (!words)
+                return log_oom();
+
+        STRV_FOREACH(w, words) {
+                if (isempty(*w))
+                        continue;
+
+                const size_t len = strlen(*w);
+
+                /* Go through all banks */
+                for (PcrState *bank = pcr_states; bank < pcr_states + n; bank++) {
+                        _cleanup_free_ void *digest = NULL;
+                        int digest_size = EVP_MD_size(bank->md);
+
+                        assert(digest_size > 0);
+
+                        digest = malloc(digest_size);
+                        if (!digest)
+                                return log_oom();
+
+                        /* Hash the word (without trailing NUL byte) ... */
+                        if (EVP_Digest(*w, len, digest, NULL, bank->md, NULL) != 1)
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to hash word '%s'.", *w);
+
+                        /* ... and extend the PCR by the resulting digest */
+                        r = pcr_state_extend(bank, digest, digest_size);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+/* Allocates an array of PcrState entries, one per selected bank plus a zeroed terminating entry, with
+ * all PCR values initialized to zero. On success the array is returned in *ret and the number of banks
+ * is the return value; on failure a negative errno-style value is returned. */
+static int pcr_states_allocate(PcrState **ret) {
+        _cleanup_(pcr_state_free_all) PcrState *states = NULL;
+        size_t count = 0;
+
+        states = new0(PcrState, strv_length(arg_banks) + 1);
+        if (!states)
+                return log_oom();
+
+        STRV_FOREACH(d, arg_banks) {
+                const EVP_MD *implementation;
+                _cleanup_free_ void *zeroes = NULL;
+                _cleanup_free_ char *name = NULL;
+
+                assert_se(implementation = EVP_get_digestbyname(*d)); /* Must work, we already checked while parsing  command line */
+
+                name = strdup(EVP_MD_name(implementation));
+                if (!name)
+                        return log_oom();
+
+                int digest_size = EVP_MD_size(implementation);
+                if (digest_size <= 0 || digest_size >= INT_MAX)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected digest size: %i", digest_size);
+
+                zeroes = malloc0(digest_size); /* initial PCR state is all zeroes */
+                if (!zeroes)
+                        return log_oom();
+
+                /* Everything is allocated, hand ownership over to the next free entry. The remaining
+                 * fields stay zero, since the array was allocated zero-initialized. */
+                PcrState *slot = states + count++;
+                slot->bank = ascii_strlower(TAKE_PTR(name));
+                slot->md = implementation;
+                slot->value = TAKE_PTR(zeroes);
+                slot->value_size = digest_size;
+        }
+
+        *ret = TAKE_PTR(states);
+        return (int) count;
+}
+
+/* Stores a copy of the current value of each of the n PCR states in its saved_value field, replacing
+ * any copy saved earlier. Entries without a value are skipped. Returns 0 on success, or the result of
+ * log_oom() if a copy cannot be allocated. */
+static int pcr_states_save(PcrState *pcr_states, size_t n) {
+        assert(pcr_states);
+        assert(n > 0);
+
+        for (PcrState *s = pcr_states; s < pcr_states + n; s++) {
+                if (!s->value)
+                        continue;
+
+                void *copy = memdup(s->value, s->value_size);
+                if (!copy)
+                        return log_oom();
+
+                /* Drop the previous snapshot, if any, in favour of the new one */
+                free(s->saved_value);
+                s->saved_value = copy;
+        }
+
+        return 0;
+}
+
+/* Resets the value of each of the n PCR states to the copy taken by pcr_states_save() before. Every
+ * entry must have both a value and a saved copy. */
+static void pcr_states_restore(PcrState *pcr_states, size_t n) {
+        assert(pcr_states);
+        assert(n > 0);
+
+        for (PcrState *s = pcr_states; s < pcr_states + n; s++) {
+                assert(s->value);
+                assert(s->saved_value);
+
+                memcpy(s->value, s->saved_value, s->value_size);
+        }
+}
+
+/* Handler for the 'calculate' verb: calls measure_kernel() once, saves that state, then for every phase in
+ * arg_phase calls measure_phase(), emits each bank's value (text or JSON) and restores the saved state. */
+static int verb_calculate(int argc, char *argv[], void *userdata) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+        _cleanup_(pcr_state_free_all) PcrState *pcr_states = NULL;
+        int r;
+
+        if (!arg_sections[UNIFIED_SECTION_LINUX] && !arg_current)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Either --linux= or --current must be specified, refusing.");
+        if (arg_append)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "The --append= switch is only supported for 'sign', not 'calculate'.");
+
+        assert(!strv_isempty(arg_banks));
+        assert(!strv_isempty(arg_phase));
+
+        r = pcr_states_allocate(&pcr_states);
+        if (r < 0)
+                return r;
+
+        size_t n = r;
+
+        r = measure_kernel(pcr_states, n);
+        if (r < 0)
+                return r;
+
+        /* Save the current state, so that we later can restore to it. This way we can measure the PCR values
+         * for multiple different boot phases without having to start from zero each time */
+        r = pcr_states_save(pcr_states, n);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(phase, arg_phase) {
+                r = measure_phase(pcr_states, n, *phase);
+                if (r < 0)
+                        return r;
+
+                for (size_t i = 0; i < n; i++) {
+                        if (arg_json_format_flags & JSON_FORMAT_OFF) {
+                                _cleanup_free_ char *hd = NULL;
+
+                                if (i == 0) { /* once per phase: the header goes to stderr, the values to stdout */
+                                        fflush(stdout);
+                                        fprintf(stderr, "%s# PCR[%i] Phase <%s>%s\n",
+                                                ansi_grey(),
+                                                TPM2_PCR_KERNEL_BOOT,
+                                                isempty(*phase) ? ":" : *phase,
+                                                ansi_normal());
+                                        fflush(stderr);
+                                }
+
+                                hd = hexmem(pcr_states[i].value, pcr_states[i].value_size);
+                                if (!hd)
+                                        return log_oom();
+
+                                printf("%i:%s=%s\n", TPM2_PCR_KERNEL_BOOT, pcr_states[i].bank, hd);
+                        } else {
+                                _cleanup_(json_variant_unrefp) JsonVariant *array = NULL;
+                                /* Look up what was collected for this bank so far, append this phase, store it back */
+                                array = json_variant_ref(json_variant_by_key(w, pcr_states[i].bank));
+
+                                r = json_variant_append_arrayb(
+                                                &array,
+                                                JSON_BUILD_OBJECT(
+                                                                JSON_BUILD_PAIR_CONDITION(!isempty(*phase), "phase", JSON_BUILD_STRING(*phase)),
+                                                                JSON_BUILD_PAIR("pcr", JSON_BUILD_INTEGER(TPM2_PCR_KERNEL_BOOT)),
+                                                                JSON_BUILD_PAIR("hash", JSON_BUILD_HEX(pcr_states[i].value, pcr_states[i].value_size))));
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to append JSON object to array: %m");
+
+                                r = json_variant_set_field(&w, pcr_states[i].bank, array);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to add bank info to object: %m");
+                        }
+                }
+
+                /* Return to the original kernel measurement for the next phase calculation */
+                pcr_states_restore(pcr_states, n);
+        }
+
+        if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                if (arg_json_format_flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO))
+                        pager_open(arg_pager_flags);
+
+                json_variant_dump(w, arg_json_format_flags, stdout, NULL);
+        }
+
+        return 0;
+}
+
+/* Handler for the 'sign' verb: reads the private key (and reads or derives the public key), then for every
+ * phase and bank signs the PolicyPCR digest of the calculated value and dumps the collected JSON object. */
+static int verb_sign(int argc, char *argv[], void *userdata) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(pcr_state_free_all) PcrState *pcr_states = NULL;
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *privkey = NULL, *pubkey = NULL;
+        _cleanup_fclose_ FILE *privkeyf = NULL;
+        size_t n;
+        int r;
+
+        if (!arg_sections[UNIFIED_SECTION_LINUX] && !arg_current)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Either --linux= or --current must be specified, refusing.");
+
+        if (!arg_private_key)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "No private key specified, use --private-key=.");
+
+        assert(!strv_isempty(arg_banks));
+        assert(!strv_isempty(arg_phase));
+
+        if (arg_append) {
+                r = json_parse_file(NULL, arg_append, 0, &v, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse '%s': %m", arg_append);
+
+                if (!json_variant_is_object(v))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "File '%s' is not a valid JSON object, refusing.", arg_append);
+        }
+
+        /* When signing we only support JSON output */
+        arg_json_format_flags &= ~JSON_FORMAT_OFF;
+
+        privkeyf = fopen(arg_private_key, "re");
+        if (!privkeyf)
+                return log_error_errno(errno, "Failed to open private key file '%s': %m", arg_private_key);
+
+        privkey = PEM_read_PrivateKey(privkeyf, NULL, NULL, NULL);
+        if (!privkey)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse private key '%s'.", arg_private_key);
+
+        if (arg_public_key) {
+                _cleanup_fclose_ FILE *pubkeyf = NULL;
+
+                pubkeyf = fopen(arg_public_key, "re");
+                if (!pubkeyf)
+                        return log_error_errno(errno, "Failed to open public key file '%s': %m", arg_public_key);
+
+                pubkey = PEM_read_PUBKEY(pubkeyf, NULL, NULL, NULL);
+                if (!pubkey)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse public key '%s'.", arg_public_key);
+        } else {
+                _cleanup_(memstream_done) MemStream m = {};
+                FILE *tf;
+
+                /* No public key was specified, let's derive it automatically, if we can */
+                tf = memstream_init(&m);
+                if (!tf)
+                        return log_oom();
+
+                if (i2d_PUBKEY_fp(tf, privkey) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Failed to extract public key from private key file '%s'.", arg_private_key);
+
+                fflush(tf);
+                rewind(tf);
+
+                if (!d2i_PUBKEY_fp(tf, &pubkey))
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Failed to parse extracted public key of private key file '%s'.", arg_private_key);
+        }
+
+        r = pcr_states_allocate(&pcr_states);
+        if (r < 0)
+                return r;
+
+        n = (size_t) r;
+
+        r = measure_kernel(pcr_states, n);
+        if (r < 0)
+                return r;
+
+        r = pcr_states_save(pcr_states, n);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(phase, arg_phase) {
+                r = measure_phase(pcr_states, n, *phase);
+                if (r < 0)
+                        return r;
+
+                for (size_t i = 0; i < n; i++) {
+                        PcrState *p = pcr_states + i;
+
+                        int tpmalg = tpm2_hash_alg_from_string(EVP_MD_name(p->md));
+                        if (tpmalg < 0)
+                                return log_error_errno(tpmalg, "Unsupported PCR bank");
+
+                        Tpm2PCRValue pcr_value = TPM2_PCR_VALUE_MAKE(TPM2_PCR_KERNEL_BOOT,
+                                                                     tpmalg,
+                                                                     TPM2B_DIGEST_MAKE(p->value, p->value_size));
+
+                        TPM2B_DIGEST pcr_policy_digest = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE);
+
+                        r = tpm2_calculate_policy_pcr(&pcr_value, 1, &pcr_policy_digest);
+                        if (r < 0)
+                                return log_error_errno(r, "Could not calculate PolicyPCR digest: %m");
+
+                        _cleanup_free_ void *sig = NULL;
+                        size_t ss;
+
+                        r = digest_and_sign(p->md, privkey, pcr_policy_digest.buffer, pcr_policy_digest.size, &sig, &ss);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to sign PCR policy: %m");
+
+                        _cleanup_free_ void *pubkey_fp = NULL;
+                        size_t pubkey_fp_size = 0;
+                        r = pubkey_fingerprint(pubkey, EVP_sha256(), &pubkey_fp, &pubkey_fp_size);
+                        if (r < 0)
+                                return r;
+
+                        _cleanup_(json_variant_unrefp) JsonVariant *a = NULL;
+                        r = tpm2_make_pcr_json_array(UINT64_C(1) << TPM2_PCR_KERNEL_BOOT, &a);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to build JSON PCR mask array: %m");
+
+                        _cleanup_(json_variant_unrefp) JsonVariant *bv = NULL;
+                        r = json_build(&bv, JSON_BUILD_OBJECT(
+                                                       JSON_BUILD_PAIR("pcrs", JSON_BUILD_VARIANT(a)),                                             /* PCR mask */
+                                                       JSON_BUILD_PAIR("pkfp", JSON_BUILD_HEX(pubkey_fp, pubkey_fp_size)),                         /* SHA256 fingerprint of public key (DER) used for the signature */
+                                                       JSON_BUILD_PAIR("pol", JSON_BUILD_HEX(pcr_policy_digest.buffer, pcr_policy_digest.size)),   /* TPM2 policy hash that is signed */
+                                                       JSON_BUILD_PAIR("sig", JSON_BUILD_BASE64(sig, ss))));                                       /* signature data */
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to build JSON object: %m");
+                        /* Extend whatever array 'v' already holds for this bank (it may stem from the --append= file) */
+                        _cleanup_(json_variant_unrefp) JsonVariant *av = NULL;
+                        av = json_variant_ref(json_variant_by_key(v, p->bank));
+
+                        r = json_variant_append_array_nodup(&av, bv);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to append JSON object: %m");
+
+                        r = json_variant_set_field(&v, p->bank, av);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to add JSON field: %m");
+                }
+
+                /* Return to the original kernel measurement for the next phase calculation */
+                pcr_states_restore(pcr_states, n);
+        }
+
+        if (arg_json_format_flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO))
+                pager_open(arg_pager_flags);
+
+        json_variant_dump(v, arg_json_format_flags, stdout, NULL);
+
+        return 0;
+}
+
+/* Warns when the PCR index stored in EFI variable 'varname' differs from 'pcr'; a missing variable is fine. */
+static int compare_reported_pcr_nr(uint32_t pcr, const char *varname, const char *description) {
+        _cleanup_free_ char *text = NULL;
+        uint32_t reported;
+        int r;
+
+        r = efi_get_variable_string(varname, &text);
+        if (r < 0)
+                return r == -ENOENT ? 0 :
+                        log_error_errno(r, "Failed to read EFI variable '%s': %m", varname);
+
+        r = safe_atou32(text, &reported);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse EFI variable '%s': %s", varname, text);
+
+        if (reported != pcr)
+                log_warning("PCR number reported by stub for %s (%" PRIu32 ") different from our expectation (%" PRIu32 ").\n"
+                            "The measurements are likely inconsistent.", description, reported, pcr);
+
+        return 0;
+}
+
+/* Sanity checks run by verb_status(): requires full TPM2 support, warns if the stub lacks the three-PCR
+ * feature or reports unexpected PCR numbers, and fails unless a sysfs directory exists for one of arg_banks. */
+static int validate_stub(void) {
+        uint64_t features;
+        bool found = false;
+        int r;
+
+        if (tpm2_support() != TPM2_SUPPORT_FULL)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Sorry, system lacks full TPM2 support.");
+
+        r = efi_stub_get_features(&features);
+        if (r < 0)
+                return log_error_errno(r, "Unable to get stub features: %m");
+
+        if (!FLAGS_SET(features, EFI_STUB_FEATURE_THREE_PCRS))
+                log_warning("Warning: current kernel image does not support measuring itself, the command line or initrd system extension images.\n"
+                            "The PCR measurements seen are unlikely to be valid.");
+
+        r = compare_reported_pcr_nr(TPM2_PCR_KERNEL_BOOT, EFI_LOADER_VARIABLE(StubPcrKernelImage), "kernel image");
+        if (r < 0)
+                return r;
+        r = compare_reported_pcr_nr(TPM2_PCR_KERNEL_CONFIG, EFI_LOADER_VARIABLE(StubPcrKernelParameters), "kernel parameters");
+        if (r < 0)
+                return r;
+        r = compare_reported_pcr_nr(TPM2_PCR_SYSEXTS, EFI_LOADER_VARIABLE(StubPcrInitRDSysExts), "initrd system extension images");
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(bank, arg_banks) {
+                _cleanup_free_ char *b = NULL, *p = NULL;
+
+                b = strdup(*bank);
+                if (!b)
+                        return log_oom();
+                /* The sysfs directory is named after the lower-cased bank, so lower-case our private copy */
+                if (asprintf(&p, "/sys/class/tpm/tpm0/pcr-%s/", ascii_strlower(b)) < 0)
+                        return log_oom();
+
+                if (access(p, F_OK) < 0) {
+                        if (errno != ENOENT) /* report the path that was actually probed, not just the bank name */
+                                return log_error_errno(errno, "Failed to detect if '%s' exists: %m", p);
+                } else
+                        found = true;
+        }
+
+        if (!found)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "None of the select PCR banks appear to exist.");
+
+        return 0;
+}
+
+/* Handler for the 'status' verb: once validate_stub() succeeds, reads the value of each relevant PCR for
+ * every bank in arg_banks from sysfs and prints it as text, or collects it into a JSON object keyed by bank. */
+static int verb_status(int argc, char *argv[], void *userdata) {
+        static const uint32_t relevant_pcrs[] = {
+                TPM2_PCR_KERNEL_BOOT,
+                TPM2_PCR_KERNEL_CONFIG,
+                TPM2_PCR_SYSEXTS,
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        int r;
+
+        r = validate_stub();
+        if (r < 0)
+                return r;
+
+        for (size_t i = 0; i < ELEMENTSOF(relevant_pcrs); i++) {
+                STRV_FOREACH(bank, arg_banks) {
+                        _cleanup_free_ char *b = NULL, *p = NULL, *s = NULL;
+                        _cleanup_free_ void *h = NULL;
+                        size_t l;
+
+                        b = strdup(*bank);
+                        if (!b)
+                                return log_oom();
+
+                        if (asprintf(&p, "/sys/class/tpm/tpm0/pcr-%s/%" PRIu32, ascii_strlower(b), relevant_pcrs[i]) < 0)
+                                return log_oom();
+
+                        r = read_virtual_file(p, 4096, &s, NULL);
+                        if (r == -ENOENT)
+                                continue; /* no sysfs file for this bank/PCR combination: nothing to report */
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to read '%s': %m", p);
+
+                        r = unhexmem(strstrip(s), SIZE_MAX, &h, &l);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to decode PCR value '%s': %m", s);
+
+                        if (arg_json_format_flags & JSON_FORMAT_OFF) {
+                                _cleanup_free_ char *f = NULL;
+
+                                f = hexmem(h, l);
+                                if (!f) /* check the string just allocated; 'h' is already known to be valid here */
+                                        return log_oom();
+
+                                if (bank == arg_banks) {
+                                        /* before the first line for each PCR, write a short descriptive text to
+                                         * stderr, and leave the primary content on stdout */
+                                        fflush(stdout);
+                                        fprintf(stderr, "%s# PCR[%" PRIu32 "] %s%s%s\n",
+                                                ansi_grey(),
+                                                relevant_pcrs[i],
+                                                tpm2_pcr_index_to_string(relevant_pcrs[i]),
+                                                memeqzero(h, l) ? " (NOT SET!)" : "",
+                                                ansi_normal());
+                                        fflush(stderr);
+                                }
+
+                                printf("%" PRIu32 ":%s=%s\n", relevant_pcrs[i], b, f);
+                        } else {
+                                _cleanup_(json_variant_unrefp) JsonVariant *bv = NULL, *a = NULL;
+
+                                r = json_build(&bv,
+                                               JSON_BUILD_OBJECT(
+                                                               JSON_BUILD_PAIR("pcr", JSON_BUILD_INTEGER(relevant_pcrs[i])),
+                                                               JSON_BUILD_PAIR("hash", JSON_BUILD_HEX(h, l))
+                                               )
+                                );
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to build JSON object: %m");
+
+                                a = json_variant_ref(json_variant_by_key(v, b));
+
+                                r = json_variant_append_array(&a, bv);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to append PCR entry to JSON array: %m");
+
+                                r = json_variant_set_field(&v, b, a);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to add bank info to object: %m");
+                        }
+                }
+        }
+
+        if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                if (arg_json_format_flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO))
+                        pager_open(arg_pager_flags);
+
+                json_variant_dump(v, arg_json_format_flags, stdout, NULL);
+        }
+
+        return 0;
+}
+
+/* Hands the command line to dispatch_verb() together with the verb table; "status" carries VERB_DEFAULT. */
+static int measure_main(int argc, char *argv[]) {
+        static const Verb dispatch_table[] = {
+                { "help",      VERB_ANY, VERB_ANY, 0,            help           },
+                { "status",    VERB_ANY, 1,        VERB_DEFAULT, verb_status    },
+                { "calculate", VERB_ANY, 1,        0,            verb_calculate },
+                { "sign",      VERB_ANY, 1,        0,            verb_sign      },
+                {}
+        };
+        return dispatch_verb(argc, argv, dispatch_table, NULL);
+}
+
+/* Sets up logging and parses the command line; verbs are dispatched only if parse_argv() returns > 0. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        log_show_color(true);
+        log_parse_environment();
+        log_open();
+
+        r = parse_argv(argc, argv);
+        if (r > 0)
+                r = measure_main(argc, argv);
+        return r;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/boot/meson.build b/src/boot/meson.build
new file mode 100644
index 0000000..55b9bd6
--- /dev/null
+++ b/src/boot/meson.build
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+bootctl_sources = files(  # used further down as the 'sources' of the 'bootctl' executable
+        'bootctl-install.c',
+        'bootctl-random-seed.c',
+        'bootctl-reboot-to-firmware.c',
+        'bootctl-set-efivar.c',
+        'bootctl-status.c',
+        'bootctl-systemd-efi-options.c',
+        'bootctl-uki.c',
+        'bootctl-util.c',
+        'bootctl.c',
+)
+
+if get_option('link-boot-shared')  # the option picks libshared vs. the two static libraries below
+        boot_link_with = [libshared]
+else
+        boot_link_with = [
+                libshared_static,
+                libsystemd_static,
+        ]
+endif
+
+executables += [  # appends five entries: bootctl, bless-boot, its generator, measure, boot-check-no-failures
+        executable_template + {
+                'name' : 'bootctl',
+                'public' : true,
+                'conditions' : [
+                          'HAVE_BLKID',
+                ],
+                'sources' : bootctl_sources,
+                'link_with' : boot_link_with,  # set by the 'link-boot-shared' option check above
+                'dependencies' : libblkid,
+        },
+        libexec_template + {
+                'name' : 'systemd-bless-boot',
+                'public' : true,
+                'conditions' : [
+                        'HAVE_BLKID',
+                        'ENABLE_BOOTLOADER',
+                ],
+                'sources' : files('bless-boot.c'),
+                'link_with' : boot_link_with,
+                'dependencies' : libblkid,
+        },
+        generator_template + {
+                'name' : 'systemd-bless-boot-generator',
+                'conditions' : [
+                        'HAVE_BLKID',
+                        'ENABLE_BOOTLOADER',
+                ],
+                'sources' : files('bless-boot-generator.c'),
+                'link_with' : boot_link_with,
+        },
+        libexec_template + {
+                'name' : 'systemd-measure',
+                'conditions' : [
+                        'HAVE_BLKID',
+                        'HAVE_OPENSSL',
+                        'HAVE_TPM2',
+                ],
+                'sources' : files('measure.c'),
+                'dependencies' : libopenssl,
+        },
+        libexec_template + {  # no 'conditions', 'link_with' or 'dependencies' keys are set for this one
+                'name' : 'systemd-boot-check-no-failures',
+                'sources' : files('boot-check-no-failures.c'),
+        },
+]
diff --git a/src/busctl/busctl-introspect.c b/src/busctl/busctl-introspect.c
new file mode 100644
index 0000000..3da4a13
--- /dev/null
+++ b/src/busctl/busctl-introspect.c
@@ -0,0 +1,715 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "busctl-introspect.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "xml.h"
+
+#define NODE_DEPTH_MAX 16
+
+typedef struct Context {
+        const XMLIntrospectOps *ops;    /* callback table; parse_xml_node() calls on_path/on_interface through it */
+        void *userdata;                 /* handed to those callbacks as their last argument */
+
+        char *interface_name;           /* heap string, released by context_reset_interface() */
+        uint64_t interface_flags;       /* filled in by parse_xml_annotation() for the interface's annotations */
+
+        char *member_name;              /* the member_* fields below are cleared together by context_reset_member() */
+        char *member_signature;
+        char *member_result;
+        uint64_t member_flags;
+        bool member_writable;
+
+        const char *current;            /* input position, passed by address to xml_tokenize() */
+        void *xml_state;                /* tokenizer state, passed by address to xml_tokenize() */
+} Context;
+
+/* Frees the strings describing the member being parsed and clears its flags and writable marker. */
+static void context_reset_member(Context *c) {
+        c->member_name = mfree(c->member_name);
+        c->member_signature = mfree(c->member_signature);
+        c->member_result = mfree(c->member_result);
+
+        c->member_writable = false;
+        c->member_flags = 0;
+}
+
+/* Drops the interface being parsed, including the state of any member that belongs to it. */
+static void context_reset_interface(Context *c) {
+        context_reset_member(c);
+        c->interface_flags = 0;
+        c->interface_name = mfree(c->interface_name);
+}
+
+/* Consumes the tokens of one <annotation> element up to its closing tag. If 'flags' is non-NULL, the
+ * recognized D-Bus annotations (Deprecated, Method.NoReply, Property.EmitsChangedSignal) update *flags. */
+static int parse_xml_annotation(Context *context, uint64_t *flags) {
+        enum {
+                STATE_ANNOTATION,
+                STATE_NAME,
+                STATE_VALUE
+        } state = STATE_ANNOTATION;
+
+        _cleanup_free_ char *field = NULL, *value = NULL;
+
+        assert(context);
+
+        for (;;) {
+                _cleanup_free_ char *name = NULL;
+                int t;
+
+                t = xml_tokenize(&context->current, &name, &context->xml_state, NULL);
+                if (t < 0) {
+                        log_error("XML parse error.");
+                        return t;
+                }
+
+                if (t == XML_END)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                               "Premature end of XML data.");
+
+                switch (state) {
+
+                case STATE_ANNOTATION:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_NAME;
+
+                                else if (streq_ptr(name, "value"))
+                                        state = STATE_VALUE;
+
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected <annotation> attribute %s.",
+                                                               name);
+
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "annotation"))) {
+
+                                if (flags) {
+                                        if (streq_ptr(field, "org.freedesktop.DBus.Deprecated")) {
+
+                                                if (streq_ptr(value, "true"))
+                                                        *flags |= SD_BUS_VTABLE_DEPRECATED;
+
+                                        } else if (streq_ptr(field, "org.freedesktop.DBus.Method.NoReply")) {
+
+                                                if (streq_ptr(value, "true"))
+                                                        *flags |= SD_BUS_VTABLE_METHOD_NO_REPLY;
+
+                                        } else if (streq_ptr(field, "org.freedesktop.DBus.Property.EmitsChangedSignal")) {
+
+                                                if (streq_ptr(value, "const"))
+                                                        *flags = (*flags & ~(SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION|SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE)) | SD_BUS_VTABLE_PROPERTY_CONST;
+                                                else if (streq_ptr(value, "invalidates"))
+                                                        *flags = (*flags & ~(SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE|SD_BUS_VTABLE_PROPERTY_CONST)) | SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION;
+                                                else if (streq_ptr(value, "false"))
+                                                        *flags = *flags & ~(SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE|SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION);
+                                        }
+                                }
+
+                                return 0;
+
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in <annotation>. (1)");
+
+                        break;
+
+                case STATE_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                free_and_replace(field, name);
+
+                                state = STATE_ANNOTATION;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in <annotation>. (2)");
+
+                        break;
+
+                case STATE_VALUE:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                free_and_replace(value, name);
+
+                                state = STATE_ANNOTATION;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in <annotation>. (3)");
+
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+}
+
+static int parse_xml_node(Context *context, const char *prefix, unsigned n_depth) {
+
+        enum {
+                STATE_NODE,
+                STATE_NODE_NAME,
+                STATE_INTERFACE,
+                STATE_INTERFACE_NAME,
+                STATE_METHOD,
+                STATE_METHOD_NAME,
+                STATE_METHOD_ARG,
+                STATE_METHOD_ARG_NAME,
+                STATE_METHOD_ARG_TYPE,
+                STATE_METHOD_ARG_DIRECTION,
+                STATE_SIGNAL,
+                STATE_SIGNAL_NAME,
+                STATE_SIGNAL_ARG,
+                STATE_SIGNAL_ARG_NAME,
+                STATE_SIGNAL_ARG_TYPE,
+                STATE_SIGNAL_ARG_DIRECTION,
+                STATE_PROPERTY,
+                STATE_PROPERTY_NAME,
+                STATE_PROPERTY_TYPE,
+                STATE_PROPERTY_ACCESS,
+        } state = STATE_NODE;
+
+        _cleanup_free_ char *node_path = NULL, *argument_type = NULL, *argument_direction = NULL;
+        const char *np = ASSERT_PTR(prefix);
+        int r;
+
+        assert(context);
+
+        if (n_depth > NODE_DEPTH_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), " depth too high.");
+
+        for (;;) {
+                _cleanup_free_ char *name = NULL;
+                int t;
+
+                t = xml_tokenize(&context->current, &name, &context->xml_state, NULL);
+                if (t < 0) {
+                        log_error("XML parse error.");
+                        return t;
+                }
+
+                if (t == XML_END)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Premature end of XML data.");
+
+                switch (state) {
+
+                case STATE_NODE:
+                        if (t == XML_ATTRIBUTE_NAME) {
+
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_NODE_NAME;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected  attribute %s.", name);
+
+                        } else if (t == XML_TAG_OPEN) {
+
+                                if (streq_ptr(name, "interface"))
+                                        state = STATE_INTERFACE;
+                                else if (streq_ptr(name, "node")) {
+
+                                        r = parse_xml_node(context, np, n_depth+1);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected  tag %s.", name);
+
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "node"))) {
+
+                                if (context->ops->on_path) {
+                                        r = context->ops->on_path(node_path ?: np, context->userdata);
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                return 0;
+
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (1)");
+
+                        break;
+
+                case STATE_NODE_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+
+                                free(node_path);
+
+                                if (name[0] == '/')
+                                        node_path = TAKE_PTR(name);
+                                else {
+                                        node_path = path_join(prefix, name);
+                                        if (!node_path)
+                                                return log_oom();
+                                }
+
+                                np = node_path;
+                                state = STATE_NODE;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (2)");
+
+                        break;
+
+                case STATE_INTERFACE:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_INTERFACE_NAME;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected  attribute %s.",
+                                                               name);
+
+                        } else if (t == XML_TAG_OPEN) {
+                                if (streq_ptr(name, "method"))
+                                        state = STATE_METHOD;
+                                else if (streq_ptr(name, "signal"))
+                                        state = STATE_SIGNAL;
+                                else if (streq_ptr(name, "property")) {
+                                        context->member_flags |= SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE;
+                                        state = STATE_PROPERTY;
+                                } else if (streq_ptr(name, "annotation")) {
+                                        r = parse_xml_annotation(context, &context->interface_flags);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unexpected  tag %s.", name);
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "interface"))) {
+
+                                if (n_depth == 0) {
+                                        if (context->ops->on_interface) {
+                                                r = context->ops->on_interface(context->interface_name, context->interface_flags, context->userdata);
+                                                if (r < 0)
+                                                        return r;
+                                        }
+
+                                        context_reset_interface(context);
+                                }
+
+                                state = STATE_NODE;
+
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (1)");
+
+                        break;
+
+                case STATE_INTERFACE_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                if (n_depth == 0)
+                                        free_and_replace(context->interface_name, name);
+
+                                state = STATE_INTERFACE;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (2)");
+
+                        break;
+
+                case STATE_METHOD:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_METHOD_NAME;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected  attribute %s",
+                                                               name);
+                        } else if (t == XML_TAG_OPEN) {
+                                if (streq_ptr(name, "arg"))
+                                        state = STATE_METHOD_ARG;
+                                else if (streq_ptr(name, "annotation")) {
+                                        r = parse_xml_annotation(context, &context->member_flags);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unexpected  tag %s.",
+                                                               name);
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "method"))) {
+
+                                if (n_depth == 0) {
+                                        if (context->ops->on_method) {
+                                                r = context->ops->on_method(context->interface_name, context->member_name, context->member_signature, context->member_result, context->member_flags, context->userdata);
+                                                if (r < 0)
+                                                        return r;
+                                        }
+
+                                        context_reset_member(context);
+                                }
+
+                                state = STATE_INTERFACE;
+
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in  (1).");
+
+                        break;
+
+                case STATE_METHOD_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                if (n_depth == 0)
+                                        free_and_replace(context->member_name, name);
+
+                                state = STATE_METHOD;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in  (2).");
+
+                        break;
+
+                case STATE_METHOD_ARG:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_METHOD_ARG_NAME;
+                                else if (streq_ptr(name, "type"))
+                                        state = STATE_METHOD_ARG_TYPE;
+                                else if (streq_ptr(name, "direction"))
+                                        state = STATE_METHOD_ARG_DIRECTION;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected method  attribute %s.",
+                                                               name);
+                        } else if (t == XML_TAG_OPEN) {
+                                if (streq_ptr(name, "annotation")) {
+                                        r = parse_xml_annotation(context, NULL);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unexpected method  tag %s.",
+                                                               name);
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "arg"))) {
+
+                                if (n_depth == 0) {
+
+                                        if (argument_type) {
+                                                if (!argument_direction || streq(argument_direction, "in")) {
+                                                        if (!strextend(&context->member_signature, argument_type))
+                                                                return log_oom();
+                                                } else if (streq(argument_direction, "out")) {
+                                                        if (!strextend(&context->member_result, argument_type))
+                                                                return log_oom();
+                                                } else
+                                                        log_error("Unexpected method  direction value '%s'.", argument_direction);
+                                        }
+
+                                        argument_type = mfree(argument_type);
+                                        argument_direction = mfree(argument_direction);
+                                }
+
+                                state = STATE_METHOD;
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in method . (1)");
+
+                        break;
+
+                case STATE_METHOD_ARG_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE)
+                                state = STATE_METHOD_ARG;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in method . (2)");
+
+                        break;
+
+                case STATE_METHOD_ARG_TYPE:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                free_and_replace(argument_type, name);
+
+                                state = STATE_METHOD_ARG;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in method . (3)");
+
+                        break;
+
+                case STATE_METHOD_ARG_DIRECTION:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                free_and_replace(argument_direction, name);
+
+                                state = STATE_METHOD_ARG;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in method . (4)");
+
+                        break;
+
+                case STATE_SIGNAL:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_SIGNAL_NAME;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected  attribute %s.",
+                                                               name);
+                        } else if (t == XML_TAG_OPEN) {
+                                if (streq_ptr(name, "arg"))
+                                        state = STATE_SIGNAL_ARG;
+                                else if (streq_ptr(name, "annotation")) {
+                                        r = parse_xml_annotation(context, &context->member_flags);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unexpected  tag %s.",
+                                                               name);
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "signal"))) {
+
+                                if (n_depth == 0) {
+                                        if (context->ops->on_signal) {
+                                                r = context->ops->on_signal(context->interface_name, context->member_name, context->member_signature, context->member_flags, context->userdata);
+                                                if (r < 0)
+                                                        return r;
+                                        }
+
+                                        context_reset_member(context);
+                                }
+
+                                state = STATE_INTERFACE;
+
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (1)");
+
+                        break;
+
+                case STATE_SIGNAL_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                if (n_depth == 0)
+                                        free_and_replace(context->member_name, name);
+
+                                state = STATE_SIGNAL;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (2)");
+
+                        break;
+
+                case STATE_SIGNAL_ARG:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_SIGNAL_ARG_NAME;
+                                else if (streq_ptr(name, "type"))
+                                        state = STATE_SIGNAL_ARG_TYPE;
+                                else if (streq_ptr(name, "direction"))
+                                        state = STATE_SIGNAL_ARG_DIRECTION;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected signal  attribute %s.",
+                                                               name);
+                        } else if (t == XML_TAG_OPEN) {
+                                if (streq_ptr(name, "annotation")) {
+                                        r = parse_xml_annotation(context, NULL);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unexpected signal  tag %s.",
+                                                               name);
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "arg"))) {
+
+                                if (argument_type) {
+                                        if (!argument_direction || streq(argument_direction, "out")) {
+                                                if (!strextend(&context->member_signature, argument_type))
+                                                        return log_oom();
+                                        } else
+                                                log_error("Unexpected signal  direction value '%s'.", argument_direction);
+
+                                        argument_type = mfree(argument_type);
+                                }
+
+                                state = STATE_SIGNAL;
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in signal  (1).");
+
+                        break;
+
+                case STATE_SIGNAL_ARG_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE)
+                                state = STATE_SIGNAL_ARG;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in signal  (2).");
+
+                        break;
+
+                case STATE_SIGNAL_ARG_TYPE:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                free_and_replace(argument_type, name);
+
+                                state = STATE_SIGNAL_ARG;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in signal  (3).");
+
+                        break;
+
+                case STATE_SIGNAL_ARG_DIRECTION:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                free_and_replace(argument_direction, name);
+
+                                state = STATE_SIGNAL_ARG;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in signal . (4)");
+
+                        break;
+
+                case STATE_PROPERTY:
+
+                        if (t == XML_ATTRIBUTE_NAME) {
+                                if (streq_ptr(name, "name"))
+                                        state = STATE_PROPERTY_NAME;
+                                else if (streq_ptr(name, "type"))
+                                        state  = STATE_PROPERTY_TYPE;
+                                else if (streq_ptr(name, "access"))
+                                        state  = STATE_PROPERTY_ACCESS;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                               "Unexpected  attribute %s.",
+                                                               name);
+                        } else if (t == XML_TAG_OPEN) {
+
+                                if (streq_ptr(name, "annotation")) {
+                                        r = parse_xml_annotation(context, &context->member_flags);
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unexpected  tag %s.",
+                                                               name);
+
+                        } else if (t == XML_TAG_CLOSE_EMPTY ||
+                                   (t == XML_TAG_CLOSE && streq_ptr(name, "property"))) {
+
+                                if (n_depth == 0) {
+                                        if (context->ops->on_property) {
+                                                r = context->ops->on_property(context->interface_name, context->member_name, context->member_signature, context->member_writable, context->member_flags, context->userdata);
+                                                if (r < 0)
+                                                        return r;
+                                        }
+
+                                        context_reset_member(context);
+                                }
+
+                                state = STATE_INTERFACE;
+
+                        } else if (t != XML_TEXT || !in_charset(name, WHITESPACE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (1)");
+
+                        break;
+
+                case STATE_PROPERTY_NAME:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                if (n_depth == 0)
+                                        free_and_replace(context->member_name, name);
+
+                                state = STATE_PROPERTY;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (2)");
+
+                        break;
+
+                case STATE_PROPERTY_TYPE:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+                                if (n_depth == 0)
+                                        free_and_replace(context->member_signature, name);
+
+                                state = STATE_PROPERTY;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (3)");
+
+                        break;
+
+                case STATE_PROPERTY_ACCESS:
+
+                        if (t == XML_ATTRIBUTE_VALUE) {
+
+                                if (streq(name, "readwrite") || streq(name, "write"))
+                                        context->member_writable = true;
+
+                                state = STATE_PROPERTY;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unexpected token in . (4)");
+
+                        break;
+                }
+        }
+}
+
+int parse_xml_introspect(const char *prefix, const char *xml, const XMLIntrospectOps *ops, void *userdata) {
+        /* Tokenizes 'xml' and passes every top-level <node> element on to parse_xml_node(); 0 on success. */
+        _cleanup_(context_reset_interface) Context context = {
+                .ops = ops,
+                .userdata = userdata,
+                .current = xml,
+        };
+
+        assert(prefix);
+        assert(xml);
+        assert(ops);
+
+        for (;;) {
+                _cleanup_free_ char *name = NULL;
+                int token, r;
+
+                token = xml_tokenize(&context.current, &name, &context.xml_state, NULL);
+                if (token < 0)
+                        return log_error_errno(token, "XML parse error");
+                if (token == XML_END)
+                        break;
+
+                /* Outside of <node> only whitespace-only text is tolerated; anything else is rejected. */
+                if (token == XML_TEXT && in_charset(name, WHITESPACE))
+                        continue;
+                if (token != XML_TAG_OPEN)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected token.");
+                if (!streq(name, "node"))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                               "Unexpected tag '%s' in introspection data.", name);
+
+                r = parse_xml_node(&context, prefix, 0);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
diff --git a/src/busctl/busctl-introspect.h b/src/busctl/busctl-introspect.h
new file mode 100644
index 0000000..720a0df
--- /dev/null
+++ b/src/busctl/busctl-introspect.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+typedef struct XMLIntrospectOps { /* parser callbacks: a NULL entry is skipped, a negative return value aborts parsing; all but on_path fire only while n_depth is 0 */
+        int (*on_path)(const char *path, void *userdata); /* called when a <node> element is closed */
+        int (*on_interface)(const char *name, uint64_t flags, void *userdata); /* called when an <interface> element is closed */
+        int (*on_method)(const char *interface, const char *name, const char *signature, const char *result, uint64_t flags, void *userdata); /* called when a <method> is closed; signature/result are the concatenated "in"/"out" argument types */
+        int (*on_signal)(const char *interface, const char *name, const char *signature, uint64_t flags, void *userdata); /* called when a <signal> is closed; signature is the concatenated argument types */
+        int (*on_property)(const char *interface, const char *name, const char *signature, bool writable, uint64_t flags, void *userdata); /* called when a <property> is closed; signature is its "type" attribute, writable is set for access "readwrite" or "write" */
+} XMLIntrospectOps;
+/* Parses the XML in 'xml', reporting through 'ops' (with 'userdata'); node names not starting with '/' are joined onto 'prefix'. Returns 0 on success, negative on failure. */
+int parse_xml_introspect(const char *prefix, const char *xml, const XMLIntrospectOps *ops, void *userdata);
diff --git a/src/busctl/busctl.c b/src/busctl/busctl.c
new file mode 100644
index 0000000..39d22f2
--- /dev/null
+++ b/src/busctl/busctl.c
@@ -0,0 +1,2593 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "bus-dump.h"
+#include "bus-internal.h"
+#include "bus-message.h"
+#include "bus-signature.h"
+#include "bus-type.h"
+#include "bus-util.h"
+#include "busctl-introspect.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "glyph-util.h"
+#include "json.h"
+#include "log.h"
+#include "main-func.h"
+#include "memstream-util.h"
+#include "os-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "runtime-scope.h"
+#include "set.h"
+#include "sort-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "user-util.h"
+#include "verbs.h"
+#include "version.h"
+
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF;
+static PagerFlags arg_pager_flags = 0;
+static bool arg_legend = true;
+static int arg_full = -1; /* when > 0, list_bus_names() calls table_set_width(table, 0) */
+static const char *arg_address = NULL; /* if set, acquire_bus() passes it to sd_bus_set_address() instead of choosing by transport */
+static bool arg_unique = false; /* arg_unique/arg_acquired/arg_activatable: which name lists list_bus_names() requests; if none is set, all three are turned on */
+static bool arg_acquired = false;
+static bool arg_activatable = false;
+static bool arg_show_machine = false; /* list_bus_names() additionally displays COLUMN_MACHINE when set */
+static char **arg_matches = NULL;
+static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; /* selects the address setter in acquire_bus() when arg_address is unset */
+static const char *arg_host = NULL; /* handed to the remote and machine address setters in acquire_bus() */
+static RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; /* user vs. system for the local transport; also passed on for the machine transport */
+static size_t arg_snaplen = 4096;
+static bool arg_list = false;
+static bool arg_quiet = false;
+static bool arg_verbose = false;
+static bool arg_xml_interface = false;
+static bool arg_expect_reply = true;
+static bool arg_auto_start = true;
+static bool arg_allow_interactive_authorization = true;
+static bool arg_augment_creds = true;
+static bool arg_watch_bind = false; /* handed to sd_bus_set_watch_bind() in acquire_bus() */
+static usec_t arg_timeout = 0;
+static const char *arg_destination = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_matches, strv_freep); /* NOTE(review): presumably frees arg_matches when the program exits; confirm against the macro */
+
+#define NAME_IS_ACQUIRED INT_TO_PTR(1) /* hashmap value list_bus_names() stores for names from the acquired list */
+#define NAME_IS_ACTIVATABLE INT_TO_PTR(2) /* hashmap value list_bus_names() stores for names from the activatable list */
+
+static int json_transform_message(sd_bus_message *m, JsonVariant **ret); /* forward declaration of a static function defined elsewhere in this file */
+
+static int acquire_bus(bool set_monitor, sd_bus **ret) {
+        /* Allocates a bus object, applies the arg_* settings to it, starts it and hands it to the caller
+         * through *ret. When set_monitor is true, monitor mode plus credential, timestamp and fd negotiation
+         * are enabled before that. Returns 0 on success; every failure path returns the value produced by
+         * the log helper it calls. */
+        _cleanup_(sd_bus_close_unrefp) sd_bus *b = NULL;
+        int r;
+
+        r = sd_bus_new(&b);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate bus: %m");
+
+        /* The return value is deliberately ignored via the (void) cast. */
+        (void) sd_bus_set_description(b, "busctl");
+
+        if (set_monitor) {
+                /* Monitor mode comes with credential, timestamp and fd negotiation; any failure is fatal. */
+                r = sd_bus_set_monitor(b, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set monitor mode: %m");
+
+                r = sd_bus_negotiate_creds(b, true, _SD_BUS_CREDS_ALL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to enable credentials: %m");
+
+                r = sd_bus_negotiate_timestamp(b, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to enable timestamps: %m");
+
+                r = sd_bus_negotiate_fds(b, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to enable fds: %m");
+        }
+
+        /* These two settings are applied regardless of set_monitor. */
+        r = sd_bus_set_bus_client(b, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set bus client: %m");
+
+        r = sd_bus_set_watch_bind(b, arg_watch_bind);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set watch-bind setting to '%s': %m",
+                                       yes_no(arg_watch_bind));
+
+        /* Address selection: an explicit arg_address wins. Otherwise the setter is chosen by arg_transport
+         * and, for the local transport, by arg_runtime_scope. Values without a matching branch end up in
+         * assert_not_reached(). */
+        if (arg_address)
+                r = sd_bus_set_address(b, arg_address);
+        else if (arg_transport == BUS_TRANSPORT_LOCAL)
+                switch (arg_runtime_scope) {
+
+                case RUNTIME_SCOPE_USER:
+                        r = bus_set_address_user(b);
+                        break;
+
+                case RUNTIME_SCOPE_SYSTEM:
+                        r = bus_set_address_system(b);
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+        else if (arg_transport == BUS_TRANSPORT_REMOTE)
+                r = bus_set_address_system_remote(b, arg_host);
+        else if (arg_transport == BUS_TRANSPORT_MACHINE)
+                r = bus_set_address_machine(b, arg_runtime_scope, arg_host);
+        else
+                assert_not_reached();
+
+        /* Common error check for whichever address setter ran above. */
+        if (r < 0)
+                return bus_log_address_error(r, arg_transport);
+
+        r = sd_bus_start(b);
+        if (r < 0)
+                return bus_log_connect_error(r, arg_transport);
+
+        /* Hand the connection over to the caller through TAKE_PTR(). */
+        *ret = TAKE_PTR(b);
+
+        return 0;
+}
+
+static int list_bus_names(int argc, char **argv, void *userdata) {
+        _cleanup_strv_free_ char **acquired = NULL, **activatable = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_hashmap_free_ Hashmap *names = NULL;
+        _cleanup_(table_unrefp) Table *table = NULL;
+        char *k;
+        void *v;
+        int r;
+
+        enum {
+                COLUMN_ACTIVATABLE,
+                COLUMN_NAME,
+                COLUMN_PID,
+                COLUMN_PROCESS,
+                COLUMN_USER,
+                COLUMN_CONNECTION,
+                COLUMN_UNIT,
+                COLUMN_SESSION,
+                COLUMN_DESCRIPTION,
+                COLUMN_MACHINE,
+        };
+
+        if (!arg_unique && !arg_acquired && !arg_activatable)
+                arg_unique = arg_acquired = arg_activatable = true;
+
+        r = acquire_bus(false, &bus);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_list_names(bus,
+                              (arg_acquired || arg_unique) ? &acquired : NULL,
+                              arg_activatable ? &activatable : NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to list names: %m");
+
+        names = hashmap_new(&string_hash_ops);
+        if (!names)
+                return log_oom();
+
+        STRV_FOREACH(i, acquired) {
+                r = hashmap_put(names, *i, NAME_IS_ACQUIRED);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add to hashmap: %m");
+        }
+
+        STRV_FOREACH(i, activatable) {
+                r = hashmap_put(names, *i, NAME_IS_ACTIVATABLE);
+                if (r < 0 && r != -EEXIST)
+                        return log_error_errno(r, "Failed to add to hashmap: %m");
+        }
+
+        table = table_new("activatable",
+                          "name",
+                          "pid",
+                          "process",
+                          "user",
+                          "connection",
+                          "unit",
+                          "session",
+                          "description",
+                          "machine");
+        if (!table)
+                return log_oom();
+
+        if (arg_full > 0)
+                table_set_width(table, 0);
+
+        r = table_set_align_percent(table, table_get_cell(table, 0, COLUMN_PID), 100);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set alignment: %m");
+
+        table_set_ersatz_string(table, TABLE_ERSATZ_DASH);
+
+        r = table_set_sort(table, (size_t) COLUMN_NAME);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set sort column: %m");
+
+        if (arg_show_machine)
+                r = table_set_display(table, (size_t) COLUMN_NAME,
+                                             (size_t) COLUMN_PID,
+                                             (size_t) COLUMN_PROCESS,
+                                             (size_t) COLUMN_USER,
+                                             (size_t) COLUMN_CONNECTION,
+                                             (size_t) COLUMN_UNIT,
+                                             (size_t) COLUMN_SESSION,
+                                             (size_t) COLUMN_DESCRIPTION,
+                                             (size_t) COLUMN_MACHINE);
+        else
+                r = table_set_display(table, (size_t) COLUMN_NAME,
+                                             (size_t) COLUMN_PID,
+                                             (size_t) COLUMN_PROCESS,
+                                             (size_t) COLUMN_USER,
+                                             (size_t) COLUMN_CONNECTION,
+                                             (size_t) COLUMN_UNIT,
+                                             (size_t) COLUMN_SESSION,
+                                             (size_t) COLUMN_DESCRIPTION);
+
+        if (r < 0)
+                return log_error_errno(r, "Failed to set columns to display: %m");
+
+        HASHMAP_FOREACH_KEY(v, k, names) {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+
+                if (v == NAME_IS_ACTIVATABLE) {
+                        r = table_add_many(
+                                        table,
+                                        TABLE_INT, PTR_TO_INT(v),
+                                        TABLE_STRING, k,
+                                        TABLE_EMPTY,
+                                        TABLE_EMPTY,
+                                        TABLE_EMPTY,
+                                        TABLE_STRING, "(activatable)", TABLE_SET_COLOR, ansi_grey(),
+                                        TABLE_EMPTY,
+                                        TABLE_EMPTY,
+                                        TABLE_EMPTY,
+                                        TABLE_EMPTY);
+                        if (r < 0)
+                                return table_log_add_error(r);
+
+                        continue;
+                }
+
+                assert(v == NAME_IS_ACQUIRED);
+
+                if (!arg_unique && k[0] == ':')
+                        continue;
+
+                if (!arg_acquired && k[0] != ':')
+                        continue;
+
+                r = table_add_many(table,
+                                   TABLE_INT, PTR_TO_INT(v),
+                                   TABLE_STRING, k);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                r = sd_bus_get_name_creds(
+                                bus, k,
+                                (arg_augment_creds ? SD_BUS_CREDS_AUGMENT : 0) |
+                                SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID|SD_BUS_CREDS_COMM|
+                                SD_BUS_CREDS_UNIQUE_NAME|SD_BUS_CREDS_UNIT|SD_BUS_CREDS_SESSION|
+                                SD_BUS_CREDS_DESCRIPTION, &creds);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to acquire credentials of service %s, ignoring: %m", k);
+
+                        r = table_fill_empty(table, COLUMN_MACHINE);
+                } else {
+                        const char *unique = NULL, *session = NULL, *unit = NULL, *cn = NULL;
+                        pid_t pid;
+                        uid_t uid;
+
+                        r = sd_bus_creds_get_pid(creds, &pid);
+                        if (r >= 0) {
+                                const char *comm = NULL;
+
+                                (void) sd_bus_creds_get_comm(creds, &comm);
+
+                                r = table_add_many(table,
+                                                   TABLE_PID, pid,
+                                                   TABLE_STRING, strna(comm));
+                        } else
+                                r = table_add_many(table, TABLE_EMPTY, TABLE_EMPTY);
+                        if (r < 0)
+                                return table_log_add_error(r);
+
+                        r = sd_bus_creds_get_euid(creds, &uid);
+                        if (r >= 0) {
+                                _cleanup_free_ char *u = NULL;
+
+                                u = uid_to_name(uid);
+                                if (!u)
+                                        return log_oom();
+
+                                r = table_add_cell(table, NULL, TABLE_STRING, u);
+                        } else
+                                r = table_add_cell(table, NULL, TABLE_EMPTY, NULL);
+                        if (r < 0)
+                                return table_log_add_error(r);
+
+                        (void) sd_bus_creds_get_unique_name(creds, &unique);
+                        (void) sd_bus_creds_get_unit(creds, &unit);
+                        (void) sd_bus_creds_get_session(creds, &session);
+                        (void) sd_bus_creds_get_description(creds, &cn);
+
+                        r = table_add_many(
+                                        table,
+                                        TABLE_STRING, unique,
+                                        TABLE_STRING, unit,
+                                        TABLE_STRING, session,
+                                        TABLE_STRING, cn);
+                }
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                if (arg_show_machine) {
+                        sd_id128_t mid;
+
+                        r = sd_bus_get_name_machine_id(bus, k, &mid);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to acquire credentials of service %s, ignoring: %m", k);
+                        else {
+                                r = table_add_cell(table, NULL, TABLE_ID128, &mid);
+                                if (r < 0)
+                                        return table_log_add_error(r);
+
+                                continue; /* line fully filled, no need to fill the remainder below */
+                        }
+                }
+
+                r = table_fill_empty(table, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to fill line: %m");
+        }
+
+        return table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend);
+}
+
+static void print_subtree(const char *prefix, const char *path, char **l) {
+        /* The list is expected to be sorted. Step over every entry that equals the node itself, so
+         * that l ends up at the first candidate entry below path. */
+        while (*l && streq(*l, path))
+                l++;
+
+        if (!*l)
+                return;
+
+        const char
+                *vertical = strjoina(prefix, special_glyph(SPECIAL_GLYPH_TREE_VERTICAL)),
+                *space = strjoina(prefix, special_glyph(SPECIAL_GLYPH_TREE_SPACE));
+
+        /* Each iteration prints the next entry below path and then recurses into its subtree. */
+        while (*l && path_startswith(*l, path)) {
+                bool more_siblings = false;
+                char **next;
+
+                /* Scan past everything below *l. If an entry follows that is still below path but
+                 * no longer below *l, then *l is not the last one and gets a branch glyph. */
+                for (next = l + 1; *next && path_startswith(*next, path); next++)
+                        if (!path_startswith(*next, *l)) {
+                                more_siblings = true;
+                                break;
+                        }
+
+                printf("%s%s %s\n",
+                       prefix,
+                       special_glyph(more_siblings ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT),
+                       *l);
+
+                print_subtree(more_siblings ? vertical : space, *l, l);
+                l = next;
+        }
+}
+
+static void print_tree(char **l) {
+        /* With arg_list set the paths are handed to strv_print() as-is, no tree drawing. */
+        if (arg_list) {
+                strv_print(l);
+                return;
+        }
+
+        if (strv_isempty(l)) {
+                printf("No objects discovered.\n");
+                return;
+        }
+
+        /* A list consisting of nothing but "/" has no subtree to draw. */
+        if (!l[1] && streq(l[0], "/")) {
+                printf("Only root object discovered.\n");
+                return;
+        }
+
+        print_subtree("", "/", l);
+}
+
+static int on_path(const char *path, void *userdata) {
+        /* XML introspection callback: userdata is the Set of object paths, path gets copied into it. */
+        Set *known = ASSERT_PTR(userdata);
+
+        if (set_put_strdup(&known, path) < 0)
+                return log_oom();
+
+        return 0;
+}
+
+static int find_nodes(sd_bus *bus, const char *service, const char *path, Set *paths) {
+        /* Introspects the object at path of the given service and feeds every object path reported
+         * by the XML parser into paths (through on_path()). A failing Introspect call is reported
+         * on stdout and its error code is returned. */
+        static const XMLIntrospectOps path_ops = {
+                .on_path = on_path,
+        };
+
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        const char *document;
+        int r;
+
+        r = sd_bus_call_method(
+                        bus,
+                        service,
+                        path,
+                        "org.freedesktop.DBus.Introspectable",
+                        "Introspect",
+                        &error,
+                        &reply,
+                        NULL);
+        if (r < 0) {
+                printf("%sFailed to introspect object %s of service %s: %s%s\n",
+                       ansi_highlight_red(),
+                       path, service, bus_error_message(&error, r),
+                       ansi_normal());
+                return r;
+        }
+
+        r = sd_bus_message_read(reply, "s", &document);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        return parse_xml_introspect(path, document, &path_ops, paths);
+}
+
+static int tree_one(sd_bus *bus, const char *service) {
+        /* Discovers all objects of one service, starting at "/" and following whatever paths
+         * find_nodes() reports, then prints the successfully introspected ones via print_tree().
+         * The first introspection error is remembered and returned after printing. */
+        _cleanup_set_free_ Set *paths = NULL, *done = NULL, *failed = NULL;
+        _cleanup_free_ char **l = NULL;
+        int r;
+
+        r = set_put_strdup(&paths, "/");
+        if (r < 0)
+                return log_oom();
+
+        for (;;) {
+                _cleanup_free_ char *p = set_steal_first(paths);
+                Set **bucket;
+                int q;
+
+                if (!p)
+                        break;
+
+                /* Already handled, one way or the other. */
+                if (set_contains(done, p) || set_contains(failed, p))
+                        continue;
+
+                q = find_nodes(bus, service, p, paths);
+                if (q < 0) {
+                        if (r >= 0)
+                                r = q;
+
+                        bucket = &failed;
+                } else
+                        bucket = &done;
+
+                /* Ownership of the path string moves into the chosen set. */
+                q = set_ensure_consume(bucket, &string_hash_ops_free, TAKE_PTR(p));
+                assert(q != 0);
+                if (q < 0)
+                        return log_oom();
+        }
+
+        pager_open(arg_pager_flags);
+
+        l = set_get_strv(done);
+        if (!l)
+                return log_oom();
+
+        strv_sort(l);
+        print_tree(l);
+
+        fflush(stdout);
+
+        return r;
+}
+
+/* Prints the object tree of one or more bus services.
+ *
+ * If service names are given in argv[1..], each is validated up front and then shown in turn.
+ * Otherwise the names currently on the bus are enumerated and filtered by arg_unique and
+ * arg_acquired (if neither is set, acquired names are shown).
+ *
+ * A failure for one service does not stop the iteration; the first error seen is returned. */
+static int tree(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        /* Do superficial verification of arguments before even opening the bus */
+        STRV_FOREACH(i, strv_skip(argv, 1))
+                if (!sd_bus_service_name_is_valid(*i))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Invalid bus service name: %s", *i);
+
+        if (!arg_unique && !arg_acquired)
+                arg_acquired = true;
+
+        r = acquire_bus(false, &bus);
+        if (r < 0)
+                return r;
+
+        if (argc <= 1) {
+                _cleanup_strv_free_ char **names = NULL;
+                bool not_first = false;
+
+                r = sd_bus_list_names(bus, &names, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get name list: %m");
+
+                pager_open(arg_pager_flags);
+
+                STRV_FOREACH(i, names) {
+                        int q;
+
+                        /* Unique names start with ':'; skip them unless they were asked for. */
+                        if (!arg_unique && (*i)[0] == ':')
+                                continue;
+
+                        /* Everything else is an acquired (well-known) name; skip those unless
+                         * they were asked for. This is the same pair of checks the name listing
+                         * applies to its table rows. */
+                        if (!arg_acquired && (*i)[0] != ':')
+                                continue;
+
+                        if (not_first)
+                                printf("\n");
+
+                        printf("Service %s%s%s:\n", ansi_highlight(), *i, ansi_normal());
+
+                        q = tree_one(bus, *i);
+                        if (q < 0 && r >= 0)
+                                r = q;
+
+                        not_first = true;
+                }
+        } else
+                STRV_FOREACH(i, strv_skip(argv, 1)) {
+                        int q;
+
+                        /* Blank line between the trees of consecutive services */
+                        if (i > argv+1)
+                                printf("\n");
+
+                        /* Only print a per-service heading if more than one service was given */
+                        if (argv[2]) {
+                                pager_open(arg_pager_flags);
+                                printf("Service %s%s%s:\n", ansi_highlight(), *i, ansi_normal());
+                        }
+
+                        q = tree_one(bus, *i);
+                        if (q < 0 && r >= 0)
+                                r = q;
+                }
+
+        return r;
+}
+
+static int format_cmdline(sd_bus_message *m, FILE *f, bool needs_space) {
+        int r;
+        /* Reads fields from m until sd_bus_message_peek_type() reports there is nothing left at the
+         * current level, and writes them to f separated by single spaces. needs_space tells
+         * whether a separator has to precede the first field written. Containers are entered and
+         * formatted recursively; arrays are prefixed with their element count, variants with their
+         * contents signature. Returns a negative error on failure, otherwise the final needs_space
+         * state (callers test it with "> 0"). */
+        for (;;) {
+                const char *contents = NULL;
+                char type;
+                union {
+                        uint8_t u8;
+                        uint16_t u16;
+                        int16_t s16;
+                        uint32_t u32;
+                        int32_t s32;
+                        uint64_t u64;
+                        int64_t s64;
+                        double d64;
+                        const char *string;
+                        int i;
+                } basic;
+
+                r = sd_bus_message_peek_type(m, &type, &contents);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return needs_space; /* nothing more to read at this level */
+                /* Container types: descend, print an optional prefix, then recurse for the contents */
+                if (bus_type_is_container(type) > 0) {
+
+                        r = sd_bus_message_enter_container(m, type, contents);
+                        if (r < 0)
+                                return r;
+
+                        if (type == SD_BUS_TYPE_ARRAY) {
+                                unsigned n = 0;
+
+                                /* count array entries */
+                                for (;;) {
+
+                                        r = sd_bus_message_skip(m, contents);
+                                        if (r < 0)
+                                                return r;
+                                        if (r == 0)
+                                                break;
+
+                                        n++;
+                                }
+                                /* Counting skipped over the entries; rewind (complete=false) so that the
+                                 * recursive call below can read them again */
+                                r = sd_bus_message_rewind(m, false);
+                                if (r < 0)
+                                        return r;
+
+                                if (needs_space)
+                                        fputc(' ', f);
+
+                                fprintf(f, "%u", n);
+                                needs_space = true;
+
+                        } else if (type == SD_BUS_TYPE_VARIANT) {
+
+                                if (needs_space)
+                                        fputc(' ', f);
+
+                                fprintf(f, "%s", contents);
+                                needs_space = true;
+                        }
+                        /* Format whatever is inside the container we just entered */
+                        r = format_cmdline(m, f, needs_space);
+                        if (r < 0)
+                                return r;
+
+                        needs_space = r > 0;
+
+                        r = sd_bus_message_exit_container(m);
+                        if (r < 0)
+                                return r;
+
+                        continue;
+                }
+                /* Basic type: read it into the union and print the member that matches the type */
+                r = sd_bus_message_read_basic(m, type, &basic);
+                if (r < 0)
+                        return r;
+
+                if (needs_space)
+                        fputc(' ', f);
+
+                switch (type) {
+                case SD_BUS_TYPE_BYTE:
+                        fprintf(f, "%u", basic.u8);
+                        break;
+
+                case SD_BUS_TYPE_BOOLEAN:
+                        fputs(true_false(basic.i), f);
+                        break;
+
+                case SD_BUS_TYPE_INT16:
+                        fprintf(f, "%i", basic.s16);
+                        break;
+
+                case SD_BUS_TYPE_UINT16:
+                        fprintf(f, "%u", basic.u16);
+                        break;
+
+                case SD_BUS_TYPE_INT32:
+                        fprintf(f, "%i", basic.s32);
+                        break;
+
+                case SD_BUS_TYPE_UINT32:
+                        fprintf(f, "%u", basic.u32);
+                        break;
+
+                case SD_BUS_TYPE_INT64:
+                        fprintf(f, "%" PRIi64, basic.s64);
+                        break;
+
+                case SD_BUS_TYPE_UINT64:
+                        fprintf(f, "%" PRIu64, basic.u64);
+                        break;
+
+                case SD_BUS_TYPE_DOUBLE:
+                        fprintf(f, "%g", basic.d64);
+                        break;
+
+                case SD_BUS_TYPE_STRING:
+                case SD_BUS_TYPE_OBJECT_PATH:
+                case SD_BUS_TYPE_SIGNATURE: {
+                        _cleanup_free_ char *b = NULL;
+                        /* String-like values are escaped with cescape() and wrapped in double quotes */
+                        b = cescape(basic.string);
+                        if (!b)
+                                return -ENOMEM;
+
+                        fprintf(f, "\"%s\"", b);
+                        break;
+                }
+
+                case SD_BUS_TYPE_UNIX_FD:
+                        fprintf(f, "%i", basic.i);
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+
+                needs_space = true;
+        }
+}
+
+typedef struct Member {         /* one interface, method, signal or property found during introspection */
+        const char *type;       /* "interface", "method", "signal" or "property"; string literal, not freed */
+        char *interface;        /* name of the interface; freed by member_free() */
+        char *name;             /* member name; left unset for "interface" entries */
+        char *signature;        /* signature as handed to the on_method()/on_signal()/on_property() callbacks */
+        char *result;           /* result signature; only on_method() sets it */
+        char *value;            /* formatted current property value, filled in by introspect() */
+        bool writable;          /* only on_property() sets it, from its writable argument */
+        uint64_t flags;         /* flags from the introspection callbacks; introspect() tests SD_BUS_VTABLE_* bits */
+} Member;
+
+static void member_hash_func(const Member *m, struct siphash *state) {
+        uint64_t arity;
+
+        assert(m);
+        assert(m->type);
+
+        /* The type always goes in, followed by a count of how many of type, name and interface
+         * are present, and then the optional strings themselves. */
+        string_hash_func(m->type, state);
+
+        arity = 1 + (m->name ? 1 : 0) + (m->interface ? 1 : 0);
+        uint64_hash_func(&arity, state);
+
+        if (m->name)
+                string_hash_func(m->name, state);
+        if (m->signature)
+                string_hash_func(m->signature, state);
+        if (m->interface)
+                string_hash_func(m->interface, state);
+}
+
+static int member_compare_func(const Member *x, const Member *y) {
+        assert(x);
+        assert(y);
+        assert(x->type);
+        assert(y->type);
+
+        /* Ordering keys, most significant first: interface, type, name, signature. Each further
+         * key is only consulted while all previous ones compared equal. */
+        int d = strcmp_ptr(x->interface, y->interface);
+
+        if (d == 0)
+                d = strcmp(x->type, y->type);
+        if (d == 0)
+                d = strcmp_ptr(x->name, y->name);
+        if (d == 0)
+                d = strcmp_ptr(x->signature, y->signature);
+
+        return d;
+}
+
+static int member_compare_funcp(Member * const *a, Member * const *b) {
+        /* Adapter for sorting an array of Member pointers: dereference once, then compare. */
+        const Member *lhs = *a, *rhs = *b;
+
+        return member_compare_func(lhs, rhs);
+}
+
+static Member* member_free(Member *m) {
+        /* Releases the owned strings and the Member itself; always yields NULL. The type field
+         * points to a string literal and is therefore left alone. */
+        if (m) {
+                free(m->interface);
+                free(m->name);
+                free(m->signature);
+                free(m->result);
+                free(m->value);
+        }
+
+        return mfree(m);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(Member*, member_free);
+
+static Set* member_set_free(Set *s) {
+        /* Destroys the set and runs member_free() on every entry it still holds. */
+        Set *rest = set_free_with_destructor(s, member_free);
+
+        return rest;
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(Set*, member_set_free);
+
+static int on_interface(const char *interface, uint64_t flags, void *userdata) {
+        /* XML introspection callback: stores an "interface" Member for the given interface name in
+         * the Set passed as userdata. A name seen twice is rejected as invalid introspection data. */
+        _cleanup_(member_freep) Member *m = NULL;
+        Set *members = ASSERT_PTR(userdata);
+        int r;
+
+        assert(interface);
+
+        m = new(Member, 1);
+        if (!m)
+                return log_oom();
+
+        *m = (Member) {
+                .type = "interface",
+                .flags = flags,
+        };
+
+        if (free_and_strdup(&m->interface, interface) < 0)
+                return log_oom();
+
+        r = set_put(members, m);
+        if (r < 0)
+                return log_oom();
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                       "Invalid introspection data: duplicate interface '%s'.", interface);
+
+        /* The set owns the entry from here on, keep the cleanup handler away from it. */
+        m = NULL;
+        return 0;
+}
+
+/* XML introspection callback: records one method as a "method" Member in the Set passed as
+ * userdata. interface and name must be set; signature and result are copied as given. Returns 0
+ * on success and a negative error if memory runs out or the same method was already recorded. */
+static int on_method(const char *interface, const char *name, const char *signature, const char *result, uint64_t flags, void *userdata) {
+        _cleanup_(member_freep) Member *m = NULL;
+        Set *members = ASSERT_PTR(userdata); /* same validation as on_interface() */
+        int r;
+
+        assert(interface);
+        assert(name);
+
+        m = new(Member, 1);
+        if (!m)
+                return log_oom();
+
+        *m = (Member) {
+                .type = "method",
+                .flags = flags,
+        };
+
+        r = free_and_strdup(&m->interface, interface);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->name, name);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->signature, signature);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->result, result);
+        if (r < 0)
+                return log_oom();
+
+        r = set_put(members, m);
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                       "Invalid introspection data: duplicate method '%s' on interface '%s'.", name, interface);
+        if (r < 0)
+                return log_oom();
+
+        TAKE_PTR(m); /* the set owns the entry now */
+        return 0;
+}
+
+/* XML introspection callback: records one signal as a "signal" Member in the Set passed as
+ * userdata. interface and name must be set; signature is copied as given. Returns 0 on success
+ * and a negative error if memory runs out or the same signal was already recorded. */
+static int on_signal(const char *interface, const char *name, const char *signature, uint64_t flags, void *userdata) {
+        _cleanup_(member_freep) Member *m = NULL;
+        Set *members = ASSERT_PTR(userdata); /* same validation as on_interface() */
+        int r;
+
+        assert(interface);
+        assert(name);
+
+        m = new(Member, 1);
+        if (!m)
+                return log_oom();
+
+        *m = (Member) {
+                .type = "signal",
+                .flags = flags,
+        };
+
+        r = free_and_strdup(&m->interface, interface);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->name, name);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->signature, signature);
+        if (r < 0)
+                return log_oom();
+
+        r = set_put(members, m);
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                       "Invalid introspection data: duplicate signal '%s' on interface '%s'.", name, interface);
+        if (r < 0)
+                return log_oom();
+
+        TAKE_PTR(m); /* the set owns the entry now */
+        return 0;
+}
+
+/* XML introspection callback: records one property as a "property" Member in the Set passed as
+ * userdata, including whether it is writable. interface and name must be set; signature is copied
+ * as given. Returns 0 on success and a negative error if memory runs out or the same property was
+ * already recorded. */
+static int on_property(const char *interface, const char *name, const char *signature, bool writable, uint64_t flags, void *userdata) {
+        _cleanup_(member_freep) Member *m = NULL;
+        Set *members = ASSERT_PTR(userdata); /* same validation as on_interface() */
+        int r;
+
+        assert(interface);
+        assert(name);
+
+        m = new(Member, 1);
+        if (!m)
+                return log_oom();
+
+        *m = (Member) {
+                .type = "property",
+                .flags = flags,
+                .writable = writable,
+        };
+
+        r = free_and_strdup(&m->interface, interface);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->name, name);
+        if (r < 0)
+                return log_oom();
+
+        r = free_and_strdup(&m->signature, signature);
+        if (r < 0)
+                return log_oom();
+
+        r = set_put(members, m);
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                       "Invalid introspection data: duplicate property '%s' on interface '%s'.", name, interface);
+        if (r < 0)
+                return log_oom();
+
+        TAKE_PTR(m); /* the set owns the entry now */
+        return 0;
+}
+
+DEFINE_PRIVATE_HASH_OPS(member_hash_ops, Member, member_hash_func, member_compare_func); /* used by introspect() for its Set of Member entries */
+
+static int introspect(int argc, char **argv, void *userdata) {
+        static const XMLIntrospectOps ops = {
+                .on_interface = on_interface,
+                .on_method = on_method,
+                .on_signal = on_signal,
+                .on_property = on_property,
+        };
+        /* Introspects the object argv[2] of service argv[1]. If argv[3] is set, only members of that
+         * interface are shown. With arg_xml_interface the raw introspection XML is printed;
+         * otherwise the XML is parsed into a set of Members, current property values are fetched
+         * with Properties.GetAll(), and everything is printed as a sorted table. Returns 0 on
+         * success, a negative error otherwise. */
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply_xml = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(member_set_freep) Set *members = NULL;
+        unsigned name_width, type_width, signature_width, result_width;
+        Member *m;
+        const char *xml;
+        int r;
+
+        r = acquire_bus(false, &bus);
+        if (r < 0)
+                return r;
+
+        members = set_new(&member_hash_ops);
+        if (!members)
+                return log_oom();
+
+        r = sd_bus_call_method(bus, argv[1], argv[2],
+                               "org.freedesktop.DBus.Introspectable", "Introspect",
+                               &error, &reply_xml, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to introspect object %s of service %s: %s",
+                                       argv[2], argv[1], bus_error_message(&error, r));
+
+        r = sd_bus_message_read(reply_xml, "s", &xml);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        if (arg_xml_interface) {
+                /* Just dump the received XML and finish */
+                pager_open(arg_pager_flags);
+                puts(xml);
+                return 0;
+        }
+
+        /* First, collect all interfaces, methods, signals and properties from the XML */
+        r = parse_xml_introspect(argv[2], xml, &ops, members);
+        if (r < 0)
+                return r;
+
+        /* Second, find the current values of the properties */
+        SET_FOREACH(m, members) {
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+
+                if (!streq(m->type, "property"))
+                        continue;
+                /* Skip properties whose value an earlier GetAll() on the same interface already filled in */
+                if (m->value)
+                        continue;
+
+                if (argv[3] && !streq(argv[3], m->interface))
+                        continue;
+
+                r = sd_bus_call_method(bus, argv[1], argv[2],
+                                       "org.freedesktop.DBus.Properties", "GetAll",
+                                       &error, &reply, "s", m->interface);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get all properties on interface %s: %s",
+                                               m->interface, bus_error_message(&error, r));
+
+                r = sd_bus_message_enter_container(reply, 'a', "{sv}");
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                /* One iteration per (name, variant) dictionary entry of the reply */
+                for (;;) {
+                        _cleanup_(memstream_done) MemStream ms = {};
+                        _cleanup_free_ char *buf = NULL;
+                        const char *name, *contents;
+                        Member *z;
+                        char type;
+                        FILE *mf;
+
+                        r = sd_bus_message_enter_container(reply, 'e', "sv");
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+                        if (r == 0)
+                                break;
+
+                        r = sd_bus_message_read(reply, "s", &name);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+
+                        r = sd_bus_message_peek_type(reply, &type, &contents);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+                        if (type != 'v')
+                                return bus_log_parse_error(EINVAL);
+
+                        r = sd_bus_message_enter_container(reply, 'v', contents);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+                        /* Render the variant's payload as text into an in-memory stream */
+                        mf = memstream_init(&ms);
+                        if (!mf)
+                                return log_oom();
+
+                        r = format_cmdline(reply, mf, false);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+
+                        r = memstream_finalize(&ms, &buf, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to flush and close memstream: %m");
+                        /* Find the property Member with this interface, name and signature; values of
+                         * properties that are not in the set are dropped */
+                        z = set_get(members, &((Member) {
+                                                .type = "property",
+                                                .interface = m->interface,
+                                                .signature = (char*) contents,
+                                                .name = (char*) name }));
+                        if (z)
+                                free_and_replace(z->value, buf);
+
+                        r = sd_bus_message_exit_container(reply);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+
+                        r = sd_bus_message_exit_container(reply);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+                }
+
+                r = sd_bus_message_exit_container(reply);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+        }
+        /* Third, compute the column widths, starting out with the widths of the column headers */
+        name_width = strlen("NAME");
+        type_width = strlen("TYPE");
+        signature_width = strlen("SIGNATURE");
+        result_width = strlen("RESULT/VALUE");
+        /* The members to show are gathered in a temporary array so that they can be sorted */
+        Member **sorted = newa(Member*, set_size(members));
+        size_t k = 0;
+
+        SET_FOREACH(m, members) {
+                if (argv[3] && !streq(argv[3], m->interface))
+                        continue;
+
+                if (m->interface)
+                        name_width = MAX(name_width, strlen(m->interface));
+                if (m->name)
+                        name_width = MAX(name_width, strlen(m->name) + 1); /* +1 for the "." printed in front */
+                if (m->type)
+                        type_width = MAX(type_width, strlen(m->type));
+                if (m->signature)
+                        signature_width = MAX(signature_width, strlen(m->signature));
+                if (m->result)
+                        result_width = MAX(result_width, strlen(m->result));
+                if (m->value)
+                        result_width = MAX(result_width, strlen(m->value));
+
+                sorted[k++] = m;
+        }
+        /* Unless arg_full is positive, cap the RESULT/VALUE column at 40 characters */
+        if (result_width > 40 && arg_full <= 0)
+                result_width = 40;
+
+        typesafe_qsort(sorted, k, member_compare_funcp);
+
+        pager_open(arg_pager_flags);
+
+        if (arg_legend)
+                printf("%-*s %-*s %-*s %-*s %s\n",
+                       (int) name_width, "NAME",
+                       (int) type_width, "TYPE",
+                       (int) signature_width, "SIGNATURE",
+                       (int) result_width, "RESULT/VALUE",
+                       "FLAGS");
+        /* Finally, print one line per member, in sorted order */
+        for (size_t j = 0; j < k; j++) {
+                _cleanup_free_ char *ellipsized = NULL;
+                const char *rv;
+                bool is_interface;
+
+                m = sorted[j];
+
+                if (argv[3] && !streq(argv[3], m->interface))
+                        continue;
+
+                is_interface = streq(m->type, "interface");
+                /* When filtering by interface, the line for the interface itself is left out */
+                if (argv[3] && is_interface)
+                        continue;
+
+                if (m->value) {
+                        ellipsized = ellipsize(m->value, result_width, 100);
+                        if (!ellipsized)
+                                return log_oom();
+
+                        rv = ellipsized;
+                } else
+                        rv = empty_to_dash(m->result);
+
+                printf("%s%s%-*s%s %-*s %-*s %-*s%s%s%s%s%s%s\n",
+                       is_interface ? ansi_highlight() : "",
+                       is_interface ? "" : ".",
+                       - !is_interface + (int) name_width, /* one narrower when the "." was printed */
+                       empty_to_dash(streq_ptr(m->type, "interface") ? m->interface : m->name),
+                       is_interface ? ansi_normal() : "",
+                       (int) type_width, empty_to_dash(m->type),
+                       (int) signature_width, empty_to_dash(m->signature),
+                       (int) result_width, rv,
+                       (m->flags & SD_BUS_VTABLE_DEPRECATED) ? " deprecated" : (m->flags || m->writable ? "" : " -"),
+                       (m->flags & SD_BUS_VTABLE_METHOD_NO_REPLY) ? " no-reply" : "",
+                       (m->flags & SD_BUS_VTABLE_PROPERTY_CONST) ? " const" : "",
+                       (m->flags & SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE) ? " emits-change" : "",
+                       (m->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION) ? " emits-invalidation" : "",
+                       m->writable ? " writable" : "");
+        }
+
+        return 0;
+}
+
+static int message_dump(sd_bus_message *m, FILE *f) { /* per-message callback handed to monitor() by verb_monitor() */
+        return sd_bus_message_dump(m, f, SD_BUS_MESSAGE_DUMP_WITH_HEADER); /* result is passed through unchanged */
+}
+
+static int message_pcap(sd_bus_message *m, FILE *f) { /* per-message callback handed to monitor() by verb_capture() */
+        return bus_message_pcap_frame(m, arg_snaplen, f); /* forwards m and f together with the global arg_snaplen */
+}
+
+static int message_json(sd_bus_message *m, FILE *f) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL;
+        char e[2];
+        int r;
+        usec_t ts;
+        /* Builds one JSON object from header fields of m plus its payload (v) and dumps it to f. Returns 0 on success, negative on failure. */
+        r = json_transform_message(m, &v);
+        if (r < 0)
+                return r;
+        /* e is a one-character, NUL-terminated string holding the header's endian byte */
+        e[0] = m->header->endian;
+        e[1] = 0;
+        /* Use the message's realtime timestamp; if it is zero, fall back to the current CLOCK_REALTIME time */
+        ts = m->realtime;
+        if (ts == 0)
+                ts = now(CLOCK_REALTIME);
+
+        r = json_build(&w, JSON_BUILD_OBJECT(
+                JSON_BUILD_PAIR("type", JSON_BUILD_STRING(bus_message_type_to_string(m->header->type))),
+                JSON_BUILD_PAIR("endian", JSON_BUILD_STRING(e)),
+                JSON_BUILD_PAIR("flags", JSON_BUILD_INTEGER(m->header->flags)),
+                JSON_BUILD_PAIR("version", JSON_BUILD_INTEGER(m->header->version)),
+                JSON_BUILD_PAIR("cookie", JSON_BUILD_INTEGER(BUS_MESSAGE_COOKIE(m))),
+                JSON_BUILD_PAIR_CONDITION(m->reply_cookie != 0, "reply_cookie", JSON_BUILD_INTEGER(m->reply_cookie)),
+                JSON_BUILD_PAIR("timestamp-realtime", JSON_BUILD_UNSIGNED(ts)), /* always present, unlike "realtime" below */
+                JSON_BUILD_PAIR_CONDITION(m->sender, "sender", JSON_BUILD_STRING(m->sender)),
+                JSON_BUILD_PAIR_CONDITION(m->destination, "destination", JSON_BUILD_STRING(m->destination)),
+                JSON_BUILD_PAIR_CONDITION(m->path, "path", JSON_BUILD_STRING(m->path)),
+                JSON_BUILD_PAIR_CONDITION(m->interface, "interface", JSON_BUILD_STRING(m->interface)),
+                JSON_BUILD_PAIR_CONDITION(m->member, "member", JSON_BUILD_STRING(m->member)),
+                JSON_BUILD_PAIR_CONDITION(m->monotonic != 0, "monotonic", JSON_BUILD_INTEGER(m->monotonic)),
+                JSON_BUILD_PAIR_CONDITION(m->realtime != 0, "realtime", JSON_BUILD_INTEGER(m->realtime)),
+                JSON_BUILD_PAIR_CONDITION(m->seqnum != 0, "seqnum", JSON_BUILD_INTEGER(m->seqnum)),
+                JSON_BUILD_PAIR_CONDITION(m->error.name, "error_name", JSON_BUILD_STRING(m->error.name)),
+                JSON_BUILD_PAIR("payload", JSON_BUILD_VARIANT(v))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to build JSON object: %m");
+
+        json_variant_dump(w, arg_json_format_flags, f, NULL);
+        return 0;
+}
+
+static int monitor(int argc, char **argv, int (*dump)(sd_bus_message *m, FILE *f)) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        uint32_t flags = 0;
+        const char *unique_name;
+        bool is_monitor = false;
+        int r;
+        /* Calls BecomeMonitor on the bus, then passes each received message to dump(m, stdout) until a Disconnected signal arrives or an error occurs. */
+        r = acquire_bus(true, &bus);
+        if (r < 0)
+                return r;
+
+        /* upgrade connection; it's not used for anything else after this call */
+        r = sd_bus_message_new_method_call(bus,
+                                           &message,
+                                           "org.freedesktop.DBus",
+                                           "/org/freedesktop/DBus",
+                                           "org.freedesktop.DBus.Monitoring",
+                                           "BecomeMonitor");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(message, 'a', "s");
+        if (r < 0)
+                return bus_log_create_error(r);
+        /* Every argument after argv[0] must be a valid service name; it yields two match strings: one on sender, one on destination */
+        STRV_FOREACH(i, argv+1) {
+                _cleanup_free_ char *m = NULL;
+
+                if (!sd_bus_service_name_is_valid(*i))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid service name '%s'", *i);
+
+                m = strjoin("sender='", *i, "'");
+                if (!m)
+                        return log_oom();
+
+                r = sd_bus_message_append_basic(message, 's', m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                free(m); /* release the sender match before m is reassigned on the next line */
+                m = strjoin("destination='", *i, "'");
+                if (!m)
+                        return log_oom();
+
+                r = sd_bus_message_append_basic(message, 's', m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+        /* The strings in arg_matches are appended to the same array unmodified */
+        STRV_FOREACH(i, arg_matches) {
+                r = sd_bus_message_append_basic(message, 's', *i);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        r = sd_bus_message_close_container(message);
+        if (r < 0)
+                return bus_log_create_error(r);
+        /* After the string array comes a uint32 argument; flags is never changed from 0 in this function */
+        r = sd_bus_message_append_basic(message, 'u', &flags);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, message, arg_timeout, &error, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Call to org.freedesktop.DBus.Monitoring.BecomeMonitor failed: %s",
+                                       bus_error_message(&error, r));
+
+        r = sd_bus_get_unique_name(bus, &unique_name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get unique name: %m");
+
+        if (!arg_quiet && arg_json_format_flags == JSON_FORMAT_OFF)
+                log_info("Monitoring bus message stream.");
+
+        (void) sd_notify(/* unset_environment=false */ false, "READY=1");
+        /* Main loop: handle one message per iteration; sd_bus_wait() is reached only when sd_bus_process() returned neither a message nor r > 0 */
+        for (;;) {
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = sd_bus_process(bus, &m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to process bus: %m");
+
+                if (m) {
+                        if (!is_monitor) { /* messages seen before the NameLost for our own name are not dumped */
+                                const char *name;
+
+                                /* wait until we lose our unique name */
+                                if (sd_bus_message_is_signal(m, "org.freedesktop.DBus", "NameLost") <= 0)
+                                        continue;
+
+                                r = sd_bus_message_read(m, "s", &name);
+                                if (r < 0)
+                                        return bus_log_parse_error(r);
+
+                                if (streq(name, unique_name))
+                                        is_monitor = true;
+
+                                continue;
+                        }
+                        /* The callback's return value is not checked here */
+                        dump(m, stdout);
+                        fflush(stdout);
+
+                        if (sd_bus_message_is_signal(m, "org.freedesktop.DBus.Local", "Disconnected") > 0) {
+                                if (!arg_quiet && arg_json_format_flags == JSON_FORMAT_OFF)
+                                        log_info("Connection terminated, exiting.");
+                                return 0; /* the only successful exit from the loop */
+                        }
+
+                        continue;
+                }
+                /* No message but a positive return: call sd_bus_process() again rather than waiting */
+                if (r > 0)
+                        continue;
+
+                r = sd_bus_wait(bus, UINT64_MAX);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to wait for bus: %m");
+        }
+}
+
+static int verb_monitor(int argc, char **argv, void *userdata) { /* userdata is unused */
+        return monitor(argc, argv, (arg_json_format_flags & JSON_FORMAT_OFF) ? message_dump : message_json); /* message_dump if the JSON_FORMAT_OFF bit is set, else message_json */
+}
+
+static int verb_capture(int argc, char **argv, void *userdata) {
+        _cleanup_free_ char *osname = NULL;
+        static const char info[] =
+                "busctl (systemd) " STRINGIFY(PROJECT_VERSION) " (Git " GIT_VERSION ")";
+        int r;
+        /* Writes a pcap header to stdout, then runs monitor() with message_pcap as the per-message callback. Refuses to run if stdout is a TTY. */
+        if (isatty(fileno(stdout)) > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Refusing to write message data to console, please redirect output to a file.");
+        /* Failing to read PRETTY_NAME is logged but not fatal; -ENOENT is logged at debug level only */
+        r = parse_os_release(NULL, "PRETTY_NAME", &osname);
+        if (r < 0)
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_INFO, r,
+                               "Failed to read os-release file, ignoring: %m");
+        bus_pcap_header(arg_snaplen, osname, info, stdout);
+
+        r = monitor(argc, argv, message_pcap);
+        if (r < 0)
+                return r;
+        /* Flush stdout explicitly so that a write error is reported to the caller */
+        r = fflush_and_check(stdout);
+        if (r < 0)
+                return log_error_errno(r, "Couldn't write capture file: %m");
+
+        return r;
+}
+
+static int status(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        pid_t pid;
+        int r;
+        /* Dumps credentials: for argv[1] (a PID or, failing that, a bus name), or for the bus owner when argv[1] is empty. Returns 0 or a negative value. */
+        r = acquire_bus(false, &bus);
+        if (r < 0)
+                return r;
+
+        pager_open(arg_pager_flags);
+
+        if (!isempty(argv[1])) {
+                r = parse_pid(argv[1], &pid);
+                if (r < 0) /* argv[1] did not parse as a PID: query it as a bus name instead */
+                        r = sd_bus_get_name_creds(
+                                        bus,
+                                        argv[1],
+                                        (arg_augment_creds ? SD_BUS_CREDS_AUGMENT : 0) | _SD_BUS_CREDS_ALL,
+                                        &creds);
+                else
+                        r = sd_bus_creds_new_from_pid(
+                                        &creds,
+                                        pid,
+                                        _SD_BUS_CREDS_ALL);
+        } else {
+                const char *scope, *address;
+                sd_id128_t bus_id;
+                /* Each of the three lines below is printed only if its query succeeds; failures are silently skipped */
+                r = sd_bus_get_address(bus, &address);
+                if (r >= 0)
+                        printf("BusAddress=%s%s%s\n", ansi_highlight(), address, ansi_normal());
+
+                r = sd_bus_get_scope(bus, &scope);
+                if (r >= 0)
+                        printf("BusScope=%s%s%s\n", ansi_highlight(), scope, ansi_normal());
+
+                r = sd_bus_get_bus_id(bus, &bus_id);
+                if (r >= 0)
+                        printf("BusID=%s" SD_ID128_FORMAT_STR "%s\n",
+                               ansi_highlight(), SD_ID128_FORMAT_VAL(bus_id), ansi_normal());
+
+                r = sd_bus_get_owner_creds(
+                                bus,
+                                (arg_augment_creds ? SD_BUS_CREDS_AUGMENT : 0) | _SD_BUS_CREDS_ALL,
+                                &creds);
+        }
+        /* r holds the result of whichever credential query ran last in the branches above */
+        if (r < 0)
+                return log_error_errno(r, "Failed to get credentials: %m");
+
+        bus_creds_dump(creds, NULL, false);
+        return 0;
+}
+
+static int message_append_cmdline(sd_bus_message *m, const char *signature, char ***x) {
+        char **p;
+        int r;
+        /* Appends values to m as described by signature, parsing them from the string vector *x. On success *x is advanced past the strings consumed and 0 is returned. */
+        assert(m);
+        assert(signature);
+        assert(x);
+
+        p = *x;
+        /* Each iteration handles one signature element, starting from type character t and parameter string v */
+        for (;;) {
+                const char *v;
+                char t;
+
+                t = *signature;
+                v = *p;
+
+                if (t == 0)
+                        break;
+                if (!v)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Too few parameters for signature.");
+                /* Consume both up front; the struct/dict entry case below steps back again */
+                signature++;
+                p++;
+
+                switch (t) {
+
+                case SD_BUS_TYPE_BOOLEAN:
+                        /* r holds the parse_boolean() result and is then passed on as the value to append */
+                        r = parse_boolean(v);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as boolean: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &r);
+                        break;
+
+                case SD_BUS_TYPE_BYTE: {
+                        uint8_t z;
+
+                        r = safe_atou8(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as byte (unsigned 8-bit integer): %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_INT16: {
+                        int16_t z;
+
+                        r = safe_atoi16(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as signed 16-bit integer: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_UINT16: {
+                        uint16_t z;
+
+                        r = safe_atou16(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as unsigned 16-bit integer: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_INT32: {
+                        int32_t z;
+
+                        r = safe_atoi32(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as signed 32-bit integer: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_UINT32: {
+                        uint32_t z;
+
+                        r = safe_atou32(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as unsigned 32-bit integer: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_INT64: {
+                        int64_t z;
+
+                        r = safe_atoi64(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as signed 64-bit integer: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_UINT64: {
+                        uint64_t z;
+
+                        r = safe_atou64(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as unsigned 64-bit integer: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_DOUBLE: {
+                        double z;
+
+                        r = safe_atod(v, &z);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' as double precision floating point: %m", v);
+
+                        r = sd_bus_message_append_basic(m, t, &z);
+                        break;
+                }
+
+                case SD_BUS_TYPE_STRING:
+                case SD_BUS_TYPE_OBJECT_PATH:
+                case SD_BUS_TYPE_SIGNATURE:
+                        /* These three types pass the parameter string itself, without any local parsing */
+                        r = sd_bus_message_append_basic(m, t, v);
+                        break;
+
+                case SD_BUS_TYPE_ARRAY: {
+                        uint32_t n;
+                        size_t k;
+                        /* The parameter is the number of array entries; the entries themselves follow as further parameters */
+                        r = safe_atou32(v, &n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse '%s' number of array entries: %m", v);
+                        /* k = length of the element signature that follows the array type character */
+                        r = signature_element_length(signature, &k);
+                        if (r < 0)
+                                return log_error_errno(r, "Invalid array signature: %m");
+
+                        {
+                                char s[k + 1];
+                                memcpy(s, signature, k);
+                                s[k] = 0;
+                                /* s is a NUL-terminated copy of the element signature, used as the container contents */
+                                r = sd_bus_message_open_container(m, SD_BUS_TYPE_ARRAY, s);
+                                if (r < 0)
+                                        return bus_log_create_error(r);
+                                /* Recurse once per entry; each call advances p past what it consumed */
+                                for (unsigned i = 0; i < n; i++) {
+                                        r = message_append_cmdline(m, s, &p);
+                                        if (r < 0)
+                                                return r;
+                                }
+                        }
+
+                        signature += k;
+
+                        r = sd_bus_message_close_container(m);
+                        break;
+                }
+
+                case SD_BUS_TYPE_VARIANT: /* the parameter string v is itself the signature of the variant's contents */
+                        r = sd_bus_message_open_container(m, SD_BUS_TYPE_VARIANT, v);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = message_append_cmdline(m, v, &p);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_close_container(m);
+                        break;
+
+                case SD_BUS_TYPE_STRUCT_BEGIN:
+                case SD_BUS_TYPE_DICT_ENTRY_BEGIN: {
+                        size_t k;
+                        /* Undo the advance above: signature points at the opening character again and no parameter is consumed here */
+                        signature--;
+                        p--;
+
+                        r = signature_element_length(signature, &k);
+                        if (r < 0 || k < 2) {
+                                if (r >= 0 && k < 2)
+                                        r = -ERANGE;
+                                return log_error_errno(r, "Invalid struct/dict entry signature: %m");
+                        }
+                        /* s receives the k - 2 characters between the opening and the closing character */
+                        {
+                                char s[k-1];
+                                memcpy(s, signature + 1, k - 2);
+                                s[k - 2] = 0;
+
+                                const char ctype = t == SD_BUS_TYPE_STRUCT_BEGIN ?
+                                        SD_BUS_TYPE_STRUCT : SD_BUS_TYPE_DICT_ENTRY;
+                                r = sd_bus_message_open_container(m, ctype, s);
+                                if (r < 0)
+                                        return bus_log_create_error(r);
+
+                                r = message_append_cmdline(m, s, &p);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        signature += k;
+
+                        r = sd_bus_message_close_container(m);
+                        break;
+                }
+
+                case SD_BUS_TYPE_UNIX_FD:
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "UNIX file descriptor not supported as type.");
+
+                default:
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Unknown signature type %c.", t);
+                }
+                /* Common check for the append/close call that ended the case above */
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+        /* Report back to the caller how far the parameter list was consumed */
+        *x = p;
+        return 0;
+}
+
+static int json_transform_one(sd_bus_message *m, JsonVariant **ret);
+
+static int json_transform_and_append(sd_bus_message *m, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *element = NULL;
+        int r;
+        /* Converts the next value of m with json_transform_one() and passes it to json_variant_append_array() for *ret. */
+        assert(m);
+        assert(ret);
+
+        r = json_transform_one(m, &element);
+        if (r < 0)
+                return r;
+
+        return json_variant_append_array(ret, element); /* result is returned unchanged */
+}
+
+static int json_transform_array_or_struct(sd_bus_message *m, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *array = NULL;
+        int r;
+        /* Converts values from m, one at a time, until sd_bus_message_at_end() reports the end, and stores the collected JSON array in *ret. */
+        assert(m);
+        assert(ret);
+
+        for (;;) {
+                r = sd_bus_message_at_end(m, false);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                if (r > 0)
+                        break;
+
+                r = json_transform_and_append(m, &array);
+                if (r < 0)
+                        return r;
+        }
+        /* array is still NULL if nothing was appended: create an explicit zero-element array instead */
+        if (!array)
+                return json_variant_new_array(ret, NULL, 0);
+
+        *ret = TAKE_PTR(array);
+        return 0;
+}
+
+static int json_transform_variant(sd_bus_message *m, const char *contents, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *value = NULL;
+        int r;
+        /* Converts the next value of m and wraps it as a JSON object with "type" (the contents string) and "data" (the value), stored in *ret. */
+        assert(m);
+        assert(contents);
+        assert(ret);
+
+        r = json_transform_one(m, &value);
+        if (r < 0)
+                return r;
+
+        r = json_build(ret, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("type", JSON_BUILD_STRING(contents)),
+                                              JSON_BUILD_PAIR("data", JSON_BUILD_VARIANT(value))));
+        if (r < 0) /* any json_build() failure is reported via log_oom(), whatever the actual error code */
+                return log_oom();
+
+        return r;
+}
+
+static int json_transform_dict_array(sd_bus_message *m, JsonVariant **ret) {
+        JsonVariant **elements = NULL;
+        size_t n_elements = 0;
+        int r;
+        /* Converts dict entries read from m into a JSON object stored in *ret. Each entry adds two consecutive slots to elements[]: its first value, then its second. Returns negative on failure. */
+        assert(m);
+        assert(ret);
+
+        CLEANUP_ARRAY(elements, n_elements, json_variant_unref_many);
+
+        for (;;) {
+                const char *contents;
+                char type;
+
+                r = sd_bus_message_at_end(m, false);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                if (r > 0)
+                        break;
+
+                r = sd_bus_message_peek_type(m, &type, &contents);
+                if (r < 0)
+                        return bus_log_parse_error(r); /* log it, like every other sd-bus read failure in this function */
+
+                assert(type == 'e');
+                /* Make room for both slots of this entry before converting anything */
+                if (!GREEDY_REALLOC(elements, n_elements + 2))
+                        return log_oom();
+
+                r = sd_bus_message_enter_container(m, type, contents);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                /* n_elements is bumped right after each successful conversion, so it always counts only filled slots */
+                r = json_transform_one(m, elements + n_elements);
+                if (r < 0)
+                        return r;
+
+                n_elements++;
+
+                r = json_transform_one(m, elements + n_elements);
+                if (r < 0)
+                        return r;
+
+                n_elements++;
+
+                r = sd_bus_message_exit_container(m);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+        }
+
+        return json_variant_new_object(ret, elements, n_elements);
+}
+
+static int json_transform_one(sd_bus_message *m, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        const char *contents;
+        char type;
+        int r;
+        /* Reads the next value from m and stores its JSON representation in *ret; containers are converted recursively. Returns 0 on success, negative on failure (already logged). */
+        assert(m);
+        assert(ret);
+
+        r = sd_bus_message_peek_type(m, &type, &contents);
+        if (r < 0)
+                return bus_log_parse_error(r);
+        /* type selects the conversion below; contents is only used by the container case */
+        switch (type) {
+
+        case SD_BUS_TYPE_BYTE: {
+                uint8_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_unsigned(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform byte: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_BOOLEAN: {
+                int b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_boolean(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform boolean: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_INT16: {
+                int16_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_integer(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform int16: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_UINT16: {
+                uint16_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_unsigned(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform uint16: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_INT32: {
+                int32_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_integer(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform int32: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_UINT32: {
+                uint32_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_unsigned(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform uint32: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_INT64: {
+                int64_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_integer(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform int64: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_UINT64: {
+                uint64_t b;
+
+                r = sd_bus_message_read_basic(m, type, &b);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_unsigned(&v, b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform uint64: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_DOUBLE: {
+                double d;
+
+                r = sd_bus_message_read_basic(m, type, &d);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_real(&v, d);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform double: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_STRING:
+        case SD_BUS_TYPE_OBJECT_PATH:
+        case SD_BUS_TYPE_SIGNATURE: { /* all three are read as a C string and become a JSON string */
+                const char *s;
+
+                r = sd_bus_message_read_basic(m, type, &s);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_string(&v, s);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform string: %m");
+
+                break;
+        }
+
+        case SD_BUS_TYPE_UNIX_FD: /* the fd is read with a NULL destination and represented as JSON null */
+                r = sd_bus_message_read_basic(m, type, NULL);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = json_variant_new_null(&v);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transform fd: %m");
+
+                break;
+
+        case SD_BUS_TYPE_ARRAY:
+        case SD_BUS_TYPE_VARIANT:
+        case SD_BUS_TYPE_STRUCT:
+                r = sd_bus_message_enter_container(m, type, contents);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                /* Pick the converter: variants, arrays whose contents start with '{', and all other arrays/structs */
+                if (type == SD_BUS_TYPE_VARIANT)
+                        r = json_transform_variant(m, contents, &v);
+                else if (type == SD_BUS_TYPE_ARRAY && contents[0] == '{')
+                        r = json_transform_dict_array(m, &v);
+                else
+                        r = json_transform_array_or_struct(m, &v);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(m);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+        /* Hand ownership of the result to the caller; v is NULL afterwards, so the cleanup handler has nothing to release */
+        *ret = TAKE_PTR(v);
+        return 0;
+}
+
+static int json_transform_message(sd_bus_message *m, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        const char *type;
+        int r;
+        /* Converts the values of m into a JSON object with "type" (the signature string of m) and "data" (the values as a JSON array), stored in *ret. */
+        assert(m);
+        assert(ret);
+
+        assert_se(type = sd_bus_message_get_signature(m, false));
+
+        r = json_transform_array_or_struct(m, &v);
+        if (r < 0)
+                return r;
+
+        r = json_build(ret, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("type",  JSON_BUILD_STRING(type)),
+                                              JSON_BUILD_PAIR("data", JSON_BUILD_VARIANT(v))));
+        if (r < 0) /* any json_build() failure is reported via log_oom(), whatever the actual error code */
+                return log_oom();
+
+        return 0;
+}
+
+static int call(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
+        int r;
+        /* Builds a method call from argv[1..4], appends parameters from argv[6..] per the signature in argv[5], sends it and prints a non-empty reply. */
+        r = acquire_bus(false, &bus);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_call(bus, &m, argv[1], argv[2], argv[3], argv[4]);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_set_expect_reply(m, arg_expect_reply);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_set_auto_start(m, arg_auto_start);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_set_allow_interactive_authorization(m, arg_allow_interactive_authorization);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        if (!isempty(argv[5])) {
+                char **p;
+
+                p = argv+6;
+
+                r = message_append_cmdline(m, argv[5], &p);
+                if (r < 0)
+                        return r;
+                /* p now points past the consumed parameters; anything left over does not fit the signature */
+                if (*p)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Too many parameters for signature.");
+        }
+        /* No reply expected: just send the message and return without waiting */
+        if (!arg_expect_reply) {
+                r = sd_bus_send(bus, m, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to send message: %m");
+
+                return 0;
+        }
+
+        r = sd_bus_call(bus, m, arg_timeout, &error, &reply);
+        if (r < 0)
+                return log_error_errno(r, "Call failed: %s", bus_error_message(&error, r));
+
+        r = sd_bus_message_is_empty(reply);
+        if (r < 0)
+                return bus_log_parse_error(r);
+        /* Output happens only for a non-empty reply (r == 0) and when arg_quiet is off */
+        if (r == 0 && !arg_quiet) {
+
+                if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+                        if (arg_json_format_flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO))
+                                pager_open(arg_pager_flags);
+
+                        r = json_transform_message(reply, &v);
+                        if (r < 0)
+                                return r;
+
+                        json_variant_dump(v, arg_json_format_flags, NULL, NULL);
+
+                } else if (arg_verbose) {
+                        pager_open(arg_pager_flags);
+
+                        r = sd_bus_message_dump(reply, stdout, 0);
+                        if (r < 0)
+                                return r;
+                } else {
+                        /* Default output: the reply signature, a space, the format_cmdline() output, then a newline */
+                        fputs(sd_bus_message_get_signature(reply, true), stdout);
+                        fputc(' ', stdout);
+
+                        r = format_cmdline(reply, stdout, false);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+
+                        fputc('\n', stdout);
+                }
+        }
+
+        return 0;
+}
+
+static int emit_signal(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        const char *signature = argv[4];
+        int ret;
+
+        /* "emit" verb: argv[1..3] are OBJECT, INTERFACE and SIGNAL; argv[4], if given, is the signature. */
+        ret = acquire_bus(false, &bus);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_new_signal(bus, &msg, argv[1], argv[2], argv[3]);
+        if (ret < 0)
+                return bus_log_create_error(ret);
+
+        if (arg_destination) {
+                ret = sd_bus_message_set_destination(msg, arg_destination);
+                if (ret < 0)
+                        return bus_log_create_error(ret);
+        }
+
+        ret = sd_bus_message_set_auto_start(msg, arg_auto_start);
+        if (ret < 0)
+                return bus_log_create_error(ret);
+
+        if (!isempty(signature)) {
+                char **rest = argv + 5;
+
+                ret = message_append_cmdline(msg, signature, &rest);
+                if (ret < 0)
+                        return ret;
+
+                /* Whatever rest still points at was not covered by the signature. */
+                if (*rest)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many parameters for signature.");
+        }
+
+        ret = sd_bus_send(bus, msg, NULL);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to send signal: %m");
+
+        return 0;
+}
+
+static int get_property(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+        /* "get-property" verb: argv[1..3] are SERVICE, OBJECT and INTERFACE; argv[4..] are the property names. */
+        r = acquire_bus(false, &bus);
+        if (r < 0)
+                return r;
+        /* One org.freedesktop.DBus.Properties.Get() call is made per requested property; the first failure aborts. */
+        STRV_FOREACH(i, argv + 4) {
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+                const char *contents = NULL;
+                char type;
+
+                r = sd_bus_call_method(bus, argv[1], argv[2],
+                                       "org.freedesktop.DBus.Properties", "Get",
+                                       &error, &reply, "ss", argv[3], *i);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get property %s on interface %s: %s",
+                                               *i, argv[3],
+                                               bus_error_message(&error, r));
+                /* Peek at the signature contained in the reply, then enter it as a variant ('v') with that signature. */
+                r = sd_bus_message_peek_type(reply, &type, &contents);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = sd_bus_message_enter_container(reply, 'v', contents);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+                        if (arg_json_format_flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO))
+                                pager_open(arg_pager_flags);
+
+                        r = json_transform_variant(reply, contents, &v);
+                        if (r < 0)
+                                return r;
+
+                        json_variant_dump(v, arg_json_format_flags, NULL, NULL);
+
+                } else if (arg_verbose) {
+                        pager_open(arg_pager_flags);
+
+                        r = sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY);
+                        if (r < 0)
+                                return r;
+                } else {
+                        fputs(contents, stdout);
+                        fputc(' ', stdout);
+
+                        r = format_cmdline(reply, stdout, false);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+
+                        fputc('\n', stdout);
+                }
+                /* Leave the variant entered above before moving on to the next property. */
+                r = sd_bus_message_exit_container(reply);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+        }
+
+        return 0;
+}
+
+static int set_property(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        const char *interface = argv[3], *property = argv[4], *signature = argv[5];
+        char **rest = argv + 6;
+        int ret;
+
+        ret = acquire_bus(false, &bus);
+        if (ret < 0)
+                return ret;
+
+        /* Build an org.freedesktop.DBus.Properties.Set() call on SERVICE (argv[1]) and OBJECT (argv[2]). */
+        ret = sd_bus_message_new_method_call(bus, &msg, argv[1], argv[2], "org.freedesktop.DBus.Properties", "Set");
+        if (ret < 0)
+                return bus_log_create_error(ret);
+
+        ret = sd_bus_message_append(msg, "ss", interface, property);
+        if (ret < 0)
+                return bus_log_create_error(ret);
+
+        /* The new value is appended inside a variant container opened with the given signature. */
+        ret = sd_bus_message_open_container(msg, 'v', signature);
+        if (ret < 0)
+                return bus_log_create_error(ret);
+
+        ret = message_append_cmdline(msg, signature, &rest);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_close_container(msg);
+        if (ret < 0)
+                return bus_log_create_error(ret);
+
+        if (*rest)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many parameters for signature.");
+
+        ret = sd_bus_call(bus, msg, arg_timeout, &error, NULL);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to set property %s on interface %s: %s",
+                                       property, interface, bus_error_message(&error, ret));
+
+        return 0;
+}
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the busctl usage text to stdout; "link" is filled in by terminal_urlify_man() for the closing line. */
+        r = terminal_urlify_man("busctl", "1", &link);
+        if (r < 0)
+                return log_oom();
+        /* Page the (long) usage text; arg_pager_flags carries PAGER_DISABLE when --no-pager was given. */
+        pager_open(arg_pager_flags);
+
+        printf("%s [OPTIONS...] COMMAND ...\n\n"
+               "%sIntrospect the D-Bus IPC bus.%s\n"
+               "\nCommands:\n"
+               "  list                     List bus names\n"
+               "  status [SERVICE]         Show bus service, process or bus owner credentials\n"
+               "  monitor [SERVICE...]     Show bus traffic\n"
+               "  capture [SERVICE...]     Capture bus traffic as pcap\n"
+               "  tree [SERVICE...]        Show object tree of service\n"
+               "  introspect SERVICE OBJECT [INTERFACE]\n"
+               "  call SERVICE OBJECT INTERFACE METHOD [SIGNATURE [ARGUMENT...]]\n"
+               "                           Call a method\n"
+               "  emit OBJECT INTERFACE SIGNAL [SIGNATURE [ARGUMENT...]]\n"
+               "                           Emit a signal\n"
+               "  get-property SERVICE OBJECT INTERFACE PROPERTY...\n"
+               "                           Get property value\n"
+               "  set-property SERVICE OBJECT INTERFACE PROPERTY SIGNATURE ARGUMENT...\n"
+               "                           Set property value\n"
+               "  help                     Show this help\n"
+               "\nOptions:\n"
+               "  -h --help                Show this help\n"
+               "     --version             Show package version\n"
+               "     --no-pager            Do not pipe output into a pager\n"
+               "     --no-legend           Do not show the headers and footers\n"
+               "  -l --full                Do not ellipsize output\n"
+               "     --system              Connect to system bus\n"
+               "     --user                Connect to user bus\n"
+               "  -H --host=[USER@]HOST    Operate on remote host\n"
+               "  -M --machine=CONTAINER   Operate on local container\n"
+               "     --address=ADDRESS     Connect to bus specified by address\n"
+               "     --show-machine        Show machine ID column in list\n"
+               "     --unique              Only show unique names\n"
+               "     --acquired            Only show acquired names\n"
+               "     --activatable         Only show activatable names\n"
+               "     --match=MATCH         Only show matching messages\n"
+               "     --size=SIZE           Maximum length of captured packet\n"
+               "     --list                Don't show tree, but simple object path list\n"
+               "  -q --quiet               Don't show method call reply\n"
+               "     --verbose             Show result values in long format\n"
+               "     --json=MODE           Output as JSON\n"
+               "  -j                       Same as --json=pretty on tty, --json=short otherwise\n"
+               "     --xml-interface       Dump the XML description in introspect command\n"
+               "     --expect-reply=BOOL   Expect a method call reply\n"
+               "     --auto-start=BOOL     Auto-start destination service\n"
+               "     --allow-interactive-authorization=BOOL\n"
+               "                           Allow interactive authorization for operation\n"
+               "     --timeout=SECS        Maximum time to wait for method call completion\n"
+               "     --augment-creds=BOOL  Extend credential data with data read from /proc/$PID\n"
+               "     --watch-bind=BOOL     Wait for bus AF_UNIX socket to be bound in the file\n"
+               "                           system\n"
+               "     --destination=SERVICE Destination service of a signal\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int verb_help(int argc, char **argv, void *userdata) {
+        return help(); /* adapter for the verb table: all three arguments are ignored */
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses options into the arg_* variables. Returns 1 on success; help() and version() results and negative errors are returned as-is. */
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_NO_LEGEND,
+                ARG_SYSTEM,
+                ARG_USER,
+                ARG_ADDRESS,
+                ARG_MATCH,
+                ARG_SHOW_MACHINE,
+                ARG_UNIQUE,
+                ARG_ACQUIRED,
+                ARG_ACTIVATABLE,
+                ARG_SIZE,
+                ARG_LIST,
+                ARG_VERBOSE,
+                ARG_XML_INTERFACE,
+                ARG_EXPECT_REPLY,
+                ARG_AUTO_START,
+                ARG_ALLOW_INTERACTIVE_AUTHORIZATION,
+                ARG_TIMEOUT,
+                ARG_AUGMENT_CREDS,
+                ARG_WATCH_BIND,
+                ARG_JSON,
+                ARG_DESTINATION,
+        };
+        /* Long-only options use the ARG_* values above, which start at 0x100 and so never equal a short option character. */
+        static const struct option options[] = {
+                { "help",                            no_argument,       NULL, 'h'                                 },
+                { "version",                         no_argument,       NULL, ARG_VERSION                         },
+                { "no-pager",                        no_argument,       NULL, ARG_NO_PAGER                        },
+                { "no-legend",                       no_argument,       NULL, ARG_NO_LEGEND                       },
+                { "full",                            no_argument,       NULL, 'l'                                 },
+                { "system",                          no_argument,       NULL, ARG_SYSTEM                          },
+                { "user",                            no_argument,       NULL, ARG_USER                            },
+                { "address",                         required_argument, NULL, ARG_ADDRESS                         },
+                { "show-machine",                    no_argument,       NULL, ARG_SHOW_MACHINE                    },
+                { "unique",                          no_argument,       NULL, ARG_UNIQUE                          },
+                { "acquired",                        no_argument,       NULL, ARG_ACQUIRED                        },
+                { "activatable",                     no_argument,       NULL, ARG_ACTIVATABLE                     },
+                { "match",                           required_argument, NULL, ARG_MATCH                           },
+                { "host",                            required_argument, NULL, 'H'                                 },
+                { "machine",                         required_argument, NULL, 'M'                                 },
+                { "size",                            required_argument, NULL, ARG_SIZE                            },
+                { "list",                            no_argument,       NULL, ARG_LIST                            },
+                { "quiet",                           no_argument,       NULL, 'q'                                 },
+                { "verbose",                         no_argument,       NULL, ARG_VERBOSE                         },
+                { "xml-interface",                   no_argument,       NULL, ARG_XML_INTERFACE                   },
+                { "expect-reply",                    required_argument, NULL, ARG_EXPECT_REPLY                    },
+                { "auto-start",                      required_argument, NULL, ARG_AUTO_START                      },
+                { "allow-interactive-authorization", required_argument, NULL, ARG_ALLOW_INTERACTIVE_AUTHORIZATION },
+                { "timeout",                         required_argument, NULL, ARG_TIMEOUT                         },
+                { "augment-creds",                   required_argument, NULL, ARG_AUGMENT_CREDS                   },
+                { "watch-bind",                      required_argument, NULL, ARG_WATCH_BIND                      },
+                { "json",                            required_argument, NULL, ARG_JSON                            },
+                { "destination",                     required_argument, NULL, ARG_DESTINATION                     },
+                {},
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hH:M:qjl", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_NO_LEGEND:
+                        arg_legend = false;
+                        break;
+
+                case 'l':
+                        arg_full = true;
+                        break;
+
+                case ARG_USER:
+                        arg_runtime_scope = RUNTIME_SCOPE_USER;
+                        break;
+
+                case ARG_SYSTEM:
+                        arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+                        break;
+
+                case ARG_ADDRESS:
+                        arg_address = optarg;
+                        break;
+
+                case ARG_SHOW_MACHINE:
+                        arg_show_machine = true;
+                        break;
+
+                case ARG_UNIQUE:
+                        arg_unique = true;
+                        break;
+
+                case ARG_ACQUIRED:
+                        arg_acquired = true;
+                        break;
+
+                case ARG_ACTIVATABLE:
+                        arg_activatable = true;
+                        break;
+
+                case ARG_MATCH:
+                        if (strv_extend(&arg_matches, optarg) < 0)
+                                return log_oom();
+                        break;
+
+                case ARG_SIZE: {
+                        uint64_t sz;
+
+                        r = parse_size(optarg, 1024, &sz);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse size '%s': %m", optarg);
+                        /* Reject values that change when cast to size_t and back, i.e. that do not fit into size_t. */
+                        if ((uint64_t) (size_t) sz !=  sz)
+                                return log_error_errno(SYNTHETIC_ERRNO(E2BIG),
+                                                       "Size out of range.");
+
+                        arg_snaplen = (size_t) sz;
+                        break;
+                }
+
+                case ARG_LIST:
+                        arg_list = true;
+                        break;
+
+                case 'H':
+                        arg_transport = BUS_TRANSPORT_REMOTE;
+                        arg_host = optarg;
+                        break;
+
+                case 'M':
+                        arg_transport = BUS_TRANSPORT_MACHINE;
+                        arg_host = optarg;
+                        break;
+
+                case 'q':
+                        arg_quiet = true;
+                        break;
+
+                case ARG_VERBOSE:
+                        arg_verbose = true;
+                        break;
+
+                case ARG_XML_INTERFACE:
+                        arg_xml_interface = true;
+                        break;
+
+                case ARG_EXPECT_REPLY:
+                        r = parse_boolean_argument("--expect-reply=", optarg, &arg_expect_reply);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_AUTO_START:
+                        r = parse_boolean_argument("--auto-start=", optarg, &arg_auto_start);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_ALLOW_INTERACTIVE_AUTHORIZATION:
+                        r = parse_boolean_argument("--allow-interactive-authorization=", optarg,
+                                                   &arg_allow_interactive_authorization);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_TIMEOUT:
+                        r = parse_sec(optarg, &arg_timeout);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --timeout= parameter '%s': %m", optarg);
+
+                        break;
+
+                case ARG_AUGMENT_CREDS:
+                        r = parse_boolean_argument("--augment-creds=", optarg, &arg_augment_creds);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_WATCH_BIND:
+                        r = parse_boolean_argument("--watch-bind=", optarg, &arg_watch_bind);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case 'j':
+                        arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+                        /* NOTE(review): a 0 result also ends parsing here; presumably the helper already handled the request — confirm. */
+                        break;
+
+                case ARG_DESTINATION:
+                        arg_destination = optarg;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        /* arg_full still negative (presumably "not set on the command line"): take the value from terminal_is_dumb(). */
+        if (arg_full < 0)
+                arg_full = terminal_is_dumb();
+
+        return 1;
+}
+
+static int busctl_main(int argc, char *argv[]) {
+        static const Verb verbs[] = { /* NOTE(review): columns presumably verb, min args, max args, flags, handler — confirm against the Verb definition */
+                { "list",         VERB_ANY, 1,        VERB_DEFAULT, list_bus_names },
+                { "status",       VERB_ANY, 2,        0,            status         },
+                { "monitor",      VERB_ANY, VERB_ANY, 0,            verb_monitor   },
+                { "capture",      VERB_ANY, VERB_ANY, 0,            verb_capture   },
+                { "tree",         VERB_ANY, VERB_ANY, 0,            tree           },
+                { "introspect",   3,        4,        0,            introspect     },
+                { "call",         5,        VERB_ANY, 0,            call           },
+                { "emit",         4,        VERB_ANY, 0,            emit_signal    },
+                { "get-property", 5,        VERB_ANY, 0,            get_property   },
+                { "set-property", 6,        VERB_ANY, 0,            set_property   },
+                { "help",         VERB_ANY, VERB_ANY, 0,            verb_help      },
+                {}
+        };
+        /* Hand the command line to dispatch_verb() with the table above; NULL is passed as userdata. */
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+static int run(int argc, char *argv[]) {
+        int parsed;
+
+        log_setup();
+        /* Only a positive parse_argv() result leads to a verb; zero and negative results are returned unchanged. */
+        parsed = parse_argv(argc, argv);
+        if (parsed > 0)
+                return busctl_main(argc, argv);
+
+        return parsed;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/busctl/meson.build b/src/busctl/meson.build
new file mode 100644
index 0000000..6b3a479
--- /dev/null
+++ b/src/busctl/meson.build
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+busctl_sources = files(
+        'busctl-introspect.c',
+        'busctl.c',
+)
+# busctl itself, plus a test whose sources list busctl-introspect.c directly next to test-busctl-introspect.c.
+executables += [
+        executable_template + {
+                'name' : 'busctl',
+                'public' : true,
+                'sources' : busctl_sources,
+        },
+        test_template + {
+                'sources' : files(
+                        'test-busctl-introspect.c',
+                        'busctl-introspect.c',
+                )
+        },
+]
diff --git a/src/busctl/test-busctl-introspect.c b/src/busctl/test-busctl-introspect.c
new file mode 100644
index 0000000..859ca71
--- /dev/null
+++ b/src/busctl/test-busctl-introspect.c
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "busctl-introspect.h"
+#include "set.h"
+#include "strv.h"
+#include "tests.h"
+
+static const char *xml_root = /* Parsed for "/" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "\n";
+
+static const char *xml_org = /* Parsed for "/org" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "\n";
+
+static const char *xml_org_freedesktop = /* Parsed for "/org/freedesktop" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  "\n";
+
+static const char *xml_org_freedesktop_LogControl1 = /* Parsed for "/org/freedesktop/LogControl1" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  "\n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  "\n";
+
+static const char *xml_org_freedesktop_network1 = /* Parsed for "/org/freedesktop/network1" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "\n";
+
+static const char *xml_org_freedesktop_network1_network = /* Parsed for "/org/freedesktop/network1/network" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  " \n"
+                  "\n";
+
+static const char *xml_org_freedesktop_network1_network_unsigned = /* Parsed for each "/org/freedesktop/network1/network/<n>" in TEST(introspect_on_path). NOTE(review): this literal holds only whitespace and newlines here; the XML markup looks stripped — verify against upstream. */
+        "\n"
+                  "\n"
+                  " \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  "  \n"
+                  "   \n"
+                  "   \n"
+                  "   \n"
+                  "  \n"
+                  " \n"
+                  " \n"
+                  "\n";
+
+static int on_path(const char *path, void *userdata) {
+        Set *seen = userdata;
+
+        /* Parser callback: store a copy of each reported path in the set passed as userdata. */
+        assert_se(seen);
+        assert_se(set_put_strdup(&seen, path) >= 0);
+        return 0;
+}
+
+TEST(introspect_on_path) {
+        static const XMLIntrospectOps ops = {
+                .on_path = on_path,
+        };
+        _cleanup_strv_free_ char **expected = NULL;
+        _cleanup_set_free_ Set *paths = NULL;
+        _cleanup_free_ char **l = NULL;
+        /* Runs parse_xml_introspect() over a chain of documents and checks the paths on_path() collected in "paths". */
+        assert_se(set_put_strdup(&paths, "/") > 0);
+
+        log_debug("/* parse_xml_introspect(\"/\") */");
+        assert_se(parse_xml_introspect("/", xml_root, &ops, paths) >= 0);
+        log_debug("/* parse_xml_introspect(\"/org\") */");
+        assert_se(parse_xml_introspect("/org", xml_org, &ops, paths) >= 0);
+        log_debug("/* parse_xml_introspect(\"/org/freedesktop\") */");
+        assert_se(parse_xml_introspect("/org/freedesktop", xml_org_freedesktop, &ops, paths) >= 0);
+        log_debug("/* parse_xml_introspect(\"/org/freedesktop/LogControl1\") */");
+        assert_se(parse_xml_introspect("/org/freedesktop/LogControl1", xml_org_freedesktop_LogControl1, &ops, paths) >= 0);
+        log_debug("/* parse_xml_introspect(\"/org/freedesktop/network1\") */");
+        assert_se(parse_xml_introspect("/org/freedesktop/network1", xml_org_freedesktop_network1, &ops, paths) >= 0);
+        log_debug("/* parse_xml_introspect(\"/org/freedesktop/network1/network\") */");
+        assert_se(parse_xml_introspect("/org/freedesktop/network1/network", xml_org_freedesktop_network1_network, &ops, paths) >= 0);
+        for (unsigned i = 0; i <= 20; i++) {
+                _cleanup_free_ char *path = NULL;
+
+                assert_se(asprintf(&path, "/org/freedesktop/network1/network/%u", i) >= 0);
+                log_debug("/* parse_xml_introspect(\"%s\") */", path);
+                assert_se(parse_xml_introspect(path, xml_org_freedesktop_network1_network_unsigned, &ops, paths) >= 0);
+        }
+        /* "l" is declared _cleanup_free_ rather than _cleanup_strv_free_; presumably its strings stay owned by the set. */
+        assert_se(l = set_get_strv(paths));
+        strv_sort(l);
+        /* Expected result: every path on the chain, plus <n> and <n>/hoge below .../network for n = 0..20. */
+        assert_se(strv_extend(&expected, "/") >= 0);
+        assert_se(strv_extend(&expected, "/org") >= 0);
+        assert_se(strv_extend(&expected, "/org/freedesktop") >= 0);
+        assert_se(strv_extend(&expected, "/org/freedesktop/LogControl1") >= 0);
+        assert_se(strv_extend(&expected, "/org/freedesktop/network1") >= 0);
+        assert_se(strv_extend(&expected, "/org/freedesktop/network1/network") >= 0);
+        for (unsigned i = 0; i <= 20; i++) {
+                assert_se(strv_extendf(&expected, "/org/freedesktop/network1/network/%u", i) >= 0);
+                assert_se(strv_extendf(&expected, "/org/freedesktop/network1/network/%u/hoge", i) >= 0);
+        }
+
+        strv_sort(expected);
+        assert_se(strv_equal(l, expected));
+}
+
+DEFINE_TEST_MAIN(LOG_DEBUG);
diff --git a/src/cgls/cgls.c b/src/cgls/cgls.c
new file mode 100644
index 0000000..70fa260
--- /dev/null
+++ b/src/cgls/cgls.c
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "bus-util.h"
+#include "cgroup-show.h"
+#include "cgroup-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "main-func.h"
+#include "output-mode.h"
+#include "pager.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "strv.h"
+#include "unit-name.h"
+
+/* Flags handed to pager_open(); --no-pager adds PAGER_DISABLE. */
+static PagerFlags arg_pager_flags = 0;
+/* Output flags collected from the command line and passed on to the show_cgroup*() calls. */
+static OutputFlags arg_output_flags = 0;
+
+/* Whether the entries of arg_names are unit names and, if so, whether they belong to the system
+ * manager (-u/--unit) or the user manager (--user-unit). */
+static enum {
+        SHOW_UNIT_NONE,
+        SHOW_UNIT_SYSTEM,
+        SHOW_UNIT_USER,
+} arg_show_unit = SHOW_UNIT_NONE;
+/* Names given on the command line. The array is allocated, but its entries are the optarg pointers
+ * pushed by parse_argv(), not copies. */
+static char **arg_names = NULL;
+
+/* Tri-state: stays negative unless -l/--full was given, in which case run() enables full width only
+ * if a pager is in use. */
+static int arg_full = -1;
+/* Argument of -M/--machine=, or NULL if the option was not given. */
+static const char* arg_machine = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_names, freep); /* don't free the strings */
+
+/* Prints the usage summary to stdout. Returns 0, or the result of log_oom() if the man page link
+ * could not be produced. */
+static int help(void) {
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-cgls", "1", &man_link) < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] [CGROUP...]\n\n"
+               "Recursively show control group contents.\n\n"
+               "  -h --help           Show this help\n"
+               "     --version        Show package version\n"
+               "     --no-pager       Do not pipe output into a pager\n"
+               "  -a --all            Show all groups, including empty\n"
+               "  -u --unit           Show the subtrees of specified system units\n"
+               "     --user-unit      Show the subtrees of specified user units\n"
+               "  -x --xattr=BOOL     Show cgroup extended attributes\n"
+               "  -c --cgroup-id=BOOL Show cgroup ID\n"
+               "  -l --full           Do not ellipsize output\n"
+               "  -k                  Include kernel threads in output\n"
+               "  -M --machine=NAME   Show container NAME\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               man_link);
+
+        return 0;
+}
+
+/* Parses the command line into the arg_* globals.
+ *
+ * Returns 1 when parsing succeeded and the caller should carry on. For --help and --version the
+ * return value of help() resp. version() is passed through. Returns a negative errno-style value
+ * on invalid input or allocation failure. */
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_NO_PAGER = 0x100,
+                ARG_VERSION,
+                ARG_USER_UNIT,
+        };
+
+        static const struct option options[] = {
+                { "help",      no_argument,       NULL, 'h'           },
+                { "version",   no_argument,       NULL, ARG_VERSION   },
+                { "no-pager",  no_argument,       NULL, ARG_NO_PAGER  },
+                { "all",       no_argument,       NULL, 'a'           },
+                { "full",      no_argument,       NULL, 'l'           },
+                { "machine",   required_argument, NULL, 'M'           },
+                { "unit",      optional_argument, NULL, 'u'           },
+                { "user-unit", optional_argument, NULL, ARG_USER_UNIT },
+                { "xattr",     required_argument, NULL, 'x'           },
+                { "cgroup-id", required_argument, NULL, 'c'           },
+                {}
+        };
+
+        int r;
+
+        assert(argc >= 1);
+        assert(argv);
+
+        /* Setting optind to 0 (instead of 1) makes getopt_long() run its internal initialization
+         * again, which is where the GNU '-'/'+' prefix of the option string is evaluated. */
+        optind = 0;
+
+        for (;;) {
+                int opt;
+
+                opt = getopt_long(argc, argv, "-hkalM:u::xc", options, NULL);
+                if (opt < 0)
+                        break;
+
+                switch (opt) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case 'a':
+                        arg_output_flags |= OUTPUT_SHOW_ALL;
+                        break;
+
+                case 'l':
+                        arg_full = true;
+                        break;
+
+                case 'k':
+                        arg_output_flags |= OUTPUT_KERNEL_THREADS;
+                        break;
+
+                case 'M':
+                        arg_machine = optarg;
+                        break;
+
+                case 'u':
+                        arg_show_unit = SHOW_UNIT_SYSTEM;
+                        if (strv_push(&arg_names, optarg) < 0) /* push optarg if not empty */
+                                return log_oom();
+                        break;
+
+                case ARG_USER_UNIT:
+                        arg_show_unit = SHOW_UNIT_USER;
+                        if (strv_push(&arg_names, optarg) < 0) /* push optarg if not empty */
+                                return log_oom();
+                        break;
+
+                case 1:
+                        /* The leading '-' in the option string makes getopt_long() report each
+                         * positional argument as option 1. */
+                        if (strv_push(&arg_names, optarg) < 0)
+                                return log_oom();
+                        break;
+
+                case 'x':
+                        /* No argument means "on". */
+                        r = true;
+                        if (optarg) {
+                                r = parse_boolean(optarg);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse --xattr= value: %s", optarg);
+                        }
+
+                        SET_FLAG(arg_output_flags, OUTPUT_CGROUP_XATTRS, r);
+                        break;
+
+                case 'c':
+                        /* No argument means "on". */
+                        r = true;
+                        if (optarg) {
+                                r = parse_boolean(optarg);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse --cgroup-id= value: %s", optarg);
+                        }
+
+                        SET_FLAG(arg_output_flags, OUTPUT_CGROUP_ID, r);
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        if (arg_machine && arg_show_unit != SHOW_UNIT_NONE)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot combine --unit or --user-unit with --machine=.");
+
+        return 1;
+}
+
+/* Prints the "CGroup <path>:" header and flushes stdout. The header is prefixed with
+ * "Controller <name>; " if cg_all_unified() returns 0 and a controller other than
+ * SYSTEMD_CGROUP_CONTROLLER was passed. */
+static void show_cg_info(const char *controller, const char *path) {
+        bool mention_controller =
+                cg_all_unified() == 0 &&
+                controller &&
+                !streq(controller, SYSTEMD_CGROUP_CONTROLLER);
+
+        if (mention_controller)
+                printf("Controller %s; ", controller);
+
+        printf("CGroup %s:\n", empty_to_root(path));
+        fflush(stdout);
+}
+
+/* Main routine of systemd-cgls.
+ *
+ * With names on the command line each one is shown in turn: as a unit (looked up over the bus) when
+ * -u/--user-unit was given, as a directory when it lies below /sys/fs/cgroup, and otherwise as a
+ * "controller:path" spec relative to the root cgroup of the host or of the --machine= container.
+ * Without names, the current directory is shown if it lies below /sys/fs/cgroup (and no machine was
+ * selected), otherwise the whole tree.
+ *
+ * Failures affecting a single name do not stop the loop; the first such error is remembered and
+ * reported at the end. Returns 0 on success and a negative errno-style value on failure. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        pager_open(arg_pager_flags);
+        if (arg_full < 0 && pager_have())
+                arg_full = true;
+
+        if (arg_full > 0)
+                arg_output_flags |= OUTPUT_FULL_WIDTH;
+
+        if (arg_names) {
+                _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+                _cleanup_free_ char *root = NULL;
+                /* First per-name failure. Kept apart from 'r', which the fatal-error checks below
+                 * overwrite on every iteration and which would otherwise make us forget an earlier
+                 * failure as soon as a later name is processed successfully. */
+                int ret = 0;
+
+                STRV_FOREACH(name, arg_names) {
+                        int q;
+
+                        if (arg_show_unit != SHOW_UNIT_NONE) {
+                                /* Command line arguments are unit names */
+                                _cleanup_free_ char *cgroup = NULL, *unit_name = NULL;
+
+                                r = unit_name_mangle(*name, UNIT_NAME_MANGLE_WARN, &unit_name);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to mangle unit name: %m");
+
+                                if (!bus) {
+                                        /* Connect to the bus only if necessary */
+                                        r = bus_connect_transport_systemd(
+                                                        BUS_TRANSPORT_LOCAL, NULL,
+                                                        arg_show_unit == SHOW_UNIT_USER ? RUNTIME_SCOPE_USER : RUNTIME_SCOPE_SYSTEM,
+                                                        &bus);
+                                        if (r < 0)
+                                                return bus_log_connect_error(r, BUS_TRANSPORT_LOCAL);
+                                }
+
+                                q = show_cgroup_get_unit_path_and_warn(bus, unit_name, &cgroup);
+                                if (q < 0)
+                                        goto failed;
+
+                                if (isempty(cgroup)) {
+                                        q = log_warning_errno(SYNTHETIC_ERRNO(ENOENT), "Unit %s not found.", unit_name);
+                                        goto failed;
+                                }
+
+                                printf("Unit %s (%s):\n", unit_name, cgroup);
+                                fflush(stdout);
+
+                                q = show_cgroup_by_path(cgroup, NULL, 0, arg_output_flags);
+
+                        } else if (path_startswith(*name, "/sys/fs/cgroup")) {
+
+                                printf("Directory %s:\n", *name);
+                                fflush(stdout);
+
+                                q = show_cgroup_by_path(*name, NULL, 0, arg_output_flags);
+                        } else {
+                                _cleanup_free_ char *c = NULL, *p = NULL, *j = NULL;
+                                const char *controller, *path;
+
+                                if (!root) {
+                                        /* Query root only if needed, treat error as fatal */
+                                        r = show_cgroup_get_path_and_warn(arg_machine, NULL, &root);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to list cgroup tree: %m");
+                                }
+
+                                q = cg_split_spec(*name, &c, &p);
+                                if (q < 0) {
+                                        log_error_errno(q, "Failed to split argument %s: %m", *name);
+                                        goto failed;
+                                }
+
+                                controller = c ?: SYSTEMD_CGROUP_CONTROLLER;
+                                if (p) {
+                                        j = path_join(root, p);
+                                        if (!j)
+                                                return log_oom();
+
+                                        path_simplify(j);
+                                        path = j;
+                                } else
+                                        path = root;
+
+                                show_cg_info(controller, path);
+
+                                q = show_cgroup(controller, path, NULL, 0, arg_output_flags);
+                        }
+
+                failed:
+                        if (q < 0 && ret >= 0)
+                                ret = q;
+                }
+
+                r = ret;
+
+        } else {
+                bool done = false;
+
+                if (!arg_machine)  {
+                        _cleanup_free_ char *cwd = NULL;
+
+                        r = safe_getcwd(&cwd);
+                        if (r < 0)
+                                return log_error_errno(r, "Cannot determine current working directory: %m");
+
+                        if (path_startswith(cwd, "/sys/fs/cgroup")) {
+                                printf("Working directory %s:\n", cwd);
+                                fflush(stdout);
+
+                                r = show_cgroup_by_path(cwd, NULL, 0, arg_output_flags);
+                                done = true;
+                        }
+                }
+
+                if (!done) {
+                        _cleanup_free_ char *root = NULL;
+
+                        r = show_cgroup_get_path_and_warn(arg_machine, NULL, &root);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to list cgroup tree: %m");
+
+                        show_cg_info(SYSTEMD_CGROUP_CONTROLLER, root);
+
+                        printf("-.slice\n");
+                        r = show_cgroup(SYSTEMD_CGROUP_CONTROLLER, root, NULL, 0, arg_output_flags);
+                }
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to list cgroup tree: %m");
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/cgls/meson.build b/src/cgls/meson.build
new file mode 100644
index 0000000..f7eea5b
--- /dev/null
+++ b/src/cgls/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Adds the 'systemd-cgls' executable, built from cgls.c alone, to the list of executables.
+executables += [
+        executable_template + {
+                'name' : 'systemd-cgls',
+                'public' : true,
+                'sources' : files('cgls.c'),
+        },
+]
diff --git a/src/cgroups-agent/cgroups-agent.c b/src/cgroups-agent/cgroups-agent.c
new file mode 100644
index 0000000..16c5a2a
--- /dev/null
+++ b/src/cgroups-agent/cgroups-agent.c
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdlib.h>
+
+#include "fd-util.h"
+#include "log.h"
+#include "socket-util.h"
+
+/* Sends the single command line argument as one datagram to the AF_UNIX socket
+ * /run/systemd/cgroups-agent. stdin/stdout/stderr are redirected to /dev/null first. Exits with
+ * EXIT_SUCCESS only if the complete string was sent, with EXIT_FAILURE otherwise. */
+int main(int argc, char *argv[]) {
+
+        static const union sockaddr_union sa = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/cgroups-agent",
+        };
+
+        _cleanup_close_ int fd = -EBADF;
+        size_t len;
+        ssize_t sent;
+        int r;
+
+        r = make_null_stdio();
+        if (r < 0) {
+                log_error_errno(r, "Failed to connect stdin/stdout/stderr with /dev/null: %m");
+                return EXIT_FAILURE;
+        }
+
+        if (argc != 2) {
+                log_error("Incorrect number of arguments.");
+                return EXIT_FAILURE;
+        }
+
+        log_setup();
+
+        fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0);
+        if (fd < 0) {
+                log_debug_errno(errno, "Failed to allocate socket: %m");
+                return EXIT_FAILURE;
+        }
+
+        /* The message is the argument without its trailing NUL byte. */
+        len = strlen(argv[1]);
+        sent = sendto(fd, argv[1], len, 0, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+        if (sent < 0) {
+                log_debug_errno(errno, "Failed to send cgroups agent message: %m");
+                return EXIT_FAILURE;
+        }
+
+        if ((size_t) sent != len) {
+                log_debug("Datagram size mismatch");
+                return EXIT_FAILURE;
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/src/cgroups-agent/meson.build b/src/cgroups-agent/meson.build
new file mode 100644
index 0000000..33ff1f5
--- /dev/null
+++ b/src/cgroups-agent/meson.build
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Adds the 'systemd-cgroups-agent' executable, built from cgroups-agent.c alone, to the list of executables.
+executables += [
+        libexec_template + {
+                'name' : 'systemd-cgroups-agent',
+                'sources' : files('cgroups-agent.c'),
+        },
+]
diff --git a/src/cgtop/cgtop.c b/src/cgtop/cgtop.c
new file mode 100644
index 0000000..ca51455
--- /dev/null
+++ b/src/cgtop/cgtop.c
@@ -0,0 +1,1110 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "cgroup-show.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hashmap.h"
+#include "main-func.h"
+#include "missing_sched.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "procfs-util.h"
+#include "sort-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "unit-name.h"
+#include "virt.h"
+
+/* Accounting record for one cgroup. process() stores these in a hashmap keyed by 'path'. */
+typedef struct Group {
+        char *path;                     /* cgroup path; owned by the Group and used as hashmap key */
+
+        /* Whether the corresponding reading below may be used. process() clears all four when it
+         * carries a Group over from the other hashmap, and sets each one again only after a
+         * successful (and non-zero) reading. */
+        bool n_tasks_valid;
+        bool cpu_valid;
+        bool memory_valid;
+        bool io_valid;
+
+        uint64_t n_tasks;               /* process or task count, depending on arg_count */
+
+        unsigned cpu_iteration;         /* iteration in which cpu_usage/cpu_timestamp were sampled */
+        nsec_t cpu_usage;               /* cumulative CPU time of the last sample */
+        nsec_t cpu_timestamp;           /* CLOCK_MONOTONIC time of the last sample */
+        double cpu_fraction;            /* CPU time consumed divided by the time elapsed between the last two samples */
+
+        uint64_t memory;                /* memory usage as read from the memory controller (or procfs for the root) */
+
+        unsigned io_iteration;          /* iteration in which io_input/io_output/io_timestamp were sampled */
+        uint64_t io_input, io_output;   /* cumulative bytes read/written of the last sample */
+        nsec_t io_timestamp;            /* CLOCK_MONOTONIC time of the last sample */
+        uint64_t io_input_bps, io_output_bps; /* bytes per second between the last two samples */
+} Group;
+
+/* Counted objects, enum order matters.
+ *
+ * For the two *_PROCESSES modes process() counts the PIDs listed for the cgroup in the
+ * SYSTEMD_CGROUP_CONTROLLER hierarchy (leaving out kernel threads in the USERSPACE case); for
+ * COUNT_PIDS it reads "pids.current" instead. */
+typedef enum PidsCount {
+        COUNT_USERSPACE_PROCESSES,      /* least */
+        COUNT_ALL_PROCESSES,
+        COUNT_PIDS,                     /* most, requires pids controller */
+} PidsCount;
+
+/* refresh_one() does not look at cgroups nested deeper than this below the root it is given. */
+static unsigned arg_depth = 3;
+static unsigned arg_iterations = UINT_MAX;
+static bool arg_batch = false;
+/* If set, the MAYBE_FORMAT_*() helpers print plain numbers instead of human-readable values. */
+static bool arg_raw = false;
+static usec_t arg_delay = 1*USEC_PER_SEC;
+static char* arg_machine = NULL;
+static char* arg_root = NULL;
+/* If set (and processes rather than pids are counted), refresh_one() adds the task counts of child
+ * cgroups to their parent. */
+static bool arg_recursive = true;
+static bool arg_recursive_unset = false;
+
+/* What process() counts as "tasks", see PidsCount. */
+static PidsCount arg_count = COUNT_PIDS;
+
+/* Sort key consulted by group_compare(). */
+static enum {
+        ORDER_PATH,
+        ORDER_TASKS,
+        ORDER_CPU,
+        ORDER_MEMORY,
+        ORDER_IO,
+} arg_order = ORDER_CPU;
+
+/* Which CPU figure is used: the fraction computed between samples or the cumulative time. */
+static enum {
+        CPU_PERCENT,
+        CPU_TIME,
+} arg_cpu_type = CPU_PERCENT;
+
+/* Releases a Group together with the path string it owns. Accepts NULL. Returns NULL for a NULL
+ * argument and otherwise whatever mfree() returns. */
+static Group *group_free(Group *g) {
+        if (g) {
+                free(g->path);
+                g = mfree(g);
+        }
+
+        return g;
+}
+
+/* Hash operations for hashmaps that map a cgroup path to its Group; group_free() is registered as
+ * the value destructor. */
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(group_hash_ops, char, path_hash_func, path_compare, Group, group_free);
+
+/* Formats the time span 't' into 'buf' (of size 'l'): with USEC_FMT if arg_raw is set, via
+ * format_timespan() with the given accuracy otherwise. Returns what format_timespan() returns in
+ * the latter case, and 'buf' in the former. */
+static const char *maybe_format_timespan(char *buf, size_t l, usec_t t, usec_t accuracy) {
+        if (!arg_raw)
+                return format_timespan(buf, l, t, accuracy);
+
+        (void) snprintf(buf, l, USEC_FMT, t);
+        return buf;
+}
+
+/* Buffer size that fits either representation; MAYBE_FORMAT_TIMESPAN() supplies a temporary buffer
+ * of that size through a compound literal. */
+#define BUFSIZE1 CONST_MAX(FORMAT_TIMESPAN_MAX, DECIMAL_STR_MAX(usec_t))
+#define MAYBE_FORMAT_TIMESPAN(t, accuracy) \
+        maybe_format_timespan((char[BUFSIZE1]){}, BUFSIZE1, t, accuracy)
+
+/* Formats the byte count 't' into 'buf' (of size 'l'). Returns the constant string "-" if the value
+ * is not valid; otherwise the number is written as plain decimal if arg_raw is set, and via
+ * format_bytes() if not. */
+static const char *maybe_format_bytes(char *buf, size_t l, bool is_valid, uint64_t t) {
+        if (!is_valid)
+                return "-";
+
+        if (!arg_raw)
+                return format_bytes(buf, l, t);
+
+        (void) snprintf(buf, l, "%" PRIu64, t);
+        return buf;
+}
+
+/* Buffer size that fits either representation; MAYBE_FORMAT_BYTES() supplies a temporary buffer of
+ * that size through a compound literal. */
+#define BUFSIZE2 CONST_MAX(FORMAT_BYTES_MAX, DECIMAL_STR_MAX(uint64_t))
+#define MAYBE_FORMAT_BYTES(is_valid, t) \
+        maybe_format_bytes((char[BUFSIZE2]){}, BUFSIZE2, is_valid, t)
+
+/* Tells whether 'path' refers to the real root cgroup.
+ *
+ * On cgroup v2 the root cgroup exports only very few attributes, as most of that information is
+ * available in /proc anyway, so for the root the data has to be taken from other sources. Callers
+ * use this check to decide where to read from.
+ *
+ * A path that looks like the root is not necessarily the root: with cgroup namespacing
+ * (CLONE_NEWCGROUP) in effect it may be a namespaced subtree. There is no nice way to tell the two
+ * apart, so we simply check whether we run in a container, assuming that cgroup namespaces are
+ * generally used together with container managers.
+ *
+ * This is not pretty: in theory cgtop could be run inside a container that has cgroup namespacing
+ * turned off in order to watch the host. That is a mostly theoretic use case though, and all that is
+ * lost in it is the accounting for the top-level cgroup. */
+static bool is_root_cgroup(const char *path) {
+        return detect_container() <= 0 && empty_or_root(path);
+}
+
+/* Samples the statistics one controller provides for one cgroup and stores them in that cgroup's
+ * Group object.
+ *
+ * The Group is looked up in 'a' first. If it is not there it is moved over from 'b' (resetting its
+ * validity flags), and if it is in neither a new one is allocated and inserted into 'a'.
+ *
+ * Depending on 'controller' this reads the process count, "pids.current", the memory usage, the IO
+ * byte counters or the CPU usage. For the root cgroup (see is_root_cgroup()) tasks, memory and CPU
+ * are taken from procfs instead. IO and CPU rates are only computed if the Group's previous sample
+ * was taken in iteration 'iteration - 1'.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. A missing attribute file is not
+ * an error: 0 is returned in that case too, but '*ret' is then left untouched. Otherwise, if 'ret'
+ * is non-NULL, '*ret' is set to the Group. */
+static int process(
+                const char *controller,
+                const char *path,
+                Hashmap *a,
+                Hashmap *b,
+                unsigned iteration,
+                Group **ret) {
+
+        Group *g;
+        int r, all_unified;
+
+        assert(controller);
+        assert(path);
+        assert(a);
+
+        all_unified = cg_all_unified();
+        if (all_unified < 0)
+                return all_unified;
+
+        g = hashmap_get(a, path);
+        if (!g) {
+                g = hashmap_get(b, path);
+                if (!g) {
+                        g = new0(Group, 1);
+                        if (!g)
+                                return -ENOMEM;
+
+                        g->path = strdup(path);
+                        if (!g->path) {
+                                group_free(g);
+                                return -ENOMEM;
+                        }
+
+                        r = hashmap_put(a, g->path, g);
+                        if (r < 0) {
+                                group_free(g);
+                                return r;
+                        }
+                } else {
+                        r = hashmap_move_one(a, b, path);
+                        if (r < 0)
+                                return r;
+
+                        /* The readings stem from an earlier round, they need to be confirmed anew */
+                        g->cpu_valid = g->memory_valid = g->io_valid = g->n_tasks_valid = false;
+                }
+        }
+
+        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) &&
+            IN_SET(arg_count, COUNT_ALL_PROCESSES, COUNT_USERSPACE_PROCESSES)) {
+                _cleanup_fclose_ FILE *f = NULL;
+                pid_t pid;
+
+                r = cg_enumerate_processes(controller, path, &f);
+                if (r == -ENOENT)
+                        return 0;
+                if (r < 0)
+                        return r;
+
+                g->n_tasks = 0;
+                while (cg_read_pid(f, &pid) > 0) {
+
+                        if (arg_count == COUNT_USERSPACE_PROCESSES && pid_is_kernel_thread(pid) > 0)
+                                continue;
+
+                        g->n_tasks++;
+                }
+
+                if (g->n_tasks > 0)
+                        g->n_tasks_valid = true;
+
+        } else if (streq(controller, "pids") && arg_count == COUNT_PIDS) {
+
+                if (is_root_cgroup(path)) {
+                        r = procfs_tasks_get_current(&g->n_tasks);
+                        if (r < 0)
+                                return r;
+                } else {
+                        _cleanup_free_ char *p = NULL, *v = NULL;
+
+                        r = cg_get_path(controller, path, "pids.current", &p);
+                        if (r < 0)
+                                return r;
+
+                        r = read_one_line_file(p, &v);
+                        if (r == -ENOENT)
+                                return 0;
+                        if (r < 0)
+                                return r;
+
+                        r = safe_atou64(v, &g->n_tasks);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (g->n_tasks > 0)
+                        g->n_tasks_valid = true;
+
+        } else if (streq(controller, "memory")) {
+
+                if (is_root_cgroup(path)) {
+                        r = procfs_memory_get_used(&g->memory);
+                        if (r < 0)
+                                return r;
+                } else {
+                        _cleanup_free_ char *p = NULL, *v = NULL;
+
+                        if (all_unified)
+                                r = cg_get_path(controller, path, "memory.current", &p);
+                        else
+                                r = cg_get_path(controller, path, "memory.usage_in_bytes", &p);
+                        if (r < 0)
+                                return r;
+
+                        r = read_one_line_file(p, &v);
+                        if (r == -ENOENT)
+                                return 0;
+                        if (r < 0)
+                                return r;
+
+                        r = safe_atou64(v, &g->memory);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (g->memory > 0)
+                        g->memory_valid = true;
+
+        } else if ((streq(controller, "io") && all_unified) ||
+                   (streq(controller, "blkio") && !all_unified)) {
+                _cleanup_fclose_ FILE *f = NULL;
+                _cleanup_free_ char *p = NULL;
+                uint64_t wr = 0, rd = 0;
+                nsec_t timestamp;
+
+                r = cg_get_path(controller, path, all_unified ? "io.stat" : "blkio.io_service_bytes", &p);
+                if (r < 0)
+                        return r;
+
+                f = fopen(p, "re");
+                if (!f) {
+                        if (errno == ENOENT)
+                                return 0;
+                        return -errno;
+                }
+
+                /* Sum up the read and written bytes over all lines of the file */
+                for (;;) {
+                        _cleanup_free_ char *line = NULL;
+                        uint64_t k, *q;
+                        char *l;
+
+                        r = read_stripped_line(f, LONG_LINE_MAX, &line);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        /* Skip the device */
+                        l = line + strcspn(line, WHITESPACE);
+                        l += strspn(l, WHITESPACE);
+
+                        if (all_unified) {
+                                /* A sequence of key=value words, of which only rbytes= and wbytes=
+                                 * are of interest */
+                                while (!isempty(l)) {
+                                        if (sscanf(l, "rbytes=%" SCNu64, &k) == 1)
+                                                rd += k;
+                                        else if (sscanf(l, "wbytes=%" SCNu64, &k) == 1)
+                                                wr += k;
+
+                                        l += strcspn(l, WHITESPACE);
+                                        l += strspn(l, WHITESPACE);
+                                }
+                        } else {
+                                /* An operation name followed by a number; only "Read" and "Write"
+                                 * lines are of interest */
+                                if (first_word(l, "Read")) {
+                                        l += 4;
+                                        q = &rd;
+                                } else if (first_word(l, "Write")) {
+                                        l += 5;
+                                        q = &wr;
+                                } else
+                                        continue;
+
+                                l += strspn(l, WHITESPACE);
+                                r = safe_atou64(l, &k);
+                                if (r < 0)
+                                        continue;
+
+                                *q += k;
+                        }
+                }
+
+                timestamp = now_nsec(CLOCK_MONOTONIC);
+
+                /* A rate can only be derived if the previous sample is from the preceding round */
+                if (g->io_iteration == iteration - 1) {
+                        uint64_t x, yr, yw;
+
+                        x = (uint64_t) (timestamp - g->io_timestamp);
+                        if (x < 1)
+                                x = 1;
+
+                        if (rd > g->io_input)
+                                yr = rd - g->io_input;
+                        else
+                                yr = 0;
+
+                        if (wr > g->io_output)
+                                yw = wr - g->io_output;
+                        else
+                                yw = 0;
+
+                        if (yr > 0 || yw > 0) {
+                                /* x is in nanoseconds, hence scale to bytes per second */
+                                g->io_input_bps = (yr * 1000000000ULL) / x;
+                                g->io_output_bps = (yw * 1000000000ULL) / x;
+                                g->io_valid = true;
+                        }
+                }
+
+                g->io_input = rd;
+                g->io_output = wr;
+                g->io_timestamp = timestamp;
+                g->io_iteration = iteration;
+        } else if (STR_IN_SET(controller, "cpu", "cpuacct") || cpu_accounting_is_cheap()) {
+                _cleanup_free_ char *p = NULL, *v = NULL;
+                uint64_t new_usage;
+                nsec_t timestamp;
+
+                if (is_root_cgroup(path)) {
+                        r = procfs_cpu_get_usage(&new_usage);
+                        if (r < 0)
+                                return r;
+                } else if (all_unified) {
+                        _cleanup_free_ char *val = NULL;
+
+                        if (!streq(controller, "cpu"))
+                                return 0;
+
+                        r = cg_get_keyed_attribute("cpu", path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
+                        if (IN_SET(r, -ENOENT, -ENXIO))
+                                return 0;
+                        if (r < 0)
+                                return r;
+
+                        r = safe_atou64(val, &new_usage);
+                        if (r < 0)
+                                return r;
+
+                        /* usage_usec is in microseconds, everything below works in nanoseconds */
+                        new_usage *= NSEC_PER_USEC;
+                } else {
+                        if (!streq(controller, "cpuacct"))
+                                return 0;
+
+                        r = cg_get_path(controller, path, "cpuacct.usage", &p);
+                        if (r < 0)
+                                return r;
+
+                        r = read_one_line_file(p, &v);
+                        if (r == -ENOENT)
+                                return 0;
+                        if (r < 0)
+                                return r;
+
+                        r = safe_atou64(v, &new_usage);
+                        if (r < 0)
+                                return r;
+                }
+
+                timestamp = now_nsec(CLOCK_MONOTONIC);
+
+                /* A fraction can only be derived if the previous sample is from the preceding round
+                 * and the counter moved forward */
+                if (g->cpu_iteration == iteration - 1 &&
+                    (nsec_t) new_usage > g->cpu_usage) {
+
+                        nsec_t x, y;
+
+                        x = timestamp - g->cpu_timestamp;
+                        if (x < 1)
+                                x = 1;
+
+                        y = (nsec_t) new_usage - g->cpu_usage;
+                        g->cpu_fraction = (double) y / (double) x;
+                        g->cpu_valid = true;
+                }
+
+                g->cpu_usage = (nsec_t) new_usage;
+                g->cpu_timestamp = timestamp;
+                g->cpu_iteration = iteration;
+
+        }
+
+        if (ret)
+                *ret = g;
+
+        return 0;
+}
+
+/* Samples 'controller' for the cgroup 'path' and, recursively, for its subgroups, stopping once
+ * 'depth' exceeds arg_depth.
+ *
+ * If recursive summing is enabled and processes (rather than pids) are counted, the task counts of
+ * the subgroups are added to the count of this cgroup while walking the SYSTEMD_CGROUP_CONTROLLER
+ * hierarchy.
+ *
+ * Returns 1 after the subgroups were walked (setting '*ret', if 'ret' is non-NULL, to this cgroup's
+ * Group, which may be NULL), 0 if the depth limit was exceeded or the cgroup has no subgroup
+ * directory to enumerate ('*ret' is left untouched then), and a negative errno-style value on
+ * failure. */
+static int refresh_one(
+                const char *controller,
+                const char *path,
+                Hashmap *a,
+                Hashmap *b,
+                unsigned iteration,
+                unsigned depth,
+                Group **ret) {
+
+        _cleanup_closedir_ DIR *d = NULL;
+        Group *ours = NULL;
+        int r;
+
+        assert(controller);
+        assert(path);
+        assert(a);
+
+        if (depth > arg_depth)
+                return 0;
+
+        r = process(controller, path, a, b, iteration, &ours);
+        if (r < 0)
+                return r;
+
+        r = cg_enumerate_subgroups(controller, path, &d);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_free_ char *fn = NULL, *p = NULL;
+                Group *child = NULL;
+
+                r = cg_read_subgroup(d, &fn);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                p = path_join(path, fn);
+                if (!p)
+                        return -ENOMEM;
+
+                path_simplify(p);
+
+                r = refresh_one(controller, p, a, b, iteration, depth + 1, &child);
+                if (r < 0)
+                        return r;
+
+                /* process() returns 0 without handing back a Group if the attribute it wants to read
+                 * is missing, hence 'ours' has to be checked as well as 'child' before summing up. */
+                if (arg_recursive &&
+                    IN_SET(arg_count, COUNT_ALL_PROCESSES, COUNT_USERSPACE_PROCESSES) &&
+                    ours &&
+                    child &&
+                    child->n_tasks_valid &&
+                    streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
+
+                        /* Recursively sum up processes */
+
+                        if (ours->n_tasks_valid)
+                                ours->n_tasks += child->n_tasks;
+                        else {
+                                ours->n_tasks = child->n_tasks;
+                                ours->n_tasks_valid = true;
+                        }
+                }
+        }
+
+        if (ret)
+                *ret = ours;
+
+        return 1;
+}
+
+static int refresh(const char *root, Hashmap *a, Hashmap *b, unsigned iteration) {
+        /* Walk the tree below root once per controller name, giving up at the first failure. */
+        FOREACH_STRING(controller, SYSTEMD_CGROUP_CONTROLLER, "cpu", "cpuacct", "memory", "io", "blkio", "pids") {
+                int k = refresh_one(controller, root, a, b, iteration, 0, NULL);
+
+                if (k < 0)
+                        return k;
+        }
+
+        return 0;
+}
+
+static int group_compare(Group * const *a, Group * const *b) {
+        const Group *x = *a, *y = *b;
+        int r;
+        /* Sort callback handed to typesafe_qsort() in display(); the ordering key is selected by arg_order. */
+        if (arg_order != ORDER_TASKS || arg_recursive) {
+                /* Let's make sure that the parent is always before
+                 * the child. Except when ordering by tasks and
+                 * recursive summing is off, since that is actually
+                 * not accumulative for all children. */
+
+                if (path_startswith(empty_to_root(y->path), empty_to_root(x->path)))
+                        return -1;
+                if (path_startswith(empty_to_root(x->path), empty_to_root(y->path)))
+                        return 1;
+        }
+        /* Per key: larger values sort first, and a group with a valid value sorts before one without. */
+        switch (arg_order) {
+
+        case ORDER_PATH:
+                break; /* nothing to do here, the path comparison at the end decides */
+
+        case ORDER_CPU:
+                if (arg_cpu_type == CPU_PERCENT) {
+                        if (x->cpu_valid && y->cpu_valid) {
+                                r = CMP(y->cpu_fraction, x->cpu_fraction);
+                                if (r != 0)
+                                        return r;
+                        } else if (x->cpu_valid)
+                                return -1;
+                        else if (y->cpu_valid)
+                                return 1;
+                } else {
+                        r = CMP(y->cpu_usage, x->cpu_usage);
+                        if (r != 0)
+                                return r;
+                }
+
+                break;
+
+        case ORDER_TASKS:
+                if (x->n_tasks_valid && y->n_tasks_valid) {
+                        r = CMP(y->n_tasks, x->n_tasks);
+                        if (r != 0)
+                                return r;
+                } else if (x->n_tasks_valid)
+                        return -1;
+                else if (y->n_tasks_valid)
+                        return 1;
+
+                break;
+
+        case ORDER_MEMORY:
+                if (x->memory_valid && y->memory_valid) {
+                        r = CMP(y->memory, x->memory);
+                        if (r != 0)
+                                return r;
+                } else if (x->memory_valid)
+                        return -1;
+                else if (y->memory_valid)
+                        return 1;
+
+                break;
+
+        case ORDER_IO:
+                if (x->io_valid && y->io_valid) {
+                        r = CMP(y->io_input_bps + y->io_output_bps, x->io_input_bps + x->io_output_bps);
+                        if (r != 0)
+                                return r;
+                } else if (x->io_valid)
+                        return -1;
+                else if (y->io_valid)
+                        return 1;
+        }
+        /* Key equal, or not valid on either side: fall back to comparing the paths. */
+        return path_compare(x->path, y->path);
+}
+
+static void display(Hashmap *a) {
+        Group *g;
+        Group **array;
+        signed path_columns;
+        unsigned rows, n = 0, maxtcpu = 0, maxtpath = 3; /* 3 for ellipsize() to work properly */
+        /* Prints the groups stored in a as one sorted table on stdout; on a TTY a header line is added. */
+        assert(a);
+
+        if (!terminal_is_dumb())
+                fputs(ANSI_HOME_CLEAR, stdout);
+
+        array = newa(Group*, hashmap_size(a));
+        /* Only groups with at least one valid metric make it into the table. */
+        HASHMAP_FOREACH(g, a)
+                if (g->n_tasks_valid || g->cpu_valid || g->memory_valid || g->io_valid)
+                        array[n++] = g;
+
+        typesafe_qsort(array, n, group_compare);
+
+        /* Find the longest names in one run */
+        for (unsigned j = 0; j < n; j++) {
+                maxtcpu = MAX(maxtcpu,
+                              strlen(MAYBE_FORMAT_TIMESPAN((usec_t) (array[j]->cpu_usage / NSEC_PER_USEC), 0)));
+                maxtpath = MAX(maxtpath,
+                               strlen(array[j]->path));
+        }
+
+        rows = lines();
+        if (rows <= 10)
+                rows = 10;
+        /* On a TTY the path column gets columns() - 36 - cpu_len characters (at least 10); else the longest path. */
+        if (on_tty()) {
+                const char *on, *off;
+                int cpu_len = arg_cpu_type == CPU_PERCENT ? 6 : maxtcpu;
+
+                path_columns = columns() - 36 - cpu_len;
+                if (path_columns < 10)
+                        path_columns = 10;
+                /* on/off bracket the header of whichever column the table is currently ordered by. */
+                on = ansi_highlight_underline();
+                off = ansi_underline();
+
+                printf("%s%s%-*s%s %s%7s%s %s%*s%s %s%8s%s %s%8s%s %s%8s%s%s\n",
+                       ansi_underline(),
+                       arg_order == ORDER_PATH ? on : "", path_columns, "CGroup",
+                       arg_order == ORDER_PATH ? off : "",
+                       arg_order == ORDER_TASKS ? on : "",
+                       arg_count == COUNT_PIDS ? "Tasks" : arg_count == COUNT_USERSPACE_PROCESSES ? "Procs" : "Proc+",
+                       arg_order == ORDER_TASKS ? off : "",
+                       arg_order == ORDER_CPU ? on : "",
+                       cpu_len,
+                       arg_cpu_type == CPU_PERCENT ? "%CPU" : "CPU Time",
+                       arg_order == ORDER_CPU ? off : "",
+                       arg_order == ORDER_MEMORY ? on : "", "Memory",
+                       arg_order == ORDER_MEMORY ? off : "",
+                       arg_order == ORDER_IO ? on : "", "Input/s",
+                       arg_order == ORDER_IO ? off : "",
+                       arg_order == ORDER_IO ? on : "", "Output/s",
+                       arg_order == ORDER_IO ? off : "",
+                       ansi_normal());
+        } else
+                path_columns = maxtpath;
+
+        for (unsigned j = 0; j < n; j++) {
+                _cleanup_free_ char *ellipsized = NULL;
+                const char *path;
+                /* On a TTY, stop printing once j + 6 exceeds the row count (which is clamped to 10 or more). */
+                if (on_tty() && j + 6 > rows)
+                        break;
+
+                g = array[j];
+
+                path = empty_to_root(g->path);
+                ellipsized = ellipsize(path, path_columns, 33);
+                printf("%-*s", path_columns, ellipsized ?: path);
+
+                if (g->n_tasks_valid)
+                        printf(" %7" PRIu64, g->n_tasks);
+                else
+                        fputs("       -", stdout);
+
+                if (arg_cpu_type == CPU_PERCENT) {
+                        if (g->cpu_valid)
+                                printf(" %6.1f", g->cpu_fraction*100);
+                        else
+                                fputs("      -", stdout);
+                } else
+                        printf(" %*s",
+                               (int) maxtcpu,
+                               MAYBE_FORMAT_TIMESPAN((usec_t) (g->cpu_usage / NSEC_PER_USEC), 0));
+
+                printf(" %8s", MAYBE_FORMAT_BYTES(g->memory_valid, g->memory));
+                printf(" %8s", MAYBE_FORMAT_BYTES(g->io_valid, g->io_input_bps));
+                printf(" %8s", MAYBE_FORMAT_BYTES(g->io_valid, g->io_output_bps));
+
+                putchar('\n');
+        }
+}
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout. Returns 0, or the result of log_oom() if the man page link fails. */
+        r = terminal_urlify_man("systemd-cgtop", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] [CGROUP]\n\n"
+               "Show top control groups by their resource usage.\n\n"
+               "  -h --help           Show this help\n"
+               "     --version        Show package version\n"
+               "  -p --order=path     Order by path\n"
+               "  -t --order=tasks    Order by number of tasks/processes\n"
+               "  -c --order=cpu      Order by CPU load (default)\n"
+               "  -m --order=memory   Order by memory load\n"
+               "  -i --order=io       Order by IO load\n"
+               "  -r --raw            Provide raw (not human-readable) numbers\n"
+               "     --cpu=percentage Show CPU usage as percentage (default)\n"
+               "     --cpu=time       Show CPU usage as time\n"
+               "  -P                  Count userspace processes instead of tasks (excl. kernel)\n"
+               "  -k                  Count all processes instead of tasks (incl. kernel)\n"
+               "     --recursive=BOOL Sum up process count recursively\n"
+               "  -d --delay=DELAY    Delay between updates\n"
+               "  -n --iterations=N   Run for N iterations before exiting\n"
+               "  -1                  Shortcut for --iterations=1\n"
+               "  -b --batch          Run in batch mode, accepting no input\n"
+               "     --depth=DEPTH    Maximum traversal depth (default: %u)\n"
+               "  -M --machine=       Show container\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               arg_depth, /* the current value of arg_depth is what gets shown as the default */
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_DEPTH,
+                ARG_CPU_TYPE,
+                ARG_ORDER,
+                ARG_RECURSIVE,
+        };
+        /* The long-only option codes above start at 0x100, outside the range of the short option characters. */
+        static const struct option options[] = {
+                { "help",         no_argument,       NULL, 'h'           },
+                { "version",      no_argument,       NULL, ARG_VERSION   },
+                { "delay",        required_argument, NULL, 'd'           },
+                { "iterations",   required_argument, NULL, 'n'           },
+                { "batch",        no_argument,       NULL, 'b'           },
+                { "raw",          no_argument,       NULL, 'r'           },
+                { "depth",        required_argument, NULL, ARG_DEPTH     },
+                { "cpu",          optional_argument, NULL, ARG_CPU_TYPE  },
+                { "order",        required_argument, NULL, ARG_ORDER     },
+                { "recursive",    required_argument, NULL, ARG_RECURSIVE },
+                { "machine",      required_argument, NULL, 'M'           },
+                {}
+        };
+        /* Fills the arg_* globals. Returns 1 on success, whatever help()/version() return, or negative on error. */
+        int c, r;
+
+        assert(argc >= 1);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hptcmin:brd:kPM:1", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_CPU_TYPE:
+                        if (optarg) {
+                                if (streq(optarg, "time"))
+                                        arg_cpu_type = CPU_TIME;
+                                else if (streq(optarg, "percentage"))
+                                        arg_cpu_type = CPU_PERCENT;
+                                else
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unknown argument to --cpu=: %s",
+                                                               optarg);
+                        } else
+                                arg_cpu_type = CPU_TIME; /* a bare --cpu without a value selects time */
+
+                        break;
+
+                case ARG_DEPTH:
+                        r = safe_atou(optarg, &arg_depth);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse depth parameter '%s': %m", optarg);
+
+                        break;
+
+                case 'd':
+                        r = parse_sec(optarg, &arg_delay);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse delay parameter '%s': %m", optarg);
+                        if (arg_delay <= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Invalid delay parameter '%s'",
+                                                       optarg);
+
+                        break;
+
+                case 'n':
+                        r = safe_atou(optarg, &arg_iterations);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse iterations parameter '%s': %m", optarg);
+
+                        break;
+
+                case '1':
+                        arg_iterations = 1;
+                        break;
+
+                case 'b':
+                        arg_batch = true;
+                        break;
+
+                case 'r':
+                        arg_raw = true;
+                        break;
+
+                case 'p':
+                        arg_order = ORDER_PATH;
+                        break;
+
+                case 't':
+                        arg_order = ORDER_TASKS;
+                        break;
+
+                case 'c':
+                        arg_order = ORDER_CPU;
+                        break;
+
+                case 'm':
+                        arg_order = ORDER_MEMORY;
+                        break;
+
+                case 'i':
+                        arg_order = ORDER_IO;
+                        break;
+
+                case ARG_ORDER:
+                        if (streq(optarg, "path"))
+                                arg_order = ORDER_PATH;
+                        else if (streq(optarg, "tasks"))
+                                arg_order = ORDER_TASKS;
+                        else if (streq(optarg, "cpu"))
+                                arg_order = ORDER_CPU;
+                        else if (streq(optarg, "memory"))
+                                arg_order = ORDER_MEMORY;
+                        else if (streq(optarg, "io"))
+                                arg_order = ORDER_IO;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Invalid argument to --order=: %s",
+                                                       optarg);
+                        break;
+
+                case 'k':
+                        arg_count = COUNT_ALL_PROCESSES;
+                        break;
+
+                case 'P':
+                        arg_count = COUNT_USERSPACE_PROCESSES;
+                        break;
+
+                case ARG_RECURSIVE:
+                        r = parse_boolean_argument("--recursive=", optarg, &arg_recursive);
+                        if (r < 0)
+                                return r;
+
+                        arg_recursive_unset = !r; /* presumably r is the parsed boolean, i.e. set on --recursive=no; verify */
+                        break;
+
+                case 'M':
+                        arg_machine = optarg;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        /* At most one positional argument is accepted; it is stored in arg_root. */
+        if (optind == argc - 1)
+                arg_root = argv[optind];
+        else if (optind < argc)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Too many arguments.");
+
+        return 1;
+}
+
+static const char* counting_what(void) {
+        /* Human-readable name of the current arg_count mode, for the interactive status messages. */
+        switch (arg_count) {
+        case COUNT_PIDS:          return "tasks";
+        case COUNT_ALL_PROCESSES: return "all processes (incl. kernel)";
+        default:                  return "userspace processes (excl. kernel)";
+        }
+}
+
+static int loop(const char *root) {
+        _cleanup_hashmap_free_ Hashmap *a = NULL, *b = NULL;
+        unsigned iteration = 0;
+        usec_t last_refresh = 0;
+        bool immediate_refresh = false;
+        int r;
+        /* Main loop: refresh the data every arg_delay, display it, then wait (batch) or handle one key press. */
+        a = hashmap_new(&group_hash_ops);
+        b = hashmap_new(&group_hash_ops);
+        if (!a || !b)
+                return log_oom();
+
+        for (;;) {
+                usec_t t;
+                char key;
+
+                t = now(CLOCK_MONOTONIC);
+
+                if (t >= usec_add(last_refresh, arg_delay) || immediate_refresh) {
+                        /* After the refresh, clear b and swap: b then holds the new data, a is empty again. */
+                        r = refresh(root, a, b, iteration++);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to refresh: %m");
+
+                        hashmap_clear(b);
+                        SWAP_TWO(a, b);
+
+                        last_refresh = t;
+                        immediate_refresh = false;
+                }
+
+                display(b);
+                /* arg_iterations == 0 means there is no iteration limit. */
+                if (arg_iterations && iteration >= arg_iterations)
+                        return 0;
+
+                if (!on_tty()) /* non-TTY: Empty newline as delimiter between polls */
+                        fputs("\n", stdout);
+                fflush(stdout);
+                /* Batch mode just sleeps; otherwise wait for a key, and on -ETIMEDOUT go straight to the next round. */
+                if (arg_batch)
+                        (void) usleep_safe(usec_add(usec_sub_unsigned(last_refresh, t), arg_delay));
+                else {
+                        r = read_one_char(stdin, &key, usec_add(usec_sub_unsigned(last_refresh, t), arg_delay), NULL);
+                        if (r == -ETIMEDOUT)
+                                continue;
+                        if (r < 0)
+                                return log_error_errno(r, "Couldn't read key: %m");
+                }
+
+                if (on_tty()) { /* TTY: Clear any user keystroke */
+                        fputs("\r \r", stdout);
+                        fflush(stdout);
+                }
+                /* key is only set on the interactive path, so batch mode must not reach the switch below. */
+                if (arg_batch)
+                        continue;
+
+                switch (key) {
+
+                case ' ':
+                        immediate_refresh = true;
+                        break;
+
+                case 'q':
+                        return 0;
+
+                case 'p':
+                        arg_order = ORDER_PATH;
+                        break;
+
+                case 't':
+                        arg_order = ORDER_TASKS;
+                        break;
+
+                case 'c':
+                        arg_order = ORDER_CPU;
+                        break;
+
+                case 'm':
+                        arg_order = ORDER_MEMORY;
+                        break;
+
+                case 'i':
+                        arg_order = ORDER_IO;
+                        break;
+
+                case '%':
+                        arg_cpu_type = arg_cpu_type == CPU_TIME ? CPU_PERCENT : CPU_TIME;
+                        break;
+
+                case 'k':
+                        arg_count = arg_count != COUNT_ALL_PROCESSES ? COUNT_ALL_PROCESSES : COUNT_PIDS;
+                        fprintf(stdout, "\nCounting: %s.", counting_what());
+                        fflush(stdout);
+                        sleep(1);
+                        break;
+
+                case 'P':
+                        arg_count = arg_count != COUNT_USERSPACE_PROCESSES ? COUNT_USERSPACE_PROCESSES : COUNT_PIDS;
+                        fprintf(stdout, "\nCounting: %s.", counting_what());
+                        fflush(stdout);
+                        sleep(1);
+                        break;
+
+                case 'r':
+                        if (arg_count == COUNT_PIDS)
+                                fprintf(stdout, "\n\aCannot toggle recursive counting, not available in task counting mode.");
+                        else {
+                                arg_recursive = !arg_recursive;
+                                fprintf(stdout, "\nRecursive process counting: %s", yes_no(arg_recursive));
+                        }
+                        fflush(stdout);
+                        sleep(1);
+                        break;
+
+                case '+':
+                        arg_delay = usec_add(arg_delay, arg_delay < USEC_PER_SEC ? USEC_PER_MSEC * 250 : USEC_PER_SEC);
+
+                        fprintf(stdout, "\nIncreased delay to %s.", FORMAT_TIMESPAN(arg_delay, 0));
+                        fflush(stdout);
+                        sleep(1);
+                        break;
+
+                case '-':
+                        if (arg_delay <= USEC_PER_MSEC*500)
+                                arg_delay = USEC_PER_MSEC*250;
+                        else
+                                arg_delay = usec_sub_unsigned(arg_delay, arg_delay < USEC_PER_MSEC * 1250 ? USEC_PER_MSEC * 250 : USEC_PER_SEC);
+
+                        fprintf(stdout, "\nDecreased delay to %s.", FORMAT_TIMESPAN(arg_delay, 0));
+                        fflush(stdout);
+                        sleep(1);
+                        break;
+
+                case '?':
+                case 'h':
+
+                        fprintf(stdout,
+                                "\t<%1$sp%2$s> By path; <%1$st%2$s> By tasks/procs; <%1$sc%2$s> By CPU; <%1$sm%2$s> By memory; <%1$si%2$s> By I/O\n"
+                                "\t<%1$s+%2$s> Inc. delay; <%1$s-%2$s> Dec. delay; <%1$s%%%2$s> Toggle time; <%1$sSPACE%2$s> Refresh\n"
+                                "\t<%1$sP%2$s> Toggle count userspace processes; <%1$sk%2$s> Toggle count all processes\n"
+                                "\t<%1$sr%2$s> Count processes recursively; <%1$sq%2$s> Quit",
+                                ansi_highlight(), ansi_normal());
+                        fflush(stdout);
+                        sleep(3);
+                        break;
+
+                default: /* where char is signed, high bytes are negative; go via unsigned char so they print as two hex digits */
+                        if (key < ' ')
+                                fprintf(stdout, "\nUnknown key '\\x%x'. Ignoring.", (unsigned) (unsigned char) key);
+                        else
+                                fprintf(stdout, "\nUnknown key '%c'. Ignoring.", key);
+                        fflush(stdout);
+                        sleep(1);
+                        break;
+                }
+        }
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_free_ char *root = NULL;
+        CGroupMask mask;
+        int r;
+        /* Entry point: parse options, settle the counting mode, resolve the root cgroup, then enter loop(). */
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        r = cg_mask_supported(&mask);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine supported controllers: %m");
+
+        /* honor user selection unless pids controller is unavailable */
+        PidsCount possible_count = (mask & CGROUP_MASK_PIDS) ? COUNT_PIDS : COUNT_ALL_PROCESSES;
+        arg_count = MIN(possible_count, arg_count); /* NOTE(review): relies on the PidsCount enumerator order, which is declared elsewhere */
+
+        if (arg_recursive_unset && arg_count == COUNT_PIDS)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Non-recursive counting is only supported when counting processes, not tasks. Use -P or -k.");
+
+        r = show_cgroup_get_path_and_warn(arg_machine, arg_root, &root);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get root control group path: %m");
+        log_debug("CGroup path: %s", root);
+        /* Presumably drops the cached terminal size whenever the window is resized. */
+        signal(SIGWINCH, columns_lines_cache_reset);
+        /* UINT_MAX is presumably the "not set" default (verify): run without limit (0) on a TTY, else once. */
+        if (arg_iterations == UINT_MAX)
+                arg_iterations = on_tty() ? 0 : 1;
+
+        return loop(root);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/cgtop/meson.build b/src/cgtop/meson.build
new file mode 100644
index 0000000..afe6a33
--- /dev/null
+++ b/src/cgtop/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-cgtop',
+                'public' : true,
+                'sources' : files('cgtop.c'),
+        },
+]
diff --git a/src/core/all-units.h b/src/core/all-units.h
new file mode 100644
index 0000000..fad814b
--- /dev/null
+++ b/src/core/all-units.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+#include "automount.h"
+#include "device.h"
+#include "path.h"
+#include "scope.h"
+#include "service.h"
+#include "slice.h"
+#include "socket.h"
+#include "swap.h"
+#include "target.h"
+#include "timer.h"
diff --git a/src/core/apparmor-setup.c b/src/core/apparmor-setup.c
new file mode 100644
index 0000000..3426a10
--- /dev/null
+++ b/src/core/apparmor-setup.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#if HAVE_APPARMOR
+#  include <sys/apparmor.h>
+#endif
+#include <unistd.h>
+
+#include "apparmor-setup.h"
+#include "apparmor-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "macro.h"
+#include "string-util.h"
+#include "strv.h"
+
+#if HAVE_APPARMOR
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_policy_cache *, aa_policy_cache_unref, NULL); /* provides aa_policy_cache_unrefp for _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_features *, aa_features_unref, NULL); /* provides aa_features_unrefp for _cleanup_() */
+#endif
+
+int mac_apparmor_setup(void) {
+#if HAVE_APPARMOR
+        _cleanup_(aa_policy_cache_unrefp) aa_policy_cache *policy_cache = NULL;
+        _cleanup_(aa_features_unrefp) aa_features *features = NULL;
+        _cleanup_free_ char *current_profile = NULL, *cache_dir_path = NULL;
+        int r;
+        /* Loads the early AppArmor policy cache and changes to profile 'systemd'. Failures are logged; always returns 0. */
+        if (!mac_apparmor_use()) {
+                log_debug("AppArmor either not supported by the kernel or disabled.");
+                return 0;
+        }
+
+        /* To enable LSM stacking a patch to the kernel is proposed to create a
+         * per-LSM subdirectory to distinguish between the LSMs. Therefore, we
+         * read the file from the LSM specific directory first and only if that
+         * fails the one from the generic directory.
+         */
+        FOREACH_STRING(current_file, "/proc/self/attr/apparmor/current", "/proc/self/attr/current") {
+                r = read_one_line_file(current_file, &current_profile);
+                if (r == -ENOENT)
+                        continue;
+                else if (r < 0)
+                        log_warning_errno(r, "Failed to read current AppArmor profile from file %s, ignoring: %m", current_file);
+                else
+                        break;
+        }
+        if (!current_profile) {
+                log_warning("Failed to get the current AppArmor profile of systemd from /proc/self/attr/apparmor/current or /proc/self/attr/current, ignoring.");
+                return 0;
+        }
+        if (!streq(current_profile, "unconfined")) {
+                log_debug("We are already confined in an AppArmor profile.");
+                return 0;
+        }
+        /* Still unconfined: look up the early policy cache for the feature set reported by the kernel. */
+        r = aa_features_new_from_kernel(&features);
+        if (r < 0) {
+                log_warning_errno(errno, "Failed to get the AppArmor feature set from the kernel, ignoring: %m");
+                return 0;
+        }
+        cache_dir_path = aa_policy_cache_dir_path_preview(features, AT_FDCWD, "/etc/apparmor/earlypolicy");
+        if (!cache_dir_path) {
+                log_debug_errno(errno, "Failed to get the path of the early AppArmor policy cache directory.");
+                return 0;
+        }
+
+        /* aa_policy_cache_new will internally use the same path as aa_policy_cache_dir_path_preview has returned. */
+        r = aa_policy_cache_new(&policy_cache, features, AT_FDCWD, "/etc/apparmor/earlypolicy", 0);
+        if (r < 0) {
+                if (errno == ENOENT) {
+                        log_debug_errno(errno, "The early AppArmor policy cache directory %s does not exist.", cache_dir_path);
+                        return 0;
+                }
+                log_warning_errno(errno, "Failed to create a new AppArmor policy cache, ignoring: %m");
+                return 0;
+        }
+        r = aa_policy_cache_replace_all(policy_cache, NULL);
+        if (r < 0) {
+                log_warning_errno(errno, "Failed to load the profiles from the early AppArmor policy cache directory %s, ignoring: %m", cache_dir_path);
+                return 0;
+        }
+
+        log_info("Successfully loaded all binary profiles from AppArmor early policy cache at %s.", cache_dir_path);
+        /* With the profiles loaded, switch this process to the one named 'systemd'. */
+        r = aa_change_profile("systemd");
+        if (r < 0) {
+                if (errno == ENOENT)
+                        log_debug_errno(errno, "Failed to change to AppArmor profile 'systemd'. Please ensure that one of the binary profile files in policy cache directory %s contains a profile with that name.", cache_dir_path);
+                else
+                        log_error_errno(errno, "Failed to change to AppArmor profile 'systemd': %m");
+                return 0;
+        }
+
+        log_info("Changed to AppArmor profile systemd.");
+#endif
+        return 0;
+}
diff --git a/src/core/apparmor-setup.h b/src/core/apparmor-setup.h
new file mode 100644
index 0000000..f3b7382
--- /dev/null
+++ b/src/core/apparmor-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int mac_apparmor_setup(void);
diff --git a/src/core/audit-fd.c b/src/core/audit-fd.c
new file mode 100644
index 0000000..6674fa8
--- /dev/null
+++ b/src/core/audit-fd.c
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "audit-fd.h"
+
+#if HAVE_AUDIT
+
+#include <libaudit.h>
+#include <stdbool.h>
+
+#include "capability-util.h"
+#include "fd-util.h"
+#include "log.h"
+
+static bool initialized = false; /* set once get_audit_fd() or close_audit_fd() has settled audit_fd */
+static int audit_fd; /* the result of audit_open() on success, otherwise a negative errno-style value */
+
+int get_audit_fd(void) {
+        /* Lazily opens the audit connection once; the fd or the negative errno is cached for later calls. */
+        if (initialized)
+                return audit_fd;
+
+        if (have_effective_cap(CAP_AUDIT_WRITE) <= 0) {
+                /* CAP_AUDIT_WRITE is missing or could not be checked: do not even try to connect. */
+                initialized = true;
+                audit_fd = -EPERM;
+                return audit_fd;
+        }
+
+        audit_fd = audit_open();
+        if (audit_fd < 0) {
+                bool quiet = IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT);
+                if (!quiet)
+                        log_error_errno(errno, "Failed to connect to audit log: %m");
+
+                audit_fd = errno ? -errno : -EINVAL;
+        }
+
+        initialized = true;
+        return audit_fd;
+}
+
+void close_audit_fd(void) {
+        /* Closes the cached audit fd, if any, and pins the state so get_audit_fd() keeps returning -ECONNRESET. */
+        bool is_open = initialized && audit_fd >= 0;
+        if (is_open)
+                safe_close(audit_fd);
+        audit_fd = -ECONNRESET;
+        initialized = true;
+}
+
+#else
+
+int get_audit_fd(void) { /* variant compiled when HAVE_AUDIT is off */
+        return -EAFNOSUPPORT; /* there is never an audit fd in this configuration */
+}
+
+void close_audit_fd(void) { /* variant compiled when HAVE_AUDIT is off: nothing to close */
+}
+
+#endif
diff --git a/src/core/audit-fd.h b/src/core/audit-fd.h
new file mode 100644
index 0000000..5cdf61e
--- /dev/null
+++ b/src/core/audit-fd.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int get_audit_fd(void);
+void close_audit_fd(void);
diff --git a/src/core/automount.c b/src/core/automount.c
new file mode 100644
index 0000000..14bf7e6
--- /dev/null
+++ b/src/core/automount.c
@@ -0,0 +1,1149 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "async.h"
+#include "automount.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-automount.h"
+#include "dbus-unit.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fstab-util.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
+        [AUTOMOUNT_DEAD] = UNIT_INACTIVE,
+        [AUTOMOUNT_WAITING] = UNIT_ACTIVE, /* WAITING and RUNNING are both reported as active */
+        [AUTOMOUNT_RUNNING] = UNIT_ACTIVE,
+        [AUTOMOUNT_FAILED] = UNIT_FAILED
+}; /* indexed by AutomountState; consulted by automount_set_state() and automount_active_state() */
+
+static int open_dev_autofs(Manager *m);
+static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata);
+static int automount_start_expire(Automount *a);
+static void automount_stop_expire(Automount *a);
+static int automount_send_ready(Automount *a, Set *tokens, int status);
+
+static void automount_init(Unit *u) {
+        Automount *a = AUTOMOUNT(u);
+
+        assert(a);
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+        /* Stub defaults: isolate leaves us alone, directories get 0755, and there is no pipe yet. */
+        u->ignore_on_isolate = true;
+        a->directory_mode = 0755;
+        a->pipe_fd = -EBADF;
+}
+
+static void unmount_autofs(Automount *a) {
+        int r;
+
+        assert(a);
+
+        if (a->pipe_fd < 0)
+                return;
+
+        a->pipe_event_source = sd_event_source_disable_unref(a->pipe_event_source);
+        a->pipe_fd = safe_close(a->pipe_fd);
+
+        /* If we reload/reexecute things we keep the mount point around */
+        if (IN_SET(UNIT(a)->manager->objective, MANAGER_RELOAD, MANAGER_REEXECUTE))
+                return;
+
+        /* Fail whatever requests are still queued, then detach the mount point itself. */
+        automount_send_ready(a, a->tokens, -EHOSTDOWN);
+        automount_send_ready(a, a->expire_tokens, -EHOSTDOWN);
+        if (!a->where)
+                return;
+        r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW);
+        if (r < 0)
+                log_unit_error_errno(UNIT(a), r, "Failed to unmount: %m");
+}
+
+static void automount_done(Unit *u) {
+        Automount *a = AUTOMOUNT(u);
+
+        assert(a);
+
+        /* Must run first: it still needs a->where and the token sets released below. */
+        unmount_autofs(a);
+
+        a->expire_event_source = sd_event_source_disable_unref(a->expire_event_source);
+        a->expire_tokens = set_free(a->expire_tokens);
+        a->tokens = set_free(a->tokens);
+
+        a->extra_options = mfree(a->extra_options);
+        a->where = mfree(a->where);
+}
+
+static int automount_add_trigger_dependencies(Automount *a) {
+        Unit *mount_unit;
+        int r;
+
+        assert(a);
+        /* Load the related ".mount" unit, then order ourselves before it and mark it as triggered by us. */
+        r = unit_load_related_unit(UNIT(a), ".mount", &mount_unit);
+        if (r < 0)
+                return r;
+
+        return unit_add_two_dependencies(UNIT(a), UNIT_BEFORE, UNIT_TRIGGERS, mount_unit, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int automount_add_mount_dependencies(Automount *a) {
+        _cleanup_free_ char *parent_dir = NULL;
+        int r;
+
+        assert(a);
+        /* What we require mounts for is the directory containing our mount point, not the point itself. */
+        r = path_extract_directory(a->where, &parent_dir);
+        if (r < 0)
+                return r;
+
+        return unit_require_mounts_for(UNIT(a), parent_dir, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int automount_add_default_dependencies(Automount *a) {
+        int r;
+
+        assert(a);
+
+        /* Nothing to add if default dependencies are off for the unit, or we are not the system manager. */
+        if (!UNIT(a)->default_dependencies || !MANAGER_IS_SYSTEM(UNIT(a)->manager))
+                return 0;
+
+        /* Ordering: before SPECIAL_LOCAL_FS_TARGET, after SPECIAL_LOCAL_FS_PRE_TARGET. */
+        r = unit_add_dependency_by_name(UNIT(a), UNIT_BEFORE, SPECIAL_LOCAL_FS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        r = unit_add_dependency_by_name(UNIT(a), UNIT_AFTER, SPECIAL_LOCAL_FS_PRE_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        /* And both ordered before and conflicting with SPECIAL_UMOUNT_TARGET. */
+        r = unit_add_two_dependencies_by_name(UNIT(a), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int automount_verify(Automount *a) {
+        /* autofs options that are off limits in ExtraOptions= (most are set by automount_enter_waiting()). */
+        static const char *const reserved_options[] = {
+                "fd\0",
+                "pgrp\0",
+                "minproto\0",
+                "maxproto\0",
+                "direct\0",
+                "indirect\0",
+        };
+        _cleanup_free_ char *expected_name = NULL;
+        int r;
+
+        assert(a);
+        assert(UNIT(a)->load_state == UNIT_LOADED);
+
+        if (path_equal(a->where, "/"))
+                return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Cannot have an automount unit for the root directory. Refusing.");
+
+        /* The unit must carry the name that its Where= path translates to. */
+        r = unit_name_from_path(a->where, ".automount", &expected_name);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(a), r, "Failed to generate unit name from path: %m");
+        if (!unit_has_name(UNIT(a), expected_name))
+                return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing.");
+
+        for (size_t i = 0; i < ELEMENTSOF(reserved_options); i++) {
+                const char *option = reserved_options[i];
+
+                if (fstab_test_option(a->extra_options, option))
+                        return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC),
+                                                    "ExtraOptions= setting may not contain reserved option %s.", option);
+        }
+
+        return 0;
+}
+
+static int automount_set_where(Automount *a) {
+        int r;
+
+        assert(a);
+
+        /* Returns 0 if a->where was set already, 1 if it was derived from the unit name, negative on error. */
+        if (!a->where) {
+                r = unit_name_to_path(UNIT(a)->id, &a->where);
+                if (r < 0)
+                        return r;
+                path_simplify(a->where);
+                return 1;
+        }
+        return 0;
+}
+
+static int automount_add_extras(Automount *a) {
+        int r;
+
+        /* Make sure a->where is known first: the mount dependencies below are derived from it. */
+        r = automount_set_where(a);
+        if (r < 0)
+                return r;
+
+        r = automount_add_trigger_dependencies(a);
+        if (r >= 0)
+                r = automount_add_mount_dependencies(a);
+        if (r < 0)
+                return r;
+
+        /* Last step, so its result doubles as ours. */
+        return automount_add_default_dependencies(a);
+}
+
+static int automount_load(Unit *u) {
+        Automount *a = AUTOMOUNT(u);
+        int r;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        /* Load a .automount file */
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+
+        /* Units that did not reach UNIT_LOADED get neither extras nor verification. */
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        /* Fill in a->where and the implicit/default dependencies first, then check the result as a
+         * whole; the verdict of the verification is what the caller gets. */
+        r = automount_add_extras(a);
+        return r < 0 ? r : automount_verify(a);
+}
+
+static void automount_set_state(Automount *a, AutomountState state) {
+        AutomountState old_state;
+        assert(a);
+        /* Switches the unit to 'state', releases what the new state no longer needs, and notifies the core. */
+        if (a->state != state)
+                bus_unit_send_pending_change_signal(UNIT(a), false);
+
+        old_state = a->state;
+        a->state = state;
+        /* The expiry timer is only kept running in AUTOMOUNT_RUNNING. */
+        if (state != AUTOMOUNT_RUNNING)
+                automount_stop_expire(a);
+        /* Outside of WAITING/RUNNING the autofs mount goes away (a no-op if no pipe is open). */
+        if (!IN_SET(state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING))
+                unmount_autofs(a);
+
+        if (state != old_state)
+                log_unit_debug(UNIT(a), "Changed %s -> %s", automount_state_to_string(old_state), automount_state_to_string(state));
+        /* unit_notify() is called even if the state did not change. */
+        unit_notify(UNIT(a), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static int automount_coldplug(Unit *u) {
+        Automount *a = AUTOMOUNT(u);
+        int r;
+
+        assert(a);
+        assert(a->state == AUTOMOUNT_DEAD);
+        /* Re-enters the state restored by automount_deserialize_item(), unless it is the initial state anyway. */
+        if (a->deserialized_state == a->state)
+                return 0;
+
+        if (IN_SET(a->deserialized_state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) {
+                /* Only these two states need their runtime objects rebuilt; other states are not acted upon. */
+                r = automount_set_where(a);
+                if (r < 0)
+                        return r;
+
+                r = open_dev_autofs(u->manager);
+                if (r < 0)
+                        return r;
+                /* The pipe is expected to have been restored from the "pipe-fd" serialization item. */
+                assert(a->pipe_fd >= 0);
+
+                r = sd_event_add_io(u->manager->event, &a->pipe_event_source, a->pipe_fd, EPOLLIN, automount_dispatch_io, u);
+                if (r < 0)
+                        return r;
+
+                (void) sd_event_source_set_description(a->pipe_event_source, "automount-io");
+                if (a->deserialized_state == AUTOMOUNT_RUNNING) {
+                        r = automount_start_expire(a);
+                        if (r < 0)
+                                log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m");
+                }
+
+                automount_set_state(a, a->deserialized_state);
+        }
+
+        return 0;
+}
+
+static void automount_dump(Unit *u, FILE *f, const char *prefix) {
+        Automount *a = AUTOMOUNT(u);
+        /* Writes the automount specific properties to 'f', each line starting with 'prefix'. */
+        assert(a);
+        /* NOTE(review): a->where and a->extra_options go to %s unchecked — confirm NULL is acceptable here. */
+        fprintf(f,
+                "%sAutomount State: %s\n"
+                "%sResult: %s\n"
+                "%sWhere: %s\n"
+                "%sExtraOptions: %s\n"
+                "%sDirectoryMode: %04o\n"
+                "%sTimeoutIdleUSec: %s\n",
+                prefix, automount_state_to_string(a->state),
+                prefix, automount_result_to_string(a->result),
+                prefix, a->where,
+                prefix, a->extra_options,
+                prefix, a->directory_mode,
+                prefix, FORMAT_TIMESPAN(a->timeout_idle_usec, USEC_PER_SEC));
+}
+
+static void automount_enter_dead(Automount *a, AutomountResult f) {
+        assert(a);
+        /* Only the first non-success result is recorded; later ones do not overwrite it. */
+        if (a->result == AUTOMOUNT_SUCCESS)
+                a->result = f;
+
+        unit_log_result(UNIT(a), a->result == AUTOMOUNT_SUCCESS, automount_result_to_string(a->result));
+        automount_set_state(a, a->result == AUTOMOUNT_SUCCESS ? AUTOMOUNT_DEAD : AUTOMOUNT_FAILED);
+}
+
+static int open_dev_autofs(Manager *m) {
+        struct autofs_dev_ioctl param;
+        int r;
+
+        assert(m);
+        /* Returns the manager-wide /dev/autofs fd, opening it on first use. Negative errno on failure. */
+        if (m->dev_autofs_fd >= 0)
+                return m->dev_autofs_fd;
+
+        (void) label_fix("/dev/autofs", 0);
+
+        m->dev_autofs_fd = open("/dev/autofs", O_CLOEXEC|O_RDONLY);
+        if (m->dev_autofs_fd < 0)
+                return log_error_errno(errno, "Failed to open /dev/autofs: %m");
+        /* Probe the ioctl interface right away; an fd that fails this is closed again, not kept. */
+        init_autofs_dev_ioctl(&param);
+        r = RET_NERRNO(ioctl(m->dev_autofs_fd, AUTOFS_DEV_IOCTL_VERSION, &param));
+        if (r < 0) {
+                m->dev_autofs_fd = safe_close(m->dev_autofs_fd);
+                return log_error_errno(r, "Failed to issue AUTOFS_DEV_IOCTL_VERSION ioctl: %m");
+        }
+
+        log_debug("Autofs kernel version %u.%u", param.ver_major, param.ver_minor);
+
+        return m->dev_autofs_fd;
+}
+
+static int open_ioctl_fd(int dev_autofs_fd, const char *where, dev_t devid) {
+        struct autofs_dev_ioctl *param;
+        size_t sz;
+
+        assert(dev_autofs_fd >= 0);
+        assert(where);
+
+        /* The request is the fixed-size header immediately followed by the NUL-terminated path. */
+        sz = sizeof(struct autofs_dev_ioctl) + strlen(where) + 1;
+        param = alloca_safe(sz);
+        init_autofs_dev_ioctl(param);
+        param->size = sz;
+        param->ioctlfd = -EBADF;
+        param->openmount.devid = devid;
+        strcpy(param->path, where);
+
+        if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0)
+                return -errno;
+        if (param->ioctlfd < 0)
+                return -EIO;
+
+        /* Returns the new fd (marked close-on-exec on a best-effort basis), or a negative errno. */
+        (void) fd_cloexec(param->ioctlfd, true);
+        return param->ioctlfd;
+}
+
+static int autofs_protocol(int dev_autofs_fd, int ioctl_fd) {
+        uint32_t major, minor;
+        struct autofs_dev_ioctl param;
+
+        assert(dev_autofs_fd >= 0);
+        assert(ioctl_fd >= 0);
+        /* Queries the protocol version and sub-version of the mount behind ioctl_fd and logs them. */
+        init_autofs_dev_ioctl(&param);
+        param.ioctlfd = ioctl_fd;
+
+        if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOVER, &param) < 0)
+                return -errno;
+
+        major = param.protover.version;
+        /* Start over with a freshly initialized request for the second query. */
+        init_autofs_dev_ioctl(&param);
+        param.ioctlfd = ioctl_fd;
+
+        if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOSUBVER, &param) < 0)
+                return -errno;
+
+        minor = param.protosubver.sub_version;
+
+        log_debug("Autofs protocol version %u.%u", major, minor);
+        return 0;
+}
+
+static int autofs_set_timeout(int dev_autofs_fd, int ioctl_fd, usec_t usec) {
+        struct autofs_dev_ioctl param;
+
+        assert(dev_autofs_fd >= 0);
+        assert(ioctl_fd >= 0);
+        /* Sets the idle timeout of the mount behind ioctl_fd. Returns a negative errno if the ioctl fails. */
+        init_autofs_dev_ioctl(&param);
+        param.ioctlfd = ioctl_fd;
+        /* USEC_INFINITY is handed to the kernel as a timeout of 0. */
+        if (usec == USEC_INFINITY)
+                param.timeout.timeout = 0;
+        else
+                /* Convert to seconds, rounding up. */
+                param.timeout.timeout = DIV_ROUND_UP(usec, USEC_PER_SEC);
+
+        return RET_NERRNO(ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_TIMEOUT, &param));
+}
+
+static int autofs_send_ready(int dev_autofs_fd, int ioctl_fd, uint32_t token, int status) {
+        struct autofs_dev_ioctl param;
+
+        assert(dev_autofs_fd >= 0);
+        assert(ioctl_fd >= 0);
+        /* Answers one kernel request: READY when status is 0, FAIL (carrying the status) otherwise. */
+        init_autofs_dev_ioctl(&param);
+        param.ioctlfd = ioctl_fd;
+
+        if (status != 0) {
+                param.fail.token = token;
+                param.fail.status = status;
+        } else
+                param.ready.token = token;
+
+        return RET_NERRNO(ioctl(dev_autofs_fd, status ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, &param));
+}
+
+static int automount_send_ready(Automount *a, Set *tokens, int status) {
+        _cleanup_close_ int ioctl_fd = -EBADF;
+        int r = 0;
+
+        assert(a);
+        assert(status <= 0);
+
+        /* Answers every request queued in 'tokens' with 'status' (0 for success, negative errno for
+         * failure), draining the set while doing so. Returns a negative errno if the ioctl fd cannot
+         * be opened or if answering a token failed (the last such error wins), 0 otherwise. */
+        if (set_isempty(tokens))
+                return 0;
+
+        ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+        if (ioctl_fd < 0)
+                return ioctl_fd;
+
+        if (status == 0)
+                log_unit_debug(UNIT(a), "Sending success.");
+        else
+                log_unit_debug_errno(UNIT(a), status, "Sending failure: %m");
+
+        for (;;) {
+                /* Autofs thankfully does not hand out 0 as a token */
+                unsigned token = PTR_TO_UINT(set_steal_first(tokens));
+                int k;
+
+                if (token == 0)
+                        break;
+
+                /* Autofs fun fact:
+                 *
+                 * if you pass a positive status code here, kernels
+                 * prior to 4.12 will freeze! Yay! */
+                k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd, ioctl_fd, token, status);
+                if (k < 0)
+                        r = k;
+        }
+
+        return r;
+}
+
+static void automount_trigger_notify(Unit *u, Unit *other) {
+        Automount *a = AUTOMOUNT(u);
+        int r;
+        /* Reacts to the state of 'other' (the mount unit) by answering the kernel requests queued in our token sets. */
+        assert(a);
+        assert(other);
+
+        /* Filter out invocations with bogus state */
+        assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+        assert(other->type == UNIT_MOUNT);
+
+        /* Don't propagate state changes from the mount if we are already down */
+        if (!IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING))
+                return;
+
+        /* Propagate start limit hit state */
+        if (other->start_limit_hit) {
+                automount_enter_dead(a, AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT);
+                return;
+        }
+
+        /* Don't propagate anything if there's still a job queued */
+        if (other->job)
+                return;
+
+        /* The mount is successfully established */
+        if (IN_SET(MOUNT(other)->state, MOUNT_MOUNTED, MOUNT_REMOUNTING)) {
+                (void) automount_send_ready(a, a->tokens, 0);
+
+                r = automount_start_expire(a);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m");
+
+                automount_set_state(a, AUTOMOUNT_RUNNING);
+        }
+        /* In any of the following mount states, pending expire requests are answered with -ENODEV... */
+        if (IN_SET(MOUNT(other)->state,
+                   MOUNT_MOUNTING, MOUNT_MOUNTING_DONE,
+                   MOUNT_MOUNTED, MOUNT_REMOUNTING,
+                   MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL,
+                   MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL,
+                   MOUNT_FAILED))
+                (void) automount_send_ready(a, a->expire_tokens, -ENODEV);
+        /* ...while a dead mount lets them complete successfully. */
+        if (MOUNT(other)->state == MOUNT_DEAD)
+                (void) automount_send_ready(a, a->expire_tokens, 0);
+
+        /* The mount is in some unhappy state now, let's unfreeze any waiting clients */
+        if (IN_SET(MOUNT(other)->state,
+                   MOUNT_DEAD, MOUNT_UNMOUNTING,
+                   MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL,
+                   MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL,
+                   MOUNT_FAILED)) {
+
+                (void) automount_send_ready(a, a->tokens, -ENODEV);
+
+                automount_set_state(a, AUTOMOUNT_WAITING);
+        }
+}
+
+static void automount_enter_waiting(Automount *a) {
+        _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR;
+        _cleanup_close_ int ioctl_fd = -EBADF;
+        char name[STRLEN("systemd-") + DECIMAL_STR_MAX(pid_t) + 1];
+        _cleanup_free_ char *options = NULL;
+        bool mounted = false;
+        int r, dev_autofs_fd;
+        struct stat st;
+        /* Mounts autofs on a->where and enters AUTOMOUNT_WAITING; any failure leads to automount_enter_dead(). */
+        assert(a);
+        assert(a->pipe_fd < 0);
+        assert(a->where);
+
+        set_clear(a->tokens);
+
+        r = unit_fail_if_noncanonical(UNIT(a), a->where);
+        if (r < 0)
+                goto fail;
+
+        (void) mkdir_p_label(a->where, a->directory_mode);
+
+        unit_warn_if_dir_nonempty(UNIT(a), a->where);
+
+        dev_autofs_fd = open_dev_autofs(UNIT(a)->manager);
+        if (dev_autofs_fd < 0)
+                goto fail;
+        /* The write end is handed to the kernel through the fd= mount option below, we keep the read end. */
+        if (pipe2(pipe_fd, O_CLOEXEC) < 0) {
+                log_unit_warning_errno(UNIT(a), errno, "Failed to allocate autofs pipe: %m");
+                goto fail;
+        }
+        r = fd_nonblock(pipe_fd[0], true);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(a), r, "Failed to make read side of pipe non-blocking: %m");
+                goto fail;
+        }
+
+        if (asprintf(
+                    &options,
+                    "fd=%i,pgrp="PID_FMT",minproto=5,maxproto=5,direct%s%s",
+                    pipe_fd[1],
+                    getpgrp(),
+                    isempty(a->extra_options) ? "" : ",",
+                    strempty(a->extra_options)) < 0) {
+                log_oom();
+                goto fail;
+        }
+
+        xsprintf(name, "systemd-"PID_FMT, getpid_cached());
+        r = mount_nofollow_verbose(LOG_WARNING, name, a->where, "autofs", 0, options);
+        if (r < 0)
+                goto fail;
+
+        mounted = true;
+        /* NOTE(review): our copy of the write end is closed here — presumably the mount keeps its own reference. */
+        pipe_fd[1] = safe_close(pipe_fd[1]);
+
+        if (stat(a->where, &st) < 0) {
+                log_unit_warning_errno(UNIT(a), errno, "Failed to stat new automount point '%s': %m", a->where);
+                goto fail;
+        }
+
+        ioctl_fd = open_ioctl_fd(dev_autofs_fd, a->where, st.st_dev);
+        if (ioctl_fd < 0) {
+                log_unit_warning_errno(UNIT(a), ioctl_fd, "Failed to open automount ioctl fd for '%s': %m", a->where);
+                goto fail;
+        }
+
+        r = autofs_protocol(dev_autofs_fd, ioctl_fd);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(a), r, "Failed to validate autofs protocol for '%s': %m", a->where);
+                goto fail;
+        }
+
+        r = autofs_set_timeout(dev_autofs_fd, ioctl_fd, a->timeout_idle_usec);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(a), r, "Failed to set autofs timeout for '%s': %m", a->where);
+                goto fail;
+        }
+
+        r = sd_event_add_io(UNIT(a)->manager->event, &a->pipe_event_source, pipe_fd[0], EPOLLIN, automount_dispatch_io, a);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(a), r, "Failed to allocate IO event source for autofs mount '%s': %m", a->where);
+                goto fail;
+        }
+
+        (void) sd_event_source_set_description(a->pipe_event_source, "automount-io");
+        /* No more failure paths from here: keep the read end and remember the device of the autofs mount. */
+        a->pipe_fd = TAKE_FD(pipe_fd[0]);
+        a->dev_id = st.st_dev;
+
+        automount_set_state(a, AUTOMOUNT_WAITING);
+        return;
+
+fail:
+        if (mounted) {
+                r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(a), r, "Failed to unmount, ignoring: %m");
+        }
+
+        automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+}
+
+static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) {
+        int r;
+
+        assert(dev_autofs_fd >= 0);
+        assert(ioctl_fd >= 0);
+
+        /* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the
+         * child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up
+         * anyway. */
+
+        r = safe_fork_full("(sd-expire)",
+                           /* stdio_fds= */ NULL,
+                           (int[]) { dev_autofs_fd, ioctl_fd },
+                           /* n_except_fds= */ 2,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG,
+                           /* pid= */ NULL);
+        if (r != 0)
+                return r;
+
+        /* Child */
+        for (;;) {
+                struct autofs_dev_ioctl param;
+                init_autofs_dev_ioctl(&param);
+                param.ioctlfd = ioctl_fd;
+
+                if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param) < 0)
+                        break;
+        }
+        /* The loop only ends through a failing ioctl; EAGAIN is the one outcome not worth a warning. */
+        if (errno != EAGAIN)
+                log_warning_errno(errno, "Failed to expire automount, ignoring: %m");
+
+        _exit(EXIT_SUCCESS);
+}
+
+static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
+        Automount *a = AUTOMOUNT(userdata);
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        assert(a);
+        assert(source == a->expire_event_source);
+        /* Timer callback: kick off one asynchronous expire attempt, then re-arm the timer. */
+        fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+        if (fd < 0)
+                return log_unit_error_errno(UNIT(a), fd, "Couldn't open autofs ioctl fd: %m");
+
+        r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, fd);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m");
+
+        return automount_start_expire(a);
+}
+
+static int automount_start_expire(Automount *a) {
+        usec_t timeout;
+        int r;
+
+        assert(a);
+
+        if (a->timeout_idle_usec == 0)
+                return 0;
+
+        /* Fire after a third of the idle timeout, but never sooner than after one second. */
+        timeout = MAX(a->timeout_idle_usec/3, USEC_PER_SEC);
+
+        if (!a->expire_event_source) {
+                r = sd_event_add_time_relative(
+                                UNIT(a)->manager->event,
+                                &a->expire_event_source,
+                                CLOCK_MONOTONIC, timeout, 0,
+                                automount_dispatch_expire, a);
+                if (r < 0)
+                        return r;
+                (void) sd_event_source_set_description(a->expire_event_source, "automount-expire");
+                return 0;
+        }
+
+        /* The timer exists already: move its deadline and switch it back on for one shot. */
+        r = sd_event_source_set_time_relative(a->expire_event_source, timeout);
+        if (r < 0)
+                return r;
+
+        return sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_ONESHOT);
+}
+
+static void automount_stop_expire(Automount *a) {
+        assert(a);
+
+        /* Merely switches the timer off: the event source itself stays allocated, and
+         * automount_start_expire() re-arms it when needed. */
+        if (a->expire_event_source)
+                (void) sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_OFF);
+}
+
+static void automount_enter_running(Automount *a) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        Unit *trigger;
+        struct stat st;
+        int r;
+        /* Queues a start job for the unit we trigger and enters AUTOMOUNT_RUNNING; hard failures end in automount_enter_dead(). */
+        assert(a);
+
+        /* If the user masked our unit in the meantime, fail */
+        if (UNIT(a)->load_state != UNIT_LOADED) {
+                log_unit_error(UNIT(a), "Suppressing automount event since unit is no longer loaded.");
+                goto fail;
+        }
+
+        /* We don't take mount requests anymore if we are supposed to
+         * shut down anyway */
+        if (unit_stop_pending(UNIT(a))) {
+                log_unit_debug(UNIT(a), "Suppressing automount request since unit stop is scheduled.");
+                automount_send_ready(a, a->tokens, -EHOSTDOWN);
+                automount_send_ready(a, a->expire_tokens, -EHOSTDOWN);
+                return;
+        }
+
+        (void) mkdir_p_label(a->where, a->directory_mode);
+
+        /* Before we do anything, let's see if somebody is playing games with us? */
+        if (lstat(a->where, &st) < 0) {
+                log_unit_warning_errno(UNIT(a), errno, "Failed to stat automount point: %m");
+                goto fail;
+        }
+
+        /* The mount unit may have been explicitly started before we got the
+         * autofs request. Ack it to unblock anything waiting on the mount point. */
+        if (!S_ISDIR(st.st_mode) || st.st_dev != a->dev_id) {
+                log_unit_info(UNIT(a), "Automount point already active?");
+                automount_send_ready(a, a->tokens, 0);
+                return;
+        }
+
+        trigger = UNIT_TRIGGER(UNIT(a));
+        if (!trigger) {
+                log_unit_error(UNIT(a), "Unit to trigger vanished.");
+                goto fail;
+        }
+
+        r = manager_add_job(UNIT(a)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL);
+        if (r < 0) {
+                log_unit_warning(UNIT(a), "Failed to queue mount startup job: %s", bus_error_message(&error, r));
+                goto fail;
+        }
+        /* The tokens stay queued here; automount_trigger_notify() is the place that answers them. */
+        automount_set_state(a, AUTOMOUNT_RUNNING);
+        return;
+
+fail:
+        automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+}
+
+static int automount_start(Unit *u) {
+        Automount *a = AUTOMOUNT(u);
+        int r;
+
+        assert(a);
+        assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED));
+
+        /* Never stack the autofs mount on top of something that is mounted there already. */
+        if (path_is_mount_point(a->where, NULL, 0) > 0)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), "Path %s is already a mount point, refusing start.", a->where);
+
+        r = unit_test_trigger_loaded(u);
+        if (r >= 0)
+                r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        /* Begin with a clean result; automount_enter_waiting() records failures on its own. */
+        a->result = AUTOMOUNT_SUCCESS;
+        automount_enter_waiting(a);
+        return 1;
+}
+
+static int automount_stop(Unit *u) {
+        Automount *a = AUTOMOUNT(u);
+
+        assert(a);
+        assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING));
+        /* AUTOMOUNT_SUCCESS leaves a previously recorded failure in place, see automount_enter_dead(). */
+        automount_enter_dead(a, AUTOMOUNT_SUCCESS);
+        return 1;
+}
+
+static int automount_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Automount *a = AUTOMOUNT(u);
+        void *token;
+        int r;
+
+        assert(a);
+        assert(f);
+        assert(fds);
+
+        /* Scalar state first; automount_deserialize_item() parses the very same keys. */
+        (void) serialize_item(f, "state", automount_state_to_string(a->state));
+        (void) serialize_item(f, "result", automount_result_to_string(a->result));
+        (void) serialize_item_format(f, "dev-id", "%lu", (unsigned long) a->dev_id);
+
+        /* Then the queued kernel requests, one item per token. */
+        SET_FOREACH(token, a->tokens)
+                (void) serialize_item_format(f, "token", "%u", PTR_TO_UINT(token));
+        SET_FOREACH(token, a->expire_tokens)
+                (void) serialize_item_format(f, "expire-token", "%u", PTR_TO_UINT(token));
+
+        /* The pipe fd is the only part whose serialization failure is reported to the caller. */
+        r = serialize_fd(f, fds, "pipe-fd", a->pipe_fd);
+        return r < 0 ? r : 0;
+}
+
+static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Automount *a = AUTOMOUNT(u);
+        int r;
+
+        assert(a);
+        assert(fds);
+        /* Applies one key/value pair written by automount_serialize(). Bad input is only logged; always returns 0. */
+        if (streq(key, "state")) {
+                AutomountState state;
+
+                state = automount_state_from_string(value);
+                if (state < 0)
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+                else
+                        a->deserialized_state = state;
+        } else if (streq(key, "result")) {
+                AutomountResult f;
+
+                f = automount_result_from_string(value);
+                if (f < 0)
+                        log_unit_debug(u, "Failed to parse result value: %s", value);
+                else if (f != AUTOMOUNT_SUCCESS) /* a success value never overwrites a->result */
+                        a->result = f;
+
+        } else if (streq(key, "dev-id")) {
+                unsigned long d;
+
+                if (safe_atolu(value, &d) < 0)
+                        log_unit_debug(u, "Failed to parse dev-id value: %s", value);
+                else
+                        a->dev_id = (dev_t) d;
+
+        } else if (streq(key, "token")) {
+                unsigned token;
+
+                if (safe_atou(value, &token) < 0)
+                        log_unit_debug(u, "Failed to parse token value: %s", value);
+                else {
+                        r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(token));
+                        if (r < 0)
+                                log_unit_error_errno(u, r, "Failed to add token to set: %m");
+                }
+        } else if (streq(key, "expire-token")) {
+                unsigned token;
+
+                if (safe_atou(value, &token) < 0)
+                        log_unit_debug(u, "Failed to parse token value: %s", value);
+                else {
+                        r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(token));
+                        if (r < 0)
+                                log_unit_error_errno(u, r, "Failed to add expire token to set: %m");
+                }
+        } else if (streq(key, "pipe-fd")) {
+                safe_close(a->pipe_fd); /* drop whatever fd an earlier "pipe-fd" item may have left behind */
+                a->pipe_fd = deserialize_fd(fds, value);
+        } else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+static UnitActiveState automount_active_state(Unit *u) {
+        assert(u);
+        /* Map our own state onto the generic unit state via state_translation_table. */
+        return state_translation_table[AUTOMOUNT(u)->state];
+}
+
+static const char *automount_sub_state_to_string(Unit *u) {
+        assert(u);
+        /* The sub-state string is simply the name of the current AutomountState. */
+        return automount_state_to_string(AUTOMOUNT(u)->state);
+}
+
+static bool automount_may_gc(Unit *u) {
+        Unit *trigger;
+
+        assert(u);
+
+        /* We may be collected whenever the unit we trigger may be, and always if there is no such unit. */
+        trigger = UNIT_TRIGGER(u);
+        if (trigger)
+                return UNIT_VTABLE(trigger)->may_gc(trigger);
+        return true;
+}
+
+static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        union autofs_v5_packet_union packet;
+        Automount *a = AUTOMOUNT(userdata);
+        Unit *trigger;
+        int r;
+        /* Handles one event on the autofs pipe. Always returns 0; failures go through automount_enter_dead(). */
+        assert(a);
+        assert(fd == a->pipe_fd);
+
+        if (events & (EPOLLHUP|EPOLLERR)) {
+                log_unit_error(UNIT(a), "Got hangup/error on autofs pipe from kernel. Likely our automount point has been unmounted by someone or something else?");
+                automount_enter_dead(a, AUTOMOUNT_FAILURE_UNMOUNTED);
+                return 0;
+        }
+        /* With hangup/error handled above, anything but exactly EPOLLIN is treated as a failure. */
+        if (events != EPOLLIN) {
+                log_unit_error(UNIT(a), "Got invalid poll event %"PRIu32" on pipe (fd=%d)", events, fd);
+                goto fail;
+        }
+        /* Read one complete packet union; a read that fails counts as a failure. */
+        r = loop_read_exact(a->pipe_fd, &packet, sizeof(packet), true);
+        if (r < 0) {
+                log_unit_error_errno(UNIT(a), r, "Invalid read from pipe: %m");
+                goto fail;
+        }
+
+        switch (packet.hdr.type) {
+
+        case autofs_ptype_missing_direct:
+                /* Mount request: log the requesting PID (and its comm, if available) when one was passed. */
+                if (packet.v5_packet.pid > 0) {
+                        _cleanup_free_ char *p = NULL;
+
+                        (void) pid_get_comm(packet.v5_packet.pid, &p);
+                        log_unit_info(UNIT(a), "Got automount request for %s, triggered by %"PRIu32" (%s)", a->where, packet.v5_packet.pid, strna(p));
+                } else
+                        log_unit_debug(UNIT(a), "Got direct mount request on %s", a->where);
+                /* Remember the wait queue token of this request in a->tokens before switching to running. */
+                r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token));
+                if (r < 0) {
+                        log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m");
+                        goto fail;
+                }
+
+                automount_enter_running(a);
+                break;
+
+        case autofs_ptype_expire_direct:
+                log_unit_debug(UNIT(a), "Got direct umount request on %s", a->where);
+
+                automount_stop_expire(a);
+                /* Expire tokens are tracked separately from mount tokens, in a->expire_tokens. */
+                r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token));
+                if (r < 0) {
+                        log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m");
+                        goto fail;
+                }
+                /* Carry out the expiry by queueing a stop job for the unit we trigger. */
+                trigger = UNIT_TRIGGER(UNIT(a));
+                if (!trigger) {
+                        log_unit_error(UNIT(a), "Unit to trigger vanished.");
+                        goto fail;
+                }
+
+                r = manager_add_job(UNIT(a)->manager, JOB_STOP, trigger, JOB_REPLACE, NULL, &error, NULL);
+                if (r < 0) {
+                        log_unit_warning(UNIT(a), "Failed to queue unmount job: %s", bus_error_message(&error, r));
+                        goto fail;
+                }
+                break;
+        /* Unknown packet types are only logged; the unit state is left alone. */
+        default:
+                log_unit_error(UNIT(a), "Received unknown automount request %i", packet.hdr.type);
+                break;
+        }
+
+        return 0;
+
+fail:
+        automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES); /* common failure exit for all "goto fail" paths */
+        return 0;
+}
+
+static void automount_shutdown(Manager *m) {
+        assert(m);
+        int autofs_fd = m->dev_autofs_fd; /* manager-wide autofs device fd */
+        m->dev_autofs_fd = safe_close(autofs_fd); /* the field takes whatever safe_close() hands back */
+}
+
+static void automount_reset_failed(Unit *u) {
+        Automount *am = AUTOMOUNT(u);
+
+        assert(am);
+
+        /* Only a failed unit changes state (to dead); the stored result is cleared afterwards in any case. */
+        if (am->state == AUTOMOUNT_FAILED)
+                automount_set_state(am, AUTOMOUNT_DEAD);
+        am->result = AUTOMOUNT_SUCCESS;
+}
+
+static bool automount_supported(void) {
+        /* Cached probe result: negative means "not probed yet", otherwise the outcome of the first probe. */
+        static int cached = -1;
+
+        if (cached >= 0)
+                return cached;
+        return cached = access("/dev/autofs", F_OK) >= 0;
+}
+
+static int automount_can_start(Unit *u) {
+        Automount *am = AUTOMOUNT(u);
+        int ret;
+
+        assert(am);
+
+        /* Returns 1 if starting is permitted, otherwise the negative result of the start limit test. */
+        ret = unit_test_start_limit(u);
+        if (ret >= 0)
+                return 1;
+        /* The test failed: record AUTOMOUNT_FAILURE_START_LIMIT_HIT before reporting the error. */
+        automount_enter_dead(am, AUTOMOUNT_FAILURE_START_LIMIT_HIT);
+        return ret;
+}
+
+static const char* const automount_result_table[_AUTOMOUNT_RESULT_MAX] = {
+        [AUTOMOUNT_SUCCESS]                       = "success",
+        [AUTOMOUNT_FAILURE_RESOURCES]             = "resources",
+        [AUTOMOUNT_FAILURE_UNMOUNTED]             = "unmounted",
+        [AUTOMOUNT_FAILURE_START_LIMIT_HIT]       = "start-limit-hit",
+        [AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT] = "mount-start-limit-hit",
+};
+/* Entries follow the declaration order of AutomountResult; the macro below builds the string lookups. */
+DEFINE_STRING_TABLE_LOOKUP(automount_result, AutomountResult);
+
+const UnitVTable automount_vtable = {
+        .object_size = sizeof(Automount),
+        /* NUL-separated list of section names; .private_section names the one specific to this unit type. */
+        .sections =
+                "Unit\0"
+                "Automount\0"
+                "Install\0",
+        .private_section = "Automount",
+
+        .can_transient = true,
+        .can_fail = true,
+        .can_trigger = true,
+        .exclude_from_switch_root_serialization = true,
+        /* Callbacks wired up for the automount unit type follow. */
+        .init = automount_init,
+        .load = automount_load,
+        .done = automount_done,
+
+        .coldplug = automount_coldplug,
+
+        .dump = automount_dump,
+
+        .start = automount_start,
+        .stop = automount_stop,
+
+        .serialize = automount_serialize,
+        .deserialize_item = automount_deserialize_item,
+
+        .active_state = automount_active_state,
+        .sub_state_to_string = automount_sub_state_to_string,
+
+        .may_gc = automount_may_gc,
+
+        .trigger_notify = automount_trigger_notify,
+
+        .reset_failed = automount_reset_failed,
+
+        .bus_set_property = bus_automount_set_property,
+
+        .shutdown = automount_shutdown,
+        .supported = automount_supported,
+        /* Format strings indexed by job result; %s is presumably filled with the unit's name/description — TODO confirm. */
+        .status_message_formats = {
+                .finished_start_job = {
+                        [JOB_DONE]       = "Set up automount %s.",
+                        [JOB_FAILED]     = "Failed to set up automount %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Unset automount %s.",
+                        [JOB_FAILED]     = "Failed to unset automount %s.",
+                },
+        },
+
+        .can_start = automount_can_start,
+};
diff --git a/src/core/automount.h b/src/core/automount.h
new file mode 100644
index 0000000..e413f23
--- /dev/null
+++ b/src/core/automount.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Automount Automount;
+
+#include "unit.h"
+
+typedef enum AutomountResult {
+        AUTOMOUNT_SUCCESS,                       /* value automount_reset_failed() resets the result to */
+        AUTOMOUNT_FAILURE_RESOURCES,             /* used by the common failure exit of the autofs pipe handler */
+        AUTOMOUNT_FAILURE_UNMOUNTED,             /* used when the autofs pipe reports hangup/error */
+        AUTOMOUNT_FAILURE_START_LIMIT_HIT,       /* used when the start limit test in automount_can_start() fails */
+        AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT, /* presumably the start limit of the triggered mount — TODO confirm */
+        _AUTOMOUNT_RESULT_MAX,                   /* number of results; sizes the result string table */
+        _AUTOMOUNT_RESULT_INVALID = -EINVAL,
+} AutomountResult;
+
+struct Automount {
+        Unit meta;
+        /* 'state' is what automount_active_state() translates into the generic unit state. */
+        AutomountState state, deserialized_state;
+
+        char *where; /* path reported as the automount point in request log messages */
+        char *extra_options;
+        usec_t timeout_idle_usec;
+
+        int pipe_fd; /* pipe from which autofs request packets are read (automount_dispatch_io()) */
+        sd_event_source *pipe_event_source;
+        mode_t directory_mode;
+        dev_t dev_id;
+
+        Set *tokens; /* wait queue tokens of mount requests received on the pipe */
+        Set *expire_tokens; /* wait queue tokens of expire (umount) requests received on the pipe */
+
+        sd_event_source *expire_event_source;
+
+        AutomountResult result; /* reset to AUTOMOUNT_SUCCESS by automount_reset_failed() */
+};
+
+extern const UnitVTable automount_vtable;
+
+const char* automount_result_to_string(AutomountResult i) _const_;
+AutomountResult automount_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(AUTOMOUNT, Automount);
diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c
new file mode 100644
index 0000000..06d2146
--- /dev/null
+++ b/src/core/bpf-devices.c
@@ -0,0 +1,505 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "bpf-devices.h"
+#include "bpf-program.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+#define PASS_JUMP_OFF 4096 /* placeholder offset of "jump to PASS" instructions, patched in bpf_devices_apply_policy() */
+
+/* Ensure the high level flags we use and the low-level BPF flags exposed on the kernel are defined the same way */
+assert_cc((unsigned) BPF_DEVCG_ACC_MKNOD == (unsigned) CGROUP_DEVICE_MKNOD);
+assert_cc((unsigned) BPF_DEVCG_ACC_READ  == (unsigned) CGROUP_DEVICE_READ);
+assert_cc((unsigned) BPF_DEVCG_ACC_WRITE == (unsigned) CGROUP_DEVICE_WRITE);
+
+static int bpf_prog_allow_list_device(
+                BPFProgram *prog,
+                char type,
+                int major,
+                int minor,
+                CGroupDevicePermissions p) {
+
+        int ret;
+        /* Appends a rule that jumps to PASS if device type, major and minor match and the access is covered by p. */
+        assert(prog);
+
+        log_trace("%s: %c %d:%d %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p));
+
+        if (!(p > 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX))
+                return -EINVAL;
+
+        assert(IN_SET(type, 'b', 'c'));
+        const int dev_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+        const struct bpf_insn insn[] = {
+                BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+                BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p),
+                BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 4), /* compare access type */
+
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dev_type, 3),  /* compare device type */
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2),     /* compare major */
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1),     /* compare minor */
+                BPF_JMP_A(PASS_JUMP_OFF),                      /* jump to PASS */
+        };
+
+        /* If p grants everything, the access type comparison (the first three instructions) is left out. */
+        const size_t skip = p == _CGROUP_DEVICE_PERMISSIONS_ALL ? 3 : 0;
+
+        ret = bpf_program_add_instructions(prog, insn + skip, ELEMENTSOF(insn) - skip);
+        if (ret < 0)
+                log_error_errno(ret, "Extending device control BPF program failed: %m");
+
+        return ret;
+}
+
+static int bpf_prog_allow_list_major(
+                BPFProgram *prog,
+                char type,
+                int major,
+                CGroupDevicePermissions p) {
+
+        int ret;
+        /* Appends a rule that jumps to PASS if device type and major match (any minor) and the access is covered by p. */
+        assert(prog);
+
+        log_trace("%s: %c %d:* %s", __func__, type, major, cgroup_device_permissions_to_string(p));
+
+        if (!(p > 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX))
+                return -EINVAL;
+
+        assert(IN_SET(type, 'b', 'c'));
+        const int dev_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+        const struct bpf_insn insn[] = {
+                BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+                BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p),
+                BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */
+
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dev_type, 2),  /* compare device type */
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1),     /* compare major */
+                BPF_JMP_A(PASS_JUMP_OFF),                      /* jump to PASS */
+        };
+
+        /* If p grants everything, the access type comparison (the first three instructions) is left out. */
+        const size_t skip = p == _CGROUP_DEVICE_PERMISSIONS_ALL ? 3 : 0;
+
+        ret = bpf_program_add_instructions(prog, insn + skip, ELEMENTSOF(insn) - skip);
+        if (ret < 0)
+                log_error_errno(ret, "Extending device control BPF program failed: %m");
+
+        return ret;
+}
+
+static int bpf_prog_allow_list_class(
+                BPFProgram *prog,
+                char type,
+                CGroupDevicePermissions p) {
+
+        int ret;
+        /* Appends a rule that jumps to PASS for every device of the given type if the access is covered by p. */
+        assert(prog);
+
+        log_trace("%s: %c *:* %s", __func__, type, cgroup_device_permissions_to_string(p));
+
+        if (!(p > 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX))
+                return -EINVAL;
+
+        assert(IN_SET(type, 'b', 'c'));
+        const int dev_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+        const struct bpf_insn insn[] = {
+                BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+                BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p),
+                BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */
+
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dev_type, 1), /* compare device type */
+                BPF_JMP_A(PASS_JUMP_OFF),                     /* jump to PASS */
+        };
+
+        /* If p grants everything, the access type comparison (the first three instructions) is left out. */
+        const size_t skip = p == _CGROUP_DEVICE_PERMISSIONS_ALL ? 3 : 0;
+
+        ret = bpf_program_add_instructions(prog, insn + skip, ELEMENTSOF(insn) - skip);
+        if (ret < 0)
+                log_error_errno(ret, "Extending device control BPF program failed: %m");
+
+        return ret;
+}
+
+int bpf_devices_cgroup_init(
+                BPFProgram **ret,
+                CGroupDevicePolicy policy,
+                bool allow_list) {
+
+        const struct bpf_insn preamble[] = {
+                /* r2 = device type: the low 16 bits of the access_type field */
+                BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+                BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+                /* r3 = access type: the access_type field shifted right by 16 bits */
+                BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+                BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+                /* r4 = major number */
+                BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+                /* r5 = minor number */
+                BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, minor)),
+        };
+
+        _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+        int k;
+
+        assert(ret);
+        /* Nothing to build for the "auto" policy without an allow list; *ret is left untouched in that case. */
+        if (!allow_list && policy == CGROUP_DEVICE_POLICY_AUTO)
+                return 0;
+
+        k = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program);
+        if (k < 0)
+                return log_error_errno(k, "Loading device control BPF program failed: %m");
+        /* The register-loading preamble is emitted for the closed policy or whenever an allow list is used. */
+        if (allow_list || policy == CGROUP_DEVICE_POLICY_CLOSED) {
+                k = bpf_program_add_instructions(program, preamble, ELEMENTSOF(preamble));
+                if (k < 0)
+                        return log_error_errno(k, "Extending device control BPF program failed: %m");
+        }
+
+        *ret = TAKE_PTR(program);
+
+        return 0;
+}
+
+int bpf_devices_apply_policy(
+                BPFProgram **prog,
+                CGroupDevicePolicy policy,
+                bool allow_list,
+                const char *cgroup_path,
+                BPFProgram **prog_installed) {
+
+        _cleanup_free_ char *controller_path = NULL;
+        int r;
+
+        /* This will assign *prog_installed if everything goes well. On error *prog_installed is not touched. */
+        /* Without a program, skip straight to the hand-over below, which then stores NULL in *prog_installed. */
+        assert(prog);
+        if (!*prog)
+                goto finish;
+
+        const bool deny_everything = policy == CGROUP_DEVICE_POLICY_STRICT && !allow_list;
+        /* post_insn is the fall-through path taken when none of the allow-list rules jumped to PASS. */
+        const struct bpf_insn post_insn[] = {
+                /* return DENY: set r0 = 0 and jump over the first instruction of exit_insn */
+                BPF_MOV64_IMM(BPF_REG_0, 0),
+                BPF_JMP_A(1),
+        };
+
+        const struct bpf_insn exit_insn[] = {
+                /* finally return DENY if deny_everything else ALLOW */
+                BPF_MOV64_IMM(BPF_REG_0, deny_everything ? 0 : 1),
+                BPF_EXIT_INSN()
+        };
+
+        if (!deny_everything) {
+                r = bpf_program_add_instructions(*prog, post_insn, ELEMENTSOF(post_insn));
+                if (r < 0)
+                        return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+                /* Fixup PASS_JUMP_OFF jump offsets: make them target the instruction appended next (exit_insn). */
+                for (size_t off = 0; off < (*prog)->n_instructions; off++) {
+                        struct bpf_insn *ins = &((*prog)->instructions[off]);
+
+                        if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF)
+                                ins->off = (*prog)->n_instructions - off - 1;
+                }
+        }
+
+        r = bpf_program_add_instructions(*prog, exit_insn, ELEMENTSOF(exit_insn));
+        if (r < 0)
+                return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &controller_path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine cgroup path: %m");
+
+        r = bpf_program_cgroup_attach(*prog, BPF_CGROUP_DEVICE, controller_path, BPF_F_ALLOW_MULTI);
+        if (r < 0)
+                return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m",
+                                       empty_to_root(cgroup_path));
+
+ finish:
+        /* Unref the old BPF program (which will implicitly detach it) and take ownership of the new one, if any. */
+        if (prog_installed) {
+                bpf_program_free(*prog_installed);
+                *prog_installed = TAKE_PTR(*prog);
+        }
+        return 0;
+}
+
+int bpf_devices_supported(void) {
+        const struct bpf_insn trivial[] = {
+                BPF_MOV64_IMM(BPF_REG_0, 1),
+                BPF_EXIT_INSN()
+        };
+
+        _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+        static int supported = -1;
+        int r;
+
+        /* Checks whether BPF device controller is supported. For this, we check three things:
+         *
+         * a) whether we are privileged
+         * b) whether the unified hierarchy is being used
+         * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
+         */
+        /* The verdict is cached in 'supported'; only a failure to query the hierarchy is returned without caching. */
+        if (supported >= 0)
+                return supported;
+
+        if (geteuid() != 0) {
+                log_debug("Not enough privileges, BPF device control is not supported.");
+                return supported = 0;
+        }
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
+        if (r == 0) {
+                log_debug("Not running with unified cgroups, BPF device control is not supported.");
+                return supported = 0;
+        }
+        /* Probe the kernel by building and loading a trivial program of the required type. */
+        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program);
+        if (r < 0) {
+                log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+                return supported = 0;
+        }
+
+        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+        if (r < 0) {
+                log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+                return supported = 0;
+        }
+
+        r = bpf_program_load_kernel(program, NULL, 0);
+        if (r < 0) {
+                log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+                return supported = 0;
+        }
+
+        return supported = 1;
+}
+
+static int allow_list_device_pattern(
+                BPFProgram *prog,
+                const char *path,
+                char type,
+                const unsigned *maj,
+                const unsigned *min,
+                CGroupDevicePermissions p) {
+
+        /* A NULL maj selects the whole device class (min is ignored then); a NULL min selects all minors of *maj. */
+        assert(IN_SET(type, 'b', 'c'));
+
+        if (cg_all_unified() <= 0) {
+                /* Not (known to be) fully unified: write a "<type> <major>:<minor> <access>" rule to devices.allow. */
+                char rule[2+DECIMAL_STR_MAX(unsigned)*2+2+4];
+                int ret;
+
+                if (!maj)
+                        xsprintf(rule, "%c *:* %s", type, cgroup_device_permissions_to_string(p));
+                else if (!min)
+                        xsprintf(rule, "%c %u:* %s", type, *maj, cgroup_device_permissions_to_string(p));
+                else
+                        xsprintf(rule, "%c %u:%u %s", type, *maj, *min, cgroup_device_permissions_to_string(p));
+
+                /* Changing the devices list of a populated cgroup might result in EINVAL, which is why
+                 * EINVAL is among the errors only logged at debug level below. */
+                ret = cg_set_attribute("devices", path, "devices.allow", rule);
+                if (ret < 0)
+                        log_full_errno(IN_SET(ret, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
+                                       ret, "Failed to set devices.allow on %s: %m", path);
+
+                return ret;
+        }
+
+        /* Unified hierarchy: extend the BPF program, if there is one. */
+        if (!prog)
+                return 0;
+        if (!maj)
+                return bpf_prog_allow_list_class(prog, type, p);
+        if (!min)
+                return bpf_prog_allow_list_major(prog, type, *maj, p);
+
+        return bpf_prog_allow_list_device(prog, type, *maj, *min, p);
+}
+
+int bpf_devices_allow_list_device(
+                BPFProgram *prog,
+                const char *path,
+                const char *node,
+                CGroupDevicePermissions p) {
+
+        mode_t mode;
+        dev_t rdev;
+        int r;
+
+        assert(path);
+        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+        log_trace("%s: %s %s", __func__, node, cgroup_device_permissions_to_string(p));
+
+        /* /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and /run/systemd/inaccessible/blk
+         * get special handling: their major/minor is parsed out directly instead of stat()ing them, so
+         * clients can use these paths without the device node actually being around. */
+        r = device_path_parse_major_minor(node, &mode, &rdev);
+        if (r == -ENODEV) {
+                /* The parser did not handle this path: fall back to inspecting the node itself. */
+                struct stat st;
+
+                if (stat(node, &st) < 0)
+                        return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
+                if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)))
+                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node);
+
+                mode = st.st_mode;
+                rdev = (dev_t) st.st_rdev;
+        } else if (r < 0)
+                return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
+
+        const char type = S_ISCHR(mode) ? 'c' : 'b';
+        unsigned maj = major(rdev), min = minor(rdev);
+        return allow_list_device_pattern(prog, path, type, &maj, &min, p);
+}
+
+int bpf_devices_allow_list_major(
+                BPFProgram *prog,
+                const char *path,
+                const char *name,
+                char type,
+                CGroupDevicePermissions permissions) {
+
+        unsigned maj;
+        int r;
+        /* 'name' may be "*", a numeric major, or an fnmatch() pattern resolved via /proc/devices. */
+        assert(path);
+        assert(IN_SET(type, 'b', 'c'));
+        assert(permissions >= 0 && permissions < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+        if (streq(name, "*"))
+                /* If the name is a wildcard, then apply this list to all devices of this type */
+                return allow_list_device_pattern(prog, path, type, NULL, NULL, permissions);
+
+        if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj))
+                /* The name is numeric and suitable as major. In that case, let's take its major, and create
+                 * the entry directly. */
+                return allow_list_device_pattern(prog, path, type, &maj, NULL, permissions);
+        /* Otherwise look the name up in /proc/devices and allow every major whose name matches the pattern. */
+        _cleanup_fclose_ FILE *f = NULL;
+        bool good = false, any = false;
+
+        f = fopen("/proc/devices", "re");
+        if (!f)
+                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s: %m", name);
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                char *w, *p;
+
+                r = read_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to read /proc/devices: %m");
+                if (r == 0)
+                        break;
+                /* 'good' is set by the section header for the requested type and cleared by an empty line. */
+                if (type == 'c' && streq(line, "Character devices:")) {
+                        good = true;
+                        continue;
+                }
+
+                if (type == 'b' && streq(line, "Block devices:")) {
+                        good = true;
+                        continue;
+                }
+
+                if (isempty(line)) {
+                        good = false;
+                        continue;
+                }
+
+                if (!good)
+                        continue;
+                /* Split the entry at the first whitespace: the number before it is the major, the rest the name. */
+                p = strstrip(line);
+
+                w = strpbrk(p, WHITESPACE);
+                if (!w)
+                        continue;
+                *w = 0;
+
+                r = safe_atou(p, &maj);
+                if (r < 0)
+                        continue;
+                if (maj <= 0)
+                        continue;
+                /* Step past the terminator written above and any further whitespace to reach the name. */
+                w++;
+                w += strspn(w, WHITESPACE);
+
+                if (fnmatch(name, w, 0) != 0)
+                        continue;
+                /* Record the match; a failure to add the individual entry is deliberately ignored (void cast). */
+                any = true;
+                (void) allow_list_device_pattern(prog, path, type, &maj, NULL, permissions);
+        }
+
+        if (!any)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Device allow list pattern \"%s\" did not match anything.", name);
+
+        return 0;
+}
+
+int bpf_devices_allow_list_static(
+                BPFProgram *prog,
+                const char *path) {
+        /* Allows a fixed set of device nodes plus the "pts" major; returns the first error seen, else 0. */
+        static const char auto_devices[] =
+                "/dev/null\0" "rwm\0"
+                "/dev/zero\0" "rwm\0"
+                "/dev/full\0" "rwm\0"
+                "/dev/random\0" "rwm\0"
+                "/dev/urandom\0" "rwm\0"
+                "/dev/tty\0" "rwm\0"
+                "/dev/ptmx\0" "rwm\0"
+                /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
+                "/run/systemd/inaccessible/chr\0" "rwm\0"
+                "/run/systemd/inaccessible/blk\0" "rwm\0";
+        int first_error = 0, k;
+
+        NULSTR_FOREACH_PAIR(node, acc, auto_devices) {
+                k = bpf_devices_allow_list_device(prog, path, node, cgroup_device_permissions_from_string(acc));
+                if (k < 0 && first_error >= 0)
+                        first_error = k;
+        }
+
+        /* PTS (/dev/pts) devices may not be duplicated, but accessed */
+        k = bpf_devices_allow_list_major(prog, path, "pts", 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
+        if (k < 0 && first_error >= 0)
+                first_error = k;
+
+        return first_error;
+}
diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h
new file mode 100644
index 0000000..5660e1a
--- /dev/null
+++ b/src/core/bpf-devices.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "cgroup.h"
+
+typedef struct BPFProgram BPFProgram;
+
+int bpf_devices_cgroup_init(BPFProgram **ret, CGroupDevicePolicy policy, bool allow_list);
+int bpf_devices_apply_policy(
+                BPFProgram **prog,
+                CGroupDevicePolicy policy,
+                bool allow_list,
+                const char *cgroup_path,
+                BPFProgram **prog_installed);
+
+int bpf_devices_supported(void);
+int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, CGroupDevicePermissions p);
+int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, CGroupDevicePermissions p);
+int bpf_devices_allow_list_static(BPFProgram *prog, const char *path);
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
new file mode 100644
index 0000000..66773e1
--- /dev/null
+++ b/src/core/bpf-firewall.c
@@ -0,0 +1,974 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-program.h"
+#include "fd-util.h"
+#include "in-addr-prefix-util.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "unit.h"
+#include "strv.h"
+#include "virt.h"
+
+enum {
+        MAP_KEY_PACKETS, /* presumably keys of the accounting map — TODO confirm against its users */
+        MAP_KEY_BYTES,
+};
+/* Verdict bits that the address checks OR into R8 of the compiled firewall program. */
+enum {
+        ACCESS_ALLOWED = 1,
+        ACCESS_DENIED  = 2,
+};
+
+/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
+
+static int add_lookup_instructions(
+                BPFProgram *p,
+                int map_fd,
+                int protocol,
+                bool is_ingress,
+                int verdict) {
+
+        int r, addr_offset, addr_size;
+        /* Returns 0, -EAFNOSUPPORT for an unhandled protocol, or the bpf_program_add_instructions() error. */
+        assert(p);
+        assert(map_fd >= 0);
+
+        switch (protocol) {
+
+        case ETH_P_IP:
+                addr_size = sizeof(uint32_t);
+                addr_offset = is_ingress ?
+                        offsetof(struct iphdr, saddr) :
+                        offsetof(struct iphdr, daddr);
+                break;
+
+        case ETH_P_IPV6:
+                addr_size = 4 * sizeof(uint32_t);
+                addr_offset = is_ingress ?
+                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
+                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
+                break;
+
+        default:
+                return -EAFNOSUPPORT;
+        }
+
+        do {
+                /* One block per call; at runtime it is skipped entirely unless the packet's protocol matches. */
+                struct bpf_insn insn[] = {
+                        /* If skb->protocol differs from the requested protocol, skip this whole block. The offset will be set later. */
+                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
+
+                        /*
+                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
+                         *
+                         * R1: Pointer to the skb
+                         * R2: Data offset
+                         * R3: Destination buffer on the stack (r10 - addr_size)
+                         * R4: Number of bytes to read (addr_size)
+                         */
+
+                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),
+
+                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
+
+                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
+                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+
+                        /*
+                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
+                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
+                         * has to be set to the maximum possible value.
+                         *
+                         * On success, the looked up value is stored in R0. For this application, the actual
+                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
+                         * matching value.
+                         */
+                        /* The key (R2) starts 4 bytes below the loaded address: 32-bit prefix length, then the address. */
+                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
+                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
+
+                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
+                };
+
+                /* Jump label fixup: the protocol check skips every remaining instruction of this block */
+                insn[0].off = ELEMENTSOF(insn) - 1;
+
+                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+                if (r < 0)
+                        return r;
+
+        } while (false);
+
+        return 0;
+}
+
+static int add_instructions_for_ip_any(
+                BPFProgram *p,
+                int verdict) {
+
+        /* Appends a single instruction that unconditionally ORs the verdict bit(s) into R8.
+         * Returns 0 on success, or the negative error from bpf_program_add_instructions(). */
+        const struct bpf_insn any_insn[] = {
+                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
+        };
+        int k;
+
+        assert(p);
+
+        k = bpf_program_add_instructions(p, any_insn, ELEMENTSOF(any_insn));
+
+        return k < 0 ? k : 0;
+}
+
+/* Assembles the BPF_PROG_TYPE_CGROUP_SKB program for one traffic direction of unit @u.
+ *
+ * The program consists of: a prologue (pre_insn), optional access checks against the unit's
+ * allow/deny maps and the "any" shortcuts, the verdict computation (post_insn), optional accounting
+ * against the per-direction accounting map, and a final exit instruction.
+ *
+ * @prog_name is passed on to bpf_program_new(). @is_ingress selects the accounting map and is
+ * forwarded to add_lookup_instructions(). @ip_allow_any and @ip_deny_any request an unconditional
+ * ACCESS_ALLOWED/ACCESS_DENIED bit instead of a map lookup.
+ *
+ * On success returns 0 and stores the new program in *@ret; *@ret is set to NULL if neither
+ * accounting nor access control is configured. On failure returns the negative error of the helper
+ * that failed and leaves *@ret untouched. */
+static int bpf_firewall_compile_bpf(
+                Unit *u,
+                const char *prog_name,
+                bool is_ingress,
+                BPFProgram **ret,
+                bool ip_allow_any,
+                bool ip_deny_any) {
+
+        const struct bpf_insn pre_insn[] = {
+                /*
+                 * When the eBPF program is entered, R1 contains the address of the skb.
+                 * However, R1-R5 are scratch registers that are not preserved when calling
+                 * into kernel functions, so we need to save anything that's supposed to
+                 * stay around to R6-R9. Save the skb to R6.
+                 */
+                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+                /*
+                 * Although we cannot access the skb data directly from eBPF programs used in this
+                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
+                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
+                 * for later use.
+                 */
+                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
+
+                /*
+                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
+                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
+                 */
+                BPF_MOV32_IMM(BPF_REG_8, 0),
+        };
+
+        /*
+         * The access checkers compiled for the configured allowance and denial lists
+         * write to R8 at runtime. The following code prepares for an early exit that
+         * skips the accounting if the packet is denied.
+         *
+         * R0 = 1
+         * if (R8 == ACCESS_DENIED)
+         *     R0 = 0
+         *
+         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
+         * is allowed to pass.
+         */
+        const struct bpf_insn post_insn[] = {
+                BPF_MOV64_IMM(BPF_REG_0, 1),
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
+                BPF_MOV64_IMM(BPF_REG_0, 0),
+        };
+
+        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
+        int accounting_map_fd, r;
+        bool access_enabled;
+
+        assert(u);
+        assert(ret);
+
+        accounting_map_fd = is_ingress ?
+                u->ip_accounting_ingress_map_fd :
+                u->ip_accounting_egress_map_fd;
+
+        /* Access control is in effect if any allow/deny map exists or an "any" shortcut was requested. */
+        access_enabled =
+                u->ipv4_allow_map_fd >= 0 ||
+                u->ipv6_allow_map_fd >= 0 ||
+                u->ipv4_deny_map_fd >= 0 ||
+                u->ipv6_deny_map_fd >= 0 ||
+                ip_allow_any ||
+                ip_deny_any;
+
+        /* Nothing to account and nothing to filter: no program is needed at all. */
+        if (accounting_map_fd < 0 && !access_enabled) {
+                *ret = NULL;
+                return 0;
+        }
+
+        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
+        if (r < 0)
+                return r;
+
+        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
+        if (r < 0)
+                return r;
+
+        if (access_enabled) {
+                /*
+                 * The simple rule this function translates into eBPF instructions is:
+                 *
+                 * - Access will be granted when an address matches an entry in the allow maps
+                 * - Otherwise, access will be denied when an address matches an entry in the deny maps
+                 * - Otherwise, access will be granted
+                 */
+
+                if (u->ipv4_deny_map_fd >= 0) {
+                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (u->ipv6_deny_map_fd >= 0) {
+                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (u->ipv4_allow_map_fd >= 0) {
+                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (u->ipv6_allow_map_fd >= 0) {
+                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (ip_allow_any) {
+                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (ip_deny_any) {
+                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
+        if (r < 0)
+                return r;
+
+        if (accounting_map_fd >= 0) {
+                struct bpf_insn insn[] = {
+                        /*
+                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
+                         * The jump label will be fixed up later.
+                         */
+                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
+
+                        /* Count packets */
+                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
+                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* lookup failed: skip the two-instruction update */
+                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+                        /* Count bytes */
+                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
+                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* lookup failed: skip the two-instruction update */
+                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+                        /* Allow the packet to pass */
+                        BPF_MOV64_IMM(BPF_REG_0, 1),
+                };
+
+                /* Jump label fixup: make the first instruction jump over the rest of this array, so that
+                 * a denied packet goes straight to the exit instruction appended below. */
+                insn[0].off = ELEMENTSOF(insn) - 1;
+
+                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+                if (r < 0)
+                        return r;
+        }
+
+        /* The do/while only opens a scope for the local instruction array; it runs exactly once. */
+        do {
+                /*
+                 * Exit from the eBPF program, R0 contains the verdict.
+                 * 0 means the packet is denied, 1 means the packet may pass.
+                 */
+                const struct bpf_insn insn[] = {
+                        BPF_EXIT_INSN()
+                };
+
+                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+                if (r < 0)
+                        return r;
+        } while (false);
+
+        /* Hand ownership of the finished program to the caller. */
+        *ret = TAKE_PTR(p);
+
+        return 0;
+}
+
+/* Tallies the entries of @prefixes by address family, adding to the caller's running totals in
+ * *@n_ipv4 and *@n_ipv6 (the counters are incremented, never reset here). Stops and returns
+ * -EAFNOSUPPORT at the first entry that is neither AF_INET nor AF_INET6; returns 0 otherwise. */
+static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
+        struct in_addr_prefix *entry;
+
+        assert(n_ipv4);
+        assert(n_ipv6);
+
+        SET_FOREACH(entry, prefixes) {
+                if (entry->family == AF_INET)
+                        ++*n_ipv4;
+                else if (entry->family == AF_INET6)
+                        ++*n_ipv6;
+                else
+                        return -EAFNOSUPPORT;
+        }
+
+        return 0;
+}
+
+/* Inserts every prefix of @prefixes into the LPM trie map matching its family (@ipv4_map_fd for
+ * AF_INET, @ipv6_map_fd for AF_INET6), storing @verdict as the 64-bit map value. Returns 0 on
+ * success, -EAFNOSUPPORT on an entry of any other family, or the negative result of
+ * bpf_map_update_element(). */
+static int bpf_firewall_add_access_items(
+                Set *prefixes,
+                int ipv4_map_fd,
+                int ipv6_map_fd,
+                int verdict) {
+
+        const size_t ipv4_len = sizeof(uint32_t), ipv6_len = 4 * sizeof(uint32_t);
+        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
+        struct in_addr_prefix *entry;
+        uint64_t value = verdict;
+
+        /* One zeroed key buffer per family (trie key header plus raw address), reused for every entry. */
+        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + ipv4_len);
+        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + ipv6_len);
+
+        SET_FOREACH(entry, prefixes) {
+                struct bpf_lpm_trie_key *key;
+                size_t addr_len;
+                int map_fd, r;
+
+                /* Pick the key buffer, address width and target map for this entry's family. */
+                if (entry->family == AF_INET) {
+                        key = key_ipv4;
+                        addr_len = ipv4_len;
+                        map_fd = ipv4_map_fd;
+                } else if (entry->family == AF_INET6) {
+                        key = key_ipv6;
+                        addr_len = ipv6_len;
+                        map_fd = ipv6_map_fd;
+                } else
+                        return -EAFNOSUPPORT;
+
+                key->prefixlen = entry->prefixlen;
+                memcpy(key->data, &entry->address, addr_len);
+
+                r = bpf_map_update_element(map_fd, key, &value);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Builds the LPM trie maps holding the allow (@verdict == ACCESS_ALLOWED) or deny (any other
+ * @verdict) prefixes that apply to @u, i.e. those configured on @u itself and on every slice found
+ * by following UNIT_GET_SLICE() upwards.
+ *
+ * Each visited prefix set is reduced once (tracked through the *_reduced flags of its cgroup
+ * context). If one of the sets matches "any", no map is created at all: *@ret_has_any is set to true,
+ * the two fd outputs are left untouched and 0 is returned.
+ *
+ * Otherwise one map per address family with at least one prefix is created and filled,
+ * *@ret_ipv4_map_fd / *@ret_ipv6_map_fd receive the new descriptors (or a negative value when the
+ * family has no entries) and *@ret_has_any is set to false.
+ *
+ * Returns 0 on success or a negative error; on error the outputs are not modified and any map
+ * created so far is closed again. */
+static int bpf_firewall_prepare_access_maps(
+                Unit *u,
+                int verdict,
+                int *ret_ipv4_map_fd,
+                int *ret_ipv6_map_fd,
+                bool *ret_has_any) {
+
+        _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
+        size_t n_ipv4 = 0, n_ipv6 = 0;
+        Unit *p;
+        int r;
+
+        assert(ret_ipv4_map_fd);
+        assert(ret_ipv6_map_fd);
+        assert(ret_has_any);
+
+        /* First pass: normalize the prefix sets and determine how large the maps have to be. */
+        for (p = u; p; p = UNIT_GET_SLICE(p)) {
+                CGroupContext *cc;
+                Set *prefixes;
+                bool *reduced;
+
+                cc = unit_get_cgroup_context(p);
+                if (!cc)
+                        continue;
+
+                prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
+                reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;
+
+                if (!*reduced) {
+                        r = in_addr_prefixes_reduce(prefixes);
+                        if (r < 0)
+                                return r;
+
+                        *reduced = true;
+                }
+
+                /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
+                 * needing CAP_SYS_ADMIN for allocating LPM trie map. */
+                if (in_addr_prefixes_is_any(prefixes)) {
+                        *ret_has_any = true;
+                        return 0;
+                }
+
+                /* Fail before allocating any map if the set holds an address family we cannot express,
+                 * instead of ignoring the error and tripping over the same entry while filling the maps. */
+                r = bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
+                if (r < 0)
+                        return r;
+        }
+
+        if (n_ipv4 > 0) {
+                char *name = strjoina("4_", u->id);
+                ipv4_map_fd = bpf_map_new(
+                                name,
+                                BPF_MAP_TYPE_LPM_TRIE,
+                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
+                                sizeof(uint64_t),
+                                n_ipv4,
+                                BPF_F_NO_PREALLOC);
+                if (ipv4_map_fd < 0)
+                        return ipv4_map_fd;
+        }
+
+        if (n_ipv6 > 0) {
+                char *name = strjoina("6_", u->id);
+                ipv6_map_fd = bpf_map_new(
+                                name,
+                                BPF_MAP_TYPE_LPM_TRIE,
+                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
+                                sizeof(uint64_t),
+                                n_ipv6,
+                                BPF_F_NO_PREALLOC);
+                if (ipv6_map_fd < 0)
+                        return ipv6_map_fd;
+        }
+
+        /* Second pass: copy the prefixes of the unit and of all its parent slices into the maps. */
+        for (p = u; p; p = UNIT_GET_SLICE(p)) {
+                CGroupContext *cc;
+
+                cc = unit_get_cgroup_context(p);
+                if (!cc)
+                        continue;
+
+                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
+                                                  ipv4_map_fd, ipv6_map_fd, verdict);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Success: disarm the cleanup handlers and pass the descriptors to the caller. */
+        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
+        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
+        *ret_has_any = false;
+        return 0;
+}
+
+/* Brings the accounting maps of @u in line with the requested state.
+ *
+ * With @enabled set, a two-element BPF_MAP_TYPE_ARRAY of 64-bit values is created for each direction
+ * whose descriptor in *@fd_ingress / *@fd_egress is still negative; maps that already exist are
+ * kept. With @enabled unset, both descriptors are closed and u->ip_accounting_extra is zeroed.
+ *
+ * Returns 0 on success or the negative result of bpf_map_new(). */
+static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
+        assert(u);
+        assert(fd_ingress);
+        assert(fd_egress);
+
+        if (!enabled) {
+                *fd_ingress = safe_close(*fd_ingress);
+                *fd_egress = safe_close(*fd_egress);
+
+                zero(u->ip_accounting_extra);
+                return 0;
+        }
+
+        if (*fd_ingress < 0) {
+                char *map_name = strjoina("I_", u->id);
+                int fd;
+
+                fd = bpf_map_new(map_name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
+                if (fd < 0)
+                        return fd;
+
+                *fd_ingress = fd;
+        }
+
+        if (*fd_egress < 0) {
+                char *map_name = strjoina("E_", u->id);
+                int fd;
+
+                fd = bpf_map_new(map_name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
+                if (fd < 0)
+                        return fd;
+
+                *fd_egress = fd;
+        }
+
+        return 0;
+}
+
+/* Recompiles the built-in BPF firewall of @u.
+ *
+ * Frees the previously compiled ingress/egress programs and closes the allow/deny maps, rebuilds
+ * those maps from the configured prefixes (for non-slice units only), creates or drops the
+ * accounting maps according to cc->ip_accounting, and finally compiles one program per direction
+ * into u->ip_bpf_ingress and u->ip_bpf_egress.
+ *
+ * Returns 0 on success. Returns -EINVAL if @u has no cgroup context, the error of
+ * bpf_firewall_supported() if probing fails, an EOPNOTSUPP error (logged at debug level) if BPF
+ * firewalling is unavailable for this unit, or the logged error of the step that failed. */
+int bpf_firewall_compile(Unit *u) {
+        const char *ingress_name = NULL, *egress_name = NULL;
+        bool ip_allow_any = false, ip_deny_any = false;
+        CGroupContext *cc;
+        int r, supported;
+
+        assert(u);
+
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return -EINVAL;
+
+        supported = bpf_firewall_supported();
+        if (supported < 0)
+                return supported;
+        if (supported == BPF_FIREWALL_UNSUPPORTED)
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");
+        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
+                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
+                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
+                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
+                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
+                 * all, either. */
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                            "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");
+
+        /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15
+         * kernel). */
+        if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+                ingress_name = "sd_fw_ingress";
+                egress_name = "sd_fw_egress";
+        }
+
+        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
+         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
+         * configuration, but we don't flush out the accounting unnecessarily */
+
+        u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
+        u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+
+        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
+        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+
+        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
+        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+
+        if (u->type != UNIT_SLICE) {
+                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
+                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
+                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
+                 * means that all configured IP access rules *will* take effect on processes, even though we never
+                 * compile them for inner nodes. */
+
+                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");
+
+                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
+        }
+
+        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");
+
+        /* Both directions share the same access maps; only the accounting map and program name differ. */
+        r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");
+
+        r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");
+
+        return 0;
+}
+
+/* Replaces the contents of *@set with one BPF_PROG_TYPE_CGROUP_SKB program per entry of
+ * @filter_paths, each loaded from the given BPF file system path.
+ *
+ * The set is cleared first, so on failure it holds only the programs loaded before the failing
+ * entry. The helper is used for both the ingress and the egress filter list, hence the log
+ * messages do not name a direction. Returns 0 on success or a negative, already logged error. */
+static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
+        assert(set);
+
+        set_clear(*set);
+
+        STRV_FOREACH(bpf_fs_path, filter_paths) {
+                _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+                int r;
+
+                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");
+
+                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "bpf-firewall: Loading of custom BPF program %s failed: %m", *bpf_fs_path);
+
+                /* The set takes over ownership of the program, whether or not the insertion succeeds. */
+                r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Loads the custom ingress and egress filter programs configured for @u from the BPF file system
+ * into u->ip_bpf_custom_ingress and u->ip_bpf_custom_egress.
+ *
+ * Returns 0 when done, and also when the unit has no cgroup context or no filter paths configured.
+ * Fails with the error of bpf_firewall_supported(), with an EOPNOTSUPP error (logged at debug level)
+ * when BPF_F_ALLOW_MULTI is unavailable, or with the error of loading a program. */
+int bpf_firewall_load_custom(Unit *u) {
+        CGroupContext *cc;
+        int level, r;
+
+        assert(u);
+
+        /* Nothing to do without a cgroup context or without any configured filter path. */
+        cc = unit_get_cgroup_context(u);
+        if (!cc || (!cc->ip_filters_ingress && !cc->ip_filters_egress))
+                return 0;
+
+        level = bpf_firewall_supported();
+        if (level < 0)
+                return level;
+        if (level != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                            "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
+
+        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
+        if (r < 0)
+                return r;
+
+        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Attaches every program in *@set to the cgroup at @path for @attach_type, using BPF_F_ALLOW_MULTI,
+ * and moves each program from *@set into *@set_installed as it is processed.
+ *
+ * *@set_installed is cleared (and allocated if necessary) first. The helper serves both
+ * BPF_CGROUP_INET_INGRESS and BPF_CGROUP_INET_EGRESS, so the direction named in the error message is
+ * derived from @attach_type. Returns 0 on success or a negative, already logged error. */
+static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
+        const char *direction = attach_type == BPF_CGROUP_INET_INGRESS ? "ingress" : "egress";
+        BPFProgram *prog;
+        int r;
+
+        assert(u);
+
+        set_clear(*set_installed);
+        r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        SET_FOREACH_MOVE(prog, *set_installed, *set) {
+                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom %s BPF program to cgroup %s failed: %m", direction, path);
+        }
+        return 0;
+}
+
+/* Attaches the compiled firewall programs of @u, followed by its custom programs, to the unit's
+ * cgroup and records them as installed.
+ *
+ * The compiled programs in u->ip_bpf_egress / u->ip_bpf_ingress are moved to the corresponding
+ * *_installed fields once attached. The previously installed programs are released either after the
+ * new ones are attached (with BPF_F_ALLOW_MULTI) or right before (without it).
+ *
+ * Returns 0 on success. Returns -EINVAL if @u has no cgroup context, no cgroup path, or its cgroup is
+ * not realized; the error of bpf_firewall_supported() if probing fails; an EOPNOTSUPP error (logged
+ * at debug level) if the required BPF support is missing; or the logged error of the step that
+ * failed. */
+int bpf_firewall_install(Unit *u) {
+        _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
+        _cleanup_free_ char *path = NULL;
+        CGroupContext *cc;
+        int r, supported;
+        uint32_t flags;
+
+        assert(u);
+
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return -EINVAL;
+        if (!u->cgroup_path)
+                return -EINVAL;
+        if (!u->cgroup_realized)
+                return -EINVAL;
+
+        supported = bpf_firewall_supported();
+        if (supported < 0)
+                return supported;
+        if (supported == BPF_FIREWALL_UNSUPPORTED)
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");
+        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                            "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
+        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
+            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                            "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
+
+        flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;
+
+        if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
+                /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
+                 * after attaching the new programs, so that there's no time window where neither program is
+                 * attached. (There will be a period where both are attached, but that's OK, since this is a
+                 * security feature where we rather want to lock down too much than too little.) */
+                ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
+                ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
+        } else {
+                /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
+                 * detach them) right before attaching the new program, to minimize the time window when we
+                 * don't account for IP traffic. */
+                u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+                u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+        }
+
+        if (u->ip_bpf_egress) {
+                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
+                if (r < 0)
+                        return log_unit_error_errno(u, r,
+                                "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
+
+                /* Remember that this BPF program is installed now. */
+                u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
+        }
+
+        if (u->ip_bpf_ingress) {
+                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
+                if (r < 0)
+                        return log_unit_error_errno(u, r,
+                                "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
+
+                /* Remember that this BPF program is installed now. */
+                u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
+        }
+
+        /* And now, definitely get rid of the old programs, and detach them */
+        ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
+        ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
+
+        /* Custom programs come last; each call moves the programs from the pending set to the installed set. */
+        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
+        if (r < 0)
+                return r;
+
+        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Reads the traffic counters stored in the accounting map @map_fd.
+ *
+ * Either output pointer may be NULL to skip that counter. *@ret_packets is only written once every
+ * requested lookup has succeeded. Returns 0 on success, -EBADF if @map_fd is negative, or the
+ * negative result of bpf_map_lookup_element(). */
+int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
+        /* The accounting maps are created with a key size of sizeof(int) and the BPF program indexes
+         * them through a 32-bit store, so the key handed to the kernel must be 32 bits wide as well. A
+         * 64-bit key only works by accident on little-endian hosts; on big-endian ones the kernel
+         * would read the all-zero upper half and every lookup would land on MAP_KEY_PACKETS. */
+        uint32_t key;
+        uint64_t packets = 0;
+        int r;
+
+        if (map_fd < 0)
+                return -EBADF;
+
+        if (ret_packets) {
+                key = MAP_KEY_PACKETS;
+                r = bpf_map_lookup_element(map_fd, &key, &packets);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ret_bytes) {
+                key = MAP_KEY_BYTES;
+                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Deferred so that a failing byte lookup does not leave the caller with a packet count only. */
+        if (ret_packets)
+                *ret_packets = packets;
+
+        return 0;
+}
+
+/* Resets both counters (packets and bytes) of the accounting map @map_fd to zero.
+ *
+ * Returns -EBADF if @map_fd is negative, otherwise the result of the failing or of the last
+ * bpf_map_update_element() call. */
+int bpf_firewall_reset_accounting(int map_fd) {
+        /* The accounting maps are created with a key size of sizeof(int), so the key must be 32 bits
+         * wide. With a 64-bit key a big-endian host would pass the all-zero upper half to the kernel
+         * and reset MAP_KEY_PACKETS twice while never touching MAP_KEY_BYTES. */
+        uint32_t key;
+        uint64_t value = 0;
+        int r;
+
+        if (map_fd < 0)
+                return -EBADF;
+
+        key = MAP_KEY_PACKETS;
+        r = bpf_map_update_element(map_fd, &key, &value);
+        if (r < 0)
+                return r;
+
+        key = MAP_KEY_BYTES;
+        return bpf_map_update_element(map_fd, &key, &value);
+}
+
+/* Return value of the log call that explained why BPF firewalling was found to be unsupported; it
+ * is read back by emit_bpf_firewall_warning(). Stays 0 until a probe in bpf_firewall_supported()
+ * fails. */
+static int bpf_firewall_unsupported_reason = 0;
+
+/* Probes which level of BPF firewall support the running system offers.
+ *
+ * Returns BPF_FIREWALL_UNSUPPORTED, BPF_FIREWALL_SUPPORTED (programs can be attached, but without
+ * BPF_F_ALLOW_MULTI) or BPF_FIREWALL_SUPPORTED_WITH_MULTI. Once one of these results has been
+ * determined it is cached in a function-local static and returned directly on later calls. If the
+ * cgroup hierarchy type cannot be determined, the negative error is returned and nothing is cached. */
+int bpf_firewall_supported(void) {
+        const struct bpf_insn trivial[] = {
+                BPF_MOV64_IMM(BPF_REG_0, 1),
+                BPF_EXIT_INSN()
+        };
+
+        _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+        static int supported = -1;
+        union bpf_attr attr;
+        int r;
+
+        /* Checks whether BPF firewalling is supported. For this, we check the following things:
+         *
+         * - whether the unified hierarchy is being used
+         * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
+         * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
+         */
+        if (supported >= 0)
+                return supported;
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m");
+        if (r == 0) {
+                bpf_firewall_unsupported_reason =
+                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                        "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported.");
+                return supported = BPF_FIREWALL_UNSUPPORTED;
+        }
+
+        /* prog_name is NULL since it is supported only starting from v4.15 kernel. */
+        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
+        if (r < 0) {
+                bpf_firewall_unsupported_reason =
+                        log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+                return supported = BPF_FIREWALL_UNSUPPORTED;
+        }
+
+        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+        if (r < 0) {
+                bpf_firewall_unsupported_reason =
+                        log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+                return supported = BPF_FIREWALL_UNSUPPORTED;
+        }
+
+        r = bpf_program_load_kernel(program, NULL, 0);
+        if (r < 0) {
+                bpf_firewall_unsupported_reason =
+                        log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+                return supported = BPF_FIREWALL_UNSUPPORTED;
+        }
+
+        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
+         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
+         * program if we can't do a thing with it later?
+         *
+         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
+         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
+         * parameters are validated however, and that'll fail with EBADF then. */
+
+        // FIXME: Clang doesn't 0-pad with structured initialization, causing
+        // the kernel to reject the bpf_attr as invalid. See:
+        // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
+        // Ideally it should behave like GCC, so that we can remove these workarounds.
+        zero(attr);
+        attr.attach_type = BPF_CGROUP_INET_EGRESS;
+        attr.target_fd = -EBADF;
+        attr.attach_bpf_fd = -EBADF;
+
+        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
+                if (errno != EBADF) {
+                        bpf_firewall_unsupported_reason =
+                                log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
+                        return supported = BPF_FIREWALL_UNSUPPORTED;
+                }
+
+                /* YAY! */
+        } else {
+                bpf_firewall_unsupported_reason =
+                        log_debug_errno(SYNTHETIC_ERRNO(EBADE),
+                                        "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
+                                        "Something is weird, assuming BPF firewalling is broken and hence not supported.");
+                return supported = BPF_FIREWALL_UNSUPPORTED;
+        }
+
+        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
+         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
+         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
+         * get EINVAL if it's not supported, and EBADF as before if it is available.
+         * Use probe result as the indicator that program name is also supported since they both were
+         * added in kernel 4.15. */
+
+        zero(attr);
+        attr.attach_type = BPF_CGROUP_INET_EGRESS;
+        attr.target_fd = -EBADF;
+        attr.attach_bpf_fd = -EBADF;
+        attr.attach_flags = BPF_F_ALLOW_MULTI;
+
+        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
+                if (errno == EBADF) {
+                        log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
+                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
+                }
+
+                /* Any other error still leaves plain (non-multi) attachment available. */
+                if (errno == EINVAL)
+                        log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
+                else
+                        log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
+
+                return supported = BPF_FIREWALL_SUPPORTED;
+        } else {
+                bpf_firewall_unsupported_reason =
+                        log_debug_errno(SYNTHETIC_ERRNO(EBADE),
+                                        "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
+                                        "Something is weird, assuming BPF firewalling is broken and hence not supported.");
+                return supported = BPF_FIREWALL_UNSUPPORTED;
+        }
+}
+
+/* Logs, at most once per process, that a unit configures an IP firewall although BPF firewalling is not
+ * available. Nothing is logged for test-run managers. The message is demoted to debug level when the
+ * recorded reason is a privilege error and we are running inside a container. */
+void emit_bpf_firewall_warning(Unit *u) {
+        static bool warned = false;
+
+        assert(u);
+        assert(u->manager);
+
+        if (warned)
+                return;
+        if (MANAGER_IS_TEST_RUN(u->manager))
+                return;
+
+        int level = LOG_WARNING;
+        if (ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0)
+                level = LOG_DEBUG;
+
+        log_unit_full_errno(u, level, bpf_firewall_unsupported_reason,
+                            "unit configures an IP firewall, but %s.\n"
+                            "(This warning is only shown for the first unit using IP firewalling.)",
+                            getuid() != 0 ? "not running as root" :
+                            "the local system does not support BPF/cgroup firewalling");
+
+        /* Remember that we complained, so that later units stay silent. */
+        warned = true;
+}
+
+void bpf_firewall_close(Unit *u) { /* Hands every BPF firewall resource stored in the unit to its release helper and stores the helper's return value back in the field. */
+        assert(u);
+
+        u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd); /* accounting map fds */
+        u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);
+
+        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); /* allow/deny map fds, per address family */
+        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
+        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+
+        u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); /* program objects, both the plain and the "_installed" fields */
+        u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+        u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+        u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+
+        u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress); /* sets of custom programs */
+        u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
+        u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
+        u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
+}
diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h
new file mode 100644
index 0000000..58b401f
--- /dev/null
+++ b/src/core/bpf-firewall.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "unit.h"
+
+enum { /* Values returned by bpf_firewall_supported(). */
+        BPF_FIREWALL_UNSUPPORTED          = 0, /* returned when one of the kernel probes behaves unexpectedly */
+        BPF_FIREWALL_SUPPORTED            = 1, /* the BPF_PROG_DETACH probe passed, but the BPF_F_ALLOW_MULTI probe did not fail with EBADF */
+        BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2, /* the BPF_PROG_ATTACH probe with BPF_F_ALLOW_MULTI failed with EBADF, i.e. the flag is understood */
+};
+
+int bpf_firewall_supported(void); /* Returns one of the BPF_FIREWALL_* values of the enum above. */
+
+int bpf_firewall_compile(Unit *u); /* NOTE(review): defined in bpf-firewall.c; return convention presumably 0 or negative errno — confirm there. */
+int bpf_firewall_install(Unit *u);
+int bpf_firewall_load_custom(Unit *u);
+
+int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets); /* presumably reads byte and packet counters from the given accounting map — confirm against bpf-firewall.c */
+int bpf_firewall_reset_accounting(int map_fd);
+
+void emit_bpf_firewall_warning(Unit *u); /* Logs once that BPF firewalling is unavailable; silent for test-run managers. */
+
+void bpf_firewall_close(Unit *u); /* Passes the unit's firewall map fds, programs and custom-program sets to their release helpers. */
diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c
new file mode 100644
index 0000000..cff2f61
--- /dev/null
+++ b/src/core/bpf-foreign.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-foreign.h"
+#include "bpf-program.h"
+#include "cgroup.h"
+#include "memory-util.h"
+#include "missing_magic.h"
+#include "mountpoint-util.h"
+#include "set.h"
+#include "stat-util.h"
+
+typedef struct BPFForeignKey BPFForeignKey; /* Key type used with bpf_foreign_by_key_hash_ops: identifies a foreign program by ID and attach type. */
+struct BPFForeignKey {
+        uint32_t prog_id; /* program ID, as obtained via bpf_program_get_id_by_fd() in bpf_foreign_prepare() */
+        uint32_t attach_type; /* filled from an enum bpf_attach_type value in bpf_foreign_key_new() */
+};
+
+/* Allocates a BPFForeignKey carrying the given program ID and attach type. On success the key is stored
+ * in *ret (owned by the caller) and 0 is returned; if the allocation fails -ENOMEM is returned and *ret
+ * is left untouched. */
+static int bpf_foreign_key_new(uint32_t prog_id,
+                enum bpf_attach_type attach_type,
+                BPFForeignKey **ret) {
+        BPFForeignKey *k;
+
+        assert(ret);
+
+        k = new(BPFForeignKey, 1);
+        if (!k)
+                return -ENOMEM;
+
+        /* The structure consists of exactly these two fields, hence it is fully initialized afterwards. */
+        k->prog_id = prog_id;
+        k->attach_type = attach_type;
+
+        *ret = k;
+        return 0;
+}
+
+/* Orders keys by program ID first and by attach type second. */
+static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeignKey *b) {
+        int by_id = CMP(a->prog_id, b->prog_id);
+
+        return by_id != 0 ? by_id : CMP(a->attach_type, b->attach_type);
+}
+
+/* Feeds both fields of the key into the siphash state, the program ID first and the attach type second. */
+static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) {
+        const uint32_t *id = &p->prog_id, *type = &p->attach_type;
+
+        siphash24_compress(id, sizeof(*id), h);
+        siphash24_compress(type, sizeof(*type), h);
+}
+
+DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops, /* hash ops passed to hashmap_ensure_put() for u->bpf_foreign_by_key */
+                BPFForeignKey, bpf_foreign_key_hash_func, bpf_foreign_key_compare_func, free, /* key type, hash, compare; free is presumably the key destructor — confirm in hash-funcs.h */
+                BPFProgram, bpf_program_free); /* value type; bpf_program_free is presumably the value destructor — confirm in hash-funcs.h */
+
+/* Attaches every program stored in foreign_by_key to the cgroup at 'path', using the attach type recorded
+ * in the respective key. A failing attachment is logged and does not stop the iteration. Returns 0 if
+ * all attachments succeeded, otherwise the error of the first one that failed. */
+static int attach_programs(Unit *u, const char *path, Hashmap* foreign_by_key, uint32_t attach_flags) {
+        const BPFForeignKey *k;
+        BPFProgram *p;
+        int first_error = 0;
+
+        assert(u);
+
+        HASHMAP_FOREACH_KEY(p, k, foreign_by_key) {
+                int r = bpf_program_cgroup_attach(p, k->attach_type, path, attach_flags);
+                if (r >= 0)
+                        continue;
+
+                log_unit_error_errno(u, r, "bpf-foreign: Attaching foreign BPF program to cgroup %s failed: %m", path);
+
+                /* Only the first failure is reported to the caller. */
+                if (first_error >= 0)
+                        first_error = r;
+        }
+
+        return first_error;
+}
+
+/*
+ * Prepare a foreign BPF program for installation:
+ * - check that bpffs_path is located on a BPF filesystem (a missing path is only warned about and skipped);
+ * - create a BPFProgram object from the path and look up the ID of the program behind it;
+ * - store the program in u->bpf_foreign_by_key, keyed by program ID and attach type.
+ *
+ * Returns 0 on success and when the program was skipped (path missing, or an entry with the same key is
+ * already stored), a negative errno-style value on any other failure.
+ */
+static int bpf_foreign_prepare(
+                Unit *u,
+                enum bpf_attach_type attach_type,
+                const char *bpffs_path) {
+        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+        _cleanup_free_ BPFForeignKey *key = NULL;
+        uint32_t prog_id;
+        int r;
+
+        assert(u);
+        assert(bpffs_path);
+
+        r = path_is_fs_type(bpffs_path, BPF_FS_MAGIC);
+        if (r < 0) {
+                if (r != -ENOENT)
+                        return log_unit_error_errno(u, r,
+                                        "bpf-foreign: Failed to determine filesystem type of %s: %m", bpffs_path);
+
+                /* A path that does not exist is not treated as an error. */
+                log_unit_warning_errno(u, r, "bpf-foreign: foreign program %s does not exist, skipping.", bpffs_path);
+                return 0;
+        }
+        if (r == 0)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                "bpf-foreign: Path in BPF filesystem is expected.");
+
+        r = bpf_program_new_from_bpffs_path(bpffs_path, &prog);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m");
+
+        r = bpf_program_get_id_by_fd(prog->kernel_fd, &prog_id);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-foreign: Failed to get BPF program id from fd: %m");
+
+        r = bpf_foreign_key_new(prog_id, attach_type, &key);
+        if (r < 0)
+                return log_unit_error_errno(u, r,
+                                "bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path);
+
+        r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog);
+        if (r < 0) {
+                if (r != -EEXIST)
+                        return log_unit_error_errno(u, r, "bpf-foreign: Failed to put foreign BPF program into map: %m");
+
+                /* Same program ID and attach type as an existing entry: keep the existing one, ours is
+                 * released by the cleanup handlers. */
+                log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m");
+                return 0;
+        }
+
+        /* Key and program are referenced by the hashmap now, disarm the cleanup handlers. */
+        key = NULL;
+        prog = NULL;
+
+        return 0;
+}
+
+/* Prepares all foreign BPF programs configured in the unit's cgroup context and attaches whatever is
+ * stored in u->bpf_foreign_by_key to the unit's cgroup with BPF_F_ALLOW_MULTI. Returns 0 if the unit has
+ * no cgroup context. A failure to prepare one program does not prevent the others from being handled;
+ * the first preparation error takes precedence over the result of the attach step. */
+int bpf_foreign_install(Unit *u) {
+        _cleanup_free_ char *cgroup_path = NULL;
+        CGroupContext *cc;
+        int r, first_error = 0;
+
+        assert(u);
+
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return 0;
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m");
+
+        LIST_FOREACH(programs, p, cc->bpf_foreign_programs) {
+                r = bpf_foreign_prepare(u, p->attach_type, p->bpffs_path);
+                if (r >= 0 || first_error < 0)
+                        continue;
+
+                first_error = r;
+        }
+
+        r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI);
+        if (first_error < 0)
+                return first_error;
+
+        return r;
+}
diff --git a/src/core/bpf-foreign.h b/src/core/bpf-foreign.h
new file mode 100644
index 0000000..e387b1b
--- /dev/null
+++ b/src/core/bpf-foreign.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include "unit.h"
+
+static inline int bpf_foreign_supported(void) { /* Support for foreign BPF programs is reported as whatever cg_all_unified() returns. */
+        return cg_all_unified();
+}
+
+/*
+ * Attach cgroup-bpf programs foreign to systemd, i.e. loaded to the kernel by an entity
+ * external to systemd.
+ *
+ * Returns 0 if the unit has no cgroup context. Otherwise returns the first error hit while preparing
+ * the configured programs, or, if there was none, the result of attaching them to the unit's cgroup.
+ */
+int bpf_foreign_install(Unit *u);
diff --git a/src/core/bpf-lsm.c b/src/core/bpf-lsm.c
new file mode 100644
index 0000000..216fc34
--- /dev/null
+++ b/src/core/bpf-lsm.c
@@ -0,0 +1,320 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "filesystems.h"
+#include "log.h"
+#include "lsm-util.h"
+#include "manager.h"
+#include "mkdir.h"
+#include "nulstr-util.h"
+#include "stat-util.h"
+#include "strv.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang and llc compile time dependencies are satisfied */
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+
+#define CGROUP_HASH_SIZE_MAX 2048
+
+static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) { /* Destroys the skeleton object and always returns NULL. */
+        /* restrict_fs_bpf__destroy handles object == NULL case */
+        (void) restrict_fs_bpf__destroy(obj);
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free); /* provides restrict_fs_bpf_freep, used with _cleanup_() in this file */
+
+/* Probes whether the given program can be attached as LSM program. The link created by the probe is
+ * released again through the cleanup handler when the function returns. */
+static bool bpf_can_link_lsm_program(struct bpf_program *prog) {
+        _cleanup_(bpf_link_freep) struct bpf_link *probe = NULL;
+
+        assert(prog);
+
+        probe = sym_bpf_program__attach_lsm(prog);
+
+        /* When bpf_program__attach_lsm fails, the value it returns encodes a libbpf error code rather than
+         * a memory pointer. This happens on architectures without BPF trampoline support, where the
+         * BPF_LSM_MAC attach type is hence unavailable. */
+        if (sym_libbpf_get_error(probe) != 0)
+                return false;
+
+        return true;
+}
+
+/* Opens the restrict-fs BPF skeleton, sizes its cgroup_hash map to CGROUP_HASH_SIZE_MAX entries, gives it
+ * a template inner map and loads the object. On success the loaded object is stored in *ret_obj and 0 is
+ * returned; on failure the error is logged and a negative errno-style value is returned. The template
+ * inner map fd is closed on return in either case. */
+static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
+        _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *skel = NULL;
+        _cleanup_close_ int template_fd = -EBADF;
+        int r;
+
+        assert(ret_obj);
+
+        skel = restrict_fs_bpf__open();
+        if (!skel)
+                return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m");
+
+        /* TODO Maybe choose a number based on runtime information? */
+        r = sym_bpf_map__set_max_entries(skel->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX);
+        assert(r <= 0);
+        if (r < 0)
+                return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m",
+                                       sym_bpf_map__name(skel->maps.cgroup_hash));
+
+        /* The verifier wants to see an inner map, hence hand it a dummy one */
+        template_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL);
+        if (template_fd < 0)
+                return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m");
+
+        r = sym_bpf_map__set_inner_map_fd(skel->maps.cgroup_hash, template_fd);
+        assert(r <= 0);
+        if (r < 0)
+                return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m");
+
+        r = restrict_fs_bpf__load(skel);
+        assert(r <= 0);
+        if (r < 0)
+                return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m");
+
+        /* Hand the object over to the caller and disarm our cleanup handler. */
+        *ret_obj = skel;
+        skel = NULL;
+
+        return 0;
+}
+
+/* Reports whether the BPF LSM restrict-fs program can be used. The verdict is determined once and cached
+ * in a static variable. If nothing is cached yet and 'initialize' is false, false is returned without
+ * probing (and without caching). Probing requires cgroup BPF support, the "bpf" LSM being reported by
+ * lsm_supported(), a loadable BPF object and a successful trial attachment of the LSM program. */
+bool lsm_bpf_supported(bool initialize) {
+        _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+        static int cached = -1;
+        int r;
+
+        if (cached >= 0)
+                return cached;
+        if (!initialize)
+                return false;
+
+        if (!cgroup_bpf_supported()) {
+                cached = false;
+                return false;
+        }
+
+        r = lsm_supported("bpf");
+        if (r < 0) {
+                log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m");
+                cached = false;
+                return false;
+        }
+        if (r == 0) {
+                log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported");
+                cached = false;
+                return false;
+        }
+
+        r = prepare_restrict_fs_bpf(&obj);
+        if (r < 0) {
+                cached = false;
+                return false;
+        }
+
+        if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) {
+                log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                  "bpf-lsm: Failed to link program; assuming BPF LSM is not available");
+                cached = false;
+                return false;
+        }
+
+        cached = true;
+        return true;
+}
+
+/* Loads the restrict-fs BPF object, attaches its LSM program and stores the object (with the link
+ * recorded in it) in m->restrict_fs. Returns 0 on success, a negative errno-style value otherwise; on
+ * failure the manager is left untouched. */
+int lsm_bpf_setup(Manager *m) {
+        _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+        _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+        int r;
+
+        assert(m);
+
+        r = prepare_restrict_fs_bpf(&obj);
+        if (r < 0)
+                return r;
+
+        link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems);
+        r = sym_libbpf_get_error(link);
+        if (r)
+                return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m",
+                                       sym_bpf_program__name(obj->progs.restrict_filesystems));
+
+        log_info("bpf-lsm: LSM BPF program attached");
+
+        /* The link is recorded in the object, and the object in the manager; disarm both cleanup
+         * handlers as we go. */
+        obj->links.restrict_filesystems = link;
+        link = NULL;
+
+        m->restrict_fs = obj;
+        obj = NULL;
+
+        return 0;
+}
+
+/* Installs the filesystem restriction for one cgroup: creates a fresh inner hash map, registers it in the
+ * outer map under cgroup_id, stores under key 0 whether the set is an allow list (1) or a deny list (0),
+ * and then adds one entry for each magic number of every filesystem named in 'filesystems'. Names that
+ * fs_type_from_string() does not know are logged and skipped.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. If adding a magic number fails, the
+ * cgroup's entry is removed from the outer map again. */
+int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) {
+        uint32_t dummy_value = 1, zero = 0;
+        const char *fs;
+        const statfs_f_type_t *magic;
+        int r;
+
+        assert(filesystems);
+        assert(outer_map_fd >= 0);
+
+        /* Our fd is only needed while we populate the inner map: once the inner map has been inserted
+         * into the outer one the kernel keeps it referenced from there. Hence make sure the fd is closed
+         * on every return path instead of leaking one fd per call. */
+        _cleanup_close_ int inner_map_fd = compat_bpf_map_create(
+                        BPF_MAP_TYPE_HASH,
+                        NULL,
+                        sizeof(uint32_t),
+                        sizeof(uint32_t),
+                        128U, /* Should be enough for all filesystem types */
+                        NULL);
+        if (inner_map_fd < 0)
+                return log_error_errno(errno, "bpf-lsm: Failed to create inner BPF map: %m");
+
+        if (sym_bpf_map_update_elem(outer_map_fd, &cgroup_id, &inner_map_fd, BPF_ANY) != 0)
+                return log_error_errno(errno, "bpf-lsm: Error populating BPF map: %m");
+
+        uint32_t allow = allow_list;
+
+        /* Use key 0 to store whether this is an allow list or a deny list */
+        if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0)
+                return log_error_errno(errno, "bpf-lsm: Error initializing map: %m");
+
+        SET_FOREACH(fs, filesystems) {
+                r = fs_type_from_string(fs, &magic);
+                if (r < 0) {
+                        log_warning("bpf-lsm: Invalid filesystem name '%s', ignoring.", fs);
+                        continue;
+                }
+
+                log_debug("bpf-lsm: Restricting filesystem access to '%s'", fs);
+
+                /* The magic number list ends at the first zero entry, or after FILESYSTEM_MAGIC_MAX
+                 * entries. */
+                for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) {
+                        if (magic[i] == 0)
+                                break;
+
+                        if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) {
+                                r = log_error_errno(errno, "bpf-lsm: Failed to update BPF map: %m");
+
+                                /* Don't leave a half-populated restriction behind for this cgroup. */
+                                if (sym_bpf_map_delete_elem(outer_map_fd, &cgroup_id) != 0)
+                                        log_debug_errno(errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m");
+
+                                return r;
+                        }
+                }
+        }
+
+        return 0;
+}
+
+/* Removes the unit's cgroup entry from the cgroup_hash map of the manager's restrict-fs object. Does
+ * nothing and returns 0 if support was never detected, if the manager has no restrict-fs object or if
+ * the unit has no cgroup ID. A missing entry (ENOENT) is not an error. */
+int lsm_bpf_cleanup(const Unit *u) {
+        int map_fd;
+
+        assert(u);
+        assert(u->manager);
+
+        /* Without successfully detected support, a loaded BPF object and a cgroup ID there cannot be
+         * anything for us to remove. */
+        if (!lsm_bpf_supported(/* initialize = */ false) ||
+            !u->manager->restrict_fs ||
+            u->cgroup_id == 0)
+                return 0;
+
+        map_fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
+        if (map_fd < 0)
+                return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m");
+
+        if (sym_bpf_map_delete_elem(map_fd, &u->cgroup_id) == 0)
+                return 0;
+        if (errno == ENOENT)
+                return 0;
+
+        return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m");
+}
+
+/* Returns what sym_bpf_map__fd() reports for the cgroup_hash map of the manager's restrict-fs object, or
+ * -ENOMEDIUM if the manager has no such object. */
+int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+        struct restrict_fs_bpf *skel;
+
+        assert(unit);
+        assert(unit->manager);
+
+        skel = unit->manager->restrict_fs;
+
+        return skel ? sym_bpf_map__fd(skel->maps.cgroup_hash) : -ENOMEDIUM;
+}
+
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { /* Public wrapper that forwards the object to the skeleton's restrict_fs_bpf__destroy(). */
+        restrict_fs_bpf__destroy(prog);
+}
+#else /* ! BPF_FRAMEWORK */
+bool lsm_bpf_supported(bool initialize) { /* Variant built when BPF_FRAMEWORK is off: never supported. */
+        return false;
+}
+
+int lsm_bpf_setup(Manager *m) { /* Variant built when BPF_FRAMEWORK is off: logs at debug level and fails with EOPNOTSUPP. */
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m");
+}
+
+int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) { /* Variant built when BPF_FRAMEWORK is off: all arguments are ignored. */
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m");
+}
+
+int lsm_bpf_cleanup(const Unit *u) { /* Variant built when BPF_FRAMEWORK is off: nothing to clean up, reports success. */
+        return 0;
+}
+
+int lsm_bpf_map_restrict_fs_fd(Unit *unit) { /* Variant built when BPF_FRAMEWORK is off: same error as the real implementation returns when no BPF object is loaded. */
+        return -ENOMEDIUM;
+}
+
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { /* Variant built when BPF_FRAMEWORK is off: no-op. */
+        return;
+}
+#endif
+
+/* Applies one filesystem name, or one '@'-prefixed filesystem group, to the set in *filesystems.
+ *
+ * A plain name is added to the set if exactly one of FILESYSTEM_PARSE_INVERT and
+ * FILESYSTEM_PARSE_ALLOW_LIST is set in 'flags', and removed from it otherwise. A group is expanded by
+ * applying each of its members in turn; an unknown group is logged via log_syntax() and ignored.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. An out-of-memory condition is only
+ * logged if FILESYSTEM_PARSE_LOG is set. */
+int lsm_bpf_parse_filesystem(
+                const char *name,
+                Set **filesystems,
+                FilesystemParseFlags flags,
+                const char *unit,
+                const char *filename,
+                unsigned line) {
+        int r;
+
+        assert(name);
+        assert(filesystems);
+
+        if (name[0] != '@') {
+                bool inverted = !!(flags & FILESYSTEM_PARSE_INVERT),
+                        allow_list = !!(flags & FILESYSTEM_PARSE_ALLOW_LIST);
+
+                if (inverted == allow_list) {
+                        /* The entry shall not be part of the list (anymore), hence drop it in case it
+                         * was added before. */
+                        free(set_remove(*filesystems, name));
+                        return 0;
+                }
+
+                r = set_put_strdup(filesystems, name);
+                if (r == -ENOMEM)
+                        return flags & FILESYSTEM_PARSE_LOG ? log_oom() : -ENOMEM;
+                if (r < 0 && r != -EEXIST)  /* Already being in the set is fine */
+                        return r;
+
+                return 0;
+        }
+
+        const FilesystemSet *group = filesystem_set_find(name);
+        if (!group) {
+                log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
+                           "bpf-lsm: Unknown filesystem group, ignoring: %s", name);
+                return 0;
+        }
+
+        NULSTR_FOREACH(member, group->value) {
+                /* Recurse for each member of the group. Logging is downgraded here (the
+                 * FILESYSTEM_PARSE_LOG flag is masked out): problems in the group table are our own
+                 * fault, not a flaw in the user's configuration, and we shouldn't complain as if they
+                 * were. */
+                r = lsm_bpf_parse_filesystem(member, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h
new file mode 100644
index 0000000..a6eda19
--- /dev/null
+++ b/src/core/bpf-lsm.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hashmap.h"
+
+typedef enum FilesystemParseFlags { /* Bit flags accepted by lsm_bpf_parse_filesystem(). */
+        FILESYSTEM_PARSE_INVERT     = 1 << 0, /* together with ALLOW_LIST decides add vs. remove: a name is added when exactly one of the two is set */
+        FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1,
+        FILESYSTEM_PARSE_LOG        = 1 << 2, /* unknown groups are logged at LOG_WARNING instead of LOG_DEBUG, and OOM is logged */
+} FilesystemParseFlags;
+
+typedef struct Unit Unit; /* forward declarations only: this header uses these types solely through pointers */
+typedef struct Manager Manager;
+
+typedef struct restrict_fs_bpf restrict_fs_bpf;
+
+bool lsm_bpf_supported(bool initialize); /* cached verdict; with initialize=false no probing is done and false is returned while nothing is cached */
+int lsm_bpf_setup(Manager *m); /* loads and attaches the LSM program, stores the object in m->restrict_fs */
+int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list); /* registers an inner map for cgroup_id in the outer map and fills it with filesystem magic numbers */
+int lsm_bpf_cleanup(const Unit *u); /* deletes the unit's cgroup entry from the map; a missing entry is not an error */
+int lsm_bpf_map_restrict_fs_fd(Unit *u); /* -ENOMEDIUM if the manager has no restrict-fs object */
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog);
+int lsm_bpf_parse_filesystem(const char *name, /* plain filesystem name, or '@'-prefixed group name */
+                             Set **filesystems, /* set the name is added to or removed from */
+                             FilesystemParseFlags flags,
+                             const char *unit, /* unit, filename and line are passed to log_syntax() */
+                             const char *filename,
+                             unsigned line);
diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c
new file mode 100644
index 0000000..9f290ab
--- /dev/null
+++ b/src/core/bpf-socket-bind.c
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if BPF_FRAMEWORK
+#include 
+#endif
+
+#include "fd-util.h"
+#include "bpf-socket-bind.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang, llvm and bpftool compile time dependencies are satisfied */
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/socket_bind/socket-bind-api.bpf.h"
+#include "bpf/socket_bind/socket-bind-skel.h"
+
+static struct socket_bind_bpf *socket_bind_bpf_free(struct socket_bind_bpf *obj) { /* Destroys the skeleton object and always returns NULL. */
+        /* socket_bind_bpf__destroy handles object == NULL case */
+        (void) socket_bind_bpf__destroy(obj);
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct socket_bind_bpf *, socket_bind_bpf_free); /* provides socket_bind_bpf_freep, used with _cleanup_() in this file */
+
+/* Writes the socket bind items of the list starting at 'head' into the BPF map behind map_fd, using the
+ * keys 0, 1, 2, … in list order. Returns 0 on success, or -errno of the first failing map update. */
+static int update_rules_map(
+                int map_fd,
+                CGroupSocketBindItem *head) {
+
+        uint32_t next_key = 0;
+
+        assert(map_fd >= 0);
+
+        LIST_FOREACH(socket_bind_items, item, head) {
+                uint32_t key = next_key;
+                struct socket_bind_rule rule = {
+                        .address_family = (uint32_t) item->address_family,
+                        .protocol = item->ip_protocol,
+                        .nr_ports = item->nr_ports,
+                        .port_min = item->port_min,
+                };
+
+                next_key++;
+
+                if (sym_bpf_map_update_elem(map_fd, &key, &rule, BPF_ANY) == 0)
+                        continue;
+
+                return -errno;
+        }
+
+        return 0;
+}
+
+/* Opens the socket-bind BPF skeleton, sizes its allow and deny maps to the number of rules in the
+ * respective list (but at least 1), loads the object and writes the rules into the two maps.
+ *
+ * 'u' may be NULL (bpf_socket_bind_supported() calls us that way for probing); it is only used for
+ * logging and for picking the log level. On success the loaded object is stored in *ret_obj and 0 is
+ * returned. On failure the error is logged and a negative errno-style value is returned; -EINVAL if
+ * either list has more than SOCKET_BIND_MAX_RULES entries. */
+static int prepare_socket_bind_bpf(
+                Unit *u,
+                CGroupSocketBindItem *allow,
+                CGroupSocketBindItem *deny,
+                struct socket_bind_bpf **ret_obj) {
+
+        _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+        size_t allow_count = 0, deny_count = 0;
+        int allow_map_fd, deny_map_fd, r;
+
+        assert(ret_obj);
+
+        LIST_FOREACH(socket_bind_items, item, allow)
+                allow_count++;
+
+        LIST_FOREACH(socket_bind_items, item, deny)
+                deny_count++;
+
+        if (allow_count > SOCKET_BIND_MAX_RULES)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL),
+                                           "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES);
+
+        if (deny_count > SOCKET_BIND_MAX_RULES)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL),
+                                           "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES);
+
+        obj = socket_bind_bpf__open();
+        if (!obj)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "bpf-socket-bind: Failed to open BPF object: %m");
+
+        /* The maps are sized to at least one entry, even if a list is empty. */
+        if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_allow, MAX(allow_count, 1u)) != 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+                                           "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_allow));
+
+        if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_deny, MAX(deny_count, 1u)) != 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+                                           "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_deny));
+
+        if (socket_bind_bpf__load(obj) != 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno,
+                                           "bpf-socket-bind: Failed to load BPF object: %m");
+
+        allow_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_allow);
+        assert(allow_map_fd >= 0);
+
+        /* update_rules_map() hands us a real errno, hence include it in the message via %m. */
+        r = update_rules_map(allow_map_fd, allow);
+        if (r < 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+                                           "bpf-socket-bind: Failed to put socket bind allow rules into BPF map '%s': %m",
+                                           sym_bpf_map__name(obj->maps.sd_bind_allow));
+
+        deny_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_deny);
+        assert(deny_map_fd >= 0);
+
+        r = update_rules_map(deny_map_fd, deny);
+        if (r < 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+                                           "bpf-socket-bind: Failed to put socket bind deny rules into BPF map '%s': %m",
+                                           sym_bpf_map__name(obj->maps.sd_bind_deny));
+
+        *ret_obj = TAKE_PTR(obj);
+        return 0;
+}
+
+/* Probes whether socket bind filtering can be used: cgroup BPF must be supported, the
+ * BPF_PROG_TYPE_CGROUP_SOCK_ADDR probe must succeed, the BPF object must load with empty rule lists and
+ * bpf_can_link_program() must accept the sd_bind4 program. Returns the result as true/false. */
+int bpf_socket_bind_supported(void) {
+        _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+        int r;
+
+        if (!cgroup_bpf_supported())
+                return false;
+
+        if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, /*opts=*/NULL)) {
+                log_debug("bpf-socket-bind: BPF program type cgroup_sock_addr is not supported");
+                return false;
+        }
+
+        /* Probe without unit and without any rules */
+        r = prepare_socket_bind_bpf(/*u=*/NULL, /*allow=*/NULL, /*deny=*/NULL, &obj);
+        if (r >= 0)
+                return bpf_can_link_program(obj->progs.sd_bind4);
+
+        log_debug_errno(r, "bpf-socket-bind: socket bind filtering is not supported: %m");
+        return false;
+}
+
+/* Adds 'fd' to the unit's set of initial socket-bind link fds, allocating the set on first use. Returns
+ * 0 on success, the result of log_oom() if the set cannot be allocated, or the (logged) error of
+ * fdset_put(). */
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) {
+        int r;
+
+        assert(u);
+
+        if (!u->initial_socket_bind_link_fds) {
+                u->initial_socket_bind_link_fds = fdset_new();
+                if (!u->initial_socket_bind_link_fds)
+                        return log_oom();
+        }
+
+        /* fdset_put() hands us a real errno, hence include it in the message via %m. */
+        r = fdset_put(u->initial_socket_bind_link_fds, fd);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset: %m", fd);
+
+        return 0;
+}
+
+/* Loads the socket-bind BPF object with the unit's allow/deny rules and attaches its sd_bind4 and
+ * sd_bind6 programs to the unit's cgroup. On success the two links are stored in the unit. Returns 0 on
+ * success, and also if the unit has no cgroup context or no socket bind rules; a negative errno-style
+ * value otherwise. */
+static int socket_bind_install_impl(Unit *u) {
+        _cleanup_(bpf_link_freep) struct bpf_link *ipv4 = NULL, *ipv6 = NULL;
+        _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+        _cleanup_free_ char *cgroup_path = NULL;
+        _cleanup_close_ int cgroup_fd = -EBADF;
+        CGroupContext *cc;
+        int r;
+
+        assert(u);
+
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return 0;
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m");
+
+        /* No rules configured at all? Then there is nothing to install. */
+        if (!(cc->socket_bind_allow || cc->socket_bind_deny))
+                return 0;
+
+        r = prepare_socket_bind_bpf(u, cc->socket_bind_allow, cc->socket_bind_deny, &obj);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to load BPF object: %m");
+
+        cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC, 0);
+        if (cgroup_fd < 0)
+                return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path);
+
+        ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd);
+        r = sym_libbpf_get_error(ipv4);
+        if (r)
+                return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
+                                            sym_bpf_program__name(obj->progs.sd_bind4));
+
+        ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd);
+        r = sym_libbpf_get_error(ipv6);
+        if (r)
+                return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
+                                            sym_bpf_program__name(obj->progs.sd_bind6));
+
+        /* Both programs are attached: pass the links on to the unit and disarm our cleanup handlers. */
+        u->ipv4_socket_bind_link = ipv4;
+        ipv4 = NULL;
+        u->ipv6_socket_bind_link = ipv6;
+        ipv6 = NULL;
+
+        return 0;
+}
+
+/* Installs the socket bind programs for the unit and afterwards closes the unit's initial link fds,
+ * regardless of whether the installation succeeded. The one exception is -ENOMEM, in which case the
+ * initial link fds are left open. Returns the result of the installation. */
+int bpf_socket_bind_install(Unit *u) {
+        int r;
+
+        assert(u);
+
+        r = socket_bind_install_impl(u);
+        if (r != -ENOMEM)
+                fdset_close(u->initial_socket_bind_link_fds);
+
+        return r;
+}
+
+/* Serializes the unit's IPv4 and IPv6 socket-bind links via bpf_serialize_link(), IPv4 first. If the
+ * IPv4 link fails to serialize, its error is returned and the IPv6 link is not attempted. */
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+        int r;
+
+        assert(u);
+
+        r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link);
+        if (r >= 0)
+                r = bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link);
+
+        return r;
+}
+
+#else /* ! BPF_FRAMEWORK */
+int bpf_socket_bind_supported(void) { /* Variant built when BPF_FRAMEWORK is off: never supported. */
+        return false;
+}
+
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) { /* Variant built when BPF_FRAMEWORK is off: the fd is not stored, success is reported. */
+        return 0;
+}
+
+int bpf_socket_bind_install(Unit *u) { /* Variant built when BPF_FRAMEWORK is off: logs at debug level and fails with EOPNOTSUPP. */
+        return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                    "bpf-socket-bind: Failed to install; BPF framework is not supported");
+}
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { /* Variant built when BPF_FRAMEWORK is off: nothing to serialize, reports success. */
+        return 0;
+}
+#endif
diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h
new file mode 100644
index 0000000..7d426df
--- /dev/null
+++ b/src/core/bpf-socket-bind.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "fdset.h"
+#include "unit.h"
+
+int bpf_socket_bind_supported(void);
+
+/* Add a BPF link fd created before daemon-reload or daemon-reexec. The fds are closed at the end of
+ * bpf_socket_bind_install(). */
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd);
+
+int bpf_socket_bind_install(Unit *u);
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds);
diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c
new file mode 100644
index 0000000..6fe229e
--- /dev/null
+++ b/src/core/bpf-util.c
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-dlopen.h"
+#include "bpf-util.h"
+#include "cgroup-util.h"
+#include "initrd-util.h"
+#include "log.h"
+
+bool cgroup_bpf_supported(void) {
+        static int supported = -1; /* negative: not determined yet; otherwise the cached answer */
+        int unified, loaded;
+
+        if (supported >= 0)
+                return supported;
+
+        /* cgroup BPF needs the unified hierarchy: an error and a "no" both disable the feature. */
+        unified = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (unified <= 0) {
+                if (unified < 0)
+                        log_warning_errno(unified, "Can't determine whether the unified hierarchy is used: %m");
+                else
+                        log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Not running with unified cgroup hierarchy, disabling cgroup BPF features.");
+                return (supported = false);
+        }
+
+        /* libbpf is loaded at runtime; inside the initrd a failure is only logged at debug level. */
+        loaded = dlopen_bpf();
+        if (loaded < 0) {
+                log_full_errno(in_initrd() ? LOG_DEBUG : LOG_INFO,
+                               loaded, "Failed to open libbpf, cgroup BPF features disabled: %m");
+                return (supported = false);
+        }
+
+        return (supported = true);
+}
diff --git a/src/core/bpf-util.h b/src/core/bpf-util.h
new file mode 100644
index 0000000..a6c55cd
--- /dev/null
+++ b/src/core/bpf-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+bool cgroup_bpf_supported(void);
diff --git a/src/core/bpf/restrict_fs/meson.build b/src/core/bpf/restrict_fs/meson.build
new file mode 100644
index 0000000..69cde02
--- /dev/null
+++ b/src/core/bpf/restrict_fs/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1  # nothing in this directory is built without the BPF framework
+        subdir_done()
+endif
+
+restrict_fs_bpf_o_unstripped = custom_target(  # step 1: restrict-fs.bpf.c -> unstripped object
+        'restrict-fs.bpf.unstripped.o',
+        input : 'restrict-fs.bpf.c',
+        output : 'restrict-fs.bpf.unstripped.o',
+        command : bpf_o_unstripped_cmd)
+
+restrict_fs_bpf_o = custom_target(  # step 2: consumes the unstripped object from step 1
+        'restrict-fs.bpf.o',
+        input : restrict_fs_bpf_o_unstripped,
+        output : 'restrict-fs.bpf.o',
+        command : bpf_o_cmd)
+
+restrict_fs_skel_h = custom_target(  # step 3: consumes restrict-fs.bpf.o from step 2
+        'restrict-fs.skel.h',
+        input : restrict_fs_bpf_o,
+        output : 'restrict-fs.skel.h',
+        command : skel_h_cmd,
+        capture : true)  # the command's stdout becomes the output file
diff --git a/src/core/bpf/restrict_fs/restrict-fs-skel.h b/src/core/bpf/restrict_fs/restrict-fs-skel.h
new file mode 100644
index 0000000..412cf62
--- /dev/null
+++ b/src/core/bpf/restrict_fs/restrict-fs-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/restrict_fs/restrict-fs.skel.h"
diff --git a/src/core/bpf/restrict_fs/restrict-fs.bpf.c b/src/core/bpf/restrict_fs/restrict-fs.bpf.c
new file mode 100644
index 0000000..eb5ed3e
--- /dev/null
+++ b/src/core/bpf/restrict_fs/restrict-fs.bpf.c
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct super_block {
+        unsigned long int s_magic; /* filesystem magic number; read by restrict_filesystems */
+} __attribute__((preserve_access_index));
+
+struct inode {
+        struct super_block *i_sb; /* second hop of the file -> f_inode -> i_sb -> s_magic read */
+} __attribute__((preserve_access_index));
+
+struct file {
+        struct inode *f_inode; /* first hop; no other members of these kernel structs are declared */
+} __attribute__((preserve_access_index));
+
+/*
+ * Outer map: one inner map per cgroup. max_entries is set from user space with the
+ * bpf_map__set_max_entries helper. */
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+        __type(key, uint64_t);      /* cgroup ID */
+        __type(value, uint32_t);    /* inner map: fs magic set, plus the allow/deny flag at key 0 */
+} cgroup_hash SEC(".maps");
+
+SEC("lsm/file_open")
+int BPF_PROG(restrict_filesystems, struct file *file, int ret)
+{
+        unsigned long raw_magic_number;
+        uint64_t cgroup_id;
+        uint32_t *magic_map, magic_number, zero = 0, *is_allow;
+        /* Returns 0 to permit the open and -EPERM to refuse it. ret is the return value from the
+         * previous BPF program or 0 if it's the first hook; a non-zero ret is passed through
+         * unchanged. */
+        if (ret != 0)
+                return ret;
+
+        BPF_CORE_READ_INTO(&raw_magic_number, file, f_inode, i_sb, s_magic);
+        /* super_block.s_magic is unsigned long, but magic_map keys are
+         * uint32_t. Using s_magic as-is would fail on big-endian systems,
+         * which have 64-bit unsigned long. So cast it. */
+        magic_number = (uint32_t)raw_magic_number;
+
+        cgroup_id = bpf_get_current_cgroup_id();
+
+        magic_map = bpf_map_lookup_elem(&cgroup_hash, &cgroup_id);
+        if (!magic_map)
+                return 0;
+
+        is_allow = bpf_map_lookup_elem(magic_map, &zero);
+        if (!is_allow)
+                /* Malformed map, it doesn't include whether it's an allow list
+                 * or a deny list. Allow. */
+                return 0;
+
+        if (*is_allow) {
+                /* Allow-list: Allow access only if magic_number present in inner map */
+                if (!bpf_map_lookup_elem(magic_map, &magic_number))
+                        return -EPERM;
+        } else {
+                /* Deny-list: Allow access only if magic_number is not present in inner map */
+                if (bpf_map_lookup_elem(magic_map, &magic_number))
+                        return -EPERM;
+        }
+
+        return 0;
+}
+
+static const char _license[] SEC("license") = "GPL";
diff --git a/src/core/bpf/restrict_ifaces/meson.build b/src/core/bpf/restrict_ifaces/meson.build
new file mode 100644
index 0000000..5f36178
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1  # nothing in this directory is built without the BPF framework
+        subdir_done()
+endif
+
+restrict_ifaces_bpf_o_unstripped = custom_target(  # step 1: restrict-ifaces.bpf.c -> unstripped object
+        'restrict-ifaces.bpf.unstripped.o',
+        input : 'restrict-ifaces.bpf.c',
+        output : 'restrict-ifaces.bpf.unstripped.o',
+        command : bpf_o_unstripped_cmd)
+
+restrict_ifaces_bpf_o = custom_target(  # step 2: consumes the unstripped object from step 1
+        'restrict-ifaces.bpf.o',
+        input : restrict_ifaces_bpf_o_unstripped,
+        output : 'restrict-ifaces.bpf.o',
+        command : bpf_o_cmd)
+
+restrict_ifaces_skel_h = custom_target(  # step 3: consumes restrict-ifaces.bpf.o from step 2
+        'restrict-ifaces.skel.h',
+        input : restrict_ifaces_bpf_o,
+        output : 'restrict-ifaces.skel.h',
+        command : skel_h_cmd,
+        capture : true)  # the command's stdout becomes the output file
diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h
new file mode 100644
index 0000000..f937490
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/restrict_ifaces/restrict-ifaces.skel.h"
diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c
new file mode 100644
index 0000000..32cde5c
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* <linux/bpf.h> must precede <bpf/bpf_helpers.h> due to integer types
+ * in bpf helpers signatures.
+ */
+#include 
+#include 
+
+const volatile __u8 is_allow_list = 0;
+
+/* Map containing the network interfaces indexes.
+ * The interpretation of the map depends on the value of is_allow_list.
+ */
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __type(key, __u32);
+        __type(value, __u8);
+} sd_restrictif SEC(".maps");
+
+#define DROP 0
+#define PASS 1
+
+static __always_inline int restrict_network_interfaces_impl(const struct __sk_buff *sk) {
+        __u32 ifindex;
+        __u8 *lookup_result;
+        /* Verdict for a packet, keyed on the interface index found in the skb: PASS or DROP. */
+        ifindex = sk->ifindex;
+        lookup_result = bpf_map_lookup_elem(&sd_restrictif, &ifindex);
+        if (is_allow_list) {
+                /* allow-list: let the packet pass if iface in the list */
+                if (lookup_result)
+                        return PASS;
+        } else {
+                /* deny-list: let the packet pass if iface *not* in the list */
+                if (!lookup_result)
+                        return PASS;
+        }
+
+        return DROP;
+}
+
+SEC("cgroup_skb/egress")
+int sd_restrictif_e(const struct __sk_buff *sk) {
+        return restrict_network_interfaces_impl(sk); /* same verdict logic as the ingress program */
+}
+
+SEC("cgroup_skb/ingress")
+int sd_restrictif_i(const struct __sk_buff *sk) {
+        return restrict_network_interfaces_impl(sk); /* same verdict logic as the egress program */
+}
+
+static const char _license[] SEC("license") = "LGPL-2.1-or-later";
diff --git a/src/core/bpf/socket_bind/meson.build b/src/core/bpf/socket_bind/meson.build
new file mode 100644
index 0000000..05a2b9d
--- /dev/null
+++ b/src/core/bpf/socket_bind/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1  # nothing in this directory is built without the BPF framework
+        subdir_done()
+endif
+
+socket_bind_bpf_o_unstripped = custom_target(  # step 1: socket-bind.bpf.c -> unstripped object
+        'socket-bind.bpf.unstripped.o',
+        input : 'socket-bind.bpf.c',
+        output : 'socket-bind.bpf.unstripped.o',
+        command : bpf_o_unstripped_cmd)
+
+socket_bind_bpf_o = custom_target(  # step 2: consumes the unstripped object from step 1
+        'socket-bind.bpf.o',
+        input : socket_bind_bpf_o_unstripped,
+        output : 'socket-bind.bpf.o',
+        command : bpf_o_cmd)
+
+socket_bind_skel_h = custom_target(  # step 3: consumes socket-bind.bpf.o from step 2
+        'socket-bind.skel.h',
+        input : socket_bind_bpf_o,
+        output : 'socket-bind.skel.h',
+        command : skel_h_cmd,
+        capture : true)  # the command's stdout becomes the output file
diff --git a/src/core/bpf/socket_bind/socket-bind-api.bpf.h b/src/core/bpf/socket_bind/socket-bind-api.bpf.h
new file mode 100644
index 0000000..277b9bb
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind-api.bpf.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include 
+
+/*
+ * Bind rule is matched with socket fields accessible to cgroup/bind{4,6} hook
+ * through bpf_sock_addr struct.
+ * 'address_family' is expected to be one of AF_UNSPEC, AF_INET or AF_INET6.
+ * Matching by family is bypassed for rules with AF_UNSPEC set, which makes the
+ * rest of a rule applicable for both IPv4 and IPv6 addresses.
+ * If matching by family is either successful or bypassed, a rule and a socket
+ * are matched by ip protocol.
+ * If 'protocol' is 0, matching is bypassed.
+ * 'nr_ports' and 'port_min' fields specify a set of ports to match a user port
+ * with.
+ * If 'nr_ports' is 0, matching by port is bypassed, making that rule applicable
+ * to all possible ports, i.e. the [1, 65535] range. Thus a rule with
+ * 'address_family', 'protocol' and 'nr_ports' equal to AF_UNSPEC, 0 and 0
+ * respectively forms the 'allow any' or 'deny any' case.
+ * For positive 'nr_ports', a user port in the half-open range from 'port_min'
+ * (inclusive) to 'port_min' + 'nr_ports' (exclusive) is considered a match.
+ * 'nr_ports' equal to 1 forms a rule for a single port.
+ * Ports are in host order.
+ *
+ * Examples (in field order: address_family, protocol, nr_ports, port_min):
+ * AF_UNSPEC, 0, 1, 7777: match IPv4 and IPv6 addresses with 7777 user port;
+ *
+ * AF_INET, 0, 1023, 1: match IPv4 addresses with user port in [1, 1023]
+ * range inclusively;
+ *
+ * AF_INET6, 0, 0, 0: match IPv6 addresses;
+ *
+ * AF_UNSPEC, 0, 0, 0: match IPv4 and IPv6 addresses;
+ *
+ * AF_INET6, IPPROTO_TCP, 0, 0: match IPv6/TCP addresses.
+ */
+
+struct socket_bind_rule {
+        __u32 address_family; /* AF_UNSPEC, AF_INET or AF_INET6; AF_UNSPEC matches both families */
+        __u32 protocol;       /* IP protocol; 0 matches any protocol */
+        __u16 nr_ports;       /* number of ports in the range; 0 matches any port */
+        __u16 port_min;       /* first port of the range, in host order */
+};
+
+#define SOCKET_BIND_MAX_RULES 128
diff --git a/src/core/bpf/socket_bind/socket-bind-skel.h b/src/core/bpf/socket_bind/socket-bind-skel.h
new file mode 100644
index 0000000..e0d1626
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/socket_bind/socket-bind.skel.h"
diff --git a/src/core/bpf/socket_bind/socket-bind.bpf.c b/src/core/bpf/socket_bind/socket-bind.bpf.c
new file mode 100644
index 0000000..b7972a8
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind.bpf.c
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "socket-bind-api.bpf.h"
+/* <linux/types.h> must precede <bpf/bpf_helpers.h> because
+ * <bpf/bpf_helpers.h> does not pull in the type header by design.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * max_entries is set from user space with bpf_map__set_max_entries helper.
+ */
+struct socket_bind_map_t {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __type(key, __u32);
+        __type(value, struct socket_bind_rule);
+};
+
+enum socket_bind_action {
+        SOCKET_BIND_DENY = 0,  /* returned by sd_bind4/sd_bind6 when a deny rule matched */
+        SOCKET_BIND_ALLOW = 1, /* returned by sd_bind4/sd_bind6 in every other case */
+};
+
+struct socket_bind_map_t sd_bind_allow SEC(".maps");
+struct socket_bind_map_t sd_bind_deny SEC(".maps");
+
+static __always_inline bool match_af(__u8 address_family, const struct socket_bind_rule *r) {
+        /* A rule with AF_UNSPEC applies to every address family. */
+        return r->address_family == AF_UNSPEC || r->address_family == address_family;
+}
+
+static __always_inline bool match_protocol(__u32 protocol, const struct socket_bind_rule *r) {
+        /* A rule protocol of 0 acts as a wildcard. */
+        return r->protocol == 0 || protocol == r->protocol;
+}
+
+static __always_inline bool match_user_port(__u16 port, const struct socket_bind_rule *r) {
+        /* nr_ports == 0 matches any port; otherwise the half-open range [port_min, port_min + nr_ports). */
+        __u32 range_end = r->port_min + (__u32) r->nr_ports;
+        return r->nr_ports == 0 || (port >= r->port_min && port < range_end);
+}
+
+static __always_inline bool match(
+                __u8 address_family, __u32 protocol, __u16 port,
+                const struct socket_bind_rule *r) {
+        /* A rule matches only if family, protocol and port all match; evaluation stops at the
+         * first criterion that fails. */
+        if (!match_af(address_family, r) || !match_protocol(protocol, r))
+                return false;
+        return match_user_port(port, r);
+}
+
+static __always_inline bool match_rules(
+                struct bpf_sock_addr *ctx,
+                struct socket_bind_map_t *rules) {
+        volatile __u32 user_port = ctx->user_port; /* NOTE(review): reason for volatile is not visible here — confirm before removing */
+        __u16 port = (__u16)bpf_ntohs(user_port); /* converted with bpf_ntohs() before comparing against rule ports */
+        /* Returns true as soon as one rule in 'rules' matches; at most SOCKET_BIND_MAX_RULES are tried. */
+        for (__u32 i = 0; i < SOCKET_BIND_MAX_RULES; ++i) {
+                const __u32 key = i;
+                const struct socket_bind_rule *rule = bpf_map_lookup_elem(rules, &key);
+
+                /* Lookup returns NULL if iterator is advanced past the last
+                 * element put in the map. */
+                if (!rule)
+                        break;
+
+                if (match(ctx->user_family, ctx->protocol, port, rule))
+                        return true;
+        }
+
+        return false;
+}
+
+static __always_inline int bind_socket(struct bpf_sock_addr *ctx) {
+        /* The allow list is consulted first and wins over the deny list; a bind() that matches
+         * neither list is allowed. */
+        if (match_rules(ctx, &sd_bind_allow))
+                return SOCKET_BIND_ALLOW;
+
+        return match_rules(ctx, &sd_bind_deny)
+                ? SOCKET_BIND_DENY : SOCKET_BIND_ALLOW;
+}
+
+SEC("cgroup/bind4")
+int sd_bind4(struct bpf_sock_addr *ctx) {
+        /* The rules are only applied when both user_family and family are AF_INET. */
+        if (ctx->user_family == AF_INET && ctx->family == AF_INET)
+                return bind_socket(ctx);
+        return SOCKET_BIND_ALLOW;
+}
+
+SEC("cgroup/bind6")
+int sd_bind6(struct bpf_sock_addr *ctx) {
+        /* The rules are only applied when both user_family and family are AF_INET6. */
+        if (ctx->user_family == AF_INET6 && ctx->family == AF_INET6)
+                return bind_socket(ctx);
+        return SOCKET_BIND_ALLOW;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
new file mode 100644
index 0000000..61ac4df
--- /dev/null
+++ b/src/core/cgroup.c
@@ -0,0 +1,4665 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "bpf-devices.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bpf-socket-bind.h"
+#include "btrfs-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "in-addr-prefix-util.h"
+#include "inotify-util.h"
+#include "io-util.h"
+#include "ip-protocol-list.h"
+#include "limits-util.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "procfs-util.h"
+#include "restrict-ifaces.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "virt.h"
+
+#if BPF_FRAMEWORK
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+#endif
+
+#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
+
+/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
+ * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
+ * out specific attributes from us. */
+#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
+
+uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
+        /* scale == 0: 'value' is returned as is; otherwise both go through system_tasks_max_scale(). */
+        return tasks_max->scale != 0
+                ? system_tasks_max_scale(tasks_max->value, tasks_max->scale)
+                : tasks_max->value;
+}
+
+bool manager_owns_host_root_cgroup(Manager *m) {
+        assert(m);
+
+        /* Tells whether this manager is in charge of the host's root cgroup. Comparing the cgroup root
+         * path with "/" is not sufficient, because that also holds when CLONE_NEWCGROUP is in the mix.
+         * Since there appears to be no nice way to detect a CLONE_NEWCGROUP namespace, any kind of
+         * container virtualization is treated as "not the host". */
+
+        /* A user manager never owns the root cgroup, and neither does one running in a container.
+         * detect_container() is only called when the first check is false. */
+        if (MANAGER_IS_USER(m) || detect_container() > 0)
+                return false;
+
+        /* Otherwise the answer depends solely on whether our cgroup root is empty or "/". */
+        return empty_or_root(m->cgroup_root);
+}
+
+bool unit_has_startup_cgroup_constraints(Unit *u) {
+        assert(u);
+
+        /* Returns true if this unit has any directives which apply during startup/shutdown phases.
+         * Units without a cgroup context never have any. */
+        CGroupContext *c = unit_get_cgroup_context(u);
+        if (!c)
+                return false;
+
+        /* Every startup_* setting of CGroupContext has to be listed here. This includes
+         * startup_cpu_weight, whose "unset" value is CGROUP_WEIGHT_INVALID (see cgroup_context_init()). */
+        return c->startup_cpu_weight != CGROUP_WEIGHT_INVALID ||
+               c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
+               c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
+               c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+               c->startup_cpuset_cpus.set ||
+               c->startup_cpuset_mems.set ||
+               c->startup_memory_high_set ||
+               c->startup_memory_max_set ||
+               c->startup_memory_swap_max_set ||
+               c->startup_memory_zswap_max_set ||
+               c->startup_memory_low_set;
+}
+
+bool unit_has_host_root_cgroup(Unit *u) {
+        bool manager_owns_root;
+
+        assert(u);
+
+        /* A unit manages the host's root cgroup only if both conditions hold: its manager owns that
+         * cgroup, and the unit itself is the root slice. */
+        manager_owns_root = manager_owns_host_root_cgroup(u->manager);
+
+        return manager_owns_root && unit_has_name(u, SPECIAL_ROOT_SLICE);
+}
+
+static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
+        /* Writes 'value' to 'attribute' of the unit's cgroup. A failure is logged (showing only the
+         * first line of 'value') and the error code is still returned to the caller. */
+        int r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
+        if (r >= 0)
+                return r;
+        log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
+                            strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value);
+        return r;
+}
+
+static void cgroup_compat_warn(void) {
+        /* Emits the compatibility-translation warning on the first call only. */
+        static bool cgroup_compat_warned = false;
+
+        if (!cgroup_compat_warned) {
+                log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
+                            "See cgroup-compat debug messages for details.");
+
+                cgroup_compat_warned = true;
+        }
+}
+
+#define log_cgroup_compat(unit, fmt, ...) do {                                  \
+                cgroup_compat_warn();                                           \
+                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
+        } while (false)
+
+void cgroup_context_init(CGroupContext *c) {
+        assert(c);
+
+        /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
+         * sure to serialize in execute-serialize.c using serialize_bool() instead of
+         * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
+         * skips serialization if the value is 'false' (as that's the common default), so if the value at
+         * runtime is zero it would be lost after deserialization. Same when initializing uint64_t and other
+         * values, update/add a conditional serialization check. This is to minimize the amount of
+         * serialized data that is sent to the sd-executor, so that there is less work to do on the default
+         * cases. */
+
+        *c = (CGroupContext) { /* members not listed below are zero-initialized by the compound literal */
+                .cpu_weight = CGROUP_WEIGHT_INVALID,
+                .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
+                .cpu_quota_per_sec_usec = USEC_INFINITY,
+                .cpu_quota_period_usec = USEC_INFINITY,
+
+                .cpu_shares = CGROUP_CPU_SHARES_INVALID,
+                .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
+
+                .memory_high = CGROUP_LIMIT_MAX, /* every memory limit starts out at CGROUP_LIMIT_MAX */
+                .startup_memory_high = CGROUP_LIMIT_MAX,
+                .memory_max = CGROUP_LIMIT_MAX,
+                .startup_memory_max = CGROUP_LIMIT_MAX,
+                .memory_swap_max = CGROUP_LIMIT_MAX,
+                .startup_memory_swap_max = CGROUP_LIMIT_MAX,
+                .memory_zswap_max = CGROUP_LIMIT_MAX,
+                .startup_memory_zswap_max = CGROUP_LIMIT_MAX,
+
+                .memory_limit = CGROUP_LIMIT_MAX,
+
+                .io_weight = CGROUP_WEIGHT_INVALID,
+                .startup_io_weight = CGROUP_WEIGHT_INVALID,
+
+                .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
+                .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
+
+                .tasks_max = CGROUP_TASKS_MAX_UNSET,
+
+                .moom_swap = MANAGED_OOM_AUTO, /* managed-OOM settings */
+                .moom_mem_pressure = MANAGED_OOM_AUTO,
+                .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
+
+                .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
+                .memory_pressure_threshold_usec = USEC_INFINITY,
+        };
+}
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
+        assert(c);
+        assert(a);
+        /* Unlink 'a' from c->device_allow before releasing its path string and the entry itself. */
+        LIST_REMOVE(device_allow, c->device_allow, a);
+        free(a->path);
+        free(a);
+}
+
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
+        assert(c);
+        assert(w);
+        /* Unlink 'w' from c->io_device_weights before releasing its path string and the entry itself. */
+        LIST_REMOVE(device_weights, c->io_device_weights, w);
+        free(w->path);
+        free(w);
+}
+
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
+        assert(c);
+        assert(l);
+        /* Unlink 'l' from c->io_device_latencies before releasing its path string and the entry itself. */
+        LIST_REMOVE(device_latencies, c->io_device_latencies, l);
+        free(l->path);
+        free(l);
+}
+
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
+        assert(c);
+        assert(l);
+        /* Unlink 'l' from c->io_device_limits before releasing its path string and the entry itself. */
+        LIST_REMOVE(device_limits, c->io_device_limits, l);
+        free(l->path);
+        free(l);
+}
+
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
+        assert(c);
+        assert(w);
+        /* Unlink 'w' from c->blockio_device_weights before releasing its path string and the entry itself. */
+        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
+        free(w->path);
+        free(w);
+}
+
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
+        assert(c);
+        assert(b);
+        /* Unlink 'b' from c->blockio_device_bandwidths before releasing its path string and the entry itself. */
+        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
+        free(b->path);
+        free(b);
+}
+
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
+        assert(c);
+        assert(p);
+        /* Unlink 'p' from c->bpf_foreign_programs before releasing its bpffs path and the entry itself. */
+        LIST_REMOVE(programs, c->bpf_foreign_programs, p);
+        free(p->bpffs_path);
+        free(p);
+}
+
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
+        assert(head);
+        /* Drops every item of the list at *head via LIST_CLEAR(), with free() as the per-item destructor. */
+        LIST_CLEAR(socket_bind_items, *head, free);
+}
+
+void cgroup_context_done(CGroupContext *c) {
+        assert(c);
+        /* Releases everything the context owns; the CGroupContext struct itself is not freed here. */
+        while (c->io_device_weights) /* each helper unlinks the list head, so these loops terminate */
+                cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+        while (c->io_device_latencies)
+                cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+        while (c->io_device_limits)
+                cgroup_context_free_io_device_limit(c, c->io_device_limits);
+
+        while (c->blockio_device_weights)
+                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+        while (c->blockio_device_bandwidths)
+                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
+
+        while (c->device_allow)
+                cgroup_context_free_device_allow(c, c->device_allow);
+
+        cgroup_context_remove_socket_bind(&c->socket_bind_allow);
+        cgroup_context_remove_socket_bind(&c->socket_bind_deny);
+
+        c->ip_address_allow = set_free(c->ip_address_allow); /* members are overwritten with the free helper's return value */
+        c->ip_address_deny = set_free(c->ip_address_deny);
+
+        c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
+        c->ip_filters_egress = strv_free(c->ip_filters_egress);
+
+        while (c->bpf_foreign_programs)
+                cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+        c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+
+        cpu_set_reset(&c->cpuset_cpus);
+        cpu_set_reset(&c->startup_cpuset_cpus);
+        cpu_set_reset(&c->cpuset_mems);
+        cpu_set_reset(&c->startup_cpuset_mems);
+
+        c->delegate_subgroup = mfree(c->delegate_subgroup);
+
+        nft_set_context_clear(&c->nft_set_context);
+}
+
+static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
+        assert(u);
+
+        /* Without a realized cgroup there is no attribute file to read: -EOWNERDEAD. */
+        return u->cgroup_realized
+                ? cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret)
+                : -EOWNERDEAD;
+}
+
+static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
+        CGroupContext *c;
+        CGroupMask m;
+        const char *file;
+        uint64_t unit_value;
+        int r;
+
+        /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
+         * return -ENODATA) on cgroup v1.
+         *
+         * Returns:
+         *
+         * <0: On error.
+         *  0: If the kernel memory setting doesn't match our configuration.
+         * >0: If the kernel memory setting matches our configuration.
+         *
+         * The following values are only guaranteed to be populated on return >=0:
+         *
+         * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
+         * - ret_kernel_value will contain the actual value presented by the kernel. */
+
+        assert(u);
+
+        r = cg_all_unified();
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
+
+        /* Unsupported on v1.
+         *
+         * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
+         * silently masked the controller. */
+        if (r == 0)
+                return -ENODATA;
+
+        /* The root slice doesn't have any controller files, so we can't compare anything. */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return -ENODATA;
+
+        /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
+         * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
+         * avoid specious errors in these scenarios, check that we even expect the memory controller to be
+         * enabled at all. */
+        m = unit_get_target_mask(u);
+        if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
+                return -ENODATA;
+
+        assert_se(c = unit_get_cgroup_context(u));
+
+        bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        if (streq(property_name, "MemoryLow")) {
+                unit_value = unit_get_ancestor_memory_low(u);
+                file = "memory.low";
+        } else if (startup && streq(property_name, "StartupMemoryLow")) {
+                unit_value = unit_get_ancestor_startup_memory_low(u);
+                file = "memory.low";
+        } else if (streq(property_name, "MemoryMin")) {
+                unit_value = unit_get_ancestor_memory_min(u);
+                file = "memory.min";
+        } else if (streq(property_name, "MemoryHigh")) {
+                unit_value = c->memory_high;
+                file = "memory.high";
+        } else if (startup && streq(property_name, "StartupMemoryHigh")) {
+                unit_value = c->startup_memory_high;
+                file = "memory.high";
+        } else if (streq(property_name, "MemoryMax")) {
+                unit_value = c->memory_max;
+                file = "memory.max";
+        } else if (startup && streq(property_name, "StartupMemoryMax")) {
+                unit_value = c->startup_memory_max;
+                file = "memory.max";
+        } else if (streq(property_name, "MemorySwapMax")) {
+                unit_value = c->memory_swap_max;
+                file = "memory.swap.max";
+        } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
+                unit_value = c->startup_memory_swap_max;
+                file = "memory.swap.max";
+        } else if (streq(property_name, "MemoryZSwapMax")) {
+                unit_value = c->memory_zswap_max;
+                file = "memory.zswap.max";
+        } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
+                unit_value = c->startup_memory_zswap_max;
+                file = "memory.zswap.max";
+        } else
+                return -EINVAL;
+
+        r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
+
+        /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
+         * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
+         * our internal page-counting. To support those future kernels, just check the value itself first
+         * without any page-alignment. */
+        if (*ret_kernel_value == unit_value) {
+                *ret_unit_value = unit_value;
+                return 1;
+        }
+
+        /* The current kernel behaviour, by comparison, is that even if you write a particular number of
+         * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
+         * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
+         * cricket. */
+        if (unit_value != CGROUP_LIMIT_MAX)
+                unit_value = PAGE_ALIGN_DOWN(unit_value);
+
+        *ret_unit_value = unit_value;
+
+        return *ret_kernel_value == *ret_unit_value;
+}
+
+#define FORMAT_CGROUP_DIFF_MAX 128
+
+/* Formats into buf (of size l) a suffix describing how the kernel's value for the memory property
+ * 'property_name' relates to the value configured on unit u, and returns buf:
+ *
+ * - empty string if unit_compare_memory_limit() reports a match (> 0), or fails with -ENODATA or
+ *   -EOWNERDEAD, or fails with -ENOENT for one of the swap/zswap properties,
+ * - " (error getting kernel value: ...)" for any other failure,
+ * - " (different value in kernel: N)" if the comparison reports a mismatch (0). */
+static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
+        uint64_t kernel_value, unit_value;
+        int r;
+
+        assert(u);
+        assert(buf);
+        assert(l > 0);
+
+        r = unit_compare_memory_limit(u, property_name, &unit_value, &kernel_value);
+        if (r == 0) {
+                /* Comparison worked, but the two sides disagree. */
+                (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kernel_value);
+                return buf;
+        }
+
+        if (r > 0 || r == -ENODATA || r == -EOWNERDEAD) {
+                buf[0] = 0;
+                return buf;
+        }
+
+        /* memory.swap.max depends on CONFIG_MEMCG_SWAP (and the default swapaccount=1), and
+         * memory.zswap.max depends on CONFIG_ZSWAP. Since we cannot reliably detect whether those are
+         * available, a missing file is not worth complaining about for these properties. */
+        if (r == -ENOENT && STR_IN_SET(property_name,
+                                       "MemorySwapMax",
+                                       "StartupMemorySwapMax",
+                                       "MemoryZSwapMax",
+                                       "StartupMemoryZSwapMax")) {
+                buf[0] = 0;
+                return buf;
+        }
+
+        /* Any remaining outcome is a genuine error; %m picks it up from errno. */
+        errno = -r;
+        (void) snprintf(buf, l, " (error getting kernel value: %m)");
+        return buf;
+}
+
+/* Maps a device permission bit mask to its textual form ("", "r", "w", "m", "rw", ...). Returns NULL
+ * if p lies outside of [0, _CGROUP_DEVICE_PERMISSIONS_MAX). */
+const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
+        /* One entry per possible combination of the three flags. That is only 8 entries, so a plain
+         * lookup table is fine; should more flags ever be added this needs to be revisited. */
+        static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
+                [0]                                                          = "",
+                [CGROUP_DEVICE_READ]                                         = "r",
+                [CGROUP_DEVICE_WRITE]                                        = "w",
+                [CGROUP_DEVICE_MKNOD]                                        = "m",
+                [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE]                     = "rw",
+                [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD]                     = "rm",
+                [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD]                    = "wm",
+                [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
+        };
+
+        if (p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX)
+                return table[p];
+
+        return NULL;
+}
+
+/* Parses a string made up of the characters 'r', 'w' and 'm' (any order, repetitions allowed) into a
+ * permission bit mask. The empty string yields 0. Returns _CGROUP_DEVICE_PERMISSIONS_INVALID if s is
+ * NULL or contains any other character. */
+CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
+        CGroupDevicePermissions mask = 0;
+
+        if (!s)
+                return _CGROUP_DEVICE_PERMISSIONS_INVALID;
+
+        while (*s)
+                switch (*(s++)) {
+
+                case 'r':
+                        mask |= CGROUP_DEVICE_READ;
+                        break;
+
+                case 'w':
+                        mask |= CGROUP_DEVICE_WRITE;
+                        break;
+
+                case 'm':
+                        mask |= CGROUP_DEVICE_MKNOD;
+                        break;
+
+                default:
+                        return _CGROUP_DEVICE_PERMISSIONS_INVALID;
+                }
+
+        return mask;
+}
+
+/* Writes a human-readable dump of the cgroup context of unit u to f, one "Name: value" record per
+ * line, each line starting with 'prefix' (NULL is treated as the empty string).
+ *
+ * The memory limit records additionally carry the suffix produced by
+ * format_cgroup_memory_limit_comparison(), i.e. a note if the kernel's value differs from ours or
+ * could not be read. List-like settings (device rules, IO limits, IP address sets, ...) are emitted
+ * as one record per entry after the fixed block of scalar settings. */
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
+        _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
+        CGroupContext *c;
+        struct in_addr_prefix *iaai;
+
+        /* One scratch buffer per memory property below, since all of them are consumed by the same
+         * fprintf() call and hence must stay valid simultaneously. */
+        char cda[FORMAT_CGROUP_DIFF_MAX];
+        char cdb[FORMAT_CGROUP_DIFF_MAX];
+        char cdc[FORMAT_CGROUP_DIFF_MAX];
+        char cdd[FORMAT_CGROUP_DIFF_MAX];
+        char cde[FORMAT_CGROUP_DIFF_MAX];
+        char cdf[FORMAT_CGROUP_DIFF_MAX];
+        char cdg[FORMAT_CGROUP_DIFF_MAX];
+        char cdh[FORMAT_CGROUP_DIFF_MAX];
+        char cdi[FORMAT_CGROUP_DIFF_MAX];
+        char cdj[FORMAT_CGROUP_DIFF_MAX];
+        char cdk[FORMAT_CGROUP_DIFF_MAX];
+
+        assert(u);
+        assert(f);
+
+        assert_se(c = unit_get_cgroup_context(u));
+
+        prefix = strempty(prefix);
+
+        /* Best effort: on failure the strings stay NULL and are shown as empty/"no" below. */
+        (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
+        (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
+
+        /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
+        const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";
+
+        cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
+        startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
+        cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
+        startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
+
+        fprintf(f,
+                "%sCPUAccounting: %s\n"
+                "%sIOAccounting: %s\n"
+                "%sBlockIOAccounting: %s\n"
+                "%sMemoryAccounting: %s\n"
+                "%sTasksAccounting: %s\n"
+                "%sIPAccounting: %s\n"
+                "%sCPUWeight: %" PRIu64 "\n"
+                "%sStartupCPUWeight: %" PRIu64 "\n"
+                "%sCPUShares: %" PRIu64 "\n"
+                "%sStartupCPUShares: %" PRIu64 "\n"
+                "%sCPUQuotaPerSecSec: %s\n"
+                "%sCPUQuotaPeriodSec: %s\n"
+                "%sAllowedCPUs: %s\n"
+                "%sStartupAllowedCPUs: %s\n"
+                "%sAllowedMemoryNodes: %s\n"
+                "%sStartupAllowedMemoryNodes: %s\n"
+                "%sIOWeight: %" PRIu64 "\n"
+                "%sStartupIOWeight: %" PRIu64 "\n"
+                "%sBlockIOWeight: %" PRIu64 "\n"
+                "%sStartupBlockIOWeight: %" PRIu64 "\n"
+                "%sDefaultMemoryMin: %" PRIu64 "\n"
+                "%sDefaultMemoryLow: %" PRIu64 "\n"
+                "%sMemoryMin: %" PRIu64 "%s\n"
+                "%sMemoryLow: %" PRIu64 "%s\n"
+                "%sStartupMemoryLow: %" PRIu64 "%s\n"
+                "%sMemoryHigh: %" PRIu64 "%s\n"
+                "%sStartupMemoryHigh: %" PRIu64 "%s\n"
+                "%sMemoryMax: %" PRIu64 "%s\n"
+                "%sStartupMemoryMax: %" PRIu64 "%s\n"
+                "%sMemorySwapMax: %" PRIu64 "%s\n"
+                "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
+                "%sMemoryZSwapMax: %" PRIu64 "%s\n"
+                "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
+                "%sMemoryLimit: %" PRIu64 "\n"
+                "%sTasksMax: %" PRIu64 "\n"
+                "%sDevicePolicy: %s\n"
+                "%sDisableControllers: %s\n"
+                "%sDelegate: %s\n"
+                "%sManagedOOMSwap: %s\n"
+                "%sManagedOOMMemoryPressure: %s\n"
+                "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
+                "%sManagedOOMPreference: %s\n"
+                "%sMemoryPressureWatch: %s\n"
+                "%sCoredumpReceive: %s\n",
+                prefix, yes_no(c->cpu_accounting),
+                prefix, yes_no(c->io_accounting),
+                prefix, yes_no(c->blockio_accounting),
+                prefix, yes_no(c->memory_accounting),
+                prefix, yes_no(c->tasks_accounting),
+                prefix, yes_no(c->ip_accounting),
+                prefix, c->cpu_weight,
+                prefix, c->startup_cpu_weight,
+                prefix, c->cpu_shares,
+                prefix, c->startup_cpu_shares,
+                prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
+                prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
+                prefix, strempty(cpuset_cpus),
+                prefix, strempty(startup_cpuset_cpus),
+                prefix, strempty(cpuset_mems),
+                prefix, strempty(startup_cpuset_mems),
+                prefix, c->io_weight,
+                prefix, c->startup_io_weight,
+                prefix, c->blockio_weight,
+                prefix, c->startup_blockio_weight,
+                prefix, c->default_memory_min,
+                prefix, c->default_memory_low,
+                prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
+                prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
+                prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "StartupMemoryLow"),
+                prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryHigh"),
+                prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "StartupMemoryHigh"),
+                prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdf, sizeof(cdf), u, "MemoryMax"),
+                prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(cdg, sizeof(cdg), u, "StartupMemoryMax"),
+                prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cdh, sizeof(cdh), u, "MemorySwapMax"),
+                prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(cdi, sizeof(cdi), u, "StartupMemorySwapMax"),
+                prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(cdj, sizeof(cdj), u, "MemoryZSwapMax"),
+                prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(cdk, sizeof(cdk), u, "StartupMemoryZSwapMax"),
+                prefix, c->memory_limit,
+                prefix, cgroup_tasks_max_resolve(&c->tasks_max),
+                prefix, cgroup_device_policy_to_string(c->device_policy),
+                prefix, strempty(disable_controllers_str),
+                prefix, delegate_str,
+                prefix, managed_oom_mode_to_string(c->moom_swap),
+                prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
+                prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
+                prefix, managed_oom_preference_to_string(c->moom_preference),
+                prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
+                prefix, yes_no(c->coredump_receive));
+
+        /* Optional scalar settings: only shown when configured. */
+        if (c->delegate_subgroup)
+                fprintf(f, "%sDelegateSubgroup: %s\n",
+                        prefix, c->delegate_subgroup);
+
+        if (c->memory_pressure_threshold_usec != USEC_INFINITY)
+                fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
+
+        LIST_FOREACH(device_allow, a, c->device_allow)
+                /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */
+                fprintf(f,
+                        "%sDeviceAllow: %s %s\n",
+                        prefix,
+                        a->path,
+                        strna(cgroup_device_permissions_to_string(a->permissions)));
+
+        LIST_FOREACH(device_weights, iw, c->io_device_weights)
+                fprintf(f,
+                        "%sIODeviceWeight: %s %" PRIu64 "\n",
+                        prefix,
+                        iw->path,
+                        iw->weight);
+
+        LIST_FOREACH(device_latencies, l, c->io_device_latencies)
+                fprintf(f,
+                        "%sIODeviceLatencyTargetSec: %s %s\n",
+                        prefix,
+                        l->path,
+                        FORMAT_TIMESPAN(l->target_usec, 1));
+
+        /* Only limits that deviate from their default are listed. */
+        LIST_FOREACH(device_limits, il, c->io_device_limits)
+                for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+                        if (il->limits[type] != cgroup_io_limit_defaults[type])
+                                fprintf(f,
+                                        "%s%s: %s %s\n",
+                                        prefix,
+                                        cgroup_io_limit_type_to_string(type),
+                                        il->path,
+                                        FORMAT_BYTES(il->limits[type]));
+
+        /* Each record is terminated by a newline, like every other record of this dump; without it
+         * the next record would be glued onto this line. */
+        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
+                fprintf(f,
+                        "%sBlockIODeviceWeight: %s %" PRIu64 "\n",
+                        prefix,
+                        w->path,
+                        w->weight);
+
+        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+                if (b->rbps != CGROUP_LIMIT_MAX)
+                        fprintf(f,
+                                "%sBlockIOReadBandwidth: %s %s\n",
+                                prefix,
+                                b->path,
+                                FORMAT_BYTES(b->rbps));
+                if (b->wbps != CGROUP_LIMIT_MAX)
+                        fprintf(f,
+                                "%sBlockIOWriteBandwidth: %s %s\n",
+                                prefix,
+                                b->path,
+                                FORMAT_BYTES(b->wbps));
+        }
+
+        SET_FOREACH(iaai, c->ip_address_allow)
+                fprintf(f, "%sIPAddressAllow: %s\n", prefix,
+                        IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+        SET_FOREACH(iaai, c->ip_address_deny)
+                fprintf(f, "%sIPAddressDeny: %s\n", prefix,
+                        IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+
+        STRV_FOREACH(path, c->ip_filters_ingress)
+                fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
+        STRV_FOREACH(path, c->ip_filters_egress)
+                fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
+
+        /* Newline-terminated for the same reason as BlockIODeviceWeight above. */
+        LIST_FOREACH(programs, p, c->bpf_foreign_programs)
+                fprintf(f, "%sBPFProgram: %s:%s\n",
+                        prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);
+
+        if (c->socket_bind_allow) {
+                fprintf(f, "%sSocketBindAllow: ", prefix);
+                cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
+                fputc('\n', f);
+        }
+
+        if (c->socket_bind_deny) {
+                fprintf(f, "%sSocketBindDeny: ", prefix);
+                cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
+                fputc('\n', f);
+        }
+
+        if (c->restrict_network_interfaces) {
+                char *iface;
+                SET_FOREACH(iface, c->restrict_network_interfaces)
+                        fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
+        }
+
+        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
+                fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
+                        nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
+}
+
+/* Writes a single socket bind rule to f in the form "[FAMILY:][PROTOCOL:]PORTS", where PORTS is
+ * "any" (nr_ports == 0), a single port (nr_ports == 1) or a "MIN-MAX" range. No trailing separator
+ * or newline is written. */
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
+        const char *af, *af_sep, *proto, *proto_sep;
+
+        /* The address family part (and its colon) is omitted if there is no name for it. */
+        af = strempty(af_to_ipv4_ipv6(item->address_family));
+        af_sep = isempty(af) ? "" : ":";
+
+        /* Likewise the protocol part is only shown if a protocol is set. */
+        if (item->ip_protocol == 0) {
+                proto = "";
+                proto_sep = "";
+        } else {
+                proto = ip_protocol_to_tcp_udp(item->ip_protocol);
+                proto_sep = ":";
+        }
+
+        switch (item->nr_ports) {
+
+        case 0:
+                fprintf(f, "%s%s%s%sany", af, af_sep, proto, proto_sep);
+                break;
+
+        case 1:
+                fprintf(f, "%s%s%s%s%" PRIu16, af, af_sep, proto, proto_sep, item->port_min);
+                break;
+
+        default: {
+                uint16_t port_max = item->port_min + item->nr_ports - 1;
+
+                fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, af, af_sep, proto, proto_sep,
+                        item->port_min, port_max);
+                break;
+        }}
+}
+
+/* Writes all socket bind rules of the list 'items' to f, separated by single spaces. Nothing is
+ * written for an empty list, and no trailing newline is added. */
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
+        bool need_separator = false;
+
+        LIST_FOREACH(socket_bind_items, entry, items) {
+                /* A separator goes in front of every entry but the first one. */
+                if (need_separator)
+                        fputc(' ', f);
+                need_separator = true;
+
+                cgroup_context_dump_socket_bind_item(entry, f);
+        }
+}
+
+/* Prepends a new device allow rule for 'dev' with permissions 'p' to c->device_allow. An empty
+ * permission mask (0) is stored as _CGROUP_DEVICE_PERMISSIONS_ALL. The path string is copied.
+ *
+ * Returns 0 on success, -ENOMEM if memory allocation fails (the list is left untouched then). */
+int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
+        _cleanup_free_ CGroupDeviceAllow *entry = NULL;
+        _cleanup_free_ char *path = NULL;
+
+        assert(c);
+        assert(dev);
+        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+        path = strdup(dev);
+        if (!path)
+                return -ENOMEM;
+
+        entry = new(CGroupDeviceAllow, 1);
+        if (!entry)
+                return -ENOMEM;
+
+        *entry = (CGroupDeviceAllow) {
+                .path = TAKE_PTR(path),
+                .permissions = p == 0 ? _CGROUP_DEVICE_PERMISSIONS_ALL : p,
+        };
+
+        /* The list owns the entry from here on, hence disarm the cleanup handler. */
+        LIST_PREPEND(device_allow, c->device_allow, entry);
+        TAKE_PTR(entry);
+
+        return 0;
+}
+
+/* Like cgroup_context_add_device_allow(), but if a rule whose path compares equal to 'dev' (per
+ * path_equal()) already exists, the permissions of the first such rule are overwritten instead of
+ * adding a second rule. An empty permission mask (0) is treated as
+ * _CGROUP_DEVICE_PERMISSIONS_ALL.
+ *
+ * Returns 0 on success, or whatever cgroup_context_add_device_allow() returns when a new rule has
+ * to be added. */
+int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
+        CGroupDeviceAllow *existing = NULL;
+
+        assert(c);
+        assert(dev);
+        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+        if (p == 0)
+                p = _CGROUP_DEVICE_PERMISSIONS_ALL;
+
+        LIST_FOREACH(device_allow, i, c->device_allow) {
+                if (!path_equal(i->path, dev))
+                        continue;
+
+                existing = i;
+                break;
+        }
+
+        if (!existing)
+                return cgroup_context_add_device_allow(c, dev, p);
+
+        existing->permissions = p;
+        return 0;
+}
+
+/* Prepends a foreign BPF program entry (attach type plus a copy of the bpffs path) to
+ * c->bpf_foreign_programs.
+ *
+ * Returns 0 on success, -EINVAL (logged at error level) if bpffs_path is not both normalized and
+ * absolute, or the result of log_oom() if memory allocation fails. */
+int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
+        _cleanup_free_ char *path_copy = NULL;
+        CGroupBPFForeignProgram *prog;
+
+        assert(c);
+        assert(bpffs_path);
+
+        if (!(path_is_normalized(bpffs_path) && path_is_absolute(bpffs_path)))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m");
+
+        path_copy = strdup(bpffs_path);
+        if (!path_copy)
+                return log_oom();
+
+        prog = new(CGroupBPFForeignProgram, 1);
+        if (!prog)
+                return log_oom();
+
+        *prog = (CGroupBPFForeignProgram) {
+                .attach_type = attach_type,
+                .bpffs_path = TAKE_PTR(path_copy),
+        };
+
+        /* Nothing can fail anymore, so hand the entry over to the list right away. */
+        LIST_PREPEND(programs, c->bpf_foreign_programs, prog);
+
+        return 0;
+}
+
+/* Generates a function
+ *
+ *         uint64_t unit_get_ancestor_<entry>(Unit *u);
+ *
+ * that resolves the effective value of the CGroupContext field <entry> for unit u: the unit's own
+ * value if its <entry>_set flag is on, otherwise the default_<entry> of the closest slice (following
+ * UNIT_GET_SLICE() upwards) whose default_<entry>_set flag is on, otherwise CGROUP_LIMIT_MIN.
+ * Units/slices without a cgroup context are skipped during the lookup.
+ *
+ * Instantiated below for memory_low, startup_memory_low and memory_min. */
+#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
+        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
+                CGroupContext *c;                                       \
+                                                                        \
+                /* 1. Is entry set in this unit? If so, use that.       \
+                 * 2. Is the default for this entry set in any          \
+                 *    ancestor? If so, use that.                        \
+                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
+                                                                        \
+                assert(u);                                              \
+                                                                        \
+                c = unit_get_cgroup_context(u);                         \
+                if (c && c->entry##_set)                                \
+                        return c->entry;                                \
+                                                                        \
+                while ((u = UNIT_GET_SLICE(u))) {                       \
+                        c = unit_get_cgroup_context(u);                 \
+                        if (c && c->default_##entry##_set)              \
+                                return c->default_##entry;              \
+                }                                                       \
+                                                                        \
+                /* We've reached the root, but nobody had default for   \
+                 * this entry set, so set it to the kernel default. */  \
+                return CGROUP_LIMIT_MIN;                                \
+}
+
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
+
+/* Sets the extended attribute 'name' (value 'data' of 'size' bytes) on the unit's control group.
+ * Does nothing if the unit has no cgroup path; failures are only logged at debug level. */
+static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
+        assert(u);
+        assert(name);
+
+        if (!u->cgroup_path)
+                return;
+
+        int r = cg_set_xattr(u->cgroup_path, name, data, size, 0);
+        if (r >= 0)
+                return;
+
+        log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
+}
+
+/* Removes the extended attribute 'name' from the unit's control group. Does nothing if the unit has
+ * no cgroup path. Failures are only logged at debug level, and not at all if the error merely
+ * says that the attribute was absent (per ERRNO_IS_XATTR_ABSENT()). */
+static void unit_remove_xattr_graceful(Unit *u, const char *name) {
+        assert(u);
+        assert(name);
+
+        if (!u->cgroup_path)
+                return;
+
+        int r = cg_remove_xattr(u->cgroup_path, name);
+        if (r >= 0 || ERRNO_IS_XATTR_ABSENT(r))
+                return;
+
+        log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
+}
+
+/* Synchronizes the "user.oomd_omit" and "user.oomd_avoid" xattrs on the unit's cgroup with the
+ * unit's ManagedOOMPreference setting: the xattr matching the preference (if any) is set to "1",
+ * the other one(s) are removed. Units without a cgroup context are left alone. */
+static void cgroup_oomd_xattr_apply(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return;
+
+        switch (c->moom_preference) {
+
+        case MANAGED_OOM_PREFERENCE_OMIT:
+                unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);
+                unit_remove_xattr_graceful(u, "user.oomd_avoid");
+                break;
+
+        case MANAGED_OOM_PREFERENCE_AVOID:
+                unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);
+                unit_remove_xattr_graceful(u, "user.oomd_omit");
+                break;
+
+        default:
+                /* No preference that is expressed via xattr: make sure both flags are gone. */
+                unit_remove_xattr_graceful(u, "user.oomd_avoid");
+                unit_remove_xattr_graceful(u, "user.oomd_omit");
+                break;
+        }
+}
+
+/* Publishes the unit's log filter patterns via the "user.journald_log_filter_patterns" xattr on its
+ * cgroup, or removes that xattr if both pattern sets are empty.
+ *
+ * The xattr value is laid out as: the allowed patterns (NUL-separated, no trailing NUL), a single
+ * 0xff byte, then the denied patterns (NUL-separated, no trailing NUL).
+ *
+ * Returns 0 on success or if the unit has no exec context, a negative errno-style value if building
+ * the value fails. Failures to actually set/remove the xattr are not propagated. */
+static int cgroup_log_xattr_apply(Unit *u) {
+        _cleanup_free_ char *allowed = NULL, *denied = NULL, *value = NULL;
+        size_t allowed_size, denied_size, value_size;
+        ExecContext *ec;
+        int r;
+
+        assert(u);
+
+        ec = unit_get_exec_context(u);
+        if (!ec)
+                /* Some unit types have a cgroup context but no exec context, so we do not log
+                 * any error here to avoid confusion. */
+                return 0;
+
+        if (set_isempty(ec->log_filter_allowed_patterns) && set_isempty(ec->log_filter_denied_patterns)) {
+                unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
+                return 0;
+        }
+
+        r = set_make_nulstr(ec->log_filter_allowed_patterns, &allowed, &allowed_size);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to make nulstr from set: %m");
+
+        r = set_make_nulstr(ec->log_filter_denied_patterns, &denied, &denied_size);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to make nulstr from set: %m");
+
+        /* Use nul character separated strings without trailing nul */
+        allowed_size = LESS_BY(allowed_size, 1u);
+        denied_size = LESS_BY(denied_size, 1u);
+
+        /* Both parts plus the one-byte separator between them. */
+        value_size = allowed_size + 1 + denied_size;
+        value = new(char, value_size);
+        if (!value)
+                return log_oom_debug();
+
+        memcpy_safe(value, allowed, allowed_size);
+        value[allowed_size] = '\xff';
+        memcpy_safe(value + allowed_size + 1, denied, denied_size);
+
+        unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", value, value_size);
+
+        return 0;
+}
+
+/* Mirrors the unit's invocation ID into the "trusted.invocation_id" and "user.invocation_id" xattrs
+ * of its cgroup (as the 32 character string form), or removes both xattrs if the ID is null. */
+static void cgroup_invocation_id_xattr_apply(Unit *u) {
+        bool no_id;
+
+        assert(u);
+
+        no_id = sd_id128_is_null(u->invocation_id);
+
+        FOREACH_STRING(xattr_name, "trusted.invocation_id", "user.invocation_id") {
+                if (no_id)
+                        unit_remove_xattr_graceful(u, xattr_name);
+                else
+                        unit_set_xattr_graceful(u, xattr_name, SD_ID128_TO_STRING(u->invocation_id), 32);
+        }
+}
+
+/* Sets the "user.coredump_receive" xattr on the unit's cgroup if the unit both has delegation
+ * enabled and CoredumpReceive= turned on; removes the xattr otherwise. Units without a cgroup
+ * context are left alone. */
+static void cgroup_coredump_xattr_apply(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return;
+
+        if (!unit_cgroup_delegate(u) || !c->coredump_receive) {
+                unit_remove_xattr_graceful(u, "user.coredump_receive");
+                return;
+        }
+
+        unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
+}
+
+/* Marks the unit's cgroup as delegated (or not) via the "trusted.delegate" and "user.delegate"
+ * xattrs: both are set to "1" if delegation is on for the unit, and removed otherwise. */
+static void cgroup_delegate_xattr_apply(Unit *u) {
+        bool delegated;
+
+        assert(u);
+
+        /* This is best-effort: old kernels had no xattr support on cgroups at all, then 'trusted.*'
+         * xattrs became settable, and only later 'user.*' ones. The 'trusted.*' variant has been set
+         * ever since it became available and is pretty much API by now, hence it stays. The 'user.*'
+         * variant is set in addition because any user may read it, not just those with
+         * CAP_SYS_ADMIN. That is slightly weaker security-wise (users with a delegated cgroup could
+         * turn it off if they like), but it shouldn't be a big problem: the xattr only communicates
+         * the delegation state to clients, the manager itself never reads it. */
+        delegated = unit_cgroup_delegate(u);
+
+        FOREACH_STRING(xattr_name, "trusted.delegate", "user.delegate") {
+                if (!delegated)
+                        unit_remove_xattr_graceful(u, xattr_name);
+                else
+                        unit_set_xattr_graceful(u, xattr_name, "1", 1);
+        }
+}
+
+/* Synchronizes the "survive_final_kill_signal" marker xattr on the unit's cgroup with
+ * u->survive_final_kill_signal.
+ *
+ * If the flag is on, "user.survive_final_kill_signal" is set to "1"; if that fails with a "not
+ * supported" error, "trusted.survive_final_kill_signal" is tried instead. If the flag is off, both
+ * variants are removed. All failures are only logged at debug level. Nothing is done for units
+ * that have no cgroup path. */
+static void cgroup_survive_xattr_apply(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Without a cgroup path there is no cgroup of this unit to tag. Bail out early, the same
+         * way unit_set_xattr_graceful() and unit_remove_xattr_graceful() do, instead of handing a
+         * NULL path to cg_set_xattr(). */
+        if (!u->cgroup_path)
+                return;
+
+        if (u->survive_final_kill_signal) {
+                r = cg_set_xattr(
+                                u->cgroup_path,
+                                "user.survive_final_kill_signal",
+                                "1",
+                                1,
+                                /* flags= */ 0);
+                /* user xattr support was added in kernel v5.7 */
+                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        r = cg_set_xattr(
+                                        u->cgroup_path,
+                                        "trusted.survive_final_kill_signal",
+                                        "1",
+                                        1,
+                                        /* flags= */ 0);
+                if (r < 0)
+                        log_unit_debug_errno(u,
+                                             r,
+                                             "Failed to set 'survive_final_kill_signal' xattr on control "
+                                             "group %s, ignoring: %m",
+                                             empty_to_root(u->cgroup_path));
+        } else {
+                unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
+                unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
+        }
+}
+
+/* Applies all xattr markers to the unit's cgroup. The oomd, log filter and coredump markers are
+ * applied for every manager; the invocation ID, delegation and survive-final-kill-signal markers
+ * only when running as system manager. */
+static void cgroup_xattr_apply(Unit *u) {
+        assert(u);
+
+        /* The 'user.*' xattrs can be set from a user manager. */
+        cgroup_oomd_xattr_apply(u);
+        (void) cgroup_log_xattr_apply(u);
+        cgroup_coredump_xattr_apply(u);
+
+        if (MANAGER_IS_SYSTEM(u->manager)) {
+                cgroup_invocation_id_xattr_apply(u);
+                cgroup_delegate_xattr_apply(u);
+                cgroup_survive_xattr_apply(u);
+        }
+}
+
+/* Determines the block device to use for path p and stores its device number in *ret.
+ *
+ * - If p denotes a block device node, that device is used.
+ * - If p is any other file system object, the device of the file system it is stored on is used;
+ *   if that has major number 0, btrfs_get_block_device() is consulted instead.
+ * - Character device nodes are refused with -ENOTBLK.
+ *
+ * Afterwards the result is repeatedly replaced by its originating device (LUKS/DM), and finally by
+ * its whole disk if it is a partition (best effort).
+ *
+ * Returns 0 on success, a negative errno-style value on failure (logged at warning level). */
+static int lookup_block_device(const char *p, dev_t *ret) {
+        dev_t node_devnum, fs_devnum = 0;
+        mode_t mode;
+        int r;
+
+        assert(p);
+        assert(ret);
+
+        r = device_path_parse_major_minor(p, &mode, &node_devnum);
+        if (r < 0 && r != -ENODEV)
+                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
+        if (r == -ENODEV) {
+                /* not a parsable device node, need to go to disk */
+                struct stat st;
+
+                if (stat(p, &st) < 0)
+                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
+
+                mode = st.st_mode;
+                node_devnum = st.st_rdev;
+                fs_devnum = st.st_dev;
+        }
+
+        if (S_ISCHR(mode))
+                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
+                                         "Device node '%s' is a character device, but block device needed.", p);
+
+        if (S_ISBLK(mode))
+                *ret = node_devnum;
+        else if (major(fs_devnum) != 0)
+                /* If this is not a device node then use the block device this file is stored on */
+                *ret = fs_devnum;
+        else {
+                /* If this is btrfs, getting the backing block device is a bit harder */
+                r = btrfs_get_block_device(p, ret);
+                if (r == -ENOTTY)
+                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
+                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
+        }
+
+        /* If this is a LUKS/DM device, keep stepping down to the originating block device until
+         * there is none anymore (or the lookup fails). */
+        for (;;)
+                if (block_get_originating(*ret, ret) <= 0)
+                        break;
+
+        /* If this is a partition, try to get the originating block device */
+        (void) block_get_whole_disk(*ret, ret);
+        return 0;
+}
+
+/* Returns true if the regular or the startup CPU weight of 'c' is set, i.e. differs from
+ * CGROUP_WEIGHT_INVALID. */
+static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
+        if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
+                return true;
+
+        return c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
+}
+
+/* Returns true if the regular or the startup CPU shares value of 'c' is set, i.e. differs from
+ * CGROUP_CPU_SHARES_INVALID. */
+static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
+        if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
+                return true;
+
+        return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
+}
+
+/* Returns true if the regular or the startup CPU set of 'c' has its 'set' member populated. */
+static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
+        return !(!c->cpuset_cpus.set && !c->startup_cpuset_cpus.set);
+}
+
+/* Returns true if the regular or the startup memory node set of 'c' has its 'set' member
+ * populated. */
+static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
+        return !(!c->cpuset_mems.set && !c->startup_cpuset_mems.set);
+}
+
+/* Picks the CPU weight in effect for 'c' in the given manager state.
+ *
+ * While the manager is starting, initializing or stopping, a configured startup weight wins.
+ * Otherwise the regular weight is used if configured, falling back to CGROUP_WEIGHT_DEFAULT. */
+uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
+        bool startup_phase;
+
+        assert(c);
+
+        startup_phase = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        if (startup_phase && c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
+                return c->startup_cpu_weight;
+
+        if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
+                return c->cpu_weight;
+
+        return CGROUP_WEIGHT_DEFAULT;
+}
+
+/* Picks the legacy CPU shares value in effect for 'c' in the given manager state.
+ *
+ * While the manager is starting, initializing or stopping, a configured startup value wins.
+ * Otherwise the regular value is used if configured, falling back to CGROUP_CPU_SHARES_DEFAULT. */
+static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
+        switch (state) {
+
+        case MANAGER_STARTING:
+        case MANAGER_INITIALIZING:
+        case MANAGER_STOPPING:
+                if (c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
+                        return c->startup_cpu_shares;
+                break;
+
+        default:
+                break;
+        }
+
+        if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
+                return c->cpu_shares;
+
+        return CGROUP_CPU_SHARES_DEFAULT;
+}
+
+/* Returns the CPU set of 'c' to apply in the given manager state: the startup set if it is
+ * populated and the manager is starting, initializing or stopping, the regular set otherwise. */
+static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
+        bool startup_phase = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        return startup_phase && c->startup_cpuset_cpus.set ? &c->startup_cpuset_cpus : &c->cpuset_cpus;
+}
+
+/* Returns the memory node set of 'c' to apply in the given manager state: the startup set if it is
+ * populated and the manager is starting, initializing or stopping, the regular set otherwise. */
+static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
+        bool startup_phase = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        return startup_phase && c->startup_cpuset_mems.set ? &c->startup_cpuset_mems : &c->cpuset_mems;
+}
+
+/* Adjusts a CPU quota period so that it can be used together with 'quota'.
+ *
+ * The kernel uses a minimum resolution of 1ms, so both the period and (quota * period) need to be
+ * higher than that boundary; quota is specified in USecPerSec. The period is hence raised to at
+ * least 'resolution' and 'resolution * USEC_PER_SEC / quota', and afterwards capped at
+ * 'max_period'. 'quota' must be non-zero. */
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
+        usec_t quota_floor, raised;
+
+        assert(quota > 0);
+
+        quota_floor = resolution * USEC_PER_SEC / quota;
+        raised = MAX3(period, resolution, quota_floor);
+
+        return MIN(raised, max_period);
+}
+
+/* Computes the CPU quota period to use for unit 'u' and logs if the requested value was changed.
+ *
+ * An infinite quota always yields the default period. An infinite period requests the default
+ * period. The result is clamped via cgroup_cpu_adjust_period() to the interval [1ms, 1s]. The
+ * first clamping per unit is logged at warning level, subsequent ones at debug level. */
+static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
+        usec_t clamped;
+
+        /* Always use default period for infinity quota. */
+        if (quota == USEC_INFINITY)
+                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        /* Default period was requested. */
+        if (period == USEC_INFINITY)
+                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        clamped = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
+        if (clamped == period)
+                return clamped;
+
+        log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
+                      "Clamping CPU interval for cpu.max: period is now %s",
+                      FORMAT_TIMESPAN(clamped, 1));
+        u->warned_clamping_cpu_quota_period = true;
+
+        return clamped;
+}
+
+/* Writes 'weight' to the "cpu.weight" attribute of the unit's cgroup. Nothing is written if the
+ * weight is CGROUP_WEIGHT_IDLE. */
+static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
+        char line[DECIMAL_STR_MAX(uint64_t) + 2];
+
+        if (weight != CGROUP_WEIGHT_IDLE) {
+                xsprintf(line, "%" PRIu64 "\n", weight);
+                (void) set_attribute_and_warn(u, "cpu", "cpu.weight", line);
+        }
+}
+
+/* Sets the "cpu.idle" attribute of the unit's cgroup to "1" if 'weight' is CGROUP_WEIGHT_IDLE and
+ * to "0" otherwise. A failure is logged, except for -ENOENT while turning idle off. */
+static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) {
+        const bool want_idle = weight == CGROUP_WEIGHT_IDLE;
+        const char *value = one_zero(want_idle);
+        int r;
+
+        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", value);
+        if (r >= 0)
+                return;
+
+        /* A missing attribute only matters if we actually wanted to enable idle mode */
+        if (r == -ENOENT && !want_idle)
+                return;
+
+        log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
+                            "cpu.idle", empty_to_root(u->cgroup_path), value);
+}
+
+/* Writes the "cpu.max" attribute of the unit's cgroup.
+ *
+ * The period is first adjusted by cgroup_cpu_adjust_period_and_log(). For an infinite quota "max"
+ * is written as the first field, otherwise quota * period / USEC_PER_SEC, raised to at least
+ * USEC_PER_MSEC. */
+static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
+        char line[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
+
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
+        if (quota == USEC_INFINITY)
+                xsprintf(line, "max " USEC_FMT "\n", period);
+        else
+                xsprintf(line, USEC_FMT " " USEC_FMT "\n",
+                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
+
+        (void) set_attribute_and_warn(u, "cpu", "cpu.max", line);
+}
+
+/* Writes 'shares' to the legacy "cpu.shares" attribute of the unit's cgroup; the result of the
+ * write helper is ignored. */
+static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
+        char line[DECIMAL_STR_MAX(uint64_t) + 2];
+
+        xsprintf(line, "%" PRIu64 "\n", shares);
+
+        (void) set_attribute_and_warn(u, "cpu", "cpu.shares", line);
+}
+
+/* Writes the legacy CPU quota attributes of the unit's cgroup.
+ *
+ * "cpu.cfs_period_us" receives the period as adjusted by cgroup_cpu_adjust_period_and_log().
+ * "cpu.cfs_quota_us" receives "-1" for an infinite quota, otherwise
+ * quota * period / USEC_PER_SEC, raised to at least USEC_PER_MSEC. */
+static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
+        char line[DECIMAL_STR_MAX(usec_t) + 2];
+
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
+        xsprintf(line, USEC_FMT "\n", period);
+        (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", line);
+
+        if (quota == USEC_INFINITY) {
+                (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
+                return;
+        }
+
+        xsprintf(line, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
+        (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", line);
+}
+
+/* Converts a legacy CPU shares value to a CPU weight: scales by the ratio of the two defaults and
+ * clamps the result to [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX]. */
+static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
+        uint64_t scaled;
+
+        scaled = shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT;
+
+        return CLAMP(scaled, CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/* Converts a CPU weight to a legacy CPU shares value: scales by the ratio of the two defaults and
+ * clamps the result to [CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX]. Idle is not supported in
+ * cgroupv1, hence CGROUP_WEIGHT_IDLE maps to CGROUP_CPU_SHARES_MIN. */
+static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
+        uint64_t scaled;
+
+        if (weight == CGROUP_WEIGHT_IDLE)
+                return CGROUP_CPU_SHARES_MIN;
+
+        scaled = weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT;
+
+        return CLAMP(scaled, CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
+}
+
+/* Formats 'cpus' as a range string and writes it to the cpuset attribute 'name' of the unit's
+ * cgroup. If the string cannot be allocated, an OOM message is logged and nothing is written. */
+static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
+        _cleanup_free_ char *ranges = cpu_set_to_range_string(cpus);
+
+        if (!ranges) {
+                log_oom();
+                return;
+        }
+
+        (void) set_attribute_and_warn(u, "cpuset", name, ranges);
+}
+
+/* Returns true if 'c' carries any "io" controller configuration: accounting, a regular or startup
+ * weight, or any per-device weights, latencies or limits. */
+static bool cgroup_context_has_io_config(CGroupContext *c) {
+        if (c->io_accounting)
+                return true;
+
+        if (c->io_weight != CGROUP_WEIGHT_INVALID ||
+            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
+                return true;
+
+        return c->io_device_weights ||
+                c->io_device_latencies ||
+                c->io_device_limits;
+}
+
+/* Returns true if 'c' carries any legacy "blkio" controller configuration: accounting, a regular
+ * or startup weight, or any per-device weights or bandwidths. */
+static bool cgroup_context_has_blockio_config(CGroupContext *c) {
+        if (c->blockio_accounting)
+                return true;
+
+        if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
+                return true;
+
+        return c->blockio_device_weights ||
+                c->blockio_device_bandwidths;
+}
+
+/* Picks the IO weight in effect for 'c' in the given manager state.
+ *
+ * While the manager is starting, initializing or stopping, a configured startup weight wins.
+ * Otherwise the regular weight is used if configured, falling back to CGROUP_WEIGHT_DEFAULT. */
+static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
+        bool startup_phase = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        if (startup_phase && c->startup_io_weight != CGROUP_WEIGHT_INVALID)
+                return c->startup_io_weight;
+
+        return c->io_weight != CGROUP_WEIGHT_INVALID ? c->io_weight : CGROUP_WEIGHT_DEFAULT;
+}
+
+/* Picks the legacy block IO weight in effect for 'c' in the given manager state.
+ *
+ * While the manager is starting, initializing or stopping, a configured startup weight wins.
+ * Otherwise the regular weight is used if configured, falling back to
+ * CGROUP_BLKIO_WEIGHT_DEFAULT. */
+static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
+        bool startup_phase = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        if (startup_phase && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
+                return c->startup_blockio_weight;
+
+        return c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ? c->blockio_weight : CGROUP_BLKIO_WEIGHT_DEFAULT;
+}
+
+/* Converts a legacy block IO weight to an IO weight: scales by the ratio of the two defaults and
+ * clamps the result to [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX]. */
+static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
+        uint64_t scaled;
+
+        scaled = blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT;
+
+        return CLAMP(scaled, CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/* Converts an IO weight to a legacy block IO weight: scales by the ratio of the two defaults and
+ * clamps the result to [CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX]. */
+static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
+        uint64_t scaled;
+
+        scaled = io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT;
+
+        return CLAMP(scaled, CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
+}
+
+/* Writes an IO weight to the "<controller>.bfq.weight" attribute of the unit's cgroup.
+ *
+ * If 'dev' has a non-zero major number the weight is written for that device, otherwise without a
+ * device prefix. The value written is BFQ_WEIGHT(io_weight).
+ *
+ * Returns the result of the attribute write, except that -EINVAL on a per-device write is turned
+ * into -EOPNOTSUPP (after warning once per process). */
+static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
+        static const char * const prop_names[] = {
+                "IOWeight",
+                "BlockIOWeight",
+                "IODeviceWeight",
+                "BlockIODeviceWeight",
+        };
+        static bool warned = false;
+        char line[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
+        const bool per_device = major(dev) > 0;
+        const char *attr;
+        uint64_t scaled;
+        int r;
+
+        /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
+         * See also: https://github.com/systemd/systemd/pull/13335 and
+         * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
+        attr = strjoina(controller, ".bfq.weight");
+
+        /* Adjust to the kernel range, which is 1..1000 with a default of 100. */
+        scaled = BFQ_WEIGHT(io_weight);
+
+        if (per_device)
+                xsprintf(line, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), scaled);
+        else
+                xsprintf(line, "%" PRIu64 "\n", scaled);
+
+        r = cg_set_attribute(controller, u->cgroup_path, attr, line);
+
+        /* FIXME: drop this when kernels prior
+         * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
+         * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
+         * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
+        if (r == -EINVAL && per_device) {
+                if (!warned) {
+                        log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", attr);
+                        warned = true;
+                }
+
+                return -EOPNOTSUPP; /* mask as unconfigured device */
+        }
+
+        if (r >= 0 && io_weight != scaled)
+                log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
+                               prop_names[(per_device ? 2 : 0) + (streq(controller, "blkio") ? 1 : 0)],
+                               io_weight, attr, scaled);
+
+        return r;
+}
+
+/* Applies a per-device IO weight for the block device backing 'dev_path'.
+ *
+ * The weight is written both through set_bfq_weight() and to the "io.weight" attribute. The
+ * "io.weight" result decides whether a failure is logged, unless it is -EOPNOTSUPP, in which case
+ * the BFQ result is looked at instead. Nothing is done if the device lookup fails. */
+static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
+        char line[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+        int bfq_result, weight_result, r;
+        dev_t devnum;
+
+        if (lookup_block_device(dev_path, &devnum) < 0)
+                return;
+
+        bfq_result = set_bfq_weight(u, "io", devnum, io_weight);
+
+        xsprintf(line, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(devnum), io_weight);
+        weight_result = cg_set_attribute("io", u->cgroup_path, "io.weight", line);
+
+        /* Look at the configured device, when both fail, prefer io.weight errno. */
+        r = weight_result;
+        if (weight_result == -EOPNOTSUPP)
+                r = bfq_result;
+
+        if (r >= 0)
+                return;
+
+        log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
+                            r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
+                            empty_to_root(u->cgroup_path), (int) strcspn(line, NEWLINE), line);
+}
+
+/* Writes a legacy per-device weight for the block device backing 'dev_path' to the
+ * "blkio.weight_device" attribute of the unit's cgroup. Nothing is done if the device lookup
+ * fails. */
+static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
+        char line[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t devnum;
+
+        if (lookup_block_device(dev_path, &devnum) < 0)
+                return;
+
+        xsprintf(line, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(devnum), blkio_weight);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", line);
+}
+
+/* Writes a latency target for the block device backing 'dev_path' to the "io.latency" attribute of
+ * the unit's cgroup. A target of USEC_INFINITY is written as "max". Nothing is done if the device
+ * lookup fails. */
+static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
+        char line[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t devnum;
+
+        if (lookup_block_device(dev_path, &devnum) < 0)
+                return;
+
+        if (target == USEC_INFINITY)
+                xsprintf(line, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(devnum));
+        else
+                xsprintf(line, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(devnum), target);
+
+        (void) set_attribute_and_warn(u, "io", "io.latency", line);
+}
+
+/* Writes the four IO limits in 'limits' (indexed by CGroupIOLimitType) for the block device
+ * backing 'dev_path' to the "io.max" attribute of the unit's cgroup.
+ *
+ * A limit equal to its default is written symbolically: "max" if the value is CGROUP_LIMIT_MAX,
+ * "0" otherwise. Any other limit is written as a decimal number. Nothing is done if the device
+ * lookup fails. */
+static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
+        char fields[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
+             line[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
+        dev_t devnum;
+
+        if (lookup_block_device(dev_path, &devnum) < 0)
+                return;
+
+        for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
+                if (limits[type] == cgroup_io_limit_defaults[type])
+                        xsprintf(fields[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
+                else
+                        xsprintf(fields[type], "%" PRIu64, limits[type]);
+        }
+
+        xsprintf(line, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(devnum),
+                 fields[CGROUP_IO_RBPS_MAX], fields[CGROUP_IO_WBPS_MAX],
+                 fields[CGROUP_IO_RIOPS_MAX], fields[CGROUP_IO_WIOPS_MAX]);
+
+        (void) set_attribute_and_warn(u, "io", "io.max", line);
+}
+
+/* Writes legacy bandwidth limits for the block device backing 'dev_path'.
+ *
+ * 'rbps' goes to "blkio.throttle.read_bps_device" and 'wbps' to
+ * "blkio.throttle.write_bps_device", each prefixed with the device number. Nothing is done if the
+ * device lookup fails. */
+static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
+        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t dev;
+
+        if (lookup_block_device(dev_path, &dev) < 0)
+                return;
+
+        /* Use the size-checked xsprintf() like the other attribute writers in this file, rather than
+         * an unbounded sprintf() into the fixed-size stack buffer. */
+        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), rbps);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
+
+        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), wbps);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
+}
+
+/* Returns true if any memory setting is in effect for 'u': a non-zero ancestor memory min, low or
+ * startup low value, or a high, max, swap max or zswap max limit that differs from
+ * CGROUP_LIMIT_MAX or has its startup variant flagged as set. */
+static bool unit_has_unified_memory_config(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        assert_se(c = unit_get_cgroup_context(u));
+
+        if (unit_get_ancestor_memory_min(u) > 0)
+                return true;
+
+        if (unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0)
+                return true;
+
+        if (c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set)
+                return true;
+
+        if (c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set)
+                return true;
+
+        if (c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set)
+                return true;
+
+        return c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
+}
+
+/* Writes the memory limit 'v' to the attribute 'file' of the "memory" controller of the unit's
+ * cgroup. CGROUP_LIMIT_MAX is written as "max", anything else as a decimal number. */
+static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
+        char number[DECIMAL_STR_MAX(uint64_t) + 1];
+        const char *value = "max\n";
+
+        if (v != CGROUP_LIMIT_MAX) {
+                xsprintf(number, "%" PRIu64 "\n", v);
+                value = number;
+        }
+
+        (void) set_attribute_and_warn(u, "memory", file, value);
+}
+
+/* Best-effort: applies IP firewalling and/or accounting for 'u' if that's enabled. If compiling
+ * the firewall fails nothing else is attempted; failures of the later steps are ignored. */
+static void cgroup_apply_firewall(Unit *u) {
+        assert(u);
+
+        if (bpf_firewall_compile(u) >= 0) {
+                (void) bpf_firewall_load_custom(u);
+                (void) bpf_firewall_install(u);
+        }
+}
+
+/* Adds (add == true) or deletes (add == false) the cgroup ID of 'u' in every NFT set of its
+ * cgroup context whose source is NFT_SET_SOURCE_CGROUP.
+ *
+ * This is a no-op unless we are the system manager, the unit has a cgroup context, the cgroup
+ * hierarchy is fully unified and the unit's cgroup ID is known. The manager's firewall context is
+ * allocated on first use. Failures on individual sets are logged and otherwise ignored. */
+void unit_modify_nft_set(Unit *u, bool add) {
+        CGroupContext *c;
+        int r;
+
+        assert(u);
+
+        if (!MANAGER_IS_SYSTEM(u->manager) ||
+            !UNIT_HAS_CGROUP_CONTEXT(u) ||
+            cg_all_unified() <= 0 ||
+            u->cgroup_id == 0)
+                return;
+
+        if (!u->manager->fw_ctx) {
+                r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
+                if (r < 0)
+                        return;
+
+                assert(u->manager->fw_ctx);
+        }
+
+        c = ASSERT_PTR(unit_get_cgroup_context(u));
+
+        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
+                uint64_t element;
+
+                if (nft_set->source != NFT_SET_SOURCE_CGROUP)
+                        continue;
+
+                element = u->cgroup_id;
+
+                r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
+                if (r >= 0) {
+                        log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
+                                  add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
+                        continue;
+                }
+
+                log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
+                                  add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
+        }
+}
+
+/* Calls bpf_socket_bind_install() for 'u'; its result is deliberately ignored. */
+static void cgroup_apply_socket_bind(Unit *u) {
+        assert(u);
+
+        (void) bpf_socket_bind_install(u);
+}
+
+/* Calls restrict_network_interfaces_install() for 'u'; its result is deliberately ignored. */
+static void cgroup_apply_restrict_network_interfaces(Unit *u) {
+        assert(u);
+
+        (void) restrict_network_interfaces_install(u);
+}
+
+/* Applies the device access policy and allow list of the unit's cgroup context.
+ *
+ * In fully unified mode a device control BPF program is initialized first; otherwise the legacy
+ * "devices" controller is reset through devices.deny/devices.allow. The static allow list and the
+ * configured entries are then added, and the policy is applied via bpf_devices_apply_policy().
+ *
+ * Returns the result of bpf_devices_apply_policy(), or the (negative) result of the logging call
+ * if initializing the BPF program fails. */
+static int cgroup_apply_devices(Unit *u) {
+        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+        const char *path;
+        CGroupContext *c;
+        CGroupDevicePolicy policy;
+        bool allow_list_static, any;
+        int r;
+
+        assert_se(c = unit_get_cgroup_context(u));
+        assert_se(path = u->cgroup_path);
+
+        policy = c->device_policy;
+
+        if (cg_all_unified() > 0) {
+                r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
+                if (r < 0)
+                        return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
+
+        } else {
+                const char *reset_attr;
+
+                /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
+                 * EINVAL here. */
+
+                reset_attr = (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO) ? "devices.deny" : "devices.allow";
+
+                r = cg_set_attribute("devices", path, reset_attr, "a");
+                if (r < 0)
+                        log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
+                                            "Failed to reset devices.allow/devices.deny: %m");
+        }
+
+        allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
+                (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
+        if (allow_list_static)
+                (void) bpf_devices_allow_list_static(prog, path);
+
+        /* Tracks whether at least one allow-list rule was added */
+        any = allow_list_static;
+
+        LIST_FOREACH(device_allow, a, c->device_allow) {
+                const char *suffix;
+
+                if (a->permissions == 0)
+                        continue;
+
+                if (path_startswith(a->path, "/dev/"))
+                        r = bpf_devices_allow_list_device(prog, path, a->path, a->permissions);
+                else if ((suffix = startswith(a->path, "block-")))
+                        r = bpf_devices_allow_list_major(prog, path, suffix, 'b', a->permissions);
+                else if ((suffix = startswith(a->path, "char-")))
+                        r = bpf_devices_allow_list_major(prog, path, suffix, 'c', a->permissions);
+                else {
+                        log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
+                        continue;
+                }
+
+                if (r >= 0)
+                        any = true;
+        }
+
+        if (prog && !any) {
+                log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");
+
+                /* The kernel verifier would reject a program we would build with the normal intro and outro
+                   but no allow-listing rules (outro would contain an unreachable instruction for successful
+                   return). */
+                policy = CGROUP_DEVICE_POLICY_STRICT;
+        }
+
+        r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed);
+        if (r < 0) {
+                static bool warned = false;
+
+                log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
+                               "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
+                               "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
+                               "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
+
+                warned = true;
+        }
+
+        return r;
+}
+
+/* Applies the default (not per-device) IO weight of the unit's cgroup: once through
+ * set_bfq_weight() with device 0:0, and once as "default <weight>" to the "io.weight" attribute.
+ * The results of both writes are ignored. */
+static void set_io_weight(Unit *u, uint64_t weight) {
+        char line[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
+
+        assert(u);
+
+        (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
+
+        xsprintf(line, "default %" PRIu64 "\n", weight);
+
+        (void) set_attribute_and_warn(u, "io", "io.weight", line);
+}
+
+/* Applies the default (not per-device) legacy block IO weight of the unit's cgroup: once through
+ * set_bfq_weight() with device 0:0, and once to the "blkio.weight" attribute. The results of both
+ * writes are ignored. */
+static void set_blkio_weight(Unit *u, uint64_t weight) {
+        char line[STRLEN("\n")+DECIMAL_STR_MAX(uint64_t)];
+
+        assert(u);
+
+        (void) set_bfq_weight(u, "blkio", makedev(0, 0), weight);
+
+        xsprintf(line, "%" PRIu64 "\n", weight);
+
+        (void) set_attribute_and_warn(u, "blkio", "blkio.weight", line);
+}
+
+/* Calls bpf_foreign_install() for 'u'; its result is deliberately ignored. */
+static void cgroup_apply_bpf_foreign_program(Unit *u) {
+        assert(u);
+
+        (void) bpf_foreign_install(u);
+}
+
+static void cgroup_context_apply(
+                Unit *u,
+                CGroupMask apply_mask,
+                ManagerState state) {
+
+        const char *path;
+        CGroupContext *c;
+        bool is_host_root, is_local_root;
+        int r;
+
+        assert(u);
+
+        /* Nothing to do? Exit early! */
+        if (apply_mask == 0)
+                return;
+
+        /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
+         * attributes should only be managed for cgroups further down the tree. */
+        is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
+        is_host_root = unit_has_host_root_cgroup(u);
+
+        assert_se(c = unit_get_cgroup_context(u));
+        assert_se(path = u->cgroup_path);
+
+        if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
+                path = "/";
+
+        /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
+         * then), and missing cgroups, i.e. EROFS and ENOENT. */
+
+        /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
+         * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
+         * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
+         * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
+         * we couldn't even write to them if we wanted to). */
+        if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
+
+                if (cg_all_unified() > 0) {
+                        uint64_t weight;
+
+                        if (cgroup_context_has_cpu_weight(c))
+                                weight = cgroup_context_cpu_weight(c, state);
+                        else if (cgroup_context_has_cpu_shares(c)) {
+                                uint64_t shares;
+
+                                shares = cgroup_context_cpu_shares(c, state);
+                                weight = cgroup_cpu_shares_to_weight(shares);
+
+                                log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
+                                                  shares, weight, path);
+                        } else
+                                weight = CGROUP_WEIGHT_DEFAULT;
+
+                        cgroup_apply_unified_cpu_idle(u, weight);
+                        cgroup_apply_unified_cpu_weight(u, weight);
+                        cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
+
+                } else {
+                        uint64_t shares;
+
+                        if (cgroup_context_has_cpu_weight(c)) {
+                                uint64_t weight;
+
+                                weight = cgroup_context_cpu_weight(c, state);
+                                shares = cgroup_cpu_weight_to_shares(weight);
+
+                                log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
+                                                  weight, shares, path);
+                        } else if (cgroup_context_has_cpu_shares(c))
+                                shares = cgroup_context_cpu_shares(c, state);
+                        else
+                                shares = CGROUP_CPU_SHARES_DEFAULT;
+
+                        cgroup_apply_legacy_cpu_shares(u, shares);
+                        cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
+                }
+        }
+
+        if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
+                cgroup_apply_unified_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
+                cgroup_apply_unified_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
+        }
+
+        /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
+         * controller), and in case of containers we want to leave control of these attributes to the container manager
+         * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
+        if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
+                bool has_io, has_blockio;
+                uint64_t weight;
+
+                has_io = cgroup_context_has_io_config(c);
+                has_blockio = cgroup_context_has_blockio_config(c);
+
+                if (has_io)
+                        weight = cgroup_context_io_weight(c, state);
+                else if (has_blockio) {
+                        uint64_t blkio_weight;
+
+                        blkio_weight = cgroup_context_blkio_weight(c, state);
+                        weight = cgroup_weight_blkio_to_io(blkio_weight);
+
+                        log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
+                                          blkio_weight, weight);
+                } else
+                        weight = CGROUP_WEIGHT_DEFAULT;
+
+                set_io_weight(u, weight);
+
+                if (has_io) {
+                        LIST_FOREACH(device_weights, w, c->io_device_weights)
+                                cgroup_apply_io_device_weight(u, w->path, w->weight);
+
+                        LIST_FOREACH(device_limits, limit, c->io_device_limits)
+                                cgroup_apply_io_device_limit(u, limit->path, limit->limits);
+
+                        LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
+                                cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
+
+                } else if (has_blockio) {
+                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+                                weight = cgroup_weight_blkio_to_io(w->weight);
+
+                                log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
+                                                  w->weight, weight, w->path);
+
+                                cgroup_apply_io_device_weight(u, w->path, weight);
+                        }
+
+                        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+                                uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
+
+                                for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+                                        limits[type] = cgroup_io_limit_defaults[type];
+
+                                limits[CGROUP_IO_RBPS_MAX] = b->rbps;
+                                limits[CGROUP_IO_WBPS_MAX] = b->wbps;
+
+                                log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
+                                                  b->rbps, b->wbps, b->path);
+
+                                cgroup_apply_io_device_limit(u, b->path, limits);
+                        }
+                }
+        }
+
+        if (apply_mask & CGROUP_MASK_BLKIO) {
+                bool has_io, has_blockio;
+
+                has_io = cgroup_context_has_io_config(c);
+                has_blockio = cgroup_context_has_blockio_config(c);
+
+                /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
+                 * left to our container manager, too. */
+                if (!is_local_root) {
+                        uint64_t weight;
+
+                        if (has_io) {
+                                uint64_t io_weight;
+
+                                io_weight = cgroup_context_io_weight(c, state);
+                                weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
+
+                                log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
+                                                  io_weight, weight);
+                        } else if (has_blockio)
+                                weight = cgroup_context_blkio_weight(c, state);
+                        else
+                                weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
+
+                        set_blkio_weight(u, weight);
+
+                        if (has_io)
+                                LIST_FOREACH(device_weights, w, c->io_device_weights) {
+                                        weight = cgroup_weight_io_to_blkio(w->weight);
+
+                                        log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
+                                                          w->weight, weight, w->path);
+
+                                        cgroup_apply_blkio_device_weight(u, w->path, weight);
+                                }
+                        else if (has_blockio)
+                                LIST_FOREACH(device_weights, w, c->blockio_device_weights)
+                                        cgroup_apply_blkio_device_weight(u, w->path, w->weight);
+                }
+
+                /* The bandwidth limits are something that make sense to be applied to the host's root but not container
+                 * roots, as there we want the container manager to handle it */
+                if (is_host_root || !is_local_root) {
+                        if (has_io)
+                                LIST_FOREACH(device_limits, l, c->io_device_limits) {
+                                        log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
+                                                          l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
+
+                                        cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
+                                }
+                        else if (has_blockio)
+                                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+                                        cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
+                }
+        }
+
+        /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
+         * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
+         * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
+         * write to this if we wanted to.) */
+        if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
+
+                if (cg_all_unified() > 0) {
+                        uint64_t max, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;
+
+                        if (unit_has_unified_memory_config(u)) {
+                                bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+                                high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
+                                max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
+                                swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
+                                zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
+                        } else {
+                                max = c->memory_limit;
+
+                                if (max != CGROUP_LIMIT_MAX)
+                                        log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
+                        }
+
+                        cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
+                        cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
+                        cgroup_apply_unified_memory_limit(u, "memory.high", high);
+                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
+                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
+                        cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max);
+
+                        (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
+
+                } else {
+                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+                        uint64_t val;
+
+                        if (unit_has_unified_memory_config(u)) {
+                                val = c->memory_max;
+                                if (val != CGROUP_LIMIT_MAX)
+                                        log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
+                        } else
+                                val = c->memory_limit;
+
+                        if (val == CGROUP_LIMIT_MAX)
+                                strncpy(buf, "-1\n", sizeof(buf));
+                        else
+                                xsprintf(buf, "%" PRIu64 "\n", val);
+
+                        (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
+                }
+        }
+
+        /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
+         * containers, where we leave this to the manager */
+        if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
+            (is_host_root || cg_all_unified() > 0 || !is_local_root))
+                (void) cgroup_apply_devices(u);
+
+        if (apply_mask & CGROUP_MASK_PIDS) {
+
+                if (is_host_root) {
+                        /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
+                         * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
+                         * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
+                         * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
+                         * exclusive ownership of the sysctls, but we still want to honour things if the user sets
+                         * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
+                         * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
+                         * it also counts. But if the user never set a limit through us (i.e. we are the default of
+                         * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
+                         * the first time we set a limit. Note that this boolean is flushed out on manager reload,
+                         * which is desirable so that there's an official way to release control of the sysctl from
+                         * systemd: set the limit to unbounded and reload. */
+
+                        if (cgroup_tasks_max_isset(&c->tasks_max)) {
+                                u->manager->sysctl_pid_max_changed = true;
+                                r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
+                        } else if (u->manager->sysctl_pid_max_changed)
+                                r = procfs_tasks_set_limit(TASKS_MAX);
+                        else
+                                r = 0;
+                        if (r < 0)
+                                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
+                                                    "Failed to write to tasks limit sysctls: %m");
+                }
+
+                /* The attribute itself is not available on the host root cgroup, and in the container case we want to
+                 * leave it for the container manager. */
+                if (!is_local_root) {
+                        if (cgroup_tasks_max_isset(&c->tasks_max)) {
+                                char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+
+                                xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
+                                (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
+                        } else
+                                (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
+                }
+        }
+
+        if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
+                cgroup_apply_firewall(u);
+
+        if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
+                cgroup_apply_bpf_foreign_program(u);
+
+        if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
+                cgroup_apply_socket_bind(u);
+
+        if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
+                cgroup_apply_restrict_network_interfaces(u);
+
+        unit_modify_nft_set(u, /* add = */ true);
+}
+
+static bool unit_get_needs_bpf_firewall(Unit *u) {
+        CGroupContext *ctx;
+        Unit *ancestor;
+
+        /* Tells whether the BPF firewall is needed for this unit: either because of its own IP
+         * accounting, IP access lists or IP filters, or because of an access list on a parent slice. */
+        assert(u);
+
+        ctx = unit_get_cgroup_context(u);
+        if (!ctx)
+                return false;
+        if (ctx->ip_accounting || ctx->ip_filters_ingress || ctx->ip_filters_egress)
+                return true;
+        if (!set_isempty(ctx->ip_address_allow) || !set_isempty(ctx->ip_address_deny))
+                return true;
+
+        /* Access lists configured on any enclosing slice apply to us as well */
+        ancestor = UNIT_GET_SLICE(u);
+        while (ancestor) {
+                ctx = unit_get_cgroup_context(ancestor);
+                if (!ctx)
+                        return false;
+                if (!set_isempty(ctx->ip_address_allow) || !set_isempty(ctx->ip_address_deny))
+                        return true;
+                ancestor = UNIT_GET_SLICE(ancestor);
+        }
+        return false;
+}
+
+static bool unit_get_needs_bpf_foreign_program(Unit *u) {
+        CGroupContext *ctx;
+
+        /* True if the unit has a cgroup context and its bpf_foreign_programs field is
+         * set (non-NULL). */
+        assert(u);
+
+        ctx = unit_get_cgroup_context(u);
+        return ctx && ctx->bpf_foreign_programs;
+}
+
+static bool unit_get_needs_socket_bind(Unit *u) {
+        CGroupContext *ctx;
+
+        /* True if the unit has a cgroup context with either socket_bind_allow or
+         * socket_bind_deny set. */
+        assert(u);
+
+        ctx = unit_get_cgroup_context(u);
+        return ctx && (ctx->socket_bind_allow || ctx->socket_bind_deny);
+}
+
+static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
+        CGroupContext *ctx;
+
+        /* True if the unit has a cgroup context whose restrict_network_interfaces set is
+         * not empty. */
+        assert(u);
+
+        ctx = unit_get_cgroup_context(u);
+        return ctx && !set_isempty(ctx->restrict_network_interfaces);
+}
+
+static CGroupMask unit_get_cgroup_mask(Unit *u) {
+        CGroupContext *ctx;
+        CGroupMask needed = 0;
+
+        /* Derives the set of cgroup controllers that the unit's own cgroup context asks for. The
+         * unit must have a cgroup context; this is enforced with assert_se() below. */
+        assert(u);
+        assert_se(ctx = unit_get_cgroup_context(u));
+
+        if (ctx->cpu_accounting)
+                needed |= get_cpu_accounting_mask();
+
+        /* CPU weight/shares configured, or a CPU quota set */
+        if (cgroup_context_has_cpu_weight(ctx) || cgroup_context_has_cpu_shares(ctx) ||
+            ctx->cpu_quota_per_sec_usec != USEC_INFINITY)
+                needed |= CGROUP_MASK_CPU;
+
+        /* Allowed CPUs or allowed mems configured */
+        if (cgroup_context_has_allowed_cpus(ctx) || cgroup_context_has_allowed_mems(ctx))
+                needed |= CGROUP_MASK_CPUSET;
+
+        /* Either flavour of IO configuration requests both IO controller bits */
+        if (cgroup_context_has_io_config(ctx) || cgroup_context_has_blockio_config(ctx))
+                needed |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
+
+        /* Memory accounting, a memory limit, or any unified memory setting */
+        if (ctx->memory_accounting || ctx->memory_limit != CGROUP_LIMIT_MAX || unit_has_unified_memory_config(u))
+                needed |= CGROUP_MASK_MEMORY;
+
+        /* A device allow list, or a device policy other than "auto" */
+        if (ctx->device_allow || ctx->device_policy != CGROUP_DEVICE_POLICY_AUTO)
+                needed |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
+
+        /* Task accounting, or a tasks limit */
+        if (ctx->tasks_accounting || cgroup_tasks_max_isset(&ctx->tasks_max))
+                needed |= CGROUP_MASK_PIDS;
+
+        return CGROUP_MASK_EXTEND_JOINED(needed);
+}
+
+static CGroupMask unit_get_bpf_mask(Unit *u) {
+        CGroupMask bpf = 0;
+
+        /* Collects the BPF pseudo-controller bits this unit needs. Each bit is decided by the
+         * matching unit_get_needs_*() helper. */
+
+        /* IP accounting/filtering; this helper also looks at the access lists of parent slices */
+        bpf |= unit_get_needs_bpf_firewall(u) ? CGROUP_MASK_BPF_FIREWALL : 0;
+
+        /* Foreign BPF programs configured on the unit */
+        bpf |= unit_get_needs_bpf_foreign_program(u) ? CGROUP_MASK_BPF_FOREIGN : 0;
+
+        /* Socket bind allow/deny rules */
+        bpf |= unit_get_needs_socket_bind(u) ? CGROUP_MASK_BPF_SOCKET_BIND : 0;
+
+        /* Network interface restrictions */
+        bpf |= unit_get_needs_restrict_network_interfaces(u) ? CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES : 0;
+
+        return bpf;
+}
+
+CGroupMask unit_get_own_mask(Unit *u) {
+        CGroupMask own;
+
+        /* Computes the controllers this unit requires on its own behalf: the union of what its
+         * cgroup context, its BPF settings and its delegation settings ask for. Units that are not
+         * fully loaded, or that carry no cgroup context, contribute nothing. */
+
+        if (u->load_state != UNIT_LOADED || !unit_get_cgroup_context(u))
+                return 0;
+
+        own = unit_get_cgroup_mask(u);
+        own |= unit_get_bpf_mask(u);
+        own |= unit_get_delegate_mask(u);
+        return own;
+}
+
+CGroupMask unit_get_delegate_mask(Unit *u) {
+        CGroupContext *ctx;
+
+        /* Returns the controllers to turn on because of delegation, or 0 if delegation is off for
+         * this unit. Unless we run fully on the unified hierarchy, controllers are withheld from
+         * units whose exec context does not maintain privileges; on the unified hierarchy it is
+         * safe to delegate controllers to unprivileged services, hence no such check there. */
+
+        if (!unit_cgroup_delegate(u))
+                return 0;
+
+        if (cg_all_unified() <= 0) {
+                ExecContext *exec_ctx = unit_get_exec_context(u);
+
+                if (exec_ctx && !exec_context_maintains_privileges(exec_ctx))
+                        return 0;
+        }
+
+        ctx = unit_get_cgroup_context(u);
+        assert_se(ctx);
+        return CGROUP_MASK_EXTEND_JOINED(ctx->delegate_controllers);
+}
+
+static CGroupMask unit_get_subtree_mask(Unit *u) {
+        CGroupMask own = unit_get_own_mask(u);
+
+        /* The subtree mask covers the unit itself plus everything below it: the union of the
+         * unit's own mask and the merged mask of its members. */
+        return own | unit_get_members_mask(u);
+}
+
+CGroupMask unit_get_members_mask(Unit *u) {
+        assert(u);
+
+        /* Yields the merged controller mask required by everything contained in this unit. The
+         * result is cached in the unit and only recomputed while the cache is marked invalid. */
+
+        if (!u->cgroup_members_mask_valid) {
+                u->cgroup_members_mask = 0;
+                /* Only slices have members. Each member contributes its whole subtree mask,
+                 * which in turn calls back into this function for the member's own members. */
+                if (u->type == UNIT_SLICE) {
+                        Unit *child;
+
+                        UNIT_FOREACH_DEPENDENCY(child, u, UNIT_ATOM_SLICE_OF)
+                                u->cgroup_members_mask |= unit_get_subtree_mask(child);
+                }
+                u->cgroup_members_mask_valid = true;
+        }
+        return u->cgroup_members_mask;
+}
+
+CGroupMask unit_get_siblings_mask(Unit *u) {
+        Unit *parent;
+
+        /* Yields the controllers required by the unit's siblings, which is the members mask of
+         * the slice the unit is located in. */
+        assert(u);
+
+        parent = UNIT_GET_SLICE(u);
+        /* Without a parent slice we are the top-level slice: use our own subtree mask then */
+        if (!parent)
+                return unit_get_subtree_mask(u);
+
+        return unit_get_members_mask(parent);
+}
+
+static CGroupMask unit_get_disable_mask(Unit *u) {
+        CGroupContext *ctx = unit_get_cgroup_context(u);
+
+        /* The controllers this unit itself marks as disabled; empty if the unit has no cgroup
+         * context at all. */
+        if (ctx)
+                return ctx->disable_controllers;
+        return 0;
+}
+
+CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
+        CGroupMask disabled = 0;
+
+        assert(u);
+
+        /* Collects the controllers that are forcibly disabled for this unit: those disabled on
+         * the unit itself merged with those disabled on every slice above it. The result is a
+         * plain union, so a controller disabled anywhere on the chain of parent slices stays
+         * disabled for the unit. */
+
+        /* Start at the unit itself, then climb until there is no parent slice left */
+        for (Unit *node = u; node; node = UNIT_GET_SLICE(node))
+                disabled |= unit_get_disable_mask(node);
+
+        return disabled;
+}
+
+CGroupMask unit_get_target_mask(Unit *u) {
+        CGroupMask own, wanted;
+
+        /* Computes every controller that should be enabled for this unit's cgroup: what the unit
+         * needs itself, what its members need and what its siblings need, restricted to what the
+         * manager supports and minus anything disabled on the unit or one of its ancestors. This
+         * is primarily useful on the legacy hierarchy, where the cgroup has to be duplicated in
+         * each hierarchy that shall be enabled for it. */
+
+        own = unit_get_own_mask(u);
+
+        /* The unit itself wants the BPF firewall, but it is not among the supported controllers */
+        if (own & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
+                emit_bpf_firewall_warning(u);
+
+        wanted = own | unit_get_members_mask(u) | unit_get_siblings_mask(u);
+        wanted &= u->manager->cgroup_supported & ~unit_get_ancestor_disable_mask(u);
+
+        return wanted;
+}
+
+CGroupMask unit_get_enable_mask(Unit *u) {
+        CGroupMask members, disabled;
+
+        /* Computes the controllers to enable for the children of this unit's cgroup: whatever
+         * the members need, as far as the manager supports it and neither the unit nor one of
+         * its ancestors disables it. This is primarily useful on the unified hierarchy, where
+         * each cgroup controls which controllers are enabled for its children, so the mask
+         * is derived from the members rather than from the unit itself. */
+
+        members = unit_get_members_mask(u);
+        disabled = unit_get_ancestor_disable_mask(u);
+
+        return members & u->manager->cgroup_supported & ~disabled;
+}
+
+void unit_invalidate_cgroup_members_masks(Unit *u) {
+        Unit *node;
+
+        assert(u);
+
+        /* Marks the cached members mask as invalid on this unit and on every slice above it,
+         * all the way up the tree. Implemented as a plain walk along the chain of parent
+         * slices. */
+
+        for (node = u; node; node = UNIT_GET_SLICE(node))
+                node->cgroup_members_mask_valid = false;
+}
+
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
+
+        /* Finds the closest realized cgroup that has all controllers in 'mask' realized: the
+         * unit's own cgroup if it qualifies, otherwise that of the nearest parent slice that
+         * does. Returns NULL if neither the unit nor any of its parent slices qualifies (which
+         * includes the case of u being NULL). */
+
+        for (Unit *node = u; node; node = UNIT_GET_SLICE(node)) {
+                if (!node->cgroup_path || !node->cgroup_realized)
+                        continue;
+
+                if (FLAGS_SET(node->cgroup_realized_mask, mask))
+                        return node->cgroup_path;
+        }
+        return NULL;
+}
+
+static const char *migrate_callback(CGroupMask mask, void *userdata) {
+        Unit *u = userdata;
+
+        /* Nothing realized (may happen when upgrading from an older version that didn't clean up)? Then use root, i.e. "" */
+        return strempty(unit_get_realized_cgroup_path(u, mask));
+}
+
+int unit_default_cgroup_path(const Unit *u, char **ret) {
+        _cleanup_free_ char *slice_dir = NULL, *leaf = NULL;
+        char *path;
+        int r;
+
+        /* Builds the default cgroup path of a unit and returns it, newly allocated, in *ret: the manager's cgroup
+         * root for the root slice, otherwise root + parent slice path (unless that is the root slice) + escaped name. */
+        assert(u);
+        assert(ret);
+
+        if (!unit_has_name(u, SPECIAL_ROOT_SLICE)) {
+                Unit *parent = UNIT_GET_SLICE(u);
+
+                if (parent && !unit_has_name(parent, SPECIAL_ROOT_SLICE)) {
+                        r = cg_slice_to_path(parent->id, &slice_dir);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = cg_escape(u->id, &leaf);
+                if (r < 0)
+                        return r;
+
+                path = path_join(empty_to_root(u->manager->cgroup_root), slice_dir, leaf);
+        } else
+                path = strdup(u->manager->cgroup_root);
+        if (!path)
+                return -ENOMEM;
+
+        *ret = path;
+        return 0;
+}
+
+int unit_set_cgroup_path(Unit *u, const char *path) {
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        /* Switches the unit over to the cgroup path 'path' (NULL for none). Returns 0 if the path
+         * already had that value, 1 if it was changed, and a negative error code on failure. */
+
+        assert(u);
+
+        if (streq_ptr(u->cgroup_path, path))
+                return 0;
+
+        if (path) {
+                copy = strdup(path);
+                if (!copy)
+                        return -ENOMEM;
+                /* Register the new path in the manager's path-to-unit map before releasing the old cgroup */
+                r = hashmap_put(u->manager->cgroup_unit, copy, u);
+                if (r < 0)
+                        return r;
+        }
+
+        unit_release_cgroup(u);
+        u->cgroup_path = TAKE_PTR(copy);
+        return 1;
+}
+
+int unit_watch_cgroup(Unit *u) {
+        _cleanup_free_ char *events_path = NULL;
+        int r;
+
+        /* Installs an inotify watch on the "cgroup.events" attribute of this unit's cgroup, so that
+         * changes to it (such as the cgroup running empty) are noticed. This only happens if the
+         * name=systemd hierarchy is unified. Returns 0 both when a watch was installed and when
+         * there was nothing to do, negative on failure. */
+
+        assert(u);
+
+        /* Nothing to watch without a cgroup, and nothing to do if we are watching already */
+        if (!u->cgroup_path || u->cgroup_control_inotify_wd >= 0)
+                return 0;
+
+        /* Only applies to the unified hierarchy */
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
+        if (r == 0)
+                return 0;
+
+        /* There is no point in watching the top-level slice, it is never going to run empty */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return 0;
+
+        /* Make sure the map from watch descriptor to unit exists before we register anything */
+        r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events_path);
+        if (r < 0)
+                return log_oom();
+
+        u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events_path, IN_MODIFY);
+        if (u->cgroup_control_inotify_wd < 0) {
+                /* A cgroup directory that is already gone needs no tracking, that's not an error */
+                if (errno != ENOENT)
+                        return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+
+                return 0;
+        }
+
+        r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+
+        return 0;
+}
+
+int unit_watch_cgroup_memory(Unit *u) {
+        _cleanup_free_ char *events_path = NULL;
+        CGroupContext *ctx;
+        int r;
+
+        /* Installs an inotify watch on the "memory.events" attribute of this unit's cgroup, so
+         * that "oom_kill" events are noticed. This only happens when running fully on the unified
+         * hierarchy. Returns 0 both when a watch was installed and when there was nothing to do,
+         * negative on failure. */
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return 0;
+
+        /* The "memory.events" attribute is only available if the memory controller is on. Hence
+         * tie this to memory accounting; in a way watching for OOM kills is a form of memory
+         * accounting after all. Units without a cgroup context are skipped as well. */
+        ctx = unit_get_cgroup_context(u);
+        if (!ctx || !ctx->memory_accounting)
+                return 0;
+
+        /* Don't watch inner nodes: the kernel currently doesn't report oom_kill events
+         * recursively, and we don't want a log message for each parent cgroup of a process. */
+        if (u->type == UNIT_SLICE)
+                return 0;
+
+        /* Already watching? Then there is nothing to do */
+        if (u->cgroup_memory_inotify_wd >= 0)
+                return 0;
+
+        /* Only applies to the unified hierarchy */
+        r = cg_all_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
+        if (r == 0)
+                return 0;
+
+        /* Make sure the map from watch descriptor to unit exists before we register anything */
+        r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events_path);
+        if (r < 0)
+                return log_oom();
+
+        u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events_path, IN_MODIFY);
+        if (u->cgroup_memory_inotify_wd < 0) {
+                /* A cgroup directory that is already gone needs no tracking, that's not an error */
+                if (errno != ENOENT)
+                        return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+
+                return 0;
+        }
+
+        r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+
+        return 0;
+}
+
+int unit_pick_cgroup_path(Unit *u) {
+        _cleanup_free_ char *picked = NULL;
+        int r;
+
+        /* Assigns the default cgroup path to the unit, unless it has a path already. Returns 0 on
+         * success, -EINVAL for units without cgroup context, or another negative error code. */
+        assert(u);
+
+        if (u->cgroup_path)
+                return 0;
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return -EINVAL;
+
+        r = unit_default_cgroup_path(u, &picked);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
+
+        r = unit_set_cgroup_path(u, picked);
+        if (r >= 0)
+                return 0;
+        if (r == -EEXIST)
+                return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(picked));
+        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(picked));
+}
+
+static int unit_update_cgroup(
+                Unit *u,
+                CGroupMask target_mask, /* used to create the group; recorded as the realized mask */
+                CGroupMask enable_mask, /* passed to cg_enable_everywhere() */
+                ManagerState state) {
+        /* Creates/updates the unit's cgroup and applies its settings; only path picking and creation failures are returned. */
+        bool created, is_root_slice;
+        CGroupMask migrate_mask = 0;
+        _cleanup_free_ char *cgroup_full_path = NULL;
+        int r;
+
+        assert(u);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return 0;
+
+        /* Figure out our cgroup path */
+        r = unit_pick_cgroup_path(u);
+        if (r < 0)
+                return r;
+
+        /* First, create our own group */
+        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
+        created = r; /* NOTE(review): presumably non-zero iff the group was newly created - confirm */
+
+        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+                uint64_t cgroup_id = 0; /* stays 0 if the lookups below fail */
+
+                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path);
+                if (r == 0) {
+                        r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
+                        if (r < 0)
+                                log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                                    "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
+                } else
+                        log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+                u->cgroup_id = cgroup_id;
+        }
+
+        /* Start watching it (best effort: the return values are deliberately ignored) */
+        (void) unit_watch_cgroup(u);
+        (void) unit_watch_cgroup_memory(u);
+
+        /* For v2 we preserve enabled controllers in delegated units, adjust others,
+         * for v1 we figure out which controller hierarchies need migration. */
+        if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
+                CGroupMask result_mask = 0;
+
+                /* Enable all controllers we need */
+                r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+                /* Remember what's actually enabled now */
+                u->cgroup_enabled_mask = result_mask;
+
+                migrate_mask = u->cgroup_realized_mask ^ target_mask; /* bits that differ between old realized mask and new target */
+        }
+
+        /* Keep track that this is now realized */
+        u->cgroup_realized = true;
+        u->cgroup_realized_mask = target_mask;
+
+        /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
+         *
+         * Unnecessary controller cgroups are trimmed (after emptied by upward migration).
+         * We perform migration also with whole slices for cases when users don't care about leaf
+         * granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing
+         * delegated units.
+         */
+        if (cg_all_unified() == 0) {
+                r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+                is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+                r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path));
+        }
+
+        /* Set attributes */
+        cgroup_context_apply(u, target_mask, state);
+        cgroup_xattr_apply(u);
+
+        /* For most units we expect that memory monitoring is set up before the unit is started and we won't
+         * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
+         * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
+         * try to open the memory pressure interface anew. */
+        if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+                (void) manager_setup_memory_pressure_event_source(u->manager);
+
+        return 0;
+}
+
+static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        char *relative;
+        int r;
+
+        /* Issues an AttachProcessesToUnit() call over the manager's system bus connection, asking
+         * for 'pid' to be moved into this unit's cgroup (or into the sub-cgroup 'suffix_path' of
+         * it). Refused with -EINVAL when we are the system manager. Returns 0 on success. */
+        assert(u);
+
+        if (MANAGER_IS_SYSTEM(u->manager))
+                return -EINVAL;
+        if (!u->manager->system_bus)
+                return -EIO;
+        if (!u->cgroup_path)
+                return -EINVAL;
+
+        /* Express the unit's cgroup path relative to our cgroup root, with the suffix appended */
+        relative = path_startswith(u->cgroup_path, u->manager->cgroup_root);
+        if (!relative)
+                return -EINVAL;
+
+        relative = strjoina("/", relative, suffix_path);
+        path_simplify(relative);
+
+        /* The unit name argument is NULL: an empty unit name means the client's own unit, i.e. us */
+        r = bus_call_method(u->manager->system_bus,
+                            bus_systemd_mgr, "AttachProcessesToUnit",
+                            &bus_error, NULL,
+                            "ssau", NULL, relative, 1, (uint32_t) pid);
+        if (r >= 0)
+                return 0;
+
+        return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&bus_error, r));
+}
+
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
+        _cleanup_free_ char *joined = NULL; /* backing storage for p when a suffix path is appended */
+        CGroupMask delegated_mask;
+        const char *p; /* the cgroup path the PIDs shall be moved to */
+        PidRef *pid;
+        int ret, r; /* ret: number of PIDs attached so far, or the first attach error once one occurred */
+
+        assert(u);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return -EINVAL;
+
+        if (set_isempty(pids))
+                return 0;
+
+        /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
+         * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
+        r = bpf_firewall_load_custom(u);
+        if (r < 0)
+                return r;
+
+        r = unit_realize_cgroup(u);
+        if (r < 0)
+                return r;
+
+        if (isempty(suffix_path))
+                p = u->cgroup_path;
+        else {
+                joined = path_join(u->cgroup_path, suffix_path);
+                if (!joined)
+                        return -ENOMEM;
+
+                p = joined;
+        }
+
+        delegated_mask = unit_get_delegate_mask(u);
+
+        ret = 0;
+        SET_FOREACH(pid, pids) {
+
+                /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
+                 * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
+                 * before we use it */
+                r = pidref_verify(pid);
+                if (r < 0) {
+                        log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p));
+                        continue;
+                }
+
+                /* First, attach the PID to the main cgroup hierarchy */
+                r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid->pid);
+                if (r < 0) {
+                        bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(r);
+
+                        log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO,  r,
+                                            "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
+                                            pid->pid, again ? " directly" : "", empty_to_root(p));
+
+                        if (again) {
+                                int z;
+
+                                /* If we are in a user instance, and we can't move the process ourselves due
+                                 * to permission problems, let's ask the system instance about it instead.
+                                 * Since it's more privileged it might be able to move the process across the
+                                 * leaves of a subtree whose top node is not owned by us. */
+
+                                z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path);
+                                if (z < 0)
+                                        log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p));
+                                else {
+                                        if (ret >= 0)
+                                                ret++; /* Count successful additions */
+                                        continue; /* When the move worked via the bus we are fully done for this PID. */
+                                }
+                        }
+
+                        if (ret >= 0)
+                                ret = r; /* Remember first error */
+
+                        continue;
+                } else if (ret >= 0)
+                        ret++; /* Count successful additions */
+
+                r = cg_all_unified();
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        continue;
+
+                /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
+                 * innermost realized one */
+
+                for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                        const char *realized;
+
+                        if (!(u->manager->cgroup_supported & bit))
+                                continue;
+
+                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
+                        if (delegated_mask & u->cgroup_realized_mask & bit) {
+                                r = cg_attach(cgroup_controller_to_string(c), p, pid->pid);
+                                if (r >= 0)
+                                        continue; /* Success! */
+
+                                log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
+                                                     pid->pid, empty_to_root(p), cgroup_controller_to_string(c));
+                        }
+
+                        /* So this controller is either not delegated or not realized, or something else weird happened. In
+                         * that case let's attach the PID at least to the closest cgroup up the tree that is
+                         * realized. */
+                        realized = unit_get_realized_cgroup_path(u, bit);
+                        if (!realized)
+                                continue; /* Not even realized in the root slice? Then let's not bother */
+
+                        r = cg_attach(cgroup_controller_to_string(c), realized, pid->pid);
+                        if (r < 0)
+                                log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
+                                                     pid->pid, realized, cgroup_controller_to_string(c));
+                }
+        }
+
+        return ret; /* < 0: first error of a main-hierarchy attach, otherwise the number of PIDs attached */
+}
+
+static bool unit_has_mask_realized(
+                Unit *u,
+                CGroupMask target_mask,
+                CGroupMask enable_mask) {
+
+        CGroupMask v1_mismatch, v2_mismatch;
+
+        assert(u);
+
+        /* Tells whether this unit is fully realized, which requires all of the following:
+         *
+         * 1. The cgroup was created at all
+         * 2. The realized hierarchies match target_mask (in case of cgroup v1)
+         * 3. The enabled controllers match enable_mask (in case of cgroup v2)
+         * 4. The invalidation mask is currently zero
+         *
+         * Why the masking with CGROUP_MASK_V1/CGROUP_MASK_V2? There are three sets of bits: real cgroup v1
+         * controllers, real cgroup v2 controllers and the BPF-based pseudo-controllers (CGROUP_MASK_BPF).
+         * cgroup_realized_mask only matters for v1 controllers and cgroup_enabled_mask is only used for v2, so
+         * differences in the remaining bits are of no interest. (The BPF pseudo-controllers never show up in
+         * cgroup.subtree_control, which is what cgroup_enabled_mask tracks.) */
+
+        v1_mismatch = (u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1;
+        v2_mismatch = (u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2;
+
+        return u->cgroup_realized && v1_mismatch == 0 && v2_mismatch == 0 && u->cgroup_invalidated_mask == 0;
+}
+
+static bool unit_has_mask_disables_realized(
+                Unit *u,
+                CGroupMask target_mask,
+                CGroupMask enable_mask) {
+
+        assert(u);
+
+        /* Check used when walking the tree in the disabling direction. A cgroup that was never realized has
+         * nothing to disable. Otherwise only the v1 bits of target_mask and the v2 bits of enable_mask are
+         * compared with the current masks; unlike unit_has_mask_realized() no exact match is required. */
+        if (!u->cgroup_realized)
+                return true;
+
+        return FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
+                FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2);
+}
+
+static bool unit_has_mask_enables_realized(
+                Unit *u,
+                CGroupMask target_mask,
+                CGroupMask enable_mask) {
+
+        assert(u);
+
+        /* Check used when walking the tree in the enabling direction: true if the cgroup is realized and every
+         * requested v1 controller (target_mask) and v2 controller (enable_mask) is already part of the current
+         * masks. Unlike unit_has_mask_realized(), controllers that are merely surplus do not matter here. */
+        if (!u->cgroup_realized)
+                return false;
+
+        return (target_mask & CGROUP_MASK_V1 & ~u->cgroup_realized_mask) == 0 &&
+                (enable_mask & CGROUP_MASK_V2 & ~u->cgroup_enabled_mask) == 0;
+}
+
+void unit_add_to_cgroup_realize_queue(Unit *u) {
+        assert(u);
+
+        /* Appends the unit to the manager's cgroup realize queue, unless it is queued already. */
+        if (!u->in_cgroup_realize_queue) {
+                LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+                u->in_cgroup_realize_queue = true;
+        }
+}
+
+static void unit_remove_from_cgroup_realize_queue(Unit *u) {
+        assert(u);
+
+        /* Takes the unit off the manager's cgroup realize queue again, if it is on it. */
+        if (u->in_cgroup_realize_queue) {
+                LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+                u->in_cgroup_realize_queue = false;
+        }
+}
+
+/* Realizes u and, before that, all slices above it, but only ever adds controllers: enabling has to proceed
+ * from the top of the hierarchy downwards to the unit in question. Returns 0 on success, < 0 on failure. */
+static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
+        CGroupMask wanted_target, wanted_enable;
+        Unit *parent;
+        int r;
+
+        assert(u);
+
+        /* Ancestors go first: a controller cannot be turned on at this layer before the layer above us has
+         * been dealt with. */
+        parent = UNIT_GET_SLICE(u);
+        if (parent) {
+                r = unit_realize_cgroup_now_enable(parent, state);
+                if (r < 0)
+                        return r;
+        }
+
+        wanted_target = unit_get_target_mask(u);
+        wanted_enable = unit_get_enable_mask(u);
+
+        /* Everything we want is enabled already? Then we are done; disabling is not our business here. */
+        if (unit_has_mask_enables_realized(u, wanted_target, wanted_enable))
+                return 0;
+
+        /* Extend the current masks by what is wanted, never shrink them. */
+        return unit_update_cgroup(u,
+                                  u->cgroup_realized_mask | wanted_target,
+                                  u->cgroup_enabled_mask | wanted_enable,
+                                  state);
+}
+
+/* Controllers can only be disabled depth-first, from the leaves of the hierarchy upwards to the unit in question.
+ * Does nothing for non-slice units. Returns < 0 only if updating a unit visited at this level fails. */
+static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
+        Unit *m;
+
+        assert(u);
+
+        if (u->type != UNIT_SLICE)
+                return 0;
+
+        UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
+                CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+                int r;
+
+                /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
+                 * holding any controllers open anyway. */
+                if (!m->cgroup_realized)
+                        continue;
+
+                /* We must disable those below us first in order to release the controller. Errors from the
+                 * deeper levels are deliberately ignored here. */
+                if (m->type == UNIT_SLICE)
+                        (void) unit_realize_cgroup_now_disable(m, state);
+
+                target_mask = unit_get_target_mask(m);
+                enable_mask = unit_get_enable_mask(m);
+
+                /* We can only disable in this direction, don't try to enable anything. */
+                if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
+                        continue;
+
+                new_target_mask = m->cgroup_realized_mask & target_mask; /* keep only bits that are both current and wanted */
+                new_enable_mask = m->cgroup_enabled_mask & enable_mask;
+
+                r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Check if necessary controllers and attributes for a unit are in place.
+ *
+ * - If so, do nothing.
+ * - If not, create paths, move processes over, and set attributes.
+ *
+ * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
+ * a depth-first way. As such the process looks like this:
+ *
+ * Suppose we have a cgroup hierarchy which looks like this:
+ *
+ *             root
+ *            /    \
+ *           /      \
+ *          /        \
+ *         a          b
+ *        / \        / \
+ *       /   \      /   \
+ *      c     d    e     f
+ *     / \   / \  / \   / \
+ *     h i   j k  l m   n o
+ *
+ * 1. We want to realise cgroup "d" now.
+ * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
+ * 3. cgroup "k" just started requesting the memory controller.
+ *
+ * To make this work we must do the following in order:
+ *
+ * 1. Disable CPU controller in k, j
+ * 2. Disable CPU controller in d
+ * 3. Enable memory controller in root
+ * 4. Enable memory controller in a
+ * 5. Enable memory controller in d
+ * 6. Enable memory controller in k
+ *
+ * Notice that we need to touch j in one direction, but not the other. We also
+ * don't go beyond d when disabling -- it's up to "a" to get realized if it
+ * wants to disable further. The basic rules are therefore:
+ *
+ * - If you're disabling something, you need to realise all of the cgroups from
+ *   your recursive descendants to the root. This starts from the leaves.
+ * - If you're enabling something, you need to realise from the root cgroup
+ *   downwards, but you don't need to iterate your recursive descendants.
+ *
+ * Returns 0 on success and < 0 on failure. */
+static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
+        CGroupMask target_mask, enable_mask;
+        Unit *slice;
+        int r;
+
+        assert(u);
+
+        unit_remove_from_cgroup_realize_queue(u); /* we deal with the unit right here, drop any queued request */
+
+        target_mask = unit_get_target_mask(u);
+        enable_mask = unit_get_enable_mask(u);
+
+        if (unit_has_mask_realized(u, target_mask, enable_mask))
+                return 0; /* already fully realized, nothing to do */
+
+        /* Disable controllers below us, if there are any */
+        r = unit_realize_cgroup_now_disable(u, state);
+        if (r < 0)
+                return r;
+
+        /* Enable controllers above us, if there are any */
+        slice = UNIT_GET_SLICE(u);
+        if (slice) {
+                r = unit_realize_cgroup_now_enable(slice, state);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Now actually deal with the cgroup we were trying to realise and set attributes */
+        r = unit_update_cgroup(u, target_mask, enable_mask, state);
+        if (r < 0)
+                return r;
+
+        /* Now, reset the invalidation mask (only reached if the update above succeeded) */
+        u->cgroup_invalidated_mask = 0;
+        return 0;
+}
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
+        unsigned n_attempted = 0; /* units we tried to realize, whether that worked or not */
+        ManagerState state;
+        Unit *head;
+        int r;
+
+        assert(m);
+
+        state = manager_state(m);
+
+        for (head = m->cgroup_realize_queue; head; head = m->cgroup_realize_queue) {
+                assert(head->in_cgroup_realize_queue);
+
+                /* Maybe things changed, and the unit is not actually active anymore? Then just dequeue it. */
+                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(head))) {
+                        unit_remove_from_cgroup_realize_queue(head);
+                        continue;
+                }
+
+                r = unit_realize_cgroup_now(head, state);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", head->id);
+
+                n_attempted++;
+        }
+
+        return n_attempted;
+}
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u) {
+        assert(u);
+        assert(u->type == UNIT_SLICE);
+
+        /* Family of a unit is defined as the (immediate) children of the unit and the immediate children of
+         * all its ancestors.
+         *
+         * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
+         * very weird if two units that own processes reside in the same slice, but one is realized in the
+         * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
+         * not), because that means individual processes need to be scheduled against whole cgroups. Let's
+         * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
+         * controller hierarchies too (if unit requires the controller to be realized).
+         *
+         * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
+         * masks. */
+
+        do {
+                Unit *m;
+
+                /* Children of u likely changed when we're called */
+                u->cgroup_members_mask_valid = false;
+
+                UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
+
+                        /* No point in doing cgroup application for units without active processes. */
+                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
+                                continue;
+
+                        /* We only enqueue siblings if they were realized once at least, in the main
+                         * hierarchy. */
+                        if (!m->cgroup_realized)
+                                continue;
+
+                        /* If the unit doesn't need any new controllers and has current ones
+                         * realized, it doesn't need any changes. */
+                        if (unit_has_mask_realized(m,
+                                                   unit_get_target_mask(m),
+                                                   unit_get_enable_mask(m)))
+                                continue;
+
+                        unit_add_to_cgroup_realize_queue(m);
+                }
+
+                /* Parent comes after children */
+                unit_add_to_cgroup_realize_queue(u);
+
+                u = UNIT_GET_SLICE(u); /* continue with the next slice up, until there is none */
+        } while (u);
+}
+
+int unit_realize_cgroup(Unit *u) {
+        Unit *parent;
+
+        assert(u);
+
+        /* Realizing the cgroup of a unit takes more than creating it and its parents: for the weight-based
+         * controllers all siblings (i.e. the units in the same slice as we are) need cgroups, too. And when
+         * a controller drops out of the realized set, it may become unnecessary in siblings and ancestors,
+         * which then should be (de)realized as well.
+         *
+         * Hence: the work on siblings and derealized ancestors is queued for the next event loop iteration,
+         * while the parent cgroups and our own one are set up synchronously (unit_realize_cgroup_now). Units
+         * without a cgroup context are left alone, and that is not an error. */
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return 0;
+
+        parent = UNIT_GET_SLICE(u);
+        if (parent)
+                unit_add_family_to_cgroup_realize_queue(parent);
+
+        /* And now realize the unit itself, applying the values right away */
+        return unit_realize_cgroup_now(u, manager_state(u->manager));
+}
+
+void unit_release_cgroup(Unit *u) {
+        assert(u);
+
+        /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
+         * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
+
+        if (u->cgroup_path) {
+                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); /* drop the path → unit mapping first */
+                u->cgroup_path = mfree(u->cgroup_path);
+        }
+
+        if (u->cgroup_control_inotify_wd >= 0) { /* a negative descriptor means there is no watch */
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
+                u->cgroup_control_inotify_wd = -1;
+        }
+
+        if (u->cgroup_memory_inotify_wd >= 0) { /* same procedure for the memory watch */
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
+                u->cgroup_memory_inotify_wd = -1;
+        }
+}
+
+bool unit_maybe_release_cgroup(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return true;
+
+        /* Only release the cgroup once no processes are left in it. If processes exit late (e.g. they were
+         * in D-state and went away after the unit was marked as failed), the manager must keep tracking the
+         * cgroup path so that the later notification can still be matched and the cgroup cleaned up. */
+        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+        if (r < 0) {
+                log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
+                return false;
+        }
+        if (r != 1)
+                return false;
+
+        unit_release_cgroup(u);
+        return true;
+}
+
+void unit_prune_cgroup(Unit *u) {
+        int r;
+        bool is_root_slice;
+
+        assert(u);
+
+        /* Removes the cgroup, if empty and possible, and stops watching it. */
+
+        if (!u->cgroup_path)
+                return;
+
+        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
+
+#if BPF_FRAMEWORK
+        (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
+#endif
+
+        unit_modify_nft_set(u, /* add = */ false);
+
+        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+
+        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice); /* NOTE(review): last argument presumably controls removal of the cgroup itself — confirm */
+        if (r < 0)
+                /* One reason we could have failed here is, that the cgroup still contains a process.
+                 * However, if the cgroup becomes removable at a later time, it might be removed when
+                 * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
+                 * that the cgroup is still realized the next time it is started. Do not return early
+                 * on error, continue cleanup. */
+                log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+        if (is_root_slice) /* for the root slice the tracked state is kept as it is */
+                return;
+
+        if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
+                return;
+
+        u->cgroup_realized = false; /* released: forget all realization state */
+        u->cgroup_realized_mask = 0;
+        u->cgroup_enabled_mask = 0;
+
+        u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed);
+}
+
+int unit_search_main_pid(Unit *u, PidRef *ret) {
+        _cleanup_(pidref_done) PidRef candidate = PIDREF_NULL;
+        _cleanup_fclose_ FILE *procs = NULL;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        /* Picks the main process among the processes listed for the unit's cgroup, if it is unambiguous. */
+        if (!u->cgroup_path)
+                return -ENXIO;
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &procs);
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_(pidref_done) PidRef next = PIDREF_NULL;
+
+                r = cg_read_pidref(procs, &next);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                /* Skip what we have seen already (cgroupfs reports duplicates!) and ignore processes further
+                 * down the tree, i.e. those that are not our own children. */
+                if (pidref_equal(&candidate, &next) || pidref_is_my_child(&next) <= 0)
+                        continue;
+
+                /* Dang, there's more than one daemonized PID in this group, so we don't know what process is
+                 * the main process. */
+                if (pidref_is_set(&candidate))
+                        return -ENODATA;
+
+                candidate = TAKE_PIDREF(next);
+        }
+
+        if (!pidref_is_set(&candidate))
+                return -ENODATA;
+
+        *ret = TAKE_PIDREF(candidate);
+        return 0;
+}
+
+static int unit_watch_pids_in_path(Unit *u, const char *path) {
+        _cleanup_closedir_ DIR *d = NULL; /* enumerates the subgroups of path */
+        _cleanup_fclose_ FILE *f = NULL; /* enumerates the processes listed for path itself */
+        int ret = 0, r; /* ret accumulates errors through RET_GATHER() while the walk continues */
+
+        assert(u);
+        assert(path);
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
+        if (r < 0)
+                RET_GATHER(ret, r);
+        else {
+                for (;;) {
+                        _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+
+                        r = cg_read_pidref(f, &pid);
+                        if (r == 0) /* no more entries */
+                                break;
+                        if (r < 0) {
+                                RET_GATHER(ret, r);
+                                break;
+                        }
+
+                        RET_GATHER(ret, unit_watch_pidref(u, &pid, /* exclusive= */ false));
+                }
+        }
+
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+        if (r < 0)
+                RET_GATHER(ret, r);
+        else {
+                for (;;) {
+                        _cleanup_free_ char *fn = NULL, *p = NULL;
+
+                        r = cg_read_subgroup(d, &fn);
+                        if (r == 0) /* no more entries */
+                                break;
+                        if (r < 0) {
+                                RET_GATHER(ret, r);
+                                break;
+                        }
+
+                        p = path_join(empty_to_root(path), fn);
+                        if (!p)
+                                return -ENOMEM; /* the one failure that aborts the walk right away */
+
+                        RET_GATHER(ret, unit_watch_pids_in_path(u, p)); /* descend into the subgroup */
+                }
+        }
+
+        return ret;
+}
+
+int unit_synthesize_cgroup_empty_event(Unit *u) {
+        int unified;
+
+        assert(u);
+
+        /* Enqueues a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is
+         * compatibility support for non-unified systems where notifications aren't reliable, so that we have
+         * to take whatever we can get as notification source as soon as we stopped having any useful PIDs to
+         * watch for. Returns -ENOENT if the unit has no cgroup path. */
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        unified = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (unified < 0)
+                return unified;
+
+        /* On unified we have reliable notifications, and don't need this. On legacy, act only once the set
+         * of watched PIDs has run empty. */
+        if (unified == 0 && set_isempty(u->pids))
+                unit_add_to_cgroup_empty_queue(u);
+
+        return 0;
+}
+
+int unit_watch_all_pids(Unit *u) {
+        int unified;
+
+        assert(u);
+
+        /* Adds all PIDs from our cgroup to the set of PIDs we watch. This is a fallback logic for cases
+         * where we do not get reliable cgroup empty notifications: we try to use SIGCHLD as replacement.
+         * Returns -ENOENT if the unit has no cgroup path, and otherwise whatever the cgroup queries or the
+         * walk through the cgroup report. */
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        unified = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (unified < 0)
+                return unified;
+
+        /* On unified we can use proper notifications, hence there is nothing to do then; only the legacy
+         * setup needs the walk through the cgroup. */
+        return unified > 0 ? 0 : unit_watch_pids_in_path(u, u->cgroup_path);
+}
+
+static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *u;
+        int r;
+
+        assert(s);
+
+        u = m->cgroup_empty_queue; /* one unit, the head of the queue, is handled per dispatch */
+        if (!u)
+                return 0;
+
+        assert(u->in_cgroup_empty_queue);
+        u->in_cgroup_empty_queue = false;
+        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
+
+        if (m->cgroup_empty_queue) {
+                /* More stuff queued, let's make sure we remain enabled */
+                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
+        }
+
+        /* Update state based on OOM kills before we notify about cgroup empty event */
+        (void) unit_check_oom(u);
+        (void) unit_check_oomd_kill(u);
+
+        unit_add_to_gc_queue(u);
+
+        if (IN_SET(unit_active_state(u), UNIT_INACTIVE, UNIT_FAILED))
+                unit_prune_cgroup(u); /* unit is inactive or failed: clean up the cgroup right away */
+        else if (UNIT_VTABLE(u)->notify_cgroup_empty)
+                UNIT_VTABLE(u)->notify_cgroup_empty(u); /* otherwise let the unit type react, if it has a hook */
+
+        return 0;
+}
+
+void unit_add_to_cgroup_empty_queue(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Note that there are four different ways how cgroup empty events reach us:
+         *
+         * 1. On the unified hierarchy we get an inotify event on the cgroup
+         *
+         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
+         *
+         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
+         *
+         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
+         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
+         *
+         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
+         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
+         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
+         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
+         * case for scope units). */
+
+        if (u->in_cgroup_empty_queue)
+                return;
+
+        /* Let's verify that the cgroup is really empty */
+        if (!u->cgroup_path)
+                return;
+
+        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+        if (r < 0) {
+                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
+                return;
+        }
+        if (r == 0) /* not empty after all, hence nothing to queue */
+                return;
+
+        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+        u->in_cgroup_empty_queue = true;
+
+        /* Trigger the defer event; the unit stays queued even if enabling the event source fails */
+        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
+}
+
+static void unit_remove_from_cgroup_empty_queue(Unit *u) {
+        assert(u);
+
+        /* Takes the unit off the manager's cgroup empty queue again, if it is on it. */
+        if (u->in_cgroup_empty_queue) {
+                LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+                u->in_cgroup_empty_queue = false;
+        }
+}
+
+int unit_check_oomd_kill(Unit *u) {
+        _cleanup_free_ char *value = NULL;
+        bool increased;
+        uint64_t n = 0; /* parsed xattr value; stays 0 if the xattr is absent or empty */
+        int r;
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
+        else if (r == 0) /* only checked when everything is on the unified hierarchy */
+                return 0;
+
+        r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_ooms", &value);
+        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                return r;
+
+        if (!isempty(value)) {
+                 r = safe_atou64(value, &n);
+                 if (r < 0)
+                         return r;
+        }
+
+        increased = n > u->managed_oom_kill_last; /* compare with the counter value seen on the previous call */
+        u->managed_oom_kill_last = n;
+
+        if (!increased)
+                return 0;
+
+        n = 0; /* from here on n is reused for the value of the second xattr */
+        value = mfree(value);
+        r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_kill", &value);
+        if (r >= 0 && !isempty(value))
+                (void) safe_atou64(value, &n); /* best effort: on failure we log the generic message below */
+
+        if (n > 0)
+                log_unit_struct(u, LOG_NOTICE,
+                                "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+                                LOG_UNIT_INVOCATION_ID(u),
+                                LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
+                                "N_PROCESSES=%" PRIu64, n);
+        else
+                log_unit_struct(u, LOG_NOTICE,
+                                "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+                                LOG_UNIT_INVOCATION_ID(u),
+                                LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
+
+        unit_notify_cgroup_oom(u, /* ManagedOOM= */ true);
+
+        return 1; /* the counter increased since the last call and this was reported */
+}
+
+int unit_check_oom(Unit *u) {
+        _cleanup_free_ char *oom_kill = NULL;
+        uint64_t c = 0;
+        bool increased;
+        int r;
+
+        /* Compares the oom_kill counter from the cgroup's memory.events with the value seen last time.
+         * Returns 1 if it grew (logged, unit notified), 0 otherwise, negative errno on failure. */
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
+        if (r >= 0) {
+                r = safe_atou64(oom_kill, &c);
+                if (r < 0)
+                        return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
+        } else if (!IN_SET(r, -ENOENT, -ENXIO)) /* A missing cgroup or oom_kill attribute counts as zero */
+                return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
+
+        increased = c > u->oom_kill_last;
+        u->oom_kill_last = c;
+
+        if (!increased)
+                return 0;
+
+        log_unit_struct(u, LOG_NOTICE,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
+                        LOG_UNIT_INVOCATION_ID(u),
+                        LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
+
+        unit_notify_cgroup_oom(u, /* ManagedOOM= */ false);
+
+        return 1;
+}
+
+static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *unit;
+
+        assert(s);
+
+        /* Handles one queued unit per dispatch, and re-enables itself as long as more units are queued. */
+        unit = m->cgroup_oom_queue;
+        if (!unit)
+                return 0;
+
+        assert(unit->in_cgroup_oom_queue);
+        unit->in_cgroup_oom_queue = false;
+        LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, unit);
+
+        if (m->cgroup_oom_queue) {
+                /* The queue is not drained yet, so ask to be dispatched once more */
+                int r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
+        }
+
+        (void) unit_check_oom(unit);
+        unit_add_to_gc_queue(unit);
+
+        return 0;
+}
+
+static void unit_add_to_cgroup_oom_queue(Unit *u) {
+        Manager *m;
+        int r;
+
+        assert(u);
+
+        /* Queues the unit for an OOM check, creating the manager's defer event source on first use. */
+        if (u->in_cgroup_oom_queue || !u->cgroup_path)
+                return;
+
+        m = u->manager;
+        LIST_PREPEND(cgroup_oom_queue, m->cgroup_oom_queue, u);
+        u->in_cgroup_oom_queue = true;
+
+        /* Trigger the defer event */
+        if (!m->cgroup_oom_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
+
+                r = sd_event_add_defer(m->event, &source, on_cgroup_oom_event, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to create cgroup oom event source: %m");
+                        return;
+                }
+
+                r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-8);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
+                        return;
+                }
+                (void) sd_event_source_set_description(source, "cgroup-oom");
+                m->cgroup_oom_event_source = TAKE_PTR(source);
+        }
+
+        r = sd_event_source_set_enabled(m->cgroup_oom_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                log_error_errno(r, "Failed to enable cgroup oom event source: %m");
+}
+
+static int unit_check_cgroup_events(Unit *u) {
+        char *values[2] = {};
+        const char *populated, *frozen;
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+                                            STRV_MAKE("populated", "frozen"), values);
+        if (r < 0)
+                return r;
+
+        populated = values[0];
+        frozen = values[1];
+        /* Several cgroup.events notifications may be coalesced into one, so behave as if each state were
+         * seen for the first time. The handlers called below are idempotent, which in effect makes them
+         * remember the previous state. */
+        if (populated && streq(populated, "1"))
+                unit_remove_from_cgroup_empty_queue(u);
+        else if (populated)
+                unit_add_to_cgroup_empty_queue(u);
+
+        /* Ignore freezer state changes unless a freeze/thaw operation of ours is currently pending */
+        if (frozen && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
+                if (streq(frozen, "0"))
+                        unit_thawed(u);
+                else
+                        unit_frozen(u);
+        }
+
+        free(values[0]);
+        free(values[1]);
+        return 0;
+}
+
+static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+        assert(fd >= 0);
+
+        /* Drains the inotify fd and dispatches each event to the unit(s) registered for its watch
+         * descriptor. Only returns once read() fails: 0 on a transient error, negative errno otherwise. */
+
+        for (;;) {
+                union inotify_event_buffer buffer;
+                ssize_t l;
+
+                l = read(fd, &buffer, sizeof(buffer));
+                if (l < 0) {
+                        if (ERRNO_IS_TRANSIENT(errno))
+                                return 0;
+
+                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
+                }
+
+                FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
+                        Unit *control, *memory;
+
+                        /* Queue overflow events carry no watch descriptor, and IN_IGNORED merely tells us
+                         * that a watch was just removed: nothing to dispatch in either case. */
+                        if (e->wd < 0 || (e->mask & IN_IGNORED))
+                                continue;
+
+                        /* Note that inotify might deliver events for a watch even after it was removed,
+                         * because it was queued before the removal. Let's ignore this here safely. */
+
+                        control = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (control)
+                                unit_check_cgroup_events(control);
+
+                        memory = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (memory)
+                                unit_add_to_cgroup_oom_queue(memory);
+                }
+        }
+}
+
+static int cg_bpf_mask_supported(CGroupMask *ret) {
+        CGroupMask supported = 0;
+        int r;
+
+        /* Probes each BPF-based pseudo-controller in turn and collects the available ones into a mask.
+         * The first probe that fails aborts the whole operation with its error, in which case *ret is
+         * left untouched. Returns 0 on success. */
+
+        r = bpf_firewall_supported(); /* BPF-based firewall */
+        if (r < 0)
+                return r;
+        if (r > 0)
+                supported |= CGROUP_MASK_BPF_FIREWALL;
+
+        r = bpf_devices_supported(); /* BPF-based device access control */
+        if (r < 0)
+                return r;
+        if (r > 0)
+                supported |= CGROUP_MASK_BPF_DEVICES;
+
+        r = bpf_foreign_supported(); /* BPF pinned prog */
+        if (r < 0)
+                return r;
+        if (r > 0)
+                supported |= CGROUP_MASK_BPF_FOREIGN;
+
+        r = bpf_socket_bind_supported(); /* BPF-based bind{4|6} hooks */
+        if (r < 0)
+                return r;
+        if (r > 0)
+                supported |= CGROUP_MASK_BPF_SOCKET_BIND;
+
+        r = restrict_network_interfaces_supported(); /* BPF-based cgroup_skb/{egress|ingress} hooks */
+        if (r < 0)
+                return r;
+        if (r > 0)
+                supported |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
+
+        /* Only hand out the result once every probe went through */
+        *ret = supported;
+        return 0;
+}
+
+int manager_setup_cgroup(Manager *m) {
+        _cleanup_free_ char *path = NULL;
+        const char *scope_path;
+        int r, all_unified;
+        CGroupMask mask;
+        char *e;
+
+        assert(m);
+        /* Sets up cgroup handling for the manager. Returns 0 on success, negative errno on failure. */
+        /* 1. Determine hierarchy */
+        m->cgroup_root = mfree(m->cgroup_root);
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
+        if (r < 0)
+                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
+
+        /* Chop off the init scope, if we are already located in it */
+        e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+
+        /* LEGACY: Also chop off the system slice if we are in
+         * it. This is to support live upgrades from older systemd
+         * versions where PID 1 was moved there. Also see
+         * cg_get_root_path(). */
+        if (!e && MANAGER_IS_SYSTEM(m)) {
+                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
+                if (!e)
+                        e = endswith(m->cgroup_root, "/system"); /* even more legacy */
+        }
+        if (e)
+                *e = 0;
+
+        /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
+         * easily prepend it everywhere. */
+        delete_trailing_chars(m->cgroup_root, "/");
+
+        /* 2. Show data */
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
+        if (r < 0)
+                return log_error_errno(r, "Cannot find cgroup mount point: %m");
+
+        r = cg_unified();
+        if (r < 0)
+                return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
+
+        all_unified = cg_all_unified();
+        if (all_unified < 0)
+                return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
+        if (all_unified > 0)
+                log_debug("Unified cgroup hierarchy is located at %s.", path);
+        else {
+                r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
+                if (r > 0)
+                        log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
+                else
+                        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
+        }
+
+        /* 3. Allocate cgroup empty defer event source */
+        m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+        r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create cgroup empty event source: %m");
+
+        /* Schedule cgroup empty checks early, but after having processed service notification messages or
+         * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
+         * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
+        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
+
+        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
+
+        (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
+
+        /* 4. Install notifier inotify object, or agent */
+        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+
+                /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
+
+                m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
+                safe_close(m->cgroup_inotify_fd); /* result ignored: the field is reassigned right below */
+
+                m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+                if (m->cgroup_inotify_fd < 0)
+                        return log_error_errno(errno, "Failed to create control group inotify object: %m");
+
+                r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to watch control group inotify object: %m");
+
+                /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
+                 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
+                 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
+                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");
+
+                (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
+
+        } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
+
+                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
+                 * since it does not generate events when control groups with children run empty.) */
+
+                r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to install release agent, ignoring: %m");
+                else if (r > 0)
+                        log_debug("Installed release agent.");
+                else if (r == 0)
+                        log_debug("Release agent already installed.");
+        }
+
+        /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
+        scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
+        if (r >= 0) {
+                /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
+                r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
+                if (r < 0)
+                        log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
+
+                /* 6. And pin it, so that it cannot be unmounted */
+                safe_close(m->pin_cgroupfs_fd); /* result ignored: the field is reassigned right below */
+                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
+                if (m->pin_cgroupfs_fd < 0)
+                        return log_error_errno(errno, "Failed to open pin file: %m");
+
+        } else if (!MANAGER_IS_TEST_RUN(m)) /* in test runs, failing to create the scope is tolerated */
+                return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
+
+        /* 7. Always enable hierarchical support if it exists... */
+        if (!all_unified && !MANAGER_IS_TEST_RUN(m))
+                (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
+
+        /* 8. Figure out which controllers are supported */
+        r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine supported controllers: %m");
+
+        /* 9. Figure out which bpf-based pseudo-controllers are supported */
+        r = cg_bpf_mask_supported(&mask);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
+        m->cgroup_supported |= mask;
+
+        /* 10. Log which controllers are supported */
+        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
+                log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
+                          yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
+
+        return 0;
+}
+
+void manager_shutdown_cgroup(Manager *m, bool delete) {
+        assert(m);
+
+        /* Our own cgroup cannot really be removed while we still live in it, hence the most we can do when
+         * asked to delete it is trim it. */
+        if (delete && m->cgroup_root) {
+                if (!FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+                        (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
+        }
+
+        /* Release the event sources, watch descriptor maps and file descriptors used for cgroup tracking */
+        m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+        m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
+        m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
+        m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
+        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
+        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
+
+        m->cgroup_root = mfree(m->cgroup_root);
+}
+
+Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
+        char *prefix, *slash;
+        Unit *u;
+
+        assert(m);
+        assert(cgroup);
+
+        /* Finds the unit owning the given cgroup path: tries the full path first, then strips one trailing
+         * path component at a time, and finally settles for whatever is registered for the root slice. */
+
+        u = hashmap_get(m->cgroup_unit, cgroup);
+        if (u)
+                return u;
+
+        /* Work on a private copy, since the path is shortened in place */
+        prefix = strdupa_safe(cgroup);
+        while ((slash = strrchr(prefix, '/')) && slash != prefix) {
+                *slash = 0;
+
+                u = hashmap_get(m->cgroup_unit, prefix);
+                if (u)
+                        return u;
+        }
+
+        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
+}
+
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) {
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        assert(m);
+
+        /* Maps a process to a unit via its cgroup path; NULL if that path cannot be determined */
+        r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &path);
+        return r < 0 ? NULL : manager_get_unit_by_cgroup(m, path);
+}
+
+Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) {
+        Unit *single, **multiple;
+
+        assert(m);
+
+        /* Looks the process up in m->watch_pids first, then in m->watch_pids_more, whose values are arrays
+         * of units of which the first entry is returned. NULL if the pidref is unset or nothing matches. */
+
+        if (!pidref_is_set(pid))
+                return NULL;
+
+        single = hashmap_get(m->watch_pids, pid);
+        if (single)
+                return single;
+
+        multiple = hashmap_get(m->watch_pids_more, pid);
+        return multiple ? multiple[0] : NULL;
+}
+
+Unit *manager_get_unit_by_pidref(Manager *m, PidRef *pid) {
+        Unit *u;
+
+        assert(m);
+
+        /* A process may be owned by more than one unit, but only a single one is returned here. That is
+         * good enough for most cases, though not strictly correct. The unit reported by cgroup membership
+         * is preferred, since it is the most relevant one: children of the process will be assigned to
+         * that one, too, before all else. */
+
+        if (!pidref_is_set(pid))
+                return NULL;
+
+        /* For our own process answer with the init scope unit directly, without consulting cgroups */
+        if (pidref_is_self(pid))
+                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
+
+        /* A PID 1 that is not us is never attributed to a unit here */
+        if (pid->pid == 1)
+                return NULL;
+
+        /* cgroup membership takes precedence, the watched PIDs are merely the fallback */
+        u = manager_get_unit_by_pidref_cgroup(m, pid);
+        if (!u)
+                u = manager_get_unit_by_pidref_watching(m, pid);
+
+        return u;
+}
+
+Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
+        assert(m);
+
+        /* Convenience wrapper for callers that only have a numeric PID; invalid PIDs yield NULL */
+        if (pid_is_valid(pid))
+                return manager_get_unit_by_pidref(m, &PIDREF_MAKE_FROM_PID(pid));
+        return NULL;
+}
+
+int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
+        Unit *owner;
+
+        assert(m);
+        assert(cgroup);
+
+        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent
+         * process or from the --system instance. Returns 1 if the cgroup could be mapped to a unit (which is
+         * then handed to the cgroup empty queue logic), 0 otherwise. */
+
+        log_debug("Got cgroup empty notification for: %s", cgroup);
+
+        owner = manager_get_unit_by_cgroup(m, cgroup);
+        if (owner)
+                unit_add_to_cgroup_empty_queue(owner);
+
+        return !!owner;
+}
+
+int unit_get_memory_available(Unit *u, uint64_t *ret) {
+        uint64_t available = UINT64_MAX, current = 0;
+
+        assert(u);
+        assert(ret);
+
+        /* If data from cgroups can be accessed, try to find out how much more memory a unit can claim before
+         * hitting the configured cgroup limits (if any). Consider both MemoryHigh and MemoryMax, and also any
+         * slice the unit might be nested below. Returns 0 and stores the result in *ret (UINT64_MAX if no
+         * limit was found), or -ENODATA if a unit on the way up has no cgroup context. */
+        do {
+                uint64_t unit_available, unit_limit = UINT64_MAX;
+                CGroupContext *unit_context;
+
+                /* No point in continuing if we can't go any lower */
+                if (available == 0)
+                        break;
+
+                unit_context = unit_get_cgroup_context(u);
+                if (!unit_context)
+                        return -ENODATA;
+                /* Note: 'continue' in a do/while jumps to the loop condition, i.e. moves on to the slice */
+                if (!u->cgroup_path)
+                        continue;
+
+                (void) unit_get_memory_current(u, &current);
+                /* in case of error, previous current propagates as lower bound */
+
+                if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                        unit_limit = physical_memory();
+                else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX)
+                        continue;
+                unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high);
+
+                unit_available = LESS_BY(unit_limit, current);
+                available = MIN(unit_available, available);
+        } while ((u = UNIT_GET_SLICE(u)));
+
+        *ret = available;
+
+        return 0;
+}
+
+int unit_get_memory_current(Unit *u, uint64_t *ret) {
+        const char *attribute;
+        int r;
+
+        // FIXME: Merge this into unit_get_memory_accounting after support for cgroup v1 is dropped
+
+        assert(u);
+        assert(ret);
+
+        /* Reads the unit's current memory usage; -ENODATA if accounting is off or there is no cgroup */
+        if (!UNIT_CGROUP_BOOL(u, memory_accounting) || !u->cgroup_path)
+                return -ENODATA;
+
+        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+        if (unit_has_host_root_cgroup(u))
+                return procfs_memory_get_used(ret);
+
+        if (!(u->cgroup_realized_mask & CGROUP_MASK_MEMORY))
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+
+        attribute = r > 0 ? "memory.current" : "memory.usage_in_bytes";
+        return cg_get_attribute_as_uint64("memory", u->cgroup_path, attribute, ret);
+}
+
+int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
+
+        static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_MEMORY_PEAK]          = "memory.peak",
+                [CGROUP_MEMORY_SWAP_CURRENT]  = "memory.swap.current",
+                [CGROUP_MEMORY_SWAP_PEAK]     = "memory.swap.peak",
+                [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current",
+        };
+
+        bool fresh = false, cacheable;
+        uint64_t bytes;
+        int r;
+
+        assert(u);
+        assert(metric >= 0);
+        assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX);
+
+        /* Reads one memory metric of the unit's cgroup. Metrics up to
+         * _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST are cached in u->memory_accounting_last[], so that the
+         * last known value can still be reported when no fresh one is available. 'ret' may be NULL. */
+
+        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
+                return -ENODATA;
+
+        if (!u->cgroup_path)
+                /* If the cgroup is already gone, we try to find the last cached value. */
+                goto finish;
+
+        /* The root cgroup doesn't expose this information, and the memory controller must be realized. */
+        if (unit_has_host_root_cgroup(u) || !FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_MEMORY))
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -ENODATA;
+
+        r = cg_get_attribute_as_uint64("memory", u->cgroup_path, attributes_table[metric], &bytes);
+        if (r < 0 && r != -ENODATA)
+                return r;
+        fresh = r >= 0;
+
+finish:
+        cacheable = metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST;
+        if (fresh) {
+                if (cacheable)
+                        u->memory_accounting_last[metric] = bytes;
+        } else {
+                /* Without a fresh reading all we can offer is a previously cached value, if there is one */
+                if (!cacheable || u->memory_accounting_last[metric] == UINT64_MAX)
+                        return -ENODATA;
+
+                bytes = u->memory_accounting_last[metric];
+        }
+
+        if (ret)
+                *ret = bytes;
+
+        return 0;
+}
+
+int unit_get_tasks_current(Unit *u, uint64_t *ret) {
+        assert(u);
+        assert(ret);
+
+        /* Reads the number of tasks currently in the unit's cgroup; -ENODATA if tasks accounting is off,
+         * the unit has no cgroup, or the pids controller is not realized for it. */
+
+        if (!UNIT_CGROUP_BOOL(u, tasks_accounting) || !u->cgroup_path)
+                return -ENODATA;
+
+        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+        if (unit_has_host_root_cgroup(u))
+                return procfs_tasks_get_current(ret);
+
+        if (!(u->cgroup_realized_mask & CGROUP_MASK_PIDS))
+                return -ENODATA;
+
+        return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
+}
+
+static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
+        _cleanup_free_ char *val = NULL;
+        uint64_t us;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        /* Reads the CPU time consumed by the unit's cgroup, without subtracting the unit's base value. */
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+        if (unit_has_host_root_cgroup(u))
+                return procfs_cpu_get_usage(ret);
+
+        /* Requisite controllers for CPU accounting are not enabled */
+        if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0) /* Not fully unified: take the value from the cpuacct controller as is */
+                return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);
+
+        /* Unified hierarchy: cpu.stat carries the usage in usec, which is scaled up to nsec below */
+        r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
+        if (IN_SET(r, -ENOENT, -ENXIO))
+                return -ENODATA;
+        if (r < 0)
+                return r;
+
+        r = safe_atou64(val, &us);
+        if (r < 0)
+                return r;
+
+        *ret = us * NSEC_PER_USEC;
+
+        return 0;
+}
+
+int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
+        nsec_t raw, usage;
+        int r;
+
+        assert(u);
+
+        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit
+         * was started. If the cgroup has been removed already, returns the last cached value. To cache the
+         * value, simply call this function with a NULL return value. Returns 0 on success, -ENODATA if CPU
+         * accounting is off or neither a current nor a cached value is available, or another negative
+         * errno if reading the counter failed. */
+
+        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
+                return -ENODATA;
+
+        r = unit_get_cpu_usage_raw(u, &raw);
+        if (r < 0) {
+                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for
+                 * example), fall back to the cached value, provided there is one. */
+                if (r != -ENODATA || u->cpu_usage_last == NSEC_INFINITY)
+                        return r;
+
+                usage = u->cpu_usage_last;
+        } else {
+                /* Report usage relative to the base value, clamping at zero */
+                usage = raw > u->cpu_usage_base ? raw - u->cpu_usage_base : 0;
+
+                /* Remember the result, so that it can serve as the cached value later on */
+                u->cpu_usage_last = usage;
+        }
+
+        if (ret)
+                *ret = usage;
+
+        return 0;
+}
+
+int unit_get_ip_accounting(
+                Unit *u,
+                CGroupIPAccountingMetric metric,
+                uint64_t *ret) {
+
+        bool ingress, bytes;
+        uint64_t value;
+        int fd, r;
+
+        assert(u);
+        assert(metric >= 0);
+        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
+        assert(ret);
+
+        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
+                return -ENODATA;
+
+        /* The metric picks the map to read (ingress/egress) and the counter within it (bytes/packets) */
+        ingress = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS);
+        bytes = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES);
+
+        fd = ingress ? u->ip_accounting_ingress_map_fd : u->ip_accounting_egress_map_fd;
+        if (fd < 0)
+                return -ENODATA;
+
+        r = bpf_firewall_read_accounting(fd, bytes ? &value : NULL, bytes ? NULL : &value);
+        if (r < 0)
+                return r;
+
+        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
+         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
+         * ip_accounting_extra[] field, and add them in here transparently. */
+
+        *ret = value + u->ip_accounting_extra[metric];
+
+        return r;
+}
+
+static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
+        static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_IO_READ_BYTES]       = "rbytes=",
+                [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
+                [CGROUP_IO_READ_OPERATIONS]  = "rios=",
+                [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
+        };
+        uint64_t totals[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
+        _cleanup_free_ char *path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(u);
+
+        /* Parses the cgroup's io.stat and sums up, across all lines found there, the counters named in
+         * field_names[]. 'ret' is only written on success, i.e. never with partial results. */
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        if (unit_has_host_root_cgroup(u))
+                return -ENODATA; /* TODO: return useful data for the top-level cgroup */
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0) /* TODO: support cgroupv1 */
+                return -ENODATA;
+
+        if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
+                return -ENODATA;
+
+        r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
+        if (r < 0)
+                return r;
+
+        f = fopen(path, "re");
+        if (!f)
+                return -errno;
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *cursor;
+
+                r = read_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                /* Each line starts with the device major/minor: skip it and the whitespace that follows */
+                cursor = line + strcspn(line, WHITESPACE);
+                cursor += strspn(cursor, WHITESPACE);
+
+                for (;;) {
+                        _cleanup_free_ char *word = NULL;
+
+                        r = extract_first_word(&cursor, &word, NULL, EXTRACT_RETAIN_ESCAPE);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
+                                const char *number = startswith(word, field_names[i]);
+                                uint64_t w;
+
+                                if (!number)
+                                        continue;
+
+                                r = safe_atou64(number, &w);
+                                if (r < 0)
+                                        return r;
+                                /* Sum up the stats of all devices */
+                                totals[i] += w;
+                                break;
+                        }
+                }
+        }
+
+        memcpy(ret, totals, sizeof(totals));
+        return 0;
+}
+
+int unit_get_io_accounting(
+                Unit *u,
+                CGroupIOAccountingMetric metric,
+                bool allow_cache,
+                uint64_t *ret) {
+
+        uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+        int r;
+
+        /* Retrieve one IO accounting counter, relative to the base sampled by unit_reset_io_accounting().
+         * Returns 0 (storing the value in *ret, if non-NULL), or a negative errno-style value such as -ENODATA. */
+        assert(u);
+        assert(metric >= 0); /* 'metric' indexes u->io_accounting_last[] below, so it must be in range */
+        assert(metric < _CGROUP_IO_ACCOUNTING_METRIC_MAX);
+
+        if (!UNIT_CGROUP_BOOL(u, io_accounting))
+                return -ENODATA;
+
+        /* UINT64_MAX marks a cache slot as empty, see unit_reset_io_accounting_last() */
+        if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
+                goto done;
+
+        r = unit_get_io_accounting_raw(u, raw);
+        if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
+                goto done; /* no fresh data available: fall back to the cached value */
+        if (r < 0)
+                return r;
+
+        for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) /* saturated subtraction */
+                u->io_accounting_last[i] = raw[i] > u->io_accounting_base[i] ? raw[i] - u->io_accounting_base[i] : 0;
+
+done:
+        if (ret)
+                *ret = u->io_accounting_last[metric];
+
+        return 0;
+}
+
+int unit_reset_cpu_accounting(Unit *u) {
+        int status;
+
+        assert(u);
+
+        /* Drop the cached reading, then re-sample the raw counter as the new base value. */
+        u->cpu_usage_last = NSEC_INFINITY;
+        status = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
+        if (status >= 0)
+                return 0;
+
+        /* Sampling failed: start from a zero base and report the error to the caller. */
+        u->cpu_usage_base = 0;
+        return status;
+}
+
+void unit_reset_memory_accounting_last(Unit *u) {
+        assert(u);
+        /* Set every entry of the memory accounting cache to UINT64_MAX. */
+        for (size_t k = 0; k < ELEMENTSOF(u->memory_accounting_last); k++)
+                u->memory_accounting_last[k] = UINT64_MAX;
+}
+
+int unit_reset_ip_accounting(Unit *u) {
+        int r = 0;
+
+        assert(u);
+
+        /* Reset the ingress and egress BPF accounting maps; a direction is skipped when its map fd is negative. */
+        if (u->ip_accounting_ingress_map_fd >= 0)
+                RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd));
+        if (u->ip_accounting_egress_map_fd >= 0)
+                RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd));
+
+        zero(u->ip_accounting_extra); /* cleared on every call, independent of the map fds above */
+
+        return r; /* what RET_GATHER() collected from the resets; stays 0 if neither map was reset */
+}
+
+void unit_reset_io_accounting_last(Unit *u) {
+        assert(u);
+        /* UINT64_MAX is what unit_get_io_accounting() treats as "no cached value". */
+        for (size_t k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++)
+                u->io_accounting_last[k] = UINT64_MAX;
+}
+
+int unit_reset_io_accounting(Unit *u) {
+        int status;
+
+        assert(u);
+
+        /* Invalidate the cached counters, then re-sample the raw counters as the new base values. */
+        unit_reset_io_accounting_last(u);
+        status = unit_get_io_accounting_raw(u, u->io_accounting_base);
+        if (status >= 0)
+                return 0;
+
+        /* Sampling failed: fall back to an all-zero base and pass the error on. */
+        zero(u->io_accounting_base);
+        return status;
+}
+
+int unit_reset_accounting(Unit *u) {
+        int r = 0;
+
+        assert(u);
+        /* Reset CPU, IO and IP accounting and the memory accounting cache; errors are collected in r via RET_GATHER(). */
+        RET_GATHER(r, unit_reset_cpu_accounting(u));
+        RET_GATHER(r, unit_reset_io_accounting(u));
+        RET_GATHER(r, unit_reset_ip_accounting(u));
+        unit_reset_memory_accounting_last(u); /* returns void, hence not gathered */
+
+        return r;
+}
+
+void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
+        const CGroupMask io_pair = CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
+        const CGroupMask cpu_pair = CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
+
+        assert(u);
+
+        /* Nothing to do for units without a cgroup context, or when no controller was named. */
+        if (!UNIT_HAS_CGROUP_CONTEXT(u) || m == 0)
+                return;
+
+        /* The compat pairs are always invalidated together: touching one half pulls in the other. */
+        if ((m & io_pair) != 0)
+                m |= io_pair;
+        if ((m & cpu_pair) != 0)
+                m |= cpu_pair;
+
+        /* All requested bits already pending? Then the unit is not queued again. */
+        if (FLAGS_SET(u->cgroup_invalidated_mask, m))
+                return;
+        u->cgroup_invalidated_mask |= m;
+        unit_add_to_cgroup_realize_queue(u);
+}
+
+void unit_invalidate_cgroup_bpf(Unit *u) {
+        Unit *child;
+
+        assert(u);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return;
+        /* The BPF firewall bit is already pending: nothing to do. */
+        if ((u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) != 0)
+                return;
+
+        u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+        unit_add_to_cgroup_realize_queue(u);
+
+        if (u->type != UNIT_SLICE)
+                return;
+
+        /* The IP access list of a slice's children includes the slice's own, so each child needs a new BPF program too. */
+        UNIT_FOREACH_DEPENDENCY(child, u, UNIT_ATOM_SLICE_OF)
+                unit_invalidate_cgroup_bpf(child);
+}
+
+void unit_cgroup_catchup(Unit *u) {
+        assert(u);
+
+        /* The inotify watch is dropped across reexec/reload, so cgroup state may have changed
+         * unnoticed in the meantime. Serializing and later comparing mtimes is not an option,
+         * since the kernel (currently) does not update modification times of cgroup files.
+         * Hence re-check the events and add the unit to the cgroup OOM queue explicitly. */
+        if (UNIT_HAS_CGROUP_CONTEXT(u)) {
+                /* Best effort: the result of the event check is deliberately ignored. */
+                (void) unit_check_cgroup_events(u);
+
+                unit_add_to_cgroup_oom_queue(u);
+        }
+}
+
+bool unit_cgroup_delegate(Unit *u) {
+        const CGroupContext *ctx = NULL;
+
+        assert(u);
+
+        /* Delegation needs both: a unit type that supports it and a cgroup context that enables it. */
+        if (UNIT_VTABLE(u)->can_delegate)
+                ctx = unit_get_cgroup_context(u);
+
+        if (ctx)
+                return ctx->delegate;
+
+        return false;
+}
+
+void manager_invalidate_startup_units(Manager *m) {
+        const CGroupMask startup_mask = CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET;
+        Unit *unit;
+
+        assert(m);
+        SET_FOREACH(unit, m->startup_units) /* invalidate the controllers in startup_mask on every startup unit */
+                unit_invalidate_cgroup(unit, startup_mask);
+}
+
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
+        _cleanup_free_ char *path = NULL;
+        FreezerState target, kernel = _FREEZER_STATE_INVALID;
+        int r, ret;
+        /* Freeze or thaw the unit's cgroup by writing "cgroup.freeze". Returns 1 if a state change was started, 0 if the kernel already reported the target state or the request was ignored, negative on error. */
+        assert(u);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+        if (!cg_freezer_supported())
+                return 0; /* no freezer support: do nothing and report success */
+
+        /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
+                return action == FREEZER_FREEZE ? -EPERM : 0;
+
+        if (!u->cgroup_realized)
+                return -EBUSY; /* no realized cgroup to operate on */
+
+        if (action == FREEZER_THAW) { /* thaw the containing slice first (recursively), before this unit */
+                Unit *slice = UNIT_GET_SLICE(u);
+
+                if (slice) {
+                        r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
+                        if (r < 0)
+                                return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
+                }
+        }
+
+        target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
+
+        r = unit_freezer_state_kernel(u, &kernel);
+        if (r < 0) /* not fatal: only logged at debug level, execution continues */
+                log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
+
+        if (target == kernel) {
+                u->freezer_state = target;
+                if (action == FREEZER_FREEZE)
+                        return 0; /* already frozen: nothing is written */
+                ret = 0; /* already running, but "cgroup.freeze" is still written below */
+        } else
+                ret = 1; /* a state transition is about to be requested */
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
+        if (r < 0)
+                return r;
+
+        log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
+
+        if (target != kernel) { /* record the transitional state before issuing the request */
+                if (action == FREEZER_FREEZE)
+                        u->freezer_state = FREEZER_FREEZING;
+                else
+                        u->freezer_state = FREEZER_THAWING;
+        }
+
+        r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER); /* one_zero() presumably yields "1" to freeze, "0" to thaw */
+        if (r < 0)
+                return r;
+
+        return ret;
+}
+
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
+        _cleanup_free_ char *value = NULL;
+        int q;
+
+        assert(u);
+        assert(cpus);
+
+        /* Without a cgroup path, or without the cpuset controller in the realized mask, there is no data. */
+        if (!u->cgroup_path ||
+            !(u->cgroup_realized_mask & CGROUP_MASK_CPUSET))
+                return -ENODATA;
+
+        /* A zero result of cg_all_unified() means "no data"; errors of the check itself are passed through. */
+        q = cg_all_unified();
+        if (q <= 0)
+                return q < 0 ? q : -ENODATA;
+
+        /* Read the attribute named by the caller; -ENOENT is reported as "no data". */
+        q = cg_get_attribute("cpuset", u->cgroup_path, name, &value);
+        if (q == -ENOENT)
+                return -ENODATA;
+        if (q < 0)
+                return q;
+
+        /* Parse the attribute's contents into the caller-provided set. */
+        return parse_cpu_set_full(value, cpus, false, NULL, NULL, 0, NULL);
+}
+
+static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { /* string name per CGroupDevicePolicy value */
+        [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
+        [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
+        [CGROUP_DEVICE_POLICY_STRICT] = "strict",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); /* presumably defines cgroup_device_policy_to_string()/_from_string() declared in cgroup.h */
+
+static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { /* string name per FreezerAction value */
+        [FREEZER_FREEZE] = "freeze",
+        [FREEZER_THAW] = "thaw",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); /* presumably defines freezer_action_to_string()/_from_string() */
+
+static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = { /* string name per CGroupPressureWatch value */
+        [CGROUP_PRESSURE_WATCH_OFF] = "off",
+        [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
+        [CGROUP_PRESSURE_WATCH_ON] = "on",
+        [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON); /* NOTE(review): presumably also maps boolean "true" strings to _ON - confirm against the macro */
+
+static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { /* designated initializers: listing order need not match enum order */
+        [CGROUP_IP_INGRESS_BYTES]   = "IPIngressBytes",
+        [CGROUP_IP_EGRESS_BYTES]    = "IPEgressBytes",
+        [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
+        [CGROUP_IP_EGRESS_PACKETS]  = "IPEgressPackets",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric); /* presumably defines cgroup_ip_accounting_metric_to_string()/_from_string() */
+
+static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { /* string name per CGroupIOAccountingMetric value */
+        [CGROUP_IO_READ_BYTES]       = "IOReadBytes",
+        [CGROUP_IO_WRITE_BYTES]      = "IOWriteBytes",
+        [CGROUP_IO_READ_OPERATIONS]  = "IOReadOperations",
+        [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric); /* presumably defines cgroup_io_accounting_metric_to_string()/_from_string() */
+
+static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = { /* designated initializers: listing order need not match enum order */
+        [CGROUP_MEMORY_PEAK]          = "MemoryPeak",
+        [CGROUP_MEMORY_SWAP_CURRENT]  = "MemorySwapCurrent",
+        [CGROUP_MEMORY_SWAP_PEAK]     = "MemorySwapPeak",
+        [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric); /* presumably defines cgroup_memory_accounting_metric_to_string()/_from_string() */
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
new file mode 100644
index 0000000..f1b674b
--- /dev/null
+++ b/src/core/cgroup.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "cpu-set-util.h"
+#include "firewall-util.h"
+#include "list.h"
+#include "pidref.h"
+#include "time-util.h"
+
+typedef struct CGroupTasksMax {
+        /* If scale == 0, just use value; otherwise, value / scale.
+         * See cgroup_tasks_max_resolve(). */
+        uint64_t value; /* UINT64_MAX together with scale == 0 means "unset", see CGROUP_TASKS_MAX_UNSET */
+        uint64_t scale;
+} CGroupTasksMax;
+
+#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })
+
+static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) {
+        return !(tasks_max->value == UINT64_MAX && tasks_max->scale == 0); /* i.e. differs from CGROUP_TASKS_MAX_UNSET */
+}
+
+uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max);
+
+typedef struct CGroupContext CGroupContext;
+typedef struct CGroupDeviceAllow CGroupDeviceAllow;
+typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
+typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
+typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
+typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
+typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
+typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
+typedef struct CGroupSocketBindItem CGroupSocketBindItem;
+
+typedef enum CGroupDevicePolicy {
+        /* If devices are listed, allow those plus the built-in ones; if none are listed, allow
+         * everything. */
+        CGROUP_DEVICE_POLICY_AUTO,
+
+        /* Everything forbidden, except built-in ones and listed ones. */
+        CGROUP_DEVICE_POLICY_CLOSED,
+
+        /* Everything forbidden, except for the listed devices */
+        CGROUP_DEVICE_POLICY_STRICT,
+
+        _CGROUP_DEVICE_POLICY_MAX,               /* count of the policies above; not a policy itself */
+        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL, /* negative marker value, distinct from every policy */
+} CGroupDevicePolicy;
+
+typedef enum FreezerAction {
+        FREEZER_FREEZE, /* the two requests accepted by unit_cgroup_freezer_action() */
+        FREEZER_THAW,
+
+        _FREEZER_ACTION_MAX,               /* count of the actions above; not an action itself */
+        _FREEZER_ACTION_INVALID = -EINVAL, /* negative marker value, distinct from every action */
+} FreezerAction;
+
+typedef enum CGroupDevicePermissions {
+        /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
+        CGROUP_DEVICE_MKNOD                = 1 << 0,
+        CGROUP_DEVICE_READ                 = 1 << 1,
+        CGROUP_DEVICE_WRITE                = 1 << 2,
+        _CGROUP_DEVICE_PERMISSIONS_MAX     = 1 << 3, /* first bit past the three permission bits */
+        _CGROUP_DEVICE_PERMISSIONS_ALL     = _CGROUP_DEVICE_PERMISSIONS_MAX - 1, /* all three permission bits set (0x7) */
+        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
+} CGroupDevicePermissions;
+
+struct CGroupDeviceAllow {
+        LIST_FIELDS(CGroupDeviceAllow, device_allow); /* linkage for the CGroupContext.device_allow list */
+        char *path;                                   /* the device this entry is about */
+        CGroupDevicePermissions permissions;          /* CGROUP_DEVICE_* bits attached to that device */
+};
+
+struct CGroupIODeviceWeight {
+        LIST_FIELDS(CGroupIODeviceWeight, device_weights); /* linkage for the CGroupContext.io_device_weights list */
+        char *path;                                        /* the device this entry is about */
+        uint64_t weight;                                   /* IO weight for that device */
+};
+
+struct CGroupIODeviceLimit {
+        LIST_FIELDS(CGroupIODeviceLimit, device_limits); /* linkage for the CGroupContext.io_device_limits list */
+        char *path;                                      /* the device this entry is about */
+        uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];      /* one limit value per IO limit type */
+};
+
+struct CGroupIODeviceLatency {
+        LIST_FIELDS(CGroupIODeviceLatency, device_latencies); /* linkage for the CGroupContext.io_device_latencies list */
+        char *path;                                           /* the device this entry is about */
+        usec_t target_usec;                                   /* latency target for that device, as a usec_t */
+};
+
+struct CGroupBlockIODeviceWeight {
+        LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights); /* linkage for the CGroupContext.blockio_device_weights list (legacy hierarchies) */
+        char *path;                                             /* the device this entry is about */
+        uint64_t weight;                                        /* block IO weight for that device */
+};
+
+struct CGroupBlockIODeviceBandwidth {
+        LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths); /* linkage for the CGroupContext.blockio_device_bandwidths list (legacy hierarchies) */
+        char *path;                                                   /* the device this entry is about */
+        uint64_t rbps; /* NOTE(review): presumably the read bandwidth in bytes per second - confirm */
+        uint64_t wbps; /* NOTE(review): presumably the write bandwidth in bytes per second - confirm */
+};
+
+struct CGroupBPFForeignProgram {
+        LIST_FIELDS(CGroupBPFForeignProgram, programs); /* linkage for the CGroupContext.bpf_foreign_programs list */
+        uint32_t attach_type;                           /* set via cgroup_context_add_bpf_foreign_program() */
+        char *bpffs_path;                               /* NOTE(review): presumably the program's path in bpffs - confirm */
+};
+
+struct CGroupSocketBindItem {
+        LIST_FIELDS(CGroupSocketBindItem, socket_bind_items); /* linkage for CGroupContext.socket_bind_allow / socket_bind_deny */
+        int address_family;
+        int ip_protocol;
+        uint16_t nr_ports; /* NOTE(review): presumably the number of ports covered, starting at port_min - confirm */
+        uint16_t port_min;
+};
+
+typedef enum CGroupPressureWatch {
+        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
+        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory accounting is on anyway for the unit, otherwise off */
+        CGROUP_PRESSURE_WATCH_ON,       /* → always on, see cgroup_context_want_memory_pressure() */
+        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
+        _CGROUP_PRESSURE_WATCH_MAX,
+        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
+} CGroupPressureWatch;
+
+struct CGroupContext { /* cgroup-related settings of a unit, as returned by unit_get_cgroup_context() */
+        bool cpu_accounting;
+        bool io_accounting;     /* checked by unit_get_io_accounting() via UNIT_CGROUP_BOOL() */
+        bool blockio_accounting;
+        bool memory_accounting; /* also consulted by cgroup_context_want_memory_pressure() */
+        bool tasks_accounting;
+        bool ip_accounting;
+
+        /* Configures the memory.oom.group attribute (on unified) */
+        bool memory_oom_group;
+
+        bool delegate;          /* reported by unit_cgroup_delegate() if the unit type can delegate */
+        CGroupMask delegate_controllers;
+        CGroupMask disable_controllers;
+        char *delegate_subgroup;
+
+        /* For unified hierarchy */
+        uint64_t cpu_weight;
+        uint64_t startup_cpu_weight;
+        usec_t cpu_quota_per_sec_usec;
+        usec_t cpu_quota_period_usec;
+
+        CPUSet cpuset_cpus;
+        CPUSet startup_cpuset_cpus;
+        CPUSet cpuset_mems;
+        CPUSet startup_cpuset_mems;
+
+        uint64_t io_weight;
+        uint64_t startup_io_weight;
+        LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
+        LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
+        LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
+
+        uint64_t default_memory_min;
+        uint64_t default_memory_low;
+        uint64_t default_startup_memory_low;
+        uint64_t memory_min;
+        uint64_t memory_low;
+        uint64_t startup_memory_low;
+        uint64_t memory_high;
+        uint64_t startup_memory_high;
+        uint64_t memory_max;
+        uint64_t startup_memory_max;
+        uint64_t memory_swap_max;
+        uint64_t startup_memory_swap_max;
+        uint64_t memory_zswap_max;
+        uint64_t startup_memory_zswap_max;
+        /* NOTE(review): the *_set bits below presumably record whether the matching value above was explicitly configured - confirm */
+        bool default_memory_min_set:1;
+        bool default_memory_low_set:1;
+        bool default_startup_memory_low_set:1;
+        bool memory_min_set:1;
+        bool memory_low_set:1;
+        bool startup_memory_low_set:1;
+        bool startup_memory_high_set:1;
+        bool startup_memory_max_set:1;
+        bool startup_memory_swap_max_set:1;
+        bool startup_memory_zswap_max_set:1;
+
+        Set *ip_address_allow;
+        Set *ip_address_deny;
+        /* These two flags indicate that redundant entries have been removed from
+         * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
+        bool ip_address_allow_reduced;
+        bool ip_address_deny_reduced;
+
+        char **ip_filters_ingress;
+        char **ip_filters_egress;
+        LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);
+
+        Set *restrict_network_interfaces;
+        bool restrict_network_interfaces_is_allow_list;
+
+        /* For legacy hierarchies */
+        uint64_t cpu_shares;
+        uint64_t startup_cpu_shares;
+
+        uint64_t blockio_weight;
+        uint64_t startup_blockio_weight;
+        LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
+        LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
+
+        uint64_t memory_limit;
+
+        CGroupDevicePolicy device_policy;
+        LIST_HEAD(CGroupDeviceAllow, device_allow);
+
+        LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
+        LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);
+
+        /* Common */
+        CGroupTasksMax tasks_max;
+
+        /* Settings for systemd-oomd */
+        ManagedOOMMode moom_swap;
+        ManagedOOMMode moom_mem_pressure;
+        uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
+        ManagedOOMPreference moom_preference;
+
+        /* Memory pressure logic */
+        CGroupPressureWatch memory_pressure_watch;
+        usec_t memory_pressure_threshold_usec;
+        /* NB: For now we don't make the period configurable, nor the type, nor do we allow multiple
+         * triggers, nor triggers for non-memory pressure. We might add that later. */
+
+        NFTSetContext nft_set_context;
+
+        /* Forward coredumps for processes that crash within this cgroup.
+         * Requires 'delegate' to also be true. */
+        bool coredump_receive;
+};
+
+/* Used when querying IP accounting data, e.g. as the 'metric' argument of unit_get_ip_accounting() */
+typedef enum CGroupIPAccountingMetric {
+        CGROUP_IP_INGRESS_BYTES,
+        CGROUP_IP_INGRESS_PACKETS,
+        CGROUP_IP_EGRESS_BYTES,
+        CGROUP_IP_EGRESS_PACKETS,
+        _CGROUP_IP_ACCOUNTING_METRIC_MAX,               /* count of the metrics above; not a metric itself */
+        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL, /* negative marker value, distinct from every metric */
+} CGroupIPAccountingMetric;
+
+/* Used when querying IO accounting data, e.g. as the 'metric' argument of unit_get_io_accounting() */
+typedef enum CGroupIOAccountingMetric {
+        CGROUP_IO_READ_BYTES,
+        CGROUP_IO_WRITE_BYTES,
+        CGROUP_IO_READ_OPERATIONS,
+        CGROUP_IO_WRITE_OPERATIONS,
+        _CGROUP_IO_ACCOUNTING_METRIC_MAX,               /* count of the metrics above; also the length of per-metric arrays */
+        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL, /* negative marker value; must never be used as an array index */
+} CGroupIOAccountingMetric;
+
+typedef enum CGroupMemoryAccountingMetric { /* used as the 'metric' argument of unit_get_memory_accounting() */
+        CGROUP_MEMORY_PEAK,
+        CGROUP_MEMORY_SWAP_PEAK,
+        /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g.
+         * when systemd-run exits. */
+        _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK, /* alias of the last cached metric, not a new value */
+
+        /* These attributes are transient, so no need for caching. */
+        CGROUP_MEMORY_SWAP_CURRENT,
+        CGROUP_MEMORY_ZSWAP_CURRENT,
+
+        _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX,               /* count of all metrics above, cached and transient */
+        _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL, /* negative marker value, distinct from every metric */
+} CGroupMemoryAccountingMetric;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+typedef enum ManagerState ManagerState;
+
+uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
+
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
+
+void cgroup_context_init(CGroupContext *c);
+void cgroup_context_done(CGroupContext *c);
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
+
+static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
+        assert(c);
+        if (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO) /* "auto": follow the memory accounting setting */
+                return c->memory_accounting;
+        return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON; /* otherwise only an explicit "on" counts */
+}
+
+int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
+int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
+int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
+
+void unit_modify_nft_set(Unit *u, bool add);
+
+CGroupMask unit_get_own_mask(Unit *u);
+CGroupMask unit_get_delegate_mask(Unit *u);
+CGroupMask unit_get_members_mask(Unit *u);
+CGroupMask unit_get_siblings_mask(Unit *u);
+CGroupMask unit_get_ancestor_disable_mask(Unit *u);
+
+CGroupMask unit_get_target_mask(Unit *u);
+CGroupMask unit_get_enable_mask(Unit *u);
+
+void unit_invalidate_cgroup_members_masks(Unit *u);
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u);
+
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
+int unit_default_cgroup_path(const Unit *u, char **ret);
+int unit_set_cgroup_path(Unit *u, const char *path);
+int unit_pick_cgroup_path(Unit *u);
+
+int unit_realize_cgroup(Unit *u);
+void unit_prune_cgroup(Unit *u);
+int unit_watch_cgroup(Unit *u);
+int unit_watch_cgroup_memory(Unit *u);
+void unit_add_to_cgroup_realize_queue(Unit *u);
+
+void unit_release_cgroup(Unit *u);
+/* Releases the cgroup only if it is recursively empty.
+ * Returns true if the cgroup was released, false otherwise. */
+bool unit_maybe_release_cgroup(Unit *u);
+
+void unit_add_to_cgroup_empty_queue(Unit *u);
+int unit_check_oomd_kill(Unit *u);
+int unit_check_oom(Unit *u);
+
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
+
+int manager_setup_cgroup(Manager *m);
+void manager_shutdown_cgroup(Manager *m, bool delete);
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
+
+Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
+Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
+Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
+Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
+
+uint64_t unit_get_ancestor_memory_min(Unit *u);
+uint64_t unit_get_ancestor_memory_low(Unit *u);
+uint64_t unit_get_ancestor_startup_memory_low(Unit *u);
+
+int unit_search_main_pid(Unit *u, PidRef *ret);
+int unit_watch_all_pids(Unit *u);
+
+int unit_synthesize_cgroup_empty_event(Unit *u);
+
+int unit_get_memory_available(Unit *u, uint64_t *ret);
+int unit_get_memory_current(Unit *u, uint64_t *ret);
+int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret);
+int unit_get_tasks_current(Unit *u, uint64_t *ret);
+int unit_get_cpu_usage(Unit *u, nsec_t *ret);
+int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
+int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
+
+int unit_reset_cpu_accounting(Unit *u);
+void unit_reset_memory_accounting_last(Unit *u);
+int unit_reset_ip_accounting(Unit *u);
+void unit_reset_io_accounting_last(Unit *u);
+int unit_reset_io_accounting(Unit *u);
+int unit_reset_accounting(Unit *u);
+
+#define UNIT_CGROUP_BOOL(u, name) /* field 'name' of u's cgroup context; false if u has none */ \
+        ({                                              \
+        CGroupContext *cc = unit_get_cgroup_context(u); \
+        cc ? cc->name : false;                          \
+        })
+
+bool manager_owns_host_root_cgroup(Manager *m);
+bool unit_has_host_root_cgroup(Unit *u);
+
+bool unit_has_startup_cgroup_constraints(Unit *u);
+
+int manager_notify_cgroup_empty(Manager *m, const char *group);
+
+void unit_invalidate_cgroup(Unit *u, CGroupMask m);
+void unit_invalidate_cgroup_bpf(Unit *u);
+
+void manager_invalidate_startup_units(Manager *m);
+
+const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
+CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
+
+void unit_cgroup_catchup(Unit *u);
+
+bool unit_cgroup_delegate(Unit *u);
+
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
+
+const char* freezer_action_to_string(FreezerAction a) _const_;
+FreezerAction freezer_action_from_string(const char *s) _pure_;
+
+const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
+CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;
+
+const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_;
+CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_;
+
+const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_;
+CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_;
+
+const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
+CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;
+
+const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_;
+CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_;
diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c
new file mode 100644
index 0000000..cd91381
--- /dev/null
+++ b/src/core/core-varlink.c
@@ -0,0 +1,652 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "core-varlink.h"
+#include "mkdir-label.h"
+#include "strv.h"
+#include "user-util.h"
+#include "varlink.h"
+#include "varlink-io.systemd.UserDatabase.h"
+#include "varlink-io.systemd.ManagedOOM.h"
+
+typedef struct LookupParameters {
+        const char *user_name; /* if non-NULL, user_match_lookup_parameters() requires this name */
+        const char *group_name;
+        union { /* uid and gid share the same storage */
+                uid_t uid; /* if valid, user_match_lookup_parameters() requires this UID */
+                gid_t gid;
+        };
+        const char *service;
+} LookupParameters;
+
+static const char* const managed_oom_mode_properties[] = { /* property names passed to build_managed_oom_json_array_element() */
+        "ManagedOOMSwap",
+        "ManagedOOMMemoryPressure",
+};
+
+static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) {
+        assert(user_name);
+        assert(uid_is_valid(uid));
+        assert(ret);
+        /* Build a JSON object with a single "record" member describing the user; returns json_build()'s result. */
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                   JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)),
+                                       JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)),
+                                       JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(uid)), /* the GID is reported with the same number as the UID */
+                                       JSON_BUILD_PAIR("realName", JSON_BUILD_CONST_STRING("Dynamic User")),
+                                       JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")),
+                                       JSON_BUILD_PAIR("shell", JSON_BUILD_CONST_STRING(NOLOGIN)),
+                                       JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)),
+                                       JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
+                                       JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
+}
+
+static bool user_match_lookup_parameters(LookupParameters *p, const char *name, uid_t uid) {
+        assert(p);
+
+        /* Both criteria are optional: a NULL user name or an invalid UID in 'p' matches anything. A record
+         * matches only if every criterion that is present agrees with it. */
+        bool name_ok = !p->user_name || streq(name, p->user_name);
+        if (!name_ok)
+                return false;
+
+        return !uid_is_valid(p->uid) || uid == p->uid;
+}
+
+static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) {
+        ManagedOOMMode effective;
+        bool with_limit = false;
+        CGroupContext *ctx;
+
+        assert(u);
+        assert(property);
+        assert(ret_v);
+
+        if (!UNIT_VTABLE(u)->can_set_managed_oom)
+                return -EOPNOTSUPP;
+
+        ctx = unit_get_cgroup_context(u);
+        if (!ctx)
+                return -EINVAL;
+
+        /* Inactive units should not have a valid cgroup, hence systemd-oomd must always see them as if no
+         * action were enabled. This check comes first, i.e. before the property name is even looked at. */
+        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+                effective = MANAGED_OOM_AUTO;
+        else if (streq(property, "ManagedOOMSwap"))
+                effective = ctx->moom_swap;
+        else if (streq(property, "ManagedOOMMemoryPressure")) {
+                effective = ctx->moom_mem_pressure;
+                with_limit = true; /* only the memory pressure property carries a limit */
+        } else
+                return -EINVAL;
+
+        return json_build(ret_v, JSON_BUILD_OBJECT(
+                                 JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(managed_oom_mode_to_string(effective))),
+                                 JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)),
+                                 JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)),
+                                 JSON_BUILD_PAIR_CONDITION(with_limit, "limit", JSON_BUILD_UNSIGNED(ctx->moom_mem_pressure_limit))));
+}
+
+int manager_varlink_send_managed_oom_update(Unit *u) {
+        _cleanup_(json_variant_unrefp) JsonVariant *cgroups = NULL, *message = NULL;
+        Manager *m;
+        int r;
+
+        assert(u);
+
+        /* Reports the current state of all ManagedOOM*= properties of the unit to systemd-oomd. Returns 0
+         * without sending anything if the unit type has no ManagedOOM support, the unit has no manager,
+         * cgroup path or cgroup context, or there is no connection to oomd to report on. */
+
+        m = u->manager;
+        if (!UNIT_VTABLE(u)->can_set_managed_oom || !m || !u->cgroup_path)
+                return 0;
+
+        if (!MANAGER_IS_SYSTEM(m)) {
+                /* If we are in user mode, let's connect to oomd if we aren't connected yet. In this mode we
+                 * must initiate communication to oomd, not the other way round. */
+                r = manager_varlink_init(m);
+                if (r <= 0)
+                        return r;
+        } else if (!m->managed_oom_varlink)
+                /* In system mode we can't send any notifications unless oomd connected back to us. In this
+                 * mode oomd must initiate communication, not us. */
+                return 0;
+
+        if (!unit_get_cgroup_context(u))
+                return 0;
+
+        r = json_build(&cgroups, JSON_BUILD_EMPTY_ARRAY);
+        if (r < 0)
+                return r;
+
+        for (size_t idx = 0; idx < ELEMENTSOF(managed_oom_mode_properties); idx++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *elem = NULL;
+
+                r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[idx], &elem);
+                if (r < 0)
+                        return r;
+
+                r = json_variant_append_array(&cgroups, elem);
+                if (r < 0)
+                        return r;
+        }
+
+        r = json_build(&message, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(cgroups))));
+        if (r < 0)
+                return r;
+
+        /* In system mode, oomd is our client, thus send out notifications as replies to the initiating
+         * method call from them. */
+        if (MANAGER_IS_SYSTEM(m))
+                return varlink_notify(m->managed_oom_varlink, message);
+
+        /* In user mode, we are oomd's client, thus send out notifications as method calls that do not
+         * expect a reply. */
+        return varlink_send(m->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", message);
+}
+
+static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *reply = NULL, *cgroups = NULL;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        /* Returns { "cgroups": [...] } in *ret, covering the MANAGED_OOM_KILL properties of all active units. */
+        r = json_build(&cgroups, JSON_BUILD_EMPTY_ARRAY);
+        if (r < 0)
+                return r;
+
+        for (UnitType type = 0; type < _UNIT_TYPE_MAX; type++) {
+                if (!unit_vtable[type]->can_set_managed_oom)
+                        continue;
+
+                LIST_FOREACH(units_by_type, u, m->units_by_type[type]) {
+                        CGroupContext *cc;
+
+                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+                                continue;
+
+                        cc = unit_get_cgroup_context(u);
+                        if (!cc)
+                                continue;
+
+                        for (size_t k = 0; k < ELEMENTSOF(managed_oom_mode_properties); k++) {
+                                _cleanup_(json_variant_unrefp) JsonVariant *elem = NULL;
+                                const char *prop = managed_oom_mode_properties[k];
+
+                                /* The initial report only covers properties that are set to MANAGED_OOM_KILL. */
+                                if (!((streq(prop, "ManagedOOMSwap") && cc->moom_swap == MANAGED_OOM_KILL) ||
+                                      (streq(prop, "ManagedOOMMemoryPressure") && cc->moom_mem_pressure == MANAGED_OOM_KILL)))
+                                        continue;
+
+                                r = build_managed_oom_json_array_element(u, prop, &elem);
+                                if (r < 0)
+                                        return r;
+
+                                r = json_variant_append_array(&cgroups, elem);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        }
+
+        r = json_build(&reply, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(cgroups))));
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(reply);
+        return 0;
+}
+
+static int vl_method_subscribe_managed_oom_cgroups(
+                Varlink *link,
+                JsonVariant *parameters,
+                VarlinkMethodFlags flags,
+                void *userdata) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *cgroups = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        bool subscribe;
+        Unit *peer_unit;
+        pid_t peer_pid;
+        int r;
+
+        assert(link);
+
+        r = varlink_get_peer_pid(link, &peer_pid);
+        if (r < 0)
+                return r;
+
+        /* This is meant to be a deterrent and not actual security. The alternative is to check for the systemd-oom
+         * user that this unit runs as, but NSS lookups are blocking and not allowed from PID 1. */
+        peer_unit = manager_get_unit_by_pid(m, peer_pid);
+        if (!peer_unit || !streq(peer_unit->id, "systemd-oomd.service"))
+                return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+        if (json_variant_elements(parameters) > 0)
+                return varlink_error_invalid_parameter(link, parameters);
+
+        /* We only take one subscriber for this method so return an error if there's already an existing one.
+         * This shouldn't happen since systemd-oomd is the only client of this method. */
+        subscribe = FLAGS_SET(flags, VARLINK_METHOD_MORE);
+        if (subscribe && m->managed_oom_varlink)
+                return varlink_error(link, "io.systemd.ManagedOOM.SubscriptionTaken", NULL);
+
+        r = build_managed_oom_cgroups_json(m, &cgroups);
+        if (r < 0)
+                return r;
+
+        /* Without VARLINK_METHOD_MORE we reply exactly once and do not keep a reference to the link. */
+        if (!subscribe)
+                return varlink_reply(link, cgroups);
+
+        assert(!m->managed_oom_varlink);
+        m->managed_oom_varlink = varlink_ref(link);
+        return varlink_notify(m->managed_oom_varlink, cgroups);
+}
+
+static int manager_varlink_send_managed_oom_initial(Manager *m) {
+        _cleanup_(json_variant_unrefp) JsonVariant *cgroups = NULL;
+        int r;
+
+        assert(m);
+
+        /* Sends the complete list built by build_managed_oom_cgroups_json() to systemd-oomd. This is only
+         * done by the user manager; for the system manager this is a no-op. */
+        if (MANAGER_IS_SYSTEM(m))
+                return 0;
+
+        assert(m->managed_oom_varlink);
+
+        r = build_managed_oom_cgroups_json(m, &cgroups);
+        return r < 0 ? r :
+                varlink_send(m->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", cgroups);
+}
+
+static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        /* Handler for GetUserRecord: resolves dynamic users by UID and/or name, or lists all of them. */
+        static const JsonDispatch dispatch_table[] = {
+                { "uid",      JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, uid),       0         },
+                { "userName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE },
+                { "service",  JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),   0         },
+                {}
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .uid = UID_INVALID,
+        };
+        _cleanup_free_ char *found_name = NULL;
+        uid_t found_uid = UID_INVALID, uid;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *un;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+        /* Prefer lookup by UID, fall back to lookup by name; if neither is given, enumerate all dynamic users. */
+        if (uid_is_valid(p.uid))
+                r = dynamic_user_lookup_uid(m, p.uid, &found_name);
+        else if (p.user_name)
+                r = dynamic_user_lookup_name(m, p.user_name, &found_uid);
+        else {
+                DynamicUser *d;
+
+                HASHMAP_FOREACH(d, m->dynamic_users) {
+                        r = dynamic_user_current(d, &uid);
+                        if (r == -EAGAIN) /* not realized yet? */
+                                continue;
+                        if (r < 0)
+                                return r;
+
+                        if (!user_match_lookup_parameters(&p, d->name, uid))
+                                continue;
+                        /* Send the previously built record as an intermediate reply before building the next. */
+                        if (v) {
+                                r = varlink_notify(link, v);
+                                if (r < 0)
+                                        return r;
+
+                                v = json_variant_unref(v);
+                        }
+
+                        r = build_user_json(d->name, uid, &v);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!v)
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                return varlink_reply(link, v);
+        }
+        if (r == -ESRCH)
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+        if (r < 0)
+                return r;
+        /* The lookup filled in only the half that was not part of the query, take the other from the request. */
+        uid = uid_is_valid(found_uid) ? found_uid : p.uid;
+        un = found_name ?: p.user_name;
+        /* If both UID and name were specified in the request, the record found must agree with both. */
+        if (!user_match_lookup_parameters(&p, un, uid))
+                return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        r = build_user_json(un, uid, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_reply(link, v);
+}
+
+static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret) {
+        assert(group_name);
+        assert(gid_is_valid(gid));
+        assert(ret);
+
+        /* Emits { "record": { ... } } describing a dynamic group provided by io.systemd.DynamicUser. */
+        return json_build(ret, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT(
+                JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)),
+                JSON_BUILD_PAIR("description", JSON_BUILD_CONST_STRING("Dynamic Group")),
+                JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)),
+                JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
+                JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
+}
+
+static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) {
+        assert(p);
+
+        /* Each filter is optional: a NULL group name or an invalid GID in the lookup parameters places no
+         * constraint on the record. A filter that is set has to match the record exactly. */
+        bool name_matches = !p->group_name || streq(name, p->group_name);
+        if (!name_matches)
+                return false;
+
+        return !gid_is_valid(p->gid) || gid == p->gid;
+}
+
+static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        /* Handler for GetGroupRecord: resolves dynamic groups by GID and/or name, or lists all of them. */
+        static const JsonDispatch dispatch_table[] = {
+                { "gid",       JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, gid),        0         },
+                { "groupName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+                { "service",   JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),    0         },
+                {}
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .gid = GID_INVALID,
+        };
+        _cleanup_free_ char *found_name = NULL;
+        gid_t found_gid = GID_INVALID, gid;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *gn;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+        /* Groups are resolved through the dynamic user lookup calls, with the GID passed in place of the UID. */
+        if (gid_is_valid(p.gid))
+                r = dynamic_user_lookup_uid(m, (uid_t) p.gid, &found_name);
+        else if (p.group_name)
+                r = dynamic_user_lookup_name(m, p.group_name, (uid_t*) &found_gid);
+        else {
+                DynamicUser *d;
+
+                HASHMAP_FOREACH(d, m->dynamic_users) {
+                        uid_t uid;
+
+                        r = dynamic_user_current(d, &uid);
+                        if (r == -EAGAIN)
+                                continue;
+                        if (r < 0)
+                                return r;
+
+                        if (!group_match_lookup_parameters(&p, d->name, (gid_t) uid))
+                                continue;
+                        /* Send the previously built record as an intermediate reply before building the next. */
+                        if (v) {
+                                r = varlink_notify(link, v);
+                                if (r < 0)
+                                        return r;
+
+                                v = json_variant_unref(v);
+                        }
+
+                        r = build_group_json(d->name, (gid_t) uid, &v);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!v)
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                return varlink_reply(link, v);
+        }
+        if (r == -ESRCH)
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+        if (r < 0)
+                return r;
+        /* The lookup filled in only the half that was not part of the query, take the other from the request. */
+        gid = gid_is_valid(found_gid) ? found_gid : p.gid;
+        gn = found_name ?: p.group_name;
+
+        if (!group_match_lookup_parameters(&p, gn, gid))
+                return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        r = build_group_json(gn, gid, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_reply(link, v);
+}
+
+static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "userName",  JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name),  JSON_SAFE },
+                { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+                { "service",   JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service),    0         },
+                {}
+        };
+
+        LookupParameters lookup = {};
+        const char *error_id;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &lookup);
+        if (r != 0)
+                return r;
+
+        /* We don't support auxiliary groups with dynamic users, hence no request ever yields a record. */
+        error_id = streq_ptr(lookup.service, "io.systemd.DynamicUser") ?
+                "io.systemd.UserDatabase.NoRecordFound" : "io.systemd.UserDatabase.BadService";
+        return varlink_error(link, error_id, NULL);
+}
+
+static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+        assert(link);
+        /* Drop our reference if the connection going away is the one stored as ManagedOOM subscriber. */
+        if (m->managed_oom_varlink == link)
+                m->managed_oom_varlink = varlink_unref(m->managed_oom_varlink);
+}
+
+static int manager_varlink_init_system(Manager *m) {
+        _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+        int r;
+
+        assert(m);
+        /* Returns 1 if the varlink server is set up (possibly from before), 0 if we are not the system manager. */
+        if (m->varlink_server)
+                return 1;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return 0;
+
+        r = manager_setup_varlink_server(m, &s);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set up varlink server: %m");
+        /* In test runs the server object is created and attached to the event loop, but never listens. */
+        if (!MANAGER_IS_TEST_RUN(m)) {
+                (void) mkdir_p_label("/run/systemd/userdb", 0755);
+
+                FOREACH_STRING(address, "/run/systemd/userdb/io.systemd.DynamicUser", VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM) {
+                        if (MANAGER_IS_RELOADING(m)) {
+                                /* If manager is reloading, we skip listening on existing addresses, since
+                                 * the fd should be acquired later through deserialization. */
+                                if (access(address, F_OK) >= 0)
+                                        continue;
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno,
+                                                               "Failed to check if varlink socket '%s' exists: %m", address);
+                        }
+
+                        r = varlink_server_listen_address(s, address, 0666);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to bind to varlink socket '%s': %m", address);
+                }
+        }
+
+        r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+        m->varlink_server = TAKE_PTR(s);
+        return 1;
+}
+
+static int vl_reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        if (error_id)
+                log_debug("varlink systemd-oomd client error: %s", error_id);
+
+        /* Only an error reply that is also flagged as local gets special treatment below. */
+        if (!FLAGS_SET(flags, VARLINK_REPLY_ERROR) || !FLAGS_SET(flags, VARLINK_REPLY_LOCAL))
+                return 0;
+
+        /* Varlink connection was closed, likely because of systemd-oomd restart. Let's try to reconnect and
+         * send the initial ManagedOOM update again. */
+        m->managed_oom_varlink = varlink_unref(link);
+
+        log_debug("Reconnecting to %s", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+
+        /* Pass on failures (< 0) and a skipped connection attempt (0) as they are, but map a successful
+         * reconnect (> 0) to plain 0. */
+        r = manager_varlink_init(m);
+        return r > 0 ? 0 : r;
+}
+
+static int manager_varlink_init_user(Manager *m) {
+        _cleanup_(varlink_close_unrefp) Varlink *conn = NULL;
+        int r;
+
+        assert(m);
+
+        /* Returns 1 when connected to systemd-oomd (or already connected), 0 when skipped, < 0 on failure. */
+        if (m->managed_oom_varlink)
+                return 1;
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        r = varlink_connect_address(&conn, VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+        if (r < 0) {
+                if (r != -ENOENT && !ERRNO_IS_DISCONNECT(r))
+                        return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+
+                log_debug("systemd-oomd varlink unix socket not found, skipping user manager varlink setup");
+                return 0;
+        }
+
+        varlink_set_userdata(conn, m);
+
+        r = varlink_bind_reply(conn, vl_reply);
+        if (r < 0)
+                return r;
+
+        r = varlink_attach_event(conn, m->event, SD_EVENT_PRIORITY_NORMAL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+        /* Store the connection in the manager, then queue the initial ManagedOOM update (result ignored). */
+        m->managed_oom_varlink = TAKE_PTR(conn);
+        (void) manager_varlink_send_managed_oom_initial(m);
+
+        return 1;
+}
+
+int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) {
+        _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+        int r;
+
+        assert(m);
+        assert(ret);
+        /* Allocates the server and registers interfaces, methods and the disconnect handler; no sockets yet. */
+        r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to allocate varlink server object: %m");
+
+        varlink_server_set_userdata(s, m);
+        /* NOTE(review): only this failure logs at error level, all others here at debug level — confirm. */
+        r = varlink_server_add_interface_many(
+                        s,
+                        &vl_interface_io_systemd_UserDatabase,
+                        &vl_interface_io_systemd_ManagedOOM);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add interfaces to varlink server: %m");
+
+        r = varlink_server_bind_method_many(
+                        s,
+                        "io.systemd.UserDatabase.GetUserRecord",  vl_method_get_user_record,
+                        "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+                        "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships,
+                        "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups",  vl_method_subscribe_managed_oom_cgroups);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to register varlink methods: %m");
+
+        r = varlink_server_bind_disconnect(s, vl_disconnect);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to register varlink disconnect handler: %m");
+
+        *ret = TAKE_PTR(s);
+        return 0;
+}
+
+int manager_varlink_init(Manager *m) { /* system manager: set up the server; user manager: connect to oomd */
+        return MANAGER_IS_SYSTEM(m) ? manager_varlink_init_system(m) : manager_varlink_init_user(m);
+}
+
+void manager_varlink_done(Manager *m) {
+        assert(m);
+
+        /* Explicitly close the varlink connection to oomd. Note we first take the varlink connection out of
+         * the manager, and only then disconnect it — in two steps – so that we don't end up accidentally
+         * unreffing it twice. After all, closing the connection might cause the disconnect handler we
+         * installed (vl_disconnect() above) to be called, where we will unref it too. */
+        varlink_close_unref(TAKE_PTR(m->managed_oom_varlink));
+        /* NOTE(review): the pointer was already taken out above, so the last line below looks like a no-op. */
+        m->varlink_server = varlink_server_unref(m->varlink_server);
+        m->managed_oom_varlink = varlink_close_unref(m->managed_oom_varlink);
+}
diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h
new file mode 100644
index 0000000..7f810d1
--- /dev/null
+++ b/src/core/core-varlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "manager.h"
+
+int manager_varlink_init(Manager *m); /* > 0: set up (or was already), 0: skipped, < 0: error */
+void manager_varlink_done(Manager *m);
+
+/* Creates a new VarlinkServer and binds methods. Does not set up sockets or attach events.
+ * Used for manager serialize/deserialize. Returns 0 on success, a negative errno-style value on failure. */
+int manager_setup_varlink_server(Manager *m, VarlinkServer **ret_s);
+
+/* The manager is expected to send an update to systemd-oomd if one of the following occurs:
+ * - The value of a ManagedOOM*= property changes
+ * - A unit with ManagedOOM*= properties changes unit active state */
+int manager_varlink_send_managed_oom_update(Unit *u);
diff --git a/src/core/crash-handler.c b/src/core/crash-handler.c
new file mode 100644
index 0000000..f5c31b6
--- /dev/null
+++ b/src/core/crash-handler.c
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/reboot.h>
+
+#include "sd-messages.h"
+
+#include "crash-handler.h"
+#include "exit-status.h"
+#include "macro.h"
+#include "main.h"
+#include "missing_syscall.h"
+#include "process-util.h"
+#include "raw-clone.h"
+#include "rlimit-util.h"
+#include "signal-util.h"
+#include "terminal-util.h"
+#include "virt.h"
+
+_noreturn_ void freeze_or_exit_or_reboot(void) {
+        /* Final step after a crash, never returns: exits (in a container), reboots (if enabled) or freezes. */
+        /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
+         * the container manager, and thus inform it that something went wrong. */
+        if (detect_container() > 0) {
+                log_struct(LOG_EMERG,
+                           LOG_MESSAGE("Exiting PID 1..."),
+                           "MESSAGE_ID=" SD_MESSAGE_CRASH_EXIT_STR);
+                _exit(EXIT_EXCEPTION);
+        }
+
+        if (arg_crash_reboot) {
+                log_notice("Rebooting in 10s...");
+                (void) sleep(10);
+
+                log_notice("Rebooting now...");
+                (void) reboot(RB_AUTOBOOT);
+                log_struct_errno(LOG_EMERG, errno,
+                                 LOG_MESSAGE("Failed to reboot: %m"),
+                                 "MESSAGE_ID=" SD_MESSAGE_CRASH_FAILED_STR);
+        }
+        /* Reached if rebooting is disabled or reboot() returned, i.e. failed. */
+        log_struct(LOG_EMERG,
+                   LOG_MESSAGE("Freezing execution."),
+                   "MESSAGE_ID=" SD_MESSAGE_CRASH_FREEZE_STR);
+        sync();
+        freeze();
+}
+
+_noreturn_ static void crash(int sig, siginfo_t *siginfo, void *context) {
+        struct sigaction sa;
+        pid_t pid;
+        /* Handler for fatal signals: optional core dump, optional crash shell, then freeze_or_exit_or_reboot(). */
+        /* NB: 💣 💣 💣 This is a signal handler, most likely executed in a situation where we have corrupted
+         * memory. Thus: please avoid any libc memory allocation here, or any functions that internally use
+         * memory allocation, as we cannot rely on memory allocation still working at this point! (Note that
+         * memory allocation is not async-signal-safe anyway — see signal-safety(7) for details —, and thus
+         * is not permissible in signal handlers.) */
+
+        if (getpid_cached() != 1)
+                /* Pass this on immediately, if this is not PID 1 */
+                propagate_signal(sig, siginfo);
+        else if (!arg_dump_core)
+                log_struct(LOG_EMERG,
+                           LOG_MESSAGE("Caught <%s>, not dumping core.", signal_to_string(sig)),
+                           "MESSAGE_ID=" SD_MESSAGE_CRASH_NO_COREDUMP_STR);
+        else {
+                sa = (struct sigaction) {
+                        .sa_handler = nop_signal_handler,
+                        .sa_flags = SA_NOCLDSTOP|SA_RESTART,
+                };
+
+                /* We want to wait for the core process, hence let's enable SIGCHLD */
+                (void) sigaction(SIGCHLD, &sa, NULL);
+
+                pid = raw_clone(SIGCHLD);
+                if (pid < 0)
+                        log_struct_errno(LOG_EMERG, errno,
+                                         LOG_MESSAGE("Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig)),
+                                         "MESSAGE_ID=" SD_MESSAGE_CRASH_NO_FORK_STR);
+                else if (pid == 0) {
+                        /* Enable default signal handler for core dump */
+
+                        sa = (struct sigaction) {
+                                .sa_handler = SIG_DFL,
+                        };
+                        (void) sigaction(sig, &sa, NULL);
+
+                        /* Don't limit the coredump size */
+                        (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));
+
+                        /* Just to be sure... */
+                        (void) chdir("/");
+
+                        /* Raise the signal again */
+                        propagate_signal(sig, siginfo);
+                        assert_not_reached();
+                        _exit(EXIT_EXCEPTION);
+                } else {
+                        siginfo_t status;
+                        int r;
+                        /* Parent: log where the signal came from (if known), then wait for the child. */
+                        if (siginfo) {
+                                if (siginfo->si_pid == 0)
+                                        log_struct(LOG_EMERG,
+                                                   LOG_MESSAGE("Caught <%s>, from unknown sender process.", signal_to_string(sig)),
+                                                   "MESSAGE_ID=" SD_MESSAGE_CRASH_UNKNOWN_SIGNAL_STR);
+                                else if (siginfo->si_pid == 1)
+                                        log_struct(LOG_EMERG,
+                                                   LOG_MESSAGE("Caught <%s>, from our own process.", signal_to_string(sig)),
+                                                   "MESSAGE_ID=" SD_MESSAGE_CRASH_SYSTEMD_SIGNAL_STR);
+                                else
+                                        log_struct(LOG_EMERG,
+                                                   LOG_MESSAGE("Caught <%s> from PID "PID_FMT".", signal_to_string(sig), siginfo->si_pid),
+                                                   "MESSAGE_ID=" SD_MESSAGE_CRASH_PROCESS_SIGNAL_STR);
+                        }
+
+                        /* Order things nicely. */
+                        r = wait_for_terminate(pid, &status);
+                        if (r < 0)
+                                log_struct_errno(LOG_EMERG, r,
+                                                 LOG_MESSAGE("Caught <%s>, waitpid() failed: %m", signal_to_string(sig)),
+                                                 "MESSAGE_ID=" SD_MESSAGE_CRASH_WAITPID_FAILED_STR);
+                        else if (status.si_code != CLD_DUMPED) {
+                                const char *s = status.si_code == CLD_EXITED ?
+                                        exit_status_to_string(status.si_status, EXIT_STATUS_LIBC) :
+                                        signal_to_string(status.si_status);
+
+                                log_struct(LOG_EMERG,
+                                           LOG_MESSAGE("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).",
+                                                       signal_to_string(sig),
+                                                       pid,
+                                                       sigchld_code_to_string(status.si_code),
+                                                       status.si_status,
+                                                       strna(s)),
+                                           "MESSAGE_ID=" SD_MESSAGE_CRASH_COREDUMP_FAILED_STR);
+                        } else
+                                log_struct(LOG_EMERG,
+                                           LOG_MESSAGE("Caught <%s>, dumped core as pid "PID_FMT".",
+                                                       signal_to_string(sig), pid),
+                                           "MESSAGE_ID=" SD_MESSAGE_CRASH_COREDUMP_PID_STR);
+                }
+        }
+
+        if (arg_crash_chvt >= 0)
+                (void) chvt(arg_crash_chvt);
+
+        sa = (struct sigaction) {
+                .sa_handler = SIG_IGN,
+                .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART,
+        };
+
+        /* Let the kernel reap children for us */
+        (void) sigaction(SIGCHLD, &sa, NULL);
+
+        if (arg_crash_shell) {
+                log_notice("Executing crash shell in 10s...");
+                (void) sleep(10);
+
+                pid = raw_clone(SIGCHLD);
+                if (pid < 0)
+                        log_struct_errno(LOG_EMERG, errno,
+                                         LOG_MESSAGE("Failed to fork off crash shell: %m"),
+                                         "MESSAGE_ID=" SD_MESSAGE_CRASH_SHELL_FORK_FAILED_STR);
+                else if (pid == 0) {
+                        (void) setsid();
+                        (void) make_console_stdio();
+                        (void) rlimit_nofile_safe();
+                        (void) execle("/bin/sh", "/bin/sh", NULL, environ);
+                        /* execle() only returns on failure. */
+                        log_struct_errno(LOG_EMERG, errno,
+                                         LOG_MESSAGE("execle() failed: %m"),
+                                         "MESSAGE_ID=" SD_MESSAGE_CRASH_EXECLE_FAILED_STR);
+                        _exit(EXIT_EXCEPTION);
+                } else {
+                        log_info("Spawned crash shell as PID "PID_FMT".", pid);
+                        (void) wait_for_terminate(pid, NULL);
+                }
+        }
+
+        freeze_or_exit_or_reboot();
+}
+
+void install_crash_handler(void) {
+        /* SA_NODEFER: so that we can raise the signal again from within the signal handler. */
+        static const struct sigaction crash_action = {
+                .sa_flags = SA_NODEFER | SA_SIGINFO,
+                .sa_sigaction = crash,
+        };
+
+        /* Failing to set up the crash handler is not fatal, hence only log about it at debug level. */
+        int r = sigaction_many(&crash_action, SIGNALS_CRASH_HANDLER);
+        if (r < 0)
+                log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m");
+}
diff --git a/src/core/crash-handler.h b/src/core/crash-handler.h
new file mode 100644
index 0000000..dc14335
--- /dev/null
+++ b/src/core/crash-handler.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+_noreturn_ void freeze_or_exit_or_reboot(void);
+void install_crash_handler(void);
diff --git a/src/core/dbus-automount.c b/src/core/dbus-automount.c
new file mode 100644
index 0000000..881bf50
--- /dev/null
+++ b/src/core/dbus-automount.c
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "automount.h"
+#include "bus-get-properties.h"
+#include "dbus-automount.h"
+#include "dbus-util.h"
+#include "string-util.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, automount_result, AutomountResult);
+/* D-Bus property table for Automount units; each entry is read from the Automount struct at the given offset. */
+const sd_bus_vtable bus_automount_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Automount, where), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExtraOptions", "s", NULL, offsetof(Automount, extra_options), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Automount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Automount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("TimeoutIdleUSec", "t", bus_property_get_usec, offsetof(Automount, timeout_idle_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_VTABLE_END
+};
+
+static int bus_automount_set_transient_property(
+                Automount *a,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+        /* Applies the transient Automount setting called 'name' from 'message'; returns 0 for names not handled here. */
+        Unit *u = UNIT(a);
+
+        assert(a);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "Where"))
+                return bus_set_transient_path(u, name, &a->where, message, flags, error);
+
+        if (streq(name, "ExtraOptions"))
+                return bus_set_transient_string(u, name, &a->extra_options, message, flags, error);
+
+        if (streq(name, "TimeoutIdleUSec"))
+                return bus_set_transient_usec_fix_0(u, name, &a->timeout_idle_usec, message, flags, error);
+
+        if (streq(name, "DirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &a->directory_mode, message, flags, error);
+
+        return 0;
+}
+
+int bus_automount_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+        /* Forwards to the transient setter only for transient units still in UNIT_STUB state; otherwise returns 0. */
+        Automount *a = AUTOMOUNT(u);
+
+        assert(a);
+        assert(name);
+        assert(message);
+
+        if (u->transient && u->load_state == UNIT_STUB) /* This is a transient unit? let's load a little more */
+                return bus_automount_set_transient_property(a, name, message, flags, error);
+
+        return 0;
+}
diff --git a/src/core/dbus-automount.h b/src/core/dbus-automount.h
new file mode 100644
index 0000000..cfceaec
--- /dev/null
+++ b/src/core/dbus-automount.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_automount_vtable[];
+
+int bus_automount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
new file mode 100644
index 0000000..8a9570f
--- /dev/null
+++ b/src/core/dbus-cgroup.c
@@ -0,0 +1,2287 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "core-varlink.h"
+#include "dbus-cgroup.h"
+#include "dbus-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "in-addr-prefix-util.h"
+#include "ip-protocol-list.h"
+#include "limits-util.h"
+#include "memstream-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "socket-util.h"
+
+BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", CGroupTasksMax, cgroup_tasks_max_resolve);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch);
+/* The two getters above are not declared static; the enum getters below are file-local (static). */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, managed_oom_preference, ManagedOOMPreference);
+
+static int property_get_cgroup_mask(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "as" array naming every controller whose bit is set in the CGroupMask at 'userdata'. */
+        CGroupMask *mask = userdata; /* NOTE(review): not NULL-checked, unlike the ASSERT_PTR() used by sibling getters */
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "s");
+        if (r < 0)
+                return r;
+
+        for (CGroupController ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) {
+                if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0)
+                        continue;
+
+                r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl));
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_delegate_controllers(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an empty "as" array when c->delegate is off, else defers to property_get_cgroup_mask(). */
+        CGroupContext *c = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        if (!c->delegate)
+                return sd_bus_message_append(reply, "as", 0);
+
+        return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
+}
+
+static int property_get_cpuset(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the CPUSet at 'userdata' as a D-Bus byte array ("ay"); conversion errors are returned to the caller. */
+        CPUSet *cpus = ASSERT_PTR(userdata);
+        _cleanup_free_ uint8_t *array = NULL;
+        size_t allocated;
+
+        assert(bus);
+        assert(reply);
+
+        int r = cpu_set_to_dbus(cpus, &array, &allocated); /* 'allocated' starts uninitialized: do not read it if the conversion failed */
+        return r < 0 ? r : sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_io_device_weight(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "a(st)" array of (path, weight) pairs, one per entry of c->io_device_weights. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(st)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(device_weights, w, c->io_device_weights) {
+                r = sd_bus_message_append(reply, "(st)", w->path, w->weight);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_io_device_limits(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with "a(st)" (path, limit) pairs for the limit type named by 'property', skipping entries at the default. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(st)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(device_limits, l, c->io_device_limits) {
+                CGroupIOLimitType type;
+                /* Derived from 'property' only, which does not change between iterations. */
+                type = cgroup_io_limit_type_from_string(property);
+                if (type < 0 || l->limits[type] == cgroup_io_limit_defaults[type])
+                        continue;
+
+                r = sd_bus_message_append(reply, "(st)", l->path, l->limits[type]);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_io_device_latency(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "a(st)" array of (path, target_usec) pairs, one per entry of c->io_device_latencies. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(st)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(device_latencies, l, c->io_device_latencies) {
+                r = sd_bus_message_append(reply, "(st)", l->path, l->target_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_blockio_device_weight(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "a(st)" array of (path, weight) pairs, one per entry of c->blockio_device_weights. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(st)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+                r = sd_bus_message_append(reply, "(st)", w->path, w->weight);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_blockio_device_bandwidths(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with "a(st)" (path, value) pairs: rbps for "BlockIOReadBandwidth", wbps for any other property name. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(st)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+                uint64_t v;
+
+                if (streq(property, "BlockIOReadBandwidth"))
+                        v = b->rbps;
+                else
+                        v = b->wbps;
+
+                if (v == CGROUP_LIMIT_MAX) /* entries equal to CGROUP_LIMIT_MAX are left out of the reply */
+                        continue;
+
+                r = sd_bus_message_append(reply, "(st)", b->path, v);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_device_allow(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "a(ss)" array of (path, permissions string) pairs, one per entry of c->device_allow. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(device_allow, a, c->device_allow) {
+                r = sd_bus_message_append(reply, "(ss)", a->path, cgroup_device_permissions_to_string(a->permissions));
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_ip_address_access(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "a(iayu)" array of (family, address bytes, prefix length), one per prefix in the Set at 'userdata'. */
+        Set **prefixes = ASSERT_PTR(userdata);
+        struct in_addr_prefix *i;
+        int r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(iayu)");
+        if (r < 0)
+                return r;
+
+        SET_FOREACH(i, *prefixes) {
+                /* Each prefix is written field by field into its own struct ('r') container. */
+                r = sd_bus_message_open_container(reply, 'r', "iayu");
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append(reply, "i", i->family);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family));
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_close_container(reply);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_bpf_foreign_program(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        CGroupContext *c = userdata; /* NOTE(review): not NULL-checked before the c-> dereference below */
+        int r;
+        /* Replies with an "a(ss)" array of (attach type name, bpffs path) pairs from c->bpf_foreign_programs. */
+        r = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(programs, p, c->bpf_foreign_programs) {
+                const char *attach_type = bpf_cgroup_attach_type_to_string(p->attach_type);
+
+                r = sd_bus_message_append(reply, "(ss)", attach_type, p->bpffs_path);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_socket_bind(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with an "a(iiqq)" array of (address_family, ip_protocol, nr_ports, port_min) from the list at 'userdata'. */
+        CGroupSocketBindItem **items = ASSERT_PTR(userdata);
+        int r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(iiqq)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(socket_bind_items, i, *items) {
+                r = sd_bus_message_append(reply, "(iiqq)", i->address_family, i->ip_protocol, i->nr_ports, i->port_min);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_restrict_network_interfaces(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a "(bas)" struct: the is-allow-list flag, then the c->restrict_network_interfaces string set. */
+        CGroupContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'r', "bas");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(reply, "b", c->restrict_network_interfaces_is_allow_list);
+        if (r < 0)
+                return r;
+
+        r = bus_message_append_string_set(reply, c->restrict_network_interfaces);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_cgroup_nft_set(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        int r;
+        CGroupContext *c = userdata;
+
+        assert(bus);
+        assert(reply);
+        assert(c);
+        /* Replies with an "a(iiss)" array of (source, nfproto, table, set), one per entry of c->nft_set_context.sets. */
+        r = sd_bus_message_open_container(reply, 'a', "(iiss)");
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
+                r = sd_bus_message_append(reply, "(iiss)", nft_set->source, nft_set->nfproto, nft_set->table, nft_set->set);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_cgroup_vtable[] = { /* D-Bus properties backed by CGroupContext; entries with offset 0 use getters that read the whole context */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
+        SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
+        SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0),
+        SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
+        SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
+        SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
+        SD_BUS_PROPERTY("CPUShares", "t", NULL, offsetof(CGroupContext, cpu_shares), 0),
+        SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0),
+        SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0),
+        SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_period_usec), 0),
+        SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0),
+        SD_BUS_PROPERTY("StartupAllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_cpus), 0),
+        SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0),
+        SD_BUS_PROPERTY("StartupAllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_mems), 0),
+        SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0),
+        SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0),
+        SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0),
+        SD_BUS_PROPERTY("IODeviceWeight", "a(st)", property_get_io_device_weight, 0, 0),
+        SD_BUS_PROPERTY("IOReadBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0),
+        SD_BUS_PROPERTY("IOWriteBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0),
+        SD_BUS_PROPERTY("IOReadIOPSMax", "a(st)", property_get_io_device_limits, 0, 0),
+        SD_BUS_PROPERTY("IOWriteIOPSMax", "a(st)", property_get_io_device_limits, 0, 0),
+        SD_BUS_PROPERTY("IODeviceLatencyTargetUSec", "a(st)", property_get_io_device_latency, 0, 0),
+        SD_BUS_PROPERTY("BlockIOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, blockio_accounting), 0),
+        SD_BUS_PROPERTY("BlockIOWeight", "t", NULL, offsetof(CGroupContext, blockio_weight), 0),
+        SD_BUS_PROPERTY("StartupBlockIOWeight", "t", NULL, offsetof(CGroupContext, startup_blockio_weight), 0),
+        SD_BUS_PROPERTY("BlockIODeviceWeight", "a(st)", property_get_blockio_device_weight, 0, 0),
+        SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
+        SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
+        SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0),
+        SD_BUS_PROPERTY("DefaultMemoryLow", "t", NULL, offsetof(CGroupContext, default_memory_low), 0),
+        SD_BUS_PROPERTY("DefaultStartupMemoryLow", "t", NULL, offsetof(CGroupContext, default_startup_memory_low), 0),
+        SD_BUS_PROPERTY("DefaultMemoryMin", "t", NULL, offsetof(CGroupContext, default_memory_min), 0),
+        SD_BUS_PROPERTY("MemoryMin", "t", NULL, offsetof(CGroupContext, memory_min), 0),
+        SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0),
+        SD_BUS_PROPERTY("StartupMemoryLow", "t", NULL, offsetof(CGroupContext, startup_memory_low), 0),
+        SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0),
+        SD_BUS_PROPERTY("StartupMemoryHigh", "t", NULL, offsetof(CGroupContext, startup_memory_high), 0),
+        SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0),
+        SD_BUS_PROPERTY("StartupMemoryMax", "t", NULL, offsetof(CGroupContext, startup_memory_max), 0),
+        SD_BUS_PROPERTY("MemorySwapMax", "t", NULL, offsetof(CGroupContext, memory_swap_max), 0),
+        SD_BUS_PROPERTY("StartupMemorySwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_swap_max), 0),
+        SD_BUS_PROPERTY("MemoryZSwapMax", "t", NULL, offsetof(CGroupContext, memory_zswap_max), 0),
+        SD_BUS_PROPERTY("StartupMemoryZSwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_zswap_max), 0),
+        SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0),
+        SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
+        SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
+        SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0),
+        SD_BUS_PROPERTY("TasksMax", "t", bus_property_get_tasks_max, offsetof(CGroupContext, tasks_max), 0),
+        SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
+        SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
+        SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
+        SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0),
+        SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0),
+        SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
+        SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
+        SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
+        SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0),
+        SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
+        SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0),
+        SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0),
+        SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0),
+        SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0),
+        SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0),
+        SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0),
+        SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0),
+        SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0),
+        SD_BUS_VTABLE_END
+};
+
+static int bus_cgroup_set_transient_property(
+                Unit *u,
+                CGroupContext *c,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int r;
+
+        assert(u);
+        assert(c);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "Delegate")) {
+                int b;
+
+                if (!UNIT_VTABLE(u)->can_delegate)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+                r = sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->delegate = b;
+                        c->delegate_controllers = b ? CGROUP_MASK_DELEGATE : 0;
+
+                        unit_write_settingf(u, flags, name, "Delegate=%s", yes_no(b));
+                }
+
+                return 1;
+
+        } else if (streq(name, "DelegateSubgroup")) {
+                const char *s;
+
+                if (!UNIT_VTABLE(u)->can_delegate)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(s) && cg_needs_escape(s))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (isempty(s))
+                                c->delegate_subgroup = mfree(c->delegate_subgroup);
+                        else {
+                                r = free_and_strdup_warn(&c->delegate_subgroup, s);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s);
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
+                CGroupMask mask = 0;
+
+                if (streq(name, "DelegateControllers") && !UNIT_VTABLE(u)->can_delegate)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        CGroupController cc;
+                        const char *t;
+
+                        r = sd_bus_message_read(message, "s", &t);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        cc = cgroup_controller_from_string(t);
+                        if (cc < 0)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown cgroup controller '%s'", t);
+
+                        mask |= CGROUP_CONTROLLER_TO_MASK(cc);
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *t = NULL;
+
+                        r = cg_mask_to_string(mask, &t);
+                        if (r < 0)
+                                return r;
+
+                        if (streq(name, "DelegateControllers")) {
+
+                                c->delegate = true;
+                                if (mask == 0)
+                                        c->delegate_controllers = 0;
+                                else
+                                        c->delegate_controllers |= mask;
+
+                                unit_write_settingf(u, flags, name, "Delegate=%s", strempty(t));
+
+                        } else if (streq(name, "DisableControllers")) {
+
+                                if (mask == 0)
+                                        c->disable_controllers = 0;
+                                else
+                                        c->disable_controllers |= mask;
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, strempty(t));
+                        }
+                }
+
+                return 1;
+        } else if (STR_IN_SET(name, "IPIngressFilterPath", "IPEgressFilterPath")) {
+                char ***filters;
+                size_t n = 0;
+
+                filters = streq(name, "IPIngressFilterPath") ? &c->ip_filters_ingress : &c->ip_filters_egress;
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const char *path;
+
+                        r = sd_bus_message_read(message, "s", &path);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        if (!path_is_normalized(path) || !path_is_absolute(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects a normalized absolute path.", name);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags) && !strv_contains(*filters, path)) {
+                                r = strv_extend(filters, path);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                        n++;
+                }
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0)
+                                *filters = strv_free(*filters);
+
+                        unit_invalidate_cgroup_bpf(u);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+
+                        fputs(name, f);
+                        fputs("=\n", f);
+
+                        STRV_FOREACH(entry, *filters)
+                                fprintf(f, "%s=%s\n", name, *entry);
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+
+                        if (*filters) {
+                                r = bpf_firewall_supported();
+                                if (r < 0)
+                                        return r;
+                                if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+                                        static bool warned = false;
+
+                                        log_full(warned ? LOG_DEBUG : LOG_WARNING,
+                                                 "Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n"
+                                                 "Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id);
+                                        warned = true;
+                                }
+                        }
+                }
+
+                return 1;
+        } else if (streq(name, "BPFProgram")) {
+                const char *a, *p;
+                size_t n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ss)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(ss)", &a, &p)) > 0) {
+                        int attach_type = bpf_cgroup_attach_type_from_string(a);
+                        if (attach_type < 0)
+                                return sd_bus_error_setf(
+                                                error,
+                                                SD_BUS_ERROR_INVALID_ARGS,
+                                                "%s expects a valid BPF attach type, got '%s'.",
+                                                name, a);
+
+                        if (!path_is_normalized(p) || !path_is_absolute(p))
+                                return sd_bus_error_setf(
+                                                error,
+                                                SD_BUS_ERROR_INVALID_ARGS,
+                                                "%s= expects a normalized absolute path.",
+                                                name);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = cgroup_context_add_bpf_foreign_program(c, attach_type, p);
+                                if (r < 0)
+                                        return r;
+                        }
+                        n++;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0)
+                                while (c->bpf_foreign_programs)
+                                        cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+
+                        fputs(name, f);
+                        fputs("=\n", f);
+
+                        LIST_FOREACH(programs, fp, c->bpf_foreign_programs)
+                                fprintf(f, "%s=%s:%s\n", name,
+                                                bpf_cgroup_attach_type_to_string(fp->attach_type),
+                                                fp->bpffs_path);
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+
+                        if (c->bpf_foreign_programs) {
+                                r = bpf_foreign_supported();
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        log_full(LOG_DEBUG,
+                                                 "Transient unit %s configures a BPF program pinned to BPF "
+                                                 "filesystem, but the local system does not support that.\n"
+                                                 "Starting this unit will fail!", u->id);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "MemoryPressureWatch")) {
+                CGroupPressureWatch p;
+                const char *t;
+
+                r = sd_bus_message_read(message, "s", &t);
+                if (r < 0)
+                        return r;
+
+                if (isempty(t))
+                        p = _CGROUP_PRESSURE_WATCH_INVALID;
+                else {
+                        p = cgroup_pressure_watch_from_string(t);
+                        if (p < 0)
+                                return p;
+                }
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->memory_pressure_watch = p;
+                        unit_write_settingf(u, flags, name, "MemoryPressureWatch=%s", strempty(cgroup_pressure_watch_to_string(p)));
+                }
+
+                return 1;
+
+        } else if (streq(name, "MemoryPressureThresholdUSec")) {
+                uint64_t t;
+
+                r = sd_bus_message_read(message, "t", &t);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->memory_pressure_threshold_usec = t;
+
+                        if (t == UINT64_MAX)
+                                unit_write_setting(u, flags, name, "MemoryPressureThresholdUSec=");
+                        else
+                                unit_write_settingf(u, flags, name, "MemoryPressureThresholdUSec=%" PRIu64, t);
+                }
+
+                return 1;
+        } else if (streq(name, "CoredumpReceive")) {
+                int b;
+
+                if (!UNIT_VTABLE(u)->can_delegate)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+                r = sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->coredump_receive = b;
+
+                        unit_write_settingf(u, flags, name, "CoredumpReceive=%s", yes_no(b));
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Reads one D-Bus boolean ("b") from 'message'. Unless UNIT_WRITE_FLAGS_NOOP(flags) holds, the value
+ * is stored in *p, unit_invalidate_cgroup() is called with 'mask' and "<name>=<yes_no(value)>" is
+ * written via unit_write_settingf(). Returns 1 on success, or the negative value returned by
+ * sd_bus_message_read() on failure. */
+static int bus_cgroup_set_boolean(
+                Unit *u,
+                const char *name,
+                bool *p,
+                CGroupMask mask,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int value, r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "b", &value);
+        if (r < 0)
+                return r;
+
+        /* The message has been consumed; nothing more to do if no state is to be changed. */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        *p = value;
+        unit_invalidate_cgroup(u, mask);
+        unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(value));
+
+        return 1;
+}
+
+/* Defines bus_cgroup_set_<function>(): reads a uint64 ("t") from the message and fails with
+ * SD_BUS_ERROR_INVALID_ARGS unless check() accepts it. Unless UNIT_WRITE_FLAGS_NOOP(flags) holds, the
+ * value is stored in *p, unit_invalidate_cgroup() is called with 'mask', and the setting is written as
+ * "<name>=" when the value equals 'val' or as "<name>=<number>" otherwise. Returns 1 on success. */
+#define BUS_DEFINE_SET_CGROUP_WEIGHT(function, mask, check, val)        \
+        static int bus_cgroup_set_##function(                           \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        uint64_t *p,                                    \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                                                                        \
+                uint64_t value;                                         \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, "t", &value);          \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                if (!check(value))                                      \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Value specified in %s is out of range", name); \
+                                                                        \
+                /* Validation only: leave the context untouched. */     \
+                if (UNIT_WRITE_FLAGS_NOOP(flags))                       \
+                        return 1;                                       \
+                                                                        \
+                *p = value;                                             \
+                unit_invalidate_cgroup(u, mask);                        \
+                                                                        \
+                if (value == (val))                                     \
+                        unit_write_settingf(u, flags, name, "%s=", name); \
+                else                                                    \
+                        unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, value); \
+                                                                        \
+                return 1;                                               \
+        }
+
+/* Defines two setters for a cgroup limit:
+ *
+ *   bus_cgroup_set_<function>()       reads an absolute uint64 ("t"), rejects values below 'minimum',
+ *                                     and writes "<name>=infinity" for CGROUP_LIMIT_MAX or
+ *                                     "<name>=<number>" otherwise.
+ *   bus_cgroup_set_<function>_scale() reads a uint32 ("u"), converts it with scale(raw, UINT32_MAX),
+ *                                     rejects results below 'minimum' or >= UINT64_MAX, and writes the
+ *                                     setting as a percentage under 'name' with its "Scale" suffix
+ *                                     removed.
+ *
+ * Both store the resulting value in *p and call unit_invalidate_cgroup() with 'mask', unless
+ * UNIT_WRITE_FLAGS_NOOP(flags) holds. Both return 1 on success. */
+#define BUS_DEFINE_SET_CGROUP_LIMIT(function, mask, scale, minimum)     \
+        static int bus_cgroup_set_##function(                           \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        uint64_t *p,                                    \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                                                                        \
+                uint64_t limit;                                         \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, "t", &limit);          \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                if (limit < minimum)                                    \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Value specified in %s is out of range", name); \
+                                                                        \
+                /* Validation only: leave the context untouched. */     \
+                if (UNIT_WRITE_FLAGS_NOOP(flags))                       \
+                        return 1;                                       \
+                                                                        \
+                *p = limit;                                             \
+                unit_invalidate_cgroup(u, mask);                        \
+                                                                        \
+                if (limit == CGROUP_LIMIT_MAX)                          \
+                        unit_write_settingf(u, flags, name, "%s=infinity", name); \
+                else                                                    \
+                        unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, limit); \
+                                                                        \
+                return 1;                                               \
+        }                                                               \
+        static int bus_cgroup_set_##function##_scale(                   \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        uint64_t *p,                                    \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                                                                        \
+                uint32_t raw;                                           \
+                uint64_t limit;                                         \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, "u", &raw);            \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                limit = scale(raw, UINT32_MAX);                         \
+                if (limit < minimum || limit >= UINT64_MAX)             \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Value specified in %s is out of range", name); \
+                                                                        \
+                /* Validation only: leave the context untouched. */     \
+                if (UNIT_WRITE_FLAGS_NOOP(flags))                       \
+                        return 1;                                       \
+                                                                        \
+                *p = limit;                                             \
+                unit_invalidate_cgroup(u, mask);                        \
+                                                                        \
+                /* The setting is written without the "Scale" suffix */ \
+                assert_se(endswith(name, "Scale"));                     \
+                                                                        \
+                int permyriad = UINT32_SCALE_TO_PERMYRIAD(raw);         \
+                int prefix_len = (int)(strlen(name) - strlen("Scale")); \
+                unit_write_settingf(u, flags, name, "%.*s=" PERMYRIAD_AS_PERCENT_FORMAT_STR, \
+                                    prefix_len, name,                   \
+                                    PERMYRIAD_AS_PERCENT_FORMAT_VAL(permyriad)); \
+                                                                        \
+                return 1;                                               \
+        }
+
+/* Instantiate the weight and limit setters that bus_cgroup_set_property() dispatches to. The
+ * instantiations are bracketed by DISABLE_WARNING_TYPE_LIMITS/REENABLE_WARNING, presumably because
+ * the "v < minimum" comparison in BUS_DEFINE_SET_CGROUP_LIMIT tests an unsigned value against 0 for
+ * the instances that pass minimum=0. */
+DISABLE_WARNING_TYPE_LIMITS;
+BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_shares, CGROUP_MASK_CPU, CGROUP_CPU_SHARES_IS_OK, CGROUP_CPU_SHARES_INVALID);
+BUS_DEFINE_SET_CGROUP_WEIGHT(io_weight, CGROUP_MASK_IO, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID);
+BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEIGHT_IS_OK, CGROUP_BLKIO_WEIGHT_INVALID);
+BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1);
+BUS_DEFINE_SET_CGROUP_LIMIT(memory_protection, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+BUS_DEFINE_SET_CGROUP_LIMIT(zswap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+REENABLE_WARNING;
+
+/* Reads a uint64 ("t") CPU weight from 'message'. The value must satisfy CGROUP_WEIGHT_IS_OK() or be
+ * CGROUP_WEIGHT_IDLE, otherwise SD_BUS_ERROR_INVALID_ARGS is set. Unless UNIT_WRITE_FLAGS_NOOP(flags)
+ * holds, the value is stored in *p, CGROUP_MASK_CPU is invalidated and the setting is written as
+ * "<name>=" (CGROUP_WEIGHT_INVALID), "<name>=idle" (CGROUP_WEIGHT_IDLE) or "<name>=<number>".
+ * Returns 1 on success. */
+static int bus_cgroup_set_cpu_weight(
+                Unit *u,
+                const char *name,
+                uint64_t *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        uint64_t weight;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "t", &weight);
+        if (r < 0)
+                return r;
+
+        if (!(CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_IDLE))
+                return sd_bus_error_setf(
+                                error, SD_BUS_ERROR_INVALID_ARGS, "Value specified in %s is out of range", name);
+
+        /* Validation only: leave the context untouched. */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        *p = weight;
+        unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+
+        if (weight == CGROUP_WEIGHT_INVALID)
+                unit_write_settingf(u, flags, name, "%s=", name);
+        else if (weight == CGROUP_WEIGHT_IDLE)
+                unit_write_settingf(u, flags, name, "%s=idle", name);
+        else
+                unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, weight);
+
+        return 1;
+}
+
+/* Reads an absolute uint64 ("t") task limit from 'message'; 0 is rejected with
+ * SD_BUS_ERROR_INVALID_ARGS. Unless UNIT_WRITE_FLAGS_NOOP(flags) holds, *p is set to the value with
+ * .scale == 0, CGROUP_MASK_PIDS is invalidated and the setting is written as "<name>=infinity" for
+ * CGROUP_LIMIT_MAX or "<name>=<number>" otherwise. Returns 1 on success. */
+static int bus_cgroup_set_tasks_max(
+                Unit *u,
+                const char *name,
+                CGroupTasksMax *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        uint64_t limit;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "t", &limit);
+        if (r < 0)
+                return r;
+
+        if (limit < 1)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Value specified in %s is out of range", name);
+
+        /* Validation only: leave the context untouched. */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        /* When .scale==0, .value is the absolute value */
+        *p = (CGroupTasksMax) { .value = limit, .scale = 0 };
+        unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
+
+        if (limit == CGROUP_LIMIT_MAX)
+                unit_write_settingf(u, flags, name, "%s=infinity", name);
+        else
+                unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, limit);
+
+        return 1;
+}
+
+/* Reads a relative task limit as a uint32 ("u") fraction of UINT32_MAX from 'message'. Values of 0
+ * and UINT32_MAX are rejected with SD_BUS_ERROR_INVALID_ARGS. Unless UNIT_WRITE_FLAGS_NOOP(flags)
+ * holds, *p is set to { v, UINT32_MAX }, CGROUP_MASK_PIDS is invalidated and the setting is written
+ * as "TasksMax=<percent>.<tenth>%". Returns 1 on success, or the negative value returned by
+ * sd_bus_message_read() on failure. */
+static int bus_cgroup_set_tasks_max_scale(
+                Unit *u,
+                const char *name,
+                CGroupTasksMax *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        uint32_t v;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "u", &v);
+        if (r < 0)
+                return r;
+
+        if (v < 1 || v >= UINT32_MAX)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Value specified in %s is out of range", name);
+
+        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                *p = (CGroupTasksMax) { v, UINT32_MAX }; /* .scale is not 0, so this is interpreted as v/UINT32_MAX. */
+                unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
+
+                /* Convert v/UINT32_MAX to tenths of a percent (rounded up), which is the unit the
+                 * "<int>.<tenth>%" format below expects. Scaling by 100 instead would print a value
+                 * ten times too small (e.g. "5.0%" for a 50% limit). The product cannot overflow:
+                 * v < UINT32_MAX and the multiplication is done in 64 bits. */
+                uint32_t permille = DIV_ROUND_UP((uint64_t) v * 1000U, (uint64_t) UINT32_MAX);
+                unit_write_settingf(u, flags, name, "%s=%" PRIu32 ".%" PRIu32 "%%", "TasksMax",
+                                    permille / 10, permille % 10);
+        }
+
+        return 1;
+}
+
+int bus_cgroup_set_property(
+                Unit *u,
+                CGroupContext *c,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        CGroupIOLimitType iol_type;
+        int r;
+
+        assert(u);
+        assert(c);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "CPUAccounting"))
+                return bus_cgroup_set_boolean(u, name, &c->cpu_accounting, get_cpu_accounting_mask(), message, flags, error);
+
+        if (streq(name, "CPUWeight"))
+                return bus_cgroup_set_cpu_weight(u, name, &c->cpu_weight, message, flags, error);
+
+        if (streq(name, "StartupCPUWeight"))
+                return bus_cgroup_set_cpu_weight(u, name, &c->startup_cpu_weight, message, flags, error);
+
+        if (streq(name, "CPUShares"))
+                return bus_cgroup_set_cpu_shares(u, name, &c->cpu_shares, message, flags, error);
+
+        if (streq(name, "StartupCPUShares"))
+                return bus_cgroup_set_cpu_shares(u, name, &c->startup_cpu_shares, message, flags, error);
+
+        if (streq(name, "IOAccounting"))
+                return bus_cgroup_set_boolean(u, name, &c->io_accounting, CGROUP_MASK_IO, message, flags, error);
+
+        if (streq(name, "IOWeight"))
+                return bus_cgroup_set_io_weight(u, name, &c->io_weight, message, flags, error);
+
+        if (streq(name, "StartupIOWeight"))
+                return bus_cgroup_set_io_weight(u, name, &c->startup_io_weight, message, flags, error);
+
+        if (streq(name, "BlockIOAccounting"))
+                return bus_cgroup_set_boolean(u, name, &c->blockio_accounting, CGROUP_MASK_BLKIO, message, flags, error);
+
+        if (streq(name, "BlockIOWeight"))
+                return bus_cgroup_set_blockio_weight(u, name, &c->blockio_weight, message, flags, error);
+
+        if (streq(name, "StartupBlockIOWeight"))
+                return bus_cgroup_set_blockio_weight(u, name, &c->startup_blockio_weight, message, flags, error);
+
+        if (streq(name, "MemoryAccounting"))
+                return bus_cgroup_set_boolean(u, name, &c->memory_accounting, CGROUP_MASK_MEMORY, message, flags, error);
+
+        if (streq(name, "MemoryMin")) {
+                r = bus_cgroup_set_memory_protection(u, name, &c->memory_min, message, flags, error);
+                if (r > 0)
+                        c->memory_min_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryLow")) {
+                r = bus_cgroup_set_memory_protection(u, name, &c->memory_low, message, flags, error);
+                if (r > 0)
+                        c->memory_low_set = true;
+                return r;
+        }
+
+        if (streq(name, "StartupMemoryLow")) {
+                r = bus_cgroup_set_memory_protection(u, name, &c->startup_memory_low, message, flags, error);
+                if (r > 0)
+                        c->startup_memory_low_set = true;
+                return r;
+        }
+
+        if (streq(name, "DefaultMemoryMin")) {
+                r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_min, message, flags, error);
+                if (r > 0)
+                        c->default_memory_min_set = true;
+                return r;
+        }
+
+        if (streq(name, "DefaultMemoryLow")) {
+                r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_low, message, flags, error);
+                if (r > 0)
+                        c->default_memory_low_set = true;
+                return r;
+        }
+
+        if (streq(name, "DefaultStartupMemoryLow")) {
+                r = bus_cgroup_set_memory_protection(u, name, &c->default_startup_memory_low, message, flags, error);
+                if (r > 0)
+                        c->default_startup_memory_low_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryHigh"))
+                return bus_cgroup_set_memory(u, name, &c->memory_high, message, flags, error);
+
+        if (streq(name, "StartupMemoryHigh")) {
+                r = bus_cgroup_set_memory(u, name, &c->startup_memory_high, message, flags, error);
+                if (r > 0)
+                        c->startup_memory_high_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemorySwapMax"))
+                return bus_cgroup_set_swap(u, name, &c->memory_swap_max, message, flags, error);
+
+        if (streq(name, "StartupMemorySwapMax")) {
+                r = bus_cgroup_set_swap(u, name, &c->startup_memory_swap_max, message, flags, error);
+                if (r > 0)
+                        c->startup_memory_swap_max_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryZSwapMax"))
+                return bus_cgroup_set_zswap(u, name, &c->memory_zswap_max, message, flags, error);
+
+        if (streq(name, "StartupMemoryZSwapMax")) {
+                r = bus_cgroup_set_zswap(u, name, &c->startup_memory_zswap_max, message, flags, error);
+                if (r > 0)
+                        c->startup_memory_zswap_max_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryMax"))
+                return bus_cgroup_set_memory(u, name, &c->memory_max, message, flags, error);
+
+        if (streq(name, "StartupMemoryMax")) {
+                r = bus_cgroup_set_memory(u, name, &c->startup_memory_max, message, flags, error);
+                if (r > 0)
+                        c->startup_memory_max_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryLimit"))
+                return bus_cgroup_set_memory(u, name, &c->memory_limit, message, flags, error);
+
+        if (streq(name, "MemoryMinScale")) {
+                r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_min, message, flags, error);
+                if (r > 0)
+                        c->memory_min_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryLowScale")) {
+                r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_low, message, flags, error);
+                if (r > 0)
+                        c->memory_low_set = true;
+                return r;
+        }
+
+        if (streq(name, "DefaultMemoryMinScale")) {
+                r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_min, message, flags, error);
+                if (r > 0)
+                        c->default_memory_min_set = true;
+                return r;
+        }
+
+        if (streq(name, "DefaultMemoryLowScale")) {
+                r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_low, message, flags, error);
+                if (r > 0)
+                        c->default_memory_low_set = true;
+                return r;
+        }
+
+        if (streq(name, "MemoryHighScale"))
+                return bus_cgroup_set_memory_scale(u, name, &c->memory_high, message, flags, error);
+
+        if (streq(name, "MemorySwapMaxScale"))
+                return bus_cgroup_set_swap_scale(u, name, &c->memory_swap_max, message, flags, error);
+
+        if (streq(name, "MemoryZSwapMaxScale"))
+                return bus_cgroup_set_zswap_scale(u, name, &c->memory_zswap_max, message, flags, error);
+
+        if (streq(name, "MemoryMaxScale"))
+                return bus_cgroup_set_memory_scale(u, name, &c->memory_max, message, flags, error);
+
+        if (streq(name, "MemoryLimitScale"))
+                return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error);
+
+        if (streq(name, "TasksAccounting"))
+                return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error);
+
+        if (streq(name, "TasksMax"))
+                return bus_cgroup_set_tasks_max(u, name, &c->tasks_max, message, flags, error);
+
+        if (streq(name, "TasksMaxScale"))
+                return bus_cgroup_set_tasks_max_scale(u, name, &c->tasks_max, message, flags, error);
+
+        if (streq(name, "CPUQuotaPerSecUSec")) {
+                uint64_t u64;
+
+                r = sd_bus_message_read(message, "t", &u64);
+                if (r < 0)
+                        return r;
+
+                if (u64 <= 0)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "CPUQuotaPerSecUSec= value out of range");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->cpu_quota_per_sec_usec = u64;
+                        u->warned_clamping_cpu_quota_period = false;
+                        unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+
+                        if (c->cpu_quota_per_sec_usec == USEC_INFINITY)
+                                unit_write_setting(u, flags, "CPUQuota", "CPUQuota=");
+                        else
+                                /* config_parse_cpu_quota() requires an integer, so truncating division is used on
+                                 * purpose here. */
+                                unit_write_settingf(u, flags, "CPUQuota",
+                                                    "CPUQuota=%0.f%%",
+                                                    (double) (c->cpu_quota_per_sec_usec / 10000));
+                }
+
+                return 1;
+
+        } else if (streq(name, "CPUQuotaPeriodUSec")) {
+                uint64_t u64;
+
+                r = sd_bus_message_read(message, "t", &u64);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->cpu_quota_period_usec = u64;
+                        u->warned_clamping_cpu_quota_period = false;
+                        unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+                        if (c->cpu_quota_period_usec == USEC_INFINITY)
+                                unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec=");
+                        else
+                                unit_write_settingf(u, flags, "CPUQuotaPeriodSec",
+                                                    "CPUQuotaPeriodSec=%s",
+                                                    FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1));
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "AllowedCPUs", "StartupAllowedCPUs", "AllowedMemoryNodes", "StartupAllowedMemoryNodes")) {
+                const void *a;
+                size_t n;
+                _cleanup_(cpu_set_reset) CPUSet new_set = {};
+
+                r = sd_bus_message_read_array(message, 'y', &a, &n);
+                if (r < 0)
+                        return r;
+
+                r = cpu_set_from_dbus(a, n, &new_set);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *setstr = NULL;
+                        CPUSet *set = NULL;
+
+                        setstr = cpu_set_to_range_string(&new_set);
+                        if (!setstr)
+                                return -ENOMEM;
+
+                        if (streq(name, "AllowedCPUs"))
+                                set = &c->cpuset_cpus;
+                        else if (streq(name, "StartupAllowedCPUs"))
+                                set = &c->startup_cpuset_cpus;
+                        else if (streq(name, "AllowedMemoryNodes"))
+                                set = &c->cpuset_mems;
+                        else if (streq(name, "StartupAllowedMemoryNodes"))
+                                set = &c->startup_cpuset_mems;
+
+                        assert(set);
+
+                        cpu_set_reset(set);
+                        *set = new_set;
+                        new_set = (CPUSet) {};
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET);
+                        unit_write_settingf(u, flags, name, "%s=\n%s=%s", name, name, setstr);
+                }
+
+                return 1;
+
+        } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) {
+                const char *path;
+                unsigned n = 0;
+                uint64_t u64;
+
+                r = sd_bus_message_enter_container(message, 'a', "(st)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) {
+                        /* Each entry is a (device path, limit) pair for the limit type selected by 'name'. */
+                        if (!path_is_normalized(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                CGroupIODeviceLimit *a = NULL;
+
+                                LIST_FOREACH(device_limits, b, c->io_device_limits)
+                                        if (path_equal(path, b->path)) {
+                                                a = b;
+                                                break;
+                                        }
+
+                                if (!a) {
+                                        CGroupIOLimitType type;
+
+                                        a = new0(CGroupIODeviceLimit, 1);
+                                        if (!a)
+                                                return -ENOMEM;
+
+                                        a->path = strdup(path);
+                                        if (!a->path) {
+                                                free(a);
+                                                return -ENOMEM;
+                                        }
+                                        /* A new entry starts out with the default for every limit type. */
+                                        for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+                                                a->limits[type] = cgroup_io_limit_defaults[type];
+
+                                        LIST_PREPEND(device_limits, c->io_device_limits, a);
+                                }
+
+                                a->limits[iol_type] = u64;
+                        }
+
+                        n++;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0) /* An empty array resets this limit type to its default on every listed device. */
+                                LIST_FOREACH(device_limits, a, c->io_device_limits)
+                                        a->limits[iol_type] = cgroup_io_limit_defaults[iol_type];
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+                        /* Emit an empty assignment first, then one line per device with a non-default limit. */
+                        fprintf(f, "%s=\n", name);
+                        LIST_FOREACH(device_limits, a, c->io_device_limits)
+                                if (a->limits[iol_type] != cgroup_io_limit_defaults[iol_type])
+                                        fprintf(f, "%s=%s %" PRIu64 "\n", name, a->path, a->limits[iol_type]);
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+
+        } else if (streq(name, "IODeviceWeight")) {
+                const char *path;
+                uint64_t weight;
+                unsigned n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(st)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) {
+                        /* Each entry is a (device path, weight) pair; both are validated before the entry is stored. */
+                        if (!path_is_normalized(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+                        if (!CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_INVALID)
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "IODeviceWeight= value out of range");
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                CGroupIODeviceWeight *a = NULL;
+
+                                LIST_FOREACH(device_weights, b, c->io_device_weights)
+                                        if (path_equal(b->path, path)) {
+                                                a = b;
+                                                break;
+                                        }
+
+                                if (!a) {
+                                        a = new0(CGroupIODeviceWeight, 1);
+                                        if (!a)
+                                                return -ENOMEM;
+
+                                        a->path = strdup(path);
+                                        if (!a->path) {
+                                                free(a);
+                                                return -ENOMEM;
+                                        }
+                                        LIST_PREPEND(device_weights, c->io_device_weights, a);
+                                }
+
+                                a->weight = weight;
+                        }
+
+                        n++;
+                }
+                if (r >= 0) /* Don't let the container exit mask a read error from the loop. */
+                        r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0) /* An empty array drops all per-device weights. */
+                                while (c->io_device_weights)
+                                        cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+                        /* Emit an empty assignment first, then one line per device currently in the list. */
+                        fputs("IODeviceWeight=\n", f);
+                        LIST_FOREACH(device_weights, a, c->io_device_weights)
+                                fprintf(f, "IODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight);
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+
+        } else if (streq(name, "IODeviceLatencyTargetUSec")) {
+                const char *path;
+                uint64_t target;
+                unsigned n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(st)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(st)", &path, &target)) > 0) {
+                        /* Each entry is a (device path, latency target) pair; the target is stored in target_usec. */
+                        if (!path_is_normalized(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                CGroupIODeviceLatency *a = NULL;
+
+                                LIST_FOREACH(device_latencies, b, c->io_device_latencies)
+                                        if (path_equal(b->path, path)) {
+                                                a = b;
+                                                break;
+                                        }
+
+                                if (!a) {
+                                        a = new0(CGroupIODeviceLatency, 1);
+                                        if (!a)
+                                                return -ENOMEM;
+
+                                        a->path = strdup(path);
+                                        if (!a->path) {
+                                                free(a);
+                                                return -ENOMEM;
+                                        }
+                                        LIST_PREPEND(device_latencies, c->io_device_latencies, a);
+                                }
+
+                                a->target_usec = target;
+                        }
+
+                        n++;
+                }
+                if (r >= 0) /* Don't let the container exit mask a read error from the loop. */
+                        r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0) /* An empty array drops all per-device latency targets. */
+                                while (c->io_device_latencies)
+                                        cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+                        /* The written key is IODeviceLatencyTargetSec=, not the bus property name; values are formatted timespans. */
+                        fputs("IODeviceLatencyTargetSec=\n", f);
+                        LIST_FOREACH(device_latencies, a, c->io_device_latencies)
+                                fprintf(f, "IODeviceLatencyTargetSec=%s %s\n",
+                                        a->path, FORMAT_TIMESPAN(a->target_usec, 1));
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) {
+                const char *path;
+                unsigned n = 0;
+                uint64_t u64;
+                bool read;
+
+                read = streq(name, "BlockIOReadBandwidth");
+
+                r = sd_bus_message_enter_container(message, 'a', "(st)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) {
+                        /* Each entry is a (device path, bandwidth) pair; 'read' selects which of rbps/wbps it updates. */
+                        if (!path_is_normalized(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                CGroupBlockIODeviceBandwidth *a = NULL;
+
+                                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+                                        if (path_equal(path, b->path)) {
+                                                a = b;
+                                                break;
+                                        }
+
+                                if (!a) {
+                                        a = new0(CGroupBlockIODeviceBandwidth, 1);
+                                        if (!a)
+                                                return -ENOMEM;
+
+                                        a->rbps = CGROUP_LIMIT_MAX;
+                                        a->wbps = CGROUP_LIMIT_MAX;
+                                        a->path = strdup(path);
+                                        if (!a->path) {
+                                                free(a);
+                                                return -ENOMEM;
+                                        }
+
+                                        LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+                                }
+
+                                if (read)
+                                        a->rbps = u64;
+                                else
+                                        a->wbps = u64;
+                        }
+
+                        n++;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0) /* An empty array resets the selected direction to CGROUP_LIMIT_MAX on every listed device. */
+                                LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) {
+                                        if (read)
+                                                a->rbps = CGROUP_LIMIT_MAX;
+                                        else
+                                                a->wbps = CGROUP_LIMIT_MAX;
+                                }
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+                        /* Emit an empty assignment first, then one line per device whose value differs from CGROUP_LIMIT_MAX. */
+                        if (read) {
+                                fputs("BlockIOReadBandwidth=\n", f);
+                                LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths)
+                                        if (a->rbps != CGROUP_LIMIT_MAX)
+                                                fprintf(f, "BlockIOReadBandwidth=%s %" PRIu64 "\n", a->path, a->rbps);
+                        } else {
+                                fputs("BlockIOWriteBandwidth=\n", f);
+                                LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths)
+                                        if (a->wbps != CGROUP_LIMIT_MAX)
+                                                fprintf(f, "BlockIOWriteBandwidth=%s %" PRIu64 "\n", a->path, a->wbps);
+                        }
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+
+        } else if (streq(name, "BlockIODeviceWeight")) {
+                const char *path;
+                uint64_t weight;
+                unsigned n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(st)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) {
+                        /* Each entry is a (device path, weight) pair; both are validated before the entry is stored. */
+                        if (!path_is_normalized(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+                        if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight) || weight == CGROUP_BLKIO_WEIGHT_INVALID)
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "BlockIODeviceWeight= out of range");
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                CGroupBlockIODeviceWeight *a = NULL;
+
+                                LIST_FOREACH(device_weights, b, c->blockio_device_weights)
+                                        if (path_equal(b->path, path)) {
+                                                a = b;
+                                                break;
+                                        }
+
+                                if (!a) {
+                                        a = new0(CGroupBlockIODeviceWeight, 1);
+                                        if (!a)
+                                                return -ENOMEM;
+
+                                        a->path = strdup(path);
+                                        if (!a->path) {
+                                                free(a);
+                                                return -ENOMEM;
+                                        }
+                                        LIST_PREPEND(device_weights, c->blockio_device_weights, a);
+                                }
+
+                                a->weight = weight;
+                        }
+
+                        n++;
+                }
+                if (r >= 0) /* Don't let the container exit mask a read error from the loop. */
+                        r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0) /* An empty array drops all per-device weights. */
+                                while (c->blockio_device_weights)
+                                        cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+                        /* Emit an empty assignment first, then one line per device currently in the list. */
+                        fputs("BlockIODeviceWeight=\n", f);
+                        LIST_FOREACH(device_weights, a, c->blockio_device_weights)
+                                fprintf(f, "BlockIODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight);
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+
+        } else if (streq(name, "DevicePolicy")) {
+                const char *policy;
+                CGroupDevicePolicy p;
+
+                r = sd_bus_message_read(message, "s", &policy);
+                if (r < 0)
+                        return r;
+
+                p = cgroup_device_policy_from_string(policy);
+                if (p < 0)
+                        return p;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->device_policy = p;
+                        unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
+                        unit_write_settingf(u, flags, name, "DevicePolicy=%s", policy);
+                }
+
+                return 1;
+
+        } else if (streq(name, "DeviceAllow")) {
+                const char *path, *rwm;
+                unsigned n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ss)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(ss)", &path, &rwm)) > 0) {
+                        CGroupDevicePermissions p;
+
+                        if (!valid_device_allow_pattern(path) || strpbrk(path, WHITESPACE))
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires device node or pattern");
+
+                        if (isempty(rwm))
+                                p = _CGROUP_DEVICE_PERMISSIONS_ALL;
+                        else {
+                                p = cgroup_device_permissions_from_string(rwm);
+                                if (p < 0)
+                                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires combination of rwm flags");
+                        }
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = cgroup_context_add_or_update_device_allow(c, path, p);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        n++;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0)
+                                while (c->device_allow)
+                                        cgroup_context_free_device_allow(c, c->device_allow);
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+
+                        fputs("DeviceAllow=\n", f);
+                        LIST_FOREACH(device_allow, a, c->device_allow)
+                                fprintf(f, "DeviceAllow=%s %s\n", a->path, cgroup_device_permissions_to_string(a->permissions));
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+
+        } else if (streq(name, "IPAccounting")) {
+                int b;
+
+                r = sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->ip_accounting = b;
+
+                        unit_invalidate_cgroup_bpf(u);
+                        unit_write_settingf(u, flags, name, "IPAccounting=%s", yes_no(b));
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) {
+                _cleanup_set_free_ Set *new_prefixes = NULL;
+                size_t n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(iayu)");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const void *ap;
+                        int32_t family;
+                        uint32_t prefixlen;
+                        size_t an;
+
+                        r = sd_bus_message_enter_container(message, 'r', "iayu");
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        r = sd_bus_message_read(message, "i", &family);
+                        if (r < 0)
+                                return r;
+
+                        if (!IN_SET(family, AF_INET, AF_INET6))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects IPv4 or IPv6 addresses only.", name);
+
+                        r = sd_bus_message_read_array(message, 'y', &ap, &an);
+                        if (r < 0)
+                                return r;
+
+                        if (an != FAMILY_ADDRESS_SIZE(family))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IP address has wrong size for family (%s, expected %zu, got %zu)",
+                                                               af_to_name(family), FAMILY_ADDRESS_SIZE(family), an);
+
+                        r = sd_bus_message_read(message, "u", &prefixlen);
+                        if (r < 0)
+                                return r;
+
+                        if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Prefix length %" PRIu32 " too large for address family %s.", prefixlen, af_to_name(family));
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                struct in_addr_prefix prefix = {
+                                        .family = family,
+                                        .prefixlen = prefixlen,
+                                };
+
+                                memcpy(&prefix.address, ap, an);
+
+                                r = in_addr_prefix_add(&new_prefixes, &prefix);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        r = sd_bus_message_exit_container(message);
+                        if (r < 0)
+                                return r;
+
+                        n++;
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        Set **prefixes;
+                        bool *reduced;
+                        FILE *f;
+
+                        unit_invalidate_cgroup_bpf(u);
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+
+                        prefixes = streq(name, "IPAddressAllow") ? &c->ip_address_allow : &c->ip_address_deny;
+                        reduced = streq(name, "IPAddressAllow") ? &c->ip_address_allow_reduced : &c->ip_address_deny_reduced;
+
+                        if (n == 0) {
+                                *reduced = true;
+                                *prefixes = set_free(*prefixes);
+                                fputs(name, f);
+                                fputs("=\n", f);
+                        } else {
+                                *reduced = false;
+
+                                r = in_addr_prefixes_merge(prefixes, new_prefixes);
+                                if (r < 0)
+                                        return r;
+
+                                const struct in_addr_prefix *p;
+                                SET_FOREACH(p, new_prefixes)
+                                        fprintf(f, "%s=%s\n", name,
+                                                IN_ADDR_PREFIX_TO_STRING(p->family, &p->address, p->prefixlen));
+                        }
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+        }
+
+        if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) {
+                ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? &c->moom_swap : &c->moom_mem_pressure;
+                ManagedOOMMode m;
+                const char *mode;
+
+                if (!UNIT_VTABLE(u)->can_set_managed_oom)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+                r = sd_bus_message_read(message, "s", &mode);
+                if (r < 0)
+                        return r;
+
+                m = managed_oom_mode_from_string(mode);
+                if (m < 0)
+                        return -EINVAL;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        *cgroup_mode = m;
+                        unit_write_settingf(u, flags, name, "%s=%s", name, mode);
+                }
+
+                (void) manager_varlink_send_managed_oom_update(u);
+                return 1;
+        }
+
+        if (streq(name, "ManagedOOMMemoryPressureLimit")) {
+                uint32_t v;
+
+                if (!UNIT_VTABLE(u)->can_set_managed_oom)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+                r = sd_bus_message_read(message, "u", &v);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->moom_mem_pressure_limit = v;
+                        unit_write_settingf(u, flags, name,
+                                            "ManagedOOMMemoryPressureLimit=" PERMYRIAD_AS_PERCENT_FORMAT_STR,
+                                            PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(v)));
+                }
+
+                if (c->moom_mem_pressure == MANAGED_OOM_KILL)
+                        (void) manager_varlink_send_managed_oom_update(u);
+
+                return 1;
+        }
+
+        if (streq(name, "ManagedOOMPreference")) {
+                ManagedOOMPreference p;
+                const char *pref;
+
+                r = sd_bus_message_read(message, "s", &pref);
+                if (r < 0)
+                        return r;
+
+                p = managed_oom_preference_from_string(pref);
+                if (p < 0)
+                        return p;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->moom_preference = p;
+                        unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref);
+                }
+
+                return 1;
+        }
+        if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) {
+                CGroupSocketBindItem **list;
+                uint16_t nr_ports, port_min;
+                size_t n = 0;
+                int32_t family, ip_protocol;
+
+                list = streq(name, "SocketBindAllow") ? &c->socket_bind_allow : &c->socket_bind_deny;
+
+                r = sd_bus_message_enter_container(message, 'a', "(iiqq)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(iiqq)", &family, &ip_protocol, &nr_ports, &port_min)) > 0) {
+
+                        if (!IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects INET or INET6 family, if specified.", name);
+
+                        if (!IN_SET(ip_protocol, 0, IPPROTO_TCP, IPPROTO_UDP))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects TCP or UDP protocol, if specified.", name);
+
+                        if (port_min + (uint32_t) nr_ports > (1 << 16))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects maximum port value lesser than 65536.", name);
+
+                        if (port_min == 0 && nr_ports != 0)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects port range starting with positive value.", name);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ CGroupSocketBindItem *item = NULL;
+
+                                item = new(CGroupSocketBindItem, 1);
+                                if (!item)
+                                        return log_oom();
+
+                                *item = (CGroupSocketBindItem) {
+                                        .address_family = family,
+                                        .ip_protocol = ip_protocol,
+                                        .nr_ports = nr_ports,
+                                        .port_min = port_min
+                                };
+
+                                LIST_PREPEND(socket_bind_items, *list, TAKE_PTR(item));
+                        }
+                        n++;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_(memstream_done) MemStream m = {};
+                        _cleanup_free_ char *buf = NULL;
+                        FILE *f;
+
+                        if (n == 0)
+                                cgroup_context_remove_socket_bind(list);
+                        else {
+                                if ((u->manager->cgroup_supported & CGROUP_MASK_BPF_SOCKET_BIND) == 0)
+                                        log_full(LOG_DEBUG,
+                                                 "Unit %s configures source compiled BPF programs "
+                                                 "but the local system does not support that.\n"
+                                                 "Starting this unit will fail!", u->id);
+                        }
+
+                        f = memstream_init(&m);
+                        if (!f)
+                                return -ENOMEM;
+
+                        if (n == 0)
+                                fprintf(f, "%s=\n", name);
+                        else
+                                LIST_FOREACH(socket_bind_items, item, *list) {
+                                        fprintf(f, "%s=", name);
+                                        cgroup_context_dump_socket_bind_item(item, f);
+                                        fputc('\n', f);
+                                }
+
+                        r = memstream_finalize(&m, &buf, NULL);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_setting(u, flags, name, buf);
+                }
+
+                return 1;
+        }
+        if (streq(name, "RestrictNetworkInterfaces")) {
+                int is_allow_list;
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_enter_container(message, 'r', "bas");
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read(message, "b", &is_allow_list);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = NULL;
+
+                        if (strv_isempty(l)) {
+                                c->restrict_network_interfaces_is_allow_list = false;
+                                c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                                return 1;
+                        }
+
+                        if (set_isempty(c->restrict_network_interfaces))
+                                c->restrict_network_interfaces_is_allow_list = is_allow_list;
+
+                        STRV_FOREACH(s, l) {
+                                if (!ifname_valid(*s)) {
+                                        log_full(LOG_WARNING, "Invalid interface name, ignoring: %s", *s);
+                                        continue;
+                                }
+                                if (c->restrict_network_interfaces_is_allow_list != (bool) is_allow_list)
+                                        free(set_remove(c->restrict_network_interfaces, *s));
+                                else {
+                                        r = set_put_strdup(&c->restrict_network_interfaces, *s);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+                        }
+
+                        joined = strv_join(l, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        unit_write_settingf(u, flags, name, "%s=%s%s", name, is_allow_list ? "" : "~", joined);
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "NFTSet")) {
+                int source, nfproto;
+                const char *table, *set;
+                bool empty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "(iiss)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(iiss)", &source, &nfproto, &table, &set)) > 0) {
+                        const char *source_name, *nfproto_name;
+
+                        if (!IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid source %d.", source);
+
+                        source_name = nft_set_source_to_string(source);
+                        assert(source_name);
+
+                        nfproto_name = nfproto_to_string(nfproto);
+                        if (!nfproto_name)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid protocol %d.", nfproto);
+
+                        if (!nft_identifier_valid(table)) {
+                                _cleanup_free_ char *esc = NULL;
+
+                                esc = cescape(table);
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NFT table name %s.", strna(esc));
+                        }
+
+                        if (!nft_identifier_valid(set)) {
+                                _cleanup_free_ char *esc = NULL;
+
+                                esc = cescape(set);
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NFT set name %s.", strna(esc));
+                        }
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = nft_set_add(&c->nft_set_context, source, nfproto, table, set);
+                                if (r < 0)
+                                        return r;
+
+                                unit_write_settingf(
+                                                u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+                                                "%s=%s:%s:%s:%s",
+                                                name,
+                                                source_name,
+                                                nfproto_name,
+                                                table,
+                                                set);
+                        }
+
+                        empty = false;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (empty && !UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        nft_set_context_clear(&c->nft_set_context);
+                        unit_write_settingf(u, flags, name, "%s=", name);
+                }
+
+                return 1;
+        }
+
+        /* must be last */
+        if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB))
+                return bus_cgroup_set_transient_property(u, c, name, message, flags, error);
+
+        return 0;
+}
diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h
new file mode 100644
index 0000000..dd0d5da
--- /dev/null
+++ b/src/core/dbus-cgroup.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+#include "cgroup.h"
+
+extern const sd_bus_vtable bus_cgroup_vtable[];
+/* Property getters; their parameters follow the sd-bus property getter callback signature. */
+int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_cgroup_pressure_watch(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+/* NOTE(review): presumably applies the property 'name' carried in 'message' to 'c' on behalf of unit 'u' -- confirm in the implementation. */
+int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-device.c b/src/core/dbus-device.c
new file mode 100644
index 0000000..b5e18d8
--- /dev/null
+++ b/src/core/dbus-device.c
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-device.h"
+#include "device.h"
+#include "unit.h"
+
+const sd_bus_vtable bus_device_vtable[] = { /* D-Bus vtable describing the properties exported from a Device object */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("SysFSPath", "s", NULL, offsetof(Device, sysfs), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), /* string property backed by Device.sysfs; no custom getter (NULL) */
+        SD_BUS_VTABLE_END
+};
diff --git a/src/core/dbus-device.h b/src/core/dbus-device.h
new file mode 100644
index 0000000..bfb5770
--- /dev/null
+++ b/src/core/dbus-device.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+extern const sd_bus_vtable bus_device_vtable[]; /* defined in dbus-device.c */
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
new file mode 100644
index 0000000..2d05ba7
--- /dev/null
+++ b/src/core/dbus-execute.c
@@ -0,0 +1,3758 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "dbus-execute.h"
+#include "dbus-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "iovec-util.h"
+#include "ioprio-util.h"
+#include "journal-file.h"
+#include "load-fragment.h"
+#include "memstream-util.h"
+#include "missing_ioprio.h"
+#include "mountpoint-util.h"
+#include "namespace.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pcre2-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "utf8.h"
+
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutput); /* no 'static' here, unlike most getters below: presumably shared with other files */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInput);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode); /* no 'static' here either */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
+static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio);
+static BUS_DEFINE_PROPERTY_GET(property_get_mount_apivfs, "b", ExecContext, exec_context_get_effective_mount_apivfs);
+static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_class, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_class); /* NOTE(review): presumably ioprio_prio_class() is applied to the effective ioprio -- confirm against the macro */
+static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_data); /* NOTE(review): same pattern with ioprio_prio_data() -- confirm */
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); /* NOTE(review): judging by the name, always yields an empty string -- confirm */
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); /* NOTE(review): presumably LOG_PRI() of the referenced int -- confirm */
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); /* NOTE(review): presumably LOG_FAC() of the referenced int -- confirm */
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
+static BUS_DEFINE_PROPERTY_GET(property_get_oom_score_adjust, "i", ExecContext, exec_context_get_oom_score_adjust);
+static BUS_DEFINE_PROPERTY_GET(property_get_nice, "i", ExecContext, exec_context_get_nice);
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_policy, "i", ExecContext, exec_context_get_cpu_sched_policy);
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_priority, "i", ExecContext, exec_context_get_cpu_sched_priority);
+static BUS_DEFINE_PROPERTY_GET(property_get_coredump_filter, "t", ExecContext, exec_context_get_coredump_filter);
+static BUS_DEFINE_PROPERTY_GET(property_get_timer_slack_nsec, "t", ExecContext, exec_context_get_timer_slack_nsec);
+
+static int property_get_environment_files(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(sb): each environment file without its leading "-", plus whether that prefix was present. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(sb)");
+        if (ret < 0)
+                return ret;
+
+        STRV_FOREACH(entry, ctx->environment_files) {
+                bool dashed = (*entry)[0] == '-';
+                const char *file = dashed ? *entry + 1 : *entry;
+                ret = sd_bus_message_append(reply, "(sb)", file, dashed);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_cpu_affinity(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the CPU affinity mask as a byte array; returns a negative value if any step fails. */
+        ExecContext *c = ASSERT_PTR(userdata);
+        _cleanup_(cpu_set_reset) CPUSet s = {};
+        _cleanup_free_ uint8_t *array = NULL;
+        size_t allocated = 0;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        if (c->cpu_affinity_from_numa) {
+                /* The mask is computed from the NUMA policy instead of being taken from c->cpu_set. */
+                r = numa_to_cpu_set(&c->numa_policy, &s);
+                if (r < 0)
+                        return r;
+        }
+
+        /* A failed conversion is reported rather than ignored, so 'allocated' is never used unset. */
+        r = cpu_set_to_dbus(c->cpu_affinity_from_numa ? &s : &c->cpu_set, &array, &allocated);
+        return r < 0 ? r : sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_numa_mask(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the NUMA node mask as a byte array; a failed conversion is returned, not ignored. */
+        ExecContext *c = ASSERT_PTR(userdata);
+        _cleanup_free_ uint8_t *array = NULL;
+        size_t allocated = 0;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated);
+        return r < 0 ? r : sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_numa_policy(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the type of the context's NUMA policy as a signed 32-bit integer ('i'). */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        int32_t numa_type;
+
+        assert(bus);
+        assert(reply);
+
+        numa_type = numa_policy_get_type(&ctx->numa_policy);
+        return sd_bus_message_append_basic(reply, 'i', &numa_type);
+}
+
+static int property_get_syscall_filter(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bas): the allow-list flag, then the syscall filter entries. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        _cleanup_strv_free_ char **names = NULL;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        /* Open the struct and write its boolean member; either failure ends the call. */
+        ret = sd_bus_message_open_container(reply, 'r', "bas");
+        if (ret >= 0)
+                ret = sd_bus_message_append(reply, "b", ctx->syscall_allow_list);
+        if (ret < 0)
+                return ret;
+
+        /* A NULL list from the helper is reported as -ENOMEM. */
+        names = exec_context_get_syscall_filter(ctx);
+        if (!names)
+                return -ENOMEM;
+
+        ret = sd_bus_message_append_strv(reply, names);
+        if (ret < 0)
+                return ret;
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_syscall_log(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bas): the syscall-log allow-list flag followed by the syscall-log entries. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        _cleanup_strv_free_ char **entries = NULL;
+        int rc;
+
+        assert(bus);
+        assert(reply);
+
+        rc = sd_bus_message_open_container(reply, 'r', "bas");
+        if (rc < 0)
+                return rc;
+
+        rc = sd_bus_message_append(reply, "b", ctx->syscall_log_allow_list);
+        if (rc < 0)
+                return rc;
+
+        entries = exec_context_get_syscall_log(ctx);
+        if (!entries)
+                return -ENOMEM;
+
+        rc = sd_bus_message_append_strv(reply, entries);
+        if (rc < 0)
+                return rc;
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_syscall_archs(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the context's syscall architectures as a string array. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        _cleanup_strv_free_ char **archs = NULL;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        archs = exec_context_get_syscall_archs(ctx);
+        if (!archs)
+                return -ENOMEM;
+
+        /* Any non-negative result of the append is reported to the caller as 0. */
+        ret = sd_bus_message_append_strv(reply, archs);
+        if (ret < 0)
+                return ret;
+        return 0;
+}
+
+static int property_get_selinux_context(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bs): the "ignore" flag followed by the SELinux context string. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        return sd_bus_message_append(reply, "(bs)", ctx->selinux_context_ignore, ctx->selinux_context);
+}
+
+static int property_get_apparmor_profile(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bs): the "ignore" flag followed by the AppArmor profile string. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        return sd_bus_message_append(reply, "(bs)", ctx->apparmor_profile_ignore, ctx->apparmor_profile);
+}
+
+static int property_get_smack_process_label(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bs): the "ignore" flag followed by the SMACK process label string. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        return sd_bus_message_append(reply, "(bs)", ctx->smack_process_label_ignore, ctx->smack_process_label);
+}
+
+static int property_get_address_families(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bas): the allow-list flag, then the address family entries. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        _cleanup_strv_free_ char **families = NULL;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        /* The boolean is only written once the struct container was opened successfully. */
+        ret = sd_bus_message_open_container(reply, 'r', "bas");
+        if (ret >= 0)
+                ret = sd_bus_message_append(reply, "b", ctx->address_families_allow_list);
+        if (ret < 0)
+                return ret;
+
+        /* A NULL list from the helper is reported as -ENOMEM. */
+        families = exec_context_get_address_families(ctx);
+        if (!families)
+                return -ENOMEM;
+
+        ret = sd_bus_message_append_strv(reply, families);
+        if (ret < 0)
+                return ret;
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_working_directory(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the working directory as a string, built from the context's
+         * working_directory, working_directory_home and working_directory_missing_ok fields. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        const char *dir;
+
+        assert(bus);
+        assert(reply);
+
+        /* "~" is sent instead of the stored path when the home directory is selected. */
+        dir = ctx->working_directory_home ? "~" : ctx->working_directory;
+
+        /* A leading "!" is added when the directory is allowed to be missing. */
+        if (ctx->working_directory_missing_ok)
+                dir = strjoina("!", dir);
+
+        return sd_bus_message_append(reply, "s", dir);
+}
+
+static int property_get_stdio_fdname(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the fd name for stdin, stdout or stderr; the property name selects which one. */
+        ExecContext *c = ASSERT_PTR(userdata);
+        int fd; /* deliberately not called 'fileno', which would shadow fileno(3) from <stdio.h> */
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        if (streq(property, "StandardInputFileDescriptorName"))
+                fd = STDIN_FILENO;
+        else if (streq(property, "StandardOutputFileDescriptorName"))
+                fd = STDOUT_FILENO;
+        else {
+                assert(streq(property, "StandardErrorFileDescriptorName"));
+                fd = STDERR_FILENO;
+        }
+
+        return sd_bus_message_append(reply, "s", exec_context_fdname(c, fd));
+}
+
+static int property_get_input_data(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the context's stdin_data buffer (stdin_data_size bytes) as a byte array. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        return sd_bus_message_append_array(reply, 'y', ctx->stdin_data, ctx->stdin_data_size);
+}
+
+static int property_get_restrict_filesystems(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with (bas): the allow-list flag, then the restricted file system entries. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        _cleanup_free_ char **fs_names = NULL;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        /* The boolean is only written once the struct container was opened successfully. */
+        ret = sd_bus_message_open_container(reply, 'r', "bas");
+        if (ret >= 0)
+                ret = sd_bus_message_append(reply, "b", ctx->restrict_filesystems_allow_list);
+        if (ret < 0)
+                return ret;
+
+        /* NOTE(review): only the array is freed here (not its strings) -- confirm the helper returns borrowed strings. */
+        fs_names = exec_context_get_restrict_filesystems(ctx);
+        if (!fs_names)
+                return -ENOMEM;
+
+        ret = sd_bus_message_append_strv(reply, fs_names);
+        if (ret < 0)
+                return ret;
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_bind_paths(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(ssbt) for the bind mounts whose read-only flag matches the property name ("ReadOnly" in it or not). */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        bool want_ro;
+        int ret;
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        want_ro = strstr(property, "ReadOnly") != NULL;
+
+        ret = sd_bus_message_open_container(reply, 'a', "(ssbt)");
+        if (ret < 0)
+                return ret;
+
+        for (size_t idx = 0; idx < ctx->n_bind_mounts; idx++) {
+                /* Entries belonging to the other (read-only vs. writable) property are skipped. */
+                if (ctx->bind_mounts[idx].read_only != want_ro)
+                        continue;
+
+                ret = sd_bus_message_append(
+                                reply, "(ssbt)",
+                                ctx->bind_mounts[idx].source,
+                                ctx->bind_mounts[idx].destination,
+                                ctx->bind_mounts[idx].ignore_enoent,
+                                ctx->bind_mounts[idx].recursive ? (uint64_t) MS_REC : UINT64_C(0));
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_temporary_filesystems(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(ss): the path and the options string of every temporary file system entry. */
+        ExecContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        /* size_t index, matching the loops over the other ExecContext arrays in this file. */
+        for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
+                const TemporaryFileSystem *t = c->temporary_filesystems + i;
+
+                r = sd_bus_message_append(
+                                reply, "(ss)",
+                                t->path, t->options);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_log_extra_fields(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with aay: one byte array (iov_base, iov_len) per extra log field of the context. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        int ret;
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "ay");
+        if (ret < 0)
+                return ret;
+
+        for (size_t idx = 0; idx < ctx->n_log_extra_fields; idx++) {
+                ret = sd_bus_message_append_array(reply, 'y', ctx->log_extra_fields[idx].iov_base, ctx->log_extra_fields[idx].iov_len);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int sd_bus_message_append_log_filter_patterns(sd_bus_message *reply, Set *patterns, bool is_allowlist) {
+        const char *entry;
+        int ret;
+
+        assert(reply);
+        /* Append one (bs) item per pattern in the set, each tagged with 'is_allowlist'; 0 on success. */
+        SET_FOREACH(entry, patterns) {
+                ret = sd_bus_message_append(reply, "(bs)", is_allowlist, entry);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return 0;
+}
+
+static int property_get_log_filter_patterns(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(bs): every log filter pattern, tagged true for allowed and false for denied ones. */
+        ExecContext *c = ASSERT_PTR(userdata);
+        int r;
+
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(bs)");
+        if (r < 0)
+                return r;
+
+        /* The allowed patterns are appended first, then the denied ones. */
+        r = sd_bus_message_append_log_filter_patterns(reply, c->log_filter_allowed_patterns,
+                                                      /* is_allowlist = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_log_filter_patterns(reply, c->log_filter_denied_patterns,
+                                                      /* is_allowlist = */ false);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_set_credential(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(say): id and data of each set credential whose 'encrypted' flag matches the property asked for. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        ExecSetCredential *cred;
+        bool want_encrypted;
+        int ret;
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        want_encrypted = streq(property, "SetCredentialEncrypted");
+
+        ret = sd_bus_message_open_container(reply, 'a', "(say)");
+        if (ret < 0)
+                return ret;
+
+        HASHMAP_FOREACH(cred, ctx->set_credentials) {
+                /* Credentials belonging to the other (plain vs. encrypted) property are skipped. */
+                if (cred->encrypted != want_encrypted)
+                        continue;
+
+                /* Each credential is written as one struct: its id, then its data bytes. */
+                ret = sd_bus_message_open_container(reply, 'r', "say");
+                if (ret < 0)
+                        return ret;
+
+                ret = sd_bus_message_append(reply, "s", cred->id);
+                if (ret >= 0)
+                        ret = sd_bus_message_append_array(reply, 'y', cred->data, cred->size);
+                if (ret >= 0)
+                        ret = sd_bus_message_close_container(reply);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_load_credential(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(ss): id and path of each load credential whose 'encrypted' flag matches the property asked for. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        ExecLoadCredential *cred;
+        int ret;
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (ret < 0)
+                return ret;
+
+        HASHMAP_FOREACH(cred, ctx->load_credentials) {
+                /* Credentials belonging to the other (plain vs. encrypted) property are skipped. */
+                if (cred->encrypted != streq(property, "LoadCredentialEncrypted"))
+                        continue;
+
+                ret = sd_bus_message_append(reply, "(ss)", cred->id, cred->path);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_root_hash(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the context's root_hash buffer (root_hash_size bytes) as a byte array. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        return sd_bus_message_append_array(reply, 'y', ctx->root_hash, ctx->root_hash_size);
+}
+
+static int property_get_root_hash_sig(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with the context's root_hash_sig buffer (root_hash_sig_size bytes) as a byte array. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        return sd_bus_message_append_array(reply, 'y', ctx->root_hash_sig, ctx->root_hash_sig_size);
+}
+
+static int property_get_root_image_options(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* Replies with a(ss): the partition designator name and the options string of each root image option. */
+        ExecContext *ctx = ASSERT_PTR(userdata);
+        int ret;
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (ret < 0)
+                return ret;
+
+        LIST_FOREACH(mount_options, opt, ctx->root_image_options) {
+                const char *designator = partition_designator_to_string(opt->partition_designator);
+
+                ret = sd_bus_message_append(reply, "(ss)", designator, opt->options);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for "MountImages": emits one (source, destination, ignore_enoent, options) struct per image. */
+static int property_get_mount_images(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ExecContext *context = ASSERT_PTR(userdata);
+        int ret;
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(ssba(ss))");
+        if (ret < 0)
+                return ret;
+
+        for (size_t idx = 0; idx < context->n_mount_images; idx++) {
+                /* Outer struct first, then its three scalar members, then the nested option array. */
+                ret = sd_bus_message_open_container(reply, 'r', "ssba(ss)");
+                if (ret < 0)
+                        return ret;
+                ret = sd_bus_message_append(reply, "ssb",
+                                            context->mount_images[idx].source,
+                                            context->mount_images[idx].destination,
+                                            context->mount_images[idx].ignore_enoent);
+                if (ret < 0)
+                        return ret;
+                ret = sd_bus_message_open_container(reply, 'a', "(ss)");
+                if (ret < 0)
+                        return ret;
+                LIST_FOREACH(mount_options, opt, context->mount_images[idx].mount_options) {
+                        const char *designator = partition_designator_to_string(opt->partition_designator);
+                        ret = sd_bus_message_append(reply, "(ss)", designator, opt->options);
+                        if (ret < 0)
+                                return ret;
+                }
+                /* Close the option array, then the enclosing struct. */
+                ret = sd_bus_message_close_container(reply);
+                if (ret < 0)
+                        return ret;
+                ret = sd_bus_message_close_container(reply);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for "ExtensionImages": emits one (source, ignore_enoent, options) struct per extension image. */
+static int property_get_extension_images(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ExecContext *context = ASSERT_PTR(userdata);
+        int ret;
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(sba(ss))");
+        if (ret < 0)
+                return ret;
+
+        for (size_t idx = 0; idx < context->n_extension_images; idx++) {
+                /* Outer struct first, then its two scalar members, then the nested option array. */
+                ret = sd_bus_message_open_container(reply, 'r', "sba(ss)");
+                if (ret < 0)
+                        return ret;
+                ret = sd_bus_message_append(reply, "sb",
+                                            context->extension_images[idx].source,
+                                            context->extension_images[idx].ignore_enoent);
+                if (ret < 0)
+                        return ret;
+                ret = sd_bus_message_open_container(reply, 'a', "(ss)");
+                if (ret < 0)
+                        return ret;
+                LIST_FOREACH(mount_options, opt, context->extension_images[idx].mount_options) {
+                        const char *designator = partition_designator_to_string(opt->partition_designator);
+                        ret = sd_bus_message_append(reply, "(ss)", designator, opt->options);
+                        if (ret < 0)
+                                return ret;
+                }
+                /* Close the option array, then the enclosing struct. */
+                ret = sd_bus_message_close_container(reply);
+                if (ret < 0)
+                        return ret;
+                ret = sd_bus_message_close_container(reply);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for an ExecDirectory: emits the path of every item as an array of strings ("as"). */
+static int bus_property_get_exec_dir(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ExecDirectory *dir = ASSERT_PTR(userdata);
+        int ret;
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "s");
+        if (ret < 0)
+                return ret;
+
+        for (size_t idx = 0; idx < dir->n_items; idx++) {
+                ret = sd_bus_message_append_basic(reply, 's', dir->items[idx].path);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for an ExecDirectory: emits one (item path, symlink, flags) triple per symlink of each item. */
+static int bus_property_get_exec_dir_symlink(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ExecDirectory *dir = ASSERT_PTR(userdata);
+        int ret;
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(sst)");
+        if (ret < 0)
+                return ret;
+
+        for (size_t idx = 0; idx < dir->n_items; idx++)
+                STRV_FOREACH(link, dir->items[idx].symlinks) {
+                        ret = sd_bus_message_append(reply, "(sst)", dir->items[idx].path, *link, UINT64_C(0) /* flags, unused for now */);
+                        if (ret < 0)
+                                return ret;
+                }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter rendering an image policy as a string; image_policy_service is used when no policy is set. */
+static int property_get_image_policy(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ImagePolicy **policy_ref = ASSERT_PTR(userdata);
+        _cleanup_free_ char *text = NULL;
+        int ret;
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        ret = image_policy_to_string(*policy_ref ? *policy_ref : &image_policy_service, /* simplify= */ true, &text);
+        if (ret < 0)
+                return ret;
+
+        return sd_bus_message_append(reply, "s", text);
+}
+
+const sd_bus_vtable bus_exec_vtable[] = { /* D-Bus property table for ExecContext; every entry carries SD_BUS_VTABLE_PROPERTY_CONST */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("EnvironmentFiles", "a(sb)", property_get_environment_files, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PassEnvironment", "as", NULL, offsetof(ExecContext, pass_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UnsetEnvironment", "as", NULL, offsetof(ExecContext, unset_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UMask", "u", bus_property_get_mode, offsetof(ExecContext, umask), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitCPU", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST), /* each Limit*/Limit*Soft pair shares getter and offset; presumably told apart by property name, confirm in bus_property_get_rlimit */
+        SD_BUS_PROPERTY("LimitCPUSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitFSIZE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitDATA", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitDATASoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitSTACK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitCORE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitCORESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitRSS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitRSSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitNOFILE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitAS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitASSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitNPROC", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitLOCKS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitNICE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitNICESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitRTPRIO", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitRTTIME", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootImageOptions", "a(ss)", property_get_root_image_options, 0, SD_BUS_VTABLE_PROPERTY_CONST), /* offset 0: this getter casts userdata straight to ExecContext */
+        SD_BUS_PROPERTY("RootHash", "ay", property_get_root_hash, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootHashPath", "s", NULL, offsetof(ExecContext, root_hash_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CoredumpFilter", "t", property_get_coredump_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IOSchedulingClass", "i", property_get_ioprio_class, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IOSchedulingPriority", "i", property_get_ioprio_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardInputData", "ay", property_get_input_data, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TTYVTDisallocate", "b", bus_property_get_bool, offsetof(ExecContext, tty_vt_disallocate), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TTYRows", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_rows), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TTYColumns", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_cols), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SyslogPriority", "i", bus_property_get_int, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SyslogIdentifier", "s", NULL, offsetof(ExecContext, syslog_identifier), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SyslogLevelPrefix", "b", bus_property_get_bool, offsetof(ExecContext, syslog_level_prefix), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SyslogLevel", "i", property_get_syslog_level, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SyslogFacility", "i", property_get_syslog_facility, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogLevelMax", "i", bus_property_get_int, offsetof(ExecContext, log_level_max), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogRateLimitIntervalUSec", "t", bus_property_get_usec, offsetof(ExecContext, log_ratelimit_interval_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogRateLimitBurst", "u", bus_property_get_unsigned, offsetof(ExecContext, log_ratelimit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogExtraFields", "aay", property_get_log_extra_fields, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogFilterPatterns", "a(bs)", property_get_log_filter_patterns, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogNamespace", "s", NULL, offsetof(ExecContext, log_namespace), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SecureBits", "i", bus_property_get_int, offsetof(ExecContext, secure_bits), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CapabilityBoundingSet", "t", NULL, offsetof(ExecContext, capability_bounding_set), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("AmbientCapabilities", "t", NULL, offsetof(ExecContext, capability_ambient_set), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SetLoginEnvironment", "b", bus_property_get_tristate, offsetof(ExecContext, set_login_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SetCredentialEncrypted", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), /* same getter as SetCredential */
+        SD_BUS_PROPERTY("LoadCredential", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LoadCredentialEncrypted", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), /* same getter as LoadCredential; it filters entries by comparing the property name */
+        SD_BUS_PROPERTY("ImportCredential", "as", bus_property_get_string_set, offsetof(ExecContext, import_credentials), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PAMName", "s", NULL, offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReadWritePaths", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReadOnlyPaths", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExecPaths", "as", NULL, offsetof(ExecContext, exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NoExecPaths", "as", NULL, offsetof(ExecContext, no_exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExecSearchPath", "as", NULL, offsetof(ExecContext, exec_search_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_propagation_flag), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UtmpMode", "s", property_get_exec_utmp_mode, offsetof(ExecContext, utmp_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("AppArmorProfile", "(bs)", property_get_apparmor_profile, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SmackProcessLabel", "(bs)", property_get_smack_process_label, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IgnoreSIGPIPE", "b", bus_property_get_bool, offsetof(ExecContext, ignore_sigpipe), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NoNewPrivileges", "b", bus_property_get_bool, offsetof(ExecContext, no_new_privileges), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SystemCallFilter", "(bas)", property_get_syscall_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SystemCallErrorNumber", "i", bus_property_get_int, offsetof(ExecContext, syscall_errno), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SystemCallLog", "(bas)", property_get_syscall_log, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Personality", "s", property_get_personality, offsetof(ExecContext, personality), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", bus_property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StateDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StateDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StateDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CacheDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CacheDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CacheDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogsDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogsDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LogsDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ConfigurationDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ConfigurationDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimeoutCleanUSec", "t", bus_property_get_usec, offsetof(ExecContext, timeout_clean_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MountAPIVFS", "b", property_get_mount_apivfs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RootImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, root_image_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MountImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, mount_image_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExtensionImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, extension_image_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+
+        /* Obsolete/redundant properties (each additionally flagged SD_BUS_VTABLE_HIDDEN): */
+        SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* same field as ReadWritePaths */
+        SD_BUS_PROPERTY("ReadOnlyDirectories", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* same field as ReadOnlyPaths */
+        SD_BUS_PROPERTY("InaccessibleDirectories", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* same field as InaccessiblePaths */
+        SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+
+        SD_BUS_VTABLE_END
+};
+
+/* Appends one ExecCommand to reply as a struct with signature "sasbttttuii": path, argv, ignore-failure flag,
+ * start and exit timestamps (realtime and monotonic each), PID, code and status. Returns 0 if c has no path. */
+static int append_exec_command(sd_bus_message *reply, ExecCommand *c) {
+        int ret;
+
+        assert(reply);
+        assert(c);
+
+        if (!c->path)
+                return 0;
+
+        ret = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "sasbttttuii");
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_append(reply, "s", c->path);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_append_strv(reply, c->argv);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_append(reply, "bttttuii",
+                                    !!(c->flags & EXEC_COMMAND_IGNORE_FAILURE),
+                                    c->exec_status.start_timestamp.realtime, c->exec_status.start_timestamp.monotonic,
+                                    c->exec_status.exit_timestamp.realtime, c->exec_status.exit_timestamp.monotonic,
+                                    (uint32_t) c->exec_status.pid,
+                                    (int32_t) c->exec_status.code,
+                                    (int32_t) c->exec_status.status);
+        if (ret < 0)
+                return ret;
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Like append_exec_command(), but emits the "(sasasttttuii)" layout: the command flags are sent as a string
+ * list (from exec_command_flags_to_strv()) in place of the boolean. Pathless commands are skipped with 0. */
+static int append_exec_ex_command(sd_bus_message *reply, ExecCommand *c) {
+        _cleanup_strv_free_ char **flag_names = NULL;
+        int r;
+
+        assert(reply);
+        assert(c);
+
+        if (!c->path)
+                return 0;
+
+        r = sd_bus_message_open_container(reply, 'r', "sasasttttuii");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(reply, "s", c->path);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_strv(reply, c->argv);
+        if (r < 0)
+                return r;
+
+        /* The string list is owned by this function and released automatically once it has been appended. */
+        r = exec_command_flags_to_strv(c->flags, &flag_names);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_strv(reply, flag_names);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(reply, "ttttuii",
+                                  c->exec_status.start_timestamp.realtime, c->exec_status.start_timestamp.monotonic,
+                                  c->exec_status.exit_timestamp.realtime, c->exec_status.exit_timestamp.monotonic,
+                                  (uint32_t) c->exec_status.pid,
+                                  (int32_t) c->exec_status.code, (int32_t) c->exec_status.status);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for a single ExecCommand passed directly as 'userdata': emits an array holding its
+ * "(sasbttttuii)" struct, or an empty array if the command has no path (append_exec_command() skips it). */
+int bus_property_get_exec_command(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *ret_error) {
+
+        ExecCommand *cmd = userdata;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)");
+        if (r >= 0)
+                r = append_exec_command(reply, cmd);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for an ExecCommand list: emits each entry as one "(sasbttttuii)" struct of a single array. */
+int bus_property_get_exec_command_list(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *ret_error) {
+
+        ExecCommand *list = *(ExecCommand**) userdata; /* userdata is the address of the list head pointer */
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(command, cmd, list) {
+                r = append_exec_command(reply, cmd);
+                if (r < 0)
+                        return r;
+        }
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter for an ExecCommand list in the "Ex" layout: one "(sasasttttuii)" struct per list entry. */
+int bus_property_get_exec_ex_command_list(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *ret_error) {
+
+        ExecCommand *list = *(ExecCommand**) userdata; /* userdata is the address of the list head pointer */
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(sasasttttuii)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(command, cmd, list) {
+                r = append_exec_ex_command(reply, cmd);
+                if (r < 0)
+                        return r;
+        }
+        return sd_bus_message_close_container(reply);
+}
+
+/* Joins the prefix characters for 'flags' ("-", ":", "+", "!", "!!", in that order) into a new string. */
+static char *exec_command_flags_to_exec_chars(ExecCommandFlags flags) {
+        bool dash = FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE), colon = FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND),
+             plus = FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED), bang = FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID),
+             bangbang = FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC);
+        return strjoin(dash ? "-" : "", colon ? ":" : "", plus ? "+" : "", bang ? "!" : "", bangbang ? "!!" : "");
+}
+
+/* Applies a transient Exec*= command list property, read from 'message', to unit 'u'.
+ *
+ * 'message' carries an array of "(sasb)" entries (path, argv, ignore-failure), or of "(sasas)" entries (path,
+ * argv, flag names) if 'name' ends in "Ex". Unless 'flags' is a no-op, each entry is appended to *exec_command
+ * (an empty array empties the list) and the list is written as setting 'name' minus "Ex". Returns 1 or < 0. */
+int bus_set_transient_exec_command(
+                Unit *u,
+                const char *name,
+                ExecCommand **exec_command,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+        bool is_ex_prop = endswith(name, "Ex");
+        unsigned n = 0;
+        int r;
+
+        /* Drop Ex from the written setting. E.g. ExecStart=, not ExecStartEx=. */
+        const char *written_name = is_ex_prop ? strndupa(name, strlen(name) - 2) : name;
+
+        r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)");
+        if (r < 0)
+                return r;
+
+        while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) {
+                _cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL;
+                ExecCommandFlags command_flags;
+                const char *path;
+                int b;
+
+                r = sd_bus_message_read(message, "s", &path);
+                if (r < 0)
+                        return r;
+
+                if (!path_is_absolute(path) && !filename_is_valid(path))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "\"%s\" is neither a valid executable name nor an absolute path", path);
+
+                r = sd_bus_message_read_strv(message, &argv);
+                if (r < 0)
+                        return r;
+
+                if (strv_isempty(argv))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "\"%s\" argv cannot be empty", name);
+
+                r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+
+                /* Convert the flags even when nothing is applied, and before allocating: bad input then fails early. */
+                if (is_ex_prop) {
+                        r = exec_command_flags_from_strv(ex_opts, &command_flags);
+                        if (r < 0)
+                                return r;
+                } else
+                        command_flags = b ? EXEC_COMMAND_IGNORE_FAILURE : 0;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ ExecCommand *c = new0(ExecCommand, 1);
+                        if (!c)
+                                return -ENOMEM;
+
+                        c->path = strdup(path);
+                        if (!c->path)
+                                return -ENOMEM;
+
+                        c->argv = TAKE_PTR(argv);
+                        c->flags = command_flags;
+
+                        path_simplify(c->path);
+                        exec_command_append_list(exec_command, TAKE_PTR(c));
+                }
+
+                n++;
+        }
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                _cleanup_(memstream_done) MemStream m = {};
+                _cleanup_free_ char *buf = NULL;
+                FILE *f;
+
+                if (n == 0)
+                        *exec_command = exec_command_free_list(*exec_command);
+
+                f = memstream_init(&m);
+                if (!f)
+                        return -ENOMEM;
+
+                fprintf(f, "%s=\n", written_name);
+
+                LIST_FOREACH(command, c, *exec_command) {
+                        _cleanup_free_ char *a = NULL, *exec_chars = NULL;
+                        UnitWriteFlags esc_flags = UNIT_ESCAPE_SPECIFIERS |
+                                (FLAGS_SET(c->flags, EXEC_COMMAND_NO_ENV_EXPAND) ? UNIT_ESCAPE_EXEC_SYNTAX : UNIT_ESCAPE_EXEC_SYNTAX_ENV);
+
+                        exec_chars = exec_command_flags_to_exec_chars(c->flags);
+                        if (!exec_chars)
+                                return -ENOMEM;
+
+                        a = unit_concat_strv(c->argv, esc_flags);
+                        if (!a)
+                                return -ENOMEM;
+
+                        if (streq_ptr(c->path, c->argv ? c->argv[0] : NULL))
+                                fprintf(f, "%s=%s%s\n", written_name, exec_chars, a);
+                        else {
+                                _cleanup_free_ char *t = NULL;
+                                const char *p = unit_escape_setting(c->path, esc_flags, &t);
+                                if (!p)
+                                        return -ENOMEM;
+
+                                fprintf(f, "%s=%s@%s %s\n", written_name, exec_chars, p, a);
+                        }
+                }
+
+                r = memstream_finalize(&m, &buf, NULL);
+                if (r < 0)
+                        return r;
+
+                unit_write_setting(u, flags, written_name, buf);
+        }
+
+        return 1;
+}
+
+/* Wraps personality_from_string() in an int-returning parser: on success stores the value in *p and returns
+ * 0; if the result is PERSONALITY_INVALID it returns -EINVAL and leaves *p untouched. */
+static int parse_personality(const char *s, unsigned long *p) {
+        unsigned long parsed;
+
+        assert(p);
+
+        parsed = personality_from_string(s);
+        if (parsed != PERSONALITY_INVALID)
+                *p = parsed;
+        return parsed == PERSONALITY_INVALID ? -EINVAL : 0;
+}
+
+/* Like mount_propagation_flag_to_string(), but yields NULL for values mount_propagation_flag_is_valid() rejects. */
+static const char* mount_propagation_flag_to_string_with_check(unsigned long n) {
+        bool valid = mount_propagation_flag_is_valid(n);
+
+        return valid ? mount_propagation_flag_to_string(n) : NULL;
+}
+
+static BUS_DEFINE_SET_TRANSIENT(nsec, "t", uint64_t, nsec_t, NSEC_FMT); /* each of these presumably defines bus_set_transient_<first argument>() */
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(log_level, "i", int32_t, int, "%" PRIi32, log_level_is_valid);
+#if HAVE_SECCOMP
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(errno, "i", int32_t, int, "%" PRIi32, seccomp_errno_or_action_is_valid);
+#endif /* HAVE_SECCOMP */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(std_input, ExecInput, exec_input_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(std_output, ExecOutput, exec_output_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
+BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); /* NOTE(review): the only one without 'static'; presumably used outside this file, confirm */
+static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(capability, "t", uint64_t, uint64_t, "%" PRIu64, capability_set_to_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(namespace_flag, "t", uint64_t, unsigned long, "%" PRIu64, namespace_flags_to_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(mount_propagation_flag, "t", uint64_t, unsigned long, "%" PRIu64, mount_propagation_flag_to_string_with_check);
+
+int bus_exec_context_set_transient_property(
+                Unit *u,
+                ExecContext *c,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        const char *suffix;
+        int r;
+
+        assert(u);
+        assert(c);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "User"))
+                return bus_set_transient_user_relaxed(u, name, &c->user, message, flags, error);
+
+        if (streq(name, "Group"))
+                return bus_set_transient_user_relaxed(u, name, &c->group, message, flags, error);
+
+        if (streq(name, "SetLoginEnvironment"))
+                return bus_set_transient_tristate(u, name, &c->set_login_environment, message, flags, error);
+
+        if (streq(name, "TTYPath"))
+                return bus_set_transient_path(u, name, &c->tty_path, message, flags, error);
+
+        if (streq(name, "RootImage"))
+                return bus_set_transient_path(u, name, &c->root_image, message, flags, error);
+
+        if (streq(name, "RootImageOptions")) {
+                _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                _cleanup_free_ char *format_str = NULL;
+
+                r = bus_read_mount_options(message, error, &options, &format_str, " ");
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (options) {
+                                LIST_JOIN(mount_options, c->root_image_options, options);
+                                unit_write_settingf(
+                                                u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+                                                "%s=%s",
+                                                name,
+                                                format_str);
+                        } else {
+                                c->root_image_options = mount_options_free_all(c->root_image_options);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        }
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "RootHash")) {
+                const void *roothash_decoded;
+                size_t roothash_decoded_size;
+
+                r = sd_bus_message_read_array(message, 'y', &roothash_decoded, &roothash_decoded_size);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *encoded = NULL;
+
+                        if (roothash_decoded_size == 0) {
+                                c->root_hash_path = mfree(c->root_hash_path);
+                                c->root_hash = mfree(c->root_hash);
+                                c->root_hash_size = 0;
+
+                                unit_write_settingf(u, flags, name, "RootHash=");
+                        } else {
+                                _cleanup_free_ void *p = NULL;
+
+                                encoded = hexmem(roothash_decoded, roothash_decoded_size);
+                                if (!encoded)
+                                        return -ENOMEM;
+
+                                p = memdup(roothash_decoded, roothash_decoded_size);
+                                if (!p)
+                                        return -ENOMEM;
+
+                                free_and_replace(c->root_hash, p);
+                                c->root_hash_size = roothash_decoded_size;
+                                c->root_hash_path = mfree(c->root_hash_path);
+
+                                unit_write_settingf(u, flags, name, "RootHash=%s", encoded);
+                        }
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "RootHashPath")) {
+                c->root_hash_size = 0;
+                c->root_hash = mfree(c->root_hash);
+
+                return bus_set_transient_path(u, "RootHash", &c->root_hash_path, message, flags, error);
+        }
+
+        if (streq(name, "RootHashSignature")) {
+                const void *roothash_sig_decoded;
+                size_t roothash_sig_decoded_size;
+
+                r = sd_bus_message_read_array(message, 'y', &roothash_sig_decoded, &roothash_sig_decoded_size);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *encoded = NULL;
+
+                        if (roothash_sig_decoded_size == 0) {
+                                c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+                                c->root_hash_sig = mfree(c->root_hash_sig);
+                                c->root_hash_sig_size = 0;
+
+                                unit_write_settingf(u, flags, name, "RootHashSignature=");
+                        } else {
+                                _cleanup_free_ void *p = NULL;
+                                ssize_t len;
+
+                                len = base64mem(roothash_sig_decoded, roothash_sig_decoded_size, &encoded);
+                                if (len < 0)
+                                        return -ENOMEM;
+
+                                p = memdup(roothash_sig_decoded, roothash_sig_decoded_size);
+                                if (!p)
+                                        return -ENOMEM;
+
+                                free_and_replace(c->root_hash_sig, p);
+                                c->root_hash_sig_size = roothash_sig_decoded_size;
+                                c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+
+                                unit_write_settingf(u, flags, name, "RootHashSignature=base64:%s", encoded);
+                        }
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "RootHashSignaturePath")) {
+                c->root_hash_sig_size = 0;
+                c->root_hash_sig = mfree(c->root_hash_sig);
+
+                return bus_set_transient_path(u, "RootHashSignature", &c->root_hash_sig_path, message, flags, error);
+        }
+
+        if (streq(name, "RootVerity"))
+                return bus_set_transient_path(u, name, &c->root_verity, message, flags, error);
+
+        if (streq(name, "RootDirectory"))
+                return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
+
+        if (streq(name, "RootEphemeral"))
+                return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error);
+
+        if (streq(name, "SyslogIdentifier"))
+                return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);
+
+        if (streq(name, "LogLevelMax"))
+                return bus_set_transient_log_level(u, name, &c->log_level_max, message, flags, error);
+
+        if (streq(name, "LogRateLimitIntervalUSec"))
+                return bus_set_transient_usec(u, name, &c->log_ratelimit_interval_usec, message, flags, error);
+
+        if (streq(name, "LogRateLimitBurst"))
+                return bus_set_transient_unsigned(u, name, &c->log_ratelimit_burst, message, flags, error);
+
+        if (streq(name, "LogFilterPatterns")) {
+                /* Use _cleanup_free_, not _cleanup_strv_free_, as we don't want the content of the strv
+                 * to be freed. */
+                _cleanup_free_ char **allow_list = NULL, **deny_list = NULL;
+                const char *pattern;
+                int is_allowlist;
+
+                r = sd_bus_message_enter_container(message, 'a', "(bs)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(bs)", &is_allowlist, &pattern)) > 0) {
+                        _cleanup_(pattern_freep) pcre2_code *compiled_pattern = NULL;
+
+                        if (isempty(pattern))
+                                continue;
+
+                        r = pattern_compile_and_log(pattern, 0, &compiled_pattern);
+                        if (r < 0)
+                                return r;
+
+                        r = strv_push(is_allowlist ? &allow_list : &deny_list, (char *)pattern);
+                        if (r < 0)
+                                return r;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(allow_list) && strv_isempty(deny_list)) {
+                                c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
+                                c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                r = set_put_strdupv(&c->log_filter_allowed_patterns, allow_list);
+                                if (r < 0)
+                                        return r;
+                                r = set_put_strdupv(&c->log_filter_denied_patterns, deny_list);
+                                if (r < 0)
+                                        return r;
+
+                                STRV_FOREACH(unit_pattern, allow_list)
+                                        unit_write_settingf(u, flags, name, "%s=%s", name, *unit_pattern);
+                                STRV_FOREACH(unit_pattern, deny_list)
+                                        unit_write_settingf(u, flags, name, "%s=~%s", name, *unit_pattern);
+                        }
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "Personality"))
+                return bus_set_transient_personality(u, name, &c->personality, message, flags, error);
+
+        if (streq(name, "StandardInput"))
+                return bus_set_transient_std_input(u, name, &c->std_input, message, flags, error);
+
+        if (streq(name, "StandardOutput"))
+                return bus_set_transient_std_output(u, name, &c->std_output, message, flags, error);
+
+        if (streq(name, "StandardError"))
+                return bus_set_transient_std_output(u, name, &c->std_error, message, flags, error);
+
+        if (streq(name, "IgnoreSIGPIPE"))
+                return bus_set_transient_bool(u, name, &c->ignore_sigpipe, message, flags, error);
+
+        if (streq(name, "TTYVHangup"))
+                return bus_set_transient_bool(u, name, &c->tty_vhangup, message, flags, error);
+
+        if (streq(name, "TTYReset"))
+                return bus_set_transient_bool(u, name, &c->tty_reset, message, flags, error);
+
+        if (streq(name, "TTYVTDisallocate"))
+                return bus_set_transient_bool(u, name, &c->tty_vt_disallocate, message, flags, error);
+
+        if (streq(name, "TTYRows"))
+                return bus_set_transient_unsigned(u, name, &c->tty_rows, message, flags, error);
+
+        if (streq(name, "TTYColumns"))
+                return bus_set_transient_unsigned(u, name, &c->tty_cols, message, flags, error);
+
+        if (streq(name, "PrivateTmp"))
+                return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error);
+
+        if (streq(name, "PrivateDevices"))
+                return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
+
+        if (streq(name, "PrivateMounts"))
+                return bus_set_transient_tristate(u, name, &c->private_mounts, message, flags, error);
+
+        if (streq(name, "PrivateNetwork"))
+                return bus_set_transient_bool(u, name, &c->private_network, message, flags, error);
+
+        if (streq(name, "PrivateIPC"))
+                return bus_set_transient_bool(u, name, &c->private_ipc, message, flags, error);
+
+        if (streq(name, "PrivateUsers"))
+                return bus_set_transient_bool(u, name, &c->private_users, message, flags, error);
+
+        if (streq(name, "NoNewPrivileges"))
+                return bus_set_transient_bool(u, name, &c->no_new_privileges, message, flags, error);
+
+        if (streq(name, "SyslogLevelPrefix"))
+                return bus_set_transient_bool(u, name, &c->syslog_level_prefix, message, flags, error);
+
+        if (streq(name, "MemoryDenyWriteExecute"))
+                return bus_set_transient_bool(u, name, &c->memory_deny_write_execute, message, flags, error);
+
+        if (streq(name, "RestrictRealtime"))
+                return bus_set_transient_bool(u, name, &c->restrict_realtime, message, flags, error);
+
+        if (streq(name, "RestrictSUIDSGID"))
+                return bus_set_transient_bool(u, name, &c->restrict_suid_sgid, message, flags, error);
+
+        if (streq(name, "DynamicUser"))
+                return bus_set_transient_bool(u, name, &c->dynamic_user, message, flags, error);
+
+        if (streq(name, "RemoveIPC"))
+                return bus_set_transient_bool(u, name, &c->remove_ipc, message, flags, error);
+
+        if (streq(name, "ProtectKernelTunables"))
+                return bus_set_transient_bool(u, name, &c->protect_kernel_tunables, message, flags, error);
+
+        if (streq(name, "ProtectKernelModules"))
+                return bus_set_transient_bool(u, name, &c->protect_kernel_modules, message, flags, error);
+
+        if (streq(name, "ProtectKernelLogs"))
+                return bus_set_transient_bool(u, name, &c->protect_kernel_logs, message, flags, error);
+
+        if (streq(name, "ProtectClock"))
+                return bus_set_transient_bool(u, name, &c->protect_clock, message, flags, error);
+
+        if (streq(name, "ProtectControlGroups"))
+                return bus_set_transient_bool(u, name, &c->protect_control_groups, message, flags, error);
+
+        if (streq(name, "CPUSchedulingResetOnFork"))
+                return bus_set_transient_bool(u, name, &c->cpu_sched_reset_on_fork, message, flags, error);
+
+        if (streq(name, "NonBlocking"))
+                return bus_set_transient_bool(u, name, &c->non_blocking, message, flags, error);
+
+        if (streq(name, "LockPersonality"))
+                return bus_set_transient_bool(u, name, &c->lock_personality, message, flags, error);
+
+        if (streq(name, "ProtectHostname"))
+                return bus_set_transient_bool(u, name, &c->protect_hostname, message, flags, error);
+
+        if (streq(name, "MemoryKSM"))
+                return bus_set_transient_tristate(u, name, &c->memory_ksm, message, flags, error);
+
+        if (streq(name, "UtmpIdentifier"))
+                return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error);
+
+        if (streq(name, "UtmpMode"))
+                return bus_set_transient_utmp_mode(u, name, &c->utmp_mode, message, flags, error);
+
+        if (streq(name, "PAMName"))
+                return bus_set_transient_string(u, name, &c->pam_name, message, flags, error);
+
+        if (streq(name, "TimerSlackNSec"))
+                return bus_set_transient_nsec(u, name, &c->timer_slack_nsec, message, flags, error);
+
+        if (streq(name, "ProtectSystem"))
+                return bus_set_transient_protect_system(u, name, &c->protect_system, message, flags, error);
+
+        if (streq(name, "ProtectHome"))
+                return bus_set_transient_protect_home(u, name, &c->protect_home, message, flags, error);
+
+        if (streq(name, "KeyringMode"))
+                return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
+
+        if (streq(name, "ProtectProc"))
+                return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
+
+        if (streq(name, "ProcSubset"))
+                return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+
+        if (streq(name, "RuntimeDirectoryPreserve"))
+                return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
+
+        if (streq(name, "UMask"))
+                return bus_set_transient_mode_t(u, name, &c->umask, message, flags, error);
+
+        if (streq(name, "RuntimeDirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_RUNTIME].mode, message, flags, error);
+
+        if (streq(name, "StateDirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_STATE].mode, message, flags, error);
+
+        if (streq(name, "CacheDirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CACHE].mode, message, flags, error);
+
+        if (streq(name, "LogsDirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_LOGS].mode, message, flags, error);
+
+        if (streq(name, "ConfigurationDirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CONFIGURATION].mode, message, flags, error);
+
+        if (streq(name, "SELinuxContext"))
+                return bus_set_transient_string(u, name, &c->selinux_context, message, flags, error);
+
+        if (streq(name, "SecureBits"))
+                return bus_set_transient_secure_bits(u, name, &c->secure_bits, message, flags, error);
+
+        if (streq(name, "CapabilityBoundingSet"))
+                return bus_set_transient_capability(u, name, &c->capability_bounding_set, message, flags, error);
+
+        if (streq(name, "AmbientCapabilities"))
+                return bus_set_transient_capability(u, name, &c->capability_ambient_set, message, flags, error);
+
+        if (streq(name, "RestrictNamespaces"))
+                return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error);
+
+        if (streq(name, "RestrictFileSystems")) {
+                int allow_list;
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_enter_container(message, 'r', "bas");
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read(message, "b", &allow_list);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = NULL;
+                        FilesystemParseFlags invert_flag = allow_list ? 0 : FILESYSTEM_PARSE_INVERT;
+
+                        if (strv_isempty(l)) {
+                                c->restrict_filesystems_allow_list = false;
+                                c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+
+                                unit_write_setting(u, flags, name, "RestrictFileSystems=");
+                                return 1;
+                        }
+
+                        if (!c->restrict_filesystems)
+                                c->restrict_filesystems_allow_list = allow_list;
+
+                        STRV_FOREACH(s, l) {
+                                r = lsm_bpf_parse_filesystem(
+                                              *s,
+                                              &c->restrict_filesystems,
+                                              FILESYSTEM_PARSE_LOG|
+                                              (invert_flag ? FILESYSTEM_PARSE_INVERT : 0)|
+                                              (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
+                                              u->id, NULL, 0);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        joined = strv_join(l, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        unit_write_settingf(u, flags, name, "%s=%s%s", name, allow_list ? "" : "~", joined);
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "MountFlags"))
+                return bus_set_transient_mount_propagation_flag(u, name, &c->mount_propagation_flag, message, flags, error);
+
+        if (streq(name, "NetworkNamespacePath"))
+                return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error);
+
+        if (streq(name, "IPCNamespacePath"))
+                return bus_set_transient_path(u, name, &c->ipc_namespace_path, message, flags, error);
+
+        if (streq(name, "SupplementaryGroups")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l)
+                        if (!isempty(*p) && !valid_user_group_name(*p, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                         "Invalid supplementary group names");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                c->supplementary_groups = strv_free(c->supplementary_groups);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+
+                                r = strv_extend_strv(&c->supplementary_groups, l, true);
+                                if (r < 0)
+                                        return -ENOMEM;
+
+                                joined = strv_join(c->supplementary_groups, " ");
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "SetCredential", "SetCredentialEncrypted")) {
+                bool isempty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "(say)");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const char *id;
+                        const void *p;
+                        size_t sz;
+
+                        r = sd_bus_message_enter_container(message, 'r', "say");
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        r = sd_bus_message_read(message, "s", &id);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_read_array(message, 'y', &p, &sz);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_exit_container(message);
+                        if (r < 0)
+                                return r;
+
+                        if (!credential_name_valid(id))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id);
+
+                        isempty = false;
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ char *a = NULL, *b = NULL;
+                                _cleanup_free_ void *copy = NULL;
+                                ExecSetCredential *old;
+
+                                copy = memdup(p, sz);
+                                if (!copy)
+                                        return -ENOMEM;
+
+                                old = hashmap_get(c->set_credentials, id);
+                                if (old) {
+                                        free_and_replace(old->data, copy);
+                                        old->size = sz;
+                                        old->encrypted = streq(name, "SetCredentialEncrypted");
+                                } else {
+                                        _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+
+                                        sc = new(ExecSetCredential, 1);
+                                        if (!sc)
+                                                return -ENOMEM;
+
+                                        *sc = (ExecSetCredential) {
+                                                .id = strdup(id),
+                                                .data = TAKE_PTR(copy),
+                                                .size = sz,
+                                                .encrypted = streq(name, "SetCredentialEncrypted"),
+                                        };
+
+                                        if (!sc->id)
+                                                return -ENOMEM;
+
+                                        r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+                                        if (r < 0)
+                                                return r;
+
+                                        TAKE_PTR(sc);
+                                }
+
+                                a = specifier_escape(id);
+                                if (!a)
+                                        return -ENOMEM;
+
+                                b = cescape_length(p, sz);
+                                if (!b)
+                                        return -ENOMEM;
+
+                                (void) unit_write_settingf(u, flags, name, "%s=%s:%s", name, a, b);
+                        }
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+                        c->set_credentials = hashmap_free(c->set_credentials);
+                        (void) unit_write_settingf(u, flags, name, "%s=", name);
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "LoadCredential", "LoadCredentialEncrypted")) {
+                bool isempty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ss)");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const char *id, *source;
+
+                        r = sd_bus_message_read(message, "(ss)", &id, &source);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        if (!credential_name_valid(id))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id);
+
+                        if (!(path_is_absolute(source) ? path_is_normalized(source) : credential_name_valid(source)))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential source is invalid: %s", source);
+
+                        isempty = false;
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                bool encrypted = streq(name, "LoadCredentialEncrypted");
+
+                                r = hashmap_put_credential(&c->load_credentials, id, source, encrypted);
+                                if (r < 0)
+                                        return r;
+
+                                (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s:%s", name, id, source);
+                        }
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+                        c->load_credentials = hashmap_free(c->load_credentials);
+                        (void) unit_write_settingf(u, flags, name, "%s=", name);
+                }
+
+                return 1;
+
+        } else if (streq(name, "ImportCredential")) {
+                bool isempty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const char *s;
+
+                        r = sd_bus_message_read(message, "s", &s);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        if (!credential_glob_valid(s))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential name or glob is invalid: %s", s);
+
+                        isempty = false;
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = set_put_strdup(&c->import_credentials, s);
+                                if (r < 0)
+                                        return r;
+
+                                (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, s);
+                        }
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+                        c->import_credentials = set_free_free(c->import_credentials);
+                        (void) unit_write_settingf(u, flags, name, "%s=", name);
+                }
+
+                return 1;
+
+        } else if (streq(name, "SyslogLevel")) {
+                int32_t level;
+
+                r = sd_bus_message_read(message, "i", &level);
+                if (r < 0)
+                        return r;
+
+                if (!log_level_is_valid(level))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log level value out of range");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->syslog_priority = (c->syslog_priority & LOG_FACMASK) | level;
+                        unit_write_settingf(u, flags, name, "SyslogLevel=%i", level);
+                }
+
+                return 1;
+
+        } else if (streq(name, "SyslogFacility")) {
+                int32_t facility;
+
+                r = sd_bus_message_read(message, "i", &facility);
+                if (r < 0)
+                        return r;
+
+                if (!log_facility_unshifted_is_valid(facility))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log facility value out of range");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->syslog_priority = (facility << 3) | LOG_PRI(c->syslog_priority);
+                        unit_write_settingf(u, flags, name, "SyslogFacility=%i", facility);
+                }
+
+                return 1;
+
+        } else if (streq(name, "LogNamespace")) {
+                const char *n;
+
+                r = sd_bus_message_read(message, "s", &n);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(n) && !log_namespace_name_valid(n))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log namespace name not valid");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+                        if (isempty(n)) {
+                                c->log_namespace = mfree(c->log_namespace);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                r = free_and_strdup(&c->log_namespace, n);
+                                if (r < 0)
+                                        return r;
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, n);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "LogExtraFields")) {
+                size_t n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "ay");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_free_ void *copy = NULL;
+                        struct iovec *t;
+                        const char *eq;
+                        const void *p;
+                        size_t sz;
+
+                        /* Note that we expect a byte array for each field, instead of a string. That's because at the
+                         * lower level, journal fields can actually contain binary data and are not restricted to text,
+                         * and we should not "lose precision" in our types on the way. That said, I am pretty sure
+                         * actually encoding binary data as unit metadata is not a good idea. Hence we actually refuse
+                         * any actual binary data, and only accept UTF-8. This allows us to eventually lift this
+                         * limitation, should a good, valid use case arise. */
+
+                        r = sd_bus_message_read_array(message, 'y', &p, &sz);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        if (memchr(p, 0, sz))
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains zero byte");
+
+                        eq = memchr(p, '=', sz);
+                        if (!eq)
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains no '=' character");
+                        if (!journal_field_valid(p, eq - (const char*) p, false))
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field invalid");
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec));
+                                if (!t)
+                                        return -ENOMEM;
+                                c->log_extra_fields = t;
+                        }
+
+                        copy = malloc(sz + 1);
+                        if (!copy)
+                                return -ENOMEM;
+
+                        memcpy(copy, p, sz);
+                        ((uint8_t*) copy)[sz] = 0;
+
+                        if (!utf8_is_valid(copy))
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field is not valid UTF-8");
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE(copy, sz);
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C, name, "LogExtraFields=%s", (char*) copy);
+
+                                copy = NULL;
+                        }
+
+                        n++;
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && n == 0) {
+                        exec_context_free_log_extra_fields(c);
+                        unit_write_setting(u, flags, name, "LogExtraFields=");
+                }
+
+                return 1;
+        }
+
+#if HAVE_SECCOMP
+
+        if (streq(name, "SystemCallErrorNumber"))
+                return bus_set_transient_errno(u, name, &c->syscall_errno, message, flags, error);
+
+        if (streq(name, "SystemCallFilter")) {
+                int allow_list;
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_enter_container(message, 'r', "bas");
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read(message, "b", &allow_list);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = NULL;
+                        SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT;
+
+                        if (strv_isempty(l)) {
+                                c->syscall_allow_list = false;
+                                c->syscall_filter = hashmap_free(c->syscall_filter);
+
+                                unit_write_settingf(u, flags, name, "SystemCallFilter=");
+                                return 1;
+                        }
+
+                        if (!c->syscall_filter) {
+                                c->syscall_filter = hashmap_new(NULL);
+                                if (!c->syscall_filter)
+                                        return log_oom();
+
+                                c->syscall_allow_list = allow_list;
+
+                                if (c->syscall_allow_list) {
+                                        r = seccomp_parse_syscall_filter("@default",
+                                                                         -1,
+                                                                         c->syscall_filter,
+                                                                         SECCOMP_PARSE_PERMISSIVE |
+                                                                         SECCOMP_PARSE_ALLOW_LIST,
+                                                                         u->id,
+                                                                         NULL, 0);
+                                        if (r < 0)
+                                                return r;
+                                }
+                        }
+
+                        STRV_FOREACH(s, l) {
+                                _cleanup_free_ char *n = NULL;
+                                int e;
+
+                                r = parse_syscall_and_errno(*s, &n, &e);
+                                if (r < 0)
+                                        return r;
+
+                                if (allow_list && e >= 0)
+                                        return -EINVAL;
+
+                                r = seccomp_parse_syscall_filter(n,
+                                                                 e,
+                                                                 c->syscall_filter,
+                                                                 SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE |
+                                                                 invert_flag |
+                                                                 (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+                                                                 u->id,
+                                                                 NULL, 0);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        joined = strv_join(l, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        unit_write_settingf(u, flags, name, "SystemCallFilter=%s%s", allow_list ? "" : "~", joined);
+                }
+
+                return 1;
+
+        } else if (streq(name, "SystemCallLog")) {
+                int allow_list;
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_enter_container(message, 'r', "bas");
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read(message, "b", &allow_list);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = NULL;
+                        SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT;
+
+                        if (strv_isempty(l)) {
+                                c->syscall_log_allow_list = false;
+                                c->syscall_log = hashmap_free(c->syscall_log);
+
+                                unit_write_settingf(u, flags, name, "SystemCallLog=");
+                                return 1;
+                        }
+
+                        if (!c->syscall_log) {
+                                c->syscall_log = hashmap_new(NULL);
+                                if (!c->syscall_log)
+                                        return log_oom();
+
+                                c->syscall_log_allow_list = allow_list;
+                        }
+
+                        STRV_FOREACH(s, l) {
+                                r = seccomp_parse_syscall_filter(*s,
+                                                                 -1, /* errno not used */
+                                                                 c->syscall_log,
+                                                                 SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE |
+                                                                 invert_flag |
+                                                                 (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+                                                                 u->id,
+                                                                 NULL, 0);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        joined = strv_join(l, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        unit_write_settingf(u, flags, name, "SystemCallLog=%s%s", allow_list ? "" : "~", joined);
+                }
+
+                return 1;
+
+        } else if (streq(name, "SystemCallArchitectures")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = NULL;
+
+                        if (strv_isempty(l))
+                                c->syscall_archs = set_free(c->syscall_archs);
+                        else
+                                STRV_FOREACH(s, l) {
+                                        uint32_t a;
+
+                                        r = seccomp_arch_from_string(*s, &a);
+                                        if (r < 0)
+                                                return r;
+
+                                        r = set_ensure_put(&c->syscall_archs, NULL, UINT32_TO_PTR(a + 1));
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                        joined = strv_join(l, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+                }
+
+                return 1;
+
+        } else if (streq(name, "RestrictAddressFamilies")) {
+                _cleanup_strv_free_ char **l = NULL;
+                int allow_list;
+
+                r = sd_bus_message_enter_container(message, 'r', "bas");
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read(message, "b", &allow_list);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = NULL;
+
+                        if (strv_isempty(l)) {
+                                c->address_families_allow_list = allow_list;
+                                c->address_families = set_free(c->address_families);
+
+                                unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s",
+                                                    allow_list ? "none" : "");
+                                return 1;
+                        }
+
+                        if (!c->address_families) {
+                                c->address_families = set_new(NULL);
+                                if (!c->address_families)
+                                        return log_oom();
+
+                                c->address_families_allow_list = allow_list;
+                        }
+
+                        STRV_FOREACH(s, l) {
+                                int af;
+
+                                af = af_from_name(*s);
+                                if (af < 0)
+                                        return af;
+
+                                if (allow_list == c->address_families_allow_list) {
+                                        r = set_put(c->address_families, INT_TO_PTR(af));
+                                        if (r < 0)
+                                                return r;
+                                } else
+                                        set_remove(c->address_families, INT_TO_PTR(af));
+                        }
+
+                        joined = strv_join(l, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s%s", allow_list ? "" : "~", joined);
+                }
+
+                return 1;
+        }
+#endif
+        if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
+                const void *a;
+                size_t n;
+                bool affinity = streq(name, "CPUAffinity");
+                _cleanup_(cpu_set_reset) CPUSet set = {};
+
+                r = sd_bus_message_read_array(message, 'y', &a, &n);
+                if (r < 0)
+                        return r;
+
+                r = cpu_set_from_dbus(a, n, &set);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (n == 0) {
+                                cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                _cleanup_free_ char *str = NULL;
+
+                                str = cpu_set_to_string(&set);
+                                if (!str)
+                                        return -ENOMEM;
+
+                                /* We forego any optimizations here, and always create the structure using
+                                 * cpu_set_add_all(), because we don't want to care if the existing size we
+                                 * got over dbus is appropriate. */
+                                r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set);
+                                if (r < 0)
+                                        return r;
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, str);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "CPUAffinityFromNUMA")) {
+                int q;
+
+                r = sd_bus_message_read_basic(message, 'b', &q);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->cpu_affinity_from_numa = q;
+                        unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa");
+                }
+
+                return 1;
+
+        } else if (streq(name, "NUMAPolicy")) {
+                int32_t type;
+
+                r = sd_bus_message_read(message, "i", &type);
+                if (r < 0)
+                        return r;
+
+                if (!mpol_is_valid(type))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags))
+                        c->numa_policy.type = type;
+
+                return 1;
+
+        } else if (streq(name, "Nice")) {
+                int32_t q;
+
+                r = sd_bus_message_read(message, "i", &q);
+                if (r < 0)
+                        return r;
+
+                if (!nice_is_valid(q))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid Nice value: %i", q);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->nice = q;
+                        c->nice_set = true;
+
+                        unit_write_settingf(u, flags, name, "Nice=%i", q);
+                }
+
+                return 1;
+
+        } else if (streq(name, "CPUSchedulingPolicy")) {
+                int32_t q;
+
+                r = sd_bus_message_read(message, "i", &q);
+                if (r < 0)
+                        return r;
+
+                if (!sched_policy_is_valid(q))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling policy: %i", q);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *s = NULL;
+
+                        r = sched_policy_to_string_alloc(q, &s);
+                        if (r < 0)
+                                return r;
+
+                        c->cpu_sched_policy = q;
+                        c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(q), sched_get_priority_max(q));
+                        c->cpu_sched_set = true;
+
+                        unit_write_settingf(u, flags, name, "CPUSchedulingPolicy=%s", s);
+                }
+
+                return 1;
+
+        } else if (streq(name, "CPUSchedulingPriority")) {
+                int32_t p;
+
+                r = sd_bus_message_read(message, "i", &p);
+                if (r < 0)
+                        return r;
+
+                /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set
+                 * later so we do not check the precise range, but only the generic outer bounds. */
+                if (p < 0 || p > 99)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling priority: %i", p);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->cpu_sched_priority = p;
+                        c->cpu_sched_set = true;
+
+                        unit_write_settingf(u, flags, name, "CPUSchedulingPriority=%i", p);
+                }
+
+                return 1;
+
+        } else if (streq(name, "IOSchedulingClass")) {
+                int32_t q;
+
+                r = sd_bus_message_read(message, "i", &q);
+                if (r < 0)
+                        return r;
+
+                if (!ioprio_class_is_valid(q))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling class: %i", q);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *s = NULL;
+
+                        r = ioprio_class_to_string_alloc(q, &s);
+                        if (r < 0)
+                                return r;
+
+                        c->ioprio = ioprio_normalize(ioprio_prio_value(q, ioprio_prio_data(c->ioprio)));
+                        c->ioprio_set = true;
+
+                        unit_write_settingf(u, flags, name, "IOSchedulingClass=%s", s);
+                }
+
+                return 1;
+
+        } else if (streq(name, "IOSchedulingPriority")) {
+                int32_t p;
+
+                r = sd_bus_message_read(message, "i", &p);
+                if (r < 0)
+                        return r;
+
+                if (!ioprio_priority_is_valid(p))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling priority: %i", p);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), p));
+                        c->ioprio_set = true;
+
+                        unit_write_settingf(u, flags, name, "IOSchedulingPriority=%i", p);
+                }
+
+                return 1;
+
+        } else if (streq(name, "MountAPIVFS")) {
+                bool b;
+
+                r = bus_set_transient_bool(u, name, &b, message, flags, error);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->mount_apivfs = b;
+                        c->mount_apivfs_set = true;
+                }
+
+                return 1;
+
+        } else if (streq(name, "WorkingDirectory")) {
+                const char *s;
+                bool missing_ok;
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (s[0] == '-') {
+                        missing_ok = true;
+                        s++;
+                } else
+                        missing_ok = false;
+
+                if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (streq(s, "~")) {
+                                c->working_directory = mfree(c->working_directory);
+                                c->working_directory_home = true;
+                        } else {
+                                r = free_and_strdup(&c->working_directory, empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                c->working_directory_home = false;
+                        }
+
+                        c->working_directory_missing_ok = missing_ok;
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s);
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name,
+                              "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) {
+                const char *s;
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(s) && !fdname_is_valid(s))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+                        if (streq(name, "StandardInputFileDescriptorName")) {
+                                r = free_and_strdup(c->stdio_fdname + STDIN_FILENO, empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                c->std_input = EXEC_INPUT_NAMED_FD;
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=fd:%s", exec_context_fdname(c, STDIN_FILENO));
+
+                        } else if (streq(name, "StandardOutputFileDescriptorName")) {
+                                r = free_and_strdup(c->stdio_fdname + STDOUT_FILENO, empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                c->std_output = EXEC_OUTPUT_NAMED_FD;
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=fd:%s", exec_context_fdname(c, STDOUT_FILENO));
+
+                        } else {
+                                assert(streq(name, "StandardErrorFileDescriptorName"));
+
+                                r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                c->std_error = EXEC_OUTPUT_NAMED_FD;
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=fd:%s", exec_context_fdname(c, STDERR_FILENO));
+                        }
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name,
+                              "StandardInputFile",
+                              "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate",
+                              "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate")) {
+                const char *s;
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(s)) {
+                        if (!path_is_absolute(s))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute", s);
+                        if (!path_is_normalized(s))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", s);
+                }
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+                        if (streq(name, "StandardInputFile")) {
+                                r = free_and_strdup(&c->stdio_file[STDIN_FILENO], empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                c->std_input = EXEC_INPUT_FILE;
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=file:%s", s);
+
+                        } else if (STR_IN_SET(name, "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate")) {
+                                r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                if (streq(name, "StandardOutputFile")) {
+                                        c->std_output = EXEC_OUTPUT_FILE;
+                                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=file:%s", s);
+                                } else if (streq(name, "StandardOutputFileToAppend")) {
+                                        c->std_output = EXEC_OUTPUT_FILE_APPEND;
+                                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=append:%s", s);
+                                } else {
+                                        assert(streq(name, "StandardOutputFileToTruncate"));
+                                        c->std_output = EXEC_OUTPUT_FILE_TRUNCATE;
+                                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=truncate:%s", s);
+                                }
+                        } else {
+                                assert(STR_IN_SET(name, "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate"));
+
+                                r = free_and_strdup(&c->stdio_file[STDERR_FILENO], empty_to_null(s));
+                                if (r < 0)
+                                        return r;
+
+                                if (streq(name, "StandardErrorFile")) {
+                                        c->std_error = EXEC_OUTPUT_FILE;
+                                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=file:%s", s);
+                                } else if (streq(name, "StandardErrorFileToAppend")) {
+                                        c->std_error = EXEC_OUTPUT_FILE_APPEND;
+                                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=append:%s", s);
+                                } else {
+                                        assert(streq(name, "StandardErrorFileToTruncate"));
+                                        c->std_error = EXEC_OUTPUT_FILE_TRUNCATE;
+                                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=truncate:%s", s);
+                                }
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "StandardInputData")) {
+                const void *p;
+                size_t sz;
+
+                r = sd_bus_message_read_array(message, 'y', &p, &sz);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *encoded = NULL;
+
+                        if (sz == 0) {
+                                c->stdin_data = mfree(c->stdin_data);
+                                c->stdin_data_size = 0;
+
+                                unit_write_settingf(u, flags, name, "StandardInputData=");
+                        } else {
+                                void *q;
+                                ssize_t n;
+
+                                if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */
+                                    c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX)
+                                        return -E2BIG;
+
+                                n = base64mem(p, sz, &encoded);
+                                if (n < 0)
+                                        return (int) n;
+
+                                q = realloc(c->stdin_data, c->stdin_data_size + sz);
+                                if (!q)
+                                        return -ENOMEM;
+
+                                memcpy((uint8_t*) q + c->stdin_data_size, p, sz);
+
+                                c->stdin_data = q;
+                                c->stdin_data_size += sz;
+
+                                unit_write_settingf(u, flags, name, "StandardInputData=%s", encoded);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "Environment")) {
+
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                if (!strv_env_is_valid(l))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment block.");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                c->environment = strv_free(c->environment);
+                                unit_write_setting(u, flags, name, "Environment=");
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+                                char **e;
+
+                                joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                e = strv_env_merge(c->environment, l);
+                                if (!e)
+                                        return -ENOMEM;
+
+                                strv_free_and_replace(c->environment, e);
+                                unit_write_settingf(u, flags, name, "Environment=%s", joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "UnsetEnvironment")) {
+
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                if (!strv_env_name_or_assignment_is_valid(l))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UnsetEnvironment= list.");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                c->unset_environment = strv_free(c->unset_environment);
+                                unit_write_setting(u, flags, name, "UnsetEnvironment=");
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+                                char **e;
+
+                                joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                e = strv_env_merge(c->unset_environment, l);
+                                if (!e)
+                                        return -ENOMEM;
+
+                                strv_free_and_replace(c->unset_environment, e);
+                                unit_write_settingf(u, flags, name, "UnsetEnvironment=%s", joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "OOMScoreAdjust")) {
+                int oa;
+
+                r = sd_bus_message_read(message, "i", &oa);
+                if (r < 0)
+                        return r;
+
+                if (!oom_score_adjust_is_valid(oa))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "OOM score adjust value out of range");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->oom_score_adjust = oa;
+                        c->oom_score_adjust_set = true;
+                        unit_write_settingf(u, flags, name, "OOMScoreAdjust=%i", oa);
+                }
+
+                return 1;
+
+        } else if (streq(name, "CoredumpFilter")) {
+                uint64_t f;
+
+                r = sd_bus_message_read(message, "t", &f);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->coredump_filter = f;
+                        c->coredump_filter_set = true;
+                        unit_write_settingf(u, flags, name, "CoredumpFilter=0x%"PRIx64, f);
+                }
+
+                return 1;
+
+        } else if (streq(name, "EnvironmentFiles")) {
+                _cleanup_(memstream_done) MemStream m = {};
+                _cleanup_free_ char *joined = NULL;
+                _cleanup_strv_free_ char **l = NULL;
+                FILE *f;
+
+                r = sd_bus_message_enter_container(message, 'a', "(sb)");
+                if (r < 0)
+                        return r;
+
+                f = memstream_init(&m);
+                if (!f)
+                        return -ENOMEM;
+
+                fputs("EnvironmentFile=\n", f);
+
+                STRV_FOREACH(i, c->environment_files) {
+                        _cleanup_free_ char *q = NULL;
+
+                        q = specifier_escape(*i);
+                        if (!q)
+                                return -ENOMEM;
+
+                        fprintf(f, "EnvironmentFile=%s\n", q);
+                }
+
+                while ((r = sd_bus_message_enter_container(message, 'r', "sb")) > 0) {
+                        const char *path;
+                        int b;
+
+                        r = sd_bus_message_read(message, "sb", &path, &b);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_exit_container(message);
+                        if (r < 0)
+                                return r;
+
+                        if (!path_is_absolute(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ char *q = NULL, *buf = NULL;
+
+                                buf = strjoin(b ? "-" : "", path);
+                                if (!buf)
+                                        return -ENOMEM;
+
+                                q = specifier_escape(buf);
+                                if (!q)
+                                        return -ENOMEM;
+
+                                fprintf(f, "EnvironmentFile=%s\n", q);
+
+                                r = strv_consume(&l, TAKE_PTR(buf));
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                r = memstream_finalize(&m, &joined, NULL);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                c->environment_files = strv_free(c->environment_files);
+                                unit_write_setting(u, flags, name, "EnvironmentFile=");
+                        } else {
+                                r = strv_extend_strv(&c->environment_files, l, true);
+                                if (r < 0)
+                                        return r;
+
+                                unit_write_setting(u, flags, name, joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "PassEnvironment")) {
+
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                if (!strv_env_name_is_valid(l))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PassEnvironment= block.");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                c->pass_environment = strv_free(c->pass_environment);
+                                unit_write_setting(u, flags, name, "PassEnvironment=");
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+
+                                r = strv_extend_strv(&c->pass_environment, l, true);
+                                if (r < 0)
+                                        return r;
+
+                                /* We write just the new settings out to file, with unresolved specifiers. */
+                                joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags, name, "PassEnvironment=%s", joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "ReadWriteDirectories", "ReadOnlyDirectories", "InaccessibleDirectories",
+                              "ReadWritePaths", "ReadOnlyPaths", "InaccessiblePaths", "ExecPaths", "NoExecPaths",
+                              "ExtensionDirectories")) {
+                _cleanup_strv_free_ char **l = NULL;
+                char ***dirs;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l) {
+                        char *i = *p;
+                        size_t offset;
+
+                        offset = i[0] == '-';
+                        offset += i[offset] == '+';
+                        if (!path_is_absolute(i + offset))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
+
+                        path_simplify(i + offset);
+                }
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (STR_IN_SET(name, "ReadWriteDirectories", "ReadWritePaths"))
+                                dirs = &c->read_write_paths;
+                        else if (STR_IN_SET(name, "ReadOnlyDirectories", "ReadOnlyPaths"))
+                                dirs = &c->read_only_paths;
+                        else if (streq(name, "ExecPaths"))
+                                dirs = &c->exec_paths;
+                        else if (streq(name, "NoExecPaths"))
+                                dirs = &c->no_exec_paths;
+                        else if (streq(name, "ExtensionDirectories"))
+                                dirs = &c->extension_directories;
+                        else /* "InaccessiblePaths" */
+                                dirs = &c->inaccessible_paths;
+
+                        if (strv_isempty(l)) {
+                                *dirs = strv_free(*dirs);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+
+                                joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                r = strv_extend_strv(dirs, l, true);
+                                if (r < 0)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "ExecSearchPath")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l)
+                        if (!path_is_absolute(*p) || !path_is_normalized(*p) || strchr(*p, ':'))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                c->exec_search_path = strv_free(c->exec_search_path);
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=");
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+                                r = strv_extend_strv(&c->exec_search_path, l, true);
+                                if (r < 0)
+                                        return -ENOMEM;
+                                joined = strv_join(c->exec_search_path, ":");
+                                if (!joined)
+                                        return log_oom();
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=%s", joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "RuntimeDirectory", "StateDirectory", "CacheDirectory", "LogsDirectory", "ConfigurationDirectory")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l) {
+                        if (!path_is_normalized(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is not normalized: %s", name, *p);
+
+                        if (path_is_absolute(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is absolute: %s", name, *p);
+
+                        if (path_startswith(*p, "private"))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path can't be 'private': %s", name, *p);
+                }
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        ExecDirectoryType i;
+                        ExecDirectory *d;
+
+                        assert_se((i = exec_directory_type_from_string(name)) >= 0);
+                        d = c->directories + i;
+
+                        if (strv_isempty(l)) {
+                                exec_directory_done(d);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+
+                                STRV_FOREACH(source, l) {
+                                        r = exec_directory_add(d, *source, NULL);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+                                exec_directory_sort(d);
+
+                                joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "AppArmorProfile", "SmackProcessLabel")) {
+                int ignore;
+                const char *s;
+
+                r = sd_bus_message_read(message, "(bs)", &ignore, &s);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        char **p;
+                        bool *b;
+
+                        if (streq(name, "AppArmorProfile")) {
+                                p = &c->apparmor_profile;
+                                b = &c->apparmor_profile_ignore;
+                        } else { /* "SmackProcessLabel" */
+                                p = &c->smack_process_label;
+                                b = &c->smack_process_label_ignore;
+                        }
+
+                        if (isempty(s)) {
+                                *p = mfree(*p);
+                                *b = false;
+                        } else {
+                                if (free_and_strdup(p, s) < 0)
+                                        return -ENOMEM;
+                                *b = ignore;
+                        }
+
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s%s", name, ignore ? "-" : "", strempty(s));
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) {
+                char *source, *destination;
+                int ignore_enoent;
+                uint64_t mount_flags;
+                bool empty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ssbt)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(ssbt)", &source, &destination, &ignore_enoent, &mount_flags)) > 0) {
+
+                        if (!path_is_absolute(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+                        if (!path_is_absolute(destination))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination);
+                        if (!IN_SET(mount_flags, 0, MS_REC))
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mount flags.");
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+                                                   &(BindMount) {
+                                                           .source = source,
+                                                           .destination = destination,
+                                                           .read_only = !!strstr(name, "ReadOnly"),
+                                                           .recursive = !!(mount_flags & MS_REC),
+                                                           .ignore_enoent = ignore_enoent,
+                                                   });
+                                if (r < 0)
+                                        return r;
+
+                                unit_write_settingf(
+                                                u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+                                                "%s=%s%s:%s:%s",
+                                                name,
+                                                ignore_enoent ? "-" : "",
+                                                source,
+                                                destination,
+                                                (mount_flags & MS_REC) ? "rbind" : "norbind");
+                        }
+
+                        empty = false;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (empty) {
+                        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+                        c->bind_mounts = NULL;
+                        c->n_bind_mounts = 0;
+
+                        unit_write_settingf(u, flags, name, "%s=", name);
+                }
+
+                return 1;
+
+        } else if (streq(name, "TemporaryFileSystem")) {
+                const char *path, *options;
+                bool empty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ss)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(ss)", &path, &options)) > 0) {
+
+                        if (!path_is_absolute(path))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Mount point %s is not absolute.", path);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options);
+                                if (r < 0)
+                                        return r;
+
+                                unit_write_settingf(
+                                                u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+                                                "%s=%s:%s",
+                                                name,
+                                                path,
+                                                options);
+                        }
+
+                        empty = false;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (empty) {
+                        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+                        c->temporary_filesystems = NULL;
+                        c->n_temporary_filesystems = 0;
+
+                        unit_write_settingf(u, flags, name, "%s=", name);
+                }
+
+                return 1;
+
+        } else if ((suffix = startswith(name, "Limit"))) {
+                const char *soft = NULL;
+                int ri;
+
+                ri = rlimit_from_string(suffix);
+                if (ri < 0) {
+                        soft = endswith(suffix, "Soft");
+                        if (soft) {
+                                const char *n;
+
+                                n = strndupa_safe(suffix, soft - suffix);
+                                ri = rlimit_from_string(n);
+                                if (ri >= 0)
+                                        name = strjoina("Limit", n);
+                        }
+                }
+
+                if (ri >= 0) {
+                        uint64_t rl;
+                        rlim_t x;
+
+                        r = sd_bus_message_read(message, "t", &rl);
+                        if (r < 0)
+                                return r;
+
+                        if (rl == UINT64_MAX)
+                                x = RLIM_INFINITY;
+                        else {
+                                x = (rlim_t) rl;
+
+                                if ((uint64_t) x != rl)
+                                        return -ERANGE;
+                        }
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ char *f = NULL;
+                                struct rlimit nl;
+
+                                if (c->rlimit[ri]) {
+                                        nl = *c->rlimit[ri];
+
+                                        if (soft)
+                                                nl.rlim_cur = x;
+                                        else
+                                                nl.rlim_max = x;
+                                } else
+                                        /* When the resource limit is not initialized yet, then assign the value to both fields */
+                                        nl = (struct rlimit) {
+                                                .rlim_cur = x,
+                                                .rlim_max = x,
+                                        };
+
+                                r = rlimit_format(&nl, &f);
+                                if (r < 0)
+                                        return r;
+
+                                if (c->rlimit[ri])
+                                        *c->rlimit[ri] = nl;
+                                else {
+                                        c->rlimit[ri] = newdup(struct rlimit, &nl, 1);
+                                        if (!c->rlimit[ri])
+                                                return -ENOMEM;
+                                }
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, f);
+                        }
+
+                        return 1;
+                }
+
+        } else if (streq(name, "MountImages")) {
+                _cleanup_free_ char *format_str = NULL;
+                MountImage *mount_images = NULL;
+                size_t n_mount_images = 0;
+                char *source, *destination;
+                int permissive;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ssba(ss))");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                        _cleanup_free_ char *source_escaped = NULL, *destination_escaped = NULL;
+                        char *tuple;
+
+                        r = sd_bus_message_enter_container(message, 'r', "ssba(ss)");
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_read(message, "ssb", &source, &destination, &permissive);
+                        if (r <= 0)
+                                break;
+
+                        if (!path_is_absolute(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+                        if (!path_is_normalized(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+                        if (!path_is_absolute(destination))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination);
+                        if (!path_is_normalized(destination))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination);
+
+                        /* Need to store them in the unit with the escapes, so that they can be parsed again */
+                        source_escaped = shell_escape(source, ":");
+                        if (!source_escaped)
+                                return -ENOMEM;
+                        destination_escaped = shell_escape(destination, ":");
+                        if (!destination_escaped)
+                                return -ENOMEM;
+
+                        tuple = strjoin(format_str,
+                                        format_str ? " " : "",
+                                        permissive ? "-" : "",
+                                        source_escaped,
+                                        ":",
+                                        destination_escaped);
+                        if (!tuple)
+                                return -ENOMEM;
+                        free_and_replace(format_str, tuple);
+
+                        r = bus_read_mount_options(message, error, &options, &format_str, ":");
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_exit_container(message);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_image_add(&mount_images, &n_mount_images,
+                                            &(MountImage) {
+                                                    .source = source,
+                                                    .destination = destination,
+                                                    .mount_options = options,
+                                                    .ignore_enoent = permissive,
+                                                    .type = MOUNT_IMAGE_DISCRETE,
+                                            });
+                        if (r < 0)
+                                return r;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (n_mount_images == 0) {
+                                c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                for (size_t i = 0; i < n_mount_images; ++i) {
+                                        r = mount_image_add(&c->mount_images, &c->n_mount_images, &mount_images[i]);
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS,
+                                                    name,
+                                                    "%s=%s",
+                                                    name,
+                                                    format_str);
+                        }
+                }
+
+                mount_images = mount_image_free_many(mount_images, &n_mount_images);
+
+                return 1;
+        } else if (streq(name, "ExtensionImages")) {
+                _cleanup_free_ char *format_str = NULL;
+                MountImage *extension_images = NULL;
+                size_t n_extension_images = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "(sba(ss))");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                        _cleanup_free_ char *source_escaped = NULL;
+                        char *source, *tuple;
+                        int permissive;
+
+                        r = sd_bus_message_enter_container(message, 'r', "sba(ss)");
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_read(message, "sb", &source, &permissive);
+                        if (r <= 0)
+                                break;
+
+                        if (!path_is_absolute(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+                        if (!path_is_normalized(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+
+                        /* Need to store them in the unit with the escapes, so that they can be parsed again */
+                        source_escaped = shell_escape(source, ":");
+                        if (!source_escaped)
+                                return -ENOMEM;
+
+                        tuple = strjoin(format_str,
+                                        format_str ? " " : "",
+                                        permissive ? "-" : "",
+                                        source_escaped);
+                        if (!tuple)
+                                return -ENOMEM;
+                        free_and_replace(format_str, tuple);
+
+                        r = bus_read_mount_options(message, error, &options, &format_str, ":");
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_exit_container(message);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_image_add(&extension_images, &n_extension_images,
+                                            &(MountImage) {
+                                                    .source = source,
+                                                    .mount_options = options,
+                                                    .ignore_enoent = permissive,
+                                                    .type = MOUNT_IMAGE_EXTENSION,
+                                            });
+                        if (r < 0)
+                                return r;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (n_extension_images == 0) {
+                                c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                for (size_t i = 0; i < n_extension_images; ++i) {
+                                        r = mount_image_add(&c->extension_images, &c->n_extension_images, &extension_images[i]);
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS,
+                                                    name,
+                                                    "%s=%s",
+                                                    name,
+                                                    format_str);
+                        }
+                }
+
+                extension_images = mount_image_free_many(extension_images, &n_extension_images);
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "StateDirectorySymlink", "RuntimeDirectorySymlink", "CacheDirectorySymlink", "LogsDirectorySymlink")) {
+                char *source, *destination;
+                ExecDirectory *directory;
+                uint64_t symlink_flags; /* No flags for now, reserved for future uses. */
+                ExecDirectoryType i;
+
+                assert_se((i = exec_directory_type_symlink_from_string(name)) >= 0);
+                directory = c->directories + i;
+
+                r = sd_bus_message_enter_container(message, 'a', "(sst)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(sst)", &source, &destination, &symlink_flags)) > 0) {
+                        if (!path_is_valid(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not valid.", source);
+                        if (path_is_absolute(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is absolute.", source);
+                        if (!path_is_normalized(source))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+                        if (!path_is_valid(destination))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not valid.", destination);
+                        if (path_is_absolute(destination))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is absolute.", destination);
+                        if (!path_is_normalized(destination))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination);
+                        if (symlink_flags != 0)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero.");
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ char *destination_escaped = NULL, *source_escaped = NULL;
+
+                                r = exec_directory_add(directory, source, destination);
+                                if (r < 0)
+                                        return r;
+
+                                /* Need to store them in the unit with the escapes, so that they can be parsed again */
+                                source_escaped = xescape(source, ":");
+                                destination_escaped = xescape(destination, ":");
+                                if (!source_escaped || !destination_escaped)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(
+                                                u, flags|UNIT_ESCAPE_SPECIFIERS, exec_directory_type_to_string(i),
+                                                "%s=%s:%s",
+                                                exec_directory_type_to_string(i),
+                                                source_escaped,
+                                                destination_escaped);
+                        }
+                }
+                if (r < 0)
+                        return r;
+
+                exec_directory_sort(directory);
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                return 1;
+
+        } else if (STR_IN_SET(name, "RootImagePolicy", "MountImagePolicy", "ExtensionImagePolicy")) {
+                _cleanup_(image_policy_freep) ImagePolicy *p = NULL;
+                const char *s;
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                r = image_policy_from_string(s, &p);
+                if (r < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to parse image policy string: %s", s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *t = NULL;
+                        ImagePolicy **pp =
+                                streq(name, "RootImagePolicy")  ? &c->root_image_policy :
+                                streq(name, "MountImagePolicy") ? &c->mount_image_policy :
+                                                                  &c->extension_image_policy;
+
+                        r = image_policy_to_string(p, /* simplify= */ true, &t);
+                        if (r < 0)
+                                return r;
+
+                        image_policy_free(*pp);
+                        *pp = TAKE_PTR(p);
+
+                        unit_write_settingf(
+                                        u, flags, name,
+                                        "%s=%s",
+                                        name,
+                                        t); /* no escaping necessary */
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h
new file mode 100644
index 0000000..5926bdb
--- /dev/null
+++ b/src/core/dbus-execute.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "execute.h"
+
+#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) /* five ExecStatus property entries, names prepended with 'prefix' */ \
+        BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
+        BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
+        SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
+        SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
+        SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)
+
+#define BUS_EXEC_COMMAND_VTABLE(name, offset, flags) /* property served by bus_property_get_exec_command() */ \
+        SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command, offset, flags)
+/* Same D-Bus signature as BUS_EXEC_COMMAND_VTABLE, but served by the command list getter */
+#define BUS_EXEC_COMMAND_LIST_VTABLE(name, offset, flags)                    \
+        SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command_list, offset, flags)
+/* "Ex" variant: in its signature the third struct member is a string array ("as") instead of a boolean ("b") */
+#define BUS_EXEC_EX_COMMAND_LIST_VTABLE(name, offset, flags)                    \
+        SD_BUS_PROPERTY(name, "a(sasasttttuii)", bus_property_get_exec_ex_command_list, offset, flags)
+
+extern const sd_bus_vtable bus_exec_vtable[];
+/* Property getters; all share the same sd-bus style callback signature (bus, path, interface, property, reply, userdata, error) */
+int bus_property_get_exec_output(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_command(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_ex_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_preserve_mode(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+/* Setters reading a value for property 'name' from 'message'; NOTE(review): presumably only used for transient units, as the names suggest */
+int bus_exec_context_set_transient_property(Unit *u, ExecContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_exec_command(Unit *u, const char *name, ExecCommand **exec_command, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_exec_preserve_mode(Unit *u, const char *name, ExecPreserveMode *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c
new file mode 100644
index 0000000..c88d8c2
--- /dev/null
+++ b/src/core/dbus-job.c
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "dbus-job.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "job.h"
+#include "log.h"
+#include "selinux-access.h"
+#include "string-util.h"
+#include "strv.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, job_type, JobType); /* getter behind the job's "JobType" property */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_state, job_state, JobState); /* getter behind the job's "State" property */
+
+static int property_get_unit(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Getter for the job's "Unit" property: appends the pair (unit id, unit object path) to the reply */
+        Job *job = ASSERT_PTR(userdata);
+        _cleanup_free_ char *unit_path = NULL;
+
+        assert(bus);
+        assert(reply);
+
+        unit_path = unit_dbus_path(job->unit);
+        if (!unit_path)
+                return -ENOMEM;
+        return sd_bus_message_append(reply, "(so)", job->unit->id, unit_path);
+}
+
+int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Job *job = ASSERT_PTR(userdata);
+        const char *sender;
+        int r;
+
+        assert(message);
+
+        /* Cancel() D-Bus method: after the SELinux check, cancels the job for its owner or an authorized caller */
+        r = mac_selinux_unit_access_check(job->unit, message, "stop", error);
+        if (r < 0)
+                return r;
+
+        /* The job owner is always allowed, everybody else has to pass polkit */
+        sender = sd_bus_message_get_sender(message);
+        if (!sd_bus_track_contains(job->bus_track, sender)) {
+                r = bus_verify_manage_units_async(job->unit->manager, message, error);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* No authorization for now, but the async polkit stuff will call us again when it has it */
+                        return 1;
+        }
+
+        job_finish_and_invalidate(job, JOB_CANCELED, true, false);
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_free_ Job **list = NULL;
+        Job *j = ASSERT_PTR(userdata);
+        int r, n;
+
+        assert(message);
+
+        /* Shared handler for GetAfter() and GetBefore(): the member name of the incoming call selects the direction */
+        if (strstr(sd_bus_message_get_member(message), "After"))
+                n = job_get_after(j, &list);
+        else
+                n = job_get_before(j, &list);
+        if (n < 0)
+                return n;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(usssoo)");
+        if (r < 0)
+                return r;
+
+        for (int i = 0; i < n; i++) {
+                _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+
+                job_path = job_dbus_path(list[i]);
+                if (!job_path)
+                        return -ENOMEM;
+
+                unit_path = unit_dbus_path(list[i]->unit);
+                if (!unit_path)
+                        return -ENOMEM;
+
+                r = sd_bus_message_append(reply, "(usssoo)", /* job id, unit id, type, state, job path, unit path */
+                                          list[i]->id, list[i]->unit->id,
+                                          job_type_to_string(list[i]->type), job_state_to_string(list[i]->state),
+                                          job_path, unit_path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL); /* NULL bus: the reply goes out on the connection it is attached to */
+}
+
+const sd_bus_vtable bus_job_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        /* Methods, all flagged SD_BUS_VTABLE_UNPRIVILEGED; GetAfter() and GetBefore() share a single handler */
+        SD_BUS_METHOD("Cancel", NULL, NULL, bus_job_method_cancel, SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetAfter",
+                                 SD_BUS_NO_ARGS,
+                                 SD_BUS_RESULT("a(usssoo)", jobs),
+                                 bus_job_method_get_waiting_jobs,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetBefore",
+                                 SD_BUS_NO_ARGS,
+                                 SD_BUS_RESULT("a(usssoo)", jobs),
+                                 bus_job_method_get_waiting_jobs,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        /* Properties: all declared constant, except "State" which is declared to emit change notifications */
+        SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Job, id), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Unit", "(so)", property_get_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("JobType", "s", property_get_type, offsetof(Job, type), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("State", "s", property_get_state, offsetof(Job, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Job, activation_details), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_VTABLE_END
+};
+
+static int bus_job_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        Job *job;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        /* Object lookup callback: resolves a job object path to its Job. Returns 1 and sets *found on a hit;
+         * a failing lookup is reported as 0 (no such object) instead of being propagated as an error. */
+        if (manager_get_job_from_dbus_path(manager, path, &job) < 0)
+                return 0;
+
+        *found = job;
+        return 1;
+}
+
+static int bus_job_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+        _cleanup_strv_free_ char **l = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        unsigned k = 0;
+        Job *j;
+
+        assert(nodes);
+
+        /* Node enumerator: hands back the object paths of all current jobs; one slot per job plus a trailing NULL */
+        l = new0(char*, hashmap_size(m->jobs)+1);
+        if (!l)
+                return -ENOMEM;
+
+        HASHMAP_FOREACH(j, m->jobs) {
+                l[k] = job_dbus_path(j);
+                if (!l[k])
+                        return -ENOMEM;
+                k++;
+        }
+
+        assert(hashmap_size(m->jobs) == k);
+        *nodes = TAKE_PTR(l); /* ownership of the list passes to the caller */
+        return k;
+}
+
+const BusObjectImplementation job_object = {
+        "/org/freedesktop/systemd1/job",  /* object path; NOTE(review): presumably the prefix below which job objects are found */
+        "org.freedesktop.systemd1.Job",   /* interface name, same string as used when emitting property changes for jobs */
+        .fallback_vtables = BUS_FALLBACK_VTABLES({bus_job_vtable, bus_job_find}), /* vtable plus the path-to-Job lookup callback */
+        .node_enumerator = bus_job_enumerate,
+};
+
+static int send_new_signal(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        _cleanup_free_ char *job_path = NULL;
+        Job *job = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        /* Emits the Manager's JobNew signal for this job on the given bus: job id, job object path, unit id */
+        job_path = job_dbus_path(job);
+        if (!job_path)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_signal(bus, &msg,
+                                      "/org/freedesktop/systemd1",
+                                      "org.freedesktop.systemd1.Manager",
+                                      "JobNew");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(msg, "uos",
+                                  job->id, job_path, job->unit->id);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, msg, NULL);
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+        _cleanup_free_ char *job_path = NULL;
+        Job *job = ASSERT_PTR(userdata);
+
+        assert(bus);
+
+        /* Announces a change of the job's "State" property on the given bus */
+        job_path = job_dbus_path(job);
+        if (!job_path)
+                return -ENOMEM;
+        return sd_bus_emit_properties_changed(bus, job_path, "org.freedesktop.systemd1.Job", "State", NULL);
+}
+
+void bus_job_send_change_signal(Job *j) {
+        int (*send_signal)(sd_bus *bus, void *userdata);
+        int r;
+
+        assert(j);
+
+        /* Any pending change signal on the unit has to go out before the signal on the job does */
+        bus_unit_send_pending_change_signal(j->unit, true);
+
+        if (j->in_dbus_queue) {
+                LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
+                j->in_dbus_queue = false;
+                job_add_to_gc_queue(j); /* with its pending signals sent, the job might be good to be GC'ed */
+        }
+
+        /* The first signal sent for a job is JobNew, every later one is a property change notification */
+        send_signal = j->sent_dbus_new_signal ? send_changed_signal : send_new_signal;
+        r = bus_foreach_bus(j->manager, j->bus_track, send_signal, j);
+        if (r < 0)
+                log_debug_errno(r, "Failed to send job change signal for %u: %m", j->id);
+        j->sent_dbus_new_signal = true;
+}
+
+void bus_job_send_pending_change_signal(Job *j, bool including_new) {
+        assert(j);
+
+        /* Sends the job's queued signal right away, if there is one. Nothing is sent if the job is not in
+         * the D-Bus queue, if its initial signal has not been sent yet and including_new is false, or
+         * while the manager is reloading. */
+
+        if (!j->in_dbus_queue)
+                return;
+        if (!(j->sent_dbus_new_signal || including_new))
+                return;
+        if (!MANAGER_IS_RELOADING(j->unit->manager))
+                bus_job_send_change_signal(j);
+}
+
+/* Per-bus callback: sends the Manager's "JobRemoved" signal for the job passed as userdata. The
+ * signal carries the job ID, the job object path, the unit ID and the job result as a string. */
+static int send_removed_signal(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *sig = NULL;
+        _cleanup_free_ char *path = NULL;
+        Job *job = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        path = job_dbus_path(job);
+        if (!path)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_signal(
+                        bus,
+                        &sig,
+                        "/org/freedesktop/systemd1",
+                        "org.freedesktop.systemd1.Manager",
+                        "JobRemoved");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(
+                        sig, "uoss",
+                        job->id,
+                        path,
+                        job->unit->id,
+                        job_result_to_string(job->result));
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, sig, NULL);
+}
+
+/* Announces the removal of a job on the bus. If the job has not been announced at all so far, its
+ * pending change signal is sent first. Failures are only logged at debug level. */
+void bus_job_send_removed_signal(Job *j) {
+        int r;
+
+        assert(j);
+
+        /* Make sure the job was announced before we declare it removed */
+        if (!j->sent_dbus_new_signal)
+                bus_job_send_change_signal(j);
+
+        /* Make sure that any change signal on the unit is reflected before we send out the removed signal on the job */
+        bus_unit_send_pending_change_signal(j->unit, true);
+
+        r = bus_foreach_bus(j->manager, j->bus_track, send_removed_signal, j);
+        if (r >= 0)
+                return;
+
+        log_debug_errno(r, "Failed to send job remove signal for %u: %m", j->id);
+}
+
+/* Callback installed on the job's sd_bus_track object (see bus_job_allocate_bus_track()). Releases the
+ * tracking object and queues the job for a GC check. Always returns 0. */
+static int bus_job_track_handler(sd_bus_track *t, void *userdata) {
+        Job *job = ASSERT_PTR(userdata);
+
+        assert(t);
+
+        /* Drop the tracker, so that this handler is not invoked a second time */
+        job->bus_track = sd_bus_track_unref(job->bus_track);
+
+        /* The last client dropped off the bus, the job might be collectable now */
+        job_add_to_gc_queue(job);
+
+        return 0;
+}
+
+/* Lazily creates the job's client tracking object on the manager's API bus. Returns 0 if a tracker
+ * already exists, otherwise the result of sd_bus_track_new(). */
+static int bus_job_allocate_bus_track(Job *j) {
+        assert(j);
+
+        if (!j->bus_track)
+                return sd_bus_track_new(
+                                j->unit->manager->api_bus,
+                                &j->bus_track,
+                                bus_job_track_handler,
+                                j);
+
+        return 0;
+}
+
+/* Re-registers the deserialized client names with the job's bus tracker. The deserialized list is
+ * taken from the job and freed in any case. Returns 0 without doing anything if the list is empty or
+ * the manager has no API bus connection. */
+int bus_job_coldplug_bus_track(Job *j) {
+        _cleanup_strv_free_ char **clients = NULL;
+        int r;
+
+        assert(j);
+
+        /* Take ownership first, so that the list is released on every return path */
+        clients = TAKE_PTR(j->deserialized_clients);
+
+        if (strv_isempty(clients) || !j->manager->api_bus)
+                return 0;
+
+        r = bus_job_allocate_bus_track(j);
+        if (r < 0)
+                return r;
+
+        return bus_track_add_name_many(j->bus_track, clients);
+}
+
+/* Starts tracking the sender of message 'm' as a client of job 'j'. Messages that did not arrive on
+ * the manager's API bus are not tracked by name; for those only the job's ref_by_private_bus flag is
+ * set and 0 is returned. */
+int bus_job_track_sender(Job *j, sd_bus_message *m) {
+        bool via_api_bus;
+        int r;
+
+        assert(j);
+        assert(m);
+
+        via_api_bus = sd_bus_message_get_bus(m) == j->unit->manager->api_bus;
+        if (!via_api_bus) {
+                j->ref_by_private_bus = true;
+                return 0;
+        }
+
+        r = bus_job_allocate_bus_track(j);
+        if (r < 0)
+                return r;
+
+        return sd_bus_track_add_sender(j->bus_track, m);
+}
diff --git a/src/core/dbus-job.h b/src/core/dbus-job.h
new file mode 100644
index 0000000..6f00581
--- /dev/null
+++ b/src/core/dbus-job.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "unit.h"
+#include "bus-object.h"
+
+/* D-Bus vtable and object implementation descriptor for Job objects */
+extern const sd_bus_vtable bus_job_vtable[];
+extern const BusObjectImplementation job_object;
+
+/* D-Bus method handlers; they share the sd-bus method callback signature */
+int bus_job_method_cancel(sd_bus_message *message, void *job, sd_bus_error *error);
+int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+/* Signal emission: the first call of bus_job_send_change_signal() announces the job, later calls
+ * report property changes. The _pending_ variant only sends if the job is queued for a signal. */
+void bus_job_send_change_signal(Job *j);
+void bus_job_send_pending_change_signal(Job *j, bool including_new);
+void bus_job_send_removed_signal(Job *j);
+
+/* Tracking of bus clients that reference a job */
+int bus_job_coldplug_bus_track(Job *j);
+int bus_job_track_sender(Job *j, sd_bus_message *m);
diff --git a/src/core/dbus-kill.c b/src/core/dbus-kill.c
new file mode 100644
index 0000000..19e439f
--- /dev/null
+++ b/src/core/dbus-kill.c
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "dbus-kill.h"
+#include "dbus-util.h"
+#include "kill.h"
+#include "signal-util.h"
+
+/* Generates property_get_kill_mode(), the getter used for the "KillMode" property in the vtable below */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_kill_mode, kill_mode, KillMode);
+
+/* Property getter for "RestartKillSignal": appends the value computed by restart_kill_signal() for
+ * the KillContext passed as userdata to the reply, as a signed 32-bit integer. */
+static int property_get_restart_kill_signal(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        KillContext *context = ASSERT_PTR(userdata);
+        int sig = restart_kill_signal(context);
+
+        return sd_bus_message_append_basic(reply, 'i', &sig);
+}
+
+/* Read-only D-Bus properties exposing the fields of a KillContext. All properties are declared
+ * constant. Most are read directly at the given offset within the KillContext; "RestartKillSignal"
+ * is computed by its own getter and hence uses offset 0. */
+const sd_bus_vtable bus_kill_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("KillMode", "s", property_get_kill_mode, offsetof(KillContext, kill_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KillSignal", "i", bus_property_get_int, offsetof(KillContext, kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartKillSignal", "i", property_get_restart_kill_signal, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FinalKillSignal", "i", bus_property_get_int, offsetof(KillContext, final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SendSIGKILL", "b", bus_property_get_bool, offsetof(KillContext, send_sigkill), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SendSIGHUP", "b", bus_property_get_bool,  offsetof(KillContext, send_sighup), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WatchdogSignal", "i", bus_property_get_int, offsetof(KillContext, watchdog_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_VTABLE_END
+};
+
+/* Generate the bus_set_transient_*() helpers that bus_kill_context_set_transient_property() below
+ * dispatches to: one for the kill mode, and one for each of the four signal settings. */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(kill_mode, KillMode, kill_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(restart_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(final_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(watchdog_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+
+/* Applies a transient property assignment to the KillContext 'c' of unit 'u'. The property value is
+ * read from 'message'. UNIT_PRIVATE is always added to 'flags'. Returns 0 if 'name' is not one of the
+ * properties handled here, otherwise the result of the matching bus_set_transient_*() helper. */
+int bus_kill_context_set_transient_property(
+                Unit *u,
+                KillContext *c,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        assert(u);
+        assert(c);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "KillMode"))
+                return bus_set_transient_kill_mode(u, name, &c->kill_mode, message, flags, error);
+        else if (streq(name, "SendSIGHUP"))
+                return bus_set_transient_bool(u, name, &c->send_sighup, message, flags, error);
+        else if (streq(name, "SendSIGKILL"))
+                return bus_set_transient_bool(u, name, &c->send_sigkill, message, flags, error);
+        else if (streq(name, "KillSignal"))
+                return bus_set_transient_kill_signal(u, name, &c->kill_signal, message, flags, error);
+        else if (streq(name, "RestartKillSignal"))
+                return bus_set_transient_restart_kill_signal(u, name, &c->restart_kill_signal, message, flags, error);
+        else if (streq(name, "FinalKillSignal"))
+                return bus_set_transient_final_kill_signal(u, name, &c->final_kill_signal, message, flags, error);
+        else if (streq(name, "WatchdogSignal"))
+                return bus_set_transient_watchdog_signal(u, name, &c->watchdog_signal, message, flags, error);
+
+        /* Not a kill context property */
+        return 0;
+}
diff --git a/src/core/dbus-kill.h b/src/core/dbus-kill.h
new file mode 100644
index 0000000..5a90287
--- /dev/null
+++ b/src/core/dbus-kill.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "kill.h"
+#include "unit.h"
+
+/* Read-only D-Bus properties of a KillContext */
+extern const sd_bus_vtable bus_kill_vtable[];
+
+/* Sets a transient KillContext property from a bus message; returns 0 if 'name' is not handled */
+int bus_kill_context_set_transient_property(Unit *u, KillContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c
new file mode 100644
index 0000000..745f5cc
--- /dev/null
+++ b/src/core/dbus-manager.c
@@ -0,0 +1,3628 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "build.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-log-control-api.h"
+#include "chase.h"
+#include "confidential-virt.h"
+#include "data-fd-util.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-scope.h"
+#include "dbus-service.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "initrd-util.h"
+#include "install.h"
+#include "log.h"
+#include "manager-dump.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "user-util.h"
+#include "version.h"
+#include "virt.h"
+#include "watchdog.h"
+
+/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state
+ * there, and if we can't we'll fail badly. */
+#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024))
+
+/* Translates the two booleans into the corresponding UnitFileFlags bit mask */
+static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) {
+        UnitFileFlags flags = 0;
+
+        if (runtime)
+                flags |= UNIT_FILE_RUNTIME;
+        if (force)
+                flags |= UNIT_FILE_FORCE;
+
+        return flags;
+}
+
+/* Non-static enum getters; the remaining getters in this list are private to this file */
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_emergency_action, emergency_action, EmergencyAction);
+
+/* Macro-generated property getters. For each one the second argument is the D-Bus type signature of
+ * the property, and the remaining arguments name the expression or function that supplies the value. */
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", systemd_features);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture()));
+static BUS_DEFINE_PROPERTY_GET2(property_get_system_state, "s", Manager, manager_state, manager_state_to_string);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_timer_slack_nsec, "t", (uint64_t) prctl(PR_GET_TIMERSLACK));
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "u", Hashmap *, hashmap_size);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_set_size, "u", Set *, set_size);
+static BUS_DEFINE_PROPERTY_GET(property_get_default_timeout_abort_usec, "t", Manager, manager_default_timeout_abort_usec);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_device, "s", watchdog_get_device());
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_realtime, "t", watchdog_get_last_ping(CLOCK_REALTIME));
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_monotonic, "t", watchdog_get_last_ping(CLOCK_MONOTONIC));
+static BUS_DEFINE_PROPERTY_GET(property_get_progress, "d", Manager, manager_get_progress);
+
+/* Property getter: appends the name of the detected virtualization to the reply. NULL is passed for
+ * the string if no virtualization is detected. */
+static int property_get_virtualization(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        const char *name;
+        Virtualization v;
+
+        assert(bus);
+        assert(reply);
+
+        v = detect_virtualization();
+
+        /* The API is to return the empty string if no virtualization is detected, hence suppress the
+         * name in that case.
+         *
+         * https://github.com/systemd/systemd/issues/1423
+         */
+        if (v == VIRTUALIZATION_NONE)
+                name = NULL;
+        else
+                name = virtualization_to_string(v);
+
+        return sd_bus_message_append(reply, "s", name);
+}
+
+/* Property getter: appends the name of the detected confidential virtualization technology to the
+ * reply. NULL is passed for the string if the detection result is zero or negative. */
+static int property_get_confidential_virtualization(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ConfidentialVirtualization v;
+        const char *name = NULL;
+
+        assert(bus);
+        assert(reply);
+
+        v = detect_confidential_virtualization();
+        if (v > 0)
+                name = confidential_virtualization_to_string(v);
+
+        return sd_bus_message_append(reply, "s", name);
+}
+
+/* Property getter: appends the string generated by manager_taint_string() to the reply */
+static int property_get_tainted(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        _cleanup_free_ char *taint = NULL;
+
+        assert(bus);
+        assert(reply);
+
+        taint = manager_taint_string(m);
+        if (!taint)
+                return log_oom();
+
+        return sd_bus_message_append(reply, "s", taint);
+}
+
+/* Property setter for the log target. An empty string restores the original log target; any other
+ * value is parsed with log_target_from_string() and installed as override. Unknown target names are
+ * refused with SD_BUS_ERROR_INVALID_ARGS. */
+static int property_set_log_target(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Validate userdata like the other property handlers in this file do */
+        Manager *m = ASSERT_PTR(userdata);
+        const char *t;
+        int r;
+
+        assert(bus);
+        assert(value);
+
+        r = sd_bus_message_read(value, "s", &t);
+        if (r < 0)
+                return r;
+
+        if (isempty(t))
+                manager_restore_original_log_target(m);
+        else {
+                LogTarget target;
+
+                target = log_target_from_string(t);
+                if (target < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t);
+
+                manager_override_log_target(m, target);
+        }
+
+        return 0;
+}
+
+/* Property setter for the log level. An empty string restores the original log level; any other
+ * value is parsed with log_level_from_string() and installed as override. Unknown level names are
+ * refused with SD_BUS_ERROR_INVALID_ARGS. */
+static int property_set_log_level(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Validate userdata like the other property handlers in this file do */
+        Manager *m = ASSERT_PTR(userdata);
+        const char *t;
+        int r;
+
+        assert(bus);
+        assert(value);
+
+        r = sd_bus_message_read(value, "s", &t);
+        if (r < 0)
+                return r;
+
+        if (isempty(t))
+                manager_restore_original_log_level(m);
+        else {
+                int level;
+
+                level = log_level_from_string(t);
+                if (level < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t);
+
+                manager_override_log_level(m, level);
+        }
+
+        return 0;
+}
+
+/* Property getter: appends the string list produced by manager_get_effective_environment() to the
+ * reply. The list is freed before returning. */
+static int property_get_environment(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        _cleanup_strv_free_ char **env = NULL;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = manager_get_effective_environment(m, &env);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_append_strv(reply, env);
+}
+
+/* Property getter: appends the result of manager_get_show_status_on() to the reply as a boolean */
+static int property_get_show_status(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        int on;
+
+        assert(bus);
+        assert(reply);
+
+        on = manager_get_show_status_on(m);
+
+        return sd_bus_message_append(reply, "b", on);
+}
+
+/* Property getter: appends the manager's WATCHDOG_RUNTIME value to the reply as a 64-bit unsigned integer */
+static int property_get_runtime_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        usec_t t;
+
+        assert(bus);
+        assert(reply);
+
+        t = manager_get_watchdog(m, WATCHDOG_RUNTIME);
+
+        return sd_bus_message_append(reply, "t", t);
+}
+
+/* Property getter: appends the manager's WATCHDOG_PRETIMEOUT value to the reply as a 64-bit unsigned integer */
+static int property_get_pretimeout_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        usec_t t;
+
+        assert(bus);
+        assert(reply);
+
+        t = manager_get_watchdog(m, WATCHDOG_PRETIMEOUT);
+
+        return sd_bus_message_append(reply, "t", t);
+}
+
+/* Property getter: appends the manager's watchdog_pretimeout_governor string to the reply */
+static int property_get_pretimeout_watchdog_governor(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        const char *governor;
+
+        assert(bus);
+        assert(reply);
+
+        governor = m->watchdog_pretimeout_governor;
+
+        return sd_bus_message_append(reply, "s", governor);
+}
+
+/* Property getter: appends the manager's WATCHDOG_REBOOT value to the reply as a 64-bit unsigned integer */
+static int property_get_reboot_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        usec_t t;
+
+        assert(bus);
+        assert(reply);
+
+        t = manager_get_watchdog(m, WATCHDOG_REBOOT);
+
+        return sd_bus_message_append(reply, "t", t);
+}
+
+/* Property getter: appends the manager's WATCHDOG_KEXEC value to the reply as a 64-bit unsigned integer */
+static int property_get_kexec_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        usec_t t;
+
+        assert(bus);
+        assert(reply);
+
+        t = manager_get_watchdog(m, WATCHDOG_KEXEC);
+
+        return sd_bus_message_append(reply, "t", t);
+}
+
+/* Shared implementation of the watchdog property setters: reads a 64-bit unsigned value from 'value'
+ * and installs it as override for the watchdog of the given type. */
+static int property_set_watchdog(Manager *m, WatchdogType type, sd_bus_message *value) {
+        usec_t usec;
+        int r;
+
+        assert(m);
+        assert(value);
+
+        /* The "t" type below writes a uint64_t, make sure usec_t can take it */
+        assert_cc(sizeof(usec_t) == sizeof(uint64_t));
+
+        r = sd_bus_message_read(value, "t", &usec);
+        if (r < 0)
+                return r;
+
+        manager_override_watchdog(m, type, usec);
+
+        return 0;
+}
+
+/* Property setter: forwards to property_set_watchdog() for WATCHDOG_RUNTIME */
+static int property_set_runtime_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = userdata;
+
+        return property_set_watchdog(m, WATCHDOG_RUNTIME, value);
+}
+
+/* Property setter: forwards to property_set_watchdog() for WATCHDOG_PRETIMEOUT */
+static int property_set_pretimeout_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = userdata;
+
+        return property_set_watchdog(m, WATCHDOG_PRETIMEOUT, value);
+}
+
+/* Property setter for the watchdog pretimeout governor. Reads a string from 'value', refuses it with
+ * SD_BUS_ERROR_INVALID_ARGS if string_is_safe() rejects it, and otherwise passes it on to
+ * manager_override_watchdog_pretimeout_governor(), whose result is returned. */
+static int property_set_pretimeout_watchdog_governor(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        const char *governor; /* points into the message, must not be modified or freed */
+        int r;
+
+        r = sd_bus_message_read(value, "s", &governor);
+        if (r < 0)
+                return r;
+        if (!string_is_safe(governor))
+                /* Do not echo the rejected string back, it failed the safety check after all */
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Watchdog pretimeout governor contains unsafe characters.");
+
+        return manager_override_watchdog_pretimeout_governor(m, governor);
+}
+
+/* Property setter: forwards to property_set_watchdog() for WATCHDOG_REBOOT */
+static int property_set_reboot_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = userdata;
+
+        return property_set_watchdog(m, WATCHDOG_REBOOT, value);
+}
+
+/* Property setter: forwards to property_set_watchdog() for WATCHDOG_KEXEC */
+static int property_set_kexec_watchdog(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(value);
+
+        return property_set_watchdog(m, WATCHDOG_KEXEC, value);
+}
+
+/* Property getter for the OOM score adjustment. The configured default is reported if one is set.
+ * Otherwise the value is queried with get_oom_score_adjust(); if that fails the failure is logged at
+ * debug level and ignored. */
+static int property_get_oom_score_adjust(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        int n = 0, r;
+
+        assert(bus);
+        assert(reply);
+
+        if (m->defaults.oom_score_adjust_set)
+                return sd_bus_message_append(reply, "i", m->defaults.oom_score_adjust);
+
+        r = get_oom_score_adjust(&n);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read current OOM score adjustment value, ignoring: %m");
+
+        return sd_bus_message_append(reply, "i", n);
+}
+
+/* Looks up an already loaded unit by name and stores it in *ret_unit. If 'name' is empty, the unit
+ * the sender of 'message' belongs to is looked up instead, via the sender's PID. Sets
+ * BUS_ERROR_NO_SUCH_UNIT in 'error' if no unit is found. */
+static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        pid_t pid;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(message);
+        assert(ret_unit);
+
+        /* The regular case: a name was specified, look it up directly */
+        if (!isempty(name)) {
+                u = manager_get_unit(m, name);
+                if (!u)
+                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", name);
+
+                *ret_unit = u;
+                return 0;
+        }
+
+        /* The special case: no name was specified, hence use the unit of the client */
+        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_creds_get_pid(creds, &pid);
+        if (r < 0)
+                return r;
+
+        u = manager_get_unit_by_pid(m, pid);
+        if (!u)
+                return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit.");
+
+        *ret_unit = u;
+        return 0;
+}
+
+/* Like bus_get_unit_by_name(), but a unit specified by name is loaded via manager_load_unit(). An
+ * empty name is passed on to bus_get_unit_by_name() unchanged. */
+static int bus_load_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
+        assert(m);
+        assert(message);
+        assert(ret_unit);
+
+        if (!isempty(name))
+                return manager_load_unit(m, name, NULL, error, ret_unit);
+
+        return bus_get_unit_by_name(m, message, name, ret_unit, error);
+}
+
+/* Replies to 'message' with the D-Bus object path of unit 'u', after the "status" access check on
+ * the unit has passed. */
+static int reply_unit_path(Unit *u, sd_bus_message *message, sd_bus_error *error) {
+        _cleanup_free_ char *p = NULL;
+        int r;
+
+        assert(u);
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "status", error);
+        if (r < 0)
+                return r;
+
+        p = unit_dbus_path(u);
+        if (!p)
+                return log_oom();
+
+        return sd_bus_reply_method_return(message, "o", p);
+}
+
+/* D-Bus method handler: reads a unit name from the message and replies with the object path of the
+ * matching loaded unit, see bus_get_unit_by_name(). */
+static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        const char *unit_name;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = sd_bus_message_read(message, "s", &unit_name);
+        if (r < 0)
+                return r;
+
+        r = bus_get_unit_by_name(m, message, unit_name, &unit, error);
+        if (r < 0)
+                return r;
+
+        return reply_unit_path(unit, message, error);
+}
+
+/* D-Bus method handler: replies with the object path of the unit the specified PID belongs to. A PID
+ * of 0 refers to the sender of the message. Negative PIDs are refused with
+ * SD_BUS_ERROR_INVALID_ARGS, PIDs without unit with BUS_ERROR_NO_UNIT_FOR_PID. */
+static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *unit;
+        pid_t pid;
+        int r;
+
+        assert(message);
+
+        /* The "u" type below writes a uint32_t, make sure pid_t can take it */
+        assert_cc(sizeof(pid_t) == sizeof(uint32_t));
+
+        /* Anyone can call this method */
+
+        r = sd_bus_message_read(message, "u", &pid);
+        if (r < 0)
+                return r;
+        if (pid < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid);
+
+        if (pid == 0) {
+                /* No PID specified, use the one of the sender */
+                r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_creds_get_pid(creds, &pid);
+                if (r < 0)
+                        return r;
+        }
+
+        unit = manager_get_unit_by_pid(m, pid);
+        if (unit)
+                return reply_unit_path(unit, message, error);
+
+        return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid);
+}
+
+/* D-Bus method handler: looks up a unit by its invocation ID, passed as a byte array of either 0 or
+ * 16 bytes. An empty array or the all-zero ID refers to the unit of the sender. Replies with the
+ * invocation ID based object path of the unit, after the "status" access check has passed. */
+static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *p = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const void *bytes;
+        sd_id128_t id;
+        size_t n_bytes;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = sd_bus_message_read_array(message, 'y', &bytes, &n_bytes);
+        if (r < 0)
+                return r;
+
+        switch (n_bytes) {
+
+        case 0:
+                id = SD_ID128_NULL;
+                break;
+
+        case 16:
+                memcpy(&id, bytes, n_bytes);
+                break;
+
+        default:
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID");
+        }
+
+        if (!sd_id128_is_null(id)) {
+                unit = hashmap_get(m->units_by_invocation_id, &id);
+                if (!unit)
+                        return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(id));
+        } else {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+                pid_t pid;
+
+                /* No ID specified, use the unit of the sender */
+                r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_creds_get_pid(creds, &pid);
+                if (r < 0)
+                        return r;
+
+                unit = manager_get_unit_by_pid(m, pid);
+                if (!unit)
+                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+                                                 "Client " PID_FMT " not member of any unit.", pid);
+        }
+
+        r = mac_selinux_unit_access_check(unit, message, "status", error);
+        if (r < 0)
+                return r;
+
+        /* Note that the bus path returned here references the unit by its invocation ID rather than
+         * by its name, which means it remains valid only for as long as the invocation ID remains the
+         * same. */
+        p = unit_dbus_path_invocation_id(unit);
+        if (!p)
+                return -ENOMEM;
+
+        return sd_bus_reply_method_return(message, "o", p);
+}
+
+/* D-Bus method handler: reads a control group path from the message and replies with the object
+ * path of the unit that manager_get_unit_by_cgroup() finds for it. Sets BUS_ERROR_NO_SUCH_UNIT in
+ * 'error' if no unit is found. */
+static int method_get_unit_by_control_group(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Validate the arguments like the other method_get_unit*() handlers do */
+        Manager *m = ASSERT_PTR(userdata);
+        const char *cgroup;
+        Unit *u;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "s", &cgroup);
+        if (r < 0)
+                return r;
+
+        u = manager_get_unit_by_cgroup(m, cgroup);
+        if (!u)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+                                         "Control group '%s' is not valid or not managed by this instance",
+                                         cgroup);
+
+        return reply_unit_path(u, message, error);
+}
+
+/* D-Bus method handler: looks up the unit of the process referenced by the PIDFD passed in the
+ * message. The reply carries the unit's object path, the unit ID and the unit's invocation ID as a
+ * byte array. The process reference is verified once more right before the reply is sent. */
+static int method_get_unit_by_pidfd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        _cleanup_free_ char *path = NULL;
+        int r, pidfd;
+        Unit *u;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "h", &pidfd);
+        if (r < 0)
+                return r;
+
+        /* Turn the file descriptor into a process reference we can use for the lookup */
+        r = pidref_set_pidfd(&pidref, pidfd);
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to get PID from PIDFD: %m");
+
+        u = manager_get_unit_by_pidref(m, &pidref);
+        if (!u)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pidref.pid);
+
+        r = mac_selinux_unit_access_check(u, message, "status", error);
+        if (r < 0)
+                return r;
+
+        path = unit_dbus_path(u);
+        if (!path)
+                return log_oom();
+
+        /* Build the reply first, but do not send it before the verification below has passed */
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(reply, "os", path, u->id);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_array(reply, 'y', u->invocation_id.bytes, sizeof(u->invocation_id.bytes));
+        if (r < 0)
+                return r;
+
+        /* Double-check that the process is still alive and that the PID did not change before returning the
+         * answer. */
+        r = pidref_verify(&pidref);
+        if (r == -ESRCH)
+                return sd_bus_error_setf(error,
+                                         BUS_ERROR_NO_SUCH_PROCESS,
+                                         "The PIDFD's PID "PID_FMT" changed during the lookup operation.",
+                                         pidref.pid);
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to get PID from PIDFD: %m");
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus method handler: reads a unit name from the message and replies with the object path of the
+ * unit found by bus_load_unit_by_name(). */
+static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        const char *unit_name;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = sd_bus_message_read(message, "s", &unit_name);
+        if (r < 0)
+                return r;
+
+        r = bus_load_unit_by_name(m, message, unit_name, &unit, error);
+        if (r < 0)
+                return r;
+
+        return reply_unit_path(unit, message, error);
+}
+
+/* Common implementation of the unit start/stop/reload/restart method handlers: reads the unit name
+ * from the message, loads the unit and hands it to bus_unit_method_start_generic() along with the
+ * requested job type and the reload_if_possible flag. */
+static int method_start_unit_generic(sd_bus_message *message, Manager *m, JobType job_type, bool reload_if_possible, sd_bus_error *error) {
+        const char *unit_name;
+        Unit *unit;
+        int r;
+
+        assert(message);
+        assert(m);
+
+        r = sd_bus_message_read(message, "s", &unit_name);
+        if (r < 0)
+                return r;
+
+        r = manager_load_unit(m, unit_name, NULL, error, &unit);
+        if (r < 0)
+                return r;
+
+        return bus_unit_method_start_generic(message, unit, job_type, reload_if_possible, error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_START, without reload substitution. */
+static int method_start_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_START,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_STOP, without reload substitution. */
+static int method_stop_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_STOP,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_RELOAD. */
+static int method_reload_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_RELOAD,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_RESTART, without reload substitution. */
+static int method_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_RESTART,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_TRY_RESTART, without reload
+ * substitution. */
+static int method_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_TRY_RESTART,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_RESTART and reload_if_possible
+ * enabled. */
+static int method_reload_or_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_RESTART,
+                        /* reload_if_possible = */ true,
+                        error);
+}
+
+/* Bus method handler: forwards to method_start_unit_generic() with JOB_TRY_RESTART and reload_if_possible
+ * enabled. */
+static int method_reload_or_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_start_unit_generic(
+                        message,
+                        userdata,
+                        JOB_TRY_RESTART,
+                        /* reload_if_possible = */ true,
+                        error);
+}
+
+/* Flags telling method_generic_unit_operation() how to resolve and check the unit before dispatching. */
+typedef enum GenericUnitOperationFlags {
+        /* Load if the unit is not loaded yet */
+        GENERIC_UNIT_LOAD            = 1 << 0,
+
+        /* Verify unit is properly loaded before forwarding call */
+        GENERIC_UNIT_VALIDATE_LOADED = 1 << 1,
+} GenericUnitOperationFlags;
+
+/* Reads the unit name (the first argument of the call), resolves it to a Unit as directed by 'flags' and
+ * then passes the call on to the per-unit 'handler'. Returns whatever the handler returns, or a negative
+ * error if reading, resolving or validating fails. */
+static int method_generic_unit_operation(
+                sd_bus_message *message,
+                Manager *m,
+                sd_bus_error *error,
+                sd_bus_message_handler_t handler,
+                GenericUnitOperationFlags flags) {
+
+        const char *unit_name;
+        Unit *unit;
+        int r;
+
+        assert(message);
+        assert(m);
+
+        r = sd_bus_message_read(message, "s", &unit_name);
+        if (r < 0)
+                return r;
+
+        /* Loading happens only on request and only for a non-empty name; in every other case the unit is
+         * looked up with bus_get_unit_by_name(). */
+        if (FLAGS_SET(flags, GENERIC_UNIT_LOAD) && !isempty(unit_name))
+                r = manager_load_unit(m, unit_name, NULL, error, &unit);
+        else
+                r = bus_get_unit_by_name(m, message, unit_name, &unit, error);
+        if (r < 0)
+                return r;
+
+        if (FLAGS_SET(flags, GENERIC_UNIT_VALIDATE_LOADED)) {
+                r = bus_unit_validate_load_state(unit, error);
+                if (r < 0)
+                        return r;
+        }
+
+        return handler(message, unit, error);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_enqueue_job(), loading the unit if needed. */
+static int method_enqueue_unit_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* GENERIC_UNIT_VALIDATE_LOADED is left out on purpose: the job logic validates that anyway */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_enqueue_job,
+                        GENERIC_UNIT_LOAD);
+}
+
+/* Bus method handler: reads the name of an existing unit and, only if that unit currently has a job of
+ * type JOB_START, processes the remainder of the message through method_start_unit_generic(). */
+static int method_start_unit_replace(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        const char *old_name;
+        Unit *old_unit;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "s", &old_name);
+        if (r < 0)
+                return r;
+
+        r = bus_get_unit_by_name(m, message, old_name, &old_unit, error);
+        if (r < 0)
+                return r;
+
+        /* The unit named first must have a start job attached, otherwise there is nothing to replace */
+        if (!(old_unit->job && old_unit->job->type == JOB_START))
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "No job queued for unit %s", old_name);
+
+        return method_start_unit_generic(message, m, JOB_START, /* reload_if_possible = */ false, error);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_kill() without loading or validating the unit. */
+static int method_kill_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Neither GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED is requested here, as it shouldn't
+         * matter whether a unit is loaded for killing any processes possibly in the unit's cgroup. */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_kill,
+                        /* flags = */ 0);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_clean() on a loaded and validated unit. */
+static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Load the unit if it is not loaded yet, and insist on it being properly loaded before it is
+         * cleaned */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_clean,
+                        GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_freeze() without loading or validating the unit. */
+static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_freeze,
+                        /* flags = */ 0);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_thaw() without loading or validating the unit. */
+static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_thaw,
+                        /* flags = */ 0);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_reset_failed() without loading or validating the unit. */
+static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* The unit is not loaded (because unloaded units can't be in failed state), and it is not required
+         * to be loaded properly either (since a failed unit might have its unit file disappeared) */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_reset_failed,
+                        /* flags = */ 0);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_set_properties() on a loaded and validated unit. */
+static int method_set_unit_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Properties are only changed on fully loaded units, and units are loaded in order to set them */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_set_properties,
+                        GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Bus method handler: dispatches to bus_service_method_bind_mount() on a validated unit. */
+static int method_bind_mount_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Mounts are only added on fully loaded units */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_service_method_bind_mount,
+                        GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Bus method handler: dispatches to bus_service_method_mount_image() on a validated unit. */
+static int method_mount_image_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Mounts are only added on fully loaded units */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_service_method_mount_image,
+                        GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_ref() on a loaded and validated unit. */
+static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Reffing is only allowed for fully loaded units, and reffing a unit loads it. */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_ref,
+                        GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_unref() without loading or validating the unit. */
+static int method_unref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Dropping a ref, on the other hand, should not require the unit to still be loaded. And since a
+         * reffed unit is a loaded unit there's no need to load the unit for unreffing it. */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_unref,
+                        /* flags = */ 0);
+}
+
+/* Appends one "(ssssssouso)" structure describing unit 'u' to 'reply': id, description, load/active/sub
+ * state, id of the unit it follows (or ""), unit object path, and id, type and object path of its job
+ * (0, "" and "/" when the unit has no job). Returns -ENOMEM if an object path cannot be allocated,
+ * otherwise the result of sd_bus_message_append(). */
+static int reply_unit_info(sd_bus_message *reply, Unit *u) {
+        _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+        const char *following_id = "", *job_type = "";
+        uint32_t job_id = 0;
+        Unit *following;
+
+        following = unit_following(u);
+        if (following)
+                following_id = following->id;
+
+        unit_path = unit_dbus_path(u);
+        if (!unit_path)
+                return -ENOMEM;
+
+        if (u->job) {
+                job_path = job_dbus_path(u->job);
+                if (!job_path)
+                        return -ENOMEM;
+
+                job_id = u->job->id;
+                job_type = job_type_to_string(u->job->type);
+        }
+
+        return sd_bus_message_append(
+                        reply, "(ssssssouso)",
+                        u->id,
+                        unit_description(u),
+                        unit_load_state_to_string(u->load_state),
+                        unit_active_state_to_string(unit_active_state(u)),
+                        unit_sub_state_to_string(u),
+                        following_id,
+                        unit_path,
+                        job_id,
+                        job_type,
+                        empty_to_root(job_path));
+}
+
+/* Bus method handler: reads an array of unit names and replies with an array of "(ssssssouso)" entries,
+ * one per syntactically valid name, each produced by loading the unit and calling reply_unit_info().
+ * Invalid names are skipped silently; a load failure aborts the whole call. */
+static int method_list_units_by_names(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        _cleanup_strv_free_ char **names = NULL;
+
+        assert(message);
+
+        r = sd_bus_message_read_strv(message, &names);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)");
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(n, names) {
+                Unit *loaded;
+
+                if (!unit_name_is_valid(*n, UNIT_NAME_ANY))
+                        continue;
+
+                r = bus_load_unit_by_name(m, message, *n, &loaded, error);
+                if (r >= 0)
+                        r = reply_unit_info(reply, loaded);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_get_processes() without loading or validating the unit. */
+static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* The unit is not loaded (since it won't have any processes if it's not loaded), and it is not
+         * required to be properly loaded (because even improperly loaded units might still have processes
+         * around) */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_get_processes,
+                        /* flags = */ 0);
+}
+
+/* Bus method handler: dispatches to bus_unit_method_attach_processes() on a validated unit. */
+static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Attaching new processes to units that aren't loaded is not allowed. Loading a unit for this
+         * purpose isn't worth it though, as an unloaded unit is a stopped unit, and we don't allow
+         * attaching processes to stopped units anyway. */
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_unit_method_attach_processes,
+                        GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Creates the transient unit 'name' from the properties found at the current position of 'message' and
+ * stores it in '*unit'. Fails if the name does not map to a unit type, if that type does not support
+ * transient units, or if the unit is not pristine. On success the unit has been put on the load queue and
+ * the queue has been dispatched. */
+static int transient_unit_from_message(
+                Manager *m,
+                sd_bus_message *message,
+                const char *name,
+                Unit **unit,
+                sd_bus_error *error) {
+
+        UnitType type;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(message);
+        assert(name);
+
+        type = unit_name_to_type(name);
+        if (type < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Invalid unit name or type.");
+
+        if (!unit_vtable[type]->can_transient)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Unit type %s does not support transient units.",
+                                         unit_type_to_string(type));
+
+        r = manager_load_unit(m, name, NULL, error, &u);
+        if (r < 0)
+                return r;
+
+        if (!unit_is_pristine(u))
+                return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+                                         "Unit %s was already loaded or has a fragment file.", name);
+
+        /* OK, the unit failed to load and is unreferenced, so the transient data can be filled in
+         * instead */
+        r = unit_make_transient(u);
+        if (r < 0)
+                return r;
+
+        /* Apply the properties supplied by the client */
+        r = bus_unit_set_properties(u, message, UNIT_RUNTIME, false, error);
+        if (r < 0)
+                return r;
+
+        /* If the client asked for it, automatically add a reference to this unit. */
+        if (u->bus_track_add) {
+                r = bus_unit_track_add_sender(u, message);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to watch sender: %m");
+        }
+
+        /* Now load the missing bits of the unit we just created */
+        unit_add_to_load_queue(u);
+        manager_dispatch_load_queue(m);
+
+        *unit = u;
+        return 0;
+}
+
+/* Walks the "a(sa(sv))" array at the current position of 'message' and creates one transient unit per
+ * entry via transient_unit_from_message(). Returns 0 on success, negative on the first failure. */
+static int transient_aux_units_from_message(
+                Manager *m,
+                sd_bus_message *message,
+                sd_bus_error *error) {
+
+        int r;
+
+        assert(m);
+        assert(message);
+
+        r = sd_bus_message_enter_container(message, 'a', "(sa(sv))");
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                const char *aux_name = NULL;
+                Unit *aux_unit;
+
+                r = sd_bus_message_enter_container(message, 'r', "sa(sv)");
+                if (r < 0)
+                        return r;
+                if (r == 0) /* no further entries */
+                        break;
+
+                r = sd_bus_message_read(message, "s", &aux_name);
+                if (r < 0)
+                        return r;
+
+                r = transient_unit_from_message(m, message, aux_name, &aux_unit, error);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_exit_container(message);
+        return r < 0 ? r : 0;
+}
+
+/* Bus method handler: reads a unit name and a job mode, then (after the SELinux "start" check and the
+ * manage-units authorization) creates the transient unit plus any auxiliary units described in the
+ * message and queues a JOB_START job for the main unit. */
+static int method_start_transient_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        const char *unit_name, *mode_string;
+        Manager *m = ASSERT_PTR(userdata);
+        JobMode job_mode;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_access_check(message, "start", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "ss", &unit_name, &mode_string);
+        if (r < 0)
+                return r;
+
+        job_mode = job_mode_from_string(mode_string);
+        if (job_mode < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s is invalid.", mode_string);
+
+        r = bus_verify_manage_units_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = transient_unit_from_message(m, message, unit_name, &unit, error);
+        if (r >= 0)
+                r = transient_aux_units_from_message(m, message, error);
+        if (r < 0)
+                return r;
+
+        /* Finally, start it */
+        return bus_unit_queue_job(message, unit, JOB_START, job_mode, 0, error);
+}
+
+/* Bus method handler: reads a job id and replies with the object path of the matching job, or fails with
+ * BUS_ERROR_NO_SUCH_JOB if the manager knows no job with that id. */
+static int method_get_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *job_path = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        uint32_t job_id;
+        Job *job;
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = sd_bus_message_read(message, "u", &job_id);
+        if (r < 0)
+                return r;
+
+        job = manager_get_job(m, job_id);
+        if (!job)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) job_id);
+
+        r = mac_selinux_unit_access_check(job->unit, message, "status", error);
+        if (r < 0)
+                return r;
+
+        job_path = job_dbus_path(job);
+        if (!job_path)
+                return -ENOMEM;
+
+        return sd_bus_reply_method_return(message, "o", job_path);
+}
+
+/* Bus method handler: reads a job id, looks the job up and forwards the call to bus_job_method_cancel().
+ * Fails with BUS_ERROR_NO_SUCH_JOB if the manager knows no job with that id. */
+static int method_cancel_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        uint32_t job_id;
+        Job *job;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "u", &job_id);
+        if (r < 0)
+                return r;
+
+        job = manager_get_job(m, job_id);
+        if (job)
+                return bus_job_method_cancel(message, job, error);
+
+        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) job_id);
+}
+
+/* Bus method handler: after the SELinux "reload" check and the manage-units authorization, calls
+ * manager_clear_jobs() and sends an empty reply. */
+static int method_clear_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* No authorization for now, but the async polkit stuff will call us again when it has it */
+                return 1;
+        }
+
+        manager_clear_jobs(m);
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Bus method handler: after the SELinux "reload" check and the manage-units authorization, calls
+ * manager_reset_failed() and sends an empty reply. */
+static int method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* No authorization for now, but the async polkit stuff will call us again when it has it */
+                return 1;
+        }
+
+        manager_reset_failed(m);
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Replies to 'message' with an array of "(ssssssouso)" entries, one for each unit in m->units. Alias
+ * entries (hashmap key different from the unit's own id) are skipped. If 'states' is non-empty a unit is
+ * listed only when its load, active or sub state string is contained in it; if 'patterns' is non-empty
+ * the unit id additionally has to pass strv_fnmatch_or_empty(). */
+static int list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *key;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)");
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH_KEY(unit, key, m->units) {
+                /* Each unit is reported once, under its primary name */
+                if (key != unit->id)
+                        continue;
+
+                if (!strv_isempty(states)) {
+                        bool state_matches =
+                                strv_contains(states, unit_load_state_to_string(unit->load_state)) ||
+                                strv_contains(states, unit_active_state_to_string(unit_active_state(unit))) ||
+                                strv_contains(states, unit_sub_state_to_string(unit));
+
+                        if (!state_matches)
+                                continue;
+                }
+
+                if (!strv_isempty(patterns) &&
+                    !strv_fnmatch_or_empty(patterns, unit->id, FNM_NOESCAPE))
+                        continue;
+
+                r = reply_unit_info(reply, unit);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Bus method handler: lists all units, i.e. list_units_filtered() without state or pattern filter. */
+static int method_list_units(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return list_units_filtered(
+                        message,
+                        userdata,
+                        error,
+                        /* states = */ NULL,
+                        /* patterns = */ NULL);
+}
+
+/* Bus method handler: reads a string array of states from the call and lists the units matching any of
+ * them via list_units_filtered(). */
+static int method_list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **wanted_states = NULL;
+        int r;
+
+        r = sd_bus_message_read_strv(message, &wanted_states);
+        if (r < 0)
+                return r;
+
+        return list_units_filtered(message, userdata, error, wanted_states, /* patterns = */ NULL);
+}
+
+/* Bus method handler: reads two string arrays from the call, first states and then name patterns, and
+ * lists the matching units via list_units_filtered(). */
+static int method_list_units_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **wanted_states = NULL;
+        _cleanup_strv_free_ char **name_patterns = NULL;
+        int r;
+
+        r = sd_bus_message_read_strv(message, &wanted_states);
+        if (r >= 0)
+                r = sd_bus_message_read_strv(message, &name_patterns);
+        if (r < 0)
+                return r;
+
+        return list_units_filtered(message, userdata, error, wanted_states, name_patterns);
+}
+
+/* Bus method handler: replies with an array of "(usssoo)" entries, one per job in m->jobs, carrying the
+ * job id, the id of its unit, job type and state strings, and the job and unit object paths. */
+static int method_list_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Job *job;
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(usssoo)");
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH(job, m->jobs) {
+                _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+
+                job_path = job_dbus_path(job);
+                if (!job_path)
+                        return -ENOMEM;
+
+                unit_path = unit_dbus_path(job->unit);
+                if (!unit_path)
+                        return -ENOMEM;
+
+                r = sd_bus_message_append(
+                                reply, "(usssoo)",
+                                job->id,
+                                job->unit->id,
+                                job_type_to_string(job->type),
+                                job_state_to_string(job->state),
+                                job_path,
+                                unit_path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Bus method handler: registers the sender in m->subscribed (creating the tracker on first use) when the
+ * call arrived on the API bus. Fails with BUS_ERROR_ALREADY_SUBSCRIBED if sd_bus_track_add_sender()
+ * returns 0. Calls arriving on any other connection are simply acknowledged. */
+static int method_subscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        /* Note that direct bus connections are subscribed by default, only peers on the API bus are
+         * tracked here */
+        if (sd_bus_message_get_bus(message) != m->api_bus)
+                return sd_bus_reply_method_return(message, NULL);
+
+        if (!m->subscribed) {
+                r = sd_bus_track_new(sd_bus_message_get_bus(message), &m->subscribed, NULL, NULL);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_track_add_sender(m->subscribed, message);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return sd_bus_error_set(error, BUS_ERROR_ALREADY_SUBSCRIBED, "Client is already subscribed.");
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Bus method handler: removes the sender from m->subscribed when the call arrived on the API bus. Fails
+ * with BUS_ERROR_NOT_SUBSCRIBED if sd_bus_track_remove_sender() returns 0. Calls arriving on any other
+ * connection are simply acknowledged. */
+static int method_unsubscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        if (sd_bus_message_get_bus(message) != m->api_bus)
+                return sd_bus_reply_method_return(message, NULL);
+
+        r = sd_bus_track_remove_sender(m->subscribed, message);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return sd_bus_error_set(error, BUS_ERROR_NOT_SUBSCRIBED, "Client is not subscribed.");
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Common implementation of the dump methods: generates the manager dump string (restricted to 'patterns'
+ * when given) and passes it to the 'reply' callback, which sends it to the caller.
+ *
+ * The SELinux "status" check is always applied. Once m->dump_ratelimit is exceeded, the caller additionally
+ * has to pass the SELinux "reload" check and the polkit check for bypassing the rate limit; otherwise the
+ * request is rejected with SD_BUS_ERROR_LIMITS_EXCEEDED.
+ *
+ * Returns 1 while the asynchronous polkit query is pending, a negative error on failure, and otherwise
+ * whatever 'reply' returns. */
+static int dump_impl(
+                sd_bus_message *message,
+                void *userdata,
+                sd_bus_error *error,
+                char **patterns,
+                int (*reply)(sd_bus_message *, char *)) {
+
+        _cleanup_free_ char *dump = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        /* 'status' access is the bare minimum always needed for this, as the policy might straight out
+         * forbid a client from querying any information from systemd, regardless of any rate limiting. */
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        /* Rate limit reached? Check if the caller is privileged/allowed by policy to bypass this. We
+         * check the rate limit first to avoid the expensive roundtrip to polkit when not needed. */
+        if (!ratelimit_below(&m->dump_ratelimit)) {
+                /* We need a way for SELinux to constrain the operation when the rate limit is active, even
+                 * if polkit would allow it, but we cannot easily add new named permissions, so we need to
+                 * use an existing one. Reload/reexec are also slow but non-destructive/modifying
+                 * operations, and can cause PID1 to stall. So it seems similar enough in terms of security
+                 * considerations and impact, and thus use the same access check for dumps which, given the
+                 * large amount of data to fetch, can stall PID1 for quite some time. */
+                r = mac_selinux_access_check(message, "reload", error);
+                if (r < 0)
+                        goto ratelimited;
+
+                r = bus_verify_bypass_dump_ratelimit_async(m, message, error);
+                if (r < 0)
+                        goto ratelimited;
+                if (r == 0)
+                        /* No authorization for now, but the async polkit stuff will call us again when it
+                         * has it */
+                        return 1;
+        }
+
+        r = manager_get_dump_string(m, patterns, &dump);
+        if (r < 0)
+                return r;
+
+        return reply(message, dump);
+
+ratelimited:
+        log_warning("Dump request rejected due to rate limit on unprivileged callers, blocked for %s.",
+                    FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC));
+
+        /* The failed access check above may already have filled in 'error'. sd_bus_error_setf() refuses to
+         * overwrite an error that is already set, hence release it first so that the caller reliably gets
+         * the rate limit error. */
+        sd_bus_error_free(error);
+
+        return sd_bus_error_setf(error,
+                                 SD_BUS_ERROR_LIMITS_EXCEEDED,
+                                 "Dump request rejected due to rate limit on unprivileged callers, blocked for %s.",
+                                 FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC));
+}
+
+/* dump_impl() reply callback: sends the dump back inline as a single string ("s"). */
+static int reply_dump(sd_bus_message *message, char *dump) {
+        return sd_bus_reply_method_return(
+                        message,
+                        "s",
+                        dump);
+}
+
+/* Bus method handler: unfiltered dump, returned inline through reply_dump(). */
+static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return dump_impl(
+                        message,
+                        userdata,
+                        error,
+                        /* patterns = */ NULL,
+                        reply_dump);
+}
+
+/* dump_impl() reply callback: obtains a file descriptor for the dump text from acquire_data_fd() and
+ * sends that descriptor back ("h"). Our own copy of the descriptor is closed when the function returns. */
+static int reply_dump_by_fd(sd_bus_message *message, char *dump) {
+        _cleanup_close_ int data_fd = -EBADF;
+        size_t dump_size;
+
+        dump_size = strlen(dump);
+
+        data_fd = acquire_data_fd(dump, dump_size, 0);
+        if (data_fd < 0)
+                return data_fd;
+
+        return sd_bus_reply_method_return(message, "h", data_fd);
+}
+
+/* Bus method handler: unfiltered dump, returned as a file descriptor through reply_dump_by_fd(). */
+static int method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return dump_impl(
+                        message,
+                        userdata,
+                        error,
+                        /* patterns = */ NULL,
+                        reply_dump_by_fd);
+}
+
+/* Reads a string array of patterns from the call and runs dump_impl() restricted to them, delivering the
+ * result through the given 'reply' callback. */
+static int dump_units_matching_patterns(
+                sd_bus_message *message,
+                void *userdata,
+                sd_bus_error *error,
+                int (*reply)(sd_bus_message *, char *)) {
+
+        _cleanup_strv_free_ char **unit_patterns = NULL;
+        int r;
+
+        r = sd_bus_message_read_strv(message, &unit_patterns);
+        if (r < 0)
+                return r;
+
+        return dump_impl(message, userdata, error, unit_patterns, reply);
+}
+
+/* Bus method handler: pattern-filtered dump, returned inline through reply_dump(). */
+static int method_dump_units_matching_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return dump_units_matching_patterns(
+                        message,
+                        userdata,
+                        error,
+                        reply_dump);
+}
+
+/* Bus method handler: pattern-filtered dump, returned as a file descriptor through reply_dump_by_fd(). */
+static int method_dump_units_matching_patterns_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return dump_units_matching_patterns(
+                        message,
+                        userdata,
+                        error,
+                        reply_dump_by_fd);
+}
+
+/* Bus method handler: unconditionally fails with SD_BUS_ERROR_NOT_SUPPORTED; 'message' and 'userdata'
+ * are not looked at. */
+static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return sd_bus_error_set(
+                        error,
+                        SD_BUS_ERROR_NOT_SUPPORTED,
+                        "Support for snapshots has been removed.");
+}
+
+/* Determines the number of free bytes on the file system backing /run/systemd and stores it in '*ret'.
+ * Returns 0 on success; if statvfs() fails, 'error' is filled in from errno and the result of
+ * sd_bus_error_set_errnof() is returned. */
+static int get_run_space(uint64_t *ret, sd_bus_error *error) {
+        struct statvfs svfs;
+
+        assert(ret);
+
+        if (statvfs("/run/systemd", &svfs) < 0)
+                return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m");
+
+        /* The block counts reported by statvfs() are expressed in units of f_frsize (the fundamental block
+         * size), not f_bsize (the preferred I/O size). The two usually coincide, but only f_frsize is
+         * correct when they don't. */
+        *ret = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_frsize;
+        return 0;
+}
+
+/* Checks that at least RELOAD_DISK_SPACE_MIN bytes are free according to get_run_space(). Returns 0 if so;
+ * otherwise fills in 'error' with BUS_ERROR_DISK_FULL, using 'message' as the prefix of the error text.
+ * A failure to determine the free space is passed through. */
+static int verify_run_space(const char *message, sd_bus_error *error) {
+        uint64_t free_bytes = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */
+        int r;
+
+        assert(message);
+
+        r = get_run_space(&free_bytes, error);
+        if (r < 0)
+                return r;
+
+        if (free_bytes >= RELOAD_DISK_SPACE_MIN)
+                return 0;
+
+        return sd_bus_error_setf(error,
+                                 BUS_ERROR_DISK_FULL,
+                                 "%s, not enough space available on /run/systemd/. "
+                                 "Currently, %s are free, but a safety buffer of %s is enforced.",
+                                 message,
+                                 FORMAT_BYTES(free_bytes),
+                                 FORMAT_BYTES(RELOAD_DISK_SPACE_MIN));
+}
+
+int verify_run_space_and_log(const char *message) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        assert(message);
+
+        r = verify_run_space(message, &error);
+        if (r < 0)
+                return log_error_errno(r, "%s", bus_error_message(&error, r));
+
+        return 0;
+}
+
+/* Lenient counterpart of verify_run_space(): if less than RELOAD_DISK_SPACE_MIN bytes are free a warning
+ * is logged, but 0 is returned regardless. Only a failure to determine the free space is reported as an
+ * error. */
+static int verify_run_space_permissive(const char *message, sd_bus_error *error) {
+        uint64_t free_bytes = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */
+        int r;
+
+        assert(message);
+
+        r = get_run_space(&free_bytes, error);
+        if (r < 0)
+                return r;
+
+        if (free_bytes >= RELOAD_DISK_SPACE_MIN)
+                return 0;
+
+        log_warning("Dangerously low amount of free space on /run/systemd/, %s.\n"
+                    "Currently, %s are free, but %s are suggested. Proceeding anyway.",
+                    message,
+                    FORMAT_BYTES(free_bytes),
+                    FORMAT_BYTES(RELOAD_DISK_SPACE_MIN));
+
+        return 0;
+}
+
+/* Logs, at info level, which client requested 'method': always the PID, plus the process name and the
+ * unit the PID belongs to when those can be determined. Logs nothing if the sender credentials or the
+ * PID cannot be obtained. */
+static void log_caller(sd_bus_message *message, Manager *manager, const char *method) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        const char *comm = NULL;
+        const char *comm_prefix = "", *comm_suffix = "";
+        const char *unit_prefix = "", *unit_id = "", *unit_suffix = "";
+        Unit *caller;
+        pid_t pid;
+
+        assert(message);
+        assert(manager);
+        assert(method);
+
+        if (sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID|SD_BUS_CREDS_AUGMENT|SD_BUS_CREDS_COMM, &creds) < 0)
+                return;
+
+        /* We need at least the PID, otherwise there's nothing to log, the rest is optional */
+        if (sd_bus_creds_get_pid(creds, &pid) < 0)
+                return;
+
+        (void) sd_bus_creds_get_comm(creds, &comm);
+        if (comm) {
+                comm_prefix = " ('";
+                comm_suffix = "')";
+        }
+
+        caller = manager_get_unit_by_pid(manager, pid);
+        if (caller) {
+                unit_prefix = " (unit ";
+                unit_id = caller->id;
+                unit_suffix = ")";
+        }
+
+        log_info("%s requested from client PID " PID_FMT "%s%s%s%s%s%s...",
+                 method, pid,
+                 comm_prefix, strempty(comm), comm_suffix,
+                 unit_prefix, unit_id, unit_suffix);
+}
+
+static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that, once the checks below pass, sets m->objective to MANAGER_RELOAD and defers the reply. */
+        assert(message);
+
+        r = verify_run_space("Refusing to reload", error);
+        if (r < 0)
+                return r;
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_reload_daemon_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        /* Write a log message noting the unit or process that requested the Reload() */
+        log_caller(message, m, "Reloading");
+
+        /* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. */
+        if (!ratelimit_below(&m->reload_ratelimit)) {
+                log_warning("Reloading request rejected due to rate limit.");
+                return sd_bus_error_setf(error,
+                                         SD_BUS_ERROR_LIMITS_EXCEEDED,
+                                         "Reload() request rejected due to rate limit.");
+        }
+
+        /* Instead of sending the reply back right away, we just
+         * remember that we need to and then send it after the reload
+         * is finished. That way the caller knows when the reload
+         * finished. */
+
+        assert(!m->pending_reload_message); /* no earlier deferred reply may still be stored */
+        r = sd_bus_message_new_method_return(message, &m->pending_reload_message);
+        if (r < 0)
+                return r;
+
+        m->objective = MANAGER_RELOAD;
+
+        return 1; /* nothing is sent from here; the reply object stays in m->pending_reload_message */
+}
+
+static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that sets m->objective to MANAGER_REEXECUTE once the run space and access checks pass. */
+        assert(message);
+
+        r = verify_run_space("Refusing to reexecute", error);
+        if (r < 0)
+                return r;
+
+        r = mac_selinux_access_check(message, "reload", error); /* checked against the "reload" permission */
+        if (r < 0)
+                return r;
+
+        r = bus_verify_reload_daemon_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        /* Write a log message noting the unit or process that requested the Reexecute() */
+        log_caller(message, m, "Reexecuting");
+
+        /* We don't send a reply back here, the client should
+         * just wait for us disconnecting. */
+
+        m->objective = MANAGER_REEXECUTE;
+        return 1;
+}
+
+static int method_exit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that sets m->objective to MANAGER_EXIT after the "halt" access check and then replies. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "halt", error);
+        if (r < 0)
+                return r;
+
+        /* Exit() (in contrast to SetExitCode()) is actually allowed even if
+         * we are running on the host. It will fall back on reboot() in
+         * systemd-shutdown if it cannot do the exit() because it isn't a
+         * container. */
+
+        m->objective = MANAGER_EXIT;
+
+        return sd_bus_reply_method_return(message, NULL); /* empty reply */
+}
+
+static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that sets m->objective to MANAGER_REBOOT; refused unless MANAGER_IS_SYSTEM(m) holds. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reboot", error);
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m)) /* checked only after the access check above has passed */
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Reboot is only supported for system managers.");
+
+        m->objective = MANAGER_REBOOT;
+
+        return sd_bus_reply_method_return(message, NULL); /* empty reply */
+}
+
+static int method_soft_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *rt = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *root;
+        int r;
+        /* Bus handler that stores the optional new root in m->switch_root and sets MANAGER_SOFT_REBOOT. */
+        assert(message);
+
+        r = verify_run_space_permissive("soft reboot may fail", error); /* low space only warns here */
+        if (r < 0)
+                return r;
+
+        r = mac_selinux_access_check(message, "reboot", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "s", &root);
+        if (r < 0)
+                return r;
+
+        if (!isempty(root)) { /* an empty root skips validation and leaves rt NULL */
+                if (!path_is_valid(root))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "New root directory '%s' must be a valid path.", root);
+                if (!path_is_absolute(root))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "New root directory path '%s' is not absolute.", root);
+
+                rt = strdup(root);
+                if (!rt)
+                        return -ENOMEM;
+        }
+
+        free_and_replace(m->switch_root, rt); /* rt is still NULL here when no root was given */
+        m->objective = MANAGER_SOFT_REBOOT;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that sets m->objective to MANAGER_POWEROFF; refused unless MANAGER_IS_SYSTEM(m) holds. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "halt", error); /* checked against the "halt" permission */
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Powering off is only supported for system managers.");
+
+        m->objective = MANAGER_POWEROFF;
+
+        return sd_bus_reply_method_return(message, NULL); /* empty reply */
+}
+
+static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that sets m->objective to MANAGER_HALT; refused unless MANAGER_IS_SYSTEM(m) holds. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "halt", error);
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Halt is only supported for system managers.");
+
+        m->objective = MANAGER_HALT;
+
+        return sd_bus_reply_method_return(message, NULL); /* empty reply */
+}
+
+static int method_kexec(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that sets m->objective to MANAGER_KEXEC; refused unless MANAGER_IS_SYSTEM(m) holds. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reboot", error); /* checked against the "reboot" permission */
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "KExec is only supported for system managers.");
+
+        m->objective = MANAGER_KEXEC;
+
+        return sd_bus_reply_method_return(message, NULL); /* empty reply */
+}
+
+static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *ri = NULL, *rt = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *root, *init;
+        int r;
+        /* Bus handler that validates the new root and optional init path, stores copies in m, and sets MANAGER_SWITCH_ROOT. */
+        assert(message);
+
+        r = verify_run_space_permissive("root switching may fail", error); /* low space only warns here */
+        if (r < 0)
+                return r;
+
+        r = mac_selinux_access_check(message, "reboot", error);
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Root switching is only supported by system manager.");
+
+        r = sd_bus_message_read(message, "ss", &root, &init);
+        if (r < 0)
+                return r;
+
+        if (isempty(root))
+                /* If path is not specified, default to "/sysroot" which is what we generally expect initrds
+                 * to use */
+                root = "/sysroot";
+        else {
+                if (!path_is_valid(root))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "New root directory must be a valid path.");
+
+                if (!path_is_absolute(root))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "New root path '%s' is not absolute.", root);
+
+                r = path_is_root(root); /* > 0 means the requested root is the current root, which is refused */
+                if (r < 0)
+                        return sd_bus_error_set_errnof(error, r,
+                                                       "Failed to check if new root directory '%s' is the same as old root: %m",
+                                                       root);
+                if (r > 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "New root directory cannot be the old root directory.");
+        }
+
+        /* Safety check: the operation is refused unless in_initrd() reports that we run in an initrd */
+        if (!in_initrd())
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Not in initrd, refusing switch-root operation.");
+
+        r = path_is_os_tree(root); /* also applied to the "/sysroot" default chosen above */
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r,
+                                               "Failed to determine whether root path '%s' contains an OS tree: %m",
+                                               root);
+        if (r == 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Specified switch root path '%s' does not seem to be an OS tree. os-release file is missing.",
+                                         root);
+
+        if (!isempty(init)) { /* an empty init skips these checks and leaves ri NULL further down */
+                if (!path_is_valid(init))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "Path to init binary '%s' is not a valid path.", init);
+
+                if (!path_is_absolute(init))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "Path to init binary '%s' not absolute.", init);
+
+                r = chase_and_access(init, root, CHASE_PREFIX_ROOT, X_OK, NULL); /* X_OK check with root as chase root */
+                if (r == -EACCES)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "Init binary %s is not executable.", init);
+                if (r < 0)
+                        return sd_bus_error_set_errnof(error, r,
+                                                       "Could not resolve init executable %s: %m", init);
+        }
+
+        rt = strdup(root); /* both copies are made first, so m is only touched once no allocation can fail */
+        if (!rt)
+                return -ENOMEM;
+
+        if (!isempty(init)) {
+                ri = strdup(init);
+                if (!ri)
+                        return -ENOMEM;
+        }
+
+        free_and_replace(m->switch_root, rt);
+        free_and_replace(m->switch_root_init, ri); /* ri is still NULL here when no init was given */
+
+        m->objective = MANAGER_SWITCH_ROOT;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **plus = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler: validates the list and passes it as third argument to manager_client_environment_modify(). */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_strv(message, &plus);
+        if (r < 0)
+                return r;
+        if (!strv_env_is_valid(plus)) /* input is validated before the authorization query below */
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments");
+
+        r = bus_verify_set_environment_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = manager_client_environment_modify(m, NULL, plus);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_unset_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **minus = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler: validates the list and passes it as second argument to manager_client_environment_modify(). */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_strv(message, &minus);
+        if (r < 0)
+                return r;
+
+        if (!strv_env_name_or_assignment_is_valid(minus)) /* validated before the authorization query below */
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Invalid environment variable names or assignments");
+
+        r = bus_verify_set_environment_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = manager_client_environment_modify(m, minus, NULL);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_unset_and_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **minus = NULL, **plus = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that reads a 'minus' and a 'plus' list, validates both, and applies them in one modify call. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_strv(message, &minus); /* first array in the message */
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_strv(message, &plus); /* second array in the message */
+        if (r < 0)
+                return r;
+
+        if (!strv_env_name_or_assignment_is_valid(minus))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Invalid environment variable names or assignments");
+        if (!strv_env_is_valid(plus))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Invalid environment assignments");
+
+        r = bus_verify_set_environment_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = manager_client_environment_modify(m, minus, plus);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_set_exit_code(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        uint8_t code;
+        int r;
+        /* Bus handler that reads one byte from the message and stores it in m->return_value. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "exit", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_basic(message, 'y', &code); /* 'y' is read into the uint8_t declared above */
+        if (r < 0)
+                return r;
+
+        m->return_value = code;
+
+        return sd_bus_reply_method_return(message, NULL); /* empty reply */
+}
+
+static int method_lookup_dynamic_user_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        const char *name;
+        uid_t uid;
+        int r;
+        /* Bus handler that replies with the UID which dynamic_user_lookup_name() reports for the given user name. */
+        assert(message);
+
+        r = sd_bus_message_read_basic(message, 's', &name);
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Dynamic users are only supported in the system instance.");
+        if (!valid_user_group_name(name, VALID_USER_RELAX))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "User name invalid: %s", name);
+
+        r = dynamic_user_lookup_name(m, name, &uid);
+        if (r == -ESRCH) /* mapped to a dedicated bus error; every other failure is returned as is */
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER,
+                                         "Dynamic user %s does not exist.", name);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, "u", (uint32_t) uid);
+}
+
+static int method_lookup_dynamic_user_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *name = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        uid_t uid;
+        int r;
+        /* Bus handler that replies with the name which dynamic_user_lookup_uid() reports for the given UID. */
+        assert(message);
+
+        assert_cc(sizeof(uid_t) == sizeof(uint32_t)); /* the 'u' value is read straight into a uid_t below */
+        r = sd_bus_message_read_basic(message, 'u', &uid);
+        if (r < 0)
+                return r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Dynamic users are only supported in the system instance.");
+        if (!uid_is_valid(uid))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "User ID invalid: " UID_FMT, uid);
+
+        r = dynamic_user_lookup_uid(m, uid, &name);
+        if (r == -ESRCH) /* mapped to a dedicated bus error; every other failure is returned as is */
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER,
+                                         "Dynamic user ID " UID_FMT " does not exist.", uid);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, "s", name);
+}
+
+static int method_get_dynamic_users(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        DynamicUser *d;
+        int r;
+        /* Bus handler that replies with an a(us) array of UID and name for each m->dynamic_users entry with a current UID. */
+        assert(message);
+
+        assert_cc(sizeof(uid_t) == sizeof(uint32_t)); /* uid is appended as the 'u' member of "(us)" below */
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+                                         "Dynamic users are only supported in the system instance.");
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(us)");
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH(d, m->dynamic_users) {
+                uid_t uid;
+
+                r = dynamic_user_current(d, &uid);
+                if (r == -EAGAIN) /* not realized yet? Such entries are left out of the reply. */
+                        continue;
+                if (r < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED,
+                                                 "Failed to look up a dynamic user.");
+
+                r = sd_bus_message_append(reply, "(us)", uid, d->name);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+static int method_enqueue_marked_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that queues a JOB_TRY_RESTART job for every unit in m->units carrying a restart or reload marker. */
+        assert(message);
+
+        r = mac_selinux_access_check(message, "start", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        log_info("Queuing reload/restart jobs for marked units%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "o"); /* the reply is an array of object paths */
+        if (r < 0)
+                return r;
+
+        Unit *u;
+        char *k;
+        int ret = 0; /* first non-fatal per-unit error seen in the loop, 0 if there was none */
+        HASHMAP_FOREACH_KEY(u, k, m->units) {
+                /* ignore aliases: only the entry whose key is the unit's own id pointer is handled */
+                if (u->id != k)
+                        continue;
+
+                BusUnitQueueFlags flags;
+                if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RESTART)) /* restart marker is tested first */
+                        flags = 0;
+                else if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD))
+                        flags = BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+                else
+                        continue;
+
+                r = mac_selinux_unit_access_check(u, message, "start", error);
+                if (r >= 0)
+                        r = bus_unit_queue_job_one(message, u,
+                                                   JOB_TRY_RESTART, JOB_FAIL, flags,
+                                                   reply, error);
+                if (ERRNO_IS_NEG_RESOURCE(r)) /* these errors abort the whole call right away */
+                        return r;
+                if (r < 0) {
+                        if (ret >= 0) /* keep only the first failure */
+                                ret = r;
+                        sd_bus_error_free(error); /* reset the error before moving on to the next unit */
+                }
+        }
+
+        if (ret < 0)
+                return sd_bus_error_set_errnof(error, ret,
+                                               "Failed to enqueue some jobs, see logs for details: %m");
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+static int list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        UnitFileList *item;
+        _cleanup_hashmap_free_ Hashmap *h = NULL;
+        int r;
+        /* Replies with an a(ss) array of path and state string for every entry unit_file_get_list() puts into h. */
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        h = hashmap_new(&unit_file_list_hash_ops_free);
+        if (!h)
+                return -ENOMEM;
+
+        r = unit_file_get_list(m->runtime_scope, NULL, h, states, patterns); /* states and patterns are passed on unchanged */
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH(item, h) {
+
+                r = sd_bus_message_append(reply, "(ss)", item->path, unit_file_state_to_string(item->state));
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+static int method_list_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return list_unit_files_by_patterns(message, userdata, error, NULL, NULL); /* no states or patterns given */
+}
+
+static int method_list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **states = NULL;
+        _cleanup_strv_free_ char **patterns = NULL;
+        int r;
+        /* Bus handler that reads two string arrays from the message and forwards them to list_unit_files_by_patterns(). */
+        r = sd_bus_message_read_strv(message, &states); /* first array: states */
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_strv(message, &patterns); /* second array: patterns */
+        if (r < 0)
+                return r;
+
+        return list_unit_files_by_patterns(message, userdata, error, states, patterns);
+}
+
+static int method_get_unit_file_state(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        const char *name;
+        UnitFileState state;
+        int r;
+        /* Bus handler that replies with the string form of the state unit_file_get_state() reports for the given name. */
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "s", &name);
+        if (r < 0)
+                return r;
+
+        r = unit_file_get_state(m->runtime_scope, NULL, name, &state);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, "s", unit_file_state_to_string(state));
+}
+
+static int method_get_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *default_target = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+        /* Bus handler that replies with the string unit_file_get_default() returns for m->runtime_scope. */
+        assert(message);
+
+        /* Anyone can call this method */
+
+        r = mac_selinux_access_check(message, "status", error);
+        if (r < 0)
+                return r;
+
+        r = unit_file_get_default(m->runtime_scope, NULL, &default_target);
+        if (r == -ERFKILL) /* fill in a specific bus error; its return value is unused and r is returned just below */
+                sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit file is masked.");
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, "s", default_target);
+}
+
+static int send_unit_files_changed(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+        int r;
+        /* Sends a "UnitFilesChanged" signal without arguments on the given bus; userdata is not used. */
+        assert(bus);
+
+        r = sd_bus_message_new_signal(bus, &message,
+                                      "/org/freedesktop/systemd1",
+                                      "org.freedesktop.systemd1.Manager",
+                                      "UnitFilesChanged");
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, message, NULL);
+}
+
+/* Fill in 'error' from the first negative entry in changes[] if there is one, and otherwise fall
+ * back to returning error code c (or -EINVAL when c is not negative). The error message only
+ * describes the first error. changes[] is registered with CLEANUP_ARRAY() here, so callers hand it over.
+ */
+static int install_error(
+                sd_bus_error *error,
+                int c,
+                InstallChange *changes,
+                size_t n_changes) {
+
+        CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+        for (size_t i = 0; i < n_changes; i++)
+
+                /* When making changes here, make sure to also change install_changes_dump() in install.c. */
+
+                switch (changes[i].type) {
+                case 0 ... _INSTALL_CHANGE_TYPE_MAX: /* not errors */
+                        break;
+
+                case -EEXIST:
+                        if (changes[i].source) /* with a source the message also names the symlink target */
+                                return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+                                                         "File %s already exists and is a symlink to %s.",
+                                                         changes[i].path, changes[i].source);
+                        return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+                                                 "File %s already exists.",
+                                                 changes[i].path);
+
+                case -ERFKILL:
+                        return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED,
+                                                 "Unit file %s is masked.", changes[i].path);
+
+                case -EADDRNOTAVAIL:
+                        return sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED,
+                                                 "Unit %s is transient or generated.", changes[i].path);
+
+                case -ETXTBSY:
+                        return sd_bus_error_setf(error, BUS_ERROR_UNIT_BAD_PATH,
+                                                 "File %s is under the systemd unit hierarchy already.", changes[i].path);
+
+                case -EBADSLT:
+                        return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                 "Invalid specifier in %s.", changes[i].path);
+
+                case -EIDRM:
+                        return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                 "Destination unit %s is a non-template unit.", changes[i].path);
+
+                case -EUCLEAN:
+                        return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                 "\"%s\" is not a valid unit name.",
+                                                 changes[i].path);
+
+                case -ELOOP:
+                        return sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED,
+                                                 "Refusing to operate on alias name or linked unit file: %s",
+                                                 changes[i].path);
+
+                case -EXDEV:
+                        if (changes[i].source) /* with a source the message names both ends of the alias */
+                                return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                         "Cannot alias %s as %s.",
+                                                         changes[i].source, changes[i].path);
+                        return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                 "Invalid unit reference %s.", changes[i].path);
+
+                case -ENOENT:
+                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+                                                 "Unit file %s does not exist.", changes[i].path);
+
+                case -EUNATCH:
+                        return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                 "Cannot resolve specifiers in %s.", changes[i].path);
+
+                default:
+                        assert(changes[i].type < 0); /* other errors */
+                        return sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path);
+                }
+
+        return c < 0 ? c : -EINVAL; /* reached only when no entry in changes[] carried an error */
+}
+
+static int reply_install_changes_and_free(
+                Manager *m,
+                sd_bus_message *message,
+                int carries_install_info,
+                InstallChange *changes,
+                size_t n_changes,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        bool bad = false, good = false;
+        int r;
+        /* Replies with the non-error entries of changes[] as a(sss); changes[] is released via CLEANUP_ARRAY() below. */
+        CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+        if (install_changes_have_modification(changes, n_changes)) {
+                r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL);
+                if (r < 0) /* a failed signal is only logged at debug level and does not fail the call */
+                        log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m");
+        }
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        if (carries_install_info >= 0) { /* a negative value means the reply gets no leading boolean */
+                r = sd_bus_message_append(reply, "b", carries_install_info);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_open_container(reply, 'a', "(sss)");
+        if (r < 0)
+                return r;
+
+        for (size_t i = 0; i < n_changes; i++) {
+
+                if (changes[i].type < 0) { /* error entries are counted but not put into the reply */
+                        bad = true;
+                        continue;
+                }
+
+                r = sd_bus_message_append(
+                                reply, "(sss)",
+                                install_change_type_to_string(changes[i].type),
+                                changes[i].path,
+                                changes[i].source);
+                if (r < 0)
+                        return r;
+
+                good = true;
+        }
+
+        /* If there was a failed change, and no successful change, then return the first failure as proper
+         * method call error. The array is handed over to install_error(), which registers its own cleanup. */
+        if (bad && !good)
+                return install_error(error, 0, TAKE_PTR(changes), n_changes);
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Shared implementation for the bus methods that are routed through an "enable"-style install call.
+ *
+ * Reads the list of unit files and the flags from the message, performs the authorization check,
+ * invokes 'call' and replies with the resulting list of changes. If carries_install_info is true,
+ * the return value of 'call' is placed in the reply as a leading boolean.
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_enable_unit_files_generic(
+                sd_bus_message *message,
+                Manager *m,
+                int (*call)(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes),
+                bool carries_install_info,
+                sd_bus_error *error) {
+
+        _cleanup_strv_free_ char **files = NULL;
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        UnitFileFlags flags;
+        int install_info, r;
+
+        assert(message);
+        assert(m);
+
+        r = sd_bus_message_read_strv(message, &files);
+        if (r < 0)
+                return r;
+
+        /* "EnableUnitFilesWithFlags" passes the flags as a single 64-bit word; every other method
+         * handled here passes the two booleans 'runtime' and 'force' instead. */
+        if (!sd_bus_message_is_method_call(message, NULL, "EnableUnitFilesWithFlags")) {
+                int runtime, force;
+
+                r = sd_bus_message_read(message, "bb", &runtime, &force);
+                if (r < 0)
+                        return r;
+
+                flags = unit_file_bools_to_flags(runtime, force);
+        } else {
+                uint64_t raw_flags;
+
+                r = sd_bus_message_read(message, "t", &raw_flags);
+                if (r < 0)
+                        return r;
+
+                /* Refuse any bit that is not part of the public flags mask. */
+                if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0)
+                        return -EINVAL;
+
+                flags = raw_flags;
+        }
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        r = call(m->runtime_scope, flags, NULL, files, &changes, &n_changes);
+
+        /* See comments for this variable in manager.h */
+        m->unit_file_state_outdated = n_changes > 0 || m->unit_file_state_outdated;
+
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        /* A negative value tells the reply helper to leave out the leading boolean. */
+        install_info = carries_install_info ? r : -1;
+
+        return reply_install_changes_and_free(m, message, install_info, changes, n_changes, error);
+}
+
+/* Bus handler that runs unit_file_enable() through the generic enable path, with the install-info
+ * boolean included in the reply. The generic helper parses the flags according to the name of the
+ * incoming method call. */
+static int method_enable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_enable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_enable,
+                        /* carries_install_info = */ true,
+                        error);
+}
+
+/* Bus handler that runs unit_file_enable() through the generic enable path; the reply includes the
+ * install-info boolean. */
+static int method_enable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_enable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_enable,
+                        /* carries_install_info = */ true,
+                        error);
+}
+
+/* Bus handler that runs unit_file_reenable() through the generic enable path; the reply includes
+ * the install-info boolean. */
+static int method_reenable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_enable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_reenable,
+                        /* carries_install_info = */ true,
+                        error);
+}
+
+/* Bus handler that runs unit_file_link() through the generic enable path; the reply consists of
+ * the list of changes only, without the install-info boolean. */
+static int method_link_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_enable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_link,
+                        /* carries_install_info = */ false,
+                        error);
+}
+
+/* Adapter giving unit_file_preset() the callback signature that method_enable_unit_files_generic()
+ * expects, by fixing the preset mode to UNIT_FILE_PRESET_FULL. */
+static int unit_file_preset_without_mode(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char **files, InstallChange **changes, size_t *n_changes) {
+        return unit_file_preset(
+                        scope,
+                        flags,
+                        root_dir,
+                        files,
+                        UNIT_FILE_PRESET_FULL,
+                        changes,
+                        n_changes);
+}
+
+/* Bus handler that applies presets in UNIT_FILE_PRESET_FULL mode through the generic enable path;
+ * the reply includes the install-info boolean. */
+static int method_preset_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_enable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_preset_without_mode,
+                        /* carries_install_info = */ true,
+                        error);
+}
+
+/* Bus handler that runs unit_file_mask() through the generic enable path; the reply consists of
+ * the list of changes only, without the install-info boolean. */
+static int method_mask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_enable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_mask,
+                        /* carries_install_info = */ false,
+                        error);
+}
+
+/* Bus handler applying presets to a list of unit files with a caller-selected preset mode.
+ *
+ * Message arguments: an array of unit file names, the mode as a string, and the two booleans
+ * 'runtime' and 'force'. An empty mode string selects UNIT_FILE_PRESET_FULL; a string that
+ * unit_file_preset_mode_from_string() does not recognize yields -EINVAL. The reply includes the
+ * return value of unit_file_preset() as a leading boolean. */
+static int method_preset_unit_files_with_mode(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+
+        _cleanup_strv_free_ char **files = NULL;
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        Manager *m = ASSERT_PTR(userdata);
+        UnitFilePresetMode preset_mode = UNIT_FILE_PRESET_FULL;
+        int runtime, force, r;
+        UnitFileFlags flags;
+        const char *mode;
+
+        assert(message);
+
+        r = sd_bus_message_read_strv(message, &files);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force);
+        if (r < 0)
+                return r;
+
+        flags = unit_file_bools_to_flags(runtime, force);
+
+        /* Only a non-empty string overrides the default mode set above. */
+        if (!isempty(mode)) {
+                preset_mode = unit_file_preset_mode_from_string(mode);
+                if (preset_mode < 0)
+                        return -EINVAL;
+        }
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        r = unit_file_preset(m->runtime_scope, flags, NULL, files, preset_mode, &changes, &n_changes);
+
+        /* See comments for this variable in manager.h */
+        m->unit_file_state_outdated = n_changes > 0 || m->unit_file_state_outdated;
+
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        return reply_install_changes_and_free(m, message, r, changes, n_changes, error);
+}
+
+/* Shared implementation for the bus methods that are routed through a "disable"-style install call.
+ *
+ * Reads the list of unit files and the flags from the message, performs the authorization check,
+ * invokes 'call' and replies with the resulting list of changes. If carries_install_info is true,
+ * the return value of 'call' is placed in the reply as a leading boolean.
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_disable_unit_files_generic(
+                sd_bus_message *message,
+                Manager *m,
+                int (*call)(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes),
+                bool carries_install_info,
+                sd_bus_error *error) {
+
+        _cleanup_strv_free_ char **files = NULL;
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        UnitFileFlags flags;
+        bool with_flags;
+        int install_info, r;
+
+        assert(message);
+        assert(m);
+
+        r = sd_bus_message_read_strv(message, &files);
+        if (r < 0)
+                return r;
+
+        /* The two "...WithFlags..." methods pass a 64-bit flags word; every other method handled
+         * here passes the single boolean 'runtime'. */
+        with_flags =
+                sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlags") ||
+                sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlagsAndInstallInfo");
+
+        if (!with_flags) {
+                int runtime;
+
+                r = sd_bus_message_read(message, "b", &runtime);
+                if (r < 0)
+                        return r;
+
+                flags = unit_file_bools_to_flags(runtime, false);
+        } else {
+                uint64_t raw_flags;
+
+                r = sd_bus_message_read(message, "t", &raw_flags);
+                if (r < 0)
+                        return r;
+
+                /* Refuse any bit that is not part of the public flags mask. */
+                if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0)
+                        return -EINVAL;
+
+                /* UNIT_FILE_FORCE is refused on this path as well. */
+                if (FLAGS_SET(raw_flags, UNIT_FILE_FORCE))
+                        return -EINVAL;
+
+                flags = raw_flags;
+        }
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        r = call(m->runtime_scope, flags, NULL, files, &changes, &n_changes);
+
+        /* See comments for this variable in manager.h */
+        m->unit_file_state_outdated = n_changes > 0 || m->unit_file_state_outdated;
+
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        /* A negative value tells the reply helper to leave out the leading boolean. */
+        install_info = carries_install_info ? r : -1;
+
+        return reply_install_changes_and_free(m, message, install_info, changes, n_changes, error);
+}
+
+/* Bus handler that runs unit_file_disable() through the generic disable path; the reply consists
+ * of the list of changes only, without the install-info boolean. */
+static int method_disable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_disable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_disable,
+                        /* carries_install_info = */ false,
+                        error);
+}
+
+/* Bus handler that runs unit_file_disable() through the generic disable path; unlike the plain
+ * variants, the reply includes the install-info boolean. */
+static int method_disable_unit_files_with_flags_and_install_info(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_disable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_disable,
+                        /* carries_install_info = */ true,
+                        error);
+}
+
+/* Bus handler that runs unit_file_disable() through the generic disable path; the reply consists
+ * of the list of changes only, without the install-info boolean. */
+static int method_disable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_disable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_disable,
+                        /* carries_install_info = */ false,
+                        error);
+}
+
+/* Bus handler that runs unit_file_unmask() through the generic disable path; the reply consists
+ * of the list of changes only, without the install-info boolean. */
+static int method_unmask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_disable_unit_files_generic(
+                        message,
+                        userdata,
+                        unit_file_unmask,
+                        /* carries_install_info = */ false,
+                        error);
+}
+
+/* Bus handler that passes a list of unit files to unit_file_revert() and replies with the
+ * resulting list of changes (no install-info boolean).
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **files = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read_strv(message, &files);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        r = unit_file_revert(m->runtime_scope, NULL, files, &changes, &n_changes);
+
+        /* See comments for this variable in manager.h */
+        m->unit_file_state_outdated = n_changes > 0 || m->unit_file_state_outdated;
+
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* Bus handler that sets the default target via unit_file_set_default().
+ *
+ * Message arguments: the unit name and a 'force' boolean, which maps to UNIT_FILE_FORCE. The
+ * SELinux "enable" access check runs first, then the arguments are read, then the authorization
+ * check is performed. The reply consists of the list of changes (no install-info boolean).
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_set_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        UnitFileFlags flags;
+        const char *name;
+        int force, r;
+
+        assert(message);
+
+        r = mac_selinux_access_check(message, "enable", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "sb", &name, &force);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        flags = force ? UNIT_FILE_FORCE : 0;
+
+        r = unit_file_set_default(m->runtime_scope, flags, NULL, name, &changes, &n_changes);
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* Bus handler that applies presets via unit_file_preset_all(), i.e. without an explicit list of
+ * unit files in the request.
+ *
+ * Message arguments: the preset mode as a string, and the two booleans 'runtime' and 'force'. An
+ * empty mode string selects UNIT_FILE_PRESET_FULL; a string that
+ * unit_file_preset_mode_from_string() does not recognize yields -EINVAL. The reply consists of
+ * the list of changes (no install-info boolean).
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        UnitFilePresetMode preset_mode = UNIT_FILE_PRESET_FULL;
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        int runtime, force, r;
+        UnitFileFlags flags;
+        const char *mode;
+
+        assert(message);
+
+        r = mac_selinux_access_check(message, "enable", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force);
+        if (r < 0)
+                return r;
+
+        flags = unit_file_bools_to_flags(runtime, force);
+
+        /* Only a non-empty string overrides the default mode set above. */
+        if (!isempty(mode)) {
+                preset_mode = unit_file_preset_mode_from_string(mode);
+                if (preset_mode < 0)
+                        return -EINVAL;
+        }
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        r = unit_file_preset_all(m->runtime_scope, flags, NULL, preset_mode, &changes, &n_changes);
+
+        /* See comments for this variable in manager.h */
+        m->unit_file_state_outdated = n_changes > 0 || m->unit_file_state_outdated;
+
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* Bus handler that adds a dependency of the given type from a target unit onto a list of unit
+ * files, via unit_file_add_dependency().
+ *
+ * Message arguments: an array of unit file names, the target unit name, the dependency type as a
+ * string, and the two booleans 'runtime' and 'force'. A dependency type that
+ * unit_dependency_from_string() does not recognize is rejected with an "invalid arguments" bus
+ * error naming the offending value. The reply consists of the list of changes (no install-info
+ * boolean).
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_add_dependency_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_strv_free_ char **l = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        int runtime, force, r;
+        /* These are filled in by sd_bus_message_read() and never modified here, hence const. */
+        const char *target, *type;
+        UnitDependency dep;
+        UnitFileFlags flags;
+
+        assert(message);
+
+        r = bus_verify_manage_unit_files_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = sd_bus_message_read_strv(message, &l);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "ssbb", &target, &type, &runtime, &force);
+        if (r < 0)
+                return r;
+
+        flags = unit_file_bools_to_flags(runtime, force);
+
+        dep = unit_dependency_from_string(type);
+        if (dep < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Invalid dependency type '%s'", type);
+
+        r = unit_file_add_dependency(m->runtime_scope, flags, NULL, l, target, dep, &changes, &n_changes);
+        m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+        if (r < 0)
+                return install_error(error, r, changes, n_changes);
+
+        return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* Bus handler that reports the paths a unit_file_disable() call would unlink for one unit.
+ *
+ * Message arguments: the unit name and a 'runtime' boolean. unit_file_disable() is invoked with
+ * UNIT_FILE_DRY_RUN set, and the reply is an array of strings holding the path of every resulting
+ * change of type INSTALL_CHANGE_UNLINK. */
+static int method_get_unit_file_links(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        UnitFileFlags flags;
+        const char *name;
+        int runtime, r;
+
+        /* Registers install_changes_free() for the array on scope exit. */
+        CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+        r = sd_bus_message_read(message, "sb", &name, &runtime);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, SD_BUS_TYPE_ARRAY, "s");
+        if (r < 0)
+                return r;
+
+        flags = UNIT_FILE_DRY_RUN | (runtime ? UNIT_FILE_RUNTIME : 0);
+
+        r = unit_file_disable(m->runtime_scope, flags, NULL, STRV_MAKE(name), &changes, &n_changes);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get file links for %s: %m", name);
+
+        for (size_t i = 0; i < n_changes; i++) {
+                /* Only removals are reported; all other change types are skipped. */
+                if (changes[i].type != INSTALL_CHANGE_UNLINK)
+                        continue;
+
+                r = sd_bus_message_append(reply, "s", changes[i].path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Bus handler that looks up a job by its numeric ID and hands the request over to
+ * bus_job_method_get_waiting_jobs(). Fails with BUS_ERROR_NO_SUCH_JOB if no job has that ID. */
+static int method_get_job_waiting(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        uint32_t id;
+        Job *job;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "u", &id);
+        if (r < 0)
+                return r;
+
+        job = manager_get_job(m, id);
+        if (job)
+                return bus_job_method_get_waiting_jobs(message, job, error);
+
+        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+}
+
+/* Bus handler that resolves a unit by name and hands the request over to
+ * bus_scope_method_abandon(). Units whose type is not UNIT_SCOPE are refused with an "invalid
+ * arguments" bus error. */
+static int method_abandon_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        const char *name;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "s", &name);
+        if (r < 0)
+                return r;
+
+        r = bus_get_unit_by_name(m, message, name, &unit, error);
+        if (r < 0)
+                return r;
+
+        if (unit->type == UNIT_SCOPE)
+                return bus_scope_method_abandon(message, unit, error);
+
+        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                 "Unit '%s' is not a scope unit, refusing.", name);
+}
+
+/* Bus handler that overrides the manager's show-status setting.
+ *
+ * Message argument: the new mode as a string. An empty string results in _SHOW_STATUS_INVALID
+ * being passed to manager_override_show_status(); a non-empty string that
+ * show_status_from_string() does not recognize is refused with an "invalid arguments" bus error.
+ * The SELinux "reload" access check and the authorization check both run before the argument is
+ * read.
+ *
+ * Returns negative on failure, and 1 while authorization is still pending. */
+static int method_set_show_status(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        ShowStatus mode;
+        const char *t;
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_access_check(message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_set_environment_async(m, message, error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Not authorized yet; the async polkit logic calls us again once it is */
+
+        r = sd_bus_message_read(message, "s", &t);
+        if (r < 0)
+                return r;
+
+        if (isempty(t))
+                mode = _SHOW_STATUS_INVALID;
+        else {
+                mode = show_status_from_string(t);
+                if (mode < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                 "Invalid show status '%s'", t);
+        }
+
+        manager_override_show_status(m, mode, "bus");
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Bus handler that routes the request through method_generic_unit_operation() to
+ * bus_service_method_dump_file_descriptor_store(), passing 0 as the final argument. */
+static int method_dump_unit_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return method_generic_unit_operation(
+                        message,
+                        userdata,
+                        error,
+                        bus_service_method_dump_file_descriptor_store,
+                        0);
+}
+
+const sd_bus_vtable bus_manager_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+
+        SD_BUS_PROPERTY("Version", "s", property_get_version, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Features", "s", property_get_features, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Virtualization", "s", property_get_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ConfidentialVirtualization", "s", property_get_confidential_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Architecture", "s", property_get_architecture, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Tainted", "s", property_get_tainted, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("FirmwareTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FIRMWARE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("LoaderTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_LOADER]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("KernelTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_KERNEL]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0),
+        SD_BUS_PROPERTY("NNames", "u", property_get_hashmap_size, offsetof(Manager, units), 0),
+        SD_BUS_PROPERTY("NFailedUnits", "u", property_get_set_size, offsetof(Manager, failed_units), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0),
+        SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0),
+        SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0),
+        SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0),
+        SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0),
+        SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ShowStatus", "b", property_get_show_status, 0, 0),
+        SD_BUS_PROPERTY("UnitPath", "as", NULL, offsetof(Manager, lookup_paths.search_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultStandardOutput", "s", bus_property_get_exec_output, offsetof(Manager, defaults.std_output), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultStandardError", "s", bus_property_get_exec_output, offsetof(Manager, defaults.std_error), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WatchdogDevice", "s", property_get_watchdog_device, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WatchdogLastPingTimestamp", "t", property_get_watchdog_last_ping_realtime, 0, 0),
+        SD_BUS_PROPERTY("WatchdogLastPingTimestampMonotonic", "t", property_get_watchdog_last_ping_monotonic, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogUSec", "t", property_get_runtime_watchdog, property_set_runtime_watchdog, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreUSec", "t", property_get_pretimeout_watchdog, property_set_pretimeout_watchdog, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreGovernor", "s", property_get_pretimeout_watchdog_governor, property_set_pretimeout_watchdog_governor, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("RebootWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, 0),
+        /* The following item is an obsolete alias */
+        SD_BUS_WRITABLE_PROPERTY("ShutdownWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_WRITABLE_PROPERTY("KExecWatchdogUSec", "t", property_get_kexec_watchdog, property_set_kexec_watchdog, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("ServiceWatchdogs", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, service_watchdogs), 0),
+        SD_BUS_PROPERTY("ControlGroup", "s", NULL, offsetof(Manager, cgroup_root), 0),
+        SD_BUS_PROPERTY("SystemState", "s", property_get_system_state, 0, 0),
+        SD_BUS_PROPERTY("ExitCode", "y", bus_property_get_unsigned, offsetof(Manager, return_value), 0),
+        SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultTimeoutAbortUSec", "t", property_get_default_timeout_abort_usec, 0, 0),
+        SD_BUS_PROPERTY("DefaultDeviceTimeoutUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.device_timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+        /* The following two items are obsolete alias */
+        SD_BUS_PROPERTY("DefaultStartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("DefaultStartLimitInterval", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("DefaultStartLimitBurst", "u", bus_property_get_unsigned, offsetof(Manager, defaults.start_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultCPUAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.cpu_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultBlockIOAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.blockio_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultIOAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.io_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultIPAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.ip_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultMemoryAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.memory_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultTasksAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.tasks_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitCPU", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitCPUSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitFSIZE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitDATA", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitDATASoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitSTACK", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitCORE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitCORESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitRSS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitRSSSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitNOFILE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitAS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitASSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitNPROC", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitLOCKS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitNICE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitNICESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitRTPRIO", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, defaults.tasks_max), 0),
+        SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.memory_pressure_threshold_usec), 0),
+        SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.memory_pressure_watch), 0),
+        SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CtrlAltDelBurstAction", "s", bus_property_get_emergency_action, offsetof(Manager, cad_burst_action), SD_BUS_VTABLE_PROPERTY_CONST),
+
+        SD_BUS_METHOD_WITH_ARGS("GetUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_RESULT("o", unit),
+                                method_get_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitByPID",
+                                SD_BUS_ARGS("u", pid),
+                                SD_BUS_RESULT("o", unit),
+                                method_get_unit_by_pid,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitByInvocationID",
+                                SD_BUS_ARGS("ay", invocation_id),
+                                SD_BUS_RESULT("o", unit),
+                                method_get_unit_by_invocation_id,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitByControlGroup",
+                                SD_BUS_ARGS("s", cgroup),
+                                SD_BUS_RESULT("o", unit),
+                                method_get_unit_by_control_group,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitByPIDFD",
+                                SD_BUS_ARGS("h", pidfd),
+                                SD_BUS_RESULT("o", unit, "s", unit_id, "ay", invocation_id),
+                                method_get_unit_by_pidfd,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("LoadUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_RESULT("o", unit),
+                                method_load_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("StartUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_start_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("StartUnitWithFlags",
+                                SD_BUS_ARGS("s", name, "s", mode, "t", flags),
+                                SD_BUS_RESULT("o", job),
+                                method_start_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("StartUnitReplace",
+                                SD_BUS_ARGS("s", old_unit, "s", new_unit, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_start_unit_replace,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("StopUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_stop_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ReloadUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_reload_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("RestartUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_restart_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("TryRestartUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_try_restart_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ReloadOrRestartUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_reload_or_restart_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestartUnit",
+                                SD_BUS_ARGS("s", name, "s", mode),
+                                SD_BUS_RESULT("o", job),
+                                method_reload_or_try_restart_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("EnqueueUnitJob",
+                                SD_BUS_ARGS("s", name, "s", job_type, "s", job_mode),
+                                SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs),
+                                method_enqueue_unit_job,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("KillUnit",
+                                SD_BUS_ARGS("s", name, "s", whom, "i", signal),
+                                SD_BUS_NO_RESULT,
+                                method_kill_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("QueueSignalUnit",
+                                SD_BUS_ARGS("s", name, "s", whom, "i", signal, "i", value),
+                                SD_BUS_NO_RESULT,
+                                method_kill_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("CleanUnit",
+                                SD_BUS_ARGS("s", name, "as", mask),
+                                SD_BUS_NO_RESULT,
+                                method_clean_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("FreezeUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_freeze_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ThawUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_thaw_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ResetFailedUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_reset_failed_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetUnitProperties",
+                                SD_BUS_ARGS("s", name, "b", runtime, "a(sv)", properties),
+                                SD_BUS_NO_RESULT,
+                                method_set_unit_properties,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("BindMountUnit",
+                                SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir),
+                                SD_BUS_NO_RESULT,
+                                method_bind_mount_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("MountImageUnit",
+                                SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options),
+                                SD_BUS_NO_RESULT,
+                                method_mount_image_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("RefUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_ref_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("UnrefUnit",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_unref_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("StartTransientUnit",
+                                SD_BUS_ARGS("s", name, "s", mode, "a(sv)", properties, "a(sa(sv))", aux),
+                                SD_BUS_RESULT("o", job),
+                                method_start_transient_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitProcesses",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_RESULT("a(sus)", processes),
+                                method_get_unit_processes,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("AttachProcessesToUnit",
+                                SD_BUS_ARGS("s", unit_name, "s", subcgroup, "au", pids),
+                                SD_BUS_NO_RESULT,
+                                method_attach_processes_to_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("AbandonScope",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_abandon_scope,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetJob",
+                                SD_BUS_ARGS("u", id),
+                                SD_BUS_RESULT("o", job),
+                                method_get_job,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetJobAfter",
+                                SD_BUS_ARGS("u", id),
+                                SD_BUS_RESULT("a(usssoo)", jobs),
+                                method_get_job_waiting,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetJobBefore",
+                                SD_BUS_ARGS("u", id),
+                                SD_BUS_RESULT("a(usssoo)", jobs),
+                                method_get_job_waiting,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("CancelJob",
+                                SD_BUS_ARGS("u", id),
+                                SD_BUS_NO_RESULT,
+                                method_cancel_job,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("ClearJobs",
+                      NULL,
+                      NULL,
+                      method_clear_jobs,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("ResetFailed",
+                      NULL,
+                      NULL,
+                      method_reset_failed,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetShowStatus",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_NO_RESULT,
+                                method_set_show_status,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListUnits",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("a(ssssssouso)", units),
+                                method_list_units,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListUnitsFiltered",
+                                SD_BUS_ARGS("as", states),
+                                SD_BUS_RESULT("a(ssssssouso)", units),
+                                method_list_units_filtered,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListUnitsByPatterns",
+                                SD_BUS_ARGS("as", states, "as", patterns),
+                                SD_BUS_RESULT("a(ssssssouso)", units),
+                                method_list_units_by_patterns,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListUnitsByNames",
+                                SD_BUS_ARGS("as", names),
+                                SD_BUS_RESULT("a(ssssssouso)", units),
+                                method_list_units_by_names,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListJobs",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("a(usssoo)", jobs),
+                                method_list_jobs,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Subscribe",
+                      NULL,
+                      NULL,
+                      method_subscribe,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Unsubscribe",
+                      NULL,
+                      NULL,
+                      method_unsubscribe,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Dump",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("s", output),
+                                method_dump,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatterns",
+                                SD_BUS_ARGS("as", patterns),
+                                SD_BUS_RESULT("s", output),
+                                method_dump_units_matching_patterns,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DumpByFileDescriptor",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("h", fd),
+                                method_dump_by_fd,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatternsByFileDescriptor",
+                                SD_BUS_ARGS("as", patterns),
+                                SD_BUS_RESULT("h", fd),
+                                method_dump_units_matching_patterns_by_fd,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("CreateSnapshot",
+                                SD_BUS_ARGS("s", name, "b", cleanup),
+                                SD_BUS_RESULT("o", unit),
+                                method_refuse_snapshot,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_METHOD_WITH_ARGS("RemoveSnapshot",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_NO_RESULT,
+                                method_refuse_snapshot,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_METHOD("Reload",
+                      NULL,
+                      NULL,
+                      method_reload,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Reexecute",
+                      NULL,
+                      NULL,
+                      method_reexecute,
+                      SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_METHOD_NO_REPLY),
+        SD_BUS_METHOD("Exit",
+                      NULL,
+                      NULL,
+                      method_exit,
+                      0),
+        SD_BUS_METHOD("Reboot",
+                      NULL,
+                      NULL,
+                      method_reboot,
+                      SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+        SD_BUS_METHOD_WITH_ARGS("SoftReboot",
+                                SD_BUS_ARGS("s", new_root),
+                                SD_BUS_NO_RESULT,
+                                method_soft_reboot,
+                                SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+        SD_BUS_METHOD("PowerOff",
+                      NULL,
+                      NULL,
+                      method_poweroff,
+                      SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+        SD_BUS_METHOD("Halt",
+                      NULL,
+                      NULL,
+                      method_halt,
+                      SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+        SD_BUS_METHOD("KExec",
+                      NULL,
+                      NULL,
+                      method_kexec,
+                      SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+        SD_BUS_METHOD_WITH_ARGS("SwitchRoot",
+                                SD_BUS_ARGS("s", new_root, "s", init),
+                                SD_BUS_NO_RESULT,
+                                method_switch_root,
+                                SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+        SD_BUS_METHOD_WITH_ARGS("SetEnvironment",
+                                SD_BUS_ARGS("as", assignments),
+                                SD_BUS_NO_RESULT,
+                                method_set_environment,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("UnsetEnvironment",
+                                SD_BUS_ARGS("as", names),
+                                SD_BUS_NO_RESULT,
+                                method_unset_environment,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("UnsetAndSetEnvironment",
+                                SD_BUS_ARGS("as", names, "as", assignments),
+                                SD_BUS_NO_RESULT,
+                                method_unset_and_set_environment,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("EnqueueMarkedJobs",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("ao", jobs),
+                                method_enqueue_marked_jobs,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListUnitFiles",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("a(ss)", unit_files),
+                                method_list_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ListUnitFilesByPatterns",
+                                SD_BUS_ARGS("as", states, "as", patterns),
+                                SD_BUS_RESULT("a(ss)", unit_files),
+                                method_list_unit_files_by_patterns,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitFileState",
+                                SD_BUS_ARGS("s", file),
+                                SD_BUS_RESULT("s", state),
+                                method_get_unit_file_state,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("EnableUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+                                SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+                                method_enable_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DisableUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_disable_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("EnableUnitFilesWithFlags",
+                                SD_BUS_ARGS("as", files, "t", flags),
+                                SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+                                method_enable_unit_files_with_flags,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlags",
+                                SD_BUS_ARGS("as", files, "t", flags),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_disable_unit_files_with_flags,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlagsAndInstallInfo",
+                                SD_BUS_ARGS("as", files, "t", flags),
+                                SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+                                method_disable_unit_files_with_flags_and_install_info,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ReenableUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+                                SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+                                method_reenable_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("LinkUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_link_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("PresetUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+                                SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+                                method_preset_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("PresetUnitFilesWithMode",
+                                SD_BUS_ARGS("as", files, "s", mode, "b", runtime, "b", force),
+                                SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+                                method_preset_unit_files_with_mode,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("MaskUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_mask_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("UnmaskUnitFiles",
+                                SD_BUS_ARGS("as", files, "b", runtime),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_unmask_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("RevertUnitFiles",
+                                SD_BUS_ARGS("as", files),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_revert_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetDefaultTarget",
+                                SD_BUS_ARGS("s", name, "b", force),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_set_default_target,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetDefaultTarget",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("s", name),
+                                method_get_default_target,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("PresetAllUnitFiles",
+                                SD_BUS_ARGS("s", mode, "b", runtime, "b", force),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_preset_all_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("AddDependencyUnitFiles",
+                                SD_BUS_ARGS("as", files, "s", target, "s", type, "b", runtime, "b", force),
+                                SD_BUS_RESULT("a(sss)", changes),
+                                method_add_dependency_unit_files,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUnitFileLinks",
+                                SD_BUS_ARGS("s", name, "b", runtime),
+                                SD_BUS_RESULT("as", links),
+                                method_get_unit_file_links,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetExitCode",
+                                SD_BUS_ARGS("y", number),
+                                SD_BUS_NO_RESULT,
+                                method_set_exit_code,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByName",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_RESULT("u", uid),
+                                method_lookup_dynamic_user_by_name,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByUID",
+                                SD_BUS_ARGS("u", uid),
+                                SD_BUS_RESULT("s", name),
+                                method_lookup_dynamic_user_by_uid,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetDynamicUsers",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("a(us)", users),
+                                method_get_dynamic_users,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("DumpUnitFileDescriptorStore",
+                                SD_BUS_ARGS("s", name),
+                                SD_BUS_RESULT("a(suuutuusu)", entries),
+                                method_dump_unit_descriptor_store,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        SD_BUS_SIGNAL_WITH_ARGS("UnitNew",
+                                SD_BUS_ARGS("s", id, "o", unit),
+                                0),
+        SD_BUS_SIGNAL_WITH_ARGS("UnitRemoved",
+                                SD_BUS_ARGS("s", id, "o", unit),
+                                0),
+        SD_BUS_SIGNAL_WITH_ARGS("JobNew",
+                                SD_BUS_ARGS("u", id, "o", job, "s", unit),
+                                0),
+        SD_BUS_SIGNAL_WITH_ARGS("JobRemoved",
+                                SD_BUS_ARGS("u", id, "o", job, "s", unit, "s", result),
+                                0),
+        SD_BUS_SIGNAL_WITH_ARGS("StartupFinished",
+                                SD_BUS_ARGS("t", firmware, "t", loader, "t", kernel, "t", initrd, "t", userspace, "t", total),
+                                0),
+        SD_BUS_SIGNAL("UnitFilesChanged", NULL, 0),
+        SD_BUS_SIGNAL_WITH_ARGS("Reloading",
+                                SD_BUS_ARGS("b", active),
+                                0),
+
+        SD_BUS_VTABLE_END
+};
+
+const sd_bus_vtable bus_manager_log_control_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+
+        /* Private version of the generic log control interface: we want slightly different setters, so
+         * the writable LogLevel/LogTarget use property_set_log_level()/property_set_log_target(), which
+         * are shared with the Manager interface above (which pre-dates the generic service API
+         * interface). The bus_property_get_*() getters are generic; SyslogIdentifier is read-only. */
+
+        SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0),
+        SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0),
+        SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0),
+
+        SD_BUS_VTABLE_END,
+};
+
+/* bus_foreach_bus() callback: emits StartupFinished on one bus. userdata is the usec_t[6] built by
+ * bus_manager_send_finished(): firmware, loader, kernel, initrd, userspace, total. */
+static int send_finished(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        usec_t *usec = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        r = sd_bus_message_new_signal(
+                        bus, &msg,
+                        "/org/freedesktop/systemd1",
+                        "org.freedesktop.systemd1.Manager", "StartupFinished");
+        if (r >= 0)
+                r = sd_bus_message_append(msg, "tttttt",
+                                          usec[0], usec[1], usec[2], usec[3], usec[4], usec[5]);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, msg, NULL);
+}
+
+/* Sends the StartupFinished signal with the given timings via bus_foreach_bus(). A failure is only
+ * passed to log_debug_errno(); nothing is reported back to the caller. */
+void bus_manager_send_finished(
+                Manager *m,
+                usec_t firmware_usec,
+                usec_t loader_usec,
+                usec_t kernel_usec,
+                usec_t initrd_usec,
+                usec_t userspace_usec,
+                usec_t total_usec) {
+
+        /* Index order must match what send_finished() reads out of userdata. */
+        usec_t timings[6] = {
+                firmware_usec,
+                loader_usec,
+                kernel_usec,
+                initrd_usec,
+                userspace_usec,
+                total_usec,
+        };
+        int r;
+
+        assert(m);
+
+        r = bus_foreach_bus(m, NULL, send_finished, timings);
+        if (r < 0)
+                log_debug_errno(r, "Failed to send finished signal: %m");
+}
+
+/* bus_foreach_bus() callback: emits the Reloading signal on one bus. userdata carries the boolean
+ * "active" argument, packed with INT_TO_PTR() by bus_manager_send_reloading(). */
+static int send_reloading(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        int r;
+
+        assert(bus);
+
+        r = sd_bus_message_new_signal(bus, &msg, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "Reloading");
+        if (r >= 0)
+                r = sd_bus_message_append(msg, "b", PTR_TO_INT(userdata));
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, msg, NULL);
+}
+
+void bus_manager_send_reloading(Manager *m, bool active) {
+        /* The flag travels to send_reloading() packed into the userdata pointer */
+        void *packed = INT_TO_PTR(active);
+
+        assert(m);
+        int r = bus_foreach_bus(m, NULL, send_reloading, packed);
+        if (r < 0)
+                log_debug_errno(r, "Failed to send reloading signal: %m");
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+        static const char object[] = "/org/freedesktop/systemd1", iface[] = "org.freedesktop.systemd1.Manager";
+
+        assert(bus);
+
+        /* bus_foreach_bus() callback; userdata is unused and no individual property names are passed */
+        return sd_bus_emit_properties_changed_strv(bus, object, iface, NULL);
+}
+
+void bus_manager_send_change_signal(Manager *m) {
+        /* Best effort: a failure to emit the signal is logged at debug level
+         * and otherwise ignored. */
+        assert(m);
+
+        int r = bus_foreach_bus(m, NULL, send_changed_signal, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to send manager change signal: %m");
+}
diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h
new file mode 100644
index 0000000..9b05080
--- /dev/null
+++ b/src/core/dbus-manager.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+#include "manager.h"
+
+extern const sd_bus_vtable bus_manager_vtable[];
+extern const sd_bus_vtable bus_manager_log_control_vtable[];
+
+void bus_manager_send_finished(Manager *m, usec_t firmware_usec, usec_t loader_usec, usec_t kernel_usec, usec_t initrd_usec, usec_t userspace_usec, usec_t total_usec);
+void bus_manager_send_reloading(Manager *m, bool active);
+void bus_manager_send_change_signal(Manager *m);
+
+int verify_run_space_and_log(const char *message);
+
+int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_emergency_action(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c
new file mode 100644
index 0000000..7dbbdd0
--- /dev/null
+++ b/src/core/dbus-mount.c
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-mount.h"
+#include "dbus-util.h"
+#include "mount.h"
+#include "string-util.h"
+#include "unit.h"
+#include "utf8.h"
+
+/* Getter for the "What" property: replies with the string from mount_get_what_escaped(), -ENOMEM if that is NULL. */
+static int property_get_what(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Mount *mount = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        _cleanup_free_ char *what = mount_get_what_escaped(mount);
+        if (!what)
+                return -ENOMEM;
+
+        return sd_bus_message_append_basic(reply, 's', what);
+}
+
+/* Getter for the "Options" property: replies with the string from mount_get_options_escaped(), -ENOMEM if NULL. */
+static int property_get_options(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Mount *mount = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        _cleanup_free_ char *options = mount_get_options_escaped(mount);
+        if (!options)
+                return -ENOMEM;
+
+        return sd_bus_message_append_basic(reply, 's', options);
+}
+
+static BUS_DEFINE_PROPERTY_GET(property_get_type, "s", Mount, mount_get_fstype); /* backs "Type" in bus_mount_vtable */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, mount_result, MountResult); /* backs "Result" in bus_mount_vtable */
+
+const sd_bus_vtable bus_mount_vtable[] = { /* sd-bus vtable with the properties exported for mount units */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("LazyUnmount", "b", bus_property_get_bool, offsetof(Mount, lazy_unmount), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ForceUnmount", "b", bus_property_get_bool, offsetof(Mount, force_unmount), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReadWriteOnly", "b", bus_property_get_bool, offsetof(Mount, read_write_only), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), /* NOTE(review): offset is into Unit, not Mount; presumably Unit is Mount's first member, confirm */
+        SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), /* same Unit-relative offset as "UID" */
+        BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_VTABLE_END
+};
+
+/* Applies one mount-specific setting of a transient unit. Returns 0 if 'name' is none of the settings handled
+ * here, otherwise the result of the matching bus_set_transient_*() helper. */
+static int bus_mount_set_transient_property(
+                Mount *m,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Unit *unit = UNIT(m);
+
+        assert(m);
+        assert(name);
+        assert(message);
+
+        /* Every helper below is invoked with UNIT_PRIVATE added to the caller's flags */
+        flags |= UNIT_PRIVATE;
+
+        /* Path and string settings */
+        if (streq(name, "Where"))
+                return bus_set_transient_path(unit, name, &m->where, message, flags, error);
+        if (streq(name, "What"))
+                return bus_set_transient_string(unit, name, &m->parameters_fragment.what, message, flags, error);
+        if (streq(name, "Options"))
+                return bus_set_transient_string(unit, name, &m->parameters_fragment.options, message, flags, error);
+        if (streq(name, "Type"))
+                return bus_set_transient_string(unit, name, &m->parameters_fragment.fstype, message, flags, error);
+
+        /* Timeout and directory mode */
+        if (streq(name, "TimeoutUSec"))
+                return bus_set_transient_usec_fix_0(unit, name, &m->timeout_usec, message, flags, error);
+        if (streq(name, "DirectoryMode"))
+                return bus_set_transient_mode_t(unit, name, &m->directory_mode, message, flags, error);
+
+        /* Boolean switches */
+        if (streq(name, "SloppyOptions"))
+                return bus_set_transient_bool(unit, name, &m->sloppy_options, message, flags, error);
+        if (streq(name, "LazyUnmount"))
+                return bus_set_transient_bool(unit, name, &m->lazy_unmount, message, flags, error);
+        if (streq(name, "ForceUnmount"))
+                return bus_set_transient_bool(unit, name, &m->force_unmount, message, flags, error);
+        if (streq(name, "ReadWriteOnly"))
+                return bus_set_transient_bool(unit, name, &m->read_write_only, message, flags, error);
+
+        /* Not a setting handled here */
+        return 0;
+}
+
+/* Property setter for mount units. The cgroup settings are tried first; the mount, exec and kill settings are
+ * only tried for transient units whose load state is UNIT_STUB. The first non-zero result is returned. */
+int bus_mount_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Mount *mount = MOUNT(u);
+        int r;
+
+        assert(mount);
+        assert(name);
+        assert(message);
+
+        r = bus_cgroup_set_property(u, &mount->cgroup_context, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        /* Everything below is reserved for transient units that are still in the
+         * UNIT_STUB load state. */
+        if (!u->transient || u->load_state != UNIT_STUB)
+                return 0;
+
+        r = bus_mount_set_transient_property(mount, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        r = bus_exec_context_set_transient_property(u, &mount->exec_context, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        /* Last handler in the chain: its result, including 0, is the final answer */
+        return bus_kill_context_set_transient_property(u, &mount->kill_context, name, message, flags, error);
+}
+
+/* Calls unit_realize_cgroup() on the unit; its outcome is not propagated, this always returns 0. */
+int bus_mount_commit_properties(Unit *u) {
+        assert(u);
+
+        (void) unit_realize_cgroup(u);
+        return 0;
+}
diff --git a/src/core/dbus-mount.h b/src/core/dbus-mount.h
new file mode 100644
index 0000000..5a848d3
--- /dev/null
+++ b/src/core/dbus-mount.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_mount_vtable[];
+
+int bus_mount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_mount_commit_properties(Unit *u);
diff --git a/src/core/dbus-path.c b/src/core/dbus-path.c
new file mode 100644
index 0000000..8cb6a26
--- /dev/null
+++ b/src/core/dbus-path.c
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-path.h"
+#include "dbus-util.h"
+#include "list.h"
+#include "path.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, path_result, PathResult); /* backs "Result" in bus_path_vtable */
+
+/* Getter for the "Paths" property.
+ *
+ * Serializes the unit's specs list as an array of (type name, path) string pairs, in list order. Any
+ * sd-bus error is passed straight back to the caller. */
+static int property_get_paths(
+                sd_bus *bus, const char *path, const char *interface, const char *property,
+                sd_bus_message *reply, void *userdata, sd_bus_error *error) {
+
+        Path *self = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(spec, entry, self->specs) {
+                /* One struct per spec: the type rendered as a string, then the path */
+                r = sd_bus_message_append(reply, "(ss)", path_type_to_string(entry->type), entry->path);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_path_vtable[] = { /* sd-bus vtable with the properties exported for path units */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Paths", "a(ss)", property_get_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), /* array of (type name, path) pairs */
+        SD_BUS_PROPERTY("MakeDirectory", "b", bus_property_get_bool, offsetof(Path, make_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Path, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Path, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Path, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Path, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_VTABLE_END
+};
+
+/* Applies one path-unit setting of a transient unit; returns 0 if 'name' is not one of the settings below. */
+static int bus_path_set_transient_property(
+                Path *p,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Unit *unit = UNIT(p);
+        int r;
+
+        assert(p);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "MakeDirectory"))
+                return bus_set_transient_bool(unit, name, &p->make_directory, message, flags, error);
+        if (streq(name, "DirectoryMode"))
+                return bus_set_transient_mode_t(unit, name, &p->directory_mode, message, flags, error);
+        if (streq(name, "TriggerLimitBurst"))
+                return bus_set_transient_unsigned(unit, name, &p->trigger_limit.burst, message, flags, error);
+        if (streq(name, "TriggerLimitIntervalUSec"))
+                return bus_set_transient_usec(unit, name, &p->trigger_limit.interval, message, flags, error);
+
+        if (!streq(name, "Paths"))
+                return 0;
+
+        /* "Paths" carries (type name, path) pairs: all are validated, specs are only created if not a no-op */
+        const bool apply = !UNIT_WRITE_FLAGS_NOOP(flags);
+        size_t n_pairs = 0;
+
+        r = sd_bus_message_enter_container(message, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                const char *type_name, *path;
+                PathType type;
+
+                r = sd_bus_message_read(message, "(ss)", &type_name, &path);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                type = path_type_from_string(type_name);
+                if (type < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown path type: %s", type_name);
+
+                if (isempty(path))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is empty", type_name);
+
+                if (!path_is_absolute(path))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is not absolute: %s", type_name, path);
+
+                n_pairs++;
+                if (!apply)
+                        continue;
+                _cleanup_free_ char *simplified = NULL;
+                r = path_simplify_alloc(path, &simplified);
+                if (r < 0)
+                        return r;
+
+                PathSpec *entry = new(PathSpec, 1);
+                if (!entry)
+                        return -ENOMEM;
+
+                *entry = (PathSpec) {
+                        .unit = unit,
+                        .path = TAKE_PTR(simplified),
+                        .type = type,
+                        .inotify_fd = -EBADF,
+                };
+                LIST_PREPEND(spec, p->specs, entry);
+
+                unit_write_settingf(unit, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", type_name, path);
+        }
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        if (apply && n_pairs == 0) { /* an empty array resets the list of specs */
+                path_free_specs(p);
+                unit_write_settingf(unit, flags, name, "PathExists=");
+        }
+
+        return 1;
+}
+
+/* Property setter for path units: settings are only processed for transient units in UNIT_STUB load state. */
+int bus_path_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags mode,
+                sd_bus_error *error) {
+
+        Path *self = PATH(u);
+
+        assert(self);
+        assert(name);
+        assert(message);
+
+        return u->transient && u->load_state == UNIT_STUB
+                ? bus_path_set_transient_property(self, name, message, mode, error)
+                : 0;
+}
diff --git a/src/core/dbus-path.h b/src/core/dbus-path.h
new file mode 100644
index 0000000..b5018b0
--- /dev/null
+++ b/src/core/dbus-path.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_path_vtable[];
+
+int bus_path_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
new file mode 100644
index 0000000..78196a1
--- /dev/null
+++ b/src/core/dbus-scope.c
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-scope.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "scope.h"
+#include "selinux-access.h"
+#include "unit.h"
+
+/* Handler of the "Abandon" method: runs the SELinux and manage-units checks, then calls scope_abandon(). */
+int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Scope *scope = ASSERT_PTR(userdata);
+        Unit *unit = UNIT(scope);
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(unit, message, "stop", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async(unit->manager, message, error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* 0: no authorization for now, the async polkit stuff calls us again when it has it */
+
+        r = scope_abandon(scope);
+        if (r == -ESTALE)
+                return sd_bus_error_setf(error, BUS_ERROR_SCOPE_NOT_RUNNING, "Scope %s is not running, cannot abandon.", unit->id);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult); /* backs "Result" in bus_scope_vtable */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string); /* NOTE(review): presumably defines bus_set_transient_oom_policy(), confirm */
+
+const sd_bus_vtable bus_scope_vtable[] = { /* properties, the RequestStop signal and the Abandon method of scope units */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Controller", "s", NULL, offsetof(Scope, controller), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Scope, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_SIGNAL("RequestStop", NULL, 0), /* sent by bus_scope_send_request_stop() */
+        SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_method_abandon, SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_VTABLE_END
+};
+
+/* Applies one scope-specific setting; bus_scope_set_property() calls this only while the load state is
+ * UNIT_STUB. Returns 0 if 'name' is not handled here, 1 once "PIDs", "PIDFDs" or "Controller" was processed
+ * (-EINVAL for an empty PID array), the generic helper's result otherwise, negative errno-style on failure. */
+static int bus_scope_set_transient_property(
+                Scope *s, const char *name, sd_bus_message *message,
+                UnitWriteFlags flags, sd_bus_error *error) {
+
+        Unit *u = UNIT(s);
+        int r;
+
+        assert(s);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "TimeoutStopUSec"))
+                return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error);
+
+        if (streq(name, "RuntimeMaxUSec"))
+                return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error);
+
+        if (streq(name, "RuntimeRandomizedExtraUSec"))
+                return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error);
+
+        if (streq(name, "OOMPolicy"))
+                return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
+        if (streq(name, "PIDs")) {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+                unsigned n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "u");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+                        uint32_t upid;
+                        pid_t pid;
+
+                        r = sd_bus_message_read(message, "u", &upid);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        if (upid == 0) { /* 0 is shorthand for the PID of the message sender */
+                                if (!creds) { /* queried once, then reused for any further 0 entries */
+                                        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                r = sd_bus_creds_get_pid(creds, &pid);
+                                if (r < 0)
+                                        return r;
+                        } else
+                                pid = (pid_t) upid; /* the wire type is unsigned, the target is a pid_t */
+
+                        r = pidref_set_pid(&pidref, pid);
+                        if (r < 0)
+                                return r;
+
+                        r = unit_pid_attachable(u, &pidref, error);
+                        if (r < 0)
+                                return r;
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+                                if (r < 0 && r != -EEXIST) /* -EEXIST is tolerated */
+                                        return r;
+                        }
+
+                        n++;
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                return n == 0 ? -EINVAL : 1; /* an empty array is refused */
+        }
+
+        if (streq(name, "PIDFDs")) {
+                unsigned n = 0;
+
+                r = sd_bus_message_enter_container(message, 'a', "h");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+                        int fd;
+
+                        r = sd_bus_message_read(message, "h", &fd);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        r = pidref_set_pidfd(&pidref, fd);
+                        if (r < 0)
+                                return r;
+
+                        r = unit_pid_attachable(u, &pidref, error);
+                        if (r < 0)
+                                return r;
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+                                if (r < 0 && r != -EEXIST) /* -EEXIST is tolerated */
+                                        return r;
+                        }
+
+                        n++;
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                return n == 0 ? -EINVAL : 1; /* an empty array is refused */
+        }
+
+        if (streq(name, "Controller")) {
+                const char *controller;
+
+                /* We can't support direct connections with this, as direct connections know no service or unique name
+                 * concept, but the Controller field stores exactly that. */
+                if (sd_bus_message_get_bus(message) != u->manager->api_bus)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Sorry, Controller= logic only supported via the bus.");
+
+                r = sd_bus_message_read(message, "s", &controller);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(controller) && !sd_bus_service_name_is_valid(controller))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Controller '%s' is not a valid bus name.", controller);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        r = free_and_strdup(&s->controller, empty_to_null(controller)); /* an empty string clears the field */
+                        if (r < 0)
+                                return r;
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Property setter for scope units: cgroup settings first, then (UNIT_STUB load state only) scope, kill, User, Group. */
+int bus_scope_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Scope *scope = SCOPE(u);
+        int r;
+
+        assert(scope);
+        assert(name);
+        assert(message);
+
+        r = bus_cgroup_set_property(u, &scope->cgroup_context, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        /* While we are created we still accept PIDs, once past UNIT_STUB nothing below applies anymore */
+        if (u->load_state != UNIT_STUB)
+                return 0;
+
+        r = bus_scope_set_transient_property(scope, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        r = bus_kill_context_set_transient_property(u, &scope->kill_context, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        if (streq(name, "User"))
+                return bus_set_transient_user_relaxed(u, name, &scope->user, message, flags, error);
+        if (streq(name, "Group"))
+                return bus_set_transient_user_relaxed(u, name, &scope->group, message, flags, error);
+
+        return 0;
+}
+
+/* Calls unit_realize_cgroup() on the unit; its outcome is not propagated, this always returns 0. */
+int bus_scope_commit_properties(Unit *u) {
+        assert(u);
+
+        (void) unit_realize_cgroup(u);
+        return 0;
+}
+
+/* Sends the "RequestStop" signal to the scope's controller; returns 0 without sending if no controller is set. */
+int bus_scope_send_request_stop(Scope *s) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        _cleanup_free_ char *object_path = NULL;
+        sd_bus *bus;
+        int r;
+
+        assert(s);
+
+        if (!s->controller)
+                return 0;
+
+        object_path = unit_dbus_path(UNIT(s));
+        if (!object_path)
+                return -ENOMEM;
+
+        bus = UNIT(s)->manager->api_bus;
+        r = sd_bus_message_new_signal(
+                        bus, &msg, object_path, "org.freedesktop.systemd1.Scope", "RequestStop");
+        if (r < 0)
+                return r;
+
+        /* Addressed to the controller's bus name only */
+        return sd_bus_send_to(bus, msg, s->controller, NULL);
+}
+
+/* sd_bus_track handler registered by bus_scope_track_controller(): drops the controller name and the tracker. */
+static int on_controller_gone(sd_bus_track *track, void *userdata) {
+        Scope *s = ASSERT_PTR(userdata);
+
+        assert(track);
+
+        if (s->controller) {
+                log_unit_debug(UNIT(s), "Controller %s disappeared from bus.", s->controller);
+                unit_add_to_dbus_queue(UNIT(s));
+                s->controller = mfree(s->controller);
+        }
+
+        s->controller_track = sd_bus_track_unref(s->controller_track);
+        return 0;
+}
+
+/* Sets up tracking of the controller's bus name; returns 0 right away if there is no controller or a tracker exists. */
+int bus_scope_track_controller(Scope *s) {
+        int r;
+
+        assert(s);
+
+        if (s->controller_track || !s->controller)
+                return 0;
+
+        r = sd_bus_track_new(UNIT(s)->manager->api_bus, &s->controller_track, on_controller_gone, s);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_track_add_name(s->controller_track, s->controller);
+        if (r >= 0)
+                return 0;
+
+        s->controller_track = sd_bus_track_unref(s->controller_track); /* drop the tracker that could not be armed */
+        return r;
+}
diff --git a/src/core/dbus-scope.h b/src/core/dbus-scope.h
new file mode 100644
index 0000000..8f1bc02
--- /dev/null
+++ b/src/core/dbus-scope.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "scope.h"
+#include "unit.h"
+
+extern const sd_bus_vtable bus_scope_vtable[];
+
+int bus_scope_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
+int bus_scope_commit_properties(Unit *u);
+
+int bus_scope_send_request_stop(Scope *s);
+
+int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+int bus_scope_track_controller(Scope *s);
diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c
new file mode 100644
index 0000000..cc478f4
--- /dev/null
+++ b/src/core/dbus-service.c
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "alloc-util.h"
+#include "async.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-service.h"
+#include "dbus-util.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "locale-util.h"
+#include "missing_fcntl.h"
+#include "mount-util.h"
+#include "open-file.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "selinux-access.h"
+#include "service.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, service_type, ServiceType); /* NOTE(review): this and the getters below are presumably wired up in the service vtable, confirm */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exit_type, service_exit_type, ServiceExitType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, service_result, ServiceResult);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, ServiceRestart);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart_mode, service_restart_mode, ServiceRestartMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction);
+static BUS_DEFINE_PROPERTY_GET2(property_get_notify_access, "s", Service, service_get_notify_access, notify_access_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_restart_usec_next, "t", Service, service_restart_usec_next);
+static BUS_DEFINE_PROPERTY_GET(property_get_timeout_abort_usec, "t", Service, service_timeout_abort_usec);
+static BUS_DEFINE_PROPERTY_GET(property_get_watchdog_usec, "t", Service, service_get_watchdog_usec);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode);
+
+/* Getter for a list of OpenFile entries.
+ *
+ * userdata points at the list head pointer. Each entry is serialized as a (path, fdname, flags) struct,
+ * in list order; any sd-bus error is passed straight back to the caller. */
+static int property_get_open_files(
+                sd_bus *bus, const char *path, const char *interface, const char *property,
+                sd_bus_message *reply, void *userdata, sd_bus_error *error) {
+
+        OpenFile **head = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(sst)");
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(open_files, entry, *head) {
+                /* The flags are cast to 64 bit to match the 't' in the signature */
+                r = sd_bus_message_append(reply, "(sst)", entry->path, entry->fdname, (uint64_t) entry->flags);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Getter for an ExitStatusSet, serialized as a struct of two int32 arrays ("aiai").
+ *
+ * The first array lists the members of the 'status' bitmap, the second those members of the 'signal'
+ * bitmap for which signal_to_string() returns a name. */
+static int property_get_exit_status_set(
+                sd_bus *bus, const char *path, const char *interface, const char *property,
+                sd_bus_message *reply, void *userdata, sd_bus_error *error) {
+
+        const ExitStatusSet *set = ASSERT_PTR(userdata);
+        unsigned n;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'r', "aiai");
+        if (r < 0)
+                return r;
+
+        /* First array: the exit statuses */
+        r = sd_bus_message_open_container(reply, 'a', "i");
+        if (r < 0)
+                return r;
+
+        BITMAP_FOREACH(n, &set->status) {
+                assert(n < 256);
+
+                r = sd_bus_message_append_basic(reply, 'i', &n);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        /* Second array: the signals */
+        r = sd_bus_message_open_container(reply, 'a', "i");
+        if (r < 0)
+                return r;
+
+        BITMAP_FOREACH(n, &set->signal) {
+                /* Numbers for which no signal name is known are left out */
+                if (!signal_to_string(n))
+                        continue;
+
+                r = sd_bus_message_append_basic(reply, 'i', &n);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        /* Finally close the enclosing struct */
+        return sd_bus_message_close_container(reply);
+}
+
+static int bus_service_method_mount(sd_bus_message *message, void *userdata, sd_bus_error *error, bool is_image) {
+        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+        const char *dest, *src, *propagate_directory;
+        int read_only, make_file_or_directory;
+        Unit *u = ASSERT_PTR(userdata);
+        ExecContext *c;
+        int r;
+        /* Common backend of bus_service_method_bind_mount() and bus_service_method_mount_image(); is_image picks the variant. */
+        assert(message);
+
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Adding bind mounts at runtime is only supported for system managers.");
+
+        r = mac_selinux_unit_access_check(u, message, "start", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "ssbb", &src, &dest, &read_only, &make_file_or_directory);
+        if (r < 0)
+                return r;
+
+        if (!path_is_absolute(src) || !path_is_normalized(src))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized.");
+        /* For a plain bind mount an empty destination means "same path as the source"; images need an explicit one */
+        if (!is_image && isempty(dest))
+                dest = src;
+        else if (!path_is_absolute(dest) || !path_is_normalized(dest))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized.");
+        /* Only the image variant reads mount options from the message */
+        if (is_image) {
+                r = bus_read_mount_options(message, error, &options, NULL, "");
+                if (r < 0)
+                        return r;
+        }
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        is_image ? "mount-image" : "bind-mount",
+                        CAP_SYS_ADMIN,
+                        N_("Authentication is required to mount on '$(unit)'."),
+                        true,
+                        message,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        if (u->type != UNIT_SERVICE)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not of type .service");
+
+        /* If it would be dropped at startup time, return an error. The context should always be available, but
+         * there's an assert in exec_needs_mount_namespace, so double-check just in case. */
+        c = unit_get_exec_context(u);
+        if (!c)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot access unit execution context");
+        if (path_startswith_strv(dest, c->inaccessible_paths))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s is not accessible to this unit", dest);
+
+        /* Ensure that the unit was started in a private mount namespace */
+        if (!exec_needs_mount_namespace(c, NULL, unit_get_exec_runtime(u)))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit not running in private mount namespace, cannot activate bind mount");
+        /* The main PID is handed to the mount helpers below, hence it must be set and the unit active or reloading */
+        PidRef* unit_pid = unit_main_pid(u);
+        if (!pidref_is_set(unit_pid) || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not running");
+
+        propagate_directory = strjoina("/run/systemd/propagate/", u->id);
+        if (is_image)
+                r = mount_image_in_namespace(
+                                unit_pid,
+                                propagate_directory,
+                                "/run/systemd/incoming/",
+                                src, dest,
+                                read_only,
+                                make_file_or_directory,
+                                options,
+                                c->mount_image_policy ?: &image_policy_service);
+        else
+                r = bind_mount_in_namespace(
+                                unit_pid,
+                                propagate_directory,
+                                "/run/systemd/incoming/",
+                                src, dest,
+                                read_only,
+                                make_file_or_directory);
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in unit's namespace: %m", src, dest);
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+/* Handler of the "BindMount" bus method: runs bus_service_method_mount() with is_image=false. */
+int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_service_method_mount(message, userdata, error, false);
+}
+/* Handler of the "MountImage" bus method: runs bus_service_method_mount() with is_image=true. */
+int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_service_method_mount(message, userdata, error, true);
+}
+
+int bus_service_method_dump_file_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Service *s = ASSERT_PTR(userdata);
+        int ret;
+
+        /* Replies with one "(suuutuusu)" record per fd store entry: name, st_mode, major/minor of st_dev,
+         * st_ino, major/minor of st_rdev, path and open flags. Entries that cannot be queried are skipped. */
+        assert(message);
+
+        ret = mac_selinux_unit_access_check(UNIT(s), message, "status", error);
+        if (ret < 0)
+                return ret;
+
+        if (s->n_fd_store_max == 0 && s->n_fd_store == 0)
+                return sd_bus_error_setf(error, BUS_ERROR_FILE_DESCRIPTOR_STORE_DISABLED, "File descriptor store not enabled for %s.", UNIT(s)->id);
+
+        ret = sd_bus_message_new_method_return(message, &reply);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_open_container(reply, 'a', "(suuutuusu)");
+        if (ret < 0)
+                return ret;
+
+        LIST_FOREACH(fd_store, entry, s->fd_store) {
+                _cleanup_free_ char *fd_path = NULL;
+                struct stat sb;
+
+                if (fstat(entry->fd, &sb) < 0) {
+                        log_debug_errno(errno, "Failed to stat() file descriptor entry '%s', skipping.", strna(entry->fdname));
+                        continue;
+                }
+
+                int open_flags = fcntl(entry->fd, F_GETFL);
+                if (open_flags < 0) {
+                        log_debug_errno(errno, "Failed to issue F_GETFL on file descriptor entry '%s', skipping.", strna(entry->fdname));
+                        continue;
+                }
+
+                /* On builds with 64-bit off_t glibc implies O_LARGEFILE everywhere, yet still reports it via
+                 * F_GETFL and offers no definition to test for it. Strip the bit by hand so that clients are
+                 * not confused. */
+                open_flags &= ~RAW_O_LARGEFILE;
+
+                (void) fd_get_path(entry->fd, &fd_path);
+
+                ret = sd_bus_message_append(
+                                reply, "(suuutuusu)",
+                                entry->fdname,
+                                (uint32_t) sb.st_mode,
+                                (uint32_t) major(sb.st_dev), (uint32_t) minor(sb.st_dev),
+                                (uint64_t) sb.st_ino,
+                                (uint32_t) major(sb.st_rdev), (uint32_t) minor(sb.st_rdev),
+                                fd_path,
+                                (uint32_t) open_flags);
+                if (ret < 0)
+                        return ret;
+        }
+
+        ret = sd_bus_message_close_container(reply);
+        if (ret < 0)
+                return ret;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+#if __SIZEOF_SIZE_T__ == 8
+static int property_get_size_as_uint32(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Exposes a 64-bit size_t through the 32-bit D-Bus "u" type; values that do not fit are clamped to UINT32_MAX. */
+
+        const size_t *sz = ASSERT_PTR(userdata);
+        uint32_t clamped = *sz < UINT32_MAX ? (uint32_t) *sz : UINT32_MAX;
+
+        return sd_bus_message_append_basic(reply, 'u', &clamped);
+}
+#elif __SIZEOF_SIZE_T__ == 4
+#define property_get_size_as_uint32 ((sd_bus_property_get_t) NULL)
+#else
+#error "Unexpected size of size_t"
+#endif
+/* D-Bus vtable for service units: properties backed by Service (a few by Unit) fields, plus three methods. */
+const sd_bus_vtable bus_service_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ExitType", "s", property_get_exit_type, offsetof(Service, exit_type), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Restart", "s", property_get_restart, offsetof(Service, restart), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartMode", "s", property_get_restart_mode, offsetof(Service, restart_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PIDFile", "s", NULL, offsetof(Service, pid_file), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NotifyAccess", "s", property_get_notify_access, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartSteps", "u", bus_property_get_unsigned, offsetof(Service, restart_steps), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartMaxDelayUSec", "t", bus_property_get_usec, offsetof(Service, restart_max_delay_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartUSecNext", "t", property_get_restart_usec_next, 0, 0),
+        SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimeoutAbortUSec", "t", property_get_timeout_abort_usec, 0, 0),
+        SD_BUS_PROPERTY("TimeoutStartFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_start_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimeoutStopFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_stop_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Service, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WatchdogUSec", "t", property_get_watchdog_usec, 0, 0),
+        BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0),
+        SD_BUS_PROPERTY("PermissionsStartOnly", "b", bus_property_get_bool, offsetof(Service, permissions_start_only), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* 😷 deprecated */
+        SD_BUS_PROPERTY("RootDirectoryStartOnly", "b", bus_property_get_bool, offsetof(Service, root_directory_start_only), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RemainAfterExit", "b", bus_property_get_bool, offsetof(Service, remain_after_exit), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("GuessMainPID", "b", bus_property_get_bool, offsetof(Service, guess_main_pid), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartPreventExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_prevent_status), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestartForceExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_force_status), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SuccessExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, success_status), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NFileDescriptorStore", "u", property_get_size_as_uint32, offsetof(Service, n_fd_store), 0),
+        SD_BUS_PROPERTY("FileDescriptorStorePreserve", "s", bus_property_get_exec_preserve_mode, offsetof(Service, fd_store_preserve_mode), 0),
+        SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("StatusErrno", "i", bus_property_get_int, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("ReloadResult", "s", property_get_result, offsetof(Service, reload_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("CleanResult", "s", property_get_result, offsetof(Service, clean_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OpenFile", "a(sst)", property_get_open_files, offsetof(Service, open_files), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReloadSignal", "i", bus_property_get_int, offsetof(Service, reload_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+        /* Main process status, then every exec_command[] list twice: via the plain and the "Ex" macro, from the same offset */
+        BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecCondition", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecConditionEx", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPreEx", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartEx", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPostEx", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecReload", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecReloadEx", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStop", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopPostEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        /* Methods; all three are flagged SD_BUS_VTABLE_UNPRIVILEGED here */
+        SD_BUS_METHOD_WITH_ARGS("BindMount",
+                                SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir),
+                                SD_BUS_NO_RESULT,
+                                bus_service_method_bind_mount,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        SD_BUS_METHOD_WITH_ARGS("MountImage",
+                                 SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options),
+                                 SD_BUS_NO_RESULT,
+                                 bus_service_method_mount_image,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+
+        SD_BUS_METHOD_WITH_ARGS("DumpFileDescriptorStore",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_ARGS("a(suuutuusu)", entries),
+                                bus_service_method_dump_file_descriptor_store,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* The following five are obsolete, and thus marked hidden here. They moved into the Unit interface */
+        SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_VTABLE_END
+};
+
+static int bus_set_transient_exit_status(
+                Unit *u,
+                const char *name,
+                ExitStatusSet *status_set,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+        /* Reads an "(aiai)" struct (exit codes, then signal numbers) from message and adds both to status_set. */
+        const int32_t *codes, *signals;
+        size_t n_codes, n_signals;
+        int ret;
+
+        ret = sd_bus_message_enter_container(message, 'r', "aiai");
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_read_array(message, 'i', (const void **) &codes, &n_codes);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_read_array(message, 'i', (const void **) &signals, &n_signals);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_message_exit_container(message);
+        if (ret < 0)
+                return ret;
+
+        n_codes /= sizeof *codes;
+        n_signals /= sizeof *signals;
+        /* Two empty arrays reset the whole set, unless this is a no-op write */
+        if (n_codes == 0 && n_signals == 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) {
+                exit_status_set_free(status_set);
+                unit_write_settingf(u, flags, name, "%s=", name);
+                return 1;
+        }
+
+        for (size_t k = 0; k < n_codes; k++) {
+                if (codes[k] < 0 || codes[k] > 255)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid status code in %s: %"PRIi32, name, codes[k]);
+
+                if (UNIT_WRITE_FLAGS_NOOP(flags))
+                        continue;
+
+                ret = bitmap_set(&status_set->status, codes[k]);
+                if (ret < 0)
+                        return ret;
+
+                unit_write_settingf(u, flags, name, "%s=%"PRIi32, name, codes[k]);
+        }
+
+        for (size_t k = 0; k < n_signals; k++) {
+                const char *signame = signal_to_string((int) signals[k]);
+                if (!signame)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal in %s: %"PRIi32, name, signals[k]);
+
+                if (UNIT_WRITE_FLAGS_NOOP(flags))
+                        continue;
+
+                ret = bitmap_set(&status_set->signal, signals[k]);
+                if (ret < 0)
+                        return ret;
+
+                unit_write_settingf(u, flags, name, "%s=%s", name, signame);
+        }
+
+        return 1;
+}
+
+static int bus_set_transient_std_fd(
+                Unit *u,
+                const char *name,
+                int *p,
+                bool *b,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+        /* Reads a file descriptor ("h") from message; unless the write is a no-op, stores a duplicate in *p and sets *b. */
+        int fd, copy, ret;
+
+        assert(p);
+        assert(b);
+
+        ret = sd_bus_message_read(message, "h", &fd);
+        if (ret < 0)
+                return ret;
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        /* Duplicate with close-on-exec set, asking for a descriptor number of 3 or higher */
+        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+        if (copy < 0)
+                return -errno;
+
+        asynchronous_close(*p);
+        *p = copy;
+        *b = true;
+
+        return 1;
+}
+static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_exit_type, ServiceExitType, service_exit_type_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart_mode, ServiceRestartMode, service_restart_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, sd_bus_service_name_is_valid);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(timeout_failure_mode, ServiceTimeoutFailureMode, service_timeout_failure_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(reload_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+/* Judging by the calls in bus_service_set_transient_property(), each macro line above yields bus_set_transient_<first argument>(). */
+static int bus_service_set_transient_property(
+                Service *s,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Unit *u = UNIT(s);
+        ServiceExecCommand ci;
+        int r;
+
+        assert(s);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "PermissionsStartOnly"))
+                return bus_set_transient_bool(u, name, &s->permissions_start_only, message, flags, error);
+
+        if (streq(name, "RootDirectoryStartOnly"))
+                return bus_set_transient_bool(u, name, &s->root_directory_start_only, message, flags, error);
+
+        if (streq(name, "RemainAfterExit"))
+                return bus_set_transient_bool(u, name, &s->remain_after_exit, message, flags, error);
+
+        if (streq(name, "GuessMainPID"))
+                return bus_set_transient_bool(u, name, &s->guess_main_pid, message, flags, error);
+
+        if (streq(name, "Type"))
+                return bus_set_transient_service_type(u, name, &s->type, message, flags, error);
+
+        if (streq(name, "ExitType"))
+                return bus_set_transient_service_exit_type(u, name, &s->exit_type, message, flags, error);
+
+        if (streq(name, "OOMPolicy"))
+                return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
+        if (streq(name, "RestartUSec"))
+                return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error);
+
+        if (streq(name, "RestartSteps"))
+                return bus_set_transient_unsigned(u, name, &s->restart_steps, message, flags, error);
+
+        if (streq(name, "RestartMaxDelayUSec"))
+                return bus_set_transient_usec(u, name, &s->restart_max_delay_usec, message, flags, error);
+
+        if (streq(name, "TimeoutStartUSec")) {
+                r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, error);
+                if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+                        s->start_timeout_defined = true;
+
+                return r;
+        }
+
+        if (streq(name, "TimeoutStopUSec"))
+                return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error);
+
+        if (streq(name, "TimeoutAbortUSec")) {
+                r = bus_set_transient_usec(u, name, &s->timeout_abort_usec, message, flags, error);
+                if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+                        s->timeout_abort_set = true;
+                return r;
+        }
+
+        if (streq(name, "TimeoutStartFailureMode"))
+                return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_start_failure_mode, message, flags, error);
+
+        if (streq(name, "TimeoutStopFailureMode"))
+                return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_stop_failure_mode, message, flags, error);
+
+        if (streq(name, "RuntimeMaxUSec"))
+                return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error);
+
+        if (streq(name, "RuntimeRandomizedExtraUSec"))
+                return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error);
+
+        if (streq(name, "WatchdogUSec"))
+                return bus_set_transient_usec(u, name, &s->watchdog_usec, message, flags, error);
+
+        if (streq(name, "FileDescriptorStoreMax"))
+                return bus_set_transient_unsigned(u, name, &s->n_fd_store_max, message, flags, error);
+
+        if (streq(name, "FileDescriptorStorePreserve"))
+                return bus_set_transient_exec_preserve_mode(u, name, &s->fd_store_preserve_mode, message, flags, error);
+
+        if (streq(name, "NotifyAccess"))
+                return bus_set_transient_notify_access(u, name, &s->notify_access, message, flags, error);
+
+        if (streq(name, "PIDFile")) {
+                _cleanup_free_ char *n = NULL;
+                const char *v, *e;
+
+                r = sd_bus_message_read(message, "s", &v);
+                if (r < 0)
+                        return r;
+
+                if (!isempty(v)) {
+                        n = path_make_absolute(v, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+                        if (!n)
+                                return -ENOMEM;
+
+                        path_simplify(n);
+
+                        if (!path_is_normalized(n))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PIDFile= path '%s' is not valid", n);
+
+                        e = path_startswith(n, "/var/run/");
+                        if (e) {
+                                char *z;
+
+                                z = path_join("/run", e);
+                                if (!z)
+                                        return log_oom();
+
+                                if (!UNIT_WRITE_FLAGS_NOOP(flags))
+                                        log_unit_notice(u, "Transient unit's PIDFile= property references path below legacy directory /var/run, updating %s %s %s; please update client accordingly.",
+                                                        n, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), z);
+
+                                free_and_replace(n, z);
+                        }
+                }
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        free_and_replace(s->pid_file, n);
+                        unit_write_settingf(u, flags, name, "%s=%s", name, strempty(s->pid_file));
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "USBFunctionDescriptors"))
+                return bus_set_transient_path(u, name, &s->usb_function_descriptors, message, flags, error);
+
+        if (streq(name, "USBFunctionStrings"))
+                return bus_set_transient_path(u, name, &s->usb_function_strings, message, flags, error);
+
+        if (streq(name, "BusName"))
+                return bus_set_transient_bus_name(u, name, &s->bus_name, message, flags, error);
+
+        if (streq(name, "Restart"))
+                return bus_set_transient_service_restart(u, name, &s->restart, message, flags, error);
+
+        if (streq(name, "RestartMode"))
+                return bus_set_transient_service_restart_mode(u, name, &s->restart_mode, message, flags, error);
+
+        if (streq(name, "RestartPreventExitStatus"))
+                return bus_set_transient_exit_status(u, name, &s->restart_prevent_status, message, flags, error);
+
+        if (streq(name, "RestartForceExitStatus"))
+                return bus_set_transient_exit_status(u, name, &s->restart_force_status, message, flags, error);
+
+        if (streq(name, "SuccessExitStatus"))
+                return bus_set_transient_exit_status(u, name, &s->success_status, message, flags, error);
+
+        ci = service_exec_command_from_string(name);
+        if (ci < 0)
+                ci = service_exec_ex_command_from_string(name);
+        if (ci >= 0)
+                return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error);
+
+        if (streq(name, "StandardInputFileDescriptor"))
+                return bus_set_transient_std_fd(u, name, &s->stdin_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+        if (streq(name, "StandardOutputFileDescriptor"))
+                return bus_set_transient_std_fd(u, name, &s->stdout_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+        if (streq(name, "StandardErrorFileDescriptor"))
+                return bus_set_transient_std_fd(u, name, &s->stderr_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+        if (streq(name, "OpenFile")) {
+                const char *path, *fdname;
+                uint64_t offlags;
+
+                r = sd_bus_message_enter_container(message, 'a', "(sst)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(sst)", &path, &fdname, &offlags)) > 0) {
+                        _cleanup_(open_file_freep) OpenFile *of = NULL;
+                        _cleanup_free_ char *ofs = NULL;
+
+                        of = new(OpenFile, 1);
+                        if (!of)
+                                return -ENOMEM;
+
+                        *of = (OpenFile) {
+                                .path = strdup(path),
+                                .fdname = strdup(fdname),
+                                .flags = offlags,
+                        };
+
+                        if (!of->path || !of->fdname)
+                                return -ENOMEM;
+
+                        r = open_file_validate(of);
+                        if (r < 0)
+                                return r;
+
+                        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                                continue;
+
+                        r = open_file_to_string(of, &ofs);
+                        if (r < 0)
+                                return sd_bus_error_set_errnof(
+                                                error, r, "Failed to convert OpenFile= value to string: %m");
+
+                        LIST_APPEND(open_files, s->open_files, TAKE_PTR(of));
+                        unit_write_settingf(u, flags | UNIT_ESCAPE_SPECIFIERS, name, "OpenFile=%s", ofs);
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                return 1;
+        }
+
+        if (streq(name, "ReloadSignal"))
+                return bus_set_transient_reload_signal(u, name, &s->reload_signal, message, flags, error);
+
+        return 0;
+}
+
+/* Entry point for setting one D-Bus property on a service unit. The cgroup handler is consulted
+ * first; the service, exec-context and kill-context handlers only while the unit is transient and
+ * still in the UNIT_STUB load state. The first non-zero handler result is returned as-is, and 0 is
+ * returned when every consulted handler returned 0 (or the transient handlers were not consulted
+ * at all). */
+int bus_service_set_property(
+                Unit *u, const char *name, sd_bus_message *message,
+                UnitWriteFlags flags, sd_bus_error *error) {
+
+        Service *service = SERVICE(u);
+        int ret;
+
+        assert(service);
+        assert(name);
+        assert(message);
+
+        /* Cgroup properties are tried unconditionally, before any transient-only handling. */
+        ret = bus_cgroup_set_property(u, &service->cgroup_context, name, message, flags, error);
+        if (ret != 0)
+                return ret;
+
+        /* Everything below applies only to transient units that are still stubs. */
+        if (!u->transient || u->load_state != UNIT_STUB)
+                return 0;
+
+        ret = bus_service_set_transient_property(service, name, message, flags, error);
+        if (ret != 0)
+                return ret;
+
+        ret = bus_exec_context_set_transient_property(u, &service->exec_context, name, message, flags, error);
+        if (ret != 0)
+                return ret;
+
+        /* Last handler in the chain: whatever it returns, zero or not, is the final result. */
+        return bus_kill_context_set_transient_property(u, &service->kill_context, name, message, flags, error);
+}
+
+/* Commit hook for service units: realizes the unit's cgroup and then reports success. */
+int bus_service_commit_properties(Unit *u) {
+        assert(u);
+
+        (void) unit_realize_cgroup(u); /* any result is discarded; 0 is returned regardless */
+        return 0;
+}
diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h
new file mode 100644
index 0000000..aea6cf7
--- /dev/null
+++ b/src/core/dbus-service.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_service_vtable[];
+/* Property hooks and D-Bus method handlers for service units. */
+int bus_service_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_service_commit_properties(Unit *u);
+int bus_service_method_dump_file_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error);
diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c
new file mode 100644
index 0000000..de41d65
--- /dev/null
+++ b/src/core/dbus-slice.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-cgroup.h"
+#include "dbus-slice.h"
+#include "slice.h"
+#include "unit.h"
+
+const sd_bus_vtable bus_slice_vtable[] = { /* declares no entries between the start and end markers */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_VTABLE_END
+};
+
+/* Property setter for slice units. The request is handed straight to the cgroup property handler
+ * together with the slice's cgroup context, and that handler's result is returned unchanged. */
+int bus_slice_set_property(
+                Unit *u, const char *name, sd_bus_message *message,
+                UnitWriteFlags flags, sd_bus_error *error) {
+        Slice *slice = SLICE(u);
+        int ret;
+
+        assert(name);
+        assert(u);
+
+        ret = bus_cgroup_set_property(u, &slice->cgroup_context, name, message, flags, error);
+        return ret;
+}
+
+/* Commit hook for slice units: realizes the unit's cgroup; the hook itself always returns 0. */
+int bus_slice_commit_properties(Unit *u) {
+        assert(u);
+        (void) unit_realize_cgroup(u); /* result not propagated to the caller */
+
+        return 0;
+}
diff --git a/src/core/dbus-slice.h b/src/core/dbus-slice.h
new file mode 100644
index 0000000..eb71916
--- /dev/null
+++ b/src/core/dbus-slice.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_slice_vtable[];
+/* Slice property hooks, implemented in dbus-slice.c. */
+int bus_slice_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_slice_commit_properties(Unit *u);
diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c
new file mode 100644
index 0000000..e77e9e5
--- /dev/null
+++ b/src/core/dbus-socket.c
@@ -0,0 +1,470 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-socket.h"
+#include "dbus-util.h"
+#include "fd-util.h"
+#include "ip-protocol-list.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "socket.h"
+#include "socket-netlink.h"
+#include "socket-util.h"
+#include "string-util.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, socket_result, SocketResult); /* getter behind the "Result" property */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_bind_ipv6_only, socket_address_bind_ipv6_only, SocketAddressBindIPv6Only); /* getter behind "BindIPv6Only" */
+static BUS_DEFINE_PROPERTY_GET(property_get_fdname, "s", Socket, socket_fdname); /* getter behind "FileDescriptorName" */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timestamping, socket_timestamping, SocketTimestamping); /* getter behind "Timestamping" */
+
+/* D-Bus getter for the "Listen" property (signature "a(ss)"): appends one (type, address) string
+ * pair per entry of the socket's port list to the reply. Any negative return from the message or
+ * address helpers is passed back to the caller as-is. */
+static int property_get_listen(
+                sd_bus *bus, const char *path, const char *interface, const char *property,
+                sd_bus_message *reply, void *userdata, sd_bus_error *error) {
+
+        Socket *sock = SOCKET(userdata);
+        int ret;
+
+        assert(bus);
+        assert(reply);
+        assert(sock);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(ss)");
+        if (ret < 0)
+                return ret;
+
+        LIST_FOREACH(port, entry, sock->ports) {
+                /* Textual address of this port, filled in by socket_port_to_address(). */
+                _cleanup_free_ char *addr = NULL;
+
+                ret = socket_port_to_address(entry, &addr);
+                if (ret < 0)
+                        return ret;
+
+                ret = sd_bus_message_append(reply, "(ss)", socket_port_type_to_string(entry), addr);
+                if (ret < 0)
+                        return ret;
+        }
+
+        /* The result of closing the array is the getter's result. */
+        return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_socket_vtable[] = { /* D-Bus properties of socket units; names must match the setter in bus_socket_set_transient_property() */
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("BindIPv6Only", "s", property_get_bind_ipv6_only, offsetof(Socket, bind_ipv6_only), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Backlog", "u", bus_property_get_unsigned, offsetof(Socket, backlog), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Socket, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("BindToDevice", "s", NULL, offsetof(Socket, bind_to_device), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SocketUser", "s", NULL, offsetof(Socket, user), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SocketGroup", "s", NULL, offsetof(Socket, group), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SocketMode", "u", bus_property_get_mode, offsetof(Socket, socket_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Socket, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Accept", "b", bus_property_get_bool, offsetof(Socket, accept), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FlushPending", "b", bus_property_get_bool, offsetof(Socket, flush_pending), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Writable", "b", bus_property_get_bool, offsetof(Socket, writable), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KeepAlive", "b", bus_property_get_bool, offsetof(Socket, keep_alive), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KeepAliveTimeUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_time), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KeepAliveIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KeepAliveProbes", "u", bus_property_get_unsigned, offsetof(Socket, keep_alive_cnt), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DeferAcceptUSec", "t", bus_property_get_usec, offsetof(Socket, defer_accept), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NoDelay", "b", bus_property_get_bool, offsetof(Socket, no_delay), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Priority", "i", bus_property_get_int, offsetof(Socket, priority), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReceiveBuffer", "t", bus_property_get_size, offsetof(Socket, receive_buffer), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SendBuffer", "t", bus_property_get_size, offsetof(Socket, send_buffer), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IPTOS", "i", bus_property_get_int, offsetof(Socket, ip_tos), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IPTTL", "i", bus_property_get_int, offsetof(Socket, ip_ttl), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PipeSize", "t", bus_property_get_size, offsetof(Socket, pipe_size), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FreeBind", "b", bus_property_get_bool, offsetof(Socket, free_bind), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RemoveOnStop", "b", bus_property_get_bool, offsetof(Socket, remove_on_stop), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Listen", "a(ss)", property_get_listen, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Symlinks", "as", NULL, offsetof(Socket, symlinks), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Mark", "i", bus_property_get_int, offsetof(Socket, mark), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MaxConnections", "u", bus_property_get_unsigned, offsetof(Socket, max_connections), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MaxConnectionsPerSource", "u", bus_property_get_unsigned, offsetof(Socket, max_connections_per_source), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MessageQueueMaxMessages", "x", bus_property_get_long, offsetof(Socket, mq_maxmsg), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MessageQueueMessageSize", "x", bus_property_get_long, offsetof(Socket, mq_msgsize), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TCPCongestion", "s", NULL, offsetof(Socket, tcp_congestion), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReusePort", "b", bus_property_get_bool, offsetof(Socket, reuse_port), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SmackLabel", "s", NULL, offsetof(Socket, smack), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SmackLabelIPIn", "s", NULL, offsetof(Socket, smack_ip_in), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SmackLabelIPOut", "s", NULL, offsetof(Socket, smack_ip_out), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Socket, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), /* NOTE(review): names the pid member explicitly, as the Swap vtable does; assumes control_pid is a PidRef -- confirm */
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Socket, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("NConnections", "u", bus_property_get_unsigned, offsetof(Socket, n_connections), 0),
+        SD_BUS_PROPERTY("NAccepted", "u", bus_property_get_unsigned, offsetof(Socket, n_accepted), 0),
+        SD_BUS_PROPERTY("NRefused", "u", bus_property_get_unsigned, offsetof(Socket, n_refused), 0),
+        SD_BUS_PROPERTY("FileDescriptorName", "s", property_get_fdname, 0, 0),
+        SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PollLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, poll_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PollLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, poll_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Socket, exec_command[SOCKET_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Socket, exec_command[SOCKET_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPre", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_VTABLE_END
+};
+
+static bool check_size_t_truncation(uint64_t t) {
+        return t == (uint64_t) (size_t) t; /* true iff t is unchanged by a round-trip through size_t */
+}
+
+/* Maps IPPROTO_IP to "", UDPLITE/SCTP to their ip_protocol_to_name() string, anything else to NULL. */
+static const char* socket_protocol_to_string(int32_t i) {
+        if (i == IPPROTO_IP)
+                return "";
+        if (i == IPPROTO_UDPLITE || i == IPPROTO_SCTP)
+                return ip_protocol_to_name(i);
+
+        return NULL;
+}
+
+static BUS_DEFINE_SET_TRANSIENT(int, "i", int32_t, int, "%" PRIi32); /* presumably expands to bus_set_transient_int(), used below for Priority, IPTTL and Mark */
+static BUS_DEFINE_SET_TRANSIENT(message_queue, "x", int64_t, long, "%" PRIi64); /* used below for MessageQueueMaxMessages and MessageQueueMessageSize */
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(size_t_check_truncation, "t", uint64_t, size_t, "%" PRIu64, check_size_t_truncation); /* used below for ReceiveBuffer, SendBuffer and PipeSize */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(bind_ipv6_only, SocketAddressBindIPv6Only, socket_address_bind_ipv6_only_or_bool_from_string); /* used below for BindIPv6Only */
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(fdname, fdname_is_valid); /* used below for FileDescriptorName */
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(ifname, ifname_valid); /* used below for BindToDevice */
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(ip_tos, "i", int32_t, int, "%" PRIi32, ip_tos_to_string_alloc); /* used below for IPTOS */
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(socket_protocol, "i", int32_t, int, "%" PRIi32, socket_protocol_to_string); /* used below for SocketProtocol */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(socket_timestamping, SocketTimestamping, socket_timestamping_from_string_harder); /* used below for Timestamping */
+
+/* Handles the socket-specific properties that may be set on a transient socket unit. Returns the
+ * result of the helper that handled the property, 1 after Symlinks or Listen were processed, 0 if
+ * the property name is not handled here, and a negative value on error. UNIT_PRIVATE is always
+ * added to the write flags. Names are matched with streq(), i.e. case-sensitively, so they have to
+ * agree exactly with the spelling used in bus_socket_vtable. */
+static int bus_socket_set_transient_property(Socket *s, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+
+        SocketExecCommand ci;
+        Unit *u = UNIT(s);
+        int r;
+
+        assert(s);
+        assert(name);
+        assert(message);
+
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "Accept"))
+                return bus_set_transient_bool(u, name, &s->accept, message, flags, error);
+
+        if (streq(name, "FlushPending"))
+                return bus_set_transient_bool(u, name, &s->flush_pending, message, flags, error);
+
+        if (streq(name, "Writable"))
+                return bus_set_transient_bool(u, name, &s->writable, message, flags, error);
+
+        if (streq(name, "KeepAlive"))
+                return bus_set_transient_bool(u, name, &s->keep_alive, message, flags, error);
+
+        if (streq(name, "NoDelay"))
+                return bus_set_transient_bool(u, name, &s->no_delay, message, flags, error);
+
+        if (streq(name, "FreeBind"))
+                return bus_set_transient_bool(u, name, &s->free_bind, message, flags, error);
+
+        if (streq(name, "Transparent"))
+                return bus_set_transient_bool(u, name, &s->transparent, message, flags, error);
+
+        if (streq(name, "Broadcast"))
+                return bus_set_transient_bool(u, name, &s->broadcast, message, flags, error);
+
+        if (streq(name, "PassCredentials"))
+                return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error);
+
+        if (streq(name, "PassSecurity"))
+                return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error);
+
+        if (streq(name, "PassPacketInfo"))
+                return bus_set_transient_bool(u, name, &s->pass_pktinfo, message, flags, error);
+
+        if (streq(name, "Timestamping"))
+                return bus_set_transient_socket_timestamping(u, name, &s->timestamping, message, flags, error);
+
+        if (streq(name, "ReusePort"))
+                return bus_set_transient_bool(u, name, &s->reuse_port, message, flags, error);
+
+        if (streq(name, "RemoveOnStop"))
+                return bus_set_transient_bool(u, name, &s->remove_on_stop, message, flags, error);
+
+        if (streq(name, "SELinuxContextFromNet"))
+                return bus_set_transient_bool(u, name, &s->selinux_context_from_net, message, flags, error);
+
+        if (streq(name, "Priority"))
+                return bus_set_transient_int(u, name, &s->priority, message, flags, error);
+
+        if (streq(name, "IPTTL"))
+                return bus_set_transient_int(u, name, &s->ip_ttl, message, flags, error);
+
+        if (streq(name, "Mark"))
+                return bus_set_transient_int(u, name, &s->mark, message, flags, error);
+
+        if (streq(name, "Backlog"))
+                return bus_set_transient_unsigned(u, name, &s->backlog, message, flags, error);
+
+        if (streq(name, "MaxConnections"))
+                return bus_set_transient_unsigned(u, name, &s->max_connections, message, flags, error);
+
+        if (streq(name, "MaxConnectionsPerSource"))
+                return bus_set_transient_unsigned(u, name, &s->max_connections_per_source, message, flags, error);
+
+        if (streq(name, "KeepAliveProbes"))
+                return bus_set_transient_unsigned(u, name, &s->keep_alive_cnt, message, flags, error);
+
+        if (streq(name, "TriggerLimitBurst"))
+                return bus_set_transient_unsigned(u, name, &s->trigger_limit.burst, message, flags, error);
+
+        if (streq(name, "PollLimitBurst"))
+                return bus_set_transient_unsigned(u, name, &s->poll_limit_burst, message, flags, error);
+
+        if (streq(name, "SocketMode"))
+                return bus_set_transient_mode_t(u, name, &s->socket_mode, message, flags, error);
+
+        if (streq(name, "DirectoryMode"))
+                return bus_set_transient_mode_t(u, name, &s->directory_mode, message, flags, error);
+
+        if (streq(name, "MessageQueueMaxMessages"))
+                return bus_set_transient_message_queue(u, name, &s->mq_maxmsg, message, flags, error);
+
+        if (streq(name, "MessageQueueMessageSize"))
+                return bus_set_transient_message_queue(u, name, &s->mq_msgsize, message, flags, error);
+
+        if (streq(name, "TimeoutUSec"))
+                return bus_set_transient_usec_fix_0(u, name, &s->timeout_usec, message, flags, error);
+
+        if (streq(name, "KeepAliveTimeUSec"))
+                return bus_set_transient_usec(u, name, &s->keep_alive_time, message, flags, error);
+
+        if (streq(name, "KeepAliveIntervalUSec"))
+                return bus_set_transient_usec(u, name, &s->keep_alive_interval, message, flags, error);
+
+        if (streq(name, "DeferAcceptUSec"))
+                return bus_set_transient_usec(u, name, &s->defer_accept, message, flags, error);
+
+        if (streq(name, "TriggerLimitIntervalUSec"))
+                return bus_set_transient_usec(u, name, &s->trigger_limit.interval, message, flags, error);
+
+        if (streq(name, "PollLimitIntervalUSec"))
+                return bus_set_transient_usec(u, name, &s->poll_limit_interval, message, flags, error);
+
+        if (streq(name, "SmackLabel"))
+                return bus_set_transient_string(u, name, &s->smack, message, flags, error);
+        /* Must be spelled exactly like the "SmackLabelIPIn" vtable property: streq() is case-sensitive. */
+        if (streq(name, "SmackLabelIPIn"))
+                return bus_set_transient_string(u, name, &s->smack_ip_in, message, flags, error);
+
+        if (streq(name, "SmackLabelIPOut"))
+                return bus_set_transient_string(u, name, &s->smack_ip_out, message, flags, error);
+
+        if (streq(name, "TCPCongestion"))
+                return bus_set_transient_string(u, name, &s->tcp_congestion, message, flags, error);
+
+        if (streq(name, "FileDescriptorName"))
+                return bus_set_transient_fdname(u, name, &s->fdname, message, flags, error);
+
+        if (streq(name, "SocketUser"))
+                return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error);
+
+        if (streq(name, "SocketGroup"))
+                return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error);
+
+        if (streq(name, "BindIPv6Only"))
+                return bus_set_transient_bind_ipv6_only(u, name, &s->bind_ipv6_only, message, flags, error);
+
+        if (streq(name, "ReceiveBuffer"))
+                return bus_set_transient_size_t_check_truncation(u, name, &s->receive_buffer, message, flags, error);
+
+        if (streq(name, "SendBuffer"))
+                return bus_set_transient_size_t_check_truncation(u, name, &s->send_buffer, message, flags, error);
+
+        if (streq(name, "PipeSize"))
+                return bus_set_transient_size_t_check_truncation(u, name, &s->pipe_size, message, flags, error);
+
+        if (streq(name, "BindToDevice"))
+                return bus_set_transient_ifname(u, name, &s->bind_to_device, message, flags, error);
+
+        if (streq(name, "IPTOS"))
+                return bus_set_transient_ip_tos(u, name, &s->ip_tos, message, flags, error);
+
+        if (streq(name, "SocketProtocol"))
+                return bus_set_transient_socket_protocol(u, name, &s->socket_protocol, message, flags, error);
+
+        ci = socket_exec_command_from_string(name);
+        if (ci >= 0)
+                return bus_set_transient_exec_command(u, name,
+                                                      &s->exec_command[ci],
+                                                      message, flags, error);
+
+        if (streq(name, "Symlinks")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l)
+                        if (!path_is_absolute(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Symlink path is not absolute: %s", *p);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                s->symlinks = strv_free(s->symlinks);
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=", name);
+                        } else {
+                                _cleanup_free_ char *joined = NULL;
+
+                                r = strv_extend_strv(&s->symlinks, l, true);
+                                if (r < 0)
+                                        return -ENOMEM;
+
+                                joined = strv_join(l, " ");
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "Listen")) {
+                const char *t, *a;
+                bool empty = true;
+
+                r = sd_bus_message_enter_container(message, 'a', "(ss)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(ss)", &t, &a)) > 0) {
+                        _cleanup_(socket_port_freep) SocketPort *p = NULL;
+
+                        p = new(SocketPort, 1);
+                        if (!p)
+                                return log_oom();
+
+                        *p = (SocketPort) {
+                                .fd = -EBADF,
+                                .socket = s,
+                        };
+
+                        p->type = socket_port_type_from_string(t);
+                        if (p->type < 0)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown Socket type: %s", t);
+
+                        if (p->type != SOCKET_SOCKET) {
+                                if (!path_is_absolute(a) || !path_is_valid(a))
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid socket path: %s", a);
+
+                                r = path_simplify_alloc(a, &p->path);
+                                if (r < 0)
+                                        return r;
+
+                        } else if (streq(t, "Netlink")) {
+                                r = socket_address_parse_netlink(&p->address, a);
+                                if (r < 0)
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid netlink address: %s", a);
+
+                        } else {
+                                r = socket_address_parse(&p->address, a);
+                                if (r < 0)
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address: %s", a);
+
+                                p->address.type = socket_address_type_from_string(t);
+                                if (p->address.type < 0)
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address type: %s", t);
+
+                                if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET)
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Address family not supported: %s", a);
+                        }
+
+                        empty = false;
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                LIST_APPEND(port, s->ports, TAKE_PTR(p));
+                                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Listen%s=%s", t, a);
+                        }
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+                        socket_free_ports(s);
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ListenStream=");
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Entry point for setting one D-Bus property on a socket unit. Cgroup properties are tried first;
+ * the socket, exec-context and kill-context handlers are only consulted for transient units still
+ * in the UNIT_STUB load state. Returns the first non-zero handler result, or 0 if every handler
+ * that was consulted returned 0. */
+int bus_socket_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(s);
+        assert(name);
+        assert(message);
+
+        r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+        if (r != 0)
+                return r;
+
+        if (u->transient && u->load_state == UNIT_STUB) {
+                /* This is a transient unit, let's allow a little more */
+
+                r = bus_socket_set_transient_property(s, name, message, flags, error);
+                if (r != 0)
+                        return r;
+
+                r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error);
+                if (r != 0)
+                        return r;
+
+                r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+                if (r != 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Commit hook for socket units: realizes the unit's cgroup; always returns 0. */
+int bus_socket_commit_properties(Unit *u) {
+        assert(u);
+
+        (void) unit_realize_cgroup(u); /* outcome is not reported back to the caller */
+        return 0;
+}
diff --git a/src/core/dbus-socket.h b/src/core/dbus-socket.h
new file mode 100644
index 0000000..f9f36a2
--- /dev/null
+++ b/src/core/dbus-socket.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_socket_vtable[];
+/* Socket property hooks, implemented in dbus-socket.c. */
+int bus_socket_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_socket_commit_properties(Unit *u);
diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c
new file mode 100644
index 0000000..7230352
--- /dev/null
+++ b/src/core/dbus-swap.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+  Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-swap.h"
+#include "string-util.h"
+#include "swap.h"
+#include "unit.h"
+
+/* Getter callbacks referenced by bus_swap_vtable below, generated through the BUS_DEFINE_PROPERTY_GET*
+ * helper macros. NOTE(review): the macro expansions are not visible here — presumably each getter
+ * appends the value of the named accessor/field using the given D-Bus type; confirm. */
+static BUS_DEFINE_PROPERTY_GET(property_get_priority, "i", Swap, swap_get_priority);
+static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Swap, swap_get_options);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, swap_result, SwapResult);
+
+/* D-Bus vtable for swap units. Every entry is declared via SD_BUS_PROPERTY or BUS_EXEC_COMMAND_VTABLE;
+ * TimeoutUSec is flagged constant, the two exec command entries emit invalidation only, and all other
+ * properties emit change signals.
+ *
+ * NOTE(review): UID and GID use offsets into Unit rather than Swap — presumably this relies on the Unit
+ * being the leading member of Swap; confirm against the struct definition. */
+const sd_bus_vtable bus_swap_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("What", "s", NULL, offsetof(Swap, what), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Priority", "i", property_get_priority, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_VTABLE_END
+};
+
+/* Handles a bus property assignment for a swap unit. Only the cgroup context setter is consulted;
+ * whatever bus_cgroup_set_property() returns is passed back unchanged. */
+int bus_swap_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Swap *s = SWAP(u);
+
+        assert(s);
+        assert(name);
+        assert(message);
+
+        return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+}
+
+/* Commit hook for swap units: calls unit_realize_cgroup() on the unit and always returns 0. */
+int bus_swap_commit_properties(Unit *u) {
+        assert(u);
+
+        unit_realize_cgroup(u);
+
+        return 0;
+}
diff --git a/src/core/dbus-swap.h b/src/core/dbus-swap.h
new file mode 100644
index 0000000..9d651b5
--- /dev/null
+++ b/src/core/dbus-swap.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+/* D-Bus vtable and the property set/commit entry points for swap units. */
+extern const sd_bus_vtable bus_swap_vtable[];
+
+int bus_swap_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_swap_commit_properties(Unit *u);
diff --git a/src/core/dbus-target.c b/src/core/dbus-target.c
new file mode 100644
index 0000000..e979fb7
--- /dev/null
+++ b/src/core/dbus-target.c
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-target.h"
+#include "unit.h"
+
+/* D-Bus vtable for target units. It contains nothing but the start and end markers, i.e. no
+ * target-specific properties or methods are declared here. */
+const sd_bus_vtable bus_target_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_VTABLE_END
+};
diff --git a/src/core/dbus-target.h b/src/core/dbus-target.h
new file mode 100644
index 0000000..fedd4a9
--- /dev/null
+++ b/src/core/dbus-target.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+/* D-Bus vtable for target units. */
+extern const sd_bus_vtable bus_target_vtable[];
diff --git a/src/core/dbus-timer.c b/src/core/dbus-timer.c
new file mode 100644
index 0000000..4f78a52
--- /dev/null
+++ b/src/core/dbus-timer.c
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-timer.h"
+#include "dbus-util.h"
+#include "strv.h"
+#include "timer.h"
+#include "unit.h"
+
+/* Getter used for the "Result" entry of bus_timer_vtable below. NOTE(review): generated by a macro whose
+ * expansion is not visible here — presumably it maps the TimerResult value to its string form; confirm. */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, timer_result, TimerResult);
+
+/* Property getter producing an array of "(stt)" structs, one per non-calendar entry in the timer's value
+ * list. Each struct carries the string returned by timer_base_to_usec_string() for the entry's base,
+ * followed by the entry's value and next_elapse fields.
+ *
+ * Returns the result of closing the array container, -ENOMEM if the base string cannot be obtained, or
+ * the negative result of a failing sd-bus call. */
+static int property_get_monotonic_timers(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Timer *timer = ASSERT_PTR(userdata);
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(stt)");
+        if (ret < 0)
+                return ret;
+
+        LIST_FOREACH(value, tv, timer->values) {
+                /* Calendar entries are not part of this property */
+                if (tv->base != TIMER_CALENDAR) {
+                        _cleanup_free_ char *base_usec = timer_base_to_usec_string(tv->base);
+                        if (!base_usec)
+                                return -ENOMEM;
+
+                        ret = sd_bus_message_append(reply, "(stt)", base_usec, tv->value, tv->next_elapse);
+                        if (ret < 0)
+                                return ret;
+                }
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter producing an array of "(sst)" structs, one per TIMER_CALENDAR entry in the timer's
+ * value list. Each struct carries the base name, the calendar spec rendered by calendar_spec_to_string()
+ * and the entry's next_elapse field.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_calendar_timers(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Timer *timer = ASSERT_PTR(userdata);
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "(sst)");
+        if (ret < 0)
+                return ret;
+
+        LIST_FOREACH(value, tv, timer->values) {
+                /* Only calendar entries are part of this property */
+                if (tv->base == TIMER_CALENDAR) {
+                        _cleanup_free_ char *spec = NULL;
+
+                        ret = calendar_spec_to_string(tv->calendar_spec, &spec);
+                        if (ret < 0)
+                                return ret;
+
+                        ret = sd_bus_message_append(reply, "(sst)", timer_base_to_string(tv->base), spec, tv->next_elapse);
+                        if (ret < 0)
+                                return ret;
+                }
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter that appends the value returned by timer_next_elapse_monotonic() for this timer to the
+ * reply, using D-Bus type "t". Returns the result of sd_bus_message_append(). */
+static int property_get_next_elapse_monotonic(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Timer *t = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(reply);
+
+        return sd_bus_message_append(reply, "t", timer_next_elapse_monotonic(t));
+}
+
+/* D-Bus vtable for timer units. TimersMonotonic and TimersCalendar are served by the custom getters above
+ * and emit invalidation only; the NextElapse*, LastTriggerUSec and Result entries emit change signals;
+ * the remaining entries are flagged constant. */
+const sd_bus_vtable bus_timer_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TimersMonotonic", "a(stt)", property_get_monotonic_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_PROPERTY("TimersCalendar", "a(sst)", property_get_calendar_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_PROPERTY("OnClockChange", "b", bus_property_get_bool, offsetof(Timer, on_clock_change), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OnTimezoneChange", "b", bus_property_get_bool, offsetof(Timer, on_timezone_change), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NextElapseUSecRealtime", "t", bus_property_get_usec, offsetof(Timer, next_elapse_realtime), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("NextElapseUSecMonotonic", "t", property_get_next_elapse_monotonic, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("LastTriggerUSec", offsetof(Timer, last_trigger), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Timer, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("AccuracyUSec", "t", bus_property_get_usec, offsetof(Timer, accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Timer, random_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FixedRandomDelay", "b", bus_property_get_bool, offsetof(Timer, fixed_random_delay), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Persistent", "b", bus_property_get_bool, offsetof(Timer, persistent), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WakeSystem", "b", bus_property_get_bool, offsetof(Timer, wake_system), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RemainAfterElapse", "b", bus_property_get_bool, offsetof(Timer, remain_after_elapse), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_VTABLE_END
+};
+
+/* Records one monotonic timer specification on a timer unit.
+ *
+ * Unless the write flags denote a no-op, a "<base>=<timespan>" setting is written for the unit under the
+ * given name and a new TimerValue with the given base and value is prepended to t->values.
+ *
+ * The TimerValue is allocated before the setting is written, so that an allocation failure cannot leave a
+ * written setting without a matching in-memory entry.
+ *
+ * The error parameter is not used by this function.
+ *
+ * Returns 1 on success (including the no-op case), -ENOMEM if the allocation fails. */
+static int timer_add_one_monotonic_spec(
+                Timer *t,
+                const char *name,
+                TimerBase base,
+                UnitWriteFlags flags,
+                usec_t usec,
+                sd_bus_error *error) {
+
+        TimerValue *v;
+
+        /* Nothing to persist or record in no-op mode */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        v = new(TimerValue, 1);
+        if (!v)
+                return -ENOMEM;
+
+        *v = (TimerValue) {
+                .base = base,
+                .value = usec,
+        };
+
+        unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name,
+                            "%s=%s",
+                            timer_base_to_string(base),
+                            FORMAT_TIMESPAN(usec, USEC_PER_MSEC));
+
+        LIST_PREPEND(value, t->values, v);
+
+        return 1;
+}
+
+/* Records one calendar timer specification on a timer unit.
+ *
+ * The string is always parsed with calendar_spec_from_string(), even when the write flags denote a no-op,
+ * so invalid specs are rejected in either mode. Unless in no-op mode, a "<base>=<str>" setting is written
+ * for the unit under the given name and a new TimerValue owning the parsed spec is prepended to t->values.
+ *
+ * The TimerValue is allocated before the setting is written, so that an allocation failure cannot leave a
+ * written setting without a matching in-memory entry.
+ *
+ * Returns 1 on success (including the no-op case), the result of setting SD_BUS_ERROR_INVALID_ARGS on the
+ * bus error if parsing fails with -EINVAL, any other negative parse result as-is, or -ENOMEM if the
+ * allocation fails. */
+static int timer_add_one_calendar_spec(
+                Timer *t,
+                const char *name,
+                TimerBase base,
+                UnitWriteFlags flags,
+                const char *str,
+                sd_bus_error *error) {
+
+        _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL;
+        TimerValue *v;
+        int r;
+
+        r = calendar_spec_from_string(str, &c);
+        if (r == -EINVAL)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec");
+        if (r < 0)
+                return r;
+
+        /* Validation only: the parsed spec is released by the cleanup handler */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        v = new(TimerValue, 1);
+        if (!v)
+                return -ENOMEM;
+
+        *v = (TimerValue) {
+                .base = base,
+                .calendar_spec = TAKE_PTR(c),
+        };
+
+        unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name,
+                            "%s=%s", timer_base_to_string(base), str);
+
+        LIST_PREPEND(value, t->values, v);
+
+        return 1;
+}
+
+/* Applies one transient property assignment to a timer unit, dispatching on the property name.
+ *
+ * Scalar properties are forwarded to the generic bus_set_transient_usec()/bus_set_transient_bool()
+ * helpers, whose result is returned unchanged. TimersMonotonic and TimersCalendar are read as arrays and
+ * added entry by entry; the obsolete single-value names (AccuracySec, On*Sec, OnCalendar) are still
+ * accepted but logged at notice level.
+ *
+ * Returns 0 if the name is not handled here, a negative value on failure, and otherwise the result of the
+ * helper that handled the property (1 for the array properties). */
+static int bus_timer_set_transient_property(
+                Timer *t,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Unit *u = UNIT(t);
+        int r;
+
+        assert(t);
+        assert(name);
+        assert(message);
+
+        /* Every write issued below carries UNIT_PRIVATE */
+        flags |= UNIT_PRIVATE;
+
+        if (streq(name, "AccuracyUSec"))
+                return bus_set_transient_usec(u, name, &t->accuracy_usec, message, flags, error);
+
+        if (streq(name, "AccuracySec")) {
+                /* Obsolete alias: handled like AccuracyUSec, and passed on under that name */
+                log_notice("Client is using obsolete AccuracySec= transient property, please use AccuracyUSec= instead.");
+                return bus_set_transient_usec(u, "AccuracyUSec", &t->accuracy_usec, message, flags, error);
+        }
+
+        if (streq(name, "RandomizedDelayUSec"))
+                return bus_set_transient_usec(u, name, &t->random_usec, message, flags, error);
+
+        if (streq(name, "FixedRandomDelay"))
+                return bus_set_transient_bool(u, name, &t->fixed_random_delay, message, flags, error);
+
+        if (streq(name, "WakeSystem"))
+                return bus_set_transient_bool(u, name, &t->wake_system, message, flags, error);
+
+        if (streq(name, "Persistent"))
+                return bus_set_transient_bool(u, name, &t->persistent, message, flags, error);
+
+        if (streq(name, "RemainAfterElapse"))
+                return bus_set_transient_bool(u, name, &t->remain_after_elapse, message, flags, error);
+
+        if (streq(name, "OnTimezoneChange"))
+                return bus_set_transient_bool(u, name, &t->on_timezone_change, message, flags, error);
+
+        if (streq(name, "OnClockChange"))
+                return bus_set_transient_bool(u, name, &t->on_clock_change, message, flags, error);
+
+        if (streq(name, "TimersMonotonic")) {
+                const char *base_name;
+                usec_t usec;
+                bool empty = true;
+
+                /* Note that the value is read as "a(st)", while the getter of the same property in
+                 * bus_timer_vtable reports "a(stt)" */
+                r = sd_bus_message_enter_container(message, 'a', "(st)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(st)", &base_name, &usec)) > 0) {
+                        TimerBase b;
+
+                        /* Unknown bases and the calendar base are both refused for this property */
+                        b = timer_base_from_string(base_name);
+                        if (b < 0 || b == TIMER_CALENDAR)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                         "Invalid timer base: %s", base_name);
+
+                        r = timer_add_one_monotonic_spec(t, name, b, flags, usec, error);
+                        if (r < 0)
+                                return r;
+
+                        empty = false;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                /* An empty array acts as a reset: timer_free_values() is called on the timer and an
+                 * empty "OnActiveSec=" assignment is written */
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+                        timer_free_values(t);
+                        unit_write_setting(u, flags, name, "OnActiveSec=");
+                }
+
+                return 1;
+
+        } else if (streq(name, "TimersCalendar")) {
+                const char *base_name, *str;
+                bool empty = true;
+
+                /* As above, the value is read as "a(ss)" while the getter reports "a(sst)" */
+                r = sd_bus_message_enter_container(message, 'a', "(ss)");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "(ss)", &base_name, &str)) > 0) {
+                        TimerBase b;
+
+                        /* Only the calendar base is accepted; this also covers a failed lookup */
+                        b = timer_base_from_string(base_name);
+                        if (b != TIMER_CALENDAR)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                         "Invalid timer base: %s", base_name);
+
+                        r = timer_add_one_calendar_spec(t, name, b, flags, str, error);
+                        if (r < 0)
+                                return r;
+
+                        empty = false;
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                /* An empty array acts as a reset: timer_free_values() is called on the timer and an
+                 * empty "OnCalendar=" assignment is written */
+                if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+                        timer_free_values(t);
+                        unit_write_setting(u, flags, name, "OnCalendar=");
+                }
+
+                return 1;
+
+        } else if (STR_IN_SET(name,
+                       "OnActiveSec",
+                       "OnBootSec",
+                       "OnStartupSec",
+                       "OnUnitActiveSec",
+                       "OnUnitInactiveSec")) {
+
+                TimerBase b;
+                usec_t usec;
+
+                log_notice("Client is using obsolete %s= transient property, please use TimersMonotonic= instead.", name);
+
+                /* For these obsolete properties the property name itself is the timer base name */
+                b = timer_base_from_string(name);
+                if (b < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown timer base %s", name);
+
+                r = sd_bus_message_read(message, "t", &usec);
+                if (r < 0)
+                        return r;
+
+                return timer_add_one_monotonic_spec(t, name, b, flags, usec, error);
+
+        } else if (streq(name, "OnCalendar")) {
+
+                const char *str;
+
+                log_notice("Client is using obsolete %s= transient property, please use TimersCalendar= instead.", name);
+
+                r = sd_bus_message_read(message, "s", &str);
+                if (r < 0)
+                        return r;
+
+                return timer_add_one_calendar_spec(t, name, TIMER_CALENDAR, flags, str, error);
+        }
+
+        /* Not a property handled here */
+        return 0;
+}
+
+/* Bus property setter entry point for timer units.
+ *
+ * Properties are only accepted while the unit is transient and still in UNIT_STUB load state; in that
+ * case the result of bus_timer_set_transient_property() is returned. In all other cases 0 is returned
+ * without looking at the message. */
+int bus_timer_set_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        Timer *t = TIMER(u);
+
+        assert(t);
+        assert(name);
+        assert(message);
+
+        if (u->transient && u->load_state == UNIT_STUB)
+                return bus_timer_set_transient_property(t, name, message, flags, error);
+
+        return 0;
+}
diff --git a/src/core/dbus-timer.h b/src/core/dbus-timer.h
new file mode 100644
index 0000000..ac436f1
--- /dev/null
+++ b/src/core/dbus-timer.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+/* D-Bus vtable and the property set entry point for timer units. */
+extern const sd_bus_vtable bus_timer_vtable[];
+
+int bus_timer_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c
new file mode 100644
index 0000000..1a037b7
--- /dev/null
+++ b/src/core/dbus-unit.c
@@ -0,0 +1,2629 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-polkit.h"
+#include "cgroup-util.h"
+#include "condition.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "install.h"
+#include "locale-util.h"
+#include "log.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "service.h"
+#include "signal-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "web-util.h"
+
+/* Simple property getters for unit objects, generated through the BUS_DEFINE_PROPERTY_GET* helper macros.
+ * NOTE(review): the macro expansions are not visible here — presumably each getter appends the result of
+ * the named accessor (converted by the optional to-string function) using the given D-Bus type; confirm. */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_collect_mode, collect_mode, CollectMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_load_state, unit_load_state, UnitLoadState);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_job_mode, job_mode, JobMode);
+static BUS_DEFINE_PROPERTY_GET(property_get_description, "s", Unit, unit_description);
+static BUS_DEFINE_PROPERTY_GET2(property_get_active_state, "s", Unit, unit_active_state, unit_active_state_to_string);
+static BUS_DEFINE_PROPERTY_GET2(property_get_freezer_state, "s", Unit, unit_freezer_state, freezer_state_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_sub_state, "s", Unit, unit_sub_state_to_string);
+static BUS_DEFINE_PROPERTY_GET2(property_get_unit_file_state, "s", Unit, unit_get_unit_file_state, unit_file_state_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_reload, "b", Unit, unit_can_reload);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_start, "b", Unit, unit_can_start_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_stop, "b", Unit, unit_can_stop_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_isolate, "b", Unit, unit_can_isolate_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_freeze, "b", Unit, unit_can_freeze);
+static BUS_DEFINE_PROPERTY_GET(property_get_need_daemon_reload, "b", Unit, unit_need_daemon_reload);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_strv, "as", 0);
+
+/* Property getter producing an array of strings that names what unit_can_clean() reports as cleanable.
+ *
+ * For every ExecDirectoryType t whose bit (1U << t) is set in the returned mask, the string from
+ * exec_resource_type_to_string(t) is appended; "fdstore" is appended if EXEC_CLEAN_FDSTORE is set.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_can_clean(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Assert on userdata like the other getters in this file do, instead of handing a potentially
+         * NULL unit to unit_can_clean() */
+        Unit *u = ASSERT_PTR(userdata);
+        ExecCleanMask mask;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = unit_can_clean(u, &mask);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "s");
+        if (r < 0)
+                return r;
+
+        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+                if (!FLAGS_SET(mask, 1U << t))
+                        continue;
+
+                r = sd_bus_message_append(reply, "s", exec_resource_type_to_string(t));
+                if (r < 0)
+                        return r;
+        }
+
+        if (FLAGS_SET(mask, EXEC_CLEAN_FDSTORE)) {
+                r = sd_bus_message_append(reply, "s", "fdstore");
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter producing an array of strings: the unit's id first, followed by every entry of its
+ * aliases set.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_names(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = ASSERT_PTR(userdata);
+        const char *alias;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "s");
+        if (ret < 0)
+                return ret;
+
+        /* The primary name always goes first */
+        ret = sd_bus_message_append(reply, "s", unit->id);
+        if (ret < 0)
+                return ret;
+
+        SET_FOREACH(alias, unit->aliases) {
+                ret = sd_bus_message_append(reply, "s", alias);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter that appends, as a string, the id of the unit returned by unit_following(), or NULL if
+ * unit_following() returns no unit. Returns the result of sd_bus_message_append(). */
+static int property_get_following(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = userdata, *followed;
+        const char *followed_id = NULL;
+
+        assert(bus);
+        assert(reply);
+        assert(unit);
+
+        followed = unit_following(unit);
+        if (followed)
+                followed_id = followed->id;
+
+        return sd_bus_message_append(reply, "s", followed_id);
+}
+
+/* Property getter shared by the dependency properties. The property name itself is converted to a
+ * UnitDependency (the conversion is asserted to succeed), and the ids of all units that are keys in the
+ * matching dependency hashmap are returned as an array of strings.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_dependencies(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = userdata, *dep_unit;
+        UnitDependency dep_type;
+        Hashmap *table;
+        void *value;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+        assert(unit);
+
+        dep_type = unit_dependency_from_string(property);
+        assert_se(dep_type >= 0);
+
+        table = unit_get_dependencies(unit, dep_type);
+
+        ret = sd_bus_message_open_container(reply, 'a', "s");
+        if (ret < 0)
+                return ret;
+
+        /* The hashmap keys are the units; the associated values are not needed here */
+        HASHMAP_FOREACH_KEY(value, dep_unit, table) {
+                ret = sd_bus_message_append(reply, "s", dep_unit->id);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter producing an array of strings from the keys of the hashmap that userdata points to.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_requires_mounts_for(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Hashmap **table = ASSERT_PTR(userdata);
+        const char *key;
+        void *value;
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "s");
+        if (ret < 0)
+                return ret;
+
+        /* Only the keys are reported; the associated values are not needed here */
+        HASHMAP_FOREACH_KEY(value, key, *table) {
+                ret = sd_bus_message_append(reply, "s", key);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Property getter that appends, as a string, whatever preset_action_past_tense_to_string() yields for the
+ * value returned by unit_get_unit_file_preset(). That value is passed on unchecked. Returns the result of
+ * sd_bus_message_append(). */
+static int property_get_unit_file_preset(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = ASSERT_PTR(userdata);
+        int preset;
+
+        assert(bus);
+        assert(reply);
+
+        preset = unit_get_unit_file_preset(unit);
+
+        return sd_bus_message_append(reply, "s", preset_action_past_tense_to_string(preset));
+}
+
+/* Property getter producing a "(uo)" struct for the job that userdata points to: the job's id and its
+ * bus path from job_dbus_path(). If no job is set, the pair (0, "/") is reported instead.
+ *
+ * Returns the result of sd_bus_message_append(), or -ENOMEM if the path cannot be obtained. */
+static int property_get_job(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Job **slot = ASSERT_PTR(userdata), *job;
+        _cleanup_free_ char *job_path = NULL;
+
+        assert(bus);
+        assert(reply);
+
+        job = *slot;
+        if (!job)
+                return sd_bus_message_append(reply, "(uo)", 0, "/");
+
+        job_path = job_dbus_path(job);
+        if (!job_path)
+                return -ENOMEM;
+
+        return sd_bus_message_append(reply, "(uo)", job->id, job_path);
+}
+
+/* Property getter producing an array of "(sbbsi)" structs, one per entry of the condition list that
+ * userdata points to: type name, trigger flag, negate flag, parameter and a tristate result
+ * (0 = untested, 1 = succeeded, -1 = anything else).
+ *
+ * Type names come from assert_type_to_string() when the property is called "Asserts", and from
+ * condition_type_to_string() otherwise.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_conditions(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Condition **head = ASSERT_PTR(userdata);
+        const char *(*type_name)(ConditionType type);
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        if (streq(property, "Asserts"))
+                type_name = assert_type_to_string;
+        else
+                type_name = condition_type_to_string;
+
+        ret = sd_bus_message_open_container(reply, 'a', "(sbbsi)");
+        if (ret < 0)
+                return ret;
+
+        LIST_FOREACH(conditions, cond, *head) {
+                int state;
+
+                if (cond->result == CONDITION_UNTESTED)
+                        state = 0;
+                else if (cond->result == CONDITION_SUCCEEDED)
+                        state = 1;
+                else
+                        state = -1;
+
+                ret = sd_bus_message_append(reply, "(sbbsi)",
+                                            type_name(cond->type),
+                                            cond->trigger, cond->negate,
+                                            cond->parameter, state);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+static int property_get_load_error(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = bus_unit_validate_load_state(u, &e);
+        if (r < 0)
+                return sd_bus_message_append(reply, "(ss)", e.name, e.message);
+
+        return sd_bus_message_append(reply, "(ss)", NULL, NULL);
+}
+
+/* Property getter producing an array of strings: for every UnitMarker m whose bit (1u << m) is set in the
+ * bitmask that userdata points to, the string from unit_marker_to_string(m) is appended.
+ *
+ * Returns the result of closing the array container, or the negative result of the first failing call. */
+static int property_get_markers(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        unsigned *mask = ASSERT_PTR(userdata);
+        int ret;
+
+        assert(bus);
+        assert(reply);
+
+        ret = sd_bus_message_open_container(reply, 'a', "s");
+        if (ret < 0)
+                return ret;
+
+        /* Make sure our values fit in the bitfield. */
+        assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
+
+        for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++) {
+                if (!FLAGS_SET(*mask, 1u << m))
+                        continue;
+
+                ret = sd_bus_message_append(reply, "s", unit_marker_to_string(m));
+                if (ret < 0)
+                        return ret;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Authentication prompt texts indexed by job type, handed to bus_verify_manage_units_async_full() by
+ * bus_unit_method_start_generic(). Job types without an entry here are left NULL; JOB_RESTART and
+ * JOB_TRY_RESTART share the same wording. */
+static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = {
+        [JOB_START]       = N_("Authentication is required to start '$(unit)'."),
+        [JOB_STOP]        = N_("Authentication is required to stop '$(unit)'."),
+        [JOB_RELOAD]      = N_("Authentication is required to reload '$(unit)'."),
+        [JOB_RESTART]     = N_("Authentication is required to restart '$(unit)'."),
+        [JOB_TRY_RESTART] = N_("Authentication is required to restart '$(unit)'."),
+};
+
+/* Common implementation behind the unit start/stop/reload/restart bus methods.
+ *
+ * In order: performs the SELinux access check for the job type, reads the job mode string from the
+ * message and validates it, reads and validates the extra flags argument if the call is
+ * StartUnitWithFlags, verifies authorization via polkit, and finally queues the job.
+ *
+ * If reload_if_possible is set, BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE is passed when queuing and the verb
+ * handed to the polkit check is prefixed with "reload-or-".
+ *
+ * Returns a negative value on failure, 1 if authorization is still pending, and otherwise the result of
+ * bus_unit_queue_job(). */
+int bus_unit_method_start_generic(
+                sd_bus_message *message,
+                Unit *u,
+                JobType job_type,
+                bool reload_if_possible,
+                sd_bus_error *error) {
+
+        BusUnitQueueFlags job_flags = reload_if_possible ? BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE : 0;
+        const char *smode, *verb;
+        JobMode mode;
+        int r;
+
+        assert(message);
+        assert(u);
+        assert(job_type >= 0 && job_type < _JOB_TYPE_MAX);
+
+        r = mac_selinux_unit_access_check(
+                        u, message,
+                        job_type_to_access_method(job_type),
+                        error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "s", &smode);
+        if (r < 0)
+                return r;
+
+        mode = job_mode_from_string(smode);
+        if (mode < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode);
+
+        /* The verb is only used for the polkit check below */
+        if (reload_if_possible)
+                verb = strjoina("reload-or-", job_type_to_string(job_type));
+        else
+                verb = job_type_to_string(job_type);
+
+        /* Only StartUnitWithFlags carries the additional 64-bit flags argument */
+        if (sd_bus_message_is_method_call(message, NULL, "StartUnitWithFlags")) {
+                uint64_t input_flags = 0;
+
+                r = sd_bus_message_read(message, "t", &input_flags);
+                if (r < 0)
+                        return r;
+                /* Let clients know that this version doesn't support any flags at the moment. */
+                if (input_flags != 0)
+                        return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS,
+                                                          "Invalid 'flags' parameter '%" PRIu64 "'",
+                                                          input_flags);
+        }
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        verb,
+                        CAP_SYS_ADMIN,
+                        polkit_message_for_job[job_type],
+                        true,
+                        message,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        return bus_unit_queue_job(message, u, job_type, mode, job_flags, error);
+}
+
+/* Forwards to bus_unit_method_start_generic() with JOB_START and reload_if_possible=false. */
+static int bus_unit_method_start(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(message, userdata, JOB_START, false, error);
+}
+
+/* Forwards to bus_unit_method_start_generic() with JOB_STOP and reload_if_possible=false. */
+static int bus_unit_method_stop(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(message, userdata, JOB_STOP, false, error);
+}
+
+/* D-Bus handler for Reload(): forwards to the shared job helper with JOB_RELOAD. */
+static int bus_unit_method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(
+                        message,
+                        userdata,
+                        JOB_RELOAD,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* D-Bus handler for Restart(): forwards to the shared job helper with JOB_RESTART and no reload fallback. */
+static int bus_unit_method_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(
+                        message,
+                        userdata,
+                        JOB_RESTART,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* D-Bus handler for TryRestart(): forwards to the shared job helper with JOB_TRY_RESTART and no reload
+ * fallback. */
+static int bus_unit_method_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(
+                        message,
+                        userdata,
+                        JOB_TRY_RESTART,
+                        /* reload_if_possible = */ false,
+                        error);
+}
+
+/* D-Bus handler for ReloadOrRestart(): same job type as Restart(), but with the reload-if-possible
+ * switch of the shared job helper turned on. */
+static int bus_unit_method_reload_or_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(
+                        message,
+                        userdata,
+                        JOB_RESTART,
+                        /* reload_if_possible = */ true,
+                        error);
+}
+
+/* D-Bus handler for ReloadOrTryRestart(): same job type as TryRestart(), but with the reload-if-possible
+ * switch of the shared job helper turned on. */
+static int bus_unit_method_reload_or_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_start_generic(
+                        message,
+                        userdata,
+                        JOB_TRY_RESTART,
+                        /* reload_if_possible = */ true,
+                        error);
+}
+
+/* D-Bus handler for EnqueueJob(job_type, job_mode).
+ *
+ * Reads the two strings from the message, translates them into a JobType/JobMode pair, runs the SELinux
+ * and polkit checks for the unit and finally hands over to bus_unit_queue_job() with
+ * BUS_UNIT_QUEUE_VERBOSE_REPLY set. Returns 1 while polkit authorization is still outstanding; otherwise
+ * the result of the failing step or of bus_unit_queue_job(). */
+int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        BusUnitQueueFlags queue_flags = BUS_UNIT_QUEUE_VERBOSE_REPLY;
+        const char *type_name, *mode_name;
+        Unit *u = ASSERT_PTR(userdata);
+        bool reload_variant = true;
+        JobType job_type;
+        JobMode job_mode;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "ss", &type_name, &mode_name);
+        if (r < 0)
+                return r;
+
+        /* The two "reload-or-…" names are not handled by job_type_from_string() here; map them by hand
+         * and let everything else go through the generic table. */
+        if (streq(type_name, "reload-or-restart"))
+                job_type = JOB_RESTART;
+        else if (streq(type_name, "reload-or-try-restart"))
+                job_type = JOB_TRY_RESTART;
+        else {
+                reload_variant = false;
+
+                job_type = job_type_from_string(type_name);
+                if (job_type < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job type %s invalid", type_name);
+        }
+
+        if (reload_variant)
+                queue_flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+
+        job_mode = job_mode_from_string(mode_name);
+        if (job_mode < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", mode_name);
+
+        r = mac_selinux_unit_access_check(u, message, job_type_to_access_method(job_type), error);
+        if (r < 0)
+                return r;
+
+        /* The polkit verb is the job type string exactly as the client sent it. */
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        type_name,
+                        CAP_SYS_ADMIN,
+                        polkit_message_for_job[job_type],
+                        true,
+                        message,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        return bus_unit_queue_job(message, u, job_type, job_mode, queue_flags, error);
+}
+
+/* D-Bus handler shared by Kill(whom, signal) and QueueSignal(whom, signal, value).
+ *
+ * Which of the two was called is decided from the member name of the incoming message: for
+ * "QueueSignal…" an additional int32 value is read and the signal code becomes SI_QUEUE, otherwise
+ * SI_USER is used and the value stays 0. After argument validation and the polkit check the request is
+ * passed to unit_kill(). Returns 1 while polkit authorization is still outstanding. */
+int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        int32_t signo, sigval = 0;
+        int r, code = SI_USER;
+        const char *whom;
+        KillWho who;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "stop", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "si", &whom, &signo);
+        if (r < 0)
+                return r;
+
+        if (startswith(sd_bus_message_get_member(message), "QueueSignal")) {
+                r = sd_bus_message_read(message, "i", &sigval);
+                if (r < 0)
+                        return r;
+
+                code = SI_QUEUE;
+        }
+
+        /* An empty "whom" selects KILL_ALL, anything else has to be a known KillWho name. */
+        if (!isempty(whom)) {
+                who = kill_who_from_string(whom);
+                if (who < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid who argument: %s", whom);
+        } else
+                who = KILL_ALL;
+
+        if (!SIGNAL_VALID(signo))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Signal number out of range.");
+
+        /* A queued value is only accepted together with a realtime signal. */
+        if (code == SI_QUEUE && (signo < SIGRTMIN || signo > SIGRTMAX))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Value parameter only accepted for realtime signals (SIGRTMIN…SIGRTMAX), refusing for signal SIG%s.", signal_to_string(signo));
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        "kill",
+                        CAP_KILL,
+                        N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."),
+                        true,
+                        message,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = unit_kill(u, who, signo, code, sigval, error);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler for ResetFailed(): after the SELinux ("reload") and polkit ("reset-failed") checks it
+ * calls unit_reset_failed() on the unit and sends an empty method reply. Returns 1 while polkit
+ * authorization is still outstanding. */
+int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "reload", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        "reset-failed",
+                        CAP_SYS_ADMIN,
+                        N_("Authentication is required to reset the \"failed\" state of '$(unit)'."),
+                        true,
+                        message,
+                        error);
+        if (r <= 0)
+                /* Negative: error. Zero: no authorization for now, but the async polkit stuff will call
+                 * us again when it has it, hence report 1. */
+                return r < 0 ? r : 1;
+
+        unit_reset_failed(u);
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler for SetProperties(runtime, properties).
+ *
+ * Only the leading boolean is consumed here; the remainder of the message is left for
+ * bus_unit_set_properties(), which receives UNIT_RUNTIME or UNIT_PERSISTENT depending on that boolean.
+ * Returns 1 while polkit authorization is still outstanding. */
+int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        int runtime, r;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "start", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "b", &runtime);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        "set-property",
+                        CAP_SYS_ADMIN,
+                        N_("Authentication is required to set properties on '$(unit)'."),
+                        true,
+                        message,
+                        error);
+        if (r <= 0)
+                /* Negative: error. Zero: no authorization for now, but the async polkit stuff will call
+                 * us again when it has it, hence report 1. */
+                return r < 0 ? r : 1;
+
+        r = bus_unit_set_properties(
+                        u,
+                        message,
+                        runtime ? UNIT_RUNTIME : UNIT_PERSISTENT,
+                        true,
+                        error);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler for Ref(): after the SELinux ("start") and polkit ("ref") checks the sender of the
+ * message is registered via bus_unit_track_add_sender() and an empty reply is sent. Unlike the other
+ * handlers in this file no polkit message is supplied and the flag following it is false. Returns 1 while
+ * polkit authorization is still outstanding. */
+int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "start", error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async_full(u, "ref", CAP_SYS_ADMIN, NULL, false, message, error);
+        if (r <= 0)
+                /* Negative: error. Zero: no authorization for now, but the async polkit stuff will call
+                 * us again when it has it, hence report 1. */
+                return r < 0 ? r : 1;
+
+        r = bus_unit_track_add_sender(u, message);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler for Unref(): drops the sender of the message via bus_unit_track_remove_sender(). An
+ * -EUNATCH result from that helper is reported to the client as BUS_ERROR_NOT_REFERENCED; no SELinux or
+ * polkit check is performed here. */
+int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_unit_track_remove_sender(u, message);
+        if (r >= 0)
+                return sd_bus_reply_method_return(message, NULL);
+
+        if (r == -EUNATCH)
+                return sd_bus_error_set(error, BUS_ERROR_NOT_REFERENCED, "Unit has not been referenced yet.");
+
+        return r;
+}
+
+/* D-Bus handler for Clean(mask).
+ *
+ * The argument is an array of strings; each one is translated with exec_clean_mask_from_string() and
+ * OR-ed into a single ExecCleanMask that is then passed to unit_clean(). Selected errno results of
+ * unit_clean() are turned into dedicated bus errors. Returns 1 while polkit authorization is still
+ * outstanding. */
+int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        ExecCleanMask mask = 0;
+        Unit *u = ASSERT_PTR(userdata);
+        const char *name;
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "stop", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_enter_container(message, 'a', "s");
+        if (r < 0)
+                return r;
+
+        /* Keep reading while an element is returned (> 0); 0 ends the loop, < 0 is an error. */
+        while ((r = sd_bus_message_read(message, "s", &name)) > 0) {
+                ExecCleanMask one;
+
+                one = exec_clean_mask_from_string(name);
+                if (one < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid resource type: %s", name);
+
+                mask |= one;
+        }
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        "clean",
+                        CAP_DAC_OVERRIDE,
+                        N_("Authentication is required to delete files and directories associated with '$(unit)'."),
+                        true,
+                        message,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = unit_clean(u, mask);
+        switch (r) {
+
+        case -EOPNOTSUPP:
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support cleaning.", u->id);
+
+        case -EUNATCH:
+                return sd_bus_error_set(error, BUS_ERROR_NOTHING_TO_CLEAN, "No matching resources found.");
+
+        case -EBUSY:
+                return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit is not inactive or has pending job.");
+
+        default:
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Shared implementation of the Freeze() and Thaw() D-Bus methods.
+ *
+ * 'action' selects both the operation (unit_freeze() or unit_thaw()) and the permission string
+ * ("stop" or "start") used for the SELinux check and as polkit verb. The method reply is not sent from
+ * here directly: the message is stored in u->pending_freezer_invocation and answered through
+ * bus_unit_send_pending_freezer_message(). Returns a negative value on failure and 1 otherwise. */
+static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) {
+        const char* perm;
+        int (*method)(Unit*);
+        Unit *u = ASSERT_PTR(userdata);
+        bool reply_no_delay = false;
+        int r;
+
+        assert(message);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+        if (action == FREEZER_FREEZE) {
+                perm = "stop";
+                method = unit_freeze;
+        } else {
+                perm = "start";
+                method = unit_thaw;
+        }
+
+        r = mac_selinux_unit_access_check(u, message, perm, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_manage_units_async_full(
+                        u,
+                        perm,
+                        CAP_SYS_ADMIN,
+                        N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."),
+                        true,
+                        message,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        /* Map the errno results of unit_freeze()/unit_thaw() that have a dedicated bus error. */
+        r = method(u);
+        if (r == -EOPNOTSUPP)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id);
+        if (r == -EBUSY)
+                return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job.");
+        if (r == -EHOSTDOWN)
+                return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive.");
+        if (r == -EALREADY)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id);
+        if (r < 0)
+                return r;
+        /* A zero result means the reply is sent right away further down. NOTE(review): presumably 0
+         * stands for "nothing left to wait for" and > 0 for "operation still running" - confirm against
+         * unit_freeze()/unit_thaw(). */
+        if (r == 0)
+                reply_no_delay = true;
+
+        /* Only one invocation can be stored per unit, so an older one is answered first. The helper is
+         * expected to clear the field, which the assert verifies. NOTE(review): the 'true' argument
+         * presumably marks the old request as cancelled - confirm. */
+        if (u->pending_freezer_invocation) {
+                bus_unit_send_pending_freezer_message(u, true);
+                assert(!u->pending_freezer_invocation);
+        }
+
+        /* Take our own reference on the message so that it can be answered later. */
+        u->pending_freezer_invocation = sd_bus_message_ref(message);
+
+        if (reply_no_delay) {
+                r = bus_unit_send_pending_freezer_message(u, false);
+                if (r < 0)
+                        return r;
+        }
+
+        return 1;
+}
+
+/* D-Bus handler for Thaw(): runs the shared freezer helper with FREEZER_THAW. */
+int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_freezer_generic(
+                        message,
+                        userdata,
+                        error,
+                        /* action = */ FREEZER_THAW);
+}
+
+/* D-Bus handler for Freeze(): runs the shared freezer helper with FREEZER_FREEZE. */
+int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return bus_unit_method_freezer_generic(
+                        message,
+                        userdata,
+                        error,
+                        /* action = */ FREEZER_FREEZE);
+}
+
+/* Property getter for "Refs" (signature "as").
+ *
+ * Walks the names held in u->bus_track and appends them to the reply as an array of strings. A name is
+ * appended once per reference that sd_bus_track_count_name() reports for it, so a peer holding several
+ * references shows up several times. Returns the first negative result of any sd-bus call, otherwise the
+ * result of closing the array container. */
+static int property_get_refs(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* userdata is dereferenced below, hence check it like the other getters and method handlers of
+         * this file do instead of trusting the caller. */
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "s");
+        if (r < 0)
+                return r;
+
+        for (const char *i = sd_bus_track_first(u->bus_track); i; i = sd_bus_track_next(u->bus_track)) {
+                int c;
+
+                c = sd_bus_track_count_name(u->bus_track, i);
+                if (c < 0)
+                        return c;
+
+                /* Add the item multiple times if the ref count for each is above 1 */
+                for (int k = 0; k < c; k++) {
+                        r = sd_bus_message_append(reply, "s", i);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* sd-bus vtable describing the generic unit object: read-only properties backed by fields of Unit (or by
+ * dedicated getter callbacks) followed by the job and control methods implemented in this file. Entries
+ * flagged SD_BUS_VTABLE_HIDDEN are legacy or deprecated names. */
+const sd_bus_vtable bus_unit_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+
+        SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Unit, id), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Names", "as", property_get_names, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Following", "s", property_get_following, 0, 0),
+        /* Dependency lists. All of them share one getter and pass offset 0. NOTE(review): presumably
+         * the getter tells them apart by property name - confirm in property_get_dependencies(). */
+        SD_BUS_PROPERTY("Requires", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Requisite", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Wants", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("BindsTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PartOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Upholds", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RequiredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RequisiteOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("WantedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("BoundBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UpheldBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ConsistsOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Conflicts", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ConflictedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Before", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("After", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OnSuccess", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OnSuccessOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OnFailure", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OnFailureOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Triggers", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("TriggeredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PropagatesReloadTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ReloadPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PropagatesStopTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StopPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SliceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("AccessSELinuxContext", "s", NULL, offsetof(Unit, access_selinux_context), SD_BUS_VTABLE_PROPERTY_CONST),
+        /* Load/runtime state and the timestamps of state transitions. */
+        SD_BUS_PROPERTY("LoadState", "s", property_get_load_state, offsetof(Unit, load_state), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ActiveState", "s", property_get_active_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("FreezerState", "s", property_get_freezer_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("SubState", "s", property_get_sub_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("FragmentPath", "s", NULL, offsetof(Unit, fragment_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Unit, source_path), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DropInPaths", "as", NULL, offsetof(Unit, dropin_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UnitFileState", "s", property_get_unit_file_state, 0, 0),
+        SD_BUS_PROPERTY("UnitFilePreset", "s", property_get_unit_file_preset, 0, 0),
+        BUS_PROPERTY_DUAL_TIMESTAMP("StateChangeTimestamp", offsetof(Unit, state_change_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InactiveExitTimestamp", offsetof(Unit, inactive_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("ActiveEnterTimestamp", offsetof(Unit, active_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("ActiveExitTimestamp", offsetof(Unit, active_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("InactiveEnterTimestamp", offsetof(Unit, inactive_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        /* Capabilities of the unit and its configuration switches. */
+        SD_BUS_PROPERTY("CanStart", "b", property_get_can_start, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CanStop", "b", property_get_can_stop, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CanReload", "b", property_get_can_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CanIsolate", "b", property_get_can_isolate, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CanClean", "as", property_get_can_clean, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("CanFreeze", "b", property_get_can_freeze, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Job", "(uo)", property_get_job, offsetof(Unit, job), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("StopWhenUnneeded", "b", bus_property_get_bool, offsetof(Unit, stop_when_unneeded), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RefuseManualStart", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_start), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+        /* "OnSuccesJobMode" (single 's') is intentionally kept as a hidden name for the same field as
+         * "OnSuccessJobMode" right below; do not "fix" the spelling. */
+        SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
+        SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("IgnoreOnIsolate", "b", bus_property_get_bool, offsetof(Unit, ignore_on_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NeedDaemonReload", "b", property_get_need_daemon_reload, 0, 0),
+        SD_BUS_PROPERTY("Markers", "as", property_get_markers, offsetof(Unit, markers), 0),
+        SD_BUS_PROPERTY("JobTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_timeout), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("JobRunningTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_running_timeout), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("JobTimeoutAction", "s", bus_property_get_emergency_action, offsetof(Unit, job_timeout_action), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("JobTimeoutRebootArgument", "s", NULL, offsetof(Unit, job_timeout_reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST),
+        /* Results of the last condition/assert evaluation. */
+        SD_BUS_PROPERTY("ConditionResult", "b", bus_property_get_bool, offsetof(Unit, condition_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("AssertResult", "b", bus_property_get_bool, offsetof(Unit, assert_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("ConditionTimestamp", offsetof(Unit, condition_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        BUS_PROPERTY_DUAL_TIMESTAMP("AssertTimestamp", offsetof(Unit, assert_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Conditions", "a(sbbsi)", property_get_conditions, offsetof(Unit, conditions), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_PROPERTY("Asserts", "a(sbbsi)", property_get_conditions, offsetof(Unit, asserts), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+        SD_BUS_PROPERTY("LoadError", "(ss)", property_get_load_error, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Transient", "b", bus_property_get_bool, offsetof(Unit, transient), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Perpetual", "b", bus_property_get_bool, offsetof(Unit, perpetual), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("StartLimitAction", "s", bus_property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FailureAction", "s", bus_property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FailureActionExitStatus", "i", bus_property_get_int, offsetof(Unit, failure_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SuccessAction", "s", bus_property_get_emergency_action, offsetof(Unit, success_action), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("SuccessActionExitStatus", "i", bus_property_get_int, offsetof(Unit, success_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0),
+        SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Unit, activation_details), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+
+        /* Methods. Every entry is flagged SD_BUS_VTABLE_UNPRIVILEGED; where access control is needed,
+         * the handlers perform the SELinux and polkit checks themselves. */
+        SD_BUS_METHOD_WITH_ARGS("Start",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_start,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Stop",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_stop,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Reload",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_reload,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Restart",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_restart,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("TryRestart",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_try_restart,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ReloadOrRestart",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_reload_or_restart,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestart",
+                                SD_BUS_ARGS("s", mode),
+                                SD_BUS_RESULT("o", job),
+                                bus_unit_method_reload_or_try_restart,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("EnqueueJob",
+                                SD_BUS_ARGS("s", job_type, "s", job_mode),
+                                SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs),
+                                bus_unit_method_enqueue_job,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        /* Kill and QueueSignal share one handler, which tells them apart by the method name of the
+         * incoming message. */
+        SD_BUS_METHOD_WITH_ARGS("Kill",
+                                SD_BUS_ARGS("s", whom, "i", signal),
+                                SD_BUS_NO_RESULT,
+                                bus_unit_method_kill,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("QueueSignal",
+                                SD_BUS_ARGS("s", whom, "i", signal, "i", value),
+                                SD_BUS_NO_RESULT,
+                                bus_unit_method_kill,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("ResetFailed",
+                      NULL,
+                      NULL,
+                      bus_unit_method_reset_failed,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetProperties",
+                                SD_BUS_ARGS("b", runtime, "a(sv)", properties),
+                                SD_BUS_NO_RESULT,
+                                bus_unit_method_set_properties,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Ref",
+                      NULL,
+                      NULL,
+                      bus_unit_method_ref,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Unref",
+                      NULL,
+                      NULL,
+                      bus_unit_method_unref,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Clean",
+                                SD_BUS_ARGS("as", mask),
+                                SD_BUS_NO_RESULT,
+                                bus_unit_method_clean,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Freeze",
+                      NULL,
+                      NULL,
+                      bus_unit_method_freeze,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD("Thaw",
+                      NULL,
+                      NULL,
+                      bus_unit_method_thaw,
+                      SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* For dependency types we don't support anymore always return an empty array */
+        SD_BUS_PROPERTY("RequiresOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("RequisiteOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("RequiredByOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("RequisiteOfOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+        /* Obsolete alias names: both read the same field as "StartLimitIntervalUSec" above */
+        SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+        SD_BUS_PROPERTY("StartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+
+        SD_BUS_VTABLE_END
+};
+
+/* Property getter (signature "s") that replies with whatever unit_slice_name() returns for the unit
+ * passed in as userdata. */
+static int property_get_slice(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        const char *slice;
+
+        assert(bus);
+        assert(reply);
+
+        slice = unit_slice_name(u);
+
+        return sd_bus_message_append(reply, "s", slice);
+}
+
+/* D-Bus property getter registered for "MemoryCurrent". The value is pre-set to UINT64_MAX and then
+ * handed to unit_get_memory_current(); a failure of that call never fails the getter. Errors other
+ * than -ENODATA are logged as a warning, -ENODATA is deliberately not logged. */
+static int property_get_current_memory(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        uint64_t bytes = UINT64_MAX;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = unit_get_memory_current(u, &bytes);
+        if (r < 0) {
+                if (r != -ENODATA)
+                        log_unit_warning_errno(u, r, "Failed to get current memory usage from cgroup: %m");
+        }
+
+        return sd_bus_message_append(reply, "t", bytes);
+}
+
+/* D-Bus property getter registered for "MemoryAvailable". The value is pre-set to UINT64_MAX and
+ * then handed to unit_get_memory_available(); a failure of that call never fails the getter. Errors
+ * other than -ENODATA are logged as a warning, -ENODATA is deliberately not logged. */
+static int property_get_available_memory(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        uint64_t bytes = UINT64_MAX;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = unit_get_memory_available(u, &bytes);
+        if (r < 0) {
+                if (r != -ENODATA)
+                        log_unit_warning_errno(u, r, "Failed to get total available memory from cgroup: %m");
+        }
+
+        return sd_bus_message_append(reply, "t", bytes);
+}
+
+/* Shared D-Bus property getter for the memory accounting properties. The metric to report is
+ * derived from the name of the property being queried, hence this getter may only be registered
+ * for property names that cgroup_memory_accounting_metric_from_string() knows (the lookup is
+ * enforced with assert_se()). The value is pre-set to UINT64_MAX and the result of
+ * unit_get_memory_accounting() is ignored, so a failed lookup never fails the getter. */
+static int property_get_memory_accounting(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        CGroupMemoryAccountingMetric metric;
+        uint64_t sz = UINT64_MAX;
+
+        assert(bus);
+        assert(reply);
+        assert(property); /* same precondition the IP and IO counter getters check before the lookup */
+
+        assert_se((metric = cgroup_memory_accounting_metric_from_string(property)) >= 0);
+        (void) unit_get_memory_accounting(u, metric, &sz);
+        return sd_bus_message_append(reply, "t", sz);
+}
+
+/* D-Bus property getter registered for "TasksCurrent". The value is pre-set to UINT64_MAX and then
+ * handed to unit_get_tasks_current(); a failure of that call never fails the getter. Errors other
+ * than -ENODATA are logged as a warning, -ENODATA is deliberately not logged. */
+static int property_get_current_tasks(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        uint64_t n_tasks = UINT64_MAX;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = unit_get_tasks_current(u, &n_tasks);
+        if (r < 0) {
+                if (r != -ENODATA)
+                        log_unit_warning_errno(u, r, "Failed to get pids.current attribute: %m");
+        }
+
+        return sd_bus_message_append(reply, "t", n_tasks);
+}
+
+/* D-Bus property getter registered for "CPUUsageNSec". The value is pre-set to NSEC_INFINITY and
+ * then handed to unit_get_cpu_usage(); a failure of that call never fails the getter. Errors other
+ * than -ENODATA are logged as a warning, -ENODATA is deliberately not logged. */
+static int property_get_cpu_usage(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        nsec_t usage = NSEC_INFINITY;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = unit_get_cpu_usage(u, &usage);
+        if (r < 0) {
+                if (r != -ENODATA)
+                        log_unit_warning_errno(u, r, "Failed to get cpuacct.usage attribute: %m");
+        }
+
+        return sd_bus_message_append(reply, "t", usage);
+}
+
+/* D-Bus property getter registered for "EffectiveCPUs": reads "cpuset.cpus.effective" for the unit
+ * via unit_get_cpuset(), converts the set with cpu_set_to_dbus() and appends it as a byte array.
+ * Both helper calls are best-effort (their results are ignored), so on failure whatever is in the
+ * output variables is sent. */
+static int property_get_cpuset_cpus(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        _cleanup_(cpu_set_reset) CPUSet cpus = {};
+        _cleanup_free_ uint8_t *array = NULL;
+        /* Must start out as 0: the conversion result is ignored below, and if the conversion does not
+         * fill this in we would otherwise pass an indeterminate length together with a NULL array. */
+        size_t allocated = 0;
+
+        assert(bus);
+        assert(reply);
+
+        (void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective");
+        (void) cpu_set_to_dbus(&cpus, &array, &allocated);
+        return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* D-Bus property getter registered for "EffectiveMemoryNodes": reads "cpuset.mems.effective" for
+ * the unit via unit_get_cpuset(), converts the set with cpu_set_to_dbus() and appends it as a byte
+ * array. Both helper calls are best-effort (their results are ignored), so on failure whatever is
+ * in the output variables is sent. */
+static int property_get_cpuset_mems(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = ASSERT_PTR(userdata);
+        _cleanup_(cpu_set_reset) CPUSet mems = {};
+        _cleanup_free_ uint8_t *array = NULL;
+        /* Must start out as 0: the conversion result is ignored below, and if the conversion does not
+         * fill this in we would otherwise pass an indeterminate length together with a NULL array. */
+        size_t allocated = 0;
+
+        assert(bus);
+        assert(reply);
+
+        (void) unit_get_cpuset(u, &mems, "cpuset.mems.effective");
+        (void) cpu_set_to_dbus(&mems, &array, &allocated);
+        return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* D-Bus property getter registered for "ControlGroup": appends the unit's control group path as a
+ * string, after mapping it as described in the comment in the body. */
+static int property_get_cgroup(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = ASSERT_PTR(userdata);
+        const char *cgroup;
+
+        assert(bus);
+        assert(reply);
+
+        /* Three cases:
+         *   a) u->cgroup_path is NULL: the unit has no control group, which we report as the empty
+         *      string (we pass NULL to the append call for this).
+         *   b) u->cgroup_path is the empty string: this indicates the root cgroup, which we report
+         *      as "/".
+         *   c) everything else is reported as-is. */
+        cgroup = unit->cgroup_path ? empty_to_root(unit->cgroup_path) : NULL;
+
+        return sd_bus_message_append(reply, "s", cgroup);
+}
+
+/* Appends one "(sus)" entry (cgroup path, PID, command line) for the process 'pid' to 'reply'.
+ *
+ * 'pids' records the PIDs handled so far: if set_put() reports 0 or -EEXIST for this PID nothing is
+ * appended and 0 is returned. If 'p' is NULL the cgroup path is looked up via cg_pidref_get_path();
+ * -ESRCH from that lookup makes the process be skipped silently (0 is returned). Fetching the
+ * command line is best-effort, a NULL command line is passed on as-is.
+ *
+ * Returns a negative errno-style value on failure, otherwise the result of the append call (or 0 if
+ * the process was skipped). */
+static int append_process(sd_bus_message *reply, const char *p, PidRef *pid, Set *pids) {
+        _cleanup_free_ char *looked_up = NULL, *cmdline = NULL;
+        int r;
+
+        assert(reply);
+        assert(pidref_is_set(pid));
+
+        r = set_put(pids, PID_TO_PTR(pid->pid));
+        if (r == 0 || r == -EEXIST)
+                return 0;
+        if (r < 0)
+                return r;
+
+        if (!p) {
+                r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &looked_up);
+                if (r == -ESRCH)
+                        return 0;
+                if (r < 0)
+                        return r;
+
+                p = looked_up;
+        }
+
+        (void) pidref_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_QUOTE, &cmdline);
+
+        return sd_bus_message_append(reply, "(sus)", p, (uint32_t) pid->pid, cmdline);
+}
+
+/* Appends one entry per process found in the cgroup 'p' to 'reply' (via append_process()), then
+ * recurses into every subgroup of 'p'. 'pids' is passed through to append_process().
+ *
+ * A cgroup that cannot be enumerated because of -ENOENT is silently skipped. Returns 0 on success,
+ * or a negative errno-style value as soon as any step fails. */
+static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) {
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(reply);
+        assert(p);
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, p, &f);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return r;
+
+        /* First pass: the processes directly listed for this cgroup */
+        for (;;) {
+                _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+                /* libvirt / qemu uses threaded mode and cgroup.procs cannot be read at the lower levels.
+                 * From https://docs.kernel.org/admin-guide/cgroup-v2.html#threads, “cgroup.procs” in a
+                 * threaded domain cgroup contains the PIDs of all processes in the subtree and is not
+                 * readable in the subtree proper. */
+
+                r = cg_read_pidref(f, &pidref);
+                if (IN_SET(r, 0, -EOPNOTSUPP)) /* end of list, or not readable (see above): stop reading, but still descend */
+                        break;
+                if (r < 0)
+                        return r;
+
+                r = pidref_is_kernel_thread(&pidref);
+                if (r == -ESRCH) /* gone by now */
+                        continue;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to determine if " PID_FMT " is a kernel thread, assuming not: %m", pidref.pid);
+                if (r > 0) /* kernel threads are not reported */
+                        continue;
+
+                r = append_process(reply, p, &pidref, pids);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Second pass: recurse into all subgroups */
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, p, &d);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_free_ char *g = NULL, *j = NULL;
+
+                r = cg_read_subgroup(d, &g);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                /* Build the subgroup's path relative to 'p', treating an empty 'p' as "/" */
+                j = path_join(empty_to_root(p), g);
+                if (!j)
+                        return -ENOMEM;
+
+                r = append_cgroup(reply, j, pids);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Implements the GetProcesses() D-Bus method: replies with an array of "(sus)" entries (cgroup
+ * path, PID, command line), one per process found in the unit's cgroup tree, plus the unit's main
+ * and control processes if they are set. A set of already reported PIDs is kept so that no process
+ * is listed twice.
+ *
+ * The caller must pass the SELinux "status" access check. Returns a negative errno-style value on
+ * failure, otherwise the result of sending the reply. */
+int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_set_free_ Set *pids = NULL;
+        Unit *u = ASSERT_PTR(userdata); /* dereferenced below, check it like the other callbacks do */
+        int r;
+
+        assert(message);
+
+        r = mac_selinux_unit_access_check(u, message, "status", error);
+        if (r < 0)
+                return r;
+
+        pids = set_new(NULL);
+        if (!pids)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(sus)");
+        if (r < 0)
+                return r;
+
+        if (u->cgroup_path) {
+                r = append_cgroup(reply, u->cgroup_path, pids);
+                if (r < 0)
+                        return r;
+        }
+
+        /* The main and control pids might live outside of the cgroup, hence fetch them separately */
+        PidRef *pid = unit_main_pid(u);
+        if (pidref_is_set(pid)) {
+                r = append_process(reply, NULL, pid, pids);
+                if (r < 0)
+                        return r;
+        }
+
+        pid = unit_control_pid(u);
+        if (pidref_is_set(pid)) {
+                r = append_process(reply, NULL, pid, pids);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Shared D-Bus property getter for the IP accounting counters. Which counter to report is derived
+ * from the name of the property being queried; the name must be one that
+ * cgroup_ip_accounting_metric_from_string() knows (enforced with assert_se()). The value is pre-set
+ * to UINT64_MAX and the result of unit_get_ip_accounting() is ignored, so a failed lookup never
+ * fails the getter. */
+static int property_get_ip_counter(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = ASSERT_PTR(userdata);
+        CGroupIPAccountingMetric metric;
+        uint64_t counter = UINT64_MAX;
+
+        assert(bus);
+        assert(reply);
+        assert(property);
+
+        metric = cgroup_ip_accounting_metric_from_string(property);
+        assert_se(metric >= 0);
+
+        (void) unit_get_ip_accounting(unit, metric, &counter);
+
+        return sd_bus_message_append(reply, "t", counter);
+}
+
+/* Shared D-Bus property getter for the IO accounting counters. Which counter to report is derived
+ * from the name of the property being queried; the name must be one that
+ * cgroup_io_accounting_metric_from_string() knows (enforced with assert_se()). The value is pre-set
+ * to UINT64_MAX, the lookup is done with caching disallowed, and the result of
+ * unit_get_io_accounting() is ignored, so a failed lookup never fails the getter. */
+static int property_get_io_counter(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *unit = ASSERT_PTR(userdata);
+        uint64_t counter = UINT64_MAX;
+        ssize_t metric;
+
+        assert(bus);
+        assert(reply);
+        assert(property);
+
+        metric = cgroup_io_accounting_metric_from_string(property);
+        assert_se(metric >= 0);
+
+        (void) unit_get_io_accounting(unit, metric, /* allow_cache= */ false, &counter);
+
+        return sd_bus_message_append(reply, "t", counter);
+}
+
+/* Implements the AttachProcesses() D-Bus method. The message carries a sub-cgroup path ("s", may be
+ * empty) followed by an array of PIDs ("au"); a PID of 0 stands for the PID of the sender.
+ *
+ * Refused with SD_BUS_ERROR_INVALID_ARGS if the path is not absolute or not normalized, if the unit
+ * does not have cgroup delegation enabled, or if the unit is inactive or failed. Refused with
+ * SD_BUS_ERROR_ACCESS_DENIED if an unprivileged sender names a process it does not own or that does
+ * not match the unit's UID. Requires the SELinux "start" access check to pass.
+ *
+ * Returns a negative errno-style value on failure, otherwise the result of sending the (empty)
+ * method reply. */
+int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        _cleanup_set_free_ Set *pids = NULL;
+        Unit *u = ASSERT_PTR(userdata); /* dereferenced below, check it like the other callbacks do */
+        const char *path;
+        int r;
+
+        assert(message);
+
+        /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a
+         * specified cgroup path. Obviously this only works for units that actually maintain a cgroup
+         * representation. If a process is already in the cgroup no operation is executed – in this case the specified
+         * subcgroup path has no effect! */
+
+        r = mac_selinux_unit_access_check(u, message, "start", error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "s", &path);
+        if (r < 0)
+                return r;
+
+        path = empty_to_null(path);
+        if (path) {
+                if (!path_is_absolute(path))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);
+
+                if (!path_is_normalized(path))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);
+        }
+
+        if (!unit_cgroup_delegate(u))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units.");
+
+        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing.");
+
+        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_enter_container(message, 'a', "u");
+        if (r < 0)
+                return r;
+        for (;;) {
+                _cleanup_(pidref_freep) PidRef *pidref = NULL;
+                uid_t process_uid, sender_uid;
+                uint32_t upid;
+                pid_t pid;
+
+                r = sd_bus_message_read(message, "u", &upid);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                if (upid == 0) {
+                        /* PID 0 is shorthand for "the sender itself" */
+                        r = sd_bus_creds_get_pid(creds, &pid);
+                        if (r < 0)
+                                return r;
+                } else
+                        pid = (pid_t) upid; /* the wire type is unsigned 32-bit, convert to the PID type */
+
+                r = pidref_new_from_pid(pid, &pidref);
+                if (r < 0)
+                        return r;
+
+                /* Filter out duplicates */
+                if (set_contains(pids, pidref))
+                        continue;
+
+                /* Check if this process is suitable for attaching to this unit */
+                r = unit_pid_attachable(u, pidref, error);
+                if (r < 0)
+                        return r;
+
+                /* Let's query the sender's UID, so that we can make our security decisions */
+                r = sd_bus_creds_get_euid(creds, &sender_uid);
+                if (r < 0)
+                        return r;
+
+                /* Let's validate security: if the sender is root, then all is OK. If the sender is any other user,
+                 * then the process' UID and the target unit's UID have to match the sender's UID */
+                if (sender_uid != 0 && sender_uid != getuid()) {
+                        r = pidref_get_uid(pidref, &process_uid);
+                        if (r < 0)
+                                return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m");
+
+                        if (process_uid != sender_uid)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid);
+                        if (process_uid != u->ref_uid)
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid);
+                }
+
+                /* Hand ownership of the PidRef over to the set */
+                r = set_ensure_consume(&pids, &pidref_hash_ops_free, TAKE_PTR(pidref));
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        r = unit_attach_pids_to_cgroup(u, pids, path);
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m");
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus vtable with the cgroup related properties and methods of a unit. All properties except
+ * "ControlGroupId" (which is read straight from Unit.cgroup_id) are served by the custom getters
+ * defined above, all registered with offset 0. */
+const sd_bus_vtable bus_unit_cgroup_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
+        SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0),
+        SD_BUS_PROPERTY("ControlGroupId", "t", NULL, offsetof(Unit, cgroup_id), 0),
+        SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0),
+        /* property_get_memory_accounting() maps the property name to the metric, hence the names
+         * used here must be known to cgroup_memory_accounting_metric_from_string() */
+        SD_BUS_PROPERTY("MemoryPeak", "t", property_get_memory_accounting, 0, 0),
+        SD_BUS_PROPERTY("MemorySwapCurrent", "t", property_get_memory_accounting, 0, 0),
+        SD_BUS_PROPERTY("MemorySwapPeak", "t", property_get_memory_accounting, 0, 0),
+        SD_BUS_PROPERTY("MemoryZSwapCurrent", "t", property_get_memory_accounting, 0, 0),
+        SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0),
+        SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0),
+        SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0),
+        SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0),
+        SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0),
+        /* Likewise, property_get_ip_counter() looks the metric up by property name via
+         * cgroup_ip_accounting_metric_from_string() */
+        SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0),
+        SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0),
+        SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0),
+        SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0),
+        /* ... and property_get_io_counter() via cgroup_io_accounting_metric_from_string() */
+        SD_BUS_PROPERTY("IOReadBytes", "t", property_get_io_counter, 0, 0),
+        SD_BUS_PROPERTY("IOReadOperations", "t", property_get_io_counter, 0, 0),
+        SD_BUS_PROPERTY("IOWriteBytes", "t", property_get_io_counter, 0, 0),
+        SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0),
+
+        /* Returns an array of (cgroup path, PID, command line) entries */
+        SD_BUS_METHOD_WITH_ARGS("GetProcesses",
+                                 SD_BUS_NO_ARGS,
+                                 SD_BUS_ARGS("a(sus)", processes),
+                                 bus_unit_method_get_processes,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* Takes a sub-cgroup path (may be empty) and the PIDs to migrate; access is validated in the
+         * handler itself */
+        SD_BUS_METHOD_WITH_ARGS("AttachProcesses",
+                                 SD_BUS_ARGS("s", subcgroup, "au", pids),
+                                 SD_BUS_NO_RESULT,
+                                 bus_unit_method_attach_processes,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+
+        SD_BUS_VTABLE_END
+};
+
+/* bus_foreach_bus() callback: emits the Manager's "UnitNew" signal on 'bus', carrying the unit's
+ * id and its D-Bus object path ("so"). 'userdata' is the Unit. */
+static int send_new_signal(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *signal = NULL;
+        _cleanup_free_ char *object_path = NULL;
+        Unit *unit = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        object_path = unit_dbus_path(unit);
+        if (!object_path)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_signal(bus, &signal,
+                                      "/org/freedesktop/systemd1",
+                                      "org.freedesktop.systemd1.Manager",
+                                      "UnitNew");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(signal, "so", unit->id, object_path);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, signal, NULL);
+}
+
+/* bus_foreach_bus() callback: emits PropertiesChanged for the unit on 'bus', once for the
+ * type-specific interface and once for the generic Unit interface. 'userdata' is the Unit. */
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+        _cleanup_free_ char *object_path = NULL;
+        Unit *unit = ASSERT_PTR(userdata);
+        const char *type_interface;
+        int r;
+
+        assert(bus);
+
+        object_path = unit_dbus_path(unit);
+        if (!object_path)
+                return -ENOMEM;
+
+        /* The order matters: the type-specific interface goes first, the generic unit interface
+         * second. Clients may rely on this order to get atomic behavior if needed. */
+
+        type_interface = unit_dbus_interface_from_type(unit->type);
+
+        r = sd_bus_emit_properties_changed_strv(bus, object_path, type_interface, NULL);
+        if (r < 0)
+                return r;
+
+        return sd_bus_emit_properties_changed_strv(bus, object_path, "org.freedesktop.systemd1.Unit", NULL);
+}
+
+/* Sends the change notification for a unit right away: "UnitNew" if the unit has not been announced
+ * yet, PropertiesChanged otherwise. Also takes the unit off the manager's D-Bus queue if it is
+ * enqueued there. Send failures are only logged at debug level; the unit is marked as announced in
+ * either case. Units without an id are dequeued but nothing is sent for them. */
+void bus_unit_send_change_signal(Unit *u) {
+        int (*send_message)(sd_bus *bus, void *userdata);
+        int r;
+
+        assert(u);
+
+        if (u->in_dbus_queue) {
+                LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u);
+                u->in_dbus_queue = false;
+
+                /* The unit might be good to be GC once its pending signals have been sent */
+                unit_add_to_gc_queue(u);
+        }
+
+        if (!u->id)
+                return;
+
+        if (u->sent_dbus_new_signal)
+                send_message = send_changed_signal;
+        else
+                send_message = send_new_signal;
+
+        r = bus_foreach_bus(u->manager, u->bus_track, send_message, u);
+        if (r < 0)
+                log_unit_debug_errno(u, r, "Failed to send unit change signal for %s: %m", u->id);
+
+        u->sent_dbus_new_signal = true;
+}
+
+/* Flushes a pending change signal for the unit, but only if one really is pending. This is used when
+ * we are about to change state, to force out a queued PropertiesChanged signal beforehand so that
+ * clients can follow the full state transition. */
+void bus_unit_send_pending_change_signal(Unit *u, bool including_new) {
+
+        /* Nothing enqueued, nothing to flush */
+        if (!u->in_dbus_queue)
+                return;
+
+        /* If the unit was never announced it's fine if it appears in the new state right-away, unless
+         * the caller explicitly asked us to send the announcement anyway */
+        if (!including_new && !u->sent_dbus_new_signal)
+                return;
+
+        /* Don't generate unnecessary PropertiesChanged signals for the same unit while reloading */
+        if (MANAGER_IS_RELOADING(u->manager))
+                return;
+
+        bus_unit_send_change_signal(u);
+}
+
+/* Answers the method call stored in u->pending_freezer_invocation, if there is one: with a
+ * BUS_ERROR_FREEZE_CANCELLED error if 'cancelled' is true, with an empty method return otherwise.
+ *
+ * Returns 0 if nothing was pending or once the stored invocation has been released (a failure to
+ * send the reply is only logged). Returns a negative errno-style value if the reply message could
+ * not be created, in which case the stored invocation is left in place. */
+int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *answer = NULL;
+        int r;
+
+        assert(u);
+
+        if (!u->pending_freezer_invocation)
+                return 0;
+
+        if (!cancelled)
+                r = sd_bus_message_new_method_return(u->pending_freezer_invocation, &answer);
+        else
+                r = sd_bus_message_new_method_error(
+                                u->pending_freezer_invocation,
+                                &answer,
+                                &SD_BUS_ERROR_MAKE_CONST(
+                                                BUS_ERROR_FREEZE_CANCELLED, "Freeze operation aborted"));
+        if (r < 0)
+                return r;
+
+        r = sd_bus_send(NULL, answer, NULL);
+        if (r < 0)
+                log_warning_errno(r, "Failed to send queued message, ignoring: %m");
+
+        /* Drop our reference on the invocation and clear the field */
+        u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
+
+        return 0;
+}
+
+/* bus_foreach_bus() callback: emits the Manager's "UnitRemoved" signal on 'bus', carrying the
+ * unit's id and its D-Bus object path ("so"). 'userdata' is the Unit. */
+static int send_removed_signal(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *signal = NULL;
+        _cleanup_free_ char *object_path = NULL;
+        Unit *unit = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        object_path = unit_dbus_path(unit);
+        if (!object_path)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_signal(bus, &signal,
+                                      "/org/freedesktop/systemd1",
+                                      "org.freedesktop.systemd1.Manager",
+                                      "UnitRemoved");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(signal, "so", unit->id, object_path);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, signal, NULL);
+}
+
+/* Announces the removal of a unit with a "UnitRemoved" signal. If the unit was never announced, or
+ * still has a change signal queued, that signal is sent first. Nothing is sent for units without an
+ * id, and a failure to send is only logged at debug level. */
+void bus_unit_send_removed_signal(Unit *u) {
+        bool change_signal_due;
+        int r;
+
+        assert(u);
+
+        change_signal_due = !u->sent_dbus_new_signal || u->in_dbus_queue;
+        if (change_signal_due)
+                bus_unit_send_change_signal(u);
+
+        if (!u->id)
+                return;
+
+        r = bus_foreach_bus(u->manager, u->bus_track, send_removed_signal, u);
+        if (r < 0)
+                log_unit_debug_errno(u, r, "Failed to send unit remove signal for %s: %m", u->id);
+}
+
+/* Enqueues a job of the given type and mode for unit 'u' and appends the result to 'reply'.
+ *
+ * With BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE set and a unit that can reload, restart requests are
+ * turned into the matching reload job types first. The request is refused with
+ * BUS_ERROR_NO_SUCH_UNIT, BUS_ERROR_ONLY_BY_DEPENDENCY or BUS_ERROR_SHUTTING_DOWN under the
+ * conditions checked below.
+ *
+ * What is appended to 'reply': by default just the job's object path ("o"). With
+ * BUS_UNIT_QUEUE_VERBOSE_REPLY the anchor job is appended as "uosos" (job id, job path, unit id,
+ * unit path, job type), followed by an "a(uosos)" array with all other affected jobs.
+ *
+ * 'message' is the method call the job gets attributed to (via bus_job_track_sender()). Returns a
+ * negative errno-style value on failure. Note that the job has already been added by the time the
+ * later steps can fail. */
+int bus_unit_queue_job_one(
+                sd_bus_message *message,
+                Unit *u,
+                JobType type,
+                JobMode mode,
+                BusUnitQueueFlags flags,
+                sd_bus_message *reply,
+                sd_bus_error *error) {
+
+        _cleanup_set_free_ Set *affected = NULL;
+        _cleanup_free_ char *job_path = NULL, *unit_path = NULL;
+        Job *j, *a;
+        int r;
+
+        /* Prefer reloading over restarting if the caller asked for it and the unit supports it */
+        if (FLAGS_SET(flags, BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE) && unit_can_reload(u)) {
+                if (type == JOB_RESTART)
+                        type = JOB_RELOAD_OR_START;
+                else if (type == JOB_TRY_RESTART)
+                        type = JOB_TRY_RELOAD;
+        }
+
+        /* Stopping a unit that could not be loaded and is not running is reported as "no such unit" */
+        if (type == JOB_STOP &&
+            IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_ERROR, UNIT_BAD_SETTING) &&
+            unit_active_state(u) == UNIT_INACTIVE)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", u->id);
+
+        /* Honour the unit's refusal of manual start/stop requests */
+        if ((type == JOB_START && u->refuse_manual_start) ||
+            (type == JOB_STOP && u->refuse_manual_stop) ||
+            (IN_SET(type, JOB_RESTART, JOB_TRY_RESTART) && (u->refuse_manual_start || u->refuse_manual_stop)) ||
+            (type == JOB_RELOAD_OR_START && job_type_collapse(type, u) == JOB_START && u->refuse_manual_start))
+                return sd_bus_error_setf(error,
+                                         BUS_ERROR_ONLY_BY_DEPENDENCY,
+                                         "Operation refused, unit %s may be requested by dependency only (it is configured to refuse manual start/stop).",
+                                         u->id);
+
+        /* dbus-broker issues StartUnit for activation requests, and Type=dbus services automatically
+         * gain dependency on dbus.socket. Therefore, if dbus has a pending stop job, the new start
+         * job that pulls in dbus again would cause job type conflict. Let's avoid that by rejecting
+         * job enqueuing early.
+         *
+         * Note that unlike signal_activation_request(), we can't use unit_inactive_or_pending()
+         * here. StartUnit is a more generic interface, and thus users are allowed to use e.g. systemctl
+         * to start Type=dbus services even when dbus is inactive. */
+        if (type == JOB_START && u->type == UNIT_SERVICE && SERVICE(u)->type == SERVICE_DBUS)
+                FOREACH_STRING(dbus_unit, SPECIAL_DBUS_SOCKET, SPECIAL_DBUS_SERVICE) {
+                        Unit *dbus;
+
+                        dbus = manager_get_unit(u->manager, dbus_unit);
+                        if (dbus && unit_stop_pending(dbus))
+                                return sd_bus_error_setf(error,
+                                                         BUS_ERROR_SHUTTING_DOWN,
+                                                         "Operation for unit %s refused, D-Bus is shutting down.",
+                                                         u->id);
+                }
+
+        /* The set of affected jobs is only collected if the verbose reply was requested; otherwise
+         * NULL is passed to manager_add_job() */
+        if (FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) {
+                affected = set_new(NULL);
+                if (!affected)
+                        return -ENOMEM;
+        }
+
+        r = manager_add_job(u->manager, type, u, mode, affected, error, &j);
+        if (r < 0)
+                return r;
+
+        r = bus_job_track_sender(j, message);
+        if (r < 0)
+                return r;
+
+        /* Before we send the method reply, force out the announcement JobNew for this job */
+        bus_job_send_pending_change_signal(j, true);
+
+        job_path = job_dbus_path(j);
+        if (!job_path)
+                return -ENOMEM;
+
+        /* The classic response is just a job object path */
+        if (!FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY))
+                return sd_bus_message_append(reply, "o", job_path);
+
+        /* In verbose mode respond with the anchor job plus everything that has been affected */
+
+        unit_path = unit_dbus_path(j->unit);
+        if (!unit_path)
+                return -ENOMEM;
+
+        r = sd_bus_message_append(reply, "uosos",
+                                  j->id, job_path,
+                                  j->unit->id, unit_path,
+                                  job_type_to_string(j->type));
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(uosos)");
+        if (r < 0)
+                return r;
+
+        SET_FOREACH(a, affected) {
+                /* The anchor job was already reported above */
+                if (a->id == j->id)
+                        continue;
+
+                /* Free paths from previous iteration */
+                job_path = mfree(job_path);
+                unit_path = mfree(unit_path);
+
+                job_path = job_dbus_path(a);
+                if (!job_path)
+                        return -ENOMEM;
+
+                unit_path = unit_dbus_path(a->unit);
+                if (!unit_path)
+                        return -ENOMEM;
+
+                r = sd_bus_message_append(reply, "(uosos)",
+                                          a->id, job_path,
+                                          a->unit->id, unit_path,
+                                          job_type_to_string(a->type));
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Handles a job request that arrived as the method call 'message': runs the SELinux access check
+ * matching the job type, creates the method reply, lets bus_unit_queue_job_one() enqueue the job
+ * and fill in the reply, and finally sends the reply.
+ *
+ * Returns a negative errno-style value on failure, otherwise the result of sending the reply. */
+int bus_unit_queue_job(
+                sd_bus_message *message,
+                Unit *u,
+                JobType type,
+                JobMode mode,
+                BusUnitQueueFlags flags,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *response = NULL;
+        const char *access_method;
+        int r;
+
+        assert(message);
+        assert(u);
+        assert(type >= 0 && type < _JOB_TYPE_MAX);
+        assert(mode >= 0 && mode < _JOB_MODE_MAX);
+
+        access_method = job_type_to_access_method(type);
+
+        r = mac_selinux_unit_access_check(u, message, access_method, error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &response);
+        if (r < 0)
+                return r;
+
+        r = bus_unit_queue_job_one(message, u, type, mode, flags, response, error);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, response, NULL);
+}
+
+/* Applies one of the properties that may be changed both while a transient unit is being created
+ * and at any later point during runtime ("Description" and "Markers").
+ *
+ * Returns 1 if the property was recognized and processed, 0 if 'name' is not handled here, and a
+ * negative errno-style value on failure. When 'flags' is a no-op the message is still parsed and
+ * validated, but nothing is applied to the unit. */
+static int bus_unit_set_live_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int r;
+
+        assert(u);
+        assert(name);
+        assert(message);
+
+        if (streq(name, "Description")) {
+                const char *description;
+
+                r = sd_bus_message_read(message, "s", &description);
+                if (r < 0)
+                        return r;
+
+                if (UNIT_WRITE_FLAGS_NOOP(flags))
+                        return 1;
+
+                r = unit_set_description(u, description);
+                if (r < 0)
+                        return r;
+
+                unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Description=%s", description);
+                return 1;
+        }
+
+        /* Markers only make sense for active units and are tracked internally, i.e. nothing is written
+         * to /run for them. Syntax: "+foo" sets marker foo, "-foo" clears it, and a plain "foo" replaces
+         * the whole marker set. The plain form must not be combined with the "+"/"-" forms. */
+
+        if (streq(name, "Markers")) {
+                unsigned values = 0, touched = 0;
+                bool seen_relative = false, seen_absolute = false;
+
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const char *entry;
+                        bool enable = true;
+
+                        r = sd_bus_message_read(message, "s", &entry);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        if (entry[0] == '+' || entry[0] == '-') {
+                                enable = entry[0] == '+';
+                                entry++; /* skip the sign, the rest is the marker name */
+                                seen_relative = true;
+                        } else {
+                                seen_absolute = true;
+                        }
+
+                        UnitMarker marker = unit_marker_from_string(entry);
+                        if (marker < 0)
+                                return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                         "Unknown marker \"%s\".", entry);
+
+                        /* 'values' holds the requested state, 'touched' remembers which bits were mentioned. */
+                        SET_FLAG(values, 1u << marker, enable);
+                        SET_FLAG(touched, 1u << marker, true);
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                if (seen_relative && seen_absolute)
+                        return sd_bus_error_set(error, BUS_ERROR_BAD_UNIT_SETTING, "Bad marker syntax.");
+
+                if (UNIT_WRITE_FLAGS_NOOP(flags))
+                        return 1;
+
+                if (seen_absolute)
+                        u->markers = values;
+                else
+                        /* Relative update: only the mentioned bits change, everything else is kept. */
+                        u->markers = values | (u->markers & ~touched);
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Reads a string from 'message', parses it as an emergency action for the manager's runtime scope
+ * and, unless 'flags' is a no-op, stores the result in *p and writes the "<name>=<value>" setting.
+ * Returns 1 on success, a negative errno-style value if reading fails, or the result of
+ * sd_bus_error_setf() (with 'error' filled in) if the string does not parse. */
+static int bus_set_transient_emergency_action(
+                Unit *u,
+                const char *name,
+                EmergencyAction *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        EmergencyAction action;
+        const char *text;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "s", &text);
+        if (r < 0)
+                return r;
+
+        r = parse_emergency_action(text, u->manager->runtime_scope, &action);
+        if (r == -EOPNOTSUPP)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "%s setting invalid for manager type: %s",
+                                         name, text);
+        if (r < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Invalid %s setting: %s",
+                                         name, text);
+
+        /* Validation only? Then we are done. */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        *p = action;
+        unit_write_settingf(u, flags, name, "%s=%s", name, text);
+
+        return 1;
+}
+
+/* Reads a 32-bit signed integer from 'message' and interprets it as an exit status. Values above
+ * 255 are refused. Any negative value means "unset": *p becomes -1 and an empty assignment is
+ * written. Nothing is stored or written when 'flags' is a no-op. Returns 1 on success. */
+static int bus_set_transient_exit_status(
+                Unit *u,
+                const char *name,
+                int *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int32_t status;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "i", &status);
+        if (r < 0)
+                return r;
+
+        if (status > 255)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Exit status must be in range 0…255 or negative.");
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        if (status < 0) {
+                *p = -1;
+                unit_write_settingf(u, flags, name, "%s=", name);
+        } else {
+                *p = status;
+                unit_write_settingf(u, flags, name, "%s=%i", name, status);
+        }
+
+        return 1;
+}
+
+/* Instantiate the file-local setters bus_set_transient_collect_mode() and bus_set_transient_job_mode(),
+ * which are used below for the "CollectMode" and "OnSuccessJobMode"/"OnFailureJobMode" properties.
+ * NOTE(review): the macro body is not visible here; presumably each setter reads a string and converts
+ * it with the given *_from_string() function — confirm against dbus-util.h. */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(collect_mode, CollectMode, collect_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(job_mode, JobMode, job_mode_from_string);
+
+/* Parses an array of (sbbs) structures — type name, trigger flag, negate flag, parameter — from
+ * 'message' and turns each entry into a Condition that is prepended to *list.
+ *
+ * 'is_condition' selects whether type names are looked up as Condition*= or Assert*= types.
+ * An empty array resets the list. When 'flags' is a no-op the entries are only validated.
+ *
+ * Returns 1 on success and a negative errno-style value on failure (with 'error' filled in for
+ * validation failures). */
+static int bus_set_transient_conditions(
+                Unit *u,
+                const char *name,
+                Condition **list,
+                bool is_condition,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        const char *type_name, *param;
+        int trigger, negate, r;
+        bool empty = true;
+
+        assert(list);
+
+        r = sd_bus_message_enter_container(message, 'a', "(sbbs)");
+        if (r < 0)
+                return r;
+
+        /* Booleans are read into plain ints, as sd_bus_message_read() expects for "b". */
+        while ((r = sd_bus_message_read(message, "(sbbs)", &type_name, &trigger, &negate, &param)) > 0) {
+                ConditionType t;
+
+                t = is_condition ? condition_type_from_string(type_name) : assert_type_from_string(type_name);
+                if (t < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid condition type: %s", type_name);
+
+                if (isempty(param))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Condition parameter in %s is empty", type_name);
+
+                if (condition_takes_path(t) && !path_is_absolute(param))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in condition %s is not absolute: %s", type_name, param);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        Condition *c;
+
+                        c = condition_new(t, param, trigger, negate);
+                        if (!c)
+                                return -ENOMEM;
+
+                        LIST_PREPEND(conditions, *list, c);
+
+                        /* Serialize in unit file syntax: "|" marks a triggering condition, "!" a negated one. */
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+                                            "%s=%s%s%s", type_name,
+                                            trigger ? "|" : "", negate ? "!" : "", param);
+                }
+
+                empty = false;
+        }
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        /* An empty array means "reset": drop everything collected so far and write an empty assignment. */
+        if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+                *list = condition_free_list(*list);
+                unit_write_settingf(u, flags, name, "%sNull=", is_condition ? "Condition" : "Assert");
+        }
+
+        return 1;
+}
+
+/* Applies one property that may only be set while a transient unit is being created.
+ *
+ * 'name' selects the property, its value is read from the current position of 'message'. When
+ * 'flags' is a no-op the value is parsed and validated only; otherwise it is applied to 'u' and
+ * written out via unit_write_settingf().
+ *
+ * Returns 1 if the property was recognized and processed, 0 if 'name' is not handled here (the
+ * caller then tries other handlers), and a negative errno-style value on failure. */
+static int bus_unit_set_transient_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        UnitDependency d;
+        int r;
+
+        assert(u);
+        assert(name);
+        assert(message);
+
+        /* Handles settings when transient units are created. These settings cannot be altered anymore
+         * after the unit has been created. */
+
+        if (streq(name, "SourcePath"))
+                return bus_set_transient_path(u, name, &u->source_path, message, flags, error);
+
+        if (streq(name, "StopWhenUnneeded"))
+                return bus_set_transient_bool(u, name, &u->stop_when_unneeded, message, flags, error);
+
+        if (streq(name, "RefuseManualStart"))
+                return bus_set_transient_bool(u, name, &u->refuse_manual_start, message, flags, error);
+
+        if (streq(name, "RefuseManualStop"))
+                return bus_set_transient_bool(u, name, &u->refuse_manual_stop, message, flags, error);
+
+        if (streq(name, "AllowIsolate"))
+                return bus_set_transient_bool(u, name, &u->allow_isolate, message, flags, error);
+
+        if (streq(name, "DefaultDependencies"))
+                return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
+
+        if (streq(name, "SurviveFinalKillSignal"))
+                return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error);
+
+        if (streq(name, "OnSuccessJobMode"))
+                return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
+
+        if (streq(name, "OnFailureJobMode"))
+                return bus_set_transient_job_mode(u, name, &u->on_failure_job_mode, message, flags, error);
+
+        if (streq(name, "IgnoreOnIsolate"))
+                return bus_set_transient_bool(u, name, &u->ignore_on_isolate, message, flags, error);
+
+        if (streq(name, "JobTimeoutUSec")) {
+                r = bus_set_transient_usec_fix_0(u, name, &u->job_timeout, message, flags, error);
+                /* Unless JobRunningTimeoutUSec= was set explicitly, it follows JobTimeoutUSec=. */
+                if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags) && !u->job_running_timeout_set)
+                        u->job_running_timeout = u->job_timeout;
+
+                /* Return here: falling through would end in "return 0" below, which tells the caller
+                 * that the property is unknown even though its value has just been consumed, and would
+                 * also swallow any error reported by the setter. */
+                return r;
+        }
+
+        if (streq(name, "JobRunningTimeoutUSec")) {
+                r = bus_set_transient_usec_fix_0(u, name, &u->job_running_timeout, message, flags, error);
+                if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+                        u->job_running_timeout_set = true;
+
+                return r;
+        }
+
+        if (streq(name, "JobTimeoutAction"))
+                return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error);
+
+        if (streq(name, "JobTimeoutRebootArgument"))
+                return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error);
+
+        if (streq(name, "StartLimitIntervalUSec"))
+                return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error);
+
+        if (streq(name, "StartLimitBurst"))
+                return bus_set_transient_unsigned(u, name, &u->start_ratelimit.burst, message, flags, error);
+
+        if (streq(name, "StartLimitAction"))
+                return bus_set_transient_emergency_action(u, name, &u->start_limit_action, message, flags, error);
+
+        if (streq(name, "FailureAction"))
+                return bus_set_transient_emergency_action(u, name, &u->failure_action, message, flags, error);
+
+        if (streq(name, "SuccessAction"))
+                return bus_set_transient_emergency_action(u, name, &u->success_action, message, flags, error);
+
+        if (streq(name, "FailureActionExitStatus"))
+                return bus_set_transient_exit_status(u, name, &u->failure_action_exit_status, message, flags, error);
+
+        if (streq(name, "SuccessActionExitStatus"))
+                return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error);
+
+        if (streq(name, "RebootArgument"))
+                return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error);
+
+        if (streq(name, "CollectMode"))
+                return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error);
+
+        if (streq(name, "Conditions"))
+                return bus_set_transient_conditions(u, name, &u->conditions, true, message, flags, error);
+
+        if (streq(name, "Asserts"))
+                return bus_set_transient_conditions(u, name, &u->asserts, false, message, flags, error);
+
+        if (streq(name, "Documentation")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l)
+                        if (!documentation_url_is_valid(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid URL in %s: %s", name, *p);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                /* An empty list resets the setting. */
+                                u->documentation = strv_free(u->documentation);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                /* Don't ignore allocation failures, otherwise the in-memory list and the
+                                 * settings written below would silently diverge. */
+                                r = strv_extend_strv(&u->documentation, l, false);
+                                if (r < 0)
+                                        return r;
+
+                                STRV_FOREACH(p, l)
+                                        unit_write_settingf(u, flags, name, "%s=%s", name, *p);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "Slice")) {
+                Unit *slice;
+                const char *s;
+
+                if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups.");
+                if (u->type == UNIT_SLICE)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units.");
+                if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope");
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (!unit_name_is_valid(s, UNIT_NAME_PLAIN))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name '%s'", s);
+
+                /* Note that we do not dispatch the load queue here yet, as we don't want our own transient unit to be
+                 * loaded while we are still setting it up. Or in other words, we use manager_load_unit_prepare()
+                 * instead of manager_load_unit() on purpose, here. */
+                r = manager_load_unit_prepare(u->manager, s, NULL, error, &slice);
+                if (r < 0)
+                        return r;
+
+                if (slice->type != UNIT_SLICE)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit name '%s' is not a slice", s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        r = unit_set_slice(u, slice);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_settingf(u, flags|UNIT_PRIVATE, name, "Slice=%s", s);
+                }
+
+                return 1;
+
+        } else if (streq(name, "RequiresMountsFor")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l) {
+                        path_simplify(*p);
+
+                        if (!path_is_absolute(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not absolute: %s", name, *p);
+
+                        if (!path_is_valid(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s has invalid length: %s", name, *p);
+
+                        if (!path_is_normalized(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not normalized: %s", name, *p);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE);
+                                if (r < 0)
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p);
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, *p);
+                        }
+                }
+
+                return 1;
+        }
+
+        /* Everything else is treated as a dependency type name, if it parses as one. */
+        if (streq(name, "RequiresOverridable"))
+                d = UNIT_REQUIRES; /* redirect for obsolete unit dependency type */
+        else if (streq(name, "RequisiteOverridable"))
+                d = UNIT_REQUISITE; /* same here */
+        else
+                d = unit_dependency_from_string(name);
+
+        if (d >= 0) {
+                const char *other;
+
+                /* Only this subset of dependency types may be configured on transient units. */
+                if (!IN_SET(d,
+                            UNIT_REQUIRES,
+                            UNIT_REQUISITE,
+                            UNIT_WANTS,
+                            UNIT_BINDS_TO,
+                            UNIT_PART_OF,
+                            UNIT_UPHOLDS,
+                            UNIT_CONFLICTS,
+                            UNIT_BEFORE,
+                            UNIT_AFTER,
+                            UNIT_ON_SUCCESS,
+                            UNIT_ON_FAILURE,
+                            UNIT_PROPAGATES_RELOAD_TO,
+                            UNIT_RELOAD_PROPAGATED_FROM,
+                            UNIT_PROPAGATES_STOP_TO,
+                            UNIT_STOP_PROPAGATED_FROM,
+                            UNIT_JOINS_NAMESPACE_OF))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Dependency type %s may not be created transiently.", unit_dependency_to_string(d));
+
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "s", &other)) > 0) {
+                        if (!unit_name_is_valid(other, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name %s", other);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ char *label = NULL;
+
+                                r = unit_add_dependency_by_name(u, d, other, true, UNIT_DEPENDENCY_FILE);
+                                if (r < 0)
+                                        return r;
+
+                                /* Each dependency is written under its own "<property>-<unit>" label. */
+                                label = strjoin(name, "-", other);
+                                if (!label)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags, label, "%s=%s", unit_dependency_to_string(d), other);
+                        }
+
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                return 1;
+
+        } else if (streq(name, "AddRef")) {
+
+                int b;
+
+                /* Why is this called "AddRef" rather than just "Ref", or "Reference"? There's already a "Ref()" method
+                 * on the Unit interface, and it's probably not a good idea to expose a property and a method on the
+                 * same interface (well, strictly speaking AddRef isn't exposed as full property, we just read it for
+                 * transient units, but still). And "References" and "ReferencedBy" is already used as unit reference
+                 * dependency type, hence let's not confuse things with that.
+                 *
+                 * Note that we don't actually add the reference to the bus track. We do that only after the setup of
+                 * the transient unit is complete, so that setting this property multiple times in the same transient
+                 * unit creation call doesn't count as individual references. */
+
+                r = sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags))
+                        u->bus_track_add = b;
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Applies an array of (name, variant) property assignments read from 'message' to unit 'u'.
+ *
+ * Each property is offered, in this order, to the unit type's bus_set_property() hook, to
+ * bus_unit_set_transient_property() (only for transient units still in UNIT_STUB state) and to
+ * bus_unit_set_live_property(); the first handler returning non-zero wins.
+ *
+ * Returns the number of properties applied in the "for real" pass (0 if 'flags' is a no-op), or a
+ * negative errno-style value on failure, in which case 'error' is filled in. If 'commit' is true
+ * and at least one property was applied, the unit type's bus_commit_properties() hook is invoked. */
+int bus_unit_set_properties(
+                Unit *u,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                bool commit,
+                sd_bus_error *error) {
+
+        bool for_real = false;
+        unsigned n = 0;
+        int r;
+
+        assert(u);
+        assert(message);
+
+        /* We iterate through the array twice. First run just checks if all passed data is valid, second run
+         * actually applies it. This implements transaction-like behaviour without actually providing full
+         * transactions. */
+
+        r = sd_bus_message_enter_container(message, 'a', "(sv)");
+        if (r < 0)
+                goto error;
+
+        for (;;) {
+                const char *name;
+                UnitWriteFlags f;
+
+                r = sd_bus_message_enter_container(message, 'r', "sv");
+                if (r < 0)
+                        goto error;
+                if (r == 0) {
+                        /* End of the array: stop after the second pass, or right away if nothing is to be applied. */
+                        if (for_real || UNIT_WRITE_FLAGS_NOOP(flags))
+                                break;
+
+                        /* Reached EOF. Let's try again, and this time for realz... */
+                        r = sd_bus_message_rewind(message, false);
+                        if (r < 0)
+                                goto error;
+
+                        for_real = true;
+                        continue;
+                }
+
+                r = sd_bus_message_read(message, "s", &name);
+                if (r < 0)
+                        goto error;
+
+                r = sd_bus_message_enter_container(message, 'v', NULL);
+                if (r < 0)
+                        goto error;
+
+                /* If not for real, then mask out the two target flags */
+                f = for_real ? flags : (flags & ~(UNIT_RUNTIME|UNIT_PERSISTENT));
+
+                /* Try the handlers in turn; a return value of 0 means "not mine", so the next one is asked. */
+                if (UNIT_VTABLE(u)->bus_set_property)
+                        r = UNIT_VTABLE(u)->bus_set_property(u, name, message, f, error);
+                else
+                        r = 0;
+                if (r == 0 && u->transient && u->load_state == UNIT_STUB)
+                        r = bus_unit_set_transient_property(u, name, message, f, error);
+                if (r == 0)
+                        r = bus_unit_set_live_property(u, name, message, f, error);
+                if (r < 0)
+                        goto error;
+
+                /* No handler claimed the property. The error is set explicitly here, hence no detour via
+                 * the generic error path below. */
+                if (r == 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY,
+                                                 "Cannot set property %s, or unknown property.", name);
+
+                /* Leave the variant ... */
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        goto error;
+
+                /* ... and the enclosing (sv) struct. */
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        goto error;
+
+                /* Only properties processed in the second pass are counted. */
+                n += for_real;
+        }
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                goto error;
+
+        if (commit && n > 0 && UNIT_VTABLE(u)->bus_commit_properties)
+                UNIT_VTABLE(u)->bus_commit_properties(u);
+
+        return n;
+
+ error:
+        /* Pretty much any of the calls above can fail if the message is not formed properly
+         * or if it has unexpected contents. Fill in a more informative error message here. */
+        if (sd_bus_error_is_set(error))
+                return r;
+        return sd_bus_error_set_errnof(error, r,
+                                       r == -ENXIO ? "Failed to set unit properties: Unexpected message contents"
+                                                   : "Failed to set unit properties: %m");
+}
+
+int bus_unit_validate_load_state(Unit *u, sd_bus_error *error) {
+        assert(u);
+
+        /* Generates a pretty error if a unit isn't properly loaded. */
+
+        switch (u->load_state) {
+
+        case UNIT_LOADED:
+                return 0;
+
+        case UNIT_NOT_FOUND:
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not found.", u->id);
+
+        case UNIT_BAD_SETTING:
+                return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, "Unit %s has a bad unit file setting.", u->id);
+
+        case UNIT_ERROR: /* Only show .load_error in UNIT_ERROR state */
+                return sd_bus_error_set_errnof(error, u->load_error,
+                                               "Unit %s failed to load properly, please adjust/correct and reload service manager: %m", u->id);
+
+        case UNIT_MASKED:
+                return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit %s is masked.", u->id);
+
+        case UNIT_STUB:
+        case UNIT_MERGED:
+        default:
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unexpected load state of unit %s", u->id);
+        }
+}
+
+/* Callback installed on the unit's bus track object by bus_unit_allocate_bus_track(); 'userdata'
+ * is the Unit the tracker belongs to. Always returns 0. */
+static int bus_unit_track_handler(sd_bus_track *t, void *userdata) {
+        Unit *unit = ASSERT_PTR(userdata);
+
+        assert(t);
+
+        /* Drop the tracker first, so that this handler cannot be invoked a second time. */
+        unit->bus_track = sd_bus_track_unref(unit->bus_track);
+
+        /* The tracking client is gone, so the unit's cgroup might have run empty as well: have that checked. */
+        unit_add_to_cgroup_empty_queue(unit);
+
+        /* With the client gone the unit might be eligible for garbage collection, too. */
+        unit_add_to_gc_queue(unit);
+
+        return 0;
+}
+
+/* Makes sure u->bus_track exists: if it is not allocated yet, creates it on the manager's API bus
+ * with bus_unit_track_handler() as callback and switches it to recursive mode. Returns 0 on
+ * success (including when the tracker already existed) and a negative errno-style value on
+ * failure, in which case u->bus_track is left unset. */
+static int bus_unit_allocate_bus_track(Unit *u) {
+        int ret;
+
+        assert(u);
+
+        if (u->bus_track)
+                return 0;
+
+        ret = sd_bus_track_new(u->manager->api_bus, &u->bus_track, bus_unit_track_handler, u);
+        if (ret < 0)
+                return ret;
+
+        ret = sd_bus_track_set_recursive(u->bus_track, true);
+        if (ret >= 0)
+                return 0;
+
+        /* Don't keep a half-configured tracker around. */
+        u->bus_track = sd_bus_track_unref(u->bus_track);
+        return ret;
+}
+
+/* Adds bus name 'name' to the unit's bus track object, allocating the tracker first if needed.
+ * Returns a negative errno-style value if the allocation fails, otherwise whatever
+ * sd_bus_track_add_name() returns. */
+int bus_unit_track_add_name(Unit *u, const char *name) {
+        assert(u);
+
+        int ret = bus_unit_allocate_bus_track(u);
+
+        return ret < 0 ? ret : sd_bus_track_add_name(u->bus_track, name);
+}
+
+/* Adds the sender of message 'm' to the unit's bus track object, allocating the tracker first if
+ * needed. Returns a negative errno-style value if the allocation fails, otherwise whatever
+ * sd_bus_track_add_sender() returns. */
+int bus_unit_track_add_sender(Unit *u, sd_bus_message *m) {
+        assert(u);
+
+        int ret = bus_unit_allocate_bus_track(u);
+
+        return ret < 0 ? ret : sd_bus_track_add_sender(u->bus_track, m);
+}
+
+/* Removes the sender of message 'm' from the unit's bus track object. Returns -EUNATCH if no
+ * tracker has been allocated, otherwise whatever sd_bus_track_remove_sender() returns. */
+int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m) {
+        assert(u);
+
+        /* Without a bus track object nobody can have taken a reference yet, hence report an error. */
+        return u->bus_track ? sd_bus_track_remove_sender(u->bus_track, m) : -EUNATCH;
+}
diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h
new file mode 100644
index 0000000..6b7828e
--- /dev/null
+++ b/src/core/dbus-unit.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "unit.h"
+
+/* sd-bus vtables for units; the definitions live outside this header. */
+extern const sd_bus_vtable bus_unit_vtable[];
+extern const sd_bus_vtable bus_unit_cgroup_vtable[];
+
+/* Signal/message emission helpers.
+ * NOTE(review): semantics inferred from the names only — confirm against dbus-unit.c. */
+void bus_unit_send_change_signal(Unit *u);
+void bus_unit_send_pending_change_signal(Unit *u, bool including_new);
+int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled);
+void bus_unit_send_removed_signal(Unit *u);
+
+/* Handlers taking (message, userdata, error); presumably registered as sd-bus method callbacks with
+ * the Unit passed as userdata — TODO confirm. bus_unit_method_start_generic() takes the Unit directly. */
+int bus_unit_method_start_generic(sd_bus_message *message, Unit *u, JobType job_type, bool reload_if_possible, sd_bus_error *error);
+int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+/* bus_unit_set_properties() reads an array of (name, variant) pairs from 'message', validates all of
+ * them first and only then applies them; it returns the number of applied properties or a negative
+ * errno-style value. */
+int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error);
+int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+/* Bit flags accepted by bus_unit_queue_job() and bus_unit_queue_job_one().
+ * NOTE(review): the code consuming these flags is not part of this header; going by the names,
+ * RELOAD_IF_POSSIBLE presumably prefers a reload over a restart where supported and VERBOSE_REPLY
+ * presumably selects a more detailed method reply — confirm against dbus-unit.c. */
+typedef enum BusUnitQueueFlags {
+        BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE            = 1 << 0,
+        BUS_UNIT_QUEUE_VERBOSE_REPLY                 = 1 << 1,
+} BusUnitQueueFlags;
+
+/* Enqueues one job and fills in the caller-provided 'reply' message; it does not create the reply
+ * itself (bus_unit_queue_job() below does that and passes it in). */
+int bus_unit_queue_job_one(
+                sd_bus_message *message,
+                Unit *u,
+                JobType type,
+                JobMode mode,
+                BusUnitQueueFlags flags,
+                sd_bus_message *reply,
+                sd_bus_error *error);
+/* Performs the SELinux access check, creates the method reply for 'message', enqueues the job via
+ * bus_unit_queue_job_one() and sends the reply. */
+int bus_unit_queue_job(
+                sd_bus_message *message,
+                Unit *u,
+                JobType type,
+                JobMode mode,
+                BusUnitQueueFlags flags,
+                sd_bus_error *error);
+/* Returns 0 if the unit is in UNIT_LOADED state, otherwise sets 'error' to describe its load state. */
+int bus_unit_validate_load_state(Unit *u, sd_bus_error *error);
+
+/* Bus client tracking. The add functions allocate the unit's bus track object on demand; the remove
+ * function returns -EUNATCH if no track object exists. */
+int bus_unit_track_add_name(Unit *u, const char *name);
+int bus_unit_track_add_sender(Unit *u, sd_bus_message *m);
+int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m);
diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c
new file mode 100644
index 0000000..d680a64
--- /dev/null
+++ b/src/core/dbus-util.c
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "dbus-util.h"
+#include "escape.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "unit.h"
+
+int bus_property_get_triggered_unit(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Unit *u = userdata;
+
+        assert(bus);
+        assert(reply);
+        assert(u);
+        /* Append the ID of the unit that 'u' triggers as a string; NULL is handed on when there is none. */
+        Unit *trigger = UNIT_TRIGGER(u);
+        const char *triggered_id = trigger ? trigger->id : NULL;
+        return sd_bus_message_append(reply, "s", triggered_id);
+}
+
+BUS_DEFINE_SET_TRANSIENT(mode_t, "u", uint32_t, mode_t, "%04o"); /* defines bus_set_transient_mode_t(); the value is written out in octal */
+BUS_DEFINE_SET_TRANSIENT(unsigned, "u", uint32_t, unsigned, "%" PRIu32); /* defines bus_set_transient_unsigned() */
+
+static bool valid_user_group_name_or_id_relaxed(const char *u) { /* one-argument wrapper, used as the 'check' callback below */
+        return valid_user_group_name(u, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX);
+}
+
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed); /* defines bus_set_transient_user_relaxed() */
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute); /* defines bus_set_transient_path() */
+
+int bus_set_transient_string(
+                Unit *u,
+                const char *name,
+                char **p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        const char *value;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "s", &value);
+        if (r < 0)
+                return r;
+
+        /* No-op flags: the string has been consumed from the message, but nothing is stored or written. */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        r = free_and_strdup(p, empty_to_null(value));
+        if (r < 0)
+                return r;
+
+        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, strempty(value));
+        return 1;
+}
+
+int bus_set_transient_bool(
+                Unit *u,
+                const char *name,
+                bool *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int value, r; /* a D-Bus boolean ("b") is read into an int */
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "b", &value);
+        if (r < 0)
+                return r;
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags)) /* value consumed, but nothing to store or write */
+                return 1;
+
+        *p = value;
+        unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(value));
+        return 1;
+}
+
+int bus_set_transient_tristate(
+                Unit *u,
+                const char *name,
+                int *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int value, r; /* *p is an int here, but the wire type is still a D-Bus boolean ("b") */
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "b", &value);
+        if (r < 0)
+                return r;
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags)) /* value consumed, but nothing to store or write */
+                return 1;
+
+        *p = value;
+        unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(value));
+        return 1;
+}
+
+int bus_set_transient_usec_internal(
+                Unit *u,
+                const char *name,
+                usec_t *p,
+                bool fix_0,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        uint64_t usec;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "t", &usec);
+        if (r < 0)
+                return r;
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 1;
+
+        /* With fix_0 set, a value of 0 is stored as USEC_INFINITY; everything else is stored verbatim. */
+        *p = (fix_0 && usec == 0) ? USEC_INFINITY : usec;
+
+        /* Cut the last four characters off 'name' (presumably a trailing "USec") and write "<prefix>Sec=". */
+        char *prefix = strndupa_safe(name, strlen(name) - 4);
+        unit_write_settingf(u, flags, name, "%sSec=%s", prefix, FORMAT_TIMESPAN(usec, USEC_PER_MSEC));
+
+        return 1;
+}
+
+int bus_verify_manage_units_async_full(
+                Unit *u,
+                const char *verb,
+                int capability,
+                const char *polkit_message,
+                bool interactive,
+                sd_bus_message *call,
+                sd_bus_error *error) {
+
+        /* Key/value pairs describing the request; slots that are never assigned remain NULL. */
+        const char *details[9] = {
+                [0] = "unit", [1] = u->id,
+                [2] = "verb", [3] = verb,
+        };
+
+        /* The custom message is optional; when present it is accompanied by the gettext domain. */
+        if (polkit_message) {
+                details[4] = "polkit.message";
+                details[5] = polkit_message;
+                details[6] = "polkit.gettext_domain";
+                details[7] = GETTEXT_PACKAGE;
+        }
+
+        return bus_verify_polkit_async(
+                        call, capability,
+                        "org.freedesktop.systemd1.manage-units",
+                        details,
+                        interactive, UID_INVALID,
+                        &u->manager->polkit_registry,
+                        error);
+}
+
+/* Reads an array of (partition, options) string pairs from 'message' and appends them to *ret_options; returns 0 or a negative error. ret_format_str is an accumulator, so if it has any pre-existing content, new options will be appended to it */
+int bus_read_mount_options(
+                sd_bus_message *message,
+                sd_bus_error *error,
+                MountOptions **ret_options,
+                char **ret_format_str,
+                const char *separator) {
+
+        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+        _cleanup_free_ char *format_str = NULL;
+        const char *mount_options, *partition;
+        int r;
+
+        assert(message);
+        assert(ret_options);
+        assert(separator);
+
+        r = sd_bus_message_enter_container(message, 'a', "(ss)");
+        if (r < 0)
+                return r;
+
+        while ((r = sd_bus_message_read(message, "(ss)", &partition, &mount_options)) > 0) {
+                _cleanup_free_ char *escaped = NULL;
+                _cleanup_free_ MountOptions *o = NULL;
+                PartitionDesignator partition_designator;
+
+                if (chars_intersect(mount_options, WHITESPACE))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                                "Invalid mount options string, contains whitespace character(s): %s", mount_options);
+
+                partition_designator = partition_designator_from_string(partition);
+                if (partition_designator < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid partition name %s", partition);
+
+                /* Need to store the options with the escapes, so that they can be parsed again */
+                escaped = shell_escape(mount_options, ":");
+                if (!escaped)
+                        return -ENOMEM;
+
+                if (!strextend_with_separator(&format_str, separator, partition, ":", escaped)) /* "partition:escaped" entries, joined by 'separator' */
+                        return -ENOMEM;
+
+                o = new(MountOptions, 1);
+                if (!o)
+                        return -ENOMEM;
+                *o = (MountOptions) {
+                        .partition_designator = partition_designator,
+                        .options = strdup(mount_options),
+                };
+                if (!o->options)
+                        return -ENOMEM;
+                LIST_APPEND(mount_options, options, TAKE_PTR(o));
+        }
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        if (options) { /* the outputs are only touched when at least one entry was read */
+                if (ret_format_str) { /* strjoin() takes a NULL argument as end of list, so a still-unset accumulator must be passed as "" */
+                        char *final = strjoin(strempty(*ret_format_str), !isempty(*ret_format_str) ? separator : "", format_str);
+                        if (!final)
+                                return -ENOMEM;
+                        free_and_replace(*ret_format_str, final);
+                }
+                LIST_JOIN(mount_options, *ret_options, options);
+        }
+
+        return 0;
+}
+
+int bus_property_get_activation_details(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ActivationDetails **ad = ASSERT_PTR(userdata);
+        _cleanup_strv_free_ char **kv = NULL; /* flat list: key, value, key, value, ... */
+        int r;
+
+        assert(reply);
+
+        r = activation_details_append_pair(*ad, &kv);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(ss)"); /* the property is an array of string pairs */
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH_PAIR(k, v, kv) {
+                r = sd_bus_message_append(reply, "(ss)", *k, *v);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h
new file mode 100644
index 0000000..9464b25
--- /dev/null
+++ b/src/core/dbus-util.h
@@ -0,0 +1,256 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "dissect-image.h"
+#include "unit.h"
+
+int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); /* userdata: the Unit whose trigger is reported */
+
+#define BUS_DEFINE_SET_TRANSIENT(function, bus_type, type, cast_type, fmt) \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        cast_type *p,                                   \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* Reads one bus_type value as 'type'; unless flags are no-op, stores it in *p and writes "name=" fmt. Returns 1, or the read error. */ \
+                type v;                                                 \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, bus_type, &v);         \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        *p = (cast_type) v;                             \
+                        unit_write_settingf(u, flags, name,             \
+                                            "%s=" fmt, name, v);        \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+#define BUS_DEFINE_SET_TRANSIENT_IS_VALID(function, bus_type, type, cast_type, fmt, check) \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        cast_type *p,                                   \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* Reads one bus_type value; fails with SD_BUS_ERROR_INVALID_ARGS unless check(v) holds, even when flags are no-op. */ \
+                type v;                                                 \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, bus_type, &v);         \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                if (!check(v))                                          \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Invalid %s setting: " fmt, name, v); \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        *p = (cast_type) v;                             \
+                        unit_write_settingf(u, flags, name,             \
+                                            "%s=" fmt, name, v);        \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+#define BUS_DEFINE_SET_TRANSIENT_TO_STRING(function, bus_type, type, cast_type, fmt, to_string) \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        cast_type *p,                                   \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* to_string(v) returning NULL marks the value invalid; otherwise its result is the text written as "name=...". */ \
+                const char *s;                                          \
+                type v;                                                 \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, bus_type, &v);         \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                s = to_string(v);                                       \
+                if (!s)                                                 \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Invalid %s setting: " fmt, name, v); \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        *p = (cast_type) v;                             \
+                        unit_write_settingf(u, flags, name,             \
+                                            "%s=%s", name, s);          \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+#define BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(function, bus_type, type, cast_type, fmt, to_string) \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        cast_type *p,                                   \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* to_string(v, &s) fills 's' (freed on return): -EINVAL becomes SD_BUS_ERROR_INVALID_ARGS, other errors are returned as is. */ \
+                _cleanup_free_ char *s = NULL;                          \
+                type v;                                                 \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, bus_type, &v);         \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                r = to_string(v, &s);                                   \
+                if (r == -EINVAL)                                       \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Invalid %s setting: " fmt, name, v); \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        *p = (cast_type) v;                             \
+                        unit_write_settingf(u, flags, name,             \
+                                            "%s=%s",                    \
+                                            name, strempty(s));         \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+#define BUS_DEFINE_SET_TRANSIENT_PARSE(function, type, parse)           \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        type *p,                                        \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* Reads a string and converts it with parse(s); a negative result is invalid. Stores the parsed value, writes the original string. */ \
+                const char *s;                                          \
+                type v;                                                 \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, "s", &s);              \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                v = parse(s);                                           \
+                if (v < 0)                                              \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Invalid %s setting: %s", name, s); \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        *p = v;                                         \
+                        unit_write_settingf(u, flags, name,             \
+                                            "%s=%s", name, s);          \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+#define BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(function, type, parse)       \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        type *p,                                        \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* Here parse(s, &v) delivers the value through a pointer; any negative return is reported as SD_BUS_ERROR_INVALID_ARGS. */ \
+                const char *s;                                          \
+                type v;                                                 \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, "s", &s);              \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                r = parse(s, &v);                                       \
+                if (r < 0)                                              \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Invalid %s setting: %s", name, s); \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        *p = v;                                         \
+                        unit_write_settingf(u, flags, name,             \
+                                            "%s=%s", name, strempty(s)); \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+#define BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(function, check)     \
+        int bus_set_transient_##function(                               \
+                        Unit *u,                                        \
+                        const char *name,                               \
+                        char **p,                                       \
+                        sd_bus_message *message,                        \
+                        UnitWriteFlags flags,                           \
+                        sd_bus_error *error) {                          \
+                /* An empty string skips check() and is passed to free_and_strdup() as NULL; the setting is written with UNIT_ESCAPE_SPECIFIERS. */ \
+                const char *v;                                          \
+                int r;                                                  \
+                                                                        \
+                assert(p);                                              \
+                                                                        \
+                r = sd_bus_message_read(message, "s", &v);              \
+                if (r < 0)                                              \
+                        return r;                                       \
+                                                                        \
+                if (!isempty(v) && !check(v))                           \
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+                                                 "Invalid %s setting: %s", name, v); \
+                                                                        \
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {                    \
+                        r = free_and_strdup(p, empty_to_null(v));       \
+                        if (r < 0)                                      \
+                                return r;                               \
+                                                                        \
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, \
+                                            "%s=%s", name, strempty(v)); \
+                }                                                       \
+                                                                        \
+                return 1;                                               \
+        }
+
+int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); /* the first four setters are generated in dbus-util.c from the BUS_DEFINE_SET_TRANSIENT*() macros */
+int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); /* this and the following setters are written out by hand in dbus-util.c */
+int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); /* fix_0: store a value of 0 as USEC_INFINITY */
+static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+        return bus_set_transient_usec_internal(u, name, p, /* fix_0= */ false, message, flags, error);
+}
+static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+        return bus_set_transient_usec_internal(u, name, p, /* fix_0= */ true, message, flags, error);
+}
+int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error); /* polkit_message may be NULL */
+
+int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator); /* ret_format_str may be NULL; message, ret_options and separator must not be */
+
+int bus_property_get_activation_details(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); /* userdata: pointer to an ActivationDetails pointer */
diff --git a/src/core/dbus.c b/src/core/dbus.c
new file mode 100644
index 0000000..ba2cec4
--- /dev/null
+++ b/src/core/dbus.c
@@ -0,0 +1,1273 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-internal.h"
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "dbus-automount.h"
+#include "dbus-cgroup.h"
+#include "dbus-device.h"
+#include "dbus-execute.h"
+#include "dbus-job.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-mount.h"
+#include "dbus-path.h"
+#include "dbus-scope.h"
+#include "dbus-service.h"
+#include "dbus-slice.h"
+#include "dbus-socket.h"
+#include "dbus-swap.h"
+#include "dbus-target.h"
+#include "dbus-timer.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "serialize.h"
+#include "service.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "strxcpyx.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+#define CONNECTIONS_MAX 4096 /* NOTE(review): presumably a cap on simultaneous bus connections; its use lies outside this chunk, confirm */
+
+static void destroy_bus(Manager *m, sd_bus **bus); /* forward declaration; called by signal_disconnected() below */
+
+int bus_send_pending_reload_message(Manager *m) {
+        int r;
+
+        /* Flushes the message queued in m->pending_reload_message, if there is one; always returns 0. */
+        assert(m);
+
+        if (m->pending_reload_message) {
+                /* Until this message is gone no further D-Bus messages get dispatched, which keeps us from
+                 * wanting to queue a second one. A send failure is therefore only logged. */
+                r = sd_bus_send(NULL, m->pending_reload_message, NULL);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to send queued message, ignoring: %m");
+
+                /* Sent or not, drop our reference; the field takes whatever sd_bus_message_unref() returns. */
+                m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message);
+        }
+
+        return 0;
+}
+
+int bus_forward_agent_released(Manager *m, const char *path) {
+        /* Re-emits an agent "Released" notification carrying 'path' as a signal on the system bus.
+         * Returns 0 when nothing is forwarded, 1 once the signal has been emitted, and the result of
+         * log_debug_errno() when emitting fails. */
+        int r;
+
+        assert(m);
+        assert(path);
+
+        /* Only a system instance with a system bus connection forwards the message, so that the user
+         * instances get notified about this, too. */
+        if (!MANAGER_IS_SYSTEM(m) || !m->system_bus)
+                return 0;
+
+        r = sd_bus_emit_signal(
+                        m->system_bus,
+                        "/org/freedesktop/systemd1/agent",
+                        "org.freedesktop.systemd1.Agent",
+                        "Released",
+                        "s", path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to propagate agent release message: %m");
+
+        return 1;
+}
+
+static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *cgroup;
+        uid_t sender_uid;
+        int r;
+
+        assert(message);
+
+        /* org.freedesktop.systemd1.Agent signals are honoured only when the sender's effective UID is 0 */
+        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_creds_get_euid(creds, &sender_uid);
+        if (r < 0)
+                return 0; /* an unknown sender UID is silently ignored, just like a non-root one */
+        if (sender_uid != 0)
+                return 0;
+
+        /* the 'cgroup-empty' notification carries a single string, which is handed on as the cgroup */
+        r = sd_bus_message_read(message, "s", &cgroup);
+        if (r >= 0)
+                manager_notify_cgroup_empty(m, cgroup);
+        else
+                bus_log_parse_error(r);
+        return 0;
+}
+
+static int signal_disconnected(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        sd_bus *bus;
+
+        assert(message);
+        assert_se(bus = sd_bus_message_get_bus(message));
+        /* Shut down whichever well-known connection this was; both comparisons are always evaluated, in order. */
+        if (bus == manager->api_bus)
+                bus_done_api(manager);
+        if (bus == manager->system_bus)
+                bus_done_system(manager);
+
+        if (!set_remove(manager->private_buses, bus)) /* was not one of the private connections */
+                return 0;
+
+        log_debug("Got disconnect on private connection.");
+        destroy_bus(manager, &bus);
+        return 0;
+}
+
+static int signal_activation_request(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *failure = NULL;
+        Manager *manager = ASSERT_PTR(userdata);
+        const char *unit_name;
+        Unit *unit;
+        int r;
+
+        assert(message);
+
+        /* Handler for the ActivationRequest signal: its only argument names the unit to start. */
+        r = sd_bus_message_read(message, "s", &unit_name);
+        if (r < 0) {
+                bus_log_parse_error(r);
+                return 0;
+        }
+
+        if (manager_unit_inactive_or_pending(manager, SPECIAL_DBUS_SOCKET) ||
+            manager_unit_inactive_or_pending(manager, SPECIAL_DBUS_SERVICE)) {
+                r = sd_bus_error_set(&error, BUS_ERROR_SHUTTING_DOWN, "Refusing activation, D-Bus is shutting down.");
+                goto failed;
+        }
+
+        r = manager_load_unit(manager, unit_name, NULL, &error, &unit);
+        if (r < 0)
+                goto failed;
+        if (unit->refuse_manual_start) {
+                r = sd_bus_error_setf(&error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, %s may be requested by dependency only (it is configured to refuse manual start/stop).", unit->id);
+                goto failed;
+        }
+
+        r = manager_add_job(manager, JOB_START, unit, JOB_REPLACE, NULL, &error, NULL);
+        if (r < 0)
+                goto failed;
+
+        /* The start job is queued, nothing else is left to do here */
+        return 0;
+
+failed:
+        if (!sd_bus_error_is_set(&error))
+                sd_bus_error_set_errno(&error, r);
+
+        log_debug("D-Bus activation failed for %s: %s", unit_name, bus_error_message(&error, r));
+
+        /* Report the failure with an ActivationFailure signal carrying unit name, error name and error message */
+        r = sd_bus_message_new_signal(sd_bus_message_get_bus(message), &failure, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Activator", "ActivationFailure");
+        if (r < 0) {
+                bus_log_create_error(r);
+                return 0;
+        }
+        r = sd_bus_message_append(failure, "sss", unit_name, error.name, error.message);
+        if (r < 0) {
+                bus_log_create_error(r);
+                return 0;
+        }
+
+        r = sd_bus_send_to(NULL, failure, "org.freedesktop.DBus", NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to respond with to bus activation request: %m");
+
+        return 0;
+}
+
+#if HAVE_SELINUX
+static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = userdata;
+        const char *verb, *path;
+        Unit *u = NULL;
+        Job *j;
+        int r;
+
+        assert(message);
+
+        /* Our own method calls are all protected individually with selinux checks, but the built-in
+         * interfaces need to be protected too. This filter does that: Properties.Set is checked with the
+         * verb "reload", any other call on the built-in interfaces listed below with "status". Messages
+         * that are neither return 0 here, without any check. */
+        if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", "Set"))
+                verb = "reload";
+        else if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Introspectable", NULL) ||
+                 sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", NULL) ||
+                 sd_bus_message_is_method_call(message, "org.freedesktop.DBus.ObjectManager", NULL) ||
+                 sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Peer", NULL))
+                verb = "status";
+        else
+                return 0;
+
+        path = sd_bus_message_get_path(message);
+        /* NOTE(review): the argument order suggests this matches if 'path' is a prefix of the manager's object path; no unit is involved in this check — confirm */
+        if (object_path_startswith("/org/freedesktop/systemd1", path)) {
+                r = mac_selinux_access_check(message, verb, error);
+                if (r < 0)
+                        return r;
+
+                return 0;
+        }
+        /* Otherwise find the unit concerned: the sender's own unit for the "self" path, else the unit of the job or the unit the path refers to */
+        if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+                pid_t pid;
+
+                r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+                if (r < 0)
+                        return 0;
+
+                r = sd_bus_creds_get_pid(creds, &pid);
+                if (r < 0)
+                        return 0;
+
+                u = manager_get_unit_by_pid(m, pid);
+        } else {
+                r = manager_get_job_from_dbus_path(m, path, &j);
+                if (r >= 0)
+                        u = j->unit;
+                else
+                        manager_load_unit_from_dbus_path(m, path, NULL, &u); /* result ignored on purpose: u stays NULL if this fails */
+        }
+        if (!u) /* no unit to check against */
+                return 0;
+
+        r = mac_selinux_unit_access_check(u, message, verb, error);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+#endif
+
+/* Resolves a unit object path to a Unit. Returns 1 with the unit in *unit, 0 if there is none (this includes
+ * a failed load), or a negative value if the sender's credentials cannot be queried. */
+static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_bus_error *error) {
+        Unit *found = NULL;  /* initialized only to keep gcc quiet */
+        int r;
+
+        assert(m);
+        assert(bus);
+        assert(path);
+
+        if (!streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
+                r = manager_load_unit_from_dbus_path(m, path, error, &found);
+                if (r < 0)
+                        return 0;
+                assert(found);
+        } else {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+                sd_bus_message *current;
+                pid_t pid;
+
+                /* The "self" path stands for the unit the sender of the current message belongs to. */
+                current = sd_bus_get_current_message(bus);
+                if (!current)
+                        return 0;
+                r = sd_bus_query_sender_creds(current, SD_BUS_CREDS_PID, &creds);
+                if (r < 0)
+                        return r;
+                r = sd_bus_creds_get_pid(creds, &pid);
+                if (r < 0)
+                        return r;
+                found = manager_get_unit_by_pid(m, pid);
+                if (!found)
+                        return 0;
+        }
+
+        *unit = found;
+        return 1;
+}
+
+/* Object finder for the generic Unit vtable: whatever unit find_unit() resolves for 'path' is accepted. */
+static int bus_unit_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+        return find_unit(manager, bus, path, (Unit**) found, error);
+}
+
+/* Object finder for the per-type unit vtables: matches only if 'interface' is the one of the unit's type. */
+static int bus_unit_interface_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        Unit *unit;
+        int r;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        r = find_unit(manager, bus, path, &unit, error);
+        if (r <= 0)
+                return r;
+        if (!streq_ptr(interface, unit_dbus_interface_from_type(unit->type)))
+                return 0;
+
+        *found = unit;
+        return 1;
+}
+
+/* Object finder for the unit-level cgroup vtable: 'interface' must be the one of the unit's type, and
+ * UNIT_HAS_CGROUP_CONTEXT() must hold for the unit. Yields the Unit itself. */
+static int bus_unit_cgroup_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        Unit *unit;
+        int r;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        r = find_unit(manager, bus, path, &unit, error);
+        if (r <= 0)
+                return r;
+
+        if (!streq_ptr(interface, unit_dbus_interface_from_type(unit->type)) ||
+            !UNIT_HAS_CGROUP_CONTEXT(unit))
+                return 0;
+
+        *found = unit;
+        return 1;
+}
+
+/* Object finder for the CGroupContext vtable: yields what unit_get_cgroup_context() returns, if non-NULL. */
+static int bus_cgroup_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        CGroupContext *context;
+        Unit *unit;
+        int r;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        r = find_unit(manager, bus, path, &unit, error);
+        if (r <= 0)
+                return r;
+        if (!streq_ptr(interface, unit_dbus_interface_from_type(unit->type)))
+                return 0;
+
+        context = unit_get_cgroup_context(unit);
+        if (!context)
+                return 0;
+
+        *found = context;
+        return 1;
+}
+
+/* Object finder for the ExecContext vtable: yields what unit_get_exec_context() returns, if non-NULL. */
+static int bus_exec_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        ExecContext *context;
+        Unit *unit;
+        int r;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        r = find_unit(manager, bus, path, &unit, error);
+        if (r <= 0)
+                return r;
+        if (!streq_ptr(interface, unit_dbus_interface_from_type(unit->type)))
+                return 0;
+
+        context = unit_get_exec_context(unit);
+        if (!context)
+                return 0;
+
+        *found = context;
+        return 1;
+}
+
+/* Object finder for the KillContext vtable: yields what unit_get_kill_context() returns, if non-NULL. */
+static int bus_kill_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        KillContext *context;
+        Unit *unit;
+        int r;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        r = find_unit(manager, bus, path, &unit, error);
+        if (r <= 0)
+                return r;
+        if (!streq_ptr(interface, unit_dbus_interface_from_type(unit->type)))
+                return 0;
+
+        context = unit_get_kill_context(unit);
+        if (!context)
+                return 0;
+
+        *found = context;
+        return 1;
+}
+
+/* Node enumerator: stores in *nodes a newly allocated, NULL-terminated array holding the D-Bus object path of
+ * every entry of m->units, and returns the number of paths. Returns -ENOMEM if an allocation fails. */
+static int bus_unit_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+        _cleanup_strv_free_ char **paths = NULL;
+        Manager *manager = userdata;
+        unsigned n = 0;
+        Unit *unit;
+
+        paths = new0(char*, hashmap_size(manager->units)+1);
+        if (!paths)
+                return -ENOMEM;
+
+        HASHMAP_FOREACH(unit, manager->units) {
+                char *p = unit_dbus_path(unit);
+                if (!p)
+                        return -ENOMEM;
+                paths[n++] = p;
+        }
+
+        *nodes = TAKE_PTR(paths);
+        return n;
+}
+
+static const BusObjectImplementation unit_object = { /* generic Unit interface; bus_unit_find() accepts any unit that can be resolved */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Unit",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_unit_vtable,        bus_unit_find }),
+        .node_enumerator = bus_unit_enumerate, /* lists the object paths of all units */
+};
+
+static const BusObjectImplementation bus_automount_object = { /* Automount units: type-specific vtable only */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Automount",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_automount_vtable,   bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_device_object = { /* Device units: type-specific vtable only */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Device",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_device_vtable,      bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_mount_object = { /* Mount units: type-specific vtable plus the cgroup, exec and kill vtables */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Mount",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_mount_vtable,       bus_unit_interface_find },
+                { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+                { bus_cgroup_vtable,      bus_cgroup_context_find },
+                { bus_exec_vtable,        bus_exec_context_find },
+                { bus_kill_vtable,        bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_path_object = { /* Path units: type-specific vtable only */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Path",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_path_vtable,        bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_scope_object = { /* Scope units: type-specific vtable plus the cgroup and kill vtables (no exec vtable) */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Scope",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_scope_vtable,       bus_unit_interface_find },
+                { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+                { bus_cgroup_vtable,      bus_cgroup_context_find },
+                { bus_kill_vtable,        bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_service_object = { /* Service units: type-specific vtable plus the cgroup, exec and kill vtables */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Service",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_service_vtable,     bus_unit_interface_find },
+                { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+                { bus_cgroup_vtable,      bus_cgroup_context_find },
+                { bus_exec_vtable,        bus_exec_context_find },
+                { bus_kill_vtable,        bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_slice_object = { /* Slice units: type-specific vtable plus the cgroup vtables (no exec or kill vtable) */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Slice",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_slice_vtable,       bus_unit_interface_find },
+                { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+                { bus_cgroup_vtable,      bus_cgroup_context_find }),
+};
+
+static const BusObjectImplementation bus_socket_object = { /* Socket units: type-specific vtable plus the cgroup, exec and kill vtables */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Socket",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_socket_vtable,      bus_unit_interface_find },
+                { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+                { bus_cgroup_vtable,      bus_cgroup_context_find },
+                { bus_exec_vtable,        bus_exec_context_find },
+                { bus_kill_vtable,        bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_swap_object = { /* Swap units: type-specific vtable plus the cgroup, exec and kill vtables */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Swap",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_swap_vtable,        bus_unit_interface_find },
+                { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+                { bus_cgroup_vtable,      bus_cgroup_context_find },
+                { bus_exec_vtable,        bus_exec_context_find },
+                { bus_kill_vtable,        bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_target_object = { /* Target units: type-specific vtable only */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Target",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_target_vtable,      bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_timer_object = { /* Timer units: type-specific vtable only */
+        "/org/freedesktop/systemd1/unit",
+        "org.freedesktop.systemd1.Timer",
+        .fallback_vtables = BUS_FALLBACK_VTABLES(
+                { bus_timer_vtable,       bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_manager_object = { /* Manager object; the job, generic unit and per-type unit implementations hang off it as children */
+        "/org/freedesktop/systemd1",
+        "org.freedesktop.systemd1.Manager",
+        .vtables = BUS_VTABLES(bus_manager_vtable),
+        .children = BUS_IMPLEMENTATIONS(
+                        &job_object,
+                        &unit_object,
+                        &bus_automount_object,
+                        &bus_device_object,
+                        &bus_mount_object,
+                        &bus_path_object,
+                        &bus_scope_object,
+                        &bus_service_object,
+                        &bus_slice_object,
+                        &bus_socket_object,
+                        &bus_swap_object,
+                        &bus_target_object,
+                        &bus_timer_object),
+};
+
+static const BusObjectImplementation manager_log_control_object = { /* separate top-level object, registered next to bus_manager_object */
+        "/org/freedesktop/LogControl1",
+        "org.freedesktop.LogControl1",
+        .vtables = BUS_VTABLES(bus_manager_log_control_vtable),
+};
+
+/* Passes the manager's two top-level object implementations on to bus_introspect_implementations(). */
+int bus_manager_introspect_implementations(FILE *out, const char *pattern) {
+        return bus_introspect_implementations(
+                        out, pattern,
+                        BUS_IMPLEMENTATIONS(
+                                        &bus_manager_object, &manager_log_control_object));
+}
+
+static int bus_setup_api_vtables(Manager *m, sd_bus *bus) {
+        int r;
+
+        assert(m);
+        assert(bus);
+        /* Populates 'bus' with our API: mac_selinux_filter() as message filter (if compiled in), then the objects */
+#if HAVE_SELINUX
+        r = sd_bus_add_filter(bus, NULL, mac_selinux_filter, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add SELinux access filter: %m");
+#endif
+        /* The Manager object tree first, the LogControl1 object second; the first failure is returned */
+        r = bus_add_implementation(bus, &bus_manager_object, m);
+        if (r < 0)
+                return r;
+
+        return bus_add_implementation(bus, &manager_log_control_object, m);
+}
+
+/* Requests that signal_disconnected() be called for the local "Disconnected" signal of 'bus'. */
+static int bus_setup_disconnected_match(Manager *m, sd_bus *bus) {
+        int r;
+
+        assert(m);
+        assert(bus);
+
+        r = sd_bus_match_signal_async(
+                        bus, /* ret_slot= */ NULL,
+                        "org.freedesktop.DBus.Local",  /* sender */
+                        "/org/freedesktop/DBus/Local", /* path */
+                        "org.freedesktop.DBus.Local",  /* interface */
+                        "Disconnected",                /* member */
+                        signal_disconnected, /* install_callback= */ NULL, m);
+        if (r >= 0)
+                return 0;
+
+        return log_error_errno(r, "Failed to request match for Disconnected message: %m");
+}
+
+static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_close_ int nfd = -EBADF;
+        Manager *m = ASSERT_PTR(userdata);
+        sd_id128_t id;
+        int r;
+
+        assert(s);
+        /* Accepts one connection on the private socket and sets it up as bus connection; failures are only logged, the return value is always 0 */
+        nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+        if (nfd < 0) {
+                if (ERRNO_IS_ACCEPT_AGAIN(errno))
+                        return 0;
+
+                log_warning_errno(errno, "Failed to accept private connection, ignoring: %m");
+                return 0;
+        }
+        /* On the early returns below the _cleanup_ handlers declared above take care of nfd and bus */
+        if (set_size(m->private_buses) >= CONNECTIONS_MAX) {
+                log_warning("Too many concurrent connections, refusing");
+                return 0;
+        }
+
+        r = sd_bus_new(&bus);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to allocate new private connection bus: %m");
+                return 0;
+        }
+
+        (void) sd_bus_set_description(bus, "private-bus-connection");
+
+        r = sd_bus_set_fd(bus, nfd, nfd);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to set fd on new connection bus: %m");
+                return 0;
+        }
+        /* The fd was handed to 'bus' successfully, hence disarm our own _cleanup_close_ for it */
+        TAKE_FD(nfd);
+
+        r = bus_check_peercred(bus);
+        if (r < 0) {
+                log_warning_errno(r, "Incoming private connection from unprivileged client, refusing: %m");
+                return 0;
+        }
+
+        assert_se(sd_id128_randomize(&id) >= 0);
+
+        r = sd_bus_set_server(bus, 1, id);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to enable server support for new connection bus: %m");
+                return 0;
+        }
+
+        r = sd_bus_negotiate_creds(bus, 1,
+                                   SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|
+                                   SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS|
+                                   SD_BUS_CREDS_SELINUX_CONTEXT|
+                                   SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to enable credentials for new connection: %m");
+                return 0;
+        }
+
+        r = sd_bus_set_sender(bus, "org.freedesktop.systemd1");
+        if (r < 0) {
+                log_warning_errno(r, "Failed to set direct connection sender: %m");
+                return 0;
+        }
+
+        r = sd_bus_start(bus);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to start new connection bus: %m");
+                return 0;
+        }
+
+        if (DEBUG_LOGGING) {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL;
+                const char *comm = NULL, *description = NULL;
+                pid_t pid = 0;
+
+                r = sd_bus_get_owner_creds(bus, SD_BUS_CREDS_PID|SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION, &c);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to get peer creds, ignoring: %m");
+                else {
+                        (void) sd_bus_creds_get_pid(c, &pid);
+                        (void) sd_bus_creds_get_comm(c, &comm);
+                        (void) sd_bus_creds_get_description(c, &description);
+                }
+
+                log_debug("Accepting direct incoming connection from " PID_FMT " (%s) [%s]", pid, strna(comm), strna(description));
+        }
+
+        r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to attach new connection bus to event loop: %m");
+                return 0;
+        }
+
+        r = bus_setup_disconnected_match(m, bus);
+        if (r < 0)
+                return 0;
+
+        r = bus_setup_api_vtables(m, bus);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to set up API vtables on new connection bus: %m");
+                return 0;
+        }
+
+        r = bus_register_malloc_status(bus, "org.freedesktop.systemd1");
+        if (r < 0)
+                log_warning_errno(r, "Failed to register MemoryAllocation1, ignoring: %m");
+
+        r = set_ensure_put(&m->private_buses, NULL, bus);
+        if (r == -ENOMEM) {
+                log_oom();
+                return 0;
+        }
+        if (r < 0) {
+                log_warning_errno(r, "Failed to add new connection bus to set: %m");
+                return 0;
+        }
+        /* m->private_buses refers to the connection now, hence disarm our _cleanup_ handler for it */
+        TAKE_PTR(bus);
+
+        log_debug("Accepted new private connection.");
+
+        return 0;
+}
+
+static int bus_setup_api(Manager *m, sd_bus *bus) {
+        char *name;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(bus);
+        /* Prepares 'bus' for use as API bus: credentials, our objects, name watches, activation requests, well-known name */
+        /* Let's make sure we have enough credential bits so that we can make security and selinux decisions */
+        r = sd_bus_negotiate_creds(bus, 1,
+                                   SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|
+                                   SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS|
+                                   SD_BUS_CREDS_SELINUX_CONTEXT);
+        if (r < 0)
+                log_warning_errno(r, "Failed to enable credential passing, ignoring: %m");
+
+        r = bus_setup_api_vtables(m, bus);
+        if (r < 0)
+                return r;
+        /* Install a bus match for every (name, unit) pair in m->watch_bus; failures are logged, but not fatal */
+        HASHMAP_FOREACH_KEY(u, name, m->watch_bus) {
+                r = unit_install_bus_match(u, bus, name);
+                if (r < 0)
+                        log_error_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name);
+        }
+        /* ActivationRequest signals from org.freedesktop.DBus are handled by signal_activation_request() */
+        r = sd_bus_match_signal_async(
+                        bus,
+                        NULL,
+                        "org.freedesktop.DBus",
+                        "/org/freedesktop/DBus",
+                        "org.freedesktop.systemd1.Activator",
+                        "ActivationRequest",
+                        signal_activation_request, NULL, m);
+        if (r < 0)
+                log_warning_errno(r, "Failed to subscribe to activation signal: %m");
+
+        /* Allow replacing of our name, to ease implementation of reexecution, where we keep the old connection open
+         * until after the new connection is set up and the name installed to allow clients to synchronously wait for
+         * reexecution to finish */
+        r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.systemd1", SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_ALLOW_REPLACEMENT, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to request name: %m");
+
+        r = bus_register_malloc_status(bus, "org.freedesktop.systemd1");
+        if (r < 0)
+                log_warning_errno(r, "Failed to register MemoryAllocation1, ignoring: %m");
+
+        log_debug("Successfully connected to API bus.");
+
+        return 0;
+}
+
+/* Sets up m->api_bus unless it is set already. Returns 0 on success (or if nothing had to be done) and a
+ * negative value on failure. */
+int bus_init_api(Manager *m) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *conn = NULL;
+        int r;
+
+        if (m->api_bus)
+                return 0;
+
+        if (MANAGER_IS_SYSTEM(m) && m->system_bus)
+                /* In system mode the API bus and the system bus are the same: share the connection */
+                conn = sd_bus_ref(m->system_bus);
+        else {
+                r = MANAGER_IS_SYSTEM(m) ?
+                        sd_bus_open_system_with_description(&conn, "bus-api-system") :
+                        sd_bus_open_user_with_description(&conn, "bus-api-user");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to connect to API bus: %m");
+
+                r = sd_bus_attach_event(conn, m->event, SD_EVENT_PRIORITY_NORMAL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to attach API bus to event loop: %m");
+
+                r = bus_setup_disconnected_match(m, conn);
+                if (r < 0)
+                        return r;
+        }
+
+        r = bus_setup_api(m, conn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set up API bus: %m");
+
+        m->api_bus = TAKE_PTR(conn);
+        return 0;
+}
+
+/* Per-connection setup for the system bus. User instances subscribe to the agent's "Released" signal here;
+ * a failure to do so is only logged. Always returns 0. */
+static int bus_setup_system(Manager *m, sd_bus *bus) {
+        assert(m);
+        assert(bus);
+
+        if (MANAGER_IS_USER(m)) {
+                int r;
+
+                r = sd_bus_match_signal_async(
+                                bus, /* ret_slot= */ NULL,
+                                /* sender= */ NULL,
+                                "/org/freedesktop/systemd1/agent",
+                                "org.freedesktop.systemd1.Agent",
+                                "Released",
+                                signal_agent_released, /* install_callback= */ NULL, m);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to request Released match on system bus: %m");
+        }
+
+        log_debug("Successfully connected to system bus.");
+        return 0;
+}
+
+/* Sets up m->system_bus unless it is set already. Returns 0 on success (or if nothing had to be done) and a
+ * negative value on failure. */
+int bus_init_system(Manager *m) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *conn = NULL;
+        int r;
+
+        if (m->system_bus)
+                return 0;
+
+        if (!MANAGER_IS_SYSTEM(m) || !m->api_bus) {
+                r = sd_bus_open_system_with_description(&conn, "bus-system");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to connect to system bus: %m");
+
+                r = sd_bus_attach_event(conn, m->event, SD_EVENT_PRIORITY_NORMAL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to attach system bus to event loop: %m");
+
+                r = bus_setup_disconnected_match(m, conn);
+                if (r < 0)
+                        return r;
+        } else /* system mode with an API bus at hand: both are the same connection, hence share it */
+                conn = sd_bus_ref(m->api_bus);
+
+        r = bus_setup_system(m, conn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set up system bus: %m");
+
+        m->system_bus = TAKE_PTR(conn);
+
+        return 0;
+}
+
+int bus_init_private(Manager *m) {
+        _cleanup_close_ int fd = -EBADF;
+        union sockaddr_union sa;
+        socklen_t sa_len;
+        sd_event_source *s;
+        int r;
+
+        assert(m);
+        /* Creates the private listening socket and hooks it into the event loop; does nothing if a listening fd exists already */
+        if (m->private_listen_fd >= 0)
+                return 0;
+
+        if (MANAGER_IS_SYSTEM(m)) {
+
+                /* We want the private bus only when running as init */
+                if (getpid_cached() != 1)
+                        return 0;
+
+                r = sockaddr_un_set_path(&sa.un, "/run/systemd/private");
+        } else {
+                _cleanup_free_ char *joined = NULL;
+                const char *e;
+
+                e = secure_getenv("XDG_RUNTIME_DIR");
+                if (!e)
+                        return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+                                               "XDG_RUNTIME_DIR is not set, refusing.");
+
+                joined = path_join(e, "/systemd/private");
+                if (!joined)
+                        return log_oom();
+
+                r = sockaddr_un_set_path(&sa.un, joined);
+        }
+        if (r < 0)
+                return log_error_errno(r, "Can't set path for AF_UNIX socket to bind to: %m");
+        sa_len = r;
+        /* Best-effort preparation (results ignored): create the parent directories, unlink whatever sits at the socket path */
+        (void) mkdir_parents_label(sa.un.sun_path, 0755);
+        (void) sockaddr_un_unlink(&sa.un);
+
+        fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        if (fd < 0)
+                return log_error_errno(errno, "Failed to allocate private socket: %m");
+        /* r holds the plain return value of bind()/listen() below, hence the error is taken from errno */
+        WITH_UMASK(0077)
+                r = bind(fd, &sa.sa, sa_len);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to bind private socket: %m");
+
+        r = listen(fd, SOMAXCONN_DELUXE);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to make private socket listening: %m");
+
+        /* Generate an inotify event in case somebody waits for this socket to appear using inotify() */
+        (void) touch(sa.un.sun_path);
+
+        r = sd_event_add_io(m->event, &s, fd, EPOLLIN, bus_on_connection, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event source: %m");
+
+        (void) sd_event_source_set_description(s, "bus-connection");
+        /* Everything worked: store fd and event source in the Manager, disarming our _cleanup_close_ */
+        m->private_listen_fd = TAKE_FD(fd);
+        m->private_listen_event_source = s;
+
+        log_debug("Successfully created private D-Bus server.");
+
+        return 0;
+}
+
+static void destroy_bus(Manager *m, sd_bus **bus) {
+        Unit *u;
+        Job *j;
+
+        assert(m);
+        assert(bus);
+        /* Drops what units, jobs and the manager hold on *bus, then closes it and stores the result of sd_bus_close_unref() in *bus */
+        if (!*bus)
+                return;
+
+        /* Make sure all bus slots watching names are released. */
+        HASHMAP_FOREACH(u, m->watch_bus) {
+                if (u->match_bus_slot && sd_bus_slot_get_bus(u->match_bus_slot) == *bus)
+                        u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+                if (u->get_name_owner_slot && sd_bus_slot_get_bus(u->get_name_owner_slot) == *bus)
+                        u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+        }
+
+        /* Get rid of tracked clients on this bus */
+        if (m->subscribed && sd_bus_track_get_bus(m->subscribed) == *bus)
+                m->subscribed = sd_bus_track_unref(m->subscribed);
+        /* Same for the tracking objects of jobs and units that belong to this bus */
+        HASHMAP_FOREACH(j, m->jobs)
+                if (j->bus_track && sd_bus_track_get_bus(j->bus_track) == *bus)
+                        j->bus_track = sd_bus_track_unref(j->bus_track);
+
+        HASHMAP_FOREACH(u, m->units) {
+                if (u->bus_track && sd_bus_track_get_bus(u->bus_track) == *bus)
+                        u->bus_track = sd_bus_track_unref(u->bus_track);
+
+                /* Get rid of pending freezer messages on this bus */
+                if (u->pending_freezer_invocation && sd_bus_message_get_bus(u->pending_freezer_invocation) == *bus)
+                        u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
+        }
+
+        /* Get rid of queued message on this bus */
+        if (m->pending_reload_message && sd_bus_message_get_bus(m->pending_reload_message) == *bus)
+                m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message);
+
+        /* Possibly flush unwritten data, but only if we are
+         * unprivileged, since we don't want to sync here */
+        if (!MANAGER_IS_SYSTEM(m))
+                sd_bus_flush(*bus);
+
+        /* And destroy the object */
+        *bus = sd_bus_close_unref(*bus);
+}
+
+void bus_done_api(Manager *m) {
+        destroy_bus(m, &m->api_bus); /* Tears down the connection stored in m->api_bus, if there is one */
+}
+
+void bus_done_system(Manager *m) {
+        destroy_bus(m, &m->system_bus); /* Tears down the connection stored in m->system_bus, if there is one */
+}
+
+void bus_done_private(Manager *m) {
+        sd_bus *b;
+
+        assert(m);
+        /* Pull the private connections out of the set one by one and tear each of them down. */
+        while ((b = set_steal_first(m->private_buses)))
+                destroy_bus(m, &b);
+
+        m->private_buses = set_free(m->private_buses);
+        /* Also release the event source watching the private listening fd, and close that fd. */
+        m->private_listen_event_source = sd_event_source_disable_unref(m->private_listen_event_source);
+        m->private_listen_fd = safe_close(m->private_listen_fd);
+}
+
+void bus_done(Manager *m) {
+        assert(m);
+
+        bus_done_api(m); /* Tear down all connections: API bus, system bus, then the private ones */
+        bus_done_system(m);
+        bus_done_private(m);
+        assert(!m->subscribed); /* destroy_bus() drops this tracker together with the bus it belongs to */
+
+        m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+        bus_verify_polkit_async_registry_free(m->polkit_registry);
+        m->polkit_registry = NULL; /* Like every other field reset here, don't keep a stale pointer around */
+}
+
+int bus_fdset_add_all(Manager *m, FDSet *fds) {
+        sd_bus *b;
+        int fd;
+
+        assert(m);
+        assert(fds);
+
+        /* When we are about to reexecute we add all D-Bus fds to the
+         * set to pass over to the newly executed systemd. They won't
+         * be used there however, except that they are closed at the
+         * very end of deserialization, thus making it possible for
+         * clients to synchronously wait for systemd to reexec by
+         * simply waiting for disconnection */
+
+        if (m->api_bus) {
+                fd = sd_bus_get_fd(m->api_bus);
+                if (fd >= 0) { /* A negative result means there is no fd to pass along for this bus */
+                        fd = fdset_put_dup(fds, fd);
+                        if (fd < 0)
+                                return fd; /* Adding the duplicate to the set failed */
+                }
+        }
+
+        SET_FOREACH(b, m->private_buses) {
+                fd = sd_bus_get_fd(b);
+                if (fd >= 0) {
+                        fd = fdset_put_dup(fds, fd);
+                        if (fd < 0)
+                                return fd;
+                }
+        }
+
+        /* We don't offer any APIs on the system bus (well, unless it
+         * is the same as the API bus) hence we don't bother with it
+         * here */
+
+        return 0;
+}
+
+int bus_foreach_bus(
+                Manager *m,
+                sd_bus_track *subscribed2,
+                int (*send_message)(sd_bus *bus, void *userdata),
+                void *userdata) {
+
+        sd_bus *b;
+        int r, ret = 0;
+        /* Invokes send_message() on each eligible bus. Returns 0, or the most recent negative result of
+         * send_message(). First, send to all direct buses, regardless of subscriptions. */
+        SET_FOREACH(b, m->private_buses) {
+
+                /* Don't bother with enqueuing these messages to clients that haven't started yet */
+                if (sd_bus_is_ready(b) <= 0)
+                        continue;
+
+                r = send_message(b, userdata);
+                if (r < 0)
+                        ret = r; /* Remember the failure, but keep going with the remaining buses */
+        }
+
+        /* Send to API bus, but only if somebody is subscribed */
+        if (m->api_bus &&
+            (sd_bus_track_count(m->subscribed) > 0 ||
+             sd_bus_track_count(subscribed2) > 0)) {
+                r = send_message(m->api_bus, userdata);
+                if (r < 0)
+                        ret = r;
+        }
+
+        return ret;
+}
+
+void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) {
+        assert(f);
+        assert(prefix);
+
+        /* Writes one item under key 'prefix' per reference held on each tracked name, i.e. a name
+         * that is tracked several times is written out that many times. */
+        for (const char *name = sd_bus_track_first(t); name; name = sd_bus_track_next(t)) {
+                int refs = sd_bus_track_count_name(t, name);
+
+                /* A zero or negative count writes nothing for this name. */
+                for (int left = refs; left > 0; left--)
+                        (void) serialize_item(f, prefix, name);
+        }
+}
+
+int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) {
+        int r;
+
+        assert(m);
+        assert(t);
+        /* Adds the names listed in l to the tracker *t, allocating it on the API bus first if it does not exist yet. */
+        if (strv_isempty(l))
+                return 0;
+
+        if (!m->api_bus) /* The tracker would be created on the API bus; without one we skip this */
+                return 0;
+
+        if (!*t) {
+                r = sd_bus_track_new(m->api_bus, t, NULL, NULL);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_track_set_recursive(*t, recursive);
+        if (r < 0)
+                return r;
+
+        return bus_track_add_name_many(*t, l);
+}
+
+int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { /* polkit action "manage-units" */
+        return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { /* polkit action "manage-unit-files" */
+        return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { /* polkit action "reload-daemon" */
+        return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { /* polkit action "set-environment" */
+        return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { /* polkit action "bypass-dump-ratelimit" */
+        return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.bypass-dump-ratelimit", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+uint64_t manager_bus_n_queued_write(Manager *m) {
+        uint64_t total = 0;
+        sd_bus *bus;
+        int r;
+
+        /* Sums up the number of messages queued for writing on all our direct buses and on the API bus. */
+
+        SET_FOREACH(bus, m->private_buses) {
+                uint64_t pending;
+
+                r = sd_bus_get_n_queued_write(bus, &pending);
+                if (r >= 0)
+                        total += pending;
+                else /* A bus we cannot query is left out of the sum */
+                        log_debug_errno(r, "Failed to query queued messages for private bus: %m");
+        }
+
+        if (m->api_bus) {
+                uint64_t pending;
+
+                r = sd_bus_get_n_queued_write(m->api_bus, &pending);
+                if (r >= 0)
+                        total += pending;
+                else
+                        log_debug_errno(r, "Failed to query queued messages for API bus: %m");
+        }
+
+        return total;
+}
+
+static void vtable_dump_bus_properties(FILE *f, const sd_bus_vtable *table) {
+        /* Prints the member name of each property entry of the vtable, one per line, leaving out deprecated and hidden ones. */
+        for (const sd_bus_vtable *entry = table; entry->type != _SD_BUS_VTABLE_END; entry++) {
+                bool is_property = IN_SET(entry->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY);
+
+                if (!is_property)
+                        continue;
+                if ((entry->flags & (SD_BUS_VTABLE_DEPRECATED | SD_BUS_VTABLE_HIDDEN)) == 0)
+                        fprintf(f, "%s\n", entry->x.property.member);
+        }
+}
+
+void dump_bus_properties(FILE *f) {
+        /* The vtables whose property names get printed, in output order. The trailing NULL
+         * terminates the iteration below. */
+        const sd_bus_vtable *const tables[] = {
+                bus_automount_vtable, bus_cgroup_vtable,
+                bus_device_vtable,    bus_exec_vtable,
+                bus_job_vtable,       bus_kill_vtable,
+                bus_manager_vtable,   bus_mount_vtable,
+                bus_path_vtable,      bus_scope_vtable,
+                bus_service_vtable,   bus_slice_vtable,
+                bus_socket_vtable,    bus_swap_vtable,
+                bus_target_vtable,    bus_timer_vtable,
+                bus_unit_vtable,      bus_unit_cgroup_vtable,
+                NULL,
+        };
+
+        assert(f);
+
+        /* Dump one vtable after the other. */
+        for (const sd_bus_vtable *const *t = tables; *t; t++)
+                vtable_dump_bus_properties(f, *t);
+}
diff --git a/src/core/dbus.h b/src/core/dbus.h
new file mode 100644
index 0000000..50e7bb4
--- /dev/null
+++ b/src/core/dbus.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "manager.h"
+
+int bus_send_pending_reload_message(Manager *m);
+
+int bus_init_private(Manager *m);
+int bus_init_api(Manager *m);
+int bus_init_system(Manager *m);
+
+void bus_done_private(Manager *m);
+void bus_done_api(Manager *m);
+void bus_done_system(Manager *m);
+void bus_done(Manager *m);
+
+int bus_fdset_add_all(Manager *m, FDSet *fds);
+
+void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix);
+int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l);
+
+int bus_foreach_bus(Manager *m, sd_bus_track *subscribed2, int (*send_message)(sd_bus *bus, void *userdata), void *userdata);
+
+int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+
+int bus_forward_agent_released(Manager *m, const char *path);
+
+uint64_t manager_bus_n_queued_write(Manager *m);
+
+void dump_bus_properties(FILE *f);
+int bus_manager_introspect_implementations(FILE *out, const char *pattern);
diff --git a/src/core/device.c b/src/core/device.c
new file mode 100644
index 0000000..6b2d7c3
--- /dev/null
+++ b/src/core/device.c
@@ -0,0 +1,1301 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "dbus-device.h"
+#include "dbus-unit.h"
+#include "device-private.h"
+#include "device-util.h"
+#include "device.h"
+#include "log.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "ratelimit.h"
+#include "serialize.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "swap.h"
+#include "udev-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_DEVICE_STATE_MAX] = { /* DeviceState -> generic unit active state */
+        [DEVICE_DEAD]      = UNIT_INACTIVE,
+        [DEVICE_TENTATIVE] = UNIT_ACTIVATING,
+        [DEVICE_PLUGGED]   = UNIT_ACTIVE,
+};
+
+static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata);
+
+static int device_by_path(Manager *m, const char *path, Unit **ret) {
+        _cleanup_free_ char *unit_name = NULL;
+        Unit *found;
+        int r;
+
+        assert(m);
+        assert(path);
+        /* Turns the path into the name of the matching .device unit and looks that unit up. */
+        r = unit_name_from_path(path, ".device", &unit_name);
+        if (r < 0)
+                return r;
+
+        found = manager_get_unit(m, unit_name);
+        if (!found) /* No unit of that name is known to the manager */
+                return -ENOENT;
+
+        if (ret) /* The output parameter is optional */
+                *ret = found;
+        return 0;
+}
+
+static void device_unset_sysfs(Device *d) {
+        Hashmap *devices;
+
+        assert(d);
+
+        if (!d->sysfs) /* Not associated with any sysfs path, nothing to undo */
+                return;
+
+        /* Remove this unit from the chain of devices which share the same sysfs path, and forget the path. */
+        /* The hashmap entry of a path refers to the first unit of its chain, hence it is kept in sync below. */
+        devices = UNIT(d)->manager->devices_by_sysfs;
+
+        if (d->same_sysfs_prev)
+                /* If this is not the first unit, then simply remove this unit. */
+                d->same_sysfs_prev->same_sysfs_next = d->same_sysfs_next;
+        else if (d->same_sysfs_next)
+                /* If this is the first unit, replace with the next unit. */
+                assert_se(hashmap_replace(devices, d->same_sysfs_next->sysfs, d->same_sysfs_next) >= 0);
+        else
+                /* Otherwise this was the only unit in the chain: remove the entry. */
+                hashmap_remove(devices, d->sysfs);
+
+        if (d->same_sysfs_next) /* Fix up the back pointer of our successor */
+                d->same_sysfs_next->same_sysfs_prev = d->same_sysfs_prev;
+
+        d->same_sysfs_prev = d->same_sysfs_next = NULL;
+
+        d->sysfs = mfree(d->sysfs);
+}
+
+static int device_set_sysfs(Device *d, const char *sysfs) {
+        _cleanup_free_ char *copy = NULL;
+        Device *first;
+        int r;
+
+        assert(d);
+        /* Associates the device with the given sysfs path and links it into the chain of units sharing that path. */
+        if (streq_ptr(d->sysfs, sysfs)) /* Unchanged, nothing to do */
+                return 0;
+
+        r = hashmap_ensure_allocated(&UNIT(d)->manager->devices_by_sysfs, &path_hash_ops);
+        if (r < 0)
+                return r;
+        /* NOTE(review): sysfs is not asserted; strdup() below presumably relies on callers passing non-NULL — confirm. */
+        copy = strdup(sysfs);
+        if (!copy)
+                return -ENOMEM;
+
+        device_unset_sysfs(d); /* Leave the chain of the previous path, if there was one */
+
+        first = hashmap_get(UNIT(d)->manager->devices_by_sysfs, sysfs);
+        LIST_PREPEND(same_sysfs, first, d);
+        /* Store the chain under our own copy of the key; that copy becomes d->sysfs on success. */
+        r = hashmap_replace(UNIT(d)->manager->devices_by_sysfs, copy, first);
+        if (r < 0) {
+                LIST_REMOVE(same_sysfs, first, d); /* Undo the prepend */
+                return r;
+        }
+
+        d->sysfs = TAKE_PTR(copy);
+        unit_add_to_dbus_queue(UNIT(d));
+
+        return 0;
+}
+
+static void device_init(Unit *u) {
+        Device *d = DEVICE(u);
+
+        assert(d);
+        assert(UNIT(d)->load_state == UNIT_STUB);
+        /* Sets the device-specific defaults on a unit that is still in the UNIT_STUB load state.
+         * In contrast to all other unit types we timeout jobs waiting
+         * for devices by default. This is because they otherwise wait
+         * indefinitely for plugged in devices, something which cannot
+         * happen for the other units since their operations time out
+         * anyway. */
+        u->job_running_timeout = u->manager->defaults.device_timeout_usec;
+
+        u->ignore_on_isolate = true;
+
+        d->deserialized_state = _DEVICE_STATE_INVALID; /* Nothing deserialized yet; device_coldplug() tests this with "< 0" */
+}
+
+static void device_done(Unit *u) {
+        Device *d = DEVICE(u);
+
+        assert(d);
+        /* Unlink from the per-sysfs-path chain first, then free the remaining strings owned by this device. */
+        device_unset_sysfs(d);
+        d->deserialized_sysfs = mfree(d->deserialized_sysfs);
+        d->wants_property = strv_free(d->wants_property);
+        d->path = mfree(d->path);
+}
+
+static int device_load(Unit *u) {
+        int r;
+        /* Loads the unit's fragment and drop-ins, and tries to derive a description if none is set afterwards. */
+        r = unit_load_fragment_and_dropin(u, false);
+        if (r < 0)
+                return r;
+
+        if (!u->description) {
+                /* Generate a description based on the path, to be used until the device is initialized
+                   properly */
+                r = unit_name_to_path(u->id, &u->description);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to unescape name: %m");
+        }
+
+        return 0; /* Failing to generate a description is only logged, not treated as a load failure */
+}
+
+static void device_set_state(Device *d, DeviceState state) {
+        DeviceState old_state;
+
+        assert(d);
+        /* Switches the device to 'state'. On a change, bus_unit_send_pending_change_signal() runs before d->state is updated. */
+        if (d->state != state)
+                bus_unit_send_pending_change_signal(UNIT(d), false);
+
+        old_state = d->state;
+        d->state = state;
+        /* A dead device is detached from its sysfs path. */
+        if (state == DEVICE_DEAD)
+                device_unset_sysfs(d);
+
+        if (state != old_state)
+                log_unit_debug(UNIT(d), "Changed %s -> %s", device_state_to_string(old_state), device_state_to_string(state));
+        /* Note that unit_notify() is called even when the state did not change. */
+        unit_notify(UNIT(d), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static void device_found_changed(Device *d, DeviceFound previous, DeviceFound now) {
+        assert(d);
+        /* Derives the new device state from the change of the found mask from 'previous' to 'now'.
+         * Didn't exist before, but does now? If so, generate a new invocation ID for it. */
+        if (previous == DEVICE_NOT_FOUND && now != DEVICE_NOT_FOUND)
+                (void) unit_acquire_invocation_id(UNIT(d));
+
+        if (FLAGS_SET(now, DEVICE_FOUND_UDEV))
+                /* When the device is known to udev we consider it plugged. */
+                device_set_state(d, DEVICE_PLUGGED);
+        else if (now != DEVICE_NOT_FOUND && !FLAGS_SET(previous, DEVICE_FOUND_UDEV))
+                /* If the device has not been seen by udev yet, but is now referenced by the kernel, then we assume the
+                 * kernel knows it now, and udev might soon too. */
+                device_set_state(d, DEVICE_TENTATIVE);
+        else
+                /* If nobody sees the device, or if the device was previously seen by udev and now is only referenced
+                 * from the kernel, then we consider the device is gone, the kernel just hasn't noticed it yet. */
+                device_set_state(d, DEVICE_DEAD);
+}
+
+static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask) {
+        assert(d);
+        /* Replaces the bits selected by 'mask' in the device's found mask with the corresponding bits of 'found'. */
+        if (MANAGER_IS_RUNNING(UNIT(d)->manager)) {
+                DeviceFound n, previous;
+
+                /* When we are already running, then apply the new mask right-away, and trigger state changes
+                 * right-away */
+
+                n = (d->found & ~mask) | (found & mask);
+                if (n == d->found) /* No bit changed, hence no state change either */
+                        return;
+
+                previous = d->found;
+                d->found = n;
+
+                device_found_changed(d, previous, n);
+        } else
+                /* We aren't running yet, let's apply the new mask to the shadow variable instead, which we'll apply as
+                 * soon as we catch-up with the state. */
+                d->enumerated_found = (d->enumerated_found & ~mask) | (found & mask);
+}
+
+static void device_update_found_by_sysfs(Manager *m, const char *sysfs, DeviceFound found, DeviceFound mask) {
+        Device *l;
+
+        assert(m);
+        assert(sysfs);
+        /* Applies the found update to every device unit that is chained under the given sysfs path. */
+        if (mask == 0) /* An empty mask cannot change anything */
+                return;
+
+        l = hashmap_get(m->devices_by_sysfs, sysfs);
+        LIST_FOREACH(same_sysfs, d, l)
+                device_update_found_one(d, found, mask);
+}
+
+static void device_update_found_by_name(Manager *m, const char *path, DeviceFound found, DeviceFound mask) {
+        Unit *u;
+
+        assert(m);
+        assert(path);
+        /* Applies the found update to the device unit that device_by_path() finds for 'path', if any. */
+        if (mask == 0) /* An empty mask cannot change anything */
+                return;
+
+        if (device_by_path(m, path, &u) < 0) /* No such unit, or the path cannot be turned into a unit name */
+                return;
+
+        device_update_found_one(DEVICE(u), found, mask);
+}
+
+static int device_coldplug(Unit *u) {
+        Device *d = DEVICE(u);
+
+        assert(d);
+        assert(d->state == DEVICE_DEAD);
+
+        /* First, let's put the deserialized state and found mask into effect, if we have it. */
+        if (d->deserialized_state < 0)
+                return 0; /* Nothing was deserialized for this unit */
+
+        Manager *m = u->manager;
+        DeviceFound found = d->deserialized_found;
+        DeviceState state = d->deserialized_state;
+
+        /* On initial boot, switch-root, reload, reexecute, the following happen:
+         * 1. MANAGER_IS_RUNNING() == false
+         * 2. enumerate devices: manager_enumerate() -> device_enumerate()
+         *    Device.enumerated_found is set.
+         * 3. deserialize devices: manager_deserialize() -> device_deserialize_item()
+         *    Device.deserialized_state and Device.deserialized_found are set.
+         * 4. coldplug devices: manager_coldplug() -> device_coldplug()
+         *    deserialized properties are copied to the main properties.
+         * 5. MANAGER_IS_RUNNING() == true: manager_ready()
+         * 6. catchup devices: manager_catchup() -> device_catchup()
+         *    Device.enumerated_found is applied to Device.found, and state is updated based on that.
+         *
+         * Notes:
+         * - On initial boot, no udev database exists. Hence, no devices are enumerated in the step 2.
+         *   Also, there is no deserialized device. Device units are (a) generated based on dependencies of
+         *   other units, or (b) generated when uevents are received.
+         *
+         * - On switch-root, the udev database may be cleared, except for devices with sticky bit, i.e.
+         *   OPTIONS="db_persist". Hence, almost no devices are enumerated in the step 2. However, in
+         *   general, we have several serialized devices. So, DEVICE_FOUND_UDEV bit in the
+         *   Device.deserialized_found must be ignored, as udev rules in initrd and the main system are often
+         *   different. If the deserialized state is DEVICE_PLUGGED, we need to downgrade it to
+         *   DEVICE_TENTATIVE. Unlike the other starting modes, MANAGER_IS_SWITCHING_ROOT() is true when
+         *   device_coldplug() and device_catchup() are called. Hence, let's conditionalize the operations by
+         *   using the flag. After switch-root, systemd-udevd will (re-)process all devices, and the
+         *   Device.found and Device.state will be adjusted.
+         *
+         * - On reload or reexecute, we can trust Device.enumerated_found, Device.deserialized_found, and
+         *   Device.deserialized_state. Of course, deserialized parameters may be outdated, but the unit
+         *   state can be adjusted later by device_catchup() or uevents. */
+
+        if (MANAGER_IS_SWITCHING_ROOT(m) &&
+            !FLAGS_SET(d->enumerated_found, DEVICE_FOUND_UDEV)) {
+
+                /* The device has not been enumerated. On switching-root, such situation is natural. See the
+                 * above comment. To prevent problematic state transition active → dead → active, let's
+                 * drop the DEVICE_FOUND_UDEV flag and downgrade state to DEVICE_TENTATIVE(activating). See
+                 * issue #12953 and #23208. */
+                found &= ~DEVICE_FOUND_UDEV;
+                if (state == DEVICE_PLUGGED)
+                        state = DEVICE_TENTATIVE;
+
+                /* Also check the validity of the device syspath. Without this check, if the device was
+                 * removed while switching root, it would never go to inactive state, as both Device.found
+                 * and Device.enumerated_found do not have the DEVICE_FOUND_UDEV flag, so device_update_found_one()
+                 * called by device_catchup() does nothing in most cases. See issue #25106. Note that the
+                 * syspath field is only serialized when systemd is sufficiently new and the device has been
+                 * already processed by udevd. */
+                if (d->deserialized_sysfs) {
+                        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+                        if (sd_device_new_from_syspath(&dev, d->deserialized_sysfs) < 0)
+                                state = DEVICE_DEAD;
+                }
+        }
+
+        if (d->found == found && d->state == state)
+                return 0; /* Already in the wanted state */
+
+        d->found = found;
+        device_set_state(d, state);
+        return 0;
+}
+
+static void device_catchup(Unit *u) {
+        Device *d = DEVICE(u);
+
+        assert(d);
+
+        /* Second (after device_coldplug()), let's update the state with the found mask collected during enumeration */
+        device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK);
+}
+
+static const struct {
+        DeviceFound flag;
+        const char *name;
+} device_found_map[] = { /* Names of the DeviceFound flags, as used by the two string conversion helpers below */
+        { DEVICE_FOUND_UDEV,  "found-udev"  },
+        { DEVICE_FOUND_MOUNT, "found-mount" },
+        { DEVICE_FOUND_SWAP,  "found-swap"  },
+};
+
+static int device_found_to_string_many(DeviceFound flags, char **ret) {
+        _cleanup_free_ char *joined = NULL;
+
+        assert(ret);
+
+        /* Builds a comma-separated list of the names of all flags set in 'flags'. *ret is set to NULL if none is set. */
+        for (size_t n = 0; n < ELEMENTSOF(device_found_map); n++) {
+                bool is_set = FLAGS_SET(flags, device_found_map[n].flag);
+
+                if (is_set && !strextend_with_separator(&joined, ",", device_found_map[n].name))
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(joined);
+
+        return 0;
+}
+
+static int device_found_from_string_many(const char *name, DeviceFound *ret) {
+        DeviceFound parsed = 0;
+        int r;
+
+        assert(ret);
+
+        /* Parses a comma-separated list of flag names into a mask. *ret is written on success only. */
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                DeviceFound match = 0;
+
+                r = extract_first_word(&name, &word, ",", 0);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* End of the list */
+                        break;
+
+                for (size_t n = 0; n < ELEMENTSOF(device_found_map); n++)
+                        if (streq(word, device_found_map[n].name)) {
+                                match = device_found_map[n].flag;
+                                break;
+                        }
+
+                if (match == 0) /* Not a name we know */
+                        return -EINVAL;
+
+                parsed |= match;
+        }
+
+        *ret = parsed;
+        return 0;
+}
+
+static int device_serialize(Unit *u, FILE *f, FDSet *fds) {
+        _cleanup_free_ char *s = NULL;
+        Device *d = DEVICE(u);
+
+        assert(d);
+        assert(u);
+        assert(f);
+        assert(fds);
+        /* Writes the sysfs path, device path, state and found mask; device_deserialize_item() reads the same keys. */
+        if (d->sysfs)
+                (void) serialize_item(f, "sysfs", d->sysfs);
+
+        if (d->path)
+                (void) serialize_item(f, "path", d->path);
+
+        (void) serialize_item(f, "state", device_state_to_string(d->state));
+
+        if (device_found_to_string_many(d->found, &s) >= 0)
+                (void) serialize_item(f, "found", s);
+
+        return 0; /* The results of the individual writes are deliberately not checked */
+}
+
+static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Device *d = DEVICE(u);
+        int r;
+
+        assert(d);
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+        /* Takes in one key/value pair as written by device_serialize(). Problems are only logged; this always returns 0. */
+        if (streq(key, "sysfs")) {
+                if (!d->deserialized_sysfs) { /* Only the first occurrence is taken */
+                        d->deserialized_sysfs = strdup(value);
+                        if (!d->deserialized_sysfs)
+                                log_oom_debug();
+                }
+
+        } else if (streq(key, "path")) {
+                if (!d->path) { /* Only the first occurrence is taken */
+                        d->path = strdup(value);
+                        if (!d->path)
+                                log_oom_debug();
+                }
+
+        } else if (streq(key, "state")) {
+                DeviceState state;
+
+                state = device_state_from_string(value);
+                if (state < 0)
+                        log_unit_debug(u, "Failed to parse state value, ignoring: %s", value);
+                else
+                        d->deserialized_state = state;
+
+        } else if (streq(key, "found")) {
+                r = device_found_from_string_many(value, &d->deserialized_found);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to parse found value '%s', ignoring: %m", value);
+
+        } else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+static void device_dump(Unit *u, FILE *f, const char *prefix) {
+        Device *d = DEVICE(u);
+        _cleanup_free_ char *s = NULL;
+
+        assert(d);
+        /* Writes a human-readable summary of the device to f, with every line starting with prefix. */
+        (void) device_found_to_string_many(d->found, &s);
+        /* s stays NULL if the conversion failed or no flag is set; it is passed through strna() below. */
+        fprintf(f,
+                "%sDevice State: %s\n"
+                "%sDevice Path: %s\n"
+                "%sSysfs Path: %s\n"
+                "%sFound: %s\n",
+                prefix, device_state_to_string(d->state),
+                prefix, strna(d->path),
+                prefix, strna(d->sysfs),
+                prefix, strna(s));
+
+        STRV_FOREACH(i, d->wants_property)
+                fprintf(f, "%sudev SYSTEMD_WANTS: %s\n",
+                        prefix, *i);
+}
+
+static UnitActiveState device_active_state(Unit *u) {
+        assert(u);
+        /* Translate the current device state through state_translation_table. */
+        return state_translation_table[DEVICE(u)->state];
+}
+
+static const char *device_sub_state_to_string(Unit *u) {
+        assert(u);
+        /* The sub-state of a device unit is the string form of its DeviceState. */
+        return device_state_to_string(DEVICE(u)->state);
+}
+
+static int device_update_description(Unit *u, sd_device *dev, const char *path) {
+        _cleanup_free_ char *j = NULL;
+        const char *model, *label, *desc;
+        int r;
+
+        assert(u);
+        assert(path);
+        /* Picks a description: "<model> <label>" if both are available, the model alone, or else the plain path. */
+        desc = path; /* Fallback, also used when dev is NULL */
+
+        if (dev && device_get_model_string(dev, &model) >= 0) {
+                desc = model;
+
+                /* Try to concatenate the device model string with a label, if there is one */
+                if (sd_device_get_property_value(dev, "ID_FS_LABEL", &label) >= 0 ||
+                    sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &label) >= 0 ||
+                    sd_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER", &label) >= 0) {
+
+                        desc = j = strjoin(model, " ", label);
+                        if (!j)
+                                return log_oom();
+                }
+        }
+
+        r = unit_set_description(u, desc);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to set device description: %m");
+
+        return 0;
+}
+
+static int device_add_udev_wants(Unit *u, sd_device *dev) {
+        _cleanup_strv_free_ char **added = NULL;
+        const char *wants, *property;
+        Device *d = DEVICE(u);
+        int r;
+
+        assert(d);
+        assert(dev);
+        /* Adds Wants= dependencies for the units listed in the SYSTEMD_WANTS (or SYSTEMD_USER_WANTS) udev property. */
+        property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS";
+
+        r = sd_device_get_property_value(dev, property, &wants);
+        if (r < 0)
+                return 0; /* The property could not be read, hence there is nothing to add */
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&wants, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to parse property %s with value %s: %m", property, wants);
+
+                if (unit_name_is_valid(word, UNIT_NAME_TEMPLATE) && d->sysfs) {
+                        _cleanup_free_ char *escaped = NULL;
+
+                        /* If the unit name is specified as template, then automatically fill in the sysfs path of the
+                         * device as instance name, properly escaped. */
+
+                        r = unit_name_path_escape(d->sysfs, &escaped);
+                        if (r < 0)
+                                return log_unit_error_errno(u, r, "Failed to escape %s: %m", d->sysfs);
+
+                        r = unit_name_replace_instance(word, escaped, &k);
+                        if (r < 0)
+                                return log_unit_error_errno(u, r, "Failed to build %s instance of template %s: %m", escaped, word);
+                } else {
+                        /* If this is not a template, then let's mangle it so, that it becomes a valid unit name. */
+
+                        r = unit_name_mangle(word, UNIT_NAME_MANGLE_WARN, &k);
+                        if (r < 0)
+                                return log_unit_error_errno(u, r, "Failed to mangle unit name \"%s\": %m", word);
+                }
+
+                r = unit_add_dependency_by_name(u, UNIT_WANTS, k, true, UNIT_DEPENDENCY_UDEV);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to add Wants= dependency: %m");
+
+                r = strv_consume(&added, TAKE_PTR(k));
+                if (r < 0)
+                        return log_oom();
+        }
+
+        if (d->state != DEVICE_DEAD)
+                /* So here's a special hack, to compensate for the fact that the udev database's reload cycles are not
+                 * synchronized with our own reload cycles: when we detect that the SYSTEMD_WANTS property of a device
+                 * changes while the device unit is already up, let's skip triggering units that were already listed
+                 * and are active, and start units otherwise. This typically happens during the boot-time switch root
+                 * transition, as udev devices will generally already be up in the initrd, but SYSTEMD_WANTS properties
+                 * get then added through udev rules only available on the host system, and thus only when the initial
+                 * udev coldplug trigger runs.
+                 *
+                 * We do this only if the device has been up already when we parse this, as otherwise the usual
+                 * dependency logic that is run from the dead → plugged transition will trigger these deps. */
+                STRV_FOREACH(i, added) {
+                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                        if (strv_contains(d->wants_property, *i)) {
+                                Unit *v;
+
+                                v = manager_get_unit(u->manager, *i);
+                                if (v && UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(v)))
+                                        continue; /* The unit was already listed and is running. */
+                        }
+
+                        r = manager_add_job_by_name(u->manager, JOB_START, *i, JOB_FAIL, NULL, &error, NULL);
+                        if (r < 0)
+                                log_unit_full_errno(u, sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_UNIT) ? LOG_DEBUG : LOG_WARNING, r,
+                                                    "Failed to enqueue %s job, ignoring: %s", property, bus_error_message(&error, r));
+                }
+        /* Remember the new list, so that the next call can tell which units were listed before. */
+        return strv_free_and_replace(d->wants_property, added);
+}
+
+static bool device_is_bound_by_mounts(Device *d, sd_device *dev) {
+        int bound;
+
+        /* Reads SYSTEMD_MOUNT_DEVICE_BOUND from the udev device, caches the outcome in d->bind_mounts and
+         * returns it. Only a positive result counts as "bound"; failures other than -ENOENT are logged. */
+        assert(d);
+        assert(dev);
+
+        bound = device_get_property_bool(dev, "SYSTEMD_MOUNT_DEVICE_BOUND");
+        if (bound < 0 && bound != -ENOENT)
+                log_device_warning_errno(dev, bound, "Failed to parse SYSTEMD_MOUNT_DEVICE_BOUND= udev property, ignoring: %m");
+
+        return (d->bind_mounts = bound > 0);
+}
+
+static void device_upgrade_mount_deps(Unit *u) {
+        Unit *dependent;
+        void *value;
+
+        /* For every mount unit that currently requires us, register a BindsTo= dependency on us as well.
+         * (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set.) Failures are logged and otherwise ignored. */
+        HASHMAP_FOREACH_KEY(value, dependent, unit_get_dependencies(u, UNIT_REQUIRED_BY)) {
+                int k;
+
+                if (dependent->type != UNIT_MOUNT)
+                        continue;
+                k = unit_add_dependency(dependent, UNIT_BINDS_TO, u, true, UNIT_DEPENDENCY_UDEV);
+                if (k < 0)
+                        log_unit_warning_errno(u, k, "Failed to add BindsTo= dependency between device and mount unit, ignoring: %m");
+        }
+}
+
+static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool main, Set **units) {
+        _cleanup_(unit_freep) Unit *new_unit = NULL;
+        _cleanup_free_ char *e = NULL;
+        const char *sysfs = NULL;
+        Unit *u;
+        int r;
+        /* Creates or updates the .device unit named after 'path' ('dev' may be NULL) and, if 'units' is set, adds it there. */
+        assert(m);
+        assert(path);
+
+        if (dev) {
+                r = sd_device_get_syspath(dev, &sysfs);
+                if (r < 0)
+                        return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m");
+        }
+
+        r = unit_name_from_path(path, ".device", &e);
+        if (r < 0)
+                return log_struct_errno(
+                                LOG_WARNING, r,
+                                "MESSAGE_ID=" SD_MESSAGE_DEVICE_PATH_NOT_SUITABLE_STR,
+                                "DEVICE=%s", path,
+                                LOG_MESSAGE("Failed to generate valid unit name from device path '%s', ignoring device: %m",
+                                            path));
+
+        u = manager_get_unit(m, e);
+        if (u) {
+                /* The device unit can still be present even if the device was unplugged: a mount unit can reference it
+                 * hence preventing the GC from collecting it. That's desired since the device unit may have a
+                 * dependency on the mount unit which was added during the loading of the latter. When the device is
+                 * plugged the sysfs might not be initialized yet, as we serialize the device's state but do not
+                 * serialize the sysfs path across reloads/reexecs. Hence, when coming back from a reload/restart we
+                 * might have the state valid, but not the sysfs path. Also, there is another possibility: when multiple
+                 * devices have the same devlink (e.g. /dev/disk/by-uuid/xxxx), adding/updating/removing one of the
+                 * devices causes a syspath change. Hence, let's always update the sysfs path. */
+
+                /* Let's remove all dependencies generated due to udev properties. We'll re-add whatever is configured
+                 * now below. */
+                unit_remove_dependencies(u, UNIT_DEPENDENCY_UDEV);
+
+        } else {
+                r = unit_new_for_name(m, sizeof(Device), e, &new_unit);
+                if (r < 0)
+                        return log_device_error_errno(dev, r, "Failed to allocate device unit %s: %m", e);
+
+                u = new_unit;
+
+                unit_add_to_load_queue(u);
+        }
+
+        if (!DEVICE(u)->path) {
+                DEVICE(u)->path = strdup(path);
+                if (!DEVICE(u)->path)
+                        return log_oom();
+        }
+
+        /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be
+         * initialized. Hence initialize it if necessary. */
+        if (sysfs) {
+                r = device_set_sysfs(DEVICE(u), sysfs);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs);
+
+                /* The additional systemd udev properties we only interpret for the main object */
+                if (main)
+                        (void) device_add_udev_wants(u, dev);
+        }
+
+        (void) device_update_description(u, dev, path);
+
+        /* So the user wants the mount units to be bound to the device but a mount unit might have been seen
+         * by systemd before the device appears on its radar. In this case the device unit is partially
+         * initialized and includes the deps on the mount unit but at that time the "bind mounts" flag wasn't
+         * present. Fix this up now. */
+        if (dev && device_is_bound_by_mounts(DEVICE(u), dev))
+                device_upgrade_mount_deps(u);
+
+        if (units) {
+                r = set_ensure_put(units, NULL, DEVICE(u));
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to store unit: %m");
+        }
+        /* All went well: release ownership so the cleanup handler does not free a newly allocated unit. */
+        TAKE_PTR(new_unit);
+        return 0;
+}
+
+static bool device_is_ready(sd_device *dev) {
+        int r;
+        /* Decides whether 'dev' counts as ready for us; each reason for "not ready" is logged at debug level. */
+        assert(dev);
+
+        if (device_for_action(dev, SD_DEVICE_REMOVE))
+                return false;
+
+        r = device_is_renaming(dev);
+        if (r < 0)
+                log_device_warning_errno(dev, r, "Failed to check if device is renaming, assuming device is not renaming: %m");
+        if (r > 0) {
+                log_device_debug(dev, "Device busy: device is renaming");
+                return false;
+        }
+
+        /* Is it really tagged as 'systemd' right now? */
+        r = sd_device_has_current_tag(dev, "systemd");
+        if (r < 0)
+                log_device_warning_errno(dev, r, "Failed to check if device has \"systemd\" tag, assuming device is not tagged with \"systemd\": %m");
+        if (r == 0)
+                log_device_debug(dev, "Device busy: device is not tagged with \"systemd\"");
+        if (r <= 0)
+                return false;
+        /* Only an explicit false makes the device not ready; a missing or unparsable property counts as ready. */
+        r = device_get_property_bool(dev, "SYSTEMD_READY");
+        if (r < 0 && r != -ENOENT)
+                log_device_warning_errno(dev, r, "Failed to get device SYSTEMD_READY property, assuming device does not have \"SYSTEMD_READY\" property: %m");
+        if (r == 0)
+                log_device_debug(dev, "Device busy: SYSTEMD_READY property from device is false");
+
+        return r != 0;
+}
+
+static int device_setup_devlink_unit_one(Manager *m, const char *devlink, Set **ready_units, Set **not_ready_units) {
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        Unit *u;
+        /* Looks up the device 'devlink' currently resolves to: if ready, sets up its unit into ready_units; else queues an existing unit as not ready. */
+        assert(m);
+        assert(devlink);
+        assert(ready_units);
+        assert(not_ready_units);
+
+        if (sd_device_new_from_devname(&dev, devlink) >= 0 && device_is_ready(dev))
+                return device_setup_unit(m, dev, devlink, /* main = */ false, ready_units);
+
+        /* The devlink is already removed or not ready. */
+        if (device_by_path(m, devlink, &u) < 0)
+                return 0; /* The corresponding .device unit not found. That's fine. */
+
+        return set_ensure_put(not_ready_units, NULL, DEVICE(u));
+}
+
+static int device_setup_extra_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) {
+        _cleanup_strv_free_ char **aliases = NULL;
+        const char *syspath, *devname = NULL;
+        Device *l;
+        int r;
+        /* Handles the units besides the main and devnode ones: devlinks of 'dev', its SYSTEMD_ALIAS entries, and other units chained to its syspath. */
+        assert(m);
+        assert(dev);
+        assert(ready_units);
+        assert(not_ready_units);
+
+        r = sd_device_get_syspath(dev, &syspath);
+        if (r < 0)
+                return r;
+
+        (void) sd_device_get_devname(dev, &devname);
+
+        /* devlink units */
+        FOREACH_DEVICE_DEVLINK(dev, devlink) {
+                /* These are a kind of special devlink. They should be always unique, but neither persistent
+                 * nor predictable. Hence, let's refuse them. See also the comments for alias units below. */
+                if (PATH_STARTSWITH_SET(devlink, "/dev/block/", "/dev/char/"))
+                        continue;
+
+                (void) device_setup_devlink_unit_one(m, devlink, ready_units, not_ready_units);
+        }
+
+        if (device_is_ready(dev)) {
+                const char *s;
+
+                r = sd_device_get_property_value(dev, "SYSTEMD_ALIAS", &s);
+                if (r < 0 && r != -ENOENT)
+                        log_device_warning_errno(dev, r, "Failed to get SYSTEMD_ALIAS property, ignoring: %m");
+                if (r >= 0) {
+                        r = strv_split_full(&aliases, s, NULL, EXTRACT_UNQUOTE);
+                        if (r < 0)
+                                log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_ALIAS property, ignoring: %m");
+                }
+        }
+
+        /* alias units */
+        STRV_FOREACH(alias, aliases) {
+                if (!path_is_absolute(*alias)) {
+                        log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not an absolute path, ignoring.", *alias);
+                        continue;
+                }
+
+                if (!path_is_safe(*alias)) {
+                        log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not safe, ignoring.", *alias);
+                        continue;
+                }
+
+                /* Note, even if the devlink is not persistent, LVM expects /dev/block/ symlink units to exist.
+                 * To achieve that, they set the path to SYSTEMD_ALIAS. Hence, we cannot refuse aliases that
+                 * start with /dev/, unfortunately. */
+
+                (void) device_setup_unit(m, dev, *alias, /* main = */ false, ready_units);
+        }
+        /* Finally, revisit the units already chained to this syspath that none of the steps above covered. */
+        l = hashmap_get(m->devices_by_sysfs, syspath);
+        LIST_FOREACH(same_sysfs, d, l) {
+                if (!d->path)
+                        continue;
+
+                if (path_equal(d->path, syspath))
+                        continue; /* This is the main unit. */
+
+                if (devname && path_equal(d->path, devname))
+                        continue; /* This is the real device node. */
+
+                if (device_has_devlink(dev, d->path))
+                        continue; /* The devlink was already processed in the above loop. */
+
+                if (strv_contains(aliases, d->path))
+                        continue; /* This is already processed in the above, and ready. */
+
+                if (path_startswith(d->path, "/dev/"))
+                        /* This is a devlink unit. Check existence and update syspath. */
+                        (void) device_setup_devlink_unit_one(m, d->path, ready_units, not_ready_units);
+                else
+                        /* This is an alias unit of dropped or not ready device. */
+                        (void) set_ensure_put(not_ready_units, NULL, d);
+        }
+
+        return 0;
+}
+
+static int device_setup_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) {
+        const char *syspath, *devname = NULL;
+        int r;
+        /* Sorts the .device units related to 'dev' into *ready_units and *not_ready_units; the callers then update their found bits. */
+        assert(m);
+        assert(dev);
+        assert(ready_units);
+        assert(not_ready_units);
+
+        r = sd_device_get_syspath(dev, &syspath);
+        if (r < 0)
+                return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m");
+
+        /* First, process the main (that is, points to the syspath) and (real, not symlink) devnode units. */
+        if (device_for_action(dev, SD_DEVICE_REMOVE))
+                /* If the device is removed, the main and devnode units will be removed by
+                 * device_update_found_by_sysfs() in device_dispatch_io(). Hence, it is not necessary to
+                 * store them to not_ready_units, and we have nothing to do here.
+                 *
+                 * Note, we still need to process devlink units below, as a devlink that previously pointed to
+                 * this device may still exist and now point to another device node. That is, do not forget to
+                 * call device_setup_extra_units(). */
+                ;
+        else if (device_is_ready(dev)) {
+                /* Add the main unit named after the syspath. If this one fails, don't bother with the rest,
+                 * as this one shall be the main device unit the others just follow. (Compare with how
+                 * device_following() is implemented, see below, which looks for the sysfs device.) */
+                r = device_setup_unit(m, dev, syspath, /* main = */ true, ready_units);
+                if (r < 0)
+                        return r;
+
+                /* Add an additional unit for the device node */
+                if (sd_device_get_devname(dev, &devname) >= 0)
+                        (void) device_setup_unit(m, dev, devname, /* main = */ false, ready_units);
+
+        } else {
+                Unit *u;
+
+                /* If the device exists but is not ready, then save the units and unset udev bits later. */
+
+                if (device_by_path(m, syspath, &u) >= 0) {
+                        r = set_ensure_put(not_ready_units, NULL, DEVICE(u));
+                        if (r < 0)
+                                log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m");
+                }
+
+                if (sd_device_get_devname(dev, &devname) >= 0 &&
+                    device_by_path(m, devname, &u) >= 0) {
+                        r = set_ensure_put(not_ready_units, NULL, DEVICE(u));
+                        if (r < 0)
+                                log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m");
+                }
+        }
+
+        /* Next, add/update additional .device units that point to aliases and symlinks. */
+        (void) device_setup_extra_units(m, dev, ready_units, not_ready_units);
+
+        /* Safety check: no unit should be in ready_units and not_ready_units simultaneously. */
+        Unit *u;
+        SET_FOREACH(u, *not_ready_units)
+                if (set_remove(*ready_units, u))
+                        log_unit_error(u, "Cannot activate and deactivate the unit simultaneously. Deactivating.");
+
+        return 0;
+}
+
+static Unit *device_following(Unit *u) {
+        Device *d = DEVICE(u);
+        Device *first = NULL;
+        /* Picks the unit 'u' follows: NULL if 'u' itself has a "sys-" name, else a "sys-" named unit on the same_sysfs list. */
+        assert(d);
+
+        if (startswith(u->id, "sys-"))
+                return NULL;
+
+        /* Make everybody follow the unit that's named after the sysfs path */
+        LIST_FOREACH(same_sysfs, other, d->same_sysfs_next)
+                if (startswith(UNIT(other)->id, "sys-"))
+                        return UNIT(other);
+
+        LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) {
+                if (startswith(UNIT(other)->id, "sys-"))
+                        return UNIT(other);
+
+                first = other; /* Ends up as the last entry visited when walking backwards. */
+        }
+
+        return UNIT(first); /* No "sys-" unit on the list: fall back to 'first', which is still NULL if nothing precedes us. */
+}
+
+static int device_following_set(Unit *u, Set **_set) {
+        _cleanup_set_free_ Set *siblings = NULL;
+        Device *self = DEVICE(u);
+        int k;
+        /* Collects every other Device chained to the same sysfs path. Returns 1 with the set in *_set,
+         * 0 with *_set = NULL if we are alone on the list, or a negative value on failure. */
+        assert(self);
+        assert(_set);
+        if (LIST_JUST_US(same_sysfs, self)) {
+                *_set = NULL;
+                return 0;
+        }
+
+        siblings = set_new(NULL);
+        if (!siblings)
+                return -ENOMEM;
+
+        LIST_FOREACH(same_sysfs, peer, self->same_sysfs_next) {
+                k = set_put(siblings, peer);
+                if (k < 0)
+                        return k;
+        }
+
+        LIST_FOREACH_BACKWARDS(same_sysfs, peer, self->same_sysfs_prev) {
+                k = set_put(siblings, peer);
+                if (k < 0)
+                        return k;
+        }
+
+        *_set = TAKE_PTR(siblings);
+        return 1;
+}
+
+static void device_shutdown(Manager *m) {
+        assert(m);
+        /* Drop the udev monitor and the by-syspath device table, storing back what the release calls return. */
+        m->device_monitor = sd_device_monitor_unref(m->device_monitor);
+        m->devices_by_sysfs = hashmap_free(m->devices_by_sysfs);
+}
+
+static void device_enumerate(Manager *m) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        int r;
+        /* Sets up the udev monitor (unless already present), then walks all devices tagged "systemd" and updates their units. */
+        assert(m);
+
+        if (!m->device_monitor) {
+                r = sd_device_monitor_new(&m->device_monitor);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to allocate device monitor: %m");
+                        goto fail;
+                }
+
+                r = sd_device_monitor_filter_add_match_tag(m->device_monitor, "systemd");
+                if (r < 0) {
+                        log_error_errno(r, "Failed to add udev tag match: %m");
+                        goto fail;
+                }
+
+                r = sd_device_monitor_attach_event(m->device_monitor, m->event);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to attach event to device monitor: %m");
+                        goto fail;
+                }
+
+                r = sd_device_monitor_start(m->device_monitor, device_dispatch_io, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to start device monitor: %m");
+                        goto fail;
+                }
+        }
+        /* Enumerate the devices that carry the "systemd" tag. */
+        r = sd_device_enumerator_new(&e);
+        if (r < 0) {
+                log_error_errno(r, "Failed to allocate device enumerator: %m");
+                goto fail;
+        }
+
+        r = sd_device_enumerator_add_match_tag(e, "systemd");
+        if (r < 0) {
+                log_error_errno(r, "Failed to set tag for device enumeration: %m");
+                goto fail;
+        }
+
+        FOREACH_DEVICE(e, dev) {
+                _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
+                Device *d;
+                /* A device whose units cannot be set up is skipped; the enumeration goes on. */
+                if (device_setup_units(m, dev, &ready_units, &not_ready_units) < 0)
+                        continue;
+
+                SET_FOREACH(d, ready_units)
+                        device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV);
+                SET_FOREACH(d, not_ready_units)
+                        device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV);
+        }
+
+        return;
+
+fail:
+        device_shutdown(m);
+}
+
+static void device_propagate_reload(Manager *m, Device *d) {
+        /* Asks the manager to propagate a reload from this device unit, unless the device is dead.
+         * A failure is logged as a warning and otherwise ignored. */
+        assert(m);
+        assert(d);
+
+        if (d->state != DEVICE_DEAD) {
+                int k = manager_propagate_reload(m, UNIT(d), JOB_REPLACE, NULL);
+
+                if (k < 0)
+                        log_unit_warning_errno(UNIT(d), k, "Failed to propagate reload, ignoring: %m");
+        }
+}
+
+static void device_remove_old_on_move(Manager *m, sd_device *dev) {
+        _cleanup_free_ char *previous_syspath = NULL;
+        const char *previous_devpath;
+        int k;
+        /* On a 'move' uevent: marks the units of the device's former syspath ("/sys" + DEVPATH_OLD) as
+         * not found, for all found bits. If DEVPATH_OLD is unavailable only a debug message is logged. */
+        assert(m);
+        assert(dev);
+
+        k = sd_device_get_property_value(dev, "DEVPATH_OLD", &previous_devpath);
+        if (k < 0)
+                return (void) log_device_debug_errno(dev, k, "Failed to get DEVPATH_OLD= property on 'move' uevent, ignoring: %m");
+        previous_syspath = path_join("/sys", previous_devpath);
+        if (previous_syspath)
+                device_update_found_by_sysfs(m, previous_syspath, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK);
+        else
+                (void) log_oom();
+}
+
+static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata) {
+        _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        sd_device_action_t action;
+        const char *sysfs;
+        bool ready;
+        Device *d;
+        int r;
+        /* Device monitor callback for one uevent: updates the affected .device units and swap state. Always returns 0. */
+        assert(dev);
+
+        log_device_uevent(dev, "Processing udev action");
+
+        r = sd_device_get_syspath(dev, &sysfs);
+        if (r < 0) {
+                log_device_warning_errno(dev, r, "Failed to get device syspath, ignoring: %m");
+                return 0;
+        }
+
+        r = sd_device_get_action(dev, &action);
+        if (r < 0) {
+                log_device_warning_errno(dev, r, "Failed to get udev action, ignoring: %m");
+                return 0;
+        }
+
+        log_device_debug(dev, "Got '%s' action on syspath '%s'.", device_action_to_string(action), sysfs);
+
+        if (action == SD_DEVICE_MOVE)
+                device_remove_old_on_move(m, dev);
+
+        /* When udevd failed to process the device, SYSTEMD_ALIAS or any other properties may contain invalid
+         * values. Let's refuse to handle the uevent. */
+        if (sd_device_get_property_value(dev, "UDEV_WORKER_FAILED", NULL) >= 0) {
+                int v;
+
+                if (device_get_property_int(dev, "UDEV_WORKER_ERRNO", &v) >= 0)
+                        log_device_warning_errno(dev, v, "systemd-udevd failed to process the device, ignoring: %m");
+                else if (device_get_property_int(dev, "UDEV_WORKER_EXIT_STATUS", &v) >= 0)
+                        log_device_warning(dev, "systemd-udevd failed to process the device with exit status %i, ignoring.", v);
+                else if (device_get_property_int(dev, "UDEV_WORKER_SIGNAL", &v) >= 0) {
+                        const char *s = NULL; /* Stays NULL if the name property is missing, so strna() prints "n/a". */
+                        (void) sd_device_get_property_value(dev, "UDEV_WORKER_SIGNAL_NAME", &s);
+                        log_device_warning(dev, "systemd-udevd failed to process the device with signal %i(%s), ignoring.", v, strna(s));
+                } else
+                        log_device_warning(dev, "systemd-udevd failed to process the device with unknown result, ignoring.");
+
+                return 0;
+        }
+
+        /* A change event can signal that a device is becoming ready, in particular if the device is using
+         * the SYSTEMD_READY logic in udev so we need to reach the else block of the following if, even for
+         * change events */
+        ready = device_is_ready(dev);
+
+        (void) device_setup_units(m, dev, &ready_units, &not_ready_units);
+
+        if (action == SD_DEVICE_REMOVE) {
+                r = swap_process_device_remove(m, dev);
+                if (r < 0)
+                        log_device_warning_errno(dev, r, "Failed to process swap device remove event, ignoring: %m");
+        } else if (ready) {
+                r = swap_process_device_new(m, dev);
+                if (r < 0)
+                        log_device_warning_errno(dev, r, "Failed to process swap device new event, ignoring: %m");
+        }
+
+        if (!IN_SET(action, SD_DEVICE_ADD, SD_DEVICE_REMOVE, SD_DEVICE_MOVE))
+                SET_FOREACH(d, ready_units)
+                        device_propagate_reload(m, d);
+
+        if (!set_isempty(ready_units))
+                manager_dispatch_load_queue(m);
+
+        if (action == SD_DEVICE_REMOVE)
+                /* If we get notified that a device was removed by udev, then it's completely gone, hence
+                 * unset all found bits. Note this affects all .device units that still point to the removed
+                 * device. */
+                device_update_found_by_sysfs(m, sysfs, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK);
+
+        /* These devices are found and ready now, set the udev found bit. Note, this is also necessary to do
+         * on remove uevent, as some devlinks may be updated and now point to other device nodes. */
+        SET_FOREACH(d, ready_units)
+                device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV);
+
+        /* These devices may be nominally around, but not ready for us. Hence unset the udev bit, but leave
+         * the rest around. This may be redundant for remove uevent, but should be harmless. */
+        SET_FOREACH(d, not_ready_units)
+                device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV);
+
+        return 0;
+}
+
+void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask) {
+        int r;
+        /* Records non-udev sightings of 'node'; 'mask' must not contain DEVICE_FOUND_UDEV. A no-op without udev or with an empty mask. */
+        assert(m);
+        assert(node);
+        assert(!FLAGS_SET(mask, DEVICE_FOUND_UDEV));
+
+        if (!udev_available())
+                return;
+
+        if (mask == 0)
+                return;
+
+        /* This is called whenever we find a device referenced in /proc/swaps or /proc/self/mountinfo. Such a device might
+         * be mounted/enabled at a time where udev has not finished probing it yet, and we thus haven't learned about
+         * it yet. In this case we will set the device unit to "tentative" state.
+         *
+         * This takes a pair of DeviceFound flags parameters. The 'mask' parameter is a bit mask that indicates which
+         * bits of 'found' to copy into the per-device DeviceFound flags field. Thus, this function may be used to set
+         * and unset individual bits in a single call, while merging partially with previous state. */
+
+        if ((found & mask) != 0) {
+                _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+                /* If the device is known in the kernel and newly appeared, then we'll create a device unit for it,
+                 * under the name referenced in /proc/swaps or /proc/self/mountinfo. But first, let's validate if
+                 * everything is alright with the device node. Note that we're fine with missing device nodes,
+                 * but not with badly set up ones. */
+
+                r = sd_device_new_from_devname(&dev, node);
+                if (r == -ENODEV)
+                        log_debug("Could not find device for %s, continuing without device node", node);
+                else if (r < 0) {
+                        /* Reduce log noise from nodes which are not device nodes by skipping EINVAL. */
+                        if (r != -EINVAL)
+                                log_error_errno(r, "Failed to open %s device, ignoring: %m", node);
+                        return;
+                }
+
+                (void) device_setup_unit(m, dev, node, /* main = */ false, NULL); /* 'dev' may be NULL. */
+        }
+
+        /* Update the device unit's state, should it exist */
+        (void) device_update_found_by_name(m, node, found, mask);
+}
+
+bool device_shall_be_bound_by(Unit *device, Unit *u) {
+        /* Only mount units are ever bound to a device, and only if the device has its bind_mounts
+         * flag set. */
+
+        assert(device);
+        assert(u);
+
+        return u->type == UNIT_MOUNT && DEVICE(device)->bind_mounts;
+}
+
+const UnitVTable device_vtable = {
+        .object_size = sizeof(Device),
+        .sections =
+                "Unit\0"
+                "Device\0"
+                "Install\0",
+
+        .gc_jobs = true,
+        /* Callbacks of the device unit type; all but udev_available() follow this file's device_*() naming. */
+        .init = device_init,
+        .done = device_done,
+        .load = device_load,
+
+        .coldplug = device_coldplug,
+        .catchup = device_catchup,
+
+        .serialize = device_serialize,
+        .deserialize_item = device_deserialize_item,
+
+        .dump = device_dump,
+
+        .active_state = device_active_state,
+        .sub_state_to_string = device_sub_state_to_string,
+
+        .following = device_following,
+        .following_set = device_following_set,
+
+        .enumerate = device_enumerate,
+        .shutdown = device_shutdown,
+        .supported = udev_available,
+        /* NOTE(review): presumably [0] is shown for start jobs and [1] for stop jobs — confirm against UnitVTable. */
+        .status_message_formats = {
+                .starting_stopping = {
+                        [0] = "Expecting device %s...",
+                        [1] = "Waiting for device %s to disappear...",
+                },
+                .finished_start_job = {
+                        [JOB_DONE]       = "Found device %s.",
+                        [JOB_TIMEOUT]    = "Timed out waiting for device %s.",
+                },
+        },
+};
diff --git a/src/core/device.h b/src/core/device.h
new file mode 100644
index 0000000..9dd6fb5
--- /dev/null
+++ b/src/core/device.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Device Device;
+
+/* A mask specifying where we have seen the device currently. This is a bitmask because a device might show up
+ * at these places asynchronously from one another. For example, in a very common case a device might already be
+ * mounted before udev finished probing it (think: a script setting up a loopback block device, formatting it and
+ * mounting it in quick succession). Hence we need to track precisely where it is already visible and where not. */
+typedef enum DeviceFound {
+        DEVICE_NOT_FOUND   = 0,
+        DEVICE_FOUND_UDEV  = 1 << 0, /* The device has shown up in the udev database */
+        DEVICE_FOUND_MOUNT = 1 << 1, /* The device has shown up in /proc/self/mountinfo */
+        DEVICE_FOUND_SWAP  = 1 << 2, /* The device has shown up in /proc/swaps */
+        DEVICE_FOUND_MASK  = DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP, /* All of the bits above */
+} DeviceFound;
+
+struct Device {
+        Unit meta;
+
+        char *sysfs, *deserialized_sysfs; /* sysfs path of the device. NOTE(review): deserialized_* presumably hold values read back after reload/reexec — confirm */
+        char *path; /* syspath, device node, alias, or devlink */
+
+        /* In order to be able to distinguish dependencies on different device nodes we might end up creating multiple
+         * devices for the same sysfs path. We chain them up here. */
+        LIST_FIELDS(struct Device, same_sysfs);
+
+        DeviceState state, deserialized_state;
+        DeviceFound found, deserialized_found, enumerated_found;
+
+        bool bind_mounts; /* Result of the last SYSTEMD_MOUNT_DEVICE_BOUND lookup; queried via device_shall_be_bound_by() */
+
+        /* The SYSTEMD_WANTS udev property for this device the last time we saw it */
+        char **wants_property;
+};
+
+extern const UnitVTable device_vtable;
+/* Copies the bits of 'found' selected by 'mask' into the device unit for 'node'; 'mask' must not contain DEVICE_FOUND_UDEV. */
+void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask);
+bool device_shall_be_bound_by(Unit *device, Unit *u); /* True only if 'u' is a mount unit and 'device' has bind_mounts set */
+
+DEFINE_CAST(DEVICE, Device);
diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c
new file mode 100644
index 0000000..12724c6
--- /dev/null
+++ b/src/core/dynamic-user.c
@@ -0,0 +1,871 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "clean-ipc.h"
+#include "dynamic-user.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "iovec-util.h"
+#include "lock-util.h"
+#include "nscd-flush.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "socket-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+
+/* Maps a value generated randomly or by hashing into the range DYNAMIC_UID_MIN..DYNAMIC_UID_MAX (after casting it to uid_t) */
+#define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN)
+
+DEFINE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user); /* presumably defines dynamic_user_ref(), which dynamic_creds_make() calls */
+
+DynamicUser* dynamic_user_free(DynamicUser *d) {
+        Manager *owner = d ? d->manager : NULL;
+
+        if (!d) /* freeing NULL is a no-op */
+                return NULL;
+        if (owner) /* unregister from the manager's name-keyed table of dynamic users */
+                (void) hashmap_remove(owner->dynamic_users, d->name);
+        safe_close_pair(d->storage_socket); /* closes both ends of the storage socket pair */
+        return mfree(d);
+}
+
+static int dynamic_user_add(Manager *m, const char *name, int storage_socket[static 2], DynamicUser **ret) {
+        DynamicUser *user;
+        size_t name_len;
+        int r;
+
+        /* Allocates a DynamicUser for 'name' around the two passed fds and, if a manager is given, registers it in
+         * m->dynamic_users. Neither closes the fds on failure nor increments n_ref: both are up to the caller. */
+
+        assert(m || ret);
+        assert(name);
+        assert(storage_socket);
+
+        if (m) { /* Might be called in sd-executor with no manager object */
+                r = hashmap_ensure_allocated(&m->dynamic_users, &string_hash_ops);
+                if (r < 0)
+                        return r;
+        }
+
+        name_len = strlen(name);
+        user = malloc0(offsetof(DynamicUser, name) + name_len + 1);
+        if (!user)
+                return -ENOMEM;
+
+        memcpy(user->name, name, name_len + 1); /* the copy includes the trailing NUL */
+        user->storage_socket[0] = storage_socket[0];
+        user->storage_socket[1] = storage_socket[1];
+        user->manager = m;
+
+        r = m ? hashmap_put(m->dynamic_users, user->name, user) : 0;
+        if (r < 0) {
+                free(user);
+                return r;
+        }
+
+        if (ret)
+                *ret = user;
+        return 0;
+}
+
+static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) {
+        _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+        DynamicUser *d;
+        bool created = false;
+        int r;
+
+        assert(m);
+        assert(name);
+
+        /* Return the DynamicUser structure for a specific user name. Note that this won't actually allocate a UID for
+         * it, but just prepare the data structure for it. The UID is allocated only on demand, when it's really
+         * needed, and in the child process we fork off, since allocation involves NSS checks which are not OK to do
+         * from PID 1. To allow the children and PID 1 share information about allocated UIDs we use an anonymous
+         * AF_UNIX/SOCK_DGRAM socket (called the "storage socket") that contains at most one datagram with the
+         * allocated UID number, plus an fd referencing the lock file for the UID
+         * (i.e. /run/systemd/dynamic-uid/$UID). Why involve the socket pair? So that PID 1 and all its children can
+         * share the same storage for the UID and lock fd, simply by inheriting the storage socket fds. The socket pair
+         * may exist in three different states:
+         *
+         * a) no datagram stored. This is the initial state. In this case the dynamic user was never realized.
+         *
+         * b) a datagram containing a UID stored, but no lock fd attached to it. In this case there was already a
+         *    statically assigned UID by the same name, which we are reusing.
+         *
+         * c) a datagram containing a UID stored, and a lock fd is attached to it. In this case we allocated a dynamic
+         *    UID and locked it in the file system, using the lock fd.
+         *
+         * As PID 1 and various children might access the socket pair simultaneously, and pop the datagram or push it
+         * back in any time, we also maintain a lock on the socket pair. Note one peculiarity regarding locking here:
+         * the UID lock on disk is protected via a BSD file lock (i.e. an fd-bound lock), so that the lock is kept in
+         * place as long as there's a reference to the fd open. The lock on the storage socket pair however is a POSIX
+         * file lock (i.e. a process-bound lock), as all users share the same fd of this (after all it is anonymous,
+         * nobody else could get any access to it except via our own fd) and we want to synchronize access between all
+         * processes that have access to it. */
+
+        d = hashmap_get(m->dynamic_users, name);
+        if (!d) {
+                /* No structure for this name yet: validate the name, create the storage socket pair, register. */
+                if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC))
+                        return -EINVAL;
+
+                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0)
+                        return -errno;
+
+                r = dynamic_user_add(m, name, pair, &d);
+                if (r < 0)
+                        return r;
+
+                /* The fds belong to the new object now, hence disarm the automatic cleanup. */
+                pair[0] = pair[1] = -EBADF;
+
+                created = true;
+        }
+
+        if (ret) {
+                /* Hand out a reference, for freshly allocated and for reused structures alike. */
+                d->n_ref++;
+                *ret = d;
+        }
+
+        /* 1 if we allocated a new structure, 0 if an existing one was reused */
+        return created;
+}
+
+static int make_uid_symlinks(uid_t uid, const char *name, bool b) {
+
+        char by_uid[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1];
+        const char *by_name, *uid_str;
+        int ret = 0, q;
+
+        /* Maintains the two "direct:" symlinks that allow lookups of dynamic UIDs and their names by userspace code
+         * that cannot go via D-Bus (dbus-daemon would be its own client then). We can't use the lock files for this,
+         * since we take BSD locks on them, and as those may be taken by any user with read access the lock files can't
+         * be world-readable. Existing symlinks are always removed first, and only recreated if 'b' is true.
+         *
+         * Returns 0 on success, otherwise the first error that was encountered, as negative errno. */
+
+        xsprintf(by_uid, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid);
+        uid_str = by_uid + STRLEN("/run/systemd/dynamic-uid/direct:"); /* just the formatted UID */
+        by_name = strjoina("/run/systemd/dynamic-uid/direct:", name);
+
+        /* The link named after the UID, pointing to the name */
+        if (unlink(by_uid) < 0 && errno != ENOENT)
+                ret = -errno;
+
+        if (b && symlink(name, by_uid) < 0) {
+                q = log_warning_errno(errno, "Failed to symlink \"%s\": %m", by_uid);
+                if (!ret)
+                        ret = q;
+        }
+
+        /* The link named after the name, pointing to the UID */
+        if (unlink(by_name) < 0 && errno != ENOENT && !ret)
+                ret = -errno;
+        if (b && symlink(uid_str, by_name) < 0) {
+                q = log_warning_errno(errno, "Failed to symlink \"%s\": %m", by_name);
+                if (!ret)
+                        ret = q;
+        }
+
+        return ret;
+}
+
+static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) {
+
+        /* Find a suitable free UID and lock it. On success the fd of the locked lock file is returned (>= 0) and the
+         * UID is stored in *ret_uid; on failure a negative errno-style error is returned. We use the following
+         * strategy to find a suitable UID:
+         *
+         * 1. Initially, we try to read the UID of a number of specified paths. If any of these UIDs works, we use
+         *    them. We do this in order to increase the chance of UID reuse, if StateDirectory=, CacheDirectory= or
+         *    LogsDirectory= are used, as reusing the UID these directories are owned by saves us from having to
+         *    recursively chown() them to new users.
+         * 2. If that didn't yield a currently unused UID, we hash the user name, and try to use that. This should be
+         *    pretty good, as the user is by default derived from the unit name, and hence the same service and same
+         *    user should usually get the same UID as long as our hashing doesn't clash.
+         * 3. Finally, if that didn't work, we randomly pick UIDs, until we find one that is empty.
+         * Since the dynamic UID space is relatively small we give up with -EBUSY once n_tries (initially 100) hits 0. */
+
+        enum {
+                PHASE_SUGGESTED,  /* the first phase, reusing directory ownership UIDs */
+                PHASE_HASHED,     /* the second phase, deriving a UID from the username by hashing */
+                PHASE_RANDOM,     /* the last phase, randomly picking UIDs */
+        } phase = PHASE_SUGGESTED;
+
+        static const uint8_t hash_key[] = {
+                0x37, 0x53, 0x7e, 0x31, 0xcf, 0xce, 0x48, 0xf5,
+                0x8a, 0xbb, 0x39, 0x57, 0x8d, 0xd9, 0xec, 0x59
+        };
+
+        unsigned n_tries = 100, current_suggested = 0;
+        int r;
+
+        (void) mkdir("/run/systemd/dynamic-uid", 0755); /* result deliberately ignored */
+
+        for (;;) {
+                char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+                _cleanup_close_ int lock_fd = -EBADF;
+                uid_t candidate;
+                ssize_t l;
+
+                if (--n_tries <= 0) /* Give up retrying eventually */
+                        return -EBUSY;
+
+                switch (phase) {
+
+                case PHASE_SUGGESTED: {
+                        struct stat st;
+
+                        if (!suggested_paths || !suggested_paths[current_suggested]) {
+                                /* We reached the end of the suggested paths list, let's try by hashing the name */
+                                phase = PHASE_HASHED;
+                                continue;
+                        }
+
+                        if (stat(suggested_paths[current_suggested++], &st) < 0)
+                                continue; /* We can't read the UID of this path, but that doesn't matter, just try the next */
+
+                        candidate = st.st_uid;
+                        break;
+                }
+
+                case PHASE_HASHED:
+                        /* A static user by this name does not exist yet. Let's find a free ID then, and use that. We
+                         * start with a UID generated as hash from the user name. */
+                        candidate = UID_CLAMP_INTO_RANGE(siphash24(name, strlen(name), hash_key));
+
+                        /* If this one fails, we should proceed with random tries */
+                        phase = PHASE_RANDOM;
+                        break;
+
+                case PHASE_RANDOM:
+
+                        /* Pick another random UID, and see if that works for us. */
+                        random_bytes(&candidate, sizeof(candidate));
+                        candidate = UID_CLAMP_INTO_RANGE(candidate);
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+
+                /* Make sure whatever we picked here actually is in the right range */
+                if (!uid_is_dynamic(candidate))
+                        continue;
+
+                xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, candidate);
+
+                for (;;) {
+                        struct stat st;
+
+                        lock_fd = open(lock_path, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600);
+                        if (lock_fd < 0)
+                                return -errno;
+
+                        r = flock(lock_fd, LOCK_EX|LOCK_NB); /* Try to get a BSD file lock on the UID lock file */
+                        if (r < 0) {
+                                if (IN_SET(errno, EBUSY, EAGAIN))
+                                        goto next; /* already in use */
+
+                                return -errno;
+                        }
+
+                        if (fstat(lock_fd, &st) < 0)
+                                return -errno;
+                        if (st.st_nlink > 0) /* the file we locked is still linked in the file system */
+                                break;
+
+                        /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and
+                         * got the lock. Close it, and try again. */
+                        lock_fd = safe_close(lock_fd);
+                }
+
+                /* Some superficial check whether this UID/GID might already be taken by some static user */
+                if (getpwuid(candidate) ||
+                    getgrgid((gid_t) candidate) ||
+                    search_ipc(candidate, (gid_t) candidate) != 0) {
+                        (void) unlink(lock_path);
+                        continue;
+                }
+
+                /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */
+                l = pwritev(lock_fd,
+                            (struct iovec[2]) {
+                                    IOVEC_MAKE_STRING(name),
+                                    IOVEC_MAKE((char[1]) { '\n' }, 1),
+                            }, 2, 0);
+                if (l < 0) {
+                        r = -errno;
+                        (void) unlink(lock_path);
+                        return r;
+                }
+
+                (void) ftruncate(lock_fd, l); /* cut off anything beyond what we just wrote */
+                (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */
+
+                *ret_uid = candidate;
+                return TAKE_FD(lock_fd);
+
+        next:
+                ;
+        }
+}
+
+static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) {
+        uid_t stored_uid = UID_INVALID;
+        struct iovec iov = IOVEC_MAKE(&stored_uid, sizeof(stored_uid));
+        ssize_t n;
+        int fd;
+
+        assert(d);
+        assert(ret_uid);
+        assert(ret_lock_fd);
+
+        /* Takes the UID and the lock fd out of the storage AF_UNIX socket, without blocking. Call this with the
+         * lock on the socket taken. Returns 0 on success, or the negative error of receive_one_fd_iov(); the
+         * callers in this file treat -EAGAIN as "nothing stored". */
+
+        n = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &fd);
+        if (n >= 0) {
+                *ret_uid = stored_uid;
+                *ret_lock_fd = fd;
+                return 0;
+        }
+        return (int) n;
+}
+
+static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) {
+        struct iovec datagram = IOVEC_MAKE(&uid, sizeof uid);
+
+        assert(d);
+
+        /* Queue the UID, along with lock_fd, on the storage socket. The caller should hold the socket pair lock. */
+        return send_one_fd_iov(d->storage_socket[1], lock_fd, &datagram, 1, MSG_DONTWAIT);
+}
+
+static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) {
+        char path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+
+        /* Without a valid lock fd there is no lock file of ours for this UID, and hence nothing to remove. */
+        if (lock_fd >= 0) {
+                xsprintf(path, "/run/systemd/dynamic-uid/" UID_FMT, uid);
+                (void) unlink(path);
+
+                (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */
+        }
+}
+
+static int dynamic_user_realize(
+                DynamicUser *d,
+                char **suggested_dirs,
+                uid_t *ret_uid, gid_t *ret_gid,
+                bool is_user) {
+
+        _cleanup_close_ int uid_lock_fd = -EBADF;
+        _cleanup_close_ int etc_passwd_lock_fd = -EBADF;
+        uid_t num = UID_INVALID; /* a uid if is_user, and a gid otherwise */
+        gid_t gid = GID_INVALID; /* a gid if is_user, ignored otherwise */
+        bool flush_cache = false;
+        int r;
+
+        assert(d);
+        assert(is_user == !!ret_uid);
+        assert(ret_gid);
+
+        /* Acquire a UID (or, if !is_user, a GID) for the name. This will allocate a dynamic one if no user/group by this
+         * name exists yet, otherwise the existing UID/GID is reused. Returns 0 on success, negative errno on failure. */
+
+        r = posix_lock(d->storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return r;
+
+        CLEANUP_POSIX_UNLOCK(d->storage_socket[0]);
+
+        r = dynamic_user_pop(d, &num, &uid_lock_fd);
+        if (r < 0) {
+                int new_uid_lock_fd;
+                uid_t new_uid;
+
+                if (r != -EAGAIN)
+                        return r;
+
+                /* OK, nothing stored yet, let's try to find something useful. While we are working on this, release
+                 * the lock however, so that nobody else blocks on our NSS lookups. */
+                r = posix_lock(d->storage_socket[0], LOCK_UN);
+                if (r < 0)
+                        return r;
+
+                /* Let's see if a proper, static user or group by this name exists. Try to take the lock on
+                 * /etc/passwd, if that fails with EROFS then /etc is read-only. In that case it's fine if we don't
+                 * take the lock, given that users can't be added there anyway in this case. */
+                etc_passwd_lock_fd = take_etc_passwd_lock(NULL);
+                if (etc_passwd_lock_fd < 0 && etc_passwd_lock_fd != -EROFS)
+                        return etc_passwd_lock_fd;
+
+                /* First, let's parse this as numeric UID */
+                r = parse_uid(d->name, &num);
+                if (r < 0) {
+                        struct passwd *p;
+                        struct group *g;
+
+                        if (is_user) {
+                                /* OK, this is not a numeric UID. Let's see if there's a user by this name */
+                                p = getpwnam(d->name);
+                                if (p) {
+                                        num = p->pw_uid;
+                                        gid = p->pw_gid;
+                                } else {
+                                        /* if the user does not exist but the group with the same name exists, refuse operation */
+                                        g = getgrnam(d->name);
+                                        if (g)
+                                                return -EILSEQ;
+                                }
+                        } else {
+                                /* Let's see if there's a group by this name */
+                                g = getgrnam(d->name);
+                                if (g)
+                                        num = (uid_t) g->gr_gid;
+                                else {
+                                        /* if the group does not exist but the user with the same name exists, refuse operation */
+                                        p = getpwnam(d->name);
+                                        if (p)
+                                                return -EILSEQ;
+                                }
+                        }
+                }
+
+                if (num == UID_INVALID) {
+                        /* No static UID assigned yet, excellent. Let's pick a new dynamic one, and lock it. */
+
+                        uid_lock_fd = pick_uid(suggested_dirs, d->name, &num);
+                        if (uid_lock_fd < 0)
+                                return uid_lock_fd;
+                }
+
+                /* So, we found a working UID/lock combination. Let's see if we actually still need it. */
+                r = posix_lock(d->storage_socket[0], LOCK_EX);
+                if (r < 0) {
+                        unlink_uid_lock(uid_lock_fd, num, d->name);
+                        return r;
+                }
+
+                r = dynamic_user_pop(d, &new_uid, &new_uid_lock_fd);
+                if (r < 0) {
+                        if (r != -EAGAIN) {
+                                /* OK, something bad happened, let's get rid of the bits we acquired. */
+                                unlink_uid_lock(uid_lock_fd, num, d->name);
+                                return r;
+                        }
+
+                        /* Great! Nothing is stored here, still. Store our newly acquired data. */
+                        flush_cache = true;
+                } else {
+                        /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we
+                         * acquired, and use what's stored now. */
+
+                        unlink_uid_lock(uid_lock_fd, num, d->name);
+                        safe_close(uid_lock_fd);
+
+                        num = new_uid;
+                        uid_lock_fd = new_uid_lock_fd;
+                }
+        } else if (is_user && !uid_is_dynamic(num)) {
+                struct passwd *p;
+
+                /* Statically allocated user may have different uid and gid. So, let's obtain the gid. */
+                errno = 0;
+                p = getpwuid(num);
+                if (!p)
+                        return errno_or_else(ESRCH);
+
+                gid = p->pw_gid;
+        }
+
+        /* If the UID/GID was already allocated dynamically, push the data we popped out back in. If it was already
+         * allocated statically, push the UID back too, but do not push the lock fd in. If we allocated the UID
+         * dynamically right here, push that in along with the lock fd for it. */
+        r = dynamic_user_push(d, num, uid_lock_fd);
+        if (r < 0)
+                return r;
+
+        if (flush_cache) {
+                /* If we allocated a new dynamic UID, refresh nscd, so that it forgets about potentially cached
+                 * negative entries. But let's do so after we release the /etc/passwd lock, so that there's no
+                 * potential for nscd wanting to lock that for completing the invalidation. */
+                etc_passwd_lock_fd = safe_close(etc_passwd_lock_fd);
+                (void) nscd_flush_cache(STRV_MAKE("passwd", "group"));
+        }
+
+        if (is_user) {
+                *ret_uid = num;
+                *ret_gid = gid != GID_INVALID ? gid : num; /* no separate GID known: use the UID value as GID */
+        } else
+                *ret_gid = num;
+
+        return 0;
+}
+
+int dynamic_user_current(DynamicUser *d, uid_t *ret) {
+        _cleanup_close_ int stored_fd = -EBADF;
+        uid_t stored_uid;
+        int r;
+
+        assert(d);
+
+        /* Reports the UID currently assigned to the user, if there's any: with the storage socket locked we pop
+         * the stored data and push it back in right-away. 'ret' may be NULL if the caller is not interested in
+         * the value itself. Returns 0 on success and a negative errno-style error otherwise, as propagated
+         * from posix_lock(), dynamic_user_pop() or dynamic_user_push(). */
+
+        r = posix_lock(d->storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return r;
+
+        CLEANUP_POSIX_UNLOCK(d->storage_socket[0]);
+
+        r = dynamic_user_pop(d, &stored_uid, &stored_fd);
+        if (r >= 0) /* got something, put it back */
+                r = dynamic_user_push(d, stored_uid, stored_fd);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                *ret = stored_uid;
+
+        return 0;
+}
+
+static DynamicUser* dynamic_user_unref(DynamicUser *d) {
+        /* Drops one reference, and nothing else: no resources are released here. If a dynamic user should be
+         * fully destroyed and its UID released, use dynamic_user_destroy() instead. NB: the dynamic user table
+         * may contain entries with no references, which is commonly the case right before a daemon reload.
+         * Always returns NULL; passing NULL is allowed. */
+
+        if (d) {
+                assert(d->n_ref > 0);
+                d->n_ref--;
+        }
+
+        return NULL;
+}
+
+static int dynamic_user_close(DynamicUser *d) {
+        _cleanup_close_ int stored_fd = -EBADF;
+        uid_t stored_uid;
+        int r;
+
+        /* Release the user ID, by emptying the storage socket and releasing the lock on the UID. After this the
+         * user is unrealized again, much like it was right after the DynamicUser object was first allocated.
+         * Returns 1 if something was released, 0 if the user was not realized, negative errno on failure. */
+
+        r = posix_lock(d->storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return r;
+
+        CLEANUP_POSIX_UNLOCK(d->storage_socket[0]);
+
+        r = dynamic_user_pop(d, &stored_uid, &stored_fd);
+        if (r < 0)
+                return r == -EAGAIN ? 0 : r; /* -EAGAIN: user wasn't realized yet, nothing to do */
+
+        /* The user was realized. If a lock fd was stored along with the UID, remove the lock file and the lookup
+         * symlinks; unlink_uid_lock() does nothing if there is no lock fd. */
+        unlink_uid_lock(stored_fd, stored_uid, d->name);
+
+        (void) nscd_flush_cache(STRV_MAKE("passwd", "group"));
+
+        return 1;
+}
+
+static DynamicUser* dynamic_user_destroy(DynamicUser *d) {
+        /* Drops a reference to a DynamicUser object, and destroys the user completely if this was the last one.
+         * This is called whenever a service is shut down and wants its dynamic UID gone. Compare with
+         * dynamic_user_unref(), which is called whenever a service is simply freed, for example during a reload
+         * cycle, where the dynamic users should not be destroyed, but our datastructures should. */
+
+        if (!d)
+                return NULL;
+
+        dynamic_user_unref(d);
+        if (d->n_ref == 0) { /* that was the last reference */
+                (void) dynamic_user_close(d);
+                (void) dynamic_user_free(d);
+        }
+
+        return NULL;
+}
+
+int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds) {
+        int copies[2];
+
+        assert(key);
+        assert(f);
+        assert(fds);
+
+        /* Writes one "<name> <fd> <fd>" item under 'key', after placing duplicates of both storage socket
+         * fds in 'fds'. A NULL object, or one whose socket pair is not fully open, is silently skipped.
+         * Returns 0 on success or skip, or the negative error of the fd duplication. */
+
+        if (!d || d->storage_socket[0] < 0 || d->storage_socket[1] < 0)
+                return 0;
+
+        for (size_t i = 0; i < 2; i++) {
+                copies[i] = fdset_put_dup(fds, d->storage_socket[i]);
+                if (copies[i] < 0)
+                        return log_error_errno(copies[i], "Failed to add dynamic user storage fd to serialization: %m");
+        }
+
+        /* The result of the write itself is deliberately ignored */
+        (void) serialize_item_format(f, key, "%s %i %i", d->name, copies[0], copies[1]);
+
+        return 0;
+}
+
+int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds) {
+        DynamicUser *user;
+
+        assert(m);
+
+        /* Dump the dynamic user database into the manager serialization, to deal with daemon reloads. Failures
+         * to serialize individual entries are ignored, hence this always returns 0. */
+        HASHMAP_FOREACH(user, m->dynamic_users) {
+                (void) dynamic_user_serialize_one(user, "dynamic-user", f, fds);
+        }
+        return 0;
+}
+
+void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret) {
+        _cleanup_free_ char *name = NULL, *s0 = NULL, *s1 = NULL;
+        _cleanup_close_ int fd0 = -EBADF, fd1 = -EBADF;
+        int r;
+
+        assert(value);
+        assert(fds);
+
+        /* Parse one serialized "<name> <fd> <fd>" item again, after a daemon reload, and recreate the object
+         * from it. Nothing is returned: on any failure the item is simply dropped. */
+
+        r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL);
+        if (r != 3 || !isempty(value)) {
+                log_debug("Unable to parse dynamic user line.");
+                return;
+        }
+
+        /* Both fds are needed; the second one is only looked up if the first one could be acquired */
+        fd0 = deserialize_fd(fds, s0);
+        if (fd0 >= 0)
+                fd1 = deserialize_fd(fds, s1);
+        if (fd0 < 0 || fd1 < 0)
+                return;
+
+        r = dynamic_user_add(m, name, (int[]) { fd0, fd1 }, ret);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to add dynamic user: %m");
+                return;
+        }
+
+        /* The new object owns the fds now, hence disarm the automatic cleanup */
+        fd0 = fd1 = -EBADF;
+
+        if (ret) /* If the caller uses it directly, increment the refcount */
+                (*ret)->n_ref++;
+}
+
+void dynamic_user_vacuum(Manager *m, bool close_user) {
+        DynamicUser *user;
+
+        assert(m);
+
+        /* Empty the dynamic user database of all entries to which no reference exists. If 'close_user' is set
+         * such orphaned dynamic users are also cleaned up, i.e. their UID is released, before the entry is
+         * freed. This is called after a daemon reload finished, in order to destroy users which might not be
+         * referenced anymore. */
+
+        HASHMAP_FOREACH(user, m->dynamic_users) {
+                if (user->n_ref == 0) { /* entries that are still referenced are left alone */
+                        if (close_user) {
+                                log_debug("Removing orphaned dynamic user %s", user->name);
+                                (void) dynamic_user_close(user);
+                        }
+
+                        dynamic_user_free(user);
+                }
+        }
+}
+
+int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret) {
+        char path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+        _cleanup_free_ char *found_name = NULL;
+        uid_t confirmed;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        /* A friendly way to translate a dynamic user's UID into a name. On success 0 is returned and *ret is a
+         * newly allocated string owned by the caller. Returns -ESRCH if there is no such dynamic user. */
+        if (!uid_is_dynamic(uid))
+                return -ESRCH;
+
+        xsprintf(path, "/run/systemd/dynamic-uid/" UID_FMT, uid);
+        r = read_one_line_file(path, &found_name);
+        if (r == -ENOENT || r == 0) /* no lock file, or (presumably) nothing to read in it */
+                return -ESRCH;
+        if (r < 0)
+                return r;
+
+        /* The lock file might be stale, hence let's verify the data before we return it */
+        r = dynamic_user_lookup_name(m, found_name, &confirmed);
+        if (r < 0)
+                return r;
+        if (confirmed != uid) /* lock file doesn't match our own idea */
+                return -ESRCH;
+
+        *ret = TAKE_PTR(found_name);
+        return 0;
+}
+
+int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) {
+        DynamicUser *user;
+        int r;
+
+        assert(m);
+        assert(name);
+
+        /* A friendly call for translating a dynamic user's name into its UID. Returns -ESRCH if the name is
+         * not known to the manager, or if no UID is stored for it at the moment. */
+
+        user = hashmap_get(m->dynamic_users, name);
+        if (!user)
+                return -ESRCH;
+
+        r = dynamic_user_current(user, ret);
+
+        /* -EAGAIN: not realized yet */
+        return r == -EAGAIN ? -ESRCH : r;
+}
+
+int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret) {
+        _cleanup_(dynamic_creds_unrefp) DynamicCreds *pair = NULL;
+        bool have_user = false, share;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        /* A DynamicUser object encapsulates an allocation of both a UID and a GID for a specific name. However, some
+         * services use different user and groups. For cases like that there's DynamicCreds containing a pair of user
+         * and group. This call allocates a pair, or returns NULL in *ret if neither a user nor a group is requested. */
+
+        if (!user && !group) {
+                *ret = NULL;
+                return 0;
+        }
+
+        pair = new0(DynamicCreds, 1);
+        if (!pair)
+                return -ENOMEM;
+
+        if (user) {
+                r = dynamic_user_acquire(m, user, &pair->user);
+                if (r < 0)
+                        return r;
+                have_user = true;
+        }
+
+        /* One and the same object serves as user and group if no group of a different name is requested. */
+        share = pair->user && (!group || streq_ptr(user, group));
+        if (share)
+                pair->group = dynamic_user_ref(pair->user);
+        else if (group) {
+                r = dynamic_user_acquire(m, group, &pair->group);
+                if (r < 0) {
+                        if (have_user)
+                                pair->user = dynamic_user_unref(pair->user);
+                        return r;
+                }
+        }
+
+        *ret = TAKE_PTR(pair);
+        return 0;
+}
+
+int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid) {
+        uid_t realized_uid = UID_INVALID;
+        gid_t realized_gid = GID_INVALID;
+        int r;
+
+        assert(creds);
+        assert(uid);
+        assert(gid);
+
+        /* Realize the user side first (it yields both a UID and a GID), then the group side, unless both are the
+         * same object. Whatever side is not set is reported as UID_INVALID/GID_INVALID. */
+
+        r = creds->user ? dynamic_user_realize(creds->user, suggested_paths, &realized_uid, &realized_gid, true) : 0;
+        if (r < 0)
+                return r;
+
+        if (creds->group && creds->group != creds->user) {
+                r = dynamic_user_realize(creds->group, suggested_paths, NULL, &realized_gid, false);
+                if (r < 0)
+                        return r;
+        }
+
+        *uid = realized_uid;
+        *gid = realized_gid;
+
+        return 0;
+}
+
+DynamicCreds* dynamic_creds_unref(DynamicCreds *creds) {
+        /* Drops our references and frees the pair; the DynamicUser objects themselves are not released here. */
+        if (creds) {
+                creds->user = dynamic_user_unref(creds->user);
+                creds->group = dynamic_user_unref(creds->group);
+        }
+
+        return mfree(creds);
+}
+
+DynamicCreds* dynamic_creds_destroy(DynamicCreds *creds) {
+        /* Drops our references and frees the pair; each side is destroyed once its last reference is gone. */
+        if (creds) {
+                creds->user = dynamic_user_destroy(creds->user);
+                creds->group = dynamic_user_destroy(creds->group);
+        }
+
+        return mfree(creds);
+}
+
+void dynamic_creds_done(DynamicCreds *creds) {
+        if (creds) {
+                /* User and group may be one and the same object: make sure it is freed only once. */
+                if (creds->user != creds->group)
+                        (void) dynamic_user_free(creds->group);
+                creds->user = creds->group = dynamic_user_free(creds->user);
+        }
+}
+
+void dynamic_creds_close(DynamicCreds *creds) {
+        if (!creds)
+                return;
+
+        /* Close the storage sockets of both sides, but those of a shared object only once. */
+        if (creds->user)
+                safe_close_pair(creds->user->storage_socket);
+        if (creds->group != creds->user && creds->group)
+                safe_close_pair(creds->group->storage_socket);
+}
diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h
new file mode 100644
index 0000000..303a7d0
--- /dev/null
+++ b/src/core/dynamic-user.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct DynamicUser DynamicUser;
+
+typedef struct DynamicCreds {
+        /* A combination of a dynamic user and group; dynamic_creds_realize()/_close() skip NULL entries */
+        DynamicUser *user;
+        DynamicUser *group; /* may be the very same object as 'user', the helpers check for that */
+} DynamicCreds;
+
+#include "manager.h"
+
+/* Note that this object always allocates a pair of user and group under the same name, even if one of them isn't
+ * used. This means, if you want to allocate a group and user pair, and they might have two different names, then you
+ * need to allocate two of these objects. DynamicCreds above makes that easy. */
+struct DynamicUser {
+        Manager *manager;
+        unsigned n_ref;
+
+        /* An AF_UNIX socket pair that contains a datagram containing both the numeric ID assigned, as well as a lock
+         * file fd locking the user ID we picked. */
+        int storage_socket[2]; /* closed via safe_close_pair() in dynamic_creds_close() */
+
+        char name[]; /* flexible array member, hence has to remain the last field */
+};
+
+int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds);
+int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds);
+void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret);
+DynamicUser* dynamic_user_free(DynamicUser *d);
+void dynamic_user_vacuum(Manager *m, bool close_user);
+
+int dynamic_user_current(DynamicUser *d, uid_t *ret);
+int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret);
+int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret);
+
+int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret);
+int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid);
+
+DynamicCreds *dynamic_creds_unref(DynamicCreds *creds);
+DynamicCreds *dynamic_creds_destroy(DynamicCreds *creds);
+void dynamic_creds_done(DynamicCreds *creds);
+void dynamic_creds_close(DynamicCreds *creds);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_destroy);
+
+DynamicUser *dynamic_user_ref(DynamicUser *user);
diff --git a/src/core/efi-random.c b/src/core/efi-random.c
new file mode 100644
index 0000000..dffde57
--- /dev/null
+++ b/src/core/efi-random.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "chattr-util.h"
+#include "efi-random.h"
+#include "efivars.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "random-util.h"
+#include "strv.h"
+
+void lock_down_efi_variables(void) {
+        _cleanup_close_ int token_fd = -EBADF;
+        int r;
+
+        token_fd = open(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderSystemToken)), O_RDONLY|O_CLOEXEC);
+        if (token_fd < 0 && errno == ENOENT)
+                return; /* the variable does not exist, nothing to lock down */
+        if (token_fd < 0) {
+                log_warning_errno(errno, "Unable to open LoaderSystemToken EFI variable, ignoring: %m");
+                return;
+        }
+
+        /* Paranoia: tighten the access mode so that unprivileged users can't use the variable to identify the
+         * system or learn what we might have credited to the entropy pool. Best effort, failures are only logged. */
+        r = chattr_fd(token_fd, 0, FS_IMMUTABLE_FL, NULL);
+        if (r < 0)
+                log_warning_errno(r, "Failed to drop FS_IMMUTABLE_FL from LoaderSystemToken EFI variable, ignoring: %m");
+        if (fchmod(token_fd, 0600) < 0)
+                log_warning_errno(errno, "Failed to reduce access mode of LoaderSystemToken EFI variable, ignoring: %m");
+}
diff --git a/src/core/efi-random.h b/src/core/efi-random.h
new file mode 100644
index 0000000..87166c9
--- /dev/null
+++ b/src/core/efi-random.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+void lock_down_efi_variables(void);
diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c
new file mode 100644
index 0000000..e2cd931
--- /dev/null
+++ b/src/core/emergency-action.c
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "bus-error.h"
+#include "bus-util.h"
+#include "emergency-action.h"
+#include "raw-reboot.h"
+#include "reboot-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "terminal-util.h"
+#include "virt.h"
+
+static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = { /* names, indexed by EmergencyAction */
+        [EMERGENCY_ACTION_NONE] =               "none",
+        [EMERGENCY_ACTION_REBOOT] =             "reboot",
+        [EMERGENCY_ACTION_REBOOT_FORCE] =       "reboot-force",
+        [EMERGENCY_ACTION_REBOOT_IMMEDIATE] =   "reboot-immediate",
+        [EMERGENCY_ACTION_POWEROFF] =           "poweroff",
+        [EMERGENCY_ACTION_POWEROFF_FORCE] =     "poweroff-force",
+        [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate",
+        [EMERGENCY_ACTION_EXIT] =               "exit",
+        [EMERGENCY_ACTION_EXIT_FORCE] =         "exit-force",
+        [EMERGENCY_ACTION_SOFT_REBOOT] =        "soft-reboot",
+        [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] =  "soft-reboot-force",
+        [EMERGENCY_ACTION_KEXEC] =              "kexec",
+        [EMERGENCY_ACTION_KEXEC_FORCE] =        "kexec-force",
+        [EMERGENCY_ACTION_HALT] =               "halt",
+        [EMERGENCY_ACTION_HALT_FORCE] =         "halt-force",
+        [EMERGENCY_ACTION_HALT_IMMEDIATE] =     "halt-immediate",
+};
+
+static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) {
+        /* Always logs "<message>: <reason>"; the manager status message is added for warnings only. */
+        log_full(warn ? LOG_WARNING : LOG_DEBUG, "%s: %s", message, reason);
+        if (!warn)
+                return;
+        manager_status_printf(m, STATUS_TYPE_EMERGENCY, ANSI_HIGHLIGHT_RED "  !!  " ANSI_NORMAL, "%s: %s", message, reason);
+}
+
+void emergency_action(
+                Manager *m,
+                EmergencyAction action,
+                EmergencyActionFlags options,
+                const char *reboot_arg,
+                int exit_status,
+                const char *reason) {
+        /* Plain actions enqueue a target job, _FORCE ones set m->objective, _IMMEDIATE ones call reboot(). */
+        Unit *u;
+
+        assert(m);
+        assert(action >= 0);
+        assert(action < _EMERGENCY_ACTION_MAX);
+
+        /* Is the special shutdown target active or queued? If so, we are in shutdown state */
+        if (IN_SET(action,
+                   EMERGENCY_ACTION_REBOOT,
+                   EMERGENCY_ACTION_SOFT_REBOOT,
+                   EMERGENCY_ACTION_POWEROFF,
+                   EMERGENCY_ACTION_EXIT,
+                   EMERGENCY_ACTION_KEXEC,
+                   EMERGENCY_ACTION_HALT)) {
+                u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET);
+                if (u && unit_active_or_pending(u)) {
+                        log_notice("Shutdown is already active. Skipping emergency action request %s.",
+                                   emergency_action_table[action]);
+                        return;
+                }
+        }
+
+        if (action == EMERGENCY_ACTION_NONE)
+                return;
+
+        if (FLAGS_SET(options, EMERGENCY_ACTION_IS_WATCHDOG) && !m->service_watchdogs) {
+                log_warning("Watchdog disabled! Not acting on: %s", reason);
+                return;
+        }
+
+        bool warn = FLAGS_SET(options, EMERGENCY_ACTION_WARN); /* warning + status message vs. debug log only */
+
+        switch (action) {
+
+        case EMERGENCY_ACTION_REBOOT:
+                log_and_status(m, warn, "Rebooting", reason);
+
+                (void) update_reboot_parameter_and_warn(reboot_arg, true);
+                (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+                break;
+
+        case EMERGENCY_ACTION_REBOOT_FORCE:
+                log_and_status(m, warn, "Forcibly rebooting", reason);
+
+                (void) update_reboot_parameter_and_warn(reboot_arg, true);
+                m->objective = MANAGER_REBOOT;
+                break;
+
+        case EMERGENCY_ACTION_REBOOT_IMMEDIATE:
+                log_and_status(m, warn, "Rebooting immediately", reason);
+
+                sync();
+
+                if (!isempty(reboot_arg)) {
+                        log_info("Rebooting with argument '%s'.", reboot_arg);
+                        (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, reboot_arg);
+                        log_warning_errno(errno, "Failed to reboot with parameter, retrying without: %m");
+                }
+
+                log_info("Rebooting.");
+                (void) reboot(RB_AUTOBOOT);
+                break;
+
+        case EMERGENCY_ACTION_SOFT_REBOOT:
+                log_and_status(m, warn, "Soft-rebooting", reason);
+
+                (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_SOFT_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+                break;
+
+        case EMERGENCY_ACTION_SOFT_REBOOT_FORCE:
+                log_and_status(m, warn, "Forcibly soft-rebooting", reason);
+
+                m->objective = MANAGER_SOFT_REBOOT;
+                break;
+
+        case EMERGENCY_ACTION_EXIT:
+                /* A real exit only for user managers and in containers, otherwise turned into "poweroff" */
+                if (exit_status >= 0)
+                        m->return_value = exit_status;
+
+                if (MANAGER_IS_USER(m) || detect_container() > 0) {
+                        log_and_status(m, warn, "Exiting", reason);
+                        (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+                        break;
+                }
+
+                log_notice("Doing \"poweroff\" action instead of an \"exit\" emergency action.");
+                _fallthrough_;
+
+        case EMERGENCY_ACTION_POWEROFF:
+                log_and_status(m, warn, "Powering off", reason);
+                (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+                break;
+
+        case EMERGENCY_ACTION_EXIT_FORCE:
+                /* Same rule as for "exit", but the fallback taken here is "poweroff-force" */
+                if (exit_status >= 0)
+                        m->return_value = exit_status;
+
+                if (MANAGER_IS_USER(m) || detect_container() > 0) {
+                        log_and_status(m, warn, "Exiting immediately", reason);
+                        m->objective = MANAGER_EXIT;
+                        break;
+                }
+
+                log_notice("Doing \"poweroff-force\" action instead of an \"exit-force\" emergency action.");
+                _fallthrough_;
+
+        case EMERGENCY_ACTION_POWEROFF_FORCE:
+                log_and_status(m, warn, "Forcibly powering off", reason);
+                m->objective = MANAGER_POWEROFF;
+                break;
+
+        case EMERGENCY_ACTION_POWEROFF_IMMEDIATE:
+                log_and_status(m, warn, "Powering off immediately", reason);
+
+                sync();
+
+                log_info("Powering off.");
+                (void) reboot(RB_POWER_OFF);
+                break;
+
+        case EMERGENCY_ACTION_KEXEC:
+                log_and_status(m, warn, "Executing kexec", reason);
+                (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+                break;
+
+        case EMERGENCY_ACTION_KEXEC_FORCE:
+                log_and_status(m, warn, "Forcibly executing kexec", reason);
+                m->objective = MANAGER_KEXEC;
+                break;
+
+        case EMERGENCY_ACTION_HALT:
+                log_and_status(m, warn, "Halting", reason);
+                (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+                break;
+
+        case EMERGENCY_ACTION_HALT_FORCE:
+                log_and_status(m, warn, "Forcibly halting", reason);
+                m->objective = MANAGER_HALT;
+                break;
+
+        case EMERGENCY_ACTION_HALT_IMMEDIATE:
+                log_and_status(m, warn, "Halting immediately", reason);
+
+                sync();
+
+                log_info("Halting.");
+                (void) reboot(RB_HALT_SYSTEM);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+DEFINE_STRING_TABLE_LOOKUP(emergency_action, EmergencyAction);
+
+int parse_emergency_action(
+                const char *value,
+                RuntimeScope runtime_scope,
+                EmergencyAction *ret) {
+
+        /* Parses an action name into *ret: -EINVAL if unknown, -EOPNOTSUPP if unavailable in this scope. */
+        EmergencyAction x = emergency_action_from_string(value);
+        if (x < 0)
+                return -EINVAL;
+
+        /* Non-system managers only get "none" and the exit actions; an enum-order test lets later ones slip. */
+        if (runtime_scope != RUNTIME_SCOPE_SYSTEM && !IN_SET(x, EMERGENCY_ACTION_NONE, EMERGENCY_ACTION_EXIT, EMERGENCY_ACTION_EXIT_FORCE))
+                return -EOPNOTSUPP;
+
+        *ret = x;
+        return 0;
+}
diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h
new file mode 100644
index 0000000..33e0ec6
--- /dev/null
+++ b/src/core/emergency-action.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "runtime-scope.h"
+
+typedef enum EmergencyAction {
+        EMERGENCY_ACTION_NONE,                  /* emergency_action() returns without doing anything */
+        EMERGENCY_ACTION_REBOOT,                /* plain variants: enqueue a start job for the matching target */
+        EMERGENCY_ACTION_REBOOT_FORCE,          /* _FORCE variants: set the manager's objective directly */
+        EMERGENCY_ACTION_REBOOT_IMMEDIATE,      /* _IMMEDIATE variants: sync() and call reboot() right away */
+        EMERGENCY_ACTION_POWEROFF,
+        EMERGENCY_ACTION_POWEROFF_FORCE,
+        EMERGENCY_ACTION_POWEROFF_IMMEDIATE,
+        EMERGENCY_ACTION_EXIT,
+        _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT,
+        EMERGENCY_ACTION_EXIT_FORCE,
+        EMERGENCY_ACTION_SOFT_REBOOT,
+        EMERGENCY_ACTION_SOFT_REBOOT_FORCE,
+        EMERGENCY_ACTION_KEXEC,
+        EMERGENCY_ACTION_KEXEC_FORCE,
+        EMERGENCY_ACTION_HALT,
+        EMERGENCY_ACTION_HALT_FORCE,
+        EMERGENCY_ACTION_HALT_IMMEDIATE,
+        _EMERGENCY_ACTION_MAX,                  /* number of actions, also the size of the name table */
+        _EMERGENCY_ACTION_INVALID = -EINVAL,
+} EmergencyAction;
+
+typedef enum EmergencyActionFlags {
+        EMERGENCY_ACTION_IS_WATCHDOG = 1 << 0, /* the action is skipped while m->service_watchdogs is off */
+        EMERGENCY_ACTION_WARN        = 1 << 1, /* log at LOG_WARNING plus status message, else LOG_DEBUG only */
+} EmergencyActionFlags;
+
+#include "macro.h"
+#include "manager.h"
+
+void emergency_action(Manager *m,
+                      EmergencyAction action, EmergencyActionFlags options,
+                      const char *reboot_arg, int exit_status, const char *reason);
+
+const char* emergency_action_to_string(EmergencyAction i) _const_;
+EmergencyAction emergency_action_from_string(const char *s) _pure_;
+
+int parse_emergency_action(const char *value, RuntimeScope runtime_scope, EmergencyAction *ret);
diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c
new file mode 100644
index 0000000..6bcfb68
--- /dev/null
+++ b/src/core/exec-credential.c
@@ -0,0 +1,1023 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "acl-util.h"
+#include "creds-util.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fileio.h"
+#include "glob-util.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "tmpfile-util.h"
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
+        if (sc) { /* NULL is accepted, and NULL is what is always returned */
+                free(sc->data);
+                free(sc->id);
+                free(sc);
+        }
+        return NULL;
+}
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
+        if (lc) { /* NULL is accepted, and NULL is what is always returned */
+                free(lc->path);
+                free(lc->id);
+                free(lc);
+        }
+        return NULL;
+}
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+        exec_set_credential_hash_ops,
+        char, string_hash_func, string_compare_func,    /* keys are hashed and compared as strings */
+        ExecSetCredential, exec_set_credential_free);   /* value destructor */
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+        exec_load_credential_hash_ops,
+        char, string_hash_func, string_compare_func,    /* keys are hashed and compared as strings */
+        ExecLoadCredential, exec_load_credential_free); /* value destructor */
+
+bool exec_context_has_credentials(const ExecContext *c) {
+        assert(c);
+        /* True as soon as any of the three credential collections has at least one entry. */
+        return !(hashmap_isempty(c->set_credentials) &&
+                 hashmap_isempty(c->load_credentials) &&
+                 set_isempty(c->import_credentials));
+}
+
+bool exec_context_has_encrypted_credentials(ExecContext *c) {
+        ExecLoadCredential *lc;
+        ExecSetCredential *sc;
+
+        assert(c);
+
+        /* Reports whether any entry of load_credentials or set_credentials is marked as encrypted. */
+        HASHMAP_FOREACH(lc, c->load_credentials)
+                if (lc->encrypted)
+                        return true;
+        HASHMAP_FOREACH(sc, c->set_credentials)
+                if (sc->encrypted)
+                        return true;
+
+        return false;
+}
+
+static int get_credential_directory(
+                const char *runtime_prefix,
+                const char *unit,
+                char **ret) {
+
+        char *dir = NULL;
+
+        assert(ret);
+
+        /* Returns 1 and the joined path of runtime_prefix, "credentials" and unit in *ret. If either input
+         * is NULL, returns 0 and sets *ret to NULL. On allocation failure returns -ENOMEM and leaves *ret
+         * untouched. The caller owns the returned string. */
+        if (runtime_prefix && unit) {
+                dir = path_join(runtime_prefix, "credentials", unit);
+                if (!dir)
+                        return -ENOMEM;
+        }
+
+        *ret = dir;
+        return dir ? 1 : 0;
+}
+
+int exec_context_get_credential_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                char **ret) {
+
+        assert(context);
+        assert(params);
+        assert(unit);
+        assert(ret);
+
+        /* A context without any credentials has no directory: that is reported as 0 with *ret = NULL. */
+        if (exec_context_has_credentials(context))
+                return get_credential_directory(params->prefix[EXEC_DIRECTORY_RUNTIME], unit, ret);
+
+        *ret = NULL;
+        return 0;
+}
+
+int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c) {
+        _cleanup_free_ char *cred_dir = NULL, *mount_unit = NULL;
+        int r;
+
+        assert(u);
+        assert(c);
+
+        /* Orders the unit after the mount unit of its credentials directory, so that the directory is
+         * unmounted only *after* the service itself went down. That matters when the service runs without
+         * mount namespacing and the credentials mount is hence visible on the host. */
+
+        if (!exec_context_has_credentials(c))
+                return 0; /* no credentials, no directory, no dependency */
+
+        r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &cred_dir);
+        if (r <= 0)
+                return r;
+
+        r = unit_name_from_path(cred_dir, ".mount", &mount_unit);
+        if (r < 0)
+                return r;
+
+        return unit_add_dependency_by_name(u, UNIT_AFTER, mount_unit, /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+}
+
+int exec_context_destroy_credentials(Unit *u) {
+        _cleanup_free_ char *cred_dir = NULL;
+        int r;
+
+        assert(u);
+
+        r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &cred_dir);
+        if (r <= 0)
+                return r;
+
+        /* The directory is either a tmpfs/ramfs mount of its own or a plain directory: try to detach a
+         * mount first, then remove the (former) mount point. Failures of either step are ignored. */
+        r = umount2(cred_dir, MNT_DETACH|UMOUNT_NOFOLLOW);
+        if (r >= 0)
+                (void) mount_invalidate_state_by_path(u->manager, cred_dir);
+
+        (void) rm_rf(cred_dir, REMOVE_ROOT|REMOVE_CHMOD);
+        return 0;
+}
+
+static int write_credential(
+                int dfd,
+                const char *id,
+                const void *data,
+                size_t size,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok) {
+
+        _cleanup_free_ char *tmp = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        /* Writes 'data' into a temporary file below 'dfd' and renames it to 'id' once it is complete.
+         * Returns 0 or a negative errno-style error, in which case the temporary file is removed again. */
+        r = tempfn_random_child("", "cred", &tmp);
+        if (r < 0)
+                return r;
+
+        fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
+        if (fd < 0)
+                return -errno; /* nothing was created, hence nothing to clean up */
+
+        r = loop_write(fd, data, size);
+        if (r < 0)
+                goto fail;
+
+        r = fchmod(fd, 0400) < 0 ? -errno : 0; /* Take away "w" bit */
+        if (r < 0)
+                goto fail;
+
+        if (uid_is_valid(uid) && uid != getuid()) {
+                r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
+                if (r < 0) {
+                        /* Ideally we use an ACL: read access for the user and nothing else. If the backing
+                         * fs can't do that (e.g. ramfs) file ownership is used instead, but only if the
+                         * caller said this is OK, i.e. the user cannot later chmod() the file writable. */
+                        if ((!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) || !ownership_ok)
+                                goto fail;
+                        r = fchown(fd, uid, gid) < 0 ? -errno : 0;
+                        if (r < 0)
+                                goto fail;
+                }
+        }
+
+        r = renameat(dfd, tmp, dfd, id) < 0 ? -errno : 0;
+        if (r < 0)
+                goto fail;
+
+        return 0;
+
+fail:
+        (void) unlinkat(dfd, tmp, 0); /* 'tmp' is relative to 'dfd', a plain unlink() would miss it */
+        return r;
+}
+
+typedef enum CredentialSearchPath {
+        CREDENTIAL_SEARCH_PATH_TRUSTED,         /* received credentials dir + "credstore" config dirs */
+        CREDENTIAL_SEARCH_PATH_ENCRYPTED,       /* received encrypted credentials dir + "credstore.encrypted" dirs */
+        CREDENTIAL_SEARCH_PATH_ALL,             /* both of the above, the encrypted locations first */
+        _CREDENTIAL_SEARCH_PATH_MAX,
+        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
+} CredentialSearchPath;
+
+static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
+
+        _cleanup_strv_free_ char **dirs = NULL;
+        bool want_encrypted, want_trusted;
+
+        assert(params);
+        assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
+
+        /* Builds the list of directories to look for credentials in. Unencrypted ones live in
+         * /etc/credstore/ (and the matching directories in /usr/lib/ + /run/), encrypted ones in
+         * /etc/credstore.encrypted/ (and similar dirs). Returns NULL if memory runs out. */
+        want_encrypted = IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL);
+        want_trusted = IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL);
+
+        if (want_encrypted &&
+            (strv_extend(&dirs, params->received_encrypted_credentials_directory) < 0 ||
+             strv_extend_strv(&dirs, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0))
+                return NULL;
+
+        if (want_trusted) {
+                if (params->received_credentials_directory &&
+                    strv_extend(&dirs, params->received_credentials_directory) < 0)
+                        return NULL;
+
+                if (strv_extend_strv(&dirs, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
+                        return NULL;
+        }
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *joined = strv_join(dirs, ":");
+
+                log_debug("Credential search path is: %s", strempty(joined));
+        }
+
+        return TAKE_PTR(dirs);
+}
+
+static int maybe_decrypt_and_write_credential(
+                int dir_fd,
+                const char *id,
+                bool encrypted,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok,
+                const char *data,
+                size_t size,
+                uint64_t *left) {
+        /* Decrypts 'data' if requested, then writes it as credential 'id' into 'dir_fd', charged to *left. */
+        _cleanup_(erase_and_freep) void *plaintext = NULL; /* decrypted secret: wipe it before freeing */
+        size_t add;
+        int r;
+
+        if (encrypted) {
+                size_t plaintext_size = 0;
+
+                r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
+                                                &plaintext, &plaintext_size);
+                if (r < 0)
+                        return r;
+
+                data = plaintext;
+                size = plaintext_size;
+        }
+
+        add = strlen(id) + size; /* name plus payload is what counts against the remaining budget */
+        if (add > *left)
+                return -E2BIG;
+
+        r = write_credential(dir_fd, id, data, size, uid, gid, ownership_ok);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write credential '%s': %m", id);
+
+        *left -= add;
+        return 0;
+}
+
+static int load_credential_glob(
+                const char *path,
+                bool encrypted,
+                char **search_path,
+                ReadFullFileFlags flags,
+                int write_dfd,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok,
+                uint64_t *left) {
+        /* Globs 'path' below each search_path directory; every match becomes a credential named like its file. */
+        int r;
+
+        STRV_FOREACH(d, search_path) {
+                _cleanup_globfree_ glob_t pglob = {};
+                _cleanup_free_ char *j = NULL;
+
+                j = path_join(*d, path);
+                if (!j)
+                        return -ENOMEM;
+
+                r = safe_glob(j, 0, &pglob);
+                if (r == -ENOENT)
+                        continue; /* -ENOENT is not fatal here, just move on to the next directory */
+                if (r < 0)
+                        return r;
+
+                for (size_t n = 0; n < pglob.gl_pathc; n++) {
+                        _cleanup_free_ char *fn = NULL;
+                        _cleanup_(erase_and_freep) char *data = NULL;
+                        size_t size;
+
+                        /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+                        r = read_full_file_full(
+                                AT_FDCWD,
+                                pglob.gl_pathv[n],
+                                UINT64_MAX,
+                                encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
+                                flags,
+                                NULL,
+                                &data, &size);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to read credential '%s': %m",
+                                                        pglob.gl_pathv[n]);
+
+                        r = path_extract_filename(pglob.gl_pathv[n], &fn);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to extract filename from '%s': %m",
+                                                        pglob.gl_pathv[n]);
+
+                        r = maybe_decrypt_and_write_credential(
+                                write_dfd,
+                                fn,
+                                encrypted,
+                                uid,
+                                gid,
+                                ownership_ok,
+                                data, size,
+                                left);
+                        if (r == -EEXIST)
+                                continue; /* -EEXIST is tolerated: this match is skipped, the loop goes on */
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+static int load_credential(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *id,
+                const char *path,
+                bool encrypted,
+                const char *unit,
+                int read_dfd,
+                int write_dfd,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok,
+                uint64_t *left) {
+        /* Reads 'path' (below read_dfd, as absolute path, or via the search path) and stores it as 'id'. */
+        ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
+        _cleanup_strv_free_ char **search_path = NULL;
+        _cleanup_(erase_and_freep) char *data = NULL;
+        _cleanup_free_ char *bindname = NULL;
+        const char *source = NULL;
+        bool missing_ok = true; /* only the absolute path branch below turns this off */
+        size_t size, maxsz;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(id);
+        assert(path);
+        assert(unit);
+        assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
+        assert(write_dfd >= 0);
+        assert(left);
+
+        if (read_dfd >= 0) {
+                /* If a directory fd is specified, then read the file directly from that dir. In this case we
+                 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
+                 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
+                 * open it. */
+
+                if (!filename_is_valid(path)) /* safety check */
+                        return -EINVAL;
+
+                missing_ok = true;
+                source = path;
+
+        } else if (path_is_absolute(path)) {
+                /* If this is an absolute path, read the data directly from it, and support AF_UNIX
+                 * sockets */
+
+                if (!path_is_valid(path)) /* safety check */
+                        return -EINVAL;
+
+                flags |= READ_FULL_FILE_CONNECT_SOCKET;
+
+                /* Pass some minimal info about the unit and the credential name we are looking to acquire
+                 * via the source socket address in case we read off an AF_UNIX socket. */
+                if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
+                        return -ENOMEM;
+
+                missing_ok = false;
+                source = path;
+
+        } else if (credential_name_valid(path)) {
+                /* If this is a relative path, take it as credential name relative to the credentials
+                 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
+                 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
+
+                search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
+                if (!search_path)
+                        return -ENOMEM;
+
+                missing_ok = true;
+        } else
+                source = NULL;
+
+        if (encrypted)
+                flags |= READ_FULL_FILE_UNBASE64;
+
+        maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
+
+        if (search_path) {
+                STRV_FOREACH(d, search_path) {
+                        _cleanup_free_ char *j = NULL;
+
+                        j = path_join(*d, path);
+                        if (!j)
+                                return -ENOMEM;
+
+                        r = read_full_file_full(
+                                        AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+                                        UINT64_MAX,
+                                        maxsz,
+                                        flags,
+                                        NULL,
+                                        &data, &size);
+                        if (r != -ENOENT)
+                                break; /* first directory yielding anything but "not found" ends the search */
+                }
+        } else if (source)
+                r = read_full_file_full(
+                                read_dfd, source,
+                                UINT64_MAX,
+                                maxsz,
+                                flags,
+                                bindname,
+                                &data, &size);
+        else
+                r = -ENOENT; /* neither a source nor a search path: handled like a missing file */
+
+        if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
+                /* Make a missing inherited credential non-fatal, let's just continue. After all apps
+                 * will get clear errors if we don't pass such a missing credential on as they
+                 * themselves will get ENOENT when trying to read them, which should not be much
+                 * worse than when we handle the error here and make it fatal.
+                 *
+                 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
+                 * we are fine, too. */
+                log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
+                return 0;
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read credential '%s': %m", path);
+
+        return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, gid, ownership_ok, data, size, left);
+}
+
+struct load_cred_args { /* Arguments acquire_credentials() hands to load_cred_recurse_dir_cb() via recurse_dir()'s userdata */
+        const ExecContext *context;
+        const ExecParameters *params;
+        bool encrypted;         /* copied from the ExecLoadCredential entry the directory belongs to */
+        const char *unit;
+        int dfd;                /* fd of the credentials directory opened by acquire_credentials(); also used to detect duplicate IDs */
+        uid_t uid;
+        gid_t gid;
+        bool ownership_ok;
+        uint64_t *left;         /* points at acquire_credentials()' remaining size budget, shared by all credentials */
+};
+
+static int load_cred_recurse_dir_cb(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        struct load_cred_args *args = ASSERT_PTR(userdata);
+        _cleanup_free_ char *sub_id = NULL;
+        int r;
+
+        if (event != RECURSE_DIR_ENTRY) /* We only act on RECURSE_DIR_ENTRY events, everything else is passed over */
+                return RECURSE_DIR_CONTINUE;
+
+        if (!IN_SET(de->d_type, DT_REG, DT_SOCK)) /* Only regular files and sockets are loaded as credentials */
+                return RECURSE_DIR_CONTINUE;
+
+        sub_id = strreplace(path, "/", "_"); /* The credential ID is the reported path with every "/" turned into "_" */
+        if (!sub_id)
+                return -ENOMEM;
+
+        if (!credential_name_valid(sub_id))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
+
+        if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) { /* An entry of this name already exists in the target dir */
+                log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
+                return RECURSE_DIR_CONTINUE;
+        }
+        if (errno != ENOENT) /* Anything but "does not exist yet" is treated as a failure */
+                return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
+
+        r = load_credential(
+                        args->context,
+                        args->params,
+                        sub_id,
+                        de->d_name,
+                        args->encrypted,
+                        args->unit,
+                        dir_fd,
+                        args->dfd,
+                        args->uid,
+                        args->gid,
+                        args->ownership_ok,
+                        args->left);
+        if (r < 0) /* A negative return from the callback is propagated as-is */
+                return r;
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+static int acquire_credentials(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                const char *p,
+                uid_t uid,
+                gid_t gid,
+                bool ownership_ok) {
+
+        uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX; /* Remaining size budget, shared by all credentials written into p */
+        _cleanup_close_ int dfd = -EBADF;
+        const char *ic;
+        ExecLoadCredential *lc;
+        ExecSetCredential *sc;
+        int r;
+
+        assert(context);
+        assert(p);
+
+        dfd = open(p, O_DIRECTORY|O_CLOEXEC);
+        if (dfd < 0)
+                return -errno;
+
+        r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
+        if (r < 0)
+                return r;
+
+        /* First, load credentials off disk (or acquire via AF_UNIX socket) */
+        HASHMAP_FOREACH(lc, context->load_credentials) {
+                _cleanup_close_ int sub_fd = -EBADF;
+
+                /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
+                 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
+                 * a regular file. Finally, if it's a relative path we will use it as a credential name to
+                 * propagate a credential passed to us from further up. */
+
+                if (path_is_absolute(lc->path)) {
+                        sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+                        if (sub_fd < 0 && !IN_SET(errno,
+                                                  ENOTDIR,  /* Not a directory */
+                                                  ENOENT))  /* Doesn't exist? */
+                                return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
+                }
+
+                if (sub_fd < 0)
+                        /* Regular file (incl. a credential passed in from higher up) */
+                        r = load_credential(
+                                        context,
+                                        params,
+                                        lc->id,
+                                        lc->path,
+                                        lc->encrypted,
+                                        unit,
+                                        AT_FDCWD,
+                                        dfd,
+                                        uid,
+                                        gid,
+                                        ownership_ok,
+                                        &left);
+                else
+                        /* Directory */
+                        r = recurse_dir(
+                                        sub_fd,
+                                        /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
+                                        /* statx_mask= */ 0,
+                                        /* n_depth_max= */ UINT_MAX,
+                                        RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
+                                        load_cred_recurse_dir_cb,
+                                        &(struct load_cred_args) {
+                                                .context = context,
+                                                .params = params,
+                                                .encrypted = lc->encrypted,
+                                                .unit = unit,
+                                                .dfd = dfd,
+                                                .uid = uid,
+                                                .gid = gid,
+                                                .ownership_ok = ownership_ok,
+                                                .left = &left,
+                                        });
+                if (r < 0)
+                        return r;
+        }
+
+        /* Next, look for system credentials and credentials in the credentials store. Note that these do not
+         * override any credentials found earlier. */
+        SET_FOREACH(ic, context->import_credentials) {
+                _cleanup_strv_free_ char **search_path = NULL; /* Owned strv: the strings must be freed too, not just the array */
+
+                search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
+                if (!search_path)
+                        return -ENOMEM;
+
+                r = load_credential_glob(
+                                ic,
+                                /* encrypted = */ false,
+                                search_path,
+                                READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
+                                dfd,
+                                uid,
+                                gid,
+                                ownership_ok,
+                                &left);
+                if (r < 0)
+                        return r;
+
+                search_path = strv_free(search_path);
+                search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
+                if (!search_path)
+                        return -ENOMEM;
+
+                r = load_credential_glob(
+                                ic,
+                                /* encrypted = */ true,
+                                search_path,
+                                READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
+                                dfd,
+                                uid,
+                                gid,
+                                ownership_ok,
+                                &left);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
+         * add them, so that they can act as a "default" if the same credential is specified multiple times. */
+        HASHMAP_FOREACH(sc, context->set_credentials) {
+                _cleanup_(erase_and_freep) void *plaintext = NULL;
+                const char *data;
+                size_t size, add;
+
+                /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
+                 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
+                 * slow and involved, hence it's nice to be able to skip that if the credential already
+                 * exists anyway. */
+                if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                        continue;
+                if (errno != ENOENT)
+                        return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
+
+                if (sc->encrypted) {
+                        r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
+                        if (r < 0)
+                                return r;
+
+                        data = plaintext;
+                } else {
+                        data = sc->data;
+                        size = sc->size;
+                }
+
+                add = strlen(sc->id) + size; /* Both the name and the payload count against the budget */
+                if (add > left)
+                        return -E2BIG;
+
+                r = write_credential(dfd, sc->id, data, size, uid, gid, ownership_ok);
+                if (r < 0)
+                        return r;
+
+                left -= add;
+        }
+
+        r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
+        if (r < 0)
+                return r;
+
+        /* After we created all keys with the right perms, also make sure the credential store as a whole is
+         * accessible */
+
+        if (uid_is_valid(uid) && uid != getuid()) {
+                r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
+                if (r < 0) {
+                        if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+                                return r;
+
+                        if (!ownership_ok) /* Without ACLs the only alternative is chown(), which the caller must permit */
+                                return r;
+
+                        if (fchown(dfd, uid, gid) < 0)
+                                return -errno;
+                }
+        }
+
+        return 0;
+}
+
+static int setup_credentials_internal(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                const char *final,        /* This is where the credential store shall eventually end up at */
+                const char *workspace,    /* This is where we can prepare it before moving it to the final place */
+                bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
+                bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
+                uid_t uid,
+                gid_t gid) {
+
+        int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
+                                   * if we mounted something; false if we definitely can't mount anything */
+        bool final_mounted;
+        const char *where;
+
+        assert(context);
+        assert(final);
+        assert(workspace);
+
+        if (reuse_workspace) {
+                r = path_is_mount_point(workspace, NULL, 0);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
+                                                   * it, let's keep this in mind */
+                else
+                        workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
+        } else
+                workspace_mounted = -1; /* ditto */
+
+        r = path_is_mount_point(final, NULL, 0);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                /* If the final place already has something mounted, we use that. If the workspace also has
+                 * something mounted we assume it's actually the same mount (but with MS_RDONLY
+                 * different). */
+                final_mounted = true;
+
+                if (workspace_mounted < 0) {
+                        /* If the final place is mounted, but the workspace isn't, then let's bind mount
+                         * the final version to the workspace, and make it writable, so that we can make
+                         * changes */
+
+                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+                        if (r < 0)
+                                return r;
+
+                        workspace_mounted = true;
+                }
+        } else
+                final_mounted = false;
+
+        if (workspace_mounted < 0) {
+                /* Nothing is mounted on the workspace yet, let's try to mount something now */
+
+                r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+                if (r < 0) {
+                        /* If that didn't work, try to make a bind mount from the final to the workspace, so
+                         * that we can make it writable there. */
+                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+                        if (r < 0) {
+                                if (!ERRNO_IS_PRIVILEGE(r))
+                                        /* Propagate anything that isn't a permission problem. */
+                                        return r;
+
+                                if (must_mount)
+                                        /* If it's not OK to use the plain directory fallback, propagate all
+                                         * errors too. */
+                                        return r;
+
+                                /* If we lack privileges to bind mount stuff, then let's gracefully proceed
+                                 * for compat with container envs, and just use the final dir as is. */
+
+                                workspace_mounted = false;
+                        } else {
+                                /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
+                                r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+                                if (r < 0)
+                                        return r;
+
+                                workspace_mounted = true;
+                        }
+                } else
+                        workspace_mounted = true;
+        }
+
+        assert(!must_mount || workspace_mounted > 0);
+        where = workspace_mounted ? workspace : final; /* workspace_mounted is no longer negative at this point */
+
+        (void) label_fix_full(AT_FDCWD, where, final, 0);
+
+        r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted); /* last argument is ownership_ok */
+        if (r < 0)
+                return r;
+
+        if (workspace_mounted) {
+                bool install;
+
+                /* Determine if we should actually install the prepared mount in the final location by bind
+                 * mounting it there. We do so only if the mount is not established there already, and if the
+                 * mount is actually non-empty (i.e. carries at least one credential). Note that in the best
+                 * case we are doing all this in a mount namespace, thus no one else will see that we
+                 * allocated a file system we are getting rid of again here. */
+                if (final_mounted)
+                        install = false; /* already installed */
+                else {
+                        r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+                        if (r < 0)
+                                return r;
+
+                        install = r == 0; /* install only if non-empty */
+                }
+
+                if (install) {
+                        /* Make workspace read-only now, so that any bind mount we make from it defaults to
+                         * read-only too */
+                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
+                        if (r < 0)
+                                return r;
+
+                        /* And mount it to the final place, read-only */
+                        r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
+                } else
+                        /* Otherwise get rid of it */
+                        r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
+                if (r < 0)
+                        return r;
+        } else {
+                _cleanup_free_ char *parent = NULL;
+
+                /* If we do not have our own mount but used the plain directory fallback, then we need to
+                 * open access to the top-level credential directory and the per-service directory now */
+
+                r = path_extract_directory(final, &parent);
+                if (r < 0)
+                        return r;
+                if (chmod(parent, 0755) < 0)
+                        return -errno;
+        }
+
+        return 0;
+}
+
+int exec_setup_credentials(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                uid_t uid,
+                gid_t gid) {
+
+        _cleanup_free_ char *p = NULL, *q = NULL;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        if (!exec_context_has_credentials(context))
+                return 0;
+
+        if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
+                return -EINVAL;
+
+        /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
+         * and the subdir we mount over with a read-only file system readable by the service's user */
+        q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
+        if (!q)
+                return -ENOMEM;
+
+        r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
+        if (r < 0 && r != -EEXIST)
+                return r;
+
+        p = path_join(q, unit);
+        if (!p)
+                return -ENOMEM;
+
+        r = mkdir_label(p, 0700); /* per-unit dir: private to user */
+        if (r < 0 && r != -EEXIST)
+                return r;
+
+        r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
+        if (r < 0) {
+                _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
+                _cleanup_free_ char *t = NULL;
+
+                /* If this is not a privilege or support issue then propagate the error */
+                if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+                        return r;
+
+                /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
+                 * it into place, so that users can't access half-initialized credential stores. */
+                t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
+                if (!t)
+                        return -ENOMEM;
+
+                /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
+                 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
+                 * after it is fully set up */
+                u = path_join(t, unit);
+                if (!u)
+                        return -ENOMEM;
+
+                FOREACH_STRING(i, t, u) {
+                        r = mkdir_label(i, 0700);
+                        if (r < 0 && r != -EEXIST)
+                                return r;
+                }
+
+                r = setup_credentials_internal(
+                                context,
+                                params,
+                                unit,
+                                p,       /* final mount point */
+                                u,       /* temporary workspace to overmount */
+                                true,    /* reuse the workspace if it is already a mount */
+                                false,   /* it's OK to fall back to a plain directory if we can't mount anything */
+                                uid,
+                                gid);
+                if (r < 0)
+                        return r;
+
+        } else if (r == 0) {
+
+                /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
+                 * we can use the same directory for all cases, after turning off propagation. Question
+                 * though is: where do we turn off propagation exactly, and where do we place the workspace
+                 * directory? We need some place that is guaranteed to be a mount point in the host, and
+                 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
+                 * since we ultimately want to move the resulting file system there, i.e. we need propagation
+                 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
+                 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
+                 * we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
+                 * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
+                 * propagation on the former, and then overmount the latter.
+                 *
+                 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
+                 * for this purpose, but there are few other candidates that work equally well for us, and
+                 * given that we do this in a privately namespaced short-lived single-threaded process that
+                 * no one else sees this should be OK to do. */
+
+                /* Turn off propagation from our namespace to host */
+                r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
+                if (r < 0)
+                        goto child_fail;
+
+                r = setup_credentials_internal(
+                                context,
+                                params,
+                                unit,
+                                p,           /* final mount point */
+                                "/dev/shm",  /* temporary workspace to overmount */
+                                false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
+                                true,        /* insist that something is mounted, do not allow fallback to plain directory */
+                                uid,
+                                gid);
+                if (r < 0)
+                        goto child_fail;
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                _exit(EXIT_FAILURE);
+        }
+
+        /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
+         * try to remove it. This matters in particular if we created the dir as mount point but then didn't
+         * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCES being
+         * seen by users when trying to access this inode. */
+        (void) rmdir(p);
+        return 0;
+}
diff --git a/src/core/exec-credential.h b/src/core/exec-credential.h
new file mode 100644
index 0000000..6f836fb
--- /dev/null
+++ b/src/core/exec-credential.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include "hash-funcs.h"
+
+typedef struct ExecContext ExecContext;
+typedef struct ExecParameters ExecParameters;
+typedef struct Unit Unit;
+
+/* A credential configured with LoadCredential= */
+typedef struct ExecLoadCredential {
+        char *id, *path; /* path: absolute file or directory path, or a relative name of a credential passed in from further up */
+        bool encrypted;
+} ExecLoadCredential;
+
+/* A credential configured with SetCredential= */
+typedef struct ExecSetCredential {
+        char *id;
+        bool encrypted; /* if set, data is decrypted before the credential is written out */
+        void *data;     /* literal credential payload, size bytes long */
+        size_t size;
+} ExecSetCredential;
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSetCredential*, exec_set_credential_free);
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecLoadCredential*, exec_load_credential_free);
+
+extern const struct hash_ops exec_set_credential_hash_ops;
+extern const struct hash_ops exec_load_credential_hash_ops;
+
+bool exec_context_has_encrypted_credentials(ExecContext *c);
+bool exec_context_has_credentials(const ExecContext *c);
+
+int exec_context_get_credential_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                char **ret);
+
+int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c);
+
+int exec_context_destroy_credentials(Unit *u);
+int exec_setup_credentials(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *unit,
+                uid_t uid,
+                gid_t gid);
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
new file mode 100644
index 0000000..28d6142
--- /dev/null
+++ b/src/core/exec-invoke.c
@@ -0,0 +1,5235 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#if HAVE_PAM
+#include 
+#include 
+#endif
+
+#if HAVE_APPARMOR
+#include 
+#endif
+
+#include "sd-messages.h"
+
+#if HAVE_APPARMOR
+#include "apparmor-util.h"
+#endif
+#include "argv-util.h"
+#include "barrier.h"
+#include "bpf-dlopen.h"
+#include "bpf-lsm.h"
+#include "btrfs-util.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "chase.h"
+#include "chattr-util.h"
+#include "chown-recursive.h"
+#include "copy.h"
+#include "data-fd-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "exec-invoke.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "iovec-util.h"
+#include "missing_ioprio.h"
+#include "missing_prctl.h"
+#include "missing_securebits.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "smack-util.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "utmp-wtmp.h"
+
+#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
+#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
+
+#define SNDBUF_SIZE (8*1024*1024)
+
+static int shift_fds(int fds[], size_t n_fds) { /* Moves the fds around until fds[i] == i+3 holds for every entry */
+        if (n_fds <= 0)
+                return 0;
+
+        /* Modifies the fds array! (sorts it) */
+
+        assert(fds);
+
+        for (int start = 0;;) {
+                int restart_from = -1;
+
+                for (int i = start; i < (int) n_fds; i++) {
+                        int nfd;
+
+                        /* Already at right index? */
+                        if (fds[i] == i+3)
+                                continue;
+
+                        nfd = fcntl(fds[i], F_DUPFD, i + 3); /* F_DUPFD yields the lowest free fd that is >= i+3 */
+                        if (nfd < 0)
+                                return -errno;
+
+                        safe_close(fds[i]);
+                        fds[i] = nfd;
+
+                        /* Hmm, the fd we wanted isn't free? Then
+                         * let's remember that and try again from here */
+                        if (nfd != i+3 && restart_from < 0)
+                                restart_from = i;
+                }
+
+                if (restart_from < 0) /* Every fd landed on its slot in this pass, we are done */
+                        break;
+
+                start = restart_from;
+        }
+
+        return 0;
+}
+
+static int flag_fds(
+                const int fds[],
+                size_t n_socket_fds,
+                size_t n_fds,
+                bool nonblock) {
+
+        int r;
+
+        assert(fds || n_fds == 0);
+
+        /* Drops/Sets O_NONBLOCK (as per 'nonblock') on the first n_socket_fds fds, since that only applies
+         * to socket activation, and drops FD_CLOEXEC from all n_fds fds. */
+
+        for (size_t i = 0; i < n_fds; i++) {
+
+                if (i < n_socket_fds) {
+                        r = fd_nonblock(fds[i], nonblock);
+                        if (r < 0)
+                                return r;
+                }
+
+                /* We unconditionally drop FD_CLOEXEC from the fds,
+                 * since after all we want to pass these fds to our
+                 * children */
+
+                r = fd_cloexec(fds[i], false);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static bool is_terminal_input(ExecInput i) { /* True for any of the three EXEC_INPUT_TTY* modes */
+        return IN_SET(i,
+                      EXEC_INPUT_TTY,
+                      EXEC_INPUT_TTY_FORCE,
+                      EXEC_INPUT_TTY_FAIL);
+}
+
+static bool is_terminal_output(ExecOutput o) { /* True for the TTY mode and the two "*_AND_CONSOLE" modes */
+        return IN_SET(o,
+                      EXEC_OUTPUT_TTY,
+                      EXEC_OUTPUT_KMSG_AND_CONSOLE,
+                      EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
+}
+
+static bool is_kmsg_output(ExecOutput o) { /* True for the two output modes that include kmsg */
+        return IN_SET(o,
+                      EXEC_OUTPUT_KMSG,
+                      EXEC_OUTPUT_KMSG_AND_CONSOLE);
+}
+
+static bool exec_context_needs_term(const ExecContext *c) {
+        assert(c);
+
+        /* Return true if the execution context suggests we should set $TERM to something useful. */
+
+        if (is_terminal_input(c->std_input))
+                return true;
+
+        if (is_terminal_output(c->std_output))
+                return true;
+
+        if (is_terminal_output(c->std_error))
+                return true;
+
+        return !!c->tty_path; /* A configured TTY path counts too, even if no stream is in a terminal mode */
+}
+
+static int open_null_as(int flags, int nfd) { /* Opens /dev/null with 'flags' and hands the fd to move_fd() with nfd as target */
+        int fd;
+
+        assert(nfd >= 0);
+
+        fd = open("/dev/null", flags|O_NOCTTY);
+        if (fd < 0)
+                return -errno;
+
+        return move_fd(fd, nfd, false);
+}
+
+static int connect_journal_socket(
+                int fd,
+                const char *log_namespace,
+                uid_t uid,
+                gid_t gid) {
+
+        uid_t olduid = UID_INVALID;
+        gid_t oldgid = GID_INVALID;
+        const char *j;
+        int r;
+
+        j = log_namespace ? /* With a log namespace set, use that namespace's stdout socket path */
+                strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
+                "/run/systemd/journal/stdout";
+
+        if (gid_is_valid(gid)) { /* Switch the effective GID for the duration of the connect, remembering the old one */
+                oldgid = getgid();
+
+                if (setegid(gid) < 0)
+                        return -errno;
+        }
+
+        if (uid_is_valid(uid)) { /* Same for the effective UID; on failure only the GID needs to be restored */
+                olduid = getuid();
+
+                if (seteuid(uid) < 0) {
+                        r = -errno;
+                        goto restore_gid;
+                }
+        }
+
+        r = connect_unix_path(fd, AT_FDCWD, j);
+
+        /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
+           an LSM interferes. */
+
+        if (uid_is_valid(uid))
+                (void) seteuid(olduid);
+
+ restore_gid:
+        if (gid_is_valid(gid))
+                (void) setegid(oldgid);
+
+        return r; /* Result of connect_unix_path(), or of the failed seteuid() */
+}
+
+static int connect_logger_as(
+                const ExecContext *context,
+                const ExecParameters *params,
+                ExecOutput output,
+                const char *ident,
+                int nfd,
+                uid_t uid,
+                gid_t gid) {
+
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(output < _EXEC_OUTPUT_MAX);
+        assert(ident);
+        assert(nfd >= 0);
+        /* Connects a new stream socket to the journal's stdout socket and installs it as fd 'nfd'. */
+        fd = socket(AF_UNIX, SOCK_STREAM, 0);
+        if (fd < 0)
+                return -errno;
+
+        r = connect_journal_socket(fd, context->log_namespace, uid, gid);
+        if (r < 0)
+                return r;
+        /* The socket is only ever written to from here, hence shut down its reading direction. */
+        if (shutdown(fd, SHUT_RD) < 0)
+                return -errno;
+
+        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
+        /* Send the header: seven newline-terminated fields, in the order of the arguments below. */
+        if (dprintf(fd,
+                "%s\n"
+                "%s\n"
+                "%i\n"
+                "%i\n"
+                "%i\n"
+                "%i\n"
+                "%i\n",
+                context->syslog_identifier ?: ident,
+                params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
+                context->syslog_priority,
+                !!context->syslog_level_prefix,
+                false,
+                is_kmsg_output(output),
+                is_terminal_output(output)) < 0)
+                return -errno;
+
+        return move_fd(TAKE_FD(fd), nfd, false);
+}
+
+static int open_terminal_as(const char *path, int flags, int nfd) {
+        int tty_fd;
+
+        /* Opens the terminal 'path' (O_NOCTTY is always added) and installs it as descriptor 'nfd'. */
+        assert(path);
+        assert(nfd >= 0);
+
+        tty_fd = open_terminal(path, O_NOCTTY | flags);
+        if (tty_fd < 0)
+                return tty_fd;
+        return move_fd(tty_fd, nfd, /* cloexec= */ false);
+}
+
+static int acquire_path(const char *path, int flags, mode_t mode) {
+        _cleanup_close_ int fd = -EBADF;
+        int accmode, r;
+
+        assert(path);
+
+        /* Opens 'path' for use as a stdio file and returns the new fd. Paths opened for writing are
+         * created if missing. A path that turns out to be an AF_UNIX socket is connected to instead. */
+        accmode = flags & O_ACCMODE;
+        if (accmode == O_WRONLY || accmode == O_RDWR)
+                flags |= O_CREAT;
+
+        fd = open(path, flags|O_NOCTTY, mode);
+        if (fd >= 0)
+                return TAKE_FD(fd);
+        if (errno != ENXIO) /* on Linux, open() of an AF_UNIX file system socket yields ENXIO */
+                return -errno;
+
+        /* Possibly an AF_UNIX socket then, so let's see whether we can connect to it. */
+        fd = socket(AF_UNIX, SOCK_STREAM, 0);
+        if (fd < 0)
+                return -errno;
+
+        r = connect_unix_path(fd, AT_FDCWD, path);
+        if (r == -ENOTSOCK || r == -EINVAL)
+                return -ENXIO; /* not an AF_UNIX socket after all: propagate the initial error */
+        if (r < 0)
+                return r;
+
+        /* Shut down the direction of the connection that the access mode does not ask for. */
+        if (accmode == O_RDONLY)
+                r = shutdown(fd, SHUT_WR);
+        else if (accmode == O_WRONLY)
+                r = shutdown(fd, SHUT_RD);
+        else
+                r = 0;
+        if (r < 0)
+                return -errno;
+
+        return TAKE_FD(fd);
+}
+
+static int fixup_input(
+                const ExecContext *context,
+                int socket_fd,
+                bool apply_tty_stdin) {
+
+        bool downgrade;
+
+        assert(context);
+
+        /* Returns the stdin mode to actually use. The configured mode is replaced by EXEC_INPUT_NULL
+         * when it cannot be honoured: a TTY mode while 'apply_tty_stdin' is off, socket input without
+         * a socket, or data input without any data. */
+
+        downgrade =
+                (!apply_tty_stdin && is_terminal_input(context->std_input)) ||
+                (socket_fd < 0 && context->std_input == EXEC_INPUT_SOCKET) ||
+                (context->stdin_data_size == 0 && context->std_input == EXEC_INPUT_DATA);
+
+        if (downgrade)
+                return EXEC_INPUT_NULL;
+        return context->std_input;
+}
+
+static int fixup_output(ExecOutput output, int socket_fd) {
+        /* Socket output without an actual socket falls back to inheriting; all else is kept as is. */
+
+        if (socket_fd < 0 && output == EXEC_OUTPUT_SOCKET)
+                output = EXEC_OUTPUT_INHERIT;
+        return output;
+}
+
+static int setup_input(
+                const ExecContext *context,
+                const ExecParameters *params,
+                int socket_fd,
+                const int named_iofds[static 3]) {
+
+        ExecInput i;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(named_iofds);
+        /* Sets up stdin (fd 0) for the command. An fd passed in via params->stdin_fd takes precedence. */
+        if (params->stdin_fd >= 0) {
+                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
+                        return -errno;
+
+                /* Try to make this the controlling tty, if it is a tty, and reset it */
+                if (isatty(STDIN_FILENO)) {
+                        (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
+
+                        if (context->tty_reset)
+                                (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
+
+                        (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
+                }
+
+                return STDIN_FILENO;
+        }
+        /* Otherwise go by the configured input mode, after fixup_input() applied its fallbacks. */
+        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
+
+        switch (i) {
+
+        case EXEC_INPUT_NULL:
+                return open_null_as(O_RDONLY, STDIN_FILENO);
+
+        case EXEC_INPUT_TTY:
+        case EXEC_INPUT_TTY_FORCE:
+        case EXEC_INPUT_TTY_FAIL: {
+                _cleanup_close_ int tty_fd = -EBADF;
+                const char *tty_path;
+
+                tty_path = ASSERT_PTR(exec_context_tty_path(context));
+                /* TTY_FAIL maps to the TRY flag, TTY_FORCE to FORCE, plain TTY to WAIT; no timeout is set. */
+                tty_fd = acquire_terminal(tty_path,
+                                          i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
+                                          i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
+                                                                      ACQUIRE_TERMINAL_WAIT,
+                                          USEC_INFINITY);
+                if (tty_fd < 0)
+                        return tty_fd;
+
+                r = exec_context_apply_tty_size(context, tty_fd, tty_path);
+                if (r < 0)
+                        return r;
+
+                r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
+                if (r < 0)
+                        return r;
+                /* The terminal is stdin now, so make sure the cleanup handler won't close it. */
+                TAKE_FD(tty_fd);
+                return r;
+        }
+
+        case EXEC_INPUT_SOCKET:
+                assert(socket_fd >= 0);
+
+                return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
+
+        case EXEC_INPUT_NAMED_FD:
+                assert(named_iofds[STDIN_FILENO] >= 0);
+
+                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
+                return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
+
+        case EXEC_INPUT_DATA: {
+                int fd;
+
+                fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
+                if (fd < 0)
+                        return fd;
+
+                return move_fd(fd, STDIN_FILENO, false);
+        }
+
+        case EXEC_INPUT_FILE: {
+                bool rw;
+                int fd;
+
+                assert(context->stdio_file[STDIN_FILENO]);
+                /* Open the file read-write if stdout or stderr are configured as plain file on the same path. */
+                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
+                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
+
+                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
+                if (fd < 0)
+                        return fd;
+
+                return move_fd(fd, STDIN_FILENO, false);
+        }
+
+        default:
+                assert_not_reached();
+        }
+}
+
+static bool can_inherit_stderr_from_stdout(
+                const ExecContext *context,
+                ExecOutput o,
+                ExecOutput e) {
+
+        assert(context);
+        /* Can stderr simply be dup()ed from the stdout fd, given stdout mode 'o' and stderr mode 'e'? */
+        if (e == EXEC_OUTPUT_INHERIT)
+                return true;
+        if (e != o)
+                return false;
+        /* Same mode for both: where the mode carries a name or a path, those have to match as well. */
+        switch (e) {
+        case EXEC_OUTPUT_NAMED_FD:
+                return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
+        case EXEC_OUTPUT_FILE:
+        case EXEC_OUTPUT_FILE_APPEND:
+        case EXEC_OUTPUT_FILE_TRUNCATE:
+                return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
+        default:
+                return true;
+        }
+}
+
+static int setup_output(
+                const ExecContext *context,
+                const ExecParameters *params,
+                int fileno,
+                int socket_fd,
+                const int named_iofds[static 3],
+                const char *ident,
+                uid_t uid,
+                gid_t gid,
+                dev_t *journal_stream_dev,
+                ino_t *journal_stream_ino) {
+
+        ExecOutput o;
+        ExecInput i;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(ident);
+        assert(journal_stream_dev);
+        assert(journal_stream_ino);
+        /* Sets up stdout or stderr (selected by 'fileno'); fds passed in via params take precedence. */
+        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
+
+                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
+                        return -errno;
+
+                return STDOUT_FILENO;
+        }
+
+        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
+                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
+                        return -errno;
+
+                return STDERR_FILENO;
+        }
+        /* Determine the effective stdin and stdout modes, the decisions below depend on both. */
+        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
+        o = fixup_output(context->std_output, socket_fd);
+
+        if (fileno == STDERR_FILENO) {
+                ExecOutput e;
+                e = fixup_output(context->std_error, socket_fd);
+
+                /* This expects the input and output are already set up */
+
+                /* Don't change the stderr file descriptor if we inherit all
+                 * the way and are not on a tty */
+                if (e == EXEC_OUTPUT_INHERIT &&
+                    o == EXEC_OUTPUT_INHERIT &&
+                    i == EXEC_INPUT_NULL &&
+                    !is_terminal_input(context->std_input) &&
+                    getppid() != 1)
+                        return fileno;
+
+                /* Duplicate from stdout if possible */
+                if (can_inherit_stderr_from_stdout(context, o, e))
+                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
+
+                o = e;
+
+        } else if (o == EXEC_OUTPUT_INHERIT) {
+                /* If input got downgraded, inherit the original value */
+                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
+                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
+
+                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
+                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
+                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
+                if (getppid() != 1)
+                        return fileno;
+
+                /* We need to open /dev/null here anew, to get the right access mode. */
+                return open_null_as(O_WRONLY, fileno);
+        }
+
+        switch (o) {
+
+        case EXEC_OUTPUT_NULL:
+                return open_null_as(O_WRONLY, fileno);
+
+        case EXEC_OUTPUT_TTY:
+                if (is_terminal_input(i))
+                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+                /* We don't reset the terminal if this is just about output */
+                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
+
+        case EXEC_OUTPUT_KMSG:
+        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
+        case EXEC_OUTPUT_JOURNAL:
+        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
+                r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
+                if (r < 0) {
+                        log_exec_warning_errno(context,
+                                               params,
+                                               r,
+                                               "Failed to connect %s to the journal socket, ignoring: %m",
+                                               fileno == STDOUT_FILENO ? "stdout" : "stderr");
+                        r = open_null_as(O_WRONLY, fileno);
+                } else {
+                        struct stat st;
+
+                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
+                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
+                         * services to detect whether they are connected to the journal or not.
+                         *
+                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
+                         * about STDERR as that's usually the best way to do logging. */
+
+                        if (fstat(fileno, &st) >= 0 &&
+                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
+                                *journal_stream_dev = st.st_dev;
+                                *journal_stream_ino = st.st_ino;
+                        }
+                }
+                return r;
+
+        case EXEC_OUTPUT_SOCKET:
+                assert(socket_fd >= 0);
+
+                return RET_NERRNO(dup2(socket_fd, fileno));
+
+        case EXEC_OUTPUT_NAMED_FD:
+                assert(named_iofds[fileno] >= 0);
+
+                (void) fd_nonblock(named_iofds[fileno], false);
+                return RET_NERRNO(dup2(named_iofds[fileno], fileno));
+
+        case EXEC_OUTPUT_FILE:
+        case EXEC_OUTPUT_FILE_APPEND:
+        case EXEC_OUTPUT_FILE_TRUNCATE: {
+                bool rw;
+                int fd, flags;
+
+                assert(context->stdio_file[fileno]);
+
+                rw = o == EXEC_OUTPUT_FILE && context->std_input == EXEC_INPUT_FILE &&
+                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
+                /* Only the plain file mode may share stdin's fd: setup_input() opens it O_RDWR just then. */
+                if (rw)
+                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+                flags = O_WRONLY;
+                if (o == EXEC_OUTPUT_FILE_APPEND)
+                        flags |= O_APPEND;
+                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
+                        flags |= O_TRUNC;
+
+                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
+                if (fd < 0)
+                        return fd;
+
+                return move_fd(fd, fileno, 0);
+        }
+
+        default:
+                assert_not_reached();
+        }
+}
+
+static int chown_terminal(int fd, uid_t uid) {
+        int r;
+
+        assert(fd >= 0);
+
+        /* Applies TTY_MODE and owner 'uid' to the TTY behind 'fd'. Returns 1 if that was done, 0 if
+         * 'fd' is not a TTY at all (nothing is changed then), and a negative errno on failure. */
+        if (isatty(fd) <= 0) {
+                if (errno == EINVAL || errno == ENOTTY)
+                        return 0; /* not a tty */
+
+                return -errno;
+        }
+
+        /* GID_INVALID is passed for the group; only the UID and the access mode are specified. */
+        r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
+        if (r < 0)
+                return r;
+        return 1;
+}
+
+static int setup_confirm_stdio(
+                const ExecContext *context,
+                const char *vc,
+                int *ret_saved_stdin,
+                int *ret_saved_stdout) {
+
+        _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
+        int r;
+
+        assert(ret_saved_stdin);
+        assert(ret_saved_stdout);
+        /* Points stdin and stdout at the console 'vc', returning duplicates of the previous fds. */
+        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
+        if (saved_stdin < 0)
+                return -errno;
+
+        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
+        if (saved_stdout < 0)
+                return -errno;
+        /* The duplicates above are allocated at fd 3 or higher, i.e. outside of the stdio range. */
+        fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
+        if (fd < 0)
+                return fd;
+
+        r = chown_terminal(fd, getuid());
+        if (r < 0)
+                return r;
+
+        r = reset_terminal_fd(fd, /* switch_to_text= */ true);
+        if (r < 0)
+                return r;
+
+        r = exec_context_apply_tty_size(context, fd, vc);
+        if (r < 0)
+                return r;
+
+        r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
+        TAKE_FD(fd);
+        if (r < 0)
+                return r;
+        /* Success: hand the saved fds to the caller, which restores and closes them later on. */
+        *ret_saved_stdin = TAKE_FD(saved_stdin);
+        *ret_saved_stdout = TAKE_FD(saved_stdout);
+        return 0;
+}
+
+static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
+        assert(err < 0);
+        assert(unit_id);
+
+        /* Reports on 'fd' why the confirmation could not be obtained; timeouts get their own wording. */
+        if (err != -ETIMEDOUT) {
+                errno = -err; /* consumed by %m below */
+                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
+        } else
+                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
+}
+
+static void write_confirm_error(int err, const char *vc, const char *unit_id) {
+        _cleanup_close_ int console_fd = -EBADF;
+
+        assert(vc);
+
+        /* Like write_confirm_error_fd(), but opens the console 'vc' itself. Silently does nothing if
+         * the console cannot be opened. */
+        console_fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
+        if (console_fd >= 0)
+                write_confirm_error_fd(err, console_fd, unit_id);
+}
+
+static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
+        int ret = 0;
+
+        assert(saved_stdin);
+        assert(saved_stdout);
+
+        /* Gives up the terminal, puts the saved stdin/stdout back in place and closes the saved copies.
+         * All steps are attempted; the error of the last failing dup2(), if any, is returned. */
+        release_terminal();
+
+        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
+                ret = -errno;
+
+        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
+                ret = -errno;
+
+        *saved_stdin = safe_close(*saved_stdin);
+        *saved_stdout = safe_close(*saved_stdout);
+
+        return ret;
+}
+
+enum {
+        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
+        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
+        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also the answer on internal errors */
+};
+
+static bool confirm_spawn_disabled(void) { /* true if the flag file checked below is accessible */
+        return !(access("/run/systemd/confirm_spawn_disabled", F_OK) < 0);
+}
+
+static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
+        int saved_stdout = -1, saved_stdin = -1, r;
+        _cleanup_free_ char *e = NULL;
+        char c;
+
+        assert(context);
+        assert(params);
+
+        /* Asks on the console whether to run 'cmdline'; returns one of the CONFIRM_* values. For any
+         * internal errors, assume a positive response. */
+        r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
+        if (r < 0) {
+                write_confirm_error(r, params->confirm_spawn, params->unit_id);
+                return CONFIRM_EXECUTE;
+        }
+
+        /* confirm_spawn might have been disabled while we were sleeping. */
+        if (!params->confirm_spawn || confirm_spawn_disabled()) {
+                r = CONFIRM_EXECUTE;
+                goto restore_stdio;
+        }
+
+        e = ellipsize(cmdline, 60, 100);
+        if (!e) {
+                log_oom();
+                r = CONFIRM_EXECUTE;
+                goto restore_stdio;
+        }
+
+        for (;;) {
+                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
+                if (r < 0) {
+                        write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
+                        r = CONFIRM_EXECUTE;
+                        goto restore_stdio;
+                }
+
+                switch (c) {
+                case 'c':
+                        printf("Resuming normal execution.\n");
+                        manager_disable_confirm_spawn();
+                        r = CONFIRM_EXECUTE;
+                        break;
+                case 'D':
+                        printf("  Unit: %s\n", params->unit_id);
+                        exec_context_dump(context, stdout, "  ");
+                        exec_params_dump(params, stdout, "  ");
+                        continue; /* ask again */
+                case 'f':
+                        printf("Failing execution.\n");
+                        r = CONFIRM_PRETEND_FAILURE;
+                        break;
+                case 'h':
+                        printf("  c - continue, proceed without asking anymore\n"
+                               "  D - dump, show the state of the unit\n"
+                               "  f - fail, don't execute the command and pretend it failed\n"
+                               "  h - help\n"
+                               "  i - info, show a short summary of the unit\n"
+                               "  j - jobs, show jobs that are in progress\n"
+                               "  s - skip, don't execute the command and pretend it succeeded\n"
+                               "  y - yes, execute the command\n");
+                        continue; /* ask again */
+                case 'i':
+                        printf("  Unit:        %s\n"
+                               "  Command:     %s\n", params->unit_id, cmdline);
+                        continue; /* ask again */
+                case 'j':
+                        if (sigqueue(getppid(),
+                                     SIGRTMIN+18,
+                                     (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0) {
+                                write_confirm_error_fd(-errno, STDOUT_FILENO, params->unit_id);
+                                r = CONFIRM_EXECUTE;
+                                goto restore_stdio; /* internal error: never leave with stdio still redirected */
+                        }
+                        continue; /* ask again */
+                case 'n': /* 'n' was removed in favor of 'f'. */
+                        printf("Didn't understand 'n', did you mean 'f'?\n");
+                        continue; /* ask again */
+                case 's':
+                        printf("Skipping execution.\n");
+                        r = CONFIRM_PRETEND_SUCCESS;
+                        break;
+                case 'y':
+                        r = CONFIRM_EXECUTE;
+                        break;
+                default:
+                        assert_not_reached();
+                }
+                break;
+        }
+
+restore_stdio:
+        restore_confirm_stdio(&saved_stdin, &saved_stdout);
+        return r;
+}
+
+static int get_fixed_user(
+                const char *user_or_uid,
+                const char **ret_username,
+                uid_t *ret_uid,
+                gid_t *ret_gid,
+                const char **ret_home,
+                const char **ret_shell) {
+
+        const char *name;
+        int r;
+
+        assert(user_or_uid);
+        assert(ret_username);
+
+        /* Resolves a user name or numeric UID. Note that we don't set $HOME or $SHELL if they are not
+         * particularly enlightening anyway (i.e. are "/" or "/bin/nologin"). */
+
+        name = user_or_uid;
+        r = get_user_creds(&name, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
+        if (r < 0)
+                return r;
+
+        *ret_username = name; /* get_user_creds() normalized this to the user name */
+        return 0;
+}
+
+static int get_fixed_group(
+                const char *group_or_gid,
+                const char **ret_groupname,
+                gid_t *ret_gid) {
+
+        const char *name;
+        int r;
+
+        assert(group_or_gid);
+        assert(ret_groupname);
+
+        /* Resolves a group name or numeric GID into the group's name and GID. */
+        name = group_or_gid;
+        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
+        if (r < 0)
+                return r;
+        *ret_groupname = name; /* get_group_creds() normalized this to the group name */
+        return 0;
+}
+
+static int get_supplementary_groups(const ExecContext *c, const char *user,
+                                    const char *group, gid_t gid,
+                                    gid_t **supplementary_gids, int *ngids) {
+        int r, k = 0;
+        int ngroups_max;
+        bool keep_groups = false;
+        gid_t *groups = NULL;
+        _cleanup_free_ gid_t *l_gids = NULL;
+        /* Builds the list of supplementary GIDs: the user's groups plus those in c->supplementary_groups. */
+        assert(c);
+
+        /*
+         * If user is given, then lookup GID and supplementary groups list.
+         * We avoid NSS lookups for gid=0. Also we have to initialize groups
+         * here and as early as possible so we keep the list of supplementary
+         * groups of the caller.
+         */
+        if (user && gid_is_valid(gid) && gid != 0) {
+                /* First step, initialize groups from the group database (initgroups()) */
+                if (initgroups(user, gid) < 0)
+                        return -errno;
+
+                keep_groups = true;
+        }
+        /* Nothing configured: return without touching *supplementary_gids and *ngids. */
+        if (strv_isempty(c->supplementary_groups))
+                return 0;
+
+        /*
+         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
+         * be positive, otherwise fail.
+         */
+        errno = 0;
+        ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
+        if (ngroups_max <= 0)
+                return errno_or_else(EOPNOTSUPP);
+
+        l_gids = new(gid_t, ngroups_max);
+        if (!l_gids)
+                return -ENOMEM;
+
+        if (keep_groups) {
+                /*
+                 * Lookup the list of groups that the user belongs to, we
+                 * avoid NSS lookups here too for gid=0.
+                 */
+                k = ngroups_max;
+                if (getgrouplist(user, gid, l_gids, &k) < 0)
+                        return -EINVAL;
+        } else
+                k = 0;
+
+        STRV_FOREACH(i, c->supplementary_groups) {
+                const char *g;
+
+                if (k >= ngroups_max)
+                        return -E2BIG;
+
+                g = *i;
+                r = get_group_creds(&g, l_gids+k, 0);
+                if (r < 0)
+                        return r;
+
+                k++;
+        }
+
+        /*
+         * Report an empty list. NOTE(review): an empty SupplementaryGroups= already returned above
+         * and the loop adds one entry per element, so this branch looks unreachable — confirm.
+         */
+        if (k == 0) {
+                *ngids = 0;
+                return 0;
+        }
+
+        /* Otherwise get the final list of supplementary groups */
+        groups = memdup(l_gids, sizeof(gid_t) * k);
+        if (!groups)
+                return -ENOMEM;
+
+        *supplementary_gids = groups;
+        *ngids = k;
+        /* The array was handed to the caller through *supplementary_gids; drop our pointer to it. */
+        groups = NULL;
+
+        return 0;
+}
+
+static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
+        int r;
+
+        /* Applies the group credentials: first the supplementary group list (only if there is one),
+         * then real, effective and saved GID (only if 'gid' is valid). Returns 0 or a negative errno. */
+
+        r = ngids > 0 ? maybe_setgroups(ngids, supplementary_gids) : 0;
+        if (r < 0)
+                return r;
+
+        if (!gid_is_valid(gid))
+                return 0;
+
+        if (setresgid(gid, gid, gid) < 0)
+                return -errno;
+
+        return 0;
+}
+
+static int set_securebits(unsigned bits, unsigned mask) {
+        unsigned before, after;
+        int q;
+
+        /* Clears the securebits in 'mask', then sets those in 'bits'. Returns 1 if the process'
+         * securebits were changed, 0 if they already had the wanted value, -errno on failure. */
+        q = prctl(PR_GET_SECUREBITS);
+        if (q < 0)
+                return -errno;
+        before = (unsigned) q;
+        after = (before & ~mask) | bits;
+        if (after == before)
+                return 0;
+
+        if (prctl(PR_SET_SECUREBITS, after) < 0)
+                return -errno;
+        return 1;
+}
+
+static int enforce_user(
+                const ExecContext *context,
+                uid_t uid,
+                uint64_t capability_ambient_set) {
+
+        bool keep_caps;
+        int r;
+
+        assert(context);
+
+        /* Sets (but doesn't look up) the UID. If ambient capabilities or secure bits are configured and
+         * we are switching to a non-root user, the capabilities have to survive the switch (setting
+         * secure bits requires CAP_SETPCAP), hence KEEP_CAPS is added to the securebits first. */
+
+        if (!uid_is_valid(uid))
+                return 0;
+
+        keep_caps = uid != 0 && (capability_ambient_set != 0 || context->secure_bits != 0);
+        if (keep_caps) {
+                r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Now actually switch the real, effective and saved UID */
+        if (setresuid(uid, uid, uid) < 0)
+                return -errno;
+
+        /* We now hold the necessary capabilities but are otherwise a normal user. setresuid() may
+         * have left the capability sets in need of a clean-up; that is done outside of this call. */
+        return 0;
+}
+
+#if HAVE_PAM
+
+static int null_conv(
+                int num_msg,
+                const struct pam_message **msg,
+                struct pam_response **resp,
+                void *appdata_ptr) {
+
+        /* We don't support conversations: all arguments are ignored, every request gets PAM_CONV_ERR */
+
+        return PAM_CONV_ERR;
+}
+
+static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
+        int close_code, cred_code;
+
+        assert(handle);
+
+        /* Both steps are always attempted; the first PAM error code encountered (if any) is returned. */
+        close_code = pam_close_session(handle, flags);
+        if (close_code != PAM_SUCCESS)
+                log_debug("pam_close_session() failed: %s", pam_strerror(handle, close_code));
+
+        cred_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
+        if (cred_code != PAM_SUCCESS)
+                log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, cred_code));
+        return close_code == PAM_SUCCESS ? cred_code : close_code;
+}
+
+#endif
+
+/* Opens a PAM session for 'user' under the PAM service 'name' and forks off an "(sd-pam)" helper process
+ * which closes the session again once this process has gone away.
+ *
+ * 'tty' may be NULL, in which case the TTY name is read off stdin if stdin is a TTY. 'uid'/'gid' are the
+ * credentials the helper process drops to. 'fds'/'n_fds' are closed in the helper process only. On
+ * success *env is replaced (via strv_free_and_replace()) by the environment list PAM reports.
+ *
+ * On failure a negative errno-style value is returned; PAM errors are reported as -EPERM since they do
+ * not map to errno. When built without PAM support this does nothing and returns 0. */
+static int setup_pam(
+                const char *name,
+                const char *user,
+                uid_t uid,
+                gid_t gid,
+                const char *tty,
+                char ***env, /* updated on success */
+                const int fds[], size_t n_fds) {
+
+#if HAVE_PAM
+
+        static const struct pam_conv conv = {
+                .conv = null_conv,
+                .appdata_ptr = NULL
+        };
+
+        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
+        _cleanup_strv_free_ char **e = NULL;
+        pam_handle_t *handle = NULL;
+        sigset_t old_ss;
+        int pam_code = PAM_SUCCESS, r;
+        bool close_session = false;
+        pid_t pam_pid = 0, parent_pid;
+        int flags = 0;
+
+        assert(name);
+        assert(user);
+        assert(env);
+
+        /* We set up PAM in the parent process, then fork. The child
+         * will then stay around until killed via PR_SET_PDEATHSIG or
+         * systemd via the cgroup logic. It will then remove the PAM
+         * session again. The parent process will exec() the actual
+         * daemon. We do things this way to ensure that the main PID
+         * of the daemon is the one we initially fork()ed. */
+
+        r = barrier_create(&barrier);
+        if (r < 0)
+                goto fail;
+
+        if (log_get_max_level() < LOG_DEBUG)
+                flags |= PAM_SILENT;
+
+        pam_code = pam_start(name, user, &conv, &handle);
+        if (pam_code != PAM_SUCCESS) {
+                handle = NULL;
+                goto fail;
+        }
+
+        if (!tty) {
+                _cleanup_free_ char *q = NULL;
+
+                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
+                 * out if that's the case, and read the TTY off it. */
+
+                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
+                        tty = strjoina("/dev/", q);
+        }
+
+        if (tty) {
+                pam_code = pam_set_item(handle, PAM_TTY, tty);
+                if (pam_code != PAM_SUCCESS)
+                        goto fail;
+        }
+
+        STRV_FOREACH(nv, *env) {
+                pam_code = pam_putenv(handle, *nv);
+                if (pam_code != PAM_SUCCESS)
+                        goto fail;
+        }
+
+        pam_code = pam_acct_mgmt(handle, flags);
+        if (pam_code != PAM_SUCCESS)
+                goto fail;
+
+        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
+        if (pam_code != PAM_SUCCESS)
+                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
+
+        pam_code = pam_open_session(handle, flags);
+        if (pam_code != PAM_SUCCESS)
+                goto fail;
+
+        /* From here on the failure path has to close the session again. */
+        close_session = true;
+
+        e = pam_getenvlist(handle);
+        if (!e) {
+                pam_code = PAM_BUF_ERR;
+                goto fail;
+        }
+
+        /* Block SIGTERM, so that we know that it won't get lost in the child */
+
+        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
+
+        parent_pid = getpid_cached();
+
+        r = safe_fork("(sd-pam)", 0, &pam_pid);
+        if (r < 0)
+                goto fail;
+        if (r == 0) {
+                int sig, ret = EXIT_PAM;
+
+                /* The child's job is to reset the PAM session on termination */
+                barrier_set_role(&barrier, BARRIER_CHILD);
+
+                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
+                 * those fds are open here that have been opened by PAM. */
+                (void) close_many(fds, n_fds);
+
+                /* Drop privileges - we don't need any to pam_close_session and this will make
+                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
+                 * threads to fail to exit normally */
+
+                r = maybe_setgroups(0, NULL);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
+                if (setresgid(gid, gid, gid) < 0)
+                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
+                if (setresuid(uid, uid, uid) < 0)
+                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
+
+                (void) ignore_signals(SIGPIPE);
+
+                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
+                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
+                 * this way. We rely on the control groups kill logic to do the rest for us. */
+                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
+                        goto child_finish;
+
+                /* Tell the parent that our setup is done. This is especially important regarding dropping
+                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
+                 *
+                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
+                (void) barrier_place(&barrier);
+
+                /* Check if our parent process might already have died? */
+                if (getppid() == parent_pid) {
+                        sigset_t ss;
+
+                        assert_se(sigemptyset(&ss) >= 0);
+                        assert_se(sigaddset(&ss, SIGTERM) >= 0);
+
+                        for (;;) {
+                                /* sigwait() reports failure through a positive error number in its
+                                 * return value; it neither returns a negative value nor sets errno.
+                                 * Only trust 'sig' if the call actually succeeded. */
+                                int k = sigwait(&ss, &sig);
+                                if (k == EINTR)
+                                        continue;
+                                if (k != 0)
+                                        goto child_finish;
+
+                                assert(sig == SIGTERM);
+                                break;
+                        }
+                }
+
+                /* If our parent died we'll end the session */
+                if (getppid() != parent_pid) {
+                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
+                        if (pam_code != PAM_SUCCESS)
+                                goto child_finish;
+                }
+
+                ret = 0;
+
+        child_finish:
+                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
+                 * know about this. See pam_end(3) */
+                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
+                _exit(ret);
+        }
+
+        barrier_set_role(&barrier, BARRIER_PARENT);
+
+        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
+         * here. */
+        handle = NULL;
+
+        /* Unblock SIGTERM again in the parent */
+        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
+
+        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
+         * this fd around. */
+        closelog();
+
+        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
+         * recover. However, warn loudly if it happens. */
+        if (!barrier_place_and_sync(&barrier))
+                log_error("PAM initialization failed");
+
+        return strv_free_and_replace(*env, e);
+
+fail:
+        if (pam_code != PAM_SUCCESS) {
+                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
+                r = -EPERM;  /* PAM errors do not map to errno */
+        } else
+                log_error_errno(r, "PAM failed: %m");
+
+        if (handle) {
+                if (close_session)
+                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
+
+                (void) pam_end(handle, pam_code | flags);
+        }
+
+        closelog();
+        return r;
+#else
+        return 0;
+#endif
+}
+
+/* Renames the current process to "(<name>)", where <name> is the file name component of 'path', cut
+ * down to its last 8 characters if it is longer. If no file name can be extracted from 'path' the
+ * process is renamed to "(...)" instead. */
+static void rename_process_from_path(const char *path) {
+        _cleanup_free_ char *filename = NULL;
+        char label[11]; /* '(' + up to 8 characters + ')' + NUL */
+        size_t len, skip;
+
+        assert(path);
+
+        /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
+         * /bin/ps */
+
+        if (path_extract_filename(path, &filename) < 0) {
+                rename_process("(...)");
+                return;
+        }
+
+        /* The end of the process name is usually more interesting, since the first bit might just be
+         * "systemd-", hence keep the tail rather than the head. */
+        len = strlen(filename);
+        skip = len > 8 ? len - 8 : 0;
+        len -= skip;
+
+        label[0] = '(';
+        memcpy(label + 1, filename + skip, len);
+        label[len + 1] = ')';
+        label[len + 2] = 0;
+
+        rename_process(label);
+}
+
+/* Returns true if an address family restriction is configured: either the list is flagged as an allow
+ * list, or the set of address families is non-empty. */
+static bool context_has_address_families(const ExecContext *c) {
+        assert(c);
+
+        if (c->address_families_allow_list)
+                return true;
+
+        return !set_isempty(c->address_families);
+}
+
+/* Returns true if a system call filter is configured: either the filter is flagged as an allow list, or
+ * the filter map is non-empty. */
+static bool context_has_syscall_filters(const ExecContext *c) {
+        assert(c);
+
+        if (c->syscall_allow_list)
+                return true;
+
+        return !hashmap_isempty(c->syscall_filter);
+}
+
+/* Returns true if system call logging is configured: either the log list is flagged as an allow list,
+ * or the log map is non-empty. */
+static bool context_has_syscall_logs(const ExecContext *c) {
+        assert(c);
+
+        if (c->syscall_log_allow_list)
+                return true;
+
+        return !hashmap_isempty(c->syscall_log);
+}
+
+/* Returns true if any of the settings checked below is enabled in the context. We need NNP if we have
+ * any form of seccomp and are unprivileged. */
+static bool context_has_seccomp(const ExecContext *c) {
+
+        /* Plain boolean switches */
+        if (c->lock_personality ||
+            c->memory_deny_write_execute ||
+            c->private_devices ||
+            c->protect_clock ||
+            c->protect_hostname ||
+            c->protect_kernel_tunables ||
+            c->protect_kernel_modules ||
+            c->protect_kernel_logs)
+                return true;
+
+        if (context_has_address_families(c) ||
+            exec_context_restrict_namespaces_set(c))
+                return true;
+
+        if (c->restrict_realtime ||
+            c->restrict_suid_sgid)
+                return true;
+
+        /* Architecture restrictions, system call filters and system call logging */
+        if (!set_isempty(c->syscall_archs) ||
+            context_has_syscall_filters(c))
+                return true;
+
+        return context_has_syscall_logs(c);
+}
+
+/* Decides whether "no new privileges" shall be in effect: always if explicitly configured, never
+ * otherwise if we hold CAP_SYS_ADMIN, and else whenever any seccomp based setting is in use. */
+static bool context_has_no_new_privileges(const ExecContext *c) {
+        assert(c);
+
+        if (c->no_new_privileges)
+                return true;
+
+        /* if we are privileged, we don't need NNP */
+        return have_effective_cap(CAP_SYS_ADMIN) > 0 ? false : context_has_seccomp(c);
+}
+
+#if HAVE_SECCOMP
+
+/* Checks whether the configured system call filter leaves capget(), capset() and prctl() usable.
+ *
+ * Returns true if there is no filter at all. With an allow list all three calls must be listed; with a
+ * deny list none of them may be listed. */
+static bool seccomp_allows_drop_privileges(const ExecContext *c) {
+        void *id, *val;
+        bool has_capget = false, has_capset = false, has_prctl = false;
+
+        assert(c);
+
+        /* No syscall filter, we are allowed to drop privileges */
+        if (hashmap_isempty(c->syscall_filter))
+                return true;
+
+        HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
+                _cleanup_free_ char *name = NULL;
+
+                /* The map key is decoded as the system call number plus one. */
+                name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+                if (!name)
+                        /* The number cannot be resolved to a name on the native architecture, hence it
+                         * cannot be one of the three calls we are looking for. Don't pass NULL to
+                         * streq(). */
+                        continue;
+
+                if (streq(name, "capget"))
+                        has_capget = true;
+                else if (streq(name, "capset"))
+                        has_capset = true;
+                else if (streq(name, "prctl"))
+                        has_prctl = true;
+        }
+
+        if (c->syscall_allow_list)
+                return has_capget && has_capset && has_prctl;
+        else
+                return !(has_capget || has_capset || has_prctl);
+}
+
+/* Returns true, after logging a debug message naming 'msg', if seccomp is not available and the
+ * caller hence should skip applying the setting. Returns false if seccomp is available. */
+static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
+
+        if (!is_seccomp_available()) {
+                log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
+                return true;
+        }
+
+        return false;
+}
+
+/* Loads the configured system call filter. Does nothing and returns 0 if no filter is configured or
+ * seccomp is unavailable. If 'needs_ambient_hack' is set, the setuid filter set is added to the
+ * configured filter first. */
+static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
+        uint32_t deny_action, fallback_action, listed_action;
+
+        assert(c);
+        assert(p);
+
+        if (!context_has_syscall_filters(c) || skip_seccomp_unavailable(c, p, "SystemCallFilter="))
+                return 0;
+
+        /* What happens to a call that is refused: kill the process, or fail with the configured errno */
+        deny_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
+
+        /* Allow list: refuse by default and permit what is listed. Deny list: the other way round. */
+        fallback_action = c->syscall_allow_list ? deny_action : SCMP_ACT_ALLOW;
+        listed_action = c->syscall_allow_list ? SCMP_ACT_ALLOW : deny_action;
+
+        if (needs_ambient_hack) {
+                int k;
+
+                k = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
+                if (k < 0)
+                        return k;
+        }
+
+        return seccomp_load_syscall_filter_set_raw(fallback_action, c->syscall_filter, listed_action, false);
+}
+
+/* Loads the configured system call logging filter. Does nothing and returns 0 if no logging is
+ * configured, if seccomp is unavailable, or if SCMP_ACT_LOG is not defined at build time. */
+static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!context_has_syscall_logs(c))
+                return 0;
+
+#ifdef SCMP_ACT_LOG
+        if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
+                return 0;
+
+        /* Allow list: log nothing but the ones listed. Otherwise: log everything but the ones listed. */
+        uint32_t fallback_action = c->syscall_log_allow_list ? SCMP_ACT_ALLOW : SCMP_ACT_LOG;
+        uint32_t listed_action = c->syscall_log_allow_list ? SCMP_ACT_LOG : SCMP_ACT_ALLOW;
+
+        return seccomp_load_syscall_filter_set_raw(fallback_action, c->syscall_log, listed_action, false);
+#else
+        /* old libseccomp */
+        log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
+        return 0;
+#endif
+}
+
+/* Restricts the permitted system call architectures to the configured set. Does nothing and returns 0
+ * if the set is empty or seccomp is unavailable. */
+static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (set_isempty(c->syscall_archs) || skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
+                return 0;
+
+        return seccomp_restrict_archs(c->syscall_archs);
+}
+
+/* Applies the configured address family restriction. Does nothing and returns 0 if none is configured
+ * or seccomp is unavailable. */
+static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!context_has_address_families(c) || skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
+                return 0;
+
+        return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
+}
+
+/* Enables MemoryDenyWriteExecute=, first trying prctl(PR_SET_MDWE) and falling back to a seccomp filter
+ * if the prctl fails with EINVAL. Returns 0 if the option is off, was enabled through the prctl, or
+ * seccomp is unavailable for the fallback. */
+static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
+        int rc;
+
+        assert(c);
+        assert(p);
+
+        if (!c->memory_deny_write_execute)
+                return 0;
+
+        /* use prctl() if kernel supports it (6.3) */
+        rc = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
+        if (rc == 0) {
+                log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
+                return 0;
+        }
+
+        /* Any failure other than EINVAL is reported to the caller. */
+        if (rc < 0 && errno != EINVAL)
+                return log_exec_debug_errno(c, p, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
+
+        /* else use seccomp */
+        log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
+
+        return skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute=") ? 0 : seccomp_memory_deny_write_execute();
+}
+
+/* Applies RestrictRealtime= via seccomp. Does nothing and returns 0 if the option is off or seccomp is
+ * unavailable. */
+static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!c->restrict_realtime || skip_seccomp_unavailable(c, p, "RestrictRealtime="))
+                return 0;
+
+        return seccomp_restrict_realtime();
+}
+
+/* Applies RestrictSUIDSGID= via seccomp. Does nothing and returns 0 if the option is off or seccomp is
+ * unavailable. */
+static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!c->restrict_suid_sgid || skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
+                return 0;
+
+        return seccomp_restrict_suid_sgid();
+}
+
+/* Applies the seccomp part of ProtectKernelTunables=. Does nothing and returns 0 if the option is off or
+ * seccomp is unavailable. */
+static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+         * let's protect even those systems where this is left on in the kernel. */
+
+        if (!c->protect_kernel_tunables || skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
+                return 0;
+
+        return seccomp_protect_sysctl();
+}
+
+/* Turns off the module system calls on ProtectKernelModules=yes, making them fail with EPERM. Does
+ * nothing and returns 0 if the option is off or seccomp is unavailable. */
+static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!c->protect_kernel_modules || skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
+                return 0;
+
+        return seccomp_load_syscall_filter_set(
+                        SCMP_ACT_ALLOW,
+                        syscall_filter_sets + SYSCALL_FILTER_SET_MODULE,
+                        SCMP_ACT_ERRNO(EPERM),
+                        false);
+}
+
+/* Applies the seccomp part of ProtectKernelLogs=. Does nothing and returns 0 if the option is off or
+ * seccomp is unavailable. */
+static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!c->protect_kernel_logs || skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
+                return 0;
+
+        return seccomp_protect_syslog();
+}
+
+/* Applies the seccomp part of ProtectClock=, making the system calls of the clock filter set fail with
+ * EPERM. Does nothing and returns 0 if the option is off or seccomp is unavailable. */
+static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!c->protect_clock || skip_seccomp_unavailable(c, p, "ProtectClock="))
+                return 0;
+
+        return seccomp_load_syscall_filter_set(
+                        SCMP_ACT_ALLOW,
+                        syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK,
+                        SCMP_ACT_ERRNO(EPERM),
+                        false);
+}
+
+/* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls, making them fail with EPERM.
+ * Does nothing and returns 0 if the option is off or seccomp is unavailable. */
+static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!c->private_devices || skip_seccomp_unavailable(c, p, "PrivateDevices="))
+                return 0;
+
+        return seccomp_load_syscall_filter_set(
+                        SCMP_ACT_ALLOW,
+                        syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO,
+                        SCMP_ACT_ERRNO(EPERM),
+                        false);
+}
+
+/* Applies RestrictNamespaces= via seccomp. Does nothing and returns 0 if no restriction is set or
+ * seccomp is unavailable. */
+static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
+        assert(c);
+        assert(p);
+
+        if (!exec_context_restrict_namespaces_set(c) || skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
+                return 0;
+
+        return seccomp_restrict_namespaces(c->restrict_namespaces);
+}
+
+/* Applies LockPersonality= via seccomp, locking to the configured personality or, if none is
+ * configured, to the one opinionated_personality() reports. Does nothing and returns 0 if the option is
+ * off or seccomp is unavailable. */
+static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
+        unsigned long locked;
+
+        assert(c);
+        assert(p);
+
+        if (!c->lock_personality || skip_seccomp_unavailable(c, p, "LockPersonality="))
+                return 0;
+
+        locked = c->personality;
+
+        /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
+        if (locked == PERSONALITY_INVALID) {
+                int k;
+
+                k = opinionated_personality(&locked);
+                if (k < 0)
+                        return k;
+        }
+
+        return seccomp_lock_personality(locked);
+}
+
+#endif
+
+#if HAVE_LIBBPF
+/* Applies RestrictFileSystems= through the LSM BPF program. Does nothing and returns 0 if no
+ * restriction is set or no BPF outer map fd was passed in. */
+static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
+        int k;
+
+        assert(c);
+        assert(p);
+
+        if (!exec_context_restrict_filesystems_set(c))
+                return 0;
+
+        /* LSM BPF is unsupported or lsm_bpf_setup failed */
+        if (p->bpf_outer_map_fd < 0) {
+                log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
+                return 0;
+        }
+
+        /* We are in a new binary, so dl-open again */
+        k = dlopen_bpf();
+        if (k < 0)
+                return k;
+
+        return lsm_bpf_restrict_filesystems(
+                        c->restrict_filesystems,
+                        p->cgroup_id,
+                        p->bpf_outer_map_fd,
+                        c->restrict_filesystems_allow_list);
+}
+#endif
+
+/* Applies ProtectHostname=: unshares the UTS namespace where that is possible and, if built with
+ * seccomp support, additionally installs the hostname seccomp restrictions.
+ *
+ * A missing or prohibited UTS namespace only results in a warning. On hard failures a negative errno is
+ * returned and *ret_exit_status is set to EXIT_NAMESPACE or EXIT_SECCOMP respectively. */
+static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
+        assert(c);
+        assert(p);
+
+        if (!c->protect_hostname)
+                return 0;
+
+        if (!ns_type_supported(NAMESPACE_UTS)) {
+                log_exec_warning(c,
+                                 p,
+                                 "ProtectHostname=yes is configured, but the kernel does not "
+                                 "support UTS namespaces, ignoring namespace setup.");
+        } else if (unshare(CLONE_NEWUTS) < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
+                        /* Tolerated: we are not allowed to, or cannot, create the namespace */
+                        log_exec_warning(c,
+                                         p,
+                                         "ProtectHostname=yes is configured, but UTS namespace setup is "
+                                         "prohibited (container manager?), ignoring namespace setup.");
+                } else {
+                        *ret_exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(c,
+                                                    p,
+                                                    errno,
+                                                    "Failed to set up UTS namespacing: %m");
+                }
+        }
+
+#if HAVE_SECCOMP
+        if (!skip_seccomp_unavailable(c, p, "ProtectHostname=")) {
+                int k;
+
+                k = seccomp_protect_hostname();
+                if (k < 0) {
+                        *ret_exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(c, p, k, "Failed to apply hostname restrictions: %m");
+                }
+        }
+#endif
+
+        return 0;
+}
+
+/* Waits on the idle pipe before continuing. Closes fds 1 and 2 of the array right away, then waits for
+ * POLLHUP on fd 0. If that wait times out and fd 3 is valid, one byte is written to fd 3 and, if the
+ * write succeeded, fd 0 is waited on once more. All four fds are closed by the time this returns. */
+static void do_idle_pipe_dance(int idle_pipe[static 4]) {
+        assert(idle_pipe);
+
+        idle_pipe[1] = safe_close(idle_pipe[1]);
+        idle_pipe[2] = safe_close(idle_pipe[2]);
+
+        if (idle_pipe[0] >= 0) {
+                bool timed_out;
+
+                timed_out = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC) == 0;
+
+                /* Signal systemd that we are bored and want to continue, and if that worked wait for
+                 * systemd to react to the signal. */
+                if (idle_pipe[3] >= 0 && timed_out && write(idle_pipe[3], "x", 1) > 0)
+                        (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
+
+                idle_pipe[0] = safe_close(idle_pipe[0]);
+        }
+
+        idle_pipe[3] = safe_close(idle_pipe[3]);
+}
+
+/* Forward declaration of the lookup function that the DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING()
+ * invocation below provides. */
+static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
+
+/* Maps each ExecDirectoryType to the name of the environment variable through which the selected
+ * directories are passed to the service payload. */
+static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
+        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
+        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
+        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
+        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
+
+/* Assembles the environment variables the service manager itself generates for the payload (socket
+ * activation, watchdog, user identity, invocation ID, terminal type, journal stream, directories,
+ * credentials, memory pressure) into a newly allocated, NULL terminated string vector.
+ *
+ * 'n_fds' is the number of passed file descriptors; 'home', 'username' and 'shell' may be NULL;
+ * 'journal_stream_dev'/'journal_stream_ino' are only used if both are non-zero; 'cgroup_context' and
+ * 'memory_pressure_path' may be NULL.
+ *
+ * On success returns 0 and stores the vector in *ret. On failure returns a negative errno-style value
+ * and leaves *ret untouched. */
+static int build_environment(
+                const ExecContext *c,
+                const ExecParameters *p,
+                const CGroupContext *cgroup_context,
+                size_t n_fds,
+                const char *home,
+                const char *username,
+                const char *shell,
+                dev_t journal_stream_dev,
+                ino_t journal_stream_ino,
+                const char *memory_pressure_path,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **our_env = NULL;
+        size_t n_env = 0;
+        char *x;
+        int r;
+
+        assert(c);
+        assert(p);
+        assert(ret);
+
+        /* The array is sized up front and never grown: N_ENV_VARS has to cover every variable set below
+         * outside of the per-directory loop, plus one slot that stays NULL as terminator. Adjust it
+         * whenever a variable is added. */
+#define N_ENV_VARS 19
+        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+        if (!our_env)
+                return -ENOMEM;
+
+        /* Socket activation: tell the payload about the fds passed to it */
+        if (n_fds > 0) {
+                _cleanup_free_ char *joined = NULL;
+
+                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+
+                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+
+                joined = strv_join(p->fd_names, ":");
+                if (!joined)
+                        return -ENOMEM;
+
+                x = strjoin("LISTEN_FDNAMES=", joined);
+                if (!x)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+        }
+
+        /* Watchdog settings, only if requested and a non-zero timeout is configured */
+        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
+                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+
+                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+        }
+
+        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
+         * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
+         * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
+        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
+                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
+                if (!x)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+        }
+
+        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
+         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
+         * really make much sense since we're not logged in. Hence we conditionalize the three based on
+         * SetLoginEnvironment= switch. */
+        if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
+                if (r < 0)
+                        return log_exec_debug_errno(c,
+                                                    p,
+                                                    r,
+                                                    "Failed to determine user credentials for root: %m");
+        }
+
+        /* A non-negative set_login_environment is an explicit choice; a negative value means "unset", in
+         * which case the login variables are set only if a user or dynamic user is configured. */
+        bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
+
+        if (username) {
+                x = strjoin("USER=", username);
+                if (!x)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+
+                if (set_user_login_env) {
+                        x = strjoin("LOGNAME=", username);
+                        if (!x)
+                                return -ENOMEM;
+                        our_env[n_env++] = x;
+                }
+        }
+
+        if (home && set_user_login_env) {
+                x = strjoin("HOME=", home);
+                if (!x)
+                        return -ENOMEM;
+
+                /* Simplify only the path, i.e. what follows the 5 characters of "HOME=" */
+                path_simplify(x + 5);
+                our_env[n_env++] = x;
+        }
+
+        if (shell && set_user_login_env) {
+                x = strjoin("SHELL=", shell);
+                if (!x)
+                        return -ENOMEM;
+
+                /* Simplify only the path, i.e. what follows the 6 characters of "SHELL=" */
+                path_simplify(x + 6);
+                our_env[n_env++] = x;
+        }
+
+        if (!sd_id128_is_null(p->invocation_id)) {
+                assert(p->invocation_id_string);
+
+                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
+        if (exec_context_needs_term(c)) {
+                _cleanup_free_ char *cmdline = NULL;
+                const char *tty_path, *term = NULL;
+
+                tty_path = exec_context_tty_path(c);
+
+                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
+                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
+                 * container manager passes to PID 1 ends up all the way in the console login shown. */
+
+                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
+                        term = getenv("TERM");
+                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
+                        _cleanup_free_ char *key = NULL;
+
+                        /* Otherwise look for a per-TTY override "systemd.tty.term.<tty>" on the kernel
+                         * command line */
+                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
+                        if (!key)
+                                return -ENOMEM;
+
+                        r = proc_cmdline_get_key(key, 0, &cmdline);
+                        if (r < 0)
+                                log_exec_debug_errno(c,
+                                                     p,
+                                                     r,
+                                                     "Failed to read %s from kernel cmdline, ignoring: %m",
+                                                     key);
+                        else if (r > 0)
+                                term = cmdline;
+                }
+
+                /* Nothing found so far: fall back to the default for this TTY */
+                if (!term)
+                        term = default_term_for_tty(tty_path);
+
+                x = strjoin("TERM=", term);
+                if (!x)
+                        return -ENOMEM;
+                our_env[n_env++] = x;
+        }
+
+        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
+                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
+        if (c->log_namespace) {
+                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
+        /* One variable per directory type that has a prefix, at least one configured item and an
+         * environment variable name: its value is the colon separated list of the prefixed paths. */
+        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+                _cleanup_free_ char *joined = NULL;
+                const char *n;
+
+                if (!p->prefix[t])
+                        continue;
+
+                if (c->directories[t].n_items == 0)
+                        continue;
+
+                n = exec_directory_env_name_to_string(t);
+                if (!n)
+                        continue;
+
+                for (size_t i = 0; i < c->directories[t].n_items; i++) {
+                        _cleanup_free_ char *prefixed = NULL;
+
+                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
+                        if (!prefixed)
+                                return -ENOMEM;
+
+                        if (!strextend_with_separator(&joined, ":", prefixed))
+                                return -ENOMEM;
+                }
+
+                x = strjoin(n, "=", joined);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
+        /* The variable is only set if the lookup returns a positive value */
+        _cleanup_free_ char *creds_dir = NULL;
+        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+        }
+
+        /* Set unconditionally */
+        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
+                return -ENOMEM;
+
+        our_env[n_env++] = x;
+
+        if (memory_pressure_path) {
+                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+
+                /* The write string is only provided if we have a cgroup context and the path is not
+                 * "/dev/null" */
+                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
+                        _cleanup_free_ char *b = NULL, *e = NULL;
+
+                        /* "<type> <threshold> <window>": the default threshold is used if none is
+                         * configured, otherwise the configured one clamped to between 1 and the
+                         * default window */
+                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
+                                     MEMORY_PRESSURE_DEFAULT_TYPE,
+                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
+                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
+                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
+                                return -ENOMEM;
+
+                        /* The trailing NUL byte is included in the base64 encoded data */
+                        if (base64mem(b, strlen(b) + 1, &e) < 0)
+                                return -ENOMEM;
+
+                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
+                        if (!x)
+                                return -ENOMEM;
+
+                        our_env[n_env++] = x;
+                }
+        }
+
+        /* Strictly less: the last slot must still be the NULL terminator. Note that this check runs only
+         * after all entries have been written. */
+        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+#undef N_ENV_VARS
+
+        *ret = TAKE_PTR(our_env);
+
+        return 0;
+}
+
+static int build_pass_environment(const ExecContext *c, char ***ret) {
+        _cleanup_strv_free_ char **collected = NULL;
+        size_t n_collected = 0;
+
+        /* Builds a NULL-terminated list of "NAME=value" strings, one for each name listed in
+         * c->pass_environment that getenv() finds in our own environment. Names that are not set are
+         * silently skipped. On success returns 0 and hands the list to the caller via *ret (which is NULL
+         * if nothing was collected); returns -ENOMEM if an allocation fails. */
+
+        STRV_FOREACH(name, c->pass_environment) {
+                const char *value = getenv(*name);
+                if (!value)
+                        continue;
+
+                _cleanup_free_ char *assignment = strjoin(*name, "=", value);
+                if (!assignment)
+                        return -ENOMEM;
+
+                /* Make room for the new entry plus the trailing NULL */
+                if (!GREEDY_REALLOC(collected, n_collected + 2))
+                        return -ENOMEM;
+
+                /* Keep the array terminated after every append, so that it can be freed as a strv at any
+                 * point. */
+                collected[n_collected] = TAKE_PTR(assignment);
+                collected[++n_collected] = NULL;
+        }
+
+        *ret = TAKE_PTR(collected);
+
+        return 0;
+}
+
+static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
+        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
+        _cleanup_close_ int unshare_ready_fd = -EBADF;
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        uint64_t c = 1;
+        ssize_t n;
+        int r;
+
+        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
+         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
+         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+         * which waits for the parent to create the new user namespace while staying in the original namespace. The
+         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+         * continues execution normally.
+         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
+         * does not need CAP_SETUID to write the single line mapping to itself.
+         *
+         * Parameters: ouid/ogid are the original IDs, uid/gid are the selected ones. The second mapping line for
+         * the selected ID is only generated if that ID is valid, differs from the original one, and we hold the
+         * matching CAP_SETUID/CAP_SETGID capability in our effective set.
+         *
+         * Returns 0 on success and a negative errno-style error otherwise. An error code the child reports over
+         * the pipe is passed through as-is; a non-negative payload, a short read or a non-zero exit status of the
+         * child is turned into -EIO. */
+
+        /* Can only set up multiple mappings with CAP_SETUID. */
+        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
+                r = asprintf(&uid_map,
+                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
+                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
+                             ouid, ouid, uid, uid);
+        else
+                r = asprintf(&uid_map,
+                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
+                             ouid, ouid);
+
+        if (r < 0)
+                return -ENOMEM;
+
+        /* Can only set up multiple mappings with CAP_SETGID. */
+        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
+                r = asprintf(&gid_map,
+                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
+                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
+                             ogid, ogid, gid, gid);
+        else
+                r = asprintf(&gid_map,
+                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
+                             ogid, ogid);
+
+        if (r < 0)
+                return -ENOMEM;
+
+        /* Create a communication channel so that the parent can tell the child when it finished creating the user
+         * namespace. */
+        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+        if (unshare_ready_fd < 0)
+                return -errno;
+
+        /* Create a communication channel so that the child can tell the parent a proper error code in case it
+         * failed. */
+        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+                return -errno;
+
+        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                _cleanup_close_ int fd = -EBADF;
+                const char *a;
+                pid_t ppid;
+
+                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+                 * here, after the parent opened its own user namespace. */
+
+                ppid = getppid();
+                errno_pipe[0] = safe_close(errno_pipe[0]);
+
+                /* Wait until the parent unshared the user namespace (it writes to the eventfd right after its
+                 * unshare(CLONE_NEWUSER) call succeeded) */
+                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+
+                /* Disable the setgroups() system call in the child user namespace, for good. */
+                a = procfs_file_alloca(ppid, "setgroups");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        if (errno != ENOENT) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        /* If the file is missing the kernel is too old, let's continue anyway. */
+                } else {
+                        if (write(fd, "deny\n", 5) < 0) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        fd = safe_close(fd);
+                }
+
+                /* First write the GID map */
+                a = procfs_file_alloca(ppid, "gid_map");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                if (write(fd, gid_map, strlen(gid_map)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                fd = safe_close(fd);
+
+                /* Then write the UID map */
+                a = procfs_file_alloca(ppid, "uid_map");
+                fd = open(a, O_WRONLY|O_CLOEXEC);
+                if (fd < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+                if (write(fd, uid_map, strlen(uid_map)) < 0) {
+                        r = -errno;
+                        goto child_fail;
+                }
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                (void) write(errno_pipe[1], &r, sizeof(r));
+                _exit(EXIT_FAILURE);
+        }
+
+        errno_pipe[1] = safe_close(errno_pipe[1]);
+
+        if (unshare(CLONE_NEWUSER) < 0)
+                return -errno;
+
+        /* Let the child know that the namespace is ready now */
+        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+                return -errno;
+
+        /* Try to read an error code from the child. The child writes to the pipe only on its failure path; if it
+         * exits without writing anything we see EOF here, since our own copy of the write end was closed
+         * above. */
+        n = read(errno_pipe[0], &r, sizeof(r));
+        if (n < 0)
+                return -errno;
+        if (n == sizeof(r)) { /* an error code was sent to us */
+                if (r < 0)
+                        return r;
+                return -EIO;
+        }
+        if (n != 0) /* on success we should have read 0 bytes */
+                return -EIO;
+
+        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
+        if (r < 0)
+                return r;
+        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
+                return -EIO;
+
+        return 0;
+}
+
+static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
+        _cleanup_free_ char *link_target = NULL;
+
+        /* For every entry of 'symlinks' creates a symlink that points to 'source'. Both the link location and
+         * the target are joined onto 'root' first. Missing parent directories of each link are created with
+         * mode 0755. Stops at the first failure and returns its negative error code, returns -ENOMEM if a
+         * path cannot be allocated, and 0 once all entries have been processed. */
+
+        assert(source);
+
+        link_target = path_join(root, source);
+        if (!link_target)
+                return -ENOMEM;
+
+        STRV_FOREACH(name, symlinks) {
+                _cleanup_free_ char *link_path = path_join(root, *name);
+                int r;
+
+                if (!link_path)
+                        return -ENOMEM;
+
+                r = mkdir_parents_label(link_path, 0755);
+                if (r >= 0)
+                        r = symlink_idempotent(link_target, link_path, /* make_relative= */ true);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int setup_exec_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                uid_t uid,
+                gid_t gid,
+                ExecDirectoryType type,
+                bool needs_mount_namespace,
+                int *exit_status) {
+        /* Creates the directories configured in context->directories[type] below params->prefix[type]. If no
+         * prefix is set for this type this is a NOP that returns 0.
+         *
+         * For each configured item: the parent directories are created (mode 0755); in user scope a
+         * compatibility symlink into the configuration hierarchy may be created for state/logs directories;
+         * the directory itself is then created either below the "private" subdirectory (if
+         * exec_directory_is_private() says so) or directly in place; its access mode is applied; and, in
+         * system scope only, the whole tree is chown()ed to uid/gid.
+         *
+         * If needs_mount_namespace is false, the symlinks configured for each item are created at the end via
+         * create_many_symlinks().
+         *
+         * Returns 0 on success. On failure returns a negative errno-style error and stores the exit status
+         * that belongs to the directory type in *exit_status. */
+        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
+                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
+                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
+                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
+                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
+        };
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
+        assert(exit_status);
+
+        if (!params->prefix[type])
+                return 0;
+
+        if (params->flags & EXEC_CHOWN_DIRECTORIES) { /* substitute root (0) for IDs that are not valid */
+                if (!uid_is_valid(uid))
+                        uid = 0;
+                if (!gid_is_valid(gid))
+                        gid = 0;
+        }
+
+        for (size_t i = 0; i < context->directories[type].n_items; i++) {
+                _cleanup_free_ char *p = NULL, *pp = NULL;
+
+                p = path_join(params->prefix[type], context->directories[type].items[i].path);
+                if (!p) {
+                        r = -ENOMEM;
+                        goto fail;
+                }
+
+                r = mkdir_parents_label(p, 0755);
+                if (r < 0)
+                        goto fail;
+
+                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
+
+                        /* If we are in user mode, and a configuration directory exists but a state directory
+                         * doesn't exist, then we likely are upgrading from an older systemd version that
+                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
+                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
+                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
+                         * separated. If a service has both dirs configured but only the configuration dir
+                         * exists and the state dir does not, we assume we are looking at an update
+                         * situation. Hence, create a compatibility symlink, so that all expectations are
+                         * met.
+                         *
+                         * (We also do something similar with the log directory, which still doesn't exist in
+                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */
+
+                        /* this assumes the state dir is always created before the configuration dir */
+                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
+                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
+
+                        r = laccess(p, F_OK);
+                        if (r == -ENOENT) {
+                                _cleanup_free_ char *q = NULL;
+
+                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
+                                 * under the configuration hierarchy. */
+
+                                if (type == EXEC_DIRECTORY_STATE)
+                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
+                                else if (type == EXEC_DIRECTORY_LOGS)
+                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
+                                else
+                                        assert_not_reached();
+                                if (!q) {
+                                        r = -ENOMEM;
+                                        goto fail;
+                                }
+
+                                r = laccess(q, F_OK);
+                                if (r >= 0) {
+                                        /* It does exist! This hence looks like an update. Symlink the
+                                         * configuration directory into the state directory. */
+
+                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
+                                        if (r < 0)
+                                                goto fail;
+
+                                        log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
+                                        continue; /* the symlink stands in for the directory, skip the rest for this item */
+                                } else if (r != -ENOENT)
+                                        log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
+
+                        } else if (r < 0)
+                                log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
+                }
+
+                if (exec_directory_is_private(context, type)) {
+                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
+                         * case we want to avoid leaving a directory around fully accessible that is owned by
+                         * a dynamic user whose UID is later on reused. To lock this down we use the same
+                         * trick used by container managers to prohibit host users to get access to files of
+                         * the same UID in containers: we place everything inside a directory that has an
+                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
+                         * for unprivileged host code. We then use fs namespacing to make this directory
+                         * permeable for the service itself.
+                         *
+                         * Specifically: for a service which wants a special directory "foo/" we first create
+                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
+                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
+                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
+                         * unprivileged host users can't look into it. Inside of the namespace of the unit
+                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
+                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
+                         * for the service and making sure it only gets access to the dirs it needs but no
+                         * others. Tricky? Yes, absolutely, but it works!
+                         *
+                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
+                         * to be owned by the service itself.
+                         *
+                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
+                         * for sharing files or sockets with other services. */
+
+                        pp = path_join(params->prefix[type], "private");
+                        if (!pp) {
+                                r = -ENOMEM;
+                                goto fail;
+                        }
+
+                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
+                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
+                        if (r < 0)
+                                goto fail;
+
+                        if (!path_extend(&pp, context->directories[type].items[i].path)) {
+                                r = -ENOMEM;
+                                goto fail;
+                        }
+
+                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
+                        r = mkdir_parents_label(pp, 0755);
+                        if (r < 0)
+                                goto fail;
+
+                        if (is_dir(p, false) > 0 &&
+                            (laccess(pp, F_OK) == -ENOENT)) {
+
+                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
+                                 * it over. Most likely the service has been upgraded from one that didn't use
+                                 * DynamicUser=1, to one that does. */
+
+                                log_exec_info(context,
+                                              params,
+                                              "Found pre-existing public %s= directory %s, migrating to %s.\n"
+                                              "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
+                                              exec_directory_type_to_string(type), p, pp);
+
+                                r = RET_NERRNO(rename(p, pp));
+                                if (r < 0)
+                                        goto fail;
+                        } else {
+                                /* Otherwise, create the actual directory for the service */
+
+                                r = mkdir_label(pp, context->directories[type].mode);
+                                if (r < 0 && r != -EEXIST)
+                                        goto fail;
+                        }
+
+                        if (!context->directories[type].items[i].only_create) {
+                                /* And link it up from the original place.
+                                 * Notes
+                                 * 1) If a mount namespace is going to be used, then this symlink remains on
+                                 *    the host, and a new one for the child namespace will be created later.
+                                 * 2) It is not necessary to create this symlink when one of its parent
+                                 *    directories is specified and already created. E.g.
+                                 *        StateDirectory=foo foo/bar
+                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
+                                 *        pp = "/var/lib/private/foo/bar"
+                                 *        p = "/var/lib/foo/bar"
+                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
+                                 *    we do not need to create the symlink, but we cannot create the symlink.
+                                 *    See issue #24783. */
+                                r = symlink_idempotent(pp, p, true);
+                                if (r < 0)
+                                        goto fail;
+                        }
+
+                } else {
+                        _cleanup_free_ char *target = NULL;
+
+                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
+                            readlink_and_make_absolute(p, &target) >= 0) {
+                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
+
+                                /* This already exists and is a symlink? Interesting. Maybe it's one created
+                                 * by DynamicUser=1 (see above)?
+                                 *
+                                 * We do this for all directory types except for ConfigurationDirectory=,
+                                 * since they all support the private/ symlink logic at least in some
+                                 * configurations, see above. */
+
+                                r = chase(target, NULL, 0, &target_resolved, NULL);
+                                if (r < 0)
+                                        goto fail;
+
+                                q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
+                                if (!q) {
+                                        r = -ENOMEM;
+                                        goto fail;
+                                }
+
+                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
+                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
+                                if (r < 0)
+                                        goto fail;
+
+                                if (path_equal(q_resolved, target_resolved)) {
+
+                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
+                                         * but is no longer. Let's move the directory back up. */
+
+                                        log_exec_info(context,
+                                                      params,
+                                                      "Found pre-existing private %s= directory %s, migrating to %s.\n"
+                                                      "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
+                                                      exec_directory_type_to_string(type), q, p);
+
+                                        r = RET_NERRNO(unlink(p));
+                                        if (r < 0)
+                                                goto fail;
+
+                                        r = RET_NERRNO(rename(q, p));
+                                        if (r < 0)
+                                                goto fail;
+                                }
+                        }
+
+                        r = mkdir_label(p, context->directories[type].mode);
+                        if (r < 0) {
+                                if (r != -EEXIST)
+                                        goto fail;
+
+                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
+                                        struct stat st;
+
+                                        /* Don't change the owner/access mode of the configuration directory,
+                                         * as in the common case it is not written to by a service, and shall
+                                         * not be writable. */
+
+                                        r = RET_NERRNO(stat(p, &st));
+                                        if (r < 0)
+                                                goto fail;
+
+                                        /* Still complain if the access mode doesn't match */
+                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
+                                                log_exec_warning(context,
+                                                                 params,
+                                                                 "%s \'%s\' already exists but the mode is different. "
+                                                                 "(File system: %o %sMode: %o)",
+                                                                 exec_directory_type_to_string(type), context->directories[type].items[i].path,
+                                                                 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
+
+                                        continue;
+                                }
+                        }
+                }
+
+                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
+                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
+                 * current UID/GID ownership.) */
+                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
+                if (r < 0)
+                        goto fail;
+
+                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
+                 * available to user code anyway */
+                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
+                        continue;
+
+                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
+                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
+                 * assignments to exist. */
+                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
+                if (r < 0)
+                        goto fail;
+        }
+
+        /* If we are not going to run in a namespace, set up the symlinks - otherwise
+         * they are set up later, to allow configuring empty var/run/etc. */
+        if (!needs_mount_namespace)
+                for (size_t i = 0; i < context->directories[type].n_items; i++) {
+                        r = create_many_symlinks(params->prefix[type],
+                                                 context->directories[type].items[i].path,
+                                                 context->directories[type].items[i].symlinks);
+                        if (r < 0)
+                                goto fail;
+                }
+
+        return 0;
+
+fail:
+        *exit_status = exit_status_table[type]; /* tell the caller which directory type could not be set up */
+        return r;
+}
+
+#if ENABLE_SMACK
+static int setup_smack(
+                const ExecParameters *params,
+                const ExecContext *context,
+                int executable_fd) {
+        int r;
+
+        /* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()). The label
+         * is picked in this order: the label configured in the context, else the SMACK_ATTR_EXEC label read
+         * from the executable, else the fallback label from the parameters. If neither a configured nor a
+         * fallback label is set nothing is done. Returns 0 on success, a negative error code otherwise. */
+
+        assert(params);
+        assert(executable_fd >= 0);
+
+        /* An explicitly configured label takes precedence over everything else */
+        if (context->smack_process_label) {
+                r = mac_smack_apply_pid(0, context->smack_process_label);
+                return r < 0 ? r : 0;
+        }
+
+        if (!params->fallback_smack_process_label)
+                return 0;
+
+        _cleanup_free_ char *exec_label = NULL;
+
+        /* A missing xattr on the executable is not an error, we then use the fallback label below */
+        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
+        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                return r;
+
+        r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+#endif
+
+static int compile_bind_mounts(
+                const ExecContext *context,
+                const ExecParameters *params,
+                BindMount **ret_bind_mounts,
+                size_t *ret_n_bind_mounts,
+                char ***ret_empty_directories) {
+
+        _cleanup_strv_free_ char **overmounted = NULL;
+        BindMount *mounts = NULL;
+        size_t n_expected, n_filled = 0;
+        int r;
+
+        /* Assembles the list of bind mounts for a unit: first copies of the entries in context->bind_mounts,
+         * then one recursive, writable bind mount for every exec directory item that is not marked
+         * only_create (only for directory types that have a prefix set). Additionally collects the "private"
+         * directories that shall be overmounted with an empty directory.
+         *
+         * If there is nothing to mount, all three outputs are cleared and 0 is returned. Otherwise ownership of
+         * the array and of the string list passes to the caller and the number of bind mounts is returned.
+         * Returns a negative error code on failure (-ENOMEM, or whatever strv_consume() reports). */
+
+        assert(context);
+        assert(params);
+        assert(ret_bind_mounts);
+        assert(ret_n_bind_mounts);
+        assert(ret_empty_directories);
+
+        /* NOTE(review): presumably releases the first n_filled entries via bind_mount_free_many() when we
+         * leave the function early; the macro is defined elsewhere. */
+        CLEANUP_ARRAY(mounts, n_filled, bind_mount_free_many);
+
+        /* First pass: figure out how many entries we are going to need */
+        n_expected = context->n_bind_mounts;
+        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+                if (!params->prefix[t])
+                        continue;
+
+                for (size_t i = 0; i < context->directories[t].n_items; i++)
+                        if (!context->directories[t].items[i].only_create)
+                                n_expected++;
+        }
+
+        if (n_expected == 0) {
+                *ret_bind_mounts = NULL;
+                *ret_n_bind_mounts = 0;
+                *ret_empty_directories = NULL;
+                return 0;
+        }
+
+        mounts = new(BindMount, n_expected);
+        if (!mounts)
+                return -ENOMEM;
+
+        /* Second pass, part one: duplicate the explicitly configured bind mounts */
+        for (size_t i = 0; i < context->n_bind_mounts; i++) {
+                const BindMount *configured = &context->bind_mounts[i];
+                _cleanup_free_ char *src = NULL, *dst = NULL;
+
+                src = strdup(configured->source);
+                if (!src)
+                        return -ENOMEM;
+
+                dst = strdup(configured->destination);
+                if (!dst)
+                        return -ENOMEM;
+
+                mounts[n_filled++] = (BindMount) {
+                        .source = TAKE_PTR(src),
+                        .destination = TAKE_PTR(dst),
+                        .read_only = configured->read_only,
+                        .recursive = configured->recursive,
+                        .ignore_enoent = configured->ignore_enoent,
+                };
+        }
+
+        /* Second pass, part two: add the bind mounts for the exec directories */
+        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+                if (!params->prefix[t] || context->directories[t].n_items == 0)
+                        continue;
+
+                if (exec_directory_is_private(context, t) &&
+                    !exec_context_with_rootfs(context)) {
+                        char *hidden_root;
+
+                        /* This is the dynamic user case: the process must be able to reach its own
+                         * directory, hence the normally inaccessible "private" subdirectory is overmounted
+                         * with a tmpfs that is accessible and contains nothing but the submounts set up
+                         * below. */
+
+                        hidden_root = path_join(params->prefix[t], "private");
+                        if (!hidden_root)
+                                return -ENOMEM;
+
+                        r = strv_consume(&overmounted, hidden_root);
+                        if (r < 0)
+                                return r;
+                }
+
+                for (size_t i = 0; i < context->directories[t].n_items; i++) {
+                        _cleanup_free_ char *src = NULL, *dst = NULL;
+
+                        /* If one of the parent directories is part of the list too, no symlink can be
+                         * created for the child directory. See the comments in setup_exec_directory(). */
+                        if (context->directories[t].items[i].only_create)
+                                continue;
+
+                        src = exec_directory_is_private(context, t) ?
+                                path_join(params->prefix[t], "private", context->directories[t].items[i].path) :
+                                path_join(params->prefix[t], context->directories[t].items[i].path);
+                        if (!src)
+                                return -ENOMEM;
+
+                        /* With RootDirectory= or RootImage= the symlink to the private directory does not
+                         * exist in the root directory, hence bind-mount the directory to the 'non-private'
+                         * place there. In all other cases source and destination are the same path. */
+                        dst = exec_directory_is_private(context, t) && exec_context_with_rootfs(context) ?
+                                path_join(params->prefix[t], context->directories[t].items[i].path) :
+                                strdup(src);
+                        if (!dst)
+                                return -ENOMEM;
+
+                        mounts[n_filled++] = (BindMount) {
+                                .source = TAKE_PTR(src),
+                                .destination = TAKE_PTR(dst),
+                                .read_only = false,
+                                .nosuid = context->dynamic_user, /* no suid/sgid while DynamicUser= is on */
+                                .recursive = true,
+                                .ignore_enoent = false,
+                        };
+                }
+        }
+
+        /* Both passes must agree on the number of entries */
+        assert(n_filled == n_expected);
+
+        *ret_bind_mounts = TAKE_PTR(mounts);
+        *ret_n_bind_mounts = n_expected;
+        *ret_empty_directories = TAKE_PTR(overmounted);
+
+        return (int) n_expected;
+}
+
+/* Compiles the list of symlinks that are to be created later on and hands it out in *ret_symlinks, as
+ * a flat string vector of consecutive (source, destination) pairs. Three kinds of entries are
+ * generated: the extra symlinks configured for each exec directory item, the links that point to the
+ * "private" variant of a directory (as needed to safely give private directories to DynamicUser=1
+ * users), and, if requested, the link for the host's os-release file. Returns 0 on success and a
+ * negative errno-style value on failure. */
+static int compile_symlinks(
+                const ExecContext *context,
+                const ExecParameters *params,
+                bool setup_os_release_symlink,
+                char ***ret_symlinks) {
+
+        _cleanup_strv_free_ char **pairs = NULL;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(ret_symlinks);
+
+        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
+                for (size_t k = 0; k < context->directories[t].n_items; k++) {
+                        const char *item_path = context->directories[t].items[k].path;
+
+                        /* First, the additional names explicitly configured for this directory. */
+                        STRV_FOREACH(name, context->directories[t].items[k].symlinks) {
+                                _cleanup_free_ char *from = NULL, *to = NULL;
+
+                                from = path_join(params->prefix[t], item_path);
+                                to = path_join(params->prefix[t], *name);
+                                if (!(from && to))
+                                        return -ENOMEM;
+
+                                r = strv_consume_pair(&pairs, TAKE_PTR(from), TAKE_PTR(to));
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        /* Then the link from the "private" location to the regular one. This is only
+                         * generated for private directory types, when no root directory/image is in
+                         * use, and when the item is not marked as only_create. */
+                        if (exec_directory_is_private(context, t) &&
+                            !exec_context_with_rootfs(context) &&
+                            !context->directories[t].items[k].only_create) {
+                                _cleanup_free_ char *hidden = NULL, *visible = NULL;
+
+                                hidden = path_join(params->prefix[t], "private", item_path);
+                                if (!hidden)
+                                        return -ENOMEM;
+
+                                visible = path_join(params->prefix[t], item_path);
+                                if (!visible)
+                                        return -ENOMEM;
+
+                                r = strv_consume_pair(&pairs, TAKE_PTR(hidden), TAKE_PTR(visible));
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+
+        /* The host's os-release is exposed through a symlink, so that it can be copied atomically and
+         * readers never observe a half-written version. The paths listed here are absolute, but they
+         * are turned into relative ones when processed in namespace.c, i.e. what ends up being created
+         * is 'os-release -> .os-release-stage/os-release'. */
+        if (setup_os_release_symlink) {
+                r = strv_extend(&pairs, "/run/host/.os-release-stage/os-release");
+                if (r >= 0)
+                        r = strv_extend(&pairs, "/run/host/os-release");
+                if (r < 0)
+                        return r;
+        }
+
+        *ret_symlinks = TAKE_PTR(pairs);
+        return 0;
+}
+
+/* Tells whether file system namespacing is indispensable for this context, i.e. whether settings are
+ * configured that change the view on the file system beyond making things read-only or invisible, and
+ * hence rearrange things in a way that cannot be ignored gracefully. */
+static bool insist_on_sandboxing(
+                const ExecContext *context,
+                const char *root_dir,
+                const char *root_image,
+                const BindMount *bind_mounts,
+                size_t n_bind_mounts) {
+
+        size_t i = 0;
+
+        assert(context);
+        assert(n_bind_mounts == 0 || bind_mounts);
+
+        /* Any of these settings alone is sufficient to require the namespace. */
+        if (context->n_temporary_filesystems > 0 ||
+            root_dir ||
+            root_image ||
+            context->n_mount_images > 0 ||
+            context->dynamic_user ||
+            context->n_extension_images > 0 ||
+            !strv_isempty(context->extension_directories))
+                return true;
+
+        /* Bind mounts are harmless only as long as each of them maps a path back onto itself. Skip
+         * over those, and see whether one is left that doesn't. */
+        while (i < n_bind_mounts &&
+               path_equal(bind_mounts[i].source, bind_mounts[i].destination))
+                i++;
+        if (i < n_bind_mounts)
+                return true;
+
+        return !!context->log_namespace;
+}
+
+/* Makes sure the ephemeral copy of the root image or root directory exists.
+ *
+ * Returns 0 if there is nothing to do: either no runtime object or no ephemeral copy path is set, or a
+ * file descriptor is already queued on the ephemeral storage socket (meaning the copy was set up
+ * before). Returns 1 if a new snapshot/copy was created and its file descriptor was queued on the
+ * socket. Returns a negative errno-style value on failure. */
+static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        if (!runtime || !runtime->ephemeral_copy)
+                return 0;
+
+        /* Take the lock on the socket for the whole check-and-create sequence below; it is released
+         * again automatically when we leave this function. */
+        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
+
+        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
+
+        /* Only peek at the queue, without blocking if it is empty. */
+        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+        if (fd >= 0)
+                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
+                return 0;
+
+        /* -EAGAIN means that nothing is queued yet, everything else is a real error. */
+        if (fd != -EAGAIN)
+                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
+
+        log_debug("Making ephemeral snapshot of %s to %s",
+                  context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image)
+                fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
+                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
+        else
+                fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
+                                              AT_FDCWD, runtime->ephemeral_copy,
+                                              BTRFS_SNAPSHOT_FALLBACK_COPY |
+                                              BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+                                              BTRFS_SNAPSHOT_RECURSIVE |
+                                              BTRFS_SNAPSHOT_LOCK_BSD);
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
+                                       context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image) {
+                /* A root image might be subject to lots of random writes so let's try to disable COW on it
+                 * which tends to not perform well in combination with lots of random writes.
+                 *
+                 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
+                 * copy, but we at least want to make the intention clear.
+                 */
+                r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
+                if (r < 0)
+                        /* Log the error of chattr_fd() here. At this point 'fd' is a valid descriptor,
+                         * not an error code. */
+                        log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
+        }
+
+        /* Queue the fd on the socket, so that later invocations find it via the peek above. */
+        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
+
+        return 1;
+}
+
+/* Fills in the supplied VeritySettings object: the root hash and the root hash signature are
+ * duplicated into it if given (either of them also sets the designator to PARTITION_ROOT), the verity
+ * data path is copied if given, and finally verity_settings_load() is invoked for the image with the
+ * two path arguments. Returns 0 on success and a negative errno-style value on failure. */
+static int verity_settings_prepare(
+                VeritySettings *verity,
+                const char *root_image,
+                const void *root_hash,
+                size_t root_hash_size,
+                const char *root_hash_path,
+                const void *root_hash_sig,
+                size_t root_hash_sig_size,
+                const char *root_hash_sig_path,
+                const char *verity_data_path) {
+
+        int r;
+
+        assert(verity);
+
+        if (root_hash) {
+                void *hash_copy = memdup(root_hash, root_hash_size);
+                if (!hash_copy)
+                        return -ENOMEM;
+
+                /* Replaces whatever hash was stored before. */
+                free_and_replace(verity->root_hash, hash_copy);
+                verity->root_hash_size = root_hash_size;
+                verity->designator = PARTITION_ROOT;
+        }
+
+        if (root_hash_sig) {
+                void *sig_copy = memdup(root_hash_sig, root_hash_sig_size);
+                if (!sig_copy)
+                        return -ENOMEM;
+
+                /* Same for the signature. */
+                free_and_replace(verity->root_hash_sig, sig_copy);
+                verity->root_hash_sig_size = root_hash_sig_size;
+                verity->designator = PARTITION_ROOT;
+        }
+
+        if (verity_data_path) {
+                r = free_and_strdup(&verity->data_path, verity_data_path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = verity_settings_load(verity, root_image, root_hash_path, root_hash_sig_path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load root hash: %m");
+
+        return 0;
+}
+
+/* Collects everything needed for the mount namespace of the process to execute (root directory or
+ * image, bind mounts, empty directories, symlinks, private temporary directories, credentials path,
+ * propagate/incoming/extension directories, verity settings and the various protect_xyz/private_xyz
+ * switches) in a NamespaceParameters structure, and passes it to setup_namespace().
+ *
+ * The sandboxing related settings are only passed on if EXEC_APPLY_SANDBOXING is set in params->flags
+ * and the command is not marked EXEC_COMMAND_FULLY_PRIVILEGED. The root directory/image is only used
+ * if EXEC_APPLY_CHROOT is set.
+ *
+ * Returns a negative errno-style value if one of the preparation steps fails. If setup_namespace()
+ * fails with -ENOANO, 0 is returned after logging a debug message, unless insist_on_sandboxing()
+ * reports that the configuration cannot be ignored, in which case the result of logging a synthetic
+ * EOPNOTSUPP is returned. Any other result of setup_namespace() is returned unmodified. error_path is
+ * passed through to setup_namespace() as is. */
+static int apply_mount_namespace(
+                ExecCommandFlags command_flags,
+                const ExecContext *context,
+                const ExecParameters *params,
+                ExecRuntime *runtime,
+                const char *memory_pressure_path,
+                char **error_path) {
+
+        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
+                        **read_write_paths_cleanup = NULL;
+        _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
+                        *extension_dir = NULL, *host_os_release_stage = NULL;
+        const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
+        char **read_write_paths;
+        bool needs_sandboxing, setup_os_release_symlink;
+        BindMount *bind_mounts = NULL;
+        size_t n_bind_mounts = 0;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
+
+        if (params->flags & EXEC_APPLY_CHROOT) {
+                r = setup_ephemeral(context, runtime);
+                if (r < 0)
+                        return r;
+
+                /* Prefer the ephemeral copy over the configured root image/directory, if there is one. */
+                if (context->root_image)
+                        root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
+                else
+                        root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
+        }
+
+        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
+        if (r < 0)
+                return r;
+
+        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
+         * service will need to write to it in order to start the notifications. */
+        if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+                read_write_paths_cleanup = strv_copy(context->read_write_paths);
+                if (!read_write_paths_cleanup)
+                        return -ENOMEM;
+
+                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
+                if (r < 0)
+                        return r;
+
+                read_write_paths = read_write_paths_cleanup;
+        } else
+                read_write_paths = context->read_write_paths;
+
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+        if (needs_sandboxing) {
+                /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
+                 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
+                 * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
+
+                if (context->private_tmp && runtime && runtime->shared) {
+                        if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
+                                tmp_dir = runtime->shared->tmp_dir;
+                        else if (runtime->shared->tmp_dir)
+                                tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
+
+                        if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
+                                var_tmp_dir = runtime->shared->var_tmp_dir;
+                        else if (runtime->shared->var_tmp_dir)
+                                var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
+                }
+        }
+
+        /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
+        setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
+        r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
+        if (r < 0)
+                return r;
+
+        if (context->mount_propagation_flag == MS_SHARED)
+                log_exec_debug(context,
+                               params,
+                               "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
+
+        if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
+                r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
+                if (r < 0)
+                        return r;
+        }
+
+        if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
+                if (!propagate_dir)
+                        return -ENOMEM;
+
+                incoming_dir = strdup("/run/systemd/incoming");
+                if (!incoming_dir)
+                        return -ENOMEM;
+
+                extension_dir = strdup("/run/systemd/unit-extensions");
+                if (!extension_dir)
+                        return -ENOMEM;
+
+                /* If running under a different root filesystem, propagate the host's os-release. We make a
+                 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
+                if (setup_os_release_symlink) {
+                        host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
+                        if (!host_os_release_stage)
+                                return -ENOMEM;
+                }
+        } else {
+                assert(params->runtime_scope == RUNTIME_SCOPE_USER);
+
+                /* In user scope only the extension directory and the os-release stage directory are
+                 * set; propagate_dir and incoming_dir remain NULL. */
+                if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
+                        return -ENOMEM;
+
+                if (setup_os_release_symlink) {
+                        if (asprintf(&host_os_release_stage,
+                                     "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
+                                     geteuid()) < 0)
+                                return -ENOMEM;
+                }
+        }
+
+        if (root_image) {
+                r = verity_settings_prepare(
+                        &verity,
+                        root_image,
+                        context->root_hash, context->root_hash_size, context->root_hash_path,
+                        context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
+                        context->root_verity);
+                if (r < 0)
+                        return r;
+        }
+
+        NamespaceParameters parameters = {
+                .runtime_scope = params->runtime_scope,
+
+                .root_directory = root_dir,
+                .root_image = root_image,
+                .root_image_options = context->root_image_options,
+                .root_image_policy = context->root_image_policy ?: &image_policy_service,
+
+                .read_write_paths = read_write_paths,
+                .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
+                .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
+
+                .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
+                .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
+
+                .empty_directories = empty_directories,
+                .symlinks = symlinks,
+
+                .bind_mounts = bind_mounts,
+                .n_bind_mounts = n_bind_mounts,
+
+                .temporary_filesystems = context->temporary_filesystems,
+                .n_temporary_filesystems = context->n_temporary_filesystems,
+
+                .mount_images = context->mount_images,
+                .n_mount_images = context->n_mount_images,
+                .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
+
+                .tmp_dir = tmp_dir,
+                .var_tmp_dir = var_tmp_dir,
+
+                .creds_path = creds_path,
+                .log_namespace = context->log_namespace,
+                .mount_propagation_flag = context->mount_propagation_flag,
+
+                .verity = &verity,
+
+                .extension_images = context->extension_images,
+                .n_extension_images = context->n_extension_images,
+                .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
+                .extension_directories = context->extension_directories,
+
+                .propagate_dir = propagate_dir,
+                .incoming_dir = incoming_dir,
+                .extension_dir = extension_dir,
+                .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
+                .host_os_release_stage = host_os_release_stage,
+
+                /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
+                 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
+                 * sandbox inside the mount namespace. */
+                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
+
+                .protect_control_groups = needs_sandboxing && context->protect_control_groups,
+                .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
+                .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
+                .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
+                .protect_hostname = needs_sandboxing && context->protect_hostname,
+
+                .private_dev = needs_sandboxing && context->private_devices,
+                .private_network = needs_sandboxing && exec_needs_network_namespace(context),
+                .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
+
+                .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
+
+                /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
+                .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
+
+                .protect_home = needs_sandboxing ? context->protect_home : false,
+                .protect_system = needs_sandboxing ? context->protect_system : false,
+                .protect_proc = needs_sandboxing ? context->protect_proc : false,
+                .proc_subset = needs_sandboxing ? context->proc_subset : false,
+        };
+
+        r = setup_namespace(&parameters, error_path);
+        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
+         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
+         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
+         * completely different execution environment. */
+        if (r == -ENOANO) {
+                if (insist_on_sandboxing(
+                                    context,
+                                    root_dir, root_image,
+                                    bind_mounts,
+                                    n_bind_mounts))
+                        return log_exec_debug_errno(context,
+                                                    params,
+                                                    SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                                    "Failed to set up namespace, and refusing to continue since "
+                                                    "the selected namespacing options alter mount environment non-trivially.\n"
+                                                    "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
+                                                    n_bind_mounts,
+                                                    context->n_temporary_filesystems,
+                                                    yes_no(root_dir),
+                                                    yes_no(root_image),
+                                                    yes_no(context->dynamic_user));
+
+                log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
+                return 0;
+        }
+
+        return r;
+}
+
+/* Changes into the working directory of the service: the supplied home directory if
+ * working_directory_home is set, otherwise the configured working directory (or the root if that is
+ * empty). Unless EXEC_APPLY_CHROOT is set the path is prefixed with the ephemeral copy, or else the
+ * configured root directory. On failure *exit_status is set to EXIT_CHDIR and a negative errno-style
+ * value is returned; a failing chdir() is ignored if working_directory_missing_ok is set. */
+static int apply_working_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                ExecRuntime *runtime,
+                const char *home,
+                int *exit_status) {
+
+        const char *wd, *target;
+
+        assert(context);
+        assert(exit_status);
+
+        if (!context->working_directory_home)
+                wd = empty_to_root(context->working_directory);
+        else if (home)
+                wd = home;
+        else {
+                /* The home directory was requested, but we don't know it. */
+                *exit_status = EXIT_CHDIR;
+                return -ENXIO;
+        }
+
+        if (params->flags & EXEC_APPLY_CHROOT)
+                target = wd;
+        else {
+                const char *root;
+
+                root = runtime && runtime->ephemeral_copy ? runtime->ephemeral_copy : context->root_directory;
+                target = prefix_roota(root, wd);
+        }
+
+        if (chdir(target) >= 0 || context->working_directory_missing_ok)
+                return 0;
+
+        *exit_status = EXIT_CHDIR;
+        return -errno;
+}
+
+/* Invokes chroot() on the ephemeral copy, or else the configured root directory, but only if
+ * EXEC_APPLY_CHROOT is set, no mount namespace is needed and a root directory is configured at all.
+ * If chroot() fails, *exit_status is set to EXIT_CHROOT and the negative errno is returned. In all
+ * other cases 0 is returned. */
+static int apply_root_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                ExecRuntime *runtime,
+                const bool needs_mount_ns,
+                int *exit_status) {
+
+        const char *root;
+
+        assert(context);
+        assert(exit_status);
+
+        if (!(params->flags & EXEC_APPLY_CHROOT))
+                return 0;
+
+        if (needs_mount_ns || !context->root_directory)
+                return 0;
+
+        root = runtime && runtime->ephemeral_copy ? runtime->ephemeral_copy : context->root_directory;
+        if (chroot(root) < 0) {
+                *exit_status = EXIT_CHROOT;
+                return -errno;
+        }
+
+        return 0;
+}
+
+/* Sets up a new session keyring for the process, depending on context->keyring_mode. While doing so
+ * the real GID and UID are temporarily changed to the specified gid/uid (if those are valid and differ
+ * from the current ones), and changed back afterwards.
+ *
+ * Returns 0 on success, if EXEC_KEYRING_INHERIT is configured, or if joining the session keyring
+ * failed for one of the reasons that are deliberately ignored (ENOSYS, privilege errors, EDQUOT).
+ * Otherwise returns what log_exec_error_errno() returned for the step that failed. */
+static int setup_keyring(
+                const ExecContext *context,
+                const ExecParameters *p,
+                uid_t uid, gid_t gid) {
+
+        key_serial_t keyring;
+        int r = 0;
+        uid_t saved_uid;
+        gid_t saved_gid;
+
+        assert(context);
+        assert(p);
+
+        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
+         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
+         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
+         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
+         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
+         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
+
+        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
+                return 0;
+
+        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
+         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
+         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
+         * & group is just as nasty as acquiring a reference to the user keyring. */
+
+        saved_uid = getuid();
+        saved_gid = getgid();
+
+        /* Nothing has been changed yet at this point, hence it is fine to return directly on failure. */
+        if (gid_is_valid(gid) && gid != saved_gid) {
+                if (setregid(gid, -1) < 0)
+                        return log_exec_error_errno(context,
+                                                    p,
+                                                    errno,
+                                                    "Failed to change GID for user keyring: %m");
+        }
+
+        /* From here on the GID might have been changed already, hence leave via the "out" label,
+         * which reverts the IDs. */
+        if (uid_is_valid(uid) && uid != saved_uid) {
+                if (setreuid(uid, -1) < 0) {
+                        r = log_exec_error_errno(context,
+                                                 p,
+                                                 errno,
+                                                 "Failed to change UID for user keyring: %m");
+                        goto out;
+                }
+        }
+
+        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
+        if (keyring == -1) {
+                /* The first three cases are only logged at debug level and leave r at 0, i.e. they
+                 * are not treated as failure. */
+                if (errno == ENOSYS)
+                        log_exec_debug_errno(context,
+                                             p,
+                                             errno,
+                                             "Kernel keyring not supported, ignoring.");
+                else if (ERRNO_IS_PRIVILEGE(errno))
+                        log_exec_debug_errno(context,
+                                             p,
+                                             errno,
+                                             "Kernel keyring access prohibited, ignoring.");
+                else if (errno == EDQUOT)
+                        log_exec_debug_errno(context,
+                                             p,
+                                             errno,
+                                             "Out of kernel keyrings to allocate, ignoring.");
+                else
+                        r = log_exec_error_errno(context,
+                                                 p,
+                                                 errno,
+                                                 "Setting up kernel keyring failed: %m");
+
+                goto out;
+        }
+
+        /* When requested link the user keyring into the session keyring. */
+        if (context->keyring_mode == EXEC_KEYRING_SHARED) {
+
+                if (keyctl(KEYCTL_LINK,
+                           KEY_SPEC_USER_KEYRING,
+                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
+                        r = log_exec_error_errno(context,
+                                                 p,
+                                                 errno,
+                                                 "Failed to link user keyring into session keyring: %m");
+                        goto out;
+                }
+        }
+
+        /* Restore uid/gid back */
+        if (uid_is_valid(uid) && uid != saved_uid) {
+                if (setreuid(saved_uid, -1) < 0) {
+                        r = log_exec_error_errno(context,
+                                                 p,
+                                                 errno,
+                                                 "Failed to change UID back for user keyring: %m");
+                        goto out;
+                }
+        }
+
+        /* The UID has been restored above already, hence a failure to restore the GID returns directly
+         * instead of going through "out". */
+        if (gid_is_valid(gid) && gid != saved_gid) {
+                if (setregid(saved_gid, -1) < 0)
+                        return log_exec_error_errno(context,
+                                                    p,
+                                                    errno,
+                                                    "Failed to change GID back for user keyring: %m");
+        }
+
+        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
+        if (!sd_id128_is_null(p->invocation_id)) {
+                key_serial_t key;
+
+                key = add_key("user",
+                              "invocation_id",
+                              &p->invocation_id,
+                              sizeof(p->invocation_id),
+                              KEY_SPEC_SESSION_KEYRING);
+                if (key == -1)
+                        /* Failing to add the key is not fatal, r is left untouched. */
+                        log_exec_debug_errno(context,
+                                             p,
+                                             errno,
+                                             "Failed to add invocation ID to keyring, ignoring: %m");
+                else {
+                        /* Failing to set the permissions of the key is reported to the caller via r. */
+                        if (keyctl(KEYCTL_SETPERM, key,
+                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
+                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
+                                r = log_exec_error_errno(context,
+                                                         p,
+                                                         errno,
+                                                         "Failed to restrict invocation ID permission: %m");
+                }
+        }
+
+out:
+        /* Revert back uid & gid for the last time, and exit */
+        /* no extra logging, as only the first already reported error matters */
+        if (getuid() != saved_uid)
+                (void) setreuid(saved_uid, -1);
+
+        if (getgid() != saved_gid)
+                (void) setregid(saved_gid, -1);
+
+        return r;
+}
+
+static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
+        assert(array);
+        assert(n);
+        assert(pair);
+
+        /* Appends each non-negative fd of the pair to array[] at index *n, incrementing *n per fd added. */
+        for (size_t k = 0; k < 2; k++)
+                if (pair[k] >= 0)
+                        array[(*n)++] = pair[k];
+}
+
+static int close_remaining_fds(
+                const ExecParameters *params,
+                const ExecRuntime *runtime,
+                int socket_fd,
+                const int *fds, size_t n_fds) {
+        /* Gathers every fd that has to stay open and passes the list to close_all_fds() as its exceptions. */
+        size_t n_dont_close = 0;
+        int dont_close[n_fds + 15]; /* fds[] + 3 stdio + socket_fd + 5 socket pairs (2 fds each) + user lookup fd */
+
+        assert(params);
+
+        if (params->stdin_fd >= 0)
+                dont_close[n_dont_close++] = params->stdin_fd;
+        if (params->stdout_fd >= 0)
+                dont_close[n_dont_close++] = params->stdout_fd;
+        if (params->stderr_fd >= 0)
+                dont_close[n_dont_close++] = params->stderr_fd;
+
+        if (socket_fd >= 0)
+                dont_close[n_dont_close++] = socket_fd;
+        if (n_fds > 0) {
+                memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
+                n_dont_close += n_fds;
+        }
+
+        if (runtime)
+                append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
+
+        if (runtime && runtime->shared) {
+                append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
+                append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
+        }
+
+        if (runtime && runtime->dynamic_creds) {
+                if (runtime->dynamic_creds->user)
+                        append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
+                if (runtime->dynamic_creds->group)
+                        append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
+        }
+
+        if (params->user_lookup_fd >= 0)
+                dont_close[n_dont_close++] = params->user_lookup_fd;
+
+        return close_all_fds(dont_close, n_dont_close);
+}
+
+static int send_user_lookup(
+                const char *unit_id,
+                int user_lookup_fd,
+                uid_t uid,
+                gid_t gid) {
+
+        assert(unit_id);
+
+        /* Reports the UID/GID we resolved to PID 1 via user_lookup_fd, as one writev() whose payload is the
+         * UID, the GID and the unit name, in that order. Nothing is written and 0 is returned if there is
+         * no lookup fd, or if neither a valid UID nor a valid GID is known. Returns -errno on failure. */
+
+        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
+                return 0;
+
+        /* The three parts are submitted together, so that they end up in a single datagram */
+        struct iovec payload[] = {
+                IOVEC_MAKE(&uid, sizeof(uid)),
+                IOVEC_MAKE(&gid, sizeof(gid)),
+                IOVEC_MAKE_STRING(unit_id),
+        };
+
+        if (writev(user_lookup_fd, payload, 3) < 0)
+                return -errno;
+
+        return 0;
+}
+
+static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
+        int k;
+
+        assert(c);
+        assert(home);
+        assert(buf);
+
+        /* Looks up a home directory for WorkingDirectory=~. Returns 0 without touching anything if *home is
+         * already set or no home directory is requested, 1 if *home was set here, negative on error.
+         * Note that the uid parameter is not used. */
+
+        if (*home || !c->working_directory_home)
+                return 0;
+
+        k = get_home_dir(buf);
+        if (k < 0)
+                return k;
+
+        /* *home points at the very string that get_home_dir() stored in *buf */
+        *home = *buf;
+        return 1;
+}
+
+static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
+        _cleanup_strv_free_ char **paths = NULL;
+        int r;
+
+        assert(c);
+        assert(p);
+        assert(ret);
+
+        assert(c->dynamic_user);
+
+        /* Builds the list of directories whose owning UID is worth trying as initial candidate for the
+         * dynamic UID allocation, which spares us costly recursive chown()s of the special directories.
+         * Every directory type but the configuration one contributes, provided a prefix is set for it. */
+
+        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+                const char *prefix = p->prefix[t];
+
+                if (t == EXEC_DIRECTORY_CONFIGURATION || !prefix)
+                        continue;
+
+                for (size_t i = 0; i < c->directories[t].n_items; i++) {
+                        const char *path = c->directories[t].items[i].path;
+                        char *joined;
+
+                        /* Private directories are looked for in the "private" subdirectory of the prefix */
+                        joined = exec_directory_is_private(c, t) ?
+                                path_join(prefix, "private", path) :
+                                path_join(prefix, path);
+                        if (!joined)
+                                return -ENOMEM;
+
+                        r = strv_consume(&paths, joined);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        *ret = TAKE_PTR(paths);
+
+        return 0;
+}
+
+static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
+        _cleanup_(cpu_set_reset) CPUSet numa_cpus = {};
+        int r;
+
+        assert(c);
+        assert(ret);
+
+        /* Replaces *ret by the CPU set computed from the NUMA policy. Without a NUMA mask *ret stays as is and 0 is returned. */
+        if (c->numa_policy.nodes.set) {
+                r = numa_to_cpu_set(&c->numa_policy, &numa_cpus);
+                if (r < 0)
+                        return r;
+
+                cpu_set_reset(ret);
+                return cpu_set_add_all(ret, &numa_cpus);
+        }
+
+        log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
+        return 0;
+}
+
+static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
+        int lowest, copy;
+
+        assert(fds);
+        assert(n_fds);
+        assert(*n_fds < fds_size);
+        assert(fd);
+
+        /* Appends *fd to fds[]. Returns 0 if *fd is unset (nothing is appended), 1 on success, -errno on failure. */
+        if (*fd < 0)
+                return 0;
+
+        /* Numbers below 3 + *n_fds are set aside for the fds we pass to the process (or which are closed only
+         * during execve). An fd from that range is duplicated to a number of at least 3 + *n_fds, which replaces *fd. */
+        lowest = 3 + (int) *n_fds;
+        if (*fd < lowest) {
+                copy = fcntl(*fd, F_DUPFD_CLOEXEC, lowest);
+                if (copy < 0)
+                        return -errno;
+                close_and_replace(*fd, copy);
+        }
+
+        fds[(*n_fds)++] = *fd;
+        return 1;
+}
+
+static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
+        static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
+        union sockaddr_union addr = {
+                .un.sun_family = AF_UNIX,
+        };
+        socklen_t sa_len;
+        int r;
+
+        assert(c);
+        assert(p);
+        assert(of);
+        assert(ofd >= 0);
+
+        /* Connects to the AF_UNIX socket addressed through FORMAT_PROC_FD_PATH(ofd). Its socket type is
+         * not known in advance, hence each entry of socket_types[] is tried in turn. Returns the
+         * connected fd on success. On failure the problem is logged and the return value of
+         * log_exec_error_errno() is passed on. */
+
+        r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
+        if (r < 0)
+                return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
+
+        /* A non-negative result of the call above is what we pass to connect() as address length */
+        sa_len = r;
+
+        for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
+                _cleanup_close_ int fd = -EBADF;
+
+                fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
+                if (fd < 0)
+                        return log_exec_error_errno(c, p, errno,
+                                                    "Failed to create socket for %s: %m", of->path);
+
+                r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
+                if (r >= 0)
+                        return TAKE_FD(fd);
+
+                /* -EPROTOTYPE makes us move on to the next socket type (this attempt's fd is closed by
+                 * its cleanup handler). Any other error is final. */
+                if (r != -EPROTOTYPE)
+                        return log_exec_error_errno(c, p, r,
+                                                    "Failed to connect socket for %s: %m", of->path);
+        }
+
+        /* No socket type was accepted */
+        return log_exec_error_errno(c, p, SYNTHETIC_ERRNO(EPROTOTYPE),
+                                    "Failed to connect socket for \"%s\".", of->path);
+}
+
+static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
+        _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
+        struct stat st;
+
+        assert(c);
+        assert(p);
+        assert(of);
+
+        /* Opens the file or connects to the socket 'of' refers to, and returns the fd. The path is opened only
+         * once, with O_PATH: the type check and the later reopen/connect all operate on that fd. */
+        ofd = open(of->path, O_PATH | O_CLOEXEC);
+        if (ofd < 0)
+                return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
+
+        if (fstat(ofd, &st) < 0)
+                return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
+
+        if (!S_ISSOCK(st.st_mode)) {
+                int flags = O_CLOEXEC | (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR);
+                if (FLAGS_SET(of->flags, OPENFILE_APPEND))
+                        flags |= O_APPEND;
+                else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
+                        flags |= O_TRUNC;
+
+                fd = fd_reopen(ofd, flags);
+                if (fd < 0)
+                        return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
+
+                log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
+                return TAKE_FD(fd);
+        }
+
+        fd = connect_unix_harder(c, p, of, ofd);
+        if (fd < 0)
+                return fd;
+
+        if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
+                return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m", of->path);
+        log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
+        return TAKE_FD(fd);
+}
+
+static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
+        int r;
+
+        assert(c);
+        assert(p);
+        assert(n_fds);
+
+        /* Opens every entry of p->open_files and appends the resulting fd to p->fds[] (at index *n_fds,
+         * which is incremented) and its name to p->fd_names. Entries flagged OPENFILE_GRACEFUL that
+         * fail to open are skipped with a debug message, any other failure is returned to the caller. */
+        LIST_FOREACH(open_files, of, p->open_files) {
+                _cleanup_close_ int fd = -EBADF;
+
+                fd = get_open_file_fd(c, p, of);
+                if (fd < 0) {
+                        if (!FLAGS_SET(of->flags, OPENFILE_GRACEFUL))
+                                return fd;
+
+                        log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
+                        continue;
+                }
+
+                if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
+                        return -ENOMEM;
+
+                r = strv_extend(&p->fd_names, of->fdname);
+                if (r < 0)
+                        return r;
+
+                p->fds[(*n_fds)++] = TAKE_FD(fd);
+        }
+
+        return 0;
+}
+
+static void log_command_line(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const char *msg,
+                const char *executable,
+                char **argv) {
+        /* Logs "<msg>: <quoted argv>" as a structured LOG_DEBUG record with an EXECUTABLE= field. Does nothing unless DEBUG_LOGGING is set. */
+        assert(context);
+        assert(params);
+        assert(msg);
+        assert(executable);
+
+        if (!DEBUG_LOGGING)
+                return;
+        /* cmdline may be NULL if quoting failed; strnull() below presumably substitutes a placeholder then (TODO confirm) */
+        _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
+
+        log_exec_struct(context, params, LOG_DEBUG,
+                        "EXECUTABLE=%s", executable,
+                        LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
+                        LOG_EXEC_INVOCATION_ID(params));
+}
+
+static bool exec_context_need_unprivileged_private_users(
+                const ExecContext *context,
+                const ExecParameters *params) {
+
+        assert(context);
+        assert(params);
+
+        /* The settings checked below need PrivateUsers= when used in user units: without being root we only
+         * have permission to enable them from inside a user namespace. With effective CAP_SYS_ADMIN
+         * (system manager) we are privileged and need none of this, hence bail out early there. */
+        if (params->runtime_scope != RUNTIME_SCOPE_USER)
+                return false;
+
+        /* Private*= switches and namespace paths */
+        if (context->private_users || context->private_tmp || context->private_devices ||
+            context->private_network || context->network_namespace_path ||
+            context->private_ipc || context->ipc_namespace_path)
+                return true;
+
+        /* Mount related settings */
+        if (context->private_mounts > 0 || context->mount_apivfs ||
+            context->n_bind_mounts > 0 || context->n_temporary_filesystems > 0 ||
+            context->root_directory || !strv_isempty(context->extension_directories))
+                return true;
+
+        /* Protect*= settings */
+        if (context->protect_system != PROTECT_SYSTEM_NO ||
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables || context->protect_kernel_modules ||
+            context->protect_kernel_logs || context->protect_control_groups ||
+            context->protect_clock || context->protect_hostname)
+                return true;
+
+        /* Path lists */
+        return !strv_isempty(context->read_write_paths) ||
+               !strv_isempty(context->read_only_paths) ||
+               !strv_isempty(context->inaccessible_paths) ||
+               !strv_isempty(context->exec_paths) ||
+               !strv_isempty(context->no_exec_paths);
+}
+
+static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
+        assert(context);
+
+        /* No confirmation question is asked if that is disabled. Units that stay in the process group
+         * of PID 1 are not asked about either: for some reason those fail to acquire the console even
+         * if no process is using it. */
+
+        bool enabled = !confirm_spawn_disabled();
+
+        return enabled && !context->same_pgrp;
+}
+
+static int exec_context_named_iofds(
+                const ExecContext *c,
+                const ExecParameters *p,
+                int named_iofds[static 3]) {
+
+        const char *stdio_fdname[3];
+        bool wanted[3];
+        size_t pending = 0;
+        size_t n_fds;
+
+        assert(c);
+        assert(p);
+        assert(named_iofds);
+
+        /* For each of stdin/stdout/stderr that is configured as named fd, searches p->fd_names[] for the
+         * configured name and stores the matching entry of p->fds[] in named_iofds[]. Slots of
+         * named_iofds[] that are already >= 0 are not touched. Returns 0 if a match was found for every
+         * requested stream, -ENOENT otherwise. */
+
+        /* Which of the three streams ask for a named fd? */
+        wanted[STDIN_FILENO] = c->std_input == EXEC_INPUT_NAMED_FD;
+        wanted[STDOUT_FILENO] = c->std_output == EXEC_OUTPUT_NAMED_FD;
+        wanted[STDERR_FILENO] = c->std_error == EXEC_OUTPUT_NAMED_FD;
+
+        for (size_t j = 0; j < 3; j++) {
+                stdio_fdname[j] = exec_context_fdname(c, j);
+                pending += wanted[j];
+        }
+
+        /* Candidates are the first n_storage_fds + n_socket_fds entries of p->fds[] and p->fd_names[] */
+        n_fds = p->n_storage_fds + p->n_socket_fds;
+
+        for (size_t i = 0; i < n_fds && pending > 0; i++) {
+                /* Each fd is given to one stream at most: the first one in stdin, stdout, stderr order
+                 * that is requested, still unresolved and configured with this fd's name. */
+                for (size_t j = 0; j < 3; j++) {
+                        if (named_iofds[j] >= 0 || !wanted[j] || !stdio_fdname[j])
+                                continue;
+
+                        if (!streq(p->fd_names[i], stdio_fdname[j]))
+                                continue;
+
+                        named_iofds[j] = p->fds[i];
+                        pending--;
+                        break;
+                }
+        }
+
+        return pending == 0 ? 0 : -ENOENT;
+}
+
+static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
+        /* Hands the netns and ipcns storage socket pairs to safe_close_pair(). NULL is accepted and ignored. */
+        if (shared) {
+                safe_close_pair(shared->netns_storage_socket);
+                safe_close_pair(shared->ipcns_storage_socket);
+        }
+}
+
+static void exec_runtime_close(ExecRuntime *rt) {
+        /* Closes the ephemeral storage socket pair of the runtime object, then lets exec_shared_runtime_close()
+         * and dynamic_creds_close() deal with rt->shared and rt->dynamic_creds. NULL is accepted and ignored. */
+        if (rt) {
+                safe_close_pair(rt->ephemeral_storage_socket);
+                exec_shared_runtime_close(rt->shared);
+                dynamic_creds_close(rt->dynamic_creds);
+        }
+}
+
+static void exec_params_close(ExecParameters *p) {
+        /* Runs safe_close() on the three stdio fds kept in p and stores its return value back. NULL p is ignored. */
+        if (p) {
+                p->stdin_fd = safe_close(p->stdin_fd);
+                p->stdout_fd = safe_close(p->stdout_fd);
+                p->stderr_fd = safe_close(p->stderr_fd);
+        }
+}
+
+int exec_invoke(
+                const ExecCommand *command,
+                const ExecContext *context,
+                ExecParameters *params,
+                ExecRuntime *runtime,
+                const CGroupContext *cgroup_context,
+                int *exit_status) {
+
+        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
+        int r, ngids = 0;
+        _cleanup_free_ gid_t *supplementary_gids = NULL;
+        const char *username = NULL, *groupname = NULL;
+        _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
+        const char *home = NULL, *shell = NULL;
+        char **final_argv = NULL;
+        dev_t journal_stream_dev = 0;
+        ino_t journal_stream_ino = 0;
+        bool userns_set_up = false;
+        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
+                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
+                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
+                needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
+        bool keep_seccomp_privileges = false;
+#if HAVE_SELINUX
+        _cleanup_free_ char *mac_selinux_context_net = NULL;
+        bool use_selinux = false;
+#endif
+#if ENABLE_SMACK
+        bool use_smack = false;
+#endif
+#if HAVE_APPARMOR
+        bool use_apparmor = false;
+#endif
+#if HAVE_SECCOMP
+        uint64_t saved_bset = 0;
+#endif
+        uid_t saved_uid = getuid();
+        gid_t saved_gid = getgid();
+        uid_t uid = UID_INVALID;
+        gid_t gid = GID_INVALID;
+        size_t n_fds, /* fds to pass to the child */
+               n_keep_fds; /* total number of fds not to close */
+        int secure_bits;
+        _cleanup_free_ gid_t *gids_after_pam = NULL;
+        int ngids_after_pam = 0;
+
+        int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
+        size_t n_storage_fds, n_socket_fds;
+
+        assert(command);
+        assert(context);
+        assert(params);
+        assert(exit_status);
+
+        if (context->log_level_max >= 0)
+                log_set_max_level(context->log_level_max);
+
+        /* Explicitly test for CVE-2021-4034 inspired invocations */
+        if (!command->path || strv_isempty(command->argv)) {
+                *exit_status = EXIT_EXEC;
+                return log_exec_error_errno(
+                                context,
+                                params,
+                                SYNTHETIC_ERRNO(EINVAL),
+                                "Invalid command line arguments.");
+        }
+
+        LOG_CONTEXT_PUSH_EXEC(context, params);
+
+        if (context->std_input == EXEC_INPUT_SOCKET ||
+            context->std_output == EXEC_OUTPUT_SOCKET ||
+            context->std_error == EXEC_OUTPUT_SOCKET) {
+
+                if (params->n_socket_fds > 1)
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
+
+                if (params->n_socket_fds == 0)
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
+
+                socket_fd = params->fds[0];
+                n_storage_fds = n_socket_fds = 0;
+        } else {
+                n_socket_fds = params->n_socket_fds;
+                n_storage_fds = params->n_storage_fds;
+        }
+        n_fds = n_socket_fds + n_storage_fds;
+
+        r = exec_context_named_iofds(context, params, named_iofds);
+        if (r < 0)
+                return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
+
+        rename_process_from_path(command->path);
+
+        /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
+         * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
+         * both of which will be demoted to SIG_DFL. */
+        (void) default_signals(SIGNALS_CRASH_HANDLER,
+                               SIGNALS_IGNORE);
+
+        if (context->ignore_sigpipe)
+                (void) ignore_signals(SIGPIPE);
+
+        r = reset_signal_mask();
+        if (r < 0) {
+                *exit_status = EXIT_SIGNAL_MASK;
+                return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
+        }
+
+        if (params->idle_pipe)
+                do_idle_pipe_dance(params->idle_pipe);
+
+        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
+         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
+         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
+         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
+
+        log_forget_fds();
+        log_set_open_when_needed(true);
+        log_settle_target();
+
+        /* In case anything used libc syslog(), close this here, too */
+        closelog();
+
+        r = collect_open_file_fds(context, params, &n_fds);
+        if (r < 0) {
+                *exit_status = EXIT_FDS;
+                return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
+        }
+
+        int keep_fds[n_fds + 3];
+        memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
+        n_keep_fds = n_fds;
+
+        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->exec_fd);
+        if (r < 0) {
+                *exit_status = EXIT_FDS;
+                return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+        }
+
+#if HAVE_LIBBPF
+        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->bpf_outer_map_fd);
+        if (r < 0) {
+                *exit_status = EXIT_FDS;
+                return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+        }
+#endif
+
+        r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
+        if (r < 0) {
+                *exit_status = EXIT_FDS;
+                return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
+        }
+
+        if (!context->same_pgrp &&
+            setsid() < 0) {
+                *exit_status = EXIT_SETSID;
+                return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
+        }
+
+        exec_context_tty_reset(context, params);
+
+        if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
+                _cleanup_free_ char *cmdline = NULL;
+
+                cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
+                if (!cmdline) {
+                        *exit_status = EXIT_MEMORY;
+                        return log_oom();
+                }
+
+                r = ask_for_confirmation(context, params, cmdline);
+                if (r != CONFIRM_EXECUTE) {
+                        if (r == CONFIRM_PRETEND_SUCCESS) {
+                                *exit_status = EXIT_SUCCESS;
+                                return 0;
+                        }
+
+                        *exit_status = EXIT_CONFIRM;
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
+                                                    "Execution cancelled by the user");
+                }
+        }
+
+        /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
+         * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
+         * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
+         * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
+         * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
+        if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
+            setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
+                *exit_status = EXIT_MEMORY;
+                return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
+        }
+
+        if (context->dynamic_user && runtime && runtime->dynamic_creds) {
+                _cleanup_strv_free_ char **suggested_paths = NULL;
+
+                /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
+                 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
+                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
+                        *exit_status = EXIT_USER;
+                        return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
+                }
+
+                r = compile_suggested_paths(context, params, &suggested_paths);
+                if (r < 0) {
+                        *exit_status = EXIT_MEMORY;
+                        return log_oom();
+                }
+
+                r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
+                if (r < 0) {
+                        *exit_status = EXIT_USER;
+                        if (r == -EILSEQ)
+                                return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                                            "Failed to update dynamic user credentials: User or group with specified name already exists.");
+                        return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
+                }
+
+                if (!uid_is_valid(uid)) {
+                        *exit_status = EXIT_USER;
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
+                }
+
+                if (!gid_is_valid(gid)) {
+                        *exit_status = EXIT_USER;
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
+                }
+
+                if (runtime->dynamic_creds->user)
+                        username = runtime->dynamic_creds->user->name;
+
+        } else {
+                if (context->user) {
+                        r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
+                        if (r < 0) {
+                                *exit_status = EXIT_USER;
+                                return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
+                        }
+                }
+
+                if (context->group) {
+                        r = get_fixed_group(context->group, &groupname, &gid);
+                        if (r < 0) {
+                                *exit_status = EXIT_GROUP;
+                                return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
+                        }
+                }
+        }
+
+        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
+        r = get_supplementary_groups(context, username, groupname, gid,
+                                     &supplementary_gids, &ngids);
+        if (r < 0) {
+                *exit_status = EXIT_GROUP;
+                return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
+        }
+
+        r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
+        if (r < 0) {
+                *exit_status = EXIT_USER;
+                return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
+        }
+
+        params->user_lookup_fd = safe_close(params->user_lookup_fd);
+
+        r = acquire_home(context, uid, &home, &home_buffer);
+        if (r < 0) {
+                *exit_status = EXIT_CHDIR;
+                return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
+        }
+
+        /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
+        if (socket_fd >= 0)
+                (void) fd_nonblock(socket_fd, false);
+
+        /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
+         * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
+        if (params->cgroup_path) {
+                _cleanup_free_ char *p = NULL;
+
+                r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                if (r < 0) {
+                        *exit_status = EXIT_CGROUP;
+                        return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
+                }
+
+                r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
+                if (r == -EUCLEAN) {
+                        *exit_status = EXIT_CGROUP;
+                        return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
+                                                    "because the cgroup or one of its parents or "
+                                                    "siblings is in the threaded mode: %m", p);
+                }
+                if (r < 0) {
+                        *exit_status = EXIT_CGROUP;
+                        return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
+                }
+        }
+
+        if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
+                r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
+                if (r < 0) {
+                        *exit_status = EXIT_NETWORK;
+                        return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
+                }
+        }
+
+        if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
+                r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
+                }
+        }
+
+        r = setup_input(context, params, socket_fd, named_iofds);
+        if (r < 0) {
+                *exit_status = EXIT_STDIN;
+                return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
+        }
+
+        r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+        if (r < 0) {
+                *exit_status = EXIT_STDOUT;
+                return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
+        }
+
+        r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+        if (r < 0) {
+                *exit_status = EXIT_STDERR;
+                return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
+        }
+
+        if (context->oom_score_adjust_set) {
+                /* When we can't make this change due to EPERM, then let's silently skip over it. User
+                 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
+                r = set_oom_score_adjust(context->oom_score_adjust);
+                if (ERRNO_IS_NEG_PRIVILEGE(r))
+                        log_exec_debug_errno(context, params, r,
+                                             "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
+                else if (r < 0) {
+                        *exit_status = EXIT_OOM_ADJUST;
+                        return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
+                }
+        }
+
+        if (context->coredump_filter_set) {
+                r = set_coredump_filter(context->coredump_filter);
+                if (ERRNO_IS_NEG_PRIVILEGE(r))
+                        log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
+                else if (r < 0) {
+                        *exit_status = EXIT_LIMITS;
+                        return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
+                }
+        }
+
+        if (context->nice_set) {
+                r = setpriority_closest(context->nice);
+                if (r < 0) {
+                        *exit_status = EXIT_NICE;
+                        return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
+                }
+        }
+
+        if (context->cpu_sched_set) {
+                struct sched_param param = {
+                        .sched_priority = context->cpu_sched_priority,
+                };
+
+                r = sched_setscheduler(0,
+                                       context->cpu_sched_policy |
+                                       (context->cpu_sched_reset_on_fork ?
+                                        SCHED_RESET_ON_FORK : 0),
+                                       &param);
+                if (r < 0) {
+                        *exit_status = EXIT_SETSCHEDULER;
+                        return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
+                }
+        }
+
+        if (context->cpu_affinity_from_numa || context->cpu_set.set) {
+                _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
+                const CPUSet *cpu_set;
+
+                if (context->cpu_affinity_from_numa) {
+                        r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
+                        if (r < 0) {
+                                *exit_status = EXIT_CPUAFFINITY;
+                                return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
+                        }
+
+                        cpu_set = &converted_cpu_set;
+                } else
+                        cpu_set = &context->cpu_set;
+
+                if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
+                        *exit_status = EXIT_CPUAFFINITY;
+                        return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
+                }
+        }
+
+        if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
+                r = apply_numa_policy(&context->numa_policy);
+                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
+                else if (r < 0) {
+                        *exit_status = EXIT_NUMA_POLICY;
+                        return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
+                }
+        }
+
+        if (context->ioprio_set)
+                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
+                        *exit_status = EXIT_IOPRIO;
+                        return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
+                }
+
+        if (context->timer_slack_nsec != NSEC_INFINITY)
+                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
+                        *exit_status = EXIT_TIMERSLACK;
+                        return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
+                }
+
+        if (context->personality != PERSONALITY_INVALID) {
+                r = safe_personality(context->personality);
+                if (r < 0) {
+                        *exit_status = EXIT_PERSONALITY;
+                        return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
+                }
+        }
+
+#if ENABLE_UTMP
+        if (context->utmp_id) {
+                _cleanup_free_ char *username_alloc = NULL;
+
+                if (!username && context->utmp_mode == EXEC_UTMP_USER) {
+                        username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
+                        if (!username_alloc) {
+                                *exit_status = EXIT_USER;
+                                return log_oom();
+                        }
+                }
+
+                const char *line = context->tty_path ?
+                        (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
+                        NULL;
+                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
+                                      line,
+                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
+                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
+                                      USER_PROCESS,
+                                      username ?: username_alloc);
+        }
+#endif
+
+        if (uid_is_valid(uid)) {
+                r = chown_terminal(STDIN_FILENO, uid);
+                if (r < 0) {
+                        *exit_status = EXIT_STDIN;
+                        return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
+                }
+        }
+
+        if (params->cgroup_path) {
+                /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
+                 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
+                 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+                 * touch a single hierarchy too. */
+
+                if (params->flags & EXEC_CGROUP_DELEGATE) {
+                        _cleanup_free_ char *p = NULL;
+
+                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
+                        if (r < 0) {
+                                *exit_status = EXIT_CGROUP;
+                                return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
+                        }
+
+                        r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                        if (r < 0) {
+                                *exit_status = EXIT_CGROUP;
+                                return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
+                        }
+                        if (r > 0) {
+                                r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
+                                if (r < 0) {
+                                        *exit_status = EXIT_CGROUP;
+                                        return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
+                                }
+                        }
+                }
+
+                if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
+                        if (cgroup_context_want_memory_pressure(cgroup_context)) {
+                                r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
+                                if (r < 0) {
+                                        *exit_status = EXIT_MEMORY;
+                                        return log_oom();
+                                }
+
+                                r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
+                                if (r < 0) {
+                                        log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                                            "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
+                                        memory_pressure_path = mfree(memory_pressure_path);
+                                }
+                        } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
+                                memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
+                                if (!memory_pressure_path) {
+                                        *exit_status = EXIT_MEMORY;
+                                        return log_oom();
+                                }
+                        }
+                }
+        }
+
+        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+                r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
+                if (r < 0)
+                        return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
+        }
+
+        if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
+                r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_CREDENTIALS;
+                        return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
+                }
+        }
+
+        r = build_environment(
+                        context,
+                        params,
+                        cgroup_context,
+                        n_fds,
+                        home,
+                        username,
+                        shell,
+                        journal_stream_dev,
+                        journal_stream_ino,
+                        memory_pressure_path,
+                        &our_env);
+        if (r < 0) {
+                *exit_status = EXIT_MEMORY;
+                return log_oom();
+        }
+
+        r = build_pass_environment(context, &pass_env);
+        if (r < 0) {
+                *exit_status = EXIT_MEMORY;
+                return log_oom();
+        }
+
+        /* The $PATH variable is set to the default path in params->environment. However, this is overridden
+         * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
+         * not specify PATH but the unit has ExecSearchPath. */
+        if (!strv_isempty(context->exec_search_path)) {
+                _cleanup_free_ char *joined = NULL;
+
+                joined = strv_join(context->exec_search_path, ":");
+                if (!joined) {
+                        *exit_status = EXIT_MEMORY;
+                        return log_oom();
+                }
+
+                r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
+                if (r < 0) {
+                        *exit_status = EXIT_MEMORY;
+                        return log_oom();
+                }
+        }
+
+        accum_env = strv_env_merge(params->environment,
+                                   our_env,
+                                   joined_exec_search_path,
+                                   pass_env,
+                                   context->environment,
+                                   params->files_env);
+        if (!accum_env) {
+                *exit_status = EXIT_MEMORY;
+                return log_oom();
+        }
+        accum_env = strv_env_clean(accum_env);
+
+        (void) umask(context->umask);
+
+        r = setup_keyring(context, params, uid, gid);
+        if (r < 0) {
+                *exit_status = EXIT_KEYRING;
+                return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
+        }
+
+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+         * from it. */
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
+        /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
+         * for it, and the kernel doesn't actually support ambient caps. */
+        needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
+
+        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
+         * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
+         * desired. */
+        if (needs_ambient_hack)
+                needs_setuid = false;
+        else
+                needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
+
+        uint64_t capability_ambient_set = context->capability_ambient_set;
+
+        if (needs_sandboxing) {
+                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
+                 * /sys being present. The actual MAC context application will happen later, as late as
+                 * possible, to avoid impacting our own code paths. */
+
+#if HAVE_SELINUX
+                use_selinux = mac_selinux_use();
+#endif
+#if ENABLE_SMACK
+                use_smack = mac_smack_use();
+#endif
+#if HAVE_APPARMOR
+                use_apparmor = mac_apparmor_use();
+#endif
+        }
+
+        if (needs_sandboxing) {
+                int which_failed;
+
+                /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
+                 * is set here. (See below.) */
+
+                r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
+                if (r < 0) {
+                        *exit_status = EXIT_LIMITS;
+                        return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+                }
+        }
+
+        if (needs_setuid && context->pam_name && username) {
+                /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
+                 * wins here. (See above.) */
+
+                /* All fds passed in the fds array will be closed in the pam child process. */
+                r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds);
+                if (r < 0) {
+                        *exit_status = EXIT_PAM;
+                        return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
+                }
+
+                if (ambient_capabilities_supported()) {
+                        uint64_t ambient_after_pam;
+
+                        /* PAM modules might have set some ambient caps. Query them here and merge them into
+                         * the caps we want to set in the end, so that we don't end up unsetting them. */
+                        r = capability_get_ambient(&ambient_after_pam);
+                        if (r < 0) {
+                                *exit_status = EXIT_CAPABILITIES;
+                                return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
+                        }
+
+                        capability_ambient_set |= ambient_after_pam;
+                }
+
+                ngids_after_pam = getgroups_alloc(&gids_after_pam);
+                if (ngids_after_pam < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
+                }
+        }
+
+        if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
+                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
+                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
+                 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
+
+                r = setup_private_users(saved_uid, saved_gid, uid, gid);
+                /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
+                 * the actual requested operations fail (or silently continue). */
+                if (r < 0 && context->private_users) {
+                        *exit_status = EXIT_USER;
+                        return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
+                }
+                if (r < 0)
+                        log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
+                else
+                        userns_set_up = true;
+        }
+
+        if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
+
+                /* Try to enable network namespacing if network namespacing is available and we have
+                 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
+                 * new network namespace. And if we don't have that, then we could only create a network
+                 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
+                if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
+                        r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
+                        if (ERRNO_IS_NEG_PRIVILEGE(r))
+                                log_exec_notice_errno(context, params, r,
+                                                      "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
+                        else if (r < 0) {
+                                *exit_status = EXIT_NETWORK;
+                                return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
+                        }
+                } else if (context->network_namespace_path) {
+                        *exit_status = EXIT_NETWORK;
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                                    "NetworkNamespacePath= is not supported, refusing.");
+                } else
+                        log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
+        }
+
+        if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
+
+                if (ns_type_supported(NAMESPACE_IPC)) {
+                        r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
+                        if (r == -EPERM)
+                                log_exec_warning_errno(context, params, r,
+                                                       "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
+                        else if (r < 0) {
+                                *exit_status = EXIT_NAMESPACE;
+                                return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
+                        }
+                } else if (context->ipc_namespace_path) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                                    "IPCNamespacePath= is not supported, refusing.");
+                } else
+                        log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
+        }
+
+        if (needs_mount_namespace) {
+                _cleanup_free_ char *error_path = NULL;
+
+                r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
+                                                    error_path ? ": " : "", strempty(error_path));
+                }
+        }
+
+        if (needs_sandboxing) {
+                r = apply_protect_hostname(context, params, exit_status);
+                if (r < 0)
+                        return r;
+        }
+
+        if (context->memory_ksm >= 0)
+                if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
+                        if (ERRNO_IS_NOT_SUPPORTED(errno))
+                                log_exec_debug_errno(context,
+                                                     params,
+                                                     errno,
+                                                     "KSM support not available, ignoring.");
+                        else {
+                                *exit_status = EXIT_KSM;
+                                return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
+                        }
+                }
+
+        /* Drop groups as early as possible.
+         * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
+         * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
+        if (needs_setuid) {
+                _cleanup_free_ gid_t *gids_to_enforce = NULL;
+                int ngids_to_enforce = 0;
+
+                ngids_to_enforce = merge_gid_lists(supplementary_gids,
+                                                   ngids,
+                                                   gids_after_pam,
+                                                   ngids_after_pam,
+                                                   &gids_to_enforce);
+                if (ngids_to_enforce < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return log_exec_error_errno(context, params,
+                                                    ngids_to_enforce,
+                                                    "Failed to merge group lists. Group membership might be incorrect: %m");
+                }
+
+                r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
+                if (r < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
+                }
+        }
+
+        /* If the user namespace was not set up above, try to do it now.
+         * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
+         * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
+         * case of mount namespaces being less privileged when the mount point list is copied from a
+         * different user namespace). */
+
+        if (needs_sandboxing && context->private_users && !userns_set_up) {
+                r = setup_private_users(saved_uid, saved_gid, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_USER;
+                        return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
+                }
+        }
+
+        /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
+         * shall execute. */
+
+        _cleanup_free_ char *executable = NULL;
+        _cleanup_close_ int executable_fd = -EBADF;
+        r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
+        if (r < 0) {
+                if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+                        log_exec_struct_errno(context, params, LOG_INFO, r,
+                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+                                              LOG_EXEC_INVOCATION_ID(params),
+                                              LOG_EXEC_MESSAGE(params,
+                                                               "Executable %s missing, skipping: %m",
+                                                               command->path),
+                                              "EXECUTABLE=%s", command->path);
+                        *exit_status = EXIT_SUCCESS;
+                        return 0;
+                }
+
+                *exit_status = EXIT_EXEC;
+                return log_exec_struct_errno(context, params, LOG_INFO, r,
+                                             "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+                                             LOG_EXEC_INVOCATION_ID(params),
+                                             LOG_EXEC_MESSAGE(params,
+                                                              "Failed to locate executable %s: %m",
+                                                              command->path),
+                                             "EXECUTABLE=%s", command->path);
+        }
+
+        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
+        if (r < 0) {
+                *exit_status = EXIT_FDS;
+                return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+        }
+
+#if HAVE_SELINUX
+        if (needs_sandboxing && use_selinux && params->selinux_context_net) {
+                int fd = -EBADF;
+
+                if (socket_fd >= 0)
+                        fd = socket_fd;
+                else if (params->n_socket_fds == 1)
+                        /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
+                         * use context from that fd to compute the label. */
+                        fd = params->fds[0];
+
+                if (fd >= 0) {
+                        r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
+                        if (r < 0) {
+                                if (!context->selinux_context_ignore) {
+                                        *exit_status = EXIT_SELINUX_CONTEXT;
+                                        return log_exec_error_errno(context,
+                                                                    params,
+                                                                    r,
+                                                                    "Failed to determine SELinux context: %m");
+                                }
+                                log_exec_debug_errno(context,
+                                                     params,
+                                                     r,
+                                                     "Failed to determine SELinux context, ignoring: %m");
+                        }
+                }
+        }
+#endif
+
+        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
+         * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
+         * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
+         * execve(). But first, close the remaining sockets in the context objects. */
+
+        exec_runtime_close(runtime);
+        exec_params_close(params);
+
+        r = close_all_fds(keep_fds, n_keep_fds);
+        if (r >= 0)
+                r = shift_fds(params->fds, n_fds);
+        if (r >= 0)
+                r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
+        if (r < 0) {
+                *exit_status = EXIT_FDS;
+                return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
+        }
+
+        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
+         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
+         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
+         * came this far. */
+
+        secure_bits = context->secure_bits;
+
+        if (needs_sandboxing) {
+                uint64_t bset;
+
+                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
+                 * (Note this is placed after the general resource limit initialization, see above, in order
+                 * to take precedence.) */
+                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
+                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
+                                *exit_status = EXIT_LIMITS;
+                                return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
+                        }
+                }
+
+#if ENABLE_SMACK
+                /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
+                 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
+                if (use_smack && context->smack_process_label) {
+                        r = setup_smack(params, context, executable_fd);
+                        if (r < 0 && !context->smack_process_label_ignore) {
+                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
+                                return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
+                        }
+                }
+#endif
+
+                bset = context->capability_bounding_set;
+                /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
+                 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
+                 * instead of us doing that */
+                if (needs_ambient_hack)
+                        bset |= (UINT64_C(1) << CAP_SETPCAP) |
+                                (UINT64_C(1) << CAP_SETUID) |
+                                (UINT64_C(1) << CAP_SETGID);
+
+#if HAVE_SECCOMP
+                /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
+                 * keep the needed privileges to apply it even if we're not root. */
+                if (needs_setuid &&
+                    uid_is_valid(uid) &&
+                    context_has_seccomp(context) &&
+                    seccomp_allows_drop_privileges(context)) {
+                        keep_seccomp_privileges = true;
+
+                        if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
+                                *exit_status = EXIT_USER;
+                                return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
+                        }
+
+                        /* Save the current bounding set so we can restore it after applying the seccomp
+                         * filter */
+                        saved_bset = bset;
+                        bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
+                                (UINT64_C(1) << CAP_SETPCAP);
+                }
+#endif
+
+                if (!cap_test_all(bset)) {
+                        r = capability_bounding_set_drop(bset, /* right_now= */ false);
+                        if (r < 0) {
+                                *exit_status = EXIT_CAPABILITIES;
+                                return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
+                        }
+                }
+
+                /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
+                 * keep-caps set.
+                 *
+                 * To be able to raise the ambient capabilities after setresuid() they have to be added to
+                 * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
+                 * the ambient capabilities can be raised as they are present in the permitted and
+                 * inheritable set. However it is possible that someone wants to set ambient capabilities
+                 * without changing the user, so we also set the ambient capabilities here.
+                 *
+                 * The requested ambient capabilities are raised in the inheritable set if the second
+                 * argument is true. */
+                if (!needs_ambient_hack) {
+                        r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
+                        if (r < 0) {
+                                *exit_status = EXIT_CAPABILITIES;
+                                return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
+                        }
+                }
+        }
+
+        /* chroot to root directory first, before we lose the ability to chroot */
+        r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
+        if (r < 0)
+                return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
+
+        if (needs_setuid) {
+                if (uid_is_valid(uid)) {
+                        r = enforce_user(context, uid, capability_ambient_set);
+                        if (r < 0) {
+                                *exit_status = EXIT_USER;
+                                return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
+                        }
+
+                        if (keep_seccomp_privileges) {
+                                if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
+                                        r = drop_capability(CAP_SETUID);
+                                        if (r < 0) {
+                                                *exit_status = EXIT_USER;
+                                                return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
+                                        }
+                                }
+
+                                r = keep_capability(CAP_SYS_ADMIN);
+                                if (r < 0) {
+                                        *exit_status = EXIT_USER;
+                                        return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
+                                }
+
+                                r = keep_capability(CAP_SETPCAP);
+                                if (r < 0) {
+                                        *exit_status = EXIT_USER;
+                                        return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
+                                }
+                        }
+
+                        if (!needs_ambient_hack && capability_ambient_set != 0) {
+
+                                /* Raise the ambient capabilities after user change. */
+                                r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
+                                if (r < 0) {
+                                        *exit_status = EXIT_CAPABILITIES;
+                                        return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
+                                }
+                        }
+                }
+        }
+
+        /* Apply working directory here, because the working directory might be on NFS and only the user running
+         * this service might have the correct privilege to change to the working directory */
+        r = apply_working_directory(context, params, runtime, home, exit_status);
+        if (r < 0)
+                return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
+
+        if (needs_sandboxing) {
+                /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
+                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
+                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
+                 * are restricted. */
+
+#if HAVE_SELINUX
+                if (use_selinux) {
+                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
+
+                        if (exec_context) {
+                                r = setexeccon(exec_context);
+                                if (r < 0) {
+                                        if (!context->selinux_context_ignore) {
+                                                *exit_status = EXIT_SELINUX_CONTEXT;
+                                                return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
+                                        }
+                                        log_exec_debug_errno(context,
+                                                             params,
+                                                             r,
+                                                             "Failed to change SELinux context to %s, ignoring: %m",
+                                                             exec_context);
+                                }
+                        }
+                }
+#endif
+
+#if HAVE_APPARMOR
+                if (use_apparmor && context->apparmor_profile) {
+                        r = aa_change_onexec(context->apparmor_profile);
+                        if (r < 0 && !context->apparmor_profile_ignore) {
+                                *exit_status = EXIT_APPARMOR_PROFILE;
+                                return log_exec_error_errno(context,
+                                                            params,
+                                                            errno,
+                                                            "Failed to prepare AppArmor profile change to %s: %m",
+                                                            context->apparmor_profile);
+                        }
+                }
+#endif
+
+                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
+                 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
+                 * requires CAP_SETPCAP. */
+                if (prctl(PR_GET_SECUREBITS) != secure_bits) {
+                        /* CAP_SETPCAP is required to set securebits. This capability is raised into the
+                         * effective set here.
+                         *
+                         * The effective set is overwritten during execve() with the following values:
+                         *
+                         * - ambient set (for non-root processes)
+                         *
+                         * - (inheritable | bounding) set (for root processes)
+                         *
+                         * Hence there is no security impact to raise it in the effective set before execve
+                         */
+                        r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
+                        if (r < 0) {
+                                *exit_status = EXIT_CAPABILITIES;
+                                return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
+                        }
+                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
+                                *exit_status = EXIT_SECUREBITS;
+                                return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
+                        }
+                }
+
+                if (context_has_no_new_privileges(context))
+                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
+                                return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
+                        }
+
+#if HAVE_SECCOMP
+                r = apply_address_families(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_ADDRESS_FAMILIES;
+                        return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
+                }
+
+                r = apply_memory_deny_write_execute(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
+                }
+
+                r = apply_restrict_realtime(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
+                }
+
+                r = apply_restrict_suid_sgid(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
+                }
+
+                r = apply_restrict_namespaces(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
+                }
+
+                r = apply_protect_sysctl(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
+                }
+
+                r = apply_protect_kernel_modules(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
+                }
+
+                r = apply_protect_kernel_logs(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
+                }
+
+                r = apply_protect_clock(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
+                }
+
+                r = apply_private_devices(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
+                }
+
+                r = apply_syscall_archs(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
+                }
+
+                r = apply_lock_personality(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
+                }
+
+                r = apply_syscall_log(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
+                }
+#endif
+
+#if HAVE_LIBBPF
+                r = apply_restrict_filesystems(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_BPF;
+                        return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
+                }
+#endif
+
+#if HAVE_SECCOMP
+                /* This really should remain as close to the execve() as possible, to make sure our own code is affected
+                 * by the filter as little as possible. */
+                r = apply_syscall_filter(context, params, needs_ambient_hack);
+                if (r < 0) {
+                        *exit_status = EXIT_SECCOMP;
+                        return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
+                }
+
+                if (keep_seccomp_privileges) {
+                        /* Restore the capability bounding set with what's expected from the service + the
+                         * ambient capabilities hack */
+                        if (!cap_test_all(saved_bset)) {
+                                r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
+                                if (r < 0) {
+                                        *exit_status = EXIT_CAPABILITIES;
+                                        return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
+                                }
+                        }
+
+                        /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
+                         * applications that use it. */
+                        if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
+                                r = drop_capability(CAP_SYS_ADMIN);
+                                if (r < 0) {
+                                        *exit_status = EXIT_USER;
+                                        return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
+                                }
+                        }
+
+                        /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
+                         * applications that use it. */
+                        if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
+                                r = drop_capability(CAP_SETPCAP);
+                                if (r < 0) {
+                                        *exit_status = EXIT_USER;
+                                        return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
+                                }
+                        }
+
+                        if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
+                                *exit_status = EXIT_USER;
+                                return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
+                        }
+                }
+#endif
+
+        }
+
+        if (!strv_isempty(context->unset_environment)) {
+                char **ee = NULL;
+
+                ee = strv_env_delete(accum_env, 1, context->unset_environment);
+                if (!ee) {
+                        *exit_status = EXIT_MEMORY;
+                        return log_oom();
+                }
+
+                strv_free_and_replace(accum_env, ee);
+        }
+
+        if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
+                _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
+
+                r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
+                if (r < 0) {
+                        *exit_status = EXIT_MEMORY;
+                        return log_exec_error_errno(context,
+                                                    params,
+                                                    r,
+                                                    "Failed to replace environment variables: %m");
+                }
+                final_argv = replaced_argv;
+
+                if (!strv_isempty(unset_variables)) {
+                        _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
+                        log_exec_warning(context,
+                                         params,
+                                         "Referenced but unset environment variable evaluates to an empty string: %s",
+                                         strna(ju));
+                }
+
+                if (!strv_isempty(bad_variables)) {
+                        _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
+                        log_exec_warning(context,
+                                         params,
+                                         "Invalid environment variable name evaluates to an empty string: %s",
+                                         strna(jb));
+                }
+        } else
+                final_argv = command->argv;
+
+        log_command_line(context, params, "Executing", executable, final_argv);
+
+        if (params->exec_fd >= 0) {
+                uint8_t hot = 1;
+
+                /* We have finished with all our initializations. Let's now let the manager know that. From this point
+                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+
+                if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
+                        *exit_status = EXIT_EXEC;
+                        return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
+                }
+        }
+
+        r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
+
+        if (params->exec_fd >= 0) {
+                uint8_t hot = 0;
+
+                /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
+                 * that POLLHUP on it no longer means execve() succeeded. */
+
+                if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
+                        *exit_status = EXIT_EXEC;
+                        return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
+                }
+        }
+
+        *exit_status = EXIT_EXEC;
+        return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
+}
diff --git a/src/core/exec-invoke.h b/src/core/exec-invoke.h
new file mode 100644
index 0000000..a8a3ac6
--- /dev/null
+++ b/src/core/exec-invoke.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct ExecCommand ExecCommand;
+typedef struct ExecContext ExecContext;
+typedef struct ExecParameters ExecParameters;
+typedef struct ExecRuntime ExecRuntime;
+typedef struct CGroupContext CGroupContext;
+/* Prepares the calling process and execve()s 'command'; failures store an EXIT_* code in *exit_status. */
+int exec_invoke(
+                const ExecCommand *command,          /* supplies argv and flags (e.g. EXEC_COMMAND_NO_ENV_EXPAND) */
+                const ExecContext *context,          /* settings applied before exec: capabilities, securebits, MAC labels, ... */
+                ExecParameters *params,              /* carries the passed fds and exec_fd; closed via exec_params_close() */
+                ExecRuntime *runtime,                /* its remaining fds are closed via exec_runtime_close() before exec */
+                const CGroupContext *cgroup_context, /* NOTE(review): usage not visible in this chunk - confirm in exec-invoke.c */
+                int *exit_status);                   /* out: set to an EXIT_* code on each failure path before returning */
diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c
new file mode 100644
index 0000000..b1e716e
--- /dev/null
+++ b/src/core/execute-serialize.c
@@ -0,0 +1,3896 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "af-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute-serialize.h"
+#include "hexdecoct.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "in-addr-prefix-util.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "serialize.h"
+#include "string-util.h"
+#include "strv.h"
+
+static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
+        _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL,
+                            *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL,
+                            *startup_cpuset_mems = NULL;
+        char *iface;
+        struct in_addr_prefix *iaai;
+        int r;
+
+        assert(f);
+
+        if (!c)
+                return 0;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-cpu-accounting", c->cpu_accounting);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-io-accounting", c->io_accounting);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-block-io-accounting", c->blockio_accounting);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-memory-accounting", c->memory_accounting);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-tasks-accounting", c->tasks_accounting);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-ip-accounting", c->ip_accounting);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-memory-oom-group", c->memory_oom_group);
+        if (r < 0)
+                return r;
+
+        if (c->cpu_weight != CGROUP_WEIGHT_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-cpu-weight", "%" PRIu64, c->cpu_weight);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_cpu_weight != CGROUP_WEIGHT_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-cpu-weight", "%" PRIu64, c->startup_cpu_weight);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-cpu-shares", "%" PRIu64, c->cpu_shares);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-cpu-shares", "%" PRIu64, c->startup_cpu_shares);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
+                r = serialize_usec(f, "exec-cgroup-context-cpu-quota-per-sec-usec", c->cpu_quota_per_sec_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->cpu_quota_period_usec != USEC_INFINITY) {
+                r = serialize_usec(f, "exec-cgroup-context-cpu-quota-period-usec", c->cpu_quota_period_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
+        if (!cpuset_cpus)
+                return log_oom_debug();
+
+        r = serialize_item(f, "exec-cgroup-context-allowed-cpus", cpuset_cpus);
+        if (r < 0)
+                return r;
+
+        startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
+        if (!startup_cpuset_cpus)
+                return log_oom_debug();
+
+        r = serialize_item(f, "exec-cgroup-context-startup-allowed-cpus", startup_cpuset_cpus);
+        if (r < 0)
+                return r;
+
+        cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
+        if (!cpuset_mems)
+                return log_oom_debug();
+
+        r = serialize_item(f, "exec-cgroup-context-allowed-memory-nodes", cpuset_mems);
+        if (r < 0)
+                return r;
+
+        startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
+        if (!startup_cpuset_mems)
+                return log_oom_debug();
+
+        r = serialize_item(f, "exec-cgroup-context-startup-allowed-memory-nodes", startup_cpuset_mems);
+        if (r < 0)
+                return r;
+
+        if (c->io_weight != CGROUP_WEIGHT_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-io-weight", "%" PRIu64, c->io_weight);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_io_weight != CGROUP_WEIGHT_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-io-weight", "%" PRIu64, c->startup_io_weight);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-block-io-weight", "%" PRIu64, c->blockio_weight);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-block-io-weight", "%" PRIu64, c->startup_blockio_weight);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->default_memory_min > 0) {
+                r = serialize_item_format(f, "exec-cgroup-context-default-memory-min", "%" PRIu64, c->default_memory_min);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->default_memory_low > 0) {
+                r = serialize_item_format(f, "exec-cgroup-context-default-memory-low", "%" PRIu64, c->default_memory_low);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_min > 0) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-min", "%" PRIu64, c->memory_min);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_low > 0) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-low", "%" PRIu64, c->memory_low);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_memory_low > 0) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-memory-low", "%" PRIu64, c->startup_memory_low);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_high != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-high", "%" PRIu64, c->memory_high);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_memory_high != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-memory-high", "%" PRIu64, c->startup_memory_high);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_max != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-max", "%" PRIu64, c->memory_max);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_memory_max != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-memory-max", "%" PRIu64, c->startup_memory_max);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_swap_max != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-swap-max", "%" PRIu64, c->memory_swap_max);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_memory_swap_max != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-memory-swap-max", "%" PRIu64, c->startup_memory_swap_max);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_zswap_max != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-zswap-max", "%" PRIu64, c->memory_zswap_max);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->startup_memory_zswap_max != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-startup-memory-zswap-max", "%" PRIu64, c->startup_memory_zswap_max);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->memory_limit != CGROUP_LIMIT_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-memory-limit", "%" PRIu64, c->memory_limit);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->tasks_max.value != UINT64_MAX) {
+                r = serialize_item_format(f, "exec-cgroup-context-tasks-max-value", "%" PRIu64, c->tasks_max.value);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->tasks_max.scale > 0) {
+                r = serialize_item_format(f, "exec-cgroup-context-tasks-max-scale", "%" PRIu64, c->tasks_max.scale);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-default-memory-min-set", c->default_memory_min_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-default-memory-low-set", c->default_memory_low_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-default-startup-memory-low-set", c->default_startup_memory_low_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-memory-min-set", c->memory_min_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-memory-low-set", c->memory_low_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-low-set", c->startup_memory_low_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-high-set", c->startup_memory_high_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-max-set", c->startup_memory_max_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-swap-max-set", c->startup_memory_swap_max_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-zswap-max-set", c->startup_memory_zswap_max_set);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-device-policy", cgroup_device_policy_to_string(c->device_policy));
+        if (r < 0)
+                return r;
+
+        r = cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-disable-controllers", disable_controllers_str);
+        if (r < 0)
+                return r;
+
+        r = cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-delegate-controllers", delegate_controllers_str);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-delegate", c->delegate);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-managed-oom-swap", managed_oom_mode_to_string(c->moom_swap));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-managed-oom-memory-pressure", managed_oom_mode_to_string(c->moom_mem_pressure));
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-cgroup-context-managed-oom-memory-pressure-limit", "%" PRIu32, c->moom_mem_pressure_limit);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-managed-oom-preference", managed_oom_preference_to_string(c->moom_preference));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->memory_pressure_watch));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-cgroup-context-delegate-subgroup", c->delegate_subgroup);
+        if (r < 0)
+                return r;
+
+        if (c->memory_pressure_threshold_usec != USEC_INFINITY) {
+                r = serialize_usec(f, "exec-cgroup-context-memory-pressure-threshold-usec", c->memory_pressure_threshold_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        LIST_FOREACH(device_allow, a, c->device_allow) {
+                r = serialize_item_format(f, "exec-cgroup-context-device-allow", "%s %s",
+                                          a->path,
+                                          cgroup_device_permissions_to_string(a->permissions));
+                if (r < 0)
+                        return r;
+        }
+
+        LIST_FOREACH(device_weights, iw, c->io_device_weights) {
+                r = serialize_item_format(f, "exec-cgroup-context-io-device-weight", "%s %" PRIu64,
+                                          iw->path,
+                                          iw->weight);
+                if (r < 0)
+                        return r;
+        }
+
+        LIST_FOREACH(device_latencies, l, c->io_device_latencies) {
+                r = serialize_item_format(f, "exec-cgroup-context-io-device-latency-target-usec", "%s " USEC_FMT,
+                                          l->path,
+                                          l->target_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        LIST_FOREACH(device_limits, il, c->io_device_limits)
+                for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
+                        _cleanup_free_ char *key = NULL;
+
+                        if (il->limits[type] == cgroup_io_limit_defaults[type])
+                                continue;
+
+                        key = strjoin("exec-cgroup-context-io-device-limit-",
+                                        cgroup_io_limit_type_to_string(type));
+                        if (!key)
+                                return -ENOMEM;
+
+                        r = serialize_item_format(f, key, "%s %" PRIu64, il->path, il->limits[type]);
+                        if (r < 0)
+                                return r;
+                }
+
+        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+                r = serialize_item_format(f, "exec-cgroup-context-blockio-device-weight", "%s %" PRIu64,
+                                          w->path,
+                                          w->weight);
+                if (r < 0)
+                        return r;
+        }
+
+        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+                if (b->rbps != CGROUP_LIMIT_MAX) {
+                        r = serialize_item_format(f, "exec-cgroup-context-blockio-read-bandwidth", "%s %" PRIu64,
+                                                  b->path,
+                                                  b->rbps);
+                        if (r < 0)
+                                return r;
+                }
+                if (b->wbps != CGROUP_LIMIT_MAX) {
+                        r = serialize_item_format(f, "exec-cgroup-context-blockio-write-bandwidth", "%s %" PRIu64,
+                                                  b->path,
+                                                  b->wbps);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        SET_FOREACH(iaai, c->ip_address_allow) {
+                r = serialize_item(f,
+                                   "exec-cgroup-context-ip-address-allow",
+                                   IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+                if (r < 0)
+                        return r;
+        }
+        SET_FOREACH(iaai, c->ip_address_deny) {
+                r = serialize_item(f,
+                                   "exec-cgroup-context-ip-address-deny",
+                                   IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-ip-address-allow-reduced", c->ip_address_allow_reduced);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-cgroup-context-ip-address-deny-reduced", c->ip_address_deny_reduced);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-cgroup-context-ip-ingress-filter-path=", c->ip_filters_ingress);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-cgroup-context-ip-egress-filter-path=", c->ip_filters_egress);
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(programs, p, c->bpf_foreign_programs) {
+                r = serialize_item_format(f, "exec-cgroup-context-bpf-program", "%" PRIu32 " %s",
+                                          p->attach_type,
+                                          p->bpffs_path);
+                if (r < 0)
+                        return r;
+        }
+
+        LIST_FOREACH(socket_bind_items, bi, c->socket_bind_allow) {
+                fprintf(f, "exec-cgroup-context-socket-bind-allow=");
+                cgroup_context_dump_socket_bind_item(bi, f);
+                fputc('\n', f);
+        }
+
+        LIST_FOREACH(socket_bind_items, bi, c->socket_bind_deny) {
+                fprintf(f, "exec-cgroup-context-socket-bind-deny=");
+                cgroup_context_dump_socket_bind_item(bi, f);
+                fputc('\n', f);
+        }
+
+        SET_FOREACH(iface, c->restrict_network_interfaces) {
+                r = serialize_item(f, "exec-cgroup-context-restrict-network-interfaces", iface);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(
+                        f,
+                        "exec-cgroup-context-restrict-network-interfaces-is-allow-list",
+                        c->restrict_network_interfaces_is_allow_list);
+        if (r < 0)
+                return r;
+
+        fputc('\n', f); /* End marker */
+
+        return 0;
+}
+
+static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
+        int r;
+
+        assert(f);
+
+        if (!c)
+                return 0;
+
+        for (;;) {
+                _cleanup_free_ char *l = NULL;
+                const char *val;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                if ((val = startswith(l, "exec-cgroup-context-cpu-accounting="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->cpu_accounting = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-accounting="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->io_accounting = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-block-io-accounting="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->blockio_accounting = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-accounting="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->memory_accounting = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-tasks-accounting="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->tasks_accounting = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-accounting="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->ip_accounting = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-oom-group="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->memory_oom_group = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-cpu-weight="))) {
+                        r = safe_atou64(val, &c->cpu_weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-cpu-weight="))) {
+                        r = safe_atou64(val, &c->startup_cpu_weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-cpu-shares="))) {
+                        r = safe_atou64(val, &c->cpu_shares);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-cpu-shares="))) {
+                        r = safe_atou64(val, &c->startup_cpu_shares);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-cpu-quota-per-sec-usec="))) {
+                        r = deserialize_usec(val, &c->cpu_quota_per_sec_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-cpu-quota-period-usec="))) {
+                        r = deserialize_usec(val, &c->cpu_quota_period_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-allowed-cpus="))) {
+                        if (c->cpuset_cpus.set)
+                                return -EINVAL; /* duplicated */
+
+                        r = parse_cpu_set_full(
+                                        val,
+                                        &c->cpuset_cpus,
+                                        /* warn= */ false,
+                                        /* unit= */ NULL,
+                                        /* filename= */ NULL,
+                                        /* line= */ 0,
+                                        /* lvalue= */ NULL);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-allowed-cpus="))) {
+                        if (c->startup_cpuset_cpus.set)
+                                return -EINVAL; /* duplicated */
+
+                        r = parse_cpu_set_full(
+                                        val,
+                                        &c->startup_cpuset_cpus,
+                                        /* warn= */ false,
+                                        /* unit= */ NULL,
+                                        /* filename= */ NULL,
+                                        /* line= */ 0,
+                                        /* lvalue= */ NULL);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-allowed-memory-nodes="))) {
+                        if (c->cpuset_mems.set)
+                                return -EINVAL; /* duplicated */
+
+                        r = parse_cpu_set_full(
+                                        val,
+                                        &c->cpuset_mems,
+                                        /* warn= */ false,
+                                        /* unit= */ NULL,
+                                        /* filename= */ NULL,
+                                        /* line= */ 0,
+                                        /* lvalue= */ NULL);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-allowed-memory-nodes="))) {
+                        if (c->startup_cpuset_mems.set)
+                                return -EINVAL; /* duplicated */
+
+                        r = parse_cpu_set_full(
+                                        val,
+                                        &c->startup_cpuset_mems,
+                                        /* warn= */ false,
+                                        /* unit= */ NULL,
+                                        /* filename= */ NULL,
+                                        /* line= */ 0,
+                                        /* lvalue= */ NULL);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-weight="))) {
+                        r = safe_atou64(val, &c->io_weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-io-weight="))) {
+                        r = safe_atou64(val, &c->startup_io_weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-block-io-weight="))) {
+                        r = safe_atou64(val, &c->blockio_weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-block-io-weight="))) {
+                        r = safe_atou64(val, &c->startup_blockio_weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-default-memory-min="))) {
+                        r = safe_atou64(val, &c->default_memory_min);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-default-memory-low="))) {
+                        r = safe_atou64(val, &c->default_memory_low);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-min="))) {
+                        r = safe_atou64(val, &c->memory_min);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-low="))) {
+                        r = safe_atou64(val, &c->memory_low);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-low="))) {
+                        r = safe_atou64(val, &c->startup_memory_low);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-high="))) {
+                        r = safe_atou64(val, &c->memory_high);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-high="))) {
+                        r = safe_atou64(val, &c->startup_memory_high);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-max="))) {
+                        r = safe_atou64(val, &c->memory_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-max="))) {
+                        r = safe_atou64(val, &c->startup_memory_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-swap-max="))) {
+                        r = safe_atou64(val, &c->memory_swap_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-swap-max="))) {
+                        r = safe_atou64(val, &c->startup_memory_swap_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-zswap-max="))) {
+                        r = safe_atou64(val, &c->memory_zswap_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-zswap-max="))) {
+                        r = safe_atou64(val, &c->startup_memory_zswap_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-limit="))) {
+                        r = safe_atou64(val, &c->memory_limit);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-tasks-max-value="))) {
+                        r = safe_atou64(val, &c->tasks_max.value);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-tasks-max-scale="))) {
+                        r = safe_atou64(val, &c->tasks_max.scale);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-default-memory-min-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->default_memory_min_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-default-memory-low-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->default_memory_low_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-default-startup-memory-low-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->default_startup_memory_low_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-min-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->memory_min_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-low-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->memory_low_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-low-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->startup_memory_low_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-high-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->startup_memory_high_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-max-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->startup_memory_max_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-swap-max-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->startup_memory_swap_max_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-zswap-max-set="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->startup_memory_zswap_max_set = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-device-policy="))) {
+                        c->device_policy = cgroup_device_policy_from_string(val);
+                        if (c->device_policy < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-cgroup-context-disable-controllers="))) {
+                        r = cg_mask_from_string(val, &c->disable_controllers);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-delegate-controllers="))) {
+                        r = cg_mask_from_string(val, &c->delegate_controllers);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-delegate="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->delegate = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-swap="))) {
+                        c->moom_swap = managed_oom_mode_from_string(val);
+                        if (c->moom_swap < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure="))) {
+                        c->moom_mem_pressure = managed_oom_mode_from_string(val);
+                        if (c->moom_mem_pressure < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure-limit="))) {
+                        r = safe_atou32(val, &c->moom_mem_pressure_limit);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-preference="))) {
+                        c->moom_preference = managed_oom_preference_from_string(val);
+                        if (c->moom_preference < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) {
+                        c->memory_pressure_watch = cgroup_pressure_watch_from_string(val);
+                        if (c->memory_pressure_watch < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) {
+                        r = free_and_strdup(&c->delegate_subgroup, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-threshold-usec="))) {
+                        r = deserialize_usec(val, &c->memory_pressure_threshold_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) {
+                        _cleanup_free_ char *path = NULL, *rwm = NULL;
+                        CGroupDevicePermissions p;
+
+                        r = extract_many_words(&val, " ", 0, &path, &rwm, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                return -EINVAL;
+
+                        p = isempty(rwm) ? 0 : cgroup_device_permissions_from_string(rwm);
+                        if (p < 0)
+                                return p;
+
+                        r = cgroup_context_add_or_update_device_allow(c, path, p);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-device-weight="))) {
+                        _cleanup_free_ char *path = NULL, *weight = NULL;
+                        CGroupIODeviceWeight *a = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &path, &weight, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                return -EINVAL;
+
+                        LIST_FOREACH(device_weights, b, c->io_device_weights)
+                                if (path_equal(b->path, path)) {
+                                        a = b;
+                                        break;
+                                }
+
+                        if (!a) {
+                                a = new0(CGroupIODeviceWeight, 1);
+                                if (!a)
+                                        return log_oom_debug();
+
+                                a->path = TAKE_PTR(path);
+
+                                LIST_PREPEND(device_weights, c->io_device_weights, a);
+                        }
+
+                        r = safe_atou64(weight, &a->weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-device-latency-target-usec="))) {
+                        _cleanup_free_ char *path = NULL, *target = NULL;
+                        CGroupIODeviceLatency *a = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &path, &target, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                return -EINVAL;
+
+                        LIST_FOREACH(device_latencies, b, c->io_device_latencies)
+                                if (path_equal(b->path, path)) {
+                                        a = b;
+                                        break;
+                                }
+
+                        if (!a) {
+                                a = new0(CGroupIODeviceLatency, 1);
+                                if (!a)
+                                        return log_oom_debug();
+
+                                a->path = TAKE_PTR(path);
+
+                                LIST_PREPEND(device_latencies, c->io_device_latencies, a);
+                        }
+
+                        r = deserialize_usec(target, &a->target_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-device-limit-"))) {
+                        _cleanup_free_ char *type = NULL, *path = NULL, *limits = NULL;
+                        CGroupIODeviceLimit *limit = NULL;
+                        CGroupIOLimitType t;
+
+                        r = extract_many_words(&val, "= ", 0, &type, &path, &limits, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 3)
+                                return -EINVAL;
+
+                        t = cgroup_io_limit_type_from_string(type);
+                        if (t < 0)
+                                return t;
+
+                        LIST_FOREACH(device_limits, i, c->io_device_limits)
+                                if (path_equal(path, i->path)) {
+                                        limit = i;
+                                        break;
+                                }
+
+                        if (!limit) {
+                                limit = new0(CGroupIODeviceLimit, 1);
+                                if (!limit)
+                                        return log_oom_debug();
+
+                                limit->path = TAKE_PTR(path);
+                                for (CGroupIOLimitType i = 0; i < _CGROUP_IO_LIMIT_TYPE_MAX; i++)
+                                        limit->limits[i] = cgroup_io_limit_defaults[i];
+
+                                LIST_PREPEND(device_limits, c->io_device_limits, limit);
+                        }
+
+                        r = safe_atou64(limits, &limit->limits[t]);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-block-io-device-weight="))) {
+                        _cleanup_free_ char *path = NULL, *weight = NULL;
+                        CGroupBlockIODeviceWeight *a = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &path, &weight, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                return -EINVAL;
+
+                        a = new0(CGroupBlockIODeviceWeight, 1);
+                        if (!a)
+                                return log_oom_debug();
+
+                        a->path = TAKE_PTR(path);
+
+                        LIST_PREPEND(device_weights, c->blockio_device_weights, a);
+
+                        r = safe_atou64(weight, &a->weight);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-block-io-read-bandwidth="))) {
+                        _cleanup_free_ char *path = NULL, *bw = NULL;
+                        CGroupBlockIODeviceBandwidth *a = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &path, &bw, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                return -EINVAL;
+
+                        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+                                if (path_equal(b->path, path)) {
+                                        a = b;
+                                        break;
+                                }
+
+                        if (!a) {
+                                a = new0(CGroupBlockIODeviceBandwidth, 1);
+                                if (!a)
+                                        return log_oom_debug();
+
+                                a->path = TAKE_PTR(path);
+                                a->wbps = CGROUP_LIMIT_MAX;
+
+                                LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+                        }
+
+                        r = safe_atou64(bw, &a->rbps);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-block-io-write-bandwidth="))) {
+                        _cleanup_free_ char *path = NULL, *bw = NULL;
+                        CGroupBlockIODeviceBandwidth *a = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &path, &bw, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                return -EINVAL;
+
+                        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+                                if (path_equal(b->path, path)) {
+                                        a = b;
+                                        break;
+                                }
+
+                        if (!a) {
+                                a = new0(CGroupBlockIODeviceBandwidth, 1);
+                                if (!a)
+                                        return log_oom_debug();
+
+                                a->path = TAKE_PTR(path);
+                                a->rbps = CGROUP_LIMIT_MAX;
+
+                                LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+                        }
+
+                        r = safe_atou64(bw, &a->wbps);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-address-allow="))) {
+                        struct in_addr_prefix a;
+
+                        r = in_addr_prefix_from_string_auto(val, &a.family, &a.address, &a.prefixlen);
+                        if (r < 0)
+                                return r;
+
+                        r = in_addr_prefix_add(&c->ip_address_allow, &a);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-address-deny="))) {
+                        struct in_addr_prefix a;
+
+                        r = in_addr_prefix_from_string_auto(val, &a.family, &a.address, &a.prefixlen);
+                        if (r < 0)
+                                return r;
+
+                        r = in_addr_prefix_add(&c->ip_address_deny, &a);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-address-allow-reduced="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->ip_address_allow_reduced = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-address-deny-reduced="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->ip_address_deny_reduced = r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-ingress-filter-path="))) {
+                        r = deserialize_strv(val, &c->ip_filters_ingress);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-ip-egress-filter-path="))) {
+                        r = deserialize_strv(val, &c->ip_filters_egress);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-bpf-program="))) {
+                        _cleanup_free_ char *type = NULL, *path = NULL;
+                        uint32_t t;
+
+                        r = extract_many_words(&val, " ", 0, &type, &path, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                return -EINVAL;
+
+                        r = safe_atou32(type, &t);
+                        if (r < 0)
+                                return r;
+
+                        r = cgroup_context_add_bpf_foreign_program(c, t, path);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-socket-bind-allow="))) {
+                        CGroupSocketBindItem *item;
+                        uint16_t nr_ports, port_min;
+                        int af, ip_protocol;
+
+                        r = parse_socket_bind_item(val, &af, &ip_protocol, &nr_ports, &port_min);
+                        if (r < 0)
+                                return r;
+
+                        item = new(CGroupSocketBindItem, 1);
+                        if (!item)
+                                return log_oom_debug();
+                        *item = (CGroupSocketBindItem) {
+                                .address_family = af,
+                                .ip_protocol = ip_protocol,
+                                .nr_ports = nr_ports,
+                                .port_min = port_min,
+                        };
+
+                        LIST_PREPEND(socket_bind_items, c->socket_bind_allow, item);
+                } else if ((val = startswith(l, "exec-cgroup-context-socket-bind-deny="))) {
+                        CGroupSocketBindItem *item;
+                        uint16_t nr_ports, port_min;
+                        int af, ip_protocol;
+
+                        r = parse_socket_bind_item(val, &af, &ip_protocol, &nr_ports, &port_min);
+                        if (r < 0)
+                                return r;
+
+                        item = new(CGroupSocketBindItem, 1);
+                        if (!item)
+                                return log_oom_debug();
+                        *item = (CGroupSocketBindItem) {
+                                .address_family = af,
+                                .ip_protocol = ip_protocol,
+                                .nr_ports = nr_ports,
+                                .port_min = port_min,
+                        };
+
+                        LIST_PREPEND(socket_bind_items, c->socket_bind_deny, item);
+                } else if ((val = startswith(l, "exec-cgroup-context-restrict-network-interfaces="))) {
+                        r = set_ensure_allocated(&c->restrict_network_interfaces, &string_hash_ops);
+                        if (r < 0)
+                                return r;
+
+                        r = set_put_strdup(&c->restrict_network_interfaces, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-restrict-network-interfaces-is-allow-list="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->restrict_network_interfaces_is_allow_list = r;
+                } else
+                        log_warning("Failed to parse serialized line, ignoring: %s", l);
+        }
+
+        return 0;
+}
+
+static int exec_runtime_serialize(const ExecRuntime *rt, FILE *f, FDSet *fds) {
+        int r;
+
+        assert(f);
+        assert(fds);
+
+        /* Writes the state of rt to f as serialized items; fds is handed to the helpers that serialize
+         * file descriptors. Without a runtime only the terminating empty line is written. */
+        if (!rt)
+                goto finish;
+
+        if (rt->shared) {
+                r = serialize_item(f, "exec-runtime-id", rt->shared->id);
+                if (r < 0)
+                        return r;
+
+                r = serialize_item(f, "exec-runtime-tmp-dir", rt->shared->tmp_dir);
+                if (r < 0)
+                        return r;
+
+                r = serialize_item(f, "exec-runtime-var-tmp-dir", rt->shared->var_tmp_dir);
+                if (r < 0)
+                        return r;
+
+                /* A storage socket pair is only written out when both of its ends are non-negative. */
+                if (!(rt->shared->netns_storage_socket[0] < 0 || rt->shared->netns_storage_socket[1] < 0)) {
+                        r = serialize_fd_many(f, fds, "exec-runtime-netns-storage-socket", rt->shared->netns_storage_socket, 2);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!(rt->shared->ipcns_storage_socket[0] < 0 || rt->shared->ipcns_storage_socket[1] < 0)) {
+                        r = serialize_fd_many(f, fds, "exec-runtime-ipcns-storage-socket", rt->shared->ipcns_storage_socket, 2);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (rt->dynamic_creds) {
+                r = dynamic_user_serialize_one(rt->dynamic_creds->user, "exec-runtime-dynamic-creds-user", f, fds);
+                if (r < 0)
+                        return r;
+
+                /* If group and user are the same object, only note that the group is a copy of the user. */
+                if (rt->dynamic_creds->group && rt->dynamic_creds->group == rt->dynamic_creds->user)
+                        r = serialize_bool(f, "exec-runtime-dynamic-creds-group-copy", true);
+                else
+                        r = dynamic_user_serialize_one(rt->dynamic_creds->group, "exec-runtime-dynamic-creds-group", f, fds);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-runtime-ephemeral-copy", rt->ephemeral_copy);
+        if (r < 0)
+                return r;
+
+        if (!(rt->ephemeral_storage_socket[0] < 0 || rt->ephemeral_storage_socket[1] < 0)) {
+                r = serialize_fd_many(f, fds, "exec-runtime-ephemeral-storage-socket", rt->ephemeral_storage_socket, 2);
+                if (r < 0)
+                        return r;
+        }
+
+finish:
+        fputc('\n', f); /* End marker */
+
+        return 0;
+}
+
+static int exec_runtime_deserialize(ExecRuntime *rt, FILE *f, FDSet *fds) {
+        int r;
+
+        assert(rt);
+        assert(rt->shared);
+        assert(rt->dynamic_creds);
+        assert(f);
+        assert(fds);
+
+        /* Consumes serialized lines from f and stores every recognized item in rt. The loop only ends
+         * by returning: with an error, or with 0 once eof or the end marker has been reached. */
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *v;
+
+                r = deserialize_read_line(f, &line);
+                if (r <= 0) /* error, or eof/end marker (0) */
+                        return r;
+
+                if ((v = startswith(line, "exec-runtime-id="))) {
+                        r = free_and_strdup(&rt->shared->id, v);
+                        if (r < 0)
+                                return r;
+
+                } else if ((v = startswith(line, "exec-runtime-tmp-dir="))) {
+                        r = free_and_strdup(&rt->shared->tmp_dir, v);
+                        if (r < 0)
+                                return r;
+
+                } else if ((v = startswith(line, "exec-runtime-var-tmp-dir="))) {
+                        r = free_and_strdup(&rt->shared->var_tmp_dir, v);
+                        if (r < 0)
+                                return r;
+
+                } else if ((v = startswith(line, "exec-runtime-netns-storage-socket=")))
+                        /* A failure to pick up a socket pair is not treated as fatal, the line is skipped. */
+                        (void) deserialize_fd_many(fds, v, 2, rt->shared->netns_storage_socket);
+
+                else if ((v = startswith(line, "exec-runtime-ipcns-storage-socket=")))
+                        (void) deserialize_fd_many(fds, v, 2, rt->shared->ipcns_storage_socket);
+
+                else if ((v = startswith(line, "exec-runtime-dynamic-creds-user=")))
+                        (void) dynamic_user_deserialize_one(/* m= */ NULL, v, fds, &rt->dynamic_creds->user);
+
+                else if ((v = startswith(line, "exec-runtime-dynamic-creds-group=")))
+                        (void) dynamic_user_deserialize_one(/* m= */ NULL, v, fds, &rt->dynamic_creds->group);
+
+                else if ((v = startswith(line, "exec-runtime-dynamic-creds-group-copy="))) {
+                        r = parse_boolean(v);
+                        if (r < 0)
+                                return r;
+
+                        if (r > 0) {
+                                /* The group becomes a reference to the user, which hence must be set already. */
+                                if (!rt->dynamic_creds->user)
+                                        return -EINVAL;
+
+                                rt->dynamic_creds->group = dynamic_user_ref(rt->dynamic_creds->user);
+                        }
+
+                } else if ((v = startswith(line, "exec-runtime-ephemeral-copy="))) {
+                        r = free_and_strdup(&rt->ephemeral_copy, v);
+                        if (r < 0)
+                                return r;
+
+                } else if ((v = startswith(line, "exec-runtime-ephemeral-storage-socket=")))
+                        (void) deserialize_fd_many(fds, v, 2, rt->ephemeral_storage_socket);
+
+                else
+                        /* Unrecognized lines are not an error, they are only reported. */
+                        log_warning("Failed to parse serialized line, ignoring: %s", line);
+        }
+}
+
+static bool exec_parameters_is_idle_pipe_set(const ExecParameters *p) {
+        assert(p);
+
+        /* The idle pipe counts as set only if the array exists and all four entries are non-negative. */
+        if (!p->idle_pipe)
+                return false;
+        return !(p->idle_pipe[0] < 0 || p->idle_pipe[1] < 0 ||
+                 p->idle_pipe[2] < 0 || p->idle_pipe[3] < 0);
+}
+
+static int exec_parameters_serialize(const ExecParameters *p, const ExecContext *c, FILE *f, FDSet *fds) {
+        int r;
+        /* Writes the fields of p to f item by item. Returns 0, or the first negative helper result. */
+        assert(f);
+        assert(fds);
+
+        if (!p)
+                return 0; /* no parameters: nothing is written, not even the end marker */
+
+        r = serialize_item(f, "exec-parameters-runtime-scope", runtime_scope_to_string(p->runtime_scope));
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-parameters-environment", p->environment);
+        if (r < 0)
+                return r;
+
+        if (p->fds) { /* the two counts are written ahead of the fd array whose length they add up to */
+                if (p->n_socket_fds > 0) {
+                        r = serialize_item_format(f, "exec-parameters-n-socket-fds", "%zu", p->n_socket_fds);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (p->n_storage_fds > 0) {
+                        r = serialize_item_format(f, "exec-parameters-n-storage-fds", "%zu", p->n_storage_fds);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = serialize_fd_many(f, fds, "exec-parameters-fds", p->fds, p->n_socket_fds + p->n_storage_fds);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_strv(f, "exec-parameters-fd-names", p->fd_names);
+        if (r < 0)
+                return r;
+
+        if (p->flags != 0) { /* a zero flags value is left out */
+                r = serialize_item_format(f, "exec-parameters-flags", "%u", (unsigned) p->flags);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-parameters-selinux-context-net", p->selinux_context_net);
+        if (r < 0)
+                return r;
+
+        if (p->cgroup_supported != 0) { /* a zero mask is left out */
+                r = serialize_item_format(f, "exec-parameters-cgroup-supported", "%u", (unsigned) p->cgroup_supported);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-parameters-cgroup-path", p->cgroup_path);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-parameters-cgroup-id", "%" PRIu64, p->cgroup_id);
+        if (r < 0)
+                return r;
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+                _cleanup_free_ char *key = NULL;
+
+                key = strjoin("exec-parameters-prefix-directories-", exec_directory_type_to_string(dt));
+                if (!key)
+                        return log_oom_debug();
+
+                /* Always serialize, even an empty prefix, as this is a fixed array and we always expect
+                 * to have all elements (unless fuzzing is happening, hence the NULL check). */
+                r = serialize_item(f, key, strempty(p->prefix ? p->prefix[dt] : NULL));
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-parameters-received-credentials-directory", p->received_credentials_directory);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-parameters-received-encrypted-credentials-directory", p->received_encrypted_credentials_directory);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-parameters-confirm-spawn", p->confirm_spawn);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-parameters-shall-confirm-spawn", p->shall_confirm_spawn);
+        if (r < 0)
+                return r;
+
+        if (p->watchdog_usec > 0) { /* a zero watchdog value is left out */
+                r = serialize_usec(f, "exec-parameters-watchdog-usec", p->watchdog_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        if (exec_parameters_is_idle_pipe_set(p)) { /* written only if all four idle pipe fds are set */
+                r = serialize_fd_many(f, fds, "exec-parameters-idle-pipe", p->idle_pipe, 4);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_fd(f, fds, "exec-parameters-stdin-fd", p->stdin_fd);
+        if (r < 0)
+                return r;
+
+        r = serialize_fd(f, fds, "exec-parameters-stdout-fd", p->stdout_fd);
+        if (r < 0)
+                return r;
+
+        r = serialize_fd(f, fds, "exec-parameters-stderr-fd", p->stderr_fd);
+        if (r < 0)
+                return r;
+
+        r = serialize_fd(f, fds, "exec-parameters-exec-fd", p->exec_fd);
+        if (r < 0)
+                return r;
+
+        if (c && exec_context_restrict_filesystems_set(c)) { /* c may be NULL, then the map fd is skipped */
+                r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_outer_map_fd);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-parameters-notify-socket", p->notify_socket);
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(open_files, file, p->open_files) { /* one item per list entry, in list order */
+                _cleanup_free_ char *ofs = NULL;
+
+                r = open_file_to_string(file, &ofs);
+                if (r < 0)
+                        return r;
+
+                r = serialize_item(f, "exec-parameters-open-file", ofs);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-parameters-fallback-smack-process-label", p->fallback_smack_process_label);
+        if (r < 0)
+                return r;
+
+        r = serialize_fd(f, fds, "exec-parameters-user-lookup-fd", p->user_lookup_fd);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-parameters-files-env", p->files_env);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-parameters-unit-id", p->unit_id);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-parameters-invocation-id-string", p->invocation_id_string);
+        if (r < 0)
+                return r;
+
+        fputc('\n', f); /* End marker */
+
+        return 0;
+}
+
+static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
+        int r, nr_open;
+
+        assert(p);
+        assert(f);
+        assert(fds);
+
+        nr_open = read_nr_open();
+        if (nr_open < 3)
+                nr_open = HIGH_RLIMIT_NOFILE;
+        assert(nr_open > 0); /* For compilers/static analyzers */
+
+        for (;;) {
+                _cleanup_free_ char *l = NULL;
+                const char *val;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                if ((val = startswith(l, "exec-parameters-runtime-scope="))) {
+                        p->runtime_scope = runtime_scope_from_string(val);
+                        if (p->runtime_scope < 0)
+                                return p->runtime_scope;
+                } else if ((val = startswith(l, "exec-parameters-environment="))) {
+                        r = deserialize_strv(val, &p->environment);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-n-socket-fds="))) {
+                        if (p->fds)
+                                return -EINVAL; /* Already received */
+
+                        r = safe_atozu(val, &p->n_socket_fds);
+                        if (r < 0)
+                                return r;
+
+                        if (p->n_socket_fds > (size_t) nr_open)
+                                return -EINVAL; /* too many, someone is playing games with us */
+                } else if ((val = startswith(l, "exec-parameters-n-storage-fds="))) {
+                        if (p->fds)
+                                return -EINVAL; /* Already received */
+
+                        r = safe_atozu(val, &p->n_storage_fds);
+                        if (r < 0)
+                                return r;
+
+                        if (p->n_storage_fds > (size_t) nr_open)
+                                return -EINVAL; /* too many, someone is playing games with us */
+                } else if ((val = startswith(l, "exec-parameters-fds="))) {
+                        if (p->n_socket_fds + p->n_storage_fds == 0)
+                                return log_warning_errno(
+                                                SYNTHETIC_ERRNO(EINVAL),
+                                                "Got exec-parameters-fds= without "
+                                                "prior exec-parameters-n-socket-fds= or exec-parameters-n-storage-fds=");
+                        if (p->n_socket_fds + p->n_storage_fds > (size_t) nr_open)
+                                return -EINVAL; /* too many, someone is playing games with us */
+
+                        if (p->fds)
+                                return -EINVAL; /* duplicated */
+
+                        p->fds = new(int, p->n_socket_fds + p->n_storage_fds);
+                        if (!p->fds)
+                                return log_oom_debug();
+
+                        /* Ensure we don't leave any FD uninitialized on error, it makes the fuzzer sad */
+                        for (size_t i = 0; i < p->n_socket_fds + p->n_storage_fds; ++i)
+                                p->fds[i] = -EBADF;
+
+                        r = deserialize_fd_many(fds, val, p->n_socket_fds + p->n_storage_fds, p->fds);
+                        if (r < 0)
+                                continue;
+
+                } else if ((val = startswith(l, "exec-parameters-fd-names="))) {
+                        r = deserialize_strv(val, &p->fd_names);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-flags="))) {
+                        unsigned flags;
+
+                        r = safe_atou(val, &flags);
+                        if (r < 0)
+                                return r;
+                        p->flags = flags;
+                } else if ((val = startswith(l, "exec-parameters-selinux-context-net="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+
+                        p->selinux_context_net = r;
+                } else if ((val = startswith(l, "exec-parameters-cgroup-supported="))) {
+                        unsigned cgroup_supported;
+
+                        r = safe_atou(val, &cgroup_supported);
+                        if (r < 0)
+                                return r;
+                        p->cgroup_supported = cgroup_supported;
+                } else if ((val = startswith(l, "exec-parameters-cgroup-path="))) {
+                        r = free_and_strdup(&p->cgroup_path, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-cgroup-id="))) {
+                        r = safe_atou64(val, &p->cgroup_id);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-prefix-directories-"))) {
+                        _cleanup_free_ char *type = NULL, *prefix = NULL;
+                        ExecDirectoryType dt;
+
+                        r = extract_many_words(&val, "= ", 0, &type, &prefix, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                return -EINVAL;
+
+                        dt = exec_directory_type_from_string(type);
+                        if (dt < 0)
+                                return -EINVAL;
+
+                        if (!p->prefix) {
+                                p->prefix = new0(char*, _EXEC_DIRECTORY_TYPE_MAX+1);
+                                if (!p->prefix)
+                                        return log_oom_debug();
+                        }
+
+                        if (isempty(prefix))
+                                p->prefix[dt] = mfree(p->prefix[dt]);
+                        else
+                                free_and_replace(p->prefix[dt], prefix);
+                } else if ((val = startswith(l, "exec-parameters-received-credentials-directory="))) {
+                        r = free_and_strdup(&p->received_credentials_directory, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-received-encrypted-credentials-directory="))) {
+                        r = free_and_strdup(&p->received_encrypted_credentials_directory, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-confirm-spawn="))) {
+                        r = free_and_strdup(&p->confirm_spawn, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-shall-confirm-spawn="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+
+                        p->shall_confirm_spawn = r;
+                } else if ((val = startswith(l, "exec-parameters-watchdog-usec="))) {
+                        r = deserialize_usec(val, &p->watchdog_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-idle-pipe="))) {
+                        if (p->idle_pipe)
+                                return -EINVAL; /* duplicated */
+
+                        p->idle_pipe = new(int, 4);
+                        if (!p->idle_pipe)
+                                return log_oom_debug();
+
+                        p->idle_pipe[0] = p->idle_pipe[1] = p->idle_pipe[2] = p->idle_pipe[3] = -EBADF;
+
+                        r = deserialize_fd_many(fds, val, 4, p->idle_pipe);
+                        if (r < 0)
+                                continue;
+
+                } else if ((val = startswith(l, "exec-parameters-stdin-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd < 0)
+                                continue;
+
+                        p->stdin_fd = fd;
+
+                } else if ((val = startswith(l, "exec-parameters-stdout-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd < 0)
+                                continue;
+
+                        p->stdout_fd = fd;
+
+                } else if ((val = startswith(l, "exec-parameters-stderr-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd < 0)
+                                continue;
+
+                        p->stderr_fd = fd;
+                } else if ((val = startswith(l, "exec-parameters-exec-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd < 0)
+                                continue;
+
+                        p->exec_fd = fd;
+                } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd < 0)
+                                continue;
+
+                        p->bpf_outer_map_fd = fd;
+                } else if ((val = startswith(l, "exec-parameters-notify-socket="))) {
+                        r = free_and_strdup(&p->notify_socket, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-open-file="))) {
+                        OpenFile *of = NULL;
+
+                        r = open_file_parse(val, &of);
+                        if (r < 0)
+                                return r;
+
+                        LIST_APPEND(open_files, p->open_files, of);
+                } else if ((val = startswith(l, "exec-parameters-fallback-smack-process-label="))) {
+                        r = free_and_strdup(&p->fallback_smack_process_label, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-user-lookup-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd < 0)
+                                continue;
+
+                        p->user_lookup_fd = fd;
+                } else if ((val = startswith(l, "exec-parameters-files-env="))) {
+                        r = deserialize_strv(val, &p->files_env);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-unit-id="))) {
+                        r = free_and_strdup(&p->unit_id, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-parameters-invocation-id-string="))) {
+                        if (strlen(val) > SD_ID128_STRING_MAX - 1)
+                                return -EINVAL;
+
+                        r = sd_id128_from_string(val, &p->invocation_id);
+                        if (r < 0)
+                                return r;
+
+                        sd_id128_to_string(p->invocation_id, p->invocation_id_string);
+                } else
+                        log_warning("Failed to parse serialized line, ignoring: %s", l);
+        }
+
+        /* Bail out if we got exec-parameters-n-{socket/storage}-fds= but no corresponding
+         * exec-parameters-fds= */
+        if (p->n_socket_fds + p->n_storage_fds > 0 && !p->fds)
+                return -EINVAL;
+
+        return 0;
+}
+
+static int serialize_std_out_err(const ExecContext *c, FILE *f, int fileno) {
+        /* Emits the companion setting of the stdout/stderr mode (fd name or file path), if it has one. */
+        const char *stream, *suffix, *payload;
+        bool is_stdout;
+
+        assert(c);
+        assert(f);
+        assert(IN_SET(fileno, STDOUT_FILENO, STDERR_FILENO));
+
+        is_stdout = fileno == STDOUT_FILENO;
+        stream = is_stdout ? "output" : "error";
+
+        /* Pick key suffix and value first; modes without a companion setting write nothing. */
+        switch (is_stdout ? c->std_output : c->std_error) {
+        case EXEC_OUTPUT_NAMED_FD:
+                suffix = "-fd-name";
+                payload = c->stdio_fdname[fileno];
+                break;
+
+        case EXEC_OUTPUT_FILE:
+                suffix = "-file";
+                payload = c->stdio_file[fileno];
+                break;
+
+        case EXEC_OUTPUT_FILE_APPEND:
+                suffix = "-file-append";
+                payload = c->stdio_file[fileno];
+                break;
+
+        case EXEC_OUTPUT_FILE_TRUNCATE:
+                suffix = "-file-truncate";
+                payload = c->stdio_file[fileno];
+                break;
+
+        default:
+                return 0;
+        }
+
+        char *name = strjoina("exec-context-std-", stream, suffix);
+        return serialize_item(f, name, payload);
+}
+
+static int exec_context_serialize(const ExecContext *c, FILE *f) {
+        int r;
+
+        assert(f);
+
+        if (!c)
+                return 0;
+
+        r = serialize_strv(f, "exec-context-environment", c->environment);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-environment-files", c->environment_files);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-pass-environment", c->pass_environment);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-unset-environment", c->unset_environment);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-working-directory", c->working_directory);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-root-directory", c->root_directory);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-root-image", c->root_image);
+        if (r < 0)
+                return r;
+
+        if (c->root_image_options) {
+                _cleanup_free_ char *options = NULL;
+
+                LIST_FOREACH(mount_options, o, c->root_image_options) {
+                        if (isempty(o->options))
+                                continue;
+
+                        _cleanup_free_ char *escaped = NULL;
+                        escaped = shell_escape(o->options, ":");
+                        if (!escaped)
+                                return log_oom_debug();
+
+                        if (!strextend(&options,
+                                        " ",
+                                        partition_designator_to_string(o->partition_designator),
+                                               ":",
+                                               escaped))
+                                        return log_oom_debug();
+                }
+
+                r = serialize_item(f, "exec-context-root-image-options", options);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-context-root-verity", c->root_verity);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-root-hash-path", c->root_hash_path);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-root-hash-sig-path", c->root_hash_sig_path);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_hexmem(f, "exec-context-root-hash", c->root_hash, c->root_hash_size);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_base64mem(f, "exec-context-root-hash-sig", c->root_hash_sig, c->root_hash_sig_size);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-root-ephemeral", c->root_ephemeral);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-context-umask", "%04o", c->umask);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-non-blocking", c->non_blocking);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_tristate(f, "exec-context-private-mounts", c->private_mounts);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_tristate(f, "exec-context-memory-ksm", c->memory_ksm);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-private-tmp", c->private_tmp);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-private-devices", c->private_devices);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-protect-kernel-tunables", c->protect_kernel_tunables);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-protect-kernel-modules", c->protect_kernel_modules);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-protect-kernel-logs", c->protect_kernel_logs);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-protect-clock", c->protect_clock);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-protect-control-groups", c->protect_control_groups);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-private-network", c->private_network);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-private-users", c->private_users);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-private-ipc", c->private_ipc);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-protect-home", protect_home_to_string(c->protect_home));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-protect-system", protect_system_to_string(c->protect_system));
+        if (r < 0)
+                return r;
+
+        if (c->mount_apivfs_set) {
+                r = serialize_bool(f, "exec-context-mount-api-vfs", c->mount_apivfs);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-context-same-pgrp", c->same_pgrp);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-cpu-sched-reset-on-fork", c->cpu_sched_reset_on_fork);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool(f, "exec-context-ignore-sigpipe", c->ignore_sigpipe);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-memory-deny-write-execute", c->memory_deny_write_execute);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-restrict-realtime", c->restrict_realtime);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-restrict-suid-sgid", c->restrict_suid_sgid);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-keyring-mode", exec_keyring_mode_to_string(c->keyring_mode));
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-protect-hostname", c->protect_hostname);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-protect-proc", protect_proc_to_string(c->protect_proc));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-proc-subset", proc_subset_to_string(c->proc_subset));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
+        if (r < 0)
+                return r;
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+                _cleanup_free_ char *key = NULL, *value = NULL;
+
+                key = strjoin("exec-context-directories-", exec_directory_type_to_string(dt));
+                if (!key)
+                        return log_oom_debug();
+
+                if (asprintf(&value, "%04o", c->directories[dt].mode) < 0)
+                        return log_oom_debug();
+
+                FOREACH_ARRAY(i, c->directories[dt].items, c->directories[dt].n_items) {
+                        _cleanup_free_ char *path_escaped = NULL;
+
+                        path_escaped = shell_escape(i->path, ":" WHITESPACE);
+                        if (!path_escaped)
+                                return log_oom_debug();
+
+                        if (!strextend(&value, " ", path_escaped))
+                                return log_oom_debug();
+
+                        if (!strextend(&value, ":", yes_no(i->only_create)))
+                                return log_oom_debug();
+
+                        STRV_FOREACH(d, i->symlinks) {
+                                _cleanup_free_ char *link_escaped = NULL;
+
+                                link_escaped = shell_escape(*d, ":" WHITESPACE);
+                                if (!link_escaped)
+                                        return log_oom_debug();
+
+                                if (!strextend(&value, ":", link_escaped))
+                                        return log_oom_debug();
+                        }
+                }
+
+                r = serialize_item(f, key, value);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_usec(f, "exec-context-timeout-clean-usec", c->timeout_clean_usec);
+        if (r < 0)
+                return r;
+
+        if (c->nice_set) {
+                r = serialize_item_format(f, "exec-context-nice", "%i", c->nice);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-context-working-directory-missing-ok", c->working_directory_missing_ok);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-working-directory-home", c->working_directory_home);
+        if (r < 0)
+                return r;
+
+        if (c->oom_score_adjust_set) {
+                r = serialize_item_format(f, "exec-context-oom-score-adjust", "%i", c->oom_score_adjust);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->coredump_filter_set) {
+                r = serialize_item_format(f, "exec-context-coredump-filter", "%"PRIx64, c->coredump_filter);
+                if (r < 0)
+                        return r;
+        }
+
+        for (unsigned i = 0; i < RLIM_NLIMITS; i++) {
+                _cleanup_free_ char *key = NULL, *limit = NULL;
+
+                if (!c->rlimit[i])
+                        continue;
+
+                key = strjoin("exec-context-limit-", rlimit_to_string(i));
+                if (!key)
+                        return log_oom_debug();
+
+                r = rlimit_format(c->rlimit[i], &limit);
+                if (r < 0)
+                        return r;
+
+                r = serialize_item(f, key, limit);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->ioprio_set) {
+                r = serialize_item_format(f, "exec-context-ioprio", "%d", c->ioprio);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->cpu_sched_set) {
+                _cleanup_free_ char *policy_str = NULL;
+
+                r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
+                if (r < 0)
+                        return r;
+
+                r = serialize_item(f, "exec-context-cpu-scheduling-policy", policy_str);
+                if (r < 0)
+                        return r;
+
+                r = serialize_item_format(f, "exec-context-cpu-scheduling-priority", "%i", c->cpu_sched_priority);
+                if (r < 0)
+                        return r;
+
+                r = serialize_bool_elide(f, "exec-context-cpu-scheduling-reset-on-fork", c->cpu_sched_reset_on_fork);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->cpu_set.set) {
+                _cleanup_free_ char *affinity = NULL;
+
+                affinity = cpu_set_to_range_string(&c->cpu_set);
+                if (!affinity)
+                        return log_oom_debug();
+
+                r = serialize_item(f, "exec-context-cpu-affinity", affinity);
+                if (r < 0)
+                        return r;
+        }
+
+        if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+                _cleanup_free_ char *nodes = NULL;
+
+                nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+                if (!nodes)
+                        return log_oom_debug();
+
+                if (nodes) {
+                        r = serialize_item(f, "exec-context-numa-mask", nodes);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = serialize_item_format(f, "exec-context-numa-policy", "%d", c->numa_policy.type);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-context-cpu-affinity-from-numa", c->cpu_affinity_from_numa);
+        if (r < 0)
+                return r;
+
+        if (c->timer_slack_nsec != NSEC_INFINITY) {
+                r = serialize_item_format(f, "exec-context-timer-slack-nsec", NSEC_FMT, c->timer_slack_nsec);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-context-std-input", exec_input_to_string(c->std_input));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-std-output", exec_output_to_string(c->std_output));
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-std-error", exec_output_to_string(c->std_error));
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-stdio-as-fds", c->stdio_as_fds);
+        if (r < 0)
+                return r;
+
+        switch (c->std_input) {
+        case EXEC_INPUT_NAMED_FD:
+                r = serialize_item(f, "exec-context-std-input-fd-name", c->stdio_fdname[STDIN_FILENO]);
+                if (r < 0)
+                        return r;
+                break;
+
+        case EXEC_INPUT_FILE:
+                r = serialize_item(f, "exec-context-std-input-file", c->stdio_file[STDIN_FILENO]);
+                if (r < 0)
+                        return r;
+                break;
+
+        default:
+                break;
+        }
+
+        r = serialize_std_out_err(c, f, STDOUT_FILENO);
+        if (r < 0)
+                return r;
+
+        r = serialize_std_out_err(c, f, STDERR_FILENO);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_base64mem(f, "exec-context-stdin-data", c->stdin_data, c->stdin_data_size);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-tty-path", c->tty_path);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-tty-reset", c->tty_reset);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-tty-vhangup", c->tty_vhangup);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-tty-vt-disallocate", c->tty_vt_disallocate);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-context-tty-rows", "%u", c->tty_rows);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-context-tty-columns", "%u", c->tty_cols);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-context-syslog-priority", "%i", c->syslog_priority);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool(f, "exec-context-syslog-level-prefix", c->syslog_level_prefix);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-syslog-identifier", c->syslog_identifier);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-context-log-level-max", "%d", c->log_level_max);
+        if (r < 0)
+                return r;
+
+        if (c->log_ratelimit_interval_usec > 0) {
+                r = serialize_usec(f, "exec-context-log-ratelimit-interval-usec", c->log_ratelimit_interval_usec);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->log_ratelimit_burst > 0) {
+                r = serialize_item_format(f, "exec-context-log-ratelimit-burst", "%u", c->log_ratelimit_burst);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_string_set(f, "exec-context-log-filter-allowed-patterns", c->log_filter_allowed_patterns);
+        if (r < 0)
+                return r;
+
+        r = serialize_string_set(f, "exec-context-log-filter-denied-patterns", c->log_filter_denied_patterns);
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) {
+                r = serialize_item(f, "exec-context-log-extra-fields", field->iov_base);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-context-log-namespace", c->log_namespace);
+        if (r < 0)
+                return r;
+
+        if (c->secure_bits != 0) {
+                r = serialize_item_format(f, "exec-context-secure-bits", "%d", c->secure_bits);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->capability_bounding_set != CAP_MASK_UNSET) {
+                r = serialize_item_format(f, "exec-context-capability-bounding-set", "%" PRIu64, c->capability_bounding_set);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->capability_ambient_set != 0) {
+                r = serialize_item_format(f, "exec-context-capability-ambient-set", "%" PRIu64, c->capability_ambient_set);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->user) {
+                r = serialize_item(f, "exec-context-user", c->user);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-context-group", c->group);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-dynamic-user", c->dynamic_user);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-supplementary-groups", c->supplementary_groups);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_tristate(f, "exec-context-set-login-environment", c->set_login_environment);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-pam-name", c->pam_name);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-read-write-paths", c->read_write_paths);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-read-only-paths", c->read_only_paths);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-inaccessible-paths", c->inaccessible_paths);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-exec-paths", c->exec_paths);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-no-exec-paths", c->no_exec_paths);
+        if (r < 0)
+                return r;
+
+        r = serialize_strv(f, "exec-context-exec-search-path", c->exec_search_path);
+        if (r < 0)
+                return r;
+
+        r = serialize_item_format(f, "exec-context-mount-propagation-flag", "%lu", c->mount_propagation_flag);
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(mount, c->bind_mounts, c->n_bind_mounts) {
+                _cleanup_free_ char *src_escaped = NULL, *dst_escaped = NULL;
+
+                src_escaped = shell_escape(mount->source, ":" WHITESPACE);
+                if (!src_escaped)
+                        return log_oom_debug();
+
+                dst_escaped = shell_escape(mount->destination, ":" WHITESPACE);
+                if (!dst_escaped)
+                        return log_oom_debug();
+
+                r = serialize_item_format(f,
+                                          mount->read_only ? "exec-context-bind-read-only-path" : "exec-context-bind-path",
+                                          "%s%s:%s:%s",
+                                          mount->ignore_enoent ? "-" : "",
+                                          src_escaped,
+                                          dst_escaped,
+                                          mount->recursive ? "rbind" : "norbind");
+                if (r < 0)
+                        return r;
+        }
+
+        FOREACH_ARRAY(tmpfs, c->temporary_filesystems, c->n_temporary_filesystems) {
+                _cleanup_free_ char *escaped = NULL;
+
+                if (!isempty(tmpfs->options)) {
+                        escaped = shell_escape(tmpfs->options, ":");
+                        if (!escaped)
+                                return log_oom_debug();
+                }
+
+                r = serialize_item_format(f, "exec-context-temporary-filesystems", "%s%s%s",
+                                          tmpfs->path,
+                                          isempty(escaped) ? "" : ":",
+                                          strempty(escaped));
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_item(f, "exec-context-utmp-id", c->utmp_id);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-utmp-mode", exec_utmp_mode_to_string(c->utmp_mode));
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-no-new-privileges", c->no_new_privileges);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-selinux-context-ignore", c->selinux_context_ignore);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-apparmor-profile-ignore", c->apparmor_profile_ignore);
+        if (r < 0)
+                return r;
+
+        r = serialize_bool_elide(f, "exec-context-smack-process-label-ignore", c->smack_process_label_ignore);
+        if (r < 0)
+                return r;
+
+        if (c->selinux_context) {
+                r = serialize_item_format(f, "exec-context-selinux-context",
+                                          "%s%s",
+                                          c->selinux_context_ignore ? "-" : "",
+                                          c->selinux_context);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->apparmor_profile) {
+                r = serialize_item_format(f, "exec-context-apparmor-profile",
+                                          "%s%s",
+                                          c->apparmor_profile_ignore ? "-" : "",
+                                          c->apparmor_profile);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->smack_process_label) {
+                r = serialize_item_format(f, "exec-context-smack-process-label",
+                                          "%s%s",
+                                          c->smack_process_label_ignore ? "-" : "",
+                                          c->smack_process_label);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->personality != PERSONALITY_INVALID) {
+                r = serialize_item(f, "exec-context-personality", personality_to_string(c->personality));
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-context-lock-personality", c->lock_personality);
+        if (r < 0)
+                return r;
+
+#if HAVE_SECCOMP
+        if (!hashmap_isempty(c->syscall_filter)) {
+                void *errno_num, *id;
+                HASHMAP_FOREACH_KEY(errno_num, id, c->syscall_filter) {
+                        r = serialize_item_format(f, "exec-context-syscall-filter", "%d %d", PTR_TO_INT(id) - 1, PTR_TO_INT(errno_num));
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (!set_isempty(c->syscall_archs)) {
+                void *id;
+                SET_FOREACH(id, c->syscall_archs) {
+                        r = serialize_item_format(f, "exec-context-syscall-archs", "%u", PTR_TO_UINT(id) - 1);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (c->syscall_errno > 0) {
+                r = serialize_item_format(f, "exec-context-syscall-errno", "%d", c->syscall_errno);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_bool_elide(f, "exec-context-syscall-allow-list", c->syscall_allow_list);
+        if (r < 0)
+                return r;
+
+        if (!hashmap_isempty(c->syscall_log)) {
+                void *errno_num, *id;
+                HASHMAP_FOREACH_KEY(errno_num, id, c->syscall_log) {
+                        r = serialize_item_format(f, "exec-context-syscall-log", "%d %d", PTR_TO_INT(id) - 1, PTR_TO_INT(errno_num));
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        r = serialize_bool_elide(f, "exec-context-syscall-log-allow-list", c->syscall_log_allow_list);
+        if (r < 0)
+                return r;
+#endif
+
+        if (c->restrict_namespaces != NAMESPACE_FLAGS_INITIAL) {
+                r = serialize_item_format(f, "exec-context-restrict-namespaces", "%lu", c->restrict_namespaces);
+                if (r < 0)
+                        return r;
+        }
+
+#if HAVE_LIBBPF
+        if (exec_context_restrict_filesystems_set(c)) {
+                char *fs;
+                SET_FOREACH(fs, c->restrict_filesystems) {
+                        r = serialize_item(f, "exec-context-restrict-filesystems", fs);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        r = serialize_bool_elide(f, "exec-context-restrict-filesystems-allow-list", c->restrict_filesystems_allow_list);
+        if (r < 0)
+                return r;
+#endif
+
+        if (!set_isempty(c->address_families)) {
+                void *afp;
+
+                SET_FOREACH(afp, c->address_families) {
+                        int af = PTR_TO_INT(afp);
+
+                        if (af <= 0 || af >= af_max())
+                                continue;
+
+                        r = serialize_item_format(f, "exec-context-address-families", "%d", af);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        r = serialize_bool_elide(f, "exec-context-address-families-allow-list", c->address_families_allow_list);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-network-namespace-path", c->network_namespace_path);
+        if (r < 0)
+                return r;
+
+        r = serialize_item(f, "exec-context-ipc-namespace-path", c->ipc_namespace_path);
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(mount, c->mount_images, c->n_mount_images) {
+                _cleanup_free_ char *s = NULL, *source_escaped = NULL, *dest_escaped = NULL;
+
+                source_escaped = shell_escape(mount->source, WHITESPACE);
+                if (!source_escaped)
+                        return log_oom_debug();
+
+                dest_escaped = shell_escape(mount->destination, WHITESPACE);
+                if (!dest_escaped)
+                        return log_oom_debug();
+
+                s = strjoin(mount->ignore_enoent ? "-" : "",
+                            source_escaped,
+                            " ",
+                            dest_escaped);
+                if (!s)
+                        return log_oom_debug();
+
+                LIST_FOREACH(mount_options, o, mount->mount_options) {
+                        _cleanup_free_ char *escaped = NULL;
+
+                        if (isempty(o->options))
+                                continue;
+
+                        escaped = shell_escape(o->options, ":");
+                        if (!escaped)
+                                return log_oom_debug();
+
+                        if (!strextend(&s,
+                                       " ",
+                                       partition_designator_to_string(o->partition_designator),
+                                       ":",
+                                       escaped))
+                                return log_oom_debug();
+                }
+
+                r = serialize_item(f, "exec-context-mount-image", s);
+                if (r < 0)
+                        return r;
+        }
+
+        FOREACH_ARRAY(mount, c->extension_images, c->n_extension_images) {
+                _cleanup_free_ char *s = NULL, *source_escaped = NULL;
+
+                source_escaped = shell_escape(mount->source, ":" WHITESPACE);
+                if (!source_escaped)
+                        return log_oom_debug();
+
+                s = strjoin(mount->ignore_enoent ? "-" : "",
+                            source_escaped);
+                if (!s)
+                        return log_oom_debug();
+
+                LIST_FOREACH(mount_options, o, mount->mount_options) {
+                        _cleanup_free_ char *escaped = NULL;
+
+                        if (isempty(o->options))
+                                continue;
+
+                        escaped = shell_escape(o->options, ":");
+                        if (!escaped)
+                                return log_oom_debug();
+
+                        if (!strextend(&s,
+                                       " ",
+                                       partition_designator_to_string(o->partition_designator),
+                                       ":",
+                                       escaped))
+                                return log_oom_debug();
+                }
+
+                r = serialize_item(f, "exec-context-extension-image", s);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_strv(f, "exec-context-extension-directories", c->extension_directories);
+        if (r < 0)
+                return r;
+
+        ExecSetCredential *sc;
+        HASHMAP_FOREACH(sc, c->set_credentials) {
+                _cleanup_free_ char *data = NULL;
+
+                if (base64mem(sc->data, sc->size, &data) < 0)
+                        return log_oom_debug();
+
+                r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, yes_no(sc->encrypted), data);
+                if (r < 0)
+                        return r;
+        }
+
+        ExecLoadCredential *lc;
+        HASHMAP_FOREACH(lc, c->load_credentials) {
+                r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, yes_no(lc->encrypted), lc->path);
+                if (r < 0)
+                        return r;
+        }
+
+        if (!set_isempty(c->import_credentials)) {
+                char *ic;
+                SET_FOREACH(ic, c->import_credentials) {
+                        r = serialize_item(f, "exec-context-import-credentials", ic);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        r = serialize_image_policy(f, "exec-context-root-image-policy", c->root_image_policy);
+        if (r < 0)
+                return r;
+
+        r = serialize_image_policy(f, "exec-context-mount-image-policy", c->mount_image_policy);
+        if (r < 0)
+                return r;
+
+        r = serialize_image_policy(f, "exec-context-extension-image-policy", c->extension_image_policy);
+        if (r < 0)
+                return r;
+
+        fputc('\n', f); /* End marker */
+
+        return 0;
+}
+
+static int exec_context_deserialize(ExecContext *c, FILE *f) {
+        int r;
+
+        assert(f);
+
+        if (!c)
+                return 0;
+
+        for (;;) {
+                _cleanup_free_ char *l = NULL;
+                const char *val;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                if ((val = startswith(l, "exec-context-environment="))) {
+                        r = deserialize_strv(val, &c->environment);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-environment-files="))) {
+                        r = deserialize_strv(val, &c->environment_files);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-pass-environment="))) {
+                        r = deserialize_strv(val, &c->pass_environment);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-unset-environment="))) {
+                        r = deserialize_strv(val, &c->unset_environment);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-working-directory="))) {
+                        r = free_and_strdup(&c->working_directory, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-directory="))) {
+                        r = free_and_strdup(&c->root_directory, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-image="))) {
+                        r = free_and_strdup(&c->root_image, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-image-options="))) {
+                        for (;;) {
+                                _cleanup_free_ char *word = NULL, *mount_options = NULL, *partition = NULL;
+                                PartitionDesignator partition_designator;
+                                MountOptions *o = NULL;
+                                const char *p;
+
+                                r = extract_first_word(&val, &word, NULL, 0);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        break;
+
+                                p = word;
+                                r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        continue;
+
+                                partition_designator = partition_designator_from_string(partition);
+                                if (partition_designator < 0)
+                                        return -EINVAL;
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom_debug();
+                                *o = (MountOptions) {
+                                        .partition_designator = partition_designator,
+                                        .options = TAKE_PTR(mount_options),
+                                };
+                                LIST_APPEND(mount_options, c->root_image_options, o);
+                        }
+                } else if ((val = startswith(l, "exec-context-root-verity="))) {
+                        r = free_and_strdup(&c->root_verity, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-hash-path="))) {
+                        r = free_and_strdup(&c->root_hash_path, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-hash-sig-path="))) {
+                        r = free_and_strdup(&c->root_hash_sig_path, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-hash="))) {
+                        c->root_hash = mfree(c->root_hash);
+                        r = unhexmem(val, strlen(val), &c->root_hash, &c->root_hash_size);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-hash-sig="))) {
+                        c->root_hash_sig = mfree(c->root_hash_sig);
+                        r= unbase64mem(val, strlen(val), &c->root_hash_sig, &c->root_hash_sig_size);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-ephemeral="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->root_ephemeral = r;
+                } else if ((val = startswith(l, "exec-context-umask="))) {
+                        r = parse_mode(val, &c->umask);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-private-non-blocking="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->non_blocking = r;
+                } else if ((val = startswith(l, "exec-context-private-mounts="))) {
+                        r = safe_atoi(val, &c->private_mounts);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-memory-ksm="))) {
+                        r = safe_atoi(val, &c->memory_ksm);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-private-tmp="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->private_tmp = r;
+                } else if ((val = startswith(l, "exec-context-private-devices="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->private_devices = r;
+                } else if ((val = startswith(l, "exec-context-protect-kernel-tunables="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->protect_kernel_tunables = r;
+                } else if ((val = startswith(l, "exec-context-protect-kernel-modules="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->protect_kernel_modules = r;
+                } else if ((val = startswith(l, "exec-context-protect-kernel-logs="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->protect_kernel_logs = r;
+                } else if ((val = startswith(l, "exec-context-protect-clock="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->protect_clock = r;
+                } else if ((val = startswith(l, "exec-context-protect-control-groups="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->protect_control_groups = r;
+                } else if ((val = startswith(l, "exec-context-private-network="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->private_network = r;
+                } else if ((val = startswith(l, "exec-context-private-users="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->private_users = r;
+                } else if ((val = startswith(l, "exec-context-private-ipc="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->private_ipc = r;
+                } else if ((val = startswith(l, "exec-context-remove-ipc="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->remove_ipc = r;
+                } else if ((val = startswith(l, "exec-context-protect-home="))) {
+                        c->protect_home = protect_home_from_string(val);
+                        if (c->protect_home < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-protect-system="))) {
+                        c->protect_system = protect_system_from_string(val);
+                        if (c->protect_system < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->mount_apivfs = r;
+                        c->mount_apivfs_set = true;
+                } else if ((val = startswith(l, "exec-context-same-pgrp="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->same_pgrp = r;
+                } else if ((val = startswith(l, "exec-context-cpu-sched-reset-on-fork="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->cpu_sched_reset_on_fork = r;
+                } else if ((val = startswith(l, "exec-context-non-blocking="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->non_blocking = r;
+                } else if ((val = startswith(l, "exec-context-ignore-sigpipe="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->ignore_sigpipe = r;
+                } else if ((val = startswith(l, "exec-context-memory-deny-write-execute="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->memory_deny_write_execute = r;
+                } else if ((val = startswith(l, "exec-context-restrict-realtime="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->restrict_realtime = r;
+                } else if ((val = startswith(l, "exec-context-restrict-suid-sgid="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->restrict_suid_sgid = r;
+                } else if ((val = startswith(l, "exec-context-keyring-mode="))) {
+                        c->keyring_mode = exec_keyring_mode_from_string(val);
+                        if (c->keyring_mode < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-protect-hostname="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->protect_hostname = r;
+                } else if ((val = startswith(l, "exec-context-protect-proc="))) {
+                        c->protect_proc = protect_proc_from_string(val);
+                        if (c->protect_proc < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-proc-subset="))) {
+                        c->proc_subset = proc_subset_from_string(val);
+                        if (c->proc_subset < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
+                        c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
+                        if (c->runtime_directory_preserve_mode < 0)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-directories-"))) {
+                        _cleanup_free_ char *type = NULL, *mode = NULL;
+                        ExecDirectoryType dt;
+
+                        r = extract_many_words(&val, "= ", 0, &type, &mode, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r == 0 || !mode)
+                                return -EINVAL;
+
+                        dt = exec_directory_type_from_string(type);
+                        if (dt < 0)
+                                return -EINVAL;
+
+                        r = parse_mode(mode, &c->directories[dt].mode);
+                        if (r < 0)
+                                return r;
+
+                        for (;;) {
+                                _cleanup_free_ char *tuple = NULL, *path = NULL, *only_create = NULL;
+                                const char *p;
+
+                                /* Use EXTRACT_UNESCAPE_RELAX here, as we unescape the colons in subsequent calls */
+                                r = extract_first_word(&val, &tuple, WHITESPACE, EXTRACT_UNESCAPE_SEPARATORS|EXTRACT_UNESCAPE_RELAX);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        break;
+
+                                p = tuple;
+                                r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create, NULL);
+                                if (r < 0)
+                                        return r;
+                                if (r < 2)
+                                        continue;
+
+                                r = exec_directory_add(&c->directories[dt], path, NULL);
+                                if (r < 0)
+                                        return r;
+
+                                r = parse_boolean(only_create);
+                                if (r < 0)
+                                        return r;
+                                c->directories[dt].items[c->directories[dt].n_items - 1].only_create = r;
+
+                                if (isempty(p))
+                                        continue;
+
+                                for (;;) {
+                                        _cleanup_free_ char *link = NULL;
+
+                                        r = extract_first_word(&p, &link, ":", EXTRACT_UNESCAPE_SEPARATORS);
+                                        if (r < 0)
+                                                return r;
+                                        if (r == 0)
+                                                break;
+
+                                        r = strv_consume(&c->directories[dt].items[c->directories[dt].n_items - 1].symlinks, TAKE_PTR(link));
+                                        if (r < 0)
+                                                return r;
+                                }
+                        }
+                } else if ((val = startswith(l, "exec-context-timeout-clean-usec="))) {
+                        r = deserialize_usec(val, &c->timeout_clean_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-nice="))) {
+                        r = safe_atoi(val, &c->nice);
+                        if (r < 0)
+                                return r;
+                        c->nice_set = true;
+                } else if ((val = startswith(l, "exec-context-working-directory-missing-ok="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->working_directory_missing_ok = r;
+                } else if ((val = startswith(l, "exec-context-working-directory-home="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->working_directory_home = r;
+                } else if ((val = startswith(l, "exec-context-oom-score-adjust="))) {
+                        r = safe_atoi(val, &c->oom_score_adjust);
+                        if (r < 0)
+                                return r;
+                        c->oom_score_adjust_set = true;
+                } else if ((val = startswith(l, "exec-context-coredump-filter="))) {
+                        r = safe_atoux64(val, &c->coredump_filter);
+                        if (r < 0)
+                                return r;
+                        c->coredump_filter_set = true;
+                } else if ((val = startswith(l, "exec-context-limit-"))) {
+                        _cleanup_free_ struct rlimit *rlimit = NULL;
+                        _cleanup_free_ char *limit = NULL;
+                        int type;
+
+                        r = extract_first_word(&val, &limit, "=", 0);
+                        if (r < 0)
+                                return r;
+                        if (r == 0 || !val)
+                                return -EINVAL;
+
+                        type = rlimit_from_string(limit);
+                        if (type < 0)
+                                return -EINVAL;
+
+                        if (!c->rlimit[type]) {
+                                rlimit = new0(struct rlimit, 1);
+                                if (!rlimit)
+                                        return log_oom_debug();
+
+                                r = rlimit_parse(type, val, rlimit);
+                                if (r < 0)
+                                        return r;
+
+                                c->rlimit[type] = TAKE_PTR(rlimit);
+                        } else {
+                                r = rlimit_parse(type, val, c->rlimit[type]);
+                                if (r < 0)
+                                        return r;
+                        }
+                } else if ((val = startswith(l, "exec-context-ioprio="))) {
+                        r = safe_atoi(val, &c->ioprio);
+                        if (r < 0)
+                                return r;
+                        c->ioprio_set = true;
+                } else if ((val = startswith(l, "exec-context-cpu-scheduling-policy="))) {
+                        c->cpu_sched_policy = sched_policy_from_string(val);
+                        if (c->cpu_sched_policy < 0)
+                                return -EINVAL;
+                        c->cpu_sched_set = true;
+                } else if ((val = startswith(l, "exec-context-cpu-scheduling-priority="))) {
+                        r = safe_atoi(val, &c->cpu_sched_priority);
+                        if (r < 0)
+                                return r;
+                        c->cpu_sched_set = true;
+                } else if ((val = startswith(l, "exec-context-cpu-scheduling-reset-on-fork="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->cpu_sched_reset_on_fork = r;
+                        c->cpu_sched_set = true;
+                } else if ((val = startswith(l, "exec-context-cpu-affinity="))) {
+                        if (c->cpu_set.set)
+                                return -EINVAL; /* duplicated */
+
+                        r = parse_cpu_set(val, &c->cpu_set);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-numa-mask="))) {
+                        if (c->numa_policy.nodes.set)
+                                return -EINVAL; /* duplicated */
+
+                        r = parse_cpu_set(val, &c->numa_policy.nodes);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-numa-policy="))) {
+                        r = safe_atoi(val, &c->numa_policy.type);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-cpu-affinity-from-numa="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->cpu_affinity_from_numa = r;
+                } else if ((val = startswith(l, "exec-context-timer-slack-nsec="))) {
+                        r = deserialize_usec(val, (usec_t *)&c->timer_slack_nsec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-input="))) {
+                        c->std_input = exec_input_from_string(val);
+                        if (c->std_input < 0)
+                                return c->std_input;
+                } else if ((val = startswith(l, "exec-context-std-output="))) {
+                        c->std_output = exec_output_from_string(val);
+                        if (c->std_output < 0)
+                                return c->std_output;
+                } else if ((val = startswith(l, "exec-context-std-error="))) {
+                        c->std_error = exec_output_from_string(val);
+                        if (c->std_error < 0)
+                                return c->std_error;
+                } else if ((val = startswith(l, "exec-context-stdio-as-fds="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->stdio_as_fds = r;
+                } else if ((val = startswith(l, "exec-context-std-input-fd-name="))) {
+                        r = free_and_strdup(&c->stdio_fdname[STDIN_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-output-fd-name="))) {
+                        r = free_and_strdup(&c->stdio_fdname[STDOUT_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-error-fd-name="))) {
+                        r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-input-file="))) {
+                        r = free_and_strdup(&c->stdio_file[STDIN_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-output-file="))) {
+                        r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-output-file-append="))) {
+                        r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-output-file-truncate="))) {
+                        r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-error-file="))) {
+                        r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-error-file-append="))) {
+                        r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-std-error-file-truncate="))) {
+                        r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-stdin-data="))) {
+                        if (c->stdin_data)
+                                return -EINVAL; /* duplicated */
+
+                        r = unbase64mem(val, strlen(val), &c->stdin_data, &c->stdin_data_size);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-tty-path="))) {
+                        r = free_and_strdup(&c->tty_path, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-tty-reset="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->tty_reset = r;
+                } else if ((val = startswith(l, "exec-context-tty-vhangup="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->tty_vhangup = r;
+                } else if ((val = startswith(l, "exec-context-tty-vt-disallocate="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->tty_vt_disallocate = r;
+                } else if ((val = startswith(l, "exec-context-tty-rows="))) {
+                        r = safe_atou(val, &c->tty_rows);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-tty-columns="))) {
+                        r = safe_atou(val, &c->tty_cols);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-syslog-priority="))) {
+                        r = safe_atoi(val, &c->syslog_priority);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-syslog-level-prefix="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->syslog_level_prefix = r;
+                } else if ((val = startswith(l, "exec-context-syslog-identifier="))) {
+                        r = free_and_strdup(&c->syslog_identifier, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-log-level-max="))) {
+                        r = safe_atoi(val, &c->log_level_max);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-log-ratelimit-interval-usec="))) {
+                        r = deserialize_usec(val, &c->log_ratelimit_interval_usec);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-log-ratelimit-burst="))) {
+                        r = safe_atou(val, &c->log_ratelimit_burst);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-log-filter-allowed-patterns="))) {
+                        r = set_put_strdup(&c->log_filter_allowed_patterns, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-log-filter-denied-patterns="))) {
+                        r = set_put_strdup(&c->log_filter_denied_patterns, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-log-extra-fields="))) {
+                        if (!GREEDY_REALLOC(c->log_extra_fields, c->n_log_extra_fields + 1))
+                                return log_oom_debug();
+
+                        c->log_extra_fields[c->n_log_extra_fields++].iov_base = strdup(val);
+                        if (!c->log_extra_fields[c->n_log_extra_fields-1].iov_base)
+                                return log_oom_debug();
+                } else if ((val = startswith(l, "exec-context-log-namespace="))) {
+                        r = free_and_strdup(&c->log_namespace, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-secure-bits="))) {
+                        r = safe_atoi(val, &c->secure_bits);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-capability-bounding-set="))) {
+                        r = safe_atou64(val, &c->capability_bounding_set);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-capability-ambient-set="))) {
+                        r = safe_atou64(val, &c->capability_ambient_set);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-user="))) {
+                        r = free_and_strdup(&c->user, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-group="))) {
+                        r = free_and_strdup(&c->group, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-dynamic-user="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->dynamic_user = r;
+                } else if ((val = startswith(l, "exec-context-supplementary-groups="))) {
+                        r = deserialize_strv(val, &c->supplementary_groups);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-set-login-environment="))) {
+                        r = safe_atoi(val, &c->set_login_environment);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-pam-name="))) {
+                        r = free_and_strdup(&c->pam_name, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-read-write-paths="))) {
+                        r = deserialize_strv(val, &c->read_write_paths);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-read-only-paths="))) {
+                        r = deserialize_strv(val, &c->read_only_paths);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-inaccessible-paths="))) {
+                        r = deserialize_strv(val, &c->inaccessible_paths);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-exec-paths="))) {
+                        r = deserialize_strv(val, &c->exec_paths);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-no-exec-paths="))) {
+                        r = deserialize_strv(val, &c->no_exec_paths);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-exec-search-path="))) {
+                        r = deserialize_strv(val, &c->exec_search_path);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-mount-propagation-flag="))) {
+                        r = safe_atolu(val, &c->mount_propagation_flag);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-bind-read-only-path="))) {
+                        _cleanup_free_ char *source = NULL, *destination = NULL;
+                        bool rbind = true, ignore_enoent = false;
+                        char *s = NULL, *d = NULL;
+
+                        r = extract_first_word(&val,
+                                               &source,
+                                               ":" WHITESPACE,
+                                               EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                return -EINVAL;
+
+                        s = source;
+                        if (s[0] == '-') {
+                                ignore_enoent = true;
+                                s++;
+                        }
+
+                        if (val && val[-1] == ':') {
+                                r = extract_first_word(&val,
+                                                       &destination,
+                                                       ":" WHITESPACE,
+                                                       EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        continue;
+
+                                d = destination;
+
+                                if (val && val[-1] == ':') {
+                                        _cleanup_free_ char *options = NULL;
+
+                                        r = extract_first_word(&val, &options, NULL, EXTRACT_UNQUOTE);
+                                        if (r < 0)
+                                                return -r;
+
+                                        if (isempty(options) || streq(options, "rbind"))
+                                                rbind = true;
+                                        else if (streq(options, "norbind"))
+                                                rbind = false;
+                                        else
+                                                continue;
+                                }
+                        } else
+                                d = s;
+
+                        r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+                                        &(BindMount) {
+                                                .source = s,
+                                                .destination = d,
+                                                .read_only = true,
+                                                .recursive = rbind,
+                                                .ignore_enoent = ignore_enoent,
+                                        });
+                        if (r < 0)
+                                return log_oom_debug();
+                } else if ((val = startswith(l, "exec-context-bind-path="))) {
+                        _cleanup_free_ char *source = NULL, *destination = NULL;
+                        bool rbind = true, ignore_enoent = false;
+                        char *s = NULL, *d = NULL;
+
+                        r = extract_first_word(&val,
+                                               &source,
+                                               ":" WHITESPACE,
+                                               EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                return -EINVAL;
+
+                        s = source;
+                        if (s[0] == '-') {
+                                ignore_enoent = true;
+                                s++;
+                        }
+
+                        if (val && val[-1] == ':') {
+                                r = extract_first_word(&val,
+                                                       &destination,
+                                                       ":" WHITESPACE,
+                                                       EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        continue;
+
+                                d = destination;
+
+                                if (val && val[-1] == ':') {
+                                        _cleanup_free_ char *options = NULL;
+
+                                        r = extract_first_word(&val, &options, NULL, EXTRACT_UNQUOTE);
+                                        if (r < 0)
+                                                return -r;
+
+                                        if (isempty(options) || streq(options, "rbind"))
+                                                rbind = true;
+                                        else if (streq(options, "norbind"))
+                                                rbind = false;
+                                        else
+                                                continue;
+                                }
+                        } else
+                                d = s;
+
+                        r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+                                        &(BindMount) {
+                                                .source = s,
+                                                .destination = d,
+                                                .read_only = false,
+                                                .recursive = rbind,
+                                                .ignore_enoent = ignore_enoent,
+                                        });
+                        if (r < 0)
+                                return log_oom_debug();
+                } else if ((val = startswith(l, "exec-context-temporary-filesystems="))) {
+                        _cleanup_free_ char *path = NULL, *options = NULL;
+
+                        r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r < 1)
+                                continue;
+
+                        r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options);
+                        if (r < 0)
+                                return log_oom_debug();
+                } else if ((val = startswith(l, "exec-context-utmp-id="))) {
+                        r = free_and_strdup(&c->utmp_id, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-utmp-mode="))) {
+                        c->utmp_mode = exec_utmp_mode_from_string(val);
+                        if (c->utmp_mode < 0)
+                                return c->utmp_mode;
+                } else if ((val = startswith(l, "exec-context-no-new-privileges="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->no_new_privileges = r;
+                } else if ((val = startswith(l, "exec-context-selinux-context-ignore="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->selinux_context_ignore = r;
+                } else if ((val = startswith(l, "exec-context-apparmor-profile-ignore="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->apparmor_profile_ignore = r;
+                } else if ((val = startswith(l, "exec-context-smack-process-label-ignore="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->smack_process_label_ignore = r;
+                } else if ((val = startswith(l, "exec-context-selinux-context="))) {
+                        if (val[0] == '-') {
+                                c->selinux_context_ignore = true;
+                                val++;
+                        }
+
+                        r = free_and_strdup(&c->selinux_context, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-apparmor-profile="))) {
+                        if (val[0] == '-') {
+                                c->apparmor_profile_ignore = true;
+                                val++;
+                        }
+
+                        r = free_and_strdup(&c->apparmor_profile, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-smack-process-label="))) {
+                        if (val[0] == '-') {
+                                c->smack_process_label_ignore = true;
+                                val++;
+                        }
+
+                        r = free_and_strdup(&c->smack_process_label, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-personality="))) {
+                        c->personality = personality_from_string(val);
+                        if (c->personality == PERSONALITY_INVALID)
+                                return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-lock-personality="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->lock_personality = r;
+#if HAVE_SECCOMP
+                } else if ((val = startswith(l, "exec-context-syscall-filter="))) {
+                        _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL;
+                        int id, errno_num;
+
+                        r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                continue;
+
+                        r = safe_atoi(s_id, &id);
+                        if (r < 0)
+                                return r;
+
+                        r = safe_atoi(s_errno_num, &errno_num);
+                        if (r < 0)
+                                return r;
+
+                        r = hashmap_ensure_put(&c->syscall_filter, NULL, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-syscall-archs="))) {
+                        unsigned int id;
+
+                        r = safe_atou(val, &id);
+                        if (r < 0)
+                                return r;
+
+                        r = set_ensure_put(&c->syscall_archs, NULL, UINT_TO_PTR(id + 1));
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-syscall-errno="))) {
+                        r = safe_atoi(val, &c->syscall_errno);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-syscall-allow-list="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->syscall_allow_list = r;
+                } else if ((val = startswith(l, "exec-context-syscall-log="))) {
+                        _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL;
+                        int id, errno_num;
+
+                        r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 2)
+                                continue;
+
+                        r = safe_atoi(s_id, &id);
+                        if (r < 0)
+                                return r;
+
+                        r = safe_atoi(s_errno_num, &errno_num);
+                        if (r < 0)
+                                return r;
+
+                        r = hashmap_ensure_put(&c->syscall_log, NULL, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-syscall-log-allow-list="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->syscall_log_allow_list = r;
+#endif
+                } else if ((val = startswith(l, "exec-context-restrict-namespaces="))) {
+                        r = safe_atolu(val, &c->restrict_namespaces);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-restrict-filesystems="))) {
+                        r = set_ensure_allocated(&c->restrict_filesystems, &string_hash_ops);
+                        if (r < 0)
+                                return r;
+
+                        r = set_put_strdup(&c->restrict_filesystems, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-restrict-filesystems-allow-list="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->restrict_filesystems_allow_list = r;
+                } else if ((val = startswith(l, "exec-context-address-families="))) {
+                        int af;
+
+                        r = safe_atoi(val, &af);
+                        if (r < 0)
+                                return r;
+
+                        r = set_ensure_put(&c->address_families, NULL, INT_TO_PTR(af));
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-address-families-allow-list="))) {
+                        r = parse_boolean(val);
+                        if (r < 0)
+                                return r;
+                        c->address_families_allow_list = r;
+                } else if ((val = startswith(l, "exec-context-network-namespace-path="))) {
+                        r = free_and_strdup(&c->network_namespace_path, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-ipc-namespace-path="))) {
+                        r = free_and_strdup(&c->ipc_namespace_path, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-mount-image="))) {
+                        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                        _cleanup_free_ char *source = NULL, *destination = NULL;
+                        bool permissive = false;
+                        char *s;
+
+                        r = extract_many_words(&val,
+                                               NULL,
+                                               EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
+                                               &source,
+                                               &destination,
+                                               NULL);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                return -EINVAL;
+
+                        s = source;
+                        if (s[0] == '-') {
+                                permissive = true;
+                                s++;
+                        }
+
+                        if (isempty(destination))
+                                continue;
+
+                        for (;;) {
+                                _cleanup_free_ char *tuple = NULL, *partition = NULL, *opts = NULL;
+                                PartitionDesignator partition_designator;
+                                MountOptions *o = NULL;
+                                const char *p;
+
+                                r = extract_first_word(&val, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        break;
+
+                                p = tuple;
+                                r = extract_many_words(&p,
+                                                       ":",
+                                                       EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
+                                                       &partition,
+                                                       &opts,
+                                                       NULL);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        continue;
+                                if (r == 1) {
+                                        o = new(MountOptions, 1);
+                                        if (!o)
+                                                return log_oom_debug();
+                                        *o = (MountOptions) {
+                                                .partition_designator = PARTITION_ROOT,
+                                                .options = TAKE_PTR(partition),
+                                        };
+                                        LIST_APPEND(mount_options, options, o);
+
+                                        continue;
+                                }
+
+                                partition_designator = partition_designator_from_string(partition);
+                                if (partition_designator < 0)
+                                        continue;
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom_debug();
+                                *o = (MountOptions) {
+                                        .partition_designator = partition_designator,
+                                        .options = TAKE_PTR(opts),
+                                };
+                                LIST_APPEND(mount_options, options, o);
+                        }
+
+                        r = mount_image_add(&c->mount_images, &c->n_mount_images,
+                                        &(MountImage) {
+                                                .source = s,
+                                                .destination = destination,
+                                                .mount_options = options,
+                                                .ignore_enoent = permissive,
+                                                .type = MOUNT_IMAGE_DISCRETE,
+                                        });
+                        if (r < 0)
+                                return log_oom_debug();
+                } else if ((val = startswith(l, "exec-context-extension-image="))) {
+                        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                        _cleanup_free_ char *source = NULL;
+                        bool permissive = false;
+                        char *s;
+
+                        r = extract_first_word(&val,
+                                               &source,
+                                               NULL,
+                                               EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                return -EINVAL;
+
+                        s = source;
+                        if (s[0] == '-') {
+                                permissive = true;
+                                s++;
+                        }
+
+                        for (;;) {
+                                _cleanup_free_ char *tuple = NULL, *partition = NULL, *opts = NULL;
+                                PartitionDesignator partition_designator;
+                                MountOptions *o = NULL;
+                                const char *p;
+
+                                r = extract_first_word(&val, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        break;
+
+                                p = tuple;
+                                r = extract_many_words(&p,
+                                                       ":",
+                                                       EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
+                                                       &partition,
+                                                       &opts,
+                                                       NULL);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        continue;
+                                if (r == 1) {
+                                        o = new(MountOptions, 1);
+                                        if (!o)
+                                                return log_oom_debug();
+                                        *o = (MountOptions) {
+                                                .partition_designator = PARTITION_ROOT,
+                                                .options = TAKE_PTR(partition),
+                                        };
+                                        LIST_APPEND(mount_options, options, o);
+
+                                        continue;
+                                }
+
+                                partition_designator = partition_designator_from_string(partition);
+                                if (partition_designator < 0)
+                                        continue;
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom_debug();
+                                *o = (MountOptions) {
+                                        .partition_designator = partition_designator,
+                                        .options = TAKE_PTR(opts),
+                                };
+                                LIST_APPEND(mount_options, options, o);
+                        }
+
+                        r = mount_image_add(&c->extension_images, &c->n_extension_images,
+                                        &(MountImage) {
+                                                .source = s,
+                                                .mount_options = options,
+                                                .ignore_enoent = permissive,
+                                                .type = MOUNT_IMAGE_EXTENSION,
+                                        });
+                        if (r < 0)
+                                return log_oom_debug();
+                } else if ((val = startswith(l, "exec-context-extension-directories="))) {
+                        r = deserialize_strv(val, &c->extension_directories);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-set-credentials="))) {
+                        _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+                        _cleanup_free_ char *id = NULL, *encrypted = NULL, *data = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &id, &encrypted, &data, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 3)
+                                return -EINVAL;
+
+                        r = parse_boolean(encrypted);
+                        if (r < 0)
+                                return r;
+
+                        sc = new(ExecSetCredential, 1);
+                        if (!sc)
+                                return -ENOMEM;
+
+                        *sc = (ExecSetCredential) {
+                                .id =  TAKE_PTR(id),
+                                .encrypted = r,
+                        };
+
+                        r = unbase64mem(data, strlen(data), &sc->data, &sc->size);
+                        if (r < 0)
+                                return r;
+
+                        r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+                        if (r < 0)
+                                return r;
+
+                        TAKE_PTR(sc);
+                } else if ((val = startswith(l, "exec-context-load-credentials="))) {
+                        _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
+                        _cleanup_free_ char *id = NULL, *encrypted = NULL, *path = NULL;
+
+                        r = extract_many_words(&val, " ", 0, &id, &encrypted, &path, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r != 3)
+                                return -EINVAL;
+
+                        r = parse_boolean(encrypted);
+                        if (r < 0)
+                                return r;
+
+                        lc = new(ExecLoadCredential, 1);
+                        if (!lc)
+                                return -ENOMEM;
+
+                        *lc = (ExecLoadCredential) {
+                                .id =  TAKE_PTR(id),
+                                .path = TAKE_PTR(path),
+                                .encrypted = r,
+                        };
+
+                        r = hashmap_ensure_put(&c->load_credentials, &exec_load_credential_hash_ops, lc->id, lc);
+                        if (r < 0)
+                                return r;
+
+                        TAKE_PTR(lc);
+                } else if ((val = startswith(l, "exec-context-import-credentials="))) {
+                        r = set_ensure_allocated(&c->import_credentials, &string_hash_ops);
+                        if (r < 0)
+                                return r;
+
+                        r = set_put_strdup(&c->import_credentials, val);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-root-image-policy="))) {
+                        if (c->root_image_policy)
+                                return -EINVAL; /* duplicated */
+
+                        r = image_policy_from_string(val, &c->root_image_policy);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-mount-image-policy="))) {
+                        if (c->mount_image_policy)
+                                return -EINVAL; /* duplicated */
+
+                        r = image_policy_from_string(val, &c->mount_image_policy);
+                        if (r < 0)
+                                return r;
+                } else if ((val = startswith(l, "exec-context-extension-image-policy="))) {
+                        if (c->extension_image_policy)
+                                return -EINVAL; /* duplicated */
+
+                        r = image_policy_from_string(val, &c->extension_image_policy);
+                        if (r < 0)
+                                return r;
+                } else
+                        log_warning("Failed to parse serialized line, ignoring: %s", l);
+        }
+
+        return 0;
+}
+
+/* Writes the path, argv and flags of one ExecCommand to f as "exec-command-*" items, followed by an
+ * empty line that serves as the end marker of this section. */
+static int exec_command_serialize(const ExecCommand *c, FILE *f) {
+        int ret;
+
+        assert(c);
+        assert(f);
+
+        ret = serialize_item(f, "exec-command-path", c->path);
+        if (ret < 0)
+                return ret;
+        ret = serialize_strv(f, "exec-command-argv", c->argv);
+        if (ret < 0)
+                return ret;
+        ret = serialize_item_format(f, "exec-command-flags", "%d", (int) c->flags);
+        if (ret < 0)
+                return ret;
+
+        /* End marker */
+        fputc('\n', f);
+        return 0;
+}
+
+/* Reads "exec-command-*" lines from f into c until deserialize_read_line() reports EOF or the end
+ * marker. Unrecognized lines are logged and skipped; a failure to parse a known line aborts. */
+static int exec_command_deserialize(ExecCommand *c, FILE *f) {
+        assert(c);
+        assert(f);
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *value;
+                int r;
+
+                r = deserialize_read_line(f, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                /* Each known key stores its parse result in r; the result is checked once below. */
+                if ((value = startswith(line, "exec-command-path=")))
+                        r = free_and_strdup(&c->path, value);
+                else if ((value = startswith(line, "exec-command-argv=")))
+                        r = deserialize_strv(value, &c->argv);
+                else if ((value = startswith(line, "exec-command-flags=")))
+                        r = safe_atoi(value, &c->flags);
+                else {
+                        log_warning("Failed to parse serialized line, ignoring: %s", line);
+                        continue;
+                }
+
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Serializes the complete invocation state into f: context, command, parameters, runtime and cgroup
+ * context, in exactly that order. fds is passed on to the parameter and runtime serializers. Returns 0
+ * on success; on the first failing step the error is logged at debug level and the value produced by
+ * log_debug_errno() is returned. */
+int exec_serialize_invocation(
+                FILE *f,
+                FDSet *fds,
+                const ExecContext *ctx,
+                const ExecCommand *cmd,
+                const ExecParameters *p,
+                const ExecRuntime *rt,
+                const CGroupContext *cg) {
+
+        int ret;
+
+        assert(f);
+        assert(fds);
+
+        ret = exec_context_serialize(ctx, f);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to serialize context: %m");
+        ret = exec_command_serialize(cmd, f);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to serialize command: %m");
+        ret = exec_parameters_serialize(p, ctx, f, fds);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to serialize parameters: %m");
+        ret = exec_runtime_serialize(rt, f, fds);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to serialize runtime: %m");
+        ret = exec_cgroup_context_serialize(cg, f);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to serialize cgroup context: %m");
+
+        return 0;
+}
+
+/* Reads the invocation state back from f into the supplied objects: context, command, parameters,
+ * runtime and cgroup context, in the same order exec_serialize_invocation() writes them. fds is passed
+ * on to the parameter and runtime deserializers. Returns 0 on success; on the first failing step the
+ * error is logged at debug level and the value produced by log_debug_errno() is returned. */
+int exec_deserialize_invocation(
+                FILE *f,
+                FDSet *fds,
+                ExecContext *ctx,
+                ExecCommand *cmd,
+                ExecParameters *p,
+                ExecRuntime *rt,
+                CGroupContext *cg) {
+
+        int ret;
+
+        assert(f);
+        assert(fds);
+
+        ret = exec_context_deserialize(ctx, f);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to deserialize context: %m");
+        ret = exec_command_deserialize(cmd, f);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to deserialize command: %m");
+        ret = exec_parameters_deserialize(p, f, fds);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to deserialize parameters: %m");
+        ret = exec_runtime_deserialize(rt, f, fds);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to deserialize runtime: %m");
+        ret = exec_cgroup_context_deserialize(cg, f);
+        if (ret < 0)
+                return log_debug_errno(ret, "Failed to deserialize cgroup context: %m");
+
+        return 0;
+}
diff --git a/src/core/execute-serialize.h b/src/core/execute-serialize.h
new file mode 100644
index 0000000..89c8e09
--- /dev/null
+++ b/src/core/execute-serialize.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "execute.h"
+
+/* These functions serialize/deserialize for invocation purposes (i.e.: serialized object is passed to a
+ * child process) rather than to save state across reload/reexec. */
+
+/* Writes context, command, parameters, runtime and cgroup context to f, in that order. */
+int exec_serialize_invocation(
+        FILE *f,
+        FDSet *fds,
+        const ExecContext *ctx, const ExecCommand *cmd,
+        const ExecParameters *p, const ExecRuntime *rt,
+        const CGroupContext *cg);
+
+/* Reads context, command, parameters, runtime and cgroup context back from f, in that order. */
+int exec_deserialize_invocation(
+        FILE *f,
+        FDSet *fds,
+        ExecContext *ctx, ExecCommand *cmd,
+        ExecParameters *p, ExecRuntime *rt,
+        CGroupContext *cg);
diff --git a/src/core/execute.c b/src/core/execute.c
new file mode 100644
index 0000000..8dbdfcf
--- /dev/null
+++ b/src/core/execute.c
@@ -0,0 +1,2742 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include  /* Must be included after  */
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "async.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "constants.h"
+#include "cpu-set-util.h"
+#include "dev-setup.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "execute-serialize.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "glob-util.h"
+#include "hexdecoct.h"
+#include "ioprio-util.h"
+#include "lock-util.h"
+#include "log.h"
+#include "macro.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "memory-util.h"
+#include "missing_fs.h"
+#include "missing_prctl.h"
+#include "mkdir-label.h"
+#include "namespace.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "selinux-util.h"
+#include "serialize.h"
+#include "sort-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+#include "utmp-wtmp.h"
+
+/* True iff i is one of the three TTY input modes (EXEC_INPUT_TTY, _TTY_FORCE, _TTY_FAIL). */
+static bool is_terminal_input(ExecInput i) {
+        return i == EXEC_INPUT_TTY ||
+               i == EXEC_INPUT_TTY_FORCE ||
+               i == EXEC_INPUT_TTY_FAIL;
+}
+
+/* True iff o is EXEC_OUTPUT_TTY or one of the two "..._AND_CONSOLE" output modes. */
+static bool is_terminal_output(ExecOutput o) {
+        return o == EXEC_OUTPUT_TTY ||
+               o == EXEC_OUTPUT_KMSG_AND_CONSOLE ||
+               o == EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+}
+
+/* Returns the TTY path to use for this context: NULL when context->stdio_as_fds is set, otherwise the
+ * configured tty_path, falling back to "/dev/console". The result is not a copy; do not free it. */
+const char *exec_context_tty_path(const ExecContext *context) {
+        assert(context);
+
+        if (context->stdio_as_fds)
+                return NULL;
+
+        /* "a ?: b" yields a unless it is NULL, in which case it yields b. */
+        return context->tty_path ?: "/dev/console";
+}
+
+/* Determines the terminal size to apply: the context's tty_rows/tty_cols, with any dimension that is
+ * still UINT_MAX looked up via proc_cmdline_tty_size() for the TTY path (its result is ignored). If
+ * tty_path is NULL, exec_context_tty_path() supplies it. Dimensions may remain UINT_MAX on return. */
+static void exec_context_determine_tty_size(
+                const ExecContext *context,
+                const char *tty_path,
+                unsigned *ret_rows,
+                unsigned *ret_cols) {
+
+        assert(context);
+        assert(ret_rows);
+        assert(ret_cols);
+
+        unsigned n_rows = context->tty_rows, n_cols = context->tty_cols;
+        bool rows_unset = n_rows == UINT_MAX, cols_unset = n_cols == UINT_MAX;
+
+        if (!tty_path)
+                tty_path = exec_context_tty_path(context);
+
+        if (tty_path && (rows_unset || cols_unset))
+                (void) proc_cmdline_tty_size(
+                                tty_path,
+                                rows_unset ? &n_rows : NULL,
+                                cols_unset ? &n_cols : NULL);
+        *ret_rows = n_rows;
+        *ret_cols = n_cols;
+}
+
+/* Resolves the desired terminal dimensions for the context and applies them to tty_fd via
+ * terminal_set_size_fd(), whose result is returned unchanged. */
+int exec_context_apply_tty_size(
+                const ExecContext *context,
+                int tty_fd,
+                const char *tty_path) {
+
+        unsigned n_rows, n_cols;
+        exec_context_determine_tty_size(context, tty_path, &n_rows, &n_cols);
+        return terminal_set_size_fd(tty_fd, tty_path, n_rows, n_cols);
+}
+
+/* Prepares the TTY used by the context: depending on the tty_* settings it is hung up, reset, resized
+ * and its VT disallocated. Nothing is returned; failures are either logged at debug level or ignored. */
+void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
+        _cleanup_close_ int _fd = -EBADF, lock_fd = -EBADF;
+        int fd;
+
+        assert(context);
+
+        const char *path = exec_context_tty_path(context);
+
+        /* Use p->stdin_fd if it is a TTY; else open 'path', but only if the context refers to a terminal. */
+        if (p && p->stdin_fd >= 0 && isatty(p->stdin_fd))
+                fd = p->stdin_fd;
+        else if (path && (context->tty_path || is_terminal_input(context->std_input) ||
+                        is_terminal_output(context->std_output) || is_terminal_output(context->std_error))) {
+                fd = _fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
+                if (fd < 0)
+                        return (void) log_debug_errno(fd, "Failed to open terminal '%s', ignoring: %m", path);
+        } else
+                return;   /* nothing to do */
+
+        /* Take a synchronization lock for the duration of the setup that we do here.
+         * systemd-vconsole-setup.service also takes the lock to avoid being interrupted. We open a new fd
+         * that will be closed automatically, and operate on it for convenience. */
+        lock_fd = lock_dev_console();
+        if (ERRNO_IS_NEG_PRIVILEGE(lock_fd))
+                log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without: %m");
+        else if (lock_fd < 0)
+                return (void) log_debug_errno(lock_fd, "Failed to lock /dev/console: %m");
+
+        if (context->tty_vhangup)
+                (void) terminal_vhangup_fd(fd);
+        if (context->tty_reset)
+                (void) reset_terminal_fd(fd, /* switch_to_text= */ true);
+        (void) exec_context_apply_tty_size(context, fd, path);
+        if (context->tty_vt_disallocate && path)
+                (void) vt_disallocate(path);
+}
+
+/* True if the context has private_network enabled or a network_namespace_path set. */
+bool exec_needs_network_namespace(const ExecContext *context) {
+        assert(context);
+        return context->network_namespace_path || context->private_network;
+}
+
+static bool exec_needs_ephemeral(const ExecContext *context) { /* root_ephemeral plus an image or directory root */
+        return context->root_ephemeral && (context->root_image || context->root_directory);
+}
+
+/* True if the context has private_ipc enabled or an ipc_namespace_path set. */
+bool exec_needs_ipc_namespace(const ExecContext *context) {
+        assert(context);
+        return context->ipc_namespace_path || context->private_ipc;
+}
+
+/* Decides whether running with this context requires a mount namespace. Any single check below that
+ * matches is sufficient. params and runtime may be NULL; they are only consulted by the
+ * directory-prefix check and the private_tmp check respectively. */
+bool exec_needs_mount_namespace(
+                const ExecContext *context,
+                const ExecParameters *params,
+                const ExecRuntime *runtime) {
+
+        assert(context);
+
+        if (context->root_image)
+                return true;
+
+        if (!strv_isempty(context->read_write_paths) ||
+            !strv_isempty(context->read_only_paths) ||
+            !strv_isempty(context->inaccessible_paths) ||
+            !strv_isempty(context->exec_paths) ||
+            !strv_isempty(context->no_exec_paths))
+                return true;
+
+        if (context->n_bind_mounts > 0)
+                return true;
+        if (context->n_temporary_filesystems > 0)
+                return true;
+        if (context->n_mount_images > 0)
+                return true;
+        if (context->n_extension_images > 0)
+                return true;
+
+        if (!strv_isempty(context->extension_directories))
+                return true;
+
+        if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
+                return true;
+        /* private_tmp only counts if the runtime's shared state carries a tmp or var-tmp directory. */
+        if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
+                return true;
+
+        if (context->private_devices ||
+            context->private_mounts > 0 ||
+            (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
+            context->protect_system != PROTECT_SYSTEM_NO ||
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables ||
+            context->protect_kernel_modules ||
+            context->protect_kernel_logs ||
+            context->protect_control_groups ||
+            context->protect_proc != PROTECT_PROC_DEFAULT ||
+            context->proc_subset != PROC_SUBSET_ALL ||
+            exec_needs_ipc_namespace(context))
+                return true;
+        /* The two checks in the following block only apply when a root directory is configured. */
+        if (context->root_directory) {
+                if (exec_context_get_effective_mount_apivfs(context))
+                        return true;
+                /* If params is given, directory types without a prefix in it are skipped. */
+                for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+                        if (params && !params->prefix[t])
+                                continue;
+
+                        if (context->directories[t].n_items > 0)
+                                return true;
+                }
+        }
+
+        if (context->dynamic_user &&
+            (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
+             context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
+             context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
+                return true;
+
+        if (context->log_namespace)
+                return true;
+
+        return false;
+}
+
+/* Tells whether directories of the given type are treated as private for this context. That is only the
+ * case with dynamic_user set, never for configuration directories, and for runtime directories only when
+ * runtime_directory_preserve_mode is something other than EXEC_PRESERVE_NO. */
+bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
+        assert(context);
+
+        bool runtime_not_preserved =
+                type == EXEC_DIRECTORY_RUNTIME &&
+                context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO;
+
+        return context->dynamic_user &&
+               type != EXEC_DIRECTORY_CONFIGURATION &&
+               !runtime_not_preserved;
+}
+
+/* Computes the cgroup path a process described by params is to be placed in and stores a newly
+ * allocated string in *ret, which the caller owns. Returns 1 if a subcgroup name was appended to
+ * params->cgroup_path, 0 if the path is used as is, -EINVAL if params carries no cgroup path, and
+ * -ENOMEM if the allocation fails. */
+int exec_params_get_cgroup_path(
+                const ExecParameters *params,
+                const CGroupContext *c,
+                char **ret) {
+
+        const char *sub = NULL;
+        char *joined;
+
+        assert(params);
+        assert(ret);
+
+        if (!params->cgroup_path)
+                return -EINVAL;
+
+        /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
+         * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
+         * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
+         * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
+         * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
+         * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
+         * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
+         * flag, which is only passed for the former statements, not for the latter. */
+
+        /* c is not asserted above: it is only dereferenced if EXEC_CGROUP_DELEGATE is set in the flags, so
+         * the evaluation order of the condition below must be kept. */
+        if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) &&
+            (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup))
+                sub = FLAGS_SET(params->flags, EXEC_IS_CONTROL) ? ".control" : c->delegate_subgroup;
+
+        joined = sub ? path_join(params->cgroup_path, sub) : strdup(params->cgroup_path);
+        if (!joined)
+                return -ENOMEM;
+
+        *ret = joined;
+        return !!sub;
+}
+
+/* Accessor for the context's cpu_affinity_from_numa setting. */
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
+        assert(c);
+        return c->cpu_affinity_from_numa;
+}
+
+/* Emits a structured debug message "<msg>: <quoted argv>" for the unit, carrying an EXECUTABLE= field.
+ * Does nothing unless DEBUG_LOGGING is true. The quoted command line goes through strnull() first. */
+static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
+        assert(unit);
+        assert(msg);
+        assert(executable);
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *quoted = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
+                log_unit_struct(unit, LOG_DEBUG,
+                                "EXECUTABLE=%s", executable,
+                                LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(quoted)),
+                                LOG_UNIT_INVOCATION_ID(unit));
+        }
+}
+
+static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
+
+/* Spawns the pinned executor binary for the given command. The invocation state (context, command,
+ * parameters, runtime, cgroup context) is serialized to a file whose fd number is passed to the executor
+ * via --deserialize. On success the new PID is stored in *ret and in the command's exec status, and 0 is
+ * returned; on failure the error is logged for the unit and the logging helper's result is returned. */
+int exec_spawn(Unit *unit,
+               ExecCommand *command,
+               const ExecContext *context,
+               ExecParameters *params,
+               ExecRuntime *runtime,
+               const CGroupContext *cgroup_context,
+               pid_t *ret) {
+
+        char serialization_fd_number[DECIMAL_STR_MAX(int) + 1];
+        _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL;
+        _cleanup_fdset_free_ FDSet *fdset = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        pid_t pid;
+        int r;
+
+        assert(unit);
+        assert(unit->manager);
+        assert(unit->manager->executor_fd >= 0);
+        assert(command);
+        assert(context);
+        assert(ret);
+        assert(params);
+        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
+        assert(!params->files_env); /* We fill this field, ensure it comes NULL-initialized to us */
+
+        LOG_CONTEXT_PUSH_UNIT(unit);
+
+        r = exec_context_load_environment(unit, context, &params->files_env);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
+
+        /* We won't know the real executable path until we create the mount namespace in the child, but we
+           want to log from the parent, so we use the possibly inaccurate path here. */
+        log_command_line(unit, "About to execute", command->path, command->argv);
+
+        if (params->cgroup_path) {
+                r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
+                if (r < 0)
+                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
+                if (r > 0) {
+                        /* If there's a subcgroup, then let's create it here now (the main cgroup was already
+                         * realized by the unit logic) */
+                        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
+                        if (r < 0)
+                                return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
+                }
+        }
+
+        /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
+         * child's memory.max, serialize all the state needed to start the unit, and pass it to the
+         * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
+         * and ensure all memory is shared. The child immediately execs the new binary so the delay should
+         * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
+         * target cgroup. */
+
+        r = open_serialization_file("sd-executor-state", &f);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to open serialization stream: %m");
+
+        fdset = fdset_new();
+        if (!fdset)
+                return log_oom();
+
+        r = exec_serialize_invocation(f, fdset, context, command, params, runtime, cgroup_context);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to serialize parameters: %m");
+
+        if (fseeko(f, 0, SEEK_SET) < 0)
+                return log_unit_error_errno(unit, errno, "Failed to reseek on serialization stream: %m");
+        /* NOTE(review): 'false' presumably clears close-on-exec so the executor inherits the fds - confirm. */
+        r = fd_cloexec(fileno(f), false);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialization fd: %m");
+
+        r = fdset_cloexec(fdset, false);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m");
+
+        r = log_level_to_string_alloc(log_get_max_level(), &log_level);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m");
+
+        r = fd_get_path(unit->manager->executor_fd, &executor_path);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m");
+
+        xsprintf(serialization_fd_number, "%i", fileno(f));
+        /* The executor binary is pinned, to avoid compatibility problems during upgrades. */
+        r = posix_spawn_wrapper(
+                        FORMAT_PROC_FD_PATH(unit->manager->executor_fd),
+                        STRV_MAKE(executor_path,
+                                  "--deserialize", serialization_fd_number,
+                                  "--log-level", log_level,
+                                  "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
+                        environ,
+                        &pid);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
+
+        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
+
+        /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
+         * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
+         * process will be killed too). */
+        if (subcgroup_path)
+                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
+        exec_status_start(&command->exec_status, pid);
+        *ret = pid;
+        return 0;
+}
+
+/* Overwrites *c with a fresh ExecContext: every field not listed in the initializer below becomes zero.
+ * Afterwards the mode of each directory entry is set to 0755 and the NUMA policy is reset. */
+void exec_context_init(ExecContext *c) {
+        assert(c);
+
+        /* When initializing a bool member to 'true', make sure to serialize in execute-serialize.c using
+         * serialize_bool() instead of serialize_bool_elide(). */
+        *c = (ExecContext) {
+                .umask = 0022,
+                .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
+                .cpu_sched_policy = SCHED_OTHER,
+                .syslog_priority = LOG_DAEMON|LOG_INFO,
+                .syslog_level_prefix = true,
+                .ignore_sigpipe = true,
+                .timer_slack_nsec = NSEC_INFINITY,
+                .personality = PERSONALITY_INVALID,
+                .timeout_clean_usec = USEC_INFINITY,
+                .capability_bounding_set = CAP_MASK_UNSET,
+                .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
+                .log_level_max = -1,
+#if HAVE_SECCOMP
+                .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
+#endif
+                .tty_rows = UINT_MAX,
+                .tty_cols = UINT_MAX,
+                .private_mounts = -1,
+                .memory_ksm = -1,
+                .set_login_environment = -1,
+        };
+        /* Every entry of the directories array starts out with mode 0755. */
+        FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
+                d->mode = 0755;
+        numa_policy_reset(&c->numa_policy);
+
+        assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
+}
+
+/* Frees the resources referenced by the context and stores the release helpers' results (or explicit
+ * NULL/0/-1 values) back into the corresponding fields. The ExecContext structure itself is not freed,
+ * and fields that are not touched below keep their values. */
+void exec_context_done(ExecContext *c) {
+        assert(c);
+
+        c->environment = strv_free(c->environment);
+        c->environment_files = strv_free(c->environment_files);
+        c->pass_environment = strv_free(c->pass_environment);
+        c->unset_environment = strv_free(c->unset_environment);
+
+        rlimit_free_all(c->rlimit);
+
+        for (size_t l = 0; l < 3; l++) {
+                c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
+                c->stdio_file[l] = mfree(c->stdio_file[l]);
+        }
+
+        c->working_directory = mfree(c->working_directory);
+        c->root_directory = mfree(c->root_directory);
+        c->root_image = mfree(c->root_image);
+        c->root_image_options = mount_options_free_all(c->root_image_options);
+        c->root_hash = mfree(c->root_hash);
+        c->root_hash_size = 0;
+        c->root_hash_path = mfree(c->root_hash_path);
+        c->root_hash_sig = mfree(c->root_hash_sig);
+        c->root_hash_sig_size = 0;
+        c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+        c->root_verity = mfree(c->root_verity);
+        c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+        c->extension_directories = strv_free(c->extension_directories);
+        c->tty_path = mfree(c->tty_path);
+        c->syslog_identifier = mfree(c->syslog_identifier);
+        c->user = mfree(c->user);
+        c->group = mfree(c->group);
+        c->supplementary_groups = strv_free(c->supplementary_groups);
+        c->pam_name = mfree(c->pam_name);
+        c->read_only_paths = strv_free(c->read_only_paths);
+        c->read_write_paths = strv_free(c->read_write_paths);
+        c->inaccessible_paths = strv_free(c->inaccessible_paths);
+        c->exec_paths = strv_free(c->exec_paths);
+        c->no_exec_paths = strv_free(c->no_exec_paths);
+        c->exec_search_path = strv_free(c->exec_search_path);
+
+        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+        c->bind_mounts = NULL;
+        c->n_bind_mounts = 0;
+        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+        c->temporary_filesystems = NULL;
+        c->n_temporary_filesystems = 0;
+        c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+
+        cpu_set_reset(&c->cpu_set);
+        numa_policy_reset(&c->numa_policy);
+
+        c->utmp_id = mfree(c->utmp_id);
+        c->selinux_context = mfree(c->selinux_context);
+        c->apparmor_profile = mfree(c->apparmor_profile);
+        c->smack_process_label = mfree(c->smack_process_label);
+
+        c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+
+        c->syscall_filter = hashmap_free(c->syscall_filter);
+        c->syscall_archs = set_free(c->syscall_archs);
+        c->address_families = set_free(c->address_families);
+
+        FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
+                exec_directory_done(d);
+
+        c->log_level_max = -1;
+
+        exec_context_free_log_extra_fields(c);
+        c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
+        c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
+
+        c->log_ratelimit_interval_usec = 0;
+        c->log_ratelimit_burst = 0;
+
+        c->stdin_data = mfree(c->stdin_data);
+        c->stdin_data_size = 0;
+
+        c->network_namespace_path = mfree(c->network_namespace_path);
+        c->ipc_namespace_path = mfree(c->ipc_namespace_path);
+
+        c->log_namespace = mfree(c->log_namespace);
+
+        c->load_credentials = hashmap_free(c->load_credentials);
+        c->set_credentials = hashmap_free(c->set_credentials);
+        c->import_credentials = set_free_free(c->import_credentials);
+
+        c->root_image_policy = image_policy_free(c->root_image_policy);
+        c->mount_image_policy = image_policy_free(c->mount_image_policy);
+        c->extension_image_policy = image_policy_free(c->extension_image_policy);
+}
+
+/* Removes every configured runtime directory, and the symlinks declared for it, below runtime_prefix.
+ * The results of rm_rf() and unlink() are ignored; only -ENOMEM from building a path is reported. A
+ * NULL runtime_prefix makes this a no-op. Returns 0 otherwise. */
+int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
+        assert(c);
+
+        if (!runtime_prefix)
+                return 0;
+
+        /* This does not depend on the individual item, hence evaluate it once instead of per path. */
+        bool is_private = exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME);
+
+        FOREACH_ARRAY(i, c->directories[EXEC_DIRECTORY_RUNTIME].items, c->directories[EXEC_DIRECTORY_RUNTIME].n_items) {
+                _cleanup_free_ char *p = NULL;
+
+                p = is_private ? path_join(runtime_prefix, "private", i->path) : path_join(runtime_prefix, i->path);
+                if (!p)
+                        return -ENOMEM;
+
+                /* We execute this synchronously, since we need to be sure this is gone when we start the
+                 * service next. */
+                (void) rm_rf(p, REMOVE_ROOT);
+
+                STRV_FOREACH(symlink, i->symlinks) {
+                        _cleanup_free_ char *symlink_abs = NULL;
+                        symlink_abs = is_private ? path_join(runtime_prefix, "private", *symlink)
+                                                 : path_join(runtime_prefix, *symlink);
+                        if (!symlink_abs)
+                                return -ENOMEM;
+
+                        (void) unlink(symlink_abs);
+                }
+        }
+
+        return 0;
+}
+
+/* Removes the unit's propagation directory below /run/systemd/propagate/. Does nothing for a NULL unit or
+ * a non-system manager. Only -ENOMEM is reported; rmdir() failures other than ENOENT are merely logged. */
+int exec_context_destroy_mount_ns_dir(Unit *u) {
+        _cleanup_free_ char *dir = NULL;
+        if (!u || !MANAGER_IS_SYSTEM(u->manager))
+                return 0;
+
+        dir = path_join("/run/systemd/propagate/", u->id);
+        if (!dir)
+                return -ENOMEM;
+
+        /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent. */
+        if (rmdir(dir) < 0 && errno != ENOENT)
+                log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", dir);
+        return 0;
+}
+
+/* Releases the path and argv owned by the command; the ExecCommand object itself stays allocated. */
+void exec_command_done(ExecCommand *c) {
+        assert(c);
+        c->argv = strv_free(c->argv);
+        c->path = mfree(c->path);
+}
+
+void exec_command_done_array(ExecCommand *c, size_t n) {
+        FOREACH_ARRAY(cmd, c, n) /* clear each element in place; the array itself is not freed */
+                exec_command_done(cmd);
+}
+
+/* Frees every command of the list starting at c. Always returns NULL, for assigning back to the head. */
+ExecCommand* exec_command_free_list(ExecCommand *c) {
+        for (;;) {
+                ExecCommand *head = LIST_POP(command, c);
+                if (!head)
+                        return NULL;
+                exec_command_done(head);
+                free(head);
+        }
+}
+
+void exec_command_free_array(ExecCommand **c, size_t n) {
+        FOREACH_ARRAY(head, c, n) /* free each list and store the returned NULL back into its slot */
+                *head = exec_command_free_list(*head);
+}
+
+void exec_command_reset_status_array(ExecCommand *c, size_t n) {
+        FOREACH_ARRAY(cmd, c, n) /* c is a flat array of n commands, not an array of lists */
+                exec_status_reset(&cmd->exec_status);
+}
+
+void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
+        FOREACH_ARRAY(head, c, n) /* each of the n slots holds the head of a command list */
+                LIST_FOREACH(command, cmd, *head)
+                        exec_status_reset(&cmd->exec_status);
+}
+
+typedef struct InvalidEnvInfo {
+        const Unit *unit; /* unit the message is logged for */
+        const char *path; /* environment file the invalid assignment was read from */
+} InvalidEnvInfo;
+
+/* Callback for strv_env_clean_with_callback(): logs an invalid assignment and the file it came from. */
+static void invalid_env(const char *p, void *userdata) {
+        const InvalidEnvInfo *ctx = userdata;
+        log_unit_error(ctx->unit, "Ignoring invalid environment assignment '%s': %s", p, ctx->path);
+}
+
+/* Returns the fd name associated with stdin, stdout or stderr (selected by fd_index): the configured
+ * name if there is one, otherwise "stdin", "stdout" or "stderr". Returns NULL if the stream is not set
+ * up as a named fd, or if fd_index is none of the three. */
+const char* exec_context_fdname(const ExecContext *c, int fd_index) {
+        assert(c);
+
+        if (fd_index == STDIN_FILENO) {
+                if (c->std_input != EXEC_INPUT_NAMED_FD)
+                        return NULL;
+                return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
+        }
+
+        if (fd_index == STDOUT_FILENO) {
+                if (c->std_output != EXEC_OUTPUT_NAMED_FD)
+                        return NULL;
+                return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
+        }
+
+        if (fd_index == STDERR_FILENO) {
+                if (c->std_error != EXEC_OUTPUT_NAMED_FD)
+                        return NULL;
+                return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
+        }
+
+        /* Not one of the standard streams */
+        return NULL;
+}
+
+/* Loads all files listed in c->environment_files and merges their assignments, in order, into a single
+ * newly allocated list that is returned in *ret (NULL if nothing was loaded). Entries prefixed with '-'
+ * are optional: whatever goes wrong for them is skipped silently. Returns 0 on success, or a negative
+ * errno-style error code on failure. */
+static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
+        _cleanup_strv_free_ char **env = NULL;
+        int r;
+
+        assert(c);
+        assert(ret);
+
+        STRV_FOREACH(entry, c->environment_files) {
+                _cleanup_globfree_ glob_t matches = {};
+                bool optional = (*entry)[0] == '-';
+                const char *pattern = optional ? *entry + 1 : *entry;
+
+                /* Relative paths are refused */
+                if (!path_is_absolute(pattern)) {
+                        if (optional)
+                                continue;
+                        return -EINVAL;
+                }
+
+                /* The entry may be a glob pattern, in which case every matching file is loaded */
+                r = safe_glob(pattern, 0, &matches);
+                if (r < 0 && optional)
+                        continue;
+                if (r < 0)
+                        return r;
+
+                /* A pattern without any match is expected to be reported as -ENOENT above */
+                assert(matches.gl_pathc > 0);
+
+                FOREACH_ARRAY(path, matches.gl_pathv, matches.gl_pathc) {
+                        _cleanup_strv_free_ char **file_env = NULL;
+
+                        r = load_env_file(NULL, *path, &file_env);
+                        if (r < 0 && optional)
+                                continue;
+                        if (r < 0)
+                                return r;
+
+                        /* Clean up the list, logging every invalid assignment along with the file name */
+                        if (file_env) {
+                                InvalidEnvInfo info = {
+                                        .unit = unit,
+                                        .path = *path,
+                                };
+
+                                file_env = strv_env_clean_with_callback(file_env, invalid_env, &info);
+                        }
+
+                        /* Nothing collected so far: take over this list as it is */
+                        if (!env) {
+                                env = TAKE_PTR(file_env);
+                                continue;
+                        }
+
+                        char **merged = strv_env_merge(env, file_env);
+                        if (!merged)
+                                return -ENOMEM;
+
+                        strv_free_and_replace(env, merged);
+                }
+        }
+
+        *ret = TAKE_PTR(env);
+
+        return 0;
+}
+
+/* Returns true if tty may refer to the same device as /dev/console; errs on the side of true. */
+static bool tty_may_match_dev_console(const char *tty) {
+        _cleanup_free_ char *console = NULL;
+
+        if (!tty)
+                return true;
+
+        /* Naming the console itself trivially matches; an unresolvable console is assumed to match. */
+        tty = skip_dev_prefix(tty);
+        if (streq(tty, "console") || resolve_dev_console(&console) < 0)
+                return true;
+
+        if (path_equal(console, tty))
+                return true;
+
+        /* "tty0" refers to the active VC, so any VC may be the same at times. */
+        return streq(console, "tty0") && tty_is_vc(tty);
+}
+
+/* True if any of the TTY reset/vhangup/disallocate flags is set or a standard stream is a terminal. */
+static bool exec_context_may_touch_tty(const ExecContext *ec) {
+        assert(ec);
+
+        if (ec->tty_reset || ec->tty_vhangup || ec->tty_vt_disallocate)
+                return true;
+        if (is_terminal_input(ec->std_input))
+                return true;
+        return is_terminal_output(ec->std_output) || is_terminal_output(ec->std_error);
+}
+
+/* True if the context may touch a TTY and that TTY may be the same device as /dev/console. */
+bool exec_context_may_touch_console(const ExecContext *ec) {
+        if (!exec_context_may_touch_tty(ec))
+                return false;
+        return tty_may_match_dev_console(exec_context_tty_path(ec));
+}
+
+/* Writes every string of l to f, each one preceded by a single space. */
+static void strv_fprintf(FILE *f, char **l) {
+        assert(f);
+        STRV_FOREACH(item, l)
+                fprintf(f, " %s", *item);
+}
+
+static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
+        assert(f);
+        assert(prefix);
+        assert(name);
+
+        if (strv_isempty(strv)) /* empty lists are omitted from the dump entirely */
+                return;
+        fprintf(f, "%s%s:", prefix, name);
+        strv_fprintf(f, strv);
+        fputc('\n', f);
+}
+
+/* Dumps the execution parameters to f as "Key: value" lines, each prefixed with prefix (via strempty()). */
+void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix) {
+        assert(p);
+        assert(f);
+
+        prefix = strempty(prefix);
+
+        fprintf(f,
+                "%sRuntimeScope: %s\n"
+                "%sExecFlags: %u\n"
+                "%sSELinuxContextNetwork: %s\n"
+                "%sCgroupSupportedMask: %u\n"
+                "%sCgroupPath: %s\n"
+                "%sCredentialsDirectory: %s\n"
+                "%sEncryptedCredentialsDirectory: %s\n"
+                "%sConfirmSpawn: %s\n"
+                "%sShallConfirmSpawn: %s\n"
+                "%sWatchdogUSec: " USEC_FMT "\n"
+                "%sNotifySocket: %s\n"
+                "%sFallbackSmackProcessLabel: %s\n",
+                prefix, runtime_scope_to_string(p->runtime_scope),
+                prefix, p->flags,
+                prefix, yes_no(p->selinux_context_net),
+                prefix, p->cgroup_supported,
+                prefix, strempty(p->cgroup_path),
+                prefix, strempty(p->received_credentials_directory),
+                prefix, strempty(p->received_encrypted_credentials_directory),
+                prefix, strempty(p->confirm_spawn),
+                prefix, yes_no(p->shall_confirm_spawn),
+                prefix, p->watchdog_usec,
+                prefix, strempty(p->notify_socket),
+                prefix, strempty(p->fallback_smack_process_label));
+
+        strv_dump(f, prefix, "FdNames", p->fd_names);
+        strv_dump(f, prefix, "Environment", p->environment);
+        strv_dump(f, prefix, "Prefix", p->prefix);
+
+        LIST_FOREACH(open_files, file, p->open_files) /* one newline-terminated line per open file */
+                fprintf(f, "%sOpenFile: %s %s\n", prefix, file->path, strna(open_file_flags_to_string(file->flags)));
+        strv_dump(f, prefix, "FilesEnv", p->files_env);
+}
+
+/* Writes a human-readable dump of the execution context to f, one "Key: value" line per setting, each
+ * prefixed with prefix (normalized via strempty()). Many settings are only printed when they are set. */
+void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
+        int r;
+
+        assert(c);
+        assert(f);
+
+        prefix = strempty(prefix);
+
+        fprintf(f,
+                "%sUMask: %04o\n"
+                "%sWorkingDirectory: %s\n"
+                "%sRootDirectory: %s\n"
+                "%sRootEphemeral: %s\n"
+                "%sNonBlocking: %s\n"
+                "%sPrivateTmp: %s\n"
+                "%sPrivateDevices: %s\n"
+                "%sProtectKernelTunables: %s\n"
+                "%sProtectKernelModules: %s\n"
+                "%sProtectKernelLogs: %s\n"
+                "%sProtectClock: %s\n"
+                "%sProtectControlGroups: %s\n"
+                "%sPrivateNetwork: %s\n"
+                "%sPrivateUsers: %s\n"
+                "%sProtectHome: %s\n"
+                "%sProtectSystem: %s\n"
+                "%sMountAPIVFS: %s\n"
+                "%sIgnoreSIGPIPE: %s\n"
+                "%sMemoryDenyWriteExecute: %s\n"
+                "%sRestrictRealtime: %s\n"
+                "%sRestrictSUIDSGID: %s\n"
+                "%sKeyringMode: %s\n"
+                "%sProtectHostname: %s\n"
+                "%sProtectProc: %s\n"
+                "%sProcSubset: %s\n",
+                prefix, c->umask,
+                prefix, empty_to_root(c->working_directory),
+                prefix, empty_to_root(c->root_directory),
+                prefix, yes_no(c->root_ephemeral),
+                prefix, yes_no(c->non_blocking),
+                prefix, yes_no(c->private_tmp),
+                prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->protect_kernel_tunables),
+                prefix, yes_no(c->protect_kernel_modules),
+                prefix, yes_no(c->protect_kernel_logs),
+                prefix, yes_no(c->protect_clock),
+                prefix, yes_no(c->protect_control_groups),
+                prefix, yes_no(c->private_network),
+                prefix, yes_no(c->private_users),
+                prefix, protect_home_to_string(c->protect_home),
+                prefix, protect_system_to_string(c->protect_system),
+                prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
+                prefix, yes_no(c->ignore_sigpipe),
+                prefix, yes_no(c->memory_deny_write_execute),
+                prefix, yes_no(c->restrict_realtime),
+                prefix, yes_no(c->restrict_suid_sgid),
+                prefix, exec_keyring_mode_to_string(c->keyring_mode),
+                prefix, yes_no(c->protect_hostname),
+                prefix, protect_proc_to_string(c->protect_proc),
+                prefix, proc_subset_to_string(c->proc_subset));
+
+        if (c->set_login_environment >= 0)
+                fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));
+
+        if (c->root_image)
+                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
+
+        if (c->root_image_options) {
+                fprintf(f, "%sRootImageOptions:", prefix);
+                LIST_FOREACH(mount_options, o, c->root_image_options)
+                        if (!isempty(o->options))
+                                fprintf(f, " %s:%s",
+                                        partition_designator_to_string(o->partition_designator),
+                                        o->options);
+                fprintf(f, "\n");
+        }
+
+        if (c->root_hash) {
+                _cleanup_free_ char *encoded = hexmem(c->root_hash, c->root_hash_size);
+                if (encoded)
+                        fprintf(f, "%sRootHash: %s\n", prefix, encoded);
+        }
+
+        if (c->root_hash_path)
+                fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
+
+        if (c->root_hash_sig) {
+                _cleanup_free_ char *encoded = NULL;
+                /* base64mem() returns a negative errno on failure, and 'encoded' is only valid on success */
+                ssize_t len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
+                if (len > 0)
+                        fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
+        }
+
+        if (c->root_hash_sig_path)
+                fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
+
+        if (c->root_verity)
+                fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
+
+        STRV_FOREACH(e, c->environment)
+                fprintf(f, "%sEnvironment: %s\n", prefix, *e);
+
+        STRV_FOREACH(e, c->environment_files)
+                fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
+
+        STRV_FOREACH(e, c->pass_environment)
+                fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
+
+        STRV_FOREACH(e, c->unset_environment)
+                fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
+
+        fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+                fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
+
+                for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+                        fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
+
+                        STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
+                                fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
+                }
+        }
+
+        fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
+
+        if (c->memory_ksm >= 0)
+                fprintf(f, "%sMemoryKSM: %s\n", prefix, yes_no(c->memory_ksm > 0));
+
+        if (c->nice_set)
+                fprintf(f, "%sNice: %i\n", prefix, c->nice);
+
+        if (c->oom_score_adjust_set)
+                fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
+
+        if (c->coredump_filter_set)
+                fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
+
+        for (unsigned i = 0; i < RLIM_NLIMITS; i++)
+                if (c->rlimit[i]) {
+                        fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
+                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
+                        fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
+                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
+                }
+
+        if (c->ioprio_set) {
+                _cleanup_free_ char *class_str = NULL;
+
+                r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
+                if (r >= 0)
+                        fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
+
+                fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
+        }
+
+        if (c->cpu_sched_set) {
+                _cleanup_free_ char *policy_str = NULL;
+
+                r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
+                if (r >= 0)
+                        fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
+
+                fprintf(f,
+                        "%sCPUSchedulingPriority: %i\n"
+                        "%sCPUSchedulingResetOnFork: %s\n",
+                        prefix, c->cpu_sched_priority,
+                        prefix, yes_no(c->cpu_sched_reset_on_fork));
+        }
+
+        if (c->cpu_set.set) {
+                _cleanup_free_ char *affinity = cpu_set_to_range_string(&c->cpu_set);
+                /* The string may be NULL (e.g. on OOM), hence strnull() as for NUMAMask below */
+                fprintf(f, "%sCPUAffinity: %s\n", prefix, strnull(affinity));
+        }
+
+        if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+                _cleanup_free_ char *nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+
+                fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
+                fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
+        }
+
+        if (c->timer_slack_nsec != NSEC_INFINITY)
+                fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
+
+        fprintf(f,
+                "%sStandardInput: %s\n"
+                "%sStandardOutput: %s\n"
+                "%sStandardError: %s\n",
+                prefix, exec_input_to_string(c->std_input),
+                prefix, exec_output_to_string(c->std_output),
+                prefix, exec_output_to_string(c->std_error));
+
+        /* The fd names are optional (see exec_context_fdname()), hence guard against NULL */
+        if (c->std_input == EXEC_INPUT_NAMED_FD)
+                fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, strnull(c->stdio_fdname[STDIN_FILENO]));
+        if (c->std_output == EXEC_OUTPUT_NAMED_FD)
+                fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, strnull(c->stdio_fdname[STDOUT_FILENO]));
+        if (c->std_error == EXEC_OUTPUT_NAMED_FD)
+                fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, strnull(c->stdio_fdname[STDERR_FILENO]));
+
+        if (c->std_input == EXEC_INPUT_FILE)
+                fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
+        if (c->std_output == EXEC_OUTPUT_FILE)
+                fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+        if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
+                fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+        if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
+                fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+        if (c->std_error == EXEC_OUTPUT_FILE)
+                fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+        if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
+                fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+        if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
+                fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+
+        if (c->tty_path)
+                fprintf(f,
+                        "%sTTYPath: %s\n"
+                        "%sTTYReset: %s\n"
+                        "%sTTYVHangup: %s\n"
+                        "%sTTYVTDisallocate: %s\n"
+                        "%sTTYRows: %u\n"
+                        "%sTTYColumns: %u\n",
+                        prefix, c->tty_path,
+                        prefix, yes_no(c->tty_reset),
+                        prefix, yes_no(c->tty_vhangup),
+                        prefix, yes_no(c->tty_vt_disallocate),
+                        prefix, c->tty_rows,
+                        prefix, c->tty_cols);
+
+        if (IN_SET(c->std_output,
+                   EXEC_OUTPUT_KMSG,
+                   EXEC_OUTPUT_JOURNAL,
+                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
+                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
+            IN_SET(c->std_error,
+                   EXEC_OUTPUT_KMSG,
+                   EXEC_OUTPUT_JOURNAL,
+                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
+                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
+
+                _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
+
+                r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
+                if (r >= 0)
+                        fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
+
+                r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
+                if (r >= 0)
+                        fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
+        }
+
+        if (c->log_level_max >= 0) {
+                _cleanup_free_ char *t = NULL;
+
+                (void) log_level_to_string_alloc(c->log_level_max, &t);
+
+                fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
+        }
+
+        if (c->log_ratelimit_interval_usec > 0)
+                fprintf(f,
+                        "%sLogRateLimitIntervalSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
+
+        if (c->log_ratelimit_burst > 0)
+                fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
+
+        if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
+                fprintf(f, "%sLogFilterPatterns:", prefix);
+
+                char *pattern;
+                SET_FOREACH(pattern, c->log_filter_allowed_patterns)
+                        fprintf(f, " %s", pattern);
+                SET_FOREACH(pattern, c->log_filter_denied_patterns)
+                        fprintf(f, " ~%s", pattern);
+                fputc('\n', f);
+        }
+
+        FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) {
+                fprintf(f, "%sLogExtraFields: ", prefix);
+                fwrite(field->iov_base, 1, field->iov_len, f);
+                fputc('\n', f);
+        }
+
+        if (c->log_namespace)
+                fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
+
+        if (c->secure_bits) {
+                _cleanup_free_ char *str = NULL;
+
+                r = secure_bits_to_string_alloc(c->secure_bits, &str);
+                if (r >= 0)
+                        fprintf(f, "%sSecure Bits: %s\n", prefix, str);
+        }
+
+        if (c->capability_bounding_set != CAP_MASK_UNSET) {
+                _cleanup_free_ char *str = NULL;
+
+                r = capability_set_to_string(c->capability_bounding_set, &str);
+                if (r >= 0)
+                        fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
+        }
+
+        if (c->capability_ambient_set != 0) {
+                _cleanup_free_ char *str = NULL;
+
+                r = capability_set_to_string(c->capability_ambient_set, &str);
+                if (r >= 0)
+                        fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
+        }
+
+        if (c->user)
+                fprintf(f, "%sUser: %s\n", prefix, c->user);
+        if (c->group)
+                fprintf(f, "%sGroup: %s\n", prefix, c->group);
+
+        fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
+
+        strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
+
+        if (c->pam_name)
+                fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
+
+        strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
+        strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
+        strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
+        strv_dump(f, prefix, "ExecPaths", c->exec_paths);
+        strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
+        strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
+
+        FOREACH_ARRAY(mount, c->bind_mounts, c->n_bind_mounts)
+                fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
+                        mount->read_only ? "BindReadOnlyPaths" : "BindPaths",
+                        mount->ignore_enoent ? "-": "",
+                        mount->source,
+                        mount->destination,
+                        mount->recursive ? "rbind" : "norbind");
+
+        FOREACH_ARRAY(tmpfs, c->temporary_filesystems, c->n_temporary_filesystems)
+                fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
+                        tmpfs->path,
+                        isempty(tmpfs->options) ? "" : ":",
+                        strempty(tmpfs->options));
+
+        if (c->utmp_id)
+                fprintf(f,
+                        "%sUtmpIdentifier: %s\n",
+                        prefix, c->utmp_id);
+
+        if (c->selinux_context)
+                fprintf(f,
+                        "%sSELinuxContext: %s%s\n",
+                        prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
+
+        if (c->apparmor_profile)
+                fprintf(f,
+                        "%sAppArmorProfile: %s%s\n",
+                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
+
+        if (c->smack_process_label)
+                fprintf(f,
+                        "%sSmackProcessLabel: %s%s\n",
+                        prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
+
+        if (c->personality != PERSONALITY_INVALID)
+                fprintf(f,
+                        "%sPersonality: %s\n",
+                        prefix, strna(personality_to_string(c->personality)));
+
+        fprintf(f,
+                "%sLockPersonality: %s\n",
+                prefix, yes_no(c->lock_personality));
+
+        if (c->syscall_filter) {
+                fprintf(f,
+                        "%sSystemCallFilter: ",
+                        prefix);
+
+                if (!c->syscall_allow_list)
+                        fputc('~', f);
+
+#if HAVE_SECCOMP
+                void *id, *val;
+                bool first = true;
+                HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
+                        _cleanup_free_ char *name = NULL;
+                        const char *errno_name = NULL;
+                        int num = PTR_TO_INT(val);
+
+                        if (first)
+                                first = false;
+                        else
+                                fputc(' ', f);
+
+                        name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+                        fputs(strna(name), f);
+
+                        if (num >= 0) {
+                                errno_name = seccomp_errno_or_action_to_string(num);
+                                if (errno_name)
+                                        fprintf(f, ":%s", errno_name);
+                                else
+                                        fprintf(f, ":%d", num);
+                        }
+                }
+#endif
+
+                fputc('\n', f);
+        }
+
+        if (c->syscall_archs) {
+                fprintf(f,
+                        "%sSystemCallArchitectures:",
+                        prefix);
+
+#if HAVE_SECCOMP
+                void *id;
+                SET_FOREACH(id, c->syscall_archs)
+                        fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
+#endif
+                fputc('\n', f);
+        }
+
+        if (exec_context_restrict_namespaces_set(c)) {
+                _cleanup_free_ char *s = NULL;
+
+                r = namespace_flags_to_string(c->restrict_namespaces, &s);
+                if (r >= 0)
+                        fprintf(f, "%sRestrictNamespaces: %s\n",
+                                prefix, strna(s));
+        }
+
+#if HAVE_LIBBPF
+        if (exec_context_restrict_filesystems_set(c)) {
+                char *fs;
+                SET_FOREACH(fs, c->restrict_filesystems)
+                        fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
+        }
+#endif
+
+        if (c->network_namespace_path)
+                fprintf(f,
+                        "%sNetworkNamespacePath: %s\n",
+                        prefix, c->network_namespace_path);
+
+        if (c->syscall_errno > 0) {
+                fprintf(f, "%sSystemCallErrorNumber: ", prefix);
+
+#if HAVE_SECCOMP
+                const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
+                if (errno_name)
+                        fputs(errno_name, f);
+                else
+                        fprintf(f, "%d", c->syscall_errno);
+#endif
+                fputc('\n', f);
+        }
+
+        FOREACH_ARRAY(mount, c->mount_images, c->n_mount_images) {
+                fprintf(f, "%sMountImages: %s%s:%s", prefix,
+                        mount->ignore_enoent ? "-": "",
+                        mount->source,
+                        mount->destination);
+                LIST_FOREACH(mount_options, o, mount->mount_options)
+                        fprintf(f, ":%s:%s",
+                                partition_designator_to_string(o->partition_designator),
+                                strempty(o->options));
+                fprintf(f, "\n");
+        }
+
+        FOREACH_ARRAY(mount, c->extension_images, c->n_extension_images) {
+                fprintf(f, "%sExtensionImages: %s%s", prefix,
+                        mount->ignore_enoent ? "-": "",
+                        mount->source);
+                LIST_FOREACH(mount_options, o, mount->mount_options)
+                        fprintf(f, ":%s:%s",
+                                partition_designator_to_string(o->partition_designator),
+                                strempty(o->options));
+                fprintf(f, "\n");
+        }
+
+        strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
+}
+
+/* Tells whether a process forked off for this context would keep its privileges, i.e. run either
+ * under an unchanged UID or as root. */
+bool exec_context_maintains_privileges(const ExecContext *c) {
+        assert(c);
+
+        /* Without a configured user the UID is left as it is */
+        if (!c->user)
+                return true;
+
+        /* Otherwise only root counts, whether given by name or as "0" */
+        if (streq(c->user, "root"))
+                return true;
+        return streq(c->user, "0");
+}
+
+/* Returns the configured I/O priority if one is set, otherwise the normalized priority obtained from
+ * ioprio_get(), falling back to IOPRIO_DEFAULT_CLASS_AND_PRIO if that query fails. */
+int exec_context_get_effective_ioprio(const ExecContext *c) {
+        assert(c);
+
+        if (c->ioprio_set)
+                return c->ioprio;
+
+        /* Nothing configured: query with IOPRIO_WHO_PROCESS and who == 0 */
+        int current = ioprio_get(IOPRIO_WHO_PROCESS, 0);
+
+        return current < 0 ? IOPRIO_DEFAULT_CLASS_AND_PRIO :
+                             ioprio_normalize(current);
+}
+
+/* Returns the effective API VFS mount setting of the context (the value the dump prints as
+ * "MountAPIVFS"): an explicit setting is returned as is, otherwise the result depends on whether a
+ * root file system is configured. */
+bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
+        assert(c);
+
+        /* An explicitly configured value always takes precedence */
+        if (c->mount_apivfs_set)
+                return c->mount_apivfs;
+
+        /* Otherwise the default is "yes" exactly when a root directory or image is specified */
+        return exec_context_with_rootfs(c);
+}
+
+/* Frees all extra log fields of the context, including the array itself, and resets the count to 0. */
+void exec_context_free_log_extra_fields(ExecContext *c) {
+        assert(c);
+
+        FOREACH_ARRAY(iov, c->log_extra_fields, c->n_log_extra_fields)
+                free(iov->iov_base);
+        c->n_log_extra_fields = 0;
+        c->log_extra_fields = mfree(c->log_extra_fields);
+}
+
+/* Resets the TTY of the context and then, provided the context may touch a TTY and has a TTY path,
+ * changes the device node back to mode TTY_MODE and owner 0:TTY_GID. Nothing is returned: failures are
+ * only logged. */
+void exec_context_revert_tty(ExecContext *c) {
+        assert(c);
+
+        /* Step one: reset the TTY, which may kick everybody else off it */
+        exec_context_tty_reset(c, /* parameters= */ NULL);
+
+        /* Step two: undo what chown_terminal() did earlier. This is only done when a path is configured:
+         * a TTY handed to us as file descriptor is assumed to be opened and managed by whoever passed it,
+         * who hence knows better when and how to chmod()/chown() it back. */
+        if (!exec_context_may_touch_tty(c))
+                return;
+
+        const char *tty = exec_context_tty_path(c);
+        if (!tty)
+                return;
+
+        _cleanup_close_ int tty_fd = open(tty, O_PATH|O_CLOEXEC); /* Pin the inode */
+        if (tty_fd < 0)
+                return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+                                             "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
+                                             tty);
+
+        struct stat st;
+        if (fstat(tty_fd, &st) < 0)
+                return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", tty);
+
+        /* Superficial sanity check that this looks like a TTY at all. Only "is a character device" is
+         * tested: a proper check would mean either opening the TTY and calling isatty(), which has all
+         * kinds of side-effects and is slow, or hardcoding dev_t major information, which we'd rather
+         * avoid. Why bother with this at all?
+         * → https://github.com/systemd/systemd/issues/19213 */
+        if (!S_ISCHR(st.st_mode))
+                return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", tty);
+
+        /* Back to UID 0 and the TTY group, with the default TTY access mode */
+        int r = fchmod_and_chown(tty_fd, TTY_MODE, 0, TTY_GID);
+        if (r < 0)
+                log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s to " UID_FMT ":" GID_FMT ", ignoring: %m", tty, (uid_t) 0, (gid_t) TTY_GID);
+}
+
+/* Collects, for every directory type selected in 'mask' that has a prefix, the paths belonging to
+ * the context's configured directories: the directory below the prefix, its twin below
+ * "<prefix>/private" (all types except configuration), and every symlink of the item. On success
+ * the new string list is stored in *ret and 0 is returned; otherwise a negative errno-style code. */
+int exec_context_get_clean_directories(
+                ExecContext *c,
+                char **prefix,
+                ExecCleanMask mask,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **paths = NULL;
+        int r;
+
+        assert(c);
+        assert(prefix);
+        assert(ret);
+
+        for (ExecDirectoryType type = 0; type < _EXEC_DIRECTORY_TYPE_MAX; type++) {
+                /* Skip types that were not requested or for which no prefix is available */
+                if (!FLAGS_SET(mask, 1U << type) || !prefix[type])
+                        continue;
+
+                FOREACH_ARRAY(item, c->directories[type].items, c->directories[type].n_items) {
+                        char *joined;
+
+                        /* The directory itself */
+                        joined = path_join(prefix[type], item->path);
+                        if (!joined)
+                                return -ENOMEM;
+
+                        r = strv_consume(&paths, joined);
+                        if (r < 0)
+                                return r;
+
+                        /* Private directories are listed unconditionally as well. */
+                        if (type != EXEC_DIRECTORY_CONFIGURATION) {
+                                joined = path_join(prefix[type], "private", item->path);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                r = strv_consume(&paths, joined);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        /* And finally every symlink pointing at the item */
+                        STRV_FOREACH(link, item->symlinks) {
+                                joined = path_join(prefix[type], *link);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                r = strv_consume(&paths, joined);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        }
+
+        *ret = TAKE_PTR(paths);
+        return 0;
+}
+
+/* Computes a bit mask with bit 't' set for every directory type 't' that has at least one
+ * directory configured in the context. Stores it in *ret and always returns 0. */
+int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
+        ExecCleanMask populated = 0;
+
+        assert(c);
+        assert(ret);
+
+        for (ExecDirectoryType type = 0; type < _EXEC_DIRECTORY_TYPE_MAX; type++) {
+                if (c->directories[type].n_items > 0) {
+                        populated |= 1U << type;
+                }
+        }
+
+        *ret = populated;
+        return 0;
+}
+
+/* Returns the OOM score adjustment to report: the configured one if set, otherwise the value
+ * obtained through get_oom_score_adjust(). If that lookup fails, the failure is logged at debug
+ * level and the initial value 0 is returned. */
+int exec_context_get_oom_score_adjust(const ExecContext *c) {
+        int current = 0, r;
+
+        assert(c);
+
+        if (!c->oom_score_adjust_set) {
+                r = get_oom_score_adjust(&current);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to read /proc/self/oom_score_adj, ignoring: %m");
+
+                return current;
+        }
+
+        return c->oom_score_adjust;
+}
+
+/* Returns the coredump filter to report: the configured one if set, otherwise the value parsed from
+ * /proc/self/coredump_filter. Read and parse failures are logged at debug level; the variable
+ * passed to the parser starts out as COREDUMP_FILTER_MASK_DEFAULT. */
+uint64_t exec_context_get_coredump_filter(const ExecContext *c) {
+        _cleanup_free_ char *line = NULL;
+        uint64_t filter = COREDUMP_FILTER_MASK_DEFAULT;
+        int r;
+
+        assert(c);
+
+        if (c->coredump_filter_set)
+                return c->coredump_filter;
+
+        r = read_one_line_file("/proc/self/coredump_filter", &line);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read /proc/self/coredump_filter, ignoring: %m");
+                return filter;
+        }
+
+        r = safe_atoux64(line, &filter);
+        if (r < 0)
+                log_debug_errno(r, "Failed to parse \"%s\" from /proc/self/coredump_filter, ignoring: %m", line);
+
+        return filter;
+}
+
+/* Returns the nice level to report: the configured one if set, otherwise the priority of the
+ * calling process as reported by getpriority(). On failure a debug message is logged and 0 is
+ * returned. */
+int exec_context_get_nice(const ExecContext *c) {
+        int prio;
+
+        assert(c);
+
+        if (c->nice_set)
+                return c->nice;
+
+        /* errno is cleared up front and inspected afterwards to tell failure apart from a value */
+        errno = 0;
+        prio = getpriority(PRIO_PROCESS, 0);
+        if (errno <= 0)
+                return prio;
+
+        log_debug_errno(errno, "Failed to get process nice value, ignoring: %m");
+        return 0;
+}
+
+/* Returns the CPU scheduling policy to report: the configured one if scheduling settings are set,
+ * otherwise what sched_getscheduler() returns for the calling process. On failure a debug message
+ * is logged and SCHED_OTHER is returned. */
+int exec_context_get_cpu_sched_policy(const ExecContext *c) {
+        int policy;
+
+        assert(c);
+
+        if (c->cpu_sched_set)
+                return c->cpu_sched_policy;
+
+        policy = sched_getscheduler(0);
+        if (policy >= 0)
+                return policy;
+
+        log_debug_errno(errno, "Failed to get scheduler policy, ignoring: %m");
+        return SCHED_OTHER;
+}
+
+/* Returns the CPU scheduling priority to report: the configured one if scheduling settings are
+ * set, otherwise the priority sched_getparam() yields for the calling process. On failure a debug
+ * message is logged and 0 is returned. */
+int exec_context_get_cpu_sched_priority(const ExecContext *c) {
+        struct sched_param param = {};
+
+        assert(c);
+
+        if (c->cpu_sched_set)
+                return c->cpu_sched_priority;
+
+        if (sched_getparam(0, &param) < 0) {
+                log_debug_errno(errno, "Failed to get scheduler priority, ignoring: %m");
+                return 0;
+        }
+
+        return param.sched_priority;
+}
+
+/* Returns the timer slack to report: the configured value when one is set (i.e. different from
+ * NSEC_INFINITY), otherwise what prctl(PR_GET_TIMERSLACK) reports for the calling process. If the
+ * query fails, a debug message is logged and 0 is returned. */
+uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c) {
+        int r;
+
+        assert(c);
+
+        if (c->timer_slack_nsec != NSEC_INFINITY)
+                return c->timer_slack_nsec;
+
+        r = prctl(PR_GET_TIMERSLACK);
+        if (r < 0)
+                /* prctl() signals failure with -1 and leaves the cause in errno; its return value is
+                 * not an error code, hence errno has to be what gets logged. */
+                log_debug_errno(errno, "Failed to get timer slack, ignoring: %m");
+
+        /* A failed call (negative r) is clamped to 0 */
+        return (uint64_t) MAX(r, 0);
+}
+
+/* Renders the context's system call filter as a sorted, newly allocated string list. Each entry is
+ * either "name" or "name:action", where the action is the symbolic name if one is known and the
+ * plain number otherwise. Without seccomp support the list is empty. Returns NULL if an allocation
+ * fails. */
+char** exec_context_get_syscall_filter(const ExecContext *c) {
+        _cleanup_strv_free_ char **entries = NULL;
+
+        assert(c);
+
+#if HAVE_SECCOMP
+        void *key, *value;
+        HASHMAP_FOREACH_KEY(value, key, c->syscall_filter) {
+                _cleanup_free_ char *syscall_name = NULL;
+                char *entry;
+                int action = PTR_TO_INT(value);
+
+                /* In an allow-list, a syscall with a value >= 0 is denied, hence not listed. */
+                if (c->syscall_allow_list && action >= 0)
+                        continue;
+
+                syscall_name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(key) - 1);
+                if (!syscall_name)
+                        continue;
+
+                if (action < 0)
+                        /* No action attached: the bare name is the entry */
+                        entry = TAKE_PTR(syscall_name);
+                else {
+                        const char *action_name = seccomp_errno_or_action_to_string(action);
+
+                        if (action_name) {
+                                entry = strjoin(syscall_name, ":", action_name);
+                                if (!entry)
+                                        return NULL;
+                        } else if (asprintf(&entry, "%s:%d", syscall_name, action) < 0)
+                                return NULL;
+                }
+
+                if (strv_consume(&entries, entry) < 0)
+                        return NULL;
+        }
+
+        strv_sort(entries);
+#endif
+
+        if (entries)
+                return TAKE_PTR(entries);
+
+        return strv_new(NULL);
+}
+
+/* Returns the names of the system call architectures configured in the context as a sorted, newly
+ * allocated string list; entries without a known name are left out. Without seccomp support the
+ * list is empty. Returns NULL if an allocation fails. */
+char** exec_context_get_syscall_archs(const ExecContext *c) {
+        _cleanup_strv_free_ char **archs = NULL;
+
+        assert(c);
+
+#if HAVE_SECCOMP
+        void *key;
+        SET_FOREACH(key, c->syscall_archs) {
+                const char *arch_name = seccomp_arch_to_string(PTR_TO_UINT32(key) - 1);
+
+                if (arch_name && strv_extend(&archs, arch_name) < 0)
+                        return NULL;
+        }
+
+        strv_sort(archs);
+#endif
+
+        if (archs)
+                return TAKE_PTR(archs);
+
+        return strv_new(NULL);
+}
+
+/* Returns the names of the system calls in the context's syscall log table as a sorted, newly
+ * allocated string list; numbers that cannot be resolved to a name are left out. Without seccomp
+ * support the list is empty. Returns NULL if an allocation fails. */
+char** exec_context_get_syscall_log(const ExecContext *c) {
+        _cleanup_strv_free_ char **names = NULL;
+
+        assert(c);
+
+#if HAVE_SECCOMP
+        void *key, *value;
+        HASHMAP_FOREACH_KEY(value, key, c->syscall_log) {
+                char *syscall_name;
+
+                syscall_name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(key) - 1);
+
+                /* strv_consume() takes over the freshly resolved name */
+                if (syscall_name && strv_consume(&names, syscall_name) < 0)
+                        return NULL;
+        }
+
+        strv_sort(names);
+#endif
+
+        if (names)
+                return TAKE_PTR(names);
+
+        return strv_new(NULL);
+}
+
+/* Returns the names of the address families stored in the context as a sorted, newly allocated
+ * string list; families without a known name are left out. Returns NULL if an allocation fails. */
+char** exec_context_get_address_families(const ExecContext *c) {
+        _cleanup_strv_free_ char **families = NULL;
+        void *key;
+
+        assert(c);
+
+        SET_FOREACH(key, c->address_families) {
+                const char *family_name = af_to_name(PTR_TO_INT(key));
+
+                if (family_name && strv_extend(&families, family_name) < 0)
+                        return NULL;
+        }
+
+        strv_sort(families);
+
+        if (families)
+                return TAKE_PTR(families);
+
+        return strv_new(NULL);
+}
+
+/* Returns the context's restricted file systems set as a sorted, newly allocated string list.
+ * Without libbpf support the list is empty. Returns NULL if an allocation fails. */
+char** exec_context_get_restrict_filesystems(const ExecContext *c) {
+        _cleanup_strv_free_ char **filesystems = NULL;
+
+        assert(c);
+
+#if HAVE_LIBBPF
+        filesystems = set_get_strv(c->restrict_filesystems);
+        if (!filesystems)
+                return NULL;
+
+        strv_sort(filesystems);
+#endif
+
+        if (filesystems)
+                return TAKE_PTR(filesystems);
+
+        return strv_new(NULL);
+}
+
+/* Reinitializes the status record for a newly started process: everything but the PID is reset,
+ * and the start timestamp is set to the current time. */
+void exec_status_start(ExecStatus *s, pid_t pid) {
+        ExecStatus fresh = { .pid = pid };
+
+        assert(s);
+
+        *s = fresh;
+        dual_timestamp_now(&s->start_timestamp);
+}
+
+/* Records the exit of process 'pid' in the status record: exit timestamp, code and status. If the
+ * record currently belongs to a different PID it is reset for the new one first. If the context
+ * carries a utmp ID, the dead process is additionally reported via utmp_put_dead_process(). */
+void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
+        assert(s);
+
+        if (s->pid != pid) {
+                ExecStatus fresh = { .pid = pid };
+
+                *s = fresh;
+        }
+
+        dual_timestamp_now(&s->exit_timestamp);
+
+        s->code = code;
+        s->status = status;
+
+        if (!context || !context->utmp_id)
+                return;
+
+        (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
+}
+
+/* Puts the status record back into its empty state. */
+void exec_status_reset(ExecStatus *s) {
+        ExecStatus blank = {};
+
+        assert(s);
+
+        *s = blank;
+}
+
+/* Writes a human-readable description of the status record to 'f', each line starting with
+ * 'prefix' (NULL is treated as empty). Nothing is written if no PID is recorded; start and exit
+ * details are only written when the respective timestamp is set. */
+void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
+        assert(s);
+        assert(f);
+
+        if (s->pid <= 0)
+                return;
+
+        prefix = strempty(prefix);
+
+        fprintf(f,
+                "%sPID: "PID_FMT"\n",
+                prefix, s->pid);
+
+        if (dual_timestamp_is_set(&s->start_timestamp)) {
+                fprintf(f,
+                        "%sStart Timestamp: %s\n",
+                        prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
+        }
+
+        if (!dual_timestamp_is_set(&s->exit_timestamp))
+                return;
+
+        fprintf(f,
+                "%sExit Timestamp: %s\n"
+                "%sExit Code: %s\n"
+                "%sExit Status: %i\n",
+                prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
+                prefix, sigchld_code_to_string(s->code),
+                prefix, s->status);
+}
+
+/* Writes the quoted command line of one command to 'f', followed by its execution status, the
+ * latter indented by one extra tab. */
+static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
+        _cleanup_free_ char *line = NULL;
+        const char *nested_prefix;
+
+        assert(c);
+        assert(f);
+
+        prefix = strempty(prefix);
+        nested_prefix = strjoina(prefix, "\t");
+
+        /* A failed quoting leaves 'line' NULL, which strnull() below takes care of */
+        line = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
+
+        fprintf(f,
+                "%sCommand Line: %s\n",
+                prefix, strnull(line));
+
+        exec_status_dump(&c->exec_status, f, nested_prefix);
+}
+
+/* Dumps every command of the list starting at 'c' to 'f', in list order. */
+void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
+        assert(f);
+
+        prefix = strempty(prefix);
+
+        LIST_FOREACH(command, cmd, c) {
+                exec_command_dump(cmd, f, prefix);
+        }
+}
+
+/* Appends command 'e' at the end of the list '*l'; an empty list simply becomes 'e'. */
+void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
+        ExecCommand *tail;
+
+        assert(l);
+        assert(e);
+
+        if (!*l) {
+                *l = e;
+                return;
+        }
+
+        /* The order of the commands matters, hence insert after the current last element */
+        tail = LIST_FIND_TAIL(command, *l);
+        LIST_INSERT_AFTER(command, *l, tail, e);
+}
+
+/* Replaces path and argument vector of the command. The new argument vector is built from 'path'
+ * plus the variadic arguments via strv_new_ap(). Nothing is modified if an allocation fails, in
+ * which case -ENOMEM is returned. */
+int exec_command_set(ExecCommand *c, const char *path, ...) {
+        _cleanup_strv_free_ char **args = NULL;
+        _cleanup_free_ char *path_copy = NULL;
+        va_list ap;
+
+        assert(c);
+        assert(path);
+
+        va_start(ap, path);
+        args = strv_new_ap(path, ap);
+        va_end(ap);
+
+        if (!args)
+                return -ENOMEM;
+
+        path_copy = strdup(path);
+        if (!path_copy)
+                return -ENOMEM;
+
+        /* Both allocations succeeded, now swap the new values in */
+        free_and_replace(c->path, path_copy);
+
+        return strv_free_and_replace(c->argv, args);
+}
+
+/* Appends 'path' and the variadic arguments following it to the command's argument vector. Returns
+ * 0 on success and a negative errno-style code on failure. */
+int exec_command_append(ExecCommand *c, const char *path, ...) {
+        _cleanup_strv_free_ char **extra = NULL;
+        va_list ap;
+        int r;
+
+        assert(c);
+        assert(path);
+
+        va_start(ap, path);
+        extra = strv_new_ap(path, ap);
+        va_end(ap);
+
+        if (!extra)
+                return -ENOMEM;
+
+        r = strv_extend_strv(&c->argv, extra, false);
+
+        /* Non-negative results all collapse to 0 */
+        return r < 0 ? r : 0;
+}
+
+/* Requests asynchronous removal of the tree at 'path' (unless it is NULL or equals
+ * RUN_SYSTEMD_EMPTY), frees the string and returns NULL so the result can be assigned back. */
+static char *destroy_tree(char *path) {
+        if (path && !path_equal(path, RUN_SYSTEMD_EMPTY)) {
+                log_debug("Spawning process to nuke '%s'", path);
+
+                (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
+        }
+
+        return mfree(path);
+}
+
+/* Releases everything the shared runtime object holds, but not the object itself: it is removed
+ * from its manager's by-id table (if it has a manager), its strings are freed and both storage
+ * socket pairs are closed. The directories on disk are not touched here. */
+void exec_shared_runtime_done(ExecSharedRuntime *rt) {
+        if (!rt)
+                return;
+
+        /* Must come first: the table is keyed by rt->id, which is freed below */
+        if (rt->manager)
+                (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
+
+        rt->id = mfree(rt->id);
+        rt->tmp_dir = mfree(rt->tmp_dir);
+        rt->var_tmp_dir = mfree(rt->var_tmp_dir);
+
+        safe_close_pair(rt->netns_storage_socket);
+        safe_close_pair(rt->ipcns_storage_socket);
+}
+
+/* Releases the contents of the shared runtime object and then the object itself. Always returns
+ * NULL. */
+static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
+        exec_shared_runtime_done(rt);
+        free(rt);
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
+
+/* Drops one reference to the shared runtime object. When the last reference is gone, the temporary
+ * directories are handed to destroy_tree() and the object is freed. Always returns NULL. */
+ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
+        if (!rt)
+                return NULL;
+
+        assert(rt->n_ref > 0);
+
+        if (--rt->n_ref > 0)
+                return NULL; /* still referenced by somebody else */
+
+        rt->tmp_dir = destroy_tree(rt->tmp_dir);
+        rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
+
+        return exec_shared_runtime_free(rt);
+}
+
+/* Allocates a new shared runtime object carrying a copy of 'id' and with both storage socket
+ * pairs marked as unset. The object is not registered anywhere. Returns 0 and stores the object in
+ * *ret on success, -ENOMEM otherwise. */
+static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
+        _cleanup_free_ char *key = NULL;
+        ExecSharedRuntime *rt;
+
+        assert(ret);
+
+        key = strdup(id);
+        if (!key)
+                return -ENOMEM;
+
+        rt = new(ExecSharedRuntime, 1);
+        if (!rt)
+                return -ENOMEM;
+
+        *rt = (ExecSharedRuntime) {
+                .id = TAKE_PTR(key),
+                .netns_storage_socket = EBADF_PAIR,
+                .ipcns_storage_socket = EBADF_PAIR,
+        };
+
+        *ret = rt;
+        return 0;
+}
+
+/* Creates a shared runtime object for 'id', registers it in the manager's by-id table and moves
+ * the passed resources into it. The strings behind 'tmp_dir'/'var_tmp_dir' and the fds in the two
+ * socket arrays (each array may be NULL) are taken over on success only; on failure the caller
+ * keeps them. If 'ret' is non-NULL it receives a pointer to the new object, without taking a
+ * reference. Returns 0 on success, a negative errno-style code otherwise. */
+static int exec_shared_runtime_add(
+                Manager *m,
+                const char *id,
+                char **tmp_dir,
+                char **var_tmp_dir,
+                int netns_storage_socket[2],
+                int ipcns_storage_socket[2],
+                ExecSharedRuntime **ret) {
+
+        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *entry = NULL;
+        ExecSharedRuntime *registered;
+        int r;
+
+        assert(m);
+        assert(id);
+
+        r = exec_shared_runtime_allocate(&entry, id);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, entry->id, entry);
+        if (r < 0)
+                return r;
+
+        /* We require both to be set together.
+         * NOTE(review): this looks at the fields of the object allocated just above, which are both
+         * still unset here, rather than at the values being passed in -- confirm the intent. */
+        assert(!!entry->tmp_dir == !!entry->var_tmp_dir);
+        entry->tmp_dir = TAKE_PTR(*tmp_dir);
+        entry->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
+
+        if (netns_storage_socket) {
+                entry->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
+                entry->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
+        }
+
+        if (ipcns_storage_socket) {
+                entry->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
+                entry->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
+        }
+
+        entry->manager = m;
+
+        /* Success: disarm the cleanup handler so the registered object stays alive. */
+        registered = TAKE_PTR(entry);
+        if (ret)
+                *ret = registered;
+
+        return 0;
+}
+
+/* Creates and registers a shared runtime object for 'id' if the context needs one, i.e. if it uses
+ * a network namespace, an IPC namespace or private temporary directories. Returns 0 with *ret set
+ * to NULL when no object is needed, 1 with *ret pointing to the new object when one was created,
+ * and a negative errno-style code on failure. */
+static int exec_shared_runtime_make(
+                Manager *m,
+                const ExecContext *c,
+                const char *id,
+                ExecSharedRuntime **ret) {
+
+        _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
+        _cleanup_close_pair_ int netns_storage_socket[2] = EBADF_PAIR, ipcns_storage_socket[2] = EBADF_PAIR;
+        int r;
+
+        assert(m);
+        assert(c);
+        assert(id);
+
+        /* Nothing that would have to be shared: no object is necessary. */
+        if (!(exec_needs_network_namespace(c) || exec_needs_ipc_namespace(c) || c->private_tmp)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        if (c->private_tmp) {
+                /* Temporary directories are only set up unless both /tmp and /var/tmp (or all of
+                 * /var) are listed among the inaccessible paths. */
+                bool tmp_inaccessible =
+                        prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
+                        (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
+                         prefixed_path_strv_contains(c->inaccessible_paths, "/var"));
+
+                if (!tmp_inaccessible) {
+                        r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (exec_needs_network_namespace(c) &&
+            socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
+                return -errno;
+
+        if (exec_needs_ipc_namespace(c) &&
+            socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
+                return -errno;
+
+        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Looks up the shared runtime object registered for 'id' and takes a reference on it. If none
+ * exists and 'create' is true, a new one is created when the context needs it. Returns 1 with *ret
+ * set to the referenced object, 0 with *ret set to NULL if there is no object (not found and not
+ * created), or a negative errno-style code on failure. */
+int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
+        ExecSharedRuntime *rt;
+        int r;
+
+        assert(m);
+        assert(id);
+        assert(ret);
+
+        rt = hashmap_get(m->exec_shared_runtime_by_id, id);
+        if (!rt) {
+                /* Not known yet: either give up, or try to create a new object. */
+                if (!create) {
+                        *ret = NULL;
+                        return 0;
+                }
+
+                r = exec_shared_runtime_make(m, c, id, &rt);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* The context does not require an ExecSharedRuntime object. */
+                        *ret = NULL;
+                        return 0;
+                }
+        }
+
+        /* Existing or freshly created: hand out a new reference. */
+        rt->n_ref++;
+        *ret = rt;
+        return 1;
+}
+
+/* Writes one "exec-runtime=<id> ..." line per registered shared runtime object to 'f'. Set
+ * temporary directories and valid storage socket fds are appended as " key=value" pairs; each fd
+ * is duplicated into 'fds' and the number of the duplicate is what gets written. Returns 0 on
+ * success or the negative result of a failed fd duplication. */
+int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
+        ExecSharedRuntime *rt;
+
+        assert(m);
+        assert(f);
+        assert(fds);
+
+        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
+                int dup_fd;
+
+                fprintf(f, "exec-runtime=%s", rt->id);
+
+                if (rt->tmp_dir)
+                        fprintf(f, " tmp-dir=%s", rt->tmp_dir);
+
+                if (rt->var_tmp_dir)
+                        fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
+
+                /* Network namespace storage socket pair */
+                if (rt->netns_storage_socket[0] >= 0) {
+                        dup_fd = fdset_put_dup(fds, rt->netns_storage_socket[0]);
+                        if (dup_fd < 0)
+                                return dup_fd;
+
+                        fprintf(f, " netns-socket-0=%i", dup_fd);
+                }
+
+                if (rt->netns_storage_socket[1] >= 0) {
+                        dup_fd = fdset_put_dup(fds, rt->netns_storage_socket[1]);
+                        if (dup_fd < 0)
+                                return dup_fd;
+
+                        fprintf(f, " netns-socket-1=%i", dup_fd);
+                }
+
+                /* IPC namespace storage socket pair */
+                if (rt->ipcns_storage_socket[0] >= 0) {
+                        dup_fd = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
+                        if (dup_fd < 0)
+                                return dup_fd;
+
+                        fprintf(f, " ipcns-socket-0=%i", dup_fd);
+                }
+
+                if (rt->ipcns_storage_socket[1] >= 0) {
+                        dup_fd = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
+                        if (dup_fd < 0)
+                                return dup_fd;
+
+                        fprintf(f, " ipcns-socket-1=%i", dup_fd);
+                }
+
+                fputc('\n', f);
+        }
+
+        return 0;
+}
+
+/* Applies a single legacy key/value pair to the shared runtime object of unit 'u', creating the
+ * object if the manager does not know one for the unit's id yet. Handled keys are "tmp-dir",
+ * "var-tmp-dir", "netns-socket-0" and "netns-socket-1".
+ *
+ * Returns 1 if the value was applied, 0 if it was dropped (unit without id, unknown key, fd that
+ * could not be deserialized, or failure to register a new object), and a negative errno-style code
+ * if memory allocation failed. */
+int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
+        /* rt_create owns an object allocated by this call until it has been registered below */
+        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
+        /* rt is the object being modified: either one found in the manager, or rt_create */
+        ExecSharedRuntime *rt = NULL;
+        int r;
+
+        /* This is for the migration from old (v237 or earlier) deserialization text.
+         * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
+         * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
+         * so or not from the serialized text, then we always creates a new object owned by this. */
+
+        assert(u);
+        assert(key);
+        assert(value);
+
+        /* Manager manages ExecSharedRuntime objects by the unit id.
+         * So, we omit the serialized text when the unit does not have id (yet?)... */
+        if (isempty(u->id)) {
+                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
+                return 0;
+        }
+
+        if (u->manager) {
+                if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
+                        return log_oom();
+
+                rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
+        }
+        if (!rt) {
+                /* Nothing registered for this unit id (or no manager): start a new object */
+                if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
+                        return log_oom();
+
+                rt = rt_create;
+        }
+
+        if (streq(key, "tmp-dir")) {
+                if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
+                        return -ENOMEM;
+
+        } else if (streq(key, "var-tmp-dir")) {
+                if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
+                        return -ENOMEM;
+
+        } else if (streq(key, "netns-socket-0")) {
+
+                /* Replace whatever fd was stored before with the deserialized one */
+                safe_close(rt->netns_storage_socket[0]);
+                rt->netns_storage_socket[0] = deserialize_fd(fds, value);
+                if (rt->netns_storage_socket[0] < 0)
+                        return 0;
+
+        } else if (streq(key, "netns-socket-1")) {
+
+                safe_close(rt->netns_storage_socket[1]);
+                rt->netns_storage_socket[1] = deserialize_fd(fds, value);
+                if (rt->netns_storage_socket[1] < 0)
+                        return 0;
+        } else
+                /* Any other key is not handled here */
+                return 0;
+
+        /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
+        if (rt_create && u->manager) {
+                r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
+                if (r < 0) {
+                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
+                        return 0;
+                }
+
+                rt_create->manager = u->manager;
+
+                /* Avoid cleanup */
+                TAKE_PTR(rt_create);
+        }
+
+        /* A new object that could not be registered because the unit has no manager is released by
+         * the cleanup handler on return. */
+        return 1;
+}
+
+/* Parses one serialized "exec-runtime=" value of the form
+ *
+ *     <id> [tmp-dir=…] [var-tmp-dir=…] [netns-socket-0=…] [netns-socket-1=…] [ipcns-socket-0=…] [ipcns-socket-1=…]
+ *
+ * (fields separated by single spaces, optional fields in exactly this order) and registers a
+ * matching shared runtime object with the manager. The fd fields are resolved through
+ * deserialize_fd() against 'fds'.
+ *
+ * Returns 0 on success and a negative errno-style code on failure. The fd pairs are declared with
+ * a cleanup handler: fds already obtained stay owned by this function until
+ * exec_shared_runtime_add() has taken them over, so they are closed instead of leaked when a
+ * later field fails to parse or when registering the object fails. */
+int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
+        _cleanup_close_pair_ int netns_fdpair[2] = EBADF_PAIR, ipcns_fdpair[2] = EBADF_PAIR;
+        _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
+        char *id = NULL;
+        int r;
+        const char *p, *v = ASSERT_PTR(value);
+        size_t n;
+
+        assert(m);
+        assert(fds);
+
+        /* The id is everything up to the first space; if nothing follows we are done parsing. */
+        n = strcspn(v, " ");
+        id = strndupa_safe(v, n);
+        if (v[n] != ' ')
+                goto finalize;
+        p = v + n + 1;
+
+        v = startswith(p, "tmp-dir=");
+        if (v) {
+                n = strcspn(v, " ");
+                tmp_dir = strndup(v, n);
+                if (!tmp_dir)
+                        return log_oom();
+                if (v[n] != ' ')
+                        goto finalize;
+                p = v + n + 1;
+        }
+
+        v = startswith(p, "var-tmp-dir=");
+        if (v) {
+                n = strcspn(v, " ");
+                var_tmp_dir = strndup(v, n);
+                if (!var_tmp_dir)
+                        return log_oom();
+                if (v[n] != ' ')
+                        goto finalize;
+                p = v + n + 1;
+        }
+
+        v = startswith(p, "netns-socket-0=");
+        if (v) {
+                char *buf;
+
+                n = strcspn(v, " ");
+                buf = strndupa_safe(v, n);
+
+                netns_fdpair[0] = deserialize_fd(fds, buf);
+                if (netns_fdpair[0] < 0)
+                        return netns_fdpair[0];
+                if (v[n] != ' ')
+                        goto finalize;
+                p = v + n + 1;
+        }
+
+        v = startswith(p, "netns-socket-1=");
+        if (v) {
+                char *buf;
+
+                n = strcspn(v, " ");
+                buf = strndupa_safe(v, n);
+
+                netns_fdpair[1] = deserialize_fd(fds, buf);
+                if (netns_fdpair[1] < 0)
+                        return netns_fdpair[1];
+                if (v[n] != ' ')
+                        goto finalize;
+                p = v + n + 1;
+        }
+
+        v = startswith(p, "ipcns-socket-0=");
+        if (v) {
+                char *buf;
+
+                n = strcspn(v, " ");
+                buf = strndupa_safe(v, n);
+
+                ipcns_fdpair[0] = deserialize_fd(fds, buf);
+                if (ipcns_fdpair[0] < 0)
+                        return ipcns_fdpair[0];
+                if (v[n] != ' ')
+                        goto finalize;
+                p = v + n + 1;
+        }
+
+        v = startswith(p, "ipcns-socket-1=");
+        if (v) {
+                char *buf;
+
+                n = strcspn(v, " ");
+                buf = strndupa_safe(v, n);
+
+                ipcns_fdpair[1] = deserialize_fd(fds, buf);
+                if (ipcns_fdpair[1] < 0)
+                        return ipcns_fdpair[1];
+        }
+
+finalize:
+        /* On success the strings and fds are moved into the new object, leaving nothing behind for
+         * the cleanup handlers; on failure they remain ours and are released on return. */
+        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to add exec-runtime: %m");
+        return 0;
+}
+
+/* Frees every registered shared runtime object nobody holds a reference to. Used after the manager
+ * has been deserialized. */
+void exec_shared_runtime_vacuum(Manager *m) {
+        ExecSharedRuntime *candidate;
+
+        assert(m);
+
+        HASHMAP_FOREACH(candidate, m->exec_shared_runtime_by_id) {
+                if (candidate->n_ref <= 0)
+                        (void) exec_shared_runtime_free(candidate);
+        }
+}
+
+/* Allocates the per-unit runtime object, tying together the given shared runtime and dynamic
+ * credentials and, if the context needs an ephemeral copy, a freshly chosen path below
+ * /var/lib/systemd/ephemeral-trees plus a storage socket pair. Returns 0 with *ret set to NULL if
+ * none of the three is needed, 1 with *ret set to the new object otherwise, and a negative
+ * errno-style code on failure. */
+int exec_runtime_make(
+                const Unit *unit,
+                const ExecContext *context,
+                ExecSharedRuntime *shared,
+                DynamicCreds *creds,
+                ExecRuntime **ret) {
+        _cleanup_close_pair_ int ephemeral_storage_socket[2] = EBADF_PAIR;
+        _cleanup_free_ char *ephemeral = NULL;
+        _cleanup_(exec_runtime_freep) ExecRuntime *runtime = NULL;
+        int r;
+
+        assert(unit);
+        assert(context);
+        assert(ret);
+
+        /* Nothing to keep track of: no object required */
+        if (!shared && !creds && !exec_needs_ephemeral(context)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        if (exec_needs_ephemeral(context)) {
+                r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
+                if (r < 0)
+                        return r;
+
+                r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
+                if (r < 0)
+                        return r;
+
+                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
+                        return -errno;
+        }
+
+        runtime = new(ExecRuntime, 1);
+        if (!runtime)
+                return -ENOMEM;
+
+        /* Path and sockets move into the object, disarming the local cleanup handlers */
+        *runtime = (ExecRuntime) {
+                .shared = shared,
+                .dynamic_creds = creds,
+                .ephemeral_copy = TAKE_PTR(ephemeral),
+                .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
+                .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
+        };
+
+        *ret = TAKE_PTR(runtime);
+        return 1;
+}
+
+/* Frees the runtime object: drops its references to the shared runtime and the dynamic
+ * credentials, hands the ephemeral copy to destroy_tree() and closes the ephemeral storage
+ * sockets. Always returns NULL. */
+ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
+        if (!rt)
+                return NULL;
+
+        exec_shared_runtime_unref(rt->shared);
+        dynamic_creds_unref(rt->dynamic_creds);
+
+        rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
+        safe_close_pair(rt->ephemeral_storage_socket);
+
+        free(rt);
+        return NULL;
+}
+
+/* Like exec_runtime_free(), but first passes the shared runtime and the dynamic credentials to
+ * their respective destroy functions and stores what those return. Always returns NULL. */
+ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
+        if (rt) {
+                rt->shared = exec_shared_runtime_destroy(rt->shared);
+                rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
+        }
+
+        /* exec_runtime_free() copes with NULL itself */
+        return exec_runtime_free(rt);
+}
+
+/* Closes the ephemeral storage sockets and frees the ephemeral copy path string. The object itself
+ * and its shared runtime / dynamic credentials pointers are left alone. */
+void exec_runtime_clear(ExecRuntime *rt) {
+        if (rt) {
+                safe_close_pair(rt->ephemeral_storage_socket);
+                rt->ephemeral_copy = mfree(rt->ephemeral_copy);
+        }
+}
+
+/* Releases the parts of the parameter set that are freed or closed here and resets the remaining
+ * handled fields; exec_fd is the only fd that is closed.
+ *
+ * This is the variant called on the PID1 side, where many of the struct's FDs are only borrowed:
+ * they are owned by the manager or other objects and reused across multiple units. */
+void exec_params_shallow_clear(ExecParameters *p) {
+        if (!p)
+                return;
+
+        /* String lists and the fd array (the array only, not the fds stored in it) */
+        p->environment = strv_free(p->environment);
+        p->fd_names = strv_free(p->fd_names);
+        p->files_env = strv_free(p->files_env);
+        p->fds = mfree(p->fds);
+
+        /* exec_fd is closed, the other two are merely forgotten */
+        p->exec_fd = safe_close(p->exec_fd);
+        p->user_lookup_fd = -EBADF;
+        p->bpf_outer_map_fd = -EBADF;
+
+        /* Unit identification */
+        p->unit_id = mfree(p->unit_id);
+        p->invocation_id = SD_ID128_NULL;
+        p->invocation_id_string[0] = '\0';
+
+        p->confirm_spawn = mfree(p->confirm_spawn);
+}
+
+/* Releases everything the parameter set holds, closing all file descriptors stored in it, and
+ * finishes with exec_params_shallow_clear().
+ *
+ * This is the variant called on the sd-executor side, where everything received is owned by the
+ * process and has to be cleaned up completely to keep sanitizers and analyzers happy. */
+void exec_params_deep_clear(ExecParameters *p) {
+        if (!p)
+                return;
+
+        /* The fd array itself is freed by the shallow clear at the end */
+        close_many_unset(p->fds, p->n_socket_fds + p->n_storage_fds);
+
+        p->cgroup_path = mfree(p->cgroup_path);
+
+        if (p->prefix) {
+                /* First the strings in the array, then the array */
+                free_many_charp(p->prefix, _EXEC_DIRECTORY_TYPE_MAX);
+                p->prefix = mfree(p->prefix);
+        }
+
+        p->received_credentials_directory = mfree(p->received_credentials_directory);
+        p->received_encrypted_credentials_directory = mfree(p->received_encrypted_credentials_directory);
+
+        if (p->idle_pipe) {
+                close_many_and_free(p->idle_pipe, 4);
+                p->idle_pipe = NULL;
+        }
+
+        p->stdin_fd = safe_close(p->stdin_fd);
+        p->stdout_fd = safe_close(p->stdout_fd);
+        p->stderr_fd = safe_close(p->stderr_fd);
+
+        p->notify_socket = mfree(p->notify_socket);
+
+        open_file_free_many(&p->open_files);
+
+        p->fallback_smack_process_label = mfree(p->fallback_smack_process_label);
+
+        /* Everything else is handled by the shallow variant */
+        exec_params_shallow_clear(p);
+}
+
+void exec_directory_done(ExecDirectory *d) {
+        if (!d)
+                return;
+
+        FOREACH_ARRAY(item, d->items, d->n_items) {
+                strv_free(item->symlinks);
+                free(item->path);
+        }
+        /* With the per-item strings gone, drop the array and reset the remaining fields. */
+        d->n_items = 0;
+        d->items = mfree(d->items);
+        d->mode = 0755;
+}
+
+static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
+        assert(d);
+        assert(path);
+
+        /* Linear lookup by path_equal(); yields NULL when no item matches. */
+        FOREACH_ARRAY(item, d->items, d->n_items)
+                if (path_equal(item->path, path))
+                        return item;
+        return NULL;
+}
+
+int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
+        _cleanup_strv_free_ char **links = NULL;
+        _cleanup_free_ char *copy = NULL;
+        ExecDirectoryItem *found;
+
+        assert(d);
+        assert(path);
+
+        /* Returns 1 if a new item was appended for 'path', 0 if an item with that path was already present
+         * (in which case only its symlink list is extended), and a negative error code on failure. */
+        found = exec_directory_find(d, path);
+        if (found) {
+                int r = strv_extend(&found->symlinks, symlink);
+
+                return r < 0 ? r : 0; /* existing item is updated */
+        }
+
+        copy = strdup(path);
+        if (!copy)
+                return -ENOMEM;
+
+        if (symlink) {
+                links = strv_new(symlink);
+                if (!links)
+                        return -ENOMEM;
+        }
+
+        if (!GREEDY_REALLOC(d->items, d->n_items + 1))
+                return -ENOMEM;
+
+        d->items[d->n_items] = (ExecDirectoryItem) {
+                .path = TAKE_PTR(copy),
+                .symlinks = TAKE_PTR(links),
+        };
+        d->n_items++;
+
+        return 1; /* new item is added */
+}
+
+static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
+        assert(a);
+        assert(b);
+        /* Order purely by path; exec_directory_sort() uses this as its typesafe_qsort() comparator. */
+        return path_compare(a->path, b->path);
+}
+
+void exec_directory_sort(ExecDirectory *d) {
+        assert(d);
+
+        /* Sort the exec directories so that parent directories are always processed first in
+         * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo first,
+         * then foo/bar. Also, set the .only_create flag if one of the parent directories is contained in
+         * the list. See also comments in setup_exec_directory() and issue #24783. */
+
+        if (d->n_items <= 1)
+                return;
+
+        typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
+
+        /* Flag every item whose path starts with the path of an item sorted before it; once flagged, the
+         * remaining earlier items need not be checked. */
+        for (size_t cur = 1; cur < d->n_items; cur++)
+                for (size_t prev = 0; prev < cur && !d->items[cur].only_create; prev++)
+                        if (path_startswith(d->items[cur].path, d->items[prev].path))
+                                d->items[cur].only_create = true;
+}
+
+ExecCleanMask exec_clean_mask_from_string(const char *s) {
+        ExecDirectoryType type;
+
+        assert(s);
+
+        /* Two keywords are handled up front; any other string has to name a resource type. */
+        if (streq(s, "fdstore"))
+                return EXEC_CLEAN_FDSTORE;
+        if (streq(s, "all"))
+                return EXEC_CLEAN_ALL;
+
+        type = exec_resource_type_from_string(s);
+        if (type >= 0)
+                return 1U << type;
+        return (ExecCleanMask) type; /* negative lookup result is handed back unchanged */
+}
+
+static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
+        [EXEC_INPUT_NULL]      = "null",
+        [EXEC_INPUT_TTY]       = "tty",
+        [EXEC_INPUT_TTY_FORCE] = "tty-force",
+        [EXEC_INPUT_TTY_FAIL]  = "tty-fail",
+        [EXEC_INPUT_SOCKET]    = "socket",
+        [EXEC_INPUT_NAMED_FD]  = "fd",
+        [EXEC_INPUT_DATA]      = "data",
+        [EXEC_INPUT_FILE]      = "file",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
+
+static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
+        [EXEC_OUTPUT_INHERIT]             = "inherit",
+        [EXEC_OUTPUT_NULL]                = "null",
+        [EXEC_OUTPUT_TTY]                 = "tty",
+        [EXEC_OUTPUT_KMSG]                = "kmsg",
+        [EXEC_OUTPUT_KMSG_AND_CONSOLE]    = "kmsg+console",
+        [EXEC_OUTPUT_JOURNAL]             = "journal",
+        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
+        [EXEC_OUTPUT_SOCKET]              = "socket",
+        [EXEC_OUTPUT_NAMED_FD]            = "fd",
+        [EXEC_OUTPUT_FILE]                = "file",
+        [EXEC_OUTPUT_FILE_APPEND]         = "append",
+        [EXEC_OUTPUT_FILE_TRUNCATE]       = "truncate",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
+
+static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
+        [EXEC_UTMP_INIT]  = "init",
+        [EXEC_UTMP_LOGIN] = "login",
+        [EXEC_UTMP_USER]  = "user",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
+
+static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
+        [EXEC_PRESERVE_NO]      = "no",
+        [EXEC_PRESERVE_YES]     = "yes",
+        [EXEC_PRESERVE_RESTART] = "restart",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
+
+/* Maps each ExecDirectoryType to the name of the unit setting that configures it */
+static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectory",
+        [EXEC_DIRECTORY_STATE]         = "StateDirectory",
+        [EXEC_DIRECTORY_CACHE]         = "CacheDirectory",
+        [EXEC_DIRECTORY_LOGS]          = "LogsDirectory",
+        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
+
+/* Maps each ExecDirectoryType to the name of the unit setting that configures its symlinks */
+static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
+        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
+        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
+        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
+        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
+
+static const char* const exec_directory_type_mode_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectoryMode",
+        [EXEC_DIRECTORY_STATE]         = "StateDirectoryMode",
+        [EXEC_DIRECTORY_CACHE]         = "CacheDirectoryMode",
+        [EXEC_DIRECTORY_LOGS]          = "LogsDirectoryMode",
+        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectoryMode",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_mode, ExecDirectoryType);
+
+/* This table maps ExecDirectoryType as well, but to a generic term identifying the type of resource. It
+ * is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
+ * directories, specifically .timer units with their timestamp touch file. */
+static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+        [EXEC_DIRECTORY_RUNTIME]       = "runtime",
+        [EXEC_DIRECTORY_STATE]         = "state",
+        [EXEC_DIRECTORY_CACHE]         = "cache",
+        [EXEC_DIRECTORY_LOGS]          = "logs",
+        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
+
+static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
+        [EXEC_KEYRING_INHERIT] = "inherit",
+        [EXEC_KEYRING_PRIVATE] = "private",
+        [EXEC_KEYRING_SHARED]  = "shared",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
diff --git a/src/core/execute.h b/src/core/execute.h
new file mode 100644
index 0000000..5a6927a
--- /dev/null
+++ b/src/core/execute.h
@@ -0,0 +1,701 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct ExecStatus ExecStatus;
+typedef struct ExecCommand ExecCommand;
+typedef struct ExecContext ExecContext;
+typedef struct ExecSharedRuntime ExecSharedRuntime;
+typedef struct DynamicCreds DynamicCreds;
+typedef struct ExecRuntime ExecRuntime;
+typedef struct ExecParameters ExecParameters;
+typedef struct Manager Manager;
+
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/capability.h>
+
+#include "cgroup-util.h"
+#include "coredump-util.h"
+#include "cpu-set-util.h"
+#include "exec-util.h"
+#include "fdset.h"
+#include "list.h"
+#include "missing_resource.h"
+#include "namespace.h"
+#include "nsflags.h"
+#include "numa-util.h"
+#include "open-file.h"
+#include "path-util.h"
+#include "runtime-scope.h"
+#include "set.h"
+#include "time-util.h"
+
+#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U)
+
+typedef enum ExecUtmpMode {
+        EXEC_UTMP_INIT,
+        EXEC_UTMP_LOGIN,
+        EXEC_UTMP_USER,
+        _EXEC_UTMP_MODE_MAX,
+        _EXEC_UTMP_MODE_INVALID = -EINVAL,
+} ExecUtmpMode;
+
+typedef enum ExecInput { /* trailing comments: string used for the value in exec_input_table[] */
+        EXEC_INPUT_NULL,      /* "null" */
+        EXEC_INPUT_TTY,       /* "tty" */
+        EXEC_INPUT_TTY_FORCE, /* "tty-force" */
+        EXEC_INPUT_TTY_FAIL,  /* "tty-fail" */
+        EXEC_INPUT_SOCKET,    /* "socket" */
+        EXEC_INPUT_NAMED_FD,  /* "fd" */
+        EXEC_INPUT_DATA,      /* "data" */
+        EXEC_INPUT_FILE,      /* "file" */
+        _EXEC_INPUT_MAX,
+        _EXEC_INPUT_INVALID = -EINVAL,
+} ExecInput;
+
+typedef enum ExecOutput { /* trailing comments: string used for the value in exec_output_table[] */
+        EXEC_OUTPUT_INHERIT,             /* "inherit" */
+        EXEC_OUTPUT_NULL,                /* "null" */
+        EXEC_OUTPUT_TTY,                 /* "tty" */
+        EXEC_OUTPUT_KMSG,                /* "kmsg" */
+        EXEC_OUTPUT_KMSG_AND_CONSOLE,    /* "kmsg+console" */
+        EXEC_OUTPUT_JOURNAL,             /* "journal" */
+        EXEC_OUTPUT_JOURNAL_AND_CONSOLE, /* "journal+console" */
+        EXEC_OUTPUT_SOCKET,              /* "socket" */
+        EXEC_OUTPUT_NAMED_FD,            /* "fd" */
+        EXEC_OUTPUT_FILE,                /* "file" */
+        EXEC_OUTPUT_FILE_APPEND,         /* "append" */
+        EXEC_OUTPUT_FILE_TRUNCATE,       /* "truncate" */
+        _EXEC_OUTPUT_MAX,
+        _EXEC_OUTPUT_INVALID = -EINVAL,
+} ExecOutput;
+
+typedef enum ExecPreserveMode {
+        EXEC_PRESERVE_NO,
+        EXEC_PRESERVE_YES,
+        EXEC_PRESERVE_RESTART,
+        _EXEC_PRESERVE_MODE_MAX,
+        _EXEC_PRESERVE_MODE_INVALID = -EINVAL,
+} ExecPreserveMode;
+
+typedef enum ExecKeyringMode {
+        EXEC_KEYRING_INHERIT,
+        EXEC_KEYRING_PRIVATE,
+        EXEC_KEYRING_SHARED,
+        _EXEC_KEYRING_MODE_MAX,
+        _EXEC_KEYRING_MODE_INVALID = -EINVAL,
+} ExecKeyringMode;
+
+/* Contains start and exit information about an executed command.  */
+struct ExecStatus {
+        dual_timestamp start_timestamp;
+        dual_timestamp exit_timestamp;
+        pid_t pid;
+        int code;     /* as in siginfo_t::si_code */
+        int status;   /* as in siginfo_t::si_status */
+};
+
+/* Stores information about commands we execute. Covers both configuration settings as well as runtime data. */
+struct ExecCommand {
+        char *path;
+        char **argv;
+        ExecStatus exec_status; /* Note that this is not serialized to sd-executor */
+        ExecCommandFlags flags;
+        LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */
+};
+
+/* Encapsulates certain aspects of the runtime environment that is to be shared between multiple otherwise separate
+ * invocations of commands. Specifically, this allows sharing of /tmp and /var/tmp data as well as network namespaces
+ * between invocations of commands. This is a reference counted object, with one reference taken by each currently
+ * active command invocation that wants to share this runtime. */
+struct ExecSharedRuntime {
+        unsigned n_ref;
+
+        Manager *manager;
+
+        char *id; /* Unit id of the owner */
+
+        char *tmp_dir;
+        char *var_tmp_dir;
+
+        /* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network
+         * namespace. */
+        int netns_storage_socket[2];
+
+        /* Like netns_storage_socket, but the file descriptor is referring to the IPC namespace. */
+        int ipcns_storage_socket[2];
+};
+
+struct ExecRuntime {
+        ExecSharedRuntime *shared;
+        DynamicCreds *dynamic_creds;
+
+        /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */
+        char *ephemeral_copy;
+
+        /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of
+         * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot
+         * until we're done using it. */
+        int ephemeral_storage_socket[2];
+};
+
+typedef enum ExecDirectoryType {
+        EXEC_DIRECTORY_RUNTIME,
+        EXEC_DIRECTORY_STATE,
+        EXEC_DIRECTORY_CACHE,
+        EXEC_DIRECTORY_LOGS,
+        EXEC_DIRECTORY_CONFIGURATION,
+        _EXEC_DIRECTORY_TYPE_MAX,
+        _EXEC_DIRECTORY_TYPE_INVALID = -EINVAL,
+} ExecDirectoryType;
+
+typedef struct ExecDirectoryItem {
+        char *path;       /* freed with free() in exec_directory_done() */
+        char **symlinks;  /* freed with strv_free() in exec_directory_done(); NULL if added without a symlink */
+        bool only_create; /* set by exec_directory_sort() if an item sorted earlier has a path this one starts with */
+} ExecDirectoryItem;
+
+typedef struct ExecDirectory {
+        mode_t mode;              /* exec_directory_done() resets this to 0755 */
+        size_t n_items;           /* number of entries in items[] */
+        ExecDirectoryItem *items; /* grown by exec_directory_add(), ordered by exec_directory_sort() */
+} ExecDirectory;
+
+typedef enum ExecCleanMask {
+        /* In case you wonder why the bitmask below doesn't use "directory" in its name: we want to keep this
+         * generic so that .timer timestamp files can nicely be covered by this too, and similar. */
+        EXEC_CLEAN_RUNTIME       = 1U << EXEC_DIRECTORY_RUNTIME,
+        EXEC_CLEAN_STATE         = 1U << EXEC_DIRECTORY_STATE,
+        EXEC_CLEAN_CACHE         = 1U << EXEC_DIRECTORY_CACHE,
+        EXEC_CLEAN_LOGS          = 1U << EXEC_DIRECTORY_LOGS,
+        EXEC_CLEAN_CONFIGURATION = 1U << EXEC_DIRECTORY_CONFIGURATION,
+        EXEC_CLEAN_FDSTORE       = 1U << _EXEC_DIRECTORY_TYPE_MAX,
+        EXEC_CLEAN_NONE          = 0,
+        EXEC_CLEAN_ALL           = (1U << (_EXEC_DIRECTORY_TYPE_MAX+1)) - 1,
+        _EXEC_CLEAN_MASK_INVALID = -EINVAL,
+} ExecCleanMask;
+
+/* Encodes configuration parameters applied to invoked commands. Does not carry runtime data, but only configuration
+ * changes sourced from unit files and suchlike. ExecContext objects are usually embedded into Unit objects, and do not
+ * change after being loaded. */
+struct ExecContext {
+        char **environment;
+        char **environment_files;
+        char **pass_environment;
+        char **unset_environment;
+
+        struct rlimit *rlimit[_RLIMIT_MAX];
+        char *working_directory, *root_directory, *root_image, *root_verity, *root_hash_path, *root_hash_sig_path;
+        void *root_hash, *root_hash_sig;
+        size_t root_hash_size, root_hash_sig_size;
+        LIST_HEAD(MountOptions, root_image_options);
+        bool root_ephemeral;
+        bool working_directory_missing_ok:1;
+        bool working_directory_home:1;
+
+        bool oom_score_adjust_set:1;
+        bool coredump_filter_set:1;
+        bool nice_set:1;
+        bool ioprio_set:1;
+        bool cpu_sched_set:1;
+        bool mount_apivfs_set:1;
+
+        /* This is not exposed to the user but available internally. We need it to make sure that whenever we
+         * spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects
+         * that it belongs to us and we don't enter a trigger loop. */
+        bool same_pgrp;
+
+        bool cpu_sched_reset_on_fork;
+        bool non_blocking;
+
+        mode_t umask;
+        int oom_score_adjust;
+        int nice;
+        int ioprio;
+        int cpu_sched_policy;
+        int cpu_sched_priority;
+        uint64_t coredump_filter;
+
+        CPUSet cpu_set;
+        NUMAPolicy numa_policy;
+        bool cpu_affinity_from_numa;
+
+        ExecInput std_input;
+        ExecOutput std_output;
+        ExecOutput std_error;
+
+        /* At least one of stdin/stdout/stderr was initialized from an fd passed in. This boolean survives
+         * the fds being closed. This only makes sense for transient units. */
+        bool stdio_as_fds;
+
+        char *stdio_fdname[3];
+        char *stdio_file[3];
+
+        void *stdin_data;
+        size_t stdin_data_size;
+
+        nsec_t timer_slack_nsec;
+
+        char *tty_path;
+
+        bool tty_reset;
+        bool tty_vhangup;
+        bool tty_vt_disallocate;
+
+        unsigned tty_rows;
+        unsigned tty_cols;
+
+        bool ignore_sigpipe;
+
+        ExecKeyringMode keyring_mode;
+
+        /* Since resolving these names might involve socket
+         * connections and we don't want to deadlock ourselves these
+         * names are resolved on execution only and in the child
+         * process. */
+        char *user;
+        char *group;
+        char **supplementary_groups;
+
+        int set_login_environment;
+
+        char *pam_name;
+
+        char *utmp_id;
+        ExecUtmpMode utmp_mode;
+
+        bool no_new_privileges;
+
+        bool selinux_context_ignore;
+        bool apparmor_profile_ignore;
+        bool smack_process_label_ignore;
+
+        char *selinux_context;
+        char *apparmor_profile;
+        char *smack_process_label;
+
+        char **read_write_paths, **read_only_paths, **inaccessible_paths, **exec_paths, **no_exec_paths;
+        char **exec_search_path;
+        unsigned long mount_propagation_flag;
+        BindMount *bind_mounts;
+        size_t n_bind_mounts;
+        TemporaryFileSystem *temporary_filesystems;
+        size_t n_temporary_filesystems;
+        MountImage *mount_images;
+        size_t n_mount_images;
+        MountImage *extension_images;
+        size_t n_extension_images;
+        char **extension_directories;
+
+        uint64_t capability_bounding_set;
+        uint64_t capability_ambient_set;
+        int secure_bits;
+
+        int syslog_priority;
+        bool syslog_level_prefix;
+        char *syslog_identifier;
+
+        struct iovec* log_extra_fields;
+        size_t n_log_extra_fields;
+        Set *log_filter_allowed_patterns;
+        Set *log_filter_denied_patterns;
+
+        usec_t log_ratelimit_interval_usec;
+        unsigned log_ratelimit_burst;
+
+        int log_level_max;
+
+        char *log_namespace;
+
+        ProtectProc protect_proc;  /* hidepid= */
+        ProcSubset proc_subset;    /* subset= */
+
+        int private_mounts;
+        int memory_ksm;
+        bool private_tmp;
+        bool private_network;
+        bool private_devices;
+        bool private_users;
+        bool private_ipc;
+        bool protect_kernel_tunables;
+        bool protect_kernel_modules;
+        bool protect_kernel_logs;
+        bool protect_clock;
+        bool protect_control_groups;
+        ProtectSystem protect_system;
+        ProtectHome protect_home;
+        bool protect_hostname;
+        bool mount_apivfs;
+
+        bool dynamic_user;
+        bool remove_ipc;
+
+        bool memory_deny_write_execute;
+        bool restrict_realtime;
+        bool restrict_suid_sgid;
+
+        bool lock_personality;
+        unsigned long personality;
+
+        unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
+
+        Set *restrict_filesystems;
+        bool restrict_filesystems_allow_list:1;
+
+        Hashmap *syscall_filter;
+        Set *syscall_archs;
+        int syscall_errno;
+        bool syscall_allow_list:1;
+
+        Hashmap *syscall_log;
+        bool syscall_log_allow_list:1; /* Log listed system calls */
+
+        bool address_families_allow_list:1;
+        Set *address_families;
+
+        char *network_namespace_path;
+        char *ipc_namespace_path;
+
+        ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX];
+        ExecPreserveMode runtime_directory_preserve_mode;
+        usec_t timeout_clean_usec;
+
+        Hashmap *set_credentials; /* output id → ExecSetCredential */
+        Hashmap *load_credentials; /* output id → ExecLoadCredential */
+        Set *import_credentials;
+
+        ImagePolicy *root_image_policy, *mount_image_policy, *extension_image_policy;
+};
+
+static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
+        assert(c);
+        unsigned long permitted = c->restrict_namespaces & NAMESPACE_FLAGS_ALL;
+        return permitted != NAMESPACE_FLAGS_ALL; /* true as soon as any known namespace flag is missing */
+}
+
+static inline bool exec_context_restrict_filesystems_set(const ExecContext *c) {
+        assert(c);
+        if (c->restrict_filesystems_allow_list)
+                return true; /* allow-list mode counts as set regardless of the set's contents */
+        return !set_isempty(c->restrict_filesystems);
+}
+
+static inline bool exec_context_with_rootfs(const ExecContext *c) {
+        assert(c);
+        /* True if either RootImage= or a RootDirectory= other than empty or the root is configured */
+        if (c->root_image)
+                return true;
+        return !empty_or_root(c->root_directory);
+}
+
+typedef enum ExecFlags {
+        EXEC_APPLY_SANDBOXING      = 1 << 0,
+        EXEC_APPLY_CHROOT          = 1 << 1,
+        EXEC_APPLY_TTY_STDIN       = 1 << 2,
+        EXEC_PASS_LOG_UNIT         = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
+        EXEC_CHOWN_DIRECTORIES     = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
+        EXEC_NSS_DYNAMIC_BYPASS    = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
+        EXEC_CGROUP_DELEGATE       = 1 << 6,
+        EXEC_IS_CONTROL            = 1 << 7,
+        EXEC_CONTROL_CGROUP        = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only if EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL are set, too */
+        EXEC_WRITE_CREDENTIALS     = 1 << 9, /* Set up the credential store logic */
+
+        /* The following are not used by execute.c, but by consumers internally */
+        EXEC_PASS_FDS              = 1 << 10,
+        EXEC_SETENV_RESULT         = 1 << 11,
+        EXEC_SET_WATCHDOG          = 1 << 12,
+        EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
+} ExecFlags;
+
+/* Parameters for a specific invocation of a command. This structure is put together right before a command is
+ * executed. */
+struct ExecParameters {
+        RuntimeScope runtime_scope;
+
+        char **environment;
+
+        int *fds; /* exec_params_deep_clear() treats this as n_socket_fds + n_storage_fds entries */
+        char **fd_names;
+        size_t n_socket_fds;
+        size_t n_storage_fds;
+
+        ExecFlags flags;
+        bool selinux_context_net:1;
+
+        CGroupMask cgroup_supported;
+        char *cgroup_path;
+        uint64_t cgroup_id;
+
+        char **prefix; /* exec_params_deep_clear() frees _EXEC_DIRECTORY_TYPE_MAX strings from this array */
+        char *received_credentials_directory;
+        char *received_encrypted_credentials_directory;
+
+        char *confirm_spawn;
+        bool shall_confirm_spawn;
+
+        usec_t watchdog_usec;
+
+        int *idle_pipe; /* exec_params_deep_clear() closes 4 fds from this array */
+
+        int stdin_fd;
+        int stdout_fd;
+        int stderr_fd;
+
+        /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
+        int exec_fd;
+
+        char *notify_socket;
+
+        LIST_HEAD(OpenFile, open_files);
+
+        char *fallback_smack_process_label;
+
+        char **files_env;
+        int user_lookup_fd;      /* only reset to -EBADF, never closed, by exec_params_shallow_clear() */
+        int bpf_outer_map_fd;    /* only reset to -EBADF, never closed, by exec_params_shallow_clear() */
+
+        /* Used for logging in the executor functions */
+        char *unit_id;
+        sd_id128_t invocation_id;
+        char invocation_id_string[SD_ID128_STRING_MAX];
+};
+/* Compound literal with every scalar fd field preset to -EBADF; no trailing ';' so it works as an expression. */
+#define EXEC_PARAMETERS_INIT(_flags)        \
+        (ExecParameters) {                  \
+                .flags = (_flags),          \
+                .stdin_fd         = -EBADF, \
+                .stdout_fd        = -EBADF, \
+                .stderr_fd        = -EBADF, \
+                .exec_fd          = -EBADF, \
+                .bpf_outer_map_fd = -EBADF, \
+                .user_lookup_fd   = -EBADF, \
+        }
+
+#include "unit.h"
+#include "dynamic-user.h"
+
+int exec_spawn(Unit *unit,
+               ExecCommand *command,
+               const ExecContext *context,
+               ExecParameters *exec_params,
+               ExecRuntime *runtime,
+               const CGroupContext *cgroup_context,
+               pid_t *ret);
+
+void exec_command_done(ExecCommand *c);
+void exec_command_done_array(ExecCommand *c, size_t n);
+ExecCommand* exec_command_free_list(ExecCommand *c);
+void exec_command_free_array(ExecCommand **c, size_t n);
+void exec_command_reset_status_array(ExecCommand *c, size_t n);
+void exec_command_reset_status_list_array(ExecCommand **c, size_t n);
+void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix);
+void exec_command_append_list(ExecCommand **l, ExecCommand *e);
+int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_;
+int exec_command_append(ExecCommand *c, const char *path, ...) _sentinel_;
+
+void exec_context_init(ExecContext *c);
+void exec_context_done(ExecContext *c);
+void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix);
+
+int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root);
+int exec_context_destroy_mount_ns_dir(Unit *u);
+
+const char* exec_context_fdname(const ExecContext *c, int fd_index);
+
+bool exec_context_may_touch_console(const ExecContext *c);
+bool exec_context_maintains_privileges(const ExecContext *c);
+
+int exec_context_get_effective_ioprio(const ExecContext *c);
+bool exec_context_get_effective_mount_apivfs(const ExecContext *c);
+
+void exec_context_free_log_extra_fields(ExecContext *c);
+
+void exec_context_revert_tty(ExecContext *c);
+
+int exec_context_get_clean_directories(ExecContext *c, char **prefix, ExecCleanMask mask, char ***ret);
+int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret);
+
+const char *exec_context_tty_path(const ExecContext *context);
+int exec_context_apply_tty_size(const ExecContext *context, int tty_fd, const char *tty_path);
+void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p);
+
+uint64_t exec_context_get_rlimit(const ExecContext *c, const char *name);
+int exec_context_get_oom_score_adjust(const ExecContext *c);
+uint64_t exec_context_get_coredump_filter(const ExecContext *c);
+int exec_context_get_nice(const ExecContext *c);
+int exec_context_get_cpu_sched_policy(const ExecContext *c);
+int exec_context_get_cpu_sched_priority(const ExecContext *c);
+uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c);
+char** exec_context_get_syscall_filter(const ExecContext *c);
+char** exec_context_get_syscall_archs(const ExecContext *c);
+char** exec_context_get_syscall_log(const ExecContext *c);
+char** exec_context_get_address_families(const ExecContext *c);
+char** exec_context_get_restrict_filesystems(const ExecContext *c);
+
+void exec_status_start(ExecStatus *s, pid_t pid);
+void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status);
+void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix);
+void exec_status_reset(ExecStatus *s);
+
+int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecSharedRuntime **ret);
+ExecSharedRuntime *exec_shared_runtime_destroy(ExecSharedRuntime *r);
+ExecSharedRuntime *exec_shared_runtime_unref(ExecSharedRuntime *r);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_unref);
+
+int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds);
+int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds);
+int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
+void exec_shared_runtime_done(ExecSharedRuntime *rt);
+void exec_shared_runtime_vacuum(Manager *m);
+
+int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
+ExecRuntime* exec_runtime_free(ExecRuntime *rt);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
+ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
+void exec_runtime_clear(ExecRuntime *rt);
+
+int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
+void exec_params_shallow_clear(ExecParameters *p);
+void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
+void exec_params_deep_clear(ExecParameters *p);
+
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c);
+
+void exec_directory_done(ExecDirectory *d);
+int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink);
+void exec_directory_sort(ExecDirectory *d);
+bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type);
+
+ExecCleanMask exec_clean_mask_from_string(const char *s);
+
+const char* exec_output_to_string(ExecOutput i) _const_;
+ExecOutput exec_output_from_string(const char *s) _pure_;
+
+const char* exec_input_to_string(ExecInput i) _const_;
+ExecInput exec_input_from_string(const char *s) _pure_;
+
+const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_;
+ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_;
+
+const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_;
+ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_;
+
+const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_;
+ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_symlink_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_symlink_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_mode_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_mode_from_string(const char *s) _pure_;
+
+const char* exec_resource_type_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
+
+bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
+bool exec_needs_network_namespace(const ExecContext *context);
+bool exec_needs_ipc_namespace(const ExecContext *context);
+
+/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
+ * instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
+ * not available). */
+
+#define LOG_EXEC_ID_FIELD(ep) \
+        ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_UNIT=" : "UNIT=")
+#define LOG_EXEC_ID_FIELD_FORMAT(ep) \
+        ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_UNIT=%s" : "UNIT=%s")
+#define LOG_EXEC_INVOCATION_ID_FIELD(ep) \
+        ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=" : "INVOCATION_ID=")
+#define LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep) \
+        ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=%s" : "INVOCATION_ID=%s")
+
+#define log_exec_full_errno_zerook(ec, ep, level, error, ...) /* 'error' is allowed to be zero */ \
+        ({ /* evaluates to log_object_internal()'s result, or -ERRNO_VALUE(error) if not logged */ \
+                const ExecContext *_c = (ec);                             \
+                const ExecParameters *_p = (ep);                          \
+                const int _l = (level); /* ec, ep and level are each evaluated once */ \
+                bool _do_log = !(log_get_max_level() < LOG_PRI(_l) ||     \
+                        !(_c->log_level_max < 0 ||                        \
+                        _c->log_level_max >= LOG_PRI(_l))); /* global AND per-context limit must permit */ \
+                LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields,                \
+                                     _c->n_log_extra_fields);             \
+                !_do_log ? -ERRNO_VALUE(error) :                          \
+                        log_object_internal(_l, error, PROJECT_FILE,      \
+                        __LINE__, __func__,                               \
+                        LOG_EXEC_ID_FIELD(_p),                            \
+                        _p->unit_id,                                      \
+                        LOG_EXEC_INVOCATION_ID_FIELD(_p),                 \
+                        _p->invocation_id_string, ##__VA_ARGS__);         \
+        })
+
+#define log_exec_full_errno(ec, ep, level, error, ...)                            \
+        ({ /* like log_exec_full_errno_zerook(), but 'error' must pass ASSERT_NON_ZERO() */ \
+                int _error = (error); /* evaluate 'error' once, then reuse the copy */ \
+                ASSERT_NON_ZERO(_error);                                          \
+                log_exec_full_errno_zerook(ec, ep, level, _error, ##__VA_ARGS__); \
+        })
+
+#define log_exec_full(ec, ep, level, ...) (void) log_exec_full_errno_zerook(ec, ep, level, 0, __VA_ARGS__)
+/* Fixed-level shorthands for log_exec_full(): no error value is passed and the result is discarded. */
+#define log_exec_debug(ec, ep, ...)   log_exec_full(ec, ep, LOG_DEBUG, __VA_ARGS__)
+#define log_exec_info(ec, ep, ...)    log_exec_full(ec, ep, LOG_INFO, __VA_ARGS__)
+#define log_exec_notice(ec, ep, ...)  log_exec_full(ec, ep, LOG_NOTICE, __VA_ARGS__)
+#define log_exec_warning(ec, ep, ...) log_exec_full(ec, ep, LOG_WARNING, __VA_ARGS__)
+#define log_exec_error(ec, ep, ...)   log_exec_full(ec, ep, LOG_ERR, __VA_ARGS__)
+/* Fixed-level shorthands for log_exec_full_errno(), which checks 'error' with ASSERT_NON_ZERO(). */
+#define log_exec_debug_errno(ec, ep, error, ...)   log_exec_full_errno(ec, ep, LOG_DEBUG, error, __VA_ARGS__)
+#define log_exec_info_errno(ec, ep, error, ...)    log_exec_full_errno(ec, ep, LOG_INFO, error, __VA_ARGS__)
+#define log_exec_notice_errno(ec, ep, error, ...)  log_exec_full_errno(ec, ep, LOG_NOTICE, error, __VA_ARGS__)
+#define log_exec_warning_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_WARNING, error, __VA_ARGS__)
+#define log_exec_error_errno(ec, ep, error, ...)   log_exec_full_errno(ec, ep, LOG_ERR, error, __VA_ARGS__)
+
+#define log_exec_struct_errno(ec, ep, level, error, ...)                                                      \
+        ({ /* evaluates to log_struct_errno()'s result, or -ERRNO_VALUE(error) when the message is filtered */ \
+                const ExecContext *_c = (ec);                                                                 \
+                const ExecParameters *_p = (ep);                                                              \
+                const int _l = (level);                                                                       \
+                /* Log unless the context's log_level_max (negative: no limit) filters out this level. */     \
+                bool _do_log = _c->log_level_max < 0 || _c->log_level_max >= LOG_PRI(_l);                     \
+                LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields,                                                    \
+                                     _c->n_log_extra_fields);                                                 \
+                _do_log ?                                                                                     \
+                        log_struct_errno(_l, error, __VA_ARGS__, LOG_EXEC_ID_FIELD_FORMAT(_p), _p->unit_id) : \
+                        -ERRNO_VALUE(error);                                                                  \
+        })
+/* Same as log_exec_struct_errno(), with the error value fixed to 0. */
+#define log_exec_struct(ec, ep, level, ...) log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__)
+
+#define log_exec_struct_iovec_errno(ec, ep, level, error, iovec, n_iovec)   \
+        ({ /* evaluates to log_struct_iovec_errno()'s result, or -ERRNO_VALUE(error) when filtered */ \
+                const ExecContext *_c = (ec);                               \
+                const ExecParameters *_p = (ep);                            \
+                const int _l = (level);                                     \
+                /* Log unless log_level_max (negative: no limit) filters out this level. */ \
+                bool _do_log = _c->log_level_max < 0 || _c->log_level_max >= LOG_PRI(_l); \
+                LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields,                  \
+                                     _c->n_log_extra_fields);               \
+                _do_log ?                                                   \
+                        log_struct_iovec_errno(_l, error, iovec, n_iovec) : \
+                        -ERRNO_VALUE(error);                                \
+        })
+/* Same as log_exec_struct_iovec_errno(), with the error value fixed to 0. */
+#define log_exec_struct_iovec(ec, ep, level, iovec, n_iovec) log_exec_struct_iovec_errno(ec, ep, level, 0, iovec, n_iovec)
+
+/* Like LOG_MESSAGE(), but prefixed with "<unit_id>: "; fmt has to be a string literal. */
+#define LOG_EXEC_MESSAGE(ep, fmt, ...) LOG_MESSAGE("%s: " fmt, (ep)->unit_id, ##__VA_ARGS__)
+#define LOG_EXEC_ID(ep) LOG_EXEC_ID_FIELD_FORMAT(ep), (ep)->unit_id /* two arguments: format, unit id */
+#define LOG_EXEC_INVOCATION_ID(ep) LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep), (ep)->invocation_id_string /* format, id */
+
+#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) /* p, c: names of the two locals declared below */ \
+        const ExecContext *c = (ec);                                                          \
+        const ExecParameters *p = (ep);                                                       \
+        LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_ID_FIELD(p), p->unit_id);                              \
+        LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \
+        LOG_CONTEXT_PUSH_IOV(c->log_extra_fields, c->n_log_extra_fields)
+/* Hands unit id, invocation id and extra fields to LOG_CONTEXT_PUSH_*(); UNIQ_T() names the locals. */
+#define LOG_CONTEXT_PUSH_EXEC(ec, ep) \
+        _LOG_CONTEXT_PUSH_EXEC(ec, ep, UNIQ_T(p, UNIQ), UNIQ_T(c, UNIQ))
diff --git a/src/core/executor.c b/src/core/executor.c
new file mode 100644
index 0000000..b2716ef
--- /dev/null
+++ b/src/core/executor.c
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "argv-util.h"
+#include "build.h"
+#include "exec-invoke.h"
+#include "execute-serialize.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fdset.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "getopt-defs.h"
+#include "label-util.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "selinux-util.h"
+#include "static-destruct.h"
+
+static FILE *arg_serialization = NULL; /* stream opened in parse_argv() for --deserialize=FD */
+
+STATIC_DESTRUCTOR_REGISTER(arg_serialization, fclosep);
+
+/* Prints the usage text to stdout. Returns 0, or the result of log_oom() if the man page link fails. */
+static int help(void) {
+        _cleanup_free_ char *man_link = NULL;
+
+        /* Any failure to build the man page reference is reported as out-of-memory. */
+        if (terminal_urlify_man("systemd", "1", &man_link) < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "%sSandbox and execute processes.%s\n\n"
+               "  -h --help                Show this help and exit\n"
+               "     --version             Print version string and exit\n"
+               "     --log-target=TARGET   Set log target (console, journal,\n"
+               "                                           journal-or-kmsg,\n"
+               "                                           kmsg, null)\n"
+               "     --log-level=LEVEL     Set log level (debug, info, notice,\n"
+               "                                          warning, err, crit,\n"
+               "                                          alert, emerg)\n"
+               "     --log-color=BOOL      Highlight important messages\n"
+               "     --log-location=BOOL   Include code location in messages\n"
+               "     --log-time=BOOL       Prefix messages with current time\n"
+               "     --deserialize=FD      Deserialize process config from FD\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               man_link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* COMMON_GETOPT_ARGS presumably supplies the ARG_LOG_* identifiers used below. */
+        enum {
+                COMMON_GETOPT_ARGS,
+                ARG_VERSION,
+                ARG_DESERIALIZE,
+        };
+
+        static const struct option options[] = {
+                { "log-level",      required_argument, NULL, ARG_LOG_LEVEL      },
+                { "log-target",     required_argument, NULL, ARG_LOG_TARGET     },
+                { "log-color",      required_argument, NULL, ARG_LOG_COLOR      },
+                { "log-location",   required_argument, NULL, ARG_LOG_LOCATION   },
+                { "log-time",       required_argument, NULL, ARG_LOG_TIME       },
+                { "help",           no_argument,       NULL, 'h'                },
+                { "version",        no_argument,       NULL, ARG_VERSION        },
+                { "deserialize",    required_argument, NULL, ARG_DESERIALIZE    },
+                {}
+        };
+
+        int opt, ret;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        for (;;) {
+                opt = getopt_long(argc, argv, "h", options, NULL);
+                if (opt < 0)
+                        break;
+
+                switch (opt) {
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_LOG_LEVEL:
+                        ret = log_set_max_level_from_string(optarg);
+                        if (ret < 0)
+                                return log_error_errno(ret, "Failed to parse log level \"%s\": %m", optarg);
+                        break;
+
+                case ARG_LOG_TARGET:
+                        ret = log_set_target_from_string(optarg);
+                        if (ret < 0)
+                                return log_error_errno(ret, "Failed to parse log target \"%s\": %m", optarg);
+                        break;
+
+                case ARG_LOG_COLOR:
+                        ret = log_show_color_from_string(optarg);
+                        if (ret < 0)
+                                return log_error_errno(ret,
+                                                       "Failed to parse log color setting \"%s\": %m",
+                                                       optarg);
+                        break;
+
+                case ARG_LOG_LOCATION:
+                        ret = log_show_location_from_string(optarg);
+                        if (ret < 0)
+                                return log_error_errno(ret,
+                                                       "Failed to parse log location setting \"%s\": %m",
+                                                       optarg);
+                        break;
+
+                case ARG_LOG_TIME:
+                        ret = log_show_time_from_string(optarg);
+                        if (ret < 0)
+                                return log_error_errno(ret,
+                                                       "Failed to parse log time setting \"%s\": %m",
+                                                       optarg);
+                        break;
+
+                case ARG_DESERIALIZE: {
+                        _cleanup_close_ int fd = -EBADF;
+                        FILE *stream;
+
+                        fd = parse_fd(optarg);
+                        if (fd < 0)
+                                return log_error_errno(fd,
+                                                       "Failed to parse serialization fd \"%s\": %m",
+                                                       optarg);
+
+                        /* Mark the fd close-on-exec before wrapping it in a stdio stream. */
+                        ret = fd_cloexec(fd, /* cloexec= */ true);
+                        if (ret < 0)
+                                return log_error_errno(ret,
+                                                       "Failed to set serialization fd %d to close-on-exec: %m",
+                                                       fd);
+
+                        stream = take_fdopen(&fd, "r");
+                        if (!stream)
+                                return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
+
+                        /* If the option is given more than once, the last one wins. */
+                        safe_fclose(arg_serialization);
+                        arg_serialization = stream;
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        /* Without --deserialize= there is nothing to do, which is treated as an error. */
+        if (!arg_serialization)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No serialization fd specified.");
+
+        return 1 /* work to do */;
+}
+
+/* Deserializes the invocation handed over via --deserialize= and passes it to exec_invoke(). Returns
+ * the result of parse_argv() if that is <= 0, a negative errno if setup or deserialization fails, and
+ * otherwise the exit status filled in by exec_invoke(). */
+static int run(int argc, char *argv[]) {
+        _cleanup_fdset_free_ FDSet *fdset = NULL;
+        _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {};
+        _cleanup_(exec_context_done) ExecContext context = {};
+        _cleanup_(exec_command_done) ExecCommand command = {};
+        _cleanup_(exec_params_deep_clear) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0);
+        _cleanup_(exec_shared_runtime_done) ExecSharedRuntime shared = {
+                .netns_storage_socket = EBADF_PAIR,
+                .ipcns_storage_socket = EBADF_PAIR,
+        };
+        _cleanup_(dynamic_creds_done) DynamicCreds dynamic_creds = {};
+        _cleanup_(exec_runtime_clear) ExecRuntime runtime = {
+                .ephemeral_storage_socket = EBADF_PAIR,
+                .shared = &shared,
+                .dynamic_creds = &dynamic_creds,
+        };
+        int exit_status = EXIT_SUCCESS, r;
+
+        exec_context_init(&context);
+        cgroup_context_init(&cgroup_context);
+
+        /* We might be starting the journal itself, we'll be told by the caller what to do */
+        log_set_always_reopen_console(true);
+        log_set_prohibit_ipc(true);
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        /* Now that we know the intended log target, allow IPC and open the final log target. */
+        log_set_prohibit_ipc(false);
+        log_open();
+
+        /* This call would collect all passed fds and enable CLOEXEC. We'll unset it in exec_invoke (flag_fds)
+         * for fds that shall be passed to the child.
+         * The serialization fd is set to CLOEXEC in parse_argv, so it's also filtered. */
+        r = fdset_new_fill(/* filter_cloexec= */ 0, &fdset);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create fd set: %m");
+
+        /* Initialize lazily. SMACK is just a few operations, but the SELinux is very slow as it requires
+         * loading the entire database in memory, so we will do it lazily only if it is actually needed, to
+         * avoid wasting 2ms-10ms for each sd-executor that gets spawned. */
+        r = mac_init_lazy();
+        if (r < 0)
+                return log_error_errno(r, "Failed to initialize MAC layer: %m");
+
+        r = exec_deserialize_invocation(arg_serialization,
+                                        fdset,
+                                        &context,
+                                        &command,
+                                        &params,
+                                        &runtime,
+                                        &cgroup_context);
+        if (r < 0)
+                return log_error_errno(r, "Failed to deserialize: %m");
+
+        /* Deserialization is complete, hence release the serialization stream and the fd set before
+         * handing over to exec_invoke(). */
+        arg_serialization = safe_fclose(arg_serialization);
+        fdset = fdset_free(fdset);
+
+        r = exec_invoke(&command, &context, &params, &runtime, &cgroup_context, &exit_status);
+        if (r < 0) {
+                const char *status = ASSERT_PTR(
+                                exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
+
+                log_exec_struct_errno(&context, &params, LOG_ERR, r,
+                                      "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+                                      LOG_EXEC_INVOCATION_ID(&params),
+                                      LOG_EXEC_MESSAGE(&params, "Failed at step %s spawning %s: %m",
+                                                       status, command.path),
+                                      "EXECUTABLE=%s", command.path);
+        } else
+                assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */
+
+        return exit_status;
+}
+
+int main(int argc, char *argv[]) {
+        /* The sd-pam helper process is spawned with safe_fork(), which internally calls rename_process();
+         * the last step of renaming memzero()s all saved argvs. Hence save argv before anything else, to
+         * prevent showing "intense" cmdline. See #30352. */
+        save_argc_argv(argc, argv);
+
+        int status = run(argc, argv);
+
+        mac_selinux_finish();
+        static_destruct();
+
+        if (status < 0)
+                return EXIT_FAILURE;
+        return status;
+}
diff --git a/src/core/fuzz-execute-serialize.c b/src/core/fuzz-execute-serialize.c
new file mode 100644
index 0000000..6069efd
--- /dev/null
+++ b/src/core/fuzz-execute-serialize.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/* Notes on how to run the fuzzer manually:
+ *  1) Build the fuzzers with LLVM's libFuzzer and ASan+UBSan:
+ *    $ CC=clang CXX=clang++ meson build-libfuzz -Db_sanitize=address,undefined -Dllvm-fuzz=true -Db_lundef=false
+ *
+ *  2) Collect some valid inputs:
+ *
+ * OUT=test/fuzz/fuzz-execute-serialize/initial
+ * for section in context command parameters runtime cgroup; do
+ *     awk "match(\$0, /startswith\\(.+, \"(exec-${section}-[^\"]+=)\"/, m) { print m[1]; }" \
+ *         src/core/execute-serialize.c >>"$OUT"
+ *     # Each "section" is delimited by an empty line
+ *     echo >>"$OUT"
+ * done
+ *
+ *  3) Run the fuzzer:
+ *    $ build-libfuzz/fuzz-execute-serialize test/fuzz/fuzz-execute-serialize
+ */
+
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "execute-serialize.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "service.h"
+
+static void exec_fuzz_one(FILE *f, FDSet *fdset) {
+        _cleanup_(exec_params_deep_clear) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0);
+        _cleanup_(exec_context_done) ExecContext exec_context = {};
+        _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {};
+        DynamicCreds dynamic_creds = {};
+        ExecCommand command = {};
+        ExecSharedRuntime shared = {
+                .netns_storage_socket = EBADF_PAIR,
+                .ipcns_storage_socket = EBADF_PAIR,
+        };
+        ExecRuntime runtime = {
+                .ephemeral_storage_socket = EBADF_PAIR,
+                .shared = &shared,
+                .dynamic_creds = &dynamic_creds,
+        };
+
+        exec_context_init(&exec_context);
+        cgroup_context_init(&cgroup_context);
+        /* Round trip: deserialize the input, serialize it back, deserialize again; results are ignored. */
+        (void) exec_deserialize_invocation(f, fdset, &exec_context, &command, &params, &runtime, &cgroup_context);
+        (void) exec_serialize_invocation(f, fdset, &exec_context, &command, &params, &runtime, &cgroup_context);
+        (void) exec_deserialize_invocation(f, fdset, &exec_context, &command, &params, &runtime, &cgroup_context);
+
+        /* We definitely didn't provide valid FDs during deserialization, so
+         * wipe the FDs before the exec_params_deep_clear() cleanup handler kicks in,
+         * otherwise we'll hit the assert in safe_close() */
+        params.stdin_fd = -EBADF;
+        params.stdout_fd = -EBADF;
+        params.stderr_fd = -EBADF;
+        params.exec_fd = -EBADF;
+        params.user_lookup_fd = -EBADF;
+        params.bpf_outer_map_fd = -EBADF;
+        if (!params.fds)
+                params.n_socket_fds = params.n_storage_fds = 0;
+        for (size_t i = 0; params.fds && i < params.n_socket_fds + params.n_storage_fds; i++)
+                params.fds[i] = -EBADF;
+        /* The objects below were declared without a _cleanup_ handler, hence release them by hand. */
+        exec_command_done_array(&command, /* n= */ 1);
+        exec_shared_runtime_done(&shared);
+        if (dynamic_creds.group != dynamic_creds.user)
+                dynamic_user_free(dynamic_creds.group);
+        dynamic_user_free(dynamic_creds.user);
+        free(runtime.ephemeral_copy);
+        safe_close_pair(runtime.ephemeral_storage_socket);
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_fclose_ FILE *input = NULL;
+        _cleanup_fdset_free_ FDSet *fds = NULL;
+
+        if (outside_size_range(size, 0, 128 * 1024)) /* size bounds passed: 0 and 128 KiB */
+                return 0;
+
+        fuzz_setup_logging();
+
+        fds = fdset_new();
+        assert_se(fds);
+        input = data_to_file(data, size);
+        assert_se(input);
+        exec_fuzz_one(input, fds);
+        return 0;
+}
diff --git a/src/core/fuzz-manager-serialize.c b/src/core/fuzz-manager-serialize.c
new file mode 100644
index 0000000..57083ca
--- /dev/null
+++ b/src/core/fuzz-manager-serialize.c
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "manager-serialize.h"
+#include "manager.h"
+#include "service.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_(manager_freep) Manager *manager = NULL;
+        _cleanup_fclose_ FILE *input = NULL, *sink = NULL;
+        _cleanup_fdset_free_ FDSet *fds = NULL;
+
+        if (outside_size_range(size, 0, 65536))
+                return 0;
+
+        fuzz_setup_logging();
+        assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &manager) >= 0);
+        /* Log overrides make it harder for a serialization file to switch log levels/targets. */
+        manager_override_log_level(manager, log_get_max_level());
+        manager_override_log_target(manager, log_get_target());
+        sink = fopen("/dev/null", "we");
+        assert_se(sink);
+        fds = fdset_new();
+        assert_se(fds);
+        input = data_to_file(data, size);
+        assert_se(input);
+
+        (void) manager_deserialize(manager, input, fds);
+        (void) manager_serialize(manager, sink, fds, true);
+        (void) manager_serialize(manager, sink, fds, false);
+        return 0;
+}
diff --git a/src/core/fuzz-manager-serialize.options b/src/core/fuzz-manager-serialize.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/core/fuzz-manager-serialize.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/core/fuzz-unit-file.c b/src/core/fuzz-unit-file.c
new file mode 100644
index 0000000..57480cf
--- /dev/null
+++ b/src/core/fuzz-unit-file.c
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "install.h"
+#include "load-fragment.h"
+#include "manager-dump.h"
+#include "memstream-util.h"
+#include "string-util.h"
+#include "unit-serialize.h"
+#include "utf8.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_fclose_ FILE *input = NULL;
+        _cleanup_free_ char *type_line = NULL;
+        UnitType type;
+        _cleanup_(manager_freep) Manager *manager = NULL;
+        Unit *unit;
+        const char *unit_name;
+        long body_start;
+
+        if (outside_size_range(size, 0, 65536))
+                return 0;
+
+        input = data_to_file(data, size);
+        assert_se(input);
+
+        /* The first line of the input names the unit type, everything after it is the unit file. */
+        if (read_line(input, LINE_MAX, &type_line) < 0)
+                return 0;
+
+        type = unit_type_from_string(type_line);
+        if (type < 0 || !unit_vtable[type]->load)
+                return 0;
+
+        body_start = ftell(input);
+        assert_se(body_start >= 0);
+
+        /* Scan the unit file once, line by line, before actually parsing it. */
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *key;
+
+                if (read_line(input, LONG_LINE_MAX, &line) <= 0)
+                        break;
+
+                /* Step over a byte order mark, if there is one, and then over leading whitespace. */
+                key = startswith(line, UTF8_BYTE_ORDER_MARK);
+                if (!key)
+                        key = line;
+                key += strspn(key, WHITESPACE);
+
+                if (!HAS_FEATURE_MEMORY_SANITIZER || !startswith(key, "ListenNetlink"))
+                        continue;
+                /* ListenNetlink causes a false positive in msan, let's skip this for now. */
+                log_notice("Skipping test because ListenNetlink= is present");
+                return 0;
+        }
+
+        /* Go back to where the unit file starts, so that config_parse() reads it from there. */
+        assert_se(fseek(input, body_start, SEEK_SET) == 0);
+
+        fuzz_setup_logging();
+
+        assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &manager) >= 0);
+
+        unit_name = strjoina("a.", unit_type_to_string(type));
+        assert_se(unit_new_for_name(manager, unit_vtable[type]->object_size, unit_name, &unit) >= 0);
+
+        (void) config_parse(
+                        unit_name, unit_name, input,
+                        UNIT_VTABLE(unit)->sections,
+                        config_item_perf_lookup, load_fragment_gperf_lookup,
+                        0, unit, NULL);
+
+        _cleanup_(memstream_done) MemStream ms = {};
+        FILE *dump;
+
+        dump = memstream_init(&ms);
+        assert_se(dump);
+        unit_dump(unit, dump, "");
+        manager_dump(manager, dump, /* patterns= */ NULL, ">>>");
+
+        return 0;
+}
diff --git a/src/core/fuzz-unit-file.options b/src/core/fuzz-unit-file.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/core/fuzz-unit-file.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c
new file mode 100644
index 0000000..00d6ad6
--- /dev/null
+++ b/src/core/generator-setup.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "generator-setup.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "rm-rf.h"
+
+int lookup_paths_mkdir_generator(LookupPaths *p) {
+        int r, q;
+
+        assert(p);
+
+        if (!p->generator || !p->generator_early || !p->generator_late)
+                return -EINVAL;
+
+        r = mkdir_p_label(p->generator, 0755);
+
+        q = mkdir_p_label(p->generator_early, 0755);
+        if (q < 0 && r >= 0)
+                r = q;
+
+        q = mkdir_p_label(p->generator_late, 0755);
+        if (q < 0 && r >= 0)
+                r = q;
+
+        return r;
+}
+
+void lookup_paths_trim_generator(LookupPaths *p) {
+        assert(p);
+
+        /* Trim empty dirs: rmdir() is tried on each configured generator dir, failures are ignored. */
+        const char *dirs[] = { p->generator, p->generator_early, p->generator_late };
+
+        for (size_t i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++) {
+                if (!dirs[i])
+                        continue;
+                (void) rmdir(dirs[i]);
+        }
+}
+
+void lookup_paths_flush_generator(LookupPaths *p) {
+        assert(p);
+
+        /* Flush the generated unit files in full; unset paths are skipped, rm_rf() failures ignored. */
+        const char *doomed[] = {
+                p->generator,
+                p->generator_early,
+                p->generator_late,
+                p->temporary_dir,
+        };
+
+        for (size_t i = 0; i < sizeof(doomed) / sizeof(doomed[0]); i++)
+                if (doomed[i])
+                        (void) rm_rf(doomed[i], REMOVE_ROOT|REMOVE_PHYSICAL);
+}
diff --git a/src/core/generator-setup.h b/src/core/generator-setup.h
new file mode 100644
index 0000000..1cc816b
--- /dev/null
+++ b/src/core/generator-setup.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "path-lookup.h"
+
+int lookup_paths_mkdir_generator(LookupPaths *p);
+void lookup_paths_trim_generator(LookupPaths *p);
+void lookup_paths_flush_generator(LookupPaths *p);
diff --git a/src/core/ima-setup.c b/src/core/ima-setup.c
new file mode 100644
index 0000000..37916bb
--- /dev/null
+++ b/src/core/ima-setup.c
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+  Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy
+                                   TORSEC group — http://security.polito.it
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "ima-setup.h"
+#include "log.h"
+
+#define IMA_SECFS_DIR "/sys/kernel/security/ima" /* if missing, IMA is treated as disabled in the kernel */
+#define IMA_SECFS_POLICY IMA_SECFS_DIR "/policy" /* kernel interface the policy gets written to */
+#define IMA_POLICY_PATH "/etc/ima/ima-policy" /* custom policy file that is loaded */
+
+/* Loads the custom IMA policy file into the kernel. Returns 0 in all cases except read/write errors
+ * during the line-by-line fallback, which yield a negative errno. Just "return 0" without ENABLE_IMA. */
+int ima_setup(void) {
+#if ENABLE_IMA
+        _cleanup_fclose_ FILE *input = NULL;
+        _cleanup_close_ int imafd = -EBADF;
+        unsigned lineno = 0;
+        int r;
+
+        if (access(IMA_SECFS_DIR, F_OK) < 0) {
+                log_debug_errno(errno, "IMA support is disabled in the kernel, ignoring: %m");
+                return 0;
+        }
+
+        if (access(IMA_SECFS_POLICY, W_OK) < 0) {
+                log_warning_errno(errno, "Another IMA custom policy has already been loaded, ignoring: %m");
+                return 0;
+        }
+
+        if (access(IMA_POLICY_PATH, F_OK) < 0) {
+                log_debug_errno(errno, "No IMA custom policy file "IMA_POLICY_PATH", ignoring: %m");
+                return 0;
+        }
+
+        imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC);
+        if (imafd < 0) {
+                log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m");
+                return 0;
+        }
+
+        /* First attempt: write just the path of the policy file to the kernel interface. */
+        if (write(imafd, IMA_POLICY_PATH, STRLEN(IMA_POLICY_PATH)) > 0)
+                goto done;
+
+        /* fall back to copying the policy line-by-line */
+        input = fopen(IMA_POLICY_PATH, "re");
+        if (!input) {
+                log_warning_errno(errno, "Failed to open the IMA custom policy file "IMA_POLICY_PATH", ignoring: %m");
+                return 0;
+        }
+
+        safe_close(imafd);
+        /* Reopen the kernel interface, so that the fallback starts out with a fresh descriptor. */
+        imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC);
+        if (imafd < 0) {
+                log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m");
+                return 0;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+
+                r = read_line(input, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read the IMA custom policy file "IMA_POLICY_PATH": %m");
+                if (r == 0)
+                        break;
+
+                size_t len = strlen(line);
+                lineno++;
+                if (len > 0 && write(imafd, line, len) < 0)
+                        return log_error_errno(errno, "Failed to load the IMA custom policy file "IMA_POLICY_PATH":%u: %m",
+                                               lineno);
+        }
+
+done:
+        log_info("Successfully loaded the IMA custom policy "IMA_POLICY_PATH".");
+#endif /* ENABLE_IMA */
+        return 0;
+}
diff --git a/src/core/ima-setup.h b/src/core/ima-setup.h
new file mode 100644
index 0000000..f964c7b
--- /dev/null
+++ b/src/core/ima-setup.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy
+                                   TORSEC group — http://security.polito.it
+***/
+
+int ima_setup(void);
diff --git a/src/core/import-creds.c b/src/core/import-creds.c
new file mode 100644
index 0000000..48f3160
--- /dev/null
+++ b/src/core/import-creds.c
@@ -0,0 +1,938 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "confidential-virt.h"
+#include "copy.h"
+#include "creds-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "import-creds.h"
+#include "initrd-util.h"
+#include "io-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "recurse-dir.h"
+#include "strv.h"
+#include "virt.h"
+
+/* This imports credentials passed in from environments higher up (VM manager, boot loader, …) and rearranges
+ * them so that later code can access them using our regular credential protocol
+ * (i.e. $CREDENTIALS_DIRECTORY). It's supposed to be minimal glue to unify behaviour how PID 1 (and
+ * generators invoked by it) can acquire credentials from outside, to mimic how we support it for containers,
+ * but on VM/physical environments.
+ *
+ * This does four things:
+ *
+ * 1. It imports credentials picked up by sd-boot (and placed in the /.extra/credentials/ dir in the initrd)
+ *    and puts them in /run/credentials/@encrypted/. Note that during the initrd→host transition the initrd root
+ *    file system is cleaned out, thus it is essential we pick up these files before they are deleted. Note
+ *    that these credentials originate from an untrusted source, i.e. the ESP and are not
+ *    pre-authenticated. They still have to be authenticated before use.
+ *
+ * 2. It imports credentials from /proc/cmdline and puts them in /run/credentials/@system/. These come from a
+ *    trusted environment (i.e. the boot loader), and are typically authenticated (if authentication is done
+ *    at all). However, they are world-readable, which might be less than ideal. Hence only use this for data
+ *    that doesn't require trust.
+ *
+ * 3. It imports credentials passed in through qemu's fw_cfg logic. Specifically, credential data passed in
+ *    /sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials/ is picked up and also placed in
+ *    /run/credentials/@system/.
+ *
+ * 4. It imports credentials passed in via the DMI/SMBIOS OEM string tables, quite similar to fw_cfg. It
+ *    looks for strings starting with "io.systemd.credential:" and "io.systemd.credential.binary:". Both
+ *    expect a key=value assignment, but in the latter case the value is Base64 decoded, allowing binary
+ *    credentials to be passed in.
+ *
+ * If it picked up any credentials it will set the $CREDENTIALS_DIRECTORY and
+ * $ENCRYPTED_CREDENTIALS_DIRECTORY environment variables to point to these directories, so that processes
+ * can find them there later on. If "ramfs" is available $CREDENTIALS_DIRECTORY will be backed by it (but
+ * $ENCRYPTED_CREDENTIALS_DIRECTORY is just a regular tmpfs).
+ *
+ * Net result: the service manager can pick up trusted credentials from $CREDENTIALS_DIRECTORY afterwards,
+ * and untrusted ones from $ENCRYPTED_CREDENTIALS_DIRECTORY. */
+
+typedef struct ImportCredentialContext {
+        int target_dir_fd;      /* fd of the directory credentials are written into; negative until opened */
+        size_t size_sum;        /* bytes of credential data imported through this context so far */
+        unsigned n_credentials; /* number of credentials imported through this context so far */
+} ImportCredentialContext;
+
+static void import_credentials_context_free(ImportCredentialContext *c) {
+        assert(c);
+        /* Release the target directory fd and store what safe_close() hands back in its place */
+        c->target_dir_fd = safe_close(c->target_dir_fd);
+}
+
+/* Creates the credentials directory 'path' if missing, best-effort remounts it writable (or, if 'with_mount'
+ * is set, mounts a credentials fs on it), and opens it. The fd is cached in 'c' and returned. */
+static int acquire_credential_directory(ImportCredentialContext *c, const char *path, bool with_mount) {
+        int k;
+
+        assert(c);
+        assert(path);
+
+        /* An earlier call opened the directory already, reuse that fd */
+        if (c->target_dir_fd >= 0)
+                return c->target_dir_fd;
+
+        k = path_is_mount_point(path, NULL, 0);
+        if (k == -ENOENT) {
+                /* Nothing there yet: create the directory. Afterwards it exists, but is no mount point. */
+                k = mkdir_safe_label(path, 0700, 0, 0, MKDIR_WARN_MODE);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to create %s mount point: %m", path);
+                k = 0;
+        } else if (k < 0)
+                return log_error_errno(k, "Failed to determine if %s is a mount point: %m", path);
+
+        if (k > 0) /* Already a mount point, so remount it writable */
+                (void) mount_nofollow_verbose(LOG_WARNING, NULL, path, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+        else if (with_mount) /* Not a mount point yet and a mount is wanted, so try to mount a credentials fs */
+                (void) mount_credentials_fs(path, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+
+        c->target_dir_fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        if (c->target_dir_fd < 0)
+                return log_error_errno(errno, "Failed to open %s: %m", path);
+
+        return c->target_dir_fd;
+}
+
+/* Exclusively creates credential file 'n' below target_dir_fd; returns its fd, or a negative error value. */
+static int open_credential_file_for_write(int target_dir_fd, const char *dir_name, const char *n) {
+        int fd;
+
+        assert(target_dir_fd >= 0);
+        assert(dir_name);
+        assert(n);
+
+        fd = openat(target_dir_fd, n, O_WRONLY|O_CLOEXEC|O_CREAT|O_EXCL|O_NOFOLLOW, 0400);
+        if (fd >= 0)
+                return fd;
+
+        if (errno != EEXIST) /* 'dir_name' is only needed to spell out the full path in this message */
+                return log_error_errno(errno, "Failed to create %s/%s: %m", dir_name, n);
+
+        return log_debug_errno(errno, "Credential '%s' set twice, ignoring.", n); /* EEXIST: debug log only */
+}
+
+/* Tells whether a credential of 'size' bytes may still be imported: it must respect the per-credential
+ * limit and fit into what is left of the total budget tracked in 'c'. Logs a warning when refusing. */
+static bool credential_size_ok(ImportCredentialContext *c, const char *name, uint64_t size) {
+        assert(c);
+        assert(name);
+
+        /* Comparing against the remaining budget (rather than adding up first) keeps the sum from wrapping */
+        if (size > CREDENTIAL_SIZE_MAX)
+                log_warning("Credential '%s' is larger than allowed limit (%s > %s), skipping.", name, FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIAL_SIZE_MAX));
+        else if (size > CREDENTIALS_TOTAL_SIZE_MAX - c->size_sum)
+                log_warning("Accumulated credential size would be above allowed limit (%s+%s > %s), skipping '%s'.",
+                            FORMAT_BYTES(c->size_sum), FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIALS_TOTAL_SIZE_MAX), name);
+        else
+                return true;
+
+        return false;
+}
+
+/* Best-effort turns 'dir' into a read-only mount point, then exports its path through the environment
+ * variable 'envvar'. Only a failing setenv() is reported to the caller as an error. */
+static int finalize_credentials_dir(const char *dir, const char *envvar) {
+        int k;
+
+        assert(dir);
+        assert(envvar);
+
+        k = make_mount_point(dir);
+        if (k >= 0)
+                (void) mount_nofollow_verbose(LOG_WARNING, NULL, dir, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
+        else
+                log_warning_errno(k, "Failed to make '%s' a mount point, ignoring: %m", dir);
+
+        if (setenv(envvar, dir, /* overwrite= */ true) >= 0)
+                return 0;
+
+        return log_error_errno(errno, "Failed to set $%s environment variable: %m", envvar);
+}
+
+static int import_credentials_boot(void) {
+        _cleanup_(import_credentials_context_free) ImportCredentialContext context = {
+                .target_dir_fd = -EBADF,
+        };
+        int r;
+
+        /* systemd-stub will wrap sidecar *.cred files from the UEFI kernel image directory into initrd
+         * cpios, so that they unpack into /.extra/. We'll pick them up from there and copy them into /run/
+         * so that we can access them during the entire runtime (note that the initrd file system is erased
+         * during the initrd → host transition). Note that these credentials originate from an untrusted
+         * source (i.e. the ESP typically) and thus need to be authenticated later. We thus put them in a
+         * directory separate from the usual credentials which are from a trusted source. */
+
+        if (!in_initrd()) /* only done while running in the initrd */
+                return 0;
+
+        FOREACH_STRING(p,
+                       "/.extra/credentials/", /* specific to this boot menu */
+                       "/.extra/global_credentials/") { /* boot partition wide */
+
+                _cleanup_free_ DirectoryEntries *de = NULL;
+                _cleanup_close_ int source_dir_fd = -EBADF;
+
+                source_dir_fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+                if (source_dir_fd < 0) {
+                        if (errno == ENOENT) {
+                                log_debug("No credentials passed via %s.", p);
+                                continue;
+                        }
+
+                        log_warning_errno(errno, "Failed to open '%s', ignoring: %m", p);
+                        continue;
+                }
+
+                r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to read '%s' contents, ignoring: %m", p);
+                        continue;
+                }
+
+                for (size_t i = 0; i < de->n_entries; i++) {
+                        const struct dirent *d = de->entries[i];
+                        _cleanup_close_ int cfd = -EBADF, nfd = -EBADF;
+                        _cleanup_free_ char *n = NULL;
+                        const char *e;
+                        struct stat st;
+
+                        e = endswith(d->d_name, ".cred");
+                        if (!e)
+                                continue;
+
+                        /* drop .cred suffix (which we want in the ESP sidecar dir, but not for our internal
+                         * processing) */
+                        n = strndup(d->d_name, e - d->d_name);
+                        if (!n)
+                                return log_oom();
+
+                        if (!credential_name_valid(n)) {
+                                log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+                                continue;
+                        }
+
+                        cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC);
+                        if (cfd < 0) {
+                                log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name);
+                                continue;
+                        }
+
+                        if (fstat(cfd, &st) < 0) {
+                                log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name);
+                                continue;
+                        }
+
+                        r = stat_verify_regular(&st);
+                        if (r < 0) {
+                                log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name);
+                                continue;
+                        }
+
+                        if (!credential_size_ok(&context, n, st.st_size))
+                                continue;
+                        /* The target directory is acquired lazily, i.e. only once a credential passed all checks */
+                        r = acquire_credential_directory(&context, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ false);
+                        if (r < 0)
+                                return r;
+
+                        nfd = open_credential_file_for_write(context.target_dir_fd, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, n);
+                        if (nfd == -EEXIST) /* a credential of this name exists already, keep that one */
+                                continue;
+                        if (nfd < 0)
+                                return nfd;
+
+                        r = copy_bytes(cfd, nfd, st.st_size, 0);
+                        if (r < 0) {
+                                (void) unlinkat(context.target_dir_fd, n, 0); /* remove the incomplete copy again */
+                                return log_error_errno(r, "Failed to create credential '%s': %m", n);
+                        }
+
+                        context.size_sum += st.st_size;
+                        context.n_credentials++;
+
+                        log_debug("Successfully copied boot credential '%s'.", n);
+                }
+        }
+
+        if (context.n_credentials > 0) {
+                log_debug("Imported %u credentials from boot loader.", context.n_credentials);
+
+                r = finalize_credentials_dir(ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, "ENCRYPTED_CREDENTIALS_DIRECTORY");
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int proc_cmdline_callback(const char *key, const char *value, void *data) {
+        ImportCredentialContext *c = ASSERT_PTR(data);
+        _cleanup_free_ void *binary = NULL;
+        _cleanup_free_ char *n = NULL;
+        _cleanup_close_ int nfd = -EBADF;
+        const char *colon, *d;
+        bool base64;
+        size_t l;
+        int r;
+
+        assert(key);
+        /* Handles systemd.set_credential=NAME:DATA and systemd.set_credential_binary=NAME:BASE64, skips all else */
+        if (proc_cmdline_key_streq(key, "systemd.set_credential"))
+                base64 = false;
+        else if (proc_cmdline_key_streq(key, "systemd.set_credential_binary"))
+                base64 = true;
+        else
+                return 0;
+
+        colon = value ? strchr(value, ':') : NULL;
+        if (!colon) {
+                log_warning("Credential assignment through kernel command line lacks ':' character, ignoring: %s", strnull(value)); /* value may be NULL here */
+                return 0;
+        }
+
+        n = strndup(value, colon - value);
+        if (!n)
+                return log_oom();
+
+        if (!credential_name_valid(n)) {
+                log_warning("Credential name '%s' is invalid, ignoring.", n);
+                return 0;
+        }
+
+        colon++; /* step over the ':' separator, the payload starts here */
+
+        if (base64) {
+                r = unbase64mem(colon, SIZE_MAX, &binary, &l);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to decode binary credential '%s' data, ignoring: %m", n);
+                        return 0;
+                }
+
+                d = binary;
+        } else {
+                d = colon;
+                l = strlen(colon);
+        }
+
+        if (!credential_size_ok(c, n, l))
+                return 0;
+
+        r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+        if (r < 0)
+                return r;
+
+        nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, n);
+        if (nfd == -EEXIST) /* a credential of this name exists already, keep that one */
+                return 0;
+        if (nfd < 0)
+                return nfd;
+
+        r = loop_write(nfd, d, l);
+        if (r < 0) {
+                (void) unlinkat(c->target_dir_fd, n, 0); /* remove the incomplete file again */
+                return log_error_errno(r, "Failed to write credential: %m");
+        }
+
+        c->size_sum += l;
+        c->n_credentials++;
+
+        log_debug("Successfully processed kernel command line credential '%s'.", n);
+
+        return 0;
+}
+
+static int import_credentials_proc_cmdline(ImportCredentialContext *c) {
+        int k;
+
+        assert(c);
+
+        k = proc_cmdline_parse(proc_cmdline_callback, c, 0); /* 'c' reaches the callback as its userdata */
+        if (k >= 0)
+                return 0;
+
+        return log_error_errno(k, "Failed to parse /proc/cmdline: %m");
+}
+
+#define QEMU_FWCFG_PATH "/sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials"
+
+static int import_credentials_qemu(ImportCredentialContext *c) {
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        _cleanup_close_ int source_dir_fd = -EBADF;
+        int r;
+
+        assert(c);
+
+        if (detect_container() > 0) /* don't access /sys/ in a container */
+                return 0;
+
+        if (detect_confidential_virtualization() > 0) /* don't trust firmware if confidential VMs */
+                return 0;
+
+        source_dir_fd = open(QEMU_FWCFG_PATH, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        if (source_dir_fd < 0) {
+                if (errno == ENOENT) {
+                        log_debug("No credentials passed via fw_cfg.");
+                        return 0;
+                }
+
+                log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "', ignoring: %m");
+                return 0;
+        }
+
+        r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "' contents, ignoring: %m");
+                return 0;
+        }
+
+        for (size_t i = 0; i < de->n_entries; i++) {
+                const struct dirent *d = de->entries[i];
+                _cleanup_close_ int vfd = -EBADF, rfd = -EBADF, nfd = -EBADF;
+                _cleanup_free_ char *szs = NULL;
+                uint64_t sz;
+
+                if (!credential_name_valid(d->d_name)) {
+                        log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+                        continue;
+                }
+
+                vfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+                if (vfd < 0) {
+                        log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/, ignoring: %m", d->d_name);
+                        continue;
+                }
+                /* Each entry is a directory; its "size" attribute tells how many bytes to copy from "raw" below */
+                r = read_virtual_file_at(vfd, "size", LINE_MAX, &szs, NULL);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "'/%s/size, ignoring: %m", d->d_name);
+                        continue;
+                }
+
+                r = safe_atou64(strstrip(szs), &sz);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse size of credential '%s', ignoring: %s", d->d_name, szs);
+                        continue;
+                }
+
+                if (!credential_size_ok(c, d->d_name, sz))
+                        continue;
+
+                /* Ideally we'd just symlink the data here. Alas the kernel driver exports the raw file as
+                 * having size zero, and we'd rather not have applications support such credential
+                 * files. Let's hence copy the files to make them regular. */
+
+                rfd = openat(vfd, "raw", O_RDONLY|O_CLOEXEC);
+                if (rfd < 0) {
+                        log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/raw, ignoring: %m", d->d_name);
+                        continue;
+                }
+
+                r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+                if (r < 0)
+                        return r;
+
+                nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name);
+                if (nfd == -EEXIST) /* a credential of this name exists already, keep that one */
+                        continue;
+                if (nfd < 0)
+                        return nfd;
+
+                r = copy_bytes(rfd, nfd, sz, 0);
+                if (r < 0) {
+                        (void) unlinkat(c->target_dir_fd, d->d_name, 0); /* remove the incomplete copy again */
+                        return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name);
+                }
+
+                c->size_sum += sz;
+                c->n_credentials++;
+
+                log_debug("Successfully copied qemu fw_cfg credential '%s'.", d->d_name);
+        }
+
+        return 0;
+}
+
+static int parse_smbios_strings(ImportCredentialContext *c, const char *data, size_t size) {
+        size_t left, skip;
+        const char *p;
+        int r;
+
+        assert(c);
+        assert(data || size == 0);
+
+        /* Unpacks a packed series of SMBIOS OEM vendor strings. These are a series of NUL terminated
+         * strings, one after the other. */
+
+        for (p = data, left = size; left > 0; p += skip, left -= skip) {
+                _cleanup_free_ void *buf = NULL;
+                _cleanup_free_ char *cn = NULL;
+                _cleanup_close_ int nfd = -EBADF;
+                const char *nul, *n, *eq;
+                const void *cdata;
+                size_t buflen, cdata_len;
+                bool unbase64;
+
+                nul = memchr(p, 0, left);
+                if (nul)
+                        skip = (nul - p) + 1;
+                else {
+                        nul = p + left; /* unterminated final string: the end of the buffer is its end */
+                        skip = left;
+                }
+
+                if (nul - p == 0) /* Skip empty strings */
+                        continue;
+
+                /* Only care about strings starting with either of these two prefixes */
+                if ((n = memory_startswith(p, nul - p, "io.systemd.credential:")))
+                        unbase64 = false;
+                else if ((n = memory_startswith(p, nul - p, "io.systemd.credential.binary:")))
+                        unbase64 = true;
+                else {
+                        _cleanup_free_ char *escaped = NULL;
+
+                        escaped = cescape_length(p, nul - p);
+                        log_debug("Ignoring OEM string: %s", strnull(escaped));
+                        continue;
+                }
+
+                eq = memchr(n, '=', nul - n);
+                if (!eq) {
+                        log_warning("SMBIOS OEM string lacks '=' character, ignoring.");
+                        continue;
+                }
+
+                cn = memdup_suffix0(n, eq - n);
+                if (!cn)
+                        return log_oom();
+
+                if (!credential_name_valid(cn)) {
+                        log_warning("SMBIOS credential name '%s' is not valid, ignoring.", cn); /* no error code here, hence no %m */
+                        continue;
+                }
+
+                /* Optionally base64 decode the data, if requested, to allow binary credentials */
+                if (unbase64) {
+                        r = unbase64mem(eq + 1, nul - (eq + 1), &buf, &buflen);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to base64 decode credential '%s', ignoring: %m", cn);
+                                continue;
+                        }
+
+                        cdata = buf;
+                        cdata_len = buflen;
+                } else {
+                        cdata = eq + 1;
+                        cdata_len = nul - (eq + 1);
+                }
+
+                if (!credential_size_ok(c, cn, cdata_len))
+                        continue;
+
+                r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+                if (r < 0)
+                        return r;
+
+                nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, cn);
+                if (nfd == -EEXIST) /* a credential of this name exists already, keep that one */
+                        continue;
+                if (nfd < 0)
+                        return nfd;
+
+                r = loop_write(nfd, cdata, cdata_len);
+                if (r < 0) {
+                        (void) unlinkat(c->target_dir_fd, cn, 0); /* remove the incomplete file again */
+                        return log_error_errno(r, "Failed to write credential: %m");
+                }
+
+                c->size_sum += cdata_len;
+                c->n_credentials++;
+
+                log_debug("Successfully processed SMBIOS credential '%s'.", cn);
+        }
+
+        return 0;
+}
+
+static int import_credentials_smbios(ImportCredentialContext *c) {
+        int r;
+
+        /* Parses DMI OEM strings fields (SMBIOS type 11), as settable with qemu's -smbios type=11,value=… switch. */
+
+        if (detect_container() > 0) /* don't access /sys/ in a container */
+                return 0;
+
+        if (detect_confidential_virtualization() > 0) /* don't trust firmware if confidential VMs */
+                return 0;
+
+        for (unsigned i = 0;; i++) {
+                struct dmi_field_header {
+                        uint8_t type;
+                        uint8_t length;
+                        uint16_t handle;
+                        uint8_t count;
+                        char contents[];
+                } _packed_ *dmi_field_header;
+                _cleanup_free_ char *p = NULL;
+                _cleanup_free_ void *data = NULL;
+                size_t size;
+
+                assert_cc(offsetof(struct dmi_field_header, contents) == 5);
+
+                if (asprintf(&p, "/sys/firmware/dmi/entries/11-%u/raw", i) < 0)
+                        return log_oom();
+                /* Read limit: the fixed header (not the size of a pointer to it) plus the total payload budget */
+                r = read_virtual_file(p, offsetof(struct dmi_field_header, contents) + CREDENTIALS_TOTAL_SIZE_MAX, (char**) &data, &size);
+                if (r < 0) {
+                        /* Once we reach ENOENT there are no more DMI Type 11 fields around. */
+                        log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to open '%s', ignoring: %m", p);
+                        break;
+                }
+
+                if (size < offsetof(struct dmi_field_header, contents))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "DMI field header of '%s' too short.", p);
+
+                dmi_field_header = data;
+                if (dmi_field_header->type != 11 ||
+                    dmi_field_header->length != offsetof(struct dmi_field_header, contents))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid DMI field header.");
+
+                r = parse_smbios_strings(c, dmi_field_header->contents, size - offsetof(struct dmi_field_header, contents));
+                if (r < 0)
+                        return r;
+
+                if (i == UINT_MAX) /* Prevent overflow */
+                        break;
+        }
+
+        return 0;
+}
+
+static int import_credentials_initrd(ImportCredentialContext *c) {
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        _cleanup_close_ int source_dir_fd = -EBADF;
+        int r;
+
+        assert(c);
+
+        /* This imports credentials from /run/credentials/@initrd/ into our credentials directory and deletes
+         * the source directory afterwards. This is run once after the initrd → host transition. This is
+         * supposed to establish a well-defined avenue for initrd-based host configurators to pass
+         * credentials into the main system. */
+
+        if (in_initrd())
+                return 0;
+
+        source_dir_fd = open("/run/credentials/@initrd", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+        if (source_dir_fd < 0) {
+                if (errno == ENOENT)
+                        log_debug_errno(errno, "No credentials passed from initrd.");
+                else
+                        log_warning_errno(errno, "Failed to open '/run/credentials/@initrd', ignoring: %m");
+                return 0;
+        }
+
+        r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to read '/run/credentials/@initrd' contents, ignoring: %m");
+                return 0;
+        }
+
+        FOREACH_ARRAY(entry, de->entries, de->n_entries) {
+                _cleanup_close_ int cfd = -EBADF, nfd = -EBADF;
+                const struct dirent *d = *entry;
+                struct stat st;
+
+                if (!credential_name_valid(d->d_name)) {
+                        log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+                        continue;
+                }
+
+                cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC);
+                if (cfd < 0) {
+                        log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name);
+                        continue;
+                }
+
+                if (fstat(cfd, &st) < 0) {
+                        log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name);
+                        continue;
+                }
+
+                r = stat_verify_regular(&st);
+                if (r < 0) {
+                        log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name);
+                        continue;
+                }
+
+                if (!credential_size_ok(c, d->d_name, st.st_size))
+                        continue;
+
+                r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+                if (r < 0)
+                        return r;
+
+                nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name);
+                if (nfd == -EEXIST) /* a credential of this name exists already, keep that one */
+                        continue;
+                if (nfd < 0)
+                        return nfd;
+
+                r = copy_bytes(cfd, nfd, st.st_size, 0);
+                if (r < 0) {
+                        (void) unlinkat(c->target_dir_fd, d->d_name, 0); /* remove the incomplete copy again */
+                        return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name);
+                }
+
+                c->size_sum += st.st_size;
+                c->n_credentials++;
+
+                log_debug("Successfully copied initrd credential '%s'.", d->d_name);
+
+                (void) unlinkat(source_dir_fd, d->d_name, 0); /* imported, hence drop the source copy */
+        }
+
+        source_dir_fd = safe_close(source_dir_fd); /* done with the directory, close it before the rmdir() */
+
+        if (rmdir("/run/credentials/@initrd") < 0)
+                log_warning_errno(errno, "Failed to remove /run/credentials/@initrd after import, ignoring: %m");
+
+        return 0;
+}
+
+static int import_credentials_trusted(void) {
+        _cleanup_(import_credentials_context_free) ImportCredentialContext c = { .target_dir_fd = -EBADF };
+        int r_qemu, r_smbios, r_cmdline, r_initrd, r_finalize;
+
+        /* Early-boot path, used while no credentials have been imported yet. All four sources are always
+         * tried, in this order, even if an earlier one failed. A finalization failure takes precedence
+         * for the return value, otherwise the first failing source (in the same order) does. */
+
+        r_qemu = import_credentials_qemu(&c);
+        r_smbios = import_credentials_smbios(&c);
+        r_cmdline = import_credentials_proc_cmdline(&c);
+        r_initrd = import_credentials_initrd(&c);
+
+        if (c.n_credentials > 0) {
+                log_debug("Imported %u credentials from kernel command line/smbios/fw_cfg/initrd.", c.n_credentials);
+
+                r_finalize = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY");
+                if (r_finalize < 0)
+                        return r_finalize;
+        }
+
+        if (r_qemu < 0)
+                return r_qemu;
+        if (r_smbios < 0)
+                return r_smbios;
+        return r_cmdline < 0 ? r_cmdline : r_initrd;
+}
+
+static int merge_credentials_trusted(const char *creds_dir) {
+        _cleanup_(import_credentials_context_free) ImportCredentialContext c = {
+                .target_dir_fd = -EBADF,
+        };
+        int import_result, finalize_result;
+
+        /* Post-transition path: credentials have been imported before already, but the initrd might have
+         * left some more for us to pick up. There is nothing to do while still in the initrd. */
+
+        if (in_initrd())
+                return 0;
+
+        if (!path_equal_ptr(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) {
+                /* Never merge initrd credentials into a credentials directory that isn't ours */
+                log_debug("Not importing initrd credentials, as foreign $CREDENTIALS_DIRECTORY has been set.");
+                return 0;
+        }
+
+        /* Even if the import reports an error, whatever made it into the directory still gets finalized
+         * below; the import result is what we return unless finalization itself fails. */
+        import_result = import_credentials_initrd(&c);
+        if (c.n_credentials == 0)
+                return import_result;
+
+        log_debug("Merged %u credentials from initrd.", c.n_credentials);
+
+        finalize_result = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY");
+        if (finalize_result < 0)
+                return finalize_result;
+
+        return import_result;
+}
+
+static int symlink_credential_dir(const char *envvar, const char *path, const char *where) {
+        int k;
+
+        assert(envvar);
+        assert(path);
+        assert(where);
+
+        if (!(path_is_valid(path) && path_is_absolute(path)))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "String specified via $%s is not a valid absolute path, refusing: %s", envvar, path);
+
+        /* When the variable already names the very location the symlink would be created at, we most
+         * likely imported credentials earlier and set the variable ourselves back then, so there is
+         * nothing left to link. */
+        if (path_equal(path, where))
+                return 0;
+
+        k = symlink_idempotent(path, where, /* make_relative= */ true);
+        if (k >= 0)
+                return 0;
+
+        return log_error_errno(k, "Failed to link $%s to %s: %m", envvar, where);
+}
+
+static int setenv_notify_socket(void) {
+        _cleanup_free_ char *socket_address = NULL;
+        int k;
+
+        /* Exports the 'vmm.notify_socket' credential as $NOTIFY_SOCKET; returns 1 if it was set. */
+        k = read_credential_with_decryption("vmm.notify_socket", (void **)&socket_address, /* ret_size= */ NULL);
+        if (k < 0)
+                return log_warning_errno(k, "Failed to read 'vmm.notify_socket' credential, ignoring: %m");
+
+        if (isempty(socket_address))
+                return 0;
+
+        if (setenv("NOTIFY_SOCKET", socket_address, /* replace= */ 1) >= 0)
+                return 1;
+        return log_warning_errno(errno, "Failed to set $NOTIFY_SOCKET environment variable, ignoring: %m");
+}
+
+static int report_credentials_per_func(const char *title, int (*get_directory_func)(const char **ret)) {
+        _cleanup_free_ DirectoryEntries *listing = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        _cleanup_free_ char *names = NULL;
+        const char *dir = NULL;
+        int n_found = 0, k;
+
+        /* Logs the validly named credentials found in the directory get_directory_func() reports, labelled
+         * with 'title'. Returns their number, 0 if the directory is not set (-ENXIO), or an error. */
+
+        assert(title);
+        assert(get_directory_func);
+
+        k = get_directory_func(&dir);
+        if (k == -ENXIO) /* Env var not set */
+                return 0;
+        if (k < 0)
+                return log_warning_errno(k, "Failed to determine %s directory: %m", title);
+
+        fd = open(dir, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        if (fd < 0)
+                return log_warning_errno(errno, "Failed to open credentials directory %s: %m", dir);
+
+        k = readdir_all(fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &listing);
+        if (k < 0)
+                return log_warning_errno(k, "Failed to enumerate credentials directory %s: %m", dir);
+
+        FOREACH_ARRAY(entry, listing->entries, listing->n_entries) {
+                const char *name = (*entry)->d_name;
+
+                if (!credential_name_valid(name))
+                        continue;
+
+                if (!strextend_with_separator(&names, ", ", name))
+                        return log_oom();
+                n_found++;
+        }
+
+        if (names)
+                log_info("Received %s: %s", title, names);
+
+        return n_found;
+}
+
+static void report_credentials(void) {
+        int n_regular, n_untrusted, level;
+
+        n_regular = report_credentials_per_func("regular credentials", get_credentials_dir);
+        n_untrusted = report_credentials_per_func("untrusted credentials", get_encrypted_credentials_dir);
+
+        /* Only be loud if we actually got something; failures (negative values) count as zero. */
+        level = (n_regular <= 0 && n_untrusted <= 0) ? LOG_DEBUG : LOG_INFO;
+        log_full(level, "Acquired %i regular credentials, %i untrusted credentials.",
+                 n_regular > 0 ? n_regular : 0, n_untrusted > 0 ? n_untrusted : 0);
+}
+
+int import_credentials(void) {
+        const char *received_creds_dir = NULL, *received_encrypted_creds_dir = NULL;
+        bool have_dir, have_encrypted_dir;
+        int ret, k;
+
+        /* Makes the system credentials available. If a credentials directory was passed to us via the
+         * environment already, it is linked to our regular paths and initrd credentials are merged in;
+         * otherwise credentials are imported from scratch. Returns 0 if importing is disabled on the
+         * kernel command line, else the first error among the steps, or the result of the last one. */
+
+        ret = get_credentials_dir(&received_creds_dir);
+        if (ret < 0 && ret != -ENXIO) /* ENXIO → env var not set yet */
+                log_warning_errno(ret, "Failed to determine credentials directory, ignoring: %m");
+        have_dir = ret >= 0;
+
+        ret = get_encrypted_credentials_dir(&received_encrypted_creds_dir);
+        if (ret < 0 && ret != -ENXIO) /* ENXIO → env var not set yet */
+                log_warning_errno(ret, "Failed to determine encrypted credentials directory, ignoring: %m");
+        have_encrypted_dir = ret >= 0;
+
+        if (!have_dir && !have_encrypted_dir) {
+                _cleanup_free_ char *value = NULL;
+
+                k = proc_cmdline_get_key("systemd.import_credentials", PROC_CMDLINE_STRIP_RD_PREFIX, &value);
+                if (k < 0)
+                        log_debug_errno(k, "Failed to check if 'systemd.import_credentials=' kernel command line option is set, ignoring: %m");
+                else if (k > 0) {
+                        k = parse_boolean(value);
+                        if (k < 0)
+                                log_debug_errno(k, "Failed to parse 'systemd.import_credentials=' parameter, ignoring: %m");
+                        else if (k == 0) {
+                                log_notice("systemd.import_credentials=no is set, skipping importing of credentials.");
+                                return 0;
+                        }
+                }
+
+                ret = import_credentials_boot();
+
+                k = import_credentials_trusted();
+                if (ret >= 0)
+                        ret = k;
+        } else {
+                /* Maybe an earlier stage initrd already set this up? If so, don't try to import anything again. */
+                log_debug("Not importing credentials, $CREDENTIALS_DIRECTORY or $ENCRYPTED_CREDENTIALS_DIRECTORY already set.");
+
+                /* But, let's make sure the creds are available from our regular paths. */
+                ret = received_creds_dir ?
+                        symlink_credential_dir("CREDENTIALS_DIRECTORY", received_creds_dir, SYSTEM_CREDENTIALS_DIRECTORY) : 0;
+
+                if (received_encrypted_creds_dir) {
+                        k = symlink_credential_dir("ENCRYPTED_CREDENTIALS_DIRECTORY", received_encrypted_creds_dir, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY);
+                        if (ret >= 0)
+                                ret = k;
+                }
+
+                k = merge_credentials_trusted(received_creds_dir);
+                if (ret >= 0)
+                        ret = k;
+        }
+
+        report_credentials();
+
+        /* Propagate vmm_notify_socket credential → $NOTIFY_SOCKET env var */
+        (void) setenv_notify_socket();
+
+        return ret;
+}
diff --git a/src/core/import-creds.h b/src/core/import-creds.h
new file mode 100644
index 0000000..a87865c
--- /dev/null
+++ b/src/core/import-creds.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int import_credentials(void);
diff --git a/src/core/job.c b/src/core/job.c
new file mode 100644
index 0000000..e78c2a7
--- /dev/null
+++ b/src/core/job.c
@@ -0,0 +1,1712 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "sd-id128.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "async.h"
+#include "cgroup.h"
+#include "dbus-job.h"
+#include "dbus.h"
+#include "escape.h"
+#include "fileio.h"
+#include "job.h"
+#include "log.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "serialize.h"
+#include "set.h"
+#include "sort-util.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "unit.h"
+#include "virt.h"
+
+Job* job_new_raw(Unit *unit) {
+        Job *job;
+
+        /* Allocates a job object for 'unit' that has neither a valid type nor an ID assigned yet; used
+         * for deserialization. All fields not listed below start out zeroed. Returns NULL if the
+         * allocation fails. */
+
+        assert(unit);
+
+        job = new(Job, 1);
+        if (job)
+                *job = (Job) {
+                        .type = _JOB_TYPE_INVALID,
+                        .unit = unit,
+                        .manager = unit->manager,
+                };
+
+        return job;
+}
+
+static uint32_t manager_get_new_job_id(Manager *m) {
+        bool overflow = false;
+        uint32_t candidate;
+
+        /* Hands out the next job ID that is not in use in m->jobs, advancing m->current_job_id as it
+         * goes and wrapping around from UINT32_MAX to 1. Wrapping twice in one call is fatal. */
+
+        assert(m);
+
+        do {
+                candidate = m->current_job_id;
+                if (_unlikely_(candidate == UINT32_MAX)) {
+                        assert_se(!overflow);
+                        overflow = true;
+                        m->current_job_id = 1;
+                } else
+                        m->current_job_id++;
+        } while (hashmap_contains(m->jobs, UINT32_TO_PTR(candidate)));
+
+        return candidate;
+}
+
+Job* job_new(Unit *unit, JobType type) {
+        Job *job;
+
+        /* Allocates a job of the given type for 'unit' and gives it a fresh ID. The job is not linked
+         * to other jobs here (see job_dependency_new() for that). Returns NULL if allocation fails. */
+
+        assert(type < _JOB_TYPE_MAX);
+
+        job = job_new_raw(unit);
+        if (job) {
+                job->type = type;
+                job->id = manager_get_new_job_id(job->manager);
+        }
+
+        return job;
+}
+
+void job_unlink(Job *j) {
+        /* Takes the job off the run, D-Bus and GC queues it is on, and releases its timer source. */
+        assert(j);
+        assert(!j->installed);
+        assert(!j->transaction_prev);
+        assert(!j->transaction_next);
+        assert(!j->subject_list);
+        assert(!j->object_list);
+
+        Manager *m = j->manager;
+
+        if (j->in_run_queue) {
+                prioq_remove(m->run_queue, j, &j->run_queue_idx);
+                j->in_run_queue = false;
+        }
+        if (j->in_dbus_queue) {
+                LIST_REMOVE(dbus_queue, m->dbus_job_queue, j);
+                j->in_dbus_queue = false;
+        }
+        if (j->in_gc_queue) {
+                LIST_REMOVE(gc_queue, m->gc_job_queue, j);
+                j->in_gc_queue = false;
+        }
+        j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source);
+}
+
+Job* job_free(Job *j) {
+        /* Destroys a job that is neither installed nor part of a transaction anymore: takes it off
+         * all queues, drops what it references and frees it. Returns the result of mfree(). */
+
+        assert(j);
+        assert(!j->installed);
+        assert(!j->transaction_prev);
+        assert(!j->transaction_next);
+        assert(!j->subject_list);
+        assert(!j->object_list);
+
+        job_unlink(j);
+        sd_bus_track_unref(j->bus_track);
+        strv_free(j->deserialized_clients);
+        activation_details_unref(j->activation_details);
+        return mfree(j);
+}
+
+static void job_set_state(Job *j, JobState state) {
+        /* Moves the job to 'state'. For installed jobs the manager's count of running jobs is kept in
+         * sync, and the jobs-in-progress event source is dropped once that count reaches zero. */
+
+        assert(j);
+        assert(state >= 0);
+        assert(state < _JOB_STATE_MAX);
+
+        if (j->state == state)
+                return;
+
+        j->state = state;
+        if (!j->installed)
+                return;
+
+        if (j->state == JOB_RUNNING) {
+                j->unit->manager->n_running_jobs++;
+                return;
+        }
+
+        assert(j->state == JOB_WAITING);
+        assert(j->unit->manager->n_running_jobs > 0);
+        if (--j->unit->manager->n_running_jobs <= 0)
+                j->unit->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->unit->manager->jobs_in_progress_event_source);
+}
+
+void job_uninstall(Job *j) {
+        Job **pj;
+
+        /* Detaches an installed job from its unit and drops it from the manager's jobs hashmap. */
+
+        assert(j->installed);
+
+        job_set_state(j, JOB_WAITING);
+
+        if (j->type == JOB_NOP)
+                pj = &j->unit->nop_job;
+        else
+                pj = &j->unit->job;
+        assert(*pj == j);
+
+        /* daemon-reload should be transparent to job observers */
+        if (!MANAGER_IS_RELOADING(j->manager))
+                bus_job_send_removed_signal(j);
+
+        *pj = NULL;
+        unit_add_to_gc_queue(j->unit);
+        unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */
+        hashmap_remove_value(j->manager->jobs, UINT32_TO_PTR(j->id), j);
+        j->installed = false;
+}
+
+static bool job_type_allows_late_merge(JobType t) {
+        /* Decides whether a job of type 't' may still be merged into a job that is already
+         * running. Reload jobs may not. Consider:
+         *   1. A daemon is in the middle of a reload: it has read its config file already, but
+         *      has not completed the reload operation yet.
+         *   2. The config file is edited.
+         *   3. Another reload is requested so that the daemon picks up the change.
+         * Were the second reload merged into the first, the daemon would never learn about the
+         * new config.
+         * JOB_RESTART may be merged on the other hand: it gets patched into JOB_START once the
+         * unit has stopped, hence a JOB_RESTART still running means the unit has not stopped
+         * yet, and merging is still safe. */
+        if (t == JOB_RELOAD)
+                return false;
+        return true;
+}
+
+static void job_merge_into_installed(Job *j, Job *other) {
+        /* Merges the job 'other' into the installed job 'j'; both must belong to the same unit. */
+        assert(j->installed);
+        assert(j->unit == other->unit);
+
+        if (j->type == JOB_NOP)
+                assert(other->type == JOB_NOP);
+        else {
+                assert_se(job_type_merge_and_collapse(&j->type, other->type, j->unit) == 0);
+                /* Keep the oldest ActivationDetails, if any */
+                if (!j->activation_details)
+                        j->activation_details = TAKE_PTR(other->activation_details);
+        }
+        j->irreversible = other->irreversible || j->irreversible;
+        j->ignore_order = other->ignore_order || j->ignore_order;
+}
+
+Job* job_install(Job *j, bool refuse_late_merge) {
+        Job **pj;
+        Job *uj;
+
+        /* Installs 'j' on its unit. A conflicting job installed there already is canceled first, a
+         * mergeable one absorbs 'j' instead. Returns the job installed afterwards: 'j' or the old job. */
+
+        assert(j);
+        assert(!j->installed);
+        assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(j->state == JOB_WAITING);
+
+        pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+        uj = *pj;
+
+        if (uj && job_type_is_conflicting(uj->type, j->type))
+                job_finish_and_invalidate(uj, JOB_CANCELED, false, false);
+        else if (uj) {
+                /* Not conflicting, i.e. mergeable. Whether uj may simply absorb j has to be decided
+                 * before the merge, since merging can change the type of uj. */
+                bool absorb =
+                        uj->state == JOB_WAITING ||
+                        (!refuse_late_merge && job_type_allows_late_merge(j->type) && job_type_is_superset(uj->type, j->type));
+
+                job_merge_into_installed(uj, j);
+                if (absorb) {
+                        log_unit_debug(uj->unit,
+                                       "Merged %s/%s into installed job %s/%s as %"PRIu32,
+                                       j->unit->id, job_type_to_string(j->type), uj->unit->id,
+                                       job_type_to_string(uj->type), uj->id);
+                } else {
+                        /* Already running and not safe to merge into: patch uj to become a merged job
+                         * and re-run it. XXX It should be safer to queue j to run after uj finishes,
+                         * but it is not currently possible to have more than one installed job per unit. */
+                        log_unit_debug(uj->unit,
+                                       "Merged into running job, re-running: %s/%s as %"PRIu32,
+                                       uj->unit->id, job_type_to_string(uj->type), uj->id);
+                        job_set_state(uj, JOB_WAITING);
+                }
+
+                return uj;
+        }
+
+        /* Install the job */
+        assert(!*pj);
+        *pj = j;
+        j->installed = true;
+        j->manager->n_installed_jobs++;
+
+        log_unit_debug(j->unit,
+                       "Installed new job %s/%s as %u",
+                       j->unit->id, job_type_to_string(j->type), (unsigned) j->id);
+
+        job_add_to_gc_queue(j);
+        job_add_to_dbus_queue(j); /* announce this job to clients */
+        unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */
+
+        return j;
+}
+
+int job_install_deserialized(Job *j) {
+        Job **pj;
+        int r;
+
+        /* Installs a deserialized job on its unit and registers it in the manager's jobs hashmap,
+         * assigning a new ID if it has none. Returns 0 on success, an error otherwise. */
+
+        assert(!j->installed);
+
+        if (!(j->type >= 0 && j->type < _JOB_TYPE_MAX_IN_TRANSACTION))
+                return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EINVAL), "Invalid job type %s in deserialization.",
+                                            strna(job_type_to_string(j->type)));
+
+        pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+        if (*pj)
+                return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EEXIST),
+                                            "Unit already has a job installed. Not installing deserialized job.");
+
+        /* A job without ID, or whose ID we failed to deserialize, gets a new one. */
+        if (j->id <= 0)
+                j->id = manager_get_new_job_id(j->manager);
+
+        r = hashmap_ensure_put(&j->manager->jobs, NULL, UINT32_TO_PTR(j->id), j);
+        if (r == -EEXIST)
+                return log_unit_debug_errno(j->unit, r, "Job ID %" PRIu32 " already used, cannot deserialize job.", j->id);
+        if (r < 0)
+                return log_unit_debug_errno(j->unit, r, "Failed to insert job into jobs hash table: %m");
+
+        *pj = j;
+        j->installed = true;
+        if (j->state == JOB_RUNNING)
+                j->unit->manager->n_running_jobs++;
+
+        log_unit_debug(j->unit, "Reinstalled deserialized job %s/%s as %u",
+                       j->unit->id, job_type_to_string(j->type), (unsigned) j->id);
+        return 0;
+}
+
+JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts) {
+        JobDependency *dep;
+
+        /* Creates a link encoding that the 'subject' job needs the 'object' job in some way, and adds
+         * it to the dependency lists of both. A NULL 'subject' means the requester is the 'anchor'
+         * job, i.e. the one the user explicitly asked for. Returns NULL if the allocation fails. */
+
+        assert(object);
+
+        dep = new(JobDependency, 1);
+        if (!dep)
+                return NULL;
+
+        *dep = (JobDependency) {
+                .subject = subject,
+                .object = object,
+                .matters = matters,
+                .conflicts = conflicts,
+        };
+
+        LIST_PREPEND(object, object->object_list, dep);
+        if (subject)
+                LIST_PREPEND(subject, subject->subject_list, dep);
+
+        return dep;
+}
+
+void job_dependency_free(JobDependency *l) {
+        /* Takes the link off the dependency lists of the jobs it connects (the subject is optional)
+         * and releases it. */
+        assert(l);
+
+        LIST_REMOVE(object, l->object->object_list, l);
+        if (l->subject)
+                LIST_REMOVE(subject, l->subject->subject_list, l);
+        free(l);
+}
+
+void job_dump(Job *j, FILE *f, const char *prefix) {
+        /* Writes a human-readable description of the job to 'f', every line starting with 'prefix'
+         * (which is passed through strempty() first). j->id is a uint32_t, hence use PRIu32 for it. */
+        assert(j);
+        assert(f);
+        prefix = strempty(prefix);
+
+        fprintf(f,
+                "%s-> Job %" PRIu32 ":\n"
+                "%s\tAction: %s -> %s\n"
+                "%s\tState: %s\n"
+                "%s\tIrreversible: %s\n"
+                "%s\tMay GC: %s\n",
+                prefix, j->id,
+                prefix, j->unit->id, job_type_to_string(j->type),
+                prefix, job_state_to_string(j->state),
+                prefix, yes_no(j->irreversible),
+                prefix, yes_no(job_may_gc(j)));
+}
+
+/*
+ * Merging is commutative, so imagine the matrix as symmetric. We store only its
+ * lower triangle to avoid duplication, and not the main diagonal, because A merged
+ * with A is simply A. For a > b the result of merging a with b is found at index
+ * (a - 1) * a / 2 + b; an entry of -1 means the two types cannot be merged.
+ *
+ * If the resulting type is collapsed immediately afterwards (to get rid of
+ * the JOB_RELOAD_OR_START, which lies outside the lookup function's domain),
+ * the following properties hold:
+ *
+ * Merging is associative! A merged with B, and then merged with C is the same
+ * as A merged with the result of B merged with C.
+ *
+ * Mergeability is transitive! If A can be merged with B and B with C then
+ * A can also be merged with C.
+ * Also, if A merged with B cannot be merged with C, then either A or B cannot
+ * be merged with C either.
+ */
+static const JobType job_merging_table[] = {
+/* What \ With       *  JOB_START         JOB_VERIFY_ACTIVE  JOB_STOP JOB_RELOAD */
+/*********************************************************************************/
+/*JOB_START          */
+/*JOB_VERIFY_ACTIVE  */ JOB_START,
+/*JOB_STOP           */ -1,                  -1,
+/*JOB_RELOAD         */ JOB_RELOAD_OR_START, JOB_RELOAD,          -1,
+/*JOB_RESTART        */ JOB_RESTART,         JOB_RESTART,         -1, JOB_RESTART,
+};
+
+JobType job_type_lookup_merge(JobType a, JobType b) {
+        JobType hi, lo;
+
+        /* Looks up the result of merging job types 'a' and 'b' in job_merging_table. A type merged
+         * with itself stays the same; a negative result means the two types cannot be merged. */
+        assert_cc(ELEMENTSOF(job_merging_table) == _JOB_TYPE_MAX_MERGING * (_JOB_TYPE_MAX_MERGING - 1) / 2);
+        assert(a >= 0 && a < _JOB_TYPE_MAX_MERGING);
+        assert(b >= 0 && b < _JOB_TYPE_MAX_MERGING);
+
+        if (a == b)
+                return a;
+        /* Only the lower triangle is stored, hence index by (larger, smaller) */
+        hi = a > b ? a : b;
+        lo = a > b ? b : a;
+        return job_merging_table[(hi - 1) * hi / 2 + lo];
+}
+
+bool job_type_is_redundant(JobType a, UnitActiveState b) {
+        /* Tells whether a job of type 'a' has nothing left to do for a unit whose active state
+         * currently is 'b'. */
+
+        switch (a) {
+
+        case JOB_START:
+        case JOB_VERIFY_ACTIVE:
+                return b == UNIT_ACTIVE || b == UNIT_RELOADING;
+
+        case JOB_STOP:
+                return b == UNIT_INACTIVE || b == UNIT_FAILED;
+
+        case JOB_RELOAD:
+                return b == UNIT_RELOADING;
+
+        case JOB_NOP:
+                return true;
+
+        case JOB_RESTART:
+                /* Restart jobs are never redundant.
+                 *
+                 * For ACTIVE/RELOADING units, this is obvious.
+                 *
+                 * For ACTIVATING units, it's more subtle:
+                 *
+                 * Generally, if a service Requires= another unit, restarts of
+                 * that unit must be propagated to the service. If the service is
+                 * ACTIVATING, it must still be restarted since it might have
+                 * stale information regarding the other unit.
+                 *
+                 * For example, consider a service that Requires= a socket: if
+                 * the socket is restarted while the service is still ACTIVATING,
+                 * the service has to be restarted too so that it gets the new
+                 * socket. */
+                return false;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+JobType job_type_collapse(JobType t, Unit *u) {
+        UnitActiveState s;
+
+        /* Turns the conditional job types (JOB_TRY_RESTART, JOB_TRY_RELOAD, JOB_RELOAD_OR_START) into
+         * concrete ones, depending on the current active state of 'u'. Any other type is returned
+         * unchanged, without looking at the unit. */
+
+        if (!IN_SET(t, JOB_TRY_RESTART, JOB_TRY_RELOAD, JOB_RELOAD_OR_START))
+                return t;
+
+        s = unit_active_state(u);
+        if (t == JOB_TRY_RESTART) {
+                /* Be sure to keep the restart job even if the unit is ACTIVATING.
+                 *
+                 * See the job_type_is_redundant(JOB_RESTART) for more info */
+                if (UNIT_IS_ACTIVE_OR_ACTIVATING(s))
+                        return JOB_RESTART;
+
+                return JOB_NOP;
+        }
+
+        if (t == JOB_TRY_RELOAD) {
+                if (UNIT_IS_ACTIVE_OR_RELOADING(s))
+                        return JOB_RELOAD;
+
+                return JOB_NOP;
+        }
+
+        /* Only JOB_RELOAD_OR_START is left at this point */
+        if (UNIT_IS_ACTIVE_OR_RELOADING(s))
+                return JOB_RELOAD;
+
+        return JOB_START;
+}
+
+int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u) {
+        /* Merges type 'b' into '*a' and collapses the result for unit 'u'. Returns 0 on success, or
+         * -EEXIST (leaving '*a' untouched) if the two types cannot be merged. */
+        JobType merged = job_type_lookup_merge(*a, b);
+
+        if (merged < 0)
+                return -EEXIST;
+        *a = job_type_collapse(merged, u);
+        return 0;
+}
+
+static bool job_is_runnable(Job *j) {
+        Unit *other;
+
+        /* Checks whether ordering permits running this job now, i.e. that no job is installed on any
+         * of the units this job needs to be running after (in the case of a 'positive' job type) or
+         * before (in the case of a 'negative' job type) that has to go first.
+         *
+         * Note that unit types have a say in what is runnable, too. For example, by returning
+         * -EAGAIN from unit_start() they can indicate they are not runnable yet. */
+
+        assert(j);
+        assert(j->installed);
+
+        /* An explicit override of the ordering wins */
+        if (j->ignore_order)
+                return true;
+
+        /* NOP jobs are never held back */
+        if (j->type == JOB_NOP)
+                return true;
+
+        /* Is there a job of a unit we are ordered after that holds us back? */
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) > 0) {
+                        log_unit_debug(j->unit,
+                                       "starting held back, waiting for: %s",
+                                       other->id);
+                        return false;
+                }
+
+        /* Same question for the units we are ordered before */
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) > 0) {
+                        log_unit_debug(j->unit,
+                                       "stopping held back, waiting for: %s",
+                                       other->id);
+                        return false;
+                }
+
+        return true;
+}
+
+static void job_change_type(Job *j, JobType newtype) {
+        const char *unit_id;
+
+        /* Switches the job to type 'newtype', leaving a debug log trace of the conversion. */
+        assert(j);
+        unit_id = j->unit->id;
+        log_unit_debug(j->unit, "Converting job %s/%s -> %s/%s",
+                       unit_id, job_type_to_string(j->type), unit_id, job_type_to_string(newtype));
+        j->type = newtype;
+}
+
+static const char* job_start_message_format(Unit *u, JobType t) {
+        /* Returns the format string announcing a start, stop or reload job; unit type strings win. */
+        assert(u);
+        assert(IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD));
+
+        if (t == JOB_RELOAD)
+                return "Reloading %s...";
+        if (t != JOB_START)
+                return UNIT_VTABLE(u)->status_message_formats.starting_stopping[1] ?: "Stopping %s...";
+        return UNIT_VTABLE(u)->status_message_formats.starting_stopping[0] ?: "Starting %s...";
+}
+
+static void job_emit_start_message(Unit *u, uint32_t job_id, JobType t) {
+        _cleanup_free_ char *free_ident = NULL;
+        const char *ident, *format;
+        /* Emits the "Starting/Stopping/Reloading …" message of a job: to the log, and for start and stop jobs also as status output. */
+        assert(u);
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+        assert(u->id); /* We had better not try to run a unit that doesn't even have an id. */
+
+        if (!IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD))
+                return;
+
+        if (!unit_log_level_test(u, LOG_INFO))
+                return;
+
+        format = job_start_message_format(u, t);
+        ident = unit_status_string(u, &free_ident);
+
+        bool do_console = t != JOB_RELOAD; /* Reload status messages have traditionally not been printed
+                                            * to the console. */
+        bool console_only = do_console && log_on_console();
+
+        /* Print to the log first. */
+        if (!console_only) {  /* Skip this if it would only go on the console anyway */
+                /* Pick the MESSAGE_ID= field matching the job type. */
+                const char *mid =
+                        t == JOB_START ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTING_STR :
+                        t == JOB_STOP  ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPING_STR :
+                                         "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADING_STR;
+                const char *msg_fmt = strjoina("MESSAGE=", format);
+
+                /* Note that we deliberately use LOG_MESSAGE() instead of LOG_UNIT_MESSAGE() here, since this
+                 * is supposed to mimic closely what is written to screen using the status output, which is
+                 * supposed to be high level friendly output. */
+
+                DISABLE_WARNING_FORMAT_NONLITERAL;
+                log_unit_struct(u, LOG_INFO,
+                                msg_fmt, ident,
+                                "JOB_ID=%" PRIu32, job_id,
+                                "JOB_TYPE=%s", job_type_to_string(t),
+                                LOG_UNIT_INVOCATION_ID(u),
+                                mid);
+                REENABLE_WARNING;
+        }
+
+        /* Log to the console second. */
+        if (do_console) {
+                DISABLE_WARNING_FORMAT_NONLITERAL;
+                unit_status_printf(u, STATUS_TYPE_NORMAL, "", format, ident);
+                REENABLE_WARNING;
+        }
+}
+
+static const char* job_done_message_format(Unit *u, JobType t, JobResult result) {
+        /* Returns the format string (taking the unit name) for the outcome of a job, or NULL if there is none. */
+        static const char* const generic_finished_start_job[_JOB_RESULT_MAX] = {
+                [JOB_DONE]        = "Started %s.",
+                [JOB_TIMEOUT]     = "Timed out starting %s.",
+                [JOB_FAILED]      = "Failed to start %s.",
+                [JOB_DEPENDENCY]  = "Dependency failed for %s.",
+                [JOB_ASSERT]      = "Assertion failed for %s.",
+                [JOB_UNSUPPORTED] = "Starting of %s unsupported.",
+                [JOB_COLLECTED]   = "Unnecessary job was removed for %s.",
+                [JOB_ONCE]        = "Unit %s has been started before and cannot be started again.",
+        };
+        static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = {
+                [JOB_DONE]        = "Stopped %s.",
+                [JOB_FAILED]      = "Stopped %s with error.",
+                [JOB_TIMEOUT]     = "Timed out stopping %s.",
+        };
+        static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = {
+                [JOB_DONE]        = "Reloaded %s.",
+                [JOB_FAILED]      = "Reload failed for %s.",
+                [JOB_TIMEOUT]     = "Timed out reloading %s.",
+        };
+        /* When verify-active detects the unit is inactive, report it.
+         * Most likely a DEPEND warning from a requisiting unit will
+         * occur next and it's nice to see what was requisited. */
+        static const char* const generic_finished_verify_active_job[_JOB_RESULT_MAX] = {
+                [JOB_SKIPPED]     = "%s is inactive.",
+        };
+        const char *format;
+
+        assert(u);
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+        assert(result >= 0); /* 'result' is used as array index below, hence make sure it is in range */
+        assert(result < _JOB_RESULT_MAX);
+
+        /* Show condition check message if the job did not actually do anything due to unmet condition. */
+        if (t == JOB_START && result == JOB_DONE && !u->condition_result)
+                return "Condition check resulted in %s being skipped.";
+        if (IN_SET(t, JOB_START, JOB_STOP, JOB_RESTART)) {
+                const UnitStatusMessageFormats *formats = &UNIT_VTABLE(u)->status_message_formats;
+                if (formats->finished_job) {
+                        format = formats->finished_job(u, t, result);
+                        if (format)
+                                return format;
+                }
+                format = (t == JOB_START ? formats->finished_start_job : formats->finished_stop_job)[result];
+                if (format)
+                        return format;
+        }
+        /* Return generic strings */
+        switch (t) {
+        case JOB_START:
+                return generic_finished_start_job[result];
+        case JOB_STOP:
+        case JOB_RESTART:
+                return generic_finished_stop_job[result];
+        case JOB_RELOAD:
+                return generic_finished_reload_job[result];
+        case JOB_VERIFY_ACTIVE:
+                return generic_finished_verify_active_job[result];
+        default:
+                return NULL;
+        }
+}
+
+static const struct {
+        int log_level; /* level handed to unit_log_level_test() before a result is reported */
+        const char *color, *word; /* status word (unset: no console status), presumably printed in 'color' */
+} job_done_messages[_JOB_RESULT_MAX] = { /* indexed by JobResult */
+        [JOB_DONE]        = { LOG_INFO,    ANSI_OK_COLOR,         "  OK  " },
+        [JOB_CANCELED]    = { LOG_INFO,                                    },
+        [JOB_TIMEOUT]     = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    " TIME " },
+        [JOB_FAILED]      = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    "FAILED" },
+        [JOB_DEPENDENCY]  = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "DEPEND" },
+        [JOB_SKIPPED]     = { LOG_NOTICE,  ANSI_HIGHLIGHT,        " INFO " },
+        [JOB_INVALID]     = { LOG_INFO,                                    },
+        [JOB_ASSERT]      = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "ASSERT" },
+        [JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" },
+        [JOB_COLLECTED]   = { LOG_INFO,                                    },
+        [JOB_ONCE]        = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    " ONCE " },
+};
+
+static const char* job_done_mid(JobType type, JobResult result) {
+        /* Maps a finished job to the "MESSAGE_ID=..." field it is logged with. Start jobs get the
+         * 'started' ID only for JOB_DONE, and the 'failed' ID for any other result. Returns NULL for
+         * job types without a dedicated ID. */
+
+        if (type == JOB_START)
+                return result == JOB_DONE ?
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR :
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILED_STR;
+
+        if (type == JOB_RELOAD)
+                return "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADED_STR;
+
+        if (IN_SET(type, JOB_STOP, JOB_RESTART))
+                return "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPED_STR;
+
+        /* No dedicated message ID for the remaining job types */
+        return NULL;
+}
+
+/* Logs the outcome of a finished job and, where a status word exists for the result, prints it on the
+ * console too.
+ *
+ * u      - unit the job operated on
+ * job_id - numeric job id, logged as JOB_ID=
+ * t      - job type; selects the message format and the MESSAGE_ID= field
+ * result - job result; must be a valid index into job_done_messages[] */
+static void job_emit_done_message(Unit *u, uint32_t job_id, JobType t, JobResult result) {
+        _cleanup_free_ char *free_ident = NULL;
+        const char *ident, *format;
+
+        assert(u);
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+        /* The result is used as an array index below, hence validate it the same way as the job type. */
+        assert(result >= 0);
+        assert(result < _JOB_RESULT_MAX);
+
+        if (!unit_log_level_test(u, job_done_messages[result].log_level))
+                return;
+
+        format = job_done_message_format(u, t, result);
+        if (!format)
+                return;
+
+        ident = unit_status_string(u, &free_ident);
+
+        const char *status = job_done_messages[result].word;
+        bool do_console = t != JOB_RELOAD && status;
+        bool console_only = do_console && log_on_console();
+
+        if (t == JOB_START && result == JOB_DONE && !u->condition_result) {
+                /* No message on the console if the job did not actually do anything due to unmet condition. */
+                if (console_only)
+                        return;
+                else
+                        do_console = false;
+        }
+
+        if (!console_only) {  /* Skip printing if output goes to the console, and job_print_status_message()
+                               * will actually print something to the console. */
+                Condition *c;
+                const char *mid = job_done_mid(t, result);  /* mid may be NULL. log_unit_struct() will ignore it. */
+
+                c = t == JOB_START && result == JOB_DONE ? unit_find_failed_condition(u) : NULL;
+                if (c) {
+                        /* Special case units that were skipped because of an unmet condition check so that
+                         * we can add more information to the message. */
+                        if (c->trigger)
+                                log_unit_struct(
+                                        u,
+                                        job_done_messages[result].log_level,
+                                        LOG_MESSAGE("%s was skipped because no trigger condition checks were met.",
+                                                    ident),
+                                        "JOB_ID=%" PRIu32, job_id,
+                                        "JOB_TYPE=%s", job_type_to_string(t),
+                                        "JOB_RESULT=%s", job_result_to_string(result),
+                                        LOG_UNIT_INVOCATION_ID(u),
+                                        mid);
+                        else
+                                log_unit_struct(
+                                        u,
+                                        job_done_messages[result].log_level,
+                                        LOG_MESSAGE("%s was skipped because of an unmet condition check (%s=%s%s).",
+                                                    ident,
+                                                    condition_type_to_string(c->type),
+                                                    c->negate ? "!" : "",
+                                                    c->parameter),
+                                        "JOB_ID=%" PRIu32, job_id,
+                                        "JOB_TYPE=%s", job_type_to_string(t),
+                                        "JOB_RESULT=%s", job_result_to_string(result),
+                                        LOG_UNIT_INVOCATION_ID(u),
+                                        mid);
+                } else {
+                        /* The format string comes from a lookup table, hence is not a literal. */
+                        const char *msg_fmt = strjoina("MESSAGE=", format);
+
+                        DISABLE_WARNING_FORMAT_NONLITERAL;
+                        log_unit_struct(u, job_done_messages[result].log_level,
+                                        msg_fmt, ident,
+                                        "JOB_ID=%" PRIu32, job_id,
+                                        "JOB_TYPE=%s", job_type_to_string(t),
+                                        "JOB_RESULT=%s", job_result_to_string(result),
+                                        LOG_UNIT_INVOCATION_ID(u),
+                                        mid);
+                        REENABLE_WARNING;
+                }
+        }
+
+        if (do_console) {
+                /* Wrap the status word in its color only if colored output is enabled. */
+                if (log_get_show_color())
+                        status = strjoina(job_done_messages[result].color,
+                                          status,
+                                          ANSI_NORMAL);
+
+                DISABLE_WARNING_FORMAT_NONLITERAL;
+                unit_status_printf(u,
+                                   result == JOB_DONE ? STATUS_TYPE_NORMAL : STATUS_TYPE_NOTICE,
+                                   status, format, ident);
+                REENABLE_WARNING;
+
+                /* For failed start jobs additionally point the user at 'systemctl status'. */
+                if (t == JOB_START && result == JOB_FAILED) {
+                        _cleanup_free_ char *quoted = NULL;
+
+                        quoted = shell_maybe_quote(u->id, 0);
+                        if (quoted)
+                                manager_status_printf(u->manager, STATUS_TYPE_NORMAL, NULL,
+                                                      "See 'systemctl status %s' for details.", quoted);
+                }
+        }
+}
+
+/* Runs the unit operation that corresponds to the job's type (start, stop or reload; a restart job is
+ * carried out as a stop) and emits the job start message if the job still exists afterwards.
+ *
+ * On return *j has been looked up again by job id, hence it is only non-NULL if the manager still knows
+ * the job. Returns 0 if starting/stopping returned -EBADR (we then simply wait for the unit to change
+ * state), otherwise whatever the unit operation returned. */
+static int job_perform_on_unit(Job **j) {
+        bool wait_only;
+        int r;
+
+        assert(j);
+        assert(*j);
+
+        /* The job may go away while the operation below runs (for example: because it finishes
+         * immediately or is replaced by a new, conflicting job). Copy out everything we need first, and
+         * remember the id so that the job can be looked up again afterwards. */
+        Manager *m = (*j)->manager;
+        Unit *u = (*j)->unit;
+        JobType t = (*j)->type;
+        uint32_t id = (*j)->id;
+        ActivationDetails *a = (*j)->activation_details;
+
+        if (t == JOB_START) {
+                r = unit_start(u, a);
+                wait_only = r == -EBADR; /* If the unit type does not support starting, then simply wait. */
+        } else if (IN_SET(t, JOB_STOP, JOB_RESTART)) {
+                t = JOB_STOP; /* From here on a restart is handled (and logged) as a stop. */
+                r = unit_stop(u);
+                wait_only = r == -EBADR; /* If the unit type does not support stopping, then simply wait. */
+        } else if (t == JOB_RELOAD) {
+                r = unit_reload(u);
+                wait_only = false; /* A clear error is generated if reload is not supported. */
+        } else
+                assert_not_reached();
+
+        /* Log if the job still exists and the start/stop/reload function actually did something or we're
+         * only waiting for unit status change (common for device units). The latter ensures that job start
+         * messages for device units are correctly shown. Note that if the job disappears too quickly, e.g.
+         * for units for which there's no 'activating' phase (i.e. because we transition directly from
+         * 'inactive' to 'active'), we'll possibly skip the "Starting..." message. */
+        *j = manager_get_job(m, id);
+        if (*j && (wait_only || r > 0))
+                job_emit_start_message(u, id, t);
+
+        if (wait_only)
+                return 0;
+
+        return r;
+}
+
+/* Takes a job off the run queue and, if it is waiting and runnable, executes it. Depending on the outcome
+ * the job is put back into the waiting state, finished with the matching job result, or left running.
+ * Returns 0 if the job was not waiting, -EAGAIN if it is not runnable (yet), otherwise the result of the
+ * operation respectively of job_finish_and_invalidate(). */
+int job_run_and_invalidate(Job *j) {
+        int r;
+
+        assert(j);
+        assert(j->installed);
+        assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(j->in_run_queue);
+
+        prioq_remove(j->manager->run_queue, j, &j->run_queue_idx);
+        j->in_run_queue = false;
+
+        if (j->state != JOB_WAITING)
+                return 0;
+
+        if (!job_is_runnable(j))
+                return -EAGAIN;
+
+        job_start_timer(j, true);
+        job_set_state(j, JOB_RUNNING);
+        job_add_to_dbus_queue(j);
+
+        if (j->type == JOB_VERIFY_ACTIVE) {
+                UnitActiveState unit_state = unit_active_state(j->unit);
+
+                if (UNIT_IS_ACTIVE_OR_RELOADING(unit_state))
+                        r = -EALREADY;
+                else if (unit_state == UNIT_ACTIVATING)
+                        r = -EAGAIN;
+                else
+                        r = -EBADR;
+
+        } else if (IN_SET(j->type, JOB_START, JOB_STOP, JOB_RESTART, JOB_RELOAD))
+                r = job_perform_on_unit(&j); /* Might reset j to NULL if the job vanished meanwhile */
+        else if (j->type == JOB_NOP)
+                r = -EALREADY;
+        else
+                assert_not_reached();
+
+        /* The job is gone, nothing left to finish */
+        if (!j)
+                return r;
+
+        switch (r) {
+
+        case -EAGAIN:
+                /* Hmm, not ready after all, let's return to JOB_WAITING state */
+                job_set_state(j, JOB_WAITING);
+                return r;
+
+        case -EALREADY: /* already being executed */
+                return job_finish_and_invalidate(j, JOB_DONE, true, true);
+
+        case -ECOMM:
+                return job_finish_and_invalidate(j, JOB_DONE, true, false);
+
+        case -EBADR:
+                return job_finish_and_invalidate(j, JOB_SKIPPED, true, false);
+
+        case -ENOEXEC:
+                return job_finish_and_invalidate(j, JOB_INVALID, true, false);
+
+        case -EPROTO:
+                return job_finish_and_invalidate(j, JOB_ASSERT, true, false);
+
+        case -EOPNOTSUPP:
+                return job_finish_and_invalidate(j, JOB_UNSUPPORTED, true, false);
+
+        case -ENOLINK:
+                return job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
+
+        case -ESTALE:
+                return job_finish_and_invalidate(j, JOB_ONCE, true, false);
+
+        default:
+                /* Any other error fails the job, non-negative results are passed through as-is */
+                if (r < 0)
+                        return job_finish_and_invalidate(j, JOB_FAILED, true, false);
+
+                return r;
+        }
+}
+
+/* Finishes with JOB_DEPENDENCY every start or verify-active job that is installed on a unit reachable
+ * from 'u' through a dependency matching 'match_atom'. */
+static void job_fail_dependencies(Unit *u, UnitDependencyAtom match_atom) {
+        Unit *dependent;
+
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(dependent, u, match_atom) {
+                Job *pending = dependent->job;
+
+                if (pending && IN_SET(pending->type, JOB_START, JOB_VERIFY_ACTIVE))
+                        job_finish_and_invalidate(pending, JOB_DEPENDENCY, true, false);
+        }
+}
+
+int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
+        Unit *u, *other;
+        JobType t;
+        /* Completes an installed job with the given result. The outcome is logged unless 'already' is set.
+         * A restart job that finished with JOB_DONE is turned into a waiting start job and queued again;
+         * any other job is uninstalled and freed, so 'j' must be considered gone afterwards. If 'recursive'
+         * is set, start/verify-active jobs of units reached through the matching failure propagation
+         * dependency are finished with JOB_DEPENDENCY. Always returns 0. */
+        assert(j);
+        assert(j->installed);
+        assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+
+        u = j->unit; /* Remember unit and type now: j is freed further down, but both are still needed. */
+        t = j->type;
+
+        j->result = result;
+
+        log_unit_debug(u, "Job %" PRIu32 " %s/%s finished, result=%s",
+                       j->id, u->id, job_type_to_string(t), job_result_to_string(result));
+
+        /* If this job did nothing to the respective unit we don't log the status message */
+        if (!already)
+                job_emit_done_message(u, j->id, t, result);
+
+        /* Patch restart jobs so that they become normal start jobs */
+        if (result == JOB_DONE && t == JOB_RESTART) {
+
+                job_change_type(j, JOB_START);
+                job_set_state(j, JOB_WAITING);
+
+                job_add_to_dbus_queue(j);
+                job_add_to_run_queue(j);
+                job_add_to_gc_queue(j);
+
+                goto finish; /* The job lives on as a start job, hence skip the teardown below. */
+        }
+
+        if (IN_SET(result, JOB_FAILED, JOB_INVALID))
+                j->manager->n_failed_jobs++;
+
+        job_uninstall(j);
+        job_free(j); /* j must not be dereferenced from here on; only u, t and result are used below. */
+
+        /* Fail depending jobs on failure */
+        if (result != JOB_DONE && recursive) {
+                if (IN_SET(t, JOB_START, JOB_VERIFY_ACTIVE))
+                        job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_START_FAILURE);
+                else if (t == JOB_STOP)
+                        job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_STOP_FAILURE);
+        }
+
+        /* A special check to make sure we take down anything RequisiteOf= if we aren't active. This is when
+         * the verify-active job merges with a satisfying job type, and then loses its invalidation effect,
+         * as the result there is JOB_DONE for the start job we merged into, while we should be failing the
+         * depending job if the said unit isn't in fact active. Oneshots are an example of this, where going
+         * directly from activating to inactive is success.
+         *
+         * This happens when you use ConditionXYZ= in a unit too, since in that case the job completes with
+         * the JOB_DONE result, but the unit never really becomes active. Note that such a case still
+         * involves merging:
+         *
+         * A start job waits for something else, and a verify-active comes in and merges in the installed
+         * job. Then, later, when it becomes runnable, it finishes with JOB_DONE result as execution on
+         * conditions not being met is skipped, breaking our dependency semantics.
+         *
+         * Also, depending on if start job waits or not, the merging may or may not happen (the verify-active
+         * job may trigger after it finishes), so you get non-deterministic results without this check.
+         */
+        if (result == JOB_DONE && recursive &&
+            IN_SET(t, JOB_START, JOB_RELOAD) &&
+            !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+                job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE);
+
+        /* Trigger OnFailure= dependencies that are not generated by the unit itself. We don't treat
+         * JOB_CANCELED as failure in this context. And JOB_FAILED is already handled by the unit itself. */
+        if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) {
+                log_unit_struct(u, LOG_NOTICE,
+                                "JOB_TYPE=%s", job_type_to_string(t),
+                                "JOB_RESULT=%s", job_result_to_string(result),
+                                LOG_UNIT_MESSAGE(u, "Job %s/%s failed with result '%s'.",
+                                                 u->id,
+                                                 job_type_to_string(t),
+                                                 job_result_to_string(result)));
+
+                unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode);
+        }
+
+        unit_trigger_notify(u);
+
+finish:
+        /* Try to start the next jobs that can be started: re-queue the jobs of all units ordered after or
+         * before this one, in either direction. */
+        UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_AFTER)
+                if (other->job) {
+                        job_add_to_run_queue(other->job);
+                        job_add_to_gc_queue(other->job);
+                }
+        UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_BEFORE)
+                if (other->job) {
+                        job_add_to_run_queue(other->job);
+                        job_add_to_gc_queue(other->job);
+                }
+
+        /* Ensure that when an upheld/unneeded/bound unit activation job fails we requeue it, if it is still
+         * necessary. If there are no state changes in the triggerer, it would not be retried otherwise. */
+        unit_submit_to_start_when_upheld_queue(u);
+        unit_submit_to_stop_when_bound_queue(u);
+        unit_submit_to_stop_when_unneeded_queue(u);
+
+        manager_check_finished(u->manager);
+
+        return 0;
+}
+
+/* Event source callback invoked when a job's timer elapses: logs the timeout, finishes the job with
+ * JOB_TIMEOUT and then runs the job timeout action configured on the unit. Always returns 0. */
+static int job_dispatch_timer(sd_event_source *s, uint64_t monotonic, void *userdata) {
+        Job *j = ASSERT_PTR(userdata);
+
+        assert(s == j->timer_event_source);
+
+        /* Pick up the unit before the job is finished, since finishing frees the job while the timeout
+         * action below still needs the unit. */
+        Unit *u = j->unit;
+
+        log_unit_warning(u, "Job %s/%s timed out.", u->id, job_type_to_string(j->type));
+
+        job_finish_and_invalidate(j, JOB_TIMEOUT, true, false);
+
+        emergency_action(u->manager,
+                         u->job_timeout_action,
+                         EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN,
+                         u->job_timeout_reboot_arg,
+                         -1,
+                         "job timed out");
+
+        return 0;
+}
+
+/* Arms the job's timeout timer.
+ *
+ * With job_running == false the overall job timeout (the unit's job_timeout) is set up, counted from now;
+ * an already existing timer is left untouched. With job_running == true the running timeout (the unit's
+ * job_running_timeout) is applied, counted from now; an existing timer is only adjusted if that results
+ * in an earlier deadline. Infinite timeouts install no timer. Returns 0 or a positive value on success
+ * (whatever sd_event_source_set_time() returns in the re-arm case), negative on failure. */
+int job_start_timer(Job *j, bool job_running) {
+        usec_t deadline;
+        int r;
+
+        if (!job_running) {
+                if (j->timer_event_source)
+                        return 0;
+
+                j->begin_usec = now(CLOCK_MONOTONIC);
+
+                if (j->unit->job_timeout == USEC_INFINITY)
+                        return 0;
+
+                deadline = usec_add(j->begin_usec, j->unit->job_timeout);
+        } else {
+                j->begin_running_usec = now(CLOCK_MONOTONIC);
+
+                if (j->unit->job_running_timeout == USEC_INFINITY)
+                        return 0;
+
+                deadline = usec_add(j->begin_running_usec, j->unit->job_running_timeout);
+
+                if (j->timer_event_source) {
+                        usec_t armed_deadline;
+
+                        r = sd_event_source_get_time(j->timer_event_source, &armed_deadline);
+                        if (r < 0)
+                                return r;
+
+                        /* Update only if JobRunningTimeoutSec= results in earlier timeout */
+                        if (deadline < armed_deadline)
+                                return sd_event_source_set_time(j->timer_event_source, deadline);
+
+                        return 0;
+                }
+        }
+
+        /* No timer exists yet, create one for the deadline calculated above */
+        r = sd_event_add_time(
+                        j->manager->event,
+                        &j->timer_event_source,
+                        CLOCK_MONOTONIC,
+                        deadline, 0,
+                        job_dispatch_timer, j);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(j->timer_event_source, "job-start");
+
+        return 0;
+}
+
+/* Puts an installed job into the manager's run queue, unless it is queued already, and triggers
+ * processing of the run queue. Failure to enqueue is logged and otherwise ignored. */
+void job_add_to_run_queue(Job *j) {
+        assert(j);
+        assert(j->installed);
+
+        if (j->in_run_queue)
+                return;
+
+        int r = prioq_put(j->manager->run_queue, j, &j->run_queue_idx);
+        if (r >= 0)
+                j->in_run_queue = true;
+        else
+                log_warning_errno(r, "Failed put job in run queue, ignoring: %m");
+
+        /* The run queue is triggered in either case */
+        manager_trigger_run_queue(j->manager);
+}
+
+/* Enqueues an installed job in the manager's D-Bus job queue, unless it is already enqueued there. */
+void job_add_to_dbus_queue(Job *j) {
+        assert(j);
+        assert(j->installed);
+
+        /* We don't check if anybody is subscribed here, since this job might just have been created and
+         * not yet assigned to a connection/client. */
+
+        if (!j->in_dbus_queue) {
+                LIST_PREPEND(dbus_queue, j->manager->dbus_job_queue, j);
+                j->in_dbus_queue = true;
+        }
+}
+
+/* Formats the D-Bus object path of the job from its numeric id. Returns a newly allocated string, or NULL
+ * if formatting/allocation failed. */
+char *job_dbus_path(Job *j) {
+        char *path = NULL;
+
+        assert(j);
+
+        return asprintf(&path, "/org/freedesktop/systemd1/job/%"PRIu32, j->id) >= 0 ? path : NULL;
+}
+
+/* Writes the state of the job to f as a sequence of key/value items, terminated by an empty line that
+ * serves as end marker for job_deserialize(). Failures of the individual serialization helpers are
+ * ignored. Always returns 0. */
+int job_serialize(Job *j, FILE *f) {
+        assert(j);
+        assert(f);
+
+        /* The job id is a 32-bit unsigned value, hence format it with PRIu32 like everywhere else. */
+        (void) serialize_item_format(f, "job-id", "%" PRIu32, j->id);
+        (void) serialize_item(f, "job-type", job_type_to_string(j->type));
+        (void) serialize_item(f, "job-state", job_state_to_string(j->state));
+        (void) serialize_bool(f, "job-irreversible", j->irreversible);
+        (void) serialize_bool(f, "job-sent-dbus-new-signal", j->sent_dbus_new_signal);
+        (void) serialize_bool(f, "job-ignore-order", j->ignore_order);
+
+        /* Timestamps are only written out if they have been set */
+        if (j->begin_usec > 0)
+                (void) serialize_usec(f, "job-begin", j->begin_usec);
+        if (j->begin_running_usec > 0)
+                (void) serialize_usec(f, "job-begin-running", j->begin_running_usec);
+
+        bus_track_serialize(j->bus_track, f, "subscribed");
+
+        activation_details_serialize(j->activation_details, f);
+
+        /* End marker */
+        fputc('\n', f);
+        return 0;
+}
+
+int job_deserialize(Job *j, FILE *f) {
+        int r;
+        /* Reads "key=value" lines from f until EOF or the end marker and applies the recognized keys to j.
+         * Values that fail to parse and unknown keys are merely logged at debug level and skipped; only a
+         * failure to read a line or to extend the list of subscribed clients makes this return early with
+         * a negative value. Returns 0 otherwise. */
+        assert(j);
+        assert(f);
+
+        for (;;) {
+                _cleanup_free_ char *l = NULL;
+                size_t k;
+                char *v;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                k = strcspn(l, "="); /* Offset of the first '=', or of the trailing NUL if there is none */
+
+                if (l[k] == '=') {
+                        l[k] = 0; /* Split in place: l is now the key, v points to the value behind it */
+                        v = l+k+1;
+                } else
+                        v = l+k; /* No '=' in the line: the value is the empty string */
+
+                if (streq(l, "job-id")) {
+
+                        if (safe_atou32(v, &j->id) < 0)
+                                log_debug("Failed to parse job id value: %s", v);
+
+                } else if (streq(l, "job-type")) {
+                        JobType t;
+
+                        t = job_type_from_string(v);
+                        if (t < 0)
+                                log_debug("Failed to parse job type: %s", v);
+                        else if (t >= _JOB_TYPE_MAX_IN_TRANSACTION) /* Known type name, but not accepted here */
+                                log_debug("Cannot deserialize job of type: %s", v);
+                        else
+                                j->type = t;
+
+                } else if (streq(l, "job-state")) {
+                        JobState s;
+
+                        s = job_state_from_string(v);
+                        if (s < 0)
+                                log_debug("Failed to parse job state: %s", v);
+                        else
+                                job_set_state(j, s);
+
+                } else if (streq(l, "job-irreversible")) {
+                        int b;
+
+                        b = parse_boolean(v);
+                        if (b < 0)
+                                log_debug("Failed to parse job irreversible flag: %s", v);
+                        else
+                                j->irreversible = j->irreversible || b; /* Only ever turned on, never cleared */
+
+                } else if (streq(l, "job-sent-dbus-new-signal")) {
+                        int b;
+
+                        b = parse_boolean(v);
+                        if (b < 0)
+                                log_debug("Failed to parse job sent_dbus_new_signal flag: %s", v);
+                        else
+                                j->sent_dbus_new_signal = j->sent_dbus_new_signal || b;
+
+                } else if (streq(l, "job-ignore-order")) {
+                        int b;
+
+                        b = parse_boolean(v);
+                        if (b < 0)
+                                log_debug("Failed to parse job ignore_order flag: %s", v);
+                        else
+                                j->ignore_order = j->ignore_order || b;
+
+                } else if (streq(l, "job-begin"))
+                        (void) deserialize_usec(v, &j->begin_usec);
+
+                else if (streq(l, "job-begin-running"))
+                        (void) deserialize_usec(v, &j->begin_running_usec);
+
+                else if (streq(l, "subscribed")) {
+                        if (strv_extend(&j->deserialized_clients, v) < 0) /* One list entry per "subscribed" line */
+                                return log_oom();
+
+                } else if (startswith(l, "activation-details")) { /* Prefix match: any key starting like this */
+                        if (activation_details_deserialize(l, v, &j->activation_details) < 0)
+                                log_debug("Failed to parse job ActivationDetails element: %s", v);
+
+                } else
+                        log_debug("Unknown job serialization key: %s", l);
+        }
+
+        return 0;
+}
+
+/* Brings a deserialized job back into operation: resumes tracking of bus subscribers, puts the job into
+ * the run and GC queues as appropriate, and re-creates the timeout timer from the deserialized begin
+ * timestamps. Returns 0 if no timer is needed, otherwise the result of sd_event_add_time(), which is
+ * negative on failure. */
+int job_coldplug(Job *j) {
+        int r;
+        usec_t timeout_time = USEC_INFINITY;
+
+        assert(j);
+
+        /* After deserialization is complete and the bus connection
+         * set up again, let's start watching our subscribers again */
+        (void) bus_job_coldplug_bus_track(j);
+
+        if (j->state == JOB_WAITING)
+                job_add_to_run_queue(j);
+
+        /* Maybe due to new dependencies we don't actually need this job anymore? */
+        job_add_to_gc_queue(j);
+
+        /* Create timer only when job began or began running and the respective timeout is finite.
+         * Follow logic of job_start_timer() if both timeouts are finite */
+        if (j->begin_usec == 0)
+                return 0;
+
+        if (j->unit->job_timeout != USEC_INFINITY)
+                timeout_time = usec_add(j->begin_usec, j->unit->job_timeout);
+
+        /* If the job was already running, the earlier of the two deadlines wins */
+        if (timestamp_is_set(j->begin_running_usec))
+                timeout_time = MIN(timeout_time, usec_add(j->begin_running_usec, j->unit->job_running_timeout));
+
+        if (timeout_time == USEC_INFINITY)
+                return 0;
+
+        j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source);
+
+        r = sd_event_add_time(
+                        j->manager->event,
+                        &j->timer_event_source,
+                        CLOCK_MONOTONIC,
+                        timeout_time, 0,
+                        job_dispatch_timer, j);
+        if (r < 0) {
+                /* No event source was created, hence there is nothing to attach a description to. Return
+                 * right away instead of passing a NULL source on, like job_start_timer() does. */
+                log_debug_errno(r, "Failed to restart timeout for job: %m");
+                return r;
+        }
+
+        (void) sd_event_source_set_description(j->timer_event_source, "job-timeout");
+
+        return r;
+}
+
+/* Special handling for the start job of the shutdown target in the system manager: console output is
+ * re-enabled, the set of startup units is invalidated and, unless we run in a container, an asynchronous
+ * sync() is kicked off. For any other job this is a NOP. */
+void job_shutdown_magic(Job *j) {
+        assert(j);
+
+        /* The shutdown target gets some special treatment here: we tell the kernel to begin with flushing
+         * its disk caches, to optimize shutdown time a bit. Ideally we wouldn't hardcode this magic into
+         * PID 1. However all other processes aren't options either since they'd exit much sooner than
+         * PID 1 and asynchronous sync() would cause their exit to be delayed. */
+
+        Unit *u = j->unit;
+
+        if (j->type != JOB_START ||
+            !MANAGER_IS_SYSTEM(u->manager) ||
+            !unit_has_name(u, SPECIAL_SHUTDOWN_TARGET))
+                return;
+
+        /* In case messages on console has been disabled on boot */
+        u->manager->no_console_output = false;
+
+        manager_invalidate_startup_units(u->manager);
+
+        if (detect_container() <= 0)
+                (void) asynchronous_sync(NULL);
+}
+
+/* Determines the earliest pending timeout of the job: the minimum of the time the job's own timer is set
+ * to (if there is one) and the timeout reported by the unit type's get_timeout() hook (if implemented).
+ * Returns 1 and stores the time in *ret if there is a timeout, 0 (with *ret set to 0) if there is none,
+ * and a negative value if either lookup fails. */
+int job_get_timeout(Job *j, usec_t *ret) {
+        Unit *u = ASSERT_PTR(ASSERT_PTR(j)->unit);
+        usec_t job_deadline = USEC_INFINITY, unit_deadline = USEC_INFINITY;
+        int r;
+
+        assert(ret);
+
+        if (j->timer_event_source) {
+                r = sd_event_source_get_time(j->timer_event_source, &job_deadline);
+                if (r < 0)
+                        return r;
+        }
+
+        if (UNIT_VTABLE(u)->get_timeout) {
+                r = UNIT_VTABLE(u)->get_timeout(u, &unit_deadline);
+                if (r < 0)
+                        return r;
+        }
+
+        if (job_deadline != USEC_INFINITY || unit_deadline != USEC_INFINITY) {
+                *ret = MIN(job_deadline, unit_deadline);
+                return 1;
+        }
+
+        /* Neither source knows about a timeout */
+        *ret = 0;
+        return 0;
+}
+
+/* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on
+ * their own and just track external state. For now the only unit type that qualifies for this are .device
+ * units. Returns true if the job can be collected. Note that this may reset the job's ref_by_private_bus
+ * flag as a side effect. */
+bool job_may_gc(Job *j) {
+        Unit *other;
+
+        assert(j);
+
+        /* Only unit types that opted in qualify. Also, make sure to send out pending D-Bus events before
+         * we unload the unit, and keep the job as long as somebody still tracks it on the bus. */
+        if (!UNIT_VTABLE(j->unit)->gc_jobs ||
+            j->in_dbus_queue ||
+            sd_bus_track_count(j->bus_track) > 0)
+                return false;
+
+        /* FIXME: So this is a bit ugly: for now we don't properly track references made via private bus
+         * connections (because it's nasty, as sd_bus_track doesn't apply to it). We simply remember that
+         * the job was once referenced by one, and reset this whenever we notice that no private bus
+         * connections are around. This means the GC is a bit too conservative when it comes to jobs created
+         * by private bus connections. */
+        if (j->ref_by_private_bus) {
+                if (!set_isempty(j->unit->manager->private_buses))
+                        return false;
+
+                j->ref_by_private_bus = false;
+        }
+
+        if (j->type == JOB_NOP)
+                return false;
+
+        /* The logic is inverse to job_is_runnable, we cannot GC as long as we block any job. */
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) {
+                if (!other->job)
+                        continue;
+
+                if (job_compare(j, other->job, UNIT_ATOM_BEFORE) < 0)
+                        return false;
+        }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) {
+                if (!other->job)
+                        continue;
+
+                if (job_compare(j, other->job, UNIT_ATOM_AFTER) < 0)
+                        return false;
+        }
+
+        return true;
+}
+
+/* Adds the job to the manager's job GC queue, provided it is not in there yet and job_may_gc() permits
+ * collecting it. */
+void job_add_to_gc_queue(Job *j) {
+        assert(j);
+
+        if (j->in_gc_queue || !job_may_gc(j))
+                return;
+
+        LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j);
+        j->in_gc_queue = true;
+}
+
+/* Comparison callback for sorting arrays of job pointers by numeric job id. */
+static int job_compare_id(Job * const *a, Job * const *b) {
+        const Job *x = *a, *y = *b;
+
+        return CMP(x->id, y->id);
+}
+
+/* Sorts the array of n job pointers by job id and drops repeated entries in place. Returns the number of
+ * entries that remain at the front of the array. */
+static size_t sort_job_list(Job **list, size_t n) {
+        Job *last_kept = NULL;
+        size_t n_kept = 0;
+
+        /* Order by numeric IDs */
+        typesafe_qsort(list, n, job_compare_id);
+
+        /* Filter out duplicates: an entry is only kept if it differs from the one kept right before it */
+        for (size_t i = 0; i < n; i++)
+                if (list[i] != last_kept) {
+                        last_kept = list[i];
+                        list[n_kept++] = last_kept;
+                }
+
+        return n_kept;
+}
+
+/* Returns a list of all pending jobs that need to finish before this job may be started. On success the
+ * number of entries is returned and *ret is set to a newly allocated array, sorted by job id and free of
+ * duplicates (NULL if the job ignores ordering). Returns -ENOMEM on allocation failure. */
+int job_get_before(Job *j, Job*** ret) {
+        _cleanup_free_ Job** blockers = NULL;
+        size_t n_blockers = 0;
+        Unit *other = NULL;
+
+        assert(j);
+        assert(ret);
+
+        /* A job that ignores ordering does not wait for anything */
+        if (j->ignore_order) {
+                *ret = NULL;
+                return 0;
+        }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) > 0) {
+                        if (!GREEDY_REALLOC(blockers, n_blockers+1))
+                                return -ENOMEM;
+
+                        blockers[n_blockers++] = other->job;
+                }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) > 0) {
+                        if (!GREEDY_REALLOC(blockers, n_blockers+1))
+                                return -ENOMEM;
+
+                        blockers[n_blockers++] = other->job;
+                }
+
+        n_blockers = sort_job_list(blockers, n_blockers);
+
+        *ret = TAKE_PTR(blockers);
+
+        return (int) n_blockers;
+}
+
+/* Returns a list of all pending jobs that are waiting for this job to finish. Jobs that ignore ordering
+ * are left out. On success the number of entries is returned and *ret is set to a newly allocated array,
+ * sorted by job id and free of duplicates. Returns -ENOMEM on allocation failure. */
+int job_get_after(Job *j, Job*** ret) {
+        _cleanup_free_ Job** waiters = NULL;
+        size_t n_waiters = 0;
+        Unit *other = NULL;
+
+        assert(j);
+        assert(ret);
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+                if (other->job &&
+                    !other->job->ignore_order &&
+                    job_compare(j, other->job, UNIT_ATOM_BEFORE) < 0) {
+
+                        if (!GREEDY_REALLOC(waiters, n_waiters+1))
+                                return -ENOMEM;
+
+                        waiters[n_waiters++] = other->job;
+                }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+                if (other->job &&
+                    !other->job->ignore_order &&
+                    job_compare(j, other->job, UNIT_ATOM_AFTER) < 0) {
+
+                        if (!GREEDY_REALLOC(waiters, n_waiters+1))
+                                return -ENOMEM;
+
+                        waiters[n_waiters++] = other->job;
+                }
+
+        n_waiters = sort_job_list(waiters, n_waiters);
+
+        *ret = TAKE_PTR(waiters);
+
+        return (int) n_waiters;
+}
+
+static const char* const job_state_table[_JOB_STATE_MAX] = { /* JobState value → textual name */
+        [JOB_WAITING] = "waiting",
+        [JOB_RUNNING] = "running",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_state, JobState); /* presumably defines job_state_to_string()/job_state_from_string() as declared in job.h */
+
+static const char* const job_type_table[_JOB_TYPE_MAX] = { /* JobType value → textual name; entries are not in enum order */
+        [JOB_START]           = "start",
+        [JOB_VERIFY_ACTIVE]   = "verify-active",
+        [JOB_STOP]            = "stop",
+        [JOB_RELOAD]          = "reload",
+        [JOB_RELOAD_OR_START] = "reload-or-start",
+        [JOB_RESTART]         = "restart",
+        [JOB_TRY_RESTART]     = "try-restart",
+        [JOB_TRY_RELOAD]      = "try-reload",
+        [JOB_NOP]             = "nop",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_type, JobType); /* presumably defines job_type_to_string()/job_type_from_string() as declared in job.h */
+
+static const char* const job_mode_table[_JOB_MODE_MAX] = { /* JobMode value → textual name */
+        [JOB_FAIL]                 = "fail",
+        [JOB_REPLACE]              = "replace",
+        [JOB_REPLACE_IRREVERSIBLY] = "replace-irreversibly",
+        [JOB_ISOLATE]              = "isolate",
+        [JOB_FLUSH]                = "flush",
+        [JOB_IGNORE_DEPENDENCIES]  = "ignore-dependencies",
+        [JOB_IGNORE_REQUIREMENTS]  = "ignore-requirements",
+        [JOB_TRIGGERING]           = "triggering",
+        [JOB_RESTART_DEPENDENCIES] = "restart-dependencies",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode); /* presumably defines job_mode_to_string()/job_mode_from_string() as declared in job.h */
+
+static const char* const job_result_table[_JOB_RESULT_MAX] = { /* JobResult value → textual name */
+        [JOB_DONE]        = "done",
+        [JOB_CANCELED]    = "canceled",
+        [JOB_TIMEOUT]     = "timeout",
+        [JOB_FAILED]      = "failed",
+        [JOB_DEPENDENCY]  = "dependency",
+        [JOB_SKIPPED]     = "skipped",
+        [JOB_INVALID]     = "invalid",
+        [JOB_ASSERT]      = "assert",
+        [JOB_UNSUPPORTED] = "unsupported",
+        [JOB_COLLECTED]   = "collected",
+        [JOB_ONCE]        = "once",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult); /* presumably defines job_result_to_string()/job_result_from_string() as declared in job.h */
+
+const char* job_type_to_access_method(JobType t) {
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+
+        /* Yields "stop" for JOB_STOP, "start" for the start/restart flavours, "reload" for all the rest. */
+        if (t == JOB_STOP)
+                return "stop";
+        if (t == JOB_START || t == JOB_RESTART || t == JOB_TRY_RESTART)
+                return "start";
+        return "reload";
+}
+
+/*
+ * assume_dep   assumed dependency between units (a is before/after b)
+ *
+ * Returns
+ *    0         jobs are independent,
+ *   >0         a should run after b,
+ *   <0         a should run before b,
+ *
+ * The logic means that for a service a and a service b where b.After=a:
+ *
+ *  start a + start b → 1st step start a, 2nd step start b
+ *  start a + stop b  → 1st step stop b,  2nd step start a
+ *  stop a  + start b → 1st step stop a,  2nd step start b
+ *  stop a  + stop b  → 1st step stop b,  2nd step stop a
+ *
+ *  This has the side effect that restarts are properly synchronized too.
+ */
+int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep) {
+        assert(a);
+        assert(b);
+        assert(a->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(b->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(IN_SET(assume_dep, UNIT_ATOM_AFTER, UNIT_ATOM_BEFORE));
+
+        /* A JOB_NOP on either side means the two jobs are independent. */
+        if (a->type == JOB_NOP || b->type == JOB_NOP)
+                return 0;
+
+        /* The same holds if either job has ignore_order set. */
+        if (a->ignore_order || b->ignore_order)
+                return 0;
+
+        /* "a after b" is the mirror image of "b before a": swap the operands and flip the sign. */
+        if (assume_dep == UNIT_ATOM_AFTER)
+                return -job_compare(b, a, UNIT_ATOM_BEFORE);
+
+        /* Keep it simple: a JOB_STOP always goes first (if both a and b stop, b's stop goes first anyway).
+         * JOB_RESTART is a JOB_STOP in disguise (before it is patched to JOB_START). */
+        bool b_stops = b->type == JOB_STOP || b->type == JOB_RESTART;
+        return b_stops ? 1 : -1;
+}
+
+void job_set_activation_details(Job *j, ActivationDetails *info) {
+        /* First writer wins: details already stored on the job are kept, later ones are discarded. */
+        bool usable = j && info;
+
+        /* A NULL job, a NULL info or an already populated job leaves everything untouched. */
+        if (usable && !j->activation_details)
+                j->activation_details = activation_details_ref(info);
+}
diff --git a/src/core/job.h b/src/core/job.h
new file mode 100644
index 0000000..891d87a
--- /dev/null
+++ b/src/core/job.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-event.h"
+
+#include "list.h"
+#include "unit-dependency-atom.h"
+#include "unit-name.h"
+#include "unit.h"
+
+typedef struct ActivationDetails ActivationDetails;
+typedef struct Job Job;
+typedef struct JobDependency JobDependency;
+typedef enum JobType JobType;
+typedef enum JobState JobState;
+typedef enum JobMode JobMode;
+typedef enum JobResult JobResult;
+
+/* Be careful when changing the job types! Adjust job_merging_table[] accordingly! */
+enum JobType {
+        JOB_START,                  /* if a unit does not support being started, we'll just wait until it becomes active */
+        JOB_VERIFY_ACTIVE,          /* NOTE(review): presumably only verifies that the unit is active; a negative result is JOB_SKIPPED */
+
+        JOB_STOP,
+
+        JOB_RELOAD,                 /* if running, reload */
+
+        /* Note that restarts are first treated like JOB_STOP, but
+         * then instead of finishing are patched to become
+         * JOB_START. */
+        JOB_RESTART,                /* If running, stop. Then start unconditionally. */
+
+        _JOB_TYPE_MAX_MERGING,      /* end marker of the types listed above; JOB_NOP reuses this value */
+
+        /* JOB_NOP can enter into a transaction, but as it won't pull in
+         * any dependencies and it uses the special 'nop_job' slot in Unit,
+         * it won't have to merge with anything (except possibly into another
+         * JOB_NOP, previously installed). JOB_NOP is special-cased in
+         * job_type_is_*() functions so that the transaction can be
+         * activated. */
+        JOB_NOP = _JOB_TYPE_MAX_MERGING, /* do nothing */
+
+        _JOB_TYPE_MAX_IN_TRANSACTION, /* job_compare() asserts that both job types are below this value */
+
+        /* JOB_TRY_RESTART can never appear in a transaction, because
+         * it always collapses into JOB_RESTART or JOB_NOP before entering.
+         * Thus we never need to merge it with anything. */
+        JOB_TRY_RESTART = _JOB_TYPE_MAX_IN_TRANSACTION, /* if running, stop and then start */
+
+        /* Similar to JOB_TRY_RESTART but collapses to JOB_RELOAD or JOB_NOP */
+        JOB_TRY_RELOAD,
+
+        /* JOB_RELOAD_OR_START won't enter into a transaction and cannot result
+         * from transaction merging (there's no way for JOB_RELOAD and
+         * JOB_START to meet in one transaction). It can result from a merge
+         * during job installation, but then it will immediately collapse into
+         * one of the two simpler types. */
+        JOB_RELOAD_OR_START,        /* if running, reload, otherwise start */
+
+        _JOB_TYPE_MAX,              /* number of job types; dimension of job_type_table[] */
+        _JOB_TYPE_INVALID = -EINVAL,
+};
+
+enum JobState {
+        JOB_WAITING,                /* NOTE(review): presumably queued but not yet being executed — confirm */
+        JOB_RUNNING,                /* NOTE(review): presumably currently being executed — confirm */
+        _JOB_STATE_MAX,             /* number of states; dimension of job_state_table[] */
+        _JOB_STATE_INVALID = -EINVAL,
+};
+
+enum JobMode { /* how a job being queued relates to other queued jobs and to dependencies */
+        JOB_FAIL,                /* Fail if a conflicting job is already queued */
+        JOB_REPLACE,             /* Replace an existing conflicting job */
+        JOB_REPLACE_IRREVERSIBLY,/* Like JOB_REPLACE + produce irreversible jobs */
+        JOB_ISOLATE,             /* Start a unit, and stop all others */
+        JOB_FLUSH,               /* Flush out all other queued jobs when queueing this one */
+        JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */
+        JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */
+        JOB_TRIGGERING,          /* Adds TRIGGERED_BY dependencies to the same transaction */
+        JOB_RESTART_DEPENDENCIES,/* A "start" job for the specified unit becomes "restart" for depending units */
+        _JOB_MODE_MAX,           /* number of modes; dimension of job_mode_table[] */
+        _JOB_MODE_INVALID = -EINVAL,
+};
+
+enum JobResult { /* outcome recorded for a job, see Job.result */
+        JOB_DONE,                /* Job completed successfully (or skipped due to an unmet ConditionXYZ=) */
+        JOB_CANCELED,            /* Job canceled by a conflicting job installation or by explicit cancel request */
+        JOB_TIMEOUT,             /* Job timeout elapsed */
+        JOB_FAILED,              /* Job failed */
+        JOB_DEPENDENCY,          /* A required dependency job did not result in JOB_DONE */
+        JOB_SKIPPED,             /* Negative result of JOB_VERIFY_ACTIVE or skip due to ExecCondition= */
+        JOB_INVALID,             /* JOB_RELOAD of inactive unit */
+        JOB_ASSERT,              /* Couldn't start a unit, because an assert didn't hold */
+        JOB_UNSUPPORTED,         /* Couldn't start a unit, because the unit type is not supported on the system */
+        JOB_COLLECTED,           /* Job was garbage collected, since nothing needed it anymore */
+        JOB_ONCE,                /* Unit was started before, and hence can't be started again */
+        _JOB_RESULT_MAX,         /* number of results; dimension of job_result_table[] */
+        _JOB_RESULT_INVALID = -EINVAL,
+};
+
+struct JobDependency {
+        /* Encodes that the 'subject' job needs the 'object' job in
+         * some way. This structure is used only while building a transaction. */
+        Job *subject;
+        Job *object;
+
+        LIST_FIELDS(JobDependency, subject); /* NOTE(review): presumably linkage for the subject's Job.subject_list — confirm */
+        LIST_FIELDS(JobDependency, object);  /* NOTE(review): presumably linkage for the object's Job.object_list — confirm */
+
+        bool matters:1;   /* NOTE(review): presumably the 'matters' argument of job_dependency_new() — confirm */
+        bool conflicts:1; /* NOTE(review): presumably the 'conflicts' argument of job_dependency_new() — confirm */
+};
+
+struct Job {
+        Manager *manager;
+        Unit *unit; /* job_get_before()/job_get_after() walk this unit's ordering dependencies */
+
+        LIST_FIELDS(Job, transaction);
+        LIST_FIELDS(Job, dbus_queue);
+        LIST_FIELDS(Job, gc_queue);
+
+        LIST_HEAD(JobDependency, subject_list); /* NOTE(review): presumably the dependencies in which this job is the subject */
+        LIST_HEAD(JobDependency, object_list);  /* NOTE(review): presumably the dependencies in which this job is the object */
+
+        /* Used for graph algs as a "I have been here" marker */
+        Job* marker;
+        unsigned generation;
+
+        uint32_t id; /* NOTE(review): presumably the numeric ID that sort_job_list() orders by */
+
+        JobType type; /* job_compare() asserts this is below _JOB_TYPE_MAX_IN_TRANSACTION */
+        JobState state;
+
+        sd_event_source *timer_event_source;
+        usec_t begin_usec;
+        usec_t begin_running_usec;
+
+        /*
+         * This tracks where to send signals, and also which clients
+         * are allowed to call DBus methods on the job (other than
+         * root).
+         *
+         * There can be more than one client, because of job merging.
+         */
+        sd_bus_track *bus_track;
+        char **deserialized_clients;
+
+        JobResult result;
+
+        unsigned run_queue_idx;
+
+        /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */
+        ActivationDetails *activation_details; /* only set while still NULL, see job_set_activation_details() */
+
+        bool installed:1;
+        bool in_run_queue:1;
+        bool matters_to_anchor:1;
+        bool in_dbus_queue:1;
+        bool sent_dbus_new_signal:1;
+        bool ignore_order:1; /* job_compare() reports such jobs as independent; job_get_before()/job_get_after() skip them */
+        bool irreversible:1; /* NOTE(review): presumably set for jobs queued with JOB_REPLACE_IRREVERSIBLY — confirm */
+        bool in_gc_queue:1;
+        bool ref_by_private_bus:1;
+};
+
+Job* job_new(Unit *unit, JobType type);
+Job* job_new_raw(Unit *unit);
+void job_unlink(Job *job);
+Job* job_free(Job *job);
+Job* job_install(Job *j, bool refuse_late_merge);
+int job_install_deserialized(Job *j);
+void job_uninstall(Job *j);
+void job_dump(Job *j, FILE *f, const char *prefix);
+int job_serialize(Job *j, FILE *f);
+int job_deserialize(Job *j, FILE *f);
+int job_coldplug(Job *j);
+
+JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts);
+void job_dependency_free(JobDependency *l);
+
+int job_merge(Job *j, Job *other);
+
+JobType job_type_lookup_merge(JobType a, JobType b) _pure_;
+
+_pure_ static inline bool job_type_is_mergeable(JobType a, JobType b) { /* true if job_type_lookup_merge() yields a non-negative type */
+        return job_type_lookup_merge(a, b) >= 0;
+}
+
+_pure_ static inline bool job_type_is_conflicting(JobType a, JobType b) { /* JOB_NOP conflicts with nothing; otherwise types conflict if not mergeable */
+        return a != JOB_NOP && b != JOB_NOP && !job_type_is_mergeable(a, b);
+}
+
+_pure_ static inline bool job_type_is_superset(JobType a, JobType b) {
+        /* Is operation a a "superset" of b in its actions? Every type covers JOB_NOP. */
+        if (b == JOB_NOP)
+                return true;
+
+        /* JOB_NOP covers nothing else; otherwise a must be the result of merging a with b. */
+        return a != JOB_NOP && job_type_lookup_merge(a, b) == a;
+}
+
+bool job_type_is_redundant(JobType a, UnitActiveState b) _pure_;
+
+/* Collapses a state-dependent job type into a simpler type by observing
+ * the state of the unit which it is going to be applied to. */
+JobType job_type_collapse(JobType t, Unit *u);
+
+int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u);
+
+void job_add_to_run_queue(Job *j);
+void job_add_to_dbus_queue(Job *j);
+
+int job_start_timer(Job *j, bool job_running);
+
+int job_run_and_invalidate(Job *j);
+int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already);
+
+char *job_dbus_path(Job *j);
+
+void job_shutdown_magic(Job *j);
+
+int job_get_timeout(Job *j, usec_t *ret);
+
+bool job_may_gc(Job *j);
+void job_add_to_gc_queue(Job *j);
+
+int job_get_before(Job *j, Job*** ret);
+int job_get_after(Job *j, Job*** ret);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Job*, job_free);
+
+const char* job_type_to_string(JobType t) _const_;
+JobType job_type_from_string(const char *s) _pure_;
+
+const char* job_state_to_string(JobState t) _const_;
+JobState job_state_from_string(const char *s) _pure_;
+
+const char* job_mode_to_string(JobMode t) _const_;
+JobMode job_mode_from_string(const char *s) _pure_;
+
+const char* job_result_to_string(JobResult t) _const_;
+JobResult job_result_from_string(const char *s) _pure_;
+
+const char* job_type_to_access_method(JobType t);
+
+int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep);
+
+void job_set_activation_details(Job *j, ActivationDetails *info);
diff --git a/src/core/kill.c b/src/core/kill.c
new file mode 100644
index 0000000..c8b581d
--- /dev/null
+++ b/src/core/kill.c
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "kill.h"
+#include "signal-util.h"
+#include "string-table.h"
+
+void kill_context_init(KillContext *c) {
+        assert(c);
+
+        /* Defaults for the two flags: send_sigkill on, send_sighup off. */
+        c->send_sighup = false;
+        c->send_sigkill = true;
+        c->watchdog_signal = SIGABRT;
+        c->final_kill_signal = SIGKILL;
+        c->kill_signal = SIGTERM; /* restart_kill_signal is not assigned here; unset, it falls back to this */
+}
+
+void kill_context_dump(KillContext *c, FILE *f, const char *prefix) { /* writes c's settings to f, one "<prefix>Key: value" line each */
+        assert(c);
+
+        prefix = strempty(prefix); /* NOTE(review): presumably maps a NULL prefix to "" — confirm */
+
+        fprintf(f,
+                "%sKillMode: %s\n"
+                "%sKillSignal: SIG%s\n"
+                "%sRestartKillSignal: SIG%s\n"
+                "%sFinalKillSignal: SIG%s\n"
+                "%sSendSIGKILL: %s\n"
+                "%sSendSIGHUP: %s\n",
+                prefix, kill_mode_to_string(c->kill_mode),
+                prefix, signal_to_string(c->kill_signal),
+                prefix, signal_to_string(restart_kill_signal(c)), /* effective value: kill_signal while restart_kill_signal is 0 */
+                prefix, signal_to_string(c->final_kill_signal),
+                prefix, yes_no(c->send_sigkill),
+                prefix, yes_no(c->send_sighup)); /* note: watchdog_signal is not part of the dump */
+}
+
+static const char* const kill_mode_table[_KILL_MODE_MAX] = { /* KillMode value → textual name */
+        [KILL_CONTROL_GROUP] = "control-group",
+        [KILL_PROCESS]       = "process",
+        [KILL_MIXED]         = "mixed",
+        [KILL_NONE]          = "none",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(kill_mode, KillMode); /* presumably defines kill_mode_to_string()/kill_mode_from_string() as declared in kill.h */
+
+static const char* const kill_who_table[_KILL_WHO_MAX] = { /* KillWho value → textual name */
+        [KILL_MAIN]         = "main",
+        [KILL_CONTROL]      = "control",
+        [KILL_ALL]          = "all",
+        [KILL_MAIN_FAIL]    = "main-fail",
+        [KILL_CONTROL_FAIL] = "control-fail",
+        [KILL_ALL_FAIL]     = "all-fail",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho); /* presumably defines kill_who_to_string()/kill_who_from_string() as declared in kill.h */
diff --git a/src/core/kill.h b/src/core/kill.h
new file mode 100644
index 0000000..dbf884d
--- /dev/null
+++ b/src/core/kill.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct KillContext KillContext;
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "macro.h"
+
+typedef enum KillMode {
+        /* The kill mode is a property of a unit. */
+        KILL_CONTROL_GROUP = 0,  /* "control-group" */
+        KILL_PROCESS,            /* "process" */
+        KILL_MIXED,              /* "mixed" */
+        KILL_NONE,               /* "none" */
+        _KILL_MODE_MAX,          /* number of modes; dimension of kill_mode_table[] */
+        _KILL_MODE_INVALID = -EINVAL,
+} KillMode;
+
+struct KillContext {
+        KillMode kill_mode;      /* not assigned by kill_context_init() */
+        int kill_signal;         /* kill_context_init() default: SIGTERM */
+        int restart_kill_signal; /* 0 means unset: restart_kill_signal() then returns kill_signal */
+        int final_kill_signal;   /* kill_context_init() default: SIGKILL */
+        int watchdog_signal;     /* kill_context_init() default: SIGABRT */
+        bool send_sigkill;       /* kill_context_init() default: true */
+        bool send_sighup;        /* kill_context_init() default: false */
+};
+
+typedef enum KillWho {
+        /* Kill who is a property of an operation */
+        KILL_MAIN,               /* "main" */
+        KILL_CONTROL,            /* "control" */
+        KILL_ALL,                /* "all" */
+        KILL_MAIN_FAIL,          /* "main-fail" */
+        KILL_CONTROL_FAIL,       /* "control-fail" */
+        KILL_ALL_FAIL,           /* "all-fail" */
+        _KILL_WHO_MAX,           /* number of values; dimension of kill_who_table[] */
+        _KILL_WHO_INVALID = -EINVAL,
+} KillWho;
+
+void kill_context_init(KillContext *c);
+void kill_context_dump(KillContext *c, FILE *f, const char *prefix);
+
+const char *kill_mode_to_string(KillMode k) _const_;
+KillMode kill_mode_from_string(const char *s) _pure_;
+
+const char *kill_who_to_string(KillWho k) _const_;
+KillWho kill_who_from_string(const char *s) _pure_;
+
+static inline int restart_kill_signal(const KillContext *c) {
+        /* A restart_kill_signal of 0 means "unset"; kill_signal is used in that case. */
+        int sig = c->restart_kill_signal;
+
+        return sig != 0 ? sig : c->kill_signal;
+}
diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c
new file mode 100644
index 0000000..b8e3f7a
--- /dev/null
+++ b/src/core/kmod-setup.c
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-util.h"
+#include "capability-util.h"
+#include "efi-api.h"
+#include "fileio.h"
+#include "kmod-setup.h"
+#include "macro.h"
+#include "recurse-dir.h"
+#include "string-util.h"
+#include "strv.h"
+#include "virt.h"
+
+#if HAVE_KMOD
+#include "module-util.h"
+
+static void systemd_kmod_log(
+                void *data,
+                int priority,
+                const char *file, int line,
+                const char *fn,
+                const char *format,
+                va_list args) {
+
+        /* Log callback installed via kmod_set_log_fn(): library messages are always logged at LOG_DEBUG, 'data' and 'priority' are not used. */
+        DISABLE_WARNING_FORMAT_NONLITERAL; /* 'format' is supplied by the caller, hence not a literal */
+        log_internalv(LOG_DEBUG, 0, file, line, fn, format, args);
+        REENABLE_WARNING;
+}
+
+static int match_modalias_recurse_dir_cb(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+        /* recurse_dir_at() callback: returns 1 once a regular file named "modalias" starts with one of the prefixes in userdata. */
+        _cleanup_free_ char *alias = NULL;
+        char **modaliases = ASSERT_PTR(userdata);
+        int r;
+
+        if (event != RECURSE_DIR_ENTRY) /* only plain directory entries are of interest */
+                return RECURSE_DIR_CONTINUE;
+
+        if (de->d_type != DT_REG)
+                return RECURSE_DIR_CONTINUE;
+
+        if (!streq(de->d_name, "modalias"))
+                return RECURSE_DIR_CONTINUE;
+
+        r = read_one_line_file(path, &alias);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read %s, ignoring: %m", path);
+                return RECURSE_DIR_LEAVE_DIRECTORY;
+        }
+
+        if (startswith_strv(alias, modaliases))
+                return 1; /* match; has_virtio_feature() treats a positive recurse_dir_at() result as "found" */
+
+        return RECURSE_DIR_LEAVE_DIRECTORY; /* NOTE(review): presumably stops iterating the current directory — confirm */
+}
+
+static bool has_virtio_feature(const char *name, char **modaliases) { /* 'name' is only used in the debug log message */
+        int r;
+
+        /* Directory traversal might be slow, hence let's do a cheap check first if it's even worth it */
+        if (detect_vm() == VIRTUALIZATION_NONE)
+                return false;
+
+        r = recurse_dir_at(
+                        AT_FDCWD,
+                        "/sys/devices/pci0000:00",
+                        /* statx_mask= */ 0,
+                        /* n_depth_max= */ 3,
+                        RECURSE_DIR_ENSURE_TYPE,
+                        match_modalias_recurse_dir_cb,
+                        modaliases);
+        if (r < 0)
+                log_debug_errno(r, "Failed to determine whether host has %s device, ignoring: %m", name);
+
+        return r > 0; /* errors and "nothing found" both yield false */
+}
+
+static bool has_virtio_rng(void) { /* gate for loading "virtio_rng" in kmod_setup(); matches two PCI modalias prefixes */
+        return has_virtio_feature("virtio-rng", STRV_MAKE("pci:v00001AF4d00001005", "pci:v00001AF4d00001044"));
+}
+
+static bool has_virtio_console(void) { /* gate for loading "virtio_console" in kmod_setup() */
+        return has_virtio_feature("virtio-console", STRV_MAKE("virtio:d00000003v", "virtio:d0000000Bv"));
+}
+
+static bool has_virtio_vsock(void) { /* gate for loading "vmw_vsock_virtio_transport" in kmod_setup() */
+        return has_virtio_feature("virtio-vsock", STRV_MAKE("virtio:d00000013v"));
+}
+
+static bool has_virtiofs(void) { /* gate for loading "virtiofs" in kmod_setup() */
+        return has_virtio_feature("virtiofs", STRV_MAKE("virtio:d0000001Av"));
+}
+
+static bool has_virtio_pci(void) { /* gate for loading "virtio_pci" in kmod_setup(); matches any modalias starting with "pci:v00001AF4d" */
+        return has_virtio_feature("virtio-pci", STRV_MAKE("pci:v00001AF4d"));
+}
+
+static bool in_qemu(void) { /* true if detect_vm() reports KVM or QEMU; gate for loading "qemu_fw_cfg" in kmod_setup() */
+        return IN_SET(detect_vm(), VIRTUALIZATION_KVM, VIRTUALIZATION_QEMU);
+}
+#endif
+
+int kmod_setup(void) { /* best-effort loading of the modules listed below; a no-op returning 0 without HAVE_KMOD */
+#if HAVE_KMOD
+        /* Returns 0 in all cases except when kmod_new() fails, where the result of log_oom() is returned. */
+        static const struct {
+                const char *module;             /* module name passed to module_load_and_warn() */
+                const char *path;               /* if set and accessible, the entry is skipped */
+                bool warn_if_unavailable:1;     /* forwarded to module_load_and_warn() */
+                bool warn_if_module:1;          /* log a debug hint that built-in support would be preferable */
+                bool (*condition_fn)(void);     /* optional gate: the entry is skipped if this returns false */
+        } kmod_table[] = {
+                /* This one we need to load explicitly, since auto-loading on use doesn't work
+                 * before udev created the ghost device nodes, and we need it earlier than that. */
+                { "autofs4",                    "/sys/class/misc/autofs",    true,  false, NULL               },
+
+                /* This one we need to load explicitly, since auto-loading of IPv6 is not done when
+                 * we try to configure ::1 on the loopback device. */
+                { "ipv6",                       "/sys/module/ipv6",          false, true,  NULL               },
+
+                /* This should never be a module */
+                { "unix",                       "/proc/net/unix",            true,  true,  NULL               },
+
+#if HAVE_LIBIPTC
+                /* netfilter is needed by networkd, nspawn among others, and cannot be autoloaded */
+                { "ip_tables",                  "/proc/net/ip_tables_names", false, false, NULL               },
+#endif
+                /* virtio_rng would be loaded by udev later, but real entropy might be needed very early */
+                { "virtio_rng",                 NULL,                        false, false, has_virtio_rng     },
+
+                /* we want early logging to hvc consoles if possible, and make sure systemd-getty-generator
+                 * can rely on all consoles being probed already. */
+                { "virtio_console",             NULL,                        false, false, has_virtio_console },
+
+                /* Make sure we can send sd-notify messages over vsock as early as possible. */
+                { "vmw_vsock_virtio_transport", NULL,                        false, false, has_virtio_vsock   },
+
+                /* We can't wait for specific virtiofs tags to show up as device nodes so we have to load the
+                 * virtiofs and virtio_pci modules early to make sure the virtiofs tags are found when
+                 * sysroot.mount is started.
+                 *
+                 * TODO: Remove these again once https://gitlab.com/virtio-fs/virtiofsd/-/issues/128 is
+                 * resolved and the kernel fix is widely available. */
+                { "virtiofs",                   "/sys/module/virtiofs",      false, false, has_virtiofs       },
+                { "virtio_pci",                 "/sys/module/virtio_pci",    false, false, has_virtio_pci     },
+
+                /* qemu_fw_cfg would be loaded by udev later, but we want to import credentials from it super early */
+                { "qemu_fw_cfg",                "/sys/firmware/qemu_fw_cfg", false, false, in_qemu            },
+
+                /* dmi-sysfs is needed to import credentials from it super early */
+                { "dmi-sysfs",                  "/sys/firmware/dmi/entries", false, false, NULL               },
+
+#if HAVE_TPM2
+                /* Make sure the tpm subsystem is available which ConditionSecurity=tpm2 depends on. */
+                { "tpm",                        "/sys/class/tpmrm",          false, false, efi_has_tpm2       },
+#endif
+        };
+        _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL;
+        unsigned i;
+
+        if (have_effective_cap(CAP_SYS_MODULE) <= 0) /* capability missing or check failed: do nothing */
+                return 0;
+
+        for (i = 0; i < ELEMENTSOF(kmod_table); i++) {
+                if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0) /* already available */
+                        continue;
+
+                if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn())
+                        continue;
+
+                if (kmod_table[i].warn_if_module)
+                        log_debug("Your kernel apparently lacks built-in %s support. Might be "
+                                  "a good idea to compile it in. We'll now try to work around "
+                                  "this by loading the module...", kmod_table[i].module);
+
+                if (!ctx) { /* create the kmod context lazily, only once a module actually has to be loaded */
+                        ctx = kmod_new(NULL, NULL);
+                        if (!ctx)
+                                return log_oom();
+
+                        kmod_set_log_fn(ctx, systemd_kmod_log, NULL);
+                        kmod_load_resources(ctx); /* return value is not checked */
+                }
+
+                (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable); /* failure is deliberately ignored */
+        }
+
+#endif
+        return 0;
+}
diff --git a/src/core/kmod-setup.h b/src/core/kmod-setup.h
new file mode 100644
index 0000000..1c842d3
--- /dev/null
+++ b/src/core/kmod-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int kmod_setup(void);
diff --git a/src/core/load-dropin.c b/src/core/load-dropin.c
new file mode 100644
index 0000000..fd45744
--- /dev/null
+++ b/src/core/load-dropin.c
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "fs-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static int process_deps(Unit *u, UnitDependency dependency, const char *dir_suffix) {
+        _cleanup_strv_free_ char **paths = NULL;
+        int r;
+        /* Adds a 'dependency' on each usable entry found for dir_suffix; only a failed path lookup is returned as error. */
+        r = unit_file_find_dropin_paths(NULL,
+                                        u->manager->lookup_paths.search_path,
+                                        u->manager->unit_path_cache,
+                                        dir_suffix, NULL,
+                                        u->id, u->aliases,
+                                        &paths);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(p, paths) {
+                _cleanup_free_ char *target = NULL;
+                const char *entry;
+
+                entry = basename(*p); /* the file name of the entry doubles as the name of the unit to depend on */
+
+                if (null_or_empty_path(*p) > 0) {
+                        /* an error usually means an invalid symlink, which is not a mask */
+                        log_unit_debug(u, "%s dependency on %s is masked by %s, ignoring.",
+                                       unit_dependency_to_string(dependency), entry, *p);
+                        continue;
+                }
+
+                r = is_symlink(*p);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "%s dropin %s unreadable, ignoring: %m",
+                                               unit_dependency_to_string(dependency), *p);
+                        continue;
+                }
+                if (r == 0) { /* only symlinks are accepted as dependency drop-ins */
+                        log_unit_warning(u, "%s dependency dropin %s is not a symlink, ignoring.",
+                                         unit_dependency_to_string(dependency), *p);
+                        continue;
+                }
+
+                if (!unit_name_is_valid(entry, UNIT_NAME_ANY)) {
+                        log_unit_warning(u, "%s dependency dropin %s is not a valid unit name, ignoring.",
+                                         unit_dependency_to_string(dependency), *p);
+                        continue;
+                }
+
+                r = readlink_malloc(*p, &target);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "readlink(\"%s\") failed, ignoring: %m", *p);
+                        continue;
+                }
+
+                /* We don't treat this as an error, especially because we didn't check this for a
+                 * long time. Nevertheless, we warn, because such mismatch can be mighty confusing. */
+                r = unit_symlink_name_compatible(entry, basename(target), u->instance);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "Can't check if names %s and %s are compatible, ignoring: %m",
+                                               entry, basename(target));
+                        continue;
+                }
+                if (r == 0) /* warn only; the dependency is still added below */
+                        log_unit_warning(u, "%s dependency dropin %s target %s has different name",
+                                         unit_dependency_to_string(dependency), *p, target);
+
+                r = unit_add_dependency_by_name(u, dependency, entry, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0) /* logged, but neither aborts the loop nor changes the return value */
+                        log_unit_warning_errno(u, r, "Cannot add %s dependency on %s, ignoring: %m",
+                                               unit_dependency_to_string(dependency), entry);
+        }
+
+        return 0;
+}
+
+int unit_load_dropin(Unit *u) {
+        _cleanup_strv_free_ char **l = NULL;
+        int r;
+
+        assert(u);
+
+        /* Load dependencies from .wants, .requires and .upholds directories */
+        r = process_deps(u, UNIT_WANTS, ".wants");
+        if (r < 0)
+                return r;
+
+        r = process_deps(u, UNIT_REQUIRES, ".requires");
+        if (r < 0)
+                return r;
+
+        r = process_deps(u, UNIT_UPHOLDS, ".upholds");
+        if (r < 0)
+                return r;
+
+        /* Load .conf dropins */
+        r = unit_find_dropin_paths(u, &l);
+        if (r <= 0) /* a lookup error is treated like "nothing found" and is not propagated */
+                return 0;
+
+        if (!u->dropin_paths)
+                u->dropin_paths = TAKE_PTR(l);
+        else {
+                r = strv_extend_strv(&u->dropin_paths, l, true);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        u->dropin_mtime = 0; /* recomputed below as the maximum mtime over the parsed drop-ins */
+        STRV_FOREACH(f, u->dropin_paths) {
+                struct stat st;
+
+                r = config_parse(u->id, *f, NULL,
+                                 UNIT_VTABLE(u)->sections,
+                                 config_item_perf_lookup, load_fragment_gperf_lookup,
+                                 0, u, &st);
+                if (r > 0) /* st is only read on a positive result; other results are ignored */
+                        u->dropin_mtime = MAX(u->dropin_mtime, timespec_load(&st.st_mtim));
+        }
+
+        return 0;
+}
diff --git a/src/core/load-dropin.h b/src/core/load-dropin.h
new file mode 100644
index 0000000..f0b87d3
--- /dev/null
+++ b/src/core/load-dropin.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "dropin.h"
+#include "unit.h"
+
+/* Read service data supplementary drop-in directories */
+
+static inline int unit_find_dropin_paths(Unit *u, char ***paths) {
+        assert(u);
+
+        /* Delegate to the generic lookup with the manager's search path and unit path cache */
+        return unit_file_find_dropin_paths(
+                        NULL,
+                        u->manager->lookup_paths.search_path, u->manager->unit_path_cache,
+                        ".d", ".conf",
+                        u->id, u->aliases, paths);
+}
+
+int unit_load_dropin(Unit *u);
diff --git a/src/core/load-fragment-gperf-nulstr.awk b/src/core/load-fragment-gperf-nulstr.awk
new file mode 100644
index 0000000..a1b7d1c
--- /dev/null
+++ b/src/core/load-fragment-gperf-nulstr.awk
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+BEGIN{
+        keyword=0 ; FS="," ;  # flag stays 0 until a "%%" line is seen; fields are comma-separated
+        print "extern const char load_fragment_gperf_nulstr[];" ;
+        print "const char load_fragment_gperf_nulstr[] ="
+}
+keyword==1 {  # once the flag is set: emit the first field of each line as a "<field>\0" C string literal
+        print "\"" $1 "\\0\""
+}
+/%%/ {  # this rule comes after the one above, so the line that first sets the flag is not itself printed
+        keyword=1
+}
+END {
+        print ";"  # closes the array definition opened in BEGIN
+}
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
new file mode 100644
index 0000000..45f9ab0
--- /dev/null
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -0,0 +1,595 @@
+{# SPDX-License-Identifier: LGPL-2.1-or-later #}
+
+{%- macro EXEC_CONTEXT_CONFIG_ITEMS(type) -%}
+{# Exec context options, written once here and parameterized by the section/struct name passed as type #}
+{{type}}.WorkingDirectory,                 config_parse_working_directory,              0,                                  offsetof({{type}}, exec_context)
+{{type}}.RootDirectory,                    config_parse_unit_path_printf,               true,                               offsetof({{type}}, exec_context.root_directory)
+{{type}}.RootImage,                        config_parse_unit_path_printf,               true,                               offsetof({{type}}, exec_context.root_image)
+{{type}}.RootImageOptions,                 config_parse_root_image_options,             0,                                  offsetof({{type}}, exec_context)
+{{type}}.RootImagePolicy,                  config_parse_image_policy,                   0,                                  offsetof({{type}}, exec_context.root_image_policy)
+{{type}}.RootHash,                         config_parse_exec_root_hash,                 0,                                  offsetof({{type}}, exec_context)
+{{type}}.RootHashSignature,                config_parse_exec_root_hash_sig,             0,                                  offsetof({{type}}, exec_context)
+{{type}}.RootVerity,                       config_parse_unit_path_printf,               true,                               offsetof({{type}}, exec_context.root_verity)
+{{type}}.RootEphemeral,                    config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.root_ephemeral)
+{{type}}.ExtensionDirectories,             config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.extension_directories)
+{{type}}.ExtensionImages,                  config_parse_extension_images,               0,                                  offsetof({{type}}, exec_context)
+{{type}}.ExtensionImagePolicy,             config_parse_image_policy,                   0,                                  offsetof({{type}}, exec_context.extension_image_policy)
+{{type}}.MountImages,                      config_parse_mount_images,                   0,                                  offsetof({{type}}, exec_context)
+{{type}}.MountImagePolicy,                 config_parse_image_policy,                   0,                                  offsetof({{type}}, exec_context.mount_image_policy)
+{{type}}.User,                             config_parse_user_group_compat,              0,                                  offsetof({{type}}, exec_context.user)
+{{type}}.Group,                            config_parse_user_group_compat,              0,                                  offsetof({{type}}, exec_context.group)
+{{type}}.SupplementaryGroups,              config_parse_user_group_strv_compat,         0,                                  offsetof({{type}}, exec_context.supplementary_groups)
+{{type}}.SetLoginEnvironment,              config_parse_tristate,                       0,                                  offsetof({{type}}, exec_context.set_login_environment)
+{{type}}.Nice,                             config_parse_exec_nice,                      0,                                  offsetof({{type}}, exec_context)
+{{type}}.OOMScoreAdjust,                   config_parse_exec_oom_score_adjust,          0,                                  offsetof({{type}}, exec_context)
+{{type}}.CoredumpFilter,                   config_parse_exec_coredump_filter,           0,                                  offsetof({{type}}, exec_context)
+{{type}}.IOSchedulingClass,                config_parse_exec_io_class,                  0,                                  offsetof({{type}}, exec_context)
+{{type}}.IOSchedulingPriority,             config_parse_exec_io_priority,               0,                                  offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingPolicy,              config_parse_exec_cpu_sched_policy,          0,                                  offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingPriority,            config_parse_exec_cpu_sched_prio,            0,                                  offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingResetOnFork,         config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.cpu_sched_reset_on_fork)
+{{type}}.CPUAffinity,                      config_parse_exec_cpu_affinity,              0,                                  offsetof({{type}}, exec_context)
+{{type}}.NUMAPolicy,                       config_parse_numa_policy,                    0,                                  offsetof({{type}}, exec_context.numa_policy.type)
+{{type}}.NUMAMask,                         config_parse_numa_mask,                      0,                                  offsetof({{type}}, exec_context.numa_policy)
+{{type}}.UMask,                            config_parse_mode,                           0,                                  offsetof({{type}}, exec_context.umask)
+{{type}}.Environment,                      config_parse_environ,                        0,                                  offsetof({{type}}, exec_context.environment)
+{{type}}.EnvironmentFile,                  config_parse_unit_env_file,                  0,                                  offsetof({{type}}, exec_context.environment_files)
+{{type}}.PassEnvironment,                  config_parse_pass_environ,                   0,                                  offsetof({{type}}, exec_context.pass_environment)
+{{type}}.UnsetEnvironment,                 config_parse_unset_environ,                  0,                                  offsetof({{type}}, exec_context.unset_environment)
+{{type}}.DynamicUser,                      config_parse_bool,                           true,                               offsetof({{type}}, exec_context.dynamic_user)
+{{type}}.RemoveIPC,                        config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.remove_ipc)
+{{type}}.StandardInput,                    config_parse_exec_input,                     0,                                  offsetof({{type}}, exec_context)
+{{type}}.StandardOutput,                   config_parse_exec_output,                    0,                                  offsetof({{type}}, exec_context)
+{{type}}.StandardError,                    config_parse_exec_output,                    0,                                  offsetof({{type}}, exec_context)
+{{type}}.StandardInputText,                config_parse_exec_input_text,                0,                                  offsetof({{type}}, exec_context)
+{{type}}.StandardInputData,                config_parse_exec_input_data,                0,                                  offsetof({{type}}, exec_context)
+{{type}}.TTYPath,                          config_parse_unit_path_printf,               0,                                  offsetof({{type}}, exec_context.tty_path)
+{{type}}.TTYReset,                         config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.tty_reset)
+{{type}}.TTYVHangup,                       config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.tty_vhangup)
+{{type}}.TTYVTDisallocate,                 config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.tty_vt_disallocate)
+{{type}}.TTYRows,                          config_parse_tty_size,                       0,                                  offsetof({{type}}, exec_context.tty_rows)
+{{type}}.TTYColumns,                       config_parse_tty_size,                       0,                                  offsetof({{type}}, exec_context.tty_cols)
+{{type}}.SyslogIdentifier,                 config_parse_unit_string_printf,             0,                                  offsetof({{type}}, exec_context.syslog_identifier)
+{{type}}.SyslogFacility,                   config_parse_log_facility,                   0,                                  offsetof({{type}}, exec_context.syslog_priority)
+{{type}}.SyslogLevel,                      config_parse_log_level,                      0,                                  offsetof({{type}}, exec_context.syslog_priority)
+{{type}}.SyslogLevelPrefix,                config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.syslog_level_prefix)
+{{type}}.LogLevelMax,                      config_parse_log_level,                      0,                                  offsetof({{type}}, exec_context.log_level_max)
+{{type}}.LogRateLimitIntervalSec,          config_parse_sec,                            0,                                  offsetof({{type}}, exec_context.log_ratelimit_interval_usec)
+{{type}}.LogRateLimitBurst,                config_parse_unsigned,                       0,                                  offsetof({{type}}, exec_context.log_ratelimit_burst)
+{{type}}.LogExtraFields,                   config_parse_log_extra_fields,               0,                                  offsetof({{type}}, exec_context)
+{{type}}.LogFilterPatterns,                config_parse_log_filter_patterns,            0,                                  offsetof({{type}}, exec_context)
+{{type}}.Capabilities,                     config_parse_warn_compat,                    DISABLED_LEGACY,                    offsetof({{type}}, exec_context)
+{{type}}.SecureBits,                       config_parse_exec_secure_bits,               0,                                  offsetof({{type}}, exec_context.secure_bits)
+{{type}}.CapabilityBoundingSet,            config_parse_capability_set,                 0,                                  offsetof({{type}}, exec_context.capability_bounding_set)
+{{type}}.AmbientCapabilities,              config_parse_capability_set,                 0,                                  offsetof({{type}}, exec_context.capability_ambient_set)
+{{type}}.TimerSlackNSec,                   config_parse_nsec,                           0,                                  offsetof({{type}}, exec_context.timer_slack_nsec)
+{{type}}.NoNewPrivileges,                  config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.no_new_privileges)
+{{type}}.KeyringMode,                      config_parse_exec_keyring_mode,              0,                                  offsetof({{type}}, exec_context.keyring_mode)
+{{type}}.ProtectProc,                      config_parse_protect_proc,                   0,                                  offsetof({{type}}, exec_context.protect_proc)
+{{type}}.ProcSubset,                       config_parse_proc_subset,                    0,                                  offsetof({{type}}, exec_context.proc_subset)
+{% if HAVE_SECCOMP %}
+{{type}}.SystemCallFilter,                 config_parse_syscall_filter,                 0,                                  offsetof({{type}}, exec_context)
+{{type}}.SystemCallArchitectures,          config_parse_syscall_archs,                  0,                                  offsetof({{type}}, exec_context.syscall_archs)
+{{type}}.SystemCallErrorNumber,            config_parse_syscall_errno,                  0,                                  offsetof({{type}}, exec_context)
+{{type}}.SystemCallLog,                    config_parse_syscall_log,                    0,                                  offsetof({{type}}, exec_context)
+{{type}}.MemoryDenyWriteExecute,           config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.memory_deny_write_execute)
+{{type}}.RestrictNamespaces,               config_parse_restrict_namespaces,            0,                                  offsetof({{type}}, exec_context)
+{{type}}.RestrictRealtime,                 config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.restrict_realtime)
+{{type}}.RestrictSUIDSGID,                 config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.restrict_suid_sgid)
+{{type}}.RestrictAddressFamilies,          config_parse_address_families,               0,                                  offsetof({{type}}, exec_context)
+{{type}}.LockPersonality,                  config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.lock_personality)
+{% else %}
+{{type}}.SystemCallFilter,                 config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.SystemCallArchitectures,          config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.SystemCallErrorNumber,            config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.SystemCallLog,                    config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.MemoryDenyWriteExecute,           config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.RestrictNamespaces,               config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.RestrictRealtime,                 config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.RestrictSUIDSGID,                 config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.RestrictAddressFamilies,          config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{{type}}.LockPersonality,                  config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{{type}}.RestrictFileSystems,              config_parse_restrict_filesystems,           0,                                  offsetof({{type}}, exec_context)
+{{type}}.LimitCPU,                         config_parse_rlimit,                         RLIMIT_CPU,                         offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitFSIZE,                       config_parse_rlimit,                         RLIMIT_FSIZE,                       offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitDATA,                        config_parse_rlimit,                         RLIMIT_DATA,                        offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitSTACK,                       config_parse_rlimit,                         RLIMIT_STACK,                       offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitCORE,                        config_parse_rlimit,                         RLIMIT_CORE,                        offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRSS,                         config_parse_rlimit,                         RLIMIT_RSS,                         offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNOFILE,                      config_parse_rlimit,                         RLIMIT_NOFILE,                      offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitAS,                          config_parse_rlimit,                         RLIMIT_AS,                          offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNPROC,                       config_parse_rlimit,                         RLIMIT_NPROC,                       offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitMEMLOCK,                     config_parse_rlimit,                         RLIMIT_MEMLOCK,                     offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitLOCKS,                       config_parse_rlimit,                         RLIMIT_LOCKS,                       offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitSIGPENDING,                  config_parse_rlimit,                         RLIMIT_SIGPENDING,                  offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitMSGQUEUE,                    config_parse_rlimit,                         RLIMIT_MSGQUEUE,                    offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNICE,                        config_parse_rlimit,                         RLIMIT_NICE,                        offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRTPRIO,                      config_parse_rlimit,                         RLIMIT_RTPRIO,                      offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRTTIME,                      config_parse_rlimit,                         RLIMIT_RTTIME,                      offsetof({{type}}, exec_context.rlimit)
+{{type}}.ReadWriteDirectories,             config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.read_write_paths)
+{{type}}.ReadOnlyDirectories,              config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.read_only_paths)
+{{type}}.InaccessibleDirectories,          config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.inaccessible_paths)
+{{type}}.ReadWritePaths,                   config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.read_write_paths)
+{{type}}.ReadOnlyPaths,                    config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.read_only_paths)
+{{type}}.InaccessiblePaths,                config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.inaccessible_paths)
+{{type}}.ExecPaths,                        config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.exec_paths)
+{{type}}.NoExecPaths,                      config_parse_namespace_path_strv,            0,                                  offsetof({{type}}, exec_context.no_exec_paths)
+{{type}}.ExecSearchPath,                   config_parse_colon_separated_paths,          0,                                  offsetof({{type}}, exec_context.exec_search_path)
+{{type}}.BindPaths,                        config_parse_bind_paths,                     0,                                  offsetof({{type}}, exec_context)
+{{type}}.BindReadOnlyPaths,                config_parse_bind_paths,                     0,                                  offsetof({{type}}, exec_context)
+{{type}}.TemporaryFileSystem,              config_parse_temporary_filesystems,          0,                                  offsetof({{type}}, exec_context)
+{{type}}.PrivateTmp,                       config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_tmp)
+{{type}}.PrivateDevices,                   config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_devices)
+{{type}}.ProtectKernelTunables,            config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_kernel_tunables)
+{{type}}.ProtectKernelModules,             config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_kernel_modules)
+{{type}}.ProtectKernelLogs,                config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_kernel_logs)
+{{type}}.ProtectClock,                     config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_clock)
+{{type}}.ProtectControlGroups,             config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_control_groups)
+{{type}}.NetworkNamespacePath,             config_parse_unit_path_printf,               0,                                  offsetof({{type}}, exec_context.network_namespace_path)
+{{type}}.IPCNamespacePath,                 config_parse_unit_path_printf,               0,                                  offsetof({{type}}, exec_context.ipc_namespace_path)
+{{type}}.LogNamespace,                     config_parse_log_namespace,                  0,                                  offsetof({{type}}, exec_context)
+{{type}}.PrivateNetwork,                   config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_network)
+{{type}}.PrivateUsers,                     config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_users)
+{{type}}.PrivateMounts,                    config_parse_tristate,                       0,                                  offsetof({{type}}, exec_context.private_mounts)
+{{type}}.PrivateIPC,                       config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_ipc)
+{{type}}.ProtectSystem,                    config_parse_protect_system,                 0,                                  offsetof({{type}}, exec_context.protect_system)
+{{type}}.ProtectHome,                      config_parse_protect_home,                   0,                                  offsetof({{type}}, exec_context.protect_home)
+{{type}}.MountFlags,                       config_parse_exec_mount_propagation_flag,    0,                                  offsetof({{type}}, exec_context.mount_propagation_flag)
+{{type}}.MountAPIVFS,                      config_parse_exec_mount_apivfs,              0,                                  offsetof({{type}}, exec_context)
+{{type}}.Personality,                      config_parse_personality,                    0,                                  offsetof({{type}}, exec_context.personality)
+{{type}}.RuntimeDirectoryPreserve,         config_parse_exec_preserve_mode,             0,                                  offsetof({{type}}, exec_context.runtime_directory_preserve_mode)
+{{type}}.RuntimeDirectoryMode,             config_parse_mode,                           0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode)
+{{type}}.RuntimeDirectory,                 config_parse_exec_directories,               0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME])
+{{type}}.StateDirectoryMode,               config_parse_mode,                           0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE].mode)
+{{type}}.StateDirectory,                   config_parse_exec_directories,               0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE])
+{{type}}.CacheDirectoryMode,               config_parse_mode,                           0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE].mode)
+{{type}}.CacheDirectory,                   config_parse_exec_directories,               0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE])
+{{type}}.LogsDirectoryMode,                config_parse_mode,                           0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS].mode)
+{{type}}.LogsDirectory,                    config_parse_exec_directories,               0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS])
+{{type}}.ConfigurationDirectoryMode,       config_parse_mode,                           0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].mode)
+{{type}}.ConfigurationDirectory,           config_parse_exec_directories,               0,                                  offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION])
+{{type}}.SetCredential,                    config_parse_set_credential,                 0,                                  offsetof({{type}}, exec_context)
+{{type}}.SetCredentialEncrypted,           config_parse_set_credential,                 1,                                  offsetof({{type}}, exec_context)
+{{type}}.LoadCredential,                   config_parse_load_credential,                0,                                  offsetof({{type}}, exec_context)
+{{type}}.LoadCredentialEncrypted,          config_parse_load_credential,                1,                                  offsetof({{type}}, exec_context)
+{{type}}.ImportCredential,                 config_parse_import_credential,              0,                                  offsetof({{type}}, exec_context.import_credentials)
+{{type}}.TimeoutCleanSec,                  config_parse_sec,                            0,                                  offsetof({{type}}, exec_context.timeout_clean_usec)
+{% if HAVE_PAM %}
+{{type}}.PAMName,                          config_parse_unit_string_printf,             0,                                  offsetof({{type}}, exec_context.pam_name)
+{% else %}
+{{type}}.PAMName,                          config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{{type}}.IgnoreSIGPIPE,                    config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.ignore_sigpipe)
+{{type}}.UtmpIdentifier,                   config_parse_unit_string_printf,             0,                                  offsetof({{type}}, exec_context.utmp_id)
+{{type}}.UtmpMode,                         config_parse_exec_utmp_mode,                 0,                                  offsetof({{type}}, exec_context.utmp_mode)
+{% if HAVE_SELINUX %}
+{{type}}.SELinuxContext,                   config_parse_exec_selinux_context,           0,                                  offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.SELinuxContext,                   config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{% if HAVE_APPARMOR %}
+{{type}}.AppArmorProfile,                  config_parse_exec_apparmor_profile,          0,                                  offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.AppArmorProfile,                  config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{% if ENABLE_SMACK %}
+{{type}}.SmackProcessLabel,                config_parse_exec_smack_process_label,       0,                                  offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.SmackProcessLabel,                config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{{type}}.ProtectHostname,                  config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_hostname)
+{{type}}.MemoryKSM,                        config_parse_tristate,                       0,                                  offsetof({{type}}, exec_context.memory_ksm)
+{%- endmacro -%}
+
+{%- macro KILL_CONTEXT_CONFIG_ITEMS(type) -%}{#- Settings stored in the kill_context member of the struct named by type -#}
+{{type}}.SendSIGKILL,                      config_parse_bool,                           0,                                  offsetof({{type}}, kill_context.send_sigkill)
+{{type}}.SendSIGHUP,                       config_parse_bool,                           0,                                  offsetof({{type}}, kill_context.send_sighup)
+{{type}}.KillMode,                         config_parse_kill_mode,                      0,                                  offsetof({{type}}, kill_context.kill_mode)
+{{type}}.KillSignal,                       config_parse_signal,                         0,                                  offsetof({{type}}, kill_context.kill_signal)
+{{type}}.RestartKillSignal,                config_parse_signal,                         0,                                  offsetof({{type}}, kill_context.restart_kill_signal)
+{{type}}.FinalKillSignal,                  config_parse_signal,                         0,                                  offsetof({{type}}, kill_context.final_kill_signal)
+{{type}}.WatchdogSignal,                   config_parse_signal,                         0,                                  offsetof({{type}}, kill_context.watchdog_signal)
+{%- endmacro -%}
+
+{%- macro CGROUP_CONTEXT_CONFIG_ITEMS(type) -%}{#- One gperf row per cgroup setting; `type` is both the section prefix and the struct handed to offsetof(). -#}
+{{type}}.Slice,                            config_parse_unit_slice,                     0,                                  0
+{{type}}.AllowedCPUs,                      config_parse_allowed_cpuset,                 0,                                  offsetof({{type}}, cgroup_context.cpuset_cpus)
+{{type}}.StartupAllowedCPUs,               config_parse_allowed_cpuset,                 0,                                  offsetof({{type}}, cgroup_context.startup_cpuset_cpus)
+{{type}}.AllowedMemoryNodes,               config_parse_allowed_cpuset,                 0,                                  offsetof({{type}}, cgroup_context.cpuset_mems)
+{{type}}.StartupAllowedMemoryNodes,        config_parse_allowed_cpuset,                 0,                                  offsetof({{type}}, cgroup_context.startup_cpuset_mems)
+{{type}}.CPUAccounting,                    config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.cpu_accounting)
+{{type}}.CPUWeight,                        config_parse_cg_cpu_weight,                  0,                                  offsetof({{type}}, cgroup_context.cpu_weight)
+{{type}}.StartupCPUWeight,                 config_parse_cg_cpu_weight,                  0,                                  offsetof({{type}}, cgroup_context.startup_cpu_weight)
+{{type}}.CPUShares,                        config_parse_cpu_shares,                     0,                                  offsetof({{type}}, cgroup_context.cpu_shares)
+{{type}}.StartupCPUShares,                 config_parse_cpu_shares,                     0,                                  offsetof({{type}}, cgroup_context.startup_cpu_shares)
+{{type}}.CPUQuota,                         config_parse_cpu_quota,                      0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.CPUQuotaPeriodSec,                config_parse_sec_def_infinity,               0,                                  offsetof({{type}}, cgroup_context.cpu_quota_period_usec)
+{{type}}.MemoryAccounting,                 config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.memory_accounting)
+{{type}}.MemoryMin,                        config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DefaultMemoryMin,                 config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DefaultMemoryLow,                 config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DefaultStartupMemoryLow,          config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemoryLow,                        config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryLow,                 config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemoryHigh,                       config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryHigh,                config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemoryMax,                        config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryMax,                 config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemorySwapMax,                    config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemorySwapMax,             config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemoryZSwapMax,                   config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryZSwapMax,            config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemoryLimit,                      config_parse_memory_limit,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DeviceAllow,                      config_parse_device_allow,                   0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DevicePolicy,                     config_parse_device_policy,                  0,                                  offsetof({{type}}, cgroup_context.device_policy)
+{{type}}.IOAccounting,                     config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.io_accounting)
+{{type}}.IOWeight,                         config_parse_cg_weight,                      0,                                  offsetof({{type}}, cgroup_context.io_weight)
+{{type}}.StartupIOWeight,                  config_parse_cg_weight,                      0,                                  offsetof({{type}}, cgroup_context.startup_io_weight)
+{{type}}.IODeviceWeight,                   config_parse_io_device_weight,               0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.IOReadBandwidthMax,               config_parse_io_limit,                       0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.IOWriteBandwidthMax,              config_parse_io_limit,                       0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.IOReadIOPSMax,                    config_parse_io_limit,                       0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.IOWriteIOPSMax,                   config_parse_io_limit,                       0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.IODeviceLatencyTargetSec,         config_parse_io_device_latency,              0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOAccounting,                config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.blockio_accounting)
+{{type}}.BlockIOWeight,                    config_parse_blockio_weight,                 0,                                  offsetof({{type}}, cgroup_context.blockio_weight)
+{{type}}.StartupBlockIOWeight,             config_parse_blockio_weight,                 0,                                  offsetof({{type}}, cgroup_context.startup_blockio_weight)
+{{type}}.BlockIODeviceWeight,              config_parse_blockio_device_weight,          0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOReadBandwidth,             config_parse_blockio_bandwidth,              0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOWriteBandwidth,            config_parse_blockio_bandwidth,              0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.TasksAccounting,                  config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.tasks_accounting)
+{{type}}.TasksMax,                         config_parse_tasks_max,                      0,                                  offsetof({{type}}, cgroup_context.tasks_max)
+{{type}}.Delegate,                         config_parse_delegate,                       0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DelegateSubgroup,                 config_parse_delegate_subgroup,              0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.DisableControllers,               config_parse_disable_controllers,            0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.IPAccounting,                     config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.ip_accounting)
+{{type}}.IPAddressAllow,                   config_parse_in_addr_prefixes,               AF_UNSPEC,                          offsetof({{type}}, cgroup_context.ip_address_allow)
+{{type}}.IPAddressDeny,                    config_parse_in_addr_prefixes,               AF_UNSPEC,                          offsetof({{type}}, cgroup_context.ip_address_deny)
+{{type}}.IPIngressFilterPath,              config_parse_ip_filter_bpf_progs,            0,                                  offsetof({{type}}, cgroup_context.ip_filters_ingress)
+{{type}}.IPEgressFilterPath,               config_parse_ip_filter_bpf_progs,            0,                                  offsetof({{type}}, cgroup_context.ip_filters_egress)
+{{type}}.ManagedOOMSwap,                   config_parse_managed_oom_mode,               0,                                  offsetof({{type}}, cgroup_context.moom_swap)
+{{type}}.ManagedOOMMemoryPressure,         config_parse_managed_oom_mode,               0,                                  offsetof({{type}}, cgroup_context.moom_mem_pressure)
+{{type}}.ManagedOOMMemoryPressureLimit,    config_parse_managed_oom_mem_pressure_limit, 0,                                  offsetof({{type}}, cgroup_context.moom_mem_pressure_limit)
+{{type}}.ManagedOOMPreference,             config_parse_managed_oom_preference,         0,                                  offsetof({{type}}, cgroup_context.moom_preference)
+{{type}}.NetClass,                         config_parse_warn_compat,                    DISABLED_LEGACY,                    0
+{{type}}.BPFProgram,                       config_parse_bpf_foreign_program,            0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.SocketBindAllow,                  config_parse_cgroup_socket_bind,             0,                                  offsetof({{type}}, cgroup_context.socket_bind_allow)
+{{type}}.SocketBindDeny,                   config_parse_cgroup_socket_bind,             0,                                  offsetof({{type}}, cgroup_context.socket_bind_deny)
+{{type}}.RestrictNetworkInterfaces,        config_parse_restrict_network_interfaces,    0,                                  offsetof({{type}}, cgroup_context)
+{{type}}.MemoryPressureThresholdSec,       config_parse_sec,                            0,                                  offsetof({{type}}, cgroup_context.memory_pressure_threshold_usec)
+{{type}}.MemoryPressureWatch,              config_parse_memory_pressure_watch,          0,                                  offsetof({{type}}, cgroup_context.memory_pressure_watch)
+{{type}}.NFTSet,                           config_parse_cgroup_nft_set,                 NFT_SET_PARSE_CGROUP,               offsetof({{type}}, cgroup_context)
+{{type}}.CoredumpReceive,                  config_parse_bool,                           0,                                  offsetof({{type}}, cgroup_context.coredump_receive)
+{%- endmacro -%}
+
+%{
+#if __GNUC__ >= 7 /* the diagnostic below is only suppressed for GCC 7 and newer */
+_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
+#endif
+#include <stddef.h> /* offsetof(), used by the table entries */
+#include "all-units.h"
+#include "conf-parser.h"
+#include "image-policy.h"
+#include "in-addr-prefix-util.h"
+#include "load-fragment.h"
+%}
+struct ConfigPerfItem;
+%null_strings
+%language=ANSI-C
+%define slot-name section_and_lvalue
+%define hash-function-name load_fragment_gperf_hash
+%define lookup-function-name load_fragment_gperf_lookup
+%readonly-tables
+%omit-struct-type
+%struct-type
+%includes
+%%
+Unit.Description,                        config_parse_unit_string_printf,             0,                                  offsetof(Unit, description)
+Unit.Documentation,                      config_parse_documentation,                  0,                                  offsetof(Unit, documentation)
+Unit.SourcePath,                         config_parse_unit_path_printf,               0,                                  offsetof(Unit, source_path)
+Unit.Requires,                           config_parse_unit_deps,                      UNIT_REQUIRES,                      0
+Unit.Requisite,                          config_parse_unit_deps,                      UNIT_REQUISITE,                     0
+Unit.Wants,                              config_parse_unit_deps,                      UNIT_WANTS,                         0
+Unit.BindsTo,                            config_parse_unit_deps,                      UNIT_BINDS_TO,                      0
+Unit.BindTo,                             config_parse_unit_deps,                      UNIT_BINDS_TO,                      0
+Unit.Upholds,                            config_parse_unit_deps,                      UNIT_UPHOLDS,                       0
+Unit.Conflicts,                          config_parse_unit_deps,                      UNIT_CONFLICTS,                     0
+Unit.Before,                             config_parse_unit_deps,                      UNIT_BEFORE,                        0
+Unit.After,                              config_parse_unit_deps,                      UNIT_AFTER,                         0
+Unit.OnSuccess,                          config_parse_unit_deps,                      UNIT_ON_SUCCESS,                    0
+Unit.OnFailure,                          config_parse_unit_deps,                      UNIT_ON_FAILURE,                    0
+Unit.PropagatesReloadTo,                 config_parse_unit_deps,                      UNIT_PROPAGATES_RELOAD_TO,          0
+Unit.PropagateReloadTo,                  config_parse_unit_deps,                      UNIT_PROPAGATES_RELOAD_TO,          0
+Unit.ReloadPropagatedFrom,               config_parse_unit_deps,                      UNIT_RELOAD_PROPAGATED_FROM,        0
+Unit.PropagateReloadFrom,                config_parse_unit_deps,                      UNIT_RELOAD_PROPAGATED_FROM,        0
+Unit.PropagatesStopTo,                   config_parse_unit_deps,                      UNIT_PROPAGATES_STOP_TO,            0
+Unit.StopPropagatedFrom,                 config_parse_unit_deps,                      UNIT_STOP_PROPAGATED_FROM,          0
+Unit.PartOf,                             config_parse_unit_deps,                      UNIT_PART_OF,                       0
+Unit.JoinsNamespaceOf,                   config_parse_unit_deps,                      UNIT_JOINS_NAMESPACE_OF,            0
+Unit.RequiresOverridable,                config_parse_obsolete_unit_deps,             UNIT_REQUIRES,                      0
+Unit.RequisiteOverridable,               config_parse_obsolete_unit_deps,             UNIT_REQUISITE,                     0
+Unit.RequiresMountsFor,                  config_parse_unit_requires_mounts_for,       0,                                  0
+Unit.StopWhenUnneeded,                   config_parse_bool,                           0,                                  offsetof(Unit, stop_when_unneeded)
+Unit.RefuseManualStart,                  config_parse_bool,                           0,                                  offsetof(Unit, refuse_manual_start)
+Unit.RefuseManualStop,                   config_parse_bool,                           0,                                  offsetof(Unit, refuse_manual_stop)
+Unit.AllowIsolate,                       config_parse_bool,                           0,                                  offsetof(Unit, allow_isolate)
+Unit.DefaultDependencies,                config_parse_bool,                           0,                                  offsetof(Unit, default_dependencies)
+Unit.SurviveFinalKillSignal,             config_parse_bool,                           0,                                  offsetof(Unit, survive_final_kill_signal)
+Unit.OnSuccessJobMode,                   config_parse_job_mode,                       0,                                  offsetof(Unit, on_success_job_mode)
+Unit.OnFailureJobMode,                   config_parse_job_mode,                       0,                                  offsetof(Unit, on_failure_job_mode)
+{# The following is a legacy alias name for compatibility #}
+Unit.OnFailureIsolate,                   config_parse_job_mode_isolate,               0,                                  offsetof(Unit, on_failure_job_mode)
+Unit.IgnoreOnIsolate,                    config_parse_bool,                           0,                                  offsetof(Unit, ignore_on_isolate)
+Unit.IgnoreOnSnapshot,                   config_parse_warn_compat,                    DISABLED_LEGACY,                    0
+Unit.JobTimeoutSec,                      config_parse_job_timeout_sec,                0,                                  0
+Unit.JobRunningTimeoutSec,               config_parse_job_running_timeout_sec,        0,                                  0
+Unit.JobTimeoutAction,                   config_parse_emergency_action,               0,                                  offsetof(Unit, job_timeout_action)
+Unit.JobTimeoutRebootArgument,           config_parse_unit_string_printf,             0,                                  offsetof(Unit, job_timeout_reboot_arg)
+Unit.StartLimitIntervalSec,              config_parse_sec,                            0,                                  offsetof(Unit, start_ratelimit.interval)
+{# The following is a legacy alias name for compatibility #}
+Unit.StartLimitInterval,                 config_parse_sec,                            0,                                  offsetof(Unit, start_ratelimit.interval)
+Unit.StartLimitBurst,                    config_parse_unsigned,                       0,                                  offsetof(Unit, start_ratelimit.burst)
+Unit.StartLimitAction,                   config_parse_emergency_action,               0,                                  offsetof(Unit, start_limit_action)
+Unit.FailureAction,                      config_parse_emergency_action,               0,                                  offsetof(Unit, failure_action)
+Unit.SuccessAction,                      config_parse_emergency_action,               0,                                  offsetof(Unit, success_action)
+Unit.FailureActionExitStatus,            config_parse_exit_status,                    0,                                  offsetof(Unit, failure_action_exit_status)
+Unit.SuccessActionExitStatus,            config_parse_exit_status,                    0,                                  offsetof(Unit, success_action_exit_status)
+Unit.RebootArgument,                     config_parse_unit_string_printf,             0,                                  offsetof(Unit, reboot_arg)
+Unit.ConditionPathExists,                config_parse_unit_condition_path,            CONDITION_PATH_EXISTS,              offsetof(Unit, conditions)
+Unit.ConditionPathExistsGlob,            config_parse_unit_condition_path,            CONDITION_PATH_EXISTS_GLOB,         offsetof(Unit, conditions)
+Unit.ConditionPathIsDirectory,           config_parse_unit_condition_path,            CONDITION_PATH_IS_DIRECTORY,        offsetof(Unit, conditions)
+Unit.ConditionPathIsSymbolicLink,        config_parse_unit_condition_path,            CONDITION_PATH_IS_SYMBOLIC_LINK,    offsetof(Unit, conditions)
+Unit.ConditionPathIsMountPoint,          config_parse_unit_condition_path,            CONDITION_PATH_IS_MOUNT_POINT,      offsetof(Unit, conditions)
+Unit.ConditionPathIsReadWrite,           config_parse_unit_condition_path,            CONDITION_PATH_IS_READ_WRITE,       offsetof(Unit, conditions)
+Unit.ConditionPathIsEncrypted,           config_parse_unit_condition_path,            CONDITION_PATH_IS_ENCRYPTED,        offsetof(Unit, conditions)
+Unit.ConditionDirectoryNotEmpty,         config_parse_unit_condition_path,            CONDITION_DIRECTORY_NOT_EMPTY,      offsetof(Unit, conditions)
+Unit.ConditionFileNotEmpty,              config_parse_unit_condition_path,            CONDITION_FILE_NOT_EMPTY,           offsetof(Unit, conditions)
+Unit.ConditionFileIsExecutable,          config_parse_unit_condition_path,            CONDITION_FILE_IS_EXECUTABLE,       offsetof(Unit, conditions)
+Unit.ConditionNeedsUpdate,               config_parse_unit_condition_path,            CONDITION_NEEDS_UPDATE,             offsetof(Unit, conditions)
+Unit.ConditionFirstBoot,                 config_parse_unit_condition_string,          CONDITION_FIRST_BOOT,               offsetof(Unit, conditions)
+Unit.ConditionArchitecture,              config_parse_unit_condition_string,          CONDITION_ARCHITECTURE,             offsetof(Unit, conditions)
+Unit.ConditionFirmware,                  config_parse_unit_condition_string,          CONDITION_FIRMWARE,                 offsetof(Unit, conditions)
+Unit.ConditionVirtualization,            config_parse_unit_condition_string,          CONDITION_VIRTUALIZATION,           offsetof(Unit, conditions)
+Unit.ConditionHost,                      config_parse_unit_condition_string,          CONDITION_HOST,                     offsetof(Unit, conditions)
+Unit.ConditionKernelCommandLine,         config_parse_unit_condition_string,          CONDITION_KERNEL_COMMAND_LINE,      offsetof(Unit, conditions)
+Unit.ConditionKernelVersion,             config_parse_unit_condition_string,          CONDITION_KERNEL_VERSION,           offsetof(Unit, conditions)
+Unit.ConditionCredential,                config_parse_unit_condition_string,          CONDITION_CREDENTIAL,               offsetof(Unit, conditions)
+Unit.ConditionSecurity,                  config_parse_unit_condition_string,          CONDITION_SECURITY,                 offsetof(Unit, conditions)
+Unit.ConditionCapability,                config_parse_unit_condition_string,          CONDITION_CAPABILITY,               offsetof(Unit, conditions)
+Unit.ConditionACPower,                   config_parse_unit_condition_string,          CONDITION_AC_POWER,                 offsetof(Unit, conditions)
+Unit.ConditionMemory,                    config_parse_unit_condition_string,          CONDITION_MEMORY,                   offsetof(Unit, conditions)
+Unit.ConditionCPUFeature,                config_parse_unit_condition_string,          CONDITION_CPU_FEATURE,              offsetof(Unit, conditions)
+Unit.ConditionCPUs,                      config_parse_unit_condition_string,          CONDITION_CPUS,                     offsetof(Unit, conditions)
+Unit.ConditionEnvironment,               config_parse_unit_condition_string,          CONDITION_ENVIRONMENT,              offsetof(Unit, conditions)
+Unit.ConditionUser,                      config_parse_unit_condition_string,          CONDITION_USER,                     offsetof(Unit, conditions)
+Unit.ConditionGroup,                     config_parse_unit_condition_string,          CONDITION_GROUP,                    offsetof(Unit, conditions)
+Unit.ConditionControlGroupController,    config_parse_unit_condition_string,          CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, conditions)
+Unit.ConditionOSRelease,                 config_parse_unit_condition_string,          CONDITION_OS_RELEASE,               offsetof(Unit, conditions)
+Unit.ConditionMemoryPressure,            config_parse_unit_condition_string,          CONDITION_MEMORY_PRESSURE,          offsetof(Unit, conditions)
+Unit.ConditionCPUPressure,               config_parse_unit_condition_string,          CONDITION_CPU_PRESSURE,             offsetof(Unit, conditions)
+Unit.ConditionIOPressure,                config_parse_unit_condition_string,          CONDITION_IO_PRESSURE,              offsetof(Unit, conditions)
+Unit.AssertPathExists,                   config_parse_unit_condition_path,            CONDITION_PATH_EXISTS,              offsetof(Unit, asserts)
+Unit.AssertPathExistsGlob,               config_parse_unit_condition_path,            CONDITION_PATH_EXISTS_GLOB,         offsetof(Unit, asserts)
+Unit.AssertPathIsDirectory,              config_parse_unit_condition_path,            CONDITION_PATH_IS_DIRECTORY,        offsetof(Unit, asserts)
+Unit.AssertPathIsSymbolicLink,           config_parse_unit_condition_path,            CONDITION_PATH_IS_SYMBOLIC_LINK,    offsetof(Unit, asserts)
+Unit.AssertPathIsMountPoint,             config_parse_unit_condition_path,            CONDITION_PATH_IS_MOUNT_POINT,      offsetof(Unit, asserts)
+Unit.AssertPathIsReadWrite,              config_parse_unit_condition_path,            CONDITION_PATH_IS_READ_WRITE,       offsetof(Unit, asserts)
+Unit.AssertPathIsEncrypted,              config_parse_unit_condition_path,            CONDITION_PATH_IS_ENCRYPTED,        offsetof(Unit, asserts)
+Unit.AssertDirectoryNotEmpty,            config_parse_unit_condition_path,            CONDITION_DIRECTORY_NOT_EMPTY,      offsetof(Unit, asserts)
+Unit.AssertFileNotEmpty,                 config_parse_unit_condition_path,            CONDITION_FILE_NOT_EMPTY,           offsetof(Unit, asserts)
+Unit.AssertFileIsExecutable,             config_parse_unit_condition_path,            CONDITION_FILE_IS_EXECUTABLE,       offsetof(Unit, asserts)
+Unit.AssertNeedsUpdate,                  config_parse_unit_condition_path,            CONDITION_NEEDS_UPDATE,             offsetof(Unit, asserts)
+Unit.AssertFirstBoot,                    config_parse_unit_condition_string,          CONDITION_FIRST_BOOT,               offsetof(Unit, asserts)
+Unit.AssertArchitecture,                 config_parse_unit_condition_string,          CONDITION_ARCHITECTURE,             offsetof(Unit, asserts)
+Unit.AssertVirtualization,               config_parse_unit_condition_string,          CONDITION_VIRTUALIZATION,           offsetof(Unit, asserts)
+Unit.AssertHost,                         config_parse_unit_condition_string,          CONDITION_HOST,                     offsetof(Unit, asserts)
+Unit.AssertKernelCommandLine,            config_parse_unit_condition_string,          CONDITION_KERNEL_COMMAND_LINE,      offsetof(Unit, asserts)
+Unit.AssertKernelVersion,                config_parse_unit_condition_string,          CONDITION_KERNEL_VERSION,           offsetof(Unit, asserts)
+Unit.AssertCredential,                   config_parse_unit_condition_string,          CONDITION_CREDENTIAL,               offsetof(Unit, asserts)
+Unit.AssertSecurity,                     config_parse_unit_condition_string,          CONDITION_SECURITY,                 offsetof(Unit, asserts)
+Unit.AssertCapability,                   config_parse_unit_condition_string,          CONDITION_CAPABILITY,               offsetof(Unit, asserts)
+Unit.AssertACPower,                      config_parse_unit_condition_string,          CONDITION_AC_POWER,                 offsetof(Unit, asserts)
+Unit.AssertMemory,                       config_parse_unit_condition_string,          CONDITION_MEMORY,                   offsetof(Unit, asserts)
+Unit.AssertCPUFeature,                   config_parse_unit_condition_string,          CONDITION_CPU_FEATURE,              offsetof(Unit, asserts)
+Unit.AssertCPUs,                         config_parse_unit_condition_string,          CONDITION_CPUS,                     offsetof(Unit, asserts)
+Unit.AssertEnvironment,                  config_parse_unit_condition_string,          CONDITION_ENVIRONMENT,              offsetof(Unit, asserts)
+Unit.AssertUser,                         config_parse_unit_condition_string,          CONDITION_USER,                     offsetof(Unit, asserts)
+Unit.AssertGroup,                        config_parse_unit_condition_string,          CONDITION_GROUP,                    offsetof(Unit, asserts)
+Unit.AssertControlGroupController,       config_parse_unit_condition_string,          CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, asserts)
+Unit.AssertOSRelease,                    config_parse_unit_condition_string,          CONDITION_OS_RELEASE,               offsetof(Unit, asserts)
+Unit.AssertMemoryPressure,               config_parse_unit_condition_string,          CONDITION_MEMORY_PRESSURE,          offsetof(Unit, asserts)
+Unit.AssertCPUPressure,                  config_parse_unit_condition_string,          CONDITION_CPU_PRESSURE,             offsetof(Unit, asserts)
+Unit.AssertIOPressure,                   config_parse_unit_condition_string,          CONDITION_IO_PRESSURE,              offsetof(Unit, asserts)
+Unit.CollectMode,                        config_parse_collect_mode,                   0,                                  offsetof(Unit, collect_mode)
+Service.PIDFile,                         config_parse_pid_file,                       0,                                  offsetof(Service, pid_file)
+Service.ExecCondition,                   config_parse_exec,                           SERVICE_EXEC_CONDITION,             offsetof(Service, exec_command)
+Service.ExecStartPre,                    config_parse_exec,                           SERVICE_EXEC_START_PRE,             offsetof(Service, exec_command)
+Service.ExecStart,                       config_parse_exec,                           SERVICE_EXEC_START,                 offsetof(Service, exec_command)
+Service.ExecStartPost,                   config_parse_exec,                           SERVICE_EXEC_START_POST,            offsetof(Service, exec_command)
+Service.ExecReload,                      config_parse_exec,                           SERVICE_EXEC_RELOAD,                offsetof(Service, exec_command)
+Service.ExecStop,                        config_parse_exec,                           SERVICE_EXEC_STOP,                  offsetof(Service, exec_command)
+Service.ExecStopPost,                    config_parse_exec,                           SERVICE_EXEC_STOP_POST,             offsetof(Service, exec_command)
+Service.RestartSec,                      config_parse_sec,                            0,                                  offsetof(Service, restart_usec)
+Service.RestartSteps,                    config_parse_unsigned,                       0,                                  offsetof(Service, restart_steps)
+Service.RestartMaxDelaySec,              config_parse_sec,                            0,                                  offsetof(Service, restart_max_delay_usec)
+Service.TimeoutSec,                      config_parse_service_timeout,                0,                                  0
+Service.TimeoutStartSec,                 config_parse_service_timeout,                0,                                  0
+Service.TimeoutStopSec,                  config_parse_sec_fix_0,                      0,                                  offsetof(Service, timeout_stop_usec)
+Service.TimeoutAbortSec,                 config_parse_service_timeout_abort,          0,                                  0
+Service.TimeoutStartFailureMode,         config_parse_service_timeout_failure_mode,   0,                                  offsetof(Service, timeout_start_failure_mode)
+Service.TimeoutStopFailureMode,          config_parse_service_timeout_failure_mode,   0,                                  offsetof(Service, timeout_stop_failure_mode)
+Service.RuntimeMaxSec,                   config_parse_sec,                            0,                                  offsetof(Service, runtime_max_usec)
+Service.RuntimeRandomizedExtraSec,       config_parse_sec,                            0,                                  offsetof(Service, runtime_rand_extra_usec)
+Service.WatchdogSec,                     config_parse_sec,                            0,                                  offsetof(Service, watchdog_usec)
+{# The following five only exist for compatibility, they moved into Unit, see above #}
+Service.StartLimitInterval,              config_parse_sec,                            0,                                  offsetof(Unit, start_ratelimit.interval)
+Service.StartLimitBurst,                 config_parse_unsigned,                       0,                                  offsetof(Unit, start_ratelimit.burst)
+Service.StartLimitAction,                config_parse_emergency_action,               0,                                  offsetof(Unit, start_limit_action)
+Service.FailureAction,                   config_parse_emergency_action,               0,                                  offsetof(Unit, failure_action)
+Service.RebootArgument,                  config_parse_unit_string_printf,             0,                                  offsetof(Unit, reboot_arg)
+Service.Type,                            config_parse_service_type,                   0,                                  offsetof(Service, type)
+Service.ExitType,                        config_parse_service_exit_type,              0,                                  offsetof(Service, exit_type)
+Service.Restart,                         config_parse_service_restart,                0,                                  offsetof(Service, restart)
+Service.RestartMode,                     config_parse_service_restart_mode,           0,                                  offsetof(Service, restart_mode)
+Service.PermissionsStartOnly,            config_parse_bool,                           0,                                  offsetof(Service, permissions_start_only)
+Service.RootDirectoryStartOnly,          config_parse_bool,                           0,                                  offsetof(Service, root_directory_start_only)
+Service.RemainAfterExit,                 config_parse_bool,                           0,                                  offsetof(Service, remain_after_exit)
+Service.GuessMainPID,                    config_parse_bool,                           0,                                  offsetof(Service, guess_main_pid)
+Service.RestartPreventExitStatus,        config_parse_set_status,                     0,                                  offsetof(Service, restart_prevent_status)
+Service.RestartForceExitStatus,          config_parse_set_status,                     0,                                  offsetof(Service, restart_force_status)
+Service.SuccessExitStatus,               config_parse_set_status,                     0,                                  offsetof(Service, success_status)
+Service.SysVStartPriority,               config_parse_warn_compat,                    DISABLED_LEGACY,                    0
+Service.NonBlocking,                     config_parse_bool,                           0,                                  offsetof(Service, exec_context.non_blocking)
+Service.BusName,                         config_parse_bus_name,                       0,                                  offsetof(Service, bus_name)
+Service.FileDescriptorStoreMax,          config_parse_unsigned,                       0,                                  offsetof(Service, n_fd_store_max)
+Service.FileDescriptorStorePreserve,     config_parse_exec_preserve_mode,             0,                                  offsetof(Service, fd_store_preserve_mode)
+Service.NotifyAccess,                    config_parse_notify_access,                  0,                                  offsetof(Service, notify_access)
+Service.Sockets,                         config_parse_service_sockets,                0,                                  0
+Service.BusPolicy,                       config_parse_warn_compat,                    DISABLED_LEGACY,                    0
+Service.USBFunctionDescriptors,          config_parse_unit_path_printf,               0,                                  offsetof(Service, usb_function_descriptors)
+Service.USBFunctionStrings,              config_parse_unit_path_printf,               0,                                  offsetof(Service, usb_function_strings)
+Service.OOMPolicy,                       config_parse_oom_policy,                     0,                                  offsetof(Service, oom_policy)
+Service.OpenFile,                        config_parse_open_file,                      0,                                  offsetof(Service, open_files)
+Service.ReloadSignal,                    config_parse_signal,                         0,                                  offsetof(Service, reload_signal)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Service') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Service') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Service') }}
+Socket.ListenStream,                     config_parse_socket_listen,                  SOCKET_SOCKET,                      0
+Socket.ListenDatagram,                   config_parse_socket_listen,                  SOCKET_SOCKET,                      0
+Socket.ListenSequentialPacket,           config_parse_socket_listen,                  SOCKET_SOCKET,                      0
+Socket.ListenFIFO,                       config_parse_socket_listen,                  SOCKET_FIFO,                        0
+Socket.ListenNetlink,                    config_parse_socket_listen,                  SOCKET_SOCKET,                      0
+Socket.ListenSpecial,                    config_parse_socket_listen,                  SOCKET_SPECIAL,                     0
+Socket.ListenMessageQueue,               config_parse_socket_listen,                  SOCKET_MQUEUE,                      0
+Socket.ListenUSBFunction,                config_parse_socket_listen,                  SOCKET_USB_FUNCTION,                0
+Socket.SocketProtocol,                   config_parse_socket_protocol,                0,                                  offsetof(Socket, socket_protocol)
+Socket.BindIPv6Only,                     config_parse_socket_bind,                    0,                                  offsetof(Socket, bind_ipv6_only)
+Socket.Backlog,                          config_parse_unsigned,                       0,                                  offsetof(Socket, backlog)
+Socket.BindToDevice,                     config_parse_socket_bindtodevice,            0,                                  0
+Socket.ExecStartPre,                     config_parse_exec,                           SOCKET_EXEC_START_PRE,              offsetof(Socket, exec_command)
+Socket.ExecStartPost,                    config_parse_exec,                           SOCKET_EXEC_START_POST,             offsetof(Socket, exec_command)
+Socket.ExecStopPre,                      config_parse_exec,                           SOCKET_EXEC_STOP_PRE,               offsetof(Socket, exec_command)
+Socket.ExecStopPost,                     config_parse_exec,                           SOCKET_EXEC_STOP_POST,              offsetof(Socket, exec_command)
+Socket.TimeoutSec,                       config_parse_sec_fix_0,                      0,                                  offsetof(Socket, timeout_usec)
+Socket.SocketUser,                       config_parse_user_group_compat,              0,                                  offsetof(Socket, user)
+Socket.SocketGroup,                      config_parse_user_group_compat,              0,                                  offsetof(Socket, group)
+Socket.SocketMode,                       config_parse_mode,                           0,                                  offsetof(Socket, socket_mode)
+Socket.DirectoryMode,                    config_parse_mode,                           0,                                  offsetof(Socket, directory_mode)
+Socket.Accept,                           config_parse_bool,                           0,                                  offsetof(Socket, accept)
+Socket.FlushPending,                     config_parse_bool,                           0,                                  offsetof(Socket, flush_pending)
+Socket.Writable,                         config_parse_bool,                           0,                                  offsetof(Socket, writable)
+Socket.MaxConnections,                   config_parse_unsigned,                       0,                                  offsetof(Socket, max_connections)
+Socket.MaxConnectionsPerSource,          config_parse_unsigned,                       0,                                  offsetof(Socket, max_connections_per_source)
+Socket.KeepAlive,                        config_parse_bool,                           0,                                  offsetof(Socket, keep_alive)
+Socket.KeepAliveTimeSec,                 config_parse_sec,                            0,                                  offsetof(Socket, keep_alive_time)
+Socket.KeepAliveIntervalSec,             config_parse_sec,                            0,                                  offsetof(Socket, keep_alive_interval)
+Socket.KeepAliveProbes,                  config_parse_unsigned,                       0,                                  offsetof(Socket, keep_alive_cnt)
+Socket.DeferAcceptSec,                   config_parse_sec,                            0,                                  offsetof(Socket, defer_accept)
+Socket.NoDelay,                          config_parse_bool,                           0,                                  offsetof(Socket, no_delay)
+Socket.Priority,                         config_parse_int,                            0,                                  offsetof(Socket, priority)
+Socket.ReceiveBuffer,                    config_parse_iec_size,                       0,                                  offsetof(Socket, receive_buffer)
+Socket.SendBuffer,                       config_parse_iec_size,                       0,                                  offsetof(Socket, send_buffer)
+Socket.IPTOS,                            config_parse_ip_tos,                         0,                                  offsetof(Socket, ip_tos)
+Socket.IPTTL,                            config_parse_int,                            0,                                  offsetof(Socket, ip_ttl)
+Socket.Mark,                             config_parse_int,                            0,                                  offsetof(Socket, mark)
+Socket.PipeSize,                         config_parse_iec_size,                       0,                                  offsetof(Socket, pipe_size)
+Socket.FreeBind,                         config_parse_bool,                           0,                                  offsetof(Socket, free_bind)
+Socket.Transparent,                      config_parse_bool,                           0,                                  offsetof(Socket, transparent)
+Socket.Broadcast,                        config_parse_bool,                           0,                                  offsetof(Socket, broadcast)
+Socket.PassCredentials,                  config_parse_bool,                           0,                                  offsetof(Socket, pass_cred)
+Socket.PassSecurity,                     config_parse_bool,                           0,                                  offsetof(Socket, pass_sec)
+Socket.PassPacketInfo,                   config_parse_bool,                           0,                                  offsetof(Socket, pass_pktinfo)
+Socket.Timestamping,                     config_parse_socket_timestamping,            0,                                  offsetof(Socket, timestamping)
+Socket.TCPCongestion,                    config_parse_string,                         0,                                  offsetof(Socket, tcp_congestion)
+Socket.ReusePort,                        config_parse_bool,                           0,                                  offsetof(Socket, reuse_port)
+Socket.MessageQueueMaxMessages,          config_parse_long,                           0,                                  offsetof(Socket, mq_maxmsg)
+Socket.MessageQueueMessageSize,          config_parse_long,                           0,                                  offsetof(Socket, mq_msgsize)
+Socket.RemoveOnStop,                     config_parse_bool,                           0,                                  offsetof(Socket, remove_on_stop)
+Socket.Symlinks,                         config_parse_unit_path_strv_printf,          0,                                  offsetof(Socket, symlinks)
+Socket.FileDescriptorName,               config_parse_fdname,                         0,                                  0
+Socket.Service,                          config_parse_socket_service,                 0,                                  0
+Socket.TriggerLimitIntervalSec,          config_parse_sec,                            0,                                  offsetof(Socket, trigger_limit.interval)
+Socket.TriggerLimitBurst,                config_parse_unsigned,                       0,                                  offsetof(Socket, trigger_limit.burst)
+Socket.PollLimitIntervalSec,             config_parse_sec,                            0,                                  offsetof(Socket, poll_limit_interval)
+Socket.PollLimitBurst,                   config_parse_unsigned,                       0,                                  offsetof(Socket, poll_limit_burst)
+{% if ENABLE_SMACK %}
+Socket.SmackLabel,                       config_parse_unit_string_printf,             0,                                  offsetof(Socket, smack)
+Socket.SmackLabelIPIn,                   config_parse_unit_string_printf,             0,                                  offsetof(Socket, smack_ip_in)
+Socket.SmackLabelIPOut,                  config_parse_unit_string_printf,             0,                                  offsetof(Socket, smack_ip_out)
+{% else %}
+Socket.SmackLabel,                       config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+Socket.SmackLabelIPIn,                   config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+Socket.SmackLabelIPOut,                  config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{% if HAVE_SELINUX %}
+Socket.SELinuxContextFromNet,            config_parse_bool,                           0,                                  offsetof(Socket, selinux_context_from_net)
+{% else %}
+Socket.SELinuxContextFromNet,            config_parse_warn_compat,                    DISABLED_CONFIGURATION,             0
+{% endif %}
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Socket') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Socket') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Socket') }}
+Mount.What,                              config_parse_unit_string_printf,             0,                                  offsetof(Mount, parameters_fragment.what)
+Mount.Where,                             config_parse_unit_path_printf,               0,                                  offsetof(Mount, where)
+Mount.Options,                           config_parse_unit_string_printf,             0,                                  offsetof(Mount, parameters_fragment.options)
+Mount.Type,                              config_parse_unit_string_printf,             0,                                  offsetof(Mount, parameters_fragment.fstype)
+Mount.TimeoutSec,                        config_parse_sec_fix_0,                      0,                                  offsetof(Mount, timeout_usec)
+Mount.DirectoryMode,                     config_parse_mode,                           0,                                  offsetof(Mount, directory_mode)
+Mount.SloppyOptions,                     config_parse_bool,                           0,                                  offsetof(Mount, sloppy_options)
+Mount.LazyUnmount,                       config_parse_bool,                           0,                                  offsetof(Mount, lazy_unmount)
+Mount.ForceUnmount,                      config_parse_bool,                           0,                                  offsetof(Mount, force_unmount)
+Mount.ReadWriteOnly,                     config_parse_bool,                           0,                                  offsetof(Mount, read_write_only)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Mount') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Mount') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Mount') }}
+Automount.Where,                         config_parse_unit_path_printf,               0,                                  offsetof(Automount, where)
+Automount.ExtraOptions,                  config_parse_unit_string_printf,             0,                                  offsetof(Automount, extra_options)
+Automount.DirectoryMode,                 config_parse_mode,                           0,                                  offsetof(Automount, directory_mode)
+Automount.TimeoutIdleSec,                config_parse_sec_fix_0,                      0,                                  offsetof(Automount, timeout_idle_usec)
+Swap.What,                               config_parse_unit_path_printf,               0,                                  offsetof(Swap, parameters_fragment.what)
+Swap.Priority,                           config_parse_swap_priority,                  0,                                  0
+Swap.Options,                            config_parse_unit_string_printf,             0,                                  offsetof(Swap, parameters_fragment.options)
+Swap.TimeoutSec,                         config_parse_sec_fix_0,                      0,                                  offsetof(Swap, timeout_usec)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Swap') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Swap') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Swap') }}
+Timer.OnCalendar,                        config_parse_timer,                          TIMER_CALENDAR,                     0
+Timer.OnActiveSec,                       config_parse_timer,                          TIMER_ACTIVE,                       0
+Timer.OnBootSec,                         config_parse_timer,                          TIMER_BOOT,                         0
+Timer.OnStartupSec,                      config_parse_timer,                          TIMER_STARTUP,                      0
+Timer.OnUnitActiveSec,                   config_parse_timer,                          TIMER_UNIT_ACTIVE,                  0
+Timer.OnUnitInactiveSec,                 config_parse_timer,                          TIMER_UNIT_INACTIVE,                0
+Timer.OnClockChange,                     config_parse_bool,                           0,                                  offsetof(Timer, on_clock_change)
+Timer.OnTimezoneChange,                  config_parse_bool,                           0,                                  offsetof(Timer, on_timezone_change)
+Timer.Persistent,                        config_parse_bool,                           0,                                  offsetof(Timer, persistent)
+Timer.WakeSystem,                        config_parse_bool,                           0,                                  offsetof(Timer, wake_system)
+Timer.RemainAfterElapse,                 config_parse_bool,                           0,                                  offsetof(Timer, remain_after_elapse)
+Timer.FixedRandomDelay,                  config_parse_bool,                           0,                                  offsetof(Timer, fixed_random_delay)
+Timer.AccuracySec,                       config_parse_sec,                            0,                                  offsetof(Timer, accuracy_usec)
+Timer.RandomizedDelaySec,                config_parse_sec,                            0,                                  offsetof(Timer, random_usec)
+Timer.Unit,                              config_parse_trigger_unit,                   0,                                  0
+Path.PathExists,                         config_parse_path_spec,                      0,                                  0
+Path.PathExistsGlob,                     config_parse_path_spec,                      0,                                  0
+Path.PathChanged,                        config_parse_path_spec,                      0,                                  0
+Path.PathModified,                       config_parse_path_spec,                      0,                                  0
+Path.DirectoryNotEmpty,                  config_parse_path_spec,                      0,                                  0
+Path.Unit,                               config_parse_trigger_unit,                   0,                                  0
+Path.MakeDirectory,                      config_parse_bool,                           0,                                  offsetof(Path, make_directory)
+Path.DirectoryMode,                      config_parse_mode,                           0,                                  offsetof(Path, directory_mode)
+Path.TriggerLimitIntervalSec,            config_parse_sec,                            0,                                  offsetof(Path, trigger_limit.interval)
+Path.TriggerLimitBurst,                  config_parse_unsigned,                       0,                                  offsetof(Path, trigger_limit.burst)
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Slice') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Scope') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Scope') }}
+Scope.RuntimeMaxSec,                     config_parse_sec,                            0,                                  offsetof(Scope, runtime_max_usec)
+Scope.RuntimeRandomizedExtraSec,         config_parse_sec,                            0,                                  offsetof(Scope, runtime_rand_extra_usec)
+Scope.TimeoutStopSec,                    config_parse_sec,                            0,                                  offsetof(Scope, timeout_stop_usec)
+Scope.OOMPolicy,                         config_parse_oom_policy,                     0,                                  offsetof(Scope, oom_policy)
+{# The [Install] section is ignored here #}
+Install.Alias,                           NULL,                                        0,                                  0
+Install.WantedBy,                        NULL,                                        0,                                  0
+Install.RequiredBy,                      NULL,                                        0,                                  0
+Install.UpheldBy,                        NULL,                                        0,                                  0
+Install.Also,                            NULL,                                        0,                                  0
+Install.DefaultInstance,                 NULL,                                        0,                                  0
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
new file mode 100644
index 0000000..0baf08e
--- /dev/null
+++ b/src/core/load-fragment.c
@@ -0,0 +1,6735 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+  Copyright © 2012 Holger Hans Peter Freyther
+***/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "all-units.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-lsm.h"
+#include "bpf-program.h"
+#include "bpf-socket-bind.h"
+#include "bus-error.h"
+#include "bus-internal.h"
+#include "bus-util.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "conf-parser.h"
+#include "core-varlink.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "iovec-util.h"
+#include "ioprio-util.h"
+#include "ip-protocol-list.h"
+#include "journal-file.h"
+#include "limits-util.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "missing_ioprio.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "open-file.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pcre2-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "socket-netlink.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "time-util.h"
+#include "unit-name.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "utf8.h"
+#include "web-util.h"
+
+/* Resolves s with parse_ip_protocol() and accepts the result only if it is IPPROTO_UDPLITE or
+ * IPPROTO_SCTP. Returns the protocol number on success, the negative result of parse_ip_protocol()
+ * if that fails, and -EPROTONOSUPPORT for any other protocol. */
+static int parse_socket_protocol(const char *s) {
+        int protocol = parse_ip_protocol(s);
+
+        if (protocol < 0)
+                return protocol;
+
+        return IN_SET(protocol, IPPROTO_UDPLITE, IPPROTO_SCTP) ? protocol : -EPROTONOSUPPORT;
+}
+
+/* Parses a crash-chvt value: value is first offered to safe_atoi() with data as output; if that fails it
+ * is parsed as a boolean instead. Returns 0 on success or the negative result of parse_boolean(). */
+int parse_crash_chvt(const char *value, int *data) {
+        int enabled;
+
+        /* An integer takes precedence; only if that fails is the value treated as a boolean. */
+        if (safe_atoi(value, data) >= 0)
+                return 0;
+
+        enabled = parse_boolean(value);
+        if (enabled < 0)
+                return enabled;
+
+        /* true: switch to where kmsg goes (0); false: turn off switching (-1) */
+        *data = enabled > 0 ? 0 : -1;
+        return 0;
+}
+
+/* Maps a confirm-spawn setting to a console path stored in *console: NULL or boolean true selects
+ * "/dev/console", boolean false selects NULL, and a non-boolean value is used as the path itself (joined
+ * onto "/dev/" unless is_path() accepts it). Returns 0 on success, -ENOMEM if allocation fails. */
+int parse_confirm_spawn(const char *value, char **console) {
+        char *path;
+        int b = value ? parse_boolean(value) : 1;
+
+        if (b == 0) {
+                *console = NULL;
+                return 0;
+        }
+
+        path = b > 0 ? strdup("/dev/console") :         /* on with default tty */
+               is_path(value) ? strdup(value) :         /* on with fully qualified path */
+               path_join("/dev/", value);               /* on with only a tty file name */
+        if (!path)
+                return -ENOMEM;
+        *console = path;
+        return 0;
+}
+/* Parsers generated by DEFINE_CONFIG_PARSE*() (defined outside this section); the first argument names the function. */
+DEFINE_CONFIG_PARSE(config_parse_socket_protocol, parse_socket_protocol, "Failed to parse socket protocol");
+DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Failed to parse secure bits");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_home, protect_home, ProtectHome, "Failed to parse protect home value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_system, protect_system, ProtectSystem, "Failed to parse protect system value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode, "Failed to parse resource preserve mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_exit_type, service_exit_type, ServiceExitType, "Failed to parse service exit type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart_mode, service_restart_mode, ServiceRestartMode, "Failed to parse service restart mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference=");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_memory_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch, "Failed to parse memory pressure watch setting");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
+DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
+DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
+DEFINE_CONFIG_PARSE_PTR(config_parse_cg_cpu_weight, cg_cpu_weight_parse, uint64_t, "Invalid CPU weight");
+static DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares_internal, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares"); /* file-local; called by config_parse_cpu_shares() */
+DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_propagation_flag, mount_propagation_flag_from_string, unsigned long, "Failed to parse mount propagation flag");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat, "Failed to parse status unit format");
+DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, socket_timestamping_from_string_harder, SocketTimestamping, "Failed to parse timestamping precision");
+
+/* Config parser for a setting superseded by CPUWeight=: logs a warning that names lvalue, then passes
+ * all arguments unchanged to config_parse_cpu_shares_internal() and returns that parser's result. */
+int config_parse_cpu_shares(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Unit uses %s=; please use CPUWeight= instead. Support for %s= will be removed soon.", lvalue, lvalue);
+        return config_parse_cpu_shares_internal(
+                        unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
+/* Returns true if the part of s between the first '@' and the last '.' after it (or the end of s) holds
+ * a %i, %n or %N specifier without being exactly "%i"; returns false otherwise, or if s has no '@'. */
+bool contains_instance_specifier_superset(const char *s) {
+        const char *at, *cur, *end;
+        bool after_percent = false;
+
+        assert(s);
+
+        at = strchr(s, '@');
+        if (!at)
+                return false;
+
+        cur = at + 1; /* The instance part starts right after '@' */
+        end = strrchr(cur, '.');
+        if (!end)
+                end = cur + strlen(cur);
+
+        /* If the string is just the instance specifier, it's not a superset of the instance. */
+        if (memcmp_nn(cur, end - cur, "%i", strlen("%i")) == 0)
+                return false;
+
+        /* %i, %n and %N all expand to the instance or a superset of it; "%%" toggles the flag twice. */
+        for (; cur < end; cur++) {
+                if (*cur == '%')
+                        after_percent = !after_percent;
+                else if (after_percent && IN_SET(*cur, 'n', 'N', 'i'))
+                        return true;
+                else
+                        after_percent = false;
+        }
+        return false;
+}
+
+/* `name` is the rendered version of `format` via `unit_printf` or similar functions.
+ *
+ * Decides whether a dependency of `u` on `name` looks like a template unit depending on an instance of
+ * itself. Returns > 0 if so, 0 if not, and a negative value if unit_file_find_fragment() fails. */
+int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format) {
+        const char *dep_fragment;
+        int r;
+
+        assert(u);
+        assert(name);
+
+        /* If a template unit has a direct dependency on itself that includes the unit instance as part of
+         * the template instance via a unit specifier (%i, %n or %N), this will almost certainly lead to
+         * infinite recursion as systemd will keep instantiating new instances of the template unit.
+         * https://github.com/systemd/systemd/issues/17602 shows a good example of how this can happen in
+         * practice. To guard against this, we check for templates that depend on themselves and have the
+         * instantiated unit instance included as part of the template instance of the dependency via a
+         * specifier.
+         *
+         * For example, if systemd-notify@.service depends on systemd-notify@%n.service, this will result in
+         * infinite recursion.
+         */
+
+        /* Name-based checks first: `name` has to be a valid instance name, share its prefix with the id
+         * of `u` and map to the unit type of `u`. */
+        if (!unit_name_is_valid(name, UNIT_NAME_INSTANCE) ||
+            !unit_name_prefix_equal(u->id, name) ||
+            u->type != unit_name_to_type(name))
+                return false;
+
+        r = unit_file_find_fragment(u->manager->unit_id_map, u->manager->unit_name_map, name, &dep_fragment, NULL);
+        if (r < 0)
+                return r;
+
+        /* Fragment paths should also be equal as a custom fragment for a specific template instance
+         * wouldn't necessarily lead to infinite recursion. */
+        if (!path_equal_ptr(u->fragment_path, dep_fragment))
+                return false;
+
+        /* What remains to decide is whether the unexpanded `format` embeds the instance via a
+         * specifier. */
+        return contains_instance_specifier_superset(format);
+}
+
+/* Config parser for unit dependency settings. ltype is the UnitDependency to add, userdata the Unit to
+ * add it to, and rvalue a list of unit names that is split with extract_first_word(). Words that cannot be
+ * resolved or added, or that likely make a template depend on itself, are logged and skipped. Returns 0
+ * when the list is exhausted or has invalid syntax, and the result of log_oom() if out of memory. */
+int config_parse_unit_deps(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        UnitDependency dep_type = ltype;
+        const char *cursor = rvalue;
+        Unit *u = userdata;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL, *resolved = NULL;
+                int r;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                if (r <= 0)
+                        return 0;
+
+                r = unit_name_printf(u, word, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                        continue;
+                }
+
+                r = unit_is_likely_recursive_template_dependency(u, resolved, word);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to determine if '%s' is a recursive dependency, ignoring: %m", resolved);
+                else if (r > 0)
+                        log_syntax(unit, LOG_DEBUG, filename, line, 0,
+                                   "Dropping dependency %s=%s that likely leads to infinite recursion.", unit_dependency_to_string(dep_type), word);
+                if (r != 0)
+                        continue;
+
+                r = unit_add_dependency_by_name(u, dep_type, resolved, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", resolved);
+        }
+}
+
+int config_parse_obsolete_unit_deps(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Warns that the setting 'lvalue' is obsolete, naming the dependency type that 'ltype' maps to
+         * as its replacement, then hands all arguments unchanged to config_parse_unit_deps(). */
+
+        const char *replacement = unit_dependency_to_string(ltype);
+
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Unit dependency type %s= is obsolete, replacing by %s=, please update your unit file",
+                   lvalue, replacement);
+
+        return config_parse_unit_deps(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
+int config_parse_unit_string_printf(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Expands unit specifiers in 'rvalue' with unit_full_printf() and passes the result on to
+         * config_parse_string(). If expansion fails the assignment is logged and ignored (returns 0). */
+
+        const Unit *u = ASSERT_PTR(userdata);
+        _cleanup_free_ char *expanded = NULL;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_full_printf(u, rvalue, &expanded);
+        if (r >= 0)
+                return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, expanded, data, userdata);
+
+        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+        return 0;
+}
+
+int config_parse_unit_strv_printf(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Same as config_parse_unit_string_printf(), except that the expanded text is handed to
+         * config_parse_strv(). The whole of 'rvalue' is expanded in one go, before any splitting. */
+
+        _cleanup_free_ char *expanded = NULL;
+        const Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_full_printf(u, rvalue, &expanded);
+        if (r >= 0)
+                return config_parse_strv(unit, filename, line, section, section_line, lvalue, ltype, expanded, data, userdata);
+
+        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+        return 0;
+}
+
+int config_parse_unit_path_printf(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Expands unit specifiers in 'rvalue' with unit_path_printf() and passes the result on to
+         * config_parse_path(). A non-zero 'ltype' makes an expansion failure fatal: it is logged at
+         * LOG_ERR and -ENOEXEC is returned. Otherwise the failure is logged as a warning and ignored. */
+
+        const Unit *u = ASSERT_PTR(userdata);
+        _cleanup_free_ char *expanded = NULL;
+        const bool fatal = ltype;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_path_printf(u, rvalue, &expanded);
+        if (r >= 0)
+                return config_parse_path(unit, filename, line, section, section_line, lvalue, ltype, expanded, data, userdata);
+
+        log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, r,
+                   "Failed to resolve unit specifiers in '%s'%s: %m",
+                   rvalue, fatal ? "" : ", ignoring");
+
+        if (fatal)
+                return -ENOEXEC;
+
+        return 0;
+}
+
+int config_parse_colon_separated_paths(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Splits 'rvalue' at ':' and appends every entry to the string list 'data' points to, after
+         * expanding unit specifiers and validating the result with path_simplify_and_warn()
+         * (PATH_CHECK_ABSOLUTE). An empty 'rvalue' frees the list. The first bad entry ends parsing with
+         * return value 0; entries appended before it stay in the list. */
+
+        const Unit *u = userdata;
+        char ***list = ASSERT_PTR(data);
+        const char *cursor;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Assigning the empty string drops everything collected so far */
+                *list = strv_free(*list);
+                return 0;
+        }
+
+        cursor = rvalue;
+        for (;;) {
+                _cleanup_free_ char *entry = NULL, *path = NULL;
+
+                r = extract_first_word(&cursor, &entry, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r == 0)
+                        return 0;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                r = unit_path_printf(u, entry, &path);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in '%s', ignoring: %m", entry);
+                        return 0;
+                }
+
+                if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                        return 0;
+
+                if (strv_consume(list, TAKE_PTR(path)) < 0)
+                        return log_oom();
+        }
+}
+
+int config_parse_unit_path_strv_printf(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Extracts words from 'rvalue' (EXTRACT_UNQUOTE), expands unit specifiers in each of them,
+         * validates the result with path_simplify_and_warn() (PATH_CHECK_ABSOLUTE) and appends it to the
+         * string list 'data' points to. An empty 'rvalue' frees the list. The first bad word ends parsing
+         * with return value 0; words appended before it stay in the list. */
+
+        const Unit *u = ASSERT_PTR(userdata);
+        char ***list = data;
+        const char *cursor;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *list = strv_free(*list);
+                return 0;
+        }
+
+        cursor = rvalue;
+        for (;;) {
+                _cleanup_free_ char *token = NULL, *path = NULL;
+
+                r = extract_first_word(&cursor, &token, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                r = unit_path_printf(u, token, &path);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in '%s', ignoring: %m", token);
+                        return 0;
+                }
+
+                if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                        return 0;
+
+                if (strv_consume(list, TAKE_PTR(path)) < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+static int patch_var_run(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *lvalue,
+                char **path) {
+
+        /* If *path lies below /var/run/, replaces it with the corresponding path below /run/ and logs a
+         * notice asking for the unit file to be updated. Returns 1 if *path was rewritten, 0 if it was
+         * left alone, and the result of log_oom() if the new path could not be allocated. */
+
+        const char *suffix;
+        char *patched;
+
+        suffix = path_startswith(*path, "/var/run/");
+        if (!suffix)
+                return 0;
+
+        patched = path_join("/run/", suffix);
+        if (!patched)
+                return log_oom();
+
+        log_syntax(unit, LOG_NOTICE, filename, line, 0,
+                   "%s= references a path below legacy directory /var/run/, updating %s → %s; "
+                   "please update the unit file accordingly.", lvalue, *path, patched);
+
+        /* Frees the old string and moves 'patched' into *path. */
+        free_and_replace(*path, patched);
+
+        return 1;
+}
+
+int config_parse_socket_listen(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses one Listen*= style assignment into a new SocketPort and appends it to the port list of
+         * the socket unit passed as 'data'. An empty 'rvalue' removes all ports instead.
+         *
+         * 'ltype' is the port type. Any type other than SOCKET_SOCKET takes an absolute path. For
+         * SOCKET_SOCKET the setting name 'lvalue' selects between a netlink address ("ListenNetlink") and
+         * a socket address whose type follows from "ListenStream", "ListenDatagram" or
+         * "ListenSequentialPacket".
+         *
+         * Invalid values are logged and ignored (return 0). Only allocation failures are passed on. */
+
+        _cleanup_free_ SocketPort *np = NULL;
+        _cleanup_free_ char *k = NULL;
+        SocketPort *last;
+        Socket *s;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        s = SOCKET(data);
+
+        if (isempty(rvalue)) {
+                /* Assigning the empty string drops every port configured so far */
+                socket_free_ports(s);
+                return 0;
+        }
+
+        np = new0(SocketPort, 1);
+        if (!np)
+                return log_oom();
+
+        /* Whatever the flavour, the value first gets its unit specifiers expanded. */
+        r = unit_path_printf(UNIT(s), rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (ltype != SOCKET_SOCKET) {
+                /* Path based port */
+                r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        return 0;
+
+                if (ltype == SOCKET_FIFO) {
+                        r = patch_var_run(unit, filename, line, lvalue, &k);
+                        if (r < 0)
+                                return r;
+                }
+
+                free_and_replace(np->path, k);
+                np->type = ltype;
+
+        } else if (streq(lvalue, "ListenNetlink")) {
+
+                r = socket_address_parse_netlink(&np->address, k);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k);
+                        return 0;
+                }
+
+                np->type = SOCKET_SOCKET;
+
+        } else {
+
+                if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */
+                        r = patch_var_run(unit, filename, line, lvalue, &k);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = socket_address_parse_and_warn(&np->address, k);
+                if (r < 0) {
+                        /* -EAFNOSUPPORT is ignored without a message from here. */
+                        if (r != -EAFNOSUPPORT)
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k);
+                        return 0;
+                }
+
+                if (streq(lvalue, "ListenStream"))
+                        np->address.type = SOCK_STREAM;
+                else if (streq(lvalue, "ListenDatagram"))
+                        np->address.type = SOCK_DGRAM;
+                else {
+                        assert(streq(lvalue, "ListenSequentialPacket"));
+                        np->address.type = SOCK_SEQPACKET;
+                }
+
+                /* SOCK_SEQPACKET is only accepted together with AF_UNIX. */
+                if (socket_address_family(&np->address) != AF_UNIX && np->address.type == SOCK_SEQPACKET) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Address family not supported, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                np->type = SOCKET_SOCKET;
+        }
+
+        np->socket = s;
+        np->fd = -EBADF;
+        np->auxiliary_fds = NULL;
+        np->n_auxiliary_fds = 0;
+
+        last = LIST_FIND_TAIL(port, s->ports);
+        LIST_INSERT_AFTER(port, s->ports, last, np);
+
+        /* The list references the port now, keep the cleanup handler away from it. */
+        np = NULL;
+
+        return 0;
+}
+
+int config_parse_exec_nice(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses a nice level with parse_nice() and stores it in the ExecContext passed as 'data',
+         * marking it as set. An empty 'rvalue' only clears the "set" flag. Unparsable or out-of-range
+         * values are logged and ignored. Always returns 0. */
+
+        ExecContext *c = ASSERT_PTR(data);
+        int level, r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                c->nice_set = false;
+                return 0;
+        }
+
+        r = parse_nice(rvalue, &level);
+        if (r == -ERANGE) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Nice priority out of range, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse nice priority '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        c->nice_set = true;
+        c->nice = level;
+
+        return 0;
+}
+
+int config_parse_exec_oom_score_adjust(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses an OOM score adjustment with parse_oom_score_adjust() and stores it in the ExecContext
+         * passed as 'data', marking it as set. An empty 'rvalue' only clears the "set" flag. Unparsable
+         * or out-of-range values are logged and ignored. Always returns 0. */
+
+        ExecContext *c = ASSERT_PTR(data);
+        int adjust, r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                c->oom_score_adjust_set = false;
+                return 0;
+        }
+
+        r = parse_oom_score_adjust(rvalue, &adjust);
+        if (r == -ERANGE) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        c->oom_score_adjust_set = true;
+        c->oom_score_adjust = adjust;
+
+        return 0;
+}
+
+int config_parse_exec_coredump_filter(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses a coredump filter mask and ORs it into the mask already stored in the ExecContext passed
+         * as 'data', so repeated assignments accumulate. An empty 'rvalue' resets the mask to 0 and
+         * clears the "set" flag. Unparsable values are logged and ignored. Always returns 0. */
+
+        ExecContext *c = ASSERT_PTR(data);
+        uint64_t mask;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                c->coredump_filter_set = false;
+                c->coredump_filter = 0;
+                return 0;
+        }
+
+        r = coredump_filter_mask_from_string(rvalue, &mask);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse the CoredumpFilter=%s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        c->coredump_filter_set = true;
+        c->coredump_filter |= mask;
+
+        return 0;
+}
+
+int config_parse_kill_mode(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses a kill mode name into the KillMode variable 'data' points to. An empty 'rvalue' selects
+         * KILL_CONTROL_GROUP. Unknown names are logged and ignored. KILL_NONE is accepted, but triggers a
+         * deprecation warning. Always returns 0. */
+
+        KillMode *setting = data;
+        KillMode parsed;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        if (isempty(rvalue)) {
+                *setting = KILL_CONTROL_GROUP;
+                return 0;
+        }
+
+        parsed = kill_mode_from_string(rvalue);
+        if (parsed < 0) {
+                /* A negative result doubles as the error code handed to the logger. */
+                log_syntax(unit, LOG_WARNING, filename, line, parsed,
+                           "Failed to parse kill mode specification, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        if (parsed == KILL_NONE) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Unit uses KillMode=none. "
+                           "This is unsafe, as it disables systemd's process lifecycle management for the service. "
+                           "Please update the service to use a safer KillMode=, such as 'mixed' or 'control-group'. "
+                           "Support for KillMode=none is deprecated and will eventually be removed.");
+        }
+
+        *setting = parsed;
+
+        return 0;
+}
+
+int config_parse_exec(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Parses a command line setting into ExecCommand entries and appends them to the command list
+         * selected by 'data' and 'ltype'. An empty value frees that list instead. One value may hold
+         * several commands separated by a lone ";" word. Each iteration of the do-while loop below parses
+         * one command: prefix characters, executable path, then arguments. Unit specifiers are expanded
+         * in the path and in every argument.
+         *
+         * Returns 0 when the value was applied or ignored, -ENOEXEC for most errors in a command that
+         * did not carry the "-" prefix, and the result of log_oom() on allocation failure. */
+        ExecCommand **e = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        const char *p;
+        bool semicolon;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        e += ltype; /* 'data' is the first of several consecutive list heads, 'ltype' indexes into them */
+
+        if (isempty(rvalue)) {
+                /* An empty assignment resets the list */
+                *e = exec_command_free_list(*e);
+                return 0;
+        }
+
+        p = rvalue;
+        do {
+                _cleanup_free_ char *path = NULL, *firstword = NULL;
+                ExecCommandFlags flags = 0;
+                bool ignore = false, separate_argv0 = false;
+                _cleanup_free_ ExecCommand *nce = NULL;
+                _cleanup_strv_free_ char **n = NULL;
+                size_t nlen = 0;
+                const char *f;
+
+                semicolon = false;
+
+                r = extract_first_word_and_warn(&p, &firstword, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
+                if (r <= 0) /* Extraction failed or nothing is left; either way not treated as an error */
+                        return 0;
+
+                /* A lone ";" is a separator. Let's make sure we don't treat it as an executable name. */
+                if (streq(firstword, ";")) {
+                        semicolon = true;
+                        continue; /* In a do-while this jumps to the loop condition, which is true now */
+                }
+
+                f = firstword;
+                for (;;) {
+                        /* We accept an absolute path as first argument.  If it's prefixed with - and the path doesn't
+                         * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of
+                         * argv[0]; if it's prefixed with :, we will not do environment variable substitution;
+                         * if it's prefixed with +, it will be run with full privileges and no sandboxing; if
+                         * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if
+                         * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient
+                         * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most
+                         * other sandboxing, with some special exceptions for changing UID.
+                         *
+                         * The idea is that '!!' may be used to write services that can take benefit of systemd's
+                         * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to
+                         * privilege dropping within the daemon if the kernel does not offer that.
+                         *
+                         * Each prefix is consumed at most once. A character that does not match any branch
+                         * below (including a repeated prefix) ends the prefix section and starts the path. */
+
+                        if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+                                flags |= EXEC_COMMAND_IGNORE_FAILURE;
+                                ignore = true;
+                        } else if (*f == '@' && !separate_argv0)
+                                separate_argv0 = true;
+                        else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND))
+                                flags |= EXEC_COMMAND_NO_ENV_EXPAND;
+                        else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+                                flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
+                        else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+                                flags |= EXEC_COMMAND_NO_SETUID;
+                        else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) {
+                                flags &= ~EXEC_COMMAND_NO_SETUID; /* Second '!' after a first one: "!" becomes "!!" */
+                                flags |= EXEC_COMMAND_AMBIENT_MAGIC;
+                        } else
+                                break;
+                        f++;
+                }
+
+                r = unit_path_printf(u, f, &path);
+                if (r < 0) {
+                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+                                   "Failed to resolve unit specifiers in '%s'%s: %m",
+                                   f, ignore ? ", ignoring" : "");
+                        return ignore ? 0 : -ENOEXEC;
+                }
+
+                if (isempty(path)) {
+                        /* First word is either "-" or "@" with no command. */
+                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+                                   "Empty path in command line%s: '%s'",
+                                   ignore ? ", ignoring" : "", rvalue);
+                        return ignore ? 0 : -ENOEXEC;
+                }
+                if (!string_is_safe(path)) {
+                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+                                   "Executable name contains special characters%s: %s",
+                                   ignore ? ", ignoring" : "", path);
+                        return ignore ? 0 : -ENOEXEC;
+                }
+                if (endswith(path, "/")) {
+                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+                                   "Executable path specifies a directory%s: %s",
+                                   ignore ? ", ignoring" : "", path);
+                        return ignore ? 0 : -ENOEXEC;
+                }
+
+                if (!(path_is_absolute(path) ? path_is_valid(path) : filename_is_valid(path))) {
+                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+                                   "Neither a valid executable name nor an absolute path%s: %s",
+                                   ignore ? ", ignoring" : "", path);
+                        return ignore ? 0 : -ENOEXEC;
+                }
+
+                if (!separate_argv0) { /* Without '@' the executable path itself becomes argv[0] */
+                        char *w = NULL;
+
+                        if (!GREEDY_REALLOC0(n, nlen + 2))
+                                return log_oom();
+
+                        w = strdup(path);
+                        if (!w)
+                                return log_oom();
+                        n[nlen++] = w;
+                        n[nlen] = NULL;
+                }
+
+                path_simplify(path); /* Runs after the argv[0] copy above, which keeps the unsimplified text */
+
+                while (!isempty(p)) {
+                        _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+                        /* Check explicitly for an unquoted semicolon as
+                         * command separator token.  */
+                        if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) {
+                                p++;
+                                p += strspn(p, WHITESPACE);
+                                semicolon = true;
+                                break;
+                        }
+
+                        /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc.
+                         * extract_first_word() would return the same for all of those.  */
+                        if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) {
+                                char *w;
+
+                                p += 2;
+                                p += strspn(p, WHITESPACE);
+
+                                if (!GREEDY_REALLOC0(n, nlen + 2))
+                                        return log_oom();
+
+                                w = strdup(";"); /* The escaped form yields a literal ";" argument */
+                                if (!w)
+                                        return log_oom();
+                                n[nlen++] = w;
+                                n[nlen] = NULL;
+                                continue;
+                        }
+
+                        r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
+                        if (r == 0)
+                                break;
+                        if (r < 0)
+                                return ignore ? 0 : -ENOEXEC;
+
+                        r = unit_full_printf(u, word, &resolved);
+                        if (r < 0) {
+                                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+                                           "Failed to resolve unit specifiers in %s%s: %m",
+                                           word, ignore ? ", ignoring" : "");
+                                return ignore ? 0 : -ENOEXEC;
+                        }
+
+                        if (!GREEDY_REALLOC(n, nlen + 2))
+                                return log_oom();
+
+                        n[nlen++] = TAKE_PTR(resolved);
+                        n[nlen] = NULL;
+                }
+
+                if (!n || !n[0]) { /* Happens with '@' when no argument followed the path */
+                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+                                   "Empty executable name or zeroeth argument%s: %s",
+                                   ignore ? ", ignoring" : "", rvalue);
+                        return ignore ? 0 : -ENOEXEC;
+                }
+
+                nce = new0(ExecCommand, 1);
+                if (!nce)
+                        return log_oom();
+
+                nce->argv = TAKE_PTR(n);
+                nce->path = TAKE_PTR(path);
+                nce->flags = flags;
+
+                exec_command_append_list(e, nce);
+
+                /* Do not _cleanup_free_ these. The command was just appended to the list. */
+                nce = NULL;
+
+                rvalue = p; /* Later log messages quote only the part that has not been parsed yet */
+        } while (semicolon);
+
+        return 0;
+}
+
+int config_parse_socket_bindtodevice(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Stores a copy of the interface name 'rvalue' in bind_to_device of the Socket passed as 'data'.
+         * The empty string and "*" clear the field. Names rejected by ifname_valid() are logged and
+         * ignored. Returns 0, or whatever free_and_strdup_warn() returns when the name is stored. */
+
+        Socket *s = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        const bool unset = isempty(rvalue) || streq(rvalue, "*");
+
+        if (!unset && ifname_valid(rvalue))
+                return free_and_strdup_warn(&s->bind_to_device, rvalue);
+
+        if (unset)
+                s->bind_to_device = mfree(s->bind_to_device);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+int config_parse_exec_input(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses a standard input selector into the ExecContext passed as 'data'. Three forms exist:
+         *
+         *   "fd:NAME"    selects EXEC_INPUT_NAMED_FD. NAME, after specifier expansion, is stored in
+         *                stdio_fdname[STDIN_FILENO]; an empty NAME clears the stored name.
+         *   "file:PATH"  selects EXEC_INPUT_FILE. PATH, after specifier expansion and simplification,
+         *                is stored in stdio_file[STDIN_FILENO] and has to be absolute.
+         *   anything else is looked up with exec_input_from_string().
+         *
+         * Invalid values are logged and leave the context untouched. Always returns 0. */
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        const char *n;
+        ExecInput ei;
+        int r;
+
+        assert(filename);
+        assert(lvalue); /* The pointer that needs checking: it is passed to path_simplify_and_warn() below */
+        assert(rvalue);
+
+        n = startswith(rvalue, "fd:");
+        if (n) {
+                _cleanup_free_ char *resolved = NULL;
+
+                r = unit_fd_printf(u, n, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n);
+                        return 0;
+                }
+
+                /* An empty name is stored as NULL rather than as an empty string. */
+                if (isempty(resolved))
+                        resolved = mfree(resolved);
+                else if (!fdname_is_valid(resolved)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved);
+                        return 0;
+                }
+
+                free_and_replace(c->stdio_fdname[STDIN_FILENO], resolved);
+
+                ei = EXEC_INPUT_NAMED_FD;
+
+        } else if ((n = startswith(rvalue, "file:"))) {
+                _cleanup_free_ char *resolved = NULL;
+
+                r = unit_path_printf(u, n, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n);
+                        return 0;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+                if (r < 0)
+                        return 0;
+
+                free_and_replace(c->stdio_file[STDIN_FILENO], resolved);
+
+                ei = EXEC_INPUT_FILE;
+
+        } else {
+                ei = exec_input_from_string(rvalue);
+                if (ei < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, ei, "Failed to parse input specifier, ignoring: %s", rvalue);
+                        return 0;
+                }
+        }
+
+        /* Only reached when the value was valid, so a rejected assignment never changes std_input. */
+        c->std_input = ei;
+        return 0;
+}
+
+int config_parse_exec_input_text(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Appends rvalue (C-unescaped, specifiers expanded) plus a newline to c->stdin_data. Returns 0, or log_oom() if realloc() fails. */
+        _cleanup_free_ char *unescaped = NULL, *resolved = NULL;
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(line);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Reset if the empty string is assigned */
+                c->stdin_data = mfree(c->stdin_data);
+                c->stdin_data_size = 0;
+                return 0;
+        }
+        /* Unescape first; specifier expansion below then operates on the unescaped text. */
+        ssize_t l = cunescape(rvalue, 0, &unescaped);
+        if (l < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, l,
+                           "Failed to decode C escaped text '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        r = unit_full_printf_full(u, unescaped, EXEC_STDIN_DATA_MAX, &resolved);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", unescaped);
+                return 0;
+        }
+        /* The "+ 1" in the size checks below accounts for the newline that gets appended. */
+        size_t sz = strlen(resolved);
+        if (c->stdin_data_size + sz + 1 < c->stdin_data_size || /* check for overflow */
+            c->stdin_data_size + sz + 1 > EXEC_STDIN_DATA_MAX) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Standard input data too large (%zu), maximum of %zu permitted, ignoring.",
+                           c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX); /* NOTE(review): reported size omits the "+ 1" used in the check */
+                return 0;
+        }
+        /* Grow through a temporary pointer: c->stdin_data is only overwritten once realloc() has succeeded. */
+        void *p = realloc(c->stdin_data, c->stdin_data_size + sz + 1);
+        if (!p)
+                return log_oom();
+        /* Copy the new text behind the existing data and terminate it with '\n'; no NUL byte is stored. */
+        *((char*) mempcpy((char*) p + c->stdin_data_size, resolved, sz)) = '\n';
+
+        c->stdin_data = p;
+        c->stdin_data_size += sz + 1;
+
+        return 0;
+}
+
+int config_parse_exec_input_data(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Appends the base64-decoded form of rvalue to c->stdin_data. Returns 0, or log_oom() if realloc() fails. */
+        _cleanup_free_ void *p = NULL;
+        ExecContext *c = ASSERT_PTR(data);
+        size_t sz;
+        void *q;
+        int r;
+
+        assert(filename);
+        assert(line);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Reset if the empty string is assigned */
+                c->stdin_data = mfree(c->stdin_data);
+                c->stdin_data_size = 0;
+                return 0;
+        }
+        /* NOTE(review): SIZE_MAX presumably means "decode the whole NUL-terminated string" - confirm against unbase64mem(). */
+        r = unbase64mem(rvalue, SIZE_MAX, &p, &sz);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to decode base64 data, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        assert(sz > 0); /* rvalue is non-empty here, so at least one decoded byte is expected */
+
+        if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */
+            c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Standard input data too large (%zu), maximum of %zu permitted, ignoring.",
+                           c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX);
+                return 0;
+        }
+        /* Grow through a temporary pointer: c->stdin_data is only overwritten once realloc() has succeeded. */
+        q = realloc(c->stdin_data, c->stdin_data_size + sz);
+        if (!q)
+                return log_oom();
+        /* Append the decoded bytes directly behind the data collected so far. */
+        memcpy((uint8_t*) q + c->stdin_data_size, p, sz);
+
+        c->stdin_data = q;
+        c->stdin_data_size += sz;
+
+        return 0;
+}
+
+int config_parse_exec_output(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Parses an output specifier for StandardOutput= or StandardError= (told apart via lvalue). Always returns 0. */
+        _cleanup_free_ char *resolved = NULL;
+        const char *n;
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        bool obsolete = false;
+        ExecOutput eo;
+        int r;
+
+        assert(filename);
+        assert(line);
+        assert(lvalue);
+        assert(rvalue);
+        /* "fd:NAME" selects a named file descriptor; specifiers in NAME are expanded before validation. */
+        n = startswith(rvalue, "fd:");
+        if (n) {
+                r = unit_fd_printf(u, n, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n);
+                        return 0; /* NOTE(review): the message above lacks the ", ignoring" its sibling messages carry */
+                }
+                /* An empty name is freed rather than stored as an empty string. */
+                if (isempty(resolved))
+                        resolved = mfree(resolved);
+                else if (!fdname_is_valid(resolved)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved);
+                        return 0;
+                }
+
+                eo = EXEC_OUTPUT_NAMED_FD;
+
+        } else if (streq(rvalue, "syslog")) { /* obsolete spelling, mapped to the journal output */
+                eo = EXEC_OUTPUT_JOURNAL;
+                obsolete = true;
+
+        } else if (streq(rvalue, "syslog+console")) { /* obsolete spelling, mapped to journal+console */
+                eo = EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+                obsolete = true;
+
+        } else if ((n = startswith(rvalue, "file:"))) {
+                /* "file:", "append:" and "truncate:" share one recipe: expand specifiers, then check the path with PATH_CHECK_ABSOLUTE. */
+                r = unit_path_printf(u, n, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
+                        return 0;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+                if (r < 0)
+                        return 0;
+
+                eo = EXEC_OUTPUT_FILE;
+
+        } else if ((n = startswith(rvalue, "append:"))) {
+
+                r = unit_path_printf(u, n, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
+                        return 0;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+                if (r < 0)
+                        return 0;
+
+                eo = EXEC_OUTPUT_FILE_APPEND;
+
+        } else if ((n = startswith(rvalue, "truncate:"))) {
+
+                r = unit_path_printf(u, n, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
+                        return 0;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+                if (r < 0)
+                        return 0;
+
+                eo = EXEC_OUTPUT_FILE_TRUNCATE;
+        } else { /* anything else has to be a name exec_output_from_string() knows */
+                eo = exec_output_from_string(rvalue);
+                if (eo < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, eo, "Failed to parse output specifier, ignoring: %s", rvalue);
+                        return 0;
+                }
+        }
+
+        if (obsolete)
+                log_syntax(unit, LOG_NOTICE, filename, line, 0,
+                           "Standard output type %s is obsolete, automatically updating to %s. Please update your unit file, and consider removing the setting altogether.",
+                           rvalue, exec_output_to_string(eo));
+        /* 'resolved' is an fd name for EXEC_OUTPUT_NAMED_FD, a path for the three file modes, and still NULL for every other mode. */
+        if (streq(lvalue, "StandardOutput")) {
+                if (eo == EXEC_OUTPUT_NAMED_FD)
+                        free_and_replace(c->stdio_fdname[STDOUT_FILENO], resolved);
+                else
+                        free_and_replace(c->stdio_file[STDOUT_FILENO], resolved);
+
+                c->std_output = eo;
+
+        } else {
+                assert(streq(lvalue, "StandardError")); /* no other lvalue is expected to reach this parser */
+
+                if (eo == EXEC_OUTPUT_NAMED_FD)
+                        free_and_replace(c->stdio_fdname[STDERR_FILENO], resolved);
+                else
+                        free_and_replace(c->stdio_file[STDERR_FILENO], resolved);
+
+                c->std_error = eo;
+        }
+
+        return 0;
+}
+
+int config_parse_exec_io_class(const char *unit,
+                               const char *filename,
+                               unsigned line,
+                               const char *section,
+                               unsigned section_line,
+                               const char *lvalue,
+                               int ltype,
+                               const char *rvalue,
+                               void *data,
+                               void *userdata) {
+        /* Replaces the class part of the context's I/O priority; an empty value restores the default. Always returns 0. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        int io_class;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+                ctx->ioprio_set = false;
+                return 0;
+        }
+
+        io_class = ioprio_class_from_string(rvalue);
+        if (io_class < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, io_class, "Failed to parse IO scheduling class, ignoring: %s", rvalue);
+                return 0;
+        }
+        /* Recombine the new class with the priority data that is already stored. */
+        ctx->ioprio = ioprio_normalize(ioprio_prio_value(io_class, ioprio_prio_data(ctx->ioprio)));
+        ctx->ioprio_set = true;
+
+        return 0;
+}
+
+int config_parse_exec_io_priority(const char *unit,
+                                  const char *filename,
+                                  unsigned line,
+                                  const char *section,
+                                  unsigned section_line,
+                                  const char *lvalue,
+                                  int ltype,
+                                  const char *rvalue,
+                                  void *data,
+                                  void *userdata) {
+        /* Replaces the priority part of the context's I/O priority; an empty value restores the default. Always returns 0. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        int prio, ret;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+                ctx->ioprio_set = false;
+                return 0;
+        }
+
+        ret = ioprio_parse_priority(rvalue, &prio);
+        if (ret < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, ret, "Failed to parse IO priority, ignoring: %s", rvalue);
+                return 0;
+        }
+        /* Recombine the class that is already stored with the new priority. */
+        ctx->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(ctx->ioprio), prio));
+        ctx->ioprio_set = true;
+
+        return 0;
+}
+
+int config_parse_exec_cpu_sched_policy(const char *unit,
+                                       const char *filename,
+                                       unsigned line,
+                                       const char *section,
+                                       unsigned section_line,
+                                       const char *lvalue,
+                                       int ltype,
+                                       const char *rvalue,
+                                       void *data,
+                                       void *userdata) {
+        /* Selects the CPU scheduling policy and clamps the stored priority to that policy's range. Always returns 0. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        int policy;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->cpu_sched_priority = 0;
+                ctx->cpu_sched_policy = SCHED_OTHER;
+                ctx->cpu_sched_set = false;
+                return 0;
+        }
+
+        policy = sched_policy_from_string(rvalue);
+        if (policy < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, policy, "Failed to parse CPU scheduling policy, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        ctx->cpu_sched_set = true;
+        ctx->cpu_sched_policy = policy;
+        /* Switching between real-time and other policies changes the valid range, so re-clamp the priority */
+        ctx->cpu_sched_priority = CLAMP(ctx->cpu_sched_priority, sched_get_priority_min(policy), sched_get_priority_max(policy));
+
+        return 0;
+}
+
+int config_parse_exec_mount_apivfs(const char *unit,
+                                   const char *filename,
+                                   unsigned line,
+                                   const char *section,
+                                   unsigned section_line,
+                                   const char *lvalue,
+                                   int ltype,
+                                   const char *rvalue,
+                                   void *data,
+                                   void *userdata) {
+        /* Stores a boolean in mount_apivfs and records in mount_apivfs_set that it was given explicitly. Always returns 0. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        int enabled;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->mount_apivfs = false;
+                ctx->mount_apivfs_set = false;
+                return 0;
+        }
+
+        enabled = parse_boolean(rvalue);
+        if (enabled < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, enabled,
+                           "Failed to parse boolean value, ignoring: %s",
+                           rvalue);
+                return 0;
+        }
+
+        ctx->mount_apivfs = enabled;
+        ctx->mount_apivfs_set = true;
+        return 0;
+}
+
+int config_parse_numa_mask(const char *unit,
+                           const char *filename,
+                           unsigned line,
+                           const char *section,
+                           unsigned section_line,
+                           const char *lvalue,
+                           int ltype,
+                           const char *rvalue,
+                           void *data,
+                           void *userdata) {
+        NUMAPolicy *policy = ASSERT_PTR(data);
+        int ret;
+        /* Fills policy->nodes either from the keyword "all" or from a node list. Failures are logged; always returns 0. */
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (!streq(rvalue, "all")) {
+                ret = parse_cpu_set_extend(rvalue, &policy->nodes, true, unit, filename, line, lvalue);
+                if (ret < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, ret, "Failed to parse NUMA node mask, ignoring: %s", rvalue);
+        } else {
+                ret = numa_mask_add_all(&policy->nodes);
+                if (ret < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, ret,
+                                   "Failed to create NUMA mask representing \"all\" NUMA nodes, ignoring: %m");
+        }
+
+        return 0;
+}
+
+int config_parse_exec_cpu_sched_prio(const char *unit,
+                                     const char *filename,
+                                     unsigned line,
+                                     const char *section,
+                                     unsigned section_line,
+                                     const char *lvalue,
+                                     int ltype,
+                                     const char *rvalue,
+                                     void *data,
+                                     void *userdata) {
+        /* Records an explicit CPU scheduling priority; only the generic bounds 0..99 are enforced. Always returns 0. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        int prio, ret;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        ret = safe_atoi(rvalue, &prio);
+        if (ret < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, ret, "Failed to parse CPU scheduling priority, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* Linux allows 1..99 for RR/FIFO and only 0 for OTHER/BATCH. The policy may still be set by a later
+         * line, hence just the outer bounds are validated at this point. */
+        if (prio > 99 || prio < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "CPU scheduling priority is out of range, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        ctx->cpu_sched_set = true;
+        ctx->cpu_sched_priority = prio;
+
+        return 0;
+}
+
+int config_parse_root_image_options(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Parses "partition:options" pairs from rvalue into MountOptions entries on c->root_image_options. Returns 0, or log_oom(). */
+        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+        _cleanup_strv_free_ char **l = NULL;
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                c->root_image_options = mount_options_free_all(c->root_image_options);
+                return 0;
+        }
+        /* Split rvalue into string pairs: -ENOMEM is returned to the caller, any other error only drops this line. */
+        r = strv_split_colon_pairs(&l, rvalue);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        STRV_FOREACH_PAIR(first, second, l) {
+                MountOptions *o = NULL;
+                _cleanup_free_ char *mount_options_resolved = NULL;
+                const char *mount_options = NULL, *partition = "root";
+                PartitionDesignator partition_designator;
+
+                /* Format is either 'root:foo' or 'foo' (root is implied) */
+                if (!isempty(*second)) {
+                        partition = *first;
+                        mount_options = *second;
+                } else
+                        mount_options = *first;
+
+                partition_designator = partition_designator_from_string(partition);
+                if (partition_designator < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, partition_designator,
+                                   "Invalid partition name %s, ignoring", partition);
+                        continue; /* only this pair is skipped, the remaining pairs are still processed */
+                }
+                r = unit_full_printf(u, mount_options, &mount_options_resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+                        continue;
+                }
+                /* The resolved string is handed over to the new entry via TAKE_PTR(), the entry itself to the local list. */
+                o = new(MountOptions, 1);
+                if (!o)
+                        return log_oom();
+                *o = (MountOptions) {
+                        .partition_designator = partition_designator,
+                        .options = TAKE_PTR(mount_options_resolved),
+                };
+                LIST_APPEND(mount_options, options, TAKE_PTR(o));
+        }
+        /* NOTE(review): the else branch is also taken when every pair was rejected above, not only for blank input. */
+        if (options)
+                LIST_JOIN(mount_options, c->root_image_options, options);
+        else
+                /* empty spaces/separators only */
+                c->root_image_options = mount_options_free_all(c->root_image_options);
+
+        return 0;
+}
+
+int config_parse_exec_root_hash(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Stores a root hash given as an absolute path or as inline hex data. Returns 0, or log_oom() if the path cannot be duplicated. */
+        _cleanup_free_ void *roothash_decoded = NULL;
+        ExecContext *c = ASSERT_PTR(data);
+        size_t roothash_decoded_size = 0;
+        int r;
+
+        assert(filename);
+        assert(line);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Reset if the empty string is assigned */
+                c->root_hash_path = mfree(c->root_hash_path);
+                c->root_hash = mfree(c->root_hash);
+                c->root_hash_size = 0;
+                return 0;
+        }
+
+        if (path_is_absolute(rvalue)) {
+                /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */
+                _cleanup_free_ char *p = NULL;
+
+                p = strdup(rvalue);
+                if (!p)
+                        return log_oom(); /* log the failure, like the other parsers in this file do */
+
+                free_and_replace(c->root_hash_path, p);
+                c->root_hash = mfree(c->root_hash);
+                c->root_hash_size = 0;
+                return 0;
+        }
+
+        /* We have a roothash to decode, eg: RootHash=012345789abcdef */
+        r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHash=, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (roothash_decoded_size < sizeof(sd_id128_t)) { /* anything shorter than an sd_id128_t is rejected */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "RootHash= is too short, ignoring: %s", rvalue);
+                return 0;
+        }
+        /* The inline form replaces the path form: store the decoded bytes and drop any previously set path. */
+        free_and_replace(c->root_hash, roothash_decoded);
+        c->root_hash_size = roothash_decoded_size;
+        c->root_hash_path = mfree(c->root_hash_path);
+
+        return 0;
+}
+
+int config_parse_exec_root_hash_sig(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Stores a root hash signature given as an absolute path or as "base64:"-prefixed inline data. Returns 0, or log_oom(). */
+        ExecContext *ctx = ASSERT_PTR(data);
+        _cleanup_free_ void *sig = NULL;
+        size_t sig_size = 0;
+        char *encoded;
+        int ret;
+
+        assert(filename);
+        assert(line);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops both the path and the inline signature */
+                ctx->root_hash_sig = mfree(ctx->root_hash_sig);
+                ctx->root_hash_sig_size = 0;
+                ctx->root_hash_sig_path = mfree(ctx->root_hash_sig_path);
+                return 0;
+        }
+
+        if (path_is_absolute(rvalue)) {
+                /* Path form, eg: RootHashSignature=/foo/bar.roothash.p7s; only the path is recorded here */
+                _cleanup_free_ char *path_copy = strdup(rvalue);
+
+                if (!path_copy)
+                        return log_oom();
+
+                ctx->root_hash_sig = mfree(ctx->root_hash_sig);
+                ctx->root_hash_sig_size = 0;
+                free_and_replace(ctx->root_hash_sig_path, path_copy);
+                return 0;
+        }
+
+        encoded = startswith(rvalue, "base64:");
+        if (!encoded) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Failed to decode RootHashSignature=, not a path but doesn't start with 'base64:', ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* Inline form, eg: RootHashSignature=base64:012345789abcdef */
+        ret = unbase64mem(encoded, strlen(encoded), &sig, &sig_size);
+        if (ret < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, ret, "Failed to decode RootHashSignature=, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        ctx->root_hash_sig_path = mfree(ctx->root_hash_sig_path);
+        free_and_replace(ctx->root_hash_sig, sig);
+        ctx->root_hash_sig_size = sig_size;
+
+        return 0;
+}
+
+int config_parse_exec_cpu_affinity(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Either marks the affinity as derived from NUMA ("numa") or extends the explicit CPU set. Always returns 0. */
+        _cleanup_free_ char *expanded = NULL;
+        ExecContext *ctx = ASSERT_PTR(data);
+        const Unit *owner = userdata;
+        int ret;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (streq(rvalue, "numa")) {
+                cpu_set_reset(&ctx->cpu_set);
+                ctx->cpu_affinity_from_numa = true;
+
+                return 0;
+        }
+
+        ret = unit_full_printf(owner, rvalue, &expanded);
+        if (ret < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, ret,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m",
+                           rvalue);
+                return 0;
+        }
+        /* The NUMA flag is only cleared once an explicit set has been parsed successfully. */
+        ret = parse_cpu_set_extend(expanded, &ctx->cpu_set, true, unit, filename, line, lvalue);
+        if (ret >= 0)
+                ctx->cpu_affinity_from_numa = false;
+
+        return 0;
+}
+
+int config_parse_capability_set(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Parses a capability list (optionally inverted with '~') and replaces or merges it into the uint64_t mask at data. Always returns 0. */
+        uint64_t *capability_set = ASSERT_PTR(data);
+        uint64_t sum = 0, def;
+        bool invert = false;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (rvalue[0] == '~') {
+                invert = true;
+                rvalue++;
+        }
+
+        /* "def" is the value the mask holds while unassigned: CAP_MASK_UNSET for the bounding set, all bits off otherwise */
+        if (streq(lvalue, "CapabilityBoundingSet"))
+                def = CAP_MASK_UNSET;
+        else
+                def = 0;
+
+        r = capability_set_from_string(rvalue, &sum);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= specifier '%s', ignoring: %m", lvalue, rvalue);
+                return 0;
+        }
+
+        if (sum == 0 || *capability_set == def)
+                /* "", "~" or uninitialized data -> replace */
+                *capability_set = invert ? ~sum : sum;
+        else {
+                /* previous data -> merge */
+                if (invert)
+                        *capability_set &= ~sum;
+                else
+                        *capability_set |= sum;
+        }
+
+        return 0;
+}
+
+int config_parse_exec_selinux_context(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Stores the specifier-expanded SELinux context plus its "ignore" flag. Returns 0, or -ENOEXEC on a non-ignorable expansion failure. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        const Unit *owner = userdata;
+        char *expanded;
+        bool ignore;
+        int ret;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->selinux_context_ignore = false;
+                ctx->selinux_context = mfree(ctx->selinux_context);
+                return 0;
+        }
+
+        /* A leading '-' is stripped and remembered: it sets the ignore flag and turns a
+         * specifier expansion failure below into a warning instead of an error. */
+        ignore = rvalue[0] == '-';
+        if (ignore)
+                rvalue++;
+
+        ret = unit_full_printf(owner, rvalue, &expanded);
+        if (ret < 0) {
+                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, ret,
+                           "Failed to resolve unit specifiers in '%s'%s: %m",
+                           rvalue, ignore ? ", ignoring" : "");
+                return ignore ? 0 : -ENOEXEC;
+        }
+
+        ctx->selinux_context_ignore = ignore;
+        free_and_replace(ctx->selinux_context, expanded);
+
+        return 0;
+}
+
+int config_parse_exec_apparmor_profile(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Stores the specifier-expanded AppArmor profile plus its "ignore" flag. Returns 0, or -ENOEXEC on a non-ignorable expansion failure. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        const Unit *owner = userdata;
+        char *expanded;
+        bool ignore;
+        int ret;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->apparmor_profile_ignore = false;
+                ctx->apparmor_profile = mfree(ctx->apparmor_profile);
+                return 0;
+        }
+
+        /* A leading '-' is stripped and remembered: it sets the ignore flag and turns a
+         * specifier expansion failure below into a warning instead of an error. */
+        ignore = rvalue[0] == '-';
+        if (ignore)
+                rvalue++;
+
+        ret = unit_full_printf(owner, rvalue, &expanded);
+        if (ret < 0) {
+                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, ret,
+                           "Failed to resolve unit specifiers in '%s'%s: %m",
+                           rvalue, ignore ? ", ignoring" : "");
+                return ignore ? 0 : -ENOEXEC;
+        }
+
+        ctx->apparmor_profile_ignore = ignore;
+        free_and_replace(ctx->apparmor_profile, expanded);
+
+        return 0;
+}
+
+int config_parse_exec_smack_process_label(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        /* Stores the specifier-expanded SMACK process label plus its "ignore" flag. Returns 0, or -ENOEXEC on a non-ignorable expansion failure. */
+        ExecContext *ctx = ASSERT_PTR(data);
+        const Unit *owner = userdata;
+        char *expanded;
+        bool ignore;
+        int ret;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                ctx->smack_process_label_ignore = false;
+                ctx->smack_process_label = mfree(ctx->smack_process_label);
+                return 0;
+        }
+
+        /* A leading '-' is stripped and remembered: it sets the ignore flag and turns a
+         * specifier expansion failure below into a warning instead of an error. */
+        ignore = rvalue[0] == '-';
+        if (ignore)
+                rvalue++;
+
+        ret = unit_full_printf(owner, rvalue, &expanded);
+        if (ret < 0) {
+                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, ret,
+                           "Failed to resolve unit specifiers in '%s'%s: %m",
+                           rvalue, ignore ? ", ignoring" : "");
+                return ignore ? 0 : -ENOEXEC;
+        }
+
+        ctx->smack_process_label_ignore = ignore;
+        free_and_replace(ctx->smack_process_label, expanded);
+
+        return 0;
+}
+
+int config_parse_timer(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Adds one TimerValue to the Timer in 'data'; ltype is stored as its base, rvalue is the expression. */
+        _cleanup_(calendar_spec_freep) CalendarSpec *spec = NULL;
+        _cleanup_free_ char *expanded = NULL;
+        Timer *timer = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        TimerValue *tv;
+        usec_t usec = 0;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops all values collected so far */
+                timer_free_values(timer);
+                return 0;
+        }
+
+        r = unit_full_printf(u, rvalue, &expanded);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        /* TIMER_CALENDAR values are parsed into a CalendarSpec, every other base into a time span */
+        if (ltype != TIMER_CALENDAR) {
+                r = parse_sec(expanded, &usec);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", expanded);
+                        return 0;
+                }
+        } else {
+                r = calendar_spec_from_string(expanded, &spec);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse calendar specification, ignoring: %s", expanded);
+                        return 0;
+                }
+        }
+
+        tv = new(TimerValue, 1);
+        if (!tv)
+                return log_oom();
+
+        *tv = (TimerValue) {
+                .base = ltype,
+                .value = usec,
+                .calendar_spec = TAKE_PTR(spec),
+        };
+        LIST_PREPEND(value, timer->values, tv);
+        return 0;
+}
+
+int config_parse_trigger_unit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Adds UNIT_BEFORE and UNIT_TRIGGERS dependencies from the unit in 'data' to the unit named by rvalue. */
+        _cleanup_free_ char *name = NULL;
+        Unit *u = ASSERT_PTR(data);
+        UnitType type;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* At most one trigger target is accepted, any further assignment is ignored */
+        if (UNIT_TRIGGER(u)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Multiple units to trigger specified, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = unit_name_printf(u, rvalue, &name);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        type = unit_name_to_type(name);
+        if (type < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, type, "Unit type not valid, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (unit_has_name(u, name)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Units cannot trigger themselves, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_TRIGGERS, name, true, UNIT_DEPENDENCY_FILE);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add trigger on %s, ignoring: %m", name);
+
+        return 0;
+}
+
+int config_parse_path_spec(const char *unit,
+                           const char *filename,
+                           unsigned line,
+                           const char *section,
+                           unsigned section_line,
+                           const char *lvalue,
+                           int ltype,
+                           const char *rvalue,
+                           void *data,
+                           void *userdata) {
+
+        /* Adds a PathSpec for the path in rvalue, typed after lvalue, to the spec list of the Path in 'data'. */
+        _cleanup_free_ char *resolved = NULL;
+        Path *path = ASSERT_PTR(data);
+        PathSpec *ps;
+        PathType type;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment discards every spec collected so far */
+                path_free_specs(path);
+                return 0;
+        }
+
+        type = path_type_from_string(lvalue);
+        if (type < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, type, "Failed to parse path type, ignoring: %s", lvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(UNIT(path), rvalue, &resolved);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        /* Run the expanded path through the PATH_CHECK_ABSOLUTE check; a rejected path is skipped */
+        r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        ps = new0(PathSpec, 1);
+        if (!ps)
+                return log_oom();
+
+        ps->unit = UNIT(path);
+        ps->path = TAKE_PTR(resolved);
+        ps->type = type;
+        ps->inotify_fd = -EBADF;
+        LIST_PREPEND(spec, path->specs, ps);
+        return 0;
+}
+
+int config_parse_socket_service(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Loads the ".service" unit named by rvalue and references it from the Socket passed in 'data'. */
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *name = NULL;
+        Socket *sock = ASSERT_PTR(data);
+        Unit *loaded;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_name_printf(UNIT(sock), rvalue, &name);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (!endswith(name, ".service")) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type service, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = manager_load_unit(UNIT(sock)->manager, name, NULL, &error, &loaded);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load unit %s, ignoring: %s", rvalue, bus_error_message(&error, r));
+                return 0;
+        }
+
+        unit_ref_set(&sock->service, UNIT(sock), loaded);
+        return 0;
+}
+
+int config_parse_fdname(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Sets the fdname of the Socket in 'data' to the specifier-expanded rvalue, if it validates. */
+        _cleanup_free_ char *name = NULL;
+        Socket *sock = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                sock->fdname = mfree(sock->fdname);
+                return 0;
+        }
+
+        r = unit_fd_printf(UNIT(sock), rvalue, &name);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (!fdname_is_valid(name)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", name);
+                return 0;
+        }
+        return free_and_replace(sock->fdname, name);
+}
+
+int config_parse_service_sockets(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Adds dependencies from the Service in 'data' to every ".socket" unit listed in rvalue. */
+        Service *service = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *name = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r == 0)
+                        return 0;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Trailing garbage in sockets, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                r = unit_name_printf(UNIT(service), word, &name);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                        continue;
+                }
+
+                if (!endswith(name, ".socket")) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type socket, ignoring: %s", name);
+                        continue;
+                }
+
+                r = unit_add_two_dependencies_by_name(UNIT(service), UNIT_WANTS, UNIT_AFTER, name, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", name);
+                r = unit_add_dependency_by_name(UNIT(service), UNIT_TRIGGERED_BY, name, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", name);
+        }
+}
+
+int config_parse_bus_name(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Expands specifiers in rvalue, validates it as bus name and passes it on to config_parse_string(). */
+        const Unit *u = ASSERT_PTR(userdata);
+        _cleanup_free_ char *name = NULL;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_full_printf_full(u, rvalue, SD_BUS_MAXIMUM_NAME_LENGTH, &name);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (!sd_bus_service_name_is_valid(name)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid bus name, ignoring: %s", name);
+                return 0;
+        }
+        return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, name, data, userdata);
+}
+
+int config_parse_service_timeout(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Service *service = ASSERT_PTR(userdata);
+        usec_t timeout;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Serves two settings, TimeoutSec= and TimeoutStartSec=; lvalue tells them apart further down. */
+
+        /* Historically a value of 0 switched these timeouts off. As 0 reads like "expire right away", the
+         * value goes through parse_sec_fix_0() so that it becomes USEC_INFINITY instead, which matches how
+         * all other timeouts are handled internally. */
+        r = parse_sec_fix_0(rvalue, &timeout);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        /* Either setting defines the start timeout; only TimeoutSec= sets the stop timeout as well */
+        service->timeout_start_usec = timeout;
+        service->start_timeout_defined = true;
+        if (streq(lvalue, "TimeoutSec"))
+                service->timeout_stop_usec = timeout;
+
+        return 0;
+}
+
+int config_parse_timeout_abort(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        usec_t *timeout = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Besides writing the value to 'data', the return value carries extra information: 0 means "not
+         * set", 1 means "set"; on a parse failure the result of log_syntax() is handed back. */
+        if (isempty(rvalue)) {
+                *timeout = 0;
+                return 0;
+        }
+
+        r = parse_sec(rvalue, timeout);
+        if (r < 0)
+                return log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= setting, ignoring: %s", lvalue, rvalue);
+
+        return 1;
+}
+
+int config_parse_service_timeout_abort(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Lets config_parse_timeout_abort() fill timeout_abort_usec and records its non-negative result. */
+        Service *service = ASSERT_PTR(userdata);
+        int is_set = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype,
+                                                rvalue, &service->timeout_abort_usec, service);
+
+        if (is_set >= 0)
+                service->timeout_abort_set = is_set;
+        return 0;
+}
+
+int config_parse_user_group_compat(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Stores the user/group name (or numeric ID) from rvalue in the string pointed to by 'data'. */
+        const Unit *u = ASSERT_PTR(userdata);
+        _cleanup_free_ char *name = NULL;
+        char **target = data;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        r = unit_full_printf(u, rvalue, &name);
+        if (r < 0) {
+                log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue);
+                return -ENOEXEC;
+        }
+
+        if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) {
+                log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", name);
+                return -ENOEXEC;
+        }
+
+        /* Settings whose name contains "User" get a notice when set to NOBODY_USER_NAME; it is still applied */
+        if (strstr(lvalue, "User") && streq(name, NOBODY_USER_NAME))
+                log_struct(LOG_NOTICE,
+                           "MESSAGE=%s:%u: Special user %s configured, this is not safe!", filename, line, name,
+                           "UNIT=%s", unit,
+                           "MESSAGE_ID=" SD_MESSAGE_NOBODY_USER_UNSUITABLE_STR,
+                           "OFFENDING_USER=%s", name,
+                           "CONFIG_FILE=%s", filename, "CONFIG_LINE=%u", line);
+        return free_and_replace(*target, name);
+}
+
+int config_parse_user_group_strv_compat(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Appends every user/group name listed in rvalue to the string list pointed to by 'data'. */
+        const Unit *u = ASSERT_PTR(userdata);
+        char ***list = data;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *list = strv_free(*list);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *name = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r == 0)
+                        return 0;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax: %s", rvalue);
+                        return -ENOEXEC;
+                }
+
+                r = unit_full_printf(u, word, &name);
+                if (r < 0) {
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", word);
+                        return -ENOEXEC;
+                }
+
+                if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) {
+                        log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", name);
+                        return -ENOEXEC;
+                }
+
+                r = strv_push(list, name);
+                if (r < 0)
+                        return log_oom();
+                name = NULL; /* pushed successfully, so keep the cleanup handler from freeing it */
+        }
+}
+
+int config_parse_working_directory(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Sets the working directory fields of the ExecContext in 'data': either a path or the "~" marker. */
+        ExecContext *context = ASSERT_PTR(data);
+        const Unit *u = ASSERT_PTR(userdata);
+        bool missing_ok;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                context->working_directory = mfree(context->working_directory);
+                context->working_directory_home = false;
+                return 0;
+        }
+
+        /* A leading '-' is stripped and ends up in working_directory_missing_ok */
+        missing_ok = rvalue[0] == '-';
+        if (missing_ok)
+                rvalue++;
+
+        if (!streq(rvalue, "~")) {
+                _cleanup_free_ char *path = NULL;
+
+                r = unit_path_printf(u, rvalue, &path);
+                if (r < 0) {
+                        log_syntax(unit, missing_ok ? LOG_WARNING : LOG_ERR, filename, line, r,
+                                   "Failed to resolve unit specifiers in working directory path '%s'%s: %m",
+                                   rvalue, missing_ok ? ", ignoring" : "");
+                        return missing_ok ? 0 : -ENOEXEC;
+                }
+
+                r = path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue);
+                if (r < 0)
+                        return missing_ok ? 0 : -ENOEXEC;
+
+                context->working_directory_home = false;
+                free_and_replace(context->working_directory, path);
+        } else {
+                context->working_directory_home = true;
+                context->working_directory = mfree(context->working_directory);
+        }
+
+        context->working_directory_missing_ok = missing_ok;
+        return 0;
+}
+
+int config_parse_unit_env_file(const char *unit,
+                               const char *filename,
+                               unsigned line,
+                               const char *section,
+                               unsigned section_line,
+                               const char *lvalue,
+                               int ltype,
+                               const char *rvalue,
+                               void *data,
+                               void *userdata) {
+
+        /* Appends the specifier-expanded rvalue as one entry to the string list pointed to by 'data'. */
+        _cleanup_free_ char *entry = NULL;
+        char ***files = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment releases the whole list */
+                *files = strv_free(*files);
+                return 0;
+        }
+
+        r = unit_full_printf_full(u, rvalue, PATH_MAX, &entry);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        /* A leading '-' remains part of the stored entry; only what follows it is checked as a path */
+        r = path_simplify_and_warn(entry[0] == '-' ? entry + 1 : entry, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        r = strv_push(files, entry);
+        if (r < 0)
+                return log_oom();
+        entry = NULL; /* pushed successfully, so keep the cleanup handler from freeing it */
+        return 0;
+}
+
+int config_parse_environ(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Merges every environment assignment listed in rvalue into the string list pointed to by 'data'. */
+        const Unit *u = userdata;
+        char ***env = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *env = strv_free(*env);
+                return 0;
+        }
+
+        /* If 'u' is set, we operate on the regular unit specifier table. Otherwise we use a manager-specific
+         * specifier table (in which case ltype must contain the runtime scope). */
+        const Specifier *table = u ? NULL : (const Specifier[]) {
+                COMMON_SYSTEM_SPECIFIERS,
+                COMMON_TMP_SPECIFIERS,
+                COMMON_CREDS_SPECIFIERS(ltype),
+                { 'h', specifier_user_home,  NULL },
+                { 's', specifier_user_shell, NULL },
+                {} /* end-of-table marker: without it an unknown specifier is looked up past the array */
+        };
+
+        for (const char *p = rvalue;; ) {
+                _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+
+                if (table)
+                        r = specifier_printf(word, sc_arg_max(), table, NULL, NULL, &resolved);
+                else
+                        r = unit_env_printf(u, word, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve specifiers in %s, ignoring: %m", word);
+                        continue;
+                }
+
+                if (!env_assignment_is_valid(resolved)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid environment assignment, ignoring: %s", resolved);
+                        continue;
+                }
+
+                r = strv_env_replace_consume(env, TAKE_PTR(resolved));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to update environment: %m");
+        }
+}
+
+int config_parse_pass_environ(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Appends the environment variable names listed in rvalue to the string list pointed to by 'data'. */
+        _cleanup_strv_free_ char **collected = NULL;
+        char ***passenv = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        size_t n_collected = 0;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment clears the list */
+                *passenv = strv_free(*passenv);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *name = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+                        break;
+                }
+
+                if (u) {
+                        r = unit_env_printf(u, word, &name);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve specifiers in %s, ignoring: %m", word);
+                                continue;
+                        }
+                } else
+                        name = TAKE_PTR(word);
+
+                if (!env_name_is_valid(name)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid environment name for %s, ignoring: %s", lvalue, name);
+                        continue;
+                }
+
+                if (!GREEDY_REALLOC(collected, n_collected + 2))
+                        return log_oom();
+                collected[n_collected++] = TAKE_PTR(name);
+                collected[n_collected] = NULL;
+        }
+
+        /* Names gathered before a syntax error ended the loop are still applied */
+        if (collected) {
+                r = strv_extend_strv(passenv, collected, true);
+                if (r < 0)
+                        return log_oom();
+        }
+        return 0;
+}
+
+int config_parse_unset_environ(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Appends the names or assignments listed in rvalue to the string list pointed to by 'data'. */
+        _cleanup_strv_free_ char **collected = NULL;
+        char ***list = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        size_t n_collected = 0;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment clears the list */
+                *list = strv_free(*list);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *item = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+                        break;
+                }
+
+                if (u) {
+                        r = unit_env_printf(u, word, &item);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve unit specifiers in %s, ignoring: %m", word);
+                                continue;
+                        }
+                } else
+                        item = TAKE_PTR(word);
+
+                if (!env_assignment_is_valid(item) && !env_name_is_valid(item)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid environment name or assignment %s, ignoring: %s", lvalue, item);
+                        continue;
+                }
+
+                if (!GREEDY_REALLOC(collected, n_collected + 2))
+                        return log_oom();
+                collected[n_collected++] = TAKE_PTR(item);
+                collected[n_collected] = NULL;
+        }
+
+        /* Items gathered before a syntax error ended the loop are still applied */
+        if (collected) {
+                r = strv_extend_strv(list, collected, true);
+                if (r < 0)
+                        return log_oom();
+        }
+        return 0;
+}
+
+int config_parse_log_extra_fields(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Appends each FIELD=value item listed in rvalue to log_extra_fields of the ExecContext in 'data'. */
+        ExecContext *context = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                exec_context_free_log_extra_fields(context);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *field = NULL;
+                struct iovec *grown;
+                const char *separator;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+                if (r == 0)
+                        return 0;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                r = unit_full_printf(u, word, &field);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", word);
+                        continue;
+                }
+
+                separator = strchr(field, '=');
+                if (!separator) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field lacks '=' character, ignoring: %s", field);
+                        continue;
+                }
+
+                if (!journal_field_valid(field, separator - field, false)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field name is invalid, ignoring: %s", field);
+                        continue;
+                }
+
+                grown = reallocarray(context->log_extra_fields, context->n_log_extra_fields + 1, sizeof(struct iovec));
+                if (!grown)
+                        return log_oom();
+
+                context->log_extra_fields = grown;
+                context->log_extra_fields[context->n_log_extra_fields++] = IOVEC_MAKE_STRING(field);
+                field = NULL; /* the array entry was built from this string, so it must not be auto-freed */
+        }
+}
+
+int config_parse_log_namespace(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Stores a log namespace name in the ExecContext passed as 'data'. An empty rvalue clears it; anything
+         * else is specifier-expanded and kept only if it passes log_namespace_name_valid(). Always returns 0. */
+        ExecContext *context = ASSERT_PTR(data);
+        _cleanup_free_ char *ns = NULL;
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                context->log_namespace = mfree(context->log_namespace);
+                return 0;
+        }
+
+        r = unit_full_printf_full(u, rvalue, NAME_MAX, &ns);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (log_namespace_name_valid(ns))
+                free_and_replace(context->log_namespace, ns);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Specified log namespace name is not valid, ignoring: %s", ns);
+        return 0;
+}
+
+int config_parse_unit_condition_path(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Prepends a path condition of type 'ltype' to the Condition list in 'data'. The path may carry a '|'
+         * and then a '!' prefix, is specifier-expanded and must be absolute. An empty rvalue resets the list. */
+        _cleanup_free_ char *path = NULL;
+        Condition **head = ASSERT_PTR(data), *cond;
+        const Unit *u = userdata;
+        bool trigger, negate;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops every condition collected so far */
+                *head = condition_free_list(*head);
+                return 0;
+        }
+
+        /* Strip the optional prefixes; they become the 'trigger' and 'negate' arguments of condition_new() */
+        trigger = *rvalue == '|';
+        if (trigger)
+                rvalue++;
+        negate = *rvalue == '!';
+        if (negate)
+                rvalue++;
+
+        r = unit_path_printf(u, rvalue, &path);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        cond = condition_new(ltype, path, trigger, negate);
+        if (!cond)
+                return log_oom();
+
+        LIST_PREPEND(conditions, *head, cond);
+        return 0;
+}
+
+int config_parse_unit_condition_string(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Prepends a string condition of type 'ltype' to the Condition list in 'data'. Optional '|' and '!'
+         * prefixes (each may be followed by whitespace) are stripped first. An empty rvalue resets the list. */
+        _cleanup_free_ char *expanded = NULL;
+        Condition **head = ASSERT_PTR(data), *cond;
+        const Unit *u = userdata;
+        bool trigger, negate;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops every condition collected so far */
+                *head = condition_free_list(*head);
+                return 0;
+        }
+
+        /* For each prefix found, step over it together with any whitespace that follows it */
+        trigger = rvalue[0] == '|';
+        if (trigger)
+                rvalue += strspn(rvalue + 1, WHITESPACE) + 1;
+        negate = rvalue[0] == '!';
+        if (negate)
+                rvalue += strspn(rvalue + 1, WHITESPACE) + 1;
+
+        r = unit_full_printf(u, rvalue, &expanded);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        cond = condition_new(ltype, expanded, trigger, negate);
+        if (!cond)
+                return log_oom();
+
+        LIST_PREPEND(conditions, *head, cond);
+        return 0;
+}
+
+int config_parse_unit_requires_mounts_for(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Hands every path listed in rvalue to unit_require_mounts_for(). Each word is unquoted, specifier-
+         * expanded and checked to be absolute; a word failing any step is skipped, the remaining ones still run. */
+        Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *path = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r <= 0) {
+                        /* Zero: nothing left to extract. Negative: malformed input. Parsing stops in both cases. */
+                        if (r < 0)
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                r = unit_path_printf(u, word, &path);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                        continue;
+                }
+
+                /* Entries rejected by the path check are skipped without further logging here */
+                if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                        continue;
+
+                /* A failure for one path is logged, then the loop simply moves on to the next word */
+                r = unit_require_mounts_for(u, path, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", path);
+        }
+}
+
+int config_parse_documentation(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses rvalue with config_parse_unit_strv_printf(), then prunes invalid URLs from u->documentation. */
+        Unit *u = ASSERT_PTR(userdata);
+        char **keep;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment clears the whole list */
+                u->documentation = strv_free(u->documentation);
+                return 0;
+        }
+
+        r = config_parse_unit_strv_printf(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+        if (r < 0)
+                return r;
+
+        /* Compact in place: 'keep' trails 'cur' and receives only valid entries, invalid ones are freed */
+        keep = u->documentation;
+        for (char **cur = keep; cur && *cur; cur++) {
+                if (!documentation_url_is_valid(*cur)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid URL, ignoring: %s", *cur);
+                        free(*cur);
+                        continue;
+                }
+                *keep++ = *cur;
+        }
+        if (keep)
+                *keep = NULL;
+        return 0;
+}
+
+#if HAVE_SECCOMP
+int config_parse_syscall_filter(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Feeds a list of system call entries into c->syscall_filter. An empty rvalue resets the filter, a
+         * leading '~' inverts the list. The first non-empty assignment creates the map and fixes
+         * c->syscall_allow_list; on an allow-list the "@default" entry is added up front. */
+        ExecContext *c = data;
+        _unused_ const Unit *u = ASSERT_PTR(userdata);
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops the filter and its list type */
+                c->syscall_filter = hashmap_free(c->syscall_filter);
+                c->syscall_allow_list = false;
+                return 0;
+        }
+
+        /* A leading '~' marks the listed entries as inverted */
+        invert = rvalue[0] == '~';
+        if (invert)
+                rvalue++;
+
+        if (!c->syscall_filter) {
+                c->syscall_filter = hashmap_new(NULL);
+                if (!c->syscall_filter)
+                        return log_oom();
+
+                /* Plain list: allow nothing but the listed calls. '~' list: allow everything but them. */
+                c->syscall_allow_list = !invert;
+                if (!invert) {
+                        /* Accept default syscalls if we are on an allow_list */
+                        r = seccomp_parse_syscall_filter(
+                                        "@default", -1, c->syscall_filter,
+                                        SECCOMP_PARSE_PERMISSIVE|SECCOMP_PARSE_ALLOW_LIST,
+                                        unit,
+                                        NULL, 0);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *name = NULL;
+                int num;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                /* Zero: nothing left to extract. Negative: malformed input. Parsing stops in both cases. */
+                if (r <= 0) {
+                        if (r < 0)
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                r = parse_syscall_and_errno(word, &name, &num);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse syscall:errno, ignoring: %s", word);
+                        continue;
+                }
+                /* An entry carrying a number (num >= 0) is only accepted when the list is inverted */
+                if (!invert && num >= 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Allow-listed system calls cannot take error number, ignoring: %s", word);
+                        continue;
+                }
+
+                /* Unlike the per-word problems above, a failure here ends parsing and is returned to the caller */
+                r = seccomp_parse_syscall_filter(
+                                name, num, c->syscall_filter,
+                                SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|
+                                (invert ? SECCOMP_PARSE_INVERT : 0)|
+                                (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+                                unit, filename, line);
+                if (r < 0)
+                        return r;
+        }
+}
+
+int config_parse_syscall_log(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Feeds a list of system call names into c->syscall_log. An empty rvalue resets the map, a leading
+         * '~' inverts the list. The first non-empty assignment creates the map and fixes
+         * c->syscall_log_allow_list. Every word is passed to seccomp_parse_syscall_filter() with an error
+         * number of -1. */
+        ExecContext *c = data;
+        _unused_ const Unit *u = ASSERT_PTR(userdata);
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops the map and its list type */
+                c->syscall_log = hashmap_free(c->syscall_log);
+                c->syscall_log_allow_list = false;
+                return 0;
+        }
+
+        /* A leading '~' marks the listed entries as inverted */
+        invert = rvalue[0] == '~';
+        if (invert)
+                rvalue++;
+
+        if (!c->syscall_log) {
+                c->syscall_log = hashmap_new(NULL);
+                if (!c->syscall_log)
+                        return log_oom();
+
+                /* Plain list: log nothing but the listed calls. '~' list: log everything but them. */
+                c->syscall_log_allow_list = !invert;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                /* Zero: nothing left to extract. Negative: malformed input. Parsing stops in both cases. */
+                if (r <= 0) {
+                        if (r < 0)
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                /* A failure here ends parsing and is returned to the caller */
+                r = seccomp_parse_syscall_filter(
+                                word, -1, c->syscall_log,
+                                SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|
+                                (invert ? SECCOMP_PARSE_INVERT : 0)|
+                                (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+                                unit, filename, line);
+                if (r < 0)
+                        return r;
+        }
+}
+
+int config_parse_syscall_archs(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Adds each architecture named in rvalue to the Set that 'data' points to; an empty rvalue frees the
+         * set. Names seccomp_arch_from_string() cannot translate are logged and skipped. */
+        Set **archs = data;
+        int r;
+
+        if (isempty(rvalue)) {
+                *archs = set_free(*archs);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+                uint32_t arch;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r <= 0) {
+                        if (r < 0)
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                r = seccomp_arch_from_string(word, &arch);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse system call architecture \"%s\", ignoring: %m", word);
+                        continue;
+                }
+
+                /* Stored as arch + 1, presumably so that an architecture value of 0 never becomes a NULL entry */
+                r = set_ensure_put(archs, NULL, UINT32_TO_PTR(arch + 1));
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+int config_parse_syscall_errno(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Sets c->syscall_errno from rvalue: empty or "kill" selects SECCOMP_ERROR_NUMBER_KILL, otherwise
+         * the value is run through parse_errno() and must be positive. Invalid input is logged; returns 0. */
+        ExecContext *c = data;
+        int errnum;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue) || streq(rvalue, "kill")) {
+                /* Both the empty string and the literal "kill" select the kill value */
+                c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
+                return 0;
+        }
+
+        errnum = parse_errno(rvalue);
+        if (errnum > 0)
+                c->syscall_errno = errnum;
+        else if (errnum < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, errnum, "Failed to parse error number, ignoring: %s", rvalue);
+        else
+                /* Parsed fine, but zero is not accepted as an error number */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid error number, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+int config_parse_address_families(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Maintains c->address_families and c->address_families_allow_list from a list of address family
+         * names. Accepts the special values "" and "none" as well as a leading '~' that inverts the list.
+         * Names af_from_name() does not know are logged and skipped. */
+        ExecContext *c = data;
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Both special values free the set. "" leaves the allow-list flag off (plain reset), while "none"
+         * turns it on, which together with the now empty set forbids all address families. */
+        if (isempty(rvalue) || streq(rvalue, "none")) {
+                c->address_families = set_free(c->address_families);
+                c->address_families_allow_list = !isempty(rvalue);
+                return 0;
+        }
+
+        /* A leading '~' marks the listed entries as inverted */
+        invert = rvalue[0] == '~';
+        if (invert)
+                rvalue++;
+
+        if (!c->address_families) {
+                c->address_families = set_new(NULL);
+                if (!c->address_families)
+                        return log_oom();
+
+                /* The assignment that creates the set also decides whether it is an allow-list */
+                c->address_families_allow_list = !invert;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+                int af;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                /* Zero: nothing left to extract. Negative: malformed input. Parsing stops in both cases. */
+                if (r <= 0) {
+                        if (r < 0)
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                af = af_from_name(word);
+                if (af < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, af,
+                                   "Failed to parse address family, ignoring: %s", word);
+                        continue;
+                }
+
+                /* Compare the polarity of this entry with that of the existing list: when they differ
+                 * (e.g. a '~' entry applied to an allow-list) the family is taken out of the set again,
+                 * when they agree it is added to it. */
+                if (invert == c->address_families_allow_list) {
+                        set_remove(c->address_families, INT_TO_PTR(af));
+                        continue;
+                }
+
+                r = set_put(c->address_families, INT_TO_PTR(af));
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+int config_parse_restrict_namespaces(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Updates the c->restrict_namespaces mask from either a boolean or a (possibly '~'-inverted) list of
+         * namespace types; an empty rvalue restores NAMESPACE_FLAGS_INITIAL. Always returns 0. */
+        ExecContext *c = data;
+        unsigned long flags;
+        bool invert;
+        int r;
+
+        if (isempty(rvalue)) {
+                /* Back to the default value */
+                c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
+                return 0;
+        }
+
+        /* A boolean replaces any earlier setting: true stores an empty mask, false stores NAMESPACE_FLAGS_ALL */
+        r = parse_boolean(rvalue);
+        if (r >= 0) {
+                c->restrict_namespaces = r > 0 ? 0 : NAMESPACE_FLAGS_ALL;
+                return 0;
+        }
+
+        /* Not a boolean, so treat it as a list of namespace types, optionally inverted by a leading '~' */
+        invert = rvalue[0] == '~';
+        if (invert)
+                rvalue++;
+
+        r = namespace_flags_from_string(rvalue, &flags);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        if (c->restrict_namespaces != NAMESPACE_FLAGS_INITIAL)
+                /* Something was configured before: set (plain list) or clear ('~' list) the given bits */
+                SET_FLAG(c->restrict_namespaces, flags, !invert);
+        else if (invert)
+                /* First assignment, inverted: everything in NAMESPACE_FLAGS_ALL except the listed bits */
+                c->restrict_namespaces = (~flags) & NAMESPACE_FLAGS_ALL;
+        else
+                c->restrict_namespaces = flags;
+
+        return 0;
+}
+#endif
+
+int config_parse_restrict_filesystems(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Feeds a list of file system names into c->restrict_filesystems via lsm_bpf_parse_filesystem().
+         * An empty rvalue resets the set and the allow-list flag, a leading '~' inverts the list. Returns
+         * 0 once the list is consumed (or found malformed), log_oom() on -ENOMEM, or the negative result
+         * of lsm_bpf_parse_filesystem(). */
+        ExecContext *c = ASSERT_PTR(data);
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops the set and its list type */
+                c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+                c->restrict_filesystems_allow_list = false;
+                return 0;
+        }
+
+        /* A leading '~' marks the listed entries as inverted */
+        invert = rvalue[0] == '~';
+        if (invert)
+                rvalue++;
+
+        if (!c->restrict_filesystems) {
+                /* No set yet. Plain list: allow nothing but the listed ones. '~' list: allow all but them. */
+                c->restrict_filesystems_allow_list = !invert;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                /* Zero: nothing left to extract. Negative: malformed input. Parsing stops in both cases. */
+                if (r <= 0) {
+                        if (r < 0)
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+                        return 0;
+                }
+
+                /* A failure here ends parsing and is returned to the caller */
+                r = lsm_bpf_parse_filesystem(
+                              word,
+                              &c->restrict_filesystems,
+                              FILESYSTEM_PARSE_LOG|
+                              (invert ? FILESYSTEM_PARSE_INVERT : 0)|
+                              (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
+                              unit, filename, line);
+
+                if (r < 0)
+                        return r;
+        }
+}
+
+int config_parse_unit_slice(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Expands rvalue into a unit name, loads that unit through the manager and assigns it to 'u' with
+         * unit_set_slice(). A failure at any of the three steps is logged; the function always returns 0. */
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *slice_name = NULL;
+        Unit *u = userdata, *slice;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(u);
+
+        r = unit_name_printf(u, rvalue, &slice_name);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        r = manager_load_unit(u->manager, slice_name, NULL, &error, &slice);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load slice unit %s, ignoring: %s", slice_name, bus_error_message(&error, r));
+                return 0;
+        }
+
+        r = unit_set_slice(u, slice);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to assign slice %s to unit %s, ignoring: %m", slice->id, u->id);
+
+        return 0;
+}
+
+int config_parse_cpu_quota(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Sets c->cpu_quota_per_sec_usec: empty means USEC_INFINITY, otherwise the positive result of
+         * parse_permyriad_unbounded() is converted as value * USEC_PER_SEC / 10000. Always returns 0. */
+        CGroupContext *c = data;
+        int permyriad;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                c->cpu_quota_per_sec_usec = USEC_INFINITY;
+                return 0;
+        }
+
+        permyriad = parse_permyriad_unbounded(rvalue);
+        if (permyriad > 0)
+                c->cpu_quota_per_sec_usec = ((usec_t) permyriad * USEC_PER_SEC) / 10000U;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, permyriad, "Invalid CPU quota '%s', ignoring.", rvalue);
+        return 0;
+}
+
+int config_parse_allowed_cpuset(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Specifier-expands rvalue and passes the result to parse_cpu_set_extend() together with the CPUSet
+         * in 'data'. A failed expansion is logged; the function always returns 0. */
+        _cleanup_free_ char *expanded = NULL;
+        CPUSet *cpuset = data;
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_full_printf(u, rvalue, &expanded);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+        else
+                /* The parser's own result is deliberately discarded */
+                (void) parse_cpu_set_extend(expanded, cpuset, true, unit, filename, line, lvalue);
+
+        return 0;
+}
+
+int config_parse_memory_limit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Shared parser for the memory limit settings that are dispatched on 'lvalue' at the end of this
+         * function. An empty rvalue or "infinity" yields CGROUP_LIMIT_MAX, except that an empty rvalue for the
+         * settings in the first list below yields CGROUP_LIMIT_MIN. Unparsable or out-of-range values are
+         * logged and ignored (return 0); an lvalue not handled here returns -EINVAL.
+         * NOTE(review): "DefaultStartupMemoryLow" is absent from the empty-reset list below; confirm intent. */
+        CGroupContext *c = data;
+        uint64_t bytes = CGROUP_LIMIT_MAX;
+        int r;
+
+        if (isempty(rvalue) && STR_IN_SET(lvalue, "DefaultMemoryLow",
+                                                  "DefaultMemoryMin",
+                                                  "MemoryLow",
+                                                  "StartupMemoryLow",
+                                                  "MemoryMin"))
+                bytes = CGROUP_LIMIT_MIN;
+        else if (!isempty(rvalue) && !streq(rvalue, "infinity")) {
+                /* Try a permyriad value first (scaled by physical_memory_scale()), else fall back to parse_size() */
+                r = parse_permyriad(rvalue);
+                if (r < 0) {
+                        r = parse_size(rvalue, 1024, &bytes);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid memory limit '%s', ignoring: %m", rvalue);
+                                return 0;
+                        }
+                } else
+                        bytes = physical_memory_scale(r, 10000U);
+
+                /* UINT64_MAX is never accepted; zero is accepted only for the settings named here */
+                if (bytes >= UINT64_MAX ||
+                    (bytes <= 0 && !STR_IN_SET(lvalue,
+                                               "MemorySwapMax", "StartupMemorySwapMax",
+                                               "MemoryZSwapMax", "StartupMemoryZSwapMax",
+                                               "MemoryLow", "StartupMemoryLow", "MemoryMin",
+                                               "DefaultMemoryLow", "DefaultStartupMemoryLow", "DefaultMemoryMin"))) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Memory limit '%s' out of range, ignoring.", rvalue);
+                        return 0;
+                }
+        }
+
+        if (streq(lvalue, "DefaultMemoryLow")) {
+                c->default_memory_low = bytes;
+                c->default_memory_low_set = true;
+        } else if (streq(lvalue, "DefaultStartupMemoryLow")) {
+                c->default_startup_memory_low = bytes;
+                c->default_startup_memory_low_set = true;
+        } else if (streq(lvalue, "DefaultMemoryMin")) {
+                c->default_memory_min = bytes;
+                c->default_memory_min_set = true;
+        } else if (streq(lvalue, "MemoryMin")) {
+                c->memory_min = bytes;
+                c->memory_min_set = true;
+        } else if (streq(lvalue, "MemoryLow")) {
+                c->memory_low = bytes;
+                c->memory_low_set = true;
+        } else if (streq(lvalue, "StartupMemoryLow")) {
+                c->startup_memory_low = bytes;
+                c->startup_memory_low_set = true;
+        } else if (streq(lvalue, "MemoryHigh"))
+                c->memory_high = bytes;
+        else if (streq(lvalue, "StartupMemoryHigh")) {
+                c->startup_memory_high = bytes;
+                c->startup_memory_high_set = true;
+        } else if (streq(lvalue, "MemoryMax"))
+                c->memory_max = bytes;
+        else if (streq(lvalue, "StartupMemoryMax")) {
+                c->startup_memory_max = bytes;
+                c->startup_memory_max_set = true;
+        } else if (streq(lvalue, "MemorySwapMax"))
+                c->memory_swap_max = bytes;
+        else if (streq(lvalue, "StartupMemorySwapMax")) {
+                c->startup_memory_swap_max = bytes;
+                c->startup_memory_swap_max_set = true;
+        } else if (streq(lvalue, "MemoryZSwapMax"))
+                c->memory_zswap_max = bytes;
+        else if (streq(lvalue, "StartupMemoryZSwapMax")) {
+                c->startup_memory_zswap_max = bytes;
+                c->startup_memory_zswap_max_set = true;
+        } else if (streq(lvalue, "MemoryLimit")) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Unit uses MemoryLimit=; please use MemoryMax= instead. Support for MemoryLimit= will be removed soon.");
+                c->memory_limit = bytes;
+        } else
+                return -EINVAL;
+
+        return 0;
+}
+
+int config_parse_tasks_max(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Fills the CGroupTasksMax in 'data': empty takes the manager default (or UNSET without a unit),
+         * "infinity" is UNSET, else a permyriad value is tried before a plain number in 1..UINT64_MAX-1. */
+        CGroupTasksMax *tasks_max = data;
+        const Unit *u = userdata;
+        uint64_t absolute;
+        int r;
+
+        if (isempty(rvalue)) {
+                *tasks_max = u ? u->manager->defaults.tasks_max : CGROUP_TASKS_MAX_UNSET;
+                return 0;
+        }
+
+        if (streq(rvalue, "infinity")) {
+                *tasks_max = CGROUP_TASKS_MAX_UNSET;
+                return 0;
+        }
+
+        r = parse_permyriad(rvalue);
+        if (r >= 0) {
+                *tasks_max = (CGroupTasksMax) { r, 10000U }; /* permyriad value next to the constant 10000 */
+                return 0;
+        }
+
+        r = safe_atou64(rvalue, &absolute);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+        if (absolute == 0 || absolute == UINT64_MAX) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue);
+                return 0;
+        }
+        *tasks_max = (CGroupTasksMax) { absolute };
+        return 0;
+}
+
+/* Config parser for Delegate=. 'data' is the CGroupContext whose 'delegate' flag and
+ * 'delegate_controllers' mask are updated.
+ *
+ * The value is either empty, a boolean, or a list of controller names. Unit types whose vtable does not
+ * set can_delegate get a warning and the assignment is ignored. Unknown controller names are warned
+ * about and skipped individually; a syntax error in the list drops the whole assignment.
+ *
+ * Returns 0 in all cases except out-of-memory, where the result of log_oom() is returned. */
+int config_parse_delegate(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CGroupContext *c = data;
+        UnitType t;
+        int r;
+
+        t = unit_name_to_type(unit);
+        assert(t != _UNIT_TYPE_INVALID);
+
+        if (!unit_vtable[t]->can_delegate) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Delegate= setting not supported for this unit type, ignoring.");
+                return 0;
+        }
+
+        /* We either accept a boolean value, which may be used to turn on delegation for all controllers, or
+         * turn it off for all. Or it takes a list of controller names, in which case we add the specified
+         * controllers to the mask to delegate. Delegate= enables delegation without any controllers. */
+
+        if (isempty(rvalue)) {
+                /* An empty string resets controllers and sets Delegate=yes. */
+                c->delegate = true;
+                c->delegate_controllers = 0;
+                return 0;
+        }
+
+        r = parse_boolean(rvalue);
+        if (r < 0) {
+                /* Not a boolean: treat the value as a list of controller names. The mask is collected
+                 * locally first so that a syntax error leaves the context untouched. */
+                CGroupMask mask = 0;
+
+                for (const char *p = rvalue;;) {
+                        _cleanup_free_ char *word = NULL;
+                        CGroupController cc;
+
+                        r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                                return 0;
+                        }
+                        if (r == 0)
+                                break;
+
+                        cc = cgroup_controller_from_string(word);
+                        if (cc < 0) {
+                                /* Pass the lookup's own (negative) result as error code. 'r' is positive
+                                 * at this point, since a word was extracted, and is not an error. */
+                                log_syntax(unit, LOG_WARNING, filename, line, cc, "Invalid controller name '%s', ignoring", word);
+                                continue;
+                        }
+
+                        mask |= CGROUP_CONTROLLER_TO_MASK(cc);
+                }
+
+                /* Controller lists accumulate on top of whatever was configured before. */
+                c->delegate = true;
+                c->delegate_controllers |= mask;
+
+        } else if (r > 0) {
+                c->delegate = true;
+                c->delegate_controllers = CGROUP_MASK_DELEGATE;
+        } else {
+                c->delegate = false;
+                c->delegate_controllers = 0;
+        }
+
+        return 0;
+}
+
+/* Config parser for DelegateSubgroup=. Stores a copy of the value in the 'delegate_subgroup' field of
+ * the CGroupContext passed as 'data'. An empty value clears the field. Values for which
+ * cg_needs_escape() reports true are rejected with a warning, as are unit types that cannot delegate.
+ * Returns 0 for ignored or cleared assignments, otherwise the result of free_and_strdup_warn(). */
+int config_parse_delegate_subgroup(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CGroupContext *ctx = ASSERT_PTR(data);
+        UnitType type = unit_name_to_type(unit);
+
+        assert(type >= 0);
+
+        if (!unit_vtable[type]->can_delegate) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring.");
+                return 0;
+        }
+
+        if (isempty(rvalue))
+                ctx->delegate_subgroup = mfree(ctx->delegate_subgroup);
+        else if (cg_needs_escape(rvalue))
+                /* Insist that specified names don't need escaping */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue);
+        else
+                return free_and_strdup_warn(&ctx->delegate_subgroup, rvalue);
+
+        return 0;
+}
+
+/* Config parser for settings holding a ManagedOOMMode; 'data' points to the variable to fill in.
+ * An empty value selects MANAGED_OOM_AUTO. Unit types whose vtable does not set can_set_managed_oom,
+ * and strings managed_oom_mode_from_string() rejects, are logged as warnings and ignored. */
+int config_parse_managed_oom_mode(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ManagedOOMMode *target = data;
+        ManagedOOMMode parsed;
+        UnitType type;
+
+        type = unit_name_to_type(unit);
+        assert(type != _UNIT_TYPE_INVALID);
+
+        if (!unit_vtable[type]->can_set_managed_oom)
+                return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+        if (isempty(rvalue)) {
+                *target = MANAGED_OOM_AUTO;
+                return 0;
+        }
+
+        parsed = managed_oom_mode_from_string(rvalue);
+        if (parsed >= 0)
+                *target = parsed;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, parsed, "Invalid syntax, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Config parser for a managed-OOM memory pressure limit; 'data' points to the uint32_t to fill in.
+ * An empty value stores 0. Otherwise the value is parsed with parse_permyriad() and rescaled with
+ * UINT32_SCALE_FROM_PERMYRIAD(). Unsupported unit types and unparsable values are logged as warnings
+ * and ignored. */
+int config_parse_managed_oom_mem_pressure_limit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint32_t *out = data;
+        UnitType type;
+        int permyriad;
+
+        type = unit_name_to_type(unit);
+        assert(type != _UNIT_TYPE_INVALID);
+
+        if (!unit_vtable[type]->can_set_managed_oom)
+                return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+        if (isempty(rvalue)) {
+                *out = 0;
+                return 0;
+        }
+
+        permyriad = parse_permyriad(rvalue);
+        if (permyriad >= 0)
+                /* Normalize to 2^32-1 == 100% */
+                *out = UINT32_SCALE_FROM_PERMYRIAD(permyriad);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, permyriad, "Failed to parse memory pressure limit value, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Config parser for a device allow-list entry. The first word of the value names the device (after unit
+ * specifier expansion), the remainder, if any, the access rights. The entry is handed to
+ * cgroup_context_add_device_allow() for the CGroupContext passed as 'data', and that call's result is
+ * returned. An empty value frees all existing entries. Invalid input is logged and ignored (returns 0). */
+int config_parse_device_allow(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *spec = NULL, *node = NULL;
+        CGroupContext *ctx = data;
+        CGroupDevicePermissions perms;
+        const char *rest = rvalue;
+        int r;
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: drop every entry collected so far. */
+                while (ctx->device_allow)
+                        cgroup_context_free_device_allow(ctx, ctx->device_allow);
+
+                return 0;
+        }
+
+        r = extract_first_word(&rest, &spec, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r <= 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device path and rights from '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, spec, &node);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", spec);
+                return 0;
+        }
+
+        /* Names prefixed with "block-" or "char-" bypass the path checks below; everything else has to
+         * pass them. */
+        if (!STARTSWITH_SET(node, "block-", "char-")) {
+
+                if (path_simplify_and_warn(node, 0, unit, filename, line, lvalue) < 0)
+                        return 0;
+
+                if (!valid_device_node_path(node)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid device node path '%s', ignoring.", node);
+                        return 0;
+                }
+        }
+
+        /* No rights given: pass 0 on. */
+        if (isempty(rest))
+                perms = 0;
+        else
+                perms = cgroup_device_permissions_from_string(rest);
+        if (perms < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, perms, "Invalid device rights '%s', ignoring.", rest);
+                return 0;
+        }
+
+        return cgroup_context_add_device_allow(ctx, node, perms);
+}
+
+/* Config parser for a per-device IO weight. The first word of the value is the device path (unit
+ * specifiers are expanded and the path simplified), the remainder is the weight, parsed by
+ * cg_weight_parse(). On success a new CGroupIODeviceWeight is prepended to the io_device_weights list
+ * of the CGroupContext passed as 'data'. An empty value frees the whole list. Invalid input is logged
+ * and ignored (returns 0); allocation failures return the result of log_oom(). */
+int config_parse_io_device_weight(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *device = NULL;
+        CGroupIODeviceWeight *entry;
+        CGroupContext *ctx = data;
+        const char *rest = ASSERT_PTR(rvalue);
+        uint64_t weight;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: release every entry. */
+                while (ctx->io_device_weights)
+                        cgroup_context_free_io_device_weight(ctx, ctx->io_device_weights);
+
+                return 0;
+        }
+
+        r = extract_first_word(&rest, &word, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device path and weight from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        /* Both a path and something after it are required. */
+        if (r == 0 || isempty(rest)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device path or weight specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, word, &device);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                return 0;
+        }
+
+        if (path_simplify_and_warn(device, 0, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        r = cg_weight_parse(rest, &weight);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "IO weight '%s' invalid, ignoring: %m", rest);
+                return 0;
+        }
+
+        assert(weight != CGROUP_WEIGHT_INVALID);
+
+        entry = new0(CGroupIODeviceWeight, 1);
+        if (!entry)
+                return log_oom();
+
+        /* The entry takes over the resolved path string. */
+        entry->weight = weight;
+        entry->path = TAKE_PTR(device);
+
+        LIST_PREPEND(device_weights, ctx->io_device_weights, entry);
+        return 0;
+}
+
+/* Config parser for a per-device IO latency target. The first word of the value is the device path
+ * (unit specifiers are expanded and the path simplified), the remainder is a time span parsed by
+ * parse_sec(). On success a new CGroupIODeviceLatency is prepended to the io_device_latencies list of
+ * the CGroupContext passed as 'data'. An empty value frees the whole list. Invalid input is logged and
+ * ignored (returns 0); allocation failures return the result of log_oom(). */
+int config_parse_io_device_latency(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *device = NULL;
+        CGroupIODeviceLatency *entry;
+        CGroupContext *ctx = data;
+        const char *rest = ASSERT_PTR(rvalue);
+        usec_t target;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: release every entry. */
+                while (ctx->io_device_latencies)
+                        cgroup_context_free_io_device_latency(ctx, ctx->io_device_latencies);
+
+                return 0;
+        }
+
+        r = extract_first_word(&rest, &word, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device path and latency from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        /* Both a path and something after it are required. */
+        if (r == 0 || isempty(rest)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device path or latency specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, word, &device);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                return 0;
+        }
+
+        if (path_simplify_and_warn(device, 0, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        r = parse_sec(rest, &target);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", rest);
+                return 0;
+        }
+
+        entry = new0(CGroupIODeviceLatency, 1);
+        if (!entry)
+                return log_oom();
+
+        /* The entry takes over the resolved path string. */
+        entry->target_usec = target;
+        entry->path = TAKE_PTR(device);
+
+        LIST_PREPEND(device_latencies, ctx->io_device_latencies, entry);
+        return 0;
+}
+
+/* Config parser shared by the per-device IO limit settings. Which limit is meant is derived from
+ * 'lvalue' via cgroup_io_limit_type_from_string(). The first word of the value is the device path, the
+ * remainder either "infinity" or a size parsed by parse_size() with base 1000.
+ *
+ * An empty value resets this one limit type to its default on every existing entry. Otherwise the
+ * entry for the device is looked up in the io_device_limits list of the CGroupContext passed as
+ * 'data' (and created with default limits if missing), and the selected limit is set. Invalid input
+ * is logged and ignored (returns 0); allocation failures return the result of log_oom(). */
+int config_parse_io_limit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *device = NULL;
+        CGroupIODeviceLimit *found = NULL;
+        CGroupContext *ctx = data;
+        CGroupIOLimitType which;
+        const char *rest = ASSERT_PTR(rvalue);
+        uint64_t value;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        which = cgroup_io_limit_type_from_string(lvalue);
+        assert(which >= 0);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: only the limit type named by lvalue goes back to its default. */
+                LIST_FOREACH(device_limits, entry, ctx->io_device_limits)
+                        entry->limits[which] = cgroup_io_limit_defaults[which];
+                return 0;
+        }
+
+        r = extract_first_word(&rest, &word, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        /* Both a path and something after it are required. */
+        if (r == 0 || isempty(rest)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, word, &device);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                return 0;
+        }
+
+        if (path_simplify_and_warn(device, 0, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        if (!streq("infinity", rest)) {
+                /* A limit of zero is rejected just like a parse failure. */
+                r = parse_size(rest, 1000, &value);
+                if (r < 0 || value == 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid IO limit '%s', ignoring.", rest);
+                        return 0;
+                }
+        } else
+                value = CGROUP_LIMIT_MAX;
+
+        /* Reuse the entry for this device if one is already on the list. */
+        LIST_FOREACH(device_limits, entry, ctx->io_device_limits)
+                if (path_equal(device, entry->path)) {
+                        found = entry;
+                        break;
+                }
+
+        if (!found) {
+                found = new0(CGroupIODeviceLimit, 1);
+                if (!found)
+                        return log_oom();
+
+                /* A fresh entry starts out with every limit at its default and owns the path string. */
+                for (CGroupIOLimitType i = 0; i < _CGROUP_IO_LIMIT_TYPE_MAX; i++)
+                        found->limits[i] = cgroup_io_limit_defaults[i];
+                found->path = TAKE_PTR(device);
+
+                LIST_PREPEND(device_limits, ctx->io_device_limits, found);
+        }
+
+        found->limits[which] = value;
+
+        return 0;
+}
+
+/* Config parser for the legacy per-device block IO weight setting. Every use logs a deprecation
+ * warning first. The first word of the value is the device path (unit specifiers are expanded and the
+ * path simplified), the remainder is the weight, parsed by cg_blkio_weight_parse(). On success a new
+ * CGroupBlockIODeviceWeight is prepended to the blockio_device_weights list of the CGroupContext
+ * passed as 'data'. An empty value frees the whole list. Invalid input is logged and ignored
+ * (returns 0); allocation failures return the result of log_oom(). */
+int config_parse_blockio_device_weight(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *device = NULL;
+        CGroupBlockIODeviceWeight *entry;
+        CGroupContext *ctx = data;
+        const char *rest = ASSERT_PTR(rvalue);
+        uint64_t weight;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.",
+                   lvalue, lvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: release every entry. */
+                while (ctx->blockio_device_weights)
+                        cgroup_context_free_blockio_device_weight(ctx, ctx->blockio_device_weights);
+
+                return 0;
+        }
+
+        r = extract_first_word(&rest, &word, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device node and weight from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        /* Both a path and something after it are required. */
+        if (r == 0 || isempty(rest)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device node or weight specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, word, &device);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                return 0;
+        }
+
+        if (path_simplify_and_warn(device, 0, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        r = cg_blkio_weight_parse(rest, &weight);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid block IO weight '%s', ignoring: %m", rest);
+                return 0;
+        }
+
+        assert(weight != CGROUP_BLKIO_WEIGHT_INVALID);
+
+        entry = new0(CGroupBlockIODeviceWeight, 1);
+        if (!entry)
+                return log_oom();
+
+        /* The entry takes over the resolved path string. */
+        entry->weight = weight;
+        entry->path = TAKE_PTR(device);
+
+        LIST_PREPEND(device_weights, ctx->blockio_device_weights, entry);
+        return 0;
+}
+
+/* Config parser shared by the legacy BlockIOReadBandwidth= and BlockIOWriteBandwidth= settings. Every
+ * use logs a deprecation warning first. Whether the read or the write limit is meant is decided by
+ * comparing 'lvalue' against "BlockIOReadBandwidth".
+ *
+ * The first word of the value is the device path (unit specifiers are expanded and the path
+ * simplified), the remainder is a size parsed by parse_size() with base 1000. The entry for the device
+ * is looked up in the blockio_device_bandwidths list of the CGroupContext passed as 'data' (and
+ * created with both limits at CGROUP_LIMIT_MAX if missing), then the selected limit is set.
+ *
+ * An empty value resets the selected direction, and only that one, to CGROUP_LIMIT_MAX on every
+ * existing entry. Invalid input is logged and ignored (returns 0); allocation failures return the
+ * result of log_oom(). */
+int config_parse_blockio_bandwidth(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *path = NULL, *resolved = NULL;
+        CGroupBlockIODeviceBandwidth *b = NULL;
+        CGroupContext *c = data;
+        const char *p = ASSERT_PTR(rvalue);
+        uint64_t bytes;
+        bool read;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.",
+                   lvalue, lvalue);
+
+        read = streq("BlockIOReadBandwidth", lvalue);
+
+        if (isempty(rvalue)) {
+                /* Clearing one of the two settings must not wipe the limit configured through the other
+                 * one, hence reset only the direction this assignment is about. This is the same
+                 * per-type reset config_parse_io_limit() does. */
+                LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) {
+                        if (read)
+                                t->rbps = CGROUP_LIMIT_MAX;
+                        else
+                                t->wbps = CGROUP_LIMIT_MAX;
+                }
+                return 0;
+        }
+
+        r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        /* Both a path and something after it are required. */
+        if (r == 0 || isempty(p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, path, &resolved);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+                return 0;
+        }
+
+        r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        /* A bandwidth of zero is rejected just like a parse failure (r is 0 in that case, so no errno
+         * gets attached to the message). */
+        r = parse_size(p, 1000, &bytes);
+        if (r < 0 || bytes <= 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid Block IO Bandwidth '%s', ignoring.", p);
+                return 0;
+        }
+
+        /* Reuse the entry for this device if one is already on the list. */
+        LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths)
+                if (path_equal(resolved, t->path)) {
+                        b = t;
+                        break;
+                }
+
+        if (!b) {
+                b = new0(CGroupBlockIODeviceBandwidth, 1);
+                if (!b)
+                        return log_oom();
+
+                /* A fresh entry owns the path string and starts out unlimited in both directions. */
+                b->path = TAKE_PTR(resolved);
+                b->rbps = CGROUP_LIMIT_MAX;
+                b->wbps = CGROUP_LIMIT_MAX;
+
+                LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, b);
+        }
+
+        if (read)
+                b->rbps = bytes;
+        else
+                b->wbps = bytes;
+
+        return 0;
+}
+
+/* Config parser for a deprecated boolean that maps onto a JobMode: true selects JOB_ISOLATE, false
+ * JOB_REPLACE. The result is stored in the JobMode that 'data' points to. Every successfully parsed use
+ * emits a deprecation notice; unparsable values are logged as warnings and ignored. Always returns 0. */
+int config_parse_job_mode_isolate(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        JobMode *mode = data;
+        int isolate;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        isolate = parse_boolean(rvalue);
+        if (isolate < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, isolate, "Failed to parse boolean, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        log_notice("%s is deprecated. Please use OnFailureJobMode= instead", lvalue);
+
+        if (isolate)
+                *mode = JOB_ISOLATE;
+        else
+                *mode = JOB_REPLACE;
+
+        return 0;
+}
+
+/* Config parser for the exec directory settings (the code below names ConfigurationDirectory explicitly
+ * and mentions State and Runtime directories). 'data' is the ExecDirectory to fill in, 'userdata' the
+ * unit used for specifier expansion.
+ *
+ * The value is a list of words; each word is split on ':' into a source and an optional destination.
+ * Every accepted pair is passed to exec_directory_add(). An empty value resets the ExecDirectory via
+ * exec_directory_done().
+ *
+ * Returns 0 once the input is exhausted or when a word-level syntax error ends processing early, and
+ * the result of log_oom() on allocation failure (including any failure of exec_directory_add()). */
+int config_parse_exec_directories(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecDirectory *ed = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                exec_directory_done(ed);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *tuple = NULL;
+
+                /* Step 1: pull the next word off the value; escapes are retained for the second pass. */
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        /* No more words: this is the regular way out of the loop. */
+                        return 0;
+
+                /* Step 2: split the word on ':' into source and (optional) destination. */
+                _cleanup_free_ char *src = NULL, *dest = NULL;
+                const char *q = tuple;
+                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest, NULL);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r <= 0) {
+                        /* Note that, unlike the per-entry failures below, this stops processing the rest
+                         * of the line instead of skipping just this word. */
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+                        return 0;
+                }
+
+                _cleanup_free_ char *sresolved = NULL;
+                r = unit_path_printf(u, src, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", src);
+                        continue;
+                }
+
+                /* NOTE(review): PATH_CHECK_RELATIVE presumably insists on a relative path — confirm
+                 * against path_simplify_and_warn(). */
+                r = path_simplify_and_warn(sresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                /* Paths whose first component is "private" are refused. */
+                if (path_startswith(sresolved, "private")) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "%s= path can't be 'private', ignoring assignment: %s", lvalue, tuple);
+                        continue;
+                }
+
+                /* For State and Runtime directories we support an optional destination parameter, which
+                 * will be used to create a symlink to the source. */
+                _cleanup_free_ char *dresolved = NULL;
+                if (!isempty(dest)) {
+                        if (streq(lvalue, "ConfigurationDirectory")) {
+                                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                           "Destination parameter is not supported for ConfigurationDirectory, ignoring: %s", tuple);
+                                continue;
+                        }
+
+                        r = unit_path_printf(u, dest, &dresolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                        "Failed to resolve unit specifiers in \"%s\", ignoring: %m", dest);
+                                continue;
+                        }
+
+                        r = path_simplify_and_warn(dresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue);
+                        if (r < 0)
+                                continue;
+                }
+
+                /* dresolved stays NULL when no destination was given. Every failure here is reported
+                 * as out-of-memory. */
+                r = exec_directory_add(ed, sresolved, dresolved);
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Config parser for literal credentials of the form "NAME:DATA". 'data' is the ExecContext whose
+ * 'set_credentials' hashmap is updated, 'userdata' the unit used for specifier expansion of NAME.
+ * A non-zero 'ltype' marks the credential as encrypted: DATA is then decoded as Base64, otherwise
+ * it is run through cunescape().
+ *
+ * An existing entry with the same name has its payload replaced; otherwise a new ExecSetCredential
+ * is allocated and inserted. An empty value frees the whole hashmap. Invalid input is logged and
+ * ignored (returns 0); allocation failures return the result of log_oom(). */
+int config_parse_set_credential(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *k = NULL;
+        _cleanup_free_ void *d = NULL;
+        ExecContext *context = ASSERT_PTR(data);
+        ExecSetCredential *old;
+        Unit *u = userdata;
+        bool encrypted = ltype;
+        const char *p = ASSERT_PTR(rvalue);
+        size_t size;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                context->set_credentials = hashmap_free(context->set_credentials);
+                return 0;
+        }
+
+        /* Split off the credential name at the first ':'; p is left pointing at the payload. */
+        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract credential name, ignoring: %s", rvalue);
+                return 0;
+        }
+        /* Both a name and a non-empty payload are required. */
+        if (r == 0 || isempty(p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* Expand unit specifiers in the name, then validate the expanded result. */
+        r = unit_cred_printf(u, word, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word);
+                return 0;
+        }
+        if (!credential_name_valid(k)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k);
+                return 0;
+        }
+
+        /* Decode the payload into d/size; d is freed automatically unless ownership is passed on below. */
+        if (encrypted) {
+                r = unbase64mem_full(p, SIZE_MAX, true, &d, &size);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Encrypted credential data not valid Base64 data, ignoring.");
+                        return 0;
+                }
+        } else {
+                char *unescaped;
+                ssize_t l;
+
+                /* We support escape codes here, so that users can insert trailing \n if they like */
+                l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped);
+                if (l < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, l, "Can't unescape \"%s\", ignoring: %m", p);
+                        return 0;
+                }
+
+                d = unescaped;
+                size = l;
+        }
+
+        old = hashmap_get(context->set_credentials, k);
+        if (old) {
+                /* Same name seen before: swap in the new payload, keep the existing entry and key. */
+                free_and_replace(old->data, d);
+                old->size = size;
+                old->encrypted = encrypted;
+        } else {
+                _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+
+                sc = new(ExecSetCredential, 1);
+                if (!sc)
+                        return log_oom();
+
+                /* The new entry takes ownership of both the name and the payload. */
+                *sc = (ExecSetCredential) {
+                        .id = TAKE_PTR(k),
+                        .data = TAKE_PTR(d),
+                        .size = size,
+                        .encrypted = encrypted,
+                };
+
+                r = hashmap_ensure_put(&context->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Duplicated credential value '%s', ignoring assignment: %s", sc->id, rvalue);
+                        return 0;
+                }
+
+                /* The hashmap holds the entry now; disarm the cleanup handler. */
+                TAKE_PTR(sc);
+        }
+
+        return 0;
+}
+
+/* Adds or updates a load-credential entry in the hashmap *h, keyed by the credential id.
+ *
+ * If an entry for 'id' already exists, its path is replaced by a copy of 'path' and its 'encrypted' flag is
+ * overwritten. Otherwise a new ExecLoadCredential carrying private copies of 'id' and 'path' is allocated
+ * and inserted via hashmap_ensure_put().
+ *
+ * Returns 0 on success and a negative errno-style value on failure (-ENOMEM if an allocation fails).
+ * Nothing is logged here on any path; reporting the error is left to the caller. */
+int hashmap_put_credential(Hashmap **h, const char *id, const char *path, bool encrypted) {
+        ExecLoadCredential *old;
+        int r;
+
+        assert(h);
+        assert(id);
+        assert(path);
+
+        old = hashmap_get(*h, id);
+        if (old) {
+                r = free_and_strdup(&old->path, path);
+                if (r < 0)
+                        return r;
+
+                old->encrypted = encrypted;
+        } else {
+                _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
+
+                lc = new(ExecLoadCredential, 1);
+                if (!lc)
+                        return -ENOMEM; /* same silent error path as the strdup() failures below */
+
+                *lc = (ExecLoadCredential) {
+                        .id = strdup(id),
+                        .path = strdup(path),
+                        .encrypted = encrypted,
+                };
+                if (!lc->id || !lc->path)
+                        return -ENOMEM;
+
+                r = hashmap_ensure_put(h, &exec_load_credential_hash_ops, lc->id, lc);
+                if (r < 0)
+                        return r;
+
+                /* The hashmap references the entry now, disarm the cleanup handler */
+                TAKE_PTR(lc);
+        }
+
+        return 0;
+}
+
+/* Config parser for load-credential assignments of the form "ID[:SOURCE]".
+ *
+ * 'data' is the ExecContext whose load_credentials hashmap is updated, 'userdata' is the Unit used for
+ * specifier expansion, and a non-zero 'ltype' marks the credential as encrypted. An empty rvalue drops all
+ * entries stored so far. If SOURCE is omitted, the (expanded) ID doubles as the source.
+ *
+ * Invalid input is logged and ignored (return value 0); only allocation/storage failures are returned as
+ * errors. */
+int config_parse_load_credential(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *token = NULL, *name = NULL, *source = NULL;
+        ExecContext *context = ASSERT_PTR(data);
+        const char *rest = rvalue;
+        bool encrypted = ltype;
+        Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Empty assignment resets the list */
+        if (isempty(rvalue)) {
+                context->load_credentials = hashmap_free(context->load_credentials);
+                return 0;
+        }
+
+        /* First field: the credential ID */
+        r = extract_first_word(&rest, &token, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r <= 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = unit_cred_printf(u, token, &name);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", token);
+                return 0;
+        }
+        if (!credential_name_valid(name)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", name);
+                return 0;
+        }
+
+        if (!isempty(rest)) {
+                bool source_ok;
+
+                /* Second field: the source, either an absolute path or another credential name */
+                r = unit_path_printf(u, rest, &source);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", rest);
+                        return 0;
+                }
+
+                if (path_is_absolute(source))
+                        source_ok = path_is_normalized(source);
+                else
+                        source_ok = credential_name_valid(source);
+
+                if (!source_ok) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", source);
+                        return 0;
+                }
+        } else {
+                /* If only one field is specified take it as shortcut for inheriting a credential named
+                 * the same way from our parent */
+                source = strdup(name);
+                if (!source)
+                        return log_oom();
+        }
+
+        r = hashmap_put_credential(&context->load_credentials, name, source, encrypted);
+        if (r < 0)
+                return log_error_errno(r, "Failed to store load credential '%s': %m", rvalue);
+
+        return 0;
+}
+
+/* Config parser for import-credential assignments: a single credential name or glob per assignment.
+ *
+ * 'data' points to the Set* that collects the names/globs, 'userdata' is the Unit used for specifier
+ * expansion. An empty rvalue frees the whole set. Otherwise rvalue is expanded with unit_cred_printf(),
+ * validated with credential_glob_valid() and a copy is stored in the set.
+ *
+ * Invalid input is logged and ignored (return value 0); a failure to store the entry is returned as error. */
+int config_parse_import_credential(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *s = NULL;
+        Set** import_credentials = ASSERT_PTR(data);
+        Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *import_credentials = set_free_free(*import_credentials);
+                return 0;
+        }
+
+        r = unit_cred_printf(u, rvalue, &s);
+        if (r < 0) {
+                /* Report the input string: the output 's' has not been filled in when expansion fails, so
+                 * it must not be passed to %s here. */
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", rvalue);
+                return 0;
+        }
+        if (!credential_glob_valid(s)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name or glob \"%s\" not valid, ignoring.", s);
+                return 0;
+        }
+
+        r = set_put_strdup(import_credentials, s);
+        if (r < 0)
+                return log_error_errno(r, "Failed to store credential name '%s': %m", rvalue);
+
+        return 0;
+}
+
+/* Config parser for lists of exit statuses and/or signals.
+ *
+ * 'data' is the ExitStatusSet to fill. Each word of rvalue is first tried as an exit status and, if that
+ * fails, as a signal; the resulting number is set in the 'status' or 'signal' bitmap respectively. An empty
+ * rvalue clears the set. Words that cannot be parsed are logged and skipped. */
+int config_parse_set_status(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExitStatusSet *status_set = ASSERT_PTR(data);
+        const char *cursor;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Empty assignment resets the list */
+        if (isempty(rvalue)) {
+                exit_status_set_free(status_set);
+                return 0;
+        }
+
+        cursor = rvalue;
+        for (;;) {
+                _cleanup_free_ char *token = NULL;
+                Bitmap *target;
+                int value;
+
+                r = extract_first_word(&cursor, &token, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0; /* end of input */
+
+                /* We need to call exit_status_from_string() first, because we want
+                 * to parse numbers as exit statuses, not signals. */
+                value = exit_status_from_string(token);
+                if (value < 0) {
+                        value = signal_from_string(token);
+                        if (value < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, value,
+                                           "Failed to parse value, ignoring: %s", token);
+                                continue;
+                        }
+
+                        target = &status_set->signal;
+                } else {
+                        assert(value >= 0 && value < 256);
+                        target = &status_set->status;
+                }
+
+                r = bitmap_set(target, value);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to set signal or status %s, ignoring: %m", token);
+        }
+}
+
+/* Config parser for path lists whose entries may carry a "-" and/or "+" prefix (checked in that order).
+ *
+ * 'data' points to the string vector to extend, 'userdata' is the Unit used for specifier expansion. An
+ * empty rvalue frees the vector. For every word the prefixes are stripped, the remainder is expanded with
+ * unit_path_printf() and passed through path_simplify_and_warn() with PATH_CHECK_ABSOLUTE; the prefixes are
+ * then put back in front of the resulting path and the string is appended to the vector.
+ *
+ * Entries that fail expansion or the path check are skipped. Returns 0, or the result of log_oom() if an
+ * allocation fails. */
+int config_parse_namespace_path_strv(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const Unit *u = userdata;
+        char*** sv = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *sv = strv_free(*sv);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL;
+                const char *w;
+                bool ignore_enoent = false, shall_prefix = false;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                /* Strip the optional prefixes, remembering them so they can be re-attached below */
+                w = word;
+                if (startswith(w, "-")) {
+                        ignore_enoent = true;
+                        w++;
+                }
+                if (startswith(w, "+")) {
+                        shall_prefix = true;
+                        w++;
+                }
+
+                r = unit_path_printf(u, w, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", w);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                joined = strjoin(ignore_enoent ? "-" : "",
+                                 shall_prefix ? "+" : "",
+                                 resolved);
+                if (!joined)
+                        /* Allocation failed: report it instead of handing a NULL string to strv_push() */
+                        return log_oom();
+
+                r = strv_push(sv, joined);
+                if (r < 0)
+                        return log_oom();
+
+                /* The vector references the string now, keep the cleanup handler from freeing it */
+                TAKE_PTR(joined);
+        }
+
+        return 0;
+}
+
+/* Config parser for temporary file system lists. Each word of rvalue has the form "PATH[:OPTIONS]".
+ *
+ * 'data' is the ExecContext whose temporary_filesystems array is extended, 'userdata' is the Unit used for
+ * specifier expansion. An empty rvalue frees all entries. PATH is expanded with unit_path_printf() and run
+ * through path_simplify_and_warn() with PATH_CHECK_ABSOLUTE; whatever follows the first ':' is handed to
+ * temporary_filesystem_add() unmodified. Bad entries are logged and skipped. */
+int config_parse_temporary_filesystems(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        const char *cursor;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Empty assignment resets the list */
+        if (isempty(rvalue)) {
+                temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+                c->n_temporary_filesystems = 0;
+                c->temporary_filesystems = NULL;
+                return 0;
+        }
+
+        cursor = rvalue;
+        for (;;) {
+                _cleanup_free_ char *entry = NULL, *raw_path = NULL, *where = NULL;
+                const char *opts;
+
+                /* Pull the next whitespace-separated entry off the input */
+                r = extract_first_word(&cursor, &entry, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+
+                /* Split off the path; 'opts' is left pointing at the remainder of the entry */
+                opts = entry;
+                r = extract_first_word(&opts, &raw_path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", entry);
+                        continue;
+                }
+                if (r == 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", entry);
+                        continue;
+                }
+
+                r = unit_path_printf(u, raw_path, &where);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", raw_path);
+                        continue;
+                }
+
+                if (path_simplify_and_warn(where, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                        continue;
+
+                if (temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, where, opts) < 0)
+                        return log_oom();
+        }
+}
+/* Config parser for bind mount lists. Each entry of rvalue has the form SOURCE[:DESTINATION[:OPTIONS]].
+ *
+ * 'data' is the ExecContext whose bind_mounts array is extended, 'userdata' is the Unit used for specifier
+ * expansion. An empty rvalue frees all entries collected so far. Entries are flagged read-only if lvalue
+ * contains the string "ReadOnly". Syntax problems are logged and the affected input is ignored; the
+ * function returns 0 unless it bails out through log_oom(). */
+int config_parse_bind_paths(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+                c->bind_mounts = NULL;
+                c->n_bind_mounts = 0;
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *source = NULL, *destination = NULL;
+                _cleanup_free_ char *sresolved = NULL, *dresolved = NULL;
+                char *s = NULL, *d = NULL; /* non-owning pointers into sresolved/dresolved */
+                bool rbind = true, ignore_enoent = false;
+                /* The source field ends at either ':' or whitespace */
+                r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                r = unit_full_printf_full(u, source, PATH_MAX, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", source);
+                        continue;
+                }
+                /* A leading '-' on the expanded source sets ignore_enoent and is skipped */
+                s = sresolved;
+                if (s[0] == '-') {
+                        ignore_enoent = true;
+                        s++;
+                }
+
+                r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+                /* NOTE(review): p[-1] is presumably the separator that ended the previous field — confirm against extract_first_word() */
+                /* Optionally, the destination is specified. */
+                if (p && p[-1] == ':') {
+                        r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+                                return 0;
+                        }
+                        if (r == 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing argument after ':', ignoring: %s", s);
+                                continue;
+                        }
+
+                        r = unit_path_printf(u, destination, &dresolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve specifiers in \"%s\", ignoring: %m", destination);
+                                continue;
+                        }
+
+                        r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                        if (r < 0)
+                                continue;
+
+                        d = dresolved;
+
+                        /* Optionally, there's also a short option string specified */
+                        if (p && p[-1] == ':') {
+                                _cleanup_free_ char *options = NULL;
+
+                                r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE);
+                                if (r == -ENOMEM)
+                                        return log_oom();
+                                if (r < 0) {
+                                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+                                        return 0;
+                                }
+                                /* Only "rbind" (also the default for an empty string) and "norbind" are accepted */
+                                if (isempty(options) || streq(options, "rbind"))
+                                        rbind = true;
+                                else if (streq(options, "norbind"))
+                                        rbind = false;
+                                else {
+                                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid option string, ignoring setting: %s", options);
+                                        continue;
+                                }
+                        }
+                } else
+                        d = s; /* no destination given: the source path is used as destination too */
+                /* The entry is passed as a temporary BindMount; read_only depends solely on the setting's name */
+                r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+                                   &(BindMount) {
+                                           .source = s,
+                                           .destination = d,
+                                           .read_only = !!strstr(lvalue, "ReadOnly"),
+                                           .recursive = rbind,
+                                           .ignore_enoent = ignore_enoent,
+                                   });
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Config parser for mount image lists. Each word of rvalue is a tuple of the form
+ * SOURCE:DESTINATION[:OPTIONS] or SOURCE:DESTINATION[:PARTITION:OPTIONS]..., with ':' as field separator.
+ *
+ * 'data' is the ExecContext whose mount_images array is extended, 'userdata' is the Unit used for specifier
+ * expansion. An empty rvalue frees all entries collected so far. A leading '-' on SOURCE sets the entry's
+ * ignore_enoent flag. A single trailing field is taken as mount options for PARTITION_ROOT; otherwise the
+ * trailing fields are consumed pairwise as partition designator plus mount options.
+ *
+ * Invalid tuples are logged and skipped; the function returns 0 unless it bails out through log_oom(). */
+int config_parse_mount_images(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL;
+                _cleanup_free_ char *sresolved = NULL, *dresolved = NULL;
+                const char *q = NULL;
+                char *s = NULL;
+                bool permissive = false;
+
+                /* Outer level: one whitespace-separated tuple per iteration */
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+
+                /* Inner level: source and destination are the first two ':'-separated fields */
+                q = tuple;
+                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+                        return 0;
+                }
+                if (r == 0)
+                        continue;
+
+                s = first;
+                if (s[0] == '-') {
+                        permissive = true;
+                        s++;
+                }
+
+                r = unit_path_printf(u, s, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                if (isempty(second)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing destination in %s, ignoring: %s", lvalue, rvalue);
+                        continue;
+                }
+
+                r = unit_path_printf(u, second, &dresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                        "Failed to resolve specifiers in \"%s\", ignoring: %m", second);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                /* Remaining fields: mount options. Note that 'continue' below re-enters this inner loop,
+                 * i.e. moves on to the next field pair of the same tuple. */
+                for (;;) {
+                        _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL;
+                        MountOptions *o = NULL;
+                        PartitionDesignator partition_designator;
+
+                        r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q);
+                                return 0;
+                        }
+                        if (r == 0)
+                                break;
+                        /* Single set of options, applying to the root partition/single filesystem */
+                        if (r == 1) {
+                                /* With only one field extracted, 'partition' holds the option string, so
+                                 * that is what gets expanded and what is reported on failure. */
+                                r = unit_full_printf(u, partition, &mount_options_resolved);
+                                if (r < 0) {
+                                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", partition);
+                                        continue;
+                                }
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom();
+                                *o = (MountOptions) {
+                                        .partition_designator = PARTITION_ROOT,
+                                        .options = TAKE_PTR(mount_options_resolved),
+                                };
+                                LIST_APPEND(mount_options, options, o);
+
+                                break;
+                        }
+
+                        partition_designator = partition_designator_from_string(partition);
+                        if (partition_designator < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, partition_designator,
+                                           "Invalid partition name %s, ignoring", partition);
+                                continue;
+                        }
+                        r = unit_full_printf(u, mount_options, &mount_options_resolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+                                continue;
+                        }
+
+                        o = new(MountOptions, 1);
+                        if (!o)
+                                return log_oom();
+                        *o = (MountOptions) {
+                                .partition_designator = partition_designator,
+                                .options = TAKE_PTR(mount_options_resolved),
+                        };
+                        LIST_APPEND(mount_options, options, o);
+                }
+
+                r = mount_image_add(&c->mount_images, &c->n_mount_images,
+                                    &(MountImage) {
+                                            .source = sresolved,
+                                            .destination = dresolved,
+                                            .mount_options = options,
+                                            .ignore_enoent = permissive,
+                                            .type = MOUNT_IMAGE_DISCRETE,
+                                    });
+                if (r < 0)
+                        return log_oom();
+        }
+}
+/* Config parser for extension image lists. Each word of rvalue is a tuple of the form
+ * SOURCE[:OPTIONS] or SOURCE[:PARTITION:OPTIONS]..., with ':' as field separator.
+ *
+ * 'data' is the ExecContext whose extension_images array is extended, 'userdata' is the Unit used for
+ * specifier expansion. An empty rvalue frees all entries collected so far. A leading '-' on SOURCE sets the
+ * entry's ignore_enoent flag. Invalid tuples are logged and skipped; the function returns 0 unless it bails
+ * out through log_oom(). */
+int config_parse_extension_images(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *source = NULL, *tuple = NULL, *sresolved = NULL;
+                _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                bool permissive = false;
+                const char *q = NULL;
+                char *s = NULL;
+                /* Outer level: one whitespace-separated tuple per iteration */
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+                /* Inner level: the image source is the first ':'-separated field of the tuple */
+                q = tuple;
+                r = extract_first_word(&q, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+                        return 0;
+                }
+                if (r == 0)
+                        continue;
+                /* A leading '-' sets 'permissive' (stored as ignore_enoent below) and is skipped */
+                s = source;
+                if (s[0] == '-') {
+                        permissive = true;
+                        s++;
+                }
+
+                r = unit_path_printf(u, s, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+                /* Remaining fields are mount options; 'continue' inside this loop moves on to the next field pair of the same tuple */
+                for (;;) {
+                        _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL;
+                        MountOptions *o = NULL;
+                        PartitionDesignator partition_designator;
+
+                        r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q);
+                                return 0;
+                        }
+                        if (r == 0)
+                                break;
+                        /* Single set of options, applying to the root partition/single filesystem */
+                        if (r == 1) {
+                                r = unit_full_printf(u, partition, &mount_options_resolved); /* with one field, 'partition' holds the option string */
+                                if (r < 0) {
+                                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", partition);
+                                        continue;
+                                }
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom();
+                                *o = (MountOptions) {
+                                        .partition_designator = PARTITION_ROOT,
+                                        .options = TAKE_PTR(mount_options_resolved),
+                                };
+                                LIST_APPEND(mount_options, options, o);
+
+                                break;
+                        }
+                        /* Two fields: a partition designator name followed by its mount options */
+                        partition_designator = partition_designator_from_string(partition);
+                        if (partition_designator < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid partition name %s, ignoring", partition);
+                                continue;
+                        }
+                        r = unit_full_printf(u, mount_options, &mount_options_resolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+                                continue;
+                        }
+
+                        o = new(MountOptions, 1);
+                        if (!o)
+                                return log_oom();
+                        *o = (MountOptions) {
+                                .partition_designator = partition_designator,
+                                .options = TAKE_PTR(mount_options_resolved),
+                        };
+                        LIST_APPEND(mount_options, options, o);
+                }
+                /* NOTE(review): 'options' and 'sresolved' are still released by their cleanup handlers, so mount_image_add() presumably copies them — confirm */
+                r = mount_image_add(&c->extension_images, &c->n_extension_images,
+                                    &(MountImage) {
+                                            .source = sresolved,
+                                            .mount_options = options,
+                                            .ignore_enoent = permissive,
+                                            .type = MOUNT_IMAGE_EXTENSION,
+                                    });
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Config parser callback for JobTimeoutSec=. Parses rvalue with parse_sec_fix_0() and stores the
+ * result in the Unit passed via 'data'. A value that fails to parse is logged and ignored.
+ * Always returns 0. */
+int config_parse_job_timeout_sec(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Unit *u = ASSERT_PTR(data);
+        usec_t timeout;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_sec_fix_0(rvalue, &timeout);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobTimeoutSec= parameter, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        u->job_timeout = timeout;
+
+        /* For compatibility with old versions an explicit JobTimeoutSec= is mirrored into
+         * JobRunningTimeoutSec= as well, unless the latter was configured explicitly, in which case
+         * the user's own choice is left untouched. */
+        if (!u->job_running_timeout_set)
+                u->job_running_timeout = timeout;
+
+        return 0;
+}
+
+/* Config parser callback for JobRunningTimeoutSec=. Parses rvalue with parse_sec_fix_0(), stores
+ * the result in the Unit passed via 'data' and records that the setting was given explicitly.
+ * A value that fails to parse is logged and ignored. Always returns 0. */
+int config_parse_job_running_timeout_sec(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Unit *u = ASSERT_PTR(data);
+        usec_t timeout;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_sec_fix_0(rvalue, &timeout);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobRunningTimeoutSec= parameter, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* Remember the explicit assignment, so that a JobTimeoutSec= line does not override it. */
+        u->job_running_timeout_set = true;
+        u->job_running_timeout = timeout;
+
+        return 0;
+}
+
+/* Config parser callback for emergency action settings. 'data' points to the EmergencyAction that
+ * is handed to parse_emergency_action(). The runtime scope is taken from the manager of the Unit
+ * in 'userdata' when a unit name is given, and from 'ltype' otherwise. Parse failures are logged
+ * and ignored. Always returns 0. */
+int config_parse_emergency_action(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        EmergencyAction *action = ASSERT_PTR(data);
+        RuntimeScope scope;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (unit)
+                /* We have a unit, hence derive the scope from its manager */
+                scope = ((Unit*) ASSERT_PTR(userdata))->manager->runtime_scope;
+        else
+                /* No unit: the caller passes the scope in via ltype */
+                scope = ltype;
+
+        r = parse_emergency_action(rvalue, scope, action);
+        if (r == -EOPNOTSUPP)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "%s= specified as %s mode action, ignoring: %s",
+                           lvalue, runtime_scope_to_string(scope), rvalue);
+        else if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+
+        return 0;
+}
+
+/* Config parser callback for a PID file path. 'data' points to the string to update, 'userdata'
+ * is the Unit used for specifier expansion. An empty rvalue clears the stored path. Otherwise the
+ * value is specifier-expanded, made absolute relative to the manager's runtime directory prefix,
+ * validated with path_simplify_and_warn() and passed through patch_var_run() before it replaces
+ * the stored string. Specifier failures are logged and ignored (return 0); failures of the two
+ * path checks are returned to the caller. */
+int config_parse_pid_file(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *expanded = NULL, *absolute = NULL;
+        const Unit *u = ASSERT_PTR(userdata);
+        char **target = data;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Assigning the empty string drops whatever was configured before. */
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        r = unit_path_printf(u, rvalue, &expanded);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        /* Relative paths are interpreted relative to the runtime directory prefix (i.e. /run) */
+        absolute = path_make_absolute(expanded, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+        if (!absolute)
+                return log_oom();
+
+        /* Make sure what we ended up with is a sensible path */
+        r = path_simplify_and_warn(absolute, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+        if (r < 0)
+                return r;
+
+        r = patch_var_run(unit, filename, line, lvalue, &absolute);
+        if (r < 0)
+                return r;
+
+        free_and_replace(*target, absolute);
+        return 0;
+}
+
+/* Config parser callback for an exit status value. 'data' points to the int to update. An empty
+ * rvalue stores -1; otherwise the value must parse as an unsigned 8-bit integer. Unparsable
+ * values are logged and ignored. Always returns 0. */
+int config_parse_exit_status(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *exit_status = data;
+        uint8_t status;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(exit_status);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: reset to the "unset" marker */
+                *exit_status = -1;
+                return 0;
+        }
+
+        r = safe_atou8(rvalue, &status);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse exit status '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        *exit_status = status;
+        return 0;
+}
+
+/* Config parser callback for the list of disabled cgroup controllers. 'data' is the CGroupContext
+ * to update. An empty rvalue clears the disabled mask; a non-empty one is converted with
+ * cg_mask_from_string() and OR-ed into the existing mask. Invalid or empty masks are logged and
+ * ignored. Always returns 0. */
+int config_parse_disable_controllers(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CGroupContext *c = data;
+        CGroupMask mask;
+        int r;
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: every controller becomes eligible for use again */
+                c->disable_controllers = 0;
+                return 0;
+        }
+
+        /* Non-empty assignment: merge the listed controllers into what is already disabled */
+        r = cg_mask_from_string(rvalue, &mask);
+        if (r < 0 || mask <= 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue);
+                return 0;
+        }
+
+        c->disable_controllers |= mask;
+        return 0;
+}
+
+/* Config parser callback for a BPF program path used as IP filter. 'data' points to the string
+ * vector to extend, 'userdata' is the Unit used for specifier expansion. An empty rvalue frees
+ * the list. Otherwise the path is specifier-expanded, checked for being absolute, and appended
+ * unless it is already listed. After a successful append a warning is logged if
+ * bpf_firewall_supported() does not report multi-program support; it is logged at warning level
+ * the first time and at debug level afterwards. */
+int config_parse_ip_filter_bpf_progs(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *path = NULL;
+        const Unit *u = userdata;
+        char ***paths = ASSERT_PTR(data);
+        static bool warned = false;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *paths = strv_free(*paths);
+                return 0;
+        }
+
+        r = unit_path_printf(u, rvalue, &path);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        /* path_simplify_and_warn() already logged the problem, hence just ignore the line */
+        if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        /* Suppress duplicates */
+        if (strv_contains(*paths, path))
+                return 0;
+
+        if (strv_extend(paths, path) < 0)
+                return log_oom();
+
+        r = bpf_firewall_supported();
+        if (r < 0)
+                return r;
+        if (r == BPF_FIREWALL_SUPPORTED_WITH_MULTI)
+                return 0;
+
+        log_full(warned ? LOG_DEBUG : LOG_WARNING,
+                 "File %s:%u configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters.\n"
+                 "Starting this unit will fail! (This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue);
+        warned = true;
+
+        return 0;
+}
+
+/* Config parser callback for a foreign BPF program, given as "<attach type>:<path>". 'data' is the
+ * CGroupContext to update, 'userdata' the Unit used for specifier expansion. An empty rvalue
+ * removes all configured foreign programs. Syntax errors, unknown attach types and invalid paths
+ * are logged and ignored (return 0); out-of-memory while splitting and failure to register the
+ * program are returned as errors. */
+int config_parse_bpf_foreign_program(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        _cleanup_free_ char *path = NULL, *type_name = NULL;
+        CGroupContext *c = data;
+        const char *p = ASSERT_PTR(rvalue);
+        Unit *u = userdata;
+        int attach_type, r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: drop the programs one by one until the list is empty */
+                while (c->bpf_foreign_programs)
+                        cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+                return 0;
+        }
+
+        /* Split off the attach type; whatever remains in p afterwards is the program path */
+        r = extract_first_word(&p, &type_name, ":", 0);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse foreign BPF program, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (r == 0 || isempty(p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax in %s=, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        attach_type = bpf_cgroup_attach_type_from_string(type_name);
+        if (attach_type < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown BPF attach type=%s, ignoring: %s", type_name, rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(u, p, &path);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %s", p, rvalue);
+                return 0;
+        }
+
+        /* path_simplify_and_warn() already logged the problem, hence just ignore the line */
+        if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0)
+                return 0;
+
+        r = cgroup_context_add_bpf_foreign_program(c, attach_type, path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add foreign BPF program to cgroup context: %m");
+
+        return 0;
+}
+
+/* Config parser callback for a socket bind rule. 'data' points to the head of the
+ * CGroupSocketBindItem list to update. An empty rvalue clears the list via
+ * cgroup_context_remove_socket_bind(). Otherwise the rule is parsed with parse_socket_bind_item()
+ * and a new item is prepended to the list. Unparsable rules are logged and ignored (return 0);
+ * allocation failures are returned via log_oom(). */
+int config_parse_cgroup_socket_bind(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        _cleanup_free_ CGroupSocketBindItem *entry = NULL;
+        CGroupSocketBindItem **head = data;
+        uint16_t n_ports, first_port;
+        int family, protocol, r;
+
+        if (isempty(rvalue)) {
+                cgroup_context_remove_socket_bind(head);
+                return 0;
+        }
+
+        r = parse_socket_bind_item(rvalue, &family, &protocol, &n_ports, &first_port);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Unable to parse %s= assignment, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        entry = new(CGroupSocketBindItem, 1);
+        if (!entry)
+                return log_oom();
+
+        *entry = (CGroupSocketBindItem) {
+                .address_family = family,
+                .ip_protocol = protocol,
+                .nr_ports = n_ports,
+                .port_min = first_port,
+        };
+
+        /* The list takes over ownership of the new item */
+        LIST_PREPEND(socket_bind_items, *head, TAKE_PTR(entry));
+
+        return 0;
+}
+
+/* Config parser callback for a list of network interface names. 'data' is the CGroupContext to
+ * update. An empty rvalue frees the set. A leading '~' marks the line as a deny rule. While the
+ * set is empty, the line decides whether it is an allow list or a deny list. Afterwards, names
+ * from lines of the same kind are added to the set, and names from lines of the opposite kind are
+ * removed from it. Invalid names are logged and skipped. Returns 0, or the log_oom() result on
+ * allocation failure. */
+int config_parse_restrict_network_interfaces(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        CGroupContext *c = ASSERT_PTR(data);
+        bool is_allow_rule;
+        const char *p;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment resets the list */
+                c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+                return 0;
+        }
+
+        /* A leading tilde inverts the meaning of the line; skip over it before parsing names */
+        is_allow_rule = rvalue[0] != '~';
+        if (!is_allow_rule)
+                rvalue++;
+
+        /* The list type is only decided while the set is still empty */
+        if (set_isempty(c->restrict_network_interfaces))
+                c->restrict_network_interfaces_is_allow_list = is_allow_rule;
+
+        p = rvalue;
+        for (;;) {
+                _cleanup_free_ char *ifname = NULL;
+
+                r = extract_first_word(&p, &ifname, NULL, EXTRACT_UNQUOTE);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+                        break;
+                }
+
+                if (!ifname_valid(ifname)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", ifname);
+                        continue;
+                }
+
+                if (c->restrict_network_interfaces_is_allow_list == is_allow_rule) {
+                        /* Same kind as the list: add the name */
+                        r = set_put_strdup(&c->restrict_network_interfaces, ifname);
+                        if (r < 0)
+                                return log_oom();
+                } else
+                        /* Opposite kind: take the name out of the list again */
+                        free(set_remove(c->restrict_network_interfaces, ifname));
+        }
+
+        return 0;
+}
+
+/* Drains 'names' and merges every name into unit 'u' via unit_merge_by_name(). If that fails and
+ * another unit is already registered under the name, 'u' is merged into that unit instead and
+ * processing of the remaining names continues on it (with no preferred id). Whenever a merged
+ * name equals 'id', it is chosen as the unit's id. Returns 0 on success, or the first merge
+ * error. */
+static int merge_by_names(Unit *u, Set *names, const char *id) {
+        char *name;
+        int r;
+
+        assert(u);
+
+        while ((name = set_steal_first(names))) {
+                /* The name was stolen from the set, so we own it: free it when the iteration ends */
+                _cleanup_free_ _unused_ char *name_owner = name;
+                Unit *other;
+
+                /* First, try to pull the other name into our unit */
+                r = unit_merge_by_name(u, name);
+                if (r >= 0) {
+                        if (streq_ptr(id, name))
+                                unit_choose_id(u, id);
+                        continue;
+                }
+
+                /* That didn't work. If a unit exists under that name try it the other way round
+                 * and merge ourselves into it; otherwise report the original failure. */
+                other = manager_get_unit(u->manager, name);
+                if (!other)
+                        return r;
+
+                r = unit_merge(other, u);
+                if (r < 0)
+                        return r;
+
+                return merge_by_names(other, names, NULL);
+        }
+
+        return 0;
+}
+
+/* Loads the unit file ("fragment") for unit 'u', which must still be in UNIT_STUB state.
+ *
+ * Transient units have no fragment and are marked loaded right away. Otherwise the name map is
+ * refreshed, the fragment path and alias names for u->id are looked up, and, if a fragment was
+ * found, the file is opened and either recognized as a mask (null or empty file) or parsed with
+ * config_parse(). Finally all alias names are merged into the unit via merge_by_names(), with the
+ * name derived from the fragment file name as the preferred id.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. */
+int unit_load_fragment(Unit *u) {
+        /* Initialized so that the "if (fragment)" checks below are well-defined even if the lookup
+         * returns -ENOENT without storing a path. */
+        const char *fragment = NULL;
+        _cleanup_set_free_free_ Set *names = NULL;
+        int r;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+        assert(u->id);
+
+        if (u->transient) {
+                u->access_selinux_context = mfree(u->access_selinux_context);
+                u->load_state = UNIT_LOADED;
+                return 0;
+        }
+
+        /* Possibly rebuild the fragment map to catch new units */
+        r = unit_file_build_name_map(&u->manager->lookup_paths,
+                                     &u->manager->unit_cache_timestamp_hash,
+                                     &u->manager->unit_id_map,
+                                     &u->manager->unit_name_map,
+                                     &u->manager->unit_path_cache);
+        if (r < 0)
+                return log_error_errno(r, "Failed to rebuild name map: %m");
+
+        r = unit_file_find_fragment(u->manager->unit_id_map,
+                                    u->manager->unit_name_map,
+                                    u->id,
+                                    &fragment,
+                                    &names);
+        if (r < 0 && r != -ENOENT)
+                return r;
+
+        if (fragment) {
+                /* Open the file, check if this is a mask, otherwise read. */
+                _cleanup_fclose_ FILE *f = NULL;
+                struct stat st;
+
+                /* Try to open the file name. A symlink is OK, for example for linked files or masks. We
+                 * expect that all symlinks within the lookup paths have been already resolved, but we don't
+                 * verify this here. */
+                f = fopen(fragment, "re");
+                if (!f)
+                        return log_unit_notice_errno(u, errno, "Failed to open %s: %m", fragment);
+
+                if (fstat(fileno(f), &st) < 0)
+                        return -errno;
+
+                r = free_and_strdup(&u->fragment_path, fragment);
+                if (r < 0)
+                        return r;
+
+                if (null_or_empty(&st)) {
+                        /* Unit file is masked */
+
+                        u->load_state = u->perpetual ? UNIT_LOADED : UNIT_MASKED; /* don't allow perpetual units to ever be masked */
+                        u->fragment_mtime = 0;
+                        u->access_selinux_context = mfree(u->access_selinux_context);
+                } else {
+#if HAVE_SELINUX
+                        if (mac_selinux_use()) {
+                                _cleanup_freecon_ char *selcon = NULL;
+
+                                /* Cache the SELinux context of the unit file here. We'll make use of when checking access permissions to loaded units.
+                                 *
+                                 * fgetfilecon_raw() follows the libc convention: it returns -1 and
+                                 * sets errno on failure, hence errno (not the return value) is what
+                                 * must be logged. On failure selcon stays NULL and the cached
+                                 * context is simply cleared below. */
+                                if (fgetfilecon_raw(fileno(f), &selcon) < 0)
+                                        log_unit_warning_errno(u, errno, "Failed to read SELinux context of '%s', ignoring: %m", fragment);
+
+                                r = free_and_strdup(&u->access_selinux_context, selcon);
+                                if (r < 0)
+                                        return r;
+                        } else
+#endif
+                                u->access_selinux_context = mfree(u->access_selinux_context);
+
+                        u->load_state = UNIT_LOADED;
+                        u->fragment_mtime = timespec_load(&st.st_mtim);
+
+                        /* Now, parse the file contents */
+                        r = config_parse(u->id, fragment, f,
+                                         UNIT_VTABLE(u)->sections,
+                                         config_item_perf_lookup, load_fragment_gperf_lookup,
+                                         0,
+                                         u,
+                                         NULL);
+                        if (r == -ENOEXEC)
+                                log_unit_notice_errno(u, r, "Unit configuration has fatal error, unit will not be started.");
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        /* Call merge_by_names with the name derived from the fragment path as the preferred name.
+         *
+         * We do the merge dance here because for some unit types, the unit might have aliases which are not
+         * declared in the file system. In particular, this is true (and frequent) for device and swap units.
+         */
+        const char *id = u->id;
+        _cleanup_free_ char *filename = NULL, *free_id = NULL;
+
+        if (fragment) {
+                r = path_extract_filename(fragment, &filename);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to extract filename from fragment '%s': %m", fragment);
+                id = filename;
+
+                if (unit_name_is_valid(id, UNIT_NAME_TEMPLATE)) {
+                        assert(u->instance); /* If we're not trying to use a template for non-instanced unit,
+                                              * this must be set. */
+
+                        /* The fragment is a template file: build the concrete instance name from it */
+                        r = unit_name_replace_instance(id, u->instance, &free_id);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to build id (%s + %s): %m", id, u->instance);
+                        id = free_id;
+                }
+        }
+
+        return merge_by_names(u, names, id);
+}
+
+/* Writes a listing of all known unit file settings to 'f', one "lvalue=TYPE" line per entry of
+ * load_fragment_gperf_nulstr, grouped under "[Section]" headers. TYPE is a symbolic description
+ * of the expected value, looked up by parser callback in the table below; settings whose parser
+ * is not listed are shown as "OTHER". */
+void unit_dump_config_items(FILE *f) {
+        /* Maps parser callbacks to the symbolic value description printed for them. The first
+         * matching row wins. */
+        static const struct {
+                const ConfigParserCallback callback;
+                const char *rvalue;
+        } table[] = {
+                { config_parse_warn_compat,           "NOTSUPPORTED" },
+                { config_parse_int,                   "INTEGER" },
+                { config_parse_unsigned,              "UNSIGNED" },
+                { config_parse_iec_size,              "SIZE" },
+                { config_parse_iec_uint64,            "SIZE" },
+                { config_parse_si_uint64,             "SIZE" },
+                { config_parse_bool,                  "BOOLEAN" },
+                { config_parse_string,                "STRING" },
+                { config_parse_path,                  "PATH" },
+                { config_parse_unit_path_printf,      "PATH" },
+                { config_parse_colon_separated_paths, "PATH" },
+                { config_parse_strv,                  "STRING [...]" },
+                { config_parse_exec_nice,             "NICE" },
+                { config_parse_exec_oom_score_adjust, "OOMSCOREADJUST" },
+                { config_parse_exec_io_class,         "IOCLASS" },
+                { config_parse_exec_io_priority,      "IOPRIORITY" },
+                { config_parse_exec_cpu_sched_policy, "CPUSCHEDPOLICY" },
+                { config_parse_exec_cpu_sched_prio,   "CPUSCHEDPRIO" },
+                { config_parse_exec_cpu_affinity,     "CPUAFFINITY" },
+                { config_parse_mode,                  "MODE" },
+                { config_parse_unit_env_file,         "FILE" },
+                { config_parse_exec_output,           "OUTPUT" },
+                { config_parse_exec_input,            "INPUT" },
+                { config_parse_log_facility,          "FACILITY" },
+                { config_parse_log_level,             "LEVEL" },
+                { config_parse_exec_secure_bits,      "SECUREBITS" },
+                { config_parse_capability_set,        "BOUNDINGSET" },
+                { config_parse_rlimit,                "LIMIT" },
+                { config_parse_unit_deps,             "UNIT [...]" },
+                { config_parse_exec,                  "PATH [ARGUMENT [...]]" },
+                { config_parse_service_type,          "SERVICETYPE" },
+                { config_parse_service_exit_type,     "SERVICEEXITTYPE" },
+                { config_parse_service_restart,       "SERVICERESTART" },
+                { config_parse_service_restart_mode,  "SERVICERESTARTMODE" },
+                { config_parse_service_timeout_failure_mode, "TIMEOUTMODE" },
+                { config_parse_kill_mode,             "KILLMODE" },
+                { config_parse_signal,                "SIGNAL" },
+                { config_parse_socket_listen,         "SOCKET [...]" },
+                { config_parse_socket_bind,           "SOCKETBIND" },
+                { config_parse_socket_bindtodevice,   "NETWORKINTERFACE" },
+                { config_parse_sec,                   "SECONDS" },
+                { config_parse_nsec,                  "NANOSECONDS" },
+                { config_parse_namespace_path_strv,   "PATH [...]" },
+                { config_parse_bind_paths,            "PATH[:PATH[:OPTIONS]] [...]" },
+                { config_parse_unit_requires_mounts_for,
+                                                      "PATH [...]" },
+                { config_parse_exec_mount_propagation_flag,
+                                                      "MOUNTFLAG" },
+                { config_parse_unit_string_printf,    "STRING" },
+                { config_parse_trigger_unit,          "UNIT" },
+                { config_parse_timer,                 "TIMER" },
+                { config_parse_path_spec,             "PATH" },
+                { config_parse_notify_access,         "ACCESS" },
+                { config_parse_ip_tos,                "TOS" },
+                { config_parse_unit_condition_path,   "CONDITION" },
+                { config_parse_unit_condition_string, "CONDITION" },
+                { config_parse_unit_slice,            "SLICE" },
+                { config_parse_documentation,         "URL" },
+                { config_parse_service_timeout,       "SECONDS" },
+                { config_parse_emergency_action,      "ACTION" },
+                { config_parse_set_status,            "STATUS" },
+                { config_parse_service_sockets,       "SOCKETS" },
+                { config_parse_environ,               "ENVIRON" },
+#if HAVE_SECCOMP
+                { config_parse_syscall_filter,        "SYSCALLS" },
+                { config_parse_syscall_archs,         "ARCHS" },
+                { config_parse_syscall_errno,         "ERRNO" },
+                { config_parse_syscall_log,           "SYSCALLS" },
+                { config_parse_address_families,      "FAMILIES" },
+                { config_parse_restrict_namespaces,   "NAMESPACES"  },
+#endif
+                { config_parse_restrict_filesystems,  "FILESYSTEMS"  },
+                { config_parse_cpu_shares,            "SHARES" },
+                { config_parse_cg_weight,             "WEIGHT" },
+                { config_parse_cg_cpu_weight,         "CPUWEIGHT" },
+                { config_parse_memory_limit,          "LIMIT" },
+                { config_parse_device_allow,          "DEVICE" },
+                { config_parse_device_policy,         "POLICY" },
+                { config_parse_io_limit,              "LIMIT" },
+                { config_parse_io_device_weight,      "DEVICEWEIGHT" },
+                { config_parse_io_device_latency,     "DEVICELATENCY" },
+                { config_parse_blockio_bandwidth,     "BANDWIDTH" },
+                { config_parse_blockio_weight,        "WEIGHT" },
+                { config_parse_blockio_device_weight, "DEVICEWEIGHT" },
+                { config_parse_long,                  "LONG" },
+                { config_parse_socket_service,        "SERVICE" },
+#if HAVE_SELINUX
+                { config_parse_exec_selinux_context,  "LABEL" },
+#endif
+                { config_parse_job_mode,              "MODE" },
+                { config_parse_job_mode_isolate,      "BOOLEAN" },
+                { config_parse_personality,           "PERSONALITY" },
+                { config_parse_log_filter_patterns,   "REGEX" },
+        };
+
+        /* Previously printed entry, used to detect when the section prefix changes */
+        const char *prev = NULL;
+
+        assert(f);
+
+        NULSTR_FOREACH(i, load_fragment_gperf_nulstr) {
+                const char *rvalue = "OTHER", *lvalue;
+                const ConfigPerfItem *p;
+                const char *dot;
+
+                /* Every string in the nulstr is expected to be a key of the lookup table */
+                assert_se(p = load_fragment_gperf_lookup(i, strlen(i)));
+
+                /* Hide legacy settings */
+                if (p->parse == config_parse_warn_compat &&
+                    p->ltype == DISABLED_LEGACY)
+                        continue;
+
+                /* Find the symbolic description for this setting's parser, if there is one */
+                for (size_t j = 0; j < ELEMENTSOF(table); j++)
+                        if (p->parse == table[j].callback) {
+                                rvalue = table[j].rvalue;
+                                break;
+                        }
+
+                /* Entries containing a dot are split into "Section.Setting"; entries without one
+                 * are printed as-is and never start a new section header. */
+                dot = strchr(i, '.');
+                lvalue = dot ? dot + 1 : i;
+
+                if (dot) {
+                        size_t prefix_len = dot - i;
+
+                        /* Compare the section name including the dot, so that a section whose name
+                         * is merely a prefix of the previous one still gets its own header. */
+                        if (!prev || !strneq(prev, i, prefix_len+1)) {
+                                if (prev)
+                                        fputc('\n', f);
+
+                                fprintf(f, "[%.*s]\n", (int) prefix_len, i);
+                        }
+                }
+
+                fprintf(f, "%s=%s\n", lvalue, rvalue);
+                prev = i;
+        }
+}
+
+/* Config parser callback for a CPU affinity setting. 'data' points to the CPUSet that is passed to
+ * parse_cpu_set_extend() together with rvalue. The helper's result is deliberately discarded.
+ * Always returns 0. */
+int config_parse_cpu_affinity2(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CPUSet *cpu_set = ASSERT_PTR(data);
+
+        /* Errors are intentionally not propagated */
+        (void) parse_cpu_set_extend(rvalue, cpu_set, true, unit, filename, line, lvalue);
+        return 0;
+}
+
+/* Config parser callback for a show-status setting. 'data' points to the ShowStatus that is
+ * handed to parse_show_status(). Parse failures are logged and ignored. Always returns 0. */
+int config_parse_show_status(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int r;
+        ShowStatus *show_status = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_show_status(rvalue, show_status);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse show status setting, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Config parser callback for an output type where only a restricted set of values is accepted.
+ * 'data' points to the ExecOutput to update. The obsolete names "syslog" and "syslog+console" are
+ * mapped to their journal equivalents with a notice. Unknown values and the socket, fd:, file:,
+ * append: and truncate: types are logged and ignored. Always returns 0. */
+int config_parse_output_restricted(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecOutput parsed, *eo = ASSERT_PTR(data);
+        bool is_syslog, is_syslog_console;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        is_syslog = streq(rvalue, "syslog");
+        is_syslog_console = streq(rvalue, "syslog+console");
+
+        if (is_syslog || is_syslog_console) {
+                /* Obsolete spellings: translate them, but tell the user to update the configuration */
+                parsed = is_syslog ? EXEC_OUTPUT_JOURNAL : EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+
+                log_syntax(unit, LOG_NOTICE, filename, line, 0,
+                           "Standard output type %s is obsolete, automatically updating to %s. Please update your configuration.",
+                           rvalue, exec_output_to_string(parsed));
+        } else {
+                parsed = exec_output_from_string(rvalue);
+                if (parsed < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, parsed, "Failed to parse output type, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                if (IN_SET(parsed, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Standard output types socket, fd:, file:, append:, truncate: are not supported as defaults, ignoring: %s", rvalue);
+                        return 0;
+                }
+        }
+
+        *eo = parsed;
+        return 0;
+}
+
+int config_parse_crash_chvt(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        /* parse_crash_chvt() gets the raw data pointer; a failure is logged as a warning and ignored. */
+        r = parse_crash_chvt(rvalue, data);
+        if (r >= 0)
+                return 0;
+        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashChangeVT= setting, ignoring: %s", rvalue);
+        return 0;
+}
+
+int config_parse_swap_priority(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Swap *swap = ASSERT_PTR(userdata);
+        int prio, r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        /* An empty assignment puts the fragment priority back to -1 and marks it as not set. */
+        if (isempty(rvalue)) {
+                swap->parameters_fragment.priority_set = false;
+                swap->parameters_fragment.priority = -1;
+                return 0;
+        }
+
+        r = safe_atoi(rvalue, &prio);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid swap priority '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        /* Only values in the range -1..32767 are stored; anything else is logged and dropped. */
+        if (prio < -1) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Sorry, swap priorities smaller than -1 may only be assigned by the kernel itself, ignoring: %s", rvalue);
+                return 0;
+        } else if (prio > 32767) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Swap priority out of range, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        swap->parameters_fragment.priority = prio;
+        swap->parameters_fragment.priority_set = true;
+        return 0;
+}
+
+int config_parse_watchdog_sec(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        usec_t *usec = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* This is called for {Runtime,Reboot,KExec}WatchdogSec= where "default" maps to USEC_INFINITY
+         * internally and "off" maps to 0. Anything else is handed to config_parse_sec() unchanged. */
+
+        if (streq(rvalue, "default"))
+                *usec = USEC_INFINITY;
+        else if (streq(rvalue, "off"))
+                *usec = 0;
+        else
+                return config_parse_sec(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+
+        return 0;
+}
+
+int config_parse_tty_size(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        unsigned *sz = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        /* An empty assignment stores UINT_MAX; everything else goes through config_parse_unsigned(). */
+        if (isempty(rvalue)) {
+                *sz = UINT_MAX;
+                return 0;
+        }
+
+        return config_parse_unsigned(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
+int config_parse_log_filter_patterns(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const char *pattern = ASSERT_PTR(rvalue);
+        bool deny;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        /* Empty assignment resets the lists. */
+        if (isempty(pattern)) {
+                c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
+                c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
+                return 0;
+        }
+
+        /* A leading '~' selects the deny list; the pattern proper starts right behind it. */
+        deny = pattern[0] == '~';
+        if (deny) {
+                pattern++;
+                if (isempty(pattern))
+                        /* LogFilterPatterns=~ is not considered a valid pattern. */
+                        return log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                          "Regex pattern invalid, ignoring: %s=%s", lvalue, rvalue);
+        }
+
+        /* Nothing is stored when pattern_compile_and_log() rejects the pattern. */
+        if (pattern_compile_and_log(pattern, 0, NULL) < 0)
+                return 0;
+
+        r = set_put_strdup(deny ? &c->log_filter_denied_patterns : &c->log_filter_allowed_patterns,
+                           pattern);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to store log filtering pattern, ignoring: %s=%s", lvalue, rvalue);
+
+        return 0;
+}
+
+int config_parse_open_file(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(open_file_freep) OpenFile *entry = NULL;
+        OpenFile **head = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* An empty assignment hands the whole list to open_file_free_many(). */
+        if (isempty(rvalue)) {
+                open_file_free_many(head);
+                return 0;
+        }
+
+        r = open_file_parse(rvalue, &entry);
+        if (r >= 0)
+                /* TAKE_PTR() keeps the cleanup handler away from the entry that is now linked in. */
+                LIST_APPEND(open_files, *head, TAKE_PTR(entry));
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse OpenFile= setting, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+int config_parse_cgroup_nft_set(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CGroupContext *cgroup = ASSERT_PTR(data);
+        Unit *owner = ASSERT_PTR(userdata);
+        /* Thin adapter: point config_parse_nft_set() at the nft_set_context member of the CGroupContext. */
+        return config_parse_nft_set(unit, filename, line, section, section_line, lvalue, ltype, rvalue, &cgroup->nft_set_context, owner);
+}
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
new file mode 100644
index 0000000..6919805
--- /dev/null
+++ b/src/core/load-fragment.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "unit.h"
+
+/* These functions are declared in the header to make them accessible to unit tests. */
+bool contains_instance_specifier_superset(const char *s);
+int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format);
+
+/* Config-parsing helpers relevant only for sources under src/core/ */
+int parse_crash_chvt(const char *value, int *data);
+int parse_confirm_spawn(const char *value, char **console);
+
+int hashmap_put_credential(Hashmap **h, const char *id, const char *path, bool encrypted);
+
+/* Read service data from .desktop file style configuration fragments */
+
+int unit_load_fragment(Unit *u);
+
+void unit_dump_config_items(FILE *f);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps);
+CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_colon_separated_paths);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_strv_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_documentation);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_listen);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_protocol);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_bind);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_coredump_filter);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_abort);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_failure_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_type);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_exit_type);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_restart);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_restart_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_output);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_text);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_data);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_class);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_priority);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_prio);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_affinity);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_apivfs);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_secure_bits);
+CONFIG_PARSER_PROTOTYPE(config_parse_root_image_options);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash_sig);
+CONFIG_PARSER_PROTOTYPE(config_parse_capability_set);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_propagation_flag);
+CONFIG_PARSER_PROTOTYPE(config_parse_timer);
+CONFIG_PARSER_PROTOTYPE(config_parse_trigger_unit);
+CONFIG_PARSER_PROTOTYPE(config_parse_path_spec);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_service);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_sockets);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_env_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_tos);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_path);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string);
+CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_notify_access);
+CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_log);
+CONFIG_PARSER_PROTOTYPE(config_parse_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_pass_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_unset_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_slice);
+CONFIG_PARSER_PROTOTYPE(config_parse_cg_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_cg_cpu_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
+CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
+CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_device_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_device_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_bandwidth);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_mode_isolate);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_selinux_context);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_apparmor_profile);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_smack_process_label);
+CONFIG_PARSER_PROTOTYPE(config_parse_address_families);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_preserve_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_directories);
+CONFIG_PARSER_PROTOTYPE(config_parse_set_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_load_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_import_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_set_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
+CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
+CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_home);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_system);
+CONFIG_PARSER_PROTOTYPE(config_parse_bus_name);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_utmp_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_working_directory);
+CONFIG_PARSER_PROTOTYPE(config_parse_fdname);
+CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
+CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_namespace);
+CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
+CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity2);
+CONFIG_PARSER_PROTOTYPE(config_parse_show_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_status_unit_format);
+CONFIG_PARSER_PROTOTYPE(config_parse_output_restricted);
+CONFIG_PARSER_PROTOTYPE(config_parse_crash_chvt);
+CONFIG_PARSER_PROTOTYPE(config_parse_timeout_abort);
+CONFIG_PARSER_PROTOTYPE(config_parse_swap_priority);
+CONFIG_PARSER_PROTOTYPE(config_parse_mount_images);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_timestamping);
+CONFIG_PARSER_PROTOTYPE(config_parse_extension_images);
+CONFIG_PARSER_PROTOTYPE(config_parse_bpf_foreign_program);
+CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_socket_bind);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_network_interfaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_tty_size);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns);
+CONFIG_PARSER_PROTOTYPE(config_parse_open_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_memory_pressure_watch);
+CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_nft_set);
+
+/* gperf prototypes */
+const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
+extern const char load_fragment_gperf_nulstr[];
diff --git a/src/core/main.c b/src/core/main.c
new file mode 100644
index 0000000..3f71cc0
--- /dev/null
+++ b/src/core/main.c
@@ -0,0 +1,3227 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#if HAVE_VALGRIND_VALGRIND_H
+#  include 
+#endif
+
+#include "sd-bus.h"
+#include "sd-daemon.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "apparmor-setup.h"
+#include "architecture.h"
+#include "argv-util.h"
+#if HAVE_LIBBPF
+#include "bpf-lsm.h"
+#endif
+#include "build.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "capability-util.h"
+#include "cgroup-util.h"
+#include "chase.h"
+#include "clock-util.h"
+#include "conf-parser.h"
+#include "confidential-virt.h"
+#include "copy.h"
+#include "cpu-set-util.h"
+#include "crash-handler.h"
+#include "dbus-manager.h"
+#include "dbus.h"
+#include "constants.h"
+#include "dev-setup.h"
+#include "efi-random.h"
+#include "efivars.h"
+#include "emergency-action.h"
+#include "env-util.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fdset.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "getopt-defs.h"
+#include "hexdecoct.h"
+#include "hostname-setup.h"
+#include "ima-setup.h"
+#include "import-creds.h"
+#include "initrd-util.h"
+#include "killall.h"
+#include "kmod-setup.h"
+#include "limits-util.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "loopback-setup.h"
+#include "machine-id-setup.h"
+#include "main.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "manager-serialize.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "os-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "random-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "selinux-setup.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "smack-setup.h"
+#include "special.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "switch-root.h"
+#include "sysctl-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+#include "version.h"
+#include "virt.h"
+#include "watchdog.h"
+
+#if HAS_FEATURE_ADDRESS_SANITIZER
+#include 
+#endif
+
+static enum {
+        ACTION_RUN,
+        ACTION_HELP,
+        ACTION_VERSION,
+        ACTION_TEST,
+        ACTION_DUMP_CONFIGURATION_ITEMS,
+        ACTION_DUMP_BUS_PROPERTIES,
+        ACTION_BUS_INTROSPECT,
+} arg_action = ACTION_RUN; /* starts out as ACTION_RUN; presumably changed by option parsing (not visible here) */
+
+static const char *arg_bus_introspect = NULL;
+
+/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access.  Real
+ * defaults are assigned in reset_arguments() below. */
+static char *arg_default_unit;
+static RuntimeScope arg_runtime_scope;
+bool arg_dump_core;     /* set by parse_proc_cmdline_item() for systemd.dump_core; NOTE(review): not static, presumably shared with another file - confirm */
+int arg_crash_chvt;     /* set by parse_proc_cmdline_item() for systemd.crash_chvt (0 when given without a value) */
+bool arg_crash_shell;   /* set by parse_proc_cmdline_item() for systemd.crash_shell */
+bool arg_crash_reboot;  /* set by parse_proc_cmdline_item() for systemd.crash_reboot */
+static char *arg_confirm_spawn;
+static ShowStatus arg_show_status;
+static StatusUnitFormat arg_status_unit_format;
+static bool arg_switched_root;
+static PagerFlags arg_pager_flags;
+static bool arg_service_watchdogs;
+static UnitDefaults arg_defaults;
+static usec_t arg_runtime_watchdog;
+static usec_t arg_reboot_watchdog;
+static usec_t arg_kexec_watchdog;
+static usec_t arg_pretimeout_watchdog;
+static char *arg_early_core_pattern;
+static char *arg_watchdog_pretimeout_governor;
+static char *arg_watchdog_device;
+static char **arg_default_environment;
+static char **arg_manager_environment;
+static uint64_t arg_capability_bounding_set;
+static bool arg_no_new_privs;
+static nsec_t arg_timer_slack_nsec;
+static Set* arg_syscall_archs;
+static FILE* arg_serialization;
+static sd_id128_t arg_machine_id;
+static EmergencyAction arg_cad_burst_action;
+static CPUSet arg_cpu_affinity;
+static NUMAPolicy arg_numa_policy;
+static usec_t arg_clock_usec;
+static void *arg_random_seed;
+static size_t arg_random_seed_size;
+static usec_t arg_reload_limit_interval_sec;
+static unsigned arg_reload_limit_burst;
+
+/* A copy of the original environment block */
+static char **saved_env = NULL;
+
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+                               const struct rlimit *saved_rlimit_memlock);
+
+static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
+        _cleanup_free_ char *user_dir = NULL;
+        _cleanup_strv_free_ char **file_list = NULL, **dir_list = NULL;
+        int r;
+
+        r = xdg_user_config_dir(&user_dir, "/systemd");
+        if (r < 0)
+                return r;
+
+        /* Files: user.conf below the directory from xdg_user_config_dir(), then the one in PKGSYSCONFDIR. */
+        r = strv_extendf(&file_list, "%s/user.conf", user_dir);
+        if (r < 0)
+                return r;
+        r = strv_extend(&file_list, PKGSYSCONFDIR "/user.conf");
+        if (r < 0)
+                return r;
+
+        /* Directories: that same user directory (handed over to the list), then CONF_PATHS_STRV("systemd"). */
+        r = strv_consume(&dir_list, TAKE_PTR(user_dir));
+        if (r < 0)
+                return r;
+        r = strv_extend_strv(&dir_list, CONF_PATHS_STRV("systemd"), false);
+        if (r < 0)
+                return r;
+
+        *ret_files = TAKE_PTR(file_list);
+        *ret_dirs = TAKE_PTR(dir_list);
+        return 0;
+}
+
+static int console_setup(void) {
+        _cleanup_close_ int console_fd = -EBADF;
+        unsigned n_rows, n_cols;
+        int r;
+
+        console_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+        if (console_fd < 0)
+                return log_error_errno(console_fd, "Failed to open /dev/console: %m");
+
+        /* Text mode is deliberately not forced here: plymouth may already be showing pictures
+         * from the initrd. */
+        r = reset_terminal_fd(console_fd, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset /dev/console: %m");
+
+        /* Applying a terminal size is best effort: failures are logged as warnings only. */
+        r = proc_cmdline_tty_size("/dev/console", &n_rows, &n_cols);
+        if (r >= 0) {
+                r = terminal_set_size_fd(console_fd, NULL, n_rows, n_cols);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set terminal size, ignoring: %m");
+        } else
+                log_warning_errno(r, "Failed to get terminal size, ignoring: %m");
+
+        return 0;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        int r;
+
+        assert(key);
+
+        if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+                        log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
+                else if (in_initrd() == !!startswith(key, "rd."))
+                        return free_and_strdup_warn(&arg_default_unit, value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
+                else
+                        arg_dump_core = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (path_is_absolute(value))
+                        (void) parse_path_argument(value, false, &arg_early_core_pattern);
+                else
+                        log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
+
+                if (!value)
+                        arg_crash_chvt = 0; /* turn on */
+                else {
+                        r = parse_crash_chvt(value, &arg_crash_chvt);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
+                else
+                        arg_crash_shell = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
+                else
+                        arg_crash_reboot = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
+                char *s;
+
+                r = parse_confirm_spawn(value, &s);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
+                else
+                        free_and_replace(arg_confirm_spawn, s);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
+                else
+                        arg_service_watchdogs = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
+
+                if (value) {
+                        r = parse_show_status(value, &arg_show_status);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
+                } else
+                        arg_show_status = SHOW_STATUS_YES;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = status_unit_format_from_string(value);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
+                else
+                        arg_status_unit_format = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = exec_output_from_string(value);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
+                else
+                        arg_defaults.std_output = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = exec_output_from_string(value);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
+                else
+                        arg_defaults.std_error = r;
+
+        } else if (streq(key, "systemd.setenv")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (!env_assignment_is_valid(value))
+                        log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
+                else {
+                        r = strv_env_replace_strdup(&arg_default_environment, value);
+                        if (r < 0)
+                                return log_oom();
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = id128_from_string_nonzero(value, &arg_machine_id);
+                if (r < 0)
+                        log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_sec(value, &arg_defaults.timeout_start_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
+
+                if (arg_defaults.timeout_start_usec <= 0)
+                        arg_defaults.timeout_start_usec = USEC_INFINITY;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_device_timeout_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_sec(value, &arg_defaults.device_timeout_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default device timeout '%s', ignoring: %m", value);
+
+                if (arg_defaults.device_timeout_usec <= 0)
+                        arg_defaults.device_timeout_usec = USEC_INFINITY;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_cpu_set(value, &arg_cpu_affinity);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                (void) parse_path_argument(value, false, &arg_watchdog_device);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "default"))
+                        arg_runtime_watchdog = USEC_INFINITY;
+                else if (streq(value, "off"))
+                        arg_runtime_watchdog = 0;
+                else {
+                        r = parse_sec(value, &arg_runtime_watchdog);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value);
+                                return 0;
+                        }
+                }
+
+                arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "default"))
+                        arg_pretimeout_watchdog = USEC_INFINITY;
+                else if (streq(value, "off"))
+                        arg_pretimeout_watchdog = 0;
+                else {
+                        r = parse_sec(value, &arg_pretimeout_watchdog);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value);
+                                return 0;
+                        }
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) {
+
+                if (proc_cmdline_value_missing(key, value) || isempty(value)) {
+                        arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
+                        return 0;
+                }
+
+                if (!string_is_safe(value)) {
+                        log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value);
+                        return 0;
+                }
+
+                return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = safe_atou64(value, &arg_clock_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
+                void *p;
+                size_t sz;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = unbase64mem(value, SIZE_MAX, &p, &sz);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
+
+                free(arg_random_seed);
+                arg_random_seed = sz > 0 ? p : mfree(p);
+                arg_random_seed_size = sz;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_interval_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_sec(value, &arg_reload_limit_interval_sec);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse systemd.reload_limit_interval_sec= argument '%s', ignoring: %m", value);
+                        return 0;
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_burst")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = safe_atou(value, &arg_reload_limit_burst);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse systemd.reload_limit_burst= argument '%s', ignoring: %m", value);
+                        return 0;
+                }
+
+        } else if (streq(key, "quiet") && !value) {
+
+                if (arg_show_status == _SHOW_STATUS_INVALID)
+                        arg_show_status = SHOW_STATUS_ERROR;
+
+        } else if (streq(key, "debug") && !value) {
+
+                /* Note that log_parse_environment() handles 'debug'
+                 * too, and sets the log level to LOG_DEBUG. */
+
+                if (detect_container() > 0)
+                        log_set_target(LOG_TARGET_CONSOLE);
+
+        } else if (!value) {
+                const char *target;
+
+                /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
+                target = runlevel_to_target(key);
+                if (target)
+                        return free_and_strdup_warn(&arg_default_unit, target);
+        }
+
+        return 0;
+}
+
+/* Generates a configuration parser callback called 'name'. The callback passes the right-hand side
+ * of the assignment to 'func' and, if that returns a negative error, logs a syntax error that
+ * mentions 'descr' and the offending value. The callback itself always returns 0, i.e. an invalid
+ * value is reported but never propagated to the caller as a failure.
+ *
+ * The parameter list matches the other config_parse_*() callbacks in this file; 'section',
+ * 'section_line', 'ltype', 'data' and 'userdata' are accepted but not used by the generated body. */
+#define DEFINE_SETTER(name, func, descr)                              \
+        static int name(const char *unit,                             \
+                        const char *filename,                         \
+                        unsigned line,                                \
+                        const char *section,                          \
+                        unsigned section_line,                        \
+                        const char *lvalue,                           \
+                        int ltype,                                    \
+                        const char *rvalue,                           \
+                        void *data,                                   \
+                        void *userdata) {                             \
+                                                                      \
+                int r;                                                \
+                                                                      \
+                assert(filename);                                     \
+                assert(lvalue);                                       \
+                assert(rvalue);                                       \
+                                                                      \
+                r = func(rvalue);                                     \
+                if (r < 0)                                            \
+                        log_syntax(unit, LOG_ERR, filename, line, r,  \
+                                   "Invalid " descr " '%s': %m",      \
+                                   rvalue);                           \
+                                                                      \
+                return 0;                                             \
+        }
+
+/* Callbacks for the logging related settings. Each one simply forwards the configured string to the
+ * log_*_from_string() helper named in the second argument. */
+DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
+DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
+DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
+DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
+DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
+
+/* Configuration callback used for DefaultTimeoutAbortSec=. It delegates the actual parsing to
+ * config_parse_timeout_abort(), with arg_defaults.timeout_abort_usec as the destination, and when
+ * that call does not fail it stores the call's return value in arg_defaults.timeout_abort_set.
+ * The 'data' argument is not used. Always returns 0. */
+static int config_parse_default_timeout_abort(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int k;
+
+        k = config_parse_timeout_abort(
+                        unit, filename, line,
+                        section, section_line,
+                        lvalue, ltype, rvalue,
+                        &arg_defaults.timeout_abort_usec,
+                        userdata);
+        if (k < 0)
+                return 0; /* Parsing failed: leave the "set" flag as it was. */
+
+        arg_defaults.timeout_abort_set = k;
+        return 0;
+}
+
+/* Configuration callback used for DefaultOOMScoreAdjust=. An empty value clears
+ * arg_defaults.oom_score_adjust_set. Any other value is run through parse_oom_score_adjust(); on
+ * success the result is stored in arg_defaults.oom_score_adjust and the "set" flag is raised, on
+ * failure a warning is logged and the previous state is kept. Always returns 0. */
+static int config_parse_oom_score_adjust(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int adjust, r;
+
+        if (!isempty(rvalue)) {
+                r = parse_oom_score_adjust(rvalue, &adjust);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue);
+                        return 0;
+                }
+
+                arg_defaults.oom_score_adjust = adjust;
+                arg_defaults.oom_score_adjust_set = true;
+        } else
+                /* Empty assignment: forget any previously configured value. */
+                arg_defaults.oom_score_adjust_set = false;
+
+        return 0;
+}
+
+/* Loads the [Manager] section of the manager configuration into the arg_* globals.
+ *
+ * For the system scope "system.conf" is parsed via config_parse_config_file(). For the user scope the
+ * files and directories reported by manager_find_user_config_paths() are parsed via
+ * config_parse_many(), with "user.conf.d" as drop-in directory name. The result of the parsing calls
+ * themselves is deliberately ignored (they are invoked with CONFIG_PARSE_WARN).
+ *
+ * Returns 0 on success, or a negative errno-style error if the user configuration paths cannot be
+ * determined. */
+static int parse_config_file(void) {
+        /* Maps each supported key to its parser callback, the callback's 'ltype' argument and the
+         * variable the parsed value is written to. */
+        const ConfigTableItem items[] = {
+                { "Manager", "LogLevel",                     config_parse_level2,                0,                        NULL                              },
+                { "Manager", "LogTarget",                    config_parse_target,                0,                        NULL                              },
+                { "Manager", "LogColor",                     config_parse_color,                 0,                        NULL                              },
+                { "Manager", "LogLocation",                  config_parse_location,              0,                        NULL                              },
+                { "Manager", "LogTime",                      config_parse_time,                  0,                        NULL                              },
+                { "Manager", "DumpCore",                     config_parse_bool,                  0,                        &arg_dump_core                    },
+                { "Manager", "CrashChVT", /* legacy */       config_parse_crash_chvt,            0,                        &arg_crash_chvt                   },
+                { "Manager", "CrashChangeVT",                config_parse_crash_chvt,            0,                        &arg_crash_chvt                   },
+                { "Manager", "CrashShell",                   config_parse_bool,                  0,                        &arg_crash_shell                  },
+                { "Manager", "CrashReboot",                  config_parse_bool,                  0,                        &arg_crash_reboot                 },
+                { "Manager", "ShowStatus",                   config_parse_show_status,           0,                        &arg_show_status                  },
+                { "Manager", "StatusUnitFormat",             config_parse_status_unit_format,    0,                        &arg_status_unit_format           },
+                { "Manager", "CPUAffinity",                  config_parse_cpu_affinity2,         0,                        &arg_cpu_affinity                 },
+                { "Manager", "NUMAPolicy",                   config_parse_numa_policy,           0,                        &arg_numa_policy.type             },
+                { "Manager", "NUMAMask",                     config_parse_numa_mask,             0,                        &arg_numa_policy                  },
+                { "Manager", "JoinControllers",              config_parse_warn_compat,           DISABLED_CONFIGURATION,   NULL                              },
+                { "Manager", "RuntimeWatchdogSec",           config_parse_watchdog_sec,          0,                        &arg_runtime_watchdog             },
+                { "Manager", "RuntimeWatchdogPreSec",        config_parse_watchdog_sec,          0,                        &arg_pretimeout_watchdog          },
+                { "Manager", "RebootWatchdogSec",            config_parse_watchdog_sec,          0,                        &arg_reboot_watchdog              },
+                { "Manager", "ShutdownWatchdogSec",          config_parse_watchdog_sec,          0,                        &arg_reboot_watchdog              }, /* obsolete alias */
+                { "Manager", "KExecWatchdogSec",             config_parse_watchdog_sec,          0,                        &arg_kexec_watchdog               },
+                { "Manager", "WatchdogDevice",               config_parse_path,                  0,                        &arg_watchdog_device              },
+                { "Manager", "RuntimeWatchdogPreGovernor",   config_parse_string,                CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
+                { "Manager", "CapabilityBoundingSet",        config_parse_capability_set,        0,                        &arg_capability_bounding_set      },
+                { "Manager", "NoNewPrivileges",              config_parse_bool,                  0,                        &arg_no_new_privs                 },
+#if HAVE_SECCOMP
+                { "Manager", "SystemCallArchitectures",      config_parse_syscall_archs,         0,                        &arg_syscall_archs                },
+#else
+                { "Manager", "SystemCallArchitectures",      config_parse_warn_compat,           DISABLED_CONFIGURATION,   NULL                              },
+#endif
+                { "Manager", "TimerSlackNSec",               config_parse_nsec,                  0,                        &arg_timer_slack_nsec             },
+                { "Manager", "DefaultTimerAccuracySec",      config_parse_sec,                   0,                        &arg_defaults.timer_accuracy_usec },
+                { "Manager", "DefaultStandardOutput",        config_parse_output_restricted,     0,                        &arg_defaults.std_output          },
+                { "Manager", "DefaultStandardError",         config_parse_output_restricted,     0,                        &arg_defaults.std_error           },
+                { "Manager", "DefaultTimeoutStartSec",       config_parse_sec,                   0,                        &arg_defaults.timeout_start_usec  },
+                { "Manager", "DefaultTimeoutStopSec",        config_parse_sec,                   0,                        &arg_defaults.timeout_stop_usec   },
+                { "Manager", "DefaultTimeoutAbortSec",       config_parse_default_timeout_abort, 0,                        NULL                              },
+                { "Manager", "DefaultDeviceTimeoutSec",      config_parse_sec,                   0,                        &arg_defaults.device_timeout_usec },
+                { "Manager", "DefaultRestartSec",            config_parse_sec,                   0,                        &arg_defaults.restart_usec        },
+                { "Manager", "DefaultStartLimitInterval",    config_parse_sec,                   0,                        &arg_defaults.start_limit_interval}, /* obsolete alias */
+                { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec,                   0,                        &arg_defaults.start_limit_interval},
+                { "Manager", "DefaultStartLimitBurst",       config_parse_unsigned,              0,                        &arg_defaults.start_limit_burst   },
+                { "Manager", "DefaultEnvironment",           config_parse_environ,               arg_runtime_scope,        &arg_default_environment          },
+                { "Manager", "ManagerEnvironment",           config_parse_environ,               arg_runtime_scope,        &arg_manager_environment          },
+                { "Manager", "DefaultLimitCPU",              config_parse_rlimit,                RLIMIT_CPU,               arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitFSIZE",            config_parse_rlimit,                RLIMIT_FSIZE,             arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitDATA",             config_parse_rlimit,                RLIMIT_DATA,              arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitSTACK",            config_parse_rlimit,                RLIMIT_STACK,             arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitCORE",             config_parse_rlimit,                RLIMIT_CORE,              arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitRSS",              config_parse_rlimit,                RLIMIT_RSS,               arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitNOFILE",           config_parse_rlimit,                RLIMIT_NOFILE,            arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitAS",               config_parse_rlimit,                RLIMIT_AS,                arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitNPROC",            config_parse_rlimit,                RLIMIT_NPROC,             arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitMEMLOCK",          config_parse_rlimit,                RLIMIT_MEMLOCK,           arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitLOCKS",            config_parse_rlimit,                RLIMIT_LOCKS,             arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitSIGPENDING",       config_parse_rlimit,                RLIMIT_SIGPENDING,        arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitMSGQUEUE",         config_parse_rlimit,                RLIMIT_MSGQUEUE,          arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitNICE",             config_parse_rlimit,                RLIMIT_NICE,              arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitRTPRIO",           config_parse_rlimit,                RLIMIT_RTPRIO,            arg_defaults.rlimit               },
+                { "Manager", "DefaultLimitRTTIME",           config_parse_rlimit,                RLIMIT_RTTIME,            arg_defaults.rlimit               },
+                { "Manager", "DefaultCPUAccounting",         config_parse_bool,                  0,                        &arg_defaults.cpu_accounting      },
+                { "Manager", "DefaultIOAccounting",          config_parse_bool,                  0,                        &arg_defaults.io_accounting       },
+                { "Manager", "DefaultIPAccounting",          config_parse_bool,                  0,                        &arg_defaults.ip_accounting       },
+                { "Manager", "DefaultBlockIOAccounting",     config_parse_bool,                  0,                        &arg_defaults.blockio_accounting  },
+                { "Manager", "DefaultMemoryAccounting",      config_parse_bool,                  0,                        &arg_defaults.memory_accounting   },
+                { "Manager", "DefaultTasksAccounting",       config_parse_bool,                  0,                        &arg_defaults.tasks_accounting    },
+                { "Manager", "DefaultTasksMax",              config_parse_tasks_max,             0,                        &arg_defaults.tasks_max           },
+                { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec,              0,                        &arg_defaults.memory_pressure_threshold_usec },
+                { "Manager", "DefaultMemoryPressureWatch",   config_parse_memory_pressure_watch, 0,                        &arg_defaults.memory_pressure_watch },
+                { "Manager", "CtrlAltDelBurstAction",        config_parse_emergency_action,      arg_runtime_scope,        &arg_cad_burst_action             },
+                { "Manager", "DefaultOOMPolicy",             config_parse_oom_policy,            0,                        &arg_defaults.oom_policy          },
+                { "Manager", "DefaultOOMScoreAdjust",        config_parse_oom_score_adjust,      0,                        NULL                              },
+                { "Manager", "ReloadLimitIntervalSec",       config_parse_sec,                   0,                        &arg_reload_limit_interval_sec    },
+                { "Manager", "ReloadLimitBurst",             config_parse_unsigned,              0,                        &arg_reload_limit_burst           },
+#if ENABLE_SMACK
+                { "Manager", "DefaultSmackProcessLabel",     config_parse_string,                0,                        &arg_defaults.smack_process_label },
+#else
+                { "Manager", "DefaultSmackProcessLabel",     config_parse_warn_compat,           DISABLED_CONFIGURATION,   NULL                              },
+#endif
+                {}
+        };
+
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
+                (void) config_parse_config_file("system.conf",
+                                                "Manager\0",
+                                                config_item_table_lookup, items,
+                                                CONFIG_PARSE_WARN,
+                                                NULL);
+        else {
+                _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
+                int r;
+
+                assert(arg_runtime_scope == RUNTIME_SCOPE_USER);
+
+                r = manager_find_user_config_paths(&files, &dirs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine config file paths: %m");
+
+                (void) config_parse_many(
+                                (const char* const*) files,
+                                (const char* const*) dirs,
+                                "user.conf.d",
+                                /* root = */ NULL,
+                                "Manager\0",
+                                config_item_table_lookup, items,
+                                CONFIG_PARSE_WARN,
+                                NULL, NULL, NULL);
+        }
+
+        /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
+         * USEC_INFINITY like everywhere else. The device timeout gets the same treatment, so that
+         * DefaultDeviceTimeoutSec=0 behaves like systemd.default_device_timeout_sec=0 on the kernel
+         * command line instead of yielding a literal zero timeout. */
+        if (arg_defaults.timeout_start_usec <= 0)
+                arg_defaults.timeout_start_usec = USEC_INFINITY;
+        if (arg_defaults.timeout_stop_usec <= 0)
+                arg_defaults.timeout_stop_usec = USEC_INFINITY;
+        if (arg_defaults.device_timeout_usec <= 0)
+                arg_defaults.device_timeout_usec = USEC_INFINITY;
+
+        return 0;
+}
+
+/* Copies the default unit property settings into the manager object. These do not change how the
+ * manager itself behaves; they are what newly allocated units start out with unless they configure
+ * something else. (Settings that affect the manager's own behaviour are handled by
+ * set_manager_settings().)
+ *
+ * Every step is best-effort: a failure is logged as a warning and the remaining steps still run. */
+static void set_manager_defaults(Manager *m) {
+        int rc;
+
+        assert(m);
+
+        /* Unit defaults collected in arg_defaults */
+        rc = manager_set_unit_defaults(m, &arg_defaults);
+        if (rc < 0)
+                log_warning_errno(rc, "Failed to set manager defaults, ignoring: %m");
+
+        /* Built-in default environment first ... */
+        rc = manager_default_environment(m);
+        if (rc < 0)
+                log_warning_errno(rc, "Failed to set manager default environment, ignoring: %m");
+
+        /* ... then whatever was configured via arg_default_environment on top. */
+        rc = manager_transient_environment_add(m, arg_default_environment);
+        if (rc < 0)
+                log_warning_errno(rc, "Failed to add to transient environment, ignoring: %m");
+}
+
+/* Copies the manager settings into the manager object, i.e. the properties that affect the manager
+ * itself, as opposed to the defaults that are merely inherited by newly allocated units (those are
+ * handled by set_manager_defaults()). */
+static void set_manager_settings(Manager *m) {
+        int rc;
+
+        assert(m);
+
+        /* Reload rate limit: the two fields are assigned individually rather than through a
+         * structured initializer, since the latter would reset the rate limit counter on every
+         * daemon-reload. */
+        m->reload_ratelimit.interval = arg_reload_limit_interval_sec;
+        m->reload_ratelimit.burst = arg_reload_limit_burst;
+
+        /* Plain settings that are copied over verbatim */
+        m->cad_burst_action = arg_cad_burst_action;
+        m->service_watchdogs = arg_service_watchdogs;
+        m->confirm_spawn = arg_confirm_spawn;
+
+        /* Watchdog timeouts and pretimeout governor */
+        manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
+        manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
+        manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
+        manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog);
+
+        rc = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor);
+        if (rc < 0)
+                log_warning_errno(rc, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor);
+
+        /* Status output */
+        manager_set_show_status(m, arg_show_status, "command line");
+        m->status_unit_format = arg_status_unit_format;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                COMMON_GETOPT_ARGS,
+                SYSTEMD_GETOPT_ARGS,
+        };
+
+        static const struct option options[] = {
+                COMMON_GETOPT_OPTIONS,
+                SYSTEMD_GETOPT_OPTIONS,
+                {}
+        };
+
+        int c, r;
+        bool user_arg_seen = false;
+
+        assert(argc >= 1);
+        assert(argv);
+
+        if (getpid_cached() == 1)
+                opterr = 0;
+
+        while ((c = getopt_long(argc, argv, SYSTEMD_GETOPT_SHORT_OPTIONS, options, NULL)) >= 0)
+
+                switch (c) {
+
+                case ARG_LOG_LEVEL:
+                        r = log_set_max_level_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
+
+                        break;
+
+                case ARG_LOG_TARGET:
+                        r = log_set_target_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
+
+                        break;
+
+                case ARG_LOG_COLOR:
+
+                        if (optarg) {
+                                r = log_show_color_from_string(optarg);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
+                                                               optarg);
+                        } else
+                                log_show_color(true);
+
+                        break;
+
+                case ARG_LOG_LOCATION:
+                        if (optarg) {
+                                r = log_show_location_from_string(optarg);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
+                                                               optarg);
+                        } else
+                                log_show_location(true);
+
+                        break;
+
+                case ARG_LOG_TIME:
+
+                        if (optarg) {
+                                r = log_show_time_from_string(optarg);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
+                                                               optarg);
+                        } else
+                                log_show_time(true);
+
+                        break;
+
+                case ARG_DEFAULT_STD_OUTPUT:
+                        r = exec_output_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
+                                                       optarg);
+                        arg_defaults.std_output = r;
+                        break;
+
+                case ARG_DEFAULT_STD_ERROR:
+                        r = exec_output_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
+                                                       optarg);
+                        arg_defaults.std_error = r;
+                        break;
+
+                case ARG_UNIT:
+                        r = free_and_strdup(&arg_default_unit, optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
+
+                        break;
+
+                case ARG_SYSTEM:
+                        arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+                        break;
+
+                case ARG_USER:
+                        arg_runtime_scope = RUNTIME_SCOPE_USER;
+                        user_arg_seen = true;
+                        break;
+
+                case ARG_TEST:
+                        arg_action = ACTION_TEST;
+                        break;
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_VERSION:
+                        arg_action = ACTION_VERSION;
+                        break;
+
+                case ARG_DUMP_CONFIGURATION_ITEMS:
+                        arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
+                        break;
+
+                case ARG_DUMP_BUS_PROPERTIES:
+                        arg_action = ACTION_DUMP_BUS_PROPERTIES;
+                        break;
+
+                case ARG_BUS_INTROSPECT:
+                        arg_bus_introspect = optarg;
+                        arg_action = ACTION_BUS_INTROSPECT;
+                        break;
+
+                case ARG_DUMP_CORE:
+                        r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_CRASH_CHVT:
+                        r = parse_crash_chvt(optarg, &arg_crash_chvt);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
+                                                       optarg);
+                        break;
+
+                case ARG_CRASH_SHELL:
+                        r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_CRASH_REBOOT:
+                        r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_CONFIRM_SPAWN:
+                        arg_confirm_spawn = mfree(arg_confirm_spawn);
+
+                        r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
+                                                       optarg);
+                        break;
+
+                case ARG_SERVICE_WATCHDOGS:
+                        r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_SHOW_STATUS:
+                        if (optarg) {
+                                r = parse_show_status(optarg, &arg_show_status);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
+                                                               optarg);
+                        } else
+                                arg_show_status = SHOW_STATUS_YES;
+                        break;
+
+                case ARG_DESERIALIZE: {
+                        int fd;
+                        FILE *f;
+
+                        fd = parse_fd(optarg);
+                        if (fd < 0)
+                                return log_error_errno(fd, "Failed to parse serialization fd \"%s\": %m", optarg);
+
+                        (void) fd_cloexec(fd, true);
+
+                        f = fdopen(fd, "r");
+                        if (!f)
+                                return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
+
+                        safe_fclose(arg_serialization);
+                        arg_serialization = f;
+
+                        break;
+                }
+
+                case ARG_SWITCHED_ROOT:
+                        arg_switched_root = true;
+                        break;
+
+                case ARG_MACHINE_ID:
+                        r = id128_from_string_nonzero(optarg, &arg_machine_id);
+                        if (r < 0)
+                                return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
+                        break;
+
+                case 'h':
+                        arg_action = ACTION_HELP;
+                        break;
+
+                case 'D':
+                        log_set_max_level(LOG_DEBUG);
+                        break;
+
+                case 'b':
+                case 's':
+                case 'z':
+                        /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
+                         * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
+                         */
+                case '?':
+                        if (getpid_cached() != 1)
+                                return -EINVAL;
+                        else
+                                return 0;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (optind < argc && getpid_cached() != 1)
+                /* Hmm, when we aren't run as init system let's complain about excess arguments */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
+
+        if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_USER && !user_arg_seen)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Explicit --user argument required to run as user manager.");
+
+        return 0;
+}
+
+static int help(void) {
+        /* Prints the usage text to stdout; the closing "See the ..." sentence is filled in with the
+         * string terminal_urlify_man() produces for the systemd(1) man page. Returns 0 on success. */
+        _cleanup_free_ char *man_ref = NULL;
+
+        if (terminal_urlify_man("systemd", "1", &man_ref) < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "%sStarts and monitors system and user services.%s\n\n"
+               "This program takes no positional arguments.\n\n"
+               "%sOptions%s:\n"
+               "  -h --help                      Show this help\n"
+               "     --version                   Show version\n"
+               "     --test                      Determine initial transaction, dump it and exit\n"
+               "     --system                    Combined with --test: operate in system mode\n"
+               "     --user                      Combined with --test: operate in user mode\n"
+               "     --dump-configuration-items  Dump understood unit configuration items\n"
+               "     --dump-bus-properties       Dump exposed bus properties\n"
+               "     --bus-introspect=PATH       Write XML introspection data\n"
+               "     --unit=UNIT                 Set default unit\n"
+               "     --dump-core[=BOOL]          Dump core on crash\n"
+               "     --crash-vt=NR               Change to specified VT on crash\n"
+               "     --crash-reboot[=BOOL]       Reboot on crash\n"
+               "     --crash-shell[=BOOL]        Run shell on crash\n"
+               "     --confirm-spawn[=BOOL]      Ask for confirmation when spawning processes\n"
+               "     --show-status[=BOOL]        Show status updates on the console during boot\n"
+               "     --log-target=TARGET         Set log target (console, journal, kmsg,\n"
+               "                                                 journal-or-kmsg, null)\n"
+               "     --log-level=LEVEL           Set log level (debug, info, notice, warning,\n"
+               "                                                err, crit, alert, emerg)\n"
+               "     --log-color[=BOOL]          Highlight important log messages\n"
+               "     --log-location[=BOOL]       Include code location in log messages\n"
+               "     --log-time[=BOOL]           Prefix log messages with current time\n"
+               "     --default-standard-output=  Set default standard output for services\n"
+               "     --default-standard-error=   Set default standard error output for services\n"
+               "     --no-pager                  Do not pipe output into a pager\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),  /* wraps the one-line summary */
+               ansi_normal(),
+               ansi_underline(),  /* wraps the "Options" heading */
+               ansi_normal(),
+               man_ref);
+
+        return 0;
+}
+
+static int prepare_reexecute(
+                Manager *m,
+                FILE **ret_f,
+                FDSet **ret_fds,
+                bool switching_root) {
+
+        /* Serializes the manager into a new stream plus fd set, both handed to the caller with O_CLOEXEC off. */
+        _cleanup_fdset_free_ FDSet *passed_fds = NULL;
+        _cleanup_fclose_ FILE *stream = NULL;
+        int r;
+
+        assert(m);
+        assert(ret_f);
+        assert(ret_fds);
+
+        r = manager_open_serialization(m, &stream);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create serialization file: %m");
+
+        /* Bump the reload counter first, so that nothing is really destructed when we shut down */
+        m->n_reloading++;
+        bus_manager_send_reloading(m, true);
+
+        passed_fds = fdset_new();
+        if (!passed_fds)
+                return log_oom();
+
+        r = manager_serialize(m, stream, passed_fds, switching_root);
+        if (r < 0)
+                return r;
+
+        if (fseeko(stream, 0, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to rewind serialization fd: %m");
+
+        /* Turn off O_CLOEXEC, first on the stream itself and then on every collected fd. */
+        r = fd_cloexec(fileno(stream), false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
+        r = fdset_cloexec(passed_fds, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
+
+        *ret_f = TAKE_PTR(stream);
+        *ret_fds = TAKE_PTR(passed_fds);
+        return 0;
+}
+
+static void bump_file_max_and_nr_open(void) {
+
+        /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large
+         * numbers of file descriptors are no longer a performance problem and their memory is properly
+         * tracked by memcg, thus counting them and limiting them in another two layers of limits is
+         * unnecessary and just complicates things. This function hence turns off 2 of the 4 levels of limits
+         * on file descriptors, and makes RLIMIT_NOFILE (soft + hard) the only ones that really matter. */
+
+#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
+        int r;
+#endif
+
+#if BUMP_PROC_SYS_FS_FILE_MAX
+        /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously things were
+         * different, but the operation would fail silently.) */
+        r = sysctl_write("fs/file-max", LONG_MAX_STR);
+        if (r < 0)
+                log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING,
+                               r, "Failed to bump fs.file-max, ignoring: %m");
+#endif
+
+#if BUMP_PROC_SYS_FS_NR_OPEN
+        int v = INT_MAX;
+
+        /* Argh! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know
+         * what they are. The expression by which the maximum is determined is dependent on the architecture,
+         * and is something we don't really want to copy to userspace, as it is dependent on implementation
+         * details of the kernel. Since the kernel doesn't expose the maximum value to us, we can only try
+         * and hope. Hence, let's start with INT_MAX, and then keep halving the value until we find one that
+         * works. Ugly? Yes, absolutely, but kernel APIs are kernel APIs, so what can we do... 🤯 */
+
+        for (;;) {
+                int k; /* the current value of fs.nr_open, re-read on every attempt */
+
+                v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */
+                if (v < 1024) {
+                        log_warning("Can't bump fs.nr_open, value too small.");
+                        break;
+                }
+
+                k = read_nr_open();
+                if (k < 0) {
+                        log_error_errno(k, "Failed to read fs.nr_open: %m");
+                        break;
+                }
+                if (k >= v) { /* Already larger */
+                        log_debug("Skipping bump, value is already larger.");
+                        break;
+                }
+
+                r = sysctl_writef("fs/nr_open", "%i", v);
+                if (r == -EINVAL) { /* The only error we retry on, with half the value */
+                        log_debug("Couldn't write fs.nr_open as %i, halving it.", v);
+                        v /= 2;
+                        continue;
+                }
+                if (r < 0) { /* Any other error ends the attempt, logged at a level depending on its kind */
+                        log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
+                        break;
+                }
+
+                log_debug("Successfully bumped fs.nr_open to %i", v);
+                break;
+        }
+#endif
+}
+
+static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) {
+        /* Raises RLIMIT_NOFILE (soft and hard) to the value read_nr_open() reports, unless the limits
+         * passed in via saved_rlimit are already at least that high. A failure to set is logged as warning. */
+        int r;
+
+        /* The underlying absolute limit the kernel enforces */
+        rlim_t kernel_max = (rlim_t) read_nr_open();
+
+        /* The limits we want for ourselves; never go below what we inherited. */
+        struct rlimit wanted = {
+                .rlim_cur = MAX(kernel_max, saved_rlimit->rlim_cur),
+                .rlim_max = MAX(kernel_max, saved_rlimit->rlim_max),
+        };
+
+        /* Nothing to do if neither value would grow. */
+        if (wanted.rlim_cur <= saved_rlimit->rlim_cur &&
+            wanted.rlim_max <= saved_rlimit->rlim_max) {
+                log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
+                return 0;
+        }
+
+        /* Go all the way to the maximum the kernel allows, for both the soft and the hard limit. */
+        r = setrlimit_closest(RLIMIT_NOFILE, &wanted);
+        if (r < 0)
+                return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");
+
+        return 0;
+}
+
+static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) {
+        struct rlimit new_rlimit;
+        uint64_t mm;
+        int r;
+
+        /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK
+         * which should normally disable such checks. We need them to implement IPAddressAllow= and
+         * IPAddressDeny=, hence let's bump the value high enough for our user. */
+        /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
+         * must be unsigned, hence this is a given, but let's make this clear here. */
+        assert_cc(RLIM_INFINITY > 0);
+
+        mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of
+                                           * physical RAM. We allow an eighth to be locked by us, just to
+                                           * pick a value. */
+
+        new_rlimit = (struct rlimit) {
+                .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
+                .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
+        };
+
+        /* If the limit is already as high as we need it, don't bump it. Compare like with like (hard
+         * against hard, soft against soft), the same way bump_rlimit_nofile() does. */
+        if (saved_rlimit->rlim_max >= new_rlimit.rlim_max &&
+            saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) {
+                log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
+                return 0;
+        }
+
+        r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
+        if (r < 0)
+                return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
+        return 0;
+}
+
+static void test_usr(void) {
+        /* Warn if /usr looks like a separate file system that is not mounted yet, i.e. if the directory
+         * is empty. Nothing is logged when it has entries or when dir_is_empty() reports an error. */
+
+        bool populated_or_unknown = dir_is_empty("/usr", /* ignore_hidden_or_backup= */ false) <= 0;
+
+        if (!populated_or_unknown)
+                log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
+                            "Some things will probably break (sometimes even silently) in mysterious ways. "
+                            "Consult https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
+}
+
+static int enforce_syscall_archs(Set *archs) {
+        /* Restricts system calls to the configured architectures; returns 0 without doing anything if
+         * seccomp is unavailable. NOTE(review): arg_syscall_archs is applied, 'archs' is unused — confirm. */
+#if HAVE_SECCOMP
+        int r;
+
+        if (!is_seccomp_available())
+                return 0;
+        r = seccomp_restrict_archs(arg_syscall_archs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
+#endif
+        return 0;
+}
+
+static int os_release_status(void) {
+        /* Shows the "Welcome to ..." banner if status output is on, plus an end-of-support warning if
+         * os-release carries a SUPPORT_END value that os_release_support_ended() reports as passed. */
+        _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL,
+                            *ansi_color = NULL, *support_end = NULL;
+        int r;
+
+        r = parse_os_release(NULL,
+                             "PRETTY_NAME", &pretty_name,
+                             "NAME",        &name,
+                             "VERSION",     &version,
+                             "ANSI_COLOR",  &ansi_color,
+                             "SUPPORT_END", &support_end);
+        if (r < 0)
+                return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                                      "Failed to read os-release file, ignoring: %m");
+
+        const char *os_label = os_release_pretty_name(pretty_name, name);
+        /* Only append the version if we have one and pretty_name (which may include it) is unset. */
+        bool append_version = !pretty_name && version;
+
+        if (show_status_on(arg_show_status)) {
+                if (!log_get_show_color())
+                        status_printf(NULL, 0, "\nWelcome to %s!\n", os_label);
+                else
+                        status_printf(NULL, 0,
+                                      "\nWelcome to \x1B[%sm%s\x1B[0m!\n",
+                                      empty_to_null(ansi_color) ?: "1",
+                                      os_label);
+        }
+
+        if (support_end && os_release_support_ended(support_end, /* quiet */ false, NULL) > 0)
+                status_printf(ANSI_HIGHLIGHT_RED "  !!  " ANSI_NORMAL, 0,
+                              "This OS version (%s%s%s) is past its end-of-support date (%s)",
+                              os_label,
+                              append_version ? " version " : "",
+                              append_version ? version : "",
+                              support_end);
+
+        return 0;
+}
+
+static int setup_os_release(RuntimeScope scope) {
+        /* Copies the os-release file (/etc/os-release, else /usr/lib/os-release) into the scope-specific
+         * ".os-release-stage" directory below /run. All failures here are logged at debug level only. */
+        _cleanup_free_ char *target = NULL;
+        const char *source = "/etc/os-release";
+        int r;
+
+        if (access(source, F_OK) < 0) {
+                if (errno != ENOENT)
+                        log_debug_errno(errno, "Failed to check if /etc/os-release exists, ignoring: %m");
+                source = "/usr/lib/os-release";
+        }
+
+        if (scope != RUNTIME_SCOPE_SYSTEM) {
+                if (asprintf(&target, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0)
+                        return log_oom_debug();
+        } else {
+                target = strdup("/run/systemd/propagate/.os-release-stage/os-release");
+                if (!target)
+                        return log_oom_debug();
+        }
+
+        r = mkdir_parents_label(target, 0755);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", target);
+
+        r = copy_file_atomic(source, target, 0644, COPY_MAC_CREATE|COPY_REPLACE);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create %s, ignoring: %m", target);
+        return 0;
+}
+
+static int write_container_id(void) {
+        /* Stores the value of $container in /run/systemd/container. Returns 1 if the file was written,
+         * 0 if the variable is unset or empty, and what log_warning_errno() returns if writing failed. */
+        const char *container = getenv("container");
+        int r = 0;  /* avoid false maybe-uninitialized warning */
+
+        if (isempty(container))
+                return 0;
+
+        WITH_UMASK(0022)
+                r = write_string_file("/run/systemd/container", container, WRITE_STRING_FILE_CREATE);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
+        return 1;
+}
+
+static int bump_unix_max_dgram_qlen(void) {
+        _cleanup_free_ char *line = NULL;
+        unsigned long current;
+        int r;
+
+        /* The kernel default of 16 for the net.unix.max_dgram_qlen sysctl is simply too low, hence raise
+         * it to DEFAULT_UNIX_MAX_DGRAM_QLEN. We do this really really early during boot, so that the new
+         * value is actually applied to all our sockets, including the $NOTIFY_SOCKET one.
+         * Returns 1 if the sysctl was written and 0 if it was already large enough. */
+        r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &line);
+        if (r < 0)
+                return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                                      "Failed to read AF_UNIX datagram queue length, ignoring: %m");
+
+        r = safe_atolu(line, &current);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", line);
+
+        if (current >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
+                return 0; /* nothing to bump */
+
+        r = sysctl_write("net/unix/max_dgram_qlen", STRINGIFY(DEFAULT_UNIX_MAX_DGRAM_QLEN));
+        if (r < 0)
+                return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+                                      "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
+
+        return 1;
+}
+
+static int fixup_environment(void) {
+        /* Adjusts $TERM and $HOME for a PID 1 that runs outside of a container. Returns 0 on success
+         * or when there is nothing to do, the error of proc_cmdline_get_key() if a lookup fails, and
+         * -errno if $TERM cannot be set. */
+        _cleanup_free_ char *cmdline_term = NULL;
+        const char *chosen_term;
+        int r;
+
+        /* Only fix up the environment when we are started as PID 1, and not inside a container, where
+         * we expect the environment to be set correctly already. */
+        if (getpid_cached() != 1 || detect_container() > 0)
+                return 0;
+
+        /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the
+         * backend device used by the console. We try to make a better guess here since some consoles might
+         * not have support for color mode for example.
+         *
+         * However if TERM was configured through the kernel command line then leave it alone. */
+        r = proc_cmdline_get_key("TERM", 0, &cmdline_term);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Not configured via TERM=, hence try the systemd-specific option next. */
+                r = proc_cmdline_get_key("systemd.tty.term.console", 0, &cmdline_term);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Fall back to default_term_for_tty() for the console if the command line named no terminal. */
+        chosen_term = cmdline_term ?: default_term_for_tty("/dev/console");
+        if (setenv("TERM", chosen_term, 1) < 0)
+                return -errno;
+
+        /* The kernels sets HOME=/ for init. Let's undo this. */
+        if (path_equal_ptr(getenv("HOME"), "/"))
+                assert_se(unsetenv("HOME") == 0);
+
+        return 0;
+}
+
+static void redirect_telinit(int argc, char *argv[]) {
+
+        /* Compatibility support for SysV, where calling init as a user is identical to telinit: if we are
+         * not PID 1 but were invoked as "init", replace ourselves with systemctl, passing argv along. */
+
+#if HAVE_SYSV_COMPAT
+        if (getpid_cached() == 1 || !invoked_as(argv, "init"))
+                return;
+
+        execv(SYSTEMCTL_BINARY_PATH, argv);
+
+        /* execv() only returns on failure. */
+        log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m");
+        exit(EXIT_FAILURE);
+#endif
+}
+
+static int become_shutdown(int objective, int retval) {
+        static const char* const table[_MANAGER_OBJECTIVE_MAX] = {
+                [MANAGER_EXIT]     = "exit",
+                [MANAGER_REBOOT]   = "reboot",
+                [MANAGER_POWEROFF] = "poweroff",
+                [MANAGER_HALT]     = "halt",
+                [MANAGER_KEXEC]    = "kexec",
+        };
+        /* table[] above maps each supported objective to the verb handed to the shutdown binary. */
+        char log_level[STRLEN("--log-level=") + DECIMAL_STR_MAX(int)],
+             timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
+             exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)];
+        /* The buffers above hold the formatted --log-level=, --timeout= and --exit-code= arguments. */
+        _cleanup_strv_free_ char **env_block = NULL;
+        usec_t watchdog_timer = 0;
+        int r;
+        /* Execs the shutdown binary, passing on our log settings and retval; returns only if that fails. */
+        assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
+        assert(table[objective]);
+
+        xsprintf(log_level, "--log-level=%d", log_get_max_level());
+        xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec);
+
+        const char* command_line[10] = {
+                SYSTEMD_SHUTDOWN_BINARY_PATH,
+                table[objective],
+                log_level,
+                timeout,
+                /* Note that the last position is a terminator and must contain NULL. */
+        };
+        size_t pos = 4;
+        /* 'pos' is the index of the next free slot in command_line[]; the four above are always set. */
+        assert(command_line[pos-1]);
+        assert(!command_line[pos]);
+
+        switch (log_get_target()) {
+
+        case LOG_TARGET_KMSG:
+        case LOG_TARGET_JOURNAL_OR_KMSG:
+        case LOG_TARGET_SYSLOG_OR_KMSG:
+                command_line[pos++] = "--log-target=kmsg";
+                break;
+
+        case LOG_TARGET_NULL:
+                command_line[pos++] = "--log-target=null";
+                break;
+
+        case LOG_TARGET_CONSOLE:
+        default:
+                command_line[pos++] = "--log-target=console";
+                break;
+        };
+
+        if (log_get_show_color())
+                command_line[pos++] = "--log-color";
+
+        if (log_get_show_location())
+                command_line[pos++] = "--log-location";
+
+        if (log_get_show_time())
+                command_line[pos++] = "--log-time";
+
+        xsprintf(exit_code, "--exit-code=%d", retval);
+        command_line[pos++] = exit_code;
+
+        assert(pos < ELEMENTSOF(command_line)); /* the terminating NULL slot must still be untouched */
+
+        /* The watchdog: */
+
+        if (objective == MANAGER_REBOOT)
+                watchdog_timer = arg_reboot_watchdog;
+        else if (objective == MANAGER_KEXEC)
+                watchdog_timer = arg_kexec_watchdog;
+
+        /* If we reboot or kexec let's set the shutdown watchdog and tell the
+         * shutdown binary to repeatedly ping it.
+         * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */
+        (void) watchdog_setup_pretimeout(0);
+        (void) watchdog_setup_pretimeout_governor(NULL);
+        r = watchdog_setup(watchdog_timer);
+        watchdog_close(r < 0);
+
+        /* The environment block: */
+
+        env_block = strv_copy(environ);
+
+        /* Tell the binary how often to ping, ignore failure */
+        (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer);
+
+        if (arg_watchdog_device)
+                (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device);
+
+        /* Avoid the creation of new processes forked by the kernel; at this
+         * point, we will not listen to the signals anyway */
+        if (detect_container() <= 0)
+                (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);
+
+        execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
+        return -errno; /* Only reached if execve() failed. */
+}
+
+static void initialize_clock(void) {
+        int r;
+        /* Applies the RTC-in-localtime timezone (or seals the kernel's time warp), then the clock epoch. */
+        /* This is called very early on, before we parse the kernel command line or otherwise figure out why
+         * we are running, but only once. */
+
+        if (clock_is_localtime(NULL) > 0) {
+                int min;
+
+                /* The very first call of settimeofday() also does a time warp in the kernel.
+                 *
+                 * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to
+                 * take care of maintaining the RTC and do all adjustments.  This matches the behavior of
+                 * Windows, which leaves the RTC alone if the registry tells that the RTC runs in UTC.
+                 */
+                r = clock_set_timezone(&min);
+                if (r < 0)
+                        log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
+                else
+                        log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
+
+        } else if (!in_initrd())
+                /*
+                 * Do a dummy very first call to seal the kernel's time warp magic.
+                 *
+                 * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with
+                 * LOCAL, but the real system could be set up that way. In such case, we need to delay the
+                 * time-warp or the sealing until we reach the real system.
+                 *
+                 * Do not set the kernel's timezone. The concept of local time cannot be supported reliably,
+                 * the time will jump or be incorrect at every daylight saving time change. All kernel local
+                 * time concepts will be treated as UTC that way.
+                 */
+                (void) clock_reset_timewarp();
+        /* NOTE(review): change_dir is read below even if r < 0; presumably clock_apply_epoch() always sets it. */
+        ClockChangeDirection change_dir;
+        r = clock_apply_epoch(&change_dir);
+        if (r > 0 && change_dir == CLOCK_CHANGE_FORWARD)
+                log_info("System time before build time, advancing clock.");
+        else if (r > 0 && change_dir == CLOCK_CHANGE_BACKWARD)
+                log_info("System time is further ahead than %s after build time, resetting clock to build time.",
+                         FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY));
+        else if (r < 0 && change_dir == CLOCK_CHANGE_FORWARD)
+                log_error_errno(r, "Current system time is before build time, but cannot correct: %m");
+        else if (r < 0 && change_dir == CLOCK_CHANGE_BACKWARD)
+                log_error_errno(r, "Current system time is further ahead %s after build time, but cannot correct: %m",
+                                FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY));
+}
+
+static void apply_clock_update(void) {
+        /* Called later than initialize_clock(), i.e. once configuration files, the kernel command line
+         * and such have been parsed. Sets CLOCK_REALTIME to arg_clock_usec, but only if that is non-zero
+         * and we are PID 1. */
+
+        if (arg_clock_usec == 0 || getpid_cached() != 1)
+                return;
+
+        if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) >= 0) {
+                log_info("Set system clock to %s, as specified on the kernel command line.",
+                         FORMAT_TIMESTAMP(arg_clock_usec));
+                return;
+        }
+
+        log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
+}
+
+static void cmdline_take_random_seed(void) {
+        /* Hands the random seed given on the kernel command line (arg_random_seed) to
+         * random_write_entropy(). Only done as PID 1, and only if a seed was actually specified. */
+        size_t pool_size;
+        int r;
+
+        if (arg_random_seed_size == 0 || getpid_cached() != 1)
+                return;
+
+        assert(arg_random_seed);
+
+        /* Complain if the seed is shorter than the pool size random_pool_size() reports. */
+        pool_size = random_pool_size();
+        if (arg_random_seed_size < pool_size)
+                log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
+                            arg_random_seed_size, pool_size);
+
+        r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
+        if (r >= 0) {
+                log_notice("Successfully credited entropy passed on kernel command line.\n"
+                           "Note that the seed provided this way is accessible to unprivileged programs. "
+                           "This functionality should not be used outside of testing environments.");
+                return;
+        }
+
+        log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
+}
+
+static void initialize_coredump(bool skip_setup) {
+        const struct rlimit no_limit = RLIMIT_MAKE_CONST(RLIM_INFINITY);
+
+        if (getpid_cached() != 1)
+                return;
+
+        /* Don't limit the core dump size, so that handlers honouring the limit (e.g. systemd-coredump) see them. */
+        if (setrlimit(RLIMIT_CORE, &no_limit) < 0)
+                log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");
+
+        /* Still, turn off the core_pattern logic by default, so that nothing is stored until systemd-coredump
+         * is enabled via sysctl. The kernel command line may change this later, e.g. for early boot and initrd. */
+        if (!skip_setup)
+                disable_coredumps();
+}
+
+static void initialize_core_pattern(bool skip_setup) {
+        /* Writes arg_early_core_pattern to /proc/sys/kernel/core_pattern. Does nothing if skip_setup is
+         * set, if no pattern was configured, or if we are not PID 1. A failed write is only logged. */
+        int r;
+
+        if (skip_setup || !arg_early_core_pattern || getpid_cached() != 1)
+                return;
+
+        r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r >= 0)
+                return;
+        log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m",
+                          arg_early_core_pattern);
+}
+
+static void update_cpu_affinity(bool skip_setup) {
+        /* Passes the mask in arg_cpu_affinity to sched_setaffinity(0, ...), unless skip_setup is set or
+         * no mask was configured. A failure is only logged. */
+        _cleanup_free_ char *range = NULL;
+
+        if (skip_setup || !arg_cpu_affinity.set)
+                return;
+
+        assert(arg_cpu_affinity.allocated > 0);
+        range = cpu_set_to_range_string(&arg_cpu_affinity);
+        log_debug("Setting CPU affinity to {%s}.", strnull(range));
+        if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
+                log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m");
+}
+
+static void update_numa_policy(bool skip_setup) {
+        /* Applies arg_numa_policy unless skip_setup is set or its policy type is invalid; failures are only logged. */
+        _cleanup_free_ char *node_list = NULL;
+        int r;
+
+        if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
+                return;
+
+        if (DEBUG_LOGGING) {
+                const char *type_name = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
+                node_list = cpu_set_to_range_string(&arg_numa_policy.nodes);
+                log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(type_name), strnull(node_list));
+        }
+
+        r = apply_numa_policy(&arg_numa_policy);
+        if (r == -EOPNOTSUPP)
+                log_debug_errno(r, "NUMA support not available, ignoring.");
+        else if (r < 0)
+                log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
+}
+
+static void filter_args(
+                const char* dst[],
+                size_t *dst_index,
+                char **src,
+                int argc) {
+
+        assert(dst);
+        assert(dst_index);
+
+        /* Appends src[1] ... src[argc-1] to dst, starting at slot *dst_index (which is advanced for each
+         * entry stored), but leaves out the options listed below. The strings are not copied, dst only
+         * receives the pointers. */
+        for (int i = 1; i < argc; i++) {
+                const char *arg = src[i];
+
+                /* Plain switches, dropped as a single word */
+                if (STR_IN_SET(arg,
+                               "--switched-root",
+                               "--system",
+                               "--user"))
+                        continue;
+
+                /* The serialization fd and target unit designators. We already acted upon the latter and
+                 * have queued appropriate jobs. We don't want to redo all this after reexecution. */
+                if (startswith(arg, "--deserialize=") || startswith(arg, "--unit="))
+                        continue;
+
+                /* The same two options, with their value given as a separate word */
+                if (STR_IN_SET(arg, "--deserialize", "--unit")) {
+                        i++;                            /* Skip the argument too */
+                        continue;
+                }
+
+                /* Seems we have a good old option. Let's pass it over to the new instance. */
+                dst[(*dst_index)++] = arg;
+        }
+}
+
+static void finish_remaining_processes(ManagerObjective objective) {
+        assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
+
+        /* Only switching root and soft reboot get rid of leftover processes. We signal them but do not wait,
+         * so that the SIGCHLD for them can be handled after deserializing. */
+        if (!IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
+                return;
+        broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
+
+        /* A soft reboot additionally sends SIGKILL; cgroups of units with SurviveFinalKillSignal=yes are skipped. */
+        if (objective == MANAGER_SOFT_REBOOT)
+                broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
+}
+
+static int do_reexecute(
+                ManagerObjective objective,
+                int argc,
+                char* argv[],
+                const struct rlimit *saved_rlimit_nofile,   /* Restored before exec, unless rlim_cur is 0 */
+                const struct rlimit *saved_rlimit_memlock,  /* Restored before exec, unless rlim_cur is infinite */
+                FDSet *fds,                                 /* Must be set if switch_root_init is NULL */
+                const char *switch_root_dir,                /* New root directory, or NULL */
+                const char *switch_root_init,               /* Explicit init binary to execute, or NULL */
+                const char **ret_error_message) {           /* Set to a static string on exec failures */
+
+        size_t i, args_size;
+        const char **args;
+        int r;
+
+        assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT));
+        assert(argc >= 0);
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+        assert(ret_error_message);
+
+        /* Executes the next manager/init: our own binary (with serialization) unless an init was configured,
+         * else that init, /sbin/init or /bin/sh. Does not return on success; returns an error otherwise. */
+        if (switch_root_init) {
+                r = chase(switch_root_init, switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to chase configured init %s/%s: %m",
+                                          strempty(switch_root_dir), switch_root_init);
+        } else {
+                r = chase(SYSTEMD_BINARY_PATH, switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to chase our own binary %s/%s: %m",
+                                        strempty(switch_root_dir), SYSTEMD_BINARY_PATH);
+        }
+
+        if (r < 0) {
+                r = chase("/sbin/init", switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to chase %s/sbin/init: %m", strempty(switch_root_dir));
+        }
+
+        /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get
+         * rebooted while we do that */
+        watchdog_close(true);
+
+        /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
+         * the kernel default to its child processes */
+        if (saved_rlimit_nofile->rlim_cur != 0)
+                (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
+        if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
+                (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
+
+        finish_remaining_processes(objective);
+
+        if (!switch_root_dir && objective == MANAGER_SOFT_REBOOT) {
+                /* If no switch root dir is specified, then check if /run/nextroot/ qualifies and use that */
+                r = path_is_os_tree("/run/nextroot");
+                if (r < 0 && r != -ENOENT)
+                        log_debug_errno(r, "Failed to determine if /run/nextroot/ is a valid OS tree, ignoring: %m");
+                else if (r > 0)
+                        switch_root_dir = "/run/nextroot";
+        }
+
+        if (switch_root_dir) {
+                r = switch_root(/* new_root= */ switch_root_dir,
+                                /* old_root_after= */ NULL,
+                                /* flags= */ (objective == MANAGER_SWITCH_ROOT ? SWITCH_ROOT_DESTROY_OLD_ROOT : 0) |
+                                             (objective == MANAGER_SOFT_REBOOT ? 0 : SWITCH_ROOT_RECURSIVE_RUN));
+                if (r < 0)
+                        log_error_errno(r, "Failed to switch root, trying to continue: %m");
+        }
+
+        args_size = argc + 5;
+        args = newa(const char*, args_size);
+
+        if (!switch_root_init) {
+                char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)];
+
+                /* First try to spawn ourselves with the right path, and with full serialization. We do this
+                 * only if the user didn't specify an explicit init to spawn. */
+
+                assert(arg_serialization);
+                assert(fds);
+
+                xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization));
+
+                i = 1;         /* Leave args[0] empty for now. */
+
+                /* Put our stuff first to make sure it always gets parsed in case
+                 * we get weird stuff from the kernel cmdline (like --) */
+                if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
+                        args[i++] = "--switched-root";
+                args[i++] = runtime_scope_cmdline_option_to_string(arg_runtime_scope);
+                args[i++] = sfd;
+
+                filter_args(args, &i, argv, argc);
+
+                args[i++] = NULL;
+
+                assert(i <= args_size);
+
+                /* We want valgrind to print its memory usage summary before reexecution. Valgrind won't do
+                 * this on its own on exec(), but it will do it on exit(). Hence, to ensure we get a summary
+                 * here, fork() off a child, let it exit() cleanly, so that it prints the summary, and wait()
+                 * for it in the parent, before proceeding into the exec(). */
+                valgrind_summary_hack();
+
+                args[0] = SYSTEMD_BINARY_PATH;
+                (void) execv(args[0], (char* const*) args);
+
+                if (objective == MANAGER_REEXECUTE) {
+                        *ret_error_message = "Failed to execute our own binary";
+                        return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]);
+                }
+
+                log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]);
+        }
+
+        /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and
+         * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[],
+         * but let's hope that doesn't matter.) */
+
+        arg_serialization = safe_fclose(arg_serialization);
+        fds = fdset_free(fds);
+
+        /* Reopen the console */
+        (void) make_console_stdio();
+
+        i = 1;         /* Leave args[0] empty for now. */
+        for (int j = 1; j <= argc; j++) /* '<=': also copies argv[argc], presumably the terminating NULL */
+                args[i++] = argv[j];
+        assert(i <= args_size);
+
+        /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */
+        (void) reset_all_signal_handlers();
+        (void) reset_signal_mask();
+        (void) rlimit_nofile_safe();
+
+        if (switch_root_init) {
+                args[0] = switch_root_init;
+                (void) execve(args[0], (char* const*) args, saved_env);
+                log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]);
+        }
+
+        args[0] = "/sbin/init";
+        (void) execv(args[0], (char* const*) args);
+        r = -errno; /* Save errno right away, the calls below might clobber it */
+
+        manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
+                              ANSI_HIGHLIGHT_RED "  !!  " ANSI_NORMAL,
+                              "Failed to execute /sbin/init");
+
+        *ret_error_message = "Failed to execute fallback shell";
+        if (r == -ENOENT) {
+                log_warning("No /sbin/init, trying fallback");
+
+                args[0] = "/bin/sh";
+                args[1] = NULL;
+                (void) execve(args[0], (char* const*) args, saved_env);
+                return log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m");
+        } else
+                return log_error_errno(r, "Failed to execute /sbin/init, giving up: %m");
+}
+
+static int invoke_main_loop(
+                Manager *m,
+                const struct rlimit *saved_rlimit_nofile,
+                const struct rlimit *saved_rlimit_memlock,
+                int *ret_retval,                   /* Return parameters relevant for shutting down */
+                FDSet **ret_fds,                   /* Return parameters for reexecuting */
+                char **ret_switch_root_dir,        /* … (taken over from m->switch_root, or NULL) */
+                char **ret_switch_root_init,       /* … (taken over from m->switch_root_init, or NULL) */
+                const char **ret_error_message) {  /* Set to a static string on the failure paths */
+
+        int r;
+
+        assert(m);
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+        assert(ret_retval);
+        assert(ret_fds);
+        assert(ret_switch_root_dir);
+        assert(ret_switch_root_init);
+        assert(ret_error_message);
+        /* Iterates manager_loop(), handling reloads in place; returns any other objective, or an error. */
+        for (;;) {
+                int objective = manager_loop(m);
+                if (objective < 0) {
+                        *ret_error_message = "Failed to run main loop";
+                        return log_struct_errno(LOG_EMERG, objective,
+                                                LOG_MESSAGE("Failed to run main loop: %m"),
+                                                "MESSAGE_ID=" SD_MESSAGE_CORE_MAINLOOP_FAILED_STR);
+                }
+
+                switch (objective) {
+
+                case MANAGER_RELOAD: {
+                        LogTarget saved_log_target;
+                        int saved_log_level;
+
+                        manager_send_reloading(m);
+
+                        log_info("Reloading...");
+
+                        /* First, save any overridden log level/target, then parse the configuration file,
+                         * which might change the log level to new settings. */
+
+                        saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
+                        saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
+
+                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
+
+                        set_manager_defaults(m);
+                        set_manager_settings(m);
+
+                        update_cpu_affinity(false);
+                        update_numa_policy(false);
+
+                        /* Re-apply the overrides saved above, now that the configuration was parsed again */
+                        if (saved_log_level >= 0)
+                                manager_override_log_level(m, saved_log_level);
+                        if (saved_log_target >= 0)
+                                manager_override_log_target(m, saved_log_target);
+
+                        if (manager_reload(m) < 0)
+                                /* Reloading failed before the point of no return.
+                                 * Let's continue running as if nothing happened. */
+                                m->objective = MANAGER_OK;
+                        else
+                                log_info("Reloading finished in " USEC_FMT " ms.",
+                                         usec_sub_unsigned(now(CLOCK_MONOTONIC), m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].monotonic) / USEC_PER_MSEC);
+
+                        continue; /* A reload never leaves this function: go back into manager_loop() */
+                }
+
+                case MANAGER_REEXECUTE:
+
+                        manager_send_reloading(m); /* From the perspective of the manager calling us this is
+                                                    * pretty much the same as a reload */
+
+                        r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to prepare for reexecution";
+                                return r;
+                        }
+
+                        log_notice("Reexecuting.");
+
+                        *ret_retval = EXIT_SUCCESS;
+                        *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+                        return objective;
+
+                case MANAGER_SWITCH_ROOT:
+
+                        manager_send_reloading(m); /* From the perspective of the manager calling us this is
+                                                    * pretty much the same as a reload */
+
+                        manager_set_switching_root(m, true);
+
+                        if (!m->switch_root_init) {
+                                r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
+                                if (r < 0) {
+                                        *ret_error_message = "Failed to prepare for reexecution";
+                                        return r;
+                                }
+                        } else
+                                *ret_fds = NULL; /* Nothing was serialized, hence there are no fds to pass on */
+
+                        log_notice("Switching root.");
+
+                        *ret_retval = EXIT_SUCCESS;
+
+                        /* Steal the switch root parameters */
+                        *ret_switch_root_dir = TAKE_PTR(m->switch_root);
+                        *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
+
+                        return objective;
+
+                case MANAGER_SOFT_REBOOT:
+                        manager_send_reloading(m);
+                        manager_set_switching_root(m, true);
+
+                        r = prepare_reexecute(m, &arg_serialization, ret_fds, /* switching_root= */ true);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to prepare for reexecution";
+                                return r;
+                        }
+
+                        log_notice("Soft-rebooting.");
+
+                        *ret_retval = EXIT_SUCCESS;
+                        *ret_switch_root_dir = TAKE_PTR(m->switch_root);
+                        *ret_switch_root_init = NULL; /* Unlike for MANAGER_SWITCH_ROOT no init is handed back here */
+
+                        return objective;
+
+                case MANAGER_EXIT:
+                        if (MANAGER_IS_USER(m)) {
+                                log_debug("Exit.");
+
+                                *ret_retval = m->return_value;
+                                *ret_fds = NULL;
+                                *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+                                return objective;
+                        }
+
+                        _fallthrough_; /* Not a user manager: handled like the shutdown objectives below */
+                case MANAGER_REBOOT:
+                case MANAGER_POWEROFF:
+                case MANAGER_HALT:
+                case MANAGER_KEXEC: {
+                        log_notice("Shutting down.");
+
+                        *ret_retval = m->return_value;
+                        *ret_fds = NULL;
+                        *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+                        return objective;
+                }
+
+                default:
+                        assert_not_reached();
+                }
+        }
+}
+
+static void log_execution_mode(bool *ret_first_boot) {
+        bool first_boot = false;
+        int r;
+
+        assert(ret_first_boot);
+        /* Logs the execution mode and reports via *ret_first_boot whether this is the first boot. */
+        switch (arg_runtime_scope) {
+
+        case RUNTIME_SCOPE_SYSTEM: {
+                struct utsname uts;
+                int v;
+
+                log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)",
+                         arg_action == ACTION_TEST ? "test " : "",
+                         systemd_features);
+
+                v = detect_virtualization();
+                if (v > 0)
+                        log_info("Detected virtualization %s.", virtualization_to_string(v));
+
+                v = detect_confidential_virtualization();
+                if (v > 0)
+                        log_info("Detected confidential virtualization %s.", confidential_virtualization_to_string(v));
+
+                log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
+
+                if (in_initrd())
+                        log_info("Running in initrd.");
+                else {
+                        _cleanup_free_ char *id_text = NULL;
+
+                        /* Let's check whether we are in first boot. First, check if an override was
+                         * specified on the kernel command line. If yes, we honour that. */
+
+                        r = proc_cmdline_get_bool("systemd.condition-first-boot", /* flags = */ 0, &first_boot);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to parse systemd.condition-first-boot= kernel command line argument, ignoring: %m");
+
+                        if (r > 0)
+                                log_full(first_boot ? LOG_INFO : LOG_DEBUG,
+                                         "Kernel command line argument says we are %s first boot.",
+                                         first_boot ? "in" : "not in");
+                        else {
+                                /* Second, perform autodetection. We use /etc/machine-id as flag file for
+                                 * this: If it is missing or contains the value "uninitialized", this is the
+                                 * first boot. In other cases, it is not. This allows container managers and
+                                 * installers to provision a couple of files in /etc but still permit the
+                                 * first-boot initialization to occur. If the container manager wants to
+                                 * provision the machine ID it should pass $container_uuid to PID 1. */
+
+                                r = read_one_line_file("/etc/machine-id", &id_text);
+                                if (r < 0 || streq(id_text, "uninitialized")) {
+                                        if (r < 0 && r != -ENOENT)
+                                                log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m");
+
+                                        first_boot = true;
+                                        log_info("Detected first boot.");
+                                } else
+                                        log_debug("Detected initialized system, this is not the first boot.");
+                        }
+                }
+
+                assert_se(uname(&uts) >= 0);
+
+                if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+                        log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. "
+                                    "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION);
+                else
+                        log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION);
+
+                break;
+        }
+
+        case RUNTIME_SCOPE_USER: /* No first boot detection here: first_boot stays false */
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *t = NULL;
+
+                        t = uid_to_name(getuid());
+                        log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
+                                  arg_action == ACTION_TEST ? "test " : "", /* Trailing space, as in system mode */
+                                  getuid(), strna(t), systemd_features);
+                }
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        *ret_first_boot = first_boot;
+}
+
+static int initialize_runtime(
+                bool skip_setup,                        /* If true, the one-time setup steps are skipped */
+                bool first_boot,                        /* Forces a transient machine ID if true */
+                struct rlimit *saved_rlimit_nofile,     /* Handed to bump_rlimit_nofile() */
+                struct rlimit *saved_rlimit_memlock,    /* Handed to bump_rlimit_memlock() */
+                const char **ret_error_message) {       /* Set to a static string on fatal errors */
+        int r;
+
+        assert(ret_error_message);
+
+        /* Sets up various runtime parameters. Returns 0 on success (also when the action is not ACTION_RUN
+         * and nothing is done), otherwise the error of the fatal step that failed. Many steps are conditional:
+         * - Some only apply to --system instances
+         * - Some only apply to --user instances
+         * - Some only apply when we first start up, but not when we reexecute (presumably: !skip_setup)
+         */
+
+        if (arg_action != ACTION_RUN)
+                return 0;
+
+        update_cpu_affinity(skip_setup);
+        update_numa_policy(skip_setup);
+
+        switch (arg_runtime_scope) {
+
+        case RUNTIME_SCOPE_SYSTEM:
+                /* Make sure we leave a core dump without panicking the kernel. */
+                install_crash_handler();
+
+                if (!skip_setup) {
+                        r = mount_cgroup_controllers();
+                        if (r < 0) {
+                                *ret_error_message = "Failed to mount cgroup hierarchies";
+                                return r;
+                        }
+
+                        /* Pull credentials from various sources into a common credential directory (we do
+                         * this here, before setting up the machine ID, so that we can use credential info
+                         * for setting up the machine ID) */
+                        (void) import_credentials();
+
+                        (void) os_release_status();
+                        (void) hostname_setup(true);
+                        /* Force transient machine-id on first boot. */
+                        machine_id_setup(/* root= */ NULL, /* force_transient= */ first_boot, arg_machine_id, /* ret_machine_id */ NULL);
+                        (void) loopback_setup();
+                        bump_unix_max_dgram_qlen();
+                        bump_file_max_and_nr_open();
+                        test_usr();
+                        write_container_id();
+
+                        /* Copy os-release to the propagate directory, so that we update it for services running
+                         * under RootDirectory=/RootImage= when we do a soft reboot. */
+                        r = setup_os_release(RUNTIME_SCOPE_SYSTEM);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
+                }
+
+                r = watchdog_set_device(arg_watchdog_device); /* Done regardless of skip_setup */
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
+
+                break;
+
+        case RUNTIME_SCOPE_USER: {
+                _cleanup_free_ char *p = NULL;
+
+                /* Create the runtime directory and place the inaccessible device nodes there, if we run in
+                 * user mode. In system mode mount_setup() already did that. */
+
+                r = xdg_user_runtime_dir(&p, "/systemd");
+                if (r < 0) {
+                        *ret_error_message = "$XDG_RUNTIME_DIR is not set";
+                        return log_struct_errno(LOG_EMERG, r,
+                                                LOG_MESSAGE("Failed to determine $XDG_RUNTIME_DIR path: %m"),
+                                                "MESSAGE_ID=" SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR);
+                }
+
+                (void) mkdir_p_label(p, 0755);
+                (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
+                r = setup_os_release(RUNTIME_SCOPE_USER);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
+                break;
+        }
+
+        default:
+                assert_not_reached();
+        }
+
+        if (arg_timer_slack_nsec != NSEC_INFINITY) /* NSEC_INFINITY: no timer slack was configured */
+                if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
+                        log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
+
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+
+                if (!cap_test_all(arg_capability_bounding_set)) { /* Only if the set is actually restricted */
+                        r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
+                                return log_struct_errno(LOG_EMERG, r,
+                                                        LOG_MESSAGE("Failed to drop capability bounding set of usermode helpers: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR);
+                        }
+
+                        r = capability_bounding_set_drop(arg_capability_bounding_set, true);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to drop capability bounding set";
+                                return log_struct_errno(LOG_EMERG, r,
+                                                        LOG_MESSAGE("Failed to drop capability bounding set: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR);
+                        }
+                }
+
+                if (arg_no_new_privs) {
+                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+                                *ret_error_message = "Failed to disable new privileges";
+                                return log_struct_errno(LOG_EMERG, errno,
+                                                        LOG_MESSAGE("Failed to disable new privileges: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR);
+                        }
+                }
+        }
+
+        if (arg_syscall_archs) {
+                r = enforce_syscall_archs(arg_syscall_archs);
+                if (r < 0) {
+                        *ret_error_message = "Failed to set syscall architectures";
+                        return r;
+                }
+        }
+
+        r = make_reaper_process(true);
+        if (r < 0)
+                log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");
+
+        /* Bump up RLIMIT_NOFILE for systemd itself */
+        (void) bump_rlimit_nofile(saved_rlimit_nofile);
+        (void) bump_rlimit_memlock(saved_rlimit_memlock);
+
+        return 0;
+}
+
+static int do_queue_default_job(
+                Manager *m,
+                const char **ret_error_message) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        const char *name;
+        Unit *target;
+        Job *job;
+        int r;
+
+        /* Loads the unit to boot into, queues a start job for it and records the job ID in the manager.
+         * Returns 0 on success. On failure the error is returned and *ret_error_message is set. An explicitly
+         * configured unit always wins; otherwise the pick depends on whether we run in the initrd. */
+        name = arg_default_unit ?: (in_initrd() ? SPECIAL_INITRD_TARGET : SPECIAL_DEFAULT_TARGET);
+
+        log_debug("Activating default unit: %s", name);
+
+        r = manager_load_startable_unit_or_warn(m, name, NULL, &target);
+        if (r < 0 && in_initrd() && !arg_default_unit) {
+                /* Nothing was configured explicitly and the initrd target cannot be used: retry with
+                 * default.target, which we used to always use by default. */
+                log_info("Falling back to " SPECIAL_DEFAULT_TARGET ".");
+                r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
+        }
+
+        if (r < 0) {
+                /* Last resort: the rescue target */
+                log_info("Falling back to " SPECIAL_RESCUE_TARGET ".");
+                r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
+        }
+        if (r < 0) {
+                if (r == -ERFKILL)
+                        *ret_error_message = SPECIAL_RESCUE_TARGET " masked";
+                else
+                        *ret_error_message = "Failed to load " SPECIAL_RESCUE_TARGET;
+                return r;
+        }
+
+        assert(target->load_state == UNIT_LOADED);
+
+        /* First try to isolate the target. If that yields -EPERM, simply start it instead. */
+        r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job);
+        if (r < 0 && r != -EPERM) {
+                *ret_error_message = "Failed to isolate default target";
+                return log_struct_errno(LOG_EMERG, r,
+                                        LOG_MESSAGE("Failed to isolate default target: %s", bus_error_message(&error, r)),
+                                        "MESSAGE_ID=" SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR);
+        }
+
+        if (r == -EPERM) {
+                log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
+                sd_bus_error_free(&error);
+
+                r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job);
+                if (r < 0) {
+                        *ret_error_message = "Failed to start default target";
+                        return log_struct_errno(LOG_EMERG, r,
+                                                LOG_MESSAGE("Failed to start default target: %s", bus_error_message(&error, r)),
+                                                "MESSAGE_ID=" SD_MESSAGE_CORE_START_TARGET_FAILED_STR);
+                }
+        } else
+                log_info("Queued %s job for default target %s.",
+                         job_type_to_string(job->type),
+                         unit_status_string(job->unit, NULL));
+
+        m->default_unit_job_id = job->id;
+
+        return 0;
+}
+
+static void save_rlimits(struct rlimit *saved_rlimit_nofile, struct rlimit *saved_rlimit_memlock) {
+        /* Reads our current RLIMIT_NOFILE and RLIMIT_MEMLOCK into the two supplied buffers. A limit that
+         * cannot be read is merely warned about. */
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+
+        if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
+                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
+
+        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
+                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
+}
+
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
+        struct rlimit *copy;
+
+        /* Sets up the default RLIMIT_NOFILE handed to forked processes, based on the original kernel
+         * setting we were started with. A default that is configured already is left untouched, and
+         * running out of memory is logged and otherwise ignored. */
+
+        if (arg_defaults.rlimit[RLIMIT_NOFILE])
+                return;
+
+        copy = newdup(struct rlimit, saved_rlimit_nofile, 1);
+        if (!copy) {
+                log_oom();
+                return;
+        }
+
+        /* Soft limit: if for some reason we were invoked with a soft limit above 1024 (which should never
+         * happen!, but who knows what we get passed in from pam_limit when invoked as --user instance),
+         * then lower what we pass on to not confuse our children. */
+        copy->rlim_cur = MIN(copy->rlim_cur, (rlim_t) FD_SETSIZE);
+
+        /* Hard limit: for system services bump it to a substantially higher value. The default hard limit
+         * current kernels set is pretty low (4K), mostly for historical reasons. According to kernel
+         * developers, the fd handling in recent kernels has been optimized substantially enough, so that
+         * we can bump the limit now, without paying too high a price in memory or performance.
+         *
+         * Note that the soft limit is intentionally not bumped. select() works the way it works, and
+         * chokes on fds >= 1024; with a globally raised soft limit unsuspecting programs might
+         * accidentally get fds higher than what they can process using select(). Programs written with
+         * the select() problem in mind (which thus use poll()/epoll instead, the way everybody should)
+         * can explicitly opt into high fds by bumping their soft limit beyond 1024, up to the hard limit
+         * we pass. */
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                /* The underlying absolute limit the kernel enforces is the ceiling for our choice */
+                rlim_t ceiling = (rlim_t) read_nr_open();
+                rlim_t wanted = MAX(copy->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE);
+
+                copy->rlim_max = MIN(ceiling, wanted);
+        }
+
+        /* From here on the copy is referenced by arg_defaults */
+        arg_defaults.rlimit[RLIMIT_NOFILE] = copy;
+}
+
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
+        const rlim_t minimum = (rlim_t) DEFAULT_RLIMIT_MEMLOCK;
+        struct rlimit *limit;
+
+        /* Unless configured explicitly, pass the RLIMIT_MEMLOCK we were invoked with on to services. */
+        if (arg_defaults.rlimit[RLIMIT_MEMLOCK])
+                return;
+
+        limit = newdup(struct rlimit, saved_rlimit_memlock, 1);
+        if (!limit) {
+                log_oom();
+                return;
+        }
+
+        /* For the system instance raise both the soft and the hard limit to at least 8M, also on old
+         * kernels and in containers (8M is the kernel default for this since kernel 5.16). */
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                limit->rlim_max = MAX(limit->rlim_max, minimum);
+                limit->rlim_cur = MAX(limit->rlim_cur, minimum);
+        }
+
+        arg_defaults.rlimit[RLIMIT_MEMLOCK] = limit;
+}
+
+static void setenv_manager_environment(void) {
+        int r;
+
+        /* Export arg_manager_environment into our own environment; failing entries are logged and skipped. */
+        STRV_FOREACH(p, arg_manager_environment) {
+                log_debug("Setting '%s' in our own environment.", *p);
+                r = putenv_dup(*p, true);
+                if (r < 0) /* the error is carried by the return value; errno may be stale or unset here */
+                        log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p);
+        }
+}
+
+static void reset_arguments(void) {
+        /* Frees/resets arg_* variables to their built-in defaults, with a few exceptions commented below. */
+        /* Called by parse_configuration() before any parsing, and by main() right before it returns. */
+        arg_default_unit = mfree(arg_default_unit);
+
+        /* arg_runtime_scope — ignore */
+
+        arg_dump_core = true;
+        arg_crash_chvt = -1;
+        arg_crash_shell = false;
+        arg_crash_reboot = false;
+        arg_confirm_spawn = mfree(arg_confirm_spawn);
+        arg_show_status = _SHOW_STATUS_INVALID; /* "unset"; parse_configuration() turns this into SHOW_STATUS_YES */
+        arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
+        arg_switched_root = false;
+        arg_pager_flags = 0;
+        arg_service_watchdogs = true;
+
+        unit_defaults_done(&arg_defaults); /* presumably releases the previous defaults before re-initializing */
+        unit_defaults_init(&arg_defaults, arg_runtime_scope);
+
+        arg_runtime_watchdog = 0;
+        arg_reboot_watchdog = 10 * USEC_PER_MINUTE; /* the only watchdog timeout with a non-zero default */
+        arg_kexec_watchdog = 0;
+        arg_pretimeout_watchdog = 0;
+        arg_early_core_pattern = mfree(arg_early_core_pattern);
+        arg_watchdog_device = mfree(arg_watchdog_device);
+        arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
+
+        arg_default_environment = strv_free(arg_default_environment);
+        arg_manager_environment = strv_free(arg_manager_environment);
+
+        arg_capability_bounding_set = CAP_MASK_UNSET;
+        arg_no_new_privs = false;
+        arg_timer_slack_nsec = NSEC_INFINITY;
+
+        arg_syscall_archs = set_free(arg_syscall_archs);
+
+        /* arg_serialization — ignore */
+
+        arg_machine_id = (sd_id128_t) {}; /* all-zero ID */
+        arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
+
+        cpu_set_reset(&arg_cpu_affinity);
+        numa_policy_reset(&arg_numa_policy);
+
+        arg_random_seed = mfree(arg_random_seed);
+        arg_random_seed_size = 0;
+        arg_clock_usec = 0;
+
+        arg_reload_limit_interval_sec = 0;
+        arg_reload_limit_burst = 0;
+}
+
+static void determine_default_oom_score_adjust(void) {
+        int ours, theirs, r;
+
+        /* Picks a default OOM score adjustment for our services that is slightly higher than our own
+         * (by 100, capped at OOM_SCORE_ADJ_MAX). To be conservative this is done only if nothing was
+         * configured explicitly and we don't run as root (i.e. only if we are run in user mode, for an
+         * unprivileged user). */
+        if (arg_defaults.oom_score_adjust_set || getuid() == 0)
+                return;
+
+        r = get_oom_score_adjust(&ours);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m");
+                return;
+        }
+
+        assert_cc(100 <= OOM_SCORE_ADJ_MAX);
+        theirs = ours >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : ours + 100;
+
+        /* We are at the maximum already, there is nothing left to raise. */
+        if (theirs == ours)
+                return;
+
+        arg_defaults.oom_score_adjust = theirs;
+        arg_defaults.oom_score_adjust_set = true;
+}
+
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+                               const struct rlimit *saved_rlimit_memlock) {
+        int r;
+        /* Rebuilds all arg_* settings from scratch; sources that fail to parse are logged and skipped. */
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+
+        /* Assign configuration defaults */
+        reset_arguments();
+
+        r = parse_config_file();
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse config file, ignoring: %m");
+
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) { /* only the system instance reads the kernel command line */
+                r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+        }
+
+        /* Initialize some default rlimits for services if they haven't been configured */
+        fallback_rlimit_nofile(saved_rlimit_nofile);
+        fallback_rlimit_memlock(saved_rlimit_memlock);
+
+        /* Note that this also parses bits from the kernel command line, including "debug". */
+        log_parse_environment();
+
+        /* Initialize the show status setting if it hasn't been set explicitly yet */
+        if (arg_show_status == _SHOW_STATUS_INVALID)
+                arg_show_status = SHOW_STATUS_YES;
+
+        /* Slightly raise the OOM score for our services if we are running for unprivileged users. */
+        determine_default_oom_score_adjust();
+
+        /* Push variables into the manager environment block */
+        setenv_manager_environment();
+
+        /* Parse log environment variables again to take into account any new environment variables. */
+        log_parse_environment();
+
+        return 0; /* never fails: all errors above are only logged */
+}
+
+static int safety_checks(void) {
+        const bool pid1 = getpid_cached() == 1, run = arg_action == ACTION_RUN;
+
+        /* Refuses combinations of PID, runtime scope and requested action we cannot sensibly operate in.
+         * Returns 0 if everything checks out, otherwise logs the reason and returns the (negative) error. */
+
+        if (pid1 && !run)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Unsupported execution mode while PID 1.");
+
+        if (pid1 && arg_runtime_scope == RUNTIME_SCOPE_USER)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Can't run --user mode as PID 1.");
+
+        if (run && !pid1 && arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Can't run system mode unless PID 1.");
+
+        if (arg_action == ACTION_TEST && geteuid() == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Don't run test mode as root.");
+
+        switch (arg_runtime_scope) {
+
+        case RUNTIME_SCOPE_USER:
+                /* Running as user instance needs a systemd-booted system and $XDG_RUNTIME_DIR. */
+                if (!run)
+                        break;
+
+                if (sd_booted() <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Trying to run as user instance, but the system has not been booted with systemd.");
+
+                if (!getenv("XDG_RUNTIME_DIR"))
+                        return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
+                                               "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
+
+                break;
+
+        case RUNTIME_SCOPE_SYSTEM:
+                /* Running as system instance is refused from within a chroot() environment. */
+                if (run && running_in_chroot() > 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Cannot be run in a chroot() environment.");
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+static int initialize_security(
+                bool *loaded_policy, dual_timestamp *security_start_timestamp,
+                dual_timestamp *security_finish_timestamp, const char **ret_error_message) {
+        const char *failure;
+        int r;
+
+        /* Sets up SELinux, SMACK, AppArmor and IMA, in this order, timestamping start and finish. Stops at
+         * the first failing step: its error is returned and a static description put in *ret_error_message. */
+        assert(loaded_policy);
+        assert(security_start_timestamp);
+        assert(security_finish_timestamp);
+        assert(ret_error_message);
+
+        dual_timestamp_now(security_start_timestamp);
+
+        failure = "Failed to load SELinux policy";
+        r = mac_selinux_setup(loaded_policy);
+        if (r < 0)
+                goto fail;
+
+        failure = "Failed to load SMACK policy";
+        r = mac_smack_setup(loaded_policy);
+        if (r < 0)
+                goto fail;
+
+        failure = "Failed to load AppArmor policy";
+        r = mac_apparmor_setup();
+        if (r < 0)
+                goto fail;
+
+        failure = "Failed to load IMA policy";
+        r = ima_setup();
+        if (r < 0)
+                goto fail;
+
+        dual_timestamp_now(security_finish_timestamp);
+        return 0;
+
+fail:
+        *ret_error_message = failure;
+        return r;
+}
+
+static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
+        int r;
+
+        assert(ret_fds);
+        assert(ret_error_message);
+
+        /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
+         * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies
+         * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe
+         * mechanism to distinguish passed in fds from our own.
+         *
+         * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our
+         * process behind our back. We should not take possession of that (and then accidentally close
+         * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */
+        r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds);
+        if (r >= 0) {
+                /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we
+                 * didn't pick it up here */
+                assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
+                return 0;
+        }
+
+        *ret_error_message = "Failed to allocate fd set";
+        return log_struct_errno(LOG_EMERG, r,
+                                LOG_MESSAGE("Failed to allocate fd set: %m"),
+                                "MESSAGE_ID=" SD_MESSAGE_CORE_FD_SET_FAILED_STR);
+}
+
+static void setup_console_terminal(bool skip_setup) {
+        /* Gives up the controlling terminal of the system instance and, unless the setup is to be
+         * skipped, resets the console when we are PID 1. Any other runtime scope is left alone. */
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                /* Become a session leader if we aren't one yet. */
+                (void) setsid();
+
+                /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't
+                 * have a controlling tty. */
+                (void) release_terminal();
+
+                /* Reset the console, but only if this is really init and we are freshly booted */
+                if (getpid_cached() == 1 && !skip_setup)
+                        (void) console_setup();
+        }
+}
+
+static bool early_skip_setup_check(int argc, char *argv[]) {
+        bool reexecuting = false;
+
+        /* Quick peek at the command line (the full parsing happens much later) to tell a reexecution
+         * from a normal bootup. Note that if we have switched root, all the special setup things are
+         * done anyway, even though deserialization takes place in that case, too. */
+        for (int i = 1; i < argc; i++) {
+                const char *arg = argv[i];
+                if (streq(arg, "--switched-root"))
+                        return false; /* If we switched root, don't skip the setup. */
+                if (streq(arg, "--deserialize") || startswith(arg, "--deserialize="))
+                        reexecuting = true;
+        }
+        return reexecuting; /* When we are deserializing, then we are reexecuting, hence avoid the extensive setup */
+}
+
+static int save_env(void) {
+        /* Replaces saved_env with a fresh copy of the current environment block. Returns -ENOMEM, without
+         * touching saved_env, if that copy cannot be allocated, and 0 otherwise. */
+        char **copy = strv_copy(environ);
+
+        if (!copy)
+                return -ENOMEM;
+        strv_free_and_replace(saved_env, copy);
+        return 0;
+}
+
+int main(int argc, char *argv[]) {
+        dual_timestamp
+                initrd_timestamp = DUAL_TIMESTAMP_NULL,
+                userspace_timestamp = DUAL_TIMESTAMP_NULL,
+                kernel_timestamp = DUAL_TIMESTAMP_NULL,
+                security_start_timestamp = DUAL_TIMESTAMP_NULL,
+                security_finish_timestamp = DUAL_TIMESTAMP_NULL;
+        struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
+                saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
+                                                                          * in. Note we use different values
+                                                                          * for the two that indicate whether
+                                                                          * these fields are initialized! */
+        bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
+        char *switch_root_dir = NULL, *switch_root_init = NULL;
+        usec_t before_startup, after_startup;
+        static char systemd[] = "systemd";
+        const char *error_message = NULL;
+        int r, retval = EXIT_FAILURE;
+        Manager *m = NULL;
+        FDSet *fds = NULL;
+        /* Flow: early PID 1/user setup, configuration and argv parsing, manager startup, main loop, cleanup. */
+        assert_se(argc > 0 && !isempty(argv[0]));
+
+        /* SysV compatibility: redirect init → telinit */
+        redirect_telinit(argc, argv);
+
+        /* Take timestamps early on */
+        dual_timestamp_from_monotonic(&kernel_timestamp, 0);
+        dual_timestamp_now(&userspace_timestamp);
+
+        /* Figure out whether we need to initialize the system, or if we already did that because we are
+         * reexecuting. */
+        skip_setup = early_skip_setup_check(argc, argv);
+
+        /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent
+         * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd
+         * right-away. */
+        program_invocation_short_name = systemd;
+        (void) prctl(PR_SET_NAME, systemd);
+
+        /* Save the original command line */
+        save_argc_argv(argc, argv);
+
+        /* Save the original environment as we might need to restore it if we're requested to execute another
+         * system manager later. */
+        r = save_env();
+        if (r < 0) {
+                error_message = "Failed to copy environment block";
+                goto finish;
+        }
+
+        /* Make sure that if the user says "syslog" we actually log to the journal. */
+        log_set_upgrade_syslog_to_journal(true);
+
+        if (getpid_cached() == 1) {
+                /* When we run as PID 1 force system mode */
+                arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+
+                /* Disable the umask logic */
+                umask(0);
+
+                /* Make sure that at least initially we do not ever log to journald/syslogd, because it might
+                 * not be activated yet (even though the log socket for it exists). */
+                log_set_prohibit_ipc(true);
+
+                /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This
+                 * is important so that we never end up logging to any foreign stderr, for example if we have
+                 * to log in a child process right before execve()'ing the actual binary, at a point in time
+                 * where socket activation stderr/stdout are already set up. */
+                log_set_always_reopen_console(true);
+
+                if (detect_container() <= 0) {
+
+                        /* Running outside of a container as PID 1 */
+                        log_set_target_and_open(LOG_TARGET_KMSG);
+
+                        if (in_initrd())
+                                initrd_timestamp = userspace_timestamp;
+
+                        if (!skip_setup) {
+                                r = mount_setup_early();
+                                if (r < 0) {
+                                        error_message = "Failed to mount early API filesystems";
+                                        goto finish;
+                                }
+                        }
+
+                        /* We might have just mounted /proc, so let's try to parse the kernel
+                         * command line log arguments immediately. */
+                        log_parse_environment();
+
+                        /* Let's open the log backend a second time, in case the first time didn't
+                         * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
+                         * available, and it previously wasn't. */
+                        log_open();
+
+                        if (!skip_setup) {
+                                disable_printk_ratelimit();
+
+                                r = initialize_security(
+                                                &loaded_policy,
+                                                &security_start_timestamp,
+                                                &security_finish_timestamp,
+                                                &error_message);
+                                if (r < 0)
+                                        goto finish;
+                        }
+
+                        if (mac_init() < 0) {
+                                error_message = "Failed to initialize MAC support";
+                                goto finish;
+                        }
+
+                        if (!skip_setup)
+                                initialize_clock();
+
+                        /* Set the default for later on, but don't actually open the logs like this for
+                         * now. Note that if we are transitioning from the initrd there might still be
+                         * journal fd open, and we shouldn't attempt opening that before we parsed
+                         * /proc/cmdline which might redirect output elsewhere. */
+                        log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
+
+                } else {
+                        /* Running inside a container, as PID 1 */
+                        log_set_target_and_open(LOG_TARGET_CONSOLE);
+
+                        /* For later on, see above... */
+                        log_set_target(LOG_TARGET_JOURNAL);
+
+                        /* clear the kernel timestamp, because we are in a container */
+                        kernel_timestamp = DUAL_TIMESTAMP_NULL;
+                }
+
+                initialize_coredump(skip_setup);
+
+                r = fixup_environment();
+                if (r < 0) {
+                        log_struct_errno(LOG_EMERG, r,
+                                         LOG_MESSAGE("Failed to fix up PID 1 environment: %m"),
+                                         "MESSAGE_ID=" SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR);
+                        error_message = "Failed to fix up PID1 environment";
+                        goto finish;
+                }
+
+                /* Try to figure out if we can use colors with the console. No need to do that for user
+                 * instances since they never log into the console. */
+                log_show_color(colors_enabled());
+
+                r = make_null_stdio();
+                if (r < 0)
+                        log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
+
+                /* Load the kernel modules early. */
+                if (!skip_setup)
+                        (void) kmod_setup();
+
+                /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd are available. */
+                r = mount_setup(loaded_policy, skip_setup);
+                if (r < 0) {
+                        error_message = "Failed to mount API filesystems";
+                        goto finish;
+                }
+
+                /* The efivarfs is now mounted, let's lock down the system token. */
+                lock_down_efi_variables();
+
+                /* Cache command-line options passed from EFI variables */
+                if (!skip_setup)
+                        (void) cache_efi_options_variable();
+        } else {
+                /* Running as user instance */
+                arg_runtime_scope = RUNTIME_SCOPE_USER;
+                log_set_always_reopen_console(true);
+                log_set_target_and_open(LOG_TARGET_AUTO);
+
+                /* clear the kernel timestamp, because we are not PID 1 */
+                kernel_timestamp = DUAL_TIMESTAMP_NULL;
+
+                /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does
+                 * not affect the permitted and effective sets which are important for the manager itself to
+                 * operate. */
+                capability_ambient_set_apply(0, /* also_inherit= */ false);
+
+                if (mac_init() < 0) {
+                        error_message = "Failed to initialize MAC support";
+                        goto finish;
+                }
+        }
+
+        /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
+         * transitioning from the initrd to the main systemd or suchlike. */
+        save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
+        /* Reset all signal handlers. */
+        (void) reset_all_signal_handlers();
+        (void) ignore_signals(SIGNALS_IGNORE);
+
+        (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
+        r = parse_argv(argc, argv);
+        if (r < 0) {
+                error_message = "Failed to parse command line arguments";
+                goto finish;
+        }
+
+        r = safety_checks();
+        if (r < 0)
+                goto finish;
+
+        if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
+                pager_open(arg_pager_flags);
+
+        if (arg_action != ACTION_RUN)
+                skip_setup = true; /* the setup steps guarded by skip_setup are only for an actual run */
+
+        if (arg_action == ACTION_HELP) {
+                retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+                goto finish;
+        } else if (arg_action == ACTION_VERSION) {
+                retval = version();
+                goto finish;
+        } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
+                unit_dump_config_items(stdout);
+                retval = EXIT_SUCCESS;
+                goto finish;
+        } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
+                dump_bus_properties(stdout);
+                retval = EXIT_SUCCESS;
+                goto finish;
+        } else if (arg_action == ACTION_BUS_INTROSPECT) {
+                r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
+                retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+                goto finish;
+        }
+
+        assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
+
+        /* Move out of the way, so that we won't block unmounts */
+        assert_se(chdir("/") == 0);
+
+        if (arg_action == ACTION_RUN) {
+                if (!skip_setup) {
+                        /* Apply the systemd.clock_usec= kernel command line switch */
+                        apply_clock_update();
+
+                        /* Apply random seed from kernel command line */
+                        cmdline_take_random_seed();
+                }
+
+                /* A core pattern might have been specified via the cmdline.  */
+                initialize_core_pattern(skip_setup);
+
+                /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
+                log_close();
+
+                /* Remember open file descriptors for later deserialization */
+                r = collect_fds(&fds, &error_message);
+                if (r < 0)
+                        goto finish;
+
+                /* Give up any control of the console, but make sure it's initialized. */
+                setup_console_terminal(skip_setup);
+
+                /* Open the logging devices, if possible and necessary */
+                log_open();
+        }
+
+        log_execution_mode(&first_boot);
+
+        r = initialize_runtime(skip_setup,
+                               first_boot,
+                               &saved_rlimit_nofile,
+                               &saved_rlimit_memlock,
+                               &error_message);
+        if (r < 0)
+                goto finish;
+
+        r = manager_new(arg_runtime_scope,
+                        arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
+                        &m);
+        if (r < 0) {
+                log_struct_errno(LOG_EMERG, r,
+                                 LOG_MESSAGE("Failed to allocate manager object: %m"),
+                                 "MESSAGE_ID=" SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR);
+                error_message = "Failed to allocate manager object";
+                goto finish;
+        }
+
+        m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
+        m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
+        m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
+        m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
+        m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
+
+        set_manager_defaults(m);
+        set_manager_settings(m);
+        manager_set_first_boot(m, first_boot);
+        manager_set_switching_root(m, arg_switched_root);
+
+        /* Remember whether we should queue the default job */
+        queue_default_job = !arg_serialization || arg_switched_root; /* nothing to deserialize, or root was switched */
+
+        before_startup = now(CLOCK_MONOTONIC);
+
+        r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
+        if (r < 0) {
+                error_message = "Failed to start up manager";
+                goto finish;
+        }
+
+        /* This will close all file descriptors that were opened, but not claimed by any unit. */
+        fds = fdset_free(fds);
+        arg_serialization = safe_fclose(arg_serialization);
+
+        if (queue_default_job) {
+                r = do_queue_default_job(m, &error_message);
+                if (r < 0)
+                        goto finish;
+        }
+
+        after_startup = now(CLOCK_MONOTONIC);
+
+        log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
+                 "Loaded units and determined initial transaction in %s.",
+                 FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC));
+
+        if (arg_action == ACTION_TEST) {
+                manager_test_summary(m);
+                retval = EXIT_SUCCESS;
+                goto finish;
+        }
+
+        r = invoke_main_loop(m,
+                             &saved_rlimit_nofile,
+                             &saved_rlimit_memlock,
+                             &retval,
+                             &fds,
+                             &switch_root_dir,
+                             &switch_root_init,
+                             &error_message);
+        assert(r < 0 || IN_SET(r, MANAGER_EXIT,          /* MANAGER_OK is not expected here. */
+                                  MANAGER_RELOAD,
+                                  MANAGER_REEXECUTE,
+                                  MANAGER_REBOOT,
+                                  MANAGER_SOFT_REBOOT,
+                                  MANAGER_POWEROFF,
+                                  MANAGER_HALT,
+                                  MANAGER_KEXEC,
+                                  MANAGER_SWITCH_ROOT));
+
+finish:
+        pager_close();
+
+        if (m) {
+                arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
+                arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
+                m = manager_free(m);
+        }
+
+        mac_selinux_finish();
+
+        if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
+                r = do_reexecute(r,
+                                 argc, argv,
+                                 &saved_rlimit_nofile,
+                                 &saved_rlimit_memlock,
+                                 fds,
+                                 switch_root_dir,
+                                 switch_root_init,
+                                 &error_message); /* This only returns if reexecution failed */
+
+        arg_serialization = safe_fclose(arg_serialization);
+        fds = fdset_free(fds);
+
+        saved_env = strv_free(saved_env);
+
+#if HAVE_VALGRIND_VALGRIND_H
+        /* If we are PID 1 and running under valgrind, then let's exit
+         * here explicitly. valgrind will only generate nice output on
+         * exit(), not on exec(), hence let's do the former not the
+         * latter here. */
+        if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
+                /* Cleanup watchdog_device strings for valgrind. We need them
+                 * in become_shutdown() so normally we cannot free them yet. */
+                watchdog_free_device();
+                reset_arguments();
+                return retval;
+        }
+#endif
+
+#if HAS_FEATURE_ADDRESS_SANITIZER
+        /* At this stage we most likely don't have stdout/stderr open, so the following
+         * LSan check would not print any actionable information and would just crash
+         * PID 1. To make this a bit more helpful, let's try to open /dev/console,
+         * and if we succeed redirect LSan's report there. */
+        if (getpid_cached() == 1) {
+                _cleanup_close_ int tty_fd = -EBADF;
+
+                tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+                if (tty_fd >= 0)
+                        __sanitizer_set_report_fd((void*) (intptr_t) tty_fd);
+
+                __lsan_do_leak_check();
+        }
+#endif
+
+        if (r < 0)
+                (void) sd_notifyf(0, "ERRNO=%i", -r);
+
+        /* Try to invoke the shutdown binary unless we already failed.
+         * If we failed above, we want to freeze after finishing cleanup. */
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
+            IN_SET(r, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)) {
+                r = become_shutdown(r, retval);
+                log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
+                error_message = "Failed to execute shutdown binary";
+        }
+
+        /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with
+         * a mechanism to pick up systemd's exit status in the VM. */
+        (void) sd_notifyf(0, "EXIT_STATUS=%i", retval);
+
+        watchdog_free_device();
+        arg_watchdog_device = mfree(arg_watchdog_device);
+
+        if (getpid_cached() == 1) {
+                if (error_message)
+                        manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
+                                              ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
+                                              "%s.", error_message);
+                freeze_or_exit_or_reboot();
+        }
+
+        reset_arguments();
+        return retval;
+}
diff --git a/src/core/main.h b/src/core/main.h
new file mode 100644
index 0000000..b12a1cc
--- /dev/null
+++ b/src/core/main.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+/* Globals shared with other translation units; presumably defined in main.c. */
+extern bool arg_dump_core;
+extern int arg_crash_chvt;
+extern bool arg_crash_shell;
+extern bool arg_crash_reboot;
diff --git a/src/core/manager-dump.c b/src/core/manager-dump.c
new file mode 100644
index 0000000..6c32d78
--- /dev/null
+++ b/src/core/manager-dump.c
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "build.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hashmap.h"
+#include "manager-dump.h"
+#include "memstream-util.h"
+#include "unit-serialize.h"
+#include "version.h"
+
+/* Writes a dump of the manager's jobs to f, one job_dump() call per job, each with the given prefix.
+ * If patterns are given, only jobs whose unit id matches one of them are included. */
+void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix) {
+        Job *job;
+
+        assert(s);
+        assert(f);
+
+        HASHMAP_FOREACH(job, s->jobs) {
+                /* Filter on the id of the unit the job belongs to */
+                if (strv_fnmatch_or_empty(patterns, job->unit->id, FNM_NOESCAPE))
+                        job_dump(job, f, prefix);
+        }
+}
+
+/* Same output as manager_dump_jobs(), but collected in a memory stream and handed to
+ * memstream_finalize() to fill in *ret. Returns -errno if the stream cannot be created. */
+int manager_get_dump_jobs_string(Manager *m, char **patterns, const char *prefix, char **ret) {
+        _cleanup_(memstream_done) MemStream stream = {};
+        FILE *out;
+
+        assert(m);
+        assert(ret);
+
+        out = memstream_init(&stream);
+        if (!out)
+                return -errno;
+        manager_dump_jobs(m, out, patterns, prefix);
+        return memstream_finalize(&stream, ret, NULL);
+}
+
+/* Writes a dump of the manager's units to f via unit_dump(), each with the given prefix. If patterns
+ * are given, only units whose id matches one of them are included. */
+void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix) {
+        const char *key;
+        Unit *unit;
+
+        assert(s);
+        assert(f);
+
+        HASHMAP_FOREACH_KEY(unit, key, s->units) {
+                /* Pointer comparison: only handle a unit at the entry keyed by its own id string, and
+                 * skip any other key referring to it (presumably alias names). */
+                if (unit->id == key &&
+                    strv_fnmatch_or_empty(patterns, unit->id, FNM_NOESCAPE))
+                        unit_dump(unit, f, prefix);
+        }
+}
+
+static void manager_dump_header(Manager *m, FILE *f, const char *prefix) {
+        const char *pfx = strempty(prefix);
+
+        /* NB: developer-facing debug output only. The format is neither meant to be machine readable
+         * nor stable between versions; fields may be added, removed or restructured at will. */
+        fprintf(f, "%sManager: systemd " STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")\n", pfx);
+        fprintf(f, "%sFeatures: %s\n", pfx, systemd_features);
+
+        for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+                const dual_timestamp *ts = &m->timestamps[q];
+
+                if (!dual_timestamp_is_set(ts))
+                        continue;
+
+                /* Show the realtime value if there is one, otherwise the monotonic one as time span */
+                fprintf(f, "%sTimestamp %s: %s\n", pfx, manager_timestamp_to_string(q),
+                        timestamp_is_set(ts->realtime) ? FORMAT_TIMESTAMP(ts->realtime) :
+                                                         FORMAT_TIMESPAN(ts->monotonic, 1));
+        }
+}
+
+void manager_dump(Manager *m, FILE *f, char **patterns, const char *prefix) {
+        bool unfiltered = !patterns;
+
+        assert(m);
+        assert(f);
+        /* The header (manager version, features, timestamps) is only part of a full dump. As soon as
+         * patterns are specified the output is limited to the units and jobs matching them. */
+        if (unfiltered)
+                manager_dump_header(m, f, prefix);
+        manager_dump_units(m, f, patterns, prefix);
+        manager_dump_jobs(m, f, patterns, prefix);
+}
+
+/* Same output as manager_dump() (with no prefix), but collected in a memory stream and handed to
+ * memstream_finalize() to fill in *ret. Returns -errno if the stream cannot be created. */
+int manager_get_dump_string(Manager *m, char **patterns, char **ret) {
+        _cleanup_(memstream_done) MemStream stream = {};
+        FILE *out;
+
+        assert(m);
+        assert(ret);
+
+        out = memstream_init(&stream);
+        if (!out)
+                return -errno;
+        manager_dump(m, out, patterns, NULL);
+        return memstream_finalize(&stream, ret, NULL);
+}
+
+void manager_test_summary(Manager *m) {
+        assert(m);
+
+        /* Tab-prefixed listing of all units, then of all jobs, on stdout */
+        fputs("-> By units:\n", stdout);
+        manager_dump_units(m, stdout, /* patterns= */ NULL, "\t");
+        fputs("-> By jobs:\n", stdout);
+        manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t");
+}
diff --git a/src/core/manager-dump.h b/src/core/manager-dump.h
new file mode 100644
index 0000000..5b96f26
--- /dev/null
+++ b/src/core/manager-dump.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "manager.h"
+/* Debug dump helpers: write to a FILE or return the dump as a string via ret. */
+void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix);
+int manager_get_dump_jobs_string(Manager *m, char **patterns, const char *prefix, char **ret);
+void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix);
+void manager_dump(Manager *s, FILE *f, char **patterns, const char *prefix);
+int manager_get_dump_string(Manager *m, char **patterns, char **ret);
+void manager_test_summary(Manager *m);
diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c
new file mode 100644
index 0000000..e9d567a
--- /dev/null
+++ b/src/core/manager-serialize.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "clean-ipc.h"
+#include "core-varlink.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "initrd-util.h"
+#include "macro.h"
+#include "manager-serialize.h"
+#include "manager.h"
+#include "parse-util.h"
+#include "serialize.h"
+#include "syslog-util.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+#include "varlink-internal.h"
+
+int manager_open_serialization(Manager *m, FILE **ret_f) {
+        assert(ret_f);
+        /* m is not used here; the serialization file is always requested under the name "systemd-state". */
+        return open_serialization_file("systemd-state", ret_f);
+}
+
+static bool manager_timestamp_shall_serialize(ManagerTimestamp t) {
+        /* Timestamps that only apply to the host system, and hence are only serialized there */
+        bool host_only = IN_SET(t,
+                                MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH,
+                                MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH,
+                                MANAGER_TIMESTAMP_GENERATORS_START, MANAGER_TIMESTAMP_GENERATORS_FINISH,
+                                MANAGER_TIMESTAMP_UNITS_LOAD_START, MANAGER_TIMESTAMP_UNITS_LOAD_FINISH);
+
+        /* Outside of the initrd every timestamp is serialized */
+        return !in_initrd() || !host_only;
+}
+
+/* Writes one item named field_name, carrying the numeric id, for every entry of uid_refs that has
+ * DESTROY_IPC_FLAG set. The table is keyed by id; the values hold the flag bit. */
+static void manager_serialize_uid_refs_internal(
+                FILE *f,
+                Hashmap *uid_refs,
+                const char *field_name) {
+
+        void *value, *key;
+
+        assert(f);
+        assert(field_name);
+
+        /* Of the reference table only the IPC destruction flag is serialized; the actual counter is
+         * better rebuilt after a reload/reexec. */
+
+        HASHMAP_FOREACH_KEY(value, key, uid_refs) {
+                uint32_t refs = PTR_TO_UINT32(value);
+                uid_t id = PTR_TO_UID(key);
+
+                /* Entries without the flag leave no trace in the serialization */
+                if ((refs & DESTROY_IPC_FLAG) == 0)
+                        continue;
+
+                (void) serialize_item_format(f, field_name, UID_FMT, id);
+        }
+}
+
+static void manager_serialize_uid_refs(Manager *m, FILE *f) {
+        manager_serialize_uid_refs_internal(f, m->uid_refs, "destroy-ipc-uid"); /* UID reference table */
+}
+
+static void manager_serialize_gid_refs(Manager *m, FILE *f) {
+        manager_serialize_uid_refs_internal(f, m->gid_refs, "destroy-ipc-gid"); /* GID reference table */
+}
+
+int manager_serialize(
+                Manager *m,
+                FILE *f,
+                FDSet *fds,
+                bool switching_root) {
+        /* Writes the manager-level state, then each unit's state, to f; fds to pass along go into fds. */
+        const char *t;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(f);
+        assert(fds);
+        /* Flag the manager as reloading while we serialize; the cleanup handler undoes this on return. */
+        _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
+
+        (void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id);
+        (void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs);
+        (void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs);
+        (void) serialize_bool(f, "ready-sent", m->ready_sent);
+        (void) serialize_bool(f, "taint-logged", m->taint_logged);
+        (void) serialize_bool(f, "service-watchdogs", m->service_watchdogs);
+
+        if (m->show_status_overridden != _SHOW_STATUS_INVALID)
+                (void) serialize_item(f, "show-status-overridden",
+                                      show_status_to_string(m->show_status_overridden));
+
+        if (m->log_level_overridden)
+                (void) serialize_item_format(f, "log-level-override", "%i", log_get_max_level());
+        if (m->log_target_overridden)
+                (void) serialize_item(f, "log-target-override", log_target_to_string(log_get_target()));
+
+        (void) serialize_usec(f, "runtime-watchdog-overridden", m->watchdog_overridden[WATCHDOG_RUNTIME]);
+        (void) serialize_usec(f, "reboot-watchdog-overridden", m->watchdog_overridden[WATCHDOG_REBOOT]);
+        (void) serialize_usec(f, "kexec-watchdog-overridden", m->watchdog_overridden[WATCHDOG_KEXEC]);
+        (void) serialize_usec(f, "pretimeout-watchdog-overridden", m->watchdog_overridden[WATCHDOG_PRETIMEOUT]);
+        (void) serialize_item(f, "pretimeout-watchdog-governor-overridden", m->watchdog_pretimeout_governor_overridden);
+        /* Timestamps use the key "<name>-timestamp"; those the helper above excludes are left out. */
+        for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+                _cleanup_free_ char *joined = NULL;
+
+                if (!manager_timestamp_shall_serialize(q))
+                        continue;
+
+                joined = strjoin(manager_timestamp_to_string(q), "-timestamp");
+                if (!joined)
+                        return log_oom();
+
+                (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
+        }
+        /* The client environment is not carried over when switching root. */
+        if (!switching_root)
+                (void) serialize_strv(f, "env", m->client_environment);
+
+        if (m->notify_fd >= 0) {
+                r = serialize_fd(f, fds, "notify-fd", m->notify_fd);
+                if (r < 0)
+                        return r;
+
+                (void) serialize_item(f, "notify-socket", m->notify_socket);
+        }
+
+        if (m->cgroups_agent_fd >= 0) {
+                r = serialize_fd(f, fds, "cgroups-agent-fd", m->cgroups_agent_fd);
+                if (r < 0)
+                        return r;
+        }
+        /* Both user lookup fds are duplicated into fds; the numbers of the copies are what is recorded. */
+        if (m->user_lookup_fds[0] >= 0) {
+                int copy0, copy1;
+
+                copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]);
+                if (copy0 < 0)
+                        return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m");
+
+                copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]);
+                if (copy1 < 0)
+                        return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m");
+
+                (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1);
+        }
+
+        (void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit);
+
+        bus_track_serialize(m->subscribed, f, "subscribed");
+
+        r = dynamic_user_serialize(m, f, fds);
+        if (r < 0)
+                return r;
+
+        manager_serialize_uid_refs(m, f);
+        manager_serialize_gid_refs(m, f);
+
+        r = exec_shared_runtime_serialize(m, f, fds);
+        if (r < 0)
+                return r;
+
+        r = varlink_server_serialize(m->varlink_server, f, fds);
+        if (r < 0)
+                return r;
+        /* An empty line separates the manager-level items from the per-unit state that follows. */
+        (void) fputc('\n', f);
+        /* Serialize each unit once: only at the hashmap entry whose key is the unit's own id pointer. */
+        HASHMAP_FOREACH_KEY(u, t, m->units) {
+                if (u->id != t)
+                        continue;
+
+                r = unit_serialize_state(u, f, fds, switching_root);
+                if (r < 0)
+                        return r;
+        }
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to flush serialization: %m");
+
+        r = bus_fdset_add_all(m, fds);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add bus sockets to serialization: %m");
+
+        return 0;
+}
+
+/* Loads the unit called name and feeds it its serialized state from f. -ENOMEM is passed up without
+ * logging; any other failure is logged at notice level and the result of the log call is returned. */
+static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) {
+        Unit *unit;
+        int r;
+
+        r = manager_load_unit(m, name, NULL, NULL, &unit);
+        if (r == -ENOMEM)
+                return r;
+        if (r < 0)
+                return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name);
+
+        r = unit_deserialize_state(unit, f, fds);
+        if (r == -ENOMEM)
+                return r;
+        if (r < 0)
+                return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name);
+
+        return 0;
+}
+
+/* Reads unit sections from f until read_stripped_line() returns 0. Each section starts with a line
+ * naming the unit; if that unit cannot be restored, its section is skipped and parsing goes on. */
+static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) {
+        for (;;) {
+                _cleanup_free_ char *unit_name = NULL;
+                int r;
+
+                /* Start marker: the line carrying the unit name */
+                r = read_stripped_line(f, LONG_LINE_MAX, &unit_name);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read serialization line: %m");
+                if (r == 0)
+                        return 0;
+
+                r = manager_deserialize_one_unit(m, unit_name, f, fds);
+                if (r == -ENOMEM)
+                        return r;
+                if (r >= 0)
+                        continue;
+                /* Not fatal: skip over what belongs to this unit and carry on with the next one */
+                r = unit_deserialize_state_skip(f);
+                if (r < 0)
+                        return r;
+        }
+}
+
+/* Restores one serialized IPC destruction marker: parses value as a UID/GID and sets DESTROY_IPC_FLAG
+ * on its entry in *uid_refs, allocating the table if necessary. Failures are only logged. */
+static void manager_deserialize_uid_refs_one_internal(
+                Hashmap** uid_refs,
+                const char *value) {
+
+        uint32_t refs;
+        uid_t id;
+        int r;
+
+        assert(uid_refs);
+        assert(value);
+
+        /* ID 0 is rejected just like unparsable input */
+        if (parse_uid(value, &id) < 0 || id == 0) {
+                log_debug("Unable to parse UID/GID reference serialization: %s", value);
+                return;
+        }
+
+        r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops);
+        if (r < 0) {
+                log_oom();
+                return;
+        }
+
+        refs = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(id)));
+        if (refs & DESTROY_IPC_FLAG)
+                return; /* already marked, nothing to do */
+
+        /* Keep whatever is stored for this id already and add the flag on top */
+        r = hashmap_replace(*uid_refs, UID_TO_PTR(id), UINT32_TO_PTR(refs | DESTROY_IPC_FLAG));
+        if (r < 0)
+                log_debug_errno(r, "Failed to add UID/GID reference entry: %m");
+}
+
+static void manager_deserialize_uid_refs_one(Manager *m, const char *value) {
+        manager_deserialize_uid_refs_one_internal(&m->uid_refs, value); /* UID reference table */
+}
+
+static void manager_deserialize_gid_refs_one(Manager *m, const char *value) {
+        manager_deserialize_uid_refs_one_internal(&m->gid_refs, value); /* GID reference table */
+}
+
+int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
+        bool deserialize_varlink_sockets = false;
+        int r = 0;
+        /* Counterpart of manager_serialize(): parses the manager-level items from f, then the unit sections. */
+        assert(m);
+        assert(f);
+
+        if (DEBUG_LOGGING) {
+                if (fdset_isempty(fds))
+                        log_debug("No file descriptors passed");
+                else {
+                        int fd;
+
+                        FDSET_FOREACH(fd, fds) {
+                                _cleanup_free_ char *fn = NULL;
+
+                                r = fd_get_path(fd, &fn);
+                                if (r < 0)
+                                        log_debug_errno(r, "Received serialized fd %i %s %m",
+                                                        fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT));
+                                else
+                                        log_debug("Received serialized fd %i %s %s",
+                                                  fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), strna(fn));
+                        }
+                }
+        }
+
+        log_debug("Deserializing state...");
+
+        /* If we are not in reload mode yet, enter it now. Note that this is recursive, a caller might already have
+         * increased it to non-zero, which is why we just increase it by one here and down again at the end of this
+         * call. */
+        _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
+
+        for (;;) {
+                _cleanup_free_ char *l = NULL;
+                const char *val;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                if ((val = startswith(l, "current-job-id="))) {
+                        uint32_t id;
+
+                        if (safe_atou32(val, &id) < 0)
+                                log_notice("Failed to parse current job id value '%s', ignoring.", val);
+                        else
+                                m->current_job_id = MAX(m->current_job_id, id); /* never move the counter backwards */
+
+                } else if ((val = startswith(l, "n-installed-jobs="))) {
+                        uint32_t n;
+
+                        if (safe_atou32(val, &n) < 0)
+                                log_notice("Failed to parse installed jobs counter '%s', ignoring.", val);
+                        else
+                                m->n_installed_jobs += n; /* added on top of the current count */
+
+                } else if ((val = startswith(l, "n-failed-jobs="))) {
+                        uint32_t n;
+
+                        if (safe_atou32(val, &n) < 0)
+                                log_notice("Failed to parse failed jobs counter '%s', ignoring.", val);
+                        else
+                                m->n_failed_jobs += n; /* added on top of the current count */
+
+                } else if ((val = startswith(l, "ready-sent="))) {
+                        int b;
+
+                        b = parse_boolean(val);
+                        if (b < 0)
+                                log_notice("Failed to parse ready-sent flag '%s', ignoring.", val);
+                        else
+                                m->ready_sent = m->ready_sent || b; /* sticky: a set flag is never cleared here */
+
+                } else if ((val = startswith(l, "taint-logged="))) {
+                        int b;
+
+                        b = parse_boolean(val);
+                        if (b < 0)
+                                log_notice("Failed to parse taint-logged flag '%s', ignoring.", val);
+                        else
+                                m->taint_logged = m->taint_logged || b; /* sticky, as above */
+
+                } else if ((val = startswith(l, "service-watchdogs="))) {
+                        int b;
+
+                        b = parse_boolean(val);
+                        if (b < 0)
+                                log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val);
+                        else
+                                m->service_watchdogs = b;
+
+                } else if ((val = startswith(l, "show-status-overridden="))) {
+                        ShowStatus s;
+
+                        s = show_status_from_string(val);
+                        if (s < 0)
+                                log_notice("Failed to parse show-status-overridden flag '%s', ignoring.", val);
+                        else
+                                manager_override_show_status(m, s, "deserialize");
+
+                } else if ((val = startswith(l, "log-level-override="))) {
+                        int level;
+
+                        level = log_level_from_string(val);
+                        if (level < 0)
+                                log_notice("Failed to parse log-level-override value '%s', ignoring.", val);
+                        else
+                                manager_override_log_level(m, level);
+
+                } else if ((val = startswith(l, "log-target-override="))) {
+                        LogTarget target;
+
+                        target = log_target_from_string(val);
+                        if (target < 0)
+                                log_notice("Failed to parse log-target-override value '%s', ignoring.", val);
+                        else
+                                manager_override_log_target(m, target);
+
+                } else if ((val = startswith(l, "runtime-watchdog-overridden="))) {
+                        usec_t t;
+
+                        if (deserialize_usec(val, &t) < 0)
+                                log_notice("Failed to parse runtime-watchdog-overridden value '%s', ignoring.", val);
+                        else
+                                manager_override_watchdog(m, WATCHDOG_RUNTIME, t);
+
+                } else if ((val = startswith(l, "reboot-watchdog-overridden="))) {
+                        usec_t t;
+
+                        if (deserialize_usec(val, &t) < 0)
+                                log_notice("Failed to parse reboot-watchdog-overridden value '%s', ignoring.", val);
+                        else
+                                manager_override_watchdog(m, WATCHDOG_REBOOT, t);
+
+                } else if ((val = startswith(l, "kexec-watchdog-overridden="))) {
+                        usec_t t;
+
+                        if (deserialize_usec(val, &t) < 0)
+                                log_notice("Failed to parse kexec-watchdog-overridden value '%s', ignoring.", val);
+                        else
+                                manager_override_watchdog(m, WATCHDOG_KEXEC, t);
+
+                } else if ((val = startswith(l, "pretimeout-watchdog-overridden="))) {
+                        usec_t t;
+
+                        if (deserialize_usec(val, &t) < 0)
+                                log_notice("Failed to parse pretimeout-watchdog-overridden value '%s', ignoring.", val);
+                        else
+                                manager_override_watchdog(m, WATCHDOG_PRETIMEOUT, t);
+
+                } else if ((val = startswith(l, "pretimeout-watchdog-governor-overridden="))) {
+                        r = free_and_strdup(&m->watchdog_pretimeout_governor_overridden, val);
+                        if (r < 0)
+                                return r;
+
+                } else if (startswith(l, "env=")) {
+                        r = deserialize_environment(l + 4, &m->client_environment);
+                        if (r < 0)
+                                log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l);
+
+                } else if ((val = startswith(l, "notify-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd >= 0) {
+                                m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
+                                safe_close(m->notify_fd);
+                                m->notify_fd = fd;
+                        }
+
+                } else if ((val = startswith(l, "notify-socket="))) {
+                        r = free_and_strdup(&m->notify_socket, val);
+                        if (r < 0)
+                                return r;
+
+                } else if ((val = startswith(l, "cgroups-agent-fd="))) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, val);
+                        if (fd >= 0) {
+                                m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
+                                safe_close(m->cgroups_agent_fd);
+                                m->cgroups_agent_fd = fd;
+                        }
+
+                } else if ((val = startswith(l, "user-lookup="))) {
+                        int fd0, fd1;
+
+                        if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1))
+                                log_notice("Failed to parse user lookup fd, ignoring: %s", val);
+                        else {
+                                m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+                                safe_close_pair(m->user_lookup_fds);
+                                m->user_lookup_fds[0] = fdset_remove(fds, fd0);
+                                m->user_lookup_fds[1] = fdset_remove(fds, fd1);
+                        }
+
+                } else if ((val = startswith(l, "dynamic-user=")))
+                        dynamic_user_deserialize_one(m, val, fds, NULL);
+                else if ((val = startswith(l, "destroy-ipc-uid=")))
+                        manager_deserialize_uid_refs_one(m, val);
+                else if ((val = startswith(l, "destroy-ipc-gid=")))
+                        manager_deserialize_gid_refs_one(m, val);
+                else if ((val = startswith(l, "exec-runtime=")))
+                        (void) exec_shared_runtime_deserialize_one(m, val, fds);
+                else if ((val = startswith(l, "subscribed="))) {
+
+                        if (strv_extend(&m->deserialized_subscribed, val) < 0)
+                                return -ENOMEM;
+                } else if ((val = startswith(l, "varlink-server-socket-address="))) {
+                        if (!m->varlink_server && MANAGER_IS_SYSTEM(m)) {
+                                r = manager_varlink_init(m);
+                                if (r < 0) {
+                                        log_warning_errno(r, "Failed to setup varlink server, ignoring: %m");
+                                        continue;
+                                }
+
+                                deserialize_varlink_sockets = true;
+                        }
+
+                        /* To avoid unnecessary deserialization (i.e. during reload vs. reexec) we only deserialize
+                         * the FDs if we had to create a new m->varlink_server. The deserialize_varlink_sockets flag
+                         * is initialized outside of the loop, is flipped after the VarlinkServer is setup, and
+                         * remains set until all serialized contents are handled. */
+                        if (deserialize_varlink_sockets)
+                                (void) varlink_server_deserialize_one(m->varlink_server, val, fds);
+                } else if ((val = startswith(l, "dump-ratelimit=")))
+                        deserialize_ratelimit(&m->dump_ratelimit, "dump-ratelimit", val);
+                else {
+                        ManagerTimestamp q;
+
+                        for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+                                val = startswith(l, manager_timestamp_to_string(q));
+                                if (!val)
+                                        continue;
+
+                                val = startswith(val, "-timestamp=");
+                                if (val)
+                                        break;
+                        }
+
+                        if (q < _MANAGER_TIMESTAMP_MAX) /* found it */
+                                (void) deserialize_dual_timestamp(val, m->timestamps + q);
+                        else if (!STARTSWITH_SET(l, "kdbus-fd=", "honor-device-enumeration=")) /* ignore deprecated values */
+                                log_notice("Unknown serialization item '%s', ignoring.", l);
+                }
+        }
+        /* The manager-level part is done; the remainder of f is handed to the per-unit parser. */
+        return manager_deserialize_units(m, f, fds);
+}
diff --git a/src/core/manager-serialize.h b/src/core/manager-serialize.h
new file mode 100644
index 0000000..c52261e
--- /dev/null
+++ b/src/core/manager-serialize.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "manager.h"
+#include "fdset.h"
+
+#define DESTROY_IPC_FLAG (UINT32_C(1) << 31) /* top bit of the values stored in the uid/gid refs tables */
+
+int manager_open_serialization(Manager *m, FILE **ret_f);
+int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root);
+int manager_deserialize(Manager *m, FILE *f, FDSet *fds);
diff --git a/src/core/manager.c b/src/core/manager.c
new file mode 100644
index 0000000..88eebfc
--- /dev/null
+++ b/src/core/manager.c
@@ -0,0 +1,5039 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#if HAVE_AUDIT
+#include 
+#endif
+
+#include "sd-daemon.h"
+#include "sd-messages.h"
+#include "sd-path.h"
+
+#include "all-units.h"
+#include "alloc-util.h"
+#include "audit-fd.h"
+#include "boot-timestamps.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-kernel.h"
+#include "bus-util.h"
+#include "clean-ipc.h"
+#include "clock-util.h"
+#include "common-signal.h"
+#include "confidential-virt.h"
+#include "constants.h"
+#include "core-varlink.h"
+#include "creds-util.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "dirent-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "event-util.h"
+#include "exec-util.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "generator-setup.h"
+#include "hashmap.h"
+#include "initrd-util.h"
+#include "inotify-util.h"
+#include "install.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "load-fragment.h"
+#include "locale-setup.h"
+#include "log.h"
+#include "macro.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "manager-serialize.h"
+#include "memory-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "plymouth-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "ratelimit.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "strxcpyx.h"
+#include "sysctl-util.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "transaction.h"
+#include "uid-range.h"
+#include "umask-util.h"
+#include "unit-name.h"
+#include "user-util.h"
+#include "virt.h"
+#include "watchdog.h"
+
+#define NOTIFY_RCVBUF_SIZE (8*1024*1024)
+#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024)
+
+/* Initial delay and the interval for printing status messages about running jobs */
+#define JOBS_IN_PROGRESS_WAIT_USEC (2*USEC_PER_SEC)
+#define JOBS_IN_PROGRESS_QUIET_WAIT_USEC (25*USEC_PER_SEC)
+#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3)
+#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3
+
+/* If there are more than 1K bus messages queued across our API and direct buses, then let's not add more on top until
+ * the queue gets more empty. */
+#define MANAGER_BUS_BUSY_THRESHOLD 1024LU
+
+/* How many units and jobs to process of the bus queue before returning to the event loop. */
+#define MANAGER_BUS_MESSAGE_BUDGET 100U
+
+#define DEFAULT_TASKS_MAX ((CGroupTasksMax) { 15U, 100U }) /* 15% */
+
+static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
+static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
+static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
+static int manager_dispatch_timezone_change(sd_event_source *source, const struct inotify_event *event, void *userdata);
+static int manager_run_environment_generators(Manager *m);
+static int manager_run_generators(Manager *m);
+static void manager_vacuum(Manager *m);
+
+static usec_t manager_watch_jobs_next_time(Manager *m) {
+        usec_t timeout; /* relative delay; turned into an absolute CLOCK_MONOTONIC time on return */
+
+        if (MANAGER_IS_USER(m))
+                /* Let the user manager without a timeout show status quickly, so the system manager can make
+                 * use of it, if it wants to. */
+                timeout = JOBS_IN_PROGRESS_WAIT_USEC * 2 / 3;
+        else if (show_status_on(m->show_status))
+                /* When status is on, just use the usual timeout. */
+                timeout = JOBS_IN_PROGRESS_WAIT_USEC;
+        else
+                timeout = JOBS_IN_PROGRESS_QUIET_WAIT_USEC; /* status is not on: use the longer "quiet" delay */
+
+        return usec_add(now(CLOCK_MONOTONIC), timeout);
+}
+
+static bool manager_is_confirm_spawn_disabled(Manager *m) {
+        assert(m);
+
+        if (!m->confirm_spawn)
+                return true; /* confirm_spawn is unset, so confirmation counts as disabled */
+
+        return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0; /* disabled if the flag file exists */
+}
+
+static void manager_watch_jobs_in_progress(Manager *m) {
+        usec_t next;
+        int r;
+
+        assert(m);
+
+        /* Do not show the cylon animation while the user has to confirm
+         * service executions, as the animation would garble the
+         * confirmation messages. */
+        if (!manager_is_confirm_spawn_disabled(m))
+                return;
+
+        if (m->jobs_in_progress_event_source)
+                return; /* the timer source already exists */
+
+        next = manager_watch_jobs_next_time(m);
+        r = sd_event_add_time(
+                        m->event,
+                        &m->jobs_in_progress_event_source,
+                        CLOCK_MONOTONIC,
+                        next, 0,
+                        manager_dispatch_jobs_in_progress, m);
+        if (r < 0)
+                return; /* failure is ignored: there is then simply no progress timer */
+
+        (void) sd_event_source_set_description(m->jobs_in_progress_event_source, "manager-jobs-in-progress");
+}
+
+static void manager_flip_auto_status(Manager *m, bool enable, const char *reason) {
+        assert(m);
+
+        if (enable) { /* AUTO -> TEMPORARY; every other state is left untouched */
+                if (m->show_status == SHOW_STATUS_AUTO)
+                        manager_set_show_status(m, SHOW_STATUS_TEMPORARY, reason);
+        } else { /* TEMPORARY -> AUTO; every other state is left untouched */
+                if (m->show_status == SHOW_STATUS_TEMPORARY)
+                        manager_set_show_status(m, SHOW_STATUS_AUTO, reason);
+        }
+}
+
+static void manager_print_jobs_in_progress(Manager *m) {
+        Job *j;
+        unsigned counter = 0, print_nr;
+        char cylon[6 + CYLON_BUFFER_EXTRA + 1];
+        unsigned cylon_pos;
+        uint64_t timeout = 0; /* if it is still 0 after job_get_timeout(), "no limit" is printed */
+
+        assert(m);
+        assert(m->n_running_jobs > 0);
+
+        manager_flip_auto_status(m, true, "delay");
+
+        /* Move on to the next running job every JOBS_IN_PROGRESS_PERIOD_DIVISOR iterations, wrapping around. */
+        print_nr = (m->jobs_in_progress_iteration / JOBS_IN_PROGRESS_PERIOD_DIVISOR) % m->n_running_jobs;
+
+        HASHMAP_FOREACH(j, m->jobs)
+                if (j->state == JOB_RUNNING && counter++ == print_nr)
+                        break;
+
+        /* m->n_running_jobs must be consistent with the contents of m->jobs,
+         * so the above loop must have succeeded in finding j. */
+        assert(counter == print_nr + 1);
+        assert(j);
+
+        cylon_pos = m->jobs_in_progress_iteration % 14; /* 14-step cycle: 0..7 going up, then 6..1 coming back */
+        if (cylon_pos >= 8)
+                cylon_pos = 14 - cylon_pos;
+        draw_cylon(cylon, sizeof(cylon), 6, cylon_pos);
+
+        m->jobs_in_progress_iteration++;
+
+        char job_of_n[STRLEN("( of ) ") + DECIMAL_STR_MAX(unsigned)*2] = ""; /* stays empty for a single job */
+        if (m->n_running_jobs > 1)
+                xsprintf(job_of_n, "(%u of %u) ", counter, m->n_running_jobs);
+
+        (void) job_get_timeout(j, &timeout);
+
+        /* We want to use enough information for the user to identify previous lines talking about the same
+         * unit, but keep the message as short as possible. So if 'Starting foo.service' or 'Starting
+         * foo.service - Description' were used, 'foo.service' is enough here. On the other hand, if we used
+         * 'Starting Description' before, then we shall also use 'Description' here. So we pass NULL as the
+         * second argument to unit_status_string(). */
+        const char *ident = unit_status_string(j->unit, NULL);
+
+        const char *time = FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - j->begin_usec, 1*USEC_PER_SEC);
+        const char *limit = timeout > 0 ? FORMAT_TIMESPAN(timeout - j->begin_usec, 1*USEC_PER_SEC) : "no limit";
+
+        if (m->status_unit_format == STATUS_UNIT_FORMAT_DESCRIPTION)
+                /* When using 'Description', we effectively don't have enough space to show the nested status
+                 * without ellipsization, so let's not even try. */
+                manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon,
+                                      "%sA %s job is running for %s (%s / %s)",
+                                      job_of_n,
+                                      job_type_to_string(j->type),
+                                      ident,
+                                      time, limit);
+        else {
+                const char *status_text = unit_status_text(j->unit);
+
+                manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon,
+                                      "%sJob %s/%s running (%s / %s)%s%s",
+                                      job_of_n,
+                                      ident,
+                                      job_type_to_string(j->type),
+                                      time, limit,
+                                      status_text ? ": " : "",
+                                      strempty(status_text));
+        }
+
+        sd_notifyf(false,
+                   "STATUS=%sUser job %s/%s running (%s / %s)...",
+                   job_of_n,
+                   ident,
+                   job_type_to_string(j->type),
+                   time, limit);
+        m->status_ready = false; /* the STATUS= text sent above replaced whatever was reported before */
+}
+
+static int have_ask_password(void) {
+        _cleanup_closedir_ DIR *dir = NULL;
+
+        dir = opendir("/run/systemd/ask-password");
+        if (!dir) {
+                if (errno == ENOENT)
+                        return false; /* no directory, hence no pending queries */
+                else
+                        return -errno;
+        }
+
+        FOREACH_DIRENT_ALL(de, dir, return -errno)
+                if (startswith(de->d_name, "ask."))
+                        return true; /* one "ask."-prefixed entry is enough */
+        return false;
+}
+
+static int manager_dispatch_ask_password_fd(sd_event_source *source,
+                                            int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        (void) flush_fd(fd); /* the event contents are not inspected; the directory is rescanned below */
+
+        m->have_ask_password = have_ask_password();
+        if (m->have_ask_password < 0)
+                /* Log error but continue. Negative have_ask_password
+                 * is treated as unknown status. */
+                log_error_errno(m->have_ask_password, "Failed to list /run/systemd/ask-password: %m");
+
+        return 0;
+}
+
+static void manager_close_ask_password(Manager *m) {
+        assert(m);
+
+        m->ask_password_event_source = sd_event_source_disable_unref(m->ask_password_event_source);
+        m->ask_password_inotify_fd = safe_close(m->ask_password_inotify_fd);
+        m->have_ask_password = -EINVAL; /* negative means the status is unknown */
+}
+
+static int manager_check_ask_password(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (!m->ask_password_event_source) { /* no watch yet: set up inotify on the query directory now */
+                assert(m->ask_password_inotify_fd < 0);
+
+                (void) mkdir_p_label("/run/systemd/ask-password", 0755);
+
+                m->ask_password_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+                if (m->ask_password_inotify_fd < 0)
+                        return log_error_errno(errno, "Failed to create inotify object: %m");
+
+                r = inotify_add_watch_and_warn(m->ask_password_inotify_fd,
+                                               "/run/systemd/ask-password",
+                                               IN_CREATE|IN_DELETE|IN_MOVE);
+                if (r < 0) {
+                        manager_close_ask_password(m);
+                        return r;
+                }
+
+                r = sd_event_add_io(m->event, &m->ask_password_event_source,
+                                    m->ask_password_inotify_fd, EPOLLIN,
+                                    manager_dispatch_ask_password_fd, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to add event source for /run/systemd/ask-password: %m");
+                        manager_close_ask_password(m);
+                        return r;
+                }
+
+                (void) sd_event_source_set_description(m->ask_password_event_source, "manager-ask-password");
+
+                /* Queries might have been added before the watch was in place, so check once right away. */
+                manager_dispatch_ask_password_fd(m->ask_password_event_source,
+                                                 m->ask_password_inotify_fd, EPOLLIN, m);
+        }
+
+        return m->have_ask_password; /* cached value, refreshed by the dispatch callback */
+}
+
+static int manager_watch_idle_pipe(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (m->idle_pipe_event_source)
+                return 0; /* already being watched */
+
+        if (m->idle_pipe[2] < 0)
+                return 0; /* the fd that would be watched is not open */
+
+        r = sd_event_add_io(m->event, &m->idle_pipe_event_source, m->idle_pipe[2], EPOLLIN, manager_dispatch_idle_pipe_fd, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to watch idle pipe: %m");
+
+        (void) sd_event_source_set_description(m->idle_pipe_event_source, "manager-idle-pipe");
+
+        return 0;
+}
+
+static void manager_close_idle_pipe(Manager *m) {
+        assert(m);
+
+        m->idle_pipe_event_source = sd_event_source_disable_unref(m->idle_pipe_event_source);
+
+        safe_close_pair(m->idle_pipe); /* idle_pipe[] is closed as two fd pairs: [0..1] here ... */
+        safe_close_pair(m->idle_pipe + 2); /* ... and [2..3] here */
+}
+
+static int manager_setup_time_change(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        /* Drop any previously installed source first, so that this function can be called repeatedly. */
+        m->time_change_event_source = sd_event_source_disable_unref(m->time_change_event_source);
+
+        r = event_add_time_change(m->event, &m->time_change_event_source, manager_dispatch_time_change_fd, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create time change event source: %m");
+
+        /* Schedule this slightly earlier than the .timer event sources */
+        r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set priority of time change event sources: %m");
+
+        log_debug("Set up TFD_TIMER_CANCEL_ON_SET timerfd.");
+
+        return 0;
+}
+
+static int manager_read_timezone_stat(Manager *m) {
+        struct stat st;
+        bool changed;
+
+        assert(m);
+
+        /* Read the current lstat() data of /etc/localtime (symlinks are not followed) so that we detect changes */
+        if (lstat("/etc/localtime", &st) < 0) {
+                log_debug_errno(errno, "Failed to stat /etc/localtime, ignoring: %m");
+                changed = m->etc_localtime_accessible; /* only a change if it was accessible before */
+                m->etc_localtime_accessible = false;
+        } else {
+                usec_t k;
+
+                k = timespec_load(&st.st_mtim);
+                changed = !m->etc_localtime_accessible || k != m->etc_localtime_mtime;
+
+                m->etc_localtime_mtime = k;
+                m->etc_localtime_accessible = true;
+        }
+
+        return changed; /* true if accessibility or the mtime differs from the recorded state */
+}
+
+static int manager_setup_timezone_change(Manager *m) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *new_event = NULL;
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        /* We watch /etc/localtime for three events: change of the link count (which might mean removal from /etc even
+         * though another link might be kept), renames, and file close operations after writing. Note we don't bother
+         * with IN_DELETE_SELF, as that would just report when the inode is removed entirely, i.e. after the link count
+         * went to zero and all fds to it are closed.
+         *
+         * Note that we never follow symlinks here. This is a simplification, but should cover almost all cases
+         * correctly.
+         *
+         * Note that we create the new event source first here, before releasing the old one. This should optimize
+         * behaviour as this way sd-event can reuse the old watch in case the inode didn't change. */
+
+        r = sd_event_add_inotify(m->event, &new_event, "/etc/localtime",
+                                 IN_ATTRIB|IN_MOVE_SELF|IN_CLOSE_WRITE|IN_DONT_FOLLOW, manager_dispatch_timezone_change, m);
+        if (r == -ENOENT) {
+                /* If the file doesn't exist yet, subscribe to /etc instead, and wait until it is created either by
+                 * O_CREAT or by rename() */
+
+                log_debug_errno(r, "/etc/localtime doesn't exist yet, watching /etc instead.");
+                r = sd_event_add_inotify(m->event, &new_event, "/etc",
+                                         IN_CREATE|IN_MOVED_TO|IN_ONLYDIR, manager_dispatch_timezone_change, m);
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to create timezone change event source: %m");
+
+        /* Schedule this slightly earlier than the .timer event sources */
+        r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set priority of timezone change event sources: %m");
+
+        sd_event_source_unref(m->timezone_change_event_source); /* release the old source only now, see above */
+        m->timezone_change_event_source = TAKE_PTR(new_event);
+
+        return 0;
+}
+
+static int enable_special_signals(Manager *m) {
+        _cleanup_close_ int fd = -EBADF;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0; /* test runs leave ctrl-alt-del and kbrequest handling alone */
+
+        /* Enable that we get SIGINT on control-alt-del. In containers
+         * this will fail with EPERM (older) or EINVAL (newer), so
+         * ignore that. */
+        if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL))
+                log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m");
+
+        fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC);
+        if (fd < 0) {
+                /* Support systems without virtual console. The error code is carried in fd, not in errno. */
+                if (fd != -ENOENT)
+                        log_warning_errno(fd, "Failed to open /dev/tty0: %m");
+        } else {
+                /* Enable that we get SIGWINCH on kbrequest */
+                if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0)
+                        log_warning_errno(errno, "Failed to enable kbrequest handling: %m");
+        }
+
+        return 0; /* best effort: every failure above is only logged */
+}
+
+#define RTSIG_IF_AVAILABLE(signum) ((signum) <= SIGRTMAX ? (signum) : -1) /* -1 if signum is beyond SIGRTMAX */
+
+static int manager_setup_signals(Manager *m) {
+        struct sigaction sa = {
+                .sa_handler = SIG_DFL,
+                .sa_flags = SA_NOCLDSTOP|SA_RESTART,
+        };
+        sigset_t mask;
+        int r;
+
+        assert(m);
+
+        assert_se(sigaction(SIGCHLD, &sa, NULL) == 0);
+
+        /* We make liberal use of realtime signals here. On
+         * Linux/glibc we have 30 of them (with the exception of Linux
+         * on hppa, see below), between SIGRTMIN+0 ... SIGRTMIN+30
+         * (aka SIGRTMAX). */
+
+        assert_se(sigemptyset(&mask) == 0);
+        sigset_add_many(&mask,
+                        SIGCHLD,     /* Child died */
+                        SIGTERM,     /* Reexecute daemon */
+                        SIGHUP,      /* Reload configuration */
+                        SIGUSR1,     /* systemd: reconnect to D-Bus */
+                        SIGUSR2,     /* systemd: dump status */
+                        SIGINT,      /* Kernel sends us this on control-alt-del */
+                        SIGWINCH,    /* Kernel sends us this on kbrequest (alt-arrowup) */
+                        SIGPWR,      /* Some kernel drivers and upsd send us this on power failure */
+
+                        SIGRTMIN+0,  /* systemd: start default.target */
+                        SIGRTMIN+1,  /* systemd: isolate rescue.target */
+                        SIGRTMIN+2,  /* systemd: isolate emergency.target */
+                        SIGRTMIN+3,  /* systemd: start halt.target */
+                        SIGRTMIN+4,  /* systemd: start poweroff.target */
+                        SIGRTMIN+5,  /* systemd: start reboot.target */
+                        SIGRTMIN+6,  /* systemd: start kexec.target */
+                        SIGRTMIN+7,  /* systemd: start soft-reboot.target */
+
+                        /* ... space for more special targets ... */
+
+                        SIGRTMIN+13, /* systemd: Immediate halt */
+                        SIGRTMIN+14, /* systemd: Immediate poweroff */
+                        SIGRTMIN+15, /* systemd: Immediate reboot */
+                        SIGRTMIN+16, /* systemd: Immediate kexec */
+                        SIGRTMIN+17, /* systemd: Immediate soft-reboot */
+                        SIGRTMIN+18, /* systemd: control command */
+
+                        /* ... space ... */
+
+                        SIGRTMIN+20, /* systemd: enable status messages */
+                        SIGRTMIN+21, /* systemd: disable status messages */
+                        SIGRTMIN+22, /* systemd: set log level to LOG_DEBUG */
+                        SIGRTMIN+23, /* systemd: set log level to LOG_INFO */
+                        SIGRTMIN+24, /* systemd: Immediate exit (--user only) */
+                        SIGRTMIN+25, /* systemd: reexecute manager */
+
+                        /* Apparently Linux on hppa had fewer RT signals until v3.18,
+                         * SIGRTMAX was SIGRTMIN+25, and then SIGRTMIN was lowered,
+                         * see commit v3.17-7614-g1f25df2eff.
+                         *
+                         * We cannot unconditionally make use of those signals here,
+                         * so let's use a runtime check. Since these commands are
+                         * accessible by different means and only really a safety
+                         * net, the missing functionality on hppa shouldn't matter.
+                         */
+
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+26), /* systemd: set log target to journal-or-kmsg */
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+27), /* systemd: set log target to console */
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+28), /* systemd: set log target to kmsg */
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+29), /* systemd: set log target to syslog-or-kmsg (obsolete) */
+
+                        /* ... one free signal here SIGRTMIN+30 ... */
+                        -1); /* NOTE(review): presumably the list terminator - confirm in sigset_add_many() */
+        assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); /* the same mask is handed to signalfd() below */
+
+        m->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC);
+        if (m->signal_fd < 0)
+                return -errno;
+
+        r = sd_event_add_io(m->event, &m->signal_event_source, m->signal_fd, EPOLLIN, manager_dispatch_signal_fd, m);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(m->signal_event_source, "manager-signal");
+
+        /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the
+         * notify processing can still figure out to which process/service a message belongs, before we reap the
+         * process. Also, process this before handling cgroup notifications, so that we always collect child exit
+         * status information before detecting that there's no process in a cgroup. */
+        r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6);
+        if (r < 0)
+                return r;
+
+        if (MANAGER_IS_SYSTEM(m))
+                return enable_special_signals(m); /* ctrl-alt-del and kbrequest setup, system manager only */
+
+        return 0;
+}
+
+static char** sanitize_environment(char **l) {
+
+        /* Let's remove some environment variables that we need ourselves to communicate with our clients */
+        strv_env_unset_many(
+                        l,
+                        "CACHE_DIRECTORY",
+                        "CONFIGURATION_DIRECTORY",
+                        "CREDENTIALS_DIRECTORY",
+                        "EXIT_CODE",
+                        "EXIT_STATUS",
+                        "INVOCATION_ID",
+                        "JOURNAL_STREAM",
+                        "LISTEN_FDNAMES",
+                        "LISTEN_FDS",
+                        "LISTEN_PID",
+                        "LOGS_DIRECTORY",
+                        "LOG_NAMESPACE",
+                        "MAINPID",
+                        "MANAGERPID",
+                        "MEMORY_PRESSURE_WATCH",
+                        "MEMORY_PRESSURE_WRITE",
+                        "MONITOR_EXIT_CODE",
+                        "MONITOR_EXIT_STATUS",
+                        "MONITOR_INVOCATION_ID",
+                        "MONITOR_SERVICE_RESULT",
+                        "MONITOR_UNIT",
+                        "NOTIFY_SOCKET",
+                        "PIDFILE",
+                        "REMOTE_ADDR",
+                        "REMOTE_PORT",
+                        "RUNTIME_DIRECTORY",
+                        "SERVICE_RESULT",
+                        "STATE_DIRECTORY",
+                        "SYSTEMD_EXEC_PID",
+                        "TRIGGER_PATH",
+                        "TRIGGER_TIMER_MONOTONIC_USEC",
+                        "TRIGGER_TIMER_REALTIME_USEC",
+                        "TRIGGER_UNIT",
+                        "WATCHDOG_PID",
+                        "WATCHDOG_USEC",
+                        NULL);
+
+        /* Sort the environment alphabetically. This is purely cosmetic; the result of strv_sort() is returned. */
+        return strv_sort(l);
+}
+
+int manager_default_environment(Manager *m) {
+        int r;
+
+        assert(m);
+
+        m->transient_environment = strv_free(m->transient_environment); /* rebuilt from scratch below */
+
+        if (MANAGER_IS_SYSTEM(m)) {
+                /* The system manager always starts with a clean environment for its children. It does not
+                 * import the kernel's or the parents' exported variables.
+                 *
+                 * The initial passed environment is untouched to keep /proc/self/environ valid; it is used
+                 * for tagging the init process inside containers. */
+                m->transient_environment = strv_new("PATH=" DEFAULT_PATH);
+                if (!m->transient_environment)
+                        return log_oom();
+
+                /* Import locale variables LC_*= from configuration */
+                (void) locale_setup(&m->transient_environment);
+        } else {
+                /* The user manager passes its own environment along to its children, except for $PATH. */
+                m->transient_environment = strv_copy(environ);
+                if (!m->transient_environment)
+                        return log_oom();
+
+                r = strv_env_replace_strdup(&m->transient_environment, "PATH=" DEFAULT_USER_PATH);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        sanitize_environment(m->transient_environment); /* unset the manager-reserved variables, then sort */
+        return 0;
+}
+
+static int manager_setup_prefix(Manager *m) {
+        struct table_entry {
+                uint64_t type; /* SD_PATH_* identifier, first argument to sd_path_lookup() */
+                const char *suffix; /* second argument to sd_path_lookup(); NULL in most entries */
+        };
+
+        static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = {
+                [EXEC_DIRECTORY_RUNTIME] =       { SD_PATH_SYSTEM_RUNTIME,       NULL },
+                [EXEC_DIRECTORY_STATE] =         { SD_PATH_SYSTEM_STATE_PRIVATE, NULL },
+                [EXEC_DIRECTORY_CACHE] =         { SD_PATH_SYSTEM_STATE_CACHE,   NULL },
+                [EXEC_DIRECTORY_LOGS] =          { SD_PATH_SYSTEM_STATE_LOGS,    NULL },
+                [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL },
+        };
+
+        static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = {
+                [EXEC_DIRECTORY_RUNTIME] =       { SD_PATH_USER_RUNTIME,       NULL  },
+                [EXEC_DIRECTORY_STATE] =         { SD_PATH_USER_STATE_PRIVATE, NULL  },
+                [EXEC_DIRECTORY_CACHE] =         { SD_PATH_USER_STATE_CACHE,   NULL  },
+                [EXEC_DIRECTORY_LOGS] =          { SD_PATH_USER_STATE_PRIVATE, "log" },
+                [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL  },
+        };
+
+        assert(m);
+
+        const struct table_entry *p = MANAGER_IS_SYSTEM(m) ? paths_system : paths_user;
+        int r;
+
+        for (ExecDirectoryType i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) { /* fill m->prefix[] per directory type */
+                r = sd_path_lookup(p[i].type, p[i].suffix, &m->prefix[i]);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to lookup %s path: %m",
+                                                 exec_directory_type_to_string(i));
+        }
+
+        return 0;
+}
+
+static void manager_free_unit_name_maps(Manager *m) {
+        m->unit_id_map = hashmap_free(m->unit_id_map);
+        m->unit_name_map = hashmap_free(m->unit_name_map);
+        m->unit_path_cache = set_free(m->unit_path_cache);
+        m->unit_cache_timestamp_hash = 0; /* reset the hash together with the caches freed above */
+}
+
+static int manager_setup_run_queue(Manager *m) {
+        int r;
+
+        assert(m);
+        assert(!m->run_queue_event_source); /* may only be set up once */
+
+        r = sd_event_add_defer(m->event, &m->run_queue_event_source, manager_dispatch_run_queue, m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_enabled(m->run_queue_event_source, SD_EVENT_OFF); /* starts out disabled */
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(m->run_queue_event_source, "manager-run-queue");
+
+        return 0;
+}
+
+static int manager_setup_sigchld_event_source(Manager *m) {
+        int r;
+
+        assert(m);
+        assert(!m->sigchld_event_source); /* may only be set up once */
+
+        r = sd_event_add_defer(m->event, &m->sigchld_event_source, manager_dispatch_sigchld, m);
+        if (r < 0)
+                return r;
+
+        /* One step below the SD_EVENT_PRIORITY_NORMAL-6 that the signal fd source is given. */
+        r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF); /* starts out disabled */
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(m->sigchld_event_source, "manager-sigchld");
+
+        return 0;
+}
+
+int manager_setup_memory_pressure_event_source(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Drop any previously installed source first, so that this function can be called repeatedly. */
+        m->memory_pressure_event_source = sd_event_source_disable_unref(m->memory_pressure_event_source);
+
+        r = sd_event_add_memory_pressure(m->event, &m->memory_pressure_event_source, NULL, NULL);
+        if (r < 0)
+                log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? LOG_DEBUG : LOG_NOTICE, r,
+                               "Failed to establish memory pressure event source, ignoring: %m");
+        else if (m->defaults.memory_pressure_threshold_usec != USEC_INFINITY) {
+
+                /* If there's a default memory pressure threshold set, also apply it to the service manager itself */
+                r = sd_event_source_set_memory_pressure_period(
+                                m->memory_pressure_event_source,
+                                m->defaults.memory_pressure_threshold_usec,
+                                MEMORY_PRESSURE_DEFAULT_WINDOW_USEC);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to adjust memory pressure threshold, ignoring: %m");
+        }
+
+        return 0; /* always 0: the failures above are logged but never propagated */
+}
+
+static int manager_find_credentials_dirs(Manager *m) {
+        const char *e;
+        int r;
+
+        assert(m);
+
+        r = get_credentials_dir(&e);
+        if (r < 0) {
+                if (r != -ENXIO) /* -ENXIO is not even logged */
+                        log_debug_errno(r, "Failed to determine credentials directory, ignoring: %m");
+        } else {
+                m->received_credentials_directory = strdup(e); /* keep our own copy of the path */
+                if (!m->received_credentials_directory)
+                        return -ENOMEM;
+        }
+
+        r = get_encrypted_credentials_dir(&e);
+        if (r < 0) {
+                if (r != -ENXIO) /* -ENXIO is not even logged */
+                        log_debug_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m");
+        } else {
+                m->received_encrypted_credentials_directory = strdup(e); /* keep our own copy of the path */
+                if (!m->received_encrypted_credentials_directory)
+                        return -ENOMEM;
+        }
+
+        return 0; /* only allocation failure is reported as an error */
+}
+
+void manager_set_switching_root(Manager *m, bool switching_root) {
+        assert(m);
+
+        m->switching_root = MANAGER_IS_SYSTEM(m) && switching_root; /* can only become true for the system manager */
+}
+
+double manager_get_progress(Manager *m) {
+        assert(m);
+
+        if (MANAGER_IS_FINISHED(m) || m->n_installed_jobs == 0)
+                return 1.0; /* finished, or no jobs installed (which also avoids dividing by zero below) */
+
+        return 1.0 - ((double) hashmap_size(m->jobs) / (double) m->n_installed_jobs); /* share no longer in m->jobs */
+}
+
+static int compare_job_priority(const void *a, const void *b) {
+        const Job *x = a, *y = b;
+
+        return unit_compare_priority(x->unit, y->unit); /* jobs are ordered by the priority of their units */
+}
+
+int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, Manager **ret) {
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+
+        assert(IN_SET(runtime_scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER));
+        assert(ret);
+
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        *m = (Manager) {
+                .runtime_scope = runtime_scope,
+                .objective = _MANAGER_OBJECTIVE_INVALID,
+
+                .status_unit_format = STATUS_UNIT_FORMAT_DEFAULT,
+
+                .original_log_level = -1,
+                .original_log_target = _LOG_TARGET_INVALID,
+
+                .watchdog_overridden[WATCHDOG_RUNTIME] = USEC_INFINITY,
+                .watchdog_overridden[WATCHDOG_REBOOT] = USEC_INFINITY,
+                .watchdog_overridden[WATCHDOG_KEXEC] = USEC_INFINITY,
+                .watchdog_overridden[WATCHDOG_PRETIMEOUT] = USEC_INFINITY,
+
+                .show_status_overridden = _SHOW_STATUS_INVALID,
+
+                .notify_fd = -EBADF,
+                .cgroups_agent_fd = -EBADF,
+                .signal_fd = -EBADF,
+                .user_lookup_fds = EBADF_PAIR,
+                .private_listen_fd = -EBADF,
+                .dev_autofs_fd = -EBADF,
+                .cgroup_inotify_fd = -EBADF,
+                .pin_cgroupfs_fd = -EBADF,
+                .ask_password_inotify_fd = -EBADF,
+                .idle_pipe = { -EBADF, -EBADF, -EBADF, -EBADF},
+
+                 /* start as id #1, so that we can leave #0 around as "null-like" value */
+                .current_job_id = 1,
+
+                .have_ask_password = -EINVAL, /* we don't know */
+                .first_boot = -1,
+                .test_run_flags = test_run_flags,
+
+                .dump_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_MINUTE, .burst = 10 },
+
+                .executor_fd = -EBADF,
+        };
+
+        unit_defaults_init(&m->defaults, runtime_scope);
+
+#if ENABLE_EFI
+        if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0)
+                boot_timestamps(m->timestamps + MANAGER_TIMESTAMP_USERSPACE,
+                                m->timestamps + MANAGER_TIMESTAMP_FIRMWARE,
+                                m->timestamps + MANAGER_TIMESTAMP_LOADER);
+#endif
+
+        /* Prepare log fields we can use for structured logging */
+        if (MANAGER_IS_SYSTEM(m)) {
+                m->unit_log_field = "UNIT=";
+                m->unit_log_format_string = "UNIT=%s";
+
+                m->invocation_log_field = "INVOCATION_ID=";
+                m->invocation_log_format_string = "INVOCATION_ID=%s";
+        } else {
+                m->unit_log_field = "USER_UNIT=";
+                m->unit_log_format_string = "USER_UNIT=%s";
+
+                m->invocation_log_field = "USER_INVOCATION_ID=";
+                m->invocation_log_format_string = "USER_INVOCATION_ID=%s";
+        }
+
+        /* Reboot immediately if the user hits C-A-D more often than 7x per 2s */
+        m->ctrl_alt_del_ratelimit = (const RateLimit) { .interval = 2 * USEC_PER_SEC, .burst = 7 };
+
+        r = manager_default_environment(m);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_allocated(&m->units, &string_hash_ops);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_allocated(&m->cgroup_unit, &path_hash_ops);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_allocated(&m->watch_bus, &string_hash_ops);
+        if (r < 0)
+                return r;
+
+        r = prioq_ensure_allocated(&m->run_queue, compare_job_priority);
+        if (r < 0)
+                return r;
+
+        r = manager_setup_prefix(m);
+        if (r < 0)
+                return r;
+
+        r = manager_find_credentials_dirs(m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_default(&m->event);
+        if (r < 0)
+                return r;
+
+        r = manager_setup_run_queue(m);
+        if (r < 0)
+                return r;
+
+        if (FLAGS_SET(test_run_flags, MANAGER_TEST_RUN_MINIMAL)) {
+                m->cgroup_root = strdup("");
+                if (!m->cgroup_root)
+                        return -ENOMEM;
+        } else {
+                r = manager_setup_signals(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_setup_cgroup(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_setup_time_change(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_read_timezone_stat(m);
+                if (r < 0)
+                        return r;
+
+                (void) manager_setup_timezone_change(m);
+
+                r = manager_setup_sigchld_event_source(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_setup_memory_pressure_event_source(m);
+                if (r < 0)
+                        return r;
+
+#if HAVE_LIBBPF
+                if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported(/* initialize = */ true)) {
+                        r = lsm_bpf_setup(m);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to setup LSM BPF, ignoring: %m");
+                }
+#endif
+        }
+
+        if (test_run_flags == 0) {
+                if (MANAGER_IS_SYSTEM(m))
+                        r = mkdir_label("/run/systemd/units", 0755);
+                else {
+                        _cleanup_free_ char *units_path = NULL;
+                        r = xdg_user_runtime_dir(&units_path, "/systemd/units");
+                        if (r < 0)
+                                return r;
+                        r = mkdir_p_label(units_path, 0755);
+                }
+
+                if (r < 0 && r != -EEXIST)
+                        return r;
+
+                m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH);
+                if (m->executor_fd < 0)
+                        return log_emergency_errno(errno,
+                                                   "Failed to open executor binary '%s': %m",
+                                                   SYSTEMD_EXECUTOR_BINARY_PATH);
+        } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) {
+                _cleanup_free_ char *self_exe = NULL, *executor_path = NULL;
+                _cleanup_close_ int self_dir_fd = -EBADF;
+                int level = LOG_DEBUG;
+
+                /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the
+                * build directory. Fallback to working directory and then the installation path. */
+                r = readlink_and_make_absolute("/proc/self/exe", &self_exe);
+                if (r < 0)
+                        return r;
+
+                self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_PATH|O_DIRECTORY, 0);
+                if (self_dir_fd < 0)
+                        return self_dir_fd;
+
+                m->executor_fd = RET_NERRNO(openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH));
+                if (m->executor_fd == -ENOENT)
+                        m->executor_fd = RET_NERRNO(openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH));
+                if (m->executor_fd == -ENOENT) {
+                        m->executor_fd = RET_NERRNO(open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH));
+                        level = LOG_WARNING; /* Tests should normally use local builds */
+                }
+                if (m->executor_fd < 0)
+                        return m->executor_fd;
+
+                r = fd_get_path(m->executor_fd, &executor_path);
+                if (r < 0)
+                        return r;
+
+                log_full(level, "Using systemd-executor binary from '%s'.", executor_path);
+        }
+
+        /* Note that we do not set up the notify fd here. We do that after deserialization,
+         * since they might have gotten serialized across the reexec. */
+
+        *ret = TAKE_PTR(m);
+
+        return 0;
+}
+
+static int manager_setup_notify(Manager *m) {
+        int r;
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        /* Allocate and bind a socket only if m->notify_fd is not valid yet; the event source comes after */
+        if (m->notify_fd < 0) {
+                _cleanup_close_ int sock = -EBADF;
+                union sockaddr_union addr;
+                socklen_t addr_len;
+
+                /* Whatever still hangs off an earlier socket is stale by now, drop it first */
+                m->notify_socket = mfree(m->notify_socket);
+                m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
+
+                sock = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+                if (sock < 0)
+                        return log_error_errno(errno, "Failed to allocate notification socket: %m");
+                (void) fd_increase_rxbuf(sock, NOTIFY_RCVBUF_SIZE); /* result deliberately ignored */
+
+                m->notify_socket = path_join(m->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/notify");
+                if (!m->notify_socket)
+                        return log_oom();
+
+                r = sockaddr_un_set_path(&addr.un, m->notify_socket);
+                if (r < 0)
+                        return log_error_errno(r, "Notify socket '%s' not valid for AF_UNIX socket address, refusing.",
+                                               m->notify_socket);
+                addr_len = r; /* on success the return value doubles as the address length handed to bind */
+
+                (void) mkdir_parents_label(m->notify_socket, 0755);
+                (void) sockaddr_un_unlink(&addr.un);
+
+                r = mac_selinux_bind(sock, &addr.sa, addr_len);
+                if (r < 0)
+                        return log_error_errno(r, "bind(%s) failed: %m", m->notify_socket);
+
+                r = setsockopt_int(sock, SOL_SOCKET, SO_PASSCRED, true);
+                if (r < 0)
+                        return log_error_errno(r, "SO_PASSCRED failed: %m");
+
+                m->notify_fd = TAKE_FD(sock);
+
+                log_debug("Using notification socket %s", m->notify_socket);
+        }
+
+        if (!m->notify_event_source) {
+                r = sd_event_add_io(m->event, &m->notify_event_source, m->notify_fd, EPOLLIN, manager_dispatch_notify_fd, m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate notify event source: %m");
+
+                /* Notifications are dispatched slightly ahead of SIGCHLD, so that an exit message can still
+                 * be attributed to the service it belongs to. */
+                r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set priority of notify event source: %m");
+
+                (void) sd_event_source_set_description(m->notify_event_source, "manager-notify");
+        }
+
+        return 0;
+}
+
+static int manager_setup_cgroups_agent(Manager *m) {
+
+        static const union sockaddr_union agent_addr = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/cgroups-agent",
+        };
+        int r;
+
+        /* Sets up the socket we receive cgroups agent messages on. D-Bus is deliberately not used for
+         * delivering these messages from the cgroups agent binary to PID 1: the agent binary is very
+         * short-lived, and each instance of it would need a new D-Bus connection. D-Bus connections are
+         * SOCK_STREAM/AF_UNIX, hence on overloaded systems the backlog of the D-Bus socket becomes relevant:
+         * no more than the configured number of connections may be queued before the kernel starts dropping
+         * further incoming ones, possibly losing cgroups agent messages. A private SOCK_DGRAM/AF_UNIX socket
+         * avoids this, as no backlog is involved when communication takes place without an actual connect()
+         * cycle.
+         *
+         * Note that PID 1 forwards the agent messages to the system bus, so that user systemd instances may
+         * listen for them there. Hence the system instance listens on this special socket, while the user
+         * instances listen on the system bus for these messages. */
+
+        /* Nothing to do for test runs and for anything but the system instance */
+        if (MANAGER_IS_TEST_RUN(m) || !MANAGER_IS_SYSTEM(m))
+                return 0;
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether unified cgroups hierarchy is used: %m");
+        if (r > 0) /* We don't need this anymore on the unified hierarchy */
+                return 0;
+
+        /* Allocate and bind a socket only if m->cgroups_agent_fd is not valid yet */
+        if (m->cgroups_agent_fd < 0) {
+                _cleanup_close_ int sock = -EBADF;
+
+                /* An event source left over from an earlier socket is stale, drop it first */
+                m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
+
+                sock = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+                if (sock < 0)
+                        return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m");
+
+                (void) fd_increase_rxbuf(sock, CGROUPS_AGENT_RCVBUF_SIZE);
+
+                (void) sockaddr_un_unlink(&agent_addr.un);
+
+                /* Bind under a restrictive umask, so that only root may connect to this socket */
+                WITH_UMASK(0077)
+                        r = bind(sock, &agent_addr.sa, SOCKADDR_UN_LEN(agent_addr.un));
+                if (r < 0)
+                        return log_error_errno(errno, "bind(%s) failed: %m", agent_addr.un.sun_path);
+
+                m->cgroups_agent_fd = TAKE_FD(sock);
+        }
+
+        if (!m->cgroups_agent_event_source) {
+                r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate cgroups agent event source: %m");
+
+                /* Dispatch cgroups agent notifications early. Receiving one merely enqueues the unit in
+                 * the cgroup empty queue, hence this source gets a higher priority than that queue. Also
+                 * see the handling of cgroup inotify, which is the counterpart on the unified cgroup
+                 * hierarchy. */
+                r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
+
+                (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent");
+        }
+
+        return 0;
+}
+
+static int manager_setup_user_lookup_fd(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID
+         * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation,
+         * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked
+         * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects
+         * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes,
+         * hence we establish this communication channel so that forked off processes can pass their UID/GID
+         * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple
+         * datagram, along with their unit name, so that we can share one communication socket pair among all units for
+         * this purpose.
+         *
+         * You might wonder why we need a communication channel for this that is independent of the usual notification
+         * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET
+         * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user
+         * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available.
+         *
+         * Note that this function is called under two circumstances: when we first initialize (in which case we
+         * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload
+         * (in which case the socket pair already exists but we still need to allocate the event source for it). */
+
+        if (m->user_lookup_fds[0] < 0) {
+                /* No valid receiving end, hence allocate a new socket pair. Before that, free all secondary
+                 * fields: the other end of the pair and any event source attached to the old one. */
+                safe_close_pair(m->user_lookup_fds);
+                m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+
+                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0)
+                        return log_error_errno(errno, "Failed to allocate user lookup socket: %m");
+
+                (void) fd_increase_rxbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE);
+        }
+
+        if (!m->user_lookup_event_source) {
+                r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m);
+                if (r < 0) /* sd-event hands the error back as return value, errno is not meaningful here */
+                        return log_error_errno(r, "Failed to allocate user lookup event source: %m");
+
+                /* Process even earlier than the notify event source, so that we always know first about valid UID/GID
+                 * resolutions */
+                r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set priority of user lookup event source: %m");
+
+                (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup");
+        }
+
+        return 0;
+}
+
+static unsigned manager_dispatch_cleanup_queue(Manager *m) {
+        unsigned n_freed = 0;
+
+        assert(m);
+
+        /* Free whatever unit sits at the head of the cleanup queue until the head reads NULL. The head is
+         * re-read every round, i.e. unit_free() is presumably what takes the unit off the queue. */
+        for (Unit *head; (head = m->cleanup_queue); n_freed++) {
+                assert(head->in_cleanup_queue);
+
+                unit_free(head);
+        }
+
+        return n_freed;
+}
+
+static unsigned manager_dispatch_release_resources_queue(Manager *m) {
+        unsigned n_done = 0;
+
+        assert(m);
+
+        /* Pop units off the release-resources queue one by one; returns how many were popped */
+        for (Unit *unit; (unit = LIST_POP(release_resources_queue, m->release_resources_queue)); ) {
+                assert(unit->in_release_resources_queue);
+                unit->in_release_resources_queue = false;
+                n_done++;
+
+                /* The queue flag is already cleared when the unit gets to release its resources */
+                unit_release_resources(unit);
+        }
+
+        return n_done;
+}
+
+enum { /* Per-unit GC states; stored in a unit's gc_marker as offset on top of the current GC run's marker */
+        GC_OFFSET_IN_PATH,  /* This one is on the path unit_gc_sweep() is currently traveling */
+        GC_OFFSET_UNSURE,   /* No verdict reached (yet) in this run */
+        GC_OFFSET_GOOD,     /* We still need this unit */
+        GC_OFFSET_BAD,      /* We don't need this unit anymore */
+        _GC_OFFSET_MAX      /* Number of offsets; the manager's marker advances by this much per GC run */
+};
+
+static void unit_gc_mark_good(Unit *u, unsigned gc_marker) {
+        const unsigned unsure = gc_marker + GC_OFFSET_UNSURE;
+        Unit *ref;
+
+        /* This unit is needed in the current GC run; so is (recursively) all it references that was left UNSURE */
+        u->gc_marker = gc_marker + GC_OFFSET_GOOD;
+        UNIT_FOREACH_DEPENDENCY(ref, u, UNIT_ATOM_REFERENCES)
+                if (ref->gc_marker == unsure)
+                        unit_gc_mark_good(ref, gc_marker);
+}
+
+static void unit_gc_sweep(Unit *u, unsigned gc_marker) {
+        Unit *other;
+        bool is_bad;
+
+        assert(u);
+        /* The unit already carries a marker of this GC run (a verdict, or it is on the current path): done */
+        if (IN_SET(u->gc_marker - gc_marker,
+                   GC_OFFSET_GOOD, GC_OFFSET_BAD, GC_OFFSET_UNSURE, GC_OFFSET_IN_PATH))
+                return;
+
+        if (u->in_cleanup_queue)
+                goto bad;
+        /* A unit that may not be collected right now counts as good */
+        if (!unit_may_gc(u))
+                goto good;
+        /* Mark the unit as on the path, so that recursing back into it returns right away (see check above) */
+        u->gc_marker = gc_marker + GC_OFFSET_IN_PATH;
+
+        is_bad = true;
+        /* One good referrer makes this unit good; it is bad only if every referrer turned out bad */
+        UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCED_BY) {
+                unit_gc_sweep(other, gc_marker);
+
+                if (other->gc_marker == gc_marker + GC_OFFSET_GOOD)
+                        goto good;
+
+                if (other->gc_marker != gc_marker + GC_OFFSET_BAD)
+                        is_bad = false;
+        }
+        /* Same evaluation for the sources of the references listed in refs_by_target */
+        LIST_FOREACH(refs_by_target, ref, u->refs_by_target) {
+                unit_gc_sweep(ref->source, gc_marker);
+
+                if (ref->source->gc_marker == gc_marker + GC_OFFSET_GOOD)
+                        goto good;
+
+                if (ref->source->gc_marker != gc_marker + GC_OFFSET_BAD)
+                        is_bad = false;
+        }
+
+        if (is_bad)
+                goto bad;
+
+        /* We were unable to find anything out about this entry, so
+         * let's investigate it later */
+        u->gc_marker = gc_marker + GC_OFFSET_UNSURE;
+        unit_add_to_gc_queue(u);
+        return;
+
+bad:
+        /* We definitely know that this one is not useful anymore, so
+         * let's mark it for deletion */
+        u->gc_marker = gc_marker + GC_OFFSET_BAD;
+        unit_add_to_cleanup_queue(u);
+        return;
+
+good:
+        unit_gc_mark_good(u, gc_marker);
+}
+
+static unsigned manager_dispatch_gc_unit_queue(Manager *m) {
+        unsigned n_swept = 0, marker;
+
+        assert(m);
+
+        /* Every GC run gets a marker base of its own, _GC_OFFSET_MAX apart from the previous one. If
+         * adding the offsets on top of it would wrap around, start over from 1. */
+        m->gc_marker += _GC_OFFSET_MAX;
+        if (m->gc_marker + _GC_OFFSET_MAX <= _GC_OFFSET_MAX)
+                m->gc_marker = 1;
+
+        marker = m->gc_marker;
+
+        for (Unit *unit; (unit = LIST_POP(gc_queue, m->gc_unit_queue)); n_swept++) {
+                unsigned verdict;
+
+                assert(unit->in_gc_queue);
+
+                /* The queue flag is only dropped once the sweep of this unit is through */
+                unit_gc_sweep(unit, marker);
+                unit->in_gc_queue = false;
+
+                verdict = unit->gc_marker - marker;
+                if (verdict != GC_OFFSET_BAD && verdict != GC_OFFSET_UNSURE)
+                        continue;
+
+                if (unit->id)
+                        log_unit_debug(unit, "Collecting.");
+                unit->gc_marker = marker + GC_OFFSET_BAD;
+                unit_add_to_cleanup_queue(unit);
+        }
+
+        return n_swept;
+}
+
+static unsigned manager_dispatch_gc_job_queue(Manager *m) {
+        unsigned n_checked = 0;
+
+        assert(m);
+
+        /* Drains the job GC queue. Every popped job is counted, but only the ones job_may_gc() approves
+         * of are finished with result JOB_COLLECTED. */
+        for (Job *job; (job = LIST_POP(gc_queue, m->gc_job_queue)); n_checked++) {
+                assert(job->in_gc_queue);
+                job->in_gc_queue = false;
+
+                /* Jobs that may not be collected are merely taken off the queue */
+                if (job_may_gc(job)) {
+                        log_unit_debug(job->unit, "Collecting job.");
+                        (void) job_finish_and_invalidate(job, JOB_COLLECTED, false, false);
+                }
+        }
+
+        /* Number of jobs taken off the queue, collected or not */
+        return n_checked;
+}
+
+static int manager_ratelimit_requeue(sd_event_source *s, uint64_t usec, void *userdata) {
+        Unit *unit = userdata;
+
+        assert(unit);
+        assert(unit->auto_start_stop_event_source == s);
+
+        /* Timer handler installed by manager_ratelimit_check_and_queue(). Releases the timer source, then
+         * hands the unit to all three auto start/stop queues again, as it may have been throttled on any. */
+        unit->auto_start_stop_event_source = sd_event_source_unref(unit->auto_start_stop_event_source);
+        unit_submit_to_stop_when_unneeded_queue(unit);
+        unit_submit_to_start_when_upheld_queue(unit);
+        unit_submit_to_stop_when_bound_queue(unit);
+
+        return 0;
+}
+
+static int manager_ratelimit_check_and_queue(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Returns 1 if the unit's auto start/stop rate limit still has room. Otherwise returns 0 and makes
+         * sure a timer is pending for the time ratelimit_end() reports, which then re-submits the unit
+         * via manager_ratelimit_requeue(). If that timer cannot be added, the failure is logged and the
+         * value of log_unit_error_errno() is returned (callers treat it as < 0). */
+        if (ratelimit_below(&u->auto_start_stop_ratelimit))
+                return 1;
+
+        if (!u->auto_start_stop_event_source) {
+                /* No timer pending yet, hence add one */
+                r = sd_event_add_time(
+                                u->manager->event, &u->auto_start_stop_event_source,
+                                CLOCK_MONOTONIC,
+                                ratelimit_end(&u->auto_start_stop_ratelimit), /* accuracy= */ 0,
+                                manager_ratelimit_requeue, /* userdata= */ u);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to queue timer on event loop: %m");
+        }
+
+        return 0;
+}
+
+static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) {
+        unsigned n_processed = 0;
+
+        assert(m);
+
+        /* Pops every unit off the stop-when-unneeded queue and enqueues a stop job for those that
+         * unit_is_unneeded() confirms, subject to the per-unit auto start/stop rate limit. Returns the
+         * number of units popped, whether acted upon or not. */
+        for (Unit *unit; (unit = LIST_POP(stop_when_unneeded_queue, m->stop_when_unneeded_queue)); n_processed++) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                int r;
+
+                assert(unit->in_stop_when_unneeded_queue);
+                unit->in_stop_when_unneeded_queue = false;
+
+                if (!unit_is_unneeded(unit))
+                        continue;
+
+                log_unit_debug(unit, "Unit is not needed anymore.");
+
+                /* If stopping the unit keeps failing we could end up in a stop loop here, hence stop
+                 * acting on the unit being unneeded once the rate limit is exhausted. */
+                r = manager_ratelimit_check_and_queue(unit);
+                if (r <= 0) {
+                        log_unit_warning(unit,
+                                         "Unit not needed anymore, but not stopping since we tried this too often recently.%s",
+                                         r == 0 ? " Will retry later." : "");
+                        continue;
+                }
+
+                /* Nothing needs this unit anymore, hence enqueue a stop job for it. Failure to do so is
+                 * logged and otherwise ignored. */
+                r = manager_add_job(unit->manager, JOB_STOP, unit, JOB_FAIL, NULL, &error, NULL);
+                if (r < 0)
+                        log_unit_warning_errno(unit, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r));
+        }
+
+        return n_processed;
+}
+
+static unsigned manager_dispatch_start_when_upheld_queue(Manager *m) {
+        unsigned n_processed = 0;
+
+        assert(m);
+
+        /* Pops every unit off the start-when-upheld queue and enqueues a start job for those that
+         * unit_is_upheld_by_active() confirms, subject to the per-unit auto start/stop rate limit.
+         * Returns the number of units popped, whether acted upon or not. */
+        for (Unit *unit; (unit = LIST_POP(start_when_upheld_queue, m->start_when_upheld_queue)); n_processed++) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                Unit *upholder = NULL;
+                int r;
+
+                assert(unit->in_start_when_upheld_queue);
+                unit->in_start_when_upheld_queue = false;
+
+                if (!unit_is_upheld_by_active(unit, &upholder))
+                        continue;
+
+                log_unit_debug(unit, "Unit is started because upheld by active unit %s.", upholder->id);
+
+                /* If starting the unit keeps failing we could end up in a start loop here, hence stop
+                 * acting on the unit being upheld once the rate limit is exhausted. */
+                r = manager_ratelimit_check_and_queue(unit);
+                if (r <= 0) {
+                        log_unit_warning(unit,
+                                         "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.%s",
+                                         upholder->id,
+                                         r == 0 ? " Will retry later." : "");
+                        continue;
+                }
+
+                /* Failure to enqueue the start job is logged and otherwise ignored */
+                r = manager_add_job(unit->manager, JOB_START, unit, JOB_FAIL, NULL, &error, NULL);
+                if (r < 0)
+                        log_unit_warning_errno(unit, r, "Failed to enqueue start job, ignoring: %s", bus_error_message(&error, r));
+        }
+
+        return n_processed;
+}
+
+static unsigned manager_dispatch_stop_when_bound_queue(Manager *m) {
+        unsigned n = 0;
+        Unit *u;
+        int r;
+
+        assert(m);
+        /* Pops every unit off the stop-when-bound queue; returns how many were popped, acted upon or not */
+        while ((u = LIST_POP(stop_when_bound_queue, m->stop_when_bound_queue))) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                Unit *culprit = NULL;
+
+                assert(u->in_stop_when_bound_queue);
+                u->in_stop_when_bound_queue = false;
+
+                n++;
+
+                if (!unit_is_bound_by_inactive(u, &culprit))
+                        continue;
+
+                log_unit_debug(u, "Unit is stopped because bound to inactive unit %s.", culprit->id);
+
+                /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
+                 * unit being bound to an inactive unit after a while. */
+
+                r = manager_ratelimit_check_and_queue(u);
+                if (r <= 0) {
+                        log_unit_warning(u,
+                                         "Unit needs to be stopped because it is bound to inactive unit %s, but not stopping since we tried this too often recently.%s",
+                                         culprit->id,
+                                         r == 0 ? " Will retry later." : "");
+                        continue;
+                }
+                /* Failure to enqueue the stop job is logged and otherwise ignored */
+                r = manager_add_job(u->manager, JOB_STOP, u, JOB_REPLACE, NULL, &error, NULL);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r));
+        }
+
+        return n;
+}
+
+static void manager_clear_jobs_and_units(Manager *m) {
+        Unit *u;
+
+        assert(m);
+        /* Free units until the units hashmap has no first entry left */
+        while ((u = hashmap_first(m->units)))
+                unit_free(u);
+
+        manager_dispatch_cleanup_queue(m);
+        /* By now every queue is expected to be empty ... */
+        assert(!m->load_queue);
+        assert(prioq_isempty(m->run_queue));
+        assert(!m->dbus_unit_queue);
+        assert(!m->dbus_job_queue);
+        assert(!m->cleanup_queue);
+        assert(!m->gc_unit_queue);
+        assert(!m->gc_job_queue);
+        assert(!m->cgroup_realize_queue);
+        assert(!m->cgroup_empty_queue);
+        assert(!m->cgroup_oom_queue);
+        assert(!m->target_deps_queue);
+        assert(!m->stop_when_unneeded_queue);
+        assert(!m->start_when_upheld_queue);
+        assert(!m->stop_when_bound_queue);
+        assert(!m->release_resources_queue);
+        /* ... and neither jobs nor units may be registered anymore */
+        assert(hashmap_isempty(m->jobs));
+        assert(hashmap_isempty(m->units));
+        /* Nothing is left, hence start over with the counters */
+        m->n_on_console = 0;
+        m->n_running_jobs = 0;
+        m->n_installed_jobs = 0;
+        m->n_failed_jobs = 0;
+}
+
+Manager* manager_free(Manager *m) {
+        if (!m)
+                return NULL;
+        /* Tears the manager down and frees it. NULL is accepted; returns the result of mfree(), presumably NULL */
+        manager_clear_jobs_and_units(m);
+        /* With all units gone, give each unit type that implements a shutdown hook the chance to run it */
+        for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++)
+                if (unit_vtable[c]->shutdown)
+                        unit_vtable[c]->shutdown(m);
+
+        /* Keep the cgroup hierarchy in place except when we know we are going down for good */
+        manager_shutdown_cgroup(m, /* delete= */ IN_SET(m->objective, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC));
+
+        lookup_paths_flush_generator(&m->lookup_paths);
+
+        bus_done(m);
+        manager_varlink_done(m);
+
+        exec_shared_runtime_vacuum(m);
+        hashmap_free(m->exec_shared_runtime_by_id);
+
+        dynamic_user_vacuum(m, false);
+        hashmap_free(m->dynamic_users);
+
+        hashmap_free(m->units);
+        hashmap_free(m->units_by_invocation_id);
+        hashmap_free(m->jobs);
+        hashmap_free(m->watch_pids);
+        hashmap_free(m->watch_pids_more);
+        hashmap_free(m->watch_bus);
+
+        prioq_free(m->run_queue);
+
+        set_free(m->startup_units);
+        set_free(m->failed_units);
+        /* Drop our references to the event sources; the event loop itself is released further down */
+        sd_event_source_unref(m->signal_event_source);
+        sd_event_source_unref(m->sigchld_event_source);
+        sd_event_source_unref(m->notify_event_source);
+        sd_event_source_unref(m->cgroups_agent_event_source);
+        sd_event_source_unref(m->time_change_event_source);
+        sd_event_source_unref(m->timezone_change_event_source);
+        sd_event_source_unref(m->jobs_in_progress_event_source);
+        sd_event_source_unref(m->run_queue_event_source);
+        sd_event_source_unref(m->user_lookup_event_source);
+        sd_event_source_unref(m->memory_pressure_event_source);
+        /* Close the file descriptors the event sources above were watching */
+        safe_close(m->signal_fd);
+        safe_close(m->notify_fd);
+        safe_close(m->cgroups_agent_fd);
+        safe_close_pair(m->user_lookup_fds);
+
+        manager_close_ask_password(m);
+
+        manager_close_idle_pipe(m);
+
+        sd_event_unref(m->event);
+
+        free(m->notify_socket);
+
+        lookup_paths_free(&m->lookup_paths);
+        strv_free(m->transient_environment);
+        strv_free(m->client_environment);
+
+        hashmap_free(m->cgroup_unit);
+        manager_free_unit_name_maps(m);
+
+        free(m->switch_root);
+        free(m->switch_root_init);
+
+        unit_defaults_done(&m->defaults);
+        /* All units were freed at the top, hence nothing is expected to be registered here anymore */
+        assert(hashmap_isempty(m->units_requiring_mounts_for));
+        hashmap_free(m->units_requiring_mounts_for);
+
+        hashmap_free(m->uid_refs);
+        hashmap_free(m->gid_refs);
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+                m->prefix[dt] = mfree(m->prefix[dt]);
+        free(m->received_credentials_directory);
+        free(m->received_encrypted_credentials_directory);
+
+        free(m->watchdog_pretimeout_governor);
+        free(m->watchdog_pretimeout_governor_overridden);
+
+        m->fw_ctx = fw_ctx_free(m->fw_ctx);
+
+#if BPF_FRAMEWORK
+        lsm_bpf_destroy(m->restrict_fs);
+#endif
+
+        safe_close(m->executor_fd);
+
+        return mfree(m);
+}
+
+static void manager_enumerate_perpetual(Manager *m) {
+        assert(m);
+
+        /* Minimal test runs skip this step altogether */
+        if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+                return;
+
+        /* Walk all unit types; unsupported ones are only mentioned in the debug log */
+        for (UnitType t = 0; t < _UNIT_TYPE_MAX; t++) {
+                if (unit_type_supported(t)) {
+                        /* Invoke the type's enumerate_perpetual() hook, if it implements one */
+                        if (unit_vtable[t]->enumerate_perpetual)
+                                unit_vtable[t]->enumerate_perpetual(m);
+                } else
+                        log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(t));
+        }
+}
+
+static void manager_enumerate(Manager *m) {
+        assert(m);
+
+        /* Minimal test runs skip this step altogether */
+        if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+                return;
+
+        for (UnitType t = 0; t < _UNIT_TYPE_MAX; t++) {
+                if (unit_type_supported(t)) {
+                        /* Let the type load all units from disk/kernel that it might know */
+                        if (unit_vtable[t]->enumerate)
+                                unit_vtable[t]->enumerate(m);
+                } else
+                        log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(t));
+        }
+
+        /* Once all types had their turn, process the load queue */
+        manager_dispatch_load_queue(m);
+}
+
+static void manager_coldplug(Manager *m) {
+        char *name;
+        Unit *unit;
+
+        assert(m);
+
+        log_debug("Invoking unit coldplug() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+        /* Bring every unit back into the state that was deserialized for it */
+        HASHMAP_FOREACH_KEY(unit, name, m->units) {
+                int r;
+
+                /* Entries whose key is not the unit's own id are aliases, skip those */
+                if (unit->id != name)
+                        continue;
+
+                r = unit_coldplug(unit);
+                if (r < 0)
+                        log_warning_errno(r, "We couldn't coldplug %s, proceeding anyway: %m", unit->id);
+        }
+}
+
+static void manager_catchup(Manager *m) {
+        char *name;
+        Unit *unit;
+
+        assert(m);
+
+        log_debug("Invoking unit catchup() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+        /* Give every unit the chance to catch up with state changes that took place while we were
+         * reloading/reexecing */
+        HASHMAP_FOREACH_KEY(unit, name, m->units) {
+
+                /* Entries keyed by anything but the unit's own id are aliases: the unit is only
+                 * handled via the entry keyed by its id */
+                if (unit->id == name)
+                        unit_catchup(unit);
+        }
+}
+
+static void manager_distribute_fds(Manager *m, FDSet *fds) {
+        Unit *unit;
+
+        assert(m);
+
+        /* Offer the fds in the set to each unit whose type implements distribute_fds() */
+        HASHMAP_FOREACH(unit, m->units) {
+
+                /* Nothing left to hand out? Then we are done. */
+                if (fdset_size(fds) <= 0)
+                        return;
+
+                if (UNIT_VTABLE(unit)->distribute_fds)
+                        UNIT_VTABLE(unit)->distribute_fds(unit, fds);
+        }
+}
+
+static bool manager_dbus_is_running(Manager *m, bool deserialized) {
+        Unit *socket_unit, *service_unit;
+
+        assert(m);
+
+        /* Tells whether the dbus instance we are supposed to expose our APIs on is up, judging by both its
+         * socket unit and its service unit. With 'deserialized' set, the deserialized state of the two units
+         * is looked at instead of their current state. */
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return false;
+
+        socket_unit = manager_get_unit(m, SPECIAL_DBUS_SOCKET);
+        if (!socket_unit)
+                return false;
+        if ((deserialized ? SOCKET(socket_unit)->deserialized_state : SOCKET(socket_unit)->state) != SOCKET_RUNNING)
+                return false;
+
+        service_unit = manager_get_unit(m, SPECIAL_DBUS_SERVICE);
+        if (!service_unit)
+                return false;
+
+        /* The service counts as up while it is in the running state or in one of the reload states
+         * listed below */
+        return IN_SET((deserialized ? SERVICE(service_unit)->deserialized_state : SERVICE(service_unit)->state),
+                      SERVICE_RUNNING,
+                      SERVICE_RELOAD,
+                      SERVICE_RELOAD_NOTIFY,
+                      SERVICE_RELOAD_SIGNAL);
+}
+
+static void manager_setup_bus(Manager *m) {
+        assert(m);
+
+        /* The private bus connection is always set up, no matter what */
+        (void) bus_init_private(m);
+
+        /* In --user mode the system bus is connected to right away as well */
+        if (MANAGER_IS_USER(m))
+                (void) bus_init_system(m);
+
+        /* Everything below applies only if the dbus unit is supposed to be up */
+        if (!manager_dbus_is_running(m, MANAGER_IS_RELOADING(m)))
+                return;
+
+        (void) bus_init_api(m);
+        if (MANAGER_IS_SYSTEM(m))
+                (void) bus_init_system(m);
+}
+
+static void manager_preset_all(Manager *m) {
+        UnitFilePresetMode mode;
+        int r;
+
+        assert(m);
+
+        /* Presets are applied only on first boot, by the system manager, and never in test runs */
+        if (m->first_boot <= 0 || !MANAGER_IS_SYSTEM(m) || MANAGER_IS_TEST_RUN(m))
+                return;
+
+        /* ENABLE_FIRST_BOOT_FULL_PRESET selects between a full preset and an enable-only preset */
+        if (ENABLE_FIRST_BOOT_FULL_PRESET)
+                mode = UNIT_FILE_PRESET_FULL;
+        else
+                mode = UNIT_FILE_PRESET_ENABLE_ONLY;
+
+        r = unit_file_preset_all(RUNTIME_SCOPE_SYSTEM, 0, NULL, mode, NULL, 0);
+        if (r >= 0) {
+                log_info("Populated /etc with preset unit settings.");
+                return;
+        }
+
+        log_full_errno(r == -EEXIST ? LOG_NOTICE : LOG_WARNING, r,
+                       "Failed to populate /etc with preset unit settings, ignoring: %m");
+}
+
+static void manager_ready(Manager *m) {
+        assert(m);
+
+        /* Invoked after everything was loaded: announce that we are up, then do a final round of catching up */
+        m->objective = MANAGER_OK;
+
+        /* From here on logging to the journal and connecting to dbus might be safe, hence recheck both */
+        manager_recheck_journal(m);
+        manager_recheck_dbus(m);
+
+        /* Pick up whatever changed while we were reloading/reexecing */
+        manager_catchup(m);
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return;
+        /* System manager only: leave a file behind that indicates when the manager last started loading units */
+        (void) touch_file("/run/systemd/systemd-units-load", false,
+                m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].realtime ?: now(CLOCK_REALTIME),
+                UID_INVALID, GID_INVALID, 0444);
+}
+
+Manager* manager_reloading_start(Manager *m) {
+        m->n_reloading += 1; /* undone again by manager_reloading_stopp() */
+        dual_timestamp_now(&m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD]); /* remember when this (re)load began */
+        return m;
+}
+
+void manager_reloading_stopp(Manager **m) {
+        if (!*m) /* nothing to undo if no manager was stored in the pointer */
+                return;
+        assert((*m)->n_reloading > 0);
+        (*m)->n_reloading -= 1;
+}
+
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root) {
+        int r;
+
+        assert(m);
+        /* Brings the manager up, deserializing state if 'serialization' is set. Returns 0 on success, negative on failure. */
+        /* If we are running in test mode, we still want to run the generators,
+         * but we should not touch the real generator directories. */
+        r = lookup_paths_init_or_warn(&m->lookup_paths, m->runtime_scope,
+                                      MANAGER_IS_TEST_RUN(m) ? LOOKUP_PATHS_TEMPORARY_GENERATED : 0,
+                                      root);
+        if (r < 0)
+                return r;
+
+        dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_START));
+        r = manager_run_environment_generators(m);
+        if (r >= 0)
+                r = manager_run_generators(m);
+        dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_FINISH));
+        if (r < 0)
+                return r;
+
+        manager_preset_all(m);
+
+        lookup_paths_log(&m->lookup_paths);
+
+        {
+                /* This block is (optionally) done with the reloading counter bumped */
+                _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
+
+                /* Make sure we don't have a left-over from a previous run */
+                if (!serialization)
+                        (void) rm_rf(m->lookup_paths.transient, 0);
+
+                /* If we will deserialize make sure that during enumeration this is already known, so we increase the
+                 * counter here already */
+                if (serialization)
+                        reloading = manager_reloading_start(m);
+
+                /* First, enumerate what we can from all config files */
+                dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_START));
+                manager_enumerate_perpetual(m);
+                manager_enumerate(m);
+                dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_FINISH));
+
+                /* Second, deserialize if there is something to deserialize */
+                if (serialization) {
+                        r = manager_deserialize(m, serialization, fds);
+                        if (r < 0)
+                                return log_error_errno(r, "Deserialization failed: %m");
+                }
+
+                /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass
+                 * some file descriptors to us pre-initialized. This enables socket-based activation of entire
+                 * containers. */
+                manager_distribute_fds(m, fds);
+
+                /* We might have deserialized the notify fd, but if we didn't then let's set it up now */
+                r = manager_setup_notify(m);
+                if (r < 0)
+                        /* No sense to continue without notifications, our children would fail anyway. */
+                        return r;
+
+                r = manager_setup_cgroups_agent(m);
+                if (r < 0)
+                        /* Likewise, no sense to continue without empty cgroup notifications. */
+                        return r;
+
+                r = manager_setup_user_lookup_fd(m);
+                if (r < 0)
+                        /* This shouldn't fail, except if things are really broken. */
+                        return r;
+
+                /* Connect to the bus if we are good for it */
+                manager_setup_bus(m);
+
+                /* Now that we are connected to all possible buses, let's deserialize who is tracking us. */
+                r = bus_track_coldplug(m, &m->subscribed, false, m->deserialized_subscribed);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to deserialize tracked clients, ignoring: %m");
+                m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+
+                r = manager_varlink_init(m);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set up Varlink, ignoring: %m");
+
+                /* Third, fire things up! */
+                manager_coldplug(m);
+
+                /* Clean up runtime objects */
+                manager_vacuum(m);
+
+                if (serialization)
+                        /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the
+                         * reload is finished */
+                        m->send_reloading_done = true;
+        }
+
+        manager_ready(m);
+
+        manager_set_switching_root(m, false);
+
+        return 0;
+}
+
+int manager_add_job(
+                Manager *m,
+                JobType type,
+                Unit *unit,
+                JobMode mode,
+                Set *affected_jobs,
+                sd_bus_error *error,
+                Job **ret) {
+        /* Builds a transaction for a job of the given type on the unit and activates it. Returns 0 on success, negative on failure. */
+        _cleanup_(transaction_abort_and_freep) Transaction *tr = NULL;
+        int r;
+
+        assert(m);
+        assert(type < _JOB_TYPE_MAX);
+        assert(unit);
+        assert(mode < _JOB_MODE_MAX);
+        /* Refuse combinations of job mode and job type (or unit settings) that are not valid */
+        if (mode == JOB_ISOLATE && type != JOB_START)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Isolate is only valid for start.");
+
+        if (mode == JOB_ISOLATE && !unit->allow_isolate)
+                return sd_bus_error_set(error, BUS_ERROR_NO_ISOLATION, "Operation refused, unit may not be isolated.");
+
+        if (mode == JOB_TRIGGERING && type != JOB_STOP)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=triggering is only valid for stop.");
+
+        if (mode == JOB_RESTART_DEPENDENCIES && type != JOB_START)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=restart-dependencies is only valid for start.");
+
+        log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode));
+
+        type = job_type_collapse(type, unit);
+
+        tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
+        if (!tr)
+                return -ENOMEM;
+        /* Add the requested job to the transaction; the job mode selects the transaction flags passed along */
+        r = transaction_add_job_and_dependencies(
+                        tr,
+                        type,
+                        unit,
+                        /* by= */ NULL,
+                        TRANSACTION_MATTERS |
+                        (IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS) ? TRANSACTION_IGNORE_REQUIREMENTS : 0) |
+                        (mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0) |
+                        (mode == JOB_RESTART_DEPENDENCIES ? TRANSACTION_PROPAGATE_START_AS_RESTART : 0),
+                        error);
+        if (r < 0)
+                return r;
+        /* The isolate and triggering job modes each add further jobs to the transaction */
+        if (mode == JOB_ISOLATE) {
+                r = transaction_add_isolate_jobs(tr, m);
+                if (r < 0)
+                        return r;
+        }
+
+        if (mode == JOB_TRIGGERING) {
+                r = transaction_add_triggering_jobs(tr, unit);
+                if (r < 0)
+                        return r;
+        }
+
+        r = transaction_activate(tr, m, mode, affected_jobs, error);
+        if (r < 0)
+                return r;
+
+        log_unit_debug(unit,
+                       "Enqueued job %s/%s as %u", unit->id,
+                       job_type_to_string(type), (unsigned) tr->anchor_job->id);
+
+        if (ret) /* 'ret' is optional; if given it receives the transaction's anchor job */
+                *ret = tr->anchor_job;
+        /* Free explicitly and store the result in 'tr' — presumably NULL, leaving the abort-and-free cleanup with nothing to do */
+        tr = transaction_free(tr);
+        return 0;
+}
+
+int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **ret) {
+        Unit *target = NULL;  /* initialized only to keep gcc quiet, it is not really necessary */
+        int r;
+
+        assert(m);
+        assert(type < _JOB_TYPE_MAX);
+        assert(name);
+        assert(mode < _JOB_MODE_MAX);
+
+        /* Resolve the name to a loaded unit first, then enqueue the job for that unit */
+        r = manager_load_unit(m, name, /* path= */ NULL, /* e= */ NULL, &target);
+        if (r < 0)
+                return r;
+        assert(target);
+        return manager_add_job(m, type, target, mode, affected_jobs, e, ret);
+}
+
+int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret) {
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        int r;
+
+        assert(m);
+        assert(type < _JOB_TYPE_MAX);
+        assert(name);
+        assert(mode < _JOB_MODE_MAX);
+
+        /* Same as manager_add_job_by_name(), except that a failure is logged as a warning here */
+        r = manager_add_job_by_name(m, type, name, mode, affected_jobs, &bus_error, ret);
+        if (r >= 0)
+                return r;
+        return log_warning_errno(r, "Failed to enqueue %s job for %s: %s", job_mode_to_string(mode), name, bus_error_message(&bus_error, r));
+}
+
+int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) {
+        _cleanup_(transaction_abort_and_freep) Transaction *txn = NULL;
+        int r;
+
+        assert(m);
+        assert(unit);
+        assert(mode < _JOB_MODE_MAX);
+        assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */
+
+        txn = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
+        if (!txn)
+                return -ENOMEM;
+
+        /* A NOP job on the unit serves as the anchor job the transaction needs */
+        r = transaction_add_job_and_dependencies(txn, JOB_NOP, unit, /* by= */ NULL, TRANSACTION_IGNORE_REQUIREMENTS|TRANSACTION_IGNORE_ORDER, e);
+        if (r < 0)
+                return r;
+
+        /* This always succeeds, since failure to add an individual dependency is ignored. */
+        transaction_add_propagate_reload_jobs(
+                        txn,
+                        unit,
+                        txn->anchor_job,
+                        mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0);
+
+        r = transaction_activate(txn, m, mode, /* affected_jobs= */ NULL, e);
+        if (r < 0)
+                return r;
+
+        txn = transaction_free(txn);
+        return 0;
+}
+
+Job *manager_get_job(Manager *m, uint32_t id) {
+        assert(m);
+        Job *job = hashmap_get(m->jobs, UINT32_TO_PTR(id)); /* m->jobs is looked up by the numeric job id */
+        return job;
+}
+
+Unit *manager_get_unit(Manager *m, const char *name) {
+        assert(m);
+        assert(name);
+        Unit *unit = hashmap_get(m->units, name); /* m->units is looked up by unit name */
+        return unit;
+}
+
+static int manager_dispatch_target_deps_queue(Manager *m) {
+        int r = 0;
+
+        assert(m);
+
+        for (;;) {
+                _cleanup_free_ Unit **deps = NULL;
+                Unit *unit;
+                int n;
+
+                unit = LIST_POP(target_deps_queue, m->target_deps_queue);
+                if (!unit)
+                        return r; /* queue drained: result of the last dependency added, or 0 if there was none */
+
+                assert(unit->in_target_deps_queue);
+                unit->in_target_deps_queue = false;
+
+                /* Adding dependencies below will likely modify the dependencies of the unit, and hash tables
+                 * must not change while being iterated. Hence work on a snapshot array of them instead. */
+                n = unit_get_dependency_array(unit, UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, &deps);
+                if (n < 0)
+                        return n;
+
+                for (int i = 0; i < n; i++) {
+                        r = unit_add_default_target_dependency(unit, deps[i]);
+                        if (r < 0)
+                                return r;
+                }
+        }
+}
+
+unsigned manager_dispatch_load_queue(Manager *m) {
+        unsigned n_loaded = 0;
+
+        assert(m);
+
+        /* Works through the load queue: as long as there is a unit at its head, try to load the data
+         * of that unit. Returns the number of load attempts made, or 0 if invoked while a dispatch is
+         * already in progress. */
+
+        /* Refuse to run recursively */
+        if (m->dispatching_load_queue)
+                return 0;
+
+        m->dispatching_load_queue = true;
+
+        for (Unit *head = m->load_queue; head; head = m->load_queue) {
+                assert(head->in_load_queue);
+
+                unit_load(head);
+                n_loaded++;
+        }
+
+        m->dispatching_load_queue = false;
+
+        /* By now all targets we know about should be loaded and have their aliases resolved, hence it is
+         * time to process the units that wait for their target dependencies to be added */
+        (void) manager_dispatch_target_deps_queue(m);
+
+        return n_loaded;
+}
+
+bool manager_unit_cache_should_retry_load(Unit *u) {
+        Manager *m;
+
+        assert(u);
+        m = u->manager;
+
+        /* Only units that were not found at some point in the past, and whose not-found stub is kept
+         * pinned in the unit graph by dependencies, are reloaded from disk automatically. Units that
+         * were loaded before are not: updating those takes a daemon-reload. */
+        if (u->load_state != UNIT_NOT_FOUND)
+                return false;
+
+        /* Retry if the cache was updated since our last attempt to load the unit (there might be new
+         * fragment paths to read), or if the cache needs updating because of modifications on disk. */
+        return m->unit_cache_timestamp_hash != u->fragment_not_found_timestamp_hash ||
+                !lookup_paths_timestamp_hash_same(&m->lookup_paths, m->unit_cache_timestamp_hash, NULL);
+}
+
+int manager_load_unit_prepare(
+                Manager *m,
+                const char *name,
+                const char *path,
+                sd_bus_error *e,
+                Unit **ret) {
+
+        _cleanup_(unit_freep) Unit *cleanup_unit = NULL;
+        _cleanup_free_ char *nbuf = NULL;
+        int r;
+
+        assert(m);
+        assert(ret);
+        assert(name || path);
+
+        /* This will prepare the unit for loading, but not actually load anything from disk. Returns 0 if an
+         * already known unit is returned as is, 1 if the unit was added to the load queue, negative on error. */
+        if (path && !path_is_absolute(path))
+                return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path);
+
+        if (!name) { /* No name given: derive it from the filename part of the path */
+                r = path_extract_filename(path, &nbuf);
+                if (r < 0)
+                        return r;
+                if (r == O_DIRECTORY)
+                        return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' refers to directory, refusing.", path);
+
+                name = nbuf;
+        }
+
+        UnitType t = unit_name_to_type(name);
+
+        if (t == _UNIT_TYPE_INVALID || !unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
+                if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE))
+                        return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is missing the instance name.", name);
+
+                return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is not valid.", name);
+        }
+
+        Unit *unit = manager_get_unit(m, name);
+        if (unit) {
+                /* The time-based cache allows to start new units without daemon-reload,
+                 * but if they are already referenced (because of dependencies or ordering)
+                 * then we have to force a load of the fragment. As an optimization, check
+                 * first if anything in the usual paths was modified since the last time
+                 * the cache was loaded. Also check if the last time an attempt to load the
+                 * unit was made was before the most recent cache refresh, so that we know
+                 * we need to try again — even if the cache is current, it might have been
+                 * updated in a different context before we had a chance to retry loading
+                 * this particular unit. */
+                if (manager_unit_cache_should_retry_load(unit))
+                        unit->load_state = UNIT_STUB;
+                else {
+                        *ret = unit;
+                        return 0;  /* The unit was already loaded */
+                }
+        } else {
+                unit = cleanup_unit = unit_new(m, unit_vtable[t]->object_size); /* held by cleanup_unit until queued below */
+                if (!unit)
+                        return -ENOMEM;
+        }
+
+        if (path) {
+                r = free_and_strdup(&unit->fragment_path, path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = unit_add_name(unit, name);
+        if (r < 0)
+                return r;
+
+        unit_add_to_load_queue(unit);
+        unit_add_to_dbus_queue(unit);
+        unit_add_to_gc_queue(unit);
+
+        *ret = unit;
+        TAKE_PTR(cleanup_unit); /* disarm the cleanup handler, so that a newly allocated unit is not freed on return */
+
+        return 1;  /* The unit was added to the load queue */
+}
+
+int manager_load_unit(
+                Manager *m,
+                const char *name,
+                const char *path,
+                sd_bus_error *e,
+                Unit **ret) {
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        /* Loads the configuration of a unit. No services or anything else are started here. */
+        r = manager_load_unit_prepare(m, name, path, e, ret);
+        if (r > 0) {
+                /* The unit was put on the load queue: dispatch the queue, then pass the unit through unit_follow_merge() */
+                manager_dispatch_load_queue(m);
+                *ret = unit_follow_merge(*ret);
+                r = 0;
+        }
+
+        return r;
+}
+
+int manager_load_startable_unit_or_warn(
+                Manager *m,
+                const char *name,
+                const char *path,
+                Unit **ret) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        Unit *loaded;
+        int r;
+
+        /* Loads a unit and makes sure it loaded fully and is not masked. Failures are logged right here. */
+
+        r = manager_load_unit(m, name, path, &error, &loaded);
+        if (r < 0) {
+                const char *kind = name ? "unit" : "unit file", *what = name ?: path;
+
+                return log_error_errno(r, "Failed to load %s %s: %s", kind, what, bus_error_message(&error, r));
+        }
+
+        r = bus_unit_validate_load_state(loaded, &error);
+        if (r < 0)
+                return log_error_errno(r, "%s", bus_error_message(&error, r));
+
+        *ret = loaded;
+        return 0;
+}
+
+void manager_clear_jobs(Manager *m) {
+        assert(m);
+
+        /* Finishes the first job of the job table as cancelled, over and over, until the table reports
+         * no first job anymore. There is no need to recurse, since all jobs are being cancelled
+         * anyway. */
+        for (Job *job = hashmap_first(m->jobs); job; job = hashmap_first(m->jobs))
+                job_finish_and_invalidate(job, JOB_CANCELED, false, false);
+}
+
+void manager_unwatch_pidref(Manager *m, PidRef *pid) {
+        Unit *watcher;
+
+        assert(m);
+
+        /* Repeatedly look up a unit that watches the PID and make it drop the watch. We are done as
+         * soon as the lookup comes back empty. */
+        watcher = manager_get_unit_by_pidref_watching(m, pid);
+        while (watcher) {
+                unit_unwatch_pidref(watcher, pid);
+                watcher = manager_get_unit_by_pidref_watching(m, pid);
+        }
+}
+
+static int manager_dispatch_run_queue(sd_event_source *source, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(source);
+
+        /* Run the job at the head of the run queue, over and over, until the queue has no head anymore */
+        for (Job *job = prioq_peek(m->run_queue); job; job = prioq_peek(m->run_queue)) {
+                assert(job->installed);
+                assert(job->in_run_queue);
+
+                (void) job_run_and_invalidate(job);
+        }
+
+        if (m->n_running_jobs > 0)
+                manager_watch_jobs_in_progress(m);
+
+        if (m->n_on_console > 0)
+                manager_watch_idle_pipe(m);
+
+        return 1;
+}
+
+void manager_trigger_run_queue(Manager *m) {
+        int r, enabled;
+
+        assert(m);
+
+        /* Arm the run queue event source as one-shot if jobs are queued, turn it off otherwise */
+        enabled = prioq_isempty(m->run_queue) ? SD_EVENT_OFF : SD_EVENT_ONESHOT;
+        r = sd_event_source_set_enabled(m->run_queue_event_source, enabled);
+        if (r < 0)
+                log_warning_errno(r, "Failed to enable job run queue event source, ignoring: %m");
+}
+
+static unsigned manager_dispatch_dbus_queue(Manager *m) {
+        unsigned n = 0, budget;
+        Unit *u;
+        Job *j;
+
+        assert(m);
+        /* Sends change signals for queued units and jobs, plus pending reload messages. Returns the number of sends done. */
+        /* When we are reloading, let's not wait with generating signals, since we need to exit the manager as quickly
+         * as we can. There's no point in throttling generation of signals in that case. */
+        if (MANAGER_IS_RELOADING(m) || m->send_reloading_done || m->pending_reload_message)
+                budget = UINT_MAX; /* infinite budget in this case */
+        else {
+                /* Anything to do at all? */
+                if (!m->dbus_unit_queue && !m->dbus_job_queue)
+                        return 0;
+
+                /* Do we have overly many messages queued at the moment? If so, let's not enqueue more on top, let's
+                 * sit this cycle out, and process things in a later cycle when the queues got a bit emptier. */
+                if (manager_bus_n_queued_write(m) > MANAGER_BUS_BUSY_THRESHOLD)
+                        return 0;
+
+                /* Only process a certain number of units/jobs per event loop iteration. Even if the bus queue wasn't
+                 * overly full before this call we shouldn't increase it in size too wildly in one step, and we
+                 * shouldn't monopolize CPU time with generating these messages. Note the difference in counting of
+                 * this "budget" and the "threshold" above: the "budget" is decreased only once per generated message,
+                 * regardless how many buses/direct connections it is enqueued on, while the "threshold" is applied to
+                 * each queued instance of bus message, i.e. if the same message is enqueued to five buses/direct
+                 * connections it will be counted five times. This difference in counting ("references"
+                 * vs. "instances") is primarily a result of the fact that it's easier to implement it this way,
+                 * however it also reflects the thinking that the "threshold" should put a limit on used queue memory,
+                 * i.e. space, while the "budget" should put a limit on time. Also note that the "threshold" is
+                 * currently chosen much higher than the "budget". */
+                budget = MANAGER_BUS_MESSAGE_BUDGET;
+        }
+
+        while (budget != 0 && (u = m->dbus_unit_queue)) {
+
+                assert(u->in_dbus_queue);
+
+                bus_unit_send_change_signal(u);
+                n++;
+
+                if (budget != UINT_MAX) /* UINT_MAX means "infinite", which is never decremented */
+                        budget--;
+        }
+
+        while (budget != 0 && (j = m->dbus_job_queue)) {
+                assert(j->in_dbus_queue);
+
+                bus_job_send_change_signal(j);
+                n++;
+
+                if (budget != UINT_MAX) /* UINT_MAX means "infinite", which is never decremented */
+                        budget--;
+        }
+        /* The two reload-related messages below are sent regardless of how much budget is left */
+        if (m->send_reloading_done) {
+                m->send_reloading_done = false;
+                bus_manager_send_reloading(m, false);
+                n++;
+        }
+
+        if (m->pending_reload_message) {
+                bus_send_pending_reload_message(m);
+                n++;
+        }
+
+        return n;
+}
+
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        char buf[PATH_MAX];
+        ssize_t n;
+        /* Reads one cgroups agent message from fd, validates it and hands it on as a NUL-terminated string */
+        n = recv(fd, buf, sizeof(buf), 0);
+        if (n < 0 && IN_SET(errno, EAGAIN, EINTR))
+                return 0; /* Transient condition, try again on the next wakeup instead of failing the handler */
+        if (n < 0)
+                return log_error_errno(errno, "Failed to read cgroups agent message: %m");
+        if (n == 0) {
+                log_error("Got zero-length cgroups agent message, ignoring.");
+                return 0;
+        }
+        if ((size_t) n >= sizeof(buf)) { /* this also guarantees room for the NUL byte appended below */
+                log_error("Got overly long cgroups agent message, ignoring.");
+                return 0;
+        }
+        if (memchr(buf, 0, n)) {
+                log_error("Got cgroups agent message with embedded NUL byte, ignoring.");
+                return 0;
+        }
+        buf[n] = 0;
+
+        manager_notify_cgroup_empty(m, buf);
+        (void) bus_forward_agent_released(m, buf);
+        return 0;
+}
+
+static bool manager_process_barrier_fd(char * const *tags, FDSet *fds) {
+
+        /* Returns true if the message carries BARRIER=1, false otherwise */
+        if (!strv_contains(tags, "BARRIER=1"))
+                return false;
+
+        /* Nothing else must be sent along with BARRIER=1, and exactly one fd is expected with it */
+        if (strv_length(tags) != 1)
+                log_warning("Extra notification messages sent with BARRIER=1, ignoring everything.");
+        else if (fdset_size(fds) != 1)
+                log_warning("Got incorrect number of fds with BARRIER=1, closing them.");
+
+        /* BARRIER=1 was found, hence the message is to be dropped either way */
+        return true;
+}
+
+static void manager_invoke_notify_message(
+                Manager *m,
+                Unit *u,
+                const struct ucred *ucred,
+                char * const *tags,
+                FDSet *fds) {
+
+        assert(m);
+        assert(u);
+        assert(ucred);
+        assert(tags);
+
+        if (u->notifygen == m->notifygen) /* This unit was already invoked during this same iteration */
+                return;
+        u->notifygen = m->notifygen;
+
+        if (UNIT_VTABLE(u)->notify_message) {
+                UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds);
+                return;
+        }
+        if (!DEBUG_LOGGING) /* no handler for this unit type: only worth a debug message */
+                return;
+
+        _cleanup_free_ char *joined = NULL, *shortened = NULL, *escaped = NULL;
+        joined = strv_join(tags, ", ");
+        if (joined)
+                shortened = ellipsize(joined, 20, 90);
+        if (shortened)
+                escaped = cescape(shortened);
+        log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(escaped));
+}
+
+static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        /* Event callback for the notify socket: receive one message, validate it, pass it to the sender's units. */
+        _cleanup_fdset_free_ FDSet *fds = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        char buf[NOTIFY_BUFFER_MAX+1];
+        struct iovec iovec = {
+                .iov_base = buf,
+                .iov_len = sizeof(buf)-1,
+        };
+        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
+                         CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
+        struct msghdr msghdr = {
+                .msg_iov = &iovec,
+                .msg_iovlen = 1,
+                .msg_control = &control,
+                .msg_controllen = sizeof(control),
+        };
+
+        struct cmsghdr *cmsg;
+        struct ucred *ucred = NULL;
+        _cleanup_free_ Unit **array_copy = NULL;
+        _cleanup_strv_free_ char **tags = NULL;
+        Unit *u1, *u2, **array;
+        int r, *fd_array = NULL;
+        size_t n_fds = 0;
+        bool found = false;
+        ssize_t n;
+
+        assert(m->notify_fd == fd);
+
+        if (revents != EPOLLIN) {
+                log_warning("Got unexpected poll event for notify fd.");
+                return 0;
+        }
+
+        n = recvmsg_safe(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
+        if (ERRNO_IS_NEG_TRANSIENT(n))
+                return 0; /* Spurious wakeup, try again */
+        if (n == -EXFULL) {
+                log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
+                return 0;
+        }
+        if (n < 0)
+                /* If this is any other, real error, then stop processing this socket. This of course means
+                 * we won't take notification messages anymore, but that's still better than busy looping:
+                 * being woken up over and over again, but being unable to actually read the message from the
+                 * socket. */
+                return log_error_errno(n, "Failed to receive notification message: %m");
+        /* Pick the passed fds (SCM_RIGHTS) and the sender's credentials (SCM_CREDENTIALS) out of the control data. */
+        CMSG_FOREACH(cmsg, &msghdr)
+                if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+
+                        assert(!fd_array);
+                        fd_array = CMSG_TYPED_DATA(cmsg, int);
+                        n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+                } else if (cmsg->cmsg_level == SOL_SOCKET &&
+                           cmsg->cmsg_type == SCM_CREDENTIALS &&
+                           cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+
+                        assert(!ucred);
+                        ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
+                }
+        /* Wrap the received fds in an FDSet, so that the _cleanup_fdset_free_ on fds covers them on every return. */
+        if (n_fds > 0) {
+                assert(fd_array);
+
+                r = fdset_new_array(&fds, fd_array, n_fds);
+                if (r < 0) {
+                        close_many(fd_array, n_fds);
+                        log_oom();
+                        return 0;
+                }
+        }
+
+        if (!ucred || !pid_is_valid(ucred->pid)) {
+                log_warning("Received notify message without valid credentials. Ignoring.");
+                return 0;
+        }
+        /* Reject oversized messages. Past this check n < sizeof(buf) holds, hence buf[n] below is in bounds. */
+        if ((size_t) n >= sizeof(buf) || (msghdr.msg_flags & MSG_TRUNC)) {
+                log_warning("Received notify message exceeded maximum size. Ignoring.");
+                return 0;
+        }
+
+        /* As extra safety check, let's make sure the string we get doesn't contain embedded NUL bytes.
+         * We permit one trailing NUL byte in the message, but don't expect it. */
+        if (n > 1 && memchr(buf, 0, n-1)) {
+                log_warning("Received notify message with embedded NUL bytes. Ignoring.");
+                return 0;
+        }
+
+        /* Make sure it's NUL-terminated, then parse it to obtain the tags list. */
+        buf[n] = 0;
+        tags = strv_split_newlines(buf);
+        if (!tags) {
+                log_oom();
+                return 0;
+        }
+
+        /* Possibly a barrier fd, let's see. */
+        if (manager_process_barrier_fd(tags, fds)) {
+                log_debug("Received barrier notification message from PID " PID_FMT ".", ucred->pid);
+                return 0;
+        }
+
+        /* Increase the generation counter used for filtering out duplicate unit invocations. */
+        m->notifygen++;
+
+        /* Generate lookup key from the PID (we have no pidfd here, after all) */
+        PidRef pidref = PIDREF_MAKE_FROM_PID(ucred->pid);
+
+        /* Notify every unit that might be interested, which might be multiple. */
+        u1 = manager_get_unit_by_pidref_cgroup(m, &pidref);
+        u2 = hashmap_get(m->watch_pids, &pidref);
+        array = hashmap_get(m->watch_pids_more, &pidref);
+        if (array) {
+                size_t k = 0;
+
+                while (array[k])
+                        k++;
+
+                array_copy = newdup(Unit*, array, k+1);
+                if (!array_copy)
+                        log_oom();
+        }
+        /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle
+         * duplicate units and make sure we only invoke each unit's handler once. */
+        if (u1) {
+                manager_invoke_notify_message(m, u1, ucred, tags, fds);
+                found = true;
+        }
+        if (u2) {
+                manager_invoke_notify_message(m, u2, ucred, tags, fds);
+                found = true;
+        }
+        if (array_copy)
+                for (size_t i = 0; array_copy[i]; i++) {
+                        manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds);
+                        found = true;
+                }
+
+        if (!found)
+                log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);
+
+        if (fdset_size(fds) > 0)
+                log_warning("Got extra auxiliary fds with notification message, closing them.");
+
+        return 0;
+}
+
+static void manager_invoke_sigchld_event(
+                Manager *m,
+                Unit *u,
+                const siginfo_t *si) {
+
+        /* Reports the death of a child to one unit, at most once per unit and SIGCHLD iteration. */
+        assert(m);
+        assert(u);
+        assert(si);
+
+        if (u->sigchldgen == m->sigchldgen) /* Handler already ran for this unit in this iteration */
+                return;
+        u->sigchldgen = m->sigchldgen;
+
+        pid_t pid = si->si_pid;
+        log_unit_debug(u, "Child "PID_FMT" belongs to %s.", pid, u->id);
+        unit_unwatch_pid(u, pid);
+        if (UNIT_VTABLE(u)->sigchld_event)
+                UNIT_VTABLE(u)->sigchld_event(u, pid, si->si_code, si->si_status);
+}
+
+static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        siginfo_t si = {};
+        int r;
+        /* Handles one exited child per call: peek at it, inform the units watching its PID, then reap it. */
+        assert(source);
+
+        /* First we call waitid() for a PID and do not reap the zombie. That way we can still access
+         * /proc/$PID for it while it is a zombie. */
+
+        if (waitid(P_ALL, 0, &si, WEXITED|WNOHANG|WNOWAIT) < 0) {
+
+                if (errno != ECHILD)
+                        log_error_errno(errno, "Failed to peek for child with waitid(), ignoring: %m");
+
+                goto turn_off;
+        }
+        /* With WNOHANG waitid() also succeeds if no child is waitable; the zero-initialized si then has no PID. */
+        if (si.si_pid <= 0)
+                goto turn_off;
+
+        if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) {
+                _cleanup_free_ Unit **array_copy = NULL;
+                _cleanup_free_ char *name = NULL;
+                Unit *u1, *u2, **array;
+
+                (void) pid_get_comm(si.si_pid, &name);
+
+                log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)",
+                          si.si_pid, strna(name),
+                          sigchld_code_to_string(si.si_code),
+                          si.si_status,
+                          strna(si.si_code == CLD_EXITED
+                                ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
+                                : signal_to_string(si.si_status)));
+
+                /* Increase the generation counter used for filtering out duplicate unit invocations */
+                m->sigchldgen++;
+
+                /* We look this up by a PidRef that only consists of the PID. After all we couldn't create a
+                 * pidfd here any more even if we wanted (since the process just exited). */
+                PidRef pidref = PIDREF_MAKE_FROM_PID(si.si_pid);
+
+                /* And now figure out the unit this belongs to, it might be multiple... */
+                u1 = manager_get_unit_by_pidref_cgroup(m, &pidref);
+                u2 = hashmap_get(m->watch_pids, &pidref);
+                array = hashmap_get(m->watch_pids_more, &pidref);
+                if (array) {
+                        size_t n = 0;
+
+                        /* Count how many entries the array has */
+                        while (array[n])
+                                n++;
+
+                        /* Make a copy of the array so that we don't trip up on the array changing beneath us */
+                        array_copy = newdup(Unit*, array, n+1);
+                        if (!array_copy)
+                                log_oom();
+                }
+
+                /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but
+                 * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for
+                 * each iteration. */
+                if (u1) {
+                        /* We check for oom condition, in case we got SIGCHLD before the oom notification.
+                         * We only do this for the cgroup the PID belonged to. */
+                        (void) unit_check_oom(u1);
+
+                        /* We check if systemd-oomd performed a kill so that we log and notify appropriately */
+                        (void) unit_check_oomd_kill(u1);
+
+                        manager_invoke_sigchld_event(m, u1, &si);
+                }
+                if (u2)
+                        manager_invoke_sigchld_event(m, u2, &si);
+                if (array_copy)
+                        for (size_t i = 0; array_copy[i]; i++)
+                                manager_invoke_sigchld_event(m, array_copy[i], &si);
+        }
+
+        /* And now, we actually reap the zombie. */
+        if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) {
+                log_error_errno(errno, "Failed to dequeue child, ignoring: %m");
+                return 0;
+        }
+
+        return 0;
+
+turn_off:
+        /* All children processed for now, turn off event source */
+
+        r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable SIGCHLD event source: %m");
+
+        return 0;
+}
+
+static void manager_start_special(Manager *m, const char *name, JobMode mode) {
+        /* Enqueue a start job for the named unit and announce the activation via log and sd_notifyf(). */
+        Job *j;
+        int r;
+
+        r = manager_add_job_by_name_and_warn(m, JOB_START, name, mode, NULL, &j);
+        if (r < 0)
+                return;
+
+        const char *label = unit_status_string(j->unit, NULL);
+        log_info("Activating special unit %s...", label);
+        sd_notifyf(false, "STATUS=Activating special unit %s...", label);
+        m->status_ready = false;
+}
+
+static void manager_handle_ctrl_alt_del(Manager *m) {
+        /* Normally C-A-D just starts the special C-A-D target. Only if it is pressed more than 7 times within
+         * 2s, and a burst action is configured (it may be disabled in system.conf), we act immediately. */
+        bool within_limit = ratelimit_below(&m->ctrl_alt_del_ratelimit);
+        if (!within_limit && m->cad_burst_action != EMERGENCY_ACTION_NONE) {
+                emergency_action(m, m->cad_burst_action, EMERGENCY_ACTION_WARN, NULL, -1,
+                                "Ctrl-Alt-Del was pressed more than 7 times within 2s");
+                return;
+        }
+        manager_start_special(m, SPECIAL_CTRL_ALT_DEL_TARGET, JOB_REPLACE_IRREVERSIBLY);
+}
+
+static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        ssize_t n;
+        struct signalfd_siginfo sfsi;
+        int r;
+        /* Event callback for the signal fd: reads one queued signal and maps it to the matching manager action. */
+        assert(m->signal_fd == fd);
+
+        if (revents != EPOLLIN) {
+                log_warning("Got unexpected events from signal file descriptor.");
+                return 0;
+        }
+
+        n = read(m->signal_fd, &sfsi, sizeof(sfsi));
+        if (n < 0) {
+                if (ERRNO_IS_TRANSIENT(errno))
+                        return 0;
+
+                /* We return an error here, which will kill this handler,
+                 * to avoid a busy loop on read error. */
+                return log_error_errno(errno, "Reading from signal fd failed: %m");
+        }
+        if (n != sizeof(sfsi)) {
+                log_warning("Truncated read from signal fd (%zi bytes), ignoring!", n);
+                return 0;
+        }
+
+        log_received_signal(sfsi.ssi_signo == SIGCHLD ||
+                            (sfsi.ssi_signo == SIGTERM && MANAGER_IS_USER(m))
+                            ? LOG_DEBUG : LOG_INFO,
+                            &sfsi);
+
+        switch (sfsi.ssi_signo) {
+
+        case SIGCHLD:
+                r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to enable SIGCHLD event source, ignoring: %m");
+
+                break;
+
+        case SIGTERM:
+                if (MANAGER_IS_SYSTEM(m)) {
+                        /* This is for compatibility with the original sysvinit */
+                        if (verify_run_space_and_log("Refusing to reexecute") < 0)
+                                break;
+
+                        m->objective = MANAGER_REEXECUTE;
+                        break;
+                }
+
+                _fallthrough_;
+        case SIGINT:
+                if (MANAGER_IS_SYSTEM(m))
+                        manager_handle_ctrl_alt_del(m);
+                else
+                        manager_start_special(m, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY);
+                break;
+
+        case SIGWINCH:
+                /* This is a nop on non-init */
+                if (MANAGER_IS_SYSTEM(m))
+                        manager_start_special(m, SPECIAL_KBREQUEST_TARGET, JOB_REPLACE);
+
+                break;
+
+        case SIGPWR:
+                /* This is a nop on non-init */
+                if (MANAGER_IS_SYSTEM(m))
+                        manager_start_special(m, SPECIAL_SIGPWR_TARGET, JOB_REPLACE);
+
+                break;
+
+        case SIGUSR1:
+                if (manager_dbus_is_running(m, false)) {
+                        log_info("Trying to reconnect to bus...");
+
+                        (void) bus_init_api(m);
+
+                        if (MANAGER_IS_SYSTEM(m))
+                                (void) bus_init_system(m);
+                } else
+                        manager_start_special(m, SPECIAL_DBUS_SERVICE, JOB_REPLACE);
+
+                break;
+
+        case SIGUSR2: {
+                _cleanup_free_ char *dump = NULL;
+                /* The failure is reported through the return value, hence log r rather than errno. */
+                r = manager_get_dump_string(m, /* patterns= */ NULL, &dump);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to acquire manager dump: %m");
+                        break;
+                }
+
+                log_dump(LOG_INFO, dump);
+                break;
+        }
+
+        case SIGHUP:
+                if (verify_run_space_and_log("Refusing to reload") < 0)
+                        break;
+
+                m->objective = MANAGER_RELOAD;
+                break;
+
+        default: {
+
+                /* Starting SIGRTMIN+0 */
+                static const struct {
+                        const char *target;
+                        JobMode mode;
+                } target_table[] = {
+                        [0] = { SPECIAL_DEFAULT_TARGET,     JOB_ISOLATE },
+                        [1] = { SPECIAL_RESCUE_TARGET,      JOB_ISOLATE },
+                        [2] = { SPECIAL_EMERGENCY_TARGET,   JOB_ISOLATE },
+                        [3] = { SPECIAL_HALT_TARGET,        JOB_REPLACE_IRREVERSIBLY },
+                        [4] = { SPECIAL_POWEROFF_TARGET,    JOB_REPLACE_IRREVERSIBLY },
+                        [5] = { SPECIAL_REBOOT_TARGET,      JOB_REPLACE_IRREVERSIBLY },
+                        [6] = { SPECIAL_KEXEC_TARGET,       JOB_REPLACE_IRREVERSIBLY },
+                        [7] = { SPECIAL_SOFT_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY },
+                };
+
+                /* Starting SIGRTMIN+13, so that target halt and system halt are 10 apart */
+                static const ManagerObjective objective_table[] = {
+                        [0] = MANAGER_HALT,
+                        [1] = MANAGER_POWEROFF,
+                        [2] = MANAGER_REBOOT,
+                        [3] = MANAGER_KEXEC,
+                        [4] = MANAGER_SOFT_REBOOT,
+                };
+
+                if ((int) sfsi.ssi_signo >= SIGRTMIN+0 &&
+                    (int) sfsi.ssi_signo < SIGRTMIN+(int) ELEMENTSOF(target_table)) {
+                        int idx = (int) sfsi.ssi_signo - SIGRTMIN;
+                        manager_start_special(m, target_table[idx].target, target_table[idx].mode);
+                        break;
+                }
+
+                if ((int) sfsi.ssi_signo >= SIGRTMIN+13 &&
+                    (int) sfsi.ssi_signo < SIGRTMIN+13+(int) ELEMENTSOF(objective_table)) {
+                        m->objective = objective_table[sfsi.ssi_signo - SIGRTMIN - 13];
+                        break;
+                }
+
+                switch (sfsi.ssi_signo - SIGRTMIN) {
+
+                case 18: {
+                        bool generic = false;
+
+                        if (sfsi.ssi_code != SI_QUEUE)
+                                generic = true;
+                        else {
+                                /* Override a few select commands by our own PID1-specific logic */
+
+                                switch (sfsi.ssi_int) {
+
+                                case _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE..._COMMON_SIGNAL_COMMAND_LOG_LEVEL_END:
+                                        manager_override_log_level(m, sfsi.ssi_int - _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE);
+                                        break;
+
+                                case COMMON_SIGNAL_COMMAND_CONSOLE:
+                                        manager_override_log_target(m, LOG_TARGET_CONSOLE);
+                                        break;
+
+                                case COMMON_SIGNAL_COMMAND_JOURNAL:
+                                        manager_override_log_target(m, LOG_TARGET_JOURNAL);
+                                        break;
+
+                                case COMMON_SIGNAL_COMMAND_KMSG:
+                                        manager_override_log_target(m, LOG_TARGET_KMSG);
+                                        break;
+
+                                case COMMON_SIGNAL_COMMAND_NULL:
+                                        manager_override_log_target(m, LOG_TARGET_NULL);
+                                        break;
+
+                                case MANAGER_SIGNAL_COMMAND_DUMP_JOBS: {
+                                        _cleanup_free_ char *dump_jobs = NULL;
+                                        /* Log the returned error in r, not errno */
+                                        r = manager_get_dump_jobs_string(m, /* patterns= */ NULL, "  ", &dump_jobs);
+                                        if (r < 0) {
+                                                log_warning_errno(r, "Failed to acquire manager jobs dump: %m");
+                                                break;
+                                        }
+
+                                        log_dump(LOG_INFO, dump_jobs);
+                                        break;
+                                }
+
+                                default:
+                                        generic = true;
+                                }
+                        }
+
+                        if (generic)
+                                return sigrtmin18_handler(source, &sfsi, NULL);
+
+                        break;
+                }
+
+                case 20:
+                        manager_override_show_status(m, SHOW_STATUS_YES, "signal");
+                        break;
+
+                case 21:
+                        manager_override_show_status(m, SHOW_STATUS_NO, "signal");
+                        break;
+
+                case 22:
+                        manager_override_log_level(m, LOG_DEBUG);
+                        break;
+
+                case 23:
+                        manager_restore_original_log_level(m);
+                        break;
+
+                case 24:
+                        if (MANAGER_IS_USER(m)) {
+                                m->objective = MANAGER_EXIT;
+                                return 0;
+                        }
+
+                        /* This is a nop on init */
+                        break;
+
+                case 25:
+                        m->objective = MANAGER_REEXECUTE;
+                        break;
+
+                case 26:
+                case 29: /* compatibility: used to be mapped to LOG_TARGET_SYSLOG_OR_KMSG */
+                        manager_restore_original_log_target(m);
+                        break;
+
+                case 27:
+                        manager_override_log_target(m, LOG_TARGET_CONSOLE);
+                        break;
+
+                case 28:
+                        manager_override_log_target(m, LOG_TARGET_KMSG);
+                        break;
+
+                default:
+                        log_warning("Got unhandled signal <%s>.", signal_to_string(sfsi.ssi_signo));
+                }
+        }}
+
+        return 0;
+}
+
+static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        /* Time change callback: logs the change, re-arms the watch and calls every unit's time_change hook. */
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *unit;
+
+        log_struct(LOG_DEBUG,
+                   "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR,
+                   LOG_MESSAGE("Time has been changed"));
+        (void) manager_setup_time_change(m); /* Restart the watch */
+
+        HASHMAP_FOREACH(unit, m->units) {
+                if (!UNIT_VTABLE(unit)->time_change)
+                        continue;
+                UNIT_VTABLE(unit)->time_change(unit);
+        }
+        return 0;
+}
+
+static int manager_dispatch_timezone_change(
+                sd_event_source *source,
+                const struct inotify_event *e,
+                void *userdata) {
+
+        /* inotify callback for /etc/localtime: if manager_read_timezone_stat() reports a change, re-read
+         * the timezone and call every unit's timezone_change hook. Otherwise its result is returned. */
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *unit;
+        int r;
+
+        log_debug("inotify event for /etc/localtime");
+        r = manager_read_timezone_stat(m);
+        if (r <= 0) /* Zero or negative: nothing to propagate, pass the value on */
+                return r;
+
+        /* Re-establish the watch, so that we follow the new /etc/localtime in case it was replaced */
+        (void) manager_setup_timezone_change(m);
+
+        tzset(); /* Pick up the new timezone */
+        log_debug("Timezone has been changed (now: %s).", tzname[daylight]);
+
+        HASHMAP_FOREACH(unit, m->units) {
+                if (!UNIT_VTABLE(unit)->timezone_change)
+                        continue;
+                UNIT_VTABLE(unit)->timezone_change(unit);
+        }
+        return 0;
+}
+
+static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        bool console_in_use;
+
+        assert(m->idle_pipe[2] == fd);
+
+        /* At least one Type=idle child got tired of waiting for the boot process to finish. Type=idle only
+         * exists to keep console output tidy and is generally set on services that want the console for
+         * themselves. Hence, if any service needs console access, suppress our own console output from
+         * now on so that it cannot spill into theirs. */
+        console_in_use = m->n_on_console > 0;
+        m->no_console_output = console_in_use;
+
+        /* Closing the pipes acknowledges this child's request and at the same time tells all other
+         * children, which are waiting for exactly that, not to wait any longer. */
+        manager_close_idle_pipe(m);
+        return 0;
+}
+
+static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata) {
+        /* Timer callback: prints the jobs in progress, then re-arms the source as a one-shot timer that is
+         * JOBS_IN_PROGRESS_PERIOD_USEC away. Returns the first negative result, else that of enabling. */
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(source);
+
+        manager_print_jobs_in_progress(m);
+
+        int q = sd_event_source_set_time_relative(source, JOBS_IN_PROGRESS_PERIOD_USEC);
+        if (q >= 0)
+                q = sd_event_source_set_enabled(source, SD_EVENT_ONESHOT);
+        return q;
+}
+
+int manager_loop(Manager *m) {
+        /* Runs the manager's main loop for as long as m->objective stays MANAGER_OK and returns the objective
+         * that ended it. A negative value is returned instead if the SIGCHLD event source cannot be
+         * enabled or if running the event loop fails. */
+
+        RateLimit rl = { .interval = 1*USEC_PER_SEC, .burst = 50000 };
+        int r;
+
+        assert(m);
+        assert(m->objective == MANAGER_OK); /* Ensure manager_startup() has been called */
+
+        manager_check_finished(m);
+
+        /* Zombies may be left over from before we were exec()'ed. Turn on the SIGCHLD event source so
+         * that they get reaped. */
+        r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable SIGCHLD event source: %m");
+
+        for (;;) {
+                if (m->objective != MANAGER_OK)
+                        break;
+
+                (void) watchdog_ping();
+
+                /* Exceeding the rate limit configured above means something is going seriously wrong;
+                 * pause a little in that case. */
+                if (!ratelimit_below(&rl)) {
+                        log_warning("Looping too fast. Throttling execution a little.");
+                        sleep(1);
+                }
+
+                /* Work off the internal queues in a fixed order. As soon as one dispatcher returns > 0
+                 * we start the iteration over from the top. The || chains short-circuit, hence no
+                 * later dispatcher is invoked once an earlier one returned > 0. The split into three
+                 * statements is purely cosmetic; together they form one continuous sequence. */
+                if (manager_dispatch_load_queue(m) > 0 ||
+                    manager_dispatch_gc_job_queue(m) > 0 ||
+                    manager_dispatch_gc_unit_queue(m) > 0 ||
+                    manager_dispatch_cleanup_queue(m) > 0)
+                        continue;
+
+                if (manager_dispatch_cgroup_realize_queue(m) > 0 ||
+                    manager_dispatch_start_when_upheld_queue(m) > 0 ||
+                    manager_dispatch_stop_when_bound_queue(m) > 0 ||
+                    manager_dispatch_stop_when_unneeded_queue(m) > 0)
+                        continue;
+
+                if (manager_dispatch_release_resources_queue(m) > 0 ||
+                    manager_dispatch_dbus_queue(m) > 0)
+                        continue;
+
+                /* No dispatcher returned > 0: run the event loop, sleeping for at most the watchdog
+                 * runtime wait time. */
+                r = sd_event_run(m->event, watchdog_runtime_wait());
+                if (r < 0)
+                        return log_error_errno(r, "Failed to run event loop: %m");
+        }
+
+        /* The objective is no longer MANAGER_OK: hand it to the caller */
+        return m->objective;
+}
+
+int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) {
+        /* Resolves a unit bus path to a Unit object, which is stored in *_u when 0 is returned. The name
+         * encoded in the path may be an invocation ID, which is looked up among the known invocations,
+         * or a plain/instance unit name, which is handed to manager_load_unit(). On every other
+         * return path *_u is left untouched. */
+        _cleanup_free_ char *name = NULL;
+        sd_id128_t id;
+        Unit *unit;
+        int r;
+
+        assert(m);
+        assert(s);
+        assert(_u);
+
+        r = unit_name_from_dbus_path(s, &name);
+        if (r < 0)
+                return r;
+
+        /* A name that parses as 128-bit ID addresses the unit by its invocation ID. */
+        if (sd_id128_from_string(name, &id) >= 0) {
+                unit = hashmap_get(m->units_by_invocation_id, &id);
+                if (!unit)
+                        return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID,
+                                                 "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.",
+                                                 SD_ID128_FORMAT_VAL(id));
+
+                *_u = unit;
+                return 0;
+        }
+
+        /* Otherwise it has to be a valid plain or instance unit name. */
+        if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
+                _cleanup_free_ char *escaped = cescape(name);
+
+                return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Unit name %s is neither a valid invocation ID nor unit name.", strnull(escaped));
+        }
+
+        r = manager_load_unit(m, name, NULL, e, &unit);
+        if (r < 0)
+                return r;
+
+        *_u = unit;
+        return 0;
+}
+
+int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j) {
+        /* Maps a job bus path to its Job object. Returns 0 and sets *_j on success, -EINVAL if the path
+         * lacks the job prefix, the safe_atou() error if the remainder cannot be parsed, and -ENOENT if
+         * manager_get_job() finds no job for the parsed ID. */
+        unsigned job_id;
+        Job *job;
+        int r;
+
+        assert(m);
+        assert(s);
+        assert(_j);
+
+        const char *suffix = startswith(s, "/org/freedesktop/systemd1/job/");
+        if (!suffix)
+                return -EINVAL;
+
+        r = safe_atou(suffix, &job_id);
+        if (r < 0)
+                return r;
+
+        job = manager_get_job(m, job_id);
+        if (!job)
+                return -ENOENT;
+        *_j = job;
+        return 0;
+}
+
+void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) {
+        /* Sends an audit message of the given type for unit u. Compiled to a no-op without HAVE_AUDIT;
+         * otherwise skipped for non-system managers, without an audit fd, and while reloading. */
+#if HAVE_AUDIT
+        _cleanup_free_ char *prefix = NULL;
+        const char *text;
+        int fd, r;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return;
+
+        fd = get_audit_fd();
+        if (fd < 0)
+                return;
+
+        /* When merely deserializing, the service was started before already: no audit event then */
+        if (MANAGER_IS_RELOADING(m))
+                return;
+
+        r = unit_name_to_prefix_and_instance(u->id, &prefix);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to extract prefix and instance of unit name, ignoring: %m");
+                return;
+        }
+
+        text = strjoina("unit=", prefix);
+        if (audit_log_user_comm_message(fd, type, text, "systemd", NULL, NULL, NULL, success) >= 0)
+                return;
+        if (!ERRNO_IS_PRIVILEGE(errno)) {
+                log_warning_errno(errno, "Failed to send audit message, ignoring: %m");
+                return;
+        }
+        /* We aren't allowed to send audit messages? Then let's not retry again. */
+        log_debug_errno(errno, "Failed to send audit message, closing audit socket: %m");
+        close_audit_fd();
+#endif
+}
+
+void manager_send_unit_plymouth(Manager *m, Unit *u) {
+        /* Tells plymouth about unit u. Skipped while reloading (the service was started before and we
+         * are just deserializing), for non-system managers, if detect_container() returns > 0, and for
+         * unit types whose vtable does not set notify_plymouth. */
+        _cleanup_free_ char *payload = NULL;
+        int len, r;
+
+        if (MANAGER_IS_RELOADING(m) ||
+            !MANAGER_IS_SYSTEM(m) ||
+            detect_container() > 0 ||
+            !UNIT_VTABLE(u)->notify_plymouth)
+                return;
+
+        /* The payload is "U", 0x02, one byte holding strlen(id)+1, the unit id, and a NUL byte. */
+        len = asprintf(&payload, "U\x02%c%s%c", (int) (strlen(u->id) + 1), u->id, '\x00');
+        if (len < 0) {
+                log_oom();
+                return;
+        }
+
+        /* SOCK_NONBLOCK: we'd rather drop the message than wait for plymouth */
+        r = plymouth_send_raw(payload, len, SOCK_NONBLOCK);
+        if (r >= 0)
+                return;
+
+        log_full_errno(ERRNO_IS_NO_PLYMOUTH(r) ? LOG_DEBUG : LOG_WARNING, r,
+                       "Failed to communicate with plymouth: %m");
+}
+
+usec_t manager_get_watchdog(Manager *m, WatchdogType t) {
+        /* Returns the effective timeout for watchdog type t: an override (anything but USEC_INFINITY) wins
+         * over the configured value. User managers always get USEC_INFINITY. */
+        assert(m);
+
+        if (MANAGER_IS_USER(m))
+                return USEC_INFINITY;
+
+        usec_t forced = m->watchdog_overridden[t];
+        return forced == USEC_INFINITY ? m->watchdog[t] : forced;
+}
+
+void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
+        /* Stores the configured timeout for watchdog type t. The watchdog itself is only reprogrammed while
+         * no override is in effect (override == USEC_INFINITY). No-op for user managers and if the
+         * configured value does not change. */
+        assert(m);
+
+        if (MANAGER_IS_USER(m) || m->watchdog[t] == timeout)
+                return;
+
+        bool overridden = m->watchdog_overridden[t] != USEC_INFINITY;
+
+        /* Only the runtime and pretimeout types are programmed from here */
+        if (!overridden && t == WATCHDOG_RUNTIME)
+                (void) watchdog_setup(timeout);
+        else if (!overridden && t == WATCHDOG_PRETIMEOUT)
+                (void) watchdog_setup_pretimeout(timeout);
+
+        m->watchdog[t] = timeout;
+}
+
+void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
+        /* Installs (or, with USEC_INFINITY, removes) an override for watchdog type t and reprograms the
+         * watchdog accordingly. No-op for user managers and if the override does not change. */
+        assert(m);
+
+        if (MANAGER_IS_USER(m) || m->watchdog_overridden[t] == timeout)
+                return;
+
+        /* Without an override the configured timeout is what takes effect again */
+        usec_t effective = timeout;
+        if (timeout == USEC_INFINITY)
+                effective = m->watchdog[t];
+        if (t == WATCHDOG_RUNTIME)
+                (void) watchdog_setup(effective);
+        else if (t == WATCHDOG_PRETIMEOUT)
+                (void) watchdog_setup_pretimeout(effective);
+
+        m->watchdog_overridden[t] = timeout;
+}
+
+/* Applies 'governor' as watchdog pretimeout governor and remembers it as the configured one.
+ *
+ * Returns 0 if nothing had to be done (user manager, or same governor as before) or on success,
+ * -ENOMEM if the string could not be copied, or the negative error from
+ * watchdog_setup_pretimeout_governor(). The stored value is only replaced if setup succeeded.
+ *
+ * NOTE(review): a NULL 'governor' passes the streq_ptr() check whenever a governor is currently stored
+ * and is then handed to strdup() — confirm callers never pass NULL in that situation. */
+int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor) {
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_USER(m) || streq_ptr(m->watchdog_pretimeout_governor, governor))
+                return 0;
+
+        /* Take the copy first, so that an allocation failure cannot leave the watchdog reconfigured
+         * with our bookkeeping out of date. */
+        copy = strdup(governor);
+        if (!copy)
+                return -ENOMEM;
+
+        r = watchdog_setup_pretimeout_governor(governor);
+        if (r < 0)
+                return r;
+
+        return free_and_replace(m->watchdog_pretimeout_governor, copy);
+}
+
+/* Applies 'governor' as watchdog pretimeout governor and remembers it as the override value.
+ *
+ * Returns 0 if nothing had to be done (user manager, or same override as before) or on success,
+ * -ENOMEM if the string could not be copied, or the negative error from
+ * watchdog_setup_pretimeout_governor(). The stored override is only replaced if setup succeeded.
+ *
+ * NOTE(review): a NULL 'governor' passes the streq_ptr() check whenever an override is currently
+ * stored and is then handed to strdup() — confirm callers never pass NULL in that situation. */
+int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor) {
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_USER(m) || streq_ptr(m->watchdog_pretimeout_governor_overridden, governor))
+                return 0;
+
+        /* Take the copy first, so that an allocation failure cannot leave the watchdog reconfigured
+         * with our bookkeeping out of date. */
+        copy = strdup(governor);
+        if (!copy)
+                return -ENOMEM;
+
+        r = watchdog_setup_pretimeout_governor(governor);
+        if (r < 0)
+                return r;
+
+        return free_and_replace(m->watchdog_pretimeout_governor_overridden, copy);
+}
+
+/* Reloads the manager in-process.
+ *
+ * The current state is serialized into a file plus an fd set, then all jobs, units, lookup paths and
+ * related runtime state are flushed, generators are re-run, units are re-enumerated, and finally the
+ * serialized state is read back and coldplugged.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. Failures up to and including the
+ * fseeko() call return before anything has been flushed; at that stage the cleanup handler attached to
+ * 'reloading' is still armed (presumably undoing manager_reloading_start() — see its definition). */
+int manager_reload(Manager *m) {
+        _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
+        _cleanup_fdset_free_ FDSet *fds = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(m);
+
+        r = manager_open_serialization(m, &f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create serialization file: %m");
+
+        fds = fdset_new();
+        if (!fds)
+                return log_oom();
+
+        /* We are officially in reload mode from here on. */
+        reloading = manager_reloading_start(m);
+
+        r = manager_serialize(m, f, fds, false);
+        if (r < 0)
+                return r;
+
+        /* Rewind, so that deserialization below reads what we just wrote. */
+        if (fseeko(f, 0, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to seek to beginning of serialization: %m");
+
+        /* 💀 This is the point of no return, from here on there is no way back. 💀 */
+        /* Disarm the cleanup handler: from now on the reloading state is ended explicitly near the end
+         * of this function instead. */
+        reloading = NULL;
+
+        bus_manager_send_reloading(m, true);
+
+        /* Start by flushing out all jobs and units, all generated units, all runtime environments, all dynamic users
+         * and everything else that is worth flushing out. We'll get it all back from the serialization — if we need
+         * it. */
+
+        manager_clear_jobs_and_units(m);
+        lookup_paths_flush_generator(&m->lookup_paths);
+        lookup_paths_free(&m->lookup_paths);
+        exec_shared_runtime_vacuum(m);
+        dynamic_user_vacuum(m, false);
+        m->uid_refs = hashmap_free(m->uid_refs);
+        m->gid_refs = hashmap_free(m->gid_refs);
+
+        /* NOTE(review): returning here skips the n_reloading decrement further down although the
+         * cleanup handler has already been disarmed above — confirm this is intended. */
+        r = lookup_paths_init_or_warn(&m->lookup_paths, m->runtime_scope, 0, NULL);
+        if (r < 0)
+                return r;
+
+        /* Generator failures are not fatal for the reload. */
+        (void) manager_run_environment_generators(m);
+        (void) manager_run_generators(m);
+
+        lookup_paths_log(&m->lookup_paths);
+
+        /* We flushed out generated files, for which we don't watch mtime, so we should flush the old map. */
+        manager_free_unit_name_maps(m);
+        m->unit_file_state_outdated = false;
+
+        /* First, enumerate what we can from kernel and suchlike */
+        manager_enumerate_perpetual(m);
+        manager_enumerate(m);
+
+        /* Second, deserialize our stored data */
+        r = manager_deserialize(m, f, fds);
+        if (r < 0)
+                log_warning_errno(r, "Deserialization failed, proceeding anyway: %m");
+
+        /* We don't need the serialization anymore */
+        f = safe_fclose(f);
+
+        /* Re-register notify_fd as event source, and set up other sockets/communication channels we might need */
+        (void) manager_setup_notify(m);
+        (void) manager_setup_cgroups_agent(m);
+        (void) manager_setup_user_lookup_fd(m);
+
+        /* Third, fire things up! */
+        manager_coldplug(m);
+
+        /* Clean up runtime objects no longer referenced */
+        manager_vacuum(m);
+
+        /* Clean up deserialized tracked clients */
+        m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+
+        /* Consider the reload process complete now. */
+        assert(m->n_reloading > 0);
+        m->n_reloading--;
+
+        manager_ready(m);
+
+        /* Only flags that the "reloading done" notification is still to be sent; it is not sent here. */
+        m->send_reloading_done = true;
+        return 0;
+}
+
+/* Invokes unit_reset_failed() on every unit stored in m->units. */
+void manager_reset_failed(Manager *m) {
+        Unit *unit;
+
+        assert(m);
+
+        HASHMAP_FOREACH(unit, m->units) {
+                unit_reset_failed(unit);
+        }
+}
+
+/* Returns true if the unit called 'name' is inactive or going down. A unit the manager does not know
+ * at all is reported as inactive, too. */
+bool manager_unit_inactive_or_pending(Manager *m, const char *name) {
+        Unit *unit;
+
+        assert(m);
+        assert(name);
+
+        unit = manager_get_unit(m, name);
+
+        return unit ? unit_inactive_or_pending(unit) : true;
+}
+
+/* Logs the taint string of the system once, at LOG_NOTICE level. User managers never log it, and
+ * nothing is logged if the taint string is empty. */
+static void log_taint_string(Manager *m) {
+        _cleanup_free_ char *taint = NULL;
+
+        assert(m);
+
+        if (MANAGER_IS_USER(m))
+                return;
+
+        if (m->taint_logged)
+                return;
+
+        /* Mark as done before looking at the result, so that the check is made only once, even if
+         * there turns out to be nothing to report. */
+        m->taint_logged = true;
+
+        taint = manager_taint_string(m);
+        if (!isempty(taint)) {
+                log_struct(LOG_NOTICE,
+                           LOG_MESSAGE("System is tainted: %s", taint),
+                           "TAINT=%s", taint,
+                           "MESSAGE_ID=" SD_MESSAGE_TAINTED_STR);
+        }
+}
+
+/* Announces that startup has finished: computes how long each boot phase took, writes a structured
+ * "Startup finished in …" log message, sends the same figures via bus_manager_send_finished() and
+ * finally logs the taint string. Does nothing in test runs.
+ *
+ * Expects the MANAGER_TIMESTAMP_FINISH timestamp to have been taken already, as it is read here. */
+static void manager_notify_finished(Manager *m) {
+        usec_t firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec;
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return;
+
+        if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) {
+                /* Holds the optional "<time> (firmware) + <time> (loader) + " prefix of the log
+                 * message; it stays empty if neither duration is non-zero. */
+                char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")]
+                        = {};
+                char *p = buf;
+                size_t size = sizeof buf;
+
+                /* Note that MANAGER_TIMESTAMP_KERNEL's monotonic value is always at 0, and
+                 * MANAGER_TIMESTAMP_FIRMWARE's and MANAGER_TIMESTAMP_LOADER's monotonic value should be considered
+                 * negative values. */
+
+                firmware_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic - m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic;
+                loader_usec = m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
+                userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+                /* Addition rather than subtraction: per the note above the firmware timestamp is to be
+                 * read as a negative value, so adding it yields the firmware-to-finish span. */
+                total_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic + m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic;
+
+                /* Each strpcpyf() call appends at 'p' and its return value is carried on as the size
+                 * for the next call. */
+                if (firmware_usec > 0)
+                        size = strpcpyf(&p, size, "%s (firmware) + ", FORMAT_TIMESPAN(firmware_usec, USEC_PER_MSEC));
+                if (loader_usec > 0)
+                        size = strpcpyf(&p, size, "%s (loader) + ", FORMAT_TIMESPAN(loader_usec, USEC_PER_MSEC));
+
+                if (dual_timestamp_is_set(&m->timestamps[MANAGER_TIMESTAMP_INITRD])) {
+
+                        /* The initrd case on bare-metal */
+                        kernel_usec = m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
+                        initrd_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic;
+
+                        log_struct(LOG_INFO,
+                                   "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
+                                   "KERNEL_USEC="USEC_FMT, kernel_usec,
+                                   "INITRD_USEC="USEC_FMT, initrd_usec,
+                                   "USERSPACE_USEC="USEC_FMT, userspace_usec,
+                                   LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (initrd) + %s (userspace) = %s.",
+                                               buf,
+                                               FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC),
+                                               FORMAT_TIMESPAN(initrd_usec, USEC_PER_MSEC),
+                                               FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC),
+                                               FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
+                } else {
+                        /* The initrd-less case on bare-metal */
+
+                        kernel_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
+                        initrd_usec = 0;
+
+                        log_struct(LOG_INFO,
+                                   "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
+                                   "KERNEL_USEC="USEC_FMT, kernel_usec,
+                                   "USERSPACE_USEC="USEC_FMT, userspace_usec,
+                                   LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (userspace) = %s.",
+                                               buf,
+                                               FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC),
+                                               FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC),
+                                               FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
+                }
+        } else {
+                /* The container and --user case */
+                /* Only the userspace phase is reported here; it also is the total. */
+                firmware_usec = loader_usec = initrd_usec = kernel_usec = 0;
+                total_usec = userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+
+                log_struct(LOG_INFO,
+                           "MESSAGE_ID=" SD_MESSAGE_USER_STARTUP_FINISHED_STR,
+                           "USERSPACE_USEC="USEC_FMT, userspace_usec,
+                           LOG_MESSAGE("Startup finished in %s.",
+                                       FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
+        }
+
+        bus_manager_send_finished(m, firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec);
+
+        log_taint_string(m);
+}
+
+/* For user managers only: sends READY=1 together with a "Reached basic.target"-style status, once.
+ * Afterwards 'ready_sent' is set while 'status_ready' is cleared, since the status text sent here is
+ * not the final "Ready." one. */
+static void user_manager_send_ready(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* We send READY=1 on reaching basic.target only when running in --user mode. */
+        if (!MANAGER_IS_USER(m))
+                return;
+
+        if (m->ready_sent)
+                return;
+
+        r = sd_notify(/* unset_environment= */ false,
+                      "READY=1\n"
+                      "STATUS=Reached " SPECIAL_BASIC_TARGET ".");
+        if (r < 0)
+                log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
+
+        /* Recorded regardless of whether sending worked, the failure above is ignored. */
+        m->status_ready = false;
+        m->ready_sent = true;
+}
+
+/* Sends READY=1 together with the final "Ready." status via sd_notify(), unless exactly that was
+ * sent already (both 'ready_sent' and 'status_ready' set). A send failure is only logged — at debug
+ * level if some readiness notification went out before, at warning level otherwise — and both flags
+ * are set in either case. */
+static void manager_send_ready(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (m->ready_sent && m->status_ready)
+                /* Skip the notification if nothing changed. */
+                return;
+
+        r = sd_notify(false,
+                      "READY=1\n"
+                      "STATUS=Ready.");
+        if (r < 0)
+                log_full_errno(m->ready_sent ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to send readiness notification, ignoring: %m");
+
+        m->ready_sent = m->status_ready = true;
+}
+
+/* Once basic.target is active (or reloading), sends the user manager readiness notification and logs
+ * the taint string. Both helpers guard against being run twice themselves. */
+static void manager_check_basic_target(Manager *m) {
+        Unit *basic;
+
+        assert(m);
+
+        /* Small shortcut: both things below have happened already. */
+        if (m->ready_sent && m->taint_logged)
+                return;
+
+        basic = manager_get_unit(m, SPECIAL_BASIC_TARGET);
+        if (!basic)
+                return;
+
+        if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(basic)))
+                return;
+
+        /* For user managers, send out READY=1 as soon as we reach basic.target */
+        user_manager_send_ready(m);
+
+        /* Log the taint string as soon as we reach basic.target */
+        log_taint_string(m);
+}
+
+/* Checks whether the job queue has drained and, if so, sends the readiness notification and — the
+ * first time this happens — does the "startup finished" bookkeeping: turning off automatic status
+ * output and confirm-spawn, closing the ask-password watch, clearing the first-boot flag, taking the
+ * FINISH timestamp and announcing the result. */
+void manager_check_finished(Manager *m) {
+        assert(m);
+
+        /* Nothing to do while reloading. Also verify that we have entered the event loop already, and
+         * not left it again. */
+        if (MANAGER_IS_RELOADING(m) || !MANAGER_IS_RUNNING(m))
+                return;
+
+        manager_check_basic_target(m);
+
+        if (hashmap_size(m->jobs) > 0) {
+                /* Still busy: just re-arm the "jobs in progress" timer, if there is one. Ignore any
+                 * failure, this is only for feedback. */
+                if (m->jobs_in_progress_event_source)
+                        (void) sd_event_source_set_time(
+                                        m->jobs_in_progress_event_source,
+                                        manager_watch_jobs_next_time(m));
+
+                return;
+        }
+
+        /* The jobs hashmap tends to grow a lot during boot, and then it's not reused until shutdown.
+         * Let's kill the hashmap if it is relatively large. */
+        if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10)
+                m->jobs = hashmap_free(m->jobs);
+
+        manager_send_ready(m);
+
+        /* Notify Type=idle units that we are done now */
+        manager_close_idle_pipe(m);
+
+        /* Everything below is one-time work for the moment startup completes. */
+        if (MANAGER_IS_FINISHED(m))
+                return;
+
+        manager_flip_auto_status(m, false, "boot finished");
+
+        /* Turn off confirm spawn now */
+        m->confirm_spawn = NULL;
+
+        /* No need to update ask password status when we're going non-interactive */
+        manager_close_ask_password(m);
+
+        /* This is no longer the first boot */
+        manager_set_first_boot(m, false);
+
+        /* The timestamp must be taken before announcing, as the announcement reports it. */
+        dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_FINISH);
+        manager_notify_finished(m);
+
+        manager_invalidate_startup_units(m);
+}
+
+/* Sends RELOADING=1 plus the current CLOCK_MONOTONIC time via sd_notifyf() (failures ignored) and
+ * clears 'ready_sent'. */
+void manager_send_reloading(Manager *m) {
+        usec_t ts;
+
+        assert(m);
+
+        /* Let whoever invoked us know that we are now reloading */
+        ts = now(CLOCK_MONOTONIC);
+        (void) sd_notifyf(/* unset= */ false,
+                          "RELOADING=1\n"
+                          "MONOTONIC_USEC=" USEC_FMT "\n",
+                          ts);
+
+        /* And ensure that we'll send READY=1 again as soon as we are ready again */
+        m->ready_sent = false;
+}
+
+/* Returns true if at least one of the directories in 'paths' exists. This lets callers skip the
+ * whole generator run, including the creation of output directories, if there are no generators.
+ *
+ * All entries are always checked, so that every access() failure other than ENOENT gets a warning
+ * even if an earlier entry already exists. */
+static bool generator_path_any(const char* const* paths) {
+        bool any = false;
+
+        STRV_FOREACH(path, paths) {
+                if (access(*path, F_OK) != 0) {
+                        if (errno != ENOENT)
+                                log_warning_errno(errno, "Failed to open generator directory %s: %m", *path);
+                } else
+                        any = true;
+        }
+
+        return any;
+}
+
+/* Runs the environment generators for the manager's runtime scope, with gather_environment as output
+ * callbacks; the consume stage is pointed at m->transient_environment.
+ *
+ * Returns 0 if there was nothing to do (test run without MANAGER_TEST_RUN_ENV_GENERATORS, or no
+ * generator directory exists), a negative value on allocation failure, and otherwise whatever
+ * execute_directories() returned. */
+static int manager_run_environment_generators(Manager *m) {
+        char **scratch = NULL; /* this is only used in the forked process, no cleanup here */
+        _cleanup_strv_free_ char **dirs = NULL;
+        void* callback_args[] = {
+                [STDOUT_GENERATE] = &scratch,
+                [STDOUT_COLLECT] = &scratch,
+                [STDOUT_CONSUME] = &m->transient_environment,
+        };
+        int r;
+
+        if (MANAGER_IS_TEST_RUN(m) && (m->test_run_flags & MANAGER_TEST_RUN_ENV_GENERATORS) == 0)
+                return 0;
+
+        dirs = env_generator_binary_paths(m->runtime_scope);
+        if (!dirs)
+                return log_oom();
+
+        if (!generator_path_any((const char* const*) dirs))
+                return 0;
+
+        WITH_UMASK(0022) {
+                r = execute_directories(
+                                (const char* const*) dirs,
+                                DEFAULT_TIMEOUT_USEC,
+                                gather_environment,
+                                callback_args,
+                                NULL,
+                                m->transient_environment,
+                                EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
+        }
+
+        return r;
+}
+
+/* Builds the environment block handed to generators: a copy of the transient environment, extended
+ * by a few $SYSTEMD_* variables describing the execution environment.
+ *
+ * On success returns 0 and stores the newly allocated string vector in *ret, which the caller then
+ * owns. On failure returns a negative errno-style value and leaves *ret untouched. Failing to detect
+ * (confidential) virtualization is not an error; the respective variable is simply not set. */
+static int build_generator_environment(Manager *m, char ***ret) {
+        _cleanup_strv_free_ char **env = NULL;
+        ConfidentialVirtualization cvirt;
+        Virtualization virt;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        /* Generators oftentimes want to know some basic facts about the environment they run in, in
+         * order to adjust generated units to that. Let's pass down some bits of information that are
+         * easy for us to determine (but a bit harder for generator scripts to determine), as
+         * environment variables. */
+
+        env = strv_copy(m->transient_environment);
+        if (!env)
+                return -ENOMEM;
+
+        r = strv_env_assign(&env, "SYSTEMD_SCOPE", runtime_scope_to_string(m->runtime_scope));
+        if (r < 0)
+                return r;
+
+        if (MANAGER_IS_SYSTEM(m)) {
+                /* Note that $SYSTEMD_IN_INITRD may be used to override the initrd detection in much of
+                 * our codebase. This is hence more than purely informational. It will shortcut
+                 * detection of the initrd state if generators invoke our own tools. But that's OK, as
+                 * it would come to the same results (hopefully). */
+                r = strv_env_assign(&env, "SYSTEMD_IN_INITRD", one_zero(in_initrd()));
+                if (r < 0)
+                        return r;
+
+                /* A negative first_boot value means "not set", then the variable is left out. */
+                if (m->first_boot >= 0) {
+                        r = strv_env_assign(&env, "SYSTEMD_FIRST_BOOT", one_zero(m->first_boot));
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        virt = detect_virtualization();
+        if (virt > 0) {
+                const char *prefix, *value;
+
+                /* The value is "<class>:<technology>", with an empty class if it is neither a VM nor
+                 * a container. */
+                if (VIRTUALIZATION_IS_VM(virt))
+                        prefix = "vm:";
+                else if (VIRTUALIZATION_IS_CONTAINER(virt))
+                        prefix = "container:";
+                else
+                        prefix = ":";
+
+                value = strjoina(prefix, virtualization_to_string(virt));
+
+                r = strv_env_assign(&env, "SYSTEMD_VIRTUALIZATION", value);
+                if (r < 0)
+                        return r;
+        } else if (virt < 0)
+                log_debug_errno(virt, "Failed to detect virtualization, ignoring: %m");
+
+        cvirt = detect_confidential_virtualization();
+        if (cvirt > 0) {
+                r = strv_env_assign(&env, "SYSTEMD_CONFIDENTIAL_VIRTUALIZATION", confidential_virtualization_to_string(cvirt));
+                if (r < 0)
+                        return r;
+        } else if (cvirt < 0)
+                log_debug_errno(cvirt, "Failed to detect confidential virtualization, ignoring: %m");
+
+        r = strv_env_assign(&env, "SYSTEMD_ARCHITECTURE", architecture_to_string(uname_architecture()));
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(env);
+        return 0;
+}
+
+/* Executes all generators found in 'paths', passing the three generator output directories from
+ * m->lookup_paths as arguments and the environment from build_generator_environment().
+ *
+ * If 'remount_ro' is true, most of the file system tree is bind-remounted read-only first; a failure
+ * to do so is logged and ignored. Returns a negative value if the environment cannot be built, and
+ * otherwise the result of execute_directories(). */
+static int manager_execute_generators(Manager *m, char **paths, bool remount_ro) {
+        _cleanup_strv_free_ char **env = NULL;
+        const char *generator_argv[] = {
+                NULL, /* Leave this empty, execute_directory() will fill something in */
+                m->lookup_paths.generator,
+                m->lookup_paths.generator_early,
+                m->lookup_paths.generator_late,
+                NULL,
+        };
+        int r;
+
+        r = build_generator_environment(m, &env);
+        if (r < 0)
+                return log_error_errno(r, "Failed to build generator environment: %m");
+
+        if (remount_ro) {
+                /* Remount most of the filesystem tree read-only. We leave /sys/ as-is, because our
+                 * code checks whether it is read-only to detect containerized execution environments.
+                 * We leave /run/ as-is too, because that's where our output goes. We also leave
+                 * /proc/ and /dev/shm/ because they're API, and /tmp/ that safe_fork() mounted for
+                 * us. */
+                r = bind_remount_recursive(
+                                "/",
+                                MS_RDONLY,
+                                MS_RDONLY,
+                                STRV_MAKE("/sys", "/run", "/proc", "/dev/shm", "/tmp"));
+                if (r < 0)
+                        log_warning_errno(r, "Read-only bind remount failed, ignoring: %m");
+        }
+
+        BLOCK_WITH_UMASK(0022);
+
+        return execute_directories(
+                        (const char* const*) paths,
+                        DEFAULT_TIMEOUT_USEC,
+                        /* callbacks= */ NULL,
+                        /* callback_args= */ NULL,
+                        (char**) generator_argv,
+                        env,
+                        EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
+}
+
+/* Runs the unit generators for the manager's runtime scope.
+ *
+ * The system manager runs them in a forked child with its own mount namespace (and a private /tmp/
+ * if /tmp/ is a directory), where most of the tree is remounted read-only. The user manager, and the
+ * system manager if forking fails with a privilege error or -EINVAL, run them directly instead.
+ *
+ * Returns 0 if there was nothing to do, otherwise the result of the last step taken. Whenever the
+ * generator output directories have been created, lookup_paths_trim_generator() is called on them
+ * before returning. */
+static int manager_run_generators(Manager *m) {
+        ForkFlags flags = FORK_RESET_SIGNALS | FORK_WAIT | FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE;
+        _cleanup_strv_free_ char **paths = NULL;
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_GENERATORS))
+                return 0;
+
+        paths = generator_binary_paths(m->runtime_scope);
+        if (!paths)
+                return log_oom();
+
+        /* No generator directory exists: skip creating the output directories altogether. */
+        if (!generator_path_any((const char* const*) paths))
+                return 0;
+
+        r = lookup_paths_mkdir_generator(&m->lookup_paths);
+        if (r < 0) {
+                log_error_errno(r, "Failed to create generator directories: %m");
+                goto finish;
+        }
+
+        /* If we are the system manager, we fork and invoke the generators in a sanitized mount namespace. If
+         * we are the user manager, let's just execute the generators directly. We might not have the
+         * necessary privileges, and the system manager has already mounted /tmp/ and everything else for us.
+         */
+        if (MANAGER_IS_USER(m)) {
+                r = manager_execute_generators(m, paths, /* remount_ro= */ false);
+                goto finish;
+        }
+
+        /* On some systems /tmp/ doesn't exist, and on some other systems we cannot create it at all. Avoid
+         * trying to mount a private tmpfs on it as there's no one size fits all. */
+        if (is_dir("/tmp", /* follow= */ false) > 0)
+                flags |= FORK_PRIVATE_TMP;
+
+        r = safe_fork("(sd-gens)", flags, NULL);
+        if (r == 0) {
+                /* Child: run the generators and report the outcome through the exit status only. */
+                r = manager_execute_generators(m, paths, /* remount_ro= */ true);
+                _exit(r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+        }
+        if (r < 0) {
+                /* Any other kind of fork failure is fatal for this generator run. */
+                if (!ERRNO_IS_PRIVILEGE(r) && r != -EINVAL) {
+                        log_error_errno(r, "Failed to fork off sandboxing environment for executing generators: %m");
+                        goto finish;
+                }
+
+                /* Failed to fork with new mount namespace? Maybe, running in a container environment with
+                 * seccomp or without capability.
+                 *
+                 * We also allow -EINVAL to allow running without CLONE_NEWNS.
+                 *
+                 * Also, when running on non-native userland architecture via systemd-nspawn and
+                 * qemu-user-static QEMU-emulator, clone() with CLONE_NEWNS fails with EINVAL, see
+                 * https://github.com/systemd/systemd/issues/28901.
+                 */
+                log_debug_errno(r,
+                                "Failed to fork off sandboxing environment for executing generators. "
+                                "Falling back to execute generators without sandboxing: %m");
+                /* No read-only remount here, since we are running in our own mount namespace. */
+                r = manager_execute_generators(m, paths, /* remount_ro= */ false);
+        }
+
+finish:
+        lookup_paths_trim_generator(&m->lookup_paths);
+        return r;
+}
+
+/* Merges the assignments in 'plus' into the manager's transient environment and sanitizes the
+ * result. Returns 0 if 'plus' is empty, the (negative) result of log_oom() on allocation failure, and
+ * otherwise the result of strv_free_and_replace(). */
+int manager_transient_environment_add(Manager *m, char **plus) {
+        char **merged;
+
+        assert(m);
+
+        if (strv_isempty(plus))
+                return 0;
+
+        merged = strv_env_merge(m->transient_environment, plus);
+        if (!merged)
+                return log_oom();
+
+        sanitize_environment(merged);
+
+        /* Frees the old block and hands ownership of the merged one to the manager. */
+        return strv_free_and_replace(m->transient_environment, merged);
+}
+
+/* Updates the manager's client environment: first drops the variables listed in 'minus', then merges
+ * in the assignments from 'plus', and stores the sanitized result.
+ *
+ * Returns 0 on success (also if both lists are empty, in which case nothing changes) and -ENOMEM on
+ * allocation failure, in which case the stored environment is left as it was. */
+int manager_client_environment_modify(
+                Manager *m,
+                char **minus,
+                char **plus) {
+
+        char **reduced = NULL, **merged = NULL, **result;
+
+        assert(m);
+
+        if (strv_isempty(minus) && strv_isempty(plus))
+                return 0;
+
+        /* 'result' always points to the most recent stage: the stored block, then the block with
+         * 'minus' removed, then the block with 'plus' merged in. */
+        result = m->client_environment;
+
+        if (!strv_isempty(minus)) {
+                reduced = strv_env_delete(result, 1, minus);
+                if (!reduced)
+                        return -ENOMEM;
+
+                result = reduced;
+        }
+
+        if (!strv_isempty(plus)) {
+                merged = strv_env_merge(result, plus);
+                if (!merged) {
+                        strv_free(reduced);
+                        return -ENOMEM;
+                }
+
+                result = merged;
+        }
+
+        /* Release every stage that did not end up being the final result. */
+        if (result != m->client_environment)
+                strv_free(m->client_environment);
+
+        if (reduced != result)
+                strv_free(reduced);
+        if (merged != result)
+                strv_free(merged);
+
+        m->client_environment = sanitize_environment(result);
+        return 0;
+}
+
+/* Stores in *ret a newly allocated merge of the transient environment and the client environment
+ * (passed to strv_env_merge() in that order). The caller owns the result. Returns 0 on success and
+ * -ENOMEM on allocation failure, in which case *ret is not modified. */
+int manager_get_effective_environment(Manager *m, char ***ret) {
+        char **merged;
+
+        assert(m);
+        assert(ret);
+
+        merged = strv_env_merge(m->transient_environment, m->client_environment);
+        if (!merged)
+                return -ENOMEM;
+
+        *ret = merged;
+        return 0;
+}
+
+/* Copies the settings in 'defaults' into m->defaults.
+ *
+ * Everything that needs an allocation (the SMACK label and the resource limits) is duplicated up
+ * front, so that on failure a negative errno-style value is returned with m->defaults unmodified.
+ * Returns 0 on success. */
+int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults) {
+        _cleanup_free_ char *label = NULL;
+        struct rlimit *copied[_RLIMIT_MAX];
+        const char *wanted;
+        bool explicitly_unset;
+        int r;
+
+        assert(m);
+        assert(defaults);
+
+        /* A label of "/" means "no label", and suppresses the compile-time default as well. */
+        wanted = defaults->smack_process_label;
+        explicitly_unset = streq_ptr(wanted, "/");
+
+#ifdef SMACK_DEFAULT_PROCESS_LABEL
+        if (!wanted)
+                wanted = SMACK_DEFAULT_PROCESS_LABEL;
+#endif
+
+        if (!explicitly_unset && wanted) {
+                label = strdup(wanted);
+                if (!label)
+                        return -ENOMEM;
+        }
+
+        r = rlimit_copy_all(copied, defaults->rlimit);
+        if (r < 0)
+                return r;
+
+        /* Nothing below can fail anymore. */
+
+        /* Standard output/error */
+        m->defaults.std_output = defaults->std_output;
+        m->defaults.std_error = defaults->std_error;
+
+        /* Timeouts */
+        m->defaults.restart_usec = defaults->restart_usec;
+        m->defaults.timeout_start_usec = defaults->timeout_start_usec;
+        m->defaults.timeout_stop_usec = defaults->timeout_stop_usec;
+        m->defaults.timeout_abort_usec = defaults->timeout_abort_usec;
+        m->defaults.timeout_abort_set = defaults->timeout_abort_set;
+        m->defaults.device_timeout_usec = defaults->device_timeout_usec;
+
+        /* Start rate limiting */
+        m->defaults.start_limit_interval = defaults->start_limit_interval;
+        m->defaults.start_limit_burst = defaults->start_limit_burst;
+
+        /* Accounting */
+        m->defaults.cpu_accounting = defaults->cpu_accounting;
+        m->defaults.memory_accounting = defaults->memory_accounting;
+        m->defaults.io_accounting = defaults->io_accounting;
+        m->defaults.blockio_accounting = defaults->blockio_accounting;
+        m->defaults.tasks_accounting = defaults->tasks_accounting;
+        m->defaults.ip_accounting = defaults->ip_accounting;
+
+        m->defaults.tasks_max = defaults->tasks_max;
+        m->defaults.timer_accuracy_usec = defaults->timer_accuracy_usec;
+
+        /* OOM handling */
+        m->defaults.oom_policy = defaults->oom_policy;
+        m->defaults.oom_score_adjust = defaults->oom_score_adjust;
+        m->defaults.oom_score_adjust_set = defaults->oom_score_adjust_set;
+
+        /* Memory pressure */
+        m->defaults.memory_pressure_watch = defaults->memory_pressure_watch;
+        m->defaults.memory_pressure_threshold_usec = defaults->memory_pressure_threshold_usec;
+
+        /* Finally, move over the two allocated items: free the old values and let m->defaults take
+         * ownership of the copies made above. */
+        free_and_replace(m->defaults.smack_process_label, label);
+
+        rlimit_free_all(m->defaults.rlimit);
+        memcpy(m->defaults.rlimit, copied, sizeof copied);
+
+        return 0;
+}
+
+/* Brings the manager's bus connections in line with whether D-Bus is running: connects the API bus
+ * (and, for the system manager, the system bus) if it is, and tears them down if it is not. Nothing
+ * is done while reloading. All results are ignored. */
+void manager_recheck_dbus(Manager *m) {
+        assert(m);
+
+        /* Connects to the bus if the dbus service and socket are running. If we are running in user
+         * mode this is all it does. In system mode we'll also connect to the system bus (which will
+         * most likely just reuse the connection of the API bus). That's because the system bus after
+         * all runs as service of the system instance, while in the user instance we can assume it's
+         * already there. */
+
+        if (MANAGER_IS_RELOADING(m))
+                return; /* don't check while we are reloading… */
+
+        if (!manager_dbus_is_running(m, false)) {
+                (void) bus_done_api(m);
+
+                if (MANAGER_IS_SYSTEM(m))
+                        (void) bus_done_system(m);
+
+                return;
+        }
+
+        (void) bus_init_api(m);
+
+        if (MANAGER_IS_SYSTEM(m))
+                (void) bus_init_system(m);
+}
+
+/* Returns true if the journal can be assumed to be fully up: always for user managers, never in test
+ * runs, and for the system manager only if both the journald socket unit is in SOCKET_RUNNING state
+ * and the journald service unit is in SERVICE_RUNNING or SERVICE_RELOAD state. */
+static bool manager_journal_is_running(Manager *m) {
+        Unit *sock, *service;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return false;
+
+        /* If we are the user manager we can safely assume that the journal is up */
+        if (!MANAGER_IS_SYSTEM(m))
+                return true;
+
+        /* Check that the socket is not only up, but in RUNNING state */
+        sock = manager_get_unit(m, SPECIAL_JOURNALD_SOCKET);
+        if (!sock || SOCKET(sock)->state != SOCKET_RUNNING)
+                return false;
+
+        /* Similar, check if the daemon itself is fully up, too */
+        service = manager_get_unit(m, SPECIAL_JOURNALD_SERVICE);
+        if (!service)
+                return false;
+
+        return IN_SET(SERVICE(service)->state, SERVICE_RELOAD, SERVICE_RUNNING);
+}
+
+/* Writes "on" to the kernel.printk_devkmsg sysctl, in order to disable the kernel's printk
+ * ratelimit. Failures are logged at debug level only.
+ *
+ * Logging to /dev/kmsg is most useful during early boot and shutdown, where normal logging mechanisms
+ * are not available. The semantics of this sysctl are such that any kernel command-line setting takes
+ * precedence. */
+void disable_printk_ratelimit(void) {
+        int r = sysctl_write("kernel/printk_devkmsg", "on");
+
+        if (r < 0)
+                log_debug_errno(r, "Failed to set sysctl kernel.printk_devkmsg=on: %m");
+}
+
+/* For PID 1 only: allows or prohibits IPC-based logging depending on whether the journal is fully
+ * up, and reopens the log afterwards. Nothing is done while reloading. */
+void manager_recheck_journal(Manager *m) {
+        bool journal_up;
+
+        assert(m);
+
+        /* Don't bother with this unless we are in the special situation of being PID 1. Also, don't
+         * check this while we are reloading, things might still change. */
+        if (getpid_cached() != 1 || MANAGER_IS_RELOADING(m))
+                return;
+
+        /* The journal is fully and entirely up? If so, let's permit logging to it, if that's
+         * configured. If the journal is down, don't ever log to it, otherwise we might end up
+         * deadlocking ourselves as we might trigger an activation ourselves we can't fulfill. */
+        journal_up = manager_journal_is_running(m);
+
+        log_set_prohibit_ipc(!journal_up);
+        log_open();
+}
+
+/* Returns the show-status mode in effect: the override if one is set, the configured mode otherwise.
+ * User managers always get _SHOW_STATUS_INVALID. */
+static ShowStatus manager_get_show_status(Manager *m) {
+        assert(m);
+
+        if (MANAGER_IS_USER(m))
+                return _SHOW_STATUS_INVALID;
+
+        return m->show_status_overridden != _SHOW_STATUS_INVALID
+                ? m->show_status_overridden
+                : m->show_status;
+}
+
+/* Returns what show_status_on() reports for the show-status mode currently in effect. */
+bool manager_get_show_status_on(Manager *m) {
+        ShowStatus effective;
+
+        assert(m);
+
+        effective = manager_get_show_status(m);
+
+        return show_status_on(effective);
+}
+
+/* Creates (b == true) or removes (b == false) the /run/systemd/show-status marker file. Failures are
+ * ignored. */
+static void set_show_status_marker(bool b) {
+        if (!b) {
+                (void) unlink("/run/systemd/show-status");
+                return;
+        }
+
+        (void) touch("/run/systemd/show-status");
+}
+
+void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason) {
+        assert(m);
+        assert(reason);
+        assert(mode >= 0 && mode < _SHOW_STATUS_MAX);
+
+        /* Changes the regular (i.e. non-override) show-status mode. This is a NOP for user managers,
+         * and if the mode already is the requested one. */
+        if (MANAGER_IS_USER(m) || mode == m->show_status)
+                return;
+
+        /* Remember the new mode in any case, but only log about it and update the marker file if no
+         * override is in effect. */
+        m->show_status = mode;
+
+        if (m->show_status_overridden != _SHOW_STATUS_INVALID)
+                return;
+
+        bool on = show_status_on(mode);
+
+        log_debug("%s (%s) showing of status (%s).",
+                  on ? "Enabling" : "Disabling",
+                  strna(show_status_to_string(mode)),
+                  reason);
+        set_show_status_marker(on);
+}
+
+void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason) {
+        assert(m);
+        assert(mode < _SHOW_STATUS_MAX);
+
+        /* Installs an override for the show-status mode, or drops it again if _SHOW_STATUS_INVALID is
+         * passed. NOP for user managers, and if the override already has the requested value. */
+        if (MANAGER_IS_USER(m) || mode == m->show_status_overridden)
+                return;
+
+        bool restoring = mode == _SHOW_STATUS_INVALID;
+        m->show_status_overridden = mode;
+
+        /* When the override is dropped, the regularly configured mode is what counts again */
+        ShowStatus effective = restoring ? m->show_status : mode;
+
+        log_debug("%s (%s) showing of status (%s).",
+                  restoring ? "Restoring" : "Overriding",
+                  strna(show_status_to_string(effective)),
+                  reason);
+
+        set_show_status_marker(show_status_on(effective));
+}
+
+const char *manager_get_confirm_spawn(Manager *m) {
+        static int last_errno = 0;
+        struct stat st;
+        int r;
+
+        assert(m);
+
+        /* Here's the deal: we want to test the validity of the console, but don't want PID 1 to go
+         * through the whole console acquisition process, which might block. We also want to warn the
+         * user only once if something is wrong with the console, hence we cannot leave the sanity
+         * checks to the children we spawn. So here we simply do some really basic tests, to
+         * hopefully trap the common errors.
+         *
+         * If the console suddenly disappears by the time our children actually try to use it, they
+         * will simply fail to acquire it and a positive answer will be assumed. New children will
+         * fall back to /dev/console though.
+         *
+         * Note: TTYs are devices that can come and go any time, and frequently aren't available
+         * yet during early boot (consider a USB rs232 dongle...). If for any reason the configured
+         * console is not ready, we fall back to the default console. */
+
+        if (!m->confirm_spawn || path_equal(m->confirm_spawn, "/dev/console"))
+                return m->confirm_spawn;
+
+        if (stat(m->confirm_spawn, &st) < 0)
+                r = -errno;
+        else if (!S_ISCHR(st.st_mode))
+                r = -ENOTTY; /* it exists, but is not a character device */
+        else {
+                /* Passed the basic tests. Also forget about any earlier failure, so that the next
+                 * failure results in a warning again. */
+                last_errno = 0;
+                return m->confirm_spawn;
+        }
+
+        /* The static variable remembers the result of the last warning across calls: we only warn
+         * if the current error differs from it, so that the same problem is not reported over and
+         * over. Note that only stat() is done here, even though the message talks about opening. */
+        if (last_errno != r)
+                last_errno = log_warning_errno(r, "Failed to open %s, using default console: %m", m->confirm_spawn);
+
+        return "/dev/console";
+}
+
+void manager_set_first_boot(Manager *m, bool b) {
+        assert(m);
+
+        /* Only the system manager tracks this; and there is nothing to do if the state is unchanged */
+        if (!MANAGER_IS_SYSTEM(m) || m->first_boot == (int) b)
+                return;
+
+        m->first_boot = b;
+
+        /* Bring the flag file in sync with the new state, ignoring failures */
+        if (b)
+                (void) touch("/run/systemd/first-boot");
+        else
+                (void) unlink("/run/systemd/first-boot");
+}
+
+void manager_disable_confirm_spawn(void) {
+        (void) touch("/run/systemd/confirm_spawn_disabled"); /* create the flag file; the result of touch() is deliberately ignored */
+}
+
+static bool manager_should_show_status(Manager *m, StatusType type) {
+        assert(m);
+
+        /* Status output is only generated by the system manager, and only as long as console output
+         * has not been turned off. */
+        if (!MANAGER_IS_SYSTEM(m) || m->no_console_output)
+                return false;
+
+        /* Furthermore, only while we are initializing, starting up or shutting down. */
+        ManagerState state = manager_state(m);
+        if (state != MANAGER_INITIALIZING && state != MANAGER_STARTING && state != MANAGER_STOPPING)
+                return false;
+
+        /* If we cannot find out the status properly, just proceed. */
+        if (type != STATUS_TYPE_EMERGENCY && manager_check_ask_password(m) > 0)
+                return false;
+
+        /* Notices get through unless the configured mode is "no"; all else follows the effective mode */
+        return (type == STATUS_TYPE_NOTICE && m->show_status != SHOW_STATUS_NO) ||
+                manager_get_show_status_on(m);
+}
+
+void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) {
+        bool ephemeral = type == STATUS_TYPE_EPHEMERAL;
+        va_list ap;
+
+        /* If m is NULL, assume we're after shutdown and let the messages through. */
+        if (m) {
+                if (!manager_should_show_status(m, type))
+                        return;
+                /* XXX We should totally drop the check for ephemeral here and thus effectively make 'Type=idle' pointless. */
+                if (ephemeral && m->n_on_console > 0)
+                        return;
+        }
+
+        va_start(ap, format);
+        status_vprintf(status, SHOW_STATUS_ELLIPSIZE|(ephemeral ? SHOW_STATUS_EPHEMERAL : 0), format, ap);
+        va_end(ap);
+}
+
+Set* manager_get_units_requiring_mounts_for(Manager *m, const char *path) {
+        assert(m);
+        assert(path);
+
+        /* Returns what m->units_requiring_mounts_for has stored for the specified path. The root
+         * directory is looked up under the empty string, hence "/" is translated accordingly. */
+        const char *key = path_equal(path, "/") ? "" : path;
+        return hashmap_get(m->units_requiring_mounts_for, key);
+}
+
+int manager_update_failed_units(Manager *m, Unit *u, bool failed) {
+        unsigned n_before, n_after;
+
+        assert(m);
+        assert(u->manager == m);
+
+        /* Adds the unit to, or removes it from, the set of failed units, and then calls
+         * bus_manager_send_change_signal() if the size of the set changed as result. */
+        n_before = set_size(m->failed_units);
+
+        if (!failed)
+                (void) set_remove(m->failed_units, u);
+        else if (set_ensure_put(&m->failed_units, NULL, u) < 0)
+                return log_oom();
+
+        n_after = set_size(m->failed_units);
+        if (n_after != n_before)
+                bus_manager_send_change_signal(m);
+
+        return 0;
+}
+
+ManagerState manager_state(Manager *m) {
+        Unit *target;
+
+        assert(m);
+
+        /* Derives the externally visible state of the manager. The checks below are ordered by
+         * priority: the first one that matches determines the result. */
+
+        /* Special shutdown target active or queued? Then we are in shutdown state. */
+        target = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET);
+        if (target && unit_active_or_pending(target))
+                return MANAGER_STOPPING;
+
+        /* Did we never finish booting so far? Then we are still starting up: "initializing" until the
+         * basic target is active (or reloading), "starting" from then on. */
+        if (!MANAGER_IS_FINISHED(m)) {
+                target = manager_get_unit(m, SPECIAL_BASIC_TARGET);
+
+                return (target && UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(target))) ?
+                        MANAGER_STARTING : MANAGER_INITIALIZING;
+        }
+
+        /* On the system manager: rescue or emergency target active or queued? Then we are in
+         * maintenance state. */
+        if (MANAGER_IS_SYSTEM(m)) {
+                target = manager_get_unit(m, SPECIAL_RESCUE_TARGET);
+                if (target && unit_active_or_pending(target))
+                        return MANAGER_MAINTENANCE;
+
+                target = manager_get_unit(m, SPECIAL_EMERGENCY_TARGET);
+                if (target && unit_active_or_pending(target))
+                        return MANAGER_MAINTENANCE;
+        }
+
+        /* Finally: any failed units mean degraded mode, otherwise we are simply running. */
+        return set_size(m->failed_units) > 0 ? MANAGER_DEGRADED : MANAGER_RUNNING;
+}
+
+static void manager_unref_uid_internal(
+                Hashmap *uid_refs,
+                uid_t uid,
+                bool destroy_now,
+                int (*_clean_ipc)(uid_t uid)) {
+
+        uint32_t value, refs;
+
+        assert(uid_is_valid(uid));
+        assert(_clean_ipc);
+
+        /* Generic implementation backing both manager_unref_uid() and manager_unref_gid(), relying on
+         * uid_t and gid_t being defined the same way, with the same validity rules.
+         *
+         * The hashmap maps each UID/GID to a 32-bit value that is a reference counter, except for
+         * its highest bit, which is a flag marking UIDs/GIDs whose IPC objects shall be removed when
+         * the last reference is dropped. The flag is turned on once at least one reference from a unit
+         * where RemoveIPC= is set is added for the UID/GID, and it is reset when the reference counter
+         * drops to 0 again. */
+
+        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+        assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+        if (uid == 0) /* We don't keep track of root, and will never destroy it */
+                return;
+
+        value = PTR_TO_UINT32(hashmap_get(uid_refs, UID_TO_PTR(uid)));
+
+        refs = value & ~DESTROY_IPC_FLAG;
+        assert(refs > 0);
+        refs--;
+
+        if (!destroy_now || refs > 0) { /* keep the entry: store the decremented counter, flag unchanged */
+                value = refs | (value & DESTROY_IPC_FLAG);
+                assert_se(hashmap_update(uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(value)) >= 0);
+                return;
+        }
+
+        hashmap_remove(uid_refs, UID_TO_PTR(uid));
+        if (value & DESTROY_IPC_FLAG) {
+                log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.",
+                          _clean_ipc == clean_ipc_by_uid ? "UID" : "GID",
+                          uid);
+                (void) _clean_ipc(uid);
+        }
+}
+
+void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) {
+        manager_unref_uid_internal(m->uid_refs, uid, destroy_now, clean_ipc_by_uid); /* UID flavour: m->uid_refs + clean_ipc_by_uid() */
+}
+
+void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) {
+        manager_unref_uid_internal(m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid); /* GID flavour: the shared helper handles GIDs as if they were UIDs */
+}
+
+static int manager_ref_uid_internal(
+                Hashmap **uid_refs,
+                uid_t uid,
+                bool clean_ipc) {
+
+        uint32_t old_value, new_value, refs;
+        int r;
+
+        assert(uid_refs);
+        assert(uid_is_valid(uid));
+
+        /* Generic implementation backing both manager_ref_uid() and manager_ref_gid(), relying on
+         * uid_t and gid_t being defined the same way, with the same validity rules. Returns
+         * -EOVERFLOW if the reference counter cannot be increased any further. */
+
+        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+        assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+        if (uid == 0) /* We don't keep track of root, and will never destroy it */
+                return 0;
+
+        r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops);
+        if (r < 0)
+                return r;
+
+        old_value = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid)));
+
+        /* Increase the counter part; if that spills over into the flag bit, the counter overflowed */
+        refs = (old_value & ~DESTROY_IPC_FLAG) + 1;
+        if (refs & DESTROY_IPC_FLAG)
+                return -EOVERFLOW;
+
+        /* A flag that is already set is retained, even if this reference does not ask for it */
+        new_value = refs | (old_value & DESTROY_IPC_FLAG) | (clean_ipc ? DESTROY_IPC_FLAG : 0);
+
+        return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(new_value));
+}
+
+int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) {
+        return manager_ref_uid_internal(&m->uid_refs, uid, clean_ipc); /* UID flavour: counts in m->uid_refs */
+}
+
+int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) {
+        return manager_ref_uid_internal(&m->gid_refs, (uid_t) gid, clean_ipc); /* GID flavour: counts in m->gid_refs, GID handled as if it was a UID */
+}
+
+static void manager_vacuum_uid_refs_internal(
+                Hashmap *uid_refs,
+                int (*_clean_ipc)(uid_t uid)) {
+
+        void *value, *key;
+
+        assert(_clean_ipc);
+
+        /* Removes all entries whose reference counter is zero from the hashmap, after cleaning up the
+         * IPC objects of those that carry the destroy flag. */
+
+        HASHMAP_FOREACH_KEY(value, key, uid_refs) {
+                uint32_t c = PTR_TO_UINT32(value);
+                uid_t uid = PTR_TO_UID(key);
+
+                if ((c & ~DESTROY_IPC_FLAG) > 0) /* still referenced, leave it alone */
+                        continue;
+
+                if (c & DESTROY_IPC_FLAG) {
+                        log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.",
+                                  _clean_ipc == clean_ipc_by_uid ? "UID" : "GID",
+                                  uid);
+                        (void) _clean_ipc(uid);
+                }
+
+                /* What gets removed must be precisely the entry we just looked at */
+                assert_se(hashmap_remove(uid_refs, key) == value);
+        }
+}
+
+static void manager_vacuum_uid_refs(Manager *m) {
+        manager_vacuum_uid_refs_internal(m->uid_refs, clean_ipc_by_uid); /* UID flavour of the vacuuming */
+}
+
+static void manager_vacuum_gid_refs(Manager *m) {
+        manager_vacuum_uid_refs_internal(m->gid_refs, clean_ipc_by_gid); /* GID flavour of the vacuuming */
+}
+
+static void manager_vacuum(Manager *m) {
+        assert(m);
+
+        /* Release any dynamic users no longer referenced */
+        dynamic_user_vacuum(m, true);
+
+        /* Drop UID/GID entries whose reference counter is zero, and clean up the IPC of those flagged for it */
+        manager_vacuum_uid_refs(m);
+        manager_vacuum_gid_refs(m);
+
+        /* Release any shared execution runtimes no longer referenced */
+        exec_shared_runtime_vacuum(m);
+}
+
+int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        struct lookup_message {
+                uid_t uid;
+                gid_t gid;
+                char unit_name[UNIT_NAME_MAX+1];
+        } _packed_ msg;
+
+        const size_t header_size = offsetof(struct lookup_message, unit_name);
+        Manager *m = userdata;
+        size_t name_length;
+        ssize_t received;
+        Unit *u;
+
+        assert_se(source);
+        assert_se(m);
+
+        /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the
+         * resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that
+         * it can add a reference to the UID/GID so that it can destroy the UID/GID's IPC objects when the
+         * reference counter drops to 0. */
+
+        received = recv(fd, &msg, sizeof(msg), MSG_DONTWAIT);
+        if (received < 0) {
+                if (!ERRNO_IS_TRANSIENT(errno))
+                        return log_error_errno(errno, "Failed to read from user lookup fd: %m");
+                return 0;
+        }
+
+        if ((size_t) received <= header_size) {
+                log_warning("Received too short user lookup message, ignoring.");
+                return 0;
+        }
+
+        if ((size_t) received > header_size + UNIT_NAME_MAX) {
+                log_warning("Received too long user lookup message, ignoring.");
+                return 0;
+        }
+
+        if (!uid_is_valid(msg.uid) && !gid_is_valid(msg.gid)) {
+                log_warning("Got user lookup message with invalid UID/GID pair, ignoring.");
+                return 0;
+        }
+
+        /* The name part must not contain any NUL byte; we append the terminator ourselves below */
+        name_length = (size_t) received - header_size;
+        if (memchr(msg.unit_name, 0, name_length)) {
+                log_warning("Received lookup message with embedded NUL character, ignoring.");
+                return 0;
+        }
+
+        msg.unit_name[name_length] = 0;
+        u = manager_get_unit(m, msg.unit_name);
+        if (!u) {
+                log_debug("Got user lookup message but unit doesn't exist, ignoring.");
+                return 0;
+        }
+
+        log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, msg.uid, msg.gid);
+        unit_notify_user_lookup(u, msg.uid, msg.gid);
+        return 0;
+}
+
+static int short_uid_range(const char *path) {
+        _cleanup_(uid_range_freep) UidRange *range = NULL;
+        int r;
+
+        assert(path);
+
+        /* Used to taint systemd if the UID range assigned to this environment doesn't at least cover
+         * 0…65534, i.e. from root to nobody: returns 1 in that case, 0 if it is covered or if loading
+         * the range is not supported, and a negative errno-style value if loading fails otherwise. */
+        r = uid_range_load_userns(&range, path);
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load %s: %m", path);
+
+        return uid_range_covers(range, 0, 65535) ? 0 : 1;
+}
+
+char* manager_taint_string(const Manager *m) {
+        /* Returns a "taint string", e.g. "local-hwclock:var-run-bad", i.e. all detected taints joined
+         * with ":" via strv_join(). Only things that are detected at runtime should be tagged here. For
+         * stuff that is known during compilation, emit a warning in the configuration phase. */
+
+        assert(m);
+
+        const char* stage[12] = {};
+        size_t n = 0;
+
+        _cleanup_free_ char *usrbin = NULL;
+        if (readlink_malloc("/bin", &usrbin) < 0 || !PATH_IN_SET(usrbin, "usr/bin", "/usr/bin"))
+                stage[n++] = "unmerged-usr";
+
+        if (access("/proc/cgroups", F_OK) < 0)
+                stage[n++] = "cgroups-missing";
+
+        if (cg_all_unified() == 0)
+                stage[n++] = "cgroupsv1";
+
+        if (clock_is_localtime(NULL) > 0)
+                stage[n++] = "local-hwclock";
+
+        if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0)
+                stage[n++] = "support-ended";
+
+        _cleanup_free_ char *destination = NULL;
+        if (readlink_malloc("/var/run", &destination) < 0 ||
+            !PATH_IN_SET(destination, "../run", "/run"))
+                stage[n++] = "var-run-bad";
+
+        _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL;
+        if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 &&
+            !streq(overflowuid, "65534"))
+                stage[n++] = "overflowuid-not-65534";
+        if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 &&
+            !streq(overflowgid, "65534"))
+                stage[n++] = "overflowgid-not-65534";
+
+        struct utsname uts;
+        assert_se(uname(&uts) >= 0);
+        if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+                stage[n++] = "old-kernel";
+
+        if (short_uid_range("/proc/self/uid_map") > 0)
+                stage[n++] = "short-uid-range";
+        if (short_uid_range("/proc/self/gid_map") > 0)
+                stage[n++] = "short-gid-range";
+
+        assert(n < ELEMENTSOF(stage));  /* All 11 taints may be set at once; only the last slot must stay NULL as terminator */
+
+        return strv_join((char**) stage, ":");
+}
+
+void manager_ref_console(Manager *m) {
+        assert(m);
+
+        m->n_on_console++; /* counted down again by manager_unref_console() */
+}
+
+void manager_unref_console(Manager *m) {
+        assert(m);
+        assert(m->n_on_console > 0); /* must be paired with an earlier manager_ref_console() */
+        m->n_on_console--;
+
+        if (m->n_on_console == 0)
+                m->no_console_output = false; /* unset no_console_output flag, since the console is definitely free now */
+}
+
+void manager_override_log_level(Manager *m, int level) {
+        _cleanup_free_ char *name = NULL;
+        assert(m);
+
+        /* Save the level in effect before the first override, it is what gets restored later on */
+        if (!m->log_level_overridden) {
+                m->log_level_overridden = true;
+                m->original_log_level = log_get_max_level();
+        }
+
+        (void) log_level_to_string_alloc(level, &name);
+        log_info("Setting log level to %s.", strna(name));
+        log_set_max_level(level);
+}
+
+void manager_restore_original_log_level(Manager *m) {
+        _cleanup_free_ char *name = NULL;
+        assert(m);
+
+        if (m->log_level_overridden) {
+                /* Note that the message is logged before the level is switched back */
+                (void) log_level_to_string_alloc(m->original_log_level, &name);
+                log_info("Restoring log level to original (%s).", strna(name));
+
+                log_set_max_level(m->original_log_level);
+                m->log_level_overridden = false;
+        }
+}
+
+void manager_override_log_target(Manager *m, LogTarget target) {
+        assert(m);
+
+        if (!m->log_target_overridden) { /* first override: remember what to restore later */
+                m->log_target_overridden = true;
+                m->original_log_target = log_get_target();
+        }
+        const char *name = log_target_to_string(target);
+        log_info("Setting log target to %s.", name);
+        log_set_target(target);
+}
+
+void manager_restore_original_log_target(Manager *m) {
+        assert(m);
+
+        if (m->log_target_overridden) { /* otherwise there's nothing to restore */
+                LogTarget original = m->original_log_target;
+
+                log_info("Restoring log target to original %s.", log_target_to_string(original));
+                log_set_target(original);
+                m->log_target_overridden = false;
+        }
+}
+
+ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s) {
+        /* In the initrd, shift SECURITY_START…UNITS_LOAD_FINISH by the offset to INITRD_SECURITY_START */
+        if (!in_initrd() || s < MANAGER_TIMESTAMP_SECURITY_START || s > MANAGER_TIMESTAMP_UNITS_LOAD_FINISH)
+                return s;
+
+        return s - MANAGER_TIMESTAMP_SECURITY_START + MANAGER_TIMESTAMP_INITRD_SECURITY_START;
+}
+
+int manager_allocate_idle_pipe(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Allocates the two idle pipes (fds 0+1 and 2+3 of m->idle_pipe), unless that was done before.
+         * Returns 0 if they existed already, 1 if newly allocated, negative errno-style on failure. */
+        if (m->idle_pipe[0] >= 0) {
+                for (size_t i = 1; i < 4; i++)
+                        assert(m->idle_pipe[i] >= 0);
+                return 0;
+        }
+
+        for (size_t i = 1; i < 4; i++)
+                assert(m->idle_pipe[i] < 0);
+
+        r = RET_NERRNO(pipe2(m->idle_pipe + 0, O_NONBLOCK|O_CLOEXEC));
+        if (r < 0)
+                return r;
+
+        r = RET_NERRNO(pipe2(m->idle_pipe + 2, O_NONBLOCK|O_CLOEXEC));
+        if (r >= 0)
+                return 1;
+
+        /* The second pipe failed: close the first one again, so that it's all or nothing */
+        safe_close_pair(m->idle_pipe + 0);
+        return r;
+}
+
+void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) {
+        assert(defaults);
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+
+        *defaults = (UnitDefaults) { /* overwrites the whole structure; fields not listed below end up zero/NULL */
+                .std_output = EXEC_OUTPUT_JOURNAL,
+                .std_error = EXEC_OUTPUT_INHERIT,
+                .restart_usec = DEFAULT_RESTART_USEC,
+                .timeout_start_usec = manager_default_timeout(scope), /* all four timeouts derive from the scope */
+                .timeout_stop_usec = manager_default_timeout(scope),
+                .timeout_abort_usec = manager_default_timeout(scope),
+                .timeout_abort_set = false,
+                .device_timeout_usec = manager_default_timeout(scope),
+                .start_limit_interval = DEFAULT_START_LIMIT_INTERVAL,
+                .start_limit_burst = DEFAULT_START_LIMIT_BURST,
+
+                /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU
+                 * controller to be enabled, so the default is to enable it unless we got told otherwise. */
+                .cpu_accounting = cpu_accounting_is_cheap(),
+                .memory_accounting = MEMORY_ACCOUNTING_DEFAULT,
+                .io_accounting = false,
+                .blockio_accounting = false,
+                .tasks_accounting = true,
+                .ip_accounting = false,
+
+                .tasks_max = DEFAULT_TASKS_MAX,
+                .timer_accuracy_usec = 1 * USEC_PER_MINUTE,
+
+                .memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO,
+                .memory_pressure_threshold_usec = MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
+
+                .oom_policy = OOM_STOP,
+                .oom_score_adjust_set = false,
+        };
+}
+
+void unit_defaults_done(UnitDefaults *defaults) {
+        assert(defaults);
+
+        defaults->smack_process_label = mfree(defaults->smack_process_label); /* free the label, storing mfree()'s result back in the field */
+        rlimit_free_all(defaults->rlimit); /* release the resource limit entries of the array */
+}
+
+LogTarget manager_get_executor_log_target(Manager *m) {
+        assert(m);
+
+        /* sd-executor might be what starts journald, hence as long as journald is not available tell
+         * it to go to kmsg. Otherwise it gets the log target we currently use ourselves. */
+        if (!manager_journal_is_running(m))
+                return LOG_TARGET_KMSG;
+
+        return log_get_target();
+}
+
+static const char *const manager_state_table[_MANAGER_STATE_MAX] = { /* string for each ManagerState value */
+        [MANAGER_INITIALIZING] = "initializing",
+        [MANAGER_STARTING]     = "starting",
+        [MANAGER_RUNNING]      = "running",
+        [MANAGER_DEGRADED]     = "degraded",
+        [MANAGER_MAINTENANCE]  = "maintenance",
+        [MANAGER_STOPPING]     = "stopping",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState); /* NOTE(review): presumably generates the manager_state string conversion helpers from the table — confirm against the macro */
+
+static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { /* string for each ManagerTimestamp value */
+        [MANAGER_TIMESTAMP_FIRMWARE]                 = "firmware",
+        [MANAGER_TIMESTAMP_LOADER]                   = "loader",
+        [MANAGER_TIMESTAMP_KERNEL]                   = "kernel",
+        [MANAGER_TIMESTAMP_INITRD]                   = "initrd",
+        [MANAGER_TIMESTAMP_USERSPACE]                = "userspace",
+        [MANAGER_TIMESTAMP_FINISH]                   = "finish",
+        [MANAGER_TIMESTAMP_SECURITY_START]           = "security-start",
+        [MANAGER_TIMESTAMP_SECURITY_FINISH]          = "security-finish",
+        [MANAGER_TIMESTAMP_GENERATORS_START]         = "generators-start",
+        [MANAGER_TIMESTAMP_GENERATORS_FINISH]        = "generators-finish",
+        [MANAGER_TIMESTAMP_UNITS_LOAD_START]         = "units-load-start",
+        [MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]        = "units-load-finish",
+        [MANAGER_TIMESTAMP_UNITS_LOAD]               = "units-load",
+        [MANAGER_TIMESTAMP_INITRD_SECURITY_START]    = "initrd-security-start", /* the initrd names are the regular ones with an "initrd-" prefix */
+        [MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]   = "initrd-security-finish",
+        [MANAGER_TIMESTAMP_INITRD_GENERATORS_START]  = "initrd-generators-start",
+        [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish",
+        [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]  = "initrd-units-load-start",
+        [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp); /* NOTE(review): presumably generates the manager_timestamp string conversion helpers — confirm against the macro */
+
+static const char* const oom_policy_table[_OOM_POLICY_MAX] = { /* string for each OOMPolicy value */
+        [OOM_CONTINUE] = "continue",
+        [OOM_STOP]     = "stop",
+        [OOM_KILL]     = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy); /* NOTE(review): presumably generates the oom_policy string conversion helpers — confirm against the macro */
diff --git a/src/core/manager.h b/src/core/manager.h
new file mode 100644
index 0000000..d96eb7b
--- /dev/null
+++ b/src/core/manager.h
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "sd-bus.h"
+#include "sd-device.h"
+#include "sd-event.h"
+
+#include "common-signal.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "fdset.h"
+#include "hashmap.h"
+#include "list.h"
+#include "prioq.h"
+#include "ratelimit.h"
+#include "varlink.h"
+
+struct libmnt_monitor;
+typedef struct Unit Unit;
+
+/* Enforce upper limit how many names we allow */
+#define MANAGER_MAX_NAMES 131072 /* 128K */
+
+/* Commands private to the manager that are accepted on SIGRTMIN+18, numbered from the private base upwards */
+enum {
+        MANAGER_SIGNAL_COMMAND_DUMP_JOBS = _COMMON_SIGNAL_COMMAND_PRIVATE_BASE + 0,
+        _MANAGER_SIGNAL_COMMAND_MAX, /* one past the last private command defined here */
+};
+
+assert_cc((int) _MANAGER_SIGNAL_COMMAND_MAX <= (int) _COMMON_SIGNAL_COMMAND_PRIVATE_END); /* the private commands must not run past the end of the private range */
+
+typedef struct Manager Manager;
+
+/* An externally visible state. We don't actually maintain this as a state variable, but derive it from various
+ * fields when requested, see manager_state(). */
+typedef enum ManagerState {
+        MANAGER_INITIALIZING,  /* boot not finished yet, and the basic target is not active (or reloading) yet */
+        MANAGER_STARTING,      /* boot not finished yet, but the basic target is active (or reloading) */
+        MANAGER_RUNNING,       /* boot finished, and none of the conditions of the other states apply */
+        MANAGER_DEGRADED,      /* boot finished, but the set of failed units is not empty */
+        MANAGER_MAINTENANCE,   /* system manager only: rescue or emergency target is active or queued */
+        MANAGER_STOPPING,      /* the shutdown target is active or queued */
+        _MANAGER_STATE_MAX,    /* number of valid states, used as size of the string table */
+        _MANAGER_STATE_INVALID = -EINVAL,
+} ManagerState;
+
+typedef enum ManagerObjective { /* NOTE(review): how the individual objectives are acted on is not visible in this part of the file */
+        MANAGER_OK,
+        MANAGER_EXIT,
+        MANAGER_RELOAD,
+        MANAGER_REEXECUTE,
+        MANAGER_REBOOT,
+        MANAGER_SOFT_REBOOT,
+        MANAGER_POWEROFF,
+        MANAGER_HALT,
+        MANAGER_KEXEC,
+        MANAGER_SWITCH_ROOT,
+        _MANAGER_OBJECTIVE_MAX,               /* number of valid objectives */
+        _MANAGER_OBJECTIVE_INVALID = -EINVAL, /* negative, hence distinct from all valid values */
+} ManagerObjective;
+
+typedef enum StatusType { /* kind of status message, see manager_status_printf() */
+        STATUS_TYPE_EPHEMERAL, /* printed with SHOW_STATUS_EPHEMERAL, and dropped while n_on_console > 0 */
+        STATUS_TYPE_NORMAL,    /* shown if the effective show-status mode is on */
+        STATUS_TYPE_NOTICE,    /* shown unless the configured show-status mode is SHOW_STATUS_NO */
+        STATUS_TYPE_EMERGENCY, /* the only type that skips the manager_check_ask_password() check */
+} StatusType;
+
+typedef enum OOMPolicy { /* unit_defaults_init() picks OOM_STOP as default */
+        OOM_CONTINUE,          /* The kernel or systemd-oomd kills the process it wants to kill, and that's it */
+        OOM_STOP,              /* The kernel or systemd-oomd kills the process it wants to kill, and we stop the unit */
+        OOM_KILL,              /* The kernel or systemd-oomd kills the process it wants to kill, and all others in the unit, and we stop the unit */
+        _OOM_POLICY_MAX,       /* number of valid policies, used as size of the string table */
+        _OOM_POLICY_INVALID = -EINVAL,
+} OOMPolicy;
+
+/* Notes:
+ * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD,
+ *    TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when
+ *    the manager is system and not running under container environment.
+ *
+ * 2. The monotonic timestamp of TIMESTAMP_KERNEL is always zero.
+ *
+ * 3. The realtime timestamp of TIMESTAMP_KERNEL will be unset if the system does not
+ *    have RTC.
+ *
+ * 4. TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER will be unset if the system does not
+ *    have RTC, or systemd is built without EFI support.
+ *
+ * 5. The monotonic timestamps of TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER are stored as
+ *    negative of the actual value.
+ *
+ * 6. TIMESTAMP_USERSPACE is the timestamp of when the manager was started.
+ *
+ * 7. TIMESTAMP_INITRD_* are set only when the system is booted with an initrd.
+ */
+
+typedef enum ManagerTimestamp {
+        MANAGER_TIMESTAMP_FIRMWARE,
+        MANAGER_TIMESTAMP_LOADER,
+        MANAGER_TIMESTAMP_KERNEL,
+        MANAGER_TIMESTAMP_INITRD,
+        MANAGER_TIMESTAMP_USERSPACE,
+        MANAGER_TIMESTAMP_FINISH,
+
+        MANAGER_TIMESTAMP_SECURITY_START,       /* first entry of the range manager_timestamp_initrd_mangle() remaps */
+        MANAGER_TIMESTAMP_SECURITY_FINISH,
+        MANAGER_TIMESTAMP_GENERATORS_START,
+        MANAGER_TIMESTAMP_GENERATORS_FINISH,
+        MANAGER_TIMESTAMP_UNITS_LOAD_START,
+        MANAGER_TIMESTAMP_UNITS_LOAD_FINISH,    /* last entry of the remapped range */
+        MANAGER_TIMESTAMP_UNITS_LOAD,           /* not part of the remapped range */
+
+        MANAGER_TIMESTAMP_INITRD_SECURITY_START, /* the remapping relies on these six being in the same order as the range above */
+        MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH,
+        MANAGER_TIMESTAMP_INITRD_GENERATORS_START,
+        MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH,
+        MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START,
+        MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH,
+        _MANAGER_TIMESTAMP_MAX,                 /* number of valid timestamps, used as size of the string table */
+        _MANAGER_TIMESTAMP_INVALID = -EINVAL,
+} ManagerTimestamp;
+
+typedef enum WatchdogType { /* NOTE(review): the users of these values are not visible in this part of the file */
+        WATCHDOG_RUNTIME,
+        WATCHDOG_REBOOT,
+        WATCHDOG_KEXEC,
+        WATCHDOG_PRETIMEOUT,
+        _WATCHDOG_TYPE_MAX, /* number of valid watchdog types; note that there is no _INVALID entry here */
+} WatchdogType;
+
+#include "execute.h"
+#include "job.h"
+#include "path-lookup.h"
+#include "show-status.h"
+#include "unit-name.h"
+
+typedef enum ManagerTestRunFlags { /* bit flags, may be combined */
+        MANAGER_TEST_NORMAL                  = 0,       /* run normally */
+        MANAGER_TEST_RUN_MINIMAL             = 1 << 0,  /* create basic data structures */
+        MANAGER_TEST_RUN_BASIC               = 1 << 1,  /* interact with the environment */
+        MANAGER_TEST_RUN_ENV_GENERATORS      = 1 << 2,  /* also run env generators  */
+        MANAGER_TEST_RUN_GENERATORS          = 1 << 3,  /* also run unit generators */
+        MANAGER_TEST_RUN_IGNORE_DEPENDENCIES = 1 << 4,  /* run while ignoring dependencies */
+        MANAGER_TEST_DONT_OPEN_EXECUTOR      = 1 << 5,  /* avoid trying to load sd-executor */
+        MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS, /* basic plus both kinds of generators */
+} ManagerTestRunFlags;
+
+assert_cc((MANAGER_TEST_FULL & UINT8_MAX) == MANAGER_TEST_FULL); /* MANAGER_TEST_FULL must fit into 8 bits */
+
+/* Various defaults for unit file settings. Set up by unit_defaults_init(), released by unit_defaults_done(). */
+typedef struct UnitDefaults {
+        ExecOutput std_output, std_error; /* initialized to journal and inherit, respectively */
+
+        usec_t restart_usec, timeout_start_usec, timeout_stop_usec, timeout_abort_usec, device_timeout_usec;
+        bool timeout_abort_set; /* NOTE(review): presumably tells whether timeout_abort_usec was configured explicitly — confirm */
+
+        usec_t start_limit_interval;
+        unsigned start_limit_burst;
+
+        bool cpu_accounting; /* initialized from cpu_accounting_is_cheap() */
+        bool memory_accounting;
+        bool io_accounting;
+        bool blockio_accounting;
+        bool tasks_accounting;
+        bool ip_accounting;
+
+        CGroupTasksMax tasks_max;
+        usec_t timer_accuracy_usec; /* initialized to one minute */
+
+        OOMPolicy oom_policy;
+        int oom_score_adjust; /* NOTE(review): presumably only meaningful if oom_score_adjust_set is true — confirm */
+        bool oom_score_adjust_set;
+
+        CGroupPressureWatch memory_pressure_watch;
+        usec_t memory_pressure_threshold_usec;
+
+        char *smack_process_label; /* freed by unit_defaults_done() */
+
+        struct rlimit *rlimit[_RLIMIT_MAX]; /* passed to rlimit_free_all() by unit_defaults_done() */
+} UnitDefaults;
+
+struct Manager {
+        /* Runtime state of one manager instance. Note that the set of units we know of is allowed
+         * to be inconsistent. However the subset of it that is loaded may not, and the list of
+         * jobs may neither. */
+
+        /* Active jobs and units */
+        Hashmap *units;  /* name string => Unit object n:1 */
+        Hashmap *units_by_invocation_id;
+        Hashmap *jobs;   /* job id => Job object 1:1 */
+
+        /* To make it easy to iterate through the units of a specific
+         * type we maintain a per type linked list */
+        LIST_HEAD(Unit, units_by_type[_UNIT_TYPE_MAX]);
+
+        /* Units that need to be loaded */
+        LIST_HEAD(Unit, load_queue); /* this is actually more a stack than a queue, but uh. */
+
+        /* Jobs that need to be run */
+        struct Prioq *run_queue;
+
+        /* Units and jobs that have not yet been announced via
+         * D-Bus. When something about a job changes it is added here
+         * if it is not in there yet. This allows easy coalescing of
+         * D-Bus change signals. */
+        LIST_HEAD(Unit, dbus_unit_queue);
+        LIST_HEAD(Job, dbus_job_queue);
+
+        /* Units to remove */
+        LIST_HEAD(Unit, cleanup_queue);
+
+        /* Units and jobs to check when doing GC */
+        LIST_HEAD(Unit, gc_unit_queue);
+        LIST_HEAD(Job, gc_job_queue);
+
+        /* Units that should be realized */
+        LIST_HEAD(Unit, cgroup_realize_queue);
+
+        /* Units whose cgroup ran empty */
+        LIST_HEAD(Unit, cgroup_empty_queue);
+
+        /* Units whose memory.event fired */
+        LIST_HEAD(Unit, cgroup_oom_queue);
+
+        /* Target units whose default target dependencies haven't been set yet */
+        LIST_HEAD(Unit, target_deps_queue);
+
+        /* Units that might be subject to StopWhenUnneeded= clean-up */
+        LIST_HEAD(Unit, stop_when_unneeded_queue);
+
+        /* Units which are upheld by another unit and which we might need to act on */
+        LIST_HEAD(Unit, start_when_upheld_queue);
+
+        /* Units that have BindsTo= another unit, and might need to be shutdown because the bound unit is not active. */
+        LIST_HEAD(Unit, stop_when_bound_queue);
+
+        /* Units that have resources open, and where it might be good to check if they can be released now */
+        LIST_HEAD(Unit, release_resources_queue);
+
+        sd_event *event;
+
+        /* This maps PIDs we care about to units that are interested in them. We allow multiple units to be
+         * interested in the same PID and multiple PIDs to be relevant to the same unit. Since in most cases
+         * only a single unit will be interested in the same PID though, we use a somewhat special structure
+         * here: the first unit interested in a PID is stored in the hashmap 'watch_pids', keyed by the
+         * PID. If there are other units interested too they'll be stored in a NULL-terminated array, stored
+         * in the hashmap 'watch_pids_more', keyed by the PID. Thus to go through the full list of units
+         * interested in a PID we must look into both hashmaps. */
+        Hashmap *watch_pids;            /* PidRef* → Unit* */
+        Hashmap *watch_pids_more;       /* PidRef* → NULL-terminated array of Unit* */
+
+        /* A set which contains all units whose cgroup should be refreshed after startup */
+        Set *startup_units;
+
+        /* A set which contains all currently failed units */
+        Set *failed_units;
+
+        sd_event_source *run_queue_event_source;
+
+        char *notify_socket;
+        int notify_fd;
+        sd_event_source *notify_event_source;
+
+        int cgroups_agent_fd;
+        sd_event_source *cgroups_agent_event_source;
+
+        int signal_fd;
+        sd_event_source *signal_event_source;
+
+        sd_event_source *sigchld_event_source;
+
+        sd_event_source *time_change_event_source;
+
+        sd_event_source *timezone_change_event_source;
+
+        sd_event_source *jobs_in_progress_event_source;
+
+        int user_lookup_fds[2];
+        sd_event_source *user_lookup_event_source;
+
+        RuntimeScope runtime_scope;
+
+        LookupPaths lookup_paths;
+        Hashmap *unit_id_map;
+        Hashmap *unit_name_map;
+        Set *unit_path_cache;
+        uint64_t unit_cache_timestamp_hash;
+
+        /* We don't have support for atomically enabling/disabling units, and unit_file_state might become
+         * outdated if such operations failed half-way. Therefore, we set this flag if changes to unit files
+         * are made, and reset it after daemon-reload. If set, we report that daemon-reload is needed through
+         * unit's NeedDaemonReload property. */
+        bool unit_file_state_outdated;
+
+        char **transient_environment;  /* The environment, as determined from config files, kernel cmdline and environment generators */
+        char **client_environment;     /* Environment variables created by clients through the bus API */
+
+        usec_t watchdog[_WATCHDOG_TYPE_MAX];
+        usec_t watchdog_overridden[_WATCHDOG_TYPE_MAX];
+        char *watchdog_pretimeout_governor;
+        char *watchdog_pretimeout_governor_overridden;
+
+        dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX];
+
+        /* Data specific to the device subsystem */
+        sd_device_monitor *device_monitor;
+        Hashmap *devices_by_sysfs;
+
+        /* Data specific to the mount subsystem */
+        struct libmnt_monitor *mount_monitor;
+        sd_event_source *mount_event_source;
+
+        /* Data specific to the swap filesystem */
+        FILE *proc_swaps;
+        sd_event_source *swap_event_source;
+        Hashmap *swaps_by_devnode;
+
+        /* Data specific to the D-Bus subsystem */
+        sd_bus *api_bus, *system_bus;
+        Set *private_buses;
+        int private_listen_fd;
+        sd_event_source *private_listen_event_source;
+
+        /* Contains all the clients that are subscribed to signals via
+         * the API bus. Note that private bus connections are always
+         * considered subscribed, since they are only very short-lived,
+         * and it is much simpler that way. */
+        sd_bus_track *subscribed;
+        char **deserialized_subscribed;
+
+        /* This is used during reloading: before the reload we queue
+         * the reply message here, and afterwards we send it */
+        sd_bus_message *pending_reload_message;
+
+        Hashmap *watch_bus;  /* D-Bus names => Unit object n:1 */
+
+        bool send_reloading_done;
+
+        uint32_t current_job_id;
+        uint32_t default_unit_job_id;
+
+        /* Data specific to the Automount subsystem */
+        int dev_autofs_fd;
+
+        /* Data specific to the cgroup subsystem */
+        Hashmap *cgroup_unit;
+        CGroupMask cgroup_supported;
+        char *cgroup_root;
+
+        /* Notifications from cgroups are delivered via inotify when the unified hierarchy is used. */
+        int cgroup_inotify_fd;
+        sd_event_source *cgroup_inotify_event_source;
+
+        /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and
+         * memory.events cgroupv2 attributes. */
+        Hashmap *cgroup_control_inotify_wd_unit;
+        Hashmap *cgroup_memory_inotify_wd_unit;
+
+        /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
+        sd_event_source *cgroup_empty_event_source;
+        sd_event_source *cgroup_oom_event_source;
+
+        /* Make sure the user cannot accidentally unmount our cgroup
+         * file system */
+        int pin_cgroupfs_fd;
+
+        unsigned gc_marker;
+
+        /* The stat() data the last time we saw /etc/localtime */
+        usec_t etc_localtime_mtime;
+        bool etc_localtime_accessible;
+
+        ManagerObjective objective;
+
+        /* Flags */
+        bool dispatching_load_queue;
+
+        /* Have we already sent out the READY=1 notification? */
+        bool ready_sent;
+
+        /* Was the last status sent "STATUS=Ready."? */
+        bool status_ready;
+
+        /* Have we already printed the taint line if necessary? */
+        bool taint_logged;
+
+        /* Have we ever changed the "kernel.pid_max" sysctl? */
+        bool sysctl_pid_max_changed;
+
+        ManagerTestRunFlags test_run_flags;
+
+        /* If non-zero, exit with the following value when the systemd
+         * process terminates. Useful for containers: systemd-nspawn could get
+         * the return value. */
+        uint8_t return_value;
+
+        ShowStatus show_status;
+        ShowStatus show_status_overridden;
+        StatusUnitFormat status_unit_format;
+        char *confirm_spawn;
+        bool no_console_output;
+        bool service_watchdogs;
+
+        UnitDefaults defaults;
+
+        int original_log_level;
+        LogTarget original_log_target;
+        bool log_level_overridden;
+        bool log_target_overridden;
+
+        /* Non-zero if we are reloading or reexecuting */
+        int n_reloading;
+
+        unsigned n_installed_jobs;
+        unsigned n_failed_jobs;
+
+        /* Jobs in progress watching */
+        unsigned n_running_jobs;
+        unsigned n_on_console;
+        unsigned jobs_in_progress_iteration;
+
+        /* Do we have any outstanding password prompts? */
+        int have_ask_password;
+        int ask_password_inotify_fd;
+        sd_event_source *ask_password_event_source;
+
+        /* Type=idle pipes */
+        int idle_pipe[4];
+        sd_event_source *idle_pipe_event_source;
+
+        char *switch_root;
+        char *switch_root_init;
+
+        /* This is true before and after switching root. */
+        bool switching_root;
+
+        /* This maps all possible path prefixes to the units needing
+         * them. It's a hashmap with a path string as key and a Set as
+         * value where Unit objects are contained. */
+        Hashmap *units_requiring_mounts_for;
+
+        /* Used for processing polkit authorization responses */
+        Hashmap *polkit_registry;
+
+        /* Dynamic users/groups, indexed by their name */
+        Hashmap *dynamic_users;
+
+        /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */
+        Hashmap *uid_refs;
+        Hashmap *gid_refs;
+
+        /* ExecSharedRuntime, indexed by their owner unit id */
+        Hashmap *exec_shared_runtime_by_id;
+
+        /* When the user hits C-A-D more than 7 times per 2s, do something immediately... */
+        RateLimit ctrl_alt_del_ratelimit;
+        EmergencyAction cad_burst_action;
+
+        const char *unit_log_field;
+        const char *unit_log_format_string;
+
+        const char *invocation_log_field;
+        const char *invocation_log_format_string;
+
+        int first_boot; /* tri-state */
+
+        /* Prefixes of e.g. RuntimeDirectory= */
+        char *prefix[_EXEC_DIRECTORY_TYPE_MAX];
+        char *received_credentials_directory;
+        char *received_encrypted_credentials_directory;
+
+        /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event
+         * multiple times on the same unit. */
+        unsigned sigchldgen;
+        unsigned notifygen;
+
+        VarlinkServer *varlink_server;
+        /* When we're a system manager, this object manages the subscription from systemd-oomd to PID1 that's
+         * used to report changes in ManagedOOM settings (systemd server - oomd client). When
+         * we're a user manager, this object manages the client connection from the user manager to
+         * systemd-oomd to report changes in ManagedOOM settings (systemd client - oomd server). */
+        Varlink *managed_oom_varlink;
+
+        /* Reference to RestrictFileSystems= BPF program */
+        struct restrict_fs_bpf *restrict_fs;
+
+        /* Allow users to configure a rate limit for Reload() operations */
+        RateLimit reload_ratelimit;
+        /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */
+        RateLimit dump_ratelimit;
+
+        sd_event_source *memory_pressure_event_source;
+
+        /* For NFTSet= */
+        FirewallContext *fw_ctx;
+
+        /* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have
+         * serialization/deserialization compatibility issues during upgrades. */
+        int executor_fd;
+};
+
+static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
+        assert(m); /* An explicitly set abort timeout wins; otherwise the stop timeout is used. */
+        return !m->defaults.timeout_abort_set ? m->defaults.timeout_stop_usec : m->defaults.timeout_abort_usec;
+}
+
+#define MANAGER_IS_SYSTEM(m) ((m)->runtime_scope == RUNTIME_SCOPE_SYSTEM) /* manager runs in the system scope */
+#define MANAGER_IS_USER(m) ((m)->runtime_scope == RUNTIME_SCOPE_USER) /* manager runs in the user scope */
+
+#define MANAGER_IS_RELOADING(m) ((m)->n_reloading > 0) /* n_reloading is non-zero while reloading or reexecuting */
+
+#define MANAGER_IS_FINISHED(m) (dual_timestamp_is_set((m)->timestamps + MANAGER_TIMESTAMP_FINISH)) /* the "finish" timestamp has been set */
+
+/* The objective is set to OK as soon as we enter the main loop, and set otherwise as soon as we are done with it */
+#define MANAGER_IS_RUNNING(m) ((m)->objective == MANAGER_OK)
+
+#define MANAGER_IS_SWITCHING_ROOT(m) ((m)->switching_root)
+
+#define MANAGER_IS_TEST_RUN(m) ((m)->test_run_flags != 0) /* anything but MANAGER_TEST_NORMAL (0) counts as a test run */
+
+static inline usec_t manager_default_timeout(RuntimeScope scope) {
+        return scope != RUNTIME_SCOPE_SYSTEM ? DEFAULT_USER_TIMEOUT_USEC : DEFAULT_TIMEOUT_USEC; /* only the system scope gets DEFAULT_TIMEOUT_USEC */
+}
+
+int manager_new(RuntimeScope scope, ManagerTestRunFlags test_run_flags, Manager **m);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); /* NOTE(review): presumably generates manager_freep() for _cleanup_ use — confirm against the macro */
+
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root);
+
+Job *manager_get_job(Manager *m, uint32_t id);
+Unit *manager_get_unit(Manager *m, const char *name);
+
+int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j);
+
+bool manager_unit_cache_should_retry_load(Unit *u);
+int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret);
+int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u);
+
+int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret);
+int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret);
+int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs,  Job **ret);
+int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
+
+void manager_clear_jobs(Manager *m);
+
+void manager_unwatch_pidref(Manager *m, PidRef *pid);
+
+unsigned manager_dispatch_load_queue(Manager *m);
+
+int manager_setup_memory_pressure_event_source(Manager *m);
+
+int manager_default_environment(Manager *m);
+int manager_transient_environment_add(Manager *m, char **plus);
+int manager_client_environment_modify(Manager *m, char **minus, char **plus);
+int manager_get_effective_environment(Manager *m, char ***ret);
+
+int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults);
+
+void manager_trigger_run_queue(Manager *m);
+
+int manager_loop(Manager *m);
+
+int manager_reload(Manager *m);
+Manager* manager_reloading_start(Manager *m);
+void manager_reloading_stopp(Manager **m); /* NOTE(review): the "pp" suffix looks deliberate (takes Manager**, cleanup-style helper) — do not "fix" the spelling without checking callers */
+
+void manager_reset_failed(Manager *m);
+
+void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success);
+void manager_send_unit_plymouth(Manager *m, Unit *u);
+
+bool manager_unit_inactive_or_pending(Manager *m, const char *name);
+
+void manager_check_finished(Manager *m);
+void manager_send_reloading(Manager *m);
+
+void disable_printk_ratelimit(void);
+void manager_recheck_dbus(Manager *m);
+void manager_recheck_journal(Manager *m);
+
+bool manager_get_show_status_on(Manager *m);
+void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason);
+void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason);
+
+void manager_set_first_boot(Manager *m, bool b); /* NOTE(review): Manager.first_boot itself is a tri-state int; presumably this sets it to a definite value — confirm */
+void manager_set_switching_root(Manager *m, bool switching_root);
+
+double manager_get_progress(Manager *m);
+
+void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5);
+
+Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path);
+
+ManagerState manager_state(Manager *m);
+
+int manager_update_failed_units(Manager *m, Unit *u, bool failed);
+
+void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now);
+int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc);
+
+void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now);
+int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc);
+
+char* manager_taint_string(const Manager *m);
+
+void manager_ref_console(Manager *m);
+void manager_unref_console(Manager *m);
+
+void manager_override_log_level(Manager *m, int level);
+void manager_restore_original_log_level(Manager *m);
+
+void manager_override_log_target(Manager *m, LogTarget target);
+void manager_restore_original_log_target(Manager *m);
+
+const char *manager_state_to_string(ManagerState m) _const_;
+ManagerState manager_state_from_string(const char *s) _pure_;
+
+const char *manager_get_confirm_spawn(Manager *m);
+void manager_disable_confirm_spawn(void);
+
+const char *manager_timestamp_to_string(ManagerTimestamp m) _const_;
+ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_;
+ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
+
+usec_t manager_get_watchdog(Manager *m, WatchdogType t);
+void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout);
+void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout);
+int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor);
+int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor);
+
+LogTarget manager_get_executor_log_target(Manager *m);
+
+int manager_allocate_idle_pipe(Manager *m);
+
+const char* oom_policy_to_string(OOMPolicy i) _const_;
+OOMPolicy oom_policy_from_string(const char *s) _pure_;
+
+void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope);
+void unit_defaults_done(UnitDefaults *defaults);
diff --git a/src/core/meson.build b/src/core/meson.build
new file mode 100644
index 0000000..7701d3d
--- /dev/null
+++ b/src/core/meson.build
@@ -0,0 +1,260 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+libcore_sources = files(  # sources compiled into the libcore shared library defined below
+        'apparmor-setup.c',
+        'audit-fd.c',
+        'automount.c',
+        'bpf-devices.c',
+        'bpf-firewall.c',
+        'bpf-foreign.c',
+        'bpf-lsm.c',
+        'bpf-socket-bind.c',
+        'cgroup.c',
+        'core-varlink.c',
+        'dbus-automount.c',
+        'dbus-cgroup.c',
+        'dbus-device.c',
+        'dbus-execute.c',
+        'dbus-job.c',
+        'dbus-kill.c',
+        'dbus-manager.c',
+        'dbus-mount.c',
+        'dbus-path.c',
+        'dbus-scope.c',
+        'dbus-service.c',
+        'dbus-slice.c',
+        'dbus-socket.c',
+        'dbus-swap.c',
+        'dbus-target.c',
+        'dbus-timer.c',
+        'dbus-unit.c',
+        'dbus-util.c',
+        'dbus.c',
+        'device.c',
+        'dynamic-user.c',
+        'efi-random.c',
+        'emergency-action.c',
+        'exec-credential.c',
+        'execute.c',
+        'execute-serialize.c',
+        'generator-setup.c',
+        'ima-setup.c',
+        'import-creds.c',
+        'job.c',
+        'kill.c',
+        'kmod-setup.c',
+        'load-dropin.c',
+        'load-fragment.c',
+        'manager-dump.c',
+        'manager-serialize.c',
+        'manager.c',
+        'mount.c',
+        'namespace.c',
+        'path.c',
+        'restrict-ifaces.c',
+        'scope.c',
+        'selinux-access.c',
+        'selinux-setup.c',
+        'service.c',
+        'show-status.c',
+        'slice.c',
+        'smack-setup.c',
+        'socket.c',
+        'swap.c',
+        'target.c',
+        'timer.c',
+        'transaction.c',
+        'unit-dependency-atom.c',
+        'unit-printf.c',
+        'unit-serialize.c',
+        'unit.c',
+)
+
+if conf.get('BPF_FRAMEWORK') == 1  # bpf-util.c is only built when the BPF framework is enabled
+        libcore_sources += files(
+                'bpf-util.c',
+        )
+endif
+
+subdir('bpf/socket_bind')
+subdir('bpf/restrict_fs')
+subdir('bpf/restrict_ifaces')
+
+if conf.get('BPF_FRAMEWORK') == 1  # add the *_skel_h objects (presumably defined by the subdir() calls above) as libcore inputs
+        libcore_sources += [
+                socket_bind_skel_h,
+                restrict_fs_skel_h,
+                restrict_ifaces_skel_h]
+endif
+
+load_fragment_gperf_gperf = custom_target(  # step 1: render the .gperf.in template via jinja2_cmdline
+        'load-fragment-gperf.gperf',
+        input : 'load-fragment-gperf.gperf.in',
+        output: 'load-fragment-gperf.gperf',
+        command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'])
+
+load_fragment_gperf_c = custom_target(  # step 2: run gperf on the rendered file to produce C source
+        'load-fragment-gperf.c',
+        input : load_fragment_gperf_gperf,
+        output : 'load-fragment-gperf.c',
+        command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@'])
+
+awkscript = 'load-fragment-gperf-nulstr.awk'
+load_fragment_gperf_nulstr_c = custom_target(  # second C file derived from the same gperf input; awk's stdout is captured
+        'load-fragment-gperf-nulstr.c',
+        input : [awkscript, load_fragment_gperf_gperf],
+        output : 'load-fragment-gperf-nulstr.c',
+        command : [awk, '-f', '@INPUT0@', '@INPUT1@'],
+        capture : true)
+
+libcore_name = 'systemd-core-@0@'.format(shared_lib_tag)  # library name carries the shared library tag
+
+libcore = shared_library(
+        libcore_name,
+        libcore_sources,
+        load_fragment_gperf_c,
+        load_fragment_gperf_nulstr_c,
+        include_directories : includes,
+        c_args : ['-fvisibility=default'],
+        link_args : ['-shared',
+                     '-Wl,--version-script=' + libshared_sym_path],
+        link_depends : libshared_sym_path,
+        link_with : libshared,
+        dependencies : [libacl,
+                        libapparmor,
+                        libaudit,
+                        libblkid,
+                        libdl,
+                        libkmod,
+                        libm,
+                        libmount,
+                        libpam,
+                        librt,
+                        libseccomp,
+                        libselinux,
+                        threads,
+                        userspace],
+        install : true,
+        install_dir : pkglibdir)
+
+core_includes = [includes, include_directories('.')]  # common includes plus this directory
+
+systemd_sources = files(
+        'main.c',
+        'crash-handler.c',
+)
+
+systemd_executor_sources = files(
+        'executor.c',
+        'exec-invoke.c',
+)
+
+executables += [  # the two binaries (systemd, systemd-executor) and three fuzzers, all linked against libcore
+        libexec_template + {
+                'name' : 'systemd',
+                'dbus' : true,
+                'public' : true,
+                'sources' : systemd_sources,
+                'link_with' : [
+                        libcore,
+                        libshared,
+                ],
+                'dependencies' : libseccomp,
+        },
+        libexec_template + {
+                'name' : 'systemd-executor',
+                'public' : true,
+                'sources' : systemd_executor_sources,
+                'include_directories' : core_includes,
+                'link_with' : [
+                        libcore,
+                        libshared,
+                ],
+                'dependencies' : [
+                        libapparmor,
+                        libpam,
+                        libseccomp,
+                        libselinux,
+                ],
+        },
+        fuzz_template + {
+                'sources' : files('fuzz-unit-file.c'),
+                'link_with' : [
+                        libcore,
+                        libshared
+                ],
+                'dependencies' : libmount,
+        },
+        fuzz_template + {
+                'sources' : files('fuzz-manager-serialize.c'),
+                'link_with' : [
+                        libcore,
+                        libshared
+                ],
+        },
+        fuzz_template + {
+                'sources' : files('fuzz-execute-serialize.c'),
+                'link_with' : [
+                        libcore,
+                        libshared
+                ],
+        },
+]
+
+in_files = [['system.conf',                     pkgconfigfiledir],  # pairs of [template name without '.in', install dir]
+            ['user.conf',                       pkgconfigfiledir],
+            ['org.freedesktop.systemd1.policy', polkitpolicydir]]
+
+foreach item : in_files  # render each '<name>.in' template with jinja2_cmdline
+        file = item[0]
+        dir = item[1]
+
+        custom_target(
+                file,
+                input : file + '.in',
+                output: file,
+                command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+                install : (dir == pkgconfigfiledir) ? install_sysconfdir_samples : (dir != 'no'),  # config samples follow install_sysconfdir_samples; others install unless dir is 'no'
+                install_dir : dir)
+endforeach
+
+systemd_pc = custom_target(
+        'systemd.pc',
+        input : 'systemd.pc.in',
+        output : 'systemd.pc',
+        command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+        install : pkgconfigdatadir != 'no',
+        install_tag : 'devel',
+        install_dir : pkgconfigdatadir)
+
+install_data('org.freedesktop.systemd1.conf',
+             install_dir : dbuspolicydir)
+install_data('org.freedesktop.systemd1.service',
+             install_dir : dbussystemservicedir)
+
+install_emptydir(systemshutdowndir)
+install_emptydir(systemsleepdir)
+install_emptydir(systemgeneratordir)
+install_emptydir(usergeneratordir)
+
+if install_sysconfdir
+        install_emptydir(pkgsysconfdir / 'system')
+        install_emptydir(pkgsysconfdir / 'user')
+        install_emptydir(sysconfdir / 'xdg/systemd')
+        meson.add_install_script(sh, '-c', ln_s.format(pkgsysconfdir / 'user',  # NOTE(review): ln_s is defined elsewhere; presumably links xdg/systemd/user to the 'user' dir
+                                                       sysconfdir / 'xdg/systemd/user'))
+endif
+
+install_emptydir(sbindir)
+meson.add_install_script(sh, '-c', ln_s.format(libexecdir / 'systemd', sbindir / 'init'))  # presumably makes sbindir/init a symlink to the systemd binary
+
+############################################################
+
+core_test_template = test_template + {  # test_template extended with libcore linkage, core includes and the 'core' suite
+        'link_with' : [
+                libcore,
+                libshared,
+        ],
+        'include_directories' : core_includes,
+        'suite' : 'core',
+}
diff --git a/src/core/mount.c b/src/core/mount.c
new file mode 100644
index 0000000..ded322d
--- /dev/null
+++ b/src/core/mount.c
@@ -0,0 +1,2502 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/epoll.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "dbus-mount.h"
+#include "dbus-unit.h"
+#include "device.h"
+#include "exit-status.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "fstab-util.h"
+#include "initrd-util.h"
+#include "libmount-util.h"
+#include "log.h"
+#include "manager.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "utf8.h"
+
+#define RETRY_UMOUNT_MAX 32 /* NOTE(review): presumably caps the number of umount retries; its use lies outside this chunk */
+
+static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = { /* maps each MountState to a generic UnitActiveState */
+        [MOUNT_DEAD] = UNIT_INACTIVE,
+        [MOUNT_MOUNTING] = UNIT_ACTIVATING,
+        [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING,
+        [MOUNT_MOUNTED] = UNIT_ACTIVE,
+        [MOUNT_REMOUNTING] = UNIT_RELOADING,
+        [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING,
+        [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING,
+        [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING,
+        [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING,
+        [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING,
+        [MOUNT_FAILED] = UNIT_FAILED,
+        [MOUNT_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); /* forward declarations of file-local (static) functions */
+static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static void mount_enter_dead(Mount *m, MountResult f);
+static void mount_enter_mounted(Mount *m, MountResult f);
+static void mount_cycle_clear(Mount *m);
+static int mount_process_proc_self_mountinfo(Manager *m);
+
+static bool MOUNT_STATE_WITH_PROCESS(MountState state) {
+        /* True for exactly the states listed here, i.e. those this helper's name ties to a process. */
+        switch (state) {
+        case MOUNT_MOUNTING: case MOUNT_MOUNTING_DONE:
+        case MOUNT_REMOUNTING: case MOUNT_REMOUNTING_SIGTERM: case MOUNT_REMOUNTING_SIGKILL:
+        case MOUNT_UNMOUNTING: case MOUNT_UNMOUNTING_SIGTERM: case MOUNT_UNMOUNTING_SIGKILL:
+        case MOUNT_CLEANING:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static MountParameters* get_mount_parameters_fragment(Mount *m) {
+        /* Hands out the fragment parameters, or NULL when the from_fragment flag is not set. */
+        assert(m);
+
+        if (!m->from_fragment)
+                return NULL;
+        return &m->parameters_fragment;
+}
+
+static MountParameters* get_mount_parameters(Mount *m) {
+        /* Parameters from /proc/self/mountinfo take precedence over the ones from the fragment. */
+        assert(m);
+
+        if (!m->from_proc_self_mountinfo)
+                return get_mount_parameters_fragment(m);
+        return &m->parameters_proc_self_mountinfo;
+}
+
+static bool mount_is_network(const MountParameters *p) {
+        /* Network mount: either tagged with "_netdev" or using a network file system type. */
+        assert(p);
+
+        if (fstab_test_option(p->options, "_netdev\0"))
+                return true;
+        if (!p->fstype)
+                return false;
+
+        return fstype_is_network(p->fstype);
+}
+
+static bool mount_is_nofail(const Mount *m) {
+        /* Only options coming from the unit fragment are consulted; without a fragment the answer
+         * is always false. */
+        assert(m);
+
+        return m->from_fragment &&
+               fstab_test_yes_no_option(m->parameters_fragment.options, "nofail\0" "fail\0");
+}
+
+static bool mount_is_loop(const MountParameters *p) {
+        /* Loop mounts are identified purely by the presence of the "loop" mount option; neither the
+         * file system type nor the source is inspected here. */
+
+        assert(p);
+
+        return fstab_test_option(p->options, "loop\0");
+}
+
+static bool mount_is_bind(const MountParameters *p) {
+        assert(p); /* the decision is delegated to fstab_is_bind(), fed with both the options and the fs type */
+        return fstab_is_bind(p->options, p->fstype);
+}
+
+static int mount_is_bound_to_device(Mount *m) {
+        _cleanup_free_ char *setting = NULL;
+        const MountParameters *params;
+        int found;
+
+        assert(m);
+
+        /* Tells the caller whether the dependency on the backing device unit shall be BindsTo= (option
+         * x-systemd.device-bound= enabled) or Requires= (otherwise); the latter may additionally be
+         * combined with StopPropagatedFrom=, see below. Yields -EIDRM if the option is not specified at
+         * all, so that this case can be told apart from an explicit setting, and otherwise passes
+         * through whatever the option lookup (on failure) or parse_boolean() returns. */
+
+        params = get_mount_parameters(m);
+        if (!params)
+                return false;
+
+        found = fstab_filter_options(params->options, "x-systemd.device-bound\0", NULL, &setting, NULL, NULL);
+        if (found < 0)
+                return found;
+        if (found == 0)
+                return -EIDRM; /* Option absent altogether: report a recognizable error */
+
+        /* A bare option without any value counts as enabled; everything else is left to
+         * parse_boolean() to interpret. */
+        return isempty(setting) ? true : parse_boolean(setting);
+}
+
+static bool mount_propagate_stop(Mount *m) {
+        int bound;
+
+        assert(m);
+
+        /* An explicit x-systemd.device-bound= setting rules out StopPropagatedFrom=: "no" is what the
+         * user asked for, and with "yes" the stop propagation is already implied by BindsTo=. */
+        bound = mount_is_bound_to_device(m);
+        if (bound >= 0)
+                return false;
+        if (bound != -EIDRM)
+                log_debug_errno(bound, "Failed to get x-systemd.device-bound= option, ignoring: %m");
+
+        /* Otherwise propagate stop only if this is an explicitly configured unit. */
+        return m->from_fragment;
+}
+
+static bool mount_needs_quota(const MountParameters *p) {
+        /* Quota is ruled out for file system types that do not need it and for bind mounts; for the
+         * rest it depends on whether one of the quota mount options is present. */
+        assert(p);
+
+        if ((p->fstype && !fstype_needs_quota(p->fstype)) ||
+            mount_is_bind(p))
+                return false;
+
+        return fstab_test_option(p->options,
+                                 "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0");
+}
+
+static void mount_init(Unit *u) {
+        Mount *m = MOUNT(u);
+
+        assert(m);
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        /* Sets up the initial settings of a mount unit that is still in UNIT_STUB state */
+        const Manager *mgr = u->manager;
+        /* Inherited from the manager-wide defaults */
+        m->timeout_usec = mgr->defaults.timeout_start_usec;
+        m->exec_context.std_output = mgr->defaults.std_output;
+        m->exec_context.std_error = mgr->defaults.std_error;
+
+        /* No control process is associated with the unit yet */
+        m->control_pid = PIDREF_NULL;
+        m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+
+        /* /usr/bin/mount must always run in our own process group: otherwise the kernel side of autofs
+         * might send us another mount request while we are still busy complying with the previous one. */
+        m->exec_context.same_pgrp = true;
+
+        m->directory_mode = 0755;
+        u->ignore_on_isolate = true;
+}
+
+static int mount_arm_timer(Mount *m, bool relative, usec_t usec) {
+        /* Arms the mount's timer event source via unit_arm_timer(); mount_dispatch_timer() is the handler */
+        Unit *u = UNIT(ASSERT_PTR(m));
+        return unit_arm_timer(u, &m->timer_event_source, relative, usec, mount_dispatch_timer);
+}
+
+static void mount_unwatch_control_pid(Mount *m) {
+        assert(m);
+
+        /* Stops watching the control process and releases its PidRef; a no-op if none is set */
+        if (pidref_is_set(&m->control_pid)) {
+                unit_unwatch_pidref(UNIT(m), &m->control_pid);
+                pidref_done(&m->control_pid);
+        }
+}
+
+static void mount_parameters_done(MountParameters *p) {
+        assert(p);
+        /* Frees all three strings; storing mfree()'s return value back leaves no stale pointer behind */
+        p->fstype = mfree(p->fstype);
+        p->options = mfree(p->options);
+        p->what = mfree(p->what);
+}
+
+static void mount_done(Unit *u) {
+        Mount *m = ASSERT_PTR(MOUNT(u));
+
+        /* Releases what the mount unit holds on to: its strings, the execution state, the watched
+         * control process and the timer event source. */
+
+        m->where = mfree(m->where);
+        mount_parameters_done(&m->parameters_fragment);
+        mount_parameters_done(&m->parameters_proc_self_mountinfo);
+
+        m->exec_runtime = exec_runtime_free(m->exec_runtime);
+        /* control_command points into exec_command[], hence it is reset along with the array */
+        exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
+        m->control_command = NULL;
+
+        mount_unwatch_control_pid(m);
+        m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+}
+
+static int update_parameters_proc_self_mountinfo(
+                Mount *m,
+                const char *what,
+                const char *options,
+                const char *fstype) {
+
+        MountParameters *dst = &m->parameters_proc_self_mountinfo;
+        int r, changed = 0;
+
+        /* Replaces the strings recorded from /proc/self/mountinfo by copies of the arguments. Returns > 0
+         * if at least one of them got modified, 0 if none did, or a negative value on failure. */
+        r = free_and_strdup(&dst->what, what);
+        if (r < 0)
+                return r;
+        changed |= r > 0;
+        r = free_and_strdup(&dst->options, options);
+        if (r < 0)
+                return r;
+        changed |= r > 0;
+        r = free_and_strdup(&dst->fstype, fstype);
+        if (r < 0)
+                return r;
+
+        return changed || r > 0;
+}
+
+static int mount_add_mount_dependencies(Mount *m) {
+        MountParameters *frag;
+        Unit *u = UNIT(m), *peer;
+        Set *users;
+        int r;
+
+        assert(m);
+
+        /* Wires this mount up with other mount points: (1) the mounts its own mount point lives on, (2)
+         * the mounts its source path lives on, and (3) the units that declared a need for paths at or
+         * below our mount point. */
+
+        /* (1) Everything but the root needs the mounts further up in the hierarchy */
+        if (!path_equal(m->where, "/")) {
+                _cleanup_free_ char *parent = NULL;
+
+                r = path_extract_directory(m->where, &parent);
+                if (r >= 0)
+                        r = unit_require_mounts_for(u, parent, UNIT_DEPENDENCY_IMPLICIT);
+                if (r < 0)
+                        return r;
+        }
+
+        /* (2) If the source is an absolute path (bind mount, loop mount, or anything else that is not a
+         * network file system) the mounts it resides on have to be available, too. */
+        frag = get_mount_parameters_fragment(m);
+        if (frag && frag->what && path_is_absolute(frag->what)) {
+                bool local_source = mount_is_bind(frag) || mount_is_loop(frag) || !mount_is_network(frag);
+
+                if (local_source) {
+                        r = unit_require_mounts_for(u, frag->what, UNIT_DEPENDENCY_FILE);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        /* (3) Units using this path, or paths further down in the hierarchy, are ordered after us */
+        users = manager_get_units_requiring_mounts_for(u->manager, m->where);
+        SET_FOREACH(peer, users) {
+                if (peer->load_state != UNIT_LOADED || peer == u)
+                        continue;
+
+                r = unit_add_dependency(peer, UNIT_AFTER, u, true, UNIT_DEPENDENCY_PATH);
+                if (r < 0)
+                        return r;
+
+                /* Without fragment configuration the ordering is all we add */
+                if (!u->fragment_path)
+                        continue;
+
+                /* We have fragment configuration: make this dependency a required one */
+                r = unit_add_dependency(peer, UNIT_REQUIRES, u, true, UNIT_DEPENDENCY_PATH);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int mount_add_device_dependencies(Mount *m) {
+        UnitDependencyMask mask;
+        UnitDependency dep;
+        MountParameters *p;
+        Unit *u = UNIT(m);
+        int r;
+
+        assert(m);
+
+        log_unit_trace(u, "Processing implicit device dependencies");
+        /* Each early return below is a case in which no implicit device dependency is wanted */
+        p = get_mount_parameters(m);
+        if (!p) {
+                log_unit_trace(u, "Missing mount parameters, skipping implicit device dependencies");
+                return 0;
+        }
+
+        if (!p->what) {
+                log_unit_trace(u, "Missing mount source, skipping implicit device dependencies");
+                return 0;
+        }
+
+        if (mount_is_bind(p)) {
+                log_unit_trace(u, "Mount unit is a bind mount, skipping implicit device dependencies");
+                return 0;
+        }
+
+        if (!is_device_path(p->what)) {
+                log_unit_trace(u, "Mount source is not a device path, skipping implicit device dependencies");
+                return 0;
+        }
+
+        /* /dev/root is not a real device, merely a path the kernel exports for the root file system given
+         * on the kernel command line, hence there is nothing to depend on. Same treatment for /dev/nfs. */
+        if (PATH_IN_SET(p->what, "/dev/root", "/dev/nfs")) {
+                log_unit_trace(u, "Mount source is in /dev/root or /dev/nfs, skipping implicit device dependencies");
+                return 0;
+        }
+
+        if (path_equal(m->where, "/")) {
+                log_unit_trace(u, "Mount destination is '/', skipping implicit device dependencies");
+                return 0;
+        }
+
+        /* Tag by origin of 'what': the one from /proc/self/mountinfo is always used if mounted */
+        mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE;
+
+        /* Mounts are not bound to their device by default: other tools may establish mounts with backing
+         * devices different from what we maintain, which is racy. Only when asked for via the option do
+         * we use BindsTo=, so that the mount is stopped automatically if the device suddenly vanishes. */
+        dep = mount_is_bound_to_device(m) > 0 ? UNIT_BINDS_TO : UNIT_REQUIRES;
+
+        r = unit_add_node_dependency(u, p->what, dep, mask);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                log_unit_trace(u, "Added %s dependency on %s", unit_dependency_to_string(dep), p->what);
+
+        if (mount_propagate_stop(m)) {
+                r = unit_add_node_dependency(u, p->what, UNIT_STOP_PROPAGATED_FROM, mask);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        log_unit_trace(u, "Added %s dependency on %s",
+                                       unit_dependency_to_string(UNIT_STOP_PROPAGATED_FROM), p->what);
+        }
+
+        /* NOTE(review): a negative return is not propagated here, presumably best effort — confirm */
+        r = unit_add_blockdev_dependency(u, p->what, mask);
+        if (r > 0)
+                log_unit_trace(u, "Added %s dependency on %s", unit_dependency_to_string(UNIT_AFTER), p->what);
+
+        return 0;
+}
+
+static int mount_add_quota_dependencies(Mount *m) {
+        const char * const quota_services[] = { SPECIAL_QUOTACHECK_SERVICE, SPECIAL_QUOTAON_SERVICE };
+        MountParameters *p;
+        int r;
+
+        assert(m);
+
+        /* Pulls in the quota services (Wants=) and orders this mount before them (Before=). This is
+         * done by the system manager only, and only for mounts whose fragment parameters ask for
+         * quota. */
+
+        if (!MANAGER_IS_SYSTEM(UNIT(m)->manager))
+                return 0;
+
+        p = get_mount_parameters_fragment(m);
+        if (!p || !mount_needs_quota(p))
+                return 0;
+
+        /* Same pair of dependencies for each of the services, quotacheck first */
+        for (size_t i = 0; i < sizeof(quota_services) / sizeof(quota_services[0]); i++) {
+                r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, quota_services[i],
+                                                      /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static bool mount_is_extrinsic(Unit *u) {
+        Mount *m = ASSERT_PTR(MOUNT(u));
+        MountParameters *params;
+
+        /* "Extrinsic" mounts are the magic ones that get established outside of systemd's dependency
+         * logic and thus are excluded from the usual start-up and shutdown dependencies. We do not try
+         * to manage them on our own, though the user remains free to operate on them through us. */
+
+        /* Automatic management of mounts is a system mode thing: for user managers every mount is
+         * extrinsic. */
+        if (MANAGER_IS_USER(u->manager))
+                return true;
+
+        /* Otherwise fstab_is_extrinsic() decides, based on mount point and options (if we know any) */
+        params = get_mount_parameters(m);
+        if (!params)
+                return false;
+
+        return fstab_is_extrinsic(m->where, params->options);
+}
+
+static bool mount_is_credentials(Mount *m) {
+        const char *below_runtime, *below_credentials;
+
+        assert(m);
+
+        /* Credential mounts are managed by us, even for the earliest services, and nothing must ever be
+         * ordered before them. Callers use this check to keep automatic dependencies off such mounts. */
+        below_runtime = path_startswith(m->where, UNIT(m)->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+        if (!below_runtime)
+                return false;
+
+        /* Has to be something below "credentials" in the runtime directory, not that directory itself */
+        below_credentials = path_startswith(below_runtime, "credentials");
+        return !isempty(below_credentials);
+}
+
+static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
+        const char *after, *before, *e;
+        int r;
+
+        assert(m);
+        /* Picks the targets this mount is ordered after/before (NULL meaning none), then adds the deps */
+        e = path_startswith(m->where, "/sysroot");
+        if (e && in_initrd()) {
+                /* All mounts under /sysroot need to happen later, at initrd-fs.target time. IOW,
+                 * it's not technically part of the basic initrd filesystem itself, and so
+                 * shouldn't inherit the default Before=local-fs.target dependency. However,
+                 * these mounts still need to start after local-fs-pre.target, as a sync point
+                 * for things like systemd-hibernate-resume.service that should start before
+                 * any mounts. */
+                /* /sysroot itself ('e' empty) gets the initrd root fs target, anything below the initrd fs one */
+                after = SPECIAL_LOCAL_FS_PRE_TARGET;
+                before = isempty(e) ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_INITRD_FS_TARGET;
+
+        } else if (in_initrd() && path_startswith(m->where, "/sysusr/usr")) {
+                after = SPECIAL_LOCAL_FS_PRE_TARGET;
+                before = SPECIAL_INITRD_USR_FS_TARGET;
+
+        } else if (mount_is_credentials(m))
+                after = before = NULL;
+
+        else if (mount_is_network(p)) {
+                after = SPECIAL_REMOTE_FS_PRE_TARGET;
+                before = SPECIAL_REMOTE_FS_TARGET;
+
+        } else {
+                after = SPECIAL_LOCAL_FS_PRE_TARGET;
+                before = SPECIAL_LOCAL_FS_TARGET;
+        }
+        /* NOTE(review): "nofail" mounts apparently shall not hold up the 'before' target — confirm */
+        if (before && !mount_is_nofail(m)) {
+                r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, /* add_reference= */ true, mask);
+                if (r < 0)
+                        return r;
+        }
+
+        if (after) {
+                r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, /* add_reference= */ true, mask);
+                if (r < 0)
+                        return r;
+        }
+        /* Added in every case: Before= plus Conflicts= on the umount target */
+        r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET,
+                                              /* add_reference= */ true, mask);
+        if (r < 0)
+                return r;
+
+        /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */
+        if (streq_ptr(p->fstype, "tmpfs") && !mount_is_credentials(m)) {
+                r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET,
+                                                /* add_reference= */ true, mask);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int mount_add_default_network_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
+        Unit *u = UNIT(m);
+        int r;
+
+        assert(m);
+
+        /* Only network file systems get the network targets below */
+        if (mount_is_network(p)) {
+                /* After=network.target matters mostly at shutdown: whatever takes the network down is
+                 * expected to order itself before network.target, and hence gets stopped only once this
+                 * mount unit is gone. */
+                r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_NETWORK_TARGET,
+                                                /* add_reference= */ true, mask);
+                if (r < 0)
+                        return r;
+
+                /* Wants= + After=network-online.target matters at start-up: it actively pulls in the
+                 * tools whose job is to delay mounting of network file systems until the network is "up". */
+                return unit_add_two_dependencies_by_name(u, UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET,
+                                                         /* add_reference= */ true, mask);
+        }
+
+        return 0;
+}
+
+static int mount_add_default_dependencies(Mount *m) {
+        UnitDependencyMask origin;
+        MountParameters *params;
+        Unit *u = UNIT(m);
+        int r;
+
+        assert(m);
+
+        /* Adds the default ordering and network dependencies, unless DefaultDependencies= is off, the
+         * mount is extrinsic, or no mount parameters are known. Extrinsic mounts (/, /usr,
+         * /run/initramfs/, ...) are guaranteed to stay mounted the whole time since our system is on
+         * them, and whatever is mounted below virtual file systems is going to be virtual as well:
+         * neither is worth the effort. */
+        if (!u->default_dependencies || mount_is_extrinsic(u))
+                return 0;
+
+        params = get_mount_parameters(m);
+        if (!params)
+                return 0;
+
+        /* Tag the dependencies by where the parameters in use come from */
+        if (m->from_proc_self_mountinfo)
+                origin = UNIT_DEPENDENCY_MOUNTINFO;
+        else
+                origin = UNIT_DEPENDENCY_MOUNT_FILE;
+
+        r = mount_add_default_ordering_dependencies(m, params, origin);
+        if (r < 0)
+                return r;
+
+        r = mount_add_default_network_dependencies(m, params, origin);
+        return r < 0 ? r : 0;
+}
+
+static int mount_verify(Mount *m) {
+        _cleanup_free_ char *e = NULL;
+        MountParameters *p;
+        int r;
+
+        assert(m);
+        assert(UNIT(m)->load_state == UNIT_LOADED);
+
+        /* Sanity-checks a loaded mount unit. Returns 0 if it is usable, a negative errno-style value if not */
+        if (!m->from_fragment && !m->from_proc_self_mountinfo && !UNIT(m)->perpetual)
+                return -ENOENT;
+
+        r = unit_name_from_path(m->where, ".mount", &e);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(m), r, "Failed to generate unit name from mount path: %m");
+
+        if (!unit_has_name(UNIT(m), e))
+                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing.");
+
+        if (mount_point_is_api(m->where) || mount_point_ignore(m->where))
+                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Cannot create mount unit for API file system %s. Refusing.", m->where);
+
+        p = get_mount_parameters_fragment(m);
+        if (p && !p->what && !UNIT(m)->perpetual)
+                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "What= setting is missing. Refusing.");
+        /* With PAM enabled only KillMode=control-group is accepted */
+        if (m->exec_context.pam_name && m->kill_context.kill_mode != KILL_CONTROL_GROUP)
+                return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing.");
+
+        return 0;
+}
+
+static int mount_add_non_exec_dependencies(Mount *m) {
+        /* The generators of all dependencies that are directly responsible for ordering the mount (as
+         * opposed to those resulting from the ExecContext and such), in the order in which they are
+         * applied. */
+        static int (* const generators[])(Mount *m) = {
+                mount_add_device_dependencies,
+                mount_add_mount_dependencies,
+                mount_add_quota_dependencies,
+                mount_add_default_dependencies,
+        };
+        const size_t n_generators = sizeof(generators) / sizeof(generators[0]);
+        int r;
+
+        assert(m);
+
+        /* We might get here because this mount showed up in /proc/self/mountinfo. Hence all dependencies
+         * that were initialized from the unit file, but whose final value really depends on the contents
+         * of /proc/self/mountinfo, are dropped first and regenerated below: some of them (such as those
+         * derived from m->where) might have become stale by now. */
+        unit_remove_dependencies(UNIT(m), UNIT_DEPENDENCY_MOUNTINFO | UNIT_DEPENDENCY_MOUNT_FILE);
+
+        /* Without a mount point there is nothing to derive dependencies from */
+        if (!m->where)
+                return 0;
+
+        /* The first generator that fails aborts the whole thing */
+        for (size_t i = 0; i < n_generators; i++) {
+                r = generators[i](m);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int mount_add_extras(Mount *m) {
+        Unit *u = UNIT(m);
+        int r;
+
+        assert(m);
+
+        /* Completes the unit with everything that is derived rather than configured: the mount point,
+         * a description, patched contexts, the default slice and the various dependencies.
+         *
+         * Mind that this is not only invoked on first load: whenever data from /proc/self/mountinfo changes
+         * we end up here again, possibly with the unit already active. Everything below hence has to cope
+         * with a unit that is already fully set up. */
+
+        if (u->fragment_path)
+                m->from_fragment = true;
+
+        if (!m->where) {
+                /* No mount point known yet: derive it from the unit name */
+                r = unit_name_to_path(u->id, &m->where);
+                if (r < 0) {
+                        if (r == -ENAMETOOLONG)
+                                log_unit_error_errno(u, r, "Failed to derive mount point path from unit name, because unit name is hashed. "
+                                                           "Set \"Where=\" in the unit file explicitly.");
+                        return r;
+                }
+        }
+
+        path_simplify(m->where);
+
+        if (!u->description) {
+                /* Fall back to the mount point as description */
+                r = unit_set_description(u, m->where);
+                if (r < 0)
+                        return r;
+        }
+
+        r = unit_patch_contexts(u);
+        if (r >= 0)
+                r = unit_add_exec_dependencies(u, &m->exec_context);
+        if (r >= 0)
+                r = unit_set_default_slice(u);
+        if (r >= 0)
+                r = mount_add_non_exec_dependencies(m);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static void mount_load_root_mount(Unit *u) {
+        assert(u);
+
+        /* Only the root mount (-.mount) gets the special treatment below, other units are left alone */
+        if (unit_has_name(u, SPECIAL_ROOT_MOUNT)) {
+                u->default_dependencies = false;
+                u->perpetual = true;
+
+                /* The stdio/kmsg bridge socket is on /: to avoid a dep loop, no kmsg logging for -.mount */
+                MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL;
+                MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL;
+
+                if (!u->description)
+                        u->description = strdup("Root Mount"); /* result unchecked: may stay NULL */
+        }
+}
+
+static int mount_load(Unit *u) {
+        Mount *m = MOUNT(u);
+        int r, q = 0;
+
+        assert(m);
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        mount_load_root_mount(u);
+        /* The fragment is optional for mounts known from /proc/self/mountinfo and for perpetual units */
+        bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual;
+        r = unit_load_fragment_and_dropin(u, !fragment_optional);
+
+        /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the
+         * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus
+         * we need to update the state in our unit to track it. After all, consider that we don't allow changing the
+         * 'slice' field for a unit once it is active. */
+        if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual)
+                q = mount_add_extras(m);
+        /* A failure to load the fragment (r) takes precedence over a failure to add the extras (q) */
+        if (r < 0)
+                return r;
+        if (q < 0)
+                return q;
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        return mount_verify(m);
+}
+
+static void mount_set_state(Mount *m, MountState state) {
+        MountState previous;
+        assert(m);
+
+        /* Switches to 'state': announces the change, drops process bookkeeping if unneeded, notifies */
+        previous = m->state;
+        if (previous != state)
+                bus_unit_send_pending_change_signal(UNIT(m), false);
+        m->state = state;
+
+        if (!MOUNT_STATE_WITH_PROCESS(state)) {
+                m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+                mount_unwatch_control_pid(m);
+                m->control_command = NULL;
+                m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+        }
+
+        if (previous != state)
+                log_unit_debug(UNIT(m), "Changed %s -> %s", mount_state_to_string(previous), mount_state_to_string(state));
+
+        unit_notify(UNIT(m), state_translation_table[previous], state_translation_table[state], m->reload_result == MOUNT_SUCCESS);
+}
+
+static int mount_coldplug(Unit *u) {
+        Mount *m = MOUNT(u);
+        int r;
+
+        assert(m);
+        assert(m->state == MOUNT_DEAD);
+        /* Brings the unit into its deserialized state; nothing to do if that is the state we are in */
+        if (m->deserialized_state == m->state)
+                return 0;
+
+        if (pidref_is_set(&m->control_pid) &&
+            pidref_is_unwaited(&m->control_pid) > 0 &&
+            MOUNT_STATE_WITH_PROCESS(m->deserialized_state)) {
+                /* A control process is recorded and the state involves one: watch it again */
+                r = unit_watch_pidref(UNIT(m), &m->control_pid, /* exclusive= */ false);
+                if (r < 0)
+                        return r;
+                /* Absolute deadline: the timestamp of the last state change plus the timeout */
+                r = mount_arm_timer(m, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, m->timeout_usec));
+                if (r < 0)
+                        return r;
+        }
+
+        if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED))
+                (void) unit_setup_exec_runtime(u);
+
+        mount_set_state(m, m->deserialized_state);
+        return 0;
+}
+
+static void mount_catchup(Unit *u) {
+        Mount *m = MOUNT(ASSERT_PTR(u));
+
+        assert(m);
+        /* Reconciles the unit state with whether the mount is currently known from /proc/self/mountinfo,
+         * i.e. adjusts the deserialized state. See comments in mount_process_proc_self_mountinfo(). */
+        if (m->from_proc_self_mountinfo)
+                switch (m->state) {
+                case MOUNT_DEAD:
+                case MOUNT_FAILED: /* mounted although we consider it down: take over as mounted */
+                        assert(!pidref_is_set(&m->control_pid));
+                        (void) unit_acquire_invocation_id(u);
+                        mount_cycle_clear(m);
+                        mount_enter_mounted(m, MOUNT_SUCCESS);
+                        break;
+                case MOUNT_MOUNTING: /* the mount showed up already while the control process is still set */
+                        assert(pidref_is_set(&m->control_pid));
+                        mount_set_state(m, MOUNT_MOUNTING_DONE);
+                        break;
+                default:
+                        break;
+                }
+        else /* not listed in /proc/self/mountinfo */
+                switch (m->state) {
+                case MOUNT_MOUNTING_DONE: /* the mount is not there (anymore): back to plain mounting */
+                        assert(pidref_is_set(&m->control_pid));
+                        mount_set_state(m, MOUNT_MOUNTING);
+                        break;
+                case MOUNT_MOUNTED: /* the mount is gone */
+                        assert(!pidref_is_set(&m->control_pid));
+                        mount_enter_dead(m, MOUNT_SUCCESS);
+                        break;
+                default:
+                        break;
+                }
+}
+
+static void mount_dump(Unit *u, FILE *f, const char *prefix) {
+        Mount *m = MOUNT(u);
+        MountParameters *p;
+
+        assert(m);
+        assert(f);
+        /* Writes the mount unit's state and settings to 'f', each line starting with 'prefix' */
+        p = get_mount_parameters(m);
+        /* Without parameters, What/File System Type/Options are printed as "n/a" */
+        fprintf(f,
+                "%sMount State: %s\n"
+                "%sResult: %s\n"
+                "%sClean Result: %s\n"
+                "%sWhere: %s\n"
+                "%sWhat: %s\n"
+                "%sFile System Type: %s\n"
+                "%sOptions: %s\n"
+                "%sFrom /proc/self/mountinfo: %s\n"
+                "%sFrom fragment: %s\n"
+                "%sExtrinsic: %s\n"
+                "%sDirectoryMode: %04o\n"
+                "%sSloppyOptions: %s\n"
+                "%sLazyUnmount: %s\n"
+                "%sForceUnmount: %s\n"
+                "%sReadWriteOnly: %s\n"
+                "%sTimeoutSec: %s\n",
+                prefix, mount_state_to_string(m->state),
+                prefix, mount_result_to_string(m->result),
+                prefix, mount_result_to_string(m->clean_result),
+                prefix, m->where,
+                prefix, p ? strna(p->what) : "n/a",
+                prefix, p ? strna(p->fstype) : "n/a",
+                prefix, p ? strna(p->options) : "n/a",
+                prefix, yes_no(m->from_proc_self_mountinfo),
+                prefix, yes_no(m->from_fragment),
+                prefix, yes_no(mount_is_extrinsic(u)),
+                prefix, m->directory_mode,
+                prefix, yes_no(m->sloppy_options),
+                prefix, yes_no(m->lazy_unmount),
+                prefix, yes_no(m->force_unmount),
+                prefix, yes_no(m->read_write_only),
+                prefix, FORMAT_TIMESPAN(m->timeout_usec, USEC_PER_SEC));
+        /* The control PID line only appears while a control process is set */
+        if (pidref_is_set(&m->control_pid))
+                fprintf(f,
+                        "%sControl PID: "PID_FMT"\n",
+                        prefix, m->control_pid.pid);
+        /* Followed by the dumps of the exec, kill and cgroup contexts */
+        exec_context_dump(&m->exec_context, f, prefix);
+        kill_context_dump(&m->kill_context, f, prefix);
+        cgroup_context_dump(UNIT(m), f, prefix);
+}
+
+static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) {
+        /* Spawns 'c' via exec_spawn() with the timeout armed, watches the new PID, returns it in *ret_pid */
+        _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
+                        EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        pid_t pid;
+        int r;
+
+        assert(m);
+        assert(c);
+        assert(ret_pid);
+
+        r = unit_prepare_exec(UNIT(m));
+        if (r < 0)
+                return r;
+        /* The timer is armed (relative, m->timeout_usec) before the command is spawned */
+        r = mount_arm_timer(m, /* relative= */ true, m->timeout_usec);
+        if (r < 0)
+                return r;
+
+        r = unit_set_exec_params(UNIT(m), &exec_params);
+        if (r < 0)
+                return r;
+
+        r = exec_spawn(UNIT(m),
+                       c,
+                       &m->exec_context,
+                       &exec_params,
+                       m->exec_runtime,
+                       &m->cgroup_context,
+                       &pid);
+        if (r < 0)
+                return r;
+        /* NOTE(review): exec_spawn() succeeded here, so a failure below returns without watching 'pid' */
+        r = pidref_set_pid(&pidref, pid);
+        if (r < 0)
+                return r;
+
+        r = unit_watch_pidref(UNIT(m), &pidref, /* exclusive= */ true);
+        if (r < 0)
+                return r;
+        /* Success: pass the PidRef on to the caller; *ret_pid is written on this path only */
+        *ret_pid = TAKE_PIDREF(pidref);
+        return 0;
+}
+
+static void mount_enter_dead(Mount *m, MountResult f) {
+        Unit *u = UNIT(m);
+        assert(m);
+
+        /* Enters MOUNT_DEAD, or MOUNT_FAILED if a failure is on record, and releases runtime resources.
+         * Only the first failure sticks: 'f' is recorded solely if the result still says success. */
+        if (m->result == MOUNT_SUCCESS)
+                m->result = f;
+
+        unit_log_result(u, m->result == MOUNT_SUCCESS, mount_result_to_string(m->result));
+        unit_warn_leftover_processes(u, unit_log_leftover_process_stop);
+
+        mount_set_state(m, m->result == MOUNT_SUCCESS ? MOUNT_DEAD : MOUNT_FAILED);
+
+        m->exec_runtime = exec_runtime_destroy(m->exec_runtime);
+        unit_destroy_runtime_data(u, &m->exec_context);
+        unit_unref_uid_gid(u, true);
+
+        /* Dependencies based on /proc/self/mountinfo are stale now: regenerate them from the .mount unit */
+        (void) mount_add_non_exec_dependencies(m);
+}
+
+static void mount_enter_mounted(Mount *m, MountResult f) {
+        assert(m);
+
+        /* Enters MOUNT_MOUNTED; 'f' is recorded only as long as the result on record is still success */
+        m->result = m->result == MOUNT_SUCCESS ? f : m->result;
+
+        mount_set_state(m, MOUNT_MOUNTED);
+}
+
+static void mount_enter_dead_or_mounted(Mount *m, MountResult f) {
+        assert(m);
+
+        /* Ultimately our state merely mirrors the kernel's. Hence, after any operation we executed, this
+         * is used to resynchronize: MOUNTED if the kernel lists the mount point, DEAD if it does not. */
+        if (!m->from_proc_self_mountinfo) {
+                mount_enter_dead(m, f);
+                return;
+        }
+
+        mount_enter_mounted(m, f);
+}
+
+static int state_to_kill_operation(MountState state) {
+        /* Maps a signalling state to the kill operation to use for it; all other states yield
+         * _KILL_OPERATION_INVALID. */
+
+        if (state == MOUNT_REMOUNTING_SIGTERM)
+                return KILL_RESTART;
+
+        if (state == MOUNT_UNMOUNTING_SIGTERM)
+                return KILL_TERMINATE;
+
+        /* The SIGKILL stage is the same for remounting and unmounting */
+        if (IN_SET(state, MOUNT_REMOUNTING_SIGKILL, MOUNT_UNMOUNTING_SIGKILL))
+                return KILL_KILL;
+
+        /* Not one of the signalling states */
+        return _KILL_OPERATION_INVALID;
+}
+
+static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
+        int r;
+
+        assert(m);
+        /* Kills the unit's processes as appropriate for 'state'; 'f' is recorded if the result is still success */
+        if (m->result == MOUNT_SUCCESS)
+                m->result = f;
+
+        r = unit_kill_context(
+                        UNIT(m),
+                        &m->kill_context,
+                        state_to_kill_operation(state),
+                        /* main_pid= */ NULL,
+                        &m->control_pid,
+                        /* main_pid_alien= */ false);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m");
+                goto fail;
+        }
+        /* NOTE(review): r > 0 presumably means there is something to wait for — confirm in unit_kill_context() */
+        if (r > 0) {
+                r = mount_arm_timer(m, /* relative= */ true, m->timeout_usec);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(m), r, "Failed to install timer: %m");
+                        goto fail;
+                }
+
+                mount_set_state(m, state);
+        } else if (state == MOUNT_REMOUNTING_SIGTERM && m->kill_context.send_sigkill) /* next stage right away */
+                mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS);
+        else if (IN_SET(state, MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL))
+                mount_enter_mounted(m, MOUNT_SUCCESS);
+        else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill) /* next stage right away */
+                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
+        else /* no further stage: settle on dead or mounted */
+                mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+
+        return;
+
+fail:
+        mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+}
+
+static int mount_set_umount_command(Mount *m, ExecCommand *c) {
+        int r;
+
+        assert(m);
+        assert(c);
+
+        /* Fills 'c' with the umount command line for our mount point: UMOUNT_PATH, the mount point and
+         * "-c" always, followed by "-l" and/or "-f" if lazy and/or forced unmounting is configured for
+         * the unit. */
+
+        r = exec_command_set(c, UMOUNT_PATH, m->where, "-c", NULL);
+        if (r < 0)
+                return r;
+
+        r = m->lazy_unmount ? exec_command_append(c, "-l", NULL) : 0;
+        if (r < 0)
+                return r;
+
+        r = m->force_unmount ? exec_command_append(c, "-f", NULL) : 0;
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static void mount_enter_unmounting(Mount *m) {
+        bool already_unmounting;
+        int r;
+
+        assert(m);
+
+        /* A fresh unmount request (i.e. we are not in one of the unmounting states yet) restarts the
+         * retry counter. */
+        already_unmounting = IN_SET(m->state,
+                                    MOUNT_UNMOUNTING,
+                                    MOUNT_UNMOUNTING_SIGTERM,
+                                    MOUNT_UNMOUNTING_SIGKILL);
+        if (!already_unmounting)
+                m->n_retry_umount = 0;
+
+        m->control_command_id = MOUNT_EXEC_UNMOUNT;
+        m->control_command = m->exec_command + MOUNT_EXEC_UNMOUNT;
+
+        r = mount_set_umount_command(m, m->control_command);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(m), r, "Failed to prepare umount command line: %m");
+                mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+                return;
+        }
+
+        mount_unwatch_control_pid(m);
+
+        r = mount_spawn(m, m->control_command, &m->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'umount' task: %m");
+                mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+                return;
+        }
+
+        mount_set_state(m, MOUNT_UNMOUNTING);
+}
+
+static int mount_set_mount_command(Mount *m, ExecCommand *c, const MountParameters *p) {
+        _cleanup_free_ char *filtered = NULL;
+        int r;
+
+        assert(m);
+        assert(c);
+        assert(p);
+
+        /* Start out with: mount <what> <where> */
+        r = exec_command_set(c, MOUNT_PATH, p->what, m->where, NULL);
+
+        /* Then add the switches requested by the unit's settings. Each step only runs if everything
+         * before it succeeded, so the first error is the one that gets returned. */
+        if (r >= 0 && m->sloppy_options)
+                r = exec_command_append(c, "-s", NULL);
+
+        if (r >= 0 && m->read_write_only)
+                r = exec_command_append(c, "-w", NULL);
+
+        if (r >= 0 && p->fstype)
+                r = exec_command_append(c, "-t", p->fstype, NULL);
+
+        if (r < 0)
+                return r;
+
+        /* Pass the option string through fstab_filter_options() with the names "nofail", "noauto" and
+         * "auto"; only its (non-empty) output is handed to mount via -o. */
+        r = fstab_filter_options(
+                        p->options, "nofail\0" "noauto\0" "auto\0",
+                        NULL, NULL, NULL, &filtered);
+        if (r < 0)
+                return r;
+
+        if (isempty(filtered))
+                return 0;
+
+        r = exec_command_append(c, "-o", filtered, NULL);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static void mount_enter_mounting(Mount *m) {
+        int r;
+        MountParameters *p;
+        bool source_is_dir = true; /* cleared below only if is_dir() returns 0 for a bind mount source */
+
+        assert(m);
+        /* Prepare the mount point, assemble and spawn the mount command, then enter MOUNT_MOUNTING. */
+        r = unit_fail_if_noncanonical(UNIT(m), m->where);
+        if (r < 0)
+                goto fail;
+
+        p = get_mount_parameters_fragment(m);
+        if (p && mount_is_bind(p)) {
+                r = is_dir(p->what, /* follow = */ true);
+                if (r < 0 && r != -ENOENT)
+                        log_unit_info_errno(UNIT(m), r, "Failed to determine type of bind mount source '%s', ignoring: %m", p->what);
+                else if (r == 0)
+                        source_is_dir = false;
+        }
+        /* Create the mount point: a directory by default, a file if the bind source is not a directory. */
+        if (source_is_dir)
+                r = mkdir_p_label(m->where, m->directory_mode);
+        else
+                r = touch_file(m->where, /* parents = */ true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
+        if (r < 0 && r != -EEXIST)
+                log_unit_warning_errno(UNIT(m), r, "Failed to create mount point '%s', ignoring: %m", m->where);
+
+        if (source_is_dir)
+                unit_warn_if_dir_nonempty(UNIT(m), m->where);
+        unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start);
+
+        m->control_command_id = MOUNT_EXEC_MOUNT;
+        m->control_command = m->exec_command + MOUNT_EXEC_MOUNT;
+
+        /* Create the source directory for bind-mounts if needed */
+        if (p && mount_is_bind(p)) {
+                r = mkdir_p_label(p->what, m->directory_mode);
+                /* mkdir_p_label() can return -EEXIST if the target path exists and is not a directory - which is
+                 * totally OK, in case the user wants us to overmount a non-directory inode. Also -EROFS can be
+                 * returned on read-only filesystem. Moreover, -EACCES (and also maybe -EPERM?) may be returned
+                 * when the path is on NFS. See issue #24120. All such errors will be logged in the debug level. */
+                if (r < 0 && r != -EEXIST)
+                        log_unit_full_errno(UNIT(m),
+                                            (r == -EROFS || ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_WARNING,
+                                            r, "Failed to make bind mount source '%s', ignoring: %m", p->what);
+        }
+
+        if (p) {
+                r = mount_set_mount_command(m, m->control_command, p);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(m), r, "Failed to prepare mount command line: %m");
+                        goto fail;
+                }
+        } else {
+                r = log_unit_warning_errno(UNIT(m), SYNTHETIC_ERRNO(ENOENT), "No mount parameters to operate on.");
+                goto fail;
+        }
+
+        mount_unwatch_control_pid(m);
+
+        r = mount_spawn(m, m->control_command, &m->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'mount' task: %m");
+                goto fail;
+        }
+
+        mount_set_state(m, MOUNT_MOUNTING);
+        return;
+
+fail: /* every hard failure above is reported as MOUNT_FAILURE_RESOURCES */
+        mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+}
+
+static void mount_set_reload_result(Mount *m, MountResult result) {
+        assert(m);
+
+        /* First failure wins: as soon as reload_result holds anything other than MOUNT_SUCCESS it is
+         * left alone, so that a later error (or a later success) cannot mask the one that was
+         * recorded first. */
+        if (m->reload_result == MOUNT_SUCCESS)
+                m->reload_result = result;
+}
+
+static void mount_enter_remounting(Mount *m) {
+        const char *remount_options;
+        MountParameters *p;
+        int r;
+
+        assert(m);
+
+        /* Every remount operation starts out with a clean reload result */
+        m->reload_result = MOUNT_SUCCESS;
+
+        m->control_command_id = MOUNT_EXEC_REMOUNT;
+        m->control_command = m->exec_command + MOUNT_EXEC_REMOUNT;
+
+        p = get_mount_parameters_fragment(m);
+        if (!p) {
+                r = log_unit_warning_errno(UNIT(m), SYNTHETIC_ERRNO(ENOENT), "No mount parameters to operate on.");
+                goto fail;
+        }
+
+        /* The configured options (if any) are passed along, prefixed with "remount". */
+        if (p->options)
+                remount_options = strjoina("remount,", p->options);
+        else
+                remount_options = "remount";
+
+        /* Build the command line step by step; a step is skipped once an earlier one failed. */
+        r = exec_command_set(m->control_command, MOUNT_PATH, p->what, m->where,
+                             "-o", remount_options, NULL);
+        if (r >= 0 && m->sloppy_options)
+                r = exec_command_append(m->control_command, "-s", NULL);
+        if (r >= 0 && m->read_write_only)
+                r = exec_command_append(m->control_command, "-w", NULL);
+        if (r >= 0 && p->fstype)
+                r = exec_command_append(m->control_command, "-t", p->fstype, NULL);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(m), r, "Failed to prepare remount command line: %m");
+                goto fail;
+        }
+
+        mount_unwatch_control_pid(m);
+
+        r = mount_spawn(m, m->control_command, &m->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'remount' task: %m");
+                goto fail;
+        }
+
+        mount_set_state(m, MOUNT_REMOUNTING);
+        return;
+
+fail:
+        /* The failure is stored as reload result; the state change itself is requested with MOUNT_SUCCESS. */
+        mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES);
+        mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+}
+
+static void mount_cycle_clear(Mount *m) {
+        assert(m);
+
+        /* A new start cycle begins: forget the results and the exec status recorded during the
+         * previous one, and request that accounting is reset too. */
+        UNIT(m)->reset_accounting = true;
+        exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
+        m->reload_result = MOUNT_SUCCESS;
+        m->result = MOUNT_SUCCESS;
+}
+
+static int mount_start(Unit *u) {
+        Mount *m = MOUNT(u);
+        int r;
+
+        assert(m);
+
+        switch (m->state) {
+        case MOUNT_UNMOUNTING:
+        case MOUNT_UNMOUNTING_SIGTERM:
+        case MOUNT_UNMOUNTING_SIGKILL:
+        case MOUNT_CLEANING:
+                /* Busy unmounting or cleaning: this cannot be fulfilled now, ask to retry later. */
+                return -EAGAIN;
+        case MOUNT_MOUNTING:
+        case MOUNT_MOUNTING_DONE:
+                /* A mount operation is under way already. */
+                return 0;
+        default:
+                /* Anything else has to be one of the two inactive states. */
+                assert(IN_SET(m->state, MOUNT_DEAD, MOUNT_FAILED));
+        }
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        mount_cycle_clear(m);
+        mount_enter_mounting(m);
+        return 1;
+}
+
+static int mount_stop(Unit *u) {
+        Mount *m = MOUNT(u);
+
+        assert(m);
+
+        /* A direct umount() call on the path may have left this unit's state outdated. Re-read
+         * mountinfo first, so that the decision below is based on current data. */
+        if (m->invalidated_state)
+                (void) mount_process_proc_self_mountinfo(u->manager);
+
+        switch (m->state) {
+
+        case MOUNT_MOUNTED:
+                /* The regular case: run umount. This is the only path that returns 1. */
+                mount_enter_unmounting(m);
+                return 1;
+
+        case MOUNT_UNMOUNTING:
+        case MOUNT_UNMOUNTING_SIGTERM:
+        case MOUNT_UNMOUNTING_SIGKILL:
+        case MOUNT_DEAD:
+        case MOUNT_FAILED:
+                /* Either an unmount is running already, or somebody else just unmounted it. */
+                break;
+
+        case MOUNT_MOUNTING:
+        case MOUNT_MOUNTING_DONE:
+        case MOUNT_REMOUNTING:
+                /* /bin/mount is still running: skip umount and go for the kill logic right away. */
+                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_SUCCESS);
+                break;
+
+        case MOUNT_REMOUNTING_SIGTERM:
+                /* A hung remount is being terminated; relabel it as the matching unmounting state. */
+                mount_set_state(m, MOUNT_UNMOUNTING_SIGTERM);
+                break;
+
+        case MOUNT_REMOUNTING_SIGKILL:
+                /* Same, for the SIGKILL stage. */
+                mount_set_state(m, MOUNT_UNMOUNTING_SIGKILL);
+                break;
+
+        case MOUNT_CLEANING:
+                /* Abort an ongoing clean operation, brutally. */
+                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+static int mount_reload(Unit *u) {
+        Mount *mount = MOUNT(u);
+
+        /* Reloading a mount unit means remounting it; the unit is required to be mounted here. */
+        assert(mount);
+        assert(mount->state == MOUNT_MOUNTED);
+
+        mount_enter_remounting(mount);
+        return 1;
+}
+
+static int mount_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Mount *mount = MOUNT(u);
+
+        assert(mount);
+        assert(f);
+        assert(fds);
+
+        /* Best-effort: the return values of the individual serialization calls are ignored. */
+        (void) serialize_item(f, "state", mount_state_to_string(mount->state));
+        (void) serialize_item(f, "result", mount_result_to_string(mount->result));
+        (void) serialize_item(f, "reload-result", mount_result_to_string(mount->reload_result));
+        (void) serialize_item_format(f, "n-retry-umount", "%u", mount->n_retry_umount);
+        (void) serialize_pidref(f, fds, "control-pid", &mount->control_pid);
+        if (mount->control_command_id >= 0) /* written only while a control command id is set */
+                (void) serialize_item(f, "control-command", mount_exec_command_to_string(mount->control_command_id));
+
+        return 0;
+}
+
+static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Mount *m = MOUNT(u);
+        int r;
+
+        assert(m);
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+
+        /* Unparsable values and unknown keys are merely logged at debug level, and 0 is returned
+         * regardless of what was found. */
+
+        if (streq(key, "state")) {
+                MountState state = mount_state_from_string(value);
+
+                if (state >= 0)
+                        m->deserialized_state = state;
+                else
+                        log_unit_debug_errno(u, state, "Failed to parse state value: %s", value);
+
+        } else if (streq(key, "result")) {
+                MountResult f = mount_result_from_string(value);
+
+                /* A serialized MOUNT_SUCCESS never overwrites what is stored already. */
+                if (f < 0)
+                        log_unit_debug_errno(u, f, "Failed to parse result value: %s", value);
+                else if (f != MOUNT_SUCCESS)
+                        m->result = f;
+
+        } else if (streq(key, "reload-result")) {
+                MountResult f = mount_result_from_string(value);
+
+                if (f < 0)
+                        log_unit_debug_errno(u, f, "Failed to parse reload result value: %s", value);
+                else if (f != MOUNT_SUCCESS)
+                        m->reload_result = f;
+
+        } else if (streq(key, "n-retry-umount")) {
+                r = safe_atou(value, &m->n_retry_umount);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to parse n-retry-umount value: %s", value);
+
+        } else if (streq(key, "control-pid")) {
+                /* Release whatever reference is held before taking over the deserialized one. */
+                pidref_done(&m->control_pid);
+                (void) deserialize_pidref(fds, value, &m->control_pid);
+
+        } else if (streq(key, "control-command")) {
+                MountExecCommand id = mount_exec_command_from_string(value);
+
+                if (id >= 0) {
+                        /* Restore the id together with the command pointer derived from it. */
+                        m->control_command_id = id;
+                        m->control_command = m->exec_command + id;
+                } else
+                        log_unit_debug_errno(u, id, "Failed to parse exec-command value: %s", value);
+
+        } else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+static UnitActiveState mount_active_state(Unit *u) {
+        /* Translate the mount-specific state into the generic unit active state via the table */
+        assert(u);
+        const Mount *m = MOUNT(u);
+        return state_translation_table[m->state];
+}
+
+static const char *mount_sub_state_to_string(Unit *u) {
+        /* The sub-state string is the name of the mount-specific state */
+        assert(u);
+        const Mount *m = MOUNT(u);
+        return mount_state_to_string(m->state);
+}
+
+static bool mount_may_gc(Unit *u) {
+        Mount *mount = MOUNT(u);
+
+        assert(mount);
+
+        /* A unit that is flagged as coming from /proc/self/mountinfo must not be garbage
+         * collected; every other mount unit may be, as far as this callback is
+         * concerned. */
+        return !mount->from_proc_self_mountinfo;
+}
+
+static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+        Mount *m = MOUNT(u);
+        MountResult f;
+        /* Handles the exit of the control process (mount/umount) and advances the state machine. */
+        assert(m);
+        assert(pid >= 0);
+        /* Only the control process is of interest here; the exit of any other PID is ignored. */
+        if (pid != m->control_pid.pid)
+                return;
+
+        /* So here's the thing, we really want to know before /usr/bin/mount or /usr/bin/umount exit whether
+         * they established/remove a mount. This is important when mounting, but even more so when unmounting
+         * since we need to deal with nested mounts and otherwise cannot safely determine whether to repeat
+         * the unmounts. In theory, the kernel fires /proc/self/mountinfo changes off before returning from
+         * the mount() or umount() syscalls, and thus we should see the changes to the proc file before we
+         * process the waitid() for the /usr/bin/(u)mount processes. However, this is unfortunately racy: we
+         * have to waitid() for processes using P_ALL (since we need to reap unexpected children that got
+         * reparented to PID 1), but when using P_ALL we might end up reaping processes that terminated just
+         * instants ago, i.e. already after our last event loop iteration (i.e. after the last point we might
+         * have noticed /proc/self/mountinfo events via epoll). This means event loop priorities for
+         * processing SIGCHLD vs. /proc/self/mountinfo IO events are not as relevant as we want. To fix that
+         * race, let's explicitly scan /proc/self/mountinfo before we start processing /usr/bin/(u)mount
+         * dying. It's ugly, but it makes our ordering systematic again, and makes sure we always see
+         * /proc/self/mountinfo changes before our mount/umount exits. */
+        (void) mount_process_proc_self_mountinfo(u->manager);
+
+        pidref_done(&m->control_pid);
+
+        if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+                f = MOUNT_SUCCESS;
+        else if (code == CLD_EXITED)
+                f = MOUNT_FAILURE_EXIT_CODE;
+        else if (code == CLD_KILLED)
+                f = MOUNT_FAILURE_SIGNAL;
+        else if (code == CLD_DUMPED)
+                f = MOUNT_FAILURE_CORE_DUMP;
+        else
+                assert_not_reached();
+
+        if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM))
+                mount_set_reload_result(m, f);
+        else if (m->result == MOUNT_SUCCESS)
+                m->result = f;
+
+        /* Log the exit while control_command_id still identifies the command that just finished; the
+         * field is invalidated right below, and logging after that would lose the command name. */
+        unit_log_process_exit(
+                        u, "Mount process",
+                        mount_exec_command_to_string(m->control_command_id),
+                        f == MOUNT_SUCCESS, code, status);
+
+        if (m->control_command) {
+                exec_status_exit(&m->control_command->exec_status, &m->exec_context, pid, code, status);
+
+                m->control_command = NULL;
+                m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+        }
+
+        /* Note that mountinfo was explicitly re-read above (on top of the I/O event priority logic),
+         * hence the state evaluated below is supposed to reflect the current mount table already. */
+
+        switch (m->state) {
+
+        case MOUNT_MOUNTING:
+                /* Our mount point has not appeared in mountinfo.  Something went wrong. */
+
+                if (f == MOUNT_SUCCESS) {
+                        /* Either /bin/mount has an unexpected definition of success,
+                         * or someone raced us and we lost. */
+                        log_unit_warning(UNIT(m), "Mount process finished, but there is no mount.");
+                        f = MOUNT_FAILURE_PROTOCOL;
+                }
+                mount_enter_dead(m, f);
+                break;
+
+        case MOUNT_MOUNTING_DONE:
+                mount_enter_mounted(m, f);
+                break;
+
+        case MOUNT_REMOUNTING:
+        case MOUNT_REMOUNTING_SIGTERM:
+        case MOUNT_REMOUNTING_SIGKILL:
+                mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+                break;
+
+        case MOUNT_UNMOUNTING:
+
+                if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) {
+
+                        /* Still a mount point? If so, let's try again. Most likely there were multiple mount points
+                         * stacked on top of each other. We might exceed the timeout specified by the user overall,
+                         * but we will stop as soon as any one umount times out. */
+
+                        if (m->n_retry_umount < RETRY_UMOUNT_MAX) {
+                                log_unit_debug(u, "Mount still present, trying again.");
+                                m->n_retry_umount++;
+                                mount_enter_unmounting(m);
+                        } else {
+                                log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount);
+                                mount_enter_mounted(m, f);
+                        }
+                } else
+                        mount_enter_dead_or_mounted(m, f);
+
+                break;
+
+        case MOUNT_UNMOUNTING_SIGKILL:
+        case MOUNT_UNMOUNTING_SIGTERM:
+                mount_enter_dead_or_mounted(m, f);
+                break;
+
+        case MOUNT_CLEANING:
+                if (m->clean_result == MOUNT_SUCCESS)
+                        m->clean_result = f;
+
+                mount_enter_dead(m, MOUNT_SUCCESS);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Notify clients about changed exit status */
+        unit_add_to_dbus_queue(u);
+}
+
+static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Mount *m = MOUNT(userdata);
+
+        assert(m);
+        assert(m->timer_event_source == source);
+        /* The timer of the current operation elapsed: escalate depending on the state. Returns 0. */
+        switch (m->state) {
+
+        case MOUNT_MOUNTING:
+        case MOUNT_MOUNTING_DONE:
+                log_unit_warning(UNIT(m), "Mounting timed out. Terminating.");
+                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT);
+                break;
+
+        case MOUNT_REMOUNTING:
+                log_unit_warning(UNIT(m), "Remounting timed out. Terminating remount process.");
+                mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+                mount_enter_signal(m, MOUNT_REMOUNTING_SIGTERM, MOUNT_SUCCESS);
+                break;
+
+        case MOUNT_REMOUNTING_SIGTERM:
+                mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+
+                if (m->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(m), "Remounting timed out. Killing.");
+                        mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS);
+                } else {
+                        log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring.");
+                        mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+                }
+                break;
+
+        case MOUNT_REMOUNTING_SIGKILL:
+                mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+
+                log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring.");
+                mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+                break;
+
+        case MOUNT_UNMOUNTING:
+                log_unit_warning(UNIT(m), "Unmounting timed out. Terminating.");
+                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT);
+                break;
+
+        case MOUNT_UNMOUNTING_SIGTERM:
+                if (m->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(m), "Mount process timed out. Killing.");
+                        mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring.");
+                        mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+                }
+                break;
+
+        case MOUNT_UNMOUNTING_SIGKILL:
+                log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring.");
+                mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+                break;
+
+        case MOUNT_CLEANING:
+                log_unit_warning(UNIT(m), "Cleaning timed out. killing.");
+
+                if (m->clean_result == MOUNT_SUCCESS)
+                        m->clean_result = MOUNT_FAILURE_TIMEOUT;
+                /* The timeout is tracked in clean_result; the unit result is passed as plain success. */
+                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+static int mount_setup_new_unit(
+                Manager *m,
+                const char *name,
+                const char *what,
+                const char *where,
+                const char *options,
+                const char *fstype,
+                MountProcFlags *ret_flags,
+                Unit **ret) {
+
+        _cleanup_(unit_freep) Unit *u = NULL;
+        Mount *mnt;
+        int r;
+
+        assert(m);
+        assert(name);
+        assert(ret_flags);
+        assert(ret);
+
+        r = unit_new_for_name(m, sizeof(Mount), name, &u);
+        if (r < 0)
+                return r;
+
+        mnt = MOUNT(u);
+        r = free_and_strdup(&u->source_path, "/proc/self/mountinfo");
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&mnt->where, where);
+        if (r < 0)
+                return r;
+
+        r = update_parameters_proc_self_mountinfo(mnt, what, options, fstype);
+        if (r < 0)
+                return r;
+
+        /* Remember that this unit exists because /proc/self/mountinfo listed it, so that the deps
+         * added when the unit file is loaded later on can be attributed to the right source. */
+        mnt->from_proc_self_mountinfo = true;
+
+        r = mount_add_non_exec_dependencies(mnt);
+        if (r < 0)
+                return r;
+
+        /* Only a stub exists so far; enqueue the unit for loading so that the rest gets filled in. */
+        unit_add_to_load_queue(u);
+
+        *ret_flags = MOUNT_PROC_IS_MOUNTED | MOUNT_PROC_JUST_MOUNTED | MOUNT_PROC_JUST_CHANGED;
+        *ret = TAKE_PTR(u);
+        return 0;
+}
+
+static int mount_setup_existing_unit(
+                Unit *u,
+                const char *what,
+                const char *where,
+                const char *options,
+                const char *fstype,
+                MountProcFlags *ret_flags) {
+
+        MountProcFlags flags;
+        Mount *mnt;
+        int r;
+
+        assert(u);
+        assert(ret_flags);
+
+        mnt = MOUNT(u);
+        if (!mnt->where) {
+                mnt->where = strdup(where);
+                if (!mnt->where)
+                        return -ENOMEM;
+        }
+
+        /* Several mounts may be stacked on the same mount point. The flags field is reset on each
+         * pass over /proc/self/mountinfo, so whatever is set in it already stems from the current
+         * pass (i.e. from another entry for the same mount point) and is merged into the new
+         * value. */
+        flags = mnt->proc_flags | MOUNT_PROC_IS_MOUNTED;
+
+        r = update_parameters_proc_self_mountinfo(mnt, what, options, fstype);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                flags |= MOUNT_PROC_JUST_CHANGED;
+
+        /* A mount counts as "just mounted" in two cases: it was not listed in /proc/self/mountinfo
+         * before, or the unit currently is in MOUNT_MOUNTING. The second check matters across a
+         * reload, where a fresh /proc/self/mountinfo meets unit state restored from serialization:
+         * MOUNT_MOUNTING is the state in which we wait for the mount to show up, hence seeing the
+         * mount while in it means it has been established only now. */
+        if (!mnt->from_proc_self_mountinfo || mnt->state == MOUNT_MOUNTING)
+                flags |= MOUNT_PROC_JUST_MOUNTED;
+
+        mnt->from_proc_self_mountinfo = true;
+
+        if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+                /* The unit could not be found or loaded earlier. Since it now shows up in
+                 * /proc/self/mountinfo it is reconsidered, i.e. marked as UNIT_LOADED. */
+                u->load_state = UNIT_LOADED;
+                u->load_error = 0;
+
+                flags |= MOUNT_PROC_JUST_CHANGED;
+        }
+
+        if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) {
+                /* Something changed, hence regenerate the deps. mount_add_non_exec_dependencies() is
+                 * expected to drop the automatic ones before adding the new set. */
+                r = mount_add_non_exec_dependencies(mnt);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret_flags = flags;
+        return 0;
+}
+
+static int mount_setup_unit(
+                Manager *m,
+                const char *what,
+                const char *where,
+                const char *options,
+                const char *fstype,
+                bool set_flags) {
+
+        _cleanup_free_ char *unit_name = NULL;
+        MountProcFlags flags;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(what);
+        assert(where);
+        assert(options);
+        assert(fstype);
+
+        /* Entries no unit is set up for: API mount points and ignored ones (they should never be
+         * referenced in dependencies), autofs mounts, and targets that are not paths (probably
+         * some kind of swap). */
+        if (mount_point_is_api(where) ||
+            mount_point_ignore(where) ||
+            streq(fstype, "autofs") ||
+            !is_path(where))
+                return 0;
+
+        /* The unit name is derived from the mount point path. */
+        r = unit_name_from_path(where, ".mount", &unit_name);
+        if (r < 0)
+                return log_struct_errno(
+                                LOG_WARNING, r,
+                                "MESSAGE_ID=" SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE_STR,
+                                "MOUNT_POINT=%s", where,
+                                LOG_MESSAGE("Failed to generate valid unit name from mount point path '%s', ignoring mount point: %m",
+                                            where));
+
+        u = manager_get_unit(m, unit_name);
+        if (!u) {
+                /* This mount point is seen for the first time, i.e. it was not initiated by a mount
+                 * unit but rather by the sysadmin having called mount(8) directly. */
+                r = mount_setup_new_unit(m, unit_name, what, where, options, fstype, &flags, &u);
+        } else
+                r = mount_setup_existing_unit(u, what, where, options, fstype, &flags);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to set up mount unit for '%s': %m", where);
+
+        /* Clients are notified whenever the mount changed its properties or was just
+         * established. */
+        if ((flags & (MOUNT_PROC_JUST_CHANGED|MOUNT_PROC_JUST_MOUNTED)) != 0)
+                unit_add_to_dbus_queue(u);
+
+        /* The flags are stored on the unit only if the caller asked for that. */
+        if (set_flags)
+                MOUNT(u)->proc_flags = flags;
+
+        return 0;
+}
+
+static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) {
+        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+        int r;
+
+        assert(m);
+
+        r = libmount_parse(NULL, NULL, &table, &iter);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m");
+
+        /* Walk the table; a failure to set up an individual unit does not abort the iteration. */
+        for (;;) {
+                const char *device, *path;
+                struct libmnt_fs *fs;
+
+                r = mnt_table_next_fs(table, iter, &fs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
+                if (r == 1) /* no further entries */
+                        break;
+
+                /* Entries lacking a source or a target are skipped. */
+                device = mnt_fs_get_source(fs);
+                path = mnt_fs_get_target(fs);
+                if (!device || !path)
+                        continue;
+
+                device_found_node(m, device, DEVICE_FOUND_MOUNT, DEVICE_FOUND_MOUNT);
+
+                (void) mount_setup_unit(m, device, path,
+                                        mnt_fs_get_options(fs), mnt_fs_get_fstype(fs), set_flags);
+        }
+
+        return 0;
+}
+
+static void mount_shutdown(Manager *m) {
+        assert(m);
+
+        /* Drop the event source first, then release the libmount monitor. */
+        m->mount_event_source = sd_event_source_disable_unref(m->mount_event_source);
+        /* TAKE_PTR() resets m->mount_monitor to NULL while handing the old value to the unref call. */
+        mnt_unref_monitor(TAKE_PTR(m->mount_monitor));
+}
+
+static int mount_get_timeout(Unit *u, usec_t *timeout) {
+        Mount *m = MOUNT(u);
+        usec_t deadline;
+        int r;
+
+        assert(m);
+        assert(u);
+        /* 1: finite time stored in *timeout; 0: no timer or infinite; < 0: querying the timer failed */
+        if (!m->timer_event_source)
+                return 0;
+
+        r = sd_event_source_get_time(m->timer_event_source, &deadline);
+        if (r < 0)
+                return r;
+        if (deadline == USEC_INFINITY)
+                return 0;
+
+        *timeout = deadline;
+        return 1;
+}
+
+static void mount_enumerate_perpetual(Manager *m) {
+        Unit *root;
+        int r;
+
+        assert(m);
+
+        /* The root directory is always around and cannot go away, so its mount unit is synthesized
+         * unconditionally here and marked perpetual. An already registered unit is reused; otherwise
+         * a fresh one is allocated. */
+
+        root = manager_get_unit(m, SPECIAL_ROOT_MOUNT);
+        r = root ? 0 : unit_new_for_name(m, sizeof(Mount), SPECIAL_ROOT_MOUNT, &root);
+        if (r < 0) {
+                log_error_errno(r, "Failed to allocate the special " SPECIAL_ROOT_MOUNT " unit: %m");
+                return;
+        }
+
+        root->perpetual = true;
+        MOUNT(root)->deserialized_state = MOUNT_MOUNTED;
+
+        /* Put the unit on the load queue and on the D-Bus queue. */
+        unit_add_to_load_queue(root);
+        unit_add_to_dbus_queue(root);
+}
+
+static bool mount_is_mounted(Mount *m) {
+        assert(m);
+        /* Perpetual units always count as mounted; others only if MOUNT_PROC_IS_MOUNTED is set. */
+        return UNIT(m)->perpetual || FLAGS_SET(m->proc_flags, MOUNT_PROC_IS_MOUNTED);
+}
+
+static int mount_on_ratelimit_expire(sd_event_source *s, void *userdata) {
+        Manager *manager = ASSERT_PTR(userdata);
+        Job *job;
+
+        /* Callback installed on the mount event source for when its rate limit expires. Jobs that were
+         * skipped while the limit was active are put back on the run queue; note that every job whose
+         * unit is a mount unit is re-queued here, not only start jobs. */
+        HASHMAP_FOREACH(job, manager->jobs) {
+                if (job->unit->type == UNIT_MOUNT)
+                        job_add_to_run_queue(job);
+        }
+
+        /* While rate limited, mount start jobs were not runnable. The limit is over now, so make
+         * sure they get dispatched in the next iteration. */
+        manager_trigger_run_queue(manager);
+
+        return 0;
+}
+
+static void mount_enumerate(Manager *m) {
+        int r;
+        /* Sets up the libmount monitor and its event source if that has not happened yet, then loads
+         * the current mount table. On any failure the monitoring is torn down via mount_shutdown(). */
+        assert(m);
+        mnt_init_debug(0);
+
+        if (!m->mount_monitor) {
+                unsigned mount_rate_limit_burst = 5;
+                int fd;
+
+                m->mount_monitor = mnt_new_monitor();
+                if (!m->mount_monitor) {
+                        log_oom();
+                        goto fail;
+                }
+
+                r = mnt_monitor_enable_kernel(m->mount_monitor, 1);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable watching of kernel mount events: %m");
+                        goto fail;
+                }
+
+                r = mnt_monitor_enable_userspace(m->mount_monitor, 1, NULL);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable watching of userspace mount events: %m");
+                        goto fail;
+                }
+
+                /* mnt_unref_monitor() will close the fd */
+                fd = r = mnt_monitor_get_fd(m->mount_monitor);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to acquire watch file descriptor: %m");
+                        goto fail;
+                }
+
+                r = sd_event_add_io(m->event, &m->mount_event_source, fd, EPOLLIN, mount_dispatch_io, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to watch mount file descriptor: %m");
+                        goto fail;
+                }
+
+                r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to adjust mount watch priority: %m");
+                        goto fail;
+                }
+
+                /* Let users override the default (5 in 1s), as it stalls the boot sequence on busy systems. */
+                const char *e = secure_getenv("SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST");
+                if (e) {
+                        r = safe_atou(e, &mount_rate_limit_burst);
+                        if (r < 0) /* the built-in default is used in this case */
+                                log_debug("Invalid value in $SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST, ignoring: %s", e);
+                }
+
+                r = sd_event_source_set_ratelimit(m->mount_event_source, 1 * USEC_PER_SEC, mount_rate_limit_burst);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable rate limit for mount events: %m");
+                        goto fail;
+                }
+                /* When the limit expires, mount_on_ratelimit_expire() re-queues the jobs of mount units. */
+                r = sd_event_source_set_ratelimit_expire_callback(m->mount_event_source, mount_on_ratelimit_expire);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set rate limit expiry callback for mount events: %m");
+                        goto fail;
+                }
+
+                (void) sd_event_source_set_description(m->mount_event_source, "mount-monitor-dispatch");
+        }
+
+        r = mount_load_proc_self_mountinfo(m, false);
+        if (r < 0)
+                goto fail;
+
+        return;
+
+fail:
+        mount_shutdown(m);
+}
+
+static int drain_libmount(Manager *m) {
+        bool seen_valid = false;
+
+        assert(m);
+
+        /* Consumes all pending libmount monitor events and reports whether any of them was valid.
+         *
+         * libmount also watches for /run/mount being created while that directory does not exist
+         * yet, and that mkdir may produce an event that is irrelevant for us.
+         *
+         * mnt_monitor_next_change(): error: r < 0; valid: r == 0; false positive: r == 1.
+         *
+         * Returns > 0 if a rescan is needed, 0 if not, negative on error. */
+        for (;;) {
+                int r = mnt_monitor_next_change(m->mount_monitor, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to drain libmount events: %m");
+                if (r != 0)
+                        return seen_valid;
+                seen_valid = true;
+        }
+}
+
+static int mount_process_proc_self_mountinfo(Manager *m) {
+        _cleanup_set_free_ Set *around = NULL, *gone = NULL;
+        const char *what;
+        int r;
+        /* Re-reads the mount table after monitor events and moves each mount unit's state to match it. */
+        assert(m);
+
+        r = drain_libmount(m);
+        if (r <= 0) /* error, or no valid event was pending */
+                return r;
+
+        r = mount_load_proc_self_mountinfo(m, true);
+        if (r < 0) {
+                /* Reset flags, just in case, for later calls */
+                LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT])
+                        MOUNT(u)->proc_flags = 0;
+
+                return 0; /* the callee already logged the failure; it is not propagated */
+        }
+
+        manager_dispatch_load_queue(m);
+
+        LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) {
+                Mount *mount = MOUNT(u);
+
+                mount->invalidated_state = false; /* the table was just re-read */
+
+                if (!mount_is_mounted(mount)) {
+
+                        /* A mount point is not around right now. It might be gone, or might never have
+                         * existed. */
+
+                        if (mount->from_proc_self_mountinfo &&
+                            mount->parameters_proc_self_mountinfo.what)
+                                /* Remember that this device might just have disappeared */
+                                if (set_put_strdup_full(&gone, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0)
+                                        log_oom(); /* we don't care too much about OOM here... */
+
+                        mount->from_proc_self_mountinfo = false;
+                        assert_se(update_parameters_proc_self_mountinfo(mount, NULL, NULL, NULL) >= 0);
+
+                        switch (mount->state) {
+
+                        case MOUNT_MOUNTED:
+                                /* This has just been unmounted by somebody else, follow the state change. */
+                                mount_enter_dead(mount, MOUNT_SUCCESS);
+                                break;
+
+                        case MOUNT_MOUNTING_DONE:
+                                /* The mount command may add the corresponding proc mountinfo entry and
+                                 * then remove it because of an internal error. E.g., fuse.sshfs seems
+                                 * to do that when the connection fails. See #17617. To handle such the
+                                 * case, let's once set the state back to mounting. Then, the unit can
+                                 * correctly enter the failed state later in mount_sigchld(). */
+                                mount_set_state(mount, MOUNT_MOUNTING);
+                                break;
+
+                        default:
+                                break;
+                        }
+
+                } else if (mount->proc_flags & (MOUNT_PROC_JUST_MOUNTED|MOUNT_PROC_JUST_CHANGED)) {
+
+                        /* A mount point was added or changed */
+
+                        switch (mount->state) {
+
+                        case MOUNT_DEAD:
+                        case MOUNT_FAILED:
+
+                                /* This has just been mounted by somebody else, follow the state change, but let's
+                                 * generate a new invocation ID for this implicitly and automatically. */
+                                (void) unit_acquire_invocation_id(u);
+                                mount_cycle_clear(mount);
+                                mount_enter_mounted(mount, MOUNT_SUCCESS);
+                                break;
+
+                        case MOUNT_MOUNTING:
+                                mount_set_state(mount, MOUNT_MOUNTING_DONE);
+                                break;
+
+                        default:
+                                /* Nothing really changed, but let's issue an notification call nonetheless,
+                                 * in case somebody is waiting for this. (e.g. file system ro/rw
+                                 * remounts.) */
+                                mount_set_state(mount, mount->state);
+                                break;
+                        }
+                }
+
+                if (mount_is_mounted(mount) &&
+                    mount->from_proc_self_mountinfo &&
+                    mount->parameters_proc_self_mountinfo.what)
+                        /* Track devices currently used */
+                        if (set_put_strdup_full(&around, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0)
+                                log_oom();
+
+                /* Reset the flags for later calls */
+                mount->proc_flags = 0;
+        }
+
+        SET_FOREACH(what, gone) {
+                if (set_contains(around, what)) /* still the source of a mount that is around */
+                        continue;
+
+                /* Let the device units know that the device is no longer mounted */
+                device_found_node(m, what, DEVICE_NOT_FOUND, DEVICE_FOUND_MOUNT);
+        }
+
+        return 0;
+}
+
+static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        /* I/O callback for the mount monitor fd; mount_enumerate() registers it with the Manager as userdata. */
+        assert(revents & EPOLLIN);
+
+        return mount_process_proc_self_mountinfo(m);
+}
+
+int mount_invalidate_state_by_path(Manager *manager, const char *path) {
+        _cleanup_free_ char *unit_name = NULL;
+        Unit *unit;
+        int r;
+        /* Flags the mount unit that corresponds to 'path' as having a possibly outdated state.
+         * Returns 0 on success, -ENOENT if no such unit is known, or the name conversion error. */
+        assert(manager);
+        assert(path);
+        r = unit_name_from_path(path, ".mount", &unit_name);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to generate unit name from path \"%s\", ignoring: %m", path);
+
+        unit = manager_get_unit(manager, unit_name);
+        if (unit) {
+                MOUNT(unit)->invalidated_state = true;
+                return 0;
+        }
+        return -ENOENT;
+}
+
+static void mount_reset_failed(Unit *u) {
+        Mount *mount = ASSERT_PTR(MOUNT(u));
+
+        /* Clears a previous failure: a unit in the MOUNT_FAILED state is switched to MOUNT_DEAD, and
+         * all three recorded results are reset to MOUNT_SUCCESS. */
+
+        if (mount->state == MOUNT_FAILED)
+                mount_set_state(mount, MOUNT_DEAD);
+
+        /* The results are reset after the state transition. */
+        mount->result = mount->reload_result = mount->clean_result = MOUNT_SUCCESS;
+}
+
+static PidRef* mount_control_pid(Unit *u) {
+        return &ASSERT_PTR(MOUNT(u))->control_pid; /* points into the Mount object itself, not a copy */
+}
+
+static int mount_clean(Unit *u, ExecCleanMask mask) {
+        _cleanup_strv_free_ char **l = NULL;
+        Mount *m = MOUNT(u);
+        int r;
+        /* Starts removal of the directories selected by 'mask'; only permitted in the MOUNT_DEAD state. */
+        assert(m);
+        assert(mask != 0);
+
+        if (m->state != MOUNT_DEAD)
+                return -EBUSY;
+
+        r = exec_context_get_clean_directories(&m->exec_context, u->manager->prefix, mask, &l);
+        if (r < 0)
+                return r;
+
+        if (strv_isempty(l)) /* no directory to remove */
+                return -EUNATCH;
+
+        mount_unwatch_control_pid(m);
+        m->clean_result = MOUNT_SUCCESS;
+        m->control_command = NULL;
+        m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+
+        r = mount_arm_timer(m, /* relative= */ true, m->exec_context.timeout_clean_usec);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to install timer: %m");
+                goto fail;
+        }
+
+        r = unit_fork_and_watch_rm_rf(u, l, &m->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+                goto fail;
+        }
+
+        mount_set_state(m, MOUNT_CLEANING);
+        return 0;
+
+fail:
+        m->clean_result = MOUNT_FAILURE_RESOURCES;
+        m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source); /* drop the timer again */
+        return r;
+}
+
+static int mount_can_clean(Unit *u, ExecCleanMask *ret) {
+        Mount *m = MOUNT(u);
+
+        assert(m);
+        /* Thin wrapper: queries the unit's exec context; 'ret' is passed through to the helper. */
+        return exec_context_get_clean_mask(&m->exec_context, ret);
+}
+
+static int mount_can_start(Unit *u) {
+        Mount *mount = ASSERT_PTR(MOUNT(u));
+        int r;
+
+        /* Returns 1 if the unit may be started. If the start limit test fails, mount_enter_dead() is
+         * invoked with MOUNT_FAILURE_START_LIMIT_HIT and the test's negative result is passed on. */
+
+        r = unit_test_start_limit(u);
+        if (r >= 0)
+                return 1;
+
+        mount_enter_dead(mount, MOUNT_FAILURE_START_LIMIT_HIT);
+        return r;
+}
+
+static int mount_subsystem_ratelimited(Manager *m) {
+        assert(m);
+
+        /* Without a mount event source the answer is false. Otherwise the result of
+         * sd_event_source_is_ratelimited() is returned as is. */
+        return m->mount_event_source
+                ? sd_event_source_is_ratelimited(m->mount_event_source) : false;
+}
+
+char* mount_get_what_escaped(const Mount *m) {
+        const char *what;
+
+        assert(m);
+
+        /* Returns a newly allocated string holding the mount source, passed through
+         * utf8_escape_invalid(). The value taken from /proc/self/mountinfo wins over the one from
+         * the unit fragment; if neither is available an empty string is returned. NULL is returned
+         * when the escaping helper or strdup() yields NULL. */
+
+        if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.what)
+                what = m->parameters_proc_self_mountinfo.what;
+        else if (m->from_fragment && m->parameters_fragment.what)
+                what = m->parameters_fragment.what;
+        else
+                return strdup("");
+
+        return utf8_escape_invalid(what);
+}
+
+char* mount_get_options_escaped(const Mount *m) {
+        const char *options;
+
+        assert(m);
+
+        /* Returns a newly allocated string holding the mount options, passed through
+         * utf8_escape_invalid(). The value taken from /proc/self/mountinfo wins over the one from
+         * the unit fragment; if neither is available an empty string is returned. NULL is returned
+         * when the escaping helper or strdup() yields NULL. */
+
+        if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.options)
+                options = m->parameters_proc_self_mountinfo.options;
+        else if (m->from_fragment && m->parameters_fragment.options)
+                options = m->parameters_fragment.options;
+        else
+                return strdup("");
+
+        return utf8_escape_invalid(options);
+}
+
+const char* mount_get_fstype(const Mount *m) {
+        const char *fstype = NULL;
+
+        assert(m);
+        /* Not a copy: the result points into 'm'. mountinfo data wins over the fragment; NULL if unset. */
+        if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.fstype)
+                fstype = m->parameters_proc_self_mountinfo.fstype;
+        else if (m->from_fragment && m->parameters_fragment.fstype)
+                fstype = m->parameters_fragment.fstype;
+        return fstype;
+}
+
+static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = {
+        [MOUNT_EXEC_MOUNT]   = "ExecMount",
+        [MOUNT_EXEC_UNMOUNT] = "ExecUnmount",
+        [MOUNT_EXEC_REMOUNT] = "ExecRemount",
+};
+/* Provides the mount_exec_command_to_string()/mount_exec_command_from_string() pair declared in mount.h. */
+DEFINE_STRING_TABLE_LOOKUP(mount_exec_command, MountExecCommand);
+
+static const char* const mount_result_table[_MOUNT_RESULT_MAX] = {
+        [MOUNT_SUCCESS]                 = "success",
+        [MOUNT_FAILURE_RESOURCES]       = "resources",
+        [MOUNT_FAILURE_TIMEOUT]         = "timeout",
+        [MOUNT_FAILURE_EXIT_CODE]       = "exit-code",
+        [MOUNT_FAILURE_SIGNAL]          = "signal",
+        [MOUNT_FAILURE_CORE_DUMP]       = "core-dump",
+        [MOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+        [MOUNT_FAILURE_PROTOCOL]        = "protocol",
+};
+/* Provides the mount_result_to_string()/mount_result_from_string() pair declared in mount.h. */
+DEFINE_STRING_TABLE_LOOKUP(mount_result, MountResult);
+
+const UnitVTable mount_vtable = { /* callbacks and metadata for the mount unit type */
+        .object_size = sizeof(Mount),
+        .exec_context_offset = offsetof(Mount, exec_context),
+        .cgroup_context_offset = offsetof(Mount, cgroup_context),
+        .kill_context_offset = offsetof(Mount, kill_context),
+        .exec_runtime_offset = offsetof(Mount, exec_runtime),
+
+        .sections =
+                "Unit\0"
+                "Mount\0"
+                "Install\0",
+        .private_section = "Mount", /* matches the "Mount" entry of .sections above */
+
+        .can_transient = true,
+        .can_fail = true,
+        .exclude_from_switch_root_serialization = true,
+
+        .init = mount_init,
+        .load = mount_load,
+        .done = mount_done,
+
+        .coldplug = mount_coldplug,
+        .catchup = mount_catchup,
+
+        .dump = mount_dump,
+
+        .start = mount_start,
+        .stop = mount_stop,
+        .reload = mount_reload,
+
+        .clean = mount_clean,
+        .can_clean = mount_can_clean,
+
+        .serialize = mount_serialize,
+        .deserialize_item = mount_deserialize_item,
+
+        .active_state = mount_active_state,
+        .sub_state_to_string = mount_sub_state_to_string,
+
+        .will_restart = unit_will_restart_default,
+
+        .may_gc = mount_may_gc,
+        .is_extrinsic = mount_is_extrinsic,
+
+        .sigchld_event = mount_sigchld_event,
+
+        .reset_failed = mount_reset_failed,
+
+        .control_pid = mount_control_pid,
+
+        .bus_set_property = bus_mount_set_property,
+        .bus_commit_properties = bus_mount_commit_properties,
+
+        .get_timeout = mount_get_timeout,
+
+        .enumerate_perpetual = mount_enumerate_perpetual,
+        .enumerate = mount_enumerate,
+        .shutdown = mount_shutdown, /* also called by mount_enumerate() on failure */
+        .subsystem_ratelimited = mount_subsystem_ratelimited,
+
+        .status_message_formats = {
+                .starting_stopping = {
+                        [0] = "Mounting %s...",
+                        [1] = "Unmounting %s...",
+                },
+                .finished_start_job = {
+                        [JOB_DONE]       = "Mounted %s.",
+                        [JOB_FAILED]     = "Failed to mount %s.",
+                        [JOB_TIMEOUT]    = "Timed out mounting %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Unmounted %s.",
+                        [JOB_FAILED]     = "Failed unmounting %s.",
+                        [JOB_TIMEOUT]    = "Timed out unmounting %s.",
+                },
+        },
+
+        .can_start = mount_can_start,
+
+        .notify_plymouth = true,
+};
diff --git a/src/core/mount.h b/src/core/mount.h
new file mode 100644
index 0000000..6712c16
--- /dev/null
+++ b/src/core/mount.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Mount Mount;
+
+#include "dynamic-user.h"
+#include "kill.h"
+#include "pidref.h"
+#include "unit.h"
+
+typedef enum MountExecCommand { /* indexes Mount.exec_command[] and the name table in mount.c */
+        MOUNT_EXEC_MOUNT,
+        MOUNT_EXEC_UNMOUNT,
+        MOUNT_EXEC_REMOUNT,
+        _MOUNT_EXEC_COMMAND_MAX, /* number of commands; used as array size */
+        _MOUNT_EXEC_COMMAND_INVALID = -EINVAL,
+} MountExecCommand;
+
+typedef enum MountResult { /* stored in Mount.result, .reload_result and .clean_result */
+        MOUNT_SUCCESS,
+        MOUNT_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */
+        MOUNT_FAILURE_TIMEOUT,
+        MOUNT_FAILURE_EXIT_CODE,
+        MOUNT_FAILURE_SIGNAL,
+        MOUNT_FAILURE_CORE_DUMP,
+        MOUNT_FAILURE_START_LIMIT_HIT, /* set by mount_can_start() when the start limit test fails */
+        MOUNT_FAILURE_PROTOCOL,
+        _MOUNT_RESULT_MAX, /* number of results; sizes the name table in mount.c */
+        _MOUNT_RESULT_INVALID = -EINVAL,
+} MountResult;
+
+typedef struct MountParameters { /* one set per origin: /proc/self/mountinfo and the unit fragment */
+        char *what;    /* the mount source; mount.c reports it to the device units */
+        char *options; /* mount options string */
+        char *fstype;  /* file system type */
+} MountParameters;
+
+/* Used while looking for mount points that vanished or got added from/to /proc/self/mountinfo */
+typedef enum MountProcFlags { /* bit mask kept in Mount.proc_flags; cleared again after each scan */
+        MOUNT_PROC_IS_MOUNTED   = 1 << 0,
+        MOUNT_PROC_JUST_MOUNTED = 1 << 1,
+        MOUNT_PROC_JUST_CHANGED = 1 << 2,
+} MountProcFlags;
+
+struct Mount {
+        Unit meta;
+
+        char *where; /* the mount point path */
+
+        MountParameters parameters_proc_self_mountinfo; /* preferred by the mount_get_*() getters */
+        MountParameters parameters_fragment;            /* fallback for the mount_get_*() getters */
+
+        bool invalidated_state:1; /* Set when the 'state' of the mount unit may be outdated, and we need to
+                                   * re-read /proc/self/mountinfo. */
+        bool from_proc_self_mountinfo:1; /* cleared when the mount is found to be gone */
+        bool from_fragment:1;
+
+        MountProcFlags proc_flags; /* scratch flags of a mountinfo scan, reset to 0 afterwards */
+
+        bool sloppy_options;
+
+        bool lazy_unmount;
+        bool force_unmount;
+
+        bool read_write_only;
+
+        MountResult result;        /* the three results are reset by mount_reset_failed() */
+        MountResult reload_result;
+        MountResult clean_result;
+
+        mode_t directory_mode;
+
+        usec_t timeout_usec;
+
+        ExecCommand exec_command[_MOUNT_EXEC_COMMAND_MAX]; /* indexed by MountExecCommand */
+
+        ExecContext exec_context;
+        KillContext kill_context;
+        CGroupContext cgroup_context;
+
+        ExecRuntime *exec_runtime;
+
+        MountState state, deserialized_state;
+
+        ExecCommand* control_command;
+        MountExecCommand control_command_id;
+        PidRef control_pid; /* e.g. the cleaning task forked by mount_clean() */
+
+        sd_event_source *timer_event_source; /* queried by mount_get_timeout() */
+
+        unsigned n_retry_umount;
+};
+
+extern const UnitVTable mount_vtable;
+/* NOTE(review): no definition of mount_fd_event() is visible in the reviewed part of mount.c - confirm it is still needed. */
+void mount_fd_event(Manager *m, int events);
+
+int mount_invalidate_state_by_path(Manager *manager, const char *path);
+/* The *_escaped() getters return newly allocated strings; mount_get_fstype() returns a pointer into 'm'. */
+char* mount_get_what_escaped(const Mount *m);
+char* mount_get_options_escaped(const Mount *m);
+const char* mount_get_fstype(const Mount *m);
+
+const char* mount_exec_command_to_string(MountExecCommand i) _const_;
+MountExecCommand mount_exec_command_from_string(const char *s) _pure_;
+
+const char* mount_result_to_string(MountResult i) _const_;
+MountResult mount_result_from_string(const char *s) _pure_;
+/* Provides the MOUNT() cast from Unit* to Mount* that mount.c uses throughout. */
+DEFINE_CAST(MOUNT, Mount);
diff --git a/src/core/namespace.c b/src/core/namespace.c
new file mode 100644
index 0000000..88681aa
--- /dev/null
+++ b/src/core/namespace.c
@@ -0,0 +1,3047 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#if WANT_LINUX_FS_H
+#include 
+#endif
+
+#include "alloc-util.h"
+#include "base-filesystem.h"
+#include "chase.h"
+#include "dev-setup.h"
+#include "devnum-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "extension-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "glyph-util.h"
+#include "label-util.h"
+#include "list.h"
+#include "lock-util.h"
+#include "loop-util.h"
+#include "loopback-setup.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "namespace.h"
+#include "nsflags.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) /* NOTE(review): users lie outside this excerpt; by name, flags for /dev mounts - confirm */
+
+typedef enum MountMode { /* stored in MountEntry.mode; names are in mount_mode_table[] */
+        /* This is ordered by priority! */
+        MOUNT_INACCESSIBLE,
+        MOUNT_OVERLAY,
+        MOUNT_IMAGE,
+        MOUNT_BIND,
+        MOUNT_BIND_RECURSIVE,
+        MOUNT_PRIVATE_TMP,
+        MOUNT_PRIVATE_TMP_READ_ONLY,
+        MOUNT_PRIVATE_DEV,
+        MOUNT_BIND_DEV,
+        MOUNT_EMPTY_DIR,
+        MOUNT_PRIVATE_SYSFS,
+        MOUNT_BIND_SYSFS,
+        MOUNT_PROCFS,
+        MOUNT_READ_ONLY,
+        MOUNT_READ_WRITE,
+        MOUNT_NOEXEC,
+        MOUNT_EXEC,
+        MOUNT_TMPFS,
+        MOUNT_RUN,
+        MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */
+        MOUNT_EXTENSION_IMAGE,     /* Mounted outside the root directory, and used by subsequent mounts */
+        MOUNT_MQUEUEFS,
+        MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
+        _MOUNT_MODE_MAX,           /* Number of modes; sizes mount_mode_table[] */
+        _MOUNT_MODE_INVALID = -EINVAL,
+} MountMode;
+
+typedef enum MountEntryState { /* kept in MountEntry.state: whether an entry was already processed or skipped */
+        MOUNT_PENDING,
+        MOUNT_APPLIED,
+        MOUNT_SKIPPED,
+        _MOUNT_ENTRY_STATE_MAX,
+        _MOUNT_ENTRY_STATE_INVALID = -EINVAL,
+} MountEntryState;
+
+typedef struct MountEntry { /* NB: the static tables below initialize path_const, mode and ignore positionally */
+        const char *path_const;   /* Memory allocated on stack or static */
+        MountMode mode;           /* What kind of mount this entry requests */
+        bool ignore:1;            /* Ignore if path does not exist? */
+        bool has_prefix:1;        /* Already is prefixed by the root dir? */
+        bool read_only:1;         /* Shall this mount point be read-only? */
+        bool nosuid:1;            /* Shall set MS_NOSUID on the mount itself */
+        bool noexec:1;            /* Shall set MS_NOEXEC on the mount itself */
+        bool exec:1;              /* Shall clear MS_NOEXEC on the mount itself */
+        MountEntryState state;    /* Whether it was already processed or skipped */
+        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
+        const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
+        char *unprefixed_path_malloc;
+        const char *source_const; /* The source path, for bind mounts or images */
+        char *source_malloc;
+        const char *options_const;/* Mount options for tmpfs */
+        char *options_malloc;
+        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
+        unsigned n_followed;
+        LIST_HEAD(MountOptions, image_options_const);
+        char **overlay_layers;
+} MountEntry;
+
+typedef struct MountList {
+        MountEntry *mounts;
+        size_t n_mounts; /* presumably the number of elements in 'mounts' - confirm against the users */
+} MountList;
+
+/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into it, but only as a fallback if the user hasn't mounted
+ * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
+static const MountEntry apivfs_table[] = {
+        { "/proc",               MOUNT_PROCFS,       false },
+        { "/dev",                MOUNT_BIND_DEV,     false },
+        { "/sys",                MOUNT_BIND_SYSFS,   false },
+        { "/run",                MOUNT_RUN,          false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
+};
+
+/* ProtectKernelTunables= option and the related filesystem APIs: /proc part (every entry is ignored if its path is missing) */
+static const MountEntry protect_kernel_tunables_proc_table[] = {
+        { "/proc/acpi",          MOUNT_READ_ONLY,           true  },
+        { "/proc/apm",           MOUNT_READ_ONLY,           true  }, /* Obsolete API, there's no point in permitting access to this, ever */
+        { "/proc/asound",        MOUNT_READ_ONLY,           true  },
+        { "/proc/bus",           MOUNT_READ_ONLY,           true  },
+        { "/proc/fs",            MOUNT_READ_ONLY,           true  },
+        { "/proc/irq",           MOUNT_READ_ONLY,           true  },
+        { "/proc/kallsyms",      MOUNT_INACCESSIBLE,        true  },
+        { "/proc/kcore",         MOUNT_INACCESSIBLE,        true  },
+        { "/proc/latency_stats", MOUNT_READ_ONLY,           true  },
+        { "/proc/mtrr",          MOUNT_READ_ONLY,           true  },
+        { "/proc/scsi",          MOUNT_READ_ONLY,           true  },
+        { "/proc/sys",           MOUNT_READ_ONLY,           true  },
+        { "/proc/sysrq-trigger", MOUNT_READ_ONLY,           true  },
+        { "/proc/timer_stats",   MOUNT_READ_ONLY,           true  },
+};
+
+static const MountEntry protect_kernel_tunables_sys_table[] = { /* ProtectKernelTunables= option: /sys part */
+        { "/sys",                MOUNT_READ_ONLY,           false },
+        { "/sys/fs/bpf",         MOUNT_READ_ONLY,           true  },
+        { "/sys/fs/cgroup",      MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
+        { "/sys/fs/selinux",     MOUNT_READ_WRITE_IMPLICIT, true  },
+        { "/sys/kernel/debug",   MOUNT_READ_ONLY,           true  },
+        { "/sys/kernel/tracing", MOUNT_READ_ONLY,           true  },
+};
+
+/* ProtectKernelModules= option: makes the kernel module directory inaccessible (ignored if the path is missing) */
+static const MountEntry protect_kernel_modules_table[] = {
+        { "/usr/lib/modules",    MOUNT_INACCESSIBLE, true  },
+};
+
+/* ProtectKernelLogs= option: /proc part */
+static const MountEntry protect_kernel_logs_proc_table[] = {
+        { "/proc/kmsg",          MOUNT_INACCESSIBLE, true },
+};
+
+static const MountEntry protect_kernel_logs_dev_table[] = { /* ProtectKernelLogs= option: /dev part */
+        { "/dev/kmsg",           MOUNT_INACCESSIBLE, true },
+};
+
+/*
+ * ProtectHome=read-only table: protects $HOME and $XDG_RUNTIME_DIR; the rest of the
+ * system should be protected by ProtectSystem=
+ */
+static const MountEntry protect_home_read_only_table[] = {
+        { "/home",               MOUNT_READ_ONLY,     true  },
+        { "/run/user",           MOUNT_READ_ONLY,     true  },
+        { "/root",               MOUNT_READ_ONLY,     true  },
+};
+
+/* ProtectHome=tmpfs table: read-only tmpfs instances over the home directories (ignored if the path is missing) */
+static const MountEntry protect_home_tmpfs_table[] = {
+        { "/home",               MOUNT_TMPFS,        true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+        { "/run/user",           MOUNT_TMPFS,        true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+        { "/root",               MOUNT_TMPFS,        true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+};
+
+/* ProtectHome=yes table: the home directories are made inaccessible (ignored if the path is missing) */
+static const MountEntry protect_home_yes_table[] = {
+        { "/home",               MOUNT_INACCESSIBLE, true  },
+        { "/run/user",           MOUNT_INACCESSIBLE, true  },
+        { "/root",               MOUNT_INACCESSIBLE, true  },
+};
+
+/* ProtectSystem=yes table: /usr must exist, while /boot and /efi are ignored if missing */
+static const MountEntry protect_system_yes_table[] = {
+        { "/usr",                MOUNT_READ_ONLY,     false },
+        { "/boot",               MOUNT_READ_ONLY,     true  },
+        { "/efi",                MOUNT_READ_ONLY,     true  },
+};
+
+/* ProtectSystem=full includes the ProtectSystem=yes entries and adds /etc */
+static const MountEntry protect_system_full_table[] = {
+        { "/usr",                MOUNT_READ_ONLY,     false },
+        { "/boot",               MOUNT_READ_ONLY,     true  },
+        { "/efi",                MOUNT_READ_ONLY,     true  },
+        { "/etc",                MOUNT_READ_ONLY,     false },
+};
+
+/* ProtectSystem=strict table. In this strict mode, we mount everything read-only, except for /proc, /dev,
+ * /sys which are the kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal. (And of course /home and friends are also
+ * left writable, as ProtectHome= shall manage those, orthogonally). The exceptions use the lowest-priority
+ * mode MOUNT_READ_WRITE_IMPLICIT. */
+static const MountEntry protect_system_strict_table[] = {
+        { "/",                   MOUNT_READ_ONLY,          false },
+        { "/proc",               MOUNT_READ_WRITE_IMPLICIT, false },      /* ProtectKernelTunables= */
+        { "/sys",                MOUNT_READ_WRITE_IMPLICIT, false },      /* ProtectKernelTunables= */
+        { "/dev",                MOUNT_READ_WRITE_IMPLICIT, false },      /* PrivateDevices= */
+        { "/home",               MOUNT_READ_WRITE_IMPLICIT, true  },      /* ProtectHome= */
+        { "/run/user",           MOUNT_READ_WRITE_IMPLICIT, true  },      /* ProtectHome= */
+        { "/root",               MOUNT_READ_WRITE_IMPLICIT, true  },      /* ProtectHome= */
+};
+
+/* ProtectHostname=yes table: marks the two /proc/sys/kernel files for host name and domain name
+ * read-only. */
+static const MountEntry protect_hostname_table[] = {
+        { "/proc/sys/kernel/hostname",   MOUNT_READ_ONLY, false },
+        { "/proc/sys/kernel/domainname", MOUNT_READ_ONLY, false },
+};
+
+static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { /* MountMode value -> human-readable name; presumably the backing table of mount_mode_to_string() */
+        [MOUNT_INACCESSIBLE]          = "inaccessible",
+        [MOUNT_OVERLAY]               = "overlay",
+        [MOUNT_IMAGE]                 = "image",
+        [MOUNT_BIND]                  = "bind",
+        [MOUNT_BIND_RECURSIVE]        = "bind-recursive",
+        [MOUNT_PRIVATE_TMP]           = "private-tmp",
+        [MOUNT_PRIVATE_TMP_READ_ONLY] = "private-tmp-read-only",
+        [MOUNT_PRIVATE_DEV]           = "private-dev",
+        [MOUNT_BIND_DEV]              = "bind-dev",
+        [MOUNT_EMPTY_DIR]             = "empty-dir",
+        [MOUNT_PRIVATE_SYSFS]         = "private-sysfs",
+        [MOUNT_BIND_SYSFS]            = "bind-sysfs",
+        [MOUNT_PROCFS]                = "procfs",
+        [MOUNT_READ_ONLY]             = "read-only",
+        [MOUNT_READ_WRITE]            = "read-write",
+        [MOUNT_NOEXEC]                = "noexec",
+        [MOUNT_EXEC]                  = "exec",
+        [MOUNT_TMPFS]                 = "tmpfs",
+        [MOUNT_RUN]                   = "run",
+        [MOUNT_EXTENSION_DIRECTORY]   = "extension-directory",
+        [MOUNT_EXTENSION_IMAGE]       = "extension-image",
+        [MOUNT_MQUEUEFS]              = "mqueuefs",
+        [MOUNT_READ_WRITE_IMPLICIT]   = "read-write-implicit",
+};
+
+/* Helper table, indexed by image class (sysext or confext), that holds the name of the class-specific
+ * level field twice: once as the bare key (level_env) and once pre-formatted as " KEY=" (level_env_print),
+ * so that users of the table do not have to spell out either string themselves. */
+static const struct {
+        const char *level_env;
+        const char *level_env_print;
+} image_class_info[_IMAGE_CLASS_MAX] = {
+        [IMAGE_SYSEXT] = {
+                .level_env = "SYSEXT_LEVEL",
+                .level_env_print = " SYSEXT_LEVEL=",
+        },
+        [IMAGE_CONFEXT] = {
+                .level_env = "CONFEXT_LEVEL",
+                .level_env_print = " CONFEXT_LEVEL=",
+        }
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode); /* presumably generates the file-local mount_mode_to_string() used by the debug logging in this file */
+
+static const char *mount_entry_path(const MountEntry *p) {
+        /* Yields the path of this mount entry: the heap-allocated ->path_malloc if that is set, otherwise
+         * the borrowed ->path_const. */
+
+        assert(p);
+
+        if (p->path_malloc)
+                return p->path_malloc;
+
+        return p->path_const;
+}
+
+static const char *mount_entry_unprefixed_path(const MountEntry *p) {
+        /* Yields the path the entry had before mount_entry_consume_prefix() replaced it. If no such path
+         * was recorded, the current path is returned instead. */
+
+        assert(p);
+
+        if (p->unprefixed_path_malloc)
+                return p->unprefixed_path_malloc;
+
+        if (p->unprefixed_path_const)
+                return p->unprefixed_path_const;
+
+        return mount_entry_path(p);
+}
+
+static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
+        bool had_heap_path;
+
+        assert(p);
+        assert(p->path_malloc || p->path_const);
+        assert(new_path);
+
+        /* Remembers the entry's current path as its unprefixed path and installs new_path (whose
+         * ownership passes to the entry) as the current one. */
+
+        had_heap_path = !!p->path_malloc;
+
+        /* Move the heap path (if any) over, releasing whatever unprefixed heap path was stored before */
+        free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
+
+        /* Without a heap path the current path must be the constant one, so remember that instead */
+        if (!had_heap_path)
+                p->unprefixed_path_const = p->path_const;
+
+        p->has_prefix = true;
+        p->path_malloc = new_path;
+}
+
+static bool mount_entry_read_only(const MountEntry *p) {
+        /* An entry is read-only if it is flagged so explicitly, or if its mode implies it. */
+
+        assert(p);
+
+        if (p->read_only)
+                return true;
+
+        switch (p->mode) {
+
+        case MOUNT_READ_ONLY:
+        case MOUNT_INACCESSIBLE:
+        case MOUNT_PRIVATE_TMP_READ_ONLY:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+static bool mount_entry_noexec(const MountEntry *p) {
+        /* An entry is noexec if it is flagged so explicitly, or if its mode implies it. */
+
+        assert(p);
+
+        if (p->noexec)
+                return true;
+
+        return p->mode == MOUNT_NOEXEC ||
+                p->mode == MOUNT_INACCESSIBLE ||
+                p->mode == MOUNT_PRIVATE_SYSFS ||
+                p->mode == MOUNT_BIND_SYSFS ||
+                p->mode == MOUNT_PROCFS;
+}
+
+static bool mount_entry_exec(const MountEntry *p) {
+        /* An entry is exec if it is flagged so explicitly, or if its mode is MOUNT_EXEC. */
+
+        assert(p);
+
+        if (p->exec)
+                return true;
+
+        return p->mode == MOUNT_EXEC;
+}
+
+static const char *mount_entry_source(const MountEntry *p) {
+        /* Yields the mount source: the heap-allocated variant if set, otherwise the borrowed one (which
+         * may be NULL). */
+
+        assert(p);
+
+        if (p->source_malloc)
+                return p->source_malloc;
+
+        return p->source_const;
+}
+
+static const char *mount_entry_options(const MountEntry *p) {
+        /* Yields the mount option string: the heap-allocated variant if set, otherwise the borrowed one
+         * (which may be NULL). */
+
+        assert(p);
+
+        if (p->options_malloc)
+                return p->options_malloc;
+
+        return p->options_const;
+}
+
+static void mount_entry_done(MountEntry *p) {
+        /* Releases all memory owned by the entry and clears the owning pointers. The *_const fields are
+         * only borrowed and hence left untouched. */
+
+        assert(p);
+
+        p->overlay_layers = strv_free(p->overlay_layers);
+        p->options_malloc = mfree(p->options_malloc);
+        p->source_malloc = mfree(p->source_malloc);
+        p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
+        p->path_malloc = mfree(p->path_malloc);
+}
+
+static void mount_list_done(MountList *ml) {
+        /* Releases every entry of the list, then the array itself, leaving an empty list behind. */
+
+        assert(ml);
+
+        for (size_t i = 0; ml->mounts && i < ml->n_mounts; i++)
+                mount_entry_done(ml->mounts + i);
+
+        ml->n_mounts = 0;
+        ml->mounts = mfree(ml->mounts);
+}
+
+static MountEntry *mount_list_extend(MountList *ml) {
+        MountEntry *slot;
+
+        /* Grows the list by one entry and returns a pointer to the new slot for the caller to fill in, or
+         * NULL if the array could not be enlarged (in which case the list is left unchanged). */
+
+        assert(ml);
+
+        if (!GREEDY_REALLOC0(ml->mounts, ml->n_mounts+1))
+                return NULL;
+
+        slot = ml->mounts + ml->n_mounts;
+        ml->n_mounts++;
+
+        return slot;
+}
+
+static int append_access_mounts(MountList *ml, char **strv, MountMode mode, bool forcibly_require_prefix) {
+        /* Appends one entry of the given access mode (READ_WRITE, READ_WRITE_IMPLICIT, READ_ONLY or
+         * INACCESSIBLE) per user-supplied path. Each path may start with "-" (sets ->ignore on the entry),
+         * optionally followed by "+" (the entry is recorded as not yet carrying the root prefix). The
+         * entries borrow the strings of strv. Returns 0 on success, a negative errno-style value if a
+         * path is not absolute or the list cannot be grown. */
+
+        assert(ml);
+
+        STRV_FOREACH(item, strv) {
+                const char *path = *item, *rest;
+                bool ignore = false, needs_prefix = false;
+
+                /* Strip the optional markers: "-" first, then "+" */
+                rest = startswith(path, "-");
+                if (rest) {
+                        path = rest;
+                        ignore = true;
+                }
+
+                rest = startswith(path, "+");
+                if (rest) {
+                        path = rest;
+                        needs_prefix = true;
+                }
+
+                if (!path_is_absolute(path))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", path);
+
+                MountEntry *entry = mount_list_extend(ml);
+                if (!entry)
+                        return log_oom_debug();
+
+                *entry = (MountEntry) {
+                        .path_const = path,
+                        .mode = mode,
+                        .ignore = ignore,
+                        .has_prefix = !(needs_prefix || forcibly_require_prefix),
+                };
+        }
+
+        return 0;
+}
+
+static int append_empty_dir_mounts(MountList *ml, char **strv) {
+        /* Appends a read-only MOUNT_EMPTY_DIR entry for each listed path, so that the path shows up as a
+         * readable but empty directory. This is primarily used to implement the "/private/" boundary
+         * directories for DynamicUser=1. The entries borrow the strings of strv. */
+
+        assert(ml);
+
+        STRV_FOREACH(dir, strv) {
+                MountEntry *entry = mount_list_extend(ml);
+
+                if (!entry)
+                        return log_oom_debug();
+
+                *entry = (MountEntry) {
+                        .path_const = *dir,
+                        .mode = MOUNT_EMPTY_DIR,
+                        .ignore = false,
+                        .read_only = true,
+                        .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
+                        .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
+                };
+        }
+
+        return 0;
+}
+
+static int append_bind_mounts(MountList *ml, const BindMount *binds, size_t n) {
+        /* Appends one (recursive or plain) bind mount entry per element of binds. The entries borrow the
+         * source and destination strings of the BindMount objects. */
+
+        assert(ml);
+        assert(binds || n == 0);
+
+        for (size_t i = 0; i < n; i++) {
+                const BindMount *bind = binds + i;
+
+                MountEntry *entry = mount_list_extend(ml);
+                if (!entry)
+                        return log_oom_debug();
+
+                *entry = (MountEntry) {
+                        .path_const = bind->destination,
+                        .mode = bind->recursive ? MOUNT_BIND_RECURSIVE : MOUNT_BIND,
+                        .read_only = bind->read_only,
+                        .nosuid = bind->nosuid,
+                        .source_const = bind->source,
+                        .ignore = bind->ignore_enoent,
+                };
+        }
+
+        return 0;
+}
+
+static int append_mount_images(MountList *ml, const MountImage *mount_images, size_t n) {
+        /* Appends one MOUNT_IMAGE entry per element of mount_images. The entries borrow the source,
+         * destination and mount options of the MountImage objects. */
+
+        assert(ml);
+        assert(mount_images || n == 0);
+
+        for (size_t i = 0; i < n; i++) {
+                const MountImage *image = mount_images + i;
+
+                MountEntry *entry = mount_list_extend(ml);
+                if (!entry)
+                        return log_oom_debug();
+
+                *entry = (MountEntry) {
+                        .path_const = image->destination,
+                        .mode = MOUNT_IMAGE,
+                        .source_const = image->source,
+                        .image_options_const = image->mount_options,
+                        .ignore = image->ignore_enoent,
+                };
+        }
+
+        return 0;
+}
+
+static int append_extensions(
+                MountList *ml,
+                const char *root,
+                const char *extension_dir,
+                char **hierarchies,
+                const MountImage *mount_images,
+                size_t n,
+                char **extension_directories) {
+        /* Appends the entries needed for extension images and extension directories: one
+         * MOUNT_EXTENSION_IMAGE entry per element of mount_images and one MOUNT_EXTENSION_DIRECTORY entry
+         * per element of extension_directories, each placed at "<extension_dir>/<running index>", followed
+         * by one MOUNT_OVERLAY entry per hierarchy at "<root>/<hierarchy>" whose layers are the matching
+         * sub-directories of all extension mount points. Returns 0 on success (also if there are no
+         * extensions or no hierarchies), a negative errno-style value otherwise. */
+        char ***overlays = NULL;
+        size_t n_overlays = 0;
+        int r;
+
+        assert(ml);
+
+        if (n == 0 && strv_isempty(extension_directories))
+                return 0;
+
+        assert(extension_dir);
+
+        n_overlays = strv_length(hierarchies);
+        if (n_overlays == 0)
+                return 0;
+
+        /* Prepare a list of overlays, that will have as each element a strv containing all the layers that
+         * will later be concatenated as a lowerdir= parameter for the mount operation.
+         * The overlays vector will have the same number of elements and will correspond to the
+         * hierarchies vector, so they can be iterated upon together. */
+        overlays = new0(char**, n_overlays);
+        if (!overlays)
+                return -ENOMEM;
+
+        CLEANUP_ARRAY(overlays, n_overlays, strv_free_many);
+
+        /* First, prepare a mount for each image, but these won't be visible to the unit, instead
+         * they will be mounted in our propagate directory, and used as a source for the overlay. */
+        for (size_t i = 0; i < n; i++) {
+                _cleanup_free_ char *mount_point = NULL;
+                const MountImage *m = mount_images + i;
+
+                if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0)
+                        return -ENOMEM;
+
+                for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
+                        char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
+                        if (!prefixed_hierarchy)
+                                return -ENOMEM;
+
+                        r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
+                        if (r < 0)
+                                return r;
+                }
+
+                MountEntry *me = mount_list_extend(ml);
+                if (!me)
+                        return -ENOMEM;
+
+                *me = (MountEntry) {
+                        .path_malloc = TAKE_PTR(mount_point),
+                        .image_options_const = m->mount_options,
+                        .ignore = m->ignore_enoent,
+                        .source_const = m->source,
+                        .mode = MOUNT_EXTENSION_IMAGE,
+                        .has_prefix = true,
+                };
+        }
+
+        /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
+         * Bind mount them in the same location as the ExtensionImages, so that we
+         * can check that they are valid trees (extension-release.d). */
+        STRV_FOREACH(extension_directory, extension_directories) {
+                _cleanup_free_ char *mount_point = NULL, *source = NULL;
+                const char *e = *extension_directory;
+                bool ignore_enoent = false;
+
+                /* Pick up the counter where the ExtensionImages left it: n is advanced for every directory,
+                 * so that the numbering continues after the last image. */
+                if (asprintf(&mount_point, "%s/%zu", extension_dir, n++) < 0)
+                        return -ENOMEM;
+
+                /* Look for any prefixes: a leading "-" sets the ignore flag of the entry */
+                if (startswith(e, "-")) {
+                        e++;
+                        ignore_enoent = true;
+                }
+                /* A leading "+" is accepted and skipped, but otherwise ignored for now */
+                if (startswith(e, "+"))
+                        e++;
+
+                source = strdup(e);
+                if (!source)
+                        return -ENOMEM;
+
+                for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
+                        char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
+                        if (!prefixed_hierarchy)
+                                return -ENOMEM;
+
+                        r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
+                        if (r < 0)
+                                return r;
+                }
+
+                MountEntry *me = mount_list_extend(ml);
+                if (!me)
+                        return -ENOMEM;
+
+                *me = (MountEntry) {
+                        .path_malloc = TAKE_PTR(mount_point),
+                        .source_malloc = TAKE_PTR(source),
+                        .mode = MOUNT_EXTENSION_DIRECTORY,
+                        .ignore = ignore_enoent,
+                        .has_prefix = true,
+                        .read_only = true,
+                };
+        }
+
+        /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
+         * set up earlier. Ownership of each layer list moves into the overlay entry. */
+        for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
+                _cleanup_free_ char *prefixed_hierarchy = NULL;
+
+                prefixed_hierarchy = path_join(root, hierarchies[i]);
+                if (!prefixed_hierarchy)
+                        return -ENOMEM;
+
+                MountEntry *me = mount_list_extend(ml);
+                if (!me)
+                        return -ENOMEM;
+
+                *me = (MountEntry) {
+                        .path_malloc = TAKE_PTR(prefixed_hierarchy),
+                        .overlay_layers = TAKE_PTR(overlays[i]),
+                        .mode = MOUNT_OVERLAY,
+                        .has_prefix = true,
+                        .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
+                };
+        }
+
+        return 0;
+}
+
+static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, size_t n) {
+        /* Appends one MOUNT_TMPFS entry per element of tmpfs. The configured options are appended to a
+         * fixed set of default options and then split into mount flags and an option string. A read-only
+         * request is taken out of the flags and recorded in ->read_only of the entry instead. */
+
+        assert(ml);
+        assert(tmpfs || n == 0);
+
+        for (size_t i = 0; i < n; i++) {
+                const TemporaryFileSystem *t = tmpfs + i;
+                _cleanup_free_ char *mangled = NULL, *joined = NULL;
+                unsigned long flags;
+                bool read_only;
+                int r;
+
+                if (!path_is_absolute(t->path))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", t->path);
+
+                joined = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
+                if (!joined)
+                        return -ENOMEM;
+
+                r = mount_option_mangle(joined, MS_NODEV|MS_STRICTATIME, &flags, &mangled);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse mount option '%s': %m", joined);
+
+                read_only = (flags & MS_RDONLY) != 0;
+                flags &= ~(unsigned long) MS_RDONLY;
+
+                MountEntry *entry = mount_list_extend(ml);
+                if (!entry)
+                        return log_oom_debug();
+
+                *entry = (MountEntry) {
+                        .path_const = t->path,
+                        .mode = MOUNT_TMPFS,
+                        .read_only = read_only,
+                        .options_malloc = TAKE_PTR(mangled),
+                        .flags = flags,
+                };
+        }
+
+        return 0;
+}
+
+static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t n, bool ignore_protect) {
+        assert(ml);
+        assert(mounts || n == 0);
+
+        /* Appends copies of a table of static, pre-defined entries to the list. Each entry is copied in
+         * full, so that settings carried by the table (for example ->read_only, ->options_const and
+         * ->flags of tmpfs entries) are not lost. If ignore_protect is true, ->ignore is forced on for
+         * every appended entry. Returns 0 on success, or the result of log_oom_debug() if the list cannot
+         * be grown. */
+
+        FOREACH_ARRAY(m, mounts, n) {
+                MountEntry *me = mount_list_extend(ml);
+                if (!me)
+                        return log_oom_debug();
+
+                /* Static tables must not carry owned (heap) data: after a plain struct copy two entries
+                 * would otherwise release the same memory in mount_entry_done(). */
+                assert(m->path_const);
+                assert(!m->path_malloc);
+                assert(!m->unprefixed_path_malloc);
+                assert(!m->source_malloc);
+                assert(!m->options_malloc);
+                assert(!m->overlay_layers);
+
+                *me = *m;
+                me->ignore = me->ignore || ignore_protect;
+        }
+
+        return 0;
+}
+
+static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) {
+        const MountEntry *table = NULL;
+        size_t n_table = 0;
+
+        /* Appends the static table that belongs to the selected ProtectHome= setting, if any. */
+
+        assert(ml);
+
+        switch (protect_home) {
+
+        case PROTECT_HOME_NO:
+                return 0;
+
+        case PROTECT_HOME_READ_ONLY:
+                table = protect_home_read_only_table;
+                n_table = ELEMENTSOF(protect_home_read_only_table);
+                break;
+
+        case PROTECT_HOME_TMPFS:
+                table = protect_home_tmpfs_table;
+                n_table = ELEMENTSOF(protect_home_tmpfs_table);
+                break;
+
+        case PROTECT_HOME_YES:
+                table = protect_home_yes_table;
+                n_table = ELEMENTSOF(protect_home_yes_table);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return append_static_mounts(ml, table, n_table, ignore_protect);
+}
+
+static int append_protect_system(MountList *ml, ProtectSystem protect_system, bool ignore_protect) {
+        const MountEntry *table = NULL;
+        size_t n_table = 0;
+
+        /* Appends the static table that belongs to the selected ProtectSystem= setting, if any. */
+
+        assert(ml);
+
+        switch (protect_system) {
+
+        case PROTECT_SYSTEM_NO:
+                return 0;
+
+        case PROTECT_SYSTEM_STRICT:
+                table = protect_system_strict_table;
+                n_table = ELEMENTSOF(protect_system_strict_table);
+                break;
+
+        case PROTECT_SYSTEM_YES:
+                table = protect_system_yes_table;
+                n_table = ELEMENTSOF(protect_system_yes_table);
+                break;
+
+        case PROTECT_SYSTEM_FULL:
+                table = protect_system_full_table;
+                n_table = ELEMENTSOF(protect_system_full_table);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return append_static_mounts(ml, table, n_table, ignore_protect);
+}
+
+static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
+        int r;
+
+        /* Sort order of the mount list: extension image entries come first, extension directory entries
+         * second (other mounts use them as a base, and they are set up in the propagate directory anyway,
+         * so their path does not matter here). Everything else is ordered by path, so that prefixes come
+         * first, and entries for the same path are ordered by mode. */
+
+        r = CMP(b->mode == MOUNT_EXTENSION_IMAGE, a->mode == MOUNT_EXTENSION_IMAGE);
+        if (r != 0)
+                return r;
+
+        r = CMP(b->mode == MOUNT_EXTENSION_DIRECTORY, a->mode == MOUNT_EXTENSION_DIRECTORY);
+        if (r != 0)
+                return r;
+
+        r = path_compare(mount_entry_path(a), mount_entry_path(b));
+        if (r != 0)
+                return r;
+
+        return CMP((int) a->mode, (int) b->mode);
+}
+
+static int prefix_where_needed(MountList *ml, const char *root_directory) {
+        /* Puts the root directory in front of the path of every entry that is not marked as carrying the
+         * prefix already. Returns 0 on success, -ENOMEM if a path could not be allocated. */
+
+        assert(ml);
+
+        for (size_t i = 0; ml->mounts && i < ml->n_mounts; i++) {
+                MountEntry *entry = ml->mounts + i;
+
+                if (!entry->has_prefix) {
+                        char *prefixed = path_join(root_directory, mount_entry_path(entry));
+                        if (!prefixed)
+                                return -ENOMEM;
+
+                        /* The entry takes ownership of the new string and remembers the old path */
+                        mount_entry_consume_prefix(entry, prefixed);
+                }
+        }
+
+        return 0;
+}
+
+static void drop_duplicates(MountList *ml) {
+        MountEntry *kept_last = NULL;
+        size_t n_kept = 0;
+
+        assert(ml);
+
+        /* Compacts the list in place, removing every entry whose path equals that of the entry kept right
+         * before it. Requires the array to be sorted already, see mount_path_compare(): of a group of
+         * equal paths the first entry survives. Only entries that are both still pending are merged. */
+
+        for (size_t i = 0; i < ml->n_mounts; i++) {
+                MountEntry *cur = ml->mounts + i;
+                bool duplicate =
+                        kept_last &&
+                        path_equal(mount_entry_path(cur), mount_entry_path(kept_last)) &&
+                        cur->state == MOUNT_PENDING && kept_last->state == MOUNT_PENDING;
+
+                if (!duplicate) {
+                        kept_last = ml->mounts + n_kept++;
+                        *kept_last = *cur;
+                        continue;
+                }
+
+                log_debug("%s (%s) is duplicate.", mount_entry_path(cur), mount_mode_to_string(cur->mode));
+
+                /* The surviving entry inherits the access flags of the one we drop */
+                kept_last->read_only = kept_last->read_only || mount_entry_read_only(cur);
+                kept_last->noexec = kept_last->noexec || mount_entry_noexec(cur);
+                kept_last->exec = kept_last->exec || mount_entry_exec(cur);
+
+                mount_entry_done(cur);
+        }
+
+        ml->n_mounts = n_kept;
+}
+
+static void drop_inaccessible(MountList *ml) {
+        const char *masking_path = NULL;
+        size_t n_kept = 0;
+
+        assert(ml);
+
+        /* Compacts the list in place, removing every entry located below the most recently kept entry if
+         * that one is INACCESSIBLE: an inaccessible path hides its whole subtree anyway. Requires the
+         * array to be sorted already. */
+
+        for (size_t i = 0; i < ml->n_mounts; i++) {
+                MountEntry *cur = ml->mounts + i;
+
+                if (masking_path && path_startswith(mount_entry_path(cur), masking_path)) {
+                        log_debug("%s is masked by %s.", mount_entry_path(cur), masking_path);
+                        mount_entry_done(cur);
+                        continue;
+                }
+
+                /* Only an INACCESSIBLE entry masks what follows, any other kept entry resets the mask */
+                if (cur->mode == MOUNT_INACCESSIBLE)
+                        masking_path = mount_entry_path(cur);
+                else
+                        masking_path = NULL;
+
+                ml->mounts[n_kept++] = *cur;
+        }
+
+        ml->n_mounts = n_kept;
+}
+
+static void drop_nop(MountList *ml) {
+        size_t n_kept = 0;
+
+        assert(ml);
+
+        /* Compacts the list in place, removing READ_ONLY, READ_WRITE and READ_WRITE_IMPLICIT entries whose
+         * closest preceding kept entry with a prefix path has the very same mode, as such entries change
+         * nothing. Assumes the list is ordered by prefixes. */
+
+        for (size_t i = 0; i < ml->n_mounts; i++) {
+                MountEntry *cur = ml->mounts + i;
+
+                /* Entries of any other mode are never suppressed */
+                if (IN_SET(cur->mode, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_READ_WRITE_IMPLICIT)) {
+                        MountEntry *parent = NULL;
+
+                        /* Walk the entries kept so far backwards, stopping at the first prefix of ours */
+                        for (size_t j = n_kept; j > 0 && !parent; j--) {
+                                MountEntry *candidate = ml->mounts + j - 1;
+
+                                if (path_startswith(mount_entry_path(cur), mount_entry_path(candidate)))
+                                        parent = candidate;
+                        }
+
+                        if (parent && parent->mode == cur->mode) {
+                                log_debug("%s (%s) is made redundant by %s (%s)",
+                                          mount_entry_path(cur), mount_mode_to_string(cur->mode),
+                                          mount_entry_path(parent), mount_mode_to_string(parent->mode));
+                                mount_entry_done(cur);
+                                continue;
+                        }
+                }
+
+                ml->mounts[n_kept++] = *cur;
+        }
+
+        ml->n_mounts = n_kept;
+}
+
+static void drop_outside_root(MountList *ml, const char *root_directory) {
+        size_t n_kept = 0;
+
+        assert(ml);
+
+        /* Compacts the list in place, removing every entry whose path is not below the root directory.
+         * Without a root directory there is nothing to do. */
+        if (!root_directory)
+                return;
+
+        for (size_t i = 0; i < ml->n_mounts; i++) {
+                MountEntry *cur = ml->mounts + i;
+
+                /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host,
+                 * hence they are always kept */
+                bool extension_base = IN_SET(cur->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY);
+
+                if (extension_base || path_startswith(mount_entry_path(cur), root_directory)) {
+                        ml->mounts[n_kept++] = *cur;
+                        continue;
+                }
+
+                log_debug("%s is outside of root directory.", mount_entry_path(cur));
+                mount_entry_done(cur);
+        }
+
+        ml->n_mounts = n_kept;
+}
+
+static int clone_device_node(
+                const char *d,
+                const char *temporary_mount,
+                bool *make_devnode) {
+
+        _cleanup_free_ char *sl = NULL;
+        const char *dn, *bn, *t;
+        struct stat st;
+        int r;
+
+        /* Makes the device node d available at the same path below temporary_mount. If *make_devnode is
+         * true a real device node is created with mknod() first. If that fails with EPERM, *make_devnode
+         * is set to false (so that later calls skip the attempt) and the node is bind mounted onto a
+         * regular file instead. For nodes below /dev/ a /dev/char/ or /dev/block/ symlink named after the
+         * device number is created as well, on a best-effort basis.
+         *
+         * Returns 0 on success, -ENXIO if d does not exist, -EINVAL if d is not a block or character
+         * device, and another negative errno-style value on other failures. */
+
+        if (stat(d, &st) < 0) {
+                if (errno == ENOENT) {
+                        log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
+                        return -ENXIO;
+                }
+
+                return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
+        }
+
+        if (!S_ISBLK(st.st_mode) &&
+            !S_ISCHR(st.st_mode))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Device node '%s' to clone is not a device node, ignoring.",
+                                       d);
+
+        dn = strjoina(temporary_mount, d);
+
+        /* First, try to create device node properly */
+        if (*make_devnode) {
+                mac_selinux_create_file_prepare(d, st.st_mode);
+                /* Capture the error right away, so that the label cleanup below cannot clobber errno
+                 * before we look at it. */
+                r = mknod(dn, st.st_mode, st.st_rdev) < 0 ? -errno : 0;
+                mac_selinux_create_file_clear();
+                if (r >= 0)
+                        goto add_symlink;
+                if (r != -EPERM)
+                        return log_debug_errno(r, "mknod failed for %s: %m", d);
+
+                /* This didn't work, let's not try this again for the next iterations. */
+                *make_devnode = false;
+        }
+
+        /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
+         * Do not prepare device-node SELinux label (see issue 13762) */
+        r = mknod(dn, S_IFREG, 0);
+        if (r < 0 && errno != EEXIST)
+                return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
+
+        /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
+         * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
+         * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
+        r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
+
+add_symlink:
+        /* Nodes that are not located below /dev/ get no device number symlink */
+        bn = path_startswith(d, "/dev/");
+        if (!bn)
+                return 0;
+
+        /* Create symlinks like /dev/char/1:9 → ../urandom */
+        if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
+                     temporary_mount,
+                     S_ISCHR(st.st_mode) ? "char" : "block",
+                     DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
+                return log_oom_debug();
+
+        (void) mkdir_parents(sl, 0755);
+
+        t = strjoina("../", bn);
+        if (symlink(t, sl) < 0)
+                log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
+
+        return 0;
+}
+
+static char *settle_runtime_dir(RuntimeScope scope) {
+        char *dir = NULL;
+
+        /* Returns a newly allocated string naming the runtime directory for the given scope: the per-user
+         * directory of the effective UID for the user scope, "/run/" for anything else. Returns NULL if
+         * the allocation fails. */
+
+        if (scope == RUNTIME_SCOPE_USER) {
+                if (asprintf(&dir, "/run/user/" UID_FMT, geteuid()) < 0)
+                        return NULL;
+
+                return dir;
+        }
+
+        return strdup("/run/");
+}
+
+static int create_temporary_mount_point(RuntimeScope scope, char **ret) {
+        _cleanup_free_ char *base = NULL, *tmpl = NULL;
+
+        /* Creates a uniquely named "systemd/namespace-XXXXXX" directory below the runtime directory of
+         * the given scope. On success 0 is returned and *ret is set to the newly allocated path of the
+         * directory, which the caller then owns. On failure a negative errno-style value is returned and
+         * *ret is left untouched. */
+
+        assert(ret);
+
+        base = settle_runtime_dir(scope);
+        if (!base)
+                return log_oom_debug();
+
+        tmpl = path_join(base, "systemd/namespace-XXXXXX");
+        if (!tmpl)
+                return log_oom_debug();
+
+        /* mkdtemp() fills in the XXXXXX part of the template in place */
+        if (!mkdtemp(tmpl))
+                return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", tmpl);
+
+        *ret = TAKE_PTR(tmpl);
+        return 0;
+}
+
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
+        static const char devnodes[] =
+                "/dev/null\0"
+                "/dev/zero\0"
+                "/dev/full\0"
+                "/dev/random\0"
+                "/dev/urandom\0"
+                "/dev/tty\0";
+
+        _cleanup_free_ char *temporary_mount = NULL;
+        const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
+        bool can_mknod = true;
+        int r;
+
+        assert(m);
+        /* Builds a minimal private /dev on a fresh tmpfs inside a temporary directory and then moves it
+         * onto the path of the mount entry. The new tree gets bind mounts of the host's /dev/pts and
+         * /dev/shm (mandatory) and of /dev/mqueue and /dev/hugepages (best-effort), /dev/ptmx as a symlink
+         * or a clone, a /dev/log symlink, and clones of the nodes listed in devnodes. Returns 1 on
+         * success, a negative errno-style value on failure. */
+        r = create_temporary_mount_point(scope, &temporary_mount);
+        if (r < 0)
+                return r;
+
+        dev = strjoina(temporary_mount, "/dev");
+        (void) mkdir(dev, 0755);
+        r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
+        if (r < 0)
+                goto fail;
+
+        r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev);
+                goto fail;
+        }
+
+        devpts = strjoina(temporary_mount, "/dev/pts");
+        (void) mkdir(devpts, 0755);
+        r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
+        if (r < 0)
+                goto fail;
+
+        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
+         * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
+         * Thus, in that case make a clone.
+         * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
+        r = is_symlink("/dev/ptmx");
+        if (r < 0) {
+                log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
+                goto fail;
+        } else if (r > 0) {
+                devptmx = strjoina(temporary_mount, "/dev/ptmx");
+                if (symlink("pts/ptmx", devptmx) < 0) {
+                        r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
+                        goto fail;
+                }
+        } else {
+                r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
+                if (r < 0)
+                        goto fail;
+        }
+
+        devshm = strjoina(temporary_mount, "/dev/shm");
+        (void) mkdir(devshm, 0755);
+        r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
+        if (r < 0)
+                goto fail;
+
+        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
+        (void) mkdir(devmqueue, 0755);
+        (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
+
+        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
+        (void) mkdir(devhugepages, 0755);
+        (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
+
+        devlog = strjoina(temporary_mount, "/dev/log");
+        if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
+                log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
+
+        NULSTR_FOREACH(d, devnodes) {
+                r = clone_device_node(d, temporary_mount, &can_mknod);
+                /* clone_device_node() returns -ENXIO if the *source* node does not exist, skip creation in
+                 * that case. Any other error is fatal. */
+                if (r < 0 && r != -ENXIO)
+                        goto fail;
+        }
+
+        r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
+        if (r < 0)
+                log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
+
+        /* Create the /dev directory if missing. It is more likely to be missing when the service is started
+         * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
+        (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+        /* Unmount everything in old /dev (failure to do so is only logged) */
+        r = umount_recursive(mount_entry_path(m), 0);
+        if (r < 0)
+                log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
+
+        r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
+        if (r < 0)
+                goto fail;
+
+        (void) rmdir(dev);
+        (void) rmdir(temporary_mount);
+
+        return 1;
+
+fail: /* Best-effort teardown: unmount whatever was set up so far, then remove the temporary directories */
+        if (devpts)
+                (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
+
+        if (devshm)
+                (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
+
+        if (devhugepages)
+                (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
+
+        if (devmqueue)
+                (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
+
+        (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
+        (void) rmdir(dev);
+        (void) rmdir(temporary_mount);
+
+        return r;
+}
+
+/* Little brother of mount_private_dev(): recursively bind mounts the host's /dev onto the path of the
+ * mount entry. This is only used when RootDirectory= is set.
+ *
+ * Returns 1 if the bind mount was established, 0 if the target already is a mount point and was left
+ * alone, and a negative value on failure. */
+static int mount_bind_dev(const MountEntry *m) {
+        const char *where;
+        int k;
+
+        assert(m);
+
+        where = mount_entry_path(m);
+
+        /* Best effort: make sure the target directory is there. */
+        (void) mkdir_p_label(where, 0755);
+
+        k = path_is_mount_point(where, NULL, 0);
+        if (k < 0)
+                return log_debug_errno(k, "Unable to determine whether /dev is already mounted: %m");
+        if (k != 0)
+                return 0; /* Something is mounted on /dev already, turn this into a NOP. */
+
+        k = mount_nofollow_verbose(LOG_DEBUG, "/dev", where, NULL, MS_BIND|MS_REC, NULL);
+        return k < 0 ? k : 1;
+}
+
+/* Recursively bind mounts the host's /sys onto the path of the mount entry, so that all child mounts
+ * of it come along as well.
+ *
+ * Returns 1 if the bind mount was established, 0 if the target already is a mount point and was left
+ * alone, and a negative value on failure. */
+static int mount_bind_sysfs(const MountEntry *m) {
+        const char *where;
+        int k;
+
+        assert(m);
+
+        where = mount_entry_path(m);
+
+        /* Best effort: make sure the target directory is there. */
+        (void) mkdir_p_label(where, 0755);
+
+        k = path_is_mount_point(where, NULL, 0);
+        if (k < 0)
+                return log_debug_errno(k, "Unable to determine whether /sys is already mounted: %m");
+        if (k != 0)
+                return 0; /* Something is mounted on /sys already, turn this into a NOP. */
+
+        k = mount_nofollow_verbose(LOG_DEBUG, "/sys", where, NULL, MS_BIND|MS_REC, NULL);
+        return k < 0 ? k : 1;
+}
+
+/* Mounts a fresh instance of the API file system 'fstype' on 'entry_path'.
+ *
+ * The new instance is first mounted on a temporary mount point (created for 'scope') and then moved
+ * to 'entry_path', after whatever was mounted there before has been unmounted. If mounting with
+ * 'opts' fails with EINVAL the mount is retried without options. If we lack the privileges to mount
+ * a new instance, an already existing mount on 'entry_path' is used as is, or else 'bind_source' is
+ * recursively bind mounted there.
+ *
+ * Returns 1 if something was mounted on 'entry_path', 0 if the existing mount is reused, and a
+ * negative value on failure. */
+static int mount_private_apivfs(
+                const char *fstype,
+                const char *entry_path,
+                const char *bind_source,
+                const char *opts,
+                RuntimeScope scope) {
+
+        const unsigned long instance_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
+        _cleanup_(rmdir_and_freep) char *staging = NULL;
+        int k;
+
+        assert(fstype);
+        assert(entry_path);
+        assert(bind_source);
+
+        (void) mkdir_p_label(entry_path, 0755);
+
+        /* Check first whether we are privileged enough to mount a new instance at all. A new sysfs
+         * instance cannot be placed on top of an existing mount, hence do this in a temporary place. */
+        k = create_temporary_mount_point(scope, &staging);
+        if (k < 0)
+                return k;
+
+        k = mount_nofollow_verbose(LOG_DEBUG, fstype, staging, fstype, instance_flags, opts);
+        if (opts && k == -EINVAL)
+                /* EINVAL most likely means the kernel does not know the textual hidepid= values for
+                 * procfs, and hence does not support per-instance hidepid= either. In that case we
+                 * really do not want to use it, as it would affect the host's /proc mount. Gracefully
+                 * fall back to a classic, unrestricted instance. */
+                k = mount_nofollow_verbose(LOG_DEBUG, fstype, staging, fstype, instance_flags, /* opts = */ NULL);
+
+        if (ERRNO_IS_NEG_PRIVILEGE(k)) {
+                /* Not enough privileges for a new instance: fall back to an existing mount. */
+
+                k = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0);
+                if (k < 0)
+                        return log_debug_errno(k, "Unable to determine whether '%s' is already mounted: %m", entry_path);
+                if (k != 0)
+                        return 0; /* There is a mount already, use it as is. */
+
+                /* Nothing is mounted there yet, and we may not mount a new instance. We can access the
+                 * host's one though, so as last resort bind mount that to the destination. Most likely
+                 * we are running inside a user manager in an unprivileged user namespace. */
+                k = mount_nofollow_verbose(LOG_DEBUG, bind_source, entry_path, /* fstype = */ NULL, MS_BIND|MS_REC, /* opts = */ NULL);
+                return k < 0 ? k : 1;
+        }
+        if (k < 0)
+                return k;
+
+        /* The new instance exists now. Get rid of an existing mount and its submounts. */
+        k = umount_recursive(entry_path, /* flags = */ 0);
+        if (k < 0)
+                log_debug_errno(k, "Failed to unmount directories below '%s', ignoring: %m", entry_path);
+
+        /* Now move the new instance into place. */
+        k = mount_nofollow_verbose(LOG_DEBUG, staging, entry_path, /* fstype = */ NULL, MS_MOVE, /* opts = */ NULL);
+        if (k < 0)
+                return k;
+
+        /* Finally, bind mount the children of the source over the new instance. This matters for nspawn,
+         * where a bunch of files are overmounted, in particular the boot id. */
+        (void) bind_mount_submounts(bind_source, entry_path);
+
+        return 1;
+}
+
+/* Mounts a new sysfs instance on the path of the mount entry, with the host's /sys as bind mount
+ * source. Simply passes on the result of mount_private_apivfs(). */
+static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) {
+        const char *where;
+
+        assert(m);
+        assert(p);
+
+        where = mount_entry_path(m);
+
+        return mount_private_apivfs("sysfs", where, "/sys", /* opts = */ NULL, p->runtime_scope);
+}
+
+/* Mounts a new procfs instance on the path of the mount entry, applying the hidepid= and subset=
+ * mount options derived from p->protect_proc and p->proc_subset, as far as they appear to be
+ * supported.
+ *
+ * Returns -ENOMEM if the option string cannot be allocated, otherwise the result of
+ * mount_private_apivfs(). */
+static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
+        _cleanup_free_ char *options = NULL;
+        bool customized;
+
+        assert(m);
+        assert(p);
+
+        customized = !(p->protect_proc == PROTECT_PROC_DEFAULT &&
+                       p->proc_subset == PROC_SUBSET_ALL);
+
+        if (customized) {
+                const char *hidepid_value;
+
+                /* Since kernel 5.8 the hidepid= logic of procfs is truly per-instance (before it
+                 * pretended to be per-instance, but actually was per-namespace), hence make use of it if
+                 * requested. To make sure this only succeeds on kernels where hidepid= is per-instance,
+                 * we exclusively use the textual value for hidepid=, as support for that was added in
+                 * the same commit: if it is supported, it is per-instance too. */
+
+                if (p->protect_proc == PROTECT_PROC_DEFAULT)
+                        hidepid_value = "off";
+                else
+                        hidepid_value = protect_proc_to_string(p->protect_proc);
+
+                /* hidepid= arrived in 5.8, so fsconfig()/fsopen() (available since 5.2) can be used to
+                 * probe for it. This avoids the noisy dmesg message the kernel generates if hidepid= is
+                 * used where it is not supported. The same is true for subset=. Some distributions
+                 * backported fsopen()/fsconfig(), which lets us detect hidepid=/subset= support in even
+                 * more cases. */
+
+                if (mount_option_supported("proc", "hidepid", hidepid_value) != 0) {
+                        options = strjoin("hidepid=", hidepid_value);
+                        if (!options)
+                                return -ENOMEM;
+                }
+
+                if (p->proc_subset == PROC_SUBSET_PID &&
+                    mount_option_supported("proc", "subset", "pid") != 0 &&
+                    !strextend_with_separator(&options, ",", "subset=pid"))
+                        return -ENOMEM;
+        }
+
+        /* Always mount a new instance, so that we get the one matching our user namespace, if we run in
+         * one. I.e. existing mounts are never reused here, we want a new instance that is owned by our
+         * user namespace and has our hidepid= settings applied. Hence everything mounted on /proc/ is
+         * removed first. */
+        return mount_private_apivfs("proc", mount_entry_path(m), "/proc", options, p->runtime_scope);
+}
+
+/* Overmounts the path of the mount entry with a new tmpfs, using the flags and options stored in the
+ * entry, and then fixes up the label of the mount point.
+ *
+ * Returns 1 on success and a negative value on failure. */
+static int mount_tmpfs(const MountEntry *m) {
+        const char *where, *unprefixed;
+        int k;
+
+        assert(m);
+
+        where = mount_entry_path(m);
+        unprefixed = mount_entry_unprefixed_path(m);
+
+        /* Make sure the mount point exists and drop whatever is mounted there or below, before the new
+         * tmpfs goes on top. */
+        (void) mkdir_p_label(where, 0755);
+        (void) umount_recursive(where, 0);
+
+        k = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", where, "tmpfs", m->flags, mount_entry_options(m));
+        if (k < 0)
+                return k;
+
+        k = label_fix_full(AT_FDCWD, where, unprefixed, 0);
+        if (k < 0)
+                return log_debug_errno(k, "Failed to fix label of '%s' as '%s': %m", where, unprefixed);
+
+        return 1;
+}
+
+/* Mounts a tmpfs on the path of the mount entry via mount_tmpfs(), unless that path is a mount point
+ * already, in which case 0 is returned and nothing is done. A path that does not exist yet (ENOENT)
+ * is not treated as an error here. */
+static int mount_run(const MountEntry *m) {
+        int k;
+
+        assert(m);
+
+        k = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (k > 0)
+                return 0; /* /run is a mount point already, turn this into a NOP. */
+        if (k < 0 && k != -ENOENT)
+                return log_debug_errno(k, "Unable to determine whether /run is already mounted: %m");
+
+        return mount_tmpfs(m);
+}
+
+/* Overmounts the path of the mount entry with a new mqueue file system instance, using the flags and
+ * options stored in the entry.
+ *
+ * Returns 1 on success and a negative value on failure. */
+static int mount_mqueuefs(const MountEntry *m) {
+        const char *where;
+        int k;
+
+        assert(m);
+
+        where = mount_entry_path(m);
+
+        /* Make sure the mount point exists and drop whatever is mounted there or below. */
+        (void) mkdir_p_label(where, 0755);
+        (void) umount_recursive(where, 0);
+
+        k = mount_nofollow_verbose(LOG_DEBUG, "mqueue", where, "mqueue", m->flags, mount_entry_options(m));
+        return k < 0 ? k : 1;
+}
+
+/* Dissects the image referenced by the source of the mount entry and mounts it on the path of the
+ * entry. For MOUNT_EXTENSION_IMAGE entries the os-release data of the OS tree in 'root_directory' is
+ * read first and handed to verity_dissect_and_mount() along with the image.
+ *
+ * Returns 1 if the image was mounted, 0 if it does not exist (ENOENT) and the entry is marked to be
+ * ignored in that case, and a negative value on failure. */
+static int mount_image(
+                const MountEntry *m,
+                const char *root_directory,
+                const ImagePolicy *image_policy) {
+
+        _cleanup_free_ char *os_id = NULL, *os_version_id = NULL,
+                            *os_sysext_level = NULL, *os_confext_level = NULL,
+                            *extension_name = NULL;
+        const char *source;
+        int k;
+
+        assert(m);
+
+        source = mount_entry_source(m);
+
+        k = path_extract_filename(source, &extension_name);
+        if (k < 0)
+                return log_debug_errno(k, "Failed to extract extension name from %s: %m", source);
+
+        if (m->mode == MOUNT_EXTENSION_IMAGE) {
+                const char *os_tree = empty_to_root(root_directory);
+
+                k = parse_os_release(
+                                os_tree,
+                                "ID", &os_id,
+                                "VERSION_ID", &os_version_id,
+                                image_class_info[IMAGE_SYSEXT].level_env, &os_sysext_level,
+                                image_class_info[IMAGE_CONFEXT].level_env, &os_confext_level,
+                                NULL);
+                if (k < 0)
+                        return log_debug_errno(k, "Failed to acquire 'os-release' data of OS tree '%s': %m", os_tree);
+                if (isempty(os_id))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", os_tree);
+        }
+
+        k = verity_dissect_and_mount(
+                        /* src_fd= */ -1,
+                        source,
+                        mount_entry_path(m),
+                        m->image_options_const,
+                        image_policy,
+                        os_id,
+                        os_version_id,
+                        os_sysext_level,
+                        os_confext_level,
+                        /* required_sysext_scope= */ NULL,
+                        /* ret_image= */ NULL);
+        if (k >= 0)
+                return 1;
+
+        if (k == -ENOENT && m->ignore)
+                return 0;
+
+        if (k == -ESTALE && os_id)
+                /* FIXME: this should not be logged at LOG_ERR, as it will result in duplicate logging. */
+                return log_error_errno(k,
+                                       "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s%s%s%s%s%s%s",
+                                       source,
+                                       os_id,
+                                       os_version_id ? " VERSION_ID=" : "",
+                                       strempty(os_version_id),
+                                       os_sysext_level ? image_class_info[IMAGE_SYSEXT].level_env_print : "",
+                                       strempty(os_sysext_level),
+                                       os_confext_level ? image_class_info[IMAGE_CONFEXT].level_env_print : "",
+                                       strempty(os_confext_level));
+
+        return log_debug_errno(k, "Failed to mount image %s on %s: %m", source, mount_entry_path(m));
+}
+
+/* Mounts a read-only overlayfs on the path of the mount entry. The lower layers are those entries of
+ * m->overlay_layers that exist as directories, with the mount path itself as bottom-most layer.
+ *
+ * Returns 1 if the overlay was mounted, 0 if it was skipped (none of the layers exists, or the mount
+ * failed with ENOENT and the entry is marked to be ignored in that case), and a negative value on
+ * failure. */
+static int mount_overlay(const MountEntry *m) {
+        _cleanup_free_ char *options = NULL, *layers = NULL;
+        int r;
+
+        assert(m);
+
+        /* Extension hierarchies are optional (e.g.: confext might not have /opt) so check if they actually
+         * exist in an image before attempting to create an overlay with them, otherwise the mount will
+         * fail. We can't check before this, as the images will not be mounted until now. */
+
+        /* Note that lowerdir= parameters are in 'reverse' order, so the top-most directory in the overlay
+         * comes first in the list. */
+        STRV_FOREACH_BACKWARDS(o, m->overlay_layers) {
+                _cleanup_free_ char *escaped = NULL;
+
+                r = is_dir(*o, /* follow= */ false);
+                if (r == -ENOENT)
+                        continue; /* Layer is absent, silently skip it. */
+                if (r < 0) {
+                        log_debug_errno(r,
+                                        "Failed to check whether overlay layer source path '%s' exists, ignoring: %m",
+                                        *o);
+                        continue;
+                }
+                if (r == 0) {
+                        /* The check itself worked, the path just is no directory. This is not an error
+                         * condition with an errno attached, hence don't log it as one. */
+                        log_debug("Overlay layer source path '%s' is not a directory, ignoring.", *o);
+                        continue;
+                }
+
+                /* ',' and ':' are separators in the overlayfs option string, hence escape them. */
+                escaped = shell_escape(*o, ",:");
+                if (!escaped)
+                        return log_oom_debug();
+
+                if (!strextend_with_separator(&layers, ":", escaped))
+                        return log_oom_debug();
+        }
+
+        if (!layers) {
+                /* The options string of the entry may be unset, hence guard the %s below. */
+                log_debug("None of the overlays specified in '%s' exist at the source, skipping.",
+                          strna(mount_entry_options(m)));
+                return 0; /* Only the root is set? Then there's nothing to overlay */
+        }
+
+        options = strjoin("lowerdir=", layers, ":", mount_entry_path(m)); /* The root goes in last */
+        if (!options)
+                return log_oom_debug();
+
+        (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+        r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
+        if (r == -ENOENT && m->ignore)
+                return 0;
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Resolves a single symlink step of the path of the mount entry, relative to 'root_directory'.
+ *
+ * Symlinks are chased one step at a time (CHASE_STEP), because depending on where a symlink points
+ * the order in which things are mounted might have to change. Returns 0 if one step was resolved and
+ * the entry was updated to the new path, > 0 if the end was reached and the path is fully normalized
+ * already, and a negative value on failure (including ELOOP once CHASE_MAX steps have been followed
+ * for this entry). */
+static int follow_symlink(
+                const char *root_directory,
+                MountEntry *m) {
+
+        _cleanup_free_ char *resolved = NULL;
+        int k;
+
+        k = chase(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &resolved, NULL);
+        if (k < 0)
+                return log_debug_errno(k, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
+        if (k != 0)
+                return 1; /* Nothing left to resolve, we reached the end. */
+
+        /* Put a boundary on the number of steps we are willing to take. */
+        if (m->n_followed >= CHASE_MAX)
+                return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+                                       "Symlink loop on '%s'.",
+                                       mount_entry_path(m));
+
+        log_debug("Followed mount entry path symlink %s %s %s.",
+                  mount_entry_path(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), resolved);
+
+        /* The entry takes over ownership of the resolved path. */
+        mount_entry_consume_prefix(m, TAKE_PTR(resolved));
+        m->n_followed++;
+
+        return 0;
+}
+
+/* Establishes the mount described by the mount entry 'm'.
+ *
+ * Most modes are handed off to a dedicated helper (tmpfs, private /dev, sysfs, procfs, images,
+ * overlays, ...). The remaining ones boil down to a bind mount of some source ('what') onto the path
+ * of the entry, which is done at the end of this function; for the modes that set 'make' a missing
+ * destination inode is created and the bind mount retried once.
+ *
+ * 'root_directory' is the root the entry paths are relative to; 'p' carries the namespace parameters
+ * consulted by some of the helpers (runtime scope, image policies).
+ *
+ * Returns 1 when the mount should be post-processed (remounted r/o, etc.), 0 when it was gracefully
+ * skipped, and a negative value on failure. */
+static int apply_one_mount(
+                const char *root_directory,
+                MountEntry *m,
+                const NamespaceParameters *p) {
+
+        _cleanup_free_ char *inaccessible = NULL;
+        bool rbind = true, make = false;
+        const char *what;
+        int r;
+
+        /* Return 1 when the mount should be post-processed (remounted r/o, etc.), 0 otherwise. In most
+         * cases post-processing is the right thing, the typical exception is when the mount is gracefully
+         * skipped. */
+
+        assert(m);
+        assert(p);
+
+        log_debug("Applying namespace mount on %s", mount_entry_path(m));
+
+        switch (m->mode) {
+
+        case MOUNT_INACCESSIBLE: {
+                _cleanup_free_ char *runtime_dir = NULL;
+                struct stat target;
+
+                /* First, get rid of everything that is below if there
+                 * is anything... Then, overmount it with an
+                 * inaccessible path. */
+                (void) umount_recursive(mount_entry_path(m), 0);
+
+                if (lstat(mount_entry_path(m), &target) < 0) {
+                        if (errno == ENOENT && m->ignore)
+                                return 0;
+
+                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
+                                               mount_entry_path(m));
+                }
+
+                /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+                 * means that the root user's --user services will use the host's inaccessible inodes rather
+                 * than root's private ones. This is preferable since it means device nodes that are
+                 * overmounted to make them inaccessible will be overmounted with a device node, rather than
+                 * an AF_UNIX socket inode. */
+                runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+                if (!runtime_dir)
+                        return log_oom_debug();
+
+                r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
+                if (r < 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+                                               "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
+                what = inaccessible;
+                break;
+        }
+
+        case MOUNT_READ_ONLY:
+        case MOUNT_READ_WRITE:
+        case MOUNT_READ_WRITE_IMPLICIT:
+        case MOUNT_EXEC:
+        case MOUNT_NOEXEC:
+                r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
+                if (r == -ENOENT && m->ignore)
+                        return 0;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
+                                               mount_entry_path(m));
+                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
+                            * and MS_NOEXEC bits for the mount point if needed. */
+                        return 1;
+                /* This isn't a mount point yet, let's make it one. */
+                what = mount_entry_path(m);
+                break;
+
+        case MOUNT_EXTENSION_DIRECTORY: {
+                _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
+                                *host_os_release_level = NULL, *extension_name = NULL;
+                _cleanup_strv_free_ char **extension_release = NULL;
+                ImageClass class = IMAGE_SYSEXT;
+
+                r = path_extract_filename(mount_entry_source(m), &extension_name);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
+
+                /* Figure out the class of the extension: try sysext first, and if there is no such
+                 * metadata fall back to confext. */
+                r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                if (r == -ENOENT) {
+                        r = load_extension_release_pairs(mount_entry_source(m), IMAGE_CONFEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                        if (r >= 0)
+                                class = IMAGE_CONFEXT;
+                }
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to acquire 'extension-release' data of extension tree %s: %m", mount_entry_source(m));
+
+                r = parse_os_release(
+                                empty_to_root(root_directory),
+                                "ID", &host_os_release_id,
+                                "VERSION_ID", &host_os_release_version_id,
+                                image_class_info[class].level_env, &host_os_release_level,
+                                NULL);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+                if (isempty(host_os_release_id))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+
+                /* The metadata was loaded once already while probing for the class above. Release that
+                 * copy before the same output pointer is handed to the loader again, so that the earlier
+                 * vector cannot get lost in case the loader simply overwrites the pointer.
+                 *
+                 * NOTE(review): missing metadata already makes the probing above fail, so the m->ignore
+                 * handling below presumably only triggers if the file vanishes in between — confirm
+                 * whether the probing should honour m->ignore as well. */
+                extension_release = strv_free(extension_release);
+
+                r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+                if (r == -ENOENT && m->ignore)
+                        return 0;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse directory %s extension-release metadata: %m", extension_name);
+
+                r = extension_release_validate(
+                                extension_name,
+                                host_os_release_id,
+                                host_os_release_version_id,
+                                host_os_release_level,
+                                /* host_extension_scope */ NULL, /* Leave empty, we need to accept both system and portable */
+                                extension_release,
+                                class);
+                if (r == 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
+
+                /* The extension directory is valid, continue with bind mounting it non-recursively. */
+                _fallthrough_;
+        }
+
+        case MOUNT_BIND:
+                rbind = false;
+
+                _fallthrough_;
+        case MOUNT_BIND_RECURSIVE: {
+                _cleanup_free_ char *chased = NULL;
+
+                /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
+                 * that bind mount source paths are always relative to the host root, hence we pass NULL as
+                 * root directory to chase() here. */
+
+                r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
+                if (r == -ENOENT && m->ignore) {
+                        log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
+                        return 0;
+                }
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
+
+                log_debug("Followed source symlinks %s %s %s.",
+                          mount_entry_source(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), chased);
+
+                /* From now on the entry refers to the resolved source path. */
+                free_and_replace(m->source_malloc, chased);
+
+                what = mount_entry_source(m);
+                make = true;
+                break;
+        }
+
+        case MOUNT_EMPTY_DIR:
+        case MOUNT_TMPFS:
+                return mount_tmpfs(m);
+
+        case MOUNT_PRIVATE_TMP:
+        case MOUNT_PRIVATE_TMP_READ_ONLY:
+                what = mount_entry_source(m);
+                make = true;
+                break;
+
+        case MOUNT_PRIVATE_DEV:
+                return mount_private_dev(m, p->runtime_scope);
+
+        case MOUNT_BIND_DEV:
+                return mount_bind_dev(m);
+
+        case MOUNT_PRIVATE_SYSFS:
+                return mount_private_sysfs(m, p);
+
+        case MOUNT_BIND_SYSFS:
+                return mount_bind_sysfs(m);
+
+        case MOUNT_PROCFS:
+                return mount_procfs(m, p);
+
+        case MOUNT_RUN:
+                return mount_run(m);
+
+        case MOUNT_MQUEUEFS:
+                return mount_mqueuefs(m);
+
+        case MOUNT_IMAGE:
+                return mount_image(m, NULL, p->mount_image_policy);
+
+        case MOUNT_EXTENSION_IMAGE:
+                return mount_image(m, root_directory, p->extension_image_policy);
+
+        case MOUNT_OVERLAY:
+                return mount_overlay(m);
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Every mode that gets here selected a bind mount source. */
+        assert(what);
+
+        r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+        if (r < 0) {
+                bool try_again = false;
+
+                if (r == -ENOENT && make) {
+                        int q;
+
+                        /* Hmm, either the source or the destination are missing. Let's see if we can create
+                           the destination, then try again. */
+
+                        (void) mkdir_parents(mount_entry_path(m), 0755);
+
+                        q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
+                        if (q < 0) {
+                                if (q != -EEXIST) // FIXME: this shouldn't be logged at LOG_WARNING, but be bubbled up, and logged there to avoid duplicate logging
+                                        log_warning_errno(q, "Failed to create destination mount point node '%s', ignoring: %m",
+                                                          mount_entry_path(m));
+                        } else
+                                try_again = true;
+                }
+
+                /* If no retry takes place, 'r' still carries the error of the first attempt. */
+                if (try_again)
+                        r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); // FIXME: this should not be logged here, but be bubbled up, to avoid duplicate logging
+        }
+
+        log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
+        return 1;
+}
+
+/* Post-processing step: turns on MS_RDONLY and/or MS_NOSUID for the mount of an applied entry.
+ *
+ * MS_RDONLY is requested for read-only entries and for MOUNT_PRIVATE_DEV, MS_NOSUID if the entry has
+ * 'nosuid' set. Entries that are not in state MOUNT_APPLIED, or for which neither flag is requested,
+ * are left alone. 'deny_list' is passed on to the recursive remount only.
+ *
+ * Returns 0 on success, or if the remount failed with ENOENT and the entry is marked to be ignored
+ * in that case, and a negative value on failure. */
+static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+        unsigned long bits = 0; /* serves as both the new flags and the flags mask, as we only turn bits on */
+        bool read_only, recurse;
+        int k;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        if (m->state != MOUNT_APPLIED)
+                return 0;
+
+        read_only = mount_entry_read_only(m);
+
+        if (read_only || m->mode == MOUNT_PRIVATE_DEV)
+                bits |= MS_RDONLY;
+        if (m->nosuid)
+                bits |= MS_NOSUID;
+
+        if (bits == 0) /* Nothing to change? */
+                return 0;
+
+        /* The changes are generally applied recursively, except for /dev and for the cases where we know
+         * there is nothing further down. /dev is made read-only, but its submounts such as /dev/shm are
+         * not. Also, only the per-mount read-only flag is set. We cannot set it on the superblock, if we
+         * are inside a user namespace and running Linux <= 4.17. */
+        recurse = read_only && !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+
+        k = recurse ?
+                bind_remount_recursive_with_mountinfo(mount_entry_path(m), bits, bits, deny_list, proc_self_mountinfo) :
+                bind_remount_one_with_mountinfo(mount_entry_path(m), bits, bits, proc_self_mountinfo);
+
+        /* Note that MS_RDONLY is only ever turned on here, never off. Whatever was marked read-only
+         * before stays that way. This improves compatibility with container managers, as we will not
+         * attempt to undo read-only mounts they already applied. */
+
+        if (k >= 0)
+                return 0;
+        if (k == -ENOENT && m->ignore)
+                return 0;
+
+        return log_debug_errno(k, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+                               recurse ? " and its submounts" : "");
+}
+
+/* Post-processing step: sets or clears MS_NOEXEC for the mount of an applied entry.
+ *
+ * The flag is turned on if mount_entry_noexec() holds for the entry, turned off if mount_entry_exec()
+ * holds, and otherwise nothing is changed. Entries not in state MOUNT_APPLIED are left alone.
+ * 'deny_list' is passed on to the recursive remount only.
+ *
+ * Returns 0 on success, or if the remount failed with ENOENT and the entry is marked to be ignored
+ * in that case, and a negative value on failure. */
+static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+        const unsigned long mask = MS_NOEXEC;
+        unsigned long wanted;
+        bool recurse;
+        int k;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        if (m->state != MOUNT_APPLIED)
+                return 0;
+
+        if (mount_entry_noexec(m))
+                wanted = MS_NOEXEC; /* turn the flag on */
+        else if (mount_entry_exec(m))
+                wanted = 0;         /* turn the flag off */
+        else
+                return 0;           /* Nothing to change? */
+
+        /* Only tmpfs based entries are known to have nothing mounted below them. */
+        recurse = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+
+        k = recurse ?
+                bind_remount_recursive_with_mountinfo(mount_entry_path(m), wanted, mask, deny_list, proc_self_mountinfo) :
+                bind_remount_one_with_mountinfo(mount_entry_path(m), wanted, mask, proc_self_mountinfo);
+
+        if (k >= 0)
+                return 0;
+        if (k == -ENOENT && m->ignore)
+                return 0;
+
+        return log_debug_errno(k, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+                               recurse ? " and its submounts" : "");
+}
+
+static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
+        const char *where;
+        bool recursive;
+        int r;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        /* Turns on MS_NOSUID for one mount entry. Only entries in state MOUNT_APPLIED are touched. */
+        if (m->state != MOUNT_APPLIED)
+                return 0;
+
+        where = mount_entry_path(m);
+        recursive = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+        r = recursive
+                ? bind_remount_recursive_with_mountinfo(where, MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo)
+                : bind_remount_one_with_mountinfo(where, MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
+        /* A vanished path is not an error for entries flagged 'ignore'. */
+        if (r >= 0 || (r == -ENOENT && m->ignore))
+                return 0;
+        return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", where, recursive ? " and its submounts" : "");
+}
+
+static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
+        assert(p);
+
+        /* The API VFS mounts were requested explicitly. */
+        if (p->mount_apivfs)
+                return true;
+
+        /* Protecting control groups or kernel tunables needs the API VFS mounts to be around first. */
+        if (p->protect_control_groups || p->protect_kernel_tunables)
+                return true;
+
+        /* The same holds for any non-default protect_proc or proc_subset choice. */
+        return p->protect_proc != PROTECT_PROC_DEFAULT ||
+                p->proc_subset != PROC_SUBSET_ALL;
+}
+
+/* Walk all mount entries and drop any unused mounts. This affects all
+ * mounts:
+ * - that are implicitly protected by a path that has been rendered inaccessible
+ * - whose immediate parent requests the same protection mode as the mount itself
+ * - that are outside of the relevant root directory
+ * - which are duplicates
+ */
+static void drop_unused_mounts(MountList *ml, const char *root_directory) {
+        assert(ml);
+        assert(root_directory);
+
+        assert(ml->mounts || ml->n_mounts == 0);
+        /* Sort with mount_path_compare() first; presumably the drop_*() passes below rely on it - confirm. */
+        typesafe_qsort(ml->mounts, ml->n_mounts, mount_path_compare);
+        /* Filtering passes, run in this fixed order on the sorted list. */
+        drop_duplicates(ml);
+        drop_outside_root(ml, root_directory);
+        drop_inaccessible(ml);
+        drop_nop(ml);
+}
+
+static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
+        /* Creates each (source, destination) pair of 'strv_symlinks' below 'root'; stops at the first failure. */
+
+        STRV_FOREACH_PAIR(target, link, strv_symlinks) {
+                _cleanup_free_ char *target_abs = NULL, *link_abs = NULL;
+                int r;
+
+                /* Both halves of the pair are joined onto 'root'. */
+                target_abs = path_join(root, *target);
+                link_abs = path_join(root, *link);
+                if (!target_abs || !link_abs)
+                        return -ENOMEM;
+
+                /* The directory that is going to hold the link has to exist first. */
+                r = mkdir_parents_label(link_abs, 0755);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to create parent directory for symlink '%s': %m",
+                                               link_abs);
+
+                /* NOTE(review): the 'true' presumably asks for a relative link - confirm in symlink_idempotent(). */
+                r = symlink_idempotent(target_abs, link_abs, true);
+                if (r < 0)
+                        return log_debug_errno(r,
+                                               "Failed to create symlink from '%s' to '%s': %m",
+                                               target_abs, link_abs);
+        }
+
+        return 0;
+}
+
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+        const char *path, *shown;
+
+        assert(m);
+
+        /* Stores in '*error_path' a copy of the entry's path that is suitable for debug logs. If the path
+         * begins with 'root', that leading part is cut off, so that a message about a missing
+         * BindPaths=/var/bar reads
+         *
+         *  foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+         *
+         * rather than naming /run/systemd/unit-root/var/bar.
+         *
+         * Nothing happens if 'error_path' is NULL. We are on an error path already, hence a failing
+         * strdup() is deliberately not reported: '*error_path' then simply ends up NULL. */
+
+        if (!error_path)
+                return;
+
+        path = mount_entry_path(m);
+        if (!path) {
+                *error_path = NULL;
+                return;
+        }
+
+        /* startswith() is a plain string-prefix test; it yields the remainder, or NULL if there is no match. */
+        shown = root ? startswith(path, root) : NULL;
+        if (!shown)
+                shown = path;
+
+        *error_path = strdup(shown);
+}
+
+static int apply_mounts(
+                MountList *ml,
+                const char *root,
+                const NamespaceParameters *p,
+                char **error_path) {
+        /* Carries out the prepared mount list 'ml' below 'root' in four rounds: mount, ro, noexec, nosuid. */
+        _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
+        _cleanup_free_ char **deny_list = NULL;
+        int r;
+
+        assert(ml);
+        assert(root);
+        assert(p);
+        /* Returns 0 for an empty list, 1 on success, negative errno on failure ('*error_path' may be set). */
+        if (ml->n_mounts == 0) /* Shortcut: nothing to do */
+                return 0;
+
+        /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
+         * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
+        proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
+        if (!proc_self_mountinfo) {
+                r = -errno;
+
+                if (error_path)
+                        *error_path = strdup("/proc/self/mountinfo");
+
+                return log_debug_errno(r, "Failed to open /proc/self/mountinfo: %m");
+        }
+        /* First round, establish all mounts we need. The scan restarts from the top whenever
+         * follow_symlink() returns 0, i.e. after an entry has been rewritten. */
+        for (;;) {
+                bool again = false;
+
+                FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+                        /* Entries already marked applied or skipped in an earlier pass are not touched again. */
+                        if (m->state != MOUNT_PENDING)
+                                continue;
+
+                        /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
+                        r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ? root : NULL, m);
+                        if (r < 0) {
+                                mount_entry_path_debug_string(root, m, error_path);
+                                return r;
+                        }
+                        if (r == 0) {
+                                /* We hit a symlinked mount point. The entry got rewritten and might
+                                 * point to a very different place now. Let's normalize the changed
+                                 * list, and start from the beginning. After all to mount the entry
+                                 * at the new location we might need some other mounts first */
+                                again = true;
+                                break;
+                        }
+
+                        /* Returns 1 if the mount should be post-processed, 0 otherwise */
+                        r = apply_one_mount(root, m, p);
+                        if (r < 0) {
+                                mount_entry_path_debug_string(root, m, error_path);
+                                return r;
+                        }
+                        m->state = r == 0 ? MOUNT_SKIPPED : MOUNT_APPLIED;
+                }
+
+                if (!again)
+                        break;
+                /* An entry was rewritten: sort and filter the list again before the next pass. */
+                drop_unused_mounts(ml, root);
+        }
+
+        /* Now that all filesystems have been set up, but before the
+         * read-only switches are flipped, create the exec dirs and other symlinks.
+         * Note that when /var/lib is not empty/tmpfs, these symlinks will already
+         * exist, which means this will be a no-op. */
+        r = create_symlinks_from_tuples(root, p->symlinks);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
+
+        /* Create a deny list for bind_remount_recursive_with_mountinfo(); entries are borrowed from 'ml'. */
+        deny_list = new(char*, ml->n_mounts+1);
+        if (!deny_list)
+                return -ENOMEM;
+        for (size_t j = 0; j < ml->n_mounts; j++)
+                deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
+        deny_list[ml->n_mounts] = NULL;
+
+        /* Second round, flip the ro bits if necessary. */
+        FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+                r = make_read_only(m, deny_list, proc_self_mountinfo);
+                if (r < 0) {
+                        mount_entry_path_debug_string(root, m, error_path);
+                        return r;
+                }
+        }
+        /* Third round, flip the noexec bits with a simplified deny list. NOTE(review): every slot was already
+         * filled above, so this loop stores identical pointers and the list is not actually shortened. */
+        for (size_t j = 0; j < ml->n_mounts; j++)
+                if (IN_SET((ml->mounts+j)->mode, MOUNT_EXEC, MOUNT_NOEXEC))
+                        deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
+        deny_list[ml->n_mounts] = NULL;
+
+        FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+                r = make_noexec(m, deny_list, proc_self_mountinfo);
+                if (r < 0) {
+                        mount_entry_path_debug_string(root, m, error_path);
+                        return r;
+                }
+        }
+
+        /* Fourth round, flip the nosuid bits without a deny list. */
+        if (p->mount_nosuid)
+                FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+                        r = make_nosuid(m, proc_self_mountinfo);
+                        if (r < 0) {
+                                mount_entry_path_debug_string(root, m, error_path);
+                                return r;
+                        }
+                }
+
+        return 1;
+}
+
+static bool root_read_only(
+                char **read_only_paths,
+                ProtectSystem protect_system) {
+
+        /* Tells whether the configured settings leave the root directory read-only. That is the case
+         * when protect_system is PROTECT_SYSTEM_STRICT, or when prefixed_path_strv_contains() finds "/"
+         * in read_only_paths. */
+
+        bool strict = protect_system == PROTECT_SYSTEM_STRICT;
+
+        /* The list is only consulted if strict mode has not settled the answer already. */
+        return strict ||
+                prefixed_path_strv_contains(read_only_paths, "/");
+}
+
+static bool home_read_only(
+                char** read_only_paths,
+                char** inaccessible_paths,
+                char** empty_directories,
+                const BindMount *bind_mounts,
+                size_t n_bind_mounts,
+                const TemporaryFileSystem *temporary_filesystems,
+                size_t n_temporary_filesystems,
+                ProtectHome protect_home) {
+
+        /* Tells whether /home is going to be read-only under the configured settings. This is a
+         * deliberately rough check: each setting is looked at on its own, and cases where several
+         * settings affect the same path at once are not worked out. The first matching condition
+         * decides. */
+
+        if (protect_home != PROTECT_HOME_NO ||
+            prefixed_path_strv_contains(read_only_paths, "/home") ||
+            prefixed_path_strv_contains(inaccessible_paths, "/home") ||
+            prefixed_path_strv_contains(empty_directories, "/home"))
+                return true;
+
+        /* A temporary file system placed exactly on /home counts as well. */
+        FOREACH_ARRAY(tmpfs, temporary_filesystems, n_temporary_filesystems)
+                if (path_equal(tmpfs->path, "/home"))
+                        return true;
+
+        /* If /home is overmounted with some dir from the host it's not writable. */
+        FOREACH_ARRAY(bind, bind_mounts, n_bind_mounts)
+                if (path_equal(bind->destination, "/home"))
+                        return true;
+
+        return false;
+}
+
+int setup_namespace(const NamespaceParameters *p, char **error_path) {
+
+        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+        _cleanup_strv_free_ char **hierarchies = NULL;
+        _cleanup_(mount_list_done) MountList ml = {};
+        bool require_prefix = false;
+        const char *root;
+        DissectImageFlags dissect_image_flags =
+                DISSECT_IMAGE_GENERIC_ROOT |
+                DISSECT_IMAGE_REQUIRE_ROOT |
+                DISSECT_IMAGE_DISCARD_ON_LOOP |
+                DISSECT_IMAGE_RELAX_VAR_CHECK |
+                DISSECT_IMAGE_FSCK |
+                DISSECT_IMAGE_USR_NO_ROOT |
+                DISSECT_IMAGE_GROWFS |
+                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+                DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+        int r;
+
+        assert(p);
+
+        /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
+         * we configure take effect */
+        BLOCK_WITH_UMASK(0000);
+
+        bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
+        unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;
+
+        if (p->root_image) {
+                /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
+                if (root_read_only(p->read_only_paths,
+                                   p->protect_system) &&
+                    home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories,
+                                   p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems,
+                                   p->protect_home) &&
+                    strv_isempty(p->read_write_paths))
+                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
+
+                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
+
+                r = loop_device_make_by_path(
+                                p->root_image,
+                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
+                                /* sector_size= */ UINT32_MAX,
+                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+                                LOCK_SH,
+                                &loop_device);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to create loop device for root image: %m");
+
+                r = dissect_loop_device(
+                                loop_device,
+                                p->verity,
+                                p->root_image_options,
+                                p->root_image_policy,
+                                dissect_image_flags,
+                                &dissected_image);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to dissect image: %m");
+
+                r = dissected_image_load_verity_sig_partition(
+                                dissected_image,
+                                loop_device->fd,
+                                p->verity);
+                if (r < 0)
+                        return r;
+
+                r = dissected_image_decrypt(
+                                dissected_image,
+                                NULL,
+                                p->verity,
+                                dissect_image_flags);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+        }
+
+        if (p->root_directory)
+                root = p->root_directory;
+        else {
+                /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
+                 * when running tests (test-execute), it might not have been created yet so let's make sure
+                 * we create it if it doesn't already exist. */
+                (void) mkdir_p_label("/run/systemd", 0755);
+
+                /* Always create the mount namespace in a temporary directory, instead of operating directly
+                 * in the root. The temporary directory prevents any mounts from being potentially obscured
+                 * by other mounts we already applied.  We use the same mount point for all images, which is
+                 * safe, since they all live in their own namespaces after all, and hence won't see each
+                 * other. (Note: this directory is also created by PID 1 early on, we create it here for
+                 * similar reasons as /run/systemd/ first.) */
+                root = "/run/systemd/mount-rootfs";
+                (void) mkdir_label(root, 0555);
+
+                require_prefix = true;
+        }
+
+        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
+                /* Hierarchy population needs to be done for sysext and confext extension images */
+                r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES");
+                if (r < 0)
+                        return r;
+        }
+
+        r = append_access_mounts(&ml, p->read_write_paths, MOUNT_READ_WRITE, require_prefix);
+        if (r < 0)
+                return r;
+
+        r = append_access_mounts(&ml, p->read_only_paths, MOUNT_READ_ONLY, require_prefix);
+        if (r < 0)
+                return r;
+
+        r = append_access_mounts(&ml, p->inaccessible_paths, MOUNT_INACCESSIBLE, require_prefix);
+        if (r < 0)
+                return r;
+
+        r = append_access_mounts(&ml, p->exec_paths, MOUNT_EXEC, require_prefix);
+        if (r < 0)
+                return r;
+
+        r = append_access_mounts(&ml, p->no_exec_paths, MOUNT_NOEXEC, require_prefix);
+        if (r < 0)
+                return r;
+
+        r = append_empty_dir_mounts(&ml, p->empty_directories);
+        if (r < 0)
+                return r;
+
+        r = append_bind_mounts(&ml, p->bind_mounts, p->n_bind_mounts);
+        if (r < 0)
+                return r;
+
+        r = append_tmpfs_mounts(&ml, p->temporary_filesystems, p->n_temporary_filesystems);
+        if (r < 0)
+                return r;
+
+        if (p->tmp_dir) {
+                bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY);
+
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/tmp",
+                        .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
+                        .source_const = p->tmp_dir,
+                };
+        }
+
+        if (p->var_tmp_dir) {
+                bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY);
+
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/var/tmp",
+                        .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
+                        .source_const = p->var_tmp_dir,
+                };
+        }
+
+        r = append_mount_images(&ml, p->mount_images, p->n_mount_images);
+        if (r < 0)
+                return r;
+
+        r = append_extensions(&ml, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
+        if (r < 0)
+                return r;
+
+        if (p->private_dev) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/dev",
+                        .mode = MOUNT_PRIVATE_DEV,
+                        .flags = DEV_MOUNT_OPTIONS,
+                };
+        }
+
+        /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective
+           mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so
+           let's try the mounts but it's not fatal if they don't succeed. */
+        bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID;
+        if (p->protect_kernel_tunables) {
+                r = append_static_mounts(&ml,
+                                         protect_kernel_tunables_proc_table,
+                                         ELEMENTSOF(protect_kernel_tunables_proc_table),
+                                         ignore_protect_proc);
+                if (r < 0)
+                        return r;
+
+                r = append_static_mounts(&ml,
+                                         protect_kernel_tunables_sys_table,
+                                         ELEMENTSOF(protect_kernel_tunables_sys_table),
+                                         p->ignore_protect_paths);
+                if (r < 0)
+                        return r;
+        }
+
+        if (p->protect_kernel_modules) {
+                r = append_static_mounts(&ml,
+                                         protect_kernel_modules_table,
+                                         ELEMENTSOF(protect_kernel_modules_table),
+                                         p->ignore_protect_paths);
+                if (r < 0)
+                        return r;
+        }
+
+        if (p->protect_kernel_logs) {
+                r = append_static_mounts(&ml,
+                                         protect_kernel_logs_proc_table,
+                                         ELEMENTSOF(protect_kernel_logs_proc_table),
+                                         ignore_protect_proc);
+                if (r < 0)
+                        return r;
+
+                r = append_static_mounts(&ml,
+                                         protect_kernel_logs_dev_table,
+                                         ELEMENTSOF(protect_kernel_logs_dev_table),
+                                         p->ignore_protect_paths);
+                if (r < 0)
+                        return r;
+        }
+
+        if (p->protect_control_groups) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/sys/fs/cgroup",
+                        .mode = MOUNT_READ_ONLY,
+                };
+        }
+
+        r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths);
+        if (r < 0)
+                return r;
+
+        r = append_protect_system(&ml, p->protect_system, false);
+        if (r < 0)
+                return r;
+
+        if (namespace_parameters_mount_apivfs(p)) {
+                r = append_static_mounts(&ml,
+                                         apivfs_table,
+                                         ELEMENTSOF(apivfs_table),
+                                         p->ignore_protect_paths);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Note, if proc is mounted with subset=pid then neither of the two paths will exist, i.e. they are
+         * implicitly protected by the mount option. */
+        if (p->protect_hostname) {
+                r = append_static_mounts(
+                                &ml,
+                                protect_hostname_table,
+                                ELEMENTSOF(protect_hostname_table),
+                                ignore_protect_proc);
+                if (r < 0)
+                        return r;
+        }
+
+        if (p->private_network) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/sys",
+                        .mode = MOUNT_PRIVATE_SYSFS,
+                };
+        }
+
+        if (p->private_ipc) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/dev/mqueue",
+                        .mode = MOUNT_MQUEUEFS,
+                        .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
+                };
+        }
+
+        if (p->creds_path) {
+                /* If our service has a credentials store configured, then bind that one in, but hide
+                 * everything else. */
+
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/run/credentials",
+                        .mode = MOUNT_TMPFS,
+                        .read_only = true,
+                        .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
+                        .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
+                };
+
+                me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = p->creds_path,
+                        .mode = MOUNT_BIND,
+                        .read_only = true,
+                        .source_const = p->creds_path,
+                        .ignore = true,
+                };
+        } else {
+                /* If our service has no credentials store configured, then make the whole credentials tree
+                 * inaccessible wholesale. */
+
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/run/credentials",
+                        .mode = MOUNT_INACCESSIBLE,
+                        .ignore = true,
+                };
+        }
+
+        if (p->log_namespace) {
+                _cleanup_free_ char *q = NULL;
+
+                q = strjoin("/run/systemd/journal.", p->log_namespace);
+                if (!q)
+                        return log_oom_debug();
+
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/run/systemd/journal",
+                        .mode = MOUNT_BIND_RECURSIVE,
+                        .read_only = true,
+                        .source_malloc = TAKE_PTR(q),
+                };
+        }
+
+        /* Will be used to add bind mounts at runtime */
+        if (setup_propagate) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .source_const = p->propagate_dir,
+                        .path_const = p->incoming_dir,
+                        .mode = MOUNT_BIND,
+                        .read_only = true,
+                };
+        }
+
+        if (p->notify_socket) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = p->notify_socket,
+                        .source_const = p->notify_socket,
+                        .mode = MOUNT_BIND,
+                        .read_only = true,
+                };
+        }
+
+        if (p->host_os_release_stage) {
+                MountEntry *me = mount_list_extend(&ml);
+                if (!me)
+                        return log_oom_debug();
+
+                *me = (MountEntry) {
+                        .path_const = "/run/host/.os-release-stage/",
+                        .source_const = p->host_os_release_stage,
+                        .mode = MOUNT_BIND,
+                        .read_only = true,
+                        .ignore = true, /* Live copy, don't hard-fail if it goes missing */
+                };
+        }
+
+        /* Prepend the root directory where that's necessary */
+        r = prefix_where_needed(&ml, root);
+        if (r < 0)
+                return r;
+
+        drop_unused_mounts(&ml, root);
+
+        /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
+
+        if (unshare(CLONE_NEWNS) < 0) {
+                r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
+
+                if (ERRNO_IS_PRIVILEGE(r) ||
+                    ERRNO_IS_NOT_SUPPORTED(r))
+                        /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
+                         * in place that doesn't allow us to create namespaces (or a missing cap), then
+                         * propagate a recognizable error back, which the caller can use to detect this case
+                         * (and only this) and optionally continue without namespacing applied. */
+                        return -ENOANO;
+
+                return r;
+        }
+
+        /* Create the source directory to allow runtime propagation of mounts */
+        if (setup_propagate)
+                (void) mkdir_p(p->propagate_dir, 0600);
+
+        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories))
+                /* ExtensionImages/Directories mountpoint directories will be created while parsing the
+                 * mounts to create, so have the parent ready */
+                (void) mkdir_p(p->extension_dir, 0600);
+
+        /* Remount / as SLAVE so that nothing now mounted in the namespace
+         * shows up in the parent */
+        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
+
+        if (p->root_image) {
+                /* A root image is specified, mount it to the right place */
+                r = dissected_image_mount(
+                                dissected_image,
+                                root,
+                                /* uid_shift= */ UID_INVALID,
+                                /* uid_range= */ UID_INVALID,
+                                /* userns_fd= */ -EBADF,
+                                dissect_image_flags);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to mount root image: %m");
+
+                /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
+                 * if it likes. */
+                r = loop_device_flock(loop_device, LOCK_UN);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to release lock on loopback block device: %m");
+
+                r = dissected_image_relinquish(dissected_image);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to relinquish dissected image: %m");
+
+        } else if (p->root_directory) {
+
+                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
+                r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
+                if (r == 0) {
+                        r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
+                        if (r < 0)
+                                return r;
+                }
+
+        } else {
+                /* Let's mount the main root directory to the root directory to use */
+                r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Try to set up the new root directory before mounting anything else there. */
+        if (p->root_image || p->root_directory)
+                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
+
+        /* Now make the magic happen */
+        r = apply_mounts(&ml, root, p, error_path);
+        if (r < 0)
+                return r;
+
+        /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
+        r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
+        if (r == -EINVAL && p->root_directory) {
+                /* If we are using root_directory and we don't have privileges (ie: user manager in a user
+                 * namespace) and the root_directory is already a mount point in the parent namespace,
+                 * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
+                 * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
+                 * mount point) and try again. */
+                r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
+                if (r < 0)
+                        return r;
+                r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
+
+        /* Remount / as the desired mode. Note that this will not reestablish propagation from our side to
+         * the host, since what's disconnected is disconnected. */
+        if (mount(NULL, "/", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
+
+        /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only supported for
+         * non-shared mounts. This needs to happen after remounting / or it will fail. */
+        if (setup_propagate && mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL) < 0)
+                return log_debug_errno(errno, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir);
+
+        return 0;
+}
+
+/* Frees an array of n BindMount entries, including the strings owned by each entry. */
+void bind_mount_free_many(BindMount *b, size_t n) {
+        assert(b || n == 0);
+
+        for (size_t left = n; left > 0; left--) {
+                free(b[left - 1].source);
+                free(b[left - 1].destination);
+        }
+        free(b);
+}
+
+/* Appends a deep copy of 'item' to the array *b, which currently holds *n entries, and bumps *n.
+ * Returns 0 on success and -ENOMEM if an allocation fails, in which case *b and *n are unchanged. */
+int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
+        _cleanup_free_ char *src = NULL, *dst = NULL;
+        BindMount *grown, *slot;
+
+        assert(b);
+        assert(n);
+        assert(item);
+
+        src = strdup(item->source);
+        if (!src)
+                return -ENOMEM;
+        dst = strdup(item->destination);
+        if (!dst)
+                return -ENOMEM;
+        grown = reallocarray(*b, *n + 1, sizeof(BindMount));
+        if (!grown)
+                return -ENOMEM;
+
+        *b = grown;
+        slot = grown + *n;
+        *n += 1;
+        *slot = (BindMount) {
+                .source = TAKE_PTR(src),
+                .destination = TAKE_PTR(dst),
+                .read_only = item->read_only,
+                .nosuid = item->nosuid,
+                .recursive = item->recursive,
+                .ignore_enoent = item->ignore_enoent,
+        };
+        return 0;
+}
+
+/* Frees *n MountImage entries plus everything they own, leaves *n at 0 and always returns NULL. */
+MountImage* mount_image_free_many(MountImage *m, size_t *n) {
+        assert(n);
+        assert(m || *n == 0);
+
+        while (*n > 0) {
+                MountImage *last = &m[--(*n)];
+                free(last->source);
+                free(last->destination);
+                mount_options_free_all(last->mount_options);
+        }
+        free(m);
+        return NULL;
+}
+
+/* Appends a deep copy of 'item' (strings and mount option list) to the array *m and increments *n. */
+int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
+        _cleanup_free_ char *s = NULL, *d = NULL;
+        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+        MountImage *c;
+
+        assert(m);
+        assert(n);
+        assert(item);
+
+        s = strdup(item->source);
+        if (!s)
+                return -ENOMEM;
+        /* The destination may be unset, in which case the copy keeps it NULL. */
+        if (item->destination) {
+                d = strdup(item->destination);
+                if (!d)
+                        return -ENOMEM;
+        }
+        /* Clone the mount option list entry by entry; every early return happens before *m or *n change. */
+        LIST_FOREACH(mount_options, i, item->mount_options) {
+                _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
+
+                o = new(MountOptions, 1);
+                if (!o)
+                        return -ENOMEM;
+
+                *o = (MountOptions) {
+                        .partition_designator = i->partition_designator,
+                        .options = strdup(i->options),
+                };
+                if (!o->options)
+                        return -ENOMEM;
+
+                LIST_APPEND(mount_options, options, TAKE_PTR(o));
+        }
+
+        c = reallocarray(*m, *n + 1, sizeof(MountImage));
+        if (!c)
+                return -ENOMEM;
+
+        *m = c;
+
+        c[(*n) ++] = (MountImage) {
+                .source = TAKE_PTR(s),
+                .destination = TAKE_PTR(d),
+                .mount_options = TAKE_PTR(options),
+                .ignore_enoent = item->ignore_enoent,
+                .type = item->type,
+        };
+        return 0;
+}
+
+/* Frees an array of n TemporaryFileSystem entries along with the path and options strings they own. */
+void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
+        assert(t || n == 0);
+
+        for (size_t left = n; left > 0; left--) {
+                free(t[left - 1].path);
+                free(t[left - 1].options);
+        }
+        free(t);
+}
+
+/* Appends an entry holding copies of 'path' and 'options' to the array *t and increments *n. Options for
+ * which isempty() is true are stored as NULL. Returns 0 on success, -ENOMEM on allocation failure. */
+int temporary_filesystem_add(
+                TemporaryFileSystem **t,
+                size_t *n,
+                const char *path,
+                const char *options) {
+
+        _cleanup_free_ char *path_copy = NULL, *options_copy = NULL;
+        TemporaryFileSystem *grown;
+
+        assert(t);
+        assert(n);
+        assert(path);
+
+        path_copy = strdup(path);
+        if (!path_copy)
+                return -ENOMEM;
+        if (!isempty(options)) {
+                options_copy = strdup(options);
+                if (!options_copy)
+                        return -ENOMEM;
+        }
+        grown = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
+        if (!grown)
+                return -ENOMEM;
+
+        *t = grown;
+        grown[*n] = (TemporaryFileSystem) {
+                .path = TAKE_PTR(path_copy),
+                .options = TAKE_PTR(options_copy),
+        };
+        (*n)++;
+
+        return 0;
+}
+
+/* Makes sure 'prefix' exists: if missing it is created under a temporary name, chmod()ed to 01777 and renamed into place. */
+static int make_tmp_prefix(const char *prefix) {
+        _cleanup_free_ char *t = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        /* Don't do anything unless we know the dir is actually missing */
+        r = access(prefix, F_OK);
+        if (r >= 0)
+                return 0;
+        if (errno != ENOENT)
+                return -errno;
+
+        WITH_UMASK(000)
+                r = mkdir_parents(prefix, 0755);
+        if (r < 0)
+                return r;
+
+        r = tempfn_random(prefix, NULL, &t);
+        if (r < 0)
+                return r;
+
+        /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
+         * the suid bit, below. */
+        fd = open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0777);
+        if (fd < 0)
+                return fd;
+
+        r = RET_NERRNO(fchmod(fd, 01777));
+        if (r < 0) {
+                (void) rmdir(t);
+                return r;
+        }
+
+        r = RET_NERRNO(rename(t, prefix));
+        if (r < 0) {
+                (void) rmdir(t);
+                return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
+        }
+
+        return 0;
+}
+
+/* Creates "<prefix>/systemd-private-<boot id>-<id>-XXXXXX" with a sticky, world-writable "tmp" subdirectory and
+ * returns them in *path and (if requested) *tmp_path; on EROFS/disk-space errors *path is RUN_SYSTEMD_EMPTY. */
+static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
+        _cleanup_free_ char *x = NULL;
+        _cleanup_free_ char *y = NULL;
+        sd_id128_t boot_id;
+        bool rw = true;
+        int r;
+
+        assert(id);
+        assert(prefix);
+        assert(path);
+
+        /* We include the boot id in the directory so that after a
+         * reboot we can easily identify obsolete directories. */
+        r = sd_id128_get_boot(&boot_id);
+        if (r < 0)
+                return r;
+
+        x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
+        if (!x)
+                return -ENOMEM;
+
+        r = make_tmp_prefix(prefix);
+        if (r < 0)
+                return r;
+
+        WITH_UMASK(0077)
+                if (!mkdtemp(x)) {
+                        if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
+                                rw = false;
+                        else
+                                return -errno;
+                }
+        /* NOTE(review): the error paths below only free 'x'; a directory created by mkdtemp() stays on disk. */
+        if (rw) {
+                y = strjoin(x, "/tmp");
+                if (!y)
+                        return -ENOMEM;
+
+                WITH_UMASK(0000)
+                        if (mkdir(y, 0777 | S_ISVTX) < 0)
+                                return -errno;
+                r = label_fix_full(AT_FDCWD, y, prefix, 0);
+                if (r < 0)
+                        return r;
+
+                if (tmp_path)
+                        *tmp_path = TAKE_PTR(y);
+        } else {
+                /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
+                 * read-only. This way the service will get the EROFS result as if it was writing to the real
+                 * file system. */
+                WITH_UMASK(0000)
+                        r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
+                if (r < 0)
+                        return r;
+
+                r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
+                if (r < 0)
+                        return r;
+        }
+
+        *path = TAKE_PTR(x);
+        return 0;
+}
+
+/* Creates the private directories below /tmp and /var/tmp for 'id'; returns their paths in *tmp_dir and *var_tmp_dir. */
+int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
+        _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
+        _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
+        char *b;
+        int r;
+
+        assert(id);
+        assert(tmp_dir);
+        assert(var_tmp_dir);
+
+        r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
+        if (r < 0)
+                return r;
+
+        r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
+        if (r < 0)
+                return r;
+
+        a_tmp = mfree(a_tmp); /* success: only free the string, so that the directory is not rmdir()ed */
+        *tmp_dir = TAKE_PTR(a);
+        *var_tmp_dir = TAKE_PTR(b);
+        return 0;
+}
+
+/* Joins the namespace of type 'nsflag' whose fd is kept in the socket pair or, if nothing is stored
+ * there yet, unshares a new namespace and stores an fd referring to it. Returns 0 if an existing
+ * namespace was joined, 1 if a new one was created, and a negative errno-style value on failure. */
+int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
+        _cleanup_close_ int ns = -EBADF;
+        const char *ns_name, *ns_path;
+        int r;
+
+        assert(ns_storage_socket);
+        assert(ns_storage_socket[0] >= 0);
+        assert(ns_storage_socket[1] >= 0);
+
+        ns_name = ASSERT_PTR(namespace_single_flag_to_string(nsflag));
+
+        /* The socket pair doubles as a one-slot store for the namespace fd: whoever gets here first
+         * creates the namespace and queues its fd, everybody else just peeks at that fd and joins. A file
+         * lock on the socket serializes the two cases. */
+        r = posix_lock(ns_storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return r;
+
+        CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
+
+        ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+        if (ns < 0 && ns != -EAGAIN)
+                return ns;
+
+        if (ns >= 0) {
+                /* An fd is already queued, hence simply enter that namespace. */
+                r = RET_NERRNO(setns(ns, nsflag));
+                if (r < 0)
+                        return r;
+                return 0;
+        }
+
+        /* -EAGAIN: the store is still empty, so it is up to us to allocate the namespace. */
+        if (unshare(nsflag) < 0)
+                return -errno;
+
+        /* Loopback setup in a new network namespace is best effort, its result is ignored. */
+        if (nsflag == CLONE_NEWNET)
+                (void) loopback_setup();
+
+        /* Open a reference to the namespace we are in now, so that it can be stored for others. */
+        ns_path = strjoina("/proc/self/ns/", ns_name);
+        ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (ns < 0)
+                return -errno;
+
+        r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Seeds the storage socket pair with a namespace fd opened from 'path', unless an fd is stored already.
+ * Returns 0 if the store was already populated, 1 if our fd was stored, negative errno-style on error. */
+int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
+        _cleanup_close_ int ns = -EBADF;
+        int r;
+
+        assert(ns_storage_socket);
+        assert(ns_storage_socket[0] >= 0);
+        assert(ns_storage_socket[1] >= 0);
+        assert(path);
+
+        /* Meant to be called ahead of setup_shareable_ns(), which allocates a new anonymous namespace if
+         * it finds the store empty. */
+        r = posix_lock(ns_storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return r;
+
+        CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
+
+        ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+        if (ns != -EAGAIN) {
+                /* Either an fd is stored already (nothing to do), or peeking failed for real. */
+                if (ns < 0)
+                        return ns;
+                return 0;
+        }
+
+        /* The store is empty: open the namespace file and check that it is of the expected type. */
+        ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+        if (ns < 0)
+                return -errno;
+
+        r = fd_is_ns(ns, nsflag);
+        if (r == 0)
+                return -EINVAL;
+        if (r < 0 && r != -EUCLEAN) /* -EUCLEAN: the type could not be determined, let it pass */
+                return r;
+
+        r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
+        if (r < 0)
+                return r;
+        return 1;
+}
+
+/* Tells whether a /proc/self/ns/ entry for the given namespace type is accessible. */
+bool ns_type_supported(NamespaceType type) {
+        const char *name, *proc_path;
+
+        name = namespace_type_to_string(type);
+        if (!name) /* A type we cannot even name is treated as unsupported */
+                return false;
+        proc_path = strjoina("/proc/self/ns/", name);
+        return access(proc_path, F_OK) == 0;
+}
+
+static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
+        [PROTECT_HOME_NO]        = "no",
+        [PROTECT_HOME_YES]       = "yes",
+        [PROTECT_HOME_READ_ONLY] = "read-only",
+        [PROTECT_HOME_TMPFS]     = "tmpfs",
+};
+/* NOTE(review): the _WITH_BOOLEAN variant presumably also parses boolean strings, "true" giving the third argument — confirm. */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
+
+static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
+        [PROTECT_SYSTEM_NO]     = "no",
+        [PROTECT_SYSTEM_YES]    = "yes",
+        [PROTECT_SYSTEM_FULL]   = "full",
+        [PROTECT_SYSTEM_STRICT] = "strict",
+};
+/* NOTE(review): as for protect_home, boolean "true" presumably yields PROTECT_SYSTEM_YES — confirm. */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
+
+static const char* const namespace_type_table[] = { /* names as they are appended to "/proc/self/ns/" */
+        [NAMESPACE_MOUNT]  = "mnt",
+        [NAMESPACE_CGROUP] = "cgroup",
+        [NAMESPACE_UTS]    = "uts",
+        [NAMESPACE_IPC]    = "ipc",
+        [NAMESPACE_USER]   = "user",
+        [NAMESPACE_PID]    = "pid",
+        [NAMESPACE_NET]    = "net",
+        [NAMESPACE_TIME]   = "time",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
+
+static const char* const protect_proc_table[_PROTECT_PROC_MAX] = { /* names match the hidepid= values noted in namespace.h */
+        [PROTECT_PROC_DEFAULT]    = "default",
+        [PROTECT_PROC_NOACCESS]   = "noaccess",
+        [PROTECT_PROC_INVISIBLE]  = "invisible",
+        [PROTECT_PROC_PTRACEABLE] = "ptraceable",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
+
+static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { /* "pid" corresponds to subset=pid, see namespace.h */
+        [PROC_SUBSET_ALL] = "all",
+        [PROC_SUBSET_PID] = "pid",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
diff --git a/src/core/namespace.h b/src/core/namespace.h
new file mode 100644
index 0000000..921716b
--- /dev/null
+++ b/src/core/namespace.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2016 Djalal Harouni
+***/
+
+typedef struct NamespaceParameters NamespaceParameters;
+typedef struct BindMount BindMount;
+typedef struct TemporaryFileSystem TemporaryFileSystem;
+typedef struct MountImage MountImage;
+
+#include <stdbool.h>
+
+#include "dissect-image.h"
+#include "fs-util.h"
+#include "macro.h"
+#include "namespace-util.h"
+#include "runtime-scope.h"
+#include "string-util.h"
+
+typedef enum ProtectHome { /* string names are defined in protect_home_table[] in namespace.c */
+        PROTECT_HOME_NO,        /* "no" */
+        PROTECT_HOME_YES,       /* "yes" */
+        PROTECT_HOME_READ_ONLY, /* "read-only" */
+        PROTECT_HOME_TMPFS,     /* "tmpfs" */
+        _PROTECT_HOME_MAX,      /* number of values above; sizes protect_home_table[] */
+        _PROTECT_HOME_INVALID = -EINVAL,
+} ProtectHome;
+
+typedef enum ProtectSystem { /* string names are defined in protect_system_table[] in namespace.c */
+        PROTECT_SYSTEM_NO,     /* "no" */
+        PROTECT_SYSTEM_YES,    /* "yes" */
+        PROTECT_SYSTEM_FULL,   /* "full" */
+        PROTECT_SYSTEM_STRICT, /* "strict" */
+        _PROTECT_SYSTEM_MAX,   /* number of values above; sizes protect_system_table[] */
+        _PROTECT_SYSTEM_INVALID = -EINVAL,
+} ProtectSystem;
+
+typedef enum ProtectProc { /* string names are defined in protect_proc_table[] in namespace.c */
+        PROTECT_PROC_DEFAULT,    /* "default" */
+        PROTECT_PROC_NOACCESS,   /* hidepid=noaccess */
+        PROTECT_PROC_INVISIBLE,  /* hidepid=invisible */
+        PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
+        _PROTECT_PROC_MAX,       /* number of values above; sizes protect_proc_table[] */
+        _PROTECT_PROC_INVALID = -EINVAL,
+} ProtectProc;
+
+typedef enum ProcSubset { /* string names are defined in proc_subset_table[] in namespace.c */
+        PROC_SUBSET_ALL, /* "all" */
+        PROC_SUBSET_PID, /* subset=pid */
+        _PROC_SUBSET_MAX, /* number of values above; sizes proc_subset_table[] */
+        _PROC_SUBSET_INVALID = -EINVAL,
+} ProcSubset;
+
+struct BindMount { /* element type of the arrays managed by bind_mount_add()/bind_mount_free_many() */
+        char *source;      /* owned string, freed by bind_mount_free_many() */
+        char *destination; /* owned string, freed by bind_mount_free_many() */
+        bool read_only;
+        bool nosuid;
+        bool recursive;
+        bool ignore_enoent;
+};
+
+struct TemporaryFileSystem {
+        char *path;    /* owned copy of the path passed to temporary_filesystem_add() */
+        char *options; /* owned copy of the options passed there, or NULL if they were empty */
+};
+
+typedef enum MountImageType {
+        MOUNT_IMAGE_DISCRETE,
+        MOUNT_IMAGE_EXTENSION, /* MountImage.destination is unused for this type */
+        _MOUNT_IMAGE_TYPE_MAX,
+        _MOUNT_IMAGE_TYPE_INVALID = -EINVAL, /* negative errno-style sentinel, like the other enums here */
+} MountImageType;
+
+struct MountImage { /* element type of the arrays managed by mount_image_add()/mount_image_free_many() */
+        char *source; /* owned string */
+        char *destination; /* Unused if MountImageType == MOUNT_IMAGE_EXTENSION */
+        LIST_HEAD(MountOptions, mount_options); /* owned list, released with mount_options_free_all() */
+        bool ignore_enoent;
+        MountImageType type;
+};
+
+struct NamespaceParameters { /* input of setup_namespace() */
+        RuntimeScope runtime_scope;
+
+        const char *root_directory; /* used as the new root if set and root_image is not */
+        const char *root_image;     /* if set, this image is mounted as the new root; checked before root_directory */
+        const MountOptions *root_image_options;
+        const ImagePolicy *root_image_policy;
+
+        char **read_write_paths;
+        char **read_only_paths;
+        char **inaccessible_paths;
+
+        char **exec_paths;
+        char **no_exec_paths;
+
+        char **empty_directories;
+        char **symlinks;
+
+        const BindMount *bind_mounts;
+        size_t n_bind_mounts; /* number of entries in bind_mounts */
+
+        const TemporaryFileSystem *temporary_filesystems;
+        size_t n_temporary_filesystems; /* number of entries in temporary_filesystems */
+
+        const MountImage *mount_images;
+        size_t n_mount_images; /* number of entries in mount_images */
+        const ImagePolicy *mount_image_policy;
+
+        const char *tmp_dir;
+        const char *var_tmp_dir;
+
+        const char *creds_path;
+        const char *log_namespace;
+
+        unsigned long mount_propagation_flag; /* NOTE(review): presumably the MS_* propagation mode applied to "/" at the end — confirm */
+        VeritySettings *verity;
+
+        const MountImage *extension_images;
+        size_t n_extension_images; /* number of entries in extension_images */
+        const ImagePolicy *extension_image_policy;
+        char **extension_directories;
+
+        const char *propagate_dir;
+        const char *incoming_dir; /* remounted MS_SLAVE at the end of setup_namespace() when propagation is set up */
+
+        const char *extension_dir;
+        const char *notify_socket;
+        const char *host_os_release_stage;
+
+        bool ignore_protect_paths;
+
+        bool protect_control_groups;
+        bool protect_kernel_tunables;
+        bool protect_kernel_modules;
+        bool protect_kernel_logs;
+        bool protect_hostname;
+
+        bool private_dev;
+        bool private_network;
+        bool private_ipc;
+
+        bool mount_apivfs;
+        bool mount_nosuid;
+
+        ProtectHome protect_home;
+        ProtectSystem protect_system;
+        ProtectProc protect_proc;
+        ProcSubset proc_subset;
+};
+
+int setup_namespace(const NamespaceParameters *p, char **error_path);
+
+#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
+
+static inline char* namespace_cleanup_tmpdir(char *p) { /* rmdir()s the directory 'p' and frees the string */
+        PROTECT_ERRNO;
+        if (!streq_ptr(p, RUN_SYSTEMD_EMPTY)) /* the shared fallback directory is never removed */
+                (void) rmdir(p);
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, namespace_cleanup_tmpdir);
+
+/* Creates private directories below /tmp and /var/tmp (their names embed 'id') and returns their paths. */
+int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir);
+
+/* Both operate on a socket pair that stores the fd of a shareable namespace; 'nsflag' is the CLONE_NEW*
+ * flag of that namespace. Both return 0 if an fd was already stored and 1 if they stored one themselves. */
+int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag);
+int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag);
+
+const char* protect_home_to_string(ProtectHome p) _const_;
+ProtectHome protect_home_from_string(const char *s) _pure_;
+
+const char* protect_system_to_string(ProtectSystem p) _const_;
+ProtectSystem protect_system_from_string(const char *s) _pure_;
+
+const char* protect_proc_to_string(ProtectProc i) _const_;
+ProtectProc protect_proc_from_string(const char *s) _pure_;
+
+const char* proc_subset_to_string(ProcSubset i) _const_;
+ProcSubset proc_subset_from_string(const char *s) _pure_;
+
+void bind_mount_free_many(BindMount *b, size_t n);
+int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
+
+void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n);
+int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n,
+                             const char *path, const char *options);
+
+MountImage* mount_image_free_many(MountImage *m, size_t *n);
+int mount_image_add(MountImage **m, size_t *n, const MountImage *item);
+
+const char* namespace_type_to_string(NamespaceType t) _const_;
+NamespaceType namespace_type_from_string(const char *s) _pure_;
+
+bool ns_type_supported(NamespaceType type);
diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf
new file mode 100644
index 0000000..52034e0
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.conf
@@ -0,0 +1,452 @@
+ 
+
+
+
+
+
+
+        
+                
+
+                
+                
+                
+
+                
+                
+        
+
+        
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+        
+
+
diff --git a/src/core/org.freedesktop.systemd1.policy.in b/src/core/org.freedesktop.systemd1.policy.in
new file mode 100644
index 0000000..0083e0b
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.policy.in
@@ -0,0 +1,83 @@
+ 
+
+
+
+
+
+
+        The systemd Project
+        https://systemd.io
+
+        
+                Send passphrase back to system
+                Authentication is required to send the entered passphrase back to the system.
+                
+                        no
+                        no
+                        auth_admin_keep
+                
+                {{LIBEXECDIR}}/systemd-reply-password
+        
+
+        
+                Manage system services or other units
+                Authentication is required to manage system services or other units.
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+        
+                Manage system service or unit files
+                Authentication is required to manage system service or unit files.
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+                org.freedesktop.systemd1.reload-daemon org.freedesktop.systemd1.manage-units
+        
+
+        
+                Set or unset system and service manager environment variables
+                Authentication is required to set or unset system and service manager environment variables.
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+        
+                Reload the systemd state
+                Authentication is required to reload the systemd state.
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+        
+                Dump the systemd state without rate limits
+                Authentication is required to dump the systemd state without rate limits.
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+
diff --git a/src/core/org.freedesktop.systemd1.service b/src/core/org.freedesktop.systemd1.service
new file mode 100644
index 0000000..082125f
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.service
@@ -0,0 +1,13 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.systemd1
+Exec=/bin/false
+User=root
diff --git a/src/core/path.c b/src/core/path.c
new file mode 100644
index 0000000..ef00c20
--- /dev/null
+++ b/src/core/path.c
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <unistd.h>
+
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-path.h"
+#include "dbus-unit.h"
+#include "escape.h"
+#include "event-util.h"
+#include "fd-util.h"
+#include "glob-util.h"
+#include "inotify-util.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path.h"
+#include "path-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_PATH_STATE_MAX] = { /* maps PATH_* states to UnitActiveState */
+        [PATH_DEAD]    = UNIT_INACTIVE,
+        [PATH_WAITING] = UNIT_ACTIVE, /* waiting and running both count as active */
+        [PATH_RUNNING] = UNIT_ACTIVE,
+        [PATH_FAILED]  = UNIT_FAILED,
+};
+
+static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+
+/* Installs inotify watches for s->path and registers 'handler' for the inotify fd in the unit's event loop.
+ * Returns 0 on success; on failure a negative errno-style value is returned and the watch is removed again. */
+int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) {
+        static const int flags_table[_PATH_TYPE_MAX] = {
+                [PATH_EXISTS]              = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB,
+                [PATH_EXISTS_GLOB]         = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB,
+                [PATH_CHANGED]             = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO,
+                [PATH_MODIFIED]            = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO|IN_MODIFY,
+                [PATH_DIRECTORY_NOT_EMPTY] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO,
+        };
+
+        bool exists = false;
+        char *slash, *oldslash = NULL;
+        int r;
+
+        assert(s);
+        assert(s->unit);
+        assert(handler);
+
+        path_spec_unwatch(s);
+
+        s->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+        if (s->inotify_fd < 0) {
+                r = log_error_errno(errno, "Failed to allocate inotify fd: %m");
+                goto fail;
+        }
+
+        r = sd_event_add_io(s->unit->manager->event, &s->event_source, s->inotify_fd, EPOLLIN, handler, s);
+        if (r < 0) {
+                log_error_errno(r, "Failed to add inotify fd to event loop: %m");
+                goto fail;
+        }
+
+        (void) sd_event_source_set_description(s->event_source, "path");
+
+        /* This function assumes the path was passed through path_simplify()! */
+        assert(!strstr(s->path, "//"));
+
+        for (slash = strchr(s->path, '/'); ; slash = strchr(slash+1, '/')) {
+                bool incomplete = false;
+                int flags, wd = -1;
+                char tmp, *cut;
+
+                if (slash) {
+                        cut = slash + (slash == s->path); /* for the leading slash keep "/" itself */
+                        tmp = *cut;
+                        *cut = '\0';
+                        /* Prefix of the full path: watch for it going away and for new entries in it */
+                        flags = IN_MOVE_SELF | IN_DELETE_SELF | IN_ATTRIB | IN_CREATE | IN_MOVED_TO;
+                } else {
+                        cut = NULL;
+                        flags = flags_table[s->type];
+                }
+
+                /* If this is a symlink watch both the symlink inode and where it points to. If the inode is
+                 * not a symlink both calls will install the same watch, which is redundant and doesn't
+                 * hurt. */
+                for (int follow_symlink = 0; follow_symlink < 2; follow_symlink ++) {
+                        uint32_t f = flags;
+                        SET_FLAG(f, IN_DONT_FOLLOW, !follow_symlink);
+
+                        wd = inotify_add_watch(s->inotify_fd, s->path, f);
+                        if (wd < 0) {
+                                if (IN_SET(errno, EACCES, ENOENT)) {
+                                        incomplete = true; /* This is an expected error, let's accept this
+                                                            * quietly: we have an incomplete watch for
+                                                            * now. */
+                                        break;
+                                }
+                                /* This second call to inotify_add_watch() should fail like the previous one
+                                 * and is done for logging the error in a comprehensive way. */
+                                wd = inotify_add_watch_and_warn(s->inotify_fd, s->path, f);
+                                if (wd < 0) {
+                                        if (cut)
+                                                *cut = tmp;
+
+                                        r = wd;
+                                        goto fail;
+                                }
+
+                                /* Hmm, we succeeded in adding the watch this time... let's continue. */
+                        }
+                }
+
+                if (incomplete) {
+                        if (cut)
+                                *cut = tmp;
+
+                        break;
+                }
+
+                exists = true;
+
+                /* Path exists, we don't need to watch parent too closely. */
+                if (oldslash) {
+                        char *cut2 = oldslash + (oldslash == s->path);
+                        char tmp2 = *cut2;
+                        *cut2 = '\0';
+
+                        (void) inotify_add_watch(s->inotify_fd, s->path, IN_MOVE_SELF);
+                        /* Error is ignored, the worst can happen is we get spurious events. */
+
+                        *cut2 = tmp2;
+                }
+
+                if (cut)
+                        *cut = tmp; /* restore the character that was overwritten to truncate the path */
+
+                if (slash)
+                        oldslash = slash;
+                else {
+                        /* whole path has been iterated over */
+                        s->primary_wd = wd;
+                        break;
+                }
+        }
+
+        if (!exists) {
+                r = log_error_errno(errno, "Failed to add watch on any of the components of %s: %m", s->path);
+                /* either EACCES or ENOENT */
+                goto fail;
+        }
+
+        return 0;
+
+fail:
+        path_spec_unwatch(s);
+        return r;
+}
+
+/* Stops watching a spec: releases its event source and closes its inotify fd. */
+void path_spec_unwatch(PathSpec *s) {
+        assert(s);
+        s->event_source = sd_event_source_disable_unref(s->event_source);
+        s->inotify_fd = safe_close(s->inotify_fd);
+}
+
+/* Reads the pending inotify data of a spec. Returns 1 if an event hit the primary watch of a
+ * PATH_CHANGED/PATH_MODIFIED spec, 0 if nothing relevant happened, or the logged error otherwise. */
+int path_spec_fd_event(PathSpec *s, uint32_t revents) {
+        union inotify_event_buffer events;
+        ssize_t n_read;
+
+        assert(s);
+
+        if (revents != EPOLLIN)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got invalid poll event on inotify.");
+
+        n_read = read(s->inotify_fd, &events, sizeof(events));
+        if (n_read < 0) {
+                if (!ERRNO_IS_TRANSIENT(errno))
+                        return log_error_errno(errno, "Failed to read inotify event: %m");
+                return 0; /* transient failure: report "no event" */
+        }
+
+        if (!IN_SET(s->type, PATH_CHANGED, PATH_MODIFIED))
+                return 0; /* other types do not care which watch fired */
+        FOREACH_INOTIFY_EVENT_WARN(e, events, n_read)
+                if (e->wd == s->primary_wd)
+                        return 1;
+        return 0;
+}
+
+/* Checks whether one path spec currently meets its trigger condition. When it does, the path that
+ * matched is passed out through ret_trigger_path (NULL if duplicating it failed); otherwise
+ * ret_trigger_path is not written. */
+static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_notify, char **ret_trigger_path) {
+        _cleanup_free_ char *matched = NULL;
+        bool triggered = false;
+
+        assert(s);
+        assert(ret_trigger_path);
+
+        switch (s->type) {
+        case PATH_EXISTS:
+                triggered = access(s->path, F_OK) >= 0;
+                break;
+        case PATH_EXISTS_GLOB:
+                triggered = glob_first(s->path, &matched) > 0;
+                break;
+        case PATH_DIRECTORY_NOT_EMPTY: {
+                int empty = dir_is_empty(s->path, /* ignore_hidden_or_backup= */ true);
+                /* -ENOENT, -ENOTDIR or a positive result from dir_is_empty() means: do not trigger. */
+                triggered = !IN_SET(empty, -ENOENT, -ENOTDIR) && empty <= 0;
+                break;
+        }
+        case PATH_CHANGED:
+        case PATH_MODIFIED: {
+                bool exists_now = access(s->path, F_OK) >= 0;
+                /* Only a flip in existence counts, and never on the initial or trigger-notify check. */
+                triggered = !initial && !from_trigger_notify && exists_now != s->previous_exists;
+                s->previous_exists = exists_now;
+                break;
+        }
+        default:
+                ;
+        }
+
+        if (!triggered)
+                return false;
+
+        if (!matched) {
+                matched = strdup(s->path);
+                if (!matched)
+                        (void) log_oom_debug();
+        }
+
+        *ret_trigger_path = TAKE_PTR(matched);
+        return true;
+}
+
+/* Creates the spec's path via mkdir_p_label() unless the spec is of type PATH_EXISTS or
+ * PATH_EXISTS_GLOB. A failure is only logged as a warning. */
+static void path_spec_mkdir(PathSpec *s, mode_t mode) {
+        if (s->type == PATH_EXISTS || s->type == PATH_EXISTS_GLOB)
+                return;
+
+        int err = mkdir_p_label(s->path, mode);
+        if (err < 0)
+                log_warning_errno(err, "mkdir(%s) failed: %m", s->path);
+}
+
+/* Writes one "<prefix><type name>: <path>" line describing the spec to f. */
+static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) {
+        const char *type_name = path_type_to_string(s->type);
+        assert_se(type_name);
+        fprintf(f, "%s%s: %s\n", prefix, type_name, s->path);
+}
+
+/* Frees the path string of a spec. The spec must no longer hold an inotify fd (see the assert). */
+void path_spec_done(PathSpec *s) {
+        assert(s);
+        assert(s->inotify_fd == -EBADF);
+        free(s->path);
+}
+
+/* Unit vtable .init hook: sets the defaults of a path unit that is still in UNIT_STUB state. */
+static void path_init(Unit *u) {
+        Path *p = PATH(u);
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        p->directory_mode = 0755;
+        p->trigger_limit = RATELIMIT_OFF; /* path_add_extras() later fills in defaults for unset fields */
+}
+
+/* Removes every PathSpec from the unit's list; each one is unwatched, finalized and freed in
+ * turn until the list is empty. */
+void path_free_specs(Path *p) {
+        assert(p);
+
+        for (PathSpec *cur; (cur = LIST_POP(spec, p->specs)); ) {
+                path_spec_unwatch(cur);
+                path_spec_done(cur);
+                free(cur);
+        }
+}
+
+/* Unit vtable .done hook: releases the trigger-notify event source and frees all path specs. */
+static void path_done(Unit *u) {
+        Path *p = PATH(u);
+
+        assert(p);
+        p->trigger_notify_event_source = sd_event_source_disable_unref(p->trigger_notify_event_source);
+        path_free_specs(p);
+}
+
+static int path_add_mount_dependencies(Path *p) {
+        int r;
+
+        assert(p);
+
+        LIST_FOREACH(spec, s, p->specs) {
+                r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Validates a loaded path unit: it must carry at least one path spec, else ENOEXEC is logged. */
+static int path_verify(Path *p) {
+        assert(p);
+        assert(UNIT(p)->load_state == UNIT_LOADED);
+
+        if (p->specs)
+                return 0;
+        return log_unit_error_errno(UNIT(p), SYNTHETIC_ERRNO(ENOEXEC), "Path unit lacks path setting. Refusing.");
+}
+
+/* Adds the default dependencies of a path unit (skipped when default_dependencies is off): ordering
+ * before SPECIAL_PATHS_TARGET, After+Requires on SPECIAL_SYSINIT_TARGET for the system manager only,
+ * and Before+Conflicts on SPECIAL_SHUTDOWN_TARGET. Returns the first error encountered. */
+static int path_add_default_dependencies(Path *p) {
+        Unit *u = UNIT(p);
+        int err;
+
+        assert(p);
+
+        if (!u->default_dependencies)
+                return 0;
+
+        err = unit_add_dependency_by_name(u, UNIT_BEFORE, SPECIAL_PATHS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (err >= 0 && MANAGER_IS_SYSTEM(u->manager))
+                err = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (err < 0)
+                return err;
+
+        return unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* If the unit has no unit to trigger yet, loads the related ".service" unit and adds UNIT_BEFORE
+ * and UNIT_TRIGGERS dependencies on it. Returns 0 if a trigger already exists. */
+static int path_add_trigger_dependencies(Path *p) {
+        Unit *service;
+        int err;
+
+        assert(p);
+
+        if (UNIT_TRIGGER(UNIT(p)))
+                return 0; /* a unit to trigger is already set */
+        err = unit_load_related_unit(UNIT(p), ".service", &service);
+        if (err < 0)
+                return err;
+        return unit_add_two_dependencies(UNIT(p), UNIT_BEFORE, UNIT_TRIGGERS, service, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+/* Completes a loaded path unit: fills in the default trigger rate limit where none is set, then
+ * adds the trigger, mount and default dependencies. Returns the first error. */
+static int path_add_extras(Path *p) {
+        RateLimit *limit;
+        int err;
+
+        assert(p);
+
+        /* Keep pid1 out of a busy loop (e.g. unmet condition on the associated service): if the user
+         * did not configure a limit, default to a burst of 200 per 2s interval. */
+        limit = &p->trigger_limit;
+        if (limit->interval == USEC_INFINITY)
+                limit->interval = 2 * USEC_PER_SEC;
+        if (limit->burst == UINT_MAX)
+                limit->burst = 200;
+
+        err = path_add_trigger_dependencies(p);
+        if (err >= 0)
+                err = path_add_mount_dependencies(p);
+        if (err < 0)
+                return err;
+        return path_add_default_dependencies(p);
+}
+
+/* Unit vtable .load hook: loads the fragment and drop-ins and, only if the unit ended up in
+ * UNIT_LOADED state, adds the extra settings/dependencies and verifies the unit. */
+static int path_load(Unit *u) {
+        Path *path = PATH(u);
+        int err;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        err = unit_load_fragment_and_dropin(u, true);
+        if (err < 0)
+                return err;
+        if (u->load_state != UNIT_LOADED)
+                return 0; /* nothing further to set up */
+
+        err = path_add_extras(path);
+        if (err < 0)
+                return err;
+        return path_verify(path);
+}
+
+/* Unit vtable .dump hook: writes the unit's state, result, settings and every path spec to f. */
+static void path_dump(Unit *u, FILE *f, const char *prefix) {
+        Path *p = PATH(u);
+        Unit *trigger;
+
+        assert(p);
+        assert(f);
+
+        trigger = UNIT_TRIGGER(u);
+        fprintf(f,
+                "%sPath State: %s\n"
+                "%sResult: %s\n"
+                "%sUnit: %s\n"
+                "%sMakeDirectory: %s\n"
+                "%sDirectoryMode: %04o\n"
+                "%sTriggerLimitIntervalSec: %s\n"
+                "%sTriggerLimitBurst: %u\n",
+                prefix, path_state_to_string(p->state),
+                prefix, path_result_to_string(p->result),
+                prefix, trigger ? trigger->id : "n/a",
+                prefix, yes_no(p->make_directory),
+                prefix, p->directory_mode,
+                prefix, FORMAT_TIMESPAN(p->trigger_limit.interval, USEC_PER_SEC),
+                prefix, p->trigger_limit.burst);
+        /* Followed by one line per configured path spec. */
+        LIST_FOREACH(spec, s, p->specs)
+                path_spec_dump(s, f, prefix);
+}
+
+/* Stops watching every path spec of the unit. */
+static void path_unwatch(Path *p) {
+        assert(p);
+        LIST_FOREACH(spec, s, p->specs)
+                path_spec_unwatch(s);
+}
+
+/* Starts watching every spec, with path_dispatch_io() as the event handler. Returns the first
+ * error right away; specs that were set up before the failing one are not undone here. */
+static int path_watch(Path *p) {
+        assert(p);
+
+        LIST_FOREACH(spec, cur, p->specs) {
+                int err = path_spec_watch(cur, path_dispatch_io);
+                if (err < 0)
+                        return err;
+        }
+
+        return 0;
+}
+
+/* Switches the unit to the given state and reports the old/new active state via unit_notify(). */
+static void path_set_state(Path *p, PathState state) {
+        PathState old_state;
+        assert(p);
+
+        if (p->state != state)
+                bus_unit_send_pending_change_signal(UNIT(p), false);
+
+        old_state = p->state;
+        p->state = state;
+        /* Watches are only kept in the PATH_WAITING and PATH_RUNNING states. */
+        if (!IN_SET(state, PATH_WAITING, PATH_RUNNING))
+                path_unwatch(p);
+
+        if (state != old_state)
+                log_unit_debug(UNIT(p), "Changed %s -> %s", path_state_to_string(old_state), path_state_to_string(state));
+        unit_notify(UNIT(p), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify);
+
+/* Unit vtable .coldplug hook: applies the deserialized state. A unit that was waiting or running
+ * goes through path_enter_waiting() again; any other differing state is set directly. */
+static int path_coldplug(Unit *u) {
+        Path *path = PATH(u);
+
+        assert(path);
+        assert(path->state == PATH_DEAD);
+
+        if (path->deserialized_state == path->state)
+                return 0;
+        if (IN_SET(path->deserialized_state, PATH_WAITING, PATH_RUNNING))
+                path_enter_waiting(path, true, false);
+        else
+                path_set_state(path, path->deserialized_state);
+        return 0;
+}
+
+/* Enters PATH_DEAD, or PATH_FAILED if a failure is recorded; f never overwrites an earlier failure. */
+static void path_enter_dead(Path *p, PathResult f) {
+        assert(p);
+
+        if (p->result == PATH_SUCCESS)
+                p->result = f;
+        unit_log_result(UNIT(p), p->result == PATH_SUCCESS, path_result_to_string(p->result));
+        path_set_state(p, p->result == PATH_SUCCESS ? PATH_DEAD : PATH_FAILED);
+}
+
+/* Queues a start job for the unit to trigger and enters PATH_RUNNING. A copy of trigger_path is
+ * stored in the activation details set on the job. On failure the path unit enters the dead path. */
+static void path_enter_running(Path *p, char *trigger_path) {
+        _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        Unit *trigger;
+        Job *job;
+        int r;
+
+        assert(p);
+
+        /* Don't start job if we are supposed to go down */
+        if (unit_stop_pending(UNIT(p)))
+                return;
+        /* Exceeding the trigger rate limit fails the unit with its own result code. */
+        if (!ratelimit_below(&p->trigger_limit)) {
+                log_unit_warning(UNIT(p), "Trigger limit hit, refusing further activation.");
+                path_enter_dead(p, PATH_FAILURE_TRIGGER_LIMIT_HIT);
+                return;
+        }
+
+        trigger = UNIT_TRIGGER(UNIT(p));
+        if (!trigger) {
+                log_unit_error(UNIT(p), "Unit to trigger vanished.");
+                goto fail;
+        }
+        /* Record which path caused the activation before the job is queued. */
+        details = activation_details_new(UNIT(p));
+        if (!details) {
+                log_oom();
+                goto fail;
+        }
+
+        r = free_and_strdup(&(ACTIVATION_DETAILS_PATH(details))->trigger_path_filename, trigger_path);
+        if (r < 0) {
+                log_oom();
+                goto fail;
+        }
+
+        r = manager_add_job(UNIT(p)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job);
+        if (r < 0) {
+                log_unit_warning(UNIT(p), "Failed to queue unit startup job: %s", bus_error_message(&error, r));
+                goto fail;
+        }
+
+        job_set_activation_details(job, details);
+        path_set_state(p, PATH_RUNNING);
+        path_unwatch(p);
+        return;
+
+fail:
+        path_enter_dead(p, PATH_FAILURE_RESOURCES);
+}
+
+/* Returns true at the first spec whose condition holds; specs after it are not evaluated. */
+static bool path_check_good(Path *p, bool initial, bool from_trigger_notify, char **ret_trigger_path) {
+        assert(p);
+        assert(ret_trigger_path);
+
+        LIST_FOREACH(spec, s, p->specs)
+                if (path_spec_check_good(s, initial, from_trigger_notify, ret_trigger_path))
+                        return true;
+        return false;
+}
+
+/* (Re-)evaluates the unit: follows an already active triggered unit into PATH_RUNNING, activates it
+ * if a path condition holds, or installs the watches and settles in PATH_WAITING. */
+static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify) {
+        _cleanup_free_ char *trigger_path = NULL;
+        Unit *trigger;
+        int r;
+        /* Disable a previously set up deferred trigger-notify event source, if there is one. */
+        if (p->trigger_notify_event_source)
+                (void) event_source_disable(p->trigger_notify_event_source);
+
+        /* If the triggered unit is already running, so are we */
+        trigger = UNIT_TRIGGER(UNIT(p));
+        if (trigger && !UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(trigger))) {
+                path_set_state(p, PATH_RUNNING);
+                path_unwatch(p);
+                return;
+        }
+
+        if (path_check_good(p, initial, from_trigger_notify, &trigger_path)) {
+                log_unit_debug(UNIT(p), "Got triggered.");
+                path_enter_running(p, trigger_path);
+                return;
+        }
+
+        r = path_watch(p);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(p), r, "Failed to enter waiting state: %m");
+                path_enter_dead(p, PATH_FAILURE_RESOURCES);
+                return;
+        }
+
+        /* Hmm, so now we have created inotify watches, but the file
+         * might have appeared/been removed by now, so we must
+         * recheck */
+        if (path_check_good(p, false, from_trigger_notify, &trigger_path)) {
+                log_unit_debug(UNIT(p), "Got triggered.");
+                path_enter_running(p, trigger_path);
+                return;
+        }
+        path_set_state(p, PATH_WAITING);
+}
+
+/* If make_directory is set, runs path_spec_mkdir() with directory_mode on every spec. */
+static void path_mkdir(Path *p) {
+        assert(p);
+
+        if (p->make_directory) {
+                LIST_FOREACH(spec, cur, p->specs)
+                        path_spec_mkdir(cur, p->directory_mode);
+        }
+}
+
+/* Unit vtable .start hook. Checks that the unit to trigger is loaded, acquires an invocation ID,
+ * optionally creates the watched directories, resets the result and enters the waiting logic.
+ * Returns 1 on success or the error of the failed precondition. */
+static int path_start(Unit *u) {
+        Path *path = PATH(u);
+        int err;
+
+        assert(path);
+        assert(IN_SET(path->state, PATH_DEAD, PATH_FAILED));
+
+        err = unit_test_trigger_loaded(u);
+        if (err >= 0)
+                err = unit_acquire_invocation_id(u);
+        if (err < 0)
+                return err;
+
+        path_mkdir(path);
+        path->result = PATH_SUCCESS;
+        path_enter_waiting(path, /* initial = */ true, /* from_trigger_notify = */ false);
+
+        return 1;
+}
+
+/* Unit vtable .stop hook: leaves PATH_WAITING/PATH_RUNNING via path_enter_dead(); returns 1. */
+static int path_stop(Unit *u) {
+        Path *p = PATH(u);
+
+        assert(p);
+        assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING));
+        path_enter_dead(p, PATH_SUCCESS);
+        return 1;
+}
+
+/* Unit vtable .serialize hook: writes the state, the result, one "path-spec" item per spec
+ * ("<type> <previous_exists> <escaped path>") and the trigger rate limit to f. */
+static int path_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Path *path = PATH(u);
+
+        assert(u);
+        assert(f);
+        assert(fds);
+
+        (void) serialize_item(f, "state", path_state_to_string(path->state));
+        (void) serialize_item(f, "result", path_result_to_string(path->result));
+
+        LIST_FOREACH(spec, cur, path->specs) {
+                _cleanup_free_ char *escaped_path = cescape(cur->path);
+                const char *type_name;
+
+                if (!escaped_path)
+                        return log_oom();
+
+                type_name = path_type_to_string(cur->type);
+                assert_se(type_name);
+                (void) serialize_item_format(f, "path-spec", "%s %i %s",
+                                             type_name, cur->previous_exists, escaped_path);
+        }
+
+        (void) serialize_ratelimit(f, "trigger-ratelimit", &path->trigger_limit);
+
+        return 0;
+}
+
+/* Unit vtable .deserialize_item hook: restores one serialized key/value pair. Unparsable or
+ * unknown input is only logged; the function returns 0 on every path. */
+static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Path *p = PATH(u);
+
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+
+        if (streq(key, "state")) {
+                PathState state;
+
+                state = path_state_from_string(value);
+                if (state < 0)
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+                else
+                        p->deserialized_state = state;
+
+        } else if (streq(key, "result")) {
+                PathResult f;
+
+                f = path_result_from_string(value);
+                if (f < 0)
+                        log_unit_debug(u, "Failed to parse result value: %s", value);
+                else if (f != PATH_SUCCESS)
+                        p->result = f;
+
+        } else if (streq(key, "path-spec")) {
+                int previous_exists, skip = 0;
+                _cleanup_free_ char *type_str = NULL;
+                /* Value layout: "<type> <previous_exists> <escaped path>"; %n stores the path's offset in skip. */
+                if (sscanf(value, "%ms %i %n", &type_str, &previous_exists, &skip) < 2)
+                        log_unit_debug(u, "Failed to parse path-spec value: %s", value);
+                else {
+                        _cleanup_free_ char *unescaped = NULL;
+                        ssize_t l;
+                        PathType type;
+
+                        type = path_type_from_string(type_str);
+                        if (type < 0) {
+                                log_unit_warning(u, "Unknown path type \"%s\", ignoring.", type_str);
+                                return 0;
+                        }
+
+                        l = cunescape(value+skip, 0, &unescaped);
+                        if (l < 0) {
+                                log_unit_warning_errno(u, l, "Failed to unescape serialize path: %m");
+                                return 0;
+                        }
+                        /* Only the first spec with the same type and path is updated; no match means no change. */
+                        LIST_FOREACH(spec, s, p->specs)
+                                if (s->type == type &&
+                                    path_equal(s->path, unescaped)) {
+                                        s->previous_exists = previous_exists;
+                                        break;
+                                }
+                }
+
+        } else if (streq(key, "trigger-ratelimit"))
+                deserialize_ratelimit(&p->trigger_limit, key, value);
+        else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+/* Unit vtable .active_state hook: maps the path state through state_translation_table. */
+static UnitActiveState path_active_state(Unit *u) {
+        assert(u);
+        return state_translation_table[PATH(u)->state];
+}
+
+/* Unit vtable .sub_state_to_string hook: returns the name of the current path state. */
+static const char *path_sub_state_to_string(Unit *u) {
+        assert(u);
+        return path_state_to_string(PATH(u)->state);
+}
+
+/* Event handler for a spec's inotify fd: finds the spec owning fd, consumes its events and either
+ * activates the triggered unit or re-evaluates the waiting state. Always returns 0. */
+static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        PathSpec *s = userdata, *owner = NULL;
+        Path *path;
+        int hit;
+
+        assert(s);
+        assert(s->unit);
+        assert(fd >= 0);
+
+        path = PATH(s->unit);
+        if (!IN_SET(path->state, PATH_WAITING, PATH_RUNNING))
+                return 0;
+
+        LIST_FOREACH(spec, cur, path->specs)
+                if (path_spec_owns_inotify_fd(cur, fd)) {
+                        owner = cur;
+                        break;
+                }
+
+        if (!owner) {
+                log_error("Got event on unknown fd.");
+                goto fail;
+        }
+
+        hit = path_spec_fd_event(owner, revents);
+        if (hit < 0)
+                goto fail;
+
+        if (hit == 0)
+                path_enter_waiting(path, false, false);
+        else
+                path_enter_running(path, owner->path);
+        return 0;
+
+fail:
+        path_enter_dead(path, PATH_FAILURE_RESOURCES);
+        return 0;
+}
+
+static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer);
+
+/* Handler of the deferred event source set up by path_trigger_notify_impl(): repeats the
+ * notification with on_defer set, or fails the unit if its trigger is gone. Always returns 0. */
+static int path_trigger_notify_on_defer(sd_event_source *s, void *userdata) {
+        Path *path = ASSERT_PTR(userdata);
+        Unit *target;
+
+        assert(s);
+        target = UNIT_TRIGGER(UNIT(path));
+        if (target)
+                path_trigger_notify_impl(UNIT(path), target, /* on_defer = */ true);
+        else {
+                log_unit_error(UNIT(path), "Unit to trigger vanished.");
+                path_enter_dead(path, PATH_FAILURE_RESOURCES);
+        }
+        return 0;
+}
+
+static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer) {
+        Path *p = PATH(u);
+        int r;
+
+        assert(u);
+        assert(other);
+
+        /* Invoked whenever the unit we trigger changes state or gains or loses a job. With on_defer
+         * unset this only arms a deferred event; path_enter_waiting() runs on the on_defer pass. */
+        /* Filter out invocations with bogus state */
+        assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+
+        /* Don't propagate state changes from the triggered unit if we are already down */
+        if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING))
+                return;
+
+        /* Propagate start limit hit state */
+        if (other->start_limit_hit) {
+                path_enter_dead(p, PATH_FAILURE_UNIT_START_LIMIT_HIT);
+                return;
+        }
+
+        /* Don't propagate anything if there's still a job queued */
+        if (other->job)
+                return;
+        /* Continue only if our state disagrees with the triggered unit's activity (log on first pass only). */
+        if (p->state == PATH_RUNNING &&
+            UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) {
+                if (!on_defer)
+                        log_unit_debug(u, "Got notified about unit deactivation.");
+        } else if (p->state == PATH_WAITING &&
+                   !UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) {
+                if (!on_defer)
+                        log_unit_debug(u, "Got notified about unit activation.");
+        } else
+                return;
+        /* On the deferred pass the re-evaluation is done right here. */
+        if (on_defer) {
+                path_enter_waiting(p, /* initial = */ false, /* from_trigger_notify = */ true);
+                return;
+        }
+
+        /* Do not call path_enter_waiting() directly from path_trigger_notify(), as this may be called by
+         * job_install() -> job_finish_and_invalidate() -> unit_trigger_notify(), and path_enter_waiting()
+         * may install another job and will trigger assertion in job_install().
+         * https://github.com/systemd/systemd/issues/24577#issuecomment-1522628906
+         * Hence, first setup defer event source here, and call path_enter_waiting() slightly later. */
+        if (p->trigger_notify_event_source) {
+                r = sd_event_source_set_enabled(p->trigger_notify_event_source, SD_EVENT_ONESHOT);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "Failed to enable event source for triggering notify: %m");
+                        path_enter_dead(p, PATH_FAILURE_RESOURCES);
+                        return;
+                }
+        } else {
+                r = sd_event_add_defer(u->manager->event, &p->trigger_notify_event_source, path_trigger_notify_on_defer, p);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "Failed to allocate event source for triggering notify: %m");
+                        path_enter_dead(p, PATH_FAILURE_RESOURCES);
+                        return;
+                }
+
+                (void) sd_event_source_set_description(p->trigger_notify_event_source, "path-trigger-notify");
+        }
+}
+
+static void path_trigger_notify(Unit *u, Unit *other) { /* Unit vtable .trigger_notify hook */
+        path_trigger_notify_impl(u, other, /* on_defer = */ false);
+}
+
+/* Unit vtable .reset_failed hook: returns a failed unit to PATH_DEAD and clears the result. */
+static void path_reset_failed(Unit *u) {
+        Path *p = PATH(u);
+
+        assert(p);
+
+        if (p->state == PATH_FAILED)
+                path_set_state(p, PATH_DEAD);
+        p->result = PATH_SUCCESS;
+}
+
+/* Unit vtable .can_start hook: returns 1 if unit_test_start_limit() passes. Otherwise the unit
+ * enters the dead path with PATH_FAILURE_START_LIMIT_HIT and the error is returned. */
+static int path_can_start(Unit *u) {
+        Path *path = PATH(u);
+        int err;
+
+        assert(path);
+
+        err = unit_test_start_limit(u);
+        if (err >= 0)
+                return 1;
+        path_enter_dead(path, PATH_FAILURE_START_LIMIT_HIT);
+        return err;
+}
+
+/* ActivationDetails vtable .done hook: frees the stored trigger path and resets the pointer. */
+static void activation_details_path_done(ActivationDetails *details) {
+        ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
+        p->trigger_path_filename = mfree(p->trigger_path_filename);
+}
+
+/* ActivationDetails vtable .serialize hook: writes the trigger path, if one is set, to f. */
+static void activation_details_path_serialize(ActivationDetails *details, FILE *f) {
+        ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
+
+        assert(f);
+        if (p->trigger_path_filename)
+                (void) serialize_item(f, "activation-details-path-filename", p->trigger_path_filename);
+}
+
+/* ActivationDetails vtable .deserialize hook: accepts only the "activation-details-path-filename"
+ * key and stores a copy of value as the trigger path. Returns 0 on success, -EINVAL for missing
+ * details or any other key, or the error of the string copy. */
+static int activation_details_path_deserialize(const char *key, const char *value, ActivationDetails **details) {
+        ActivationDetailsPath *path_details;
+        int err;
+
+        assert(key);
+        assert(value);
+
+        if (!details || !*details)
+                return -EINVAL;
+
+        path_details = ACTIVATION_DETAILS_PATH(*details);
+        if (!path_details || !streq(key, "activation-details-path-filename"))
+                return -EINVAL;
+
+        err = free_and_strdup(&path_details->trigger_path_filename, value);
+        if (err < 0)
+                return err;
+        return 0;
+}
+
+/* ActivationDetails vtable .append_env hook: appends "TRIGGER_PATH=<path>" to strv if a trigger
+ * path is recorded. Returns the number of variables added (0 or 1) or a negative error. */
+static int activation_details_path_append_env(ActivationDetails *details, char ***strv) {
+        ActivationDetailsPath *path_details = ACTIVATION_DETAILS_PATH(details);
+        char *assignment;
+        int err;
+
+        assert(details);
+        assert(strv);
+        assert(path_details);
+
+        if (isempty(path_details->trigger_path_filename))
+                return 0;
+
+        assignment = strjoin("TRIGGER_PATH=", path_details->trigger_path_filename);
+        if (!assignment)
+                return -ENOMEM;
+
+        /* The string is handed over to strv_consume(). */
+        err = strv_consume(strv, TAKE_PTR(assignment));
+        return err < 0 ? err : 1;
+}
+
+/* ActivationDetails vtable .append_pair hook: appends the key "trigger_path" followed by the
+ * recorded path to strv. Returns the number of pairs added (0 or 1) or a negative error. */
+static int activation_details_path_append_pair(ActivationDetails *details, char ***strv) {
+        ActivationDetailsPath *path_details = ACTIVATION_DETAILS_PATH(details);
+        int err;
+
+        assert(details);
+        assert(strv);
+        assert(path_details);
+
+        if (isempty(path_details->trigger_path_filename))
+                return 0;
+
+        err = strv_extend(strv, "trigger_path");
+        if (err >= 0)
+                err = strv_extend(strv, path_details->trigger_path_filename);
+        if (err < 0)
+                return err;
+
+        return 1;
+}
+
+static const char* const path_type_table[_PATH_TYPE_MAX] = { /* names used when dumping/serializing */
+        [PATH_EXISTS]              = "PathExists",
+        [PATH_EXISTS_GLOB]         = "PathExistsGlob",
+        [PATH_DIRECTORY_NOT_EMPTY] = "DirectoryNotEmpty",
+        [PATH_CHANGED]             = "PathChanged",
+        [PATH_MODIFIED]            = "PathModified",
+};
+/* Provides path_type_to_string() and path_type_from_string() on top of the table above. */
+DEFINE_STRING_TABLE_LOOKUP(path_type, PathType);
+
+static const char* const path_result_table[_PATH_RESULT_MAX] = { /* names used when dumping/serializing */
+        [PATH_SUCCESS]                      = "success",
+        [PATH_FAILURE_RESOURCES]            = "resources",
+        [PATH_FAILURE_START_LIMIT_HIT]      = "start-limit-hit",
+        [PATH_FAILURE_UNIT_START_LIMIT_HIT] = "unit-start-limit-hit",
+        [PATH_FAILURE_TRIGGER_LIMIT_HIT]    = "trigger-limit-hit",
+};
+/* Provides path_result_to_string() and path_result_from_string() on top of the table above. */
+DEFINE_STRING_TABLE_LOOKUP(path_result, PathResult);
+
+const UnitVTable path_vtable = {
+        .object_size = sizeof(Path),
+        /* Section names, stored as consecutive NUL-terminated strings */
+        .sections =
+                "Unit\0"
+                "Path\0"
+                "Install\0",
+        .private_section = "Path",
+
+        .can_transient = true,
+        .can_fail = true,
+        .can_trigger = true,
+
+        .init = path_init,
+        .done = path_done,
+        .load = path_load,
+
+        .coldplug = path_coldplug,
+
+        .dump = path_dump,
+
+        .start = path_start,
+        .stop = path_stop,
+
+        .serialize = path_serialize,
+        .deserialize_item = path_deserialize_item,
+
+        .active_state = path_active_state,
+        .sub_state_to_string = path_sub_state_to_string,
+        /* Invoked when the unit we trigger changes state or gains or loses a job */
+        .trigger_notify = path_trigger_notify,
+
+        .reset_failed = path_reset_failed,
+
+        .bus_set_property = bus_path_set_property,
+
+        .can_start = path_can_start,
+};
+
+const ActivationDetailsVTable activation_details_path_vtable = {
+        .object_size = sizeof(ActivationDetailsPath),
+        /* All hooks below operate on ActivationDetailsPath.trigger_path_filename */
+        .done = activation_details_path_done,
+        .serialize = activation_details_path_serialize,
+        .deserialize = activation_details_path_deserialize,
+        .append_env = activation_details_path_append_env,
+        .append_pair = activation_details_path_append_pair,
+};
diff --git a/src/core/path.h b/src/core/path.h
new file mode 100644
index 0000000..cb5b662
--- /dev/null
+++ b/src/core/path.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Path Path;
+typedef struct PathSpec PathSpec;
+typedef struct ActivationDetailsPath ActivationDetailsPath;
+
+#include "unit.h"
+
+typedef enum PathType {
+        PATH_EXISTS,              /* condition holds while access(path, F_OK) succeeds */
+        PATH_EXISTS_GLOB,         /* condition holds while the glob has a match */
+        PATH_DIRECTORY_NOT_EMPTY, /* condition holds while the directory is not empty */
+        PATH_CHANGED,             /* triggers on primary-watch inotify events or an existence flip */
+        PATH_MODIFIED,            /* handled like PATH_CHANGED; inotify mask comes from flags_table[] */
+        _PATH_TYPE_MAX,
+        _PATH_TYPE_INVALID = -EINVAL,
+} PathType;
+
+typedef struct PathSpec {
+        Unit *unit; /* the path unit this spec belongs to */
+
+        char *path; /* watched path; freed by path_spec_done() */
+
+        sd_event_source *event_source; /* released by path_spec_unwatch() */
+
+        LIST_FIELDS(struct PathSpec, spec);
+
+        PathType type;
+        int inotify_fd; /* -EBADF while not watching */
+        int primary_wd; /* watch descriptor obtained for the complete path */
+
+        bool previous_exists; /* last observed existence, used for PATH_CHANGED/PATH_MODIFIED */
+} PathSpec;
+
+int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler);
+void path_spec_unwatch(PathSpec *s);
+int path_spec_fd_event(PathSpec *s, uint32_t events);
+void path_spec_done(PathSpec *s);
+
+static inline bool path_spec_owns_inotify_fd(PathSpec *s, int fd) {
+        return s->inotify_fd == fd; /* true if fd is the inotify fd stored in this spec */
+}
+
+typedef enum PathResult {
+        PATH_SUCCESS,
+        PATH_FAILURE_RESOURCES,            /* generic failure (watch setup, job queueing, allocation, ...) */
+        PATH_FAILURE_START_LIMIT_HIT,      /* the path unit's own start limit was hit */
+        PATH_FAILURE_UNIT_START_LIMIT_HIT, /* the triggered unit reported start_limit_hit */
+        PATH_FAILURE_TRIGGER_LIMIT_HIT,    /* the trigger_limit rate limit was exceeded */
+        _PATH_RESULT_MAX,
+        _PATH_RESULT_INVALID = -EINVAL,
+} PathResult;
+
+struct Path {
+        Unit meta;
+
+        LIST_HEAD(PathSpec, specs); /* configured path specs; freed by path_free_specs() */
+
+        PathState state, deserialized_state; /* deserialized_state is applied by the coldplug hook */
+
+        bool make_directory; /* if set, the spec paths are created on start */
+        mode_t directory_mode; /* mode used for make_directory; initialized to 0755 */
+
+        PathResult result; /* first failure recorded; reset to PATH_SUCCESS on start/reset-failed */
+
+        RateLimit trigger_limit; /* checked before each activation of the triggered unit */
+
+        sd_event_source *trigger_notify_event_source; /* deferred source used for trigger notifications */
+};
+
+struct ActivationDetailsPath {
+        ActivationDetails meta;
+        char *trigger_path_filename; /* path that caused the activation; exported as TRIGGER_PATH= */
+};
+
+void path_free_specs(Path *p);
+
+extern const UnitVTable path_vtable;
+extern const ActivationDetailsVTable activation_details_path_vtable;
+
+const char* path_type_to_string(PathType i) _const_;
+PathType path_type_from_string(const char *s) _pure_;
+
+const char* path_result_to_string(PathResult i) _const_;
+PathResult path_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(PATH, Path);
+DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_PATH, ActivationDetailsPath, PATH);
diff --git a/src/core/restrict-ifaces.c b/src/core/restrict-ifaces.c
new file mode 100644
index 0000000..4dd8656
--- /dev/null
+++ b/src/core/restrict-ifaces.c
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "fd-util.h"
+#include "restrict-ifaces.h"
+#include "netlink-util.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang and llc compile time dependencies are satisfied */
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/restrict_ifaces/restrict-ifaces-skel.h"
+
+static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) {
+        restrict_ifaces_bpf__destroy(obj);
+        return NULL; /* always NULL; this function backs the restrict_ifaces_bpf_freep cleanup handler */
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_ifaces_bpf *, restrict_ifaces_bpf_free);
+
+static int prepare_restrict_ifaces_bpf(
+                Unit* u,
+                bool is_allow_list,
+                const Set *restrict_network_interfaces,
+                struct restrict_ifaces_bpf **ret_object) {
+
+        _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        char *iface;
+        int r, map_fd;
+        /* Opens, sizes, loads and fills the BPF object. On success returns 0 and stores it in *ret_object. */
+        assert(ret_object);
+        /* 'u' may be NULL (see restrict_network_interfaces_supported()); it only affects the log levels below. */
+        obj = restrict_ifaces_bpf__open();
+        if (!obj)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "restrict-interfaces: Failed to open BPF object: %m");
+        /* One map entry per listed interface, but never fewer than one. */
+        r = sym_bpf_map__set_max_entries(obj->maps.sd_restrictif, MAX(set_size(restrict_network_interfaces), 1u));
+        if (r != 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+                                "restrict-interfaces: Failed to resize BPF map '%s': %m",
+                                sym_bpf_map__name(obj->maps.sd_restrictif));
+        /* The allow/deny flag is written into rodata before the object is loaded. */
+        obj->rodata->is_allow_list = is_allow_list;
+
+        r = restrict_ifaces_bpf__load(obj);
+        if (r != 0)
+                return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, r, "restrict-interfaces: Failed to load BPF object: %m");
+        /* Fill the map: the key is the resolved interface index, the value an unused dummy byte. */
+        map_fd = sym_bpf_map__fd(obj->maps.sd_restrictif);
+
+        SET_FOREACH(iface, restrict_network_interfaces) {
+                uint8_t dummy = 0;
+                int ifindex;
+
+                ifindex = rtnl_resolve_interface(&rtnl, iface);
+                if (ifindex < 0) {
+                        log_unit_warning_errno(u, ifindex,
+                                               "restrict-interfaces: Couldn't find index of network interface '%s', ignoring: %m",
+                                               iface);
+                        continue; /* unresolvable names are skipped, they do not fail the whole setup */
+                }
+
+                if (sym_bpf_map_update_elem(map_fd, &ifindex, &dummy, BPF_ANY))
+                        return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+                                                   "restrict-interfaces: Failed to update BPF map '%s' fd: %m",
+                                                   sym_bpf_map__name(obj->maps.sd_restrictif));
+        }
+        /* Success: pass the object on to the caller instead of letting the cleanup handler free it. */
+        *ret_object = TAKE_PTR(obj);
+        return 0;
+}
+
+int restrict_network_interfaces_supported(void) {
+        _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+        static int supported = -1;
+        int r;
+        /* Probes for the feature and caches the verdict in 'supported' (-1 = not probed yet). */
+        if (supported >= 0)
+                return supported;
+
+        if (!cgroup_bpf_supported())
+                return (supported = false);
+
+        if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) {
+                log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported");
+                return (supported = false);
+        }
+        /* Trial run: prepare the object without a unit and without any interfaces, as an allow list. */
+        r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj);
+        if (r < 0) {
+                log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m");
+                return (supported = false);
+        }
+        /* Final check uses sd_restrictif_i, the program that the install path attaches as the ingress link. */
+        return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i));
+}
+
+static int restrict_network_interfaces_install_impl(Unit *u) {
+        _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL;
+        _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+        _cleanup_free_ char *cgroup_path = NULL;
+        _cleanup_close_ int cgroup_fd = -EBADF;
+        CGroupContext *cc;
+        int r;
+        /* Prepares the interface filter for 'u' and attaches its ingress and egress programs to the unit's cgroup. */
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return 0;
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m");
+        /* No interface set configured for this unit: nothing to install, report success. */
+        if (!cc->restrict_network_interfaces)
+                return 0;
+
+        r = prepare_restrict_ifaces_bpf(u,
+                cc->restrict_network_interfaces_is_allow_list,
+                cc->restrict_network_interfaces,
+                &obj);
+        if (r < 0)
+                return r;
+        /* Log the open() failure like every other error path here, so it cannot go unnoticed. */
+        cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC | O_DIRECTORY, 0);
+        if (cgroup_fd < 0)
+                return log_unit_error_errno(u, errno, "restrict-interfaces: Failed to open cgroup directory '%s': %m", cgroup_path);
+
+        ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, cgroup_fd);
+        r = sym_libbpf_get_error(ingress_link);
+        if (r != 0)
+                return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m");
+
+        egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd);
+        r = sym_libbpf_get_error(egress_link);
+        if (r != 0)
+                return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m");
+        /* Both links exist: store them in the unit; the locals are emptied so cleanup leaves them alone. */
+        u->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link);
+        u->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link);
+
+        return 0;
+}
+
+int restrict_network_interfaces_install(Unit *u) {
+        int r = restrict_network_interfaces_install_impl(u);
+        fdset_close(u->initial_restric_ifaces_link_fds); /* done whether or not the installation succeeded */
+        return r; /* result of the actual installation */
+}
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+        int r;
+
+        assert(u);
+        /* Ingress link first, then egress; both are written under the same "restrict-ifaces-bpf-fd" key. */
+        r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_ingress_bpf_link);
+        if (r < 0)
+                return r;
+
+        return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_egress_bpf_link);
+}
+
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+        int r;
+
+        assert(u);
+        /* The fd set is allocated lazily, the first time a link fd is added for this unit. */
+        if (!u->initial_restric_ifaces_link_fds) {
+                u->initial_restric_ifaces_link_fds = fdset_new();
+                if (!u->initial_restric_ifaces_link_fds)
+                        return log_oom();
+        }
+
+        r = fdset_put(u->initial_restric_ifaces_link_fds, fd);
+        if (r < 0)
+                return log_unit_error_errno(u, r,
+                        "restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd);
+
+        return 0; /* fd stored; restrict_network_interfaces_install() closes the whole set later */
+}
+
+#else /* ! BPF_FRAMEWORK */
+int restrict_network_interfaces_supported(void) {
+        return 0; /* built without BPF_FRAMEWORK: never supported */
+}
+
+int restrict_network_interfaces_install(Unit *u) {
+        return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+                        "restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m");
+}
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+        return 0; /* nothing to serialize in this build */
+}
+
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+        return 0; /* no-op: the fd is not stored anywhere in this build */
+}
+#endif
diff --git a/src/core/restrict-ifaces.h b/src/core/restrict-ifaces.h
new file mode 100644
index 0000000..6e7a824
--- /dev/null
+++ b/src/core/restrict-ifaces.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "fdset.h"
+#include "unit.h"
+
+typedef struct Unit Unit;
+
+int restrict_network_interfaces_supported(void);
+int restrict_network_interfaces_install(Unit *u);
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds);
+
+/* Add BPF link fd created before daemon-reload or daemon-reexec.
+ * FDs will be closed at the end of restrict_network_interfaces_install. */
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd);
diff --git a/src/core/scope.c b/src/core/scope.c
new file mode 100644
index 0000000..e4c27da
--- /dev/null
+++ b/src/core/scope.c
@@ -0,0 +1,829 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "cgroup-setup.h"
+#include "dbus-scope.h"
+#include "dbus-unit.h"
+#include "exit-status.h"
+#include "load-dropin.h"
+#include "log.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "scope.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+
+static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
+        [SCOPE_DEAD] = UNIT_INACTIVE,
+        [SCOPE_START_CHOWN] = UNIT_ACTIVATING, /* state set while the chown helper process runs */
+        [SCOPE_RUNNING] = UNIT_ACTIVE,
+        [SCOPE_ABANDONED] = UNIT_ACTIVE, /* an abandoned scope still reports as active */
+        [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+        [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+        [SCOPE_FAILED] = UNIT_FAILED,
+};
+
+static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+
+static void scope_init(Unit *u) {
+        Scope *s = SCOPE(u);
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+        /* Scope-specific defaults; the OOM policy stays invalid until scope_add_extras() picks one. */
+        s->runtime_max_usec = USEC_INFINITY;
+        s->timeout_stop_usec = u->manager->defaults.timeout_stop_usec;
+        u->ignore_on_isolate = true;
+        s->user = s->group = NULL;
+        s->oom_policy = _OOM_POLICY_INVALID;
+}
+
+static void scope_done(Unit *u) {
+        Scope *s = SCOPE(u);
+
+        assert(u);
+        /* Free/unref the scope's members; each pointer is overwritten with its helper's return value. */
+        s->controller = mfree(s->controller);
+        s->controller_track = sd_bus_track_unref(s->controller_track);
+
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+        s->user = mfree(s->user);
+        s->group = mfree(s->group);
+}
+
+static usec_t scope_running_timeout(Scope *s) {
+        usec_t delta = 0;
+
+        assert(s);
+        /* Result: active-enter timestamp + runtime_max_usec + a random delta (0 if no extra is configured). */
+        if (s->runtime_rand_extra_usec != 0) {
+                delta = random_u64_range(s->runtime_rand_extra_usec);
+                log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC));
+        }
+
+        return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic,
+                                 s->runtime_max_usec),
+                        delta);
+}
+
+static int scope_arm_timer(Scope *s, bool relative, usec_t usec) {
+        assert(s);
+        /* Thin wrapper: arms s->timer_event_source with scope_dispatch_timer() as the callback. */
+        return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, scope_dispatch_timer);
+}
+
+static void scope_set_state(Scope *s, ScopeState state) {
+        ScopeState old_state;
+        assert(s);
+        /* Switches to 'state', does the per-transition bookkeeping, and calls unit_notify() last. */
+        if (s->state != state)
+                bus_unit_send_pending_change_signal(UNIT(s), false);
+
+        old_state = s->state;
+        s->state = state;
+        /* The timer survives only in the four states listed here; anywhere else it is dropped. */
+        if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL, SCOPE_START_CHOWN, SCOPE_RUNNING))
+                s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+        /* Entering DEAD/FAILED from any other state: unwatch all PIDs and cancel a queued rewatch. */
+        if (!IN_SET(old_state, SCOPE_DEAD, SCOPE_FAILED) && IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
+                unit_unwatch_all_pids(UNIT(s));
+                unit_dequeue_rewatch_pids(UNIT(s));
+        }
+
+        if (state != old_state)
+                log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
+
+        unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static int scope_add_default_dependencies(Scope *s) {
+        int r;
+
+        assert(s);
+        /* Nothing to add when default_dependencies is switched off for this unit. */
+        if (!UNIT(s)->default_dependencies)
+                return 0;
+
+        /* Make sure scopes are unloaded on shutdown */
+        r = unit_add_two_dependencies_by_name(
+                        UNIT(s),
+                        UNIT_BEFORE, UNIT_CONFLICTS,
+                        SPECIAL_SHUTDOWN_TARGET, true,
+                        UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        return 0; /* any non-negative result from the call above is reported as plain 0 */
+}
+
+static int scope_verify(Scope *s) {
+        assert(s);
+        assert(UNIT(s)->load_state == UNIT_LOADED);
+        /* A scope with no PIDs is refused, unless the manager is reloading or this is SPECIAL_INIT_SCOPE. */
+        if (set_isempty(UNIT(s)->pids) &&
+            !MANAGER_IS_RELOADING(UNIT(s)->manager) &&
+            !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), "Scope has no PIDs. Refusing.");
+
+        return 0;
+}
+
+static int scope_load_init_scope(Unit *u) {
+        assert(u);
+        /* Returns 0 for ordinary scopes, and 1 once the SPECIAL_INIT_SCOPE unit has been set up. */
+        if (!unit_has_name(u, SPECIAL_INIT_SCOPE))
+                return 0;
+
+        u->transient = true;
+        u->perpetual = true;
+
+        /* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we
+         * synthesize it here, instead of relying on the unit file on disk. */
+
+        u->default_dependencies = false;
+
+        /* Prettify things, if we can. */
+        if (!u->description)
+                u->description = strdup("System and Service Manager");
+        if (!u->documentation)
+                (void) strv_extend(&u->documentation, "man:systemd(1)");
+
+        return 1;
+}
+
+static int scope_add_extras(Scope *s) {
+        int r;
+        /* Order: patch contexts, set the default slice, settle the OOM policy, add default dependencies. */
+        r = unit_patch_contexts(UNIT(s));
+        if (r < 0)
+                return r;
+
+        r = unit_set_default_slice(UNIT(s));
+        if (r < 0)
+                return r;
+        /* No policy set yet: delegated scopes get OOM_CONTINUE, all others the manager default. */
+        if (s->oom_policy < 0)
+                s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->defaults.oom_policy;
+
+        s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
+        return scope_add_default_dependencies(s);
+}
+
+static int scope_load(Unit *u) {
+        Scope *s = SCOPE(u);
+        int r;
+
+        assert(s);
+        assert(u->load_state == UNIT_STUB);
+
+        if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
+                /* Refuse to load non-transient scope units, but allow them while reloading. */
+                return -ENOENT;
+        /* This helper returns 1 for SPECIAL_INIT_SCOPE and 0 otherwise; only a negative value is acted on. */
+        r = scope_load_init_scope(u);
+        if (r < 0)
+                return r;
+
+        r = unit_load_fragment_and_dropin(u, false);
+        if (r < 0)
+                return r;
+        /* A unit that did not end up in UNIT_LOADED gets no scope-specific extras or verification. */
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        r = scope_add_extras(s);
+        if (r < 0)
+                return r;
+
+        return scope_verify(s);
+}
+
+static usec_t scope_coldplug_timeout(Scope *s) {
+        assert(s);
+        /* Timer value for scope_coldplug(), chosen by deserialized state; USEC_INFINITY for all others. */
+        switch (s->deserialized_state) {
+
+        case SCOPE_RUNNING:
+                return scope_running_timeout(s);
+
+        case SCOPE_STOP_SIGKILL:
+        case SCOPE_STOP_SIGTERM:
+                return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
+
+        default:
+                return USEC_INFINITY;
+        }
+}
+
+static int scope_coldplug(Unit *u) {
+        Scope *s = SCOPE(u);
+        int r;
+
+        assert(s);
+        assert(s->state == SCOPE_DEAD);
+        /* Brings the unit into its deserialized state; nothing to do if that is still SCOPE_DEAD. */
+        if (s->deserialized_state == s->state)
+                return 0;
+
+        r = scope_arm_timer(s, /* relative= */ false, scope_coldplug_timeout(s));
+        if (r < 0)
+                return r;
+        /* Live scopes re-watch the PIDs they already know, or queue a rewatch if the unit has no PID set. */
+        if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) {
+                if (u->pids) {
+                        PidRef *pid;
+
+                        SET_FOREACH(pid, u->pids) {
+                                r = unit_watch_pidref(u, pid, /* exclusive= */ false);
+                                if (r < 0 && r != -EEXIST)
+                                        return r;
+                        }
+                } else
+                        (void) unit_enqueue_rewatch_pids(u);
+        }
+
+        (void) bus_scope_track_controller(s); /* result deliberately ignored, as in scope_enter_running() */
+
+        scope_set_state(s, s->deserialized_state);
+        return 0;
+}
+
+static void scope_dump(Unit *u, FILE *f, const char *prefix) {
+        Scope *s = SCOPE(u);
+
+        assert(s);
+        assert(f);
+        /* Writes the scope's own fields, then the cgroup and kill contexts; 'prefix' starts every line here. */
+        fprintf(f,
+                "%sScope State: %s\n"
+                "%sResult: %s\n"
+                "%sRuntimeMaxSec: %s\n"
+                "%sRuntimeRandomizedExtraSec: %s\n"
+                "%sOOMPolicy: %s\n",
+                prefix, scope_state_to_string(s->state),
+                prefix, scope_result_to_string(s->result),
+                prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC),
+                prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
+                prefix, oom_policy_to_string(s->oom_policy));
+
+        cgroup_context_dump(UNIT(s), f, prefix);
+        kill_context_dump(&s->kill_context, f, prefix);
+}
+
+static void scope_enter_dead(Scope *s, ScopeResult f) {
+        assert(s);
+        /* 'f' is recorded only while the result is still SCOPE_SUCCESS, so an earlier failure is kept. */
+        if (s->result == SCOPE_SUCCESS)
+                s->result = f;
+        /* Any non-success result ends in SCOPE_FAILED, otherwise in SCOPE_DEAD. */
+        unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result));
+        scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD);
+}
+
+static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
+        bool skip_signal = false;
+        int r;
+
+        assert(s);
+        /* Starts the stop phase given by 'state'; 'f' is recorded only if no failure is set yet. */
+        if (s->result == SCOPE_SUCCESS)
+                s->result = f;
+
+        /* Before sending any signal, make sure we track all members of this cgroup */
+        (void) unit_watch_all_pids(UNIT(s));
+
+        /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
+         * died now */
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+        /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going
+         * directly into SIGTERM berserk mode */
+        if (state == SCOPE_STOP_SIGTERM)
+                skip_signal = bus_scope_send_request_stop(s) > 0;
+
+        if (skip_signal)
+                r = 1; /* wait */
+        else {
+                r = unit_kill_context(
+                                UNIT(s),
+                                &s->kill_context,
+                                state != SCOPE_STOP_SIGTERM ? KILL_KILL :
+                                s->was_abandoned            ? KILL_TERMINATE_AND_LOG :
+                                                              KILL_TERMINATE,
+                                /* main_pid= */ NULL,
+                                /* control_pid= */ NULL,
+                                /* main_pid_alien= */ false);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+                        goto fail;
+                }
+        }
+        /* r > 0: arm the stop timeout and wait. Otherwise escalate SIGTERM to SIGKILL, or go straight to dead. */
+        if (r > 0) {
+                r = scope_arm_timer(s, /* relative= */ true, s->timeout_stop_usec);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+                        goto fail;
+                }
+
+                scope_set_state(s, state);
+        } else if (state == SCOPE_STOP_SIGTERM)
+                scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS);
+        else
+                scope_enter_dead(s, SCOPE_SUCCESS);
+
+        return;
+
+fail:
+        scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+}
+
+static int scope_enter_start_chown(Scope *s) {
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        Unit *u = UNIT(s);
+        int r;
+
+        assert(s);
+        assert(s->user);
+        /* Forks a helper that adjusts cgroup access for the configured user/group; returns 1 once it is watched. */
+        r = scope_arm_timer(s, /* relative= */ true, u->manager->defaults.timeout_start_usec);
+        if (r < 0)
+                return r;
+
+        r = unit_fork_helper_process(u, "(sd-chown-cgroup)", &pidref);
+        if (r < 0)
+                goto fail;
+        /* r == 0 is the helper's side of the fork: every path through this branch ends in _exit(). */
+        if (r == 0) {
+                uid_t uid = UID_INVALID;
+                gid_t gid = GID_INVALID;
+
+                if (!isempty(s->user)) {
+                        const char *user = s->user;
+
+                        r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
+                        if (r < 0) {
+                                log_unit_error_errno(UNIT(s), r, "Failed to resolve user \"%s\": %m", user);
+                                _exit(EXIT_USER);
+                        }
+                }
+
+                if (!isempty(s->group)) {
+                        const char *group = s->group;
+
+                        r = get_group_creds(&group, &gid, 0);
+                        if (r < 0) {
+                                log_unit_error_errno(UNIT(s), r, "Failed to resolve group \"%s\": %m", group);
+                                _exit(EXIT_GROUP);
+                        }
+                }
+
+                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid);
+                if (r < 0) {
+                        log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m");
+                        _exit(EXIT_CGROUP);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+        /* Manager side: watch the helper; scope_sigchld_event() takes over while in SCOPE_START_CHOWN. */
+        r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+        if (r < 0)
+                goto fail;
+
+        scope_set_state(s, SCOPE_START_CHOWN);
+
+        return 1;
+fail:
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+        return r;
+}
+
+static int scope_enter_running(Scope *s) {
+        Unit *u = UNIT(s);
+        int r;
+
+        assert(s);
+        /* Attaches the unit's PIDs to its cgroup and enters SCOPE_RUNNING; returns 1 on success. */
+        (void) bus_scope_track_controller(s);
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        unit_export_state_files(u);
+
+        r = unit_attach_pids_to_cgroup(u, u->pids, NULL);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to add PIDs to scope's control group: %m");
+                goto fail;
+        }
+        if (r == 0) {
+                r = log_unit_warning_errno(u, SYNTHETIC_ERRNO(ECHILD), "No PIDs left to attach to the scope's control group, refusing.");
+                goto fail;
+        }
+        log_unit_debug(u, "%i %s added to scope's control group.", r, r == 1 ? "process" : "processes");
+
+        s->result = SCOPE_SUCCESS;
+
+        scope_set_state(s, SCOPE_RUNNING);
+
+        /* Set the maximum runtime timeout. Failure to arm it is deliberately not treated as fatal. */
+        (void) scope_arm_timer(s, /* relative= */ false, scope_running_timeout(s));
+
+        /* On unified we use proper notifications hence we can unwatch the PIDs
+         * we just attached to the scope. This can also be done on legacy as
+         * we're going to update the list of the processes we watch with the
+         * PIDs currently in the scope anyway. */
+        unit_unwatch_all_pids(u);
+
+        /* Start watching the PIDs currently in the scope (legacy hierarchy only) */
+        (void) unit_enqueue_rewatch_pids(u);
+        return 1;
+
+fail:
+        scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+        return r;
+}
+
+static int scope_start(Unit *u) {
+        Scope *s = SCOPE(u);
+
+        assert(s);
+        /* SPECIAL_INIT_SCOPE is never started through this path. */
+        if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+                return -EPERM;
+        /* A scope in SCOPE_FAILED is refused as well. */
+        if (s->state == SCOPE_FAILED)
+                return -EPERM;
+
+        /* We can't fulfill this right now, please try again later */
+        if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+                return -EAGAIN;
+
+        assert(s->state == SCOPE_DEAD);
+        /* Same rule as in scope_load(): non-transient scopes are only accepted while reloading. */
+        if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
+                return -ENOENT;
+
+        (void) unit_realize_cgroup(u);
+        (void) unit_reset_accounting(u);
+
+        /* We check only for User= option to keep behavior consistent with logic for service units,
+         * i.e. having 'Delegate=true Group=foo' w/o specifying User= has no effect. */
+        if (s->user && unit_cgroup_delegate(u))
+                return scope_enter_start_chown(s);
+
+        return scope_enter_running(s);
+}
+
+static int scope_stop(Unit *u) {
+        Scope *s = SCOPE(u);
+
+        assert(s);
+        /* Already in one of the two stop states: nothing more to initiate. */
+        if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+                return 0;
+
+        assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED));
+        /* Begin with the SIGTERM phase; returns 1 after the stop has been initiated. */
+        scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
+        return 1;
+}
+
+static void scope_reset_failed(Unit *u) {
+        Scope *s = SCOPE(u);
+
+        assert(s);
+        /* Leave SCOPE_FAILED if we are in it; the recorded result is cleared in any case. */
+        if (s->state == SCOPE_FAILED)
+                scope_set_state(s, SCOPE_DEAD);
+
+        s->result = SCOPE_SUCCESS;
+}
+
+static int scope_get_timeout(Unit *u, usec_t *timeout) {
+        Scope *s = SCOPE(u);
+        usec_t t;
+        int r;
+        /* Returns 1 and sets *timeout if a finite timer is armed, 0 if there is none, negative on error. */
+        if (!s->timer_event_source)
+                return 0;
+
+        r = sd_event_source_get_time(s->timer_event_source, &t);
+        if (r < 0)
+                return r;
+        if (t == USEC_INFINITY)
+                return 0;
+
+        *timeout = t;
+        return 1;
+}
+
+static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Scope *s = SCOPE(u);
+        PidRef *pid;
+
+        assert(s);
+        assert(f);
+        assert(fds);
+        /* Best effort: failures of individual items are deliberately ignored and 0 is always returned. */
+        (void) serialize_item(f, "state", scope_state_to_string(s->state));
+        (void) serialize_bool(f, "was-abandoned", s->was_abandoned);
+
+        if (s->controller)
+                (void) serialize_item(f, "controller", s->controller);
+
+        SET_FOREACH(pid, u->pids)
+                (void) serialize_pidref(f, fds, "pids", pid);
+
+        return 0;
+}
+
+static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Scope *s = SCOPE(u);
+        int r;
+
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+        /* Handles one key/value pair; bad values are logged and skipped, only OOM on "controller" fails. */
+        if (streq(key, "state")) {
+                ScopeState state;
+
+                state = scope_state_from_string(value);
+                if (state < 0)
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+                else
+                        s->deserialized_state = state;
+
+        } else if (streq(key, "was-abandoned")) {
+                int k;
+
+                k = parse_boolean(value);
+                if (k < 0)
+                        log_unit_debug(u, "Failed to parse boolean value: %s", value);
+                else
+                        s->was_abandoned = k;
+        } else if (streq(key, "controller")) {
+
+                r = free_and_strdup(&s->controller, value);
+                if (r < 0)
+                        return log_oom();
+
+        } else if (streq(key, "pids")) {
+                _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+                if (deserialize_pidref(fds, value, &pidref) >= 0) {
+                        r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to watch PID, ignoring: %s", value);
+                }
+        } else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+static void scope_notify_cgroup_empty_event(Unit *u) {
+        Scope *s = SCOPE(u);
+        assert(u);
+        /* An empty cgroup finishes the scope (passing SCOPE_SUCCESS), but only from the states listed below. */
+        log_unit_debug(u, "cgroup is empty");
+
+        if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+                scope_enter_dead(s, SCOPE_SUCCESS);
+}
+
+static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
+        Scope *s = SCOPE(u);
+        /* 'managed_oom' only selects the log message; the reaction below depends on s->oom_policy and state. */
+        if (managed_oom)
+                log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
+        else
+                log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+        /* With OOM_CONTINUE the event is logged only; state and result stay untouched. */
+        if (s->oom_policy == OOM_CONTINUE)
+                return;
+
+        switch (s->state) {
+
+        case SCOPE_START_CHOWN:
+        case SCOPE_RUNNING:
+                scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_OOM_KILL);
+                break;
+
+        case SCOPE_STOP_SIGTERM:
+                scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_OOM_KILL);
+                break;
+
+        case SCOPE_STOP_SIGKILL:
+                if (s->result == SCOPE_SUCCESS)
+                        s->result = SCOPE_FAILURE_OOM_KILL;
+                break;
+        /* SCOPE_DEAD, SCOPE_ABANDONED, and SCOPE_FAILED end up in default */
+        default:
+                ;
+        }
+}
+
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+        Scope *s = SCOPE(u);
+
+        assert(s);
+        /* In SCOPE_START_CHOWN the exit status decides: clean exit -> running, anything else -> dead. */
+        if (s->state == SCOPE_START_CHOWN) {
+                if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+                        scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+                else
+                        scope_enter_running(s);
+                return;
+        }
+
+        /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
+         * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
+         * process we watched was probably the parent of them, and they are hence now our children. */
+
+        (void) unit_enqueue_rewatch_pids(u);
+}
+
+static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Scope *s = SCOPE(userdata);
+
+        assert(s);
+        assert(s->timer_event_source == source);
+        /* Timer callback: what an expired timer means depends on the state the scope is in right now. */
+        switch (s->state) {
+
+        case SCOPE_RUNNING:
+                log_unit_warning(UNIT(s), "Scope reached runtime time limit. Stopping.");
+                scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_TIMEOUT);
+                break;
+
+        case SCOPE_STOP_SIGTERM:
+                if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+                        scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
+                        scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+                }
+
+                break;
+
+        case SCOPE_STOP_SIGKILL:
+                log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring.");
+                scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+                break;
+
+        case SCOPE_START_CHOWN:
+                log_unit_warning(UNIT(s), "User lookup timed out. Entering failed state.");
+                scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+int scope_abandon(Scope *s) {
+        assert(s);
+
+        if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+                return -EPERM;
+        /* Only a running (or already abandoned) scope may be abandoned; anything else gets -ESTALE. */
+        if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
+                return -ESTALE;
+
+        s->was_abandoned = true;
+        /* Drop the controller string and the bus tracking object that goes with it. */
+        s->controller = mfree(s->controller);
+        s->controller_track = sd_bus_track_unref(s->controller_track);
+
+        scope_set_state(s, SCOPE_ABANDONED);
+
+        /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that
+         * the remaining processes will be sooner or later reassigned to us as parent. */
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+        return 0;
+}
+
+static UnitActiveState scope_active_state(Unit *u) {
+        assert(u);
+        /* Maps the scope's own state to the generic unit state via state_translation_table. */
+        return state_translation_table[SCOPE(u)->state];
+}
+
+static const char *scope_sub_state_to_string(Unit *u) {
+        assert(u);
+        /* The sub-state string is simply the scope state rendered by scope_state_to_string(). */
+        return scope_state_to_string(SCOPE(u)->state);
+}
+
+/* UnitVTable.enumerate_perpetual hook: makes sure the special init scope unit exists, flags it as
+ * transient and perpetual, presets its deserialized state to SCOPE_RUNNING and queues it for loading,
+ * D-Bus announcement and cgroup realization. If the unit cannot be allocated an error is logged and
+ * nothing else happens. */
+static void scope_enumerate_perpetual(Manager *m) {
+        Unit *init_scope;
+
+        assert(m);
+
+        /* The "init.scope" special unit encapsulates PID 1 and is added unconditionally. PID 1 already
+         * lives in the matching cgroup, so all that is left to do is allocating the unit object. */
+
+        init_scope = manager_get_unit(m, SPECIAL_INIT_SCOPE);
+        if (!init_scope) {
+                int r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &init_scope);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m");
+                        return;
+                }
+        }
+
+        init_scope->transient = true;
+        init_scope->perpetual = true;
+        SCOPE(init_scope)->deserialized_state = SCOPE_RUNNING;
+
+        unit_add_to_load_queue(init_scope);
+        unit_add_to_dbus_queue(init_scope);
+
+        /* Explicitly request cgroup realization. In contrast to other cgroups this one exists already and
+         * is populated (by ourselves!), even outside of a reload cycle. The settings hence cannot be
+         * applied at creation time anymore, so they are at least applied asynchronously. */
+        unit_add_to_cgroup_realize_queue(init_scope);
+}
+
+/* String name of every ScopeResult value, indexed by the enum value. The DEFINE_STRING_TABLE_LOOKUP()
+ * invocation below presumably expands to the scope_result_to_string()/scope_result_from_string()
+ * helpers that scope.h declares. */
+static const char* const scope_result_table[_SCOPE_RESULT_MAX] = {
+        [SCOPE_SUCCESS]           = "success",
+        [SCOPE_FAILURE_RESOURCES] = "resources",
+        [SCOPE_FAILURE_TIMEOUT]   = "timeout",
+        [SCOPE_FAILURE_OOM_KILL]  = "oom-kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult);
+
+/* UnitVTable instance of the scope unit type: object size, offsets of the embedded contexts, the
+ * unit file sections it accepts, capability flags and the callbacks implementing the type. */
+const UnitVTable scope_vtable = {
+        .object_size = sizeof(Scope),
+        .cgroup_context_offset = offsetof(Scope, cgroup_context),
+        .kill_context_offset = offsetof(Scope, kill_context),
+
+        /* NUL-separated list of section names */
+        .sections =
+                "Unit\0"
+                "Scope\0"
+                "Install\0",
+        .private_section = "Scope",
+
+        .can_transient = true,
+        .can_delegate = true,
+        .can_fail = true,
+        .once_only = true,
+        .can_set_managed_oom = true,
+
+        .init = scope_init,
+        .load = scope_load,
+        .done = scope_done,
+
+        .coldplug = scope_coldplug,
+
+        .dump = scope_dump,
+
+        .start = scope_start,
+        .stop = scope_stop,
+
+        /* Freezing/thawing uses the common unit helpers rather than scope-specific code */
+        .freeze = unit_freeze_vtable_common,
+        .thaw = unit_thaw_vtable_common,
+
+        .get_timeout = scope_get_timeout,
+
+        .serialize = scope_serialize,
+        .deserialize_item = scope_deserialize_item,
+
+        .active_state = scope_active_state,
+        .sub_state_to_string = scope_sub_state_to_string,
+
+        .sigchld_event = scope_sigchld_event,
+
+        .reset_failed = scope_reset_failed,
+
+        .notify_cgroup_empty = scope_notify_cgroup_empty_event,
+        .notify_cgroup_oom = scope_notify_cgroup_oom_event,
+
+        .bus_set_property = bus_scope_set_property,
+        .bus_commit_properties = bus_scope_commit_properties,
+
+        .enumerate_perpetual = scope_enumerate_perpetual,
+};
diff --git a/src/core/scope.h b/src/core/scope.h
new file mode 100644
index 0000000..c9574a3
--- /dev/null
+++ b/src/core/scope.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Scope Scope;
+
+#include "cgroup.h"
+#include "kill.h"
+#include "unit.h"
+
+/* Result of a scope unit. The string form of each value is defined by scope_result_table in scope.c. */
+typedef enum ScopeResult {
+        SCOPE_SUCCESS,
+        SCOPE_FAILURE_RESOURCES,
+        SCOPE_FAILURE_TIMEOUT,           /* used when a stop or user lookup timer elapses */
+        SCOPE_FAILURE_OOM_KILL,
+        _SCOPE_RESULT_MAX,               /* number of valid results, used to size lookup tables */
+        _SCOPE_RESULT_INVALID = -EINVAL, /* marker for "not a valid result" */
+} ScopeResult;
+
+/* Per-unit data of a scope unit. The generic Unit object is embedded as the first member. */
+struct Scope {
+        Unit meta;
+
+        /* Located by generic code through the offsets stored in scope_vtable */
+        CGroupContext cgroup_context;
+        KillContext kill_context;
+
+        /* Current state, and the state to restore; scope_enumerate_perpetual() presets the latter to
+         * SCOPE_RUNNING for the init scope */
+        ScopeState state, deserialized_state;
+        ScopeResult result;
+
+        /* NOTE(review): presumably timeouts in microseconds, judging by usec_t; confirm against scope.c */
+        usec_t runtime_max_usec;
+        usec_t runtime_rand_extra_usec;
+        usec_t timeout_stop_usec;
+
+        /* Controller string and its bus tracking object; both are released by scope_abandon() */
+        char *controller;
+        sd_bus_track *controller_track;
+
+        /* Set to true by scope_abandon() */
+        bool was_abandoned;
+
+        sd_event_source *timer_event_source;
+
+        /* NOTE(review): presumably the user/group resolved in the SCOPE_START_CHOWN state; confirm */
+        char *user;
+        char *group;
+
+        OOMPolicy oom_policy;
+};
+
+extern const UnitVTable scope_vtable;
+
+int scope_abandon(Scope *s);
+
+const char* scope_result_to_string(ScopeResult i) _const_;
+ScopeResult scope_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SCOPE, Scope);
diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c
new file mode 100644
index 0000000..62181a6
--- /dev/null
+++ b/src/core/selinux-access.c
@@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "selinux-access.h"
+
+#if HAVE_SELINUX
+
+#include 
+#include 
+#include 
+#if HAVE_AUDIT
+#include 
+#endif
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "audit-fd.h"
+#include "bus-util.h"
+#include "errno-util.h"
+#include "format-util.h"
+#include "log.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+
+/* Set by access_init() once the AVC has been opened and our libselinux callbacks are installed */
+static bool initialized = false;
+
+/* Audit data passed to selinux_check_access() by mac_selinux_access_check_internal() and read back
+ * by audit_callback() when libselinux generates an audit record. */
+struct audit_info {
+        sd_bus_creds *creds;    /* credentials of the sender of the bus message */
+        const char *path;       /* unit path, may be NULL */
+        const char *cmdline;    /* sender command line joined with spaces, may be NULL */
+        const char *function;   /* name of the function requesting the check, may be NULL */
+};
+
+/*
+   Audit callback installed via SELINUX_CB_AUDIT. It receives our struct audit_info as audit data and
+   formats it into msgbuf: the sender's audit login UID, effective UID and effective GID (each shown
+   as "n/a" if it cannot be queried), followed by the optional path, cmdline and function fields.
+   Always returns 0.
+*/
+static int audit_callback(
+                void *auditdata,
+                security_class_t cls,
+                char *msgbuf,
+                size_t msgbufsize) {
+
+        const struct audit_info *info = auditdata;
+        char login_uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a";
+        char uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a";
+        char gid_buf[DECIMAL_STR_MAX(gid_t) + 1] = "n/a";
+        uid_t auid = 0, euid = 0;
+        gid_t egid = 0;
+
+        /* Each buffer keeps its "n/a" default unless the matching credential is available. */
+        if (sd_bus_creds_get_audit_login_uid(info->creds, &auid) >= 0)
+                xsprintf(login_uid_buf, UID_FMT, auid);
+        if (sd_bus_creds_get_euid(info->creds, &euid) >= 0)
+                xsprintf(uid_buf, UID_FMT, euid);
+        if (sd_bus_creds_get_egid(info->creds, &egid) >= 0)
+                xsprintf(gid_buf, GID_FMT, egid);
+
+        const char *path = info->path, *cmdline = info->cmdline, *function = info->function;
+
+        /* Optional fields are emitted as ' key="value"', or left out entirely when unset. */
+        (void) snprintf(msgbuf, msgbufsize,
+                        "auid=%s uid=%s gid=%s%s%s%s%s%s%s%s%s%s",
+                        login_uid_buf, uid_buf, gid_buf,
+                        path ? " path=\"" : "", strempty(path), path ? "\"" : "",
+                        cmdline ? " cmdline=\"" : "", strempty(cmdline), cmdline ? "\"" : "",
+                        function ? " function=\"" : "", strempty(function), function ? "\"" : "");
+
+        return 0;
+}
+
+/* Maps a libselinux log callback type to a syslog priority. SELINUX_AVC and any type not listed
+ * here are reported as LOG_NOTICE. */
+static int callback_type_to_priority(int type) {
+        if (type == SELINUX_ERROR)
+                return LOG_ERR;
+
+        if (type == SELINUX_WARNING)
+                return LOG_WARNING;
+
+        if (type == SELINUX_INFO)
+                return LOG_INFO;
+
+        /* SELINUX_AVC and everything else */
+        return LOG_NOTICE;
+}
+
+/*
+   libselinux uses this callback when access gets denied or other
+   events happen. If audit is turned on, messages will be reported
+   using audit netlink, otherwise they will be logged using the usual
+   channels.
+
+   Code copied from dbus and modified.
+
+   With audit support compiled in and an audit fd available, only SELINUX_AVC and SELINUX_ERROR
+   messages are forwarded to audit; other types are dropped once the message was formatted
+   successfully. If there is no audit fd, or formatting fails, the message is logged through
+   log_internalv() with a "selinux: " prefix. Always returns 0.
+*/
+_printf_(2, 3) static int log_callback(int type, const char *fmt, ...) {
+        va_list ap;
+        const char *fmt2;
+
+#if HAVE_AUDIT
+        int fd;
+
+        fd = get_audit_fd();
+
+        if (fd >= 0) {
+                _cleanup_free_ char *buf = NULL;
+                int r;
+
+                va_start(ap, fmt);
+                r = vasprintf(&buf, fmt, ap);
+                va_end(ap);
+
+                /* On formatting failure fall through to the regular logging below */
+                if (r >= 0) {
+                        if (type == SELINUX_AVC)
+                                audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_AVC, buf, NULL, NULL, NULL, getuid());
+                        else if (type == SELINUX_ERROR)
+                                audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_SELINUX_ERR, buf, NULL, NULL, NULL, getuid());
+
+                        return 0;
+                }
+        }
+#endif
+
+        /* Prefix the caller's format string so the origin of the message is visible in the log */
+        fmt2 = strjoina("selinux: ", fmt);
+
+        va_start(ap, fmt);
+
+        /* fmt2 is built at runtime, hence the format-nonliteral warning is silenced here */
+        DISABLE_WARNING_FORMAT_NONLITERAL;
+        log_internalv(LOG_AUTH | callback_type_to_priority(type),
+                      0, PROJECT_FILE, __LINE__, __func__,
+                      fmt2, ap);
+        REENABLE_WARNING;
+        va_end(ap);
+
+        return 0;
+}
+
+/* Opens the SELinux AVC and installs our audit and log callbacks, once.
+ *
+ * Returns 0 if no access checks shall be done (SELinux not in use, or the AVC could not be opened
+ * while security_getenforce() reports 0), 1 if the AVC is ready, and the result of
+ * sd_bus_error_setf() with an access denied error set in 'error' if the AVC could not be opened
+ * otherwise. */
+static int access_init(sd_bus_error *error) {
+        int r;
+
+        if (!mac_selinux_use())
+                return 0;
+
+        if (initialized)
+                return 1;
+
+        if (avc_open(NULL, 0) != 0) {
+                r = -errno;  /* Save original errno for later */
+
+                /* Note that a failing security_getenforce() (negative return) counts as enforcing here */
+                bool enforce = security_getenforce() != 0;
+                log_full_errno(enforce ? LOG_ERR : LOG_WARNING, r, "Failed to open the SELinux AVC: %m");
+
+                /* If enforcement isn't on, then let's suppress this error, and just don't do any AVC checks.
+                 * The warning we printed is hence all the admin will see. */
+                if (!enforce)
+                        return 0;
+
+                /* Return an access denied error based on the original errno, if we couldn't load the AVC but
+                 * enforcing mode was on, or we couldn't determine whether it is one. */
+                errno = -r; /* restore errno so that %m below expands to the avc_open() error */
+                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to open the SELinux AVC: %m");
+        }
+
+        selinux_set_callback(SELINUX_CB_AUDIT, (union selinux_callback) { .func_audit = audit_callback });
+        selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) { .func_log = log_callback });
+
+        initialized = true;
+        return 1;
+}
+
+/*
+   This function communicates with the kernel to check whether or not it should
+   allow the access.
+   If the machine is in permissive mode it will return ok.  Audit messages will
+   still be generated if the access would be denied in enforcing mode.
+
+   The source context is the SELinux context of the sender of 'message'. The target is
+   'unit_context' with class "service" if given, otherwise our own context with class "system".
+   'unit_path' and 'function' are only used for audit and debug output.
+
+   Returns 0 if the access is permitted or no check is done, and a negative errno-style value
+   otherwise. When the check itself denies access in enforcing mode, 'error' is set to an access
+   denied error.
+*/
+int mac_selinux_access_check_internal(
+                sd_bus_message *message,
+                const char *unit_path,
+                const char *unit_context,
+                const char *permission,
+                const char *function,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        const char *tclass, *scon, *acon;
+        _cleanup_free_ char *cl = NULL;
+        _cleanup_freecon_ char *fcon = NULL;
+        char **cmdline = NULL;
+        bool enforce;
+        int r = 0;
+
+        assert(message);
+        assert(permission);
+        assert(function);
+        assert(error);
+
+        /* r == 0 means "no checks to be done", r < 0 is an error that was already put into 'error' */
+        r = access_init(error);
+        if (r <= 0)
+                return r;
+
+        /* delay call until we checked in `access_init()` if SELinux is actually enabled */
+        enforce = mac_selinux_enforcing();
+
+        r = sd_bus_query_sender_creds(
+                        message,
+                        SD_BUS_CREDS_PID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID|
+                        SD_BUS_CREDS_CMDLINE|SD_BUS_CREDS_AUDIT_LOGIN_UID|
+                        SD_BUS_CREDS_SELINUX_CONTEXT|
+                        SD_BUS_CREDS_AUGMENT /* get more bits from /proc */,
+                        &creds);
+        if (r < 0)
+                return r;
+
+        /* The SELinux context is something we really should have gotten directly from the message or sender,
+         * and not be an augmented field. If it was augmented we cannot use it for authorization, since this
+         * is racy and vulnerable. Let's add an extra check, just in case, even though this really shouldn't
+         * be possible. */
+        assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_SELINUX_CONTEXT) == 0, -EPERM);
+
+        r = sd_bus_creds_get_selinux_context(creds, &scon);
+        if (r < 0)
+                return r;
+
+        if (unit_context) {
+                /* Nice! The unit comes with a SELinux context read from the unit file */
+                acon = unit_context;
+                tclass = "service";
+        } else {
+                /* If no unit context is known, use our own */
+                if (getcon_raw(&fcon) < 0) {
+                        log_warning_errno(errno, "SELinux getcon_raw() failed%s (perm=%s): %m",
+                                          enforce ? "" : ", ignoring",
+                                          permission);
+                        if (!enforce)
+                                return 0;
+
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get current context: %m");
+                }
+                /* getcon_raw() succeeded but handed us no context */
+                if (!fcon) {
+                        if (!enforce)
+                                return 0;
+
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "We appear not to have any SELinux context: %m");
+                }
+
+                acon = fcon;
+                tclass = "system";
+        }
+
+        /* The command line is only used for audit and debug output; the return value is not checked,
+         * cmdline simply stays NULL if it was not filled in */
+        sd_bus_creds_get_cmdline(creds, &cmdline);
+        cl = strv_join(cmdline, " ");
+
+        struct audit_info audit_info = {
+                .creds = creds,
+                .path = unit_path,
+                .cmdline = cl,
+                .function = function,
+        };
+
+        r = selinux_check_access(scon, acon, tclass, permission, &audit_info);
+        if (r < 0) {
+                /* Turn the failure into a negative errno-style value, falling back to EPERM, and keep
+                 * errno in sync so that %m below expands to it */
+                errno = -(r = errno_or_else(EPERM));
+
+                if (enforce)
+                        sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "SELinux policy denies access: %m");
+        }
+
+        log_full_errno_zerook(LOG_DEBUG, r,
+                              "SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s function=%s path=%s cmdline=%s: %m",
+                              scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), strna(empty_to_null(cl)));
+        /* In permissive mode a denial is only logged, never returned */
+        return enforce ? r : 0;
+}
+
+#else /* HAVE_SELINUX */
+
+/* Variant used when built without SELinux support: no check is done, every access is permitted. */
+int mac_selinux_access_check_internal(
+                sd_bus_message *message,
+                const char *unit_path,
+                const char *unit_label,
+                const char *permission,
+                const char *function,
+                sd_bus_error *error) {
+
+        return 0;
+}
+
+#endif /* HAVE_SELINUX */
diff --git a/src/core/selinux-access.h b/src/core/selinux-access.h
new file mode 100644
index 0000000..dc8da9e
--- /dev/null
+++ b/src/core/selinux-access.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "manager.h"
+
+/* Checks whether the sender of 'message' is granted 'permission'. Do not call this directly, use
+ * one of the two macros below, which pass the caller's function name for audit/debug output. */
+int mac_selinux_access_check_internal(sd_bus_message *message, const char *unit_path, const char *unit_label, const char *permission, const char *function, sd_bus_error *error);
+
+/* Access check that is not tied to a unit: no unit path and no unit label are passed */
+#define mac_selinux_access_check(message, permission, error) \
+        mac_selinux_access_check_internal((message), NULL, NULL, (permission), __func__, (error))
+
+/* Access check against a specific unit, using its fragment path and its access_selinux_context */
+#define mac_selinux_unit_access_check(unit, message, permission, error) \
+        mac_selinux_access_check_internal((message), (unit)->fragment_path, (unit)->access_selinux_context, (permission), __func__, (error))
diff --git a/src/core/selinux-setup.c b/src/core/selinux-setup.c
new file mode 100644
index 0000000..bc1a249
--- /dev/null
+++ b/src/core/selinux-setup.c
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#if HAVE_SELINUX
+#include 
+#endif
+
+#include "sd-messages.h"
+
+#include "initrd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "selinux-setup.h"
+#include "selinux-util.h"
+#include "string-util.h"
+#include "time-util.h"
+
+#if HAVE_SELINUX
+/* Log callback that discards every message; mac_selinux_setup() installs it to silence
+ * libselinux' own logging. */
+_printf_(2,3)
+static int null_log(int type, const char *fmt, ...) {
+        return 0;
+}
+#endif
+
+/* Loads the SELinux policy and tries to transition into the label computed for SYSTEMD_BINARY_PATH.
+ *
+ * *loaded_policy is set to true only if the policy was loaded successfully; it is left untouched
+ * otherwise. Failures to compute or enter the new label are logged and ignored.
+ *
+ * Returns 0, except when loading the policy fails while enforcing mode is requested and no policy
+ * appears to be loaded yet (our current context is unavailable or "kernel"); in that case the
+ * failure is logged at LOG_EMERG with a synthetic EIO and the result of log_struct_errno() is
+ * returned. When built without SELinux support this does nothing and returns 0. */
+int mac_selinux_setup(bool *loaded_policy) {
+
+#if HAVE_SELINUX
+        int enforce = 0;
+        usec_t before_load, after_load;
+        char *con;
+        int r;
+        bool initialized;
+
+        assert(loaded_policy);
+
+        /* Turn off all of SELinux' own logging, we want to do that */
+        selinux_set_callback(SELINUX_CB_LOG, (const union selinux_callback) { .func_log = null_log });
+
+        /* Don't load policy in the initrd if we don't appear to have it.  For the real root, we check below
+         * if we've already loaded policy, and return gracefully. */
+        if (in_initrd() && access(selinux_path(), F_OK) < 0)
+                return 0;
+
+        /* Already initialized by somebody else? */
+        r = getcon_raw(&con);
+        /* getcon_raw can return 0, and still give us a NULL pointer if /proc/self/attr/current is
+         * empty. SELinux guarantees this won't happen, but that file isn't specific to SELinux, and may be
+         * provided by some other arbitrary LSM with different semantics. */
+        if (r == 0 && con) {
+                initialized = !streq(con, "kernel");
+                freecon(con);
+        } else
+                initialized = false;
+
+        /* Make sure we have no fds open while loading the policy and
+         * transitioning */
+        log_close();
+
+        /* Now load the policy */
+        before_load = now(CLOCK_MONOTONIC);
+        r = selinux_init_load_policy(&enforce);
+        if (r == 0) {
+                _cleanup_(mac_selinux_freep) char *label = NULL;
+
+                mac_selinux_retest();
+
+                /* Transition to the new context */
+                r = mac_selinux_get_create_label_from_exe(SYSTEMD_BINARY_PATH, &label);
+                if (r < 0 || !label) {
+                        log_open();
+                        log_error("Failed to compute init label, ignoring.");
+                } else {
+                        r = setcon_raw(label);
+
+                        /* Logging is reopened only after the transition was attempted */
+                        log_open();
+                        if (r < 0)
+                                log_error("Failed to transition into init label '%s', ignoring.", label);
+                }
+
+                after_load = now(CLOCK_MONOTONIC);
+
+                log_info("Successfully loaded SELinux policy in %s.",
+                         FORMAT_TIMESPAN(after_load - before_load, 0));
+
+                *loaded_policy = true;
+
+        } else {
+                log_open();
+
+                if (enforce > 0) {
+                        /* Enforcing mode without any policy loaded is fatal; with an old policy in
+                         * place we merely warn and go on. */
+                        if (!initialized)
+                                return log_struct_errno(LOG_EMERG, SYNTHETIC_ERRNO(EIO),
+                                                        LOG_MESSAGE("Failed to load SELinux policy: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_SELINUX_FAILED_STR);
+
+                        log_warning("Failed to load new SELinux policy. Continuing with old policy.");
+                } else
+                        log_debug("Unable to load SELinux policy. Ignoring.");
+        }
+#endif
+
+        return 0;
+}
diff --git a/src/core/selinux-setup.h b/src/core/selinux-setup.h
new file mode 100644
index 0000000..cdff51d
--- /dev/null
+++ b/src/core/selinux-setup.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+int mac_selinux_setup(bool *loaded_policy);
diff --git a/src/core/service.c b/src/core/service.c
new file mode 100644
index 0000000..060ac08
--- /dev/null
+++ b/src/core/service.c
@@ -0,0 +1,5161 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "async.h"
+#include "bus-error.h"
+#include "bus-kernel.h"
+#include "bus-util.h"
+#include "chase.h"
+#include "constants.h"
+#include "dbus-service.h"
+#include "dbus-unit.h"
+#include "devnum-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "manager.h"
+#include "missing_audit.h"
+#include "open-file.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "service.h"
+#include "signal-util.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "utf8.h"
+
+/* Convenience wrapper that passes the caller's function name (__func__) as the first argument of
+ * service_spawn_internal(). */
+#define service_spawn(...) service_spawn_internal(__func__, __VA_ARGS__)
+
+/* Maps every ServiceState to the generic UnitActiveState it corresponds to. Services of Type=idle
+ * have a separate table, state_translation_table_idle. */
+static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
+        [SERVICE_DEAD] = UNIT_INACTIVE,
+        [SERVICE_CONDITION] = UNIT_ACTIVATING,
+        [SERVICE_START_PRE] = UNIT_ACTIVATING,
+        [SERVICE_START] = UNIT_ACTIVATING,
+        [SERVICE_START_POST] = UNIT_ACTIVATING,
+        [SERVICE_RUNNING] = UNIT_ACTIVE,
+        [SERVICE_EXITED] = UNIT_ACTIVE,
+        [SERVICE_RELOAD] = UNIT_RELOADING,
+        [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
+        [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
+        [SERVICE_STOP] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+        [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+        [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+        [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+        [SERVICE_FAILED] = UNIT_FAILED,
+        [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
+        [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED,
+        [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
+        [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+        [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
+        [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+};
+
+/* For Type=idle we never want to delay any other jobs, hence we
+ * consider idle jobs active as soon as we start working on them.
+ *
+ * Compared to state_translation_table the only difference is that SERVICE_CONDITION,
+ * SERVICE_START_PRE, SERVICE_START and SERVICE_START_POST map to UNIT_ACTIVE instead of
+ * UNIT_ACTIVATING. */
+static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = {
+        [SERVICE_DEAD] = UNIT_INACTIVE,
+        [SERVICE_CONDITION] = UNIT_ACTIVE,
+        [SERVICE_START_PRE] = UNIT_ACTIVE,
+        [SERVICE_START] = UNIT_ACTIVE,
+        [SERVICE_START_POST] = UNIT_ACTIVE,
+        [SERVICE_RUNNING] = UNIT_ACTIVE,
+        [SERVICE_EXITED] = UNIT_ACTIVE,
+        [SERVICE_RELOAD] = UNIT_RELOADING,
+        [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
+        [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
+        [SERVICE_STOP] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+        [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+        [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+        [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+        [SERVICE_FAILED] = UNIT_FAILED,
+        [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
+        [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED,
+        [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
+        [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+        [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
+        [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
+static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata);
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
+
+static void service_enter_signal(Service *s, ServiceState state, ServiceResult f);
+static void service_enter_reload_by_notify(Service *s);
+
+/* Puts a freshly allocated service (load state UNIT_STUB) into its initial configuration: timeouts
+ * and the restart delay are taken from the manager's defaults, fds and PIDs are marked as unset,
+ * and the remaining settings get their built-in defaults. */
+static void service_init(Unit *u) {
+        Service *s = SERVICE(u);
+        Manager *m;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        m = u->manager;
+
+        /* Settings inherited from the manager */
+        s->timeout_start_usec = m->defaults.timeout_start_usec;
+        s->timeout_stop_usec = m->defaults.timeout_stop_usec;
+        s->timeout_abort_usec = m->defaults.timeout_abort_usec;
+        s->timeout_abort_set = m->defaults.timeout_abort_set;
+        s->restart_usec = m->defaults.restart_usec;
+
+        s->restart_max_delay_usec = USEC_INFINITY;
+        s->runtime_max_usec = USEC_INFINITY;
+        s->type = _SERVICE_TYPE_INVALID;
+
+        /* No file descriptors and no processes yet */
+        s->socket_fd = -EBADF;
+        s->stderr_fd = -EBADF;
+        s->stdout_fd = -EBADF;
+        s->stdin_fd = -EBADF;
+        s->guess_main_pid = true;
+        s->main_pid = PIDREF_NULL;
+        s->control_pid = PIDREF_NULL;
+        s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+        /* The system manager gives services a private keyring, other managers let them inherit it */
+        if (MANAGER_IS_SYSTEM(m))
+                s->exec_context.keyring_mode = EXEC_KEYRING_PRIVATE;
+        else
+                s->exec_context.keyring_mode = EXEC_KEYRING_INHERIT;
+
+        s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+
+        s->watchdog_original_usec = USEC_INFINITY;
+
+        s->oom_policy = _OOM_POLICY_INVALID;
+        s->reload_begin_usec = USEC_INFINITY;
+        s->reload_signal = SIGHUP;
+
+        s->fd_store_preserve_mode = EXEC_PRESERVE_RESTART;
+}
+
+/* Stops watching the control process, if one is set, and clears the stored reference to it. */
+static void service_unwatch_control_pid(Service *s) {
+        assert(s);
+
+        if (pidref_is_set(&s->control_pid)) {
+                unit_unwatch_pidref(UNIT(s), &s->control_pid);
+                pidref_done(&s->control_pid);
+        }
+}
+
+/* Stops watching the main process, if one is set, and clears the stored reference to it. */
+static void service_unwatch_main_pid(Service *s) {
+        assert(s);
+
+        if (pidref_is_set(&s->main_pid)) {
+                unit_unwatch_pidref(UNIT(s), &s->main_pid);
+                pidref_done(&s->main_pid);
+        }
+}
+
+/* Stops watching the service's PID file and frees the path spec used for it. Does nothing if no
+ * PID file watch is set up. */
+static void service_unwatch_pid_file(Service *s) {
+        /* Validate the argument like the other service_unwatch_*() helpers do */
+        assert(s);
+
+        if (!s->pid_file_pathspec)
+                return;
+
+        log_unit_debug(UNIT(s), "Stopping watch for PID file %s", s->pid_file_pathspec->path);
+        path_spec_unwatch(s->pid_file_pathspec);
+        path_spec_done(s->pid_file_pathspec);
+        s->pid_file_pathspec = mfree(s->pid_file_pathspec);
+}
+
+/* Makes 'pidref' the main process of the service.
+ *
+ * Returns 0 on success, -ESRCH if the pidref is not set, and -EINVAL if it refers to PID 1 (or
+ * lower) or to ourselves. On success the pidref is consumed. */
+static int service_set_main_pidref(Service *s, PidRef *pidref) {
+        int r;
+
+        assert(s);
+
+        /* Takes ownership of the specified pidref on success, but not on failure. */
+
+        if (!pidref_is_set(pidref))
+                return -ESRCH;
+
+        if (pidref->pid <= 1)
+                return -EINVAL;
+
+        if (pidref_is_self(pidref))
+                return -EINVAL;
+
+        /* Same process as before and already known: nothing to update, just release the caller's pidref */
+        if (pidref_equal(&s->main_pid, pidref) && s->main_pid_known) {
+                pidref_done(pidref);
+                return 0;
+        }
+
+        /* A different process: drop the watch on the old one and start a fresh exec status */
+        if (!pidref_equal(&s->main_pid, pidref)) {
+                service_unwatch_main_pid(s);
+                exec_status_start(&s->main_exec_status, pidref->pid);
+        }
+
+        s->main_pid = TAKE_PIDREF(*pidref);
+        s->main_pid_known = true;
+
+        r = pidref_is_my_child(&s->main_pid);
+        if (r < 0)
+                log_unit_warning_errno(UNIT(s), r, "Can't determine if process "PID_FMT" is our child, assuming it is not: %m", s->main_pid.pid);
+        else if (r == 0)
+                log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", s->main_pid.pid);
+
+        /* Both "not our child" and "could not tell" mark the main PID as alien */
+        s->main_pid_alien = r <= 0;
+        return 0;
+}
+
+/* Releases everything that ties the service to a connection socket: the connection fd, the reference
+ * to the accepting socket unit and the socket peer. Does nothing if none of these is set. */
+void service_release_socket_fd(Service *s) {
+        assert(s);
+
+        if (!(s->socket_fd >= 0 || UNIT_ISSET(s->accept_socket) || s->socket_peer))
+                return;
+
+        log_unit_debug(UNIT(s), "Closing connection socket.");
+
+        /* This reverts what service_set_socket_fd() did. */
+
+        s->socket_fd = asynchronous_close(s->socket_fd);
+
+        if (UNIT_ISSET(s->accept_socket)) {
+                Socket *sock = SOCKET(UNIT_DEREF(s->accept_socket));
+
+                socket_connection_unref(sock);
+                unit_ref_unset(&s->accept_socket);
+        }
+
+        s->socket_peer = socket_peer_unref(s->socket_peer);
+}
+
+/* Stores a new notify access override for the service and logs both the configured value and the
+ * override at debug level. The configured notify_access itself is not modified. */
+static void service_override_notify_access(Service *s, NotifyAccess notify_access_override) {
+        assert(s);
+
+        s->notify_access_override = notify_access_override;
+
+        log_unit_debug(UNIT(s), "notify_access=%s", notify_access_to_string(s->notify_access));
+        log_unit_debug(UNIT(s), "notify_access_override=%s", notify_access_to_string(s->notify_access_override));
+}
+
+/* Disables and releases the watchdog timer event source and resets the watchdog timestamp. */
+static void service_stop_watchdog(Service *s) {
+        assert(s);
+
+        s->watchdog_event_source = sd_event_source_disable_unref(s->watchdog_event_source);
+        s->watchdog_timestamp = DUAL_TIMESTAMP_NULL;
+}
+
+/* (Re)arms the watchdog timer so that it elapses service_get_watchdog_usec() after the monotonic
+ * watchdog timestamp. If no watchdog timeout is set, the watchdog is stopped instead. An existing
+ * event source is reused, otherwise a new one is allocated. Failures are logged as warnings and
+ * otherwise ignored. */
+static void service_start_watchdog(Service *s) {
+        usec_t timeout, deadline;
+        int r;
+
+        assert(s);
+
+        timeout = service_get_watchdog_usec(s);
+        if (!timestamp_is_set(timeout)) {
+                service_stop_watchdog(s);
+                return;
+        }
+
+        deadline = usec_add(s->watchdog_timestamp.monotonic, timeout);
+
+        if (!s->watchdog_event_source) {
+                /* First use: allocate the timer */
+                r = sd_event_add_time(
+                                UNIT(s)->manager->event,
+                                &s->watchdog_event_source,
+                                CLOCK_MONOTONIC,
+                                deadline, 0,
+                                service_dispatch_watchdog, s);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to add watchdog timer: %m");
+                        return;
+                }
+
+                (void) sd_event_source_set_description(s->watchdog_event_source, "service-watchdog");
+
+                /* Run at idle priority, so that everything else that might be a sign of life gets
+                 * processed before we consider the service dead. */
+                r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE);
+        } else {
+                /* The timer exists already: move its expiry and enable it again */
+                r = sd_event_source_set_time(s->watchdog_event_source, deadline);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to reset watchdog timer: %m");
+                        return;
+                }
+
+                r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ONESHOT);
+        }
+
+        if (r < 0)
+                log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m");
+}
+
+/* Calculates the delay to apply before the next automatic restart.
+ *
+ * Returns s->restart_usec unless stepped restart delays are in effect, i.e. unless restart_steps,
+ * restart_usec and a finite restart_max_delay_usec larger than restart_usec are all set and this is
+ * not the first restart. With stepping the delay grows geometrically from restart_usec to
+ * restart_max_delay_usec over restart_steps restarts, and stays at the maximum afterwards. */
+usec_t service_restart_usec_next(Service *s) {
+        long double ratio, exponent;
+        unsigned next;
+
+        assert(s);
+
+        /* s->n_restarts is only bumped once a restart job is enqueued, i.e. once the state has become
+         * SERVICE_AUTO_RESTART_QUEUED. In SERVICE_*_BEFORE_AUTO_RESTART and SERVICE_AUTO_RESTART we
+         * hence have to add 1 ourselves. */
+        next = s->n_restarts;
+        if (s->state != SERVICE_AUTO_RESTART_QUEUED)
+                next++;
+
+        if (next <= 1 ||
+            s->restart_steps == 0 ||
+            s->restart_usec == 0 ||
+            s->restart_max_delay_usec == USEC_INFINITY ||
+            s->restart_usec >= s->restart_max_delay_usec)
+                return s->restart_usec;
+
+        if (next > s->restart_steps)
+                return s->restart_max_delay_usec;
+
+        /* Enforced in service_verify() and above */
+        assert(s->restart_max_delay_usec > s->restart_usec);
+
+        /* r_i / r_0 = (r_n / r_0) ^ (i / n)
+         * where,
+         *   r_0 : initial restart usec (s->restart_usec),
+         *   r_i : i-th restart usec (value),
+         *   r_n : maximum restart usec (s->restart_max_delay_usec),
+         *   i : index of the next step (next - 1)
+         *   n : num maximum steps (s->restart_steps) */
+        ratio = (long double) s->restart_max_delay_usec / s->restart_usec;
+        exponent = (long double) (next - 1) / s->restart_steps;
+
+        return (usec_t) (s->restart_usec * powl(ratio, exponent));
+}
+
+/* Pushes the expiry of the timer event source 'source' out to 'extended' (an absolute time), unless the
+ * timer already expires at that time or later. A NULL 'source' is silently ignored. Failures are logged
+ * as warnings against the unit and otherwise ignored. */
+static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) {
+        /* Initialized so that strna() gets a well-defined value even if no description can be retrieved. */
+        const char *desc = NULL;
+        usec_t current;
+        int r;
+
+        assert(s);
+
+        if (!source)
+                return;
+
+        r = sd_event_source_get_time(source, &current);
+        if (r < 0) {
+                /* Describe the source we actually operated on, which is not necessarily the main timer. */
+                (void) sd_event_source_get_description(source, &desc);
+                log_unit_warning_errno(UNIT(s), r, "Failed to retrieve timeout time for event source '%s', ignoring: %m", strna(desc));
+                return;
+        }
+
+        if (current >= extended) /* Current timeout is already longer, ignore this. */
+                return;
+
+        r = sd_event_source_set_time(source, extended);
+        if (r < 0) {
+                (void) sd_event_source_get_description(source, &desc);
+                log_unit_warning_errno(UNIT(s), r, "Failed to set timeout time for event source '%s', ignoring: %m", strna(desc));
+        }
+}
+
+/* Extends both the service's main timer and its watchdog timer so that neither fires before "now +
+ * extend_timeout_usec" on the monotonic clock. Does nothing if timestamp_is_set() reports the requested
+ * extension as unset. */
+static void service_extend_timeout(Service *s, usec_t extend_timeout_usec) {
+        usec_t deadline;
+
+        assert(s);
+
+        if (timestamp_is_set(extend_timeout_usec)) {
+                /* Turn the relative extension into an absolute point in time. */
+                deadline = usec_add(now(CLOCK_MONOTONIC), extend_timeout_usec);
+
+                service_extend_event_source_timeout(s, s->timer_event_source, deadline);
+                service_extend_event_source_timeout(s, s->watchdog_event_source, deadline);
+        }
+}
+
+/* Restarts the watchdog interval: stamps the current time as the latest watchdog timestamp and then
+ * (re)starts the watchdog timer. */
+static void service_reset_watchdog(Service *s) {
+        assert(s);
+
+        /* Take the new reference timestamp first ... */
+        dual_timestamp_now(&s->watchdog_timestamp);
+
+        /* ... and only then start the watchdog again. */
+        service_start_watchdog(s);
+}
+
+/* Installs a runtime override for the watchdog timeout and restarts the watchdog. Both the configured
+ * and the overriding timeout are logged at debug level afterwards. */
+static void service_override_watchdog_timeout(Service *s, usec_t watchdog_override_usec) {
+        assert(s);
+
+        /* Record the override before resetting the watchdog (service_reset_watchdog() presumably
+         * consults these fields when re-arming the timer). */
+        s->watchdog_override_usec = watchdog_override_usec;
+        s->watchdog_override_enable = true;
+
+        service_reset_watchdog(s);
+
+        log_unit_debug(UNIT(s), "watchdog_usec="USEC_FMT, s->watchdog_usec);
+        log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec);
+}
+
+/* Destroys a single fd store entry: detaches it from its owning service (if it has one), drops its event
+ * source, frees its name, closes the fd asynchronously and releases the entry itself. Accepts NULL.
+ * Always returns NULL so that callers can reset their pointer in one go. */
+static ServiceFDStore* service_fd_store_unlink(ServiceFDStore *fs) {
+        Service *owner;
+
+        if (!fs)
+                return NULL;
+
+        /* Entries that were never linked into a service have no list membership to undo. */
+        owner = fs->service;
+        if (owner) {
+                assert(owner->n_fd_store > 0);
+                LIST_REMOVE(fd_store, owner->fd_store, fs);
+                owner->n_fd_store--;
+        }
+
+        sd_event_source_disable_unref(fs->event_source);
+        free(fs->fdname);
+        asynchronous_close(fs->fd);
+
+        return mfree(fs);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(ServiceFDStore*, service_fd_store_unlink);
+
+/* Drops every entry from the service's fd store. Does nothing (and logs nothing) if the store is
+ * already empty. */
+static void service_release_fd_store(Service *s) {
+        assert(s);
+
+        if (s->fd_store) {
+                log_unit_debug(UNIT(s), "Releasing all stored fds");
+
+                /* Unlinking the head entry advances s->fd_store, hence keep going until it is NULL. */
+                do
+                        service_fd_store_unlink(s->fd_store);
+                while (s->fd_store);
+
+                assert(s->n_fd_store == 0);
+        }
+}
+
+/* Closes (asynchronously) the stdin/stdout/stderr file descriptors held by the service and stores the
+ * value returned by asynchronous_close() back into each field. Returns early, without logging, if none
+ * of the three descriptors is currently open. */
+static void service_release_stdio_fd(Service *s) {
+        assert(s);
+
+        /* All three descriptors must be checked: bail out only if none of them is set. */
+        if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0)
+                return;
+
+        log_unit_debug(UNIT(s), "Releasing stdin/stdout/stderr file descriptors.");
+
+        s->stdin_fd = asynchronous_close(s->stdin_fd);
+        s->stdout_fd = asynchronous_close(s->stdout_fd);
+        s->stderr_fd = asynchronous_close(s->stderr_fd);
+}
+/* Releases everything the Service object behind 'u' owns: configuration strings, exec commands and
+ * runtime, exit status sets, PID/PID-file/bus-name watches, event sources, the bus slot and all stored
+ * file descriptors. Pointer fields are reset to NULL after being freed. */
+static void service_done(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        open_file_free_many(&s->open_files);
+
+        s->pid_file = mfree(s->pid_file);
+        s->status_text = mfree(s->status_text);
+
+        s->exec_runtime = exec_runtime_free(s->exec_runtime);
+        exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+        /* The command array was just freed, hence clear the remaining pointers into it
+         * (presumably these point at entries of s->exec_command — verify against the setters). */
+        s->control_command = NULL;
+        s->main_command = NULL;
+
+        exit_status_set_free(&s->restart_prevent_status);
+        exit_status_set_free(&s->restart_force_status);
+        exit_status_set_free(&s->success_status);
+
+        /* This will leak a process, but at least no memory or any of our resources */
+        service_unwatch_main_pid(s);
+        service_unwatch_control_pid(s);
+        service_unwatch_pid_file(s);
+
+        /* Stop watching the bus name before the string holding it is freed. */
+        if (s->bus_name)  {
+                unit_unwatch_bus_name(u, s->bus_name);
+                s->bus_name = mfree(s->bus_name);
+        }
+
+        s->bus_name_owner = mfree(s->bus_name_owner);
+
+        s->usb_function_descriptors = mfree(s->usb_function_descriptors);
+        s->usb_function_strings = mfree(s->usb_function_strings);
+
+        service_stop_watchdog(s);
+
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+        s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+        s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+        /* Finally, let go of every file descriptor we still hold on behalf of the service. */
+        service_release_socket_fd(s);
+        service_release_stdio_fd(s);
+        service_release_fd_store(s);
+}
+
+/* I/O event callback for fds kept in the fd store; 'userdata' is the ServiceFDStore entry. Whenever it
+ * fires the entry is logged and dropped from the store. Always returns 0. */
+static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+        ServiceFDStore *entry = ASSERT_PTR(userdata);
+        const char *condition;
+
+        assert(e);
+
+        /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store.
+         * Anything that is not EPOLLERR is reported as EPOLLHUP. */
+        condition = (revents & EPOLLERR) ? "EPOLLERR" : "EPOLLHUP";
+
+        log_unit_debug(UNIT(entry->service),
+                       "Received %s on stored fd %d (%s), closing.",
+                       condition,
+                       entry->fd, strna(entry->fdname));
+
+        service_fd_store_unlink(entry);
+        return 0;
+}
+
+/* Adds the file descriptor 'fd_in' to the fd store of service 's' under 'name' ("stored" is used when
+ * 'name' is NULL). If 'do_poll' is true an I/O event source is attached to the fd, with on_fd_store_io()
+ * as its callback.
+ *
+ * Returns 1 if the fd was newly stored, 0 if an equivalent fd (per same_fd()) is already in the store,
+ * and a negative errno-style value on failure; -EXFULL indicates that the store is full. */
+static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do_poll) {
+        /* Both cleanup handlers fire on every early return: 'fs' is torn down via
+         * service_fd_store_unlink() and 'fd' is closed asynchronously, unless ownership has been
+         * passed on with TAKE_PTR()/TAKE_FD() below. */
+        _cleanup_(service_fd_store_unlinkp) ServiceFDStore *fs = NULL;
+        _cleanup_(asynchronous_closep) int fd = ASSERT_FD(fd_in);
+        struct stat st;
+        int r;
+
+        /* fd is always consumed even if the function fails. */
+
+        assert(s);
+
+        /* The stat data is only used for the debug message below. */
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64, DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino);
+
+        if (s->n_fd_store >= s->n_fd_store_max)
+                /* Our store is full.  Use this errno rather than E[NM]FILE to distinguish from the case
+                 * where systemd itself hits the file limit. */
+                return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(EXFULL), "Hit fd store limit.");
+
+        /* Refuse to store the same fd twice. */
+        LIST_FOREACH(fd_store, i, s->fd_store) {
+                r = same_fd(i->fd, fd);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        log_unit_debug(UNIT(s), "Suppressing duplicate fd %i in fd store.", fd);
+                        return 0; /* fd already included */
+                }
+        }
+
+        fs = new(ServiceFDStore, 1);
+        if (!fs)
+                return -ENOMEM;
+
+        /* From here on the new entry owns the fd. .service is left unset (NULL) for now, so that an
+         * error path unlinking 'fs' does not touch the service's list or counter. */
+        *fs = (ServiceFDStore) {
+                .fd = TAKE_FD(fd),
+                .do_poll = do_poll,
+                .fdname = strdup(name ?: "stored"),
+        };
+
+        if (!fs->fdname)
+                return -ENOMEM;
+
+        if (do_poll) {
+                r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fs->fd, 0, on_fd_store_io, fs);
+                if (r < 0 && r != -EPERM) /* EPERM indicates fds that aren't pollable, which is OK */
+                        return r;
+                else if (r >= 0)
+                        (void) sd_event_source_set_description(fs->event_source, "service-fd-store");
+        }
+
+        /* Everything succeeded: link the entry into the service and account for it. */
+        fs->service = s;
+        LIST_PREPEND(fd_store, s->fd_store, fs);
+        s->n_fd_store++;
+
+        log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname);
+
+        /* The list owns the entry now, disarm the cleanup handler. */
+        TAKE_PTR(fs);
+        return 1; /* fd newly stored */
+}
+
+/* Moves all file descriptors out of 'fds' and into the fd store of 's', all under the same 'name'.
+ * Stops at the first failure and returns the (logged) negative error; fds still left in 'fds' at that
+ * point are not touched here. Returns 0 once the set has been drained. */
+static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll) {
+        int fd, r;
+
+        assert(s);
+
+        /* fdset_steal_first() hands out one fd per call; a negative value ends the loop. */
+        while ((fd = fdset_steal_first(fds)) >= 0) {
+
+                r = service_add_fd_store(s, fd, name, do_poll);
+                if (r >= 0)
+                        continue;
+
+                /* A full store is an expected condition, hence only warn about it. */
+                if (r == -EXFULL)
+                        return log_unit_warning_errno(UNIT(s), r,
+                                                      "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.",
+                                                      s->n_fd_store_max);
+
+                return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m");
+        }
+
+        return 0;
+}
+
+/* Removes (and thereby closes) every fd store entry of 's' whose name equals 'name'. */
+static void service_remove_fd_store(Service *s, const char *name) {
+        assert(s);
+        assert(name);
+
+        LIST_FOREACH(fd_store, entry, s->fd_store)
+                if (streq(entry->fdname, name)) {
+                        log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", entry->fd, name);
+                        service_fd_store_unlink(entry);
+                }
+}
+
+/* Calculates the point in time at which the runtime limit of the service is reached: the monotonic
+ * timestamp at which the unit became active, plus s->runtime_max_usec, plus a random extra delay drawn
+ * via random_u64_range(s->runtime_rand_extra_usec) if that setting is non-zero. */
+static usec_t service_running_timeout(Service *s) {
+        usec_t jitter, deadline;
+
+        assert(s);
+
+        if (s->runtime_rand_extra_usec == 0)
+                jitter = 0;
+        else {
+                jitter = random_u64_range(s->runtime_rand_extra_usec);
+                log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(jitter, USEC_PER_SEC));
+        }
+
+        deadline = usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec);
+
+        return usec_add(deadline, jitter);
+}
+
+/* Arms the service's main timer event source via unit_arm_timer(), with service_dispatch_timer() as
+ * the handler. 'relative' and 'usec' are passed through unchanged; the result of unit_arm_timer() is
+ * returned as is. */
+static int service_arm_timer(Service *s, bool relative, usec_t usec) {
+        Unit *u;
+
+        assert(s);
+
+        u = UNIT(s);
+
+        return unit_arm_timer(u, &s->timer_event_source, relative, usec, service_dispatch_timer);
+}
+
+/* Validates the configuration of a service whose unit is in the UNIT_LOADED state.
+ *
+ * Hard errors are logged and reported as -ENOEXEC. Questionable but tolerable combinations only produce
+ * a warning. As the one exception to being read-only, RestartSec= is clamped down to RestartMaxDelaySec=
+ * if it exceeds it.
+ *
+ * Returns 0 if the service is acceptable, a negative errno-style value otherwise. */
+static int service_verify(Service *s) {
+        assert(s);
+        assert(UNIT(s)->load_state == UNIT_LOADED);
+
+        /* Check every command line of every Exec*= setting. */
+        for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++)
+                LIST_FOREACH(command, command, s->exec_command[c]) {
+                        /* The arguments must follow the order of the format string: setting name first,
+                         * binary path second. */
+                        if (!path_is_absolute(command->path) && !filename_is_valid(command->path))
+                                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC),
+                                                            "Service %s= binary path \"%s\" is neither a valid executable name nor an absolute path. Refusing.",
+                                                            service_exec_command_to_string(c),
+                                                            command->path);
+                        if (strv_isempty(command->argv))
+                                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC),
+                                                            "Service has an empty argv in %s=. Refusing.",
+                                                            service_exec_command_to_string(c));
+                }
+
+        if (!s->exec_command[SERVICE_EXEC_START] && !s->exec_command[SERVICE_EXEC_STOP] &&
+            UNIT(s)->success_action == EMERGENCY_ACTION_NONE)
+                /* FailureAction= only makes sense if one of the start or stop commands is specified.
+                 * SuccessAction= will be executed unconditionally if no commands are specified. Hence,
+                 * either a command or SuccessAction= are required. */
+
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart=, ExecStop=, or SuccessAction=. Refusing.");
+
+        if (s->type != SERVICE_ONESHOT && !s->exec_command[SERVICE_EXEC_START])
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+
+        if (!s->remain_after_exit && !s->exec_command[SERVICE_EXEC_START] && UNIT(s)->success_action == EMERGENCY_ACTION_NONE)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= and no SuccessAction= settings and does not have RemainAfterExit=yes set. Refusing.");
+
+        /* Dereferencing ExecStart= is safe here: non-oneshot services without it were refused above. */
+        if (s->type != SERVICE_ONESHOT && s->exec_command[SERVICE_EXEC_START]->command_next)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+
+        if (s->type == SERVICE_ONESHOT && IN_SET(s->restart, SERVICE_RESTART_ALWAYS, SERVICE_RESTART_ON_SUCCESS))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing.");
+
+        if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has RestartForceExitStatus= set, which isn't allowed for Type=oneshot services. Refusing.");
+
+        if (s->type == SERVICE_ONESHOT && s->exit_type == SERVICE_EXIT_CGROUP)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has ExitType=cgroup set, which isn't allowed for Type=oneshot services. Refusing.");
+
+        if (s->type == SERVICE_DBUS && !s->bus_name)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
+
+        if (s->exec_context.pam_name && !IN_SET(s->kill_context.kill_mode, KILL_CONTROL_GROUP, KILL_MIXED))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing.");
+
+        /* Everything below merely warns, the service is loaded regardless. */
+
+        if (s->usb_function_descriptors && !s->usb_function_strings)
+                log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
+
+        if (!s->usb_function_descriptors && s->usb_function_strings)
+                log_unit_warning(UNIT(s), "Service has USBFunctionStrings= setting, but no USBFunctionDescriptors=. Ignoring.");
+
+        if (s->runtime_max_usec != USEC_INFINITY && s->type == SERVICE_ONESHOT)
+                log_unit_warning(UNIT(s), "RuntimeMaxSec= has no effect in combination with Type=oneshot. Ignoring.");
+
+        if (s->runtime_max_usec == USEC_INFINITY && s->runtime_rand_extra_usec != 0)
+                log_unit_warning(UNIT(s), "Service has RuntimeRandomizedExtraSec= setting, but no RuntimeMaxSec=. Ignoring.");
+
+        if (s->exit_type == SERVICE_EXIT_CGROUP && cg_unified() < CGROUP_UNIFIED_SYSTEMD)
+                log_unit_warning(UNIT(s), "Service has ExitType=cgroup set, but we are running with legacy cgroups v1, which might not work correctly. Continuing.");
+
+        if (s->restart_max_delay_usec == USEC_INFINITY && s->restart_steps > 0)
+                log_unit_warning(UNIT(s), "Service has RestartSteps= but no RestartMaxDelaySec= setting. Ignoring.");
+
+        if (s->restart_max_delay_usec != USEC_INFINITY && s->restart_steps == 0)
+                log_unit_warning(UNIT(s), "Service has RestartMaxDelaySec= but no RestartSteps= setting. Ignoring.");
+
+        /* Make sure the base delay never exceeds the maximum delay. */
+        if (s->restart_max_delay_usec < s->restart_usec) {
+                log_unit_warning(UNIT(s), "RestartMaxDelaySec= has a value smaller than RestartSec=, resetting RestartSec= to RestartMaxDelaySec=.");
+                s->restart_usec = s->restart_max_delay_usec;
+        }
+
+        return 0;
+}
+
+/* Adds the automatic dependencies most services want, unless the unit has default dependencies turned
+ * off. Returns 0 or a positive value on success (whatever the last dependency helper returned), or a
+ * negative errno-style value on failure. */
+static int service_add_default_dependencies(Service *s) {
+        Unit *u;
+        int r;
+
+        assert(s);
+
+        u = UNIT(s);
+
+        if (!u->default_dependencies)
+                return 0;
+
+        if (MANAGER_IS_SYSTEM(u->manager))
+                /* System instance: pull in the really early boot stuff and require it, so that we
+                 * fail if we can't acquire it. */
+                r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        else
+                /* The --user instance has no sysinit.target, hence require basic.target instead. */
+                r = unit_add_dependency_by_name(u, UNIT_REQUIRES, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        /* If the rest of the base system is part of the same transaction, order us after it, but
+         * neither pull it in nor require it. */
+        r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        /* Lastly, hook us into the regular shutdown logic. */
+        return unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* Adjusts the stdin/stdout/stderr settings of the service's execution context where the defaults would
+ * not make sense in combination with the other settings. */
+static void service_fix_stdio(Service *s) {
+        bool bidirectional_input;
+
+        assert(s);
+
+        /* EXEC_INPUT_NULL and EXEC_OUTPUT_INHERIT are special here: each is both the default value
+         * (which other settings may override automatically) and something the user may have picked
+         * explicitly. These two cases are currently not told apart. */
+
+        /* Stdin data was configured but no input mode chosen: feed the data. */
+        if (s->exec_context.stdin_data_size > 0 &&
+            s->exec_context.std_input == EXEC_INPUT_NULL)
+                s->exec_context.std_input = EXEC_INPUT_DATA;
+
+        /* The inputs listed here are assumed to be bidirectional streams. Duplicating them from stdin
+         * to stdout/stderr makes sense, and so does leaving EXEC_OUTPUT_INHERIT in place. */
+        bidirectional_input = IN_SET(s->exec_context.std_input,
+                                     EXEC_INPUT_TTY,
+                                     EXEC_INPUT_TTY_FORCE,
+                                     EXEC_INPUT_TTY_FAIL,
+                                     EXEC_INPUT_SOCKET,
+                                     EXEC_INPUT_NAMED_FD);
+        if (bidirectional_input)
+                return;
+
+        /* Other inputs, such as regular files or sealed data memfds, should not be used for input and
+         * output at the same time (that would cause a feedback loop). Hence replace
+         * EXEC_OUTPUT_INHERIT by the manager's default stdout/stderr settings. Nothing to do if
+         * stdout is not set to inherit in the first place. */
+        if (s->exec_context.std_output != EXEC_OUTPUT_INHERIT)
+                return;
+
+        /* stderr is only touched if it inherits, too. */
+        if (s->exec_context.std_error == EXEC_OUTPUT_INHERIT)
+                s->exec_context.std_error = UNIT(s)->manager->defaults.std_error;
+
+        s->exec_context.std_output = UNIT(s)->manager->defaults.std_output;
+}
+
+/* If the service has a bus name configured, starts watching that name. Services of Type=dbus
+ * additionally get a requirement and an ordering dependency on the D-Bus socket unit. Returns 0 on
+ * success (including when no bus name is set), or a logged negative errno-style value. */
+static int service_setup_bus_name(Service *s) {
+        int r;
+
+        assert(s);
+
+        /* Nothing to set up without a bus name. Should the service type need one, service_verify()
+         * will refuse the unit later. */
+        if (!s->bus_name)
+                return 0;
+
+        if (s->type == SERVICE_DBUS) {
+                r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+
+                /* Always order us after dbus.socket if both are part of the transaction. */
+                r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+        }
+
+        r = unit_watch_bus_name(UNIT(s), s->bus_name);
+        if (r >= 0)
+                return 0;
+
+        /* A clash with another unit gets a more specific message. */
+        if (r == -EEXIST)
+                return log_unit_error_errno(UNIT(s), r, "Two services allocated for the same bus name %s, refusing operation.", s->bus_name);
+
+        return log_unit_error_errno(UNIT(s), r, "Cannot watch bus name %s: %m", s->bus_name);
+}
+
+/* Fills in everything that is derived from the loaded configuration rather than read from it: an
+ * automatically chosen service type, timeout/stdio/notify/OOM defaults, the default slice as well as
+ * implicit and default dependencies. Returns 0 on success, a negative errno-style value on failure. */
+static int service_add_extras(Service *s) {
+        bool wants_notify;
+        int r;
+
+        assert(s);
+
+        /* No type configured? Then derive one: a bus name implies dbus, otherwise an ExecStart=
+         * implies simple, and anything else is a oneshot. */
+        if (s->type == _SERVICE_TYPE_INVALID)
+                s->type = s->bus_name ? SERVICE_DBUS :
+                          s->exec_command[SERVICE_EXEC_START] ? SERVICE_SIMPLE :
+                                                                SERVICE_ONESHOT;
+
+        /* For oneshot services the start timeout is off, unless one was configured explicitly. */
+        if (!s->start_timeout_defined && s->type == SERVICE_ONESHOT)
+                s->timeout_start_usec = USEC_INFINITY;
+
+        service_fix_stdio(s);
+
+        r = unit_patch_contexts(UNIT(s));
+        if (r < 0)
+                return r;
+
+        r = unit_add_exec_dependencies(UNIT(s), &s->exec_context);
+        if (r < 0)
+                return r;
+
+        r = unit_set_default_slice(UNIT(s));
+        if (r < 0)
+                return r;
+
+        /* Notify-type services, watchdog users and fd store users all need the notify socket, hence
+         * turn it on for them if it was left disabled. */
+        wants_notify = IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) ||
+                       s->watchdog_usec > 0 ||
+                       s->n_fd_store_max > 0;
+        if (wants_notify && s->notify_access == NOTIFY_NONE)
+                s->notify_access = NOTIFY_MAIN;
+
+        /* Without an explicit OOM policy fall back to the manager's default. With delegation enabled
+         * we assume the payload knows better what to do and can handle things in a more focused way,
+         * hence just continue in that case. */
+        if (s->oom_policy < 0) {
+                if (s->cgroup_context.delegate)
+                        s->oom_policy = OOM_CONTINUE;
+                else
+                        s->oom_policy = UNIT(s)->manager->defaults.oom_policy;
+        }
+
+        /* If killing was requested, have the kernel do it. */
+        s->cgroup_context.memory_oom_group = (s->oom_policy == OOM_KILL);
+
+        r = service_add_default_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = service_setup_bus_name(s);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Loads the unit's fragment and drop-ins. If that leaves the unit in the UNIT_LOADED state, the derived
+ * settings are added and the result is verified. Returns 0 if there is nothing further to do, otherwise
+ * the result of the first failing step or of service_verify(). */
+static int service_load(Unit *u) {
+        Service *s = SERVICE(u);
+        int r;
+
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+
+        if (u->load_state == UNIT_LOADED) {
+                /* A freshly loaded unit: add in the extras, then check the whole thing. */
+                r = service_add_extras(s);
+                if (r < 0)
+                        return r;
+
+                return service_verify(s);
+        }
+
+        return 0;
+}
+
+/* Writes one line per fd store entry of 's' to 'f', each starting with 'prefix'. Only the first entry
+ * of the list carries the heading, the others are padded with blanks instead. Entries whose fd cannot
+ * be stat'ed or whose flags cannot be queried are skipped with a debug message. */
+static void service_dump_fdstore(Service *s, FILE *f, const char *prefix) {
+        assert(s);
+        assert(f);
+        assert(prefix);
+
+        LIST_FOREACH(fd_store, entry, s->fd_store) {
+                _cleanup_free_ char *resolved = NULL;
+                const char *heading;
+                struct stat st;
+                int fl;
+
+                if (fstat(entry->fd, &st) < 0) {
+                        log_debug_errno(errno, "Failed to stat fdstore entry: %m");
+                        continue;
+                }
+
+                fl = fcntl(entry->fd, F_GETFL);
+                if (fl < 0) {
+                        log_debug_errno(errno, "Failed to get fdstore entry flags: %m");
+                        continue;
+                }
+
+                /* Best effort only: if this fails the path is shown through strna(). */
+                (void) fd_get_path(entry->fd, &resolved);
+
+                heading = entry == s->fd_store ? "File Descriptor Store Entry:" : "                            ";
+
+                fprintf(f,
+                        "%s%s '%s' (type=%s; dev=" DEVNUM_FORMAT_STR "; inode=%" PRIu64 "; rdev=" DEVNUM_FORMAT_STR "; path=%s; access=%s)\n",
+                        prefix, heading,
+                        entry->fdname,
+                        inode_type_to_string(st.st_mode),
+                        DEVNUM_FORMAT_VAL(st.st_dev),
+                        (uint64_t) st.st_ino,
+                        DEVNUM_FORMAT_VAL(st.st_rdev),
+                        strna(resolved),
+                        accmode_to_string(fl));
+        }
+}
+
+/* Writes a human-readable dump of the service's state and configuration to 'f'. Every line starts with
+ * 'prefix' (a NULL prefix is treated as the empty string); the command lists are handed 'prefix'
+ * followed by a tab instead. */
+static void service_dump(Unit *u, FILE *f, const char *prefix) {
+        Service *s = SERVICE(u);
+        const char *prefix2;
+
+        assert(s);
+
+        prefix = strempty(prefix);
+        prefix2 = strjoina(prefix, "\t");
+
+        /* General state and the basic settings, always shown. */
+        fprintf(f,
+                "%sService State: %s\n"
+                "%sResult: %s\n"
+                "%sReload Result: %s\n"
+                "%sClean Result: %s\n"
+                "%sPermissionsStartOnly: %s\n"
+                "%sRootDirectoryStartOnly: %s\n"
+                "%sRemainAfterExit: %s\n"
+                "%sGuessMainPID: %s\n"
+                "%sType: %s\n"
+                "%sRestart: %s\n"
+                "%sNotifyAccess: %s\n"
+                "%sNotifyState: %s\n"
+                "%sOOMPolicy: %s\n"
+                "%sReloadSignal: %s\n",
+                prefix, service_state_to_string(s->state),
+                prefix, service_result_to_string(s->result),
+                prefix, service_result_to_string(s->reload_result),
+                prefix, service_result_to_string(s->clean_result),
+                prefix, yes_no(s->permissions_start_only),
+                prefix, yes_no(s->root_directory_start_only),
+                prefix, yes_no(s->remain_after_exit),
+                prefix, yes_no(s->guess_main_pid),
+                prefix, service_type_to_string(s->type),
+                prefix, service_restart_to_string(s->restart),
+                prefix, notify_access_to_string(service_get_notify_access(s)),
+                prefix, notify_state_to_string(s->notify_state),
+                prefix, oom_policy_to_string(s->oom_policy),
+                prefix, signal_to_string(s->reload_signal));
+
+        /* Process, PID file, bus name and socket information, each shown only if set. */
+        if (pidref_is_set(&s->control_pid))
+                fprintf(f,
+                        "%sControl PID: "PID_FMT"\n",
+                        prefix, s->control_pid.pid);
+
+        if (pidref_is_set(&s->main_pid))
+                fprintf(f,
+                        "%sMain PID: "PID_FMT"\n"
+                        "%sMain PID Known: %s\n"
+                        "%sMain PID Alien: %s\n",
+                        prefix, s->main_pid.pid,
+                        prefix, yes_no(s->main_pid_known),
+                        prefix, yes_no(s->main_pid_alien));
+
+        if (s->pid_file)
+                fprintf(f,
+                        "%sPIDFile: %s\n",
+                        prefix, s->pid_file);
+
+        if (s->bus_name)
+                fprintf(f,
+                        "%sBusName: %s\n"
+                        "%sBus Name Good: %s\n",
+                        prefix, s->bus_name,
+                        prefix, yes_no(s->bus_name_good));
+
+        if (UNIT_ISSET(s->accept_socket))
+                fprintf(f,
+                        "%sAccept Socket: %s\n",
+                        prefix, UNIT_DEREF(s->accept_socket)->id);
+
+        /* Restart and timeout settings. */
+        fprintf(f,
+                "%sRestartSec: %s\n"
+                "%sRestartSteps: %u\n"
+                "%sRestartMaxDelaySec: %s\n"
+                "%sTimeoutStartSec: %s\n"
+                "%sTimeoutStopSec: %s\n"
+                "%sTimeoutStartFailureMode: %s\n"
+                "%sTimeoutStopFailureMode: %s\n",
+                prefix, FORMAT_TIMESPAN(s->restart_usec, USEC_PER_SEC),
+                prefix, s->restart_steps,
+                prefix, FORMAT_TIMESPAN(s->restart_max_delay_usec, USEC_PER_SEC),
+                prefix, FORMAT_TIMESPAN(s->timeout_start_usec, USEC_PER_SEC),
+                prefix, FORMAT_TIMESPAN(s->timeout_stop_usec, USEC_PER_SEC),
+                prefix, service_timeout_failure_mode_to_string(s->timeout_start_failure_mode),
+                prefix, service_timeout_failure_mode_to_string(s->timeout_stop_failure_mode));
+
+        /* The abort timeout is only shown if it was set. */
+        if (s->timeout_abort_set)
+                fprintf(f,
+                        "%sTimeoutAbortSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->timeout_abort_usec, USEC_PER_SEC));
+
+        fprintf(f,
+                "%sRuntimeMaxSec: %s\n"
+                "%sRuntimeRandomizedExtraSec: %s\n"
+                "%sWatchdogSec: %s\n",
+                prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC),
+                prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
+                prefix, FORMAT_TIMESPAN(s->watchdog_usec, USEC_PER_SEC));
+
+        kill_context_dump(&s->kill_context, f, prefix);
+        exec_context_dump(&s->exec_context, f, prefix);
+
+        /* List the command lines of every Exec*= setting that has at least one command. */
+        for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) {
+                if (!s->exec_command[c])
+                        continue;
+
+                fprintf(f, "%s-> %s:\n",
+                        prefix, service_exec_command_to_string(c));
+
+                exec_command_dump_list(s->exec_command[c], f, prefix2);
+        }
+
+        if (s->status_text)
+                fprintf(f, "%sStatus Text: %s\n",
+                        prefix, s->status_text);
+
+        /* The fd store settings are only shown if the store is enabled, i.e. has a non-zero maximum. */
+        if (s->n_fd_store_max > 0)
+                fprintf(f,
+                        "%sFile Descriptor Store Max: %u\n"
+                        "%sFile Descriptor Store Pin: %s\n"
+                        "%sFile Descriptor Store Current: %zu\n",
+                        prefix, s->n_fd_store_max,
+                        prefix, exec_preserve_mode_to_string(s->fd_store_preserve_mode),
+                        prefix, s->n_fd_store);
+
+        service_dump_fdstore(s, f, prefix);
+
+        if (s->open_files)
+                LIST_FOREACH(open_files, of, s->open_files) {
+                        _cleanup_free_ char *ofs = NULL;
+                        int r;
+
+                        r = open_file_to_string(of, &ofs);
+                        if (r < 0) {
+                                /* Skip entries that cannot be formatted instead of failing the dump. */
+                                log_debug_errno(r,
+                                                "Failed to convert OpenFile= setting to string, ignoring: %m");
+                                continue;
+                        }
+
+                        fprintf(f, "%sOpen File: %s\n", prefix, ofs);
+                }
+
+        cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+/* Checks whether the process referenced by 'pid' may serve as main PID of the service.
+ *
+ * Returns a negative errno-style value if it may not (logged at level 'prio'), 0 if the PID is
+ * questionable but should be accepted if the source of configuration is trusted, and > 0 if the PID is
+ * good. */
+static int service_is_suitable_main_pid(Service *s, PidRef *pid, int prio) {
+        int r;
+
+        assert(s);
+        assert(pidref_is_set(pid));
+
+        /* Neither ourselves nor PID 1 can ever be the main process of a service. */
+        if (pidref_is_self(pid) || pid->pid == 1)
+                return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the manager, refusing.", pid->pid);
+
+        /* The control process cannot be the main process at the same time. */
+        if (pidref_equal(pid, &s->control_pid))
+                return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the control process, refusing.", pid->pid);
+
+        r = pidref_is_alive(pid);
+        if (r < 0)
+                return log_unit_full_errno(UNIT(s), prio, r, "Failed to check if main PID "PID_FMT" exists or is a zombie: %m", pid->pid);
+        if (r == 0)
+                return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(ESRCH), "New main PID "PID_FMT" does not exist or is a zombie.", pid->pid);
+
+        /* A process that does not belong to this very unit is suspicious. Leave it to the caller to
+         * accept it if the configuration source is trusted. */
+        if (manager_get_unit_by_pidref(UNIT(s)->manager, pid) != UNIT(s))
+                return 0;
+
+        log_unit_debug(UNIT(s), "New main PID "PID_FMT" belongs to service, we are happy.", pid->pid);
+        return 1; /* Definitely a good PID */
+}
+
+static int service_load_pid_file(Service *s, bool may_warn) {
+        _cleanup_(pidref_done) PidRef candidate = PIDREF_NULL;
+        _cleanup_free_ char *line = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        bool unsafe_chain = false;
+        int r, prio;
+
+        assert(s);
+
+        /* Reads the PID stored in the configured PID file and, if it passes the suitability checks, makes it
+         * the main PID of the service and starts watching it. Returns > 0 if a new main PID was installed,
+         * 0 if the file names the main PID we already know, and a negative errno-style value otherwise
+         * (-ENOENT if no PID file is configured at all). Most failures are logged at LOG_INFO if 'may_warn'
+         * is true and at LOG_DEBUG otherwise. */
+
+        if (!s->pid_file)
+                return -ENOENT;
+
+        prio = may_warn ? LOG_INFO : LOG_DEBUG;
+
+        /* First try with CHASE_SAFE. If that fails with -ENOLINK retry without the flag, but remember that
+         * the path is not to be trusted. */
+        r = chase(s->pid_file, NULL, CHASE_SAFE, NULL, &fd);
+        if (r == -ENOLINK) {
+                log_unit_debug_errno(UNIT(s), r,
+                                     "Potentially unsafe symlink chain, will now retry with relaxed checks: %s", s->pid_file);
+
+                unsafe_chain = true;
+
+                r = chase(s->pid_file, NULL, 0, NULL, &fd);
+        }
+        if (r < 0)
+                return log_unit_full_errno(UNIT(s), prio, r,
+                                           "Can't open PID file %s (yet?) after %s: %m", s->pid_file, service_state_to_string(s->state));
+
+        /* chase() handed us an O_PATH fd, which cannot be read from directly. Hence read the file through
+         * its /proc/ fd path instead. */
+        r = read_one_line_file(FORMAT_PROC_FD_PATH(fd), &line);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r,
+                                            "Can't convert PID files %s O_PATH file descriptor to proper file descriptor: %m",
+                                            s->pid_file);
+
+        r = pidref_set_pidstr(&candidate, line);
+        if (r < 0)
+                return log_unit_full_errno(UNIT(s), prio, r, "Failed to parse PID from file %s: %m", s->pid_file);
+
+        /* Nothing to do if the file just confirms what we already know */
+        if (s->main_pid_known && pidref_equal(&candidate, &s->main_pid))
+                return 0;
+
+        r = service_is_suitable_main_pid(s, &candidate, prio);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                struct stat st;
+
+                /* The PID is questionable. Never accept it if we got here via an unsafe symlink chain,
+                 * and otherwise only if the PID file is owned by root. */
+
+                if (unsafe_chain)
+                        return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM),
+                                                    "Refusing to accept PID outside of service control group, acquired through unsafe symlink chain: %s", s->pid_file);
+
+                if (fstat(fd, &st) < 0)
+                        return log_unit_error_errno(UNIT(s), errno, "Failed to fstat() PID file O_PATH fd: %m");
+
+                if (st.st_uid != 0)
+                        return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM),
+                                                    "New main PID "PID_FMT" does not belong to service, and PID file is not owned by root. Refusing.", candidate.pid);
+
+                log_unit_debug(UNIT(s), "New main PID "PID_FMT" does not belong to service, but we'll accept it since PID file is owned by root.", candidate.pid);
+        }
+
+        if (!s->main_pid_known)
+                log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, candidate.pid);
+        else {
+                log_unit_debug(UNIT(s), "Main PID changing: "PID_FMT" -> "PID_FMT, s->main_pid.pid, candidate.pid);
+
+                /* Let go of the previous main PID before installing the new one */
+                service_unwatch_main_pid(s);
+                s->main_pid_known = false;
+        }
+
+        r = service_set_main_pidref(s, &candidate);
+        if (r < 0)
+                return r;
+
+        r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+        if (r < 0) /* FIXME: we need to do something here */
+                return log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" for service: %m", s->main_pid.pid);
+
+        return 1;
+}
+
+static void service_search_main_pid(Service *s) {
+        _cleanup_(pidref_done) PidRef guess = PIDREF_NULL;
+        int r;
+
+        assert(s);
+
+        /* Tries to guess the main PID of the service via unit_search_main_pid(), and starts watching it.
+         * Guessing is an unreliable heuristic, hence it is skipped when the main PID is already known, or
+         * when guessing is turned off for this service. */
+        if (s->main_pid_known || !s->guess_main_pid)
+                return;
+
+        assert(!pidref_is_set(&s->main_pid));
+
+        if (unit_search_main_pid(UNIT(s), &guess) < 0)
+                return;
+
+        log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, guess.pid);
+
+        if (service_set_main_pidref(s, &guess) < 0)
+                return;
+
+        /* FIXME: we need to do something here if watching fails, for now we only log about it */
+        r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+        if (r < 0)
+                log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" from: %m", s->main_pid.pid);
+}
+
+/* Switches the service to 'state': records the new state, releases the per-state resources (PID file watch,
+ * timer, watched main/control PIDs, exec_fd event source, watchdog) that are not kept in the new state, and
+ * finally reports the transition via unit_notify(). */
+static void service_set_state(Service *s, ServiceState state) {
+        ServiceState old_state;
+        const UnitActiveState *table;
+
+        assert(s);
+
+        /* Only done on an actual change, and before s->state is overwritten below */
+        if (s->state != state)
+                bus_unit_send_pending_change_signal(UNIT(s), false);
+
+        /* Services of type SERVICE_IDLE use their own translation table to unit active states */
+        table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table;
+
+        old_state = s->state;
+        s->state = state;
+
+        /* The PID file watch is dropped on every call, whatever the new state is */
+        service_unwatch_pid_file(s);
+
+        /* Release the timer event source, unless the new state is one of those listed here */
+        if (!IN_SET(state,
+                    SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING,
+                    SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                    SERVICE_AUTO_RESTART,
+                    SERVICE_CLEANING))
+                s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+        /* Outside of these states, stop watching the main PID and forget the main command */
+        if (!IN_SET(state,
+                    SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING,
+                    SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
+                service_unwatch_main_pid(s);
+                s->main_command = NULL;
+        }
+
+        /* Outside of these states, stop watching the control PID and reset the control command and its id */
+        if (!IN_SET(state,
+                    SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                    SERVICE_CLEANING)) {
+                service_unwatch_control_pid(s);
+                s->control_command = NULL;
+                s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+        }
+
+        /* In the dead, failed and auto-restart states no PID of the unit is watched anymore, and a queued
+         * PID rewatch request is dequeued */
+        if (IN_SET(state,
+                   SERVICE_DEAD, SERVICE_FAILED,
+                   SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED,
+                   SERVICE_DEAD_RESOURCES_PINNED)) {
+                unit_unwatch_all_pids(UNIT(s));
+                unit_dequeue_rewatch_pids(UNIT(s));
+        }
+
+        /* The exec_fd event source is only kept while in SERVICE_START */
+        if (state != SERVICE_START)
+                s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+        /* The watchdog only stays armed in the start-post, running and reload states */
+        if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+                service_stop_watchdog(s);
+
+        /* For the inactive states unit_notify() will trim the cgroup,
+         * but for exit we have to do that ourselves... */
+        if (state == SERVICE_EXITED && !MANAGER_IS_RELOADING(UNIT(s)->manager))
+                unit_prune_cgroup(UNIT(s));
+
+        if (old_state != state)
+                log_unit_debug(UNIT(s), "Changed %s -> %s", service_state_to_string(old_state), service_state_to_string(state));
+
+        /* Report the transition in terms of unit active states. The last argument is true if the stored
+         * reload result is SERVICE_SUCCESS. */
+        unit_notify(UNIT(s), table[old_state], table[state], s->reload_result == SERVICE_SUCCESS);
+}
+
+static usec_t service_coldplug_timeout(Service *s) {
+        ServiceState state;
+        usec_t entered;
+
+        assert(s);
+
+        /* Computes the point in time at which the timeout of the deserialized state elapses. For most states
+         * this is the timestamp of the last state change plus the timeout that applies to the state. States
+         * without a timeout yield USEC_INFINITY. */
+
+        state = s->deserialized_state;
+        entered = UNIT(s)->state_change_timestamp.monotonic;
+
+        if (state == SERVICE_RUNNING)
+                return service_running_timeout(s);
+
+        /* The restart delay is counted from the moment the unit became inactive */
+        if (state == SERVICE_AUTO_RESTART)
+                return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, service_restart_usec_next(s));
+
+        if (IN_SET(state,
+                   SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                   SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+                return usec_add(entered, s->timeout_start_usec);
+
+        if (IN_SET(state,
+                   SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                   SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))
+                return usec_add(entered, s->timeout_stop_usec);
+
+        if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_FINAL_WATCHDOG))
+                return usec_add(entered, service_timeout_abort_usec(s));
+
+        if (state == SERVICE_CLEANING)
+                return usec_add(entered, s->exec_context.timeout_clean_usec);
+
+        return USEC_INFINITY;
+}
+
+/* Brings a service back into the state stored in s->deserialized_state: re-arms the state timer, watches the
+ * main and control PIDs again where the state calls for it, restarts the watchdog, re-acquires the socket
+ * peer for per-connection instances, and finally enters the state. Returns 0 on success, or the negative
+ * value returned by the failing step. */
+static int service_coldplug(Unit *u) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(s);
+        assert(s->state == SERVICE_DEAD);
+
+        /* Nothing to restore if the deserialized state is the state we are in already */
+        if (s->deserialized_state == s->state)
+                return 0;
+
+        /* service_coldplug_timeout() yields an absolute point in time, hence relative=false */
+        r = service_arm_timer(s, /* relative= */ false, service_coldplug_timeout(s));
+        if (r < 0)
+                return r;
+
+        /* Watch the main PID again, but only if it is set, pidref_is_unwaited() reports > 0 for it, and the
+         * deserialized state is one of those listed here */
+        if (pidref_is_set(&s->main_pid) &&
+            pidref_is_unwaited(&s->main_pid) > 0 &&
+            (IN_SET(s->deserialized_state,
+                    SERVICE_START, SERVICE_START_POST,
+                    SERVICE_RUNNING,
+                    SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+                    SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                    SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
+                r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Same for the control PID, with its own list of states */
+        if (pidref_is_set(&s->control_pid) &&
+            pidref_is_unwaited(&s->control_pid) > 0 &&
+            IN_SET(s->deserialized_state,
+                   SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+                   SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+                   SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+                   SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                   SERVICE_CLEANING)) {
+                r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Unless the deserialized state is one of those listed here, queue a rewatch of the unit's PIDs and
+         * set up the exec runtime again. Both are best-effort, their results are ignored. */
+        if (!IN_SET(s->deserialized_state,
+                    SERVICE_DEAD, SERVICE_FAILED,
+                    SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED,
+                    SERVICE_CLEANING,
+                    SERVICE_DEAD_RESOURCES_PINNED)) {
+                (void) unit_enqueue_rewatch_pids(u);
+                (void) unit_setup_exec_runtime(u);
+        }
+
+        /* The same set of states for which service_set_state() keeps the watchdog running */
+        if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+                service_start_watchdog(s);
+
+        if (UNIT_ISSET(s->accept_socket)) {
+                Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket));
+
+                /* Only relevant if the socket limits the number of connections per source */
+                if (socket->max_connections_per_source > 0) {
+                        SocketPeer *peer;
+
+                        /* Make a best-effort attempt at bumping the connection count */
+                        if (socket_acquire_peer(socket, s->socket_fd, &peer) > 0) {
+                                socket_peer_unref(s->socket_peer);
+                                s->socket_peer = peer;
+                        }
+                }
+        }
+
+        /* Everything is in place, now actually enter the deserialized state */
+        service_set_state(s, s->deserialized_state);
+        return 0;
+}
+
+static int service_collect_fds(
+                Service *s,
+                int **fds,
+                char ***fd_names,
+                size_t *n_socket_fds,
+                size_t *n_storage_fds) {
+
+        _cleanup_strv_free_ char **names = NULL;
+        _cleanup_free_ int *array = NULL;
+        size_t n_sockets = 0, n_stored = 0;
+        int r;
+
+        assert(s);
+        assert(fds);
+        assert(fd_names);
+        assert(n_socket_fds);
+        assert(n_storage_fds);
+
+        /* Builds the list of file descriptors to pass to the service, along with a parallel list of their
+         * names. The socket fds come first (either the per-connection socket, or the fds of all sockets
+         * triggering this service), the fds from the fd store follow. On success the two arrays and the two
+         * counts are returned via the output parameters and 0 is returned. On failure a negative
+         * errno-style value is returned and the output parameters are left untouched. */
+
+        if (s->socket_fd >= 0) {
+
+                /* Per-connection instance: pass just the connection socket */
+
+                array = newdup(int, &s->socket_fd, 1);
+                if (!array)
+                        return -ENOMEM;
+
+                names = strv_new("connection");
+                if (!names)
+                        return -ENOMEM;
+
+                n_sockets = 1;
+        } else {
+                Unit *trigger;
+
+                /* Singleton service: pass the fds of all socket units that trigger us */
+
+                UNIT_FOREACH_DEPENDENCY(trigger, UNIT(s), UNIT_ATOM_TRIGGERED_BY) {
+                        _cleanup_free_ int *sock_fds = NULL;
+                        Socket *sock;
+                        int n;
+
+                        if (trigger->type != UNIT_SOCKET)
+                                continue;
+
+                        sock = SOCKET(trigger);
+
+                        n = socket_collect_fds(sock, &sock_fds);
+                        if (n < 0)
+                                return n;
+                        if (n == 0)
+                                continue;
+
+                        if (array) {
+                                int *grown;
+
+                                /* We have fds already, append the new ones */
+                                grown = reallocarray(array, n_sockets + n, sizeof(int));
+                                if (!grown)
+                                        return -ENOMEM;
+
+                                memcpy(grown + n_sockets, sock_fds, n * sizeof(int));
+
+                                array = grown;
+                                n_sockets += n;
+                        } else {
+                                /* First batch: simply take over the array we got */
+                                array = TAKE_PTR(sock_fds);
+                                n_sockets = n;
+                        }
+
+                        /* Every fd of this socket unit gets the same name */
+                        r = strv_extend_n(&names, socket_fdname(sock), n);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (s->n_fd_store > 0) {
+                size_t idx = n_sockets;
+                char **grown_names;
+                int *grown;
+
+                /* Make room for the fd store entries behind the socket fds. The name list additionally
+                 * needs a slot for the terminating NULL. */
+
+                grown = reallocarray(array, n_sockets + s->n_fd_store, sizeof(int));
+                if (!grown)
+                        return -ENOMEM;
+
+                array = grown;
+
+                grown_names = reallocarray(names, n_sockets + s->n_fd_store + 1, sizeof(char *));
+                if (!grown_names)
+                        return -ENOMEM;
+
+                names = grown_names;
+
+                LIST_FOREACH(fd_store, fs, s->fd_store) {
+                        array[idx] = fs->fd;
+                        names[idx] = strdup(strempty(fs->fdname));
+                        if (!names[idx])
+                                return -ENOMEM;
+
+                        idx++;
+                        n_stored++;
+                }
+
+                names[idx] = NULL;
+        }
+
+        *fds = TAKE_PTR(array);
+        *fd_names = TAKE_PTR(names);
+        *n_socket_fds = n_sockets;
+        *n_storage_fds = n_stored;
+
+        return 0;
+}
+
+static int service_allocate_exec_fd_event_source(
+                Service *s,
+                int fd,
+                sd_event_source **ret_event_source) {
+
+        _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL;
+        int r;
+
+        assert(s);
+        assert(fd >= 0);
+        assert(ret_event_source);
+
+        /* Creates an IO event source for 'fd' on the manager's event loop, dispatching to
+         * service_dispatch_exec_io(). On success the event source owns the fd and is returned in
+         * *ret_event_source. On failure the error is logged and a negative errno-style value is returned. */
+
+        r = sd_event_add_io(UNIT(s)->manager->event, &es, fd, 0, service_dispatch_exec_io, s);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m");
+
+        /* Run this at a slightly lower priority than SIGCHLD, since the latter carries a lot more
+         * interesting failure information */
+        r = sd_event_source_set_priority(es, SD_EVENT_PRIORITY_NORMAL-3);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m");
+
+        (void) sd_event_source_set_description(es, "service exec_fd");
+
+        /* From here on the event source is in charge of closing the fd */
+        r = sd_event_source_set_io_fd_own(es, true);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m");
+
+        *ret_event_source = TAKE_PTR(es);
+        return 0;
+}
+
+static int service_allocate_exec_fd(
+                Service *s,
+                sd_event_source **ret_event_source,
+                int *ret_exec_fd) {
+
+        _cleanup_close_pair_ int pipe_fds[] = EBADF_PAIR;
+        int r;
+
+        assert(s);
+        assert(ret_event_source);
+        assert(ret_exec_fd);
+
+        /* Allocates the exec_fd pipe. The first fd of the pair is wrapped in an event source, returned in
+         * *ret_event_source, the second one is returned in *ret_exec_fd. */
+
+        if (pipe2(pipe_fds, O_CLOEXEC|O_NONBLOCK) < 0)
+                return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m");
+
+        r = service_allocate_exec_fd_event_source(s, pipe_fds[0], ret_event_source);
+        if (r < 0)
+                return r;
+
+        /* The event source owns the first fd now, make sure we don't close it on return. The second fd is
+         * handed to the caller. */
+        TAKE_FD(pipe_fds[0]);
+        *ret_exec_fd = TAKE_FD(pipe_fds[1]);
+
+        return 0;
+}
+
+static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) {
+        assert(s);
+
+        /* Decides whether the process about to be spawned shall get access to the notification socket. This
+         * depends on the kind of process and on the notify access setting of the service:
+         *
+         *     process: \ access:  NONE  MAIN  EXEC   ALL
+         *     main                  no   yes   yes   yes
+         *     control               no    no   yes   yes
+         *     other (forked)        no    no    no   yes */
+
+        /* We only ever spawn main and control processes, hence whatever is not flagged as control process
+         * is a main process. Those are served for any setting but NONE. */
+        if (!(flags & EXEC_IS_CONTROL))
+                return service_get_notify_access(s) != NOTIFY_NONE;
+
+        /* A control process */
+        return IN_SET(service_get_notify_access(s), NOTIFY_EXEC, NOTIFY_ALL);
+}
+
+static Service *service_get_triggering_service(Service *s) {
+        Unit *trigger = NULL, *u;
+
+        assert(s);
+
+        /* Looks up the service that triggered service 's', i.e. the unit found via the dependency types
+         * which include the UNIT_ATOM_ON_{FAILURE,SUCCESS}_OF atoms.
+         *
+         * N.B. the result is only meaningful if there is exactly one such unit. If more than one unit could
+         * have triggered 's' via OnFailure= or OnSuccess= we cannot know whose exit status to propagate,
+         * hence we log about it and return NULL. */
+
+        UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_ON_FAILURE_OF) {
+                if (!trigger) {
+                        trigger = u;
+                        continue;
+                }
+
+                goto multiple;
+        }
+
+        UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_ON_SUCCESS_OF) {
+                if (!trigger) {
+                        trigger = u;
+                        continue;
+                }
+
+                goto multiple;
+        }
+
+        return SERVICE(trigger);
+
+ multiple:
+        log_unit_warning(UNIT(s), "multiple trigger source candidates for exit status propagation (%s, %s), skipping.",
+                         trigger->id, u->id);
+        return NULL;
+}
+
+/* Forks off the process for command 'c' of service 's'. Prepares the exec parameters (fds to pass, exec_fd
+ * pipe, environment block, stdio fds, ...), arms the state timer with 'timeout' (relative), spawns the child
+ * via exec_spawn() and starts watching its PID. On success 0 is returned and *ret_pid receives the PidRef of
+ * the new child, on failure a negative errno-style value is returned. 'caller' is only used for the debug
+ * log message. */
+static int service_spawn_internal(
+                const char *caller,
+                Service *s,
+                ExecCommand *c,
+                usec_t timeout,
+                ExecFlags flags,
+                PidRef *ret_pid) {
+
+        _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags);
+        _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
+        _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL;
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        size_t n_env = 0;
+        pid_t pid;
+        int r;
+
+        assert(caller);
+        assert(s);
+        assert(c);
+        assert(ret_pid);
+
+        log_unit_debug(UNIT(s), "Will spawn child (%s): %s", caller, c->path);
+
+        r = unit_prepare_exec(UNIT(s)); /* This realizes the cgroup, among other things */
+        if (r < 0)
+                return r;
+
+        assert(!s->exec_fd_event_source);
+
+        if (flags & EXEC_IS_CONTROL) {
+                /* If this is a control process, mask the permissions/chroot application if this is requested. */
+                if (s->permissions_start_only)
+                        exec_params.flags &= ~EXEC_APPLY_SANDBOXING;
+                if (s->root_directory_start_only)
+                        exec_params.flags &= ~EXEC_APPLY_CHROOT;
+        }
+
+        /* Collect the fds to pass if that is requested explicitly, or if any of stdin/stdout/stderr is
+         * configured to be connected to the socket */
+        if ((flags & EXEC_PASS_FDS) ||
+            s->exec_context.std_input == EXEC_INPUT_SOCKET ||
+            s->exec_context.std_output == EXEC_OUTPUT_SOCKET ||
+            s->exec_context.std_error == EXEC_OUTPUT_SOCKET) {
+
+                r = service_collect_fds(s,
+                                        &exec_params.fds,
+                                        &exec_params.fd_names,
+                                        &exec_params.n_socket_fds,
+                                        &exec_params.n_storage_fds);
+                if (r < 0)
+                        return r;
+
+                exec_params.open_files = s->open_files;
+
+                log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
+        }
+
+        /* The exec_fd pipe is only set up for non-control processes of services of type SERVICE_EXEC */
+        if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
+                r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
+                if (r < 0)
+                        return r;
+        }
+
+        r = service_arm_timer(s, /* relative= */ true, timeout);
+        if (r < 0)
+                return r;
+
+        /* Room for the up to 12 variables that are stored via our_env[n_env++] below (NOTIFY_SOCKET, FDSTORE,
+         * MAINPID, MANAGERPID, PIDFILE, REMOTE_ADDR, REMOTE_PORT, SERVICE_RESULT, EXIT_CODE, EXIT_STATUS,
+         * INVOCATION_ID, UNIT), plus the terminating NULL. Keep the size in sync when adding variables. */
+        our_env = new0(char*, 13);
+        if (!our_env)
+                return -ENOMEM;
+
+        if (service_exec_needs_notify_socket(s, flags)) {
+                if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0)
+                        return -ENOMEM;
+
+                exec_params.notify_socket = UNIT(s)->manager->notify_socket;
+
+                if (s->n_fd_store_max > 0)
+                        if (asprintf(our_env + n_env++, "FDSTORE=%u", s->n_fd_store_max) < 0)
+                                return -ENOMEM;
+        }
+
+        if (pidref_is_set(&s->main_pid))
+                if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid.pid) < 0)
+                        return -ENOMEM;
+
+        if (MANAGER_IS_USER(UNIT(s)->manager))
+                if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid_cached()) < 0)
+                        return -ENOMEM;
+
+        if (s->pid_file)
+                if (asprintf(our_env + n_env++, "PIDFILE=%s", s->pid_file) < 0)
+                        return -ENOMEM;
+
+        if (s->socket_fd >= 0) {
+                union sockaddr_union sa;
+                socklen_t salen = sizeof(sa);
+
+                /* If this is a per-connection service instance, let's set $REMOTE_ADDR and $REMOTE_PORT to something
+                 * useful. Note that we do this only when we are still connected at this point in time, which we might
+                 * very well not be. Hence we ignore all errors when retrieving peer information (as that might result
+                 * in ENOTCONN), and just use what we can use. */
+
+                if (getpeername(s->socket_fd, &sa.sa, &salen) >= 0 &&
+                    IN_SET(sa.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+                        _cleanup_free_ char *addr = NULL;
+                        char *t;
+                        unsigned port;
+
+                        r = sockaddr_pretty(&sa.sa, salen, true, false, &addr);
+                        if (r < 0)
+                                return r;
+
+                        t = strjoin("REMOTE_ADDR=", addr);
+                        if (!t)
+                                return -ENOMEM;
+                        our_env[n_env++] = t;
+
+                        r = sockaddr_port(&sa.sa, &port);
+                        if (r < 0)
+                                return r;
+
+                        if (asprintf(&t, "REMOTE_PORT=%u", port) < 0)
+                                return -ENOMEM;
+                        our_env[n_env++] = t;
+                }
+        }
+
+        /* Pick the service whose result and exit information is exported: the service itself for
+         * EXEC_SETENV_RESULT, or the service that triggered us for EXEC_SETENV_MONITOR_RESULT (in which case
+         * the variables carry a MONITOR_ prefix). Note that monitor_prefix is only assigned in these two
+         * branches, and only read below if env_source is set. */
+        Service *env_source = NULL;
+        const char *monitor_prefix;
+        if (flags & EXEC_SETENV_RESULT) {
+                env_source = s;
+                monitor_prefix = "";
+        } else if (flags & EXEC_SETENV_MONITOR_RESULT) {
+                env_source = service_get_triggering_service(s);
+                monitor_prefix = "MONITOR_";
+        }
+
+        if (env_source) {
+                if (asprintf(our_env + n_env++, "%sSERVICE_RESULT=%s", monitor_prefix, service_result_to_string(env_source->result)) < 0)
+                        return -ENOMEM;
+
+                /* Exit code and status are only exported if a main process was recorded and its exit
+                 * timestamp is set */
+                if (env_source->main_exec_status.pid > 0 &&
+                    dual_timestamp_is_set(&env_source->main_exec_status.exit_timestamp)) {
+                        if (asprintf(our_env + n_env++, "%sEXIT_CODE=%s", monitor_prefix, sigchld_code_to_string(env_source->main_exec_status.code)) < 0)
+                                return -ENOMEM;
+
+                        /* A numeric status for regular exits, otherwise the status is formatted as signal name */
+                        if (env_source->main_exec_status.code == CLD_EXITED)
+                                r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%i", monitor_prefix, env_source->main_exec_status.status);
+                        else
+                                r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%s", monitor_prefix, signal_to_string(env_source->main_exec_status.status));
+
+                        if (r < 0)
+                                return -ENOMEM;
+                }
+
+                /* Invocation ID and unit name are only exported when they refer to another service */
+                if (env_source != s) {
+                        if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) {
+                                r = asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR,
+                                             monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id));
+                                if (r < 0)
+                                        return -ENOMEM;
+                        }
+
+                        if (asprintf(our_env + n_env++, "%sUNIT=%s", monitor_prefix, UNIT(env_source)->id) < 0)
+                                return -ENOMEM;
+                }
+        }
+
+        if (UNIT(s)->activation_details) {
+                r = activation_details_append_env(UNIT(s)->activation_details, &our_env);
+                if (r < 0)
+                        return r;
+                /* The number of env vars added here can vary, rather than keeping the allocation block in
+                 * sync manually, these functions simply use the strv methods to append to it, so we need
+                 * to update n_env when we are done in case of future usage. */
+                n_env += r;
+        }
+
+        r = unit_set_exec_params(UNIT(s), &exec_params);
+        if (r < 0)
+                return r;
+
+        /* Combine the environment set up by unit_set_exec_params() with the variables collected above */
+        final_env = strv_env_merge(exec_params.environment, our_env);
+        if (!final_env)
+                return -ENOMEM;
+
+        /* System D-Bus needs nss-systemd disabled, so that we don't deadlock */
+        SET_FLAG(exec_params.flags, EXEC_NSS_DYNAMIC_BYPASS,
+                 MANAGER_IS_SYSTEM(UNIT(s)->manager) && unit_has_name(UNIT(s), SPECIAL_DBUS_SERVICE));
+
+        strv_free_and_replace(exec_params.environment, final_env);
+        exec_params.watchdog_usec = service_get_watchdog_usec(s);
+        exec_params.selinux_context_net = s->socket_fd_selinux_context_net;
+        if (s->type == SERVICE_IDLE)
+                exec_params.idle_pipe = UNIT(s)->manager->idle_pipe;
+        exec_params.stdin_fd = s->stdin_fd;
+        exec_params.stdout_fd = s->stdout_fd;
+        exec_params.stderr_fd = s->stderr_fd;
+
+        r = exec_spawn(UNIT(s),
+                       c,
+                       &s->exec_context,
+                       &exec_params,
+                       s->exec_runtime,
+                       &s->cgroup_context,
+                       &pid);
+        if (r < 0)
+                return r;
+
+        /* The child is running now, hand the exec_fd event source (if any) over to the service object */
+        s->exec_fd_event_source = TAKE_PTR(exec_fd_source);
+        s->exec_fd_hot = false;
+
+        r = pidref_set_pid(&pidref, pid);
+        if (r < 0)
+                return r;
+
+        r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+        if (r < 0)
+                return r;
+
+        *ret_pid = TAKE_PIDREF(pidref);
+        return 0;
+}
+
+static int main_pid_good(Service *s) {
+        assert(s);
+
+        /* Tells whether the main process is still around: returns 0 if the PID is dead, > 0 if it is good,
+         * < 0 if we don't know. */
+
+        /* Without a known main PID there's nothing we could check */
+        if (!s->main_pid_known)
+                return -EAGAIN;
+
+        /* For an alien child, check explicitly whether it is still alive ... */
+        if (s->main_pid_alien && pidref_is_set(&s->main_pid))
+                return pidref_is_alive(&s->main_pid);
+
+        /* ... otherwise assume we'll get a SIGCHLD for it, which we really should wait for in order to
+         * collect exit status and code. Until then the PID counts as good for as long as it is set. */
+        return pidref_is_set(&s->main_pid);
+}
+
+static int control_pid_good(Service *s) {
+        assert(s);
+
+        /* Tells whether a control process is still around: returns 0 if the control PID is dead, > 0 if it
+         * is good. This never actually returns < 0, but to keep the function as similar as possible to
+         * main_pid_good() and cgroup_good(), callers shall treat < 0 as: we can't figure it out. */
+
+        return pidref_is_set(&s->control_pid) ? 1 : 0;
+}
+
+static int cgroup_good(Service *s) {
+        int empty;
+
+        assert(s);
+
+        /* Tells whether there are still processes in the cgroup of the service: returns 0 if the cgroup is
+         * empty or doesn't exist, > 0 if it exists and is populated, < 0 if we can't figure it out. */
+
+        if (!UNIT(s)->cgroup_path)
+                return 0;
+
+        empty = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path);
+        if (empty < 0)
+                return empty;
+
+        /* "Good" is the opposite of "empty" */
+        return !empty;
+}
+
+static bool service_shall_restart(Service *s, const char **reason) {
+        bool restart;
+
+        assert(s);
+
+        /* Decides whether the service shall be restarted after it terminated, and stores a short, constant
+         * explanation of the decision in *reason. The checks are ordered by precedence: manual stops first,
+         * then the exit statuses that prevent a restart, then those that force one, and only then the
+         * Restart= setting. */
+
+        /* Don't restart after manual stops */
+        if (s->forbid_restart) {
+                *reason = "manual stop";
+                return false;
+        }
+
+        /* Never restart if this is configured as special exception */
+        if (exit_status_set_test(&s->restart_prevent_status, s->main_exec_status.code, s->main_exec_status.status)) {
+                *reason = "prevented by exit status";
+                return false;
+        }
+
+        /* Restart if the exit code/status are configured as restart triggers */
+        if (exit_status_set_test(&s->restart_force_status,  s->main_exec_status.code, s->main_exec_status.status)) {
+                *reason = "forced by exit status";
+                return true;
+        }
+
+        /* Everything below is decided by matching the restart setting against the service result */
+        *reason = "restart setting";
+
+        switch (s->restart) {
+
+        case SERVICE_RESTART_NO:
+                restart = false;
+                break;
+
+        case SERVICE_RESTART_ALWAYS:
+                restart = s->result != SERVICE_SKIP_CONDITION;
+                break;
+
+        case SERVICE_RESTART_ON_SUCCESS:
+                restart = s->result == SERVICE_SUCCESS;
+                break;
+
+        case SERVICE_RESTART_ON_FAILURE:
+                restart = !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_SKIP_CONDITION);
+                break;
+
+        case SERVICE_RESTART_ON_ABNORMAL:
+                restart = !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_FAILURE_EXIT_CODE, SERVICE_SKIP_CONDITION);
+                break;
+
+        case SERVICE_RESTART_ON_WATCHDOG:
+                restart = s->result == SERVICE_FAILURE_WATCHDOG;
+                break;
+
+        case SERVICE_RESTART_ON_ABORT:
+                restart = IN_SET(s->result, SERVICE_FAILURE_SIGNAL, SERVICE_FAILURE_CORE_DUMP);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return restart;
+}
+
+/* Tells whether the unit is going to be started again: either because the service sits in one of the
+ * automatic-restart states, or because the generic unit logic says so. */
+static bool service_will_restart(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        switch (s->state) {
+
+        case SERVICE_DEAD_BEFORE_AUTO_RESTART:
+        case SERVICE_FAILED_BEFORE_AUTO_RESTART:
+        case SERVICE_AUTO_RESTART:
+        case SERVICE_AUTO_RESTART_QUEUED:
+                return true;
+
+        default:
+                /* Not in a restart state, defer to the generic check. */
+                return unit_will_restart_default(u);
+        }
+}
+
+/* Picks the state a cleanly terminated service ends up in: SERVICE_DEAD_RESOURCES_PINNED if there is an
+ * fd store and its preserve mode is EXEC_PRESERVE_YES, SERVICE_DEAD otherwise. */
+static ServiceState service_determine_dead_state(Service *s) {
+        assert(s);
+
+        if (s->fd_store && s->fd_store_preserve_mode == EXEC_PRESERVE_YES)
+                return SERVICE_DEAD_RESOURCES_PINNED;
+
+        return SERVICE_DEAD;
+}
+
+/* Last step of the service life cycle: records the final result, logs the outcome, and then either arms
+ * the restart timer and enters SERVICE_AUTO_RESTART, or settles in the terminal state. Afterwards the
+ * per-invocation resources (exec runtime, runtime data, optionally the fd store, uid/gid references, PID
+ * file, TTY) are released.
+ *
+ * s:             the service that has terminated.
+ * f:             result to record; only takes effect if s->result is still SERVICE_SUCCESS, i.e. the
+ *                first failure recorded wins.
+ * allow_restart: whether an automatic restart may be considered at all. Forced to false if a stop job
+ *                is pending; otherwise the final say lies with service_shall_restart(). */
+static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) {
+        ServiceState end_state, restart_state;
+        int r;
+
+        assert(s);
+
+        /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not
+         * undo what has already been enqueued. */
+        if (unit_stop_pending(UNIT(s)))
+                allow_restart = false;
+
+        if (s->result == SERVICE_SUCCESS)
+                s->result = f;
+
+        /* Log the outcome and pick both candidate target states: end_state is used if we stay down,
+         * restart_state is the transitional state shown before an automatic restart. */
+        if (s->result == SERVICE_SUCCESS) {
+                unit_log_success(UNIT(s));
+                end_state = service_determine_dead_state(s);
+                restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART;
+        } else if (s->result == SERVICE_SKIP_CONDITION) {
+                unit_log_skip(UNIT(s), service_result_to_string(s->result));
+                end_state = service_determine_dead_state(s);
+                restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART;
+        } else {
+                unit_log_failure(UNIT(s), service_result_to_string(s->result));
+                end_state = SERVICE_FAILED;
+                restart_state = SERVICE_FAILED_BEFORE_AUTO_RESTART;
+        }
+        unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+
+        if (!allow_restart)
+                log_unit_debug(UNIT(s), "Service restart not allowed.");
+        else {
+                const char *reason;
+
+                allow_restart = service_shall_restart(s, &reason);
+                log_unit_debug(UNIT(s), "Service will %srestart (%s)",
+                                        allow_restart ? "" : "not ",
+                                        reason);
+        }
+
+        if (allow_restart) {
+                usec_t restart_usec_next;
+
+                /* We make two state changes here: one that maps to the high-level UNIT_INACTIVE/UNIT_FAILED
+                 * state (i.e. a state indicating deactivation), and then one that maps to the
+                 * high-level UNIT_STARTING state (i.e. a state indicating activation). We do this so that
+                 * external software can watch the state changes and see all service failures, even if they
+                 * are only transitionary and followed by an automatic restart. We have fine-grained
+                 * low-level states for this though so that software can distinguish the permanent UNIT_INACTIVE
+                 * state from this transitionary UNIT_INACTIVE state by looking at the low-level states. */
+                if (s->restart_mode != SERVICE_RESTART_MODE_DIRECT)
+                        service_set_state(s, restart_state);
+
+                restart_usec_next = service_restart_usec_next(s);
+
+                r = service_arm_timer(s, /* relative= */ true, restart_usec_next);
+                if (r < 0) {
+                        /* Without a timer the restart can never fire, hence go down for good. */
+                        log_unit_warning_errno(UNIT(s), r, "Failed to install restart timer: %m");
+                        service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false);
+                        return;
+                }
+
+                log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0));
+
+                service_set_state(s, SERVICE_AUTO_RESTART);
+        } else {
+                service_set_state(s, end_state);
+
+                /* If we shan't restart, then flush out the restart counter. But don't do that immediately, so that the
+                 * user can still introspect the counter. Do so on the next start. */
+                s->flush_n_restarts = true;
+        }
+
+        /* The new state is in effect, let's decrease the fd store ref counter again. Let's also re-add us to the GC
+         * queue, so that the fd store is possibly gc'ed again */
+        unit_add_to_gc_queue(UNIT(s));
+
+        /* The next restart might not be a manual stop, hence reset the flag indicating manual stops */
+        s->forbid_restart = false;
+
+        /* Reset NotifyAccess override */
+        s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+
+        /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */
+        s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
+
+        /* Also, remove the runtime directory */
+        unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+        /* Also get rid of the fd store, if that's configured. */
+        if (s->fd_store_preserve_mode == EXEC_PRESERVE_NO)
+                service_release_fd_store(s);
+
+        /* Get rid of the IPC bits of the user */
+        unit_unref_uid_gid(UNIT(s), true);
+
+        /* Try to delete the pid file. At this point it will be
+         * out-of-date, and some software might be confused by it, so
+         * let's remove it. */
+        if (s->pid_file)
+                (void) unlink(s->pid_file);
+
+        /* Reset TTY ownership if necessary */
+        exec_context_revert_tty(&s->exec_context);
+}
+
+/* Runs the 'stop-post' control command of the service, if there is one, and enters SERVICE_STOP_POST.
+ * If there is none, or spawning it fails, proceeds directly to the final SIGTERM phase.
+ *
+ * f is recorded as the service result unless an earlier result is already set. */
+static void service_enter_stop_post(Service *s, ServiceResult f) {
+        int r;
+        assert(s);
+
+        if (s->result == SERVICE_SUCCESS)
+                s->result = f;
+
+        service_unwatch_control_pid(s);
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+        s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
+        if (!s->control_command) {
+                /* Nothing to run, continue with the final kill phase right away. */
+                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_SUCCESS);
+                return;
+        }
+
+        s->control_command_id = SERVICE_EXEC_STOP_POST;
+        pidref_done(&s->control_pid);
+
+        r = service_spawn(s,
+                          s->control_command,
+                          s->timeout_stop_usec,
+                          EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
+                          &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m");
+                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
+                return;
+        }
+
+        service_set_state(s, SERVICE_STOP_POST);
+}
+
+/* Translates a signalling service state into the kill operation to perform for it. Returns
+ * _KILL_OPERATION_INVALID for any state that is not one of the watchdog/SIGTERM/SIGKILL states. */
+static int state_to_kill_operation(Service *s, ServiceState state) {
+
+        if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_FINAL_WATCHDOG))
+                return KILL_WATCHDOG;
+
+        /* A regular stop that is part of a restart job gets its own operation. */
+        if (state == SERVICE_STOP_SIGTERM && unit_has_job_type(UNIT(s), JOB_RESTART))
+                return KILL_RESTART;
+
+        if (IN_SET(state, SERVICE_STOP_SIGTERM, SERVICE_FINAL_SIGTERM))
+                return KILL_TERMINATE;
+
+        if (IN_SET(state, SERVICE_STOP_SIGKILL, SERVICE_FINAL_SIGKILL))
+                return KILL_KILL;
+
+        return _KILL_OPERATION_INVALID;
+}
+
+/* Enters one of the signalling states (STOP_* or FINAL_* with WATCHDOG/SIGTERM/SIGKILL) and performs the
+ * matching kill operation on the service's processes.
+ *
+ * s:     the service to signal.
+ * state: the signalling state to enter; also selects the kill operation via state_to_kill_operation().
+ * f:     result to record; only takes effect if s->result is still SERVICE_SUCCESS.
+ *
+ * If unit_kill_context() returns > 0 a timeout is armed and 'state' is entered. If it returns 0 the
+ * function moves straight on to the next step of the ladder:
+ *   STOP_WATCHDOG/STOP_SIGTERM   -> STOP_SIGKILL (if send_sigkill is set), else stop-post
+ *   STOP_SIGKILL                 -> stop-post
+ *   FINAL_WATCHDOG/FINAL_SIGTERM -> FINAL_SIGKILL (if send_sigkill is set), else dead
+ *   anything else                -> dead */
+static void service_enter_signal(Service *s, ServiceState state, ServiceResult f) {
+        int kill_operation, r;
+
+        assert(s);
+
+        if (s->result == SERVICE_SUCCESS)
+                s->result = f;
+
+        /* Before sending any signal, make sure we track all members of this cgroup */
+        (void) unit_watch_all_pids(UNIT(s));
+
+        /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
+         * died now */
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+        kill_operation = state_to_kill_operation(s, state);
+        r = unit_kill_context(
+                        UNIT(s),
+                        &s->kill_context,
+                        kill_operation,
+                        &s->main_pid,
+                        &s->control_pid,
+                        s->main_pid_alien);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+                goto fail;
+        }
+
+        /* NOTE(review): r > 0 presumably means there are processes left to wait for - confirm against
+         * unit_kill_context(). */
+        if (r > 0) {
+                /* The watchdog operation uses the abort timeout, everything else the stop timeout. */
+                r = service_arm_timer(s, /* relative= */ true,
+                                      kill_operation == KILL_WATCHDOG ? service_timeout_abort_usec(s) : s->timeout_stop_usec);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+                        goto fail;
+                }
+
+                service_set_state(s, state);
+        } else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM) && s->kill_context.send_sigkill)
+                service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
+        else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL))
+                service_enter_stop_post(s, SERVICE_SUCCESS);
+        else if (IN_SET(state, SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM) && s->kill_context.send_sigkill)
+                service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS);
+        else
+                service_enter_dead(s, SERVICE_SUCCESS, /* allow_restart= */ true);
+
+        return;
+
+fail:
+        /* On failure skip the rest of the current kill phase: the STOP_* states continue with stop-post,
+         * everything else goes straight to dead. */
+        if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL))
+                service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
+        else
+                service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+}
+
+/* Handles a service that announced on its own that it is stopping: arms the stop timeout and enters
+ * SERVICE_STOP_SIGTERM without sending any signal. If the timer cannot be armed, falls back to the
+ * regular SIGTERM path. */
+static void service_enter_stop_by_notify(Service *s) {
+        int r;
+
+        assert(s);
+
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+        r = service_arm_timer(s, /* relative= */ true, s->timeout_stop_usec);
+        if (r >= 0) {
+                /* The service told us it's stopping, so it's as if we SIGTERM'd it. */
+                service_set_state(s, SERVICE_STOP_SIGTERM);
+                return;
+        }
+
+        log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+}
+
+/* Runs the 'stop' control command of the service, if there is one, and enters SERVICE_STOP. If there is
+ * none, or spawning it fails, proceeds directly to the SIGTERM phase.
+ *
+ * f is recorded as the service result unless an earlier result is already set. */
+static void service_enter_stop(Service *s, ServiceResult f) {
+        int r;
+
+        assert(s);
+
+        if (s->result == SERVICE_SUCCESS)
+                s->result = f;
+
+        service_unwatch_control_pid(s);
+        (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+        s->control_command = s->exec_command[SERVICE_EXEC_STOP];
+        if (!s->control_command) {
+                /* Nothing to run, start signalling right away. */
+                service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
+                return;
+        }
+
+        s->control_command_id = SERVICE_EXEC_STOP;
+        pidref_done(&s->control_pid);
+
+        r = service_spawn(s,
+                          s->control_command,
+                          s->timeout_stop_usec,
+                          EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
+                          &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop' task: %m");
+                service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+                return;
+        }
+
+        service_set_state(s, SERVICE_STOP);
+}
+
+/* Determines whether the service still counts as up and running. */
+static bool service_good(Service *s) {
+        int pid_state;
+        assert(s);
+
+        /* A D-Bus service is only good as long as its bus name is. */
+        if (s->type == SERVICE_DBUS && !s->bus_name_good)
+                return false;
+
+        pid_state = main_pid_good(s);
+        if (pid_state > 0) /* It's alive */
+                return true;
+
+        /* Either we don't know anything about the main PID (maybe because there is none), or it is dead
+         * but the exit type does not make the main process decisive. Let's check the control group
+         * instead. Note that a failure to check the cgroup counts as good here. */
+        if (pid_state < 0 || s->exit_type != SERVICE_EXIT_MAIN)
+                return cgroup_good(s) != 0;
+
+        /* It's dead */
+        return false;
+}
+
+/* Moves the service into its steady state once start-up (or a reload) has finished. Depending on the
+ * recorded result and on whether the service is still good, this ends up in SERVICE_RUNNING,
+ * SERVICE_EXITED, a reload/stop requested via notification, or the stop path.
+ *
+ * f is recorded as the service result unless an earlier result is already set. */
+static void service_enter_running(Service *s, ServiceResult f) {
+        int r;
+
+        assert(s);
+
+        if (s->result == SERVICE_SUCCESS)
+                s->result = f;
+
+        service_unwatch_control_pid(s);
+
+        /* Something went wrong on the way here, tear the service down. */
+        if (s->result != SERVICE_SUCCESS) {
+                service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                return;
+        }
+
+        /* Nothing left running: either stay around as "exited", or go through the stop path. */
+        if (!service_good(s)) {
+                if (s->remain_after_exit)
+                        service_set_state(s, SERVICE_EXITED);
+                else
+                        service_enter_stop(s, SERVICE_SUCCESS);
+                return;
+        }
+
+        /* If there are any queued up sd_notify() notifications, process them now */
+        if (s->notify_state == NOTIFY_RELOADING) {
+                service_enter_reload_by_notify(s);
+                return;
+        }
+
+        if (s->notify_state == NOTIFY_STOPPING) {
+                service_enter_stop_by_notify(s);
+                return;
+        }
+
+        service_set_state(s, SERVICE_RUNNING);
+
+        r = service_arm_timer(s, /* relative= */ false, service_running_timeout(s));
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+
+                /* Re-enter with a failure result, which takes the tear-down branch above. */
+                service_enter_running(s, SERVICE_FAILURE_RESOURCES);
+        }
+}
+
+/* Runs the 'start-post' control command of the service, if there is one, and enters
+ * SERVICE_START_POST. Without such a command the service goes straight to running; if spawning fails
+ * the service is stopped. */
+static void service_enter_start_post(Service *s) {
+        int r;
+        assert(s);
+
+        service_unwatch_control_pid(s);
+        service_reset_watchdog(s);
+
+        s->control_command = s->exec_command[SERVICE_EXEC_START_POST];
+        if (!s->control_command) {
+                /* Nothing to run, start-up is complete. */
+                service_enter_running(s, SERVICE_SUCCESS);
+                return;
+        }
+
+        s->control_command_id = SERVICE_EXEC_START_POST;
+        pidref_done(&s->control_pid);
+
+        r = service_spawn(s,
+                          s->control_command,
+                          s->timeout_start_usec,
+                          EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
+                          &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m");
+                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+                return;
+        }
+
+        service_set_state(s, SERVICE_START_POST);
+}
+
+/* Sends SIGKILL to the control process of the service, if one is set. Failures are only logged at debug
+ * level and otherwise ignored. */
+static void service_kill_control_process(Service *s) {
+        _cleanup_free_ char *comm = NULL;
+        int r;
+
+        assert(s);
+
+        if (!pidref_is_set(&s->control_pid))
+                return;
+
+        r = pidref_kill_and_sigcont(&s->control_pid, SIGKILL);
+        if (r >= 0)
+                return;
+
+        /* Look up the process name purely to make the log message more useful; it is fine if that
+         * fails, strna() covers the NULL case. */
+        (void) pidref_get_comm(&s->control_pid, &comm);
+
+        log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m",
+                             s->control_pid.pid, strna(comm));
+}
+
+/* Warns about processes left over in the unit's control group and decides whether they must block a
+ * start. Returns 0 if starting may go ahead, -EBUSY (after logging an error) if not. */
+static int service_adverse_to_leftover_processes(Service *s) {
+        assert(s);
+
+        /* KillMode=mixed and control-group are used to indicate that all processes should be killed off.
+         * SendSIGKILL= is used for services that require a clean shutdown. These are typically database
+         * services where a SIGKILLed process would result in a lengthy recovery and whose shutdown or
+         * startup time is quite variable (so Timeout settings aren't of use).
+         *
+         * Here we take these two factors and refuse to start a service if there are existing processes
+         * within a control group. Databases generally have some protection against multiple instances
+         * running, but let's not stress the rigor of these. Also, ExecStartPre= parts of the service
+         * aren't as rigorously written to protect against multiple use. */
+
+        /* The warning is issued unconditionally; only the remaining checks decide about refusing. */
+        if (unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start) <= 0)
+                return 0;
+
+        if (!IN_SET(s->kill_context.kill_mode, KILL_MIXED, KILL_CONTROL_GROUP))
+                return 0;
+
+        if (s->kill_context.send_sigkill)
+                return 0;
+
+        return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EBUSY),
+                                    "Will not start SendSIGKILL=no service of type KillMode=control-group or mixed while processes exist");
+}
+
+/* Spawns the 'start' command of the service and enters the state matching the service type.
+ *
+ * For SERVICE_FORKING the spawned process is tracked as control process; for all other types it becomes
+ * the main process. Simple and idle services proceed to start-post immediately, all other types wait in
+ * SERVICE_START. Any failure leads into the SIGTERM stop path with SERVICE_FAILURE_RESOURCES. */
+static void service_enter_start(Service *s) {
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        ExecCommand *c;
+        usec_t timeout;
+        int r;
+
+        assert(s);
+
+        service_unwatch_control_pid(s);
+        service_unwatch_main_pid(s);
+
+        r = service_adverse_to_leftover_processes(s);
+        if (r < 0)
+                goto fail;
+
+        /* Decide whether the start command counts as control command or as main command. */
+        if (s->type == SERVICE_FORKING) {
+                s->control_command_id = SERVICE_EXEC_START;
+                c = s->control_command = s->exec_command[SERVICE_EXEC_START];
+
+                s->main_command = NULL;
+        } else {
+                s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+                s->control_command = NULL;
+
+                c = s->main_command = s->exec_command[SERVICE_EXEC_START];
+        }
+
+        if (!c) {
+                if (s->type != SERVICE_ONESHOT) {
+                        /* There's no command line configured for the main command? Hmm, that is strange.
+                         * This can only happen if the configuration changes at runtime. In this case,
+                         * let's enter a failure state. */
+                        r = log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENXIO), "There's no 'start' task anymore we could start.");
+                        goto fail;
+                }
+
+                /* We force a fake state transition here. Otherwise, the unit would go directly from
+                 * SERVICE_DEAD to SERVICE_DEAD without SERVICE_ACTIVATING or SERVICE_ACTIVE
+                 * in between. This way we can later trigger actions that depend on the state
+                 * transition, including SuccessAction=. */
+                service_set_state(s, SERVICE_START);
+
+                service_enter_start_post(s);
+                return;
+        }
+
+        if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE))
+                /* For simple + idle this is the main process. We don't apply any timeout here, but
+                 * service_enter_running() will later apply the .runtime_max_usec timeout. */
+                timeout = USEC_INFINITY;
+        else
+                timeout = s->timeout_start_usec;
+
+        /* The new process is returned in the local pidref first; the branches below hand it over to
+         * either the main or the control PID of the service. */
+        r = service_spawn(s,
+                          c,
+                          timeout,
+                          EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS|EXEC_SETENV_MONITOR_RESULT,
+                          &pidref);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start' task: %m");
+                goto fail;
+        }
+
+        if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) {
+                /* For simple services we immediately start
+                 * the START_POST binaries. */
+
+                (void) service_set_main_pidref(s, &pidref);
+                service_enter_start_post(s);
+
+        } else  if (s->type == SERVICE_FORKING) {
+
+                /* For forking services we wait until the start
+                 * process exited. */
+
+                pidref_done(&s->control_pid);
+                s->control_pid = TAKE_PIDREF(pidref);
+                service_set_state(s, SERVICE_START);
+
+        } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC)) {
+
+                /* For oneshot services we wait until the start process exited, too, but it is our main process. */
+
+                /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the
+                 * bus. 'notify' and 'exec' services are similar. */
+
+                (void) service_set_main_pidref(s, &pidref);
+                service_set_state(s, SERVICE_START);
+        } else
+                assert_not_reached();
+
+        return;
+
+fail:
+        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+}
+
+/* Runs the 'start-pre' control command of the service, if there is one, and enters SERVICE_START_PRE.
+ * Without such a command the actual start is attempted right away. Any failure leads directly to the
+ * dead state with SERVICE_FAILURE_RESOURCES. */
+static void service_enter_start_pre(Service *s) {
+        int r;
+
+        assert(s);
+
+        service_unwatch_control_pid(s);
+
+        s->control_command = s->exec_command[SERVICE_EXEC_START_PRE];
+        if (!s->control_command) {
+                /* Nothing to run beforehand, continue with the start command. */
+                service_enter_start(s);
+                return;
+        }
+
+        /* Refuse to spawn anything if leftover processes make that unsafe. */
+        r = service_adverse_to_leftover_processes(s);
+        if (r < 0)
+                goto fail;
+
+        s->control_command_id = SERVICE_EXEC_START_PRE;
+
+        r = service_spawn(s,
+                          s->control_command,
+                          s->timeout_start_usec,
+                          EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
+                          &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m");
+                goto fail;
+        }
+
+        service_set_state(s, SERVICE_START_PRE);
+        return;
+
+fail:
+        service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+}
+
+/* Runs the 'exec-condition' control command of the service, if there is one, and enters
+ * SERVICE_CONDITION. Without such a command the start-pre step is attempted right away. Any failure
+ * leads directly to the dead state with SERVICE_FAILURE_RESOURCES. */
+static void service_enter_condition(Service *s) {
+        int r;
+
+        assert(s);
+
+        service_unwatch_control_pid(s);
+
+        s->control_command = s->exec_command[SERVICE_EXEC_CONDITION];
+        if (!s->control_command) {
+                /* No condition to check, continue with the next start-up step. */
+                service_enter_start_pre(s);
+                return;
+        }
+
+        /* Refuse to spawn anything if leftover processes make that unsafe. */
+        r = service_adverse_to_leftover_processes(s);
+        if (r < 0)
+                goto fail;
+
+        s->control_command_id = SERVICE_EXEC_CONDITION;
+        pidref_done(&s->control_pid);
+
+        r = service_spawn(s,
+                          s->control_command,
+                          s->timeout_start_usec,
+                          EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN,
+                          &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'exec-condition' task: %m");
+                goto fail;
+        }
+
+        service_set_state(s, SERVICE_CONDITION);
+        return;
+
+fail:
+        service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+}
+
+/* Enqueues the start job for an automatic restart, bumps the restart counter and enters
+ * SERVICE_AUTO_RESTART_QUEUED. Does nothing if a stop job is pending for the unit; if the job cannot be
+ * enqueued the service goes to the dead state without a further restart attempt. */
+static void service_enter_restart(Service *s) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        assert(s);
+
+        if (unit_has_job_type(UNIT(s), JOB_STOP)) {
+                /* Don't restart things if we are going down anyway */
+                log_unit_info(UNIT(s), "Stop job pending for unit, skipping automatic restart.");
+                return;
+        }
+
+        /* Any units that are bound to this service must also be restarted. We use JOB_START for ourselves
+         * but then set JOB_RESTART_DEPENDENCIES which will enqueue JOB_RESTART for those dependency jobs. */
+        r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT(s), JOB_RESTART_DEPENDENCIES, NULL, &error, NULL);
+        if (r < 0) {
+                log_unit_warning(UNIT(s), "Failed to schedule restart job: %s", bus_error_message(&error, r));
+                service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false);
+                return;
+        }
+
+        /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't
+         * fully stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset
+         * the counter explicitly however via the usual "systemctl reset-failed" logic. */
+        s->n_restarts ++;
+        s->flush_n_restarts = false;
+
+        /* Reset NotifyAccess override */
+        s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+
+        /* Structured log entry, so that the restart and the counter can be matched by message ID. */
+        log_unit_struct(UNIT(s), LOG_INFO,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR,
+                        LOG_UNIT_INVOCATION_ID(UNIT(s)),
+                        LOG_UNIT_MESSAGE(UNIT(s),
+                                         "Scheduled restart job, restart counter is at %u.", s->n_restarts),
+                        "N_RESTARTS=%u", s->n_restarts);
+
+        service_set_state(s, SERVICE_AUTO_RESTART_QUEUED);
+
+        /* Notify clients about changed restart counter */
+        unit_add_to_dbus_queue(UNIT(s));
+}
+
+/* Handles a reload that the service announced on its own: arms the start timeout, enters
+ * SERVICE_RELOAD_NOTIFY and propagates the reload. If the timer cannot be armed the reload is recorded
+ * as failed and the service goes back to running. */
+static void service_enter_reload_by_notify(Service *s) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        assert(s);
+
+        r = service_arm_timer(s, /* relative= */ true, s->timeout_start_usec);
+        if (r >= 0) {
+                service_set_state(s, SERVICE_RELOAD_NOTIFY);
+
+                /* service_enter_reload_by_notify is never called during a reload, thus no loops are possible. */
+                r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error);
+                if (r < 0)
+                        log_unit_warning(UNIT(s), "Failed to schedule propagation of reload, ignoring: %s", bus_error_message(&error, r));
+
+                return;
+        }
+
+        /* Only the reload result is marked as failed, the service result itself stays untouched. */
+        log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+        s->reload_result = SERVICE_FAILURE_RESOURCES;
+        service_enter_running(s, SERVICE_SUCCESS);
+}
+
+/* Starts a reload cycle of the service. Up to two mechanisms are used:
+ *
+ *   - for SERVICE_NOTIFY_RELOAD services with a main PID set, s->reload_signal is sent to the main
+ *     process,
+ *   - if a 'reload' control command is configured, it is spawned.
+ *
+ * If a command was spawned the service enters SERVICE_RELOAD; if only the signal was sent it enters
+ * SERVICE_RELOAD_SIGNAL with the start timeout armed; if neither applies it goes straight back to
+ * running. On failure only s->reload_result is marked as failed and the service returns to running. */
+static void service_enter_reload(Service *s) {
+        bool killed = false;
+        int r;
+
+        assert(s);
+
+        service_unwatch_control_pid(s);
+        s->reload_result = SERVICE_SUCCESS;
+
+        /* Taken before the signal is sent or the command is spawned, so that it predates anything the
+         * service does in response. */
+        usec_t ts = now(CLOCK_MONOTONIC);
+
+        if (s->type == SERVICE_NOTIFY_RELOAD && pidref_is_set(&s->main_pid)) {
+                r = pidref_kill_and_sigcont(&s->main_pid, s->reload_signal);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to send reload signal: %m");
+                        goto fail;
+                }
+
+                killed = true;
+        }
+
+        s->control_command = s->exec_command[SERVICE_EXEC_RELOAD];
+        if (s->control_command) {
+                s->control_command_id = SERVICE_EXEC_RELOAD;
+                pidref_done(&s->control_pid);
+
+                r = service_spawn(s,
+                                  s->control_command,
+                                  s->timeout_start_usec,
+                                  EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
+                                  &s->control_pid);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'reload' task: %m");
+                        goto fail;
+                }
+
+                service_set_state(s, SERVICE_RELOAD);
+        } else if (killed) {
+                /* No command to wait for, hence a timer has to be armed explicitly here. */
+                r = service_arm_timer(s, /* relative= */ true, s->timeout_start_usec);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+                        goto fail;
+                }
+
+                service_set_state(s, SERVICE_RELOAD_SIGNAL);
+        } else {
+                /* Neither a signal nor a command: there is nothing to reload. */
+                service_enter_running(s, SERVICE_SUCCESS);
+                return;
+        }
+
+        /* Store the timestamp when we started reloading: when reloading via SIGHUP we won't leave the reload
+         * state until we received both RELOADING=1 and READY=1 with MONOTONIC_USEC= set to a value above
+         * this. Thus we know for sure the reload cycle was executed *after* we requested it, and is not one
+         * that was already in progress before. */
+        s->reload_begin_usec = ts;
+        return;
+
+fail:
+        s->reload_result = SERVICE_FAILURE_RESOURCES;
+        service_enter_running(s, SERVICE_SUCCESS);
+}
+
+/* Advances to the next entry in the chain of control commands and spawns it. The caller must guarantee
+ * that there is a current control command with a successor, and that the current command is not the
+ * 'start' command. If spawning fails the follow-up action depends on the current state. */
+static void service_run_next_control(Service *s) {
+        bool start_phase;
+        usec_t timeout;
+        int r;
+
+        assert(s);
+        assert(s->control_command);
+        assert(s->control_command->command_next);
+
+        assert(s->control_command_id != SERVICE_EXEC_START);
+
+        s->control_command = s->control_command->command_next;
+        service_unwatch_control_pid(s);
+
+        /* States that belong to starting or running use the start timeout and get credentials written;
+         * all remaining states use the stop timeout. */
+        start_phase = IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD);
+        timeout = start_phase ? s->timeout_start_usec : s->timeout_stop_usec;
+
+        pidref_done(&s->control_pid);
+
+        r = service_spawn(s,
+                          s->control_command,
+                          timeout,
+                          EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|
+                          (start_phase ? EXEC_WRITE_CREDENTIALS : 0)|
+                          (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)|
+                          (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)|
+                          (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START) ? EXEC_SETENV_MONITOR_RESULT : 0)|
+                          (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0),
+                          &s->control_pid);
+        if (r >= 0)
+                return;
+
+        log_unit_warning_errno(UNIT(s), r, "Failed to spawn next control task: %m");
+
+        switch (s->state) {
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START_POST:
+        case SERVICE_STOP:
+                service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+                break;
+
+        case SERVICE_STOP_POST:
+                service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+                break;
+
+        case SERVICE_RELOAD:
+                /* A failed reload only taints the reload result, the service keeps running. */
+                s->reload_result = SERVICE_FAILURE_RESOURCES;
+                service_enter_running(s, SERVICE_SUCCESS);
+                break;
+
+        default:
+                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+                break;
+        }
+}
+
+/* Advances to the next main command of a Type=oneshot service and spawns it. On success the new process
+ * becomes the main PID; on failure the service enters the stop phase with a resources failure. */
+static void service_run_next_main(Service *s) {
+        _cleanup_(pidref_done) PidRef spawned = PIDREF_NULL;
+        int r;
+
+        assert(s);
+        assert(s->main_command);
+        assert(s->main_command->command_next);
+        assert(s->type == SERVICE_ONESHOT);
+
+        s->main_command = s->main_command->command_next;
+        service_unwatch_main_pid(s);
+
+        r = service_spawn(
+                        s,
+                        s->main_command,
+                        s->timeout_start_usec,
+                        EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
+                        &spawned);
+        if (r >= 0) {
+                (void) service_set_main_pidref(s, &spawned);
+                return;
+        }
+
+        log_unit_warning_errno(UNIT(s), r, "Failed to spawn next main task: %m");
+        service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+}
+
+/* Start request handler for service units. Returns -EAGAIN if the request cannot be handled in the current
+ * state, 0 if a start is already in progress, 1 after the start sequence has been kicked off, or a negative
+ * error from unit_acquire_invocation_id(). */
+static int service_start(Unit *u) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(s);
+
+        switch (s->state) {
+
+        case SERVICE_STOP:
+        case SERVICE_STOP_WATCHDOG:
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_WATCHDOG:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+        case SERVICE_CLEANING:
+                /* We cannot fulfill this request right now, try again later please! */
+                return -EAGAIN;
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+                /* Already on it! */
+                return 0;
+
+        case SERVICE_AUTO_RESTART:
+        case SERVICE_DEAD_BEFORE_AUTO_RESTART:
+        case SERVICE_FAILED_BEFORE_AUTO_RESTART:
+                /* A service that will be restarted must be stopped first to trigger BindsTo and/or
+                 * OnFailure dependencies. If a user does not want to wait for the holdoff time to
+                 * elapse, the service should be manually restarted, not started. We simply return
+                 * EAGAIN here, so that any start jobs stay queued, and assume that the auto restart
+                 * timer will eventually trigger the restart. */
+                return -EAGAIN;
+
+        default:
+                break;
+        }
+
+        assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED, SERVICE_AUTO_RESTART_QUEUED));
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        /* Reset the per-invocation result and main PID tracking */
+        s->result = SERVICE_SUCCESS;
+        s->reload_result = SERVICE_SUCCESS;
+        s->main_pid_known = false;
+        s->main_pid_alien = false;
+        s->forbid_restart = false;
+
+        /* Forget the status reported by the previous invocation */
+        s->status_text = mfree(s->status_text);
+        s->status_errno = 0;
+
+        /* Drop runtime overrides of the notification access and notification state */
+        s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+        s->notify_state = NOTIFY_UNKNOWN;
+
+        /* Go back to the configured watchdog timeout */
+        s->watchdog_original_usec = s->watchdog_usec;
+        s->watchdog_override_enable = false;
+        s->watchdog_override_usec = USEC_INFINITY;
+
+        exec_command_reset_status_list_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+        exec_status_reset(&s->main_exec_status);
+
+        /* This is not an automatic restart? Flush the restart counter then */
+        if (s->flush_n_restarts) {
+                s->n_restarts = 0;
+                s->flush_n_restarts = false;
+        }
+
+        u->reset_accounting = true;
+
+        service_enter_condition(s);
+        return 1;
+}
+
+/* Stop request handler for service units. Returns 1 if the regular stop sequence was started and 0 in all
+ * other handled cases. Always sets forbid_restart, so that a manual stop never results in a restart job. */
+static int service_stop(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        /* Don't create restart jobs from manual stops. */
+        s->forbid_restart = true;
+
+        switch (s->state) {
+
+        case SERVICE_RUNNING:
+        case SERVICE_EXITED:
+                /* Regular case: go through the stop sequence. */
+                service_enter_stop(s, SERVICE_SUCCESS);
+                return 1;
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+        case SERVICE_RELOAD:
+        case SERVICE_RELOAD_SIGNAL:
+        case SERVICE_RELOAD_NOTIFY:
+        case SERVICE_STOP_WATCHDOG:
+                /* If there's already something running we go directly into kill mode. */
+                service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
+                return 0;
+
+        case SERVICE_CLEANING:
+                /* If we are currently cleaning, then abort it, brutally. */
+                service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS);
+                return 0;
+
+        case SERVICE_AUTO_RESTART:
+        case SERVICE_AUTO_RESTART_QUEUED:
+                /* Give up on the auto restart */
+                service_set_state(s, service_determine_dead_state(s));
+                return 0;
+
+        case SERVICE_STOP:
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_WATCHDOG:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+                /* Already on it */
+                return 0;
+
+        case SERVICE_DEAD_BEFORE_AUTO_RESTART:
+        case SERVICE_FAILED_BEFORE_AUTO_RESTART:
+        case SERVICE_DEAD:
+        case SERVICE_FAILED:
+        case SERVICE_DEAD_RESOURCES_PINNED:
+        default:
+                /* Unknown state, or unit_stop() should already have handled these */
+                assert_not_reached();
+        }
+}
+
+/* Reload request handler for service units: enters the reload phase and returns 1. */
+static int service_reload(Unit *u) {
+        Service *s;
+
+        s = SERVICE(u);
+        assert(s);
+
+        /* Reloading is only expected while the service is running or exited */
+        assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED));
+
+        service_enter_reload(s);
+
+        return 1;
+}
+
+/* Returns true if the service has reload commands configured or is of type SERVICE_NOTIFY_RELOAD. */
+static bool service_can_reload(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        if (s->exec_command[SERVICE_EXEC_RELOAD])
+                return true;
+
+        return s->type == SERVICE_NOTIFY_RELOAD;
+}
+
+/* Returns the position of 'current' within the command list of type 'id', determined by following
+ * command_prev links until the head of the list is reached. */
+static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, const ExecCommand *current) {
+        Service *s = SERVICE(u);
+        const ExecCommand *head, *c;
+        unsigned n = 0;
+
+        assert(s);
+        assert(id >= 0);
+        assert(id < _SERVICE_EXEC_COMMAND_MAX);
+
+        head = s->exec_command[id];
+
+        /* Count the steps needed to walk back to the beginning of the list */
+        c = current;
+        while (c != head) {
+                n++;
+                c = c->command_prev;
+        }
+
+        return n;
+}
+
+/* Writes one "control-command" or "main-command" item describing 'command': the command type, its index in
+ * the command list (prefixed with '+' if it is the last one), the escaped path and the escaped, quoted
+ * arguments. A NULL command is a no-op. Returns 0 on success or the result of log_oom() if memory runs
+ * out. */
+static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *command) {
+        _cleanup_free_ char *quoted_args = NULL, *escaped_path = NULL;
+        Service *s = SERVICE(u);
+        ServiceExecCommand id;
+        const char *key;
+        size_t used = 0;
+        unsigned idx;
+
+        assert(s);
+        assert(f);
+
+        if (!command)
+                return 0;
+
+        /* Anything that is not the current control command is treated as a main command */
+        if (command == s->control_command) {
+                key = "control-command";
+                id = s->control_command_id;
+        } else {
+                key = "main-command";
+                id = SERVICE_EXEC_START;
+        }
+
+        idx = service_exec_command_index(u, id, command);
+
+        /* Build a space separated list of the C-escaped arguments, each enclosed in double quotes */
+        STRV_FOREACH(arg, command->argv) {
+                _cleanup_free_ char *escaped = NULL;
+                size_t len;
+
+                escaped = cescape(*arg);
+                if (!escaped)
+                        return log_oom();
+
+                len = strlen(escaped);
+                if (!GREEDY_REALLOC(quoted_args, used + 2 + len + 2))
+                        return log_oom();
+
+                if (used > 0)
+                        quoted_args[used++] = ' ';
+
+                quoted_args[used++] = '"';
+                memcpy(quoted_args + used, escaped, len);
+                used += len;
+                quoted_args[used++] = '"';
+        }
+
+        /* Make room for and append the terminating NUL, also when there were no arguments at all */
+        if (!GREEDY_REALLOC(quoted_args, used + 1))
+                return log_oom();
+
+        quoted_args[used++] = 0;
+
+        escaped_path = cescape(command->path);
+        if (!escaped_path)
+                return log_oom();
+
+        /* We use '+1234' instead of '1234' to mark the last command in a sequence.
+         * This is used in service_deserialize_exec_command(). */
+        (void) serialize_item_format(
+                        f, key,
+                        "%s %s%u %s %s",
+                        service_exec_command_to_string(id),
+                        command->command_next ? "" : "+",
+                        idx,
+                        escaped_path, quoted_args);
+
+        return 0;
+}
+
+/* Writes the runtime state of the service to 'f' as key/value items and adds the file descriptors that have
+ * to be carried over to 'fds'. The items written here are read back by service_deserialize_item().
+ *
+ * Returns 0 on success. A negative value is returned if serializing the status text, one of the file
+ * descriptors, the accept socket or an fd store entry fails; failures of the remaining items are ignored. */
+static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(u);
+        assert(f);
+        assert(fds);
+
+        (void) serialize_item(f, "state", service_state_to_string(s->state));
+        (void) serialize_item(f, "result", service_result_to_string(s->result));
+        (void) serialize_item(f, "reload-result", service_result_to_string(s->reload_result));
+
+        (void) serialize_pidref(f, fds, "control-pid", &s->control_pid);
+        if (s->main_pid_known)
+                (void) serialize_pidref(f, fds, "main-pid", &s->main_pid);
+
+        (void) serialize_bool(f, "main-pid-known", s->main_pid_known);
+        (void) serialize_bool(f, "bus-name-good", s->bus_name_good);
+
+        /* The owner is a string and is read back verbatim with free_and_strdup() on deserialization, hence
+         * write the string itself rather than a boolean, which would be restored as a bogus owner name. */
+        if (s->bus_name_owner)
+                (void) serialize_item(f, "bus-name-owner", s->bus_name_owner);
+
+        (void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts);
+        (void) serialize_bool(f, "flush-n-restarts", s->flush_n_restarts);
+
+        r = serialize_item_escaped(f, "status-text", s->status_text);
+        if (r < 0)
+                return r;
+
+        /* Best effort: a command we fail to serialize simply won't be resumed after deserialization */
+        (void) service_serialize_exec_command(u, f, s->control_command);
+        (void) service_serialize_exec_command(u, f, s->main_command);
+
+        r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd);
+        if (r < 0)
+                return r;
+        r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd);
+        if (r < 0)
+                return r;
+        r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd);
+        if (r < 0)
+                return r;
+
+        if (s->exec_fd_event_source) {
+                r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source));
+                if (r < 0)
+                        return r;
+
+                (void) serialize_bool(f, "exec-fd-hot", s->exec_fd_hot);
+        }
+
+        if (UNIT_ISSET(s->accept_socket)) {
+                r = serialize_item(f, "accept-socket", UNIT_DEREF(s->accept_socket)->id);
+                if (r < 0)
+                        return r;
+        }
+
+        r = serialize_fd(f, fds, "socket-fd", s->socket_fd);
+        if (r < 0)
+                return r;
+
+        /* Each fd store entry is written as: <fd copy> "<escaped name>" <do_poll> */
+        LIST_FOREACH(fd_store, fs, s->fd_store) {
+                _cleanup_free_ char *c = NULL;
+                int copy;
+
+                copy = fdset_put_dup(fds, fs->fd);
+                if (copy < 0)
+                        return log_error_errno(copy, "Failed to copy file descriptor for serialization: %m");
+
+                c = cescape(fs->fdname);
+                if (!c)
+                        return log_oom();
+
+                (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll);
+        }
+
+        if (s->main_exec_status.pid > 0) {
+                (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
+                (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
+                (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
+
+                /* Exit code and status are only written once an exit timestamp has been recorded */
+                if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
+                        (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
+                        (void) serialize_item_format(f, "main-exec-status-status", "%i", s->main_exec_status.status);
+                }
+        }
+
+        if (s->notify_access_override >= 0)
+                (void) serialize_item(f, "notify-access-override", notify_access_to_string(s->notify_access_override));
+
+        (void) serialize_dual_timestamp(f, "watchdog-timestamp", &s->watchdog_timestamp);
+        (void) serialize_bool(f, "forbid-restart", s->forbid_restart);
+
+        if (s->watchdog_override_enable)
+                (void) serialize_item_format(f, "watchdog-override-usec", USEC_FMT, s->watchdog_override_usec);
+
+        if (s->watchdog_original_usec != USEC_INFINITY)
+                (void) serialize_item_format(f, "watchdog-original-usec", USEC_FMT, s->watchdog_original_usec);
+
+        if (s->reload_begin_usec != USEC_INFINITY)
+                (void) serialize_item_format(f, "reload-begin-usec", USEC_FMT, s->reload_begin_usec);
+
+        return 0;
+}
+
+/* Parses a serialized "main-command" or "control-command" value of the form
+ *
+ *         <command type> [+]<index> <path> <argv...>
+ *
+ * and, if a matching command is found among the commands currently loaded for the unit, makes it the
+ * current control or main command again. Returns 0 if the value could be parsed (regardless of whether a
+ * matching command was found) and a negative error if the value is malformed or memory runs out. */
+int service_deserialize_exec_command(
+                Unit *u,
+                const char *key,
+                const char *value) {
+
+        Service *s = SERVICE(u);
+        int r;
+        unsigned idx = 0, i;
+        bool control, found = false, last = false;
+        ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID;
+        ExecCommand *command = NULL;
+        _cleanup_free_ char *path = NULL;
+        _cleanup_strv_free_ char **argv = NULL;
+
+        /* Which field of the serialized value the next extracted word belongs to */
+        enum ExecCommandState {
+                STATE_EXEC_COMMAND_TYPE,
+                STATE_EXEC_COMMAND_INDEX,
+                STATE_EXEC_COMMAND_PATH,
+                STATE_EXEC_COMMAND_ARGS,
+                _STATE_EXEC_COMMAND_MAX,
+                _STATE_EXEC_COMMAND_INVALID = -EINVAL,
+        } state;
+
+        assert(s);
+        assert(key);
+        assert(value);
+
+        control = streq(key, "control-command");
+
+        state = STATE_EXEC_COMMAND_TYPE;
+
+        for (;;) {
+                _cleanup_free_ char *arg = NULL;
+
+                r = extract_first_word(&value, &arg, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                switch (state) {
+                case STATE_EXEC_COMMAND_TYPE:
+                        id = service_exec_command_from_string(arg);
+                        if (id < 0)
+                                return id;
+
+                        state = STATE_EXEC_COMMAND_INDEX;
+                        break;
+                case STATE_EXEC_COMMAND_INDEX:
+                        /* Index 1234 is serialized as either '1234' or '+1234'. The second form is used to
+                         * mark the last command in a sequence. We warn if the deserialized command doesn't
+                         * match what we have loaded from the unit, but we don't need to warn if that is the
+                         * last command. */
+
+                        r = safe_atou(arg, &idx);
+                        if (r < 0)
+                                return r;
+                        last = arg[0] == '+';
+
+                        state = STATE_EXEC_COMMAND_PATH;
+                        break;
+                case STATE_EXEC_COMMAND_PATH:
+                        /* Take ownership of the word, so that it is not freed when 'arg' goes out of scope */
+                        path = TAKE_PTR(arg);
+                        state = STATE_EXEC_COMMAND_ARGS;
+                        break;
+                case STATE_EXEC_COMMAND_ARGS:
+                        /* All remaining words are arguments; the state does not advance any further */
+                        r = strv_extend(&argv, arg);
+                        if (r < 0)
+                                return -ENOMEM;
+                        break;
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        /* Input that ended before a path was seen is incomplete */
+        if (state != STATE_EXEC_COMMAND_ARGS)
+                return -EINVAL;
+        if (strv_isempty(argv))
+                return -EINVAL; /* At least argv[0] must be always present. */
+
+        /* Let's check whether exec command on given offset matches data that we just deserialized */
+        for (command = s->exec_command[id], i = 0; command; command = command->command_next, i++) {
+                if (i != idx)
+                        continue;
+
+                found = strv_equal(argv, command->argv) && streq(command->path, path);
+                break;
+        }
+
+        if (!found) {
+                /* Command at the index we serialized is different, let's look for command that exactly
+                 * matches but is on different index. If there is no such command we will not resume execution. */
+                for (command = s->exec_command[id]; command; command = command->command_next)
+                        if (strv_equal(command->argv, argv) && streq(command->path, path))
+                                break;
+        }
+
+        /* At this point 'command' is either the matching command or NULL if there is none */
+        if (command && control) {
+                s->control_command = command;
+                s->control_command_id = id;
+        } else if (command)
+                s->main_command = command;
+        else if (last)
+                log_unit_debug(u, "Current command vanished from the unit file.");
+        else
+                log_unit_warning(u, "Current command vanished from the unit file, execution of the command list won't be resumed.");
+
+        return 0;
+}
+
+/* Restores a single serialized key/value item of the service, the counterpart to service_serialize().
+ * File descriptors referenced by the value are taken out of 'fds'.
+ *
+ * Deserialization is best effort: values that cannot be parsed and unknown keys are logged at debug level
+ * and otherwise ignored. Always returns 0. */
+static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+
+        if (streq(key, "state")) {
+                ServiceState state;
+
+                state = service_state_from_string(value);
+                if (state < 0)
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+                else
+                        s->deserialized_state = state;
+        } else if (streq(key, "result")) {
+                ServiceResult f;
+
+                /* A serialized success never overrides a result that is already set */
+                f = service_result_from_string(value);
+                if (f < 0)
+                        log_unit_debug(u, "Failed to parse result value: %s", value);
+                else if (f != SERVICE_SUCCESS)
+                        s->result = f;
+
+        } else if (streq(key, "reload-result")) {
+                ServiceResult f;
+
+                f = service_result_from_string(value);
+                if (f < 0)
+                        log_unit_debug(u, "Failed to parse reload result value: %s", value);
+                else if (f != SERVICE_SUCCESS)
+                        s->reload_result = f;
+
+        } else if (streq(key, "control-pid")) {
+                pidref_done(&s->control_pid);
+
+                (void) deserialize_pidref(fds, value, &s->control_pid);
+
+        } else if (streq(key, "main-pid")) {
+                _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+                if (deserialize_pidref(fds, value, &pidref) >= 0)
+                        (void) service_set_main_pidref(s, &pidref);
+
+        } else if (streq(key, "main-pid-known")) {
+                int b;
+
+                b = parse_boolean(value);
+                if (b < 0)
+                        log_unit_debug(u, "Failed to parse main-pid-known value: %s", value);
+                else
+                        s->main_pid_known = b;
+        } else if (streq(key, "bus-name-good")) {
+                int b;
+
+                b = parse_boolean(value);
+                if (b < 0)
+                        log_unit_debug(u, "Failed to parse bus-name-good value: %s", value);
+                else
+                        s->bus_name_good = b;
+        } else if (streq(key, "bus-name-owner")) {
+                r = free_and_strdup(&s->bus_name_owner, value);
+                if (r < 0)
+                        log_unit_error_errno(u, r, "Unable to deserialize current bus owner %s: %m", value);
+        } else if (streq(key, "status-text")) {
+                char *t;
+                ssize_t l;
+
+                l = cunescape(value, 0, &t);
+                if (l < 0)
+                        log_unit_debug_errno(u, l, "Failed to unescape status text '%s': %m", value);
+                else
+                        free_and_replace(s->status_text, t);
+
+        } else if (streq(key, "accept-socket")) {
+                Unit *socket;
+
+                r = manager_load_unit(u->manager, value, NULL, NULL, &socket);
+                if (r < 0) {
+                        log_unit_debug_errno(u, r, "Failed to load accept-socket unit '%s': %m", value);
+                        return 0;
+                }
+
+                /* Validate the unit we just loaded, not ourselves: 'u' is the service being deserialized,
+                 * hence checking its type against UNIT_SOCKET would reject every accept socket. */
+                if (socket->type != UNIT_SOCKET) {
+                        log_unit_debug(u, "Failed to deserialize accept-socket: unit '%s' is not a socket", value);
+                        return 0;
+                }
+
+                unit_ref_set(&s->accept_socket, u, socket);
+                SOCKET(socket)->n_connections++;
+
+        } else if (streq(key, "socket-fd")) {
+                asynchronous_close(s->socket_fd);
+                s->socket_fd = deserialize_fd(fds, value);
+
+        } else if (streq(key, "fd-store-fd")) {
+                _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL;
+                _cleanup_close_ int fd = -EBADF;
+                int do_poll;
+
+                /* Format: <fd> "<escaped name>" [<do_poll>] */
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value);
+                        return 0;
+                }
+
+                fd = deserialize_fd(fds, fdv);
+                if (fd < 0)
+                        return 0;
+
+                r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value);
+                        return 0;
+                }
+
+                r = extract_first_word(&value, &fdp, NULL, 0);
+                if (r == 0) {
+                        /* If the value is not present, we assume the default */
+                        do_poll = 1;
+                } else if (r < 0 || (r = safe_atoi(fdp, &do_poll)) < 0) {
+                        log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\", ignoring: %m", value);
+                        return 0;
+                }
+
+                r = service_add_fd_store(s, fd, fdn, do_poll);
+                if (r < 0) {
+                        log_unit_debug_errno(u, r, "Failed to store deserialized fd %i, ignoring: %m", fd);
+                        return 0;
+                }
+
+                /* The fd store owns the fd now, don't close it on return */
+                TAKE_FD(fd);
+        } else if (streq(key, "main-exec-status-pid")) {
+                pid_t pid;
+
+                if (parse_pid(value, &pid) < 0)
+                        log_unit_debug(u, "Failed to parse main-exec-status-pid value: %s", value);
+                else
+                        s->main_exec_status.pid = pid;
+        } else if (streq(key, "main-exec-status-code")) {
+                int i;
+
+                if (safe_atoi(value, &i) < 0)
+                        log_unit_debug(u, "Failed to parse main-exec-status-code value: %s", value);
+                else
+                        s->main_exec_status.code = i;
+        } else if (streq(key, "main-exec-status-status")) {
+                int i;
+
+                if (safe_atoi(value, &i) < 0)
+                        log_unit_debug(u, "Failed to parse main-exec-status-status value: %s", value);
+                else
+                        s->main_exec_status.status = i;
+        } else if (streq(key, "main-exec-status-start"))
+                deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
+        else if (streq(key, "main-exec-status-exit"))
+                deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
+        else if (streq(key, "notify-access-override")) {
+                NotifyAccess notify_access;
+
+                notify_access = notify_access_from_string(value);
+                if (notify_access < 0)
+                        log_unit_debug(u, "Failed to parse notify-access-override value: %s", value);
+                else
+                        s->notify_access_override = notify_access;
+        } else if (streq(key, "watchdog-timestamp"))
+                deserialize_dual_timestamp(value, &s->watchdog_timestamp);
+        else if (streq(key, "forbid-restart")) {
+                int b;
+
+                b = parse_boolean(value);
+                if (b < 0)
+                        log_unit_debug(u, "Failed to parse forbid-restart value: %s", value);
+                else
+                        s->forbid_restart = b;
+        } else if (streq(key, "stdin-fd")) {
+
+                asynchronous_close(s->stdin_fd);
+                s->stdin_fd = deserialize_fd(fds, value);
+                if (s->stdin_fd >= 0)
+                        s->exec_context.stdio_as_fds = true;
+
+        } else if (streq(key, "stdout-fd")) {
+
+                asynchronous_close(s->stdout_fd);
+                s->stdout_fd = deserialize_fd(fds, value);
+                if (s->stdout_fd >= 0)
+                        s->exec_context.stdio_as_fds = true;
+
+        } else if (streq(key, "stderr-fd")) {
+
+                asynchronous_close(s->stderr_fd);
+                s->stderr_fd = deserialize_fd(fds, value);
+                if (s->stderr_fd >= 0)
+                        s->exec_context.stdio_as_fds = true;
+
+        } else if (streq(key, "exec-fd")) {
+                _cleanup_close_ int fd = -EBADF;
+
+                fd = deserialize_fd(fds, value);
+                if (fd >= 0) {
+                        s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+                        /* Only give up ownership of the fd if the event source was set up for it */
+                        if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) >= 0)
+                                TAKE_FD(fd);
+                }
+
+        } else if (streq(key, "watchdog-override-usec")) {
+                if (deserialize_usec(value, &s->watchdog_override_usec) < 0)
+                        log_unit_debug(u, "Failed to parse watchdog_override_usec value: %s", value);
+                else
+                        s->watchdog_override_enable = true;
+
+        } else if (streq(key, "watchdog-original-usec")) {
+                if (deserialize_usec(value, &s->watchdog_original_usec) < 0)
+                        log_unit_debug(u, "Failed to parse watchdog_original_usec value: %s", value);
+
+        } else if (STR_IN_SET(key, "main-command", "control-command")) {
+                r = service_deserialize_exec_command(u, key, value);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to parse serialized command \"%s\": %m", value);
+
+        } else if (streq(key, "n-restarts")) {
+                r = safe_atou(value, &s->n_restarts);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value);
+
+        } else if (streq(key, "flush-n-restarts")) {
+                r = parse_boolean(value);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to parse serialized flush restart counter setting '%s': %m", value);
+                else
+                        s->flush_n_restarts = r;
+        } else if (streq(key, "reload-begin-usec")) {
+                r = deserialize_usec(value, &s->reload_begin_usec);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to parse serialized reload begin timestamp '%s', ignoring: %m", value);
+        } else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+/* Translates the service state into the generic unit active state. Services of type SERVICE_IDLE use
+ * their own translation table. */
+static UnitActiveState service_active_state(Unit *u) {
+        assert(u);
+
+        if (SERVICE(u)->type == SERVICE_IDLE)
+                return state_translation_table_idle[SERVICE(u)->state];
+
+        return state_translation_table[SERVICE(u)->state];
+}
+
+/* Returns the name of the current service state. */
+static const char *service_sub_state_to_string(Unit *u) {
+        Service *s;
+
+        assert(u);
+
+        s = SERVICE(u);
+        return service_state_to_string(s->state);
+}
+
+/* Tells whether the service may be garbage collected: only if neither the main nor the control process is
+ * around anymore and the service is in one of the final dead states. */
+static bool service_may_gc(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        /* Never clean up services that still have a process around, even if the service is formally dead.
+         * Note that unit_may_gc() already checked our cgroup for us, we just check our two additional
+         * PIDs, too, in case they have moved outside of the cgroup. */
+        if (main_pid_good(s) > 0)
+                return false;
+        if (control_pid_good(s) > 0)
+                return false;
+
+        /* Only allow collection of actually dead services, i.e. not those that are in the transitionary
+         * SERVICE_DEAD_BEFORE_AUTO_RESTART/SERVICE_FAILED_BEFORE_AUTO_RESTART states. */
+        return IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED);
+}
+
+/* Tries to load the PID file again. On success the PID file watch is dropped, the service enters the
+ * running state and 0 is returned; otherwise the error from service_load_pid_file() is returned. */
+static int service_retry_pid_file(Service *s) {
+        int r;
+
+        assert(s->pid_file);
+        assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
+
+        r = service_load_pid_file(s, false);
+        if (r >= 0) {
+                /* The PID file could be loaded, no need to watch it any longer */
+                service_unwatch_pid_file(s);
+
+                service_enter_running(s, SERVICE_SUCCESS);
+                return 0;
+        }
+
+        return r;
+}
+
+/* Sets up the watch on the PID file path and then immediately tries to read the PID file once. Returns 0
+ * once the watch is established (regardless of whether the PID file could be read), or the error from
+ * path_spec_watch(), in which case the watch is torn down again. */
+static int service_watch_pid_file(Service *s) {
+        PathSpec *spec = s->pid_file_pathspec;
+        int r;
+
+        log_unit_debug(UNIT(s), "Setting watch for PID file %s", spec->path);
+
+        r = path_spec_watch(spec, service_dispatch_inotify_io);
+        if (r >= 0) {
+                /* the pidfile might have appeared just before we set the watch */
+                log_unit_debug(UNIT(s), "Trying to read PID file %s in case it changed", spec->path);
+                (void) service_retry_pid_file(s);
+
+                return 0;
+        }
+
+        log_unit_error_errno(UNIT(s), r, "Failed to set a watch for PID file %s: %m", spec->path);
+        service_unwatch_pid_file(s);
+        return r;
+}
+
+/* Allocates the path spec used to watch the configured PID file, installs it on the service and starts
+ * watching. Returns -ENOMEM if an allocation fails, otherwise the result of service_watch_pid_file(). */
+static int service_demand_pid_file(Service *s) {
+        _cleanup_free_ PathSpec *spec = NULL;
+
+        assert(s->pid_file);
+        assert(!s->pid_file_pathspec);
+
+        spec = new(PathSpec, 1);
+        if (!spec)
+                return -ENOMEM;
+
+        /* PATH_CHANGED would not be enough. There are daemons (sendmail) that keep their PID file open
+         * all the time, hence PATH_MODIFIED is used. */
+        *spec = (PathSpec) {
+                .unit = UNIT(s),
+                .path = strdup(s->pid_file),
+                .type = PATH_MODIFIED,
+                .inotify_fd = -EBADF,
+        };
+
+        if (!spec->path)
+                return -ENOMEM;
+
+        path_simplify(spec->path);
+
+        /* Hand the path spec over to the service */
+        s->pid_file_pathspec = TAKE_PTR(spec);
+
+        return service_watch_pid_file(s);
+}
+
+/* Event callback for the inotify watch on the PID file: processes the event and tries to load the PID file
+ * again. If that does not succeed yet the watch is re-established; if processing the event or
+ * re-establishing the watch fails, the watch is dropped and the service is terminated with a resources
+ * failure. Always returns 0. */
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
+        PathSpec *p = ASSERT_PTR(userdata);
+        Service *s = SERVICE(p->unit);
+
+        assert(s);
+        assert(fd >= 0);
+        assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
+        assert(s->pid_file_pathspec);
+        assert(path_spec_owns_inotify_fd(s->pid_file_pathspec, fd));
+
+        log_unit_debug(UNIT(s), "inotify event");
+
+        if (path_spec_fd_event(p, events) >= 0) {
+                /* PID file could be loaded? Then we are done. */
+                if (service_retry_pid_file(s) == 0)
+                        return 0;
+
+                /* Not yet, keep watching */
+                if (service_watch_pid_file(s) >= 0)
+                        return 0;
+        }
+
+        service_unwatch_pid_file(s);
+        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+        return 0;
+}
+
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
+        Service *s = SERVICE(userdata);
+
+        assert(s);
+
+        log_unit_debug(UNIT(s), "got exec-fd event");
+
+        /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
+         * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
+         * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
+         * parent. We need to be careful however, as there are other reasons that might cause the child's side of
+         * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
+         * the child signalled us first that it is about to call execve(). It does so by sending us a simple
+         * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
+         * sends a zero byte we'll ignore POLLHUP on the fd again. */
+
+        for (;;) {
+                uint8_t x;
+                ssize_t n;
+
+                n = read(fd, &x, sizeof(x));
+                if (n < 0) {
+                        if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
+                                return 0;
+
+                        return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
+                }
+                if (n == 0) { /* EOF → the event we are waiting for */
+                        /* Whether or not this EOF counts, the event source is released here. */
+                        s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+                        if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
+                                log_unit_debug(UNIT(s), "Got EOF on exec-fd");
+
+                                s->exec_fd_hot = false;
+
+                                /* Nice! This is what we have been waiting for. Transition to next state. */
+                                if (s->type == SERVICE_EXEC && s->state == SERVICE_START)
+                                        service_enter_start_post(s);
+                        } else
+                                log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring.");
+
+                        return 0;
+                }
+
+                /* A byte was read → non-zero arms the EOF handling above, zero disarms it again */
+                assert(n == sizeof(x));
+                s->exec_fd_hot = x;
+        }
+        /* Not reached: the loop above is only ever left through one of its return statements. */
+        return 0;
+}
+
+static void service_notify_cgroup_empty_event(Unit *u) {
+        Service *s = SERVICE(u);
+        /* Handles the "control group is empty" notification; what happens depends on the current state. */
+        assert(u);
+
+        log_unit_debug(u, "Control group is empty.");
+
+        switch (s->state) {
+
+                /* Waiting for SIGCHLD is usually more interesting, because it includes return
+                 * codes/signals. Which is why we ignore the cgroup events for most cases, except when we
+                 * don't know which PID to expect the SIGCHLD for. */
+
+        case SERVICE_START:
+                if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) &&
+                    main_pid_good(s) == 0 &&
+                    control_pid_good(s) == 0) {
+                        /* No chance of getting a ready notification anymore */
+                        service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL);
+                        break;
+                }
+                /* With ExitType=cgroup an empty cgroup without a good main PID completes the start phase. */
+                if (s->exit_type == SERVICE_EXIT_CGROUP && main_pid_good(s) <= 0)
+                        service_enter_start_post(s);
+
+                _fallthrough_;
+        case SERVICE_START_POST:
+                if (s->pid_file_pathspec &&
+                    main_pid_good(s) == 0 &&
+                    control_pid_good(s) == 0) {
+
+                        /* Give up hoping for the daemon to write its PID file */
+                        log_unit_warning(u, "Daemon never wrote its PID file. Failing.");
+
+                        service_unwatch_pid_file(s);
+                        if (s->state == SERVICE_START)
+                                service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL);
+                        else
+                                service_enter_stop(s, SERVICE_FAILURE_PROTOCOL);
+                }
+                break;
+
+        case SERVICE_RUNNING:
+                /* service_enter_running() will figure out what to do */
+                service_enter_running(s, SERVICE_SUCCESS);
+                break;
+
+        case SERVICE_STOP_WATCHDOG:
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+
+                if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0)
+                        service_enter_stop_post(s, SERVICE_SUCCESS);
+
+                break;
+
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_WATCHDOG:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+                if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0)
+                        service_enter_dead(s, SERVICE_SUCCESS, true);
+
+                break;
+
+        /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean
+         * up the cgroup earlier and should do it now. */
+        case SERVICE_AUTO_RESTART:
+        case SERVICE_AUTO_RESTART_QUEUED:
+                unit_prune_cgroup(u);
+                break;
+
+        default:
+                ;
+        }
+}
+
+static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
+        Service *s = SERVICE(u);
+        /* Reacts to an OOM kill in the unit's cgroup according to s->oom_policy and the current state. */
+        if (managed_oom)
+                log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
+        else
+                log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+        /* With OOM_CONTINUE the event is only logged. */
+        if (s->oom_policy == OOM_CONTINUE)
+                return;
+
+        switch (s->state) {
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+        case SERVICE_STOP:
+                if (s->oom_policy == OOM_STOP)
+                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL);
+                else if (s->oom_policy == OOM_KILL)
+                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+                break;
+
+        case SERVICE_EXITED:
+        case SERVICE_RUNNING:
+                if (s->oom_policy == OOM_STOP)
+                        service_enter_stop(s, SERVICE_FAILURE_OOM_KILL);
+                else if (s->oom_policy == OOM_KILL)
+                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+                break;
+        /* In these stop states we go to SERVICE_STOP_SIGKILL without looking at OOM_STOP vs. OOM_KILL. */
+        case SERVICE_STOP_WATCHDOG:
+        case SERVICE_STOP_SIGTERM:
+                service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+                break;
+        /* Already in a SIGKILL state: only record the OOM kill, and only if no other result was set yet. */
+        case SERVICE_STOP_SIGKILL:
+        case SERVICE_FINAL_SIGKILL:
+                if (s->result == SERVICE_SUCCESS)
+                        s->result = SERVICE_FAILURE_OOM_KILL;
+                break;
+
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_SIGTERM:
+                service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+                break;
+
+        default:
+                ;
+        }
+}
+
+static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+        bool notify_dbus = true;
+        Service *s = SERVICE(u);
+        ServiceResult f;
+        ExitClean clean_mode;
+        /* Handles the exit (code/status) of process 'pid': records the result and advances the state machine. */
+        assert(s);
+        assert(pid >= 0);
+
+        /* Oneshot services and non-SERVICE_EXEC_START commands should not be
+         * considered daemons as they are typically not long running. */
+        if (s->type == SERVICE_ONESHOT || (s->control_pid.pid == pid && s->control_command_id != SERVICE_EXEC_START))
+                clean_mode = EXIT_CLEAN_COMMAND;
+        else
+                clean_mode = EXIT_CLEAN_DAEMON;
+        /* Map the exit to a ServiceResult: clean exits are a success, everything else is keyed on 'code'. */
+        if (is_clean_exit(code, status, clean_mode, &s->success_status))
+                f = SERVICE_SUCCESS;
+        else if (code == CLD_EXITED)
+                f = SERVICE_FAILURE_EXIT_CODE;
+        else if (code == CLD_KILLED)
+                f = SERVICE_FAILURE_SIGNAL;
+        else if (code == CLD_DUMPED)
+                f = SERVICE_FAILURE_CORE_DUMP;
+        else
+                assert_not_reached();
+        /* Dispatch on which of our tracked processes exited: the main PID, the control PID, or neither. */
+        if (s->main_pid.pid == pid) {
+                /* Clean up the exec_fd event source. We want to do this here, not later in
+                 * service_set_state(), because service_enter_stop_post() calls service_spawn().
+                 * The source owns its end of the pipe, so this will close that too. */
+                s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+                /* Forking services may occasionally move to a new PID.
+                 * As long as they update the PID file before exiting the old
+                 * PID, they're fine. */
+                if (service_load_pid_file(s, false) > 0)
+                        return;
+
+                pidref_done(&s->main_pid);
+                exec_status_exit(&s->main_exec_status, &s->exec_context, pid, code, status);
+
+                if (s->main_command) {
+                        /* If this is not a forking service then the
+                         * main process got started and hence we copy
+                         * the exit status so that it is recorded both
+                         * as main and as control process exit
+                         * status */
+
+                        s->main_command->exec_status = s->main_exec_status;
+
+                        if (s->main_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+                                f = SERVICE_SUCCESS;
+                } else if (s->exec_command[SERVICE_EXEC_START]) {
+
+                        /* If this is a forked process, then we should
+                         * ignore the return value if this was
+                         * configured for the starter process */
+
+                        if (s->exec_command[SERVICE_EXEC_START]->flags & EXEC_COMMAND_IGNORE_FAILURE)
+                                f = SERVICE_SUCCESS;
+                }
+
+                unit_log_process_exit(
+                                u,
+                                "Main process",
+                                service_exec_command_to_string(SERVICE_EXEC_START),
+                                f == SERVICE_SUCCESS,
+                                code, status);
+
+                if (s->result == SERVICE_SUCCESS)
+                        s->result = f;
+
+                if (s->main_command &&
+                    s->main_command->command_next &&
+                    s->type == SERVICE_ONESHOT &&
+                    f == SERVICE_SUCCESS) {
+
+                        /* There is another command to execute, so let's do that. */
+
+                        log_unit_debug(u, "Running next main command for state %s.", service_state_to_string(s->state));
+                        service_run_next_main(s);
+
+                } else {
+                        s->main_command = NULL;
+
+                        /* Services with ExitType=cgroup do not act on main PID exiting, unless the cgroup is
+                         * already empty */
+                        if (s->exit_type == SERVICE_EXIT_MAIN || cgroup_good(s) <= 0) {
+                                /* The service exited, so the service is officially gone. */
+                                switch (s->state) {
+
+                                case SERVICE_START_POST:
+                                case SERVICE_RELOAD:
+                                case SERVICE_RELOAD_SIGNAL:
+                                case SERVICE_RELOAD_NOTIFY:
+                                        /* If neither main nor control processes are running then the current
+                                         * state can never exit cleanly, hence immediately terminate the
+                                         * service. */
+                                        if (control_pid_good(s) <= 0)
+                                                service_enter_stop(s, f);
+
+                                        /* Otherwise need to wait until the operation is done. */
+                                        break;
+
+                                case SERVICE_STOP:
+                                        /* Need to wait until the operation is done. */
+                                        break;
+
+                                case SERVICE_START:
+                                        if (s->type == SERVICE_ONESHOT) {
+                                                /* This was our main goal, so let's go on */
+                                                if (f == SERVICE_SUCCESS)
+                                                        service_enter_start_post(s);
+                                                else
+                                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                                break;
+                                        } else if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD)) {
+                                                /* Only enter running through a notification, so that the
+                                                 * SERVICE_START state signifies that no ready notification
+                                                 * has been received */
+                                                if (f != SERVICE_SUCCESS)
+                                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                                else if (!s->remain_after_exit || service_get_notify_access(s) == NOTIFY_MAIN)
+                                                        /* The service has never been and will never be active */
+                                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL);
+                                                break;
+                                        }
+
+                                        _fallthrough_;
+                                case SERVICE_RUNNING:
+                                        service_enter_running(s, f);
+                                        break;
+
+                                case SERVICE_STOP_WATCHDOG:
+                                case SERVICE_STOP_SIGTERM:
+                                case SERVICE_STOP_SIGKILL:
+
+                                        if (control_pid_good(s) <= 0)
+                                                service_enter_stop_post(s, f);
+
+                                        /* If there is still a control process, wait for that first */
+                                        break;
+
+                                case SERVICE_STOP_POST:
+
+                                        if (control_pid_good(s) <= 0)
+                                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, f);
+
+                                        break;
+
+                                case SERVICE_FINAL_WATCHDOG:
+                                case SERVICE_FINAL_SIGTERM:
+                                case SERVICE_FINAL_SIGKILL:
+
+                                        if (control_pid_good(s) <= 0)
+                                                service_enter_dead(s, f, true);
+                                        break;
+
+                                default:
+                                        assert_not_reached();
+                                }
+                        } else if (s->exit_type == SERVICE_EXIT_CGROUP && s->state == SERVICE_START)
+                                /* If a main process exits very quickly, this function might be executed
+                                 * before service_dispatch_exec_io(). Since this function disabled IO events
+                                 * to monitor the main process above, we need to update the state here too.
+                                 * Let's consider the process is successfully launched and exited. */
+                                service_enter_start_post(s);
+                }
+
+        } else if (s->control_pid.pid == pid) {
+                const char *kind;
+                bool success;
+
+                pidref_done(&s->control_pid);
+
+                if (s->control_command) {
+                        exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+                        if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+                                f = SERVICE_SUCCESS;
+                }
+
+                /* ExecCondition= calls that exit with (0, 254] should invoke skip-like behavior instead of failing */
+                if (s->state == SERVICE_CONDITION) {
+                        if (f == SERVICE_FAILURE_EXIT_CODE && status < 255) {
+                                UNIT(s)->condition_result = false;
+                                f = SERVICE_SKIP_CONDITION;
+                                success = true;
+                        } else if (f == SERVICE_SUCCESS) {
+                                UNIT(s)->condition_result = true;
+                                success = true;
+                        } else
+                                success = false;
+
+                        kind = "Condition check process";
+                } else {
+                        kind = "Control process";
+                        success = f == SERVICE_SUCCESS;
+                }
+
+                unit_log_process_exit(
+                                u,
+                                kind,
+                                service_exec_command_to_string(s->control_command_id),
+                                success,
+                                code, status);
+
+                if (s->state != SERVICE_RELOAD && s->result == SERVICE_SUCCESS)
+                        s->result = f;
+
+                if (s->control_command &&
+                    s->control_command->command_next &&
+                    f == SERVICE_SUCCESS) {
+
+                        /* There is another command to execute, so let's do that. */
+
+                        log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state));
+                        service_run_next_control(s);
+
+                } else {
+                        /* No further commands for this step, so let's figure out what to do next */
+
+                        s->control_command = NULL;
+                        s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+                        log_unit_debug(u, "Got final SIGCHLD for state %s.", service_state_to_string(s->state));
+
+                        switch (s->state) {
+
+                        case SERVICE_CONDITION:
+                                if (f == SERVICE_SUCCESS)
+                                        service_enter_start_pre(s);
+                                else
+                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                break;
+
+                        case SERVICE_START_PRE:
+                                if (f == SERVICE_SUCCESS)
+                                        service_enter_start(s);
+                                else
+                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                break;
+
+                        case SERVICE_START:
+                                if (s->type != SERVICE_FORKING)
+                                        /* Maybe spurious event due to a reload that changed the type? */
+                                        break;
+
+                                if (f != SERVICE_SUCCESS) {
+                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                        break;
+                                }
+
+                                if (s->pid_file) {
+                                        bool has_start_post;
+                                        int r;
+
+                                        /* Let's try to load the pid file here if we can.
+                                         * The PID file might actually be created by a START_POST
+                                         * script. In that case don't worry if the loading fails. */
+
+                                        has_start_post = s->exec_command[SERVICE_EXEC_START_POST];
+                                        r = service_load_pid_file(s, !has_start_post);
+                                        if (!has_start_post && r < 0) {
+                                                r = service_demand_pid_file(s);
+                                                if (r < 0 || cgroup_good(s) == 0)
+                                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL);
+                                                break;
+                                        }
+                                } else
+                                        service_search_main_pid(s);
+
+                                service_enter_start_post(s);
+                                break;
+
+                        case SERVICE_START_POST:
+                                if (f != SERVICE_SUCCESS) {
+                                        service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                        break;
+                                }
+
+                                if (s->pid_file) {
+                                        int r;
+
+                                        r = service_load_pid_file(s, true);
+                                        if (r < 0) {
+                                                r = service_demand_pid_file(s);
+                                                if (r < 0 || cgroup_good(s) == 0)
+                                                        service_enter_stop(s, SERVICE_FAILURE_PROTOCOL);
+                                                break;
+                                        }
+                                } else
+                                        service_search_main_pid(s);
+
+                                service_enter_running(s, SERVICE_SUCCESS);
+                                break;
+
+                        case SERVICE_RELOAD:
+                        case SERVICE_RELOAD_SIGNAL:
+                        case SERVICE_RELOAD_NOTIFY:
+                                if (f == SERVICE_SUCCESS)
+                                        if (service_load_pid_file(s, true) < 0)
+                                                service_search_main_pid(s);
+
+                                s->reload_result = f;
+
+                                /* If the last notification we received from the service process indicates
+                                 * we are still reloading, then don't leave reloading state just yet, just
+                                 * transition into SERVICE_RELOAD_NOTIFY, to wait for the READY=1 coming,
+                                 * too. */
+                                if (s->notify_state == NOTIFY_RELOADING)
+                                        service_set_state(s, SERVICE_RELOAD_NOTIFY);
+                                else
+                                        service_enter_running(s, SERVICE_SUCCESS);
+                                break;
+
+                        case SERVICE_STOP:
+                                service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+                                break;
+
+                        case SERVICE_STOP_WATCHDOG:
+                        case SERVICE_STOP_SIGTERM:
+                        case SERVICE_STOP_SIGKILL:
+                                if (main_pid_good(s) <= 0)
+                                        service_enter_stop_post(s, f);
+
+                                /* If there is still a service process around, wait until
+                                 * that one quit, too */
+                                break;
+
+                        case SERVICE_STOP_POST:
+                                if (main_pid_good(s) <= 0)
+                                        service_enter_signal(s, SERVICE_FINAL_SIGTERM, f);
+                                break;
+
+                        case SERVICE_FINAL_WATCHDOG:
+                        case SERVICE_FINAL_SIGTERM:
+                        case SERVICE_FINAL_SIGKILL:
+                                if (main_pid_good(s) <= 0)
+                                        service_enter_dead(s, f, true);
+                                break;
+
+                        case SERVICE_CLEANING:
+
+                                if (s->clean_result == SERVICE_SUCCESS)
+                                        s->clean_result = f;
+
+                                service_enter_dead(s, SERVICE_SUCCESS, false);
+                                break;
+
+                        default:
+                                assert_not_reached();
+                        }
+                }
+        } else /* Neither control nor main PID? If so, don't notify about anything */
+                notify_dbus = false;
+
+        /* Notify clients about changed exit status */
+        if (notify_dbus)
+                unit_add_to_dbus_queue(u);
+
+        /* We watch the main/control process otherwise we can't retrieve the unit they
+         * belong to with cgroupv1. But if they are not our direct child, we won't get a
+         * SIGCHLD for them. Therefore we need to look for others to watch so we can
+         * detect when the cgroup becomes empty. Note that the control process is always
+         * our child so it's pointless to watch all other processes. */
+        if (!control_pid_good(s))
+                if (!s->main_pid_known || s->main_pid_alien)
+                        (void) unit_enqueue_rewatch_pids(u);
+}
+
+static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Service *s = SERVICE(userdata);
+
+        assert(s);
+        assert(source == s->timer_event_source);
+
+        switch (s->state) {
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+                switch (s->timeout_start_failure_mode) {
+
+                case SERVICE_TIMEOUT_TERMINATE:
+                        log_unit_warning(UNIT(s), "%s operation timed out. Terminating.", service_state_to_string(s->state));
+                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+                        break;
+
+                case SERVICE_TIMEOUT_ABORT:
+                        log_unit_warning(UNIT(s), "%s operation timed out. Aborting.", service_state_to_string(s->state));
+                        service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+                        break;
+
+                case SERVICE_TIMEOUT_KILL:
+                        if (s->kill_context.send_sigkill) {
+                                log_unit_warning(UNIT(s), "%s operation timed out. Killing.", service_state_to_string(s->state));
+                                service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                        } else {
+                                log_unit_warning(UNIT(s), "%s operation timed out. Skipping SIGKILL.", service_state_to_string(s->state));
+                                service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+                        }
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+                break;
+
+        case SERVICE_RUNNING:
+                log_unit_warning(UNIT(s), "Service reached runtime time limit. Stopping.");
+                service_enter_stop(s, SERVICE_FAILURE_TIMEOUT);
+                break;
+
+        case SERVICE_RELOAD:
+        case SERVICE_RELOAD_SIGNAL:
+        case SERVICE_RELOAD_NOTIFY:
+                log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process.");
+                service_kill_control_process(s);
+                s->reload_result = SERVICE_FAILURE_TIMEOUT;
+                service_enter_running(s, SERVICE_SUCCESS);
+                break;
+
+        case SERVICE_STOP:
+                switch (s->timeout_stop_failure_mode) {
+
+                case SERVICE_TIMEOUT_TERMINATE:
+                        log_unit_warning(UNIT(s), "Stopping timed out. Terminating.");
+                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+                        break;
+
+                case SERVICE_TIMEOUT_ABORT:
+                        log_unit_warning(UNIT(s), "Stopping timed out. Aborting.");
+                        service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+                        break;
+
+                case SERVICE_TIMEOUT_KILL:
+                        if (s->kill_context.send_sigkill) {
+                                log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+                                service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                        } else {
+                                log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
+                                service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+                        }
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+                break;
+
+        case SERVICE_STOP_WATCHDOG:
+                if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Killing.");
+                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Skipping SIGKILL.");
+                        service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+                }
+                break;
+
+        case SERVICE_STOP_SIGTERM:
+                if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) {
+                        log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Aborting.");
+                        service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+                } else if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Killing.");
+                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Skipping SIGKILL.");
+                        service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+                }
+
+                break;
+
+        case SERVICE_STOP_SIGKILL:
+                /* Uh, we sent a SIGKILL and it is still not gone?
+                 * Must be something we cannot kill, so let's just be
+                 * weirded out and continue */
+
+                log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring.");
+                service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+                break;
+
+        case SERVICE_STOP_POST:
+                switch (s->timeout_stop_failure_mode) {
+
+                case SERVICE_TIMEOUT_TERMINATE:
+                        log_unit_warning(UNIT(s), "State 'stop-post' timed out. Terminating.");
+                        service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+                        break;
+
+                case SERVICE_TIMEOUT_ABORT:
+                        log_unit_warning(UNIT(s), "State 'stop-post' timed out. Aborting.");
+                        service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+                        break;
+
+                case SERVICE_TIMEOUT_KILL:
+                        if (s->kill_context.send_sigkill) {
+                                log_unit_warning(UNIT(s), "State 'stop-post' timed out. Killing.");
+                                service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                        } else {
+                                log_unit_warning(UNIT(s), "State 'stop-post' timed out. Skipping SIGKILL. Entering failed mode.");
+                                service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+                        }
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+                break;
+
+        case SERVICE_FINAL_WATCHDOG:
+                if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Killing.");
+                        service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Skipping SIGKILL. Entering failed mode.");
+                        service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+                }
+                break;
+
+        case SERVICE_FINAL_SIGTERM:
+                if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) {
+                        log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Aborting.");
+                        service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+                } else if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Killing.");
+                        service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Skipping SIGKILL. Entering failed mode.");
+                        service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+                }
+
+                break;
+
+        case SERVICE_FINAL_SIGKILL:
+                log_unit_warning(UNIT(s), "Processes still around after final SIGKILL. Entering failed mode.");
+                service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, true);
+                break;
+
+        case SERVICE_AUTO_RESTART:
+                if (s->restart_usec > 0)
+                        log_unit_debug(UNIT(s),
+                                       "Service restart interval %s expired, scheduling restart.",
+                                       FORMAT_TIMESPAN(service_restart_usec_next(s), USEC_PER_SEC));
+                else
+                        log_unit_debug(UNIT(s),
+                                       "Service has no hold-off time (RestartSec=0), scheduling restart.");
+
+                service_enter_restart(s);
+                break;
+
+        case SERVICE_CLEANING:
+                log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+                if (s->clean_result == SERVICE_SUCCESS)
+                        s->clean_result = SERVICE_FAILURE_TIMEOUT;
+
+                service_enter_signal(s, SERVICE_FINAL_SIGKILL, 0);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+/* Dispatch callback for s->watchdog_event_source. If service watchdogs are enabled on the manager, the
+ * timeout is logged as an error and service_enter_signal() is called for SERVICE_STOP_WATCHDOG with result
+ * SERVICE_FAILURE_WATCHDOG; otherwise the timeout is only logged as a warning. Always returns 0. */
+static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) {
+        Service *s = SERVICE(userdata);
+
+        assert(s);
+        assert(source == s->watchdog_event_source);
+
+        usec_t limit = service_get_watchdog_usec(s);
+
+        /* Watchdog handling turned off on the manager: report the expiry, but take no action. */
+        if (!UNIT(s)->manager->service_watchdogs) {
+                log_unit_warning(UNIT(s), "Watchdog disabled! Ignoring watchdog timeout (limit %s)!",
+                                 FORMAT_TIMESPAN(limit, 1));
+                return 0;
+        }
+
+        log_unit_error(UNIT(s), "Watchdog timeout (limit %s)!",
+                       FORMAT_TIMESPAN(limit, 1));
+
+        service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+        return 0;
+}
+
+/* Decides whether a notification message sent by process 'pid' may be processed for this service, based on
+ * the access mode returned by service_get_notify_access(). Returns true if the sender is acceptable;
+ * otherwise logs a warning naming the permitted PID(s) and returns false. The 'fds' parameter is not
+ * consulted here. */
+static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) {
+        assert(s);
+
+        switch (service_get_notify_access(s)) {
+
+        case NOTIFY_NONE:
+                log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid);
+                return false;
+
+        case NOTIFY_MAIN:
+                /* Only the main PID may talk to us */
+                if (pid == s->main_pid.pid)
+                        return true;
+
+                if (pidref_is_set(&s->main_pid))
+                        log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
+                else
+                        log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid);
+
+                return false;
+
+        case NOTIFY_EXEC: {
+                /* Main PID and control PID may talk to us */
+                if (pid == s->main_pid.pid || pid == s->control_pid.pid)
+                        return true;
+
+                bool have_main = pidref_is_set(&s->main_pid),
+                        have_control = pidref_is_set(&s->control_pid);
+
+                if (have_main && have_control)
+                        log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT,
+                                         pid, s->main_pid.pid, s->control_pid.pid);
+                else if (have_main)
+                        log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
+                else if (have_control)
+                        log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid);
+                else
+                        log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid);
+
+                return false;
+        }
+
+        default:
+                /* Any other access mode places no restriction on the sender here */
+                return true;
+        }
+}
+
+/* Handles an explicit watchdog trigger: if service watchdogs are enabled on the manager, logs the request
+ * together with the last status text (empty string if none) and calls service_enter_signal() for
+ * SERVICE_STOP_WATCHDOG with result SERVICE_FAILURE_WATCHDOG. Does nothing otherwise. */
+static void service_force_watchdog(Service *s) {
+        if (UNIT(s)->manager->service_watchdogs) {
+                log_unit_error(UNIT(s), "Watchdog request (last status: %s)!",
+                               s->status_text ?: "");
+
+                service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+        }
+}
+
+/* Processes one notification message received for the service unit 'u'.
+ *
+ * u:     the service unit the message is for
+ * ucred: credentials of the sender; ucred->pid is checked by service_notify_message_authorized() and
+ *        ucred->uid decides whether a MAINPID= that does not look suitable is accepted anyway
+ * tags:  the fields of the message ("KEY=VALUE" strings)
+ * fds:   fd set that is passed on to service_add_fd_store_set() when FDSTORE=1 is present
+ *
+ * Fields are interpreted in this order: MAINPID=, MONOTONIC_USEC=, STOPPING=/READY=/RELOADING=, STATUS=,
+ * NOTIFYACCESS=, ERRNO=, EXTEND_TIMEOUT_USEC=, WATCHDOG=, WATCHDOG_USEC=, FDSTOREREMOVE=/FDSTORE= (together
+ * with FDNAME= and FDPOLL=). Fields that fail to parse are logged and skipped. If any handled field set
+ * notify_dbus, the unit is added to the D-Bus queue at the end. */
+static void service_notify_message(
+                Unit *u,
+                const struct ucred *ucred,
+                char * const *tags,
+                FDSet *fds) {
+
+        Service *s = SERVICE(u);
+        bool notify_dbus = false;
+        usec_t monotonic_usec = USEC_INFINITY;
+        const char *e;
+        int r;
+
+        assert(u);
+        assert(ucred);
+
+        if (!service_notify_message_authorized(s, ucred->pid, fds))
+                return;
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *cc = NULL;
+
+                cc = strv_join(tags, ", ");
+                log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, empty_to_na(cc));
+        }
+
+        /* Interpret MAINPID= */
+        e = strv_find_startswith(tags, "MAINPID=");
+        if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) {
+                _cleanup_(pidref_done) PidRef new_main_pid = PIDREF_NULL;
+
+                r = pidref_set_pidstr(&new_main_pid, e);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Failed to parse MAINPID=%s field in notification message, ignoring: %m", e);
+                else if (!s->main_pid_known || !pidref_equal(&new_main_pid, &s->main_pid)) {
+
+                        r = service_is_suitable_main_pid(s, &new_main_pid, LOG_WARNING);
+                        if (r == 0) {
+                                /* The new main PID is a bit suspicious, which is OK if the sender is privileged. */
+
+                                if (ucred->uid == 0) {
+                                        log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid.pid);
+                                        r = 1;
+                                } else
+                                        log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid);
+                        }
+                        if (r > 0) {
+                                (void) service_set_main_pidref(s, &new_main_pid);
+
+                                r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+                                if (r < 0)
+                                        log_unit_warning_errno(UNIT(s), r, "Failed to watch new main PID "PID_FMT" for service: %m", s->main_pid.pid);
+
+                                notify_dbus = true;
+                        }
+                }
+        }
+
+        /* Parse MONOTONIC_USEC= */
+        e = strv_find_startswith(tags, "MONOTONIC_USEC=");
+        if (e) {
+                r = safe_atou64(e, &monotonic_usec);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Failed to parse MONOTONIC_USEC= field in notification message, ignoring: %s", e);
+        }
+
+        /* Interpret READY=/STOPPING=/RELOADING=. STOPPING= wins over the others, and READY= over RELOADING= */
+        if (strv_contains(tags, "STOPPING=1")) {
+                s->notify_state = NOTIFY_STOPPING;
+
+                if (IN_SET(s->state, SERVICE_RUNNING, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+                        service_enter_stop_by_notify(s);
+
+                notify_dbus = true;
+
+        } else if (strv_contains(tags, "READY=1")) {
+
+                s->notify_state = NOTIFY_READY;
+
+                /* Type=notify services inform us about completed initialization with READY=1 */
+                if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) &&
+                    s->state == SERVICE_START)
+                        service_enter_start_post(s);
+
+                /* Sending READY=1 while we are reloading informs us that the reloading is complete. */
+                if (s->state == SERVICE_RELOAD_NOTIFY)
+                        service_enter_running(s, SERVICE_SUCCESS);
+
+                /* Combined RELOADING=1 and READY=1? Then this is indication that the service started and
+                 * immediately finished reloading. */
+                if (s->state == SERVICE_RELOAD_SIGNAL &&
+                    strv_contains(tags, "RELOADING=1") &&
+                    monotonic_usec != USEC_INFINITY &&
+                    monotonic_usec >= s->reload_begin_usec) {
+                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                        /* Propagate a reload explicitly */
+                        r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error);
+                        if (r < 0)
+                                log_unit_warning(UNIT(s), "Failed to schedule propagation of reload, ignoring: %s", bus_error_message(&error, r));
+
+                        service_enter_running(s, SERVICE_SUCCESS);
+                }
+
+                notify_dbus = true;
+
+        } else if (strv_contains(tags, "RELOADING=1")) {
+
+                s->notify_state = NOTIFY_RELOADING;
+
+                /* Sending RELOADING=1 after we send SIGHUP to request a reload will transition
+                 * things to "reload-notify" state, where we'll wait for READY=1 to let us know the
+                 * reload is done. Note that we insist on a timestamp being sent along here, so that
+                 * we know for sure this is a reload cycle initiated *after* we sent the signal */
+                if (s->state == SERVICE_RELOAD_SIGNAL &&
+                    monotonic_usec != USEC_INFINITY &&
+                    monotonic_usec >= s->reload_begin_usec)
+                        /* Note, we don't call service_enter_reload_by_notify() here, because we
+                         * don't need reload propagation nor do we want to restart the time-out. */
+                        service_set_state(s, SERVICE_RELOAD_NOTIFY);
+
+                if (s->state == SERVICE_RUNNING)
+                        service_enter_reload_by_notify(s);
+
+                notify_dbus = true;
+        }
+
+        /* Interpret STATUS= */
+        e = strv_find_startswith(tags, "STATUS=");
+        if (e) {
+                _cleanup_free_ char *t = NULL;
+
+                if (!isempty(e)) {
+                        /* Note that this size limit check is mostly paranoia: since the datagram size we are willing
+                         * to process is already limited to NOTIFY_BUFFER_MAX, this limit here should never be hit. */
+                        if (strlen(e) > STATUS_TEXT_MAX)
+                                log_unit_warning(u, "Status message overly long (%zu > %u), ignoring.", strlen(e), STATUS_TEXT_MAX);
+                        else if (!utf8_is_valid(e))
+                                log_unit_warning(u, "Status message in notification message is not UTF-8 clean, ignoring.");
+                        else {
+                                t = strdup(e);
+                                if (!t)
+                                        log_oom();
+                        }
+                }
+
+                if (!streq_ptr(s->status_text, t)) {
+                        free_and_replace(s->status_text, t);
+                        notify_dbus = true;
+                }
+        }
+
+        /* Interpret NOTIFYACCESS= */
+        e = strv_find_startswith(tags, "NOTIFYACCESS=");
+        if (e) {
+                NotifyAccess notify_access;
+
+                notify_access = notify_access_from_string(e);
+                if (notify_access < 0)
+                        /* Unparsable value: only log it. The negative error code is not a valid access
+                         * mode and must not reach service_override_notify_access(), in line with the
+                         * "ignoring" promise of the message. */
+                        log_unit_warning_errno(u, notify_access,
+                                               "Failed to parse NOTIFYACCESS= field value '%s' in notification message, ignoring: %m", e);
+                else if (service_get_notify_access(s) != notify_access) {
+                        /* We don't need to check whether the new access mode is more strict than what is
+                         * already in use, since only the privileged process is allowed to change it
+                         * in the first place. */
+                        service_override_notify_access(s, notify_access);
+                        notify_dbus = true;
+                }
+        }
+
+        /* Interpret ERRNO= */
+        e = strv_find_startswith(tags, "ERRNO=");
+        if (e) {
+                int status_errno;
+
+                status_errno = parse_errno(e);
+                if (status_errno < 0)
+                        log_unit_warning_errno(u, status_errno,
+                                               "Failed to parse ERRNO= field value '%s' in notification message: %m", e);
+                else if (s->status_errno != status_errno) {
+                        s->status_errno = status_errno;
+                        notify_dbus = true;
+                }
+        }
+
+        /* Interpret EXTEND_TIMEOUT= */
+        e = strv_find_startswith(tags, "EXTEND_TIMEOUT_USEC=");
+        if (e) {
+                usec_t extend_timeout_usec;
+                if (safe_atou64(e, &extend_timeout_usec) < 0)
+                        log_unit_warning(u, "Failed to parse EXTEND_TIMEOUT_USEC=%s", e);
+                else
+                        service_extend_timeout(s, extend_timeout_usec);
+        }
+
+        /* Interpret WATCHDOG= */
+        e = strv_find_startswith(tags, "WATCHDOG=");
+        if (e) {
+                if (streq(e, "1"))
+                        service_reset_watchdog(s);
+                else if (streq(e, "trigger"))
+                        service_force_watchdog(s);
+                else
+                        log_unit_warning(u, "Passed WATCHDOG= field is invalid, ignoring.");
+        }
+
+        e = strv_find_startswith(tags, "WATCHDOG_USEC=");
+        if (e) {
+                usec_t watchdog_override_usec;
+                if (safe_atou64(e, &watchdog_override_usec) < 0)
+                        log_unit_warning(u, "Failed to parse WATCHDOG_USEC=%s", e);
+                else
+                        service_override_watchdog_timeout(s, watchdog_override_usec);
+        }
+
+        /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases,
+         * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing
+         * fds, but optional when pushing in new fds, for compatibility reasons. */
+        if (strv_contains(tags, "FDSTOREREMOVE=1")) {
+                const char *name;
+
+                name = strv_find_startswith(tags, "FDNAME=");
+                if (!name || !fdname_is_valid(name))
+                        log_unit_warning(u, "FDSTOREREMOVE=1 requested, but no valid file descriptor name passed, ignoring.");
+                else
+                        service_remove_fd_store(s, name);
+
+        } else if (strv_contains(tags, "FDSTORE=1")) {
+                const char *name;
+
+                name = strv_find_startswith(tags, "FDNAME=");
+                if (name && !fdname_is_valid(name)) {
+                        log_unit_warning(u, "Passed FDNAME= name is invalid, ignoring.");
+                        name = NULL;
+                }
+
+                (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0"));
+        }
+
+        /* Notify clients about changed status or main pid */
+        if (notify_dbus)
+                unit_add_to_dbus_queue(u);
+}
+
+/* Reports the time the service's timer event source is currently set to.
+ *
+ * Returns 1 and stores the time in *timeout if a timer exists and its time is not USEC_INFINITY, 0 if
+ * there is no timer or its time is USEC_INFINITY (*timeout is left untouched), or the negative error
+ * returned by sd_event_source_get_time(). */
+static int service_get_timeout(Unit *u, usec_t *timeout) {
+        Service *s = SERVICE(u);
+        uint64_t when;
+        int r;
+
+        if (s->timer_event_source) {
+                r = sd_event_source_get_time(s->timer_event_source, &when);
+                if (r < 0)
+                        return r;
+
+                if (when != USEC_INFINITY) {
+                        *timeout = when;
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+
+/* Returns the timeout_start_usec value stored for the service unit 'u', which must not be NULL. */
+static usec_t service_get_timeout_start_usec(Unit *u) {
+        assert(u);
+
+        return SERVICE(u)->timeout_start_usec;
+}
+
+/* Tells whether the main PID should be looked up via the owner of the service's D-Bus name: true only if
+ * no main PID is set yet and the service is in one of the start, start-post, running or reload states. */
+static bool pick_up_pid_from_bus_name(Service *s) {
+        assert(s);
+
+        /* We already know the main PID, nothing to pick up */
+        if (pidref_is_set(&s->main_pid))
+                return false;
+
+        return IN_SET(s->state,
+                      SERVICE_START,
+                      SERVICE_START_POST,
+                      SERVICE_RUNNING,
+                      SERVICE_RELOAD,
+                      SERVICE_RELOAD_SIGNAL,
+                      SERVICE_RELOAD_NOTIFY);
+}
+
+/* Reply handler for the asynchronous GetConnectionUnixProcessID() call issued by
+ * service_bus_name_owner_change(). Drops the lookup slot, and, if the service still has a bus name and
+ * still wants a main PID from it, reads the PID from the reply, installs it as main PID and starts
+ * watching it. Problems are logged only; the function always returns 1. */
+static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, sd_bus_error *ret_error) {
+        _cleanup_(pidref_done) PidRef owner = PIDREF_NULL;
+        Unit *u = ASSERT_PTR(userdata);
+        Service *s = SERVICE(u);
+        const sd_bus_error *error;
+        uint32_t raw_pid;
+        int r;
+
+        assert(reply);
+
+        /* The call is complete, release the slot before doing anything else */
+        s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+        if (!s->bus_name || !pick_up_pid_from_bus_name(s))
+                return 1;
+
+        error = sd_bus_message_get_error(reply);
+        if (error) {
+                r = sd_bus_error_get_errno(error);
+                log_warning_errno(r, "GetConnectionUnixProcessID() failed: %s", bus_error_message(error, r));
+                return 1;
+        }
+
+        r = sd_bus_message_read(reply, "u", &raw_pid);
+        if (r < 0) {
+                bus_log_parse_error(r);
+                return 1;
+        }
+
+        r = pidref_set_pid(&owner, raw_pid);
+        if (r < 0) {
+                log_debug_errno(r, "GetConnectionUnixProcessID() returned invalid PID: %m");
+                return 1;
+        }
+
+        log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, owner.pid);
+
+        (void) service_set_main_pidref(s, &owner);
+        (void) unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+        return 1;
+}
+
+/* Reacts to a change of the owner of the service's D-Bus name. 'new_owner' is the new owner, or NULL if
+ * the name is not owned anymore. The owner is recorded in s->bus_name_good and s->bus_name_owner. For
+ * services of type SERVICE_DBUS the state machine is advanced; for other types an asynchronous
+ * GetConnectionUnixProcessID() call is issued if pick_up_pid_from_bus_name() says a main PID is wanted. */
+static void service_bus_name_owner_change(Unit *u, const char *new_owner) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(s);
+
+        if (!new_owner)
+                log_unit_debug(u, "D-Bus name %s now not owned by anyone.", s->bus_name);
+        else
+                log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner);
+
+        s->bus_name_good = new_owner;
+
+        /* Track the current owner, so we can reconstruct changes after a daemon reload */
+        r = free_and_strdup(&s->bus_name_owner, new_owner);
+        if (r < 0) {
+                log_unit_error_errno(u, r, "Unable to set new bus name owner %s: %m", new_owner);
+                return;
+        }
+
+        if (s->type == SERVICE_DBUS) {
+                /* service_enter_running() will figure out what to do */
+                if (s->state == SERVICE_RUNNING)
+                        service_enter_running(s, SERVICE_SUCCESS);
+                else if (s->state == SERVICE_START && new_owner)
+                        service_enter_start_post(s);
+
+                return;
+        }
+
+        if (!new_owner || !pick_up_pid_from_bus_name(s))
+                return;
+
+        /* Try to acquire PID from bus service. Drop any lookup that might still be pending first. */
+        s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+        r = sd_bus_call_method_async(
+                        u->manager->api_bus,
+                        &s->bus_name_pid_lookup_slot,
+                        "org.freedesktop.DBus",
+                        "/org/freedesktop/DBus",
+                        "org.freedesktop.DBus",
+                        "GetConnectionUnixProcessID",
+                        bus_name_pid_lookup_callback,
+                        s,
+                        "s",
+                        s->bus_name);
+        if (r < 0)
+                log_debug_errno(r, "Failed to request owner PID of service name, ignoring: %m");
+}
+
+/* Attaches a connection fd to a service instance. Called by the socket code when instantiating a new
+ * service for a stream socket. Ownership of 'fd' passes to the service on success.
+ *
+ * Returns 0 on success, -EINVAL if the unit is not loaded, -EBUSY if a socket fd is already set, -EAGAIN
+ * if the service is not in SERVICE_DEAD or SERVICE_DEAD_RESOURCES_PINNED state, -ENOMEM on allocation
+ * failure, or the error of unit_set_description()/unit_add_two_dependencies(). */
+int service_set_socket_fd(
+                Service *s,
+                int fd,
+                Socket *sock,
+                SocketPeer *peer,
+                bool selinux_context_net) {
+
+        _cleanup_free_ char *peer_text = NULL;
+        int r;
+
+        assert(s);
+        assert(fd >= 0);
+
+        if (UNIT(s)->load_state != UNIT_LOADED)
+                return -EINVAL;
+
+        if (s->socket_fd >= 0)
+                return -EBUSY;
+
+        assert(!s->socket_peer);
+
+        if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED))
+                return -EAGAIN;
+
+        /* If the peer can be determined, make it part of the unit description: either appended in
+         * parentheses to the existing description, or on its own if there is none yet. */
+        if (getpeername_pretty(fd, true, &peer_text) >= 0) {
+                _cleanup_free_ char *joined = NULL;
+                const char *description = peer_text;
+
+                if (UNIT(s)->description) {
+                        joined = strjoin(UNIT(s)->description, " (", peer_text, ")");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        description = joined;
+                }
+
+                r = unit_set_description(UNIT(s), description);
+                if (r < 0)
+                        return r;
+        }
+
+        r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT);
+        if (r < 0)
+                return r;
+
+        /* Nothing can fail anymore from here on, hence take possession of the fd and the peer now */
+        s->socket_fd = fd;
+        s->socket_peer = socket_peer_ref(peer);
+        s->socket_fd_selinux_context_net = selinux_context_net;
+
+        unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock));
+        return 0;
+}
+
+/* Clears the failure bookkeeping of the service: a unit in SERVICE_FAILED state is moved to the state
+ * picked by service_determine_dead_state(), all three result fields are set back to SERVICE_SUCCESS and
+ * the restart counter and its flush flag are reset. */
+static void service_reset_failed(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        if (s->state == SERVICE_FAILED)
+                service_set_state(s, service_determine_dead_state(s));
+
+        /* Forget about earlier restarts */
+        s->flush_n_restarts = false;
+        s->n_restarts = 0;
+
+        /* Forget about earlier results */
+        s->clean_result = SERVICE_SUCCESS;
+        s->reload_result = SERVICE_SUCCESS;
+        s->result = SERVICE_SUCCESS;
+}
+
+/* Returns a pointer to the main PID reference embedded in the service object of 'u'. */
+static PidRef* service_main_pid(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        return &s->main_pid;
+}
+
+/* Returns a pointer to the control PID reference embedded in the service object of 'u'. */
+static PidRef* service_control_pid(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        return &s->control_pid;
+}
+
+/* Service-specific replacement for the generic unit_needs_console(): returns true only if the exec
+ * context may touch the console and the service is in one of the states listed below. SERVICE_EXITED is
+ * intentionally not among them. */
+static bool service_needs_console(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        if (!exec_context_may_touch_console(&s->exec_context))
+                return false;
+
+        switch (s->state) {
+
+        case SERVICE_CONDITION:
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+        case SERVICE_RUNNING:
+        case SERVICE_RELOAD:
+        case SERVICE_RELOAD_SIGNAL:
+        case SERVICE_RELOAD_NOTIFY:
+        case SERVICE_STOP:
+        case SERVICE_STOP_WATCHDOG:
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_WATCHDOG:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+/* Returns the exit status recorded for the main process of the service.
+ *
+ * Returns -ENODATA if no main process PID is recorded or its exit timestamp is not set, -EBADE if the
+ * recorded code is not CLD_EXITED, and the recorded status value otherwise. */
+static int service_exit_status(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(u);
+
+        if (s->main_exec_status.pid <= 0)
+                return -ENODATA;
+
+        if (!dual_timestamp_is_set(&s->main_exec_status.exit_timestamp))
+                return -ENODATA;
+
+        if (s->main_exec_status.code != CLD_EXITED)
+                return -EBADE;
+
+        return s->main_exec_status.status;
+}
+
+/* Returns the status text currently stored for the service (may be NULL). */
+static const char* service_status_text(Unit *u) {
+        return ASSERT_PTR(SERVICE(u))->status_text;
+}
+
+/* Starts cleaning up the resources of the service selected by 'mask' (which must be non-zero).
+ *
+ * Returns -EBUSY unless the service is in SERVICE_DEAD or SERVICE_DEAD_RESOURCES_PINNED state, and
+ * -EUNATCH if there is neither a directory nor an fd store to clean. The fd store is released right
+ * away. If directories have to be removed, a timer is armed with timeout_clean_usec, the removal is
+ * handed to unit_fork_and_watch_rm_rf() (tracked via s->control_pid) and the service enters
+ * SERVICE_CLEANING. Returns 0 on success or a negative error otherwise. */
+static int service_clean(Unit *u, ExecCleanMask mask) {
+        _cleanup_strv_free_ char **l = NULL;
+        bool may_clean_fdstore = false;
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(s);
+        assert(mask != 0);
+
+        if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED))
+                return -EBUSY;
+
+        /* Determine if there's anything we could potentially clean */
+        r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+        if (r < 0)
+                return r;
+
+        if (mask & EXEC_CLEAN_FDSTORE)
+                may_clean_fdstore = s->n_fd_store > 0 || s->n_fd_store_max > 0;
+
+        if (strv_isempty(l) && !may_clean_fdstore)
+                return -EUNATCH; /* Nothing to potentially clean */
+
+        /* Let's clean the stuff we can clean quickly */
+        if (may_clean_fdstore)
+                service_release_fd_store(s);
+
+        /* If we are done, leave quickly */
+        if (strv_isempty(l)) {
+                /* With the fd store gone there is nothing pinned anymore, hence drop to plain dead */
+                if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !s->fd_store)
+                        service_set_state(s, SERVICE_DEAD);
+                return 0;
+        }
+
+        /* We need to clean disk stuff. This is slow, hence do it out of process, and change state */
+        service_unwatch_control_pid(s);
+        s->clean_result = SERVICE_SUCCESS;
+        s->control_command = NULL;
+        s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+        r = service_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to install timer: %m");
+                goto fail;
+        }
+
+        r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+                goto fail;
+        }
+
+        service_set_state(s, SERVICE_CLEANING);
+        return 0;
+
+fail:
+        /* Record the failure and get rid of the timer again; the state is left unchanged */
+        s->clean_result = SERVICE_FAILURE_RESOURCES;
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+        return r;
+}
+
+/* Computes which kinds of resources of the service can be cleaned: the mask reported by
+ * exec_context_get_clean_mask(), plus EXEC_CLEAN_FDSTORE if an fd store is configured
+ * (n_fd_store_max > 0). Stores the result in *ret and returns 0, or returns a negative error. */
+static int service_can_clean(Unit *u, ExecCleanMask *ret) {
+        Service *s = SERVICE(u);
+        ExecCleanMask supported = 0;
+        int r;
+
+        assert(s);
+        assert(ret);
+
+        r = exec_context_get_clean_mask(&s->exec_context, &supported);
+        if (r < 0)
+                return r;
+
+        *ret = supported | (s->n_fd_store_max > 0 ? EXEC_CLEAN_FDSTORE : 0);
+        return 0;
+}
+
+/* Returns a service-specific format string for a finished job, or NULL to request the generic message.
+ * Only a successfully completed start job of a oneshot service gets a specific message. */
+static const char *service_finished_job(Unit *u, JobType t, JobResult result) {
+        if (t != JOB_START || result != JOB_DONE)
+                return NULL; /* Fall back to generic */
+
+        if (SERVICE(u)->type != SERVICE_ONESHOT)
+                return NULL; /* Fall back to generic */
+
+        return "Finished %s.";
+}
+
+/* Checks whether the service may be started right now. Returns 1 if so. If unit_test_start_limit()
+ * fails, the service enters the dead state with SERVICE_FAILURE_START_LIMIT_HIT and that error is
+ * returned. */
+static int service_can_start(Unit *u) {
+        Service *s = SERVICE(u);
+        int r;
+
+        assert(s);
+
+        /* Make sure we don't enter a busy loop of some kind. */
+        r = unit_test_start_limit(u);
+        if (r >= 0)
+                return 1;
+
+        service_enter_dead(s, SERVICE_FAILURE_START_LIMIT_HIT, false);
+        return r;
+}
+
+/* Invoked by the unit state engine once the unit is dead and has no job left, i.e. when it is a good
+ * time to release resources. Releases the socket fd and the stdio fds, releases the fd store unless it is
+ * to be preserved unconditionally, and leaves SERVICE_DEAD_RESOURCES_PINNED for SERVICE_DEAD once no fd
+ * store is left. */
+static void service_release_resources(Unit *u) {
+        assert(u);
+
+        Service *s = SERVICE(u);
+
+        /* Only act in a permanent dead/failed state, not in the transitionary ones that precede an
+         * automatic restart (SERVICE_DEAD_BEFORE_AUTO_RESTART/SERVICE_FAILED_BEFORE_AUTO_RESTART). */
+        if (s->state != SERVICE_DEAD &&
+            s->state != SERVICE_FAILED &&
+            s->state != SERVICE_DEAD_RESOURCES_PINNED)
+                return;
+
+        log_unit_debug(u, "Releasing resources...");
+
+        service_release_socket_fd(s);
+        service_release_stdio_fd(s);
+
+        if (s->fd_store_preserve_mode != EXEC_PRESERVE_YES)
+                service_release_fd_store(s);
+
+        /* Nothing pinned anymore? Then we are just dead. */
+        if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !s->fd_store)
+                service_set_state(s, SERVICE_DEAD);
+}
+
+/* String names of the ServiceRestart values, indexed by enum value. */
+static const char* const service_restart_table[_SERVICE_RESTART_MAX] = {
+        [SERVICE_RESTART_NO]          = "no",
+        [SERVICE_RESTART_ON_SUCCESS]  = "on-success",
+        [SERVICE_RESTART_ON_FAILURE]  = "on-failure",
+        [SERVICE_RESTART_ON_ABNORMAL] = "on-abnormal",
+        [SERVICE_RESTART_ON_WATCHDOG] = "on-watchdog",
+        [SERVICE_RESTART_ON_ABORT]    = "on-abort",
+        [SERVICE_RESTART_ALWAYS]      = "always",
+};
+
+/* NOTE(review): presumably defines the string<->enum lookup helpers on top of the table above; confirm
+ * against the macro definition. */
+DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart);
+
+static const char* const service_restart_mode_table[_SERVICE_RESTART_MODE_MAX] = { /* string form of each ServiceRestartMode value */
+        [SERVICE_RESTART_MODE_NORMAL] = "normal",
+        [SERVICE_RESTART_MODE_DIRECT]  = "direct",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_restart_mode, ServiceRestartMode); /* backs service_restart_mode_to_string()/_from_string() */
+
+static const char* const service_type_table[_SERVICE_TYPE_MAX] = { /* string form of each ServiceType value */
+        [SERVICE_SIMPLE]        = "simple",
+        [SERVICE_FORKING]       = "forking",
+        [SERVICE_ONESHOT]       = "oneshot",
+        [SERVICE_DBUS]          = "dbus",
+        [SERVICE_NOTIFY]        = "notify",
+        [SERVICE_NOTIFY_RELOAD] = "notify-reload",
+        [SERVICE_IDLE]          = "idle",
+        [SERVICE_EXEC]          = "exec",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType); /* backs service_type_to_string()/_from_string() */
+
+static const char* const service_exit_type_table[_SERVICE_EXIT_TYPE_MAX] = { /* string form of each ServiceExitType value */
+        [SERVICE_EXIT_MAIN]   = "main",
+        [SERVICE_EXIT_CGROUP] = "cgroup",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exit_type, ServiceExitType); /* backs service_exit_type_to_string()/_from_string() */
+
+static const char* const service_exec_command_table[_SERVICE_EXEC_COMMAND_MAX] = { /* plain name of each ServiceExecCommand slot */
+        [SERVICE_EXEC_CONDITION]  = "ExecCondition",
+        [SERVICE_EXEC_START_PRE]  = "ExecStartPre",
+        [SERVICE_EXEC_START]      = "ExecStart",
+        [SERVICE_EXEC_START_POST] = "ExecStartPost",
+        [SERVICE_EXEC_RELOAD]     = "ExecReload",
+        [SERVICE_EXEC_STOP]       = "ExecStop",
+        [SERVICE_EXEC_STOP_POST]  = "ExecStopPost",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exec_command, ServiceExecCommand); /* backs service_exec_command_to_string()/_from_string() */
+
+static const char* const service_exec_ex_command_table[_SERVICE_EXEC_COMMAND_MAX] = { /* same slots as service_exec_command_table, with an "Ex" suffix on each name */
+        [SERVICE_EXEC_CONDITION]  = "ExecConditionEx",
+        [SERVICE_EXEC_START_PRE]  = "ExecStartPreEx",
+        [SERVICE_EXEC_START]      = "ExecStartEx",
+        [SERVICE_EXEC_START_POST] = "ExecStartPostEx",
+        [SERVICE_EXEC_RELOAD]     = "ExecReloadEx",
+        [SERVICE_EXEC_STOP]       = "ExecStopEx",
+        [SERVICE_EXEC_STOP_POST]  = "ExecStopPostEx",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exec_ex_command, ServiceExecCommand); /* backs service_exec_ex_command_to_string()/_from_string() */
+
+static const char* const notify_state_table[_NOTIFY_STATE_MAX] = { /* string form of each NotifyState value */
+        [NOTIFY_UNKNOWN]   = "unknown",
+        [NOTIFY_READY]     = "ready",
+        [NOTIFY_RELOADING] = "reloading",
+        [NOTIFY_STOPPING]  = "stopping",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(notify_state, NotifyState); /* backs notify_state_to_string()/_from_string() */
+
+static const char* const service_result_table[_SERVICE_RESULT_MAX] = { /* string form of each ServiceResult value */
+        [SERVICE_SUCCESS]                 = "success",
+        [SERVICE_FAILURE_RESOURCES]       = "resources",
+        [SERVICE_FAILURE_PROTOCOL]        = "protocol",
+        [SERVICE_FAILURE_TIMEOUT]         = "timeout",
+        [SERVICE_FAILURE_EXIT_CODE]       = "exit-code",
+        [SERVICE_FAILURE_SIGNAL]          = "signal",
+        [SERVICE_FAILURE_CORE_DUMP]       = "core-dump",
+        [SERVICE_FAILURE_WATCHDOG]        = "watchdog",
+        [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+        [SERVICE_FAILURE_OOM_KILL]        = "oom-kill",
+        [SERVICE_SKIP_CONDITION]          = "exec-condition", /* note: the string differs from the enum name */
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult); /* backs service_result_to_string()/_from_string() */
+
+static const char* const service_timeout_failure_mode_table[_SERVICE_TIMEOUT_FAILURE_MODE_MAX] = { /* string form of each ServiceTimeoutFailureMode value */
+        [SERVICE_TIMEOUT_TERMINATE] = "terminate",
+        [SERVICE_TIMEOUT_ABORT]     = "abort",
+        [SERVICE_TIMEOUT_KILL]      = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_timeout_failure_mode, ServiceTimeoutFailureMode); /* backs service_timeout_failure_mode_to_string()/_from_string() */
+
+const UnitVTable service_vtable = { /* operations table of the service unit type; exported via service.h */
+        .object_size = sizeof(Service),
+        .exec_context_offset = offsetof(Service, exec_context),
+        .cgroup_context_offset = offsetof(Service, cgroup_context),
+        .kill_context_offset = offsetof(Service, kill_context),
+        .exec_runtime_offset = offsetof(Service, exec_runtime),
+
+        .sections = /* NUL-separated list of section names */
+                "Unit\0"
+                "Service\0"
+                "Install\0",
+        .private_section = "Service",
+
+        .can_transient = true,
+        .can_delegate = true,
+        .can_fail = true,
+        .can_set_managed_oom = true,
+
+        .init = service_init,
+        .done = service_done,
+        .load = service_load,
+        .release_resources = service_release_resources,
+
+        .coldplug = service_coldplug,
+
+        .dump = service_dump,
+
+        .start = service_start,
+        .stop = service_stop,
+        .reload = service_reload,
+
+        .can_reload = service_can_reload,
+
+        .clean = service_clean,
+        .can_clean = service_can_clean,
+
+        .freeze = unit_freeze_vtable_common,
+        .thaw = unit_thaw_vtable_common,
+
+        .serialize = service_serialize,
+        .deserialize_item = service_deserialize_item,
+
+        .active_state = service_active_state,
+        .sub_state_to_string = service_sub_state_to_string,
+
+        .will_restart = service_will_restart,
+
+        .may_gc = service_may_gc,
+
+        .sigchld_event = service_sigchld_event,
+
+        .reset_failed = service_reset_failed,
+
+        .notify_cgroup_empty = service_notify_cgroup_empty_event,
+        .notify_cgroup_oom = service_notify_cgroup_oom_event,
+        .notify_message = service_notify_message,
+
+        .main_pid = service_main_pid,
+        .control_pid = service_control_pid,
+
+        .bus_name_owner_change = service_bus_name_owner_change,
+
+        .bus_set_property = bus_service_set_property,
+        .bus_commit_properties = bus_service_commit_properties,
+
+        .get_timeout = service_get_timeout,
+        .get_timeout_start_usec = service_get_timeout_start_usec,
+        .needs_console = service_needs_console,
+        .exit_status = service_exit_status,
+        .status_text = service_status_text,
+
+        .status_message_formats = {
+                .finished_start_job = {
+                        [JOB_FAILED]     = "Failed to start %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Stopped %s.",
+                        [JOB_FAILED]     = "Stopped (with error) %s.",
+                },
+                .finished_job = service_finished_job, /* yields "Finished %s." for successfully started oneshot services, NULL otherwise */
+        },
+
+        .can_start = service_can_start, /* checks the start limit via unit_test_start_limit() */
+
+        .notify_plymouth = true,
+
+        .audit_start_message_type = AUDIT_SERVICE_START,
+        .audit_stop_message_type = AUDIT_SERVICE_STOP,
+};
diff --git a/src/core/service.h b/src/core/service.h
new file mode 100644
index 0000000..e85302e
--- /dev/null
+++ b/src/core/service.h
@@ -0,0 +1,290 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Service Service;
+typedef struct ServiceFDStore ServiceFDStore;
+
+#include "exit-status.h"
+#include "kill.h"
+#include "open-file.h"
+#include "path.h"
+#include "pidref.h"
+#include "ratelimit.h"
+#include "socket.h"
+#include "unit.h"
+
+typedef enum ServiceRestart { /* string forms are listed in service_restart_table[] in service.c */
+        SERVICE_RESTART_NO,          /* "no" */
+        SERVICE_RESTART_ON_SUCCESS,  /* "on-success" */
+        SERVICE_RESTART_ON_FAILURE,  /* "on-failure" */
+        SERVICE_RESTART_ON_ABNORMAL, /* "on-abnormal" */
+        SERVICE_RESTART_ON_WATCHDOG, /* "on-watchdog" */
+        SERVICE_RESTART_ON_ABORT,    /* "on-abort" */
+        SERVICE_RESTART_ALWAYS,      /* "always" */
+        _SERVICE_RESTART_MAX,        /* number of valid values, dimension of the string table */
+        _SERVICE_RESTART_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceRestart;
+
+typedef enum ServiceType { /* string forms are listed in service_type_table[] in service.c */
+        SERVICE_SIMPLE,        /* we fork and go on right away (e.g. modern socket activated daemons) */
+        SERVICE_FORKING,       /* forks by itself (e.g. traditional daemons) */
+        SERVICE_ONESHOT,       /* we fork and wait until the program finishes (e.g. programs like fsck which run and need to finish before we continue) */
+        SERVICE_DBUS,          /* we fork and wait until a specific D-Bus name appears on the bus */
+        SERVICE_NOTIFY,        /* we fork and wait until a daemon sends us a ready message with sd_notify() */
+        SERVICE_NOTIFY_RELOAD, /* just like SERVICE_NOTIFY, but also implements a reload protocol via SIGHUP */
+        SERVICE_IDLE,          /* much like simple, but delay exec() until all jobs are dispatched. */
+        SERVICE_EXEC,          /* we fork and wait until we execute exec() (this means our own setup is waited for) */
+        _SERVICE_TYPE_MAX,     /* number of valid values, dimension of the string table */
+        _SERVICE_TYPE_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceType;
+
+typedef enum ServiceExitType { /* string forms: "main" and "cgroup", see service_exit_type_table[] in service.c */
+        SERVICE_EXIT_MAIN,    /* we consider the main PID when deciding if the service exited */
+        SERVICE_EXIT_CGROUP,  /* we wait for the last process in the cgroup to exit */
+        _SERVICE_EXIT_TYPE_MAX, /* number of valid values, dimension of the string table */
+        _SERVICE_EXIT_TYPE_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceExitType;
+
+typedef enum ServiceExecCommand { /* index into Service.exec_command[]; names are listed in service_exec_command_table[] */
+        SERVICE_EXEC_CONDITION,  /* "ExecCondition" */
+        SERVICE_EXEC_START_PRE,  /* "ExecStartPre" */
+        SERVICE_EXEC_START,      /* "ExecStart" */
+        SERVICE_EXEC_START_POST, /* "ExecStartPost" */
+        SERVICE_EXEC_RELOAD,     /* "ExecReload" */
+        SERVICE_EXEC_STOP,       /* "ExecStop" */
+        SERVICE_EXEC_STOP_POST,  /* "ExecStopPost" */
+        _SERVICE_EXEC_COMMAND_MAX, /* number of valid values, dimension of Service.exec_command[] */
+        _SERVICE_EXEC_COMMAND_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceExecCommand;
+
+typedef enum NotifyState { /* string forms are listed in notify_state_table[] in service.c */
+        NOTIFY_UNKNOWN,   /* "unknown" */
+        NOTIFY_READY,     /* "ready" */
+        NOTIFY_RELOADING, /* "reloading" */
+        NOTIFY_STOPPING,  /* "stopping" */
+        _NOTIFY_STATE_MAX, /* number of valid values, dimension of the string table */
+        _NOTIFY_STATE_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} NotifyState;
+
+/* The values of this enum are referenced in man/systemd.exec.xml and src/shared/bus-unit-util.c.
+ * Update those sources whenever this enum changes. */
+typedef enum ServiceResult {
+        SERVICE_SUCCESS,
+        SERVICE_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */
+        SERVICE_FAILURE_PROTOCOL,
+        SERVICE_FAILURE_TIMEOUT,
+        SERVICE_FAILURE_EXIT_CODE,
+        SERVICE_FAILURE_SIGNAL,
+        SERVICE_FAILURE_CORE_DUMP,
+        SERVICE_FAILURE_WATCHDOG,
+        SERVICE_FAILURE_START_LIMIT_HIT, /* set by service_can_start() when unit_test_start_limit() fails */
+        SERVICE_FAILURE_OOM_KILL, /* OOM Kill by the Kernel or systemd-oomd */
+        SERVICE_SKIP_CONDITION, /* string form is "exec-condition" */
+        _SERVICE_RESULT_MAX, /* number of valid values, dimension of service_result_table[] */
+        _SERVICE_RESULT_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceResult;
+
+typedef enum ServiceTimeoutFailureMode { /* string forms are listed in service_timeout_failure_mode_table[] */
+        SERVICE_TIMEOUT_TERMINATE, /* "terminate" */
+        SERVICE_TIMEOUT_ABORT,     /* "abort" */
+        SERVICE_TIMEOUT_KILL,      /* "kill" */
+        _SERVICE_TIMEOUT_FAILURE_MODE_MAX, /* number of valid values, dimension of the string table */
+        _SERVICE_TIMEOUT_FAILURE_MODE_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceTimeoutFailureMode;
+
+typedef enum ServiceRestartMode { /* string forms are listed in service_restart_mode_table[] in service.c */
+        SERVICE_RESTART_MODE_NORMAL, /* "normal" */
+        SERVICE_RESTART_MODE_DIRECT, /* "direct" */
+        _SERVICE_RESTART_MODE_MAX, /* number of valid values, dimension of the string table */
+        _SERVICE_RESTART_MODE_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ServiceRestartMode;
+
+struct ServiceFDStore { /* one entry of a service's fd store */
+        Service *service; /* presumably the service owning this entry — confirm in service.c */
+
+        int fd;
+        char *fdname;
+        sd_event_source *event_source;
+        bool do_poll;
+
+        LIST_FIELDS(ServiceFDStore, fd_store); /* list linkage; Service.fd_store points at an entry of this type */
+};
+
+struct Service { /* per-unit state of a service; service_vtable.object_size is sizeof(Service) */
+        Unit meta;
+
+        ServiceType type;
+        ServiceExitType exit_type;
+        ServiceRestart restart;
+        ServiceRestartMode restart_mode;
+        ExitStatusSet restart_prevent_status;
+        ExitStatusSet restart_force_status;
+        ExitStatusSet success_status;
+
+        /* If set we'll read the main daemon PID from this file */
+        char *pid_file;
+
+        usec_t restart_usec;
+        unsigned restart_steps;
+        usec_t restart_max_delay_usec;
+        usec_t timeout_start_usec;
+        usec_t timeout_stop_usec;
+        usec_t timeout_abort_usec;
+        bool timeout_abort_set;          /* if false, service_timeout_abort_usec() returns timeout_stop_usec instead */
+        usec_t runtime_max_usec;
+        usec_t runtime_rand_extra_usec;
+        ServiceTimeoutFailureMode timeout_start_failure_mode;
+        ServiceTimeoutFailureMode timeout_stop_failure_mode;
+
+        dual_timestamp watchdog_timestamp;
+        usec_t watchdog_usec;            /* the requested watchdog timeout in the unit file */
+        usec_t watchdog_original_usec;   /* the watchdog timeout that was in effect when the unit was started, i.e. the timeout the forked off processes currently see */
+        usec_t watchdog_override_usec;   /* the watchdog timeout requested by the service itself through sd_notify() */
+        bool watchdog_override_enable;   /* if true, service_get_watchdog_usec() returns watchdog_override_usec */
+        sd_event_source *watchdog_event_source;
+
+        ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX]; /* one slot per ServiceExecCommand value */
+
+        ExecContext exec_context;
+        KillContext kill_context;
+        CGroupContext cgroup_context;
+
+        ServiceState state, deserialized_state;
+
+        /* The exit status of the real main process */
+        ExecStatus main_exec_status;
+
+        /* The currently executed control process */
+        ExecCommand *control_command;
+
+        /* The currently executed main process, which may be NULL if
+         * the main process got started via forking mode and not by
+         * us */
+        ExecCommand *main_command;
+
+        /* The ID of the control command currently being executed */
+        ServiceExecCommand control_command_id;
+
+        /* Runtime data of the execution context */
+        ExecRuntime *exec_runtime;
+
+        PidRef main_pid, control_pid;
+
+        /* If we are a socket activated service instance, store information about the connection/peer/socket */
+        int socket_fd;
+        SocketPeer *socket_peer;
+        UnitRef accept_socket;
+        bool socket_fd_selinux_context_net;
+
+        bool permissions_start_only;
+        bool root_directory_start_only;
+        bool remain_after_exit;
+        bool guess_main_pid;
+
+        /* If we shut down, remember why */
+        ServiceResult result;
+        ServiceResult reload_result;
+        ServiceResult clean_result;
+
+        bool main_pid_known:1;
+        bool main_pid_alien:1;
+        bool bus_name_good:1;
+        bool forbid_restart:1;
+        bool start_timeout_defined:1;
+        bool exec_fd_hot:1;
+
+        char *bus_name;
+        char *bus_name_owner; /* unique name of the current owner */
+
+        char *status_text;
+        int status_errno;
+
+        sd_event_source *timer_event_source;
+        PathSpec *pid_file_pathspec;
+
+        NotifyAccess notify_access;
+        NotifyAccess notify_access_override; /* negative if no override is in effect, see service_get_notify_access() */
+        NotifyState notify_state;
+
+        sd_bus_slot *bus_name_pid_lookup_slot;
+
+        sd_event_source *exec_fd_event_source;
+
+        ServiceFDStore *fd_store; /* released by service_release_resources() unless fd_store_preserve_mode is EXEC_PRESERVE_YES */
+        size_t n_fd_store;
+        unsigned n_fd_store_max;
+        ExecPreserveMode fd_store_preserve_mode;
+
+        char *usb_function_descriptors;
+        char *usb_function_strings;
+
+        int stdin_fd;
+        int stdout_fd;
+        int stderr_fd;
+
+        unsigned n_restarts;
+        bool flush_n_restarts;
+
+        OOMPolicy oom_policy;
+
+        LIST_HEAD(OpenFile, open_files);
+
+        int reload_signal;
+        usec_t reload_begin_usec;
+};
+
+static inline usec_t service_timeout_abort_usec(Service *s) {
+        assert(s); /* without an explicit abort timeout the stop timeout applies */
+        return !s->timeout_abort_set ? s->timeout_stop_usec : s->timeout_abort_usec;
+}
+
+static inline NotifyAccess service_get_notify_access(Service *s) {
+        assert(s); /* a non-negative override takes precedence over the configured value */
+        return s->notify_access_override >= 0 ? s->notify_access_override : s->notify_access;
+}
+
+static inline usec_t service_get_watchdog_usec(Service *s) {
+        assert(s); /* the override wins when enabled, otherwise the timeout in effect at start time */
+        return !s->watchdog_override_enable ? s->watchdog_original_usec : s->watchdog_override_usec;
+}
+
+extern const UnitVTable service_vtable;
+
+int service_set_socket_fd(Service *s, int fd, struct Socket *socket, struct SocketPeer *peer, bool selinux_context_net);
+void service_release_socket_fd(Service *s);
+
+usec_t service_restart_usec_next(Service *s);
+
+const char* service_restart_to_string(ServiceRestart i) _const_;
+ServiceRestart service_restart_from_string(const char *s) _pure_;
+
+const char* service_restart_mode_to_string(ServiceRestartMode i) _const_;
+ServiceRestartMode service_restart_mode_from_string(const char *s) _pure_;
+
+const char* service_type_to_string(ServiceType i) _const_;
+ServiceType service_type_from_string(const char *s) _pure_;
+
+const char* service_exit_type_to_string(ServiceExitType i) _const_;
+ServiceExitType service_exit_type_from_string(const char *s) _pure_;
+
+const char* service_exec_command_to_string(ServiceExecCommand i) _const_;
+ServiceExecCommand service_exec_command_from_string(const char *s) _pure_;
+
+const char* service_exec_ex_command_to_string(ServiceExecCommand i) _const_;
+ServiceExecCommand service_exec_ex_command_from_string(const char *s) _pure_;
+
+const char* notify_state_to_string(NotifyState i) _const_;
+NotifyState notify_state_from_string(const char *s) _pure_;
+
+const char* service_result_to_string(ServiceResult i) _const_;
+ServiceResult service_result_from_string(const char *s) _pure_;
+
+const char* service_timeout_failure_mode_to_string(ServiceTimeoutFailureMode i) _const_;
+ServiceTimeoutFailureMode service_timeout_failure_mode_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SERVICE, Service);
+
+#define STATUS_TEXT_MAX (16U*1024U)
+
+/* Only exported for unit tests */
+int service_deserialize_exec_command(Unit *u, const char *key, const char *value);
diff --git a/src/core/show-status.c b/src/core/show-status.c
new file mode 100644
index 0000000..606237e
--- /dev/null
+++ b/src/core/show-status.c
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "iovec-util.h"
+#include "parse-util.h"
+#include "show-status.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "terminal-util.h"
+
+static const char* const show_status_table[_SHOW_STATUS_MAX] = { /* string form of each ShowStatus value */
+        [SHOW_STATUS_NO]        = "no",
+        [SHOW_STATUS_ERROR]     = "error",
+        [SHOW_STATUS_AUTO]      = "auto",
+        [SHOW_STATUS_TEMPORARY] = "temporary",
+        [SHOW_STATUS_YES]       = "yes",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(show_status, ShowStatus, SHOW_STATUS_YES); /* presumably maps boolean "true" spellings to SHOW_STATUS_YES — confirm in string-table.h */
+
+int parse_show_status(const char *v, ShowStatus *ret) {
+        ShowStatus parsed;
+
+        assert(ret);
+
+        /* SHOW_STATUS_TEMPORARY is rejected here, exactly like a string that cannot be parsed. */
+        parsed = show_status_from_string(v);
+        if (parsed < 0 || parsed == SHOW_STATUS_TEMPORARY)
+                return -EINVAL; /* *ret is left untouched on failure */
+        *ret = parsed;
+        return 0;
+}
+
+int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) {
+        static const char status_indent[] = "         "; /* "[" STATUS "] " */
+        _cleanup_free_ char *s = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        struct iovec iovec[7] = {}; /* worst case: erase prefix, "[", status, "] ", text, CRNL, erase suffix */
+        int n = 0;
+        static bool prev_ephemeral; /* whether the previously written line was flagged SHOW_STATUS_EPHEMERAL */
+
+        assert(format);
+
+        /* Formats one status line and writes it to /dev/console. This is independent of logging, as status
+         * messages are optional and go exclusively to the console. Returns 0 on success, negative on error. */
+
+        if (vasprintf(&s, format, ap) < 0)
+                return log_oom();
+
+        /* Before you ask: yes, on purpose we open/close the console for each status line we write individually. This
+         * is a good strategy to avoid PID 1 getting killed by the kernel's SAK concept (it doesn't fix this entirely,
+         * but minimizes the time window the kernel might end up killing PID 1 due to SAK). It also makes things easier
+         * for us so that we don't have to recover from hangups and suchlike triggered on the console. */
+
+        fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+        if (fd < 0)
+                return fd;
+
+        if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) {
+                char *e;
+                size_t emax, sl;
+                int c;
+
+                c = fd_columns(fd);
+                if (c <= 0)
+                        c = 80; /* fall back to 80 columns if the width is unknown */
+
+                sl = status ? sizeof(status_indent)-1 : 0;
+
+                emax = (size_t) c > sl + 1 ? (size_t) c - sl - 1 : 0; /* no size_t wraparound if the terminal is narrower than the indent */
+                if (emax < 3)
+                        emax = 3;
+
+                e = ellipsize(s, emax, 50);
+                if (e)
+                        free_and_replace(s, e); /* if ellipsizing failed we simply print the full string */
+        }
+
+        if (prev_ephemeral) /* overwrite the previous ephemeral line instead of appending below it */
+                iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE);
+
+        if (status) {
+                if (!isempty(status)) {
+                        iovec[n++] = IOVEC_MAKE_STRING("[");
+                        iovec[n++] = IOVEC_MAKE_STRING(status);
+                        iovec[n++] = IOVEC_MAKE_STRING("] ");
+                } else
+                        iovec[n++] = IOVEC_MAKE_STRING(status_indent); /* empty status: indent only, keeps the text aligned */
+        }
+
+        iovec[n++] = IOVEC_MAKE_STRING(s);
+        iovec[n++] = IOVEC_MAKE_STRING("\r\n"); /* use CRNL instead of just NL, to be robust towards TTYs in raw mode */
+
+        if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL))
+                iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE);
+        prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL);
+
+        if (writev(fd, iovec, n) < 0)
+                return -errno;
+
+        return 0;
+}
+
+int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) {
+        va_list args;
+        int ret;
+
+        assert(format);
+
+        /* Variadic front-end: collect the arguments and hand everything to status_vprintf(). */
+        va_start(args, format);
+        ret = status_vprintf(status, flags, format, args);
+        va_end(args);
+        return ret;
+}
+
+static const char* const status_unit_format_table[_STATUS_UNIT_FORMAT_MAX] = { /* string form of each StatusUnitFormat value */
+        [STATUS_UNIT_FORMAT_NAME]        = "name",
+        [STATUS_UNIT_FORMAT_DESCRIPTION] = "description",
+        [STATUS_UNIT_FORMAT_COMBINED]    = "combined",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(status_unit_format, StatusUnitFormat); /* backs status_unit_format_to_string()/_from_string() declared in show-status.h */
diff --git a/src/core/show-status.h b/src/core/show-status.h
new file mode 100644
index 0000000..f441223
--- /dev/null
+++ b/src/core/show-status.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+/* Manager status */
+
+typedef enum ShowStatus { /* string forms are listed in show_status_table[] in show-status.c */
+        SHOW_STATUS_NO,         /* printing of status is disabled */
+        SHOW_STATUS_ERROR,      /* only print errors */
+        SHOW_STATUS_AUTO,       /* disabled but may flip to _TEMPORARY */
+        SHOW_STATUS_TEMPORARY,  /* enabled temporarily, may flip back to _AUTO; rejected by parse_show_status() */
+        SHOW_STATUS_YES,        /* printing of status is enabled */
+        _SHOW_STATUS_MAX,       /* number of valid values, dimension of the string table */
+        _SHOW_STATUS_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} ShowStatus;
+
+typedef enum ShowStatusFlags { /* bit flags accepted by status_vprintf()/status_printf() */
+        SHOW_STATUS_ELLIPSIZE = 1 << 0, /* shorten the message so that it fits the console width */
+        SHOW_STATUS_EPHEMERAL = 1 << 1, /* the line gets overwritten by the next status line */
+} ShowStatusFlags;
+
+typedef enum StatusUnitFormat { /* string forms are listed in status_unit_format_table[] in show-status.c */
+        STATUS_UNIT_FORMAT_NAME,        /* "name" */
+        STATUS_UNIT_FORMAT_DESCRIPTION, /* "description" */
+        STATUS_UNIT_FORMAT_COMBINED,    /* "combined" */
+        _STATUS_UNIT_FORMAT_MAX,        /* number of valid values, dimension of the string table */
+        _STATUS_UNIT_FORMAT_INVALID = -EINVAL, /* negative sentinel, outside the valid range */
+} StatusUnitFormat;
+
+static inline bool show_status_on(ShowStatus s) {
+        return s == SHOW_STATUS_TEMPORARY || s == SHOW_STATUS_YES; /* the two states in which status output is enabled */
+}
+ShowStatus show_status_from_string(const char *v) _pure_; /* reads the string behind v, hence _pure_ rather than _const_ */
+const char* show_status_to_string(ShowStatus s) _const_;
+int parse_show_status(const char *v, ShowStatus *ret); /* like show_status_from_string(), but refuses SHOW_STATUS_TEMPORARY */
+
+StatusUnitFormat status_unit_format_from_string(const char *v) _pure_; /* reads the string behind v, hence _pure_ rather than _const_ */
+const char* status_unit_format_to_string(StatusUnitFormat s) _const_;
+
+int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) _printf_(3,0);
+int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) _printf_(3,4);
diff --git a/src/core/slice.c b/src/core/slice.c
new file mode 100644
index 0000000..fb4f23c
--- /dev/null
+++ b/src/core/slice.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "dbus-slice.h"
+#include "dbus-unit.h"
+#include "fd-util.h"
+#include "log.h"
+#include "serialize.h"
+#include "slice.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = { /* maps each SliceState to the generic unit active state */
+        [SLICE_DEAD] = UNIT_INACTIVE,
+        [SLICE_ACTIVE] = UNIT_ACTIVE
+};
+
+static void slice_init(Unit *u) { /* the .init hook of slice units */
+        assert(u);
+        assert(u->load_state == UNIT_STUB); /* must run before the unit has been loaded */
+
+        u->ignore_on_isolate = true;
+}
+
+static void slice_set_state(Slice *t, SliceState state) { /* switches the slice to the given state, logging and notifying along the way */
+        SliceState old_state;
+        assert(t);
+
+        if (t->state != state) /* invoked while t->state still holds the old value */
+                bus_unit_send_pending_change_signal(UNIT(t), false);
+
+        old_state = t->state;
+        t->state = state;
+
+        if (state != old_state)
+                log_debug("%s changed %s -> %s",
+                          UNIT(t)->id,
+                          slice_state_to_string(old_state),
+                          slice_state_to_string(state));
+
+        unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); /* called even if the state did not change */
+}
+
+static int slice_add_parent_slice(Slice *s) { /* adds a UNIT_IN_SLICE dependency on the parent slice derived from our own name */
+        Unit *u = UNIT(s);
+        _cleanup_free_ char *a = NULL;
+        int r;
+
+        assert(s);
+
+        if (UNIT_GET_SLICE(u)) /* already assigned to a slice, nothing to do */
+                return 0;
+
+        r = slice_build_parent_slice(u->id, &a);
+        if (r <= 0) /* 0 means root slice */
+                return r;
+
+        return unit_add_dependency_by_name(u, UNIT_IN_SLICE, a, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int slice_add_default_dependencies(Slice *s) {
+        int r;
+
+        assert(s);
+
+        /* Nothing is added for units that have default dependencies turned off. */
+        if (!UNIT(s)->default_dependencies)
+                return 0;
+
+        /* Make sure slices are unloaded on shutdown */
+        r = unit_add_two_dependencies_by_name(
+                        UNIT(s),
+                        UNIT_BEFORE, UNIT_CONFLICTS,
+                        SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+
+        /* Errors are passed through, every non-negative result is reported as 0. */
+        return r < 0 ? r : 0;
+}
+
+static int slice_verify(Slice *s) { /* sanity-checks a loaded slice; returns 0 if fine, a negative error (already logged) otherwise */
+        _cleanup_free_ char *parent = NULL;
+        int r;
+
+        assert(s);
+        assert(UNIT(s)->load_state == UNIT_LOADED);
+
+        if (!slice_name_is_valid(UNIT(s)->id))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Slice name %s is not valid. Refusing.", UNIT(s)->id);
+
+        r = slice_build_parent_slice(UNIT(s)->id, &parent);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to determine parent slice: %m");
+
+        /* If recursive errors are to be ignored, the parent slice should not be verified */
+        if (UNIT(s)->manager && FLAGS_SET(UNIT(s)->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        if (parent ? !unit_has_name(UNIT_GET_SLICE(UNIT(s)), parent) : !!UNIT_GET_SLICE(UNIT(s))) /* with a parent name: our slice must carry it; without one: we must not be in any slice */
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Located outside of parent slice. Refusing.");
+
+        return 0;
+}
+
+static int slice_load_root_slice(Unit *u) { /* returns 1 if u is the root slice and was set up here, 0 otherwise */
+        assert(u);
+
+        if (!unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return 0;
+
+        u->perpetual = true;
+
+        /* The root slice is a bit special. For example it is always running and cannot be terminated. Because of its
+         * special semantics we synthesize it here, instead of relying on the unit file on disk. */
+
+        u->default_dependencies = false;
+
+        if (!u->description)
+                u->description = strdup("Root Slice"); /* NOTE(review): allocation failure is not checked here */
+        if (!u->documentation)
+                u->documentation = strv_new("man:systemd.special(7)"); /* NOTE(review): allocation failure is not checked here */
+
+        return 1;
+}
+
+static int slice_load_system_slice(Unit *u) { /* returns 1 if u is the system slice of a system manager and was set up here, 0 otherwise */
+        assert(u);
+
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return 0;
+        if (!unit_has_name(u, SPECIAL_SYSTEM_SLICE))
+                return 0;
+
+        u->perpetual = true;
+
+        /* The system slice is a bit special. For example it is always running and cannot be terminated. Because of its
+         * special semantics we synthesize it here, instead of relying on the unit file on disk. */
+
+        u->default_dependencies = false;
+
+        if (!u->description)
+                u->description = strdup("System Slice"); /* NOTE(review): allocation failure is not checked here */
+        if (!u->documentation)
+                u->documentation = strv_new("man:systemd.special(7)"); /* NOTE(review): allocation failure is not checked here */
+
+        return 1;
+}
+
+static int slice_load(Unit *u) { /* loads a slice unit; returns the result of slice_verify() or an earlier negative error */
+        Slice *s = SLICE(u);
+        int r;
+
+        assert(s);
+        assert(u->load_state == UNIT_STUB);
+
+        r = slice_load_root_slice(u); /* the positive "was set up" result is deliberately not acted upon */
+        if (r < 0)
+                return r;
+        r = slice_load_system_slice(u);
+        if (r < 0)
+                return r;
+
+        r = unit_load_fragment_and_dropin(u, false);
+        if (r < 0)
+                return r;
+
+        if (u->load_state != UNIT_LOADED) /* not loaded: skip the extras and the verification below */
+                return 0;
+
+        /* This is a new unit? Then let's add in some extras */
+        r = unit_patch_contexts(u);
+        if (r < 0)
+                return r;
+
+        r = slice_add_parent_slice(s);
+        if (r < 0)
+                return r;
+
+        r = slice_add_default_dependencies(s);
+        if (r < 0)
+                return r;
+
+        if (!u->description) {
+                _cleanup_free_ char *tmp = NULL;
+
+                r = unit_name_to_path(u->id, &tmp);
+                if (r >= 0)  /* Failure is ignored… */
+                        u->description = strjoin("Slice ", tmp); /* NOTE(review): allocation failure is not checked here */
+        }
+
+        return slice_verify(s);
+}
+
+static int slice_coldplug(Unit *u) { /* applies the deserialized state, if it differs from the current one; always returns 0 */
+        Slice *t = SLICE(u);
+
+        assert(t);
+        assert(t->state == SLICE_DEAD); /* coldplugging always starts out from the dead state */
+
+        if (t->deserialized_state != t->state)
+                slice_set_state(t, t->deserialized_state);
+
+        return 0;
+}
+
+static void slice_dump(Unit *u, FILE *f, const char *prefix) { /* writes the slice state and the cgroup context to f, each line starting with prefix */
+        Slice *t = SLICE(u);
+
+        assert(t);
+        assert(f);
+
+        fprintf(f,
+                "%sSlice State: %s\n",
+                prefix, slice_state_to_string(t->state));
+
+        cgroup_context_dump(UNIT(t), f, prefix);
+}
+
+static int slice_start(Unit *u) { /* returns 1 once the slice was switched to SLICE_ACTIVE, or a negative error */
+        Slice *t = SLICE(u);
+        int r;
+
+        assert(t);
+        assert(t->state == SLICE_DEAD);
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        (void) unit_realize_cgroup(u); /* failures of these two calls are deliberately ignored */
+        (void) unit_reset_accounting(u);
+
+        slice_set_state(t, SLICE_ACTIVE);
+        return 1;
+}
+
+static int slice_stop(Unit *u) { /* switches an active slice to SLICE_DEAD; always returns 1 */
+        Slice *t = SLICE(u);
+
+        assert(t);
+        assert(t->state == SLICE_ACTIVE);
+
+        /* We do not need to destroy the cgroup explicitly,
+         * unit_notify() will do that for us anyway. */
+
+        slice_set_state(t, SLICE_DEAD);
+        return 1;
+}
+
+static int slice_serialize(Unit *u, FILE *f, FDSet *fds) { /* writes the current state under the key "state"; always returns 0 */
+        Slice *s = SLICE(u);
+
+        assert(s);
+        assert(f);
+        assert(fds); /* required, even though no fds are stored here */
+
+        (void) serialize_item(f, "state", slice_state_to_string(s->state)); /* the result is deliberately ignored */
+
+        return 0;
+}
+
+static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { /* counterpart of slice_serialize(); always returns 0 */
+        Slice *s = SLICE(u);
+
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+
+        if (streq(key, "state")) {
+                SliceState state;
+
+                state = slice_state_from_string(value);
+                if (state < 0) /* unparsable values are only logged, deserialized_state stays as it is */
+                        log_debug("Failed to parse state value %s", value);
+                else
+                        s->deserialized_state = state; /* applied later by slice_coldplug() */
+
+        } else
+                log_debug("Unknown serialization key '%s'", key); /* unknown keys are logged and otherwise ignored */
+
+        return 0;
+}
+
+static UnitActiveState slice_active_state(Unit *u) {
+        assert(u);
+        SliceState sub = SLICE(u)->state; /* slice-specific state, translated to the generic one below */
+        return state_translation_table[sub];
+}
+
+static const char *slice_sub_state_to_string(Unit *u) {
+        assert(u);
+        SliceState sub = SLICE(u)->state; /* reported as is, without translation to the generic unit state */
+        return slice_state_to_string(sub);
+}
+
+static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) { /* looks up or allocates the named slice and marks it perpetual; ret may be NULL */
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(name);
+
+        u = manager_get_unit(m, name);
+        if (!u) { /* not known to the manager yet: allocate it */
+                r = unit_new_for_name(m, sizeof(Slice), name, &u);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate the special %s unit: %m", name);
+        }
+
+        u->perpetual = true;
+        SLICE(u)->deserialized_state = SLICE_ACTIVE; /* slice_coldplug() will hence switch the slice to active */
+
+        unit_add_to_load_queue(u);
+        unit_add_to_dbus_queue(u);
+
+        if (ret)
+                *ret = u; /* only written on success */
+
+        return 0;
+}
+
+static void slice_enumerate_perpetual(Manager *m) {
+        Unit *root;
+
+        assert(m);
+
+        /* The root slice is set up unconditionally, the system slice only by the system manager. */
+        if (slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &root) >= 0 &&
+            manager_owns_host_root_cgroup(m)) {
+                CGroupContext *cc = &SLICE(root)->cgroup_context;
+
+                /* When we manage the root cgroup, the root slice spans the entire system. The kernel then
+                 * tracks CPU, tasks and memory for it regardless and exposes the data in /proc, so enable
+                 * accounting to make our APIs for querying that data available. */
+
+                cc->cpu_accounting = true;
+                cc->tasks_accounting = true;
+                cc->memory_accounting = true;
+        }
+
+        if (MANAGER_IS_SYSTEM(m))
+                (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL);
+}
+
+static bool slice_freezer_action_supported_by_children(Unit *s) {
+        Unit *child;
+
+        assert(s);
+
+        /* Checks all children of this slice, descending into nested slices: a single member whose
+         * unit type provides no freeze method makes the whole slice unsupported. */
+        UNIT_FOREACH_DEPENDENCY(child, s, UNIT_ATOM_SLICE_OF) {
+                bool nested_ok = child->type != UNIT_SLICE ||
+                                 slice_freezer_action_supported_by_children(child);
+
+                if (!nested_ok || !UNIT_VTABLE(child)->freeze)
+                        return false;
+        }
+
+        return true;
+}
+
+static int slice_freezer_action(Unit *s, FreezerAction action) {
+        Unit *child;
+
+        assert(s);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+        /* Freezing is refused (with a warning, returning 0) unless every child supports it. */
+        if (action == FREEZER_FREEZE && !slice_freezer_action_supported_by_children(s)) {
+                log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice");
+                return 0;
+        }
+
+        /* Propagate to every child with a realized cgroup first; the first failure aborts the operation. */
+        UNIT_FOREACH_DEPENDENCY(child, s, UNIT_ATOM_SLICE_OF) {
+                int q = 0; /* stays 0 if thawing is requested but the child has no thaw method */
+
+                if (!child->cgroup_realized)
+                        continue;
+                if (action == FREEZER_FREEZE)
+                        q = UNIT_VTABLE(child)->freeze(child);
+                else if (UNIT_VTABLE(child)->thaw)
+                        q = UNIT_VTABLE(child)->thaw(child);
+                if (q < 0)
+                        return q;
+        }
+
+        /* Finally apply the action to the slice unit itself. */
+        return unit_cgroup_freezer_action(s, action);
+}
+
+static int slice_freeze(Unit *s) {
+        FreezerAction action = FREEZER_FREEZE; /* vtable entry point; the shared helper does the work */
+        assert(s);
+        return slice_freezer_action(s, action);
+}
+
+static int slice_thaw(Unit *s) {
+        FreezerAction action = FREEZER_THAW; /* vtable entry point; the shared helper does the work */
+        assert(s);
+        return slice_freezer_action(s, action);
+}
+
+static bool slice_can_freeze(Unit *s) {
+        assert(s);
+        /* A slice is freezable exactly when all of its children are. */
+        bool freezable = slice_freezer_action_supported_by_children(s);
+        return freezable;
+}
+
+const UnitVTable slice_vtable = {
+        .object_size = sizeof(Slice),
+        .cgroup_context_offset = offsetof(Slice, cgroup_context),
+        /* Section names associated with this unit type; "Slice" is the type-specific one. */
+        .sections =
+                "Unit\0"
+                "Slice\0"
+                "Install\0",
+        .private_section = "Slice",
+
+        .can_transient = true,
+        .can_set_managed_oom = true,
+
+        .init = slice_init,
+        .load = slice_load,
+
+        .coldplug = slice_coldplug,
+
+        .dump = slice_dump,
+        /* State transitions between SLICE_DEAD and SLICE_ACTIVE. */
+        .start = slice_start,
+        .stop = slice_stop,
+        /* Freezer operations are forwarded to the children of the slice first. */
+        .freeze = slice_freeze,
+        .thaw = slice_thaw,
+        .can_freeze = slice_can_freeze,
+        /* Only the slice state is serialized. */
+        .serialize = slice_serialize,
+        .deserialize_item = slice_deserialize_item,
+
+        .active_state = slice_active_state,
+        .sub_state_to_string = slice_sub_state_to_string,
+
+        .bus_set_property = bus_slice_set_property,
+        .bus_commit_properties = bus_slice_commit_properties,
+        /* Sets up the root slice and, for the system manager, the system slice. */
+        .enumerate_perpetual = slice_enumerate_perpetual,
+        /* Format strings for finished start/stop jobs; %s is presumably the unit description. */
+        .status_message_formats = {
+                .finished_start_job = {
+                        [JOB_DONE]       = "Created slice %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Removed slice %s.",
+                },
+        },
+};
diff --git a/src/core/slice.h b/src/core/slice.h
new file mode 100644
index 0000000..e2f9274
--- /dev/null
+++ b/src/core/slice.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Slice Slice;
+/* Per-unit data of slice units. */
+struct Slice {
+        Unit meta;
+        /* Current state, plus the state recorded while deserializing or when made perpetual. */
+        SliceState state, deserialized_state;
+        /* Located by slice_vtable via offsetof(Slice, cgroup_context). */
+        CGroupContext cgroup_context;
+};
+
+extern const UnitVTable slice_vtable;
+
+DEFINE_CAST(SLICE, Slice);
diff --git a/src/core/smack-setup.c b/src/core/smack-setup.c
new file mode 100644
index 0000000..7ea902b
--- /dev/null
+++ b/src/core/smack-setup.c
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+  Copyright © 2013 Intel Corporation
+  Authors:
+        Nathaniel Chen 
+***/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "macro.h"
+#include "smack-setup.h"
+#include "string-util.h"
+
+#if ENABLE_SMACK
+
+static int fdopen_unlocked_at(int dfd, const char *dir, const char *name, int *status, FILE **ret_file) {
+        FILE *stream;
+        int fd, q;
+
+        /* Opens 'name' relative to 'dfd' for reading. On failure the error is logged and, unless an
+         * earlier error is already stored there, recorded in *status; 'dir' is only used for messages. */
+        fd = openat(dfd, name, O_RDONLY|O_CLOEXEC);
+        if (fd < 0) {
+                if (*status == 0)
+                        *status = -errno;
+                return log_warning_errno(errno, "Failed to open \"%s/%s\": %m", dir, name);
+        }
+
+        q = fdopen_unlocked(fd, "r", &stream);
+        if (q < 0) {
+                if (*status == 0)
+                        *status = q;
+                safe_close(fd); /* on failure the descriptor is still ours to close */
+                return log_error_errno(q, "Failed to open \"%s/%s\": %m", dir, name);
+        }
+
+        *ret_file = stream;
+        return 0;
+}
+
+static int write_access2_rules(const char *srcdir) {
+        _cleanup_close_ int load2_fd = -EBADF, change_fd = -EBADF;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int dfd = -EBADF, r = 0;
+        /* Returns 0 on success, a negative errno on failure, or a positive errno if srcdir cannot be opened. */
+        load2_fd = open("/sys/fs/smackfs/load2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (load2_fd < 0)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/load2': %m");
+                return -errno; /* negative error */
+        }
+
+        change_fd = open("/sys/fs/smackfs/change-rule", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (change_fd < 0)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/change-rule': %m");
+                return -errno; /* negative error */
+        }
+
+        /* write rules to load2 or change-rule from every file in the directory */
+        dir = opendir(srcdir);
+        if (!dir) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir);
+                return errno; /* positive on purpose */
+        }
+
+        dfd = dirfd(dir);
+        assert(dfd >= 0);
+
+        FOREACH_DIRENT(entry, dir, return 0) {
+                _cleanup_fclose_ FILE *policy = NULL;
+
+                if (!dirent_is_file(entry))
+                        continue;
+                /* An open failure is logged and recorded in r by the helper; move on to the next file. */
+                if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+                        continue;
+
+                /* load2 write rules in the kernel require a line buffered stream */
+                for (;;) {
+                        _cleanup_free_ char *buf = NULL, *sbj = NULL, *obj = NULL, *acc1 = NULL, *acc2 = NULL;
+                        int q;
+
+                        q = read_line(policy, NAME_MAX, &buf);
+                        if (q < 0)
+                                return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name);
+                        if (q == 0)
+                                break;
+                        /* Skip empty lines and lines starting with a comment character. */
+                        if (isempty(buf) || strchr(COMMENTS, buf[0]))
+                                continue;
+                        /* if 3 args -> load rule   : subject object access1 */
+                        /* if 4 args -> change rule : subject object access1 access2 */
+                        /* A short match need not set errno, hence log a synthetic EINVAL rather than errno. */
+                        if (sscanf(buf, "%ms %ms %ms %ms", &sbj, &obj, &acc1, &acc2) < 3) {
+                                log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse rule '%s' in '%s', ignoring.", buf, entry->d_name);
+                                continue;
+                        }
+                        /* A failed write is recorded (the first error wins) but does not stop processing. */
+                        if (write(isempty(acc2) ? load2_fd : change_fd, buf, strlen(buf)) < 0) {
+                                if (r == 0)
+                                        r = -errno;
+                                log_error_errno(errno, "Failed to write '%s' to '%s' in '%s': %m",
+                                                buf, isempty(acc2) ? "/sys/fs/smackfs/load2" : "/sys/fs/smackfs/change-rule", entry->d_name);
+                        }
+                }
+        }
+
+        return r;
+}
+
+static int write_cipso2_rules(const char *srcdir) {
+        _cleanup_close_ int cipso2_fd = -EBADF;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int dfd = -EBADF, r = 0;
+        /* Returns 0 on success, a negative errno on failure, or a positive errno if srcdir cannot be opened. */
+        cipso2_fd = open("/sys/fs/smackfs/cipso2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (cipso2_fd < 0)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/cipso2': %m");
+                return -errno; /* negative error */
+        }
+
+        /* write rules to cipso2 from every file in the directory */
+        dir = opendir(srcdir);
+        if (!dir) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir);
+                return errno; /* positive on purpose */
+        }
+
+        dfd = dirfd(dir);
+        assert(dfd >= 0);
+        /* NOTE(review): the third macro argument is presumably run if reading the directory fails — confirm. */
+        FOREACH_DIRENT(entry, dir, return 0) {
+                _cleanup_fclose_ FILE *policy = NULL;
+
+                if (!dirent_is_file(entry))
+                        continue;
+                /* An open failure is logged and recorded in r by the helper; move on to the next file. */
+                if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+                        continue;
+
+                /* cipso2 write rules in the kernel require a line buffered stream */
+                for (;;) {
+                        _cleanup_free_ char *buf = NULL;
+                        int q;
+
+                        q = read_line(policy, NAME_MAX, &buf);
+                        if (q < 0)
+                                return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name);
+                        if (q == 0)
+                                break;
+                        /* Skip empty lines and lines starting with a comment character. */
+                        if (isempty(buf) || strchr(COMMENTS, buf[0]))
+                                continue;
+                        /* A failed write abandons the rest of this file; the next file is still processed. */
+                        if (write(cipso2_fd, buf, strlen(buf)) < 0) {
+                                if (r == 0)
+                                        r = -errno;
+                                log_error_errno(errno, "Failed to write '%s' to '/sys/fs/smackfs/cipso2' in '%s': %m",
+                                                buf, entry->d_name);
+                                break;
+                        }
+                }
+        }
+
+        return r;
+}
+
+static int write_netlabel_rules(const char *srcdir) {
+        _cleanup_fclose_ FILE *dst = NULL;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int dfd = -EBADF, r = 0;
+        /* Returns 0 on success, a negative errno on failure, or a positive errno if srcdir cannot be opened. */
+        dst = fopen("/sys/fs/smackfs/netlabel", "we");
+        if (!dst)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open /sys/fs/smackfs/netlabel: %m");
+                return -errno; /* negative error */
+        }
+
+        /* write rules to dst from every file in the directory */
+        dir = opendir(srcdir);
+        if (!dir) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to opendir %s: %m", srcdir);
+                return errno; /* positive on purpose */
+        }
+
+        dfd = dirfd(dir);
+        assert(dfd >= 0);
+
+        FOREACH_DIRENT(entry, dir, return 0) {
+                _cleanup_fclose_ FILE *policy = NULL;
+                /* NOTE(review): unlike the access and CIPSO loaders, entries are not filtered with dirent_is_file() here — confirm this is intended. */
+                if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+                        continue;
+
+                /* every line is flushed on its own, right after it has been queued on the stream */
+                for (;;) {
+                        _cleanup_free_ char *buf = NULL;
+                        int q;
+
+                        q = read_line(policy, NAME_MAX, &buf);
+                        if (q < 0)
+                                return log_error_errno(q, "Failed to read line from %s: %m", entry->d_name);
+                        if (q == 0)
+                                break;
+                        /* fputs() reports failure with a negative value (EOF); a successful call may return 0. */
+                        if (fputs(buf, dst) < 0) {
+                                if (r == 0)
+                                        r = -EINVAL;
+                                log_error_errno(errno, "Failed to write line to /sys/fs/smackfs/netlabel: %m");
+                                break;
+                        }
+                        q = fflush_and_check(dst);
+                        if (q < 0) {
+                                if (r == 0)
+                                        r = q;
+                                log_error_errno(q, "Failed to flush writes to /sys/fs/smackfs/netlabel: %m");
+                                break;
+                        }
+                }
+        }
+
+        return r;
+}
+
+static int write_onlycap_list(void) {
+        _cleanup_close_ int onlycap_fd = -EBADF;
+        _cleanup_free_ char *list = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        size_t len = 0;
+        int r;
+        /* Returns 0 on success or if there is nothing to write, positive ENOENT if the list file is missing, negative errno otherwise. */
+        f = fopen("/etc/smack/onlycap", "re");
+        if (!f) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to read '/etc/smack/onlycap': %m");
+
+                return errno == ENOENT ? ENOENT : -errno;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *buf = NULL;
+                size_t l;
+
+                r = read_line(f, LONG_LINE_MAX, &buf);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read line from /etc/smack/onlycap: %m");
+                if (r == 0)
+                        break;
+                /* Skip empty lines and lines starting with a comment character. */
+                if (isempty(buf) || strchr(COMMENTS, *buf))
+                        continue;
+
+                l = strlen(buf);
+                if (!GREEDY_REALLOC(list, len + l + 1))
+                        return log_oom();
+                /* Append the line, then overwrite the NUL that stpcpy() wrote with a space as separator. */
+                stpcpy(list + len, buf)[0] = ' ';
+                len += l + 1;
+        }
+
+        if (len == 0)
+                return 0;
+        /* Turn the trailing separator into the string terminator. */
+        list[len - 1] = 0;
+
+        onlycap_fd = open("/sys/fs/smackfs/onlycap", O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (onlycap_fd < 0) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/onlycap': %m");
+                return -errno; /* negative error */
+        }
+        /* NOTE(review): len still counts the byte that now holds the NUL, so the terminator is written too — confirm this is intended. */
+        r = write(onlycap_fd, list, len);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to write onlycap list(%s) to '/sys/fs/smackfs/onlycap': %m", list);
+
+        return 0;
+}
+
+#endif
+
+int mac_smack_setup(bool *loaded_policy) {
+        /* Loads Smack access, CIPSO and netlabel rules and the onlycap list. Returns 0 unless writing the onlycap list fails. */
+#if ENABLE_SMACK
+
+        int r;
+        /* *loaded_policy is only set to true if the end of the whole sequence is reached. */
+        assert(loaded_policy);
+
+        r = write_access2_rules("/etc/smack/accesses.d/");
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack is not enabled in the kernel.");
+                return 0;
+        case ENOENT:
+                log_debug("Smack access rules directory '/etc/smack/accesses.d/' not found");
+                return 0;
+        case 0:
+                log_info("Successfully loaded Smack policies.");
+                break;
+        default:
+                log_warning_errno(r, "Failed to load Smack access rules, ignoring: %m");
+                return 0;
+        }
+        /* Failures while applying the run label are logged but not treated as fatal. */
+#if HAVE_SMACK_RUN_LABEL
+        r = write_string_file("/proc/self/attr/current", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK label \"" SMACK_RUN_LABEL "\" on self: %m");
+        r = write_string_file("/sys/fs/smackfs/ambient", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK ambient label \"" SMACK_RUN_LABEL "\": %m");
+        r = write_string_file("/sys/fs/smackfs/netlabel",
+                              "0.0.0.0/0 " SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK netlabel rule \"0.0.0.0/0 " SMACK_RUN_LABEL "\": %m");
+        r = write_string_file("/sys/fs/smackfs/netlabel", "127.0.0.1 -CIPSO", WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK netlabel rule \"127.0.0.1 -CIPSO\": %m");
+#endif
+
+        r = write_cipso2_rules("/etc/smack/cipso.d/");
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack/CIPSO is not enabled in the kernel.");
+                return 0;
+        case ENOENT:
+                log_debug("Smack/CIPSO access rules directory '/etc/smack/cipso.d/' not found");
+                break;
+        case 0:
+                log_info("Successfully loaded Smack/CIPSO policies.");
+                break;
+        default:
+                log_warning_errno(r, "Failed to load Smack/CIPSO access rules, ignoring: %m");
+                break;
+        }
+
+        r = write_netlabel_rules("/etc/smack/netlabel.d/");
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack/CIPSO is not enabled in the kernel.");
+                return 0;
+        case ENOENT:
+                log_debug("Smack network host rules directory '/etc/smack/netlabel.d/' not found");
+                break;
+        case 0:
+                log_info("Successfully loaded Smack network host rules.");
+                break;
+        default:
+                log_warning_errno(r, "Failed to load Smack network host rules, ignoring: %m");
+                break;
+        }
+        /* Only a failure to write the onlycap list is reported to the caller as an error. */
+        r = write_onlycap_list();
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack is not enabled in the kernel.");
+                break;
+        case ENOENT:
+                log_debug("Smack onlycap list file '/etc/smack/onlycap' not found");
+                break;
+        case 0:
+                log_info("Successfully wrote Smack onlycap list.");
+                break;
+        default:
+                return log_struct_errno(LOG_EMERG, r,
+                                        LOG_MESSAGE("Failed to write Smack onlycap list: %m"),
+                                        "MESSAGE_ID=" SD_MESSAGE_SMACK_FAILED_WRITE_STR);
+        }
+
+        *loaded_policy = true;
+
+#endif
+
+        return 0;
+}
diff --git a/src/core/smack-setup.h b/src/core/smack-setup.h
new file mode 100644
index 0000000..d29370d
--- /dev/null
+++ b/src/core/smack-setup.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2013 Intel Corporation
+  Authors:
+        Nathaniel Chen 
+***/
+
+int mac_smack_setup(bool *loaded_policy); /* returns 0 unless writing the Smack onlycap list fails */
diff --git a/src/core/socket.c b/src/core/socket.c
new file mode 100644
index 0000000..388be62
--- /dev/null
+++ b/src/core/socket.c
@@ -0,0 +1,3617 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "chase.h"
+#include "constants.h"
+#include "copy.h"
+#include "dbus-socket.h"
+#include "dbus-unit.h"
+#include "errno-list.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "in-addr-util.h"
+#include "io-util.h"
+#include "ip-protocol-list.h"
+#include "label-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-util.h"
+#include "serialize.h"
+#include "service.h"
+#include "signal-util.h"
+#include "smack-util.h"
+#include "socket.h"
+#include "socket-netlink.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+
+struct SocketPeer {
+        unsigned n_ref;
+        /* Reset to NULL by socket_done() for every peer still registered with the socket. */
+        Socket *socket;
+        union sockaddr_union peer;
+        socklen_t peer_salen;
+};
+
+static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = { /* socket state -> generic unit state */
+        [SOCKET_DEAD] = UNIT_INACTIVE,
+        [SOCKET_START_PRE] = UNIT_ACTIVATING,
+        [SOCKET_START_CHOWN] = UNIT_ACTIVATING,
+        [SOCKET_START_POST] = UNIT_ACTIVATING,
+        [SOCKET_LISTENING] = UNIT_ACTIVE,
+        [SOCKET_RUNNING] = UNIT_ACTIVE,
+        [SOCKET_STOP_PRE] = UNIT_DEACTIVATING,
+        [SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING,
+        [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING,
+        [SOCKET_STOP_POST] = UNIT_DEACTIVATING,
+        [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+        [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+        [SOCKET_FAILED] = UNIT_FAILED,
+        [SOCKET_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static void flush_ports(Socket *s);
+
+static void socket_init(Unit *u) {
+        Socket *sock = SOCKET(u);
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+        /* Initializes the socket-specific fields of a freshly allocated (stub) unit. */
+        sock->backlog = SOMAXCONN_DELUXE;
+        sock->timeout_usec = u->manager->defaults.timeout_start_usec;
+        sock->directory_mode = 0755;
+        sock->socket_mode = 0666;
+
+        sock->max_connections = 64;
+        /* NOTE(review): -1 presumably means "not configured" for these options — confirm with their users. */
+        sock->priority = -1;
+        sock->ip_tos = -1;
+        sock->ip_ttl = -1;
+        sock->mark = -1;
+        /* Output settings are inherited from the manager defaults. */
+        sock->exec_context.std_output = u->manager->defaults.std_output;
+        sock->exec_context.std_error = u->manager->defaults.std_error;
+
+        sock->control_pid = PIDREF_NULL;
+        sock->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+        sock->trigger_limit = RATELIMIT_OFF;
+        /* Sentinel values that socket_add_extras() later replaces with actual defaults. */
+        sock->poll_limit_interval = USEC_INFINITY;
+        sock->poll_limit_burst = UINT_MAX;
+}
+
+static void socket_unwatch_control_pid(Socket *s) {
+        assert(s);
+
+        /* Nothing to do unless a control process is currently set. */
+        if (pidref_is_set(&s->control_pid)) {
+                unit_unwatch_pidref(UNIT(s), &s->control_pid);
+                pidref_done(&s->control_pid);
+        }
+}
+
+static void socket_cleanup_fd_list(SocketPort *p) {
+        assert(p);
+        /* Close all auxiliary descriptors of the port, then release the array and reset the count. */
+        close_many(p->auxiliary_fds, p->n_auxiliary_fds);
+        p->n_auxiliary_fds = 0;
+        p->auxiliary_fds = mfree(p->auxiliary_fds);
+}
+
+SocketPort *socket_port_free(SocketPort *p) {
+        /* Releases a port together with its event source, descriptors and path; NULL is accepted. */
+        if (!p)
+                return NULL;
+
+        sd_event_source_unref(p->event_source);
+        socket_cleanup_fd_list(p);
+        safe_close(p->fd);
+        free(p->path);
+        /* Hands back the result of mfree(), presumably NULL as in the "x = mfree(x)" idiom of this file. */
+        return mfree(p);
+}
+
+void socket_free_ports(Socket *s) {
+        assert(s);
+        /* Empties the list of ports, releasing each entry with socket_port_free(). */
+        LIST_CLEAR(port, s->ports, socket_port_free);
+}
+
+static void socket_done(Unit *u) {
+        Socket *s = SOCKET(u);
+        SocketPeer *p;
+        /* Releases everything the socket unit owns; each freed member is reset so no stale pointer remains. */
+        assert(s);
+
+        socket_free_ports(s);
+        /* Detach the peers still registered with us before the set itself is freed. */
+        while ((p = set_steal_first(s->peers_by_address)))
+                p->socket = NULL;
+
+        s->peers_by_address = set_free(s->peers_by_address);
+
+        s->exec_runtime = exec_runtime_free(s->exec_runtime);
+        exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
+        s->control_command = NULL;
+
+        socket_unwatch_control_pid(s);
+
+        unit_ref_unset(&s->service);
+
+        s->tcp_congestion = mfree(s->tcp_congestion);
+        s->bind_to_device = mfree(s->bind_to_device);
+
+        s->smack = mfree(s->smack);
+        s->smack_ip_in = mfree(s->smack_ip_in);
+        s->smack_ip_out = mfree(s->smack_ip_out);
+        /* Store the result back like for all other members, so the freed list is not left dangling. */
+        s->symlinks = strv_free(s->symlinks);
+
+        s->user = mfree(s->user);
+        s->group = mfree(s->group);
+
+        s->fdname = mfree(s->fdname);
+
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+}
+
+static int socket_arm_timer(Socket *s, bool relative, usec_t usec) {
+        assert(s);
+        Unit *unit = UNIT(s); /* socket_dispatch_timer is passed along as the handler */
+        return unit_arm_timer(unit, &s->timer_event_source, relative, usec, socket_dispatch_timer);
+}
+
+static bool have_non_accept_socket(Socket *s) {
+        assert(s);
+
+        /* If accepting is turned off for the unit, the answer is always yes. */
+        if (!s->accept)
+                return true;
+
+        /* Otherwise any port that is not a socket, or whose address cannot accept, counts. */
+        LIST_FOREACH(port, p, s->ports) {
+                bool accepting = p->type == SOCKET_SOCKET && socket_address_can_accept(&p->address);
+
+                if (!accepting)
+                        return true;
+        }
+
+        return false;
+}
+
+static int socket_add_mount_dependencies(Socket *s) {
+        assert(s);
+
+        /* Requires the mounts for the path of every port that has one; returns 0 or the first error. */
+        LIST_FOREACH(port, p, s->ports) {
+                const char *fs_path;
+                int q;
+
+                if (p->type == SOCKET_SOCKET)
+                        fs_path = socket_address_get_path(&p->address);
+                else if (IN_SET(p->type, SOCKET_FIFO, SOCKET_SPECIAL, SOCKET_USB_FUNCTION))
+                        fs_path = p->path;
+                else
+                        continue;
+                if (!fs_path)
+                        continue;
+
+                q = unit_require_mounts_for(UNIT(s), fs_path, UNIT_DEPENDENCY_FILE);
+                if (q < 0)
+                        return q;
+        }
+        return 0;
+}
+
+static int socket_add_device_dependencies(Socket *s) {
+        char *sysfs_path;
+
+        assert(s);
+
+        /* No device dependency is added if no device is configured, or if it is "lo". */
+        if (!s->bind_to_device || streq(s->bind_to_device, "lo"))
+                return 0;
+        sysfs_path = strjoina("/sys/subsystem/net/devices/", s->bind_to_device);
+        return unit_add_node_dependency(UNIT(s), sysfs_path, UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE);
+}
+
+static int socket_add_default_dependencies(Socket *s) {
+        Unit *u = UNIT(s);
+        int q;
+
+        assert(s);
+        /* Skipped entirely if default dependencies are turned off for the unit. */
+        if (!u->default_dependencies)
+                return 0;
+
+        q = unit_add_dependency_by_name(u, UNIT_BEFORE, SPECIAL_SOCKETS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (q < 0)
+                return q;
+        if (MANAGER_IS_SYSTEM(u->manager)) {
+                q = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+                if (q < 0)
+                        return q;
+        }
+        return unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static bool socket_has_exec(Socket *s) {
+        assert(s);
+
+        /* True if at least one of the exec command slots is populated. */
+        for (unsigned idx = 0; idx < _SOCKET_EXEC_COMMAND_MAX; idx++)
+                if (s->exec_command[idx])
+                        return true;
+
+        return false;
+}
+
+static int socket_add_extras(Socket *s) {
+        Unit *u = UNIT(s);
+        int r;
+        /* Fills in defaults and implicit dependencies; returns 0, or the first negative error encountered. */
+        assert(s);
+
+        /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit
+         * in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept()
+         * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly
+         * and reliably. This is different for Accept=no, where the spawned service has to take the incoming traffic
+         * off the queues, which it might not necessarily do. Moreover, Accept=no services are supposed to
+         * process whatever is queued in one go, and thus should normally never have to be started frequently. This is
+         * different for Accept=yes where each connection is processed by a new service instance, and thus frequent
+         * service starts are typical.
+         *
+         * For the poll limit we follow a similar rule, but use 3/4th of the trigger limit parameters, to
+         * trigger this earlier. */
+
+        if (s->trigger_limit.interval == USEC_INFINITY)
+                s->trigger_limit.interval = 2 * USEC_PER_SEC;
+        if (s->trigger_limit.burst == UINT_MAX)
+                s->trigger_limit.burst = s->accept ? 200 : 20;
+
+        if (s->poll_limit_interval == USEC_INFINITY)
+                s->poll_limit_interval = 2 * USEC_PER_SEC;
+        if (s->poll_limit_burst == UINT_MAX)
+                s->poll_limit_burst = s->accept ? 150 : 15;
+        /* With a non-accepting port, make sure a service is referenced, ordered after us and triggered by us. */
+        if (have_non_accept_socket(s)) {
+
+                if (!UNIT_DEREF(s->service)) {
+                        Unit *x;
+
+                        r = unit_load_related_unit(u, ".service", &x);
+                        if (r < 0)
+                                return r;
+
+                        unit_ref_set(&s->service, u, x);
+                }
+
+                r = unit_add_two_dependencies(u, UNIT_BEFORE, UNIT_TRIGGERS, UNIT_DEREF(s->service), true, UNIT_DEPENDENCY_IMPLICIT);
+                if (r < 0)
+                        return r;
+        }
+
+        r = socket_add_mount_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = socket_add_device_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = unit_patch_contexts(u);
+        if (r < 0)
+                return r;
+        /* Exec dependencies are only added if at least one command is configured. */
+        if (socket_has_exec(s)) {
+                r = unit_add_exec_dependencies(u, &s->exec_context);
+                if (r < 0)
+                        return r;
+        }
+
+        r = unit_set_default_slice(u);
+        if (r < 0)
+                return r;
+
+        r = socket_add_default_dependencies(s);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static const char *socket_find_symlink_target(Socket *s) {
+        const char *target = NULL;
+
+        /* Looks for the one path among the FIFO and socket ports of the unit.
+         *
+         * Returns that path, or NULL if no such port has a path, or if more than one of them
+         * does. */
+        LIST_FOREACH(port, p, s->ports) {
+                const char *candidate;
+
+                if (p->type == SOCKET_FIFO)
+                        candidate = p->path;
+                else if (p->type == SOCKET_SOCKET)
+                        candidate = socket_address_get_path(&p->address);
+                else
+                        continue;
+
+                /* Ports without a path are not candidates. */
+                if (!candidate)
+                        continue;
+
+                /* A second candidate makes the result ambiguous. */
+                if (target)
+                        return NULL;
+
+                target = candidate;
+        }
+
+        return target;
+}
+
+static int socket_verify(Socket *s) {
+        assert(s);
+        assert(UNIT(s)->load_state == UNIT_LOADED);
+        /* Sanity-checks a loaded socket unit: returns 0 if usable, else the logged synthetic ENOEXEC error. */
+        if (!s->ports)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has no Listen setting (ListenStream=, ListenDatagram=, ListenFIFO=, ...). Refusing.");
+
+        if (s->accept && have_non_accept_socket(s))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit configured for accepting sockets, but sockets are non-accepting. Refusing.");
+        /* The message names the setting by its actual name, MaxConnections=. */
+        if (s->accept && s->max_connections <= 0)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "MaxConnections= setting too small. Refusing.");
+
+        if (s->accept && UNIT_DEREF(s->service))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Explicit service configuration for accepting socket units not supported. Refusing.");
+
+        if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing.");
+        /* Symlinks need exactly one file system node to point to. */
+        if (!strv_isempty(s->symlinks) && !socket_find_symlink_target(s))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has symlinks set but none or more than one node in the file system. Refusing.");
+
+        return 0;
+}
+
+/* Hash function for SocketPeer entries: feeds the family-specific address part (IPv4 address, IPv6
+ * address or vsock CID) into the siphash state. Any other address family is a programming error. */
+static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) {
+        assert(s);
+
+        switch (s->peer.sa.sa_family) {
+
+        case AF_INET:
+                siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state);
+                break;
+
+        case AF_INET6:
+                siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state);
+                break;
+
+        case AF_VSOCK:
+                siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+/* Ordering function for SocketPeer entries: orders by address family first, then by the
+ * family-specific address part (IPv4 address, IPv6 address or vsock CID). */
+static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) {
+        int c;
+
+        c = CMP(x->peer.sa.sa_family, y->peer.sa.sa_family);
+        if (c != 0)
+                return c;
+
+        /* Families are equal from here on, so looking at x's family is sufficient */
+        if (x->peer.sa.sa_family == AF_INET)
+                return memcmp(&x->peer.in.sin_addr, &y->peer.in.sin_addr, sizeof(x->peer.in.sin_addr));
+
+        if (x->peer.sa.sa_family == AF_INET6)
+                return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr));
+
+        if (x->peer.sa.sa_family == AF_VSOCK)
+                return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid);
+
+        assert_not_reached();
+}
+
+/* Hash operations for SocketPeer keys, built from the hash and compare functions above. Passed to
+ * set_ensure_put() for the peers_by_address set in socket_acquire_peer(). */
+DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func);
+
+/* Loads a socket unit that is still in the stub state: reads fragment and drop-ins and, if that left
+ * the unit in the loaded state, adds the extra settings/dependencies and verifies the result. Returns
+ * 0 on success (or when the unit did not end up loaded), negative on failure. */
+static int socket_load(Unit *u) {
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+
+        if (u->load_state == UNIT_LOADED) {
+                /* Freshly loaded unit: add in the extras before verifying it */
+                r = socket_add_extras(s);
+                if (r < 0)
+                        return r;
+
+                return socket_verify(s);
+        }
+
+        return 0;
+}
+
+/* Allocates a new SocketPeer with a reference count of 1 and all other fields zero-initialized.
+ * Returns NULL if the allocation fails. */
+static SocketPeer *socket_peer_new(void) {
+        SocketPeer *peer = new(SocketPeer, 1);
+
+        if (peer)
+                *peer = (SocketPeer) {
+                        .n_ref = 1,
+                };
+
+        return peer;
+}
+
+/* Frees a SocketPeer. If it is attached to a socket, it is first removed from that socket's
+ * peers_by_address set. Returns the result of mfree(), for use in assignments. */
+static SocketPeer *socket_peer_free(SocketPeer *p) {
+        Socket *owner;
+
+        assert(p);
+
+        owner = p->socket;
+        if (owner)
+                set_remove(owner->peers_by_address, p);
+
+        return mfree(p);
+}
+
+/* Generates the reference counting helpers for SocketPeer (socket_peer_ref() is used in
+ * socket_acquire_peer() below), with socket_peer_free() as the destructor. */
+DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free);
+
+/* Looks up (or creates) the SocketPeer object tracking the remote address of the connection fd.
+ *
+ * Returns 1 and a reference in *ret if a peer object was found or newly registered, 0 with *ret set
+ * to NULL if the peer's address family is not one of AF_INET, AF_INET6 or AF_VSOCK, and a negative
+ * errno-style value on failure (after logging). */
+int socket_acquire_peer(Socket *s, int fd, SocketPeer **ret) {
+        _cleanup_(socket_peer_unrefp) SocketPeer *fresh = NULL;
+        SocketPeer key = {
+                .peer_salen = sizeof(union sockaddr_union),
+        };
+        SocketPeer *existing;
+        int r;
+
+        assert(fd >= 0);
+        assert(s);
+        assert(ret);
+
+        if (getpeername(fd, &key.peer.sa, &key.peer_salen) < 0)
+                return log_unit_error_errno(UNIT(s), errno, "getpeername() failed: %m");
+
+        if (!IN_SET(key.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        /* Already known? Then just hand out another reference. */
+        existing = set_get(s->peers_by_address, &key);
+        if (existing) {
+                *ret = socket_peer_ref(existing);
+                return 1;
+        }
+
+        fresh = socket_peer_new();
+        if (!fresh)
+                return log_oom();
+
+        fresh->peer = key.peer;
+        fresh->peer_salen = key.peer_salen;
+
+        r = set_ensure_put(&s->peers_by_address, &peer_address_hash_ops, fresh);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to insert peer info into hash table: %m");
+
+        /* Only link back to the socket once the peer really is in the set, so that
+         * socket_peer_free() does not try to remove an entry that was never added. */
+        fresh->socket = s;
+
+        *ret = TAKE_PTR(fresh);
+        return 1;
+}
+
+/* Maps an address family/socket type pair to the name of the Listen*= setting it is dumped under.
+ * AF_NETLINK always maps to "ListenNetlink"; otherwise the socket type decides. Any type other than
+ * stream, datagram or sequential packet is a programming error. */
+static const char* listen_lookup(int family, int type) {
+
+        if (family == AF_NETLINK)
+                return "ListenNetlink";
+
+        switch (type) {
+
+        case SOCK_STREAM:
+                return "ListenStream";
+
+        case SOCK_DGRAM:
+                return "ListenDatagram";
+
+        case SOCK_SEQPACKET:
+                return "ListenSequentialPacket";
+
+        default:
+                break;
+        }
+
+        assert_not_reached();
+        return NULL;
+}
+
+/* Writes a textual dump of the socket unit's state and settings to f, one "Key: value" line per item.
+ * Every line starts with prefix (which is passed through strempty() first); the exec command lists
+ * are written with prefix plus a tab. */
+static void socket_dump(Unit *u, FILE *f, const char *prefix) {
+        Socket *s = SOCKET(u);
+        const char *prefix2, *str;
+
+        assert(s);
+        assert(f);
+
+        prefix = strempty(prefix);
+        prefix2 = strjoina(prefix, "\t");
+
+        /* Items that are always shown. The argument list below must stay in the same order as the
+         * format lines. */
+        fprintf(f,
+                "%sSocket State: %s\n"
+                "%sResult: %s\n"
+                "%sClean Result: %s\n"
+                "%sBindIPv6Only: %s\n"
+                "%sBacklog: %u\n"
+                "%sSocketMode: %04o\n"
+                "%sDirectoryMode: %04o\n"
+                "%sKeepAlive: %s\n"
+                "%sNoDelay: %s\n"
+                "%sFreeBind: %s\n"
+                "%sTransparent: %s\n"
+                "%sBroadcast: %s\n"
+                "%sPassCredentials: %s\n"
+                "%sPassSecurity: %s\n"
+                "%sPassPacketInfo: %s\n"
+                "%sTCPCongestion: %s\n"
+                "%sRemoveOnStop: %s\n"
+                "%sWritable: %s\n"
+                "%sFileDescriptorName: %s\n"
+                "%sSELinuxContextFromNet: %s\n",
+                prefix, socket_state_to_string(s->state),
+                prefix, socket_result_to_string(s->result),
+                prefix, socket_result_to_string(s->clean_result),
+                prefix, socket_address_bind_ipv6_only_to_string(s->bind_ipv6_only),
+                prefix, s->backlog,
+                prefix, s->socket_mode,
+                prefix, s->directory_mode,
+                prefix, yes_no(s->keep_alive),
+                prefix, yes_no(s->no_delay),
+                prefix, yes_no(s->free_bind),
+                prefix, yes_no(s->transparent),
+                prefix, yes_no(s->broadcast),
+                prefix, yes_no(s->pass_cred),
+                prefix, yes_no(s->pass_sec),
+                prefix, yes_no(s->pass_pktinfo),
+                prefix, strna(s->tcp_congestion),
+                prefix, yes_no(s->remove_on_stop),
+                prefix, yes_no(s->writable),
+                prefix, socket_fdname(s),
+                prefix, yes_no(s->selinux_context_from_net));
+
+        /* From here on: items that are only shown if the respective check passes */
+        if (s->timestamping != SOCKET_TIMESTAMPING_OFF)
+                fprintf(f,
+                        "%sTimestamping: %s\n",
+                        prefix, socket_timestamping_to_string(s->timestamping));
+
+        if (pidref_is_set(&s->control_pid))
+                fprintf(f,
+                        "%sControl PID: "PID_FMT"\n",
+                        prefix, s->control_pid.pid);
+
+        if (s->bind_to_device)
+                fprintf(f,
+                        "%sBindToDevice: %s\n",
+                        prefix, s->bind_to_device);
+
+        /* Connection counters are shown when s->accept is set, FlushPending otherwise */
+        if (s->accept)
+                fprintf(f,
+                        "%sAccepted: %u\n"
+                        "%sNConnections: %u\n"
+                        "%sMaxConnections: %u\n"
+                        "%sMaxConnectionsPerSource: %u\n",
+                        prefix, s->n_accepted,
+                        prefix, s->n_connections,
+                        prefix, s->max_connections,
+                        prefix, s->max_connections_per_source);
+        else
+                fprintf(f,
+                        "%sFlushPending: %s\n",
+                         prefix, yes_no(s->flush_pending));
+
+
+        if (s->priority >= 0)
+                fprintf(f,
+                        "%sPriority: %i\n",
+                        prefix, s->priority);
+
+        if (s->receive_buffer > 0)
+                fprintf(f,
+                        "%sReceiveBuffer: %zu\n",
+                        prefix, s->receive_buffer);
+
+        if (s->send_buffer > 0)
+                fprintf(f,
+                        "%sSendBuffer: %zu\n",
+                        prefix, s->send_buffer);
+
+        if (s->ip_tos >= 0)
+                fprintf(f,
+                        "%sIPTOS: %i\n",
+                        prefix, s->ip_tos);
+
+        if (s->ip_ttl >= 0)
+                fprintf(f,
+                        "%sIPTTL: %i\n",
+                        prefix, s->ip_ttl);
+
+        if (s->pipe_size > 0)
+                fprintf(f,
+                        "%sPipeSize: %zu\n",
+                        prefix, s->pipe_size);
+
+        if (s->mark >= 0)
+                fprintf(f,
+                        "%sMark: %i\n",
+                        prefix, s->mark);
+
+        if (s->mq_maxmsg > 0)
+                fprintf(f,
+                        "%sMessageQueueMaxMessages: %li\n",
+                        prefix, s->mq_maxmsg);
+
+        if (s->mq_msgsize > 0)
+                fprintf(f,
+                        "%sMessageQueueMessageSize: %li\n",
+                        prefix, s->mq_msgsize);
+
+        if (s->reuse_port)
+                fprintf(f,
+                        "%sReusePort: %s\n",
+                         prefix, yes_no(s->reuse_port));
+
+        if (s->smack)
+                fprintf(f,
+                        "%sSmackLabel: %s\n",
+                        prefix, s->smack);
+
+        if (s->smack_ip_in)
+                fprintf(f,
+                        "%sSmackLabelIPIn: %s\n",
+                        prefix, s->smack_ip_in);
+
+        if (s->smack_ip_out)
+                fprintf(f,
+                        "%sSmackLabelIPOut: %s\n",
+                        prefix, s->smack_ip_out);
+
+        /* User and group are shown together as soon as either of them is non-empty */
+        if (!isempty(s->user) || !isempty(s->group))
+                fprintf(f,
+                        "%sSocketUser: %s\n"
+                        "%sSocketGroup: %s\n",
+                        prefix, strna(s->user),
+                        prefix, strna(s->group));
+
+        if (timestamp_is_set(s->keep_alive_time))
+                fprintf(f,
+                        "%sKeepAliveTimeSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->keep_alive_time, USEC_PER_SEC));
+
+        if (s->keep_alive_interval > 0)
+                fprintf(f,
+                        "%sKeepAliveIntervalSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->keep_alive_interval, USEC_PER_SEC));
+
+        if (s->keep_alive_cnt > 0)
+                fprintf(f,
+                        "%sKeepAliveProbes: %u\n",
+                        prefix, s->keep_alive_cnt);
+
+        if (s->defer_accept > 0)
+                fprintf(f,
+                        "%sDeferAcceptSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->defer_accept, USEC_PER_SEC));
+
+        /* One line per configured port, keyed by the Listen*= name matching its type */
+        LIST_FOREACH(port, p, s->ports) {
+
+                switch (p->type) {
+                case SOCKET_SOCKET: {
+                        _cleanup_free_ char *k = NULL;
+                        int r;
+
+                        r = socket_address_print(&p->address, &k);
+                        if (r < 0) {
+                                /* Store the error in errno so that %m below prints it */
+                                errno = -r;
+                                fprintf(f, "%s%s: %m\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type));
+                        } else
+                                fprintf(f, "%s%s: %s\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type), k);
+                        break;
+                }
+                case SOCKET_SPECIAL:
+                        fprintf(f, "%sListenSpecial: %s\n", prefix, p->path);
+                        break;
+                case SOCKET_USB_FUNCTION:
+                        fprintf(f, "%sListenUSBFunction: %s\n", prefix, p->path);
+                        break;
+                case SOCKET_MQUEUE:
+                        fprintf(f, "%sListenMessageQueue: %s\n", prefix, p->path);
+                        break;
+                default:
+                        /* Every remaining port type is dumped as a FIFO */
+                        fprintf(f, "%sListenFIFO: %s\n", prefix, p->path);
+                }
+        }
+
+        fprintf(f,
+                "%sTriggerLimitIntervalSec: %s\n"
+                "%sTriggerLimitBurst: %u\n"
+                "%sPollLimitIntervalSec: %s\n"
+                "%sPollLimitBurst: %u\n",
+                prefix, FORMAT_TIMESPAN(s->trigger_limit.interval, USEC_PER_SEC),
+                prefix, s->trigger_limit.burst,
+                prefix, FORMAT_TIMESPAN(s->poll_limit_interval, USEC_PER_SEC),
+                prefix, s->poll_limit_burst);
+
+        /* The protocol is only shown if ip_protocol_to_name() returns a name for it */
+        str = ip_protocol_to_name(s->socket_protocol);
+        if (str)
+                fprintf(f, "%sSocketProtocol: %s\n", prefix, str);
+
+        /* All symlinks go on a single line, separated by spaces */
+        if (!strv_isempty(s->symlinks)) {
+                fprintf(f, "%sSymlinks:", prefix);
+                STRV_FOREACH(q, s->symlinks)
+                        fprintf(f, " %s", *q);
+
+                fprintf(f, "\n");
+        }
+
+        fprintf(f,
+                "%sTimeoutSec: %s\n",
+                prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC));
+
+        exec_context_dump(&s->exec_context, f, prefix);
+        kill_context_dump(&s->kill_context, f, prefix);
+
+        /* Dump each non-empty exec command list under a "-> <name>:" heading, using the tab-extended
+         * prefix for the list itself */
+        for (SocketExecCommand c = 0; c < _SOCKET_EXEC_COMMAND_MAX; c++) {
+                if (!s->exec_command[c])
+                        continue;
+
+                fprintf(f, "%s-> %s:\n",
+                        prefix, socket_exec_command_to_string(c));
+
+                exec_command_dump_list(s->exec_command[c], f, prefix2);
+        }
+
+        cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+/* Builds an instance name string for the connection fd, made up of the counter nr plus the local and
+ * remote endpoints (or, for AF_UNIX, the peer's credentials).
+ *
+ * On success returns 0 and stores the string allocated by asprintf() in *instance. Returns a negative
+ * errno-style value if getsockname(), getpeername() or getpeercred() fail, or -ENOMEM if asprintf()
+ * fails. Address families other than AF_INET, AF_INET6, AF_UNIX and AF_VSOCK are a programming
+ * error. */
+static int instance_from_socket(int fd, unsigned nr, char **instance) {
+        socklen_t l;
+        char *r;
+        union sockaddr_union local, remote;
+
+        assert(fd >= 0);
+        assert(instance);
+
+        l = sizeof(local);
+        if (getsockname(fd, &local.sa, &l) < 0)
+                return -errno;
+
+        l = sizeof(remote);
+        if (getpeername(fd, &remote.sa, &l) < 0)
+                return -errno;
+
+        /* The format is chosen by the address family of the local end */
+        switch (local.sa.sa_family) {
+
+        case AF_INET: {
+                /* Convert to host byte order so that the octets can be extracted by shifting */
+                uint32_t
+                        a = be32toh(local.in.sin_addr.s_addr),
+                        b = be32toh(remote.in.sin_addr.s_addr);
+
+                if (asprintf(&r,
+                             "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u",
+                             nr,
+                             a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF,
+                             be16toh(local.in.sin_port),
+                             b >> 24, (b >> 16) & 0xFF, (b >> 8) & 0xFF, b & 0xFF,
+                             be16toh(remote.in.sin_port)) < 0)
+                        return -ENOMEM;
+
+                break;
+        }
+
+        case AF_INET6: {
+                /* First 12 bytes of an IPv4-mapped IPv6 address (::ffff:a.b.c.d) */
+                static const unsigned char ipv4_prefix[] = {
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF
+                };
+
+                /* If both ends carry that prefix, print the trailing four bytes in dotted-quad
+                 * notation, i.e. in the same format as the AF_INET case above */
+                if (memcmp(&local.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0 &&
+                    memcmp(&remote.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) {
+                        const uint8_t
+                                *a = local.in6.sin6_addr.s6_addr+12,
+                                *b = remote.in6.sin6_addr.s6_addr+12;
+
+                        if (asprintf(&r,
+                                     "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u",
+                                     nr,
+                                     a[0], a[1], a[2], a[3],
+                                     be16toh(local.in6.sin6_port),
+                                     b[0], b[1], b[2], b[3],
+                                     be16toh(remote.in6.sin6_port)) < 0)
+                                return -ENOMEM;
+                } else {
+                        if (asprintf(&r,
+                                     "%u-%s:%u-%s:%u",
+                                     nr,
+                                     IN6_ADDR_TO_STRING(&local.in6.sin6_addr),
+                                     be16toh(local.in6.sin6_port),
+                                     IN6_ADDR_TO_STRING(&remote.in6.sin6_addr),
+                                     be16toh(remote.in6.sin6_port)) < 0)
+                                return -ENOMEM;
+                }
+
+                break;
+        }
+
+        case AF_UNIX: {
+                struct ucred ucred;
+                int k;
+
+                /* AF_UNIX instances are named after the peer's PID and UID rather than an address */
+                k = getpeercred(fd, &ucred);
+                if (k >= 0) {
+                        if (asprintf(&r,
+                                     "%u-"PID_FMT"-"UID_FMT,
+                                     nr, ucred.pid, ucred.uid) < 0)
+                                return -ENOMEM;
+                } else if (k == -ENODATA) {
+                        /* This handles the case where somebody is
+                         * connecting from another pid/uid namespace
+                         * (e.g. from outside of our container). */
+                        if (asprintf(&r,
+                                     "%u-unknown",
+                                     nr) < 0)
+                                return -ENOMEM;
+                } else
+                        return k;
+
+                break;
+        }
+
+        case AF_VSOCK:
+                /* Formatted as CID:port for both ends */
+                if (asprintf(&r,
+                             "%u-%u:%u-%u:%u",
+                             nr,
+                             local.vm.svm_cid, local.vm.svm_port,
+                             remote.vm.svm_cid, remote.vm.svm_port) < 0)
+                        return -ENOMEM;
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        *instance = r;
+        return 0;
+}
+
+/* Releases the event source, the listening fd and the auxiliary fd list of every port. If
+ * s->remove_on_stop is set, the file system objects (FIFOs, message queues, sockets) of ports that
+ * were open, and all configured symlinks, are removed as well. The Socket object itself is left
+ * allocated. */
+static void socket_close_fds(Socket *s) {
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+                bool had_fd = p->fd >= 0;
+
+                p->event_source = sd_event_source_disable_unref(p->event_source);
+                p->fd = safe_close(p->fd);
+                socket_cleanup_fd_list(p);
+
+                /* Normally nothing is deleted from the file system at this point: some process we
+                 * spawned might still hold a reference to the fd and want to keep using it. That is
+                 * why sockets are usually removed right before a new one is created rather than after
+                 * we stop using one. Only if the user explicitly asked for it do we delete here, and
+                 * only for ports that actually were open. */
+
+                if (had_fd && s->remove_on_stop) {
+                        if (p->type == SOCKET_FIFO)
+                                (void) unlink(p->path);
+                        else if (p->type == SOCKET_MQUEUE)
+                                (void) mq_unlink(p->path);
+                        else if (p->type == SOCKET_SOCKET)
+                                (void) socket_address_unlink(&p->address);
+                }
+        }
+
+        if (!s->remove_on_stop)
+                return;
+
+        STRV_FOREACH(i, s->symlinks)
+                (void) unlink(*i);
+}
+
+/* Applies the socket options configured on the unit s to the socket fd belonging to port p. Each
+ * option is only touched if the corresponding setting is enabled/set. Failures are logged and
+ * otherwise ignored; nothing is reported back to the caller. */
+static void socket_apply_socket_options(Socket *s, SocketPort *p, int fd) {
+        int r;
+
+        assert(s);
+        assert(p);
+        assert(fd >= 0);
+
+        if (s->keep_alive) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, true);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "SO_KEEPALIVE failed: %m");
+        }
+
+        /* The time-based settings below are divided by USEC_PER_SEC before being handed over */
+        if (timestamp_is_set(s->keep_alive_time)) {
+                r = setsockopt_int(fd, SOL_TCP, TCP_KEEPIDLE, s->keep_alive_time / USEC_PER_SEC);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "TCP_KEEPIDLE failed: %m");
+        }
+
+        if (s->keep_alive_interval > 0) {
+                r = setsockopt_int(fd, SOL_TCP, TCP_KEEPINTVL, s->keep_alive_interval / USEC_PER_SEC);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "TCP_KEEPINTVL failed: %m");
+        }
+
+        if (s->keep_alive_cnt > 0) {
+                r = setsockopt_int(fd, SOL_TCP, TCP_KEEPCNT, s->keep_alive_cnt);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "TCP_KEEPCNT failed: %m");
+        }
+
+        if (s->defer_accept > 0) {
+                r = setsockopt_int(fd, SOL_TCP, TCP_DEFER_ACCEPT, s->defer_accept / USEC_PER_SEC);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "TCP_DEFER_ACCEPT failed: %m");
+        }
+
+        /* NoDelay uses SCTP_NODELAY if the configured protocol is SCTP, and TCP_NODELAY otherwise */
+        if (s->no_delay) {
+                if (s->socket_protocol == IPPROTO_SCTP) {
+                        r = setsockopt_int(fd, SOL_SCTP, SCTP_NODELAY, true);
+                        if (r < 0)
+                                log_unit_warning_errno(UNIT(s), r, "SCTP_NODELAY failed: %m");
+                } else {
+                        r = setsockopt_int(fd, SOL_TCP, TCP_NODELAY, true);
+                        if (r < 0)
+                                log_unit_warning_errno(UNIT(s), r, "TCP_NODELAY failed: %m");
+                }
+        }
+
+        if (s->broadcast) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_BROADCAST, true);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "SO_BROADCAST failed: %m");
+        }
+
+        if (s->pass_cred) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "SO_PASSCRED failed: %m");
+        }
+
+        if (s->pass_sec) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_PASSSEC, true);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "SO_PASSSEC failed: %m");
+        }
+
+        /* This helper is passed the port's address family in addition to the fd */
+        if (s->pass_pktinfo) {
+                r = socket_set_recvpktinfo(fd, socket_address_family(&p->address), true);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "Failed to enable packet info socket option: %m");
+        }
+
+        /* Nanosecond timestamping selects SO_TIMESTAMPNS, any other enabled mode SO_TIMESTAMP */
+        if (s->timestamping != SOCKET_TIMESTAMPING_OFF) {
+                r = setsockopt_int(fd, SOL_SOCKET,
+                                   s->timestamping == SOCKET_TIMESTAMPING_NS ? SO_TIMESTAMPNS : SO_TIMESTAMP,
+                                   true);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "Failed to enable timestamping socket option, ignoring: %m");
+        }
+
+        if (s->priority >= 0) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_PRIORITY, s->priority);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "SO_PRIORITY failed: %m");
+        }
+
+        /* Buffer size failures are logged at debug level if ERRNO_IS_PRIVILEGE() matches the error,
+         * and at warning level otherwise */
+        if (s->receive_buffer > 0) {
+                r = fd_set_rcvbuf(fd, s->receive_buffer, false);
+                if (r < 0)
+                        log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                            "SO_RCVBUF/SO_RCVBUFFORCE failed: %m");
+        }
+
+        if (s->send_buffer > 0) {
+                r = fd_set_sndbuf(fd, s->send_buffer, false);
+                if (r < 0)
+                        log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                            "SO_SNDBUF/SO_SNDBUFFORCE failed: %m");
+        }
+
+        if (s->mark >= 0) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_MARK, s->mark);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "SO_MARK failed: %m");
+        }
+
+        if (s->ip_tos >= 0) {
+                r = setsockopt_int(fd, IPPROTO_IP, IP_TOS, s->ip_tos);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "IP_TOS failed: %m");
+        }
+
+        if (s->ip_ttl >= 0) {
+                r = socket_set_ttl(fd, socket_address_family(&p->address), s->ip_ttl);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "IP_TTL/IPV6_UNICAST_HOPS failed: %m");
+        }
+
+        /* The algorithm name is passed as a string; +1 so that the trailing NUL is included */
+        if (s->tcp_congestion)
+                if (setsockopt(fd, SOL_TCP, TCP_CONGESTION, s->tcp_congestion, strlen(s->tcp_congestion)+1) < 0)
+                        log_unit_warning_errno(UNIT(s), errno, "TCP_CONGESTION failed: %m");
+
+        /* SMACK label failures are logged at error level, but execution continues all the same */
+        if (s->smack_ip_in) {
+                r = mac_smack_apply_fd(fd, SMACK_ATTR_IPIN, s->smack_ip_in);
+                if (r < 0)
+                        log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_in_fd: %m");
+        }
+
+        if (s->smack_ip_out) {
+                r = mac_smack_apply_fd(fd, SMACK_ATTR_IPOUT, s->smack_ip_out);
+                if (r < 0)
+                        log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_out_fd: %m");
+        }
+}
+
+/* Applies the FIFO related settings of the unit (pipe size, SMACK access label) to fd. Failures are
+ * logged only; nothing is reported back to the caller. */
+static void socket_apply_fifo_options(Socket *s, int fd) {
+        assert(s);
+        assert(fd >= 0);
+
+        if (s->pipe_size > 0 && fcntl(fd, F_SETPIPE_SZ, s->pipe_size) < 0)
+                log_unit_warning_errno(UNIT(s), errno, "Setting pipe size failed, ignoring: %m");
+
+        if (s->smack) {
+                int r;
+
+                r = mac_smack_apply_fd(fd, SMACK_ATTR_ACCESS, s->smack);
+                if (r < 0)
+                        log_unit_error_errno(UNIT(s), r, "SMACK relabelling failed, ignoring: %m");
+        }
+}
+
+/* Creates the FIFO at path (unless one exists already) and opens it.
+ *
+ * Parent directories are created with directory_mode on a best-effort basis. The FIFO is created with
+ * socket_mode, additionally restricted by the umask in effect on entry. An existing node is only
+ * accepted if it is a FIFO with exactly the expected access mode that is owned by our UID and GID.
+ *
+ * Returns the open file descriptor on success, a negative errno-style value on failure (-EEXIST if
+ * the node does not match expectations). */
+static int fifo_address_create(
+                const char *path,
+                mode_t directory_mode,
+                mode_t socket_mode) {
+
+        _cleanup_close_ int fifo_fd = -EBADF;
+        mode_t saved_umask;
+        struct stat st;
+        int r;
+
+        assert(path);
+
+        (void) mkdir_parents_label(path, directory_mode);
+
+        r = mac_selinux_create_file_prepare(path, S_IFIFO);
+        if (r < 0)
+                return r;
+
+        /* The first umask() call only serves to learn the current mask; the second installs the
+         * mask we really want: everything outside of socket_mode, plus whatever was masked before. */
+        saved_umask = umask(~socket_mode);
+        (void) umask(~socket_mode | saved_umask);
+
+        r = mkfifo(path, socket_mode);
+        (void) umask(saved_umask);
+
+        /* A node that exists already is fine for now, it is validated below */
+        if (r < 0 && errno != EEXIST) {
+                r = -errno;
+                goto fail;
+        }
+
+        fifo_fd = open(path, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK | O_NOFOLLOW);
+        if (fifo_fd < 0) {
+                r = -errno;
+                goto fail;
+        }
+
+        mac_selinux_create_file_clear();
+
+        if (fstat(fifo_fd, &st) < 0) {
+                r = -errno;
+                goto fail;
+        }
+
+        if (S_ISFIFO(st.st_mode) &&
+            (st.st_mode & 0777) == (socket_mode & ~saved_umask) &&
+            st.st_uid == getuid() &&
+            st.st_gid == getgid())
+                return TAKE_FD(fifo_fd);
+
+        r = -EEXIST;
+
+fail:
+        mac_selinux_create_file_clear();
+        return r;
+}
+
+/* Opens the special file at path, read-write if writable is true and read-only otherwise. Only
+ * regular files and character devices are accepted. Returns the open file descriptor on success, a
+ * negative errno-style value on failure (-EEXIST for any other node type). */
+static int special_address_create(const char *path, bool writable) {
+        _cleanup_close_ int fd = -EBADF;
+        struct stat st;
+        int access_mode;
+
+        assert(path);
+
+        access_mode = writable ? O_RDWR : O_RDONLY;
+
+        fd = open(path, access_mode|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW);
+        if (fd < 0)
+                return -errno;
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        /* Accept /proc, /sys or /dev files and character devices, refuse everything else */
+        if (S_ISREG(st.st_mode) || S_ISCHR(st.st_mode))
+                return TAKE_FD(fd);
+
+        return -EEXIST;
+}
+
+/* Opens the USB FunctionFS endpoint file at path read-write. Only regular files are accepted. Returns
+ * the open file descriptor on success, a negative errno-style value on failure (-EEXIST if the node
+ * is not a regular file). */
+static int usbffs_address_create(const char *path) {
+        _cleanup_close_ int fd = -EBADF;
+        struct stat st;
+
+        assert(path);
+
+        fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW);
+        if (fd < 0)
+                return -errno;
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        /* FunctionFS endpoints show up as regular files */
+        if (S_ISREG(st.st_mode))
+                return TAKE_FD(fd);
+
+        return -EEXIST;
+}
+
+/* Opens the POSIX message queue named path for reading, creating it if necessary.
+ *
+ * The queue is created with mq_mode, additionally restricted by the umask in effect on entry.
+ * Explicit queue attributes are only passed to mq_open() if both maxmsg and msgsize are positive. A
+ * queue is only accepted if it has exactly the expected access mode and is owned by our UID and GID.
+ *
+ * Returns the open descriptor on success, a negative errno-style value on failure (-EEXIST if the
+ * queue does not match expectations). */
+static int mq_address_create(
+                const char *path,
+                mode_t mq_mode,
+                long maxmsg,
+                long msgsize) {
+
+        _cleanup_close_ int fd = -EBADF;
+        struct mq_attr attr_storage, *attr = NULL;
+        mode_t saved_umask;
+        struct stat st;
+
+        assert(path);
+
+        if (maxmsg > 0 && msgsize > 0) {
+                attr_storage = (struct mq_attr) {
+                        .mq_flags = O_NONBLOCK,
+                        .mq_maxmsg = maxmsg,
+                        .mq_msgsize = msgsize,
+                };
+                attr = &attr_storage;
+        }
+
+        /* The first umask() call only serves to learn the current mask; the second installs the
+         * mask we really want: everything outside of mq_mode, plus whatever was masked before. */
+        saved_umask = umask(~mq_mode);
+        (void) umask(~mq_mode | saved_umask);
+
+        fd = mq_open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_CREAT, mq_mode, attr);
+        (void) umask(saved_umask);
+
+        if (fd < 0)
+                return -errno;
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        if ((st.st_mode & 0777) == (mq_mode & ~saved_umask) &&
+            st.st_uid == getuid() &&
+            st.st_gid == getgid())
+                return TAKE_FD(fd);
+
+        return -EEXIST;
+}
+
+/* Creates the symlinks configured for the unit, all pointing to the node determined by
+ * socket_find_symlink_target(). Nothing is done if there is no such node. Failures to create
+ * individual symlinks are logged and ignored, hence this always returns 0. */
+static int socket_symlink(Socket *s) {
+        const char *target;
+
+        assert(s);
+
+        target = socket_find_symlink_target(s);
+        if (!target)
+                return 0;
+
+        STRV_FOREACH(linkpath, s->symlinks) {
+                int r;
+
+                (void) mkdir_parents_label(*linkpath, s->directory_mode);
+
+                r = symlink_idempotent(target, *linkpath, false);
+
+                /* Something else is already in the way. In the destructive RemoveOnStop= mode we might
+                 * as well remove what is there and try once more. */
+                if (r == -EEXIST && s->remove_on_stop && unlink(*linkpath) >= 0)
+                        r = symlink_idempotent(target, *linkpath, false);
+
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "Failed to create symlink %s %s %s, ignoring: %m",
+                                               target, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), *linkpath);
+        }
+
+        return 0;
+}
+
+/* Copies the USB function descriptors file and then the USB function strings file of service s into
+ * fd, in that order. Returns -EINVAL if either file is not configured, otherwise the result of the
+ * copy operations. */
+static int usbffs_write_descs(int fd, Service *s) {
+        int r;
+
+        if (!(s->usb_function_descriptors && s->usb_function_strings))
+                return -EINVAL;
+
+        /* Descriptors go first, the strings follow */
+        r = copy_file_fd(s->usb_function_descriptors, fd, 0);
+        if (r < 0)
+                return r;
+
+        r = copy_file_fd(s->usb_function_strings, fd, 0);
+        return r;
+}
+
+/* scandir() filter used by usbffs_dispatch_eps(): selects every directory entry whose name neither
+ * begins with a dot nor equals "ep0". */
+static int usbffs_select_ep(const struct dirent *d) {
+        if (d->d_name[0] == '.')
+                return 0;
+
+        return !streq(d->d_name, "ep0");
+}
+
+/* Opens every endpoint file in the directory p->path that passes usbffs_select_ep(), in alphasort()
+ * order, and stores the resulting fds in p->auxiliary_fds / p->n_auxiliary_fds.
+ *
+ * Returns 0 on success. On failure a negative errno-style value is returned; if opening an endpoint
+ * fails, all fds opened so far are closed again and the auxiliary fd array is released. */
+static int usbffs_dispatch_eps(SocketPort *p) {
+        _cleanup_free_ struct dirent **entries = NULL;
+        size_t n_entries, n_opened = 0;
+        int r;
+
+        r = scandir(p->path, &entries, usbffs_select_ep, alphasort);
+        if (r < 0)
+                return -errno;
+
+        n_entries = (size_t) r;
+
+        p->auxiliary_fds = new(int, n_entries);
+        if (!p->auxiliary_fds) {
+                r = -ENOMEM;
+                goto clear;
+        }
+
+        p->n_auxiliary_fds = n_entries;
+
+        for (size_t i = 0; i < n_entries; i++) {
+                _cleanup_free_ char *ep = NULL;
+
+                ep = path_make_absolute(entries[i]->d_name, p->path);
+                if (!ep) {
+                        r = -ENOMEM;
+                        goto fail;
+                }
+
+                path_simplify(ep);
+
+                r = usbffs_address_create(ep);
+                if (r < 0)
+                        goto fail;
+
+                p->auxiliary_fds[n_opened++] = r;
+        }
+
+        r = 0;
+        goto clear;
+
+fail:
+        /* Undo what we did so far: only the first n_opened slots contain valid fds */
+        close_many(p->auxiliary_fds, n_opened);
+        p->auxiliary_fds = mfree(p->auxiliary_fds);
+        p->n_auxiliary_fds = 0;
+
+clear:
+        /* The individual entries are freed here, the array itself by _cleanup_free_ */
+        free_many((void**) entries, n_entries);
+        return r;
+}
+
+/* Determines the service unit that handles connections on socket 's' and stores it in *ret.
+ *
+ * Returns 0 if a service is already associated with the socket, -ENODATA if none is associated and the
+ * socket is not an accepting one, a negative errno-style value if building the instance name fails, and
+ * otherwise whatever manager_load_unit() returns for the generated unit name. */
+int socket_load_service_unit(Socket *s, int cfd, Unit **ret) {
+        /* Figure out what the unit that will be used to handle the connections on the socket looks like.
+         *
+         * If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake
+         * instance name.
+         */
+
+        /* Both pointers are dereferenced unconditionally below, hence catch programming errors early, the
+         * same way the other functions in this file do. */
+        assert(s);
+        assert(ret);
+
+        if (UNIT_ISSET(s->service)) {
+                *ret = UNIT_DEREF(s->service);
+                return 0;
+        }
+
+        if (!s->accept)
+                return -ENODATA;
+
+        /* Build the instance name and load the unit */
+        _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL;
+        int r;
+
+        r = unit_name_to_prefix(UNIT(s)->id, &prefix);
+        if (r < 0)
+                return r;
+
+        if (cfd >= 0) {
+                r = instance_from_socket(cfd, s->n_accepted, &instance);
+                if (r < 0) {
+                        if (ERRNO_IS_DISCONNECT(r))
+                                /* ENOTCONN is legitimate if TCP RST was received. Other socket families might return
+                                 * different errors. This connection is over, but the socket unit lives on. */
+                                return log_unit_debug_errno(UNIT(s), r,
+                                                            "Got %s on incoming socket, assuming aborted connection attempt, ignoring.",
+                                                            errno_to_name(r));
+                        return r;
+                }
+        }
+
+        /* For accepting sockets, we don't know how the instance will be called until we get a connection and
+         * can figure out what the peer name is. So let's use "internal" as the instance to make it clear
+         * that this is not an actual peer name. We use "unknown" when we cannot figure out the peer. */
+        r = unit_name_build(prefix, instance ?: "internal", ".service", &name);
+        if (r < 0)
+                return r;
+
+        return manager_load_unit(UNIT(s)->manager, name, NULL, NULL, ret);
+}
+
+/* Determines the SELinux label to use for the sockets of 's' and returns it in *ret (NULL if there is
+ * none). A label configured in the service's exec context takes precedence; otherwise it is derived from
+ * the executable of the service's SERVICE_EXEC_START command. Returns 0 on success (also when no label
+ * could be determined), negative errno-style on failure. */
+static int socket_determine_selinux_label(Socket *s, char **ret) {
+        _cleanup_free_ char *exe = NULL;
+        const char *configured;
+        ExecCommand *start;
+        Unit *service;
+        int r;
+
+        assert(s);
+        assert(ret);
+
+        r = socket_load_service_unit(s, -1, &service);
+        if (r == -ENODATA)
+                goto none;
+        if (r < 0)
+                return r;
+
+        configured = SERVICE(service)->exec_context.selinux_context;
+        if (configured) {
+                /* Hand a copy to the caller */
+                char *copy = strdup(configured);
+
+                if (!copy)
+                        return -ENOMEM;
+
+                *ret = copy;
+                return 0;
+        }
+
+        start = SERVICE(service)->exec_command[SERVICE_EXEC_START];
+        if (!start)
+                goto none;
+
+        /* Failure to resolve the executable is not fatal, we simply go without a label then */
+        r = chase(start->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &exe, NULL);
+        if (r < 0)
+                goto none;
+
+        r = mac_selinux_get_create_label_from_exe(exe, ret);
+        if (r == -EPERM || r == -EOPNOTSUPP)
+                goto none;
+
+        return r;
+
+none:
+        *ret = NULL;
+        return 0;
+}
+
+/* Calls socket_address_listen() for 'address', passing along the listen settings stored in the socket
+ * unit 's' and the given 'label'. Returns whatever socket_address_listen() returns. */
+static int socket_address_listen_do(
+                Socket *s,
+                const SocketAddress *address,
+                const char *label) {
+
+        int fd;
+
+        assert(s);
+        assert(address);
+
+        fd = socket_address_listen(
+                        address,
+                        SOCK_CLOEXEC|SOCK_NONBLOCK,
+                        s->backlog,
+                        s->bind_ipv6_only,
+                        s->bind_to_device,
+                        s->reuse_port,
+                        s->free_bind,
+                        s->transparent,
+                        s->directory_mode,
+                        s->socket_mode,
+                        label);
+
+        return fd;
+}
+
+/* Logs an error for unit 'u' through log_unit_error_errno(). 'fmt' must contain exactly one string
+ * placeholder; it receives the result of socket_address_print() for 'address', passed through strna()
+ * in case printing failed. The expression evaluates to the result of log_unit_error_errno(). */
+#define log_address_error_errno(u, address, error, fmt)          \
+        ({                                                       \
+                _cleanup_free_ char *_t = NULL;                  \
+                                                                 \
+                (void) socket_address_print(address, &_t);       \
+                log_unit_error_errno(u, error, fmt, strna(_t));  \
+        })
+
+/* Decides whether the listening socket for 'address' has to be created from a forked helper process.
+ * Returns > 0 if so, 0 if it can be created in-process, and a negative errno-style value if checking for
+ * BPF firewall support fails. */
+static int fork_needed(const SocketAddress *address, Socket *s) {
+        CGroupContext *cgroup_context;
+        int family, r;
+
+        assert(address);
+        assert(s);
+
+        /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler. */
+
+        /* Any NFTSet= directive with a cgroup source means we need the cgroup */
+        cgroup_context = unit_get_cgroup_context(UNIT(s));
+        if (cgroup_context)
+                FOREACH_ARRAY(set, cgroup_context->nft_set_context.sets, cgroup_context->nft_set_context.n_sets)
+                        if (set->source == NFT_SET_SOURCE_CGROUP)
+                                return true;
+
+        family = address->sockaddr.sa.sa_family;
+        if (family == AF_INET || family == AF_INET6) {
+                r = bpf_firewall_supported();
+                if (r < 0)
+                        return r;
+
+                /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */
+                if (r != BPF_FIREWALL_UNSUPPORTED)
+                        return true;
+        }
+
+        return exec_needs_network_namespace(&s->exec_context);
+}
+
+/* Creates the listening socket for 'address'. Depending on fork_needed() this happens either directly
+ * in this process or in a forked helper that sends the fd back over a socketpair. Returns the listening
+ * fd on success, a negative errno-style value on failure. */
+static int socket_address_listen_in_cgroup(
+                Socket *s,
+                const SocketAddress *address,
+                const char *label) {
+
+        _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+        _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+        int fd, r;
+
+        assert(s);
+        assert(address);
+
+        /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the
+         * socket's cgroup and network namespace in which the socket is actually created. This way we ensure
+         * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and
+         * such. */
+
+        r = fork_needed(address, s);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Shortcut things... */
+                fd = socket_address_listen_do(s, address, label);
+                if (fd < 0)
+                        return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m");
+
+                return fd;
+        }
+
+        r = unit_setup_exec_runtime(UNIT(s));
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m");
+
+        /* If namespace paths are configured and the shared runtime has storage sockets for them, open
+         * the namespaces now, before forking. */
+        if (s->exec_context.network_namespace_path &&
+            s->exec_runtime &&
+            s->exec_runtime->shared &&
+            s->exec_runtime->shared->netns_storage_socket[0] >= 0) {
+                r = open_shareable_ns_path(s->exec_runtime->shared->netns_storage_socket, s->exec_context.network_namespace_path, CLONE_NEWNET);
+                if (r < 0)
+                        return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path);
+        }
+
+        if (s->exec_context.ipc_namespace_path &&
+            s->exec_runtime &&
+            s->exec_runtime->shared &&
+            s->exec_runtime->shared->ipcns_storage_socket[0] >= 0) {
+                r = open_shareable_ns_path(s->exec_runtime->shared->ipcns_storage_socket, s->exec_context.ipc_namespace_path, CLONE_NEWIPC);
+                if (r < 0)
+                        return log_unit_error_errno(UNIT(s), r, "Failed to open IPC namespace path %s: %m", s->exec_context.ipc_namespace_path);
+        }
+
+        /* Channel over which the helper passes the listening fd back to us */
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)
+                return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m");
+
+        r = unit_fork_helper_process(UNIT(s), "(sd-listen)", &pid);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m");
+        if (r == 0) {
+                /* Child */
+
+                pair[0] = safe_close(pair[0]);
+
+                if (exec_needs_network_namespace(&s->exec_context) &&
+                    s->exec_runtime &&
+                    s->exec_runtime->shared &&
+                    s->exec_runtime->shared->netns_storage_socket[0] >= 0) {
+
+                        if (ns_type_supported(NAMESPACE_NET)) {
+                                r = setup_shareable_ns(s->exec_runtime->shared->netns_storage_socket, CLONE_NEWNET);
+                                if (r < 0) {
+                                        log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m");
+                                        _exit(EXIT_NETWORK);
+                                }
+                        } else if (s->exec_context.network_namespace_path) {
+                                /* An explicitly configured namespace path is a hard requirement */
+                                log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported.");
+                                _exit(EXIT_NETWORK);
+                        } else
+                                log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
+                }
+
+                fd = socket_address_listen_do(s, address, label);
+                if (fd < 0) {
+                        log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = send_one_fd(pair[1], fd, 0);
+                if (r < 0) {
+                        log_address_error_errno(UNIT(s), address, r, "Failed to send listening socket (%s) to parent: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent: close the helper's end of the channel, then pick up the fd it sends (if any). The
+         * result is only evaluated after the helper has been reaped below. */
+        pair[1] = safe_close(pair[1]);
+        fd = receive_one_fd(pair[0], 0);
+
+        /* We synchronously wait for the helper, as it shouldn't be slow */
+        r = wait_for_terminate_and_check("(sd-listen)", pid.pid, WAIT_LOG_ABNORMAL);
+        if (r < 0) {
+                /* Don't leak an fd we might have received nonetheless */
+                safe_close(fd);
+                return r;
+        }
+
+        if (fd < 0)
+                return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m");
+
+        return fd;
+}
+
+/* Cleanup helper for socket_open_fds() below; presumably this generates socket_close_fdsp(), which
+ * calls socket_close_fds() on a non-NULL Socket pointer when it goes out of scope. */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(Socket *, socket_close_fds, NULL);
+
+/* Opens a file descriptor for every port of the socket that does not have one yet, according to the
+ * port's type. If anything fails, the cleanup handler attached to 's' runs on return; on success 's' is
+ * reset to NULL first, which disarms it. Returns 0 on success, negative errno-style on failure. */
+static int socket_open_fds(Socket *orig_s) {
+        _cleanup_(socket_close_fdsp) Socket *s = orig_s;
+        _cleanup_(mac_selinux_freep) char *label = NULL;
+        bool know_label = false;
+        int r;
+
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+
+                /* Already open, nothing to do for this port */
+                if (p->fd >= 0)
+                        continue;
+
+                switch (p->type) {
+
+                case SOCKET_SOCKET:
+
+                        if (!know_label) {
+                                /* Figure out the label, if we don't know it yet. We do it once for the first
+                                 * socket where we need this and remember it for the rest. */
+
+                                r = socket_determine_selinux_label(s, &label);
+                                if (r < 0)
+                                        return log_unit_error_errno(UNIT(s), r, "Failed to determine SELinux label: %m");
+
+                                know_label = true;
+                        }
+
+                        /* Apply the socket protocol */
+                        switch (p->address.type) {
+
+                        case SOCK_STREAM:
+                        case SOCK_SEQPACKET:
+                                if (s->socket_protocol == IPPROTO_SCTP)
+                                        p->address.protocol = s->socket_protocol;
+                                break;
+
+                        case SOCK_DGRAM:
+                                if (s->socket_protocol == IPPROTO_UDPLITE)
+                                        p->address.protocol = s->socket_protocol;
+                                break;
+                        }
+
+                        /* No logging here: the callee logs its own failures */
+                        p->fd = socket_address_listen_in_cgroup(s, &p->address, label);
+                        if (p->fd < 0)
+                                return p->fd;
+
+                        socket_apply_socket_options(s, p, p->fd);
+                        socket_symlink(s);
+                        break;
+
+                case SOCKET_SPECIAL:
+
+                        p->fd = special_address_create(p->path, s->writable);
+                        if (p->fd < 0)
+                                return log_unit_error_errno(UNIT(s), p->fd, "Failed to open special file %s: %m", p->path);
+                        break;
+
+                case SOCKET_FIFO:
+
+                        p->fd = fifo_address_create(
+                                        p->path,
+                                        s->directory_mode,
+                                        s->socket_mode);
+                        if (p->fd < 0)
+                                return log_unit_error_errno(UNIT(s), p->fd, "Failed to open FIFO %s: %m", p->path);
+
+                        socket_apply_fifo_options(s, p->fd);
+                        socket_symlink(s);
+                        break;
+
+                case SOCKET_MQUEUE:
+
+                        p->fd = mq_address_create(
+                                        p->path,
+                                        s->socket_mode,
+                                        s->mq_maxmsg,
+                                        s->mq_msgsize);
+                        if (p->fd < 0)
+                                return log_unit_error_errno(UNIT(s), p->fd, "Failed to open message queue %s: %m", p->path);
+                        break;
+
+                case SOCKET_USB_FUNCTION: {
+                        _cleanup_free_ char *ep = NULL;
+
+                        /* "ep0" becomes the port's main fd and receives the descriptors; the remaining
+                         * entries of the directory are opened as auxiliary fds afterwards. */
+                        ep = path_make_absolute("ep0", p->path);
+                        if (!ep)
+                                return -ENOMEM;
+
+                        p->fd = usbffs_address_create(ep);
+                        if (p->fd < 0)
+                                return p->fd;
+
+                        r = usbffs_write_descs(p->fd, SERVICE(UNIT_DEREF(s->service)));
+                        if (r < 0)
+                                return r;
+
+                        r = usbffs_dispatch_eps(p);
+                        if (r < 0)
+                                return r;
+
+                        break;
+                }
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        /* Success: disarm the cleanup handler so that the fds stay open */
+        s = NULL;
+        return 0;
+}
+
+/* Turns off the I/O event source of every port that has both an open fd and an event source. Failures
+ * are logged at debug level only. */
+static void socket_unwatch_fds(Socket *s) {
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+                int q;
+
+                if (p->fd < 0 || !p->event_source)
+                        continue;
+
+                q = sd_event_source_set_enabled(p->event_source, SD_EVENT_OFF);
+                if (q < 0)
+                        log_unit_debug_errno(UNIT(s), q, "Failed to disable event source: %m");
+        }
+}
+
+/* Makes sure every port with an open fd has an enabled EPOLLIN event source dispatching to
+ * socket_dispatch_io(), and applies the configured poll rate limit to it. If enabling or creating an
+ * event source fails, all fds are unwatched again and the error is returned; a failure to set the
+ * rate limit is only logged. Returns 0 on success. */
+static int socket_watch_fds(Socket *s) {
+        int r;
+
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+                if (p->fd < 0)
+                        continue;
+
+                if (!p->event_source) {
+                        /* No event source yet, allocate a fresh one */
+                        r = sd_event_add_io(UNIT(s)->manager->event, &p->event_source, p->fd, EPOLLIN, socket_dispatch_io, p);
+                        if (r >= 0)
+                                (void) sd_event_source_set_description(p->event_source, "socket-port-io");
+                } else
+                        r = sd_event_source_set_enabled(p->event_source, SD_EVENT_ON);
+
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to watch listening fds: %m");
+                        socket_unwatch_fds(s);
+                        return r;
+                }
+
+                r = sd_event_source_set_ratelimit(p->event_source, s->poll_limit_interval, s->poll_limit_burst);
+                if (r < 0)
+                        log_unit_debug_errno(UNIT(s), r, "Failed to set poll limit on I/O event source, ignoring: %m");
+        }
+
+        return 0;
+}
+
+/* Result values of socket_check_open() */
+enum {
+        SOCKET_OPEN_NONE, /* no port has an open fd */
+        SOCKET_OPEN_SOME, /* at least one port has an open fd, and at least one has not */
+        SOCKET_OPEN_ALL,  /* at least one port has an open fd, and none is without */
+};
+
+/* Reports whether none, some or all of the socket's ports currently have an open fd, as one of the
+ * SOCKET_OPEN_NONE, SOCKET_OPEN_SOME and SOCKET_OPEN_ALL values. A socket without any ports yields
+ * SOCKET_OPEN_NONE. */
+static int socket_check_open(Socket *s) {
+        bool any_open = false, any_closed = false;
+
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+                if (p->fd >= 0)
+                        any_open = true;
+                else
+                        any_closed = true;
+
+                /* Once we saw one of each kind the answer cannot change anymore */
+                if (any_open && any_closed)
+                        return SOCKET_OPEN_SOME;
+        }
+
+        return any_open ? SOCKET_OPEN_ALL : SOCKET_OPEN_NONE;
+}
+
+/* Switches the socket to 'state', releases the resources that are not kept in the new state, and
+ * reports the transition via unit_notify(). */
+static void socket_set_state(Socket *s, SocketState state) {
+        SocketState old_state;
+        assert(s);
+
+        /* This is called before s->state is updated below */
+        if (s->state != state)
+                bus_unit_send_pending_change_signal(UNIT(s), false);
+
+        old_state = s->state;
+        s->state = state;
+
+        /* In every state not listed here: drop the timer, the control PID watch and the control
+         * command. */
+        if (!IN_SET(state,
+                    SOCKET_START_PRE,
+                    SOCKET_START_CHOWN,
+                    SOCKET_START_POST,
+                    SOCKET_STOP_PRE,
+                    SOCKET_STOP_PRE_SIGTERM,
+                    SOCKET_STOP_PRE_SIGKILL,
+                    SOCKET_STOP_POST,
+                    SOCKET_FINAL_SIGTERM,
+                    SOCKET_FINAL_SIGKILL,
+                    SOCKET_CLEANING)) {
+
+                s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+                socket_unwatch_control_pid(s);
+                s->control_command = NULL;
+                s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+        }
+
+        /* Only SOCKET_LISTENING keeps the fd event sources enabled */
+        if (state != SOCKET_LISTENING)
+                socket_unwatch_fds(s);
+
+        /* In every state not listed here the fds are closed */
+        if (!IN_SET(state,
+                    SOCKET_START_CHOWN,
+                    SOCKET_START_POST,
+                    SOCKET_LISTENING,
+                    SOCKET_RUNNING,
+                    SOCKET_STOP_PRE,
+                    SOCKET_STOP_PRE_SIGTERM,
+                    SOCKET_STOP_PRE_SIGKILL,
+                    SOCKET_CLEANING))
+                socket_close_fds(s);
+
+        if (state != old_state)
+                log_unit_debug(UNIT(s), "Changed %s -> %s", socket_state_to_string(old_state), socket_state_to_string(state));
+
+        /* Note that this is invoked even if the state did not actually change */
+        unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+/* Moves the socket from SOCKET_DEAD into its deserialized state, re-establishing the control PID watch,
+ * the timeout timer and the fd event sources as that state requires. Returns 0 on success (including
+ * when there is nothing to do), negative errno-style on failure. */
+static int socket_coldplug(Unit *u) {
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(s);
+        assert(s->state == SOCKET_DEAD);
+
+        /* Deserialized state is SOCKET_DEAD too, nothing to restore */
+        if (s->deserialized_state == s->state)
+                return 0;
+
+        /* If a control process is recorded and the deserialized state is one of those listed, watch it
+         * again and re-arm the timer relative to the last state change. */
+        if (pidref_is_set(&s->control_pid) &&
+            pidref_is_unwaited(&s->control_pid) > 0 &&
+            IN_SET(s->deserialized_state,
+                   SOCKET_START_PRE,
+                   SOCKET_START_CHOWN,
+                   SOCKET_START_POST,
+                   SOCKET_STOP_PRE,
+                   SOCKET_STOP_PRE_SIGTERM,
+                   SOCKET_STOP_PRE_SIGKILL,
+                   SOCKET_STOP_POST,
+                   SOCKET_FINAL_SIGTERM,
+                   SOCKET_FINAL_SIGKILL,
+                   SOCKET_CLEANING)) {
+
+                r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
+                if (r < 0)
+                        return r;
+
+                r = socket_arm_timer(s, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec));
+                if (r < 0)
+                        return r;
+        }
+
+        if (IN_SET(s->deserialized_state,
+                   SOCKET_START_CHOWN,
+                   SOCKET_START_POST,
+                   SOCKET_LISTENING,
+                   SOCKET_RUNNING)) {
+
+                /* Originally, we used to simply reopen all sockets here that we didn't have file descriptors
+                 * for. However, this is problematic, as we won't traverse through the SOCKET_START_CHOWN state for
+                 * them, and thus the UID/GID wouldn't be right. Hence, instead simply check if we have all fds open,
+                 * and if there's a mismatch, warn loudly. */
+
+                r = socket_check_open(s);
+                if (r == SOCKET_OPEN_NONE)
+                        log_unit_warning(UNIT(s),
+                                         "Socket unit configuration has changed while unit has been running, "
+                                         "no open socket file descriptor left. "
+                                         "The socket unit is not functional until restarted.");
+                else if (r == SOCKET_OPEN_SOME)
+                        log_unit_warning(UNIT(s),
+                                         "Socket unit configuration has changed while unit has been running, "
+                                         "and some socket file descriptors have not been opened yet. "
+                                         "The socket unit is not fully functional until restarted.");
+        }
+
+        /* Only the listening state has the fds watched */
+        if (s->deserialized_state == SOCKET_LISTENING) {
+                r = socket_watch_fds(s);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Best effort, a failure here is deliberately ignored */
+        if (!IN_SET(s->deserialized_state, SOCKET_DEAD, SOCKET_FAILED, SOCKET_CLEANING))
+                (void) unit_setup_exec_runtime(u);
+
+        socket_set_state(s, s->deserialized_state);
+        return 0;
+}
+
+/* Spawns command 'c' using the socket's execution and cgroup contexts, arms the timeout timer and
+ * starts watching the new process. On success the PID reference is stored in *ret_pid and 0 is
+ * returned; on failure a negative errno-style value is returned and *ret_pid is left untouched. */
+static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) {
+
+        _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
+                        EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        pid_t pid;
+        int r;
+
+        assert(s);
+        assert(c);
+        assert(ret_pid);
+
+        r = unit_prepare_exec(UNIT(s));
+        if (r < 0)
+                return r;
+
+        /* The timer is armed before the process is spawned */
+        r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec);
+        if (r < 0)
+                return r;
+
+        r = unit_set_exec_params(UNIT(s), &exec_params);
+        if (r < 0)
+                return r;
+
+        r = exec_spawn(UNIT(s),
+                       c,
+                       &s->exec_context,
+                       &exec_params,
+                       s->exec_runtime,
+                       &s->cgroup_context,
+                       &pid);
+        if (r < 0)
+                return r;
+
+        /* Turn the plain PID into a PidRef before watching it */
+        r = pidref_set_pid(&pidref, pid);
+        if (r < 0)
+                return r;
+
+        r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+        if (r < 0)
+                return r;
+
+        /* Hand the reference over to the caller, so that the cleanup handler does not release it */
+        *ret_pid = TAKE_PIDREF(pidref);
+        return 0;
+}
+
+/* Forks off a helper process that resolves s->user and s->group and chown()s the file system paths of
+ * the socket's SOCKET_SOCKET and SOCKET_FIFO ports accordingly. The helper is watched, and its PID
+ * reference is returned in *ret_pid. Returns 0 on success, negative errno-style on failure. */
+static int socket_chown(Socket *s, PidRef *ret_pid) {
+        _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+        int r;
+
+        assert(s);
+
+        r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec);
+        if (r < 0)
+                return r;
+
+        /* We have to resolve the user names out-of-process, hence
+         * let's fork here. It's messy, but well, what can we do? */
+
+        r = unit_fork_helper_process(UNIT(s), "(sd-chown)", &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* These stay at their invalid values if s->user or s->group is empty; presumably that is
+                 * (uid_t) -1 and (gid_t) -1, which chown() takes as "leave unchanged". */
+                uid_t uid = UID_INVALID;
+                gid_t gid = GID_INVALID;
+
+                /* Child */
+
+                if (!isempty(s->user)) {
+                        const char *user = s->user;
+
+                        r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
+                        if (r < 0) {
+                                log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user);
+                                _exit(EXIT_USER);
+                        }
+                }
+
+                /* An explicitly configured group overrides the GID acquired along with the user above */
+                if (!isempty(s->group)) {
+                        const char *group = s->group;
+
+                        r = get_group_creds(&group, &gid, 0);
+                        if (r < 0) {
+                                log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group);
+                                _exit(EXIT_GROUP);
+                        }
+                }
+
+                LIST_FOREACH(port, p, s->ports) {
+                        const char *path = NULL;
+
+                        if (p->type == SOCKET_SOCKET)
+                                path = socket_address_get_path(&p->address);
+                        else if (p->type == SOCKET_FIFO)
+                                path = p->path;
+
+                        /* Skip ports of other types, and sockets for which no path is returned */
+                        if (!path)
+                                continue;
+
+                        if (chown(path, uid, gid) < 0) {
+                                log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m");
+                                _exit(EXIT_CHOWN);
+                        }
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        r = unit_watch_pidref(UNIT(s), &pid, /* exclusive= */ true);
+        if (r < 0)
+                return r;
+
+        *ret_pid = TAKE_PIDREF(pid);
+        return 0;
+}
+
+/* Final step of the stop sequence: records 'f' as result unless an earlier failure is already stored,
+ * logs the outcome, enters SOCKET_DEAD or SOCKET_FAILED, and releases the runtime resources of the
+ * unit. */
+static void socket_enter_dead(Socket *s, SocketResult f) {
+        assert(s);
+
+        /* The first failure wins */
+        if (s->result == SOCKET_SUCCESS)
+                s->result = f;
+
+        if (s->result != SOCKET_SUCCESS)
+                unit_log_failure(UNIT(s), socket_result_to_string(s->result));
+        else
+                unit_log_success(UNIT(s));
+
+        unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+
+        socket_set_state(s, s->result == SOCKET_SUCCESS ? SOCKET_DEAD : SOCKET_FAILED);
+
+        s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
+        unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+        unit_unref_uid_gid(UNIT(s), true);
+}
+
+static void socket_enter_signal(Socket *s, SocketState state, SocketResult f);
+
+/* Runs the SOCKET_EXEC_STOP_POST command if one is configured and enters SOCKET_STOP_POST. Without such
+ * a command, or if spawning it fails, continues with the SOCKET_FINAL_SIGTERM step right away. */
+static void socket_enter_stop_post(Socket *s, SocketResult f) {
+        int r;
+
+        assert(s);
+
+        /* The first failure wins */
+        if (s->result == SOCKET_SUCCESS)
+                s->result = f;
+
+        socket_unwatch_control_pid(s);
+        s->control_command_id = SOCKET_EXEC_STOP_POST;
+        s->control_command = s->exec_command[SOCKET_EXEC_STOP_POST];
+
+        if (!s->control_command) {
+                socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_SUCCESS);
+                return;
+        }
+
+        pidref_done(&s->control_pid);
+
+        r = socket_spawn(s, s->control_command, &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m");
+                socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES);
+                return;
+        }
+
+        socket_set_state(s, SOCKET_STOP_POST);
+}
+
+/* Maps a signalling state onto the kill operation to pass to unit_kill_context():
+ *
+ *   SOCKET_STOP_PRE_SIGTERM -> KILL_RESTART if a restart job is queued for the unit, else KILL_TERMINATE
+ *   SOCKET_FINAL_SIGTERM    -> KILL_TERMINATE
+ *   any other state         -> KILL_KILL
+ */
+static int state_to_kill_operation(Socket *s, SocketState state) {
+        assert(s);
+
+        switch (state) {
+
+        case SOCKET_STOP_PRE_SIGTERM:
+                /* This used to fall through to KILL_KILL when no restart job is queued. That made the
+                 * "SIGTERM" step use the same operation as the SOCKET_STOP_PRE_SIGKILL step following it,
+                 * so that the process never got the chance to terminate gracefully. */
+                return unit_has_job_type(UNIT(s), JOB_RESTART) ? KILL_RESTART : KILL_TERMINATE;
+
+        case SOCKET_FINAL_SIGTERM:
+                return KILL_TERMINATE;
+
+        default:
+                return KILL_KILL;
+        }
+}
+
+/* Applies the kill operation that belongs to 'state' to the unit's processes. If unit_kill_context()
+ * returns > 0 (presumably: there is something left to wait for) the timer is armed and 'state' is
+ * entered. If it returns 0 the next step of the sequence is taken immediately:
+ *
+ *   SOCKET_STOP_PRE_SIGTERM -> SOCKET_STOP_PRE_SIGKILL -> stop-post
+ *   SOCKET_FINAL_SIGTERM    -> SOCKET_FINAL_SIGKILL    -> dead
+ */
+static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
+        int r;
+
+        assert(s);
+
+        /* The first failure wins */
+        if (s->result == SOCKET_SUCCESS)
+                s->result = f;
+
+        r = unit_kill_context(
+                        UNIT(s),
+                        &s->kill_context,
+                        state_to_kill_operation(s, state),
+                        /* main_pid= */ NULL,
+                        &s->control_pid,
+                        /* main_pid_alien= */ false);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+                goto fail;
+        }
+
+        if (r > 0) {
+                r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+                        goto fail;
+                }
+
+                socket_set_state(s, state);
+        } else if (state == SOCKET_STOP_PRE_SIGTERM)
+                socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_SUCCESS);
+        else if (state == SOCKET_STOP_PRE_SIGKILL)
+                socket_enter_stop_post(s, SOCKET_SUCCESS);
+        else if (state == SOCKET_FINAL_SIGTERM)
+                socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+        else
+                socket_enter_dead(s, SOCKET_SUCCESS);
+
+        return;
+
+fail:
+        /* On failure skip the remaining signalling steps of the current phase */
+        if (IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_STOP_PRE_SIGKILL))
+                socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES);
+        else
+                socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+}
+
+/* Runs the SOCKET_EXEC_STOP_PRE command if one is configured and enters SOCKET_STOP_PRE. Without such a
+ * command, or if spawning it fails, continues with the stop-post step right away. */
+static void socket_enter_stop_pre(Socket *s, SocketResult f) {
+        int r;
+
+        assert(s);
+
+        /* The first failure wins */
+        if (s->result == SOCKET_SUCCESS)
+                s->result = f;
+
+        socket_unwatch_control_pid(s);
+        s->control_command_id = SOCKET_EXEC_STOP_PRE;
+        s->control_command = s->exec_command[SOCKET_EXEC_STOP_PRE];
+
+        if (!s->control_command) {
+                socket_enter_stop_post(s, SOCKET_SUCCESS);
+                return;
+        }
+
+        pidref_done(&s->control_pid);
+
+        r = socket_spawn(s, s->control_command, &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-pre' task: %m");
+                socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES);
+                return;
+        }
+
+        socket_set_state(s, SOCKET_STOP_PRE);
+}
+
+/* Starts watching the socket's fds and enters SOCKET_LISTENING. For non-accepting sockets with
+ * flush_pending set, the ports are flushed first. If watching fails, the stop sequence is started. */
+static void socket_enter_listening(Socket *s) {
+        int r;
+
+        assert(s);
+
+        if (s->flush_pending && !s->accept) {
+                log_unit_debug(UNIT(s), "Flushing socket before listening.");
+                flush_ports(s);
+        }
+
+        r = socket_watch_fds(s);
+        if (r >= 0) {
+                socket_set_state(s, SOCKET_LISTENING);
+                return;
+        }
+
+        log_unit_warning_errno(UNIT(s), r, "Failed to watch sockets: %m");
+        socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_enter_start_post(Socket *s) {
+        /* Runs the 'start-post' command if one is set, otherwise goes straight to listening. */
+        int err;
+
+        assert(s);
+
+        socket_unwatch_control_pid(s);
+        s->control_command_id = SOCKET_EXEC_START_POST;
+        s->control_command = s->exec_command[SOCKET_EXEC_START_POST];
+        if (!s->control_command) {
+                socket_enter_listening(s);
+                return;
+        }
+
+        pidref_done(&s->control_pid);
+        err = socket_spawn(s, s->control_command, &s->control_pid);
+        if (err < 0) {
+                log_unit_warning_errno(UNIT(s), err, "Failed to spawn 'start-post' task: %m");
+                socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+        } else
+                socket_set_state(s, SOCKET_START_POST);
+}
+
+static void socket_enter_start_chown(Socket *s) {
+        /* Opens the socket fds, then spawns the 'start-chown' task if a user or group is configured; with
+         * neither set it continues directly with the start-post phase. Any failure stops the socket. */
+        int err;
+
+        assert(s);
+
+        err = socket_open_fds(s);
+        if (err < 0) {
+                log_unit_warning_errno(UNIT(s), err, "Failed to listen on sockets: %m");
+                socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+                return;
+        }
+
+        if (isempty(s->user) && isempty(s->group)) {
+                socket_enter_start_post(s);
+                return;
+        }
+
+        socket_unwatch_control_pid(s);
+        s->control_command_id = SOCKET_EXEC_START_CHOWN;
+        s->control_command = NULL;
+
+        err = socket_chown(s, &s->control_pid);
+        if (err < 0) {
+                log_unit_warning_errno(UNIT(s), err, "Failed to spawn 'start-chown' task: %m");
+                socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+                return;
+        }
+
+        socket_set_state(s, SOCKET_START_CHOWN);
+}
+
+static void socket_enter_start_pre(Socket *s) {
+        /* First step of starting: runs the 'start-pre' command if one is set, otherwise continues with
+         * socket_enter_start_chown(). If spawning fails, socket_enter_dead() is called right away. */
+        int err;
+
+        assert(s);
+
+        socket_unwatch_control_pid(s);
+        unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start);
+
+        s->control_command_id = SOCKET_EXEC_START_PRE;
+        s->control_command = s->exec_command[SOCKET_EXEC_START_PRE];
+        if (!s->control_command) {
+                socket_enter_start_chown(s);
+                return;
+        }
+
+        pidref_done(&s->control_pid);
+        err = socket_spawn(s, s->control_command, &s->control_pid);
+        if (err < 0) {
+                log_unit_warning_errno(UNIT(s), err, "Failed to spawn 'start-pre' task: %m");
+                socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+        } else
+                socket_set_state(s, SOCKET_START_PRE);
+}
+
+static void flush_ports(Socket *s) {
+        /* Drains whatever is queued on each open port fd, both pending connections and pending data,
+         * so that the socket is not busy anymore. Ports without an open fd are skipped and the
+         * results of the two flush helpers are ignored. */
+
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+                if (p->fd >= 0) {
+                        (void) flush_accept(p->fd);
+                        (void) flush_fd(p->fd);
+                }
+        }
+}
+
+static void socket_enter_running(Socket *s, int cfd_in) {
+        /* Reacts to traffic on the socket: with cfd_in < 0 (Accept=no) a start job for the service is queued
+         * if needed, with cfd_in >= 0 (Accept=yes) the connection is passed to a service loaded for it. Note
+         * that this call takes possession of the connection fd passed: it is assigned somewhere or closed. */
+        _cleanup_close_ int cfd = cfd_in;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        assert(s);
+
+        /* We don't take connections anymore if we are supposed to shut down anyway */
+        if (unit_stop_pending(UNIT(s))) {
+
+                log_unit_debug(UNIT(s), "Suppressing connection request since unit stop is scheduled.");
+
+                if (cfd >= 0)
+                        goto refuse;
+                /* Accept=no: there is no single connection to refuse, drain the ports instead. */
+                flush_ports(s);
+                return;
+        }
+
+        if (!ratelimit_below(&s->trigger_limit)) {
+                log_unit_warning(UNIT(s), "Trigger limit hit, refusing further activation.");
+                socket_enter_stop_pre(s, SOCKET_FAILURE_TRIGGER_LIMIT_HIT);
+                goto refuse; /* the socket is being stopped, and the request is counted in n_refused too */
+        }
+
+        if (cfd < 0) { /* Accept=no case */
+                bool pending = false;
+                Unit *other;
+
+                /* If there's already a start pending don't bother to do anything */
+                UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_TRIGGERS)
+                        if (unit_active_or_pending(other)) {
+                                pending = true;
+                                break;
+                        }
+
+                if (!pending) {
+                        if (!UNIT_ISSET(s->service)) {
+                                r = log_unit_warning_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT),
+                                                           "Service to activate vanished, refusing activation.");
+                                goto fail;
+                        }
+
+                        r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT_DEREF(s->service), JOB_REPLACE, NULL, &error, NULL);
+                        if (r < 0)
+                                goto queue_error;
+                }
+
+                socket_set_state(s, SOCKET_RUNNING);
+        } else { /* Accept=yes case */
+                _cleanup_(socket_peer_unrefp) SocketPeer *p = NULL;
+                Unit *service;
+
+                if (s->n_connections >= s->max_connections) {
+                        log_unit_warning(UNIT(s), "Too many incoming connections (%u), dropping connection.",
+                                         s->n_connections);
+                        goto refuse;
+                }
+
+                if (s->max_connections_per_source > 0) {
+                        r = socket_acquire_peer(s, cfd, &p);
+                        if (r < 0) {
+                                if (ERRNO_IS_DISCONNECT(r))
+                                        return; /* disconnect-class error: drop quietly, the socket keeps going */
+                                /* We didn't have enough resources to acquire peer information, let's fail. */
+                                goto fail;
+                        }
+                        if (r > 0 && p->n_ref > s->max_connections_per_source) {
+                                _cleanup_free_ char *t = NULL;
+
+                                (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t);
+
+                                log_unit_warning(UNIT(s),
+                                                 "Too many incoming connections (%u) from source %s, dropping connection.",
+                                                 p->n_ref, strnull(t));
+                                goto refuse;
+                        }
+                }
+
+                r = socket_load_service_unit(s, cfd, &service);
+                if (r < 0) {
+                        if (ERRNO_IS_DISCONNECT(r))
+                                return;
+
+                        log_unit_warning_errno(UNIT(s), r, "Failed to load connection service unit: %m");
+                        goto fail;
+                }
+
+                r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service,
+                                              false, UNIT_DEPENDENCY_IMPLICIT);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to add Before=/Triggers= dependencies on connection unit: %m");
+                        goto fail;
+                }
+
+                s->n_accepted++;
+
+                r = service_set_socket_fd(SERVICE(service), cfd, s, p, s->selinux_context_from_net);
+                if (r < 0) {
+                        if (ERRNO_IS_DISCONNECT(r))
+                                return;
+
+                        log_unit_warning_errno(UNIT(s), r, "Failed to set socket on service: %m");
+                        goto fail;
+                }
+
+                TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */
+                s->n_connections++;
+
+                r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL);
+                if (r < 0) {
+                        /* We failed to activate the new service, but it still exists. Let's make sure the
+                         * service closes and forgets the connection fd again, immediately. */
+                        service_release_socket_fd(SERVICE(service));
+                        goto queue_error;
+                }
+
+                /* Notify clients about changed counters */
+                unit_add_to_dbus_queue(UNIT(s));
+        }
+
+        return;
+
+refuse: /* The request is dropped; apart from what the caller did before jumping, only the counter changes. */
+        s->n_refused++;
+        return;
+
+queue_error: /* Queuing the start job failed: log it, with a hint unless r is a resource-class error. */
+        if (ERRNO_IS_RESOURCE(r))
+                log_unit_warning(UNIT(s), "Failed to queue service startup job: %s",
+                                 bus_error_message(&error, r));
+        else
+                log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s",
+                                 cfd >= 0 ? "template" : "non-template",
+                                 bus_error_message(&error, r));
+
+fail: /* Also reached by falling through from queue_error above. */
+        socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_run_next(Socket *s) {
+        /* Advances to the next command of the current control command list and spawns it. If spawning
+         * fails, the follow-up transition is picked according to the state the socket is in. */
+        int err;
+
+        assert(s);
+        assert(s->control_command);
+        assert(s->control_command->command_next);
+
+        socket_unwatch_control_pid(s);
+        s->control_command = s->control_command->command_next;
+        pidref_done(&s->control_pid);
+
+        err = socket_spawn(s, s->control_command, &s->control_pid);
+        if (err >= 0)
+                return;
+
+        log_unit_warning_errno(UNIT(s), err, "Failed to spawn next task: %m");
+        if (s->state == SOCKET_STOP_POST)
+                socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+        else if (s->state == SOCKET_START_POST)
+                socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+        else
+                socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES);
+}
+
+static int socket_start(Unit *u) {
+        /* Start handler of the socket unit. Returns -EAGAIN while a stop or clean operation is still in
+         * progress, 0 if a start is already under way, the ENOENT/EBUSY error reported through
+         * log_unit_error_errno() if the service is not loaded or already active, and 1 once started. */
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(s);
+
+        switch (s->state) {
+        case SOCKET_STOP_PRE:
+        case SOCKET_STOP_PRE_SIGKILL:
+        case SOCKET_STOP_PRE_SIGTERM:
+        case SOCKET_STOP_POST:
+        case SOCKET_FINAL_SIGTERM:
+        case SOCKET_FINAL_SIGKILL:
+        case SOCKET_CLEANING:
+                /* We cannot fulfill this request right now, try again later please! */
+                return -EAGAIN;
+
+        case SOCKET_START_PRE:
+        case SOCKET_START_CHOWN:
+        case SOCKET_START_POST:
+                /* Already on it! */
+                return 0;
+
+        default:
+                break;
+        }
+
+        /* Cannot run this without the service being around, and not while it is active either */
+        if (UNIT_ISSET(s->service)) {
+                Service *svc = SERVICE(UNIT_DEREF(s->service));
+
+                if (UNIT(svc)->load_state != UNIT_LOADED)
+                        return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+                                                    "Socket service %s not loaded, refusing.", UNIT(svc)->id);
+
+                if (!IN_SET(svc->state,
+                            SERVICE_DEAD, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED, SERVICE_FAILED_BEFORE_AUTO_RESTART,
+                            SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED))
+                        return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY),
+                                                    "Socket service %s already active, refusing.", UNIT(svc)->id);
+        }
+
+        assert(IN_SET(s->state, SOCKET_DEAD, SOCKET_FAILED));
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        s->result = SOCKET_SUCCESS;
+        exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
+        u->reset_accounting = true;
+
+        socket_enter_start_pre(s);
+        return 1;
+}
+
+static int socket_stop(Unit *u) {
+        /* Stop handler of the socket unit; what happens and what is returned depends on the current state. */
+        Socket *s = SOCKET(u);
+
+        assert(s);
+
+        switch (s->state) {
+        case SOCKET_STOP_PRE:
+        case SOCKET_STOP_PRE_SIGTERM:
+        case SOCKET_STOP_PRE_SIGKILL:
+        case SOCKET_STOP_POST:
+        case SOCKET_FINAL_SIGTERM:
+        case SOCKET_FINAL_SIGKILL:
+                /* Already on it */
+                return 0;
+
+        case SOCKET_START_PRE:
+        case SOCKET_START_CHOWN:
+        case SOCKET_START_POST:
+                /* If there's already something running we go directly into kill mode. */
+                socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_SUCCESS);
+                return -EAGAIN;
+
+        case SOCKET_CLEANING:
+                /* If we are currently cleaning, then abort it, brutally. */
+                socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+                return 0;
+
+        default:
+                break;
+        }
+
+        assert(IN_SET(s->state, SOCKET_LISTENING, SOCKET_RUNNING));
+        socket_enter_stop_pre(s, SOCKET_SUCCESS);
+        return 1;
+}
+
+static int socket_serialize(Unit *u, FILE *f, FDSet *fds) {
+        /* Writes the runtime state of the socket to f: state, result, counters, control process and
+         * command, one item per open port fd (duplicated into fds), and the trigger rate limit. */
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(u);
+        assert(f);
+        assert(fds);
+
+        (void) serialize_item(f, "state", socket_state_to_string(s->state));
+        (void) serialize_item(f, "result", socket_result_to_string(s->result));
+        (void) serialize_item_format(f, "n-accepted", "%u", s->n_accepted);
+        (void) serialize_item_format(f, "n-refused", "%u", s->n_refused);
+        (void) serialize_pidref(f, fds, "control-pid", &s->control_pid);
+        if (s->control_command_id >= 0)
+                (void) serialize_item(f, "control-command", socket_exec_command_to_string(s->control_command_id));
+
+        LIST_FOREACH(port, p, s->ports) {
+                const char *key;
+                int copy;
+
+                if (p->fd < 0)
+                        continue;
+                copy = fdset_put_dup(fds, p->fd);
+                if (copy < 0)
+                        return log_unit_warning_errno(u, copy, "Failed to serialize socket fd: %m");
+
+                if (p->type == SOCKET_SOCKET) {
+                        _cleanup_free_ char *t = NULL;
+
+                        r = socket_address_print(&p->address, &t);
+                        if (r < 0)
+                                return log_unit_error_errno(u, r, "Failed to format socket address: %m");
+                        if (socket_address_family(&p->address) != AF_NETLINK)
+                                (void) serialize_item_format(f, "socket", "%i %i %s", copy, p->address.type, t);
+                        else
+                                (void) serialize_item_format(f, "netlink", "%i %s", copy, t);
+                        continue;
+                }
+
+                key = p->type == SOCKET_SPECIAL ? "special" :
+                      p->type == SOCKET_MQUEUE ? "mqueue" :
+                      p->type == SOCKET_USB_FUNCTION ? "ffs" : NULL;
+                if (!key) {
+                        assert(p->type == SOCKET_FIFO);
+                        key = "fifo";
+                }
+                (void) serialize_item_format(f, key, "%i %s", copy, p->path);
+        }
+
+        (void) serialize_ratelimit(f, "trigger-ratelimit", &s->trigger_limit);
+        return 0;
+}
+
+static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(u);
+        assert(key);
+        assert(value);
+        /* Restores one item written by socket_serialize(). Always returns 0, bad values are merely logged. */
+        if (streq(key, "state")) {
+                SocketState state;
+
+                state = socket_state_from_string(value);
+                if (state < 0)
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+                else
+                        s->deserialized_state = state;
+        } else if (streq(key, "result")) {
+                SocketResult f;
+
+                f = socket_result_from_string(value);
+                if (f < 0)
+                        log_unit_debug(u, "Failed to parse result value: %s", value);
+                else if (f != SOCKET_SUCCESS) /* a stored success never overwrites the current result */
+                        s->result = f;
+
+        } else if (streq(key, "n-accepted")) {
+                unsigned k;
+
+                if (safe_atou(value, &k) < 0)
+                        log_unit_debug(u, "Failed to parse n-accepted value: %s", value);
+                else
+                        s->n_accepted += k; /* added on top of the current counter, not replacing it */
+        } else if (streq(key, "n-refused")) {
+                unsigned k;
+
+                if (safe_atou(value, &k) < 0)
+                        log_unit_debug(u, "Failed to parse n-refused value: %s", value);
+                else
+                        s->n_refused += k; /* added on top of the current counter, not replacing it */
+        } else if (streq(key, "control-pid")) {
+                pidref_done(&s->control_pid);
+                (void) deserialize_pidref(fds, value, &s->control_pid);
+
+        } else if (streq(key, "control-command")) {
+                SocketExecCommand id;
+
+                id = socket_exec_command_from_string(value);
+                if (id < 0)
+                        log_unit_debug(u, "Failed to parse exec-command value: %s", value);
+                else {
+                        s->control_command_id = id;
+                        s->control_command = s->exec_command[id];
+                }
+        } else if (streq(key, "fifo")) {
+                _cleanup_free_ char *fdv = NULL;
+                bool found = false;
+                int fd;
+
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse fifo value: %s", value);
+                        return 0;
+                }
+
+                fd = parse_fd(fdv);
+                if (fd < 0 || !fdset_contains(fds, fd)) {
+                        log_unit_debug(u, "Invalid fifo value: %s", fdv);
+                        return 0;
+                }
+                /* 'value' was handed to extract_first_word() by reference; what it holds now is used as the path. */
+                LIST_FOREACH(port, p, s->ports)
+                        if (p->fd < 0 &&
+                            p->type == SOCKET_FIFO &&
+                            path_equal_or_inode_same(p->path, value, 0)) {
+                                p->fd = fdset_remove(fds, fd);
+                                found = true;
+                                break;
+                        }
+                if (!found)
+                        log_unit_debug(u, "No matching fifo socket found: %s", value);
+
+        } else if (streq(key, "special")) {
+                _cleanup_free_ char *fdv = NULL;
+                bool found = false;
+                int fd;
+
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse special value: %s", value);
+                        return 0;
+                }
+
+                fd = parse_fd(fdv);
+                if (fd < 0 || !fdset_contains(fds, fd)) {
+                        log_unit_debug(u, "Invalid special value: %s", fdv);
+                        return 0;
+                }
+
+                LIST_FOREACH(port, p, s->ports)
+                        if (p->fd < 0 &&
+                            p->type == SOCKET_SPECIAL &&
+                            path_equal_or_inode_same(p->path, value, 0)) {
+                                p->fd = fdset_remove(fds, fd);
+                                found = true;
+                                break;
+                        }
+                if (!found)
+                        log_unit_debug(u, "No matching special socket found: %s", value);
+
+        } else if (streq(key, "mqueue")) {
+                _cleanup_free_ char *fdv = NULL;
+                bool found = false;
+                int fd;
+
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse mqueue value: %s", value);
+                        return 0;
+                }
+
+                fd = parse_fd(fdv);
+                if (fd < 0 || !fdset_contains(fds, fd)) {
+                        log_unit_debug(u, "Invalid mqueue value: %s", fdv);
+                        return 0;
+                }
+                /* Unlike the other path based port types, message queues are matched by plain string equality. */
+                LIST_FOREACH(port, p, s->ports)
+                        if (p->fd < 0 &&
+                            p->type == SOCKET_MQUEUE &&
+                            streq(p->path, value)) {
+                                p->fd = fdset_remove(fds, fd);
+                                found = true;
+                                break;
+                        }
+                if (!found)
+                        log_unit_debug(u, "No matching mqueue socket found: %s", value);
+
+        } else if (streq(key, "socket")) {
+                _cleanup_free_ char *fdv = NULL, *typev = NULL;
+                bool found = false;
+                int fd, type;
+
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse socket fd from value: %s", value);
+                        return 0;
+                }
+
+                fd = parse_fd(fdv);
+                if (fd < 0 || !fdset_contains(fds, fd)) {
+                        log_unit_debug(u, "Invalid socket fd: %s", fdv);
+                        return 0;
+                }
+
+                r = extract_first_word(&value, &typev, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse socket type from value: %s", value);
+                        return 0;
+                }
+
+                if (safe_atoi(typev, &type) < 0 || type < 0) {
+                        log_unit_debug(u, "Invalid socket type: %s", typev);
+                        return 0;
+                }
+                /* After the fd and the type were extracted, 'value' is matched as address together with the type. */
+                LIST_FOREACH(port, p, s->ports)
+                        if (p->fd < 0 &&
+                            socket_address_is(&p->address, value, type)) {
+                                p->fd = fdset_remove(fds, fd);
+                                found = true;
+                                break;
+                        }
+                if (!found)
+                        log_unit_debug(u, "No matching %s socket found: %s",
+                                       socket_address_type_to_string(type), value);
+
+        } else if (streq(key, "netlink")) {
+                _cleanup_free_ char *fdv = NULL;
+                bool found = false;
+                int fd;
+
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse socket value: %s", value);
+                        return 0;
+                }
+
+                fd = parse_fd(fdv);
+                if (fd < 0 || !fdset_contains(fds, fd)) {
+                        log_unit_debug(u, "Invalid socket value: %s", fdv);
+                        return 0;
+                }
+
+                LIST_FOREACH(port, p, s->ports)
+                        if (p->fd < 0 &&
+                            socket_address_is_netlink(&p->address, value)) {
+                                p->fd = fdset_remove(fds, fd);
+                                found = true;
+                                break;
+                        }
+                if (!found)
+                        log_unit_debug(u, "No matching netlink socket found: %s", value);
+
+        } else if (streq(key, "ffs")) {
+                _cleanup_free_ char *fdv = NULL;
+                bool found = false;
+                int fd;
+
+                r = extract_first_word(&value, &fdv, NULL, 0);
+                if (r <= 0) {
+                        log_unit_debug(u, "Failed to parse ffs value: %s", value);
+                        return 0;
+                }
+
+                fd = parse_fd(fdv);
+                if (fd < 0 || !fdset_contains(fds, fd)) {
+                        log_unit_debug(u, "Invalid ffs value: %s", fdv);
+                        return 0;
+                }
+
+                LIST_FOREACH(port, p, s->ports)
+                        if (p->fd < 0 &&
+                            p->type == SOCKET_USB_FUNCTION &&
+                            path_equal_or_inode_same(p->path, value, 0)) {
+                                p->fd = fdset_remove(fds, fd);
+                                found = true;
+                                break;
+                        }
+                if (!found)
+                        log_unit_debug(u, "No matching ffs socket found: %s", value);
+
+        } else if (streq(key, "trigger-ratelimit"))
+                deserialize_ratelimit(&s->trigger_limit, key, value);
+
+        else
+                log_unit_debug(UNIT(s), "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+static void socket_distribute_fds(Unit *u, FDSet *fds) {
+        /* Assigns fds from the set to SOCKET_SOCKET ports that have no fd yet, matching them by socket
+         * address. Whenever a match is found the deserialized state is set to SOCKET_LISTENING. */
+        Socket *s = SOCKET(u);
+
+        assert(u);
+
+        LIST_FOREACH(port, p, s->ports) {
+                int candidate;
+
+                if (p->type != SOCKET_SOCKET || p->fd >= 0)
+                        continue;
+
+                FDSET_FOREACH(candidate, fds) {
+                        if (socket_address_matches_fd(&p->address, candidate)) {
+                                /* The port takes the value fdset_remove() returns for this fd. */
+                                p->fd = fdset_remove(fds, candidate);
+                                s->deserialized_state = SOCKET_LISTENING;
+                                break;
+                        }
+                }
+        }
+}
+
+static UnitActiveState socket_active_state(Unit *u) {
+        /* Looks up the generic unit active state for the socket's current state in the translation table. */
+        assert(u);
+        return state_translation_table[SOCKET(u)->state];
+}
+
+static const char *socket_sub_state_to_string(Unit *u) {
+        /* The sub state of a socket unit is the string form of its SocketState. */
+        assert(u);
+        return socket_state_to_string(SOCKET(u)->state);
+}
+
+int socket_port_to_address(const SocketPort *p, char **ret) {
+        /* Stores a newly allocated string describing the port in *ret and returns 0: the printed socket
+         * address for SOCKET_SOCKET ports, a copy of the path for the other port types. On failure a
+         * negative errno-style value is returned and *ret is not written. */
+        _cleanup_free_ char *text = NULL;
+        int r;
+
+        assert(p);
+        assert(ret);
+
+        switch (p->type) {
+        case SOCKET_SOCKET:
+                r = socket_address_print(&p->address, &text);
+                if (r < 0)
+                        return r;
+                break;
+
+        case SOCKET_SPECIAL:
+        case SOCKET_MQUEUE:
+        case SOCKET_FIFO:
+        case SOCKET_USB_FUNCTION:
+                text = strdup(p->path);
+                if (!text)
+                        return -ENOMEM;
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        *ret = TAKE_PTR(text);
+        return 0;
+}
+
+const char* socket_port_type_to_string(SocketPort *p) {
+        /* Returns the name of the port's type as a constant string ("Stream", "Datagram",
+         * "SequentialPacket", "Netlink", "Special", "MessageQueue", "FIFO" or "USBFunction"), or NULL if
+         * the port type, or for SOCKET_SOCKET ports the socket type, is none of the handled ones. */
+
+        assert(p);
+
+        if (p->type == SOCKET_SPECIAL)
+                return "Special";
+
+        if (p->type == SOCKET_MQUEUE)
+                return "MessageQueue";
+
+        if (p->type == SOCKET_FIFO)
+                return "FIFO";
+
+        if (p->type == SOCKET_USB_FUNCTION)
+                return "USBFunction";
+
+        if (p->type != SOCKET_SOCKET)
+                return NULL;
+
+        /* Actual sockets are told apart by their socket type. */
+        switch (p->address.type) {
+
+        case SOCK_STREAM:
+                return "Stream";
+
+        case SOCK_DGRAM:
+                return "Datagram";
+
+        case SOCK_SEQPACKET:
+                return "SequentialPacket";
+
+        case SOCK_RAW:
+                /* Raw sockets only get a name when they are of the netlink family. */
+                if (socket_address_family(&p->address) == AF_NETLINK)
+                        return "Netlink";
+                return NULL;
+
+        default:
+                return NULL;
+        }
+}
+
+SocketType socket_port_type_from_string(const char *s) {
+        /* Maps a port type name to the matching SocketType; "Stream", "Datagram", "SequentialPacket" and
+         * "Netlink" all map to SOCKET_SOCKET. Any other unknown name yields _SOCKET_TYPE_INVALID. */
+        assert(s);
+
+        if (streq(s, "FIFO"))
+                return SOCKET_FIFO;
+        if (streq(s, "Special"))
+                return SOCKET_SPECIAL;
+        if (streq(s, "MessageQueue"))
+                return SOCKET_MQUEUE;
+        if (streq(s, "USBFunction"))
+                return SOCKET_USB_FUNCTION;
+
+        return STR_IN_SET(s, "Stream", "Datagram", "SequentialPacket", "Netlink") ? SOCKET_SOCKET : _SOCKET_TYPE_INVALID;
+}
+
+static bool socket_may_gc(Unit *u) {
+        /* Garbage collection is only permitted while the connection counter of the socket is zero. */
+        assert(u);
+        if (SOCKET(u)->n_connections > 0)
+                return false;
+        return true;
+}
+
+static int socket_accept_do(Socket *s, int fd) {
+        /* Accepts one connection on fd and returns the new connection fd, opened non-blocking and
+         * close-on-exec, or a negative errno. Transient network errors are all reported as -EAGAIN. */
+        int conn;
+
+        assert(s);
+        assert(fd >= 0);
+
+        conn = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+        if (conn >= 0)
+                return conn;
+        return ERRNO_IS_ACCEPT_AGAIN(errno) ? -EAGAIN : -errno;
+}
+
+static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) {
+        _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+        _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+        int cfd, r;
+
+        assert(s);
+        assert(p);
+        assert(fd >= 0);
+        /* Similar to socket_address_listen_in_cgroup(), but for accept() rather than socket(): make sure that any
+         * connection socket is also properly associated with the cgroup. Returns the connection fd on success,
+         * -EAGAIN for a spurious accept(), and some other negative error value on failure. */
+
+        if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6))
+                goto shortcut; /* neither IPv4 nor IPv6: accept directly in this process */
+
+        r = bpf_firewall_supported();
+        if (r < 0)
+                return r;
+        if (r == BPF_FIREWALL_UNSUPPORTED)
+                goto shortcut; /* no BPF firewall support: accept directly in this process */
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)
+                return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m");
+
+        r = unit_fork_helper_process(UNIT(s), "(sd-accept)", &pid);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m");
+        if (r == 0) {
+                /* Child */
+
+                pair[0] = safe_close(pair[0]);
+
+                cfd = socket_accept_do(s, fd);
+                if (cfd == -EAGAIN) /* spurious accept() */
+                        _exit(EXIT_SUCCESS);
+                if (cfd < 0) {
+                        log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = send_one_fd(pair[1], cfd, 0);
+                if (r < 0) {
+                        log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        pair[1] = safe_close(pair[1]); /* Parent: only the receiving end pair[0] is used from here on */
+        cfd = receive_one_fd(pair[0], 0);
+
+        /* We synchronously wait for the helper, as it shouldn't be slow */
+        r = wait_for_terminate_and_check("(sd-accept)", pid.pid, WAIT_LOG_ABNORMAL);
+        if (r < 0) {
+                safe_close(cfd); /* cfd is whatever receive_one_fd() returned above, possibly a valid fd */
+                return r;
+        }
+
+        /* If we received no fd, we got EIO here. If this happens with a process exit code of EXIT_SUCCESS
+         * this is a spurious accept(), let's convert that back to EAGAIN here. */
+        if (cfd == -EIO)
+                return -EAGAIN;
+        if (cfd < 0)
+                return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m");
+
+        return cfd;
+
+shortcut: /* Plain accept() in the calling process, no helper process involved. */
+        cfd = socket_accept_do(s, fd);
+        if (cfd == -EAGAIN) /* spurious accept(), skip it silently */
+                return -EAGAIN;
+        if (cfd < 0)
+                return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m");
+
+        return cfd;
+}
+
+static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        SocketPort *p = ASSERT_PTR(userdata);
+        int cfd = -EBADF;
+        /* I/O event handler; 'userdata' is the SocketPort the event belongs to. Always returns 0. */
+        assert(fd >= 0);
+        /* Events are ignored unless the socket unit is in the listening state */
+        if (p->socket->state != SOCKET_LISTENING)
+                return 0;
+
+        log_unit_debug(UNIT(p->socket), "Incoming traffic");
+        /* Anything but a plain EPOLLIN is treated as a failure of the unit */
+        if (revents != EPOLLIN) {
+                if (revents & EPOLLHUP)
+                        log_unit_error(UNIT(p->socket), "Got POLLHUP on a listening socket. The service probably invoked shutdown() on it, and should better not do that.");
+                else
+                        log_unit_error(UNIT(p->socket), "Got unexpected poll event (0x%x) on socket.", revents);
+                goto fail;
+        }
+
+        if (p->socket->accept &&
+            p->type == SOCKET_SOCKET &&
+            socket_address_can_accept(&p->address)) {
+                /* Accept the connection ourselves; if this branch is skipped cfd stays -EBADF */
+                cfd = socket_accept_in_cgroup(p->socket, p, fd);
+                if (cfd == -EAGAIN) /* Spurious accept() */
+                        return 0;
+                if (cfd < 0)
+                        goto fail;
+
+                socket_apply_socket_options(p->socket, p, cfd);
+        }
+
+        socket_enter_running(p->socket, cfd);
+        return 0;
+
+fail:
+        socket_enter_stop_pre(p->socket, SOCKET_FAILURE_RESOURCES);
+        return 0;
+}
+
+static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+        Socket *s = SOCKET(u);
+        SocketResult f;
+
+        assert(s);
+        assert(pid >= 0);
+        /* Only the exit of our own control process is handled here */
+        if (pid != s->control_pid.pid)
+                return;
+
+        pidref_done(&s->control_pid);
+        /* Translate the exit information into a SocketResult */
+        if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+                f = SOCKET_SUCCESS;
+        else if (code == CLD_EXITED)
+                f = SOCKET_FAILURE_EXIT_CODE;
+        else if (code == CLD_KILLED)
+                f = SOCKET_FAILURE_SIGNAL;
+        else if (code == CLD_DUMPED)
+                f = SOCKET_FAILURE_CORE_DUMP;
+        else
+                assert_not_reached();
+
+        if (s->control_command) {
+                exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+                /* A command flagged with EXEC_COMMAND_IGNORE_FAILURE always counts as success */
+                if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+                        f = SOCKET_SUCCESS;
+        }
+
+        unit_log_process_exit(
+                        u,
+                        "Control process",
+                        socket_exec_command_to_string(s->control_command_id),
+                        f == SOCKET_SUCCESS,
+                        code, status);
+        /* s->result is only overwritten while it still reads success, i.e. the first failure sticks */
+        if (s->result == SOCKET_SUCCESS)
+                s->result = f;
+
+        if (s->control_command &&
+            s->control_command->command_next &&
+            f == SOCKET_SUCCESS) {
+
+                log_unit_debug(u, "Running next command for state %s", socket_state_to_string(s->state));
+                socket_run_next(s);
+        } else {
+                s->control_command = NULL;
+                s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+                /* No further commands for this step, so let's figure
+                 * out what to do next */
+
+                log_unit_debug(u, "Got final SIGCHLD for state %s", socket_state_to_string(s->state));
+
+                switch (s->state) {
+
+                case SOCKET_START_PRE:
+                        if (f == SOCKET_SUCCESS)
+                                socket_enter_start_chown(s);
+                        else
+                                socket_enter_signal(s, SOCKET_FINAL_SIGTERM, f);
+                        break;
+
+                case SOCKET_START_CHOWN:
+                        if (f == SOCKET_SUCCESS)
+                                socket_enter_start_post(s);
+                        else
+                                socket_enter_stop_pre(s, f);
+                        break;
+
+                case SOCKET_START_POST:
+                        if (f == SOCKET_SUCCESS)
+                                socket_enter_listening(s);
+                        else
+                                socket_enter_stop_pre(s, f);
+                        break;
+
+                case SOCKET_STOP_PRE:
+                case SOCKET_STOP_PRE_SIGTERM:
+                case SOCKET_STOP_PRE_SIGKILL:
+                        socket_enter_stop_post(s, f);
+                        break;
+
+                case SOCKET_STOP_POST:
+                case SOCKET_FINAL_SIGTERM:
+                case SOCKET_FINAL_SIGKILL:
+                        socket_enter_dead(s, f);
+                        break;
+
+                case SOCKET_CLEANING:
+                        /* The outcome of cleaning is tracked separately and does not fail the unit */
+                        if (s->clean_result == SOCKET_SUCCESS)
+                                s->clean_result = f;
+
+                        socket_enter_dead(s, SOCKET_SUCCESS);
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        /* Notify clients about changed exit status */
+        unit_add_to_dbus_queue(u);
+}
+
+static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Socket *s = SOCKET(userdata);
+
+        assert(s);
+        assert(s->timer_event_source == source);
+        /* Timer callback: what happens next depends on the state we timed out in. Always returns 0. */
+        switch (s->state) {
+
+        case SOCKET_START_PRE:
+                log_unit_warning(UNIT(s), "Starting timed out. Terminating.");
+                socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+                break;
+
+        case SOCKET_START_CHOWN:
+        case SOCKET_START_POST:
+                log_unit_warning(UNIT(s), "Starting timed out. Stopping.");
+                socket_enter_stop_pre(s, SOCKET_FAILURE_TIMEOUT);
+                break;
+
+        case SOCKET_STOP_PRE:
+                log_unit_warning(UNIT(s), "Stopping timed out. Terminating.");
+                socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+                break;
+
+        case SOCKET_STOP_PRE_SIGTERM:
+                if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+                        socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL. Ignoring.");
+                        socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT);
+                }
+                break;
+
+        case SOCKET_STOP_PRE_SIGKILL:
+                log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring.");
+                socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT);
+                break;
+
+        case SOCKET_STOP_POST:
+                log_unit_warning(UNIT(s), "Stopping timed out (2). Terminating.");
+                socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+                break;
+
+        case SOCKET_FINAL_SIGTERM:
+                if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "Stopping timed out (2). Killing.");
+                        socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "Stopping timed out (2). Skipping SIGKILL. Ignoring.");
+                        socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT);
+                }
+                break;
+
+        case SOCKET_FINAL_SIGKILL:
+                log_unit_warning(UNIT(s), "Still around after SIGKILL (2). Entering failed mode.");
+                socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT);
+                break;
+
+        case SOCKET_CLEANING:
+                log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+                /* Keep an earlier cleaning failure, if one was recorded already */
+                if (s->clean_result == SOCKET_SUCCESS)
+                        s->clean_result = SOCKET_FAILURE_TIMEOUT;
+
+                socket_enter_signal(s, SOCKET_FINAL_SIGKILL, 0);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+int socket_collect_fds(Socket *s, int **fds) {
+        size_t n_total = 0, n_filled = 0;
+        int *collected;
+
+        assert(s);
+        assert(fds);
+
+        /* Called from the service code to request our fds. Stores an array allocated with new() in
+         * *fds (NULL if there are no fds) and returns the number of entries, or -ENOMEM. */
+
+        LIST_FOREACH(port, sp, s->ports)
+                n_total += (sp->fd >= 0 ? 1 : 0) + sp->n_auxiliary_fds;
+
+        if (n_total == 0) {
+                *fds = NULL;
+                return 0;
+        }
+
+        collected = new(int, n_total);
+        if (!collected)
+                return -ENOMEM;
+
+        LIST_FOREACH(port, sp, s->ports) {
+                if (sp->fd >= 0)
+                        collected[n_filled++] = sp->fd;
+                for (size_t j = 0; j < sp->n_auxiliary_fds; j++) {
+                        collected[n_filled] = sp->auxiliary_fds[j];
+                        n_filled++;
+                }
+        }
+
+        assert(n_filled == n_total);
+
+        *fds = collected;
+        return (int) n_total;
+}
+
+static void socket_reset_failed(Unit *u) {
+        Socket *sock = SOCKET(u);
+
+        assert(sock);
+
+        /* Leave the failed state (if we are in it) and forget any recorded failure results */
+        if (sock->state == SOCKET_FAILED)
+                socket_set_state(sock, SOCKET_DEAD);
+
+        sock->clean_result = sock->result = SOCKET_SUCCESS;
+}
+
+void socket_connection_unref(Socket *s) {
+        unsigned remaining;
+
+        assert(s);
+
+        /* Called from the service code when a per-connection service ended: drops one connection
+         * from the counter, which must not be zero at this point. */
+        assert(s->n_connections > 0);
+
+        remaining = s->n_connections - 1;
+        s->n_connections = remaining;
+        log_unit_debug(UNIT(s), "One connection closed, %u left.", remaining);
+}
+
+static void socket_trigger_notify(Unit *u, Unit *other) {
+        Socket *s = SOCKET(u);
+        /* 'other' must be a fully loaded service unit (asserted below); its state is mirrored here */
+        assert(u);
+        assert(other);
+
+        /* Filter out invocations with bogus state */
+        assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+        assert(other->type == UNIT_SERVICE);
+
+        /* Don't propagate state changes from the service if we are already down */
+        if (!IN_SET(s->state, SOCKET_RUNNING, SOCKET_LISTENING))
+                return;
+
+        /* We don't care for the service state if we are in Accept=yes mode */
+        if (s->accept)
+                return;
+
+        /* Propagate start limit hit state */
+        if (other->start_limit_hit) {
+                socket_enter_stop_pre(s, SOCKET_FAILURE_SERVICE_START_LIMIT_HIT);
+                return;
+        }
+
+        /* Don't propagate anything if there's still a job queued */
+        if (other->job)
+                return;
+
+        if (IN_SET(SERVICE(other)->state,
+                   SERVICE_DEAD, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED, SERVICE_FAILED_BEFORE_AUTO_RESTART,
+                   SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+                   SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED))
+               socket_enter_listening(s);
+        /* Checked independently of the branch above: a running service puts us into SOCKET_RUNNING */
+        if (SERVICE(other)->state == SERVICE_RUNNING)
+                socket_set_state(s, SOCKET_RUNNING);
+}
+
+static int socket_get_timeout(Unit *u, usec_t *timeout) {
+        sd_event_source *timer = SOCKET(u)->timer_event_source;
+        usec_t when;
+        int r;
+
+        /* Returns 1 and stores the timer's time in *timeout if a finite timeout is set, 0 if there
+         * is no timer or it is set to infinity, or the error of sd_event_source_get_time(). */
+        if (!timer)
+                return 0;
+
+        r = sd_event_source_get_time(timer, &when);
+        if (r < 0)
+                return r;
+        if (when != USEC_INFINITY)
+                *timeout = when;
+        return when != USEC_INFINITY;
+}
+
+char *socket_fdname(Socket *s) {
+        assert(s);
+
+        /* Name to use for $LISTEN_FDNAMES: the explicitly configured one, else the unit's id. */
+        if (s->fdname)
+                return s->fdname;
+
+        return UNIT(s)->id;
+}
+
+static PidRef *socket_control_pid(Unit *u) {
+        return &ASSERT_PTR(SOCKET(u))->control_pid; /* address of the field inside the Socket, not a copy */
+}
+
+static int socket_clean(Unit *u, ExecCleanMask mask) {
+        _cleanup_strv_free_ char **l = NULL;
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(s);
+        assert(mask != 0);
+        /* Cleaning is refused with -EBUSY unless the unit is in the dead state */
+        if (s->state != SOCKET_DEAD)
+                return -EBUSY;
+        /* Collect the directories selected by 'mask'; they are handed to the removal task below */
+        r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+        if (r < 0)
+                return r;
+        /* An empty list means there is nothing to clean, reported as -EUNATCH */
+        if (strv_isempty(l))
+                return -EUNATCH;
+
+        socket_unwatch_control_pid(s);
+        s->clean_result = SOCKET_SUCCESS;
+        s->control_command = NULL;
+        s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+        r = socket_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to install timer: %m");
+                goto fail;
+        }
+        /* The pid of the spawned task is stored in s->control_pid */
+        r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+                goto fail;
+        }
+
+        socket_set_state(s, SOCKET_CLEANING);
+        return 0;
+
+fail:
+        s->clean_result = SOCKET_FAILURE_RESOURCES;
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+        return r;
+}
+
+static int socket_can_clean(Unit *u, ExecCleanMask *ret) {
+        Socket *sock = SOCKET(u);
+
+        /* Passes on whatever clean mask the unit's exec context yields */
+        assert(sock);
+        return exec_context_get_clean_mask(&sock->exec_context, ret);
+}
+
+static int socket_can_start(Unit *u) {
+        Socket *sock = SOCKET(u);
+        int r;
+
+        assert(sock);
+
+        /* Returns 1 if starting is fine. If the start limit test fails, socket_enter_dead() is
+         * invoked with a start-limit-hit result and the error is passed on to the caller. */
+        r = unit_test_start_limit(u);
+        if (r >= 0)
+                return 1;
+        socket_enter_dead(sock, SOCKET_FAILURE_START_LIMIT_HIT);
+        return r;
+}
+
+static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = {
+        [SOCKET_EXEC_START_PRE]   = "ExecStartPre",
+        [SOCKET_EXEC_START_CHOWN] = "ExecStartChown",
+        [SOCKET_EXEC_START_POST]  = "ExecStartPost",
+        [SOCKET_EXEC_STOP_PRE]    = "ExecStopPre",
+        [SOCKET_EXEC_STOP_POST]   = "ExecStopPost"
+};
+/* Provides socket_exec_command_to_string() and socket_exec_command_from_string() (see socket.h) */
+DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand);
+
+static const char* const socket_result_table[_SOCKET_RESULT_MAX] = {
+        [SOCKET_SUCCESS]                         = "success",
+        [SOCKET_FAILURE_RESOURCES]               = "resources",
+        [SOCKET_FAILURE_TIMEOUT]                 = "timeout",
+        [SOCKET_FAILURE_EXIT_CODE]               = "exit-code",
+        [SOCKET_FAILURE_SIGNAL]                  = "signal",
+        [SOCKET_FAILURE_CORE_DUMP]               = "core-dump",
+        [SOCKET_FAILURE_START_LIMIT_HIT]         = "start-limit-hit",
+        [SOCKET_FAILURE_TRIGGER_LIMIT_HIT]       = "trigger-limit-hit",
+        [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit"
+};
+/* Provides socket_result_to_string() and socket_result_from_string() (see socket.h) */
+DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult);
+
+static const char* const socket_timestamping_table[_SOCKET_TIMESTAMPING_MAX] = {
+        [SOCKET_TIMESTAMPING_OFF] = "off",
+        [SOCKET_TIMESTAMPING_US]  = "us",
+        [SOCKET_TIMESTAMPING_NS]  = "ns",
+};
+/* Strict lookup by canonical name; socket_timestamping_from_string_harder() accepts more spellings */
+DEFINE_STRING_TABLE_LOOKUP(socket_timestamping, SocketTimestamping);
+
+SocketTimestamping socket_timestamping_from_string_harder(const char *p) {
+        SocketTimestamping parsed;
+        int b;
+
+        /* Lenient parser: accepts the canonical names, the time unit aliases below and booleans.
+         * Anything else (including NULL) yields _SOCKET_TIMESTAMPING_INVALID. */
+        if (!p)
+                return _SOCKET_TIMESTAMPING_INVALID;
+
+        parsed = socket_timestamping_from_string(p);
+        if (parsed >= 0)
+                return parsed;
+
+        /* Also support the other aliases parse_time() accepts for µs and ns */
+        if (STR_IN_SET(p, "usec", "µs", "μs")) /* Both the greek small letter mu and the micro sign */
+                return SOCKET_TIMESTAMPING_US;
+        if (streq(p, "nsec"))
+                return SOCKET_TIMESTAMPING_NS;
+
+        b = parse_boolean(p);
+        if (b < 0)
+                return _SOCKET_TIMESTAMPING_INVALID;
+        return b > 0 ? SOCKET_TIMESTAMPING_NS : SOCKET_TIMESTAMPING_OFF; /* Boolean yes means ns accuracy */
+}
+
+const UnitVTable socket_vtable = {
+        .object_size = sizeof(Socket),
+        .exec_context_offset = offsetof(Socket, exec_context),
+        .cgroup_context_offset = offsetof(Socket, cgroup_context),
+        .kill_context_offset = offsetof(Socket, kill_context),
+        .exec_runtime_offset = offsetof(Socket, exec_runtime),
+        /* Section names, each terminated by NUL */
+        .sections =
+                "Unit\0"
+                "Socket\0"
+                "Install\0",
+        .private_section = "Socket",
+
+        .can_transient = true,
+        .can_trigger = true,
+        .can_fail = true,
+
+        .init = socket_init,
+        .done = socket_done,
+        .load = socket_load,
+
+        .coldplug = socket_coldplug,
+
+        .dump = socket_dump,
+
+        .start = socket_start,
+        .stop = socket_stop,
+
+        .clean = socket_clean,
+        .can_clean = socket_can_clean,
+
+        .get_timeout = socket_get_timeout,
+
+        .serialize = socket_serialize,
+        .deserialize_item = socket_deserialize_item,
+        .distribute_fds = socket_distribute_fds,
+
+        .active_state = socket_active_state,
+        .sub_state_to_string = socket_sub_state_to_string,
+
+        .will_restart = unit_will_restart_default,
+
+        .may_gc = socket_may_gc,
+
+        .sigchld_event = socket_sigchld_event,
+
+        .trigger_notify = socket_trigger_notify,
+
+        .reset_failed = socket_reset_failed,
+
+        .control_pid = socket_control_pid,
+
+        .bus_set_property = bus_socket_set_property,
+        .bus_commit_properties = bus_socket_commit_properties,
+        /* Status message formats for finished start/stop jobs, indexed by job result */
+        .status_message_formats = {
+                .finished_start_job = {
+                        [JOB_DONE]       = "Listening on %s.",
+                        [JOB_FAILED]     = "Failed to listen on %s.",
+                        [JOB_TIMEOUT]    = "Timed out starting %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Closed %s.",
+                        [JOB_FAILED]     = "Failed stopping %s.",
+                        [JOB_TIMEOUT]    = "Timed out stopping %s.",
+                },
+        },
+
+        .can_start = socket_can_start,
+};
diff --git a/src/core/socket.h b/src/core/socket.h
new file mode 100644
index 0000000..0983e8c
--- /dev/null
+++ b/src/core/socket.h
@@ -0,0 +1,204 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Socket Socket;
+typedef struct SocketPeer SocketPeer;
+
+#include "mount.h"
+#include "pidref.h"
+#include "socket-util.h"
+#include "unit.h"
+
+typedef enum SocketExecCommand {
+        SOCKET_EXEC_START_PRE,   /* "ExecStartPre" */
+        SOCKET_EXEC_START_CHOWN, /* "ExecStartChown" */
+        SOCKET_EXEC_START_POST,  /* "ExecStartPost" */
+        SOCKET_EXEC_STOP_PRE,    /* "ExecStopPre" */
+        SOCKET_EXEC_STOP_POST,   /* "ExecStopPost" */
+        _SOCKET_EXEC_COMMAND_MAX, /* number of commands; sizes Socket.exec_command[] and the string table */
+        _SOCKET_EXEC_COMMAND_INVALID = -EINVAL,
+} SocketExecCommand;
+
+typedef enum SocketType {
+        SOCKET_SOCKET, /* the only type for which socket_dispatch_io() accepts connections itself */
+        SOCKET_FIFO,
+        SOCKET_SPECIAL,
+        SOCKET_MQUEUE,
+        SOCKET_USB_FUNCTION,
+        _SOCKET_TYPE_MAX, /* number of valid types */
+        _SOCKET_TYPE_INVALID = -EINVAL,
+} SocketType;
+
+typedef enum SocketResult {
+        SOCKET_SUCCESS,                         /* "success" */
+        SOCKET_FAILURE_RESOURCES,               /* "resources" */
+        SOCKET_FAILURE_TIMEOUT,                 /* "timeout" */
+        SOCKET_FAILURE_EXIT_CODE,               /* "exit-code" */
+        SOCKET_FAILURE_SIGNAL,                  /* "signal" */
+        SOCKET_FAILURE_CORE_DUMP,               /* "core-dump" */
+        SOCKET_FAILURE_START_LIMIT_HIT,         /* "start-limit-hit" */
+        SOCKET_FAILURE_TRIGGER_LIMIT_HIT,       /* "trigger-limit-hit" */
+        SOCKET_FAILURE_SERVICE_START_LIMIT_HIT, /* "service-start-limit-hit" */
+        _SOCKET_RESULT_MAX,                     /* number of results; sizes the string table */
+        _SOCKET_RESULT_INVALID = -EINVAL,
+} SocketResult;
+
+typedef struct SocketPort {
+        Socket *socket; /* the socket unit this port belongs to */
+
+        SocketType type;
+        int fd; /* negative while there is no fd (socket_collect_fds() skips it then) */
+        int *auxiliary_fds;
+        size_t n_auxiliary_fds; /* number of entries in auxiliary_fds */
+
+        SocketAddress address;
+        char *path;
+        sd_event_source *event_source;
+
+        LIST_FIELDS(struct SocketPort, port); /* links the entries of Socket.ports */
+} SocketPort;
+
+typedef enum SocketTimestamping {
+        SOCKET_TIMESTAMPING_OFF, /* "off" */
+        SOCKET_TIMESTAMPING_US,  /* SO_TIMESTAMP */
+        SOCKET_TIMESTAMPING_NS,  /* SO_TIMESTAMPNS */
+        _SOCKET_TIMESTAMPING_MAX, /* number of valid values; sizes the string table */
+        _SOCKET_TIMESTAMPING_INVALID = -EINVAL,
+} SocketTimestamping;
+
+struct Socket {
+        Unit meta;
+
+        LIST_HEAD(SocketPort, ports);
+
+        Set *peers_by_address;
+
+        unsigned n_accepted;
+        unsigned n_connections; /* decremented by socket_connection_unref() */
+        unsigned n_refused;
+        unsigned max_connections;
+        unsigned max_connections_per_source;
+
+        unsigned backlog;
+        unsigned keep_alive_cnt;
+        usec_t timeout_usec;
+        usec_t keep_alive_time;
+        usec_t keep_alive_interval;
+        usec_t defer_accept;
+
+        ExecCommand* exec_command[_SOCKET_EXEC_COMMAND_MAX];
+        ExecContext exec_context;
+        KillContext kill_context;
+        CGroupContext cgroup_context;
+
+        ExecRuntime *exec_runtime;
+
+        /* For Accept=no sockets refers to the one service we'll
+         * activate. For Accept=yes sockets is either NULL, or filled
+         * to refer to the next service we spawn. */
+        UnitRef service;
+
+        SocketState state, deserialized_state;
+
+        sd_event_source *timer_event_source; /* the source socket_dispatch_timer() is invoked for */
+
+        ExecCommand* control_command; /* command currently being run, NULL if none */
+        SocketExecCommand control_command_id;
+        PidRef control_pid; /* socket_sigchld_event() only reacts to exits of this process */
+
+        mode_t directory_mode;
+        mode_t socket_mode;
+
+        SocketResult result;
+        SocketResult clean_result; /* result of cleaning, tracked separately from 'result' */
+
+        char **symlinks;
+
+        bool accept; /* Accept=yes mode */
+        bool remove_on_stop;
+        bool writable;
+        bool flush_pending;
+
+        int socket_protocol;
+
+        /* Socket options */
+        bool keep_alive;
+        bool no_delay;
+        bool free_bind;
+        bool transparent;
+        bool broadcast;
+        bool pass_cred;
+        bool pass_sec;
+        bool pass_pktinfo;
+        SocketTimestamping timestamping;
+
+        /* Only for INET6 sockets: issue IPV6_V6ONLY sockopt */
+        SocketAddressBindIPv6Only bind_ipv6_only;
+
+        int priority;
+        int mark;
+        size_t receive_buffer;
+        size_t send_buffer;
+        int ip_tos;
+        int ip_ttl;
+        size_t pipe_size;
+        char *bind_to_device;
+        char *tcp_congestion;
+        bool reuse_port;
+        long mq_maxmsg;
+        long mq_msgsize;
+
+        char *smack;
+        char *smack_ip_in;
+        char *smack_ip_out;
+
+        bool selinux_context_from_net;
+
+        char *user, *group;
+
+        char *fdname; /* may be NULL, in which case socket_fdname() falls back to the unit id */
+
+        RateLimit trigger_limit;
+        usec_t poll_limit_interval;
+        unsigned poll_limit_burst;
+};
+
+SocketPeer *socket_peer_ref(SocketPeer *p);
+SocketPeer *socket_peer_unref(SocketPeer *p);
+int socket_acquire_peer(Socket *s, int fd, SocketPeer **p);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref);
+
+/* Called from the service code when collecting fds */
+int socket_collect_fds(Socket *s, int **fds);
+
+/* Called from the service code when a per-connection service ended */
+void socket_connection_unref(Socket *s);
+
+SocketPort *socket_port_free(SocketPort *p);
+DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPort*, socket_port_free);
+
+void socket_free_ports(Socket *s);
+
+int socket_port_to_address(const SocketPort *s, char **ret);
+
+int socket_load_service_unit(Socket *s, int cfd, Unit **ret);
+/* Returns s->fdname if set, otherwise the id of the unit */
+char *socket_fdname(Socket *s);
+
+extern const UnitVTable socket_vtable;
+/* String table lookups; the tables live in socket.c */
+const char* socket_exec_command_to_string(SocketExecCommand i) _const_;
+SocketExecCommand socket_exec_command_from_string(const char *s) _pure_;
+
+const char* socket_result_to_string(SocketResult i) _const_;
+SocketResult socket_result_from_string(const char *s) _pure_;
+
+const char* socket_port_type_to_string(SocketPort *p) _pure_;
+SocketType socket_port_type_from_string(const char *p) _pure_;
+/* The _harder variant also accepts "nsec", "usec", "µs", "μs" and boolean values */
+const char* socket_timestamping_to_string(SocketTimestamping p) _const_;
+SocketTimestamping socket_timestamping_from_string(const char *p) _pure_;
+SocketTimestamping socket_timestamping_from_string_harder(const char *p) _pure_;
+
+DEFINE_CAST(SOCKET, Socket);
diff --git a/src/core/swap.c b/src/core/swap.c
new file mode 100644
index 0000000..488b171
--- /dev/null
+++ b/src/core/swap.c
@@ -0,0 +1,1680 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "dbus-swap.h"
+#include "dbus-unit.h"
+#include "device-util.h"
+#include "device.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fstab-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "swap.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "virt.h"
+
+static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = { /* SwapState -> UnitActiveState, see swap_active_state() */
+        [SWAP_DEAD] = UNIT_INACTIVE,
+        [SWAP_ACTIVATING] = UNIT_ACTIVATING,
+        [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE, /* reported as active already, same as SWAP_ACTIVE */
+        [SWAP_ACTIVE] = UNIT_ACTIVE,
+        [SWAP_DEACTIVATING] = UNIT_DEACTIVATING,
+        [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING,
+        [SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING,
+        [SWAP_FAILED] = UNIT_FAILED,
+        [SWAP_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); /* callback passed to unit_arm_timer() by swap_arm_timer() */
+static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int swap_process_proc_swaps(Manager *m);
+
+static bool SWAP_STATE_WITH_PROCESS(SwapState state) {
+        bool with_process;
+        /* True for the activating, deactivating and cleaning states listed below */
+        with_process = IN_SET(state,
+                              SWAP_ACTIVATING, SWAP_ACTIVATING_DONE,
+                              SWAP_DEACTIVATING, SWAP_DEACTIVATING_SIGTERM, SWAP_DEACTIVATING_SIGKILL,
+                              SWAP_CLEANING);
+        return with_process;
+}
+
+static UnitActiveState swap_active_state(Unit *u) {
+        Swap *swap = SWAP(u);
+        assert(u);
+        return state_translation_table[swap->state]; /* swap state -> generic unit state */
+}
+
+static const char *swap_sub_state_to_string(Unit *u) {
+        Swap *swap = SWAP(u);
+        assert(u);
+        return swap_state_to_string(swap->state); /* name of the swap-specific state */
+}
+
+static bool swap_may_gc(Unit *u) {
+        Swap *swap = SWAP(u);
+
+        assert(swap);
+
+        /* The unit may be garbage collected unless its from_proc_swaps flag is set, i.e. unless
+         * parameters recorded in parameters_proc_swaps are still attached to it. */
+
+        return !swap->from_proc_swaps;
+}
+
+static bool swap_is_extrinsic(Unit *u) {
+        assert(SWAP(u));
+        /* Every swap unit is reported as extrinsic when the manager is a user manager */
+        return MANAGER_IS_USER(u->manager);
+}
+
+static void swap_unset_proc_swaps(Swap *s) {
+        assert(s);
+
+        /* Forget the parameters recorded in parameters_proc_swaps, if there are any */
+        if (s->from_proc_swaps) {
+                s->from_proc_swaps = false;
+                s->parameters_proc_swaps.what = mfree(s->parameters_proc_swaps.what);
+        }
+}
+
+static int swap_set_devnode(Swap *s, const char *devnode) {
+        Hashmap *swaps;
+        Swap *first;
+        int r;
+
+        /* Moves 's' to the list of swaps sharing 'devnode' in manager->swaps_by_devnode; a NULL
+         * 'devnode' only unregisters it. Returns the hashmap_replace() result or a negative error. */
+        assert(s);
+        r = hashmap_ensure_allocated(&UNIT(s)->manager->swaps_by_devnode, &path_hash_ops);
+        if (r < 0)
+                return r;
+
+        swaps = UNIT(s)->manager->swaps_by_devnode;
+        if (s->devnode) {
+                /* Unlink from the old list and re-key the map to the new list head, if any */
+                first = hashmap_get(swaps, s->devnode);
+                LIST_REMOVE(same_devnode, first, s);
+                if (first)
+                        hashmap_replace(swaps, first->devnode, first);
+                else
+                        hashmap_remove(swaps, s->devnode);
+                s->devnode = mfree(s->devnode);
+        }
+        if (!devnode)
+                return 0;
+
+        s->devnode = strdup(devnode);
+        if (!s->devnode)
+                return -ENOMEM;
+        first = hashmap_get(swaps, s->devnode);
+        LIST_PREPEND(same_devnode, first, s);
+        r = hashmap_replace(swaps, first->devnode, first);
+        if (r < 0) { /* roll back, so that s->devnode is never set without a matching map entry */
+                LIST_REMOVE(same_devnode, first, s);
+                s->devnode = mfree(s->devnode);
+        }
+        return r;
+}
+
+static void swap_init(Unit *u) {
+        Swap *swap = SWAP(u);
+
+        assert(swap);
+        assert(UNIT(swap)->load_state == UNIT_STUB);
+
+        /* No control process and no control command yet */
+        swap->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+        swap->control_pid = PIDREF_NULL;
+
+        /* Take over the manager's defaults */
+        swap->exec_context.std_error = u->manager->defaults.std_error;
+        swap->exec_context.std_output = u->manager->defaults.std_output;
+        swap->timeout_usec = u->manager->defaults.timeout_start_usec;
+        u->ignore_on_isolate = true;
+}
+
+static void swap_unwatch_control_pid(Swap *s) {
+        assert(s);
+
+        /* If a control pid is set: stop watching it, then release the reference */
+        if (pidref_is_set(&s->control_pid)) {
+                unit_unwatch_pidref(UNIT(s), &s->control_pid);
+                pidref_done(&s->control_pid);
+        }
+}
+
+static void swap_done(Unit *u) {
+        Swap *s = SWAP(u);
+
+        assert(s);
+        /* Drop the /proc/swaps parameters and unregister from manager->swaps_by_devnode */
+        swap_unset_proc_swaps(s);
+        swap_set_devnode(s, NULL);
+
+        s->what = mfree(s->what);
+        s->parameters_fragment.what = mfree(s->parameters_fragment.what);
+        s->parameters_fragment.options = mfree(s->parameters_fragment.options);
+
+        s->exec_runtime = exec_runtime_free(s->exec_runtime);
+        exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
+        s->control_command = NULL;
+
+        swap_unwatch_control_pid(s);
+
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+}
+
+static int swap_arm_timer(Swap *s, bool relative, usec_t usec) {
+        assert(s);
+        /* Arms s->timer_event_source via unit_arm_timer(), with swap_dispatch_timer() as callback */
+        return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, swap_dispatch_timer);
+}
+
+static SwapParameters* swap_get_parameters(Swap *s) {
+        assert(s);
+
+        /* Parameters recorded in parameters_proc_swaps take precedence over the ones from the
+         * unit fragment; NULL is returned if neither source is flagged as available. */
+
+        if (s->from_proc_swaps)
+                return &s->parameters_proc_swaps;
+
+        return s->from_fragment ? &s->parameters_fragment : NULL;
+}
+
+static int swap_add_device_dependencies(Swap *s) {
+        UnitDependencyMask mask;
+        SwapParameters *p;
+        int r;
+
+        assert(s);
+        /* Nothing to add (return 0) as long as we don't know what we are backed by */
+        if (!s->what)
+                return 0;
+
+        p = swap_get_parameters(s);
+        if (!p || !p->what)
+                return 0;
+        /* The dependency mask records which parameter source the dependencies stem from */
+        mask = s->from_proc_swaps ? UNIT_DEPENDENCY_PROC_SWAP : UNIT_DEPENDENCY_FILE;
+
+        if (is_device_path(p->what)) {
+                r = unit_add_node_dependency(UNIT(s), p->what, UNIT_REQUIRES, mask);
+                if (r < 0)
+                        return r;
+
+                return unit_add_blockdev_dependency(UNIT(s), p->what, mask);
+        }
+
+        /* File based swap devices need to be ordered after systemd-remount-fs.service, since they might need
+         * a writable file system. */
+        return unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, mask);
+}
+
+static int swap_add_default_dependencies(Swap *s) {
+        Unit *unit;
+        int r;
+
+        assert(s);
+
+        unit = UNIT(s);
+        /* Nothing is added (return 0) if default dependencies are off for the unit, if the manager
+         * is not the system manager, or if detect_container() reports a positive value. */
+        if (!unit->default_dependencies ||
+            !MANAGER_IS_SYSTEM(unit->manager) ||
+            detect_container() > 0)
+                return 0;
+
+        /* Swap units generated for the swap dev links lack the ordering dependency against the
+         * swap target, hence add it here. */
+        r = unit_add_dependency_by_name(unit, UNIT_BEFORE, SPECIAL_SWAP_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        return unit_add_two_dependencies_by_name(unit, UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* Sanity-checks a loaded swap unit: the unit must carry the name derived from its What= path, and
+ * PAM may only be used together with the 'control-group' kill mode. Returns 0 if the unit is
+ * acceptable, a negative errno-style value (after logging) otherwise. */
+static int swap_verify(Swap *s) {
+        _cleanup_free_ char *expected_name = NULL;
+        Unit *unit = UNIT(s);
+        int r;
+
+        assert(unit->load_state == UNIT_LOADED);
+
+        r = unit_name_from_path(s->what, ".swap", &expected_name);
+        if (r < 0)
+                return log_unit_error_errno(unit, r, "Failed to generate unit name from path: %m");
+
+        if (!unit_has_name(unit, expected_name))
+                return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ENOEXEC), "Value of What= and unit name do not match, not loading.");
+
+        if (s->kill_context.kill_mode != KILL_CONTROL_GROUP && s->exec_context.pam_name)
+                return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing to load.");
+
+        return 0;
+}
+
+/* If What= refers to an existing block device, looks up its device node name and hands it to
+ * swap_set_devnode(). Paths that cannot be stat()ed or are not block devices are silently ignored,
+ * and a failed device node lookup is only logged; both cases return 0. */
+static int swap_load_devnode(Swap *s) {
+        _cleanup_free_ char *devnode = NULL;
+        struct stat st;
+        int r;
+
+        assert(s);
+
+        if (stat(s->what, &st) < 0)
+                return 0;
+
+        if (!S_ISBLK(st.st_mode))
+                return 0;
+
+        r = devname_from_stat_rdev(&st, &devnode);
+        if (r >= 0)
+                return swap_set_devnode(s, devnode);
+
+        /* A missing device is expected at times, hence only log that case at debug level. */
+        log_unit_full_errno(UNIT(s), r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                            "Failed to get device node for swap %s: %m", s->what);
+        return 0;
+}
+
+/* Fills in everything that is derived rather than configured: the What= path (taken from the
+ * fragment parameters, else the /proc/swaps parameters, else the unit name), a default description,
+ * mount/device dependencies, the device node, exec contexts, the default slice and the default
+ * dependencies. Returns 0 on success, a negative errno-style value on the first failure. */
+static int swap_add_extras(Swap *s) {
+        Unit *unit;
+        int r;
+
+        assert(s);
+
+        unit = UNIT(s);
+
+        if (unit->fragment_path)
+                s->from_fragment = true;
+
+        if (!s->what) {
+                const char *configured = s->parameters_fragment.what ?: s->parameters_proc_swaps.what;
+
+                if (configured)
+                        s->what = strdup(configured);
+                else {
+                        /* Nothing configured anywhere, derive the path from the unit name. */
+                        r = unit_name_to_path(unit->id, &s->what);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!s->what)
+                        return -ENOMEM;
+        }
+
+        path_simplify(s->what);
+
+        if (!unit->description) {
+                r = unit_set_description(unit, s->what);
+                if (r < 0)
+                        return r;
+        }
+
+        r = unit_require_mounts_for(unit, s->what, UNIT_DEPENDENCY_IMPLICIT);
+        if (r < 0)
+                return r;
+
+        r = swap_add_device_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = swap_load_devnode(s);
+        if (r < 0)
+                return r;
+
+        r = unit_patch_contexts(unit);
+        if (r < 0)
+                return r;
+
+        r = unit_add_exec_dependencies(unit, &s->exec_context);
+        if (r < 0)
+                return r;
+
+        r = unit_set_default_slice(unit);
+        if (r < 0)
+                return r;
+
+        r = swap_add_default_dependencies(s);
+        return r < 0 ? r : 0;
+}
+
+/* Loads a .swap unit. The fragment is optional if the swap is already known from /proc/swaps. The
+ * derived settings are added whenever the unit ended up loaded or is known from /proc/swaps, even if
+ * fragment loading itself reported an error; that error is still returned afterwards, ahead of any
+ * error from adding the extras. Only units that are actually loaded get verified. */
+static int swap_load(Unit *u) {
+        Swap *s = SWAP(u);
+        int load_r, extras_r = 0;
+
+        assert(s);
+        assert(u->load_state == UNIT_STUB);
+
+        /* Load a .swap file */
+        load_r = unit_load_fragment_and_dropin(u, !s->from_proc_swaps);
+
+        /* Add in some extras, and do so either when we successfully loaded something or when
+         * /proc/swaps is already active. */
+        if (s->from_proc_swaps || u->load_state == UNIT_LOADED)
+                extras_r = swap_add_extras(s);
+
+        if (load_r < 0)
+                return load_r;
+
+        if (extras_r < 0)
+                return extras_r;
+
+        return u->load_state == UNIT_LOADED ? swap_verify(s) : 0;
+}
+
+/* Creates or refreshes the swap unit named after 'what' based on a /proc/swaps entry.
+ *
+ * 'what' is the path the unit name is derived from (and the What= of a newly created unit);
+ * 'what_proc_swaps' is the path stored in parameters_proc_swaps.what if that is not set yet.
+ * 'priority' is always recorded in the /proc/swaps parameters. If 'set_flags' is true the unit is
+ * flagged is_active, and just_activated if it was not marked from_proc_swaps before this call.
+ *
+ * Returns 0 on success and a negative errno-style value on failure (-EEXIST if an existing unit
+ * already tracks a different /proc/swaps path). A unit allocated by this call is freed again on
+ * failure; a pre-existing unit is left in place. */
+static int swap_setup_unit(
+                Manager *m,
+                const char *what,
+                const char *what_proc_swaps,
+                int priority,
+                bool set_flags) {
+
+        _cleanup_free_ char *e = NULL;
+        bool delete = false;
+        Unit *u = NULL;
+        int r;
+        SwapParameters *p;
+
+        assert(m);
+        assert(what);
+        assert(what_proc_swaps);
+
+        r = unit_name_from_path(what, ".swap", &e);
+        if (r < 0)
+                /* NOTE(review): u is still NULL here; presumably the log_unit_*() macros tolerate a
+                 * NULL unit — confirm. */
+                return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m");
+
+        /* Refuse to let one unit track two different /proc/swaps paths. */
+        u = manager_get_unit(m, e);
+        if (u &&
+            SWAP(u)->from_proc_swaps &&
+            !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps))
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                       "Swap %s appeared twice with different device paths %s and %s",
+                                       e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps);
+
+        if (!u) {
+                /* No unit of this name yet: we allocate one, hence we own it until we succeed. */
+                delete = true;
+
+                r = unit_new_for_name(m, sizeof(Swap), e, &u);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "Failed to load swap unit: %m");
+                        goto fail;
+                }
+
+                SWAP(u)->what = strdup(what);
+                if (!SWAP(u)->what) {
+                        r = log_oom();
+                        goto fail;
+                }
+
+                unit_add_to_load_queue(u);
+        } else
+                delete = false;
+
+        p = &SWAP(u)->parameters_proc_swaps;
+
+        /* Keep an already recorded /proc/swaps path, only fill it in when missing. */
+        if (!p->what) {
+                p->what = strdup(what_proc_swaps);
+                if (!p->what) {
+                        r = log_oom();
+                        goto fail;
+                }
+        }
+
+        /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be
+         * loaded. After all we can load it now, from the data in /proc/swaps. */
+        if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+                u->load_state = UNIT_LOADED;
+                u->load_error = 0;
+        }
+
+        if (set_flags) {
+                SWAP(u)->is_active = true;
+                /* Must be evaluated before from_proc_swaps is set below. */
+                SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps;
+        }
+
+        SWAP(u)->from_proc_swaps = true;
+
+        p->priority = priority;
+        p->priority_set = true;
+
+        unit_add_to_dbus_queue(u);
+        return 0;
+
+fail:
+        /* Only release the unit if it was allocated by this very call. */
+        if (delete)
+                unit_free(u);
+
+        return r;
+}
+
+/* Registers a swap entry found in /proc/swaps. A unit is set up for the path as listed and, if that
+ * path is a block device, additional units are set up for its main device node and for each of its
+ * symlinks, so that all names of the device are covered. Failures for the extra names are ignored. */
+static void swap_process_new(Manager *m, const char *device, int prio, bool set_flags) {
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        struct stat st, st_link;
+        const char *devname;
+        int r;
+
+        assert(m);
+
+        r = swap_setup_unit(m, device, device, prio, set_flags);
+        if (r < 0)
+                return;
+
+        /* If this is a block device, then let's add duplicates for all other names of this block
+         * device */
+        if (stat(device, &st) < 0)
+                return;
+        if (!S_ISBLK(st.st_mode))
+                return;
+
+        r = sd_device_new_from_stat_rdev(&dev, &st);
+        if (r < 0) {
+                (void) log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                                      "Failed to allocate device for swap %s: %m", device);
+                return;
+        }
+
+        /* Add the main device node */
+        if (sd_device_get_devname(dev, &devname) >= 0 && !streq(devname, device))
+                (void) swap_setup_unit(m, devname, device, prio, set_flags);
+
+        /* Add additional units for all symlinks */
+        FOREACH_DEVICE_DEVLINK(dev, devlink) {
+
+                /* Skip the name we already handled, and don't bother with the /dev/block links */
+                if (streq(devlink, device) || path_startswith(devlink, "/dev/block/"))
+                        continue;
+
+                /* Skip links that exist but point to something other than this block device. */
+                if (stat(devlink, &st_link) >= 0 &&
+                    (!S_ISBLK(st_link.st_mode) ||
+                     st_link.st_rdev != st.st_rdev))
+                        continue;
+
+                (void) swap_setup_unit(m, devlink, device, prio, set_flags);
+        }
+}
+
+/* Switches the unit to 'state'. When the new state has no control process, the timer, the control
+ * PID watch and the control command are dropped. unit_notify() is always invoked, even if the state
+ * did not change, and any jobs queued on other units of the same device node are re-queued. */
+static void swap_set_state(Swap *s, SwapState state) {
+        SwapState previous;
+
+        assert(s);
+
+        previous = s->state;
+
+        if (previous != state)
+                bus_unit_send_pending_change_signal(UNIT(s), false);
+
+        s->state = state;
+
+        if (!SWAP_STATE_WITH_PROCESS(state)) {
+                s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+                swap_unwatch_control_pid(s);
+                s->control_command = NULL;
+                s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+        }
+
+        if (previous != state)
+                log_unit_debug(UNIT(s), "Changed %s -> %s", swap_state_to_string(previous), swap_state_to_string(state));
+
+        unit_notify(UNIT(s), state_translation_table[previous], state_translation_table[state], /* reload_success = */ true);
+
+        /* If other units for the same device node have a job queued it might be worth checking again
+         * whether it is runnable now. This is necessary, since swap_start() refuses operation with
+         * EAGAIN if there's already another job for the same device node queued. */
+        LIST_FOREACH_OTHERS(same_devnode, other, s)
+                if (UNIT(other)->job)
+                        job_add_to_run_queue(UNIT(other)->job);
+}
+
+/* Brings a freshly loaded unit (still in SWAP_DEAD) into the state it should have: the deserialized
+ * state if that differs from the current one, otherwise SWAP_ACTIVE if the swap is listed in
+ * /proc/swaps. If a control process is still around and the target state expects one, it is watched
+ * again and the timeout is re-armed relative to the last state change. */
+static int swap_coldplug(Unit *u) {
+        Swap *s = SWAP(u);
+        SwapState target;
+        int r;
+
+        assert(s);
+        assert(s->state == SWAP_DEAD);
+
+        target = s->deserialized_state != s->state ? s->deserialized_state :
+                 s->from_proc_swaps ? SWAP_ACTIVE :
+                 SWAP_DEAD;
+
+        if (target == s->state)
+                return 0;
+
+        if (pidref_is_set(&s->control_pid) &&
+            pidref_is_unwaited(&s->control_pid) > 0 &&
+            SWAP_STATE_WITH_PROCESS(target)) {
+
+                r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
+                if (r < 0)
+                        return r;
+
+                r = swap_arm_timer(s, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec));
+                if (r < 0)
+                        return r;
+        }
+
+        /* Any state other than dead/failed gets an exec runtime set up; failure is ignored. */
+        if (target != SWAP_DEAD && target != SWAP_FAILED)
+                (void) unit_setup_exec_runtime(u);
+
+        swap_set_state(s, target);
+        return 0;
+}
+
+/* Writes a human-readable description of the swap unit to 'f', each line prefixed with 'prefix'.
+ * Priority and options are only shown if a parameter set is in effect, i.e. if the unit is known from
+ * /proc/swaps or from a fragment. */
+static void swap_dump(Unit *u, FILE *f, const char *prefix) {
+        Swap *s = SWAP(u);
+        SwapParameters *p;
+
+        assert(s);
+        assert(f);
+
+        /* Use the shared helper rather than duplicating the /proc/swaps-before-fragment selection
+         * here, so that both places cannot drift apart. */
+        p = swap_get_parameters(s);
+
+        fprintf(f,
+                "%sSwap State: %s\n"
+                "%sResult: %s\n"
+                "%sClean Result: %s\n"
+                "%sWhat: %s\n"
+                "%sFrom /proc/swaps: %s\n"
+                "%sFrom fragment: %s\n"
+                "%sExtrinsic: %s\n",
+                prefix, swap_state_to_string(s->state),
+                prefix, swap_result_to_string(s->result),
+                prefix, swap_result_to_string(s->clean_result),
+                prefix, s->what,
+                prefix, yes_no(s->from_proc_swaps),
+                prefix, yes_no(s->from_fragment),
+                prefix, yes_no(swap_is_extrinsic(u)));
+
+        if (s->devnode)
+                fprintf(f, "%sDevice Node: %s\n", prefix, s->devnode);
+
+        if (p)
+                fprintf(f,
+                        "%sPriority: %i\n"
+                        "%sOptions: %s\n",
+                        prefix, p->priority,
+                        prefix, strempty(p->options));
+
+        fprintf(f,
+                "%sTimeoutSec: %s\n",
+                prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC));
+
+        if (pidref_is_set(&s->control_pid))
+                fprintf(f,
+                        "%sControl PID: "PID_FMT"\n",
+                        prefix, s->control_pid.pid);
+
+        exec_context_dump(&s->exec_context, f, prefix);
+        kill_context_dump(&s->kill_context, f, prefix);
+        cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+/* Forks off the control command 'c' for this unit: prepares execution, arms the timeout timer,
+ * spawns the process and starts watching it. On success the PID reference of the new process is
+ * moved into *ret_pid and 0 is returned; on failure a negative errno-style value is returned and
+ * *ret_pid is left untouched. */
+static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) {
+
+        _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
+                        EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
+        _cleanup_(pidref_done) PidRef child = PIDREF_NULL;
+        pid_t child_pid;
+        Unit *unit;
+        int r;
+
+        assert(s);
+        assert(c);
+        assert(ret_pid);
+
+        unit = UNIT(s);
+
+        r = unit_prepare_exec(unit);
+        if (r < 0)
+                return r;
+
+        r = swap_arm_timer(s, /* relative= */ true, s->timeout_usec);
+        if (r < 0)
+                return r;
+
+        r = unit_set_exec_params(unit, &exec_params);
+        if (r < 0)
+                return r;
+
+        r = exec_spawn(unit, c, &s->exec_context, &exec_params, s->exec_runtime, &s->cgroup_context, &child_pid);
+        if (r < 0)
+                return r;
+
+        r = pidref_set_pid(&child, child_pid);
+        if (r < 0)
+                return r;
+
+        r = unit_watch_pidref(unit, &child, /* exclusive= */ true);
+        if (r < 0)
+                return r;
+
+        /* Hand ownership of the PID reference over to the caller. */
+        *ret_pid = TAKE_PIDREF(child);
+        return 0;
+}
+
+/* Moves the unit to its final state: SWAP_DEAD if the accumulated result is a success, SWAP_FAILED
+ * otherwise. 'f' is only recorded if no earlier failure has been recorded. Afterwards the exec
+ * runtime, the runtime data and the UID/GID references are released. */
+static void swap_enter_dead(Swap *s, SwapResult f) {
+        Unit *unit;
+
+        assert(s);
+
+        unit = UNIT(s);
+
+        if (s->result == SWAP_SUCCESS)
+                s->result = f;
+
+        unit_log_result(unit, s->result == SWAP_SUCCESS, swap_result_to_string(s->result));
+        unit_warn_leftover_processes(unit, unit_log_leftover_process_stop);
+        swap_set_state(s, s->result == SWAP_SUCCESS ? SWAP_DEAD : SWAP_FAILED);
+
+        s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
+
+        unit_destroy_runtime_data(unit, &s->exec_context);
+
+        unit_unref_uid_gid(unit, true);
+}
+
+/* Moves the unit to SWAP_ACTIVE, recording 'f' as the result unless a failure was recorded before. */
+static void swap_enter_active(Swap *s, SwapResult f) {
+        assert(s);
+
+        /* Never overwrite an earlier non-success result. */
+        if (s->result == SWAP_SUCCESS)
+                s->result = f;
+
+        swap_set_state(s, SWAP_ACTIVE);
+}
+
+/* Settles the unit after an operation finished: if the swap is listed in /proc/swaps it becomes
+ * active, and so do all other units of the same device node that have a job pending; otherwise the
+ * unit becomes dead (or failed, depending on the result). */
+static void swap_enter_dead_or_active(Swap *s, SwapResult f) {
+        assert(s);
+
+        if (!s->from_proc_swaps) {
+                swap_enter_dead(s, f);
+                return;
+        }
+
+        swap_enter_active(s, f);
+
+        LIST_FOREACH_OTHERS(same_devnode, other, s)
+                if (UNIT(other)->job)
+                        swap_enter_dead_or_active(other, f);
+}
+
+/* Maps a deactivation state to the kill operation to use: for SWAP_DEACTIVATING_SIGTERM this is
+ * KILL_RESTART if the unit has a restart job and KILL_TERMINATE otherwise; every other state maps
+ * to KILL_KILL. */
+static int state_to_kill_operation(Swap *s, SwapState state) {
+        if (state != SWAP_DEACTIVATING_SIGTERM)
+                return KILL_KILL;
+
+        return unit_has_job_type(UNIT(s), JOB_RESTART) ? KILL_RESTART : KILL_TERMINATE;
+}
+
+/* Kills the unit's processes as appropriate for 'state' (one of the SIGTERM/SIGKILL deactivation
+ * states). If unit_kill_context() returns a positive value, the timeout is armed and the unit
+ * enters 'state'. If it returns zero, the SIGTERM stage escalates directly to SIGKILL (if
+ * permitted), and otherwise the unit settles as dead or active. Errors settle the unit with
+ * SWAP_FAILURE_RESOURCES. */
+static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
+        int r;
+
+        assert(s);
+
+        if (s->result == SWAP_SUCCESS)
+                s->result = f;
+
+        r = unit_kill_context(
+                        UNIT(s),
+                        &s->kill_context,
+                        state_to_kill_operation(s, state),
+                        /* main_pid= */ NULL,
+                        &s->control_pid,
+                        /* main_pid_alien= */ false);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+                goto fail;
+        }
+
+        if (r == 0) {
+                /* No timer is armed in this case, so move on right away. */
+                if (state == SWAP_DEACTIVATING_SIGTERM && s->kill_context.send_sigkill)
+                        swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+                else
+                        swap_enter_dead_or_active(s, SWAP_SUCCESS);
+
+                return;
+        }
+
+        r = swap_arm_timer(s, /* relative= */ true, s->timeout_usec);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+                goto fail;
+        }
+
+        swap_set_state(s, state);
+        return;
+
+fail:
+        swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+/* Assembles the swapon command line for this unit and spawns it, entering SWAP_ACTIVATING on
+ * success. For fragment-configured units the Priority= setting is folded into the option string
+ * unless Options= already specifies a priority. Any failure settles the unit via
+ * swap_enter_dead_or_active() with SWAP_FAILURE_RESOURCES. */
+static void swap_enter_activating(Swap *s) {
+        _cleanup_free_ char *opts = NULL;
+        int r;
+
+        assert(s);
+
+        unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start);
+
+        s->control_command_id = SWAP_EXEC_ACTIVATE;
+        s->control_command = s->exec_command + SWAP_EXEC_ACTIVATE;
+
+        if (s->from_fragment) {
+                int priority = 0;
+
+                /* A positive return value is treated below as "Options= already carries a priority". */
+                r = fstab_find_pri(s->parameters_fragment.options, &priority);
+                if (r < 0)
+                        log_unit_warning_errno(UNIT(s), r, "Failed to parse swap priority \"%s\", ignoring: %m", s->parameters_fragment.options);
+                else if (r > 0 && s->parameters_fragment.priority_set)
+                        log_unit_warning(UNIT(s), "Duplicate swap priority configuration by Priority= and Options= fields.");
+
+                /* No usable priority in Options= (none found, or parsing failed): append the one from
+                 * Priority= to the option string. Note that r is reused for the asprintf() result. */
+                if (r <= 0 && s->parameters_fragment.priority_set) {
+                        if (s->parameters_fragment.options)
+                                r = asprintf(&opts, "%s,pri=%i", s->parameters_fragment.options, s->parameters_fragment.priority);
+                        else
+                                r = asprintf(&opts, "pri=%i", s->parameters_fragment.priority);
+                        if (r < 0) {
+                                r = log_oom();
+                                goto fail;
+                        }
+                }
+        }
+
+        r = exec_command_set(s->control_command, "/sbin/swapon", "--fixpgsz", NULL);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to initialize swapon command line: %m");
+                goto fail;
+        }
+
+        /* Pass options via -o, preferring the string with the appended priority if one was built. */
+        if (s->parameters_fragment.options || opts) {
+                r = exec_command_append(s->control_command, "-o",
+                                opts ?: s->parameters_fragment.options, NULL);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapon command line: %m");
+                        goto fail;
+                }
+        }
+
+        r = exec_command_append(s->control_command, s->what, NULL);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapon command line: %m");
+                goto fail;
+        }
+
+        /* Stop watching any previous control process before a new one is recorded. */
+        swap_unwatch_control_pid(s);
+
+        r = swap_spawn(s, s->control_command, &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'swapon' task: %m");
+                goto fail;
+        }
+
+        swap_set_state(s, SWAP_ACTIVATING);
+        return;
+
+fail:
+        swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+/* Assembles the swapoff command line for this unit and spawns it, entering SWAP_DEACTIVATING on
+ * success. If the command line cannot be prepared or the process cannot be spawned, the unit is
+ * settled via swap_enter_dead_or_active() with SWAP_FAILURE_RESOURCES. */
+static void swap_enter_deactivating(Swap *s) {
+        int r;
+
+        assert(s);
+
+        s->control_command_id = SWAP_EXEC_DEACTIVATE;
+        s->control_command = s->exec_command + SWAP_EXEC_DEACTIVATE;
+
+        r = exec_command_set(s->control_command, "/sbin/swapoff", s->what, NULL);
+        if (r < 0)
+                log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapoff command line: %m");
+        else {
+                swap_unwatch_control_pid(s);
+
+                r = swap_spawn(s, s->control_command, &s->control_pid);
+                if (r >= 0) {
+                        swap_set_state(s, SWAP_DEACTIVATING);
+                        return;
+                }
+
+                log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'swapoff' task: %m");
+        }
+
+        /* Either step above failed. */
+        swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+/* Resets the per-cycle bookkeeping before a new activation cycle: the result goes back to success,
+ * the exit status of all control commands is cleared and accounting is flagged for reset. */
+static void swap_cycle_clear(Swap *s) {
+        assert(s);
+
+        UNIT(s)->reset_accounting = true;
+        s->result = SWAP_SUCCESS;
+        exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
+}
+
+/* Start operation of the swap unit. Returns -EAGAIN while a deactivation/cleaning is in progress
+ * or while a job of another unit for the same device node is running, 0 if activation is already
+ * under way, -EPERM when detect_container() reports a container, and 1 once activation has been
+ * initiated. */
+static int swap_start(Unit *u) {
+        Swap *s = SWAP(u);
+        int r;
+
+        assert(s);
+
+        switch (s->state) {
+
+        case SWAP_DEACTIVATING:
+        case SWAP_DEACTIVATING_SIGTERM:
+        case SWAP_DEACTIVATING_SIGKILL:
+        case SWAP_CLEANING:
+                /* We cannot fulfill this request right now, try again later please! */
+                return -EAGAIN;
+
+        case SWAP_ACTIVATING:
+                /* Already on it! */
+                return 0;
+
+        default:
+                break;
+        }
+
+        assert(IN_SET(s->state, SWAP_DEAD, SWAP_FAILED));
+
+        if (detect_container() > 0)
+                return -EPERM;
+
+        /* If there's a job for another swap unit for the same node running, then let's not dispatch
+         * this one for now, and wait until that other job has finished. */
+        LIST_FOREACH_OTHERS(same_devnode, other, s) {
+                Job *other_job = UNIT(other)->job;
+
+                if (other_job && other_job->state == JOB_RUNNING)
+                        return -EAGAIN;
+        }
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        swap_cycle_clear(s);
+        swap_enter_activating(s);
+        return 1;
+}
+
+/* Stop operation of the swap unit. Returns 0 if deactivation is already under way or if a pending
+ * control process was sent into kill mode, 1 if swapoff was initiated for an active swap, and -EPERM
+ * for an active swap when detect_container() reports a container. Must not be called in any other
+ * state. */
+static int swap_stop(Unit *u) {
+        Swap *s = SWAP(u);
+
+        assert(s);
+
+        /* Already on it */
+        if (IN_SET(s->state,
+                   SWAP_DEACTIVATING,
+                   SWAP_DEACTIVATING_SIGTERM,
+                   SWAP_DEACTIVATING_SIGKILL))
+                return 0;
+
+        /* There's a control process pending, directly enter kill mode */
+        if (IN_SET(s->state, SWAP_ACTIVATING, SWAP_ACTIVATING_DONE)) {
+                swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_SUCCESS);
+                return 0;
+        }
+
+        if (s->state == SWAP_ACTIVE) {
+                if (detect_container() > 0)
+                        return -EPERM;
+
+                swap_enter_deactivating(s);
+                return 1;
+        }
+
+        /* If we are currently cleaning, then abort it, brutally. */
+        if (s->state == SWAP_CLEANING) {
+                swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+                return 0;
+        }
+
+        assert_not_reached();
+}
+
+/* Serializes the runtime state of the swap unit: state, result, control PID and, if one is set,
+ * the control command. Serialization errors are ignored; always returns 0. */
+static int swap_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Swap *s = SWAP(u);
+
+        assert(s);
+        assert(f);
+        assert(fds);
+
+        (void) serialize_item(f, "state", swap_state_to_string(s->state));
+        (void) serialize_item(f, "result", swap_result_to_string(s->result));
+        (void) serialize_pidref(f, fds, "control-pid", &s->control_pid);
+
+        /* No control command recorded, nothing more to write. */
+        if (s->control_command_id < 0)
+                return 0;
+
+        (void) serialize_item(f, "control-command", swap_exec_command_to_string(s->control_command_id));
+        return 0;
+}
+
+/* Restores one serialized key/value pair into the swap unit. Unparsable values and unknown keys are
+ * logged at debug level and otherwise ignored; always returns 0. */
+static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Swap *s = SWAP(u);
+
+        assert(s);
+        assert(fds);
+
+        if (streq(key, "state")) {
+                SwapState state = swap_state_from_string(value);
+
+                if (state >= 0)
+                        s->deserialized_state = state;
+                else
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+
+                return 0;
+        }
+
+        if (streq(key, "result")) {
+                SwapResult result = swap_result_from_string(value);
+
+                if (result < 0)
+                        log_unit_debug(u, "Failed to parse result value: %s", value);
+                else if (result != SWAP_SUCCESS)
+                        /* Only failures are restored. */
+                        s->result = result;
+
+                return 0;
+        }
+
+        if (streq(key, "control-pid")) {
+                /* Drop whatever reference we currently hold before taking the deserialized one. */
+                pidref_done(&s->control_pid);
+                (void) deserialize_pidref(fds, value, &s->control_pid);
+                return 0;
+        }
+
+        if (streq(key, "control-command")) {
+                SwapExecCommand id = swap_exec_command_from_string(value);
+
+                if (id >= 0) {
+                        s->control_command_id = id;
+                        s->control_command = s->exec_command + id;
+                } else
+                        log_unit_debug(u, "Failed to parse exec-command value: %s", value);
+
+                return 0;
+        }
+
+        log_unit_debug(u, "Unknown serialization key: %s", key);
+        return 0;
+}
+
+/* Handles the exit of a child process of the unit. Only the control process is of interest; exits
+ * of other PIDs are ignored. /proc/swaps is rescanned first, then the exit is translated into a
+ * SwapResult, recorded in the command's exec status, logged, and finally the state machine is
+ * advanced depending on the state the unit was in. */
+static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+        Swap *s = SWAP(u);
+        SwapExecCommand command_id;
+        SwapResult f;
+
+        assert(s);
+        assert(pid >= 0);
+
+        if (pid != s->control_pid.pid)
+                return;
+
+        /* Let's scan /proc/swaps before we process SIGCHLD. For the reasoning see the similar code in
+         * mount.c */
+        (void) swap_process_proc_swaps(u->manager);
+
+        pidref_done(&s->control_pid);
+
+        if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+                f = SWAP_SUCCESS;
+        else if (code == CLD_EXITED)
+                f = SWAP_FAILURE_EXIT_CODE;
+        else if (code == CLD_KILLED)
+                f = SWAP_FAILURE_SIGNAL;
+        else if (code == CLD_DUMPED)
+                f = SWAP_FAILURE_CORE_DUMP;
+        else
+                assert_not_reached();
+
+        if (s->result == SWAP_SUCCESS)
+                s->result = f;
+
+        /* Remember which command just finished before the bookkeeping below resets the ID, so that
+         * the exit log message can still name it. */
+        command_id = s->control_command_id;
+
+        if (s->control_command) {
+                exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+                s->control_command = NULL;
+                s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+        }
+
+        unit_log_process_exit(
+                        u,
+                        "Swap process",
+                        swap_exec_command_to_string(command_id),
+                        f == SWAP_SUCCESS,
+                        code, status);
+
+        switch (s->state) {
+
+        case SWAP_ACTIVATING:
+        case SWAP_ACTIVATING_DONE:
+
+                /* Even if swapon failed, the swap counts as active if it shows up in /proc/swaps. */
+                if (f == SWAP_SUCCESS || s->from_proc_swaps)
+                        swap_enter_active(s, f);
+                else
+                        swap_enter_dead(s, f);
+                break;
+
+        case SWAP_DEACTIVATING:
+        case SWAP_DEACTIVATING_SIGKILL:
+        case SWAP_DEACTIVATING_SIGTERM:
+
+                swap_enter_dead_or_active(s, f);
+                break;
+
+        case SWAP_CLEANING:
+                /* The outcome of cleaning is tracked separately from the unit's main result. */
+                if (s->clean_result == SWAP_SUCCESS)
+                        s->clean_result = f;
+
+                swap_enter_dead(s, SWAP_SUCCESS);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Notify clients about changed exit status */
+        unit_add_to_dbus_queue(u);
+}
+
+/* Timer callback invoked when the operation pending for the swap unit passed in 'userdata' timed
+ * out. Escalates step by step: a timed-out activation or deactivation is terminated with SIGTERM,
+ * a timed-out SIGTERM stage is followed by SIGKILL (if permitted), and a timed-out SIGKILL stage is
+ * given up on. Always returns 0. */
+static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Swap *s = SWAP(userdata);
+
+        assert(s);
+        assert(s->timer_event_source == source);
+
+        switch (s->state) {
+
+        case SWAP_ACTIVATING:
+        case SWAP_ACTIVATING_DONE:
+                log_unit_warning(UNIT(s), "Activation timed out. Stopping.");
+                swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT);
+                break;
+
+        case SWAP_DEACTIVATING:
+                log_unit_warning(UNIT(s), "Deactivation timed out. Stopping.");
+                swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT);
+                break;
+
+        case SWAP_DEACTIVATING_SIGTERM:
+                if (s->kill_context.send_sigkill) {
+                        log_unit_warning(UNIT(s), "Swap process timed out. Killing.");
+                        swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_FAILURE_TIMEOUT);
+                } else {
+                        log_unit_warning(UNIT(s), "Swap process timed out. Skipping SIGKILL. Ignoring.");
+                        swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT);
+                }
+                break;
+
+        case SWAP_DEACTIVATING_SIGKILL:
+                log_unit_warning(UNIT(s), "Swap process still around after SIGKILL. Ignoring.");
+                swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT);
+                break;
+
+        case SWAP_CLEANING:
+                log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+                /* The timeout is recorded in the clean result only; the unit's main result is left
+                 * alone, hence SWAP_SUCCESS is passed on (spelled out rather than a bare 0). */
+                if (s->clean_result == SWAP_SUCCESS)
+                        s->clean_result = SWAP_FAILURE_TIMEOUT;
+
+                swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+/* Re-reads /proc/swaps from the start and registers every entry found via swap_process_new().
+ * Lines that cannot be parsed are skipped with a warning. Returns 0 on success, or a negative
+ * errno-style value if a device path could not be unescaped, in which case the remaining entries
+ * are not processed. */
+static int swap_load_proc_swaps(Manager *m, bool set_flags) {
+        assert(m);
+
+        rewind(m->proc_swaps);
+
+        /* Consume the first five tokens, presumably the column header line. */
+        (void) fscanf(m->proc_swaps, "%*s %*s %*s %*s %*s\n");
+
+        for (unsigned line = 1;; line++) {
+                _cleanup_free_ char *escaped = NULL, *path = NULL;
+                int priority = 0, n_fields;
+                ssize_t l;
+
+                n_fields = fscanf(m->proc_swaps,
+                                  "%ms "  /* device/file */
+                                  "%*s "  /* type of swap */
+                                  "%*s "  /* swap size */
+                                  "%*s "  /* used */
+                                  "%i\n", /* priority */
+                                  &escaped, &priority);
+                if (n_fields == EOF)
+                        break;
+
+                if (n_fields != 2) {
+                        log_warning("Failed to parse /proc/swaps:%u, skipping.", line);
+                        continue;
+                }
+
+                l = cunescape(escaped, UNESCAPE_RELAX, &path);
+                if (l < 0)
+                        return log_error_errno(l, "Failed to unescape device path: %m");
+
+                device_found_node(m, path, DEVICE_FOUND_SWAP, DEVICE_FOUND_SWAP);
+
+                (void) swap_process_new(m, path, priority, set_flags);
+        }
+
+        return 0;
+}
+
+/* Rescans /proc/swaps and brings the state of all swap units in line with it. Units not seen in the
+ * scan (is_active unset) are treated as gone; units newly seen (just_activated set) are treated as
+ * freshly activated. The per-scan flags are cleared again before returning. Returns 0 if the scan
+ * failed, 1 otherwise. */
+static int swap_process_proc_swaps(Manager *m) {
+        int r;
+
+        assert(m);
+
+        r = swap_load_proc_swaps(m, true);
+        if (r < 0) {
+                /* Reset flags, just in case, for late calls */
+                LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) {
+                        Swap *swap = SWAP(u);
+
+                        assert(swap);
+
+                        swap->is_active = swap->just_activated = false;
+                }
+
+                return 0;
+        }
+
+        /* The scan may have queued new units for loading, process those before looking at states. */
+        manager_dispatch_load_queue(m);
+
+        LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) {
+                Swap *swap = SWAP(u);
+
+                assert(swap);
+
+                if (!swap->is_active) {
+
+                        /* Not seen in this scan. NOTE(review): swap_unset_proc_swaps() presumably drops
+                         * the /proc/swaps derived data of the unit — confirm. */
+                        swap_unset_proc_swaps(swap);
+
+                        switch (swap->state) {
+
+                        case SWAP_ACTIVE:
+                                /* This has just been deactivated */
+                                swap_enter_dead(swap, SWAP_SUCCESS);
+                                break;
+
+                        default:
+                                /* Fire again */
+                                swap_set_state(swap, swap->state);
+                                break;
+                        }
+
+                        if (swap->what)
+                                device_found_node(m, swap->what, DEVICE_NOT_FOUND, DEVICE_FOUND_SWAP);
+
+                } else if (swap->just_activated) {
+
+                        /* New swap entry */
+
+                        switch (swap->state) {
+
+                        case SWAP_DEAD:
+                        case SWAP_FAILED:
+                                /* Activated behind our back: start a fresh cycle and go active directly. */
+                                (void) unit_acquire_invocation_id(u);
+                                swap_cycle_clear(swap);
+                                swap_enter_active(swap, SWAP_SUCCESS);
+                                break;
+
+                        case SWAP_ACTIVATING:
+                                swap_set_state(swap, SWAP_ACTIVATING_DONE);
+                                break;
+
+                        default:
+                                /* Nothing really changed, but let's
+                                 * issue an notification call
+                                 * nonetheless, in case somebody is
+                                 * waiting for this. */
+                                swap_set_state(swap, swap->state);
+                                break;
+                        }
+                }
+
+                /* Reset the flags for later calls */
+                swap->is_active = swap->just_activated = false;
+        }
+
+        return 1;
+}
+
+static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *manager = ASSERT_PTR(userdata);
+
+        /* Event callback of the /proc/swaps watch that swap_enumerate() registers for EPOLLPRI. */
+        assert(revents & EPOLLPRI);
+        return swap_process_proc_swaps(manager);
+}
+
+static Unit *swap_following(Unit *u) {
+        Swap *s = SWAP(u);
+        Swap *first = NULL;
+        /* Returns the unit that 'u' follows among the swap units chained on same_devnode, or NULL. */
+        assert(s);
+
+        /* If the user configured the swap through /etc/fstab or
+         * a device unit, follow that. */
+
+        if (s->from_fragment)
+                return NULL;
+
+        LIST_FOREACH_OTHERS(same_devnode, other, s)
+                if (other->from_fragment)
+                        return UNIT(other);
+
+        /* Otherwise, make everybody follow the unit that's named after
+         * the swap device in the kernel */
+
+        if (streq_ptr(s->what, s->devnode))
+                return NULL;
+
+        LIST_FOREACH(same_devnode, other, s->same_devnode_next)
+                if (streq_ptr(other->what, other->devnode))
+                        return UNIT(other);
+
+        LIST_FOREACH_BACKWARDS(same_devnode, other, s->same_devnode_prev) {
+                if (streq_ptr(other->what, other->devnode))
+                        return UNIT(other);
+
+                first = other;
+        }
+
+        /* Fall back to the first on the list */
+        return UNIT(first);
+}
+
+static int swap_following_set(Unit *u, Set **_set) {
+        _cleanup_set_free_ Set *siblings = NULL;
+        Swap *swap = SWAP(u);
+
+        assert(swap);
+        assert(_set);
+        /* Collect all other swap units chained on the same device node. Returns 0 (and a NULL set)
+         * if there are none, 1 if a set was handed to the caller, negative errno on failure. */
+        if (LIST_JUST_US(same_devnode, swap)) {
+                *_set = NULL;
+                return 0;
+        }
+
+        siblings = set_new(NULL);
+        if (!siblings)
+                return -ENOMEM;
+
+        LIST_FOREACH_OTHERS(same_devnode, other, swap) {
+                int k = set_put(siblings, other);
+                if (k < 0)
+                        return k;
+        }
+
+        *_set = TAKE_PTR(siblings);
+        return 1;
+}
+
+static void swap_shutdown(Manager *m) {
+        assert(m);
+        /* Drop the swap event source, close the /proc/swaps stream and free the by-devnode table. */
+        m->swap_event_source = sd_event_source_disable_unref(m->swap_event_source);
+        m->proc_swaps = safe_fclose(m->proc_swaps);
+        m->swaps_by_devnode = hashmap_free(m->swaps_by_devnode);
+}
+
+static void swap_enumerate(Manager *m) {
+        int r;
+
+        assert(m);
+        /* Open and start watching /proc/swaps if not done yet, then load the current swap list. */
+        if (!m->proc_swaps) {
+                m->proc_swaps = fopen("/proc/swaps", "re");
+                if (!m->proc_swaps) {
+                        if (errno == ENOENT)
+                                log_debug_errno(errno, "Not swap enabled, skipping enumeration.");
+                        else
+                                log_warning_errno(errno, "Failed to open /proc/swaps, ignoring: %m");
+
+                        return;
+                }
+
+                r = sd_event_add_io(m->event, &m->swap_event_source, fileno(m->proc_swaps), EPOLLPRI, swap_dispatch_io, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to watch /proc/swaps: %m");
+                        goto fail;
+                }
+
+                /* Dispatch this before we dispatch SIGCHLD, so that
+                 * we always get the events from /proc/swaps before
+                 * the SIGCHLD of /sbin/swapon. */
+                r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to change /proc/swaps priority: %m");
+                        goto fail;
+                }
+
+                (void) sd_event_source_set_description(m->swap_event_source, "swap-proc");
+        }
+
+        r = swap_load_proc_swaps(m, false);
+        if (r < 0)
+                goto fail;
+
+        return;
+        /* Any failure past the fopen() undoes the whole setup again. */
+fail:
+        swap_shutdown(m);
+}
+
+int swap_process_device_new(Manager *m, sd_device *dev) {
+        _cleanup_free_ char *e = NULL;
+        const char *dn;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(dev);
+        /* Point the swap units named after the device node, or after one of its devlinks, at that node. */
+        if (sd_device_get_devname(dev, &dn) < 0)
+                return 0;
+
+        r = unit_name_from_path(dn, ".swap", &e);
+        if (r < 0) {
+                log_debug_errno(r, "Cannot convert device name '%s' to unit name, ignoring: %m", dn);
+                return 0;
+        }
+
+        u = manager_get_unit(m, e);
+        if (u)
+                r = swap_set_devnode(SWAP(u), dn);
+        /* Keep going on swap_set_devnode() failures; the last error seen is what gets returned. */
+        FOREACH_DEVICE_DEVLINK(dev, devlink) {
+                _cleanup_free_ char *n = NULL;
+                int q;
+
+                q = unit_name_from_path(devlink, ".swap", &n);
+                if (q == -EINVAL) /* If the name is not convertible to unit name, we can't manage it */
+                        continue;
+                if (q < 0)
+                        return q;
+
+                u = manager_get_unit(m, n);
+                if (u) {
+                        q = swap_set_devnode(SWAP(u), dn);
+                        if (q < 0)
+                                r = q;
+                }
+        }
+
+        return r;
+}
+
+int swap_process_device_remove(Manager *m, sd_device *dev) {
+        const char *dn;
+        int r = 0;
+        Swap *s;
+        /* Detach every swap unit registered under the node of 'dev'. Returns 0, or the last error seen. */
+        assert(m);
+        assert(dev);
+        /* No device node, hence nothing can be registered under it. */
+        if (sd_device_get_devname(dev, &dn) < 0)
+                return 0;
+
+        while ((s = hashmap_get(m->swaps_by_devnode, dn))) {
+                int q = swap_set_devnode(s, NULL);
+                if (q < 0)
+                        r = q;
+        }
+
+        return r;
+}
+
+static void swap_reset_failed(Unit *u) {
+        Swap *swap = SWAP(u);
+
+        assert(swap);
+
+        /* Leave SWAP_FAILED (if we are in it) and forget both recorded results. */
+        if (swap->state == SWAP_FAILED)
+                swap_set_state(swap, SWAP_DEAD);
+
+        swap->result = swap->clean_result = SWAP_SUCCESS;
+}
+
+static int swap_get_timeout(Unit *u, usec_t *timeout) {
+        Swap *swap = SWAP(u);
+        usec_t deadline;
+        int r;
+
+        assert(swap);
+        assert(u);
+        /* Returns 1 and sets *timeout if a finite timer is pending, 0 if not, negative errno on error. */
+        if (!swap->timer_event_source)
+                return 0;
+
+        r = sd_event_source_get_time(swap->timer_event_source, &deadline);
+        if (r < 0)
+                return r;
+        if (deadline != USEC_INFINITY) {
+                *timeout = deadline;
+                return 1;
+        }
+        return 0;
+}
+
+static bool swap_supported(void) {
+        static int cached = -1;
+
+        /* Swap units are only supported if /proc/swaps is accessible and detect_container() does
+         * not report a container; otherwise any attempt to start one should fail immediately.
+         * The answer is computed on the first call and reused afterwards. */
+
+        if (cached >= 0)
+                return cached;
+
+        cached = access("/proc/swaps", F_OK) >= 0 &&
+                 detect_container() <= 0;
+        return cached;
+}
+
+static PidRef* swap_control_pid(Unit *u) {
+        return &ASSERT_PTR(SWAP(u))->control_pid; /* address of the unit's own PidRef, not a copy */
+}
+
+static int swap_clean(Unit *u, ExecCleanMask mask) {
+        _cleanup_strv_free_ char **l = NULL;
+        Swap *s = SWAP(u);
+        int r;
+
+        assert(s);
+        assert(mask != 0);
+        /* Spawn a task removing the directories selected by 'mask'; refused unless the unit is dead. */
+        if (s->state != SWAP_DEAD)
+                return -EBUSY;
+
+        r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+        if (r < 0)
+                return r;
+        /* No directory matches the mask, so there is nothing to clean. */
+        if (strv_isempty(l))
+                return -EUNATCH;
+
+        swap_unwatch_control_pid(s);
+        s->clean_result = SWAP_SUCCESS;
+        s->control_command = NULL;
+        s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+
+        r = swap_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to install timer: %m");
+                goto fail;
+        }
+
+        r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+                goto fail;
+        }
+
+        swap_set_state(s, SWAP_CLEANING);
+        return 0;
+        /* On failure record the result and drop the timer again; the state is left untouched. */
+fail:
+        s->clean_result = SWAP_FAILURE_RESOURCES;
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+        return r;
+}
+
+static int swap_can_clean(Unit *u, ExecCleanMask *ret) {
+        Swap *swap = SWAP(u);
+
+        assert(swap);
+        /* The cleanable set is derived from the exec context alone. */
+        return exec_context_get_clean_mask(&swap->exec_context, ret);
+}
+
+static int swap_can_start(Unit *u) {
+        Swap *swap = SWAP(u);
+        int r;
+
+        assert(swap);
+        /* Returns 1 if starting is allowed. If unit_test_start_limit() fails, swap_enter_dead() is
+         * invoked with SWAP_FAILURE_START_LIMIT_HIT and the error is passed on to the caller. */
+        r = unit_test_start_limit(u);
+        if (r >= 0)
+                return 1;
+
+        swap_enter_dead(swap, SWAP_FAILURE_START_LIMIT_HIT);
+        return r;
+}
+
+int swap_get_priority(const Swap *s) {
+        const SwapParameters *p = NULL;
+
+        assert(s);
+        /* The /proc/swaps parameters win over the fragment ones; -1 means no priority is set. */
+        if (s->from_proc_swaps && s->parameters_proc_swaps.priority_set)
+                p = &s->parameters_proc_swaps;
+        else if (s->from_fragment && s->parameters_fragment.priority_set)
+                p = &s->parameters_fragment;
+        return p ? p->priority : -1;
+}
+
+const char* swap_get_options(const Swap *s) {
+        assert(s);
+
+        /* Only the fragment parameters carry options; NULL if the unit has no fragment. */
+        if (!s->from_fragment)
+                return NULL;
+        return s->parameters_fragment.options;
+}
+
+static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = {
+        [SWAP_EXEC_ACTIVATE]   = "ExecActivate",
+        [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate",
+};
+/* Backs swap_exec_command_to_string() and swap_exec_command_from_string() declared in swap.h. */
+DEFINE_STRING_TABLE_LOOKUP(swap_exec_command, SwapExecCommand);
+
+static const char* const swap_result_table[_SWAP_RESULT_MAX] = {
+        [SWAP_SUCCESS]                 = "success",
+        [SWAP_FAILURE_RESOURCES]       = "resources",
+        [SWAP_FAILURE_TIMEOUT]         = "timeout",
+        [SWAP_FAILURE_EXIT_CODE]       = "exit-code",
+        [SWAP_FAILURE_SIGNAL]          = "signal",
+        [SWAP_FAILURE_CORE_DUMP]       = "core-dump",
+        [SWAP_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+};
+/* Backs swap_result_to_string() and swap_result_from_string() declared in swap.h. */
+DEFINE_STRING_TABLE_LOOKUP(swap_result, SwapResult);
+
+const UnitVTable swap_vtable = {
+        .object_size = sizeof(Swap),
+        .exec_context_offset = offsetof(Swap, exec_context),
+        .cgroup_context_offset = offsetof(Swap, cgroup_context),
+        .kill_context_offset = offsetof(Swap, kill_context),
+        .exec_runtime_offset = offsetof(Swap, exec_runtime),
+        /* Section names, as a NUL-separated list. */
+        .sections =
+                "Unit\0"
+                "Swap\0"
+                "Install\0",
+        .private_section = "Swap",
+
+        .can_fail = true,
+
+        .init = swap_init,
+        .load = swap_load,
+        .done = swap_done,
+
+        .coldplug = swap_coldplug,
+
+        .dump = swap_dump,
+
+        .start = swap_start,
+        .stop = swap_stop,
+
+        .clean = swap_clean,
+        .can_clean = swap_can_clean,
+
+        .get_timeout = swap_get_timeout,
+
+        .serialize = swap_serialize,
+        .deserialize_item = swap_deserialize_item,
+
+        .active_state = swap_active_state,
+        .sub_state_to_string = swap_sub_state_to_string,
+
+        .will_restart = unit_will_restart_default,
+
+        .may_gc = swap_may_gc,
+        .is_extrinsic = swap_is_extrinsic,
+
+        .sigchld_event = swap_sigchld_event,
+
+        .reset_failed = swap_reset_failed,
+
+        .control_pid = swap_control_pid,
+
+        .bus_set_property = bus_swap_set_property,
+        .bus_commit_properties = bus_swap_commit_properties,
+
+        .following = swap_following,
+        .following_set = swap_following_set,
+
+        .enumerate = swap_enumerate,
+        .shutdown = swap_shutdown,
+        .supported = swap_supported,
+        /* In starting_stopping, [0] holds the activation and [1] the deactivation message. */
+        .status_message_formats = {
+                .starting_stopping = {
+                        [0] = "Activating swap %s...",
+                        [1] = "Deactivating swap %s...",
+                },
+                .finished_start_job = {
+                        [JOB_DONE]       = "Activated swap %s.",
+                        [JOB_FAILED]     = "Failed to activate swap %s.",
+                        [JOB_TIMEOUT]    = "Timed out activating swap %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Deactivated swap %s.",
+                        [JOB_FAILED]     = "Failed deactivating swap %s.",
+                        [JOB_TIMEOUT]    = "Timed out deactivating swap %s.",
+                },
+        },
+
+        .can_start = swap_can_start,
+
+        .notify_plymouth = true,
+};
diff --git a/src/core/swap.h b/src/core/swap.h
new file mode 100644
index 0000000..ef20f0f
--- /dev/null
+++ b/src/core/swap.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "sd-device.h"
+
+#include "pidref.h"
+#include "unit.h"
+
+typedef struct Swap Swap;
+
+typedef enum SwapExecCommand {
+        SWAP_EXEC_ACTIVATE,                   /* index of Swap.exec_command[] for activation */
+        SWAP_EXEC_DEACTIVATE,                 /* index of Swap.exec_command[] for deactivation */
+        _SWAP_EXEC_COMMAND_MAX,               /* number of commands; sizes Swap.exec_command[] */
+        _SWAP_EXEC_COMMAND_INVALID = -EINVAL, /* negative, so it never collides with a valid index */
+} SwapExecCommand;
+
+typedef enum SwapResult {
+        SWAP_SUCCESS,                   /* zero, hence the value of a zero-initialized result field */
+        SWAP_FAILURE_RESOURCES,
+        SWAP_FAILURE_TIMEOUT,
+        SWAP_FAILURE_EXIT_CODE,
+        SWAP_FAILURE_SIGNAL,
+        SWAP_FAILURE_CORE_DUMP,
+        SWAP_FAILURE_START_LIMIT_HIT,
+        _SWAP_RESULT_MAX,               /* number of valid results */
+        _SWAP_RESULT_INVALID = -EINVAL, /* negative, so it never collides with a valid result */
+} SwapResult;
+
+typedef struct SwapParameters {
+        char *what;
+        char *options;
+        int priority;      /* only meaningful if priority_set is true */
+        bool priority_set;
+} SwapParameters;
+
+struct Swap {
+        Unit meta;
+
+        char *what;
+
+        /* If the device has already shown up, this is the device
+         * node, which might be different from what, due to
+         * symlinks */
+        char *devnode;
+
+        SwapParameters parameters_proc_swaps;
+        SwapParameters parameters_fragment;
+
+        bool from_proc_swaps:1;
+        bool from_fragment:1;
+
+        /* Used while looking for swaps that vanished or got added
+         * from/to /proc/swaps */
+        bool is_active:1;
+        bool just_activated:1;
+
+        SwapResult result;
+        SwapResult clean_result;
+
+        usec_t timeout_usec;
+
+        ExecCommand exec_command[_SWAP_EXEC_COMMAND_MAX];
+        ExecContext exec_context;
+        KillContext kill_context;
+        CGroupContext cgroup_context;
+
+        ExecRuntime *exec_runtime;
+
+        SwapState state, deserialized_state;
+
+        ExecCommand* control_command;
+        SwapExecCommand control_command_id;
+        PidRef control_pid;
+
+        sd_event_source *timer_event_source;
+
+        /* In order to be able to distinguish dependencies on
+         * different device nodes we might end up creating multiple
+         * devices for the same swap. We chain them up here. */
+
+        LIST_FIELDS(struct Swap, same_devnode);
+};
+
+extern const UnitVTable swap_vtable;
+
+int swap_process_device_new(Manager *m, sd_device *dev);
+int swap_process_device_remove(Manager *m, sd_device *dev);
+
+int swap_get_priority(const Swap *s);
+const char* swap_get_options(const Swap *s);
+
+const char* swap_exec_command_to_string(SwapExecCommand i) _const_;
+SwapExecCommand swap_exec_command_from_string(const char *s) _pure_;
+
+const char* swap_result_to_string(SwapResult i) _const_;
+SwapResult swap_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SWAP, Swap);
diff --git a/src/core/system.conf.in b/src/core/system.conf.in
new file mode 100644
index 0000000..05eb681
--- /dev/null
+++ b/src/core/system.conf.in
@@ -0,0 +1,83 @@
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it under the
+#  terms of the GNU Lesser General Public License as published by the Free
+#  Software Foundation; either version 2.1 of the License, or (at your option)
+#  any later version.
+#
+# Entries in this file show the compile-time defaults. Local configuration
+# should be created by either modifying this file (or a copy of it placed in
+# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
+# the /etc/systemd/system.conf.d/ directory. The latter is generally recommended.
+# Defaults can be restored by simply deleting the main configuration file and
+# all drop-ins located in /etc/.
+#
+# Use 'systemd-analyze cat-config systemd/system.conf' to display the full config.
+#
+# See systemd-system.conf(5) for details.
+
+[Manager]
+#LogLevel=info
+#LogTarget=journal-or-kmsg
+#LogColor=yes
+#LogLocation=no
+#LogTime=no
+#DumpCore=yes
+#ShowStatus=yes
+#CrashChangeVT=no
+#CrashShell=no
+#CrashReboot=no
+#CtrlAltDelBurstAction=reboot-force
+#CPUAffinity=
+#NUMAPolicy=default
+#NUMAMask=
+#RuntimeWatchdogSec=off
+#RuntimeWatchdogPreSec=off
+#RuntimeWatchdogPreGovernor=
+#RebootWatchdogSec=10min
+#KExecWatchdogSec=off
+#WatchdogDevice=
+#CapabilityBoundingSet=
+#NoNewPrivileges=no
+#SystemCallArchitectures=
+#TimerSlackNSec=
+#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
+#DefaultTimerAccuracySec=1min
+#DefaultStandardOutput=journal
+#DefaultStandardError=inherit
+#DefaultTimeoutStartSec={{DEFAULT_TIMEOUT_SEC}}s
+#DefaultTimeoutStopSec={{DEFAULT_TIMEOUT_SEC}}s
+#DefaultTimeoutAbortSec=
+#DefaultDeviceTimeoutSec={{DEFAULT_TIMEOUT_SEC}}s
+#DefaultRestartSec=100ms
+#DefaultStartLimitIntervalSec=10s
+#DefaultStartLimitBurst=5
+#DefaultEnvironment=
+#DefaultCPUAccounting=yes
+#DefaultIOAccounting=no
+#DefaultIPAccounting=no
+#DefaultMemoryAccounting={{ 'yes' if MEMORY_ACCOUNTING_DEFAULT else 'no' }}
+#DefaultTasksAccounting=yes
+#DefaultTasksMax=15%
+#DefaultLimitCPU=
+#DefaultLimitFSIZE=
+#DefaultLimitDATA=
+#DefaultLimitSTACK=
+#DefaultLimitCORE=
+#DefaultLimitRSS=
+#DefaultLimitNOFILE=1024:{{HIGH_RLIMIT_NOFILE}}
+#DefaultLimitAS=
+#DefaultLimitNPROC=
+#DefaultLimitMEMLOCK=8M
+#DefaultLimitLOCKS=
+#DefaultLimitSIGPENDING=
+#DefaultLimitMSGQUEUE=
+#DefaultLimitNICE=
+#DefaultLimitRTPRIO=
+#DefaultLimitRTTIME=
+#DefaultMemoryPressureThresholdSec=200ms
+#DefaultMemoryPressureWatch=auto
+#DefaultOOMPolicy=stop
+#DefaultSmackProcessLabel=
+#ReloadLimitIntervalSec=
+#ReloadLimitBurst=
diff --git a/src/core/systemd.pc.in b/src/core/systemd.pc.in
new file mode 100644
index 0000000..f3b85b0
--- /dev/null
+++ b/src/core/systemd.pc.in
@@ -0,0 +1,108 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+# Names with prefixes are preferred, and the run-together names should be
+# considered deprecated (though there is no plan to remove them). New names
+# shall have underscores.
+
+# root_prefix and rootprefix are deprecated since we dropped support for split-usr.
+# However, we used to install units in root_prefix, and a lot of downstream software
+# overrode this variable in their build system to support installing units elsewhere.
+# To stop those builds from silently breaking, we keep root_prefix around but have
+# it as an alias for prefix.
+root_prefix={{PREFIX_NOSLASH}}
+rootprefix=${root_prefix}
+prefix=${rootprefix}
+sysconf_dir={{SYSCONF_DIR}}
+sysconfdir=${sysconf_dir}
+
+systemd_util_dir=${prefix}/lib/systemd
+systemdutildir=${systemd_util_dir}
+
+systemd_system_unit_dir=${prefix}/lib/systemd/system
+systemdsystemunitdir=${systemd_system_unit_dir}
+
+systemd_system_preset_dir=${prefix}/lib/systemd/system-preset
+systemdsystempresetdir=${systemd_system_preset_dir}
+
+systemd_user_unit_dir=${prefix}/lib/systemd/user
+systemduserunitdir=${systemd_user_unit_dir}
+
+systemd_user_preset_dir=${prefix}/lib/systemd/user-preset
+systemduserpresetdir=${systemd_user_preset_dir}
+
+systemd_system_conf_dir=${sysconfdir}/systemd/system
+systemdsystemconfdir=${systemd_system_conf_dir}
+
+systemd_user_conf_dir=${sysconfdir}/systemd/user
+systemduserconfdir=${systemd_user_conf_dir}
+
+systemd_system_unit_path=${systemd_system_conf_dir}:/etc/systemd/system:/run/systemd/system:/usr/local/lib/systemd/system:${systemd_system_unit_dir}:/usr/lib/systemd/system:/lib/systemd/system
+systemdsystemunitpath=${systemd_system_unit_path}
+
+systemd_user_unit_path=${systemd_user_conf_dir}:/etc/systemd/user:/run/systemd/user:/usr/local/lib/systemd/user:/usr/local/share/systemd/user:${systemd_user_unit_dir}:/usr/lib/systemd/user:/usr/share/systemd/user
+systemduserunitpath=${systemd_user_unit_path}
+
+systemd_system_generator_dir=${prefix}/lib/systemd/system-generators
+systemdsystemgeneratordir=${systemd_system_generator_dir}
+
+systemd_user_generator_dir=${prefix}/lib/systemd/user-generators
+systemdusergeneratordir=${systemd_user_generator_dir}
+
+systemd_system_generator_path=/run/systemd/system-generators:/etc/systemd/system-generators:/usr/local/lib/systemd/system-generators:${systemd_system_generator_dir}
+systemdsystemgeneratorpath=${systemd_system_generator_path}
+
+systemd_user_generator_path=/run/systemd/user-generators:/etc/systemd/user-generators:/usr/local/lib/systemd/user-generators:${systemd_user_generator_dir}
+systemdusergeneratorpath=${systemd_user_generator_path}
+
+systemd_sleep_dir=${prefix}/lib/systemd/system-sleep
+systemdsleepdir=${systemd_sleep_dir}
+
+systemd_shutdown_dir=${prefix}/lib/systemd/system-shutdown
+systemdshutdowndir=${systemd_shutdown_dir}
+
+tmpfiles_dir=${prefix}/lib/tmpfiles.d
+tmpfilesdir=${tmpfiles_dir}
+
+user_tmpfiles_dir=${prefix}/share/user-tmpfiles.d
+
+sysusers_dir=${prefix}/lib/sysusers.d
+sysusersdir=${sysusers_dir}
+
+sysctl_dir=${prefix}/lib/sysctl.d
+sysctldir=${sysctl_dir}
+
+binfmt_dir=${prefix}/lib/binfmt.d
+binfmtdir=${binfmt_dir}
+
+modules_load_dir=${prefix}/lib/modules-load.d
+modulesloaddir=${modules_load_dir}
+
+catalog_dir=${prefix}/lib/systemd/catalog
+catalogdir=${catalog_dir}
+
+system_uid_max={{SYSTEM_UID_MAX}}
+systemuidmax=${system_uid_max}
+system_gid_max={{SYSTEM_GID_MAX}}
+systemgidmax=${system_gid_max}
+
+dynamic_uid_min={{DYNAMIC_UID_MIN}}
+dynamicuidmin=${dynamic_uid_min}
+dynamic_uid_max={{DYNAMIC_UID_MAX}}
+dynamicuidmax=${dynamic_uid_max}
+
+container_uid_base_min={{CONTAINER_UID_BASE_MIN}}
+containeruidbasemin=${container_uid_base_min}
+container_uid_base_max={{CONTAINER_UID_BASE_MAX}}
+containeruidbasemax=${container_uid_base_max}
+
+Name: systemd
+Description: systemd System and Service Manager
+URL: {{PROJECT_URL}}
+Version: {{PROJECT_VERSION}}
diff --git a/src/core/target.c b/src/core/target.c
new file mode 100644
index 0000000..8f2a331
--- /dev/null
+++ b/src/core/target.c
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-target.h"
+#include "dbus-unit.h"
+#include "log.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-util.h"
+#include "target.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = {
+        [TARGET_DEAD] = UNIT_INACTIVE, /* the state target_stop() switches to */
+        [TARGET_ACTIVE] = UNIT_ACTIVE  /* the state target_start() switches to */
+};
+
+static void target_set_state(Target *t, TargetState state) {
+        TargetState old_state;
+        assert(t);
+        /* The pending change signal is sent before the state is modified, the notification after. */
+        if (t->state != state)
+                bus_unit_send_pending_change_signal(UNIT(t), false);
+
+        old_state = t->state;
+        t->state = state;
+
+        if (state != old_state)
+                log_debug("%s changed %s -> %s",
+                          UNIT(t)->id,
+                          target_state_to_string(old_state),
+                          target_state_to_string(state));
+        /* unit_notify() is called even if the state did not change. */
+        unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static int target_add_default_dependencies(Target *t) {
+        _cleanup_free_ Unit **others = NULL;
+        int r, n_others;
+
+        assert(t);
+        /* Nothing is added for units that opted out of default dependencies. */
+        if (!UNIT(t)->default_dependencies)
+                return 0;
+
+        /* Imply ordering for requirement dependencies on target units. Note that when the user created a
+         * contradicting ordering manually we won't add anything in here to make sure we don't create a
+         * loop.
+         *
+         * Note that quite likely iterating through these dependencies will add new dependencies, which
+         * conflicts with the hashmap-based iteration logic. Hence, instead of iterating through the
+         * dependencies and acting on them as we go, first take an "atomic snapshot" of sorts and iterate
+         * through that. */
+
+        n_others = unit_get_dependency_array(UNIT(t), UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, &others);
+        if (n_others < 0)
+                return n_others;
+
+        for (int i = 0; i < n_others; i++) {
+                r = unit_add_default_target_dependency(others[i], UNIT(t));
+                if (r < 0)
+                        return r;
+        }
+        /* The shutdown target must not get a Before=/Conflicts= dependency on itself. */
+        if (unit_has_name(UNIT(t), SPECIAL_SHUTDOWN_TARGET))
+                return 0;
+
+        /* Make sure targets are unloaded on shutdown */
+        return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int target_load(Unit *u) {
+        Target *target = TARGET(u);
+        int r;
+
+        assert(target);
+
+        /* Load fragment and drop-ins; default dependencies are only added to fully loaded units. */
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+        if (u->load_state == UNIT_LOADED)
+                /* This is a new unit? Then let's add in some extras */
+                return target_add_default_dependencies(target);
+
+        return 0;
+}
+
+static int target_coldplug(Unit *u) {
+        Target *target = TARGET(u);
+
+        assert(target);
+        assert(target->state == TARGET_DEAD);
+
+        /* Switch to the state stored by target_deserialize_item(), if it differs from the current one. */
+        if (target->state != target->deserialized_state)
+                target_set_state(target, target->deserialized_state);
+        return 0;
+}
+
+static void target_dump(Unit *u, FILE *f, const char *prefix) {
+        Target *target = TARGET(u);
+        const char *state_name;
+
+        assert(target);
+        assert(f);
+        /* A single line with the current state, preceded by the caller-supplied prefix. */
+        state_name = target_state_to_string(target->state);
+        fprintf(f, "%sTarget State: %s\n", prefix, state_name);
+}
+
+static int target_start(Unit *u) {
+        Target *target = TARGET(u);
+        int r;
+
+        assert(target);
+        assert(target->state == TARGET_DEAD);
+        /* Becomes active right after an invocation ID was acquired; returns 1 then, else the error. */
+        r = unit_acquire_invocation_id(u);
+        if (r >= 0) {
+                target_set_state(target, TARGET_ACTIVE);
+                r = 1;
+        }
+        return r;
+}
+
+static int target_stop(Unit *u) {
+        Target *target = TARGET(u);
+
+        assert(target);
+        assert(target->state == TARGET_ACTIVE);
+        /* The state is switched to dead right away; always returns 1. */
+        target_set_state(target, TARGET_DEAD);
+        return 1;
+}
+
+static int target_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Target *target = TARGET(u);
+
+        assert(target);
+        assert(f);
+        assert(fds);
+        /* Only the state is written out; the result of serialize_item() is deliberately ignored. */
+        (void) serialize_item(f, "state", target_state_to_string(target->state));
+        return 0;
+}
+
+static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Target *target = TARGET(u);
+        TargetState parsed;
+
+        assert(target);
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+
+        /* Counterpart of target_serialize(): unknown keys and unparsable values are only logged,
+         * and 0 is returned in every case. */
+        if (!streq(key, "state")) {
+                log_debug("Unknown serialization key '%s'", key);
+                return 0;
+        }
+        parsed = target_state_from_string(value);
+        if (parsed >= 0)
+                target->deserialized_state = parsed;
+        else
+                log_debug("Failed to parse state value %s", value);
+        return 0;
+}
+
+static UnitActiveState target_active_state(Unit *u) {
+        assert(u);
+        /* Map the target's own state to the generic unit state via state_translation_table. */
+        return state_translation_table[TARGET(u)->state];
+}
+
+static const char *target_sub_state_to_string(Unit *u) {
+        assert(u);
+        /* The sub-state string is the string form of the TargetState itself. */
+        return target_state_to_string(TARGET(u)->state);
+}
+
+const UnitVTable target_vtable = {
+        .object_size = sizeof(Target),
+        /* Section names, as a NUL-separated list. */
+        .sections =
+                "Unit\0"
+                "Target\0"
+                "Install\0",
+
+        .can_fail = true,
+
+        .load = target_load,
+        .coldplug = target_coldplug,
+
+        .dump = target_dump,
+
+        .start = target_start,
+        .stop = target_stop,
+
+        .serialize = target_serialize,
+        .deserialize_item = target_deserialize_item,
+
+        .active_state = target_active_state,
+        .sub_state_to_string = target_sub_state_to_string,
+        /* Only JOB_DONE messages are defined here, for both the start and the stop job. */
+        .status_message_formats = {
+                .finished_start_job = {
+                        [JOB_DONE]       = "Reached target %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Stopped target %s.",
+                },
+        },
+};
diff --git a/src/core/target.h b/src/core/target.h
new file mode 100644
index 0000000..bb909d6
--- /dev/null
+++ b/src/core/target.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Target Target;
+
+struct Target {
+        Unit meta;
+        /* Current state, and the state read back by target_deserialize_item() for target_coldplug(). */
+        TargetState state, deserialized_state;
+};
+
+extern const UnitVTable target_vtable;
+
+DEFINE_CAST(TARGET, Target);
diff --git a/src/core/timer.c b/src/core/timer.c
new file mode 100644
index 0000000..3c41a25
--- /dev/null
+++ b/src/core/timer.c
@@ -0,0 +1,1106 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-timer.h"
+#include "dbus-unit.h"
+#include "fs-util.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "timer.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+
+static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = {
+        [TIMER_DEAD] = UNIT_INACTIVE,
+        [TIMER_WAITING] = UNIT_ACTIVE, /* waiting, running and elapsed all map to active */
+        [TIMER_RUNNING] = UNIT_ACTIVE,
+        [TIMER_ELAPSED] = UNIT_ACTIVE,
+        [TIMER_FAILED] = UNIT_FAILED
+};
+
+static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata);
+
+static void timer_init(Unit *u) {
+        Timer *t = TIMER(u);
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+        /* No elapse time is known yet; the accuracy starts out at the manager-wide default. */
+        t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
+        t->next_elapse_realtime = USEC_INFINITY;
+        t->accuracy_usec = u->manager->defaults.timer_accuracy_usec;
+        t->remain_after_elapse = true;
+}
+
+void timer_free_values(Timer *t) {
+        assert(t);
+
+        /* Drain t->values, releasing every entry together with its calendar spec. The list is
+         * empty afterwards. */
+        for (TimerValue *v; (v = LIST_POP(value, t->values)); ) {
+                calendar_spec_free(v->calendar_spec);
+                free(v);
+        }
+}
+
+static void timer_done(Unit *u) {
+        Timer *timer = TIMER(u);
+
+        assert(timer);
+
+        /* Release what the timer holds: its values, both event sources and the stamp path. */
+        timer_free_values(timer);
+        timer->monotonic_event_source = sd_event_source_disable_unref(timer->monotonic_event_source);
+        timer->realtime_event_source = sd_event_source_disable_unref(timer->realtime_event_source);
+
+        timer->stamp_path = mfree(timer->stamp_path);
+}
+
+static int timer_verify(Timer *t) {
+        assert(t);
+        assert(UNIT(t)->load_state == UNIT_LOADED);
+        /* At least one timer value, OnClockChange= or OnTimeZoneChange= is needed, else we refuse with ENOEXEC. */
+        if (t->values || t->on_clock_change || t->on_timezone_change)
+                return 0;
+
+        return log_unit_error_errno(UNIT(t), SYNTHETIC_ERRNO(ENOEXEC), "Timer unit lacks value setting. Refusing.");
+}
+
+static int timer_add_default_dependencies(Timer *t) {
+        bool wallclock = false;
+        int r;
+
+        assert(t);
+
+        if (!UNIT(t)->default_dependencies)
+                return 0;
+
+        r = unit_add_dependency_by_name(UNIT(t), UNIT_BEFORE, SPECIAL_TIMERS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
+                r = unit_add_two_dependencies_by_name(UNIT(t), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+                if (r < 0)
+                        return r;
+                /* Timers with at least one calendar value are additionally ordered after the time targets. */
+                LIST_FOREACH(value, v, t->values)
+                        if (v->base == TIMER_CALENDAR) {
+                                wallclock = true;
+                                break;
+                        }
+                if (wallclock)
+                        FOREACH_STRING(target, SPECIAL_TIME_SYNC_TARGET, SPECIAL_TIME_SET_TARGET) {
+                                r = unit_add_dependency_by_name(UNIT(t), UNIT_AFTER, target, true, UNIT_DEPENDENCY_DEFAULT);
+                                if (r < 0)
+                                        return r;
+                        }
+        }
+
+        return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int timer_add_trigger_dependencies(Timer *t) {
+        Unit *service;
+        int r;
+
+        assert(t);
+
+        /* If no unit to trigger is known yet, load the related ".service" unit and make it our trigger. */
+        if (!UNIT_TRIGGER(UNIT(t))) {
+                r = unit_load_related_unit(UNIT(t), ".service", &service);
+                if (r < 0)
+                        return r;
+                return unit_add_two_dependencies(UNIT(t), UNIT_BEFORE, UNIT_TRIGGERS, service, true, UNIT_DEPENDENCY_IMPLICIT);
+        }
+        return 0;
+}
+
+static int timer_setup_persistent(Timer *t) {
+        _cleanup_free_ char *stamp_path = NULL;
+        int r;
+
+        /* For persistent timers, computes the path of the stamp file and stores it in t->stamp_path.
+         * Returns 0 right away if the timer is not persistent, a negative error on failure, and otherwise
+         * whatever free_and_replace() yields. */
+        assert(t);
+
+        if (!t->persistent)
+                return 0;
+
+        if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
+                r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+
+                stamp_path = strjoin("/var/lib/systemd/timers/stamp-", UNIT(t)->id);
+        } else {
+                /* The XDG basedir spec says an empty $XDG_DATA_HOME must be treated like an unset one. */
+                const char *e = getenv("XDG_DATA_HOME");
+                if (!isempty(e))
+                        stamp_path = strjoin(e, "/systemd/timers/stamp-", UNIT(t)->id);
+                else {
+                        _cleanup_free_ char *h = NULL;
+
+                        r = get_home_dir(&h);
+                        if (r < 0)
+                                return log_unit_error_errno(UNIT(t), r, "Failed to determine home directory: %m");
+
+                        stamp_path = strjoin(h, "/.local/share/systemd/timers/stamp-", UNIT(t)->id);
+                }
+        }
+
+        if (!stamp_path)
+                return log_oom();
+
+        return free_and_replace(t->stamp_path, stamp_path);
+}
+
+static uint64_t timer_get_fixed_delay_hash(Timer *t) {
+        /* Hashes, with the fixed key below, the machine ID, whether this is the system manager, our UID
+         * and the unit name, in that order. A machine ID that cannot be read is replaced by the null ID. */
+        static const uint8_t hash_key[] = {
+                0x51, 0x0a, 0xdb, 0x76, 0x29, 0x51, 0x42, 0xc2,
+                0x80, 0x35, 0xea, 0xe6, 0x8e, 0x3a, 0x37, 0xbd
+        };
+        struct siphash hasher;
+        sd_id128_t mid;
+        uid_t caller;
+        int r;
+
+        assert(t);
+
+        r = sd_id128_get_machine(&mid);
+        if (r < 0) {
+                log_unit_debug_errno(UNIT(t), r,
+                                     "Failed to get machine ID for the fixed delay calculation, proceeding with 0: %m");
+                mid = SD_ID128_NULL;
+        }
+        caller = getuid();
+
+        siphash24_init(&hasher, hash_key);
+        siphash24_compress(&mid, sizeof mid, &hasher);
+        siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &hasher);
+        siphash24_compress(&caller, sizeof caller, &hasher);
+        siphash24_compress_string(UNIT(t)->id, &hasher);
+        return siphash24_finalize(&hasher);
+}
+
+static int timer_load(Unit *u) {
+        Timer *timer = TIMER(u);
+        int r;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        /* Loads the unit's configuration and, if that produced a loaded unit, completes it: trigger
+         * dependency, stamp file path, default dependencies and finally the sanity check. The first
+         * failing step determines the (negative) return value. */
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+
+        /* Any other load state means there is nothing further to set up. */
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        /* Each step only runs if the previous one succeeded. */
+        r = timer_add_trigger_dependencies(timer);
+        if (r >= 0)
+                r = timer_setup_persistent(timer);
+        if (r >= 0)
+                r = timer_add_default_dependencies(timer);
+        if (r >= 0)
+                r = timer_verify(timer);
+
+        return r;
+}
+
+static void timer_dump(Unit *u, FILE *f, const char *prefix) {
+        Timer *timer = TIMER(u);
+        Unit *target = UNIT_TRIGGER(u);
+
+        /* Prints the timer's state and settings to f, every line starting with prefix, followed by one
+         * line per configured timer value. */
+        fprintf(f,
+                "%sTimer State: %s\n"
+                "%sResult: %s\n"
+                "%sUnit: %s\n",
+                prefix, timer_state_to_string(timer->state),
+                prefix, timer_result_to_string(timer->result),
+                prefix, target ? target->id : "n/a");
+
+        fprintf(f,
+                "%sPersistent: %s\n"
+                "%sWakeSystem: %s\n"
+                "%sAccuracy: %s\n"
+                "%sRemainAfterElapse: %s\n"
+                "%sFixedRandomDelay: %s\n"
+                "%sOnClockChange: %s\n"
+                "%sOnTimeZoneChange: %s\n",
+                prefix, yes_no(timer->persistent),
+                prefix, yes_no(timer->wake_system),
+                prefix, FORMAT_TIMESPAN(timer->accuracy_usec, 1),
+                prefix, yes_no(timer->remain_after_elapse),
+                prefix, yes_no(timer->fixed_random_delay),
+                prefix, yes_no(timer->on_clock_change),
+                prefix, yes_no(timer->on_timezone_change));
+
+        LIST_FOREACH(value, v, timer->values) {
+                _cleanup_free_ char *spec = NULL;
+
+                if (v->base != TIMER_CALENDAR) {
+                        /* Every other base is printed as a plain time span. */
+                        fprintf(f, "%s%s: %s\n",
+                                prefix, timer_base_to_string(v->base), FORMAT_TIMESPAN(v->value, 0));
+                        continue;
+                }
+
+                /* The formatting result is ignored; strna() covers a spec string that was not filled in. */
+                (void) calendar_spec_to_string(v->calendar_spec, &spec);
+                fprintf(f, "%s%s: %s\n",
+                        prefix, timer_base_to_string(v->base), strna(spec));
+        }
+}
+
+static void timer_set_state(Timer *t, TimerState state) {
+        TimerState previous;
+
+        assert(t);
+
+        /* Central state switch: announces a pending change on the bus, drops all scheduling when the new
+         * state is anything but TIMER_WAITING, and reports the transition to the generic unit code. */
+        if (t->state != state)
+                bus_unit_send_pending_change_signal(UNIT(t), false);
+        previous = t->state;
+        t->state = state;
+
+        if (state != TIMER_WAITING) {
+                t->next_elapse_realtime = t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
+                t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source);
+                t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source);
+        }
+
+        if (previous != state)
+                log_unit_debug(UNIT(t), "Changed %s -> %s", timer_state_to_string(previous), timer_state_to_string(state));
+        unit_notify(UNIT(t), state_translation_table[previous], state_translation_table[state], /* reload_success = */ true);
+}
+
+static void timer_enter_waiting(Timer *t, bool time_change);
+
+static int timer_coldplug(Unit *u) {
+        Timer *timer = TIMER(u);
+
+        assert(timer);
+        assert(timer->state == TIMER_DEAD);
+
+        /* Moves the timer from its initial TIMER_DEAD state to the deserialized one, if the two differ. */
+        TimerState wanted = timer->deserialized_state;
+        if (wanted != timer->state) {
+                if (wanted == TIMER_WAITING)
+                        timer_enter_waiting(timer, false); /* goes through the full scheduling path */
+                else
+                        timer_set_state(timer, wanted);
+        }
+        return 0;
+}
+
+static void timer_enter_dead(Timer *t, TimerResult f) {
+        assert(t);
+
+        if (t->result == TIMER_SUCCESS) /* a result other than success that is already stored is kept */
+                t->result = f;
+        bool ok = t->result == TIMER_SUCCESS;
+        unit_log_result(UNIT(t), ok, timer_result_to_string(t->result));
+        timer_set_state(t, ok ? TIMER_DEAD : TIMER_FAILED);
+}
+
+static void timer_enter_elapsed(Timer *t, bool leave_around) {
+        assert(t);
+
+        /* With RemainAfterElapse=yes an elapsed timer is kept around, so that starting it again later
+         * does not necessarily retrigger it right away. Timers with TIMER_UNIT_ACTIVE or
+         * TIMER_UNIT_INACTIVE triggers (leave_around) are always kept, because they might be restarted
+         * automatically at any later time. Everything else goes to the dead state with a success
+         * result. */
+
+        if (!t->remain_after_elapse && !leave_around) {
+                timer_enter_dead(t, TIMER_SUCCESS);
+                return;
+        }
+
+        timer_set_state(t, TIMER_ELAPSED);
+}
+
+static void add_random(Timer *t, usec_t *v) {
+        usec_t add;
+
+        /* Adds a delay in the range [0, t->random_usec) to *v in place. The delay is either freshly
+         * random or, if t->fixed_random_delay is set, the stable per-timer hash. Nothing happens if
+         * t->random_usec is 0 or *v is USEC_INFINITY. */
+        assert(t);
+        assert(v);
+
+        if (t->random_usec == 0 || *v == USEC_INFINITY)
+                return;
+
+        add = (t->fixed_random_delay ? timer_get_fixed_delay_hash(t) : random_u64()) % t->random_usec;
+
+        /* Saturate at the highest value that is not USEC_INFINITY. Comparing against the remaining headroom
+         * also catches a sum that lands exactly on USEC_INFINITY, which a wrap-around check would miss. */
+        *v = add > (USEC_INFINITY - 1) - *v ? USEC_INFINITY - 1 : *v + add;
+
+        log_unit_debug(UNIT(t), "Adding %s random time.", FORMAT_TIMESPAN(add, 0));
+}
+
+static void timer_enter_waiting(Timer *t, bool time_change) {
+        bool found_monotonic = false, found_realtime = false;
+        bool leave_around = false;
+        triple_timestamp ts;
+        Unit *trigger;
+        int r;
+        /* Computes the earliest next elapse over all enabled timer values (calendar values on the realtime clock, all others on the monotonic/boottime clock), arms or disables the two event sources accordingly and enters TIMER_WAITING. Enters the elapsed state if nothing is left to wait for, and the dead state with TIMER_FAILURE_RESOURCES on failure. */
+        assert(t);
+        /* time_change: when true, one-shot values whose elapse time already passed are not disabled below. */
+        trigger = UNIT_TRIGGER(UNIT(t));
+        if (!trigger) {
+                log_unit_error(UNIT(t), "Unit to trigger vanished.");
+                goto fail;
+        }
+
+        triple_timestamp_now(&ts);
+        t->next_elapse_monotonic_or_boottime = t->next_elapse_realtime = 0; /* placeholders, overwritten by the first matching value found below */
+
+        LIST_FOREACH(value, v, t->values) {
+                if (v->disabled)
+                        continue;
+
+                if (v->base == TIMER_CALENDAR) {
+                        usec_t b, rebased;
+
+                        /* If we know the last time this was
+                         * triggered, schedule the job based relative
+                         * to that. If we don't, just start from
+                         * the activation time. */
+
+                        if (dual_timestamp_is_set(&t->last_trigger))
+                                b = t->last_trigger.realtime;
+                        else if (dual_timestamp_is_set(&UNIT(t)->inactive_exit_timestamp))
+                                b = UNIT(t)->inactive_exit_timestamp.realtime;
+                        else
+                                b = ts.realtime;
+
+                        r = calendar_spec_next_usec(v->calendar_spec, b, &v->next_elapse);
+                        if (r < 0)
+                                continue; /* no next elapse could be determined for this value, leave it out */
+
+                        /* To make the delay due to RandomizedDelaySec= work even at boot, if the scheduled
+                         * time has already passed, set the time when systemd first started as the scheduled
+                         * time. Note that we base this on the monotonic timestamp of the boot, not the
+                         * realtime one, since the wallclock might have been off during boot. */
+                        rebased = map_clock_usec(UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic,
+                                                 CLOCK_MONOTONIC, CLOCK_REALTIME);
+                        if (v->next_elapse < rebased)
+                                v->next_elapse = rebased;
+
+                        if (!found_realtime)
+                                t->next_elapse_realtime = v->next_elapse;
+                        else
+                                t->next_elapse_realtime = MIN(t->next_elapse_realtime, v->next_elapse);
+
+                        found_realtime = true;
+
+                } else {
+                        usec_t base;
+
+                        switch (v->base) {
+
+                        case TIMER_ACTIVE:
+                                if (state_translation_table[t->state] == UNIT_ACTIVE)
+                                        base = UNIT(t)->inactive_exit_timestamp.monotonic;
+                                else
+                                        base = ts.monotonic;
+                                break;
+
+                        case TIMER_BOOT:
+                                if (detect_container() <= 0) {
+                                        /* CLOCK_MONOTONIC equals the uptime on Linux */
+                                        base = 0;
+                                        break;
+                                }
+                                /* In a container we don't want to include the time the host
+                                 * was already up when the container started, so count from
+                                 * our own startup. */
+                                _fallthrough_;
+                        case TIMER_STARTUP:
+                                base = UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+                                break;
+
+                        case TIMER_UNIT_ACTIVE:
+                                leave_around = true;
+                                base = MAX(trigger->inactive_exit_timestamp.monotonic, t->last_trigger.monotonic);
+                                if (base <= 0)
+                                        continue; /* both timestamps are still 0, so there is no base to count from yet */
+                                break;
+
+                        case TIMER_UNIT_INACTIVE:
+                                leave_around = true;
+                                base = MAX(trigger->inactive_enter_timestamp.monotonic, t->last_trigger.monotonic);
+                                if (base <= 0)
+                                        continue; /* both timestamps are still 0, so there is no base to count from yet */
+                                break;
+
+                        default:
+                                assert_not_reached();
+                        }
+                        /* Shift the base from CLOCK_MONOTONIC onto the clock this timer uses, then add the configured offset. */
+                        v->next_elapse = usec_add(usec_shift_clock(base, CLOCK_MONOTONIC, TIMER_MONOTONIC_CLOCK(t)), v->value);
+
+                        if (dual_timestamp_is_set(&t->last_trigger) &&
+                            !time_change &&
+                            v->next_elapse < triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)) &&
+                            IN_SET(v->base, TIMER_ACTIVE, TIMER_BOOT, TIMER_STARTUP)) {
+                                /* This is a one time trigger, disable it now */
+                                v->disabled = true;
+                                continue;
+                        }
+
+                        if (!found_monotonic)
+                                t->next_elapse_monotonic_or_boottime = v->next_elapse;
+                        else
+                                t->next_elapse_monotonic_or_boottime = MIN(t->next_elapse_monotonic_or_boottime, v->next_elapse);
+
+                        found_monotonic = true;
+                }
+        }
+        /* No value produced an elapse time and no clock/timezone change trigger is configured: nothing to wait for. */
+        if (!found_monotonic && !found_realtime && !t->on_timezone_change && !t->on_clock_change) {
+                log_unit_debug(UNIT(t), "Timer is elapsed.");
+                timer_enter_elapsed(t, leave_around);
+                return;
+        }
+
+        if (found_monotonic) {
+                usec_t left;
+
+                add_random(t, &t->next_elapse_monotonic_or_boottime);
+
+                left = usec_sub_unsigned(t->next_elapse_monotonic_or_boottime, triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)));
+                log_unit_debug(UNIT(t), "Monotonic timer elapses in %s.", FORMAT_TIMESPAN(left, 0));
+
+                if (t->monotonic_event_source) {
+                        r = sd_event_source_set_time(t->monotonic_event_source, t->next_elapse_monotonic_or_boottime);
+                        if (r < 0) {
+                                log_unit_warning_errno(UNIT(t), r, "Failed to reschedule monotonic event source: %m");
+                                goto fail;
+                        }
+
+                        r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_ONESHOT);
+                        if (r < 0) {
+                                log_unit_warning_errno(UNIT(t), r, "Failed to enable monotonic event source: %m");
+                                goto fail;
+                        }
+                } else {
+
+                        r = sd_event_add_time(
+                                        UNIT(t)->manager->event,
+                                        &t->monotonic_event_source,
+                                        t->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC,
+                                        t->next_elapse_monotonic_or_boottime, t->accuracy_usec,
+                                        timer_dispatch, t);
+                        if (r < 0) {
+                                log_unit_warning_errno(UNIT(t), r, "Failed to add monotonic event source: %m");
+                                goto fail;
+                        }
+
+                        (void) sd_event_source_set_description(t->monotonic_event_source, "timer-monotonic");
+                }
+
+        } else if (t->monotonic_event_source) {
+                /* No monotonic value is pending, but an event source from an earlier run exists: switch it off. */
+                r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_OFF);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(t), r, "Failed to disable monotonic event source: %m");
+                        goto fail;
+                }
+        }
+
+        if (found_realtime) {
+                add_random(t, &t->next_elapse_realtime);
+
+                log_unit_debug(UNIT(t), "Realtime timer elapses at %s.", FORMAT_TIMESTAMP(t->next_elapse_realtime));
+
+                if (t->realtime_event_source) {
+                        r = sd_event_source_set_time(t->realtime_event_source, t->next_elapse_realtime);
+                        if (r < 0) {
+                                log_unit_warning_errno(UNIT(t), r, "Failed to reschedule realtime event source: %m");
+                                goto fail;
+                        }
+
+                        r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_ONESHOT);
+                        if (r < 0) {
+                                log_unit_warning_errno(UNIT(t), r, "Failed to enable realtime event source: %m");
+                                goto fail;
+                        }
+                } else {
+                        r = sd_event_add_time(
+                                        UNIT(t)->manager->event,
+                                        &t->realtime_event_source,
+                                        t->wake_system ? CLOCK_REALTIME_ALARM : CLOCK_REALTIME,
+                                        t->next_elapse_realtime, t->accuracy_usec,
+                                        timer_dispatch, t);
+                        if (r < 0) {
+                                log_unit_warning_errno(UNIT(t), r, "Failed to add realtime event source: %m");
+                                goto fail;
+                        }
+
+                        (void) sd_event_source_set_description(t->realtime_event_source, "timer-realtime");
+                }
+
+        } else if (t->realtime_event_source) {
+                /* No calendar value is pending, but an event source from an earlier run exists: switch it off. */
+                r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_OFF);
+                if (r < 0) {
+                        log_unit_warning_errno(UNIT(t), r, "Failed to disable realtime event source: %m");
+                        goto fail;
+                }
+        }
+
+        timer_set_state(t, TIMER_WAITING);
+        return;
+
+fail:
+        timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+}
+
+static void timer_enter_running(Timer *t) {
+        _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        Unit *trigger;
+        Job *job;
+        int r;
+
+        /* Queues a start job for the unit to trigger, records the trigger time (also in the job's activation
+         * details and in the stamp file, if one is set) and enters TIMER_RUNNING, or the dead state on failure. */
+        assert(t);
+        /* Don't start job if we are supposed to go down */
+        if (unit_stop_pending(UNIT(t)))
+                return;
+
+        trigger = UNIT_TRIGGER(UNIT(t));
+        if (!trigger) {
+                log_unit_error(UNIT(t), "Unit to trigger vanished.");
+                goto fail;
+        }
+
+        details = activation_details_new(UNIT(t));
+        if (!details) {
+                log_oom();
+                goto fail;
+        }
+
+        r = manager_add_job(UNIT(t)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job);
+        if (r < 0) {
+                log_unit_warning(UNIT(t), "Failed to queue unit startup job: %s", bus_error_message(&error, r));
+                goto fail;
+        }
+
+        dual_timestamp_now(&t->last_trigger);
+        ACTIVATION_DETAILS_TIMER(details)->last_trigger = t->last_trigger;
+        job_set_activation_details(job, details);
+        /* Updating the stamp file stays best effort, but a failure is no longer dropped silently. */
+        r = t->stamp_path ? touch_file(t->stamp_path, true, t->last_trigger.realtime, UID_INVALID, GID_INVALID, MODE_INVALID) : 0;
+        if (r < 0)
+                log_unit_warning_errno(UNIT(t), r, "Failed to update stamp file %s, ignoring: %m", t->stamp_path);
+
+        timer_set_state(t, TIMER_RUNNING);
+        return;
+fail:
+        timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+}
+
+static int timer_start(Unit *u) {
+        Timer *timer = TIMER(u);
+        struct stat st;
+        int r;
+
+        assert(timer);
+        assert(IN_SET(timer->state, TIMER_DEAD, TIMER_FAILED));
+
+        /* Starts the timer: checks the unit to trigger, acquires an invocation ID, forgets the previous
+         * trigger time, picks up the stamp file (if a stamp path is set) and arms the timer. Returns 1
+         * after handing over to timer_enter_waiting(), or the negative error of one of the checks. */
+        r = unit_test_trigger_loaded(u);
+        if (r < 0)
+                return r;
+
+        r = unit_acquire_invocation_id(u);
+        if (r < 0)
+                return r;
+
+        /* The trigger time of an earlier run is reset before anything else is evaluated. */
+        timer->last_trigger = DUAL_TIMESTAMP_NULL;
+
+        /* Reenable all timers that depend on unit activation time */
+        LIST_FOREACH(value, v, timer->values)
+                if (v->base == TIMER_ACTIVE)
+                        v->disabled = false;
+
+        if (timer->stamp_path && stat(timer->stamp_path, &st) >= 0) {
+                /* Load the file timestamp, but only if it is actually in the past. If it is in the future,
+                 * something is wrong with the system clock. */
+                usec_t mtime = timespec_load(&st.st_mtim);
+
+                if (mtime >= now(CLOCK_REALTIME))
+                        log_unit_warning(u, "Not using persistent file timestamp %s as it is in the future.",
+                                         FORMAT_TIMESTAMP(mtime));
+                else
+                        timer->last_trigger.realtime = mtime;
+
+        } else if (timer->stamp_path && errno == ENOENT)
+                /* The timer has never run before, make sure a stamp file exists. */
+                (void) touch_file(timer->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
+
+        /* Every start begins with a success result. */
+        timer->result = TIMER_SUCCESS;
+        timer_enter_waiting(timer, false);
+        return 1;
+}
+
+static int timer_stop(Unit *u) {
+        Timer *timer = TIMER(u);
+
+        assert(timer);
+        assert(IN_SET(timer->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED));
+
+        timer_enter_dead(timer, TIMER_SUCCESS); /* stopping passes a success result */
+        return 1;
+}
+
+static int timer_serialize(Unit *u, FILE *f, FDSet *fds) {
+        Timer *timer;
+
+        assert(u);
+        assert(f);
+        assert(fds);
+
+        /* Writes state, result and (if set) the last trigger timestamps to f. The results of the
+         * individual serialize_*() calls are explicitly discarded, hence this always returns 0. */
+        timer = TIMER(u);
+        (void) serialize_item(f, "state", timer_state_to_string(timer->state));
+        (void) serialize_item(f, "result", timer_result_to_string(timer->result));
+        if (dual_timestamp_is_set(&timer->last_trigger))
+                (void) serialize_usec(f, "last-trigger-realtime", timer->last_trigger.realtime);
+        if (timer->last_trigger.monotonic > 0)
+                (void) serialize_usec(f, "last-trigger-monotonic", timer->last_trigger.monotonic);
+        return 0;
+}
+
+static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+        Timer *timer = TIMER(u);
+
+        assert(u);
+        assert(key);
+        assert(value);
+        assert(fds);
+
+        /* Applies one serialized key/value pair to the timer. Unparsable values and unknown keys are only
+         * logged at debug level; the function returns 0 in every case. */
+        if (streq(key, "state")) {
+                TimerState parsed = timer_state_from_string(value);
+
+                if (parsed >= 0)
+                        timer->deserialized_state = parsed;
+                else
+                        log_unit_debug(u, "Failed to parse state value: %s", value);
+
+        } else if (streq(key, "result")) {
+                TimerResult parsed = timer_result_from_string(value);
+
+                if (parsed < 0)
+                        log_unit_debug(u, "Failed to parse result value: %s", value);
+                else if (parsed != TIMER_SUCCESS) /* a success result never overwrites what is stored */
+                        timer->result = parsed;
+
+        } else if (streq(key, "last-trigger-realtime"))
+                (void) deserialize_usec(value, &timer->last_trigger.realtime);
+        else if (streq(key, "last-trigger-monotonic"))
+                (void) deserialize_usec(value, &timer->last_trigger.monotonic);
+        else
+                log_unit_debug(u, "Unknown serialization key: %s", key);
+
+        return 0;
+}
+
+static UnitActiveState timer_active_state(Unit *u) {
+        assert(u);
+        TimerState current = TIMER(u)->state; /* timer-specific state, translated to the generic one below */
+        return state_translation_table[current];
+}
+
+static const char *timer_sub_state_to_string(Unit *u) {
+        assert(u);
+        TimerState current = TIMER(u)->state; /* reported under its timer-specific name */
+        return timer_state_to_string(current);
+}
+
+static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) {
+        Timer *timer = TIMER(userdata);
+
+        assert(timer);
+        /* Callback of both time event sources: activates a timer that is still waiting, always returns 0. */
+        if (timer->state == TIMER_WAITING) {
+                log_unit_debug(UNIT(timer), "Timer elapsed.");
+                timer_enter_running(timer);
+        }
+
+        return 0;
+}
+
+static void timer_trigger_notify(Unit *u, Unit *other) {
+        Timer *timer = TIMER(u);
+
+        assert(u);
+        assert(other);
+
+        /* Invocations for a unit whose load state is not complete are bogus */
+        assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+
+        /* Reenable all timers that depend on unit state */
+        LIST_FOREACH(value, v, timer->values)
+                if (IN_SET(v->base, TIMER_UNIT_ACTIVE, TIMER_UNIT_INACTIVE))
+                        v->disabled = false;
+
+        switch (timer->state) {
+        case TIMER_DEAD:
+        case TIMER_FAILED:
+                /* Nothing to recalculate in these states */
+                return;
+
+        case TIMER_RUNNING:
+                /* While running we only react once the other unit is inactive or failed */
+                if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other)))
+                        return;
+
+                log_unit_debug(UNIT(timer), "Got notified about unit deactivation.");
+                break;
+
+        case TIMER_WAITING:
+        case TIMER_ELAPSED:
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Recalculate sleep time */
+        timer_enter_waiting(timer, false);
+}
+
+static void timer_reset_failed(Unit *u) {
+        Timer *timer = TIMER(u);
+
+        assert(timer);
+
+        /* Leave TIMER_FAILED first (if we are in it), and only then clear the stored result. */
+        if (timer->state == TIMER_FAILED)
+                timer_set_state(timer, TIMER_DEAD);
+        timer->result = TIMER_SUCCESS;
+}
+
+static void timer_time_change(Unit *u) {
+        Timer *timer = TIMER(u);
+        usec_t wallclock;
+
+        assert(u);
+
+        if (timer->state != TIMER_WAITING)
+                return;
+
+        /* A last trigger time that lies in the future means the system clock was set backwards. Clamp it
+         * to the current time so that future triggers can happen again, exactly as when a timer unit with
+         * Persistent=yes is started. */
+        wallclock = now(CLOCK_REALTIME);
+        if (wallclock < timer->last_trigger.realtime)
+                timer->last_trigger.realtime = wallclock;
+
+        if (!timer->on_clock_change) {
+                log_unit_debug(u, "Time change, recalculating next elapse.");
+                timer_enter_waiting(timer, true);
+                return;
+        }
+
+        log_unit_debug(u, "Time change, triggering activation.");
+        timer_enter_running(timer);
+}
+
+static void timer_timezone_change(Unit *u) {
+        Timer *timer = TIMER(u);
+
+        assert(u);
+        /* Timers in any state but TIMER_WAITING ignore timezone changes. */
+        if (timer->state != TIMER_WAITING)
+                return;
+
+        if (!timer->on_timezone_change) {
+                log_unit_debug(u, "Timezone change, recalculating next elapse.");
+                timer_enter_waiting(timer, false);
+                return;
+        }
+        log_unit_debug(u, "Timezone change, triggering activation.");
+        timer_enter_running(timer);
+}
+
+static int timer_clean(Unit *u, ExecCleanMask mask) {
+        Timer *timer = TIMER(u);
+        int r;
+
+        assert(timer);
+        assert(mask != 0);
+
+        /* Removes the stamp file of a dead timer. Returns -EBUSY unless the timer is dead, and -EUNATCH if
+         * anything but EXEC_CLEAN_STATE is requested or if there is no stamp file path to clean. */
+        if (timer->state != TIMER_DEAD)
+                return -EBUSY;
+        if (mask != EXEC_CLEAN_STATE)
+                return -EUNATCH;
+
+        r = timer_setup_persistent(timer);
+        if (r < 0)
+                return r;
+        if (!timer->stamp_path)
+                return -EUNATCH;
+
+        /* A stamp file that is already gone counts as success. */
+        if (unlink(timer->stamp_path) == 0 || errno == ENOENT)
+                return 0;
+        return log_unit_error_errno(u, errno, "Failed to clean stamp file of timer: %m");
+}
+
+static int timer_can_clean(Unit *u, ExecCleanMask *ret) {
+        Timer *timer = TIMER(u);
+
+        assert(timer);
+        assert(ret);
+        /* Only persistent timers report EXEC_CLEAN_STATE as cleanable; all others report nothing. */
+        *ret = !timer->persistent ? 0 : EXEC_CLEAN_STATE;
+        return 0;
+}
+
+static int timer_can_start(Unit *u) {
+        Timer *timer = TIMER(u);
+        int r;
+
+        assert(timer);
+
+        r = unit_test_start_limit(u);
+        if (r >= 0)
+                return 1;
+
+        /* The start limit check failed: record that as the result and hand back the error. */
+        timer_enter_dead(timer, TIMER_FAILURE_START_LIMIT_HIT);
+        return r;
+}
+
+static void activation_details_timer_serialize(ActivationDetails *details, FILE *f) {
+        ActivationDetailsTimer *timer_details = ACTIVATION_DETAILS_TIMER(details);
+
+        assert(f);
+        assert(details);
+        assert(timer_details);
+        /* The only state written is the last trigger timestamp; the result of the write is discarded. */
+        (void) serialize_dual_timestamp(f, "activation-details-timer-last-trigger", &timer_details->last_trigger);
+}
+
+static int activation_details_timer_deserialize(const char *key, const char *value, ActivationDetails **details) {
+        ActivationDetailsTimer *timer_details;
+        int r;
+
+        assert(key);
+        assert(value);
+
+        /* Parses the single key handled here into the last_trigger field of *details. Returns 0 on success,
+         * -EINVAL if details is missing, if ACTIVATION_DETAILS_TIMER() yields NULL or if the key is a
+         * different one, and otherwise the error of deserialize_dual_timestamp(). */
+
+        if (!details || !*details)
+                return -EINVAL;
+
+        timer_details = ACTIVATION_DETAILS_TIMER(*details);
+        if (!timer_details || !streq(key, "activation-details-timer-last-trigger"))
+                return -EINVAL;
+
+        r = deserialize_dual_timestamp(value, &timer_details->last_trigger);
+
+        return r < 0 ? r : 0;
+}
+
+static int activation_details_timer_append_env(ActivationDetails *details, char ***strv) {
+        ActivationDetailsTimer *timer_details = ACTIVATION_DETAILS_TIMER(details);
+        int r;
+
+        assert(details);
+        assert(strv);
+        assert(timer_details);
+
+        /* Appends the realtime and monotonic trigger timestamps as two "NAME=value" entries to *strv.
+         * Returns the number of entries added (0 if no trigger time is set, else 2) or a negative error. */
+        if (!dual_timestamp_is_set(&timer_details->last_trigger))
+                return 0;
+
+        r = strv_extendf(strv, "TRIGGER_TIMER_REALTIME_USEC=" USEC_FMT, timer_details->last_trigger.realtime);
+        if (r >= 0)
+                r = strv_extendf(strv, "TRIGGER_TIMER_MONOTONIC_USEC=" USEC_FMT, timer_details->last_trigger.monotonic);
+        if (r < 0)
+                return r;
+
+        return 2;
+}
+
+static int activation_details_timer_append_pair(ActivationDetails *details, char ***strv) {
+        /* Appends the last-trigger timestamps to 'strv' as alternating key and value strings. */
+        ActivationDetailsTimer *adt = ACTIVATION_DETAILS_TIMER(details);
+        int k;
+
+        assert(details);
+        assert(strv);
+        assert(adt);
+
+        /* Without a last-trigger timestamp nothing is appended: zero pairs added. */
+        if (!dual_timestamp_is_set(&adt->last_trigger))
+                return 0;
+
+        /* First pair: the key, then the realtime value formatted with USEC_FMT. */
+        k = strv_extend(strv, "trigger_timer_realtime_usec");
+        if (k >= 0)
+                k = strv_extendf(strv, USEC_FMT, adt->last_trigger.realtime);
+        if (k < 0)
+                return k;
+
+        /* Second pair: the key, then the monotonic value formatted with USEC_FMT. */
+        k = strv_extend(strv, "trigger_timer_monotonic_usec");
+        if (k >= 0)
+                k = strv_extendf(strv, USEC_FMT, adt->last_trigger.monotonic);
+        if (k < 0)
+                return k;
+
+        return 2; /* Number of key/value pairs appended to the list */
+}
+
+uint64_t timer_next_elapse_monotonic(const Timer *t) {
+        assert(t);
+        /* Shift the stored elapse time from the clock chosen by TIMER_MONOTONIC_CLOCK() to CLOCK_MONOTONIC. */
+        clockid_t from = TIMER_MONOTONIC_CLOCK(t);
+        return (uint64_t) usec_shift_clock(t->next_elapse_monotonic_or_boottime, from, CLOCK_MONOTONIC);
+}
+
+static const char* const timer_base_table[_TIMER_BASE_MAX] = { /* name string for each TimerBase value */
+        [TIMER_ACTIVE]        = "OnActiveSec",
+        [TIMER_BOOT]          = "OnBootSec",
+        [TIMER_STARTUP]       = "OnStartupSec",
+        [TIMER_UNIT_ACTIVE]   = "OnUnitActiveSec",
+        [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec",
+        [TIMER_CALENDAR]      = "OnCalendar"
+};
+/* Presumably expands to the timer_base_to_string()/timer_base_from_string() pair declared in timer.h */
+DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase);
+
+char* timer_base_to_usec_string(TimerBase i) {
+        const char *name;
+        size_t stem;
+        char *out;
+
+        /* Returns a newly allocated copy of the base name in which a trailing "Sec" is replaced by
+         * "USec"; a name without that suffix is duplicated unchanged. NULL if the allocation fails. */
+        name = timer_base_to_string(i);
+
+        if (!endswith(name, "Sec"))
+                return strdup(name);
+
+        /* The stem, "USec" and the NUL terminator together need strlen(name) + 2 bytes. */
+        stem = strlen(name) - 3;
+
+        out = new(char, stem + 5);
+        if (!out)
+                return NULL;
+
+        memcpy(out, name, stem);
+        memcpy(out + stem, "USec", 5);
+
+        return out;
+}
+
+static const char* const timer_result_table[_TIMER_RESULT_MAX] = { /* name string for each TimerResult value */
+        [TIMER_SUCCESS]                 = "success",
+        [TIMER_FAILURE_RESOURCES]       = "resources",
+        [TIMER_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+};
+/* Presumably expands to the timer_result_to_string()/timer_result_from_string() pair declared in timer.h */
+DEFINE_STRING_TABLE_LOOKUP(timer_result, TimerResult);
+
+const UnitVTable timer_vtable = { /* Settings and callbacks that make up the timer unit type */
+        .object_size = sizeof(Timer),
+
+        .sections = /* section names, each terminated by an embedded NUL */
+                "Unit\0"
+                "Timer\0"
+                "Install\0",
+        .private_section = "Timer",
+
+        .can_transient = true,
+        .can_fail = true,
+        .can_trigger = true,
+
+        .init = timer_init,
+        .done = timer_done,
+        .load = timer_load,
+
+        .coldplug = timer_coldplug,
+
+        .dump = timer_dump,
+
+        .start = timer_start,
+        .stop = timer_stop,
+
+        .clean = timer_clean,
+        .can_clean = timer_can_clean, /* reports EXEC_CLEAN_STATE for persistent timers only */
+
+        .serialize = timer_serialize,
+        .deserialize_item = timer_deserialize_item,
+
+        .active_state = timer_active_state,
+        .sub_state_to_string = timer_sub_state_to_string,
+
+        .trigger_notify = timer_trigger_notify,
+
+        .reset_failed = timer_reset_failed,
+        .time_change = timer_time_change,
+        .timezone_change = timer_timezone_change,
+
+        .bus_set_property = bus_timer_set_property,
+
+        .can_start = timer_can_start, /* runs unit_test_start_limit() before a start */
+};
+
+const ActivationDetailsVTable activation_details_timer_vtable = {
+        .object_size = sizeof(ActivationDetailsTimer),
+        /* All four callbacks operate on the last_trigger timestamp of ActivationDetailsTimer */
+        .serialize = activation_details_timer_serialize,
+        .deserialize = activation_details_timer_deserialize,
+        .append_env = activation_details_timer_append_env, /* TRIGGER_TIMER_*_USEC=... strings */
+        .append_pair = activation_details_timer_append_pair, /* trigger_timer_*_usec key/value strings */
+};
diff --git a/src/core/timer.h b/src/core/timer.h
new file mode 100644
index 0000000..76d45b2
--- /dev/null
+++ b/src/core/timer.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Timer Timer;
+typedef struct ActivationDetailsTimer ActivationDetailsTimer;
+
+#include "calendarspec.h"
+#include "unit.h"
+
+typedef enum TimerBase { /* string names live in timer_base_table[] in timer.c */
+        TIMER_ACTIVE,          /* "OnActiveSec" */
+        TIMER_BOOT,            /* "OnBootSec" */
+        TIMER_STARTUP,         /* "OnStartupSec" */
+        TIMER_UNIT_ACTIVE,     /* "OnUnitActiveSec" */
+        TIMER_UNIT_INACTIVE,   /* "OnUnitInactiveSec" */
+        TIMER_CALENDAR,        /* "OnCalendar" */
+        _TIMER_BASE_MAX,       /* number of values above; sizes timer_base_table[] */
+        _TIMER_BASE_INVALID = -EINVAL,
+} TimerBase;
+
+typedef struct TimerValue { /* one entry of a timer; presumably linked into Timer.values */
+        TimerBase base;
+        bool disabled;
+
+        usec_t value; /* only for monotonic events */
+        CalendarSpec *calendar_spec; /* only for calendar events */
+        usec_t next_elapse;
+
+        LIST_FIELDS(struct TimerValue, value); /* list linkage, named "value" */
+} TimerValue;
+
+typedef enum TimerResult { /* string names live in timer_result_table[] in timer.c */
+        TIMER_SUCCESS,                  /* "success" */
+        TIMER_FAILURE_RESOURCES,        /* "resources" */
+        TIMER_FAILURE_START_LIMIT_HIT,  /* "start-limit-hit"; passed to timer_enter_dead() by timer_can_start() */
+        _TIMER_RESULT_MAX,              /* number of values above; sizes timer_result_table[] */
+        _TIMER_RESULT_INVALID = -EINVAL,
+} TimerResult;
+
+struct Timer {
+        Unit meta;
+
+        usec_t accuracy_usec;
+        usec_t random_usec;
+
+        LIST_HEAD(TimerValue, values); /* head of the list of TimerValue entries */
+        usec_t next_elapse_realtime;
+        usec_t next_elapse_monotonic_or_boottime; /* measured on the clock picked by TIMER_MONOTONIC_CLOCK() */
+        dual_timestamp last_trigger;
+
+        TimerState state, deserialized_state;
+
+        sd_event_source *monotonic_event_source;
+        sd_event_source *realtime_event_source;
+
+        TimerResult result;
+
+        bool persistent; /* if set, timer_can_clean() reports EXEC_CLEAN_STATE */
+        bool wake_system; /* if set, TIMER_MONOTONIC_CLOCK() picks CLOCK_BOOTTIME_ALARM */
+        bool remain_after_elapse;
+        bool on_clock_change;
+        bool on_timezone_change;
+        bool fixed_random_delay;
+
+        char *stamp_path;
+};
+
+struct ActivationDetailsTimer {
+        ActivationDetails meta;
+        dual_timestamp last_trigger; /* serialized and exported as TRIGGER_TIMER_{REALTIME,MONOTONIC}_USEC */
+};
+
+#define TIMER_MONOTONIC_CLOCK(t) ((t)->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC)
+/* Returns next_elapse_monotonic_or_boottime shifted from TIMER_MONOTONIC_CLOCK(t) to CLOCK_MONOTONIC */
+uint64_t timer_next_elapse_monotonic(const Timer *t);
+
+void timer_free_values(Timer *t);
+
+extern const UnitVTable timer_vtable;
+extern const ActivationDetailsVTable activation_details_timer_vtable;
+
+const char *timer_base_to_string(TimerBase i) _const_;
+TimerBase timer_base_from_string(const char *s) _pure_;
+/* Newly allocated base name with a trailing "Sec" replaced by "USec"; NULL if the allocation fails */
+char* timer_base_to_usec_string(TimerBase i);
+
+const char* timer_result_to_string(TimerResult i) _const_;
+TimerResult timer_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(TIMER, Timer);
+DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_TIMER, ActivationDetailsTimer, TIMER);
diff --git a/src/core/transaction.c b/src/core/transaction.c
new file mode 100644
index 0000000..a81c40f
--- /dev/null
+++ b/src/core/transaction.c
@@ -0,0 +1,1261 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "dbus-unit.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "transaction.h"
+
+static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies);
+
+static void transaction_delete_job(Transaction *tr, Job *j, bool delete_dependencies) {
+        /* Takes a single job out of the transaction and frees it. The flag is handed through to
+         * transaction_unlink_job() unchanged. */
+
+        assert(tr);
+        assert(j);
+
+        transaction_unlink_job(tr, j, delete_dependencies);
+        job_free(j);
+}
+
+static void transaction_delete_unit(Transaction *tr, Unit *u) {
+        /* Deletes every job the transaction holds for unit 'u', passing delete_dependencies=true. */
+        for (;;) {
+                Job *victim = hashmap_get(tr->jobs, u);
+                if (!victim)
+                        return;
+                transaction_delete_job(tr, victim, true);
+        }
+}
+
+static void transaction_abort(Transaction *tr) {
+        assert(tr);
+
+        /* Empties the transaction by deleting whichever job comes first until none is left. Each
+         * deletion passes delete_dependencies=false. */
+        for (Job *head = hashmap_first(tr->jobs); head; head = hashmap_first(tr->jobs))
+                transaction_delete_job(tr, head, false);
+
+        assert(hashmap_isempty(tr->jobs));
+}
+
+static void transaction_find_jobs_that_matter_to_anchor(Job *j, unsigned generation) {
+        assert(j);
+
+        /* Recursive sweep over the job graph. It flags every job that the anchor job depends on,
+         * directly or indirectly, as long as each link on the path is itself marked as mattering.
+         *
+         * The generation number doubles as the "already visited" stamp of the current sweep: a job
+         * carrying the current generation is never entered a second time. */
+
+        j->generation = generation;
+        j->matters_to_anchor = true;
+
+        LIST_FOREACH(subject, link, j->subject_list) {
+                Job *target;
+                /* Links that do not matter are not followed at all. */
+                if (!link->matters)
+                        continue;
+
+                /* Only descend into jobs that were not stamped during this sweep yet. */
+                target = link->object;
+                if (target->generation != generation)
+                        transaction_find_jobs_that_matter_to_anchor(target, generation);
+        }
+}
+
+static void transaction_merge_and_delete_job(Transaction *tr, Job *j, Job *other, JobType t) {
+        JobDependency *last;
+
+        assert(j);
+        assert(other);
+        assert(j->unit == other->unit);
+        assert(!j->installed);
+
+        /* Merges 'other' into 'j' (which takes over type 't') and then deletes 'other'. */
+
+        j->type = t;
+        j->state = JOB_WAITING;
+        j->irreversible = j->irreversible || other->irreversible;
+        j->matters_to_anchor = j->matters_to_anchor || other->matters_to_anchor;
+
+        /* Patch us in as new owner of the JobDependency objects that have 'other' as subject */
+        last = NULL;
+        LIST_FOREACH(subject, l, other->subject_list) {
+                assert(l->subject == other);
+                l->subject = j;
+                last = l;
+        }
+
+        /* Merge both lists: the links of 'other' end up in front of the links 'j' already had */
+        if (last) {
+                last->subject_next = j->subject_list;
+                if (j->subject_list)
+                        j->subject_list->subject_prev = last;
+                j->subject_list = other->subject_list;
+        }
+
+        /* Patch us in as new owner of the JobDependency objects that have 'other' as object */
+        last = NULL;
+        LIST_FOREACH(object, l, other->object_list) {
+                assert(l->object == other);
+                l->object = j;
+                last = l;
+        }
+
+        /* Merge both lists: the links of 'other' end up in front of the links 'j' already had */
+        if (last) {
+                last->object_next = j->object_list;
+                if (j->object_list)
+                        j->object_list->object_prev = last;
+                j->object_list = other->object_list;
+        }
+
+        /* Kill the other job; its list heads are reset first since all its links now belong to 'j' */
+        other->subject_list = NULL;
+        other->object_list = NULL;
+        transaction_delete_job(tr, other, true);
+}
+
+static bool job_is_conflicted_by(Job *j) {
+        assert(j);
+
+        /* Tells whether this job is pulled in by at least one ConflictedBy dependency. */
+
+        LIST_FOREACH(object, link, j->object_list) {
+                if (!link->conflicts)
+                        continue;
+                return true;
+        }
+        return false;
+}
+
+static int delete_one_unmergeable_job(Transaction *tr, Job *job) {
+        assert(job);
+
+        /* Tries to delete one item in the linked list
+         * job->transaction_next->transaction_next->... that conflicts
+         * with another one, in an attempt to make an inconsistent
+         * transaction work. */
+
+        /* We rely here on the fact that if a merged with b does not
+         * merge with c, then a or b does not merge with c either */
+        LIST_FOREACH(transaction, j, job)
+                LIST_FOREACH(transaction, k, j->transaction_next) {
+                        Job *d;
+
+                        /* Is this one mergeable? Then skip it */
+                        if (job_type_is_mergeable(j->type, k->type))
+                                continue;
+
+                        /* Ok, we found two that conflict, let's see if we can
+                         * drop one of them */
+                        if (!j->matters_to_anchor && !k->matters_to_anchor) {
+
+                                /* Both jobs don't matter, so let's
+                                 * find the one that is smarter to
+                                 * remove. Let's think positive and
+                                 * rather remove stops than starts --
+                                 * except if something is being
+                                 * stopped because it is conflicted by
+                                 * another unit in which case we
+                                 * rather remove the start. */
+
+                                log_unit_debug(j->unit,
+                                               "Looking at job %s/%s conflicted_by=%s",
+                                               j->unit->id, job_type_to_string(j->type),
+                                               yes_no(j->type == JOB_STOP && job_is_conflicted_by(j)));
+                                log_unit_debug(k->unit,
+                                               "Looking at job %s/%s conflicted_by=%s",
+                                               k->unit->id, job_type_to_string(k->type),
+                                               yes_no(k->type == JOB_STOP && job_is_conflicted_by(k)));
+
+                                if (j->type == JOB_STOP) {
+
+                                        if (job_is_conflicted_by(j))
+                                                d = k;
+                                        else
+                                                d = j;
+
+                                } else if (k->type == JOB_STOP) {
+
+                                        if (job_is_conflicted_by(k))
+                                                d = j;
+                                        else
+                                                d = k;
+                                } else
+                                        d = j;
+
+                        } else if (!j->matters_to_anchor)
+                                d = j;
+                        else if (!k->matters_to_anchor)
+                                d = k;
+                        else
+                                return -ENOEXEC; /* both jobs matter to the anchor, neither may be dropped */
+
+                        /* Ok, we can drop one, so let's do so. */
+                        log_unit_debug(d->unit,
+                                       "Fixing conflicting jobs %s/%s,%s/%s by deleting job %s/%s",
+                                       j->unit->id, job_type_to_string(j->type),
+                                       k->unit->id, job_type_to_string(k->type),
+                                       d->unit->id, job_type_to_string(d->type));
+                        transaction_delete_job(tr, d, true);
+                        return 0; /* exactly one job was deleted */
+                }
+
+        return -EINVAL; /* no unmergeable pair was found on the list */
+}
+
+static int transaction_merge_jobs(Transaction *tr, sd_bus_error *e) {
+        Job *j;
+        int r;
+
+        assert(tr);
+
+        /* First step, check whether any of the jobs for one specific
+         * unit conflict. If so, try to drop one of them. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                JobType t;
+
+                t = j->type;
+                LIST_FOREACH(transaction, k, j->transaction_next) {
+                        if (job_type_merge_and_collapse(&t, k->type, j->unit) >= 0)
+                                continue;
+
+                        /* OK, we could not merge all jobs for this
+                         * action. Let's see if we can get rid of one
+                         * of them */
+
+                        r = delete_one_unmergeable_job(tr, j);
+                        if (r >= 0)
+                                /* Ok, we managed to drop one, now
+                                 * let's ask our callers to call us
+                                 * again after garbage collecting */
+                                return -EAGAIN;
+
+                        /* We couldn't merge anything. Failure */
+                        return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_JOBS_CONFLICTING,
+                                                 "Transaction contains conflicting jobs '%s' and '%s' for %s. "
+                                                 "Probably contradicting requirement dependencies configured.",
+                                                 job_type_to_string(t),
+                                                 job_type_to_string(k->type),
+                                                 k->unit->id);
+                }
+        }
+
+        /* Second step, merge the jobs. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                JobType t = j->type;
+
+                /* Compute the merged type over all transaction jobs for j->unit; this must succeed here */
+                LIST_FOREACH(transaction, k, j->transaction_next)
+                        assert_se(job_type_merge_and_collapse(&t, k->type, j->unit) == 0);
+                /* Fold the list into one job of type t; if the anchor job is on the list it is the survivor */
+                Job *k;
+                while ((k = j->transaction_next)) {
+                        if (tr->anchor_job == k) {
+                                transaction_merge_and_delete_job(tr, k, j, t);
+                                j = k;
+                        } else
+                                transaction_merge_and_delete_job(tr, j, k, t);
+                }
+
+                assert(!j->transaction_next);
+                assert(!j->transaction_prev);
+        }
+
+        return 0;
+}
+
+static void transaction_drop_redundant(Transaction *tr) {
+        Job *head;
+
+        /* Walks the transaction and removes the jobs of every unit whose jobs are all no-ops. As
+         * soon as a single job of a unit has to stay, all jobs of that unit are left alone. */
+
+        assert(tr);
+
+restart:
+        HASHMAP_FOREACH(head, tr->jobs) {
+                bool redundant = true;
+
+                LIST_FOREACH(transaction, k, head) {
+                        /* A job stays if it is the anchor, if its type is not redundant for the unit's
+                         * current active state, or if it conflicts with the job the unit already has. */
+                        if (tr->anchor_job != k &&
+                            job_type_is_redundant(k->type, unit_active_state(k->unit)) &&
+                            !(k->unit->job && job_type_is_conflicting(k->type, k->unit->job->type)))
+                                continue;
+
+                        redundant = false;
+                        break;
+                }
+
+                if (!redundant)
+                        continue;
+
+                log_trace("Found redundant job %s/%s, dropping from transaction.",
+                          head->unit->id, job_type_to_string(head->type));
+                transaction_delete_job(tr, head, false);
+                /* The job table was modified, hence scan again from the start. */
+                goto restart;
+        }
+}
+
+static bool job_matters_to_anchor(Job *job) {
+        assert(job);
+        assert(!job->transaction_prev);
+
+        /* 'job' must be the head of its transaction list; true if any job on it matters to the anchor. */
+        LIST_FOREACH(transaction, cur, job) {
+                if (!cur->matters_to_anchor)
+                        continue;
+                return true;
+        }
+        return false;
+}
+
+static char* merge_unit_ids(const char* unit_log_field, char * const* pairs) {
+        _cleanup_free_ char *joined = NULL;
+        size_t used = 0, prefix_len;
+
+        assert(unit_log_field);
+
+        /* Builds one "<field><id>" entry per pair, newline-separated, from the first string of each pair.
+         * Returns an allocated string ("" when there are no pairs) or NULL if an allocation fails. */
+        prefix_len = strlen(unit_log_field);
+        STRV_FOREACH_PAIR(unit_id, job_type, pairs) {
+                size_t id_len = strlen(*unit_id);
+
+                if (!GREEDY_REALLOC(joined, used + prefix_len + id_len + 1))
+                        return NULL;
+
+                /* 'used' includes the NUL of the previous entry; that byte becomes the separator. */
+                if (used > 0)
+                        joined[used - 1] = '\n';
+                memcpy(joined + used, unit_log_field, prefix_len);
+                memcpy(joined + used + prefix_len, *unit_id, id_len + 1);
+                used += prefix_len + id_len + 1;
+        }
+
+        return joined ? TAKE_PTR(joined) : strdup("");
+}
+
+static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsigned generation, sd_bus_error *e) {
+        /* 'j' is the job to visit, 'from' the job we arrived from (NULL when a search starts at 'j'). */
+        static const UnitDependencyAtom directions[] = {
+                UNIT_ATOM_BEFORE,
+                UNIT_ATOM_AFTER,
+        };
+
+        int r;
+
+        assert(tr);
+        assert(j);
+        assert(!j->transaction_prev);
+
+        /* Does a recursive sweep through the ordering graph, looking for a cycle. If we find a cycle we try
+         * to break it: -EAGAIN is returned after dropping a unit's jobs, else the bus error set in 'e'. */
+
+        /* Have we seen this before? */
+        if (j->generation == generation) {
+                Job *k, *delete = NULL;
+                _cleanup_free_ char **array = NULL, *unit_ids = NULL;
+
+                /* If the marker is NULL we have been here already and decided the job was loop-free from
+                 * here. Hence shortcut things and return right-away. */
+                if (!j->marker)
+                        return 0;
+
+                /* So, the marker is not NULL and we already have been here. We have a cycle. Let's try to
+                 * break it. We go backwards in our path and try to find a suitable job to remove. We use the
+                 * marker to find our way back, since, smart as we are, we stored our way back in there. */
+                for (k = from; k; k = ((k->generation == generation && k->marker != k) ? k->marker : NULL)) {
+
+                        /* For logging below */
+                        if (strv_push_pair(&array, k->unit->id, (char*) job_type_to_string(k->type)) < 0)
+                                log_oom();
+
+                        if (!delete && hashmap_contains(tr->jobs, k->unit) && !job_matters_to_anchor(k))
+                                /* Ok, we can drop this one, so let's do so. */
+                                delete = k;
+
+                        /* Check if this in fact was the beginning of the cycle */
+                        if (k == j)
+                                break;
+                }
+
+                unit_ids = merge_unit_ids(j->manager->unit_log_field, array); /* ignore error */
+
+                STRV_FOREACH_PAIR(unit_id, job_type, array)
+                        /* logging for j not k here to provide a consistent narrative */
+                        log_struct(LOG_WARNING,
+                                   LOG_UNIT_MESSAGE(j->unit,
+                                                    "Found %s on %s/%s",
+                                                    unit_id == array ? "ordering cycle" : "dependency",
+                                                    *unit_id, *job_type),
+                                   "%s", strna(unit_ids));
+
+                if (delete) {
+                        const char *status;
+                        /* logging for j not k here to provide a consistent narrative */
+                        log_struct(LOG_ERR,
+                                   LOG_UNIT_MESSAGE(j->unit,
+                                                    "Job %s/%s deleted to break ordering cycle starting with %s/%s",
+                                                    delete->unit->id, job_type_to_string(delete->type),
+                                                    j->unit->id, job_type_to_string(j->type)),
+                                   "%s", strna(unit_ids));
+
+                        if (log_get_show_color())
+                                status = ANSI_HIGHLIGHT_RED " SKIP " ANSI_NORMAL;
+                        else
+                                status = " SKIP ";
+
+                        unit_status_printf(delete->unit,
+                                           STATUS_TYPE_NOTICE,
+                                           status,
+                                           "Ordering cycle found, skipping %s",
+                                           unit_status_string(delete->unit, NULL));
+                        transaction_delete_unit(tr, delete->unit);
+                        return -EAGAIN;
+                }
+
+                log_struct(LOG_ERR,
+                           LOG_UNIT_MESSAGE(j->unit, "Unable to break cycle starting with %s/%s",
+                                            j->unit->id, job_type_to_string(j->type)),
+                           "%s", strna(unit_ids));
+
+                return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC,
+                                         "Transaction order is cyclic. See system logs for details.");
+        }
+
+        /* Make the marker point to where we come from, so that we can
+         * find our way backwards if we want to break a cycle. We use
+         * a special marker for the beginning: we point to
+         * ourselves. */
+        j->marker = from ?: j;
+        j->generation = generation;
+
+        /* Actual ordering of jobs depends on the unit ordering dependency and job types. We need to traverse
+         * the graph over 'before' edges in the actual job execution order. We traverse over both unit
+         * ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job
+         * execution ordering. */
+        for (size_t d = 0; d < ELEMENTSOF(directions); d++) {
+                Unit *u;
+
+                UNIT_FOREACH_DEPENDENCY(u, j->unit, directions[d]) {
+                        Job *o;
+
+                        /* Is there a job for this unit? */
+                        o = hashmap_get(tr->jobs, u);
+                        if (!o) {
+                                /* Ok, there is no job for this in the transaction, but maybe there is
+                                 * already one running? */
+                                o = u->job;
+                                if (!o)
+                                        continue;
+                        }
+
+                        /* Cut traversing if the job j is not really *before* o. */
+                        if (job_compare(j, o, directions[d]) >= 0)
+                                continue;
+
+                        r = transaction_verify_order_one(tr, o, j, generation, e);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        /* Ok, let's backtrack, and remember that this entry is not on
+         * our path anymore. */
+        j->marker = NULL;
+
+        return 0;
+}
+
+static int transaction_verify_order(Transaction *tr, unsigned *generation, sd_bus_error *e) {
+        unsigned sweep;
+        Job *start;
+
+        assert(tr);
+        assert(generation);
+
+        /* Looks for cycles in the ordering graph by starting a search from every job of the
+         * transaction. The per-job helper tries to break a cycle by dropping jobs. */
+        /* All searches share one generation number; the caller's counter is advanced past it. */
+        sweep = *generation;
+        *generation = sweep + 1;
+
+        HASHMAP_FOREACH(start, tr->jobs) {
+                int k = transaction_verify_order_one(tr, start, NULL, sweep, e);
+                if (k < 0)
+                        return k;
+        }
+
+        return 0;
+}
+
+static void transaction_collect_garbage(Transaction *tr) {
+        Job *cand;
+
+        assert(tr);
+
+        /* Drops every job, other than the anchor, that is not the object of any dependency, i.e.
+         * that no other job requires. The scan is restarted after every deletion. */
+
+rescan:
+        HASHMAP_FOREACH(cand, tr->jobs) {
+                JobDependency *ref;
+
+                if (cand == tr->anchor_job)
+                        continue;
+
+                ref = cand->object_list;
+                if (ref) {
+                        log_trace("Keeping job %s/%s because of %s/%s",
+                                  cand->unit->id, job_type_to_string(cand->type),
+                                  ref->subject ? ref->subject->unit->id : "root",
+                                  ref->subject ? job_type_to_string(ref->subject->type) : "root");
+                        continue;
+                }
+
+                log_trace("Garbage collecting job %s/%s", cand->unit->id, job_type_to_string(cand->type));
+                transaction_delete_job(tr, cand, true);
+
+                /* The job table was modified, so the iteration has to begin anew. */
+                goto rescan;
+        }
+}
+
+static int transaction_is_destructive(Transaction *tr, JobMode mode, sd_bus_error *e) {
+        Job *j;
+
+        assert(tr);
+
+        /* Fails if applying the transaction would replace a conflicting job already queued for a unit. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                Job *queued;
+                /* The jobs must have been merged already */
+                assert(!j->transaction_prev);
+                assert(!j->transaction_next);
+
+                /* Only matters if mode is JOB_FAIL or the queued job is irreversible */
+                queued = j->unit->job;
+                if (!queued || (mode != JOB_FAIL && !queued->irreversible) ||
+                    !job_type_is_conflicting(queued->type, j->type))
+                        continue;
+                return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE,
+                                         "Transaction for %s/%s is destructive (%s has '%s' job queued, but '%s' is included in transaction).",
+                                         tr->anchor_job->unit->id, job_type_to_string(tr->anchor_job->type),
+                                         j->unit->id, job_type_to_string(queued->type), job_type_to_string(j->type));
+        }
+
+        return 0;
+}
+
+static void transaction_minimize_impact(Transaction *tr) {
+        Job *first;
+
+        assert(tr);
+
+        /* Drops jobs that do not matter to the anchor and would either stop a unit that is
+         * active or activating, or conflict with the job already queued for their unit. */
+
+rescan:
+        HASHMAP_FOREACH(first, tr->jobs) {
+                LIST_FOREACH(transaction, j, first) {
+                        bool stops_running_service = false, changes_existing_job = false;
+
+                        /* Jobs that matter are never dropped here */
+                        if (j->matters_to_anchor)
+                                continue;
+
+                        /* Would this stop a running service? */
+                        if (j->type == JOB_STOP)
+                                stops_running_service =
+                                        UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(j->unit));
+
+                        /* Would this change an existing job? */
+                        if (j->unit->job)
+                                changes_existing_job =
+                                        job_type_is_conflicting(j->type, j->unit->job->type);
+
+                        /* Neither applies, so the job stays */
+                        if (!(stops_running_service || changes_existing_job))
+                                continue;
+
+                        if (stops_running_service)
+                                log_unit_debug(j->unit,
+                                               "%s/%s would stop a running service.",
+                                               j->unit->id, job_type_to_string(j->type));
+
+                        if (changes_existing_job)
+                                log_unit_debug(j->unit,
+                                               "%s/%s would change existing job.",
+                                               j->unit->id, job_type_to_string(j->type));
+
+                        log_unit_debug(j->unit,
+                                       "Deleting %s/%s to minimize impact.",
+                                       j->unit->id, job_type_to_string(j->type));
+
+                        /* The job table is modified by the deletion, so begin the scan anew */
+                        transaction_delete_job(tr, j, true);
+                        goto rescan;
+                }
+        }
+}
+
+/* Installs the jobs of the transaction into the manager's set of active jobs.
+ *
+ * tr            – transaction whose jobs are installed; tr->jobs is drained on success
+ * m             – manager owning the m->jobs hashmap (keyed by job id) the jobs are registered in
+ * mode          – for JOB_ISOLATE and JOB_FLUSH, installed jobs that are not part of the transaction are
+ *                 canceled first
+ * affected_jobs – optional set; if non-NULL every job touched here is added to it
+ *
+ * Returns 0 on success, or the negative error returned by hashmap_ensure_put(), in which case the entries
+ * registered in m->jobs by this call are removed again. */
+static int transaction_apply(
+                Transaction *tr,
+                Manager *m,
+                JobMode mode,
+                Set *affected_jobs) {
+
+        Job *j;
+        int r;
+
+        /* Moves the transaction jobs to the set of active jobs */
+
+        if (IN_SET(mode, JOB_ISOLATE, JOB_FLUSH)) {
+
+                /* When isolating first kill all installed jobs which
+                 * aren't part of the new transaction */
+                HASHMAP_FOREACH(j, m->jobs) {
+                        assert(j->installed);
+
+                        if (j->unit->ignore_on_isolate)
+                                continue;
+
+                        if (hashmap_contains(tr->jobs, j->unit))
+                                continue;
+
+                        /* Not invalidating recursively. Avoids triggering
+                         * OnFailure= actions of dependent jobs. Also avoids
+                         * invalidating our iterator. */
+                        job_finish_and_invalidate(j, JOB_CANCELED, false, false);
+                }
+        }
+
+        /* First pass: register every job under its id in m->jobs. Doing all insertions up front means a
+         * failure here can still be rolled back before any job has actually been installed. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                /* Assume merged */
+                assert(!j->transaction_prev);
+                assert(!j->transaction_next);
+
+                r = hashmap_ensure_put(&m->jobs, NULL, UINT32_TO_PTR(j->id), j);
+                if (r < 0)
+                        goto rollback;
+        }
+
+        /* Second pass: take the jobs out of the transaction one by one and install them. */
+        while ((j = hashmap_steal_first(tr->jobs))) {
+                Job *installed_job;
+
+                /* Clean the job dependencies */
+                transaction_unlink_job(tr, j, false);
+
+                /* When RestartMode=direct is used, the service being restarted don't enter the inactive/failed
+                 * state, i.e. unit_process_job -> job_finish_and_invalidate is never called, and the previous
+                 * job might still be running (especially for Type=oneshot services). We need to refuse
+                 * late merge and re-enqueue the anchor job. */
+                installed_job = job_install(j,
+                                            /* refuse_late_merge = */ mode == JOB_RESTART_DEPENDENCIES && j == tr->anchor_job);
+                if (installed_job != j) {
+                        /* j has been merged into a previously installed job */
+                        if (tr->anchor_job == j)
+                                tr->anchor_job = installed_job;
+
+                        /* Drop the registration made in the first pass, then continue with the job that
+                         * is actually installed; the merged-away job is released via job_free(). */
+                        hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j);
+                        free_and_replace_full(j, installed_job, job_free);
+                }
+
+                job_add_to_run_queue(j);
+                job_add_to_dbus_queue(j);
+                job_start_timer(j, false);
+                job_shutdown_magic(j);
+
+                /* When 'affected_jobs' is specified, let's track in it all jobs that were touched because
+                 * of this transaction. Failure to record a job is deliberately ignored. */
+                if (affected_jobs)
+                        (void) set_put(affected_jobs, j);
+        }
+
+        return 0;
+
+rollback:
+
+        /* Undo the registrations of the first pass. NOTE(review): jobs that were not yet registered when
+         * the failure happened are visited too; presumably hashmap_remove_value() is a no-op for them —
+         * confirm. */
+        HASHMAP_FOREACH(j, tr->jobs)
+                hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j);
+
+        return r;
+}
+
+/* Validates the prospective jobs collected in the transaction and, if possible, installs them as the
+ * manager's active jobs.
+ *
+ * tr            – the transaction to activate; on success tr->jobs is empty afterwards
+ * m             – the manager whose job set (m->jobs) is consulted and extended
+ * mode          – job mode; JOB_FAIL enables impact minimization, JOB_ISOLATE disables garbage collection
+ *                 of the transaction, and the mode is passed on to the destructiveness check and to
+ *                 transaction_apply()
+ * affected_jobs – optional set passed on to transaction_apply()
+ * e             – bus error object filled in by the individual steps and used for the log messages
+ *
+ * Returns 0 on success, a negative errno-style error otherwise (after logging it). */
+int transaction_activate(
+                Transaction *tr,
+                Manager *m,
+                JobMode mode,
+                Set *affected_jobs,
+                sd_bus_error *e) {
+
+        Job *j;
+        int r;
+        unsigned generation = 1;
+
+        assert(tr);
+        /* m is dereferenced right below, hence validate it like the other entry points in this file do. */
+        assert(m);
+
+        /* This applies the changes recorded in tr->jobs to
+         * the actual list of jobs, if possible. */
+
+        /* Reset the generation counter of all installed jobs. The detection of cycles
+         * looks at installed jobs. If they had a non-zero generation from some previous
+         * walk of the graph, the algorithm would break. */
+        HASHMAP_FOREACH(j, m->jobs)
+                j->generation = 0;
+
+        /* First step: figure out which jobs matter */
+        transaction_find_jobs_that_matter_to_anchor(tr->anchor_job, generation++);
+
+        /* Second step: Try not to stop any running services if
+         * we don't have to. Don't try to reverse running
+         * jobs if we don't have to. */
+        if (mode == JOB_FAIL)
+                transaction_minimize_impact(tr);
+
+        /* Third step: Drop redundant jobs */
+        transaction_drop_redundant(tr);
+
+        for (;;) {
+                /* Fourth step: Let's remove unneeded jobs that might
+                 * be lurking. */
+                if (mode != JOB_ISOLATE)
+                        transaction_collect_garbage(tr);
+
+                /* Fifth step: verify order makes sense and correct
+                 * cycles if necessary and possible */
+                r = transaction_verify_order(tr, &generation, e);
+                if (r >= 0)
+                        break;
+
+                /* -EAGAIN means "try again": anything else is a hard failure. */
+                if (r != -EAGAIN)
+                        return log_warning_errno(r, "Requested transaction contains an unfixable cyclic ordering dependency: %s", bus_error_message(e, r));
+
+                /* Let's see if the resulting transaction ordering
+                 * graph is still cyclic... */
+        }
+
+        for (;;) {
+                /* Sixth step: let's drop unmergeable entries if
+                 * necessary and possible, merge entries we can
+                 * merge */
+                r = transaction_merge_jobs(tr, e);
+                if (r >= 0)
+                        break;
+
+                if (r != -EAGAIN)
+                        return log_warning_errno(r, "Requested transaction contains unmergeable jobs: %s", bus_error_message(e, r));
+
+                /* Seventh step: an entry got dropped, let's garbage
+                 * collect its dependencies. */
+                if (mode != JOB_ISOLATE)
+                        transaction_collect_garbage(tr);
+
+                /* Let's see if the resulting transaction still has
+                 * unmergeable entries ... */
+        }
+
+        /* Eighth step: Drop redundant jobs again, if the merging now allows us to drop more. */
+        transaction_drop_redundant(tr);
+
+        /* Ninth step: check whether we can actually apply this */
+        r = transaction_is_destructive(tr, mode, e);
+        if (r < 0)
+                return log_notice_errno(r, "Requested transaction contradicts existing jobs: %s", bus_error_message(e, r));
+
+        /* Tenth step: apply changes */
+        r = transaction_apply(tr, m, mode, affected_jobs);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to apply transaction: %m");
+
+        assert(hashmap_isempty(tr->jobs));
+
+        /* Are there any jobs now? Then make sure we have the idle pipe around. We don't really care too much
+         * whether this works or not, as the idle pipe is a feature for cosmetics, not actually useful for
+         * anything beyond that. */
+        if (!hashmap_isempty(m->jobs))
+                (void) manager_allocate_idle_pipe(m);
+
+        return 0;
+}
+
+static Job* transaction_add_one_job(Transaction *tr, JobType type, Unit *unit, bool *is_new) {
+        Job *head, *job;
+
+        assert(tr);
+        assert(unit);
+
+        /* Returns the prospective job of the requested type for the unit. If the transaction does not
+         * carry one yet, a fresh job is allocated and put at the front of the unit's job list. The
+         * optional 'is_new' tells the caller which of the two happened. Returns NULL if the job cannot
+         * be allocated or stored. */
+
+        head = hashmap_get(tr->jobs, unit);
+
+        LIST_FOREACH(transaction, existing, head) {
+                assert(existing->unit == unit);
+
+                if (existing->type != type)
+                        continue;
+
+                /* Found a matching prospective job, reuse it */
+                if (is_new)
+                        *is_new = false;
+
+                return existing;
+        }
+
+        job = job_new(unit, type);
+        if (!job)
+                return NULL;
+
+        job->irreversible = tr->irreversible;
+        job->matters_to_anchor = false;
+        job->marker = NULL;
+        job->generation = 0;
+
+        /* The new job becomes the list head, hence the hashmap entry has to point to it now */
+        LIST_PREPEND(transaction, head, job);
+
+        if (hashmap_replace(tr->jobs, unit, head) < 0) {
+                /* Storing failed: take the job off the list again and release it */
+                LIST_REMOVE(transaction, head, job);
+                job_free(job);
+                return NULL;
+        }
+
+        if (is_new)
+                *is_new = true;
+
+        log_trace("Added job %s/%s to transaction.", unit->id, job_type_to_string(type));
+
+        return job;
+}
+
+/* Detaches job j from the transaction: takes it out of its unit's list of prospective jobs (updating or
+ * removing the tr->jobs entry when j was the list head) and frees all dependency links j takes part in.
+ * If delete_dependencies is true, jobs that are the subject of a link to j with the 'matters' flag set
+ * are deleted via transaction_delete_job() as well. The job j itself is not freed here. */
+static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies) {
+        assert(tr);
+        assert(j);
+
+        /* Three cases: j has a predecessor (plain list removal), j is the head with a successor (the
+         * successor becomes the entry stored in the hashmap), or j is the only job listed for the unit
+         * (the hashmap entry is dropped). */
+        if (j->transaction_prev)
+                j->transaction_prev->transaction_next = j->transaction_next;
+        else if (j->transaction_next)
+                hashmap_replace(tr->jobs, j->unit, j->transaction_next);
+        else
+                hashmap_remove_value(tr->jobs, j->unit, j);
+
+        if (j->transaction_next)
+                j->transaction_next->transaction_prev = j->transaction_prev;
+
+        j->transaction_prev = j->transaction_next = NULL;
+
+        /* Free the links of the subject list; the loop relies on job_dependency_free() unlinking the
+         * entry it is passed from that list. */
+        while (j->subject_list)
+                job_dependency_free(j->subject_list);
+
+        while (j->object_list) {
+                /* Remember the job on the other end before the link is freed, but only if the link
+                 * is marked as one that matters. */
+                Job *other = j->object_list->matters ? j->object_list->subject : NULL;
+
+                job_dependency_free(j->object_list);
+
+                if (other && delete_dependencies) {
+                        log_unit_debug(other->unit,
+                                       "Deleting job %s/%s as dependency of job %s/%s",
+                                       other->unit->id, job_type_to_string(other->type),
+                                       j->unit->id, job_type_to_string(j->type));
+                        transaction_delete_job(tr, other, delete_dependencies);
+                }
+        }
+}
+
+void transaction_add_propagate_reload_jobs(
+                Transaction *tr,
+                Unit *unit,
+                Job *by,
+                TransactionAddFlags flags) {
+
+        Unit *dep;
+
+        assert(tr);
+        assert(unit);
+
+        /* Adds a reload job, pulled in by 'by', for each unit that 'unit' propagates reloads to.
+         * Failures are only logged, they never abort the loop. */
+        UNIT_FOREACH_DEPENDENCY(dep, unit, UNIT_ATOM_PROPAGATES_RELOAD_TO) {
+                _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+                JobType collapsed;
+                int r;
+
+                /* Nothing to enqueue if the job type collapses to a no-op for this unit */
+                collapsed = job_type_collapse(JOB_TRY_RELOAD, dep);
+                if (collapsed == JOB_NOP)
+                        continue;
+
+                r = transaction_add_job_and_dependencies(tr, collapsed, dep, by, flags, &e);
+                if (r >= 0)
+                        continue;
+
+                log_unit_warning(dep,
+                                 "Cannot add dependency reload job, ignoring: %s",
+                                 bus_error_message(&e, r));
+        }
+}
+
+static JobType job_type_propagate_stop_graceful(Job *j) {
+        JobType result = JOB_STOP;
+
+        /* Picks the job type to enqueue for a unit a stop is gracefully propagated to, given the list
+         * of jobs (starting at j) the transaction already carries for that unit. */
+
+        /* No job listed at all: a plain stop job it is */
+        if (!j)
+                return result;
+
+        LIST_FOREACH(transaction, listed, j) {
+                /* Nothing to worry about, an appropriate job is in-place */
+                if (IN_SET(listed->type, JOB_STOP, JOB_RESTART))
+                        return JOB_NOP;
+
+                /* This unit is pulled in by other dependency types in this transaction. We will run
+                 * into job type conflict if we enqueue a stop job, so let's enqueue a restart job
+                 * instead. All other job types are of no interest here. */
+                if (listed->type == JOB_START)
+                        result = JOB_RESTART;
+        }
+
+        return result;
+}
+
+/* Adds a job of the given type for the given unit to the transaction and, unless suppressed, recursively
+ * adds the jobs implied by the unit's dependencies.
+ *
+ * tr    – the transaction to extend
+ * type  – the job type to add; must be below _JOB_TYPE_MAX_IN_TRANSACTION
+ * unit  – the unit the job is for
+ * by    – the job that pulls this one in; if NULL the new job becomes the transaction's anchor job
+ * flags – TransactionAddFlags controlling the dependency link and the recursion
+ * e     – bus error object that receives details about a failure
+ *
+ * Returns 0 on success and a negative error on failure (-ENOMEM on allocation failure, otherwise whatever
+ * the validation helpers or the recursive calls returned). */
+int transaction_add_job_and_dependencies(
+                Transaction *tr,
+                JobType type,
+                Unit *unit,
+                Job *by,
+                TransactionAddFlags flags,
+                sd_bus_error *e) {
+
+        bool is_new;
+        Job *ret;
+        int r;
+
+        assert(tr);
+        assert(type < _JOB_TYPE_MAX);
+        assert(type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(unit);
+
+        /* Before adding jobs for this unit, let's ensure that its state has been loaded. This matters when
+         * jobs are spawned as part of coldplugging itself (see e. g. path_coldplug()).  This way, we
+         * "recursively" coldplug units, ensuring that we do not look at state of not-yet-coldplugged
+         * units. */
+        if (MANAGER_IS_RELOADING(unit->manager))
+                unit_coldplug(unit);
+
+        if (by)
+                log_trace("Pulling in %s/%s from %s/%s", unit->id, job_type_to_string(type), by->unit->id, job_type_to_string(by->type));
+
+        /* Safety check that the unit is in a valid state, i.e. not in UNIT_STUB or UNIT_MERGED which should
+         * only be set temporarily. */
+        if (!UNIT_IS_LOAD_COMPLETE(unit->load_state))
+                return sd_bus_error_setf(e, BUS_ERROR_LOAD_FAILED, "Unit %s is not loaded properly.", unit->id);
+
+        /* Stop jobs are accepted regardless of the outcome of the load state validation. */
+        if (type != JOB_STOP) {
+                r = bus_unit_validate_load_state(unit, e);
+                /* The time-based cache allows to start new units without daemon-reload, but if they are
+                 * already referenced (because of dependencies or ordering) then we have to force a load of
+                 * the fragment. As an optimization, check first if anything in the usual paths was modified
+                 * since the last time the cache was loaded. Also check if the last time an attempt to load
+                 * the unit was made was before the most recent cache refresh, so that we know we need to try
+                 * again — even if the cache is current, it might have been updated in a different context
+                 * before we had a chance to retry loading this particular unit.
+                 *
+                 * Given building up the transaction is a synchronous operation, attempt
+                 * to load the unit immediately. */
+                if (r < 0 && manager_unit_cache_should_retry_load(unit)) {
+                        sd_bus_error_free(e);
+                        unit->load_state = UNIT_STUB;
+                        r = unit_load(unit);
+                        if (r < 0 || unit->load_state == UNIT_STUB)
+                                unit->load_state = UNIT_NOT_FOUND;
+                        r = bus_unit_validate_load_state(unit, e);
+                }
+                if (r < 0)
+                        return r;
+        }
+
+        if (!unit_job_is_applicable(unit, type))
+                return sd_bus_error_setf(e, BUS_ERROR_JOB_TYPE_NOT_APPLICABLE,
+                                         "Job type %s is not applicable for unit %s.",
+                                         job_type_to_string(type), unit->id);
+
+        /* First add the job. */
+        ret = transaction_add_one_job(tr, type, unit, &is_new);
+        if (!ret)
+                return -ENOMEM;
+
+        if (FLAGS_SET(flags, TRANSACTION_IGNORE_ORDER))
+                ret->ignore_order = true;
+
+        /* Then, add a link to the job. */
+        if (by) {
+                if (!job_dependency_new(by, ret, FLAGS_SET(flags, TRANSACTION_MATTERS), FLAGS_SET(flags, TRANSACTION_CONFLICTS)))
+                        return -ENOMEM;
+        } else {
+                /* If the job has no parent job, it is the anchor job. */
+                assert(!tr->anchor_job);
+                tr->anchor_job = ret;
+        }
+
+        /* Dependencies are only processed once per job, i.e. when it was newly added, and not at all if
+         * the caller asked to ignore requirements or for no-op jobs. */
+        if (!is_new || FLAGS_SET(flags, TRANSACTION_IGNORE_REQUIREMENTS) || type == JOB_NOP)
+                return 0;
+
+        _cleanup_set_free_ Set *following = NULL;
+        Unit *dep;
+
+        /* If we are following some other unit, make sure we add all dependencies of everybody following. */
+        if (unit_following_set(ret->unit, &following) > 0)
+                SET_FOREACH(dep, following) {
+                        r = transaction_add_job_and_dependencies(tr, type, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e);
+                        if (r < 0) {
+                                log_unit_full_errno(dep, r == -ERFKILL ? LOG_INFO : LOG_WARNING, r,
+                                                    "Cannot add dependency job, ignoring: %s",
+                                                    bus_error_message(e, r));
+                                sd_bus_error_free(e);
+                        }
+                }
+
+        /* Finally, recursively add in all dependencies. In the loops below only TRANSACTION_IGNORE_ORDER
+         * is inherited from the caller's flags; everything else is set per dependency kind. */
+        if (IN_SET(type, JOB_START, JOB_RESTART)) {
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        goto fail;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START_IGNORED) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e);
+                        if (r < 0) {
+                                /* unit masked, job type not applicable and unit not found are not considered
+                                 * as errors. */
+                                log_unit_full_errno(dep,
+                                                    IN_SET(r, -ERFKILL, -EBADR, -ENOENT) ? LOG_DEBUG : LOG_WARNING,
+                                                    r, "Cannot add dependency job, ignoring: %s",
+                                                    bus_error_message(e, r));
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_VERIFY) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_VERIFY_ACTIVE, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        goto fail;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, TRANSACTION_MATTERS | TRANSACTION_CONFLICTS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        goto fail;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP_IGNORED) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e);
+                        if (r < 0) {
+                                log_unit_warning(dep,
+                                                 "Cannot add dependency job, ignoring: %s",
+                                                 bus_error_message(e, r));
+                                sd_bus_error_free(e);
+                        }
+                }
+        }
+
+        if (IN_SET(type, JOB_RESTART, JOB_STOP) || (type == JOB_START && FLAGS_SET(flags, TRANSACTION_PROPAGATE_START_AS_RESTART))) {
+                bool is_stop = type == JOB_STOP;
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, is_stop ? UNIT_ATOM_PROPAGATE_STOP : UNIT_ATOM_PROPAGATE_RESTART) {
+                        /* We propagate RESTART only as TRY_RESTART, in order not to start dependencies that
+                         * are not around. */
+                        JobType nt;
+
+                        nt = job_type_collapse(is_stop ? JOB_STOP : JOB_TRY_RESTART, dep);
+                        if (nt == JOB_NOP)
+                                continue;
+
+                        r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                /* NOTE(review): unlike the loops above this returns directly instead of
+                                 * jumping to 'fail', i.e. 'ret' stays in the transaction — confirm that
+                                 * this is intended. */
+                                if (r != -EBADR) /* job type not applicable */
+                                        return r;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                /* Process UNIT_ATOM_PROPAGATE_STOP_GRACEFUL (PropagatesStopTo=) units. We need to wait until
+                 * all other dependencies are processed, i.e. we're the anchor job or already in the recursion
+                 * that handles it. */
+                if (!by || FLAGS_SET(flags, TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL))
+                        UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PROPAGATE_STOP_GRACEFUL) {
+                                JobType nt;
+                                Job *j;
+
+                                /* The job type depends on what the transaction already lists for the unit. */
+                                j = hashmap_get(tr->jobs, dep);
+                                nt = job_type_propagate_stop_graceful(j);
+
+                                if (nt == JOB_NOP)
+                                        continue;
+
+                                r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER) | TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL, e);
+                                if (r < 0) {
+                                        if (r != -EBADR) /* job type not applicable */
+                                                return r;
+
+                                        sd_bus_error_free(e);
+                                }
+                        }
+        }
+
+        if (type == JOB_RELOAD)
+                transaction_add_propagate_reload_jobs(tr, ret->unit, ret, flags & TRANSACTION_IGNORE_ORDER);
+
+        /* JOB_VERIFY_ACTIVE requires no dependency handling */
+
+        return 0;
+
+fail:
+        /* Recursive call failed to add required jobs so let's drop top level job as well. */
+        log_unit_debug_errno(unit, r, "Cannot add dependency job to transaction, deleting job %s/%s again: %s",
+                             unit->id, job_type_to_string(type), bus_error_message(e, r));
+
+        transaction_delete_job(tr, ret, /* delete_dependencies= */ false);
+        return r;
+}
+
+static bool shall_stop_on_isolate(Transaction *tr, Unit *u) {
+        assert(tr);
+        assert(u);
+
+        /* A unit is a candidate for stopping on isolate unless it opted out via ignore_on_isolate or
+         * the transaction already lists something for it. */
+        return !u->ignore_on_isolate &&
+                !hashmap_contains(tr->jobs, u);
+}
+
+int transaction_add_isolate_jobs(Transaction *tr, Manager *m) {
+        Unit *u;
+        char *k;
+
+        assert(tr);
+        assert(m);
+
+        /* Walks all units of the manager and adds a stop job, pulled in by the anchor job, for each one
+         * that shall be stopped on isolate. Failures are logged and otherwise ignored; always returns 0. */
+        HASHMAP_FOREACH_KEY(u, k, m->units) {
+                _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+                bool triggered_by_kept_unit = false;
+                Unit *trigger;
+                int r;
+
+                /* Ignore aliases, i.e. entries whose key is not the unit's own id */
+                if (u->id != k)
+                        continue;
+
+                /* No need to stop inactive units, nor units that shall not be stopped on isolate */
+                if ((UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job) ||
+                    !shall_stop_on_isolate(tr, u))
+                        continue;
+
+                /* Keep units that are triggered by units we want to keep around. */
+                UNIT_FOREACH_DEPENDENCY(trigger, u, UNIT_ATOM_TRIGGERED_BY) {
+                        if (shall_stop_on_isolate(tr, trigger))
+                                continue;
+
+                        triggered_by_kept_unit = true;
+                        break;
+                }
+                if (triggered_by_kept_unit)
+                        continue;
+
+                r = transaction_add_job_and_dependencies(tr, JOB_STOP, u, tr->anchor_job, TRANSACTION_MATTERS, &e);
+                if (r >= 0)
+                        continue;
+
+                log_unit_warning_errno(u, r, "Cannot add isolate job, ignoring: %s", bus_error_message(&e, r));
+        }
+
+        return 0;
+}
+
+int transaction_add_triggering_jobs(Transaction *tr, Unit *u) {
+        Unit *trigger;
+
+        assert(tr);
+        assert(u);
+
+        /* Adds a stop job, pulled in by the anchor job, for each unit that 'u' is triggered by. Failures
+         * are logged and otherwise ignored; always returns 0. */
+        UNIT_FOREACH_DEPENDENCY(trigger, u, UNIT_ATOM_TRIGGERED_BY) {
+                _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+                int r;
+
+                /* No need to stop inactive units, and nothing to do either if the transaction already
+                 * lists something for this trigger. */
+                if ((UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger)) && !trigger->job) ||
+                    hashmap_contains(tr->jobs, trigger))
+                        continue;
+
+                r = transaction_add_job_and_dependencies(tr, JOB_STOP, trigger, tr->anchor_job, TRANSACTION_MATTERS, &e);
+                if (r >= 0)
+                        continue;
+
+                log_unit_warning_errno(u, r, "Cannot add triggered by job, ignoring: %s", bus_error_message(&e, r));
+        }
+
+        return 0;
+}
+
+Transaction *transaction_new(bool irreversible) {
+        Transaction *t;
+
+        /* Allocates an empty transaction. Returns NULL if either the object or its job hashmap cannot
+         * be allocated. */
+
+        t = new0(Transaction, 1);
+        if (!t)
+                return NULL;
+
+        t->irreversible = irreversible;
+
+        t->jobs = hashmap_new(NULL);
+        if (t->jobs)
+                return t;
+
+        /* No hashmap, hence give up and release the half-initialized object again */
+        return mfree(t);
+}
+
+Transaction *transaction_free(Transaction *tr) {
+        /* Releases a transaction that carries no jobs anymore. NULL is accepted and simply passed
+         * through to mfree(). */
+        if (tr) {
+                assert(hashmap_isempty(tr->jobs));
+                hashmap_free(tr->jobs);
+        }
+
+        return mfree(tr);
+}
+
+Transaction *transaction_abort_and_free(Transaction *tr) {
+        /* Aborts the transaction (if there is one) and then releases it via transaction_free(), whose
+         * return value is handed back to the caller. */
+        if (tr)
+                transaction_abort(tr);
+
+        return transaction_free(tr);
+}
diff --git a/src/core/transaction.h b/src/core/transaction.h
new file mode 100644
index 0000000..151e02d
--- /dev/null
+++ b/src/core/transaction.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Transaction Transaction;
+
+#include "hashmap.h"
+#include "job.h"
+#include "manager.h"
+#include "unit.h"
+
+struct Transaction {
+        /* Jobs to be added */
+        Hashmap *jobs;      /* Unit object => head of that unit's list of prospective Job objects, 1:1 */
+        Job *anchor_job;      /* the job the user asked for, i.e. the one added without a parent job */
+        bool irreversible;    /* copied into the 'irreversible' field of every job created for this transaction */
+};
+
+/* Allocates an empty transaction; returns NULL on allocation failure. */
+Transaction *transaction_new(bool irreversible);
+/* Frees a transaction whose job map is empty (this is asserted); accepts NULL. */
+Transaction *transaction_free(Transaction *tr);
+/* Calls transaction_abort() on the transaction and then frees it; accepts NULL. */
+Transaction *transaction_abort_and_free(Transaction *tr);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Transaction*, transaction_abort_and_free);
+
+typedef enum TransactionAddFlags {
+        TRANSACTION_MATTERS                         = 1 << 0, /* recorded as the 'matters' property of the dependency link to the parent job */
+        TRANSACTION_CONFLICTS                       = 1 << 1, /* passed to job_dependency_new() along with the 'matters' property */
+        TRANSACTION_IGNORE_REQUIREMENTS             = 1 << 2, /* add only the job itself, do not pull in jobs for the unit's dependencies */
+        TRANSACTION_IGNORE_ORDER                    = 1 << 3, /* set 'ignore_order' on the job; inherited by the jobs it pulls in */
+
+        /* Propagate a START job to other units like a RESTART */
+        TRANSACTION_PROPAGATE_START_AS_RESTART      = 1 << 4,
+
+        /* Indicate that we're in the recursion for processing UNIT_ATOM_PROPAGATE_STOP_GRACEFUL units */
+        TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL = 1 << 5,
+} TransactionAddFlags;
+
+/* Adds reload jobs, pulled in by 'by', for the units that 'unit' propagates reloads to. Failures are only
+ * logged. */
+void transaction_add_propagate_reload_jobs(
+                Transaction *tr,
+                Unit *unit, Job *by,
+                TransactionAddFlags flags);
+
+/* Adds a job of the given type for 'unit' plus the jobs implied by its dependencies. If 'by' is NULL the new
+ * job becomes the anchor job of the transaction. Returns 0 on success, a negative error otherwise. */
+int transaction_add_job_and_dependencies(
+                Transaction *tr,
+                JobType type,
+                Unit *unit,
+                Job *by,
+                TransactionAddFlags flags,
+                sd_bus_error *e);
+
+/* Validates the transaction and installs its jobs as active jobs of the manager. */
+int transaction_activate(Transaction *tr, Manager *m, JobMode mode, Set *affected, sd_bus_error *e);
+/* Adds stop jobs for the units of the manager that shall be stopped on isolate. */
+int transaction_add_isolate_jobs(Transaction *tr, Manager *m);
+/* Adds stop jobs for the units that 'u' is triggered by. */
+int transaction_add_triggering_jobs(Transaction *tr, Unit *u);
diff --git a/src/core/unit-dependency-atom.c b/src/core/unit-dependency-atom.c
new file mode 100644
index 0000000..35b279b
--- /dev/null
+++ b/src/core/unit-dependency-atom.c
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "unit-dependency-atom.h"
+
+static const UnitDependencyAtom atom_map[_UNIT_DEPENDENCY_MAX] = {
+        /* A table that maps high-level dependency types to low-level dependency "atoms". The latter actually
+         * describe specific facets of dependency behaviour. The former combine them into one user-facing
+         * concept. Atoms are a bit mask, though a bunch of dependency types have only a single bit set.
+         *
+         * Typically when the user configures a dependency they go via dependency type, but when we act on
+         * them we go by atom.
+         *
+         * NB: when you add a new dependency type here, make sure to also add one to the (best-effort)
+         * reverse table in unit_dependency_from_unique_atom() further down. */
+
+        [UNIT_REQUIRES]               = UNIT_ATOM_PULL_IN_START |
+                                        UNIT_ATOM_RETROACTIVE_START_REPLACE |
+                                        UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                                        UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+        [UNIT_REQUISITE]              = UNIT_ATOM_PULL_IN_VERIFY |
+                                        UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                                        UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+        [UNIT_WANTS]                  = UNIT_ATOM_PULL_IN_START_IGNORED |
+                                        UNIT_ATOM_RETROACTIVE_START_FAIL |
+                                        UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                                        UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+        [UNIT_BINDS_TO]               = UNIT_ATOM_PULL_IN_START |
+                                        UNIT_ATOM_RETROACTIVE_START_REPLACE |
+                                        UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT |
+                                        UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                                        UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+        [UNIT_PART_OF]                = UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+        [UNIT_UPHOLDS]                = UNIT_ATOM_PULL_IN_START_IGNORED |
+                                        UNIT_ATOM_RETROACTIVE_START_REPLACE |
+                                        UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE |
+                                        UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                                        UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+        [UNIT_REQUIRED_BY]            = UNIT_ATOM_PROPAGATE_STOP |
+                                        UNIT_ATOM_PROPAGATE_RESTART |
+                                        UNIT_ATOM_PROPAGATE_START_FAILURE |
+                                        UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+                                        UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+        [UNIT_REQUISITE_OF]           = UNIT_ATOM_PROPAGATE_STOP |
+                                        UNIT_ATOM_PROPAGATE_RESTART |
+                                        UNIT_ATOM_PROPAGATE_START_FAILURE |
+                                        UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE |
+                                        UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+                                        UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+        [UNIT_WANTED_BY]              = UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+                                        UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED,
+
+        [UNIT_BOUND_BY]               = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+                                        UNIT_ATOM_PROPAGATE_STOP |
+                                        UNIT_ATOM_PROPAGATE_RESTART |
+                                        UNIT_ATOM_PROPAGATE_START_FAILURE |
+                                        UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+                                        UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE |
+                                        UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+        [UNIT_UPHELD_BY]              = UNIT_ATOM_START_STEADILY |
+                                        UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+                                        UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED,
+
+        [UNIT_CONSISTS_OF]            = UNIT_ATOM_PROPAGATE_STOP |
+                                        UNIT_ATOM_PROPAGATE_RESTART,
+
+        [UNIT_CONFLICTS]              = UNIT_ATOM_PULL_IN_STOP |
+                                        UNIT_ATOM_RETROACTIVE_STOP_ON_START,
+
+        [UNIT_CONFLICTED_BY]          = UNIT_ATOM_PULL_IN_STOP_IGNORED |
+                                        UNIT_ATOM_RETROACTIVE_STOP_ON_START |
+                                        UNIT_ATOM_PROPAGATE_STOP_FAILURE,
+
+        [UNIT_PROPAGATES_STOP_TO]     = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+                                        UNIT_ATOM_PROPAGATE_STOP_GRACEFUL,
+
+        /* These are simple dependency types: they consist of a single atom only */
+        [UNIT_ON_FAILURE]             = UNIT_ATOM_ON_FAILURE,
+        [UNIT_ON_SUCCESS]             = UNIT_ATOM_ON_SUCCESS,
+        [UNIT_ON_FAILURE_OF]          = UNIT_ATOM_ON_FAILURE_OF,
+        [UNIT_ON_SUCCESS_OF]          = UNIT_ATOM_ON_SUCCESS_OF,
+        [UNIT_BEFORE]                 = UNIT_ATOM_BEFORE,
+        [UNIT_AFTER]                  = UNIT_ATOM_AFTER,
+        [UNIT_TRIGGERS]               = UNIT_ATOM_TRIGGERS,
+        [UNIT_TRIGGERED_BY]           = UNIT_ATOM_TRIGGERED_BY,
+        [UNIT_PROPAGATES_RELOAD_TO]   = UNIT_ATOM_PROPAGATES_RELOAD_TO,
+        [UNIT_JOINS_NAMESPACE_OF]     = UNIT_ATOM_JOINS_NAMESPACE_OF,
+        [UNIT_REFERENCES]             = UNIT_ATOM_REFERENCES,
+        [UNIT_REFERENCED_BY]          = UNIT_ATOM_REFERENCED_BY,
+        [UNIT_IN_SLICE]               = UNIT_ATOM_IN_SLICE,
+        [UNIT_SLICE_OF]               = UNIT_ATOM_SLICE_OF,
+
+        /* These are dependency types without effect on our state engine. We maintain them only to make
+         * things discoverable/debuggable as they are the inverse dependencies to some of the above. As they
+         * have no effect of their own, they all map to no atoms at all, i.e. the value 0. */
+        [UNIT_RELOAD_PROPAGATED_FROM] = 0,
+        [UNIT_STOP_PROPAGATED_FROM]   = 0,
+};
+
+UnitDependencyAtom unit_dependency_to_atom(UnitDependency d) {
+        /* Returns the atom mask stored in atom_map[] for 'd', or the invalid marker if 'd' is negative. */
+        assert(d < _UNIT_DEPENDENCY_MAX);
+
+        if (d >= 0)
+                return atom_map[d];
+        return _UNIT_DEPENDENCY_ATOM_INVALID;
+}
+
+UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom) {
+
+        /* This is a "best-effort" function that maps the specified 'atom' mask to a dependency type that
+         * is equal to or has a superset of bits set if that's uniquely possible. The idea is that this
+         * function is used when iterating through deps that have a specific atom: if there's exactly one
+         * dependency type of the specific atom we don't need to iterate through all deps a unit has, but
+         * can pinpoint things directly.
+         *
+         * This function will return _UNIT_DEPENDENCY_INVALID in case the specified value is not known or not
+         * uniquely defined, i.e. there are multiple dependencies with the atom or the combination set. */
+
+        switch ((int64_t) atom) {
+
+                /* Note that we can't list UNIT_REQUIRES here since it's a true subset of UNIT_BINDS_TO, and
+                 * hence its atom bits are not uniquely mappable. */
+
+        case UNIT_ATOM_PULL_IN_VERIFY |
+                UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+        case UNIT_ATOM_PULL_IN_VERIFY: /* a single dep type uses this atom */
+                return UNIT_REQUISITE;
+
+        case UNIT_ATOM_PULL_IN_START_IGNORED |
+                UNIT_ATOM_RETROACTIVE_START_FAIL |
+                UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+        case UNIT_ATOM_RETROACTIVE_START_FAIL:
+                return UNIT_WANTS;
+
+        case UNIT_ATOM_PULL_IN_START |
+                UNIT_ATOM_RETROACTIVE_START_REPLACE |
+                UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT |
+                UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+        case UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT:
+                return UNIT_BINDS_TO;
+
+        case UNIT_ATOM_PULL_IN_START_IGNORED |
+                UNIT_ATOM_RETROACTIVE_START_REPLACE |
+                UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE |
+                UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+                UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+        case UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE:
+                return UNIT_UPHOLDS;
+
+        case UNIT_ATOM_PROPAGATE_STOP |
+                UNIT_ATOM_PROPAGATE_RESTART |
+                UNIT_ATOM_PROPAGATE_START_FAILURE |
+                UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE |
+                UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+                UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES:
+        case UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE:
+                return UNIT_REQUISITE_OF;
+
+        case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+                UNIT_ATOM_PROPAGATE_STOP |
+                UNIT_ATOM_PROPAGATE_RESTART |
+                UNIT_ATOM_PROPAGATE_START_FAILURE |
+                UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+                UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE |
+                UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES:
+        case UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE:
+                return UNIT_BOUND_BY;
+
+        case UNIT_ATOM_START_STEADILY |
+                UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+                UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED:
+        case UNIT_ATOM_START_STEADILY:
+                return UNIT_UPHELD_BY;
+
+        case UNIT_ATOM_PULL_IN_STOP |
+                UNIT_ATOM_RETROACTIVE_STOP_ON_START:
+        case UNIT_ATOM_PULL_IN_STOP:
+                return UNIT_CONFLICTS;
+
+        case UNIT_ATOM_PULL_IN_STOP_IGNORED |
+                UNIT_ATOM_RETROACTIVE_STOP_ON_START |
+                UNIT_ATOM_PROPAGATE_STOP_FAILURE:
+        case UNIT_ATOM_PULL_IN_STOP_IGNORED:
+        case UNIT_ATOM_PROPAGATE_STOP_FAILURE:
+                return UNIT_CONFLICTED_BY;
+
+        case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+                UNIT_ATOM_PROPAGATE_STOP_GRACEFUL:
+        case UNIT_ATOM_PROPAGATE_STOP_GRACEFUL:
+                return UNIT_PROPAGATES_STOP_TO;
+
+        /* And now, the simple ones */
+
+        case UNIT_ATOM_ON_FAILURE:
+                return UNIT_ON_FAILURE;
+
+        case UNIT_ATOM_ON_SUCCESS:
+                return UNIT_ON_SUCCESS;
+
+        case UNIT_ATOM_ON_SUCCESS_OF:
+                return UNIT_ON_SUCCESS_OF;
+
+        case UNIT_ATOM_ON_FAILURE_OF:
+                return UNIT_ON_FAILURE_OF;
+
+        case UNIT_ATOM_BEFORE:
+                return UNIT_BEFORE;
+
+        case UNIT_ATOM_AFTER:
+                return UNIT_AFTER;
+
+        case UNIT_ATOM_TRIGGERS:
+                return UNIT_TRIGGERS;
+
+        case UNIT_ATOM_TRIGGERED_BY:
+                return UNIT_TRIGGERED_BY;
+
+        case UNIT_ATOM_PROPAGATES_RELOAD_TO:
+                return UNIT_PROPAGATES_RELOAD_TO;
+
+        case UNIT_ATOM_JOINS_NAMESPACE_OF:
+                return UNIT_JOINS_NAMESPACE_OF;
+
+        case UNIT_ATOM_REFERENCES:
+                return UNIT_REFERENCES;
+
+        case UNIT_ATOM_REFERENCED_BY:
+                return UNIT_REFERENCED_BY;
+
+        case UNIT_ATOM_IN_SLICE:
+                return UNIT_IN_SLICE;
+
+        case UNIT_ATOM_SLICE_OF:
+                return UNIT_SLICE_OF;
+
+        default:
+                return _UNIT_DEPENDENCY_INVALID;
+        }
+}
diff --git a/src/core/unit-dependency-atom.h b/src/core/unit-dependency-atom.h
new file mode 100644
index 0000000..96f00ca
--- /dev/null
+++ b/src/core/unit-dependency-atom.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+#include "unit-def.h"
+
+/* Flags that identify the various "atomic" behaviours a specific dependency type implies. Each dependency is
+ * a combination of one or more of these flags that define what they actually entail. */
+typedef enum UnitDependencyAtom {
+
+        /* This unit pulls in the other unit as JOB_START job into the transaction, and if that doesn't work
+         * the transaction fails. */
+        UNIT_ATOM_PULL_IN_START                       = UINT64_C(1) << 0,
+        /* Similar, but if it doesn't work, ignore. */
+        UNIT_ATOM_PULL_IN_START_IGNORED               = UINT64_C(1) << 1,
+        /* Pull in a JOB_VERIFY job into the transaction, i.e. pull in JOB_VERIFY rather than
+         * JOB_START. i.e. check the unit is started but don't pull it in. */
+        UNIT_ATOM_PULL_IN_VERIFY                      = UINT64_C(1) << 2,
+
+        /* Pull in a JOB_STOP job for the other job into transactions, and fail if that doesn't work. */
+        UNIT_ATOM_PULL_IN_STOP                        = UINT64_C(1) << 3,
+        /* Same, but don't fail, ignore it. */
+        UNIT_ATOM_PULL_IN_STOP_IGNORED                = UINT64_C(1) << 4,
+
+        /* If our unit enters inactive state, add the other unit to the StopWhenUnneeded= queue */
+        UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE        = UINT64_C(1) << 5,
+        /* Pin the other unit i.e. ensure StopWhenUnneeded= won't trigger for the other unit as long as we are
+         * not in inactive state */
+        UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED             = UINT64_C(1) << 6,
+
+        /* Stop our unit if the other unit happens to be inactive */
+        UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT            = UINT64_C(1) << 7,
+        /* If our unit enters inactive state, add the other unit to the BoundBy= queue */
+        UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE  = UINT64_C(1) << 8,
+
+        /* Start this unit whenever we find it inactive and the other unit active */
+        UNIT_ATOM_START_STEADILY                      = UINT64_C(1) << 9,
+        /* Whenever our unit becomes active, add other unit to start_when_upheld_queue */
+        UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE         = UINT64_C(1) << 10,
+
+        /* If our unit unexpectedly becomes active, retroactively start the other unit too, in "replace" job
+         * mode */
+        UNIT_ATOM_RETROACTIVE_START_REPLACE           = UINT64_C(1) << 11,
+        /* Similar, but in "fail" job mode */
+        UNIT_ATOM_RETROACTIVE_START_FAIL              = UINT64_C(1) << 12,
+        /* If our unit unexpectedly becomes active, retroactively stop the other unit too */
+        UNIT_ATOM_RETROACTIVE_STOP_ON_START           = UINT64_C(1) << 13,
+        /* If our unit unexpectedly becomes inactive, retroactively stop the other unit too */
+        UNIT_ATOM_RETROACTIVE_STOP_ON_STOP            = UINT64_C(1) << 14,
+
+        /* If a start job for this unit fails, propagate the failure to start job of other unit too */
+        UNIT_ATOM_PROPAGATE_START_FAILURE             = UINT64_C(1) << 15,
+        /* If a stop job for this unit fails, propagate the failure to any stop job of the other unit too */
+        UNIT_ATOM_PROPAGATE_STOP_FAILURE              = UINT64_C(1) << 16,
+        /* If our start job succeeded but the unit is inactive then (think: oneshot units), propagate this as
+         * failure to the other unit. */
+        UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE = UINT64_C(1) << 17,
+        /* When putting together a transaction, propagate JOB_STOP from our unit to the other. */
+        UNIT_ATOM_PROPAGATE_STOP                      = UINT64_C(1) << 18,
+        /* Like UNIT_ATOM_PROPAGATE_STOP, but enqueues a restart job if there's already a start job (avoids
+         * job type conflict). */
+        UNIT_ATOM_PROPAGATE_STOP_GRACEFUL             = UINT64_C(1) << 19,
+        /* When putting together a transaction, propagate JOB_RESTART from our unit to the other. */
+        UNIT_ATOM_PROPAGATE_RESTART                   = UINT64_C(1) << 20,
+
+        /* Add the other unit to the default target dependency queue */
+        UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE = UINT64_C(1) << 21,
+        /* Recheck default target deps on other units (which are target units) */
+        UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES         = UINT64_C(1) << 22,
+
+        /* The remaining atoms map 1:1 to the equally named high-level deps */
+        UNIT_ATOM_ON_FAILURE                          = UINT64_C(1) << 23,
+        UNIT_ATOM_ON_SUCCESS                          = UINT64_C(1) << 24,
+        UNIT_ATOM_ON_FAILURE_OF                       = UINT64_C(1) << 25,
+        UNIT_ATOM_ON_SUCCESS_OF                       = UINT64_C(1) << 26,
+        UNIT_ATOM_BEFORE                              = UINT64_C(1) << 27,
+        UNIT_ATOM_AFTER                               = UINT64_C(1) << 28,
+        UNIT_ATOM_TRIGGERS                            = UINT64_C(1) << 29,
+        UNIT_ATOM_TRIGGERED_BY                        = UINT64_C(1) << 30,
+        UNIT_ATOM_PROPAGATES_RELOAD_TO                = UINT64_C(1) << 31,
+        UNIT_ATOM_JOINS_NAMESPACE_OF                  = UINT64_C(1) << 32,
+        UNIT_ATOM_REFERENCES                          = UINT64_C(1) << 33,
+        UNIT_ATOM_REFERENCED_BY                       = UINT64_C(1) << 34,
+        UNIT_ATOM_IN_SLICE                            = UINT64_C(1) << 35,
+        UNIT_ATOM_SLICE_OF                            = UINT64_C(1) << 36,
+        _UNIT_DEPENDENCY_ATOM_MAX                     = (UINT64_C(1) << 37) - 1,
+        _UNIT_DEPENDENCY_ATOM_INVALID                 = -EINVAL,
+} UnitDependencyAtom;
+
+UnitDependencyAtom unit_dependency_to_atom(UnitDependency d);
+UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom);
diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c
new file mode 100644
index 0000000..9f95984
--- /dev/null
+++ b/src/core/unit-printf.c
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "cgroup-util.h"
+#include "format-util.h"
+#include "macro.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit-printf.h"
+#include "unit.h"
+#include "user-util.h"
+
+static int specifier_prefix_and_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %N: hand the unit's id to unit_name_to_prefix_and_instance() and return its result unchanged. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        return unit_name_to_prefix_and_instance(unit->id, ret);
+}
+
+static int specifier_prefix(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %p: hand the unit's id to unit_name_to_prefix() and return its result unchanged. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        return unit_name_to_prefix(unit->id, ret);
+}
+
+static int specifier_prefix_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %P: like %p, but the prefix is additionally passed through unit_name_unescape(). */
+        const Unit *unit = ASSERT_PTR(userdata);
+        _cleanup_free_ char *escaped = NULL;
+        int r = unit_name_to_prefix(unit->id, &escaped);
+
+        if (r < 0)
+                return r;
+
+        return unit_name_unescape(escaped, ret);
+}
+
+static int specifier_instance_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %I: the unit's instance, passed through strempty() first and then through unit_name_unescape(). */
+        const Unit *unit = ASSERT_PTR(userdata);
+        return unit_name_unescape(strempty(unit->instance), ret);
+}
+
+static int specifier_last_component(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %j: the part of the prefix after its last '-', or the whole prefix if it contains none. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        _cleanup_free_ char *prefix = NULL;
+        const char *last_dash;
+        int r = unit_name_to_prefix(unit->id, &prefix);
+
+        if (r < 0)
+                return r;
+
+        last_dash = strrchr(prefix, '-');
+        if (!last_dash) {
+                *ret = TAKE_PTR(prefix); /* no dash: the prefix string itself becomes the result */
+                return 0;
+        }
+        return specifier_string(specifier, last_dash + 1, root, userdata, ret);
+}
+
+static int specifier_last_component_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %J: the result of specifier_last_component(), passed through unit_name_unescape(). */
+        _cleanup_free_ char *component = NULL;
+        int r = specifier_last_component(specifier, data, root, userdata, &component);
+
+        if (r < 0)
+                return r;
+
+        return unit_name_unescape(component, ret);
+}
+
+static int specifier_filename(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %f: path-unescape the instance if the unit has one, otherwise convert the unit id to a path. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        if (!unit->instance)
+                return unit_name_to_path(unit->id, ret);
+
+        return unit_name_path_unescape(unit->instance, ret);
+}
+
+static void bad_specifier(const Unit *u, char specifier) { /* logs a deprecation warning that names 'specifier' */
+        log_unit_warning(u, "Specifier '%%%c' used in unit configuration, which is deprecated. Please update your unit file, as it does not work as intended.", specifier);
+}
+
+static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %c (deprecated): copy of the unit's cgroup path, or the default cgroup path if none is set. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        char *copy;
+
+        /* Every use of this specifier triggers the deprecation warning. */
+        bad_specifier(unit, specifier);
+
+        if (!unit->cgroup_path)
+                return unit_default_cgroup_path(unit, ret);
+
+        copy = strdup(unit->cgroup_path);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %R (deprecated): a newly allocated copy of the manager's cgroup root. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        char *copy;
+
+        bad_specifier(unit, specifier);
+
+        copy = strdup(unit->manager->cgroup_root);
+        if (!copy)
+                return -ENOMEM;
+        *ret = copy;
+        return 0;
+}
+
+static int specifier_cgroup_slice(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %r (deprecated): cgroup path of the unit's slice, or the manager's cgroup root without a slice. */
+        const Unit *unit = ASSERT_PTR(userdata), *parent;
+        const char *path;
+        char *copy;
+
+        bad_specifier(unit, specifier);
+
+        parent = UNIT_GET_SLICE(unit);
+        if (parent && !parent->cgroup_path) /* slice without a cgroup path: use its default path */
+                return unit_default_cgroup_path(parent, ret);
+
+        path = parent ? parent->cgroup_path : unit->manager->cgroup_root;
+        copy = strdup(path);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+static int specifier_special_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %C/%E/%L/%S/%t: 'data' carries the index into the manager's prefix[] array. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        unsigned idx = PTR_TO_UINT(data);
+        char *copy = strdup(unit->manager->prefix[idx]);
+
+        if (!copy)
+                return -ENOMEM;
+        *ret = copy;
+        return 0;
+}
+
+static int specifier_credentials_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        /* %d: the runtime prefix, "/credentials/" and the unit id joined into one new string. */
+        const Unit *unit = ASSERT_PTR(userdata);
+        char *dir;
+
+        assert(ret);
+
+        dir = strjoin(unit->manager->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", unit->id);
+        if (!dir)
+                return -ENOMEM;
+        *ret = dir;
+        return 0;
+}
+
+int unit_name_printf(const Unit *u, const char* format, char **ret) {
+        /*
+         * This will use the passed string as format string and replace the following specifiers (which should all be
+         * safe for inclusion in unit names):
+         *
+         * %n: the full id of the unit                 (foo-aaa@bar.waldo)
+         * %N: the id of the unit without the suffix   (foo-aaa@bar)
+         * %p: the prefix                              (foo-aaa)
+         * %i: the instance                            (bar)
+         * %j: the last component of the prefix        (aaa)
+         */
+
+        assert(u); /* checked before the table is built, as its initializers already dereference u */
+        assert(format);
+        assert(ret);
+
+        const Specifier table[] = {
+                { 'i', specifier_string,              u->instance },
+                { 'j', specifier_last_component,      NULL },
+                { 'n', specifier_string,              u->id },
+                { 'N', specifier_prefix_and_instance, NULL },
+                { 'p', specifier_prefix,              NULL },
+
+                COMMON_SYSTEM_SPECIFIERS,
+
+                COMMON_CREDS_SPECIFIERS(u->manager->runtime_scope),
+                {}
+        };
+
+        return specifier_printf(format, UNIT_NAME_MAX, table, NULL, u, ret);
+}
+
+int unit_full_printf_full(const Unit *u, const char *format, size_t max_length, char **ret) {
+        /* This is similar to unit_name_printf() but also supports unescaping. Also, adds a couple of
+         * additional codes (which are likely not suitable for unescaped inclusion in unit names):
+         *
+         * %f: the unescaped instance if set, otherwise the id unescaped as path
+         *
+         * %c: cgroup path of unit (deprecated)
+         * %r: where units in this slice are placed in the cgroup tree (deprecated)
+         * %R: the root of this systemd's instance tree (deprecated)
+         *
+         * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME)
+         * %d: the credentials directory ($CREDENTIALS_DIRECTORY)
+         * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME)
+         * %L: the log directory root (e.g. /var/log or $XDG_STATE_HOME/log)
+         * %S: the state directory root (e.g. /var/lib or $XDG_STATE_HOME)
+         * %t: the runtime directory root (e.g. /run or $XDG_RUNTIME_DIR)
+         *
+         * %h: the homedir of the running user
+         * %s: the shell of the running user
+         *
+         * NOTICE: When you add new entries here, please be careful: specifiers which depend on settings of
+         * the unit file itself are broken by design, as they would resolve differently depending on whether
+         * they are used before or after the relevant configuration setting. Hence: don't add them.
+         */
+
+        assert(u);
+        assert(format);
+        assert(ret);
+
+        const Specifier table[] = {
+                { 'i', specifier_string,                   u->instance },
+                { 'I', specifier_instance_unescaped,       NULL },
+                { 'j', specifier_last_component,           NULL },
+                { 'J', specifier_last_component_unescaped, NULL },
+                { 'n', specifier_string,                   u->id },
+                { 'N', specifier_prefix_and_instance,      NULL },
+                { 'p', specifier_prefix,                   NULL },
+                { 'P', specifier_prefix_unescaped,         NULL },
+
+                { 'f', specifier_filename,                 NULL },
+                { 'y', specifier_real_path,                u->fragment_path },  /* not listed above: expanded from the unit's fragment path */
+                { 'Y', specifier_real_directory,           u->fragment_path },  /* not listed above: expanded from the unit's fragment path */
+
+                { 'c', specifier_cgroup,                   NULL },  /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+                { 'r', specifier_cgroup_slice,             NULL },  /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+                { 'R', specifier_cgroup_root,              NULL },  /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+
+                { 'C', specifier_special_directory,        UINT_TO_PTR(EXEC_DIRECTORY_CACHE) },
+                { 'd', specifier_credentials_dir,          NULL },
+                { 'E', specifier_special_directory,        UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) },
+                { 'L', specifier_special_directory,        UINT_TO_PTR(EXEC_DIRECTORY_LOGS) },
+                { 'S', specifier_special_directory,        UINT_TO_PTR(EXEC_DIRECTORY_STATE) },
+                { 't', specifier_special_directory,        UINT_TO_PTR(EXEC_DIRECTORY_RUNTIME) },
+
+                { 'h', specifier_user_home,                NULL },
+                { 's', specifier_user_shell,               NULL },
+
+                COMMON_SYSTEM_SPECIFIERS,
+
+                COMMON_CREDS_SPECIFIERS(u->manager->runtime_scope),
+
+                COMMON_TMP_SPECIFIERS,
+                {}
+        };
+        /* Expand 'format' with the table above; 'u' is what the specifier callbacks receive as userdata. */
+        return specifier_printf(format, max_length, table, NULL, u, ret);
+}
diff --git a/src/core/unit-printf.h b/src/core/unit-printf.h
new file mode 100644
index 0000000..2df07db
--- /dev/null
+++ b/src/core/unit-printf.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "creds-util.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "unit.h"
+
+int unit_name_printf(const Unit *u, const char* text, char **ret);
+int unit_full_printf_full(const Unit *u, const char *text, size_t max_length, char **ret);
+static inline int unit_full_printf(const Unit *u, const char *text, char **ret) { /* unit_full_printf_full() with max_length LONG_LINE_MAX */
+        return unit_full_printf_full(u, text, LONG_LINE_MAX, ret);
+}
+static inline int unit_path_printf(const Unit *u, const char *text, char **ret) { /* same, with max_length PATH_MAX-1 */
+        return unit_full_printf_full(u, text, PATH_MAX-1, ret);
+}
+static inline int unit_fd_printf(const Unit *u, const char *text, char **ret) { /* same, with max_length FDNAME_MAX */
+        return unit_full_printf_full(u, text, FDNAME_MAX, ret);
+}
+static inline int unit_cred_printf(const Unit *u, const char *text, char **ret) { /* same, with max_length CREDENTIAL_NAME_MAX */
+        return unit_full_printf_full(u, text, CREDENTIAL_NAME_MAX, ret);
+}
+static inline int unit_env_printf(const Unit *u, const char *text, char **ret) { /* same, with max_length taken from sc_arg_max() */
+        return unit_full_printf_full(u, text, sc_arg_max(), ret);
+}
diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c
new file mode 100644
index 0000000..fe4221c
--- /dev/null
+++ b/src/core/unit-serialize.c
@@ -0,0 +1,890 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-socket-bind.h"
+#include "bus-util.h"
+#include "dbus.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "parse-util.h"
+#include "restrict-ifaces.h"
+#include "serialize.h"
+#include "string-table.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+
+static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
+        /* Writes 'mask' in string form under 'key'; an empty mask is skipped and yields 0. */
+        _cleanup_free_ char *text = NULL;
+        int r;
+
+        assert(f);
+        assert(key);
+        if (mask == 0)
+                return 0;
+
+        r = cg_mask_to_string(mask, &text);
+        if (r >= 0)
+                return serialize_item(f, key, text);
+
+        return log_error_errno(r, "Failed to format cgroup mask: %m");
+}
+
+/* Make sure our values fit in the bitfield. */
+assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
+
+static int serialize_markers(FILE *f, unsigned markers) {
+        /* Writes "markers=" followed by the space-separated names of all set markers; no-op if none are set. */
+        assert(f);
+        if (markers == 0)
+                return 0;
+
+        fputs("markers=", f);
+        for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++)
+                if (FLAGS_SET(markers, 1u << m)) /* a lower bit being set means a name was already written */
+                        fprintf(f, "%s%s", (markers & ((1u << m) - 1)) != 0 ? " " : "", unit_marker_to_string(m));
+        fputc('\n', f);
+        return 0;
+}
+
+static int deserialize_markers(Unit *u, const char *value) {
+        /* Sets in u->markers the bit of every known marker named in 'value'; unknown words are skipped. */
+        const char *cursor = value;
+
+        assert(u);
+        assert(value);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                int r = extract_first_word(&cursor, &word, NULL, 0);
+
+                if (r <= 0) /* stop on error (< 0) or when extract_first_word() reports 0 */
+                        return r;
+
+                UnitMarker marker = unit_marker_from_string(word);
+                if (marker >= 0)
+                        u->markers |= 1u << marker;
+                else
+                        log_unit_debug_errno(u, marker, "Unknown unit marker \"%s\", ignoring.", word);
+        }
+}
+
+static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+        [CGROUP_IP_INGRESS_BYTES]   = "ip-accounting-ingress-bytes",
+        [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
+        [CGROUP_IP_EGRESS_BYTES]    = "ip-accounting-egress-bytes",
+        [CGROUP_IP_EGRESS_PACKETS]  = "ip-accounting-egress-packets",
+};
+/* String lookup for the IP accounting field names above (presumably used like the IO tables; confirm). */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
+
+static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+        [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-base",
+        [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-base",
+        [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-base",
+        [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
+};
+/* Keys under which unit_serialize_state() writes u->io_accounting_base[]. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
+
+static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+        [CGROUP_IO_READ_BYTES]       = "io-accounting-read-bytes-last",
+        [CGROUP_IO_WRITE_BYTES]      = "io-accounting-write-bytes-last",
+        [CGROUP_IO_READ_OPERATIONS]  = "io-accounting-read-operations-last",
+        [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
+};
+/* Keys under which unit_serialize_state() writes u->io_accounting_last[] (entries equal to UINT64_MAX are skipped). */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
+
+static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
+        [CGROUP_MEMORY_PEAK]      = "memory-accounting-peak",
+        [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
+};
+/* Keys under which unit_serialize_state() writes the values obtained from unit_get_memory_accounting(). */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
+
+int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) {
+        int r;
+
+        assert(u);
+        assert(f);
+        assert(fds);
+
+        if (switching_root && UNIT_VTABLE(u)->exclude_from_switch_root_serialization) {
+                /* In the new root, paths for mounts and automounts will be different, so it doesn't make
+                 * much sense to serialize things. API file systems will be moved to the new root, but we
+                 * don't have mount units for those. */
+                log_unit_debug(u, "not serializing before switch-root");
+                return 0;
+        }
+
+        /* Start marker */
+        fputs(u->id, f);
+        fputc('\n', f);
+
+        assert(!!UNIT_VTABLE(u)->serialize == !!UNIT_VTABLE(u)->deserialize_item);
+
+        if (UNIT_VTABLE(u)->serialize) {
+                r = UNIT_VTABLE(u)->serialize(u, f, fds);
+                if (r < 0)
+                        return r;
+        }
+
+        (void) serialize_dual_timestamp(f, "state-change-timestamp", &u->state_change_timestamp);
+
+        (void) serialize_dual_timestamp(f, "inactive-exit-timestamp", &u->inactive_exit_timestamp);
+        (void) serialize_dual_timestamp(f, "active-enter-timestamp", &u->active_enter_timestamp);
+        (void) serialize_dual_timestamp(f, "active-exit-timestamp", &u->active_exit_timestamp);
+        (void) serialize_dual_timestamp(f, "inactive-enter-timestamp", &u->inactive_enter_timestamp);
+
+        (void) serialize_dual_timestamp(f, "condition-timestamp", &u->condition_timestamp);
+        (void) serialize_dual_timestamp(f, "assert-timestamp", &u->assert_timestamp);
+
+        (void) serialize_ratelimit(f, "start-ratelimit", &u->start_ratelimit);
+        (void) serialize_ratelimit(f, "auto-start-stop-ratelimit", &u->auto_start_stop_ratelimit);
+
+        if (dual_timestamp_is_set(&u->condition_timestamp))
+                (void) serialize_bool(f, "condition-result", u->condition_result);
+
+        if (dual_timestamp_is_set(&u->assert_timestamp))
+                (void) serialize_bool(f, "assert-result", u->assert_result);
+
+        (void) serialize_bool(f, "transient", u->transient);
+        (void) serialize_bool(f, "in-audit", u->in_audit);
+
+        (void) serialize_bool(f, "exported-invocation-id", u->exported_invocation_id);
+        (void) serialize_bool(f, "exported-log-level-max", u->exported_log_level_max);
+        (void) serialize_bool(f, "exported-log-extra-fields", u->exported_log_extra_fields);
+        (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval);
+        (void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_ratelimit_burst);
+
+        (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base);
+        if (u->cpu_usage_last != NSEC_INFINITY)
+                (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);
+
+        if (u->managed_oom_kill_last > 0)
+                (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last);
+
+        if (u->oom_kill_last > 0)
+                (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
+
+        for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
+                (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, u->io_accounting_base[im]);
+
+                if (u->io_accounting_last[im] != UINT64_MAX)
+                        (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, u->io_accounting_last[im]);
+        }
+
+        for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
+                uint64_t v;
+
+                r = unit_get_memory_accounting(u, metric, &v);
+                if (r >= 0)
+                        (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
+        }
+
+        if (u->cgroup_path)
+                (void) serialize_item(f, "cgroup", u->cgroup_path);
+
+        (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized);
+        (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask);
+        (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask);
+        (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask);
+
+        (void) bpf_serialize_socket_bind(u, f, fds);
+
+        (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", u->ip_bpf_ingress_installed);
+        (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", u->ip_bpf_egress_installed);
+        (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", u->bpf_device_control_installed);
+        (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", u->ip_bpf_custom_ingress_installed);
+        (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", u->ip_bpf_custom_egress_installed);
+
+        (void) serialize_restrict_network_interfaces(u, f, fds);
+
+        if (uid_is_valid(u->ref_uid))
+                (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid);
+        if (gid_is_valid(u->ref_gid))
+                (void) serialize_item_format(f, "ref-gid", GID_FMT, u->ref_gid);
+
+        if (!sd_id128_is_null(u->invocation_id))
+                (void) serialize_item_format(f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id));
+
+        (void) serialize_item_format(f, "freezer-state", "%s", freezer_state_to_string(unit_freezer_state(u)));
+        (void) serialize_markers(f, u->markers);
+
+        bus_track_serialize(u->bus_track, f, "ref");
+
+        for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+                uint64_t v;
+
+                r = unit_get_ip_accounting(u, m, &v);
+                if (r >= 0)
+                        (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
+        }
+
+        if (!switching_root) {
+                if (u->job) {
+                        fputs("job\n", f);
+                        job_serialize(u->job, f);
+                }
+
+                if (u->nop_job) {
+                        fputs("job\n", f);
+                        job_serialize(u->nop_job, f);
+                }
+        }
+
+        /* End marker */
+        fputc('\n', f);
+        return 0;
+}
+
+/* Reads one serialized job from f and installs it; failures are returned and the job is not kept. */
+static int unit_deserialize_job(Unit *u, FILE *f) {
+        _cleanup_(job_freep) Job *job = NULL;
+        int ret;
+
+        assert(u);
+        assert(f);
+
+        job = job_new_raw(u);
+        if (!job)
+                return log_oom();
+
+        ret = job_deserialize(job, f);
+        if (ret >= 0)
+                ret = job_install_deserialized(job);
+        if (ret < 0)
+                return ret; /* 'job' is handed to job_freep by its _cleanup_ attribute */
+
+        /* Installed successfully: clear the local pointer so the cleanup handler leaves the job alone. */
+        TAKE_PTR(job);
+        return 0;
+}
+
+/* True iff l equals key; parse_func(v) is then stored in target, or logged via u and ignored if negative. */
+#define MATCH_DESERIALIZE(key, l, v, parse_func, target)                \
+        ({                                                              \
+                bool _md_hit = streq(l, key);                           \
+                if (_md_hit) {                                          \
+                        int _md_parsed = parse_func(v);                 \
+                        if (_md_parsed < 0)                             \
+                                log_unit_debug_errno(u, _md_parsed, "Failed to parse \"%s=%s\", ignoring.", l, v); \
+                        else                                            \
+                                target = _md_parsed;                    \
+                }                                                       \
+                _md_hit;                                                \
+        })
+
+/* True iff l equals key; parse_func(v, &target) then writes target itself, a negative result is only logged. */
+#define MATCH_DESERIALIZE_IMMEDIATE(key, l, v, parse_func, target)      \
+        ({                                                              \
+                bool _mdi_hit = streq(l, key);                          \
+                if (_mdi_hit) {                                         \
+                        int _mdi_ret = parse_func(v, &target);          \
+                        if (_mdi_ret < 0)                               \
+                                log_unit_debug_errno(u, _mdi_ret, "Failed to parse \"%s=%s\", ignoring", l, v); \
+                }                                                       \
+                _mdi_hit;                                               \
+        })
+
+int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
+        int r;
+        /* Restores u from "key=value" lines in f: bad values are ignored, read/job/OOM failures are returned. */
+        assert(u);
+        assert(f);
+        assert(fds);
+
+        for (;;) {
+                _cleanup_free_ char *l  = NULL;
+                ssize_t m;
+                size_t k;
+                char *v;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+                /* Split at the first '=': l is cut down to the key, v points at the value ("" without '='). */
+                k = strcspn(l, "=");
+
+                if (l[k] == '=') {
+                        l[k] = 0;
+                        v = l+k+1;
+                } else
+                        v = l+k;
+
+                if (streq(l, "job")) {
+                        if (v[0] == '\0') {
+                                /* New-style serialized job */
+                                r = unit_deserialize_job(u, f);
+                                if (r < 0)
+                                        return r;
+                        } else  /* Legacy for pre-44 */
+                                log_unit_warning(u, "Update from too old systemd versions are unsupported, cannot deserialize job: %s", v);
+                        continue;
+                } else if (streq(l, "state-change-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->state_change_timestamp);
+                        continue;
+                } else if (streq(l, "inactive-exit-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->inactive_exit_timestamp);
+                        continue;
+                } else if (streq(l, "active-enter-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->active_enter_timestamp);
+                        continue;
+                } else if (streq(l, "active-exit-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->active_exit_timestamp);
+                        continue;
+                } else if (streq(l, "inactive-enter-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->inactive_enter_timestamp);
+                        continue;
+                } else if (streq(l, "condition-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->condition_timestamp);
+                        continue;
+                } else if (streq(l, "assert-timestamp")) {
+                        (void) deserialize_dual_timestamp(v, &u->assert_timestamp);
+                        continue;
+
+                } else if (streq(l, "start-ratelimit")) {
+                        deserialize_ratelimit(&u->start_ratelimit, l, v);
+                        continue;
+                } else if (streq(l, "auto-start-stop-ratelimit")) {
+                        deserialize_ratelimit(&u->auto_start_stop_ratelimit, l, v);
+                        continue;
+
+                } else if (MATCH_DESERIALIZE("condition-result", l, v, parse_boolean, u->condition_result))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("assert-result", l, v, parse_boolean, u->assert_result))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("transient", l, v, parse_boolean, u->transient))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("in-audit", l, v, parse_boolean, u->in_audit))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("exported-invocation-id", l, v, parse_boolean, u->exported_invocation_id))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("exported-log-level-max", l, v, parse_boolean, u->exported_log_level_max))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("exported-log-extra-fields", l, v, parse_boolean, u->exported_log_extra_fields))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("exported-log-rate-limit-interval", l, v, parse_boolean, u->exported_log_ratelimit_interval))
+                        continue;
+
+                else if (MATCH_DESERIALIZE("exported-log-rate-limit-burst", l, v, parse_boolean, u->exported_log_ratelimit_burst))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-base", l, v, safe_atou64, u->cpu_usage_base) ||
+                         MATCH_DESERIALIZE_IMMEDIATE("cpuacct-usage-base", l, v, safe_atou64, u->cpu_usage_base))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-last", l, v, safe_atou64, u->cpu_usage_last))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("managed-oom-kill-last", l, v, safe_atou64, u->managed_oom_kill_last))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("oom-kill-last", l, v, safe_atou64, u->oom_kill_last))
+                        continue;
+
+                else if (streq(l, "cgroup")) {
+                        r = unit_set_cgroup_path(u, v);
+                        if (r < 0)
+                                log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
+
+                        (void) unit_watch_cgroup(u);
+                        (void) unit_watch_cgroup_memory(u);
+
+                        continue;
+
+                } else if (MATCH_DESERIALIZE("cgroup-realized", l, v, parse_boolean, u->cgroup_realized))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-realized-mask", l, v, cg_mask_from_string, u->cgroup_realized_mask))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-enabled-mask", l, v, cg_mask_from_string, u->cgroup_enabled_mask))
+                        continue;
+
+                else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-invalidated-mask", l, v, cg_mask_from_string, u->cgroup_invalidated_mask))
+                        continue;
+
+                else if (STR_IN_SET(l, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, v);
+                        if (fd >= 0)
+                                (void) bpf_socket_bind_add_initial_link_fd(u, fd);
+                        continue;
+
+                } else if (streq(l, "ip-bpf-ingress-installed")) {
+                         (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_ingress_installed);
+                         continue;
+                } else if (streq(l, "ip-bpf-egress-installed")) {
+                         (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_egress_installed);
+                         continue;
+                } else if (streq(l, "bpf-device-control-installed")) {
+                         (void) bpf_program_deserialize_attachment(v, fds, &u->bpf_device_control_installed);
+                         continue;
+
+                } else if (streq(l, "ip-bpf-custom-ingress-installed")) {
+                         (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_ingress_installed);
+                         continue;
+                } else if (streq(l, "ip-bpf-custom-egress-installed")) {
+                         (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_egress_installed);
+                         continue;
+
+                } else if (streq(l, "restrict-ifaces-bpf-fd")) {
+                        int fd;
+
+                        fd = deserialize_fd(fds, v);
+                        if (fd >= 0)
+                                (void) restrict_network_interfaces_add_initial_link_fd(u, fd);
+
+                        continue;
+
+                } else if (streq(l, "ref-uid")) {
+                        uid_t uid;
+
+                        r = parse_uid(v, &uid);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+                        else
+                                unit_ref_uid_gid(u, uid, GID_INVALID);
+                        continue;
+
+                } else if (streq(l, "ref-gid")) {
+                        gid_t gid;
+
+                        r = parse_gid(v, &gid);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+                        else
+                                unit_ref_uid_gid(u, UID_INVALID, gid);
+                        continue;
+
+                } else if (streq(l, "ref")) {
+                        r = strv_extend(&u->deserialized_refs, v);
+                        if (r < 0)
+                                return log_oom();
+                        continue;
+
+                } else if (streq(l, "invocation-id")) {
+                        sd_id128_t id;
+
+                        r = sd_id128_from_string(v, &id);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+                        else {
+                                r = unit_set_invocation_id(u, id);
+                                if (r < 0)
+                                        log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m");
+                        }
+
+                        continue;
+
+                } else if (MATCH_DESERIALIZE("freezer-state", l, v, freezer_state_from_string, u->freezer_state))
+                        continue;
+
+                else if (streq(l, "markers")) {
+                        r = deserialize_markers(u, v);
+                        if (r < 0)
+                                log_unit_debug_errno(u, r, "Failed to deserialize \"%s=%s\", ignoring: %m", l, v);
+                        continue;
+                }
+                /* None of the fixed keys matched: try the accounting metric field names. */
+                m = memory_accounting_metric_field_last_from_string(l);
+                if (m >= 0) {
+                        uint64_t c;
+
+                        r = safe_atou64(v, &c);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", v);
+                        else
+                                u->memory_accounting_last[m] = c;
+                        continue;
+                }
+
+                /* Check if this is an IP accounting metric serialization field */
+                m = ip_accounting_metric_field_from_string(l);
+                if (m >= 0) {
+                        uint64_t c;
+
+                        r = safe_atou64(v, &c);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v);
+                        else
+                                u->ip_accounting_extra[m] = c;
+                        continue;
+                }
+
+                m = io_accounting_metric_field_base_from_string(l);
+                if (m >= 0) {
+                        uint64_t c;
+
+                        r = safe_atou64(v, &c);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", v);
+                        else
+                                u->io_accounting_base[m] = c;
+                        continue;
+                }
+
+                m = io_accounting_metric_field_last_from_string(l);
+                if (m >= 0) {
+                        uint64_t c;
+
+                        r = safe_atou64(v, &c);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v);
+                        else
+                                u->io_accounting_last[m] = c;
+                        continue;
+                }
+                /* Still unhandled: offer it to the shared-runtime compat parser, then to the unit type's hook. */
+                r = exec_shared_runtime_deserialize_compat(u, l, v, fds);
+                if (r < 0) {
+                        log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l);
+                        continue;
+                } else if (r > 0)
+                        /* Returns positive if key was handled by the call */
+                        continue;
+
+                if (UNIT_VTABLE(u)->deserialize_item) {
+                        r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds);
+                        if (r < 0)
+                                log_unit_warning(u, "Failed to deserialize unit parameter '%s', ignoring.", l);
+                }
+        }
+
+        /* Versions before 228 did not carry a state change timestamp. In this case, take the current
+         * time. This is useful, so that timeouts based on this timestamp don't trigger too early, and is
+         * in-line with the logic from before 228 where the base for timeouts was not persistent across
+         * reboots. */
+
+        if (!dual_timestamp_is_set(&u->state_change_timestamp))
+                dual_timestamp_now(&u->state_change_timestamp);
+
+        /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings
+         * applied after we are done. For that we invalidate anything already realized, so that we can
+         * realize it again. */
+        if (u->cgroup_realized) {
+                unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
+                unit_invalidate_cgroup_bpf(u);
+        }
+
+        return 0;
+}
+
+/* Discards the serialized state of a unit whose data we cannot interpret, reading up to its end marker. */
+int unit_deserialize_state_skip(FILE *f) {
+        assert(f);
+
+        /* Returns 1 once the end marker (an empty line) was consumed, 0 if read_stripped_line() returned 0
+         * before that, and the value of log_error_errno() if reading failed. */
+
+        while (true) {
+                _cleanup_free_ char *entry = NULL;
+
+                int n = read_stripped_line(f, LONG_LINE_MAX, &entry);
+                if (n == 0)
+                        return 0;
+                if (n < 0)
+                        return log_error_errno(n, "Failed to read serialization line: %m");
+
+                /* End marker */
+                if (isempty(entry))
+                        return 1;
+        }
+}
+
+/* Appends one "<kind>-<name>" word to f for each flag set in mask. Words are separated by single spaces;
+ * *space records whether a word was already written, so consecutive calls can extend the same list. */
+static void print_unit_dependency_mask(FILE *f, const char *kind, UnitDependencyMask mask, bool *space) {
+        const struct {
+                UnitDependencyMask mask;
+                const char *name;
+        } flag_names[] = {
+                { UNIT_DEPENDENCY_FILE,               "file"               },
+                { UNIT_DEPENDENCY_IMPLICIT,           "implicit"           },
+                { UNIT_DEPENDENCY_DEFAULT,            "default"            },
+                { UNIT_DEPENDENCY_UDEV,               "udev"               },
+                { UNIT_DEPENDENCY_PATH,               "path"               },
+                { UNIT_DEPENDENCY_MOUNT_FILE,         "mount-file"         },
+                { UNIT_DEPENDENCY_MOUNTINFO,          "mountinfo"          },
+                { UNIT_DEPENDENCY_PROC_SWAP,          "proc-swap"          },
+                { UNIT_DEPENDENCY_SLICE_PROPERTY,     "slice-property"     },
+        };
+
+        assert(f);
+        assert(kind);
+        assert(space);
+
+        /* The loop ends as soon as no flags are left to report. */
+        for (size_t idx = 0; idx < ELEMENTSOF(flag_names) && mask != 0; idx++) {
+                if (!FLAGS_SET(mask, flag_names[idx].mask))
+                        continue;
+
+                /* Every word but the very first is preceded by a space. */
+                if (*space)
+                        fputc(' ', f);
+                *space = true;
+
+                fprintf(f, "%s-%s", kind, flag_names[idx].name);
+
+                /* Drop the reported flag so that leftovers are caught below. */
+                mask &= ~flag_names[idx].mask;
+        }
+
+        /* A bit that survived has no entry in flag_names[]. */
+        assert(mask == 0);
+}
+
+void unit_dump(Unit *u, FILE *f, const char *prefix) {
+        char *t;
+        const char *prefix2;
+        Unit *following;
+        _cleanup_set_free_ Set *following_set = NULL;
+        CGroupMask m;
+        int r;
+        /* Writes a human-readable description of u to f, mostly as "<prefix>\t<Field>: <value>" lines. */
+        assert(u);
+        assert(u->type >= 0);
+        /* prefix2 is prefix plus one tab; it is passed to the type-specific dump() hook and to job_dump(). */
+        prefix = strempty(prefix);
+        prefix2 = strjoina(prefix, "\t");
+
+        fprintf(f,
+                "%s-> Unit %s:\n",
+                prefix, u->id);
+
+        SET_FOREACH(t, u->aliases)
+                fprintf(f, "%s\tAlias: %s\n", prefix, t);
+
+        fprintf(f,
+                "%s\tDescription: %s\n"
+                "%s\tInstance: %s\n"
+                "%s\tUnit Load State: %s\n"
+                "%s\tUnit Active State: %s\n"
+                "%s\tState Change Timestamp: %s\n"
+                "%s\tInactive Exit Timestamp: %s\n"
+                "%s\tActive Enter Timestamp: %s\n"
+                "%s\tActive Exit Timestamp: %s\n"
+                "%s\tInactive Enter Timestamp: %s\n"
+                "%s\tMay GC: %s\n"
+                "%s\tNeed Daemon Reload: %s\n"
+                "%s\tTransient: %s\n"
+                "%s\tPerpetual: %s\n"
+                "%s\tGarbage Collection Mode: %s\n",
+                prefix, unit_description(u),
+                prefix, strna(u->instance),
+                prefix, unit_load_state_to_string(u->load_state),
+                prefix, unit_active_state_to_string(unit_active_state(u)),
+                prefix, strna(FORMAT_TIMESTAMP(u->state_change_timestamp.realtime)),
+                prefix, strna(FORMAT_TIMESTAMP(u->inactive_exit_timestamp.realtime)),
+                prefix, strna(FORMAT_TIMESTAMP(u->active_enter_timestamp.realtime)),
+                prefix, strna(FORMAT_TIMESTAMP(u->active_exit_timestamp.realtime)),
+                prefix, strna(FORMAT_TIMESTAMP(u->inactive_enter_timestamp.realtime)),
+                prefix, yes_no(unit_may_gc(u)),
+                prefix, yes_no(unit_need_daemon_reload(u)),
+                prefix, yes_no(u->transient),
+                prefix, yes_no(u->perpetual),
+                prefix, collect_mode_to_string(u->collect_mode));
+
+        if (u->markers != 0) {
+                fprintf(f, "%s\tMarkers:", prefix);
+
+                for (UnitMarker marker = 0; marker < _UNIT_MARKER_MAX; marker++)
+                        if (FLAGS_SET(u->markers, 1u << marker))
+                                fprintf(f, " %s", unit_marker_to_string(marker));
+                fputs("\n", f);
+        }
+
+        if (UNIT_HAS_CGROUP_CONTEXT(u)) {
+                fprintf(f,
+                        "%s\tSlice: %s\n"
+                        "%s\tCGroup: %s\n"
+                        "%s\tCGroup realized: %s\n",
+                        prefix, strna(unit_slice_name(u)),
+                        prefix, strna(u->cgroup_path),
+                        prefix, yes_no(u->cgroup_realized));
+
+                if (u->cgroup_realized_mask != 0) {
+                        _cleanup_free_ char *s = NULL;
+                        (void) cg_mask_to_string(u->cgroup_realized_mask, &s);
+                        fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s));
+                }
+
+                if (u->cgroup_enabled_mask != 0) {
+                        _cleanup_free_ char *s = NULL;
+                        (void) cg_mask_to_string(u->cgroup_enabled_mask, &s);
+                        fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s));
+                }
+
+                m = unit_get_own_mask(u);
+                if (m != 0) {
+                        _cleanup_free_ char *s = NULL;
+                        (void) cg_mask_to_string(m, &s);
+                        fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s));
+                }
+
+                m = unit_get_members_mask(u);
+                if (m != 0) {
+                        _cleanup_free_ char *s = NULL;
+                        (void) cg_mask_to_string(m, &s);
+                        fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s));
+                }
+
+                m = unit_get_delegate_mask(u);
+                if (m != 0) {
+                        _cleanup_free_ char *s = NULL;
+                        (void) cg_mask_to_string(m, &s);
+                        fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s));
+                }
+        }
+
+        if (!sd_id128_is_null(u->invocation_id))
+                fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n",
+                        prefix, SD_ID128_FORMAT_VAL(u->invocation_id));
+
+        STRV_FOREACH(j, u->documentation)
+                fprintf(f, "%s\tDocumentation: %s\n", prefix, *j);
+
+        if (u->access_selinux_context)
+                fprintf(f, "%s\tAccess SELinux Context: %s\n", prefix, u->access_selinux_context);
+
+        following = unit_following(u);
+        if (following)
+                fprintf(f, "%s\tFollowing: %s\n", prefix, following->id);
+
+        r = unit_following_set(u, &following_set);
+        if (r >= 0) {
+                Unit *other;
+
+                SET_FOREACH(other, following_set)
+                        fprintf(f, "%s\tFollowing Set Member: %s\n", prefix, other->id);
+        }
+
+        if (u->fragment_path)
+                fprintf(f, "%s\tFragment Path: %s\n", prefix, u->fragment_path);
+
+        if (u->source_path)
+                fprintf(f, "%s\tSource Path: %s\n", prefix, u->source_path);
+
+        STRV_FOREACH(j, u->dropin_paths)
+                fprintf(f, "%s\tDropIn Path: %s\n", prefix, *j);
+
+        if (u->failure_action != EMERGENCY_ACTION_NONE)
+                fprintf(f, "%s\tFailure Action: %s\n", prefix, emergency_action_to_string(u->failure_action));
+        if (u->failure_action_exit_status >= 0)
+                fprintf(f, "%s\tFailure Action Exit Status: %i\n", prefix, u->failure_action_exit_status);
+        if (u->success_action != EMERGENCY_ACTION_NONE)
+                fprintf(f, "%s\tSuccess Action: %s\n", prefix, emergency_action_to_string(u->success_action));
+        if (u->success_action_exit_status >= 0)
+                fprintf(f, "%s\tSuccess Action Exit Status: %i\n", prefix, u->success_action_exit_status);
+
+        if (u->job_timeout != USEC_INFINITY)
+                fprintf(f, "%s\tJob Timeout: %s\n", prefix, FORMAT_TIMESPAN(u->job_timeout, 0));
+
+        if (u->job_timeout_action != EMERGENCY_ACTION_NONE)
+                fprintf(f, "%s\tJob Timeout Action: %s\n", prefix, emergency_action_to_string(u->job_timeout_action));
+
+        if (u->job_timeout_reboot_arg)
+                fprintf(f, "%s\tJob Timeout Reboot Argument: %s\n", prefix, u->job_timeout_reboot_arg);
+
+        condition_dump_list(u->conditions, f, prefix, condition_type_to_string);
+        condition_dump_list(u->asserts, f, prefix, assert_type_to_string);
+
+        if (dual_timestamp_is_set(&u->condition_timestamp))
+                fprintf(f,
+                        "%s\tCondition Timestamp: %s\n"
+                        "%s\tCondition Result: %s\n",
+                        prefix, strna(FORMAT_TIMESTAMP(u->condition_timestamp.realtime)),
+                        prefix, yes_no(u->condition_result));
+
+        if (dual_timestamp_is_set(&u->assert_timestamp))
+                fprintf(f,
+                        "%s\tAssert Timestamp: %s\n"
+                        "%s\tAssert Result: %s\n",
+                        prefix, strna(FORMAT_TIMESTAMP(u->assert_timestamp.realtime)),
+                        prefix, yes_no(u->assert_result));
+        /* One line per dependency, with its origin and destination masks listed in parentheses. */
+        for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) {
+                UnitDependencyInfo di;
+                Unit *other;
+
+                HASHMAP_FOREACH_KEY(di.data, other, unit_get_dependencies(u, d)) {
+                        bool space = false;
+
+                        fprintf(f, "%s\t%s: %s (", prefix, unit_dependency_to_string(d), other->id);
+
+                        print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+                        print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+
+                        fputs(")\n", f);
+                }
+        }
+
+        if (!hashmap_isempty(u->requires_mounts_for)) {
+                UnitDependencyInfo di;
+                const char *path;
+
+                HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
+                        bool space = false;
+
+                        fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path);
+
+                        print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+                        print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+
+                        fputs(")\n", f);
+                }
+        }
+        /* Generic settings and the type-specific dump() hook are only shown for units in UNIT_LOADED state. */
+        if (u->load_state == UNIT_LOADED) {
+
+                fprintf(f,
+                        "%s\tStopWhenUnneeded: %s\n"
+                        "%s\tRefuseManualStart: %s\n"
+                        "%s\tRefuseManualStop: %s\n"
+                        "%s\tDefaultDependencies: %s\n"
+                        "%s\tSurviveFinalKillSignal: %s\n"
+                        "%s\tOnSuccessJobMode: %s\n"
+                        "%s\tOnFailureJobMode: %s\n"
+                        "%s\tIgnoreOnIsolate: %s\n",
+                        prefix, yes_no(u->stop_when_unneeded),
+                        prefix, yes_no(u->refuse_manual_start),
+                        prefix, yes_no(u->refuse_manual_stop),
+                        prefix, yes_no(u->default_dependencies),
+                        prefix, yes_no(u->survive_final_kill_signal),
+                        prefix, job_mode_to_string(u->on_success_job_mode),
+                        prefix, job_mode_to_string(u->on_failure_job_mode),
+                        prefix, yes_no(u->ignore_on_isolate));
+
+                if (UNIT_VTABLE(u)->dump)
+                        UNIT_VTABLE(u)->dump(u, f, prefix2);
+
+        } else if (u->load_state == UNIT_MERGED)
+                fprintf(f,
+                        "%s\tMerged into: %s\n",
+                        prefix, u->merged_into->id);
+        else if (u->load_state == UNIT_ERROR) {
+                errno = abs(u->load_error); /* picked up by the %m in the format string below */
+                fprintf(f, "%s\tLoad Error Code: %m\n", prefix);
+        }
+
+        for (const char *n = sd_bus_track_first(u->bus_track); n; n = sd_bus_track_next(u->bus_track))
+                fprintf(f, "%s\tBus Ref: %s\n", prefix, n);
+
+        if (u->job)
+                job_dump(u->job, f, prefix2);
+
+        if (u->nop_job)
+                job_dump(u->nop_job, f, prefix2);
+}
diff --git a/src/core/unit-serialize.h b/src/core/unit-serialize.h
new file mode 100644
index 0000000..ab8a8e3
--- /dev/null
+++ b/src/core/unit-serialize.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "unit.h"
+#include "fdset.h"
+
+/* These functions serialize state for our own usage, i.e.: across a reload/reexec, rather than for being
+ * passed to a child process. */
+
+int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs);
+int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds);
+int unit_deserialize_state_skip(FILE *f);
+/* Writes a human-readable dump of the unit's state to f, with prefix put in front of the output lines. */
+void unit_dump(Unit *u, FILE *f, const char *prefix);
diff --git a/src/core/unit.c b/src/core/unit.c
new file mode 100644
index 0000000..2fc9f5a
--- /dev/null
+++ b/src/core/unit.c
@@ -0,0 +1,6617 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-id128.h"
+#include "sd-messages.h"
+
+#include "all-units.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bpf-socket-bind.h"
+#include "bus-common-errors.h"
+#include "bus-internal.h"
+#include "bus-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "chase.h"
+#include "core-varlink.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "dropin.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "id128-util.h"
+#include "install.h"
+#include "iovec-util.h"
+#include "label-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "logarithm.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "serialize.h"
+#include "set.h"
+#include "signal-util.h"
+#include "sparse-endian.h"
+#include "special.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+#if BPF_FRAMEWORK
+#include "bpf-link.h"
+#endif
+
+/* Thresholds for logging at INFO level about resource consumption */
+#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC)
+#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL)
+#define MENTIONWORTHY_IP_BYTES (0ULL)
+
+/* Thresholds for logging at NOTICE level about resource consumption */
+#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */
+#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL)  /* 10 MB */
+#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB */
+
+const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = { /* the vtable of each unit type, indexed by UNIT_* constant */
+        [UNIT_SERVICE] = &service_vtable,
+        [UNIT_SOCKET] = &socket_vtable,
+        [UNIT_TARGET] = &target_vtable,
+        [UNIT_DEVICE] = &device_vtable,
+        [UNIT_MOUNT] = &mount_vtable,
+        [UNIT_AUTOMOUNT] = &automount_vtable,
+        [UNIT_SWAP] = &swap_vtable,
+        [UNIT_TIMER] = &timer_vtable,
+        [UNIT_PATH] = &path_vtable,
+        [UNIT_SLICE] = &slice_vtable,
+        [UNIT_SCOPE] = &scope_vtable,
+};
+
+Unit* unit_new(Manager *m, size_t size) {
+        Unit *u;
+        /* Allocates a new unit object for manager m and fills in its defaults. Returns NULL if allocation fails. */
+        assert(m);
+        assert(size >= sizeof(Unit));
+        /* size may exceed sizeof(Unit), presumably so that type-specific objects can embed Unit (confirm with callers) */
+        u = malloc0(size);
+        if (!u)
+                return NULL;
+
+        u->manager = m;
+        u->type = _UNIT_TYPE_INVALID;
+        u->default_dependencies = true;
+        u->unit_file_state = _UNIT_FILE_STATE_INVALID;
+        u->unit_file_preset = -1;
+        u->on_failure_job_mode = JOB_REPLACE;
+        u->on_success_job_mode = JOB_FAIL;
+        u->cgroup_control_inotify_wd = -1;
+        u->cgroup_memory_inotify_wd = -1;
+        u->job_timeout = USEC_INFINITY;
+        u->job_running_timeout = USEC_INFINITY;
+        u->ref_uid = UID_INVALID;
+        u->ref_gid = GID_INVALID;
+        u->cpu_usage_last = NSEC_INFINITY;
+
+        unit_reset_memory_accounting_last(u);
+
+        unit_reset_io_accounting_last(u);
+
+        u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+        u->failure_action_exit_status = u->success_action_exit_status = -1;
+
+        u->ip_accounting_ingress_map_fd = -EBADF;
+        u->ip_accounting_egress_map_fd = -EBADF;
+
+        u->ipv4_allow_map_fd = -EBADF;
+        u->ipv6_allow_map_fd = -EBADF;
+        u->ipv4_deny_map_fd = -EBADF;
+        u->ipv6_deny_map_fd = -EBADF;
+
+        u->last_section_private = -1;
+        /* The start rate limit takes its two values from the manager's defaults */
+        u->start_ratelimit = (const RateLimit) {
+                m->defaults.start_limit_interval,
+                m->defaults.start_limit_burst,
+        };
+
+        u->auto_start_stop_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_SEC, .burst = 16 };
+
+        return u;
+}
+
+int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret) {
+        _cleanup_(unit_freep) Unit *u = NULL;
+        int r;
+        /* Allocates a unit and gives it 'name'. On success the unit is stored in *ret, otherwise *ret is left untouched. */
+        u = unit_new(m, size);
+        if (!u)
+                return -ENOMEM;
+
+        r = unit_add_name(u, name);
+        if (r < 0)
+                return r;
+        /* Success: pass ownership to the caller, so that the _cleanup_ handler has nothing left to release */
+        *ret = TAKE_PTR(u);
+
+        return r;
+}
+
+bool unit_has_name(const Unit *u, const char *name) {
+        assert(u);
+        assert(name);
+        /* True if name is either the unit's id or one of its aliases */
+        return streq_ptr(name, u->id) ||
+               set_contains(u->aliases, name);
+}
+
+static void unit_init(Unit *u) {
+        CGroupContext *cc;
+        ExecContext *ec;
+        KillContext *kc;
+        /* Initializes the cgroup, exec and kill contexts the unit has, then calls the type's init() hook (if any) */
+        assert(u);
+        assert(u->manager);
+        assert(u->type >= 0);
+
+        cc = unit_get_cgroup_context(u);
+        if (cc) {
+                cgroup_context_init(cc);
+
+                /* Copy in the manager defaults into the cgroup
+                 * context, _before_ the rest of the settings have
+                 * been initialized */
+
+                cc->cpu_accounting = u->manager->defaults.cpu_accounting;
+                cc->io_accounting = u->manager->defaults.io_accounting;
+                cc->blockio_accounting = u->manager->defaults.blockio_accounting;
+                cc->memory_accounting = u->manager->defaults.memory_accounting;
+                cc->tasks_accounting = u->manager->defaults.tasks_accounting;
+                cc->ip_accounting = u->manager->defaults.ip_accounting;
+                /* The manager's default tasks_max is not applied to slice units */
+                if (u->type != UNIT_SLICE)
+                        cc->tasks_max = u->manager->defaults.tasks_max;
+
+                cc->memory_pressure_watch = u->manager->defaults.memory_pressure_watch;
+                cc->memory_pressure_threshold_usec = u->manager->defaults.memory_pressure_threshold_usec;
+        }
+
+        ec = unit_get_exec_context(u);
+        if (ec) {
+                exec_context_init(ec);
+                /* Only override the OOM score adjustment if the manager has a default configured for it */
+                if (u->manager->defaults.oom_score_adjust_set) {
+                        ec->oom_score_adjust = u->manager->defaults.oom_score_adjust;
+                        ec->oom_score_adjust_set = true;
+                }
+
+                if (MANAGER_IS_SYSTEM(u->manager))
+                        ec->keyring_mode = EXEC_KEYRING_SHARED;
+                else {
+                        ec->keyring_mode = EXEC_KEYRING_INHERIT;
+
+                        /* User manager might have its umask redefined by PAM or UMask=. In this
+                         * case let the units it manages inherit this value by default. They can
+                         * still tune this value through their own unit file */
+                        (void) get_process_umask(0, &ec->umask);
+                }
+        }
+
+        kc = unit_get_kill_context(u);
+        if (kc)
+                kill_context_init(kc);
+
+        if (UNIT_VTABLE(u)->init)
+                UNIT_VTABLE(u)->init(u);
+}
+
+static int unit_add_alias(Unit *u, char *donated_name) {
+        int r;
+        /* Puts donated_name into u->aliases. On success the set references the string, the caller must not free it. */
+        /* Make sure that u->aliases is allocated. We may leave u->aliases
+         * empty if we fail later, but this is not a problem. */
+        r = set_ensure_put(&u->aliases, &string_hash_ops, donated_name);
+        if (r < 0)
+                return r;
+        assert(r > 0);
+
+        return 0;
+}
+
+int unit_add_name(Unit *u, const char *text) {
+        _cleanup_free_ char *name = NULL, *instance = NULL;
+        UnitType t;
+        int r;
+        /* Adds a name to the unit: the first one becomes u->id and fixes type and instance, later ones become aliases. */
+        assert(u);
+        assert(text);
+        /* A template name is turned into an instance name first, using the instance this unit already has */
+        if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) {
+                if (!u->instance)
+                        return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                                    "instance is not set when adding name '%s': %m", text);
+
+                r = unit_name_replace_instance(text, u->instance, &name);
+                if (r < 0)
+                        return log_unit_debug_errno(u, r,
+                                                    "failed to build instance name from '%s': %m", text);
+        } else {
+                name = strdup(text);
+                if (!name)
+                        return -ENOMEM;
+        }
+        /* Adding a name the unit already carries is a successful no-op */
+        if (unit_has_name(u, name))
+                return 0;
+
+        if (hashmap_contains(u->manager->units, name))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
+                                            "unit already exist when adding name '%s': %m", name);
+
+        if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "name '%s' is invalid: %m", name);
+
+        t = unit_name_to_type(name);
+        if (t < 0)
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "failed to derive unit type from name '%s': %m", name);
+
+        if (u->type != _UNIT_TYPE_INVALID && t != u->type)
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m",
+                                            u->type, t, name);
+
+        r = unit_name_to_instance(name, &instance);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name);
+
+        if (instance && !unit_type_may_template(t))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name);
+
+        /* Ensure that this unit either has no instance, or that the instance matches. Either side may be NULL, hence strna(). */
+        if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "cannot add name %s, the instances don't match (\"%s\" != \"%s\").",
+                                            name, strna(instance), strna(u->instance));
+
+        if (u->id && !unit_type_may_alias(t))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
+                                            "cannot add name %s, aliases are not allowed for %s units.",
+                                            name, unit_type_to_string(t));
+
+        if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES)
+                return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m");
+
+        /* Add name to the global hashmap first, because that's easier to undo */
+        r = hashmap_put(u->manager->units, name, u);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "add unit to hashmap failed for name '%s': %m", text);
+
+        if (u->id) {
+                r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */
+                if (r < 0) {
+                        hashmap_remove(u->manager->units, name);
+                        return r;
+                }
+                TAKE_PTR(name);
+
+        } else {
+                /* A new name, we don't need the set yet. */
+                assert(u->type == _UNIT_TYPE_INVALID);
+                assert(!u->instance);
+
+                u->type = t;
+                u->id = TAKE_PTR(name);
+                u->instance = TAKE_PTR(instance);
+                /* Only now that the type is known the unit can be linked per type and get its contexts initialized */
+                LIST_PREPEND(units_by_type, u->manager->units_by_type[t], u);
+                unit_init(u);
+        }
+
+        unit_add_to_dbus_queue(u);
+        return 0;
+}
+
+int unit_choose_id(Unit *u, const char *name) {
+        _cleanup_free_ char *t = NULL;
+        char *s;
+        int r;
+        /* Makes an existing alias the unit's id; the old id (if any) becomes an alias. -ENOENT if name is no alias. */
+        assert(u);
+        assert(name);
+
+        if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+                if (!u->instance)
+                        return -EINVAL;
+
+                r = unit_name_replace_instance(name, u->instance, &t);
+                if (r < 0)
+                        return r;
+
+                name = t;
+        }
+
+        if (streq_ptr(u->id, name))
+                return 0; /* Nothing to do. */
+
+        /* Selects one of the aliases of this unit as the id */
+        s = set_get(u->aliases, (char*) name);
+        if (!s)
+                return -ENOENT;
+
+        if (u->id) {
+                r = set_remove_and_put(u->aliases, name, u->id);
+                if (r < 0)
+                        return r;
+        } else
+                assert_se(set_remove(u->aliases, name)); /* see set_get() above… */
+
+        u->id = s; /* Old u->id is now stored in the set, and s is not stored anywhere */
+        unit_add_to_dbus_queue(u);
+
+        return 0;
+}
+
+int unit_set_description(Unit *u, const char *description) {
+        int r;
+        /* Replaces the description; an empty string is stored as NULL. Returns 0 on success, negative errno on failure. */
+        assert(u);
+
+        r = free_and_strdup(&u->description, empty_to_null(description));
+        if (r < 0)
+                return r;
+        if (r > 0) /* presumably > 0 means the stored string actually changed; only then queue a D-Bus update */
+                unit_add_to_dbus_queue(u);
+
+        return 0;
+}
+
+static bool unit_success_failure_handler_has_jobs(Unit *unit) {
+        Unit *other;
+        /* True if a unit reached via the ON_SUCCESS or ON_FAILURE dependency atoms has a job or nop job installed */
+        UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_SUCCESS)
+                if (other->job || other->nop_job)
+                        return true;
+
+        UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_FAILURE)
+                if (other->job || other->nop_job)
+                        return true;
+
+        return false;
+}
+
+void unit_release_resources(Unit *u) {
+        UnitActiveState state;
+        ExecContext *ec;
+        /* Releases resources of a unit that has no job, is not perpetual, is inactive or failed and won't restart */
+        assert(u);
+
+        if (u->job || u->nop_job)
+                return;
+
+        if (u->perpetual)
+                return;
+
+        state = unit_active_state(u);
+        if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED))
+                return;
+
+        if (unit_will_restart(u))
+                return;
+        /* In EXEC_PRESERVE_RESTART mode the runtime directory is destroyed here, as no restart is pending anymore */
+        ec = unit_get_exec_context(u);
+        if (ec && ec->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART)
+                exec_context_destroy_runtime_directory(ec, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+
+        if (UNIT_VTABLE(u)->release_resources)
+                UNIT_VTABLE(u)->release_resources(u);
+}
+
+bool unit_may_gc(Unit *u) {
+        UnitActiveState state;
+        int r;
+
+        assert(u);
+
+        /* Checks whether the unit is ready to be unloaded for garbage collection.  Returns true when the
+         * unit may be collected, and false if there's some reason to keep it loaded.
+         *
+         * References from other units are *not* checked here. Instead, this is done in unit_gc_sweep(), but
+         * using markers to properly collect dependency loops.
+         */
+
+        if (u->job || u->nop_job)
+                return false;
+
+        if (u->perpetual)
+                return false;
+
+        /* If we saw a cgroup empty event for this unit, stay around until we processed it so that we remove
+         * the empty cgroup if possible. Similarly, process any pending OOM events if they are already queued
+         * before we release the unit. */
+        if (u->in_cgroup_empty_queue || u->in_cgroup_oom_queue)
+                return false;
+
+        /* Make sure to send out D-Bus events before we unload the unit */
+        if (u->in_dbus_queue)
+                return false;
+
+        if (sd_bus_track_count(u->bus_track) > 0)
+                return false;
+
+        state = unit_active_state(u);
+
+        /* But we keep the unit object around for longer when it is referenced or configured to not be
+         * gc'ed */
+        switch (u->collect_mode) {
+
+        case COLLECT_INACTIVE:
+                if (state != UNIT_INACTIVE)
+                        return false;
+
+                break;
+
+        case COLLECT_INACTIVE_OR_FAILED:
+                if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED))
+                        return false;
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Check if any OnFailure= or OnSuccess= jobs may be pending */
+        if (unit_success_failure_handler_has_jobs(u))
+                return false;
+
+        if (u->cgroup_path) {
+                /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay
+                 * around. Units with active processes should never be collected. */
+
+                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
+                if (r <= 0)
+                        return false;
+        }
+        /* Finally, the unit type gets the last word, if it implements may_gc() */
+        if (!UNIT_VTABLE(u)->may_gc)
+                return true;
+
+        return UNIT_VTABLE(u)->may_gc(u);
+}
+
+void unit_add_to_load_queue(Unit *u) {
+        assert(u);
+        assert(u->type != _UNIT_TYPE_INVALID);
+        /* Only units in UNIT_STUB load state that are not queued yet are added */
+        if (u->load_state != UNIT_STUB || u->in_load_queue)
+                return;
+
+        LIST_PREPEND(load_queue, u->manager->load_queue, u);
+        u->in_load_queue = true;
+}
+
+void unit_add_to_cleanup_queue(Unit *u) {
+        assert(u);
+        /* Queueing is idempotent: a unit already in the cleanup queue is left alone */
+        if (u->in_cleanup_queue)
+                return;
+
+        LIST_PREPEND(cleanup_queue, u->manager->cleanup_queue, u);
+        u->in_cleanup_queue = true;
+}
+
+void unit_add_to_gc_queue(Unit *u) {
+        assert(u);
+        /* Nothing to do if the unit is already queued for GC or for cleanup */
+        if (u->in_gc_queue || u->in_cleanup_queue)
+                return;
+        /* Don't bother queueing units that unit_may_gc() refuses to collect anyway */
+        if (!unit_may_gc(u))
+                return;
+
+        LIST_PREPEND(gc_queue, u->manager->gc_unit_queue, u);
+        u->in_gc_queue = true;
+}
+
+void unit_add_to_dbus_queue(Unit *u) {
+        assert(u);
+        assert(u->type != _UNIT_TYPE_INVALID);
+        /* Queues the unit on the manager's D-Bus unit queue, unless it is still a stub or queued already */
+        if (u->load_state == UNIT_STUB || u->in_dbus_queue)
+                return;
+
+        /* Shortcut things if nobody cares, but still record the "new" signal as sent */
+        if (sd_bus_track_count(u->manager->subscribed) <= 0 &&
+            sd_bus_track_count(u->bus_track) <= 0 &&
+            set_isempty(u->manager->private_buses)) {
+                u->sent_dbus_new_signal = true;
+                return;
+        }
+
+        LIST_PREPEND(dbus_queue, u->manager->dbus_unit_queue, u);
+        u->in_dbus_queue = true;
+}
+
+void unit_submit_to_stop_when_unneeded_queue(Unit *u) {
+        assert(u);
+        /* Queues the unit if it has stop_when_unneeded set and is currently active or reloading */
+        if (u->in_stop_when_unneeded_queue)
+                return;
+
+        if (!u->stop_when_unneeded)
+                return;
+
+        if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+                return;
+
+        LIST_PREPEND(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u);
+        u->in_stop_when_unneeded_queue = true;
+}
+
+void unit_submit_to_start_when_upheld_queue(Unit *u) {
+        assert(u);
+        /* Queues the unit if it is inactive or failed and has a UNIT_ATOM_START_STEADILY dependency */
+        if (u->in_start_when_upheld_queue)
+                return;
+
+        if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+                return;
+
+        if (!unit_has_dependency(u, UNIT_ATOM_START_STEADILY, NULL))
+                return;
+
+        LIST_PREPEND(start_when_upheld_queue, u->manager->start_when_upheld_queue, u);
+        u->in_start_when_upheld_queue = true;
+}
+
+void unit_submit_to_stop_when_bound_queue(Unit *u) {
+        assert(u);
+        /* Queues the unit if it is active or reloading and has a UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT dependency */
+        if (u->in_stop_when_bound_queue)
+                return;
+
+        if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+                return;
+
+        if (!unit_has_dependency(u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT, NULL))
+                return;
+
+        LIST_PREPEND(stop_when_bound_queue, u->manager->stop_when_bound_queue, u);
+        u->in_stop_when_bound_queue = true;
+}
+
+static bool unit_can_release_resources(Unit *u) {
+        ExecContext *ec;
+        /* True if the type has a release_resources() hook or the runtime directory mode is EXEC_PRESERVE_RESTART */
+        assert(u);
+
+        if (UNIT_VTABLE(u)->release_resources)
+                return true;
+
+        ec = unit_get_exec_context(u);
+        if (ec && ec->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART)
+                return true;
+
+        return false;
+}
+
+void unit_submit_to_release_resources_queue(Unit *u) {
+        assert(u);
+        /* Queues the unit unless it is queued already, has a job, is perpetual or has nothing it could release */
+        if (u->in_release_resources_queue)
+                return;
+
+        if (u->job || u->nop_job)
+                return;
+
+        if (u->perpetual)
+                return;
+
+        if (!unit_can_release_resources(u))
+                return;
+
+        LIST_PREPEND(release_resources_queue, u->manager->release_resources_queue, u);
+        u->in_release_resources_queue = true;
+}
+
+static void unit_clear_dependencies(Unit *u) {
+        assert(u);
+
+        /* Removes all dependencies configured on u and their reverse dependencies. */
+
+        for (Hashmap *deps; (deps = hashmap_steal_first(u->dependencies));) {
+                /* The keys of each inner hashmap are the other units */
+                for (Unit *other; (other = hashmap_steal_first_key(deps));) {
+                        Hashmap *other_deps;
+                        /* Drop u from every dependency hashmap the other unit has */
+                        HASHMAP_FOREACH(other_deps, other->dependencies)
+                                hashmap_remove(other_deps, u);
+                        /* unit_add_to_gc_queue() checks itself whether the other unit may be collected now */
+                        unit_add_to_gc_queue(other);
+                }
+
+                hashmap_free(deps);
+        }
+
+        u->dependencies = hashmap_free(u->dependencies);
+}
+
+static void unit_remove_transient(Unit *u) {
+        assert(u);
+        /* For transient units only: unlinks the fragment file and the drop-ins below the transient lookup path */
+        if (!u->transient)
+                return;
+
+        if (u->fragment_path)
+                (void) unlink(u->fragment_path);
+
+        STRV_FOREACH(i, u->dropin_paths) {
+                _cleanup_free_ char *p = NULL, *pp = NULL;
+
+                if (path_extract_directory(*i, &p) < 0) /* Get the drop-in directory from the drop-in file */
+                        continue;
+
+                if (path_extract_directory(p, &pp) < 0) /* Get the config directory from the drop-in directory */
+                        continue;
+
+                /* Only drop transient drop-ins */
+                if (!path_equal(u->manager->lookup_paths.transient, pp))
+                        continue;
+
+                (void) unlink(*i);
+                (void) rmdir(p); /* best effort, the result is ignored */
+        }
+}
+
+static void unit_free_requires_mounts_for(Unit *u) {
+        assert(u);
+        /* Unregisters u from the manager's units_requiring_mounts_for table, then frees u->requires_mounts_for */
+        for (;;) {
+                _cleanup_free_ char *path = NULL;
+
+                path = hashmap_steal_first_key(u->requires_mounts_for);
+                if (!path)
+                        break;
+                else {
+                        char s[strlen(path) + 1];
+                        /* Visit every prefix PATH_FOREACH_PREFIX_MORE() yields for the path, using s as scratch buffer */
+                        PATH_FOREACH_PREFIX_MORE(s, path) {
+                                char *y;
+                                Set *x;
+
+                                x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y);
+                                if (!x)
+                                        continue;
+
+                                (void) set_remove(x, u);
+                                /* Once the set is empty, drop the entry and free both the stored key (y) and the set */
+                                if (set_isempty(x)) {
+                                        (void) hashmap_remove(u->manager->units_requiring_mounts_for, y);
+                                        free(y);
+                                        set_free(x);
+                                }
+                        }
+                }
+        }
+
+        u->requires_mounts_for = hashmap_free(u->requires_mounts_for);
+}
+
+static void unit_done(Unit *u) {
+        ExecContext *ec;
+        CGroupContext *cc;
+        /* Calls the type's done() hook (if any), then releases the exec and cgroup contexts the unit has */
+        assert(u);
+        /* Without a valid type there is no vtable to consult, hence nothing to do */
+        if (u->type < 0)
+                return;
+
+        if (UNIT_VTABLE(u)->done)
+                UNIT_VTABLE(u)->done(u);
+
+        ec = unit_get_exec_context(u);
+        if (ec)
+                exec_context_done(ec);
+
+        cc = unit_get_cgroup_context(u);
+        if (cc)
+                cgroup_context_done(cc);
+}
+
+Unit* unit_free(Unit *u) {
+        Unit *slice;
+        char *t;
+        /* Detaches the unit from the manager's tables and queues and frees it. Accepts NULL, in which case NULL is returned. */
+        if (!u)
+                return NULL;
+
+        sd_event_source_disable_unref(u->auto_start_stop_event_source);
+
+        u->transient_file = safe_fclose(u->transient_file);
+        /* While the manager is reloading the transient files are left in place */
+        if (!MANAGER_IS_RELOADING(u->manager))
+                unit_remove_transient(u);
+
+        bus_unit_send_removed_signal(u);
+
+        unit_done(u);
+
+        unit_dequeue_rewatch_pids(u);
+
+        u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+        u->bus_track = sd_bus_track_unref(u->bus_track);
+        u->deserialized_refs = strv_free(u->deserialized_refs);
+        u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
+
+        unit_free_requires_mounts_for(u);
+        /* Take all our names (aliases and id) out of the manager's unit table */
+        SET_FOREACH(t, u->aliases)
+                hashmap_remove_value(u->manager->units, t, u);
+        if (u->id)
+                hashmap_remove_value(u->manager->units, u->id, u);
+
+        if (!sd_id128_is_null(u->invocation_id))
+                hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u);
+
+        if (u->job) {
+                Job *j = u->job;
+                job_uninstall(j);
+                job_free(j);
+        }
+
+        if (u->nop_job) {
+                Job *j = u->nop_job;
+                job_uninstall(j);
+                job_free(j);
+        }
+
+        /* A unit is being dropped from the tree, make sure our family is realized properly. Do this after we
+         * detach the unit from slice tree in order to eliminate its effect on controller masks. */
+        slice = UNIT_GET_SLICE(u);
+        unit_clear_dependencies(u);
+        if (slice)
+                unit_add_family_to_cgroup_realize_queue(slice);
+
+        if (u->on_console)
+                manager_unref_console(u->manager);
+
+        fdset_free(u->initial_socket_bind_link_fds);
+#if BPF_FRAMEWORK
+        bpf_link_free(u->ipv4_socket_bind_link);
+        bpf_link_free(u->ipv6_socket_bind_link);
+#endif
+
+        unit_release_cgroup(u);
+        /* Likewise, the state files are only unlinked when the manager is not reloading */
+        if (!MANAGER_IS_RELOADING(u->manager))
+                unit_unlink_state_files(u);
+
+        unit_unref_uid_gid(u, false);
+
+        (void) manager_update_failed_units(u->manager, u, false);
+        set_remove(u->manager->startup_units, u);
+
+        unit_unwatch_all_pids(u);
+        /* Unset every UnitRef pointing at us; the loop relies on unit_ref_unset() unlinking the list head */
+        while (u->refs_by_target)
+                unit_ref_unset(u->refs_by_target);
+
+        if (u->type != _UNIT_TYPE_INVALID)
+                LIST_REMOVE(units_by_type, u->manager->units_by_type[u->type], u);
+        /* Take the unit off every manager queue it is still linked into */
+        if (u->in_load_queue)
+                LIST_REMOVE(load_queue, u->manager->load_queue, u);
+
+        if (u->in_dbus_queue)
+                LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u);
+
+        if (u->in_cleanup_queue)
+                LIST_REMOVE(cleanup_queue, u->manager->cleanup_queue, u);
+
+        if (u->in_gc_queue)
+                LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u);
+
+        if (u->in_cgroup_realize_queue)
+                LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+
+        if (u->in_cgroup_empty_queue)
+                LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+
+        if (u->in_cgroup_oom_queue)
+                LIST_REMOVE(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+
+        if (u->in_target_deps_queue)
+                LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u);
+
+        if (u->in_stop_when_unneeded_queue)
+                LIST_REMOVE(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u);
+
+        if (u->in_start_when_upheld_queue)
+                LIST_REMOVE(start_when_upheld_queue, u->manager->start_when_upheld_queue, u);
+
+        if (u->in_stop_when_bound_queue)
+                LIST_REMOVE(stop_when_bound_queue, u->manager->stop_when_bound_queue, u);
+
+        if (u->in_release_resources_queue)
+                LIST_REMOVE(release_resources_queue, u->manager->release_resources_queue, u);
+
+        bpf_firewall_close(u);
+
+        hashmap_free(u->bpf_foreign_by_key);
+
+        bpf_program_free(u->bpf_device_control_installed);
+
+#if BPF_FRAMEWORK
+        bpf_link_free(u->restrict_ifaces_ingress_bpf_link);
+        bpf_link_free(u->restrict_ifaces_egress_bpf_link);
+#endif
+        fdset_free(u->initial_restric_ifaces_link_fds);
+
+        condition_free_list(u->conditions);
+        condition_free_list(u->asserts);
+
+        free(u->description);
+        strv_free(u->documentation);
+        free(u->fragment_path);
+        free(u->source_path);
+        strv_free(u->dropin_paths);
+        free(u->instance);
+
+        free(u->job_timeout_reboot_arg);
+        free(u->reboot_arg);
+
+        free(u->access_selinux_context);
+        /* The alias strings are owned by the set, hence set_free_free(); the id string is freed separately */
+        set_free_free(u->aliases);
+        free(u->id);
+
+        activation_details_unref(u->activation_details);
+
+        return mfree(u);
+}
+
+FreezerState unit_freezer_state(Unit *u) {
+        assert(u);
+        /* Returns the freezer state stored in the unit object; unit_freezer_state_kernel() reads it from the cgroup */
+        return u->freezer_state;
+}
+
+int unit_freezer_state_kernel(Unit *u, FreezerState *ret) {
+        char *values[1] = {};
+        int r;
+        /* Reads the "frozen" key of the unit's "cgroup.events" attribute. Returns 0 with *ret set, or negative errno. */
+        assert(u);
+
+        r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+                                   STRV_MAKE("frozen"), values);
+        if (r < 0)
+                return r;
+        /* From here on r holds the state: anything but "0" or "1", including no value at all, remains invalid */
+        r = _FREEZER_STATE_INVALID;
+
+        if (values[0])  {
+                if (streq(values[0], "0"))
+                        r = FREEZER_RUNNING;
+                else if (streq(values[0], "1"))
+                        r = FREEZER_FROZEN;
+        }
+
+        free(values[0]);
+        *ret = r;
+
+        return 0;
+}
+
+UnitActiveState unit_active_state(Unit *u) {
+        assert(u);
+        /* A merged unit reports the state of the unit unit_follow_merge() resolves it to */
+        if (u->load_state == UNIT_MERGED)
+                return unit_active_state(unit_follow_merge(u));
+
+        /* After a reload it might happen that a unit is not correctly
+         * loaded but still has a process around. That's why we won't
+         * shortcut failed loading to UNIT_INACTIVE_FAILED. */
+
+        return UNIT_VTABLE(u)->active_state(u);
+}
+
+const char* unit_sub_state_to_string(Unit *u) {
+        assert(u);
+        /* Delegated to the unit type; called unconditionally, so the vtable hook is expected to be set */
+        return UNIT_VTABLE(u)->sub_state_to_string(u);
+}
+
+static int unit_merge_names(Unit *u, Unit *other) {
+        char *name;
+        int r;
+        /* Moves the id and all aliases of 'other' into u->aliases and points the manager's unit table entries to u */
+        assert(u);
+        assert(other);
+
+        r = unit_add_alias(u, other->id);
+        if (r < 0)
+                return r;
+
+        r = set_move(u->aliases, other->aliases);
+        if (r < 0) {
+                set_remove(u->aliases, other->id);
+                return r;
+        }
+        /* u->aliases references the id string now, hence 'other' has to let go of it */
+        TAKE_PTR(other->id);
+        other->aliases = set_free_free(other->aliases);
+
+        SET_FOREACH(name, u->aliases)
+                assert_se(hashmap_replace(u->manager->units, name, u) == 0);
+
+        return 0;
+}
+
+static int unit_reserve_dependencies(Unit *u, Unit *other) {
+        size_t n_reserve;
+        Hashmap* deps;
+        void *d;
+        int r;
+        /* Pre-allocates room in u's dependency hashmaps for the entries of 'other'. Returns 0 or negative errno. */
+        assert(u);
+        assert(other);
+
+        /* Let's reserve some space in the dependency hashmaps so that later on merging the units cannot
+         * fail.
+         *
+         * First make some room in the per dependency type hashmaps. Using the summed size of both units'
+         * hashmaps is an estimate that is likely too high since they probably use some of the same
+         * types. But it's never too low, and that's all we need. */
+
+        n_reserve = MIN(hashmap_size(other->dependencies), LESS_BY((size_t) _UNIT_DEPENDENCY_MAX, hashmap_size(u->dependencies)));
+        if (n_reserve > 0) {
+                r = hashmap_ensure_allocated(&u->dependencies, NULL);
+                if (r < 0)
+                        return r;
+
+                r = hashmap_reserve(u->dependencies, n_reserve);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Now, enlarge our per dependency type hashmaps by the number of entries in the same hashmap of the
+         * other unit's dependencies.
+         *
+         * NB: If u does not have a dependency set allocated for some dependency type, there is no need to
+         * reserve anything for it. In that case other's set will be transferred as a whole to u by
+         * complete_move(). */
+
+        HASHMAP_FOREACH_KEY(deps, d, u->dependencies) {
+                Hashmap *other_deps;
+
+                other_deps = hashmap_get(other->dependencies, d);
+
+                r = hashmap_reserve(deps, hashmap_size(other_deps));
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static bool unit_should_warn_about_dependency(UnitDependency dependency) {
+        /* Dropping a dependency is only worth a warning for the dependency types listed here. */
+        switch (dependency) {
+        case UNIT_CONFLICTS:  case UNIT_CONFLICTED_BY:
+        case UNIT_BEFORE:     case UNIT_AFTER:
+        case UNIT_ON_SUCCESS: case UNIT_ON_FAILURE:
+        case UNIT_TRIGGERS:   case UNIT_TRIGGERED_BY:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static int unit_per_dependency_type_hashmap_update(
+                Hashmap *per_type,
+                Unit *other,
+                UnitDependencyMask origin_mask,
+                UnitDependencyMask destination_mask) {
+
+        UnitDependencyInfo info;
+        int r;
+
+        assert(other);
+        assert_cc(sizeof(void*) == sizeof(info));
+
+        /* Acquire the UnitDependencyInfo entry for the Unit* we are interested in, and OR the given masks
+         * into it if it exists, or insert it anew if not. Returns 0 if nothing changed, 1 if the entry
+         * was updated or inserted, and a negative value if the hashmap operation failed. */
+        info.data = hashmap_get(per_type, other);
+        if (info.data) {
+                /* Entry already exists. This is a NOP only if all bits to add are already recorded, i.e.
+                 * the existing mask goes first: FLAGS_SET(v, flags) tests that 'flags' are all set in 'v'. */
+                if (FLAGS_SET(info.origin_mask, origin_mask) &&
+                    FLAGS_SET(info.destination_mask, destination_mask))
+                        return 0; /* NOP */
+
+                info.origin_mask |= origin_mask;
+                info.destination_mask |= destination_mask;
+
+                r = hashmap_update(per_type, other, info.data);
+        } else {
+                info = (UnitDependencyInfo) {
+                        .origin_mask = origin_mask,
+                        .destination_mask = destination_mask,
+                };
+
+                r = hashmap_put(per_type, other, info.data);
+        }
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+static void unit_merge_dependencies(Unit *u, Unit *other) {
+        Hashmap *deps;
+        void *dt; /* Actually of type UnitDependency, except that we don't bother casting it here,
+                   * since the hashmaps all want it as void pointer. */
+        /* Moves other's dependencies to 'u' and repoints back-references. Cannot report failure (assert_se()). */
+        assert(u);
+        assert(other);
+
+        if (u == other)
+                return;
+
+        /* First, drop u's own dependencies on 'other', freeing per-type hashmaps that become empty. */
+        HASHMAP_FOREACH_KEY(deps, dt, u->dependencies) {
+                if (hashmap_remove(deps, other) && unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt)))
+                        log_unit_warning(u, "Dependency %s=%s is dropped, as %s is merged into %s.",
+                                         unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)),
+                                         other->id, other->id, u->id);
+
+                if (hashmap_isempty(deps))
+                        hashmap_free(hashmap_remove(u->dependencies, dt));
+        }
+        /* Then drain other->dependencies, handling one dependency type per iteration. */
+        for (;;) {
+                _cleanup_hashmap_free_ Hashmap *other_deps = NULL;
+                UnitDependencyInfo di_back;
+                Unit *back;
+
+                /* Let's focus on one dependency type at a time, that 'other' has defined. */
+                other_deps = hashmap_steal_first_key_and_value(other->dependencies, &dt);
+                if (!other_deps)
+                        break; /* done! */
+                /* 'deps' is u's hashmap for this type; if it is NULL other's is moved over wholesale below. */
+                deps = hashmap_get(u->dependencies, dt);
+
+                /* Now iterate through all dependencies of this dependency type, of 'other'. We refer to the
+                 * referenced units as 'back'. */
+                HASHMAP_FOREACH_KEY(di_back.data, back, other_deps) {
+                        Hashmap *back_deps;
+                        void *back_dt;
+
+                        if (back == u) {
+                                /* This is a dependency pointing back to the unit we want to merge with?
+                                 * Suppress it (but warn) */
+                                if (unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt)))
+                                        log_unit_warning(u, "Dependency %s=%s in %s is dropped, as %s is merged into %s.",
+                                                         unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)),
+                                                         u->id, other->id, other->id, u->id);
+
+                                hashmap_remove(other_deps, back);
+                                continue;
+                        }
+
+                        /* Now iterate through all deps of 'back', and fix the ones pointing to 'other' to
+                         * point to 'u' instead. */
+                        HASHMAP_FOREACH_KEY(back_deps, back_dt, back->dependencies) {
+                                UnitDependencyInfo di_move;
+
+                                di_move.data = hashmap_remove(back_deps, other);
+                                if (!di_move.data)
+                                        continue;
+
+                                assert_se(unit_per_dependency_type_hashmap_update(
+                                                          back_deps,
+                                                          u,
+                                                          di_move.origin_mask,
+                                                          di_move.destination_mask) >= 0);
+                        }
+
+                        /* The target unit already has dependencies of this type, let's then merge this individually. */
+                        if (deps)
+                                assert_se(unit_per_dependency_type_hashmap_update(
+                                                          deps,
+                                                          back,
+                                                          di_back.origin_mask,
+                                                          di_back.destination_mask) >= 0);
+                }
+
+                /* Now all references towards 'other' of the current type 'dt' are corrected to point to 'u'.
+                 * Let's now move the deps of type 'dt' from 'other' to 'u'. If the unit does not have
+                 * dependencies of this type, let's move them per type wholesale. */
+                if (!deps)
+                        assert_se(hashmap_put(u->dependencies, dt, TAKE_PTR(other_deps)) >= 0);
+        }
+
+        other->dependencies = hashmap_free(other->dependencies);
+}
+
+int unit_merge(Unit *u, Unit *other) {
+        int r;
+        /* Merges 'other' into 'u'. Returns 0 on success (or if both are the same unit), negative on failure. */
+        assert(u);
+        assert(other);
+        assert(u->manager == other->manager);
+        assert(u->type != _UNIT_TYPE_INVALID);
+
+        other = unit_follow_merge(other);
+
+        if (other == u)
+                return 0;
+
+        if (u->type != other->type)
+                return -EINVAL;
+
+        if (!unit_type_may_alias(u->type)) /* Merging only applies to unit names that support aliases */
+                return -EEXIST;
+        /* Only units in UNIT_STUB or UNIT_NOT_FOUND load state can be merged into another unit. */
+        if (!IN_SET(other->load_state, UNIT_STUB, UNIT_NOT_FOUND))
+                return -EEXIST;
+
+        if (!streq_ptr(u->instance, other->instance))
+                return -EINVAL;
+
+        if (other->job)
+                return -EEXIST;
+
+        if (other->nop_job)
+                return -EEXIST;
+
+        if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other)))
+                return -EEXIST;
+
+        /* Make reservations to ensure unit_merge_dependencies() won't fail. We don't rollback reservations
+         * if we fail. We don't have a way to undo reservations. A reservation is not a leak. */
+        r = unit_reserve_dependencies(u, other);
+        if (r < 0)
+                return r;
+
+        /* Redirect all references */
+        while (other->refs_by_target)
+                unit_ref_set(other->refs_by_target, other->refs_by_target->source, u);
+
+        /* Merge dependencies */
+        unit_merge_dependencies(u, other);
+        /* Merge names. It is better to do that after merging deps, otherwise the log message contains n/a.
+         * NOTE(review): on failure here the references and dependencies above have already been moved. */
+        r = unit_merge_names(u, other);
+        if (r < 0)
+                return r;
+
+        other->load_state = UNIT_MERGED;
+        other->merged_into = u;
+        /* Take over other's activation details, unless u has some of its own already. */
+        if (!u->activation_details)
+                u->activation_details = activation_details_ref(other->activation_details);
+
+        /* If there is still some data attached to the other node, we don't need it anymore, and can free it.
+         * NOTE(review): load_state was set to UNIT_MERGED just above, so the check below is always true. */
+        if (other->load_state != UNIT_STUB)
+                if (UNIT_VTABLE(other)->done)
+                        UNIT_VTABLE(other)->done(other);
+
+        unit_add_to_dbus_queue(u);
+        unit_add_to_cleanup_queue(other);
+
+        return 0;
+}
+
+int unit_merge_by_name(Unit *u, const char *name) {
+        _cleanup_free_ char *instantiated = NULL;
+        Unit *existing;
+        int r;
+
+        assert(u);
+        assert(name);
+
+        /* Gives u the additional name 'name': if some unit is already registered under that name it is
+         * merged into u, otherwise the name is simply added to u. A template name is first instantiated
+         * with u's own instance, which fails with -EINVAL if u has no instance. */
+        if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+                if (!u->instance)
+                        return -EINVAL;
+
+                r = unit_name_replace_instance(name, u->instance, &instantiated);
+                if (r < 0)
+                        return r;
+
+                name = instantiated;
+        }
+
+        existing = manager_get_unit(u->manager, name);
+        if (!existing)
+                return unit_add_name(u, name);
+
+        return unit_merge(u, existing);
+}
+
+Unit* unit_follow_merge(Unit *u) {
+        assert(u);
+        for (;;) { /* Follow the merged_into chain up to the first unit that is not merged itself. */
+                if (u->load_state != UNIT_MERGED)
+                        return u;
+                assert_se(u = u->merged_into);
+        }
+}
+
+int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
+        int r;
+
+        assert(u);
+        assert(c);
+
+        /* Adds the dependencies implied by the settings of exec context 'c' to unit 'u'. Unlike
+         * unit_add_dependency() or friends, this always returns 0 on success (and negative on failure). */
+        if (c->working_directory && !c->working_directory_missing_ok) {
+                r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->root_directory) {
+                r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->root_image) {
+                r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+                if (!u->manager->prefix[dt])
+                        continue;
+
+                for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+                        _cleanup_free_ char *p = NULL;
+
+                        p = path_join(u->manager->prefix[dt], c->directories[dt].items[i].path);
+                        if (!p)
+                                return -ENOMEM;
+
+                        r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return 0;
+
+        /* For the following three directory types we need write access, and /var/ is possibly on the root
+         * fs. Hence order after systemd-remount-fs.service, to ensure things are writable. */
+        if (c->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
+            c->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
+            c->directories[EXEC_DIRECTORY_LOGS].n_items > 0) {
+                r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->private_tmp) {
+
+                /* FIXME: for now we make a special case for /tmp and add a weak dependency on
+                 * tmp.mount so /tmp being masked is supported. However there's no reason to treat
+                 * /tmp specifically and masking other mount units should be handled more
+                 * gracefully too, see PR#16894. */
+                r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "tmp.mount", true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+
+                r = unit_require_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+
+                r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_TMPFILES_SETUP_SERVICE, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        if (c->root_image) {
+                /* We need to wait for /dev/loopX to appear when doing RootImage=, hence let's add an
+                 * implicit dependency on udev */
+
+                r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_UDEVD_SERVICE, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        /* If syslog or kernel logging is requested (or log namespacing is), make sure our own logging daemon
+         * is run first. With a log namespace we require the namespace's journald sockets, otherwise we
+         * merely order after the regular journald socket. Units that do neither get no logging dependency. */
+        if (c->log_namespace) {
+                _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL;
+
+                r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit);
+                if (r < 0)
+                        return r;
+
+                r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+
+                r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit);
+                if (r < 0)
+                        return r;
+
+                r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        } else if (IN_SET(c->std_output,
+                          EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+                          EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) ||
+                   IN_SET(c->std_error,
+                          EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+                          EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE)) {
+                r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Credential dependencies have nothing to do with where the unit logs to, hence this call must
+         * not be skipped for units that log neither to the journal nor to kmsg.
+         * NOTE(review): the helper is defined elsewhere; presumably it is a NOP for units without
+         * credentials — confirm. It is still not reached for non-system managers (early return above). */
+        r = unit_add_default_credential_dependencies(u, c);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+const char* unit_description(Unit *u) {
+        assert(u);
+
+        /* Prefer the configured description; without one fall back to the unit id, which is passed
+         * through strna() first. */
+
+        return u->description ? u->description : strna(u->id);
+}
+
+const char* unit_status_string(Unit *u, char **ret_combined_buffer) {
+        bool combined;
+
+        assert(u);
+        assert(u->id);
+
+        /* Picks the string to show for this unit in status output: u->id, u->description, or the
+         * combined form "{u->id} - {u->description}". The id is used when there is no description, when
+         * the description equals the id, or when the manager is configured for STATUS_UNIT_FORMAT_NAME.
+         * The combined form is used when configured (STATUS_UNIT_FORMAT_COMBINED) and the caller passed
+         * 'ret_combined_buffer'; without the buffer the id is returned instead.
+         *
+         * If passed, *ret_combined_buffer is always written: it holds the combined string if that is what
+         * we return, and NULL otherwise. */
+        combined = u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED;
+        if (ret_combined_buffer)
+                *ret_combined_buffer = NULL;
+
+        if (!u->description ||
+            u->manager->status_unit_format == STATUS_UNIT_FORMAT_NAME ||
+            (combined && !ret_combined_buffer) ||
+            streq(u->description, u->id))
+                return u->id;
+
+        if (combined && ret_combined_buffer) {
+                *ret_combined_buffer = strjoin(u->id, " - ", u->description);
+                if (*ret_combined_buffer)
+                        return *ret_combined_buffer;
+                log_oom(); /* Fall back to ->description */
+        }
+
+        return u->description;
+}
+
+/* Common implementation for multiple backends: loads the unit's fragment and then its drop-ins.
+ * Returns 0 on success, -ENOENT if a fragment is required but the unit is still a stub after the
+ * fragment load, or the negative error of the failing load step. */
+int unit_load_fragment_and_dropin(Unit *u, bool fragment_required) {
+        struct stat st;
+        int r;
+
+        assert(u);
+
+        /* First the main unit file (.service, .socket, ...) */
+        r = unit_load_fragment(u);
+        if (r < 0)
+                return r;
+
+        /* A unit that is still a stub now either fails (if the caller insists on a fragment) or is
+         * considered loaded from here on. */
+        if (u->load_state == UNIT_STUB && fragment_required)
+                return -ENOENT;
+        if (u->load_state == UNIT_STUB)
+                u->load_state = UNIT_LOADED;
+
+        /* Then the drop-in directories, always for the unit we have been merged into, if any.
+         *
+         * For an alias this may reread the target unit's drop-ins needlessly, but knowing which
+         * drop-ins have been loaded already would take complicated book-keeping, hence simply reread
+         * all of them every time. */
+        r = unit_load_dropin(unit_follow_merge(u));
+        if (r < 0)
+                return r;
+
+        /* Record the modification time of the source file, if there is one. If it cannot be
+         * stat()ed the timestamp is reset to 0. */
+        if (u->source_path)
+                u->source_mtime = stat(u->source_path, &st) >= 0 ? timespec_load(&st.st_mtim) : 0;
+
+        return 0;
+}
+
+void unit_add_to_target_deps_queue(Unit *u) {
+        Manager *manager = ASSERT_PTR(ASSERT_PTR(u)->manager);
+
+        /* Enqueue at most once; the flag records that u already sits in the manager's queue. */
+        if (!u->in_target_deps_queue) {
+                LIST_PREPEND(target_deps_queue, manager->target_deps_queue, u);
+                u->in_target_deps_queue = true;
+        }
+}
+
+int unit_add_default_target_dependency(Unit *u, Unit *target) {
+        assert(u);
+        assert(target);
+
+        /* Orders 'target' after 'u' as a default dependency (UNIT_DEPENDENCY_DEFAULT), if applicable.
+         * Returns 0 without adding anything unless all of the following hold:
+         *
+         *   - 'target' is a target unit,
+         *   - both units are loaded, so that the loop check at the end is reliable,
+         *   - neither side opted out of automatic dependencies,
+         *   - 'target' is not already ordered before 'u', as we must not create loops.
+         *
+         * Otherwise the result of unit_add_dependency() is passed through. */
+        bool applicable =
+                target->type == UNIT_TARGET &&
+                u->load_state == UNIT_LOADED &&
+                target->load_state == UNIT_LOADED &&
+                u->default_dependencies &&
+                target->default_dependencies &&
+                !unit_has_dependency(target, UNIT_ATOM_BEFORE, u);
+        if (!applicable)
+                return 0;
+
+        return unit_add_dependency(target, UNIT_AFTER, u, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int unit_add_slice_dependencies(Unit *u) {
+        UnitDependencyMask mask = UNIT_DEPENDENCY_FILE;
+        Unit *parent;
+
+        assert(u);
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return 0;
+
+        /* Slice units are ordered against their parent slice implicitly (the relationship is encoded in the
+         * name); for all other units the relationship comes from configuration (Slice=). */
+        if (u->type == UNIT_SLICE)
+                mask = UNIT_DEPENDENCY_IMPLICIT;
+
+        parent = UNIT_GET_SLICE(u);
+        if (parent)
+                return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, parent, true, mask);
+        /* No slice assigned: depend on the root slice, unless this unit carries the root slice's name. */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return 0;
+        return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_ROOT_SLICE, true, mask);
+}
+
+static int unit_add_mount_dependencies(Unit *u) {
+        UnitDependencyInfo di;
+        const char *path;
+        bool changed = false;
+        int r;
+        /* Adds After=/Requires= on mount units for u->requires_mounts_for. Returns >0 if changed, 0 if not, <0 on error. */
+        assert(u);
+        /* The keys of requires_mounts_for are paths; the value supplies the origin mask for the new dependencies. */
+        HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
+                char prefix[strlen(path) + 1];
+                /* NOTE(review): presumably visits 'path' and its parent directories, with 'prefix' as scratch buffer — confirm. */
+                PATH_FOREACH_PREFIX_MORE(prefix, path) {
+                        _cleanup_free_ char *p = NULL;
+                        Unit *m;
+
+                        r = unit_name_from_path(prefix, ".mount", &p);
+                        if (r == -EINVAL)
+                                continue; /* If the path cannot be converted to a mount unit name, then it's
+                                           * not manageable as a unit by systemd, and hence we don't need a
+                                           * dependency on it. Let's thus silently ignore the issue. */
+                        if (r < 0)
+                                return r;
+
+                        m = manager_get_unit(u->manager, p);
+                        if (!m) {
+                                /* Make sure to load the mount unit if it exists. If so the dependencies on
+                                 * this unit will be added later during the loading of the mount unit. */
+                                (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m);
+                                continue;
+                        }
+                        if (m == u)
+                                continue;
+
+                        if (m->load_state != UNIT_LOADED)
+                                continue;
+
+                        r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask);
+                        if (r < 0)
+                                return r;
+                        changed = changed || r > 0;
+                        /* Mount units that have a fragment_path set are additionally required, not just ordered against. */
+                        if (m->fragment_path) {
+                                r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask);
+                                if (r < 0)
+                                        return r;
+                                changed = changed || r > 0;
+                        }
+                }
+        }
+
+        return changed;
+}
+
+static int unit_add_oomd_dependencies(Unit *u) {
+        CGroupMask supported;
+        CGroupContext *cc;
+        int r;
+
+        assert(u);
+
+        /* Pulls in and orders after systemd-oomd.service if the unit asks for managed OOM killing on swap
+         * or memory pressure. Nothing is added if default dependencies are disabled, the unit has no
+         * cgroup context, cg_all_unified() reports false, or the memory controller is not supported. */
+        if (!u->default_dependencies)
+                return 0;
+
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return 0;
+        if (cc->moom_swap != MANAGED_OOM_KILL && cc->moom_mem_pressure != MANAGED_OOM_KILL)
+                return 0;
+        if (!cg_all_unified())
+                return 0;
+
+        r = cg_mask_supported(&supported);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine supported controllers: %m");
+
+        if (FLAGS_SET(supported, CGROUP_MASK_MEMORY))
+                return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE);
+
+        return 0;
+}
+
+static int unit_add_startup_units(Unit *u) {
+        /* Track u in the manager's startup_units set if it has startup-specific cgroup constraints. */
+        if (unit_has_startup_cgroup_constraints(u))
+                return set_ensure_put(&u->manager->startup_units, NULL, u);
+        return 0;
+}
+
+static int unit_validate_on_failure_job_mode(
+                Unit *u,
+                const char *job_mode_setting,
+                JobMode job_mode,
+                const char *dependency_name,
+                UnitDependencyAtom atom) {
+
+        Unit *other, *first = NULL;
+
+        /* With JOB_ISOLATE at most one distinct unit may be listed for the given dependency atom, or
+         * loading is refused with ENOEXEC. Any other job mode is accepted as is. */
+        if (job_mode != JOB_ISOLATE)
+                return 0;
+
+        UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+                if (first && first != other)
+                        return log_unit_error_errno(
+                                        u, SYNTHETIC_ERRNO(ENOEXEC),
+                                        "More than one %s dependencies specified but %sisolate set. Refusing.",
+                                        dependency_name, job_mode_setting);
+                first = other;
+        }
+        return 0;
+}
+
+int unit_load(Unit *u) {
+        int r;
+        /* Loads the unit through its vtable's load() callback; on failure records the error in the unit itself. */
+        assert(u);
+
+        if (u->in_load_queue) {
+                LIST_REMOVE(load_queue, u->manager->load_queue, u);
+                u->in_load_queue = false;
+        }
+
+        if (u->type == _UNIT_TYPE_INVALID)
+                return -EINVAL;
+        /* Only units in UNIT_STUB state are loaded; for any other load state this is a NOP. */
+        if (u->load_state != UNIT_STUB)
+                return 0;
+
+        if (u->transient_file) {
+                /* Finalize transient file: if this is a transient unit file, as soon as we reach unit_load() the setup
+                 * is complete, hence let's synchronize the unit file we just wrote to disk. */
+
+                r = fflush_and_check(u->transient_file);
+                if (r < 0)
+                        goto fail;
+
+                u->transient_file = safe_fclose(u->transient_file);
+                u->fragment_mtime = now(CLOCK_REALTIME);
+        }
+        /* Type-specific loading; afterwards the unit must have left the UNIT_STUB state. */
+        r = UNIT_VTABLE(u)->load(u);
+        if (r < 0)
+                goto fail;
+
+        assert(u->load_state != UNIT_STUB);
+        /* Generic post-processing, only for units that ended up in UNIT_LOADED state. */
+        if (u->load_state == UNIT_LOADED) {
+                unit_add_to_target_deps_queue(u);
+
+                r = unit_add_slice_dependencies(u);
+                if (r < 0)
+                        goto fail;
+
+                r = unit_add_mount_dependencies(u);
+                if (r < 0)
+                        goto fail;
+
+                r = unit_add_oomd_dependencies(u);
+                if (r < 0)
+                        goto fail;
+
+                r = unit_add_startup_units(u);
+                if (r < 0)
+                        goto fail;
+
+                r = unit_validate_on_failure_job_mode(u, "OnSuccessJobMode=", u->on_success_job_mode, "OnSuccess=", UNIT_ATOM_ON_SUCCESS);
+                if (r < 0)
+                        goto fail;
+
+                r = unit_validate_on_failure_job_mode(u, "OnFailureJobMode=", u->on_failure_job_mode, "OnFailure=", UNIT_ATOM_ON_FAILURE);
+                if (r < 0)
+                        goto fail;
+
+                if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout)
+                        log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect.");
+
+                /* We finished loading, let's ensure our parents recalculate the members mask */
+                unit_invalidate_cgroup_members_masks(u);
+        }
+
+        assert((u->load_state != UNIT_MERGED) == !u->merged_into);
+
+        unit_add_to_dbus_queue(unit_follow_merge(u));
+        unit_add_to_gc_queue(u);
+        (void) manager_varlink_send_managed_oom_update(u);
+
+        return 0;
+
+fail:
+        /* We convert ENOEXEC errors to the UNIT_BAD_SETTING load state here. Configuration parsing code
+         * should hence return ENOEXEC to ensure units are placed in this state after loading. */
+        /* A unit that is still a stub at this point is marked UNIT_NOT_FOUND, regardless of the error. */
+        u->load_state = u->load_state == UNIT_STUB ? UNIT_NOT_FOUND :
+                                     r == -ENOEXEC ? UNIT_BAD_SETTING :
+                                                     UNIT_ERROR;
+        u->load_error = r;
+
+        /* Record the timestamp on the cache, so that if the cache gets updated between now and the next time
+         * an attempt is made to load this unit, we know we need to check again. */
+        if (u->load_state == UNIT_NOT_FOUND)
+                u->fragment_not_found_timestamp_hash = u->manager->unit_cache_timestamp_hash;
+
+        unit_add_to_dbus_queue(u);
+        unit_add_to_gc_queue(u);
+
+        return log_unit_debug_errno(u, r, "Failed to load configuration: %m");
+}
+
+_printf_(7, 8)
+static int log_unit_internal(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) {
+        Unit *unit = userdata;
+        va_list args;
+        int ret;
+
+        /* Logs on behalf of the unit passed as 'userdata', or without unit context if that is NULL. If
+         * unit_log_level_test() rejects the level, nothing is logged and the negated error is returned. */
+        if (unit && !unit_log_level_test(unit, level))
+                return -ERRNO_VALUE(error);
+
+        va_start(args, format);
+        if (!unit)
+                ret = log_internalv(level, error, file, line, func, format, args);
+        else
+                ret = log_object_internalv(level, error, file, line, func,
+                                           unit->manager->unit_log_field, unit->id,
+                                           unit->manager->invocation_log_field, unit->invocation_id_string,
+                                           format, args);
+        va_end(args);
+
+        return ret;
+}
+
+static bool unit_test_condition(Unit *u) {
+        _cleanup_strv_free_ char **environment = NULL;
+        int r;
+
+        assert(u);
+
+        /* Evaluates the unit's conditions against the manager's effective environment, stores the
+         * outcome and the time of the check in the unit, and returns the outcome. */
+        dual_timestamp_now(&u->condition_timestamp);
+
+        r = manager_get_effective_environment(u->manager, &environment);
+        if (r >= 0)
+                u->condition_result = condition_test_list(
+                                u->conditions, environment, condition_type_to_string, log_unit_internal, u);
+        else {
+                /* Without the environment the list is not evaluated; the result is recorded as true. */
+                log_unit_error_errno(u, r, "Failed to determine effective environment: %m");
+                u->condition_result = true;
+        }
+
+        unit_add_to_dbus_queue(u);
+        return u->condition_result;
+}
+
+static bool unit_test_assert(Unit *u) {
+        _cleanup_strv_free_ char **environment = NULL;
+        int r;
+
+        assert(u);
+
+        /* Evaluates the unit's asserts against the manager's effective environment, stores the
+         * outcome and the time of the check in the unit, and returns the outcome. */
+        dual_timestamp_now(&u->assert_timestamp);
+
+        r = manager_get_effective_environment(u->manager, &environment);
+        if (r >= 0)
+                u->assert_result = condition_test_list(
+                                u->asserts, environment, assert_type_to_string, log_unit_internal, u);
+        else {
+                /* Without the environment the list is not evaluated; CONDITION_ERROR is recorded instead. */
+                log_unit_error_errno(u, r, "Failed to determine effective environment: %m");
+                u->assert_result = CONDITION_ERROR;
+        }
+
+        unit_add_to_dbus_queue(u);
+        return u->assert_result;
+}
+
+void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) {
+        bool color = log_get_show_color();
+        /* With colors on, highlight the name: all of 'ident', or just the id part of the combined form. */
+        if (color && u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && strchr(ident, ' '))
+                ident = strjoina(ANSI_HIGHLIGHT, u->id, ANSI_NORMAL, " - ", u->description);
+        else if (color)
+                ident = strjoina(ANSI_HIGHLIGHT, ident, ANSI_NORMAL);
+
+        DISABLE_WARNING_FORMAT_NONLITERAL;
+        manager_status_printf(u->manager, status_type, status, format, ident);
+        REENABLE_WARNING;
+}
+
+/* Checks the unit's start rate limit. Returns 0 (and clears u->start_limit_hit) while the limit is not
+ * exceeded. Otherwise logs a warning, sets u->start_limit_hit, invokes emergency_action() with the unit's
+ * start_limit_action and returns -ECANCELED. */
+int unit_test_start_limit(Unit *u) {
+        const char *why;
+
+        assert(u);
+
+        if (!ratelimit_below(&u->start_ratelimit)) {
+                log_unit_warning(u, "Start request repeated too quickly.");
+                u->start_limit_hit = true;
+
+                why = strjoina("unit ", u->id, " failed");
+
+                emergency_action(u->manager, u->start_limit_action,
+                                 EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN,
+                                 u->reboot_arg, -1, why);
+
+                return -ECANCELED;
+        }
+
+        u->start_limit_hit = false;
+        return 0;
+}
+
+/* Returns true if every unit this unit cannot be active without, and which it is also ordered after, is
+ * currently active or reloading. Logs a notice naming the first offender and returns false otherwise. */
+static bool unit_verify_deps(Unit *u) {
+        Unit *dep;
+
+        assert(u);
+
+        /* Only BindsTo= dependencies that are combined with After= are verified here. Requires= and
+         * Requisite= are deliberately left out: they only matter for job processing and have no effect
+         * afterwards. BindsTo= without After= is skipped too, since any such check would be entirely racy. */
+
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) {
+                bool ordered_after = unit_has_dependency(u, UNIT_ATOM_AFTER, dep);
+
+                if (ordered_after && !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(dep))) {
+                        log_unit_notice(u, "Bound to unit %s, but unit isn't active.", dep->id);
+                        return false;
+                }
+        }
+
+        return true;
+}
+
+/* Attempts to start unit 'u': runs the checks below in order and, if all of them pass, calls the unit type's
+ * ->start() hook and returns its result. 'details' is referenced and stored in u->activation_details unless
+ * the unit already carries activation details. Since the checks run in a fixed order, the first failing check
+ * determines the error that is returned.
+ *
+ * Errors that aren't really errors:
+ *         -EALREADY:   Unit is already started.
+ *         -ECOMM:      Condition failed
+ *         -EAGAIN:     An operation is already in progress. Retry later. (Also returned while the unit
+ *                      type's subsystem is ratelimited or the unit is in maintenance state.)
+ *
+ * Errors that are real errors:
+ *         -EBADR:      This unit type does not support starting.
+ *         -ECANCELED:  Start limit hit, too many requests for now
+ *         -EPROTO:     Assert failed
+ *         -EINVAL:     Unit not loaded
+ *         -EOPNOTSUPP: Unit type not supported
+ *         -ENOLINK:    The necessary dependencies are not fulfilled.
+ *         -ESTALE:     This unit has been started before and can't be started a second time
+ *         -ENOENT:     This is a triggering unit and unit to trigger is not loaded
+ *
+ * Negative values returned by the ->subsystem_ratelimited() and ->can_start() hooks are passed through
+ * unchanged.
+ */
+int unit_start(Unit *u, ActivationDetails *details) {
+        UnitActiveState state;
+        Unit *following;
+        int r;
+
+        assert(u);
+
+        /* Let's hold off running start jobs for mount units when /proc/self/mountinfo monitor is ratelimited. */
+        if (UNIT_VTABLE(u)->subsystem_ratelimited) {
+                r = UNIT_VTABLE(u)->subsystem_ratelimited(u->manager);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return -EAGAIN;
+        }
+
+        /* If this is already started, then this will succeed. Note that this will even succeed if this unit
+         * is not startable by the user. This is relied on to detect when we need to wait for units and when
+         * waiting is finished. */
+        state = unit_active_state(u);
+        if (UNIT_IS_ACTIVE_OR_RELOADING(state))
+                return -EALREADY;
+        if (state == UNIT_MAINTENANCE)
+                return -EAGAIN;
+
+        /* Units that aren't loaded cannot be started */
+        if (u->load_state != UNIT_LOADED)
+                return -EINVAL;
+
+        /* Refuse starting scope units more than once */
+        if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_enter_timestamp))
+                return -ESTALE;
+
+        /* If the conditions were unmet, don't do anything at all. If we already are activating this call might
+         * still be useful to speed up activation in case there is some hold-off time, but we don't want to
+         * recheck the condition in that case. */
+        if (state != UNIT_ACTIVATING &&
+            !unit_test_condition(u))
+                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(ECOMM), "Starting requested but condition not met. Not starting unit.");
+
+        /* If the asserts failed, fail the entire job */
+        if (state != UNIT_ACTIVATING &&
+            !unit_test_assert(u))
+                return log_unit_notice_errno(u, SYNTHETIC_ERRNO(EPROTO), "Starting requested but asserts failed.");
+
+        /* Units of types that aren't supported cannot be started. Note that we do this test only after the
+         * condition checks, so that we rather return condition check errors (which are usually not
+         * considered a true failure) than "not supported" errors (which are considered a failure).
+         */
+        if (!unit_type_supported(u->type))
+                return -EOPNOTSUPP;
+
+        /* Let's make sure that the deps really are in order before we start this. Normally the job engine
+         * should have taken care of this already, but let's check this here again. After all, our
+         * dependencies might not be in effect anymore, due to a reload or due to an unmet condition. */
+        if (!unit_verify_deps(u))
+                return -ENOLINK;
+
+        /* Forward to the main object, if we aren't it. */
+        following = unit_following(u);
+        if (following) {
+                log_unit_debug(u, "Redirecting start request from %s to %s.", u->id, following->id);
+                /* The followed unit goes through the complete set of checks itself */
+                return unit_start(following, details);
+        }
+
+        /* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */
+        if (UNIT_VTABLE(u)->can_start) {
+                r = UNIT_VTABLE(u)->can_start(u);
+                if (r < 0)
+                        return r;
+        }
+
+        /* If it is stopped, but we cannot start it, then fail */
+        if (!UNIT_VTABLE(u)->start)
+                return -EBADR;
+
+        /* We don't suppress calls to ->start() here when we are already starting, to allow this request to
+         * be used as a "hurry up" call, for example when the unit is in some "auto restart" state where it
+         * waits for a holdoff timer to elapse before it will start again. */
+
+        unit_add_to_dbus_queue(u);
+        /* Request a thaw of the unit's cgroup before handing over to the type-specific start logic */
+        unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+        if (!u->activation_details) /* Older details object wins */
+                u->activation_details = activation_details_ref(details);
+
+        return UNIT_VTABLE(u)->start(u);
+}
+
+/* Reports whether the unit could be started at all: it must be loaded, of a supported type, implement
+ * ->start() and, for once-only unit types, must not carry an inactive-exit timestamp yet. */
+bool unit_can_start(Unit *u) {
+        assert(u);
+
+        if (u->load_state != UNIT_LOADED || !unit_type_supported(u->type))
+                return false;
+
+        /* Scope units may be started only once */
+        if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_exit_timestamp))
+                return false;
+
+        return UNIT_VTABLE(u)->start != NULL;
+}
+
+/* A unit can be isolated if it can be started and has allow_isolate set. */
+bool unit_can_isolate(Unit *u) {
+        assert(u);
+
+        if (!unit_can_start(u))
+                return false;
+
+        return u->allow_isolate;
+}
+
+/* Asks unit 'u' to stop. If the unit follows another unit, the request is redirected to that unit. On the
+ * regular path the result of the unit type's ->stop() hook is returned.
+ *
+ * Errors:
+ *         -EBADR:    This unit type does not support stopping.
+ *         -EALREADY: Unit is already stopped.
+ *         -EAGAIN:   An operation is already in progress. Retry later.
+ */
+int unit_stop(Unit *u) {
+        Unit *leader;
+
+        assert(u);
+
+        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+                return -EALREADY;
+
+        leader = unit_following(u);
+        if (leader) {
+                log_unit_debug(u, "Redirecting stop request from %s to %s.", u->id, leader->id);
+                return unit_stop(leader);
+        }
+
+        if (UNIT_VTABLE(u)->stop) {
+                unit_add_to_dbus_queue(u);
+                unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+                return UNIT_VTABLE(u)->stop(u);
+        }
+
+        return -EBADR;
+}
+
+/* Reports whether the unit could be stopped in principle: its type must be supported and implement ->stop(),
+ * and the unit must not be perpetual. */
+bool unit_can_stop(Unit *u) {
+        assert(u);
+
+        /* A true result does not promise that stopping will actually succeed. Extrinsic units follow
+         * external state and may stop as that external state changes (which is why true is returned for
+         * them), but an attempt to stop them through the manager will fail. */
+
+        if (!unit_type_supported(u->type) || u->perpetual)
+                return false;
+
+        return UNIT_VTABLE(u)->stop != NULL;
+}
+
+/* Asks unit 'u' to reload. If the unit follows another unit, the request is redirected to that unit. If the
+ * unit type has no ->reload() hook, a state notification with unchanged state is issued so that the reload
+ * still propagates, and 0 is returned. Otherwise the result of ->reload() is returned.
+ *
+ * Errors:
+ *         -EINVAL:   Unit is not loaded.
+ *         -EBADR:    This unit type does not support reloading.
+ *         -ENOEXEC:  Unit is not started.
+ *         -EAGAIN:   An operation is already in progress. Retry later.
+ */
+int unit_reload(Unit *u) {
+        Unit *leader;
+
+        assert(u);
+
+        if (u->load_state != UNIT_LOADED)
+                return -EINVAL;
+
+        if (!unit_can_reload(u))
+                return -EBADR;
+
+        switch (unit_active_state(u)) {
+
+        case UNIT_ACTIVE:
+                break;
+
+        case UNIT_RELOADING:
+                return -EAGAIN;
+
+        default:
+                return log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "Unit cannot be reloaded because it is inactive.");
+        }
+
+        leader = unit_following(u);
+        if (leader) {
+                log_unit_debug(u, "Redirecting reload request from %s to %s.", u->id, leader->id);
+                return unit_reload(leader);
+        }
+
+        unit_add_to_dbus_queue(u);
+
+        if (UNIT_VTABLE(u)->reload) {
+                unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+                return UNIT_VTABLE(u)->reload(u);
+        }
+
+        /* No reload function for this unit type, but the reload has to be propagated nonetheless */
+        unit_notify(u, unit_active_state(u), unit_active_state(u), /* reload_success = */ true);
+        return 0;
+}
+
+/* Reports whether a reload request can be honoured for this unit. If the unit type provides a ->can_reload()
+ * hook, its verdict is returned. Otherwise the unit qualifies if unit_has_dependency() finds a
+ * UNIT_ATOM_PROPAGATES_RELOAD_TO dependency (queried with a NULL target) or if the unit type implements
+ * ->reload(). */
+bool unit_can_reload(Unit *u) {
+        assert(u);
+
+        if (UNIT_VTABLE(u)->can_reload)
+                return UNIT_VTABLE(u)->can_reload(u);
+
+        if (unit_has_dependency(u, UNIT_ATOM_PROPAGATES_RELOAD_TO, NULL))
+                return true;
+
+        /* Explicit pointer-to-bool conversion, matching unit_can_start() and unit_can_stop() */
+        return !!UNIT_VTABLE(u)->reload;
+}
+
+/* Returns true if the unit has stop_when_unneeded set, is active with no job of its own, and none of the
+ * units pinning it (UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED) still needs it. */
+bool unit_is_unneeded(Unit *u) {
+        Unit *dependent;
+
+        assert(u);
+
+        if (!u->stop_when_unneeded)
+                return false;
+
+        /* No clean-up while the unit is transitioning, is inactive anyway, or has a job pending. */
+        if (unit_active_state(u) != UNIT_ACTIVE || u->job)
+                return false;
+
+        UNIT_FOREACH_DEPENDENCY(dependent, u, UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED) {
+                /* A dependent unit that has a job queued, is active or transitioning, or is marked for
+                 * restart keeps this unit around. */
+                if (dependent->job ||
+                    !UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(dependent)) ||
+                    unit_will_restart(dependent))
+                        return false;
+        }
+
+        return true;
+}
+
+/* Determines whether the unit should be started because it is currently down while some active unit declared
+ * an Uphold= dependency on it. If 'ret_culprit' is non-NULL it receives the upholding unit on a true result,
+ * and NULL on a false result. */
+bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit) {
+        Unit *upholder;
+
+        assert(u);
+
+        /* Only a unit that is inactive or failed and has no job pending is a candidate. */
+        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job) {
+                UNIT_FOREACH_DEPENDENCY(upholder, u, UNIT_ATOM_START_STEADILY) {
+                        /* Units with a job pending, or which aren't up themselves, don't count */
+                        if (upholder->job || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(upholder)))
+                                continue;
+
+                        if (ret_culprit)
+                                *ret_culprit = upholder;
+                        return true;
+                }
+        }
+
+        if (ret_culprit)
+                *ret_culprit = NULL;
+        return false;
+}
+
+/* Determines whether the unit is bound to another unit that is inactive, i.e. whether it should be stopped
+ * because the other unit is down. If 'ret_culprit' is non-NULL it receives the inactive unit on a true
+ * result, and NULL on a false result. */
+bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit) {
+        Unit *binder;
+
+        assert(u);
+
+        /* No clean-up while the unit is transitioning, is inactive anyway, or has a job pending. */
+        if (unit_active_state(u) == UNIT_ACTIVE && !u->job) {
+                UNIT_FOREACH_DEPENDENCY(binder, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) {
+                        /* Units with a job pending, or which are still up, don't count */
+                        if (binder->job || !UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(binder)))
+                                continue;
+
+                        if (ret_culprit)
+                                *ret_culprit = binder;
+                        return true;
+                }
+        }
+
+        if (ret_culprit)
+                *ret_culprit = NULL;
+        return false;
+}
+
+/* Submits every UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE dependency of 'u' to the queue that processes
+ * StopWhenUnneeded= behaviour. */
+static void check_unneeded_dependencies(Unit *u) {
+        Unit *dep;
+
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE) {
+                unit_submit_to_stop_when_unneeded_queue(dep);
+        }
+}
+
+/* Submits every UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE dependency of 'u' to the queue that processes Uphold=
+ * behaviour. */
+static void check_uphold_dependencies(Unit *u) {
+        Unit *dep;
+
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE) {
+                unit_submit_to_start_when_upheld_queue(dep);
+        }
+}
+
+/* Submits every UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE dependency of 'u' to the queue that processes
+ * BindsTo= stop behaviour. */
+static void check_bound_by_dependencies(Unit *u) {
+        Unit *dep;
+
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE) {
+                unit_submit_to_stop_when_bound_queue(dep);
+        }
+}
+
+/* For a unit that is active or activating, enqueues the jobs its dependencies call for: start jobs for
+ * dependencies that are not ordered before it and are not up yet, and stop jobs for conflicting units that are
+ * still up. Results of manager_add_job() are not checked. */
+static void retroactively_start_dependencies(Unit *u) {
+        Unit *dep;
+
+        assert(u);
+        assert(UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u)));
+
+        /* Requires= + BindsTo= */
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_RETROACTIVE_START_REPLACE) {
+                if (unit_has_dependency(u, UNIT_ATOM_AFTER, dep))
+                        continue;
+                if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(dep)))
+                        continue;
+
+                manager_add_job(u->manager, JOB_START, dep, JOB_REPLACE, NULL, NULL, NULL);
+        }
+
+        /* Wants= */
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_RETROACTIVE_START_FAIL) {
+                if (unit_has_dependency(u, UNIT_ATOM_AFTER, dep))
+                        continue;
+                if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(dep)))
+                        continue;
+
+                manager_add_job(u->manager, JOB_START, dep, JOB_FAIL, NULL, NULL, NULL);
+        }
+
+        /* Conflicts= (and inverse) */
+        UNIT_FOREACH_DEPENDENCY(dep, u, UNIT_ATOM_RETROACTIVE_STOP_ON_START) {
+                if (UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(dep)))
+                        continue;
+
+                manager_add_job(u->manager, JOB_STOP, dep, JOB_REPLACE, NULL, NULL, NULL);
+        }
+}
+
+/* For a unit that is inactive or deactivating, enqueues stop jobs for the units bound to it that are still
+ * up. Results of manager_add_job() are not checked. */
+static void retroactively_stop_dependencies(Unit *u) {
+        Unit *bound;
+
+        assert(u);
+        assert(UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u)));
+
+        /* Pull down units which are bound to us recursively if enabled (BoundBy=) */
+        UNIT_FOREACH_DEPENDENCY(bound, u, UNIT_ATOM_RETROACTIVE_STOP_ON_STOP) {
+                if (UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(bound)))
+                        continue;
+
+                manager_add_job(u->manager, JOB_STOP, bound, JOB_REPLACE, NULL, NULL, NULL);
+        }
+}
+
+/* Acts on the OnFailure= or OnSuccess= dependencies of 'u' (selected by 'atom'): enqueues a start job in
+ * 'job_mode' for each of them. 'dependency_name' is only used in log messages. A failure to enqueue is logged
+ * as a warning and otherwise ignored. Nothing is logged if there are no such dependencies. */
+void unit_start_on_failure(
+                Unit *u,
+                const char *dependency_name,
+                UnitDependencyAtom atom,
+                JobMode job_mode) {
+
+        bool announced = false;
+        int n_attempted = 0;
+        Unit *target;
+
+        assert(u);
+        assert(dependency_name);
+        assert(IN_SET(atom, UNIT_ATOM_ON_SUCCESS, UNIT_ATOM_ON_FAILURE));
+
+        UNIT_FOREACH_DEPENDENCY(target, u, atom) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                int q;
+
+                /* Announce once, right before the first job is enqueued */
+                if (!announced) {
+                        log_unit_info(u, "Triggering %s dependencies.", dependency_name);
+                        announced = true;
+                }
+
+                q = manager_add_job(u->manager, JOB_START, target, job_mode, NULL, &error, NULL);
+                if (q < 0)
+                        log_unit_warning_errno(
+                                        u, q, "Failed to enqueue %s job, ignoring: %s",
+                                        dependency_name, bus_error_message(&error, q));
+
+                /* Counts every attempt, including those that failed to enqueue */
+                n_attempted++;
+        }
+
+        if (announced)
+                log_unit_debug(u, "Triggering %s dependencies done (%i %s).",
+                               dependency_name, n_attempted, n_attempted == 1 ? "job" : "jobs");
+}
+
+/* Invokes the ->trigger_notify() hook, where the unit type provides one, of every unit that 'u' is triggered
+ * by, passing 'u' as the second argument. */
+void unit_trigger_notify(Unit *u) {
+        Unit *trigger;
+
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(trigger, u, UNIT_ATOM_TRIGGERED_BY) {
+                if (!UNIT_VTABLE(trigger)->trigger_notify)
+                        continue;
+
+                UNIT_VTABLE(trigger)->trigger_notify(trigger, u);
+        }
+}
+
+/* Returns LOG_NOTICE if 'condition_notice' holds and 'log_level' is numerically above LOG_NOTICE; otherwise
+ * LOG_INFO if 'condition_info' holds and 'log_level' is numerically above LOG_INFO; otherwise 'log_level'
+ * unchanged. */
+static int raise_level(int log_level, bool condition_info, bool condition_notice) {
+        bool to_notice = condition_notice && log_level > LOG_NOTICE;
+        bool to_info = condition_info && log_level > LOG_INFO;
+
+        return to_notice ? LOG_NOTICE :
+               to_info   ? LOG_INFO :
+                           log_level;
+}
+
+/* Emits one structured log record summarizing the resources consumed by unit 'u': CPU time, memory peak,
+ * memory swap peak, IO and IP accounting, as far as the respective getters report a value. Returns 0 if the
+ * record was logged or if there was nothing to log (no accounting data, or the chosen log level is filtered
+ * out for this unit), and the result of log_oom() if an allocation failed.
+ *
+ * Ownership: the first n_iovec entries of iovec[] and the first n_message_parts entries of message_parts[] are
+ * heap strings that are released at the 'finish' label; rr/wr/igress/egress are released automatically unless
+ * they were handed over to message_parts[] via TAKE_PTR(). */
+static int unit_log_resources(Unit *u) {
+        /* Sized for: 1 CPU field + 2 memory fields + all IP metrics + all IO metrics + the 4 trailing fields
+         * appended right before logging */
+        struct iovec iovec[1 + 2 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4];
+        bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false;
+        _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL;
+        int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */
+        size_t n_message_parts = 0, n_iovec = 0;
+        /* Sized for: 1 CPU part + 2 memory parts + up to 2 IO parts + up to 2 IP parts + terminating NULL */
+        char* message_parts[1 + 2 + 2 + 2 + 1], *t;
+        nsec_t nsec = NSEC_INFINITY;
+        uint64_t memory_peak = UINT64_MAX, memory_swap_peak = UINT64_MAX;
+        int r;
+        /* Structured field names, indexed by accounting metric */
+        const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_IP_INGRESS_BYTES]   = "IP_METRIC_INGRESS_BYTES",
+                [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS",
+                [CGROUP_IP_EGRESS_BYTES]    = "IP_METRIC_EGRESS_BYTES",
+                [CGROUP_IP_EGRESS_PACKETS]  = "IP_METRIC_EGRESS_PACKETS",
+        };
+        const char* const io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_IO_READ_BYTES]       = "IO_METRIC_READ_BYTES",
+                [CGROUP_IO_WRITE_BYTES]      = "IO_METRIC_WRITE_BYTES",
+                [CGROUP_IO_READ_OPERATIONS]  = "IO_METRIC_READ_OPERATIONS",
+                [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS",
+        };
+
+        assert(u);
+
+        /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource
+         * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced
+         * information and the complete data in structured fields. */
+
+        /* The getter's return value is ignored; an untouched NSEC_INFINITY means no CPU data is available */
+        (void) unit_get_cpu_usage(u, &nsec);
+        if (nsec != NSEC_INFINITY) {
+                /* Format the CPU time for inclusion in the structured log message */
+                if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) {
+                        r = log_oom();
+                        goto finish;
+                }
+                iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+                /* Format the CPU time for inclusion in the human language message string */
+                t = strjoin("consumed ", FORMAT_TIMESPAN(nsec / NSEC_PER_USEC, USEC_PER_MSEC), " CPU time");
+                if (!t) {
+                        r = log_oom();
+                        goto finish;
+                }
+
+                message_parts[n_message_parts++] = t;
+
+                log_level = raise_level(log_level,
+                                        nsec > MENTIONWORTHY_CPU_NSEC,
+                                        nsec > NOTICEWORTHY_CPU_NSEC);
+        }
+
+        /* Same pattern for the memory values: UINT64_MAX left in place means no data */
+        (void) unit_get_memory_accounting(u, CGROUP_MEMORY_PEAK, &memory_peak);
+        if (memory_peak != UINT64_MAX) {
+                /* Format peak memory for inclusion in the structured log message */
+                if (asprintf(&t, "MEMORY_PEAK=%" PRIu64, memory_peak) < 0) {
+                        r = log_oom();
+                        goto finish;
+                }
+                iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+                /* Format peak memory for inclusion in the human language message string */
+                t = strjoin(FORMAT_BYTES(memory_peak), " memory peak");
+                if (!t) {
+                        r = log_oom();
+                        goto finish;
+                }
+                message_parts[n_message_parts++] = t;
+        }
+
+        (void) unit_get_memory_accounting(u, CGROUP_MEMORY_SWAP_PEAK, &memory_swap_peak);
+        if (memory_swap_peak != UINT64_MAX) {
+                /* Format peak swap memory for inclusion in the structured log message */
+                if (asprintf(&t, "MEMORY_SWAP_PEAK=%" PRIu64, memory_swap_peak) < 0) {
+                        r = log_oom();
+                        goto finish;
+                }
+                iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+                /* Format peak swap memory for inclusion in the human language message string */
+                t = strjoin(FORMAT_BYTES(memory_swap_peak), " memory swap peak");
+                if (!t) {
+                        r = log_oom();
+                        goto finish;
+                }
+                message_parts[n_message_parts++] = t;
+        }
+
+        for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) {
+                uint64_t value = UINT64_MAX;
+
+                assert(io_fields[k]);
+
+                /* NOTE(review): the third argument is false only for the first metric (k == 0) — presumably
+                 * this lets the later metrics reuse data fetched for the first one; confirm against
+                 * unit_get_io_accounting(). */
+                (void) unit_get_io_accounting(u, k, k > 0, &value);
+                if (value == UINT64_MAX)
+                        continue;
+
+                have_io_accounting = true;
+                if (value > 0)
+                        any_io = true;
+
+                /* Format IO accounting data for inclusion in the structured log message */
+                if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) {
+                        r = log_oom();
+                        goto finish;
+                }
+                iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+                /* Format the IO accounting data for inclusion in the human language message string, but only
+                 * for the bytes counters (and not for the operations counters) */
+                if (k == CGROUP_IO_READ_BYTES) {
+                        assert(!rr);
+                        rr = strjoin("read ", strna(FORMAT_BYTES(value)), " from disk");
+                        if (!rr) {
+                                r = log_oom();
+                                goto finish;
+                        }
+                } else if (k == CGROUP_IO_WRITE_BYTES) {
+                        assert(!wr);
+                        wr = strjoin("written ", strna(FORMAT_BYTES(value)), " to disk");
+                        if (!wr) {
+                                r = log_oom();
+                                goto finish;
+                        }
+                }
+
+                if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES))
+                        log_level = raise_level(log_level,
+                                                value > MENTIONWORTHY_IO_BYTES,
+                                                value > NOTICEWORTHY_IO_BYTES);
+        }
+
+        if (have_io_accounting) {
+                if (any_io) {
+                        /* Hand the strings over to message_parts[], which now owns them */
+                        if (rr)
+                                message_parts[n_message_parts++] = TAKE_PTR(rr);
+                        if (wr)
+                                message_parts[n_message_parts++] = TAKE_PTR(wr);
+
+                } else {
+                        /* All IO counters that reported a value were zero: use one short part instead */
+                        char *k;
+
+                        k = strdup("no IO");
+                        if (!k) {
+                                r = log_oom();
+                                goto finish;
+                        }
+
+                        message_parts[n_message_parts++] = k;
+                }
+        }
+
+        for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+                uint64_t value = UINT64_MAX;
+
+                assert(ip_fields[m]);
+
+                (void) unit_get_ip_accounting(u, m, &value);
+                if (value == UINT64_MAX)
+                        continue;
+
+                have_ip_accounting = true;
+                if (value > 0)
+                        any_traffic = true;
+
+                /* Format IP accounting data for inclusion in the structured log message */
+                if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) {
+                        r = log_oom();
+                        goto finish;
+                }
+                iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+                /* Format the IP accounting data for inclusion in the human language message string, but only for the
+                 * bytes counters (and not for the packets counters) */
+                if (m == CGROUP_IP_INGRESS_BYTES) {
+                        assert(!igress);
+                        igress = strjoin("received ", strna(FORMAT_BYTES(value)), " IP traffic");
+                        if (!igress) {
+                                r = log_oom();
+                                goto finish;
+                        }
+                } else if (m == CGROUP_IP_EGRESS_BYTES) {
+                        assert(!egress);
+                        egress = strjoin("sent ", strna(FORMAT_BYTES(value)), " IP traffic");
+                        if (!egress) {
+                                r = log_oom();
+                                goto finish;
+                        }
+                }
+
+                if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
+                        log_level = raise_level(log_level,
+                                                value > MENTIONWORTHY_IP_BYTES,
+                                                value > NOTICEWORTHY_IP_BYTES);
+        }
+
+        /* This check is here because it is the earliest point following all possible log_level assignments. If
+         * log_level is assigned anywhere after this point, move this check. */
+        if (!unit_log_level_test(u, log_level)) {
+                r = 0;
+                goto finish;
+        }
+
+        if (have_ip_accounting) {
+                if (any_traffic) {
+                        /* Hand the strings over to message_parts[], which now owns them */
+                        if (igress)
+                                message_parts[n_message_parts++] = TAKE_PTR(igress);
+                        if (egress)
+                                message_parts[n_message_parts++] = TAKE_PTR(egress);
+
+                } else {
+                        /* All IP counters that reported a value were zero: use one short part instead */
+                        char *k;
+
+                        k = strdup("no IP traffic");
+                        if (!k) {
+                                r = log_oom();
+                                goto finish;
+                        }
+
+                        message_parts[n_message_parts++] = k;
+                }
+        }
+
+        /* Is there any accounting data available at all? */
+        if (n_iovec == 0) {
+                r = 0;
+                goto finish;
+        }
+
+        if (n_message_parts == 0)
+                t = strjoina("MESSAGE=", u->id, ": Completed.");
+        else {
+                _cleanup_free_ char *joined = NULL;
+
+                /* strv_join() expects a NULL-terminated array */
+                message_parts[n_message_parts] = NULL;
+
+                joined = strv_join(message_parts, ", ");
+                if (!joined) {
+                        r = log_oom();
+                        goto finish;
+                }
+
+                /* Capitalize the first letter so that the summary reads as a sentence */
+                joined[0] = ascii_toupper(joined[0]);
+                t = strjoina("MESSAGE=", u->id, ": ", joined, ".");
+        }
+
+        /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them,
+         * and hence don't increase n_iovec for them */
+        iovec[n_iovec] = IOVEC_MAKE_STRING(t);
+        iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR);
+
+        t = strjoina(u->manager->unit_log_field, u->id);
+        iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t);
+
+        t = strjoina(u->manager->invocation_log_field, u->invocation_id_string);
+        iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t);
+
+        log_unit_struct_iovec(u, log_level, iovec, n_iovec + 4);
+        r = 0;
+
+finish:
+        /* Release only what was heap-allocated: the collected message parts and the counted iovec entries */
+        free_many_charp(message_parts, n_message_parts);
+
+        for (size_t i = 0; i < n_iovec; i++)
+                free(iovec[i].iov_base);
+
+        return r;
+
+}
+
+/* Brings u->on_console in line with unit_needs_console() and, whenever the flag flips, takes or drops a
+ * console reference on the manager accordingly. */
+static void unit_update_on_console(Unit *u) {
+        bool needs_console;
+
+        assert(u);
+
+        needs_console = unit_needs_console(u);
+        if (needs_console == u->on_console)
+                return; /* nothing changed */
+
+        u->on_console = needs_console;
+
+        if (!needs_console)
+                manager_unref_console(u->manager);
+        else
+                manager_ref_console(u->manager);
+}
+
+/* Sends the audit start record for the unit and sets u->in_audit. Does nothing for unit types whose
+ * audit_start_message_type is not positive. */
+static void unit_emit_audit_start(Unit *u) {
+        assert(u);
+
+        if (UNIT_VTABLE(u)->audit_start_message_type > 0) {
+                /* Write audit record if we have just finished starting up */
+                manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_start_message_type, /* success= */ true);
+                u->in_audit = true;
+        }
+}
+
+/* Sends the audit stop record for the unit; 'state' is the state the unit ended up in, and anything other
+ * than UNIT_INACTIVE is reported as a failure. If no start record was written before, one is written now so
+ * that records come in pairs. Does nothing for unit types whose audit_start_message_type is not positive. */
+static void unit_emit_audit_stop(Unit *u, UnitActiveState state) {
+        assert(u);
+
+        /* NOTE(review): only the start message type is tested here although the stop message type is used
+         * below as well — confirm the two are always set together. */
+        if (UNIT_VTABLE(u)->audit_start_message_type <= 0)
+                return;
+
+        if (!u->in_audit) {
+                /* Hmm, no start record was written so far, write it now, so that we always have a nice pair */
+                manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_start_message_type, /* success= */ state == UNIT_INACTIVE);
+
+                if (state == UNIT_INACTIVE)
+                        manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_stop_message_type, /* success= */ true);
+
+                return;
+        }
+
+        /* Write audit record if we have just finished shutting down */
+        manager_send_unit_audit(u->manager, u, UNIT_VTABLE(u)->audit_stop_message_type, /* success= */ state == UNIT_INACTIVE);
+        u->in_audit = false;
+}
+
+/* Matches the unit's new active state 'ns' against job 'j': finishes the job if the new state completes it,
+ * or if the new state contradicts a running job. 'reload_success' decides the result of a finished reload
+ * job. Returns true if the state change was unexpected for a running job, false otherwise. Once
+ * job_finish_and_invalidate() has been called, 'j' is not touched anymore. */
+static bool unit_process_job(Job *j, UnitActiveState ns, bool reload_success) {
+        bool unexpected = false, running;
+
+        assert(j);
+
+        /* The unit reached a different state, so a job that is still waiting (because it failed previously
+         * due to EAGAIN) gets another chance to run. */
+        if (j->state == JOB_WAITING)
+                job_add_to_run_queue(j);
+
+        running = j->state == JOB_RUNNING;
+
+        /* Does the unit's new state constitute a finished job, or does it contradict a running job so that
+         * the job has to be invalidated? */
+
+        switch (j->type) {
+
+        case JOB_START:
+        case JOB_VERIFY_ACTIVE:
+
+                if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) {
+                        job_finish_and_invalidate(j, JOB_DONE, true, false);
+                        break;
+                }
+
+                if (!running || ns == UNIT_ACTIVATING)
+                        break;
+
+                unexpected = true;
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false);
+
+                break;
+
+        case JOB_RELOAD:
+        case JOB_RELOAD_OR_START:
+        case JOB_TRY_RELOAD:
+
+                if (!running)
+                        break;
+
+                if (ns == UNIT_ACTIVE) {
+                        job_finish_and_invalidate(j, reload_success ? JOB_DONE : JOB_FAILED, true, false);
+                        break;
+                }
+
+                if (IN_SET(ns, UNIT_ACTIVATING, UNIT_RELOADING))
+                        break;
+
+                unexpected = true;
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false);
+
+                break;
+
+        case JOB_STOP:
+        case JOB_RESTART:
+        case JOB_TRY_RESTART:
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+                        job_finish_and_invalidate(j, JOB_DONE, true, false);
+                        break;
+                }
+
+                if (!running || ns == UNIT_DEACTIVATING)
+                        break;
+
+                unexpected = true;
+                job_finish_and_invalidate(j, JOB_FAILED, true, false);
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return unexpected;
+}
+
+void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success) {
+        const char *reason; /* message handed to emergency_action() further down */
+        Manager *m;
+
+        assert(u);
+        assert(os < _UNIT_ACTIVE_STATE_MAX);
+        assert(ns < _UNIT_ACTIVE_STATE_MAX);
+
+        /* Reacts to the unit 'u' having moved from active state 'os' to active state 'ns': updates
+         * timestamps and markers, feeds the change into the unit's job (if any), emits audit/plymouth
+         * records, runs OnFailure=/OnSuccess= handling and the failure/success actions, and finally puts
+         * the unit on the various deferred queues. 'reload_success' is only passed on to
+         * unit_process_job(), where it determines the result of a finished reload job.
+         *
+         * Note that this is called for all low-level state changes, even if they might map to the same high-level
+         * UnitActiveState! That means that ns == os is an expected behavior here. For example: if a mount point is
+         * remounted this function will be called too! */
+
+        m = u->manager;
+
+        /* Let's enqueue the change signal early. In case this unit has a job associated we want that this unit is in
+         * the bus queue, so that any job change signal queued will force out the unit change signal first. */
+        unit_add_to_dbus_queue(u);
+
+        /* Update systemd-oomd on the property/state change */
+        if (os != ns) {
+                /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop
+                 * monitoring.
+                 * Also send an update whenever the unit goes active; this is to handle a case where an override file
+                 * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to
+                 * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't
+                 * have the information on the property. Thus, indiscriminately send an update. */
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns) || UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        (void) manager_varlink_send_managed_oom_update(u);
+        }
+
+        /* Update timestamps for state changes (skipped while the manager is reloading) */
+        if (!MANAGER_IS_RELOADING(m)) {
+                dual_timestamp_now(&u->state_change_timestamp);
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(os) && !UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        u->inactive_exit_timestamp = u->state_change_timestamp;
+                else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        u->inactive_enter_timestamp = u->state_change_timestamp;
+
+                if (!UNIT_IS_ACTIVE_OR_RELOADING(os) && UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        u->active_enter_timestamp = u->state_change_timestamp;
+                else if (UNIT_IS_ACTIVE_OR_RELOADING(os) && !UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        u->active_exit_timestamp = u->state_change_timestamp;
+        }
+
+        /* Keep track of failed units */
+        (void) manager_update_failed_units(m, u, ns == UNIT_FAILED);
+
+        /* Make sure the cgroup and state files are always removed when we become inactive. The
+         * needs-reload/needs-restart markers are cleared at the same time; merely entering the reloading
+         * state clears the needs-reload marker only. */
+        if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+                SET_FLAG(u->markers,
+                         (1u << UNIT_MARKER_NEEDS_RELOAD)|(1u << UNIT_MARKER_NEEDS_RESTART),
+                         false);
+                unit_prune_cgroup(u);
+                unit_unlink_state_files(u);
+        } else if (ns != os && ns == UNIT_RELOADING)
+                SET_FLAG(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD, false);
+
+        unit_update_on_console(u);
+
+        if (!MANAGER_IS_RELOADING(m)) {
+                bool unexpected;
+
+                /* Let's propagate state changes to the job */
+                if (u->job)
+                        unexpected = unit_process_job(u->job, ns, reload_success);
+                else
+                        unexpected = true; /* no job at all, hence no job can have requested this change */
+
+                /* If this state change happened without being requested by a job, then let's retroactively start or
+                 * stop dependencies. We skip that step when deserializing, since we don't want to create any
+                 * additional jobs just because something is already activated. */
+
+                if (unexpected) {
+                        if (UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_ACTIVE_OR_ACTIVATING(ns))
+                                retroactively_start_dependencies(u);
+                        else if (UNIT_IS_ACTIVE_OR_ACTIVATING(os) && UNIT_IS_INACTIVE_OR_DEACTIVATING(ns))
+                                retroactively_stop_dependencies(u);
+                }
+
+                if (ns != os && ns == UNIT_FAILED) {
+                        log_unit_debug(u, "Unit entered failed state.");
+                        unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode);
+                }
+
+                if (UNIT_IS_ACTIVE_OR_RELOADING(ns) && !UNIT_IS_ACTIVE_OR_RELOADING(os)) {
+                        /* This unit just finished starting up */
+
+                        unit_emit_audit_start(u);
+                        manager_send_unit_plymouth(m, u);
+                }
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) {
+                        /* This unit just stopped/failed. */
+
+                        unit_emit_audit_stop(u, ns);
+                        unit_log_resources(u);
+                }
+
+                /* A clean transition to inactive from any state other than failed/inactive/maintenance
+                 * triggers the OnSuccess= units; the same helper as for OnFailure= is used. */
+                if (ns == UNIT_INACTIVE && !IN_SET(os, UNIT_FAILED, UNIT_INACTIVE, UNIT_MAINTENANCE))
+                        unit_start_on_failure(u, "OnSuccess=", UNIT_ATOM_ON_SUCCESS, u->on_success_job_mode);
+        }
+
+        manager_recheck_journal(m);
+        manager_recheck_dbus(m);
+
+        unit_trigger_notify(u);
+
+        if (!MANAGER_IS_RELOADING(m)) { /* run the unit's failure_action or success_action, if applicable */
+                if (os != UNIT_FAILED && ns == UNIT_FAILED) {
+                        reason = strjoina("unit ", u->id, " failed");
+                        emergency_action(m, u->failure_action, 0, u->reboot_arg, unit_failure_action_exit_status(u), reason);
+                } else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && ns == UNIT_INACTIVE) {
+                        reason = strjoina("unit ", u->id, " succeeded");
+                        emergency_action(m, u->success_action, 0, u->reboot_arg, unit_success_action_exit_status(u), reason);
+                }
+        }
+
+        /* And now, add the unit or depending units to various queues that will act on the new situation if
+         * needed. These queues generally check for continuous state changes rather than events (like most of
+         * the state propagation above), and do work deferred instead of instantly, since they typically
+         * don't want to run during reloading, and usually involve checking combined state of multiple units
+         * at once. */
+
+        if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+                /* Stop unneeded units and bound-by units regardless if going down was expected or not */
+                check_unneeded_dependencies(u);
+                check_bound_by_dependencies(u);
+
+                /* Maybe someone wants us to remain up? */
+                unit_submit_to_start_when_upheld_queue(u);
+
+                /* Maybe the unit should be GC'ed now? */
+                unit_add_to_gc_queue(u);
+
+                /* Maybe we can release some resources now? */
+                unit_submit_to_release_resources_queue(u);
+        }
+
+        if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) {
+                /* Start uphold units regardless if going up was expected or not */
+                check_uphold_dependencies(u);
+
+                /* Maybe we finished startup and are now ready for being stopped because unneeded? */
+                unit_submit_to_stop_when_unneeded_queue(u);
+
+                /* Maybe we finished startup, but something we needed has vanished? Let's die then. (This happens
+                 * when something BindsTo= to a Type=oneshot unit, as these units go directly from starting to
+                 * inactive, without ever entering started.) */
+                unit_submit_to_stop_when_bound_queue(u);
+        }
+}
+
+int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive) {
+        _cleanup_(pidref_freep) PidRef *pid_dup = NULL;
+        int r;
+
+        /* Adds a specific PID to the set of PIDs this unit watches.
+         *
+         * A private copy of 'pid' is stored in u->pids and then registered with the manager: first in
+         * manager->watch_pids (key → single Unit*), and, if that key is already taken there (-EEXIST),
+         * in manager->watch_pids_more (key → NULL-terminated Unit* array).
+         *
+         * Returns 0 if the PID was already watched by this unit or was added to the array table, a
+         * negative errno-style value on failure, and otherwise whatever hashmap_ensure_put() reported
+         * for the insertion into the first table. */
+
+        assert(u);
+        assert(pidref_is_set(pid));
+
+        /* Caller might be sure that this PID belongs to this unit only. Let's take this
+         * opportunity to remove any stalled references to this PID as they can be created
+         * easily (when watching a process which is not our direct child). */
+        if (exclusive)
+                manager_unwatch_pidref(u->manager, pid);
+
+        if (set_contains(u->pids, pid)) /* early exit if already being watched */
+                return 0;
+
+        r = pidref_dup(pid, &pid_dup);
+        if (r < 0)
+                return r;
+
+        /* First, insert into the set of PIDs maintained by the unit */
+        r = set_ensure_put(&u->pids, &pidref_hash_ops_free, pid_dup);
+        if (r < 0)
+                return r;
+
+        pid = TAKE_PTR(pid_dup); /* continue with our copy now that we have installed it properly in our set */
+
+        /* NOTE(review): from here on the copy stays in u->pids even if one of the steps below fails;
+         * nothing in this function removes it again — confirm callers cope with that. */
+
+        /* Second, insert it into the simple global table, see if that works */
+        r = hashmap_ensure_put(&u->manager->watch_pids, &pidref_hash_ops_free, pid, u);
+        if (r != -EEXIST)
+                return r;
+
+        /* OK, the key is already assigned to a different unit. That's fine, then add us via the second
+         * hashmap that points to an array. */
+
+        PidRef *old_pid = NULL; /* key object currently used for this PID in watch_pids_more, if any */
+        Unit **array = hashmap_get2(u->manager->watch_pids_more, pid, (void**) &old_pid);
+
+        /* Count entries in array */
+        size_t n = 0;
+        for (; array && array[n]; n++)
+                ;
+
+        /* Allocate a new array, with room for one more entry plus the trailing NULL */
+        _cleanup_free_ Unit **new_array = new(Unit*, n + 2);
+        if (!new_array)
+                return -ENOMEM;
+
+        /* Append us to the end */
+        memcpy_safe(new_array, array, sizeof(Unit*) * n);
+        new_array[n] = u;
+        new_array[n+1] = NULL;
+
+        /* Make sure the hashmap is allocated */
+        r = hashmap_ensure_allocated(&u->manager->watch_pids_more, &pidref_hash_ops_free);
+        if (r < 0)
+                return r;
+
+        /* Add or replace the old array; an existing key object is reused, otherwise our copy becomes the key */
+        r = hashmap_replace(u->manager->watch_pids_more, old_pid ?: pid, new_array);
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(new_array); /* Now part of the hash table */
+        free(array);         /* Which means we can now delete the old version */
+        return 0;
+}
+
+/* Numeric-PID front end for unit_watch_pidref(): builds a temporary PidRef for 'pid' via pidref_set_pid()
+ * and hands it on together with 'exclusive'. Returns the negative result of pidref_set_pid() on failure,
+ * otherwise whatever unit_watch_pidref() returns. */
+int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) {
+        _cleanup_(pidref_done) PidRef ref = PIDREF_NULL;
+        int k;
+
+        assert(u);
+        assert(pid_is_valid(pid));
+
+        k = pidref_set_pid(&ref, pid);
+        if (k >= 0)
+                k = unit_watch_pidref(u, &ref, exclusive);
+
+        return k;
+}
+
+void unit_unwatch_pidref(Unit *u, PidRef *pid) {
+        assert(u);
+        assert(pidref_is_set(pid));
+
+        /* Stops this unit from watching 'pid': drops the PID from u->pids and removes the unit from
+         * whichever of the manager's two lookup tables (watch_pids or watch_pids_more) refers to it.
+         *
+         * Remove from the set we maintain for this unit. (And destroy the returned pid eventually) */
+        _cleanup_(pidref_freep) PidRef *pid1 = set_remove(u->pids, pid);
+        if (!pid1)
+                return; /* Early exit if this PID was never watched by us */
+
+        /* First let's drop the unit from the simple hash table, if it is included there */
+        PidRef *pid2 = NULL; /* key object stored in watch_pids for this PID, if any */
+        Unit *uu = hashmap_get2(u->manager->watch_pids, pid, (void**) &pid2);
+
+        /* Quick validation: iff we are in the watch_pids table then the PidRef object must be the same as in our local pids set */
+        assert((uu == u) == (pid1 == pid2));
+
+        if (uu == u)
+                /* OK, we are in the first table. Let's remove it there then, and we are done already. */
+                assert_se(hashmap_remove_value(u->manager->watch_pids, pid2, uu));
+        else {
+                /* We weren't in the first table, then let's consult the 2nd table that points to an array */
+                PidRef *pid3 = NULL; /* key object stored in watch_pids_more for this PID, if any */
+                Unit **array = hashmap_get2(u->manager->watch_pids_more, pid, (void**) &pid3);
+
+                /* Let's iterate through the array, dropping our own entry (compacts in place: n reads, m writes) */
+                size_t m = 0, n = 0;
+                for (; array && array[n]; n++)
+                        if (array[n] != u)
+                                array[m++] = array[n];
+                if (n == m)
+                        return; /* Not there */
+
+                array[m] = NULL; /* set trailing NULL marker on the new end */
+
+                if (m == 0) {
+                        /* The array is now empty, remove the entire entry */
+                        assert_se(hashmap_remove_value(u->manager->watch_pids_more, pid3, array));
+                        free(array);
+                } else {
+                        /* The array is not empty, but let's make sure the entry is not keyed by the PidRef
+                         * we will delete, but by the PidRef object of the Unit that is now first in the
+                         * array. */
+
+                        PidRef *new_pid3 = ASSERT_PTR(set_get(array[0]->pids, pid));
+                        assert_se(hashmap_replace(u->manager->watch_pids_more, new_pid3, array) >= 0);
+                }
+        }
+}
+
+/* Numeric-PID front end for unit_unwatch_pidref(): wraps 'pid' in a temporary PidRef built with
+ * PIDREF_MAKE_FROM_PID() and stops this unit from watching it. */
+void unit_unwatch_pid(Unit *u, pid_t pid) {
+        /* Plain call rather than 'return f(...)': both functions return void, and ISO C does not allow a
+         * return statement with an expression in a function whose return type is void. */
+        unit_unwatch_pidref(u, &PIDREF_MAKE_FROM_PID(pid));
+}
+
+/* Stops watching every PID of this unit: unwatches the first element of u->pids until the set is empty,
+ * then frees the set itself. */
+void unit_unwatch_all_pids(Unit *u) {
+        assert(u);
+
+        for (;;) {
+                if (set_isempty(u->pids))
+                        break;
+
+                unit_unwatch_pidref(u, set_first(u->pids));
+        }
+
+        u->pids = set_free(u->pids);
+}
+
+/* Cleans dead PIDs from our list: every watched PID for which pidref_is_unwaited() returns <= 0 is
+ * unwatched, except for the unit's main and control PIDs, which are always left alone. */
+static void unit_tidy_watch_pids(Unit *u) {
+        assert(u);
+
+        PidRef *main_pid = unit_main_pid(u);
+        PidRef *control_pid = unit_control_pid(u);
+        PidRef *p;
+
+        SET_FOREACH(p, u->pids) {
+                bool keep = pidref_equal(main_pid, p) || pidref_equal(control_pid, p);
+
+                if (!keep && pidref_is_unwaited(p) <= 0)
+                        unit_unwatch_pidref(u, p);
+        }
+}
+
+/* Event source callback installed by unit_enqueue_rewatch_pids(); 'userdata' is the Unit. Tidies the
+ * watched PID set, watches all PIDs again, and finally synthesizes a cgroup-empty event. Always returns 0. */
+static int on_rewatch_pids_event(sd_event_source *s, void *userdata) {
+        Unit *unit;
+
+        unit = ASSERT_PTR(userdata);
+        assert(s);
+
+        unit_tidy_watch_pids(unit);
+        unit_watch_all_pids(unit);
+
+        /* If the PID set is empty now, then let's finish this off. */
+        unit_synthesize_cgroup_empty_event(unit);
+
+        return 0;
+}
+
+/* Arms a one-shot, idle-priority deferred event that runs on_rewatch_pids_event() for this unit, creating
+ * the event source on first use. Returns -ENOENT if the unit has no cgroup path, 0 without doing anything
+ * when cg_unified_controller() reports a positive value, a negative errno-style value on failure, and 0
+ * once the event source is enabled. */
+int unit_enqueue_rewatch_pids(Unit *u) {
+        int k;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        k = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (k != 0)
+                return k < 0 ? k : 0; /* Either an error, or unified, where we can use proper notifications */
+
+        /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and
+         * subscribe to new PIDs that might have appeared. This is done in a delayed job because the work
+         * might be quite slow, as it involves issuing kill(pid, 0) on all processes we watch. */
+
+        if (!u->rewatch_pids_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
+
+                k = sd_event_add_defer(u->manager->event, &source, on_rewatch_pids_event, u);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to allocate event source for tidying watched PIDs: %m");
+
+                k = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_IDLE);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to adjust priority of event source for tidying watched PIDs: %m");
+
+                (void) sd_event_source_set_description(source, "tidy-watch-pids");
+
+                u->rewatch_pids_event_source = TAKE_PTR(source);
+        }
+
+        k = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT);
+        if (k < 0)
+                return log_error_errno(k, "Failed to enable event source for tidying watched PIDs: %m");
+
+        return 0;
+}
+
+/* Counterpart of unit_enqueue_rewatch_pids(): if the unit has a rewatch event source, switch it off (a
+ * failure to do so is only logged as a warning) and drop our reference to it. */
+void unit_dequeue_rewatch_pids(Unit *u) {
+        assert(u);
+
+        if (u->rewatch_pids_event_source) {
+                int k = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF);
+                if (k < 0)
+                        log_warning_errno(k, "Failed to disable event source for tidying watched PIDs, ignoring: %m");
+
+                u->rewatch_pids_event_source = sd_event_source_disable_unref(u->rewatch_pids_event_source);
+        }
+}
+
+/* Tells whether a job of type 'j' may be enqueued for unit 'u' at all. */
+bool unit_job_is_applicable(Unit *u, JobType j) {
+        assert(u);
+        assert(j >= 0 && j < _JOB_TYPE_MAX);
+
+        /* Note that unit_can_start() is deliberately not checked for these. That's because .device units
+         * and suchlike are not startable by us but may appear due to external events, and it thus makes
+         * sense to permit enqueuing jobs for them. */
+        if (IN_SET(j, JOB_VERIFY_ACTIVE, JOB_START, JOB_NOP))
+                return true;
+
+        /* Similar as above. However, perpetual units can never be stopped (neither explicitly nor due to
+         * external events), hence it makes no sense to permit enqueuing such a request either. */
+        if (j == JOB_STOP)
+                return !u->perpetual;
+
+        if (IN_SET(j, JOB_RESTART, JOB_TRY_RESTART))
+                return unit_can_stop(u) && unit_can_start(u);
+
+        if (IN_SET(j, JOB_RELOAD, JOB_TRY_RELOAD))
+                return unit_can_reload(u);
+
+        if (j == JOB_RELOAD_OR_START)
+                return unit_can_reload(u) && unit_can_start(u);
+
+        assert_not_reached();
+}
+
+/* Returns the inner hashmap stored in u->dependencies for dependency type 'd', creating and registering an
+ * empty one if none exists yet. Returns NULL if the new hashmap cannot be allocated or registered. */
+static Hashmap *unit_get_dependency_hashmap_per_type(Unit *u, UnitDependency d) {
+        Hashmap *existing;
+
+        assert(u);
+        assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+
+        existing = hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d));
+        if (existing)
+                return existing;
+
+        /* Nothing there yet for this type: set up a fresh hashmap and hook it into the top-level one. */
+        _cleanup_hashmap_free_ Hashmap *fresh = hashmap_new(NULL);
+        if (!fresh)
+                return NULL;
+
+        if (hashmap_ensure_put(&u->dependencies, NULL, UNIT_DEPENDENCY_TO_PTR(d), fresh) < 0)
+                return NULL;
+
+        return TAKE_PTR(fresh);
+}
+
+typedef enum NotifyDependencyFlags { /* bit flags reported by unit_add_dependency_impl() */
+        NOTIFY_DEPENDENCY_UPDATE_FROM = 1 << 0, /* the entry stored on the unit the dependency was added to changed */
+        NOTIFY_DEPENDENCY_UPDATE_TO   = 1 << 1, /* the inverse entry stored on the other unit changed */
+} NotifyDependencyFlags;
+
+static int unit_add_dependency_impl(
+                Unit *u,
+                UnitDependency d,
+                Unit *other,
+                UnitDependencyMask mask) {
+        /* Records a dependency of type 'd' from 'u' on 'other', plus the inverse dependency (looked up in
+         * inverse_table) from 'other' on 'u', OR-ing 'mask' into the stored info of both entries.
+         *
+         * Returns a negative errno-style value on failure, otherwise a combination of
+         * NotifyDependencyFlags telling which of the two entries actually changed (0 if neither did).
+         */
+        static const UnitDependency inverse_table[_UNIT_DEPENDENCY_MAX] = {
+                [UNIT_REQUIRES]               = UNIT_REQUIRED_BY,
+                [UNIT_REQUISITE]              = UNIT_REQUISITE_OF,
+                [UNIT_WANTS]                  = UNIT_WANTED_BY,
+                [UNIT_BINDS_TO]               = UNIT_BOUND_BY,
+                [UNIT_PART_OF]                = UNIT_CONSISTS_OF,
+                [UNIT_UPHOLDS]                = UNIT_UPHELD_BY,
+                [UNIT_REQUIRED_BY]            = UNIT_REQUIRES,
+                [UNIT_REQUISITE_OF]           = UNIT_REQUISITE,
+                [UNIT_WANTED_BY]              = UNIT_WANTS,
+                [UNIT_BOUND_BY]               = UNIT_BINDS_TO,
+                [UNIT_CONSISTS_OF]            = UNIT_PART_OF,
+                [UNIT_UPHELD_BY]              = UNIT_UPHOLDS,
+                [UNIT_CONFLICTS]              = UNIT_CONFLICTED_BY,
+                [UNIT_CONFLICTED_BY]          = UNIT_CONFLICTS,
+                [UNIT_BEFORE]                 = UNIT_AFTER,
+                [UNIT_AFTER]                  = UNIT_BEFORE,
+                [UNIT_ON_SUCCESS]             = UNIT_ON_SUCCESS_OF,
+                [UNIT_ON_SUCCESS_OF]          = UNIT_ON_SUCCESS,
+                [UNIT_ON_FAILURE]             = UNIT_ON_FAILURE_OF,
+                [UNIT_ON_FAILURE_OF]          = UNIT_ON_FAILURE,
+                [UNIT_TRIGGERS]               = UNIT_TRIGGERED_BY,
+                [UNIT_TRIGGERED_BY]           = UNIT_TRIGGERS,
+                [UNIT_PROPAGATES_RELOAD_TO]   = UNIT_RELOAD_PROPAGATED_FROM,
+                [UNIT_RELOAD_PROPAGATED_FROM] = UNIT_PROPAGATES_RELOAD_TO,
+                [UNIT_PROPAGATES_STOP_TO]     = UNIT_STOP_PROPAGATED_FROM,
+                [UNIT_STOP_PROPAGATED_FROM]   = UNIT_PROPAGATES_STOP_TO,
+                [UNIT_JOINS_NAMESPACE_OF]     = UNIT_JOINS_NAMESPACE_OF, /* symmetric! 👓 */
+                [UNIT_REFERENCES]             = UNIT_REFERENCED_BY,
+                [UNIT_REFERENCED_BY]          = UNIT_REFERENCES,
+                [UNIT_IN_SLICE]               = UNIT_SLICE_OF,
+                [UNIT_SLICE_OF]               = UNIT_IN_SLICE,
+        };
+
+        Hashmap *u_deps, *other_deps;
+        UnitDependencyInfo u_info, u_info_old, other_info, other_info_old;
+        NotifyDependencyFlags flags = 0;
+        int r;
+
+        assert(u);
+        assert(other);
+        assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+        assert(inverse_table[d] >= 0 && inverse_table[d] < _UNIT_DEPENDENCY_MAX);
+        assert(mask > 0 && mask < _UNIT_DEPENDENCY_MASK_FULL);
+
+        /* Ensure the following two hashmaps for each unit exist:
+         * - the top-level dependency hashmap that maps UnitDependency → Hashmap(Unit* → UnitDependencyInfo),
+         * - the inner hashmap, that maps Unit* → UnitDependencyInfo, for the specified dependency type. */
+        u_deps = unit_get_dependency_hashmap_per_type(u, d);
+        if (!u_deps)
+                return -ENOMEM;
+
+        other_deps = unit_get_dependency_hashmap_per_type(other, inverse_table[d]);
+        if (!other_deps)
+                return -ENOMEM;
+
+        /* Save the original dependency info. (The info is kept directly in the hashmap's value pointer,
+         * accessed through the .data member.) */
+        u_info.data = u_info_old.data = hashmap_get(u_deps, other);
+        other_info.data = other_info_old.data = hashmap_get(other_deps, u);
+
+        /* Update dependency info. */
+        u_info.origin_mask |= mask;
+        other_info.destination_mask |= mask;
+
+        /* Save updated dependency info. Nothing is written (and no flag is set) for an entry whose info
+         * did not change. */
+        if (u_info.data != u_info_old.data) {
+                r = hashmap_replace(u_deps, other, u_info.data);
+                if (r < 0)
+                        return r;
+
+                flags = NOTIFY_DEPENDENCY_UPDATE_FROM;
+        }
+
+        if (other_info.data != other_info_old.data) {
+                r = hashmap_replace(other_deps, u, other_info.data);
+                if (r < 0) {
+                        if (u_info.data != u_info_old.data) {
+                                /* Restore the old dependency. */
+                                if (u_info_old.data)
+                                        (void) hashmap_update(u_deps, other, u_info_old.data);
+                                else
+                                        hashmap_remove(u_deps, other);
+                        }
+                        return r;
+                }
+
+                flags |= NOTIFY_DEPENDENCY_UPDATE_TO;
+        }
+
+        return flags; /* non-negative: NotifyDependencyFlags bit mask */
+}
+
+int unit_add_dependency(
+                Unit *u,
+                UnitDependency d,
+                Unit *other,
+                bool add_reference,
+                UnitDependencyMask mask) {
+        /* Adds a dependency of type 'd' from 'u' on 'other' (both resolved through unit_follow_merge()
+         * first), optionally together with a UNIT_REFERENCES dependency when 'add_reference' is true.
+         *
+         * Returns a negative errno-style value on failure, 1 if any dependency entry was changed, and 0
+         * if nothing changed or the request was dropped/ignored by one of the checks below.
+         */
+        UnitDependencyAtom a;
+        int r;
+
+        /* Helper to know whether sending a notification is necessary or not: if the dependency is already
+         * there, no need to notify! */
+        NotifyDependencyFlags notify_flags;
+
+        assert(u);
+        assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+        assert(other);
+
+        u = unit_follow_merge(u);
+        other = unit_follow_merge(other);
+        a = unit_dependency_to_atom(d);
+        assert(a >= 0);
+
+        /* We won't allow dependencies on ourselves. We will not consider them an error however. */
+        if (u == other) {
+                if (unit_should_warn_about_dependency(d))
+                        log_unit_warning(u, "Dependency %s=%s is dropped.",
+                                         unit_dependency_to_string(d), u->id);
+                return 0;
+        }
+
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        /* Note that ordering a device unit after a unit is permitted since it allows to start its job
+         * running timeout at a specific time. */
+        if (FLAGS_SET(a, UNIT_ATOM_BEFORE) && other->type == UNIT_DEVICE) {
+                log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id);
+                return 0;
+        }
+
+        if (FLAGS_SET(a, UNIT_ATOM_ON_FAILURE) && !UNIT_VTABLE(u)->can_fail) {
+                log_unit_warning(u, "Requested dependency OnFailure=%s ignored (%s units cannot fail).", other->id, unit_type_to_string(u->type));
+                return 0;
+        }
+
+        /* The checks above merely ignore the request; the ones below refuse it with -EINVAL. */
+        if (FLAGS_SET(a, UNIT_ATOM_TRIGGERS) && !UNIT_VTABLE(u)->can_trigger)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency Triggers=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(u->type));
+        if (FLAGS_SET(a, UNIT_ATOM_TRIGGERED_BY) && !UNIT_VTABLE(other)->can_trigger)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency TriggeredBy=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(other->type));
+
+        if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && other->type != UNIT_SLICE)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency Slice=%s refused (%s is not a slice unit).", other->id, other->id);
+        if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && u->type != UNIT_SLICE)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency SliceOf=%s refused (%s is not a slice unit).", other->id, u->id);
+
+        if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && !UNIT_HAS_CGROUP_CONTEXT(u))
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency Slice=%s refused (%s is not a cgroup unit).", other->id, u->id);
+
+        if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && !UNIT_HAS_CGROUP_CONTEXT(other))
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency SliceOf=%s refused (%s is not a cgroup unit).", other->id, other->id);
+
+        r = unit_add_dependency_impl(u, d, other, mask);
+        if (r < 0)
+                return r;
+        notify_flags = r;
+
+        if (add_reference) {
+                r = unit_add_dependency_impl(u, UNIT_REFERENCES, other, mask);
+                if (r < 0)
+                        return r;
+                notify_flags |= r;
+        }
+
+        /* Queue a D-Bus change signal only for the side(s) whose dependency data really changed. */
+        if (FLAGS_SET(notify_flags, NOTIFY_DEPENDENCY_UPDATE_FROM))
+                unit_add_to_dbus_queue(u);
+        if (FLAGS_SET(notify_flags, NOTIFY_DEPENDENCY_UPDATE_TO))
+                unit_add_to_dbus_queue(other);
+
+        return notify_flags != 0;
+}
+
+/* Adds up to two dependencies ('d' and 'e') from 'u' on 'other' via unit_add_dependency(); a negative type
+ * is skipped, but at least one of the two must be non-negative. Stops at and returns the first negative
+ * result; otherwise returns 1 if either call returned a positive value and 0 if not. */
+int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask) {
+        bool changed = false;
+        int k;
+
+        assert(u);
+        assert(d >= 0 || e >= 0);
+
+        if (d >= 0) {
+                k = unit_add_dependency(u, d, other, add_reference, mask);
+                if (k < 0)
+                        return k;
+
+                changed = k > 0;
+        }
+
+        if (e >= 0) {
+                k = unit_add_dependency(u, e, other, add_reference, mask);
+                if (k < 0)
+                        return k;
+
+                changed = changed || k > 0;
+        }
+
+        return changed;
+}
+
+/* Resolves 'name' in the context of unit 'u'. If 'name' is not a valid template name it is passed through
+ * unchanged: *ret is set to 'name' and *buf to NULL. Otherwise the template is instantiated with
+ * u->instance, or, if that is unset, with the prefix of u->id; the newly allocated result is stored in
+ * *buf and *ret points to it. Returns 0 on success and a negative errno-style value on failure. */
+static int resolve_template(Unit *u, const char *name, char **buf, const char **ret) {
+        _cleanup_free_ char *prefix = NULL;
+        const char *instance;
+        int k;
+
+        assert(u);
+        assert(name);
+        assert(buf);
+        assert(ret);
+
+        if (!unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+                *buf = NULL;
+                *ret = name;
+                return 0;
+        }
+
+        instance = u->instance;
+        if (!instance) {
+                /* No instance set on the unit: fall back to the prefix of the unit's own name. */
+                k = unit_name_to_prefix(u->id, &prefix);
+                if (k < 0)
+                        return k;
+
+                instance = prefix;
+        }
+
+        k = unit_name_replace_instance(name, instance, buf);
+        if (k < 0)
+                return k;
+
+        *ret = *buf;
+        return 0;
+}
+
+/* Like unit_add_dependency(), but takes the other unit by name: the name is run through
+ * resolve_template(), the unit is loaded with manager_load_unit(), and the dependency is then added.
+ * Returns 0 without loading anything if the manager runs with MANAGER_TEST_RUN_IGNORE_DEPENDENCIES. */
+int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask) {
+        _cleanup_free_ char *resolved = NULL;
+        Unit *target;
+        int k;
+
+        assert(u);
+        assert(name);
+
+        k = resolve_template(u, name, &resolved, &name);
+        if (k < 0)
+                return k;
+
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        k = manager_load_unit(u->manager, name, NULL, NULL, &target);
+        return k < 0 ? k : unit_add_dependency(u, d, target, add_reference, mask);
+}
+
+/* Like unit_add_two_dependencies(), but takes the other unit by name: the name is run through
+ * resolve_template(), the unit is loaded with manager_load_unit(), and both dependencies are then added.
+ * Returns 0 without loading anything if the manager runs with MANAGER_TEST_RUN_IGNORE_DEPENDENCIES. */
+int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask) {
+        _cleanup_free_ char *resolved = NULL;
+        Unit *target;
+        int k;
+
+        assert(u);
+        assert(name);
+
+        k = resolve_template(u, name, &resolved, &name);
+        if (k < 0)
+                return k;
+
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        k = manager_load_unit(u->manager, name, NULL, NULL, &target);
+        return k < 0 ? k : unit_add_two_dependencies(u, d, e, target, add_reference, mask);
+}
+
+int set_unit_path(const char *p) {
+        /* Sets $SYSTEMD_UNIT_PATH in our own environment to 'p', replacing any previous value. This is mostly for debug purposes */
+        return RET_NERRNO(setenv("SYSTEMD_UNIT_PATH", p, 1)); /* presumably 0 on success, -errno otherwise */
+}
+
+char *unit_dbus_path(Unit *u) {
+        /* Returns what unit_dbus_path_from_name() makes of the unit's id. A unit that has no id (yet) has
+         * no path either, in which case NULL is returned. */
+
+        assert(u);
+
+        return u->id ? unit_dbus_path_from_name(u->id) : NULL;
+}
+
+char *unit_dbus_path_invocation_id(Unit *u) {
+        /* Like unit_dbus_path(), but based on the invocation ID string. NULL if no invocation ID is set. */
+        assert(u);
+
+        if (!sd_id128_is_null(u->invocation_id))
+                return unit_dbus_path_from_name(u->invocation_id_string);
+        return NULL;
+}
+
+int unit_set_invocation_id(Unit *u, sd_id128_t id) {
+        int r;
+
+        assert(u);
+
+        /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */
+        /* Nothing to do if the ID does not change. */
+        if (sd_id128_equal(u->invocation_id, id))
+                return 0;
+        /* The old index entry is keyed by a pointer into the unit, so it goes before the field is overwritten. */
+        if (!sd_id128_is_null(u->invocation_id))
+                (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u);
+        /* A null ID means "unset": both fields are cleared below and 0 is returned. */
+        if (sd_id128_is_null(id)) {
+                r = 0;
+                goto reset;
+        }
+
+        r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops);
+        if (r < 0)
+                goto reset;
+        /* Store the ID (binary and formatted) first: the key handed to hashmap_put() points at the stored copy. */
+        u->invocation_id = id;
+        sd_id128_to_string(id, u->invocation_id_string);
+
+        r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u);
+        if (r < 0)
+                goto reset;
+
+        return 0;
+        /* Reached on failure, and also on success when unsetting (r is 0 then). */
+reset:
+        u->invocation_id = SD_ID128_NULL;
+        u->invocation_id_string[0] = 0;
+        return r;
+}
+
+int unit_set_slice(Unit *u, Unit *slice) {
+        Unit *current;
+        int k;
+
+        assert(u);
+        assert(slice);
+
+        /* Puts 'u' into 'slice'. Returns 1 if the slice was changed, 0 if 'u' was in that slice already,
+         * and a negative errno if the assignment is refused. The checks are ordered: the first one
+         * that fails determines the error code. */
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u)) /* only units with a cgroup context can be placed in a slice */
+                return -EOPNOTSUPP;
+
+        if (u->type == UNIT_SLICE) /* the parent slice of a slice is derived from its name */
+                return -EINVAL;
+
+        if (unit_active_state(u) != UNIT_INACTIVE)
+                return -EBUSY;
+
+        if (slice->type != UNIT_SLICE) /* the unit we set must actually be a slice */
+                return -EINVAL;
+
+        if (unit_has_name(u, SPECIAL_INIT_SCOPE) &&
+            !unit_has_name(slice, SPECIAL_ROOT_SLICE))
+                return -EPERM;
+
+        current = UNIT_GET_SLICE(u);
+        if (current == slice)
+                return 0;
+
+        if (current) {
+                /* Disallow slice changes if @u is already bound to cgroups */
+                if (u->cgroup_realized)
+                        return -EBUSY;
+
+                /* Remove any slices assigned prior; we should only have one UNIT_IN_SLICE dependency */
+                unit_remove_dependencies(u, UNIT_DEPENDENCY_SLICE_PROPERTY);
+        }
+
+        k = unit_add_dependency(u, UNIT_IN_SLICE, slice, true, UNIT_DEPENDENCY_SLICE_PROPERTY);
+        return k < 0 ? k : 1;
+}
+
+int unit_set_default_slice(Unit *u) {
+        const char *slice_name;
+        Unit *slice;
+        int k;
+
+        /* Places 'u' in the slice it belongs to by default, unless it is in a slice already. Returns 0
+         * if there is nothing to do, otherwise the result of unit_set_slice() or of the failing step. */
+
+        assert(u);
+
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        if (UNIT_GET_SLICE(u))
+                return 0;
+
+        if (u->instance) {
+                _cleanup_free_ char *template_prefix = NULL, *escaped_prefix = NULL;
+
+                /* Instantiated units are implicitly placed in a slice of their own per template. */
+                k = unit_name_to_prefix(u->id, &template_prefix);
+                if (k < 0)
+                        return k;
+
+                /* Although the prefix is escaped already it may still contain "-", which carries a special
+                 * meaning for slice units, so it is escaped once more here. */
+                escaped_prefix = unit_name_escape(template_prefix);
+                if (!escaped_prefix)
+                        return -ENOMEM;
+
+                /* NOTE(review): strjoina() presumably allocates on the stack, which is what allows
+                 * slice_name to be used after the two heap strings of this scope are gone; confirm. */
+                if (MANAGER_IS_SYSTEM(u->manager))
+                        slice_name = strjoina("system-", escaped_prefix, ".slice");
+                else
+                        slice_name = strjoina("app-", escaped_prefix, ".slice");
+
+        } else if (unit_is_extrinsic(u))
+                /* Extrinsic units (e.g. perpetual units and swap and mount units in user mode) do not really
+                 * belong in one of the subslices, hence they are kept in the root slice. */
+                slice_name = SPECIAL_ROOT_SLICE;
+
+        else
+                slice_name = MANAGER_IS_SYSTEM(u->manager) ? SPECIAL_SYSTEM_SLICE : SPECIAL_APP_SLICE;
+
+        k = manager_load_unit(u->manager, slice_name, NULL, NULL, &slice);
+        if (k < 0)
+                return k;
+
+        return unit_set_slice(u, slice);
+}
+
+const char *unit_slice_name(Unit *u) {
+        Unit *parent;
+
+        /* Returns the id of the slice 'u' is in, or NULL if it is in none. The string is not copied. */
+
+        assert(u);
+
+        parent = UNIT_GET_SLICE(u);
+        return parent ? parent->id : NULL;
+}
+
+int unit_load_related_unit(Unit *u, const char *type, Unit **_found) {
+        _cleanup_free_ char *sibling = NULL;
+        int r;
+
+        /* Loads the unit whose name is that of 'u' with the suffix replaced by 'type', into *_found. */
+        assert(u);
+        assert(type);
+        assert(_found);
+
+        r = unit_name_change_suffix(u->id, type, &sibling);
+        if (r < 0)
+                return r;
+        if (unit_has_name(u, sibling)) /* the "related" unit would be 'u' itself */
+                return -EINVAL;
+        r = manager_load_unit(u->manager, sibling, NULL, NULL, _found);
+        assert(r < 0 || *_found != u);
+        return r;
+}
+
+static int signal_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        const char *owner;
+        int k;
+
+        /* Match callback installed by unit_install_bus_match(): passes the third string of the signal (the
+         * new owner, with an empty string turned into NULL) to the unit type. Always returns 0. */
+
+        assert(message);
+
+        k = sd_bus_message_read(message, "sss", NULL, NULL, &owner);
+        if (k < 0)
+                (void) bus_log_parse_error(k); /* a signal we cannot parse is logged, nothing more */
+        else if (UNIT_VTABLE(u)->bus_name_owner_change)
+                UNIT_VTABLE(u)->bus_name_owner_change(u, empty_to_null(owner));
+
+        return 0;
+}
+
+static int get_name_owner_handler(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        const char *new_owner = NULL; /* stays NULL whenever the reply is an error */
+        const sd_bus_error *reply_error;
+        int k;
+
+        /* Reply handler for the GetNameOwner() call issued by unit_install_bus_match(): reports the
+         * current owner of the watched name (or NULL) to the unit type's bus_name_owner_change hook. */
+
+        assert(message);
+
+        /* The call has completed, so the slot that tracked it is dropped. */
+        u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+
+        reply_error = sd_bus_message_get_error(message);
+        if (!reply_error) {
+                k = sd_bus_message_read(message, "s", &new_owner);
+                if (k < 0)
+                        return bus_log_parse_error(k);
+
+                assert(!isempty(new_owner));
+
+        } else if (!sd_bus_error_has_name(reply_error, SD_BUS_ERROR_NAME_HAS_NO_OWNER)) {
+                /* "Name has no owner" is an expected answer, any other error is worth a log message. */
+                k = sd_bus_error_get_errno(reply_error);
+                log_unit_error_errno(u, k, "Unexpected error response from GetNameOwner(): %s", bus_error_message(reply_error, k));
+        }
+
+        if (UNIT_VTABLE(u)->bus_name_owner_change)
+                UNIT_VTABLE(u)->bus_name_owner_change(u, new_owner);
+
+        return 0;
+}
+
+int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        const char *match;
+        usec_t timeout_usec = 0;
+        int r;
+
+        /* Starts watching bus name 'name' for 'u': installs a NameOwnerChanged match and asynchronously queries
+         * the current owner. Returns 0, -EBUSY if a watch exists already, or an error (leaving no slot behind). */
+
+        assert(u);
+        assert(bus);
+        assert(name);
+
+        if (u->match_bus_slot || u->get_name_owner_slot)
+                return -EBUSY;
+
+        /* NameOwnerChanged and GetNameOwner is used to detect when a service finished starting up. The dbus
+         * call timeout shouldn't be earlier than that. If we couldn't get the start timeout, use the default
+         * value defined above. */
+        if (UNIT_VTABLE(u)->get_timeout_start_usec)
+                timeout_usec = UNIT_VTABLE(u)->get_timeout_start_usec(u);
+
+        match = strjoina("type='signal',"
+                         "sender='org.freedesktop.DBus',"
+                         "path='/org/freedesktop/DBus',"
+                         "interface='org.freedesktop.DBus',"
+                         "member='NameOwnerChanged',"
+                         "arg0='", name, "'");
+
+        r = bus_add_match_full(
+                        bus,
+                        &u->match_bus_slot,
+                        true,
+                        match,
+                        signal_name_owner_changed,
+                        NULL,
+                        u,
+                        timeout_usec);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_call(
+                        bus,
+                        &m,
+                        "org.freedesktop.DBus",
+                        "/org/freedesktop/DBus",
+                        "org.freedesktop.DBus",
+                        "GetNameOwner");
+        if (r < 0)
+                goto fail;
+
+        r = sd_bus_message_append(m, "s", name);
+        if (r < 0)
+                goto fail;
+
+        r = sd_bus_call_async(bus, &u->get_name_owner_slot, m, get_name_owner_handler, u, timeout_usec);
+        if (r < 0)
+                goto fail;
+
+        log_unit_debug(u, "Watching D-Bus name '%s'.", name);
+        return 0;
+
+fail:
+        /* Every failure after the match was installed has to release its slot again: a slot left behind
+         * would make each later attempt bail out with -EBUSY right at the top. */
+        u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+        return r;
+}
+
+int unit_watch_bus_name(Unit *u, const char *name) {
+        sd_bus *api_bus;
+        int k;
+
+        /* Registers 'u' as the watcher of bus name 'name'; only one unit may watch each name for now.
+         * Returns 0 on success, and a negative errno (which is logged as a warning too) otherwise. */
+        assert(u);
+        assert(name);
+
+        api_bus = u->manager->api_bus;
+        if (api_bus) {
+                /* The bus is available already, hence the match is installed right away. If it is not,
+                 * recording the name below suffices, bus_setup_api() will take care later. */
+                k = unit_install_bus_match(u, api_bus, name);
+                if (k < 0)
+                        return log_warning_errno(k, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name);
+        }
+
+        k = hashmap_put(u->manager->watch_bus, name, u);
+        if (k >= 0)
+                return 0;
+        /* The name could not be recorded, hence both bus slots are released again. */
+        u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+        u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+        return log_warning_errno(k, "Failed to put bus name to hashmap: %m");
+}
+
+void unit_unwatch_bus_name(Unit *u, const char *name) {
+        assert(u);
+        assert(name);
+        /* Undoes unit_watch_bus_name(): drops the name from the manager's watch_bus map and releases both bus slots. */
+        (void) hashmap_remove_value(u->manager->watch_bus, name, u);
+        u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+        u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+}
+
+int unit_add_node_dependency(Unit *u, const char *what, UnitDependency dep, UnitDependencyMask mask) {
+        _cleanup_free_ char *device_name = NULL;
+        UnitDependency effective;
+        Unit *device;
+        int k;
+
+        /* Adds in links to the device node that this unit is based on: UNIT_AFTER plus one more dependency
+         * on the matching .device unit. Does nothing (returning 0) if 'what' is empty or no device path. */
+
+        assert(u);
+
+        if (isempty(what) || !is_device_path(what))
+                return 0;
+
+        /* Where device units are not supported (such as in a container) no dependencies on them are created. */
+        if (!unit_type_supported(UNIT_DEVICE))
+                return 0;
+
+        k = unit_name_from_path(what, ".device", &device_name);
+        if (k < 0)
+                return k;
+
+        k = manager_load_unit(u->manager, device_name, NULL, NULL, &device);
+        if (k < 0)
+                return k;
+
+        /* UNIT_REQUIRES turns into UNIT_BINDS_TO if device_shall_be_bound_by() says so. Outside of the
+         * system manager the second dependency is UNIT_WANTS, whatever was requested. */
+        effective = dep == UNIT_REQUIRES && device_shall_be_bound_by(device, u) ? UNIT_BINDS_TO : dep;
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                effective = UNIT_WANTS;
+        return unit_add_two_dependencies(u, UNIT_AFTER, effective, device, true, mask);
+}
+
+int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask) {
+        _cleanup_free_ char *instance = NULL, *target_name = NULL;
+        int k;
+
+        /* Orders 'u' after the "blockdev@<escaped path>.target" unit of the device node 'what'. Does nothing
+         * (returning 0) if 'what' is empty or not located below /dev/. */
+
+        assert(u);
+
+        if (isempty(what) || !path_startswith(what, "/dev/"))
+                return 0;
+
+        /* Without support for device units there is no point in bothering with blockdev@.target either. */
+        if (!unit_type_supported(UNIT_DEVICE))
+                return 0;
+
+        k = unit_name_path_escape(what, &instance);
+        if (k < 0)
+                return k;
+
+        k = unit_name_build("blockdev", instance, ".target", &target_name);
+        if (k < 0)
+                return k;
+
+        return unit_add_dependency_by_name(u, UNIT_AFTER, target_name, true, mask);
+}
+
+int unit_coldplug(Unit *u) {
+        int result = 0;
+
+        assert(u);
+
+        /* Coldplugging may recurse, hence the flag is set up front so that no unit is processed twice. */
+        if (u->coldplugged)
+                return 0;
+        u->coldplugged = true;
+
+        /* Every step below is attempted regardless of earlier failures, RET_GATHER() collects the result. */
+        STRV_FOREACH(ref, u->deserialized_refs)
+                RET_GATHER(result, bus_unit_track_add_name(u, *ref));
+        u->deserialized_refs = strv_free(u->deserialized_refs);
+
+        if (UNIT_VTABLE(u)->coldplug)
+                RET_GATHER(result, UNIT_VTABLE(u)->coldplug(u));
+
+        if (u->job)
+                RET_GATHER(result, job_coldplug(u->job));
+
+        if (u->nop_job)
+                RET_GATHER(result, job_coldplug(u->nop_job));
+
+        unit_modify_nft_set(u, /* add = */ true);
+        return result;
+}
+
+void unit_catchup(Unit *u) {
+        assert(u);
+        /* Runs the unit type's catchup hook, if the type implements one. */
+        if (UNIT_VTABLE(u)->catchup)
+                UNIT_VTABLE(u)->catchup(u);
+        /* The cgroup catch-up follows unconditionally, whatever the unit type. */
+        unit_cgroup_catchup(u);
+}
+
+static bool fragment_mtime_newer(const char *path, usec_t mtime, bool path_masked) {
+        struct stat st;
+
+        /* Tells whether the file at 'path' is out-of-date compared to what was loaded with modification
+         * time 'mtime'. A NULL path, i.e. no file at all, never is. */
+        if (!path)
+                return false;
+
+        /* If the source is some virtual kernel file system, then we assume we watch it anyway, and hence pretend we
+         * are never out-of-date. */
+        if (PATH_STARTSWITH_SET(path, "/proc", "/sys"))
+                return false;
+
+        if (stat(path, &st) < 0)
+                /* What, cannot access this anymore? */
+                return true;
+
+        if (path_masked)
+                /* For masked files check if they are still so */
+                return !null_or_empty(&st);
+
+        /* For non-empty files check the mtime */
+        return timespec_load(&st.st_mtim) > mtime;
+}
+
+bool unit_need_daemon_reload(Unit *u) {
+        _cleanup_strv_free_ char **current_dropins = NULL;
+        bool masked;
+
+        /* Returns true if what is on disk no longer matches what was loaded for 'u', i.e. a reload is due. */
+        assert(u);
+        assert(u->manager);
+
+        if (u->manager->unit_file_state_outdated)
+                return true;
+
+        /* For unit files, we allow masking, while source paths should not be masked. */
+        masked = u->load_state == UNIT_MASKED;
+        if (fragment_mtime_newer(u->fragment_path, u->fragment_mtime, masked) ||
+            fragment_mtime_newer(u->source_path, u->source_mtime, false))
+                return true;
+
+        /* Next the list of drop-ins itself, which is only looked up anew for loaded units. */
+        if (u->load_state == UNIT_LOADED)
+                (void) unit_find_dropin_paths(u, &current_dropins);
+        if (!strv_equal(u->dropin_paths, current_dropins))
+                return true;
+
+        /* Finally each drop-in's timestamp. Drop-ins that are masked are simply omitted from the list. */
+        STRV_FOREACH(dropin, u->dropin_paths)
+                if (fragment_mtime_newer(*dropin, u->dropin_mtime, false))
+                        return true;
+
+        return false;
+}
+
+void unit_reset_failed(Unit *u) {
+        assert(u);
+        /* Lets the unit type do its part, if it implements the reset_failed hook. */
+        if (UNIT_VTABLE(u)->reset_failed)
+                UNIT_VTABLE(u)->reset_failed(u);
+        /* Independently of the type, the start rate limit is reset and the limit-hit flag is cleared. */
+        ratelimit_reset(&u->start_ratelimit);
+        u->start_limit_hit = false;
+}
+
+Unit *unit_following(Unit *u) {
+        /* Asks the unit type which unit 'u' follows. Types that do not implement the 'following' hook
+         * never follow anything, hence NULL is returned for them. */
+
+        assert(u);
+
+        return UNIT_VTABLE(u)->following ? UNIT_VTABLE(u)->following(u) : NULL;
+}
+
+bool unit_stop_pending(Unit *u) {
+        assert(u);
+
+        /* This call does not check the current state of the unit, it only asks unit_has_job_type()
+         * for a JOB_STOP job. It's hence useful to be called from state change calls of the
+         * unit itself, where the state isn't updated yet. This is
+         * different from unit_inactive_or_pending() which checks both
+         * the current state and for a queued job. */
+
+        return unit_has_job_type(u, JOB_STOP);
+}
+
+bool unit_inactive_or_pending(Unit *u) {
+        UnitActiveState state;
+
+        assert(u);
+
+        /* Returns true if the unit is inactive or going down, be it by its state or due to a pending stop. */
+
+        state = unit_active_state(u);
+        if (UNIT_IS_INACTIVE_OR_DEACTIVATING(state))
+                return true;
+
+        return unit_stop_pending(u);
+}
+
+bool unit_active_or_pending(Unit *u) {
+        assert(u);
+
+        /* Returns true if the unit is active or going up, be it by its state or due to its installed job. */
+
+        if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u)))
+                return true;
+
+        if (!u->job)
+                return false;
+
+        /* Only these three job types count as "going up". */
+        return IN_SET(u->job->type, JOB_START, JOB_RELOAD_OR_START, JOB_RESTART);
+}
+
+bool unit_will_restart_default(Unit *u) {
+        assert(u);
+        /* Judging by the name a default for the will_restart hook: true if unit_has_job_type() reports a JOB_START job. */
+        return unit_has_job_type(u, JOB_START);
+}
+
+bool unit_will_restart(Unit *u) {
+        /* Lets the unit type decide whether 'u' is going to be restarted. Types that do not implement
+         * the will_restart hook are reported as not restarting. */
+
+        assert(u);
+
+        return UNIT_VTABLE(u)->will_restart && UNIT_VTABLE(u)->will_restart(u);
+}
+
+void unit_notify_cgroup_oom(Unit *u, bool managed_oom) {
+        assert(u);
+        /* Forwards the OOM notification to the unit type; for types without a notify_cgroup_oom hook nothing happens. */
+        if (UNIT_VTABLE(u)->notify_cgroup_oom)
+                UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom);
+}
+
+static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) {
+        _cleanup_set_free_ Set *pids = NULL;
+        const pid_t candidates[] = { main_pid, control_pid };
+
+        /* Returns a newly allocated set holding those of the two PIDs that are set (i.e. > 0), or NULL if
+         * allocating or filling the set fails. unit_kill() uses it to exclude the main/control pids from
+         * being killed via the cgroup. */
+
+        pids = set_new(NULL);
+        if (!pids)
+                return NULL;
+
+        for (size_t i = 0; i < ELEMENTSOF(candidates); i++) {
+                if (candidates[i] <= 0) /* this process is not set */
+                        continue;
+
+                if (set_put(pids, PID_TO_PTR(candidates[i])) < 0)
+                        return NULL;
+        }
+
+        /* TAKE_PTR() passes the set on to the caller rather than leaving it to _cleanup_set_free_. */
+        return TAKE_PTR(pids);
+}
+
+static int kill_common_log(const PidRef *pid, int signo, void *userdata) {
+        Unit *u = ASSERT_PTR(userdata);
+        _cleanup_free_ char *process_name = NULL;
+
+        /* Callback unit_kill() hands to cg_kill_recursive(): logs the signal delivery for the unit. Returns 1. */
+        (void) pidref_get_comm(pid, &process_name); /* best effort, strna() covers a missing name */
+
+        log_unit_info(u, "Sending signal SIG%s to process " PID_FMT " (%s) on client request.",
+                      signal_to_string(signo), pid->pid, strna(process_name));
+        return 1;
+}
+
+static int kill_or_sigqueue(PidRef* pidref, int signo, int code, int value) {
+        /* Sends 'signo' to the process, either plainly (SI_USER) or with 'value' attached (SI_QUEUE). */
+        assert(pidref_is_set(pidref));
+        assert(SIGNAL_VALID(signo));
+
+        if (code == SI_USER) {
+                log_debug("Killing " PID_FMT " with signal SIG%s.", pidref->pid, signal_to_string(signo));
+                return pidref_kill(pidref, signo);
+        }
+
+        if (code == SI_QUEUE) {
+                log_debug("Enqueuing value %i to " PID_FMT " on signal SIG%s.", value, pidref->pid, signal_to_string(signo));
+                return pidref_sigqueue(pidref, signo, value);
+        }
+
+        /* No other code is supported, passing one is a programming error. */
+        assert_not_reached();
+}
+
+int unit_kill(
+                Unit *u,
+                KillWho who,
+                int signo,
+                int code,
+                int value,
+                sd_bus_error *error) {
+
+        PidRef *main_pid, *control_pid;
+        bool killed = false;
+        int ret = 0, r;
+
+        /* This is the common implementation for explicit user-requested killing of unit processes, shared by
+         * various unit types. Do not confuse with unit_kill_context(), which is what we use when we want to
+         * stop a service ourselves. */
+        /* Returns 0 if nothing failed. Failures are reported via the return value and, except for the OOM case below, via 'error'. */
+        assert(u);
+        assert(who >= 0);
+        assert(who < _KILL_WHO_MAX);
+        assert(SIGNAL_VALID(signo));
+        assert(IN_SET(code, SI_USER, SI_QUEUE));
+
+        main_pid = unit_main_pid(u);
+        control_pid = unit_control_pid(u);
+        /* Judging by the errors below: NULL means the unit type has no such process, an unset PidRef means none exists right now. */
+        if (!UNIT_HAS_CGROUP_CONTEXT(u) && !main_pid && !control_pid)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing.");
+
+        if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) {
+                if (!main_pid)
+                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type));
+                if (!pidref_is_set(main_pid))
+                        return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill");
+        }
+
+        if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) {
+                if (!control_pid)
+                        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type));
+                if (!pidref_is_set(control_pid))
+                        return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill");
+        }
+        /* The control process comes first. Its failure is always recorded in ret and 'error', unlike failures further down. */
+        if (pidref_is_set(control_pid) &&
+            IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
+                _cleanup_free_ char *comm = NULL;
+                (void) pidref_get_comm(control_pid, &comm);
+
+                r = kill_or_sigqueue(control_pid, signo, code, value);
+                if (r < 0) {
+                        ret = r;
+
+                        /* Report this failure both to the logs and to the client */
+                        sd_bus_error_set_errnof(
+                                        error, r,
+                                        "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m",
+                                        signal_to_string(signo), control_pid->pid, strna(comm));
+                        log_unit_warning_errno(
+                                        u, r,
+                                        "Failed to send signal SIG%s to control process " PID_FMT " (%s) on client request: %m",
+                                        signal_to_string(signo), control_pid->pid, strna(comm));
+                } else {
+                        log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.",
+                                      signal_to_string(signo), control_pid->pid, strna(comm));
+                        killed = true;
+                }
+        }
+        /* Next the main process. Its failure is always logged, but reaches the client only if no earlier failure was recorded. */
+        if (pidref_is_set(main_pid) &&
+            IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
+                _cleanup_free_ char *comm = NULL;
+                (void) pidref_get_comm(main_pid, &comm);
+
+                r = kill_or_sigqueue(main_pid, signo, code, value);
+                if (r < 0) {
+                        if (ret == 0) {
+                                ret = r;
+
+                                sd_bus_error_set_errnof(
+                                                error, r,
+                                                "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m",
+                                                signal_to_string(signo), main_pid->pid, strna(comm));
+                        }
+
+                        log_unit_warning_errno(
+                                        u, r,
+                                        "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m",
+                                        signal_to_string(signo), main_pid->pid, strna(comm));
+
+                } else {
+                        log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.",
+                                      signal_to_string(signo), main_pid->pid, strna(comm));
+                        killed = true;
+                }
+        }
+
+        /* Note: if we shall enqueue rather than kill we won't do this via the cgroup mechanism, since it
+         * doesn't really make much sense (and given that enqueued values are a relatively expensive
+         * resource, and we shouldn't allow us to be subjects for such allocation sprees) */
+        if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path && code == SI_USER) {
+                _cleanup_set_free_ Set *pid_set = NULL;
+
+                /* Exclude the main/control pids from being killed via the cgroup */
+                pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
+                if (!pid_set)
+                        return log_oom();
+                /* -ESRCH and -ENOENT from the cgroup kill are ignored entirely, any other failure is treated like the one above. */
+                r = cg_kill_recursive(u->cgroup_path, signo, 0, pid_set, kill_common_log, u);
+                if (r < 0) {
+                        if (!IN_SET(r, -ESRCH, -ENOENT)) {
+                                if (ret == 0) {
+                                        ret = r;
+
+                                        sd_bus_error_set_errnof(
+                                                        error, r,
+                                                        "Failed to send signal SIG%s to auxiliary processes: %m",
+                                                        signal_to_string(signo));
+                                }
+
+                                log_unit_warning_errno(
+                                                u, r,
+                                                "Failed to send signal SIG%s to auxiliary processes on client request: %m",
+                                                signal_to_string(signo));
+                        }
+                } else
+                        killed = true;
+        }
+
+        /* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */
+        if (ret == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL))
+                return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill");
+
+        return ret;
+}
+
+int unit_following_set(Unit *u, Set **s) {
+        assert(u);
+        assert(s);
+        /* Delegates to the unit type's following_set hook, passing its return value through. */
+        if (UNIT_VTABLE(u)->following_set)
+                return UNIT_VTABLE(u)->following_set(u, s);
+        /* Without such a hook there is no set to report: *s is set to NULL and 0 is returned. */
+        *s = NULL;
+        return 0;
+}
+
+UnitFileState unit_get_unit_file_state(Unit *u) {
+        /* Returns the unit file state, looking it up on first use and storing the result in the unit
+         * (UNIT_FILE_BAD if the lookup fails). Units without a fragment path are not looked up at all. */
+
+        assert(u);
+
+        /* Nothing to look up: either a value is known already, or there is no unit file. */
+        if (u->unit_file_state >= 0 || !u->fragment_path)
+                return u->unit_file_state;
+
+        if (unit_file_get_state(
+                            u->manager->runtime_scope, NULL, u->id,
+                            &u->unit_file_state) < 0)
+                u->unit_file_state = UNIT_FILE_BAD;
+
+        return u->unit_file_state;
+}
+
+PresetAction unit_get_unit_file_preset(Unit *u) {
+        _cleanup_free_ char *filename = NULL;
+        int k;
+
+        /* Returns the preset verdict for the unit file, as reported by unit_file_query_preset() for its file
+         * name. Non-negative results are cached in the unit; negative ones (errors) are stored as well, but
+         * the lookup is repeated on the next call. */
+
+        assert(u);
+
+        if (u->unit_file_preset >= 0 || !u->fragment_path)
+                return u->unit_file_preset;
+
+        k = path_extract_filename(u->fragment_path, &filename);
+        if (k == O_DIRECTORY) /* presumably: the path can only refer to a directory */
+                k = -EISDIR;
+        if (k < 0)
+                return (u->unit_file_preset = k);
+
+        u->unit_file_preset = unit_file_query_preset(
+                        u->manager->runtime_scope, NULL, filename, NULL);
+
+        return u->unit_file_preset;
+}
+
+Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target) {
+        assert(ref);
+        assert(source);
+        assert(target);
+        /* Makes 'ref' a reference from 'source' to 'target', and returns 'target'. A reference still in use is unset first. */
+        if (ref->target)
+                unit_ref_unset(ref);
+        /* Besides filling in both ends, the reference is linked into the target's refs_by_target list. */
+        ref->source = source;
+        ref->target = target;
+        LIST_PREPEND(refs_by_target, target->refs_by_target, ref);
+        return target;
+}
+
+void unit_ref_unset(UnitRef *ref) {
+        Unit *referenced;
+
+        assert(ref);
+        referenced = ref->target;
+        if (!referenced) /* the reference is not in use, nothing to undo */
+                return;
+
+        /* Dropping the reference may leave the unit unreferenced, hence garbage collection gets to look at it. */
+        unit_add_to_gc_queue(referenced);
+        LIST_REMOVE(refs_by_target, referenced->refs_by_target, ref);
+        ref->source = ref->target = NULL;
+}
+
+static int user_from_unit_name(Unit *u, char **ret) {
+        /* The key is fixed, so that a given unit name hashes to the same user name every time. */
+        static const uint8_t hash_key[] = {
+                0x58, 0x1a, 0xaf, 0xe6, 0x28, 0x58, 0x4e, 0x96,
+                0xb4, 0x4e, 0xf5, 0x3b, 0x8c, 0x92, 0x07, 0xec
+        };
+
+        _cleanup_free_ char *prefix = NULL;
+        uint64_t hash;
+        int k;
+
+        /* Derives a user name from the name prefix of 'u': the prefix itself if it is usable as user name,
+         * and otherwise "_du" followed by the 16 hex digits of its hash. The result is allocated. */
+        k = unit_name_to_prefix(u->id, &prefix);
+        if (k < 0)
+                return k;
+
+        if (valid_user_group_name(prefix, 0)) {
+                *ret = TAKE_PTR(prefix);
+                return 0;
+        }
+
+        hash = siphash24(prefix, strlen(prefix), hash_key);
+        return asprintf(ret, "_du%016" PRIx64, hash) < 0 ? -ENOMEM : 0;
+}
+
+int unit_patch_contexts(Unit *u) {
+        CGroupContext *cc;
+        ExecContext *ec;
+        int r;
+
+        assert(u);
+
+        /* Patch in the manager defaults into the exec and cgroup
+         * contexts, _after_ the rest of the settings have been
+         * initialized */
+
+        ec = unit_get_exec_context(u);
+        if (ec) {
+                /* This only copies in the ones that need memory */
+                for (unsigned i = 0; i < _RLIMIT_MAX; i++)
+                        if (u->manager->defaults.rlimit[i] && !ec->rlimit[i]) {
+                                ec->rlimit[i] = newdup(struct rlimit, u->manager->defaults.rlimit[i], 1);
+                                if (!ec->rlimit[i])
+                                        return -ENOMEM;
+                        }
+
+                if (MANAGER_IS_USER(u->manager) &&
+                    !ec->working_directory) {
+
+                        r = get_home_dir(&ec->working_directory);
+                        if (r < 0)
+                                return r;
+
+                        /* Allow user services to run, even if the
+                         * home directory is missing */
+                        ec->working_directory_missing_ok = true;
+                }
+
+                if (ec->private_devices)
+                        ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO));
+
+                if (ec->protect_kernel_modules)
+                        ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);
+
+                if (ec->protect_kernel_logs)
+                        ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYSLOG);
+
+                if (ec->protect_clock)
+                        ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_SYS_TIME) | (UINT64_C(1) << CAP_WAKE_ALARM));
+
+                if (ec->dynamic_user) {
+                        if (!ec->user) {
+                                r = user_from_unit_name(u, &ec->user);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        if (!ec->group) {
+                                ec->group = strdup(ec->user);
+                                if (!ec->group)
+                                        return -ENOMEM;
+                        }
+
+                        /* If the dynamic user option is on, let's make sure that the unit can't leave its
+                         * UID/GID around in the file system or on IPC objects. Hence enforce a strict
+                         * sandbox. */
+
+                        ec->private_tmp = true;
+                        ec->remove_ipc = true;
+                        ec->protect_system = PROTECT_SYSTEM_STRICT;
+                        if (ec->protect_home == PROTECT_HOME_NO)
+                                ec->protect_home = PROTECT_HOME_READ_ONLY;
+
+                        /* Make sure this service can neither benefit from SUID/SGID binaries nor create
+                         * them. */
+                        ec->no_new_privileges = true;
+                        ec->restrict_suid_sgid = true;
+                }
+
+                for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+                        exec_directory_sort(ec->directories + dt);
+        }
+
+        cc = unit_get_cgroup_context(u);
+        if (cc && ec) {
+
+                if (ec->private_devices &&
+                    cc->device_policy == CGROUP_DEVICE_POLICY_AUTO)
+                        cc->device_policy = CGROUP_DEVICE_POLICY_CLOSED;
+
+                /* Only add these if needed, as they imply that everything else is blocked. */
+                if (cc->device_policy != CGROUP_DEVICE_POLICY_AUTO || cc->device_allow) {
+                        if (ec->root_image || ec->mount_images) {
+
+                                /* When RootImage= or MountImages= is specified, the following devices are touched. */
+                                FOREACH_STRING(p, "/dev/loop-control", "/dev/mapper/control") {
+                                        r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
+                                        if (r < 0)
+                                                return r;
+                                }
+                                FOREACH_STRING(p, "block-loop", "block-blkext", "block-device-mapper") {
+                                        r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD);
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                /* Make sure "block-loop" can be resolved, i.e. make sure "loop" shows up in /proc/devices.
+                                * Same for mapper and verity. */
+                                FOREACH_STRING(p, "modprobe@loop.service", "modprobe@dm_mod.service", "modprobe@dm_verity.service") {
+                                        r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, p, true, UNIT_DEPENDENCY_FILE);
+                                        if (r < 0)
+                                                return r;
+                                }
+                        }
+
+                        if (ec->protect_clock) {
+                                r = cgroup_context_add_device_allow(cc, "char-rtc", CGROUP_DEVICE_READ);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        /* If there are encrypted credentials we might need to access the TPM. */
+                        if (exec_context_has_encrypted_credentials(ec)) {
+                                r = cgroup_context_add_device_allow(cc, "char-tpm", CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        }
+
+        return 0;
+}
+
+/* Returns the ExecContext of a unit, located at the per-type byte offset recorded in the unit's vtable.
+ * Yields NULL if the unit has no valid type or the vtable records no offset for it. */
+ExecContext *unit_get_exec_context(const Unit *u) {
+        assert(u);
+
+        if (u->type < 0)
+                return NULL;
+
+        size_t off = UNIT_VTABLE(u)->exec_context_offset;
+
+        /* A zero offset means there is nothing to hand out for this unit type. */
+        return off > 0 ? (ExecContext*) ((uint8_t*) u + off) : NULL;
+}
+
+/* Looks up the KillContext of a unit via the per-type byte offset stored in its vtable. Returns NULL if
+ * the unit has no valid type or the vtable lists no kill context offset. */
+KillContext *unit_get_kill_context(Unit *u) {
+        assert(u);
+
+        if (u->type < 0)
+                return NULL;
+
+        size_t off = UNIT_VTABLE(u)->kill_context_offset;
+        if (off == 0) /* zero offset: nothing there for this unit type */
+                return NULL;
+
+        return (KillContext*) ((uint8_t*) u + off);
+}
+
+/* Returns the CGroupContext of a unit, located at the per-type byte offset recorded in the unit's
+ * vtable. Returns NULL if the unit has no valid type or the vtable records no cgroup context offset. */
+CGroupContext *unit_get_cgroup_context(Unit *u) {
+        size_t offset;
+
+        /* Catch NULL units early, the same way unit_get_exec_context() and unit_get_kill_context() do,
+         * instead of crashing on the dereference below. */
+        assert(u);
+
+        if (u->type < 0)
+                return NULL;
+
+        offset = UNIT_VTABLE(u)->cgroup_context_offset;
+        if (offset <= 0) /* zero offset: this unit type has no cgroup context */
+                return NULL;
+
+        return (CGroupContext*) ((uint8_t*) u + offset);
+}
+
+/* Returns the ExecRuntime pointer stored in the unit at the per-type byte offset recorded in the unit's
+ * vtable. Unlike the context getters this reads a pointer slot, hence the result may also be NULL if no
+ * runtime has been stored there. Returns NULL if the unit has no valid type or no offset is recorded. */
+ExecRuntime *unit_get_exec_runtime(Unit *u) {
+        size_t offset;
+
+        /* Catch NULL units early, the same way unit_get_exec_context() and unit_get_kill_context() do,
+         * instead of crashing on the dereference below. */
+        assert(u);
+
+        if (u->type < 0)
+                return NULL;
+
+        offset = UNIT_VTABLE(u)->exec_runtime_offset;
+        if (offset <= 0) /* zero offset: this unit type has no exec runtime slot */
+                return NULL;
+
+        return *(ExecRuntime**) ((uint8_t*) u + offset);
+}
+
+/* Picks the directory that drop-ins for this unit shall be written to, based on the write flags.
+ * Returns NULL if the flags request a no-op or select neither the persistent nor the runtime location. */
+static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) {
+        Manager *m;
+
+        assert(u);
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return NULL;
+
+        m = u->manager;
+
+        /* Transient units always get their drop-ins in the transient directory, whatever the flags say. */
+        if (u->transient)
+                return m->lookup_paths.transient;
+
+        /* The persistent location takes precedence over the runtime one if both flags are set. */
+        return (flags & UNIT_PERSISTENT) ? m->lookup_paths.persistent_control :
+               (flags & UNIT_RUNTIME)    ? m->lookup_paths.runtime_control :
+                                           NULL;
+}
+
+const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf) {
+        _cleanup_free_ char *owned = NULL;
+        char *next;
+
+        assert(s);
+        assert(popcount(flags & (UNIT_ESCAPE_EXEC_SYNTAX_ENV | UNIT_ESCAPE_EXEC_SYNTAX | UNIT_ESCAPE_C)) <= 1);
+        assert(buf);
+
+        /* Applies the escaping selected by 'flags' to 's' and returns the result. When nothing had to be
+         * escaped the input pointer itself is returned and *buf is set to NULL. When a new string had to
+         * be allocated, it is returned and additionally stored in *buf, so that the caller can free it.
+         * Hence the return value is always the properly escaped string, while *buf is only non-NULL if
+         * there is something to free. Returns NULL if an allocation fails; *buf is not written then. */
+
+        if (flags & UNIT_ESCAPE_SPECIFIERS) {
+                owned = specifier_escape(s);
+                if (!owned)
+                        return NULL;
+
+                s = owned;
+        }
+
+        /* On top of that, do either shell-style escaping (covering what is parsed for ExecStart= and
+         * friends, i.e. '$' and quotes) or C-style escaping, but never both. */
+
+        if (flags & (UNIT_ESCAPE_EXEC_SYNTAX_ENV | UNIT_ESCAPE_EXEC_SYNTAX)) {
+
+                if (flags & UNIT_ESCAPE_EXEC_SYNTAX_ENV) {
+                        next = strreplace(s, "$", "$$");
+                        if (!next)
+                                return NULL;
+
+                        free_and_replace(owned, next);
+                        s = owned;
+                }
+
+                next = shell_escape(s, "\"");
+                if (!next)
+                        return NULL;
+
+                free_and_replace(owned, next);
+                s = owned;
+
+        } else if (flags & UNIT_ESCAPE_C) {
+
+                next = cescape(s);
+                if (!next)
+                        return NULL;
+
+                free_and_replace(owned, next);
+                s = owned;
+        }
+
+        *buf = TAKE_PTR(owned);
+        return s;
+}
+
+char* unit_concat_strv(char **l, UnitWriteFlags flags) {
+        _cleanup_free_ char *result = NULL;
+        size_t n = 0;
+
+        /* Escapes every string of the list according to 'flags', wraps each in double quotes and joins
+         * them with single spaces. The output is suitable for command lines in ExecStart= stanzas.
+         * Returns a newly allocated string, or NULL on allocation failure. */
+
+        STRV_FOREACH(entry, l) {
+                _cleanup_free_ char *allocated = NULL;
+                const char *escaped;
+                size_t len, sep;
+
+                escaped = unit_escape_setting(*entry, flags, &allocated);
+                if (!escaped)
+                        return NULL;
+
+                len = strlen(escaped);
+                sep = n > 0 ? 1 : 0; /* every entry but the first is preceded by a space */
+
+                /* separating space + " + entry + " + room for a trailing NUL */
+                if (!GREEDY_REALLOC(result, n + sep + 1 + len + 1 + 1))
+                        return NULL;
+
+                if (sep > 0)
+                        result[n++] = ' ';
+
+                result[n++] = '"';
+                memcpy(result + n, escaped, len);
+                n += len;
+                result[n++] = '"';
+        }
+
+        /* Also covers the empty list, where nothing has been allocated so far. */
+        if (!GREEDY_REALLOC(result, n + 1))
+                return NULL;
+
+        result[n] = 0;
+
+        return TAKE_PTR(result);
+}
+
+/* Writes one setting for unit 'u': either appends it to the transient unit file that is currently being
+ * created (if u->transient_file is set), or writes it into a drop-in file derived from 'name' below the
+ * directory selected by 'flags'. 'data' is escaped according to 'flags' and gets a section header
+ * prefixed where needed.
+ *
+ * Returns 0 on success or if 'flags' request a no-op, -ENOMEM if escaping fails, -EINVAL if a private
+ * section is requested but the unit type has none or if no drop-in directory matches 'flags', or the
+ * negative error of one of the helpers called. */
+int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data) {
+        _cleanup_free_ char *p = NULL, *q = NULL, *escaped = NULL;
+        const char *dir, *wrapped;
+        int r;
+
+        assert(u);
+        assert(name);
+        assert(data);
+
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 0;
+
+        /* From here on 'data' refers to the escaped text; 'escaped' only holds a buffer if one had to be
+         * allocated, and exists so that it is freed when we return. */
+        data = unit_escape_setting(data, flags, &escaped);
+        if (!data)
+                return -ENOMEM;
+
+        /* Prefix the section header. If we are writing this out as transient file, then let's suppress this if the
+         * previous section header is the same */
+
+        /* u->last_section_private is compared as a tri-state here: > 0 if the previous entry went to the
+         * private section, 0 if it went to [Unit], < 0 presumably if nothing was written yet. */
+        if (flags & UNIT_PRIVATE) {
+                if (!UNIT_VTABLE(u)->private_section)
+                        return -EINVAL;
+
+                if (!u->transient_file || u->last_section_private < 0)
+                        data = strjoina("[", UNIT_VTABLE(u)->private_section, "]\n", data);
+                else if (u->last_section_private == 0)
+                        data = strjoina("\n[", UNIT_VTABLE(u)->private_section, "]\n", data);
+        } else {
+                if (!u->transient_file || u->last_section_private < 0)
+                        data = strjoina("[Unit]\n", data);
+                else if (u->last_section_private > 0)
+                        data = strjoina("\n[Unit]\n", data);
+        }
+
+        if (u->transient_file) {
+                /* When this is a transient unit file in creation, then let's not create a new drop-in but instead
+                 * write to the transient unit file. */
+                fputs(data, u->transient_file);
+
+                if (!endswith(data, "\n"))
+                        fputc('\n', u->transient_file);
+
+                /* Remember which section we wrote this entry to */
+                u->last_section_private = !!(flags & UNIT_PRIVATE);
+                return 0;
+        }
+
+        dir = unit_drop_in_dir(u, flags);
+        if (!dir)
+                return -EINVAL;
+
+        wrapped = strjoina("# This is a drop-in unit file extension, created via \"systemctl set-property\"\n"
+                           "# or an equivalent operation. Do not edit.\n",
+                           data,
+                           "\n");
+
+        /* Below, p is used as the directory to create and q as the path of the file to write. */
+        r = drop_in_file(dir, u->id, 50, name, &p, &q);
+        if (r < 0)
+                return r;
+
+        /* Best effort: if the directory cannot be created, writing the file below fails and reports it. */
+        (void) mkdir_p_label(p, 0755);
+
+        /* Make sure the drop-in dir is registered in our path cache. This way we don't need to stupidly
+         * recreate the cache after every drop-in we write. */
+        if (u->manager->unit_path_cache) {
+                r = set_put_strdup(&u->manager->unit_path_cache, p);
+                if (r < 0)
+                        return r;
+        }
+
+        r = write_string_file_atomic_label(q, wrapped);
+        if (r < 0)
+                return r;
+
+        r = strv_push(&u->dropin_paths, q);
+        if (r < 0)
+                return r;
+        q = NULL; /* now referenced by u->dropin_paths, must not be freed by the cleanup handler */
+
+        strv_uniq(u->dropin_paths);
+
+        u->dropin_mtime = now(CLOCK_REALTIME);
+
+        return 0;
+}
+
+/* printf()-style front end for unit_write_setting(): formats the setting text from 'format' and the
+ * variable arguments, then writes it out. Returns 0 right away if 'flags' request a no-op, -ENOMEM if
+ * formatting fails, and otherwise whatever unit_write_setting() returns. */
+int unit_write_settingf(Unit *u, UnitWriteFlags flags, const char *name, const char *format, ...) {
+        _cleanup_free_ char *formatted = NULL;
+        va_list args;
+        int n;
+
+        assert(u);
+        assert(name);
+        assert(format);
+
+        /* Don't bother formatting anything if nothing is going to be written. */
+        if (UNIT_WRITE_FLAGS_NOOP(flags))
+                return 0;
+
+        va_start(args, format);
+        n = vasprintf(&formatted, format, args);
+        va_end(args);
+        if (n < 0)
+                return -ENOMEM;
+
+        return unit_write_setting(u, flags, name, formatted);
+}
+
+/* Turns the unit into a transient one: creates (or truncates) its unit file in the transient lookup
+ * directory, keeps that file open in u->transient_file for settings to be written to, and resets the
+ * unit's load state and path bookkeeping. Returns 0 on success, -EOPNOTSUPP if the unit type cannot be
+ * transient, -ENOMEM on allocation failure, or -errno if the file cannot be opened. */
+int unit_make_transient(Unit *u) {
+        _cleanup_free_ char *fragment = NULL;
+        FILE *file;
+
+        assert(u);
+
+        if (!UNIT_VTABLE(u)->can_transient)
+                return -EOPNOTSUPP;
+
+        (void) mkdir_p_label(u->manager->lookup_paths.transient, 0755);
+
+        fragment = path_join(u->manager->lookup_paths.transient, u->id);
+        if (!fragment)
+                return -ENOMEM;
+
+        /* Open the file the transient settings get written into. It stays open for as long as the
+         * transient unit is being put together, and is closed in unit_load(), as soon as loading of the
+         * file begins. */
+
+        WITH_UMASK(0022) {
+                file = fopen(fragment, "we");
+                if (!file)
+                        return -errno;
+        }
+
+        /* Only now that the new file is open, let go of a previously opened one. */
+        safe_fclose(u->transient_file);
+        u->transient_file = file;
+
+        /* The freshly created file is the unit's fragment now; forget all other file associations. */
+        free_and_replace(u->fragment_path, fragment);
+        u->source_path = mfree(u->source_path);
+        u->dropin_paths = strv_free(u->dropin_paths);
+
+        u->fragment_mtime = 0;
+        u->source_mtime = 0;
+        u->dropin_mtime = 0;
+
+        u->transient = true;
+        u->load_state = UNIT_STUB;
+        u->load_error = 0;
+
+        unit_add_to_dbus_queue(u);
+        unit_add_to_gc_queue(u);
+
+        fputs("# This is a transient unit file, created programmatically via the systemd API. Do not edit.\n",
+              u->transient_file);
+
+        return 0;
+}
+
+/* Per-process callback used while killing unit processes: logs a notice naming the process and the
+ * signal. 'userdata' is passed to log_unit_notice() as the unit to log for. Always returns 1. */
+static int log_kill(const PidRef *pid, int sig, void *userdata) {
+        _cleanup_free_ char *name = NULL;
+        bool bracketed;
+
+        assert(pidref_is_set(pid));
+
+        /* Best effort; if the name cannot be determined a placeholder is logged instead. */
+        (void) pidref_get_comm(pid, &name);
+
+        /* Processes whose name is put in brackets are assumed to be temporary ones only, for example
+         * systemd's own PAM stub process, hence stay quiet about those. */
+        bracketed = name && name[0] == '(';
+
+        if (!bracketed)
+                log_unit_notice(userdata,
+                                "Killing process " PID_FMT " (%s) with signal SIG%s.",
+                                pid->pid,
+                                strna(name),
+                                signal_to_string(sig));
+
+        /* This callback is used by unit_kill_context, hence return 1 whether or not anything was logged,
+         * to let the manager know that a process was killed. */
+        return 1;
+}
+
+/* Maps a kill operation to the signal configured for it in the kill context. *ret_noteworthy is set to
+ * true for the KILL_KILL and KILL_WATCHDOG operations and to false for all others. */
+static int operation_to_signal(
+                const KillContext *c,
+                KillOperation k,
+                bool *ret_noteworthy) {
+
+        bool noteworthy;
+        int sig;
+
+        assert(c);
+
+        switch (k) {
+
+        case KILL_TERMINATE:
+        case KILL_TERMINATE_AND_LOG:
+                noteworthy = false;
+                sig = c->kill_signal;
+                break;
+
+        case KILL_RESTART:
+                noteworthy = false;
+                sig = restart_kill_signal(c);
+                break;
+
+        case KILL_KILL:
+                noteworthy = true;
+                sig = c->final_kill_signal;
+                break;
+
+        case KILL_WATCHDOG:
+                noteworthy = true;
+                sig = c->watchdog_signal;
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        *ret_noteworthy = noteworthy;
+        return sig;
+}
+
+/* Kills the processes belonging to unit 'u' in preparation for shutting the unit down.
+ *
+ *   c              kill context supplying the kill mode, the signals and the SIGHUP policy
+ *   k              the operation that selects which of the configured signals is sent
+ *   main_pid       main process of the unit, may be NULL or unset
+ *   control_pid    control process of the unit, may be NULL or unset
+ *   main_pid_alien if true, signalling the main process does not by itself make us wait for its exit
+ *
+ * Returns > 0 if something worth waiting for was killed, 0 if not (this includes KillMode=none), and
+ * -ENOMEM if the set of PIDs to exclude from cgroup killing cannot be allocated. Failures to deliver
+ * signals are only logged. */
+int unit_kill_context(
+                Unit *u,
+                KillContext *c,
+                KillOperation k,
+                PidRef* main_pid,
+                PidRef* control_pid,
+                bool main_pid_alien) {
+
+        bool wait_for_exit = false, send_sighup;
+        cg_kill_log_func_t log_func = NULL;
+        int sig, r;
+
+        assert(u);
+        assert(c);
+
+        /* Kill the processes belonging to this unit, in preparation for shutting the unit down.  Returns > 0
+         * if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common()
+         * which is used for user-requested killing of unit processes. */
+
+        if (c->kill_mode == KILL_NONE)
+                return 0;
+
+        /* Only the noteworthy operations get a log line per killed process. */
+        bool noteworthy;
+        sig = operation_to_signal(c, k, &noteworthy);
+        if (noteworthy)
+                log_func = log_kill;
+
+        /* SIGHUP is only sent as a follow-up on termination requests, and not if it is the signal we
+         * send anyway. */
+        send_sighup =
+                c->send_sighup &&
+                IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) &&
+                sig != SIGHUP;
+
+        if (pidref_is_set(main_pid)) {
+                if (log_func)
+                        log_func(main_pid, sig, u);
+
+                r = pidref_kill_and_sigcont(main_pid, sig);
+                if (r < 0 && r != -ESRCH) {
+                        _cleanup_free_ char *comm = NULL;
+                        (void) pidref_get_comm(main_pid, &comm);
+
+                        log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid->pid, strna(comm));
+                } else {
+                        /* Note that -ESRCH ends up here too, it is not treated as a failure. */
+                        if (!main_pid_alien)
+                                wait_for_exit = true;
+
+                        if (r != -ESRCH && send_sighup)
+                                (void) pidref_kill(main_pid, SIGHUP);
+                }
+        }
+
+        if (pidref_is_set(control_pid)) {
+                if (log_func)
+                        log_func(control_pid, sig, u);
+
+                r = pidref_kill_and_sigcont(control_pid, sig);
+                if (r < 0 && r != -ESRCH) {
+                        _cleanup_free_ char *comm = NULL;
+                        (void) pidref_get_comm(control_pid, &comm);
+
+                        log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid->pid, strna(comm));
+                } else {
+                        wait_for_exit = true;
+
+                        if (r != -ESRCH && send_sighup)
+                                (void) pidref_kill(control_pid, SIGHUP);
+                }
+        }
+
+        /* The remaining processes of the cgroup are killed in control-group mode always, and in mixed
+         * mode only for the final kill operation. */
+        if (u->cgroup_path &&
+            (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) {
+                _cleanup_set_free_ Set *pid_set = NULL;
+
+                /* Exclude the main/control pids from being killed via the cgroup */
+                pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
+                if (!pid_set)
+                        return -ENOMEM;
+
+                r = cg_kill_recursive(
+                                u->cgroup_path,
+                                sig,
+                                CGROUP_SIGCONT|CGROUP_IGNORE_SELF,
+                                pid_set,
+                                log_func, u);
+                if (r < 0) {
+                        if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT))
+                                log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+                } else if (r > 0) {
+
+                        /* FIXME: For now, on the legacy hierarchy, we will not wait for the cgroup members to die if
+                         * we are running in a container or if this is a delegation unit, simply because cgroup
+                         * notification is unreliable in these cases. It doesn't work at all in containers, and outside
+                         * of containers it can be confused easily by left-over directories in the cgroup — which
+                         * however should not exist in non-delegated units. On the unified hierarchy that's different,
+                         * there we get proper events. Hence rely on them. */
+
+                        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0 ||
+                            (detect_container() == 0 && !unit_cgroup_delegate(u)))
+                                wait_for_exit = true;
+
+                        if (send_sighup) {
+                                /* Start the SIGHUP pass from a fresh exclusion set holding just the
+                                 * main/control pids again. */
+                                set_free(pid_set);
+
+                                pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
+                                if (!pid_set)
+                                        return -ENOMEM;
+
+                                (void) cg_kill_recursive(
+                                                u->cgroup_path,
+                                                SIGHUP,
+                                                CGROUP_IGNORE_SELF,
+                                                pid_set,
+                                                /* kill_log= */ NULL,
+                                                /* userdata= */ NULL);
+                        }
+                }
+        }
+
+        return wait_for_exit;
+}
+
+/* Records that unit 'u' requires the mounts for 'path' (and thus for all of its prefixes), with 'mask'
+ * noting where the requirement originates from.
+ *
+ * Returns 0 on success or if the path is already registered for the unit, -EINVAL if 'path' is not
+ * absolute, -EPERM if it is not normalized even after simplification, -ENOMEM on allocation failure,
+ * or another negative error returned by the helpers called. */
+int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) {
+        int r;
+
+        assert(u);
+        assert(path);
+
+        /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these
+         * paths in the unit (from the path to the UnitDependencyInfo structure indicating how the
+         * dependency came to be). However, we build a prefix table for all possible prefixes so that new
+         * appearing mount units can easily determine which units to make themselves a dependency of. */
+
+        if (!path_is_absolute(path))
+                return -EINVAL;
+
+        if (hashmap_contains(u->requires_mounts_for, path)) /* Exit quickly if the path is already covered. */
+                return 0;
+
+        /* Use the canonical form of the path as the stored key. We call path_is_normalized()
+         * only after simplification, since path_is_normalized() rejects paths with '.'.
+         * path_is_normalized() also verifies that the path fits in PATH_MAX. */
+        _cleanup_free_ char *p = NULL;
+        r = path_simplify_alloc(path, &p);
+        if (r < 0)
+                return r;
+        path = p; /* from here on work with the simplified copy only */
+
+        if (!path_is_normalized(path))
+                return -EPERM;
+
+        UnitDependencyInfo di = {
+                .origin_mask = mask
+        };
+
+        r = hashmap_ensure_put(&u->requires_mounts_for, &path_hash_ops, p, di.data);
+        if (r < 0)
+                return r;
+        assert(r > 0);
+        TAKE_PTR(p); /* path remains a valid pointer to the string stored in the hashmap */
+
+        /* Scratch buffer for the prefixes iterated below; none of them is longer than path itself. */
+        char prefix[strlen(path) + 1];
+        PATH_FOREACH_PREFIX_MORE(prefix, path) {
+                Set *x;
+
+                /* Look up the set of units registered for this prefix in the manager-wide table,
+                 * creating both the table and the set on first use. */
+                x = hashmap_get(u->manager->units_requiring_mounts_for, prefix);
+                if (!x) {
+                        _cleanup_free_ char *q = NULL;
+
+                        r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops);
+                        if (r < 0)
+                                return r;
+
+                        q = strdup(prefix);
+                        if (!q)
+                                return -ENOMEM;
+
+                        x = set_new(NULL);
+                        if (!x)
+                                return -ENOMEM;
+
+                        r = hashmap_put(u->manager->units_requiring_mounts_for, q, x);
+                        if (r < 0) {
+                                set_free(x);
+                                return r;
+                        }
+                        q = NULL; /* the key is referenced by the hashmap now, don't free it on scope exit */
+                }
+
+                r = set_put(x, u);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Makes sure the unit has an ExecRuntime object, creating one if its slot is still empty. A shared
+ * runtime is first looked for among the units this one joins the namespace of, and only acquired for
+ * the unit itself if none of them provides one. Returns 0 if a runtime was already set, the result of
+ * exec_runtime_make() if one was created, or a negative error. */
+int unit_setup_exec_runtime(Unit *u) {
+        _cleanup_(exec_shared_runtime_unrefp) ExecSharedRuntime *shared = NULL;
+        _cleanup_(dynamic_creds_unrefp) DynamicCreds *creds = NULL;
+        _cleanup_set_free_ Set *peers = NULL;
+        ExecRuntime **slot;
+        ExecContext *ec;
+        size_t off;
+        Unit *peer;
+        int r;
+
+        off = UNIT_VTABLE(u)->exec_runtime_offset;
+        assert(off > 0);
+
+        /* Nothing to do if the unit's runtime slot is already populated. */
+        slot = (ExecRuntime**) ((uint8_t*) u + off);
+        if (*slot)
+                return 0;
+
+        ec = unit_get_exec_context(u);
+        assert(ec);
+
+        r = unit_get_transitive_dependency_set(u, UNIT_ATOM_JOINS_NAMESPACE_OF, &peers);
+        if (r < 0)
+                return r;
+
+        /* First choice: pick up the shared runtime of one of the peers. */
+        SET_FOREACH(peer, peers) {
+                r = exec_shared_runtime_acquire(u->manager, NULL, peer->id, false, &shared);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        break;
+        }
+
+        /* Otherwise acquire one under our own name. */
+        if (!shared) {
+                r = exec_shared_runtime_acquire(u->manager, ec, u->id, true, &shared);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ec->dynamic_user) {
+                r = dynamic_creds_make(u->manager, ec->user, ec->group, &creds);
+                if (r < 0)
+                        return r;
+        }
+
+        r = exec_runtime_make(u, ec, shared, creds, slot);
+        if (r < 0)
+                return r;
+
+        /* On success both objects are kept by the new runtime, hence disarm the cleanup handlers. */
+        TAKE_PTR(shared);
+        TAKE_PTR(creds);
+
+        return r;
+}
+
+/* Tells whether units of type 't' are supported. A type can be switched off through the environment
+ * variable $SYSTEMD_SUPPORT_<TYPE> (type name in upper case); the outcome of that lookup is cached per
+ * type. If not switched off, the type's own supported() hook decides, if it has one. */
+bool unit_type_supported(UnitType t) {
+        static int8_t cache[_UNIT_TYPE_MAX] = {}; /* -1: disabled, 1: enabled: 0: don't know */
+
+        assert(t >= 0 && t < _UNIT_TYPE_MAX);
+
+        if (cache[t] == 0) {
+                char *var = strjoina("SYSTEMD_SUPPORT_", unit_type_to_string(t));
+                int r = getenv_bool(ascii_strupper(var));
+
+                if (r < 0 && r != -ENXIO)
+                        log_debug_errno(r, "Failed to parse $%s, ignoring: %m", var);
+
+                /* Only an explicit "false" disables the type; an unset or unparsable variable does not. */
+                if (r == 0)
+                        cache[t] = -1;
+                else
+                        cache[t] = 1;
+        }
+
+        if (cache[t] < 0)
+                return false;
+
+        /* Types without a supported() hook are always considered supported. */
+        return unit_vtable[t]->supported ? unit_vtable[t]->supported() : true;
+}
+
+/* Logs a structured notice for the unit if the directory 'where', which is about to be mounted over, is
+ * not empty. Stays silent if 'where' is empty or not a directory; any other failure to inspect it is
+ * logged as a warning. */
+void unit_warn_if_dir_nonempty(Unit *u, const char* where) {
+        int r;
+
+        assert(u);
+        assert(where);
+
+        /* Presumably this checks whether a notice would be logged for this unit at all; if not, don't
+         * bother looking at the directory. */
+        if (!unit_log_level_test(u, LOG_NOTICE))
+                return;
+
+        r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+        if (r == -ENOTDIR)
+                return;
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to check directory %s: %m", where);
+                return;
+        }
+        if (r > 0) /* empty, nothing to complain about */
+                return;
+
+        log_unit_struct(u, LOG_NOTICE,
+                        "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR,
+                        LOG_UNIT_INVOCATION_ID(u),
+                        LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where),
+                        "WHERE=%s", where);
+}
+
+/* Refuses mount paths that do not resolve to themselves. Returns 0 if 'where' is equal to its resolved
+ * form, or if the resolution itself fails (which is only logged at debug level), and -ELOOP after
+ * logging an error if the two differ. */
+int unit_fail_if_noncanonical(Unit *u, const char* where) {
+        _cleanup_free_ char *resolved = NULL;
+        int r;
+
+        assert(u);
+        assert(where);
+
+        r = chase(where, NULL, CHASE_NONEXISTENT, &resolved, NULL);
+        if (r < 0) {
+                log_unit_debug_errno(u, r, "Failed to check %s for symlinks, ignoring: %m", where);
+                return 0;
+        }
+
+        /* A trailing slash, or any other redundant slashes, are not held against the path. */
+        if (!path_equal(where, resolved)) {
+                /* "." and ".." need no mention here, unit_name_from_path() would have rejected them already. */
+                log_unit_struct(u, LOG_ERR,
+                                "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR,
+                                LOG_UNIT_INVOCATION_ID(u),
+                                LOG_UNIT_MESSAGE(u, "Mount path %s is not canonical (contains a symlink).", where),
+                                "WHERE=%s", where);
+
+                return -ELOOP;
+        }
+
+        return 0;
+}
+
+bool unit_is_pristine(Unit *u) {
+        assert(u);
+
+        /* Determines, in a number of different ways, whether the unit already exists or is already
+         * around. Units marked UNIT_LOADED are generally fine even though nothing was actually loaded,
+         * to cater for unit types such as slice, which do not require a file on disk.
+         *
+         * Drop-ins are deliberately not looked at, since they are allowed for transient units exactly
+         * like for non-transient ones, both unit-specific and hierarchical. E.g. for a-b-c.service:
+         * service.d/….conf, a-.service.d/….conf, a-b-.service.d/….conf, a-b-c.service.d/….conf.
+         */
+
+        if (!IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_LOADED))
+                return false;
+
+        /* Backed by a file on disk? */
+        if (u->fragment_path || u->source_path)
+                return false;
+
+        /* Has a job attached, or was merged into another unit? */
+        if (u->job || u->merged_into)
+                return false;
+
+        return true;
+}
+
+/* Returns what the unit type's control_pid() hook reports for this unit, or NULL if the type does not
+ * implement that hook. */
+PidRef* unit_control_pid(Unit *u) {
+        assert(u);
+
+        return UNIT_VTABLE(u)->control_pid ? UNIT_VTABLE(u)->control_pid(u) : NULL;
+}
+
+/* Returns what the unit type's main_pid() hook reports for this unit, or NULL if the type does not
+ * implement that hook. */
+PidRef* unit_main_pid(Unit *u) {
+        assert(u);
+
+        return UNIT_VTABLE(u)->main_pid ? UNIT_VTABLE(u)->main_pid(u) : NULL;
+}
+
+/* Adds 'element' to (add == true) or deletes it from (add == false) every NFT set of the unit's cgroup
+ * context whose source matches 'source'. Only does something for the system manager and for units that
+ * have a cgroup context. Failures are logged and otherwise ignored. */
+static void unit_modify_user_nft_set(Unit *u, bool add, NFTSetSource source, uint32_t element) {
+        CGroupContext *cc;
+        Manager *m;
+        int r;
+
+        assert(u);
+
+        m = u->manager;
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return;
+
+        cc = unit_get_cgroup_context(u);
+        if (!cc)
+                return;
+
+        /* Set up the manager's firewall context on first use. */
+        if (!m->fw_ctx) {
+                r = fw_ctx_new_full(&m->fw_ctx, /* init_tables= */ false);
+                if (r < 0)
+                        return;
+
+                assert(m->fw_ctx);
+        }
+
+        FOREACH_ARRAY(entry, cc->nft_set_context.sets, cc->nft_set_context.n_sets) {
+                if (entry->source != source)
+                        continue;
+
+                r = nft_set_element_modify_any(m->fw_ctx, add, entry->nfproto, entry->table, entry->set, &element, sizeof(element));
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, ID %u, ignoring: %m",
+                                          add? "add" : "delete", nfproto_to_string(entry->nfproto), entry->table, entry->set, element);
+                        continue;
+                }
+
+                log_debug("%s NFT set: family %s, table %s, set %s, ID %u",
+                          add? "Added" : "Deleted", nfproto_to_string(entry->nfproto), entry->table, entry->set, element);
+        }
+}
+
+static void unit_unref_uid_internal(
+                Unit *u,
+                uid_t *ref_uid,
+                bool destroy_now,
+                void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) {
+
+        /* Common backend of unit_unref_uid() and unit_unref_gid(), relying on uid_t and gid_t actually
+         * being the same type, with the same validity rules.
+         *
+         * Releases the unit's reference on the UID/GID stored in *ref_uid through the callback passed
+         * in, and marks *ref_uid invalid afterwards. Does nothing if no valid UID/GID is stored. */
+
+        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+        assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+        assert(u);
+        assert(ref_uid);
+        assert(_manager_unref_uid);
+
+        if (uid_is_valid(*ref_uid)) {
+                _manager_unref_uid(u->manager, *ref_uid, destroy_now);
+                *ref_uid = UID_INVALID;
+        }
+}
+
+/* Drops the unit's UID reference: takes the UID out of the matching NFT sets and then releases the
+ * reference held on it. */
+static void unit_unref_uid(Unit *u, bool destroy_now) {
+        assert(u);
+
+        /* This has to come first, since the unref call below resets u->ref_uid to UID_INVALID. */
+        unit_modify_user_nft_set(u, /* add= */ false, NFT_SET_SOURCE_USER, u->ref_uid);
+        unit_unref_uid_internal(u, &u->ref_uid, destroy_now, manager_unref_uid);
+}
+
+static void unit_unref_gid(Unit *u, bool destroy_now) {
+        /* GID counterpart of unit_unref_uid(): deletes the GID from the sets with the "group" source, then
+         * gives the reference back to the manager. The cast is fine because unit_unref_uid_internal()
+         * statically checks that uid_t and gid_t have the same size and invalid marker. */
+
+        assert(u);
+
+        unit_modify_user_nft_set(u, /* add = */ false, NFT_SET_SOURCE_GROUP, u->ref_gid);
+        unit_unref_uid_internal(u, (uid_t*) &u->ref_gid, destroy_now, manager_unref_gid);
+}
+
+void unit_unref_uid_gid(Unit *u, bool destroy_now) {
+        /* Releases both the UID and the GID reference held by this unit, UID first. */
+
+        assert(u);
+
+        unit_unref_uid(u, destroy_now);
+        unit_unref_gid(u, destroy_now);
+}
+
+static int unit_ref_uid_internal(
+                Unit *u,
+                uid_t *ref_uid,
+                uid_t uid,
+                bool clean_ipc,
+                int (*_manager_ref_uid)(Manager *m, uid_t uid, bool clean_ipc)) {
+
+        int k;
+
+        /* Shared backend of unit_ref_uid() and unit_ref_gid(). It treats uid_t and gid_t as
+         * interchangeable; the compile-time checks below verify that their sizes and their "invalid"
+         * markers match.
+         *
+         * Adds a reference on a specific UID/GID to this unit. Each unit referencing the same UID/GID
+         * maintains a reference so that we can destroy the UID/GID's IPC resources as soon as this is
+         * requested and the counter drops to zero.
+         *
+         * Returns 0 if the unit already references this very ID, 1 if a new reference was taken, -EBUSY if
+         * the unit already references a different ID, or the error reported by the manager callback. */
+
+        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+        assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+        assert(u);
+        assert(ref_uid);
+        assert(uid_is_valid(uid));
+        assert(_manager_ref_uid);
+
+        if (*ref_uid == uid) /* We hold exactly this reference already, nothing to do */
+                return 0;
+
+        if (uid_is_valid(*ref_uid)) /* Some other ID is referenced already */
+                return -EBUSY;
+
+        k = _manager_ref_uid(u->manager, uid, clean_ipc);
+        if (k >= 0) {
+                *ref_uid = uid;
+                return 1;
+        }
+
+        return k;
+}
+
+static int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc) {
+        /* UID flavour of unit_ref_uid_internal(); see there for the return values. */
+        return unit_ref_uid_internal(u, &u->ref_uid, uid, clean_ipc, manager_ref_uid);
+}
+
+static int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc) {
+        /* GID flavour of unit_ref_uid_internal(); see there for the return values. */
+        return unit_ref_uid_internal(u, (uid_t*) &u->ref_gid, (uid_t) gid, clean_ipc, manager_ref_gid);
+}
+
+static int unit_ref_uid_gid_internal(Unit *u, uid_t uid, gid_t gid, bool clean_ipc) {
+        int uid_result = 0, gid_result = 0;
+
+        assert(u);
+
+        /* References a UID and a GID in a single step: either both references are taken, or neither is.
+         * Invalid IDs are skipped. Returns > 0 if at least one new reference was added, 0 if nothing
+         * changed, and a negative errno-style value on failure. */
+
+        if (uid_is_valid(uid)) {
+                uid_result = unit_ref_uid(u, uid, clean_ipc);
+                if (uid_result < 0)
+                        return uid_result;
+        }
+
+        if (gid_is_valid(gid)) {
+                gid_result = unit_ref_gid(u, gid, clean_ipc);
+                if (gid_result < 0) {
+                        /* Roll back the UID reference, but only if we were the ones who just added it */
+                        if (uid_result > 0)
+                                unit_unref_uid(u, false);
+
+                        return gid_result;
+                }
+        }
+
+        if (uid_result > 0)
+                return 1;
+
+        return gid_result > 0;
+}
+
+int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid) {
+        ExecContext *c;
+        bool clean_ipc;
+        int r;
+
+        assert(u);
+
+        /* References the given UID/GID pair for this unit and, on success, adds both IDs to the unit's
+         * NFT sets. IPC clean-up is requested only if the unit has an exec context with remove_ipc set. */
+
+        c = unit_get_exec_context(u);
+        clean_ipc = c ? c->remove_ipc : false;
+
+        r = unit_ref_uid_gid_internal(u, uid, gid, clean_ipc);
+        if (r < 0)
+                return log_unit_warning_errno(u, r, "Couldn't add UID/GID reference to unit, proceeding without: %m");
+
+        unit_modify_user_nft_set(u, /* add = */ true, NFT_SET_SOURCE_USER, uid);
+        unit_modify_user_nft_set(u, /* add = */ true, NFT_SET_SOURCE_GROUP, gid);
+
+        return r;
+}
+
+void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) {
+        assert(u);
+
+        /* This is invoked whenever one of the forked off processes lets us know the UID/GID its user
+         * name/group name resolved to. We keep track of which UID/GID is currently assigned in order to be
+         * able to destroy its IPC objects when no service references the UID/GID anymore.
+         *
+         * The unit is queued for a D-Bus update only if a new reference was actually taken. */
+
+        if (unit_ref_uid_gid(u, uid, gid) > 0)
+                unit_add_to_dbus_queue(u);
+}
+
+int unit_acquire_invocation_id(Unit *u) {
+        sd_id128_t fresh_id;
+        int r;
+
+        assert(u);
+
+        /* Generates a random 128-bit ID, installs it as the unit's invocation ID and queues the unit for a
+         * D-Bus update. Returns 0 on success, or a negative error (which has been logged already). */
+
+        r = sd_id128_randomize(&fresh_id);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to generate invocation ID for unit: %m");
+
+        r = unit_set_invocation_id(u, fresh_id);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to set invocation ID for unit: %m");
+
+        unit_add_to_dbus_queue(u);
+        return 0;
+}
+
+int unit_set_exec_params(Unit *u, ExecParameters *p) {
+        const char *spawn_console;
+        Manager *m;
+        int r;
+
+        assert(u);
+        assert(p);
+
+        /* Fills in 'p' from the state of the unit and its manager. Returns 0 on success and a negative
+         * error otherwise; on failure 'p' may have been filled in partially. The statement order below is
+         * deliberate, since each early return determines which fields were set by then. */
+
+        m = u->manager;
+
+        /* Settings inherited from the manager */
+        r = manager_get_effective_environment(m, &p->environment);
+        if (r < 0)
+                return r;
+
+        p->runtime_scope = m->runtime_scope;
+
+        spawn_console = manager_get_confirm_spawn(m);
+        if (spawn_console) {
+                p->confirm_spawn = strdup(spawn_console);
+                if (!p->confirm_spawn)
+                        return -ENOMEM;
+        }
+
+        p->cgroup_supported = m->cgroup_supported;
+        p->prefix = m->prefix;
+        SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(m));
+
+        /* Settings taken from the unit itself */
+        p->cgroup_path = u->cgroup_path;
+        SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u));
+
+        p->received_credentials_directory = m->received_credentials_directory;
+        p->received_encrypted_credentials_directory = m->received_encrypted_credentials_directory;
+
+        p->shall_confirm_spawn = m->confirm_spawn;
+
+        p->fallback_smack_process_label = m->defaults.smack_process_label;
+
+        /* Only look up the BPF map fd if the manager restricts file systems and none was set so far */
+        if (m->restrict_fs && p->bpf_outer_map_fd < 0) {
+                int map_fd;
+
+                map_fd = lsm_bpf_map_restrict_fs_fd(u);
+                if (map_fd < 0)
+                        return map_fd;
+
+                p->bpf_outer_map_fd = map_fd;
+        }
+
+        p->user_lookup_fd = m->user_lookup_fds[1];
+
+        p->cgroup_id = u->cgroup_id;
+        p->invocation_id = u->invocation_id;
+        sd_id128_to_string(p->invocation_id, p->invocation_id_string);
+
+        p->unit_id = strdup(u->id);
+        if (!p->unit_id)
+                return -ENOMEM;
+
+        return 0;
+}
+
+int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) {
+        pid_t pid;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        /* Forks off a helper process and makes sure it is a member of the unit's cgroup. Returns == 0 in
+         * the child, > 0 in the parent, and a negative error if forking (or setting up the PidRef) fails.
+         * In the parent, 'ret' receives a PidRef for the child; in the child 'ret' is left untouched. */
+
+        (void) unit_realize_cgroup(u);
+
+        r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid);
+        if (r < 0)
+                return r;
+
+        if (r == 0) {
+                /* Child */
+
+                (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE);
+                (void) ignore_signals(SIGPIPE);
+
+                if (u->cgroup_path) {
+                        r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL);
+                        if (r < 0) {
+                                log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(u->cgroup_path));
+                                _exit(EXIT_CGROUP);
+                        }
+                }
+
+                return 0;
+        }
+
+        /* Parent */
+
+        _cleanup_(pidref_done) PidRef child = PIDREF_NULL;
+        int q;
+
+        q = pidref_set_pid(&child, pid);
+        if (q < 0)
+                return q;
+
+        *ret = TAKE_PIDREF(child);
+        return r;
+}
+
+int unit_fork_and_watch_rm_rf(Unit *u, char **paths, PidRef *ret_pid) {
+        _cleanup_(pidref_done) PidRef helper = PIDREF_NULL;
+        int r;
+
+        assert(u);
+        assert(ret_pid);
+
+        /* Forks off a "(sd-rmrf)" helper in the unit's cgroup that removes each entry of 'paths', and
+         * starts watching that helper on behalf of the unit. On success 0 is returned and 'ret_pid'
+         * receives the helper's PidRef. */
+
+        r = unit_fork_helper_process(u, "(sd-rmrf)", &helper);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child: keep going after a failed removal, but remember the failure for the exit code */
+                int status = EXIT_SUCCESS;
+
+                STRV_FOREACH(i, paths) {
+                        r = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK);
+                        if (r >= 0)
+                                continue;
+
+                        log_error_errno(r, "Failed to remove '%s': %m", *i);
+                        status = EXIT_FAILURE;
+                }
+
+                _exit(status);
+        }
+
+        /* Parent */
+        r = unit_watch_pidref(u, &helper, /* exclusive= */ true);
+        if (r < 0)
+                return r;
+
+        *ret_pid = TAKE_PIDREF(helper);
+        return 0;
+}
+
+static void unit_update_dependency_mask(Hashmap *deps, Unit *other, UnitDependencyInfo di) {
+        assert(deps);
+        assert(other);
+
+        /* Writes the (reduced) dependency info 'di' for 'other' back into 'deps'. Both operations are
+         * expected to succeed, and this is asserted. */
+
+        if (di.origin_mask != 0 || di.destination_mask != 0) {
+                /* Some bits are still set, hence store the new mask */
+                assert_se(hashmap_update(deps, other, di.data) == 0);
+                return;
+        }
+
+        /* Nothing left in either mask, hence drop the entry altogether */
+        assert_se(hashmap_remove(deps, other));
+}
+
+void unit_remove_dependencies(Unit *u, UnitDependencyMask mask) {
+        Hashmap *deps;
+        assert(u);
+
+        /* Removes all dependencies u has on other units marked for ownership by 'mask'.
+         *
+         * For every per-type dependency map of 'u', the bits in 'mask' are cleared from the origin mask of
+         * each entry, and the matching destination mask bits are cleared from the entries pointing back at
+         * 'u' in the other unit's maps. Entries whose masks become empty are dropped. Each affected unit
+         * is queued for GC and for the stop-when-unneeded check. */
+
+        if (mask == 0)
+                return;
+
+        HASHMAP_FOREACH(deps, u->dependencies) {
+                bool done;
+
+                /* After each modification of 'deps' the inner iteration is abandoned and started over;
+                 * presumably because the hashmap must not be modified while it is being iterated. The
+                 * loop ends once a full pass found nothing left to change. */
+                do {
+                        UnitDependencyInfo di;
+                        Unit *other;
+
+                        done = true;
+
+                        HASHMAP_FOREACH_KEY(di.data, other, deps) {
+                                Hashmap *other_deps;
+
+                                /* Skip entries that have none of the bits in 'mask' set in the origin mask */
+                                if (FLAGS_SET(~mask, di.origin_mask))
+                                        continue;
+
+                                di.origin_mask &= ~mask;
+                                unit_update_dependency_mask(deps, other, di);
+
+                                /* We updated the dependency from our unit to the other unit now. But most
+                                 * dependencies imply a reverse dependency. Hence, let's delete that one
+                                 * too. For that we go through all dependency types on the other unit and
+                                 * delete all those which point to us and have the right mask set. */
+
+                                HASHMAP_FOREACH(other_deps, other->dependencies) {
+                                        UnitDependencyInfo dj;
+
+                                        /* If 'u' is not in this map the lookup yields NULL; presumably that
+                                         * reads as an all-zero mask, so the check below skips it. */
+                                        dj.data = hashmap_get(other_deps, u);
+                                        if (FLAGS_SET(~mask, dj.destination_mask))
+                                                continue;
+
+                                        dj.destination_mask &= ~mask;
+                                        unit_update_dependency_mask(other_deps, u, dj);
+                                }
+
+                                unit_add_to_gc_queue(other);
+
+                                /* The unit 'other' may not be wanted by the unit 'u'. */
+                                unit_submit_to_stop_when_unneeded_queue(other);
+
+                                /* We changed 'deps', hence restart the iteration from scratch */
+                                done = false;
+                                break;
+                        }
+
+                } while (!done);
+        }
+}
+
+static int unit_get_invocation_path(Unit *u, char **ret) {
+        _cleanup_free_ char *user_prefix = NULL;
+        const char *prefix;
+        char *path;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        /* Builds the path of the invocation ID symlink of this unit: below /run/systemd/units/ for the
+         * system manager, below the XDG user runtime directory otherwise. On success the newly allocated
+         * string is returned in 'ret' and ownership passes to the caller. */
+
+        if (MANAGER_IS_SYSTEM(u->manager))
+                prefix = "/run/systemd/units/invocation:";
+        else {
+                r = xdg_user_runtime_dir(&user_prefix, "/systemd/units/invocation:");
+                if (r < 0)
+                        return r;
+
+                prefix = user_prefix;
+        }
+
+        path = strjoin(prefix, u->id);
+        if (!path)
+                return -ENOMEM;
+
+        *ret = path;
+        return 0;
+}
+
+static int unit_export_invocation_id(Unit *u) {
+        _cleanup_free_ char *link_path = NULL;
+        int r;
+
+        assert(u);
+
+        /* Exports the invocation ID as a symlink whose target is the ID string. This is a no-op if the
+         * ID was exported already, or if no invocation ID is set. */
+
+        if (u->exported_invocation_id || sd_id128_is_null(u->invocation_id))
+                return 0;
+
+        r = unit_get_invocation_path(u, &link_path);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to get invocation path: %m");
+
+        r = symlink_atomic_label(u->invocation_id_string, link_path);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to create invocation ID symlink %s: %m", link_path);
+
+        u->exported_invocation_id = true;
+        return 0;
+}
+
+static int unit_export_log_level_max(Unit *u, const ExecContext *c) {
+        const char *link_path;
+        char digit[2];
+        int r;
+
+        assert(u);
+        assert(c);
+
+        /* Exports the maximum log level as a symlink whose target is the single decimal digit of the
+         * level. This is a no-op if it was exported already, or if no maximum (< 0) is configured. */
+
+        if (u->exported_log_level_max || c->log_level_max < 0)
+                return 0;
+
+        /* The level must fit into a single digit */
+        assert(c->log_level_max <= 7);
+
+        digit[0] = '0' + c->log_level_max;
+        digit[1] = 0;
+
+        link_path = strjoina("/run/systemd/units/log-level-max:", u->id);
+
+        r = symlink_atomic(digit, link_path);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to create maximum log level symlink %s: %m", link_path);
+
+        u->exported_log_level_max = true;
+        return 0;
+}
+
+static int unit_export_log_extra_fields(Unit *u, const ExecContext *c) {
+        _cleanup_close_ int fd = -EBADF;
+        struct iovec *iovec;
+        const char *p;
+        char *pattern;
+        le64_t *sizes;
+        size_t expected = 0;
+        ssize_t n;
+        int r;
+
+        assert(u);
+        assert(c);
+
+        /* Exports the configured extra log fields to /run/systemd/units/log-extra-fields:<unit id>. The
+         * file is a sequence of records, each consisting of a little-endian 64-bit length followed by
+         * that many bytes of field data. It is written to a temporary file first and then renamed into
+         * place, so that readers never see a partially written file.
+         *
+         * This is a no-op if the fields were exported already, or if there are none. Returns 0 on success
+         * and a negative error otherwise. */
+
+        if (u->exported_log_extra_fields)
+                return 0;
+
+        if (c->n_log_extra_fields <= 0)
+                return 0;
+
+        sizes = newa(le64_t, c->n_log_extra_fields);
+        iovec = newa(struct iovec, c->n_log_extra_fields * 2);
+
+        for (size_t i = 0; i < c->n_log_extra_fields; i++) {
+                sizes[i] = htole64(c->log_extra_fields[i].iov_len);
+
+                /* Two iovec entries per field: the length prefix, then the payload */
+                iovec[i*2] = IOVEC_MAKE(sizes + i, sizeof(le64_t));
+                iovec[i*2+1] = c->log_extra_fields[i];
+
+                expected += sizeof(le64_t) + c->log_extra_fields[i].iov_len;
+        }
+
+        p = strjoina("/run/systemd/units/log-extra-fields:", u->id);
+        pattern = strjoina(p, ".XXXXXX");
+
+        fd = mkostemp_safe(pattern);
+        if (fd < 0)
+                return log_unit_debug_errno(u, fd, "Failed to create extra fields file %s: %m", p);
+
+        n = writev(fd, iovec, c->n_log_extra_fields*2);
+        if (n < 0) {
+                r = log_unit_debug_errno(u, errno, "Failed to write extra fields: %m");
+                goto fail;
+        }
+        if ((size_t) n != expected) {
+                /* writev() may write less than requested. Never publish a truncated file. */
+                r = log_unit_debug_errno(u, SYNTHETIC_ERRNO(EIO), "Short write while writing extra fields.");
+                goto fail;
+        }
+
+        (void) fchmod(fd, 0644);
+
+        if (rename(pattern, p) < 0) {
+                r = log_unit_debug_errno(u, errno, "Failed to rename extra fields file: %m");
+                goto fail;
+        }
+
+        u->exported_log_extra_fields = true;
+        return 0;
+
+fail:
+        /* Don't leave the temporary file around */
+        (void) unlink(pattern);
+        return r;
+}
+
+static int unit_export_log_ratelimit_interval(Unit *u, const ExecContext *c) {
+        _cleanup_free_ char *value = NULL;
+        const char *link_path;
+        int r;
+
+        assert(u);
+        assert(c);
+
+        /* Exports the log rate limit interval (in µs) as a symlink whose target is the decimal value.
+         * This is a no-op if it was exported already, or if the interval is 0. */
+
+        if (u->exported_log_ratelimit_interval || c->log_ratelimit_interval_usec == 0)
+                return 0;
+
+        link_path = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id);
+
+        if (asprintf(&value, "%" PRIu64, c->log_ratelimit_interval_usec) < 0)
+                return log_oom();
+
+        r = symlink_atomic(value, link_path);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to create log rate limit interval symlink %s: %m", link_path);
+
+        u->exported_log_ratelimit_interval = true;
+        return 0;
+}
+
+static int unit_export_log_ratelimit_burst(Unit *u, const ExecContext *c) {
+        _cleanup_free_ char *value = NULL;
+        const char *link_path;
+        int r;
+
+        assert(u);
+        assert(c);
+
+        /* Exports the log rate limit burst as a symlink whose target is the decimal value. This is a
+         * no-op if it was exported already, or if the burst is 0. */
+
+        if (u->exported_log_ratelimit_burst || c->log_ratelimit_burst == 0)
+                return 0;
+
+        link_path = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id);
+
+        if (asprintf(&value, "%u", c->log_ratelimit_burst) < 0)
+                return log_oom();
+
+        r = symlink_atomic(value, link_path);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to create log rate limit burst symlink %s: %m", link_path);
+
+        u->exported_log_ratelimit_burst = true;
+        return 0;
+}
+
+void unit_export_state_files(Unit *u) {
+        const ExecContext *c;
+
+        assert(u);
+
+        /* Nothing to export for units without a name, or when this is merely a test run */
+        if (!u->id || MANAGER_IS_TEST_RUN(u->manager))
+                return;
+
+        /* Exports a couple of unit properties to /run/systemd/units/, so that journald can quickly query
+         * this data from there. Ideally, journald would use IPC to query this, like everybody else, but
+         * that's hard, as long as the IPC system itself and PID 1 also log to the journal.
+         *
+         * Note that these files really shouldn't be considered API for anyone else, as using a runtime
+         * file system as IPC replacement is not compatible with today's world of file system namespaces.
+         * However, this doesn't really apply to communication between the journal and systemd, as we
+         * assume that these two daemons live in the same namespace at least.
+         *
+         * Note that some of the "files" exported here are actually symlinks and not regular files.
+         * Symlinks work better for storing small bits of data, in particular as we can write them with two
+         * system calls, and read them with one.
+         *
+         * All exports are best-effort: failures of the individual helpers are ignored here. */
+
+        (void) unit_export_invocation_id(u);
+
+        /* Everything below is exported by the system manager only */
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return;
+
+        c = unit_get_exec_context(u);
+        if (!c)
+                return;
+
+        (void) unit_export_log_level_max(u, c);
+        (void) unit_export_log_extra_fields(u, c);
+        (void) unit_export_log_ratelimit_interval(u, c);
+        (void) unit_export_log_ratelimit_burst(u, c);
+}
+
+void unit_unlink_state_files(Unit *u) {
+        const char *p;
+
+        assert(u);
+
+        if (!u->id)
+                return;
+
+        /* Undoes the effect of unit_export_state_files(): removes every file or symlink that is marked
+         * as exported, and clears the matching exported_* flag. The unlink() results are deliberately
+         * ignored. */
+
+        if (u->exported_invocation_id) {
+                _cleanup_free_ char *invocation_path = NULL;
+                int r = unit_get_invocation_path(u, &invocation_path);
+                /* If the path cannot be determined the flag stays set, so a later call may try again */
+                if (r >= 0) {
+                        (void) unlink(invocation_path);
+                        u->exported_invocation_id = false;
+                }
+        }
+
+        /* The remaining entries are only ever exported by the system manager */
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return;
+
+        if (u->exported_log_level_max) {
+                p = strjoina("/run/systemd/units/log-level-max:", u->id);
+                (void) unlink(p);
+
+                u->exported_log_level_max = false;
+        }
+
+        if (u->exported_log_extra_fields) {
+                /* This must be the very path unit_export_log_extra_fields() renames its file to,
+                 * i.e. with the "log-" prefix, otherwise the exported file is never removed. */
+                p = strjoina("/run/systemd/units/log-extra-fields:", u->id);
+                (void) unlink(p);
+
+                u->exported_log_extra_fields = false;
+        }
+
+        if (u->exported_log_ratelimit_interval) {
+                p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id);
+                (void) unlink(p);
+
+                u->exported_log_ratelimit_interval = false;
+        }
+
+        if (u->exported_log_ratelimit_burst) {
+                p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id);
+                (void) unlink(p);
+
+                u->exported_log_ratelimit_burst = false;
+        }
+}
+
+int unit_prepare_exec(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Prepares everything so that we can fork off a process for this unit. Returns 0 on success and
+         * a negative error otherwise. */
+
+        /* Load any custom firewall BPF programs here once to test if they are existing and actually
+         * loadable. Fail here early since later errors in the call chain unit_realize_cgroup to
+         * cgroup_context_apply are ignored. */
+        r = bpf_firewall_load_custom(u);
+        if (r < 0)
+                return r;
+
+        (void) unit_realize_cgroup(u);
+
+        /* A pending accounting reset is carried out exactly once */
+        if (u->reset_accounting) {
+                (void) unit_reset_accounting(u);
+                u->reset_accounting = false;
+        }
+
+        unit_export_state_files(u);
+
+        r = unit_setup_exec_runtime(u);
+        return r < 0 ? r : 0;
+}
+
+static bool ignore_leftover_process(const char *comm) {
+        /* A process name starting with '(' is most likely one of our own helper processes (PAM?), which
+         * we don't want to complain about. An unknown (NULL) name is never ignored. */
+        if (!comm)
+                return false;
+
+        return comm[0] == '(';
+}
+
+int unit_log_leftover_process_start(const PidRef *pid, int sig, void *userdata) {
+        _cleanup_free_ char *comm = NULL;
+
+        assert(pidref_is_set(pid));
+
+        /* Logging callback for processes found in the cgroup while starting a unit; 'userdata' is passed
+         * on as the unit to log for. Returns 1 if a message was logged, 0 if the process was ignored. */
+
+        /* Best effort, 'comm' simply stays NULL if the name cannot be determined */
+        (void) pidref_get_comm(pid, &comm);
+
+        if (!ignore_leftover_process(comm)) {
+                /* During start we print a warning */
+                log_unit_warning(userdata,
+                                 "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n"
+                                 "This usually indicates unclean termination of a previous run, or service implementation deficiencies.",
+                                 pid->pid, strna(comm));
+
+                return 1;
+        }
+
+        return 0;
+}
+
+int unit_log_leftover_process_stop(const PidRef *pid, int sig, void *userdata) {
+        _cleanup_free_ char *comm = NULL;
+
+        assert(pidref_is_set(pid));
+
+        /* Logging callback for processes that are still around after a unit stopped; 'userdata' is passed
+         * on as the unit to log for. Returns 1 if a message was logged, 0 if the process was ignored. */
+
+        /* Best effort, 'comm' simply stays NULL if the name cannot be determined */
+        (void) pidref_get_comm(pid, &comm);
+
+        if (!ignore_leftover_process(comm)) {
+                /* During stop we only print an informational message */
+                log_unit_info(userdata,
+                              "Unit process " PID_FMT " (%s) remains running after unit stopped.",
+                              pid->pid, strna(comm));
+
+                return 1;
+        }
+
+        return 0;
+}
+
+int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) {
+        assert(u);
+
+        /* Walks the unit's cgroup with cg_kill_recursive() using signal 0 and hands 'log_func' (with the
+         * unit as userdata) to it. Returns 0 if the unit has no cgroup path, otherwise whatever
+         * cg_kill_recursive() returns. */
+
+        (void) unit_pick_cgroup_path(u);
+
+        if (u->cgroup_path)
+                return cg_kill_recursive(u->cgroup_path, /* sig= */ 0, /* flags= */ 0, /* set= */ NULL, log_func, u);
+
+        return 0;
+}
+
+bool unit_needs_console(Unit *u) {
+        UnitActiveState state;
+        ExecContext *ec;
+
+        assert(u);
+
+        /* Tells whether this unit currently needs the console. Inactive or failed units never do. */
+
+        state = unit_active_state(u);
+        if (UNIT_IS_INACTIVE_OR_FAILED(state))
+                return false;
+
+        /* Prefer the unit type's own implementation, if it has one */
+        if (UNIT_VTABLE(u)->needs_console)
+                return UNIT_VTABLE(u)->needs_console(u);
+
+        /* Generic fallback for unit types that don't implement the call: ask the exec context, if any */
+        ec = unit_get_exec_context(u);
+        return ec ? exec_context_may_touch_console(ec) : false;
+}
+
+int unit_pid_attachable(Unit *u, PidRef *pid, sd_bus_error *error) {
+        int k;
+
+        assert(u);
+
+        /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our
+         * manager itself, and not a kernel thread either. Returns 0 if so; otherwise 'error' is filled in
+         * and the result of setting it is returned. */
+
+        /* First, make sure the reference is set at all */
+        if (!pidref_is_set(pid))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier is not valid.");
+
+        /* Some extra safety check: neither PID 1 nor ourselves */
+        if (pid->pid == 1 || pidref_is_self(pid))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid->pid);
+
+        /* Don't even begin to bother with kernel threads */
+        k = pidref_is_kernel_thread(pid);
+        if (k < 0) {
+                if (k == -ESRCH)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid->pid);
+
+                return sd_bus_error_set_errnof(error, k, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid->pid);
+        }
+        if (k > 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid->pid);
+
+        return 0;
+}
+
+void unit_log_success(Unit *u) {
+        int level;
+
+        assert(u);
+
+        /* Let's show message "Deactivated successfully" in debug mode (when manager is user) rather than
+         * in info mode. This message has low information value for regular users and it might be a bit
+         * overwhelming on a system with a lot of devices. */
+        level = MANAGER_IS_USER(u->manager) ? LOG_DEBUG : LOG_INFO;
+
+        log_unit_struct(u, level,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_SUCCESS_STR,
+                        LOG_UNIT_INVOCATION_ID(u),
+                        LOG_UNIT_MESSAGE(u, "Deactivated successfully."));
+}
+
+void unit_log_failure(Unit *u, const char *result) {
+        assert(u);
+        assert(result);
+
+        /* Emits a structured warning that the unit failed; 'result' appears both in the human-readable
+         * message and in the UNIT_RESULT= field. */
+
+        log_unit_struct(u,
+                        LOG_WARNING,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILURE_RESULT_STR,
+                        LOG_UNIT_INVOCATION_ID(u),
+                        LOG_UNIT_MESSAGE(u, "Failed with result '%s'.", result),
+                        "UNIT_RESULT=%s", result);
+}
+
+void unit_log_skip(Unit *u, const char *result) {
+        assert(u);
+        assert(result);
+
+        /* Emits a structured info message that the unit was skipped; 'result' appears both in the
+         * human-readable message and in the UNIT_RESULT= field. */
+
+        log_unit_struct(u,
+                        LOG_INFO,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_SKIPPED_STR,
+                        LOG_UNIT_INVOCATION_ID(u),
+                        LOG_UNIT_MESSAGE(u, "Skipped due to '%s'.", result),
+                        "UNIT_RESULT=%s", result);
+}
+
+void unit_log_process_exit(
+                Unit *u,
+                const char *kind,
+                const char *command,
+                bool success,
+                int code,
+                int status) {
+
+        int level;
+
+        assert(u);
+        assert(kind);
+
+        /* Logs a structured message about the exit of one of the unit's processes.
+         *
+         * If this is a successful exit, let's log about the exit code on DEBUG level. If this is a failure
+         * and the process exited on its own via exit(), then let's make this a NOTICE, under the assumption
+         * that the service already logged the reason at a higher log level on its own. Otherwise, make it a
+         * WARNING. */
+        level = success ? LOG_DEBUG :
+                code == CLD_EXITED ? LOG_NOTICE :
+                LOG_WARNING;
+
+        log_unit_struct(u, level,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_PROCESS_EXIT_STR,
+                        LOG_UNIT_MESSAGE(u, "%s exited, code=%s, status=%i/%s%s",
+                                         kind,
+                                         sigchld_code_to_string(code), status,
+                                         strna(code == CLD_EXITED
+                                               ? exit_status_to_string(status, EXIT_STATUS_FULL)
+                                               : signal_to_string(status)),
+                                         success ? " (success)" : ""),
+                        "EXIT_CODE=%s", sigchld_code_to_string(code),
+                        "EXIT_STATUS=%i", status,
+                        "COMMAND=%s", strna(command),
+                        LOG_UNIT_INVOCATION_ID(u));
+}
+
+int unit_exit_status(Unit *u) {
+        assert(u);
+
+        /* Returns the exit status to propagate for the most recent cycle of this unit. Returns a value in
+         * the range 0…255 if there's something to propagate. EOPNOTSUPP if the concept does not apply to
+         * this unit type, ENODATA if no data is currently known (for example because the unit hasn't
+         * deactivated yet) and EBADE if the main service process has exited abnormally
+         * (signal/coredump). */
+
+        if (UNIT_VTABLE(u)->exit_status)
+                return UNIT_VTABLE(u)->exit_status(u);
+
+        return -EOPNOTSUPP;
+}
+
+int unit_failure_action_exit_status(Unit *u) {
+        int status;
+
+        assert(u);
+
+        /* Returns the exit status to propagate on failure, or an error if there's nothing to propagate.
+         * An explicitly configured status (>= 0) takes precedence over the unit's own exit status. */
+
+        if (u->failure_action_exit_status >= 0)
+                return u->failure_action_exit_status;
+
+        status = unit_exit_status(u);
+
+        /* Exited, but not cleanly (i.e. by signal or such): map to 255 */
+        return status == -EBADE ? 255 : status;
+}
+
+int unit_success_action_exit_status(Unit *u) {
+        int status;
+
+        assert(u);
+
+        /* Returns the exit status to propagate on success, or an error if there's nothing to propagate.
+         * An explicitly configured status (>= 0) takes precedence over the unit's own exit status. */
+
+        if (u->success_action_exit_status >= 0)
+                return u->success_action_exit_status;
+
+        status = unit_exit_status(u);
+
+        /* Exited, but not cleanly (i.e. by signal or such): map to 255 */
+        return status == -EBADE ? 255 : status;
+}
+
+int unit_test_trigger_loaded(Unit *u) {
+        Unit *target;
+
+        /* Tests whether the unit to trigger is loaded. Returns 0 if so; otherwise logs an error and
+         * returns -ENOENT, both when there is no unit to trigger and when it is not loaded. */
+
+        target = UNIT_TRIGGER(u);
+
+        if (!target)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+                                            "Refusing to start, no unit to trigger.");
+
+        if (target->load_state == UNIT_LOADED)
+                return 0;
+
+        return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+                                    "Refusing to start, unit %s to trigger not loaded.", target->id);
+}
+
+void unit_destroy_runtime_data(Unit *u, const ExecContext *context) {
+        assert(u);
+        assert(context);
+
+        /* Destroys the unit's runtime data: the runtime directory (only if it shall not be preserved at
+         * all), the credentials and the mount namespace directory. */
+
+        /* EXEC_PRESERVE_RESTART is handled via unit_release_resources()! */
+        if (context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO) {
+                exec_context_destroy_runtime_directory(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+        }
+
+        exec_context_destroy_credentials(u);
+        exec_context_destroy_mount_ns_dir(u);
+}
+
+int unit_clean(Unit *u, ExecCleanMask mask) {
+        assert(u);
+
+        /* Asks the unit type to clean up the resources selected by 'mask'.
+         *
+         * Special return values:
+         *
+         *   -EOPNOTSUPP → cleaning not supported for this unit type
+         *   -EUNATCH    → cleaning not defined for this resource type
+         *   -EBUSY      → unit currently can't be cleaned since it's running or not properly loaded, or has
+         *                 a job queued or similar
+         */
+
+        if (!UNIT_VTABLE(u)->clean)
+                return -EOPNOTSUPP;
+
+        if (mask == 0)
+                return -EUNATCH;
+
+        /* Only loaded, job-less and inactive units may be cleaned */
+        if (u->load_state != UNIT_LOADED ||
+            u->job ||
+            unit_active_state(u) != UNIT_INACTIVE)
+                return -EBUSY;
+
+        return UNIT_VTABLE(u)->clean(u, mask);
+}
+
+int unit_can_clean(Unit *u, ExecCleanMask *ret) {
+        assert(u);
+
+        /* Determines which resources of this unit can be cleaned. For unit types without a clean()
+         * method, and for units that are not loaded, an empty mask is returned in 'ret'. */
+
+        if (UNIT_VTABLE(u)->clean && u->load_state == UNIT_LOADED) {
+                /* When the clean() method is set, can_clean() really should be set too */
+                assert(UNIT_VTABLE(u)->can_clean);
+
+                return UNIT_VTABLE(u)->can_clean(u, ret);
+        }
+
+        *ret = 0;
+        return 0;
+}
+
+bool unit_can_start_refuse_manual(Unit *u) {
+        /* True if the unit can be started and manual start is not refused */
+        if (!unit_can_start(u))
+                return false;
+
+        return !u->refuse_manual_start;
+}
+
+bool unit_can_stop_refuse_manual(Unit *u) {
+        /* True if the unit can be stopped and manual stop is not refused */
+        if (!unit_can_stop(u))
+                return false;
+
+        return !u->refuse_manual_stop;
+}
+
+bool unit_can_isolate_refuse_manual(Unit *u) {
+        /* True if the unit can be isolated and manual start is not refused. Note that this checks
+         * refuse_manual_start; there is no separate flag for isolation here. */
+        if (!unit_can_isolate(u))
+                return false;
+
+        return !u->refuse_manual_start;
+}
+
+bool unit_can_freeze(Unit *u) {
+        assert(u);
+
+        /* If the unit type has no can_freeze() method, fall back to checking whether it implements
+         * freeze() at all. */
+        if (!UNIT_VTABLE(u)->can_freeze)
+                return !!UNIT_VTABLE(u)->freeze;
+
+        return UNIT_VTABLE(u)->can_freeze(u);
+}
+
+void unit_frozen(Unit *u) {
+        assert(u);
+        /* Marks the unit as frozen, then lets the bus layer send its pending freezer message, if any. */
+        u->freezer_state = FREEZER_FROZEN;
+
+        bus_unit_send_pending_freezer_message(u, false);
+}
+
+void unit_thawed(Unit *u) {
+        assert(u);
+        /* Marks the unit as running again, then lets the bus layer send its pending freezer message, if any. */
+        u->freezer_state = FREEZER_RUNNING;
+
+        bus_unit_send_pending_freezer_message(u, false);
+}
+
+static int unit_freezer_action(Unit *u, FreezerAction action) {
+        UnitActiveState s;
+        int (*method)(Unit*);
+        int r;
+
+        assert(u);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+        /* Common backend of unit_freeze() and unit_thaw(): validates the unit, then calls the type's method. */
+        method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw;
+        if (!method || !cg_freezer_supported())
+                return -EOPNOTSUPP; /* type lacks the method, or cg_freezer_supported() said no */
+
+        if (u->job)
+                return -EBUSY;
+
+        if (u->load_state != UNIT_LOADED)
+                return -EHOSTDOWN;
+
+        s = unit_active_state(u);
+        if (s != UNIT_ACTIVE)
+                return -EHOSTDOWN; /* only loaded units in UNIT_ACTIVE state get past this point */
+        /* A freeze is refused while any transition is in flight, a thaw only while a thaw is in flight. */
+        if ((IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING) && action == FREEZER_FREEZE) ||
+            (u->freezer_state == FREEZER_THAWING && action == FREEZER_THAW))
+                return -EALREADY;
+
+        r = method(u);
+        if (r <= 0)
+                return r; /* error, or 0 from the method: passed through unchanged */
+        /* After a positive result the method is expected to have put the unit into a transitional state. */
+        assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING));
+
+        return 1;
+}
+
+int unit_freeze(Unit *u) { /* see unit_freezer_action() for the checks done and the return values */
+        return unit_freezer_action(u, FREEZER_FREEZE);
+}
+
+int unit_thaw(Unit *u) { /* see unit_freezer_action() for the checks done and the return values */
+        return unit_freezer_action(u, FREEZER_THAW);
+}
+
+/* Shared by service and scope units: thin wrappers handing freeze/thaw to the low-level cgroup freezer operation */
+int unit_freeze_vtable_common(Unit *u) {
+        return unit_cgroup_freezer_action(u, FREEZER_FREEZE);
+}
+
+int unit_thaw_vtable_common(Unit *u) {
+        return unit_cgroup_freezer_action(u, FREEZER_THAW);
+}
+
+Condition *unit_find_failed_condition(Unit *u) {
+        Condition *first_failed_trigger = NULL;
+        bool any_trigger_succeeded = false;
+
+        /* Picks the condition to blame for a failed check; NULL if the check passed or no culprit is found. */
+        if (u->condition_result)
+                return NULL;
+        LIST_FOREACH(conditions, c, u->conditions) {
+                if (!c->trigger && c->result != CONDITION_SUCCEEDED)
+                        return c; /* a non-trigger condition that did not succeed is blamed right away */
+                if (c->trigger && c->result == CONDITION_SUCCEEDED)
+                        any_trigger_succeeded = true;
+                else if (c->trigger && !first_failed_trigger)
+                        first_failed_trigger = c;
+        }
+
+        return any_trigger_succeeded ? NULL : first_failed_trigger; /* triggers fail only if none succeeded */
+}
+
+static const char* const collect_mode_table[_COLLECT_MODE_MAX] = { /* string form of each CollectMode value */
+        [COLLECT_INACTIVE] = "inactive",
+        [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",
+};
+/* NOTE(review): presumably expands to collect_mode_to_string()/collect_mode_from_string() over the table above */
+DEFINE_STRING_TABLE_LOOKUP(collect_mode, CollectMode);
+
+Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other) {
+        Unit *candidate;
+
+        assert(u);
+
+        /* Looks through the dependencies of 'u' that carry the given atom. With 'other' set, only that
+         * very unit counts as a match; with 'other' == NULL the first dependency encountered does.
+         * Returns the matching unit, or NULL if there is none. */
+        UNIT_FOREACH_DEPENDENCY(candidate, u, atom) {
+                if (other && candidate != other)
+                        continue; /* looking for one specific unit, and this is not it */
+                return candidate;
+        }
+        return NULL;
+}
+
+int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array) {
+        _cleanup_free_ Unit **snapshot = NULL;
+        size_t n_units = 0;
+        Unit *dep;
+
+        assert(u);
+        assert(ret_array);
+
+        /* Copies all dependencies of 'u' matching the atom into a newly allocated array. Handy when the
+         * dependencies get modified while being iterated: the array is a snapshot that stays readable
+         * while the dependency table keeps changing. Returns the number of entries (with *ret_array left
+         * NULL when there are none), or -ENOMEM. On success the array is handed over to the caller. */
+
+        UNIT_FOREACH_DEPENDENCY(dep, u, atom) {
+                if (!GREEDY_REALLOC(snapshot, n_units + 1))
+                        return -ENOMEM;
+                snapshot[n_units] = dep;
+                n_units++;
+        }
+
+        *ret_array = TAKE_PTR(snapshot);
+        assert(n_units <= INT_MAX);
+        return (int) n_units;
+}
+
+int unit_get_transitive_dependency_set(Unit *u, UnitDependencyAtom atom, Set **ret) {
+        _cleanup_set_free_ Set *seen = NULL, *todo = NULL;
+        Unit *dep;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        /* Similar to unit_get_dependency_array(), but follows the same atom transitively: each unit found
+         * is searched for matching dependencies itself. 'seen' becomes the result, 'todo' is the backlog. */
+        do {
+                UNIT_FOREACH_DEPENDENCY(dep, u, atom) {
+                        r = set_ensure_put(&seen, NULL, dep);
+                        if (r < 0)
+                                return r;
+                        if (r > 0) { /* r == 0 is taken to mean "was in the set already": don't queue twice */
+                                r = set_ensure_put(&todo, NULL, dep);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        } while ((u = set_steal_first(todo)));
+
+        *ret = TAKE_PTR(seen);
+        return 0;
+}
+
+int unit_arm_timer(
+                Unit *u,
+                sd_event_source **source,
+                bool relative,
+                usec_t usec,
+                sd_event_time_handler_t handler) {
+
+        int r;
+
+        assert(u);
+        assert(source);
+        assert(handler);
+
+        /* (Re)arms the timer in *source, creating it on first use. USEC_INFINITY means: no timeout, disarm. */
+        if (*source) {
+                if (usec == USEC_INFINITY)
+                        return sd_event_source_set_enabled(*source, SD_EVENT_OFF);
+
+                if (relative)
+                        r = sd_event_source_set_time_relative(*source, usec);
+                else
+                        r = sd_event_source_set_time(*source, usec);
+                if (r < 0)
+                        return r;
+
+                return sd_event_source_set_enabled(*source, SD_EVENT_ONESHOT);
+        }
+
+        /* No event source yet: with nothing to wait for there is no need to allocate one. */
+        if (usec == USEC_INFINITY)
+                return 0;
+
+        r = (relative ? sd_event_add_time_relative : sd_event_add_time)(
+                        u->manager->event, source, CLOCK_MONOTONIC, usec, 0, handler, u);
+        if (r < 0)
+                return r;
+
+        /* Best effort only: a failure to label the new source is deliberately ignored. */
+        const char *d = strjoina(unit_type_to_string(u->type), "-timer");
+        (void) sd_event_source_set_description(*source, d);
+        return 0;
+}
+
+static int unit_get_nice(Unit *u) {
+        const ExecContext *exec = unit_get_exec_context(u);
+
+        /* Units without an exec context are treated as having nice level 0. */
+        return exec ? exec->nice : 0;
+}
+
+static uint64_t unit_get_cpu_weight(Unit *u) {
+        CGroupContext *cgroup = unit_get_cgroup_context(u);
+
+        /* Units without a cgroup context fall back to CGROUP_WEIGHT_DEFAULT. */
+        return cgroup ? cgroup_context_cpu_weight(cgroup, manager_state(u->manager)) : CGROUP_WEIGHT_DEFAULT;
+}
+
+int unit_compare_priority(Unit *a, Unit *b) {
+        int r;
+
+        r = -CMP(a->type, b->type); /* inverted: the larger unit type value compares as smaller */
+        if (r != 0)
+                return r;
+
+        r = -CMP(unit_get_cpu_weight(a), unit_get_cpu_weight(b)); /* inverted as well: larger weight is smaller */
+        if (r != 0)
+                return r;
+
+        r = CMP(unit_get_nice(a), unit_get_nice(b)); /* natural order: lower nice value compares as smaller */
+        if (r != 0)
+                return r;
+
+        return strcmp(a->id, b->id); /* final tie-breaker: plain byte-wise comparison of the ids */
+}
+
+const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX] = { /* indexed by UnitType */
+        [UNIT_PATH] = &activation_details_path_vtable,
+        [UNIT_TIMER] = &activation_details_timer_vtable, /* every unit type not listed here stays NULL */
+};
+
+ActivationDetails *activation_details_new(Unit *trigger_unit) {
+        _cleanup_free_ ActivationDetails *details = NULL;
+
+        assert(trigger_unit);
+        assert(trigger_unit->type != _UNIT_TYPE_INVALID);
+        assert(trigger_unit->id);
+        assert(activation_details_vtable[trigger_unit->type]); /* i.e. path or timer units only */
+        /* Returns a new object holding a single reference, or NULL if memory is short. */
+        details = malloc0(activation_details_vtable[trigger_unit->type]->object_size);
+        if (!details)
+                return NULL;
+
+        *details = (ActivationDetails) {
+                .n_ref = 1,
+                .trigger_unit_type = trigger_unit->type,
+        };
+        details->trigger_unit_name = strdup(trigger_unit->id);
+        if (!details->trigger_unit_name)
+                return NULL; /* 'details' itself is released via _cleanup_free_ */
+
+        if (ACTIVATION_DETAILS_VTABLE(details)->init)
+                ACTIVATION_DETAILS_VTABLE(details)->init(details, trigger_unit);
+
+        return TAKE_PTR(details);
+}
+
+static ActivationDetails *activation_details_free(ActivationDetails *details) {
+        if (!details)
+                return NULL;
+        /* Let the (optional) type-specific done() hook release its members before the common ones go. */
+        if (ACTIVATION_DETAILS_VTABLE(details)->done)
+                ACTIVATION_DETAILS_VTABLE(details)->done(details);
+
+        free(details->trigger_unit_name);
+
+        return mfree(details);
+}
+
+void activation_details_serialize(ActivationDetails *details, FILE *f) {
+        if (!details || details->trigger_unit_type == _UNIT_TYPE_INVALID)
+                return;
+        /* The unit type goes first: activation_details_deserialize() allocates the object only on that key. */
+        (void) serialize_item(f, "activation-details-unit-type", unit_type_to_string(details->trigger_unit_type));
+        if (details->trigger_unit_name)
+                (void) serialize_item(f, "activation-details-unit-name", details->trigger_unit_name);
+        if (ACTIVATION_DETAILS_VTABLE(details)->serialize)
+                ACTIVATION_DETAILS_VTABLE(details)->serialize(details, f);
+}
+
+int activation_details_deserialize(const char *key, const char *value, ActivationDetails **details) {
+        int r;
+
+        assert(key);
+        assert(value);
+        assert(details);
+        /* Consumes one serialized key/value pair. Returns 0 when handled here; negative values signal failure. */
+        if (!*details) {
+                UnitType t;
+                /* No object yet: the very first key has to be the unit type, which selects vtable and size. */
+                if (!streq(key, "activation-details-unit-type"))
+                        return -EINVAL;
+
+                t = unit_type_from_string(value);
+                if (t < 0)
+                        return t;
+
+                /* The activation details vtable has defined ops only for path and timer units */
+                if (!activation_details_vtable[t])
+                        return -EINVAL;
+
+                *details = malloc0(activation_details_vtable[t]->object_size);
+                if (!*details)
+                        return -ENOMEM;
+
+                **details = (ActivationDetails) {
+                        .n_ref = 1,
+                        .trigger_unit_type = t,
+                };
+
+                return 0;
+        }
+        /* From here on the object exists: handle the common key first, then defer to the type. */
+        if (streq(key, "activation-details-unit-name")) {
+                r = free_and_strdup(&(*details)->trigger_unit_name, value);
+                if (r < 0)
+                        return r;
+
+                return 0;
+        }
+        /* Not a key the common code knows: offer it to the type-specific hook, if there is one. */
+        if (ACTIVATION_DETAILS_VTABLE(*details)->deserialize)
+                return ACTIVATION_DETAILS_VTABLE(*details)->deserialize(key, value, details);
+
+        return -EINVAL;
+}
+
+int activation_details_append_env(ActivationDetails *details, char ***strv) {
+        int r = 0;
+
+        assert(strv);
+        /* Appends TRIGGER_UNIT=<name> plus any type-specific variables to *strv. NULL 'details' adds nothing. */
+        if (!details)
+                return 0;
+
+        if (!isempty(details->trigger_unit_name)) {
+                char *s = strjoin("TRIGGER_UNIT=", details->trigger_unit_name);
+                if (!s)
+                        return -ENOMEM;
+
+                r = strv_consume(strv, TAKE_PTR(s));
+                if (r < 0)
+                        return r;
+        }
+
+        if (ACTIVATION_DETAILS_VTABLE(details)->append_env) {
+                r = ACTIVATION_DETAILS_VTABLE(details)->append_env(details, strv);
+                if (r < 0)
+                        return r;
+        }
+        /* NOTE(review): the sum assumes strv_consume() left r == 0 on success when no hook ran — confirm. */
+        return r + !isempty(details->trigger_unit_name); /* Return the number of variables added to the env block */
+}
+
+int activation_details_append_pair(ActivationDetails *details, char ***strv) {
+        int r = 0;
+
+        assert(strv);
+        /* Appends "trigger_unit", <name> plus any type-specific key/value pairs to *strv. NULL adds nothing. */
+        if (!details)
+                return 0;
+
+        if (!isempty(details->trigger_unit_name)) {
+                r = strv_extend(strv, "trigger_unit");
+                if (r < 0)
+                        return r;
+
+                r = strv_extend(strv, details->trigger_unit_name);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ACTIVATION_DETAILS_VTABLE(details)->append_pair) {
+                r = ACTIVATION_DETAILS_VTABLE(details)->append_pair(details, strv);
+                if (r < 0)
+                        return r;
+        }
+        /* NOTE(review): the sum assumes strv_extend() left r == 0 on success when no hook ran — confirm. */
+        return r + !isempty(details->trigger_unit_name); /* Return the number of pairs added to the strv */
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(ActivationDetails, activation_details, activation_details_free); /* NOTE(review): presumably defines activation_details_ref()/_unref() */
diff --git a/src/core/unit.h b/src/core/unit.h
new file mode 100644
index 0000000..60bc2e3
--- /dev/null
+++ b/src/core/unit.h
@@ -0,0 +1,1249 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-id128.h"
+
+#include "bpf-program.h"
+#include "cgroup.h"
+#include "condition.h"
+#include "emergency-action.h"
+#include "install.h"
+#include "list.h"
+#include "pidref.h"
+#include "set.h"
+#include "show-status.h"
+#include "unit-file.h"
+
+typedef struct UnitRef UnitRef;
+
+typedef enum KillOperation {
+        KILL_TERMINATE,
+        KILL_TERMINATE_AND_LOG,
+        KILL_RESTART,
+        KILL_KILL,
+        KILL_WATCHDOG,
+        _KILL_OPERATION_MAX,               /* number of operations listed above; not an operation itself */
+        _KILL_OPERATION_INVALID = -EINVAL, /* negative marker meaning "no valid operation" */
+} KillOperation;
+
+typedef enum CollectMode {
+        COLLECT_INACTIVE,                /* "inactive" in string form */
+        COLLECT_INACTIVE_OR_FAILED,      /* "inactive-or-failed" in string form */
+        _COLLECT_MODE_MAX,               /* number of modes listed above; sizes collect_mode_table[] in unit.c */
+        _COLLECT_MODE_INVALID = -EINVAL, /* negative marker meaning "no valid mode" */
+} CollectMode;
+
+static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
+        return t == UNIT_ACTIVE || t == UNIT_RELOADING;
+}
+
+static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) { /* UNIT_RELOADING counts here as well */
+        return t == UNIT_ACTIVE || t == UNIT_ACTIVATING || t == UNIT_RELOADING;
+}
+
+static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) { /* UNIT_FAILED counts here as well */
+        return t == UNIT_INACTIVE || t == UNIT_FAILED || t == UNIT_DEACTIVATING;
+}
+
+static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) {
+        return t == UNIT_INACTIVE || t == UNIT_FAILED;
+}
+
+static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) { /* any valid load state except stub and merged */
+        return t >= 0 && t < _UNIT_LOAD_STATE_MAX && !IN_SET(t, UNIT_STUB, UNIT_MERGED);
+}
+
+/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We
+ * use this so that we can selectively flush out parts of dependencies again. Note that the same dependency might be
+ * created as a result of multiple "reasons", hence the bitmask. */
+typedef enum UnitDependencyMask {
+        /* Configured directly by the unit file, .wants/.requires symlink or drop-in, or as an immediate result of a
+         * non-dependency option configured that way.  */
+        UNIT_DEPENDENCY_FILE               = 1 << 0,
+
+        /* As unconditional implicit dependency (not affected by unit configuration — except by the unit name and
+         * type) */
+        UNIT_DEPENDENCY_IMPLICIT           = 1 << 1,
+
+        /* A dependency effected by DefaultDependencies=yes. Note that dependencies marked this way are conceptually
+         * just a subset of UNIT_DEPENDENCY_FILE, as DefaultDependencies= is itself a unit file setting that can only
+         * be set in unit files. We make this two separate bits only to help debugging how dependencies came to be. */
+        UNIT_DEPENDENCY_DEFAULT            = 1 << 2,
+
+        /* A dependency created from udev rules */
+        UNIT_DEPENDENCY_UDEV               = 1 << 3,
+
+        /* A dependency created because of some unit's RequiresMountsFor= setting */
+        UNIT_DEPENDENCY_PATH               = 1 << 4,
+
+        /* A dependency initially configured from the mount unit file however the dependency will be updated
+         * from /proc/self/mountinfo as soon as the kernel will make the entry for that mount available in
+         * the /proc file */
+        UNIT_DEPENDENCY_MOUNT_FILE         = 1 << 5,
+
+        /* A dependency created or updated because of data read from /proc/self/mountinfo */
+        UNIT_DEPENDENCY_MOUNTINFO          = 1 << 6,
+
+        /* A dependency created because of data read from /proc/swaps and no other configuration source */
+        UNIT_DEPENDENCY_PROC_SWAP          = 1 << 7,
+
+        /* A dependency for units in slices assigned by directly setting Slice= */
+        UNIT_DEPENDENCY_SLICE_PROPERTY     = 1 << 8,
+
+        _UNIT_DEPENDENCY_MASK_FULL         = (1 << 9) - 1, /* all nine bits above; adjust when adding a bit */
+} UnitDependencyMask;
+
+/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can
+ * be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin
+ * and the destination of a dependency might have created it. */
+typedef union UnitDependencyInfo {
+        void *data; /* the whole value in the form it is stored as hashmap value */
+        struct {
+                UnitDependencyMask origin_mask:16;      /* reasons contributed by the origin of the dependency */
+                UnitDependencyMask destination_mask:16; /* reasons contributed by its destination */
+        } _packed_;
+} UnitDependencyInfo;
+
+/* Store information about why a unit was activated.
+ * We start with trigger units (.path/.timer), eventually it will be expanded to include more metadata. */
+typedef struct ActivationDetails {
+        unsigned n_ref;             /* reference counter; set to 1 on creation and on deserialization */
+        UnitType trigger_unit_type; /* selects the entry of activation_details_vtable[] to use */
+        char *trigger_unit_name;    /* heap copy of the trigger unit's id; freed together with the object */
+} ActivationDetails;
+
+/* For casting an activation event into the various unit-specific types; yields NULL for NULL or a type mismatch */
+#define DEFINE_ACTIVATION_DETAILS_CAST(UPPERCASE, MixedCase, UNIT_TYPE)         \
+        static inline MixedCase* UPPERCASE(ActivationDetails *a) {              \
+                if (_unlikely_(!a || a->trigger_unit_type != UNIT_##UNIT_TYPE)) \
+                        return NULL;                                            \
+                                                                                \
+                return (MixedCase*) a;                                          \
+        }
+
+/* For getting from a type-specific details object back to the generic ActivationDetails it holds as 'meta' */
+#define ACTIVATION_DETAILS(u)                                         \
+        ({                                                            \
+                typeof(u) _u_ = (u);                                  \
+                ActivationDetails *_w_ = _u_ ? &(_u_)->meta : NULL;   \
+                _w_;                                                  \
+        })
+
+ActivationDetails *activation_details_new(Unit *trigger_unit); /* NULL on allocation failure */
+ActivationDetails *activation_details_ref(ActivationDetails *p);
+ActivationDetails *activation_details_unref(ActivationDetails *p);
+void activation_details_serialize(ActivationDetails *p, FILE *f); /* no-op for NULL or an invalid unit type */
+int activation_details_deserialize(const char *key, const char *value, ActivationDetails **info); /* one pair per call */
+int activation_details_append_env(ActivationDetails *info, char ***strv); /* >= 0: number of variables added */
+int activation_details_append_pair(ActivationDetails *info, char ***strv); /* >= 0: number of pairs added */
+DEFINE_TRIVIAL_CLEANUP_FUNC(ActivationDetails*, activation_details_unref);
+
+typedef struct ActivationDetailsVTable {
+        /* How much memory does an object of this activation type need */
+        size_t object_size; /* must be >= sizeof(ActivationDetails), which is written at the start of the object */
+        /* All hooks below are optional: the generic code in unit.c checks each for NULL before calling it. */
+        /* This should reset all type-specific variables. This should not allocate memory, and is called
+         * with zero-initialized data. It should hence only initialize variables that need to be set != 0. */
+        void (*init)(ActivationDetails *info, Unit *trigger_unit);
+
+        /* This should free all type-specific variables. It should be idempotent. */
+        void (*done)(ActivationDetails *info);
+
+        /* This should serialize all type-specific variables. */
+        void (*serialize)(ActivationDetails *info, FILE *f);
+
+        /* This should deserialize all type-specific variables, one at a time. */
+        int (*deserialize)(const char *key, const char *value, ActivationDetails **info);
+
+        /* This should format the type-specific variables for the env block of the spawned service,
+         * and return the number of added items. */
+        int (*append_env)(ActivationDetails *info, char ***strv);
+
+        /* This should append type-specific variables as key/value pairs for the D-Bus property of the job,
+         * and return the number of added pairs. */
+        int (*append_pair)(ActivationDetails *info, char ***strv);
+} ActivationDetailsVTable;
+
+extern const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX];
+
+static inline const ActivationDetailsVTable* ACTIVATION_DETAILS_VTABLE(const ActivationDetails *a) {
+        assert(a);
+        /* UnitType can hold negative (invalid) values, so the upper bound alone does not protect the lookup. */
+        assert(a->trigger_unit_type >= 0 && a->trigger_unit_type < _UNIT_TYPE_MAX);
+        return activation_details_vtable[a->trigger_unit_type]; /* NULL for types without activation details */
+}
+
+/* Newer LLVM versions don't like implicit casts from large pointer types to smaller enums, hence let's add
+ * explicit type-safe helpers for that. */
+static inline UnitDependency UNIT_DEPENDENCY_FROM_PTR(const void *p) { /* counterpart of UNIT_DEPENDENCY_TO_PTR() */
+        return PTR_TO_INT(p);
+}
+
+static inline void* UNIT_DEPENDENCY_TO_PTR(UnitDependency d) { /* counterpart of UNIT_DEPENDENCY_FROM_PTR() */
+        return INT_TO_PTR(d);
+}
+
+#include "job.h"
+
+struct UnitRef {
+        /* Keeps track of references to a unit. This is useful so
+         * that we can merge two units if necessary and correct all
+         * references to them */
+
+        Unit *source, *target; /* NOTE(review): presumably 'source' holds the reference, 'target' is referenced */
+        LIST_FIELDS(UnitRef, refs_by_target);
+};
+
+typedef struct Unit {
+        Manager *manager;
+
+        UnitType type;
+        UnitLoadState load_state;
+        Unit *merged_into;
+
+        char *id;   /* The one special name that we use for identification */
+        char *instance;
+
+        Set *aliases; /* All the other names. */
+
+        /* For each dependency type we can look up another Hashmap with this, whose key is a Unit* object,
+         * and whose value encodes why the dependency exists, using the UnitDependencyInfo type. i.e. a
+         * Hashmap(UnitDependency → Hashmap(Unit* → UnitDependencyInfo)) */
+        Hashmap *dependencies;
+
+        /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the
+         * UnitDependencyInfo type */
+        Hashmap *requires_mounts_for;
+
+        char *description;
+        char **documentation;
+
+        /* The SELinux context used for checking access to this unit read off the unit file at load time (do
+         * not confuse with the selinux_context field in ExecContext which is the SELinux context we'll set
+         * for processes) */
+        char *access_selinux_context;
+
+        char *fragment_path; /* if loaded from a config file this is the primary path to it */
+        char *source_path; /* if converted, the source file */
+        char **dropin_paths;
+
+        usec_t fragment_not_found_timestamp_hash;
+        usec_t fragment_mtime;
+        usec_t source_mtime;
+        usec_t dropin_mtime;
+
+        /* If this is a transient unit we are currently writing, this is where we are writing it to */
+        FILE *transient_file;
+
+        /* Freezer state */
+        sd_bus_message *pending_freezer_invocation;
+        FreezerState freezer_state;
+
+        /* Job timeout and action to take */
+        EmergencyAction job_timeout_action;
+        usec_t job_timeout;
+        usec_t job_running_timeout;
+        char *job_timeout_reboot_arg;
+
+        /* If there is something to do with this unit, then this is the installed job for it */
+        Job *job;
+
+        /* JOB_NOP jobs are special and can be installed without disturbing the real job. */
+        Job *nop_job;
+
+        /* The slot used for watching NameOwnerChanged signals */
+        sd_bus_slot *match_bus_slot;
+        sd_bus_slot *get_name_owner_slot;
+
+        /* References to this unit from clients */
+        sd_bus_track *bus_track;
+        char **deserialized_refs;
+
+        /* References to this */
+        LIST_HEAD(UnitRef, refs_by_target);
+
+        /* Conditions to check */
+        LIST_HEAD(Condition, conditions);
+        LIST_HEAD(Condition, asserts);
+
+        dual_timestamp condition_timestamp;
+        dual_timestamp assert_timestamp;
+
+        /* Updated whenever the low-level state changes */
+        dual_timestamp state_change_timestamp;
+
+        /* Updated whenever the (high-level) active state enters or leaves the active or inactive states */
+        dual_timestamp inactive_exit_timestamp;
+        dual_timestamp active_enter_timestamp;
+        dual_timestamp active_exit_timestamp;
+        dual_timestamp inactive_enter_timestamp;
+
+        /* Per type list */
+        LIST_FIELDS(Unit, units_by_type);
+
+        /* Load queue */
+        LIST_FIELDS(Unit, load_queue);
+
+        /* D-Bus queue */
+        LIST_FIELDS(Unit, dbus_queue);
+
+        /* Cleanup queue */
+        LIST_FIELDS(Unit, cleanup_queue);
+
+        /* GC queue */
+        LIST_FIELDS(Unit, gc_queue);
+
+        /* CGroup realize members queue */
+        LIST_FIELDS(Unit, cgroup_realize_queue);
+
+        /* cgroup empty queue */
+        LIST_FIELDS(Unit, cgroup_empty_queue);
+
+        /* cgroup OOM queue */
+        LIST_FIELDS(Unit, cgroup_oom_queue);
+
+        /* Target dependencies queue */
+        LIST_FIELDS(Unit, target_deps_queue);
+
+        /* Queue of units with StopWhenUnneeded= set that shall be checked for clean-up. */
+        LIST_FIELDS(Unit, stop_when_unneeded_queue);
+
+        /* Queue of units that have an Uphold= dependency from some other unit, and should be checked for starting */
+        LIST_FIELDS(Unit, start_when_upheld_queue);
+
+        /* Queue of units that have a BindsTo= dependency on some other unit, and should possibly be shut down */
+        LIST_FIELDS(Unit, stop_when_bound_queue);
+
+        /* Queue of units that should be checked if they can release resources now */
+        LIST_FIELDS(Unit, release_resources_queue);
+
+        /* PIDs we keep an eye on. Note that a unit might have many more, but these are the ones we care
+         * enough about to process SIGCHLD for */
+        Set *pids; /* → PidRef* */
+
+        /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event
+         * multiple times on the same unit. */
+        unsigned sigchldgen;
+        unsigned notifygen;
+
+        /* Used during GC sweeps */
+        unsigned gc_marker;
+
+        /* Error code when we didn't manage to load the unit (negative) */
+        int load_error;
+
+        /* Put a ratelimit on unit starting */
+        RateLimit start_ratelimit;
+        EmergencyAction start_limit_action;
+
+        /* The unit has been marked for reload, restart, etc. Stored as 1u << marker1 | 1u << marker2. */
+        unsigned markers;
+
+        /* What to do on failure or success */
+        EmergencyAction success_action, failure_action;
+        int success_action_exit_status, failure_action_exit_status;
+        char *reboot_arg;
+
+        /* Make sure we never enter endless loops with the StopWhenUnneeded=, BindsTo=, Uphold= logic */
+        RateLimit auto_start_stop_ratelimit;
+        sd_event_source *auto_start_stop_event_source;
+
+        /* Reference to a specific UID/GID */
+        uid_t ref_uid;
+        gid_t ref_gid;
+
+        /* Cached unit file state and preset */
+        UnitFileState unit_file_state;
+        PresetAction unit_file_preset;
+
+        /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */
+        nsec_t cpu_usage_base;
+        nsec_t cpu_usage_last; /* the most recently read value */
+
+        /* Most recently read value of memory accounting metrics */
+        uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1];
+
+        /* The current counter of OOM kills initiated by systemd-oomd */
+        uint64_t managed_oom_kill_last;
+
+        /* The current counter of the oom_kill field in the memory.events cgroup attribute */
+        uint64_t oom_kill_last;
+
+        /* Where the io.stat data was at the time the unit was started */
+        uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+        uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */
+
+        /* Counterparts in the cgroup filesystem */
+        char *cgroup_path;
+        uint64_t cgroup_id;
+        CGroupMask cgroup_realized_mask;           /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
+        CGroupMask cgroup_enabled_mask;            /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
+        CGroupMask cgroup_invalidated_mask;        /* A mask specifying controllers which shall be considered invalidated, and require re-realization */
+        CGroupMask cgroup_members_mask;            /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
+
+        /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
+        int cgroup_control_inotify_wd;
+        int cgroup_memory_inotify_wd;
+
+        /* Device Controller BPF program */
+        BPFProgram *bpf_device_control_installed;
+
+        /* IP BPF Firewalling/accounting */
+        int ip_accounting_ingress_map_fd;
+        int ip_accounting_egress_map_fd;
+        uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
+
+        int ipv4_allow_map_fd;
+        int ipv6_allow_map_fd;
+        int ipv4_deny_map_fd;
+        int ipv6_deny_map_fd;
+        BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+        BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
+
+        Set *ip_bpf_custom_ingress;
+        Set *ip_bpf_custom_ingress_installed;
+        Set *ip_bpf_custom_egress;
+        Set *ip_bpf_custom_egress_installed;
+
+        /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd,
+         * attached to unit cgroup by provided program fd and attach type. */
+        Hashmap *bpf_foreign_by_key;
+
+        FDSet *initial_socket_bind_link_fds;
+#if BPF_FRAMEWORK
+        /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and
+         * responsible for allowing or denying a unit to bind(2) to a socket
+         * address. */
+        struct bpf_link *ipv4_socket_bind_link;
+        struct bpf_link *ipv6_socket_bind_link;
+#endif
+
+        FDSet *initial_restric_ifaces_link_fds;
+#if BPF_FRAMEWORK
+        struct bpf_link *restrict_ifaces_ingress_bpf_link;
+        struct bpf_link *restrict_ifaces_egress_bpf_link;
+#endif
+
+        /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new
+         * ones which might have appeared. */
+        sd_event_source *rewatch_pids_event_source;
+
+        /* How to start OnSuccess=/OnFailure= units */
+        JobMode on_success_job_mode;
+        JobMode on_failure_job_mode;
+
+        /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */
+        ActivationDetails *activation_details;
+
+        /* Tweaking the GC logic */
+        CollectMode collect_mode;
+
+        /* The current invocation ID */
+        sd_id128_t invocation_id;
+        char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */
+
+        /* Garbage collect us if nobody wants or requires us anymore */
+        bool stop_when_unneeded;
+
+        /* Create default dependencies */
+        bool default_dependencies;
+
+        /* Configure so that the unit survives a system transition without stopping/starting. */
+        bool survive_final_kill_signal;
+
+        /* Refuse manual starting, allow starting only indirectly via dependency. */
+        bool refuse_manual_start;
+
+        /* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */
+        bool refuse_manual_stop;
+
+        /* Allow isolation requests */
+        bool allow_isolate;
+
+        /* Ignore this unit when isolating */
+        bool ignore_on_isolate;
+
+        /* Did the last condition check succeed? */
+        bool condition_result;
+        bool assert_result;
+
+        /* Is this a transient unit? */
+        bool transient;
+
+        /* Is this a unit that is always running and cannot be stopped? */
+        bool perpetual;
+
+        /* Booleans indicating membership of this unit in the various queues */
+        bool in_load_queue:1;
+        bool in_dbus_queue:1;
+        bool in_cleanup_queue:1;
+        bool in_gc_queue:1;
+        bool in_cgroup_realize_queue:1;
+        bool in_cgroup_empty_queue:1;
+        bool in_cgroup_oom_queue:1;
+        bool in_target_deps_queue:1;
+        bool in_stop_when_unneeded_queue:1;
+        bool in_start_when_upheld_queue:1;
+        bool in_stop_when_bound_queue:1;
+        bool in_release_resources_queue:1;
+
+        bool sent_dbus_new_signal:1;
+
+        bool job_running_timeout_set:1;
+
+        bool in_audit:1;
+        bool on_console:1;
+
+        bool cgroup_realized:1;
+        bool cgroup_members_mask_valid:1;
+
+        /* Reset cgroup accounting next time we fork something off */
+        bool reset_accounting:1;
+
+        bool start_limit_hit:1;
+
+        /* Did we already invoke unit_coldplug() for this unit? */
+        bool coldplugged:1;
+
+        /* For transient units: whether to add a bus track reference after creating the unit */
+        bool bus_track_add:1;
+
+        /* Remember which unit state files we created */
+        bool exported_invocation_id:1;
+        bool exported_log_level_max:1;
+        bool exported_log_extra_fields:1;
+        bool exported_log_ratelimit_interval:1;
+        bool exported_log_ratelimit_burst:1;
+
+        /* Whether we warned about clamping the CPU quota period */
+        bool warned_clamping_cpu_quota_period:1;
+
+        /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If
+         * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */
+        signed int last_section_private:2;
+} Unit;
+
+typedef struct UnitStatusMessageFormats {
+        const char *starting_stopping[2]; /* presumably [0] = starting, [1] = stopping - TODO confirm */
+        const char *finished_start_job[_JOB_RESULT_MAX]; /* presumably indexed by JobResult - TODO confirm */
+        const char *finished_stop_job[_JOB_RESULT_MAX]; /* same size as finished_start_job */
+        /* Optional callback: if set, it is called to provide a context-dependent format string. It may return
+         * NULL to fall back to finished_{start,stop}_job; if those are NULL too, fall back to generic. */
+        const char *(*finished_job)(Unit *u, JobType t, JobResult result);
+} UnitStatusMessageFormats;
+
+/* Flags used when writing drop-in files or transient unit files. Each value is a distinct bit, so they can be ORed. */
+typedef enum UnitWriteFlags {
+        /* Write a runtime unit file or drop-in (i.e. one below /run) */
+        UNIT_RUNTIME                = 1 << 0,
+
+        /* Write a persistent drop-in (i.e. one below /etc) */
+        UNIT_PERSISTENT             = 1 << 1,
+
+        /* Place this item in the per-unit-type private section, instead of [Unit] */
+        UNIT_PRIVATE                = 1 << 2,
+
+        /* Apply specifier escaping */
+        UNIT_ESCAPE_SPECIFIERS      = 1 << 3,
+
+        /* Escape elements of ExecStart= syntax, incl. prevention of variable expansion */
+        UNIT_ESCAPE_EXEC_SYNTAX_ENV = 1 << 4,
+
+        /* Escape elements of ExecStart=: syntax (no variable expansion) */
+        UNIT_ESCAPE_EXEC_SYNTAX     = 1 << 5,
+
+        /* Apply C escaping before writing */
+        UNIT_ESCAPE_C               = 1 << 6,
+} UnitWriteFlags;
+
+/* A write is a no-op (i.e. a check invocation only) when neither runtime nor persistent storage is requested */
+static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) {
+        return !(flags & UNIT_RUNTIME) && !(flags & UNIT_PERSISTENT);
+}
+
+#include "kill.h"
+
+typedef struct UnitVTable { /* Per-unit-type sizes, callbacks and capability flags; see unit_vtable[] */
+        /* How much memory does an object of this unit type need */
+        size_t object_size;
+
+        /* If greater than 0, the offset into the object where
+         * ExecContext is found, if the unit type has that */
+        size_t exec_context_offset;
+
+        /* If greater than 0, the offset into the object where
+         * CGroupContext is found, if the unit type has that */
+        size_t cgroup_context_offset;
+
+        /* If greater than 0, the offset into the object where
+         * KillContext is found, if the unit type has that */
+        size_t kill_context_offset;
+
+        /* If greater than 0, the offset into the object where the
+         * pointer to ExecSharedRuntime is found, if the unit type has
+         * that */
+        size_t exec_runtime_offset;
+
+        /* The name of the configuration file section with the private settings of this unit */
+        const char *private_section;
+
+        /* Config file sections this unit type understands, separated
+         * by NUL chars */
+        const char *sections;
+
+        /* This should reset all type-specific variables. This should
+         * not allocate memory, and is called with zero-initialized
+         * data. It should hence only initialize variables that need
+         * to be set != 0. */
+        void (*init)(Unit *u);
+
+        /* This should free all type-specific variables. It should be
+         * idempotent. */
+        void (*done)(Unit *u);
+
+        /* Actually load data from disk. This may fail, and should set
+         * load_state to UNIT_LOADED, UNIT_MERGED or leave it at
+         * UNIT_STUB if no configuration could be found. */
+        int (*load)(Unit *u);
+
+        /* During deserialization we only record the intended state to return to. With coldplug() we actually put the
+         * deserialized state in effect. This is where unit_notify() should be called to start things up. Note that
+         * this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the
+         * reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was
+         * in effect before the reload, i.e. units should not catch up with changes that happened during the reload.
+         * That's what catchup() below is for. */
+        int (*coldplug)(Unit *u);
+
+        /* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the
+         * reloading state. It's supposed to catch up with state changes due to external events we missed so far (for
+         * example because they took place while we were reloading/reexecing) */
+        void (*catchup)(Unit *u);
+
+        void (*dump)(Unit *u, FILE *f, const char *prefix);
+
+        int (*start)(Unit *u);
+        int (*stop)(Unit *u);
+        int (*reload)(Unit *u);
+
+        /* Clear out the various runtime/state/cache/logs/configuration data */
+        int (*clean)(Unit *u, ExecCleanMask m);
+
+        /* Freeze the unit */
+        int (*freeze)(Unit *u);
+        int (*thaw)(Unit *u);
+        bool (*can_freeze)(Unit *u);
+
+        /* Return which kind of data can be cleaned */
+        int (*can_clean)(Unit *u, ExecCleanMask *ret);
+
+        bool (*can_reload)(Unit *u);
+
+        /* Serialize state and file descriptors that should be carried over into the new
+         * instance after reexecution. */
+        int (*serialize)(Unit *u, FILE *f, FDSet *fds);
+
+        /* Restore one item from the serialization */
+        int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds);
+
+        /* Try to match up fds with what we need for this unit */
+        void (*distribute_fds)(Unit *u, FDSet *fds);
+
+        /* Boils down the more complex internal state of this unit to
+         * a simpler one that the engine can understand */
+        UnitActiveState (*active_state)(Unit *u);
+
+        /* Returns the substate specific to this unit type as
+         * string. This is purely informational so that we can give the
+         * user a more fine grained explanation in which actual state a
+         * unit is in. */
+        const char* (*sub_state_to_string)(Unit *u);
+
+        /* Additionally to UnitActiveState determine whether unit is to be restarted. */
+        bool (*will_restart)(Unit *u);
+
+        /* Return false when there is a reason to prevent this unit from being gc'ed
+         * even though nothing references it and it isn't active in any way. */
+        bool (*may_gc)(Unit *u);
+
+        /* Return true when the unit is not controlled by the manager (e.g. extrinsic mounts). */
+        bool (*is_extrinsic)(Unit *u);
+
+        /* When the unit is not running and no job for it queued we shall release its runtime resources */
+        void (*release_resources)(Unit *u);
+
+        /* Invoked on every child that died */
+        void (*sigchld_event)(Unit *u, pid_t pid, int code, int status);
+
+        /* Reset failed state if we are in failed state */
+        void (*reset_failed)(Unit *u);
+
+        /* Called whenever any of the cgroups this unit watches for ran empty */
+        void (*notify_cgroup_empty)(Unit *u);
+
+        /* Called whenever an OOM kill event on this unit was seen */
+        void (*notify_cgroup_oom)(Unit *u, bool managed_oom);
+
+        /* Called whenever a process of this unit sends us a message */
+        void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds);
+
+        /* Called whenever a name this Unit registered for comes or goes away. */
+        void (*bus_name_owner_change)(Unit *u, const char *new_owner);
+
+        /* Called for each property that is being set */
+        int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+
+        /* Called after at least one property got changed to apply the necessary change */
+        int (*bus_commit_properties)(Unit *u);
+
+        /* Return the unit this unit is following */
+        Unit *(*following)(Unit *u);
+
+        /* Return the set of units that are following each other */
+        int (*following_set)(Unit *u, Set **s);
+
+        /* Invoked each time a unit this unit is triggering changes
+         * state or gains/loses a job */
+        void (*trigger_notify)(Unit *u, Unit *trigger);
+
+        /* Called whenever CLOCK_REALTIME made a jump */
+        void (*time_change)(Unit *u);
+
+        /* Called whenever /etc/localtime was modified */
+        void (*timezone_change)(Unit *u);
+
+        /* Returns the next timeout of a unit */
+        int (*get_timeout)(Unit *u, usec_t *timeout);
+
+        /* Returns the start timeout of a unit */
+        usec_t (*get_timeout_start_usec)(Unit *u);
+
+        /* Returns the main PID if there is any defined, or NULL. */
+        PidRef* (*main_pid)(Unit *u);
+
+        /* Returns the control PID if there is any defined, or NULL. */
+        PidRef* (*control_pid)(Unit *u);
+
+        /* Returns true if the unit currently needs access to the console */
+        bool (*needs_console)(Unit *u);
+
+        /* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the
+         * exit code of the "main" process of the service or similar. */
+        int (*exit_status)(Unit *u);
+
+        /* Return a copy of the status string pointer. */
+        const char* (*status_text)(Unit *u);
+
+        /* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that
+         * unconditionally exist and are always active. The main reason to keep both enumeration functions separate is
+         * philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those
+         * discovered through regular enumeration should be put in place by catchup(), see below. */
+        void (*enumerate_perpetual)(Manager *m);
+
+        /* This is called for each unit type and should be used to enumerate units already existing in the system
+         * internally and load them. However, everything that is loaded here should still stay in inactive state. It is
+         * the job of the catchup() call above to put the units into the discovered state. */
+        void (*enumerate)(Manager *m);
+
+        /* Type specific cleanups. */
+        void (*shutdown)(Manager *m);
+
+        /* If this function is set and returns false all jobs for units
+         * of this type will immediately fail. */
+        bool (*supported)(void);
+
+        /* If this function is set, it's invoked first as part of starting a unit to allow start rate
+         * limiting checks to occur before we do anything else. */
+        int (*can_start)(Unit *u);
+
+        /* Returns > 0 if the whole subsystem is ratelimited, and new start operations should not be started
+         * for this unit type right now. */
+        int (*subsystem_ratelimited)(Manager *m);
+
+        /* The strings to print in status messages */
+        UnitStatusMessageFormats status_message_formats;
+
+        /* True if transient units of this type are OK */
+        bool can_transient;
+
+        /* True if cgroup delegation is permissible */
+        bool can_delegate;
+
+        /* True if the unit type triggers other units, i.e. can have a UNIT_TRIGGERS dependency */
+        bool can_trigger;
+
+        /* True if the unit type knows a failure state, and thus can be source of an OnFailure= dependency */
+        bool can_fail;
+
+        /* True if units of this type shall be startable only once and then never again */
+        bool once_only;
+
+        /* Do not serialize this unit when preparing for root switch */
+        bool exclude_from_switch_root_serialization;
+
+        /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */
+        bool gc_jobs;
+
+        /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroups */
+        bool can_set_managed_oom;
+
+        /* If true, we'll notify plymouth about this unit */
+        bool notify_plymouth;
+
+        /* The audit events to generate on start + stop (or 0 if none shall be generated) */
+        int audit_start_message_type;
+        int audit_stop_message_type;
+} UnitVTable;
+
+extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX]; /* one vtable pointer per unit type */
+
+static inline const UnitVTable* UNIT_VTABLE(const Unit *u) {
+        return unit_vtable[u->type]; /* no NULL or range check here: u must be a valid unit */
+}
+
+/* Generates UPPERCASE(): a checked cast from Unit* to MixedCase*, yielding NULL for NULL or a type mismatch */
+#define DEFINE_CAST(UPPERCASE, MixedCase)                               \
+        static inline MixedCase* UPPERCASE(Unit *u) {                   \
+                bool _mismatch_ = !u || u->type != UNIT_##UPPERCASE;    \
+                                                                        \
+                /* NULL in, or a unit of another type: no cast */       \
+                return _unlikely_(_mismatch_) ? NULL : (MixedCase*) u;  \
+        }
+
+/* For casting the various unit types into a unit: yields &u->meta, or NULL if u is NULL. u is evaluated once. */
+#define UNIT(u)                                         \
+        ({                                              \
+                typeof(u) _u_ = (u);                    \
+                Unit *_w_ = _u_ ? &(_u_)->meta : NULL;  \
+                _w_;                                    \
+        })
+
+#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset != 0) /* a zero offset means "type has none" */
+#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset != 0)
+#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset != 0)
+
+Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other);
+int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array);
+int unit_get_transitive_dependency_set(Unit *u, UnitDependencyAtom atom, Set **ret);
+
+static inline Hashmap* unit_get_dependencies(Unit *u, UnitDependency d) {
+        return hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d)); /* u->dependencies is keyed by dependency type */
+}
+
+static inline Unit* UNIT_TRIGGER(Unit *u) {
+        return unit_has_dependency(u, UNIT_ATOM_TRIGGERS, NULL); /* other == NULL: presumably "any unit" - TODO confirm */
+}
+
+static inline Unit* UNIT_GET_SLICE(const Unit *u) {
+        return unit_has_dependency(u, UNIT_ATOM_IN_SLICE, NULL); /* same lookup, for the UNIT_ATOM_IN_SLICE atom */
+}
+
+Unit* unit_new(Manager *m, size_t size);
+Unit* unit_free(Unit *u);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free);
+
+int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret);
+int unit_add_name(Unit *u, const char *name);
+
+int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask);
+int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask);
+
+int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask);
+int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask);
+
+int unit_add_exec_dependencies(Unit *u, ExecContext *c);
+
+int unit_choose_id(Unit *u, const char *name);
+int unit_set_description(Unit *u, const char *description);
+
+void unit_release_resources(Unit *u);
+
+bool unit_may_gc(Unit *u);
+
+static inline bool unit_is_extrinsic(Unit *u) {
+        const UnitVTable *vt = UNIT_VTABLE(u); /* the is_extrinsic() callback is optional */
+        return u->perpetual || (vt->is_extrinsic && vt->is_extrinsic(u));
+}
+
+static inline const char* unit_status_text(Unit *u) {
+        /* A NULL unit, or a unit type without a status_text() callback, yields NULL */
+        const char* (*cb)(Unit *) = u ? UNIT_VTABLE(u)->status_text : NULL;
+        return cb ? cb(u) : NULL;
+}
+
+void unit_add_to_load_queue(Unit *u);
+void unit_add_to_dbus_queue(Unit *u);
+void unit_add_to_cleanup_queue(Unit *u);
+void unit_add_to_gc_queue(Unit *u);
+void unit_add_to_target_deps_queue(Unit *u);
+void unit_submit_to_stop_when_unneeded_queue(Unit *u);
+void unit_submit_to_start_when_upheld_queue(Unit *u);
+void unit_submit_to_stop_when_bound_queue(Unit *u);
+void unit_submit_to_release_resources_queue(Unit *u);
+
+int unit_merge(Unit *u, Unit *other);
+int unit_merge_by_name(Unit *u, const char *other);
+
+Unit *unit_follow_merge(Unit *u) _pure_;
+
+int unit_load_fragment_and_dropin(Unit *u, bool fragment_required);
+int unit_load(Unit *unit);
+
+int unit_set_slice(Unit *u, Unit *slice);
+int unit_set_default_slice(Unit *u);
+
+const char *unit_description(Unit *u) _pure_;
+const char *unit_status_string(Unit *u, char **combined);
+
+bool unit_has_name(const Unit *u, const char *name);
+
+UnitActiveState unit_active_state(Unit *u);
+FreezerState unit_freezer_state(Unit *u);
+int unit_freezer_state_kernel(Unit *u, FreezerState *ret);
+
+const char* unit_sub_state_to_string(Unit *u);
+
+bool unit_can_reload(Unit *u) _pure_;
+bool unit_can_start(Unit *u) _pure_;
+bool unit_can_stop(Unit *u) _pure_;
+bool unit_can_isolate(Unit *u) _pure_;
+
+int unit_start(Unit *u, ActivationDetails *details);
+int unit_stop(Unit *u);
+int unit_reload(Unit *u);
+
+int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *error);
+
+void unit_notify_cgroup_oom(Unit *u, bool managed_oom);
+
+void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success);
+
+int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive);
+int unit_watch_pid(Unit *u, pid_t pid, bool exclusive);
+void unit_unwatch_pidref(Unit *u, PidRef *pid);
+void unit_unwatch_pid(Unit *u, pid_t pid);
+void unit_unwatch_all_pids(Unit *u);
+
+int unit_enqueue_rewatch_pids(Unit *u);
+void unit_dequeue_rewatch_pids(Unit *u);
+
+int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name);
+int unit_watch_bus_name(Unit *u, const char *name);
+void unit_unwatch_bus_name(Unit *u, const char *name);
+
+bool unit_job_is_applicable(Unit *u, JobType j);
+
+int set_unit_path(const char *p);
+
+char *unit_dbus_path(Unit *u);
+char *unit_dbus_path_invocation_id(Unit *u);
+
+int unit_load_related_unit(Unit *u, const char *type, Unit **_found);
+
+int unit_add_node_dependency(Unit *u, const char *what, UnitDependency d, UnitDependencyMask mask);
+int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask);
+
+int unit_coldplug(Unit *u);
+void unit_catchup(Unit *u);
+
+void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) _printf_(4, 0);
+
+bool unit_need_daemon_reload(Unit *u);
+
+void unit_reset_failed(Unit *u);
+
+Unit *unit_following(Unit *u);
+int unit_following_set(Unit *u, Set **s);
+
+const char *unit_slice_name(Unit *u);
+
+bool unit_stop_pending(Unit *u) _pure_;
+bool unit_inactive_or_pending(Unit *u) _pure_;
+bool unit_active_or_pending(Unit *u);
+bool unit_will_restart_default(Unit *u);
+bool unit_will_restart(Unit *u);
+
+int unit_add_default_target_dependency(Unit *u, Unit *target);
+
+void unit_start_on_failure(Unit *u, const char *dependency_name, UnitDependencyAtom atom, JobMode job_mode);
+void unit_trigger_notify(Unit *u);
+
+UnitFileState unit_get_unit_file_state(Unit *u);
+PresetAction unit_get_unit_file_preset(Unit *u);
+
+Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target);
+void unit_ref_unset(UnitRef *ref);
+
+#define UNIT_DEREF(ref) ((ref).target) /* the unit the reference points to, or NULL if unset */
+#define UNIT_ISSET(ref) ((ref).target != NULL)
+
+int unit_patch_contexts(Unit *u);
+
+ExecContext *unit_get_exec_context(const Unit *u) _pure_;
+KillContext *unit_get_kill_context(Unit *u) _pure_;
+CGroupContext *unit_get_cgroup_context(Unit *u) _pure_;
+
+ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_;
+
+int unit_setup_exec_runtime(Unit *u);
+
+const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf);
+char* unit_concat_strv(char **l, UnitWriteFlags flags);
+
+int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data);
+int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5);
+
+int unit_kill_context(Unit *u, KillContext *c, KillOperation k, PidRef *main_pid, PidRef *control_pid, bool main_pid_alien);
+
+int unit_make_transient(Unit *u);
+
+int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask);
+
+bool unit_type_supported(UnitType t);
+
+bool unit_is_pristine(Unit *u);
+
+bool unit_is_unneeded(Unit *u);
+bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit);
+bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit);
+
+PidRef* unit_control_pid(Unit *u);
+PidRef* unit_main_pid(Unit *u);
+
+void unit_warn_if_dir_nonempty(Unit *u, const char* where);
+int unit_fail_if_noncanonical(Unit *u, const char* where);
+
+int unit_test_start_limit(Unit *u);
+
+int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid);
+void unit_unref_uid_gid(Unit *u, bool destroy_now);
+
+void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid);
+
+int unit_set_invocation_id(Unit *u, sd_id128_t id);
+int unit_acquire_invocation_id(Unit *u);
+
+int unit_set_exec_params(Unit *s, ExecParameters *p);
+
+int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret);
+int unit_fork_and_watch_rm_rf(Unit *u, char **paths, PidRef *ret);
+
+void unit_remove_dependencies(Unit *u, UnitDependencyMask mask);
+
+void unit_export_state_files(Unit *u);
+void unit_unlink_state_files(Unit *u);
+
+int unit_prepare_exec(Unit *u);
+
+int unit_log_leftover_process_start(const PidRef* pid, int sig, void *userdata);
+int unit_log_leftover_process_stop(const PidRef* pid, int sig, void *userdata);
+
+int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func);
+
+bool unit_needs_console(Unit *u);
+
+int unit_pid_attachable(Unit *unit, PidRef *pid, sd_bus_error *error);
+
+static inline bool unit_has_job_type(Unit *u, JobType type) {
+        return (u && u->job) ? u->job->type == type : false; /* no unit or no job: never a match */
+}
+
+static inline bool unit_log_level_test(const Unit *u, int level) {
+        const ExecContext *c = unit_get_exec_context(u); /* no context, or a negative maximum: nothing is filtered */
+        return !(c && c->log_level_max >= 0 && c->log_level_max < LOG_PRI(level));
+}
+
+/* unit_log_skip is for cases like ExecCondition= where a unit is considered "done"
+ * after some execution, rather than succeeded or failed. */
+void unit_log_skip(Unit *u, const char *result);
+void unit_log_success(Unit *u);
+void unit_log_failure(Unit *u, const char *result);
+static inline void unit_log_result(Unit *u, bool success, const char *result) {
+        if (!success) /* 'result' is only forwarded on the failure path */
+                unit_log_failure(u, result);
+        else
+                unit_log_success(u);
+}
+
+void unit_log_process_exit(Unit *u, const char *kind, const char *command, bool success, int code, int status);
+
+int unit_exit_status(Unit *u);
+int unit_success_action_exit_status(Unit *u);
+int unit_failure_action_exit_status(Unit *u);
+
+int unit_test_trigger_loaded(Unit *u);
+
+void unit_destroy_runtime_data(Unit *u, const ExecContext *context);
+int unit_clean(Unit *u, ExecCleanMask mask);
+int unit_can_clean(Unit *u, ExecCleanMask *ret_mask);
+
+bool unit_can_start_refuse_manual(Unit *u);
+bool unit_can_stop_refuse_manual(Unit *u);
+bool unit_can_isolate_refuse_manual(Unit *u);
+
+bool unit_can_freeze(Unit *u);
+int unit_freeze(Unit *u);
+void unit_frozen(Unit *u);
+
+int unit_thaw(Unit *u);
+void unit_thawed(Unit *u);
+
+int unit_freeze_vtable_common(Unit *u);
+int unit_thaw_vtable_common(Unit *u);
+
+Condition *unit_find_failed_condition(Unit *u);
+
+int unit_arm_timer(Unit *u, sd_event_source **source, bool relative, usec_t usec, sd_event_time_handler_t handler);
+
+int unit_compare_priority(Unit *a, Unit *b);
+
+/* Macros which append UNIT= or USER_UNIT= to the message. A NULL unit is accepted and is then logged via plain
+ * log_internal() without unit fields. Suppressed messages still evaluate to -ERRNO_VALUE(error). */
+#define log_unit_full_errno_zerook(unit, level, error, ...)             \
+        ({                                                              \
+                const Unit *_u = (unit);                                \
+                const int _l = (level);                                 \
+                bool _do_log = !(log_get_max_level() < LOG_PRI(_l) ||   \
+                        (_u && !unit_log_level_test(_u, _l)));          \
+                const ExecContext *_c = _do_log && _u ?                 \
+                        unit_get_exec_context(_u) : NULL;               \
+                LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL,  \
+                                     _c ? _c->n_log_extra_fields : 0);  \
+                !_do_log ? -ERRNO_VALUE(error) :                        \
+                        _u ? log_object_internal(_l, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \
+                                log_internal(_l, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \
+        })
+
+#define log_unit_full_errno(unit, level, error, ...) \
+        ({                                                              \
+                int _error = (error); /* evaluate 'error' only once */  \
+                ASSERT_NON_ZERO(_error); /* a zero error must go through log_unit_full() or the _zerook variant */ \
+                log_unit_full_errno_zerook(unit, level, _error, ##__VA_ARGS__); \
+        })
+
+#define log_unit_full(unit, level, ...) (void) log_unit_full_errno_zerook(unit, level, 0, __VA_ARGS__) /* error-less variant, result discarded */
+
+#define log_unit_debug(unit, ...)   log_unit_full(unit, LOG_DEBUG, __VA_ARGS__)
+#define log_unit_info(unit, ...)    log_unit_full(unit, LOG_INFO, __VA_ARGS__)
+#define log_unit_notice(unit, ...)  log_unit_full(unit, LOG_NOTICE, __VA_ARGS__)
+#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, __VA_ARGS__)
+#define log_unit_error(unit, ...)   log_unit_full(unit, LOG_ERR, __VA_ARGS__)
+
+#define log_unit_debug_errno(unit, error, ...)   log_unit_full_errno(unit, LOG_DEBUG, error, __VA_ARGS__)
+#define log_unit_info_errno(unit, error, ...)    log_unit_full_errno(unit, LOG_INFO, error, __VA_ARGS__)
+#define log_unit_notice_errno(unit, error, ...)  log_unit_full_errno(unit, LOG_NOTICE, error, __VA_ARGS__)
+#define log_unit_warning_errno(unit, error, ...) log_unit_full_errno(unit, LOG_WARNING, error, __VA_ARGS__)
+#define log_unit_error_errno(unit, error, ...)   log_unit_full_errno(unit, LOG_ERR, error, __VA_ARGS__)
+
+#if LOG_TRACE
+#  define log_unit_trace(...)          log_unit_debug(__VA_ARGS__)
+#  define log_unit_trace_errno(...)    log_unit_debug_errno(__VA_ARGS__)
+#else
+#  define log_unit_trace(...)          do {} while (0) /* compiled out unless LOG_TRACE is enabled */
+#  define log_unit_trace_errno(e, ...) (-ERRNO_VALUE(e)) /* logs nothing, but still evaluates to -ERRNO_VALUE(e) */
+#endif
+
+#define log_unit_struct_errno(unit, level, error, ...)                  \
+        ({                                                              \
+                const Unit *_u = (unit);                                \
+                const int _l = (level);                                 \
+                bool _do_log = unit_log_level_test(_u, _l); /* NOTE(review): runs before the NULL test of _u below */ \
+                const ExecContext *_c = _do_log && _u ?                 \
+                        unit_get_exec_context(_u) : NULL;               \
+                LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL,  \
+                                     _c ? _c->n_log_extra_fields : 0);  \
+                _do_log ?                                               \
+                        log_struct_errno(_l, error, __VA_ARGS__, LOG_UNIT_ID(_u)) : /* LOG_UNIT_ID() dereferences _u */ \
+                        -ERRNO_VALUE(error);                            \
+        })
+
+#define log_unit_struct(unit, level, ...) log_unit_struct_errno(unit, level, 0, __VA_ARGS__)
+
+#define log_unit_struct_iovec_errno(unit, level, error, iovec, n_iovec) \
+        ({                                                              \
+                const Unit *_u = (unit);                                \
+                const int _l = (level);                                 \
+                bool _do_log = unit_log_level_test(_u, _l); /* only the per-unit maximum level is consulted here */ \
+                const ExecContext *_c = _do_log && _u ?                 \
+                        unit_get_exec_context(_u) : NULL;               \
+                LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL,  \
+                                     _c ? _c->n_log_extra_fields : 0);  \
+                _do_log ?                                               \
+                        log_struct_iovec_errno(_l, error, iovec, n_iovec) : /* iovec is passed through as is; no LOG_UNIT_ID() is appended */ \
+                        -ERRNO_VALUE(error);                            \
+        })
+
+#define log_unit_struct_iovec(unit, level, iovec, n_iovec) log_unit_struct_iovec_errno(unit, level, 0, iovec, n_iovec)
+
+/* Like LOG_MESSAGE(), but with the unit name prefixed. All three macros dereference 'unit', which must not be NULL. */
+#define LOG_UNIT_MESSAGE(unit, fmt, ...) LOG_MESSAGE("%s: " fmt, (unit)->id, ##__VA_ARGS__)
+#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id
+#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string
+
+const char* collect_mode_to_string(CollectMode m) _const_;
+CollectMode collect_mode_from_string(const char *s) _pure_;
+
+typedef struct UnitForEachDependencyData {
+        /* Stores state for the FOREACH macro below for iterating through all deps that have any of the
+         * specified dependency atom bits set */
+        UnitDependencyAtom match_atom;
+        Hashmap *by_type, *by_unit; /* by_type starts out as u->dependencies; by_unit is the map for current_type */
+        void *current_type; /* dependency type in its pointer encoding, see UNIT_DEPENDENCY_TO_PTR() */
+        Iterator by_type_iterator, by_unit_iterator;
+        Unit **current_unit; /* address of the caller's loop variable */
+} UnitForEachDependencyData;
+
+/* Iterates through all dependencies that have a specific atom in the dependency type set. This tries to be
+ * smart: if the atom is unique, we'll directly go to the right entry. Otherwise we'll iterate through the
+ * per-dependency type hashmap and match all deps that have the right atom set. */
+#define _UNIT_FOREACH_DEPENDENCY(other, u, ma, data)                    \
+        for (UnitForEachDependencyData data = {                         \
+                        .match_atom = (ma),                             \
+                        .by_type = (u)->dependencies,                   \
+                        .by_type_iterator = ITERATOR_FIRST,             \
+                        .current_unit = &(other),                       \
+                };                                                      \
+             ({                                                         \
+                     UnitDependency _dt = _UNIT_DEPENDENCY_INVALID;     \
+                     bool _found;                                       \
+                                                                        \
+                     if (data.by_type && ITERATOR_IS_FIRST(data.by_type_iterator)) { \
+                             _dt = unit_dependency_from_unique_atom(data.match_atom); \
+                             if (_dt >= 0) {                            \
+                                     data.by_unit = hashmap_get(data.by_type, UNIT_DEPENDENCY_TO_PTR(_dt)); \
+                                     data.current_type = UNIT_DEPENDENCY_TO_PTR(_dt); \
+                                     data.by_type = NULL;               \
+                                     _found = !!data.by_unit;           \
+                             }                                          \
+                     }                                                  \
+                     if (_dt < 0)                                       \
+                             _found = hashmap_iterate(data.by_type,     \
+                                                      &data.by_type_iterator, \
+                                                      (void**)&(data.by_unit), \
+                                                      (const void**) &(data.current_type)); \
+                     _found;                                            \
+             }); )                                                      \
+                if ((unit_dependency_to_atom(UNIT_DEPENDENCY_FROM_PTR(data.current_type)) & data.match_atom) != 0) \
+                        for (data.by_unit_iterator = ITERATOR_FIRST;    \
+                                hashmap_iterate(data.by_unit,           \
+                                                &data.by_unit_iterator, \
+                                                NULL,                   \
+                                                (const void**) data.current_unit); )
+
+/* Note: this matches deps that have *any* of the atoms specified in match_atom set. 'other' must be a Unit* lvalue. */
+#define UNIT_FOREACH_DEPENDENCY(other, u, match_atom) \
+        _UNIT_FOREACH_DEPENDENCY(other, u, match_atom, UNIQ_T(data, UNIQ))
+
+#define _LOG_CONTEXT_PUSH_UNIT(unit, u, c) /* expands to bare declarations: 'u' and 'c' land in the caller's scope */ \
+        const Unit *u = (unit); /* must not be NULL: u->manager is dereferenced below */                \
+        const ExecContext *c = unit_get_exec_context(u);                                        \
+        LOG_CONTEXT_PUSH_KEY_VALUE(u->manager->unit_log_field, u->id);                          \
+        LOG_CONTEXT_PUSH_KEY_VALUE(u->manager->invocation_log_field, u->invocation_id_string);  \
+        LOG_CONTEXT_PUSH_IOV(c ? c->log_extra_fields : NULL, c ? c->n_log_extra_fields : 0)
+
+#define LOG_CONTEXT_PUSH_UNIT(unit) \
+        _LOG_CONTEXT_PUSH_UNIT(unit, UNIQ_T(u, UNIQ), UNIQ_T(c, UNIQ))
diff --git a/src/core/user.conf.in b/src/core/user.conf.in
new file mode 100644
index 0000000..14f0eae
--- /dev/null
+++ b/src/core/user.conf.in
@@ -0,0 +1,59 @@
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it under the
+#  terms of the GNU Lesser General Public License as published by the Free
+#  Software Foundation; either version 2.1 of the License, or (at your option)
+#  any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file (or a copy of it placed in
+# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
+# the /etc/systemd/user.conf.d/ directory. The latter is generally recommended.
+# Defaults can be restored by simply deleting the main configuration file and
+# all drop-ins located in /etc/.
+#
+# Use 'systemd-analyze cat-config systemd/user.conf' to display the full config.
+#
+# See systemd-user.conf(5) for details.
+
+[Manager]
+#LogLevel=info
+#LogTarget=auto
+#LogColor=yes
+#LogLocation=no
+#LogTime=no
+#SystemCallArchitectures=
+#TimerSlackNSec=
+#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
+#DefaultTimerAccuracySec=1min
+#DefaultStandardOutput=inherit
+#DefaultStandardError=inherit
+#DefaultTimeoutStartSec={{DEFAULT_USER_TIMEOUT_SEC}}s
+#DefaultTimeoutStopSec={{DEFAULT_USER_TIMEOUT_SEC}}s
+#DefaultTimeoutAbortSec=
+#DefaultDeviceTimeoutSec={{DEFAULT_USER_TIMEOUT_SEC}}s
+#DefaultRestartSec=100ms
+#DefaultStartLimitIntervalSec=10s
+#DefaultStartLimitBurst=5
+#DefaultEnvironment=
+#DefaultLimitCPU=
+#DefaultLimitFSIZE=
+#DefaultLimitDATA=
+#DefaultLimitSTACK=
+#DefaultLimitCORE=
+#DefaultLimitRSS=
+#DefaultLimitNOFILE=
+#DefaultLimitAS=
+#DefaultLimitNPROC=
+#DefaultLimitMEMLOCK=
+#DefaultLimitLOCKS=
+#DefaultLimitSIGPENDING=
+#DefaultLimitMSGQUEUE=
+#DefaultLimitNICE=
+#DefaultLimitRTPRIO=
+#DefaultLimitRTTIME=
+#DefaultMemoryPressureThresholdSec=200ms
+#DefaultMemoryPressureWatch=auto
+#DefaultSmackProcessLabel=
+#ReloadLimitIntervalSec=
+#ReloadLimitBurst=
diff --git a/src/coredump/coredump-vacuum.c b/src/coredump/coredump-vacuum.c
new file mode 100644
index 0000000..7e0c98c
--- /dev/null
+++ b/src/coredump/coredump-vacuum.c
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "coredump-vacuum.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hashmap.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "time-util.h"
+#include "user-util.h"
+
+#define DEFAULT_MAX_USE_LOWER (uint64_t) (1ULL*1024ULL*1024ULL)           /* 1 MiB: floor of the automatic max_use, also used if the fs size is unknown */
+#define DEFAULT_MAX_USE_UPPER (uint64_t) (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB: ceiling of the automatic max_use */
+#define DEFAULT_KEEP_FREE_UPPER (uint64_t) (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB: ceiling of the automatic keep_free */
+#define DEFAULT_KEEP_FREE (uint64_t) (1024ULL*1024ULL)                    /* 1 MiB: keep_free used if the fs size is unknown */
+
+typedef struct VacuumCandidate {
+        unsigned n_files;       /* number of coredump files counted for this candidate (coredump_vacuum() keeps one per UID) */
+        char *oldest_file;      /* heap-allocated name of the file with the smallest mtime seen so far */
+        usec_t oldest_mtime;    /* mtime belonging to oldest_file */
+} VacuumCandidate;
+
+static VacuumCandidate* vacuum_candidate_free(VacuumCandidate *c) {
+        /* Destroys a candidate together with the file name string it owns; a NULL candidate is tolerated. */
+        if (c)
+                free(c->oldest_file);
+
+        return mfree(c);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(VacuumCandidate*, vacuum_candidate_free);
+
+static Hashmap* vacuum_candidate_hashmap_free(Hashmap *h) {
+        return hashmap_free_with_destructor(h, vacuum_candidate_free); /* presumably runs vacuum_candidate_free() on every stored value */
+}
+/* Provides vacuum_candidate_hashmap_freep(), which coredump_vacuum() uses with _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC(Hashmap*, vacuum_candidate_hashmap_free);
+
+static int uid_from_file_name(const char *filename, uid_t *uid) {
+        /* Parses the UID out of a name shaped like "core.<comm>.<uid>.<...>". Returns -EINVAL for other names, otherwise the result of parse_uid(). */
+        const char *after_prefix, *uid_begin, *uid_end, *uid_text;
+
+        after_prefix = startswith(filename, "core.");
+        if (!after_prefix)
+                return -EINVAL;
+
+        /* The comm field extends to the next dot, the UID field starts right behind it */
+        uid_begin = strchr(after_prefix, '.');
+        if (!uid_begin)
+                return -EINVAL;
+        uid_begin++;
+
+        /* Find end of UID */
+        uid_end = strchr(uid_begin, '.');
+        if (!uid_end)
+                return -EINVAL;
+        uid_text = strndupa_safe(uid_begin, uid_end - uid_begin);
+        return parse_uid(uid_text, uid);
+}
+
+static bool vacuum_necessary(int fd, uint64_t sum, uint64_t keep_free, uint64_t max_use) {
+        uint64_t fs_size = 0, fs_free = UINT64_MAX;
+        struct statvfs sv;
+        /* True if 'sum' exceeds max_use, or if the fs behind 'fd' has less than keep_free bytes free. A limit of UINT64_MAX is derived from the fs size, a limit of 0 disables that check. */
+        assert(fd >= 0);
+        /* Widen to 64 bit before multiplying, the statvfs fields may be narrower. If fstatvfs() fails the initial values stay, i.e. the fs size is treated as unknown. */
+        if (fstatvfs(fd, &sv) >= 0) {
+                fs_size = (uint64_t) sv.f_frsize * (uint64_t) sv.f_blocks;
+                fs_free = (uint64_t) sv.f_frsize * (uint64_t) sv.f_bfree;
+        }
+
+        if (max_use == UINT64_MAX) {
+                /* No explicit limit: 10% of the file system, kept within [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER] */
+                if (fs_size > 0) {
+                        max_use = PAGE_ALIGN(fs_size / 10); /* 10% */
+
+                        if (max_use > DEFAULT_MAX_USE_UPPER)
+                                max_use = DEFAULT_MAX_USE_UPPER;
+
+                        if (max_use < DEFAULT_MAX_USE_LOWER)
+                                max_use = DEFAULT_MAX_USE_LOWER;
+                } else
+                        max_use = DEFAULT_MAX_USE_LOWER;
+        } else
+                max_use = PAGE_ALIGN(max_use);
+
+        if (max_use > 0 && sum > max_use)
+                return true;
+
+        if (keep_free == UINT64_MAX) {
+                /* No explicit limit: keep 15% of the file system free, but never demand more than DEFAULT_KEEP_FREE_UPPER */
+                if (fs_size > 0) {
+                        keep_free = PAGE_ALIGN((fs_size * 3) / 20); /* 15% */
+
+                        if (keep_free > DEFAULT_KEEP_FREE_UPPER)
+                                keep_free = DEFAULT_KEEP_FREE_UPPER;
+                } else
+                        keep_free = DEFAULT_KEEP_FREE;
+        } else
+                keep_free = PAGE_ALIGN(keep_free);
+
+        if (keep_free > 0 && fs_free < keep_free)
+                return true;
+
+        return false;
+}
+
+int coredump_vacuum(int exclude_fd, uint64_t keep_free, uint64_t max_use) {
+        _cleanup_closedir_ DIR *d = NULL;
+        struct stat exclude_st;
+        int r;
+        /* Removes old coredumps from /var/lib/systemd/coredump until vacuum_necessary() no longer asks for it. Returns 0 when done, a negative value on failure. With both limits 0 nothing is done. */
+        if (keep_free == 0 && max_use == 0)
+                return 0;
+        /* exclude_st is only filled in here, and only consulted below, when exclude_fd is valid */
+        if (exclude_fd >= 0) {
+                if (fstat(exclude_fd, &exclude_st) < 0)
+                        return log_error_errno(errno, "Failed to fstat(): %m");
+        }
+
+        /* This algorithm will keep deleting the oldest file of the
+         * user with the most coredumps until we are back in the size
+         * limits. Note that vacuuming for journal files is different,
+         * because we rely on rate-limiting of the messages there,
+         * to avoid being flooded. */
+
+        d = opendir("/var/lib/systemd/coredump");
+        if (!d) {
+                if (errno == ENOENT)
+                        return 0;
+
+                return log_error_errno(errno, "Can't open coredump directory: %m");
+        }
+
+        for (;;) {
+                _cleanup_(vacuum_candidate_hashmap_freep) Hashmap *h = NULL;
+                VacuumCandidate *worst = NULL;
+                uint64_t sum = 0;
+                /* Every pass rescans the whole directory and rebuilds the per-UID statistics from scratch */
+                rewinddir(d);
+
+                FOREACH_DIRENT(de, d, goto fail) {
+                        VacuumCandidate *c;
+                        struct stat st;
+                        uid_t uid;
+                        usec_t t;
+                        /* Entries whose name does not parse as a coredump file name are ignored */
+                        r = uid_from_file_name(de->d_name, &uid);
+                        if (r < 0)
+                                continue;
+
+                        if (fstatat(dirfd(d), de->d_name, &st, AT_NO_AUTOMOUNT|AT_SYMLINK_NOFOLLOW) < 0) {
+                                if (errno == ENOENT)
+                                        continue;
+
+                                log_warning_errno(errno, "Failed to stat /var/lib/systemd/coredump/%s: %m", de->d_name);
+                                continue;
+                        }
+
+                        if (!S_ISREG(st.st_mode))
+                                continue;
+                        /* Never count or remove the file the caller asked us to exclude */
+                        if (exclude_fd >= 0 && stat_inode_same(&exclude_st, &st))
+                                continue;
+
+                        r = hashmap_ensure_allocated(&h, NULL);
+                        if (r < 0)
+                                return log_oom();
+
+                        t = timespec_load(&st.st_mtim);
+                        /* One VacuumCandidate per UID, tracking that user's file count and oldest file */
+                        c = hashmap_get(h, UID_TO_PTR(uid));
+                        if (c) {
+
+                                if (t < c->oldest_mtime) {
+                                        char *n;
+
+                                        n = strdup(de->d_name);
+                                        if (!n)
+                                                return log_oom();
+
+                                        free_and_replace(c->oldest_file, n);
+                                        c->oldest_mtime = t;
+                                }
+
+                        } else {
+                                _cleanup_(vacuum_candidate_freep) VacuumCandidate *n = NULL;
+
+                                n = new0(VacuumCandidate, 1);
+                                if (!n)
+                                        return log_oom();
+
+                                n->oldest_file = strdup(de->d_name);
+                                if (!n->oldest_file)
+                                        return log_oom();
+
+                                n->oldest_mtime = t;
+
+                                r = hashmap_put(h, UID_TO_PTR(uid), n);
+                                if (r < 0)
+                                        return log_oom();
+
+                                c = TAKE_PTR(n);
+                        }
+
+                        c->n_files++;
+                        /* The worst candidate has the most files; on a tie, the one whose oldest file is older */
+                        if (!worst ||
+                            worst->n_files < c->n_files ||
+                            (worst->n_files == c->n_files && c->oldest_mtime < worst->oldest_mtime))
+                                worst = c;
+
+                        sum += st.st_blocks * 512;
+                }
+                /* No candidate files found at all: nothing to remove */
+                if (!worst)
+                        break;
+
+                r = vacuum_necessary(dirfd(d), sum, keep_free, max_use);
+                if (r <= 0)
+                        return r;
+                /* Remove a single file, then loop to re-evaluate; a file that vanished meanwhile just triggers a rescan */
+                r = unlinkat_deallocate(dirfd(d), worst->oldest_file, 0);
+                if (r == -ENOENT)
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to remove file %s: %m", worst->oldest_file);
+
+                log_info("Removed old coredump %s.", worst->oldest_file);
+        }
+
+        return 0;
+
+fail:
+        return log_error_errno(errno, "Failed to read directory: %m");
+}
diff --git a/src/coredump/coredump-vacuum.h b/src/coredump/coredump-vacuum.h
new file mode 100644
index 0000000..8ad5baf
--- /dev/null
+++ b/src/coredump/coredump-vacuum.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+#include <sys/types.h>
+
+int coredump_vacuum(int exclude_fd, uint64_t keep_free, uint64_t max_use);
diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c
new file mode 100644
index 0000000..32c1766
--- /dev/null
+++ b/src/coredump/coredump.c
@@ -0,0 +1,1718 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <sys/prctl.h>
+#include <sys/statvfs.h>
+#include <sys/auxv.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include "sd-daemon.h"
+#include "sd-journal.h"
+#include "sd-login.h"
+#include "sd-messages.h"
+
+#include "acl-util.h"
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "capability-util.h"
+#include "cgroup-util.h"
+#include "compress.h"
+#include "conf-parser.h"
+#include "copy.h"
+#include "coredump-util.h"
+#include "coredump-vacuum.h"
+#include "dirent-util.h"
+#include "elf-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "iovec-util.h"
+#include "journal-importer.h"
+#include "journal-send.h"
+#include "log.h"
+#include "macro.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "memstream-util.h"
+#include "mkdir-label.h"
+#include "namespace-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "sync-util.h"
+#include "tmpfile-util.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+
+/* The maximum size up to which we process coredumps. We use 1 GiB on 32-bit systems, and 32 GiB on 64-bit systems */
+#if __SIZEOF_POINTER__ == 4
+#define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU))
+#elif __SIZEOF_POINTER__ == 8
+#define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU))
+#else
+#error "Unexpected pointer size"
+#endif
+
+/* The maximum size up to which we leave the coredump around on disk (same as the processing limit) */
+#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX
+
+/* The maximum size up to which we store the coredump in the journal (767 MiB, or 10 MiB in fuzzing builds) */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU))
+#else
+/* oss-fuzz limits memory usage. */
+#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
+#endif
+
+/* When checking for available memory and setting lower limits, don't
+ * go below 4 MiB for writing core files to storage. */
+#define PROCESS_SIZE_MIN (4U*1024U*1024U)
+
+/* Make sure to not make this larger than the maximum journal entry
+ * size. See DATA_SIZE_MAX in journal-importer.h. */
+assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
+
+enum {
+        /* We use these as array indexes for our process metadata cache.
+         *
+         * The first indices of the cache store the same metadata as the ones passed by
+         * the kernel via argv[], i.e. the strings array passed by the kernel according to
+         * our pattern defined in /proc/sys/kernel/core_pattern (see man:core(5)). */
+
+        META_ARGV_PID,          /* %P: as seen in the initial pid namespace */
+        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
+        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
+        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
+        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to μs granularity) */
+        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
+        META_ARGV_HOSTNAME,     /* %h: hostname */
+        _META_ARGV_MAX,
+
+        /* The following indexes are cached for a couple of special fields we use (and
+         * that thereby need to be retrieved quickly) for naming coredump files, and attaching
+         * xattrs. Unlike the previous ones they are retrieved from the runtime
+         * environment. */
+
+        META_COMM = _META_ARGV_MAX,
+        _META_MANDATORY_MAX,
+
+        /* The rest are similar to the previous ones except that we won't fail if one of
+         * them is missing. */
+
+        META_EXE = _META_MANDATORY_MAX,
+        META_UNIT,
+        META_PROC_AUXV,
+        _META_MAX
+};
+
+static const char * const meta_field_names[_META_MAX] = { /* "KEY=" prefixes, indexed by the META_* constants; presumably the journal field names */
+        [META_ARGV_PID]       = "COREDUMP_PID=",
+        [META_ARGV_UID]       = "COREDUMP_UID=",
+        [META_ARGV_GID]       = "COREDUMP_GID=",
+        [META_ARGV_SIGNAL]    = "COREDUMP_SIGNAL=",
+        [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
+        [META_ARGV_RLIMIT]    = "COREDUMP_RLIMIT=",
+        [META_ARGV_HOSTNAME]  = "COREDUMP_HOSTNAME=",
+        [META_COMM]           = "COREDUMP_COMM=",
+        [META_EXE]            = "COREDUMP_EXE=",
+        [META_UNIT]           = "COREDUMP_UNIT=",
+        [META_PROC_AUXV]      = "COREDUMP_PROC_AUXV=",
+};
+
+typedef struct Context {
+        const char *meta[_META_MAX];    /* metadata values, indexed by the META_* constants; entries may be NULL or empty */
+        size_t meta_size[_META_MAX];    /* size of the matching meta[] entry, e.g. handed to parse_auxv() for META_PROC_AUXV */
+        pid_t pid;
+        uid_t uid;                      /* the UID that fix_acl() may grant read access to */
+        gid_t gid;
+        bool is_pid1;
+        bool is_journald;
+} Context;
+
+typedef enum CoredumpStorage {
+        COREDUMP_STORAGE_NONE,          /* storage_size_max() is 0 for this one */
+        COREDUMP_STORAGE_EXTERNAL,      /* limited by arg_external_size_max */
+        COREDUMP_STORAGE_JOURNAL,       /* limited by arg_journal_size_max */
+        _COREDUMP_STORAGE_MAX,
+        _COREDUMP_STORAGE_INVALID = -EINVAL,
+} CoredumpStorage;
+/* Configuration strings of the storage modes, indexed by CoredumpStorage */
+static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
+        [COREDUMP_STORAGE_NONE]     = "none",
+        [COREDUMP_STORAGE_EXTERNAL] = "external",
+        [COREDUMP_STORAGE_JOURNAL]  = "journal",
+};
+/* Lookup helpers for the table above, plus config_parse_coredump_storage() which parse_config() uses for Storage= */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
+static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage, "Failed to parse storage setting");
+
+static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;        /* Storage= */
+static bool arg_compress = true;                                       /* Compress= */
+static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;               /* ProcessSizeMax= */
+static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;             /* ExternalSizeMax= */
+static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;               /* JournalSizeMax=, capped at JOURNAL_SIZE_MAX by parse_config() */
+static uint64_t arg_keep_free = UINT64_MAX;                            /* KeepFree= */
+static uint64_t arg_max_use = UINT64_MAX;                              /* MaxUse= */
+
+static int parse_config(void) {
+        static const ConfigTableItem items[] = {
+                { "Coredump", "Storage",          config_parse_coredump_storage,     0, &arg_storage           },
+                { "Coredump", "Compress",         config_parse_bool,                 0, &arg_compress          },
+                { "Coredump", "ProcessSizeMax",   config_parse_iec_uint64,           0, &arg_process_size_max  },
+                { "Coredump", "ExternalSizeMax",  config_parse_iec_uint64_infinity,  0, &arg_external_size_max },
+                { "Coredump", "JournalSizeMax",   config_parse_iec_uint64,           0, &arg_journal_size_max  },
+                { "Coredump", "KeepFree",         config_parse_iec_uint64,           0, &arg_keep_free         },
+                { "Coredump", "MaxUse",           config_parse_iec_uint64,           0, &arg_max_use           },
+                {}
+        };
+        /* JournalSizeMax= uses the uint64_t parser because arg_journal_size_max is a uint64_t; a size_t parser would not match its width on 32-bit targets. */
+        int r;
+        /* Reads the [Coredump] section of coredump.conf into the arg_* variables. Returns 0 on success, or the negative error of config_parse_config_file(). */
+        r = config_parse_config_file(
+                        "coredump.conf",
+                        "Coredump\0",
+                        config_item_table_lookup,
+                        items,
+                        CONFIG_PARSE_WARN,
+                        /* userdata= */ NULL);
+        if (r < 0)
+                return r;
+
+        /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for
+         * efficiency reasons. journald wouldn't accept anything larger anyway. */
+        if (arg_journal_size_max > JOURNAL_SIZE_MAX) {
+                log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.",
+                            FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX));
+                arg_journal_size_max = JOURNAL_SIZE_MAX;
+        }
+
+        return 0;
+}
+
+static uint64_t storage_size_max(void) {
+        /* Size limit that applies to the configured storage mode; with "none" nothing is stored, hence 0. */
+        if (arg_storage != COREDUMP_STORAGE_EXTERNAL && arg_storage != COREDUMP_STORAGE_JOURNAL) {
+                assert(arg_storage == COREDUMP_STORAGE_NONE);
+                return 0;
+        }
+        return arg_storage == COREDUMP_STORAGE_EXTERNAL ? arg_external_size_max : arg_journal_size_max;
+}
+
+static int fix_acl(int fd, uid_t uid, bool allow_user) {
+        assert(fd >= 0);
+        assert(uid_is_valid(uid));
+        /* Grants 'uid' read access to the coredump 'fd' through an ACL entry. Returns 0 without doing anything when built without ACL support, when allow_user is false, or for system, dynamic and nobody UIDs. */
+#if HAVE_ACL
+        int r;
+
+        /* We don't allow users to read coredumps if the uid or capabilities were changed. */
+        if (!allow_user)
+                return 0;
+
+        if (uid_is_system(uid) || uid_is_dynamic(uid) || uid == UID_NOBODY)
+                return 0;
+
+        /* Make sure normal users can read (but not write or delete) their own coredumps */
+        r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
+        if (r < 0)
+                return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
+#endif
+
+        return 0;
+}
+
+static int fix_xattr(int fd, const Context *context) {
+        /* Stores the available metadata as user.coredump.* extended attributes on 'fd'. META_UNIT and META_PROC_AUXV have no attribute name in the table and are therefore never written. */
+        static const char * const xattrs[_META_MAX] = {
+                [META_ARGV_PID]       = "user.coredump.pid",
+                [META_ARGV_UID]       = "user.coredump.uid",
+                [META_ARGV_GID]       = "user.coredump.gid",
+                [META_ARGV_SIGNAL]    = "user.coredump.signal",
+                [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
+                [META_ARGV_RLIMIT]    = "user.coredump.rlimit",
+                [META_ARGV_HOSTNAME]  = "user.coredump.hostname",
+                [META_COMM]           = "user.coredump.comm",
+                [META_EXE]            = "user.coredump.exe",
+        };
+
+        int r = 0;
+
+        assert(fd >= 0);
+
+        /* Attach some metadata to coredumps via extended attributes. Just because we can. */
+
+        for (unsigned i = 0; i < _META_MAX; i++) {
+                int k;
+
+                if (isempty(context->meta[i]) || !xattrs[i])
+                        continue;
+                /* XATTR_CREATE: fail rather than replace an attribute that already exists. Failures don't stop the loop, RET_GATHER() folds them into r. */
+                k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE));
+                RET_GATHER(r, k);
+        }
+
+        return r;
+}
+
+#define filename_escape(s) xescape((s), "./ ") /* presumably escapes '.', '/' and ' ' so a field can be embedded in a file name; result is checked for NULL by make_filename() */
+
+static const char *coredump_tmpfile_name(const char *s) {
+        return s ? s : "(unnamed temporary file)"; /* printable stand-in for log messages when there is no path */
+}
+
+static int fix_permissions(
+                int fd,
+                const char *filename,
+                const char *target,
+                const Context *context,
+                bool allow_user) {
+        /* Applies mode 0640, the ACL and the xattrs to the temporary file 'fd' (all best effort), then links it into place as 'target'. Returns 0 on success, or the logged link_tmpfile() error. */
+        int r;
+
+        assert(fd >= 0);
+        assert(target);
+        assert(context);
+
+        /* Ignore errors on these */
+        (void) fchmod(fd, 0640);
+        (void) fix_acl(fd, context->uid, allow_user);
+        (void) fix_xattr(fd, context);
+
+        r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
+
+        return 0;
+}
+
+static int maybe_remove_external_coredump(const char *filename, uint64_t size) {
+        bool keep;
+
+        /* Returns 1 if might remove, 0 if will not remove, < 0 on error. */
+
+        /* The file stays only if external storage is configured and the size is within its limit */
+        keep = arg_storage == COREDUMP_STORAGE_EXTERNAL && size <= arg_external_size_max;
+        if (keep)
+                return 0;
+
+        /* Without a name there is nothing to unlink, and a file that is already gone is no error */
+        if (filename && unlink(filename) < 0 && errno != ENOENT)
+                return log_error_errno(errno, "Failed to unlink %s: %m", filename);
+
+        return 1;
+}
+
+static int make_filename(const Context *context, char **ret) {
+        _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
+        sd_id128_t boot = {};
+        int r;
+        /* Builds "/var/lib/systemd/coredump/core.<comm>.<uid>.<boot id>.<pid>.<timestamp>" from the escaped metadata and stores the allocated string in *ret. Returns 0, -ENOMEM or the sd_id128_get_boot() error. */
+        assert(context);
+
+        c = filename_escape(context->meta[META_COMM]);
+        if (!c)
+                return -ENOMEM;
+
+        u = filename_escape(context->meta[META_ARGV_UID]);
+        if (!u)
+                return -ENOMEM;
+
+        r = sd_id128_get_boot(&boot);
+        if (r < 0)
+                return r;
+
+        p = filename_escape(context->meta[META_ARGV_PID]);
+        if (!p)
+                return -ENOMEM;
+
+        t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
+        if (!t)
+                return -ENOMEM;
+        /* Field order matters: uid_from_file_name() in coredump-vacuum.c expects the UID as the second dot-separated field after "core." */
+        if (asprintf(ret,
+                     "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
+                     c,
+                     u,
+                     SD_ID128_FORMAT_VAL(boot),
+                     p,
+                     t) < 0)
+                return -ENOMEM;
+
+        return 0;
+}
+
+static int grant_user_access(int core_fd, const Context *context) {
+        int at_secure = -1;
+        uid_t uid = UID_INVALID, euid = UID_INVALID;
+        gid_t gid = GID_INVALID, egid = GID_INVALID;
+        int r;
+        /* Decides from the saved auxv data whether the crashing user may read the core. Returns > 0 to permit, 0 to restrict, < 0 if the data is missing or unusable (the caller only permits on > 0). */
+        assert(core_fd >= 0);
+        assert(context);
+
+        if (!context->meta[META_PROC_AUXV])
+                return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
+        /* Only the ELF identification bytes are read: the class is handed to parse_auxv(), and cores that are not native-endian ELF are rejected */
+        uint8_t elf[EI_NIDENT];
+        errno = 0;
+        if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
+                return log_warning_errno(errno_or_else(EIO),
+                                         "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
+
+        if (elf[EI_MAG0] != ELFMAG0 ||
+            elf[EI_MAG1] != ELFMAG1 ||
+            elf[EI_MAG2] != ELFMAG2 ||
+            elf[EI_MAG3] != ELFMAG3 ||
+            elf[EI_VERSION] != EV_CURRENT)
+                return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                      "Core file does not have ELF header, not adjusting permissions.");
+        if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
+            !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
+                return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                      "Core file has strange ELF class, not adjusting permissions.");
+
+        if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
+                return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                      "Core file has non-native endianness, not adjusting permissions.");
+
+        r = parse_auxv(LOG_WARNING,
+                       /* elf_class= */ elf[EI_CLASS],
+                       context->meta[META_PROC_AUXV],
+                       context->meta_size[META_PROC_AUXV],
+                       &at_secure, &uid, &euid, &gid, &egid);
+        if (r < 0)
+                return r;
+
+        /* We allow access if we got all the data and at_secure is not set and
+         * the uid/gid matches euid/egid. */
+        bool ret =
+                at_secure == 0 &&
+                uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
+                gid != GID_INVALID && egid != GID_INVALID && gid == egid;
+        log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
+                  ret ? "permit" : "restrict",
+                  uid, euid, gid, egid, yes_no(at_secure));
+        return ret;
+}
+
+static int save_external_coredump(
+                const Context *context,
+                int input_fd,
+                char **ret_filename,
+                int *ret_node_fd,
+                int *ret_data_fd,
+                uint64_t *ret_size,
+                uint64_t *ret_compressed_size,
+                bool *ret_truncated) {
+
+        _cleanup_(unlink_and_freep) char *tmp = NULL;
+        _cleanup_free_ char *fn = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        uint64_t rlimit, process_limit, max_size;
+        bool truncated, storage_on_tmpfs;
+        struct stat st;
+        int r;
+
+        assert(context);
+        assert(ret_filename);
+        assert(ret_node_fd);
+        assert(ret_data_fd);
+        assert(ret_size);
+        assert(ret_compressed_size);
+        assert(ret_truncated);
+
+        r = safe_atou64(context->meta[META_ARGV_RLIMIT], &rlimit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse resource limit '%s': %m",
+                                       context->meta[META_ARGV_RLIMIT]);
+        if (rlimit < page_size())
+                /* Is coredumping disabled? Then don't bother saving/processing the
+                 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
+                 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
+                 * is usually the same as PAGE_SIZE. */
+                return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
+                                      "Resource limits disable core dumping for process %s (%s).",
+                                      context->meta[META_ARGV_PID], context->meta[META_COMM]);
+
+        process_limit = MAX(arg_process_size_max, storage_size_max());
+        if (process_limit == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
+                                       "Limits for coredump processing and storage are both 0, not dumping core.");
+
+        /* Never store more than the process configured, or than we actually shall keep or process */
+        max_size = MIN(rlimit, process_limit);
+
+        r = make_filename(context, &fn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine coredump file name: %m");
+
+        (void) mkdir_parents_label(fn, 0755);
+
+        fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
+        if (fd < 0)
+                return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
+
+        /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
+         * the service or the slice it belongs to. This is common on low-resources systems,
+         * to avoid crashing processes to take away too many system resources.
+         * Check the cgroup settings, and set max_size to a bit less than half of the
+         * available memory left to the process.
+         * Then, attempt to write the core file uncompressed first - if the write gets
+         * interrupted, we know we won't be able to write it all, so instead compress what
+         * was written so far, delete the uncompressed truncated core, and then continue
+         * compressing from STDIN. Given the compressed core cannot be larger than the
+         * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
+         * should be able to at least store the full compressed core file. */
+
+        storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
+        if (storage_on_tmpfs && arg_compress) {
+                _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+                uint64_t cgroup_limit = UINT64_MAX;
+                struct statvfs sv;
+
+                /* If we can't get the cgroup limit, just ignore it, but don't fail,
+                 * try anyway with the config settings. */
+                r = sd_bus_default_system(&bus);
+                if (r < 0)
+                        log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
+                else {
+                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                        r = sd_bus_get_property_trivial(
+                                        bus,
+                                        "org.freedesktop.systemd1",
+                                        "/org/freedesktop/systemd1/unit/self",
+                                        "org.freedesktop.systemd1.Service",
+                                        "MemoryAvailable",
+                                        &error,
+                                        't', &cgroup_limit);
+                        if (r < 0)
+                                log_warning_errno(r,
+                                                  "Failed to query MemoryAvailable for current unit, "
+                                                  "falling back to static config settings: %s",
+                                                  bus_error_message(&error, r));
+                }
+
+                max_size = MIN(cgroup_limit, max_size);
+                max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
+                max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */
+
+                /* tmpfs might get full quickly, so check the available space too.
+                 * But don't worry about errors here, failing to access the storage
+                 * location will be better logged when writing to it. */
+                if (fstatvfs(fd, &sv) >= 0)
+                        max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
+
+                log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
+        }
+
+        r = copy_bytes(input_fd, fd, max_size, 0);
+        if (r < 0)
+                return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
+                                context->meta[META_ARGV_PID], context->meta[META_COMM]);
+        truncated = r == 1;
+
+        bool allow_user = grant_user_access(fd, context) > 0;
+
+#if HAVE_COMPRESSION
+        if (arg_compress) {
+                _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
+                _cleanup_free_ char *fn_compressed = NULL;
+                _cleanup_close_ int fd_compressed = -EBADF;
+                uint64_t uncompressed_size = 0;
+
+                if (lseek(fd, 0, SEEK_SET) < 0)
+                        return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
+
+                fn_compressed = strjoin(fn, default_compression_extension());
+                if (!fn_compressed)
+                        return log_oom();
+
+                fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
+                if (fd_compressed < 0)
+                        return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
+
+                r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
+
+                if (truncated && storage_on_tmpfs) {
+                        uint64_t partial_uncompressed_size = 0;
+
+                        /* Uncompressed write was truncated and we are writing to tmpfs: delete
+                         * the uncompressed core, and compress the remaining part from STDIN. */
+
+                        tmp = unlink_and_free(tmp);
+                        fd = safe_close(fd);
+
+                        r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
+                        uncompressed_size += partial_uncompressed_size;
+                }
+
+                r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, allow_user);
+                if (r < 0)
+                        return r;
+
+                if (fstat(fd_compressed, &st) < 0)
+                        return log_error_errno(errno,
+                                        "Failed to fstat core file %s: %m",
+                                        coredump_tmpfile_name(tmp_compressed));
+
+                *ret_filename = TAKE_PTR(fn_compressed);       /* compressed */
+                *ret_node_fd = TAKE_FD(fd_compressed);         /* compressed */
+                *ret_compressed_size = (uint64_t) st.st_size;  /* compressed */
+                *ret_data_fd = TAKE_FD(fd);
+                *ret_size = uncompressed_size;
+                *ret_truncated = truncated;
+                tmp_compressed = mfree(tmp_compressed);
+
+                return 0;
+        }
+#endif
+
+        if (truncated)
+                log_struct(LOG_INFO,
+                           LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
+                           "SIZE_LIMIT=%"PRIu64, max_size,
+                           "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
+
+        r = fix_permissions(fd, tmp, fn, context, allow_user);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
+
+        if (lseek(fd, 0, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
+
+        *ret_filename = TAKE_PTR(fn);
+        *ret_data_fd = TAKE_FD(fd);
+        *ret_size = (uint64_t) st.st_size;
+        *ret_truncated = truncated;
+
+        return 0;
+}
+
+/* Reads 'size' bytes of core data from 'fd', starting at offset 0, into a newly allocated buffer that is
+ * prefixed with "COREDUMP=", i.e. into a ready-made journal field.
+ *
+ * On success returns 0, stores the buffer in *ret (ownership passes to the caller) and the total length
+ * (prefix plus payload) in *ret_size. On failure a negative errno-style value is returned after logging,
+ * and *ret / *ret_size are left untouched. */
+static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
+        static const char prefix[] = "COREDUMP=";
+        const size_t prefix_len = sizeof(prefix) - 1;
+        _cleanup_free_ char *field = NULL;
+        ssize_t n;
+
+        assert(fd >= 0);
+        assert(ret);
+        assert(ret_size);
+
+        /* Refuse sizes for which "prefix + size" would wrap around: malloc() would then hand out a buffer
+         * that is far too small, and the read below would write past its end. */
+        if (size > SIZE_MAX - prefix_len)
+                return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
+                                         "Failed to allocate memory for coredump, coredump will not be stored.");
+
+        if (lseek(fd, 0, SEEK_SET) < 0)
+                return log_warning_errno(errno, "Failed to seek: %m");
+
+        field = malloc(prefix_len + size);
+        if (!field)
+                return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
+                                         "Failed to allocate memory for coredump, coredump will not be stored.");
+
+        memcpy(field, prefix, prefix_len);
+
+        /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with
+         * 0x7ffff000 bytes max. Hence call things in a loop. */
+        n = loop_read(fd, field + prefix_len, size, /* do_poll= */ false);
+        if (n < 0)
+                return log_error_errno((int) n, "Failed to read core data: %m");
+        if ((size_t) n < size)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Core data too short.");
+
+        *ret = TAKE_PTR(field);
+        *ret_size = size + prefix_len;
+
+        return 0;
+}
+
+/* Produces a textual listing of the open file descriptors of 'pid' by combining /proc/[pid]/fd/ (the
+ * symlink target of each descriptor) with /proc/[pid]/fdinfo/ (its contents). Entries are separated by an
+ * empty line and look like this:
+ *
+ * 0:/dev/pts/23
+ * pos:    0
+ * flags:  0100002
+ *
+ * 1:/dev/pts/23
+ * pos:    0
+ * flags:  0100002
+ *
+ * 2:/dev/pts/23
+ * pos:    0
+ * flags:  0100002
+ * EOF
+ *
+ * The resulting string is returned in *ret. Returns a negative errno-style value on failure; an fdinfo
+ * file that cannot be opened is skipped silently, leaving just the "N:target" line for that descriptor.
+ */
+static int compose_open_fds(pid_t pid, char **ret) {
+        _cleanup_(memstream_done) MemStream m = {};
+        _cleanup_closedir_ DIR *fd_dir = NULL;
+        _cleanup_close_ int fdinfo_dir_fd = -EBADF;
+        const char *fd_dir_path;
+        bool first = true;
+        FILE *out;
+        int r;
+
+        assert(pid >= 0);
+        assert(ret);
+
+        fd_dir_path = procfs_file_alloca(pid, "fd");
+        fd_dir = opendir(fd_dir_path);
+        if (!fd_dir)
+                return -errno;
+
+        /* Reach the sibling fdinfo/ directory relative to the fd/ directory we already hold open */
+        fdinfo_dir_fd = openat(dirfd(fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
+        if (fdinfo_dir_fd < 0)
+                return -errno;
+
+        out = memstream_init(&m);
+        if (!out)
+                return -ENOMEM;
+
+        FOREACH_DIRENT(entry, fd_dir, return -errno) {
+                _cleanup_fclose_ FILE *info = NULL;
+                _cleanup_free_ char *target = NULL;
+                _cleanup_close_ int info_fd = -EBADF;
+
+                r = readlinkat_malloc(dirfd(fd_dir), entry->d_name, &target);
+                if (r < 0)
+                        return r;
+
+                /* Every entry but the first is preceded by an empty line */
+                if (!first)
+                        fputc('\n', out);
+                first = false;
+
+                fprintf(out, "%s:%s\n", entry->d_name, target);
+
+                /* The same name that identifies the descriptor in fd/ identifies its file in fdinfo/ */
+                info_fd = openat(fdinfo_dir_fd, entry->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
+                if (info_fd < 0)
+                        continue;
+
+                info = take_fdopen(&info_fd, "r");
+                if (!info)
+                        continue;
+
+                /* Copy the fdinfo file over line by line, until read_line() reports EOF (0) */
+                do {
+                        _cleanup_free_ char *text = NULL;
+
+                        r = read_line(info, LONG_LINE_MAX, &text);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                fprintf(out, "%s\n", text);
+                } while (r > 0);
+        }
+
+        return memstream_finalize(&m, ret, NULL);
+}
+
+/* Looks up the command line of the process that leads the mount namespace 'pid' lives in.
+ *
+ * Returns 1 if such a process was found; its command line is then stored in *cmdline.
+ * Returns 0 if /proc/[pid]/root is the very same inode as our own "/", i.e. the process is not
+ * considered 'containerized'; *cmdline is set to NULL in that case.
+ * Returns a negative errno-style value on errors.
+ */
+static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
+        struct stat our_root, their_root;
+        const char *their_root_path;
+        pid_t leader;
+        int r;
+
+        if (stat("/", &our_root) < 0)
+                return -errno;
+
+        their_root_path = procfs_file_alloca(pid, "root");
+        if (stat(their_root_path, &their_root) < 0)
+                return -errno;
+
+        /* Same inode as our root directory? Then the process simply uses the system root. */
+        if (stat_inode_same(&their_root, &our_root)) {
+                *cmdline = NULL;
+                return 0;
+        }
+
+        r = namespace_get_leader(pid, NAMESPACE_MOUNT, &leader);
+        if (r < 0)
+                return r;
+
+        r = pid_get_cmdline(leader, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, cmdline);
+        return r < 0 ? r : 1;
+}
+
+/* Switches to the credentials the remaining coredump processing shall run with: the UID/GID recorded in
+ * 'context', or, if that UID is a system UID, those of the "systemd-coredump" user. If that user cannot
+ * be resolved a warning is logged and UID/GID 0 are used instead.
+ *
+ * Returns whatever drop_privileges() returns. */
+static int change_uid_gid(const Context *context) {
+        const char *coredump_user = "systemd-coredump";
+        uid_t target_uid = context->uid;
+        gid_t target_gid = context->gid;
+        int r;
+
+        /* Regular users: run as the owner of the crashed process */
+        if (!uid_is_system(target_uid))
+                return drop_privileges(target_uid, target_gid, 0);
+
+        r = get_user_creds(&coredump_user, &target_uid, &target_gid, NULL, NULL, 0);
+        if (r < 0) {
+                log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", coredump_user);
+                target_uid = 0;
+                target_gid = 0;
+        }
+
+        return drop_privileges(target_uid, target_gid, 0);
+}
+
+/* Stores the coredump read from 'input_fd' and submits a journal entry describing it.
+ *
+ * In order, this function:
+ *   1. vacuums old coredumps and tries to write the new one to disk via save_external_coredump(),
+ *   2. possibly removes the file again (maybe_remove_external_coredump()) while keeping the fds open,
+ *   3. drops privileges via change_uid_gid(),
+ *   4. tries to extract a stack trace and ELF package metadata from the stored core,
+ *   5. appends MESSAGE=, COREDUMP_FILENAME=, COREDUMP_TRUNCATED=, COREDUMP_PACKAGE_*= and, if journal
+ *      storage is configured and the core is small enough, the COREDUMP= payload itself to 'iovw',
+ *   6. sends everything with sd_journal_sendv().
+ *
+ * 'context' provides the cached metadata of the crashed process, 'iovw' the fields collected so far.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. A failure to write the core to disk
+ * is not fatal: the journal entry is then still sent, stating that no coredump was generated. */
+static int submit_coredump(
+                const Context *context,
+                struct iovec_wrapper *iovw,
+                int input_fd) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
+        _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
+        _cleanup_free_ char *filename = NULL, *coredump_data = NULL;
+        _cleanup_free_ char *stacktrace = NULL;
+        const char *module_name;
+        uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
+        bool truncated = false, written = false;
+        JsonVariant *module_json;
+        int r;
+
+        assert(context);
+        assert(iovw);
+        assert(input_fd >= 0);
+
+        /* Vacuum before we write anything again */
+        (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
+
+        /* Always stream the coredump to disk, if that's possible */
+        written = save_external_coredump(
+                        context, input_fd,
+                        &filename, &coredump_node_fd, &coredump_fd,
+                        &coredump_size, &coredump_compressed_size, &truncated) >= 0;
+        if (written) {
+                /* If we could write it to disk we can now process it. */
+                /* If we don't want to keep the coredump on disk, remove it now, as later on we
+                 * will lack the privileges for it. However, we keep the fd to it, so that we can
+                 * still process it and log it. */
+                /* A valid coredump_node_fd is used throughout as the indication that the compressed
+                 * size is the one to look at. */
+                r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
+                else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
+                        log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
+                                 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
+
+                /* Vacuum again, but exclude the coredump we just created */
+                (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
+        }
+
+        /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
+         * coredump memory under the user's uid. This also ensures that the credentials journald will see are
+         * the ones of the coredumping user, thus making sure the user gets access to the core dump. Let's
+         * also get rid of all capabilities, if we run as root, we won't need them anymore. */
+        r = change_uid_gid(context);
+        if (r < 0)
+                return log_error_errno(r, "Failed to drop privileges: %m");
+
+        if (written) {
+                /* Try to get a stack trace if we can */
+                if (coredump_size > arg_process_size_max)
+                        log_debug("Not generating stack trace: core size %"PRIu64" is greater "
+                                  "than %"PRIu64" (the configured maximum)",
+                                  coredump_size, arg_process_size_max);
+                else if (coredump_fd >= 0) {
+                        bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
+
+                        /* Best effort: a failure here merely leaves stacktrace/json_metadata unset */
+                        (void) parse_elf_object(coredump_fd,
+                                                context->meta[META_EXE],
+                                                /* fork_disable_dump= */ skip, /* avoid loops */
+                                                &stacktrace,
+                                                &json_metadata);
+                }
+        }
+
+        _cleanup_free_ char *core_message = NULL;
+        core_message = strjoin(
+                        "Process ", context->meta[META_ARGV_PID],
+                        " (", context->meta[META_COMM],
+                        ") of user ", context->meta[META_ARGV_UID],
+                        written ? " dumped core." : " terminated abnormally without generating a coredump.");
+        if (!core_message)
+                return log_oom();
+
+        if (context->is_journald && filename)
+                if (!strextend(&core_message, "\nCoredump diverted to ", filename))
+                        return log_oom();
+
+        if (stacktrace)
+                if (!strextend(&core_message, "\n\n", stacktrace))
+                        return log_oom();
+
+        if (context->is_journald)
+                /* We might not be able to log to the journal, so let's always print the message to another
+                 * log target. The target was set previously to something safe. */
+                log_dispatch(LOG_ERR, 0, core_message);
+
+        (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
+
+        if (truncated)
+                (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
+
+        /* If we managed to parse any ELF metadata (build-id, ELF package meta),
+         * attach it as journal metadata. */
+        if (json_metadata) {
+                _cleanup_free_ char *formatted_json = NULL;
+
+                r = json_variant_format(json_metadata, 0, &formatted_json);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to format JSON package metadata: %m");
+
+                (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
+        }
+
+        /* In the unlikely scenario that context->meta[META_EXE] is not available,
+         * let's avoid guessing the module name and skip the loop. */
+        if (context->meta[META_EXE])
+                JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
+                        JsonVariant *t;
+
+                        /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
+                        if (!path_equal_filename(module_name, context->meta[META_EXE]))
+                                continue;
+
+                        t = json_variant_by_key(module_json, "name");
+                        if (t)
+                                (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", json_variant_string(t));
+
+                        t = json_variant_by_key(module_json, "version");
+                        if (t)
+                                (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", json_variant_string(t));
+                }
+
+        /* Optionally store the entire coredump in the journal */
+        if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
+                if (coredump_size <= arg_journal_size_max) {
+                        size_t sz = 0;
+
+                        /* Store the coredump itself in the journal */
+
+                        r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
+                        if (r >= 0) {
+                                /* Once the buffer is referenced by iovw, give up our cleanup reference to
+                                 * it; if iovw_put() fails it is still freed when we return. */
+                                if (iovw_put(iovw, coredump_data, sz) >= 0)
+                                        TAKE_PTR(coredump_data);
+                        } else
+                                log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
+                } else
+                        log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
+                                 coredump_size, arg_journal_size_max);
+        }
+
+        /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
+         * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
+         * the coredump to the socket. */
+
+        if (context->is_journald) {
+                r = journal_fd_nonblock(true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
+        }
+
+        r = sd_journal_sendv(iovw->iovec, iovw->count);
+
+        /* Restore blocking mode first; the result of sd_journal_sendv() in 'r' is evaluated afterwards,
+         * hence the separate variable 'k' here. */
+        if (context->is_journald) {
+                int k;
+
+                k = journal_fd_nonblock(false);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to make journal socket blocking: %m");
+        }
+
+        /* -EAGAIN on the non-blocking socket while journald itself crashed is tolerated */
+        if (r == -EAGAIN && context->is_journald)
+                log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
+        else if (r < 0)
+                return log_error_errno(r, "Failed to log coredump: %m");
+
+        return 0;
+}
+
+/* Caches the metadata fields found in 'iovw' in 'context'.
+ *
+ * Every entry of 'iovw' is matched against the prefixes in meta_field_names[]; on a match
+ * context->meta[i] is pointed at the value part of the entry (i.e. right behind the prefix) and
+ * context->meta_size[i] is set to its length. If several entries carry the same field, the last one
+ * wins. The stored pointers reference the buffers owned by 'iovw', nothing is copied. Afterwards
+ * PID, UID and GID are parsed into context->pid/uid/gid, and the is_pid1/is_journald flags are derived.
+ *
+ * Returns 0 on success, or a negative errno-style value (already logged) if the PID, UID or GID field is
+ * missing or cannot be parsed. */
+static int save_context(Context *context, const struct iovec_wrapper *iovw) {
+        const char *unit;
+        int r;
+
+        assert(context);
+        assert(iovw);
+        assert(iovw->count >= _META_ARGV_MAX);
+
+        /* The context does not allocate any memory on its own */
+
+        for (size_t n = 0; n < iovw->count; n++) {
+                struct iovec *iovec = iovw->iovec + n;
+
+                /* Note that these strings are NUL terminated, because we made sure that a
+                 * trailing NUL byte is in the buffer, though not included in the iov_len
+                 * count (see process_socket() and gather_pid_metadata_*()) */
+                assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
+
+                for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
+                        const char *p = startswith(iovec->iov_base, meta_field_names[i]);
+                        if (p) {
+                                context->meta[i] = p;
+                                context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
+                                break;
+                        }
+                }
+        }
+
+        /* The fields parsed below may be absent when the metadata was received from a peer (see
+         * process_socket()), hence verify each one is there before handing it to a parser. */
+        if (!context->meta[META_ARGV_PID])
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Failed to find the PID of crashing process");
+
+        if (!context->meta[META_ARGV_UID])
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Failed to find the UID of crashing process");
+
+        if (!context->meta[META_ARGV_GID])
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Failed to find the GID of crashing process");
+
+        r = parse_pid(context->meta[META_ARGV_PID], &context->pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
+
+        r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
+
+        r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
+
+        /* The unit field is optional, hence the NULL-tolerant comparisons */
+        unit = context->meta[META_UNIT];
+        context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
+        context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
+
+        return 0;
+}
+
+/* Receives one coredump submission from the connected socket 'fd' and processes it.
+ *
+ * Every non-empty datagram is taken as one metadata field and collected in an iovec_wrapper. The
+ * final, zero-length datagram must carry the coredump file descriptor as SCM_RIGHTS control message.
+ * Once it arrived the metadata is cached via save_context(), checked for the mandatory fields and handed
+ * to submit_coredump().
+ *
+ * Returns 0 on success, a negative errno-style value on failure. All buffers received are released
+ * before returning. */
+static int process_socket(int fd) {
+        _cleanup_close_ int input_fd = -EBADF;
+        Context context = {};
+        struct iovec_wrapper iovw = {};
+        struct iovec iovec;
+        int r;
+
+        assert(fd >= 0);
+
+        log_setup();
+
+        log_debug("Processing coredump received on stdin...");
+
+        for (;;) {
+                CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
+                struct msghdr mh = {
+                        .msg_control = &control,
+                        .msg_controllen = sizeof(control),
+                        .msg_iovlen = 1,
+                };
+                ssize_t n;
+                ssize_t l;
+
+                l = next_datagram_size_fd(fd);
+                if (l < 0) {
+                        r = log_error_errno(l, "Failed to determine datagram size to read: %m");
+                        goto finish;
+                }
+
+                /* One extra byte, so that a trailing NUL can be appended below */
+                iovec.iov_len = l;
+                iovec.iov_base = malloc(l + 1);
+                if (!iovec.iov_base) {
+                        r = log_oom();
+                        goto finish;
+                }
+
+                mh.msg_iov = &iovec;
+
+                n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
+                if (n < 0)  {
+                        free(iovec.iov_base);
+                        r = log_error_errno(n, "Failed to receive datagram: %m");
+                        goto finish;
+                }
+
+                /* The final zero-length datagram carries the file descriptor and tells us
+                 * that we're done. */
+                if (n == 0) {
+                        struct cmsghdr *found;
+
+                        free(iovec.iov_base);
+
+                        found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
+                        if (!found) {
+                                cmsg_close_all(&mh);
+                                r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                    "Coredump file descriptor missing.");
+                                goto finish;
+                        }
+
+                        assert(input_fd < 0);
+                        input_fd = *CMSG_TYPED_DATA(found, int);
+                        break;
+                } else
+                        /* Regular datagrams are not supposed to carry fds, get rid of any we received */
+                        cmsg_close_all(&mh);
+
+                /* Add trailing NUL byte, in case these are strings */
+                ((char*) iovec.iov_base)[n] = 0;
+                iovec.iov_len = (size_t) n;
+
+                r = iovw_put(&iovw, iovec.iov_base, iovec.iov_len);
+                if (r < 0) {
+                        /* The buffer was not added to iovw, hence iovw_free_contents() below will not
+                         * release it: free it here instead of leaking it. */
+                        free(iovec.iov_base);
+                        r = log_error_errno(r, "Failed to store received datagram: %m");
+                        goto finish;
+                }
+        }
+
+        /* Make sure we got all data we really need */
+        assert(input_fd >= 0);
+
+        r = save_context(&context, &iovw);
+        if (r < 0)
+                goto finish;
+
+        /* Make sure we received at least all fields we need. */
+        for (int i = 0; i < _META_MANDATORY_MAX; i++)
+                if (!context.meta[i]) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                            "A mandatory argument (%i) has not been sent, aborting.",
+                                            i);
+                        goto finish;
+                }
+
+        r = submit_coredump(&context, &iovw, input_fd);
+
+finish:
+        iovw_free_contents(&iovw, true);
+        return r;
+}
+
+/* Forwards the collected metadata and the coredump fd to the coredump service.
+ *
+ * Connects a SOCK_SEQPACKET socket to /run/systemd/coredump, sends every entry of 'iovw' as a datagram of
+ * its own and finally passes 'input_fd' via send_one_fd(). A field the kernel rejects with EMSGSIZE is
+ * halved repeatedly and sent with a trailing "..." marker until it fits; the caller's iovecs are never
+ * modified.
+ *
+ * Returns 0 on success, a negative errno-style value (already logged) on failure. */
+static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        assert(iovw);
+        assert(input_fd >= 0);
+
+        fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
+        if (fd < 0)
+                return log_error_errno(errno, "Failed to create coredump socket: %m");
+
+        r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect to coredump service: %m");
+
+        for (size_t i = 0; i < iovw->count; i++) {
+                struct msghdr mh = {
+                        .msg_iov = iovw->iovec + i,
+                        .msg_iovlen = 1,
+                };
+                /* Both of these are referenced by 'mh' across several iterations of the retry loop
+                 * below, hence they must live in this scope. (A compound literal created inside the
+                 * EMSGSIZE branch would cease to exist as soon as that branch is left.) */
+                char dots[] = { '.', '.', '.' };
+                struct iovec copy[2];
+
+                for (;;) {
+                        if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
+                                break;
+
+                        if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
+                                /* This field didn't fit? That's a pity. Given that this is
+                                 * just metadata, let's truncate the field at half, and try
+                                 * again. We append three dots, in order to show that this is
+                                 * truncated. */
+
+                                if (mh.msg_iov != copy) {
+                                        /* We don't want to modify the caller's iovec, hence
+                                         * let's create our own array, consisting of two new
+                                         * iovecs, where the first is a (truncated) copy of
+                                         * what we want to send, and the second one contains
+                                         * the trailing dots. */
+                                        copy[0] = iovw->iovec[i];
+                                        copy[1] = IOVEC_MAKE(dots, sizeof(dots));
+
+                                        mh.msg_iov = copy;
+                                        mh.msg_iovlen = 2;
+                                }
+
+                                copy[0].iov_len /= 2; /* halve it, and try again */
+                                continue;
+                        }
+
+                        return log_error_errno(errno, "Failed to send coredump datagram: %m");
+                }
+        }
+
+        r = send_one_fd(fd, input_fd, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to send coredump fd: %m");
+
+        return 0;
+}
+
+/* Converts the first _META_ARGV_MAX entries of argv[] into journal fields and appends them to 'iovw',
+ * using the field names from meta_field_names[]. Two entries get special treatment: the timestamp is
+ * extended by six zeroes, and for a valid signal number an additional COREDUMP_SIGNAL_NAME= field is
+ * added (best effort). Finally the metadata is cached in 'context' via save_context().
+ *
+ * Returns a negative errno-style value if too few arguments were passed, if a field cannot be added, or
+ * if save_context() fails; otherwise the result of save_context(). */
+static int gather_pid_metadata_from_argv(
+                struct iovec_wrapper *iovw,
+                Context *context,
+                int argc, char **argv) {
+
+        _cleanup_free_ char *free_timestamp = NULL;
+
+        assert(iovw);
+        assert(context);
+
+        /* We gather all metadata that were passed via argv[] into an array of iovecs that
+         * we'll forward to the socket unit */
+
+        if (argc < _META_ARGV_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Not enough arguments passed by the kernel (%i, expected %i).",
+                                       argc, _META_ARGV_MAX);
+
+        for (int i = 0; i < _META_ARGV_MAX; i++) {
+                const char *value = argv[i];
+                int r;
+
+                if (i == META_ARGV_TIMESTAMP) {
+                        /* The journal fields contain the timestamp padded with six
+                         * zeroes, so that the kernel-supplied 1s granularity timestamps
+                         * becomes 1μs granularity, i.e. the granularity systemd usually
+                         * operates in. */
+                        free_timestamp = strjoin(argv[i], "000000");
+                        if (!free_timestamp)
+                                return log_oom();
+
+                        value = free_timestamp;
+
+                } else if (i == META_ARGV_SIGNAL) {
+                        int signo;
+
+                        /* For signal, record its pretty name too */
+                        if (safe_atoi(argv[i], &signo) >= 0 && SIGNAL_VALID(signo))
+                                (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG",
+                                                             signal_to_string(signo));
+                }
+
+                r = iovw_put_string_field(iovw, meta_field_names[i], value);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Cache some of the process metadata we collected so far and that we'll need to
+         * access soon */
+        return save_context(context, iovw);
+}
+
+/* Collects additional metadata about the process context->pid and appends it to 'iovw' as COREDUMP_*
+ * fields, then refreshes the cached metadata in 'context' via save_context().
+ *
+ * Only the COMM field is mandatory: failing to acquire or add it makes the function return a negative
+ * errno-style value. All other fields are added on a best-effort basis and are silently left out if
+ * they cannot be acquired (a failure to get EXE is logged as warning).
+ *
+ * Returns the result of save_context() otherwise. */
+static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
+        uid_t owner_uid;
+        pid_t pid;
+        char *t;
+        size_t size;
+        const char *p;
+        int r;
+
+        assert(iovw);
+        assert(context);
+
+        /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
+         * structure. (It remains valid, with the first iovec fields initialized.) */
+
+        pid = context->pid;
+
+        /* The following is mandatory */
+        r = pid_get_comm(pid, &t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get COMM: %m");
+
+        r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
+        if (r < 0)
+                return r;
+
+        /* The following are optional, but we use them if present. */
+        r = get_process_exe(pid, &t);
+        if (r >= 0)
+                r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
+        if (r < 0)
+                log_warning_errno(r, "Failed to get EXE, ignoring: %m");
+
+        if (cg_pid_get_unit(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
+
+        if (cg_pid_get_user_unit(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
+
+        if (sd_pid_get_session(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
+
+        if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) {
+                /* asprintf() returns the number of characters written, or a negative value on failure */
+                r = asprintf(&t, UID_FMT, owner_uid);
+                if (r > 0)
+                        (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
+        }
+
+        if (sd_pid_get_slice(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
+
+        if (pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
+
+        if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
+
+        if (compose_open_fds(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
+
+        /* The following fields are verbatim copies of the respective /proc/[pid]/ files */
+        p = procfs_file_alloca(pid, "status");
+        if (read_full_virtual_file(p, &t, NULL) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
+
+        p = procfs_file_alloca(pid, "maps");
+        if (read_full_virtual_file(p, &t, NULL) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
+
+        p = procfs_file_alloca(pid, "limits");
+        if (read_full_virtual_file(p, &t, NULL) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
+
+        p = procfs_file_alloca(pid, "cgroup");
+        if (read_full_virtual_file(p, &t, NULL) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
+
+        p = procfs_file_alloca(pid, "mountinfo");
+        if (read_full_virtual_file(p, &t, NULL) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
+
+        /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
+        p = procfs_file_alloca(pid, "auxv");
+        if (read_full_virtual_file(p, &t, &size) >= 0) {
+                /* This data is handled by explicit size rather than as a string, hence the field is
+                 * assembled by hand: prefix, then 'size' bytes of payload, then one terminating byte. */
+                char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
+                if (buf) {
+                        /* Add a dummy terminator to make save_context() happy. */
+                        *((uint8_t*) mempcpy(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size)) = '\0';
+                        (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
+                }
+
+                free(t);
+        }
+
+        if (get_process_cwd(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
+
+        if (get_process_root(pid, &t) >= 0) {
+                bool proc_self_root_is_slash;
+
+                /* Evaluate this before 't' is handed over to iovw_put_string_field_free() below */
+                proc_self_root_is_slash = strcmp(t, "/") == 0;
+
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
+
+                /* If the process' root is "/", then there is a chance it has
+                 * mounted own root and hence being containerized. */
+                if (proc_self_root_is_slash && get_process_container_parent_cmdline(pid, &t) > 0)
+                        (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
+        }
+
+        if (get_process_environ(pid, &t) >= 0)
+                (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
+
+        /* we successfully acquired all metadata */
+        return save_context(context, iovw);
+}
+
+/* Sends a message on transport_fd that carries no payload, only an SCM_CREDENTIALS control message
+ * holding a copy of *ucred. Returns the (non-negative) result of sendmsg() on success and a negative
+ * errno-style value if sendmsg() failed. */
+static int send_ucred(int transport_fd, struct ucred *ucred) {
+        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) cbuf = {};
+        struct msghdr msg = {
+                .msg_control = &cbuf,
+                .msg_controllen = sizeof(cbuf),
+        };
+        struct cmsghdr *hdr;
+        ssize_t sent;
+
+        assert(transport_fd >= 0);
+
+        /* The control buffer is zero-initialized above, hence only the header fields need to be set. */
+        hdr = CMSG_FIRSTHDR(&msg);
+        hdr->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+        hdr->cmsg_level = SOL_SOCKET;
+        hdr->cmsg_type = SCM_CREDENTIALS;
+        memcpy(CMSG_DATA(hdr), ucred, sizeof(struct ucred));
+
+        sent = sendmsg(transport_fd, &msg, MSG_NOSIGNAL);
+        return RET_NERRNO(sent);
+}
+
+/* Receives one message from transport_fd and copies the SCM_CREDENTIALS data attached to it into
+ * *ret_ucred. Returns 0 on success, the negative value reported by recvmsg_safe() if receiving failed,
+ * and -EIO if no credentials control message of the expected size was attached. */
+static int receive_ucred(int transport_fd, struct ucred *ret_ucred) {
+        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) cbuf = {};
+        struct msghdr msg = {
+                .msg_control = &cbuf,
+                .msg_controllen = sizeof(cbuf),
+        };
+        struct ucred *found = NULL;
+        struct cmsghdr *hdr = NULL;
+        ssize_t received;
+
+        assert(ret_ucred);
+
+        received = recvmsg_safe(transport_fd, &msg, 0);
+        if (received < 0)
+                return received;
+
+        CMSG_FOREACH(hdr, &msg) {
+                /* Skip everything that is not a credentials message of exactly the right size. */
+                if (hdr->cmsg_level != SOL_SOCKET)
+                        continue;
+                if (hdr->cmsg_type != SCM_CREDENTIALS)
+                        continue;
+                if (hdr->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
+                        continue;
+
+                /* A second matching control message in the same datagram is not expected. */
+                assert(!found);
+                found = CMSG_TYPED_DATA(hdr, struct ucred);
+        }
+
+        if (!found)
+                return -EIO;
+
+        *ret_ucred = *found;
+        return 0;
+}
+
+/* Checks whether a coredump may be forwarded to the cgroup the given process lives in. The unit path is
+ * derived from the parent directory of the process' cgroup path. Returns 0 (false) if that path contains
+ * no valid unit or the unit is not delegated, a negative errno-style value on failure, and otherwise
+ * whatever cg_has_coredump_receive() reports for the unit. */
+static int can_forward_coredump(pid_t pid) {
+        _cleanup_free_ char *cg = NULL, *parent = NULL, *unit_path = NULL;
+        int r;
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cg);
+        if (r < 0)
+                return r;
+
+        r = path_extract_directory(cg, &parent);
+        if (r < 0)
+                return r;
+
+        r = cg_path_get_unit_path(parent, &unit_path);
+        if (r == -ENXIO)
+                return false; /* there is no valid unit anywhere in this path */
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0)
+                return r;
+
+        /* Forwarding requires both Delegate=yes and CoredumpReceive=yes on the unit; only ask about the
+         * latter once the former has been confirmed. */
+        r = cg_is_delegated(unit_path);
+        if (r > 0)
+                r = cg_has_coredump_receive(unit_path);
+
+        return r;
+}
+
+static int forward_coredump_to_container(Context *context) {
+        _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF;
+        _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+        pid_t pid, child;
+        struct ucred ucred = {
+                .pid = context->pid,
+                .uid = context->uid,
+                .gid = context->gid,
+        };
+        int r;
+        /* Forwards the crash described by 'context' into the namespaces of the leader of the crashing
+         * process' PID namespace: a child is forked into those namespaces, rebuilds the metadata there
+         * and passes it together with STDIN_FILENO to send_iovec().
+         *
+         * Returns 0 if the child exited with EXIT_SUCCESS and a negative errno-style value otherwise
+         * (-ENOENT if can_forward_coredump() returned 0, -EPROTO if the child reported failure). */
+        r = namespace_get_leader(context->pid, NAMESPACE_PID, &pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get namespace leader: %m");
+        /* From here on 'pid' is the namespace leader, not the crashing process. */
+        r = can_forward_coredump(pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m");
+        if (r == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Coredump will not be forwarded because no target cgroup was found.");
+        /* Datagram socket pair used to pass the credentials to the child: pair[0] is the sending end kept
+         * by the parent, pair[1] the receiving end used by the child (hence SO_PASSCRED on pair[1]). */
+        r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create socket pair: %m");
+
+        r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to set SO_PASSCRED: %m");
+
+        r = namespace_open(pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to join namespaces of PID " PID_FMT ": %m", pid);
+
+        r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0,
+                           FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
+                           pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", pid);
+        if (r == 0) { /* Child side: every path through this block ends in _exit(). */
+                _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
+                Context child_context = {};
+                /* The child only receives, so drop the sending end. */
+                pair[0] = safe_close(pair[0]);
+                /* Give up early if /run/systemd/coredump is not accessible for writing from in here. */
+                if (laccess("/run/systemd/coredump", W_OK) < 0) {
+                        log_debug_errno(errno, "Cannot find coredump socket, exiting: %m");
+                        _exit(EXIT_FAILURE);
+                }
+                /* Overwrites the local copy of 'ucred' with the values received over the socket pair. */
+                r = receive_ucred(pair[1], &ucred);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to receive ucred and fd: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                iovw = iovw_new();
+                if (!iovw) {
+                        log_oom();
+                        _exit(EXIT_FAILURE);
+                }
+
+                (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
+                (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
+                (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1");
+                /* Re-emit every argv-derived field. PID, UID and GID are replaced by the values from the
+                 * received ucred; for the signal an extra COREDUMP_SIGNAL_NAME= field is added if the
+                 * number parses and is valid. All other fields are copied from context->meta[] as is. */
+                for (int i = 0; i < _META_ARGV_MAX; i++) {
+                        int signo;
+                        char buf[DECIMAL_STR_MAX(pid_t)]; /* also used for the UID and GID strings below */
+                        const char *t = context->meta[i];
+
+                        switch(i) {
+
+                        case META_ARGV_PID:
+                                xsprintf(buf, PID_FMT, ucred.pid);
+                                t = buf;
+
+                                break;
+
+                        case META_ARGV_UID:
+                                xsprintf(buf, UID_FMT, ucred.uid);
+                                t = buf;
+                                break;
+
+                        case META_ARGV_GID:
+                                xsprintf(buf, GID_FMT, ucred.gid);
+                                t = buf;
+                                break;
+
+                        case META_ARGV_SIGNAL:
+                                if (safe_atoi(t, &signo) >= 0 && SIGNAL_VALID(signo))
+                                        (void) iovw_put_string_field(iovw,
+                                                                     "COREDUMP_SIGNAL_NAME=SIG",
+                                                                     signal_to_string(signo));
+                                break;
+
+                        default:
+                                break;
+                        }
+
+                        r = iovw_put_string_field(iovw, meta_field_names[i], t);
+                        if (r < 0) {
+                                log_debug_errno(r, "Failed to construct iovec: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+                }
+                /* Fill child_context from the fields collected so far, then add the procfs metadata. */
+                r = save_context(&child_context, iovw);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to save context: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = gather_pid_metadata_from_procfs(iovw, &child_context);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to gather metadata from procfs: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = send_iovec(iovw, STDIN_FILENO);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to send iovec to coredump socket: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+        /* Parent side: only the sending end is needed from here on. */
+        pair[1] = safe_close(pair[1]);
+
+        /* We need to translate the PID, UID, and GID of the crashing process
+         * to the container's namespaces. Do this by sending an SCM_CREDENTIALS
+         * message on a socket pair, and read the result when we join the
+         * container. The kernel will perform the translation for us. */
+        r = send_ucred(pair[0], &ucred);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to send metadata to container: %m");
+        /* Any exit status other than EXIT_SUCCESS is reported to the caller as -EPROTO. */
+        r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to wait for child to terminate: %m");
+        if (r != EXIT_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container: %m");
+
+        return 0;
+}
+
+/* Handles the case where the kernel invoked us as coredump handler: the process metadata is passed in
+ * argv[] (everything after argv[0]) and STDIN_FILENO is handed on to the submission helpers. Returns 0
+ * if the coredump was forwarded to a container, otherwise the result of submit_coredump() or
+ * send_iovec(), or a negative errno-style value if an earlier step failed. */
+static int process_kernel(int argc, char* argv[]) {
+        _cleanup_(iovw_free_freep) struct iovec_wrapper *fields = NULL;
+        Context context = {};
+        int r;
+
+        /* The kernel starts us with stdout/stderr closed. That is dangerous, since those fd numbers
+         * could get reused for something else, which leads to hard to debug issues. Hence connect them
+         * to /dev/null first. */
+        r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
+
+        log_debug("Processing coredump received from the kernel...");
+
+        fields = iovw_new();
+        if (!fields)
+                return log_oom();
+
+        /* First the metadata the kernel handed us via argv[] ... */
+        r = gather_pid_metadata_from_argv(fields, &context, argc - 1, argv + 1);
+        if (r < 0)
+                return r;
+
+        /* ... then everything else we can acquire at runtime. */
+        r = gather_pid_metadata_from_procfs(fields, &context);
+        if (r < 0)
+                return r;
+
+        /* Once we know it is not the journal that crashed it is fine to log to it. */
+        if (!context.is_journald)
+                log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG);
+
+        r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID);
+        if (r < 0)
+                log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
+        else if (r == 0 && forward_coredump_to_container(&context) >= 0)
+                /* The crash happened in a different PID namespace and forwarding worked, we are done.
+                 * If forwarding failed we continue below with the old behavior, so that there is
+                 * still some record of the crash. */
+                return 0;
+
+        /* If PID 1 crashed, turn off coredump collection: it is unlikely we can process further ones.
+         *
+         * FIXME: maybe coredump generation should be disabled from the beginning and only be re-enabled
+         * once we know it is either safe (i.e. we are not running OOM) or it is not PID 1? */
+        if (context.is_pid1) {
+                log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
+                disable_coredumps();
+        }
+
+        (void) iovw_put_string_field(fields, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
+        (void) iovw_put_string_field(fields, "PRIORITY=", STRINGIFY(LOG_CRIT));
+
+        /* Crashes of anything but journald and PID 1 go through send_iovec(); those two are submitted
+         * right from here. */
+        if (!context.is_journald && !context.is_pid1)
+                return send_iovec(fields, STDIN_FILENO);
+
+        return submit_coredump(&context, fields, STDIN_FILENO);
+}
+
+/* Handles the '--backtrace' mode: process metadata comes from argv[] (after the option itself) and a
+ * journal entry is read from stdin. If stdin ends before a complete entry was read, a generic MESSAGE=
+ * is synthesized instead. The resulting fields are logged with sd_journal_sendv(). Returns 0 on success,
+ * a negative errno-style value on failure. */
+static int process_backtrace(int argc, char *argv[]) {
+        _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
+        _cleanup_(iovw_free_freep) struct iovec_wrapper *fields = NULL;
+        Context context = {};
+        int r;
+
+        log_debug("Processing backtrace on stdin...");
+
+        fields = iovw_new();
+        if (!fields)
+                return log_oom();
+
+        (void) iovw_put_string_field(fields, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
+        (void) iovw_put_string_field(fields, "PRIORITY=", STRINGIFY(LOG_CRIT));
+
+        /* argv[1] is '--backtrace' itself, the process metadata starts right after it. */
+        r = gather_pid_metadata_from_argv(fields, &context, argc - 2, argv + 2);
+        if (r < 0)
+                return r;
+
+        /* The remaining metadata is acquired at runtime. */
+        r = gather_pid_metadata_from_procfs(fields, &context);
+        if (r < 0)
+                return r;
+
+        /* Keep feeding the importer until it has a complete entry (r == 1) or stdin is exhausted. */
+        do {
+                r = journal_importer_process_data(&importer);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse journal entry on stdin: %m");
+        } while (r != 1 && !journal_importer_eof(&importer));
+
+        if (!journal_importer_eof(&importer)) {
+                /* We do not own the imported iovecs and must not free them, hence copy them and merge
+                 * them at the end of our array. */
+                r = iovw_append(fields, &importer.iovw);
+                if (r < 0)
+                        return r;
+        } else {
+                char *message;
+
+                log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
+
+                message = strjoina("Process ", context.meta[META_ARGV_PID],
+                                   " (", context.meta[META_COMM], ")"
+                                   " of user ", context.meta[META_ARGV_UID],
+                                   " failed with ", context.meta[META_ARGV_SIGNAL]);
+
+                r = iovw_put_string_field(fields, "MESSAGE=", message);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_journal_sendv(fields->iovec, fields->count);
+        if (r < 0)
+                return log_error_errno(r, "Failed to log backtrace: %m");
+
+        return 0;
+}
+
+/* Program entry point proper. Picks one of three modes depending on how we were invoked: with no
+ * socket-activation fd we either process a backtrace ('--backtrace' as first argument) or a coredump
+ * handed over by the kernel; with exactly one fd we process a connection on that socket. */
+static int run(int argc, char *argv[]) {
+        int n_fds;
+
+        /* We do not know yet what crashed. It might be journald, which we would rather not log to in
+         * that case, hence start out logging to a safe place. */
+        log_set_target_and_open(LOG_TARGET_KMSG);
+
+        /* Never dump core ourselves, so that we cannot end up in a loop. */
+        (void) prctl(PR_SET_DUMPABLE, 0);
+
+        /* Errors while parsing the configuration are ignored. */
+        (void) parse_config();
+
+        log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
+        log_debug("Selected compression %s.", yes_no(arg_compress));
+
+        n_fds = sd_listen_fds(false);
+        if (n_fds < 0)
+                return log_error_errno(n_fds, "Failed to determine the number of file descriptors: %m");
+
+        switch (n_fds) {
+
+        case 0:
+                /* No fd was passed: we were invoked as coredump handler (or as backtrace reporter). */
+                if (streq_ptr(argv[1], "--backtrace"))
+                        return process_backtrace(argc, argv);
+
+                return process_kernel(argc, argv);
+
+        case 1:
+                /* One fd was passed: we are running in coredumpd mode. */
+                return process_socket(SD_LISTEN_FDS_START);
+
+        default:
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Received unexpected number of file descriptors.");
+        }
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/coredump/coredump.conf b/src/coredump/coredump.conf
new file mode 100644
index 0000000..ae341e4
--- /dev/null
+++ b/src/coredump/coredump.conf
@@ -0,0 +1,27 @@
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it under the
+#  terms of the GNU Lesser General Public License as published by the Free
+#  Software Foundation; either version 2.1 of the License, or (at your option)
+#  any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file (or a copy of it placed in
+# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
+# the /etc/systemd/coredump.conf.d/ directory. The latter is generally
+# recommended. Defaults can be restored by simply deleting the main
+# configuration file and all drop-ins located in /etc/.
+#
+# Use 'systemd-analyze cat-config systemd/coredump.conf' to display the full config.
+#
+# See coredump.conf(5) for details.
+
+[Coredump]
+#Storage=external
+#Compress=yes
+# On 32-bit, the default is 1G instead of 32G.
+#ProcessSizeMax=32G
+#ExternalSizeMax=32G
+#JournalSizeMax=767M
+#MaxUse=
+#KeepFree=
diff --git a/src/coredump/coredumpctl.c b/src/coredump/coredumpctl.c
new file mode 100644
index 0000000..84d4531
--- /dev/null
+++ b/src/coredump/coredumpctl.c
@@ -0,0 +1,1418 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <getopt.h>
+#include <locale.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "sd-bus.h"
+#include "sd-journal.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-util.h"
+#include "chase.h"
+#include "compress.h"
+#include "constants.h"
+#include "dissect-image.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "format-table.h"
+#include "fs-util.h"
+#include "glob-util.h"
+#include "journal-internal.h"
+#include "journal-util.h"
+#include "log.h"
+#include "macro.h"
+#include "main-func.h"
+#include "mount-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "sigbus.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+#include "verbs.h"
+
+#define SHORT_BUS_CALL_TIMEOUT_USEC (3 * USEC_PER_SEC) /* i.e. three seconds */
+/* Command line state, filled in by parse_argv(). */
+static usec_t arg_since = USEC_INFINITY, arg_until = USEC_INFINITY; /* -S/--since=, -U/--until=; USEC_INFINITY: not specified */
+static const char* arg_field = NULL; /* -F/--field=, may be given only once */
+static const char *arg_debugger = NULL; /* --debugger=, set directly from optarg */
+static char **arg_debugger_args = NULL; /* -A/--debugger-arguments=, split into a string list */
+static const char *arg_directory = NULL; /* -D/--directory=, set directly from optarg */
+static char *arg_root = NULL; /* --root=, stored via parse_path_argument() */
+static char *arg_image = NULL; /* --image=, stored via parse_path_argument() */
+static char **arg_file = NULL; /* --file=, accumulated via glob_extend() */
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; /* --json= */
+static PagerFlags arg_pager_flags = 0; /* --no-pager sets PAGER_DISABLE */
+static int arg_legend = true; /* cleared by --no-legend */
+static size_t arg_rows_max = SIZE_MAX; /* -n INT, or 1 for -1 */
+static const char* arg_output = NULL; /* -o/--output=, may be given only once */
+static bool arg_reverse = false; /* -r/--reverse; also set by -1 */
+static bool arg_quiet = false; /* -q/--quiet */
+static bool arg_all = false; /* --all: open the journal without SD_JOURNAL_LOCAL_ONLY */
+static ImagePolicy *arg_image_policy = NULL; /* --image-policy= */
+/* NOTE(review): arg_root and arg_image are non-const and filled through a char** out-parameter, but no
+ * destructor is registered for them below. Confirm whether that is intentional. */
+STATIC_DESTRUCTOR_REGISTER(arg_debugger_args, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_file, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
+
+/* Adds one user-supplied match to the journal filter. The field to match against is derived from the
+ * shape of the string: something containing '=' is used verbatim, something containing '/' is made
+ * absolute and matched as COREDUMP_EXE=, something that parses as a PID as COREDUMP_PID=, and anything
+ * else as COREDUMP_COMM=. Returns 0 on success, a negative errno-style value on failure. */
+static int add_match(sd_journal *j, const char *match) {
+        _cleanup_free_ char *abspath = NULL;
+        const char *prefix = "COREDUMP_COMM=", *expr;
+        pid_t pid;
+        int r;
+
+        if (strchr(match, '='))
+                prefix = "";
+        else if (strchr(match, '/')) {
+                r = path_make_absolute_cwd(match, &abspath);
+                if (r < 0)
+                        return log_error_errno(r, "path_make_absolute_cwd(\"%s\"): %m", match);
+
+                /* From now on operate on the absolute version of the path. */
+                match = abspath;
+                prefix = "COREDUMP_EXE=";
+        } else if (parse_pid(match, &pid) >= 0)
+                prefix = "COREDUMP_PID=";
+
+        expr = strjoina(prefix, match);
+        log_debug("Adding match: %s", expr);
+
+        r = sd_journal_add_match(j, expr, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add match \"%s\": %m", match);
+
+        return 0;
+}
+
+/* Installs the journal filter: first matches for the coredump and the backtrace message IDs, then one
+ * match per entry of 'matches' (see add_match()). Returns 0 on success, a negative errno-style value
+ * on the first failure. */
+static int add_matches(sd_journal *j, char **matches) {
+        int r;
+
+        FOREACH_STRING(id,
+                       "MESSAGE_ID=" SD_MESSAGE_COREDUMP_STR,
+                       "MESSAGE_ID=" SD_MESSAGE_BACKTRACE_STR) {
+                r = sd_journal_add_match(j, id, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add match \"%s\": %m", id);
+        }
+
+        STRV_FOREACH(m, matches) {
+                r = add_match(j, *m);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Opens the journal selected on the command line, checks access and installs the match filter. On
+ * success the journal object is handed to the caller in *ret and 0 is returned; on failure a negative
+ * errno-style value is returned and *ret is left untouched. */
+static int acquire_journal(sd_journal **ret, char **matches) {
+        _cleanup_(sd_journal_closep) sd_journal *journal = NULL;
+        int r;
+
+        assert(ret);
+
+        /* Source precedence: explicit directory, then alternate root, then explicit files, and finally
+         * the regular journal. */
+        if (arg_directory) {
+                r = sd_journal_open_directory(&journal, arg_directory, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open journals in directory: %s: %m", arg_directory);
+
+        } else if (arg_root) {
+                r = sd_journal_open_directory(&journal, arg_root, SD_JOURNAL_OS_ROOT);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open journals in root directory: %s: %m", arg_root);
+
+        } else if (arg_file) {
+                r = sd_journal_open_files(&journal, (const char**)arg_file, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open journal files: %m");
+
+        } else {
+                int open_flags = arg_all ? 0 : SD_JOURNAL_LOCAL_ONLY;
+
+                r = sd_journal_open(&journal, open_flags);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open journal: %m");
+        }
+
+        r = journal_access_check_and_warn(journal, arg_quiet, true);
+        if (r < 0)
+                return r;
+
+        r = add_matches(journal, matches);
+        if (r < 0)
+                return r;
+
+        /* Only build the textual form of the filter if it is actually going to be logged. */
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *match_string = NULL;
+
+                match_string = journal_make_match_string(journal);
+                log_debug("Journal filter: %s", match_string);
+        }
+
+        /* Pass ownership to the caller, so that the cleanup handler does not close the journal. */
+        *ret = TAKE_PTR(journal);
+        return 0;
+}
+
+static int verb_help(int argc, char **argv, void *userdata) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout. None of the three parameters is used in here. Returns 0, or
+         * the result of log_oom() if the man page link could not be generated. */
+        r = terminal_urlify_man("coredumpctl", "1", &link);
+        if (r < 0)
+                return log_oom();
+        /* Positional arguments of the format string: %1$s program name, %2$s man page link,
+         * %3$s/%4$s underline on/off (section headings), %5$s/%6$s highlight on/off (summary line). */
+        printf("%1$s [OPTIONS...] COMMAND ...\n\n"
+               "%5$sList or retrieve coredumps from the journal.%6$s\n"
+               "\n%3$sCommands:%4$s\n"
+               "  list [MATCHES...]  List available coredumps (default)\n"
+               "  info [MATCHES...]  Show detailed information about one or more coredumps\n"
+               "  dump [MATCHES...]  Print first matching coredump to stdout\n"
+               "  debug [MATCHES...] Start a debugger for the first matching coredump\n"
+               "\n%3$sOptions:%4$s\n"
+               "  -h --help                    Show this help\n"
+               "     --version                 Print version string\n"
+               "     --no-pager                Do not pipe output into a pager\n"
+               "     --no-legend               Do not print the column headers\n"
+               "     --json=pretty|short|off\n"
+               "                               Generate JSON output\n"
+               "     --debugger=DEBUGGER       Use the given debugger\n"
+               "  -A --debugger-arguments=ARGS Pass the given arguments to the debugger\n"
+               "  -n INT                       Show maximum number of rows\n"
+               "  -1                           Show information about most recent entry only\n"
+               "  -S --since=DATE              Only print coredumps since the date\n"
+               "  -U --until=DATE              Only print coredumps until the date\n"
+               "  -r --reverse                 Show the newest entries first\n"
+               "  -F --field=FIELD             List all values a certain field takes\n"
+               "  -o --output=FILE             Write output to FILE\n"
+               "     --file=PATH               Use journal file\n"
+               "  -D --directory=DIR           Use journal files from directory\n\n"
+               "  -q --quiet                   Do not show info messages and privilege warning\n"
+               "     --all                     Look at all journal files instead of local ones\n"
+               "     --root=PATH               Operate on an alternate filesystem root\n"
+               "     --image=PATH              Operate on disk image as filesystem root\n"
+               "     --image-policy=POLICY     Specify disk image dissection policy\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_VERSION = 0x100, /* codes for long-only options start above the single-character range */
+                ARG_NO_PAGER,
+                ARG_NO_LEGEND,
+                ARG_JSON,
+                ARG_DEBUGGER,
+                ARG_FILE,
+                ARG_ROOT,
+                ARG_IMAGE,
+                ARG_IMAGE_POLICY,
+                ARG_ALL,
+        };
+
+        int c, r;
+
+        static const struct option options[] = {
+                { "help",               no_argument,       NULL, 'h'              },
+                { "version" ,           no_argument,       NULL, ARG_VERSION      },
+                { "no-pager",           no_argument,       NULL, ARG_NO_PAGER     },
+                { "no-legend",          no_argument,       NULL, ARG_NO_LEGEND    },
+                { "debugger",           required_argument, NULL, ARG_DEBUGGER     },
+                { "debugger-arguments", required_argument, NULL, 'A'              },
+                { "output",             required_argument, NULL, 'o'              },
+                { "field",              required_argument, NULL, 'F'              },
+                { "file",               required_argument, NULL, ARG_FILE         },
+                { "directory",          required_argument, NULL, 'D'              },
+                { "reverse",            no_argument,       NULL, 'r'              },
+                { "since",              required_argument, NULL, 'S'              },
+                { "until",              required_argument, NULL, 'U'              },
+                { "quiet",              no_argument,       NULL, 'q'              },
+                { "json",               required_argument, NULL, ARG_JSON         },
+                { "root",               required_argument, NULL, ARG_ROOT         },
+                { "image",              required_argument, NULL, ARG_IMAGE        },
+                { "image-policy",       required_argument, NULL, ARG_IMAGE_POLICY },
+                { "all",                no_argument,       NULL, ARG_ALL          },
+                {}
+        };
+        /* Parses the command line options into the arg_* globals. Returns 1 once all options were
+         * accepted. Returns early with the result of verb_help() or version() for -h/--help and
+         * --version, and with a negative errno-style value if an option or its argument is invalid. */
+        assert(argc >= 0);
+        assert(argv);
+        /* The short option string has to be kept in sync with the cases of the switch below. */
+        while ((c = getopt_long(argc, argv, "hA:o:F:1D:rS:U:qn:", options, NULL)) >= 0)
+                switch (c) {
+                case 'h':
+                        return verb_help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_NO_LEGEND:
+                        arg_legend = false;
+                        break;
+
+                case ARG_DEBUGGER:
+                        arg_debugger = optarg; /* not copied, keeps pointing at the argument string */
+                        break;
+
+                case 'A': {
+                        _cleanup_strv_free_ char **l = NULL;
+                        r = strv_split_full(&l, optarg, WHITESPACE, EXTRACT_UNQUOTE);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse debugger arguments '%s': %m", optarg);
+                        strv_free_and_replace(arg_debugger_args, l); /* a later -A replaces an earlier one */
+                        break;
+                }
+
+                case ARG_FILE:
+                        r = glob_extend(&arg_file, optarg, GLOB_NOCHECK); /* may be given multiple times, entries accumulate */
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to add paths: %m");
+                        break;
+
+                case 'o':
+                        if (arg_output)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot set output more than once.");
+
+                        arg_output = optarg;
+                        break;
+
+                case 'S':
+                        r = parse_timestamp(optarg, &arg_since);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse timestamp '%s': %m", optarg);
+                        break;
+
+                case 'U':
+                        r = parse_timestamp(optarg, &arg_until);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse timestamp '%s': %m", optarg);
+                        break;
+
+                case 'F':
+                        if (arg_field)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot use --field/-F more than once.");
+                        arg_field = optarg;
+                        break;
+
+                case '1': /* shorthand for a single row with the newest entry first */
+                        arg_rows_max = 1;
+                        arg_reverse = true;
+                        break;
+
+                case 'n': {
+                        unsigned n;
+
+                        r = safe_atou(optarg, &n);
+                        if (r < 0 || n < 1) /* a row limit of zero is refused too */
+                                return log_error_errno(r < 0 ? r : SYNTHETIC_ERRNO(EINVAL),
+                                                       "Invalid numeric parameter to -n: %s", optarg);
+
+                        arg_rows_max = n;
+                        break;
+                }
+
+                case 'D':
+                        arg_directory = optarg;
+                        break;
+
+                case ARG_ROOT:
+                        r = parse_path_argument(optarg, false, &arg_root);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE:
+                        r = parse_path_argument(optarg, false, &arg_image);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE_POLICY:
+                        r = parse_image_policy_argument(optarg, &arg_image_policy);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case 'r':
+                        arg_reverse = true;
+                        break;
+
+                case 'q':
+                        arg_quiet = true;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0) /* NOTE(review): r == 0 presumably means the request was fully handled already; confirm */
+                                return r;
+
+                        break;
+
+                case ARG_ALL:
+                        arg_all = true;
+                        break;
+
+                case '?': /* unknown option or missing argument */
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        /* Cross-option checks that can only be done once everything has been parsed. */
+        if (arg_since != USEC_INFINITY && arg_until != USEC_INFINITY &&
+            arg_since > arg_until)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--since= must be before --until=.");
+
+        if ((!!arg_directory + !!arg_image + !!arg_root) > 1) /* at most one of the three may be set */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root=, --image= or -D/--directory=, the combination of these options is not supported.");
+
+        return 1;
+}
+
+/* Checks whether the journal data item 'data' (given with its length 'len' and laid out as
+ * "FIELD=value") belongs to field 'name'. Returns 0 if it is some other field. On a match the
+ * value part is duplicated, *var is replaced by the copy via free_and_replace(), and 1 is
+ * returned. If the duplication fails the result of log_oom() is returned. */
+static int retrieve(const void *data, size_t len, const char *name, char **var) {
+        const char *item = data;
+        size_t name_len, prefix_len;
+        char *copy;
+
+        name_len = strlen(name);
+        prefix_len = name_len + 1; /* field name plus the "=" separator */
+
+        /* Too short to hold "name=" at all. */
+        if (len < prefix_len)
+                return 0;
+
+        if (memcmp(item, name, name_len) != 0 || item[name_len] != '=')
+                return 0;
+
+        copy = strndup(item + prefix_len, len - prefix_len);
+        if (!copy)
+                return log_oom();
+
+        free_and_replace(*var, copy);
+        return 1;
+}
+
+/* Prints each value the current journal entry holds for the field named by arg_field to 'file',
+ * one per line. Returns 0, or a negative retrieve() result as soon as one call fails. */
+static int print_field(FILE* file, sd_journal *j) {
+        const void *item;
+        size_t item_size;
+
+        assert(file);
+        assert(j);
+        assert(arg_field);
+
+        /* Unlike the fields systemd-coredump itself writes (handled by the functions below, which
+         * keep a single value each), a user-chosen field may legitimately show up several times
+         * in one entry, hence every occurrence is printed. */
+        SD_JOURNAL_FOREACH_DATA(j, item, item_size) {
+                _cleanup_free_ char *value = NULL;
+                int k;
+
+                k = retrieve(item, item_size, arg_field, &value);
+                if (k < 0)
+                        return k;
+                if (k == 0)
+                        continue; /* some other field */
+                fprintf(file, "%s\n", value);
+        }
+
+        return 0;
+}
+
+#define RETRIEVE(d, l, name, arg)                    \
+        { /* loop-body helper: stash item (d, l) in 'arg' if it is field 'name' */ \
+                int _r = retrieve(d, l, name, &arg); \
+                if (_r < 0)                          \
+                        return _r; /* leaves the enclosing function on error */ \
+                if (_r > 0)                          \
+                        continue; /* matched: skip to the enclosing loop's next item */ \
+        } /* NB: wrapping this in do { } while (0) would capture the 'continue' */
+
+/* Classifies the coredump file at 'path' for display. The three outputs are always filled in:
+ *
+ *   "missing"      / ansi_grey()             / UINT64_MAX - open() failed with ENOENT
+ *   "inaccessible" / ansi_highlight_yellow() / UINT64_MAX - open() or access_fd() failed with
+ *                                                           an ERRNO_IS_PRIVILEGE() error
+ *   "present"      / NULL                    / st_size    - access_fd(R_OK) passed, regular file
+ *   "error"        / ansi_highlight_red()    / UINT64_MAX - anything else
+ */
+static void analyze_coredump_file(
+                const char *path,
+                const char **ret_state,
+                const char **ret_color,
+                uint64_t *ret_size) {
+
+        _cleanup_close_ int fd = -EBADF;
+        struct stat st;
+        int r;
+
+        assert(path);
+        assert(ret_state);
+        assert(ret_color);
+        assert(ret_size);
+
+        /* O_PATH: the file is only examined here, its contents are never read. */
+        fd = open(path, O_PATH|O_CLOEXEC);
+        if (fd < 0 && errno == ENOENT) {
+                *ret_state = "missing";
+                *ret_color = ansi_grey();
+                *ret_size = UINT64_MAX;
+                return;
+        }
+
+        /* Either the open() error, or the outcome of the read permission check. */
+        r = fd < 0 ? -errno : access_fd(fd, R_OK);
+        if (r < 0 && ERRNO_IS_PRIVILEGE(r)) {
+                *ret_state = "inaccessible";
+                *ret_color = ansi_highlight_yellow();
+                *ret_size = UINT64_MAX;
+                return;
+        }
+
+        if (r >= 0 && fstat(fd, &st) >= 0 && S_ISREG(st.st_mode)) {
+                *ret_state = "present";
+                *ret_color = NULL;
+                *ret_size = st.st_size;
+                return;
+        }
+
+        /* Some other error, fstat() failure, or not a regular file. */
+        *ret_state = "error";
+        *ret_color = ansi_highlight_red();
+        *ret_size = UINT64_MAX;
+}
+
+/* Replaces the path in *p by what chase() resolves it to below 'root'. A NULL *p is left alone
+ * and yields 0. Otherwise the chase() result is passed through: negative on failure (logged),
+ * and, because CHASE_NONEXISTENT is set, 0 if the file does not exist and 1 if it does. */
+static int resolve_filename(const char *root, char **p) {
+        char *chased = NULL;
+        int exists;
+
+        if (!*p)
+                return 0;
+
+        exists = chase(*p, root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &chased, NULL);
+        if (exists < 0)
+                return log_error_errno(exists, "Failed to resolve \"%s%s\": %m", strempty(root), *p);
+
+        /* The caller's variable takes over the resolved string. */
+        free_and_replace(*p, chased);
+        return exists;
+}
+
+/* Appends a row for the coredump in the current journal entry to 't'; 'file' is not written to.
+ * Returns 0, or a negative error (entry without identifying fields, timestamp/path/table failure). */
+static int print_list(FILE* file, sd_journal *j, Table *t) {
+        _cleanup_free_ char *mid = NULL, *pid = NULL, *uid = NULL, *gid = NULL, *sgnl = NULL,
+                *exe = NULL, *comm = NULL, *cmdline = NULL, *filename = NULL, *truncated = NULL,
+                *coredump = NULL;
+        const char *present = NULL, *color = NULL;
+        uint64_t size = UINT64_MAX;
+        uid_t uid_as_int = UID_INVALID;
+        gid_t gid_as_int = GID_INVALID;
+        pid_t pid_as_int = 0;
+        int r, signal_as_int = 0;
+        bool normal_coredump;
+        const void *d;
+        size_t l;
+        usec_t ts;
+
+        assert(file);
+        assert(j);
+        assert(t);
+
+        SD_JOURNAL_FOREACH_DATA(j, d, l) {
+                RETRIEVE(d, l, "MESSAGE_ID", mid);
+                RETRIEVE(d, l, "COREDUMP_PID", pid);
+                RETRIEVE(d, l, "COREDUMP_UID", uid);
+                RETRIEVE(d, l, "COREDUMP_GID", gid);
+                RETRIEVE(d, l, "COREDUMP_SIGNAL", sgnl);
+                RETRIEVE(d, l, "COREDUMP_EXE", exe);
+                RETRIEVE(d, l, "COREDUMP_COMM", comm);
+                RETRIEVE(d, l, "COREDUMP_CMDLINE", cmdline);
+                RETRIEVE(d, l, "COREDUMP_FILENAME", filename);
+                RETRIEVE(d, l, "COREDUMP_TRUNCATED", truncated);
+                RETRIEVE(d, l, "COREDUMP", coredump);
+        }
+
+        if (!pid && !uid && !gid && !sgnl && !exe && !comm && !cmdline && !filename)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Empty coredump log entry");
+
+        /* Fields may be missing individually: parse only those present, the others keep their defaults. */
+        if (uid)
+                (void) parse_uid(uid, &uid_as_int);
+        if (gid)
+                (void) parse_gid(gid, &gid_as_int);
+        if (pid)
+                (void) parse_pid(pid, &pid_as_int);
+        if (sgnl)
+                signal_as_int = signal_from_string(sgnl);
+
+        r = sd_journal_get_realtime_usec(j, &ts);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get realtime timestamp: %m");
+
+        normal_coredump = streq_ptr(mid, SD_MESSAGE_COREDUMP_STR);
+        if (filename) {
+                r = resolve_filename(arg_root, &filename);
+                if (r < 0)
+                        return r;
+                analyze_coredump_file(filename, &present, &color, &size);
+        } else if (coredump)
+                present = "journal";
+        else if (normal_coredump) {
+                present = "none";
+                color = ansi_grey();
+        } /* otherwise 'present' stays NULL */
+
+        if (STRPTR_IN_SET(present, "present", "journal") && truncated && parse_boolean(truncated) > 0)
+                present = "truncated";
+
+        r = table_add_many(t,
+                           TABLE_TIMESTAMP, ts,
+                           TABLE_PID, pid_as_int,
+                           TABLE_UID, uid_as_int,
+                           TABLE_GID, gid_as_int,
+                           TABLE_SIGNAL, normal_coredump ? signal_as_int : 0,
+                           TABLE_STRING, present, TABLE_SET_COLOR, color,
+                           TABLE_STRING, exe ?: comm ?: cmdline,
+                           TABLE_SIZE, size);
+        if (r < 0)
+                return table_log_add_error(r);
+        return 0;
+}
+
+/* Prints a human readable, multi-line description of the coredump in the current journal entry
+ * to 'file', preceded by an empty line if 'need_space' is set. The data threshold of 'j' is
+ * lifted (set to 0) as a side effect. Returns 0, or the negative resolve_filename() result if
+ * the path of the core file cannot be resolved, in which case the output stops before the
+ * "Storage:" line. */
+static int print_info(FILE *file, sd_journal *j, bool need_space) {
+        _cleanup_free_ char
+                *mid = NULL, *pid = NULL, *uid = NULL, *gid = NULL,
+                *sgnl = NULL, *exe = NULL, *comm = NULL, *cmdline = NULL,
+                *unit = NULL, *user_unit = NULL, *session = NULL,
+                *boot_id = NULL, *machine_id = NULL, *hostname = NULL,
+                *slice = NULL, *cgroup = NULL, *owner_uid = NULL,
+                *message = NULL, *timestamp = NULL, *filename = NULL,
+                *truncated = NULL, *coredump = NULL,
+                *pkgmeta_name = NULL, *pkgmeta_version = NULL, *pkgmeta_json = NULL;
+        const void *d;
+        size_t l;
+        bool normal_coredump;
+        int r;
+
+        assert(file);
+        assert(j);
+
+        (void) sd_journal_set_data_threshold(j, 0);
+
+        SD_JOURNAL_FOREACH_DATA(j, d, l) {
+                RETRIEVE(d, l, "MESSAGE_ID", mid);
+                RETRIEVE(d, l, "COREDUMP_PID", pid);
+                RETRIEVE(d, l, "COREDUMP_UID", uid);
+                RETRIEVE(d, l, "COREDUMP_GID", gid);
+                RETRIEVE(d, l, "COREDUMP_SIGNAL", sgnl);
+                RETRIEVE(d, l, "COREDUMP_EXE", exe);
+                RETRIEVE(d, l, "COREDUMP_COMM", comm);
+                RETRIEVE(d, l, "COREDUMP_CMDLINE", cmdline);
+                RETRIEVE(d, l, "COREDUMP_HOSTNAME", hostname);
+                RETRIEVE(d, l, "COREDUMP_UNIT", unit);
+                RETRIEVE(d, l, "COREDUMP_USER_UNIT", user_unit);
+                RETRIEVE(d, l, "COREDUMP_SESSION", session);
+                RETRIEVE(d, l, "COREDUMP_OWNER_UID", owner_uid);
+                RETRIEVE(d, l, "COREDUMP_SLICE", slice);
+                RETRIEVE(d, l, "COREDUMP_CGROUP", cgroup);
+                RETRIEVE(d, l, "COREDUMP_TIMESTAMP", timestamp);
+                RETRIEVE(d, l, "COREDUMP_FILENAME", filename);
+                RETRIEVE(d, l, "COREDUMP_TRUNCATED", truncated);
+                RETRIEVE(d, l, "COREDUMP", coredump);
+                RETRIEVE(d, l, "COREDUMP_PACKAGE_NAME", pkgmeta_name);
+                RETRIEVE(d, l, "COREDUMP_PACKAGE_VERSION", pkgmeta_version);
+                RETRIEVE(d, l, "COREDUMP_PACKAGE_JSON", pkgmeta_json);
+                RETRIEVE(d, l, "_BOOT_ID", boot_id);
+                RETRIEVE(d, l, "_MACHINE_ID", machine_id);
+                RETRIEVE(d, l, "MESSAGE", message);
+        }
+
+        if (need_space)
+                fputs("\n", file);
+
+        normal_coredump = streq_ptr(mid, SD_MESSAGE_COREDUMP_STR);
+
+        if (comm)
+                fprintf(file,
+                        "           PID: %s%s%s (%s)\n",
+                        ansi_highlight(), strna(pid), ansi_normal(), comm);
+        else
+                fprintf(file,
+                        "           PID: %s%s%s\n",
+                        ansi_highlight(), strna(pid), ansi_normal());
+
+        if (uid) {
+                _cleanup_free_ char *u = NULL;
+                uid_t n;
+
+                if (parse_uid(uid, &n) >= 0)
+                        u = uid_to_name(n);
+
+                /* No name (unparsable UID, or the lookup returned NULL): print the raw value only */
+                if (u)
+                        fprintf(file, "           UID: %s (%s)\n", uid, u);
+                else
+                        fprintf(file, "           UID: %s\n", uid);
+        }
+
+        if (gid) {
+                _cleanup_free_ char *g = NULL;
+                gid_t n;
+
+                if (parse_gid(gid, &n) >= 0)
+                        g = gid_to_name(n);
+
+                /* No name (unparsable GID, or the lookup returned NULL): print the raw value only */
+                if (g)
+                        fprintf(file, "           GID: %s (%s)\n", gid, g);
+                else
+                        fprintf(file, "           GID: %s\n", gid);
+        }
+
+        if (sgnl) {
+                int sig;
+                const char *name = normal_coredump ? "Signal" : "Reason";
+
+                if (normal_coredump && safe_atoi(sgnl, &sig) >= 0)
+                        fprintf(file, "        %s: %s (%s)\n", name, sgnl, signal_to_string(sig));
+                else
+                        fprintf(file, "        %s: %s\n", name, sgnl);
+        }
+
+        if (timestamp) {
+                usec_t u;
+
+                r = safe_atou64(timestamp, &u);
+                if (r >= 0)
+                        fprintf(file, "     Timestamp: %s (%s)\n",
+                                FORMAT_TIMESTAMP(u), FORMAT_TIMESTAMP_RELATIVE(u));
+
+                else
+                        fprintf(file, "     Timestamp: %s\n", timestamp);
+        }
+
+        if (cmdline)
+                fprintf(file, "  Command Line: %s\n", cmdline);
+        if (exe)
+                fprintf(file, "    Executable: %s%s%s\n", ansi_highlight(), exe, ansi_normal());
+        if (cgroup)
+                fprintf(file, " Control Group: %s\n", cgroup);
+        if (unit)
+                fprintf(file, "          Unit: %s\n", unit);
+        if (user_unit)
+                fprintf(file, "     User Unit: %s\n", user_unit);
+        if (slice)
+                fprintf(file, "         Slice: %s\n", slice);
+        if (session)
+                fprintf(file, "       Session: %s\n", session);
+        if (owner_uid) {
+                _cleanup_free_ char *u = NULL;
+                uid_t n;
+
+                if (parse_uid(owner_uid, &n) >= 0)
+                        u = uid_to_name(n);
+
+                /* Same as for the UID above: no "(name)" part without a name */
+                if (u)
+                        fprintf(file, "     Owner UID: %s (%s)\n", owner_uid, u);
+                else
+                        fprintf(file, "     Owner UID: %s\n", owner_uid);
+        }
+        if (boot_id)
+                fprintf(file, "       Boot ID: %s\n", boot_id);
+        if (machine_id)
+                fprintf(file, "    Machine ID: %s\n", machine_id);
+        if (hostname)
+                fprintf(file, "      Hostname: %s\n", hostname);
+
+        if (filename) {
+                r = resolve_filename(arg_root, &filename);
+                if (r < 0)
+                        return r;
+
+                const char *state = NULL, *color = NULL;
+                uint64_t size = UINT64_MAX;
+
+                analyze_coredump_file(filename, &state, &color, &size);
+
+                if (STRPTR_IN_SET(state, "present", "journal") && truncated && parse_boolean(truncated) > 0)
+                        state = "truncated";
+
+                fprintf(file,
+                        "       Storage: %s%s (%s)%s\n",
+                        strempty(color),
+                        filename,
+                        state,
+                        ansi_normal());
+
+                if (size != UINT64_MAX)
+                        fprintf(file, "  Size on Disk: %s\n", FORMAT_BYTES(size));
+
+        } else if (coredump)
+                fprintf(file, "       Storage: journal\n");
+        else
+                fprintf(file, "       Storage: none\n");
+
+        if (pkgmeta_name && pkgmeta_version)
+                fprintf(file, "       Package: %s/%s\n", pkgmeta_name, pkgmeta_version);
+
+        /* Print out the build-id of the 'main' ELF module, by matching the JSON key
+         * with the 'exe' field. */
+        if (exe && pkgmeta_json) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+                r = json_parse(pkgmeta_json, 0, &v, NULL, NULL);
+                if (r < 0) {
+                        _cleanup_free_ char *esc = cescape(pkgmeta_json);
+                        log_warning_errno(r, "json_parse on \"%s\" failed, ignoring: %m", strnull(esc));
+                } else {
+                        const char *module_name;
+                        JsonVariant *module_json;
+
+                        JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, v) {
+                                JsonVariant *build_id;
+                                const char *build_id_str;
+
+                                /* We only print the build-id for the 'main' ELF module */
+                                if (!path_equal_filename(module_name, exe))
+                                        continue;
+
+                                /* Do not assume that "buildId" holds a string: print the line only
+                                 * if one could be obtained, rather than passing NULL to %s. */
+                                build_id = json_variant_by_key(module_json, "buildId");
+                                build_id_str = build_id ? json_variant_string(build_id) : NULL;
+                                if (build_id_str)
+                                        fprintf(file, "      build-id: %s\n", build_id_str);
+
+                                break;
+                        }
+                }
+        }
+
+        if (message) {
+                _cleanup_free_ char *m = NULL;
+
+                m = strreplace(message, "\n", "\n                ");
+
+                fprintf(file, "       Message: %s\n", strstrip(m ?: message));
+        }
+
+        return 0;
+}
+
+/* Seeks 'j' to its tail and steps back onto the last entry. Returns a positive value on success;
+ * a journal error or the absence of any entry is logged and reported as a negative value. */
+static int focus(sd_journal *j) {
+        int k = sd_journal_seek_tail(j);
+
+        if (k == 0) /* seek reported 0, now move onto the entry before the tail */
+                k = sd_journal_previous(j);
+        if (k > 0)
+                return k;
+        if (k < 0)
+                return log_error_errno(k, "Failed to search journal: %m");
+        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "No match found.");
+}
+
+/* Dispatches the current journal entry to the printer for the active output mode: a table row
+ * if 't' is set, else the values of arg_field if that is set, else the full description
+ * (separated from a preceding one by an empty line when 'n_found' > 0). */
+static int print_entry(sd_journal *j, size_t n_found, Table *t) {
+        assert(j);
+
+        if (t)
+                return print_list(stdout, j, t);
+
+        if (arg_field)
+                return print_field(stdout, j);
+
+        return print_info(stdout, j, n_found > 0);
+}
+
+/* Verb handler that walks the matching journal entries and hands each one to print_entry();
+ * table rows collected that way are printed at the end. Returns 0 on success, -ESRCH if the
+ * walk printed nothing (unless --field is in use), or another negative error. */
+static int dump_list(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_journal_closep) sd_journal *j = NULL;
+        _cleanup_(table_unrefp) Table *t = NULL;
+        bool verb_is_info, newest_only;
+        size_t n_found = 0;
+        int r;
+
+        verb_is_info = argc >= 1 && streq(argv[0], "info");
+
+        r = acquire_journal(&j, argv + 1);
+        if (r < 0)
+                return r;
+
+        /* Listing does not need the (likely compressed) cores decompressed, so keep the data
+         * threshold fairly low here. */
+        (void) sd_journal_set_data_threshold(j, 4096);
+
+        if (verb_is_info || arg_field)
+                pager_open(arg_pager_flags);
+        else {
+                /* Tabular output: rows are collected first and printed in one go further down. */
+                t = table_new("time", "pid", "uid", "gid", "sig", "corefile", "exe", "size");
+                if (!t)
+                        return log_oom();
+
+                (void) table_set_align_percent(t, TABLE_HEADER_CELL(1), 100);
+                (void) table_set_align_percent(t, TABLE_HEADER_CELL(2), 100);
+                (void) table_set_align_percent(t, TABLE_HEADER_CELL(3), 100);
+                (void) table_set_align_percent(t, TABLE_HEADER_CELL(7), 100);
+
+                table_set_ersatz_string(t, TABLE_ERSATZ_DASH);
+        }
+
+        /* One row in reverse order was requested, or "info" is used without pattern (implies "-1"). */
+        newest_only = (arg_rows_max == 1 && arg_reverse) || (verb_is_info && argc == 1);
+
+        if (newest_only) {
+                r = focus(j);
+                if (r < 0)
+                        return r;
+
+                r = print_entry(j, 0, t);
+                if (r < 0)
+                        return r;
+        } else {
+                /* A forward walk starts at --since and skips what lies after --until; a reverse
+                 * walk starts at --until and skips what lies before --since. */
+                usec_t start = arg_reverse ? arg_until : arg_since,
+                        limit = arg_reverse ? arg_since : arg_until;
+
+                if (start != USEC_INFINITY)
+                        r = sd_journal_seek_realtime_usec(j, start);
+                else if (arg_reverse)
+                        r = sd_journal_seek_tail(j);
+                else
+                        r = sd_journal_seek_head(j);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to seek to date: %m");
+
+                for (;;) {
+                        r = arg_reverse ? sd_journal_previous(j) : sd_journal_next(j);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to iterate through journal: %m");
+                        if (r == 0)
+                                break;
+
+                        if (limit != USEC_INFINITY) {
+                                usec_t usec;
+
+                                r = sd_journal_get_realtime_usec(j, &usec);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to determine timestamp: %m");
+
+                                /* Entries on the far side of the limit are skipped, not counted. */
+                                if (arg_reverse ? usec < limit : usec > limit)
+                                        continue;
+                        }
+
+                        r = print_entry(j, n_found++, t);
+                        if (r < 0)
+                                return r;
+
+                        if (arg_rows_max != SIZE_MAX && n_found >= arg_rows_max)
+                                break;
+                }
+
+                if (!arg_field && n_found == 0) {
+                        if (!arg_quiet)
+                                log_notice("No coredumps found.");
+                        return -ESRCH;
+                }
+        }
+
+        if (!t)
+                return 0;
+
+        r = table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Writes the core of the current journal entry of 'j' to 'file' if that is set, to stdout if
+ * neither 'file' nor 'path' is set (refused with ENOTTY on a tty), or makes it available under
+ * a file name returned in *path. The latter is either the uncompressed core on disk itself, or
+ * a temporary file created here, in which case *unlink_temp is set to true. Returns 0 on
+ * success; on failure a negative error is returned, a temporary file created here is removed
+ * again and *path is not modified. */
+static int save_core(sd_journal *j, FILE *file, char **path, bool *unlink_temp) {
+        _cleanup_free_ char *filename = NULL, *temp = NULL;
+        _cleanup_close_ int fdt = -EBADF;
+        const char *data;
+        size_t len;
+        int r, fd;
+
+        assert(!(file && path));         /* At most one can be specified */
+        assert(!!path == !!unlink_temp); /* Those must be specified together */
+
+        /* Look for a coredump on disk first. */
+        r = sd_journal_get_data(j, "COREDUMP_FILENAME", (const void**) &data, &len);
+        if (r == 0) {
+                _cleanup_free_ char *resolved = NULL;
+
+                r = retrieve(data, len, "COREDUMP_FILENAME", &filename);
+                if (r < 0)
+                        return r;
+                assert(r > 0);
+
+                r = chase_and_access(filename, arg_root, CHASE_PREFIX_ROOT, F_OK, &resolved);
+                if (r < 0)
+                        return log_error_errno(r, "Cannot access \"%s%s\": %m", strempty(arg_root), filename);
+
+                free_and_replace(filename, resolved);
+
+                if (path && !ENDSWITH_SET(filename, ".xz", ".lz4", ".zst")) {
+                        *path = TAKE_PTR(filename);
+                        return 0;
+                }
+
+        } else {
+                if (r != -ENOENT)
+                        return log_error_errno(r, "Failed to retrieve COREDUMP_FILENAME field: %m");
+                /* Check that we can have a COREDUMP field. We still haven't set a high
+                 * data threshold, so we'll get a few kilobytes at most.
+                 */
+
+                r = sd_journal_get_data(j, "COREDUMP", (const void**) &data, &len);
+                if (r == -ENOENT)
+                        return log_error_errno(r, "Coredump entry has no core attached (neither internally in the journal nor externally on disk).");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to retrieve COREDUMP field: %m");
+        }
+
+        if (path) {
+                const char *vt;
+
+                /* Create a temporary file to write the uncompressed core to. */
+                r = var_tmp_dir(&vt);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire temporary directory path: %m");
+
+                temp = path_join(vt, "coredump-XXXXXX");
+                if (!temp)
+                        return log_oom();
+
+                /* If this fails there is no file to remove; 'temp' is freed by its cleanup handler. */
+                fdt = mkostemp_safe(temp);
+                if (fdt < 0)
+                        return log_error_errno(fdt, "Failed to create temporary file: %m");
+                log_debug("Created temporary file %s", temp);
+
+                fd = fdt;
+        } else {
+                /* Neither 'path' nor 'file' means stdout, which must not be a tty. By now we know
+                 * that the file exists or that the journal entry may carry a COREDUMP= field, so
+                 * we stop here before any "real" work (decompressing, reading) has been done. */
+                if (!file) {
+                        if (on_tty())
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY),
+                                                       "Refusing to dump core to tty"
+                                                       " (use shell redirection or specify --output).");
+                        file = stdout;
+                }
+
+                fd = fileno(file);
+        }
+
+        if (filename) {
+#if HAVE_COMPRESSION
+                _cleanup_close_ int fdf = open(filename, O_RDONLY | O_CLOEXEC);
+                if (fdf < 0) {
+                        r = log_error_errno(errno, "Failed to open %s: %m", filename);
+                        goto error;
+                }
+
+                r = decompress_stream(filename, fdf, fd, -1);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to decompress %s: %m", filename);
+                        goto error;
+                }
+#else
+                r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                    "Cannot decompress file. Compiled without compression support.");
+                goto error;
+#endif
+        } else {
+                ssize_t sz;
+
+                /* We want full data, nothing truncated. */
+                sd_journal_set_data_threshold(j, 0);
+
+                /* Failures from here on must go through 'error' to remove a temporary file again. */
+                r = sd_journal_get_data(j, "COREDUMP", (const void**) &data, &len);
+                if (r < 0) {
+                        r = log_error_errno(r, "Failed to retrieve COREDUMP field: %m");
+                        goto error;
+                }
+
+                assert(len >= STRLEN("COREDUMP="));
+                data += STRLEN("COREDUMP=");
+                len -= STRLEN("COREDUMP=");
+
+                sz = write(fd, data, len);
+                if (sz < 0) {
+                        r = log_error_errno(errno, "Failed to write output: %m");
+                        goto error;
+                }
+                if (sz != (ssize_t) len) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write to output.");
+                        goto error;
+                }
+        }
+
+        if (temp) {
+                *path = TAKE_PTR(temp); /* the caller now owns the name of the temporary file */
+                *unlink_temp = true;
+        }
+        return 0;
+
+error:
+        if (temp) {
+                (void) unlink(temp);
+                log_debug("Removed temporary file %s", temp);
+        }
+        return r;
+}
+
+/* Prints the description of the newest matching coredump, then writes its core to the file
+ * named by arg_output, or through save_core()'s stdout path if none is set. */
+static int dump_core(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_journal_closep) sd_journal *j = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        if (arg_field)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --field/-F only makes sense with list");
+
+        r = acquire_journal(&j, argv + 1);
+        if (r < 0)
+                return r;
+
+        r = focus(j);
+        if (r < 0)
+                return r;
+
+        if (arg_output) {
+                f = fopen(arg_output, "we");
+                if (!f)
+                        return log_error_errno(errno, "Failed to open \"%s\" for writing: %m", arg_output);
+        }
+
+        /* Without an output file the core goes to stdout, so the description goes to stderr. */
+        print_info(f ? stdout : stderr, j, false);
+
+        r = save_core(j, f, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        if (sd_journal_previous(j) > 0 && !arg_quiet)
+                log_notice("More than one entry matches, ignoring rest.");
+        return 0;
+}
+
+/* SIGTERM handler: passes a SIGTERM sent by another process on to our whole process group. */
+static void sigterm_handler(int signal, siginfo_t *info, void *ucontext) {
+        assert(signal == SIGTERM);
+        assert(info);
+        /* Nothing to do without a valid sender PID, or if we sent the signal ourselves. */
+        if (!pid_is_valid(info->si_pid) || info->si_pid == getpid_cached())
+                return;
+        (void) kill(0, signal);
+}
+
+/* Handler for the "debug" and "gdb" verbs (see the verb table in coredumpctl_main()). Builds a debugger
+ * command line from arg_debugger (falling back to $SYSTEMD_DEBUGGER, then "gdb") and arg_debugger_args,
+ * appends the executable named in the COREDUMP_EXE= journal field plus "-c <core file>", and runs it in a
+ * forked child, waiting for it to terminate.
+ *
+ * argv + 1 is handed to acquire_journal(); argc and userdata are not used here.
+ *
+ * Returns a negative errno-style value on failure, otherwise whatever wait_for_terminate_and_check()
+ * reports for the debugger process. */
+static int run_debug(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_journal_closep) sd_journal *j = NULL;
+        _cleanup_free_ char *exe = NULL, *path = NULL;
+        _cleanup_strv_free_ char **debugger_call = NULL;
+        struct sigaction sa = {
+                .sa_sigaction = sigterm_handler,
+                .sa_flags = SA_SIGINFO,
+        };
+        bool unlink_path = false;
+        const char *data, *fork_name;
+        size_t len;
+        pid_t pid;
+        int r;
+
+        if (!arg_debugger) {
+                char *env_debugger;
+
+                env_debugger = getenv("SYSTEMD_DEBUGGER");
+                if (env_debugger)
+                        arg_debugger = env_debugger;
+                else
+                        arg_debugger = "gdb";
+        }
+
+        r = strv_extend(&debugger_call, arg_debugger);
+        if (r < 0)
+                return log_oom();
+
+        r = strv_extend_strv(&debugger_call, arg_debugger_args, false);
+        if (r < 0)
+                return log_oom();
+
+        if (arg_field)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Option --field/-F only makes sense with list");
+
+        r = acquire_journal(&j, argv + 1);
+        if (r < 0)
+                return r;
+
+        r = focus(j);
+        if (r < 0)
+                return r;
+
+        if (!arg_quiet) {
+                print_info(stdout, j, false);
+                fputs("\n", stdout);
+        }
+
+        r = sd_journal_get_data(j, "COREDUMP_EXE", (const void**) &data, &len);
+        if (r < 0)
+                return log_error_errno(r, "Failed to retrieve COREDUMP_EXE field: %m");
+
+        /* The returned data includes the "COREDUMP_EXE=" prefix, skip over it. */
+        assert(len > STRLEN("COREDUMP_EXE="));
+        data += STRLEN("COREDUMP_EXE=");
+        len -= STRLEN("COREDUMP_EXE=");
+
+        exe = strndup(data, len);
+        if (!exe)
+                return log_oom();
+
+        if (endswith(exe, " (deleted)"))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Binary already deleted.");
+
+        if (!path_is_absolute(exe))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Binary is not an absolute path.");
+
+        r = resolve_filename(arg_root, &exe);
+        if (r < 0)
+                return r;
+
+        r = save_core(j, NULL, &path, &unlink_path);
+        if (r < 0)
+                return r;
+
+        /* From here on 'path' may refer to a temporary file we are responsible for (unlink_path), hence
+         * every error path has to leave via the 'cleanup' label so that the file is removed again. */
+
+        r = strv_extend_strv(&debugger_call, STRV_MAKE(exe, "-c", path), false);
+        if (r < 0) {
+                r = log_oom();
+                goto cleanup;
+        }
+
+        if (arg_root) {
+                /* Only gdb and lldb are told about the alternative root; other debuggers get no extra
+                 * arguments. */
+                if (streq(arg_debugger, "gdb")) {
+                        const char *sysroot_cmd;
+                        sysroot_cmd = strjoina("set sysroot ", arg_root);
+
+                        r = strv_extend_strv(&debugger_call, STRV_MAKE("-iex", sysroot_cmd), false);
+                        if (r < 0) {
+                                r = log_oom();
+                                goto cleanup;
+                        }
+                } else if (streq(arg_debugger, "lldb")) {
+                        const char *sysroot_cmd;
+                        sysroot_cmd = strjoina("platform select --sysroot ", arg_root, " host");
+
+                        r = strv_extend_strv(&debugger_call, STRV_MAKE("-O", sysroot_cmd), false);
+                        if (r < 0) {
+                                r = log_oom();
+                                goto cleanup;
+                        }
+                }
+        }
+
+        /* Don't interfere with gdb and its handling of SIGINT. */
+        (void) ignore_signals(SIGINT);
+        (void) sigaction(SIGTERM, &sa, NULL);
+
+        fork_name = strjoina("(", debugger_call[0], ")");
+
+        r = safe_fork(fork_name, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG|FORK_FLUSH_STDIO, &pid);
+        if (r < 0)
+                goto finish;
+        if (r == 0) {
+                /* Child: only returns from execvp() on failure. */
+                execvp(debugger_call[0], debugger_call);
+                log_open();
+                log_error_errno(errno, "Failed to invoke %s: %m", debugger_call[0]);
+                _exit(EXIT_FAILURE);
+        }
+
+        r = wait_for_terminate_and_check(debugger_call[0], pid, WAIT_LOG_ABNORMAL);
+
+finish:
+        /* Undo the signal setup done right before forking. */
+        (void) default_signals(SIGINT, SIGTERM);
+
+cleanup:
+        if (unlink_path) {
+                log_debug("Removed temporary file %s", path);
+                (void) unlink(path);
+        }
+
+        return r;
+}
+
+/* Calls ListUnitsByPatterns on the system bus for units matching "systemd-coredump@*.service" and counts
+ * those whose state is none of "inactive", "dead" or "failed".
+ *
+ * Returns that count (0 when arg_quiet is set, or when acquiring the bus fails with -ENOENT), or a
+ * negative errno-style value on any other failure. */
+static int check_units_active(void) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *request = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        const char *id, *state, *substate;
+        int n_active = 0, r;
+
+        if (arg_quiet)
+                return 0;
+
+        r = sd_bus_default_system(&bus);
+        if (r == -ENOENT) {
+                log_debug("D-Bus is not running, skipping active unit check");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire bus: %m");
+
+        r = bus_message_new_method_call(bus, &request, bus_systemd_mgr, "ListUnitsByPatterns");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        /* Two string arrays are appended: the first one is left empty, the second one carries the unit
+         * name pattern. */
+        r = sd_bus_message_append_strv(request, NULL);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_strv(request, STRV_MAKE("systemd-coredump@*.service"));
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, request, SHORT_BUS_CALL_TIMEOUT_USEC, &error, &reply);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if any systemd-coredump@.service units are running: %s",
+                                       bus_error_message(&error, r));
+
+        r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)");
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        for (;;) {
+                bool counted;
+
+                /* Of each entry only the first, fourth and fifth string are of interest here. */
+                r = sd_bus_message_read(
+                                reply, "(ssssssouso)",
+                                &id,  NULL,  NULL,  &state,  &substate,
+                                NULL,  NULL,  NULL,  NULL,  NULL);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                if (r == 0)
+                        break;
+
+                counted = !STR_IN_SET(state, "inactive", "dead", "failed");
+                log_debug("Unit %s is %s/%s, %scounting it.", id, state, substate, counted ? "" : "not ");
+                n_active += counted;
+        }
+
+        r = sd_bus_message_exit_container(reply);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        return n_active;
+}
+
+/* Hands argc/argv to dispatch_verb() together with the table of coredumpctl verbs and returns its result.
+ * "list" carries VERB_DEFAULT; "list" and "info" share dump_list(), "debug" and "gdb" share run_debug(). */
+static int coredumpctl_main(int argc, char *argv[]) {
+        static const Verb coredumpctl_verbs[] = {
+                { "list",  VERB_ANY, VERB_ANY, VERB_DEFAULT, dump_list },
+                { "info",  VERB_ANY, VERB_ANY, 0,            dump_list },
+                { "dump",  VERB_ANY, VERB_ANY, 0,            dump_core },
+                { "debug", VERB_ANY, VERB_ANY, 0,            run_debug },
+                { "gdb",   VERB_ANY, VERB_ANY, 0,            run_debug },
+                { "help",  VERB_ANY, 1,        0,            verb_help },
+                {} /* all-zero entry ends the table */
+        };
+
+        return dispatch_verb(argc, argv, coredumpctl_verbs, NULL);
+}
+
+/* Program body: sets up locale and logging, parses the command line, optionally mounts the image given
+ * in arg_image and uses its mount point as arg_root, then dispatches the verb. If coredump units were
+ * found running beforehand, a notice is printed after the verb has finished. Returns the verb's result,
+ * or the failing step's result if setup did not get that far. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+        _cleanup_(umount_and_freep) char *mounted_dir = NULL;
+        int n_running, r;
+
+        setlocale(LC_ALL, "");
+        log_setup();
+
+        /* Merging journal files may require many file descriptors, hence raise the limit. */
+        (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE);
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        sigbus_install();
+
+        /* A negative (error) result is handled like "no units running" below. */
+        n_running = check_units_active();
+
+        if (arg_image) {
+                assert(!arg_root);
+
+                r = mount_image_privately_interactively(
+                                arg_image,
+                                arg_image_policy,
+                                DISSECT_IMAGE_GENERIC_ROOT |
+                                DISSECT_IMAGE_REQUIRE_ROOT |
+                                DISSECT_IMAGE_RELAX_VAR_CHECK |
+                                DISSECT_IMAGE_VALIDATE_OS,
+                                &mounted_dir,
+                                /* ret_dir_fd= */ NULL,
+                                &loop_device);
+                if (r < 0)
+                        return r;
+
+                /* Operate on the freshly mounted image as root directory. */
+                arg_root = strdup(mounted_dir);
+                if (!arg_root)
+                        return log_oom();
+        }
+
+        r = coredumpctl_main(argc, argv);
+
+        if (n_running <= 0)
+                return r;
+
+        printf("%s-- Notice: %d systemd-coredump@.service %s, output may be incomplete.%s\n",
+               ansi_highlight_red(),
+               n_running, n_running == 1 ? "unit is running" : "units are running",
+               ansi_normal());
+
+        return r;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/coredump/meson.build b/src/coredump/meson.build
new file mode 100644
index 0000000..a699746
--- /dev/null
+++ b/src/coredump/meson.build
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Source files of the systemd-coredump executable declared below.
+systemd_coredump_sources = files(
+        'coredump.c',
+        'coredump-vacuum.c',
+)
+
+# Libraries both executables below are linked with.
+common_link_with = [
+        libshared,
+        libbasic_compress,
+]
+
+# Dependencies shared by both executables below.
+common_dependencies = [
+        liblz4,
+        libxz,
+        libzstd,
+        threads,
+]
+
+executables += [
+        # systemd-coredump: based on libexec_template, needs libacl on top of the common dependencies.
+        libexec_template + {
+                'name' : 'systemd-coredump',
+                'conditions' : ['ENABLE_COREDUMP'],
+                'sources' : systemd_coredump_sources,
+                'link_with' : common_link_with,
+                'dependencies' : common_dependencies + [libacl],
+        },
+        # coredumpctl: based on executable_template and marked 'public'.
+        executable_template + {
+                'name' : 'coredumpctl',
+                'public' : true,
+                'conditions' : ['ENABLE_COREDUMP'],
+                'sources' : files('coredumpctl.c'),
+                'link_with' : common_link_with,
+                'dependencies' : common_dependencies,
+        },
+        # test-coredump-vacuum: compiled together with coredump-vacuum.c; declared with type 'manual'.
+        test_template + {
+                'sources' : files(
+                        'test-coredump-vacuum.c',
+                        'coredump-vacuum.c',
+                ),
+                'type' : 'manual',
+        },
+]
+
+# Install coredump.conf only if ENABLE_COREDUMP is 1 and install_sysconfdir_samples is set.
+if conf.get('ENABLE_COREDUMP') == 1 and install_sysconfdir_samples
+        install_data('coredump.conf',
+                     install_dir : pkgconfigfiledir)
+endif
diff --git a/src/coredump/test-coredump-vacuum.c b/src/coredump/test-coredump-vacuum.c
new file mode 100644
index 0000000..27f8330
--- /dev/null
+++ b/src/coredump/test-coredump-vacuum.c
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdlib.h>
+
+#include "coredump-vacuum.h"
+#include "tests.h"
+
+/* Sets up debug-level test logging, runs coredump_vacuum(-1, UINT64_MAX, 70 * 1024) once and maps a
+ * negative result to EXIT_FAILURE, anything else to EXIT_SUCCESS. */
+int main(int argc, char *argv[]) {
+        int r;
+
+        test_setup_logging(LOG_DEBUG);
+
+        r = coredump_vacuum(-1, UINT64_MAX, 70 * 1024);
+
+        return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/src/creds/creds.c b/src/creds/creds.c
new file mode 100644
index 0000000..10d1171
--- /dev/null
+++ b/src/creds/creds.c
@@ -0,0 +1,967 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include "build.h"
+#include "creds-util.h"
+#include "dirent-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "json.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "missing_magic.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "terminal-util.h"
+#include "tpm2-pcr.h"
+#include "tpm2-util.h"
+#include "verbs.h"
+
+/* Conversion applied to credential data before it is written out; consumed by write_blob() and
+ * transcode(). */
+typedef enum TranscodeMode {
+        /* No conversion: write_blob() does not call transcode() at all. */
+        TRANSCODE_OFF,
+        /* Encode with base64mem_full(). */
+        TRANSCODE_BASE64,
+        /* Decode with unbase64mem_full(). */
+        TRANSCODE_UNBASE64,
+        /* Encode with hexmem(). */
+        TRANSCODE_HEX,
+        /* Decode with unhexmem_full(). */
+        TRANSCODE_UNHEX,
+        /* Number of modes; used as the size of transcode_mode_table[]. */
+        _TRANSCODE_MAX,
+        _TRANSCODE_INVALID = -EINVAL,
+} TranscodeMode;
+
+/* Settings consulted by the verbs below. The comments describe how the visible code consumes them; where
+ * they are assigned (presumably command line parsing) is not part of this section. */
+
+/* Passed to table_print_with_pager() by verb_list(); in write_blob() any value other than JSON_FORMAT_OFF
+ * makes untranscoded data get parsed and dumped as JSON. */
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF;
+/* Both passed to table_print_with_pager() by verb_list(). */
+static PagerFlags arg_pager_flags = 0;
+static bool arg_legend = true;
+/* If true, open_credential_directory() uses the fixed system credential directories rather than the ones
+ * obtained from get_credentials_dir()/get_encrypted_credentials_dir(). */
+static bool arg_system = false;
+/* Conversion applied by write_blob()/transcode(). */
+static TranscodeMode arg_transcode = TRANSCODE_OFF;
+/* 0 makes print_newline() never append a newline; any other value leaves the decision to its checks. */
+static int arg_newline = -1;
+/* Encryption parameters handed to encrypt_credential_and_warn() by verb_encrypt(); arg_tpm2_device is
+ * also handed to decrypt_credential_and_warn(), arg_tpm2_signature only to the latter. */
+static sd_id128_t arg_with_key = _CRED_AUTO;
+static const char *arg_tpm2_device = NULL;
+static uint32_t arg_tpm2_pcr_mask = UINT32_MAX;
+static char *arg_tpm2_public_key = NULL;
+static uint32_t arg_tpm2_public_key_pcr_mask = UINT32_MAX;
+static char *arg_tpm2_signature = NULL;
+/* Credential name selection in verb_encrypt()/verb_decrypt(): with arg_name_any no name is used at all,
+ * otherwise arg_name takes precedence over a name derived from the file path. */
+static const char *arg_name = NULL;
+static bool arg_name_any = false;
+/* USEC_INFINITY here means "use now(CLOCK_REALTIME)" in verb_cat(), verb_encrypt() and verb_decrypt(). */
+static usec_t arg_timestamp = USEC_INFINITY;
+/* Handed to encrypt_credential_and_warn(); unless USEC_INFINITY, verb_encrypt() refuses values earlier
+ * than the timestamp. */
+static usec_t arg_not_after = USEC_INFINITY;
+/* Makes verb_encrypt() wrap base64 output at 69 instead of 79 columns and, when writing a named
+ * credential to stdout, emit it as a "SetCredentialEncrypted=" assignment. */
+static bool arg_pretty = false;
+/* Suppresses the textual output of verb_has_tpm2(). */
+static bool arg_quiet = false;
+
+/* NOTE(review): presumably arranges for these two heap-allocated strings to be released with freep() at
+ * exit; the macro is defined elsewhere, confirm there. */
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_public_key, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_signature, freep);
+
+/* String name of each TranscodeMode value, indexed by the enum. */
+static const char* transcode_mode_table[_TRANSCODE_MAX] = {
+        [TRANSCODE_OFF] = "off",
+        [TRANSCODE_BASE64] = "base64",
+        [TRANSCODE_UNBASE64] = "unbase64",
+        [TRANSCODE_HEX] = "hex",
+        [TRANSCODE_UNHEX] = "unhex",
+};
+
+/* NOTE(review): presumably generates a file-local from-string lookup over the table above; the macro is
+ * defined elsewhere, confirm there. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(transcode_mode, TranscodeMode);
+
+/* Opens the directory holding either the encrypted or the regular credentials.
+ *
+ * With arg_system set the fixed system directories are used, otherwise the directory is obtained from
+ * get_encrypted_credentials_dir()/get_credentials_dir().
+ *
+ * Returns 1 and stores the open directory in *ret_dir (and its path in *ret_prefix, if non-NULL) on
+ * success. Returns 0 and stores NULL in both if there is no such directory, i.e. the lookup reported
+ * -ENXIO, or arg_system is set and the directory does not exist. Returns a negative errno-style value,
+ * after logging, on any other failure. */
+static int open_credential_directory(
+                bool encrypted,
+                DIR **ret_dir,
+                const char **ret_prefix) {
+
+        const char *path = NULL;
+        bool have_path = true;
+        DIR *dir = NULL;
+        int r;
+
+        assert(ret_dir);
+
+        if (arg_system)
+                /* PID 1 makes sure system credentials are reachable under these fixed paths, creating
+                 * symlinks where needed. */
+                path = encrypted ?
+                        ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY :
+                        SYSTEM_CREDENTIALS_DIRECTORY;
+        else {
+                /* Use the directories communicated to us via environment variables. */
+                r = encrypted ? get_encrypted_credentials_dir(&path) : get_credentials_dir(&path);
+                if (r == -ENXIO) /* Environment variable not set */
+                        have_path = false;
+                else if (r < 0)
+                        return log_error_errno(r, "Failed to get credentials directory: %m");
+        }
+
+        if (have_path) {
+                dir = opendir(path);
+
+                /* A missing directory only means "no credentials passed" in the system case. In the
+                 * per-service case the environment variable would point to a non-existing path, which
+                 * indicates a problem and hence is treated as an error. */
+                if (!dir && !(arg_system && errno == ENOENT))
+                        return log_error_errno(errno, "Failed to open credentials directory '%s': %m", path);
+        }
+
+        *ret_dir = dir;
+
+        if (ret_prefix)
+                *ret_prefix = dir ? path : NULL;
+
+        return dir ? 1 : 0;
+}
+
+/* Adds one row (name, security rating, size, full path) to table 't' for every valid regular-file
+ * credential found in the encrypted or the regular credentials directory.
+ *
+ * Returns 1 if the directory exists (even if it contributed no rows), 0 if there is none, and a negative
+ * errno-style value, after logging, on failure. */
+static int add_credentials_to_table(Table *t, bool encrypted) {
+        _cleanup_closedir_ DIR *d = NULL;
+        const char *prefix;
+        int r;
+
+        assert(t);
+
+        r = open_credential_directory(encrypted, &d, &prefix);
+        if (r < 0)
+                return r;
+        if (!d)
+                return 0; /* No credentials directory available */
+
+        for (;;) {
+                _cleanup_free_ char *full_path = NULL;
+                const char *rating, *rating_color = NULL;
+                _cleanup_close_ int fd = -EBADF;
+                struct dirent *entry;
+                struct stat st;
+
+                /* Reset errno so that end-of-directory can be told apart from a read error. */
+                errno = 0;
+                entry = readdir_no_dot(d);
+                if (!entry && errno != 0)
+                        return log_error_errno(errno, "Failed to read credentials directory: %m");
+                if (!entry)
+                        break;
+
+                /* Skip everything that cannot be a regular file or has no valid credential name. */
+                if (!IN_SET(entry->d_type, DT_REG, DT_UNKNOWN) || !credential_name_valid(entry->d_name))
+                        continue;
+
+                fd = openat(dirfd(d), entry->d_name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
+                if (fd < 0 && errno == ENOENT) /* Disappeared in the meantime */
+                        continue;
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open credential '%s': %m", entry->d_name);
+
+                if (fstat(fd, &st) < 0)
+                        return log_error_errno(errno, "Failed to stat credential '%s': %m", entry->d_name);
+
+                /* d_type may have been DT_UNKNOWN, hence verify the type for real. */
+                if (!S_ISREG(st.st_mode))
+                        continue;
+
+                if (encrypted) {
+                        rating = "encrypted";
+                        rating_color = ansi_highlight_green();
+                } else if ((st.st_mode & 0377) != 0) {
+                        /* Any access bit besides "readable by owner" makes the credential insecure. */
+                        rating = "insecure";
+                        rating_color = ansi_highlight_red();
+                } else {
+                        bool on_ramfs;
+
+                        r = fd_is_fs_type(fd, RAMFS_MAGIC);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to determine backing file system of '%s': %m", entry->d_name);
+
+                        /* ramfs cannot be swapped out, hence counts as "secure"; anything else is
+                         * merely "weak". */
+                        on_ramfs = r > 0;
+                        if (on_ramfs) {
+                                rating = "secure";
+                                rating_color = ansi_highlight_green();
+                        } else {
+                                rating = "weak";
+                                rating_color = ansi_highlight_yellow4();
+                        }
+                }
+
+                full_path = path_join(prefix, entry->d_name);
+                if (!full_path)
+                        return log_oom();
+
+                r = table_add_many(
+                                t,
+                                TABLE_STRING, entry->d_name,
+                                TABLE_STRING, rating,
+                                TABLE_SET_COLOR, rating_color,
+                                TABLE_SIZE, (uint64_t) st.st_size,
+                                TABLE_STRING, full_path);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        return 1; /* Credentials directory available */
+}
+
+/* Builds a table (name, secure, size, path) of the encrypted and the regular credentials and prints it.
+ * Fails with -ENXIO if neither credentials directory is available. With JSON output off and no rows
+ * beyond the header, only "No credentials" is logged. */
+static int verb_list(int argc, char **argv, void *userdata) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        int have_encrypted, have_regular;
+
+        table = table_new("name", "secure", "size", "path");
+        if (!table)
+                return log_oom();
+
+        /* Alignment of column 2 ("size") is set to 100 percent. */
+        (void) table_set_align_percent(table, table_get_cell(table, 0, 2), 100);
+
+        have_encrypted = add_credentials_to_table(table, /* encrypted= */ true);
+        if (have_encrypted < 0)
+                return have_encrypted;
+
+        have_regular = add_credentials_to_table(table, /* encrypted= */ false);
+        if (have_regular < 0)
+                return have_regular;
+
+        if (have_encrypted == 0 && have_regular == 0) {
+                if (!arg_system)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "No credentials passed. (i.e. $CREDENTIALS_DIRECTORY not set.)");
+
+                return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "No credentials passed to system.");
+        }
+
+        if ((arg_json_format_flags & JSON_FORMAT_OFF) && table_get_rows(table) <= 1) {
+                log_info("No credentials");
+                return 0;
+        }
+
+        return table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend);
+}
+
+/* Converts 'input' according to arg_transcode (which must not be TRANSCODE_OFF) and returns a newly
+ * allocated buffer in *ret_output, its size in *ret_output_size.
+ *
+ * Returns 0 or a positive value on success, a negative errno-style value on failure. For the two decoding
+ * modes -EPIPE from the decoder (uneven number of characters) is reported as -EINVAL. */
+static int transcode(
+                const void *input,
+                size_t input_size,
+                void **ret_output,
+                size_t *ret_output_size) {
+
+        assert(input);
+        assert(input_size);
+        assert(ret_output);
+        assert(ret_output_size);
+
+        if (arg_transcode == TRANSCODE_BASE64) {
+                char *encoded;
+                ssize_t n;
+
+                /* Break lines at column 79. */
+                n = base64mem_full(input, input_size, 79, &encoded);
+                if (n < 0)
+                        return n;
+
+                *ret_output = encoded;
+                *ret_output_size = n;
+                return 0;
+        }
+
+        if (arg_transcode == TRANSCODE_HEX) {
+                char *encoded;
+
+                encoded = hexmem(input, input_size);
+                if (!encoded)
+                        return -ENOMEM;
+
+                /* Every input byte becomes two hex digits. */
+                *ret_output = encoded;
+                *ret_output_size = input_size * 2;
+                return 0;
+        }
+
+        if (IN_SET(arg_transcode, TRANSCODE_UNBASE64, TRANSCODE_UNHEX)) {
+                int r;
+
+                r = arg_transcode == TRANSCODE_UNBASE64 ?
+                        unbase64mem_full(input, input_size, true, ret_output, ret_output_size) :
+                        unhexmem_full(input, input_size, true, ret_output, ret_output_size);
+
+                /* -EPIPE signals an uneven number of characters, i.e. invalid input. */
+                return r == -EPIPE ? -EINVAL : r;
+        }
+
+        assert_not_reached();
+}
+
+/* Appends a newline to 'f' after 'l' bytes of 'data' have been written, unless that is switched off
+ * (arg_newline == 0), the data already ends in a newline, or 'f' has a file descriptor that is not a
+ * tty.
+ *
+ * Returns 1 if a newline was written, 0 if not, and a negative errno-style value if writing failed. */
+static int print_newline(FILE *f, const char *data, size_t l) {
+        int out_fd;
+
+        assert(f);
+        assert(data || l == 0);
+
+        /* Explicitly disabled, or the data brings its own trailing newline? */
+        if (arg_newline == 0 || (l > 0 && data[l-1] == '\n'))
+                return 0;
+
+        /* Only terminals get the extra newline. */
+        out_fd = fileno(f);
+        if (out_fd >= 0 && isatty(out_fd) <= 0)
+                return 0;
+
+        if (fputc('\n', f) != '\n')
+                return log_error_errno(errno, "Failed to write trailing newline: %m");
+
+        return 1;
+}
+
+/* Writes 'size' bytes of credential data to 'f'.
+ *
+ * Without transcoding and with JSON output enabled, the data is parsed as JSON and dumped with
+ * arg_json_format_flags; nothing else is written in that case. Otherwise the data is transcoded if
+ * requested, written out, possibly followed by a newline (see print_newline()), and the stream is
+ * flushed.
+ *
+ * Returns 0 on success, a negative errno-style value, after logging, on failure. */
+static int write_blob(FILE *f, const void *data, size_t size) {
+        _cleanup_(erase_and_freep) void *converted = NULL;
+        bool transcoding = arg_transcode != TRANSCODE_OFF;
+        int r;
+
+        if (!transcoding && arg_json_format_flags != JSON_FORMAT_OFF) {
+                _cleanup_(erase_and_freep) char *text = NULL;
+                _cleanup_(json_variant_unrefp) JsonVariant *parsed = NULL;
+
+                /* The JSON parser wants a NUL-terminated string. */
+                r = make_cstring(data, size, MAKE_CSTRING_REFUSE_TRAILING_NUL, &text);
+                if (r < 0)
+                        return log_error_errno(r, "Unable to convert binary string to C string: %m");
+
+                r = json_parse(text, JSON_PARSE_SENSITIVE, &parsed, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse JSON: %m");
+
+                json_variant_dump(parsed, arg_json_format_flags, f, NULL);
+                return 0;
+        }
+
+        if (transcoding) {
+                r = transcode(data, size, &converted, &size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to transcode data: %m");
+
+                /* From here on operate on the converted copy, which is erased and freed on return. */
+                data = converted;
+        }
+
+        if (fwrite(data, 1, size, f) != size)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to write credential data.");
+
+        r = print_newline(f, data, size);
+        if (r < 0)
+                return r;
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to flush output: %m");
+
+        return 0;
+}
+
+/* Writes the credentials named in argv[1..] to stdout via write_blob(). Each name is looked up in the
+ * regular credentials directory first and in the encrypted one second; data found in the latter is
+ * decrypted before it is written.
+ *
+ * Invalid names, read failures and credentials found nowhere are logged and do not stop processing of
+ * the remaining names; the first such error is returned at the end. Failure to open a credentials
+ * directory, to decrypt, or to write aborts immediately with that error. Returns 0 if everything
+ * succeeded. */
+static int verb_cat(int argc, char **argv, void *userdata) {
+        usec_t timestamp;
+        int r, ret = 0;
+
+        timestamp = arg_timestamp != USEC_INFINITY ? arg_timestamp : now(CLOCK_REALTIME);
+
+        STRV_FOREACH(cn, strv_skip(argv, 1)) {
+                _cleanup_(erase_and_freep) void *data = NULL;
+                size_t size = 0;
+                int encrypted;
+
+                if (!credential_name_valid(*cn)) {
+                        log_error("Credential name '%s' is not valid.", *cn);
+                        if (ret >= 0)
+                                ret = -EINVAL;
+                        continue;
+                }
+
+                /* Look both in regular and in encrypted credentials */
+                for (encrypted = 0; encrypted < 2; encrypted++) {
+                        _cleanup_closedir_ DIR *d = NULL;
+
+                        /* open_credential_directory() logs its own failures, hence just propagate the
+                         * error instead of logging it a second time. */
+                        r = open_credential_directory(encrypted, &d, NULL);
+                        if (r < 0)
+                                return r;
+                        if (!d) /* Not set */
+                                continue;
+
+                        r = read_full_file_full(
+                                        dirfd(d), *cn,
+                                        UINT64_MAX, SIZE_MAX,
+                                        READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE,
+                                        NULL,
+                                        (char**) &data, &size);
+                        if (r == -ENOENT) /* Not found */
+                                continue;
+                        if (r >= 0) /* Found */
+                                break;
+
+                        log_error_errno(r, "Failed to read credential '%s': %m", *cn);
+                        if (ret >= 0)
+                                ret = r;
+                }
+
+                /* The loop ran to completion without hitting the "found" break. */
+                if (encrypted >= 2) { /* Found nowhere */
+                        log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Credential '%s' not set.", *cn);
+                        if (ret >= 0)
+                                ret = -ENOENT;
+
+                        continue;
+                }
+
+                /* Non-zero here means the data came from the encrypted directory. */
+                if (encrypted) {
+                        _cleanup_(erase_and_freep) void *plaintext = NULL;
+                        size_t plaintext_size;
+
+                        r = decrypt_credential_and_warn(
+                                        *cn,
+                                        timestamp,
+                                        arg_tpm2_device,
+                                        arg_tpm2_signature,
+                                        data, size,
+                                        &plaintext, &plaintext_size);
+                        if (r < 0)
+                                return r;
+
+                        /* Replace the ciphertext by the plaintext, erasing the former. */
+                        erase_and_free(data);
+                        data = TAKE_PTR(plaintext);
+                        size = plaintext_size;
+                }
+
+                r = write_blob(stdout, data, size);
+                if (r < 0)
+                        return r;
+        }
+
+        return ret;
+}
+
+/* Encrypts a credential: reads the plaintext from argv[1] (stdin if empty_or_dash() says so), encrypts
+ * it with encrypt_credential_and_warn(), base64-encodes the result and writes it to argv[2] (stdout if
+ * empty_or_dash() says so).
+ *
+ * The credential name embedded is: none if arg_name_any is set, else arg_name if set, else the file
+ * name of the output path, else none (with a warning).
+ *
+ * Returns EXIT_SUCCESS on success, a negative errno-style value, after logging, on failure. */
+static int verb_encrypt(int argc, char **argv, void *userdata) {
+        _cleanup_free_ char *base64_buf = NULL, *fname = NULL;
+        _cleanup_(erase_and_freep) char *plaintext = NULL;
+        const char *input_path, *output_path, *name;
+        _cleanup_free_ void *output = NULL;
+        size_t plaintext_size, output_size;
+        ssize_t base64_size;
+        usec_t timestamp;
+        int r;
+
+        assert(argc == 3);
+
+        input_path = empty_or_dash(argv[1]) ? NULL : argv[1];
+
+        if (input_path)
+                r = read_full_file_full(AT_FDCWD, input_path, UINT64_MAX, CREDENTIAL_SIZE_MAX, READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &plaintext, &plaintext_size);
+        else
+                r = read_full_stream_full(stdin, NULL, UINT64_MAX, CREDENTIAL_SIZE_MAX, READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER, &plaintext, &plaintext_size);
+        if (r == -E2BIG)
+                return log_error_errno(r, "Plaintext too long for credential (allowed size: %zu).", (size_t) CREDENTIAL_SIZE_MAX);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read plaintext: %m");
+
+        output_path = empty_or_dash(argv[2]) ? NULL : argv[2];
+
+        if (arg_name_any)
+                name = NULL;
+        else if (arg_name)
+                name = arg_name;
+        else if (output_path) {
+                r = path_extract_filename(output_path, &fname);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract filename from '%s': %m", output_path);
+                if (r == O_DIRECTORY)
+                        return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Path '%s' refers to directory, refusing.", output_path);
+
+                name = fname;
+        } else {
+                log_warning("No credential name specified, not embedding credential name in encrypted data. (Disable this warning with --name=)");
+                name = NULL;
+        }
+
+        timestamp = arg_timestamp != USEC_INFINITY ? arg_timestamp : now(CLOCK_REALTIME);
+
+        if (arg_not_after != USEC_INFINITY && arg_not_after < timestamp)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential is invalidated before it is valid.");
+
+        r = encrypt_credential_and_warn(
+                        arg_with_key,
+                        name,
+                        timestamp,
+                        arg_not_after,
+                        arg_tpm2_device,
+                        arg_tpm2_pcr_mask,
+                        arg_tpm2_public_key,
+                        arg_tpm2_public_key_pcr_mask,
+                        plaintext, plaintext_size,
+                        &output, &output_size);
+        if (r < 0)
+                return r;
+
+        /* Lines are kept shorter in pretty mode, since they get indented below. Log the failure here:
+         * all other error paths of this function log too, and a silent failure would leave the user
+         * without any hint. */
+        base64_size = base64mem_full(output, output_size, arg_pretty ? 69 : 79, &base64_buf);
+        if (base64_size < 0)
+                return log_error_errno((int) base64_size, "Failed to base64 encode encrypted credential: %m");
+
+        /* Pretty print makes sense only if we're printing stuff to stdout
+         * and if a cred name is provided via --name= (since we can't use
+         * the output file name as the cred name here) */
+        if (arg_pretty && !output_path && name) {
+                _cleanup_free_ char *escaped = NULL, *indented = NULL, *j = NULL;
+
+                escaped = cescape(name);
+                if (!escaped)
+                        return log_oom();
+
+                /* Turn every line break into a backslash continuation followed by indentation. */
+                indented = strreplace(base64_buf, "\n", " \\\n        ");
+                if (!indented)
+                        return log_oom();
+
+                j = strjoin("SetCredentialEncrypted=", escaped, ": \\\n        ", indented, "\n");
+                if (!j)
+                        return log_oom();
+
+                free_and_replace(base64_buf, j);
+        }
+
+        if (output_path)
+                r = write_string_file(output_path, base64_buf, WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_CREATE);
+        else
+                r = write_string_stream(stdout, base64_buf, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write result: %m");
+
+        return EXIT_SUCCESS;
+}
+
+/* Decrypts a credential: reads base64-encoded encrypted data from argv[1] (stdin if empty_or_dash()
+ * says so), decrypts it with decrypt_credential_and_warn() and writes the plaintext via write_blob() to
+ * argv[2] (stdout if absent or if empty_or_dash() says so).
+ *
+ * The credential name validated against is: none if arg_name_any is set, else arg_name if set, else the
+ * file name of the input path, else none (with a warning).
+ *
+ * Returns EXIT_SUCCESS on success, a negative errno-style value, after logging, on failure. */
+static int verb_decrypt(int argc, char **argv, void *userdata) {
+        _cleanup_(erase_and_freep) void *plaintext = NULL;
+        _cleanup_free_ char *input = NULL, *fname = NULL;
+        _cleanup_fclose_ FILE *output_file = NULL;
+        const char *input_path, *output_path, *name = NULL;
+        size_t input_size, plaintext_size;
+        usec_t timestamp;
+        FILE *f = stdout;
+        int r;
+
+        assert(IN_SET(argc, 2, 3));
+
+        input_path = empty_or_dash(argv[1]) ? NULL : argv[1];
+
+        if (!input_path)
+                r = read_full_stream_full(stdin, NULL, UINT64_MAX, CREDENTIAL_ENCRYPTED_SIZE_MAX, READ_FULL_FILE_UNBASE64|READ_FULL_FILE_FAIL_WHEN_LARGER, &input, &input_size);
+        else
+                r = read_full_file_full(AT_FDCWD, input_path, UINT64_MAX, CREDENTIAL_ENCRYPTED_SIZE_MAX, READ_FULL_FILE_UNBASE64|READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &input, &input_size);
+        if (r == -E2BIG)
+                return log_error_errno(r, "Data too long for encrypted credential (allowed size: %zu).", (size_t) CREDENTIAL_ENCRYPTED_SIZE_MAX);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read encrypted credential data: %m");
+
+        output_path = (argc < 3 || empty_or_dash(argv[2])) ? NULL : argv[2];
+
+        /* Pick the name to validate; it stays NULL if none shall be checked. */
+        if (!arg_name_any) {
+                if (arg_name)
+                        name = arg_name;
+                else if (input_path) {
+                        r = path_extract_filename(input_path, &fname);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to extract filename from '%s': %m", input_path);
+                        if (r == O_DIRECTORY)
+                                return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Path '%s' refers to directory, refusing.", input_path);
+
+                        name = fname;
+                } else
+                        log_warning("No credential name specified, not validating credential name embedded in encrypted data. (Disable this warning with --name=.)");
+        }
+
+        timestamp = arg_timestamp != USEC_INFINITY ? arg_timestamp : now(CLOCK_REALTIME);
+
+        r = decrypt_credential_and_warn(
+                        name,
+                        timestamp,
+                        arg_tpm2_device,
+                        arg_tpm2_signature,
+                        input, input_size,
+                        &plaintext, &plaintext_size);
+        if (r < 0)
+                return r;
+
+        /* Write to the given file if there is one, to stdout otherwise. */
+        if (output_path) {
+                output_file = fopen(output_path, "we");
+                if (!output_file)
+                        return log_error_errno(errno, "Failed to create output file '%s': %m", output_path);
+
+                f = output_file;
+        }
+
+        r = write_blob(f, plaintext, plaintext_size);
+        if (r < 0)
+                return r;
+
+        return EXIT_SUCCESS;
+}
+
+/* Calls get_credential_host_secret() with CREDENTIAL_SECRET_GENERATE and reports the size of the host
+ * key. Returns EXIT_SUCCESS on success, a negative errno-style value, after logging, on failure. */
+static int verb_setup(int argc, char **argv, void *userdata) {
+        size_t key_size;
+        int r;
+
+        /* Only the size is of interest here, the key itself is not requested (NULL). */
+        r = get_credential_host_secret(
+                        CREDENTIAL_SECRET_GENERATE|CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED,
+                        NULL,
+                        &key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to setup credentials host key: %m");
+
+        log_info("%zu byte credentials host key set up.", key_size);
+
+        return EXIT_SUCCESS;
+}
+
+static int verb_has_tpm2(int argc, char **argv, void *userdata) {
+        Tpm2Support support = tpm2_support();
+
+        /* "has-tpm2" verb. Unless -q/--quiet is in effect, print a one-word summary first, then one line
+         * per component, each prefixed with what plus_minus() returns for the respective flag. */
+        if (!arg_quiet) {
+                const char *summary =
+                        support == TPM2_SUPPORT_FULL ? "yes" :
+                        support == TPM2_SUPPORT_NONE ? "no" :
+                                                       "partial";
+
+                puts(summary);
+
+                printf("%sfirmware\n"
+                       "%sdriver\n"
+                       "%ssystem\n"
+                       "%ssubsystem\n"
+                       "%slibraries\n",
+                       plus_minus(support & TPM2_SUPPORT_FIRMWARE),
+                       plus_minus(support & TPM2_SUPPORT_DRIVER),
+                       plus_minus(support & TPM2_SUPPORT_SYSTEM),
+                       plus_minus(support & TPM2_SUPPORT_SUBSYSTEM),
+                       plus_minus(support & TPM2_SUPPORT_LIBRARIES));
+        }
+
+        /* The exit status is the inverted flag set: TPM2_SUPPORT_FULL maps to zero (EXIT_SUCCESS), while
+         * anything less maps to a non-zero value whose bits tell what is missing rather than what is
+         * there. This follows the convention that a process returns zero when all is good, not when
+         * all is bad. */
+        return ~support & TPM2_SUPPORT_FULL;
+}
+
+static int verb_help(int argc, char **argv, void *userdata) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout. Returns 0, or the result of log_oom() if the man page link cannot be set up. */
+        r = terminal_urlify_man("systemd-creds", "1", &link);
+        if (r < 0)
+                return log_oom();
+        /* Positional arguments: %1$s program name, %2$s man page link, %3$s/%4$s underline on/off, %5$s/%6$s highlight on/off. */
+        printf("%1$s [OPTIONS...] COMMAND ...\n"
+               "\n%5$sDisplay and Process Credentials.%6$s\n"
+               "\n%3$sCommands:%4$s\n"
+               "  list                    Show list of passed credentials\n"
+               "  cat CREDENTIAL...       Show specified credentials\n"
+               "  setup                   Generate credentials host key, if not existing yet\n"
+               "  encrypt INPUT OUTPUT    Encrypt plaintext credential file and write to\n"
+               "                          ciphertext credential file\n"
+               "  decrypt INPUT [OUTPUT]  Decrypt ciphertext credential file and write to\n"
+               "                          plaintext credential file\n"
+               "  has-tpm2                Report whether TPM2 support is available\n"
+               "  -h --help               Show this help\n"
+               "     --version            Show package version\n"
+               "\n%3$sOptions:%4$s\n"
+               "     --no-pager           Do not pipe output into a pager\n"
+               "     --no-legend          Do not show the headers and footers\n"
+               "     --json=pretty|short|off\n"
+               "                          Generate JSON output\n"
+               "     --system             Show credentials passed to system\n"
+               "     --transcode=base64|unbase64|hex|unhex\n"
+               "                          Transcode credential data\n"
+               "     --newline=auto|yes|no\n"
+               "                          Suffix output with newline\n"
+               "  -p --pretty             Output as SetCredentialEncrypted= line\n"
+               "     --name=NAME          Override filename included in encrypted credential\n"
+               "     --timestamp=TIME     Include specified timestamp in encrypted credential\n"
+               "     --not-after=TIME     Include specified invalidation time in encrypted\n"
+               "                          credential\n"
+               "     --with-key=host|tpm2|host+tpm2|tpm2-absent|auto|auto-initrd\n"
+               "                          Which keys to encrypt with\n"
+               "  -H                      Shortcut for --with-key=host\n"
+               "  -T                      Shortcut for --with-key=tpm2\n"
+               "     --tpm2-device=PATH\n"
+               "                          Pick TPM2 device\n"
+               "     --tpm2-pcrs=PCR1+PCR2+PCR3+…\n"
+               "                          Specify TPM2 PCRs to seal against (fixed hash)\n"
+               "     --tpm2-public-key=PATH\n"
+               "                          Specify PEM certificate to seal against\n"
+               "     --tpm2-public-key-pcrs=PCR1+PCR2+PCR3+…\n"
+               "                          Specify TPM2 PCRs to seal against (public key)\n"
+               "     --tpm2-signature=PATH\n"
+               "                          Specify signature for public key PCR policy\n"
+               "  -q --quiet              Suppress output for 'has-tpm2' verb\n"
+               "\nSee the %2$s for details.\n"
+               , program_invocation_short_name
+               , link
+               , ansi_underline(), ansi_normal()
+               , ansi_highlight(), ansi_normal()
+        );
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses the command line into the arg_* variables. Returns 1 when done; the caller, run(), exits on <= 0. */
+        enum {
+                ARG_VERSION = 0x100, /* long-only options are numbered from 0x100 on, beyond the short option characters */
+                ARG_NO_PAGER,
+                ARG_NO_LEGEND,
+                ARG_JSON,
+                ARG_SYSTEM,
+                ARG_TRANSCODE,
+                ARG_NEWLINE,
+                ARG_WITH_KEY,
+                ARG_TPM2_DEVICE,
+                ARG_TPM2_PCRS,
+                ARG_TPM2_PUBLIC_KEY,
+                ARG_TPM2_PUBLIC_KEY_PCRS,
+                ARG_TPM2_SIGNATURE,
+                ARG_NAME,
+                ARG_TIMESTAMP,
+                ARG_NOT_AFTER,
+        };
+
+        static const struct option options[] = {
+                { "help",                 no_argument,       NULL, 'h'                      },
+                { "version",              no_argument,       NULL, ARG_VERSION              },
+                { "no-pager",             no_argument,       NULL, ARG_NO_PAGER             },
+                { "no-legend",            no_argument,       NULL, ARG_NO_LEGEND            },
+                { "json",                 required_argument, NULL, ARG_JSON                 },
+                { "system",               no_argument,       NULL, ARG_SYSTEM               },
+                { "transcode",            required_argument, NULL, ARG_TRANSCODE            },
+                { "newline",              required_argument, NULL, ARG_NEWLINE              },
+                { "pretty",               no_argument,       NULL, 'p'                      },
+                { "with-key",             required_argument, NULL, ARG_WITH_KEY             },
+                { "tpm2-device",          required_argument, NULL, ARG_TPM2_DEVICE          },
+                { "tpm2-pcrs",            required_argument, NULL, ARG_TPM2_PCRS            },
+                { "tpm2-public-key",      required_argument, NULL, ARG_TPM2_PUBLIC_KEY      },
+                { "tpm2-public-key-pcrs", required_argument, NULL, ARG_TPM2_PUBLIC_KEY_PCRS },
+                { "tpm2-signature",       required_argument, NULL, ARG_TPM2_SIGNATURE       },
+                { "name",                 required_argument, NULL, ARG_NAME                 },
+                { "timestamp",            required_argument, NULL, ARG_TIMESTAMP            },
+                { "not-after",            required_argument, NULL, ARG_NOT_AFTER            },
+                { "quiet",                no_argument,       NULL, 'q'                      },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hHTpq", options, NULL)) >= 0) {
+
+                switch (c) {
+
+                case 'h':
+                        return verb_help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_NO_LEGEND:
+                        arg_legend = false;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0) /* NOTE(review): 0 is propagated too and thus ends the program via run() — presumably help was shown, confirm */
+                                return r;
+
+                        break;
+
+                case ARG_SYSTEM:
+                        arg_system = true;
+                        break;
+
+                case ARG_TRANSCODE:
+                        if (parse_boolean(optarg) == 0) /* If specified as "false", turn transcoding off */
+                                arg_transcode = TRANSCODE_OFF;
+                        else {
+                                TranscodeMode m;
+
+                                m = transcode_mode_from_string(optarg);
+                                if (m < 0)
+                                        return log_error_errno(m, "Failed to parse transcode mode: %m");
+
+                                arg_transcode = m;
+                        }
+
+                        break;
+
+                case ARG_NEWLINE:
+                        if (isempty(optarg) || streq(optarg, "auto"))
+                                arg_newline = -1; /* -1 stands for "auto", also chosen for the empty string */
+                        else {
+                                r = parse_boolean_argument("--newline=", optarg, NULL);
+                                if (r < 0)
+                                        return r;
+
+                                arg_newline = r;
+                        }
+                        break;
+
+                case 'p':
+                        arg_pretty = true;
+                        break;
+
+                case ARG_WITH_KEY: /* accepts more spellings than the help text lists, e.g. the "tpm2-with-public-key" variants */
+                        if (isempty(optarg) || streq(optarg, "auto"))
+                                arg_with_key = _CRED_AUTO;
+                        else if (streq(optarg, "auto-initrd"))
+                                arg_with_key = _CRED_AUTO_INITRD;
+                        else if (streq(optarg, "host"))
+                                arg_with_key = CRED_AES256_GCM_BY_HOST;
+                        else if (streq(optarg, "tpm2"))
+                                arg_with_key = CRED_AES256_GCM_BY_TPM2_HMAC;
+                        else if (streq(optarg, "tpm2-with-public-key"))
+                                arg_with_key = CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK;
+                        else if (STR_IN_SET(optarg, "host+tpm2", "tpm2+host"))
+                                arg_with_key = CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC;
+                        else if (STR_IN_SET(optarg, "host+tpm2-with-public-key", "tpm2-with-public-key+host"))
+                                arg_with_key = CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK;
+                        else if (streq(optarg, "tpm2-absent"))
+                                arg_with_key = CRED_AES256_GCM_BY_TPM2_ABSENT;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown key type: %s", optarg);
+
+                        break;
+
+                case 'H':
+                        arg_with_key = CRED_AES256_GCM_BY_HOST;
+                        break;
+
+                case 'T':
+                        arg_with_key = CRED_AES256_GCM_BY_TPM2_HMAC;
+                        break;
+
+                case ARG_TPM2_DEVICE:
+                        if (streq(optarg, "list")) /* "list" is not a device: return right away with the result of tpm2_list_devices() */
+                                return tpm2_list_devices();
+
+                        arg_tpm2_device = streq(optarg, "auto") ? NULL : optarg; /* "auto" is stored as NULL */
+                        break;
+
+                case ARG_TPM2_PCRS: /* For fixed hash PCR policies only */
+                        r = tpm2_parse_pcr_argument_to_mask(optarg, &arg_tpm2_pcr_mask);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_PUBLIC_KEY:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_public_key);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_PUBLIC_KEY_PCRS: /* For public key PCR policies only */
+                        r = tpm2_parse_pcr_argument_to_mask(optarg, &arg_tpm2_public_key_pcr_mask);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_SIGNATURE:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_signature);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_NAME:
+                        if (isempty(optarg)) { /* --name= with an empty string: no specific name, flagged via arg_name_any */
+                                arg_name = NULL;
+                                arg_name_any = true;
+                                break;
+                        }
+
+                        if (!credential_name_valid(optarg))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid credential name: %s", optarg);
+
+                        arg_name = optarg;
+                        arg_name_any = false;
+                        break;
+
+                case ARG_TIMESTAMP:
+                        r = parse_timestamp(optarg, &arg_timestamp);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse timestamp: %s", optarg);
+
+                        break;
+
+                case ARG_NOT_AFTER:
+                        r = parse_timestamp(optarg, &arg_not_after);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --not-after= timestamp: %s", optarg);
+
+                        break;
+
+                case 'q':
+                        arg_quiet = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+        /* NOTE(review): UINT32_MAX presumably marks a PCR mask not set on the command line; defaults are filled in — confirm against the arg_* initializers */
+        if (arg_tpm2_pcr_mask == UINT32_MAX)
+                arg_tpm2_pcr_mask = TPM2_PCR_MASK_DEFAULT;
+        if (arg_tpm2_public_key_pcr_mask == UINT32_MAX)
+                arg_tpm2_public_key_pcr_mask = UINT32_C(1) << TPM2_PCR_KERNEL_BOOT;
+
+        return 1;
+}
+
+static int creds_main(int argc, char *argv[]) {
+        /* Verb table handed to dispatch_verb() below; "list" is the entry carrying VERB_DEFAULT. */
+        static const Verb verb_table[] = {
+                { "list",     VERB_ANY, 1,        VERB_DEFAULT, verb_list     },
+                { "cat",      2,        VERB_ANY, 0,            verb_cat      },
+                { "encrypt",  3,        3,        0,            verb_encrypt  },
+                { "decrypt",  2,        3,        0,            verb_decrypt  },
+                { "setup",    VERB_ANY, 1,        0,            verb_setup    },
+                { "help",     VERB_ANY, 1,        0,            verb_help     },
+                { "has-tpm2", VERB_ANY, 1,        0,            verb_has_tpm2 },
+                {}
+        };
+
+        return dispatch_verb(argc, argv, verb_table, NULL);
+}
+
+static int run(int argc, char *argv[]) {
+        int parsed;
+
+        log_setup();
+
+        /* parse_argv() returns > 0 only when a verb shall be dispatched; anything else is returned as is. */
+        parsed = parse_argv(argc, argv);
+        if (parsed > 0)
+                return creds_main(argc, argv);
+        return parsed;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/creds/meson.build b/src/creds/meson.build
new file mode 100644
index 0000000..8557256
--- /dev/null
+++ b/src/creds/meson.build
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# Declares the systemd-creds executable (built from creds.c) and installs the empty credential store directories.
+executables += [
+        executable_template + {
+                'name' : 'systemd-creds',
+                'public' : true,
+                'sources' : files('creds.c'),
+                'dependencies' : [
+                        libopenssl,
+                        threads,
+                ],
+        },
+]
+
+# Protecting files from the distro in /usr doesn't make sense since they can be trivially accessed otherwise,
+# so don't restrict the access mode in /usr. That doesn't apply to /etc, so we do restrict the access mode
+# there.
+install_emptydir(credstoredir)  # no install_mode given; presumably this is the /usr location discussed above
+if install_sysconfdir
+        # Keep in sync with tmpfiles.d/credstore.conf
+        install_emptydir(sysconfdir / 'credstore',
+                         install_mode : 'rwx------')
+        install_emptydir(sysconfdir / 'credstore.encrypted',
+                         install_mode : 'rwx------')
+endif
diff --git a/src/cryptenroll/cryptenroll-fido2.c b/src/cryptenroll/cryptenroll-fido2.c
new file mode 100644
index 0000000..2baeb92
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-fido2.c
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "ask-password-api.h"
+#include "cryptenroll-fido2.h"
+#include "cryptsetup-fido2.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "libfido2-util.h"
+#include "memory-util.h"
+#include "random-util.h"
+
+int load_volume_key_fido2(
+                struct crypt_device *cd,
+                const char *cd_node,
+                const char *device,
+                void *ret_vk,
+                size_t *ret_vks) {
+
+        _cleanup_(erase_and_freep) void *secret = NULL;
+        _cleanup_(erase_and_freep) char *encoded = NULL;
+        ssize_t encoded_size;
+        size_t secret_size;
+        int r;
+
+        /* Fetches the volume key of 'cd' into ret_vk/ret_vks, using a secret acquired from the FIDO2 token
+         * 'device' as passphrase. Returns what crypt_volume_key_get() returned on success, and a
+         * negative errno-style value on failure. */
+
+        assert_se(cd);
+        assert_se(cd_node);
+        assert_se(ret_vk);
+        assert_se(ret_vks);
+
+        /* Note that cd_node fills two consecutive parameters of this call. */
+        r = acquire_fido2_key_auto(
+                        cd,
+                        cd_node, cd_node, device,
+                        /* until= */ 0,
+                        /* headless= */ false,
+                        &secret, &secret_size,
+                        ASK_PASSWORD_PUSH_CACHE|ASK_PASSWORD_ACCEPT_CACHED);
+        if (r == -EAGAIN)
+                return log_error_errno(r, "FIDO2 token does not exist, or UV is blocked. Please try again.");
+        if (r < 0)
+                return r;
+
+        /* cryptenroll requires a LUKS header, hence we can assume this is not a PLAIN device. In this
+         * case the passphrase to use is the base64 encoding of the secret rather than the raw bytes. */
+        encoded_size = base64mem(secret, secret_size, &encoded);
+        if (encoded_size < 0)
+                return log_oom();
+
+        r = crypt_volume_key_get(
+                        cd,
+                        CRYPT_ANY_SLOT,
+                        ret_vk, ret_vks,
+                        encoded, encoded_size);
+        if (r < 0)
+                return log_error_errno(r, "Unlocking via FIDO2 device failed: %m");
+
+        return r;
+}
+
+int enroll_fido2(
+                struct crypt_device *cd,
+                const void *volume_key,
+                size_t volume_key_size,
+                const char *device,
+                Fido2EnrollFlags lock_with,
+                int cred_alg) {
+
+        _cleanup_(erase_and_freep) void *salt = NULL, *secret = NULL;
+        _cleanup_(erase_and_freep) char *passphrase = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *token = NULL;
+        _cleanup_free_ char *slot_string = NULL;
+        _cleanup_free_ void *cid = NULL;
+        size_t cid_size, salt_size, secret_size;
+        const char *node, *uuid;
+        ssize_t passphrase_size;
+        int slot, r;
+
+        /* Enrolls the FIDO2 token 'device': a secret obtained from it is added, base64 encoded, as a new
+         * keyslot of 'cd', plus a "systemd-fido2" JSON token. Returns the keyslot index or a negative error. */
+
+        assert_se(cd);
+        assert_se(volume_key);
+        assert_se(volume_key_size > 0);
+        assert_se(device);
+
+        assert_se(node = crypt_get_device_name(cd));
+        uuid = strempty(crypt_get_uuid(cd));
+
+        /* We pass the user ID and name as the same: the disk's UUID if we have it */
+        r = fido2_generate_hmac_hash(
+                        device,
+                        /* rp_id= */ "io.systemd.cryptsetup",
+                        /* rp_name= */ "Encrypted Volume",
+                        /* user_id= */ uuid, strlen(uuid),
+                        /* user_name= */ uuid,
+                        /* user_display_name= */ node,
+                        /* user_icon_name= */ NULL,
+                        /* askpw_icon_name= */ "drive-harddisk",
+                        lock_with, cred_alg,
+                        &cid, &cid_size,
+                        &salt, &salt_size,
+                        &secret, &secret_size,
+                        NULL,
+                        &lock_with); /* passed by pointer as well, i.e. the flags used below may have been updated */
+        if (r < 0)
+                return r;
+
+        /* The secret is used in base64 encoded form, for compat with homed, and to make it easier to type in manually */
+        passphrase_size = base64mem(secret, secret_size, &passphrase);
+        if (passphrase_size < 0)
+                return log_error_errno(passphrase_size, "Failed to base64 encode secret key: %m");
+
+        r = cryptsetup_set_minimal_pbkdf(cd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set minimal PBKDF: %m");
+
+        slot = crypt_keyslot_add_by_volume_key(
+                        cd,
+                        CRYPT_ANY_SLOT,
+                        volume_key, volume_key_size,
+                        passphrase, passphrase_size);
+        if (slot < 0)
+                return log_error_errno(slot, "Failed to add new FIDO2 key to %s: %m", node);
+
+        if (asprintf(&slot_string, "%i", slot) < 0)
+                return log_oom();
+
+        r = json_build(&token,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("systemd-fido2")),
+                                       JSON_BUILD_PAIR("keyslots", JSON_BUILD_ARRAY(JSON_BUILD_STRING(slot_string))),
+                                       JSON_BUILD_PAIR("fido2-credential", JSON_BUILD_BASE64(cid, cid_size)),
+                                       JSON_BUILD_PAIR("fido2-salt", JSON_BUILD_BASE64(salt, salt_size)),
+                                       JSON_BUILD_PAIR("fido2-rp", JSON_BUILD_CONST_STRING("io.systemd.cryptsetup")),
+                                       JSON_BUILD_PAIR("fido2-clientPin-required", JSON_BUILD_BOOLEAN(FLAGS_SET(lock_with, FIDO2ENROLL_PIN))),
+                                       JSON_BUILD_PAIR("fido2-up-required", JSON_BUILD_BOOLEAN(FLAGS_SET(lock_with, FIDO2ENROLL_UP))),
+                                       JSON_BUILD_PAIR("fido2-uv-required", JSON_BUILD_BOOLEAN(FLAGS_SET(lock_with, FIDO2ENROLL_UV)))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to prepare FIDO2 JSON token object: %m");
+
+        r = cryptsetup_add_token_json(cd, token);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add FIDO2 JSON token to LUKS2 header: %m");
+
+        log_info("New FIDO2 token enrolled as key slot %i.", slot);
+        return slot;
+}
diff --git a/src/cryptenroll/cryptenroll-fido2.h b/src/cryptenroll/cryptenroll-fido2.h
new file mode 100644
index 0000000..3315308
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-fido2.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "cryptsetup-util.h"
+#include "libfido2-util.h"
+#include "log.h"
+/* FIDO2 volume key loading and enrollment. Without HAVE_LIBFIDO2 both calls are inline stubs failing with EOPNOTSUPP. */
+#if HAVE_LIBFIDO2
+int load_volume_key_fido2(struct crypt_device *cd, const char *cd_node, const char *device, void *ret_vk, size_t *ret_vks);
+int enroll_fido2(struct crypt_device *cd, const void *volume_key, size_t volume_key_size, const char *device, Fido2EnrollFlags lock_with, int cred_alg);
+
+#else /* !HAVE_LIBFIDO2 */
+static inline int load_volume_key_fido2(struct crypt_device *cd, const char *cd_node, const char *device, void *ret_vk, size_t *ret_vks) {
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "FIDO2 unlocking not supported.");
+}
+
+static inline int enroll_fido2(struct crypt_device *cd, const void *volume_key, size_t volume_key_size, const char *device, Fido2EnrollFlags lock_with, int cred_alg) {
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "FIDO2 key enrollment not supported.");
+}
+#endif /* HAVE_LIBFIDO2 */
diff --git a/src/cryptenroll/cryptenroll-list.c b/src/cryptenroll/cryptenroll-list.c
new file mode 100644
index 0000000..d21df71
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-list.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cryptenroll-list.h"
+#include "cryptenroll.h"
+#include "format-table.h"
+#include "parse-util.h"
+
+struct keyslot_metadata {
+        int slot;         /* keyslot index, as passed to crypt_keyslot_status() */
+        const char *type; /* NULL: no token references the slot (shown as "password"); POINTER_MAX: referenced more than once (shown as "conflict"); else the type string to show */
+};
+
+int list_enrolled(struct crypt_device *cd) {
+        _cleanup_free_ struct keyslot_metadata *keyslot_metadata = NULL;
+        _cleanup_(table_unrefp) Table *t = NULL;
+        size_t n_keyslot_metadata = 0;
+        int slot_max, r;
+        TableCell *cell;
+        /* Prints a table of the active keyslots of 'cd' and the kind of token each is assigned to. Returns 0 on success, negative on error. */
+        assert(cd);
+
+        /* First step, find out all currently used slots */
+        assert_se((slot_max = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+        for (int slot = 0; slot < slot_max; slot++) {
+                crypt_keyslot_info status;
+
+                status = crypt_keyslot_status(cd, slot);
+                if (!IN_SET(status, CRYPT_SLOT_ACTIVE, CRYPT_SLOT_ACTIVE_LAST))
+                        continue;
+
+                if (!GREEDY_REALLOC(keyslot_metadata, n_keyslot_metadata+1))
+                        return log_oom();
+
+                keyslot_metadata[n_keyslot_metadata++] = (struct keyslot_metadata) {
+                        .slot = slot, /* .type is left NULL here, it is filled in by the second step */
+                };
+        }
+
+        /* Second step, enumerate through all tokens, and update the slot table, indicating what kind of
+         * token they are assigned to */
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                const char *type;
+                JsonVariant *w, *z;
+                EnrollType et;
+
+                r = cryptsetup_get_token_as_json(cd, token, NULL, &v);
+                if (IN_SET(r, -ENOENT, -EINVAL)) /* these two are skipped silently, other errors get a warning below */
+                        continue;
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to read JSON token data off disk, ignoring: %m");
+                        continue;
+                }
+
+                w = json_variant_by_key(v, "type");
+                if (!w || !json_variant_is_string(w)) {
+                        log_warning("Token JSON data lacks type field, ignoring.");
+                        continue;
+                }
+
+                et = luks2_token_type_from_string(json_variant_string(w));
+                if (et < 0)
+                        type = "other"; /* token type that luks2_token_type_from_string() does not recognize */
+                else
+                        type = enroll_type_to_string(et);
+
+                w = json_variant_by_key(v, "keyslots");
+                if (!w || !json_variant_is_array(w)) {
+                        log_warning("Token JSON data lacks keyslots field, ignoring.");
+                        continue;
+                }
+
+                JSON_VARIANT_ARRAY_FOREACH(z, w) {
+                        unsigned u;
+
+                        if (!json_variant_is_string(z)) {
+                                log_warning("Token JSON data's keyslot field is not an array of strings, ignoring.");
+                                continue;
+                        }
+
+                        r = safe_atou(json_variant_string(z), &u);
+                        if (r < 0) {
+                                log_warning_errno(r, "Token JSON data's keyslot field is not an integer formatted as string, ignoring.");
+                                continue;
+                        }
+
+                        for (size_t i = 0; i < n_keyslot_metadata; i++) { /* keyslot numbers that match no active slot simply have no effect */
+                                if ((unsigned) keyslot_metadata[i].slot != u)
+                                        continue;
+
+                                if (keyslot_metadata[i].type) /* Slot claimed multiple times? */
+                                        keyslot_metadata[i].type = POINTER_MAX;
+                                else
+                                        keyslot_metadata[i].type = type;
+                        }
+                }
+        }
+
+        /* Finally, create a table out of it all */
+        t = table_new("slot", "type");
+        if (!t)
+                return log_oom();
+
+        assert_se(cell = table_get_cell(t, 0, 0)); /* cell (0, 0), i.e. the one of the "slot" column header */
+        (void) table_set_align_percent(t, cell, 100); /* NOTE(review): 100 presumably means right-aligned — confirm */
+
+        for (size_t i = 0; i < n_keyslot_metadata; i++) {
+                r = table_add_many(
+                                t,
+                                TABLE_INT, keyslot_metadata[i].slot,
+                                TABLE_STRING, keyslot_metadata[i].type == POINTER_MAX ? "conflict" :
+                                              keyslot_metadata[i].type ?: "password");
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (table_get_rows(t) <= 1) { /* NOTE(review): presumably the header counts as a row, so <= 1 means no data rows — confirm */
+                log_info("No slots found.");
+                return 0;
+        }
+
+        r = table_print(t, stdout);
+        if (r < 0)
+                return log_error_errno(r, "Failed to show slot table: %m");
+
+        return 0;
+}
diff --git a/src/cryptenroll/cryptenroll-list.h b/src/cryptenroll/cryptenroll-list.h
new file mode 100644
index 0000000..d322988
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-list.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "cryptsetup-util.h"
+/* Prints a table of the active keyslots of 'cd' and the token type each one is assigned to. */
+int list_enrolled(struct crypt_device *cd);
diff --git a/src/cryptenroll/cryptenroll-password.c b/src/cryptenroll/cryptenroll-password.c
new file mode 100644
index 0000000..c35b609
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-password.c
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "ask-password-api.h"
+#include "cryptenroll-password.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "memory-util.h"
+#include "password-quality-util.h"
+#include "strv.h"
+
+int load_volume_key_password(
+                struct crypt_device *cd,
+                const char *cd_node,
+                void *ret_vk,
+                size_t *ret_vks) {
+        /* Retrieves the volume key into ret_vk/ret_vks, unlocking with $PASSWORD if set, else with a queried passphrase. */
+        _cleanup_(erase_and_freep) char *envpw = NULL;
+        int r;
+
+        assert_se(cd);
+        assert_se(cd_node);
+        assert_se(ret_vk);
+        assert_se(ret_vks);
+
+        r = getenv_steal_erase("PASSWORD", &envpw);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire password from environment: %m");
+        if (r > 0) { /* A password came from the environment: try it once, with no interactive fallback */
+                r = crypt_volume_key_get(
+                                cd,
+                                CRYPT_ANY_SLOT,
+                                ret_vk,
+                                ret_vks,
+                                envpw,
+                                strlen(envpw));
+                if (r < 0)
+                        return log_error_errno(r, "Password from environment variable $PASSWORD did not work: %m");
+        } else {
+                AskPasswordFlags ask_password_flags = ASK_PASSWORD_PUSH_CACHE|ASK_PASSWORD_ACCEPT_CACHED;
+                _cleanup_free_ char *question = NULL, *disk_path = NULL;
+                unsigned i = 5; /* Pre-decremented before every prompt, hence at most four prompts */
+                const char *id;
+
+                question = strjoin("Please enter current passphrase for disk ", cd_node, ":");
+                if (!question)
+                        return log_oom();
+
+                disk_path = cescape(cd_node);
+                if (!disk_path)
+                        return log_oom();
+
+                id = strjoina("cryptsetup:", disk_path);
+
+                for (;;) {
+                        _cleanup_strv_free_erase_ char **passwords = NULL;
+
+                        if (--i == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOKEY),
+                                                       "Too many attempts, giving up.");
+
+                        r = ask_password_auto(
+                                        question, "drive-harddisk", id, "cryptenroll", "cryptenroll.passphrase", USEC_INFINITY,
+                                        ask_password_flags,
+                                        &passwords);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to query password: %m");
+
+                        r = -EPERM; /* Error reported below if the query yielded no password at all */
+                        STRV_FOREACH(p, passwords) { /* Stop at the first returned password that unlocks a slot */
+                                r = crypt_volume_key_get(
+                                                cd,
+                                                CRYPT_ANY_SLOT,
+                                                ret_vk,
+                                                ret_vks,
+                                                *p,
+                                                strlen(*p));
+                                if (r >= 0)
+                                        break;
+                        }
+                        if (r >= 0)
+                                break;
+
+                        log_error_errno(r, "Password not correct, please try again: %m");
+                        ask_password_flags &= ~ASK_PASSWORD_ACCEPT_CACHED; /* Retries no longer carry the ACCEPT_CACHED flag */
+                }
+        }
+
+        return r; /* Non-negative result of the crypt_volume_key_get() call that succeeded */
+}
+
+/* Enrolls a new passphrase, taken from $NEWPASSWORD or else queried twice interactively, as an additional
+ * key slot (CRYPT_ANY_SLOT) of the volume. Returns the new key slot index, or a negative errno on failure. */
+int enroll_password(
+                struct crypt_device *cd,
+                const void *volume_key,
+                size_t volume_key_size) {
+
+        _cleanup_(erase_and_freep) char *new_password = NULL;
+        _cleanup_free_ char *error = NULL;
+        const char *node;
+        int r, keyslot;
+
+        assert_se(node = crypt_get_device_name(cd));
+
+        r = getenv_steal_erase("NEWPASSWORD", &new_password);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire password from environment: %m");
+        if (r == 0) { /* Nothing in the environment, ask the user */
+                _cleanup_free_ char *disk_path = NULL;
+                unsigned i = 5; /* Pre-decremented before every round, hence at most four rounds */
+                const char *id;
+
+                (void) suggest_passwords();
+
+                disk_path = cescape(node);
+                if (!disk_path)
+                        return log_oom();
+
+                id = strjoina("cryptsetup:", disk_path);
+
+                for (;;) {
+                        _cleanup_strv_free_erase_ char **passwords = NULL, **passwords2 = NULL;
+                        _cleanup_free_ char *question = NULL;
+
+                        if (--i == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOKEY),
+                                                       "Too many attempts, giving up.");
+
+                        question = strjoin("Please enter new passphrase for disk ", node, ":");
+                        if (!question)
+                                return log_oom();
+
+                        r = ask_password_auto(question, "drive-harddisk", id, "cryptenroll", "cryptenroll.new-passphrase", USEC_INFINITY, 0, &passwords);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to query password: %m");
+
+                        assert(strv_length(passwords) == 1);
+
+                        question = mfree(question); /* Keep the cleanup variable NULL until the new string exists */
+                        question = strjoin("Please enter new passphrase for disk ", node, " (repeat):");
+                        if (!question)
+                                return log_oom();
+
+                        r = ask_password_auto(question, "drive-harddisk", id, "cryptenroll", "cryptenroll.new-passphrase", USEC_INFINITY, 0, &passwords2);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to query password: %m");
+
+                        assert(strv_length(passwords2) == 1);
+
+                        if (strv_equal(passwords, passwords2)) {
+                                new_password = passwords2[0]; /* Steal the only string ... */
+                                passwords2 = mfree(passwords2); /* ... and free just the array, so it is not erased twice */
+                                break;
+                        }
+
+                        log_error("Password didn't match, try again.");
+                }
+        }
+
+        r = check_password_quality(new_password, /* old */ NULL, /* user */ NULL, &error);
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                log_warning("Password quality check is not supported, proceeding anyway.");
+        else if (r < 0)
+                return log_error_errno(r, "Failed to check password quality: %m");
+        else if (r == 0) /* A weak password is only warned about, never refused */
+                log_warning("Specified password does not pass quality checks (%s), proceeding anyway.", error);
+
+        keyslot = crypt_keyslot_add_by_volume_key(
+                        cd,
+                        CRYPT_ANY_SLOT,
+                        volume_key,
+                        volume_key_size,
+                        new_password,
+                        strlen(new_password));
+        if (keyslot < 0)
+                return log_error_errno(keyslot, "Failed to add new password to %s: %m", node);
+
+        log_info("New password enrolled as key slot %i.", keyslot);
+        return keyslot;
+}
diff --git a/src/cryptenroll/cryptenroll-password.h b/src/cryptenroll/cryptenroll-password.h
new file mode 100644
index 0000000..aa07a6f
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-password.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "cryptsetup-util.h"
+
+int load_volume_key_password(struct crypt_device *cd, const char* cd_node, void *ret_vk, size_t *ret_vks);
+int enroll_password(struct crypt_device *cd, const void *volume_key, size_t volume_key_size);
diff --git a/src/cryptenroll/cryptenroll-pkcs11.c b/src/cryptenroll/cryptenroll-pkcs11.c
new file mode 100644
index 0000000..54b6b86
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-pkcs11.c
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cryptenroll-pkcs11.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "memory-util.h"
+#include "openssl-util.h"
+#include "pkcs11-util.h"
+#include "random-util.h"
+
+int enroll_pkcs11(
+                struct crypt_device *cd,
+                const void *volume_key,
+                size_t volume_key_size,
+                const char *uri) {
+        /* Enrolls a random key, RSA-encrypted for the certificate behind the PKCS#11 uri; returns the key slot or < 0. */
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_free_ char *keyslot_as_string = NULL;
+        size_t decrypted_key_size, encrypted_key_size;
+        _cleanup_free_ void *encrypted_key = NULL;
+        _cleanup_(X509_freep) X509 *cert = NULL;
+        ssize_t base64_encoded_size;
+        const char *node;
+        EVP_PKEY *pkey;
+        int keyslot, r;
+
+        assert_se(cd);
+        assert_se(volume_key);
+        assert_se(volume_key_size > 0);
+        assert_se(uri);
+
+        assert_se(node = crypt_get_device_name(cd));
+
+        r = pkcs11_acquire_certificate(uri, "volume enrollment operation", "drive-harddisk", &cert, NULL);
+        if (r < 0)
+                return r;
+
+        pkey = X509_get0_pubkey(cert);
+        if (!pkey)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to extract public key from X.509 certificate.");
+
+        r = rsa_pkey_to_suitable_key_size(pkey, &decrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine RSA public key size: %m");
+
+        log_debug("Generating %zu bytes random key.", decrypted_key_size);
+
+        decrypted_key = malloc(decrypted_key_size);
+        if (!decrypted_key)
+                return log_oom();
+
+        r = crypto_random_bytes(decrypted_key, decrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate random key: %m");
+
+        r = rsa_encrypt_bytes(pkey, decrypted_key, decrypted_key_size, &encrypted_key, &encrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to encrypt key: %m");
+
+        /* Let's base64 encode the key to use, for compat with homed (and it's easier to type it in by
+         * keyboard, if that might ever end up being necessary.) */
+        base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return log_error_errno(base64_encoded_size, "Failed to base64 encode secret key: %m");
+
+        r = cryptsetup_set_minimal_pbkdf(cd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set minimal PBKDF: %m");
+
+        keyslot = crypt_keyslot_add_by_volume_key( /* The base64 text of the plaintext key is the slot passphrase */
+                        cd,
+                        CRYPT_ANY_SLOT,
+                        volume_key,
+                        volume_key_size,
+                        base64_encoded,
+                        base64_encoded_size);
+        if (keyslot < 0)
+                return log_error_errno(keyslot, "Failed to add new PKCS#11 key to %s: %m", node);
+
+        if (asprintf(&keyslot_as_string, "%i", keyslot) < 0)
+                return log_oom();
+
+        r = json_build(&v, /* Only the RSA-encrypted key and the URI are stored in the token */
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("systemd-pkcs11")),
+                                       JSON_BUILD_PAIR("keyslots", JSON_BUILD_ARRAY(JSON_BUILD_STRING(keyslot_as_string))),
+                                       JSON_BUILD_PAIR("pkcs11-uri", JSON_BUILD_STRING(uri)),
+                                       JSON_BUILD_PAIR("pkcs11-key", JSON_BUILD_BASE64(encrypted_key, encrypted_key_size))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to prepare PKCS#11 JSON token object: %m");
+        /* NOTE(review): unlike enroll_recovery(), the new key slot is not destroyed again if the steps from here on fail. */
+        r = cryptsetup_add_token_json(cd, v);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add PKCS#11 JSON token to LUKS2 header: %m");
+
+        log_info("New PKCS#11 token enrolled as key slot %i.", keyslot);
+        return keyslot;
+}
diff --git a/src/cryptenroll/cryptenroll-pkcs11.h b/src/cryptenroll/cryptenroll-pkcs11.h
new file mode 100644
index 0000000..b6d28bd
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-pkcs11.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "cryptsetup-util.h"
+#include "log.h"
+
+#if HAVE_P11KIT && HAVE_OPENSSL
+int enroll_pkcs11(struct crypt_device *cd, const void *volume_key, size_t volume_key_size, const char *uri);
+#else
+static inline int enroll_pkcs11(struct crypt_device *cd, const void *volume_key, size_t volume_key_size, const char *uri) {
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 key enrollment not supported.");
+} /* Stub used when HAVE_P11KIT && HAVE_OPENSSL is false: ignores its arguments and only reports EOPNOTSUPP. */
+#endif
diff --git a/src/cryptenroll/cryptenroll-recovery.c b/src/cryptenroll/cryptenroll-recovery.c
new file mode 100644
index 0000000..7c170f2
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-recovery.c
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cryptenroll-recovery.h"
+#include "glyph-util.h"
+#include "json.h"
+#include "memory-util.h"
+#include "qrcode-util.h"
+#include "recovery-key.h"
+#include "terminal-util.h"
+
+int enroll_recovery(
+                struct crypt_device *cd,
+                const void *volume_key,
+                size_t volume_key_size) {
+        /* Generates a recovery key, enrolls it as a new key slot, shows it, and records a "systemd-recovery" token. */
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(erase_and_freep) char *password = NULL;
+        _cleanup_free_ char *keyslot_as_string = NULL;
+        int keyslot, r, q;
+        const char *node;
+
+        assert_se(cd);
+        assert_se(volume_key);
+        assert_se(volume_key_size > 0);
+
+        assert_se(node = crypt_get_device_name(cd));
+
+        r = make_recovery_key(&password);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate recovery key: %m");
+
+        r = cryptsetup_set_minimal_pbkdf(cd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set minimal PBKDF: %m");
+
+        keyslot = crypt_keyslot_add_by_volume_key(
+                        cd,
+                        CRYPT_ANY_SLOT,
+                        volume_key,
+                        volume_key_size,
+                        password,
+                        strlen(password));
+        if (keyslot < 0)
+                return log_error_errno(keyslot, "Failed to add new recovery key to %s: %m", node);
+
+        fflush(stdout); /* From here on stdout and stderr are flushed alternately, presumably to keep their output in order */
+        fprintf(stderr,
+                "A secret recovery key has been generated for this volume:\n\n"
+                "    %s%s%s",
+                emoji_enabled() ? special_glyph(SPECIAL_GLYPH_LOCK_AND_KEY) : "",
+                emoji_enabled() ? " " : "",
+                ansi_highlight());
+        fflush(stderr);
+
+        fputs(password, stdout); /* Only the key itself (plus its newline below) goes to stdout, all decoration to stderr */
+        fflush(stdout);
+
+        fputs(ansi_normal(), stderr);
+        fflush(stderr);
+
+        fputc('\n', stdout);
+        fflush(stdout);
+
+        fputs("\nPlease save this secret recovery key at a secure location. It may be used to\n"
+              "regain access to the volume if the other configured access credentials have\n"
+              "been lost or forgotten. The recovery key may be entered in place of a password\n"
+              "whenever authentication is requested.\n", stderr);
+        fflush(stderr);
+
+        (void) print_qrcode(stderr, "You may optionally scan the recovery key off screen", password);
+
+        if (asprintf(&keyslot_as_string, "%i", keyslot) < 0) {
+                r = log_oom();
+                goto rollback;
+        }
+
+        r = json_build(&v,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("systemd-recovery")),
+                                       JSON_BUILD_PAIR("keyslots", JSON_BUILD_ARRAY(JSON_BUILD_STRING(keyslot_as_string)))));
+        if (r < 0) {
+                log_error_errno(r, "Failed to prepare recovery key JSON token object: %m");
+                goto rollback;
+        }
+
+        r = cryptsetup_add_token_json(cd, v);
+        if (r < 0) {
+                log_error_errno(r, "Failed to add recovery JSON token to LUKS2 header: %m");
+                goto rollback;
+        }
+
+        log_info("New recovery key enrolled as key slot %i.", keyslot);
+        return keyslot;
+
+rollback: /* Reached only after the key slot was added: try to remove it again, then return the original error in r */
+        q = crypt_keyslot_destroy(cd, keyslot);
+        if (q < 0)
+                log_debug_errno(q, "Unable to remove key slot we just added again, can't rollback, sorry: %m");
+
+        return r;
+}
diff --git a/src/cryptenroll/cryptenroll-recovery.h b/src/cryptenroll/cryptenroll-recovery.h
new file mode 100644
index 0000000..9bf4f2e
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-recovery.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "cryptsetup-util.h"
+
+int enroll_recovery(struct crypt_device *cd, const void *volume_key, size_t volume_key_size);
diff --git a/src/cryptenroll/cryptenroll-tpm2.c b/src/cryptenroll/cryptenroll-tpm2.c
new file mode 100644
index 0000000..653ad44
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-tpm2.c
@@ -0,0 +1,383 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "cryptenroll-tpm2.h"
+#include "env-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "memory-util.h"
+#include "random-util.h"
+#include "sha256.h"
+#include "tpm2-util.h"
+
+static int search_policy_hash(
+                struct crypt_device *cd,
+                const void *hash,
+                size_t hash_size) {
+        /* Returns the key slot of the "systemd-tpm2" token whose 'tpm2-policy-hash' equals hash, or -ENOENT if none. */
+        int r;
+
+        assert(cd);
+        assert(hash || hash_size == 0);
+
+        if (hash_size == 0)
+                return 0; /* Nothing to compare against; note the caller cannot tell this apart from key slot 0 */
+
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token ++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                _cleanup_free_ void *thash = NULL;
+                size_t thash_size = 0;
+                int keyslot;
+                JsonVariant *w;
+
+                r = cryptsetup_get_token_as_json(cd, token, "systemd-tpm2", &v);
+                if (IN_SET(r, -ENOENT, -EINVAL, -EMEDIUMTYPE)) /* These three results just skip this token index */
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read JSON token data off disk: %m");
+
+                keyslot = cryptsetup_get_keyslot_from_token(v);
+                if (keyslot < 0) {
+                        /* Handle parsing errors of the keyslots field gracefully, since it's not 'owned' by
+                         * us, but by the LUKS2 spec */
+                        log_warning_errno(keyslot, "Failed to determine keyslot of JSON token %i, skipping: %m", token);
+                        continue;
+                }
+
+                w = json_variant_by_key(v, "tpm2-policy-hash");
+                if (!w || !json_variant_is_string(w))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "TPM2 token data lacks 'tpm2-policy-hash' field.");
+
+                r = unhexmem(json_variant_string(w), SIZE_MAX, &thash, &thash_size); /* The field is hex, not base64 */
+                if (r < 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Invalid hex data in 'tpm2-policy-hash' field.");
+
+                if (memcmp_nn(hash, hash_size, thash, thash_size) == 0)
+                        return keyslot; /* Found entry with same hash. */
+        }
+
+        return -ENOENT; /* Not found */
+}
+
+/* Acquires the new TPM2 PIN from $NEWPIN or, if that is unset, by asking twice interactively (five rounds at most). */
+static int get_pin(char **ret_pin_str, TPM2Flags *ret_flags) {
+        _cleanup_(erase_and_freep) char *pin_str = NULL;
+        TPM2Flags flags = 0;
+        int r;
+
+        assert(ret_pin_str);
+        assert(ret_flags);
+
+        r = getenv_steal_erase("NEWPIN", &pin_str);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire PIN from environment: %m");
+        if (r > 0)
+                flags |= TPM2_FLAGS_USE_PIN;
+        else {
+                for (size_t i = 5;; i--) {
+                        _cleanup_strv_free_erase_ char **pin = NULL, **pin2 = NULL;
+
+                        if (i <= 0)
+                                return log_error_errno(
+                                                SYNTHETIC_ERRNO(ENOKEY), "Too many attempts, giving up.");
+
+                        r = ask_password_auto(
+                                        "Please enter TPM2 PIN:",
+                                        "drive-harddisk",
+                                        NULL,
+                                        "tpm2-pin",
+                                        "cryptenroll.tpm2-pin",
+                                        USEC_INFINITY,
+                                        0,
+                                        &pin);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to ask for user pin: %m");
+                        assert(strv_length(pin) == 1);
+
+                        r = ask_password_auto(
+                                        "Please enter TPM2 PIN (repeat):",
+                                        "drive-harddisk",
+                                        NULL,
+                                        "tpm2-pin",
+                                        "cryptenroll.tpm2-pin",
+                                        USEC_INFINITY,
+                                        0,
+                                        &pin2);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to ask for user pin: %m");
+                        assert(strv_length(pin2) == 1);
+
+                        if (strv_equal(pin, pin2)) { /* Accept the PIN only if both entries agree */
+                                pin_str = strdup(*pin);
+                                if (!pin_str)
+                                        return log_oom();
+                                flags |= TPM2_FLAGS_USE_PIN;
+                                break;
+                        }
+
+                        log_error("PINs didn't match, please try again!");
+                }
+        }
+
+        *ret_flags = flags;
+        *ret_pin_str = TAKE_PTR(pin_str); /* Ownership of the PIN string moves to the caller */
+
+        return 0;
+}
+
+int enroll_tpm2(struct crypt_device *cd,
+                const void *volume_key,
+                size_t volume_key_size,
+                const char *device,
+                uint32_t seal_key_handle,
+                const char *device_key,
+                Tpm2PCRValue *hash_pcr_values,
+                size_t n_hash_pcr_values,
+                const char *pubkey_path,
+                uint32_t pubkey_pcr_mask,
+                const char *signature_path,
+                bool use_pin,
+                const char *pcrlock_path) {
+
+        _cleanup_(erase_and_freep) void *secret = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *signature_json = NULL;
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+        _cleanup_free_ void *srk_buf = NULL;
+        size_t secret_size, blob_size, pubkey_size = 0, srk_buf_size = 0;
+        _cleanup_free_ void *blob = NULL, *pubkey = NULL;
+        const char *node;
+        _cleanup_(erase_and_freep) char *pin_str = NULL;
+        ssize_t base64_encoded_size;
+        int r, keyslot;
+        TPM2Flags flags = 0;
+        uint8_t binary_salt[SHA256_DIGEST_SIZE] = {};
+        /*
+         * erase the salt, we'd rather attempt to not have this in a coredump
+         * as an attacker would have all the parameters but pin used to create
+         * the session key. This problem goes away when we move to a trusted
+         * primary key, aka the SRK.
+         */
+        CLEANUP_ERASE(binary_salt);
+
+        assert(cd);
+        assert(volume_key);
+        assert(volume_key_size > 0);
+        assert(tpm2_pcr_values_valid(hash_pcr_values, n_hash_pcr_values));
+        assert(TPM2_PCR_MASK_VALID(pubkey_pcr_mask));
+
+        assert_se(node = crypt_get_device_name(cd));
+
+        if (use_pin) {
+                r = get_pin(&pin_str, &flags);
+                if (r < 0)
+                        return r;
+
+                r = crypto_random_bytes(binary_salt, sizeof(binary_salt));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire random salt: %m");
+
+                uint8_t salted_pin[SHA256_DIGEST_SIZE] = {};
+                CLEANUP_ERASE(salted_pin);
+                r = tpm2_util_pbkdf2_hmac_sha256(pin_str, strlen(pin_str), binary_salt, sizeof(binary_salt), salted_pin);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to perform PBKDF2: %m");
+
+                pin_str = erase_and_free(pin_str);
+                /* re-stringify pin_str */
+                base64_encoded_size = base64mem(salted_pin, sizeof(salted_pin), &pin_str);
+                if (base64_encoded_size < 0)
+                        return log_error_errno(base64_encoded_size, "Failed to base64 encode salted pin: %m");
+        }
+
+        TPM2B_PUBLIC public = {};
+        r = tpm2_load_pcr_public_key(pubkey_path, &pubkey, &pubkey_size);
+        if (r < 0) {
+                if (pubkey_path || signature_path || r != -ENOENT)
+                        return log_error_errno(r, "Failed to read TPM PCR public key: %m");
+
+                log_debug_errno(r, "Failed to read TPM2 PCR public key, proceeding without: %m");
+                pubkey_pcr_mask = 0;
+        } else {
+                r = tpm2_tpm2b_public_from_pem(pubkey, pubkey_size, &public);
+                if (r < 0)
+                        return log_error_errno(r, "Could not convert public key to TPM2B_PUBLIC: %m");
+
+                if (signature_path) {
+                        /* Also try to load the signature JSON object, to verify that our enrollment will work.
+                         * This is optional however, skip it if it's not explicitly provided. */
+
+                        r = tpm2_load_pcr_signature(signature_path, &signature_json);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to read TPM PCR signature: %m");
+                }
+        }
+
+        bool any_pcr_value_specified = tpm2_pcr_values_has_any_values(hash_pcr_values, n_hash_pcr_values);
+
+        _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy pcrlock_policy = {};
+        if (pcrlock_path) {
+                r = tpm2_pcrlock_policy_load(pcrlock_path, &pcrlock_policy);
+                if (r < 0)
+                        return r;
+
+                any_pcr_value_specified = true;
+                flags |= TPM2_FLAGS_USE_PCRLOCK;
+        }
+
+        _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL;
+        TPM2B_PUBLIC device_key_public = {};
+        if (device_key) {
+                r = tpm2_load_public_key_file(device_key, &device_key_public);
+                if (r < 0)
+                        return r;
+
+                if (!tpm2_pcr_values_has_all_values(hash_pcr_values, n_hash_pcr_values))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Must provide all PCR values when using TPM2 device key.");
+        } else {
+                r = tpm2_context_new(device, &tpm2_context);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create TPM2 context: %m");
+
+                if (!tpm2_pcr_values_has_all_values(hash_pcr_values, n_hash_pcr_values)) {
+                        r = tpm2_pcr_read_missing_values(tpm2_context, hash_pcr_values, n_hash_pcr_values);
+                        if (r < 0)
+                                return log_error_errno(r, "Could not read pcr values: %m");
+                }
+        }
+
+        uint16_t hash_pcr_bank = 0;
+        uint32_t hash_pcr_mask = 0;
+        if (n_hash_pcr_values > 0) {
+                size_t hash_count;
+                r = tpm2_pcr_values_hash_count(hash_pcr_values, n_hash_pcr_values, &hash_count);
+                if (r < 0)
+                        return log_error_errno(r, "Could not get hash count: %m");
+
+                if (hash_count > 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Multiple PCR banks selected.");
+
+                hash_pcr_bank = hash_pcr_values[0].hash;
+                r = tpm2_pcr_values_to_mask(hash_pcr_values, n_hash_pcr_values, hash_pcr_bank, &hash_pcr_mask);
+                if (r < 0)
+                        return log_error_errno(r, "Could not get hash mask: %m");
+        }
+
+        TPM2B_DIGEST policy = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE);
+        r = tpm2_calculate_sealing_policy(
+                        hash_pcr_values,
+                        n_hash_pcr_values,
+                        pubkey ? &public : NULL,
+                        use_pin,
+                        pcrlock_path ? &pcrlock_policy : NULL,
+                        &policy);
+        if (r < 0)
+                return r;
+
+        if (device_key)
+                r = tpm2_calculate_seal(
+                                seal_key_handle,
+                                &device_key_public,
+                                /* attributes= */ NULL,
+                                /* secret= */ NULL, /* secret_size= */ 0,
+                                &policy,
+                                pin_str,
+                                &secret, &secret_size,
+                                &blob, &blob_size,
+                                &srk_buf, &srk_buf_size);
+        else
+                r = tpm2_seal(tpm2_context,
+                              seal_key_handle,
+                              &policy,
+                              pin_str,
+                              &secret, &secret_size,
+                              &blob, &blob_size,
+                              /* ret_primary_alg= */ NULL,
+                              &srk_buf, &srk_buf_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to seal to TPM2: %m");
+
+        /* Let's see if we already have this specific PCR policy hash enrolled, if so, exit early. */
+        r = search_policy_hash(cd, policy.buffer, policy.size);
+        if (r == -ENOENT)
+                log_debug_errno(r, "PCR policy hash not yet enrolled, enrolling now.");
+        else if (r < 0)
+                return r;
+        else {
+                log_info("This PCR set is already enrolled, executing no operation.");
+                return r; /* return existing keyslot, so that wiping won't kill it */
+        }
+
+        /* If possible, verify the sealed data object. */
+        if ((!pubkey || signature_json) && !any_pcr_value_specified && !device_key) {
+                _cleanup_(erase_and_freep) void *secret2 = NULL;
+                size_t secret2_size;
+
+                log_debug("Unsealing for verification...");
+                r = tpm2_unseal(tpm2_context,
+                                hash_pcr_mask,
+                                hash_pcr_bank,
+                                pubkey, pubkey_size,
+                                pubkey_pcr_mask,
+                                signature_json,
+                                pin_str,
+                                pcrlock_path ? &pcrlock_policy : NULL,
+                                /* primary_alg= */ 0,
+                                blob, blob_size,
+                                policy.buffer, policy.size,
+                                srk_buf, srk_buf_size,
+                                &secret2, &secret2_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to unseal secret using TPM2: %m");
+
+                if (memcmp_nn(secret, secret_size, secret2, secret2_size) != 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM2 seal/unseal verification failed.");
+        }
+
+        /* Let's base64 encode the key to use, for compat with homed (and it's easier to ever type it in by keyboard, if that might end up being necessary). */
+        base64_encoded_size = base64mem(secret, secret_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return log_error_errno(base64_encoded_size, "Failed to base64 encode secret key: %m");
+
+        r = cryptsetup_set_minimal_pbkdf(cd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set minimal PBKDF: %m");
+
+        keyslot = crypt_keyslot_add_by_volume_key(
+                        cd,
+                        CRYPT_ANY_SLOT,
+                        volume_key,
+                        volume_key_size,
+                        base64_encoded,
+                        base64_encoded_size);
+        if (keyslot < 0)
+                return log_error_errno(keyslot, "Failed to add new TPM2 key to %s: %m", node);
+
+        r = tpm2_make_luks2_json(
+                        keyslot,
+                        hash_pcr_mask,
+                        hash_pcr_bank,
+                        pubkey, pubkey_size,
+                        pubkey_pcr_mask,
+                        /* primary_alg= */ 0,
+                        blob, blob_size,
+                        policy.buffer, policy.size,
+                        use_pin ? binary_salt : NULL,
+                        use_pin ? sizeof(binary_salt) : 0,
+                        srk_buf, srk_buf_size,
+                        flags,
+                        &v);
+        if (r < 0)
+                return log_error_errno(r, "Failed to prepare TPM2 JSON token object: %m");
+
+        r = cryptsetup_add_token_json(cd, v);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add TPM2 JSON token to LUKS2 header: %m");
+
+        log_info("New TPM2 token enrolled as key slot %i.", keyslot);
+        return keyslot;
+}
diff --git a/src/cryptenroll/cryptenroll-tpm2.h b/src/cryptenroll/cryptenroll-tpm2.h
new file mode 100644
index 0000000..2fbcdd4
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-tpm2.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "cryptsetup-util.h"
+#include "log.h"
+#include "tpm2-util.h"
+
+#if HAVE_TPM2
+int enroll_tpm2(struct crypt_device *cd, const void *volume_key, size_t volume_key_size, const char *device, uint32_t seal_key_handle, const char *device_key, Tpm2PCRValue *hash_pcrs, size_t n_hash_pcrs, const char *pubkey_path, uint32_t pubkey_pcr_mask, const char *signature_path, bool use_pin, const char *pcrlock_path);
+#else
+/* Stand-in for builds without TPM2 support: takes the same arguments, but only reports EOPNOTSUPP. */
+static inline int enroll_tpm2(struct crypt_device *cd, const void *volume_key, size_t volume_key_size, const char *device, uint32_t seal_key_handle, const char *device_key, Tpm2PCRValue *hash_pcrs, size_t n_hash_pcrs, const char *pubkey_path, uint32_t pubkey_pcr_mask, const char *signature_path, bool use_pin, const char *pcrlock_path) {
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 key enrollment not supported.");
+}
+#endif
diff --git a/src/cryptenroll/cryptenroll-wipe.c b/src/cryptenroll/cryptenroll-wipe.c
new file mode 100644
index 0000000..314ebd3
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-wipe.c
@@ -0,0 +1,445 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cryptenroll-wipe.h"
+#include "cryptenroll.h"
+#include "json.h"
+#include "memory-util.h"
+#include "parse-util.h"
+#include "set.h"
+#include "sort-util.h"
+
+static int find_all_slots(struct crypt_device *cd, Set *wipe_slots, Set *keep_slots) {
+        int n_slots;
+
+        /* Adds every slot that is currently in use to 'wipe_slots', unless it is listed in 'keep_slots'. */
+
+        assert(cd);
+        assert(wipe_slots);
+        assert_se((n_slots = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+
+        for (int idx = 0; idx < n_slots; idx++) {
+                void *key = INT_TO_PTR(idx);
+                crypt_keyslot_info state;
+
+                /* A decision was already made for this slot (either way), hence nothing to look up. */
+                if (set_contains(wipe_slots, key) || set_contains(keep_slots, key))
+                        continue;
+
+                state = crypt_keyslot_status(cd, idx);
+                if (state != CRYPT_SLOT_ACTIVE && state != CRYPT_SLOT_ACTIVE_LAST)
+                        continue; /* Not in use */
+
+                if (set_put(wipe_slots, key) < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+static int find_empty_passphrase_slots(struct crypt_device *cd, Set *wipe_slots, Set *keep_slots) {
+        size_t key_size;
+        int n_slots, r;
+
+        /* Tries the empty passphrase (i.e. "") on every slot that is in use, and adds the slots it unlocks
+         * to 'wipe_slots'. Slots already listed in 'keep_slots' or 'wipe_slots' are not tried. */
+
+        assert(cd);
+        assert(wipe_slots);
+        assert_se((n_slots = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+
+        r = crypt_get_volume_key_size(cd);
+        if (r <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine LUKS volume key size");
+        key_size = (size_t) r;
+
+        for (int idx = 0; idx < n_slots; idx++) {
+                _cleanup_(erase_and_freep) char *buf = NULL;
+                void *key = INT_TO_PTR(idx);
+                crypt_keyslot_info state;
+
+                /* Already decided on this slot, one way or the other? Then there's nothing to try. */
+                if (set_contains(wipe_slots, key) || set_contains(keep_slots, key))
+                        continue;
+
+                state = crypt_keyslot_status(cd, idx);
+                if (state != CRYPT_SLOT_ACTIVE && state != CRYPT_SLOT_ACTIVE_LAST)
+                        continue;
+
+                buf = malloc(key_size);
+                if (!buf)
+                        return log_oom();
+
+                r = crypt_volume_key_get(cd, idx, buf, &key_size, "", 0);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to acquire volume key from slot %i with empty password, ignoring: %m", idx);
+                        continue;
+                }
+
+                if (set_put(wipe_slots, INT_TO_PTR(r)) < 0) /* i.e. what crypt_volume_key_get() returned */
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+static int find_slots_by_mask(
+                struct crypt_device *cd,
+                Set *wipe_slots,
+                Set *keep_slots,
+                unsigned by_mask) {
+
+        _cleanup_set_free_ Set *token_slots = NULL;
+        bool wipe_passwords;
+        int r, n_slots;
+
+        /* Adds the slots to 'wipe_slots' whose enrollment type is selected in 'by_mask', a bit mask with
+         * one bit per EnrollType. ENROLL_PASSWORD selects the in-use slots that no token refers to. */
+
+        assert(cd);
+        assert(wipe_slots);
+
+        if (by_mask == 0)
+                return 0;
+
+        wipe_passwords = (by_mask & (1U << ENROLL_PASSWORD)) != 0;
+
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *json = NULL;
+                JsonVariant *field, *entry;
+                EnrollType type;
+
+                r = cryptsetup_get_token_as_json(cd, token, NULL, &json);
+                if (r == -ENOENT || r == -EINVAL)
+                        continue;
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to read JSON token data off disk, ignoring: %m");
+                        continue;
+                }
+
+                field = json_variant_by_key(json, "type");
+                if (!field || !json_variant_is_string(field)) {
+                        log_warning("Token JSON data lacks type field, ignoring.");
+                        continue;
+                }
+
+                type = luks2_token_type_from_string(json_variant_string(field));
+
+                field = json_variant_by_key(json, "keyslots");
+                if (!field || !json_variant_is_array(field)) {
+                        log_warning("Token JSON data lacks keyslots field, ignoring.");
+                        continue;
+                }
+
+                JSON_VARIANT_ARRAY_FOREACH(entry, field) {
+                        int idx;
+
+                        if (!json_variant_is_string(entry)) {
+                                log_warning("Token JSON data's keyslot field is not an array of strings, ignoring.");
+                                continue;
+                        }
+
+                        r = safe_atoi(json_variant_string(entry), &idx);
+                        if (r < 0) {
+                                log_warning_errno(r, "Token JSON data's keyslot filed is not an integer formatted as string, ignoring.");
+                                continue;
+                        }
+
+                        if (type >= 0 && (by_mask & (1U << type)) != 0) {
+                                /* The type of this token is selected, hence wipe the slots it lists */
+                                if (set_put(wipe_slots, INT_TO_PTR(idx)) < 0)
+                                        return log_oom();
+                        } else if (wipe_passwords) {
+                                /* Plain passwords shall be removed: remember all slots that any token
+                                 * refers to, since those are *NOT* plain passwords */
+                                if (set_ensure_allocated(&token_slots, NULL) < 0)
+                                        return log_oom();
+
+                                if (set_put(token_slots, INT_TO_PTR(idx)) < 0)
+                                        return log_oom();
+                        }
+                }
+        }
+
+        /* "password" slots are those which have no token assigned. Unless we shall remove those we are
+         * done; otherwise go through all slots and mark those for wiping that no token referred to. */
+        if (!wipe_passwords)
+                return 0;
+
+        assert_se((n_slots = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+
+        for (int idx = 0; idx < n_slots; idx++) {
+                void *key = INT_TO_PTR(idx);
+                crypt_keyslot_info state;
+
+                /* Skip slots we already decided on, and slots a token refers to (those are no passwords). */
+                if (set_contains(wipe_slots, key) || set_contains(keep_slots, key) ||
+                    set_contains(token_slots, key))
+                        continue;
+
+                state = crypt_keyslot_status(cd, idx);
+                if (state != CRYPT_SLOT_ACTIVE && state != CRYPT_SLOT_ACTIVE_LAST) /* Not in use? */
+                        continue;
+
+                /* An in-use slot without any token, i.e. a password: add it to the slots to wipe */
+                if (set_put(wipe_slots, key) < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+static int find_slot_tokens(struct crypt_device *cd, Set *wipe_slots, Set *keep_slots, Set *wipe_tokens) {
+        int r;
+
+        /* Determines the tokens that refer to slots we are going to wipe and records them in 'wipe_tokens',
+         * so that they can be removed too. The slot sets are updated on the way: all slots a token lists
+         * share the fate of that token, i.e. end up in either 'wipe_slots' or 'keep_slots'. */
+
+        assert(cd);
+        assert(wipe_slots);
+        assert(keep_slots);
+        assert(wipe_tokens);
+
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *json = NULL;
+                JsonVariant *slots, *entry;
+                bool wipe_token = false;
+                Set *target;
+
+                r = cryptsetup_get_token_as_json(cd, token, NULL, &json);
+                if (r == -ENOENT || r == -EINVAL)
+                        continue;
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to read JSON token data off disk, ignoring: %m");
+                        continue;
+                }
+
+                slots = json_variant_by_key(json, "keyslots");
+                if (!slots || !json_variant_is_array(slots)) {
+                        log_warning("Token JSON data lacks keyslots field, ignoring.");
+                        continue;
+                }
+
+                /* First pass over the token's slots: a single slot we shall keep means the token stays. */
+                JSON_VARIANT_ARRAY_FOREACH(entry, slots) {
+                        int idx;
+
+                        if (!json_variant_is_string(entry)) {
+                                log_warning("Token JSON data's keyslot field is not an array of strings, ignoring.");
+                                continue;
+                        }
+
+                        r = safe_atoi(json_variant_string(entry), &idx);
+                        if (r < 0) {
+                                log_warning_errno(r, "Token JSON data's keyslot filed is not an integer formatted as string, ignoring.");
+                                continue;
+                        }
+
+                        if (set_contains(keep_slots, INT_TO_PTR(idx))) {
+                                wipe_token = false;
+                                break; /* Keeping this slot is definite, and so is keeping its token */
+                        }
+
+                        /* A slot we shall wipe drags its token along. Don't stop here though: one of
+                         * the remaining slots might be one we have to keep, which reverses the
+                         * decision again (see above). */
+                        if (set_contains(wipe_slots, INT_TO_PTR(idx)))
+                                wipe_token = true;
+                }
+
+                /* Second pass: file all slots of the token under the set that matches the decision. */
+                target = wipe_token ? wipe_slots : keep_slots;
+                JSON_VARIANT_ARRAY_FOREACH(entry, slots) {
+                        int idx;
+
+                        if (!json_variant_is_string(entry) || safe_atoi(json_variant_string(entry), &idx) < 0)
+                                continue;
+
+                        if (set_put(target, INT_TO_PTR(idx)) < 0)
+                                return log_oom();
+                }
+
+                /* Finally, remember the token itself if it is to be removed. */
+                if (wipe_token && set_put(wipe_tokens, INT_TO_PTR(token)) < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+static bool slots_remain(struct crypt_device *cd, Set *wipe_slots, Set *keep_slots) {
+        int n_slots;
+
+        /* Tells whether at least one in-use slot would be left in the LUKS2 header once all slots in
+         * 'wipe_slots' are removed, with the exception of those that are also listed in 'keep_slots'. */
+
+        assert(cd);
+        assert_se((n_slots = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+
+        for (int idx = 0; idx < n_slots; idx++) {
+                void *key = INT_TO_PTR(idx);
+                crypt_keyslot_info state;
+
+                state = crypt_keyslot_status(cd, idx);
+                if (state != CRYPT_SLOT_ACTIVE && state != CRYPT_SLOT_ACTIVE_LAST)
+                        continue;
+
+                /* A slot survives if it isn't marked for wiping, or if it is marked for keeping: "keep"
+                 * wins over "wipe". That matters when a new slot is added and all others of the same
+                 * type are removed, as we of course don't want to remove what was just added. */
+                if (!set_contains(wipe_slots, key) || set_contains(keep_slots, key))
+                        return true;
+        }
+
+        return false;
+}
+
+int wipe_slots(struct crypt_device *cd,
+               const int explicit_slots[],
+               size_t n_explicit_slots,
+               WipeScope by_scope,
+               unsigned by_mask,
+               int except_slot) {
+
+        _cleanup_set_free_ Set *wipe_slots = NULL, *wipe_tokens = NULL, *keep_slots = NULL;
+        _cleanup_free_ int *ordered_slots = NULL, *ordered_tokens = NULL;
+        size_t n_ordered_slots = 0, n_ordered_tokens = 0;
+        int r, slot_max, ret;
+        void *e;
+
+        assert_se(cd);
+
+        /* Shortcut if nothing to wipe. */
+        if (n_explicit_slots == 0 && by_mask == 0 && by_scope == WIPE_EXPLICIT)
+                return 0;
+
+        /* Wipes key slots and the tokens referring to them. Slots can be selected along three axes (which
+         * may be combined freely):
+         *
+         *    1. By slot index ('explicit_slots', 'n_explicit_slots' entries)
+         *    2. By token type ('by_mask', one bit per EnrollType)
+         *    3. "All" slots, or slots with an empty password, i.e. "" ('by_scope')
+         *
+         * Slot 'except_slot' (if non-negative) is never wiped, so that a freshly enrolled slot survives.
+         * Refuses to operate if no active slot would remain. Returns 0 on success or if there is nothing
+         * to do, otherwise a negative error; wiping continues past failures and the first one is returned.
+         */
+
+        wipe_slots = set_new(NULL);
+        keep_slots = set_new(NULL);
+        wipe_tokens = set_new(NULL);
+        if (!wipe_slots || !keep_slots || !wipe_tokens)
+                return log_oom();
+
+        /* Let's maintain one set of slots for the slots we definitely want to keep */
+        if (except_slot >= 0)
+                if (set_put(keep_slots, INT_TO_PTR(except_slot)) < 0)
+                        return log_oom();
+
+        assert_se((slot_max = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+
+        /* Maintain another set of the slots we intend to wipe (refusing indexes outside of 0…slot_max-1) */
+        for (size_t i = 0; i < n_explicit_slots; i++) {
+                if (explicit_slots[i] < 0 || explicit_slots[i] >= slot_max)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Slot index %i out of range.", explicit_slots[i]);
+
+                if (set_put(wipe_slots, INT_TO_PTR(explicit_slots[i])) < 0)
+                        return log_oom();
+        }
+
+        /* Now, handle the "all" and "empty passphrase" cases. */
+        switch (by_scope) {
+
+        case WIPE_EXPLICIT:
+                break; /* Nothing to do here */
+
+        case WIPE_ALL:
+                r = find_all_slots(cd, wipe_slots, keep_slots);
+                if (r < 0)
+                        return r;
+
+                break;
+
+        case WIPE_EMPTY_PASSPHRASE:
+                r = find_empty_passphrase_slots(cd, wipe_slots, keep_slots);
+                if (r < 0)
+                        return r;
+
+                break;
+        default:
+                assert_not_reached();
+        }
+
+        /* Then add all slots that match a token type */
+        r = find_slots_by_mask(cd, wipe_slots, keep_slots, by_mask);
+        if (r < 0)
+                return r;
+
+        /* And determine tokens that we shall remove */
+        r = find_slot_tokens(cd, wipe_slots, keep_slots, wipe_tokens);
+        if (r < 0)
+                return r;
+
+        /* Safety check: let's make sure that after we are done there's at least one slot remaining */
+        if (!slots_remain(cd, wipe_slots, keep_slots))
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Wipe operation would leave no valid slots around, can't allow that, sorry.");
+
+        /* Generate ordered lists of the slots and the tokens to remove ("keep" wins over "wipe") */
+        ordered_slots = new(int, set_size(wipe_slots));
+        if (!ordered_slots)
+                return log_oom();
+        SET_FOREACH(e, wipe_slots) {
+                int slot = PTR_TO_INT(e);
+
+                if (set_contains(keep_slots, INT_TO_PTR(slot)))
+                        continue;
+
+                ordered_slots[n_ordered_slots++] = slot;
+        }
+        typesafe_qsort(ordered_slots, n_ordered_slots, cmp_int);
+
+        ordered_tokens = new(int, set_size(wipe_tokens));
+        if (!ordered_tokens)
+                return log_oom();
+        SET_FOREACH(e, wipe_tokens)
+                ordered_tokens[n_ordered_tokens++] = PTR_TO_INT(e);
+        typesafe_qsort(ordered_tokens, n_ordered_tokens, cmp_int);
+
+        if (n_ordered_slots == 0 && n_ordered_tokens == 0) {
+                log_full(except_slot < 0 ? LOG_NOTICE : LOG_DEBUG,
+                         "No slots to remove selected.");
+                return 0;
+        }
+
+        if (DEBUG_LOGGING) {
+                for (size_t i = 0; i < n_ordered_slots; i++)
+                        log_debug("Going to wipe slot %i.", ordered_slots[i]);
+                for (size_t i = 0; i < n_ordered_tokens; i++)
+                        log_debug("Going to wipe token %i.", ordered_tokens[i]);
+        }
+
+        /* Now, let's actually start wiping things. (We go from back to front, to make space at the end
+         * first.) Failures are logged and remembered, but do not stop us from wiping the rest. */
+        ret = 0;
+        for (size_t i = n_ordered_slots; i > 0; i--) {
+                r = crypt_keyslot_destroy(cd, ordered_slots[i - 1]);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to wipe slot %i, continuing: %m", ordered_slots[i - 1]);
+                        if (ret == 0)
+                                ret = r;
+                } else
+                        log_info("Wiped slot %i.", ordered_slots[i - 1]);
+        }
+
+        for (size_t i = n_ordered_tokens; i > 0; i--) {
+                r = crypt_token_json_set(cd, ordered_tokens[i - 1], NULL);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to wipe token %i, continuing: %m", ordered_tokens[i - 1]);
+                        if (ret == 0)
+                                ret = r;
+                }
+        }
+
+        return ret;
+}
diff --git a/src/cryptenroll/cryptenroll-wipe.h b/src/cryptenroll/cryptenroll-wipe.h
new file mode 100644
index 0000000..5bcd783
--- /dev/null
+++ b/src/cryptenroll/cryptenroll-wipe.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "cryptenroll.h"
+#include "cryptsetup-util.h"
+
+/* Wipes the key slots selected by index, scope and/or token type mask, along with the tokens referring to
+ * them. 'except_slot' (if non-negative) is never wiped. Implemented in cryptenroll-wipe.c. */
+int wipe_slots(struct crypt_device *cd,
+               const int explicit_slots[], size_t n_explicit_slots,
+               WipeScope by_scope, unsigned by_mask,
+               int except_slot);
diff --git a/src/cryptenroll/cryptenroll.c b/src/cryptenroll/cryptenroll.c
new file mode 100644
index 0000000..1cb6652
--- /dev/null
+++ b/src/cryptenroll/cryptenroll.c
@@ -0,0 +1,762 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "ask-password-api.h"
+#include "build.h"
+#include "cryptenroll-fido2.h"
+#include "cryptenroll-list.h"
+#include "cryptenroll-password.h"
+#include "cryptenroll-pkcs11.h"
+#include "cryptenroll-recovery.h"
+#include "cryptenroll-tpm2.h"
+#include "cryptenroll-wipe.h"
+#include "cryptenroll.h"
+#include "cryptsetup-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "libfido2-util.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pkcs11-util.h"
+#include "pretty-print.h"
+#include "string-table.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tpm2-pcr.h"
+
+static EnrollType arg_enroll_type = _ENROLL_TYPE_INVALID; /* invalid until a type is selected — presumably by parse_argv(); verify */
+static char *arg_unlock_keyfile = NULL;
+static UnlockType arg_unlock_type = UNLOCK_PASSWORD;
+static char *arg_unlock_fido2_device = NULL;
+static char *arg_pkcs11_token_uri = NULL;
+static char *arg_fido2_device = NULL;
+static char *arg_tpm2_device = NULL;
+static uint32_t arg_tpm2_seal_key_handle = 0;
+static char *arg_tpm2_device_key = NULL;
+static Tpm2PCRValue *arg_tpm2_hash_pcr_values = NULL;
+static size_t arg_tpm2_n_hash_pcr_values = 0;
+static bool arg_tpm2_pin = false;
+static char *arg_tpm2_public_key = NULL;
+static uint32_t arg_tpm2_public_key_pcr_mask = 0;
+static char *arg_tpm2_signature = NULL;
+static char *arg_tpm2_pcrlock = NULL;
+static char *arg_node = NULL;
+static int *arg_wipe_slots = NULL; /* presumably an array of arg_n_wipe_slots slot indexes from --wipe-slot= — TODO confirm */
+static size_t arg_n_wipe_slots = 0;
+static WipeScope arg_wipe_slots_scope = WIPE_EXPLICIT;
+static unsigned arg_wipe_slots_mask = 0; /* Bitmask of (1U << EnrollType), for wiping all slots of specific types */
+static Fido2EnrollFlags arg_fido2_lock_with = FIDO2ENROLL_PIN | FIDO2ENROLL_UP;
+#if HAVE_LIBFIDO2
+static int arg_fido2_cred_alg = COSE_ES256; /* COSE_ES256 is only referenced when built with libfido2 */
+#else /* !HAVE_LIBFIDO2 */
+static int arg_fido2_cred_alg = 0;
+#endif /* HAVE_LIBFIDO2 */
+
+assert_cc(sizeof(arg_wipe_slots_mask) * 8 >= _ENROLL_TYPE_MAX); /* the mask needs one bit per EnrollType */
+/* NOTE(review): presumably these release the heap-allocated option values above at exit — confirm against the macro's definition */
+STATIC_DESTRUCTOR_REGISTER(arg_unlock_keyfile, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_unlock_fido2_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_pkcs11_token_uri, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_fido2_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device_key, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_hash_pcr_values, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_public_key, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_signature, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_pcrlock, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_node, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_wipe_slots, freep);
+
+static bool wipe_requested(void) {
+        /* A wipe is requested once any selector (slot list, scope, type mask) deviates from its default. */
+        bool listed_or_scoped = arg_n_wipe_slots > 0 || arg_wipe_slots_scope != WIPE_EXPLICIT;
+        return listed_or_scoped || arg_wipe_slots_mask != 0;
+}
+
+static const char* const enroll_type_table[_ENROLL_TYPE_MAX] = { /* name of each EnrollType, indexed by enum value */
+        [ENROLL_PASSWORD] = "password",
+        [ENROLL_RECOVERY] = "recovery",
+        [ENROLL_PKCS11]   = "pkcs11",
+        [ENROLL_FIDO2]    = "fido2",
+        [ENROLL_TPM2]     = "tpm2",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(enroll_type, EnrollType); /* presumably defines enroll_type_{to,from}_string() — verify */
+
+static const char *const luks2_token_type_table[_ENROLL_TYPE_MAX] = { /* LUKS2 token "type" string per EnrollType */
+        /* ENROLL_PASSWORD has no entry here, as slots of this type do not have a token in the LUKS2 header */
+        [ENROLL_RECOVERY] = "systemd-recovery",
+        [ENROLL_PKCS11]   = "systemd-pkcs11",
+        [ENROLL_FIDO2]    = "systemd-fido2",
+        [ENROLL_TPM2]     = "systemd-tpm2",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(luks2_token_type, EnrollType); /* presumably what defines luks2_token_type_from_string(), which find_slots_by_mask() calls */
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout. Returns 0, or the log_oom() result if the man page link can't be built. */
+        r = terminal_urlify_man("systemd-cryptenroll", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%1$s [OPTIONS...] BLOCK-DEVICE\n\n"
+               "%5$sEnroll a security token or authentication credential to a LUKS volume.%6$s\n\n"
+               "  -h --help            Show this help\n"
+               "     --version         Show package version\n"
+               "     --wipe-slot=SLOT1,SLOT2,…\n"
+               "                       Wipe specified slots\n"
+               "\n%3$sUnlocking:%4$s\n"
+               "     --unlock-key-file=PATH\n"
+               "                       Use a file to unlock the volume\n"
+               "     --unlock-fido2-device=PATH\n"
+               "                       Use a FIDO2 device to unlock the volume\n"
+               "\n%3$sSimple Enrollment:%4$s\n"
+               "     --password        Enroll a user-supplied password\n"
+               "     --recovery-key    Enroll a recovery key\n"
+               "\n%3$sPKCS11 Enrollment:%4$s\n"
+               "     --pkcs11-token-uri=URI\n"
+               "                       Specify PKCS#11 security token URI\n"
+               "\n%3$sFIDO2 Enrollment:%4$s\n"
+               "     --fido2-device=PATH\n"
+               "                       Enroll a FIDO2-HMAC security token\n"
+               "     --fido2-credential-algorithm=STRING\n"
+               "                       Specify COSE algorithm for FIDO2 credential\n"
+               "     --fido2-with-client-pin=BOOL\n"
+               "                       Whether to require entering a PIN to unlock the volume\n"
+               "     --fido2-with-user-presence=BOOL\n"
+               "                       Whether to require user presence to unlock the volume\n"
+               "     --fido2-with-user-verification=BOOL\n"
+               "                       Whether to require user verification to unlock the volume\n"
+               "\n%3$sTPM2 Enrollment:%4$s\n"
+               "     --tpm2-device=PATH\n"
+               "                       Enroll a TPM2 device\n"
+               "     --tpm2-device-key=PATH\n"
+               "                       Enroll a TPM2 device using its public key\n"
+               "     --tpm2-seal-key-handle=HANDLE\n"
+               "                       Specify handle of key to use for sealing\n"
+               "     --tpm2-pcrs=PCR1+PCR2+PCR3+…\n"
+               "                       Specify TPM2 PCRs to seal against\n"
+               "     --tpm2-public-key=PATH\n"
+               "                       Enroll signed TPM2 PCR policy against PEM public key\n"
+               "     --tpm2-public-key-pcrs=PCR1+PCR2+PCR3+…\n"
+               "                       Enroll signed TPM2 PCR policy for specified TPM2 PCRs\n"
+               "     --tpm2-signature=PATH\n"
+               "                       Validate public key enrollment works with JSON signature\n"
+               "                       file\n"
+               "     --tpm2-pcrlock=PATH\n"
+               "                       Specify pcrlock policy to lock against\n"
+               "     --tpm2-with-pin=BOOL\n"
+               "                       Whether to require entering a PIN to unlock the volume\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name, /* %1$s: name the program was invoked as */
+               link,                          /* %2$s: man page reference built above */
+               ansi_underline(),              /* %3$s: opens each section heading */
+               ansi_normal(),                 /* %4$s: closes each section heading */
+               ansi_highlight(),              /* %5$s: opens the one-line summary */
+               ansi_normal());                /* %6$s: closes the one-line summary */
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) { /* Fills in arg_*; run() only proceeds if this returns > 0 */
+        enum { /* ids for long-only options; 0x100 and up, i.e. above any option character */
+                ARG_VERSION = 0x100,
+                ARG_PASSWORD,
+                ARG_RECOVERY_KEY,
+                ARG_UNLOCK_KEYFILE,
+                ARG_UNLOCK_FIDO2_DEVICE,
+                ARG_PKCS11_TOKEN_URI,
+                ARG_FIDO2_DEVICE,
+                ARG_TPM2_DEVICE,
+                ARG_TPM2_DEVICE_KEY,
+                ARG_TPM2_SEAL_KEY_HANDLE,
+                ARG_TPM2_PCRS,
+                ARG_TPM2_PUBLIC_KEY,
+                ARG_TPM2_PUBLIC_KEY_PCRS,
+                ARG_TPM2_SIGNATURE,
+                ARG_TPM2_PCRLOCK,
+                ARG_TPM2_WITH_PIN,
+                ARG_WIPE_SLOT,
+                ARG_FIDO2_WITH_PIN,
+                ARG_FIDO2_WITH_UP,
+                ARG_FIDO2_WITH_UV,
+                ARG_FIDO2_CRED_ALG,
+        };
+
+        static const struct option options[] = {
+                { "help",                         no_argument,       NULL, 'h'                       },
+                { "version",                      no_argument,       NULL, ARG_VERSION               },
+                { "password",                     no_argument,       NULL, ARG_PASSWORD              },
+                { "recovery-key",                 no_argument,       NULL, ARG_RECOVERY_KEY          },
+                { "unlock-key-file",              required_argument, NULL, ARG_UNLOCK_KEYFILE        },
+                { "unlock-fido2-device",          required_argument, NULL, ARG_UNLOCK_FIDO2_DEVICE   },
+                { "pkcs11-token-uri",             required_argument, NULL, ARG_PKCS11_TOKEN_URI      },
+                { "fido2-credential-algorithm",   required_argument, NULL, ARG_FIDO2_CRED_ALG        },
+                { "fido2-device",                 required_argument, NULL, ARG_FIDO2_DEVICE          },
+                { "fido2-with-client-pin",        required_argument, NULL, ARG_FIDO2_WITH_PIN        },
+                { "fido2-with-user-presence",     required_argument, NULL, ARG_FIDO2_WITH_UP         },
+                { "fido2-with-user-verification", required_argument, NULL, ARG_FIDO2_WITH_UV         },
+                { "tpm2-device",                  required_argument, NULL, ARG_TPM2_DEVICE           },
+                { "tpm2-device-key",              required_argument, NULL, ARG_TPM2_DEVICE_KEY       },
+                { "tpm2-seal-key-handle",         required_argument, NULL, ARG_TPM2_SEAL_KEY_HANDLE  },
+                { "tpm2-pcrs",                    required_argument, NULL, ARG_TPM2_PCRS             },
+                { "tpm2-public-key",              required_argument, NULL, ARG_TPM2_PUBLIC_KEY       },
+                { "tpm2-public-key-pcrs",         required_argument, NULL, ARG_TPM2_PUBLIC_KEY_PCRS  },
+                { "tpm2-signature",               required_argument, NULL, ARG_TPM2_SIGNATURE        },
+                { "tpm2-pcrlock",                 required_argument, NULL, ARG_TPM2_PCRLOCK          },
+                { "tpm2-with-pin",                required_argument, NULL, ARG_TPM2_WITH_PIN         },
+                { "wipe-slot",                    required_argument, NULL, ARG_WIPE_SLOT             },
+                {}
+        };
+
+        bool auto_hash_pcr_values = true, auto_public_key_pcr_mask = true, auto_pcrlock = true;
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_FIDO2_WITH_PIN:
+                        r = parse_boolean_argument("--fido2-with-client-pin=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_fido2_lock_with, FIDO2ENROLL_PIN, r);
+                        break;
+
+                case ARG_FIDO2_WITH_UP:
+                        r = parse_boolean_argument("--fido2-with-user-presence=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_fido2_lock_with, FIDO2ENROLL_UP, r);
+                        break;
+
+                case ARG_FIDO2_WITH_UV:
+                        r = parse_boolean_argument("--fido2-with-user-verification=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_fido2_lock_with, FIDO2ENROLL_UV, r);
+                        break;
+
+                case ARG_PASSWORD:
+                        if (arg_enroll_type >= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple operations specified at once, refusing.");
+
+                        arg_enroll_type = ENROLL_PASSWORD;
+                        break;
+
+                case ARG_RECOVERY_KEY:
+                        if (arg_enroll_type >= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple operations specified at once, refusing.");
+
+                        arg_enroll_type = ENROLL_RECOVERY;
+                        break;
+
+                case ARG_UNLOCK_KEYFILE:
+                        if (arg_unlock_type != UNLOCK_PASSWORD)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple unlock methods specified at once, refusing.");
+
+                        r = parse_path_argument(optarg, /* suppress_root= */ true, &arg_unlock_keyfile);
+                        if (r < 0)
+                                return r;
+
+                        arg_unlock_type = UNLOCK_KEYFILE;
+                        break;
+
+                case ARG_UNLOCK_FIDO2_DEVICE: {
+                        _cleanup_free_ char *device = NULL;
+
+                        if (arg_unlock_type != UNLOCK_PASSWORD)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple unlock methods specified at once, refusing.");
+
+                        assert(!arg_unlock_fido2_device);
+
+                        if (!streq(optarg, "auto")) { /* "auto" leaves the device path NULL */
+                                device = strdup(optarg);
+                                if (!device)
+                                        return log_oom();
+                        }
+
+                        arg_unlock_type = UNLOCK_FIDO2;
+                        arg_unlock_fido2_device = TAKE_PTR(device);
+                        break;
+                }
+
+                case ARG_PKCS11_TOKEN_URI: {
+                        _cleanup_free_ char *uri = NULL;
+
+                        if (streq(optarg, "list"))
+                                return pkcs11_list_tokens();
+
+                        if (arg_enroll_type >= 0 || arg_pkcs11_token_uri)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple operations specified at once, refusing.");
+
+                        if (streq(optarg, "auto")) {
+                                r = pkcs11_find_token_auto(&uri);
+                                if (r < 0)
+                                        return r;
+                        } else {
+                                if (!pkcs11_uri_valid(optarg))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Not a valid PKCS#11 URI: %s", optarg);
+
+                                uri = strdup(optarg);
+                                if (!uri)
+                                        return log_oom();
+                        }
+
+                        arg_enroll_type = ENROLL_PKCS11;
+                        arg_pkcs11_token_uri = TAKE_PTR(uri);
+                        break;
+                }
+
+                case ARG_FIDO2_CRED_ALG:
+                        r = parse_fido2_algorithm(optarg, &arg_fido2_cred_alg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse COSE algorithm: %s", optarg);
+                        break;
+
+                case ARG_FIDO2_DEVICE: {
+                        _cleanup_free_ char *device = NULL;
+
+                        if (streq(optarg, "list"))
+                                return fido2_list_devices();
+
+                        if (arg_enroll_type >= 0 || arg_fido2_device)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple operations specified at once, refusing.");
+
+                        if (!streq(optarg, "auto")) {
+                                device = strdup(optarg);
+                                if (!device)
+                                        return log_oom();
+                        }
+
+                        arg_enroll_type = ENROLL_FIDO2;
+                        arg_fido2_device = TAKE_PTR(device);
+                        break;
+                }
+
+                case ARG_TPM2_DEVICE: {
+                        _cleanup_free_ char *device = NULL;
+
+                        if (streq(optarg, "list"))
+                                return tpm2_list_devices();
+
+                        if (arg_enroll_type >= 0 || arg_tpm2_device)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple operations specified at once, refusing.");
+
+                        if (!streq(optarg, "auto")) {
+                                device = strdup(optarg);
+                                if (!device)
+                                        return log_oom();
+                        }
+
+                        arg_enroll_type = ENROLL_TPM2;
+                        arg_tpm2_device = TAKE_PTR(device);
+                        break;
+                }
+
+                case ARG_TPM2_DEVICE_KEY:
+                        if (arg_enroll_type >= 0 || arg_tpm2_device_key)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Multiple operations specified at once, refusing.");
+
+                        /* Like --tpm2-device=, giving a device key selects TPM2 enrollment */
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_device_key);
+                        if (r < 0)
+                                return r;
+
+                        arg_enroll_type = ENROLL_TPM2;
+                        break;
+
+                case ARG_TPM2_SEAL_KEY_HANDLE:
+                        r = safe_atou32_full(optarg, 16, &arg_tpm2_seal_key_handle);
+                        if (r < 0)
+                                return log_error_errno(r, "Could not parse TPM2 seal key handle index '%s': %m", optarg);
+
+                        break;
+
+                case ARG_TPM2_PCRS:
+                        auto_hash_pcr_values = false;
+                        r = tpm2_parse_pcr_argument_append(optarg, &arg_tpm2_hash_pcr_values, &arg_tpm2_n_hash_pcr_values);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_PUBLIC_KEY:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_public_key);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_PUBLIC_KEY_PCRS:
+                        auto_public_key_pcr_mask = false;
+                        r = tpm2_parse_pcr_argument_to_mask(optarg, &arg_tpm2_public_key_pcr_mask);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_SIGNATURE:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_signature);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_TPM2_PCRLOCK:
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_pcrlock);
+                        if (r < 0)
+                                return r;
+
+                        auto_pcrlock = false;
+                        break;
+
+                case ARG_TPM2_WITH_PIN:
+                        r = parse_boolean_argument("--tpm2-with-pin=", optarg, &arg_tpm2_pin);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_WIPE_SLOT: {
+                        const char *p = optarg;
+
+                        if (isempty(optarg)) { /* empty argument: reset type mask and scope (index list is kept) */
+                                arg_wipe_slots_mask = 0;
+                                arg_wipe_slots_scope = WIPE_EXPLICIT;
+                                break;
+                        }
+
+                        for (;;) {
+                                _cleanup_free_ char *slot = NULL;
+                                unsigned n;
+
+                                r = extract_first_word(&p, &slot, ",", EXTRACT_DONT_COALESCE_SEPARATORS);
+                                if (r == 0)
+                                        break;
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse slot list: %s", optarg);
+
+                                if (streq(slot, "all"))
+                                        arg_wipe_slots_scope = WIPE_ALL;
+                                else if (streq(slot, "empty")) {
+                                        if (arg_wipe_slots_scope != WIPE_ALL) /* if "all" was specified before, that wins */
+                                                arg_wipe_slots_scope = WIPE_EMPTY_PASSPHRASE;
+                                } else if (streq(slot, "password"))
+                                        arg_wipe_slots_mask |= 1U << ENROLL_PASSWORD;
+                                else if (streq(slot, "recovery"))
+                                        arg_wipe_slots_mask |= 1U << ENROLL_RECOVERY;
+                                else if (streq(slot, "pkcs11"))
+                                        arg_wipe_slots_mask |= 1U << ENROLL_PKCS11;
+                                else if (streq(slot, "fido2"))
+                                        arg_wipe_slots_mask |= 1U << ENROLL_FIDO2;
+                                else if (streq(slot, "tpm2"))
+                                        arg_wipe_slots_mask |= 1U << ENROLL_TPM2;
+                                else {
+                                        int *a;
+
+                                        r = safe_atou(slot, &n);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to parse slot index: %s", slot);
+                                        if (n > INT_MAX)
+                                                return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Slot index out of range: %u", n);
+
+                                        a = reallocarray(arg_wipe_slots, arg_n_wipe_slots + 1, sizeof(int));
+                                        if (!a)
+                                                return log_oom();
+
+                                        arg_wipe_slots = a;
+                                        arg_wipe_slots[arg_n_wipe_slots++] = (int) n;
+                                }
+                        }
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        if (optind >= argc)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "No block device node specified, refusing.");
+
+        if (argc > optind+1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Too many arguments, refusing.");
+
+        r = parse_path_argument(argv[optind], false, &arg_node);
+        if (r < 0)
+                return r;
+
+        if (arg_enroll_type == ENROLL_FIDO2) {
+
+                if (arg_unlock_type == UNLOCK_FIDO2 && !(arg_fido2_device && arg_unlock_fido2_device))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "When both enrolling and unlocking with FIDO2 tokens, automatic discovery is unsupported. "
+                                               "Please specify device paths for enrolling and unlocking respectively.");
+
+                if (!arg_fido2_device) {
+                        r = fido2_find_device_auto(&arg_fido2_device);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (auto_pcrlock) { /* no --tpm2-pcrlock= given: search for a policy file */
+                assert(!arg_tpm2_pcrlock);
+
+                r = tpm2_pcrlock_search_file(NULL, NULL, &arg_tpm2_pcrlock);
+                if (r < 0) {
+                        if (r != -ENOENT)
+                                log_warning_errno(r, "Search for pcrlock.json failed, assuming it does not exist: %m");
+                } else
+                        log_info("Automatically using pcrlock policy '%s'.", arg_tpm2_pcrlock);
+        }
+
+        if (auto_public_key_pcr_mask) { /* no --tpm2-public-key-pcrs= given: use the default PCR */
+                assert(arg_tpm2_public_key_pcr_mask == 0);
+                arg_tpm2_public_key_pcr_mask = INDEX_TO_MASK(uint32_t, TPM2_PCR_KERNEL_BOOT);
+        }
+
+        if (auto_hash_pcr_values && !arg_tpm2_pcrlock) { /* Only lock to PCR 7 by default if no pcrlock policy is around (which is a better replacement) */
+                assert(arg_tpm2_n_hash_pcr_values == 0);
+
+                if (!GREEDY_REALLOC_APPEND(
+                                    arg_tpm2_hash_pcr_values,
+                                    arg_tpm2_n_hash_pcr_values,
+                                    &TPM2_PCR_VALUE_MAKE(TPM2_PCR_INDEX_DEFAULT, /* hash= */ 0, /* value= */ {}),
+                                    1))
+                        return log_oom();
+        }
+
+        return 1; /* > 0: run() continues; it returns early on <= 0 */
+}
+
+static int check_for_homed(struct crypt_device *cd) {
+        int q;
+
+        assert_se(cd);
+
+        /* Volumes that carry a "systemd-homed" token are refused: enrolling here would let the tokens in
+         * the LUKS2 header drift away from those in the user record, so the user is pointed to homectl. */
+
+        for (int idx = 0; idx < crypt_token_max(CRYPT_LUKS2); idx++) {
+                q = cryptsetup_get_token_as_json(cd, idx, "systemd-homed", NULL);
+
+                if (q >= 0) /* non-negative result: a homed token was read from this index */
+                        return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+                                               "LUKS2 volume is managed by systemd-homed, please use homectl to enroll tokens.");
+
+                if (!IN_SET(q, -ENOENT, -EINVAL, -EMEDIUMTYPE)) /* those three only mean "try the next index" */
+                        return log_error_errno(q, "Failed to read JSON token data off disk: %m");
+        }
+
+        return 0;
+}
+
+static int load_volume_key_keyfile(
+                struct crypt_device *cd,
+                void *ret_vk,
+                size_t *ret_vks) {
+
+        _cleanup_(erase_and_freep) char *key = NULL;
+        size_t key_size;
+        int q;
+
+        assert_se(cd);
+        assert_se(ret_vk);
+        assert_se(ret_vks);
+
+        /* Unlock via key file: the full contents of arg_unlock_keyfile are used as the passphrase. The
+         * buffer is bound to the erase_and_freep cleanup handler, so it is released on every return
+         * path below. */
+
+        q = read_full_file_full(
+                        AT_FDCWD, arg_unlock_keyfile,
+                        UINT64_MAX, SIZE_MAX,
+                        READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET,
+                        NULL,
+                        &key, &key_size);
+        if (q < 0)
+                return log_error_errno(q, "Reading keyfile %s failed: %m", arg_unlock_keyfile);
+
+        /* Any key slot may match; the volume key is written through ret_vk/ret_vks. */
+
+        q = crypt_volume_key_get(
+                        cd, CRYPT_ANY_SLOT,
+                        ret_vk, ret_vks,
+                        key, key_size);
+        if (q < 0)
+                return log_error_errno(q, "Unlocking via keyfile failed: %m");
+
+        return q; /* non-negative result of crypt_volume_key_get() goes straight to the caller */
+}
+
+static int prepare_luks(
+                struct crypt_device **ret_cd,
+                void **ret_volume_key,
+                size_t *ret_volume_key_size) {
+
+        _cleanup_(crypt_freep) struct crypt_device *dev = NULL;
+        _cleanup_(erase_and_freep) void *key = NULL;
+        size_t key_size;
+        int q;
+
+        assert(ret_cd);
+        assert(!ret_volume_key == !ret_volume_key_size);
+
+        /* Opens arg_node as a LUKS2 device and hands the context back in *ret_cd. When the caller also
+         * passes the two key pointers (both or neither, see the assert above), the volume is unlocked
+         * with the method selected by arg_unlock_type and the volume key plus its size are returned as
+         * well.
+         * Returns 0 on success, a negative error otherwise. */
+
+        q = crypt_init(&dev, arg_node);
+        if (q < 0)
+                return log_error_errno(q, "Failed to allocate libcryptsetup context: %m");
+
+        cryptsetup_enable_logging(dev);
+
+        q = crypt_load(dev, CRYPT_LUKS2, NULL);
+        if (q < 0)
+                return log_error_errno(q, "Failed to load LUKS2 superblock: %m");
+
+        /* Volumes managed by systemd-homed are refused. */
+        q = check_for_homed(dev);
+        if (q < 0)
+                return q;
+
+        if (ret_volume_key) {
+
+                /* Allocate a buffer of the volume key size reported by libcryptsetup. */
+                q = crypt_get_volume_key_size(dev);
+                if (q <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine LUKS volume key size");
+
+                key_size = (size_t) q;
+                key = malloc(key_size);
+                if (!key)
+                        return log_oom();
+
+                /* Fetch the volume key through whichever unlock method was requested. */
+                if (arg_unlock_type == UNLOCK_KEYFILE)
+                        q = load_volume_key_keyfile(dev, key, &key_size);
+                else if (arg_unlock_type == UNLOCK_FIDO2)
+                        q = load_volume_key_fido2(dev, arg_node, arg_unlock_fido2_device, key, &key_size);
+                else if (arg_unlock_type == UNLOCK_PASSWORD)
+                        q = load_volume_key_password(dev, arg_node, key, &key_size);
+                else
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown LUKS unlock method");
+
+                if (q < 0)
+                        return q;
+
+                /* Success: pass ownership of the key buffer to the caller. */
+                *ret_volume_key = TAKE_PTR(key);
+                *ret_volume_key_size = key_size;
+        }
+
+        *ret_cd = TAKE_PTR(dev);
+
+        return 0;
+}
+
+static int run(int argc, char *argv[]) { /* Program logic: parse arguments, open the volume, enroll and/or wipe slots */
+        _cleanup_(crypt_freep) struct crypt_device *cd = NULL;
+        _cleanup_(erase_and_freep) void *vk = NULL;
+        size_t vks;
+        int slot, r;
+
+        log_show_color(true);
+        log_parse_environment();
+        log_open();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0) /* error, or already done (e.g. help() returned 0) */
+                return r;
+
+        cryptsetup_enable_logging(NULL);
+
+        if (arg_enroll_type < 0)
+                r = prepare_luks(&cd, NULL, NULL); /* No need to unlock device if we don't need the volume key because we don't need to enroll anything */
+        else
+                r = prepare_luks(&cd, &vk, &vks);
+        if (r < 0)
+                return r;
+
+        switch (arg_enroll_type) {
+
+        case ENROLL_PASSWORD:
+                slot = enroll_password(cd, vk, vks);
+                break;
+
+        case ENROLL_RECOVERY:
+                slot = enroll_recovery(cd, vk, vks);
+                break;
+
+        case ENROLL_PKCS11:
+                slot = enroll_pkcs11(cd, vk, vks, arg_pkcs11_token_uri);
+                break;
+
+        case ENROLL_FIDO2:
+                slot = enroll_fido2(cd, vk, vks, arg_fido2_device, arg_fido2_lock_with, arg_fido2_cred_alg);
+                break;
+
+        case ENROLL_TPM2:
+                slot = enroll_tpm2(cd, vk, vks, arg_tpm2_device, arg_tpm2_seal_key_handle, arg_tpm2_device_key, arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values, arg_tpm2_public_key, arg_tpm2_public_key_pcr_mask, arg_tpm2_signature, arg_tpm2_pin, arg_tpm2_pcrlock);
+                break;
+
+        case _ENROLL_TYPE_INVALID:
+                /* List enrolled slots if we are called without anything to enroll or wipe */
+                if (!wipe_requested())
+                        return list_enrolled(cd);
+
+                /* Only slot wiping selected; no slot was enrolled in this run, hence the -1 */
+                return wipe_slots(cd, arg_wipe_slots, arg_n_wipe_slots, arg_wipe_slots_scope, arg_wipe_slots_mask, -1);
+
+        default:
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Operation not implemented yet.");
+        }
+        if (slot < 0) /* enrollment failed: return without wiping anything */
+                return slot;
+
+        /* After we completed enrolling, remove user selected slots */
+        r = wipe_slots(cd, arg_wipe_slots, arg_n_wipe_slots, arg_wipe_slots_scope, arg_wipe_slots_mask, slot);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/cryptenroll/cryptenroll.h b/src/cryptenroll/cryptenroll.h
new file mode 100644
index 0000000..335d9cc
--- /dev/null
+++ b/src/cryptenroll/cryptenroll.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+typedef enum EnrollType { /* the ENROLL_* values double as bit positions in the --wipe-slot= type mask */
+        ENROLL_PASSWORD,
+        ENROLL_RECOVERY,
+        ENROLL_PKCS11,
+        ENROLL_FIDO2,
+        ENROLL_TPM2,
+        _ENROLL_TYPE_MAX,
+        _ENROLL_TYPE_INVALID = -EINVAL, /* run() lists or wipes slots instead of enrolling for this value */
+} EnrollType;
+
+typedef enum UnlockType { /* how the existing volume key is obtained before enrolling */
+        UNLOCK_PASSWORD, /* parse_argv() also treats this as "no --unlock-* option seen yet" */
+        UNLOCK_KEYFILE,
+        UNLOCK_FIDO2,
+        _UNLOCK_TYPE_MAX,
+        _UNLOCK_TYPE_INVALID = -EINVAL,
+} UnlockType;
+
+typedef enum WipeScope { /* set while parsing --wipe-slot= */
+        WIPE_EXPLICIT,          /* only wipe the listed slots */
+        WIPE_ALL,               /* wipe all slots; a later "empty" keyword does not override it */
+        WIPE_EMPTY_PASSPHRASE,  /* wipe slots with empty passphrases plus listed slots */
+        _WIPE_SCOPE_MAX,
+        _WIPE_SCOPE_INVALID = -EINVAL,
+} WipeScope;
+
+const char* enroll_type_to_string(EnrollType t);
+EnrollType enroll_type_from_string(const char *s);
+
+const char* luks2_token_type_to_string(EnrollType t);
+EnrollType luks2_token_type_from_string(const char *s);
diff --git a/src/cryptenroll/meson.build b/src/cryptenroll/meson.build
new file mode 100644
index 0000000..5374d65
--- /dev/null
+++ b/src/cryptenroll/meson.build
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+systemd_cryptenroll_sources = files( # sources that are always part of systemd-cryptenroll
+        'cryptenroll-list.c',
+        'cryptenroll-password.c',
+        'cryptenroll-recovery.c',
+        'cryptenroll-wipe.c',
+        'cryptenroll.c',
+)
+
+if conf.get('HAVE_P11KIT') == 1 and conf.get('HAVE_OPENSSL') == 1 # PKCS#11 source needs both p11-kit and OpenSSL
+        systemd_cryptenroll_sources += files('cryptenroll-pkcs11.c')
+endif
+
+if conf.get('HAVE_LIBFIDO2') == 1 # FIDO2 source only with libfido2
+        systemd_cryptenroll_sources += files('cryptenroll-fido2.c')
+endif
+
+if conf.get('HAVE_TPM2') == 1 # TPM2 source only with TPM2 support
+        systemd_cryptenroll_sources += files('cryptenroll-tpm2.c')
+endif
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-cryptenroll',
+                'public' : true,
+                'conditions' : ['HAVE_LIBCRYPTSETUP'],
+                'sources' : systemd_cryptenroll_sources,
+                'dependencies' : [
+                        libcryptsetup,
+                        libdl,
+                        libopenssl,
+                        libp11kit_cflags,
+                ],
+        },
+]
diff --git a/src/cryptsetup/cryptsetup-generator.c b/src/cryptsetup/cryptsetup-generator.c
new file mode 100644
index 0000000..904e4cd
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-generator.c
@@ -0,0 +1,940 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "alloc-util.h"
+#include "dropin.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fstab-util.h"
+#include "generator.h"
+#include "hashmap.h"
+#include "id128-util.h"
+#include "log.h"
+#include "mkdir.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+
+typedef struct crypto_device { /* NOTE(review): looks like one entry per encrypted volume; field meanings inferred from names — confirm */
+        char *uuid;
+        char *keyfile;
+        char *keydev;
+        char *headerdev;
+        char *datadev;
+        char *name;
+        char *options;
+        bool create;
+} crypto_device;
+
+static const char *arg_dest = NULL; /* passed as destination to generator_open_unit_file() and write_drop_in_format() */
+static bool arg_enabled = true;
+static bool arg_read_crypttab = true;
+static const char *arg_crypttab = NULL;
+static const char *arg_runtime_directory = NULL; /* parent directory of the key/header device mount points */
+static bool arg_allow_list = false;
+static Hashmap *arg_disks = NULL;
+static char *arg_default_options = NULL;
+static char *arg_default_keyfile = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_disks, hashmap_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_default_options, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_default_keyfile, freep);
+
+static int split_locationspec(const char *locationspec, char **ret_file, char **ret_device) {
+        _cleanup_free_ char *file = NULL, *device = NULL;
+        const char *colon;
+
+        assert(ret_file);
+        assert(ret_device);
+
+        /* Splits "FILE[:DEVICE]" into two newly allocated strings. *ret_device is set to NULL if no
+         * device part is recognized; a NULL locationspec yields NULL for both outputs. */
+
+        if (!locationspec) {
+                *ret_file = *ret_device = NULL;
+                return 0;
+        }
+
+        colon = strrchr(locationspec, ':');
+        if (colon) {
+                /* What follows the last colon must be an absolute device node path (/dev/something,
+                 * /dev/foo/something, possibly even /dev/foo/something:part) or an fstab-style
+                 * specification such as LABEL=...; the file part uses the same syntax.
+                 *
+                 * Since a colon may just as well belong to a file name, we have to guess:
+                 * fstab_node_to_udev_node() turns the fstab syntax into an absolute path, so if the
+                 * result is not absolute the colon is taken to be part of the file argument. */
+
+                device = fstab_node_to_udev_node(colon + 1);
+                if (!device)
+                        return log_oom();
+
+                if (!path_is_absolute(device)) {
+                        log_debug("Location specification argument contains a colon, but \"%s\" doesn't look like a device specification.\n"
+                                  "Assuming that \"%s\" is a single device specification.",
+                                  colon + 1, locationspec);
+                        device = mfree(device);
+                }
+        }
+
+        /* device is non-NULL only if the text after the last colon was accepted as a device. */
+        if (device)
+                file = strndup(locationspec, colon - locationspec);
+        else
+                file = strdup(locationspec);
+        if (!file)
+                return log_oom();
+
+        *ret_file = TAKE_PTR(file);
+        *ret_device = TAKE_PTR(device);
+
+        return 0;
+}
+
+static int generate_device_mount( /* Writes a .mount unit for a key or header device and creates its mount point */
+                const char *name,
+                const char *device,
+                const char *type_prefix, /* "keydev" or "headerdev" */
+                const char *device_timeout,
+                bool canfail,
+                bool readonly,
+                char **unit, /* out: name of the generated mount unit */
+                char **mount) { /* out: mount point path */
+
+        _cleanup_free_ char *u = NULL, *where = NULL, *name_escaped = NULL, *device_unit = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+        usec_t timeout_us;
+
+        assert(name);
+        assert(device);
+        assert(unit);
+        assert(mount);
+
+        r = mkdir_parents(arg_runtime_directory, 0755);
+        if (r < 0)
+                return r;
+
+        r = mkdir(arg_runtime_directory, 0700);
+        if (r < 0 && errno != EEXIST) /* an already existing directory is fine */
+                return -errno;
+
+        name_escaped = cescape(name);
+        if (!name_escaped)
+                return -ENOMEM;
+
+        where = strjoin(arg_runtime_directory, "/", type_prefix, "-", name_escaped);
+        if (!where)
+                return -ENOMEM;
+
+        r = mkdir(where, 0700);
+        if (r < 0 && errno != EEXIST)
+                return -errno;
+
+        r = unit_name_from_path(where, ".mount", &u);
+        if (r < 0)
+                return r;
+
+        r = generator_open_unit_file(arg_dest, NULL, u, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "DefaultDependencies=no\n\n"
+                "[Mount]\n"
+                "What=%s\n"
+                "Where=%s\n"
+                "Options=%s%s\n", device, where, readonly ? "ro" : "rw", canfail ? ",nofail" : "");
+
+        if (device_timeout) { /* optional: drop-in setting JobRunningTimeoutSec= on the device unit */
+                r = parse_sec_fix_0(device_timeout, &timeout_us);
+                if (r >= 0) {
+                        r = unit_name_from_path(device, ".device", &device_unit);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to generate unit name: %m");
+
+                        r = write_drop_in_format(arg_dest, device_unit, 90, "device-timeout",
+                                "# Automatically generated by systemd-cryptsetup-generator \n\n"
+                                "[Unit]\nJobRunningTimeoutSec=%s", device_timeout);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to write device drop-in: %m");
+
+                } else /* an unparsable timeout is only warned about */
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", device_timeout);
+
+        }
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return r;
+
+        *unit = TAKE_PTR(u);
+        *mount = TAKE_PTR(where);
+
+        return 0;
+}
+
+/* Writes the "<type_prefix>-<name>-umount.service" unit that runs umount on device_mount; returns its name. */
+static int generate_device_umount(const char *name,
+                                  const char *device_mount,
+                                  const char *type_prefix, /* "keydev" or "headerdev" */
+                                  char **ret_umount_unit) {
+        _cleanup_free_ char *escaped = NULL, *service = NULL, *mount_unit = NULL;
+        _cleanup_fclose_ FILE *out = NULL;
+        int r;
+
+        assert(name);
+        assert(ret_umount_unit);
+
+        escaped = cescape(name);
+        if (!escaped)
+                return -ENOMEM;
+
+        service = strjoin(type_prefix, "-", escaped, "-umount.service");
+        if (!service)
+                return -ENOMEM;
+
+        r = unit_name_from_path(device_mount, ".mount", &mount_unit);
+        if (r < 0)
+                return r;
+
+        r = generator_open_unit_file(arg_dest, NULL, service, &out);
+        if (r < 0)
+                return r;
+
+        fputs("[Unit]\n"
+              "DefaultDependencies=no\n", out);
+        fprintf(out, "After=%s\n\n", mount_unit); /* ordered after the .mount unit of the mount point */
+        fprintf(out, "[Service]\n"
+                     "ExecStart=-" UMOUNT_PATH " %s\n\n", device_mount);
+
+        r = fflush_and_check(out);
+        if (r < 0)
+                return r;
+
+        *ret_umount_unit = TAKE_PTR(service);
+        return 0;
+}
+
+/* Writes to f the dependencies needed before device_path (a key file or header location) can be accessed. */
+static int print_dependencies(FILE *f, const char* device_path, const char* timeout_value, bool canfail) {
+        _cleanup_free_ char *node = NULL, *device_unit = NULL;
+        int r;
+
+        assert(!canfail || timeout_value);
+
+        /* "-" and "none" stand for "nothing configured", so there is nothing to depend on */
+        if (STR_IN_SET(device_path, "-", "none"))
+                return 0;
+
+        /* Random number generator devices only get ordered after the random seed service */
+        if (PATH_IN_SET(device_path, "/dev/urandom", "/dev/random", "/dev/hw_random", "/dev/hwrng")) {
+                fputs("After=systemd-random-seed.service\n", f);
+                return 0;
+        }
+
+        node = fstab_node_to_udev_node(device_path);
+        if (!node)
+                return log_oom();
+
+        if (path_equal(node, "/dev/null"))
+                return 0;
+
+        if (!path_startswith(node, "/dev/")) {
+                /* Not below /dev/: handle it as a regular file and depend on the mounts it needs */
+                _cleanup_free_ char *escaped_path = specifier_escape(device_path);
+                if (!escaped_path)
+                        return log_oom();
+
+                fprintf(f, "RequiresMountsFor=%s\n", escaped_path);
+                return 0;
+        }
+
+        /* Device node: depend on the corresponding .device unit */
+        r = unit_name_from_path(node, ".device", &device_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        fprintf(f, "After=%1$s\n", device_unit);
+        if (!canfail) {
+                fprintf(f, "Requires=%1$s\n", device_unit);
+                return 0;
+        }
+
+        /* The device may be absent: only Wants= it, and write a JobRunningTimeoutSec= drop-in for it */
+        fprintf(f, "Wants=%1$s\n", device_unit);
+        r = write_drop_in_format(arg_dest, device_unit, 90, "device-timeout",
+                "# Automatically generated by systemd-cryptsetup-generator \n\n"
+                "[Unit]\nJobRunningTimeoutSec=%s", timeout_value);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write device drop-in: %m");
+
+        return 0;
+}
+
+static bool attach_in_initrd(const char *name, const char *options) {
+        assert(name);
+
+        /* An explicit x-initrd.attach option always counts. Beyond that the option is implied for the volume
+         * names "root" and "usr": those are the names the Discoverable Partition Specification uses for
+         * the partitions that have to be mounted during the initrd → host transition (the root fs and
+         * /usr/). systemd-fstab-generator behaves the same way. */
+        if (fstab_test_option(options, "x-initrd.attach\0"))
+                return true;
+        return STR_IN_SET(name, "root", "usr");
+}
+
+static int create_disk(
+                const char *name, /* volume name, asserted non-NULL */
+                const char *device, /* backing device spec, asserted non-NULL */
+                const char *key_file,
+                const char *keydev, /* device holding key_file, or NULL */
+                const char *headerdev, /* device holding the detached header, or NULL */
+                const char *options,
+                const char *source) { /* callers pass the crypttab path or "/proc/cmdline" */
+        /* Generates the systemd-cryptsetup service unit for one volume, plus helper units, symlinks and drop-ins */
+        _cleanup_free_ char *n = NULL, *d = NULL, *u = NULL, *e = NULL,
+                *keydev_mount = NULL, *keyfile_timeout_value = NULL,
+                *filtered = NULL, *u_escaped = NULL, *name_escaped = NULL, *header_path = NULL, *key_file_buffer = NULL,
+                *tmp_fstype = NULL, *filtered_header = NULL, *headerdev_mount = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        const char *dmname;
+        bool noauto, nofail, swap, netdev;
+        int r, detached_header, keyfile_can_timeout, tmp;
+
+        assert(name);
+        assert(device);
+        /* Flags taken from the option string */
+        noauto = fstab_test_yes_no_option(options, "noauto\0" "auto\0");
+        nofail = fstab_test_yes_no_option(options, "nofail\0" "fail\0");
+        swap = fstab_test_option(options, "swap\0");
+        netdev = fstab_test_option(options, "_netdev\0");
+        /* keyfile-timeout=: the result is tested with "> 0" below, the value lands in keyfile_timeout_value */
+        keyfile_can_timeout = fstab_filter_options(options,
+                                                   "keyfile-timeout\0",
+                                                   NULL, &keyfile_timeout_value, NULL, NULL);
+        if (keyfile_can_timeout < 0)
+                return log_error_errno(keyfile_can_timeout, "Failed to parse keyfile-timeout= option value: %m");
+        /* header=: the remaining options are only requested when a header device rewrites header= further down */
+        detached_header = fstab_filter_options(
+                options,
+                "header\0",
+                NULL,
+                &header_path,
+                NULL,
+                headerdev ? &filtered_header : NULL);
+        if (detached_header < 0)
+                return log_error_errno(detached_header, "Failed to parse header= option value: %m");
+
+        tmp = fstab_filter_options(options, "tmp\0", NULL, &tmp_fstype, NULL, NULL);
+        if (tmp < 0)
+                return log_error_errno(tmp, "Failed to parse tmp= option value: %m");
+
+        if (tmp && swap)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Device '%s' cannot be both 'tmp' and 'swap'. Ignoring.",
+                                       name);
+        /* Specifier-escaped copy of the name, used in the ExecStartPost= lines below */
+        name_escaped = specifier_escape(name);
+        if (!name_escaped)
+                return log_oom();
+
+        e = unit_name_escape(name);
+        if (!e)
+                return log_oom();
+
+        u = fstab_node_to_udev_node(device);
+        if (!u)
+                return log_oom();
+        /* n is the name of the service unit written by this function */
+        r = unit_name_build("systemd-cryptsetup", e, ".service", &n);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        u_escaped = specifier_escape(u);
+        if (!u_escaped)
+                return log_oom();
+        /* d is the .device unit name derived from the backing device path */
+        r = unit_name_from_path(u, ".device", &d);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        if (keydev && !key_file)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Key device is specified, but path to the key file is missing.");
+
+        r = generator_open_unit_file(arg_dest, NULL, n, &f);
+        if (r < 0)
+                return r;
+
+        r = generator_write_cryptsetup_unit_section(f, source);
+        if (r < 0)
+                return r;
+
+        if (netdev)
+                fprintf(f, "After=remote-fs-pre.target\n");
+
+        /* If initrd takes care of attaching the disk then it should also detach it during shutdown. */
+        if (!attach_in_initrd(name, options))
+                fprintf(f,
+                        "Conflicts=umount.target\n"
+                        "Before=umount.target\n");
+        /* Key file on a separate device: generate a mount unit for it and a service that unmounts it again */
+        if (keydev) {
+                _cleanup_free_ char *unit = NULL, *umount_unit = NULL;
+
+                r = generate_device_mount(
+                        name,
+                        keydev,
+                        "keydev",
+                        keyfile_timeout_value,
+                        /* canfail = */ keyfile_can_timeout > 0,
+                        /* readonly= */ true,
+                        &unit,
+                        &keydev_mount);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate keydev mount unit: %m");
+
+                r = generate_device_umount(name, keydev_mount, "keydev", &umount_unit);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate keydev umount unit: %m");
+
+                key_file_buffer = path_join(keydev_mount, key_file);
+                if (!key_file_buffer)
+                        return log_oom();
+                /* From here on key_file is the path below the mount point */
+                key_file = key_file_buffer;
+
+                fprintf(f, "After=%s\n", unit);
+                if (keyfile_can_timeout > 0)
+                        fprintf(f, "Wants=%s\n", unit);
+                else
+                        fprintf(f, "Requires=%s\n", unit);
+                /* This unit is ordered before the umount service, which it pulls in via Wants= */
+                if (umount_unit)
+                        fprintf(f,
+                                "Wants=%s\n"
+                                "Before=%s\n",
+                                umount_unit,
+                                umount_unit
+                        );
+        }
+        /* Same scheme for a detached header that lives on a separate device */
+        if (headerdev) {
+                _cleanup_free_ char *unit = NULL, *umount_unit = NULL, *p = NULL;
+
+                r = generate_device_mount(
+                        name,
+                        headerdev,
+                        "headerdev",
+                        NULL,
+                        /* canfail=  */ false, /* header is always necessary */
+                        /* readonly= */ false, /* LUKS2 recovery requires rw header access */
+                        &unit,
+                        &headerdev_mount);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate header device mount unit: %m");
+
+                r = generate_device_umount(name, headerdev_mount, "headerdev", &umount_unit);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate header device umount unit: %m");
+
+                p = path_join(headerdev_mount, header_path);
+                if (!p)
+                        return log_oom();
+                /* header_path now lies below the mount point; header= is rebuilt from it and appended again */
+                free_and_replace(header_path, p);
+
+                if (isempty(filtered_header))
+                        p = strjoin("header=", header_path);
+                else
+                        p = strjoin(filtered_header, ",header=", header_path);
+
+                if (!p)
+                        return log_oom();
+
+                free_and_replace(filtered_header, p);
+                options = filtered_header;
+
+                fprintf(f, "After=%s\n"
+                           "Requires=%s\n", unit, unit);
+
+                if (umount_unit)
+                        fprintf(f,
+                                "Wants=%s\n"
+                                "Before=%s\n",
+                                umount_unit,
+                                umount_unit
+                        );
+        }
+        /* Unless nofail is set, order the unit before the (remote) cryptsetup target */
+        if (!nofail)
+                fprintf(f,
+                        "Before=%s\n",
+                        netdev ? "remote-cryptsetup.target" : "cryptsetup.target");
+        /* Key file without a dedicated key device: derive the dependencies from its path */
+        if (key_file && !keydev) {
+                r = print_dependencies(f, key_file,
+                        keyfile_timeout_value,
+                        /* canfail= */ keyfile_can_timeout > 0);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Check if a header option was specified */
+        if (detached_header > 0 && !headerdev) {
+                r = print_dependencies(f, header_path,
+                        NULL,
+                        /* canfail= */ false); /* header is always necessary */
+                if (r < 0)
+                        return r;
+        }
+        /* Bind to the backing .device unit when the device is a node below /dev/ */
+        if (path_startswith(u, "/dev/"))
+                fprintf(f,
+                        "BindsTo=%s\n"
+                        "After=%s\n",
+                        d, d);
+        else
+                /* For loopback devices make sure to explicitly load loop.ko, as this code might run very
+                 * early where device nodes created via systemd-tmpfiles-setup-dev.service might not be
+                 * around yet. Hence let's sync on the module itself. */
+                fprintf(f,
+                        "RequiresMountsFor=%s\n"
+                        "Wants=modprobe@loop.service\n"
+                        "After=modprobe@loop.service\n",
+                        u_escaped);
+        /* A failure here is only logged, generation continues */
+        r = generator_write_timeouts(arg_dest, device, name, options, &filtered);
+        if (r < 0)
+                log_warning_errno(r, "Failed to write device timeout drop-in: %m");
+
+        r = generator_write_cryptsetup_service_section(f, name, u, key_file, filtered);
+        if (r < 0)
+                return r;
+        /* tmp: run systemd-makefs on the mapped device afterwards, with ext4 unless a type was given */
+        if (tmp) {
+                _cleanup_free_ char *tmp_fstype_escaped = NULL;
+
+                if (tmp_fstype) {
+                        tmp_fstype_escaped = specifier_escape(tmp_fstype);
+                        if (!tmp_fstype_escaped)
+                                return log_oom();
+                }
+
+                fprintf(f,
+                        "ExecStartPost=" LIBEXECDIR "/systemd-makefs '%s' '/dev/mapper/%s'\n",
+                        tmp_fstype_escaped ?: "ext4", name_escaped);
+        }
+        /* swap: likewise, but always formatted as swap */
+        if (swap)
+                fprintf(f,
+                        "ExecStartPost=" LIBEXECDIR "/systemd-makefs swap '/dev/mapper/%s'\n",
+                        name_escaped);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit file %s: %m", n);
+        /* Unless noauto is set, link the unit into the target: "wants" with nofail, "requires" otherwise */
+        if (!noauto) {
+                r = generator_add_symlink(arg_dest,
+                                          netdev ? "remote-cryptsetup.target" : "cryptsetup.target",
+                                          nofail ? "wants" : "requires", n);
+                if (r < 0)
+                        return r;
+        }
+        /* The dev-mapper-<name>.device unit always gets a "requires" link to the service */
+        dmname = strjoina("dev-mapper-", e, ".device");
+        r = generator_add_symlink(arg_dest, dmname, "requires", n);
+        if (r < 0)
+                return r;
+        /* Volumes that are neither noauto nor nofail get JobTimeoutSec=infinity on the mapped device */
+        if (!noauto && !nofail) {
+                r = write_drop_in(arg_dest, dmname, 40, "device-timeout",
+                                  "# Automatically generated by systemd-cryptsetup-generator\n\n"
+                                  "[Unit]\n"
+                                  "JobTimeoutSec=infinity\n");
+                if (r < 0)
+                        log_warning_errno(r, "Failed to write device timeout drop-in: %m");
+        }
+
+        return 0;
+}
+
+/* Value destructor for arg_disks entries: frees every string member and then the entry itself. headerdev and
+ * datadev are heap strings as well (assigned while parsing luks.options= and luks.data=), so they are
+ * released together with the rest. A NULL argument is accepted and yields NULL. */
+static crypto_device* crypt_device_free(crypto_device *d) {
+        if (!d)
+                return NULL;
+
+        free(d->uuid);
+        free(d->keyfile);
+        free(d->keydev);
+        free(d->headerdev);
+        free(d->datadev);
+        free(d->name);
+        free(d->options);
+        return mfree(d);
+}
+
+/* Returns the arg_disks entry for uuid, creating and registering a zeroed one if needed; NULL on failure. */
+static crypto_device *get_crypto_device(const char *uuid) {
+        crypto_device *entry;
+
+        assert(uuid);
+
+        entry = hashmap_get(arg_disks, uuid);
+        if (entry)
+                return entry;
+
+        entry = new0(struct crypto_device, 1);
+        if (!entry)
+                return NULL;
+
+        entry->uuid = strdup(uuid);
+        if (!entry->uuid)
+                return mfree(entry);
+
+        if (hashmap_put(arg_disks, entry->uuid, entry) < 0) { /* the entry's own uuid copy is the map key */
+                free(entry->uuid);
+                return mfree(entry);
+        }
+
+        return entry;
+}
+
+/* Returns true, after logging a warning that names key, when id128_is_valid() rejects uuid. */
+static bool warn_uuid_invalid(const char *uuid, const char *key) {
+        assert(key);
+
+        if (id128_is_valid(uuid))
+                return false;
+
+        log_warning("Failed to parse %s= kernel command line switch. UUID is invalid, ignoring.", key);
+        return true;
+}
+
+/* Takes the header= value out of options and runs it through split_locationspec(): the device part goes to
+ * *ret_headerdev, and *ret_filtered_headerdev_options gets the options with header= set to the file part. */
+static int filter_header_device(const char *options,
+                                char **ret_headerdev,
+                                char **ret_filtered_headerdev_options) {
+        _cleanup_free_ char *spec = NULL, *rest = NULL, *file = NULL, *dev = NULL, *rebuilt = NULL;
+        int r;
+
+        assert(ret_headerdev);
+        assert(ret_filtered_headerdev_options);
+
+        r = fstab_filter_options(options, "header\0", NULL, &spec, NULL, &rest);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse header= option value: %m");
+        if (r == 0) {
+                /* No header= present: hand back the filtered options as they are, and no header device */
+                *ret_filtered_headerdev_options = TAKE_PTR(rest);
+                *ret_headerdev = NULL;
+                return 0;
+        }
+
+        r = split_locationspec(spec, &file, &dev);
+        if (r < 0)
+                return r;
+
+        /* Append header= again, now with the file part only, after whatever other options remain */
+        rebuilt = isempty(rest) ? strjoin("header=", file) : strjoin(rest, ",header=", file);
+        if (!rebuilt)
+                return log_oom();
+
+        *ret_filtered_headerdev_options = TAKE_PTR(rebuilt);
+        *ret_headerdev = TAKE_PTR(dev);
+        return 0;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        _cleanup_free_ char *uuid = NULL, *uuid_value = NULL;
+        crypto_device *d;
+        int r;
+        /* Applies one kernel command line switch to the arg_* settings and the arg_disks map; other keys are ignored */
+        if (streq(key, "luks")) {
+                /* Global on/off switch; a switch without value counts as true */
+                r = value ? parse_boolean(value) : 1;
+                if (r < 0)
+                        log_warning("Failed to parse luks= kernel command line switch %s. Ignoring.", value);
+                else
+                        arg_enabled = r;
+
+        } else if (streq(key, "luks.crypttab")) {
+                /* Controls whether the crypttab file is read; a switch without value counts as true */
+                r = value ? parse_boolean(value) : 1;
+                if (r < 0)
+                        log_warning("Failed to parse luks.crypttab= kernel command line switch %s. Ignoring.", value);
+                else
+                        arg_read_crypttab = r;
+
+        } else if (streq(key, "luks.uuid")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+                /* A leading "luks-" is dropped before the value is used as map key */
+                d = get_crypto_device(startswith(value, "luks-") ?: value);
+                if (!d)
+                        return log_oom();
+                /* Marks the device for creation and switches on allow-list mode */
+                d->create = arg_allow_list = true;
+
+        } else if (streq(key, "luks.options")) {
+                _cleanup_free_ char *headerdev = NULL, *filtered_headerdev_options = NULL;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+                /* "<uuid>=<options>" applies to one device; anything else replaces the default options */
+                r = sscanf(value, "%m[0-9a-fA-F-]=%ms", &uuid, &uuid_value);
+                if (r != 2)
+                        return free_and_strdup_warn(&arg_default_options, value);
+
+                if (warn_uuid_invalid(uuid, key))
+                        return 0;
+
+                d = get_crypto_device(uuid);
+                if (!d)
+                        return log_oom();
+
+                r = filter_header_device(uuid_value, &headerdev, &filtered_headerdev_options);
+                if (r < 0)
+                        return r;
+                /* Options and header device are stored separately; earlier values are freed */
+                free_and_replace(d->options, filtered_headerdev_options);
+                free_and_replace(d->headerdev, headerdev);
+        } else if (streq(key, "luks.key")) {
+                size_t n;
+                _cleanup_free_ char *keyfile = NULL, *keydev = NULL;
+                const char *keyspec;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+                /* "<uuid>=<keyspec>" applies to one device; anything else replaces the default key file */
+                n = strspn(value, ALPHANUMERICAL "-");
+                if (value[n] != '=')
+                        return free_and_strdup_warn(&arg_default_keyfile, value);
+
+                uuid = strndup(value, n);
+                if (!uuid)
+                        return log_oom();
+
+                if (warn_uuid_invalid(uuid, key))
+                        return 0;
+
+                d = get_crypto_device(uuid);
+                if (!d)
+                        return log_oom();
+                /* Everything after the '=' is the key spec, which is split into key file and key device */
+                keyspec = value + n + 1;
+                r = split_locationspec(keyspec, &keyfile, &keydev);
+                if (r < 0)
+                        return r;
+
+                free_and_replace(d->keyfile, keyfile);
+                free_and_replace(d->keydev, keydev);
+        } else if (streq(key, "luks.data")) {
+                size_t n;
+                _cleanup_free_ char *datadev = NULL;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+                /* Only the "<uuid>=<device>" form is accepted for this switch */
+                n = strspn(value, ALPHANUMERICAL "-");
+                if (value[n] != '=') {
+                        log_warning("Failed to parse luks.data= kernel command line switch. UUID is invalid, ignoring.");
+                        return 0;
+                }
+
+                uuid = strndup(value, n);
+                if (!uuid)
+                        return log_oom();
+
+                if (warn_uuid_invalid(uuid, key))
+                        return 0;
+
+                d = get_crypto_device(uuid);
+                if (!d)
+                        return log_oom();
+
+                datadev = fstab_node_to_udev_node(value + n + 1);
+                if (!datadev)
+                        return log_oom();
+
+                free_and_replace(d->datadev, datadev);
+        } else if (streq(key, "luks.name")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+                /* "<uuid>=<name>" names the volume, marks it for creation and switches on allow-list mode */
+                r = sscanf(value, "%m[0-9a-fA-F-]=%ms", &uuid, &uuid_value);
+                if (r == 2) {
+                        d = get_crypto_device(uuid);
+                        if (!d)
+                                return log_oom();
+
+                        d->create = arg_allow_list = true;
+
+                        free_and_replace(d->name, uuid_value);
+                } else
+                        log_warning("Failed to parse luks name switch %s. Ignoring.", value);
+        }
+
+        return 0;
+}
+
+/* Creates units for each arg_crypttab entry. Failing to open the file is not fatal; only ENOENT goes unlogged. */
+static int add_crypttab_devices(void) {
+        _cleanup_fclose_ FILE *f = NULL;
+        unsigned crypttab_line = 0;
+        int r;
+
+        if (!arg_read_crypttab)
+                return 0;
+
+        r = fopen_unlocked(arg_crypttab, "re", &f);
+        if (r < 0 && r != -ENOENT) /* go by the returned error code, like every other call here, not by errno */
+                log_error_errno(r, "Failed to open %s: %m", arg_crypttab);
+        if (r < 0)
+                return 0;
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL, *name = NULL, *device = NULL, *keyspec = NULL, *options = NULL,
+                                    *keyfile = NULL, *keydev = NULL, *headerdev = NULL, *filtered_header = NULL;
+                crypto_device *d = NULL;
+                char *uuid;
+                int k;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read %s: %m", arg_crypttab);
+                if (r == 0)
+                        break;
+
+                crypttab_line++;
+                /* Skip empty lines and comment lines */
+                if (IN_SET(line[0], 0, '#'))
+                        continue;
+
+                k = sscanf(line, "%ms %ms %ms %ms", &name, &device, &keyspec, &options);
+                if (k < 2 || k > 4) {
+                        log_error("Failed to parse %s:%u, ignoring.", arg_crypttab, crypttab_line);
+                        continue;
+                }
+                /* Match against kernel command line devices: UUID=, a by-uuid path, or a "luks-<uuid>" name */
+                uuid = startswith(device, "UUID=");
+                if (!uuid)
+                        uuid = path_startswith(device, "/dev/disk/by-uuid/");
+                if (!uuid)
+                        uuid = startswith(name, "luks-");
+                if (uuid)
+                        d = hashmap_get(arg_disks, uuid);
+
+                if (arg_allow_list && !d) {
+                        log_info("Not creating device '%s' because it was not specified on the kernel command line.", name);
+                        continue;
+                }
+
+                r = split_locationspec(keyspec, &keyfile, &keydev);
+                if (r < 0)
+                        return r;
+                /* Options from the kernel command line, if any, are used instead of the ones in crypttab */
+                if (options && (!d || !d->options)) {
+                        r = filter_header_device(options, &headerdev, &filtered_header);
+                        if (r < 0)
+                                return r;
+                        free_and_replace(options, filtered_header);
+                }
+
+                r = create_disk(name,
+                                device,
+                                keyfile,
+                                keydev,
+                                (d && d->options) ? d->headerdev : headerdev,
+                                (d && d->options) ? d->options : options,
+                                arg_crypttab);
+                if (r < 0)
+                        return r;
+                /* Covered here, so add_proc_cmdline_devices() will skip this entry */
+                if (d)
+                        d->create = false;
+        }
+
+        return 0;
+}
+
+/* Creates units for all arg_disks entries that are still flagged for creation. */
+static int add_proc_cmdline_devices(void) {
+        crypto_device *disk;
+        int r;
+
+        HASHMAP_FOREACH(disk, arg_disks) {
+                _cleanup_free_ char *by_uuid = NULL;
+
+                if (!disk->create)
+                        continue;
+
+                if (!disk->name) {
+                        disk->name = strjoin("luks-", disk->uuid); /* default volume name */
+                        if (!disk->name)
+                                return log_oom();
+                }
+
+                by_uuid = strjoin("UUID=", disk->uuid);
+                if (!by_uuid)
+                        return log_oom();
+
+                r = create_disk(disk->name,
+                                disk->datadev ?: by_uuid,
+                                disk->keyfile ?: arg_default_keyfile,
+                                disk->keydev, disk->headerdev,
+                                disk->options ?: arg_default_options,
+                                "/proc/cmdline");
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(crypt_device_hash_ops, char, string_hash_func, string_compare_func,
+                                              crypto_device, crypt_device_free); /* ops that run() hands to hashmap_new() for arg_disks */
+
+/* Generator body: units are written below dest; dest_early and dest_late are not used here. */
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        int r;
+
+        assert_se(arg_dest = dest);
+
+        /* Both locations can be overridden through the environment */
+        arg_crypttab = getenv("SYSTEMD_CRYPTTAB") ?: "/etc/crypttab";
+        arg_runtime_directory = getenv("RUNTIME_DIRECTORY") ?: "/run/systemd/cryptsetup";
+
+        arg_disks = hashmap_new(&crypt_device_hash_ops);
+        if (!arg_disks)
+                return log_oom();
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, PROC_CMDLINE_STRIP_RD_PREFIX);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse kernel command line: %m");
+
+        /* The luks= switch can turn the generator off altogether */
+        if (!arg_enabled)
+                return 0;
+
+        /* crypttab goes first: it clears the create flag of the command line devices it covers */
+        r = add_crypttab_devices();
+        if (r < 0)
+                return r;
+
+        return add_proc_cmdline_devices();
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run); /* NOTE(review): presumably provides main() and dispatches to run(); the macro is defined outside this file, confirm */
diff --git a/src/cryptsetup/cryptsetup-keyfile.c b/src/cryptsetup/cryptsetup-keyfile.c
new file mode 100644
index 0000000..1867e90
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-keyfile.c
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cryptsetup-keyfile.h"
+#include "fileio.h"
+#include "path-util.h"
+#include "strv.h"
+
+int find_key_file(
+                const char *key_file,
+                char **search_path,
+                const char *bindname,
+                void **ret_key,
+                size_t *ret_key_size) {
+        /* Loads key_file into *ret_key: returns 1 when loaded, 0 when the search path holds no such file */
+        int r;
+
+        assert(key_file);
+        assert(ret_key);
+        assert(ret_key_size);
+
+        /* Without a search path, or with an absolute path, the file is loaded exactly as named */
+        if (strv_isempty(search_path) || path_is_absolute(key_file)) {
+                r = read_full_file_full(
+                                AT_FDCWD, key_file, UINT64_MAX, SIZE_MAX,
+                                READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET,
+                                bindname,
+                                (char**) ret_key, ret_key_size);
+                if (r >= 0)
+                        return 1;
+                if (r == -E2BIG)
+                        return log_error_errno(r, "Key file '%s' too large.", key_file);
+
+                return log_error_errno(r, "Failed to load key file '%s': %m", key_file);
+        }
+
+        /* Otherwise try the file name below each search directory in turn */
+        STRV_FOREACH(dir, search_path) {
+                _cleanup_free_ char *candidate = path_join(*dir, key_file);
+                if (!candidate)
+                        return log_oom();
+
+                r = read_full_file_full(
+                                AT_FDCWD, candidate, UINT64_MAX, SIZE_MAX,
+                                READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET,
+                                bindname,
+                                (char**) ret_key, ret_key_size);
+                if (r >= 0)
+                        return 1;
+                if (r == -ENOENT)
+                        continue; /* not in this directory, try the next one */
+                if (r != -E2BIG)
+                        return log_error_errno(r, "Failed to load key file '%s': %m", key_file);
+
+                /* An oversized candidate is skipped with a warning instead of ending the search */
+                log_warning_errno(r, "Key file '%s' too large, ignoring.", key_file);
+        }
+
+        /* No directory had the file: that is not a failure, the caller gets a NULL key and size 0 */
+        *ret_key = NULL;
+        *ret_key_size = 0;
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-keyfile.h b/src/cryptsetup/cryptsetup-keyfile.h
new file mode 100644
index 0000000..83bd1fb
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-keyfile.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+
+int find_key_file(
+                const char *key_file, /* path or file name of the key file; must not be NULL */
+                char **search_path, /* directories tried in order when key_file is not absolute; may be empty */
+                const char *bindname, /* handed through to read_full_file_full() */
+                void **ret_key, /* receives the key data, or NULL when no directory had the file */
+                size_t *ret_key_size); /* receives the key size; the call returns 1 if loaded, 0 if not found, < 0 on error */
diff --git a/src/cryptsetup/cryptsetup-pkcs11.c b/src/cryptsetup/cryptsetup-pkcs11.c
new file mode 100644
index 0000000..f991389
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-pkcs11.c
@@ -0,0 +1,173 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "cryptsetup-pkcs11.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "parse-util.h"
+#include "pkcs11-util.h"
+#include "random-util.h"
+#include "stat-util.h"
+#include "strv.h"
+
+int decrypt_pkcs11_key(
+                const char *volume_name,
+                const char *friendly_name,
+                const char *pkcs11_uri,
+                const char *key_file,         /* We either expect key_file and associated parameters to be set (for file keys) … */
+                size_t key_file_size,
+                uint64_t key_file_offset,
+                const void *key_data,         /* … or key_data and key_data_size (for literal keys) */
+                size_t key_data_size,
+                usec_t until,
+                bool headless,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+
+        _cleanup_(pkcs11_crypt_device_callback_data_release) pkcs11_crypt_device_callback_data data = {
+                .friendly_name = friendly_name,
+                .until = until,
+                .headless = headless,
+        };
+        int r;
+
+        assert(friendly_name);
+        assert(pkcs11_uri);
+        assert(key_file || key_data);
+        assert(ret_decrypted_key);
+        assert(ret_decrypted_key_size);
+
+        /* Decrypts the encrypted key (key_data, or else the contents of key_file) via the token matching pkcs11_uri.
+         * pkcs11_find_token() is expected to log its own errors, except EAGAIN ("token not found right now"). */
+        if (key_data) {
+                data.encrypted_key = (void*) key_data; /* caller's buffer: free_encrypted_key stays false */
+                data.encrypted_key_size = key_data_size;
+
+                data.free_encrypted_key = false;
+        } else {
+                _cleanup_free_ char *bindname = NULL;
+
+                /* If we read the key via AF_UNIX, make this client recognizable */
+                if (asprintf(&bindname, "@%" PRIx64"/cryptsetup-pkcs11/%s", random_u64(), volume_name) < 0)
+                        return log_oom();
+
+                r = read_full_file_full(
+                                AT_FDCWD, key_file,
+                                key_file_offset == 0 ? UINT64_MAX : key_file_offset, /* 0 is treated as unset */
+                                key_file_size == 0 ? SIZE_MAX : key_file_size,       /* 0 is treated as unset */
+                                READ_FULL_FILE_CONNECT_SOCKET,
+                                bindname,
+                                (char**) &data.encrypted_key, &data.encrypted_key_size);
+                if (r < 0) /* report the failure instead of returning it silently */
+                        return log_error_errno(r, "Failed to load PKCS#11 key file '%s': %m", key_file);
+
+                data.free_encrypted_key = true;
+        }
+
+        r = pkcs11_find_token(pkcs11_uri, pkcs11_crypt_device_callback, &data);
+        if (r < 0)
+                return r;
+
+        *ret_decrypted_key = TAKE_PTR(data.decrypted_key); /* ownership moves to the caller */
+        *ret_decrypted_key_size = data.decrypted_key_size;
+
+        return 0;
+}
+
+int find_pkcs11_auto_data(
+                struct crypt_device *cd,
+                char **ret_uri,
+                void **ret_encrypted_key,
+                size_t *ret_encrypted_key_size,
+                int *ret_keyslot) {
+
+        _cleanup_free_ char *uri = NULL;
+        _cleanup_free_ void *key = NULL;
+        int r, keyslot = -1;
+        size_t key_size = 0;
+
+        assert(cd);
+        assert(ret_uri);
+        assert(ret_encrypted_key);
+        assert(ret_encrypted_key_size);
+        assert(ret_keyslot);
+
+        /* Loads PKCS#11 metadata from LUKS2 JSON token headers. Exactly one usable "systemd-pkcs11" token must exist:
+         * a second one fails with ENOTUNIQ, none at all with ENXIO. Returned URI and key are owned by the caller. */
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                JsonVariant *w;
+                int ks;
+
+                r = cryptsetup_get_token_as_json(cd, token, "systemd-pkcs11", &v);
+                if (IN_SET(r, -ENOENT, -EINVAL, -EMEDIUMTYPE)) /* these results just skip the token slot */
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read JSON token data off disk: %m");
+
+                ks = cryptsetup_get_keyslot_from_token(v);
+                if (ks < 0) {
+                        /* Handle parsing errors of the keyslots field gracefully, since it's not 'owned' by
+                         * us, but by the LUKS2 spec */
+                        log_warning_errno(ks, "Failed to extract keyslot index from PKCS#11 JSON data token %i, skipping: %m", token);
+                        continue;
+                }
+
+                if (uri) /* an earlier iteration already found a token */
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+                                               "Multiple PKCS#11 tokens enrolled, cannot automatically determine token.");
+
+                assert(keyslot < 0);
+                keyslot = ks;
+
+                w = json_variant_by_key(v, "pkcs11-uri");
+                if (!w || !json_variant_is_string(w))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "PKCS#11 token data lacks 'pkcs11-uri' field.");
+
+                uri = strdup(json_variant_string(w));
+                if (!uri)
+                        return log_oom();
+
+                if (!pkcs11_uri_valid(uri))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "PKCS#11 token data contains invalid PKCS#11 URI.");
+
+                w = json_variant_by_key(v, "pkcs11-key");
+                if (!w || !json_variant_is_string(w))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "PKCS#11 token data lacks 'pkcs11-key' field.");
+
+                assert(!key);
+                assert(key_size == 0);
+                r = unbase64mem(json_variant_string(w), SIZE_MAX, &key, &key_size);
+                if (r < 0) /* include %m so the errno passed in actually shows up in the message */
+                        return log_error_errno(r, "Failed to decode base64 encoded key: %m");
+        }
+
+        if (!uri)
+                return log_error_errno(SYNTHETIC_ERRNO(ENXIO),
+                                       "No valid PKCS#11 token data found.");
+
+        log_info("Automatically discovered security PKCS#11 token '%s' unlocks volume.", uri);
+
+        *ret_uri = TAKE_PTR(uri);
+        *ret_encrypted_key = TAKE_PTR(key);
+        *ret_encrypted_key_size = key_size;
+        *ret_keyslot = keyslot;
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-pkcs11.h b/src/cryptsetup/cryptsetup-pkcs11.h
new file mode 100644
index 0000000..256c09a
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-pkcs11.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "cryptsetup-util.h"
+#include "log.h"
+#include "time-util.h"
+
+#if HAVE_P11KIT /* prototypes only; cryptsetup-pkcs11.c holds the definitions */
+
+int decrypt_pkcs11_key(
+                const char *volume_name,
+                const char *friendly_name,
+                const char *pkcs11_uri,
+                const char *key_file,
+                size_t key_file_size,
+                uint64_t key_file_offset,
+                const void *key_data,
+                size_t key_data_size,
+                usec_t until,
+                bool headless,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size);
+
+int find_pkcs11_auto_data(
+                struct crypt_device *cd,
+                char **ret_uri,
+                void **ret_encrypted_key,
+                size_t *ret_encrypted_key_size,
+                int *ret_keyslot);
+
+#else /* !HAVE_P11KIT: same signatures, but each stub only logs and fails with EOPNOTSUPP */
+
+static inline int decrypt_pkcs11_key(
+                const char *volume_name,
+                const char *friendly_name,
+                const char *pkcs11_uri,
+                const char *key_file,
+                size_t key_file_size,
+                uint64_t key_file_offset,
+                const void *key_data,
+                size_t key_data_size,
+                usec_t until,
+                bool headless,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 Token support not available.");
+}
+
+static inline int find_pkcs11_auto_data(
+                struct crypt_device *cd,
+                char **ret_uri,
+                void **ret_encrypted_key,
+                size_t *ret_encrypted_key_size,
+                int *ret_keyslot) {
+
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 Token support not available.");
+}
+
+#endif /* HAVE_P11KIT */
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-fido2.c b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-fido2.c
new file mode 100644
index 0000000..fdb3b17
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-fido2.c
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "cryptsetup-token.h"
+#include "cryptsetup-token-util.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "luks2-fido2.h"
+#include "memory-util.h"
+#include "version.h"
+
+#define TOKEN_NAME "systemd-fido2"
+#define TOKEN_VERSION_MAJOR "1"
+#define TOKEN_VERSION_MINOR "0"
+
+/* Returns the plugin version followed by the systemd version strings; for libcryptsetup debug purposes */
+_public_ const char *cryptsetup_token_version(void) {
+        return TOKEN_VERSION_MAJOR "." TOKEN_VERSION_MINOR " systemd-v" STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")";
+}
+
+_public_ int cryptsetup_token_open_pin(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                int token /* is always >= 0 */,
+                const char *pin,
+                size_t pin_size,
+                char **password, /* freed by cryptsetup_token_buffer_free */
+                size_t *password_len,
+                void *usrptr /* plugin defined parameter passed to crypt_activate_by_token*() API */) {
+
+        int r;
+        const char *json;
+        _cleanup_(erase_and_freep) char *pin_string = NULL; /* normalized PIN, cleaned up via erase_and_freep */
+
+        assert(!pin || pin_size); /* a PIN, if supplied, must not be empty */
+        assert(token >= 0);
+
+        /* This must not fail at this moment (internal error) */
+        r = crypt_token_json_get(cd, token, &json);
+        /* Use assert_se() here to avoid emitting warning with -DNDEBUG */
+        assert_se(token == r);
+        assert(json);
+
+        r = crypt_normalize_pin(pin, pin_size, &pin_string);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Cannot normalize PIN: %m");
+
+        /* Hand the token's JSON and the normalized PIN to the FIDO2 helper; usrptr is passed on as a string */
+        return acquire_luks2_key(cd, json, (const char *)usrptr, pin_string, password, password_len);
+}
+
+/*
+ * This function is called from within following libcryptsetup calls
+ * provided conditions further below are met:
+ *
+ * crypt_activate_by_token(), crypt_activate_by_token_type(type == 'systemd-fido2'):
+ *
+ * - token is assigned to at least one luks2 keyslot eligible to activate LUKS2 device
+ *   (alternatively: name is set to null, flags contains CRYPT_ACTIVATE_ALLOW_UNBOUND_KEY
+ *    and token is assigned to at least single keyslot).
+ *
+ * - if plugin defines validate function (see cryptsetup_token_validate below) it must have
+ *   passed the check (aka return 0)
+ */
+_public_ int cryptsetup_token_open(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                int token /* is always >= 0 */,
+                char **password, /* freed by cryptsetup_token_buffer_free */
+                size_t *password_len,
+                void *usrptr /* plugin defined parameter passed to crypt_activate_by_token*() API */) {
+
+        return cryptsetup_token_open_pin(cd, token, NULL, 0, password, password_len, usrptr); /* same path, no PIN */
+}
+
+/*
+ * libcryptsetup callback for memory deallocation of 'password' parameter passed in
+ * any crypt_token_open_* plugin function
+ */
+_public_ void cryptsetup_token_buffer_free(void *buffer, size_t buffer_len) {
+        erase_and_free(buffer); /* buffer_len is not used here */
+}
+
+/*
+ * prints systemd-fido2 token content in crypt_dump().
+ * 'type' and 'keyslots' fields are printed by libcryptsetup
+ */
+_public_ void cryptsetup_token_dump(
+                struct crypt_device *cd /* is always LUKS2 context */,
+                const char *json /* validated 'systemd-fido2' token if cryptsetup_token_validate is defined */) {
+
+        int r;
+        Fido2EnrollFlags required;
+        size_t cid_size, salt_size;
+        const char *client_pin_req_str, *up_req_str, *uv_req_str; /* NULL means: do not print the field */
+        _cleanup_free_ void *cid = NULL, *salt = NULL;
+        _cleanup_free_ char *rp_id = NULL, *cid_str = NULL, *salt_str = NULL;
+
+        assert(json);
+
+        r = parse_luks2_fido2_data(cd, json, &rp_id, &salt, &salt_size, &cid, &cid_size, &required);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Failed to parse " TOKEN_NAME " metadata: %m.");
+
+        r = crypt_dump_buffer_to_hex_string(cid, cid_size, &cid_str);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Cannot dump " TOKEN_NAME " content: %m");
+
+        r = crypt_dump_buffer_to_hex_string(salt, salt_size, &salt_str);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Cannot dump " TOKEN_NAME " content: %m");
+
+        /* Map each requirement flag pair to "true", "false" or NULL (field omitted from the dump) */
+        if (required & FIDO2ENROLL_PIN)
+                client_pin_req_str = "true";
+        else if (required & FIDO2ENROLL_PIN_IF_NEEDED)
+                client_pin_req_str = NULL;
+        else
+                client_pin_req_str = "false";
+
+        if (required & FIDO2ENROLL_UP)
+                up_req_str = "true";
+        else if (required & FIDO2ENROLL_UP_IF_NEEDED)
+                up_req_str = NULL;
+        else
+                up_req_str = "false";
+
+        if (required & FIDO2ENROLL_UV)
+                uv_req_str = "true";
+        else if (required & FIDO2ENROLL_UV_OMIT)
+                uv_req_str = NULL;
+        else
+                uv_req_str = "false";
+
+        crypt_log(cd, "\tfido2-credential:" CRYPT_DUMP_LINE_SEP "%s\n", cid_str);
+        crypt_log(cd, "\tfido2-salt: %s\n", salt_str);
+
+        /* optional fields */
+        if (rp_id)
+                crypt_log(cd, "\tfido2-rp:   %s\n", rp_id);
+        if (client_pin_req_str)
+                crypt_log(cd, "\tfido2-clientPin-required:" CRYPT_DUMP_LINE_SEP "%s\n",
+                          client_pin_req_str);
+        if (up_req_str)
+                crypt_log(cd, "\tfido2-up-required:" CRYPT_DUMP_LINE_SEP "%s\n", up_req_str);
+        if (uv_req_str)
+                crypt_log(cd, "\tfido2-uv-required:" CRYPT_DUMP_LINE_SEP "%s\n", uv_req_str);
+}
+
+/*
+ * Note:
+ *   If the plugin is available in the library path, it is called before the following libcryptsetup calls:
+ *
+ *   crypt_token_json_set, crypt_dump, any crypt_activate_by_token_* flavour
+ */
+_public_ int cryptsetup_token_validate(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                const char *json /* contains valid 'type' and 'keyslots' fields. 'type' is 'systemd-fido2' */) {
+
+        int r;
+        JsonVariant *w;
+       _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+        assert(json);
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Could not parse " TOKEN_NAME " json object: %m.");
+
+        w = json_variant_by_key(v, "fido2-credential");
+        if (!w || !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "FIDO2 token data lacks 'fido2-credential' field.");
+                return 1; /* non-zero: validation failed (0 is the only passing result) */
+        }
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, NULL, NULL); /* NULL outputs: only the result code is used */
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Invalid base64 data in 'fido2-credential' field: %m");
+
+        w = json_variant_by_key(v, "fido2-salt");
+        if (!w || !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "FIDO2 token data lacks 'fido2-salt' field.");
+                return 1;
+        }
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, NULL, NULL); /* NULL outputs: only the result code is used */
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Failed to decode base64 encoded salt: %m.");
+
+        /* The "rp" field is optional. */
+        w = json_variant_by_key(v, "fido2-rp");
+        if (w && !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "FIDO2 token data's 'fido2-rp' field is not a string.");
+                return 1;
+        }
+
+        /* The "fido2-clientPin-required" field is optional. */
+        w = json_variant_by_key(v, "fido2-clientPin-required");
+        if (w && !json_variant_is_boolean(w)) {
+                crypt_log_debug(cd, "FIDO2 token data's 'fido2-clientPin-required' field is not a boolean.");
+                return 1;
+        }
+
+        /* The "fido2-up-required" field is optional. */
+        w = json_variant_by_key(v, "fido2-up-required");
+        if (w && !json_variant_is_boolean(w)) {
+                crypt_log_debug(cd, "FIDO2 token data's 'fido2-up-required' field is not a boolean.");
+                return 1;
+        }
+
+        /* The "fido2-uv-required" field is optional. */
+        w = json_variant_by_key(v, "fido2-uv-required");
+        if (w && !json_variant_is_boolean(w)) {
+                crypt_log_debug(cd, "FIDO2 token data's 'fido2-uv-required' field is not a boolean.");
+                return 1;
+        }
+
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-pkcs11.c b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-pkcs11.c
new file mode 100644
index 0000000..2ac8a27
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-pkcs11.c
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "cryptsetup-token.h"
+#include "cryptsetup-token-util.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "luks2-pkcs11.h"
+#include "memory-util.h"
+#include "pkcs11-util.h"
+#include "version.h"
+
+#define TOKEN_NAME "systemd-pkcs11"
+#define TOKEN_VERSION_MAJOR "1"
+#define TOKEN_VERSION_MINOR "0"
+
+/* Returns the plugin version followed by the systemd version strings; for libcryptsetup debug purposes */
+_public_ const char *cryptsetup_token_version(void) {
+        return TOKEN_VERSION_MAJOR "." TOKEN_VERSION_MINOR " systemd-v" STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")";
+}
+
+_public_ int cryptsetup_token_open_pin(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                int token /* is always >= 0 */,
+                const char *pin,
+                size_t pin_size,
+                char **password, /* freed by cryptsetup_token_buffer_free */
+                size_t *password_len,
+                void *usrptr /* plugin defined parameter passed to crypt_activate_by_token*() API */) {
+
+        const char *json;
+        int r;
+
+        assert(!pin || pin_size); /* a PIN, if supplied, must not be empty */
+        assert(token >= 0);
+
+        /* This must not fail at this moment (internal error) */
+        r = crypt_token_json_get(cd, token, &json);
+        /* Use assert_se() here to avoid emitting warning with -DNDEBUG */
+        assert_se(token == r);
+        assert(json);
+
+        /* The PIN is handed on as given (pointer plus size), together with the token's JSON and usrptr */
+        return acquire_luks2_key(cd, json, usrptr, pin, pin_size, password, password_len);
+}
+
+/*
+ * This function is called from within following libcryptsetup calls
+ * provided conditions further below are met:
+ *
+ * crypt_activate_by_token(), crypt_activate_by_token_type(type == 'systemd-pkcs11'):
+ *
+ * - token is assigned to at least one luks2 keyslot eligible to activate LUKS2 device
+ *   (alternatively: name is set to null, flags contains CRYPT_ACTIVATE_ALLOW_UNBOUND_KEY
+ *    and token is assigned to at least single keyslot).
+ *
+ * - if plugin defines validate function (see cryptsetup_token_validate below) it must have
+ *   passed the check (aka return 0)
+ */
+_public_ int cryptsetup_token_open(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                int token /* is always >= 0 */,
+                char **password, /* freed by cryptsetup_token_buffer_free */
+                size_t *password_len,
+                void *usrptr /* plugin defined parameter passed to crypt_activate_by_token*() API */) {
+
+        return cryptsetup_token_open_pin(cd, token, NULL, 0, password, password_len, usrptr); /* same path, no PIN */
+}
+
+/*
+ * libcryptsetup callback for memory deallocation of 'password' parameter passed in
+ * any crypt_token_open_* plugin function
+ */
+_public_ void cryptsetup_token_buffer_free(void *buffer, size_t buffer_len) {
+        erase_and_free(buffer); /* buffer_len is not used here */
+}
+
+/*
+ * prints systemd-pkcs11 token content in crypt_dump().
+ * 'type' and 'keyslots' fields are printed by libcryptsetup
+ */
+_public_ void cryptsetup_token_dump(
+                struct crypt_device *cd /* is always LUKS2 context */,
+                const char *json /* validated 'systemd-pkcs11' token if cryptsetup_token_validate is defined */) {
+
+        int r;
+        size_t pkcs11_key_size;
+        _cleanup_free_ char *pkcs11_uri = NULL, *key_str = NULL;
+        _cleanup_free_ void *pkcs11_key = NULL;
+
+        r = parse_luks2_pkcs11_data(cd, json, &pkcs11_uri, &pkcs11_key, &pkcs11_key_size);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Failed to parse " TOKEN_NAME " metadata: %m.");
+
+        r = crypt_dump_buffer_to_hex_string(pkcs11_key, pkcs11_key_size, &key_str); /* key is printed in hex form */
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Cannot dump " TOKEN_NAME " content: %m");
+
+        crypt_log(cd, "\tpkcs11-uri: %s\n", pkcs11_uri);
+        crypt_log(cd, "\tpkcs11-key: %s\n", key_str);
+}
+
+/*
+ * Note:
+ *   If the plugin is available in the library path, it is called before the following libcryptsetup calls:
+ *
+ *   crypt_token_json_set, crypt_dump, any crypt_activate_by_token_* flavour
+ */
+_public_ int cryptsetup_token_validate(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                const char *json /* contains valid 'type' and 'keyslots' fields. 'type' is 'systemd-pkcs11' */) {
+
+        int r;
+        JsonVariant *w;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Could not parse " TOKEN_NAME " json object: %m.");
+
+        w = json_variant_by_key(v, "pkcs11-uri");
+        if (!w || !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "PKCS#11 token data lacks 'pkcs11-uri' field.");
+                return 1; /* non-zero: validation failed (0 is the only passing result) */
+        }
+
+        if (!pkcs11_uri_valid(json_variant_string(w))) {
+                crypt_log_debug(cd, "PKCS#11 token data contains invalid PKCS#11 URI.");
+                return 1;
+        }
+
+        w = json_variant_by_key(v, "pkcs11-key");
+        if (!w || !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "PKCS#11 token data lacks 'pkcs11-key' field.");
+                return 1;
+        }
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, NULL, NULL); /* NULL outputs: only the result code is used */
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Failed to decode base64 encoded key: %m.");
+
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-tpm2.c b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-tpm2.c
new file mode 100644
index 0000000..6fee831
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-systemd-tpm2.c
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "cryptsetup-token.h"
+#include "cryptsetup-token-util.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "luks2-tpm2.h"
+#include "memory-util.h"
+#include "strv.h"
+#include "tpm2-util.h"
+#include "version.h"
+
+#define TOKEN_NAME "systemd-tpm2"
+#define TOKEN_VERSION_MAJOR "1"
+#define TOKEN_VERSION_MINOR "0"
+
+/* Returns the plugin version followed by the systemd version strings; for libcryptsetup debug purposes */
+_public_ const char *cryptsetup_token_version(void) {
+
+        return TOKEN_VERSION_MAJOR "." TOKEN_VERSION_MINOR " systemd-v" STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")";
+}
+
+static int log_debug_open_error(struct crypt_device *cd, int r) { /* debug-logs an open failure, by error code */
+        if (r == -EAGAIN)
+                return crypt_log_debug_errno(cd, r, "TPM2 device not found.");
+        if (r == -ENXIO)
+                return crypt_log_debug_errno(cd, r, "No matching TPM2 token data found.");
+
+        return crypt_log_debug_errno(cd, r, TOKEN_NAME " open failed: %m."); /* any other error: generic message */
+}
+
+_public_ int cryptsetup_token_open_pin(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                int token /* is always >= 0 */,
+                const char *pin,
+                size_t pin_size,
+                char **ret_password, /* freed by cryptsetup_token_buffer_free */
+                size_t *ret_password_len,
+                void *usrptr /* plugin defined parameter passed to crypt_activate_by_token*() API */) {
+
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL, *pin_string = NULL;
+        _cleanup_free_ void *blob = NULL, *pubkey = NULL, *policy_hash = NULL, *salt = NULL, *srk_buf = NULL;
+        size_t blob_size, policy_hash_size, decrypted_key_size, pubkey_size, salt_size = 0, srk_buf_size = 0;
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        uint32_t hash_pcr_mask, pubkey_pcr_mask;
+        systemd_tpm2_plugin_params params = {
+                .search_pcr_mask = UINT32_MAX /* UINT32_MAX: the PCR mask check below is skipped */
+        };
+        uint16_t pcr_bank, primary_alg;
+        ssize_t base64_encoded_size;
+        TPM2Flags flags = 0;
+        const char *json;
+        int r;
+
+        assert(token >= 0);
+        assert(!pin || pin_size > 0); /* a PIN, if supplied, must not be empty */
+        assert(ret_password);
+        assert(ret_password_len);
+
+        /* This must not fail at this moment (internal error) */
+        r = crypt_token_json_get(cd, token, &json);
+        assert(token == r);
+        assert(json);
+
+        r = crypt_normalize_pin(pin, pin_size, &pin_string);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Cannot normalize PIN: %m");
+
+        if (usrptr) /* caller-supplied parameters replace the defaults set above */
+                params = *(systemd_tpm2_plugin_params *)usrptr;
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Failed to parse token JSON data: %m");
+
+        r = tpm2_parse_luks2_json(
+                        v,
+                        NULL,
+                        &hash_pcr_mask,
+                        &pcr_bank,
+                        &pubkey,
+                        &pubkey_size,
+                        &pubkey_pcr_mask,
+                        &primary_alg,
+                        &blob,
+                        &blob_size,
+                        &policy_hash,
+                        &policy_hash_size,
+                        &salt,
+                        &salt_size,
+                        &srk_buf,
+                        &srk_buf_size,
+                        &flags);
+        if (r < 0)
+                return log_debug_open_error(cd, r);
+
+        /* If the caller asked for a specific PCR mask, refuse tokens enrolled with a different one */
+        if (params.search_pcr_mask != UINT32_MAX && hash_pcr_mask != params.search_pcr_mask)
+                return crypt_log_debug_errno(cd, ENXIO, "PCR mask doesn't match expectation (%" PRIu32 " vs. %" PRIu32 ")", hash_pcr_mask, params.search_pcr_mask);
+
+        r = acquire_luks2_key(
+                        params.device,
+                        hash_pcr_mask,
+                        pcr_bank,
+                        pubkey, pubkey_size,
+                        pubkey_pcr_mask,
+                        params.signature_path,
+                        pin_string,
+                        params.pcrlock_path,
+                        primary_alg,
+                        blob,
+                        blob_size,
+                        policy_hash,
+                        policy_hash_size,
+                        salt,
+                        salt_size,
+                        srk_buf,
+                        srk_buf_size,
+                        flags,
+                        &decrypted_key,
+                        &decrypted_key_size);
+        if (r < 0)
+                return log_debug_open_error(cd, r);
+
+        /* Before using this key as passphrase we base64 encode it, for compat with homed */
+        base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return log_debug_open_error(cd, base64_encoded_size);
+
+        /* free'd automatically by libcryptsetup */
+        *ret_password = TAKE_PTR(base64_encoded);
+        *ret_password_len = base64_encoded_size;
+
+        return 0;
+}
+
+/*
+ * This function is called from within following libcryptsetup calls
+ * provided conditions further below are met:
+ *
+ * crypt_activate_by_token(), crypt_activate_by_token_type(type == 'systemd-tpm2'):
+ *
+ * - token is assigned to at least one luks2 keyslot eligible to activate LUKS2 device
+ *   (alternatively: name is set to null, flags contains CRYPT_ACTIVATE_ALLOW_UNBOUND_KEY
+ *    and token is assigned to at least single keyslot).
+ *
+ * - if plugin defines validate function (see cryptsetup_token_validate below) it must have
+ *   passed the check (aka return 0)
+ */
+_public_ int cryptsetup_token_open(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                int token /* is always >= 0 */,
+                char **ret_password, /* freed by cryptsetup_token_buffer_free */
+                size_t *ret_password_len,
+                void *usrptr /* plugin defined parameter passed to crypt_activate_by_token*() API */) {
+
+        return cryptsetup_token_open_pin(cd, token, NULL, 0, ret_password, ret_password_len, usrptr); /* same path, no PIN */
+}
+
+/*
+ * libcryptsetup callback for memory deallocation of 'password' parameter passed in
+ * any crypt_token_open_* plugin function
+ */
+_public_ void cryptsetup_token_buffer_free(void *buffer, size_t buffer_len) {
+        erase_and_free(buffer); /* buffer_len is not used here */
+}
+
+/*
+ * prints systemd-tpm2 token content in crypt_dump(). 'type' and 'keyslots' fields are printed by
+ * libcryptsetup. On any parse or formatting failure a debug message is logged and nothing is printed.
+ */
+_public_ void cryptsetup_token_dump(
+                struct crypt_device *cd /* is always LUKS2 context */,
+                const char *json /* validated 'systemd-tpm2' token if cryptsetup_token_validate is defined */) {
+
+        _cleanup_free_ char *hash_pcrs_str = NULL, *pubkey_pcrs_str = NULL, *blob_str = NULL, *policy_hash_str = NULL, *pubkey_str = NULL;
+        _cleanup_free_ void *blob = NULL, *pubkey = NULL, *policy_hash = NULL, *salt = NULL, *srk_buf = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        size_t blob_size, policy_hash_size, pubkey_size, salt_size = 0, srk_buf_size = 0;
+        uint32_t hash_pcr_mask, pubkey_pcr_mask;
+        uint16_t pcr_bank, primary_alg;
+        TPM2Flags flags = 0;
+        int r;
+
+        assert(json);
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Failed to parse " TOKEN_NAME " JSON object: %m");
+
+        r = tpm2_parse_luks2_json(
+                        v,
+                        NULL,
+                        &hash_pcr_mask,
+                        &pcr_bank,
+                        &pubkey,
+                        &pubkey_size,
+                        &pubkey_pcr_mask,
+                        &primary_alg,
+                        &blob,
+                        &blob_size,
+                        &policy_hash,
+                        &policy_hash_size,
+                        &salt,
+                        &salt_size,
+                        &srk_buf,
+                        &srk_buf_size,
+                        &flags);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Failed to parse " TOKEN_NAME " JSON fields: %m");
+
+        hash_pcrs_str = tpm2_pcr_mask_to_string(hash_pcr_mask);
+        if (!hash_pcrs_str)
+                return (void) crypt_log_debug_errno(cd, ENOMEM, "Cannot format PCR hash mask: %m");
+
+        pubkey_pcrs_str = tpm2_pcr_mask_to_string(pubkey_pcr_mask);
+        if (!pubkey_pcrs_str)
+                return (void) crypt_log_debug_errno(cd, ENOMEM, "Cannot format public key PCR mask: %m");
+
+        r = crypt_dump_buffer_to_hex_string(blob, blob_size, &blob_str);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Cannot dump " TOKEN_NAME " content: %m");
+
+        r = crypt_dump_buffer_to_hex_string(pubkey, pubkey_size, &pubkey_str);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Cannot dump " TOKEN_NAME " content: %m");
+
+        r = crypt_dump_buffer_to_hex_string(policy_hash, policy_hash_size, &policy_hash_str);
+        if (r < 0)
+                return (void) crypt_log_debug_errno(cd, r, "Cannot dump " TOKEN_NAME " content: %m");
+        /* The hex strings stay NULL for zero-length fields (e.g. no public key enrolled), hence strna() */
+        crypt_log(cd, "\ttpm2-hash-pcrs:   %s\n", strna(hash_pcrs_str));
+        crypt_log(cd, "\ttpm2-pcr-bank:    %s\n", strna(tpm2_hash_alg_to_string(pcr_bank)));
+        crypt_log(cd, "\ttpm2-pubkey:" CRYPT_DUMP_LINE_SEP "%s\n", strna(pubkey_str));
+        crypt_log(cd, "\ttpm2-pubkey-pcrs: %s\n", strna(pubkey_pcrs_str));
+        crypt_log(cd, "\ttpm2-primary-alg: %s\n", strna(tpm2_asym_alg_to_string(primary_alg)));
+        crypt_log(cd, "\ttpm2-blob:        %s\n", strna(blob_str));
+        crypt_log(cd, "\ttpm2-policy-hash:" CRYPT_DUMP_LINE_SEP "%s\n", strna(policy_hash_str));
+        crypt_log(cd, "\ttpm2-pin:         %s\n", true_false(flags & TPM2_FLAGS_USE_PIN));
+        crypt_log(cd, "\ttpm2-pcrlock:     %s\n", true_false(flags & TPM2_FLAGS_USE_PCRLOCK));
+        crypt_log(cd, "\ttpm2-salt:        %s\n", true_false(salt));
+        crypt_log(cd, "\ttpm2-srk:         %s\n", true_false(srk_buf));
+}
+
+/*
+ * Note:
+ *   If plugin is available in library path, it's called before the following libcryptsetup calls:
+ *   crypt_token_json_set, crypt_dump, any crypt_activate_by_token_* flavour
+ *   Returns 0 if valid, 1 on a missing/mistyped/out-of-range field, negative errno on undecodable data.
+ */
+_public_ int cryptsetup_token_validate(
+                struct crypt_device *cd, /* is always LUKS2 context */
+                const char *json /* contains valid 'type' and 'keyslots' fields. 'type' is 'systemd-tpm2' */) {
+
+        int r;
+        JsonVariant *w, *e;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+        assert(json);
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Could not parse " TOKEN_NAME " json object: %m");
+
+        w = json_variant_by_key(v, "tpm2-pcrs");
+        if (!w || !json_variant_is_array(w)) {
+                crypt_log_debug(cd, "TPM2 token data lacks 'tpm2-pcrs' field.");
+                return 1;
+        }
+
+        JSON_VARIANT_ARRAY_FOREACH(e, w) {
+                uint64_t u;
+
+                if (!json_variant_is_number(e)) {
+                        crypt_log_debug(cd, "TPM2 PCR is not a number.");
+                        return 1;
+                }
+
+                u = json_variant_unsigned(e);
+                if (!TPM2_PCR_INDEX_VALID(u)) {
+                        crypt_log_debug(cd, "TPM2 PCR number out of range.");
+                        return 1;
+                }
+        }
+
+        /* The bank field is optional, since it was added in systemd 250 only. Before the bank was hardcoded
+         * to SHA256. */
+        w = json_variant_by_key(v, "tpm2-pcr-bank");
+        if (w) {
+                /* The PCR bank field is optional */
+
+                if (!json_variant_is_string(w)) {
+                        crypt_log_debug(cd, "TPM2 PCR bank is not a string.");
+                        return 1;
+                }
+
+                if (tpm2_hash_alg_from_string(json_variant_string(w)) < 0) {
+                        crypt_log_debug(cd, "TPM2 PCR bank invalid or not supported: %s.", json_variant_string(w));
+                        return 1;
+                }
+        }
+
+        /* The primary key algorithm field is optional, since it was also added in systemd 250 only. Before
+         * the algorithm was hardcoded to ECC. */
+        w = json_variant_by_key(v, "tpm2-primary-alg");
+        if (w) {
+                /* The primary key algorithm is optional */
+
+                if (!json_variant_is_string(w)) {
+                        crypt_log_debug(cd, "TPM2 primary key algorithm is not a string.");
+                        return 1;
+                }
+
+                if (tpm2_asym_alg_from_string(json_variant_string(w)) < 0) {
+                        crypt_log_debug(cd, "TPM2 primary key algorithm invalid or not supported: %s", json_variant_string(w));
+                        return 1;
+                }
+        }
+
+        w = json_variant_by_key(v, "tpm2-blob");
+        if (!w || !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "TPM2 token data lacks 'tpm2-blob' field.");
+                return 1;
+        }
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, NULL, NULL);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Invalid base64 data in 'tpm2-blob' field: %m");
+
+        w = json_variant_by_key(v, "tpm2-policy-hash");
+        if (!w || !json_variant_is_string(w)) {
+                crypt_log_debug(cd, "TPM2 token data lacks 'tpm2-policy-hash' field.");
+                return 1;
+        }
+
+        r = unhexmem(json_variant_string(w), SIZE_MAX, NULL, NULL);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Invalid hex data in 'tpm2-policy-hash' field: %m");
+
+        w = json_variant_by_key(v, "tpm2-pin");
+        if (w) {
+                if (!json_variant_is_boolean(w)) {
+                        crypt_log_debug(cd, "TPM2 PIN policy is not a boolean.");
+                        return 1;
+                }
+        }
+
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.c b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.c
new file mode 100644
index 0000000..4e3090b
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.c
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cryptsetup-token-util.h"
+#include "string-util.h"
+
+int crypt_dump_buffer_to_hex_string(
+                const char *buf,
+                size_t buf_size,
+                char **ret_dump_str) {
+
+        _cleanup_free_ char *hex = NULL;
+
+        assert(buf || !buf_size);
+        assert(ret_dump_str);
+
+        /* Emit one two-digit hex pair per byte. crypt_dump() breaks the line after every 16th pair, so
+         * every 16th pair is preceded by the line separator and all others by a plain space. Note that
+         * an empty buffer leaves the returned string NULL. */
+        for (size_t pos = 0; pos < buf_size; pos++) {
+                const char *sep = (pos > 0 && pos % 16 == 0) ? CRYPT_DUMP_LINE_SEP : " ";
+                int r;
+
+                r = strextendf_with_separator(&hex, sep, "%02hhx", buf[pos]);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret_dump_str = TAKE_PTR(hex);
+
+        return 0;
+}
+
+int crypt_dump_hex_string(const char *hex_str, char **ret_dump_str) {
+
+        _cleanup_free_ char *formatted = NULL;
+        size_t n_pairs;
+
+        assert(hex_str);
+        assert(ret_dump_str);
+
+        n_pairs = strlen(hex_str) / 2;
+
+        /* Copy the digits pair by pair (a trailing odd digit is dropped). crypt_dump() breaks the line
+         * after every 16th pair, so every 16th pair is preceded by the line separator and all others
+         * by a plain space. Fewer than two digits leave the returned string NULL. */
+        for (size_t pair = 0; pair < n_pairs; pair++) {
+                const char *sep = (pair > 0 && pair % 16 == 0) ? CRYPT_DUMP_LINE_SEP : " ";
+                int r;
+
+                r = strextendf_with_separator(&formatted, sep, "%.2s", hex_str + 2 * pair);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret_dump_str = TAKE_PTR(formatted);
+
+        return 0;
+}
+
+int crypt_normalize_pin(const void *pin, size_t pin_size, char **ret_pin_string) {
+        assert(pin || pin_size == 0);
+        assert(ret_pin_string);
+
+        /* A non-empty PIN is converted by make_cstring() (with MAKE_CSTRING_ALLOW_TRAILING_NUL) */
+        if (pin_size > 0)
+                return make_cstring(pin, pin_size, MAKE_CSTRING_ALLOW_TRAILING_NUL, ret_pin_string);
+
+        *ret_pin_string = NULL; /* empty PIN: succeed without a string */
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.h b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.h
new file mode 100644
index 0000000..146beff
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token-util.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+/* crypt_dump() internal indentation magic */
+#define CRYPT_DUMP_LINE_SEP "\n\t            "
+/* Shorthands for crypt_logf() at the respective libcryptsetup log level */
+#define crypt_log_debug(cd, ...)   crypt_logf(cd, CRYPT_LOG_DEBUG,   __VA_ARGS__)
+#define crypt_log_error(cd, ...)   crypt_logf(cd, CRYPT_LOG_ERROR,   __VA_ARGS__)
+#define crypt_log_verbose(cd, ...) crypt_logf(cd, CRYPT_LOG_VERBOSE, __VA_ARGS__)
+#define crypt_log(cd, ...)         crypt_logf(cd, CRYPT_LOG_NORMAL,  __VA_ARGS__)
+/* Logs with errno temporarily set to abs(e), restores the previous errno, and evaluates to -abs(e) */
+#define crypt_log_full_errno(cd, e, lvl, ...) ({ \
+        int _e = abs(e), _s = errno; \
+        errno = _e; \
+        crypt_logf(cd, lvl, __VA_ARGS__); \
+        errno = _s; \
+        -_e; \
+})
+
+#define crypt_log_debug_errno(cd, e, ...) \
+        crypt_log_full_errno(cd, e, CRYPT_LOG_DEBUG, __VA_ARGS__)
+
+#define crypt_log_error_errno(cd, e, ...) \
+        crypt_log_full_errno(cd, e, CRYPT_LOG_ERROR, __VA_ARGS__)
+/* Logs "Not enough memory." at error level and evaluates to -ENOMEM */
+#define crypt_log_oom(cd) crypt_log_error_errno(cd, ENOMEM, "Not enough memory.")
+
+int crypt_dump_buffer_to_hex_string(
+                const char *buf,
+                size_t buf_size,
+                char **ret_dump_str);
+
+int crypt_dump_hex_string(const char *hex_str, char **ret_dump_str);
+
+int crypt_normalize_pin(const void *pin, size_t pin_size, char **ret_pin_string);
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token.h b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token.h
new file mode 100644
index 0000000..2a9d23f
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* for more information see libcryptsetup.h crypt-tokens section */
+
+const char *cryptsetup_token_version(void);
+
+int cryptsetup_token_open(struct crypt_device *cd, int token,
+        char **password, size_t *password_len, void *usrptr);
+
+int cryptsetup_token_open_pin(struct crypt_device *cd, int token,
+        const char *pin, size_t pin_size,
+        char **password, size_t *password_len, void *usrptr);
+
+void cryptsetup_token_dump(struct crypt_device *cd, const char *json);
+
+int cryptsetup_token_validate(struct crypt_device *cd, const char *json);
+
+void cryptsetup_token_buffer_free(void *buffer, size_t buffer_len);
diff --git a/src/cryptsetup/cryptsetup-tokens/cryptsetup-token.sym b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token.sym
new file mode 100644
index 0000000..730e78e
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/cryptsetup-token.sym
@@ -0,0 +1,19 @@
+/***
+  SPDX-License-Identifier: LGPL-2.1-or-later
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+***/
+
+CRYPTSETUP_TOKEN_1.0 {
+global:
+        cryptsetup_token_open;
+        cryptsetup_token_open_pin;
+        cryptsetup_token_buffer_free;
+        cryptsetup_token_validate;
+        cryptsetup_token_dump;
+        cryptsetup_token_version;
+local: *;
+};
diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-fido2.c b/src/cryptsetup/cryptsetup-tokens/luks2-fido2.c
new file mode 100644
index 0000000..a1c85e6
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/luks2-fido2.c
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "cryptsetup-token-util.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "luks2-fido2.h"
+#include "memory-util.h"
+#include "strv.h"
+
+int acquire_luks2_key(
+                struct crypt_device *cd,
+                const char *json,
+                const char *device,
+                const char *pin,
+                char **ret_keyslot_passphrase,
+                size_t *ret_keyslot_passphrase_size) {
+        /* Parses the FIDO2 token JSON, runs fido2_use_hmac_hash() with it and returns the key base64 encoded */
+        int r;
+        Fido2EnrollFlags required;
+        size_t cid_size, salt_size, decrypted_key_size;
+        _cleanup_free_ void *cid = NULL, *salt = NULL;
+        _cleanup_free_ char *rp_id = NULL;
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+        _cleanup_strv_free_erase_ char **pins = NULL;
+        ssize_t base64_encoded_size;
+
+        assert(ret_keyslot_passphrase);
+        assert(ret_keyslot_passphrase_size);
+
+        r = parse_luks2_fido2_data(cd, json, &rp_id, &salt, &salt_size, &cid, &cid_size, &required);
+        if (r < 0)
+                return r;
+        /* fido2_use_hmac_hash() below is handed a string vector, so wrap the single PIN we may have got */
+        if (pin) {
+                pins = strv_new(pin);
+                if (!pins)
+                        return crypt_log_oom(cd);
+        }
+
+        /* configured to use pin but none was provided */
+        if ((required & FIDO2ENROLL_PIN) && strv_isempty(pins))
+                return -ENOANO;
+
+        r = fido2_use_hmac_hash(
+                        device,
+                        rp_id ?: "io.systemd.cryptsetup", /* fallback when the token carries no 'fido2-rp' field */
+                        salt, salt_size,
+                        cid, cid_size,
+                        pins,
+                        required,
+                        &decrypted_key,
+                        &decrypted_key_size);
+        if (r == -ENOLCK) /* libcryptsetup returns -ENOANO also on wrong PIN */
+                r = -ENOANO;
+        if (r < 0)
+                return r;
+
+        /* Before using this key as passphrase we base64 encode it, for compat with homed */
+        base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return crypt_log_error_errno(cd, (int) base64_encoded_size, "Failed to base64 encode key: %m");
+        /* Ownership of the encoded passphrase moves to the caller; the raw key is cleaned up via erase_and_freep */
+        *ret_keyslot_passphrase = TAKE_PTR(base64_encoded);
+        *ret_keyslot_passphrase_size = base64_encoded_size;
+
+        return 0;
+}
+
+/* this function expects valid "systemd-fido2" in json; the field types are not re-checked here */
+int parse_luks2_fido2_data(
+                struct crypt_device *cd,
+                const char *json,
+                char **ret_rp_id,
+                void **ret_salt,
+                size_t *ret_salt_size,
+                void **ret_cid,
+                size_t *ret_cid_size,
+                Fido2EnrollFlags *ret_required) {
+        /* Returns 0 and hands all buffers to the caller, -EINVAL if a mandatory field is missing, or another negative errno */
+        _cleanup_free_ void *cid = NULL, *salt = NULL;
+        size_t cid_size = 0, salt_size = 0;
+        _cleanup_free_ char *rp = NULL;
+        int r;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        JsonVariant *w;
+        Fido2EnrollFlags required = 0;
+
+        assert(json);
+        assert(ret_rp_id);
+        assert(ret_salt);
+        assert(ret_salt_size);
+        assert(ret_cid);
+        assert(ret_cid_size);
+        assert(ret_required);
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return crypt_log_error_errno(cd, r, "Failed to parse JSON token data: %m");
+
+        w = json_variant_by_key(v, "fido2-credential");
+        if (!w)
+                return -EINVAL;
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, &cid, &cid_size);
+        if (r < 0)
+                return crypt_log_error_errno(cd, r, "Failed to parse 'fido2-credential' field: %m");
+
+        w = json_variant_by_key(v, "fido2-salt");
+        if (!w)
+                return -EINVAL;
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, &salt, &salt_size);
+        if (r < 0)
+                return crypt_log_error_errno(cd, r, "Failed to parse 'fido2-salt' field: %m");
+
+        w = json_variant_by_key(v, "fido2-rp");
+        if (w) {
+                /* The "rp" field is optional. */
+                rp = strdup(json_variant_string(w));
+                if (!rp) {
+                        crypt_log_error(cd, "Not enough memory.");
+                        return -ENOMEM;
+                }
+        }
+
+        w = json_variant_by_key(v, "fido2-clientPin-required");
+        if (w)
+                /* The "fido2-clientPin-required" field is optional. */
+                SET_FLAG(required, FIDO2ENROLL_PIN, json_variant_boolean(w));
+        else
+                required |= FIDO2ENROLL_PIN_IF_NEEDED; /* compat with 248, where the field was unset */
+
+        w = json_variant_by_key(v, "fido2-up-required");
+        if (w)
+                /* The "fido2-up-required" field is optional. */
+                SET_FLAG(required, FIDO2ENROLL_UP, json_variant_boolean(w));
+        else
+                required |= FIDO2ENROLL_UP_IF_NEEDED; /* compat with 248 */
+
+        w = json_variant_by_key(v, "fido2-uv-required");
+        if (w)
+                /* The "fido2-uv-required" field is optional. */
+                SET_FLAG(required, FIDO2ENROLL_UV, json_variant_boolean(w));
+        else
+                required |= FIDO2ENROLL_UV_OMIT; /* compat with 248 */
+
+        *ret_rp_id = TAKE_PTR(rp);
+        *ret_cid = TAKE_PTR(cid);
+        *ret_cid_size = cid_size;
+        *ret_salt = TAKE_PTR(salt);
+        *ret_salt_size = salt_size;
+        *ret_required = required;
+
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-fido2.h b/src/cryptsetup/cryptsetup-tokens/luks2-fido2.h
new file mode 100644
index 0000000..48416ec
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/luks2-fido2.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "libfido2-util.h"
+
+struct crypt_device;
+
+int acquire_luks2_key(
+                struct crypt_device *cd,
+                const char *json,
+                const char *device,
+                const char *pin,
+                char **ret_keyslot_passphrase,
+                size_t *ret_keyslot_passphrase_size);
+
+int parse_luks2_fido2_data(
+                struct crypt_device *cd,
+                const char *json,
+                char **ret_rp_id,
+                void **ret_salt,
+                size_t *ret_salt_size,
+                void **ret_cid,
+                size_t *ret_cid_size,
+                Fido2EnrollFlags *ret_required);
diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.c b/src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.c
new file mode 100644
index 0000000..178fc7a
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.c
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "cryptsetup-token-util.h"
+#include "escape.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "luks2-pkcs11.h"
+#include "memory-util.h"
+#include "pkcs11-util.h"
+#include "time-util.h"
+
+struct luks2_pkcs11_callback_data { /* userdata handed to luks2_pkcs11_callback() via pkcs11_find_token() */
+        struct crypt_device *cd; /* used for logging only */
+        const char *pin; /* PIN passed to pkcs11_token_login_by_pin(), not owned */
+        size_t pin_size;
+        void *encrypted_key; /* input for pkcs11_token_decrypt_data(), not freed by the release helper */
+        size_t encrypted_key_size;
+        void *decrypted_key; /* output of the callback; freed by luks2_pkcs11_callback_data_release() */
+        size_t decrypted_key_size;
+};
+
+static int luks2_pkcs11_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+        /* pkcs11_find_token() callback: logs in with the PIN from userdata, finds the private key, decrypts the key */
+        CK_OBJECT_HANDLE object;
+        CK_RV rv;
+        CK_TOKEN_INFO updated_token_info;
+        int r;
+        _cleanup_free_ char *token_label = NULL;
+        struct luks2_pkcs11_callback_data *data = ASSERT_PTR(userdata);
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+        assert(uri);
+
+        token_label = pkcs11_token_label(token_info);
+        if (!token_label)
+                return -ENOMEM;
+
+        /* Called for every token matching our URI */
+        r = pkcs11_token_login_by_pin(m, session, token_info, token_label, data->pin, data->pin_size);
+        if (r == -ENOLCK) {
+                /* Refresh the token info, so that we can prompt knowing the new flags if they changed. */
+                rv = m->C_GetTokenInfo(slot_id, &updated_token_info);
+                if (rv != CKR_OK) {
+                        crypt_log_error(data->cd,
+                                       "Failed to acquire updated security token information for slot %lu: %s",
+                                       slot_id, sym_p11_kit_strerror(rv));
+                        return -EIO;
+                }
+                token_info = &updated_token_info;
+                r = -ENOANO; /* handled like a missing PIN from here on, so the hints below are printed */
+        }
+
+        if (r == -ENOANO) {
+                if (FLAGS_SET(token_info->flags, CKF_USER_PIN_FINAL_TRY))
+                        crypt_log_error(data->cd, "Please enter correct PIN for security token "
+                                        "'%s' in order to unlock it (final try).", token_label);
+                else if (FLAGS_SET(token_info->flags, CKF_USER_PIN_COUNT_LOW))
+                        crypt_log_error(data->cd, "PIN has been entered incorrectly previously, "
+                                      "please enter correct PIN for security token '%s' in order to unlock it.",
+                                      token_label);
+        }
+
+        if (r == -EPERM) /* pin is locked, but map it to -ENOANO anyway */
+                r = -ENOANO;
+
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_find_private_key(m, session, uri, &object);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_decrypt_data(
+                        m,
+                        session,
+                        object,
+                        data->encrypted_key,
+                        data->encrypted_key_size,
+                        &data->decrypted_key,
+                        &data->decrypted_key_size);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static void luks2_pkcs11_callback_data_release(struct luks2_pkcs11_callback_data *data) {
+        erase_and_free(data->decrypted_key); /* only the decrypted key is released; the other pointers are left alone */
+}
+
+static int acquire_luks2_key_by_pin(
+                struct crypt_device *cd,
+                const char *pkcs11_uri,
+                const void *pin,
+                size_t pin_size,
+                void *encrypted_key,
+                size_t encrypted_key_size,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+        /* Non-interactive path: decrypts the key via pkcs11_find_token() using only the PIN passed in */
+        int r;
+        _cleanup_(luks2_pkcs11_callback_data_release) struct luks2_pkcs11_callback_data data = {
+                .cd = cd,
+                .pin = pin,
+                .pin_size = pin_size,
+                .encrypted_key = encrypted_key,
+                .encrypted_key_size = encrypted_key_size,
+        };
+
+        assert(pkcs11_uri);
+        assert(encrypted_key);
+        assert(ret_decrypted_key);
+        assert(ret_decrypted_key_size);
+
+        r = pkcs11_find_token(pkcs11_uri, luks2_pkcs11_callback, &data);
+        if (r < 0)
+                return r;
+
+        *ret_decrypted_key = TAKE_PTR(data.decrypted_key); /* presumably clears the field so the cleanup handler won't free what we return */
+        *ret_decrypted_key_size = data.decrypted_key_size;
+
+        return 0;
+}
+
+/* called from within systemd utilities */
+static int acquire_luks2_key_systemd(
+                const char *pkcs11_uri,
+                systemd_pkcs11_plugin_params *params,
+                void *encrypted_key,
+                size_t encrypted_key_size,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+        /* Copies the prompt settings from params and lets pkcs11_crypt_device_callback do the decryption */
+        int r;
+        _cleanup_(pkcs11_crypt_device_callback_data_release) pkcs11_crypt_device_callback_data data = {
+                .encrypted_key = encrypted_key,
+                .encrypted_key_size = encrypted_key_size,
+                .free_encrypted_key = false /* NOTE(review): presumably because the buffer stays owned by our caller — confirm */
+        };
+
+        assert(pkcs11_uri);
+        assert(encrypted_key);
+        assert(ret_decrypted_key);
+        assert(ret_decrypted_key_size);
+        assert(params);
+
+        data.friendly_name = params->friendly_name;
+        data.headless = params->headless;
+        data.askpw_flags = params->askpw_flags;
+        data.until = params->until;
+
+        /* The functions called here log about all errors, except for EAGAIN which means "token not found right now" */
+        r = pkcs11_find_token(pkcs11_uri, pkcs11_crypt_device_callback, &data);
+        if (r < 0)
+                return r;
+
+        *ret_decrypted_key = TAKE_PTR(data.decrypted_key);
+        *ret_decrypted_key_size = data.decrypted_key_size;
+
+        return 0;
+}
+
+int acquire_luks2_key(
+                struct crypt_device *cd,
+                const char *json,
+                void *userdata,
+                const void *pin,
+                size_t pin_size,
+                char **ret_password,
+                size_t *ret_password_size) {
+        /* Parses the PKCS#11 token JSON, decrypts the stored key on the token and returns it base64 encoded */
+        int r;
+        size_t decrypted_key_size, encrypted_key_size;
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+        _cleanup_free_ char *pkcs11_uri = NULL;
+        _cleanup_free_ void *encrypted_key = NULL;
+        systemd_pkcs11_plugin_params *pkcs11_params = userdata; /* NULL selects the plain by-PIN path below */
+        ssize_t base64_encoded_size;
+
+        assert(json);
+        assert(ret_password);
+        assert(ret_password_size);
+
+        r = parse_luks2_pkcs11_data(cd, json, &pkcs11_uri, &encrypted_key, &encrypted_key_size);
+        if (r < 0)
+                return r;
+
+        if (pkcs11_params && pin)
+                crypt_log_verbose(cd, "PIN parameter ignored in interactive mode.");
+
+        if (pkcs11_params) /* systemd based activation with interactive pin query callbacks */
+                r = acquire_luks2_key_systemd(
+                        pkcs11_uri,
+                        pkcs11_params,
+                        encrypted_key, encrypted_key_size,
+                        &decrypted_key, &decrypted_key_size);
+        else /* default activation that provides single PIN if needed */
+                r = acquire_luks2_key_by_pin(
+                        cd, pkcs11_uri, pin, pin_size,
+                        encrypted_key, encrypted_key_size,
+                        &decrypted_key, &decrypted_key_size);
+        if (r < 0)
+                return r;
+
+        base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return crypt_log_error_errno(cd, (int) base64_encoded_size, "Cannot base64 encode key: %m");
+
+        *ret_password = TAKE_PTR(base64_encoded);
+        *ret_password_size = base64_encoded_size;
+
+        return 0;
+}
+
+int parse_luks2_pkcs11_data(
+                struct crypt_device *cd,
+                const char *json,
+                char **ret_uri,
+                void **ret_encrypted_key,
+                size_t *ret_encrypted_key_size) {
+        /* Extracts 'pkcs11-uri' and the base64 decoded 'pkcs11-key'; returns -EINVAL if either field is missing */
+        int r;
+        size_t key_size;
+        _cleanup_free_ char *uri = NULL;
+        _cleanup_free_ void *key = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        JsonVariant *w;
+
+        assert(json);
+        assert(ret_uri);
+        assert(ret_encrypted_key);
+        assert(ret_encrypted_key_size);
+
+        r = json_parse(json, 0, &v, NULL, NULL);
+        if (r < 0)
+                return r; /* note: unlike the base64 failure below, this error is not logged here */
+
+        w = json_variant_by_key(v, "pkcs11-uri");
+        if (!w)
+                return -EINVAL;
+
+        uri = strdup(json_variant_string(w)); /* field type is not checked here; NOTE(review): relies on prior validation — confirm */
+        if (!uri)
+                return -ENOMEM;
+
+        w = json_variant_by_key(v, "pkcs11-key");
+        if (!w)
+                return -EINVAL;
+
+        r = unbase64mem(json_variant_string(w), SIZE_MAX, &key, &key_size);
+        if (r < 0)
+                return crypt_log_debug_errno(cd, r, "Failed to decode base64 encoded key: %m.");
+
+        *ret_uri = TAKE_PTR(uri);
+        *ret_encrypted_key = TAKE_PTR(key);
+        *ret_encrypted_key_size = key_size;
+
+        return 0;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.h b/src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.h
new file mode 100644
index 0000000..41ce9f0
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/luks2-pkcs11.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+struct crypt_device;
+
+int acquire_luks2_key(
+                struct crypt_device *cd,
+                const char *json,
+                void *userdata,
+                const void *pin,
+                size_t pin_size,
+                char **password,
+                size_t *password_size);
+
+int parse_luks2_pkcs11_data(
+                struct crypt_device *cd,
+                const char *json,
+                char **ret_uri,
+                void **ret_encrypted_key,
+                size_t *ret_encrypted_key_size);
diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c
new file mode 100644
index 0000000..72be5cc
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.c
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "env-util.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "log.h"
+#include "luks2-tpm2.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "sha256.h"
+#include "strv.h"
+#include "tpm2-util.h"
+
+int acquire_luks2_key(
+                const char *device,
+                uint32_t hash_pcr_mask,
+                uint16_t pcr_bank,
+                const void *pubkey,
+                size_t pubkey_size,
+                uint32_t pubkey_pcr_mask,
+                const char *signature_path,
+                const char *pin,
+                const char *pcrlock_path,
+                uint16_t primary_alg,
+                const void *key_data,
+                size_t key_data_size,
+                const void *policy_hash,
+                size_t policy_hash_size,
+                const void *salt,
+                size_t salt_size,
+                const void *srk_buf,
+                size_t srk_buf_size,
+                TPM2Flags flags,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+        /* Prepares device, PIN, PCR signature and pcrlock policy as needed, then unseals the key via tpm2_unseal() */
+        _cleanup_(json_variant_unrefp) JsonVariant *signature_json = NULL;
+        _cleanup_free_ char *auto_device = NULL;
+        _cleanup_(erase_and_freep) char *b64_salted_pin = NULL;
+        int r;
+
+        assert(salt || salt_size == 0);
+        assert(ret_decrypted_key);
+        assert(ret_decrypted_key_size);
+
+        if (!device) {
+                r = tpm2_find_device_auto(&auto_device);
+                if (r == -ENODEV)
+                        return -EAGAIN; /* Tell the caller to wait for a TPM2 device to show up */
+                if (r < 0)
+                        return log_error_errno(r, "Could not find TPM2 device: %m");
+
+                device = auto_device;
+        }
+
+        if ((flags & TPM2_FLAGS_USE_PIN) && !pin) /* enrollment requires a PIN but none was supplied */
+                return -ENOANO;
+        /* With a salt present the PIN is not used directly: it is run through PBKDF2 and the digest base64 encoded */
+        if (pin && salt_size > 0) {
+                uint8_t salted_pin[SHA256_DIGEST_SIZE] = {};
+                CLEANUP_ERASE(salted_pin);
+                r = tpm2_util_pbkdf2_hmac_sha256(pin, strlen(pin), salt, salt_size, salted_pin);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to perform PBKDF2: %m");
+
+                r = base64mem(salted_pin, sizeof(salted_pin), &b64_salted_pin);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to base64 encode salted pin: %m");
+                pin = b64_salted_pin;
+        }
+        /* A PCR signature is only loaded when the public key PCR mask is non-zero */
+        if (pubkey_pcr_mask != 0) {
+                r = tpm2_load_pcr_signature(signature_path, &signature_json);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to load PCR signature: %m");
+        }
+
+        _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy pcrlock_policy = {};
+        if (FLAGS_SET(flags, TPM2_FLAGS_USE_PCRLOCK)) {
+                r = tpm2_pcrlock_policy_load(pcrlock_path, &pcrlock_policy);
+                if (r < 0)
+                        return r;
+        }
+
+        _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL;
+        r = tpm2_context_new(device, &tpm2_context);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create TPM2 context: %m");
+
+        r = tpm2_unseal(tpm2_context,
+                        hash_pcr_mask,
+                        pcr_bank,
+                        pubkey, pubkey_size,
+                        pubkey_pcr_mask,
+                        signature_json,
+                        pin,
+                        FLAGS_SET(flags, TPM2_FLAGS_USE_PCRLOCK) ? &pcrlock_policy : NULL,
+                        primary_alg,
+                        key_data, key_data_size,
+                        policy_hash, policy_hash_size,
+                        srk_buf, srk_buf_size,
+                        ret_decrypted_key, ret_decrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unseal secret using TPM2: %m");
+
+        return r;
+}
diff --git a/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h
new file mode 100644
index 0000000..d84e5a3
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/luks2-tpm2.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include "tpm2-util.h"
+
+struct crypt_device;
+
+int acquire_luks2_key( /* parameter names and order mirror the definition in luks2-tpm2.c */
+                const char *device,
+                uint32_t hash_pcr_mask,
+                uint16_t pcr_bank,
+                const void *pubkey,
+                size_t pubkey_size,
+                uint32_t pubkey_pcr_mask,
+                const char *signature_path,
+                const char *pin, /* NB: the PIN comes before the pcrlock path */
+                const char *pcrlock_path,
+                uint16_t primary_alg,
+                const void *key_data,
+                size_t key_data_size,
+                const void *policy_hash,
+                size_t policy_hash_size,
+                const void *salt,
+                size_t salt_size,
+                const void *srk_buf,
+                size_t srk_buf_size,
+                TPM2Flags flags,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size);
diff --git a/src/cryptsetup/cryptsetup-tokens/meson.build b/src/cryptsetup/cryptsetup-tokens/meson.build
new file mode 100644
index 0000000..b26940c
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tokens/meson.build
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+lib_cryptsetup_token_common = static_library(  # helper code linked into every token module through 'template' below
+        'cryptsetup-token-common',
+        'cryptsetup-token-util.c',
+        include_directories : includes,
+        dependencies : userspace,
+        link_with : libshared,
+        build_by_default : false)
+
+cryptsetup_token_systemd_tpm2_sources = files(  # per-plugin source lists, consumed by the 'modules' entries below
+        'cryptsetup-token-systemd-tpm2.c',
+        'luks2-tpm2.c',
+)
+
+cryptsetup_token_systemd_fido2_sources = files(
+        'cryptsetup-token-systemd-fido2.c',
+        'luks2-fido2.c',
+)
+
+cryptsetup_token_systemd_pkcs11_sources = files(
+        'cryptsetup-token-systemd-pkcs11.c',
+        'luks2-pkcs11.c',
+)
+
+template = {  # settings shared by all three token modules
+        'include_directories' : includes,
+        'link_with' : [
+                lib_cryptsetup_token_common,
+                libshared,
+        ],
+        'version-script' : meson.current_source_dir() / 'cryptsetup-token.sym',
+        'install_rpath' : pkglibdir,
+        'install' : true,
+        'install_dir' : libcryptsetup_plugins_dir,
+}
+
+modules += [  # one entry per token type; presumably each is built only if all its 'conditions' hold (confirm in the top-level build file)
+        template + {
+                'name' : 'cryptsetup-token-systemd-tpm2',
+                'conditions' : [
+                        'HAVE_LIBCRYPTSETUP_PLUGINS',
+                        'HAVE_TPM2',
+                ],
+                'sources' : cryptsetup_token_systemd_tpm2_sources,
+                'dependencies' : [
+                        libcryptsetup,
+                        tpm2,
+                ],
+        },
+        template + {
+                'name' : 'cryptsetup-token-systemd-fido2',
+                'conditions' : [
+                        'HAVE_LIBCRYPTSETUP_PLUGINS',
+                        'HAVE_LIBFIDO2',
+                ],
+                'sources' : cryptsetup_token_systemd_fido2_sources,
+                'dependencies' : [
+                        libcryptsetup,
+                        libfido2,
+                ],
+        },
+        template + {
+                'name' : 'cryptsetup-token-systemd-pkcs11',
+                'conditions' : [
+                        'HAVE_LIBCRYPTSETUP_PLUGINS',
+                        'HAVE_P11KIT',
+                ],
+                'sources' : cryptsetup_token_systemd_pkcs11_sources,
+                'dependencies' : [
+                        libcryptsetup,
+                        libp11kit_cflags,  # NOTE(review): only the cflags object is listed here, unlike libfido2/tpm2 above
+                ],
+        },
+]
diff --git a/src/cryptsetup/cryptsetup-tpm2.c b/src/cryptsetup/cryptsetup-tpm2.c
new file mode 100644
index 0000000..f59d5f9
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tpm2.c
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "cryptsetup-tpm2.h"
+#include "env-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "sha256.h"
+#include "tpm2-util.h"
+
+static int get_pin(usec_t until, AskPasswordFlags ask_password_flags, bool headless, char **ret_pin_str) {
+        _cleanup_(erase_and_freep) char *pin_str = NULL;
+        _cleanup_strv_free_erase_ char **pin = NULL;
+        int r;
+        /* Obtains the TPM2 PIN: taken from $PIN in the environment if available, otherwise asked for interactively (refused when headless). */
+        assert(ret_pin_str);
+
+        r = getenv_steal_erase("PIN", &pin_str);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire PIN from environment: %m");
+        if (!r) { /* presumably 0 means $PIN was not set — confirm against getenv_steal_erase() */
+                if (headless)
+                        return log_error_errno(
+                                        SYNTHETIC_ERRNO(ENOPKG),
+                                        "PIN querying disabled via 'headless' option. "
+                                        "Use the '$PIN' environment variable.");
+
+                pin = strv_free_erase(pin); /* pin is still NULL at this point, so this looks like a no-op */
+                r = ask_password_auto(
+                                "Please enter TPM2 PIN:",
+                                "drive-harddisk",
+                                NULL,
+                                "tpm2-pin",
+                                "cryptsetup.tpm2-pin",
+                                until,
+                                ask_password_flags,
+                                &pin);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to ask for user pin: %m");
+                assert(strv_length(pin) == 1); /* exactly one answer is expected back */
+
+                pin_str = strdup(pin[0]); /* private copy; the strv itself is erased and freed on scope exit */
+                if (!pin_str)
+                        return log_oom();
+        }
+
+        *ret_pin_str = TAKE_PTR(pin_str); /* ownership moves to the caller */
+
+        return r; /* non-negative result of whichever call supplied the PIN */
+}
+
+int acquire_tpm2_key(
+                const char *volume_name,
+                const char *device,
+                uint32_t hash_pcr_mask,
+                uint16_t pcr_bank,
+                const void *pubkey,
+                size_t pubkey_size,
+                uint32_t pubkey_pcr_mask,
+                const char *signature_path,
+                const char *pcrlock_path,
+                uint16_t primary_alg,
+                const char *key_file,
+                size_t key_file_size,
+                uint64_t key_file_offset,
+                const void *key_data,
+                size_t key_data_size,
+                const void *policy_hash,
+                size_t policy_hash_size,
+                const void *salt,
+                size_t salt_size,
+                const void *srk_buf,
+                size_t srk_buf_size,
+                TPM2Flags flags,
+                usec_t until,
+                bool headless,
+                AskPasswordFlags ask_password_flags,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+        /* Unseals a key with the TPM2. The sealed blob is key_data or, if that is NULL, is read from key_file. With TPM2_FLAGS_USE_PIN a PIN is queried, up to 5 times. */
+        _cleanup_(json_variant_unrefp) JsonVariant *signature_json = NULL;
+        _cleanup_free_ void *loaded_blob = NULL;
+        _cleanup_free_ char *auto_device = NULL;
+        size_t blob_size;
+        const void *blob;
+        int r;
+
+        assert(salt || salt_size == 0);
+
+        if (!device) { /* no device given by the caller: try to auto-detect one */
+                r = tpm2_find_device_auto(&auto_device);
+                if (r == -ENODEV)
+                        return -EAGAIN; /* Tell the caller to wait for a TPM2 device to show up */
+                if (r < 0)
+                        return log_error_errno(r, "Could not find TPM2 device: %m");
+
+                device = auto_device;
+        }
+
+        if (key_data) {
+                blob = key_data;
+                blob_size = key_data_size;
+        } else {
+                _cleanup_free_ char *bindname = NULL;
+
+                /* If we read the salt via AF_UNIX, make this client recognizable */
+                if (asprintf(&bindname, "@%" PRIx64"/cryptsetup-tpm2/%s", random_u64(), volume_name) < 0)
+                        return log_oom();
+
+                r = read_full_file_full(
+                                AT_FDCWD, key_file,
+                                key_file_offset == 0 ? UINT64_MAX : key_file_offset, /* 0 means "no offset configured" */
+                                key_file_size == 0 ? SIZE_MAX : key_file_size, /* 0 means "no size limit configured" */
+                                READ_FULL_FILE_CONNECT_SOCKET,
+                                bindname,
+                                (char**) &loaded_blob, &blob_size);
+                if (r < 0)
+                        return r;
+
+                blob = loaded_blob;
+        }
+
+        if (pubkey_pcr_mask != 0) { /* the signature file is only loaded when pubkey_pcr_mask is non-zero */
+                r = tpm2_load_pcr_signature(signature_path, &signature_json);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to load pcr signature: %m");
+        }
+
+        _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy pcrlock_policy = {};
+
+        if (FLAGS_SET(flags, TPM2_FLAGS_USE_PCRLOCK)) { /* pcrlock_policy stays zero-initialized unless this flag is set */
+                r = tpm2_pcrlock_policy_load(pcrlock_path, &pcrlock_policy);
+                if (r < 0)
+                        return r;
+        }
+
+        _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL;
+        r = tpm2_context_new(device, &tpm2_context);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create TPM2 context: %m");
+
+        if (!(flags & TPM2_FLAGS_USE_PIN)) { /* no PIN involved: a single unseal attempt */
+                r = tpm2_unseal(tpm2_context,
+                                hash_pcr_mask,
+                                pcr_bank,
+                                pubkey, pubkey_size,
+                                pubkey_pcr_mask,
+                                signature_json,
+                                /* pin= */ NULL,
+                                FLAGS_SET(flags, TPM2_FLAGS_USE_PCRLOCK) ? &pcrlock_policy : NULL,
+                                primary_alg,
+                                blob,
+                                blob_size,
+                                policy_hash,
+                                policy_hash_size,
+                                srk_buf,
+                                srk_buf_size,
+                                ret_decrypted_key,
+                                ret_decrypted_key_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to unseal secret using TPM2: %m");
+
+                return r;
+        }
+
+        for (int i = 5;; i--) { /* PIN path: at most 5 attempts */
+                _cleanup_(erase_and_freep) char *pin_str = NULL, *b64_salted_pin = NULL;
+
+                if (i <= 0)
+                        return -EACCES;
+
+                r = get_pin(until, ask_password_flags, headless, &pin_str);
+                if (r < 0)
+                        return r;
+
+                if (salt_size > 0) {
+                        uint8_t salted_pin[SHA256_DIGEST_SIZE] = {};
+                        CLEANUP_ERASE(salted_pin);
+
+                        r = tpm2_util_pbkdf2_hmac_sha256(pin_str, strlen(pin_str), salt, salt_size, salted_pin);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to perform PBKDF2: %m");
+
+                        r = base64mem(salted_pin, sizeof(salted_pin), &b64_salted_pin);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to base64 encode salted pin: %m");
+                } else
+                        /* no salting needed, backwards compat with non-salted pins */
+                        b64_salted_pin = TAKE_PTR(pin_str);
+
+                r = tpm2_unseal(tpm2_context,
+                                hash_pcr_mask,
+                                pcr_bank,
+                                pubkey, pubkey_size,
+                                pubkey_pcr_mask,
+                                signature_json,
+                                b64_salted_pin,
+                                FLAGS_SET(flags, TPM2_FLAGS_USE_PCRLOCK) ? &pcrlock_policy : NULL, /* keyed on the flag that gates loading, not on pcrlock_path */
+                                primary_alg,
+                                blob,
+                                blob_size,
+                                policy_hash,
+                                policy_hash_size,
+                                srk_buf,
+                                srk_buf_size,
+                                ret_decrypted_key,
+                                ret_decrypted_key_size);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to unseal secret using TPM2: %m");
+
+                        /* We get this error in case there is an authentication policy mismatch. This should
+                         * not happen, but this avoids confusing behavior, just in case. */
+                        if (!IN_SET(r, -EPERM, -ENOLCK))
+                                continue; /* any other failure: ask for the PIN again */
+                }
+
+                return r;
+        }
+}
+
+int find_tpm2_auto_data(
+                struct crypt_device *cd,
+                uint32_t search_pcr_mask,
+                int start_token,
+                uint32_t *ret_hash_pcr_mask,
+                uint16_t *ret_pcr_bank,
+                void **ret_pubkey,
+                size_t *ret_pubkey_size,
+                uint32_t *ret_pubkey_pcr_mask,
+                uint16_t *ret_primary_alg,
+                void **ret_blob,
+                size_t *ret_blob_size,
+                void **ret_policy_hash,
+                size_t *ret_policy_hash_size,
+                void **ret_salt,
+                size_t *ret_salt_size,
+                void **ret_srk_buf,
+                size_t *ret_srk_buf_size,
+                TPM2Flags *ret_flags,
+                int *ret_keyslot,
+                int *ret_token) {
+        /* Scans token slots from start_token on for a "systemd-tpm2" token whose PCR mask equals search_pcr_mask (UINT32_MAX accepts any) and returns its parsed fields. */
+        int r, token;
+
+        assert(cd);
+
+        for (token = start_token; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                _cleanup_free_ void *blob = NULL, *policy_hash = NULL, *pubkey = NULL, *salt = NULL, *srk_buf = NULL;
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                size_t blob_size, policy_hash_size, pubkey_size, salt_size = 0, srk_buf_size = 0;
+                uint32_t hash_pcr_mask, pubkey_pcr_mask;
+                uint16_t pcr_bank, primary_alg;
+                TPM2Flags flags;
+                int keyslot;
+
+                r = cryptsetup_get_token_as_json(cd, token, "systemd-tpm2", &v);
+                if (IN_SET(r, -ENOENT, -EINVAL, -EMEDIUMTYPE)) /* presumably: slot unused, unparsable, or a different token type — confirm against the helper */
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read JSON token data off disk: %m");
+
+                r = tpm2_parse_luks2_json(
+                                v,
+                                &keyslot,
+                                &hash_pcr_mask,
+                                &pcr_bank,
+                                &pubkey, &pubkey_size,
+                                &pubkey_pcr_mask,
+                                &primary_alg,
+                                &blob, &blob_size,
+                                &policy_hash, &policy_hash_size,
+                                &salt, &salt_size,
+                                &srk_buf, &srk_buf_size,
+                                &flags);
+                if (r == -EUCLEAN) /* Gracefully handle issues in JSON fields not owned by us */
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse TPM2 JSON data: %m");
+
+                if (search_pcr_mask == UINT32_MAX ||
+                    search_pcr_mask == hash_pcr_mask) {
+
+                        if (start_token <= 0) /* only announce the discovery when scanning from the first token */
+                                log_info("Automatically discovered security TPM2 token unlocks volume.");
+
+                        *ret_hash_pcr_mask = hash_pcr_mask;
+                        *ret_pcr_bank = pcr_bank;
+                        *ret_pubkey = TAKE_PTR(pubkey); /* TAKE_PTR: buffer ownership moves to the caller */
+                        *ret_pubkey_size = pubkey_size;
+                        *ret_pubkey_pcr_mask = pubkey_pcr_mask;
+                        *ret_primary_alg = primary_alg;
+                        *ret_blob = TAKE_PTR(blob);
+                        *ret_blob_size = blob_size;
+                        *ret_policy_hash = TAKE_PTR(policy_hash);
+                        *ret_policy_hash_size = policy_hash_size;
+                        *ret_salt = TAKE_PTR(salt);
+                        *ret_salt_size = salt_size;
+                        *ret_keyslot = keyslot;
+                        *ret_token = token; /* lets the caller resume the scan after this token */
+                        *ret_srk_buf = TAKE_PTR(srk_buf);
+                        *ret_srk_buf_size = srk_buf_size;
+                        *ret_flags = flags;
+                        return 0;
+                }
+
+                /* PCR mask doesn't match what is configured, ignore this entry, let's see next */
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "No valid TPM2 token data found."); /* no token from start_token onwards matched */
+}
diff --git a/src/cryptsetup/cryptsetup-tpm2.h b/src/cryptsetup/cryptsetup-tpm2.h
new file mode 100644
index 0000000..a50a943
--- /dev/null
+++ b/src/cryptsetup/cryptsetup-tpm2.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "ask-password-api.h"
+#include "cryptsetup-util.h"
+#include "log.h"
+#include "time-util.h"
+#include "tpm2-util.h"
+
+#if HAVE_TPM2
+
+int acquire_tpm2_key( /* parameter names mirror the definition in cryptsetup-tpm2.c */
+                const char *volume_name,
+                const char *device,
+                uint32_t hash_pcr_mask,
+                uint16_t pcr_bank,
+                const void *pubkey,
+                size_t pubkey_size,
+                uint32_t pubkey_pcr_mask,
+                const char *signature_path,
+                const char *pcrlock_path,
+                uint16_t primary_alg,
+                const char *key_file,
+                size_t key_file_size,
+                uint64_t key_file_offset,
+                const void *key_data,
+                size_t key_data_size,
+                const void *policy_hash,
+                size_t policy_hash_size,
+                const void *salt,
+                size_t salt_size,
+                const void *srk_buf,
+                size_t srk_buf_size, /* size of srk_buf */
+                TPM2Flags flags,
+                usec_t until,
+                bool headless,
+                AskPasswordFlags ask_password_flags,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size);
+
+int find_tpm2_auto_data( /* parameter names mirror the definition in cryptsetup-tpm2.c */
+                struct crypt_device *cd,
+                uint32_t search_pcr_mask,
+                int start_token,
+                uint32_t *ret_hash_pcr_mask,
+                uint16_t *ret_pcr_bank,
+                void **ret_pubkey,
+                size_t *ret_pubkey_size,
+                uint32_t *ret_pubkey_pcr_mask,
+                uint16_t *ret_primary_alg,
+                void **ret_blob,
+                size_t *ret_blob_size,
+                void **ret_policy_hash,
+                size_t *ret_policy_hash_size,
+                void **ret_salt,
+                size_t *ret_salt_size,
+                void **ret_srk_buf,
+                size_t *ret_srk_buf_size, /* size of *ret_srk_buf */
+                TPM2Flags *ret_flags,
+                int *ret_keyslot,
+                int *ret_token);
+
+#else
+
+static inline int acquire_tpm2_key(
+                const char *volume_name,
+                const char *device,
+                uint32_t hash_pcr_mask,
+                uint16_t pcr_bank,
+                const void *pubkey,
+                size_t pubkey_size,
+                uint32_t pubkey_pcr_mask,
+                const char *signature_path,
+                const char *pcrlock_path,
+                uint16_t primary_alg,
+                const char *key_file,
+                size_t key_file_size,
+                uint64_t key_file_offset,
+                const void *key_data,
+                size_t key_data_size,
+                const void *policy_hash,
+                size_t policy_hash_size,
+                const void *salt,
+                size_t salt_size,
+                const void *srk_buf,
+                size_t srk_buf_size,
+                TPM2Flags flags,
+                usec_t until,
+                bool headless,
+                AskPasswordFlags ask_password_flags,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size) {
+        /* Fallback for builds where HAVE_TPM2 is off: ignores every argument and fails with EOPNOTSUPP. */
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "TPM2 support not available.");
+}
+
+static inline int find_tpm2_auto_data(
+                struct crypt_device *cd,
+                uint32_t search_pcr_mask,
+                int start_token,
+                uint32_t *ret_hash_pcr_mask,
+                uint16_t *ret_pcr_bank,
+                void **ret_pubkey,
+                size_t *ret_pubkey_size,
+                uint32_t *ret_pubkey_pcr_mask,
+                uint16_t *ret_primary_alg,
+                void **ret_blob,
+                size_t *ret_blob_size,
+                void **ret_policy_hash,
+                size_t *ret_policy_hash_size,
+                void **ret_salt,
+                size_t *ret_salt_size,
+                void **ret_srk_buf,
+                size_t *ret_srk_buf_size,
+                TPM2Flags *ret_flags,
+                int *ret_keyslot,
+                int *ret_token) {
+        /* Fallback for builds where HAVE_TPM2 is off: leaves all ret_* outputs untouched and fails with EOPNOTSUPP. */
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "TPM2 support not available.");
+}
+
+#endif
diff --git a/src/cryptsetup/cryptsetup.c b/src/cryptsetup/cryptsetup.c
new file mode 100644
index 0000000..b56b51a
--- /dev/null
+++ b/src/cryptsetup/cryptsetup.c
@@ -0,0 +1,2423 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-device.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "build.h"
+#include "cryptsetup-fido2.h"
+#include "cryptsetup-keyfile.h"
+#include "cryptsetup-pkcs11.h"
+#include "cryptsetup-tpm2.h"
+#include "cryptsetup-util.h"
+#include "device-util.h"
+#include "efi-api.h"
+#include "efi-loader.h"
+#include "env-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "fstab-util.h"
+#include "hexdecoct.h"
+#include "libfido2-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "mount-util.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pkcs11-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "tpm2-pcr.h"
+#include "tpm2-util.h"
+
+/* internal helper */
+#define ANY_LUKS "LUKS"
+/* as in src/cryptsetup.h */
+#define CRYPT_SECTOR_SIZE 512U
+#define CRYPT_MAX_SECTOR_SIZE 4096U
+
+typedef enum PassphraseType { /* REGULAR and RECOVERY_KEY are single bits; BOTH is their OR */
+        PASSPHRASE_NONE, /* 0: neither bit set */
+        PASSPHRASE_REGULAR = 1 << 0,
+        PASSPHRASE_RECOVERY_KEY = 1 << 1,
+        PASSPHRASE_BOTH = PASSPHRASE_REGULAR|PASSPHRASE_RECOVERY_KEY,
+        _PASSPHRASE_TYPE_MAX, /* one past PASSPHRASE_BOTH (i.e. 4); used as the size of passphrase_type_table */
+        _PASSPHRASE_TYPE_INVALID = -1,
+} PassphraseType;
+
+static const char *arg_type = NULL; /* ANY_LUKS, CRYPT_LUKS1, CRYPT_LUKS2, CRYPT_TCRYPT, CRYPT_BITLK or CRYPT_PLAIN */
+static char *arg_cipher = NULL; /* cipher= */
+static unsigned arg_key_size = 0; /* in bytes: size= is given in bits and divided by 8 when parsed */
+static unsigned arg_sector_size = CRYPT_SECTOR_SIZE; /* sector-size= */
+static int arg_key_slot = CRYPT_ANY_SLOT; /* key-slot= or keyslot= */
+static unsigned arg_keyfile_size = 0; /* keyfile-size= */
+static uint64_t arg_keyfile_offset = 0; /* keyfile-offset= */
+static bool arg_keyfile_erase = false; /* keyfile-erase or keyfile-erase=<bool> */
+static bool arg_try_empty_password = false;
+static char *arg_hash = NULL; /* hash= */
+static char *arg_header = NULL; /* header=; must be an absolute path and may be given only once */
+static unsigned arg_tries = 3; /* tries= */
+static bool arg_readonly = false; /* readonly or read-only */
+static bool arg_verify = false; /* verify */
+static AskPasswordFlags arg_ask_password_flags = 0; /* adjusted by password-echo= */
+static bool arg_discards = false;
+static bool arg_same_cpu_crypt = false;
+static bool arg_submit_from_crypt_cpus = false;
+static bool arg_no_read_workqueue = false;
+static bool arg_no_write_workqueue = false;
+static bool arg_tcrypt_hidden = false;
+static bool arg_tcrypt_system = false;
+static bool arg_tcrypt_veracrypt = false;
+static uint32_t arg_tcrypt_veracrypt_pim = 0;
+static char **arg_tcrypt_keyfiles = NULL; /* one entry per tcrypt-keyfile= option with an absolute path */
+static uint64_t arg_offset = 0;
+static uint64_t arg_skip = 0;
+static usec_t arg_timeout = USEC_INFINITY;
+static char *arg_pkcs11_uri = NULL;
+static bool arg_pkcs11_uri_auto = false;
+static char *arg_fido2_device = NULL;
+static bool arg_fido2_device_auto = false;
+static void *arg_fido2_cid = NULL;
+static size_t arg_fido2_cid_size = 0;
+static char *arg_fido2_rp_id = NULL;
+static char *arg_tpm2_device = NULL; /* These and the following fields are about locking an encrypted volume to the local TPM */
+static bool arg_tpm2_device_auto = false;
+static uint32_t arg_tpm2_pcr_mask = UINT32_MAX;
+static char *arg_tpm2_signature = NULL;
+static bool arg_tpm2_pin = false;
+static char *arg_tpm2_pcrlock = NULL;
+static bool arg_headless = false;
+static usec_t arg_token_timeout_usec = 30*USEC_PER_SEC;
+static unsigned arg_tpm2_measure_pcr = UINT_MAX; /* This and the following field is about measuring the unlocked volume key to the local TPM */
+static char **arg_tpm2_measure_banks = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_cipher, freep); /* presumably arranges for these heap-allocated option values to be released at exit — confirm against the macro definition */
+STATIC_DESTRUCTOR_REGISTER(arg_hash, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_header, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tcrypt_keyfiles, strv_freep); /* string vectors use strv_freep rather than freep */
+STATIC_DESTRUCTOR_REGISTER(arg_pkcs11_uri, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_fido2_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_fido2_cid, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_fido2_rp_id, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_signature, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_measure_banks, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_tpm2_pcrlock, freep);
+
+static const char* const passphrase_type_table[_PASSPHRASE_TYPE_MAX] = { /* indexed by PassphraseType; PASSPHRASE_NONE (index 0) has no entry and stays NULL */
+        [PASSPHRASE_REGULAR] = "passphrase",
+        [PASSPHRASE_RECOVERY_KEY] = "recovery key",
+        [PASSPHRASE_BOTH] = "passphrase or recovery key",
+};
+
+const char* passphrase_type_to_string(PassphraseType t); /* presumably prototypes for what the macro below generates — confirm in string-table.h */
+PassphraseType passphrase_type_from_string(const char *s);
+
+DEFINE_STRING_TABLE_LOOKUP(passphrase_type, PassphraseType);
+
+/* Options Debian's crypttab knows that we don't:
+    check=
+    checkargs=
+    noearly
+    loud
+    quiet
+    keyscript=
+    initramfs
+*/
+
+static int parse_one_option(const char *option) {
+        const char *val;
+        int r;
+
+        assert(option);
+
+        /* Handled outside of this tool */
+        if (STR_IN_SET(option, "noauto", "auto", "nofail", "fail", "_netdev", "keyfile-timeout"))
+                return 0;
+
+        if (startswith(option, "keyfile-timeout="))
+                return 0;
+
+        if ((val = startswith(option, "cipher="))) {
+                r = free_and_strdup(&arg_cipher, val);
+                if (r < 0)
+                        return log_oom();
+
+        } else if ((val = startswith(option, "size="))) {
+
+                r = safe_atou(val, &arg_key_size);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+
+                if (arg_key_size % 8) {
+                        log_warning("size= not a multiple of 8, ignoring.");
+                        return 0;
+                }
+
+                arg_key_size /= 8;
+
+        } else if ((val = startswith(option, "sector-size="))) {
+
+                r = safe_atou(val, &arg_sector_size);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+
+                if (arg_sector_size % 2) {
+                        log_warning("sector-size= not a multiple of 2, ignoring.");
+                        return 0;
+                }
+
+                if (arg_sector_size < CRYPT_SECTOR_SIZE || arg_sector_size > CRYPT_MAX_SECTOR_SIZE)
+                        log_warning("sector-size= is outside of %u and %u, ignoring.", CRYPT_SECTOR_SIZE, CRYPT_MAX_SECTOR_SIZE);
+
+        } else if ((val = startswith(option, "key-slot=")) ||
+                   (val = startswith(option, "keyslot="))) {
+
+                arg_type = ANY_LUKS;
+                r = safe_atoi(val, &arg_key_slot);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+
+        } else if ((val = startswith(option, "tcrypt-keyfile="))) {
+
+                arg_type = CRYPT_TCRYPT;
+                if (path_is_absolute(val)) {
+                        if (strv_extend(&arg_tcrypt_keyfiles, val) < 0)
+                                return log_oom();
+                } else
+                        log_warning("Key file path \"%s\" is not absolute, ignoring.", val);
+
+        } else if ((val = startswith(option, "keyfile-size="))) {
+
+                r = safe_atou(val, &arg_keyfile_size);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+
+        } else if ((val = startswith(option, "keyfile-offset="))) {
+
+                r = safe_atou64(val, &arg_keyfile_offset);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+
+        } else if ((val = startswith(option, "keyfile-erase="))) {
+
+                r = parse_boolean(val);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+
+                arg_keyfile_erase = r;
+
+        } else if (streq(option, "keyfile-erase"))
+                arg_keyfile_erase = true;
+
+        else if ((val = startswith(option, "hash="))) {
+                r = free_and_strdup(&arg_hash, val);
+                if (r < 0)
+                        return log_oom();
+
+        } else if ((val = startswith(option, "header="))) {
+                if (!arg_type || !STR_IN_SET(arg_type, ANY_LUKS, CRYPT_LUKS1, CRYPT_LUKS2, CRYPT_TCRYPT))
+                        arg_type = ANY_LUKS;
+
+                if (!path_is_absolute(val))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Header path \"%s\" is not absolute, refusing.", val);
+
+                if (arg_header)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Duplicate header= option, refusing.");
+
+                arg_header = strdup(val);
+                if (!arg_header)
+                        return log_oom();
+
+        } else if ((val = startswith(option, "tries="))) {
+
+                r = safe_atou(val, &arg_tries);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+
+        } else if (STR_IN_SET(option, "readonly", "read-only"))
+                arg_readonly = true;
+        else if (streq(option, "verify"))
+                arg_verify = true;
+        else if ((val = startswith(option, "password-echo="))) {
+                if (streq(val, "masked"))
+                        arg_ask_password_flags &= ~(ASK_PASSWORD_ECHO|ASK_PASSWORD_SILENT);
+                else {
+                        r = parse_boolean(val);
+                        if (r < 0) {
+                                log_warning_errno(r, "Invalid password-echo= option \"%s\", ignoring.", val);
+                                return 0;
+                        }
+
+                        SET_FLAG(arg_ask_password_flags, ASK_PASSWORD_ECHO, r);
+                        SET_FLAG(arg_ask_password_flags, ASK_PASSWORD_SILENT, !r);
+                }
+        } else if (STR_IN_SET(option, "allow-discards", "discard"))
+                arg_discards = true;
+        else if (streq(option, "same-cpu-crypt"))
+                arg_same_cpu_crypt = true;
+        else if (streq(option, "submit-from-crypt-cpus"))
+                arg_submit_from_crypt_cpus = true;
+        else if (streq(option, "no-read-workqueue"))
+                arg_no_read_workqueue = true;
+        else if (streq(option, "no-write-workqueue"))
+                arg_no_write_workqueue = true;
+        else if (streq(option, "luks"))
+                arg_type = ANY_LUKS;
+/* since cryptsetup 2.3.0 (Feb 2020) */
+#ifdef CRYPT_BITLK
+        else if (streq(option, "bitlk"))
+                arg_type = CRYPT_BITLK;
+#endif
+        else if (streq(option, "tcrypt"))
+                arg_type = CRYPT_TCRYPT;
+        else if (STR_IN_SET(option, "tcrypt-hidden", "tcrypthidden")) {
+                arg_type = CRYPT_TCRYPT;
+                arg_tcrypt_hidden = true;
+        } else if (streq(option, "tcrypt-system")) {
+                arg_type = CRYPT_TCRYPT;
+                arg_tcrypt_system = true;
+        } else if (STR_IN_SET(option, "tcrypt-veracrypt", "veracrypt")) {
+                arg_type = CRYPT_TCRYPT;
+                arg_tcrypt_veracrypt = true;
+        } else if ((val = startswith(option, "veracrypt-pim="))) {
+
+                r = safe_atou32(val, &arg_tcrypt_veracrypt_pim);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+        } else if (STR_IN_SET(option, "plain", "swap", "tmp") ||
+                   startswith(option, "tmp="))
+                arg_type = CRYPT_PLAIN;
+        else if ((val = startswith(option, "timeout="))) {
+
+                r = parse_sec_fix_0(val, &arg_timeout);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+
+        } else if ((val = startswith(option, "offset="))) {
+
+                r = safe_atou64(val, &arg_offset);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s: %m", option);
+
+        } else if ((val = startswith(option, "skip="))) {
+
+                r = safe_atou64(val, &arg_skip);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s: %m", option);
+
+        } else if ((val = startswith(option, "pkcs11-uri="))) {
+
+                if (streq(val, "auto")) {
+                        arg_pkcs11_uri = mfree(arg_pkcs11_uri);
+                        arg_pkcs11_uri_auto = true;
+                } else {
+                        if (!pkcs11_uri_valid(val))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "pkcs11-uri= parameter expects a PKCS#11 URI, refusing");
+
+                        r = free_and_strdup(&arg_pkcs11_uri, val);
+                        if (r < 0)
+                                return log_oom();
+
+                        arg_pkcs11_uri_auto = false;
+                }
+
+        } else if ((val = startswith(option, "fido2-device="))) {
+
+                if (streq(val, "auto")) {
+                        arg_fido2_device = mfree(arg_fido2_device);
+                        arg_fido2_device_auto = true;
+                } else {
+                        r = free_and_strdup(&arg_fido2_device, val);
+                        if (r < 0)
+                                return log_oom();
+
+                        arg_fido2_device_auto = false;
+                }
+
+        } else if ((val = startswith(option, "fido2-cid="))) {
+
+                if (streq(val, "auto"))
+                        arg_fido2_cid = mfree(arg_fido2_cid);
+                else {
+                        _cleanup_free_ void *cid = NULL;
+                        size_t cid_size;
+
+                        r = unbase64mem(val, SIZE_MAX, &cid, &cid_size);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to decode FIDO2 CID data: %m");
+
+                        free(arg_fido2_cid);
+                        arg_fido2_cid = TAKE_PTR(cid);
+                        arg_fido2_cid_size = cid_size;
+                }
+
+                /* Turn on FIDO2 as side-effect, if not turned on yet. */
+                if (!arg_fido2_device && !arg_fido2_device_auto)
+                        arg_fido2_device_auto = true;
+
+        } else if ((val = startswith(option, "fido2-rp="))) {
+
+                r = free_and_strdup(&arg_fido2_rp_id, val);
+                if (r < 0)
+                        return log_oom();
+
+        } else if ((val = startswith(option, "tpm2-device="))) {
+
+                if (streq(val, "auto")) {
+                        arg_tpm2_device = mfree(arg_tpm2_device);
+                        arg_tpm2_device_auto = true;
+                } else {
+                        r = free_and_strdup(&arg_tpm2_device, val);
+                        if (r < 0)
+                                return log_oom();
+
+                        arg_tpm2_device_auto = false;
+                }
+
+        } else if ((val = startswith(option, "tpm2-pcrs="))) {
+
+                r = tpm2_parse_pcr_argument_to_mask(val, &arg_tpm2_pcr_mask);
+                if (r < 0)
+                        return r;
+
+        } else if ((val = startswith(option, "tpm2-signature="))) {
+
+                if (!path_is_absolute(val))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "TPM2 signature path \"%s\" is not absolute, refusing.", val);
+
+                r = free_and_strdup(&arg_tpm2_signature, val);
+                if (r < 0)
+                        return log_oom();
+
+        } else if ((val = startswith(option, "tpm2-pin="))) {
+
+                r = parse_boolean(val);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+
+                arg_tpm2_pin = r;
+
+        } else if ((val = startswith(option, "tpm2-pcrlock="))) {
+
+                if (!path_is_absolute(val))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "TPM2 pcrlock policy path \"%s\" is not absolute, refusing.", val);
+
+                r = free_and_strdup(&arg_tpm2_pcrlock, val);
+                if (r < 0)
+                        return log_oom();
+
+        } else if ((val = startswith(option, "tpm2-measure-pcr="))) {
+                unsigned pcr;
+
+                r = safe_atou(val, &pcr);
+                if (r < 0) {
+                        r = parse_boolean(val);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to parse %s, ignoring: %m", option);
+                                return 0;
+                        }
+
+                        pcr = r ? TPM2_PCR_SYSTEM_IDENTITY : UINT_MAX;
+                } else if (!TPM2_PCR_INDEX_VALID(pcr)) {
+                        log_warning("Selected TPM index for measurement %u outside of allowed range 0…%u, ignoring.", pcr, TPM2_PCRS_MAX-1);
+                        return 0;
+                }
+
+                arg_tpm2_measure_pcr = pcr;
+
+        } else if ((val = startswith(option, "tpm2-measure-bank="))) {
+
+#if HAVE_OPENSSL
+                _cleanup_strv_free_ char **l = NULL;
+
+                l = strv_split(optarg, ":");
+                if (!l)
+                        return log_oom();
+
+                STRV_FOREACH(i, l) {
+                        const EVP_MD *implementation;
+
+                        implementation = EVP_get_digestbyname(*i);
+                        if (!implementation)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown bank '%s', refusing.", val);
+
+                        if (strv_extend(&arg_tpm2_measure_banks, EVP_MD_name(implementation)) < 0)
+                                return log_oom();
+                }
+#else
+                log_error("Build lacks OpenSSL support, cannot measure to PCR banks, ignoring: %s", option);
+#endif
+
+        } else if ((val = startswith(option, "try-empty-password="))) {
+
+                r = parse_boolean(val);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+
+                arg_try_empty_password = r;
+
+        } else if (streq(option, "try-empty-password"))
+                arg_try_empty_password = true;
+        else if ((val = startswith(option, "headless="))) {
+
+                r = parse_boolean(val);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+                        return 0;
+                }
+
+                arg_headless = r;
+        } else if (streq(option, "headless"))
+                arg_headless = true;
+
+        else if ((val = startswith(option, "token-timeout="))) {
+
+                r = parse_sec_fix_0(val, &arg_token_timeout_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s, ignoring: %m", option);
+
+        } else if (!streq(option, "x-initrd.attach"))
+                log_warning("Encountered unknown /etc/crypttab option '%s', ignoring.", option);
+
+        return 0;
+}
+
+/* Splits the comma-separated option string and hands every entry to parse_one_option().
+ *
+ * Returns 0 on success, or the negative error of the first entry that could not be extracted or
+ * that parse_one_option() refused. */
+static int parse_crypt_config(const char *options) {
+        const char *cursor = options;
+        int k;
+
+        assert(options);
+
+        do {
+                _cleanup_free_ char *entry = NULL;
+
+                k = extract_first_word(&cursor, &entry, ",", EXTRACT_DONT_COALESCE_SEPARATORS | EXTRACT_UNESCAPE_SEPARATORS);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to parse options: %m");
+
+                if (k > 0) {
+                        int q;
+
+                        q = parse_one_option(entry);
+                        if (q < 0)
+                                return q;
+                }
+        } while (k > 0);
+
+        /* Nothing more to check unless a type other than plain was selected */
+        if (!arg_type || streq(arg_type, CRYPT_PLAIN))
+                return 0;
+
+        /* Tell the user about settings that have no effect with the selected type */
+        if (arg_offset != 0)
+                log_warning("offset= ignored with type %s", arg_type);
+        if (arg_skip != 0)
+                log_warning("skip= ignored with type %s", arg_type);
+
+        return 0;
+}
+
+/* Looks up a human-readable description for the block device node at 'path' from its device
+ * properties.
+ *
+ * Returns a newly allocated string the caller must free, or NULL if 'path' cannot be stat()ed, is
+ * not a block device, no usable property is set, or something else fails. */
+static char* disk_description(const char *path) {
+        static const char fallback_properties[] =
+                "DM_NAME\0"
+                "ID_MODEL_FROM_DATABASE\0"
+                "ID_MODEL\0";
+
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        const char *value;
+        struct stat sb;
+
+        assert(path);
+
+        if (stat(path, &sb) < 0 || !S_ISBLK(sb.st_mode))
+                return NULL;
+
+        if (sd_device_new_from_stat_rdev(&dev, &sb) < 0)
+                return NULL;
+
+        /* First choice: the partition entry name */
+        if (sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &value) >= 0) {
+                _cleanup_free_ char *decoded = NULL;
+                ssize_t n;
+
+                /* ID_PART_ENTRY_NAME is \x-escaped (libblkid's blkid_encode_string()). Undo that so
+                 * that names with embedded spaces or other unusual characters stay readable. */
+
+                n = cunescape(value, UNESCAPE_RELAX, &decoded);
+                if (n < 0) {
+                        log_debug_errno(n, "Failed to unescape ID_PART_ENTRY_NAME, skipping device: %m");
+                        return NULL;
+                }
+
+                /* Only use the name if it is non-empty and free of control characters */
+                if (!isempty(decoded) && !string_has_cc(decoded, NULL))
+                        return TAKE_PTR(decoded);
+        }
+
+        /* Otherwise take the first non-empty fallback property; these are used verbatim. */
+        NULSTR_FOREACH(prop, fallback_properties) {
+                if (sd_device_get_property_value(dev, prop, &value) < 0)
+                        continue;
+                if (isempty(value))
+                        continue;
+
+                return strdup(value);
+        }
+
+        return NULL;
+}
+
+/* Scans the mount table at fstab_path() for an entry whose device is "/dev/mapper/<label>".
+ *
+ * Returns a newly allocated copy of that entry's mount directory (caller frees), or NULL if no
+ * entry matches, the table cannot be opened, or memory is short. */
+static char *disk_mount_point(const char *label) {
+        _cleanup_free_ char *mapper_path = NULL;
+        _cleanup_endmntent_ FILE *table = NULL;
+
+        /* Yeah, we don't support native systemd unit files here for now */
+
+        mapper_path = strjoin("/dev/mapper/", label);
+        if (!mapper_path)
+                return NULL;
+
+        table = setmntent(fstab_path(), "re");
+        if (!table)
+                return NULL;
+
+        for (struct mntent *entry = getmntent(table); entry; entry = getmntent(table)) {
+                if (!path_equal(entry->mnt_fsname, mapper_path))
+                        continue;
+
+                /* First match wins */
+                return strdup(entry->mnt_dir);
+        }
+
+        return NULL;
+}
+
+/* Composes the name used to refer to a volume in user-facing prompts, out of the volume name
+ * 'vol', the description of the backing device 'src' (if any) and the volume's mount point (if
+ * any).
+ *
+ * Returns a newly allocated string the caller must free, or NULL on allocation failure. */
+static char *friendly_disk_name(const char *src, const char *vol) {
+        _cleanup_free_ char *desc = NULL, *where = NULL;
+        char *result = NULL;
+        int n;
+
+        assert(src);
+        assert(vol);
+
+        desc = disk_description(src);
+        where = disk_mount_point(vol);
+
+        /* A description that merely repeats the volume name adds nothing, hence drop it */
+        if (desc && streq(vol, desc))
+                desc = mfree(desc);
+
+        /* Nothing to decorate the volume name with */
+        if (!desc && !where)
+                return strdup(vol);
+
+        if (desc && where)
+                n = asprintf(&result, "%s (%s) on %s", desc, vol, where);
+        else if (where)
+                n = asprintf(&result, "%s on %s", vol, where);
+        else
+                n = asprintf(&result, "%s (%s)", desc, vol);
+
+        return n < 0 ? NULL : result;
+}
+
+/* Determines which kinds of passphrase may unlock 'cd'.
+ *
+ * Devices that are not LUKS2 always yield PASSPHRASE_REGULAR. For LUKS2, every active keyslot that
+ * is not referenced by a "systemd-recovery", "systemd-pkcs11", "systemd-fido2" or "systemd-tpm2"
+ * token counts as a regular passphrase slot (PASSPHRASE_REGULAR), and the presence of a
+ * "systemd-recovery" token sets PASSPHRASE_RECOVERY_KEY. The result is the bitwise OR of the two,
+ * i.e. PASSPHRASE_NONE if neither applies.
+ *
+ * NOTE(review): on allocation failure the (negative) result of log_oom() is returned through the
+ * PassphraseType return type — confirm that callers cope with this. */
+static PassphraseType check_registered_passwords(struct crypt_device *cd) {
+        _cleanup_free_ bool *slots = NULL;
+        int slot_max, token_max;
+        PassphraseType passphrase_type = PASSPHRASE_NONE;
+
+        assert(cd);
+
+        if (!streq_ptr(crypt_get_type(cd), CRYPT_LUKS2)) {
+                log_debug("%s: not a LUKS2 device, only passphrases are supported", crypt_get_device_name(cd));
+                return PASSPHRASE_REGULAR;
+        }
+
+        /* Search all used slots */
+        assert_se((slot_max = crypt_keyslot_max(CRYPT_LUKS2)) > 0);
+        slots = new(bool, slot_max);
+        if (!slots)
+                return log_oom();
+
+        /* Start out with every active slot marked; slots owned by systemd tokens are cleared below */
+        for (int slot = 0; slot < slot_max; slot++)
+                slots[slot] = IN_SET(crypt_keyslot_status(cd, slot), CRYPT_SLOT_ACTIVE, CRYPT_SLOT_ACTIVE_LAST);
+
+        /* Iterate all LUKS2 tokens and keep track of all their slots */
+        token_max = sym_crypt_token_max(CRYPT_LUKS2);
+        for (int token = 0; token < token_max; token++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                const char *type;
+                JsonVariant *w, *z;
+                int tk;
+
+                tk = cryptsetup_get_token_as_json(cd, token, NULL, &v);
+                if (IN_SET(tk, -ENOENT, -EINVAL))
+                        continue;
+                if (tk < 0) {
+                        log_warning_errno(tk, "Failed to read JSON token data, ignoring: %m");
+                        continue;
+                }
+
+                w = json_variant_by_key(v, "type");
+                if (!w || !json_variant_is_string(w)) {
+                        log_warning("Token JSON data lacks type field, ignoring.");
+                        continue;
+                }
+
+                type = json_variant_string(w);
+                if (!STR_IN_SET(type, "systemd-recovery", "systemd-pkcs11", "systemd-fido2", "systemd-tpm2"))
+                        continue;
+
+                /* At least exists one recovery key */
+                if (streq(type, "systemd-recovery"))
+                        passphrase_type |= PASSPHRASE_RECOVERY_KEY;
+
+                w = json_variant_by_key(v, "keyslots");
+                if (!w || !json_variant_is_array(w)) {
+                        log_warning("Token JSON data lacks keyslots field, ignoring.");
+                        continue;
+                }
+
+                JSON_VARIANT_ARRAY_FOREACH(z, w) {
+                        unsigned u;
+                        int at;
+
+                        if (!json_variant_is_string(z)) {
+                                log_warning("Token JSON data's keyslot field is not an array of strings, ignoring.");
+                                continue;
+                        }
+
+                        at = safe_atou(json_variant_string(z), &u);
+                        if (at < 0) {
+                                log_warning_errno(at, "Token JSON data's keyslot field is not an integer formatted as string, ignoring.");
+                                continue;
+                        }
+
+                        if (u >= (unsigned) slot_max) {
+                                /* 'at' is not an error here (parsing succeeded), so there is no
+                                 * errno to attach to this message. */
+                                log_warning("Token JSON data's keyslot field exceeds the maximum value allowed, ignoring.");
+                                continue;
+                        }
+
+                        /* This slot belongs to a systemd token, it is not a plain passphrase slot */
+                        slots[u] = false;
+                }
+        }
+
+        /* Check if any of the slots is not referenced by systemd tokens */
+        for (int slot = 0; slot < slot_max; slot++)
+                if (slots[slot]) {
+                        passphrase_type |= PASSPHRASE_REGULAR;
+                        break;
+                }
+
+        /* All the slots are referenced by systemd tokens, so if a recovery key is not enrolled,
+         * we will not be able to enter a passphrase. */
+        return passphrase_type;
+}
+
+/* Interactively queries the passphrase(s) for volume 'vol' backed by 'src'.
+ *
+ *   until           — handed to ask_password_auto() as the time limit of the query
+ *   accept_cached   — if true, ASK_PASSWORD_ACCEPT_CACHED is added to the flags of the first query
+ *   passphrase_type — only selects the wording of the prompt
+ *   ret             — on success receives the password list; ownership passes to the caller
+ *
+ * If arg_verify is set the passphrase is asked for a second time and both entries must match.
+ * Passwords shorter than arg_key_size-1 characters are replaced by zero-padded copies of
+ * arg_key_size bytes.
+ *
+ * Returns 0 on success. Fails with ENOPKG if arg_headless is set, with EAGAIN if the verification
+ * entry differs, and with the error of the failing step otherwise. */
+static int get_password(
+                const char *vol,
+                const char *src,
+                usec_t until,
+                bool accept_cached,
+                PassphraseType passphrase_type,
+                char ***ret) {
+
+        _cleanup_free_ char *friendly = NULL, *text = NULL, *disk_path = NULL;
+        _cleanup_strv_free_erase_ char **passwords = NULL;
+        char *id;
+        int r = 0;
+        AskPasswordFlags flags = arg_ask_password_flags | ASK_PASSWORD_PUSH_CACHE;
+
+        assert(vol);
+        assert(src);
+        assert(ret);
+
+        if (arg_headless)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "Password querying disabled via 'headless' option.");
+
+        friendly = friendly_disk_name(src, vol);
+        if (!friendly)
+                return log_oom();
+
+        if (asprintf(&text, "Please enter %s for disk %s:", passphrase_type_to_string(passphrase_type), friendly) < 0)
+                return log_oom();
+
+        disk_path = cescape(src);
+        if (!disk_path)
+                return log_oom();
+
+        id = strjoina("cryptsetup:", disk_path);
+
+        r = ask_password_auto(text, "drive-harddisk", id, "cryptsetup", "cryptsetup.passphrase", until,
+                              flags | (accept_cached*ASK_PASSWORD_ACCEPT_CACHED),
+                              &passwords);
+        if (r < 0)
+                return log_error_errno(r, "Failed to query password: %m");
+
+        if (arg_verify) {
+                _cleanup_strv_free_erase_ char **passwords2 = NULL;
+
+                assert(strv_length(passwords) == 1);
+
+                /* Release the first prompt before asprintf() overwrites the pointer, otherwise the
+                 * old buffer is leaked. */
+                text = mfree(text);
+                if (asprintf(&text, "Please enter %s for disk %s (verification):", passphrase_type_to_string(passphrase_type), friendly) < 0)
+                        return log_oom();
+
+                id = strjoina("cryptsetup-verification:", disk_path);
+
+                /* Note: cached passwords are never accepted for the verification query */
+                r = ask_password_auto(text, "drive-harddisk", id, "cryptsetup", "cryptsetup.passphrase", until, flags, &passwords2);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to query verification password: %m");
+
+                assert(strv_length(passwords2) == 1);
+
+                if (!streq(passwords[0], passwords2[0]))
+                        return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                 "Passwords did not match, retrying.");
+        }
+
+        strv_uniq(passwords);
+
+        STRV_FOREACH(p, passwords) {
+                char *c;
+
+                if (strlen(*p)+1 >= arg_key_size)
+                        continue;
+
+                /* Pad password if necessary */
+                c = new(char, arg_key_size);
+                if (!c)
+                        return log_oom();
+
+                /* strncpy() fills the remainder of the buffer with NUL bytes; since the password
+                 * plus its terminator is shorter than arg_key_size the copy is always terminated. */
+                strncpy(c, *p, arg_key_size);
+                erase_and_free(*p);
+                *p = TAKE_PTR(c);
+        }
+
+        *ret = TAKE_PTR(passwords);
+
+        return 0;
+}
+
+/* Extends PCR arg_tpm2_measure_pcr with a measurement derived from the volume key of 'cd'.
+ *
+ * Does nothing and returns 0 if measuring is turned off (arg_tpm2_measure_pcr == UINT_MAX) or if
+ * efi_measured_uki() reports 0. Returns 0 after a successful measurement and a negative error
+ * otherwise; without TPM2 support compiled in the function fails with EOPNOTSUPP. */
+static int measure_volume_key(
+                struct crypt_device *cd,
+                const char *name,
+                const void *volume_key,
+                size_t volume_key_size) {
+
+        int r;
+
+        assert(cd);
+        assert(name);
+        assert(volume_key);
+        assert(volume_key_size > 0);
+
+        if (arg_tpm2_measure_pcr == UINT_MAX) {
+                log_debug("Not measuring volume key, deactivated.");
+                return 0;
+        }
+
+        r = efi_measured_uki(LOG_WARNING);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                log_debug("Kernel stub did not measure kernel image into the expected PCR, skipping userspace measurement, too.");
+                return 0;
+        }
+
+#if HAVE_TPM2
+        _cleanup_(tpm2_context_unrefp) Tpm2Context *ctx = NULL;
+        _cleanup_strv_free_ char **detected = NULL;
+        _cleanup_free_ char *bank_list = NULL, *safe_name = NULL, *what = NULL;
+        char **banks;
+
+        r = tpm2_context_new(arg_tpm2_device, &ctx);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create TPM2 context: %m");
+
+        /* If no banks were configured explicitly, ask the TPM which ones to use for this PCR */
+        if (strv_isempty(arg_tpm2_measure_banks)) {
+                r = tpm2_get_good_pcr_banks_strv(ctx, UINT32_C(1) << arg_tpm2_measure_pcr, &detected);
+                if (r < 0)
+                        return log_error_errno(r, "Could not verify pcr banks: %m");
+        }
+
+        banks = detected ?: arg_tpm2_measure_banks;
+
+        bank_list = strv_join(banks, ", ");
+        if (!bank_list)
+                return log_oom();
+
+        /* Note: the volume key is not measured directly; sending an unprotected direct hash of the
+         * secret volume key over the wire to the TPM might be a security problem. Hence an HMAC
+         * signature is sent instead. */
+
+        safe_name = xescape(name, ":"); /* avoid ambiguity around ":" once we join things below */
+        if (!safe_name)
+                return log_oom();
+
+        what = strjoin("cryptsetup:", safe_name, ":", strempty(crypt_get_uuid(cd)));
+        if (!what)
+                return log_oom();
+
+        r = tpm2_extend_bytes(ctx, banks, arg_tpm2_measure_pcr, what, SIZE_MAX, volume_key, volume_key_size, TPM2_EVENT_VOLUME_KEY, what);
+        if (r < 0)
+                return log_error_errno(r, "Could not extend PCR: %m");
+
+        log_struct(LOG_INFO,
+                   "MESSAGE_ID=" SD_MESSAGE_TPM_PCR_EXTEND_STR,
+                   LOG_MESSAGE("Successfully extended PCR index %u with '%s' and volume key (banks %s).", arg_tpm2_measure_pcr, what, bank_list),
+                   "MEASURING=%s", what,
+                   "PCR=%u", arg_tpm2_measure_pcr,
+                   "BANKS=%s", bank_list);
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support disabled, not measuring.");
+#endif
+}
+
+/* Wrapper around crypt_activate_by_volume_key() that, after a successful activation, also
+ * measures the volume key via measure_volume_key() when a key was passed in.
+ *
+ * Returns whatever crypt_activate_by_volume_key() returned; a failing measurement does not change
+ * the result. */
+static int measured_crypt_activate_by_volume_key(
+                struct crypt_device *cd,
+                const char *name,
+                const void *volume_key,
+                size_t volume_key_size,
+                uint32_t flags) {
+
+        int activated;
+
+        assert(cd);
+        assert(name);
+
+        activated = crypt_activate_by_volume_key(cd, name, volume_key, volume_key_size, flags);
+        if (activated < 0)
+                return activated;
+
+        if (volume_key_size > 0)
+                (void) measure_volume_key(cd, name, volume_key, volume_key_size); /* OK if fails */
+        else
+                log_debug("Not measuring volume key, none specified.");
+
+        return activated;
+}
+
+/* Wrapper around crypt_activate_by_passphrase() that also measures the volume key to a PCR if
+ * that is requested.
+ *
+ * If measuring is turned off (arg_tpm2_measure_pcr == UINT_MAX) or the device reports a volume key
+ * size of 0, this is a plain crypt_activate_by_passphrase() call. Otherwise the volume key is
+ * fetched with the passphrase and activation goes through
+ * measured_crypt_activate_by_volume_key(). Returns the result of the activation call, or a
+ * negative error if obtaining the volume key fails. */
+static int measured_crypt_activate_by_passphrase(
+                struct crypt_device *cd,
+                const char *name,
+                int keyslot,
+                const char *passphrase,
+                size_t passphrase_size,
+                uint32_t flags) {
+
+        _cleanup_(erase_and_freep) void *key = NULL;
+        size_t key_size;
+        int k;
+
+        assert(cd);
+
+        /* The measurement needs the volume key, and crypt_activate_by_passphrase() gives no access
+         * to it. Hence operate indirectly: retrieve the volume key first, then activate through
+         * that. */
+
+        if (arg_tpm2_measure_pcr == UINT_MAX) {
+                log_debug("Not measuring volume key, deactivated.");
+                return crypt_activate_by_passphrase(cd, name, keyslot, passphrase, passphrase_size, flags);
+        }
+
+        k = crypt_get_volume_key_size(cd);
+        if (k < 0)
+                return k;
+        if (k == 0) {
+                log_debug("Not measuring volume key, none defined.");
+                return crypt_activate_by_passphrase(cd, name, keyslot, passphrase, passphrase_size, flags);
+        }
+
+        key_size = k;
+        key = malloc(key_size);
+        if (!key)
+                return -ENOMEM;
+
+        k = crypt_volume_key_get(cd, keyslot, key, &key_size, passphrase, passphrase_size);
+        if (k < 0)
+                return k;
+
+        return measured_crypt_activate_by_volume_key(cd, name, key, key_size, flags);
+}
+
+/* Loads the TCRYPT header of 'cd' and activates the device under 'name'.
+ *
+ * The passphrase for crypt_load() is taken from the first of these that is available:
+ *   1. key_data/key_data_size,
+ *   2. the first line of the file 'key_file',
+ *   3. each entry of 'passwords' in turn, until one is accepted.
+ * Key files from arg_tcrypt_keyfiles and the arg_tcrypt_* mode switches are passed along in the
+ * TCRYPT parameters.
+ *
+ * Returns 0 on success. Fails with EAGAIN if any of the pkcs11/fido2/tpm2 options is configured
+ * or if 'key_file' cannot be read; otherwise returns the negative error of crypt_load() or of the
+ * activation call. */
+static int attach_tcrypt(
+                struct crypt_device *cd,
+                const char *name,
+                const char *key_file,
+                const void *key_data,
+                size_t key_data_size,
+                char **passwords,
+                uint32_t flags) {
+
+        int r = 0;
+        _cleanup_(erase_and_freep) char *passphrase = NULL;
+        struct crypt_params_tcrypt params = {
+                .flags = CRYPT_TCRYPT_LEGACY_MODES,
+                .keyfiles = (const char **)arg_tcrypt_keyfiles,
+                .keyfiles_count = strv_length(arg_tcrypt_keyfiles)
+        };
+
+        assert(cd);
+        assert(name);
+        assert(key_file || key_data || !strv_isempty(passwords));
+
+        if (arg_pkcs11_uri || arg_pkcs11_uri_auto || arg_fido2_device || arg_fido2_device_auto || arg_tpm2_device || arg_tpm2_device_auto)
+                /* Ask for a regular password */
+                return log_error_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                       "Sorry, but tcrypt devices are currently not supported in conjunction with pkcs11/fido2/tpm2 support.");
+
+        if (arg_tcrypt_hidden)
+                params.flags |= CRYPT_TCRYPT_HIDDEN_HEADER;
+
+        if (arg_tcrypt_system)
+                params.flags |= CRYPT_TCRYPT_SYSTEM_HEADER;
+
+        if (arg_tcrypt_veracrypt)
+                params.flags |= CRYPT_TCRYPT_VERA_MODES;
+
+        /* The PIM is only passed on in VeraCrypt mode and when one was actually configured */
+        if (arg_tcrypt_veracrypt && arg_tcrypt_veracrypt_pim != 0)
+                params.veracrypt_pim = arg_tcrypt_veracrypt_pim;
+
+        if (key_data) {
+                params.passphrase = key_data;
+                params.passphrase_size = key_data_size;
+                r = crypt_load(cd, CRYPT_TCRYPT, &params);
+        } else if (key_file) {
+                r = read_one_line_file(key_file, &passphrase);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to read password file '%s': %m", key_file);
+                        return -EAGAIN; /* log with the actual error, but return EAGAIN */
+                }
+                params.passphrase = passphrase;
+                params.passphrase_size = strlen(passphrase);
+                r = crypt_load(cd, CRYPT_TCRYPT, &params);
+        } else {
+                /* Try the supplied passwords one by one, stop at the first that is accepted */
+                r = -EINVAL;
+                STRV_FOREACH(p, passwords) {
+                        params.passphrase = *p;
+                        params.passphrase_size = strlen(*p);
+                        r = crypt_load(cd, CRYPT_TCRYPT, &params);
+                        if (r >= 0)
+                                break;
+                }
+        }
+
+        if (r < 0) {
+                if (r == -EPERM) {
+                        if (key_data)
+                                log_error_errno(r, "Failed to activate using discovered key. (Key not correct?)");
+                        else if (key_file)
+                                log_error_errno(r, "Failed to activate using password file '%s'. (Key data not correct?)", key_file);
+                        else
+                                log_error_errno(r, "Failed to activate using supplied passwords.");
+
+                        return r;
+                }
+
+                return log_error_errno(r, "Failed to load tcrypt superblock on device %s: %m", crypt_get_device_name(cd));
+        }
+
+        /* No volume key is passed here, hence nothing gets measured for this activation */
+        r = measured_crypt_activate_by_volume_key(cd, name, NULL, 0, flags);
+        if (r < 0)
+                return log_error_errno(r, "Failed to activate tcrypt device %s: %m", crypt_get_device_name(cd));
+
+        return 0;
+}
+
+/* Formats the string "@<random 64bit hex>/cryptsetup/<volume>".
+ *
+ * Returns a newly allocated string the caller must free, or NULL if formatting fails. */
+static char *make_bindname(const char *volume) {
+        char *bindname = NULL;
+        int n;
+
+        n = asprintf(&bindname, "@%" PRIx64"/cryptsetup/%s", random_u64(), volume);
+        if (n < 0)
+                return NULL;
+
+        return bindname;
+}
+
+static int make_security_device_monitor(
+                sd_event **ret_event,
+                sd_device_monitor **ret_monitor) {
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *m = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+        int r;
+
+        /* Prepares everything needed to wait for a device tagged "security-device" to show up in udev:
+         * an event loop carrying a timer of arg_token_timeout_usec, plus a started device monitor that is
+         * attached to that loop. Nothing is run here; on success both objects are handed to the caller. */
+
+        assert(ret_event);
+        assert(ret_monitor);
+
+        r = sd_event_default(&e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+
+        /* The timer gets no callback, only -ETIMEDOUT as userdata. */
+        r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, arg_token_timeout_usec, USEC_PER_SEC, NULL, INT_TO_PTR(-ETIMEDOUT));
+        if (r < 0)
+                return log_error_errno(r, "Failed to install timeout event source: %m");
+
+        r = sd_device_monitor_new(&m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate device monitor: %m");
+
+        /* Purely cosmetic, hence the result is ignored. */
+        (void) sd_device_monitor_set_description(m, "security-device");
+
+        r = sd_device_monitor_filter_add_match_tag(m, "security-device");
+        if (r < 0)
+                return log_error_errno(r, "Failed to configure device monitor: %m");
+
+        r = sd_device_monitor_attach_event(m, e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach device monitor: %m");
+
+        r = sd_device_monitor_start(m, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start device monitor: %m");
+
+        /* Success: transfer ownership so the cleanup handlers above become no-ops. */
+        *ret_monitor = TAKE_PTR(m);
+        *ret_event = TAKE_PTR(e);
+        return 0;
+}
+
+static int run_security_device_monitor(
+                sd_event *event,
+                sd_device_monitor *monitor) {
+        uint64_t timeout = UINT64_MAX; /* block indefinitely, but only for the very first event */
+        int r;
+
+        assert(event);
+        assert(monitor);
+
+        /* Drives the event loop until at least one event was dispatched and the queue ran empty again
+         * (returns 0), or until the loop has an exit code set, which is expected to be the time-out
+         * (returns -EAGAIN). */
+
+        for (;;) {
+                int exit_code;
+
+                r = sd_event_get_exit_code(event, &exit_code);
+                if (r >= 0) {
+                        /* An exit code is set, and the only one expected here is the time-out. */
+                        assert(exit_code == -ETIMEDOUT);
+                        return log_notice_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                "Timed out waiting for security device, aborting security device based authentication attempt.");
+                }
+                if (r != -ENODATA)
+                        return log_error_errno(r, "Failed to query exit code from event loop: %m");
+
+                /* -ENODATA: nobody asked the loop to exit yet, so keep going. */
+
+                r = sd_event_run(event, timeout);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to run event loop: %m");
+                if (r == 0) /* no events queued anymore */
+                        return 0;
+
+                /* Something was dispatched; from now on only drain what is already pending. */
+                timeout = 0;
+        }
+}
+
+static bool libcryptsetup_plugins_support(void) {
+
+#if HAVE_TPM2
+        /* With plugins in use there is currently no way to get at the volume key, so plugins are
+         * skipped whenever measurement was requested. */
+        if (arg_tpm2_measure_pcr != UINT_MAX)
+                return false;
+#endif
+
+#if HAVE_LIBCRYPTSETUP_PLUGINS
+        int enabled;
+
+        /* Debugging knob: lets the user switch off libcryptsetup token module support. */
+        enabled = getenv_bool("SYSTEMD_CRYPTSETUP_USE_TOKEN_MODULE");
+        if (enabled == 0)
+                return false;
+        if (enabled < 0 && enabled != -ENXIO)
+                log_debug_errno(enabled, "Failed to parse $SYSTEMD_CRYPTSETUP_USE_TOKEN_MODULE env var: %m");
+
+        /* Supported exactly if libcryptsetup reports an external token path. */
+        return crypt_token_external_path() != NULL;
+#else
+        return false;
+#endif
+}
+
+#if HAVE_LIBCRYPTSETUP_PLUGINS
+static int acquire_pins_from_env_variable(char ***ret_pins) {
+        _cleanup_(erase_and_freep) char *pin_from_env = NULL;
+        _cleanup_strv_free_erase_ char **list = NULL;
+        int r;
+
+        /* Turns the value obtained for "PIN" via getenv_steal_erase() into a one-element string list.
+         * *ret_pins is set to NULL when that call yielded nothing. Returns 0 on success, negative errno
+         * on failure. */
+
+        assert(ret_pins);
+
+        r = getenv_steal_erase("PIN", &pin_from_env);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire PIN from environment: %m");
+        if (r == 0) {
+                /* Nothing was obtained: report an empty result rather than an error. */
+                *ret_pins = NULL;
+                return 0;
+        }
+
+        list = strv_new(pin_from_env);
+        if (!list)
+                return log_oom();
+
+        *ret_pins = TAKE_PTR(list);
+        return 0;
+}
+#endif
+
+#if HAVE_LIBCRYPTSETUP_PLUGINS
+/* Tries every entry of 'pins' (may be NULL/empty) as PIN for a token of the given type. Returns 0 as soon
+ * as one of them activates the volume, -ENOANO if the list was exhausted without success because a PIN is
+ * needed or was wrong, and any other negative error from crypt_activate_by_token_pin() unmodified. */
+static int try_token_pins(
+                struct crypt_device *cd,
+                const char *name,
+                const char *type,
+                char **pins,
+                void *userdata,
+                uint32_t activation_flags) {
+
+        int r;
+
+        STRV_FOREACH(p, pins) {
+                r = crypt_activate_by_token_pin(cd, name, type, CRYPT_ANY_TOKEN, *p, strlen(*p), userdata, activation_flags);
+                if (r >= 0) /* returns unlocked keyslot id on success */
+                        return 0;
+                if (r != -ENOANO) /* needs pin or pin is wrong */
+                        return r;
+        }
+
+        return -ENOANO;
+}
+#endif
+
+static int crypt_activate_by_token_pin_ask_password(
+                struct crypt_device *cd,
+                const char *name,
+                const char *type,
+                usec_t until,
+                bool headless,
+                void *userdata,
+                uint32_t activation_flags,
+                const char *message,
+                const char *key_name,
+                const char *credential_name) {
+
+        /* Activates 'name' through a libcryptsetup token plugin of the given type, supplying a PIN only
+         * if the plugin asks for one. The order of attempts is:
+         *
+         *   1. no PIN at all
+         *   2. the PIN provided by acquire_pins_from_env_variable()
+         *   3. unless 'headless' is set: PINs queried via ask_password_auto() (prompt 'message', using
+         *      'key_name'/'credential_name', deadline 'until'), repeated until one works or the query fails
+         *
+         * Returns 0 on success, -ENOPKG if a PIN would have to be queried but 'headless' is set,
+         * -EOPNOTSUPP if built without plugin support, and otherwise the negative error of whichever step
+         * failed. */
+
+#if HAVE_LIBCRYPTSETUP_PLUGINS
+        AskPasswordFlags flags = arg_ask_password_flags | ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_ACCEPT_CACHED;
+        _cleanup_strv_free_erase_ char **pins = NULL;
+        int r;
+
+        r = crypt_activate_by_token_pin(cd, name, type, CRYPT_ANY_TOKEN, /* pin=*/ NULL, /* pin_size= */ 0, userdata, activation_flags);
+        if (r > 0) /* returns unlocked keyslot id on success */
+                return 0;
+        if (r != -ENOANO) /* needs pin or pin is wrong */
+                return r;
+
+        r = acquire_pins_from_env_variable(&pins);
+        if (r < 0)
+                return r;
+
+        r = try_token_pins(cd, name, type, pins, userdata, activation_flags);
+        if (r != -ENOANO)
+                return r;
+
+        if (headless)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "PIN querying disabled via 'headless' option. Use the '$PIN' environment variable.");
+
+        /* This loop is only ever left via one of the 'return' statements inside it. */
+        for (;;) {
+                pins = strv_free_erase(pins);
+                r = ask_password_auto(message, "drive-harddisk", /* id= */ NULL, key_name, credential_name, until, flags, &pins);
+                if (r < 0)
+                        return r;
+
+                r = try_token_pins(cd, name, type, pins, userdata, activation_flags);
+                if (r != -ENOANO)
+                        return r;
+
+                /* What we got did not work, so do not accept cached PINs on the next round. */
+                flags &= ~ASK_PASSWORD_ACCEPT_CACHED;
+        }
+#else
+        return -EOPNOTSUPP;
+#endif
+}
+
+static int attach_luks2_by_fido2_via_plugin(
+                struct crypt_device *cd,
+                const char *name,
+                usec_t until,
+                bool headless,
+                void *userdata,
+                uint32_t activation_flags) {
+
+        /* FIDO2 flavour of crypt_activate_by_token_pin_ask_password(): only the token type, the prompt
+         * and the key/credential names are fixed here, everything else is passed through. */
+
+        const char *token_type = "systemd-fido2",
+                   *prompt = "Please enter security token PIN:",
+                   *key_name = "fido2-pin",
+                   *credential_name = "cryptsetup.fido2-pin";
+
+        return crypt_activate_by_token_pin_ask_password(
+                        cd, name, token_type,
+                        until, headless,
+                        userdata, activation_flags,
+                        prompt, key_name, credential_name);
+}
+
+static int attach_luks_or_plain_or_bitlk_by_fido2(
+                struct crypt_device *cd,
+                const char *name,
+                const char *key_file,
+                const void *key_data,
+                size_t key_data_size,
+                usec_t until,
+                uint32_t flags,
+                bool pass_volume_key) {
+        /* Unlocks volume 'name' with a key obtained through a FIDO2 token. Three paths exist:
+         *
+         *   - arg_fido2_cid set: manual parameters; a key file or key data is mandatory and the key is
+         *     acquired via acquire_fido2_key()
+         *   - otherwise, with libcryptsetup plugin support: the plugin does the whole activation
+         *   - otherwise: the key is acquired via acquire_fido2_key_auto()
+         *
+         * If the token is not found (-EAGAIN) a udev monitor is set up and the attempt is repeated until
+         * it succeeds, fails differently, or the monitor gives up.
+         *
+         * Returns 0 on success, -EAGAIN where the messages below announce a fallback or activation was
+         * refused with -EPERM, other negative errno on failure.
+         */
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        size_t decrypted_key_size, cid_size = 0;
+        _cleanup_free_ char *friendly = NULL;
+        int keyslot = arg_key_slot, r;
+        const char *rp_id = NULL;
+        const void *cid = NULL;
+        Fido2EnrollFlags required; /* only assigned, and only read, in the arg_fido2_cid case */
+        bool use_libcryptsetup_plugin = libcryptsetup_plugins_support();
+
+        assert(cd);
+        assert(name);
+        assert(arg_fido2_device || arg_fido2_device_auto);
+
+        if (arg_fido2_cid) {
+                if (!key_file && !key_data)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "FIDO2 mode with manual parameters selected, but no keyfile specified, refusing.");
+
+                rp_id = arg_fido2_rp_id;
+                cid = arg_fido2_cid;
+                cid_size = arg_fido2_cid_size;
+
+                /* For now and for compatibility, if the user explicitly configured FIDO2 support and we do
+                 * not read FIDO2 metadata off the LUKS2 header, default to the systemd 248 logic, where we
+                 * use PIN + UP when needed, and do not configure UV at all. Eventually, we should make this
+                 * explicitly configurable. */
+                required = FIDO2ENROLL_PIN_IF_NEEDED | FIDO2ENROLL_UP_IF_NEEDED | FIDO2ENROLL_UV_OMIT;
+        }
+
+        friendly = friendly_disk_name(crypt_get_device_name(cd), name);
+        if (!friendly)
+                return log_oom();
+
+        for (;;) {
+                if (use_libcryptsetup_plugin && !arg_fido2_cid) {
+                        r = attach_luks2_by_fido2_via_plugin(cd, name, until, arg_headless, arg_fido2_device, flags);
+                        if (IN_SET(r, -ENOTUNIQ, -ENXIO, -ENOENT))
+                                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                       "Automatic FIDO2 metadata discovery was not possible because missing or not unique, falling back to traditional unlocking.");
+
+                } else {
+                        if (cid)
+                                r = acquire_fido2_key(
+                                                name,
+                                                friendly,
+                                                arg_fido2_device,
+                                                rp_id,
+                                                cid, cid_size,
+                                                key_file, arg_keyfile_size, arg_keyfile_offset,
+                                                key_data, key_data_size,
+                                                until,
+                                                arg_headless,
+                                                required,
+                                                &decrypted_key, &decrypted_key_size,
+                                                arg_ask_password_flags);
+                        else
+                                r = acquire_fido2_key_auto(
+                                                cd,
+                                                name,
+                                                friendly,
+                                                arg_fido2_device,
+                                                until,
+                                                arg_headless,
+                                                &decrypted_key, &decrypted_key_size,
+                                                arg_ask_password_flags);
+                        if (r >= 0)
+                                break;
+                }
+
+                if (r != -EAGAIN) /* EAGAIN means: token not found. Note that a successful plugin activation
+                                   * (r == 0) also leaves through here, since that branch has no 'break'. */
+                        return r;
+
+                if (!monitor) {
+                        /* We didn't find the token. In this case, watch for it via udev. Let's
+                         * create an event loop and monitor first. */
+
+                        assert(!event);
+
+                        r = make_security_device_monitor(&event, &monitor);
+                        if (r < 0)
+                                return r;
+
+                        log_notice("Security token not present for unlocking volume %s, please plug it in.", friendly);
+
+                        /* Let's immediately rescan in case the token appeared in the time we needed
+                         * to create and configure the monitor */
+                        continue;
+                }
+
+                r = run_security_device_monitor(event, monitor);
+                if (r < 0)
+                        return r;
+
+                log_debug("Got one or more potentially relevant udev events, rescanning FIDO2...");
+        }
+        /* Only reached via the 'break' above, i.e. after decrypted_key/decrypted_key_size were filled in
+         * by one of the acquire_fido2_key*() calls. */
+        if (pass_volume_key)
+                r = measured_crypt_activate_by_volume_key(cd, name, decrypted_key, decrypted_key_size, flags);
+        else {
+                _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+                ssize_t base64_encoded_size;
+
+                /* Before using this key as passphrase we base64 encode it, for compat with homed */
+
+                base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+                if (base64_encoded_size < 0)
+                        return log_oom();
+
+                r = measured_crypt_activate_by_passphrase(cd, name, keyslot, base64_encoded, base64_encoded_size, flags);
+        }
+        if (r == -EPERM) {
+                log_error_errno(r, "Failed to activate with FIDO2 decrypted key. (Key incorrect?)");
+                return -EAGAIN; /* log actual error, but return EAGAIN */
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to activate with FIDO2 acquired key: %m");
+
+        return 0;
+}
+
+static int attach_luks2_by_pkcs11_via_plugin(
+                struct crypt_device *cd,
+                const char *name,
+                const char *friendly_name,
+                usec_t until,
+                bool headless,
+                uint32_t flags) {
+
+        /* Activates 'name' through libcryptsetup's "systemd-pkcs11" token plugin. 'friendly_name',
+         * 'until', 'headless' and arg_ask_password_flags are handed to the plugin via its parameter
+         * structure. Returns 0 on success, -EINVAL if the device is not LUKS2, -EOPNOTSUPP if built
+         * without plugin support, otherwise the negative error from crypt_activate_by_token_pin(). */
+
+#if HAVE_LIBCRYPTSETUP_PLUGINS
+        int r;
+
+        if (!streq_ptr(crypt_get_type(cd), CRYPT_LUKS2))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Automatic PKCS#11 metadata requires LUKS2 device.");
+
+        systemd_pkcs11_plugin_params params = {
+                .friendly_name = friendly_name,
+                .until = until,
+                .headless = headless,
+                .askpw_flags = arg_ask_password_flags,
+        };
+
+        /* No PIN is passed at this level; the plugin receives 'params' as its userdata. */
+        r = crypt_activate_by_token_pin(cd, name, "systemd-pkcs11", CRYPT_ANY_TOKEN, NULL, 0, &params, flags);
+        if (r > 0) /* returns unlocked keyslot id on success */
+                r = 0;
+
+        return r;
+#else
+        return -EOPNOTSUPP;
+#endif
+}
+
+static int attach_luks_or_plain_or_bitlk_by_pkcs11(
+                struct crypt_device *cd,
+                const char *name,
+                const char *key_file,
+                const void *key_data,
+                size_t key_data_size,
+                usec_t until,
+                uint32_t flags,
+                bool pass_volume_key) {
+        /* Unlocks volume 'name' with a key decrypted through a PKCS#11 token. With arg_pkcs11_uri_auto
+         * and plugin support the libcryptsetup plugin does the whole activation; in every other case the
+         * key is decrypted via decrypt_pkcs11_key() and used here. If the token is not found (-EAGAIN) a
+         * udev monitor is set up and the attempt is repeated until it succeeds, fails differently, or the
+         * monitor gives up.
+         *
+         * Returns 0 on success, -EAGAIN where the messages below announce a fallback or activation was
+         * refused with -EPERM, other negative errno on failure.
+         */
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
+        _cleanup_free_ char *friendly = NULL, *discovered_uri = NULL;
+        size_t decrypted_key_size = 0, discovered_key_size = 0;
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_free_ void *discovered_key = NULL;
+        int keyslot = arg_key_slot, r;
+        const char *uri = NULL;
+        bool use_libcryptsetup_plugin = libcryptsetup_plugins_support();
+
+        assert(cd);
+        assert(name);
+        assert(arg_pkcs11_uri || arg_pkcs11_uri_auto);
+        /* Auto mode without plugin support: URI, key and keyslot come from find_pkcs11_auto_data() and
+         * replace the caller's key_data. Auto mode with plugin support resolves nothing here, so 'uri'
+         * stays NULL. Manual mode takes arg_pkcs11_uri and insists on a key file or key data. */
+        if (arg_pkcs11_uri_auto) {
+                if (!use_libcryptsetup_plugin) {
+                        r = find_pkcs11_auto_data(cd, &discovered_uri, &discovered_key, &discovered_key_size, &keyslot);
+                        if (IN_SET(r, -ENOTUNIQ, -ENXIO))
+                                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                       "Automatic PKCS#11 metadata discovery was not possible because missing or not unique, falling back to traditional unlocking.");
+                        if (r < 0)
+                                return r;
+
+                        uri = discovered_uri;
+                        key_data = discovered_key;
+                        key_data_size = discovered_key_size;
+                }
+        } else {
+                uri = arg_pkcs11_uri;
+
+                if (!key_file && !key_data)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "PKCS#11 mode selected but no key file specified, refusing.");
+        }
+
+        friendly = friendly_disk_name(crypt_get_device_name(cd), name);
+        if (!friendly)
+                return log_oom();
+
+        for (;;) {
+                if (use_libcryptsetup_plugin && arg_pkcs11_uri_auto)
+                        r = attach_luks2_by_pkcs11_via_plugin(cd, name, friendly, until, arg_headless, flags);
+                else {
+                        r = decrypt_pkcs11_key(
+                                        name,
+                                        friendly,
+                                        uri,
+                                        key_file, arg_keyfile_size, arg_keyfile_offset,
+                                        key_data, key_data_size,
+                                        until,
+                                        arg_headless,
+                                        &decrypted_key, &decrypted_key_size);
+                        if (r >= 0)
+                                break;
+                }
+
+                if (r != -EAGAIN) /* EAGAIN means: token not found. Note that a successful plugin activation
+                                   * (r == 0) also leaves through here, since that branch has no 'break'. */
+                        return r;
+
+                if (!monitor) {
+                        /* We didn't find the token. In this case, watch for it via udev. Let's
+                         * create an event loop and monitor first. */
+
+                        assert(!event);
+
+                        r = make_security_device_monitor(&event, &monitor);
+                        if (r < 0)
+                                return r;
+
+                        log_notice("Security token%s%s not present for unlocking volume %s, please plug it in.",
+                                   uri ? " " : "", strempty(uri), friendly);
+
+                        /* Let's immediately rescan in case the token appeared in the time we needed
+                         * to create and configure the monitor */
+                        continue;
+                }
+
+                r = run_security_device_monitor(event, monitor);
+                if (r < 0)
+                        return r;
+
+                log_debug("Got one or more potentially relevant udev events, rescanning PKCS#11...");
+        }
+        assert(decrypted_key); /* the loop is only left via 'break', i.e. after decrypt_pkcs11_key() succeeded */
+
+        if (pass_volume_key)
+                r = measured_crypt_activate_by_volume_key(cd, name, decrypted_key, decrypted_key_size, flags);
+        else {
+                _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+                ssize_t base64_encoded_size;
+
+                /* Before using this key as passphrase we base64 encode it. Why? For compatibility
+                 * with homed's PKCS#11 hookup: there we want to use the key we acquired through
+                 * PKCS#11 for other authentication/decryption mechanisms too, and some of them do
+                 * not take arbitrary binary blobs, but require NUL-terminated strings — most
+                 * importantly UNIX password hashes. Hence, for compatibility we want to use a string
+                 * without embedded NUL here too, and that's easiest to generate from a binary blob
+                 * via base64 encoding. */
+
+                base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+                if (base64_encoded_size < 0)
+                        return log_oom();
+
+                r = measured_crypt_activate_by_passphrase(cd, name, keyslot, base64_encoded, base64_encoded_size, flags);
+        }
+        if (r == -EPERM) {
+                log_error_errno(r, "Failed to activate with PKCS#11 decrypted key. (Key incorrect?)");
+                return -EAGAIN; /* log actual error, but return EAGAIN */
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to activate with PKCS#11 acquired key: %m");
+
+        return 0;
+}
+
+static int make_tpm2_device_monitor(
+                sd_event **ret_event,
+                sd_device_monitor **ret_monitor) {
+
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *m = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+        int r;
+
+        /* Same idea as the security-device monitor, but matching on the "tpmrm" subsystem: builds an
+         * event loop with a timer of arg_token_timeout_usec and a started device monitor attached to it.
+         * Nothing is run here; on success both objects are handed to the caller. */
+
+        assert(ret_event);
+        assert(ret_monitor);
+
+        r = sd_event_default(&e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+
+        /* The timer gets no callback, only -ETIMEDOUT as userdata. */
+        r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, arg_token_timeout_usec, USEC_PER_SEC, NULL, INT_TO_PTR(-ETIMEDOUT));
+        if (r < 0)
+                return log_error_errno(r, "Failed to install timeout event source: %m");
+
+        r = sd_device_monitor_new(&m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate device monitor: %m");
+
+        /* Purely cosmetic, hence the result is ignored. */
+        (void) sd_device_monitor_set_description(m, "tpmrm");
+
+        r = sd_device_monitor_filter_add_match_subsystem_devtype(m, "tpmrm", NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to configure device monitor: %m");
+
+        r = sd_device_monitor_attach_event(m, e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach device monitor: %m");
+
+        r = sd_device_monitor_start(m, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start device monitor: %m");
+
+        /* Success: transfer ownership so the cleanup handlers above become no-ops. */
+        *ret_monitor = TAKE_PTR(m);
+        *ret_event = TAKE_PTR(e);
+        return 0;
+}
+
+static bool use_token_plugins(void) {
+        int enabled;
+
+        /* When measuring was requested tokens are off the table, because the volume key would not be
+         * accessible to us then. */
+        if (arg_tpm2_measure_pcr != UINT_MAX)
+                return false;
+
+        enabled = getenv_bool("SYSTEMD_CRYPTSETUP_USE_TOKEN_MODULE");
+        if (enabled == 0)
+                return false;
+
+        /* -ENXIO is not worth a log message; any other parse failure is logged and otherwise ignored. */
+        if (enabled < 0 && enabled != -ENXIO)
+                log_debug_errno(enabled, "Failed to parse $SYSTEMD_CRYPTSETUP_USE_TOKEN_MODULE value, ignoring: %m");
+
+        return true;
+}
+
+static int attach_luks2_by_tpm2_via_plugin(
+                struct crypt_device *cd,
+                const char *name,
+                usec_t until,
+                bool headless,
+                uint32_t flags) {
+
+        /* Activates 'name' through libcryptsetup's "systemd-tpm2" token plugin, asking for a PIN if the
+         * plugin requires one. The plugin is configured from arg_tpm2_pcr_mask, arg_tpm2_device,
+         * arg_tpm2_signature and arg_tpm2_pcrlock. Returns -EOPNOTSUPP if plugin support is compiled out
+         * or libcryptsetup_plugins_support() says no, otherwise whatever
+         * crypt_activate_by_token_pin_ask_password() returns. */
+
+#if HAVE_LIBCRYPTSETUP_PLUGINS
+        systemd_tpm2_plugin_params params = {
+                .search_pcr_mask = arg_tpm2_pcr_mask,
+                .device = arg_tpm2_device,
+                .signature_path = arg_tpm2_signature,
+                .pcrlock_path = arg_tpm2_pcrlock,
+        };
+
+        if (!libcryptsetup_plugins_support())
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Libcryptsetup has external plugins support disabled.");
+
+        /* 'params' is passed to the plugin as its userdata. */
+        return crypt_activate_by_token_pin_ask_password(
+                        cd,
+                        name,
+                        "systemd-tpm2",
+                        until,
+                        headless,
+                        &params,
+                        flags,
+                        "Please enter TPM2 PIN:",
+                        "tpm2-pin",
+                        "cryptsetup.tpm2-pin");
+#else
+        return -EOPNOTSUPP;
+#endif
+}
+
+static int attach_luks_or_plain_or_bitlk_by_tpm2(
+                struct crypt_device *cd,
+                const char *name,
+                const char *key_file,
+                const void *key_data,
+                size_t key_data_size,
+                usec_t until,
+                uint32_t flags,
+                bool pass_volume_key) {
+
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_free_ char *friendly = NULL;
+        int keyslot = arg_key_slot, r;
+        size_t decrypted_key_size;
+
+        assert(cd);
+        assert(name);
+        assert(arg_tpm2_device || arg_tpm2_device_auto);
+
+        friendly = friendly_disk_name(crypt_get_device_name(cd), name);
+        if (!friendly)
+                return log_oom();
+
+        for (;;) {
+                if (key_file || key_data) {
+                        /* If key data is specified, use that */
+
+                        r = acquire_tpm2_key(
+                                        name,
+                                        arg_tpm2_device,
+                                        arg_tpm2_pcr_mask == UINT32_MAX ? TPM2_PCR_MASK_DEFAULT : arg_tpm2_pcr_mask,
+                                        UINT16_MAX,
+                                        /* pubkey= */ NULL, /* pubkey_size= */ 0,
+                                        /* pubkey_pcr_mask= */ 0,
+                                        /* signature_path= */ NULL,
+                                        /* pcrlock_path= */ NULL,
+                                        /* primary_alg= */ 0,
+                                        key_file, arg_keyfile_size, arg_keyfile_offset,
+                                        key_data, key_data_size,
+                                        /* policy_hash= */ NULL, /* policy_hash_size= */ 0, /* we don't know the policy hash */
+                                        /* salt= */ NULL, /* salt_size= */ 0,
+                                        /* srk_buf= */ NULL, /* srk_buf_size= */ 0,
+                                        arg_tpm2_pin ? TPM2_FLAGS_USE_PIN : 0,
+                                        until,
+                                        arg_headless,
+                                        arg_ask_password_flags,
+                                        &decrypted_key, &decrypted_key_size);
+                        if (r >= 0)
+                                break;
+                        if (IN_SET(r, -EACCES, -ENOLCK))
+                                return log_error_errno(SYNTHETIC_ERRNO(EAGAIN), "TPM2 PIN unlock failed, falling back to traditional unlocking.");
+                        if (ERRNO_IS_NOT_SUPPORTED(r)) /* TPM2 support not compiled in? */
+                                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), "TPM2 support not available, falling back to traditional unlocking.");
+                        /* EAGAIN means: no tpm2 chip found */
+                        if (r != -EAGAIN) {
+                                log_notice_errno(r, "TPM2 operation failed, falling back to traditional unlocking: %m");
+                                return -EAGAIN; /* Mangle error code: let's make any form of TPM2 failure non-fatal. */
+                        }
+                } else {
+                        r = attach_luks2_by_tpm2_via_plugin(cd, name, until, arg_headless, flags);
+                        if (r >= 0)
+                                return 0;
+                        /* EAGAIN     means: no tpm2 chip found
+                         * EOPNOTSUPP means: no libcryptsetup plugins support */
+                        if (r == -ENXIO)
+                                return log_notice_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                        "No TPM2 metadata matching the current system state found in LUKS2 header, falling back to traditional unlocking.");
+                        if (r == -ENOENT)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                       "No TPM2 metadata enrolled in LUKS2 header or TPM2 support not available, falling back to traditional unlocking.");
+                        if (!IN_SET(r, -EOPNOTSUPP, -EAGAIN)) {
+                                log_notice_errno(r, "TPM2 operation failed, falling back to traditional unlocking: %m");
+                                return -EAGAIN; /* Mangle error code: let's make any form of TPM2 failure non-fatal. */
+                        }
+                }
+
+                if (r == -EOPNOTSUPP) { /* Plugin not available, let's process TPM2 stuff right here instead */
+                        _cleanup_free_ void *blob = NULL, *policy_hash = NULL;
+                        size_t blob_size, policy_hash_size;
+                        bool found_some = false;
+                        int token = 0; /* first token to look at */
+
+                        /* If no key data is specified, look for it in the header. In order to support
+                         * software upgrades we'll iterate through all suitable tokens, maybe one of them
+                         * works. */
+
+                        for (;;) {
+                                _cleanup_free_ void *pubkey = NULL, *salt = NULL, *srk_buf = NULL;
+                                size_t pubkey_size = 0, salt_size = 0, srk_buf_size = 0;
+                                uint32_t hash_pcr_mask, pubkey_pcr_mask;
+                                uint16_t pcr_bank, primary_alg;
+                                TPM2Flags tpm2_flags;
+
+                                r = find_tpm2_auto_data(
+                                                cd,
+                                                arg_tpm2_pcr_mask, /* if != UINT32_MAX we'll only look for tokens with this PCR mask */
+                                                token, /* search for the token with this index, or any later index than this */
+                                                &hash_pcr_mask,
+                                                &pcr_bank,
+                                                &pubkey, &pubkey_size,
+                                                &pubkey_pcr_mask,
+                                                &primary_alg,
+                                                &blob, &blob_size,
+                                                &policy_hash, &policy_hash_size,
+                                                &salt, &salt_size,
+                                                &srk_buf, &srk_buf_size,
+                                                &tpm2_flags,
+                                                &keyslot,
+                                                &token);
+                                if (r == -ENXIO)
+                                        /* No further TPM2 tokens found in the LUKS2 header. */
+                                        return log_full_errno(found_some ? LOG_NOTICE : LOG_DEBUG,
+                                                              SYNTHETIC_ERRNO(EAGAIN),
+                                                              found_some
+                                                              ? "No TPM2 metadata matching the current system state found in LUKS2 header, falling back to traditional unlocking."
+                                                              : "No TPM2 metadata enrolled in LUKS2 header, falling back to traditional unlocking.");
+                                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                                        /* TPM2 support not compiled in? */
+                                        return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                               "TPM2 support not available, falling back to traditional unlocking.");
+                                if (r < 0)
+                                        return r;
+
+                                found_some = true;
+
+                                r = acquire_tpm2_key(
+                                                name,
+                                                arg_tpm2_device,
+                                                hash_pcr_mask,
+                                                pcr_bank,
+                                                pubkey, pubkey_size,
+                                                pubkey_pcr_mask,
+                                                arg_tpm2_signature,
+                                                arg_tpm2_pcrlock,
+                                                primary_alg,
+                                                /* key_file= */ NULL, /* key_file_size= */ 0, /* key_file_offset= */ 0, /* no key file */
+                                                blob, blob_size,
+                                                policy_hash, policy_hash_size,
+                                                salt, salt_size,
+                                                srk_buf, srk_buf_size,
+                                                tpm2_flags,
+                                                until,
+                                                arg_headless,
+                                                arg_ask_password_flags,
+                                                &decrypted_key, &decrypted_key_size);
+                                if (IN_SET(r, -EACCES, -ENOLCK))
+                                        return log_notice_errno(SYNTHETIC_ERRNO(EAGAIN), "TPM2 PIN unlock failed, falling back to traditional unlocking.");
+                                if (r != -EPERM)
+                                        break;
+
+                                token++; /* try a different token next time */
+                        }
+
+                        if (r >= 0)
+                                break;
+                        /* EAGAIN means: no tpm2 chip found */
+                        if (r != -EAGAIN) {
+                                log_notice_errno(r, "TPM2 operation failed, falling back to traditional unlocking: %m");
+                                return -EAGAIN; /* Mangle error code: let's make any form of TPM2 failure non-fatal. */
+                        }
+                }
+
+                if (!monitor) {
+                        /* We didn't find the TPM2 device. In this case, watch for it via udev. Let's create
+                         * an event loop and monitor first. */
+
+                        assert(!event);
+
+                        if (is_efi_boot() && !efi_has_tpm2())
+                                return log_notice_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                                        "No TPM2 hardware discovered and EFI firmware does not see it either, falling back to traditional unlocking.");
+
+                        r = make_tpm2_device_monitor(&event, &monitor);
+                        if (r < 0)
+                                return r;
+
+                        log_info("TPM2 device not present for unlocking %s, waiting for it to become available.", friendly);
+
+                        /* Let's immediately rescan in case the device appeared in the time we needed
+                         * to create and configure the monitor */
+                        continue;
+                }
+
+                r = run_security_device_monitor(event, monitor);
+                if (r < 0)
+                        return r;
+
+                log_debug("Got one or more potentially relevant udev events, rescanning for TPM2...");
+        }
+        assert(decrypted_key);
+
+        if (pass_volume_key)
+                r = measured_crypt_activate_by_volume_key(cd, name, decrypted_key, decrypted_key_size, flags);
+        else {
+                _cleanup_(erase_and_freep) char *base64_encoded = NULL;
+                ssize_t base64_encoded_size;
+
+                /* Before using this key as passphrase we base64 encode it, for compat with homed */
+
+                base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+                if (base64_encoded_size < 0)
+                        return log_oom();
+
+                r = measured_crypt_activate_by_passphrase(cd, name, keyslot, base64_encoded, base64_encoded_size, flags);
+        }
+        if (r == -EPERM) {
+                log_error_errno(r, "Failed to activate with TPM2 decrypted key. (Key incorrect?)");
+                return -EAGAIN; /* log actual error, but return EAGAIN */
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to activate with TPM2 acquired key: %m");
+
+        return 0;
+}
+
+static int attach_luks_or_plain_or_bitlk_by_key_data(
+                struct crypt_device *cd,
+                const char *name,
+                const void *key_data,
+                size_t key_data_size,
+                uint32_t flags,
+                bool pass_volume_key) {
+
+        /* Activates 'name' with an in-memory key: as volume key if pass_volume_key is set, otherwise as
+         * passphrase for arg_key_slot. -EPERM from the activation call is logged and turned into -EAGAIN. */
+        int ret;
+
+        assert(cd);
+        assert(name);
+        assert(key_data);
+
+        ret = pass_volume_key
+                ? measured_crypt_activate_by_volume_key(cd, name, key_data, key_data_size, flags)
+                : measured_crypt_activate_by_passphrase(cd, name, arg_key_slot, key_data, key_data_size, flags);
+        if (ret >= 0)
+                return 0;
+        if (ret != -EPERM)
+                return log_error_errno(ret, "Failed to activate: %m");
+
+        log_error_errno(ret, "Failed to activate. (Key incorrect?)");
+        return -EAGAIN; /* The log carries the real error, the caller gets EAGAIN */
+}
+
+static int attach_luks_or_plain_or_bitlk_by_key_file(
+                struct crypt_device *cd,
+                const char *name,
+                const char *key_file,
+                uint32_t flags,
+                bool pass_volume_key) {
+
+        /* Loads the key material from key_file via read_full_file_full() and activates 'name' with it. A
+         * too large or missing key file, as well as -EPERM on activation, is logged and yields -EAGAIN. */
+        _cleanup_(erase_and_freep) char *key = NULL;
+        _cleanup_free_ char *bindname = NULL;
+        size_t key_size;
+        int r;
+
+        assert(cd);
+        assert(name);
+        assert(key_file);
+
+        /* If we read the key via AF_UNIX, make this client recognizable */
+        bindname = make_bindname(name);
+        if (!bindname)
+                return log_oom();
+
+        r = read_full_file_full(
+                        AT_FDCWD, key_file,
+                        arg_keyfile_offset == 0 ? UINT64_MAX : arg_keyfile_offset,
+                        arg_keyfile_size == 0 ? SIZE_MAX : arg_keyfile_size,
+                        READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET,
+                        bindname,
+                        &key, &key_size);
+        if (r < 0) {
+                if (r == -E2BIG)
+                        log_error_errno(r, "Failed to activate, key file '%s' too large.", key_file);
+                else if (r == -ENOENT)
+                        log_error_errno(r, "Failed to activate, key file '%s' missing.", key_file);
+                else
+                        return log_error_errno(r, "Failed to read key file '%s': %m", key_file);
+
+                return -EAGAIN; /* Log actual error, but return EAGAIN */
+        }
+
+        r = pass_volume_key
+                ? measured_crypt_activate_by_volume_key(cd, name, key, key_size, flags)
+                : measured_crypt_activate_by_passphrase(cd, name, arg_key_slot, key, key_size, flags);
+        if (r >= 0)
+                return 0;
+        if (r != -EPERM)
+                return log_error_errno(r, "Failed to activate with key file '%s': %m", key_file);
+
+        log_error_errno(r, "Failed to activate with key file '%s'. (Key data incorrect?)", key_file);
+        return -EAGAIN; /* Log actual error, but return EAGAIN */
+}
+
+static int attach_luks_or_plain_or_bitlk_by_passphrase(
+                struct crypt_device *cd,
+                const char *name,
+                char **passwords,
+                uint32_t flags,
+                bool pass_volume_key) {
+
+        /* Tries the entries of 'passwords' in order and stops at the first one that activates 'name'. With
+         * an empty list the result is -EINVAL. If the last attempt failed with -EPERM the error is logged
+         * and -EAGAIN is returned instead. */
+        int r;
+
+        assert(cd);
+        assert(name);
+
+        r = -EINVAL; /* reported if there is nothing to try */
+        STRV_FOREACH(pw, passwords) {
+                r = pass_volume_key
+                        ? measured_crypt_activate_by_volume_key(cd, name, *pw, arg_key_size, flags)
+                        : measured_crypt_activate_by_passphrase(cd, name, arg_key_slot, *pw, strlen(*pw), flags);
+                if (r >= 0)
+                        return 0;
+        }
+
+        if (r != -EPERM)
+                return log_error_errno(r, "Failed to activate with specified passphrase: %m");
+
+        log_error_errno(r, "Failed to activate with specified passphrase. (Passphrase incorrect?)");
+        return -EAGAIN; /* log actual error, but return EAGAIN */
+}
+
+static int attach_luks_or_plain_or_bitlk(
+                struct crypt_device *cd,
+                const char *name,
+                const char *key_file,
+                const void *key_data,
+                size_t key_data_size,
+                char **passwords,
+                uint32_t flags,
+                usec_t until) {
+
+        /* If the volume is plain (requested explicitly, or no type configured nor detected), first loads
+         * the cipher parameters. Then hands off to one backend: TPM2, FIDO2, PKCS#11, key data, key file or
+         * passphrases, whichever is configured first in that order, and returns that backend's result. */
+        bool pass_volume_key = false;
+        int r;
+
+        assert(cd);
+        assert(name);
+
+        if ((!arg_type && !crypt_get_type(cd)) || streq_ptr(arg_type, CRYPT_PLAIN)) {
+                struct crypt_params_plain params = {
+                        .offset = arg_offset,
+                        .skip = arg_skip,
+                        .sector_size = arg_sector_size,
+                };
+                const char *cipher, *cipher_mode;
+                _cleanup_free_ char *truncated_cipher = NULL;
+
+                if (streq_ptr(arg_hash, "plain"))
+                        /* plain isn't a real hash type. it just means "use no hash" */
+                        params.hash = NULL;
+                else if (arg_hash)
+                        params.hash = arg_hash;
+                else if (!key_file)
+                        /* for CRYPT_PLAIN, the behaviour of cryptsetup package is to not hash when a key
+                         * file is provided */
+                        params.hash = "ripemd160";
+
+                if (arg_cipher) {
+                        /* Split "cipher-mode" at the first dash; without a dash the mode is "plain" */
+                        size_t l = strcspn(arg_cipher, "-");
+
+                        truncated_cipher = strndup(arg_cipher, l);
+                        if (!truncated_cipher)
+                                return log_oom();
+
+                        cipher = truncated_cipher;
+                        cipher_mode = arg_cipher[l] ? arg_cipher+l+1 : "plain";
+                } else {
+                        cipher = "aes";
+                        cipher_mode = "cbc-essiv:sha256";
+                }
+
+                /* for CRYPT_PLAIN limit reads from keyfile to key length, and ignore keyfile-size */
+                arg_keyfile_size = arg_key_size;
+
+                /* In contrast to what the name crypt_format() might suggest this doesn't actually format
+                 * anything, it just configures encryption parameters when used for plain mode. */
+                r = crypt_format(cd, CRYPT_PLAIN, cipher, cipher_mode, NULL, NULL, arg_keyfile_size, &params);
+                if (r < 0)
+                        return log_error_errno(r, "Loading of cryptographic parameters failed: %m");
+
+                /* hash == NULL implies the user passed "plain" */
+                pass_volume_key = !params.hash;
+        }
+
+        log_info("Set cipher %s, mode %s, key size %i bits for device %s.",
+                 crypt_get_cipher(cd), crypt_get_cipher_mode(cd),
+                 crypt_get_volume_key_size(cd)*8,
+                 crypt_get_device_name(cd));
+
+        if (arg_tpm2_device || arg_tpm2_device_auto)
+                return attach_luks_or_plain_or_bitlk_by_tpm2(cd, name, key_file, key_data, key_data_size, until, flags, pass_volume_key);
+        if (arg_fido2_device || arg_fido2_device_auto)
+                return attach_luks_or_plain_or_bitlk_by_fido2(cd, name, key_file, key_data, key_data_size, until, flags, pass_volume_key);
+        if (arg_pkcs11_uri || arg_pkcs11_uri_auto)
+                return attach_luks_or_plain_or_bitlk_by_pkcs11(cd, name, key_file, key_data, key_data_size, until, flags, pass_volume_key);
+        if (key_data)
+                return attach_luks_or_plain_or_bitlk_by_key_data(cd, name, key_data, key_data_size, flags, pass_volume_key);
+        if (key_file)
+                return attach_luks_or_plain_or_bitlk_by_key_file(cd, name, key_file, flags, pass_volume_key);
+
+        return attach_luks_or_plain_or_bitlk_by_passphrase(cd, name, passwords, flags, pass_volume_key);
+}
+
+static int help(void) {
+        /* Prints the usage text to stdout. Returns 0, or the result of log_oom() if the man page
+         * reference could not be generated. */
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-cryptsetup", "8", &man_link) < 0)
+                return log_oom();
+
+        printf("%1$s attach VOLUME SOURCE-DEVICE [KEY-FILE] [CONFIG]\n"
+               "%1$s detach VOLUME\n\n"
+               "%2$sAttach or detach an encrypted block device.%3$s\n\n"
+               "  -h --help            Show this help\n"
+               "     --version         Show package version\n"
+               "\nSee the %4$s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               man_link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Returns 1 if the caller shall go on, else what help()/version() returned, or -EINVAL on '?'. */
+        enum { ARG_VERSION = 0x100 };
+
+        static const struct option options[] = {
+                { "help",    no_argument, NULL, 'h'         },
+                { "version", no_argument, NULL, ARG_VERSION },
+                {}
+        };
+
+        assert(argc >= 0);
+        assert(argv);
+
+        if (argv_looks_like_help(argc, argv))
+                return help();
+
+        for (;;) {
+                int opt = getopt_long(argc, argv, "h", options, NULL);
+                if (opt < 0)
+                        break;
+                switch (opt) {
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        return 1;
+}
+
+static uint32_t determine_flags(void) {
+        /* Collects the CRYPT_ACTIVATE_* bits to activate with: one bit per arg_* switch that is set, plus
+         * CRYPT_ACTIVATE_SERIALIZE_MEMORY_HARD_PBKDF unconditionally where the headers define it. Starts
+         * from zero on every call. */
+        const struct {
+                bool enabled;
+                uint32_t bit;
+        } switches[] = {
+                { arg_readonly,               CRYPT_ACTIVATE_READONLY               },
+                { arg_discards,               CRYPT_ACTIVATE_ALLOW_DISCARDS         },
+                { arg_same_cpu_crypt,         CRYPT_ACTIVATE_SAME_CPU_CRYPT         },
+                { arg_submit_from_crypt_cpus, CRYPT_ACTIVATE_SUBMIT_FROM_CRYPT_CPUS },
+                { arg_no_read_workqueue,      CRYPT_ACTIVATE_NO_READ_WORKQUEUE      },
+                { arg_no_write_workqueue,     CRYPT_ACTIVATE_NO_WRITE_WORKQUEUE     },
+        };
+        uint32_t flags = 0;
+
+        for (size_t i = 0; i < sizeof(switches) / sizeof(switches[0]); i++)
+                if (switches[i].enabled)
+                        flags |= switches[i].bit;
+
+#ifdef CRYPT_ACTIVATE_SERIALIZE_MEMORY_HARD_PBKDF
+        /* Try to decrease the risk of OOM event if memory hard key derivation function is in use */
+        /* https://gitlab.com/cryptsetup/cryptsetup/issues/446/ */
+        flags |= CRYPT_ACTIVATE_SERIALIZE_MEMORY_HARD_PBKDF;
+#endif
+
+        return flags;
+}
+
+static void remove_and_erasep(const char **p) {
+        /* Cleanup handler: if *p is set, passes it to unlinkat_deallocate() with UNLINK_ERASE. Failures
+         * other than -ENOENT are logged as warning and otherwise ignored. */
+        int r = *p ? unlinkat_deallocate(AT_FDCWD, *p, UNLINK_ERASE) : 0;
+
+        if (r >= 0 || r == -ENOENT)
+                return;
+
+        log_warning_errno(r, "Unable to erase key file '%s', ignoring: %m", *p);
+}
+
+static int run(int argc, char *argv[]) {
+        /* Entry point, handles two verbs: "attach" activates a volume, and on -EAGAIN retries with all
+         * automatic key sources dropped (up to arg_tries rounds, unlimited if 0); "detach" deactivates it.
+         * Returns 0 on success or if the volume already is in the requested state, negative on failure. */
+        _cleanup_(crypt_freep) struct crypt_device *cd = NULL;
+        const char *verb;
+        int r;
+
+        log_setup();
+
+        umask(0022);
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        cryptsetup_enable_logging(NULL);
+
+        if (argc - optind < 2)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program requires at least two arguments.");
+        verb = ASSERT_PTR(argv[optind]);
+
+        if (streq(verb, "attach")) {
+                _unused_ _cleanup_(remove_and_erasep) const char *destroy_key_file = NULL;
+                _cleanup_(erase_and_freep) void *key_data = NULL;
+                crypt_status_info status;
+                size_t key_data_size = 0;
+                uint32_t flags = 0;
+                unsigned tries;
+                usec_t until;
+                PassphraseType passphrase_type = PASSPHRASE_NONE;
+
+                /* Arguments: systemd-cryptsetup attach VOLUME SOURCE-DEVICE [KEY-FILE] [CONFIG] */
+
+                if (argc - optind < 3)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "attach requires at least two arguments.");
+                if (argc - optind >= 6)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "attach does not accept more than four arguments.");
+
+                const char *volume = ASSERT_PTR(argv[optind + 1]),
+                           *source = ASSERT_PTR(argv[optind + 2]),
+                           *key_file = argc - optind >= 4 ? mangle_none(argv[optind + 3]) : NULL,
+                           *config = argc - optind >= 5 ? mangle_none(argv[optind + 4]) : NULL;
+
+                if (!filename_is_valid(volume))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume name '%s' is not valid.", volume);
+
+                if (key_file && !path_is_absolute(key_file)) {
+                        log_warning("Password file path '%s' is not absolute. Ignoring.", key_file);
+                        key_file = NULL;
+                }
+
+                if (config) {
+                        r = parse_crypt_config(config);
+                        if (r < 0)
+                                return r;
+                }
+
+                log_debug("%s %s ← %s type=%s cipher=%s", __func__, volume, source, strempty(arg_type), strempty(arg_cipher));
+
+                /* A delicious drop of snake oil */
+                (void) mlockall(MCL_FUTURE);
+
+                if (!key_file) {
+                        _cleanup_free_ char *bindname = NULL;
+                        const char *fn;
+
+                        bindname = make_bindname(volume);
+                        if (!bindname)
+                                return log_oom();
+
+                        /* If a key file is not explicitly specified, search for a key in a well defined
+                         * search path, and load it. */
+
+                        fn = strjoina(volume, ".key");
+                        r = find_key_file(
+                                        fn,
+                                        STRV_MAKE("/etc/cryptsetup-keys.d", "/run/cryptsetup-keys.d"),
+                                        bindname, &key_data, &key_data_size);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                log_debug("Automatically discovered key for volume '%s'.", volume);
+                } else if (arg_keyfile_erase)
+                        destroy_key_file = key_file; /* let's get this baby erased when we leave */
+
+                if (arg_header) {
+                        if (streq_ptr(arg_type, CRYPT_TCRYPT)) {
+                                log_debug("tcrypt header: %s", arg_header);
+                                r = crypt_init_data_device(&cd, arg_header, source);
+                        } else {
+                                log_debug("LUKS header: %s", arg_header);
+                                r = crypt_init(&cd, arg_header);
+                        }
+                } else
+                        r = crypt_init(&cd, source);
+                if (r < 0)
+                        return log_error_errno(r, "crypt_init() failed: %m");
+
+                cryptsetup_enable_logging(cd);
+
+                status = crypt_status(cd, volume);
+                if (IN_SET(status, CRYPT_ACTIVE, CRYPT_BUSY)) {
+                        log_info("Volume %s already active.", volume);
+                        return 0;
+                }
+
+                flags = determine_flags();
+
+                until = usec_add(now(CLOCK_MONOTONIC), arg_timeout);
+                if (until == USEC_INFINITY)
+                        until = 0;
+
+                if (arg_key_size == 0)
+                        arg_key_size = 256U / 8U;
+
+                if (key_file) {
+                        struct stat st;
+
+                        /* Ideally we'd do this on the open fd, but since this is just a
+                         * warning it's OK to do this in two steps. */
+                        if (stat(key_file, &st) >= 0 && S_ISREG(st.st_mode) && (st.st_mode & 0005))
+                                log_warning("Key file %s is world-readable. This is not a good idea!", key_file);
+                }
+
+                if (!arg_type || STR_IN_SET(arg_type, ANY_LUKS, CRYPT_LUKS1, CRYPT_LUKS2)) {
+                        r = crypt_load(cd, !arg_type || streq(arg_type, ANY_LUKS) ? CRYPT_LUKS : arg_type, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to load LUKS superblock on device %s: %m", crypt_get_device_name(cd));
+
+                        if (arg_header) {
+                                r = crypt_set_data_device(cd, source);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to set LUKS data device %s: %m", source);
+                        }
+
+                        /* Tokens are available in LUKS2 only, but it is ok to call (and fail) with LUKS1. */
+                        if (!key_file && !key_data && use_token_plugins()) {
+                                r = crypt_activate_by_token_pin_ask_password(
+                                                cd,
+                                                volume,
+                                                /* type= */ NULL,
+                                                until,
+                                                arg_headless,
+                                                /* userdata= */ NULL,
+                                                flags,
+                                                "Please enter LUKS2 token PIN:",
+                                                "luks2-pin",
+                                                "cryptsetup.luks2-pin");
+                                if (r >= 0) {
+                                        log_debug("Volume %s activated with LUKS token id %i.", volume, r);
+                                        return 0;
+                                }
+
+                                log_debug_errno(r, "Token activation unsuccessful for device %s: %m", crypt_get_device_name(cd));
+                        }
+                }
+
+/* since cryptsetup 2.3.0 (Feb 2020) */
+#ifdef CRYPT_BITLK
+                if (streq_ptr(arg_type, CRYPT_BITLK)) {
+                        r = crypt_load(cd, CRYPT_BITLK, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to load Bitlocker superblock on device %s: %m", crypt_get_device_name(cd));
+                }
+#endif
+
+                for (tries = 0; arg_tries == 0 || tries < arg_tries; tries++) {
+                        _cleanup_strv_free_erase_ char **passwords = NULL;
+
+                        /* When we were able to acquire multiple keys, let's always process them in this order:
+                         *
+                         *    1. A key acquired via PKCS#11 or FIDO2 token, or TPM2 chip
+                         *    2. The discovered key: i.e. key_data + key_data_size
+                         *    3. The configured key: i.e. key_file + arg_keyfile_offset + arg_keyfile_size
+                         *    4. The empty password, in case arg_try_empty_password is set
+                         *    5. We enquire the user for a password
+                         */
+
+                        if (!key_file && !key_data && !arg_pkcs11_uri && !arg_pkcs11_uri_auto && !arg_fido2_device && !arg_fido2_device_auto && !arg_tpm2_device && !arg_tpm2_device_auto) {
+
+                                if (arg_try_empty_password) {
+                                        /* Hmm, let's try an empty password now, but only once */
+                                        arg_try_empty_password = false;
+
+                                        key_data = strdup("");
+                                        if (!key_data)
+                                                return log_oom();
+
+                                        key_data_size = 0;
+                                } else {
+                                        /* Ask the user for a passphrase or recovery key only as last resort, if we have
+                                         * nothing else to check for */
+                                        if (passphrase_type == PASSPHRASE_NONE) {
+                                                passphrase_type = check_registered_passwords(cd);
+                                                if (passphrase_type == PASSPHRASE_NONE)
+                                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No passphrase or recovery key registered.");
+                                        }
+
+                                        r = get_password(volume, source, until, tries == 0 && !arg_verify, passphrase_type, &passwords);
+                                        if (r == -EAGAIN)
+                                                continue;
+                                        if (r < 0)
+                                                return r;
+                                }
+                        }
+
+                        if (streq_ptr(arg_type, CRYPT_TCRYPT))
+                                r = attach_tcrypt(cd, volume, key_file, key_data, key_data_size, passwords, flags);
+                        else
+                                r = attach_luks_or_plain_or_bitlk(cd, volume, key_file, key_data, key_data_size, passwords, flags, until);
+                        if (r >= 0)
+                                break;
+                        if (r != -EAGAIN)
+                                return r;
+
+                        /* Key not correct? Let's try again! */
+
+                        key_file = NULL;
+                        key_data = erase_and_free(key_data);
+                        key_data_size = 0;
+                        arg_pkcs11_uri = mfree(arg_pkcs11_uri);
+                        arg_pkcs11_uri_auto = false;
+                        arg_fido2_device = mfree(arg_fido2_device);
+                        arg_fido2_device_auto = false;
+                        arg_tpm2_device = mfree(arg_tpm2_device);
+                        arg_tpm2_device_auto = false;
+                }
+
+                if (arg_tries != 0 && tries >= arg_tries)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Too many attempts to activate; giving up.");
+
+        } else if (streq(verb, "detach")) {
+                const char *volume = ASSERT_PTR(argv[optind + 1]);
+
+                if (argc - optind >= 3)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "detach does not accept more than one argument.");
+
+                if (!filename_is_valid(volume))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume name '%s' is not valid.", volume);
+
+                r = crypt_init_by_name(&cd, volume);
+                if (r == -ENODEV) {
+                        log_info("Volume %s already inactive.", volume);
+                        return 0;
+                }
+                if (r < 0)
+                        return log_error_errno(r, "crypt_init_by_name() for volume '%s' failed: %m", volume);
+
+                cryptsetup_enable_logging(cd);
+
+                r = crypt_deactivate(cd, volume);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to deactivate '%s': %m", volume);
+
+        } else
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown verb %s.", verb);
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/cryptsetup/meson.build b/src/cryptsetup/meson.build
new file mode 100644
index 0000000..90e2be7
--- /dev/null
+++ b/src/cryptsetup/meson.build
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+subdir('cryptsetup-tokens')  # process cryptsetup-tokens/meson.build before the targets below
+
+systemd_cryptsetup_sources = files(  # sources that are always part of systemd-cryptsetup
+        'cryptsetup-keyfile.c',
+        'cryptsetup.c',
+)
+
+if conf.get('HAVE_P11KIT') == 1  # PKCS#11 source is added only if HAVE_P11KIT is set
+        systemd_cryptsetup_sources += files('cryptsetup-pkcs11.c')
+endif
+
+if conf.get('HAVE_TPM2') == 1  # TPM2 source is added only if HAVE_TPM2 is set
+        systemd_cryptsetup_sources += files('cryptsetup-tpm2.c')
+endif
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-cryptsetup',
+                'public' : true,
+                'conditions' : ['HAVE_LIBCRYPTSETUP'],  # presumably gates the target on libcryptsetup; confirm in top-level meson.build
+                'sources' : systemd_cryptsetup_sources,
+                'dependencies' : [
+                        libcryptsetup,
+                        libopenssl,
+                        libp11kit_cflags,
+                ],
+        },
+        generator_template + {
+                'name' : 'systemd-cryptsetup-generator',
+                'conditions' : ['HAVE_LIBCRYPTSETUP'],
+                'sources' : files('cryptsetup-generator.c'),
+        },
+]
+
+if conf.get('HAVE_LIBCRYPTSETUP') == 1
+        # symlink for backwards compatibility after rename
+        meson.add_install_script(sh, '-c',
+                                 ln_s.format(bindir / 'systemd-cryptsetup',
+                                             libexecdir / 'systemd-cryptsetup'))
+endif
diff --git a/src/debug-generator/debug-generator.c b/src/debug-generator/debug-generator.c
new file mode 100644
index 0000000..8a474c5
--- /dev/null
+++ b/src/debug-generator/debug-generator.c
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "dropin.h"
+#include "generator.h"
+#include "initrd-util.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-file.h"
+#include "unit-name.h"
+
+static const char *arg_dest = NULL;     /* output directory; assigned from dest_early in run() */
+static char *arg_default_unit = NULL;   /* from systemd.unit=, or the target runlevel_to_target() maps a bare word to */
+static char **arg_mask = NULL;          /* mangled unit names collected from systemd.mask= */
+static char **arg_wants = NULL;         /* mangled unit names collected from systemd.wants= (run() may add debug-shell.service) */
+static char *arg_debug_shell = NULL;    /* tty for the debug shell as returned by skip_dev_prefix(); NULL if not requested */
+
+STATIC_DESTRUCTOR_REGISTER(arg_default_unit, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_mask, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_wants, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_debug_shell, freep);
+
+/* proc_cmdline_parse() callback. Stores the switches this generator acts upon in the arg_* variables:
+ * systemd.mask=, systemd.wants=, systemd.debug_shell[=], systemd.unit= and bare words that
+ * runlevel_to_target() knows. Anything else is ignored (returns 0); failures of the helpers are passed
+ * back to the caller. The data argument is not used. */
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        bool is_mask;
+        int r;
+
+        assert(key);
+
+        /* systemd.mask= and systemd.wants= are handled alike, only the list appended to differs. */
+        is_mask = streq(key, "systemd.mask");
+        if (is_mask || streq(key, "systemd.wants")) {
+                char *mangled;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = unit_name_mangle(value, UNIT_NAME_MANGLE_WARN, &mangled);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to glob unit name: %m");
+
+                r = strv_consume(is_mask ? &arg_mask : &arg_wants, mangled);
+                if (r < 0)
+                        return log_oom();
+
+                return 0;
+        }
+
+        /* systemd.debug_shell[=]: a false value leaves tty NULL, and that is what then gets stored. */
+        if (proc_cmdline_key_streq(key, "systemd.debug_shell")) {
+                const char *tty = NULL;
+
+                /* No value means "yes"; a value that does not parse as boolean is taken as the tty. */
+                r = value ? parse_boolean(value) : 1;
+                if (r > 0)
+                        tty = skip_dev_prefix(DEBUGTTY);
+                else if (r < 0)
+                        tty = skip_dev_prefix(value);
+
+                return free_and_strdup_warn(&arg_debug_shell, tty);
+        }
+
+        /* An explicitly requested default unit. */
+        if (streq(key, "systemd.unit")) {
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_default_unit, value);
+        }
+
+        /* A bare word that runlevel_to_target() can map to a target selects that target as default unit. */
+        if (!value) {
+                const char *target = runlevel_to_target(key);
+
+                if (target)
+                        return free_and_strdup_warn(&arg_default_unit, target);
+        }
+
+        return 0;
+}
+
+/* Creates, for every unit in arg_mask, a symlink to /dev/null named after it below arg_dest. A failed
+ * symlink() is logged and does not stop the loop; the result of the last failure (or 0) is returned. */
+static int generate_mask_symlinks(void) {
+        int ret = 0;
+
+        STRV_FOREACH(name, arg_mask) {
+                _cleanup_free_ char *link_path = path_join(empty_to_root(arg_dest), *name);
+
+                if (!link_path)
+                        return log_oom();
+
+                if (symlink("/dev/null", link_path) >= 0)
+                        continue;
+                ret = log_error_errno(errno, "Failed to create mask symlink %s: %m", link_path);
+        }
+
+        return ret;
+}
+
+/* Adds every unit in arg_wants to the "wants" of the default target (arg_default_unit if set, else the
+ * initrd or the default target) via generator_add_symlink() below arg_dest. Returns the first error, and
+ * otherwise the result of the last generator_add_symlink() call (0 if arg_wants is empty). */
+static int generate_wants_symlinks(void) {
+        int r = 0;
+
+        STRV_FOREACH(name, arg_wants) {
+                _cleanup_free_ char *unit_path = NULL;
+                const char *target;
+
+                /* This should match what do_queue_default_job() in core/main.c does. */
+                target = arg_default_unit ? arg_default_unit :
+                         in_initrd() ? SPECIAL_INITRD_TARGET : SPECIAL_DEFAULT_TARGET;
+
+                /* The unit is referred to by its path below SYSTEM_DATA_UNIT_DIR. */
+                unit_path = path_join(SYSTEM_DATA_UNIT_DIR, *name);
+                if (!unit_path)
+                        return log_oom();
+
+                r = generator_add_symlink(arg_dest, target, "wants", unit_path);
+                if (r < 0)
+                        return r;
+        }
+
+        return r;
+}
+
+/* Writes the "tty" drop-in for debug-shell.service below dir, setting Description= and TTYPath= to
+ * /dev/<arg_debug_shell>. Skipped if that is the DEBUGTTY default; a write failure is only warned about. */
+static void install_debug_shell_dropin(const char *dir) {
+        if (!streq(arg_debug_shell, skip_dev_prefix(DEBUGTTY))) {
+                int r = write_drop_in_format(dir, "debug-shell.service", 50, "tty",
+                                "[Unit]\n"
+                                "Description=Early root shell on /dev/%s FOR DEBUGGING ONLY\n"
+                                "ConditionPathExists=\n"
+                                "[Service]\n"
+                                "TTYPath=/dev/%s",
+                                arg_debug_shell, arg_debug_shell);
+
+                if (r < 0)
+                        log_warning_errno(r, "Failed to write drop-in for debug-shell.service, ignoring: %m");
+        }
+}
+
+/* Generator entry point: parses the kernel command line, then writes the debug shell drop-in (if requested),
+ * the mask symlinks and the wants symlinks to dest_early. A mask failure takes precedence in the result. */
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        int mask_result, wants_result, r;
+
+        arg_dest = dest_early;
+        assert_se(arg_dest);
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, PROC_CMDLINE_RD_STRICT | PROC_CMDLINE_STRIP_RD_PREFIX);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        if (arg_debug_shell) {
+                if (strv_extend(&arg_wants, "debug-shell.service") < 0)
+                        return log_oom();
+                install_debug_shell_dropin(arg_dest);
+        }
+
+        mask_result = generate_mask_symlinks();
+        wants_result = generate_wants_symlinks();
+        return mask_result < 0 ? mask_result : wants_result;
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run);
diff --git a/src/debug-generator/meson.build b/src/debug-generator/meson.build
new file mode 100644
index 0000000..33cb344
--- /dev/null
+++ b/src/debug-generator/meson.build
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [  # registers systemd-debug-generator, built from debug-generator.c on top of generator_template
+        generator_template + {
+                'name' : 'systemd-debug-generator',
+                'sources' : files('debug-generator.c'),
+        },
+]
diff --git a/src/delta/delta.c b/src/delta/delta.c
new file mode 100644
index 0000000..3337b7f
--- /dev/null
+++ b/src/delta/delta.c
@@ -0,0 +1,637 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "build.h"
+#include "chase.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "log.h"
+#include "main-func.h"
+#include "nulstr-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+
+static const char prefixes[] =          /* NUL-separated prefixes, in the order process_suffix() scans them */
+        "/etc\0"
+        "/run\0"
+        "/usr/local/lib\0"
+        "/usr/local/share\0"
+        "/usr/lib\0"
+        "/usr/share\0"
+        ;
+
+static const char suffixes[] =          /* NUL-separated directories, relative to a prefix, that process_suffixes() walks */
+        "sysctl.d\0"
+        "tmpfiles.d\0"
+        "modules-load.d\0"
+        "binfmt.d\0"
+        "systemd/system\0"
+        "systemd/user\0"
+        "systemd/system-preset\0"
+        "systemd/user-preset\0"
+        "udev/rules.d\0"
+        "modprobe.d\0";
+
+static const char have_dropins[] =      /* suffixes whose "*.d" sub-directories are scanned for drop-ins as well */
+        "systemd/system\0"
+        "systemd/user\0";
+
+static PagerFlags arg_pager_flags = 0;  /* passed to pager_open(); --no-pager adds PAGER_DISABLE */
+static int arg_diff = -1;               /* --diff: negative = not given, 0 = off, positive = on */
+
+static enum {                           /* bit mask of the override types to report, see parse_flags() */
+        SHOW_MASKED     = 1 << 0,
+        SHOW_EQUIVALENT = 1 << 1,
+        SHOW_REDIRECTED = 1 << 2,
+        SHOW_OVERRIDDEN = 1 << 3,
+        SHOW_UNCHANGED  = 1 << 4,
+        SHOW_EXTENDED   = 1 << 5,
+
+        SHOW_DEFAULTS =                 /* everything but SHOW_UNCHANGED; run() uses it when no type was selected */
+        (SHOW_MASKED | SHOW_EQUIVALENT | SHOW_REDIRECTED | SHOW_OVERRIDDEN | SHOW_EXTENDED)
+} arg_flags = 0;
+
+/* Resolves both paths with chase() and compares the results with path_equal(). A chase() failure is
+ * returned as negative value; otherwise the result of path_equal() is returned. */
+static int equivalent(const char *a, const char *b) {
+        _cleanup_free_ char *resolved_a = NULL, *resolved_b = NULL;
+        int r;
+
+        r = chase(a, NULL, CHASE_TRAIL_SLASH, &resolved_a, NULL);
+        if (r >= 0)
+                r = chase(b, NULL, CHASE_TRAIL_SLASH, &resolved_b, NULL);
+        if (r < 0)
+                return r;
+
+        return path_equal(resolved_a, resolved_b);
+}
+
+static int notify_override_masked(const char *top, const char *bottom) {
+        /* Prints a "[MASKED]" line for top and bottom if SHOW_MASKED is selected; returns 1 if printed, else 0. */
+        if (arg_flags & SHOW_MASKED) {
+                printf("%s%s%s     %s %s %s\n", ansi_highlight_red(), "[MASKED]", ansi_normal(),
+                       top, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), bottom);
+                return 1;
+        }
+        return 0;
+}
+
+static int notify_override_equivalent(const char *top, const char *bottom) {
+        /* Prints an "[EQUIVALENT]" line if SHOW_EQUIVALENT is selected; returns 1 if printed, else 0. */
+        if (arg_flags & SHOW_EQUIVALENT) {
+                printf("%s%s%s %s %s %s\n", ansi_highlight_green(), "[EQUIVALENT]", ansi_normal(),
+                       top, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), bottom);
+                return 1;
+        }
+        return 0;
+}
+
+static int notify_override_redirected(const char *top, const char *bottom) {
+        /* Prints a "[REDIRECTED]" line if SHOW_REDIRECTED is selected; returns 1 if printed, else 0. */
+        if (arg_flags & SHOW_REDIRECTED) {
+                printf("%s%s%s %s %s %s\n", ansi_highlight(), "[REDIRECTED]", ansi_normal(),
+                       top, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), bottom);
+                return 1;
+        }
+        return 0;
+}
+
+static int notify_override_overridden(const char *top, const char *bottom) {
+        /* Prints an "[OVERRIDDEN]" line if SHOW_OVERRIDDEN is selected; returns 1 if printed, else 0. */
+        if (arg_flags & SHOW_OVERRIDDEN) {
+                printf("%s%s%s %s %s %s\n", ansi_highlight(), "[OVERRIDDEN]", ansi_normal(),
+                       top, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), bottom);
+                return 1;
+        }
+        return 0;
+}
+
+static int notify_override_extended(const char *top, const char *bottom) {
+        /* Prints an "[EXTENDED]" line if SHOW_EXTENDED is selected; returns 1 if printed, else 0. */
+        if (arg_flags & SHOW_EXTENDED) {
+                printf("%s%s%s   %s %s %s\n", ansi_highlight(), "[EXTENDED]", ansi_normal(),
+                       top, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), bottom);
+                return 1;
+        }
+        return 0;
+}
+
+/* Prints an "[UNCHANGED]" line for f if SHOW_UNCHANGED is selected; returns 1 if printed, else 0. */
+static int notify_override_unchanged(const char *f) {
+        bool show = arg_flags & SHOW_UNCHANGED;
+
+        if (show)
+                printf("[UNCHANGED]  %s\n", f);
+        return show;
+}
+
+static int found_override(const char *top, const char *bottom) { /* classifies and reports how top overrides bottom */
+        _cleanup_free_ char *dest = NULL;
+        pid_t pid;
+        int r;
+
+        assert(top);
+        assert(bottom);
+        /* If null_or_empty_path() reports top as null/empty, the override is shown as a mask. */
+        if (null_or_empty_path(top) > 0)
+                return notify_override_masked(top, bottom);
+        /* If top can be read as a symlink, it is reported as equivalent to bottom or as redirected. */
+        r = readlink_malloc(top, &dest);
+        if (r >= 0) {
+                if (equivalent(dest, bottom) > 0) /* NOTE(review): dest is used as read from the link; a relative target is presumably not resolved against top's directory — confirm */
+                        return notify_override_equivalent(top, bottom);
+                else
+                        return notify_override_redirected(top, bottom);
+        }
+        /* Everything else is a plain override; if diffs are disabled we are done after reporting it. */
+        r = notify_override_overridden(top, bottom);
+        if (!arg_diff)
+                return r;
+
+        putchar('\n');
+
+        fflush(stdout); /* flushed before forking — presumably so that diff's output follows ours; confirm */
+
+        r = safe_fork("(diff)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) { /* this branch execs diff and never returns */
+                execlp("diff", "diff", "-us", "--", bottom, top, NULL);
+                log_open(); /* only reached if execlp() failed */
+                log_error_errno(errno, "Failed to execute diff: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        (void) wait_for_terminate_and_check("diff", pid, WAIT_LOG_ABNORMAL); /* result deliberately ignored */
+        putchar('\n');
+        /* NOTE(review): r holds safe_fork()'s result here, not the one of notify_override_overridden() — confirm intended. */
+        return r;
+}
+
+static int enumerate_dir_d(
+                OrderedHashmap *top,
+                OrderedHashmap *bottom,
+                OrderedHashmap *drops,
+                const char *toppath, const char *drop) {
+        /* Registers the "*.conf" files below toppath/drop in top, bottom and drops; returns 0 or a negative errno-style value. */
+        _cleanup_free_ char *unit = NULL, *path = NULL;
+        _cleanup_strv_free_ char **list = NULL;
+        char *c;
+        int r;
+
+        assert(!endswith(drop, "/"));
+
+        path = path_join(toppath, drop);
+        if (!path)
+                return -ENOMEM;
+
+        log_debug("Looking at %s", path);
+
+        unit = strdup(drop);
+        if (!unit)
+                return -ENOMEM;
+        c = strrchr(unit, '.');
+        if (!c)
+                return -EINVAL;
+        *c = 0; /* cut off everything from the last '.' on; the rest is the key used in drops */
+
+        r = get_files_in_directory(path, &list);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enumerate %s: %m", path);
+        strv_sort(list);
+
+        STRV_FOREACH(file, list) {
+                OrderedHashmap *h;
+                char *p, *d;
+                int k;
+
+                if (!endswith(*file, ".conf"))
+                        continue;
+
+                p = path_join(path, *file);
+                if (!p)
+                        return -ENOMEM;
+                d = p + strlen(toppath) + 1; /* key: the part of p that follows toppath and one separator */
+
+                log_debug("Adding at top: %s %s %s", d, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), p);
+                k = ordered_hashmap_put(top, d, p);
+                if (k >= 0) {
+                        p = strdup(p); /* top keeps the original string, bottom gets its own copy */
+                        if (!p)
+                                return -ENOMEM;
+                        d = p + strlen(toppath) + 1;
+                } else if (k != -EEXIST) { /* -EEXIST is tolerated: top keeps its existing entry and p goes to bottom */
+                        free(p);
+                        return k;
+                }
+
+                log_debug("Adding at bottom: %s %s %s", d, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), p);
+                free(ordered_hashmap_remove(bottom, d)); /* drop whatever bottom held for this key before */
+                k = ordered_hashmap_put(bottom, d, p);
+                if (k < 0) {
+                        free(p);
+                        return k;
+                }
+
+                h = ordered_hashmap_get(drops, unit);
+                if (!h) {
+                        h = ordered_hashmap_new(&string_hash_ops);
+                        if (!h)
+                                return -ENOMEM;
+                        k = ordered_hashmap_put(drops, unit, h); /* must not fail silently: h would leak and the drop-ins be lost */
+                        if (k < 0) {
+                                ordered_hashmap_free(h);
+                                return k;
+                        }
+                        unit = strdup(unit); /* the old string now serves as key in drops, continue with a private copy */
+                        if (!unit)
+                                return -ENOMEM;
+                }
+
+                p = strdup(p);
+                if (!p)
+                        return -ENOMEM;
+
+                log_debug("Adding to drops: %s %s %s %s %s",
+                          unit, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), basename(p), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), p);
+                k = ordered_hashmap_put(h, basename(p), p);
+                if (k < 0) {
+                        free(p);
+                        if (k != -EEXIST)
+                                return k;
+                }
+        }
+        return 0;
+}
+
+static int enumerate_dir(
+                OrderedHashmap *top,
+                OrderedHashmap *bottom,
+                OrderedHashmap *drops,
+                const char *path, bool dropins) {
+        /* Records the files found directly in path in top and bottom and, if dropins is set, passes each "*.d" sub-directory to enumerate_dir_d(). */
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
+        size_t n_files = 0, n_dirs = 0;
+        int r;
+
+        assert(top);
+        assert(bottom);
+        assert(drops);
+        assert(path);
+
+        log_debug("Looking at %s", path);
+
+        d = opendir(path);
+        if (!d) {
+                if (errno == ENOENT) /* a directory that does not exist is not an error */
+                        return 0;
+
+                return log_error_errno(errno, "Failed to open %s: %m", path);
+        }
+        /* First pass: collect the names of drop-in directories and of files in two string lists. */
+        FOREACH_DIRENT_ALL(de, d, return -errno) {
+                if (dropins && de->d_type == DT_DIR && endswith(de->d_name, ".d")) {
+                        if (!GREEDY_REALLOC0(dirs, n_dirs + 2)) /* +2: the new entry plus a terminating NULL — assumes the macro zero-fills new space; confirm */
+                                return -ENOMEM;
+
+                        dirs[n_dirs] = strdup(de->d_name);
+                        if (!dirs[n_dirs])
+                                return -ENOMEM;
+                        n_dirs ++;
+                }
+
+                if (!dirent_is_file(de))
+                        continue;
+
+                if (!GREEDY_REALLOC0(files, n_files + 2))
+                        return -ENOMEM;
+
+                files[n_files] = strdup(de->d_name);
+                if (!files[n_files])
+                        return -ENOMEM;
+                n_files ++;
+        }
+
+        strv_sort(dirs);
+        strv_sort(files);
+        /* Second pass, in sorted order: drop-in directories first, then the plain files. */
+        STRV_FOREACH(t, dirs) {
+                r = enumerate_dir_d(top, bottom, drops, path, *t);
+                if (r < 0)
+                        return r;
+        }
+
+        STRV_FOREACH(t, files) {
+                _cleanup_free_ char *p = NULL;
+
+                p = path_join(path, *t);
+                if (!p)
+                        return -ENOMEM;
+
+                log_debug("Adding at top: %s %s %s", basename(p), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), p);
+                r = ordered_hashmap_put(top, basename(p), p);
+                if (r >= 0) {
+                        p = strdup(p); /* top keeps the original string, bottom gets its own copy */
+                        if (!p)
+                                return -ENOMEM;
+                } else if (r != -EEXIST) /* -EEXIST is tolerated: top keeps its existing entry and p goes to bottom */
+                        return r;
+
+                log_debug("Adding at bottom: %s %s %s", basename(p), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), p);
+                free(ordered_hashmap_remove(bottom, basename(p))); /* drop whatever bottom held for this name before */
+                r = ordered_hashmap_put(bottom, basename(p), p);
+                if (r < 0)
+                        return r;
+                p = NULL; /* bottom holds the string now; keep the _cleanup_free_ from freeing it */
+        }
+
+        return 0;
+}
+
+static int process_suffix(const char *suffix, const char *onlyprefix) { /* returns the number of reported overrides, or a negative error */
+        char *f, *key;
+        OrderedHashmap *top, *bottom, *drops, *h;
+        int r = 0, k, n_found = 0;
+        bool dropins;
+
+        assert(suffix);
+        assert(!startswith(suffix, "/"));
+        assert(!strstr(suffix, "//"));
+
+        dropins = nulstr_contains(have_dropins, suffix);
+        /* top and bottom map a name to a path, drops maps a unit name to a hashmap of its drop-in files. */
+        top = ordered_hashmap_new(&string_hash_ops);
+        bottom = ordered_hashmap_new(&string_hash_ops);
+        drops = ordered_hashmap_new(&string_hash_ops);
+        if (!top || !bottom || !drops) {
+                r = -ENOMEM;
+                goto finish;
+        }
+        /* Scan <prefix>/<suffix> for every prefix; the first error is remembered but scanning continues. */
+        NULSTR_FOREACH(p, prefixes) {
+                _cleanup_free_ char *t = NULL;
+
+                t = path_join(p, suffix);
+                if (!t) {
+                        r = -ENOMEM;
+                        goto finish;
+                }
+
+                k = enumerate_dir(top, bottom, drops, t, dropins);
+                if (r == 0)
+                        r = k;
+        }
+        /* Compare each top entry (f) with the bottom entry (o) stored under the same key. */
+        ORDERED_HASHMAP_FOREACH_KEY(f, key, top) {
+                char *o;
+
+                o = ordered_hashmap_get(bottom, key);
+                assert(o);
+
+                if (!onlyprefix || startswith(o, onlyprefix)) { /* onlyprefix restricts the report to bottom paths starting with it */
+                        if (path_equal(o, f)) {
+                                notify_override_unchanged(f);
+                        } else {
+                                k = found_override(f, o);
+                                if (k < 0)
+                                        r = k;
+                                else
+                                        n_found += k;
+                        }
+                }
+
+                h = ordered_hashmap_get(drops, key); /* drop-ins recorded for this key, if any */
+                if (h)
+                        ORDERED_HASHMAP_FOREACH(o, h)
+                                if (!onlyprefix || startswith(o, onlyprefix))
+                                        n_found += notify_override_extended(f, o);
+        }
+
+finish:
+        ordered_hashmap_free_free(top);
+        ordered_hashmap_free_free(bottom);
+        /* drops needs manual cleanup: its values are hashmaps and its keys are allocated strings. */
+        ORDERED_HASHMAP_FOREACH_KEY(h, key, drops) {
+                ordered_hashmap_free_free(ordered_hashmap_remove(drops, key));
+                ordered_hashmap_remove(drops, key); /* NOTE(review): looks redundant, the entry was already removed on the line above — confirm */
+                free(key);
+        }
+        ordered_hashmap_free(drops);
+
+        return r < 0 ? r : n_found;
+}
+
+/* Runs process_suffix() for every entry of suffixes and returns the sum of the results, or the first
+ * negative result encountered. */
+static int process_suffixes(const char *onlyprefix) {
+        int total = 0;
+
+        NULSTR_FOREACH(suffix, suffixes) {
+                int k = process_suffix(suffix, onlyprefix);
+                if (k < 0)
+                        return k;
+                total += k;
+        }
+        return total;
+}
+
+/* Handles one command line argument. A relative argument is used as suffix as is. An absolute one must
+ * start with one of the known prefixes: if something follows the prefix, that suffix is processed for the
+ * prefix only, otherwise all known suffixes are processed with the argument as prefix filter. */
+static int process_suffix_chop(const char *arg) {
+        assert(arg);
+
+        if (!path_is_absolute(arg))
+                return process_suffix(arg, NULL);
+
+        /* Strip prefix from the suffix */
+        NULSTR_FOREACH(prefix, prefixes) {
+                const char *rest = startswith(arg, prefix);
+
+                if (!rest)
+                        continue;
+
+                rest += strspn(rest, "/");
+                return *rest ? process_suffix(rest, prefix) : process_suffixes(arg);
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                               "Invalid suffix specification %s.", arg);
+}
+
+/* Prints the usage text, with a reference to the systemd-delta(1) man page, to stdout. Returns 0, or the
+ * result of log_oom() if the man page reference could not be generated. */
+static int help(void) {
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-delta", "1", &man_link) < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] [SUFFIX...]\n\n"
+               "Find overridden configuration files.\n\n"
+               "  -h --help           Show this help\n"
+               "     --version        Show package version\n"
+               "     --no-pager       Do not pipe output into a pager\n"
+               "     --diff[=1|0]     Show a diff when overridden files differ\n"
+               "  -t --type=LIST...   Only display a selected set of override types\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               man_link);
+
+        return 0;
+}
+
+/* Adds the SHOW_* bits named in the comma-separated list flag_str to flags and returns the result. Returns
+ * -EINVAL for an unknown name and passes on a failure of extract_first_word(). */
+static int parse_flags(const char *flag_str, int flags) {
+        static const struct { const char *name; int bits; } known[] = {
+                { "masked",     SHOW_MASKED     },
+                { "equivalent", SHOW_EQUIVALENT },
+                { "redirected", SHOW_REDIRECTED },
+                { "overridden", SHOW_OVERRIDDEN },
+                { "unchanged",  SHOW_UNCHANGED  },
+                { "extended",   SHOW_EXTENDED   },
+                { "default",    SHOW_DEFAULTS   },
+        };
+        const size_t n_known = sizeof(known) / sizeof(known[0]);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                size_t i = 0;
+                int r = extract_first_word(&flag_str, &word, ",", EXTRACT_DONT_COALESCE_SEPARATORS);
+
+                if (r <= 0)
+                        return r < 0 ? r : flags;
+
+                while (i < n_known && !streq(word, known[i].name))
+                        i++;
+                if (i >= n_known)
+                        return -EINVAL;
+                flags |= known[i].bits;
+        }
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses the command line into arg_pager_flags, arg_flags and arg_diff. run() continues only if the result is positive. */
+        enum {
+                ARG_NO_PAGER = 0x100, /* options without a short form get values above the single-character range */
+                ARG_DIFF,
+                ARG_VERSION
+        };
+
+        static const struct option options[] = {
+                { "help",      no_argument,       NULL, 'h'          },
+                { "version",   no_argument,       NULL, ARG_VERSION  },
+                { "no-pager",  no_argument,       NULL, ARG_NO_PAGER },
+                { "diff",      optional_argument, NULL, ARG_DIFF     },
+                { "type",      required_argument, NULL, 't'          },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 1);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "ht:", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case 't': { /* may be given repeatedly: each list is ORed into the flags collected so far */
+                        int f;
+                        f = parse_flags(optarg, arg_flags);
+                        if (f < 0) /* any parse_flags() failure is reported as EINVAL here */
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to parse flags field.");
+                        arg_flags = f;
+                        break;
+                }
+
+                case ARG_DIFF: /* the argument is optional, optarg may be NULL */
+                        r = parse_boolean_argument("--diff", optarg, NULL);
+                        if (r < 0)
+                                return r;
+                        arg_diff = r;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1;
+}
+
+/* Main entry: parses options, settles the interplay of --diff and the selected types, inspects either the
+ * suffixes given as arguments or all known ones, and prints the number of overrides unless an error occurred. */
+static int run(int argc, char *argv[]) {
+        int r, k, total = 0;
+
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        if (arg_flags == 0)
+                arg_flags = SHOW_DEFAULTS;
+
+        if (arg_diff < 0)
+                arg_diff = (arg_flags & SHOW_OVERRIDDEN) != 0;
+        else if (arg_diff > 0)
+                arg_flags |= SHOW_OVERRIDDEN;
+
+        pager_open(arg_pager_flags);
+
+        if (optind >= argc) {
+                k = process_suffixes(NULL);
+                if (k < 0)
+                        r = k;
+                else
+                        total += k;
+        } else
+                for (int i = optind; i < argc; i++) {
+                        path_simplify(argv[i]);
+                        k = process_suffix_chop(argv[i]);
+                        if (k < 0)
+                                r = k;
+                        else
+                                total += k;
+                }
+
+        if (r < 0)
+                return r;
+        printf("%s%i overridden configuration files found.\n", total ? "\n" : "", total);
+        return r;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/delta/meson.build b/src/delta/meson.build
new file mode 100644
index 0000000..31c4be2
--- /dev/null
+++ b/src/delta/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [  # registers systemd-delta, built from delta.c on top of executable_template
+        executable_template + {
+                'name' : 'systemd-delta',
+                'public' : true,
+                'sources' : files('delta.c'),
+        },
+]
diff --git a/src/detect-virt/detect-virt.c b/src/detect-virt/detect-virt.c
new file mode 100644
index 0000000..2a65a3e
--- /dev/null
+++ b/src/detect-virt/detect-virt.c
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "build.h"
+#include "confidential-virt.h"
+#include "main-func.h"
+#include "pretty-print.h"
+#include "string-table.h"
+#include "virt.h"
+
+static bool arg_quiet = false;          /* -q/--quiet: run() prints nothing, only the return value is set */
+static enum {                           /* which check run() performs */
+        ANY_VIRTUALIZATION,             /* default: detect_virtualization() */
+        ONLY_VM,                        /* -v/--vm: detect_vm() */
+        ONLY_CONTAINER,                 /* -c/--container: detect_container() */
+        ONLY_CHROOT,                    /* -r/--chroot: running_in_chroot() */
+        ONLY_PRIVATE_USERS,             /* --private-users: running_in_userns() */
+        ONLY_CVM,                       /* --cvm: detect_confidential_virtualization() */
+} arg_mode = ANY_VIRTUALIZATION;
+
+/* Prints the usage text, with a reference to the systemd-detect-virt(1) man page, to stdout. Returns 0, or
+ * the result of log_oom() if the man page reference could not be generated. */
+static int help(void) {
+        _cleanup_free_ char *man_link = NULL;
+
+        if (terminal_urlify_man("systemd-detect-virt", "1", &man_link) < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "Detect execution in a virtualized environment.\n\n"
+               "  -h --help             Show this help\n"
+               "     --version          Show package version\n"
+               "  -c --container        Only detect whether we are run in a container\n"
+               "  -v --vm               Only detect whether we are run in a VM\n"
+               "  -r --chroot           Detect whether we are run in a chroot() environment\n"
+               "     --private-users    Only detect whether we are running in a user namespace\n"
+               "     --cvm              Only detect whether we are run in a confidential VM\n"
+               "  -q --quiet            Don't output anything, just set return value\n"
+               "     --list             List all known and detectable types of virtualization\n"
+               "     --list-cvm         List all known and detectable types of confidential \n"
+               "                        virtualization\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               man_link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses the command line into arg_quiet and arg_mode. Positive: run the detection; 0: already handled here (help, lists); negative: error. */
+        enum {
+                ARG_VERSION = 0x100, /* options without a short form get values above the single-character range */
+                ARG_PRIVATE_USERS,
+                ARG_LIST,
+                ARG_CVM,
+                ARG_LIST_CVM,
+        };
+
+        static const struct option options[] = {
+                { "help",          no_argument, NULL, 'h'               },
+                { "version",       no_argument, NULL, ARG_VERSION       },
+                { "container",     no_argument, NULL, 'c'               },
+                { "vm",            no_argument, NULL, 'v'               },
+                { "chroot",        no_argument, NULL, 'r'               },
+                { "private-users", no_argument, NULL, ARG_PRIVATE_USERS },
+                { "quiet",         no_argument, NULL, 'q'               },
+                { "cvm",           no_argument, NULL, ARG_CVM           },
+                { "list",          no_argument, NULL, ARG_LIST          },
+                { "list-cvm",      no_argument, NULL, ARG_LIST_CVM      },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hqcvr", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case 'q':
+                        arg_quiet = true;
+                        break;
+
+                case 'c':
+                        arg_mode = ONLY_CONTAINER;
+                        break;
+
+                case ARG_PRIVATE_USERS:
+                        arg_mode = ONLY_PRIVATE_USERS;
+                        break;
+
+                case 'v':
+                        arg_mode = ONLY_VM;
+                        break;
+
+                case 'r':
+                        arg_mode = ONLY_CHROOT;
+                        break;
+
+                case ARG_LIST:
+                        DUMP_STRING_TABLE(virtualization, Virtualization, _VIRTUALIZATION_MAX);
+                        return 0;
+
+                case ARG_CVM:
+                        arg_mode = ONLY_CVM;
+                        break; /* keep parsing like the other mode switches, so later options (e.g. -q) and the argument check below still apply */
+
+                case ARG_LIST_CVM:
+                        DUMP_STRING_TABLE(confidential_virtualization, ConfidentialVirtualization, _CONFIDENTIAL_VIRTUALIZATION_MAX);
+                        return 0;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (optind < argc) /* positional arguments are not accepted */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "%s takes no arguments.",
+                                       program_invocation_short_name);
+
+        return 1;
+}
+
+static int run(int argc, char *argv[]) {
+        Virtualization v;
+        ConfidentialVirtualization c;
+        int r;
+
+        /* This is mostly intended to be used for scripts which want
+         * to detect whether we are being run in a virtualized
+         * environment or not */
+        /* Result: 0 if what was asked for was detected, 1 if not, negative on failure; a parse_argv() result <= 0 is returned as is. */
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        switch (arg_mode) {
+        case ONLY_VM:
+                v = detect_vm();
+                if (v < 0)
+                        return log_error_errno(v, "Failed to check for VM: %m");
+                break;
+
+        case ONLY_CONTAINER:
+                v = detect_container();
+                if (v < 0)
+                        return log_error_errno(v, "Failed to check for container: %m");
+                break;
+
+        case ONLY_CHROOT: /* prints nothing, only the return value reports the result */
+                r = running_in_chroot();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to check for chroot() environment: %m");
+                return !r;
+
+        case ONLY_PRIVATE_USERS: /* prints nothing, only the return value reports the result */
+                r = running_in_userns();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to check for user namespace: %m");
+                return !r;
+
+        case ONLY_CVM: /* uses its own string table, hence printed here and not by the common code below */
+                c = detect_confidential_virtualization();
+                if (c < 0)
+                        return log_error_errno(c, "Failed to check for confidential virtualization: %m");
+                if (!arg_quiet)
+                        puts(confidential_virtualization_to_string(c));
+                return c == CONFIDENTIAL_VIRTUALIZATION_NONE;
+
+        case ANY_VIRTUALIZATION:
+        default:
+                v = detect_virtualization();
+                if (v < 0)
+                        return log_error_errno(v, "Failed to check for virtualization: %m");
+                break;
+        }
+        /* Common tail of the cases that left their result in v. */
+        if (!arg_quiet)
+                puts(virtualization_to_string(v));
+
+        return v == VIRTUALIZATION_NONE;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/detect-virt/meson.build b/src/detect-virt/meson.build
new file mode 100644
index 0000000..7bcb298
--- /dev/null
+++ b/src/detect-virt/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-detect-virt',
+                'public' : true,
+                'sources' : files('detect-virt.c'),
+        },
+]
diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c
new file mode 100644
index 0000000..92432b6
--- /dev/null
+++ b/src/dissect/dissect.c
@@ -0,0 +1,1927 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-device.h"
+
+#include "architecture.h"
+#include "blockdev-util.h"
+#include "build.h"
+#include "chase.h"
+#include "copy.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "discover-image.h"
+#include "dissect-image.h"
+#include "env-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "log.h"
+#include "loop-util.h"
+#include "main-func.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "sha256.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+
+static enum {
+        ACTION_DISSECT,
+        ACTION_MOUNT,
+        ACTION_UMOUNT,
+        ACTION_ATTACH,
+        ACTION_DETACH,
+        ACTION_LIST,
+        ACTION_MTREE,
+        ACTION_WITH,
+        ACTION_COPY_FROM,
+        ACTION_COPY_TO,
+        ACTION_DISCOVER,
+        ACTION_VALIDATE,
+} arg_action = ACTION_DISSECT; /* the action used if no command switch is specified */
+static char *arg_image = NULL; /* path of the image to operate on */
+static char *arg_root = NULL; /* set instead of arg_image if a directory was specified (list, mtree, copy) */
+static char *arg_path = NULL; /* mount point path for the mount and umount actions */
+static const char *arg_source = NULL; /* points into argv[] or to a "-" literal, hence never freed */
+static const char *arg_target = NULL; /* points into argv[] or to a "-" literal, hence never freed */
+static DissectImageFlags arg_flags =
+        DISSECT_IMAGE_GENERIC_ROOT |
+        DISSECT_IMAGE_DISCARD_ON_LOOP |
+        DISSECT_IMAGE_RELAX_VAR_CHECK |
+        DISSECT_IMAGE_FSCK |
+        DISSECT_IMAGE_USR_NO_ROOT |
+        DISSECT_IMAGE_GROWFS |
+        DISSECT_IMAGE_PIN_PARTITION_DEVICES |
+        DISSECT_IMAGE_ADD_PARTITION_DEVICES;
+static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF;
+static PagerFlags arg_pager_flags = 0;
+static bool arg_legend = true; /* cleared by --no-legend */
+static bool arg_rmdir = false; /* set by --rmdir and -U */
+static bool arg_in_memory = false; /* set by --in-memory */
+static char **arg_argv = NULL; /* copy of the command line following the image path for --with */
+static char *arg_loop_ref = NULL; /* set by --loop-ref=, reset to NULL if that is passed an empty string */
+static ImagePolicy* arg_image_policy = NULL; /* NOTE(review): no destructor registered below, confirm it is released elsewhere */
+static bool arg_mtree_hash = true; /* set by --mtree-hash= */
+
+STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
+STATIC_DESTRUCTOR_REGISTER(arg_argv, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_loop_ref, freep);
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout, after opening the pager with arg_pager_flags. Returns 0 on success. */
+        pager_open(arg_pager_flags);
+
+        r = terminal_urlify_man("systemd-dissect", "1", &link);
+        if (r < 0)
+                return log_oom();
+        /* %1$s: program name, %2$s: man page link, %3$s/%4$s: underline on/off, %5$s/%6$s: highlight on/off */
+        printf("%1$s [OPTIONS...] IMAGE\n"
+               "%1$s [OPTIONS...] --mount IMAGE PATH\n"
+               "%1$s [OPTIONS...] --umount PATH\n"
+               "%1$s [OPTIONS...] --attach IMAGE\n"
+               "%1$s [OPTIONS...] --detach PATH\n"
+               "%1$s [OPTIONS...] --list IMAGE\n"
+               "%1$s [OPTIONS...] --mtree IMAGE\n"
+               "%1$s [OPTIONS...] --with IMAGE [COMMAND…]\n"
+               "%1$s [OPTIONS...] --copy-from IMAGE PATH [TARGET]\n"
+               "%1$s [OPTIONS...] --copy-to IMAGE [SOURCE] PATH\n"
+               "%1$s [OPTIONS...] --discover\n"
+               "%1$s [OPTIONS...] --validate IMAGE\n"
+               "\n%5$sDissect a Discoverable Disk Image (DDI).%6$s\n\n"
+               "%3$sOptions:%4$s\n"
+               "     --no-pager           Do not pipe output into a pager\n"
+               "     --no-legend          Do not show the headers and footers\n"
+               "  -r --read-only          Mount read-only\n"
+               "     --fsck=BOOL          Run fsck before mounting\n"
+               "     --growfs=BOOL        Grow file system to partition size, if marked\n"
+               "     --mkdir              Make mount directory before mounting, if missing\n"
+               "     --rmdir              Remove mount directory after unmounting\n"
+               "     --discard=MODE       Choose 'discard' mode (disabled, loop, all, crypt)\n"
+               "     --in-memory          Copy image into memory\n"
+               "     --root-hash=HASH     Specify root hash for verity\n"
+               "     --root-hash-sig=SIG  Specify pkcs7 signature of root hash for verity\n"
+               "                          as a DER encoded PKCS7, either as a path to a file\n"
+               "                          or as an ASCII base64 encoded string prefixed by\n"
+               "                          'base64:'\n"
+               "     --verity-data=PATH   Specify data file with hash tree for verity if it is\n"
+               "                          not embedded in IMAGE\n"
+               "     --image-policy=POLICY\n"
+               "                          Specify image dissection policy\n"
+               "     --json=pretty|short|off\n"
+               "                          Generate JSON output\n"
+               "     --loop-ref=NAME      Set reference string for loopback device\n"
+               "     --mtree-hash=BOOL    Whether to include SHA256 hash in the mtree output\n"
+               "\n%3$sCommands:%4$s\n"
+               "  -h --help               Show this help\n"
+               "     --version            Show package version\n"
+               "  -m --mount              Mount the image to the specified directory\n"
+               "  -M                      Shortcut for --mount --mkdir\n"
+               "  -u --umount             Unmount the image from the specified directory\n"
+               "  -U                      Shortcut for --umount --rmdir\n"
+               "     --attach             Attach the disk image to a loopback block device\n"
+               "     --detach             Detach a loopback block device again\n"
+               "  -l --list               List all the files and directories of the specified\n"
+               "                          OS image\n"
+               "     --mtree              Show BSD mtree manifest of OS image\n"
+               "     --with               Mount, run command, unmount\n"
+               "  -x --copy-from          Copy files from image to host\n"
+               "  -a --copy-to            Copy files from host to image\n"
+               "     --discover           Discover DDIs in well known directories\n"
+               "     --validate           Validate image and image policy\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+static int patch_argv(int *argc, char ***argv, char ***buf) {
+        _cleanup_free_ char **patched = NULL;
+        size_t n_args, with_pos, tail;
+
+        assert(argc);
+        assert(*argc >= 0);
+        assert(argv);
+        assert(*argv);
+        assert(buf);
+
+        /* getopt_long() must not interpret the switches of the command line that follows --with. Hence,
+         * if --with is given, splice a "--" in right behind it, which makes option parsing stop there. */
+
+        n_args = (size_t) *argc;
+        for (with_pos = 1; with_pos < n_args; with_pos++) {
+                assert((*argv)[with_pos]);
+
+                if (streq((*argv)[with_pos], "--with"))
+                        break;
+        }
+
+        /* Leave the command line alone if --with is absent, or if the "--" is there already. */
+        if (with_pos >= n_args || streq_ptr((*argv)[with_pos + 1], "--")) {
+                *buf = NULL;
+                return 0;
+        }
+
+        patched = new(char*, *argc + 2); /* room for the extra "--" and the trailing NULL entry */
+        if (!patched)
+                return log_oom();
+
+        tail = with_pos + 1; /* index of the first entry after --with */
+        memcpy(patched, *argv, sizeof(char*) * tail);                                  /* up to and including --with */
+        patched[tail] = (char*) "--";                                                  /* the inserted "--" */
+        memcpy(patched + tail + 1, *argv + tail, sizeof(char*) * (n_args - tail + 1)); /* the rest, plus trailing NULL */
+
+        (*argc)++;
+        (*argv) = patched;
+
+        *buf = TAKE_PTR(patched);
+        return 1;
+}
+
+static int parse_image_path_argument(const char *path, char **ret_root, char **ret_image) {
+        _cleanup_free_ char *resolved = NULL;
+        struct stat st;
+        int r;
+
+        assert(ret_image);
+
+        /* Directories are returned via ret_root (refused if that is NULL), anything else via ret_image. */
+        r = parse_path_argument(path, /* suppress_root= */ false, &resolved);
+        if (r < 0)
+                return r;
+
+        if (stat(resolved, &st) < 0)
+                return log_error_errno(errno, "Failed to stat %s: %m", resolved);
+
+        if (!S_ISDIR(st.st_mode))
+                *ret_image = TAKE_PTR(resolved);
+        else if (!ret_root)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not an image file.", resolved);
+        else
+                *ret_root = TAKE_PTR(resolved);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Fills in the arg_* globals. Returns 1 on success, 0 after e.g. --help or --discard=list, < 0 on error. */
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_NO_LEGEND,
+                ARG_WITH,
+                ARG_DISCARD,
+                ARG_FSCK,
+                ARG_GROWFS,
+                ARG_ROOT_HASH,
+                ARG_ROOT_HASH_SIG,
+                ARG_VERITY_DATA,
+                ARG_MKDIR,
+                ARG_RMDIR,
+                ARG_IN_MEMORY,
+                ARG_JSON,
+                ARG_MTREE,
+                ARG_DISCOVER,
+                ARG_ATTACH,
+                ARG_DETACH,
+                ARG_LOOP_REF,
+                ARG_IMAGE_POLICY,
+                ARG_VALIDATE,
+                ARG_MTREE_HASH,
+        };
+
+        static const struct option options[] = {
+                { "help",          no_argument,       NULL, 'h'               },
+                { "version",       no_argument,       NULL, ARG_VERSION       },
+                { "no-pager",      no_argument,       NULL, ARG_NO_PAGER      },
+                { "no-legend",     no_argument,       NULL, ARG_NO_LEGEND     },
+                { "mount",         no_argument,       NULL, 'm'               },
+                { "umount",        no_argument,       NULL, 'u'               },
+                { "attach",        no_argument,       NULL, ARG_ATTACH        },
+                { "detach",        no_argument,       NULL, ARG_DETACH        },
+                { "with",          no_argument,       NULL, ARG_WITH          },
+                { "read-only",     no_argument,       NULL, 'r'               },
+                { "discard",       required_argument, NULL, ARG_DISCARD       },
+                { "fsck",          required_argument, NULL, ARG_FSCK          },
+                { "growfs",        required_argument, NULL, ARG_GROWFS        },
+                { "root-hash",     required_argument, NULL, ARG_ROOT_HASH     },
+                { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
+                { "verity-data",   required_argument, NULL, ARG_VERITY_DATA   },
+                { "mkdir",         no_argument,       NULL, ARG_MKDIR         },
+                { "rmdir",         no_argument,       NULL, ARG_RMDIR         },
+                { "in-memory",     no_argument,       NULL, ARG_IN_MEMORY     },
+                { "list",          no_argument,       NULL, 'l'               },
+                { "mtree",         no_argument,       NULL, ARG_MTREE         },
+                { "copy-from",     no_argument,       NULL, 'x'               },
+                { "copy-to",       no_argument,       NULL, 'a'               },
+                { "json",          required_argument, NULL, ARG_JSON          },
+                { "discover",      no_argument,       NULL, ARG_DISCOVER      },
+                { "loop-ref",      required_argument, NULL, ARG_LOOP_REF      },
+                { "image-policy",  required_argument, NULL, ARG_IMAGE_POLICY  },
+                { "validate",      no_argument,       NULL, ARG_VALIDATE      },
+                { "mtree-hash",    required_argument, NULL, ARG_MTREE_HASH    },
+                {}
+        };
+
+        _cleanup_free_ char **buf = NULL; /* we use free(), not strv_free() here, as we don't copy the strings here */
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+        /* If --with is used, argc/argv are replaced by a copy that has "--" inserted after it, see patch_argv() */
+        r = patch_argv(&argc, &argv, &buf);
+        if (r < 0)
+                return r;
+
+        while ((c = getopt_long(argc, argv, "hmurMUlxa", options, NULL)) >= 0) {
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_NO_LEGEND:
+                        arg_legend = false;
+                        break;
+
+                case 'm':
+                        arg_action = ACTION_MOUNT;
+                        break;
+
+                case ARG_MKDIR:
+                        arg_flags |= DISSECT_IMAGE_MKDIR;
+                        break;
+
+                case 'M':
+                        /* Shortcut combination of the above two */
+                        arg_action = ACTION_MOUNT;
+                        arg_flags |= DISSECT_IMAGE_MKDIR;
+                        break;
+
+                case 'u':
+                        arg_action = ACTION_UMOUNT;
+                        break;
+
+                case ARG_RMDIR:
+                        arg_rmdir = true;
+                        break;
+
+                case 'U':
+                        /* Shortcut combination of the above two */
+                        arg_action = ACTION_UMOUNT;
+                        arg_rmdir = true;
+                        break;
+
+                case ARG_ATTACH:
+                        arg_action = ACTION_ATTACH;
+                        break;
+
+                case ARG_DETACH:
+                        arg_action = ACTION_DETACH;
+                        break;
+
+                case 'l':
+                        arg_action = ACTION_LIST;
+                        arg_flags |= DISSECT_IMAGE_READ_ONLY;
+                        break;
+
+                case ARG_MTREE:
+                        arg_action = ACTION_MTREE;
+                        arg_flags |= DISSECT_IMAGE_READ_ONLY;
+                        break;
+
+                case ARG_WITH:
+                        arg_action = ACTION_WITH;
+                        break;
+
+                case 'x':
+                        arg_action = ACTION_COPY_FROM;
+                        arg_flags |= DISSECT_IMAGE_READ_ONLY;
+                        break;
+
+                case 'a':
+                        arg_action = ACTION_COPY_TO;
+                        break;
+
+                case 'r':
+                        arg_flags |= DISSECT_IMAGE_READ_ONLY;
+                        break;
+
+                case ARG_DISCARD: {
+                        DissectImageFlags flags;
+
+                        if (streq(optarg, "disabled"))
+                                flags = 0;
+                        else if (streq(optarg, "loop"))
+                                flags = DISSECT_IMAGE_DISCARD_ON_LOOP;
+                        else if (streq(optarg, "all"))
+                                flags = DISSECT_IMAGE_DISCARD_ON_LOOP | DISSECT_IMAGE_DISCARD;
+                        else if (streq(optarg, "crypt"))
+                                flags = DISSECT_IMAGE_DISCARD_ANY;
+                        else if (streq(optarg, "list")) {
+                                puts("disabled\n"
+                                     "all\n"
+                                     "crypt\n"
+                                     "loop");
+                                return 0;
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unknown --discard= parameter: %s",
+                                                       optarg);
+                        arg_flags = (arg_flags & ~DISSECT_IMAGE_DISCARD_ANY) | flags;
+
+                        break;
+                }
+
+                case ARG_IN_MEMORY:
+                        arg_in_memory = true;
+                        break;
+
+                case ARG_ROOT_HASH: {
+                        _cleanup_free_ void *p = NULL;
+                        size_t l;
+
+                        r = unhexmem(optarg, strlen(optarg), &p, &l);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse root hash '%s': %m", optarg);
+                        if (l < sizeof(sd_id128_t))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Root hash must be at least 128-bit long: %s", optarg);
+
+                        free_and_replace(arg_verity_settings.root_hash, p);
+                        arg_verity_settings.root_hash_size = l;
+                        break;
+                }
+
+                case ARG_ROOT_HASH_SIG: {
+                        char *value;
+                        size_t l;
+                        void *p;
+                        /* Either base64 data specified inline with a "base64:" prefix, or a file to read it from */
+                        if ((value = startswith(optarg, "base64:"))) {
+                                r = unbase64mem(value, strlen(value), &p, &l);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
+                        } else {
+                                r = read_full_file(optarg, (char**) &p, &l);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to read root hash signature file '%s': %m", optarg);
+                        }
+
+                        free_and_replace(arg_verity_settings.root_hash_sig, p);
+                        arg_verity_settings.root_hash_sig_size = l;
+                        break;
+                }
+
+                case ARG_VERITY_DATA:
+                        r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_FSCK:
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --fsck= parameter: %s", optarg);
+
+                        SET_FLAG(arg_flags, DISSECT_IMAGE_FSCK, r);
+                        break;
+
+                case ARG_GROWFS:
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --growfs= parameter: %s", optarg);
+
+                        SET_FLAG(arg_flags, DISSECT_IMAGE_GROWFS, r);
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+
+                        break;
+
+                case ARG_DISCOVER:
+                        arg_action = ACTION_DISCOVER;
+                        break;
+
+                case ARG_LOOP_REF:
+                        if (isempty(optarg)) {
+                                arg_loop_ref = mfree(arg_loop_ref);
+                                break;
+                        }
+
+                        if (strlen(optarg) >= sizeof_field(struct loop_info64, lo_file_name))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Loop device ref string '%s' is too long.", optarg);
+
+                        r = free_and_strdup_warn(&arg_loop_ref, optarg);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE_POLICY:
+                        r = parse_image_policy_argument(optarg, &arg_image_policy);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_VALIDATE:
+                        arg_action = ACTION_VALIDATE;
+                        break;
+
+                case ARG_MTREE_HASH:
+                        r = parse_boolean_argument("--mtree-hash=", optarg, &arg_mtree_hash);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+        /* Now check the number of positional arguments and store them, depending on the selected action */
+        switch (arg_action) {
+
+        case ACTION_DISSECT:
+                if (optind + 1 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file path as only argument.");
+
+                r = parse_image_path_argument(argv[optind], NULL, &arg_image);
+                if (r < 0)
+                        return r;
+
+                /* when dumping image info be even more liberal than otherwise, do not even require a single valid partition */
+                arg_flags |= DISSECT_IMAGE_READ_ONLY|DISSECT_IMAGE_ALLOW_EMPTY;
+                break;
+
+        case ACTION_MOUNT:
+                if (optind + 2 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file path and mount point path as only arguments.");
+
+                r = parse_image_path_argument(argv[optind], NULL, &arg_image);
+                if (r < 0)
+                        return r;
+
+                r = parse_path_argument(argv[optind+1], /* suppress_root= */ false, &arg_path);
+                if (r < 0)
+                        return r;
+
+                arg_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
+                break;
+
+        case ACTION_UMOUNT:
+                if (optind + 1 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected a mount point path as only argument.");
+
+                r = parse_path_argument(argv[optind], /* suppress_root= */ false, &arg_path);
+                if (r < 0)
+                        return r;
+                break;
+
+        case ACTION_ATTACH:
+                if (optind + 1 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file path as only argument.");
+
+                r = parse_image_path_argument(argv[optind], NULL, &arg_image);
+                if (r < 0)
+                        return r;
+                break;
+
+        case ACTION_DETACH:
+                if (optind + 1 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file path or loopback device as only argument.");
+
+                r = parse_image_path_argument(argv[optind], NULL, &arg_image);
+                if (r < 0)
+                        return r;
+                break;
+
+        case ACTION_LIST:
+        case ACTION_MTREE:
+                if (optind + 1 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file or directory path as only argument.");
+
+                r = parse_image_path_argument(argv[optind], &arg_root, &arg_image);
+                if (r < 0)
+                        return r;
+
+                arg_flags |= DISSECT_IMAGE_READ_ONLY | DISSECT_IMAGE_REQUIRE_ROOT;
+                break;
+
+        case ACTION_COPY_FROM:
+                if (argc < optind + 2 || argc > optind + 3)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file or directory path, a source path and an optional destination path as only arguments.");
+
+                r = parse_image_path_argument(argv[optind], &arg_root, &arg_image);
+                if (r < 0)
+                        return r;
+                arg_source = argv[optind + 1];
+                arg_target = argc > optind + 2 ? argv[optind + 2] : "-" /* this means stdout */ ;
+
+                arg_flags |= DISSECT_IMAGE_READ_ONLY | DISSECT_IMAGE_REQUIRE_ROOT;
+                break;
+
+        case ACTION_COPY_TO:
+                if (argc < optind + 2 || argc > optind + 3)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file or directory path, an optional source path and a destination path as only arguments.");
+
+                r = parse_image_path_argument(argv[optind], &arg_root, &arg_image);
+                if (r < 0)
+                        return r;
+
+                if (argc > optind + 2) {
+                        arg_source = argv[optind + 1];
+                        arg_target = argv[optind + 2];
+                } else {
+                        arg_source = "-"; /* this means stdin */
+                        arg_target = argv[optind + 1];
+                }
+
+                arg_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
+                break;
+
+        case ACTION_WITH:
+                if (optind >= argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file path and an optional command line.");
+
+                r = parse_image_path_argument(argv[optind], NULL, &arg_image);
+                if (r < 0)
+                        return r;
+                /* Whatever follows the image path is the command line to run; arg_argv stays NULL if there is none */
+                if (argc > optind + 1) {
+                        arg_argv = strv_copy(argv + optind + 1);
+                        if (!arg_argv)
+                                return log_oom();
+                }
+
+                break;
+
+        case ACTION_DISCOVER:
+                if (optind != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected no argument.");
+                break;
+
+        case ACTION_VALIDATE:
+                if (optind + 1 != argc)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Expected an image file path as only argument.");
+
+                r = parse_image_path_argument(argv[optind], NULL, &arg_image);
+                if (r < 0)
+                        return r;
+
+                arg_flags |= DISSECT_IMAGE_READ_ONLY;
+                arg_flags &= ~(DISSECT_IMAGE_PIN_PARTITION_DEVICES|DISSECT_IMAGE_ADD_PARTITION_DEVICES);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 1;
+}
+
+static int parse_argv_as_mount_helper(int argc, char *argv[]) {
+        const char *mount_options = NULL;
+        bool dry_run = false;
+        int opt, r;
+
+        /* Implements util-linux "external helper" command line interface, as per mount(8) man page.
+         * Returns 1 if a mount shall be done, 0 if -f was given, and a negative errno on failure. */
+
+        while ((opt = getopt(argc, argv, "sfnvN:o:t:")) >= 0) {
+                switch (opt) {
+
+                case 'f':
+                        dry_run = true;
+                        break;
+
+                case 'o':
+                        mount_options = optarg;
+                        break;
+
+                case 't':
+                        if (!streq(optarg, "ddi"))
+                                log_debug("Unexpected file system type '%s', ignoring.", optarg);
+                        break;
+
+                case 'N': /* aka --namespace= */
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Option -%c is not implemented, refusing.", opt);
+
+                case 's': /* sloppy mount options */
+                case 'n': /* aka --no-mtab */
+                case 'v': /* aka --verbose */
+                        log_debug("Ignoring option -%c, not implemented.", opt);
+                        break;
+
+                case '?':
+                        return -EINVAL;
+                }
+        }
+
+        if (optind + 2 != argc)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Expected an image file path and target directory as only argument.");
+
+        /* Translate the comma separated -o mount options into dissection flags, refusing unknown ones. */
+        for (const char *cursor = mount_options;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&cursor, &word, ",", EXTRACT_KEEP_QUOTE);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract mount option: %m");
+                if (r == 0)
+                        break;
+
+                if (streq(word, "ro") || streq(word, "rw"))
+                        SET_FLAG(arg_flags, DISSECT_IMAGE_READ_ONLY, streq(word, "ro"));
+                else if (streq(word, "discard") || streq(word, "nodiscard"))
+                        SET_FLAG(arg_flags, DISSECT_IMAGE_DISCARD_ANY, streq(word, "discard"));
+                else
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Unknown mount option '%s'.", word);
+        }
+
+        /* With -f we stop once the command line is validated, without selecting the mount action. */
+        if (dry_run)
+                return 0;
+
+        r = parse_path_argument(argv[optind], /* suppress_root= */ false, &arg_image);
+        if (r < 0)
+                return r;
+
+        r = parse_path_argument(argv[optind+1], /* suppress_root= */ false, &arg_path);
+        if (r < 0)
+                return r;
+
+        /* From here on this is handled like a regular mount action. */
+        arg_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
+        arg_action = ACTION_MOUNT;
+        return 1;
+}
+
+static void strv_pair_print(char **l, const char *prefix) {
+        assert(prefix);
+
+        /* Only the first pair carries the prefix; later pairs get blank padding of the same width. */
+        STRV_FOREACH_PAIR(key, value, l) {
+                const char *label = key == l ? prefix : "";
+                printf("%*s %s=%s\n", (int) strlen(prefix), label, *key, *value);
+        }
+}
+
+static int get_extension_scopes(DissectedImage *m, ImageClass class, char ***ret_scopes) {
+        _cleanup_strv_free_ char **scopes = NULL;
+        const char *value, *key;
+        char **release;
+
+        assert(m);
+        assert(ret_scopes);
+
+        /* Determines the scopes an extension image is intended for. Returns 1 and a newly allocated
+         * list on success, 0 and a NULL list if the image has no extension-release data for the given
+         * class, -EINVAL for classes other than sysext/confext, and -ENOMEM if the list cannot be
+         * built. */
+
+        if (class == IMAGE_SYSEXT) {
+                release = m->sysext_release;
+                key = "SYSEXT_SCOPE";
+        } else if (class == IMAGE_CONFEXT) {
+                release = m->confext_release;
+                key = "CONFEXT_SCOPE";
+        } else
+                return -EINVAL;
+
+        /* If there's no extension-release file it's not an extension image of this class, and there
+         * are no scopes to report. */
+
+        if (!release) {
+                *ret_scopes = NULL;
+                return 0;
+        }
+
+        /* Otherwise the SYSEXT_SCOPE field (sysext) or CONFEXT_SCOPE field (confext) lists the scopes,
+         * separated by whitespace. If the field is unset, default to "system" + "portable". */
+
+        value = strv_env_pairs_get(release, key);
+        if (value)
+                scopes = strv_split(value, WHITESPACE);
+        else
+                scopes = strv_new("system", "portable");
+        if (!scopes)
+                return -ENOMEM;
+
+        *ret_scopes = TAKE_PTR(scopes);
+        return 1;
+}
+
+static int action_dissect(DissectedImage *m, LoopDevice *d) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(table_unrefp) Table *t = NULL;
+        _cleanup_free_ char *bn = NULL;
+        uint64_t size = UINT64_MAX;
+        int r;
+
+        assert(m);
+        assert(d);
+        /* Shows what was found out about the image: general properties and OS metadata first, then a table of the partitions found, as text or JSON depending on arg_json_format_flags. */
+        r = path_extract_filename(arg_image, &bn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract file name from image path '%s': %m", arg_image);
+
+        if (arg_json_format_flags & (JSON_FORMAT_OFF|JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO))
+                pager_open(arg_pager_flags);
+
+        if (arg_json_format_flags & JSON_FORMAT_OFF)
+                printf("      Name: %s%s%s\n", ansi_highlight(), bn, ansi_normal());
+
+        if (ioctl(d->fd, BLKGETSIZE64, &size) < 0)
+                log_debug_errno(errno, "Failed to query size of loopback device: %m");
+        else if (arg_json_format_flags & JSON_FORMAT_OFF)
+                printf("      Size: %s\n", FORMAT_BYTES(size));
+
+        if (arg_json_format_flags & JSON_FORMAT_OFF) {
+                printf(" Sec. Size: %" PRIu32 "\n", m->sector_size);
+
+                printf("     Arch.: %s\n",
+                       strna(architecture_to_string(dissected_image_architecture(m))));
+
+                putc('\n', stdout);
+                fflush(stdout);
+        }
+        /* Note that in the three cases that are merely warned about below neither the textual nor the JSON metadata is generated, only the partition table further down. */
+        r = dissected_image_acquire_metadata(m, 0);
+        if (r == -ENXIO)
+                return log_error_errno(r, "No root partition discovered.");
+        if (r == -EUCLEAN)
+                return log_error_errno(r, "File system check of image failed.");
+        if (r == -EMEDIUMTYPE)
+                log_warning_errno(r, "Not a valid OS image, no os-release file included. Proceeding anyway.");
+        else if (r == -EUNATCH)
+                log_warning_errno(r, "OS image is encrypted, proceeding without showing OS image metadata.");
+        else if (r == -EBUSY)
+                log_warning_errno(r, "OS image is currently in use, proceeding without showing OS image metadata.");
+        else if (r < 0)
+                return log_error_errno(r, "Failed to acquire image metadata: %m");
+        else if (arg_json_format_flags & JSON_FORMAT_OFF) {
+
+                if (!sd_id128_is_null(m->image_uuid))
+                        printf("Image UUID: %s\n", SD_ID128_TO_UUID_STRING(m->image_uuid));
+
+                if (m->hostname)
+                        printf("  Hostname: %s\n", m->hostname);
+
+                if (!sd_id128_is_null(m->machine_id))
+                        printf("Machine ID: " SD_ID128_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(m->machine_id));
+
+                strv_pair_print(m->machine_info,
+                               "Mach. Info:");
+                strv_pair_print(m->os_release,
+                               "OS Release:");
+                strv_pair_print(m->initrd_release,
+                               " initrd R.:");
+                strv_pair_print(m->sysext_release,
+                               " sysext R.:");
+                strv_pair_print(m->confext_release,
+                               "confext R.:");
+
+                if (m->hostname ||
+                    !sd_id128_is_null(m->machine_id) ||
+                    !strv_isempty(m->machine_info) ||
+                    !strv_isempty(m->os_release) ||
+                    !strv_isempty(m->initrd_release) ||
+                    !strv_isempty(m->sysext_release) ||
+                    !strv_isempty(m->confext_release))
+                        putc('\n', stdout);
+
+                printf("    Use As: %s bootable system for UEFI\n",
+                       COLOR_MARK_BOOL(dissected_image_is_bootable_uefi(m)));
+                printf("            %s bootable system for container\n",
+                       COLOR_MARK_BOOL(dissected_image_is_bootable_os(m)));
+                printf("            %s portable service\n",
+                       COLOR_MARK_BOOL(dissected_image_is_portable(m)));
+                printf("            %s initrd\n",
+                       COLOR_MARK_BOOL(dissected_image_is_initrd(m)));
+
+                for (ImageClass c = _IMAGE_CLASS_EXTENSION_FIRST; c <= _IMAGE_CLASS_EXTENSION_LAST; c++) {
+                        const char *string_class = image_class_to_string(c);
+                        _cleanup_strv_free_ char **extension_scopes = NULL;
+
+                        r = get_extension_scopes(m, c, &extension_scopes);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse scopes: %m");
+
+                        printf("            %s %s for system\n",
+                               COLOR_MARK_BOOL(strv_contains(extension_scopes, "system")), string_class);
+                        printf("            %s %s for portable service\n",
+                               COLOR_MARK_BOOL(strv_contains(extension_scopes, "portable")), string_class);
+                        printf("            %s %s for initrd\n",
+                               COLOR_MARK_BOOL(strv_contains(extension_scopes, "initrd")), string_class);
+                }
+
+                putc('\n', stdout);
+        } else {
+                _cleanup_strv_free_ char **sysext_scopes = NULL, **confext_scopes = NULL;
+
+                r = get_extension_scopes(m, IMAGE_SYSEXT, &sysext_scopes);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse sysext scopes: %m");
+
+                r = get_extension_scopes(m, IMAGE_CONFEXT, &confext_scopes);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse confext scopes: %m");
+
+                Architecture a = dissected_image_architecture(m);
+
+                r = json_build(&v, JSON_BUILD_OBJECT(
+                                               JSON_BUILD_PAIR("name", JSON_BUILD_STRING(bn)),
+                                               JSON_BUILD_PAIR_CONDITION(size != UINT64_MAX, "size", JSON_BUILD_INTEGER(size)),
+                                               JSON_BUILD_PAIR("sectorSize", JSON_BUILD_INTEGER(m->sector_size)),
+                                               JSON_BUILD_PAIR_CONDITION(a >= 0, "architecture", JSON_BUILD_STRING(architecture_to_string(a))),
+                                               JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(m->image_uuid), "imageUuid", JSON_BUILD_UUID(m->image_uuid)),
+                                               JSON_BUILD_PAIR_CONDITION(m->hostname, "hostname", JSON_BUILD_STRING(m->hostname)),
+                                               JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(m->machine_id), "machineId", JSON_BUILD_ID128(m->machine_id)),
+                                               JSON_BUILD_PAIR_CONDITION(!strv_isempty(m->machine_info), "machineInfo", JSON_BUILD_STRV_ENV_PAIR(m->machine_info)),
+                                               JSON_BUILD_PAIR_CONDITION(!strv_isempty(m->os_release), "osRelease", JSON_BUILD_STRV_ENV_PAIR(m->os_release)),
+                                               JSON_BUILD_PAIR_CONDITION(!strv_isempty(m->initrd_release), "initrdRelease", JSON_BUILD_STRV_ENV_PAIR(m->initrd_release)),
+                                               JSON_BUILD_PAIR_CONDITION(!strv_isempty(m->sysext_release), "sysextRelease", JSON_BUILD_STRV_ENV_PAIR(m->sysext_release)),
+                                               JSON_BUILD_PAIR_CONDITION(!strv_isempty(m->confext_release), "confextRelease", JSON_BUILD_STRV_ENV_PAIR(m->confext_release)),
+                                               JSON_BUILD_PAIR("useBootableUefi", JSON_BUILD_BOOLEAN(dissected_image_is_bootable_uefi(m))),
+                                               JSON_BUILD_PAIR("useBootableContainer", JSON_BUILD_BOOLEAN(dissected_image_is_bootable_os(m))),
+                                               JSON_BUILD_PAIR("useInitrd", JSON_BUILD_BOOLEAN(dissected_image_is_initrd(m))),
+                                               JSON_BUILD_PAIR("usePortableService", JSON_BUILD_BOOLEAN(dissected_image_is_portable(m))),
+                                               JSON_BUILD_PAIR("useSystemExtension", JSON_BUILD_BOOLEAN(strv_contains(sysext_scopes, "system"))),
+                                               JSON_BUILD_PAIR("useInitRDSystemExtension", JSON_BUILD_BOOLEAN(strv_contains(sysext_scopes, "initrd"))),
+                                               JSON_BUILD_PAIR("usePortableSystemExtension", JSON_BUILD_BOOLEAN(strv_contains(sysext_scopes, "portable"))),
+                                               JSON_BUILD_PAIR("useConfigurationExtension", JSON_BUILD_BOOLEAN(strv_contains(confext_scopes, "system"))),
+                                               JSON_BUILD_PAIR("useInitRDConfigurationExtension", JSON_BUILD_BOOLEAN(strv_contains(confext_scopes, "initrd"))),
+                                               JSON_BUILD_PAIR("usePortableConfigurationExtension", JSON_BUILD_BOOLEAN(strv_contains(confext_scopes, "portable")))));
+                if (r < 0)
+                        return log_oom();
+        }
+        /* The partition table is generated in both output modes */
+        t = table_new("rw", "designator", "partition uuid", "partition label", "fstype", "architecture", "verity", "growfs", "node", "partno");
+        if (!t)
+                return log_oom();
+
+        table_set_ersatz_string(t, TABLE_ERSATZ_DASH);
+        (void) table_set_align_percent(t, table_get_cell(t, 0, 9), 100);
+
+        for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) {
+                DissectedPartition *p = m->partitions + i;
+
+                if (!p->found)
+                        continue;
+
+                r = table_add_many(
+                                t,
+                                TABLE_STRING, p->rw ? "rw" : "ro",
+                                TABLE_STRING, partition_designator_to_string(i));
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                if (sd_id128_is_null(p->uuid))
+                        r = table_add_cell(t, NULL, TABLE_EMPTY, NULL);
+                else
+                        r = table_add_cell(t, NULL, TABLE_UUID, &p->uuid);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                r = table_add_many(
+                                t,
+                                TABLE_STRING, p->label,
+                                TABLE_STRING, p->fstype,
+                                TABLE_STRING, architecture_to_string(p->architecture));
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                if (arg_verity_settings.data_path)
+                        r = table_add_cell(t, NULL, TABLE_STRING, "external");
+                else if (dissected_image_verity_candidate(m, i))
+                        r = table_add_cell(t, NULL, TABLE_STRING,
+                                           dissected_image_verity_sig_ready(m, i) ? "signed" :
+                                           yes_no(dissected_image_verity_ready(m, i)));
+                else
+                        r = table_add_cell(t, NULL, TABLE_EMPTY, NULL);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                r = table_add_many(t, TABLE_BOOLEAN, (int) p->growfs);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                if (p->partno < 0) /* no partition table, naked file system */ {
+                        r = table_add_cell(t, NULL, TABLE_PATH_BASENAME, arg_image);
+                        if (r < 0)
+                                return table_log_add_error(r);
+
+                        r = table_add_cell(t, NULL, TABLE_EMPTY, NULL);
+                } else {
+                        r = table_add_cell(t, NULL, TABLE_STRING, p->node);
+                        if (r < 0)
+                                return table_log_add_error(r);
+
+                        r = table_add_cell(t, NULL, TABLE_INT, &p->partno);
+                }
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (arg_json_format_flags & JSON_FORMAT_OFF) {
+                (void) table_set_header(t, arg_legend);
+
+                r = table_print(t, NULL);
+                if (r < 0)
+                        return table_log_print_error(r);
+        } else {
+                _cleanup_(json_variant_unrefp) JsonVariant *jt = NULL;
+
+                r = table_to_json(t, &jt);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to convert table to JSON: %m");
+
+                r = json_variant_set_field(&v, "mounts", jt);
+                if (r < 0)
+                        return log_oom();
+
+                json_variant_dump(v, arg_json_format_flags, stdout, NULL);
+        }
+
+        return 0;
+}
+
+static int action_mount(DissectedImage *m, LoopDevice *d) {
+        int r;
+
+        assert(m);
+        assert(d);
+        assert(arg_action == ACTION_MOUNT);
+
+        /* Mounts the dissected image on arg_path and then lets go of the underlying devices. Returns 0 on
+         * success, otherwise the error of the first step that failed. */
+
+        r = dissected_image_mount_and_warn(
+                        m, arg_path,
+                        /* uid_shift= */ UID_INVALID, /* uid_range= */ UID_INVALID, /* userns_fd= */ -EBADF,
+                        arg_flags);
+        if (r < 0)
+                return r;
+
+        /* Drop the lock held on the loopback device */
+        r = loop_device_flock(d, LOCK_UN);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock loopback block device: %m");
+
+        r = dissected_image_relinquish(m);
+        if (r >= 0)
+                return 0;
+        return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+}
+
+static int list_print_item(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+        /* recurse_dir() callback of the list action: prints one path per line, with a grey "/" appended when a directory is entered. */
+        assert(path);
+
+        if (event == RECURSE_DIR_ENTRY)
+                printf("%s\n", path);
+        else if (event == RECURSE_DIR_ENTER)
+                printf("%s%s/%s\n", path, ansi_grey(), ansi_normal());
+        /* Every other event is ignored silently; the walk always goes on. */
+        return RECURSE_DIR_CONTINUE;
+}
+
+static int get_file_sha256(int inode_fd, uint8_t ret[static SHA256_DIGEST_SIZE]) {
+        _cleanup_close_ int readable_fd = -EBADF;
+        struct sha256_ctx hash_ctx;
+        uint8_t chunk[64 * 1024];
+        ssize_t got;
+
+        /* Computes the SHA256 digest of the file inode_fd refers to and stores it in ret. Returns 0 on
+         * success, or a negative errno-style error if the file cannot be reopened or read. */
+
+        /* inode_fd may be an O_PATH fd, hence acquire a readable fd for the same inode first */
+        readable_fd = fd_reopen(inode_fd, O_RDONLY|O_CLOEXEC);
+        if (readable_fd < 0)
+                return readable_fd;
+
+        /* Hashing can take a while, hence flush what has been written to stdout so far, so that the user
+         * can see which file we are busy with. */
+        fflush(stdout);
+
+        sha256_init_ctx(&hash_ctx);
+
+        /* Feed the file into the hash in chunks, until read() reports EOF (0) or an error (< 0) */
+        while ((got = read(readable_fd, chunk, sizeof(chunk))) > 0)
+                sha256_process_bytes(chunk, got, &hash_ctx);
+
+        if (got < 0)
+                return -errno;
+
+        sha256_finish_ctx(&hash_ctx, ret);
+        return 0;
+}
+
+static const char *pick_color_for_uid_gid(uid_t uid) {
+        /* Picks the ANSI color sequence an owner UID or GID is printed in; the first matching rule wins:
+         * 'nobody' should never own files (though userns mapping can make it appear so), system users
+         * such as root are what disk images typically use, dynamic users must never own files
+         * persistently, and anything else is either in the container range or simply highlighted. */
+
+        return uid == UID_NOBODY     ? ansi_highlight_yellow4() :
+               uid_is_system(uid)    ? ansi_normal() :
+               uid_is_dynamic(uid)   ? ansi_highlight_red() :
+               uid_is_container(uid) ? ansi_highlight_cyan() :
+                                       ansi_highlight();
+}
+
+static int mtree_print_item(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        bool has_type, is_dir, is_link, is_device;
+        _cleanup_free_ char *escaped = NULL;
+        int r;
+
+        /* recurse_dir() callback of the mtree action: prints one line per inode, made up of the escaped
+         * path and those "key=value" fields the statx() data in sx has information for. Lets the walk
+         * continue in all cases, except when memory allocation fails. */
+
+        assert(path);
+
+        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
+                return RECURSE_DIR_CONTINUE;
+
+        assert(sx);
+
+        /* The file type bits of stx_mode are only looked at if statx() actually reported them */
+        has_type = FLAGS_SET(sx->stx_mask, STATX_TYPE);
+        is_dir = has_type && S_ISDIR(sx->stx_mode);
+        is_link = has_type && S_ISLNK(sx->stx_mode);
+        is_device = has_type && (S_ISBLK(sx->stx_mode) || S_ISCHR(sx->stx_mode));
+
+        if (!isempty(path)) {
+                /* BSD mtree uses either C or octal escaping, and covers whitespace, comments and glob
+                 * characters. We use C style escaping and follow suit */
+                path = escaped = xescape(path, WHITESPACE COMMENTS GLOB_CHARS);
+                if (!escaped)
+                        return log_oom();
+        } else
+                path = ".";
+
+        printf("%s", path);
+
+        if (has_type) {
+                const char *type_color, *type_name;
+
+                if (is_dir)
+                        printf("%s/%s", ansi_grey(), ansi_normal());
+
+                type_color =
+                        is_dir ? ansi_highlight_blue() :
+                        is_link ? ansi_highlight_cyan() :
+                        (S_ISFIFO(sx->stx_mode) || is_device) ? ansi_highlight_yellow4() :
+                        S_ISSOCK(sx->stx_mode) ? ansi_highlight_magenta() : "";
+
+                /* NULL would mean a file type not covered below; ASSERT_PTR() is applied to catch that */
+                type_name = ASSERT_PTR(
+                                is_dir ? "dir" :
+                                S_ISREG(sx->stx_mode) ? "file" :
+                                is_link ? "link" :
+                                S_ISFIFO(sx->stx_mode) ? "fifo" :
+                                S_ISBLK(sx->stx_mode) ? "block" :
+                                S_ISCHR(sx->stx_mode) ? "char" :
+                                S_ISSOCK(sx->stx_mode) ? "socket" : NULL);
+
+                printf(" %stype=%s%s%s%s",
+                       ansi_grey(), ansi_normal(), type_color, type_name, ansi_normal());
+        }
+
+        /* Symlinks are shown without an access mode */
+        if (FLAGS_SET(sx->stx_mask, STATX_MODE) && !is_link)
+                printf(" %smode=%s%04o", ansi_grey(), ansi_normal(), (unsigned) (sx->stx_mode & 0777));
+
+        if (FLAGS_SET(sx->stx_mask, STATX_UID))
+                printf(" %suid=%s" UID_FMT "%s",
+                       ansi_grey(), pick_color_for_uid_gid(sx->stx_uid), sx->stx_uid, ansi_normal());
+
+        if (FLAGS_SET(sx->stx_mask, STATX_GID))
+                printf(" %sgid=%s" GID_FMT "%s",
+                       ansi_grey(), pick_color_for_uid_gid(sx->stx_gid), sx->stx_gid, ansi_normal());
+
+        /* Size and (optionally) checksum are shown for regular files only */
+        if (FLAGS_SET(sx->stx_mask, STATX_TYPE|STATX_SIZE) && S_ISREG(sx->stx_mode)) {
+                printf(" %ssize=%s%" PRIu64, ansi_grey(), ansi_normal(), (uint64_t) sx->stx_size);
+
+                /* Empty files are not hashed, and neither is anything we have no inode fd for */
+                if (arg_mtree_hash && inode_fd >= 0 && sx->stx_size > 0) {
+                        uint8_t digest[SHA256_DIGEST_SIZE];
+
+                        r = get_file_sha256(inode_fd, digest);
+                        if (r >= 0) {
+                                _cleanup_free_ char *hex = NULL;
+
+                                hex = hexmem(digest, sizeof(digest));
+                                if (!hex)
+                                        return log_oom();
+
+                                printf(" %ssha256sum=%s%s", ansi_grey(), ansi_normal(), hex);
+                        } else
+                                log_warning_errno(r, "Failed to calculate file SHA256 sum for '%s', ignoring: %m", path);
+                }
+        }
+
+        if (is_link && inode_fd >= 0) {
+                _cleanup_free_ char *target = NULL;
+
+                r = readlinkat_malloc(inode_fd, "", &target);
+                if (r >= 0) {
+                        _cleanup_free_ char *target_escaped = NULL;
+
+                        /* The link target is escaped the same way as the path */
+                        target_escaped = xescape(target, WHITESPACE COMMENTS GLOB_CHARS);
+                        if (!target_escaped)
+                                return log_oom();
+
+                        printf(" %slink=%s%s", ansi_grey(), ansi_normal(), target_escaped);
+                } else
+                        log_warning_errno(r, "Failed to read symlink '%s', ignoring: %m", path);
+        }
+
+        if (is_device)
+                printf(" %sdevice=%slinux,%" PRIu64 ",%" PRIu64,
+                       ansi_grey(),
+                       ansi_normal(),
+                       (uint64_t) sx->stx_rdev_major,
+                       (uint64_t) sx->stx_rdev_minor);
+
+        printf("\n");
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+static int action_list_or_mtree_or_copy(DissectedImage *m, LoopDevice *d) {
+        _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL;
+        _cleanup_(rmdir_and_freep) char *created_dir = NULL;
+        _cleanup_free_ char *temp = NULL;
+        const char *root;
+        int r;
+
+        assert(IN_SET(arg_action, ACTION_LIST, ACTION_MTREE, ACTION_COPY_FROM, ACTION_COPY_TO));
+        /* Lists or copies files. If an image was specified (arg_image) it is first mounted on a temporary directory after detaching the mount namespace; otherwise arg_root serves as the root directory. */
+        if (arg_image) {
+                assert(m);
+                assert(d);
+
+                r = detach_mount_namespace();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to detach mount namespace: %m");
+
+                r = tempfn_random_child(NULL, program_invocation_short_name, &temp);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate temporary mount directory: %m");
+
+                r = mkdir_p(temp, 0700);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create mount point: %m");
+
+                created_dir = TAKE_PTR(temp);
+
+                r = dissected_image_mount_and_warn(
+                                m,
+                                created_dir,
+                                /* uid_shift= */ UID_INVALID,
+                                /* uid_range= */ UID_INVALID,
+                                /* userns_fd= */ -EBADF,
+                                arg_flags);
+                if (r < 0)
+                        return r;
+
+                mounted_dir = TAKE_PTR(created_dir);
+
+                r = loop_device_flock(d, LOCK_UN);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to unlock loopback block device: %m");
+
+                r = dissected_image_relinquish(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+        }
+
+        root = mounted_dir ?: arg_root;
+
+        switch (arg_action) {
+
+        case ACTION_COPY_FROM: {
+                _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF;
+
+                source_fd = chase_and_open(arg_source, root, CHASE_PREFIX_ROOT|CHASE_WARN, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL);
+                if (source_fd < 0)
+                        return log_error_errno(source_fd, "Failed to open source path '%s' in image '%s': %m", arg_source, arg_image);
+
+                /* Copying to stdout? */
+                if (streq(arg_target, "-")) {
+                        r = copy_bytes(source_fd, STDOUT_FILENO, UINT64_MAX, COPY_REFLINK);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to copy bytes from %s in image '%s' to stdout: %m", arg_source, arg_image);
+
+                        /* When we copy to stdout we don't copy any attributes (i.e. no access mode, no ownership, no xattr, no times) */
+                        return 0;
+                }
+
+                /* Try to copy as directory? */
+                r = copy_directory_at(source_fd, NULL, AT_FDCWD, arg_target, COPY_REFLINK|COPY_MERGE_EMPTY|COPY_SIGINT|COPY_HARDLINKS);
+                if (r >= 0)
+                        return 0;
+                if (r != -ENOTDIR)
+                        return log_error_errno(r, "Failed to copy %s in image '%s' to '%s': %m", arg_source, arg_image, arg_target);
+
+                r = fd_verify_regular(source_fd);
+                if (r == -EISDIR)
+                        return log_error_errno(r, "Target '%s' exists already and is not a directory.", arg_target);
+                if (r < 0)
+                        return log_error_errno(r, "Source path %s in image '%s' is neither regular file nor directory, refusing: %m", arg_source, arg_image);
+
+                /* Nah, it's a plain file! */
+                target_fd = open(arg_target, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0600);
+                if (target_fd < 0)
+                        return log_error_errno(errno, "Failed to create regular file at target path '%s': %m", arg_target);
+
+                r = copy_bytes(source_fd, target_fd, UINT64_MAX, COPY_REFLINK);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to copy bytes from %s in image '%s' to '%s': %m", arg_source, arg_image, arg_target);
+
+                (void) copy_xattr(source_fd, NULL, target_fd, NULL, 0);
+                (void) copy_access(source_fd, target_fd);
+                (void) copy_times(source_fd, target_fd, 0);
+
+                /* When this is a regular file we don't copy ownership! */
+                return 0;
+        }
+
+        case ACTION_COPY_TO: {
+                _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF, dfd = -EBADF;
+                _cleanup_free_ char *dn = NULL, *bn = NULL;
+                bool is_dir;
+
+                r = path_extract_directory(arg_target, &dn);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract directory from target path '%s': %m", arg_target);
+                r = path_extract_filename(arg_target, &bn);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract filename from target path '%s': %m", arg_target);
+                is_dir = r == O_DIRECTORY;
+
+                r = chase(dn, root, CHASE_PREFIX_ROOT|CHASE_WARN, NULL, &dfd);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open '%s': %m", dn);
+
+                /* Are we reading from stdin? */
+                if (streq(arg_source, "-")) {
+                        if (is_dir)
+                                return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot copy STDIN to a directory, refusing.");
+
+                        target_fd = openat(dfd, bn, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY|O_EXCL, 0644);
+                        if (target_fd < 0)
+                                return log_error_errno(errno, "Failed to open target file '%s': %m", arg_target);
+
+                        r = copy_bytes(STDIN_FILENO, target_fd, UINT64_MAX, COPY_REFLINK);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to copy bytes from stdin to '%s' in image '%s': %m", arg_target, arg_image);
+
+                        /* When we copy from stdin we don't copy any attributes (i.e. no access mode, no ownership, no xattr, no times) */
+                        return 0;
+                }
+                /* Plain open() reports failure as -1 plus errno, hence errno (not the return value) must be logged */
+                source_fd = open(arg_source, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+                if (source_fd < 0)
+                        return log_error_errno(errno, "Failed to open source path '%s': %m", arg_source);
+
+                r = fd_verify_regular(source_fd);
+                if (r < 0) {
+                        if (r != -EISDIR)
+                                return log_error_errno(r, "Source '%s' is neither regular file nor directory: %m", arg_source);
+
+                        /* We are looking at a directory. */
+
+                        target_fd = openat(dfd, bn, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+                        if (target_fd < 0) {
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno, "Failed to open destination '%s': %m", arg_target);
+
+                                r = copy_tree_at(source_fd, ".", dfd, bn, UID_INVALID, GID_INVALID, COPY_REFLINK|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS, NULL, NULL);
+                        } else
+                                r = copy_tree_at(source_fd, ".", target_fd, ".", UID_INVALID, GID_INVALID, COPY_REFLINK|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS, NULL, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to copy '%s' to '%s' in image '%s': %m", arg_source, arg_target, arg_image);
+
+                        return 0;
+                }
+
+                if (is_dir)
+                        return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Source is a regular file, but target is not, refusing.");
+
+                /* We are looking at a regular file */
+                target_fd = openat(dfd, bn, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY|O_EXCL, 0600);
+                if (target_fd < 0)
+                        return log_error_errno(errno, "Failed to open target file '%s': %m", arg_target);
+
+                r = copy_bytes(source_fd, target_fd, UINT64_MAX, COPY_REFLINK);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to copy bytes from '%s' to '%s' in image '%s': %m", arg_source, arg_target, arg_image);
+
+                (void) copy_xattr(source_fd, NULL, target_fd, NULL, 0);
+                (void) copy_access(source_fd, target_fd);
+                (void) copy_times(source_fd, target_fd, 0);
+
+                /* When this is a regular file we don't copy ownership! */
+                return 0;
+        }
+
+        case ACTION_LIST:
+        case ACTION_MTREE: {
+                _cleanup_close_ int dfd = -EBADF;
+
+                dfd = open(root, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+                if (dfd < 0)
+                        return log_error_errno(errno, "Failed to open mount directory: %m");
+
+                pager_open(arg_pager_flags);
+
+                if (arg_action == ACTION_LIST)
+                        r = recurse_dir(dfd, NULL, 0, UINT_MAX, RECURSE_DIR_SORT, list_print_item, NULL);
+                else if (arg_action == ACTION_MTREE)
+                        r = recurse_dir(dfd, ".", STATX_TYPE|STATX_MODE|STATX_UID|STATX_GID|STATX_SIZE, UINT_MAX, RECURSE_DIR_SORT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL, mtree_print_item, NULL);
+                else
+                        assert_not_reached();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to list image: %m");
+                return 0;
+        }
+
+        default:
+                assert_not_reached();
+        }
+}
+
+static int action_umount(const char *path) {
+        _cleanup_close_ int fd = -EBADF;
+        _cleanup_free_ char *canonical = NULL;
+        _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        int r;
+        /* Unmounts the mount point at 'path', then tries to remove its backing loop device (and the directory, if arg_rmdir). */
+        fd = chase_and_open(path, NULL, 0, O_DIRECTORY, &canonical);
+        if (fd == -ENOTDIR)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "'%s' is not a directory", path);
+        if (fd < 0)
+                return log_error_errno(fd, "Failed to resolve path '%s': %m", path);
+
+        r = fd_is_mount_point(fd, NULL, 0);
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "'%s' is not a mount point", canonical);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether '%s' is a mount point: %m", canonical);
+
+        r = block_device_new_from_fd(fd, BLOCK_DEVICE_LOOKUP_WHOLE_DISK | BLOCK_DEVICE_LOOKUP_BACKING, &dev);
+        if (r < 0) {
+                _cleanup_close_ int usr_fd = -EBADF;
+
+                /* The command `systemd-dissect --mount` expects that the image at least has the root or /usr
+                 * partition. If it does not have the root partition, then we mount the /usr partition on a
+                 * tmpfs. Hence, let's try to find the backing block device through the /usr partition. */
+
+                usr_fd = openat(fd, "usr", O_CLOEXEC | O_DIRECTORY | O_NOFOLLOW);
+                if (usr_fd < 0)
+                        return log_error_errno(errno, "Failed to open '%s/usr': %m", canonical);
+
+                r = block_device_new_from_fd(usr_fd, BLOCK_DEVICE_LOOKUP_WHOLE_DISK | BLOCK_DEVICE_LOOKUP_BACKING, &dev);
+        }
+        if (r < 0) /* Covers both the direct lookup and the /usr fallback above */
+                return log_error_errno(r, "Failed to find backing block device for '%s': %m", canonical);
+
+        r = loop_device_open(dev, 0, LOCK_EX, &d);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to open loopback block device: %m");
+
+        /* We've locked the loop device, now we're ready to unmount. To allow the unmount to succeed, we have
+         * to close the O_PATH fd we opened earlier. */
+        fd = safe_close(fd);
+
+        r = umount_recursive(canonical, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unmount '%s': %m", canonical);
+
+        /* We managed to lock and unmount successfully? That means we can try to remove the loop device. */
+        loop_device_unrelinquish(d);
+
+        if (arg_rmdir) {
+                r = RET_NERRNO(rmdir(canonical));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to remove mount directory '%s': %m", canonical);
+        }
+
+        return 0;
+}
+
+static int action_with(DissectedImage *m, LoopDevice *d) {
+        _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL;
+        _cleanup_(rmdir_and_freep) char *created_dir = NULL;
+        _cleanup_free_ char *temp = NULL;
+        int r, rcode;
+        /* Mounts image 'm' on a temporary directory, runs arg_argv (or a shell) inside it, then unmounts and cleans up again. */
+        assert(m);
+        assert(d);
+        assert(arg_action == ACTION_WITH);
+
+        r = tempfn_random_child(NULL, program_invocation_short_name, &temp);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate temporary mount directory: %m");
+
+        r = mkdir_p(temp, 0700);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create mount point: %m");
+
+        created_dir = TAKE_PTR(temp);
+
+        r = dissected_image_mount_and_warn(
+                        m,
+                        created_dir,
+                        /* uid_shift= */ UID_INVALID,
+                        /* uid_range= */ UID_INVALID,
+                        /* userns_fd= */ -EBADF,
+                        arg_flags);
+        if (r < 0)
+                return r;
+
+        mounted_dir = TAKE_PTR(created_dir);
+
+        r = dissected_image_relinquish(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+
+        r = loop_device_flock(d, LOCK_UN);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock loopback block device: %m");
+
+        rcode = safe_fork("(with)", FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL);
+        if (rcode == 0) {
+                /* Child */
+
+                if (chdir(mounted_dir) < 0) {
+                        log_error_errno(errno, "Failed to change to '%s' directory: %m", mounted_dir);
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("SYSTEMD_DISSECT_ROOT", mounted_dir, /* overwrite= */ true) < 0) {
+                        log_error_errno(errno, "Failed to set $SYSTEMD_DISSECT_ROOT: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("SYSTEMD_DISSECT_DEVICE", d->node, /* overwrite= */ true) < 0) {
+                        log_error_errno(errno, "Failed to set $SYSTEMD_DISSECT_DEVICE: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (strv_isempty(arg_argv)) {
+                        const char *sh;
+
+                        sh = secure_getenv("SHELL");
+                        if (sh) {
+                                execvp(sh, STRV_MAKE(sh));
+                                log_warning_errno(errno, "Failed to execute $SHELL, falling back to /bin/sh: %m");
+                        }
+
+                        execl("/bin/sh", "sh", NULL);
+                        log_error_errno(errno, "Failed to invoke /bin/sh: %m");
+                } else {
+                        execvp(arg_argv[0], arg_argv);
+                        log_error_errno(errno, "Failed to execute '%s': %m", arg_argv[0]);
+                }
+
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Let's manually detach everything, to make things synchronous */
+        r = loop_device_flock(d, LOCK_SH);
+        if (r < 0)
+                log_warning_errno(r, "Failed to lock loopback block device, ignoring: %m");
+
+        r = umount_recursive(mounted_dir, 0);
+        if (r < 0)
+                log_warning_errno(r, "Failed to unmount '%s', ignoring: %m", mounted_dir);
+        else
+                loop_device_unrelinquish(d); /* Let's try to destroy the loopback device */
+
+        created_dir = TAKE_PTR(mounted_dir);
+        /* rmdir() reports failure via errno; 'r' still holds the umount_recursive() result from above */
+        if (rmdir(created_dir) < 0)
+                log_warning_errno(errno, "Failed to remove directory '%s', ignoring: %m", created_dir);
+
+        temp = TAKE_PTR(created_dir);
+
+        return rcode;
+}
+
+static int action_discover(void) {
+        _cleanup_hashmap_free_ Hashmap *images = NULL;
+        _cleanup_(table_unrefp) Table *t = NULL;
+        Image *img;
+        int r;
+        /* Discovers images of every ImageClass and prints the raw-file and block-device ones as a table (or JSON). */
+        images = hashmap_new(&image_hash_ops);
+        if (!images)
+                return log_oom();
+
+        for (ImageClass cl = 0; cl < _IMAGE_CLASS_MAX; cl++) {
+                r = image_discover(cl, NULL, images);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to discover images: %m");
+        }
+
+        if ((arg_json_format_flags & JSON_FORMAT_OFF) && hashmap_isempty(images)) {
+                log_info("No images found.");
+                return 0;
+        }
+
+        t = table_new("name", "type", "class", "ro", "path", "time", "usage");
+        if (!t)
+                return log_oom();
+
+        table_set_align_percent(t, table_get_cell(t, 0, 6), 100); /* cell (0, 6) is the "usage" header */
+        table_set_ersatz_string(t, TABLE_ERSATZ_DASH);
+
+        HASHMAP_FOREACH(img, images) {
+                /* Only raw image files and block devices are listed, every other image type is skipped */
+                if (!IN_SET(img->type, IMAGE_RAW, IMAGE_BLOCK))
+                        continue;
+
+                r = table_add_many(
+                                t,
+                                TABLE_STRING, img->name,
+                                TABLE_STRING, image_type_to_string(img->type),
+                                TABLE_STRING, image_class_to_string(img->class),
+                                TABLE_BOOLEAN, img->read_only,
+                                TABLE_PATH, img->path,
+                                TABLE_TIMESTAMP, img->mtime != 0 ? img->mtime : img->crtime,
+                                TABLE_SIZE, img->usage);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        (void) table_set_sort(t, (size_t) 0);
+
+        return table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend);
+}
+
+static int action_attach(DissectedImage *m, LoopDevice *d) {
+        int r;
+        /* Disables autoclear on loop device 'd', relinquishes the devices of image 'm' and prints the loop device node. */
+        assert(m);
+        assert(d);
+
+        r = loop_device_set_autoclear(d, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable auto-clear logic on loopback device: %m");
+
+        r = dissected_image_relinquish(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+
+        puts(d->node); /* The device node path on stdout is the only output of this action */
+        return 0;
+}
+
+static int action_detach(const char *path) {
+        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        struct stat st;
+        int r;
+        /* 'path' is either a loop block device node, or a regular file backing one (then the last matching device is used). */
+        fd = open(path, O_PATH|O_CLOEXEC);
+        if (fd < 0)
+                return log_error_errno(errno, "Failed to open '%s': %m", path);
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat '%s': %m", path);
+
+        if (S_ISBLK(st.st_mode)) {
+                r = loop_device_open_from_fd(fd, O_RDONLY, LOCK_EX, &loop);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open '%s' as loopback block device: %m", path);
+        } else if (S_ISREG(st.st_mode)) {
+                _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+
+                /* If a regular file is specified, search for a loopback block device that is backed by it */
+
+                r = sd_device_enumerator_new(&e);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate enumerator: %m");
+
+                r = sd_device_enumerator_add_match_subsystem(e, "block", true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to match block devices: %m");
+
+                r = sd_device_enumerator_add_match_sysname(e, "loop*");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to match loopback block devices: %m");
+
+                (void) sd_device_enumerator_allow_uninitialized(e);
+
+                FOREACH_DEVICE(e, d) {
+                        _cleanup_(loop_device_unrefp) LoopDevice *entry_loop = NULL;
+                        const char *name, *devtype;
+
+                        r = sd_device_get_sysname(d, &name);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to get enumerated device's sysname, skipping: %m");
+                                continue;
+                        }
+
+                        r = sd_device_get_devtype(d, &devtype);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to get devtype of '%s', skipping: %m", name);
+                                continue;
+                        }
+
+                        if (!streq(devtype, "disk")) /* Filter out partition block devices */
+                                continue;
+
+                        r = loop_device_open(d, O_RDONLY, LOCK_SH, &entry_loop);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to open loopback block device '%s', skipping: %m", name);
+                                continue;
+                        }
+
+                        if (entry_loop->backing_devno == st.st_dev && entry_loop->backing_inode == st.st_ino) {
+                                /* Found it! The kernel allows attaching a single file to multiple loopback
+                                 * devices. Let's destruct them in reverse order, i.e. find the last matching
+                                 * loopback device here, rather than the first. */
+
+                                loop_device_unref(loop);
+                                loop = TAKE_PTR(entry_loop);
+                        }
+                }
+
+                if (!loop)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "No loopback block device backed by '%s' found.", path);
+
+                r = loop_device_flock(loop, LOCK_EX);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to upgrade device lock: %m");
+        } else /* Any other inode type would leave 'loop' unset for the calls below, hence refuse explicitly */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "'%s' is neither a block device node nor a regular file.", path);
+
+        r = loop_device_set_autoclear(loop, true);
+        if (r < 0)
+                log_warning_errno(r, "Failed to enable autoclear logic on '%s', ignoring: %m", loop->node);
+
+        loop_device_unrelinquish(loop);
+        return 0;
+}
+
+static int action_validate(void) {
+        int r;
+        /* Runs dissect_image_file_and_warn() on arg_image and, if that succeeds, prints a green "OK" line to stdout. */
+        r = dissect_image_file_and_warn(
+                        arg_image,
+                        &arg_verity_settings,
+                        NULL,
+                        arg_image_policy,
+                        arg_flags,
+                        NULL);
+        if (r < 0)
+                return r;
+
+        if (isatty(STDOUT_FILENO) && emoji_enabled()) /* Leading sparkles glyph, only on an emoji-capable TTY */
+                printf("%s ", special_glyph(SPECIAL_GLYPH_SPARKLES));
+
+        printf("%sOK%s", ansi_highlight_green(), ansi_normal());
+
+        if (isatty(STDOUT_FILENO) && emoji_enabled()) /* Matching trailing glyph */
+                printf(" %s", special_glyph(SPECIAL_GLYPH_SPARKLES));
+
+        putc('\n', stdout);
+        return 0;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
+        _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
+        uint32_t loop_flags;
+        int open_flags, r;
+        /* Entry point: parses the command line, sets up and dissects arg_image if the action needs it, then dispatches on arg_action. */
+        log_setup();
+
+        if (invoked_as(argv, "mount.ddi")) /* Different command line syntax when called under the mount helper name */
+                r = parse_argv_as_mount_helper(argc, argv);
+        else
+                r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        switch (arg_action) {
+        case ACTION_UMOUNT:
+                return action_umount(arg_path);
+
+        case ACTION_DETACH:
+                return action_detach(arg_image);
+
+        case ACTION_DISCOVER:
+                return action_discover();
+
+        default:
+                /* All other actions need the image dissected */
+                break;
+        }
+
+        if (arg_image) {
+                r = verity_settings_load(
+                        &arg_verity_settings,
+                        arg_image, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read verity artifacts for %s: %m", arg_image);
+
+                if (arg_verity_settings.data_path)
+                        arg_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE; /* We only support Verity per file system,
+                                                                        * hence if there's external Verity data
+                                                                        * available we turn off partition table
+                                                                        * support */
+
+                if (arg_action == ACTION_VALIDATE) /* Validation works on the image file directly, no loop device needed */
+                        return action_validate();
+
+                open_flags = FLAGS_SET(arg_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR;
+                loop_flags = FLAGS_SET(arg_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN;
+                if (arg_in_memory)
+                        r = loop_device_make_by_path_memory(arg_image, open_flags, /* sector_size= */ UINT32_MAX, loop_flags, LOCK_SH, &d);
+                else
+                        r = loop_device_make_by_path(arg_image, open_flags, /* sector_size= */ UINT32_MAX, loop_flags, LOCK_SH, &d);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set up loopback device for %s: %m", arg_image);
+
+                if (arg_loop_ref) {
+                        r = loop_device_set_filename(d, arg_loop_ref);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to set loop reference string to '%s', ignoring: %m", arg_loop_ref);
+                }
+
+                r = dissect_loop_device_and_warn(
+                                d,
+                                &arg_verity_settings,
+                                /* mount_options= */ NULL,
+                                arg_image_policy,
+                                arg_flags,
+                                &m);
+                if (r < 0)
+                        return r;
+
+                if (arg_action == ACTION_ATTACH)
+                        return action_attach(m, d);
+
+                r = dissected_image_load_verity_sig_partition(
+                                m,
+                                d->fd,
+                                &arg_verity_settings);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to load verity signature partition: %m");
+
+                if (arg_action != ACTION_DISSECT) { /* Plain dissection skips the interactive decryption step */
+                        r = dissected_image_decrypt_interactively(
+                                        m, NULL,
+                                        &arg_verity_settings,
+                                        arg_flags);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        switch (arg_action) {
+
+        case ACTION_DISSECT:
+                return action_dissect(m, d);
+
+        case ACTION_MOUNT:
+                return action_mount(m, d);
+
+        case ACTION_LIST:
+        case ACTION_MTREE:
+        case ACTION_COPY_FROM:
+        case ACTION_COPY_TO:
+                return action_list_or_mtree_or_copy(m, d);
+
+        case ACTION_WITH:
+                return action_with(m, d);
+
+        default:
+                assert_not_reached();
+        }
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/dissect/meson.build b/src/dissect/meson.build
new file mode 100644
index 0000000..e422dbd
--- /dev/null
+++ b/src/dissect/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-dissect',
+                'public' : true,
+                'conditions' : ['HAVE_BLKID'],
+                'sources' : files('dissect.c'),
+        },
+]
+
+if conf.get('HAVE_BLKID') == 1
+        install_emptydir(sbindir)
+        meson.add_install_script(sh, '-c',
+                                 ln_s.format(bindir / 'systemd-dissect',
+                                             sbindir / 'mount.ddi'))
+endif
diff --git a/src/environment-d-generator/environment-d-generator.c b/src/environment-d-generator/environment-d-generator.c
new file mode 100644
index 0000000..90e31c9
--- /dev/null
+++ b/src/environment-d-generator/environment-d-generator.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-path.h"
+
+#include "conf-files.h"
+#include "constants.h"
+#include "env-file.h"
+#include "escape.h"
+#include "glyph-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "path-lookup.h"
+#include "strv.h"
+
+static int environment_dirs(char ***ret) {
+        _cleanup_strv_free_ char **dirs = NULL;
+        _cleanup_free_ char *c = NULL;
+        int r;
+        /* Stores the environment.d search path in *ret, user configuration directory first. Returns 0 or a negative errno. */
+        dirs = strv_new(CONF_PATHS_USR("environment.d"), NULL);
+        if (!dirs)
+                return -ENOMEM;
+
+        /* ~/.config/systemd/environment.d */
+        r = sd_path_lookup(SD_PATH_USER_CONFIGURATION, "environment.d", &c);
+        if (r < 0)
+                return r;
+
+        r = strv_extend_front(&dirs, c); /* Prepended, so that it is listed ahead of the CONF_PATHS_USR() entries */
+        if (r < 0)
+                return r;
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *t = NULL;
+
+                t = strv_join(dirs, "\n\t");
+                log_debug("Looking for environment.d files in (higher priority first):\n\t%s", strna(t));
+        }
+
+        *ret = TAKE_PTR(dirs);
+        return 0;
+}
+
+static int load_and_print(void) {
+        _cleanup_strv_free_ char **dirs = NULL, **files = NULL, **env = NULL;
+        int r;
+        /* Merges all ".conf" files found in the environment.d directories and prints the result as NAME=VALUE lines. */
+        r = environment_dirs(&dirs);
+        if (r < 0)
+                return r;
+
+        r = conf_files_list_strv(&files, ".conf", NULL, 0, (const char **) dirs);
+        if (r < 0)
+                return r;
+
+        /* This will mutate the existing environment, based on the presumption
+         * that in case of failure, a partial update is better than none. */
+
+        STRV_FOREACH(i, files) {
+                log_debug("Reading %s%s", *i, special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+                r = merge_env_file(&env, NULL, *i);
+                if (r == -ENOMEM) /* Only OOM aborts; any other failure on a file is passed over */
+                        return r;
+        }
+
+        STRV_FOREACH(i, env) {
+                char *t;
+                _cleanup_free_ char *q = NULL;
+
+                t = strchr(*i, '='); /* Splits the entry into name (before '=') and value (after it) */
+                assert(t);
+
+                q = shell_maybe_quote(t + 1, 0);
+                if (!q)
+                        return log_oom();
+
+                printf("%.*s=%s\n", (int) (t - *i), *i, q);
+        }
+
+        return 0;
+}
+
+static int run(int argc, char *argv[]) {
+        int r;
+        /* Entry point of the generator: accepts no arguments and prints the merged environment.d settings via load_and_print(). */
+        log_parse_environment();
+        log_open();
+
+        if (argc > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");
+
+        r = load_and_print();
+        if (r < 0)
+                return log_error_errno(r, "Failed to load environment.d: %m");
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/environment-d-generator/meson.build b/src/environment-d-generator/meson.build
new file mode 100644
index 0000000..346be05
--- /dev/null
+++ b/src/environment-d-generator/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : '30-systemd-environment-d-generator',
+                'conditions' : ['ENABLE_ENVIRONMENT_D'],
+                'sources' : files('environment-d-generator.c'),
+                'install_dir' : userenvgeneratordir,
+        },
+]
+
+if conf.get('ENABLE_ENVIRONMENT_D') == 1
+        install_emptydir(environmentdir)
+        meson.add_install_script(sh, '-c',
+                                 ln_s.format(sysconfdir / 'environment',
+                                             environmentdir / '99-environment.conf'))
+endif
diff --git a/src/escape/escape.c b/src/escape/escape.c
new file mode 100644
index 0000000..abbc08b
--- /dev/null
+++ b/src/escape/escape.c
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "build.h"
+#include "log.h"
+#include "main-func.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+
+static enum {
+        ACTION_ESCAPE,
+        ACTION_UNESCAPE,
+        ACTION_MANGLE
+} arg_action = ACTION_ESCAPE;           /* Escaping is the default; -u and -m select the other two */
+static const char *arg_suffix = NULL;   /* Set by --suffix= */
+static const char *arg_template = NULL; /* Set by --template= */
+static bool arg_path = false;           /* Set by -p/--path */
+static bool arg_instance = false;       /* Set by --instance */
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text, ending in a reference to the systemd-escape(1) man page, to stdout. Returns 0 on success. */
+        r = terminal_urlify_man("systemd-escape", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] [NAME...]\n\n"
+               "Escape strings for usage in systemd unit names.\n\n"
+               "  -h --help               Show this help\n"
+               "     --version            Show package version\n"
+               "     --suffix=SUFFIX      Unit suffix to append to escaped strings\n"
+               "     --template=TEMPLATE  Insert strings as instance into template\n"
+               "     --instance           With --unescape, show just the instance part\n"
+               "  -u --unescape           Unescape strings\n"
+               "  -m --mangle             Mangle strings\n"
+               "  -p --path               When escaping/unescaping assume the string is a path\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses the command line into the arg_* variables. Returns 1 to continue, else the help()/version() result or a negative errno. */
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_SUFFIX,
+                ARG_TEMPLATE
+        };
+
+        static const struct option options[] = {
+                { "help",      no_argument,       NULL, 'h'           },
+                { "version",   no_argument,       NULL, ARG_VERSION   },
+                { "suffix",    required_argument, NULL, ARG_SUFFIX    },
+                { "template",  required_argument, NULL, ARG_TEMPLATE  },
+                { "unescape",  no_argument,       NULL, 'u'           },
+                { "mangle",    no_argument,       NULL, 'm'           },
+                { "path",      no_argument,       NULL, 'p'           },
+                { "instance",  no_argument,       NULL, 'i'           },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hump", options, NULL)) >= 0)
+                /* 'i' is not part of the short option string, hence --instance is available as long option only */
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_SUFFIX: {
+                        UnitType t = unit_type_from_string(optarg);
+                        if (t < 0)
+                                return log_error_errno(t, "Invalid unit suffix type \"%s\".", optarg);
+
+                        arg_suffix = optarg;
+                        break;
+                }
+
+                case ARG_TEMPLATE:
+                        if (!unit_name_is_valid(optarg, UNIT_NAME_TEMPLATE))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Template name %s is not valid.", optarg);
+
+                        arg_template = optarg;
+                        break;
+
+                case 'u':
+                        arg_action = ACTION_UNESCAPE;
+                        break;
+
+                case 'm':
+                        arg_action = ACTION_MANGLE;
+                        break;
+
+                case 'p':
+                        arg_path = true;
+                        break;
+
+                case 'i':
+                        arg_instance = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (optind >= argc) /* At least one string to process is required */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Not enough arguments.");
+
+        if (arg_template && arg_suffix)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--suffix= and --template= may not be combined.");
+
+        if ((arg_template || arg_suffix) && arg_action == ACTION_MANGLE)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--suffix= and --template= are not compatible with --mangle.");
+
+        if (arg_suffix && arg_action == ACTION_UNESCAPE)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--suffix is not compatible with --unescape.");
+
+        if (arg_path && !IN_SET(arg_action, ACTION_ESCAPE, ACTION_UNESCAPE))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--path may not be combined with --mangle.");
+
+        if (arg_instance && arg_action != ACTION_UNESCAPE)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--instance must be used in conjunction with --unescape.");
+
+        if (arg_instance && arg_template)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--instance may not be combined with --template.");
+
+        return 1;
+}
+
+static int run(int argc, char *argv[]) {
+        int r;
+
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        STRV_FOREACH(i, argv + optind) {
+                _cleanup_free_ char *e = NULL;
+
+                switch (arg_action) {
+
+                case ACTION_ESCAPE:
+                        if (arg_path) {
+                                r = unit_name_path_escape(*i, &e);
+                                if (r < 0) {
+                                        if (r == -EINVAL) {
+                                                /* If escaping failed because the string was invalid, let's print a
+                                                 * friendly message about it. Catch these specific error cases
+                                                 * explicitly. */
+
+                                                if (!path_is_valid(*i))
+                                                        return log_error_errno(r, "Input '%s' is not a valid file system path, failed to escape.", *i);
+                                                if (!path_is_absolute(*i))
+                                                        return log_error_errno(r, "Input '%s' is not an absolute file system path, failed to escape.", *i);
+                                                if (!path_is_normalized(*i))
+                                                        return log_error_errno(r, "Input '%s' is not a normalized file system path, failed to escape.", *i);
+                                        }
+
+                                        /* All other error cases. */
+                                        return log_error_errno(r, "Failed to escape string: %m");
+                                }
+
+                                /* If the escaping worked, then still warn if the path is not like we'd like
+                                 * it. Because that means escaping is not necessarily reversible. */
+
+                                if (!path_is_valid(*i))
+                                        log_warning("Input '%s' is not a valid file system path, escaping is likely not going be reversible.", *i);
+                                else if (!path_is_absolute(*i))
+                                        log_warning("Input '%s' is not an absolute file system path, escaping is likely not going to be reversible.", *i);
+
+                                /* Note that we don't complain about paths not being normalized here, because
+                                 * some forms of non-normalization is actually OK, such as a series // and
+                                 * unit_name_path_escape() will clean those up silently, and the reversal is
+                                 * "close enough" to be OK. */
+                        } else {
+                                e = unit_name_escape(*i);
+                                if (!e)
+                                        return log_oom();
+                        }
+
+                        if (arg_template) {
+                                char *x;
+
+                                r = unit_name_replace_instance(arg_template, e, &x);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to replace instance: %m");
+
+                                free_and_replace(e, x);
+                        } else if (arg_suffix) {
+                                if (!strextend(&e, ".", arg_suffix))
+                                        return log_oom();
+                        }
+
+                        break;
+
+                case ACTION_UNESCAPE: {
+                        _cleanup_free_ char *name = NULL;
+
+                        if (arg_template || arg_instance) {
+                                _cleanup_free_ char *template = NULL;
+
+                                r = unit_name_to_instance(*i, &name);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to extract instance: %m");
+                                if (isempty(name))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unit %s is missing the instance name.", *i);
+
+                                r = unit_name_template(*i, &template);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to extract template: %m");
+                                if (arg_template && !streq(arg_template, template))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Unit %s template %s does not match specified template %s.",
+                                                               *i, template, arg_template);
+                        } else {
+                                name = strdup(*i);
+                                if (!name)
+                                        return log_oom();
+                        }
+
+                        if (arg_path)
+                                r = unit_name_path_unescape(name, &e);
+                        else
+                                r = unit_name_unescape(name, &e);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to unescape string: %m");
+
+                        break;
+                }
+
+                case ACTION_MANGLE:
+                        r = unit_name_mangle(*i, 0, &e);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to mangle name: %m");
+
+                        break;
+                }
+
+                if (i != argv + optind)
+                        fputc(' ', stdout);
+
+                fputs(e, stdout);
+        }
+
+        fputc('\n', stdout);
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/escape/meson.build b/src/escape/meson.build
new file mode 100644
index 0000000..d21b372
--- /dev/null
+++ b/src/escape/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {  # start from executable_template and set the keys below
+                'name' : 'systemd-escape',
+                'public' : true,
+                'sources' : files('escape.c'),
+        },
+]
diff --git a/src/firstboot/firstboot.c b/src/firstboot/firstboot.c
new file mode 100644
index 0000000..d402927
--- /dev/null
+++ b/src/firstboot/firstboot.c
@@ -0,0 +1,1763 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "build.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-unit-util.h"
+#include "bus-util.h"
+#include "bus-wait-for-jobs.h"
+#include "chase.h"
+#include "copy.h"
+#include "creds-util.h"
+#include "dissect-image.h"
+#include "env-file.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "hostname-util.h"
+#include "kbd-util.h"
+#include "libcrypt-util.h"
+#include "locale-util.h"
+#include "lock-util.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "os-util.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "password-quality-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "proc-cmdline.h"
+#include "random-util.h"
+#include "smack-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "tmpfile-util-label.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+static char *arg_root = NULL;
+static char *arg_image = NULL;
+static char *arg_locale = NULL;  /* $LANG */
+static char *arg_locale_messages = NULL; /* $LC_MESSAGES */
+static char *arg_keymap = NULL; /* emitted as KEYMAP= into /etc/vconsole.conf */
+static char *arg_timezone = NULL; /* appended to ../usr/share/zoneinfo/ to form the /etc/localtime target */
+static char *arg_hostname = NULL; /* contents for /etc/hostname */
+static sd_id128_t arg_machine_id = {}; /* a null ID means /etc/machine-id is left alone */
+static char *arg_root_password = NULL; /* freed via erase_and_freep */
+static char *arg_root_shell = NULL;
+static char *arg_kernel_cmdline = NULL;
+static bool arg_prompt_locale = false; /* arg_prompt_*: ask interactively when the value is still unset */
+static bool arg_prompt_keymap = false;
+static bool arg_prompt_timezone = false;
+static bool arg_prompt_hostname = false;
+static bool arg_prompt_root_password = false;
+static bool arg_prompt_root_shell = false;
+static bool arg_copy_locale = false; /* copy the host's /etc/locale.conf when the target is not the host root */
+static bool arg_copy_keymap = false; /* likewise for /etc/vconsole.conf */
+static bool arg_copy_timezone = false; /* likewise for the host's /etc/localtime symlink target */
+static bool arg_copy_root_password = false;
+static bool arg_copy_root_shell = false;
+static bool arg_force = false; /* configure a file even though it already exists */
+static bool arg_delete_root_password = false;
+static bool arg_root_password_is_hashed = false; /* filled in alongside arg_root_password by get_credential_user_password() */
+static bool arg_welcome = true; /* false suppresses print_welcome() */
+static bool arg_reset = false;
+static ImagePolicy *arg_image_policy = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_root, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_locale, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_locale_messages, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_keymap, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_timezone, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_password, erase_and_freep); /* the password gets the erasing variant */
+STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); /* NOTE(review): arg_root_shell and arg_kernel_cmdline have no destructor registered here — confirm intended */
+
+static bool press_any_key(void) {
+        bool newline_pending = true;
+        char key = 0;
+        /* Shows the pager prompt and reads one character (USEC_INFINITY timeout). Returns false iff it was 'q'. */
+        fputs("-- Press any key to proceed --", stdout);
+        fflush(stdout);
+
+        (void) read_one_char(stdin, &key, USEC_INFINITY, &newline_pending);
+        /* A failed read is ignored on purpose: key stays 0, which counts as "proceed". */
+        if (newline_pending)
+                fputc('\n', stdout);
+
+        return !(key == 'q');
+}
+
+static void print_welcome(int rfd) {
+        _cleanup_free_ char *pretty_name = NULL, *os_name = NULL, *ansi_color = NULL;
+        static bool done = false;
+        const char *pn, *ac;
+        int r;
+        /* Prints the welcome banner for the OS found below rfd, then waits for a key press. Shown at most once. */
+        assert(rfd >= 0);
+
+        if (!arg_welcome)
+                return;
+
+        if (done)
+                return;
+
+        r = parse_os_release_at(rfd,
+                                "PRETTY_NAME", &pretty_name,
+                                "NAME", &os_name,
+                                "ANSI_COLOR", &ansi_color);
+        if (r < 0)
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to read os-release file, ignoring: %m");
+        /* A failed read is not fatal: the name comes from os_release_pretty_name(), the color falls back to "0". */
+        pn = os_release_pretty_name(pretty_name, os_name);
+        ac = isempty(ansi_color) ? "0" : ansi_color;
+
+        (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ false);
+
+        if (colors_enabled())
+                printf("\nWelcome to your new installation of \x1B[%sm%s\x1B[0m!\n", ac, pn);
+        else
+                printf("\nWelcome to your new installation of %s!\n", pn);
+
+        printf("\nPlease configure your system!\n\n");
+
+        press_any_key();
+        /* Remember that the banner was shown, so that later calls return early. */
+        done = true;
+}
+
+static int show_menu(char **x, unsigned n_columns, unsigned width, unsigned percentage) {
+        unsigned break_lines, break_modulo;
+        size_t n, per_column, i, j;
+        /* Prints the strings in x as a numbered table, filled column by column, pausing between screens. */
+        assert(n_columns > 0);
+
+        n = strv_length(x);
+        per_column = DIV_ROUND_UP(n, n_columns);
+        /* NOTE(review): break_lines is used as a modulus below; assumes lines() never returns 0 — confirm. */
+        break_lines = lines();
+        if (break_lines > 2)
+                break_lines--;
+
+        /* The first page gets two extra lines, since we want to show
+         * a title */
+        break_modulo = break_lines;
+        if (break_modulo > 3)
+                break_modulo -= 3;
+        /* NOTE(review): the comment above says two lines, the code subtracts three — confirm which is meant. */
+        for (i = 0; i < per_column; i++) {
+
+                for (j = 0; j < n_columns; j ++) {
+                        _cleanup_free_ char *e = NULL;
+                        /* Row i of column j is entry j * per_column + i; the last column may be shorter. */
+                        if (j * per_column + i >= n)
+                                break;
+
+                        e = ellipsize(x[j * per_column + i], width, percentage);
+                        if (!e)
+                                return log_oom();
+
+                        printf("%4zu) %-*s", j * per_column + i + 1, (int) width, e);
+                }
+
+                putchar('\n');
+
+                /* on the first screen we reserve 2 extra lines for the title */
+                if (i % break_lines == break_modulo) {
+                        if (!press_any_key())
+                                return 0;
+                }
+        }
+
+        return 0;
+}
+
+static int prompt_loop(const char *text, char **l, unsigned percentage, bool (*is_valid)(const char *name), char **ret) {
+        int r;
+        /* Asks until the user skips, picks an entry of l by number, or enters a string that is_valid() accepts. */
+        assert(text);
+        assert(is_valid);
+        assert(ret);
+        /* Returns < 0 on error. On success *ret holds the choice; it is left untouched if the user skipped. */
+        for (;;) {
+                _cleanup_free_ char *p = NULL;
+                unsigned u;
+
+                r = ask_string(&p, "%s %s (empty to skip, \"list\" to list options): ",
+                               special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET), text);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to query user: %m");
+
+                if (isempty(p)) {
+                        log_warning("No data entered, skipping.");
+                        return 0;
+                }
+
+                if (streq(p, "list")) {
+                        r = show_menu(l, 3, 22, percentage);
+                        if (r < 0)
+                                return r;
+
+                        putchar('\n');
+                        continue;
+                }
+                /* A number selects the matching 1-based entry of l; u is unsigned, so 0 is the only value below range. */
+                r = safe_atou(p, &u);
+                if (r >= 0) {
+                        if (u == 0 || u > strv_length(l)) {
+                                log_error("Specified entry number out of range.");
+                                continue;
+                        }
+
+                        log_info("Selected '%s'.", l[u-1]);
+                        return free_and_strdup_warn(ret, l[u-1]);
+                }
+
+                if (!is_valid(p)) {
+                        log_error("Entered data invalid.");
+                        continue;
+                }
+                /* Anything that is not a number is taken literally once it passed validation. */
+                return free_and_replace(*ret, p);
+        }
+}
+
+static int should_configure(int dir_fd, const char *filename) {
+        _cleanup_fclose_ FILE *passwd = NULL, *shadow = NULL;
+        int r;
+        /* Tells whether 'filename' below dir_fd still needs configuring: > 0 yes, 0 no, < 0 error (logged). */
+        assert(dir_fd >= 0);
+        assert(filename);
+
+        if (streq(filename, "passwd") && !arg_force)
+                /* We may need to do additional checks, so open the file. */
+                r = xfopenat(dir_fd, filename, "re", O_NOFOLLOW, &passwd);
+        else
+                r = RET_NERRNO(faccessat(dir_fd, filename, F_OK, AT_SYMLINK_NOFOLLOW));
+
+        if (r == -ENOENT)
+                return true; /* missing */
+        if (r < 0)
+                return log_error_errno(r, "Failed to access %s: %m", filename);
+        if (arg_force)
+                return true; /* exists, but if --force was given we should still configure the file. */
+        /* passwd is only set if the xfopenat() branch above ran; for every other file existence settles it. */
+        if (!passwd)
+                return false;
+
+        /* In case of /etc/passwd, do an additional check for the root password field.
+         * We first check that passwd redirects to shadow, and then we check shadow.
+         */
+        struct passwd *i;
+        while ((r = fgetpwent_sane(passwd, &i)) > 0) {
+                if (!streq(i->pw_name, "root"))
+                        continue;
+                /* Root redirects to shadow: leave the loop with r > 0 and look at shadow below. */
+                if (streq_ptr(i->pw_passwd, PASSWORD_SEE_SHADOW))
+                        break;
+                log_debug("passwd: root account with non-shadow password found, treating root as configured");
+                return false;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to read %s: %m", filename);
+        if (r == 0) {
+                log_debug("No root account found in %s, assuming root is not configured.", filename);
+                return true;
+        }
+
+        r = xfopenat(dir_fd, "shadow", "re", O_NOFOLLOW, &shadow);
+        if (r == -ENOENT) {
+                log_debug("No shadow file found, assuming root is not configured.");
+                return true; /* missing */
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to access shadow: %m");
+
+        struct spwd *j;
+        while ((r = fgetspent_sane(shadow, &j)) > 0) {
+                if (!streq(j->sp_namp, "root"))
+                        continue;
+
+                bool unprovisioned = streq_ptr(j->sp_pwdp, PASSWORD_UNPROVISIONED);
+                log_debug("Root account found, %s.",
+                          unprovisioned ? "with unprovisioned password, treating root as not configured" :
+                                          "treating root as configured");
+                return unprovisioned;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to read shadow: %m");
+        assert(r == 0);
+        log_debug("No root account found in shadow, assuming root is not configured.");
+        return true;
+}
+
+static bool locale_is_installed_bool(const char *name) {
+        return locale_is_installed(name) > 0; /* bool adapter: any result <= 0 maps to false */
+}
+
+static bool locale_is_ok(int rfd, const char *name) {
+        assert(rfd >= 0);
+        /* Only an exact 0 from dir_fd_is_root() selects the syntax-only check; errors (< 0) go the installed-check way. */
+        return dir_fd_is_root(rfd) == 0 ? locale_is_valid(name) : locale_is_installed_bool(name);
+}
+
+static int prompt_locale(int rfd) {
+        _cleanup_strv_free_ char **locales = NULL;
+        bool acquired_from_creds = false;
+        int r;
+        /* Fills arg_locale/arg_locale_messages from credentials or, if requested, by asking the user. */
+        assert(rfd >= 0);
+
+        if (arg_locale || arg_locale_messages)
+                return 0;
+
+        r = read_credential("firstboot.locale", (void**) &arg_locale, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read credential firstboot.locale, ignoring: %m");
+        else
+                acquired_from_creds = true;
+
+        r = read_credential("firstboot.locale-messages", (void**) &arg_locale_messages, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read credential firstboot.locale-messages, ignoring: %m");
+        else
+                acquired_from_creds = true;
+        /* Either credential on its own is enough to skip the interactive part. */
+        if (acquired_from_creds) {
+                log_debug("Acquired locale from credentials.");
+                return 0;
+        }
+
+        if (!arg_prompt_locale) {
+                log_debug("Prompting for locale was not requested.");
+                return 0;
+        }
+
+        r = get_locales(&locales);
+        if (r < 0)
+                return log_error_errno(r, "Cannot query locales list: %m");
+
+        if (strv_isempty(locales))
+                log_debug("No locales found, skipping locale selection.");
+        else if (strv_length(locales) == 1) {
+
+                if (streq(locales[0], SYSTEMD_DEFAULT_LOCALE))
+                        log_debug("Only installed locale is default locale anyway, not setting locale explicitly.");
+                else {
+                        log_debug("Only a single locale available (%s), selecting it as default.", locales[0]);
+
+                        arg_locale = strdup(locales[0]);
+                        if (!arg_locale)
+                                return log_oom();
+
+                        /* Not setting arg_locale_messages here, since it defaults to LANG anyway */
+                }
+        } else {
+                bool (*is_valid)(const char *name) = dir_fd_is_root(rfd) ? locale_is_installed_bool
+                                                                         : locale_is_valid;
+
+                print_welcome(rfd);
+
+                r = prompt_loop("Please enter system locale name or number",
+                                locales, 60, is_valid, &arg_locale);
+                if (r < 0)
+                        return r;
+                /* The main locale was skipped: do not ask for the message locale either. */
+                if (isempty(arg_locale))
+                        return 0;
+
+                r = prompt_loop("Please enter system message locale name or number",
+                                locales, 60, is_valid, &arg_locale_messages);
+                if (r < 0)
+                        return r;
+
+                /* Suppress the messages setting if it's the same as the main locale anyway */
+                if (streq_ptr(arg_locale, arg_locale_messages))
+                        arg_locale_messages = mfree(arg_locale_messages);
+        }
+
+        return 0;
+}
+
+static int process_locale(int rfd) {
+        _cleanup_close_ int pfd = -EBADF;
+        _cleanup_free_ char *f = NULL;
+        char* locales[3];
+        unsigned i = 0;
+        int r;
+        /* Sets up /etc/locale.conf below rfd. Returns 1 after writing it from arg_locale*, 0 otherwise, < 0 on error. */
+        assert(rfd >= 0);
+
+        pfd = chase_and_open_parent_at(rfd, "/etc/locale.conf",
+                                       CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN|CHASE_NOFOLLOW,
+                                       &f);
+        if (pfd < 0)
+                return log_error_errno(pfd, "Failed to chase /etc/locale.conf: %m");
+
+        r = should_configure(pfd, f);
+        if (r == 0)
+                log_debug("Found /etc/locale.conf, assuming locale information has been configured.");
+        if (r <= 0)
+                return r;
+
+        r = dir_fd_is_root(rfd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if directory file descriptor is root: %m");
+        /* r == 0: rfd is not the host root, so the host's file may be copied into it. A missing host file falls through. */
+        if (arg_copy_locale && r == 0) {
+                r = copy_file_atomic_at(AT_FDCWD, "/etc/locale.conf", pfd, f, 0644, COPY_REFLINK);
+                if (r != -ENOENT) {
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to copy host's /etc/locale.conf: %m");
+
+                        log_info("Copied host's /etc/locale.conf.");
+                        return 0;
+                }
+        }
+
+        r = prompt_locale(rfd);
+        if (r < 0)
+                return r;
+        /* locales[] has room for LANG=, LC_MESSAGES= and the terminating NULL. */
+        if (!isempty(arg_locale))
+                locales[i++] = strjoina("LANG=", arg_locale);
+        if (!isempty(arg_locale_messages) && !streq_ptr(arg_locale_messages, arg_locale))
+                locales[i++] = strjoina("LC_MESSAGES=", arg_locale_messages);
+
+        if (i == 0)
+                return 0;
+
+        locales[i] = NULL;
+
+        r = write_env_file(pfd, f, NULL, locales);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write /etc/locale.conf: %m");
+
+        log_info("/etc/locale.conf written.");
+        return 1;
+}
+
+static bool keymap_exists_bool(const char *name) {
+        return keymap_exists(name) > 0; /* bool adapter: any result <= 0 maps to false */
+}
+
+static typeof(&keymap_is_valid) determine_keymap_validity_func(int rfd) {
+        int r;
+        /* Picks the predicate used to validate an entered keymap name for the tree at rfd. */
+        r = dir_fd_is_root(rfd);
+        if (r < 0)
+                log_debug_errno(r, "Unable to determine if operating on host root directory, assuming we are: %m");
+        /* Both r > 0 and r < 0 select keymap_exists_bool, matching the fallback logged above. */
+        return r != 0 ? keymap_exists_bool : keymap_is_valid;
+}
+
+static int prompt_keymap(int rfd) {
+        _cleanup_strv_free_ char **kmaps = NULL;
+        int r;
+        /* Fills arg_keymap from the firstboot.keymap credential or, if requested, by asking the user. */
+        assert(rfd >= 0);
+
+        if (arg_keymap)
+                return 0;
+
+        r = read_credential("firstboot.keymap", (void**) &arg_keymap, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read credential firstboot.keymap, ignoring: %m");
+        else {
+                log_debug("Acquired keymap from credential.");
+                return 0;
+        }
+
+        if (!arg_prompt_keymap) {
+                log_debug("Prompting for keymap was not requested.");
+                return 0;
+        }
+
+        r = get_keymaps(&kmaps);
+        if (r == -ENOENT) /* no keymaps installed */
+                return log_debug_errno(r, "No keymaps are installed.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to read keymaps: %m");
+        /* -ENOENT above is handed to the caller at debug level only; process_keymap() treats it as success. */
+        print_welcome(rfd);
+
+        return prompt_loop("Please enter system keymap name or number",
+                           kmaps, 60, determine_keymap_validity_func(rfd), &arg_keymap);
+}
+
+static int process_keymap(int rfd) {
+        _cleanup_close_ int pfd = -EBADF;
+        _cleanup_free_ char *f = NULL;
+        char **keymap;
+        int r;
+        /* Sets up /etc/vconsole.conf below rfd. Returns 1 after writing KEYMAP=, 0 otherwise, < 0 on error. */
+        assert(rfd >= 0);
+
+        pfd = chase_and_open_parent_at(rfd, "/etc/vconsole.conf",
+                                       CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN|CHASE_NOFOLLOW,
+                                       &f);
+        if (pfd < 0)
+                return log_error_errno(pfd, "Failed to chase /etc/vconsole.conf: %m");
+
+        r = should_configure(pfd, f);
+        if (r == 0)
+                log_debug("Found /etc/vconsole.conf, assuming console has been configured.");
+        if (r <= 0)
+                return r;
+
+        r = dir_fd_is_root(rfd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if directory file descriptor is root: %m");
+        /* r == 0: rfd is not the host root, so the host's file may be copied into it. A missing host file falls through. */
+        if (arg_copy_keymap && r == 0) {
+                r = copy_file_atomic_at(AT_FDCWD, "/etc/vconsole.conf", pfd, f, 0644, COPY_REFLINK);
+                if (r != -ENOENT) {
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to copy host's /etc/vconsole.conf: %m");
+
+                        log_info("Copied host's /etc/vconsole.conf.");
+                        return 0;
+                }
+        }
+
+        r = prompt_keymap(rfd);
+        if (r == -ENOENT)
+                return 0; /* don't fail if no keymaps are installed */
+        if (r < 0)
+                return r;
+
+        if (isempty(arg_keymap))
+                return 0;
+
+        keymap = STRV_MAKE(strjoina("KEYMAP=", arg_keymap));
+
+        r = write_vconsole_conf(pfd, f, keymap);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write /etc/vconsole.conf: %m");
+
+        log_info("/etc/vconsole.conf written.");
+        return 1;
+}
+
+static bool timezone_is_valid_log_error(const char *name) {
+        return timezone_is_valid(name, LOG_ERR); /* one-argument adapter, so it fits prompt_loop()'s is_valid callback */
+}
+
+static int prompt_timezone(int rfd) {
+        _cleanup_strv_free_ char **zones = NULL;
+        int r;
+        /* Fills arg_timezone from the firstboot.timezone credential or, if requested, by asking the user. */
+        assert(rfd >= 0);
+
+        if (arg_timezone)
+                return 0;
+
+        r = read_credential("firstboot.timezone", (void**) &arg_timezone, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read credential firstboot.timezone, ignoring: %m");
+        else {
+                log_debug("Acquired timezone from credential.");
+                return 0;
+        }
+
+        if (!arg_prompt_timezone) {
+                log_debug("Prompting for timezone was not requested.");
+                return 0;
+        }
+
+        r = get_timezones(&zones);
+        if (r < 0)
+                return log_error_errno(r, "Cannot query timezone list: %m");
+
+        print_welcome(rfd);
+
+        r = prompt_loop("Please enter timezone name or number",
+                        zones, 30, timezone_is_valid_log_error, &arg_timezone);
+        if (r < 0)
+                return r;
+        /* Any non-negative prompt_loop() result is reported as plain 0. */
+        return 0;
+}
+
+static int process_timezone(int rfd) {
+        _cleanup_close_ int pfd = -EBADF;
+        _cleanup_free_ char *f = NULL;
+        const char *e;
+        int r;
+        /* Sets up the /etc/localtime symlink below rfd. Returns 0 on success or when skipped, < 0 on error. */
+        assert(rfd >= 0);
+
+        pfd = chase_and_open_parent_at(rfd, "/etc/localtime",
+                                       CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN|CHASE_NOFOLLOW,
+                                       &f);
+        if (pfd < 0)
+                return log_error_errno(pfd, "Failed to chase /etc/localtime: %m");
+
+        r = should_configure(pfd, f);
+        if (r == 0)
+                log_debug("Found /etc/localtime, assuming timezone has been configured.");
+        if (r <= 0)
+                return r;
+
+        r = dir_fd_is_root(rfd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if directory file descriptor is root: %m");
+        /* r == 0: rfd is not the host root, so the host's link target is reused. A missing host link falls through. */
+        if (arg_copy_timezone && r == 0) {
+                _cleanup_free_ char *s = NULL;
+
+                r = readlink_malloc("/etc/localtime", &s);
+                if (r != -ENOENT) {
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to read host's /etc/localtime: %m");
+
+                        r = symlinkat_atomic_full(s, pfd, f, /* make_relative= */ false);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create /etc/localtime symlink: %m");
+
+                        log_info("Copied host's /etc/localtime.");
+                        return 0;
+                }
+        }
+
+        r = prompt_timezone(rfd);
+        if (r < 0)
+                return r;
+
+        if (isempty(arg_timezone))
+                return 0;
+        /* The link target is written as a relative path: "../usr/share/zoneinfo/" plus the zone name. */
+        e = strjoina("../usr/share/zoneinfo/", arg_timezone);
+
+        r = symlinkat_atomic_full(e, pfd, f, /* make_relative= */ false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /etc/localtime symlink: %m");
+
+        log_info("/etc/localtime written");
+        return 0;
+}
+
+static int prompt_hostname(int rfd) {
+        int r;
+
+        /* Asks for a hostname on the terminal and stores the cleaned-up answer in arg_hostname. */
+        assert(rfd >= 0);
+
+        if (arg_hostname)
+                return 0;
+
+        if (!arg_prompt_hostname) {
+                log_debug("Prompting for hostname was not requested.");
+                return 0;
+        }
+
+        print_welcome(rfd);
+        putchar('\n');
+
+        /* Keep asking until the input is empty (skip), valid, or reading it fails. */
+        for (;;) {
+                _cleanup_free_ char *entered = NULL;
+
+                r = ask_string(&entered, "%s Please enter hostname for new system (empty to skip): ", special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to query hostname: %m");
+
+                if (isempty(entered)) {
+                        log_warning("No hostname entered, skipping.");
+                        return 0;
+                }
+
+                if (hostname_is_valid(entered, VALID_HOSTNAME_TRAILING_DOT)) {
+                        /* Get rid of the trailing dot that we allow, but don't want to see. TAKE_PTR()
+                         * clears the local, so the cleanup handler leaves the stored string alone. */
+                        arg_hostname = hostname_cleanup(TAKE_PTR(entered));
+                        return 0;
+                }
+
+                /* Invalid input: complain and ask again. */
+                log_error("Specified hostname invalid.");
+        }
+}
+
+static int process_hostname(int rfd) {
+        _cleanup_close_ int pfd = -EBADF;
+        _cleanup_free_ char *f = NULL;
+        int r;
+        /* Writes arg_hostname to /etc/hostname below rfd unless the file is already configured. */
+        assert(rfd >= 0);
+        /* NOTE(review): CHASE_NOFOLLOW is not passed here, unlike the other chase calls in this file — confirm intended. */
+        pfd = chase_and_open_parent_at(rfd, "/etc/hostname",
+                                       CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN,
+                                       &f);
+        if (pfd < 0)
+                return log_error_errno(pfd, "Failed to chase /etc/hostname: %m");
+
+        r = should_configure(pfd, f);
+        if (r == 0)
+                log_debug("Found /etc/hostname, assuming hostname has been configured.");
+        if (r <= 0)
+                return r;
+
+        r = prompt_hostname(rfd);
+        if (r < 0)
+                return r;
+
+        if (isempty(arg_hostname))
+                return 0;
+
+        r = write_string_file_at(pfd, f, arg_hostname,
+                                 WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_SYNC|WRITE_STRING_FILE_ATOMIC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write /etc/hostname: %m");
+
+        log_info("/etc/hostname written.");
+        return 0;
+}
+
+static int process_machine_id(int rfd) {
+        _cleanup_close_ int pfd = -EBADF;
+        _cleanup_free_ char *f = NULL;
+        int r;
+        /* Writes arg_machine_id to /etc/machine-id below rfd. Returns 0 on success or when skipped, < 0 on error. */
+        assert(rfd >= 0);
+
+        pfd = chase_and_open_parent_at(rfd, "/etc/machine-id",
+                                       CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN|CHASE_NOFOLLOW,
+                                       &f);
+        if (pfd < 0)
+                return log_error_errno(pfd, "Failed to chase /etc/machine-id: %m");
+
+        r = should_configure(pfd, f);
+        if (r == 0)
+                log_debug("Found /etc/machine-id, assuming machine-id has been configured.");
+        if (r <= 0)
+                return r;
+
+        if (sd_id128_is_null(arg_machine_id)) {
+                log_debug("Initialization of machine-id was not requested, skipping.");
+                return 0;
+        }
+        /* Write through the chased name f, i.e. the very entry should_configure() examined above. */
+        r = write_string_file_at(pfd, f, SD_ID128_TO_STRING(arg_machine_id),
+                                 WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_SYNC|WRITE_STRING_FILE_ATOMIC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write /etc/machine-id: %m");
+
+        log_info("/etc/machine-id written.");
+        return 0;
+}
+
+static int prompt_root_password(int rfd) {
+        const char *msg1, *msg2;
+        int r;
+        /* Fills arg_root_password from a credential or, if requested, by asking twice on the terminal. */
+        assert(rfd >= 0);
+
+        if (arg_root_password)
+                return 0;
+
+        if (get_credential_user_password("root", &arg_root_password, &arg_root_password_is_hashed) >= 0)
+                return 0;
+
+        if (!arg_prompt_root_password) {
+                log_debug("Prompting for root password was not requested.");
+                return 0;
+        }
+
+        print_welcome(rfd);
+        putchar('\n');
+
+        msg1 = strjoina(special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET), " Please enter a new root password (empty to skip):");
+        msg2 = strjoina(special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET), " Please enter new root password again:");
+
+        suggest_passwords();
+
+        for (;;) {
+                _cleanup_strv_free_erase_ char **a = NULL, **b = NULL;
+                _cleanup_free_ char *error = NULL;
+
+                r = ask_password_tty(-1, msg1, NULL, 0, 0, NULL, &a);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to query root password: %m");
+                if (strv_length(a) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Received multiple passwords, where we expected one.");
+
+                if (isempty(*a)) {
+                        log_warning("No password entered, skipping.");
+                        break;
+                }
+                /* The quality check only warns: weak or uncheckable passwords are accepted, other errors abort. */
+                r = check_password_quality(*a, /* old */ NULL, "root", &error);
+                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        log_warning("Password quality check is not supported, proceeding anyway.");
+                else if (r < 0)
+                        return log_error_errno(r, "Failed to check password quality: %m");
+                else if (r == 0)
+                        log_warning("Password is weak, accepting anyway: %s", error);
+
+                r = ask_password_tty(-1, msg2, NULL, 0, 0, NULL, &b);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to query root password: %m");
+                if (strv_length(b) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Received multiple passwords, where we expected one.");
+
+                if (!streq(*a, *b)) {
+                        log_error("Entered passwords did not match, please try again.");
+                        continue;
+                }
+                /* Move the string out of the vector, so the vector's cleanup no longer covers it. */
+                arg_root_password = TAKE_PTR(*a);
+                break;
+        }
+
+        return 0;
+}
+
+static int find_shell(int rfd, const char *path) {
+        int q;
+
+        assert(path);
+
+        /* Accepts path only if valid_shell() approves of it and chaseat() can resolve it below rfd. */
+        if (valid_shell(path)) {
+                q = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT, NULL, NULL);
+                if (q < 0)
+                        return log_error_errno(q, "Failed to resolve shell %s: %m", path);
+                return 0;
+        }
+        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not a valid shell", path);
+}
+
+static int prompt_root_shell(int rfd) {
+        int q;
+
+        assert(rfd >= 0);
+
+        /* Settles arg_root_shell: an already set value wins, then the credential, then an interactive prompt. */
+        if (arg_root_shell)
+                return 0;
+
+        q = read_credential("passwd.shell.root", (void**) &arg_root_shell, NULL);
+        if (q >= 0) {
+                log_debug("Acquired root shell from credential.");
+                return 0;
+        }
+
+        /* Failing to read the credential is not fatal: log it and fall back to prompting, if allowed. */
+        log_debug_errno(q, "Failed to read credential passwd.shell.root, ignoring: %m");
+
+        if (!arg_prompt_root_shell) {
+                log_debug("Prompting for root shell was not requested.");
+                return 0;
+        }
+
+        print_welcome(rfd);
+        putchar('\n');
+
+        for (;;) {
+                _cleanup_free_ char *answer = NULL;
+
+                q = ask_string(&answer, "%s Please enter root shell for new system (empty to skip): ", special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET));
+                if (q < 0)
+                        return log_error_errno(q, "Failed to query root shell: %m");
+
+                if (isempty(answer)) {
+                        log_warning("No shell entered, skipping.");
+                        return 0;
+                }
+
+                /* find_shell() logs the reason for a rejection itself, so simply ask again. */
+                if (find_shell(rfd, answer) < 0)
+                        continue;
+
+                arg_root_shell = TAKE_PTR(answer);
+                return 0;
+        }
+}
+
+static int write_root_passwd(int rfd, int etc_fd, const char *password, const char *shell) {
+        _cleanup_fclose_ FILE *original = NULL, *passwd = NULL;
+        _cleanup_(unlink_and_freep) char *passwd_tmp = NULL;
+        int r;
+
+        assert(password);
+        /* Rewrites "passwd" in etc_fd via a temporary file: root's entry gets password (and shell, if set). */
+        r = fopen_temporary_at_label(etc_fd, "passwd", "passwd", &passwd, &passwd_tmp);
+        if (r < 0)
+                return r;
+
+        r = xfopenat(etc_fd, "passwd", "re", O_NOFOLLOW, &original);
+        if (r < 0 && r != -ENOENT)
+                return r;
+
+        if (original) {
+                struct passwd *i;
+
+                r = copy_rights(fileno(original), fileno(passwd));
+                if (r < 0)
+                        return r;
+
+                while ((r = fgetpwent_sane(original, &i)) > 0) {
+                        /* Only root's entry is altered; every other entry is written back as parsed. */
+                        if (streq(i->pw_name, "root")) {
+                                i->pw_passwd = (char *) password;
+                                if (shell)
+                                        i->pw_shell = (char *) shell;
+                        }
+
+                        r = putpwent_sane(i, passwd);
+                        if (r < 0)
+                                return r;
+                }
+                if (r < 0)
+                        return r;
+
+        } else {
+                struct passwd root = {
+                        .pw_name = (char *) "root",
+                        .pw_passwd = (char *) password,
+                        .pw_uid = 0,
+                        .pw_gid = 0,
+                        .pw_gecos = (char *) "Super User",
+                        .pw_dir = (char *) "/root",
+                        .pw_shell = (char *) (shell ?: default_root_shell_at(rfd)),
+                };
+
+                /* original is NULL only when xfopenat() returned -ENOENT above; every other failure
+                 * already returned. errno must not be consulted here: xfopenat() reports errors through
+                 * its return value and default_root_shell_at() above may have overwritten errno. */
+                r = fchmod(fileno(passwd), 0644);
+                if (r < 0)
+                        return -errno;
+
+                r = putpwent_sane(&root, passwd);
+                if (r < 0)
+                        return r;
+        }
+
+        r = fflush_sync_and_check(passwd);
+        if (r < 0)
+                return r;
+
+        r = renameat_and_apply_smack_floor_label(etc_fd, passwd_tmp, etc_fd, "passwd");
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int write_root_shadow(int etc_fd, const char *hashed_password) {
+        _cleanup_fclose_ FILE *original = NULL, *shadow = NULL;
+        _cleanup_(unlink_and_freep) char *shadow_tmp = NULL;
+        int r;
+
+        assert(hashed_password);
+        /* Rewrites "shadow" in etc_fd via a temporary file: root's entry gets hashed_password. */
+        r = fopen_temporary_at_label(etc_fd, "shadow", "shadow", &shadow, &shadow_tmp);
+        if (r < 0)
+                return r;
+
+        r = xfopenat(etc_fd, "shadow", "re", O_NOFOLLOW, &original);
+        if (r < 0 && r != -ENOENT)
+                return r;
+
+        if (original) {
+                struct spwd *i;
+
+                r = copy_rights(fileno(original), fileno(shadow));
+                if (r < 0)
+                        return r;
+
+                while ((r = fgetspent_sane(original, &i)) > 0) {
+                        /* Only root's entry is altered: new password field and last-change day set to today. */
+                        if (streq(i->sp_namp, "root")) {
+                                i->sp_pwdp = (char *) hashed_password;
+                                i->sp_lstchg = (long) (now(CLOCK_REALTIME) / USEC_PER_DAY);
+                        }
+
+                        r = putspent_sane(i, shadow);
+                        if (r < 0)
+                                return r;
+                }
+                if (r < 0)
+                        return r;
+
+        } else {
+                struct spwd root = {
+                        .sp_namp = (char*) "root",
+                        .sp_pwdp = (char *) hashed_password,
+                        .sp_lstchg = (long) (now(CLOCK_REALTIME) / USEC_PER_DAY),
+                        .sp_min = -1,
+                        .sp_max = -1,
+                        .sp_warn = -1,
+                        .sp_inact = -1,
+                        .sp_expire = -1,
+                        .sp_flag = ULONG_MAX, /* this appears to be what everybody does ... */
+                };
+
+                /* original is NULL only when xfopenat() returned -ENOENT above; every other failure
+                 * already returned. errno must not be consulted here: xfopenat() reports errors through
+                 * its return value, and errno may have been overwritten since (e.g. by now() above). */
+                r = fchmod(fileno(shadow), 0000);
+                if (r < 0)
+                        return -errno;
+
+                r = putspent_sane(&root, shadow);
+                if (r < 0)
+                        return r;
+        }
+
+        r = fflush_sync_and_check(shadow);
+        if (r < 0)
+                return r;
+
+        r = renameat_and_apply_smack_floor_label(etc_fd, shadow_tmp, etc_fd, "shadow");
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int process_root_account(int rfd) {
+        _cleanup_close_ int pfd = -EBADF;
+        _cleanup_(release_lock_file) LockFile lock = LOCK_FILE_INIT;
+        _cleanup_(erase_and_freep) char *_hashed_password = NULL;
+        const char *password, *hashed_password;
+        int k = 0, r;
+
+        assert(rfd >= 0);
+        /* Sets up root's entries in the passwd and shadow files below rfd, unless configured or not requested. */
+        pfd = chase_and_open_parent_at(rfd, "/etc/passwd",
+                                       CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN|CHASE_NOFOLLOW,
+                                       NULL);
+        if (pfd < 0)
+                return log_error_errno(pfd, "Failed to chase /etc/passwd: %m");
+
+        /* Ensure that passwd and shadow are in the same directory and are not symlinks. */
+
+        FOREACH_STRING(s, "passwd", "shadow") {
+                r = verify_regular_at(pfd, s, /* follow = */ false);
+                if (IN_SET(r, -EISDIR, -ELOOP, -EBADFD))
+                        return log_error_errno(r, "/etc/%s is not a regular file", s);
+                if (r < 0 && r != -ENOENT)
+                        return log_error_errno(r, "Failed to check whether /etc/%s is a regular file: %m", s);
+
+                r = should_configure(pfd, s);
+                if (r < 0)
+                        return r;
+
+                k += r;
+        }
+        /* k summed the should_configure() results for both files; zero means there is nothing to do. */
+        if (k == 0) {
+                log_debug("Found /etc/passwd and /etc/shadow, assuming root account has been initialized.");
+                return 0;
+        }
+
+        /* Don't create/modify passwd and shadow if not asked */
+        if (!(arg_root_password || arg_prompt_root_password || arg_copy_root_password || arg_delete_root_password ||
+              arg_root_shell || arg_prompt_root_shell || arg_copy_root_shell)) {
+                log_debug("Initialization of root account was not requested, skipping.");
+                return 0;
+        }
+
+        r = make_lock_file_at(pfd, ETC_PASSWD_LOCK_FILENAME, LOCK_EX, &lock);
+        if (r < 0)
+                return log_error_errno(r, "Failed to take a lock on /etc/passwd: %m");
+        /* From here on k holds dir_fd_is_root(rfd); the copies from the host below only run when it is 0. */
+        k = dir_fd_is_root(rfd);
+        if (k < 0)
+                return log_error_errno(k, "Failed to check if directory file descriptor is root: %m");
+
+        if (arg_copy_root_shell && k == 0) {
+                struct passwd *p;
+
+                errno = 0;
+                p = getpwnam("root");
+                if (!p)
+                        return log_error_errno(errno_or_else(EIO), "Failed to find passwd entry for root: %m");
+
+                r = free_and_strdup(&arg_root_shell, p->pw_shell);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        r = prompt_root_shell(rfd);
+        if (r < 0)
+                return r;
+
+        if (arg_copy_root_password && k == 0) {
+                struct spwd *p;
+
+                errno = 0;
+                p = getspnam("root");
+                if (!p)
+                        return log_error_errno(errno_or_else(EIO), "Failed to find shadow entry for root: %m");
+
+                r = free_and_strdup(&arg_root_password, p->sp_pwdp);
+                if (r < 0)
+                        return log_oom();
+
+                arg_root_password_is_hashed = true;
+        }
+
+        r = prompt_root_password(rfd);
+        if (r < 0)
+                return r;
+        /* Choose the passwd and shadow password fields: given hash, fresh hash, deleted, or locked. */
+        if (arg_root_password && arg_root_password_is_hashed) {
+                password = PASSWORD_SEE_SHADOW;
+                hashed_password = arg_root_password;
+        } else if (arg_root_password) {
+                r = hash_password(arg_root_password, &_hashed_password);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to hash password: %m");
+
+                password = PASSWORD_SEE_SHADOW;
+                hashed_password = _hashed_password;
+
+        } else if (arg_delete_root_password)
+                password = hashed_password = PASSWORD_NONE;
+        else
+                password = hashed_password = PASSWORD_LOCKED_AND_INVALID;
+
+        r = write_root_passwd(rfd, pfd, password, arg_root_shell);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write /etc/passwd: %m");
+
+        log_info("/etc/passwd written.");
+
+        r = write_root_shadow(pfd, hashed_password);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write /etc/shadow: %m");
+
+        log_info("/etc/shadow written.");
+        return 0;
+}
+
+static int process_kernel_cmdline(int rfd) {
+        _cleanup_close_ int dir_fd = -EBADF;
+        _cleanup_free_ char *fname = NULL;
+        int q;
+
+        assert(rfd >= 0);
+        /* Writes arg_kernel_cmdline to /etc/kernel/cmdline below rfd, unless should_configure() declines. */
+        dir_fd = chase_and_open_parent_at(rfd, "/etc/kernel/cmdline",
+                                          CHASE_AT_RESOLVE_IN_ROOT|CHASE_MKDIR_0755|CHASE_WARN|CHASE_NOFOLLOW,
+                                          &fname);
+        if (dir_fd < 0)
+                return log_error_errno(dir_fd, "Failed to chase /etc/kernel/cmdline: %m");
+
+        q = should_configure(dir_fd, fname);
+        if (q < 0)
+                return q;
+        if (q == 0) {
+                log_debug("Found /etc/kernel/cmdline, assuming kernel command line has been configured.");
+                return 0;
+        }
+        if (!arg_kernel_cmdline) {
+                log_debug("Creation of /etc/kernel/cmdline was not requested, skipping.");
+                return 0;
+        }
+
+        q = write_string_file_at(dir_fd, "cmdline", arg_kernel_cmdline,
+                                 WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_SYNC|WRITE_STRING_FILE_ATOMIC);
+        if (q < 0)
+                return log_error_errno(q, "Failed to write /etc/kernel/cmdline: %m");
+        log_info("/etc/kernel/cmdline written.");
+        return 0;
+}
+
+static int reset_one(int rfd, const char *path) {
+        _cleanup_close_ int dir_fd = -EBADF;
+        _cleanup_free_ char *fname = NULL;
+
+        assert(rfd >= 0);
+        assert(path);
+
+        /* Unlinks path as resolved below rfd; a path that does not exist counts as success. */
+        dir_fd = chase_and_open_parent_at(rfd, path, CHASE_AT_RESOLVE_IN_ROOT|CHASE_WARN|CHASE_NOFOLLOW, &fname);
+        if (dir_fd < 0)
+                return dir_fd == -ENOENT ? 0 : log_error_errno(dir_fd, "Failed to resolve %s: %m", path);
+
+        if (unlinkat(dir_fd, fname, 0) >= 0) {
+                log_info("Removed %s", path);
+                return 0;
+        }
+
+        return errno == ENOENT ? 0 : log_error_errno(errno, "Failed to remove %s: %m", path);
+}
+
+static int process_reset(int rfd) {
+        assert(rfd >= 0);
+
+        /* When arg_reset is set, hands each listed file to reset_one(); the first failure is returned. */
+        if (!arg_reset)
+                return 0;
+
+        FOREACH_STRING(file,
+                       "/etc/locale.conf",
+                       "/etc/vconsole.conf",
+                       "/etc/hostname",
+                       "/etc/machine-id",
+                       "/etc/kernel/cmdline",
+                       "/etc/localtime") {
+                int q = reset_one(rfd, file);
+
+                if (q < 0)
+                        return q;
+        }
+
+        return 0;
+}
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout; link is the man page reference from terminal_urlify_man(). */
+        r = terminal_urlify_man("systemd-firstboot", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "Configures basic settings of the system.\n\n"
+               "  -h --help                       Show this help\n"
+               "     --version                    Show package version\n"
+               "     --root=PATH                  Operate on an alternate filesystem root\n"
+               "     --image=PATH                 Operate on disk image as filesystem root\n"
+               "     --image-policy=POLICY        Specify disk image dissection policy\n"
+               "     --locale=LOCALE              Set primary locale (LANG=)\n"
+               "     --locale-messages=LOCALE     Set message locale (LC_MESSAGES=)\n"
+               "     --keymap=KEYMAP              Set keymap\n"
+               "     --timezone=TIMEZONE          Set timezone\n"
+               "     --hostname=NAME              Set hostname\n"
+               "     --setup-machine-id           Set a random machine ID\n"
+               "     --machine-id=ID              Set specified machine ID\n"
+               "     --root-password=PASSWORD     Set root password from plaintext password\n"
+               "     --root-password-file=FILE    Set root password from file\n"
+               "     --root-password-hashed=HASH  Set root password from hashed password\n"
+               "     --root-shell=SHELL           Set root shell\n"
+               "     --kernel-command-line=CMDLINE\n"
+               "                                  Set kernel command line\n"
+               "     --prompt-locale              Prompt the user for locale settings\n"
+               "     --prompt-keymap              Prompt the user for keymap settings\n"
+               "     --prompt-timezone            Prompt the user for timezone\n"
+               "     --prompt-hostname            Prompt the user for hostname\n"
+               "     --prompt-root-password       Prompt the user for root password\n"
+               "     --prompt-root-shell          Prompt the user for root shell\n"
+               "     --prompt                     Prompt for all of the above\n"
+               "     --copy-locale                Copy locale from host\n"
+               "     --copy-keymap                Copy keymap from host\n"
+               "     --copy-timezone              Copy timezone from host\n"
+               "     --copy-root-password         Copy root password from host\n"
+               "     --copy-root-shell            Copy root shell from host\n"
+               "     --copy                       Copy all of the above\n"
+               "     --force                      Overwrite existing files\n"
+               "     --delete-root-password       Delete root password\n"
+               "     --welcome=no                 Disable the welcome text\n"
+               "     --reset                      Remove existing files\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses argv into the arg_* settings; returns 1 on success, the result of help()/version(), or < 0. */
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_ROOT,
+                ARG_IMAGE,
+                ARG_IMAGE_POLICY,
+                ARG_LOCALE,
+                ARG_LOCALE_MESSAGES,
+                ARG_KEYMAP,
+                ARG_TIMEZONE,
+                ARG_HOSTNAME,
+                ARG_SETUP_MACHINE_ID,
+                ARG_MACHINE_ID,
+                ARG_ROOT_PASSWORD,
+                ARG_ROOT_PASSWORD_FILE,
+                ARG_ROOT_PASSWORD_HASHED,
+                ARG_ROOT_SHELL,
+                ARG_KERNEL_COMMAND_LINE,
+                ARG_PROMPT,
+                ARG_PROMPT_LOCALE,
+                ARG_PROMPT_KEYMAP,
+                ARG_PROMPT_TIMEZONE,
+                ARG_PROMPT_HOSTNAME,
+                ARG_PROMPT_ROOT_PASSWORD,
+                ARG_PROMPT_ROOT_SHELL,
+                ARG_COPY,
+                ARG_COPY_LOCALE,
+                ARG_COPY_KEYMAP,
+                ARG_COPY_TIMEZONE,
+                ARG_COPY_ROOT_PASSWORD,
+                ARG_COPY_ROOT_SHELL,
+                ARG_FORCE,
+                ARG_DELETE_ROOT_PASSWORD,
+                ARG_WELCOME,
+                ARG_RESET,
+        };
+
+        static const struct option options[] = {
+                { "help",                    no_argument,       NULL, 'h'                         },
+                { "version",                 no_argument,       NULL, ARG_VERSION                 },
+                { "root",                    required_argument, NULL, ARG_ROOT                    },
+                { "image",                   required_argument, NULL, ARG_IMAGE                   },
+                { "image-policy",            required_argument, NULL, ARG_IMAGE_POLICY            },
+                { "locale",                  required_argument, NULL, ARG_LOCALE                  },
+                { "locale-messages",         required_argument, NULL, ARG_LOCALE_MESSAGES         },
+                { "keymap",                  required_argument, NULL, ARG_KEYMAP                  },
+                { "timezone",                required_argument, NULL, ARG_TIMEZONE                },
+                { "hostname",                required_argument, NULL, ARG_HOSTNAME                },
+                { "setup-machine-id",        no_argument,       NULL, ARG_SETUP_MACHINE_ID        },
+                { "machine-id",              required_argument, NULL, ARG_MACHINE_ID              },
+                { "root-password",           required_argument, NULL, ARG_ROOT_PASSWORD           },
+                { "root-password-file",      required_argument, NULL, ARG_ROOT_PASSWORD_FILE      },
+                { "root-password-hashed",    required_argument, NULL, ARG_ROOT_PASSWORD_HASHED    },
+                { "root-shell",              required_argument, NULL, ARG_ROOT_SHELL              },
+                { "kernel-command-line",     required_argument, NULL, ARG_KERNEL_COMMAND_LINE     },
+                { "prompt",                  no_argument,       NULL, ARG_PROMPT                  },
+                { "prompt-locale",           no_argument,       NULL, ARG_PROMPT_LOCALE           },
+                { "prompt-keymap",           no_argument,       NULL, ARG_PROMPT_KEYMAP           },
+                { "prompt-timezone",         no_argument,       NULL, ARG_PROMPT_TIMEZONE         },
+                { "prompt-hostname",         no_argument,       NULL, ARG_PROMPT_HOSTNAME         },
+                { "prompt-root-password",    no_argument,       NULL, ARG_PROMPT_ROOT_PASSWORD    },
+                { "prompt-root-shell",       no_argument,       NULL, ARG_PROMPT_ROOT_SHELL       },
+                { "copy",                    no_argument,       NULL, ARG_COPY                    },
+                { "copy-locale",             no_argument,       NULL, ARG_COPY_LOCALE             },
+                { "copy-keymap",             no_argument,       NULL, ARG_COPY_KEYMAP             },
+                { "copy-timezone",           no_argument,       NULL, ARG_COPY_TIMEZONE           },
+                { "copy-root-password",      no_argument,       NULL, ARG_COPY_ROOT_PASSWORD      },
+                { "copy-root-shell",         no_argument,       NULL, ARG_COPY_ROOT_SHELL         },
+                { "force",                   no_argument,       NULL, ARG_FORCE                   },
+                { "delete-root-password",    no_argument,       NULL, ARG_DELETE_ROOT_PASSWORD    },
+                { "welcome",                 required_argument, NULL, ARG_WELCOME                 },
+                { "reset",                   no_argument,       NULL, ARG_RESET                   },
+                {}
+        };
+
+        int r, c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_ROOT:
+                        r = parse_path_argument(optarg, true, &arg_root);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE:
+                        r = parse_path_argument(optarg, false, &arg_image);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_IMAGE_POLICY:
+                        r = parse_image_policy_argument(optarg, &arg_image_policy);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_LOCALE:
+                        r = free_and_strdup(&arg_locale, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_LOCALE_MESSAGES:
+                        r = free_and_strdup(&arg_locale_messages, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_KEYMAP:
+                        if (!keymap_is_valid(optarg))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Keymap %s is not valid.", optarg);
+
+                        r = free_and_strdup(&arg_keymap, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_TIMEZONE:
+                        if (!timezone_is_valid(optarg, LOG_ERR))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Timezone %s is not valid.", optarg);
+
+                        r = free_and_strdup(&arg_timezone, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_ROOT_PASSWORD:
+                        r = free_and_strdup(&arg_root_password, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        arg_root_password_is_hashed = false;
+                        break;
+
+                case ARG_ROOT_PASSWORD_FILE:
+                        arg_root_password = mfree(arg_root_password); /* discard any value set by an earlier option */
+
+                        r = read_one_line_file(optarg, &arg_root_password);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to read %s: %m", optarg);
+
+                        arg_root_password_is_hashed = false;
+                        break;
+
+                case ARG_ROOT_PASSWORD_HASHED:
+                        r = free_and_strdup(&arg_root_password, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        arg_root_password_is_hashed = true;
+                        break;
+
+                case ARG_ROOT_SHELL:
+                        r = free_and_strdup(&arg_root_shell, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_HOSTNAME:
+                        if (!hostname_is_valid(optarg, VALID_HOSTNAME_TRAILING_DOT))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Host name %s is not valid.", optarg);
+
+                        r = free_and_strdup(&arg_hostname, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        hostname_cleanup(arg_hostname);
+                        break;
+
+                case ARG_SETUP_MACHINE_ID:
+                        r = sd_id128_randomize(&arg_machine_id);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to generate randomized machine ID: %m");
+
+                        break;
+
+                case ARG_MACHINE_ID:
+                        r = sd_id128_from_string(optarg, &arg_machine_id);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse machine id %s.", optarg);
+
+                        break;
+
+                case ARG_KERNEL_COMMAND_LINE:
+                        r = free_and_strdup(&arg_kernel_cmdline, optarg);
+                        if (r < 0)
+                                return log_oom();
+
+                        break;
+
+                case ARG_PROMPT:
+                        arg_prompt_locale = arg_prompt_keymap = arg_prompt_timezone = arg_prompt_hostname =
+                                arg_prompt_root_password = arg_prompt_root_shell = true;
+                        break;
+
+                case ARG_PROMPT_LOCALE:
+                        arg_prompt_locale = true;
+                        break;
+
+                case ARG_PROMPT_KEYMAP:
+                        arg_prompt_keymap = true;
+                        break;
+
+                case ARG_PROMPT_TIMEZONE:
+                        arg_prompt_timezone = true;
+                        break;
+
+                case ARG_PROMPT_HOSTNAME:
+                        arg_prompt_hostname = true;
+                        break;
+
+                case ARG_PROMPT_ROOT_PASSWORD:
+                        arg_prompt_root_password = true;
+                        break;
+
+                case ARG_PROMPT_ROOT_SHELL:
+                        arg_prompt_root_shell = true;
+                        break;
+
+                case ARG_COPY:
+                        arg_copy_locale = arg_copy_keymap = arg_copy_timezone = arg_copy_root_password =
+                                arg_copy_root_shell = true;
+                        break;
+
+                case ARG_COPY_LOCALE:
+                        arg_copy_locale = true;
+                        break;
+
+                case ARG_COPY_KEYMAP:
+                        arg_copy_keymap = true;
+                        break;
+
+                case ARG_COPY_TIMEZONE:
+                        arg_copy_timezone = true;
+                        break;
+
+                case ARG_COPY_ROOT_PASSWORD:
+                        arg_copy_root_password = true;
+                        break;
+
+                case ARG_COPY_ROOT_SHELL:
+                        arg_copy_root_shell = true;
+                        break;
+
+                case ARG_FORCE:
+                        arg_force = true;
+                        break;
+
+                case ARG_DELETE_ROOT_PASSWORD:
+                        arg_delete_root_password = true;
+                        break;
+
+                case ARG_WELCOME:
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --welcome= argument: %s", optarg);
+
+                        arg_welcome = r;
+                        break;
+
+                case ARG_RESET:
+                        arg_reset = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        /* All options consumed; reject combinations that contradict each other. */
+        if (arg_delete_root_password && (arg_copy_root_password || arg_root_password || arg_prompt_root_password))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--delete-root-password cannot be combined with other root password options.");
+
+        if (arg_image && arg_root)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--root= and --image= cannot be used together.");
+
+        if (!sd_id128_is_null(arg_machine_id) && !(arg_image || arg_root) && !arg_force)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "--machine-id=/--setup-machine-id only works with --root= or --image=.");
+
+        return 1;
+}
+
+static int reload_system_manager(sd_bus **bus) {
+        int q;
+
+        assert(bus);
+
+        /* Connects to the system manager if *bus is still unset, then asks it to reload. */
+        if (*bus == NULL) {
+                q = bus_connect_transport_systemd(BUS_TRANSPORT_LOCAL, NULL, RUNTIME_SCOPE_SYSTEM, bus);
+                if (q < 0)
+                        return bus_log_connect_error(q, BUS_TRANSPORT_LOCAL);
+        }
+
+        q = bus_service_manager_reload(*bus);
+        if (q >= 0)
+                log_info("Requested manager reload to apply locale configuration.");
+
+        return q < 0 ? q : 0;
+}
+
+static int reload_vconsole(sd_bus **bus) {
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *answer = NULL;
+        _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *jobs = NULL;
+        const char *job_path;
+        int q;
+
+        assert(bus);
+        /* Restarts systemd-vconsole-setup.service through the manager on *bus and waits for that job. */
+        if (*bus == NULL) {
+                q = bus_connect_transport_systemd(BUS_TRANSPORT_LOCAL, NULL, RUNTIME_SCOPE_SYSTEM, bus);
+                if (q < 0)
+                        return bus_log_connect_error(q, BUS_TRANSPORT_LOCAL);
+        }
+
+        q = bus_wait_for_jobs_new(*bus, &jobs);
+        if (q < 0)
+                return log_error_errno(q, "Could not watch jobs: %m");
+
+        q = bus_call_method(*bus, bus_systemd_mgr, "RestartUnit", &bus_error, &answer,
+                            "ss", "systemd-vconsole-setup.service", "replace");
+        if (q < 0)
+                return log_error_errno(q, "Failed to issue method call: %s", bus_error_message(&bus_error, q));
+
+        q = sd_bus_message_read(answer, "o", &job_path);
+        if (q < 0)
+                return bus_log_parse_error(q);
+
+        q = bus_wait_for_jobs_one(jobs, job_path, false, NULL);
+        if (q >= 0)
+                return 0;
+        return log_error_errno(q, "Failed to wait for systemd-vconsole-setup.service/restart: %m");
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+        _cleanup_(umount_and_freep) char *mounted_dir = NULL;
+        _cleanup_close_ int rfd = -EBADF;
+        int r;
+
+        /* Applies the first-boot settings to the host, or to the tree selected with --root=/--image=. */
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        log_setup();
+        umask(0022);
+
+        /* "Offline" means we operate on a foreign tree/image rather than on the running host. */
+        bool offline = arg_root || arg_image;
+
+        if (!offline) {
+                /* If we are called without --root=/--image= let's honour the systemd.firstboot kernel
+                 * command line option, because we are called to provision the host with basic settings (as
+                 * opposed to some other file system tree/image) */
+                bool enabled;
+                r = proc_cmdline_get_bool("systemd.firstboot", /* flags = */ 0, &enabled);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse systemd.firstboot= kernel command line argument: %m");
+                if (r > 0 && !enabled) {
+                        log_debug("Found systemd.firstboot=no kernel command line argument, terminating.");
+                        return 0; /* disabled */
+                }
+        }
+
+        if (arg_image) {
+                assert(!arg_root);
+                /* Dissect and mount the image; the call fills in the mount point, a root fd and the loop device. */
+                r = mount_image_privately_interactively(
+                                arg_image,
+                                arg_image_policy,
+                                DISSECT_IMAGE_GENERIC_ROOT |
+                                DISSECT_IMAGE_REQUIRE_ROOT |
+                                DISSECT_IMAGE_VALIDATE_OS |
+                                DISSECT_IMAGE_RELAX_VAR_CHECK |
+                                DISSECT_IMAGE_FSCK |
+                                DISSECT_IMAGE_GROWFS,
+                                &mounted_dir,
+                                &rfd,
+                                &loop_device);
+                if (r < 0)
+                        return r;
+
+                arg_root = strdup(mounted_dir);
+                if (!arg_root)
+                        return log_oom();
+        } else {
+                rfd = open(empty_to_root(arg_root), O_DIRECTORY|O_CLOEXEC);
+                if (rfd < 0)
+                        return log_error_errno(errno, "Failed to open %s: %m", empty_to_root(arg_root));
+        }
+
+        LOG_SET_PREFIX(arg_image ?: arg_root);
+
+        /* We check these conditions here instead of in parse_argv() so that we can take the root directory
+         * into account. */
+
+        if (arg_keymap && !determine_keymap_validity_func(rfd)(arg_keymap))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Keymap %s is not installed.", arg_keymap);
+        if (arg_locale && !locale_is_ok(rfd, arg_locale))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Locale %s is not installed.", arg_locale);
+        if (arg_locale_messages && !locale_is_ok(rfd, arg_locale_messages))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Locale %s is not installed.", arg_locale_messages);
+
+        if (arg_root_shell) {
+                r = find_shell(rfd, arg_root_shell);
+                if (r < 0)
+                        return r;
+        }
+
+        r = process_reset(rfd);
+        if (r < 0)
+                return r;
+
+        r = process_locale(rfd);
+        if (r < 0)
+                return r;
+        if (r > 0 && !offline)
+                (void) reload_system_manager(&bus); /* best effort, only on the running host */
+
+        r = process_keymap(rfd);
+        if (r < 0)
+                return r;
+        if (r > 0 && !offline)
+                (void) reload_vconsole(&bus); /* best effort, only on the running host */
+
+        r = process_timezone(rfd);
+        if (r < 0)
+                return r;
+
+        r = process_hostname(rfd);
+        if (r < 0)
+                return r;
+
+        r = process_machine_id(rfd);
+        if (r < 0)
+                return r;
+
+        r = process_root_account(rfd);
+        if (r < 0)
+                return r;
+
+        r = process_kernel_cmdline(rfd);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/firstboot/meson.build b/src/firstboot/meson.build
new file mode 100644
index 0000000..28c1d27
--- /dev/null
+++ b/src/firstboot/meson.build
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-firstboot',
+                'public' : true,
+                'conditions' : ['ENABLE_FIRSTBOOT'], # presumably gates the build on this option -- confirm in the top-level meson.build
+                'sources' : files('firstboot.c'),
+                'dependencies' : libcrypt,
+        },
+]
diff --git a/src/fsck/fsck.c b/src/fsck/fsck.c
new file mode 100644
index 0000000..000ed69
--- /dev/null
+++ b/src/fsck/fsck.c
@@ -0,0 +1,420 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+  Copyright © 2014 Holger Hans Peter Freyther
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sd-bus.h"
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-util.h"
+#include "device-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "fsck-util.h"
+#include "main-func.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "special.h"
+#include "stdio-util.h"
+
+static bool arg_skip = false;          /* fsck.mode=skip (or legacy fastboot): exit early unless arg_force is set */
+static bool arg_force = false;         /* fsck.mode=force (or legacy forcefsck): pass -f to fsck */
+static bool arg_show_progress = false; /* set by test_files() if /run/systemd/show-status exists */
+static const char *arg_repair = "-a";  /* repair switch handed to fsck: -a (preen), -y or -n, per fsck.repair= */
+
+static void start_target(const char *target, const char *mode) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int ret;
+
+        /* Best effort: asks the service manager to start 'target' in place of basic.target. */
+        assert(target);
+
+        ret = bus_connect_system_systemd(&bus);
+        if (ret < 0)
+                return (void) log_error_errno(ret, "Failed to get D-Bus connection: %m");
+
+        log_info("Requesting %s/start/%s", target, mode);
+
+        /* The unit is only started if it can replace basic.target */
+        ret = bus_call_method(bus, bus_systemd_mgr, "StartUnitReplace", &error, NULL, "sss", "basic.target", target, mode);
+        if (ret >= 0)
+                return;
+        /* Stay quiet if we weren't called during startup (no such job) */
+        if (!sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_JOB))
+                log_error("Failed to start unit: %s", bus_error_message(&error, ret));
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        /* Kernel command line callback: maps fsck.mode=/fsck.repair= (and, with SysV compat, the
+         * legacy "fastboot"/"forcefsck" switches) onto the arg_* settings. Always returns 0. */
+        assert(key);
+
+        if (streq(key, "fsck.mode")) {
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "auto"))
+                        arg_force = arg_skip = false;
+                else if (streq(value, "force"))
+                        arg_force = true;
+                else if (streq(value, "skip"))
+                        arg_skip = true;
+                else
+                        log_warning("Invalid fsck.mode= parameter '%s'. Ignoring.", value);
+                return 0;
+        }
+
+        if (streq(key, "fsck.repair")) {
+                int b;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "preen")) {
+                        arg_repair = "-a";
+                        return 0;
+                }
+                b = parse_boolean(value);
+                if (b < 0)
+                        log_warning("Invalid fsck.repair= parameter '%s'. Ignoring.", value);
+                else
+                        arg_repair = b > 0 ? "-y" : "-n";
+                return 0;
+        }
+
+#if HAVE_SYSV_COMPAT
+        if (streq(key, "fastboot") && !value) {
+                log_warning("Please pass 'fsck.mode=skip' rather than 'fastboot' on the kernel command line.");
+                arg_skip = true;
+        } else if (streq(key, "forcefsck") && !value) {
+                log_warning("Please pass 'fsck.mode=force' rather than 'forcefsck' on the kernel command line.");
+                arg_force = true;
+        }
+#endif
+
+        return 0;
+}
+
+static void test_files(void) {
+        /* Derives settings from marker files: the legacy /fastboot and /forcefsck flags (SysV compat
+         * only), and /run/systemd/show-status, whose presence enables progress reporting. */
+#if HAVE_SYSV_COMPAT
+        if (access("/fastboot", F_OK) == 0) {
+                arg_skip = true;
+                log_error("Please pass 'fsck.mode=skip' on the kernel command line rather than creating /fastboot on the root file system.");
+        }
+        if (access("/forcefsck", F_OK) == 0) {
+                arg_force = true;
+                log_error("Please pass 'fsck.mode=force' on the kernel command line rather than creating /forcefsck on the root file system.");
+        }
+#endif
+
+        arg_show_progress = access("/run/systemd/show-status", F_OK) == 0;
+}
+
+static double percent(int pass, unsigned long cur, unsigned long max) {
+        /* Cumulative completion percentage at the end of each pass; values stolen from e2fsck */
+        static const int pass_table[] = { 0, 70, 90, 92, 95, 100 };
+
+        /* Before the first pass nothing is done yet */
+        if (pass <= 0)
+                return 0.0;
+
+        /* Pass beyond the table, or no total to scale by: report completion */
+        if (max == 0 || (unsigned) pass >= ELEMENTSOF(pass_table))
+                return 100.0;
+
+        /* Interpolate linearly between the bounds of the current pass */
+        const double lo = (double) pass_table[pass - 1];
+        const double hi = (double) pass_table[pass];
+        return lo + (hi - lo) * (double) cur / (double) max;
+}
+
+static int process_progress(int fd, FILE* console) {
+        _cleanup_fclose_ FILE *f = NULL;
+        usec_t last = 0;
+        bool locked = false;
+        int clear = 0, r;
+
+        /* Reads "pass cur max device" records from the fsck progress pipe 'fd' and renders them as a
+         * percentage on 'console'. Takes ownership of 'fd'. Returns 0 at end of stream (or if there is no
+         * pipe, i.e. fd < 0) and a negative errno-style value on read, parse or output errors. */
+        if (fd < 0)
+                return 0;
+
+        f = fdopen(fd, "r");
+        if (!f) {
+                safe_close(fd);
+                return log_debug_errno(errno, "Failed to use pipe: %m");
+        }
+
+        for (;;) {
+                int pass;
+                unsigned long cur, max;
+                _cleanup_free_ char *device = NULL;
+                double p;
+                usec_t t;
+
+                if (fscanf(f, "%i %lu %lu %ms", &pass, &cur, &max, &device) != 4) {
+                        if (ferror(f))
+                                r = log_warning_errno(errno, "Failed to read from progress pipe: %m");
+                        else if (feof(f))
+                                r = 0;
+                        else /* malformed record: fscanf() does not set errno for this, so use a fixed code */
+                                r = log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to parse progress pipe data");
+
+                        break;
+                }
+
+                /* Only show one progress counter at max */
+                if (!locked) {
+                        if (flock(fileno(console), LOCK_EX|LOCK_NB) < 0)
+                                continue;
+                        locked = true;
+                }
+
+                /* Only update once every 50ms */
+                t = now(CLOCK_MONOTONIC);
+                if (last + 50 * USEC_PER_MSEC > t)
+                        continue;
+
+                last = t;
+
+                p = percent(pass, cur, max);
+                r = fprintf(console, "\r%s: fsck %3.1f%% complete...\r", device, p);
+                if (r < 0)
+                        return -EIO; /* No point in continuing if something happened to our output stream */
+
+                fflush(console);
+                clear = MAX(clear, r);
+        }
+        /* Blank out the longest status line we printed */
+        if (clear > 0) {
+                fputc('\r', console);
+                for (int j = 0; j < clear; j++)
+                        fputc(' ', console);
+                fputc('\r', console);
+                fflush(console);
+        }
+
+        return r;
+}
+
+static int fsck_progress_socket(void) {
+        _cleanup_close_ int sock = -EBADF;
+        int ret, level;
+
+        /* Connects to /run/systemd/fsck.progress; returns the connected fd or a negative errno. */
+        sock = socket(AF_UNIX, SOCK_STREAM, 0);
+        if (sock < 0)
+                return log_warning_errno(errno, "socket(): %m");
+
+        ret = connect_unix_path(sock, AT_FDCWD, "/run/systemd/fsck.progress");
+        if (ret >= 0)
+                return TAKE_FD(sock);
+        level = IN_SET(ret, -ECONNREFUSED, -ENOENT) ? LOG_DEBUG : LOG_WARNING;
+        return log_full_errno(level, ret, "Failed to connect to progress socket, ignoring: %m");
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_close_pair_ int progress_pipe[2] = EBADF_PAIR;
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        _cleanup_free_ char *dpath = NULL;
+        _cleanup_fclose_ FILE *console = NULL;
+        const char *device, *type;
+        bool root_directory;
+        struct stat st;
+        int r, exit_status;
+        pid_t pid;
+
+        /* Runs fsck on the block device named by argv[1], or on the root file system if none is given. */
+        log_setup();
+        if (argc > 2)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "This program expects one or no arguments.");
+
+        umask(0022);
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, PROC_CMDLINE_STRIP_RD_PREFIX);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        test_files();
+
+        if (!arg_force && arg_skip)
+                return 0;
+        /* With an argument, check that block device; without one, check the device backing "/". */
+        if (argc > 1) {
+                dpath = strdup(argv[1]);
+                if (!dpath)
+                        return log_oom();
+
+                device = dpath;
+
+                if (stat(device, &st) < 0)
+                        return log_error_errno(errno, "Failed to stat %s: %m", device);
+
+                if (!S_ISBLK(st.st_mode))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "%s is not a block device.",
+                                               device);
+
+                r = sd_device_new_from_stat_rdev(&dev, &st);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to detect device %s: %m", device);
+
+                root_directory = false;
+        } else {
+                struct timespec times[2];
+
+                /* Find root device */
+
+                if (stat("/", &st) < 0)
+                        return log_error_errno(errno, "Failed to stat() the root directory: %m");
+
+                /* Virtual root devices don't need an fsck */
+                if (major(st.st_dev) == 0) {
+                        log_debug("Root directory is virtual or btrfs, skipping check.");
+                        return 0;
+                }
+
+                /* check if we are already writable */
+                times[0] = st.st_atim;
+                times[1] = st.st_mtim;
+
+                if (utimensat(AT_FDCWD, "/", times, 0) == 0) {
+                        log_info("Root directory is writable, skipping check.");
+                        return 0;
+                }
+
+                r = sd_device_new_from_devnum(&dev, 'b', st.st_dev);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to detect root device: %m");
+
+                r = sd_device_get_devname(dev, &device);
+                if (r < 0)
+                        return log_device_error_errno(dev, r, "Failed to detect device node of root directory: %m");
+
+                root_directory = true;
+        }
+        /* Succeed without checking if no suitable fsck helper is installed. */
+        if (sd_device_get_property_value(dev, "ID_FS_TYPE", &type) >= 0) {
+                r = fsck_exists_for_fstype(type);
+                if (r < 0)
+                        log_device_warning_errno(dev, r, "Couldn't detect if fsck.%s may be used, proceeding: %m", type);
+                else if (r == 0) {
+                        log_device_info(dev, "fsck.%s doesn't exist, not checking file system.", type);
+                        return 0;
+                }
+        } else {
+                r = fsck_exists();
+                if (r < 0)
+                        log_device_warning_errno(dev, r, "Couldn't detect if the fsck command may be used, proceeding: %m");
+                else if (r == 0) {
+                        log_device_info(dev, "The fsck command does not exist, not checking file system.");
+                        return 0;
+                }
+        }
+        /* The progress pipe is only created if the console could be opened and status output is enabled. */
+        console = fopen("/dev/console", "we");
+        if (console &&
+            arg_show_progress &&
+            pipe(progress_pipe) < 0)
+                return log_error_errno(errno, "pipe(): %m");
+
+        r = safe_fork("(fsck)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                char dash_c[STRLEN("-C") + DECIMAL_STR_MAX(int) + 1];
+                int progress_socket = -1;
+                const char *cmdline[9];
+                int i = 0;
+
+                /* Child */
+
+                /* Close the reading side of the progress pipe */
+                progress_pipe[0] = safe_close(progress_pipe[0]);
+
+                /* Try to connect to a progress management daemon, if there is one */
+                progress_socket = fsck_progress_socket();
+                if (progress_socket >= 0) {
+                        /* If this worked we close the progress pipe early, and just use the socket */
+                        progress_pipe[1] = safe_close(progress_pipe[1]);
+                        xsprintf(dash_c, "-C%i", progress_socket);
+                } else if (progress_pipe[1] >= 0) {
+                        /* Otherwise if we have the progress pipe to our own local handle, we use it */
+                        xsprintf(dash_c, "-C%i", progress_pipe[1]);
+                } else
+                        dash_c[0] = 0;
+
+                cmdline[i++] = "fsck";
+                cmdline[i++] =  arg_repair;
+                cmdline[i++] = "-T";
+
+                /*
+                 * Since util-linux v2.25 fsck uses /run/fsck/.lock files.
+                 * The previous versions use flock for the device and conflict with
+                 * udevd, see https://bugs.freedesktop.org/show_bug.cgi?id=79576#c5
+                 */
+                cmdline[i++] = "-l";
+
+                if (!root_directory)
+                        cmdline[i++] = "-M";
+
+                if (arg_force)
+                        cmdline[i++] = "-f";
+
+                if (!isempty(dash_c))
+                        cmdline[i++] = dash_c;
+
+                cmdline[i++] = device;
+                cmdline[i++] = NULL;
+
+                execvp(cmdline[0], (char**) cmdline);
+                _exit(FSCK_OPERATIONAL_ERROR);
+        }
+        /* Parent: drop our copy of the write end, then relay progress (a no-op if there is no pipe). */
+        if (console) {
+                progress_pipe[1] = safe_close(progress_pipe[1]);
+                (void) process_progress(TAKE_FD(progress_pipe[0]), console);
+        }
+
+        exit_status = wait_for_terminate_and_check("fsck", pid, WAIT_LOG_ABNORMAL);
+        if (exit_status < 0)
+                return exit_status;
+        if ((exit_status & ~FSCK_ERROR_CORRECTED) != FSCK_SUCCESS) {
+                log_error("fsck failed with exit status %i.", exit_status);
+
+                if ((exit_status & FSCK_SYSTEM_SHOULD_REBOOT) && root_directory) {
+                        /* System should be rebooted. */
+                        start_target(SPECIAL_REBOOT_TARGET, "replace-irreversibly");
+                        return -EINVAL;
+                } else if (!(exit_status & (FSCK_SYSTEM_SHOULD_REBOOT | FSCK_ERRORS_LEFT_UNCORRECTED)))
+                        log_warning("Ignoring error.");
+        }
+        /* fsck corrected errors: create the quotacheck flag file (presumably read by quotacheck -- confirm). */
+        if (exit_status & FSCK_ERROR_CORRECTED)
+                (void) touch("/run/systemd/quotacheck");
+        /* Returns 1 if errors remain uncorrected or a reboot is required, 0 otherwise. */
+        return !!(exit_status & (FSCK_SYSTEM_SHOULD_REBOOT | FSCK_ERRORS_LEFT_UNCORRECTED));
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/fsck/meson.build b/src/fsck/meson.build
new file mode 100644
index 0000000..e85e8c0
--- /dev/null
+++ b/src/fsck/meson.build
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        libexec_template + { # template defined outside this file; presumably selects the libexec install dir -- confirm
+                'name' : 'systemd-fsck',
+                'sources' : files('fsck.c'),
+        },
+]
diff --git a/src/fstab-generator/fstab-generator.c b/src/fstab-generator/fstab-generator.c
new file mode 100644
index 0000000..016f3ba
--- /dev/null
+++ b/src/fstab-generator/fstab-generator.c
@@ -0,0 +1,1646 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-unit-util.h"
+#include "chase.h"
+#include "creds-util.h"
+#include "efi-loader.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fstab-util.h"
+#include "generator.h"
+#include "in-addr-util.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "mkdir.h"
+#include "mount-setup.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "special.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "virt.h"
+#include "volatile-util.h"
+
+typedef enum MountPointFlags {
+        MOUNT_NOAUTO    = 1 << 0, /* add_swap(): do not hook the unit into swap.target */
+        MOUNT_NOFAIL    = 1 << 1, /* add_swap(): hook in via "wants" instead of "requires" */
+        MOUNT_AUTOMOUNT = 1 << 2, /* use not visible in this part of the file -- TODO confirm meaning */
+        MOUNT_MAKEFS    = 1 << 3, /* add_swap(): calls generator_hook_up_mkswap() */
+        MOUNT_GROWFS    = 1 << 4, /* add_swap(): only warns, growing swap is unsupported */
+        MOUNT_RW_ONLY   = 1 << 5, /* use not visible in this part of the file -- TODO confirm meaning */
+        MOUNT_PCRFS     = 1 << 6, /* add_swap(): only warns, measuring swap is unsupported */
+} MountPointFlags;
+
+typedef struct Mount {
+        bool for_initrd; /* copied from the for_initrd argument of mount_array_add*() */
+        char *what;      /* owned; never NULL once added (see mount_array_add_internal()) */
+        char *where;     /* owned; NULL for entries whose fstype is "swap" */
+        char *fstype;    /* owned; "auto" if none was given */
+        char *options;   /* owned; built with strv_make_nulstr(), "defaults" if none was given */
+} Mount;
+
+static void mount_array_free(Mount *mounts, size_t n); /* forward declaration for the destructor registration below */
+
+static bool arg_sysroot_check = false; /* if set, add_swap() only reports that a reload is wanted and writes nothing */
+static const char *arg_dest = NULL; /* destination passed to the generator_*() helpers */
+static const char *arg_dest_late = NULL;
+static bool arg_fstab_enabled = true;
+static bool arg_swap_enabled = true;
+static char *arg_root_what = NULL;
+static char *arg_root_fstype = NULL;
+static char *arg_root_options = NULL;
+static char *arg_root_hash = NULL;
+static int arg_root_rw = -1;
+static char *arg_usr_what = NULL;
+static char *arg_usr_fstype = NULL;
+static char *arg_usr_options = NULL;
+static char *arg_usr_hash = NULL;
+static VolatileMode arg_volatile_mode = _VOLATILE_MODE_INVALID;
+static bool arg_verity = true;
+static Mount *arg_mounts = NULL; /* entries appended by mount_array_add_internal() */
+static size_t arg_n_mounts = 0; /* number of valid entries in arg_mounts */
+
+STATIC_DESTRUCTOR_REGISTER(arg_root_what, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_fstype, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_options, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_usr_what, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_usr_fstype, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_usr_options, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_usr_hash, freep);
+STATIC_ARRAY_DESTRUCTOR_REGISTER(arg_mounts, arg_n_mounts, mount_array_free);
+
+static void mount_done(Mount *m) {
+        /* Frees the strings owned by *m; the Mount itself is not freed. */
+        assert(m);
+        free(m->options);
+        free(m->fstype);
+        free(m->where);
+        free(m->what);
+}
+
+static void mount_array_free(Mount *mounts, size_t n) {
+        /* Releases the first n entries, then the array itself; a NULL array is a no-op. */
+        for (size_t i = 0; mounts && i < n; i++)
+                mount_done(mounts + i);
+        free(mounts);
+}
+
+static int mount_array_add_internal(
+                bool for_initrd,
+                char *in_what,
+                char *in_where,
+                const char *in_fstype,
+                const char *in_options) {
+
+        _cleanup_free_ char *what = NULL, *where = NULL, *fstype = NULL, *options = NULL;
+        int ret;
+
+        /* Appends one entry to arg_mounts. Ownership of in_what and in_where passes to this function
+         * right away, i.e. both are freed here if anything fails. Returns 0 or a negative errno. */
+        what = ASSERT_PTR(in_what);
+        where = in_where;
+
+        fstype = strdup(isempty(in_fstype) ? "auto" : in_fstype);
+        if (fstype == NULL)
+                return -ENOMEM;
+
+        /* Swap entries carry no mount point */
+        if (streq(fstype, "swap"))
+                where = mfree(where);
+
+        if (isempty(in_options))
+                ret = strv_make_nulstr(STRV_MAKE("defaults"), &options, NULL);
+        else {
+                _cleanup_strv_free_ char **split = NULL;
+
+                ret = strv_split_full(&split, in_options, ",", 0);
+                if (ret >= 0)
+                        ret = strv_make_nulstr(split, &options, NULL);
+        }
+        if (ret < 0)
+                return ret;
+
+        if (!GREEDY_REALLOC(arg_mounts, arg_n_mounts + 1))
+                return -ENOMEM;
+
+        arg_mounts[arg_n_mounts++] = (Mount) {
+                .for_initrd = for_initrd,
+                .what = TAKE_PTR(what),
+                .where = TAKE_PTR(where),
+                .fstype = TAKE_PTR(fstype),
+                .options = TAKE_PTR(options),
+        };
+
+        return 0;
+}
+
+static int mount_array_add(bool for_initrd, const char *str) {
+        _cleanup_free_ char *what = NULL, *where = NULL, *fstype = NULL, *options = NULL;
+        int n_fields;
+
+        /* Parses "what:where[:fstype[:options]]" and records it; anything beyond that is rejected. */
+        assert(str);
+
+        n_fields = extract_many_words(&str, ":", EXTRACT_CUNESCAPE | EXTRACT_DONT_COALESCE_SEPARATORS,
+                                      &what, &where, &fstype, &options, NULL);
+        if (n_fields < 0)
+                return n_fields;
+        /* what and where are mandatory, and nothing may follow the options field */
+        if (n_fields < 2 || !isempty(str))
+                return -EINVAL;
+
+        return mount_array_add_internal(for_initrd, TAKE_PTR(what), TAKE_PTR(where), fstype, options);
+}
+
+static int mount_array_add_swap(bool for_initrd, const char *str) {
+        _cleanup_free_ char *what = NULL, *options = NULL;
+        int n_fields;
+
+        /* Parses "what[:options]" and records it as a swap entry; anything beyond that is rejected. */
+        assert(str);
+
+        n_fields = extract_many_words(&str, ":", EXTRACT_CUNESCAPE | EXTRACT_DONT_COALESCE_SEPARATORS,
+                                      &what, &options, NULL);
+        if (n_fields < 0)
+                return n_fields;
+        /* what is mandatory, and nothing may follow the options field */
+        if (n_fields < 1 || !isempty(str))
+                return -EINVAL;
+
+        return mount_array_add_internal(for_initrd, TAKE_PTR(what), NULL, "swap", options);
+}
+
+static int write_options(FILE *f, const char *options) {
+        _cleanup_free_ char *escaped = NULL;
+
+        /* Emits an Options= line, passing the value through specifier_escape() first. Returns 1 if a
+         * line was written, 0 if the options are empty or just "defaults", log_oom()'s result on OOM. */
+
+        if (isempty(options) || streq(options, "defaults"))
+                return 0;
+
+        escaped = specifier_escape(options);
+        if (!escaped)
+                return log_oom();
+
+        fprintf(f, "Options=%s\n", escaped);
+        return 1;
+}
+
+static int write_what(FILE *f, const char *what) {
+        /* Emits a What= line, passing the value through specifier_escape() first. Returns 1 on success. */
+        _cleanup_free_ char *escaped = specifier_escape(what);
+
+        if (escaped == NULL)
+                return log_oom();
+
+        fprintf(f, "What=%s\n", escaped);
+        return 1;
+}
+
+static int add_swap(
+                const char *source,
+                const char *what,
+                const char *options,
+                MountPointFlags flags) {
+
+        _cleanup_free_ char *unit = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int ret;
+
+        /* Generates a .swap unit for 'what' under arg_dest. Returns 0 if the entry is skipped, 1 if it was
+         * handled (also in arg_sysroot_check mode, where nothing is written), negative errno on failure. */
+        assert(what);
+
+        if (access("/proc/swaps", F_OK) < 0) {
+                log_info("Swap not supported, ignoring swap entry for %s.", what);
+                return 0;
+        }
+
+        if (detect_container() > 0) {
+                log_info("Running in a container, ignoring swap entry for %s.", what);
+                return 0;
+        }
+
+        if (arg_sysroot_check) {
+                log_info("%s should be enabled in the initrd, will request daemon-reload.", what);
+                return 1;
+        }
+
+        log_debug("Found swap entry what=%s makefs=%s growfs=%s pcrfs=%s noauto=%s nofail=%s",
+                  what,
+                  yes_no(flags & MOUNT_MAKEFS), yes_no(flags & MOUNT_GROWFS), yes_no(flags & MOUNT_PCRFS),
+                  yes_no(flags & MOUNT_NOAUTO), yes_no(flags & MOUNT_NOFAIL));
+
+        ret = unit_name_from_path(what, ".swap", &unit);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to generate unit name: %m");
+
+        ret = generator_open_unit_file(arg_dest, source, unit, &f);
+        if (ret < 0)
+                return ret;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Documentation=man:fstab(5) man:systemd-fstab-generator(8)\n"
+                "SourcePath=%s\n",
+                source);
+
+        ret = generator_write_blockdev_dependency(f, what);
+        if (ret < 0)
+                return ret;
+
+        fputs("\n[Swap]\n", f);
+
+        ret = write_what(f, what);
+        if (ret < 0)
+                return ret;
+
+        ret = write_options(f, options);
+        if (ret < 0)
+                return ret;
+
+        ret = fflush_and_check(f);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to write unit file %s: %m", unit);
+
+        /* use what as where, to have a nicer error message */
+        ret = generator_write_timeouts(arg_dest, what, what, options, NULL);
+        if (ret < 0)
+                return ret;
+
+        if (flags & MOUNT_MAKEFS) {
+                ret = generator_hook_up_mkswap(arg_dest, what);
+                if (ret < 0)
+                        return ret;
+        }
+
+        /* TODO: swap devices must be wiped and recreated to grow them */
+        if (flags & MOUNT_GROWFS)
+                log_warning("%s: growing swap devices is currently unsupported.", what);
+        if (flags & MOUNT_PCRFS)
+                log_warning("%s: measuring swap devices is currently unsupported.", what);
+
+        if (!(flags & MOUNT_NOAUTO)) {
+                const char *dep = (flags & MOUNT_NOFAIL) ? "wants" : "requires";
+                ret = generator_add_symlink(arg_dest, SPECIAL_SWAP_TARGET, dep, unit);
+                if (ret < 0)
+                        return ret;
+        }
+
+        return 1;
+}
+
+static bool mount_is_network(const char *fstype, const char *options) {
+        /* True if the options carry "_netdev", or if fstype is set and fstype_is_network() accepts it. */
+        return fstab_test_option(options, "_netdev\0") ? true : (fstype && fstype_is_network(fstype));
+}
+
+static bool mount_in_initrd(const char *where, const char *options, bool accept_root) {
+        return fstab_test_option(options, "x-initrd.mount\0") ? true : /* explicitly tagged */
+                (where && PATH_IN_SET(where, "/usr", accept_root ? "/" : NULL)); /* or /usr, and / if accept_root */
+}
+
+static int write_timeout(
+                FILE *f,
+                const char *where,
+                const char *opts,
+                const char *filter,
+                const char *variable) {
+
+        _cleanup_free_ char *value = NULL;
+        usec_t usec;
+        int ret;
+
+        /* Looks up the option named by 'filter' in 'opts' and, if its value parses as a time span,
+         * writes it to 'f' as "<variable>=<timespan>". Unparsable values are logged and skipped. */
+
+        ret = fstab_filter_options(opts, filter, NULL, &value, NULL, NULL);
+        if (ret < 0)
+                return log_warning_errno(ret, "Failed to parse options: %m");
+        if (ret == 0)
+                return 0;
+
+        if (parse_sec_fix_0(value, &usec) < 0)
+                log_warning("Failed to parse timeout for %s, ignoring: %s", where, value);
+        else
+                fprintf(f, "%s=%s\n", variable, FORMAT_TIMESPAN(usec, 0));
+
+        return 0;
+}
+
+static int write_idle_timeout(FILE *f, const char *where, const char *opts) {
+        /* x-systemd.idle-timeout= is written out as TimeoutIdleSec= */
+        return write_timeout(f, where, opts, "x-systemd.idle-timeout\0", "TimeoutIdleSec");
+}
+
+static int write_mount_timeout(FILE *f, const char *where, const char *opts) {
+        /* x-systemd.mount-timeout= is written out as TimeoutSec= */
+        return write_timeout(f, where, opts, "x-systemd.mount-timeout\0", "TimeoutSec");
+}
+
+static int write_dependency(
+                FILE *f,
+                const char *opts,
+                const char *filter,
+                const char *format) {
+
+        _cleanup_strv_free_ char **names = NULL, **units = NULL;
+        _cleanup_free_ char *joined = NULL;
+        int ret;
+
+        /* Collects the values of the option named by 'filter', runs each through
+         * unit_name_mangle_with_suffix() (suffix ".mount") and prints the space-joined list via 'format'. */
+        assert(f);
+        assert(opts);
+
+        ret = fstab_filter_options(opts, filter, NULL, NULL, &names, NULL);
+        if (ret < 0)
+                return log_warning_errno(ret, "Failed to parse options: %m");
+        if (ret == 0)
+                return 0;
+
+        STRV_FOREACH(name, names) {
+                char *unit;
+
+                ret = unit_name_mangle_with_suffix(*name, "as dependency", 0, ".mount", &unit);
+                if (ret < 0)
+                        return log_error_errno(ret, "Failed to generate unit name: %m");
+                if (strv_consume(&units, unit) < 0)
+                        return log_oom();
+        }
+
+        if (!units)
+                return 0;
+
+        joined = strv_join(units, " ");
+        if (!joined)
+                return log_oom();
+        DISABLE_WARNING_FORMAT_NONLITERAL;
+        fprintf(f, format, joined);
+        REENABLE_WARNING;
+
+        return 0;
+}
+
+/* Translates "x-systemd.after=" mount options into an After= line. */
+static int write_after(FILE *f, const char *opts) {
+        static const char filter[] = "x-systemd.after\0";
+
+        return write_dependency(f, opts, filter, "After=%1$s\n");
+}
+
+/* Translates "x-systemd.requires=" mount options into a pair of After= and Requires= lines. */
+static int write_requires_after(FILE *f, const char *opts) {
+        static const char filter[] = "x-systemd.requires\0";
+
+        return write_dependency(f, opts, filter, "After=%1$s\nRequires=%1$s\n");
+}
+
+/* Translates "x-systemd.before=" mount options into a Before= line. */
+static int write_before(FILE *f, const char *opts) {
+        static const char filter[] = "x-systemd.before\0";
+
+        return write_dependency(f, opts, filter, "Before=%1$s\n");
+}
+
+/* Translates "x-systemd.requires-mounts-for=" mount options into a RequiresMountsFor= line. The paths are
+ * specifier-escaped before being written. Returns 0 on success or if the option is not set, negative on
+ * failure. */
+static int write_requires_mounts_for(FILE *f, const char *opts) {
+        _cleanup_strv_free_ char **raw = NULL, **escaped = NULL;
+        _cleanup_free_ char *joined = NULL;
+        int r;
+
+        assert(f);
+        assert(opts);
+
+        r = fstab_filter_options(opts, "x-systemd.requires-mounts-for\0", NULL, NULL, &raw, NULL);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse options: %m");
+        if (r == 0)
+                return 0;
+
+        r = specifier_escape_strv(raw, &escaped);
+        if (r < 0)
+                return log_error_errno(r, "Failed to escape paths: %m");
+
+        joined = strv_join(escaped, " ");
+        if (!joined)
+                return log_oom();
+
+        fprintf(f, "RequiresMountsFor=%s\n", joined);
+        return 0;
+}
+
+/* Writes all dependency lines derived from x-systemd.* mount options (after, requires, before,
+ * requires-mounts-for) to 'f'. Does nothing if 'opts' is NULL. Stops at the first helper that fails and
+ * returns its error, otherwise returns 0. */
+static int write_extra_dependencies(FILE *f, const char *opts) {
+        int r;
+
+        assert(f);
+
+        if (!opts)
+                return 0;
+
+        r = write_after(f, opts);
+        if (r < 0)
+                return r;
+
+        r = write_requires_after(f, opts);
+        if (r < 0)
+                return r;
+
+        r = write_before(f, opts);
+        if (r < 0)
+                return r;
+
+        r = write_requires_mounts_for(f, opts);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* For mounts that must not be optional: clears MOUNT_NOAUTO, MOUNT_NOFAIL and MOUNT_AUTOMOUNT from *flags
+ * and returns in *ret_options a newly allocated copy of 'options' with the matching "noauto", "nofail"
+ * and "x-systemd.automount" entries filtered out.
+ *
+ * Returns 0 if none of the three flags was set (in which case *ret_options is a plain copy of 'options'),
+ * 1 if something was dropped, negative on failure. */
+static int mandatory_mount_drop_unapplicable_options(
+                MountPointFlags *flags,
+                const char *where,
+                const char *options,
+                char **ret_options) {
+
+        int r;
+
+        assert(flags);
+        assert(where);
+        assert(options);
+        assert(ret_options);
+
+        if ((*flags & (MOUNT_NOAUTO|MOUNT_NOFAIL|MOUNT_AUTOMOUNT)) == 0) {
+                /* Nothing to drop, hand back an unmodified copy. */
+                char *copy;
+
+                copy = strdup(options);
+                if (!copy)
+                        return -ENOMEM;
+
+                *ret_options = copy;
+                return 0;
+        }
+
+        log_debug("Mount '%s' is mandatory, ignoring 'noauto', 'nofail', and 'x-systemd.automount' options.",
+                  where);
+
+        *flags &= ~(MOUNT_NOAUTO|MOUNT_NOFAIL|MOUNT_AUTOMOUNT);
+
+        r = fstab_filter_options(options, "noauto\0nofail\0x-systemd.automount\0", NULL, NULL, NULL, ret_options);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Generates a .mount unit (and, if MOUNT_AUTOMOUNT is set, a matching .automount unit) below 'dest' and
+ * hooks it into 'target_unit' or the units listed via x-systemd.wanted-by=/x-systemd.required-by=.
+ *
+ * source:         written as SourcePath= of the generated units
+ * dest:           directory the unit files and symlinks are created in
+ * what/where:     mount source and mount point
+ * original_where: if non-NULL, recorded in a comment as the path 'where' was canonicalized from
+ * fstype:         file system type, may be NULL/empty/"auto" in which case no Type= is written
+ * opts:           mount options, including x-systemd.* ones
+ * passno:         if non-zero, fsck dependencies are written
+ * flags:          MountPointFlags describing makefs/growfs/pcrfs/noauto/nofail/automount/rw-only
+ *
+ * Returns 0 if the entry is skipped (autofs, invalid path, API or ignored mount point), > 0 if a unit was
+ * generated (or would be, when arg_sysroot_check is set), negative on failure. */
+static int add_mount(
+                const char *source,
+                const char *dest,
+                const char *what,
+                const char *where,
+                const char *original_where,
+                const char *fstype,
+                const char *opts,
+                int passno,
+                MountPointFlags flags,
+                const char *target_unit) {
+
+        _cleanup_free_ char *name = NULL, *automount_name = NULL, *filtered = NULL, *where_escaped = NULL,
+                *opts_root_filtered = NULL;
+        _cleanup_strv_free_ char **wanted_by = NULL, **required_by = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(what);
+        assert(where);
+        assert(opts);
+        assert(target_unit);
+        assert(source);
+
+        if (streq_ptr(fstype, "autofs"))
+                return 0;
+
+        if (!is_path(where)) {
+                log_warning("Mount point %s is not a valid path, ignoring.", where);
+                return 0;
+        }
+
+        if (mount_point_is_api(where) ||
+            mount_point_ignore(where))
+                return 0;
+
+        /* In check mode we only report that something would be generated, without writing anything. */
+        if (arg_sysroot_check) {
+                log_info("%s should be mounted in the initrd, will request daemon-reload.", where);
+                return true;
+        }
+
+        r = fstab_filter_options(opts, "x-systemd.wanted-by\0", NULL, NULL, &wanted_by, NULL);
+        if (r < 0)
+                return r;
+
+        r = fstab_filter_options(opts, "x-systemd.required-by\0", NULL, NULL, &required_by, NULL);
+        if (r < 0)
+                return r;
+
+        if (path_equal(where, "/")) {
+                r = mandatory_mount_drop_unapplicable_options(&flags, where, opts, &opts_root_filtered);
+                if (r < 0)
+                        return r;
+                opts = opts_root_filtered;
+
+                if (!strv_isempty(wanted_by))
+                        log_debug("Ignoring 'x-systemd.wanted-by=' option for root device.");
+                if (!strv_isempty(required_by))
+                        log_debug("Ignoring 'x-systemd.required-by=' option for root device.");
+
+                required_by = strv_free(required_by);
+                wanted_by = strv_free(wanted_by);
+        }
+
+        r = unit_name_from_path(where, ".mount", &name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = generator_open_unit_file(dest, source, name, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Documentation=man:fstab(5) man:systemd-fstab-generator(8)\n"
+                "SourcePath=%s\n",
+                source);
+
+        if (STRPTR_IN_SET(fstype, "nfs", "nfs4") && !(flags & MOUNT_AUTOMOUNT) &&
+            fstab_test_yes_no_option(opts, "bg\0" "fg\0")) {
+                /* The default retry timeout that mount.nfs uses for 'bg' mounts
+                 * is 10000 minutes, where as it uses 2 minutes for 'fg' mounts.
+                 * As we are making  'bg' mounts look like an 'fg' mount to
+                 * mount.nfs (so systemd can manage the job-control aspects of 'bg'),
+                 * we need to explicitly preserve that default, and also ensure
+                 * the systemd mount-timeout doesn't interfere.
+                 * By placing these options first, they can be overridden by
+                 * settings in /etc/fstab. */
+                opts = strjoina("x-systemd.mount-timeout=infinity,retry=10000,nofail,", opts, ",fg");
+                SET_FLAG(flags, MOUNT_NOFAIL, true);
+        }
+
+        r = write_extra_dependencies(f, opts);
+        if (r < 0)
+                return r;
+
+        /* Order the mount unit we generate relative to target_unit, so that DefaultDependencies= on the
+         * target unit won't affect us. */
+        if (!FLAGS_SET(flags, MOUNT_NOFAIL))
+                fprintf(f, "Before=%s\n", target_unit);
+
+        if (passno != 0) {
+                r = generator_write_fsck_deps(f, dest, what, where, fstype);
+                if (r < 0)
+                        return r;
+        }
+
+        r = generator_write_blockdev_dependency(f, what);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "\n"
+                "[Mount]\n");
+
+        r = write_what(f, what);
+        if (r < 0)
+                return r;
+
+        if (original_where)
+                fprintf(f, "# Canonicalized from %s\n", original_where);
+
+        where_escaped = specifier_escape(where);
+        if (!where_escaped)
+                return log_oom();
+        fprintf(f, "Where=%s\n", where_escaped);
+
+        if (!isempty(fstype) && !streq(fstype, "auto")) {
+                _cleanup_free_ char *t = NULL;
+
+                t = specifier_escape(fstype);
+                if (!t)
+                        return log_oom(); /* Log like the other allocation failures in this function do. */
+
+                fprintf(f, "Type=%s\n", t);
+        }
+
+        r = generator_write_timeouts(dest, what, where, opts, &filtered);
+        if (r < 0)
+                return r;
+
+        r = generator_write_device_deps(dest, what, where, opts);
+        if (r < 0)
+                return r;
+
+        if (in_initrd() && path_equal(where, "/sysroot") && is_device_path(what)) {
+                r = generator_write_initrd_root_device_deps(dest, what);
+                if (r < 0)
+                        return r;
+        }
+
+        r = write_mount_timeout(f, where, opts);
+        if (r < 0)
+                return r;
+
+        /* Note that the option string returned by generator_write_timeouts() is what gets written here,
+         * not the original 'opts'. */
+        r = write_options(f, filtered);
+        if (r < 0)
+                return r;
+
+        if (flags & MOUNT_RW_ONLY)
+                fprintf(f, "ReadWriteOnly=yes\n");
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit file %s: %m", name);
+
+        if (flags & MOUNT_MAKEFS) {
+                r = generator_hook_up_mkfs(dest, what, where, fstype);
+                if (r < 0)
+                        return r;
+        }
+
+        if (flags & MOUNT_GROWFS) {
+                r = generator_hook_up_growfs(dest, where, target_unit);
+                if (r < 0)
+                        return r;
+        }
+
+        if (flags & MOUNT_PCRFS) {
+                /* A negative result is not treated as fatal here; the PCR hook-up is simply skipped. */
+                r = efi_measured_uki(LOG_WARNING);
+                if (r == 0)
+                        log_debug("Kernel stub did not measure kernel image into PCR, skipping userspace measurement, too.");
+                else if (r > 0) {
+                        r = generator_hook_up_pcrfs(dest, where, target_unit);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (!FLAGS_SET(flags, MOUNT_AUTOMOUNT)) {
+                /* Without explicit wanted-by/required-by units (and unless noauto is set) the mount is
+                 * pulled in by target_unit; otherwise only by the explicitly listed units, if any. */
+                if (!FLAGS_SET(flags, MOUNT_NOAUTO) && strv_isempty(wanted_by) && strv_isempty(required_by)) {
+                        r = generator_add_symlink(dest, target_unit,
+                                                  (flags & MOUNT_NOFAIL) ? "wants" : "requires", name);
+                        if (r < 0)
+                                return r;
+                } else {
+                        STRV_FOREACH(s, wanted_by) {
+                                r = generator_add_symlink(dest, *s, "wants", name);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        STRV_FOREACH(s, required_by) {
+                                r = generator_add_symlink(dest, *s, "requires", name);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        } else {
+                r = unit_name_from_path(where, ".automount", &automount_name);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate unit name: %m");
+
+                /* Done with the .mount unit file, reuse 'f' for the .automount unit. */
+                f = safe_fclose(f);
+
+                r = generator_open_unit_file(dest, source, automount_name, &f);
+                if (r < 0)
+                        return r;
+
+                fprintf(f,
+                        "[Unit]\n"
+                        "SourcePath=%s\n"
+                        "Documentation=man:fstab(5) man:systemd-fstab-generator(8)\n",
+                        source);
+
+                fprintf(f,
+                        "\n"
+                        "[Automount]\n"
+                        "Where=%s\n",
+                        where_escaped);
+
+                r = write_idle_timeout(f, where, opts);
+                if (r < 0)
+                        return r;
+
+                r = fflush_and_check(f);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to write unit file %s: %m", automount_name);
+
+                /* With automount it is the .automount unit that gets hooked into the target. */
+                r = generator_add_symlink(dest, target_unit,
+                                          (flags & MOUNT_NOFAIL) ? "wants" : "requires", automount_name);
+                if (r < 0)
+                        return r;
+        }
+
+        return true;
+}
+
+/* Asks the system service manager to reload its configuration and then (re)queues start jobs for
+ * SPECIAL_INITRD_FS_TARGET and SPECIAL_SWAP_TARGET. A failure to start one target does not prevent the
+ * attempt on the other one. Returns 0 on success, otherwise a negative error gathered (via RET_GATHER())
+ * from the failed calls. */
+static int do_daemon_reload(void) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r, k;
+
+        log_debug("Calling org.freedesktop.systemd1.Manager.Reload()...");
+
+        r = bus_connect_system_systemd(&bus);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get D-Bus connection: %m");
+
+        r = bus_service_manager_reload(bus);
+        if (r < 0)
+                return r;
+
+        /* We need to requeue the two targets so that any new units which previously were not part of the
+         * targets, and which we now added, will be started. */
+
+        r = 0;
+        FOREACH_STRING(unit, SPECIAL_INITRD_FS_TARGET, SPECIAL_SWAP_TARGET) {
+                /* Use a fresh error object per call, so that a failure for the first target does not leave
+                 * an already-set error behind for the second call. */
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                log_info("Requesting %s/start/replace...", unit);
+
+                k = bus_call_method(bus, bus_systemd_mgr, "StartUnit", &error, NULL, "ss", unit, "replace");
+                if (k < 0) {
+                        /* Pass the result of this very call (k), not the accumulated one (r), as fallback
+                         * for the error message. */
+                        log_error_errno(k, "Failed to (re)start %s: %s", unit, bus_error_message(&error, k));
+                        RET_GATHER(r, k);
+                }
+        }
+
+        return r;
+}
+
+/* Returns the path of the fstab of the host file system as seen from the initrd. Can be overridden with
+ * $SYSTEMD_SYSROOT_FSTAB. */
+static const char* sysroot_fstab_path(void) {
+        const char *e;
+
+        e = getenv("SYSTEMD_SYSROOT_FSTAB");
+        if (e)
+                return e;
+
+        return "/sysroot/etc/fstab";
+}
+
+/* Returns the boolean value of $SYSTEMD_SYSFS_CHECK, evaluated once and cached afterwards. Anything but
+ * an explicit "false" value (i.e. also an unset or unparsable variable) results in true. */
+static bool sysfs_check(void) {
+        static int cached = -1;
+        int r;
+
+        if (cached >= 0)
+                return cached;
+
+        r = getenv_bool_secure("SYSTEMD_SYSFS_CHECK");
+        if (r < 0 && r != -ENXIO)
+                log_debug_errno(r, "Failed to parse $SYSTEMD_SYSFS_CHECK, ignoring: %m");
+
+        cached = r != 0;
+        return cached;
+}
+
+/* Generates the unit that bind mounts /sysusr/usr to /sysroot/usr, hooked into SPECIAL_INITRD_FS_TARGET.
+ * 'source' is what gets recorded as origin of the generated unit. */
+static int add_sysusr_sysroot_usr_bind_mount(const char *source) {
+        return add_mount(
+                        source,
+                        arg_dest,
+                        /* what = */ "/sysusr/usr",
+                        /* where = */ "/sysroot/usr",
+                        /* original_where = */ NULL,
+                        /* fstype = */ NULL,
+                        /* opts = */ "bind",
+                        /* passno = */ 0,
+                        /* flags = */ 0,
+                        SPECIAL_INITRD_FS_TARGET);
+}
+
+/* Maps the mount options we care about to MountPointFlags. MOUNT_RW_ONLY and MOUNT_AUTOMOUNT are never
+ * set for swap entries. */
+static MountPointFlags fstab_options_to_flags(const char *options, bool is_swap) {
+        MountPointFlags flags = 0;
+
+        SET_FLAG(flags, MOUNT_MAKEFS, fstab_test_option(options, "x-systemd.makefs\0"));
+        SET_FLAG(flags, MOUNT_GROWFS, fstab_test_option(options, "x-systemd.growfs\0"));
+        SET_FLAG(flags, MOUNT_PCRFS, fstab_test_option(options, "x-systemd.pcrfs\0"));
+        SET_FLAG(flags, MOUNT_NOAUTO, fstab_test_yes_no_option(options, "noauto\0" "auto\0"));
+        SET_FLAG(flags, MOUNT_NOFAIL, fstab_test_yes_no_option(options, "nofail\0" "fail\0"));
+
+        if (is_swap)
+                return flags;
+
+        SET_FLAG(flags, MOUNT_RW_ONLY, fstab_test_option(options, "x-systemd.rw-only\0"));
+        SET_FLAG(flags, MOUNT_AUTOMOUNT,
+                 fstab_test_option(options,
+                                   "comment=systemd.automount\0"
+                                   "x-systemd.automount\0"));
+
+        return flags;
+}
+
+/* Resolves 'path' with chase() (below /sysroot if prefix_sysroot is set) and returns the result in *ret.
+ * If chase() fails, the path is used as-is, merely prefixed with /sysroot if requested, and simplified.
+ * 'type' must be "where" or "what" and is only used for logging.
+ *
+ * Returns > 0 if the resulting path differs from 'path', 0 if it is the same, negative on OOM. */
+static int canonicalize_mount_path(const char *path, const char *type, bool prefix_sysroot, char **ret) {
+        _cleanup_free_ char *resolved = NULL;
+        int r;
+
+        assert(path);
+        assert(type);
+        assert(STR_IN_SET(type, "where", "what"));
+        assert(ret);
+
+        // FIXME: when chase() learns to chase non-existent paths, use this here and drop the prefixing with
+        // /sysroot on error below.
+        r = chase(path, prefix_sysroot ? "/sysroot" : NULL, CHASE_PREFIX_ROOT | CHASE_NONEXISTENT, &resolved, NULL);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to chase '%s', using as-is: %m", path);
+
+                resolved = prefix_sysroot ? path_join("/sysroot", path) : strdup(path);
+                if (!resolved)
+                        return log_oom();
+
+                path_simplify(resolved);
+        }
+
+        if (streq(path, resolved)) {
+                *ret = TAKE_PTR(resolved);
+                return false;
+        }
+
+        log_debug("Canonicalized %s=%s to %s", type, path, resolved);
+
+        *ret = TAKE_PTR(resolved);
+        return true;
+}
+
+/* Processes a single fstab-style entry: swap entries are passed on to add_swap(), everything else is
+ * canonicalized and passed on to add_mount().
+ *
+ * source:           where the entry came from, handed through to the generated units
+ * what_original:    the device/source field as specified
+ * where_original:   the mount point field as specified; may be NULL for swap entries only
+ * passno:           fsck pass number; if negative it is derived from whether 'what' is a device path
+ * prefix_sysroot:   if true, only entries mount_in_initrd() accepts are handled, and paths are resolved
+ *                   below /sysroot
+ * use_swap_enabled: if true, swap entries are skipped when arg_swap_enabled is off
+ *
+ * Returns 0 if the entry was skipped, negative on failure, and for swap entries whatever add_swap()
+ * returns. For mounts it returns add_mount()'s result if that is <= 0, otherwise true. */
+static int parse_fstab_one(
+                const char *source,
+                const char *what_original,
+                const char *where_original,
+                const char *fstype,
+                const char *options,
+                int passno,
+                bool prefix_sysroot,
+                bool accept_root, /* This takes an effect only when prefix_sysroot is true. */
+                bool use_swap_enabled) {
+
+        _cleanup_free_ char *what = NULL, *where = NULL, *opts = NULL;
+        MountPointFlags flags;
+        bool is_swap, where_changed;
+        int r;
+
+        assert(what_original);
+        assert(fstype);
+        assert(options);
+
+        /* When operating on the host's fstab from the initrd, only pick up what belongs into the initrd. */
+        if (prefix_sysroot && !mount_in_initrd(where_original, options, accept_root))
+                return 0;
+
+        is_swap = streq_ptr(fstype, "swap");
+        if (is_swap && use_swap_enabled && !arg_swap_enabled) {
+                log_info("Swap unit generation disabled on kernel command line, ignoring swap entry for %s.", what_original);
+                return 0;
+        }
+
+        what = fstab_node_to_udev_node(what_original);
+        if (!what)
+                return log_oom();
+
+        /* With a read-only /sys, skip sysfs mounts and (unless disabled via sysfs_check()) anything backed
+         * by a device node. */
+        if (path_is_read_only_fs("/sys") > 0 &&
+            (streq(what, "sysfs") ||
+             (sysfs_check() && is_device_path(what)))) {
+                log_info("/sys/ is read-only (running in a container?), ignoring mount for %s.", what);
+                return 0;
+        }
+
+        flags = fstab_options_to_flags(options, is_swap);
+
+        if (is_swap)
+                return add_swap(source, what, options, flags);
+
+        /* A negative passno means "not specified": request fsck handling for device paths only. */
+        if (passno < 0)
+                passno = is_device_path(what);
+
+        assert(where_original); /* 'where' is not necessary for swap entry. */
+
+        if (!is_path(where_original)) {
+                log_warning("Mount point %s is not a valid path, ignoring.", where_original);
+                return 0;
+        }
+
+        /* Follow symlinks here; see 5261ba901845c084de5a8fd06500ed09bfb0bd80 which makes sense for
+         * mount units, but causes problems since it historically worked to have symlinks in e.g.
+         * /etc/fstab. So we canonicalize here. Note that we use CHASE_NONEXISTENT to handle the case
+         * where a symlink refers to another mount target; this works assuming the sub-mountpoint
+         * target is the final directory. */
+        r = canonicalize_mount_path(where_original, "where", prefix_sysroot, &where);
+        if (r < 0)
+                return r;
+        where_changed = r > 0;
+
+        if (prefix_sysroot && fstab_is_bind(options, fstype)) {
+                /* When in initrd, the source of bind mount needs to be prepended with /sysroot as well. */
+                _cleanup_free_ char *p = NULL;
+
+                r = canonicalize_mount_path(what, "what", prefix_sysroot, &p);
+                if (r < 0)
+                        return r;
+
+                free_and_replace(what, p);
+        }
+
+        log_debug("Found entry what=%s where=%s type=%s makefs=%s growfs=%s pcrfs=%s noauto=%s nofail=%s",
+                  what, where, strna(fstype),
+                  yes_no(flags & MOUNT_MAKEFS), yes_no(flags & MOUNT_GROWFS), yes_no(flags & MOUNT_PCRFS),
+                  yes_no(flags & MOUNT_NOAUTO), yes_no(flags & MOUNT_NOFAIL));
+
+        bool is_sysroot = in_initrd() && path_equal(where, "/sysroot");
+        /* See comment from add_sysroot_usr_mount() about the need for extra indirection in case /usr needs
+         * to be mounted in order for the root fs to be synthesized based on configuration included in /usr/,
+         * e.g. systemd-repart. */
+        bool is_sysroot_usr = in_initrd() && path_equal(where, "/sysroot/usr");
+
+        /* Pick the target to hook the mount into; the first matching condition wins. */
+        const char *target_unit =
+                        is_sysroot ?                        SPECIAL_INITRD_ROOT_FS_TARGET :
+                        is_sysroot_usr ?                    SPECIAL_INITRD_USR_FS_TARGET :
+                        prefix_sysroot ?                    SPECIAL_INITRD_FS_TARGET :
+                        mount_is_network(fstype, options) ? SPECIAL_REMOTE_FS_TARGET :
+                                                            SPECIAL_LOCAL_FS_TARGET;
+
+        /* nofail, noauto and x-systemd.automount don't make sense for critical filesystems we must mount in initrd. */
+        if (is_sysroot || is_sysroot_usr) {
+                r = mandatory_mount_drop_unapplicable_options(&flags, where, options, &opts);
+                if (r < 0)
+                        return r;
+                options = opts;
+        }
+
+        /* /sysroot/usr is not mounted directly: the file system goes to /sysusr/usr, and a bind mount to
+         * /sysroot/usr is synthesized further down. */
+        r = add_mount(source,
+                      arg_dest,
+                      what,
+                      is_sysroot_usr ? "/sysusr/usr" : where,
+                      !is_sysroot_usr && where_changed ? where_original : NULL,
+                      fstype,
+                      options,
+                      passno,
+                      flags,
+                      target_unit);
+        if (r <= 0)
+                return r;
+
+        if (is_sysroot_usr) {
+                log_debug("Synthesizing fstab entry what=/sysusr/usr where=/sysroot/usr opts=bind");
+                r = add_sysusr_sysroot_usr_bind_mount(source);
+                if (r < 0)
+                        return r;
+        }
+
+        return true;
+}
+
+/* Reads all entries of the fstab (the one below /sysroot if prefix_sysroot is true, the regular one
+ * otherwise) and passes each to parse_fstab_one(). A missing file is not an error.
+ *
+ * Returns the first error encountered, after still processing the remaining entries. If arg_sysroot_check
+ * is set, returns true as soon as one entry reports that a unit would be generated. Otherwise returns 0. */
+static int parse_fstab(bool prefix_sysroot) {
+        _cleanup_endmntent_ FILE *f = NULL;
+        const char *fstab;
+        int ret = 0;
+
+        if (!prefix_sysroot) {
+                fstab = fstab_path();
+                assert(!arg_sysroot_check);
+        } else
+                fstab = sysroot_fstab_path();
+
+        log_debug("Parsing %s...", fstab);
+
+        f = setmntent(fstab, "re");
+        if (!f) {
+                if (errno == ENOENT)
+                        return 0;
+
+                return log_error_errno(errno, "Failed to open %s: %m", fstab);
+        }
+
+        for (;;) {
+                struct mntent *me;
+                int r;
+
+                me = getmntent(f);
+                if (!me)
+                        break;
+
+                r = parse_fstab_one(fstab,
+                                    me->mnt_fsname, me->mnt_dir, me->mnt_type, me->mnt_opts, me->mnt_passno,
+                                    prefix_sysroot,
+                                    /* accept_root = */ false,
+                                    /* use_swap_enabled = */ true);
+                if (r < 0 && ret >= 0)
+                        ret = r; /* Remember the first failure only, but keep going. */
+                if (arg_sysroot_check && r > 0)
+                        return true;  /* We found a mount or swap that would be started… */
+        }
+
+        return ret;
+}
+
+/* Checks whether arg_root_what (which must be set) looks like an NFS root specification. Returns > 0 if
+ * so, 0 if not, negative if it starts like a bracketed IPv6 address but cannot be parsed as one. */
+static int sysroot_is_nfsroot(void) {
+        union in_addr_union u;
+        const char *addr;
+        int r;
+
+        assert(arg_root_what);
+
+        /* From dracut.cmdline(7).
+         *
+         * root=[<server-ip>:]<root-dir>[:<nfs-options>]
+         * root=nfs:[<server-ip>:]<root-dir>[:<nfs-options>],
+         * root=nfs4:[<server-ip>:]<root-dir>[:<nfs-options>],
+         * root={dhcp|dhcp6}
+         *
+         * mount nfs share from <server-ip>:/<root-dir>, if no server-ip is given, use dhcp next_server.
+         * If server-ip is an IPv6 address it has to be put in brackets, e.g. [2001:DB8::1]. NFS options
+         * can be appended with the prefix ":" or "," and are separated by ",". */
+
+        if (path_equal(arg_root_what, "/dev/nfs") ||
+            STR_IN_SET(arg_root_what, "dhcp", "dhcp6") ||
+            STARTSWITH_SET(arg_root_what, "nfs:", "nfs4:"))
+                return true;
+
+        /* Bracketed IPv6 address */
+        if (arg_root_what[0] == '[') {
+                const char *closing;
+
+                closing = strchr(arg_root_what + 1, ']');
+                if (!closing)
+                        return -EINVAL;
+
+                addr = strndupa_safe(arg_root_what + 1, closing - arg_root_what - 1);
+
+                r = in_addr_from_string(AF_INET6, addr, &u);
+                if (r < 0)
+                        return r;
+
+                return true;
+        }
+
+        /* IPv4 address, i.e. whatever precedes the first colon, if there is one */
+        const char *colon = strchr(arg_root_what, ':');
+        if (colon) {
+                addr = strndupa_safe(arg_root_what, colon - arg_root_what);
+
+                if (in_addr_from_string(AF_INET, addr, &u) >= 0)
+                        return true;
+        }
+
+        /* Root directory without address: any absolute path that is not below /dev */
+        if (!path_is_absolute(arg_root_what))
+                return false;
+
+        return !path_startswith(arg_root_what, "/dev");
+}
+
+/* Generates the mount unit for /sysroot from the root file system configuration in arg_root_what,
+ * arg_root_fstype, arg_root_options and arg_root_rw. Nothing is generated if no root is configured or if
+ * the configured root is handled elsewhere (gpt-auto, fstab, NFS, CIFS, iSCSI, live image).
+ *
+ * Returns 0 if nothing was generated, otherwise whatever add_mount() returns. */
+static int add_sysroot_mount(void) {
+        _cleanup_free_ char *what = NULL;
+        const char *opts, *fstype;
+        bool default_rw, makefs, rw;
+        MountPointFlags flags;
+        int r;
+
+        if (isempty(arg_root_what)) {
+                log_debug("Could not find a root= entry on the kernel command line.");
+                return 0;
+        }
+
+        if (streq(arg_root_what, "gpt-auto")) {
+                /* This is handled by gpt-auto-generator */
+                log_debug("Skipping root directory handling, as gpt-auto was requested.");
+                return 0;
+        } else if (streq(arg_root_what, "fstab")) {
+                /* This is handled by parse_fstab */
+                log_debug("Using initrd's fstab for /sysroot/ configuration.");
+                return 0;
+        }
+
+        r = sysroot_is_nfsroot();
+        if (r < 0)
+                log_debug_errno(r, "Failed to determine if the root directory is on NFS, assuming not: %m");
+        else if (r > 0) {
+                /* This is handled by the kernel or the initrd */
+                log_debug("Skipping root directory handling, as root on NFS was requested.");
+                return 0;
+        }
+
+        if (startswith(arg_root_what, "cifs://")) {
+                log_debug("Skipping root directory handling, as root on CIFS was requested.");
+                return 0;
+        }
+
+        if (startswith(arg_root_what, "iscsi:")) {
+                log_debug("Skipping root directory handling, as root on iSCSI was requested.");
+                return 0;
+        }
+
+        if (startswith(arg_root_what, "live:")) {
+                log_debug("Skipping root directory handling, as root on live image was requested.");
+                return 0;
+        }
+
+        if (streq(arg_root_what, "tmpfs")) {
+                /* If root=tmpfs is specified, then take this as shortcut for a writable tmpfs mount as root */
+
+                what = strdup("rootfs"); /* just a pretty name, to show up in /proc/self/mountinfo */
+                if (!what)
+                        return log_oom();
+
+                fstype = arg_root_fstype ?: "tmpfs"; /* tmpfs, unless overridden */
+
+                default_rw = true; /* writable, unless overridden */
+        } else {
+
+                what = fstab_node_to_udev_node(arg_root_what);
+                if (!what)
+                        return log_oom();
+
+                fstype = arg_root_fstype; /* if not specified explicitly, don't default to anything here */
+
+                default_rw = false; /* read-only, unless overridden */
+        }
+
+        /* arg_root_rw is used as a tri-state here: > 0 requests read-write, 0 read-only, < 0 means nothing
+         * was requested, in which case the per-root-type default applies. Compute this once so that the
+         * default is honoured regardless of whether additional root options were specified. */
+        rw = arg_root_rw > 0 || (arg_root_rw < 0 && default_rw);
+
+        if (!arg_root_options)
+                opts = rw ? "rw" : "ro";
+        else if (arg_root_rw >= 0 ||
+                 !fstab_test_option(arg_root_options, "ro\0" "rw\0"))
+                /* Either an explicit request is in effect, or the options do not say ro/rw at all: append
+                 * the access mode to the configured options. */
+                opts = strjoina(arg_root_options, ",", rw ? "rw" : "ro");
+        else
+                opts = arg_root_options;
+
+        /* Log the file system type that is actually passed to add_mount() below. */
+        log_debug("Found entry what=%s where=/sysroot type=%s opts=%s", what, strna(fstype), strempty(opts));
+
+        makefs = fstab_test_option(opts, "x-systemd.makefs\0");
+        flags = makefs * MOUNT_MAKEFS;
+
+        return add_mount("/proc/cmdline",
+                         arg_dest,
+                         what,
+                         "/sysroot",
+                         NULL,
+                         fstype,
+                         opts,
+                         is_device_path(what) ? 1 : 0, /* passno */
+                         flags,                        /* makefs off, pcrfs off, noauto off, nofail off, automount off */
+                         SPECIAL_INITRD_ROOT_FS_TARGET);
+}
+
+/* Generates the units for mounting the host's /usr/ from the initrd, based on arg_usr_what,
+ * arg_usr_fstype and arg_usr_options. Settings that are unset are inherited from the corresponding root
+ * file system settings (and stored in the arg_usr_* variables). */
+static int add_sysroot_usr_mount(void) {
+        _cleanup_free_ char *what = NULL;
+        MountPointFlags flags;
+        const char *opts;
+        int r;
+
+        /* Returns 0 if we didn't do anything, > 0 if we either generated a unit for the /usr/ mount, or we
+         * know for sure something else did */
+
+        if (!arg_usr_what && !arg_usr_fstype && !arg_usr_options)
+                return 0;
+
+        if (!arg_usr_what && arg_root_what) {
+                /* Copy over the root device, in case the /usr mount just differs in a mount option (consider btrfs subvolumes) */
+                arg_usr_what = strdup(arg_root_what);
+                if (!arg_usr_what)
+                        return log_oom();
+        }
+
+        if (!arg_usr_fstype && arg_root_fstype) {
+                arg_usr_fstype = strdup(arg_root_fstype);
+                if (!arg_usr_fstype)
+                        return log_oom();
+        }
+
+        if (!arg_usr_options && arg_root_options) {
+                arg_usr_options = strdup(arg_root_options);
+                if (!arg_usr_options)
+                        return log_oom();
+        }
+
+        if (isempty(arg_usr_what)) {
+                log_debug("Could not find a mount.usr= entry on the kernel command line.");
+                return 0;
+        }
+
+        if (streq(arg_usr_what, "gpt-auto")) {
+                /* This is handled by the gpt-auto generator */
+                log_debug("Skipping /usr/ directory handling, as gpt-auto was requested.");
+                return 1; /* systemd-gpt-auto-generator will generate a unit for this, hence report that a
+                           * unit file is being created for the host /usr/ mount. */
+        }
+
+        if (streq(arg_usr_what, "fstab")) {
+                /* This is handled by parse_fstab */
+                log_debug("Using initrd's fstab for /sysroot/usr/ configuration.");
+                return 1; /* parse_fstab will generate a unit for this, hence report that a
+                           * unit file is being created for the host /usr/ mount. */
+        }
+
+        if (path_equal(arg_usr_what, "/dev/nfs")) {
+                /* This is handled by the initrd (if at all supported, that is) */
+                log_debug("Skipping /usr/ directory handling, as /dev/nfs was requested.");
+                return 1; /* As above, report that NFS code will create the unit */
+        }
+
+        what = fstab_node_to_udev_node(arg_usr_what);
+        if (!what)
+                return log_oom();
+
+        if (!arg_usr_options)
+                opts = arg_root_rw > 0 ? "rw" : "ro";
+        else if (!fstab_test_option(arg_usr_options, "ro\0" "rw\0"))
+                opts = strjoina(arg_usr_options, ",", arg_root_rw > 0 ? "rw" : "ro");
+        else
+                opts = arg_usr_options;
+
+        /* When mounting /usr from the initrd, we add an extra level of indirection: we first mount the /usr/
+         * partition to /sysusr/usr/, and then afterwards bind mount that to /sysroot/usr/. We do this so
+         * that we can cover for systems that initially only have a /usr/ around and where the root fs needs
+         * to be synthesized, based on configuration included in /usr/, e.g. systemd-repart. Software like
+         * this should order itself after initrd-usr-fs.target and before initrd-fs.target; and it should
+         * look into both /sysusr/ and /sysroot/ for the configuration data to apply. */
+
+        log_debug("Found entry what=%s where=/sysusr/usr type=%s opts=%s", what, strna(arg_usr_fstype), strempty(opts));
+
+        flags = fstab_test_option(opts, "x-systemd.makefs\0") ? MOUNT_MAKEFS : 0;
+
+        r = add_mount("/proc/cmdline",
+                      arg_dest,
+                      what,
+                      "/sysusr/usr",
+                      /* original_where = */ NULL,
+                      arg_usr_fstype,
+                      opts,
+                      /* passno = */ is_device_path(what) ? 1 : 0,
+                      flags,
+                      SPECIAL_INITRD_USR_FS_TARGET);
+        if (r < 0)
+                return r;
+
+        log_debug("Synthesizing entry what=/sysusr/usr where=/sysroot/usr opts=bind");
+
+        r = add_sysusr_sysroot_usr_bind_mount("/proc/cmdline");
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+static int add_sysroot_usr_mount_or_fallback(void) {
+        int usr_result = add_sysroot_usr_mount();
+
+        /* Anything but zero is either an error or the report that a unit for the host /usr/ mount is
+         * being created; in both cases there's nothing to add here. */
+        if (usr_result != 0)
+                return usr_result;
+
+        /* Nothing was written out for /sysusr/usr/ nor /sysroot/usr/. Make sure initrd-usr-fs.target is
+         * at least ordered after sysroot.mount then, so that services ordering themselves after it get
+         * the guarantee that /usr/ is definitely mounted somewhere. */
+
+        return generator_add_symlink(arg_dest,
+                                     SPECIAL_INITRD_USR_FS_TARGET,
+                                     "requires",
+                                     "sysroot.mount");
+}
+
+static int add_volatile_root(void) {
+        /* Hook systemd-remount-volatile.service into the initrd root fs target if the root device shall
+         * be remounted as tmpfs or overlayfs (leaving only /usr from the root mount inside). */
+        bool wanted = arg_volatile_mode == VOLATILE_YES || arg_volatile_mode == VOLATILE_OVERLAY;
+
+        if (wanted)
+                return generator_add_symlink(arg_dest, SPECIAL_INITRD_ROOT_FS_TARGET, "requires",
+                                             SYSTEM_DATA_UNIT_DIR "/" SPECIAL_VOLATILE_ROOT_SERVICE);
+
+        return 0;
+}
+
+static int add_volatile_var(void) {
+        /* If requested (i.e. in VOLATILE_STATE mode), mount /var as tmpfs, but do so only if there's
+         * nothing else defined for this. In every other mode there is nothing to generate. */
+
+        if (arg_volatile_mode == VOLATILE_STATE)
+                return add_mount("/proc/cmdline",
+                                 arg_dest_late,
+                                 "tmpfs",
+                                 "/var",
+                                 NULL,
+                                 "tmpfs",
+                                 "mode=0755" TMPFS_LIMITS_VAR,
+                                 /* passno = */ 0,
+                                 /* flags = */ 0,
+                                 SPECIAL_LOCAL_FS_TARGET);
+
+        return 0;
+}
+
+static int add_mounts_from_cmdline(void) {
+        int r = 0;
+
+        /* Treat every mount collected from the kernel command line like an fstab entry. */
+
+        FOREACH_ARRAY(m, arg_mounts, arg_n_mounts) {
+                bool initrd = in_initrd();
+
+                /* Entries flagged for_initrd are skipped when we are not running in the initrd. */
+                if (m->for_initrd && !initrd)
+                        continue;
+
+                /* Only the remaining entries processed from within the initrd get prefix_sysroot set. */
+                RET_GATHER(r, parse_fstab_one("/proc/cmdline", m->what, m->where, m->fstype, m->options,
+                                              /* passno = */ -1,
+                                              /* prefix_sysroot = */ !m->for_initrd && initrd,
+                                              /* accept_root = */ true,
+                                              /* use_swap_enabled = */ false));
+        }
+
+        return r;
+}
+
+static int add_mounts_from_creds(bool prefix_sysroot) {
+        _cleanup_free_ void *data = NULL;
+        const char *cred_name;
+        struct mntent *entry;
+        size_t size;
+        int r;
+
+        assert(in_initrd() || !prefix_sysroot);
+
+        /* In the initrd, entries that are not to be prefixed come from a credential of their own. */
+        cred_name = in_initrd() && !prefix_sysroot ? "fstab.extra.initrd" : "fstab.extra";
+        r = read_credential_with_decryption(cred_name, &data, &size);
+        if (r <= 0) /* zero and errors alike are handed back unchanged */
+                return r;
+
+        /* Run the credential data through getmntent(), one parse_fstab_one() call per entry. */
+        _cleanup_fclose_ FILE *f = fmemopen_unlocked(data, size, "r");
+        if (!f)
+                return log_oom();
+
+        r = 0;
+        while ((entry = getmntent(f)))
+                RET_GATHER(r, parse_fstab_one("/run/credentials",
+                                              entry->mnt_fsname,
+                                              entry->mnt_dir,
+                                              entry->mnt_type,
+                                              entry->mnt_opts,
+                                              entry->mnt_passno,
+                                              /* prefix_sysroot = */ prefix_sysroot,
+                                              /* accept_root = */ true,
+                                              /* use_swap_enabled = */ true));
+
+        return r;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        int r;
+
+        assert(key);
+
+        /* Callback for proc_cmdline_parse(): stores one kernel command line option in the arg_* globals.
+         * root=, mount.usr=, mount.usrfstype= and rootfstype= may occur more than once, the last instance
+         * takes precedence. Multiple rootflags= or mount.usrflags= get concatenated (comma-separated). */
+
+        if (STR_IN_SET(key, "fstab", "rd.fstab")) {
+
+                r = value ? parse_boolean(value) : 1; /* a switch without a value counts as "on" */
+                if (r < 0)
+                        log_warning("Failed to parse fstab switch %s. Ignoring.", value);
+                else
+                        arg_fstab_enabled = fstab_set_enabled(r);
+
+        } else if (streq(key, "root")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_root_what, empty_to_null(value));
+
+        } else if (streq(key, "rootfstype")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_root_fstype, empty_to_null(value));
+
+        } else if (streq(key, "rootflags")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (!strextend_with_separator(&arg_root_options, ",", value)) /* accumulate, don't replace */
+                        return log_oom();
+
+        } else if (streq(key, "roothash")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_root_hash, empty_to_null(value));
+
+        } else if (streq(key, "mount.usr")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_usr_what, empty_to_null(value));
+
+        } else if (streq(key, "mount.usrfstype")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_usr_fstype, empty_to_null(value));
+
+        } else if (streq(key, "mount.usrflags")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (!strextend_with_separator(&arg_usr_options, ",", value)) /* accumulate, don't replace */
+                        return log_oom();
+
+        } else if (streq(key, "usrhash")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_usr_hash, empty_to_null(value));
+
+        } else if (streq(key, "rw") && !value) /* only the bare "rw"/"ro" switches are honoured */
+                arg_root_rw = true;
+        else if (streq(key, "ro") && !value)
+                arg_root_rw = false;
+        else if (streq(key, "systemd.volatile")) {
+                VolatileMode m;
+
+                if (value) {
+                        m = volatile_mode_from_string(value);
+                        if (m < 0)
+                                log_warning_errno(m, "Failed to parse systemd.volatile= argument: %s", value);
+                        else
+                                arg_volatile_mode = m;
+                } else
+                        arg_volatile_mode = VOLATILE_YES; /* bare switch */
+
+        } else if (streq(key, "systemd.swap")) {
+
+                r = value ? parse_boolean(value) : 1;
+                if (r < 0)
+                        log_warning("Failed to parse systemd.swap switch %s. Ignoring.", value);
+                else
+                        arg_swap_enabled = r;
+
+        } else if (streq(key, "systemd.verity")) {
+
+                r = value ? parse_boolean(value) : 1;
+                if (r < 0)
+                        log_warning("Failed to parse systemd.verity= kernel command line switch %s. Ignoring.", value);
+                else
+                        arg_verity = r;
+
+        } else if (STR_IN_SET(key, "systemd.mount-extra", "rd.systemd.mount-extra")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = mount_array_add(startswith(key, "rd."), value); /* 1st argument tells the rd. variant apart */
+                if (r < 0)
+                        log_warning("Failed to parse systemd.mount-extra= option, ignoring: %s", value);
+
+        } else if (STR_IN_SET(key, "systemd.swap-extra", "rd.systemd.swap-extra")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = mount_array_add_swap(startswith(key, "rd."), value);
+                if (r < 0)
+                        log_warning("Failed to parse systemd.swap-extra= option, ignoring: %s", value);
+        }
+
+        return 0; /* keys not listed above and values that merely failed to parse are not errors */
+}
+
+static int determine_device(
+                char **what,
+                int *rw,
+                char **options,
+                const char *hash,
+                const char *name) {
+
+        assert(what);
+        assert(name);
+
+        /* Fills in *what with the DM device /dev/mapper/<name> if Verity is used, i.e. if we have a hash
+         * but no device, and arg_verity is on. Returns 1 if the device was filled in, 0 if *what was
+         * left untouched, and log_oom()'s result if memory runs out. 'rw' and 'options' are optional
+         * outputs and may be NULL. */
+        if (*what || !hash || !arg_verity)
+                return 0;
+
+        *what = path_join("/dev/mapper/", name);
+        if (!*what)
+                return log_oom();
+
+        /* Verity is always read-only: clear the read/write flag and append "ro" to the options. */
+        if (rw)
+                *rw = false;
+
+        if (options) {
+                if (!strextend_with_separator(options, ",", "ro"))
+                        return log_oom();
+        }
+
+        log_info("Using verity %s device %s.", name, *what);
+        return 1;
+}
+
+static int determine_root(void) { /* root fs: the rw flag may get cleared, no option string is passed */
+        return determine_device(&arg_root_what, &arg_root_rw, NULL, arg_root_hash, "root");
+}
+
+static int determine_usr(void) { /* /usr/: "ro" may get appended to the options, no rw flag is passed */
+        return determine_device(&arg_usr_what, NULL, &arg_usr_options, arg_usr_hash, "usr");
+}
+
+/* If arg_sysroot_check is false, run as generator in the usual fashion.
+ * If it is true, check /sysroot/etc/fstab for any units that we'd want to mount
+ * in the initrd, and call daemon-reload. We will get reinvoked as a generator,
+ * with /sysroot/etc/fstab available, and then we can write additional units based
+ * on that file. */
+static int run_generator(void) {
+        int r = 0;
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        (void) determine_root(); /* best effort: results of both determine_*() calls are ignored */
+        (void) determine_usr();
+
+        if (arg_sysroot_check) {
+                r = parse_fstab(/* prefix_sysroot = */ true);
+                if (r == 0)
+                        log_debug("Nothing interesting found, not doing daemon-reload.");
+                if (r > 0)
+                        r = do_daemon_reload();
+                return r; /* check mode ends here, nothing below runs */
+        }
+
+        r = 0; /* a command line parse failure was only warned about; r now collects the results below */
+
+        /* Always honour root= and usr= in the kernel command line if we are in an initrd */
+        if (in_initrd()) {
+                RET_GATHER(r, add_sysroot_mount());
+
+                RET_GATHER(r, add_sysroot_usr_mount_or_fallback());
+
+                RET_GATHER(r, add_volatile_root());
+        } else
+                RET_GATHER(r, add_volatile_var());
+
+        /* Honour /etc/fstab only when that's enabled */
+        if (arg_fstab_enabled) {
+                /* Parse the local /etc/fstab, possibly from the initrd */
+                RET_GATHER(r, parse_fstab(/* prefix_sysroot = */ false));
+
+                /* If running in the initrd also parse the /etc/fstab from the host */
+                if (in_initrd())
+                        RET_GATHER(r, parse_fstab(/* prefix_sysroot = */ true));
+                else
+                        RET_GATHER(r, generator_enable_remount_fs_service(arg_dest));
+        }
+
+        RET_GATHER(r, add_mounts_from_cmdline()); /* mounts collected by parse_proc_cmdline_item() */
+
+        RET_GATHER(r, add_mounts_from_creds(/* prefix_sysroot = */ false));
+
+        if (in_initrd())
+                RET_GATHER(r, add_mounts_from_creds(/* prefix_sysroot = */ true));
+
+        return r;
+}
+
+static int run(int argc, char **argv) {
+        arg_sysroot_check = invoked_as(argv, "systemd-sysroot-fstab-check");
+
+        if (!arg_sysroot_check) {
+                /* Generator mode: we take either one or three arguments. With a single one the late
+                 * destination is the same as the regular one. */
+                log_setup_generator();
+
+                if (!IN_SET(strv_length(argv), 2, 4))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "This program takes one or three arguments.");
+
+                arg_dest = ASSERT_PTR(argv[1]);
+                arg_dest_late = ASSERT_PTR(argv[argc > 3 ? 3 : 1]);
+                return run_generator();
+        }
+
+        /* systemd-sysroot-fstab-check mode: no arguments accepted, and only useful in the initrd */
+        log_setup();
+
+        if (strv_length(argv) > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");
+        if (!in_initrd())
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program is only useful in the initrd.");
+
+        return run_generator();
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/fstab-generator/meson.build b/src/fstab-generator/meson.build
new file mode 100644
index 0000000..7b90580
--- /dev/null
+++ b/src/fstab-generator/meson.build
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        generator_template + {
+                'name' : 'systemd-fstab-generator',
+                'sources' : files('fstab-generator.c'),
+        },
+]
+
+meson.add_install_script(sh, '-c', # fstab-generator.c checks whether it was invoked as "systemd-sysroot-fstab-check"
+                         ln_s.format(systemgeneratordir / 'systemd-fstab-generator',
+                                     libexecdir / 'systemd-sysroot-fstab-check')) # presumably ln_s links this name to the generator
diff --git a/src/fundamental/bootspec-fundamental.c b/src/fundamental/bootspec-fundamental.c
new file mode 100644
index 0000000..b2841e3
--- /dev/null
+++ b/src/fundamental/bootspec-fundamental.c
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bootspec-fundamental.h"
+
+bool bootspec_pick_name_version_sort_key(
+                const sd_char *os_pretty_name,
+                const sd_char *os_image_id,
+                const sd_char *os_name,
+                const sd_char *os_id,
+                const sd_char *os_image_version,
+                const sd_char *os_version,
+                const sd_char *os_version_id,
+                const sd_char *os_build_id,
+                const sd_char **ret_name,
+                const sd_char **ret_version,
+                const sd_char **ret_sort_key) {
+
+        const sd_char *name;
+
+        /* Picks the best human readable title, version string and sort key for a boot entry out of the
+         * os-release(5) fields passed in. Precise is preferred over vague, and human readable over
+         * machine readable. Thus:
+         *
+         * 1. First priority gets the PRETTY_NAME field, which is the primary string intended for display,
+         *    and should already contain both a nice description and a version indication (if that concept
+         *    applies).
+         *
+         * 2. Otherwise we go for IMAGE_ID and IMAGE_VERSION (thus we show details about the image,
+         *    i.e. specific combination of packages and configuration), if that concept applies.
+         *
+         * 3. Otherwise we go for NAME and VERSION (i.e. human readable OS name and version)
+         *
+         * 4. Otherwise we go for ID and VERSION_ID (i.e. machine readable OS name and version)
+         *
+         * 5. Finally, for the version we'll use BUILD_ID (i.e. a machine readable version that identifies
+         *    the original OS build used during installation)
+         *
+         * Note that the display logic will show only the name by default, except if that isn't unique in
+         * which case the version is shown too.
+         *
+         * Note that name/version determined here are used only for display purposes. Boot entry preference
+         * sorting (i.e. algorithmic ordering of boot entries) is done based on the order of the sort key (if
+         * defined) or entry "id" string (i.e. entry file name) otherwise.
+         *
+         * Returns false, leaving all outputs untouched, if no name could be determined; true otherwise. */
+
+        name = os_pretty_name ?: os_image_id ?: os_name ?: os_id;
+        if (!name)
+                return false;
+
+        /* Every output pointer is optional; version and sort key may come out as NULL. */
+        if (ret_name)
+                *ret_name = name;
+
+        if (ret_version)
+                *ret_version = os_image_version ?: os_version ?: os_version_id ?: os_build_id;
+
+        if (ret_sort_key)
+                *ret_sort_key = os_image_id ?: os_id;
+
+        return true;
+}
diff --git a/src/fundamental/bootspec-fundamental.h b/src/fundamental/bootspec-fundamental.h
new file mode 100644
index 0000000..19b489c
--- /dev/null
+++ b/src/fundamental/bootspec-fundamental.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "string-util-fundamental.h"
+
+bool bootspec_pick_name_version_sort_key( /* false if none of the four name candidates is set */
+                const sd_char *os_pretty_name,
+                const sd_char *os_image_id,
+                const sd_char *os_name,
+                const sd_char *os_id,
+                const sd_char *os_image_version,
+                const sd_char *os_version,
+                const sd_char *os_version_id,
+                const sd_char *os_build_id,
+                const sd_char **ret_name,       /* the three ret_* outputs are optional, i.e. may be NULL */
+                const sd_char **ret_version,    /* may be set to NULL even on success */
+                const sd_char **ret_sort_key);  /* may be set to NULL even on success */
diff --git a/src/fundamental/confidential-virt-fundamental.h b/src/fundamental/confidential-virt-fundamental.h
new file mode 100644
index 0000000..986923e
--- /dev/null
+++ b/src/fundamental/confidential-virt-fundamental.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdint.h>
+/* Keep CVM detection logic in this file at feature parity with
+ * that in src/efi/boot/vmm.c */
+
+#define CPUID_PROCESSOR_INFO_AND_FEATURE_BITS UINT32_C(0x1)
+
+/*
+ * AMD64 Architecture Programmer’s Manual Volume 3:
+ * General-Purpose and System Instructions.
+ * Chapter: E4.1 - Maximum Extended Function Number and Vendor String
+ *  https://www.amd.com/system/files/TechDocs/24594.pdf
+ */
+#define CPUID_GET_HIGHEST_FUNCTION UINT32_C(0x80000000)
+
+/*
+ * AMD64 Architecture Programmer’s Manual Volume 3:
+ * General-Purpose and System Instructions.
+ * Chapter: E4.17 - Encrypted Memory Capabilities
+ *  https://www.amd.com/system/files/TechDocs/24594.pdf
+ */
+#define CPUID_AMD_GET_ENCRYPTED_MEMORY_CAPABILITIES UINT32_C(0x8000001f)
+
+/*
+ * AMD64 Architecture Programmer’s Manual Volume 3:
+ * General-Purpose and System Instructions.
+ * Chapter: 15.34.10 - SEV_STATUS MSR
+ * https://www.amd.com/system/files/TechDocs/24593.pdf
+ */
+#define MSR_AMD64_SEV UINT32_C(0xc0010131)
+
+/*
+ * Intel® TDX Module v1.5 Base Architecture Specification
+ * Chapter: 11.2
+ * https://www.intel.com/content/www/us/en/content-details/733575/intel-tdx-module-v1-5-base-architecture-specification.html
+ */
+
+#define CPUID_INTEL_TDX_ENUMERATION UINT32_C(0x21)
+
+/* Requirements for Implementing the Microsoft Hypervisor Interface
+ * https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/tlfs
+ */
+#define CPUID_HYPERV_VENDOR_AND_MAX_FUNCTIONS UINT32_C(0x40000000)
+
+#define CPUID_HYPERV_FEATURES UINT32_C(0x40000003)
+
+#define CPUID_HYPERV_ISOLATION_CONFIG UINT32_C(0x4000000C)
+
+#define CPUID_HYPERV_MIN UINT32_C(0x40000005)
+#define CPUID_HYPERV_MAX UINT32_C(0x4000ffff)
+
+#define CPUID_SIG_AMD       "AuthenticAMD"
+#define CPUID_SIG_INTEL     "GenuineIntel"
+#define CPUID_SIG_INTEL_TDX "IntelTDX    "
+#define CPUID_SIG_HYPERV    "Microsoft Hv"
+
+/* ecx bit 31: set => hypervisor, unset => bare metal */
+#define CPUID_FEATURE_HYPERVISOR (UINT32_C(1) << 31)
+
+/* Linux include/asm-generic/hyperv-tlfs.h */
+#define CPUID_HYPERV_CPU_MANAGEMENT (UINT32_C(1) << 12) /* root partition */
+#define CPUID_HYPERV_ISOLATION      (UINT32_C(1) << 22) /* confidential VM partition */
+
+#define CPUID_HYPERV_ISOLATION_TYPE_MASK UINT32_C(0xf)
+#define CPUID_HYPERV_ISOLATION_TYPE_SNP 2
+
+#define EAX_SEV     (UINT32_C(1) << 1)
+#define MSR_SEV     (UINT64_C(1) << 0)
+#define MSR_SEV_ES  (UINT64_C(1) << 1)
+#define MSR_SEV_SNP (UINT64_C(1) << 2)
diff --git a/src/fundamental/efivars-fundamental.c b/src/fundamental/efivars-fundamental.c
new file mode 100644
index 0000000..2ec3bfb
--- /dev/null
+++ b/src/fundamental/efivars-fundamental.c
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "efivars-fundamental.h"
+
+static const sd_char * const table[_SECURE_BOOT_MAX] = {
+        [SECURE_BOOT_UNSUPPORTED] = STR_C("unsupported"),
+        [SECURE_BOOT_DISABLED]    = STR_C("disabled"),
+        [SECURE_BOOT_UNKNOWN]     = STR_C("unknown"),
+        [SECURE_BOOT_AUDIT]       = STR_C("audit"),
+        [SECURE_BOOT_DEPLOYED]    = STR_C("deployed"),
+        [SECURE_BOOT_SETUP]       = STR_C("setup"),
+        [SECURE_BOOT_USER]        = STR_C("user"),
+};
+
+const sd_char *secure_boot_mode_to_string(SecureBootMode m) {
+        return (m >= 0 && m < _SECURE_BOOT_MAX) ? table[m] : NULL; /* NULL for anything outside the table */
+}
+
+SecureBootMode decode_secure_boot_mode(bool secure, bool audit, bool deployed, bool setup) {
+        /* See figure 32-4 Secure Boot Modes from UEFI Specification 2.9 */
+        if (secure) {
+                /* With "secure" set, any combination involving audit or setup is not a known mode. */
+                if (audit || setup)
+                        return SECURE_BOOT_UNKNOWN;
+
+                return deployed ? SECURE_BOOT_DEPLOYED : SECURE_BOOT_USER;
+        }
+
+        /* "secure" is unset from here on; together with "deployed" that is not a known mode either. */
+        if (deployed)
+                return SECURE_BOOT_UNKNOWN;
+        if (setup)
+                return audit ? SECURE_BOOT_AUDIT : SECURE_BOOT_SETUP;
+
+        /* Some firmware allows disabling secure boot while not being in setup mode unless the PK is cleared. */
+        return audit ? SECURE_BOOT_UNKNOWN : SECURE_BOOT_DISABLED;
+}
diff --git a/src/fundamental/efivars-fundamental.h b/src/fundamental/efivars-fundamental.h
new file mode 100644
index 0000000..2d25d22
--- /dev/null
+++ b/src/fundamental/efivars-fundamental.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if SD_BOOT /* test the value, as macro-fundamental.h does; #ifdef would also match SD_BOOT=0 */
+#  define EINVAL 22 /* <errno.h> is not included on this branch */
+#else
+#  include <errno.h>
+#endif
+#include "string-util-fundamental.h"
+
+/* Features of the loader, i.e. systemd-boot */
+#define EFI_LOADER_FEATURE_CONFIG_TIMEOUT          (UINT64_C(1) << 0)
+#define EFI_LOADER_FEATURE_CONFIG_TIMEOUT_ONE_SHOT (UINT64_C(1) << 1)
+#define EFI_LOADER_FEATURE_ENTRY_DEFAULT           (UINT64_C(1) << 2)
+#define EFI_LOADER_FEATURE_ENTRY_ONESHOT           (UINT64_C(1) << 3)
+#define EFI_LOADER_FEATURE_BOOT_COUNTING           (UINT64_C(1) << 4)
+#define EFI_LOADER_FEATURE_XBOOTLDR                (UINT64_C(1) << 5)
+#define EFI_LOADER_FEATURE_RANDOM_SEED             (UINT64_C(1) << 6)
+#define EFI_LOADER_FEATURE_LOAD_DRIVER             (UINT64_C(1) << 7)
+#define EFI_LOADER_FEATURE_SORT_KEY                (UINT64_C(1) << 8)
+#define EFI_LOADER_FEATURE_SAVED_ENTRY             (UINT64_C(1) << 9)
+#define EFI_LOADER_FEATURE_DEVICETREE              (UINT64_C(1) << 10)
+#define EFI_LOADER_FEATURE_SECUREBOOT_ENROLL       (UINT64_C(1) << 11)
+#define EFI_LOADER_FEATURE_RETAIN_SHIM             (UINT64_C(1) << 12)
+#define EFI_LOADER_FEATURE_MENU_DISABLE            (UINT64_C(1) << 13)
+
+/* Features of the stub, i.e. systemd-stub */
+#define EFI_STUB_FEATURE_REPORT_BOOT_PARTITION     (UINT64_C(1) << 0)
+#define EFI_STUB_FEATURE_PICK_UP_CREDENTIALS       (UINT64_C(1) << 1)
+#define EFI_STUB_FEATURE_PICK_UP_SYSEXTS           (UINT64_C(1) << 2)
+#define EFI_STUB_FEATURE_THREE_PCRS                (UINT64_C(1) << 3)
+#define EFI_STUB_FEATURE_RANDOM_SEED               (UINT64_C(1) << 4)
+#define EFI_STUB_FEATURE_CMDLINE_ADDONS            (UINT64_C(1) << 5)
+#define EFI_STUB_FEATURE_CMDLINE_SMBIOS            (UINT64_C(1) << 6)
+#define EFI_STUB_FEATURE_DEVICETREE_ADDONS         (UINT64_C(1) << 7)
+
+typedef enum SecureBootMode {
+        SECURE_BOOT_UNSUPPORTED,
+        SECURE_BOOT_DISABLED,
+        SECURE_BOOT_UNKNOWN,
+        SECURE_BOOT_AUDIT,
+        SECURE_BOOT_DEPLOYED,
+        SECURE_BOOT_SETUP,
+        SECURE_BOOT_USER,
+        _SECURE_BOOT_MAX,
+        _SECURE_BOOT_INVALID = -EINVAL,
+} SecureBootMode;
+
+const sd_char *secure_boot_mode_to_string(SecureBootMode m);
+SecureBootMode decode_secure_boot_mode(bool secure, bool audit, bool deployed, bool setup);
diff --git a/src/fundamental/logarithm.h b/src/fundamental/logarithm.h
new file mode 100644
index 0000000..0b03bbd
--- /dev/null
+++ b/src/fundamental/logarithm.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdint.h>
+
+/* Note: log2(0) == log2(1) == 0 here and below. */
+
+#define CONST_LOG2ULL(x) ((x) > 1 ? (unsigned) __builtin_clzll(x) ^ 63U : 0)
+#define NONCONST_LOG2ULL(x) ({                                     \
+                unsigned long long _x = (x);                       \
+                _x > 1 ? (unsigned) __builtin_clzll(_x) ^ 63U : 0; \
+        })
+#define LOG2ULL(x) __builtin_choose_expr(__builtin_constant_p(x), CONST_LOG2ULL(x), NONCONST_LOG2ULL(x))
+
+static inline unsigned log2u64(uint64_t x) {
+#if __SIZEOF_LONG_LONG__ == 8
+        return LOG2ULL(x); /* unsigned long long is 8 bytes here, so LOG2ULL() sees all of x */
+#else
+#  error "Wut?"
+#endif
+}
+
+static inline unsigned u32ctz(uint32_t n) {
+#if __SIZEOF_INT__ == 4
+        return n != 0 ? __builtin_ctz(n) : 32; /* the builtin's result is undefined for 0, hence the explicit 32 */
+#else
+#  error "Wut?"
+#endif
+}
+
+#define popcount(n)                                             \
+        _Generic((n),                                           \
+                 unsigned char: __builtin_popcount(n),          \
+                 unsigned short: __builtin_popcount(n),         \
+                 unsigned: __builtin_popcount(n),               \
+                 unsigned long: __builtin_popcountl(n),         \
+                 unsigned long long: __builtin_popcountll(n))
+
+#define CONST_LOG2U(x) ((x) > 1 ? __SIZEOF_INT__ * 8 - __builtin_clz(x) - 1 : 0)
+#define NONCONST_LOG2U(x) ({                                             \
+                unsigned _x = (x);                                       \
+                _x > 1 ? __SIZEOF_INT__ * 8 - __builtin_clz(_x) - 1 : 0; \
+        })
+#define LOG2U(x) __builtin_choose_expr(__builtin_constant_p(x), CONST_LOG2U(x), NONCONST_LOG2U(x))
+
+static inline unsigned log2i(int x) { /* NOTE(review): x reaches LOG2U() unchecked; confirm callers never pass negative values */
+        return LOG2U(x);
+}
+
+static inline unsigned log2u(unsigned x) {
+        return LOG2U(x);
+}
+
+static inline unsigned log2u_round_up(unsigned x) {
+        /* Smallest n such that 2^n >= x; inputs 0 and 1 both yield 0. */
+        if (x > 1)
+                return log2u(x - 1) + 1;
+        return 0;
+}
diff --git a/src/fundamental/macro-fundamental.h b/src/fundamental/macro-fundamental.h
new file mode 100644
index 0000000..797330d
--- /dev/null
+++ b/src/fundamental/macro-fundamental.h
@@ -0,0 +1,515 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if !SD_BOOT
+#  include <assert.h>
+#endif
+
+#include <limits.h>
+#include <stdalign.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Temporarily disable some warnings */
+#define DISABLE_WARNING_DEPRECATED_DECLARATIONS                         \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+
+#define DISABLE_WARNING_FORMAT_NONLITERAL                               \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wformat-nonliteral\"")
+
+#define DISABLE_WARNING_MISSING_PROTOTYPES                              \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wmissing-prototypes\"")
+
+#define DISABLE_WARNING_NONNULL                                         \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wnonnull\"")
+
+#define DISABLE_WARNING_SHADOW                                          \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wshadow\"")
+
+#define DISABLE_WARNING_INCOMPATIBLE_POINTER_TYPES                      \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wincompatible-pointer-types\"")
+
+#define DISABLE_WARNING_TYPE_LIMITS                                     \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Wtype-limits\"")
+
+#define DISABLE_WARNING_ADDRESS                                         \
+        _Pragma("GCC diagnostic push");                                 \
+        _Pragma("GCC diagnostic ignored \"-Waddress\"")
+
+#define REENABLE_WARNING                                                \
+        _Pragma("GCC diagnostic pop")
+
+#define _align_(x) __attribute__((__aligned__(x)))
+#define _alignas_(x) __attribute__((__aligned__(alignof(x)))) /* aligned like x, as reported by alignof() */
+#define _alignptr_ __attribute__((__aligned__(sizeof(void *)))) /* aligned to the size of a pointer */
+#define _cleanup_(x) __attribute__((__cleanup__(x))) /* x is called with the variable's address at scope exit */
+#define _const_ __attribute__((__const__))
+#define _deprecated_ __attribute__((__deprecated__))
+#define _destructor_ __attribute__((__destructor__))
+#define _hidden_ __attribute__((__visibility__("hidden")))
+#define _likely_(x) (__builtin_expect(!!(x), 1)) /* !! folds x to 0/1; the condition is hinted as usually true */
+#define _malloc_ __attribute__((__malloc__))
+#define _noinline_ __attribute__((noinline))
+#define _noreturn_ _Noreturn
+#define _packed_ __attribute__((__packed__))
+#define _printf_(a, b) __attribute__((__format__(printf, a, b))) /* a: format string index, b: first vararg index */
+#define _public_ __attribute__((__visibility__("default")))
+#define _pure_ __attribute__((__pure__))
+#define _retain_ __attribute__((__retain__))
+#define _returns_nonnull_ __attribute__((__returns_nonnull__))
+#define _section_(x) __attribute__((__section__(x)))
+#define _sentinel_ __attribute__((__sentinel__))
+#define _unlikely_(x) (__builtin_expect(!!(x), 0)) /* !! folds x to 0/1; the condition is hinted as usually false */
+#define _unused_ __attribute__((__unused__))
+#define _used_ __attribute__((__used__))
+#define _warn_unused_result_ __attribute__((__warn_unused_result__))
+#define _weak_ __attribute__((__weak__))
+#define _weakref_(x) __attribute__((__weakref__(#x))) /* x is stringified to name the target symbol */
+
+#ifdef __clang__
+#  define _alloc_(...) /* expands to nothing when building with clang */
+#else
+#  define _alloc_(...) __attribute__((__alloc_size__(__VA_ARGS__)))
+#endif
+
+#if __GNUC__ >= 7 || (defined(__clang__) && __clang_major__ >= 10)
+#  define _fallthrough_ __attribute__((__fallthrough__))
+#else
+#  define _fallthrough_ /* older compilers: expands to nothing */
+#endif
+
+#define XSTRINGIFY(x) #x /* stringifies x as written, without macro-expanding it */
+#define STRINGIFY(x) XSTRINGIFY(x) /* macro-expands x first, then stringifies the result */
+
+#ifndef __COVERITY__
+#  define VOID_0 ((void)0)
+#else
+#  define VOID_0 ((void*)0) /* Coverity builds get a null pointer instead of a void expression */
+#endif
+
+#define ELEMENTSOF(x) /* element count if x is an array; VOID_0 if x has a pointer type */ \
+        (__builtin_choose_expr(                                         \
+                !__builtin_types_compatible_p(typeof(x), typeof(&*(x))), \
+                sizeof(x)/sizeof((x)[0]),                               \
+                VOID_0))
+
+#define XCONCATENATE(x, y) x ## y
+#define CONCATENATE(x, y) XCONCATENATE(x, y)
+
+#if SD_BOOT
+        _noreturn_ void efi_assert(const char *expr, const char *file, unsigned line, const char *function);
+
+        #ifdef NDEBUG
+                #define assert(expr) ({ if (!(expr)) __builtin_unreachable(); }) /* expr is still evaluated */
+                #define assert_not_reached() __builtin_unreachable()
+        #else
+                #define assert(expr) ({ _likely_(expr) ? VOID_0 : efi_assert(#expr, __FILE__, __LINE__, __func__); })
+                #define assert_not_reached() efi_assert("Code should not be reached", __FILE__, __LINE__, __func__)
+        #endif
+        #define static_assert _Static_assert
+        #define assert_se(expr) ({ _likely_(expr) ? VOID_0 : efi_assert(#expr, __FILE__, __LINE__, __func__); }) /* not affected by NDEBUG */
+#endif
+
+/* Evaluates expr once, checks that it is not null (with assert(), resp. assert_se()) and passes it through. */
+#define ASSERT_PTR(expr) _ASSERT_PTR(expr, UNIQ_T(_expr_, UNIQ), assert)
+#define ASSERT_SE_PTR(expr) _ASSERT_PTR(expr, UNIQ_T(_expr_, UNIQ), assert_se)
+#define _ASSERT_PTR(expr, var, check)      \
+        ({                                 \
+                typeof(expr) var = (expr); \
+                check(var);                \
+                var;                       \
+        })
+
+#define ASSERT_NONNEG(expr) /* assert()s that expr >= 0 and yields expr, evaluated once */ \
+        ({                                               \
+                typeof(expr) _expr_ = (expr), _zero = 0; \
+                assert(_expr_ >= _zero);                 \
+                _expr_;                                  \
+        })
+
+#define ASSERT_SE_NONNEG(expr) /* same check, but through assert_se() */ \
+        ({                                               \
+                typeof(expr) _expr_ = (expr), _zero = 0; \
+                assert_se(_expr_ >= _zero);              \
+                _expr_;                                  \
+        })
+
+#define assert_cc(expr) static_assert(expr, #expr) /* compile-time check; the expression text is the message */
+
+#define UNIQ_T(x, uniq) CONCATENATE(__unique_prefix_, CONCATENATE(x, uniq)) /* identifier __unique_prefix_<x><uniq> */
+#define UNIQ __COUNTER__ /* a different integer on every expansion */
+
+/* Note that this works differently from pthread_once(): this macro does
+ * not synchronize code execution, i.e. code that is run conditionalized
+ * on this macro will run concurrently to all other code conditionalized
+ * the same way, there's no ordering or completion enforced. */
+#define ONCE __ONCE(UNIQ_T(_once_, UNIQ))
+#define __ONCE(o) /* yields the flag's previous value: false on the first evaluation, true afterwards */ \
+        ({                                                         \
+                static bool (o) = false;                           \
+                __atomic_exchange_n(&(o), true, __ATOMIC_SEQ_CST); \
+        })
+
+#undef MAX
+#define MAX(a, b) __MAX(UNIQ, (a), UNIQ, (b)) /* a and b are each evaluated exactly once */
+#define __MAX(aq, a, bq, b)                             \
+        ({                                              \
+                const typeof(a) UNIQ_T(A, aq) = (a);    \
+                const typeof(b) UNIQ_T(B, bq) = (b);    \
+                UNIQ_T(A, aq) > UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \
+        })
+
+#define IS_UNSIGNED_INTEGER_TYPE(type) \
+        (__builtin_types_compatible_p(typeof(type), unsigned char) ||   \
+         __builtin_types_compatible_p(typeof(type), unsigned short) ||  \
+         __builtin_types_compatible_p(typeof(type), unsigned) ||        \
+         __builtin_types_compatible_p(typeof(type), unsigned long) ||   \
+         __builtin_types_compatible_p(typeof(type), unsigned long long))
+
+#define IS_SIGNED_INTEGER_TYPE(type) \
+        (__builtin_types_compatible_p(typeof(type), signed char) ||   \
+         __builtin_types_compatible_p(typeof(type), signed short) ||  \
+         __builtin_types_compatible_p(typeof(type), signed) ||        \
+         __builtin_types_compatible_p(typeof(type), signed long) ||   \
+         __builtin_types_compatible_p(typeof(type), signed long long))
+
+/* Evaluates to (void) if _A or _B are not constant or of different types (being integers of different sizes
+ * is also OK as long as the signedness matches) */
+#define CONST_MAX(_A, _B) \
+        (__builtin_choose_expr(                                         \
+                __builtin_constant_p(_A) &&                             \
+                __builtin_constant_p(_B) &&                             \
+                (__builtin_types_compatible_p(typeof(_A), typeof(_B)) || \
+                 (IS_UNSIGNED_INTEGER_TYPE(_A) && IS_UNSIGNED_INTEGER_TYPE(_B)) || \
+                 (IS_SIGNED_INTEGER_TYPE(_A) && IS_SIGNED_INTEGER_TYPE(_B))), \
+                ((_A) > (_B)) ? (_A) : (_B),                            \
+                VOID_0))
+
+/* takes two types and returns the size of the larger one */
+#define MAXSIZE(A, B) (sizeof(union _packed_ { typeof(A) a; typeof(B) b; }))
+
+#define MAX3(x, y, z) /* the intermediate maximum of x and y is stored as typeof(x) */ \
+        ({                                              \
+                const typeof(x) _c = MAX(x, y);         \
+                MAX(_c, z);                             \
+        })
+
+#define MAX4(x, y, z, a) /* the intermediate maximum of x, y and z is stored as typeof(x) */ \
+        ({                                              \
+                const typeof(x) _d = MAX3(x, y, z);     \
+                MAX(_d, a);                             \
+        })
+
+#undef MIN
+#define MIN(a, b) __MIN(UNIQ, (a), UNIQ, (b)) /* a and b are each evaluated exactly once */
+#define __MIN(aq, a, bq, b)                             \
+        ({                                              \
+                const typeof(a) UNIQ_T(A, aq) = (a);    \
+                const typeof(b) UNIQ_T(B, bq) = (b);    \
+                UNIQ_T(A, aq) < UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \
+        })
+
+/* evaluates to (void) if _A or _B are not constant or of different types */
+#define CONST_MIN(_A, _B) \
+        (__builtin_choose_expr(                                         \
+                __builtin_constant_p(_A) &&                             \
+                __builtin_constant_p(_B) &&                             \
+                __builtin_types_compatible_p(typeof(_A), typeof(_B)),   \
+                ((_A) < (_B)) ? (_A) : (_B),                            \
+                VOID_0))
+
+#define MIN3(x, y, z)                                   \
+        ({                                              \
+                const typeof(x) _c = MIN(x, y);         \
+                MIN(_c, z);                             \
+        })
+
+/* Returns true if the passed integer is a positive power of two. Note: x is expanded three times here. */
+#define CONST_ISPOWEROF2(x)                     \
+        ((x) > 0 && ((x) & ((x) - 1)) == 0)
+
+#define ISPOWEROF2(x) /* same test; a non-constant x is copied to a temporary and so evaluated once */ \
+        __builtin_choose_expr(                                         \
+                __builtin_constant_p(x),                               \
+                CONST_ISPOWEROF2(x),                                   \
+                ({                                                     \
+                        const typeof(x) _x = (x);                      \
+                        CONST_ISPOWEROF2(_x);                          \
+                }))
+
+#define LESS_BY(a, b) __LESS_BY(UNIQ, (a), UNIQ, (b)) /* a - b if a > b, otherwise 0 */
+#define __LESS_BY(aq, a, bq, b)                         \
+        ({                                              \
+                const typeof(a) UNIQ_T(A, aq) = (a);    \
+                const typeof(b) UNIQ_T(B, bq) = (b);    \
+                UNIQ_T(A, aq) > UNIQ_T(B, bq) ? UNIQ_T(A, aq) - UNIQ_T(B, bq) : 0; \
+        })
+
+#define CMP(a, b) __CMP(UNIQ, (a), UNIQ, (b)) /* -1 if a < b, 1 if a > b, 0 otherwise */
+#define __CMP(aq, a, bq, b)                             \
+        ({                                              \
+                const typeof(a) UNIQ_T(A, aq) = (a);    \
+                const typeof(b) UNIQ_T(B, bq) = (b);    \
+                UNIQ_T(A, aq) < UNIQ_T(B, bq) ? -1 :    \
+                UNIQ_T(A, aq) > UNIQ_T(B, bq) ? 1 : 0;  \
+        })
+
+#undef CLAMP
+#define CLAMP(x, low, high) __CLAMP(UNIQ, (x), UNIQ, (low), UNIQ, (high)) /* high is tested before low */
+#define __CLAMP(xq, x, lowq, low, highq, high) /* high if x > high, else low if x < low, else x */ \
+        ({                                                              \
+                const typeof(x) UNIQ_T(X, xq) = (x);                    \
+                const typeof(low) UNIQ_T(LOW, lowq) = (low);            \
+                const typeof(high) UNIQ_T(HIGH, highq) = (high);        \
+                        UNIQ_T(X, xq) > UNIQ_T(HIGH, highq) ?           \
+                                UNIQ_T(HIGH, highq) :                   \
+                                UNIQ_T(X, xq) < UNIQ_T(LOW, lowq) ?     \
+                                        UNIQ_T(LOW, lowq) :             \
+                                        UNIQ_T(X, xq);                  \
+        })
+
+/* [(x + y - 1) / y] suffers from an integer overflow, even though the
+ * computation should be possible in the given type. Therefore, we use
+ * [x / y + !!(x % y)]. Note that on "Real CPUs" a division returns both the
+ * quotient and the remainder, so both should be equally fast. */
+#define DIV_ROUND_UP(x, y) __DIV_ROUND_UP(UNIQ, (x), UNIQ, (y)) /* x and y are each evaluated exactly once */
+#define __DIV_ROUND_UP(xq, x, yq, y)                                    \
+        ({                                                              \
+                const typeof(x) UNIQ_T(X, xq) = (x);                    \
+                const typeof(y) UNIQ_T(Y, yq) = (y);                    \
+                (UNIQ_T(X, xq) / UNIQ_T(Y, yq) + !!(UNIQ_T(X, xq) % UNIQ_T(Y, yq))); \
+        })
+
+/* Rounds up x to the next multiple of y (any y, not only powers of two). Yields (typeof(x)) -1 on overflow. */
+#define __ROUND_UP(q, x, y) /* computed as DIV_ROUND_UP(x, y) * y with an overflow-checked multiplication */ \
+        ({                                                              \
+                const typeof(y) UNIQ_T(A, q) = (y);                     \
+                const typeof(x) UNIQ_T(B, q) = DIV_ROUND_UP((x), UNIQ_T(A, q)); \
+                typeof(x) UNIQ_T(C, q);                                 \
+                __builtin_mul_overflow(UNIQ_T(B, q), UNIQ_T(A, q), &UNIQ_T(C, q)) ? (typeof(x)) -1 : UNIQ_T(C, q); \
+        })
+#define ROUND_UP(x, y) __ROUND_UP(UNIQ, (x), (y))
+
+#define  CASE_F_1(X)      case X:
+#define  CASE_F_2(X, ...) case X:  CASE_F_1( __VA_ARGS__)
+#define  CASE_F_3(X, ...) case X:  CASE_F_2( __VA_ARGS__)
+#define  CASE_F_4(X, ...) case X:  CASE_F_3( __VA_ARGS__)
+#define  CASE_F_5(X, ...) case X:  CASE_F_4( __VA_ARGS__)
+#define  CASE_F_6(X, ...) case X:  CASE_F_5( __VA_ARGS__)
+#define  CASE_F_7(X, ...) case X:  CASE_F_6( __VA_ARGS__)
+#define  CASE_F_8(X, ...) case X:  CASE_F_7( __VA_ARGS__)
+#define  CASE_F_9(X, ...) case X:  CASE_F_8( __VA_ARGS__)
+#define CASE_F_10(X, ...) case X:  CASE_F_9( __VA_ARGS__)
+#define CASE_F_11(X, ...) case X: CASE_F_10( __VA_ARGS__)
+#define CASE_F_12(X, ...) case X: CASE_F_11( __VA_ARGS__)
+#define CASE_F_13(X, ...) case X: CASE_F_12( __VA_ARGS__)
+#define CASE_F_14(X, ...) case X: CASE_F_13( __VA_ARGS__)
+#define CASE_F_15(X, ...) case X: CASE_F_14( __VA_ARGS__)
+#define CASE_F_16(X, ...) case X: CASE_F_15( __VA_ARGS__)
+#define CASE_F_17(X, ...) case X: CASE_F_16( __VA_ARGS__)
+#define CASE_F_18(X, ...) case X: CASE_F_17( __VA_ARGS__)
+#define CASE_F_19(X, ...) case X: CASE_F_18( __VA_ARGS__)
+#define CASE_F_20(X, ...) case X: CASE_F_19( __VA_ARGS__)
+
+#define GET_CASE_F(_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,NAME,...) NAME
+#define FOR_EACH_MAKE_CASE(...) \
+        GET_CASE_F(__VA_ARGS__,CASE_F_20,CASE_F_19,CASE_F_18,CASE_F_17,CASE_F_16,CASE_F_15,CASE_F_14,CASE_F_13,CASE_F_12,CASE_F_11, \
+                               CASE_F_10,CASE_F_9,CASE_F_8,CASE_F_7,CASE_F_6,CASE_F_5,CASE_F_4,CASE_F_3,CASE_F_2,CASE_F_1) \
+                   (__VA_ARGS__)
+
+#define IN_SET(x, first, ...) /* true if x equals one of the listed case-label constants (20 at most) */ \
+        ({                                                              \
+                bool _found = false;                                    \
+                /* If the build breaks in the line below, you need to extend the case macros. We use typeof(+x) \
+                 * here to widen the type of x if it is a bit-field as this would otherwise be illegal. */      \
+                static const typeof(+x) __assert_in_set[] _unused_ = { first, __VA_ARGS__ }; \
+                assert_cc(ELEMENTSOF(__assert_in_set) <= 20);           \
+                switch (x) {                                            \
+                FOR_EACH_MAKE_CASE(first, __VA_ARGS__)                  \
+                        _found = true;                                  \
+                        break;                                          \
+                default:                                                \
+                        break;                                          \
+                }                                                       \
+                _found;                                                 \
+        })
+
+/* Takes inspiration from Rust's Option::take() method: reads and returns a pointer, but at the same time
+ * resets it to NULL. See: https://doc.rust-lang.org/std/option/enum.Option.html#method.take */
+#define TAKE_GENERIC(var, type, nullvalue) /* yields the old value of var and stores nullvalue in var */ \
+        ({                                                       \
+                type *_pvar_ = &(var);                           \
+                type _var_ = *_pvar_;                            \
+                type _nullvalue_ = nullvalue;                    \
+                *_pvar_ = _nullvalue_;                           \
+                _var_;                                           \
+        })
+#define TAKE_PTR_TYPE(ptr, type) TAKE_GENERIC(ptr, type, NULL)
+#define TAKE_PTR(ptr) TAKE_PTR_TYPE(ptr, typeof(ptr))
+#define TAKE_STRUCT_TYPE(s, type) TAKE_GENERIC(s, type, {}) /* struct variant: s is reset to {} */
+#define TAKE_STRUCT(s) TAKE_STRUCT_TYPE(s, typeof(s))
+
+/*
+ * STRLEN - return the length of a string literal, minus the trailing NUL byte.
+ *          Contrary to strlen(), this is a constant expression.
+ * @x: a string literal.
+ */
+#define STRLEN(x) (sizeof(""x"") - sizeof(typeof(x[0]))) /* ""x"" only compiles if x is a string literal */
+
+#define mfree(memory) /* free()s memory and yields NULL of the same pointer type, e.g. to assign it back */ \
+        ({                                      \
+                free(memory);                   \
+                (typeof(memory)) NULL;          \
+        })
+
+static inline size_t ALIGN_TO(size_t l, size_t ali) {
+        /* Rounds l up to a multiple of ali (a power of two); SIZE_MAX reports that the result does not fit. */
+        assert(ISPOWEROF2(ali));
+        const size_t mask = ali - 1;
+        if (SIZE_MAX - mask < l)
+                return SIZE_MAX;
+        return (l + mask) & ~mask;
+}
+
+static inline uint64_t ALIGN_TO_U64(uint64_t l, uint64_t ali) {
+        /* Rounds l up to a multiple of ali (a power of two); UINT64_MAX reports that the result does not fit. */
+        assert(ISPOWEROF2(ali));
+        const uint64_t mask = ali - 1;
+        if (UINT64_MAX - mask < l)
+                return UINT64_MAX;
+        return (l + mask) & ~mask;
+}
+
+static inline size_t ALIGN_DOWN(size_t l, size_t ali) {
+        assert(ISPOWEROF2(ali));
+        const size_t mask = ali - 1;
+        return l & ~mask; /* largest multiple of ali that does not exceed l */
+}
+
+static inline uint64_t ALIGN_DOWN_U64(uint64_t l, uint64_t ali) {
+        assert(ISPOWEROF2(ali));
+        const uint64_t mask = ali - 1;
+        return l & ~mask; /* largest multiple of ali that does not exceed l */
+}
+
+static inline size_t ALIGN_OFFSET(size_t l, size_t ali) {
+        assert(ISPOWEROF2(ali));
+        const size_t mask = ali - 1;
+        return l & mask; /* distance of l from the preceding multiple of ali */
+}
+
+static inline uint64_t ALIGN_OFFSET_U64(uint64_t l, uint64_t ali) {
+        assert(ISPOWEROF2(ali));
+        const uint64_t mask = ali - 1;
+        return l & mask; /* distance of l from the preceding multiple of ali */
+}
+
+#define ALIGN2(l) ALIGN_TO(l, 2) /* round up to a multiple of 2 */
+#define ALIGN4(l) ALIGN_TO(l, 4) /* round up to a multiple of 4 */
+#define ALIGN8(l) ALIGN_TO(l, 8) /* round up to a multiple of 8 */
+#define ALIGN2_PTR(p) ((void*) ALIGN2((uintptr_t) (p))) /* (p) is parenthesized so "q + 1" is cast as a whole */
+#define ALIGN4_PTR(p) ((void*) ALIGN4((uintptr_t) (p)))
+#define ALIGN8_PTR(p) ((void*) ALIGN8((uintptr_t) (p)))
+#define ALIGN(l)  ALIGN_TO(l, sizeof(void*)) /* round up to a multiple of the pointer size */
+#define ALIGN_PTR(p) ((void*) ALIGN((uintptr_t) (p)))
+
+/* Checks if the specified pointer (any pointer expression, cast as a whole) is aligned for the specific type */
+#define IS_ALIGNED16(p) (((uintptr_t) (p)) % alignof(uint16_t) == 0)
+#define IS_ALIGNED32(p) (((uintptr_t) (p)) % alignof(uint32_t) == 0)
+#define IS_ALIGNED64(p) (((uintptr_t) (p)) % alignof(uint64_t) == 0)
+
+/* Same as ALIGN_TO but callable in constant contexts; VOID_0 if not constant, no power of 2, or overflowing. */
+#define CONST_ALIGN_TO(l, ali)                                         \
+        __builtin_choose_expr(                                         \
+                __builtin_constant_p(l) &&                             \
+                __builtin_constant_p(ali) &&                           \
+                CONST_ISPOWEROF2(ali) &&                               \
+                ((l) <= SIZE_MAX - ((ali) - 1)),  /* overflow? */      \
+                ((l) + (ali) - 1) & ~((ali) - 1),                      \
+                VOID_0)
+
+/* Similar to ((t *) (void *) (p)) to cast a pointer. The macro asserts that the pointer has a suitable
+ * alignment for type "t". This exists for places where otherwise "-Wcast-align=strict" would issue a
+ * warning or if you want to assert that the cast gives a pointer of suitable alignment. */
+#define CAST_ALIGN_PTR(t, p)                                    \
+        ({                                                      \
+                const void *_p = (p); /* p is evaluated once */ \
+                assert(((uintptr_t) _p) % alignof(t) == 0); \
+                (t *) _p; /* the result is not const-qualified */ \
+        })
+
+#define UPDATE_FLAG(orig, flag, b)                      \
+        ((b) ? ((orig) | (flag)) : ((orig) & ~(flag))) /* orig with flag set if b is true, cleared otherwise */
+#define SET_FLAG(v, flag, b) \
+        (v) = UPDATE_FLAG(v, flag, b) /* assigns the result back to v; v is expanded twice */
+#define FLAGS_SET(v, flags) \
+        ((~(v) & (flags)) == 0) /* true only if every bit of flags is set in v */
+
+/* A wrapper for 'func' to return void.
+ * Only useful when a void-returning function is required by some API. */
+#define DEFINE_TRIVIAL_DESTRUCTOR(name, type, func)             \
+        static inline void name(type *p) {                      \
+                func(p);                                        \
+        }
+
+/* When func() returns the void value (NULL, -1, …) of the appropriate type: defines func##p(), which stores
+ * that return value back into *p. func() is not called if *p is already zero. */
+#define DEFINE_TRIVIAL_CLEANUP_FUNC(type, func)                 \
+        static inline void func##p(type *p) {                   \
+                if (*p)                                         \
+                        *p = func(*p);                          \
+
+/* When func() doesn't return the appropriate type, set variable to empty afterwards.
+ * The func() may be provided by a dynamically loaded shared library, hence add an assertion. */
+#define DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(type, func, empty) /* defines func##p(); no-op if *p == empty */ \
+        static inline void func##p(type *p) {                   \
+                if (*p != (empty)) {                            \
+                        DISABLE_WARNING_ADDRESS;                \
+                        assert(func); /* -Waddress is silenced around this check */ \
+                        REENABLE_WARNING;                       \
+                        func(*p);                               \
+                        *p = (empty);                           \
+                }                                               \
+        }
+
+/* When func() doesn't return the appropriate type, and is also a macro, set variable to empty afterwards. */
+#define DEFINE_TRIVIAL_CLEANUP_FUNC_FULL_MACRO(type, func, empty)       \
+        static inline void func##p(type *p) {                           \
+                if (*p != (empty)) {                                    \
+                        func(*p);                                       \
+                        *p = (empty);                                   \
+                }                                                       \
+        }
+
+/* Declare a flexible array usable in a union.
+ * This is essentially a work-around for a pointless constraint in C99
+ * and might go away in some future version of the standard.
+ *
+ * See https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3080ea5553cc909b000d1f1d964a9041962f2c5b
+ */
+#define DECLARE_FLEX_ARRAY(type, name)                 \
+        struct {                                       \
+                dummy_t __empty__ ## name;             \
+                type name[];                           \
+        }
+
+/* Declares an ELF read-only string section that does not occupy memory at runtime. */
+#define DECLARE_NOALLOC_SECTION(name, text)   \
+        asm(".pushsection " name ",\"S\"\n\t" \
+            ".ascii " STRINGIFY(text) "\n\t"  \
+            ".zero 1\n\t"                     \
+            ".popsection\n")
+
+#ifdef SBAT_DISTRO
+        #define DECLARE_SBAT(text) DECLARE_NOALLOC_SECTION(".sbat", text)
+#else
+        #define DECLARE_SBAT(text)
+#endif
diff --git a/src/fundamental/memory-util-fundamental.h b/src/fundamental/memory-util-fundamental.h
new file mode 100644
index 0000000..6870f54
--- /dev/null
+++ b/src/fundamental/memory-util-fundamental.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stddef.h> /* size_t */
+
+#if SD_BOOT
+#  include "efi-string.h"
+#else
+#  include <string.h> /* memset(), explicit_bzero() */
+#endif
+
+#include "macro-fundamental.h"
+
+#define memzero(x, l) /* zeroes l bytes at x and yields x; memset() is not called when l is 0 */ \
+        ({                                                      \
+                size_t _l_ = (l);                               \
+                _l_ > 0 ? memset((x), 0, _l_) : (x);            \
+        })
+
+#if !SD_BOOT && HAVE_EXPLICIT_BZERO
+static inline void *explicit_bzero_safe(void *p, size_t l) {
+        if (!p || l == 0)
+                return p; /* nothing to erase */
+        explicit_bzero(p, l);
+        return p;
+}
+#else
+static inline void *explicit_bzero_safe(void *p, size_t l) {
+        if (!p || l == 0)
+                return p; /* nothing to erase */
+        memset(p, 0, l);
+        __asm__ __volatile__("" : : "r"(p) : "memory"); /* compiler barrier so the memset() is not elided */
+        return p;
+}
+#endif
+
+struct VarEraser {
+        /* NB: This is a pointer to memory to erase in case of CLEANUP_ERASE(). Pointer to pointer to memory
+         * to erase in case of CLEANUP_ERASE_PTR() */
+        void *p;
+        size_t size; /* number of bytes to erase */
+};
+
+static inline void erase_var(struct VarEraser *e) {
+        explicit_bzero_safe(e->p, e->size); /* here e->p points directly at the memory to wipe */
+}
+
+/* Mark var to be erased (all sizeof(var) bytes of it, via erase_var()) when leaving scope. */
+#define CLEANUP_ERASE(var)                                              \
+        _cleanup_(erase_var) _unused_ struct VarEraser CONCATENATE(_eraser_, UNIQ) = { \
+                .p = &(var),                                            \
+                .size = sizeof(var),                                    \
+        }
+
+static inline void erase_varp(struct VarEraser *e) {
+        /* Counterpart of erase_var() for the case where `p` holds the address of a pointer: the memory that
+         * pointer refers to is wiped, not the pointer itself. Nothing happens if `p` is NULL. */
+        void **target = e->p;
+
+        if (target)
+                explicit_bzero_safe(*target, e->size);
+}
+
+/* Mark pointer so that memory pointed to is erased when leaving scope. Note: this takes a pointer to the
+ * specified pointer, instead of just a copy of it. This is to allow callers to invalidate the pointer after
+ * use, if they like, disabling our automatic erasure (for example because they succeeded with whatever they
+ * wanted to do and now intend to return the allocated buffer to their caller without it being erased). */
+#define CLEANUP_ERASE_PTR(ptr, sz)                                      \
+        _cleanup_(erase_varp) _unused_ struct VarEraser CONCATENATE(_eraser_, UNIQ) = { \
+                .p = (ptr),                                             \
+                .size = (sz),                                           \
+        }
+
+typedef void (*free_array_func_t)(void *p, size_t n); /* receives the array pointer and its size */
+
+/* An automatic _cleanup_-like logic for destroying arrays (i.e. pointers + size) when leaving scope */
+typedef struct ArrayCleanup {
+        void **parray;           /* address of the caller's array pointer; array_cleanup() resets it to NULL */
+        size_t *pn;              /* address of the caller's size variable; array_cleanup() resets it to 0 */
+        free_array_func_t pfunc; /* called as pfunc(*parray, *pn) if *parray is not NULL */
+} ArrayCleanup;
+
+static inline void array_cleanup(const ArrayCleanup *c) {
+        /* Releases the array registered in c through c->pfunc, then resets the caller's pointer and size. */
+        assert(c);
+        assert(!c->parray == !c->pn); /* either both are set or both are NULL */
+
+        void **slot = c->parray;
+        if (!slot)
+                return; /* nothing registered */
+        void *items = *slot;
+        if (items) {
+                assert(c->pfunc);
+                c->pfunc(items, *c->pn);
+                *slot = NULL;
+        }
+        *c->pn = 0;
+}
+
+#define CLEANUP_ARRAY(array, n, func) /* at scope exit: func(array, n) if array is set; array = NULL, n = 0 */ \
+        _cleanup_(array_cleanup) _unused_ const ArrayCleanup CONCATENATE(_cleanup_array_, UNIQ) = { \
+                .parray = (void**) &(array),                            \
+                .pn = &(n),                                             \
+                .pfunc = (free_array_func_t) ({                         \
+                                void (*_f)(typeof(array[0]) *a, size_t b) = func; \
+                                _f; /* the typed temporary lets the compiler check func's signature */ \
+                        }),                                             \
+        }
diff --git a/src/fundamental/meson.build b/src/fundamental/meson.build
new file mode 100644
index 0000000..b7ca6cf
--- /dev/null
+++ b/src/fundamental/meson.build
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+fundamental_include = include_directories('.')
+
+fundamental_sources = files(
+        'bootspec-fundamental.c',
+        'efivars-fundamental.c',
+        'sha256.c',
+        'string-util-fundamental.c',
+        'uki.c',
+)
diff --git a/src/fundamental/sbat.h b/src/fundamental/sbat.h
new file mode 100644
index 0000000..9288e05
--- /dev/null
+++ b/src/fundamental/sbat.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#ifdef SBAT_DISTRO
+#  include "version.h"
+#  define SBAT_MAGIC "sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md\n"
+#  define SBAT_BOOT_SECTION_TEXT \
+        SBAT_MAGIC \
+        SBAT_PROJECT "-boot" ",1,The systemd Developers," SBAT_PROJECT "," PROJECT_VERSION "," PROJECT_URL "\n" \
+        SBAT_PROJECT "-boot" "." SBAT_DISTRO "," STRINGIFY(SBAT_DISTRO_GENERATION) "," SBAT_DISTRO_SUMMARY "," SBAT_DISTRO_PKGNAME "," SBAT_DISTRO_VERSION "," SBAT_DISTRO_URL "\n"
+#  define SBAT_STUB_SECTION_TEXT \
+        SBAT_MAGIC \
+        SBAT_PROJECT "-stub" ",1,The systemd Developers," SBAT_PROJECT "," PROJECT_VERSION "," PROJECT_URL "\n" \
+        SBAT_PROJECT "-stub" "." SBAT_DISTRO "," STRINGIFY(SBAT_DISTRO_GENERATION) "," SBAT_DISTRO_SUMMARY "," SBAT_DISTRO_PKGNAME "," SBAT_DISTRO_VERSION "," SBAT_DISTRO_URL "\n"
+#endif
diff --git a/src/fundamental/sha256.c b/src/fundamental/sha256.c
new file mode 100644
index 0000000..4389e9e
--- /dev/null
+++ b/src/fundamental/sha256.c
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* Stolen from glibc and converted to our style. In glibc it comes with the following copyright blurb: */
+
+/* Functions to compute SHA256 message digest of files or memory blocks.
+   according to the definition of SHA256 in FIPS 180-2.
+   Copyright (C) 2007-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdbool.h>
+#if SD_BOOT
+#  include "efi-string.h"
+#else
+#  include <string.h> /* memcpy() */
+#endif
+
+#include "macro-fundamental.h"
+#include "sha256.h"
+#include "unaligned-fundamental.h"
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define SWAP(n) /* reverses the byte order; presumably n is always a uint32_t (verify at call sites) */ \
+        (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
+# define SWAP64(n) /* 64-bit counterpart of SWAP() */ \
+        (((n) << 56)                            \
+         | (((n) & 0xff00) << 40)               \
+         | (((n) & 0xff0000) << 24)             \
+         | (((n) & 0xff000000) << 8)            \
+         | (((n) >> 8) & 0xff000000)            \
+         | (((n) >> 24) & 0xff0000)             \
+         | (((n) >> 40) & 0xff00)               \
+         | ((n) >> 56))
+#else
+# define SWAP(n) (n) /* host is not little endian: values are passed through unchanged */
+# define SWAP64(n) (n)
+#endif
+
+/* This array contains the bytes used to pad the buffer to the next
+   64-byte boundary.  (FIPS 180-2:5.1.1)  */
+static const uint8_t fillbuf[64] = {
+        0x80, 0 /* , 0, 0, ...  */
+};
+
+/* Constants for SHA256 from FIPS 180-2:4.2.2.  */
+static const uint32_t K[64] = {
+        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void sha256_process_block(const void *, size_t, struct sha256_ctx *);
+
+/* Reset CTX to the start of a new computation: load the initial hash
+   values of FIPS 180-2:5.3.2 and clear the byte counters.  */
+void sha256_init_ctx(struct sha256_ctx *ctx) {
+        static const uint32_t initial_hash[8] = {
+                0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+                0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
+        };
+
+        assert(ctx);
+
+        for (size_t i = 0; i < ELEMENTSOF(initial_hash); i++)
+                ctx->H[i] = initial_hash[i];
+
+        /* Nothing hashed and nothing buffered yet.  */
+        ctx->total64 = 0;
+        ctx->buflen = 0;
+}
+
+/* Pad and process the bytes still held in the internal buffer of CTX, append the
+   message length, and write the digest to RESBUF, which is also returned.  */
+uint8_t *sha256_finish_ctx(struct sha256_ctx *ctx, uint8_t resbuf[static SHA256_DIGEST_SIZE]) {
+        uint32_t bytes;
+        size_t pad;
+
+        assert(ctx);
+        assert(resbuf);
+
+        /* Take yet unprocessed bytes into account, reading CTX only after it was checked.  */
+        bytes = ctx->buflen;
+        ctx->total64 += bytes;
+        /* Pad to 56 bytes modulo 64, so that the 8 length bytes complete the last block.  */
+        pad = bytes >= 56 ? 64 + 56 - bytes : 56 - bytes;
+        memcpy(&ctx->buffer[bytes], fillbuf, pad);
+
+        /* Put the 64-bit file length in *bits* at the end of the buffer.  */
+        ctx->buffer32[(bytes + pad + 4) / 4] = SWAP(ctx->total[TOTAL64_low] << 3);
+        ctx->buffer32[(bytes + pad) / 4] = SWAP((ctx->total[TOTAL64_high] << 3)
+                                                | (ctx->total[TOTAL64_low] >> 29));
+
+        /* Process last bytes.  */
+        sha256_process_block(ctx->buffer, bytes + pad + 8, ctx);
+
+        /* Put result from CTX in first 32 bytes following RESBUF.  */
+        for (size_t i = 0; i < 8; ++i)
+                unaligned_write_ne32(resbuf + i * sizeof(uint32_t), SWAP(ctx->H[i]));
+        return resbuf;
+}
+
+void sha256_process_bytes(const void *buffer, size_t len, struct sha256_ctx *ctx) {
+        assert(buffer);
+        assert(ctx);
+
+        /* Hash LEN bytes at BUFFER into CTX.  When the internal buffer already
+           holds some bytes, concatenate both inputs first.  */
+
+        if (ctx->buflen != 0) {
+                size_t left_over = ctx->buflen;
+                size_t add = 128 - left_over > len ? len : 128 - left_over; /* capped by the room left in the 128-byte buffer */
+
+                memcpy(&ctx->buffer[left_over], buffer, add);
+                ctx->buflen += add;
+
+                if (ctx->buflen > 64) {
+                        sha256_process_block(ctx->buffer, ctx->buflen & ~63, ctx);
+
+                        ctx->buflen &= 63;
+                        /* The regions in the following copy operation cannot overlap.  */
+                        memcpy(ctx->buffer, &ctx->buffer[(left_over + add) & ~63],
+                                ctx->buflen);
+                }
+
+                buffer = (const char *) buffer + add;
+                len -= add;
+        }
+
+        /* Process available complete blocks.  */
+        if (len >= 64) {
+                if (IS_ALIGNED32(buffer)) {
+                        sha256_process_block(buffer, len & ~63, ctx);
+                        buffer = (const char *) buffer + (len & ~63);
+                        len &= 63;
+                } else /* IS_ALIGNED32() failed: copy each block into ctx->buffer before hashing it */
+                        while (len > 64) { /* a last block of exactly 64 bytes is left to the code below */
+                                memcpy(ctx->buffer, buffer, 64);
+                                sha256_process_block(ctx->buffer, 64, ctx);
+                                buffer = (const char *) buffer + 64;
+                                len -= 64;
+                        }
+        }
+
+        /* Move remaining bytes into internal buffer.  */
+        if (len > 0) {
+                size_t left_over = ctx->buflen;
+
+                memcpy(&ctx->buffer[left_over], buffer, len);
+                left_over += len;
+                if (left_over >= 64) {
+                        sha256_process_block(ctx->buffer, 64, ctx);
+                        left_over -= 64;
+                        memcpy(ctx->buffer, &ctx->buffer[64], left_over);
+                }
+                ctx->buflen = left_over;
+        }
+}
+
+/* Run the SHA-256 compression function over LEN bytes of BUFFER, updating the
+   hash state in CTX.  It is assumed that LEN % 64 == 0.  */
+static void sha256_process_block(const void *buffer, size_t len, struct sha256_ctx *ctx) {
+        const uint32_t *words = ASSERT_PTR(buffer); /* the input is read as 32-bit words */
+        size_t nwords = len / sizeof(uint32_t);
+
+        assert(ctx);
+
+        uint32_t a = ctx->H[0];
+        uint32_t b = ctx->H[1];
+        uint32_t c = ctx->H[2];
+        uint32_t d = ctx->H[3];
+        uint32_t e = ctx->H[4];
+        uint32_t f = ctx->H[5];
+        uint32_t g = ctx->H[6];
+        uint32_t h = ctx->H[7];
+
+        /* First increment the byte count.  FIPS 180-2 specifies the possible
+           length of the file up to 2^64 bits.  Here we only compute the
+           number of bytes.  */
+        ctx->total64 += len;
+
+        /* Process all bytes in the buffer with 64 bytes in each round of
+           the loop.  */
+        while (nwords > 0) {
+                uint32_t W[64];
+                uint32_t a_save = a;
+                uint32_t b_save = b;
+                uint32_t c_save = c;
+                uint32_t d_save = d;
+                uint32_t e_save = e;
+                uint32_t f_save = f;
+                uint32_t g_save = g;
+                uint32_t h_save = h;
+
+                /* Operators defined in FIPS 180-2:4.1.2.  */
+#define Ch(x, y, z) ((x & y) ^ (~x & z))
+#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
+#define S0(x) (CYCLIC (x, 2) ^ CYCLIC (x, 13) ^ CYCLIC (x, 22))
+#define S1(x) (CYCLIC (x, 6) ^ CYCLIC (x, 11) ^ CYCLIC (x, 25))
+#define R0(x) (CYCLIC (x, 7) ^ CYCLIC (x, 18) ^ (x >> 3))
+#define R1(x) (CYCLIC (x, 17) ^ CYCLIC (x, 19) ^ (x >> 10))
+
+                /* It is unfortunate that C does not provide an operator for
+                   cyclic rotation.  Hope the C compiler is smart enough.  */
+#define CYCLIC(w, s) ((w >> s) | (w << (32 - s)))
+
+                /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
+                for (size_t t = 0; t < 16; ++t) {
+                        W[t] = SWAP (*words);
+                        ++words;
+                }
+                for (size_t t = 16; t < 64; ++t)
+                        W[t] = R1 (W[t - 2]) + W[t - 7] + R0 (W[t - 15]) + W[t - 16];
+
+                /* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
+                for (size_t t = 0; t < 64; ++t) {
+                        uint32_t T1 = h + S1 (e) + Ch (e, f, g) + K[t] + W[t];
+                        uint32_t T2 = S0 (a) + Maj (a, b, c);
+                        h = g;
+                        g = f;
+                        f = e;
+                        e = d + T1;
+                        d = c;
+                        c = b;
+                        b = a;
+                        a = T1 + T2;
+                }
+
+                /* Add the starting values of the context according to FIPS 180-2:6.2.2
+                   step 4.  */
+                a += a_save;
+                b += b_save;
+                c += c_save;
+                d += d_save;
+                e += e_save;
+                f += f_save;
+                g += g_save;
+                h += h_save;
+
+                /* 16 words consumed; nwords would wrap if LEN were not a multiple of 64.  */
+                nwords -= 16;
+        }
+
+        /* Put checksum in context given as argument.  */
+        ctx->H[0] = a;
+        ctx->H[1] = b;
+        ctx->H[2] = c;
+        ctx->H[3] = d;
+        ctx->H[4] = e;
+        ctx->H[5] = f;
+        ctx->H[6] = g;
+        ctx->H[7] = h;
+}
+
+uint8_t* sha256_direct(const void *buffer, size_t sz, uint8_t result[static SHA256_DIGEST_SIZE]) {
+        struct sha256_ctx one_shot; /* initialized by sha256_init_ctx() below */
+        sha256_init_ctx(&one_shot);
+        sha256_process_bytes(buffer, sz, &one_shot);
+        return sha256_finish_ctx(&one_shot, result); /* hands back the result pointer */
+}
diff --git a/src/fundamental/sha256.h b/src/fundamental/sha256.h
new file mode 100644
index 0000000..dbb08e3
--- /dev/null
+++ b/src/fundamental/sha256.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+
+#define SHA256_DIGEST_SIZE 32
+
+struct sha256_ctx {
+        uint32_t H[8]; /* current hash state (eight 32-bit words) */
+
+        union {
+                uint64_t total64; /* number of message bytes processed so far */
+#define TOTAL64_low (1 - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#define TOTAL64_high (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+                uint32_t total[2]; /* the two 32-bit halves of total64, indexed by TOTAL64_low/high */
+        };
+
+        uint32_t buflen; /* number of bytes currently stored in buffer */
+
+        union {
+                uint8_t  buffer[128]; /* NB: always correctly aligned for UINT32.  */
+                uint32_t buffer32[32];
+                uint64_t buffer64[16];
+        };
+};
+
+void sha256_init_ctx(struct sha256_ctx *ctx);
+uint8_t *sha256_finish_ctx(struct sha256_ctx *ctx, uint8_t resbuf[static SHA256_DIGEST_SIZE]);
+void sha256_process_bytes(const void *buffer, size_t len, struct sha256_ctx *ctx);
+
+static inline void sha256_process_bytes_and_size(const void *buffer, size_t len, struct sha256_ctx *ctx) {
+        sha256_process_bytes(&len, sizeof(len), ctx); /* first hash the length itself, as a native size_t */
+        sha256_process_bytes(buffer, len, ctx); /* then the data */
+}
+
+uint8_t* sha256_direct(const void *buffer, size_t sz, uint8_t result[static SHA256_DIGEST_SIZE]);
+
+#define SHA256_DIRECT(buffer, sz) sha256_direct(buffer, sz, (uint8_t[SHA256_DIGEST_SIZE]) {}) /* digest lands in a compound literal */
diff --git a/src/fundamental/string-util-fundamental.c b/src/fundamental/string-util-fundamental.c
new file mode 100644
index 0000000..a5bafc6
--- /dev/null
+++ b/src/fundamental/string-util-fundamental.c
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if !SD_BOOT
+#  include 
+#endif
+
+#include "macro-fundamental.h"
+#include "string-util-fundamental.h"
+
+sd_char *startswith(const sd_char *s, const sd_char *prefix) {
+        /* Returns a pointer to the first character of s after prefix, or NULL if s does
+         * not begin with prefix. */
+        assert(s);
+        assert(prefix);
+
+        size_t prefix_len = strlen(prefix);
+        if (strneq(s, prefix, prefix_len))
+                return (sd_char*) s + prefix_len;
+
+        return NULL;
+}
+
+sd_char *startswith_no_case(const sd_char *s, const sd_char *prefix) {
+        /* Returns a pointer to the first character of s after prefix, or NULL if s does
+         * not begin with prefix. The prefix is matched with strncaseeq(). */
+        assert(s);
+        assert(prefix);
+
+        size_t prefix_len = strlen(prefix);
+        if (strncaseeq(s, prefix, prefix_len))
+                return (sd_char*) s + prefix_len;
+
+        return NULL;
+}
+
+sd_char* endswith(const sd_char *s, const sd_char *postfix) {
+        /* Returns a pointer to where postfix begins inside s, or NULL if s does not end
+         * in postfix. For an empty postfix this is the terminating NUL of s. */
+        assert(s);
+        assert(postfix);
+
+        size_t s_len = strlen(s), postfix_len = strlen(postfix);
+
+        if (postfix_len == 0)
+                return (sd_char*) s + s_len;
+
+        if (postfix_len > s_len)
+                return NULL;
+
+        const sd_char *tail = s + s_len - postfix_len;
+        if (strcmp(tail, postfix) == 0)
+                return (sd_char*) tail;
+
+        return NULL;
+}
+
+sd_char* endswith_no_case(const sd_char *s, const sd_char *postfix) {
+        /* Returns a pointer to where postfix begins inside s, or NULL if s does not end
+         * in postfix. The tail of s is matched with strcasecmp(). */
+        assert(s);
+        assert(postfix);
+
+        size_t s_len = strlen(s), postfix_len = strlen(postfix);
+
+        if (postfix_len == 0)
+                return (sd_char*) s + s_len;
+
+        if (postfix_len > s_len)
+                return NULL;
+
+        const sd_char *tail = s + s_len - postfix_len;
+        if (strcasecmp(tail, postfix) == 0)
+                return (sd_char*) tail;
+
+        return NULL;
+}
+
+static bool is_valid_version_char(sd_char a) {
+        return IN_SET(a, '~', '-', '^', '.') || ascii_isalpha(a) || ascii_isdigit(a);
+}
+
+int strverscmp_improved(const sd_char *a, const sd_char *b) {
+        /* This function is similar to strverscmp(3), but it treats '-' and '.' as separators.
+         * It returns < 0, 0 or > 0 if a is older than, equivalent to, or newer than b.
+         * The logic is based on rpm's rpmvercmp(), but unlike rpmvercmp(), it distinguishes e.g.
+         * '123a' and '123.a', with '123a' being newer.
+         *
+         * It allows direct comparison of strings which contain both a version and a release; e.g.
+         * '247.2-3.1.fc33.x86_64' or '5.11.0-0.rc5.20210128git76c057c84d28.137.fc34'.
+         *
+         * The input string is split into segments. Each segment is numeric or alphabetic, and may be
+         * prefixed with the following:
+         *  '~' : used for pre-releases, a segment prefixed with this is the oldest,
+         *  '-' : used for the separator between version and release,
+         *  '^' : used for patched releases, a segment with this is newer than one with '-'.
+         *  '.' : used for point releases.
+         * Note that no prefix segment is the newest. All non-supported characters are dropped, and
+         * handled as a separator of segments, e.g., '123_a' is equivalent to '123a'.
+         *
+         * By using this, version strings can be sorted like following:
+         *  (older) 122.1
+         *     ^    123~rc1-1
+         *     |    123
+         *     |    123-a
+         *     |    123-a.1
+         *     |    123-1
+         *     |    123-1.1
+         *     |    123^post1
+         *     |    123.a-1
+         *     |    123.1-1
+         *     v    123a-1
+         *  (newer) 124-1
+         */
+
+        a = strempty(a); /* NULL is handled like an empty string */
+        b = strempty(b);
+
+        for (;;) {
+                const sd_char *aa, *bb;
+                int r;
+
+                /* Drop leading invalid characters. */
+                while (*a != '\0' && !is_valid_version_char(*a))
+                        a++;
+                while (*b != '\0' && !is_valid_version_char(*b))
+                        b++;
+
+                /* Handle '~'. Used for pre-releases, e.g. 123~rc1, or 4.5~alpha1 */
+                if (*a == '~' || *b == '~') {
+                        /* The string prefixed with '~' is older. */
+                        r = CMP(*a != '~', *b != '~');
+                        if (r != 0)
+                                return r;
+
+                        /* Now both strings are prefixed with '~'. Compare remaining strings. */
+                        a++;
+                        b++;
+                }
+
+                /* If at least one string reaches the end, then longer is newer.
+                 * Note that except for '~' prefixed segments, a string which has more segments is newer.
+                 * So, this check must be after the '~' check. */
+                if (*a == '\0' || *b == '\0')
+                        return CMP(*a, *b);
+
+                /* Handle '-', which separates version and release, e.g 123.4-3.1.fc33.x86_64 */
+                if (*a == '-' || *b == '-') {
+                        /* The string prefixed with '-' is older (e.g., 123-9 vs 123.1-1) */
+                        r = CMP(*a != '-', *b != '-');
+                        if (r != 0)
+                                return r;
+
+                        a++;
+                        b++;
+                }
+
+                /* Handle '^'. Used for patched release. */
+                if (*a == '^' || *b == '^') {
+                        r = CMP(*a != '^', *b != '^');
+                        if (r != 0)
+                                return r;
+
+                        a++;
+                        b++;
+                }
+
+                /* Handle '.'. Used for point releases. */
+                if (*a == '.' || *b == '.') {
+                        r = CMP(*a != '.', *b != '.');
+                        if (r != 0)
+                                return r;
+
+                        a++;
+                        b++;
+                }
+
+                if (ascii_isdigit(*a) || ascii_isdigit(*b)) {
+                        /* Find the leading numeric segments. One may be an empty string. So,
+                         * numeric segments are always newer than alpha segments. */
+                        for (aa = a; ascii_isdigit(*aa); aa++)
+                                ;
+                        for (bb = b; ascii_isdigit(*bb); bb++)
+                                ;
+
+                        /* Check if one of the strings was empty, but the other not. */
+                        r = CMP(a != aa, b != bb);
+                        if (r != 0)
+                                return r;
+
+                        /* Skip leading '0', to make 00123 equivalent to 123. */
+                        while (*a == '0')
+                                a++;
+                        while (*b == '0')
+                                b++;
+
+                        /* To compare numeric segments without parsing their values, first compare the
+                         * lengths of the segments. Eg. 12345 vs 123, longer is newer. */
+                        r = CMP(aa - a, bb - b);
+                        if (r != 0)
+                                return r;
+
+                        /* Then, compare them as strings. */
+                        r = CMP(strncmp(a, b, aa - a), 0);
+                        if (r != 0)
+                                return r;
+                } else {
+                        /* Find the leading non-numeric segments. */
+                        for (aa = a; ascii_isalpha(*aa); aa++)
+                                ;
+                        for (bb = b; ascii_isalpha(*bb); bb++)
+                                ;
+
+                        /* Note that the segments are usually not NUL-terminated. */
+                        r = CMP(strncmp(a, b, MIN(aa - a, bb - b)), 0);
+                        if (r != 0)
+                                return r;
+
+                        /* Longer is newer, e.g. abc vs abcde. */
+                        r = CMP(aa - a, bb - b);
+                        if (r != 0)
+                                return r;
+                }
+
+                /* The current segments are equivalent. Let's move to the next one. */
+                a = aa;
+                b = bb;
+        }
+}
diff --git a/src/fundamental/string-util-fundamental.h b/src/fundamental/string-util-fundamental.h
new file mode 100644
index 0000000..b537b2e
--- /dev/null
+++ b/src/fundamental/string-util-fundamental.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if SD_BOOT
+#  include "efi.h"
+#  include "efi-string.h"
+#else
+#  include 
+#endif
+
+#include "macro-fundamental.h"
+
+#if SD_BOOT
+#  define strlen strlen16
+#  define strcmp strcmp16
+#  define strncmp strncmp16
+#  define strcasecmp strcasecmp16
+#  define strncasecmp strncasecmp16
+#  define STR_C(str)       (L ## str)
+typedef char16_t sd_char;
+#else
+#  define STR_C(str)       (str)
+typedef char sd_char;
+#endif
+
+#define streq(a,b) (strcmp((a),(b)) == 0)
+#define strneq(a, b, n) (strncmp((a), (b), (n)) == 0)
+#define strcaseeq(a,b) (strcasecmp((a),(b)) == 0)
+#define strncaseeq(a, b, n) (strncasecmp((a), (b), (n)) == 0)
+
+static inline int strcmp_ptr(const sd_char *a, const sd_char *b) {
+        /* NULL-tolerant strcmp(): if either side is NULL the pointers themselves are compared. */
+        if (!a || !b)
+                return CMP(a, b);
+        return strcmp(a, b);
+}
+
+static inline int strcasecmp_ptr(const sd_char *a, const sd_char *b) {
+        /* NULL-tolerant strcasecmp(): if either side is NULL the pointers themselves are compared. */
+        if (!a || !b)
+                return CMP(a, b);
+        return strcasecmp(a, b);
+}
+
+static inline bool streq_ptr(const sd_char *a, const sd_char *b) {
+        return (a && b) ? strcmp(a, b) == 0 : a == b; /* with a NULL involved, only NULL equals NULL */
+}
+
+static inline bool strcaseeq_ptr(const sd_char *a, const sd_char *b) {
+        return (a && b) ? strcasecmp(a, b) == 0 : a == b; /* with a NULL involved, only NULL equals NULL */
+}
+
+static inline size_t strlen_ptr(const sd_char *s) {
+        /* NULL-safe strlen(): a NULL pointer is reported as length 0. */
+        if (s)
+                return strlen(s);
+        return 0;
+}
+
+sd_char *startswith(const sd_char *s, const sd_char *prefix) _pure_;
+sd_char *startswith_no_case(const sd_char *s, const sd_char *prefix) _pure_;
+sd_char *endswith(const sd_char *s, const sd_char *postfix) _pure_;
+sd_char *endswith_no_case(const sd_char *s, const sd_char *postfix) _pure_;
+
+static inline bool isempty(const sd_char *a) {
+        return !(a && a[0] != '\0'); /* NULL counts as empty, too */
+}
+
+static inline const sd_char *strempty(const sd_char *s) {
+        return s ? s : STR_C(""); /* map NULL to the empty string, pass everything else through */
+}
+
+static inline const sd_char *yes_no(bool b) {
+        return b ? STR_C("yes") : STR_C("no"); /* string literals, not to be freed by the caller */
+}
+
+static inline const sd_char *on_off(bool b) {
+        return b ? STR_C("on") : STR_C("off"); /* string literals, not to be freed by the caller */
+}
+
+static inline const sd_char* comparison_operator(int result) {
+        return result == 0 ? STR_C("==") : result < 0 ? STR_C("<") : STR_C(">");
+}
+
+int strverscmp_improved(const sd_char *a, const sd_char *b);
+
+/* Like startswith(), but for a memory block of sz bytes; returns a pointer just past the token or NULL */
+static inline void *memory_startswith(const void *p, size_t sz, const sd_char *token) {
+        size_t token_bytes;
+
+        assert(token);
+
+        /* The comparison is byte-wise, hence the token length is converted to bytes. */
+        token_bytes = strlen(token) * sizeof(sd_char);
+        if (token_bytes > sz)
+                return NULL; /* too short to hold the token; p is not looked at in this case */
+
+        assert(p);
+
+        return memcmp(p, token, token_bytes) == 0 ? (uint8_t*) p + token_bytes : NULL;
+}
+
+#define _STRV_FOREACH(s, l, i)                                          \
+        for (typeof(*(l)) *s, *i = (l); (s = i) && *i; i++)
+
+#define STRV_FOREACH(s, l)                      \
+        _STRV_FOREACH(s, l, UNIQ_T(i, UNIQ))
+
+static inline bool ascii_isdigit(sd_char a) {
+        /* Locale independent isdigit(): true only for the ASCII characters '0' through '9' */
+        return !(a < '0' || a > '9');
+}
+
+static inline bool ascii_ishex(sd_char a) {
+        return (a >= 'A' && a <= 'F') || (a >= 'a' && a <= 'f') || ascii_isdigit(a);
+}
+
+static inline bool ascii_isalpha(sd_char a) {
+        /* Locale independent isalpha(): true only for the ASCII letters A-Z and a-z */
+        return (a >= 'A' && a <= 'Z') || (a >= 'a' && a <= 'z');
+}
diff --git a/src/fundamental/tpm2-pcr.h b/src/fundamental/tpm2-pcr.h
new file mode 100644
index 0000000..d0d5b74
--- /dev/null
+++ b/src/fundamental/tpm2-pcr.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro-fundamental.h"
+
+/* The various TPM PCRs we measure into from sd-stub and sd-boot. */
+
+enum {
+        /* The following names for PCRs 0…7 are based on the names in the "TCG PC Client Specific Platform
+         * Firmware Profile Specification"
+         * (https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/) */
+        TPM2_PCR_PLATFORM_CODE       = 0,
+        TPM2_PCR_PLATFORM_CONFIG     = 1,
+        TPM2_PCR_EXTERNAL_CODE       = 2,
+        TPM2_PCR_EXTERNAL_CONFIG     = 3,
+        TPM2_PCR_BOOT_LOADER_CODE    = 4,
+        TPM2_PCR_BOOT_LOADER_CONFIG  = 5,
+        TPM2_PCR_HOST_PLATFORM       = 6,
+        TPM2_PCR_SECURE_BOOT_POLICY  = 7,
+
+        /* The following names for PCRs 9…15 are based on the "Linux TPM PCR Registry"
+         * (https://uapi-group.org/specifications/specs/linux_tpm_pcr_registry/) */
+        TPM2_PCR_KERNEL_INITRD       = 9,
+        TPM2_PCR_IMA                 = 10,
+
+        /* systemd: This TPM PCR is where we extend the sd-stub "payloads" into, before using them. i.e. the kernel
+         * ELF image, embedded initrd, and so on. In contrast to PCR 4 (which also contains this data, given
+         * the whole surrounding PE image is measured into it) this should be reasonably pre-calculatable,
+         * because it *only* consists of static data from the kernel PE image. */
+        TPM2_PCR_KERNEL_BOOT         = 11,
+
+        /* systemd: This TPM PCR is where sd-stub extends the kernel command line and any passed credentials into. */
+        TPM2_PCR_KERNEL_CONFIG       = 12,
+
+        /* systemd: This TPM PCR is where we extend the initrd sysext images into which we pass to the booted kernel */
+        TPM2_PCR_SYSEXTS             = 13,
+        TPM2_PCR_SHIM_POLICY         = 14,
+
+        /* systemd: This TPM PCR is where we measure the root fs volume key (and maybe /var/'s) if it is split off */
+        TPM2_PCR_SYSTEM_IDENTITY     = 15,
+
+        /* As per "TCG PC Client Specific Platform Firmware Profile Specification" again, see above */
+        TPM2_PCR_DEBUG               = 16,
+        TPM2_PCR_APPLICATION_SUPPORT = 23,
+};
+
+/* The tag used for EV_EVENT_TAG event log records covering the boot loader config */
+#define LOADER_CONF_EVENT_TAG_ID UINT32_C(0xf5bc582a)
+
+/* The tag used for EV_EVENT_TAG event log records covering Devicetree blobs */
+#define DEVICETREE_ADDON_EVENT_TAG_ID UINT32_C(0x6c46f751)
diff --git a/src/fundamental/uki.c b/src/fundamental/uki.c
new file mode 100644
index 0000000..b1fa044
--- /dev/null
+++ b/src/fundamental/uki.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "uki.h"
+
+const char* const unified_sections[_UNIFIED_SECTION_MAX + 1] = {
+        /* These section names must fit in 8ch (excluding any trailing NUL) as per PE spec for executables:
+         * https://learn.microsoft.com/en-us/windows/win32/debug/pe-format#section-table-section-headers
+         * (Note that PE *object* files may have longer section names (via indirection in the string table) but
+         * this is not allowed for PE *executables*, which UKIs are.) */
+        [UNIFIED_SECTION_LINUX]   = ".linux",
+        [UNIFIED_SECTION_OSREL]   = ".osrel",
+        [UNIFIED_SECTION_CMDLINE] = ".cmdline",
+        [UNIFIED_SECTION_INITRD]  = ".initrd",
+        [UNIFIED_SECTION_SPLASH]  = ".splash",
+        [UNIFIED_SECTION_DTB]     = ".dtb",
+        [UNIFIED_SECTION_UNAME]   = ".uname",
+        [UNIFIED_SECTION_SBAT]    = ".sbat",
+        [UNIFIED_SECTION_PCRSIG]  = ".pcrsig",
+        [UNIFIED_SECTION_PCRPKEY] = ".pcrpkey",
+        NULL, /* terminating NULL entry, stored at index _UNIFIED_SECTION_MAX */
+};
diff --git a/src/fundamental/uki.h b/src/fundamental/uki.h
new file mode 100644
index 0000000..ffa960f
--- /dev/null
+++ b/src/fundamental/uki.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro-fundamental.h"
+
+/* List of PE sections that have special meaning for us in unified kernels. This is the canonical order in
+ * which we measure the sections into TPM PCR 11. PLEASE DO NOT REORDER! */
+typedef enum UnifiedSection {
+        UNIFIED_SECTION_LINUX,
+        UNIFIED_SECTION_OSREL,
+        UNIFIED_SECTION_CMDLINE,
+        UNIFIED_SECTION_INITRD,
+        UNIFIED_SECTION_SPLASH,
+        UNIFIED_SECTION_DTB,
+        UNIFIED_SECTION_UNAME,
+        UNIFIED_SECTION_SBAT,
+        UNIFIED_SECTION_PCRSIG,
+        UNIFIED_SECTION_PCRPKEY,
+        _UNIFIED_SECTION_MAX, /* number of sections above; not a section itself */
+} UnifiedSection;
+
+extern const char* const unified_sections[_UNIFIED_SECTION_MAX + 1];
+
+static inline bool unified_section_measure(UnifiedSection section) {
+        /* Every valid section is measured, except for the PCR signature: it signs the expected result
+         * of the measurement, and hence cannot be an input to it. */
+        return section != UNIFIED_SECTION_PCRSIG && section >= 0 && section < _UNIFIED_SECTION_MAX;
+}
diff --git a/src/fundamental/unaligned-fundamental.h b/src/fundamental/unaligned-fundamental.h
new file mode 100644
index 0000000..a4c810a
--- /dev/null
+++ b/src/fundamental/unaligned-fundamental.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+static inline uint16_t unaligned_read_ne16(const void *_u) {
+        const struct __attribute__((__packed__, __may_alias__)) { uint16_t x; } *u = _u;
+        /* Load 16 bits without byte swapping; the packed struct permits any alignment of _u. */
+        return u->x;
+}
+
+static inline uint32_t unaligned_read_ne32(const void *_u) {
+        const struct __attribute__((__packed__, __may_alias__)) { uint32_t x; } *u = _u;
+        /* Load 32 bits without byte swapping; the packed struct permits any alignment of _u. */
+        return u->x;
+}
+
+static inline uint64_t unaligned_read_ne64(const void *_u) {
+        const struct __attribute__((__packed__, __may_alias__)) { uint64_t x; } *u = _u;
+        /* Load 64 bits without byte swapping; the packed struct permits any alignment of _u. */
+        return u->x;
+}
+
+static inline void unaligned_write_ne16(void *_u, uint16_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint16_t x; } *u = _u;
+        /* Store 16 bits without byte swapping; the packed struct permits any alignment of _u. */
+        u->x = a;
+}
+
+static inline void unaligned_write_ne32(void *_u, uint32_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint32_t x; } *u = _u;
+        /* Store 32 bits without byte swapping; the packed struct permits any alignment of _u. */
+        u->x = a;
+}
+
+static inline void unaligned_write_ne64(void *_u, uint64_t a) {
+        struct __attribute__((__packed__, __may_alias__)) { uint64_t x; } *u = _u;
+        /* Store 64 bits without byte swapping; the packed struct permits any alignment of _u. */
+        u->x = a;
+}
diff --git a/src/fuzz/fuzz-bootspec-gen.py b/src/fuzz/fuzz-bootspec-gen.py
new file mode 100644
index 0000000..99af3f5
--- /dev/null
+++ b/src/fuzz/fuzz-bootspec-gen.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+"""Generate sample input for fuzz-bootspec"""
+
+import json
+import os
+import sys
+
+config = open(sys.argv[1]).read()  # argv[1]: file whose text becomes the 'config' value
+loader = [entry for entry in open(sys.argv[2], encoding='utf-16-le').read().split('\0')  # argv[2]: NUL-separated UTF-16LE strings
+          if len(entry) > 2]   # filter out fluff from bad decoding
+entries = [(os.path.basename(name), open(name).read())  # argv[3:]: one (file name, contents) pair per file
+           for name in sys.argv[3:]]
+
+data = {
+    'config': config,
+    'entries': entries,
+    'loader': loader,
+}
+
+print(json.dumps(data, indent=4))  # the JSON document is written to stdout
diff --git a/src/fuzz/fuzz-bootspec.c b/src/fuzz/fuzz-bootspec.c
new file mode 100644
index 0000000..0c61cbe
--- /dev/null
+++ b/src/fuzz/fuzz-bootspec.c
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "bootspec.h"
+#include "env-util.h"
+#include "escape.h"
+#include "fuzz.h"
+#include "fd-util.h"
+#include "json.h"
+
+static int json_dispatch_config(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        _cleanup_fclose_ FILE *f = NULL;
+        BootConfig *config = ASSERT_PTR(userdata);
+        const char *text;
+
+        /* No string can be obtained from the variant → -EINVAL. Otherwise wrap it in a FILE. */
+        text = json_variant_string(variant);
+        if (!text)
+                return -EINVAL;
+        assert_se(f = data_to_file((const uint8_t*) text, strlen(text)));
+
+        (void) boot_loader_read_conf(config, f, "memstream"); /* result intentionally ignored */
+        return 0;
+}
+
+static int json_dispatch_entries(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        BootConfig *config = ASSERT_PTR(userdata);
+        JsonVariant *entry;
+
+        JSON_VARIANT_ARRAY_FOREACH(entry, variant) {
+                if (!json_variant_is_array(entry) ||
+                    json_variant_elements(entry) < 1)
+                        return -EINVAL; /* every element has to be a non-empty array */
+
+                JsonVariant *v;
+                const char *id = NULL, *raw = NULL;
+                _cleanup_free_ char *data = NULL;
+                ssize_t len = -ENODATA; /* stays negative unless cunescape() below succeeds */
+
+                v = json_variant_by_index(entry, 0);
+                if (v)
+                        id = json_variant_string(v);
+                if (!id)
+                        continue; /* no usable id string: skip this element */
+
+                v = json_variant_by_index(entry, 1);
+                if (v)
+                        raw = json_variant_string(v);
+                if (raw)
+                        len = cunescape(raw, UNESCAPE_RELAX | UNESCAPE_ACCEPT_NUL, &data);
+                if (len >= 0) {
+                        _cleanup_fclose_ FILE *f = NULL;
+                        assert_se(f = data_to_file((const uint8_t*) data, len));
+
+                        assert_se(boot_config_load_type1(config, f, "/", "/entries", id) != -ENOMEM); /* only -ENOMEM is fatal */
+                }
+        }
+
+        return 0;
+}
+
+static int json_dispatch_loader(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        _cleanup_strv_free_ char **loader_entries = NULL;
+        BootConfig *config = ASSERT_PTR(userdata);
+
+        /* json_dispatch_strv() fills loader_entries; its errors are passed on to the caller. */
+        int r = json_dispatch_strv(name, variant, flags, &loader_entries);
+        if (r < 0)
+                return r;
+        /* The result of the augmentation itself is deliberately ignored. */
+        (void) boot_config_augment_from_loader(config, loader_entries, false);
+        return 0;
+}
+
+static const JsonDispatch data_dispatch[] = {
+        { "config",  JSON_VARIANT_STRING, json_dispatch_config,  0, 0 },
+        { "entries", JSON_VARIANT_ARRAY,  json_dispatch_entries, 0, 0 },
+        { "loader",  JSON_VARIANT_ARRAY,  json_dispatch_loader,  0, 0 },
+        {} /* all-zero final entry; presumably the end marker for json_dispatch() - TODO confirm */
+};
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_free_ const char *datadup = NULL;
+        _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL;
+        int r;
+
+        if (outside_size_range(size, 0, 65536))
+                return 0;
+
+        fuzz_setup_logging();
+
+        assert_se(datadup = memdup_suffix0(data, size));
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        r = json_parse(datadup, 0, &v, NULL, NULL);
+        if (r < 0)
+                return 0;
+
+        r = json_dispatch(v, data_dispatch, 0, &config);
+        if (r < 0)
+                return 0;
+
+        assert_se(boot_config_finalize(&config) >= 0);
+
+        (void) boot_config_select_special_entries(&config, /* skip_efivars= */ false);
+
+        _cleanup_close_ int orig_stdout_fd = -EBADF;
+        if (getenv_bool("SYSTEMD_FUZZ_OUTPUT") <= 0) {
+                orig_stdout_fd = fcntl(fileno(stdout), F_DUPFD_CLOEXEC, 3);
+                if (orig_stdout_fd < 0) /* fcntl() returns -1 and reports the reason via errno */
+                        log_warning_errno(errno, "Failed to duplicate fd 1: %m");
+                else
+                        assert_se(freopen("/dev/null", "w", stdout));
+        }
+
+        (void) show_boot_entries(&config, JSON_FORMAT_OFF);
+        (void) show_boot_entries(&config, JSON_FORMAT_PRETTY);
+
+        if (orig_stdout_fd >= 0) /* put the original stdout back in place */
+                assert_se(freopen(FORMAT_PROC_FD_PATH(orig_stdout_fd), "w", stdout));
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-bootspec.options b/src/fuzz/fuzz-bootspec.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/fuzz/fuzz-bootspec.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/fuzz/fuzz-bus-label.c b/src/fuzz/fuzz-bus-label.c
new file mode 100644
index 0000000..c7be82a
--- /dev/null
+++ b/src/fuzz/fuzz-bus-label.c
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "bus-label.h"
+#include "fuzz.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_free_ char *plain = NULL, *label = NULL;
+
+        fuzz_setup_logging();
+
+        /* Round trip: decode the raw bytes as an escaped bus label, then escape the result again.
+         * Both conversions are asserted to hand back a non-NULL string. */
+        assert_se(plain = bus_label_unescape_n((const char*) data, size));
+        assert_se(label = bus_label_escape(plain));
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-calendarspec.c b/src/fuzz/fuzz-calendarspec.c
new file mode 100644
index 0000000..b31a3f2
--- /dev/null
+++ b/src/fuzz/fuzz-calendarspec.c
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "calendarspec.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "string-util.h"
+#include "time-util.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_(calendar_spec_freep) CalendarSpec *cspec = NULL, *reparsed = NULL;
+        _cleanup_free_ char *str = NULL, *formatted = NULL;
+        usec_t usec = 0;
+        int r;
+
+        fuzz_setup_logging();
+
+        /* The input holds up to two NUL-separated strings: the calendar expression, optionally
+         * followed by a time value that is used as starting point for the elapse loop below. */
+        assert_se(str = memdup_suffix0(data, size));
+        size_t first = strlen(str);
+        const char *start = first < size ? str + first + 1 : "";
+
+        r = calendar_spec_from_string(str, &cspec);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to parse \"%s\": %m", str);
+                return 0;
+        }
+
+        /* Whatever parsed must be valid, must format, and the formatted string must parse again. */
+        assert_se(calendar_spec_valid(cspec));
+        assert_se(calendar_spec_to_string(cspec, &formatted) == 0);
+        assert(formatted);
+        log_debug("spec: %s → %s", str, formatted);
+
+        assert_se(calendar_spec_from_string(formatted, &reparsed) >= 0);
+        assert_se(calendar_spec_valid(reparsed));
+
+        (void) parse_time(start, &usec, 1);
+
+        /* If timezone is set, calendar_spec_next_usec() would fork, bleh :(
+         * Let's not try that. */
+        cspec->timezone = mfree(cspec->timezone);
+
+        /* Log the starting point, then up to 20 consecutive elapse times. */
+        log_debug("00: %s", strna(FORMAT_TIMESTAMP(usec)));
+        for (unsigned i = 1; i <= 20; i++) {
+                r = calendar_spec_next_usec(cspec, usec, &usec);
+                if (r < 0) {
+                        log_debug_errno(r, "%02u: %m", i);
+                        break;
+                }
+                log_debug("%02u: %s", i, FORMAT_TIMESTAMP(usec));
+        }
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-catalog.c b/src/fuzz/fuzz-catalog.c
new file mode 100644
index 0000000..f9561f2
--- /dev/null
+++ b/src/fuzz/fuzz-catalog.c
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "catalog.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "fuzz.h"
+#include "tmpfile-util.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_(unlink_tempfilep) char path[] = "/tmp/fuzz-catalog.XXXXXX";
+        _cleanup_close_ int fd = -EBADF;
+        _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *catalog = NULL;
+
+        fuzz_setup_logging();
+
+        assert_se(catalog = ordered_hashmap_new(&catalog_hash_ops));
+
+        /* catalog_import_file() wants a path, hence spill the input into a temporary file first. */
+        assert_se((fd = mkostemp_safe(path)) >= 0);
+        assert_se(write(fd, data, size) == (ssize_t) size);
+
+        (void) catalog_import_file(catalog, path);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-compress.c b/src/fuzz/fuzz-compress.c
new file mode 100644
index 0000000..c3f68f6
--- /dev/null
+++ b/src/fuzz/fuzz-compress.c
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "compress.h"
+#include "fuzz.h"
+
+typedef struct header { /* layout the compress fuzzer imposes on the start of its raw input */
+        uint32_t alg:2; /* We have only three compression algorithms so far, but we might add more in the
+                         * future. Let's make this a bit wider so our fuzzer cases remain stable in the
+                         * future. */
+        uint32_t sw_len; /* requested prefix length for decompress_startswith(); clamped by the fuzzer */
+        uint32_t sw_alloc; /* size of the buffer handed to decompress_startswith(); raised to at least 1 */
+        uint32_t reserved[3]; /* Extra space to keep fuzz cases stable in case we need to
+                               * add stuff in the future. */
+        uint8_t data[]; /* payload: everything after the fixed fields gets compressed */
+} header;
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_free_ void *buf = NULL, *buf2 = NULL;
+        int r;
+
+        /* Compresses the payload, then asserts that decompress_startswith() recognizes a prefix of it. */
+        if (size < offsetof(header, data) + 1)
+                return 0;
+
+        const header *h = (const header*) data;
+        const size_t data_len = size - offsetof(header, data);
+        int alg = h->alg;
+
+        fuzz_setup_logging();
+
+        log_info("Using compression %s, data size=%zu",
+                 compression_to_string(alg), data_len);
+
+        /* Make the buffer a bit larger for very small data, and tell the compressor its real capacity. */
+        const size_t buf_size = MAX(size, 128u);
+        buf = malloc(buf_size);
+        if (!buf) {
+                log_oom();
+                return 0;
+        }
+
+        size_t csize;
+        r = compress_blob(alg, h->data, data_len, buf, buf_size, &csize);
+        if (r < 0) {
+                log_error_errno(r, "Compression failed: %m");
+                return 0;
+        }
+        log_debug("Compressed %zu bytes to → %zu bytes", data_len, csize);
+
+        size_t sw_alloc = MAX(h->sw_alloc, 1u);
+        buf2 = malloc(sw_alloc);
+        if (!buf2) {
+                log_oom();
+                return 0;
+        }
+
+        /* Compare at most data_len - 1 bytes, so that h->data[sw_len] below stays inside the payload. */
+        size_t sw_len = MIN(data_len - 1, h->sw_len);
+        r = decompress_startswith(alg, buf, csize, &buf2, h->data, sw_len, h->data[sw_len]);
+        assert_se(r > 0);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-env-file.c b/src/fuzz/fuzz-env-file.c
new file mode 100644
index 0000000..ff7e529
--- /dev/null
+++ b/src/fuzz/fuzz-env-file.c
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "env-file.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "strv.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_fclose_ FILE *stream = NULL;
+        _cleanup_strv_free_ char **assignments = NULL, **pairs = NULL;
+
+        if (outside_size_range(size, 0, 65536))
+                return 0;
+
+        assert_se(stream = data_to_file(data, size));
+
+        fuzz_setup_logging();
+
+        /* Run both loaders over the same stream, rewinding in between; their results are ignored. */
+        (void) load_env_file(stream, NULL, &assignments);
+        assert_se(fseek(stream, 0, SEEK_SET) == 0);
+        (void) load_env_file_pairs(stream, NULL, &pairs);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-env-file.options b/src/fuzz/fuzz-env-file.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/fuzz/fuzz-env-file.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/fuzz/fuzz-hostname-setup.c b/src/fuzz/fuzz-hostname-setup.c
new file mode 100644
index 0000000..4895631
--- /dev/null
+++ b/src/fuzz/fuzz-hostname-setup.c
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "hostname-setup.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_fclose_ FILE *stream = NULL;
+        _cleanup_free_ char *hostname = NULL;
+
+        assert_se(stream = data_to_file(data, size));
+
+        fuzz_setup_logging();
+
+        /* Hand the input to the hostname stream reader; whether it succeeds does not matter. */
+        (void) read_etc_hostname_stream(stream, &hostname);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-json.c b/src/fuzz/fuzz-json.c
new file mode 100644
index 0000000..3d6d689
--- /dev/null
+++ b/src/fuzz/fuzz-json.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "json.h"
+#include "memstream-util.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_(memstream_done) MemStream m = {};
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        FILE *g = NULL; /* sink for the dumps below; if it stays NULL they go to stdout */
+        int r;
+        /* Parses the input as JSON, then runs the resulting variant through a fixed list of calls. */
+        fuzz_setup_logging();
+
+        f = data_to_file(data, size);
+        assert_se(f);
+
+        r = json_parse_file(f, NULL, 0, &v, NULL, NULL);
+        if (r < 0) {
+                log_debug_errno(r, "failed to parse input: %m");
+                return 0;
+        }
+        /* Unless SYSTEMD_FUZZ_OUTPUT is true, dump into the MemStream rather than to stdout. */
+        if (getenv_bool("SYSTEMD_FUZZ_OUTPUT") <= 0)
+                assert_se(g = memstream_init(&m));
+
+        json_variant_dump(v, 0, g ?: stdout, NULL);
+        json_variant_dump(v, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR|JSON_FORMAT_SOURCE, g ?: stdout, NULL);
+
+        bool sorted = json_variant_is_sorted(v);
+        log_debug("json_variant_is_sorted: %s", yes_no(sorted));
+
+        r = json_variant_sort(&v);
+        log_debug_errno(r, "json_variant_sort: %d/%m", r);
+
+        sorted = json_variant_is_sorted(v);
+        log_debug("json_variant_is_sorted: %s", yes_no(sorted));
+        assert_se(r < 0 || sorted); /* a successful sort must leave the variant sorted */
+
+        bool normalized = json_variant_is_normalized(v);
+        log_debug("json_variant_is_normalized: %s", yes_no(normalized));
+
+        r = json_variant_normalize(&v);
+        log_debug_errno(r, "json_variant_normalize: %d/%m", r);
+
+        normalized = json_variant_is_normalized(v);
+        log_debug("json_variant_is_normalized: %s", yes_no(normalized));
+        assert_se(r < 0 || normalized); /* a successful normalize must leave it normalized */
+
+        double real = json_variant_real(v);
+        log_debug("json_variant_real: %lf", real);
+
+        bool negative = json_variant_is_negative(v);
+        log_debug("json_variant_is_negative: %s", yes_no(negative));
+
+        bool blank = json_variant_is_blank_object(v);
+        log_debug("json_variant_is_blank_object: %s", yes_no(blank));
+
+        blank = json_variant_is_blank_array(v);
+        log_debug("json_variant_is_blank_array: %s", yes_no(blank));
+
+        size_t elements = json_variant_elements(v);
+        log_debug("json_variant_elements: %zu", elements);
+        /* Look up every index, deliberately including a few past the reported element count. */
+        for (size_t i = 0; i <= elements + 2; i++)
+                (void) json_variant_by_index(v, i);
+
+        assert_se(json_variant_equal(v, v));
+        assert_se(!json_variant_equal(v, NULL));
+        assert_se(!json_variant_equal(NULL, v));
+
+        bool sensitive = json_variant_is_sensitive(v);
+        log_debug("json_variant_is_sensitive: %s", yes_no(sensitive));
+
+        json_variant_sensitive(v);
+
+        sensitive = json_variant_is_sensitive(v);
+        log_debug("json_variant_is_sensitive: %s", yes_no(sensitive));
+
+        const char *source;
+        unsigned line, column;
+        assert_se(json_variant_get_source(v, &source, &line, &column) == 0);
+        log_debug("json_variant_get_source: %s:%u:%u", source ?: "-", line, column);
+        /* From here on the variant gets modified; each outcome is only logged, never asserted. */
+        r = json_variant_set_field_string(&v, "a", "string-a");
+        log_debug_errno(r, "json_set_field_string: %d/%m", r);
+
+        r = json_variant_set_field_integer(&v, "b", -12345);
+        log_debug_errno(r, "json_set_field_integer: %d/%m", r);
+
+        r = json_variant_set_field_unsigned(&v, "c", 12345);
+        log_debug_errno(r, "json_set_field_unsigned: %d/%m", r);
+
+        r = json_variant_set_field_boolean(&v, "d", false);
+        log_debug_errno(r, "json_set_field_boolean: %d/%m", r);
+
+        r = json_variant_set_field_strv(&v, "e", STRV_MAKE("e-1", "e-2", "e-3"));
+        log_debug_errno(r, "json_set_field_strv: %d/%m", r);
+
+        r = json_variant_filter(&v, STRV_MAKE("a", "b", "c", "d", "e"));
+        log_debug_errno(r, "json_variant_filter: %d/%m", r);
+
+        /* I assume we can merge v with itself… */
+        r = json_variant_merge_object(&v, v);
+        log_debug_errno(r, "json_variant_merge: %d/%m", r);
+
+        r = json_variant_append_array(&v, v);
+        log_debug_errno(r, "json_variant_append_array: %d/%m", r);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-main.c b/src/fuzz/fuzz-main.c
new file mode 100644
index 0000000..cf70424
--- /dev/null
+++ b/src/fuzz/fuzz-main.c
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "fileio.h"
+#include "fuzz.h"
+#include "log.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "tests.h"
+
+/* This is a test driver for the systemd fuzzers that provides main function
+ * for regression testing outside of oss-fuzz (https://github.com/google/oss-fuzz)
+ *
+ * It reads files named on the command line and passes them one by one into the
+ * fuzzer that it is compiled into. */
+
+/* This one was borrowed from
+ * https://github.com/google/oss-fuzz/blob/646fca1b506b056db3a60d32c4a1a7398f171c94/infra/base-images/base-runner/bad_build_check#L19
+ */
+#define NUMBER_OF_RUNS 4
+
+int main(int argc, char **argv) {
+        unsigned number_of_runs = NUMBER_OF_RUNS;
+        int r;
+
+        test_setup_logging(LOG_DEBUG);
+
+        /* $SYSTEMD_FUZZ_RUNS overrides how often each file is fed to the fuzzer. A value that does not
+         * parse is fatal and reported as EXIT_FAILURE, like the other failure in here. */
+        const char *v = getenv("SYSTEMD_FUZZ_RUNS");
+        r = isempty(v) ? 0 : safe_atou(v, &number_of_runs);
+        if (r < 0) {
+                log_error_errno(r, "Failed to parse SYSTEMD_FUZZ_RUNS=%s: %m", v);
+                return EXIT_FAILURE;
+        }
+
+        for (int i = 1; i < argc; i++) {
+                _cleanup_free_ char *buf = NULL;
+                const char *name = argv[i];
+                size_t size;
+
+                r = read_full_file(name, &buf, &size);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to open '%s': %m", name);
+                        return EXIT_FAILURE;
+                }
+                printf("%s... ", name);
+                fflush(stdout);
+                for (unsigned j = 0; j < number_of_runs; j++)
+                        if (LLVMFuzzerTestOneInput((uint8_t*) buf, size) == EXIT_TEST_SKIP)
+                                return EXIT_TEST_SKIP;
+                printf("ok\n");
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/src/fuzz/fuzz-time-util.c b/src/fuzz/fuzz-time-util.c
new file mode 100644
index 0000000..5be2e4f
--- /dev/null
+++ b/src/fuzz/fuzz-time-util.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "time-util.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_free_ char *text = NULL;
+        usec_t parsed;
+
+        fuzz_setup_logging();
+
+        assert_se(text = memdup_suffix0(data, size));
+
+        /* Throw the same string at every parser in turn; results and errors are ignored. */
+        (void) parse_timestamp(text, &parsed);
+        (void) parse_sec(text, &parsed);
+        (void) parse_sec_fix_0(text, &parsed);
+        (void) parse_sec_def_infinity(text, &parsed);
+        (void) parse_time(text, &parsed, USEC_PER_SEC);
+        (void) parse_nsec(text, &parsed);
+        (void) timezone_is_valid(text, LOG_DEBUG);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-udev-database.c b/src/fuzz/fuzz-udev-database.c
new file mode 100644
index 0000000..6b4fc82
--- /dev/null
+++ b/src/fuzz/fuzz-udev-database.c
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "device-internal.h"
+#include "device-private.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "fuzz.h"
+#include "tmpfile-util.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_(sd_device_unrefp) sd_device *device = NULL;
+        _cleanup_(unlink_tempfilep) char path[] = "/tmp/fuzz-udev-database.XXXXXX";
+        _cleanup_fclose_ FILE *db = NULL;
+
+        fuzz_setup_logging();
+
+        /* The database reader takes a file name, so write the input out to a temporary file first. */
+        assert_se(fmkostemp_safe(path, "r+", &db) == 0);
+        assert_se(size == 0 || fwrite(data, size, 1, db) == 1);
+        fflush(db);
+
+        assert_se(device_new_aux(&device) >= 0);
+        (void) device_read_db_internal_filename(device, path);
+        return 0;
+}
diff --git a/src/fuzz/fuzz-varlink-idl.c b/src/fuzz/fuzz-varlink-idl.c
new file mode 100644
index 0000000..2436342
--- /dev/null
+++ b/src/fuzz/fuzz-varlink-idl.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "io-util.h"
+#include "varlink-idl.h"
+#include "log.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        _cleanup_(varlink_interface_freep) VarlinkInterface *iface = NULL;
+        _cleanup_free_ char *text = NULL, *formatted = NULL;
+        int r;
+
+        if (outside_size_range(size, 0, 64 * 1024))
+                return 0;
+
+        fuzz_setup_logging();
+
+        assert_se(text = memdup_suffix0(data, size));
+
+        r = varlink_idl_parse(text, /* line= */ NULL, /* column= */ NULL, &iface);
+        if (r >= 0) {
+                /* Anything the parser accepted must also format; the consistency check is run for
+                 * coverage only, its verdict is ignored. */
+                assert_se(varlink_idl_format(iface, &formatted) >= 0);
+                (void) varlink_idl_consistent(iface, LOG_DEBUG);
+        } else
+                log_debug_errno(r, "Failed to parse varlink interface definition: %m");
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz-varlink.c b/src/fuzz/fuzz-varlink.c
new file mode 100644
index 0000000..dd63419
--- /dev/null
+++ b/src/fuzz/fuzz-varlink.c
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "hexdecoct.h"
+#include "iovec-util.h"
+#include "varlink.h"
+#include "log.h"
+
+static FILE *null = NULL;
+
+static int method_something(Varlink *v, JsonVariant *p, VarlinkMethodFlags flags, void *userdata) {
+        json_variant_dump(p, JSON_FORMAT_NEWLINE|JSON_FORMAT_PRETTY, null, NULL);
+        return 0; /* handler bound to "io.test.DoSomething": the parameters were only dumped to 'null' */
+}
+
+static int reply_callback(Varlink *v, JsonVariant *p, const char *error_id, VarlinkReplyFlags flags, void *userdata) {
+        json_variant_dump(p, JSON_FORMAT_NEWLINE|JSON_FORMAT_PRETTY, null, NULL);
+        return 0; /* reply handler of the client connection: the reply was only dumped to 'null' */
+}
+
+static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        struct iovec *iov = ASSERT_PTR(userdata); /* bytes still to be sent on this fd */
+        bool write_eof = false, read_eof = false;
+        /* I/O handler for our end of a socket pair: sends the iovec in small chunks, drains what arrives. */
+        assert(s);
+        assert(fd >= 0);
+
+        if ((revents & (EPOLLOUT|EPOLLHUP|EPOLLERR)) && iov->iov_len > 0) {
+                ssize_t n;
+
+                /* never write more than 143 bytes a time, to make broken up recv()s on the other side more
+                 * likely, and thus test some additional code paths. */
+                n = send(fd, iov->iov_base, MIN(iov->iov_len, 143U), MSG_NOSIGNAL|MSG_DONTWAIT);
+                if (n < 0) {
+                        if (ERRNO_IS_DISCONNECT(errno))
+                                write_eof = true;
+                        else
+                                assert_se(errno == EAGAIN);
+                } else
+                        iovec_increment(iov, 1, n);
+        }
+
+        if (revents & EPOLLIN) {
+                char c[137];
+                ssize_t n;
+
+                n = recv(fd, c, sizeof(c), MSG_DONTWAIT);
+                if (n < 0) {
+                        if (ERRNO_IS_DISCONNECT(errno))
+                                read_eof = true;
+                        else
+                                assert_se(errno == EAGAIN);
+                } else if (n == 0)
+                        read_eof = true;
+                else
+                        hexdump(null, c, (size_t) n); /* received bytes only end up in the 'null' stream */
+        }
+
+        /* After we wrote everything we could turn off EPOLLOUT. And if we reached read EOF too turn off the
+         * whole thing. */
+        if (write_eof || iov->iov_len == 0) {
+                /* read_eof only reflects this invocation; without it the source keeps watching EPOLLIN. */
+                if (read_eof)
+                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
+                else
+                        assert_se(sd_event_source_set_io_events(s, EPOLLIN) >= 0);
+        }
+
+        return 0;
+}
+
+static int idle_callback(sd_event_source *s, void *userdata) {
+        /* Runs at idle priority, i.e. only when nothing else is left to do: stop the event loop. */
+        assert(s);
+        sd_event *loop = sd_event_source_get_event(s);
+        sd_event_exit(loop, 0);
+        return 0;
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+        struct iovec server_iov = IOVEC_MAKE((void*) data, size), client_iov = IOVEC_MAKE((void*) data, size);
+        /* Important: the declaration order matters here! we want that the fds are closed on return after the
+         * event sources, hence we declare the fds first, the event sources second */
+        _cleanup_close_pair_ int server_pair[2] = EBADF_PAIR, client_pair[2] = EBADF_PAIR;
+        _cleanup_(sd_event_source_unrefp) sd_event_source *idle_event_source = NULL,
+                *server_event_source = NULL, *client_event_source = NULL;
+        _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+        _cleanup_(varlink_flush_close_unrefp) Varlink *c = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+        /* Feeds the same input to a varlink server (as method call) and to a client (as reply). */
+        fuzz_setup_logging();
+
+        assert_se(null = fopen("/dev/null", "we"));
+
+        assert_se(sd_event_default(&e) >= 0);
+
+        /* Test one: write the data as method call to a server */
+        assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, server_pair) >= 0);
+        assert_se(varlink_server_new(&s, 0) >= 0);
+        assert_se(varlink_server_set_description(s, "myserver") >= 0);
+        assert_se(varlink_server_attach_event(s, e, 0) >= 0);
+        assert_se(varlink_server_add_connection(s, server_pair[0], NULL) >= 0);
+        TAKE_FD(server_pair[0]);
+        assert_se(varlink_server_bind_method(s, "io.test.DoSomething", method_something) >= 0);
+        assert_se(sd_event_add_io(e, &server_event_source, server_pair[1], EPOLLIN|EPOLLOUT, io_callback, &server_iov) >= 0);
+
+        /* Test two: write the data as method response to a client */
+        assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, client_pair) >= 0);
+        assert_se(varlink_connect_fd(&c, client_pair[0]) >= 0);
+        TAKE_FD(client_pair[0]);
+        assert_se(varlink_set_description(c, "myclient") >= 0);
+        assert_se(varlink_attach_event(c, e, 0) >= 0);
+        assert_se(varlink_bind_reply(c, reply_callback) >= 0);
+        assert_se(varlink_invoke(c, "io.test.DoSomething", NULL) >= 0);
+        assert_se(sd_event_add_io(e, &client_event_source, client_pair[1], EPOLLIN|EPOLLOUT, io_callback, &client_iov) >= 0);
+
+        assert_se(sd_event_add_defer(e, &idle_event_source, idle_callback, NULL) >= 0);
+        assert_se(sd_event_source_set_priority(idle_event_source, SD_EVENT_PRIORITY_IDLE) >= 0);
+        /* idle_callback() calls sd_event_exit() on this loop once it gets to run. */
+        assert_se(sd_event_loop(e) >= 0);
+
+        null = safe_fclose(null);
+
+        return 0;
+}
diff --git a/src/fuzz/fuzz.h b/src/fuzz/fuzz.h
new file mode 100644
index 0000000..698ba42
--- /dev/null
+++ b/src/fuzz/fuzz.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+#include 
+
+#include "env-util.h"
+#include "fileio.h"
+
+/* The entry point into the fuzzer */
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
+
+static inline FILE* data_to_file(const uint8_t *data, size_t size) {
+        /* An empty input is served from /dev/null; anything else is wrapped as a read-only stream. */
+        if (size == 0)
+                return fopen("/dev/null", "re");
+        return fmemopen_unlocked((char*) data, size, "r");
+}
+
+/* Returns true if 'size' is to be rejected for lying outside of [lower, upper]. Undersized input is
+ * always rejected; oversized input only if FUZZ_USE_SIZE_LIMIT is true, otherwise 'upper' is ignored.
+ */
+static inline bool outside_size_range(size_t size, size_t lower, size_t upper) {
+        if (size < lower)
+                return true;
+        if (size <= upper)
+                return false;
+        return FUZZ_USE_SIZE_LIMIT;
+}
+
+static inline void fuzz_setup_logging(void) {
+        /* We don't want to fill the logs and slow down stuff when running in a fuzzing mode, so cap the
+         * level at LOG_CRIT. The environment is parsed after that, presumably so it can still override. */
+        log_set_max_level(LOG_CRIT);
+        log_parse_environment();
+        log_open();
+}
+
+/* Force value to not be optimized away: it is an input of an empty asm statement that clobbers memory. */
+#define DO_NOT_OPTIMIZE(value) ({ asm volatile("" : : "g"(value) : "memory"); })
diff --git a/src/fuzz/meson.build b/src/fuzz/meson.build
new file mode 100644
index 0000000..8c1b2e9
--- /dev/null
+++ b/src/fuzz/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+simple_fuzzers += files( # every source listed here provides one LLVMFuzzerTestOneInput()
+        'fuzz-bootspec.c',
+        'fuzz-bus-label.c',
+        'fuzz-calendarspec.c',
+        'fuzz-catalog.c',
+        'fuzz-compress.c',
+        'fuzz-env-file.c',
+        'fuzz-hostname-setup.c',
+        'fuzz-json.c',
+        'fuzz-time-util.c',
+        'fuzz-udev-database.c',
+        'fuzz-varlink.c',
+        'fuzz-varlink-idl.c',
+)
diff --git a/src/getty-generator/getty-generator.c b/src/getty-generator/getty-generator.c
new file mode 100644
index 0000000..7486118
--- /dev/null
+++ b/src/getty-generator/getty-generator.c
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "creds-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "generator.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "unit-name.h"
+#include "virt.h"
+
+static const char *arg_dest = NULL;
+static bool arg_enabled = true;
+
+static int add_symlink(const char *fservice, const char *tservice) {
+        const char *target, *linkpath;
+
+        assert(fservice);
+        assert(tservice);
+
+        /* Creates <arg_dest>/getty.target.wants/<tservice> as a symlink to the unit file <fservice>. */
+        target = strjoina(SYSTEM_DATA_UNIT_DIR "/", fservice);
+        linkpath = strjoina(arg_dest, "/getty.target.wants/", tservice);
+
+        (void) mkdir_parents_label(linkpath, 0755);
+
+        if (symlink(target, linkpath) >= 0)
+                return 0;
+
+        /* In case console=hvc0 is passed this will very likely result in EEXIST */
+        if (errno == EEXIST)
+                return 0;
+
+        return log_error_errno(errno, "Failed to create symlink %s: %m", linkpath);
+}
+
+static int add_serial_getty(const char *tty) {
+        _cleanup_free_ char *unit = NULL;
+        int r;
+
+        assert(tty);
+
+        log_debug("Automatically adding serial getty for /dev/%s.", tty);
+
+        /* Derive the serial-getty@ instance name from the tty and pull it in via add_symlink(). */
+        r = unit_name_from_path_instance("serial-getty", tty, ".service", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate service name: %m");
+        return add_symlink("serial-getty@.service", unit);
+}
+
+static int add_container_getty(const char *tty) {
+        _cleanup_free_ char *unit = NULL;
+        int r;
+
+        assert(tty);
+
+        log_debug("Automatically adding container getty for /dev/pts/%s.", tty);
+
+        /* Derive the container-getty@ instance name from the pty and pull it in via add_symlink(). */
+        r = unit_name_from_path_instance("container-getty", tty, ".service", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate service name: %m");
+        return add_symlink("container-getty@.service", unit);
+}
+
+static int verify_tty(const char *name) {
+        _cleanup_close_ int fd = -EBADF;
+        const char *devnode;
+
+        /* Enumerated does not mean usable: some TTYs, such as classic ttyS0 and friends, show up
+         * but don't work when you try to use them. Hence open the device node and run isatty() on
+         * it. Returns 0 if the node is a terminal; failures are reported as -errno or via
+         * errno_or_else(EIO). */
+
+        devnode = strjoina("/dev/", name);
+
+        /* O_NONBLOCK is essential here, to make sure the open() does not wait for DCD. */
+        fd = open(devnode, O_RDWR|O_NONBLOCK|O_NOCTTY|O_CLOEXEC|O_NOFOLLOW);
+        if (fd < 0)
+                return -errno;
+
+        /* Reset errno first, so that only what isatty() itself sets is looked at afterwards. */
+        errno = 0;
+        if (isatty(fd) > 0)
+                return 0;
+
+        return errno_or_else(EIO);
+}
+
+static int run_container(void) {
+        _cleanup_free_ char *ttys = NULL;
+        const char *cursor;
+        int r;
+
+        log_debug("Automatically adding console shell.");
+
+        r = add_symlink("console-getty.service", "console-getty.service");
+        if (r < 0)
+                return r;
+
+        /* PID 1 may carry $container_ttys, a list of ptys to spawn gettys on. Despite the generic
+         * variable name only ptys are supported, anything else in the list is skipped. */
+        (void) getenv_for_pid(1, "container_ttys", &ttys);
+
+        cursor = ttys;
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                const char *pty;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse $container_ttys: %m");
+                if (r == 0)
+                        return 0;
+
+                /* First strip off /dev/ if it is specified... */
+                pty = path_startswith(word, "/dev/") ?: word;
+
+                /* ...then insist on the pts/ prefix, so that only the part behind it remains. */
+                pty = path_startswith(pty, "pts/");
+                if (!pty)
+                        continue;
+
+                r = add_container_getty(pty);
+                if (r < 0)
+                        return r;
+        }
+}
+
+static int add_credential_gettys(void) {
+        static const struct {
+                const char *credential_name;
+                int (*func)(const char *tty);
+        } table[] = {
+                { "getty.ttys.serial",    add_serial_getty     },
+                { "getty.ttys.container", add_container_getty  },
+        };
+        int r;
+
+        /* Each credential lists one tty per line; every entry is passed to the matching add function. */
+        FOREACH_ARRAY(t, table, ELEMENTSOF(table)) {
+                _cleanup_free_ char *b = NULL;
+                size_t sz = 0;
+
+                r = read_credential_with_decryption(t->credential_name, (void*) &b, &sz);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        continue;
+
+                _cleanup_fclose_ FILE *f = fmemopen_unlocked(b, sz, "r");
+                if (!f)
+                        return log_oom();
+
+                for (;;) {
+                        _cleanup_free_ char *tty = NULL;
+
+                        r = read_stripped_line(f, PATH_MAX, &tty);
+                        if (r == 0)
+                                break;
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to parse credential %s: %m", t->credential_name);
+                                break;
+                        }
+                        /* Blank lines carry no tty name and '#' starts a comment: skip both. */
+                        if (isempty(tty) || startswith(tty, "#"))
+                                continue;
+
+                        r = t->func(tty);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        assert(key);
+
+        /* Only systemd.getty_auto is of interest here; given without a value it counts as "on". */
+        if (!proc_cmdline_key_streq(key, "systemd.getty_auto"))
+                return 0;
+
+        int enabled = value ? parse_boolean(value) : 1;
+        if (enabled < 0)
+                log_warning_errno(enabled, "Failed to parse getty_auto switch \"%s\", ignoring: %m", value);
+        else
+                arg_enabled = enabled;
+
+        return 0;
+}
+
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        _cleanup_free_ char *getty_auto = NULL;
+        int r;
+
+        assert_se(arg_dest = dest);
+        /* Entry point: unless disabled, set up gettys from credentials, then for the container or the consoles. */
+        if (in_initrd()) {
+                log_debug("Skipping generator, running in the initrd.");
+                return EXIT_SUCCESS;
+        }
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0); /* systemd.getty_auto= may update arg_enabled */
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        r = getenv_for_pid(1, "SYSTEMD_GETTY_AUTO", &getty_auto); /* PID 1's environment; applied last, so it wins */
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse $SYSTEMD_GETTY_AUTO environment variable, ignoring: %m");
+        else if (r > 0) {
+                r = parse_boolean(getty_auto);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_GETTY_AUTO value \"%s\", ignoring: %m", getty_auto);
+                else
+                        arg_enabled = r;
+        }
+
+        if (!arg_enabled) {
+                log_debug("Disabled, exiting.");
+                return 0;
+        }
+
+        r = add_credential_gettys();
+        if (r < 0)
+                return r;
+
+        if (detect_container() > 0)
+                /* Add console shell and look at $container_ttys, but don't do add any
+                 * further magic if we are in a container. */
+                return run_container();
+
+        /* Automatically add in a serial getty on all active kernel consoles */
+        _cleanup_free_ char *active = NULL;
+        (void) read_one_line_file("/sys/class/tty/console/active", &active); /* best effort, errors are deliberately ignored */
+        for (const char *p = active;;) {
+               _cleanup_free_ char *tty = NULL;
+
+                r = extract_first_word(&p, &tty, NULL, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse /sys/class/tty/console/active: %m");
+                if (r == 0)
+                        break;
+
+                /* We assume that gettys on virtual terminals are started via manual configuration and do
+                 * this magic only for non-VC terminals. */
+
+                if (isempty(tty) || tty_is_vc(tty))
+                        continue;
+
+                if (verify_tty(tty) < 0)
+                        continue;
+
+                r = add_serial_getty(tty);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Automatically add in a serial getty on the first virtualizer console */
+        FOREACH_STRING(j,
+                       "hvc0",
+                       "xvc0",
+                       "hvsi0",
+                       "sclp_line0",
+                       "ttysclp0",
+                       "3270!tty1") {
+                _cleanup_free_ char *p = NULL;
+
+                p = path_join("/sys/class/tty", j);
+                if (!p)
+                        return log_oom();
+                if (access(p, F_OK) < 0)
+                        continue; /* this console is not present under /sys/class/tty, try the next name */
+
+                r = add_serial_getty(j);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run); /* NOTE(review): macro defined outside this file; presumably emits main() calling run() — confirm */
diff --git a/src/getty-generator/meson.build b/src/getty-generator/meson.build
new file mode 100644
index 0000000..44eeb86
--- /dev/null
+++ b/src/getty-generator/meson.build
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        generator_template + { # generator_template is not declared in this file; its entries are merged with the ones below
+                'name' : 'systemd-getty-generator',
+                'sources' : files('getty-generator.c'),
+        },
+]
diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c
new file mode 100644
index 0000000..07531ec
--- /dev/null
+++ b/src/gpt-auto-generator/gpt-auto-generator.c
@@ -0,0 +1,1019 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdlib.h>
+#include <sys/file.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "blkid-util.h"
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "dirent-util.h"
+#include "dissect-image.h"
+#include "dropin.h"
+#include "efi-loader.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "fstab-util.h"
+#include "generator.h"
+#include "gpt.h"
+#include "image-policy.h"
+#include "initrd-util.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "special.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "virt.h"
+
+static const char *arg_dest = NULL; /* directory the generated units, symlinks and drop-ins are written to */
+static bool arg_enabled = true;
+static bool arg_root_enabled = true;
+static bool arg_swap_enabled = true; /* when false, add_partition_swap() generates nothing */
+static char *arg_root_fstype = NULL; /* file system type used for the root mount; NULL if not set */
+static char *arg_root_options = NULL; /* extra options appended to the root mount options, if set */
+static int arg_root_rw = -1; /* negative: ro/rw was not specified on the kernel command line */
+static ImagePolicy *arg_image_policy = NULL; /* NULL: enumerate_partitions() uses &image_policy_host instead */
+
+STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
+
+STATIC_DESTRUCTOR_REGISTER(arg_root_fstype, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_options, freep);
+
+static int add_cryptsetup(
+                const char *id, /* volume name: used for the unit name and for /dev/mapper/<id> */
+                const char *what, /* path of the encrypted backing device */
+                bool rw, /* false adds the "read-only" option */
+                bool require, /* true adds "requires" symlinks in addition to the "wants" one */
+                bool measure, /* true requests tpm2-measure-pcr=yes if efi_measured_uki() returned > 0 */
+                char **ret_device) { /* optional: receives a newly allocated "/dev/mapper/<id>" */
+
+#if HAVE_LIBCRYPTSETUP
+        _cleanup_free_ char *e = NULL, *n = NULL, *d = NULL, *options = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(id);
+        assert(what);
+        /* Generates the cryptsetup service unit for 'what' and hooks it up to the backing device unit. */
+        r = unit_name_from_path(what, ".device", &d);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        e = unit_name_escape(id);
+        if (!e)
+                return log_oom();
+
+        r = unit_name_build("systemd-cryptsetup", e, ".service", &n);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = generator_open_unit_file(arg_dest, /* source = */ NULL, n, &f);
+        if (r < 0)
+                return r;
+
+        r = generator_write_cryptsetup_unit_section(f, NULL);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "Before=umount.target cryptsetup.target\n"
+                "Conflicts=umount.target\n"
+                "BindsTo=%s\n"
+                "After=%s\n",
+                d, d);
+
+        if (!rw) {
+                options = strdup("read-only");
+                if (!options)
+                        return log_oom();
+        }
+
+        r = efi_measured_uki(LOG_WARNING); /* reused by the 'measure' branch below; a negative result is not fatal here */
+        if (r > 0)
+                /* Enable TPM2 based unlocking automatically, if we have a TPM. See #30176. */
+                if (!strextend_with_separator(&options, ",", "tpm2-device=auto"))
+                        return log_oom();
+
+        if (measure) {
+                /* We only measure the root volume key into PCR 15 if we are booted with sd-stub (i.e. in a
+                 * UKI), and sd-stub measured the UKI. We do this in order not to step into people's own PCR
+                 * assignment, under the assumption that people who are fine to use sd-stub with its PCR
+                 * assignments are also OK with our PCR 15 use here. */
+                if (r > 0)
+                        if (!strextend_with_separator(&options, ",", "tpm2-measure-pcr=yes"))
+                                return log_oom();
+                if (r == 0)
+                        log_debug("Will not measure volume key of volume '%s', not booted via systemd-stub with measurements enabled.", id);
+        }
+
+        r = generator_write_cryptsetup_service_section(f, id, what, NULL, options);
+        if (r < 0)
+                return r;
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write file %s: %m", n);
+
+        r = generator_add_symlink(arg_dest, d, "wants", n); /* the backing device pulls in the cryptsetup service */
+        if (r < 0)
+                return r;
+
+        const char *dmname = strjoina("dev-mapper-", e, ".device");
+
+        if (require) {
+                r = generator_add_symlink(arg_dest, "cryptsetup.target", "requires", n);
+                if (r < 0)
+                        return r;
+
+                r = generator_add_symlink(arg_dest, dmname, "requires", n);
+                if (r < 0)
+                        return r;
+        }
+
+        r = write_drop_in_format(arg_dest, dmname, 50, "job-timeout",
+                                 "# Automatically generated by systemd-gpt-auto-generator\n\n"
+                                 "[Unit]\n"
+                                 "JobTimeoutSec=infinity"); /* the binary handles timeouts anyway */
+        if (r < 0)
+                log_warning_errno(r, "Failed to write device timeout drop-in, ignoring: %m");
+
+        if (ret_device) {
+                char *s;
+
+                s = path_join("/dev/mapper", id);
+                if (!s)
+                        return log_oom();
+
+                *ret_device = s; /* not freed here: the string is handed over to the caller */
+        }
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "Partition is encrypted, but systemd-gpt-auto-generator was compiled without libcryptsetup support");
+#endif
+}
+
+static int add_mount(
+                const char *id, /* volume name, only used if a cryptsetup unit has to be generated */
+                const char *what, /* device node to mount */
+                const char *where, /* mount point; also determines the unit name */
+                const char *fstype, /* may be NULL ("any"); "crypto_LUKS" adds a cryptsetup unit first */
+                bool rw, /* only consulted for the cryptsetup unit */
+                bool growfs, /* hook up growing of the file system */
+                bool measure, /* passed on to add_cryptsetup() and enables the pcrfs hook-up */
+                const char *options, /* may be NULL: no Options= line is written */
+                const char *description,
+                const char *post) { /* may be NULL; otherwise the unit to be ordered before and required by */
+
+        _cleanup_free_ char *unit = NULL, *crypto_what = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        /* Writes the .mount unit for 'what' on 'where' and returns 0 on success, negative errno on
+         * failure. The input strings get no specifier escaping: they are not configured externally but
+         * all originate from our own sources, hence they contain no % that could be taken as specifier. */
+
+        assert(id);
+        assert(what);
+        assert(where);
+        assert(description);
+
+        log_debug("Adding %s: %s fstype=%s", where, what, fstype ?: "(any)");
+
+        if (streq_ptr(fstype, "crypto_LUKS")) {
+                r = add_cryptsetup(id, what, rw, /* require= */ true, measure, &crypto_what);
+                if (r < 0)
+                        return r;
+
+                what = crypto_what; /* from here on we mount the device reported by add_cryptsetup() */
+                fstype = NULL; /* "crypto_LUKS" names the container, so no Type= is written for the mount */
+        } else if (fstype) {
+                r = dissect_fstype_ok(fstype);
+                if (r < 0)
+                        return log_error_errno(r, "Unable to determine if dissected file system type '%s' is permitted: %m", fstype);
+                if (!r)
+                        return log_error_errno(
+                                        SYNTHETIC_ERRNO(EIDRM),
+                                        "Refusing to automatically mount uncommon file system '%s' to '%s'.",
+                                        fstype, where);
+        }
+
+        r = unit_name_from_path(where, ".mount", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = generator_open_unit_file(arg_dest, /* source = */ NULL, unit, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Description=%s\n"
+                "Documentation=man:systemd-gpt-auto-generator(8)\n",
+                description);
+
+        if (post)
+                fprintf(f, "Before=%s\n", post);
+
+        r = generator_write_fsck_deps(f, arg_dest, what, where, fstype);
+        if (r < 0)
+                return r;
+
+        r = generator_write_blockdev_dependency(f, what);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "\n"
+                "[Mount]\n"
+                "What=%s\n"
+                "Where=%s\n",
+                what, where);
+
+        if (fstype)
+                fprintf(f, "Type=%s\n", fstype);
+
+        if (options)
+                fprintf(f, "Options=%s\n", options);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit %s: %m", unit);
+
+        if (growfs) {
+                r = generator_hook_up_growfs(arg_dest, where, post);
+                if (r < 0)
+                        return r;
+        }
+
+        if (measure) {
+                r = generator_hook_up_pcrfs(arg_dest, where, post);
+                if (r < 0)
+                        return r;
+        }
+
+        if (post) {
+                r = generator_add_symlink(arg_dest, post, "requires", unit); /* 'post' requires the new mount unit */
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int path_is_busy(const char *where) {
+        int k;
+
+        assert(where);
+
+        /* Returns > 0 if 'where' has to be left alone, 0 if it may be used, negative errno on failure. */
+
+        k = path_is_mount_point(where, NULL, AT_SYMLINK_FOLLOW);
+        /* An existing mount point is fine, since generators run during reload. So is a missing
+         * directory: the mount or automount unit will create it when it is started. */
+        if (k > 0 || k == -ENOENT)
+                return false;
+        if (k < 0)
+                return log_warning_errno(k, "Cannot check if \"%s\" is a mount point: %m", where);
+
+        /* Not a mount point, so it is only usable if it is a directory that contains no files. */
+        k = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+        if (k == -ENOTDIR) {
+                log_debug("\"%s\" is not a directory, ignoring.", where);
+                return true;
+        }
+        if (k < 0)
+                return log_warning_errno(k, "Cannot check if \"%s\" is empty: %m", where);
+        if (k == 0) {
+                log_debug("\"%s\" already populated, ignoring.", where);
+                return true;
+        }
+
+        return false;
+}
+
+static int add_partition_mount(
+                PartitionDesignator d,
+                DissectedPartition *p,
+                const char *id,
+                const char *where,
+                const char *description) {
+
+        _cleanup_free_ char *options = NULL;
+        bool measure;
+        int r;
+
+        assert(p);
+
+        /* Generates the mount unit for partition 'p' on 'where', with SPECIAL_LOCAL_FS_TARGET as the
+         * 'post' unit. Quietly returns 0 if path_is_busy() says that 'where' has to be left alone. */
+
+        r = path_is_busy(where);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return 0;
+
+        r = partition_pick_mount_options(
+                        d, dissected_partition_fstype(p), p->rw, /* discard= */ true,
+                        &options, /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return r;
+
+        /* By default measure rootfs and /var, since they contain the "identity" of the system */
+        measure = STR_IN_SET(id, "root", "var");
+
+        return add_mount(
+                        id, p->node, where, p->fstype,
+                        p->rw, p->growfs, measure,
+                        options,
+                        description,
+                        SPECIAL_LOCAL_FS_TARGET);
+}
+
+static int add_partition_swap(DissectedPartition *p) {
+        const char *what;
+        _cleanup_free_ char *name = NULL, *crypto_what = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(p);
+        assert(p->node);
+        /* Generates a .swap unit for 'p' (behind a cryptsetup unit if it is LUKS), wanted by SPECIAL_SWAP_TARGET. */
+        if (!arg_swap_enabled)
+                return 0;
+
+        /* Disable the swap auto logic if at least one swap is defined in /etc/fstab, see #6192. */
+        r = fstab_has_fstype("swap");
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse fstab: %m");
+        if (r > 0) {
+                log_debug("swap specified in fstab, ignoring.");
+                return 0;
+        }
+
+        if (streq_ptr(p->fstype, "crypto_LUKS")) {
+                r = add_cryptsetup("swap", p->node, /* rw= */ true, /* require= */ true, /* measure= */ false, &crypto_what);
+                if (r < 0)
+                        return r;
+                what = crypto_what; /* swap on the device reported by add_cryptsetup(), not on the raw partition */
+        } else
+                what = p->node;
+
+        log_debug("Adding swap: %s", what);
+
+        r = unit_name_from_path(what, ".swap", &name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = generator_open_unit_file(arg_dest, /* source = */ NULL, name, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Description=Swap Partition\n"
+                "Documentation=man:systemd-gpt-auto-generator(8)\n");
+
+        r = generator_write_blockdev_dependency(f, what);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "\n"
+                "[Swap]\n"
+                "What=%s\n",
+                what);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit %s: %m", name);
+
+        return generator_add_symlink(arg_dest, SPECIAL_SWAP_TARGET, "wants", name);
+}
+
+static int add_automount(
+                const char *id, /* passed through to add_mount() */
+                const char *what, /* device node to mount */
+                const char *where, /* mount point; determines the names of both generated units */
+                const char *fstype,
+                bool rw,
+                bool growfs,
+                const char *options,
+                const char *description, /* used as Description= of both units */
+                usec_t timeout) { /* idle timeout; written to the unit in whole seconds */
+
+        _cleanup_free_ char *unit = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(id);
+        assert(where);
+        assert(description);
+        /* Writes the .mount unit via add_mount(), then an .automount unit wanted by SPECIAL_LOCAL_FS_TARGET. */
+        r = add_mount(id,
+                      what,
+                      where,
+                      fstype,
+                      rw,
+                      growfs,
+                      /* measure= */ false,
+                      options,
+                      description,
+                      NULL); /* post=NULL: add_mount() writes no Before= line and no "requires" symlink */
+        if (r < 0)
+                return r;
+
+        r = unit_name_from_path(where, ".automount", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = generator_open_unit_file(arg_dest, /* source = */ NULL, unit, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Description=%s\n"
+                "Documentation=man:systemd-gpt-auto-generator(8)\n"
+                "[Automount]\n"
+                "Where=%s\n"
+                "TimeoutIdleSec="USEC_FMT"\n",
+                description,
+                where,
+                timeout / USEC_PER_SEC); /* usec_t value converted to whole seconds */
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit %s: %m", unit);
+
+        return generator_add_symlink(arg_dest, SPECIAL_LOCAL_FS_TARGET, "wants", unit);
+}
+
+static int slash_boot_in_fstab(void) {
+        static int cached = -1; /* negative until a lookup has succeeded, so failures are retried */
+
+        /* Memoized result of fstab_is_mount_point("/boot"); a failure is logged and returned. */
+        if (cached < 0) {
+                cached = fstab_is_mount_point("/boot");
+                if (cached < 0)
+                        return log_error_errno(cached, "Failed to parse fstab: %m");
+        }
+        return cached;
+}
+
+static int add_partition_xbootldr(DissectedPartition *p) {
+        _cleanup_free_ char *options = NULL;
+        int r;
+
+        assert(p);
+        /* Hooks up an automount of the XBOOTLDR partition on /boot, unless in the initrd, in fstab, or /boot is busy. */
+        if (in_initrd()) {
+                log_debug("In initrd, ignoring the XBOOTLDR partition.");
+                return 0;
+        }
+
+        r = slash_boot_in_fstab();
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                log_debug("/boot/ specified in fstab, ignoring XBOOTLDR partition.");
+                return 0;
+        }
+
+        r = path_is_busy("/boot");
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return 0; /* path_is_busy() already logged why /boot is left alone */
+
+        r = partition_pick_mount_options(
+                        PARTITION_XBOOTLDR,
+                        dissected_partition_fstype(p),
+                        /* rw= */ true,
+                        /* discard= */ false,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine default mount options for /boot/: %m");
+
+        return add_automount(
+                        "boot",
+                        p->node,
+                        "/boot",
+                        p->fstype,
+                        /* rw= */ true,
+                        /* growfs= */ false,
+                        options,
+                        "Boot Loader Partition",
+                        120 * USEC_PER_SEC); /* idle timeout of the automount: 120 seconds */
+}
+
+#if ENABLE_EFI
+static int slash_efi_in_fstab(void) {
+        static int result = -1; /* stays negative while no lookup has succeeded */
+
+        /* Memoized result of fstab_is_mount_point("/efi"); failed lookups are logged and not cached. */
+        if (result < 0) {
+                result = fstab_is_mount_point("/efi");
+                if (result < 0)
+                        return log_error_errno(result, "Failed to parse fstab: %m");
+        }
+        return result;
+}
+
+static bool slash_boot_exists(void) {
+        static int known = -1; /* negative until the first call has probed /boot */
+        /* Memoized existence check for /boot; any access() failure is treated as "does not exist". */
+        if (known < 0) {
+                known = access("/boot", F_OK) >= 0;
+                if (!known) {
+                        if (errno == ENOENT)
+                                log_debug_errno(errno, "/boot/: %m");
+                        else
+                                log_error_errno(errno, "Failed to determine whether /boot/ exists, assuming no: %m");
+                }
+        }
+        return known;
+}
+
+static int add_partition_esp(DissectedPartition *p, bool has_xbootldr) {
+        const char *esp_path = NULL, *id = NULL;
+        _cleanup_free_ char *options = NULL;
+        int r;
+
+        assert(p);
+        /* Hooks up an automount of the ESP on /boot or /efi; returns 0 without doing anything if neither is usable. */
+        if (in_initrd()) {
+                log_debug("In initrd, ignoring the ESP.");
+                return 0;
+        }
+
+        /* Check if there's an existing fstab entry for ESP. If so, we just skip the gpt-auto logic. */
+        r = fstab_has_node(p->node);
+        if (r < 0)
+                return log_error_errno(r,
+                                       "Failed to check if fstab entry for device '%s' exists: %m", p->node);
+        if (r > 0)
+                return 0;
+
+        /* If /boot/ is present, unused, and empty, we'll take that.
+         * Otherwise, if /efi/ is unused and empty (or missing), we'll take that.
+         * Otherwise, we do nothing. */
+        if (!has_xbootldr && slash_boot_exists()) { /* with an XBOOTLDR partition present, /boot is never used for the ESP */
+                r = slash_boot_in_fstab();
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        r = path_is_busy("/boot");
+                        if (r < 0)
+                                return r;
+                        if (r == 0) {
+                                esp_path = "/boot";
+                                id = "boot";
+                        }
+                }
+        }
+
+        if (!esp_path) { /* /boot was not picked above, fall back to /efi */
+                r = slash_efi_in_fstab();
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return 0;
+
+                r = path_is_busy("/efi");
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return 0;
+
+                esp_path = "/efi";
+                id = "efi";
+        }
+
+        r = partition_pick_mount_options(
+                        PARTITION_ESP,
+                        dissected_partition_fstype(p),
+                        /* rw= */ true,
+                        /* discard= */ false,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine default mount options for %s: %m", esp_path);
+
+        return add_automount(
+                        id,
+                        p->node,
+                        esp_path,
+                        p->fstype,
+                        /* rw= */ true,
+                        /* growfs= */ false,
+                        options,
+                        "EFI System Partition Automount",
+                        120 * USEC_PER_SEC); /* idle timeout of the automount: 120 seconds */
+}
+#else
+static int add_partition_esp(DissectedPartition *p, bool has_xbootldr) {
+        return 0; /* variant for builds where ENABLE_EFI is off: nothing is generated for the ESP */
+}
+#endif
+
+static int add_partition_root_rw(DissectedPartition *p) {
+        const char *path;
+        int r;
+
+        assert(p);
+        assert(!in_initrd());
+
+        /* Invoked on the main system (not initrd), to honour GPT flag 60 on the root fs (ro) */
+        /* A writable root gets a systemd-remount-fs.service drop-in that sets SYSTEMD_REMOUNT_ROOT_RW=1. */
+        if (arg_root_rw >= 0) {
+                log_debug("Parameter ro/rw specified on kernel command line, not generating drop-in for systemd-remount-fs.service.");
+                return 0;
+        }
+
+        if (!p->rw) {
+                log_debug("Root partition marked read-only in GPT partition table, not generating drop-in for systemd-remount-fs.service.");
+                return 0;
+        }
+
+        r = generator_enable_remount_fs_service(arg_dest);
+        if (r < 0)
+                return r;
+
+        path = strjoina(arg_dest, "/systemd-remount-fs.service.d/50-remount-rw.conf");
+
+        r = write_string_file(path,
+                              "# Automatically generated by systemd-gpt-auto-generator\n\n"
+                              "[Service]\n"
+                              "Environment=SYSTEMD_REMOUNT_ROOT_RW=1\n",
+                              WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_NOFOLLOW|WRITE_STRING_FILE_MKDIR_0755);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write drop-in file %s: %m", path);
+
+        return 0;
+}
+
+static int add_partition_root_growfs(DissectedPartition *p) {
+
+        assert(p);
+        assert(!in_initrd());
+
+        /* Runs on the main system only (never in the initrd), to honour GPT flag 59 (growfs) on the
+         * root fs: if the flag is set, hook up growing of "/", otherwise there is nothing to do. */
+
+        if (p->growfs)
+                return generator_hook_up_growfs(arg_dest, "/", SPECIAL_LOCAL_FS_TARGET);
+
+        log_debug("Root partition not marked for growing the file system in the GPT partition table, not generating drop-in for systemd-growfs-root.service.");
+        return 0;
+}
+
+static int add_partition_root_flags(DissectedPartition *p) {
+        int r = 0;
+
+        assert(p);
+        assert(!in_initrd());
+        /* Applies the growfs and rw handling for the root partition; both handlers are always invoked. */
+        RET_GATHER(r, add_partition_root_growfs(p)); /* NOTE(review): RET_GATHER() is defined elsewhere; presumably it keeps an error in r — confirm */
+        RET_GATHER(r, add_partition_root_rw(p));
+
+        return r;
+}
+
+#if ENABLE_EFI
+static int add_root_cryptsetup(void) {
+#if HAVE_LIBCRYPTSETUP
+
+        /* If a device /dev/gpt-auto-root-luks appears, then make it pull in systemd-cryptsetup-root.service, which
+         * sets it up, and causes /dev/gpt-auto-root to appear which is all we are looking for. */
+        /* require=false: add_cryptsetup() then creates only the "wants" symlink, no "requires" ones. */
+        return add_cryptsetup("root", "/dev/gpt-auto-root-luks", /* rw= */ true, /* require= */ false, /* measure= */ true, NULL);
+#else
+        return 0; /* built without libcryptsetup: nothing is generated */
+#endif
+}
+#endif
+
+static int add_root_mount(void) {
+#if ENABLE_EFI
+        _cleanup_free_ char *options = NULL;
+        int r;
+        /* Generates the mount unit for /dev/gpt-auto-root; returns 0 on success or if not applicable, < 0 on error. */
+        if (!is_efi_boot()) {
+                log_debug("Not an EFI boot, not creating root mount.");
+                return 0;
+        }
+
+        r = efi_loader_get_device_part_uuid(NULL); /* only checks whether the loader partition is known */
+        if (r == -ENOENT) {
+                log_notice("EFI loader partition unknown, exiting.\n"
+                           "(The boot loader did not set EFI variable LoaderDevicePartUUID.)");
+                return 0;
+        } else if (r < 0)
+                return log_error_errno(r, "Failed to read loader partition UUID: %m");
+
+        /* OK, we have an ESP/XBOOTLDR partition, this is fantastic, so let's wait for a root device to show up.
+         * A udev rule will create the link for us under the right name. */
+
+        if (in_initrd()) {
+                r = generator_write_initrd_root_device_deps(arg_dest, "/dev/gpt-auto-root");
+                if (r < 0)
+                        return r; /* propagate the failure instead of reporting success without a root mount */
+
+                r = add_root_cryptsetup();
+                if (r < 0)
+                        return r;
+        }
+
+        /* Note that we do not need to enable systemd-remount-fs.service here. If /etc/fstab exists,
+         * systemd-fstab-generator will pull it in for us, and otherwise add_partition_root_flags() will do
+         * it, after the initrd transition. */
+
+        r = partition_pick_mount_options(
+                        PARTITION_ROOT,
+                        arg_root_fstype,
+                        arg_root_rw > 0, /* writable only if explicitly requested; "not specified" (< 0) counts as false */
+                        /* discard= */ true,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pick root mount options: %m");
+
+        if (arg_root_options) /* append the explicitly configured options to the picked defaults */
+                if (!strextend_with_separator(&options, ",", arg_root_options))
+                        return log_oom();
+
+        return add_mount(
+                        "root",
+                        "/dev/gpt-auto-root",
+                        in_initrd() ? "/sysroot" : "/",
+                        arg_root_fstype,
+                        /* rw= */ arg_root_rw > 0,
+                        /* growfs= */ false,
+                        /* measure= */ true,
+                        options,
+                        "Root Partition",
+                        in_initrd() ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_LOCAL_FS_TARGET);
+#else
+        return 0; /* built without ENABLE_EFI: nothing is generated */
+#endif
+}
+
+static int process_loader_partitions(DissectedPartition *esp, DissectedPartition *xbootldr) {
+        sd_id128_t loader_uuid;
+        int r;
+
+        assert(esp);
+        assert(xbootldr);
+        /* Generates the XBOOTLDR and ESP automounts, on EFI boots only if the loader partition is one of the two. */
+        if (!is_efi_boot()) {
+                log_debug("Not an EFI boot, skipping loader partition UUID check.");
+                goto mount;
+        }
+
+        /* Let's check if LoaderDevicePartUUID points to either ESP or XBOOTLDR. We prefer it pointing
+         * to the ESP, but we accept XBOOTLDR too. If it points to neither of them, don't mount any
+         * loader partitions, since they are not the ones used for booting. */
+
+        r = efi_loader_get_device_part_uuid(&loader_uuid);
+        if (r == -ENOENT) {
+                log_debug_errno(r, "EFI loader partition unknown, skipping ESP and XBOOTLDR mounts.");
+                return 0;
+        }
+        if (r < 0) /* NOTE(review): the message below says "ignoring", yet the error is returned to the caller — confirm intent */
+                return log_debug_errno(r, "Failed to read loader partition UUID, ignoring: %m");
+
+        if (esp->found && sd_id128_equal(esp->uuid, loader_uuid))
+                goto mount;
+
+        if (xbootldr->found && sd_id128_equal(xbootldr->uuid, loader_uuid)) {
+                log_debug("LoaderDevicePartUUID points to XBOOTLDR partition.");
+                goto mount;
+        }
+
+        log_debug("LoaderDevicePartUUID points to neither ESP nor XBOOTLDR, ignoring.");
+        return 0;
+
+mount:
+        r = 0; /* start clean: r may still hold the result of the UUID lookup above */
+
+        if (xbootldr->found)
+                RET_GATHER(r, add_partition_xbootldr(xbootldr));
+        if (esp->found)
+                RET_GATHER(r, add_partition_esp(esp, xbootldr->found)); /* tells the ESP logic whether XBOOTLDR exists */
+
+        return r;
+}
+
+/* Dissect the whole disk behind 'devnum' and generate units for the auxiliary partitions found on it. */
+static int enumerate_partitions(dev_t devnum) {
+        static const struct {
+                PartitionDesignator designator;
+                const char *id, *where, *description;
+        } data_mounts[] = {
+                { PARTITION_HOME, "home",    "/home",    "Home Partition"           },
+                { PARTITION_SRV,  "srv",     "/srv",     "Server Data Partition"    },
+                { PARTITION_VAR,  "var",     "/var",     "Variable Data Partition"  },
+                { PARTITION_TMP,  "var-tmp", "/var/tmp", "Temporary Data Partition" },
+        };
+
+        _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
+        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
+        _cleanup_free_ char *devname = NULL;
+        int r;
+
+        /* This runs on the final root fs only (never in the initrd): it mounts auxiliary partitions and
+         * hooks in rw remount and growfs of the root partition. */
+        assert(!in_initrd());
+
+        r = block_get_whole_disk(devnum, &devnum);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get whole block device for " DEVNUM_FORMAT_STR ": %m",
+                                       DEVNUM_FORMAT_VAL(devnum));
+
+        r = devname_from_devnum(S_IFBLK, devnum, &devname);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get device node of " DEVNUM_FORMAT_STR ": %m",
+                                       DEVNUM_FORMAT_VAL(devnum));
+
+        /* udevd may already be running and could issue BLKRRPART in the middle of our work, which might
+         * remove all partitions while we are still operating on them. Holding a LOCK_SH lock on the
+         * block device prevents that. */
+        r = loop_device_open_from_path(devname, O_RDONLY, LOCK_SH, &loop);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to open %s: %m", devname);
+
+        /* NB: DISSECT_IMAGE_ADD_PARTITION_DEVICES is deliberately not set, unlike in most other places
+         * where block devices are dissected: the kernel shall find the devices and udev shall probe them
+         * before they are mounted via .mount units much later on. For the same reason
+         * DISSECT_IMAGE_PIN_PARTITION_DEVICES is not set either, as nothing is mounted right away. */
+        r = dissect_loop_device(
+                        loop,
+                        /* verity= */ NULL,
+                        /* mount_options= */ NULL,
+                        arg_image_policy ?: &image_policy_host,
+                        DISSECT_IMAGE_GPT_ONLY|
+                        DISSECT_IMAGE_USR_NO_ROOT|
+                        DISSECT_IMAGE_DISKSEQ_DEVNODE|
+                        DISSECT_IMAGE_ALLOW_EMPTY,
+                        &m);
+        if (r < 0) {
+                int level = r == -ENOPKG ? LOG_DEBUG : LOG_ERR; /* -ENOPKG is not treated as failure */
+                dissect_log_error(level, r, devname, NULL);
+                return r == -ENOPKG ? 0 : r;
+        }
+
+        if (m->partitions[PARTITION_SWAP].found)
+                RET_GATHER(r, add_partition_swap(m->partitions + PARTITION_SWAP));
+
+        RET_GATHER(r, process_loader_partitions(m->partitions + PARTITION_ESP, m->partitions + PARTITION_XBOOTLDR));
+
+        for (size_t i = 0; i < ELEMENTSOF(data_mounts); i++) {
+                PartitionDesignator d = data_mounts[i].designator;
+                if (m->partitions[d].found)
+                        RET_GATHER(r, add_partition_mount(d, m->partitions + d, data_mounts[i].id,
+                                                          data_mounts[i].where, data_mounts[i].description));
+        }
+
+        if (m->partitions[PARTITION_ROOT].found)
+                RET_GATHER(r, add_partition_root_flags(m->partitions + PARTITION_ROOT));
+
+        return r;
+}
+
+/* Looks up the block device backing the root file system and dissects the disk it lives on. */
+static int add_mounts(void) {
+        dev_t root_devnum;
+        int r;
+
+        r = blockdev_get_root(LOG_ERR, &root_devnum);
+        if (r > 0)
+                return enumerate_partitions(root_devnum);
+        if (r == 0)
+                log_debug("Skipping automatic GPT dissection logic, root file system not backed by a (single) whole block device.");
+
+        /* Either an error (negative), or nothing to dissect (zero). */
+        return r;
+}
+
+/* proc_cmdline_parse() callback: collects the kernel command line switches relevant to this generator. */
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        int r;
+
+        assert(key);
+
+        if (proc_cmdline_key_streq(key, "systemd.gpt_auto") ||
+            proc_cmdline_key_streq(key, "rd.systemd.gpt_auto")) {
+                /* A switch without a value counts as enabled. */
+                r = value ? parse_boolean(value) : 1;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse gpt-auto switch \"%s\", ignoring: %m", value);
+                else
+                        arg_enabled = r;
+                return 0;
+        }
+
+        if (streq(key, "root")) {
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                /* Any root= value other than "gpt-auto" turns the root disk logic off. */
+                if (!streq(value, "gpt-auto")) {
+                        arg_root_enabled = false;
+                        log_debug("Disabling root partition auto-detection, root= is defined.");
+                }
+                return 0;
+        }
+
+        if (streq(key, "roothash")) {
+                /* roothash= means verity is enabled, which turns the root disk logic off, too. */
+                if (!proc_cmdline_value_missing(key, value))
+                        arg_root_enabled = false;
+
+                return 0;
+        }
+
+        if (streq(key, "rootfstype")) {
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                return free_and_strdup_warn(&arg_root_fstype, value);
+        }
+
+        if (streq(key, "rootflags")) {
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (!strextend_with_separator(&arg_root_options, ",", value))
+                        return log_oom();
+
+                return 0;
+        }
+
+        if (streq(key, "rw") && !value)
+                arg_root_rw = true;
+        else if (streq(key, "ro") && !value)
+                arg_root_rw = false;
+        else if (proc_cmdline_key_streq(key, "systemd.image_policy"))
+                return parse_image_policy_argument(value, &arg_image_policy);
+        else if (streq(key, "systemd.swap")) {
+                r = value ? parse_boolean(value) : 1;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse swap switch \"%s\", ignoring: %m", value);
+                else
+                        arg_swap_enabled = r;
+
+                if (!arg_swap_enabled)
+                        log_debug("Disabling swap partitions auto-detection, systemd.swap=no is defined.");
+        }
+
+        return 0;
+}
+
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        int r, k;
+
+        assert_se(arg_dest = dest_late);
+
+        if (detect_container() > 0) {
+                log_debug("In a container, exiting.");
+                return 0;
+        }
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        if (!arg_enabled) {
+                log_debug("Disabled, exiting.");
+                return 0;
+        }
+
+        if (arg_root_enabled)
+                r = add_root_mount();
+
+        if (!in_initrd()) {
+                k = add_mounts();
+                if (r >= 0)
+                        r = k;
+        }
+
+        return r;
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run);
diff --git a/src/gpt-auto-generator/meson.build b/src/gpt-auto-generator/meson.build
new file mode 100644
index 0000000..c7cbfbb
--- /dev/null
+++ b/src/gpt-auto-generator/meson.build
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        generator_template + { # generator_template merged with the keys given below
+                'name' : 'systemd-gpt-auto-generator',
+                'conditions' : ['HAVE_BLKID'],
+                'sources' : files('gpt-auto-generator.c'),
+                'dependencies' : libblkid,
+        },
+]
diff --git a/src/hibernate-resume/hibernate-resume-config.c b/src/hibernate-resume/hibernate-resume-config.c
new file mode 100644
index 0000000..e4be7ca
--- /dev/null
+++ b/src/hibernate-resume/hibernate-resume-config.c
@@ -0,0 +1,266 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "device-nodes.h"
+#include "fstab-util.h"
+#include "hibernate-resume-config.h"
+#include "json.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "efivars.h"
+
+/* Releases 'k' together with the device string it owns. Passing NULL is fine. Always returns NULL, so
+ * that the result can be assigned back to the pointer that was freed. */
+static KernelHibernateLocation* kernel_hibernate_location_free(KernelHibernateLocation *k) {
+        if (k)
+                free(k->device);
+
+        return mfree(k);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(KernelHibernateLocation*, kernel_hibernate_location_free);
+
+/* Frees 'e' and every string it owns. Passing NULL is fine. Always returns NULL. */
+static EFIHibernateLocation* efi_hibernate_location_free(EFIHibernateLocation *e) {
+        if (!e)
+                return NULL;
+
+        free(e->device);
+        free(e->kernel_version);
+        free(e->id);
+        free(e->image_id);
+        free(e->version_id); /* filled in from "osReleaseVersionId"; used to be leaked */
+        free(e->image_version);
+        return mfree(e);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(EFIHibernateLocation*, efi_hibernate_location_free);
+
+/* Destructor for HibernateInfo: drops both location objects. 'device' is not freed on its own here. */
+void hibernate_info_done(HibernateInfo *info) {
+        assert(info);
+        efi_hibernate_location_free(info->efi);
+        kernel_hibernate_location_free(info->cmdline);
+}
+
+/* proc_cmdline_parse() callback that stores resume= and resume_offset= in the KernelHibernateLocation
+ * object passed in via 'data'. */
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        KernelHibernateLocation *k = ASSERT_PTR(data);
+        int r;
+
+        assert(key);
+
+        if (streq(key, "resume")) {
+                _cleanup_free_ char *node = NULL;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                node = fstab_node_to_udev_node(value);
+                if (!node)
+                        return log_oom();
+
+                free_and_replace(k->device, node);
+                return 0;
+        }
+
+        if (!proc_cmdline_key_streq(key, "resume_offset") || proc_cmdline_value_missing(key, value))
+                return 0;
+
+        r = safe_atou64(value, &k->offset);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse resume_offset=%s: %m", value);
+
+        k->offset_set = true;
+        return 0;
+}
+
+static int get_kernel_hibernate_location(KernelHibernateLocation **ret) {
+        _cleanup_(kernel_hibernate_location_freep) KernelHibernateLocation *k = NULL;
+        int r;
+
+        assert(ret);
+
+        k = new0(KernelHibernateLocation, 1);
+        if (!k)
+                return log_oom();
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, k, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse kernel command line: %m");
+
+        if (!k->device) {
+                if (k->offset_set)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Found resume_offset=%" PRIu64 " but resume= is unset, refusing.",
+                                               k->offset);
+
+                *ret = NULL;
+                return 0;
+        }
+
+        *ret = TAKE_PTR(k);
+        return 1;
+}
+
+#if ENABLE_EFI
+/* Checks whether the hibernation image described by 'e' was written by the system we are running on,
+ * judging by the ID and IMAGE_ID fields of os-release. A failure to read os-release is logged, and the
+ * comparison is done nonetheless. */
+static bool validate_efi_hibernate_location(EFIHibernateLocation *e) {
+        _cleanup_free_ char *os_id = NULL, *os_image_id = NULL;
+        int r;
+
+        assert(e);
+
+        r = parse_os_release(NULL,
+                             "ID", &os_id,
+                             "IMAGE_ID", &os_image_id);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse os-release: %m");
+
+        /* A kernel version mismatch is deliberately not a reason to refuse the image: Linux writes the
+         * old kernel to disk as part of the hibernation image, so that on resume the short-lived kernel
+         * which reads the image from disk gets replaced by the original kernel, and is effectively
+         * removed from memory as part of that. */
+        if (streq_ptr(os_id, e->id) &&
+            streq_ptr(os_image_id, e->image_id))
+                return true;
+
+        log_notice("HibernateLocation system identifier doesn't match currently running system, not resuming from it.");
+        return false;
+}
+
+/* Reads and parses the HibernateLocation EFI variable: 1 + object, 0 + NULL if unusable, < 0 on error. */
+static int get_efi_hibernate_location(EFIHibernateLocation **ret) {
+        static const JsonDispatch dispatch_table[] = {
+                { "uuid",                  JSON_VARIANT_STRING,        json_dispatch_id128,  offsetof(EFIHibernateLocation, uuid),           JSON_MANDATORY             },
+                { "offset",                _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(EFIHibernateLocation, offset),         JSON_MANDATORY             },
+                { "kernelVersion",         JSON_VARIANT_STRING,        json_dispatch_string, offsetof(EFIHibernateLocation, kernel_version), JSON_PERMISSIVE|JSON_DEBUG },
+                { "osReleaseId",           JSON_VARIANT_STRING,        json_dispatch_string, offsetof(EFIHibernateLocation, id),             JSON_PERMISSIVE|JSON_DEBUG },
+                { "osReleaseImageId",      JSON_VARIANT_STRING,        json_dispatch_string, offsetof(EFIHibernateLocation, image_id),       JSON_PERMISSIVE|JSON_DEBUG },
+                { "osReleaseVersionId",    JSON_VARIANT_STRING,        json_dispatch_string, offsetof(EFIHibernateLocation, version_id),     JSON_PERMISSIVE|JSON_DEBUG },
+                { "osReleaseImageVersion", JSON_VARIANT_STRING,        json_dispatch_string, offsetof(EFIHibernateLocation, image_version),  JSON_PERMISSIVE|JSON_DEBUG },
+                {},
+        };
+
+        _cleanup_(efi_hibernate_location_freep) EFIHibernateLocation *e = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_free_ char *location_str = NULL;
+        int r;
+
+        assert(ret);
+
+        if (!is_efi_boot())
+                goto skip;
+
+        r = efi_get_variable_string(EFI_SYSTEMD_VARIABLE(HibernateLocation), &location_str);
+        if (r == -ENOENT) {
+                log_debug_errno(r, "EFI variable HibernateLocation is not set, skipping.");
+                goto skip;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to get EFI variable HibernateLocation: %m");
+
+        r = json_parse(location_str, 0, &v, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse HibernateLocation JSON object: %m");
+
+        e = new0(EFIHibernateLocation, 1);
+        if (!e)
+                return log_oom();
+
+        r = json_dispatch(v, dispatch_table, JSON_LOG, e);
+        if (r < 0)
+                return r;
+
+        log_info("Reported hibernation image:%s%s%s%s%s%s%s%s%s%s UUID="SD_ID128_UUID_FORMAT_STR" offset=%"PRIu64,
+                 e->id ? " ID=" : "",                       strempty(e->id),
+                 e->image_id ? " IMAGE_ID=" : "",           strempty(e->image_id),
+                 e->version_id ? " VERSION_ID=" : "",       strempty(e->version_id),
+                 e->image_version ? " IMAGE_VERSION=" : "", strempty(e->image_version),
+                 e->kernel_version ? " kernel=" : "",       strempty(e->kernel_version),
+                 SD_ID128_FORMAT_VAL(e->uuid),
+                 e->offset);
+
+        if (!validate_efi_hibernate_location(e))
+                goto skip;
+        /* The device path is a /dev/disk/by-uuid/ name built from the UUID stored in the variable. */
+        if (asprintf(&e->device, "/dev/disk/by-uuid/" SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(e->uuid)) < 0)
+                return log_oom();
+
+        *ret = TAKE_PTR(e);
+        return 1;
+
+skip:
+        *ret = NULL;
+        return 0;
+}
+
+/* Warns if the kernel command line and the EFI HibernateLocation data disagree; changes nothing. */
+void compare_hibernate_location_and_warn(const HibernateInfo *info) {
+        const KernelHibernateLocation *cmdline = ASSERT_PTR(info)->cmdline;
+        const EFIHibernateLocation *efi = info->efi;
+        int r;
+
+        if (!cmdline || !efi)
+                return;
+
+        assert(info->device == cmdline->device);
+
+        if (!path_equal(cmdline->device, efi->device)) {
+                r = devnode_same(cmdline->device, efi->device);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to check if resume=%s is the same device as EFI HibernateLocation device '%s', ignoring: %m",
+                                          cmdline->device, efi->device);
+                else if (r == 0)
+                        log_warning("resume=%s doesn't match with EFI HibernateLocation device '%s', proceeding anyway with resume=.",
+                                    cmdline->device, efi->device);
+        }
+
+        if (cmdline->offset != efi->offset)
+                log_warning("resume_offset=%" PRIu64 " doesn't match with EFI HibernateLocation offset %" PRIu64 ", proceeding anyway with resume_offset=.",
+                            cmdline->offset, efi->offset);
+}
+
+/* Clears the HibernateLocation EFI variable by setting it to an empty value. Failure is only logged. */
+void clear_efi_hibernate_location(void) {
+        int r;
+
+        if (is_efi_boot()) {
+                r = efi_set_variable(EFI_SYSTEMD_VARIABLE(HibernateLocation), NULL, 0);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to clear EFI variable HibernateLocation, ignoring: %m");
+        }
+}
+#endif
+
+/* Collects the hibernation location from the kernel command line and, if built with EFI support, from
+ * EFI. The command line takes precedence. Returns -ENODEV if neither source names a device. */
+int acquire_hibernate_info(HibernateInfo *ret) {
+        _cleanup_(hibernate_info_done) HibernateInfo info = {};
+        int r;
+
+        r = get_kernel_hibernate_location(&info.cmdline);
+        if (r < 0)
+                return r;
+
+#if ENABLE_EFI
+        r = get_efi_hibernate_location(&info.efi);
+        if (r < 0)
+                return r;
+#endif
+
+        if (!info.cmdline && !info.efi)
+                return -ENODEV;
+
+        /* 'device' merely points into the location object it is taken from. */
+        info.device = info.cmdline ? info.cmdline->device : info.efi->device;
+        info.offset = info.cmdline ? info.cmdline->offset : info.efi->offset;
+
+        *ret = TAKE_STRUCT(info);
+        return 0;
+}
diff --git a/src/hibernate-resume/hibernate-resume-config.h b/src/hibernate-resume/hibernate-resume-config.h
new file mode 100644
index 0000000..365d9cc
--- /dev/null
+++ b/src/hibernate-resume/hibernate-resume-config.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "sd-id128.h"
+
+typedef struct KernelHibernateLocation {
+        char *device;    /* device node derived from resume=, owned by this object */
+        uint64_t offset; /* parsed value of resume_offset= */
+        bool offset_set; /* true once resume_offset= has been parsed successfully */
+} KernelHibernateLocation;
+
+typedef struct EFIHibernateLocation {
+        char *device;         /* "/dev/disk/by-uuid/…" path built from 'uuid' */
+
+        sd_id128_t uuid;      /* JSON field "uuid" (mandatory) */
+        uint64_t offset;      /* JSON field "offset" (mandatory) */
+
+        char *kernel_version; /* JSON field "kernelVersion" */
+        char *id;             /* JSON field "osReleaseId" */
+        char *image_id;       /* JSON field "osReleaseImageId" */
+        char *version_id;     /* JSON field "osReleaseVersionId" */
+        char *image_version;  /* JSON field "osReleaseImageVersion" */
+} EFIHibernateLocation;
+
+typedef struct HibernateInfo {
+        const char *device; /* not freed by hibernate_info_done(); points into cmdline/efi or at argv[] */
+        uint64_t offset; /* in memory pages */
+
+        KernelHibernateLocation *cmdline; /* from the kernel command line, NULL if resume= is unset */
+        EFIHibernateLocation *efi;        /* from the HibernateLocation EFI variable, may be NULL */
+} HibernateInfo;
+
+void hibernate_info_done(HibernateInfo *info);
+
+int acquire_hibernate_info(HibernateInfo *ret);
+
+#if ENABLE_EFI
+
+void compare_hibernate_location_and_warn(const HibernateInfo *info);
+
+void clear_efi_hibernate_location(void);
+
+#else
+
+static inline void compare_hibernate_location_and_warn(const HibernateInfo *info) {
+        /* Nothing to compare against in builds without EFI support. */
+}
+
+static inline void clear_efi_hibernate_location(void) {
+        /* Nothing to clear in builds without EFI support. */
+}
+
+#endif
diff --git a/src/hibernate-resume/hibernate-resume-generator.c b/src/hibernate-resume/hibernate-resume-generator.c
new file mode 100644
index 0000000..0168428
--- /dev/null
+++ b/src/hibernate-resume/hibernate-resume-generator.c
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "dropin.h"
+#include "generator.h"
+#include "hibernate-resume-config.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "parse-util.h"
+#include "proc-cmdline.h"
+#include "special.h"
+#include "static-destruct.h"
+#include "string-util.h"
+#include "unit-name.h"
+
+static const char *arg_dest = NULL;
+static char *arg_resume_options = NULL;
+static char *arg_root_options = NULL;
+static bool arg_noresume = false;
+
+STATIC_DESTRUCTOR_REGISTER(arg_resume_options, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_options, freep);
+
+/* proc_cmdline_parse() callback: picks up resumeflags= and rootflags= (both may be given more than
+ * once) as well as the value-less 'noresume' switch. */
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        char **options = NULL;
+
+        assert(key);
+
+        if (streq(key, "noresume")) {
+                if (value)
+                        log_warning("'noresume' kernel command line option specified with an argument, ignoring.");
+                else
+                        arg_noresume = true;
+
+                return 0;
+        }
+
+        if (streq(key, "resumeflags"))
+                options = &arg_resume_options;
+        else if (streq(key, "rootflags"))
+                options = &arg_root_options;
+
+        /* Any other switch is none of our business. */
+        if (!options || proc_cmdline_value_missing(key, value))
+                return 0;
+
+        /* Repeated switches accumulate into a single comma-separated list. */
+        if (!strextend_with_separator(options, ",", value))
+                return log_oom();
+
+        return 0;
+}
+
+/* Generates the drop-ins and the sysinit.target symlink that make the system resume from info->device. */
+static int process_resume(const HibernateInfo *info) {
+        _cleanup_free_ char *unit = NULL;
+        const char *job_timeout, *options;
+        int r;
+
+        assert(info);
+
+        r = unit_name_from_path(info->device, ".device", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate device unit name from path '%s': %m", info->device);
+
+        /* Only wait forever if the device was configured on the kernel command line. If the info comes
+         * from the EFI variable, a missing swap device combined with a HibernateLocation that was not
+         * cleared correctly would otherwise block the boot process infinitely. */
+        job_timeout = info->cmdline ? "infinity" : "2min";
+        r = write_drop_in_format(arg_dest, unit, 40, "device-timeout",
+                                 "# Automatically generated by systemd-hibernate-resume-generator\n\n"
+                                 "[Unit]\n"
+                                 "JobTimeoutSec=%s\n",
+                                 job_timeout);
+        if (r < 0)
+                log_warning_errno(r, "Failed to write device timeout drop-in, ignoring: %m");
+
+        options = arg_resume_options ?: arg_root_options;
+        r = generator_write_timeouts(arg_dest, info->device, info->device, options, NULL);
+        if (r < 0)
+                log_warning_errno(r, "Failed to write device timeout drop-in, ignoring: %m");
+
+        r = write_drop_in_format(arg_dest, SPECIAL_HIBERNATE_RESUME_SERVICE, 90, "device-dependency",
+                                 "# Automatically generated by systemd-hibernate-resume-generator\n\n"
+                                 "[Unit]\n"
+                                 "BindsTo=%1$s\n"
+                                 "After=%1$s\n",
+                                 unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write device dependency drop-in: %m");
+
+        return generator_add_symlink(arg_dest, SPECIAL_SYSINIT_TARGET, "wants", SPECIAL_HIBERNATE_RESUME_SERVICE);
+}
+
+/* Generator entry point. A no-op outside of the initrd, with 'noresume' set, or without resume device. */
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        _cleanup_(hibernate_info_done) HibernateInfo info = {};
+        int r;
+
+        arg_dest = ASSERT_PTR(dest);
+
+        if (!in_initrd()) {
+                /* Resuming is not even considered outside of the initrd. */
+                log_debug("Not running in initrd, exiting.");
+                return 0;
+        }
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        if (arg_noresume) {
+                log_info("Found 'noresume' on the kernel command line, exiting.");
+                return 0;
+        }
+
+        r = acquire_hibernate_info(&info);
+        if (r >= 0)
+                return process_resume(&info);
+        if (r != -ENODEV)
+                return r;
+
+        log_debug_errno(r, "No resume device found, exiting.");
+        return 0;
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run);
diff --git a/src/hibernate-resume/hibernate-resume.c b/src/hibernate-resume/hibernate-resume.c
new file mode 100644
index 0000000..175a0bd
--- /dev/null
+++ b/src/hibernate-resume/hibernate-resume.c
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "devnum-util.h"
+#include "hibernate-resume-config.h"
+#include "hibernate-util.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "parse-util.h"
+#include "static-destruct.h"
+
+static HibernateInfo arg_info = {};
+
+STATIC_DESTRUCTOR_REGISTER(arg_info, hibernate_info_done);
+
+/* Fills in arg_info: 1 if a resume device is known, 0 if there is none, negative errno on failure. */
+static int setup_hibernate_info_and_warn(void) {
+        int r;
+
+        r = acquire_hibernate_info(&arg_info);
+        if (r >= 0) {
+                compare_hibernate_location_and_warn(&arg_info);
+                return 1;
+        }
+        if (r != -ENODEV)
+                return r;
+
+        log_info_errno(r, "No resume device found, exiting.");
+        return 0;
+}
+
+/* Resumes from argv[1] (optional offset in argv[2]), or from the auto-detected device if no argument. */
+static int run(int argc, char *argv[]) {
+        struct stat st;
+        int r;
+
+        log_setup();
+
+        if (argc < 1 || argc > 3)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program expects zero, one, or two arguments.");
+
+        umask(0022);
+
+        if (!in_initrd())
+                return 0;
+
+        if (argc <= 1) {
+                r = setup_hibernate_info_and_warn();
+                if (r <= 0)
+                        return r;
+
+                if (arg_info.efi)
+                        clear_efi_hibernate_location();
+        } else {
+                arg_info.device = argv[1];
+
+                if (argc == 3) {
+                        r = safe_atou64(argv[2], &arg_info.offset);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse resume offset %s: %m", argv[2]);
+                }
+        }
+
+        if (stat(arg_info.device, &st) < 0)
+                return log_error_errno(errno, "Failed to stat resume device '%s': %m", arg_info.device);
+
+        if (!S_ISBLK(st.st_mode))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Resume device '%s' is not a block device.", arg_info.device);
+
+        /* If a resume takes place this write shall not return; getting past it means we did not resume. */
+        r = write_resume_config(st.st_rdev, arg_info.offset, arg_info.device);
+        log_full_errno(r < 0 ? LOG_ERR : LOG_DEBUG,
+                       r < 0 ? r : SYNTHETIC_ERRNO(ENOENT),
+                       "Unable to resume from device '%s' (" DEVNUM_FORMAT_STR ") offset %" PRIu64 ", continuing boot process.",
+                       arg_info.device, DEVNUM_FORMAT_VAL(st.st_rdev), arg_info.offset);
+
+        return r;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/hibernate-resume/meson.build b/src/hibernate-resume/meson.build
new file mode 100644
index 0000000..5bcd8e0
--- /dev/null
+++ b/src/hibernate-resume/meson.build
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        generator_template + { # the generator binary
+                'name' : 'systemd-hibernate-resume-generator',
+                'conditions' : ['ENABLE_HIBERNATE'],
+                'sources' : files(
+                        'hibernate-resume-generator.c',
+                        'hibernate-resume-config.c', # compiled into both executables listed here
+                ),
+        },
+        libexec_template + { # the binary that performs the actual resume
+                'name' : 'systemd-hibernate-resume',
+                'conditions' : ['ENABLE_HIBERNATE'],
+                'sources' : files(
+                        'hibernate-resume.c',
+                        'hibernate-resume-config.c', # compiled into both executables listed here
+                ),
+        },
+]
diff --git a/src/home/home-util.c b/src/home/home-util.c
new file mode 100644
index 0000000..c777d7b
--- /dev/null
+++ b/src/home/home-util.c
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dns-domain.h"
+#include "home-util.h"
+#include "libcrypt-util.h"
+#include "memory-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+/* Tells whether the specified name is suitable for management via homed. Note that client-side the
+ * name is usually validated with the simple valid_user_group_name() only, while server-side the check
+ * is a bit more restrictive, so that the rules can be changed server-side without having to update
+ * things client-side too. */
+bool suitable_user_name(const char *name) {
+        bool reserved, prefixed;
+
+        if (!valid_user_group_name(name, 0))
+                return false;
+
+        /* NSS is generally relied upon to tell which users not to care for, but some particularly
+         * well-known users are filtered out explicitly. */
+        reserved = STR_IN_SET(name,
+                              "root",
+                              "nobody",
+                              NOBODY_USER_NAME, NOBODY_GROUP_NAME);
+        if (reserved)
+                return false;
+
+        /* Our own namespace is defended as well, and so is Debian's (unwritten?) logic of prefixing
+         * system users with underscores. */
+        prefixed = !!STARTSWITH_SET(name, "systemd-", "_");
+
+        return !prefixed;
+}
+
+/* Like suitable_user_name(), but for realms: validation is a bit stricter server-side than client-side.
+ * Returns > 0 if the realm is suitable, 0 if it is not, negative errno on failure. */
+int suitable_realm(const char *realm) {
+        _cleanup_free_ char *canonical = NULL;
+        int r;
+
+        /* Normalizing checks general validity as a side effect. */
+        r = dns_name_normalize(realm, 0, &canonical);
+        if (r == -EINVAL)
+                return 0;
+        if (r < 0)
+                return r;
+
+        /* Only names that already are in normalized form are accepted. */
+        if (!streq(realm, canonical))
+                return 0;
+
+        /* The top level domain is not allowed. */
+        return !dns_name_is_root(realm);
+}
+
+/* A suitable image path is a valid, absolute path that is neither empty nor the root directory. */
+int suitable_image_path(const char *path) {
+        if (empty_or_root(path))
+                return false;
+        return path_is_valid(path) && path_is_absolute(path);
+}
+
+/* The set of supported file systems is kept small on purpose, as protection against little tested
+ * kernel file systems. Besides, the resize ioctls are only supported for these file systems. */
+bool supported_fstype(const char *fstype) {
+        return strv_contains(STRV_MAKE("ext4", "btrfs", "xfs"), fstype);
+}
+
+/* Splits "user@realm" into its two parts and validates both. The realm part is optional; if it is
+ * missing *ret_realm is set to NULL. Returns 0 on success, -EINVAL if a part is not suitable, -ENOMEM
+ * on allocation failure, or whatever other error suitable_realm() reports. */
+int split_user_name_realm(const char *t, char **ret_user_name, char **ret_realm) {
+        _cleanup_free_ char *user = NULL, *domain = NULL;
+        const char *at;
+        int r;
+
+        assert(t);
+        assert(ret_user_name);
+        assert(ret_realm);
+
+        /* Everything up to the first '@' is the user name, everything after it the realm. */
+        at = strchr(t, '@');
+
+        user = at ? strndup(t, at - t) : strdup(t);
+        if (!user)
+                return -ENOMEM;
+
+        if (at) {
+                domain = strdup(at + 1);
+                if (!domain)
+                        return -ENOMEM;
+        }
+
+        if (!suitable_user_name(user))
+                return -EINVAL;
+
+        /* Without a realm there is nothing further to validate. */
+        r = domain ? suitable_realm(domain) : 1;
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        *ret_user_name = TAKE_PTR(user);
+        *ret_realm = TAKE_PTR(domain);
+
+        return 0;
+}
+
+/* Appends the "secret" section of the record as JSON text to 'm', or "{}" if the record has none. */
+int bus_message_append_secret(sd_bus_message *m, UserRecord *secret) {
+        _cleanup_(erase_and_freep) char *text = NULL;
+        JsonVariant *section;
+        int r;
+
+        assert(m);
+        assert(secret);
+
+        if (!FLAGS_SET(secret->mask, USER_RECORD_SECRET))
+                return sd_bus_message_append(m, "s", "{}");
+
+        section = json_variant_by_key(secret->json, "secret");
+        if (!section)
+                return -EINVAL;
+
+        r = json_variant_format(section, 0, &text);
+        if (r < 0)
+                return r;
+
+        (void) sd_bus_message_sensitive(m);
+        return sd_bus_message_append(m, "s", text);
+}
+
+const char *home_record_dir(void) { /* $SYSTEMD_HOME_RECORD_DIR as seen by secure_getenv(), else the default */
+        return secure_getenv("SYSTEMD_HOME_RECORD_DIR") ?: "/var/lib/systemd/home/";
+}
diff --git a/src/home/home-util.h b/src/home/home-util.h
new file mode 100644
index 0000000..36b301d
--- /dev/null
+++ b/src/home/home-util.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "sd-bus.h"
+
+#include "time-util.h"
+#include "user-record.h"
+
+/* Put some limits on disk sizes: not less than 5M, not more than 5T */
+#define USER_DISK_SIZE_MIN (UINT64_C(5)*1024*1024)
+#define USER_DISK_SIZE_MAX (UINT64_C(5)*1024*1024*1024*1024)
+
+/* The default disk size to use when nothing else is specified, relative to free disk space. We calculate
+ * this from the default rebalancing weights, so that what we create initially doesn't immediately require
+ * rebalancing. */
+#define USER_DISK_SIZE_DEFAULT_PERCENT ((unsigned) ((100 * REBALANCE_WEIGHT_DEFAULT) / (REBALANCE_WEIGHT_DEFAULT + REBALANCE_WEIGHT_BACKING)))
+
+/* This should be 83% right now, i.e. 100 of (100 + 20). Let's protect us against accidental changes. */
+assert_cc(USER_DISK_SIZE_DEFAULT_PERCENT == 83U);
+
+bool suitable_user_name(const char *name);
+int suitable_realm(const char *realm);
+int suitable_image_path(const char *path);
+
+bool supported_fstype(const char *fstype);
+
+int split_user_name_realm(const char *t, char **ret_user_name, char **ret_realm);
+
+int bus_message_append_secret(sd_bus_message *m, UserRecord *secret);
+
+/* Many of our operations might be slow due to crypto, fsck, recursive chown() and so on. For these
+ * operations permit a *very* long timeout */
+#define HOME_SLOW_BUS_CALL_TIMEOUT_USEC (2*USEC_PER_MINUTE)
+
+const char *home_record_dir(void);
diff --git a/src/home/homectl-fido2.c b/src/home/homectl-fido2.c
new file mode 100644
index 0000000..3cbdf91
--- /dev/null
+++ b/src/home/homectl-fido2.c
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_LIBFIDO2
+#include 
+#endif
+
+#include "ask-password-api.h"
+#include "errno-util.h"
+#include "format-table.h"
+#include "hexdecoct.h"
+#include "homectl-fido2.h"
+#include "homectl-pkcs11.h"
+#include "libcrypt-util.h"
+#include "libfido2-util.h"
+#include "locale-util.h"
+#include "memory-util.h"
+#include "random-util.h"
+#include "strv.h"
+
+#if HAVE_LIBFIDO2
+static int add_fido2_credential_id(
+                JsonVariant **v,
+                const void *cid,
+                size_t cid_size) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *array = NULL;
+        _cleanup_strv_free_ char **ids = NULL;
+        _cleanup_free_ char *encoded = NULL;
+        ssize_t encoded_len;
+        int r;
+
+        assert(v);
+        assert(cid);
+
+        /* Adds the base64-encoded credential ID to "fido2HmacCredential" unless it is already listed. */
+        encoded_len = base64mem(cid, cid_size, &encoded);
+        if (encoded_len < 0)
+                return log_error_errno(encoded_len, "Failed to base64 encode FIDO2 credential ID: %m");
+
+        array = json_variant_ref(json_variant_by_key(*v, "fido2HmacCredential"));
+        if (array) {
+                r = json_variant_strv(array, &ids);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse FIDO2 credential ID list: %m");
+                if (strv_contains(ids, encoded))
+                        return 0; /* nothing to add */
+        }
+
+        r = strv_extend(&ids, encoded);
+        if (r < 0)
+                return log_oom();
+
+        array = json_variant_unref(array); /* drop the old list before the variable is reused */
+        r = json_variant_new_array_strv(&array, ids);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create FIDO2 credential ID JSON: %m");
+
+        r = json_variant_set_field(v, "fido2HmacCredential", array);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update FIDO2 credential ID: %m");
+
+        return 0;
+}
+
+static int add_fido2_salt(
+                JsonVariant **v,
+                const void *cid,
+                size_t cid_size,
+                const void *fido2_salt,
+                size_t fido2_salt_size,
+                const void *secret,
+                size_t secret_size,
+                Fido2EnrollFlags lock_with) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *l = NULL, *w = NULL, *e = NULL;
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL, *hashed = NULL;
+        ssize_t base64_encoded_size;
+        int r;
+
+        /* Appends an entry to "fido2HmacSalt" in the record's "privileged" section. Before using UNIX hashing
+         * on the supplied key we base64 encode it, since crypt_r() and friends expect a NUL terminated
+         * string, and we use a binary key */
+        base64_encoded_size = base64mem(secret, secret_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return log_error_errno(base64_encoded_size, "Failed to base64 encode secret key: %m");
+
+        r = hash_password(base64_encoded, &hashed);
+        if (r < 0)
+                return log_error_errno(errno_or_else(EINVAL), "Failed to UNIX hash secret key: %m");
+
+        r = json_build(&e, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("credential", JSON_BUILD_BASE64(cid, cid_size)),
+                                       JSON_BUILD_PAIR("salt", JSON_BUILD_BASE64(fido2_salt, fido2_salt_size)),
+                                       JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRING(hashed)),
+                                       JSON_BUILD_PAIR("up", JSON_BUILD_BOOLEAN(FLAGS_SET(lock_with, FIDO2ENROLL_UP))),
+                                       JSON_BUILD_PAIR("uv", JSON_BUILD_BOOLEAN(FLAGS_SET(lock_with, FIDO2ENROLL_UV))),
+                                       JSON_BUILD_PAIR("clientPin", JSON_BUILD_BOOLEAN(FLAGS_SET(lock_with, FIDO2ENROLL_PIN)))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to build FIDO2 salt JSON key object: %m");
+
+        w = json_variant_ref(json_variant_by_key(*v, "privileged"));
+        l = json_variant_ref(json_variant_by_key(w, "fido2HmacSalt"));
+
+        r = json_variant_append_array(&l, e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to append FIDO2 salt: %m");
+
+        r = json_variant_set_field(&w, "fido2HmacSalt", l);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set FIDO2 salt: %m");
+
+        r = json_variant_set_field(v, "privileged", w);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update privileged field: %m");
+
+        return 0;
+}
+#endif
+
+int identity_add_fido2_parameters(
+                JsonVariant **v,
+                const char *device,
+                Fido2EnrollFlags lock_with,
+                int cred_alg) {
+
+#if HAVE_LIBFIDO2
+        JsonVariant *un, *realm, *rn;
+        _cleanup_(erase_and_freep) void *secret = NULL, *salt = NULL;
+        _cleanup_(erase_and_freep) char *used_pin = NULL;
+        size_t cid_size, salt_size, secret_size;
+        _cleanup_free_ void *cid = NULL;
+        const char *fido_un;
+        int r;
+
+        assert(v);
+        assert(device);
+        /* Runs fido2_generate_hmac_hash() on the device, then records credential ID, salt, hashed secret and PIN in *v. */
+        un = json_variant_by_key(*v, "userName");
+        if (!un)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "userName field of user record is missing");
+        if (!json_variant_is_string(un))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "userName field of user record is not a string");
+        /* NOTE(review): userName and realm are joined below without any separator such as '@' - confirm intended. */
+        realm = json_variant_by_key(*v, "realm");
+        if (realm) {
+                if (!json_variant_is_string(realm))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "realm field of user record is not a string");
+
+                fido_un = strjoina(json_variant_string(un), json_variant_string(realm));
+        } else
+                fido_un = json_variant_string(un);
+
+        rn = json_variant_by_key(*v, "realName");
+        if (rn && !json_variant_is_string(rn))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "realName field of user record is not a string");
+        /* lock_with goes in by value and again by address (last argument); later code uses what the call left there. */
+        r = fido2_generate_hmac_hash(
+                        device,
+                        /* rp_id= */ "io.systemd.home",
+                        /* rp_name= */ "Home Directory",
+                        /* user_id= */ fido_un, strlen(fido_un), /* We pass the user ID and name as the same */
+                        /* user_name= */ fido_un,
+                        /* user_display_name= */ rn ? json_variant_string(rn) : NULL,
+                        /* user_icon_name= */ NULL,
+                        /* askpw_icon_name= */ "user-home",
+                        lock_with,
+                        cred_alg,
+                        &cid, &cid_size,
+                        &salt, &salt_size,
+                        &secret, &secret_size,
+                        &used_pin,
+                        &lock_with);
+        if (r < 0)
+                return r;
+
+        r = add_fido2_credential_id(
+                        v,
+                        cid,
+                        cid_size);
+        if (r < 0)
+                return r;
+
+        r = add_fido2_salt(
+                        v,
+                        cid,
+                        cid_size,
+                        salt,
+                        salt_size,
+                        secret,
+                        secret_size,
+                        lock_with);
+        if (r < 0)
+                return r;
+
+        /* If we acquired the PIN also include it in the secret section of the record, so that systemd-homed
+         * can use it if it needs to, given that it likely needs to decrypt the key again to pass to LUKS or
+         * fscrypt. */
+        r = identity_add_token_pin(v, used_pin);
+        if (r < 0)
+                return r;
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "FIDO2 tokens not supported on this build.");
+#endif
+}
diff --git a/src/home/homectl-fido2.h b/src/home/homectl-fido2.h
new file mode 100644
index 0000000..558c674
--- /dev/null
+++ b/src/home/homectl-fido2.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+#include "libfido2-util.h"
+
+int identity_add_fido2_parameters(JsonVariant **v, const char *device, Fido2EnrollFlags lock_with, int cred_alg);
diff --git a/src/home/homectl-pkcs11.c b/src/home/homectl-pkcs11.c
new file mode 100644
index 0000000..2539af0
--- /dev/null
+++ b/src/home/homectl-pkcs11.c
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "errno-util.h"
+#include "format-table.h"
+#include "hexdecoct.h"
+#include "homectl-pkcs11.h"
+#include "libcrypt-util.h"
+#include "memory-util.h"
+#include "openssl-util.h"
+#include "pkcs11-util.h"
+#include "random-util.h"
+#include "strv.h"
+
+static int add_pkcs11_encrypted_key(
+                JsonVariant **v,
+                const char *uri,
+                const void *encrypted_key, size_t encrypted_key_size,
+                const void *decrypted_key, size_t decrypted_key_size) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *l = NULL, *w = NULL, *e = NULL;
+        _cleanup_(erase_and_freep) char *base64_encoded = NULL, *hashed = NULL;
+        ssize_t base64_encoded_size;
+        int r;
+
+        assert(v);
+        assert(uri);
+        assert(encrypted_key);
+        assert(encrypted_key_size > 0);
+        assert(decrypted_key);
+        assert(decrypted_key_size > 0);
+
+        /* Appends an entry (URI, encrypted key, hash of the plain key) to "pkcs11EncryptedKey" in the record's
+         * "privileged" section. Before using UNIX hashing on the supplied key we base64 encode it, since
+         * crypt_r() and friends expect a NUL terminated string, and we use a binary key */
+        base64_encoded_size = base64mem(decrypted_key, decrypted_key_size, &base64_encoded);
+        if (base64_encoded_size < 0)
+                return log_error_errno(base64_encoded_size, "Failed to base64 encode secret key: %m");
+
+        r = hash_password(base64_encoded, &hashed);
+        if (r < 0)
+                return log_error_errno(errno_or_else(EINVAL), "Failed to UNIX hash secret key: %m");
+
+        r = json_build(&e, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("uri", JSON_BUILD_STRING(uri)),
+                                       JSON_BUILD_PAIR("data", JSON_BUILD_BASE64(encrypted_key, encrypted_key_size)),
+                                       JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRING(hashed))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to build encrypted JSON key object: %m");
+
+        w = json_variant_ref(json_variant_by_key(*v, "privileged"));
+        l = json_variant_ref(json_variant_by_key(w, "pkcs11EncryptedKey"));
+        r = json_variant_append_array(&l, e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to append PKCS#11 encrypted key: %m");
+
+        r = json_variant_set_field(&w, "pkcs11EncryptedKey", l);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set PKCS#11 encrypted key: %m");
+
+        r = json_variant_set_field(v, "privileged", w);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update privileged field: %m");
+
+        return 0;
+}
+
+static int add_pkcs11_token_uri(JsonVariant **v, const char *uri) {
+        _cleanup_(json_variant_unrefp) JsonVariant *array = NULL;
+        _cleanup_strv_free_ char **uris = NULL;
+        int r;
+
+        assert(v);
+        assert(uri);
+
+        /* Adds the URI to the record's "pkcs11TokenUri" string list unless it is already listed. */
+        array = json_variant_ref(json_variant_by_key(*v, "pkcs11TokenUri"));
+        if (array) {
+                r = json_variant_strv(array, &uris);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse PKCS#11 token list: %m");
+                if (strv_contains(uris, uri))
+                        return 0; /* nothing to add */
+        }
+
+        r = strv_extend(&uris, uri);
+        if (r < 0)
+                return log_oom();
+
+        array = json_variant_unref(array); /* drop the old list before the variable is reused */
+        r = json_variant_new_array_strv(&array, uris);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create PKCS#11 token URI JSON: %m");
+
+        r = json_variant_set_field(v, "pkcs11TokenUri", array);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update PKCS#11 token URI list: %m");
+
+        return 0;
+}
+
+int identity_add_token_pin(JsonVariant **v, const char *pin) {
+        _cleanup_(json_variant_unrefp) JsonVariant *secret = NULL, *array = NULL;
+        _cleanup_strv_free_erase_ char **pins = NULL;
+        int r;
+
+        assert(v);
+
+        /* Adds the PIN to the "tokenPin" list inside the record's "secret" object. Returns 1 if the record
+         * was changed, 0 if nothing was added (isempty(pin) or PIN already listed), negative on error. */
+
+        if (isempty(pin))
+                return 0;
+
+        secret = json_variant_ref(json_variant_by_key(*v, "secret"));
+        array = json_variant_ref(json_variant_by_key(secret, "tokenPin"));
+
+        r = json_variant_strv(array, &pins);
+        if (r < 0)
+                return log_error_errno(r, "Failed to convert PIN array: %m");
+        if (strv_contains(pins, pin))
+                return 0;
+
+        r = strv_extend(&pins, pin);
+        if (r < 0)
+                return log_oom();
+
+        strv_uniq(pins);
+
+        array = json_variant_unref(array); /* release the old list before the variable is reused */
+        r = json_variant_new_array_strv(&array, pins);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate new PIN array JSON: %m");
+        json_variant_sensitive(array); /* the list carries clear-text PINs */
+
+        r = json_variant_set_field(&secret, "tokenPin", array);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update PIN field: %m");
+
+        r = json_variant_set_field(v, "secret", secret);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update secret object: %m");
+
+        return 1;
+}
+
+static int acquire_pkcs11_certificate(
+                const char *uri,
+                const char *askpw_friendly_name,
+                const char *askpw_icon_name,
+                X509 **ret_cert,
+                char **ret_pin_used) { /* Forwards to pkcs11_acquire_certificate(); without HAVE_P11KIT it fails with EOPNOTSUPP. */
+#if HAVE_P11KIT
+        return pkcs11_acquire_certificate(uri, askpw_friendly_name, askpw_icon_name, ret_cert, ret_pin_used);
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 tokens not supported on this build.");
+#endif
+}
+
+int identity_add_pkcs11_key_data(JsonVariant **v, const char *uri) {
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL, *encrypted_key = NULL;
+        _cleanup_(erase_and_freep) char *pin = NULL;
+        size_t decrypted_key_size, encrypted_key_size;
+        _cleanup_(X509_freep) X509 *cert = NULL;
+        EVP_PKEY *pkey;
+        int r;
+
+        assert(v);
+        /* Creates a random key, encrypts it with the RSA public key of the token's certificate and records it in *v. */
+        r = acquire_pkcs11_certificate(uri, "home directory operation", "user-home", &cert, &pin);
+        if (r < 0)
+                return r;
+        /* X509_get0_pubkey() hands out a pointer owned by the certificate, hence pkey has no cleanup handler. */
+        pkey = X509_get0_pubkey(cert);
+        if (!pkey)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to extract public key from X.509 certificate.");
+        /* The size of the random key is chosen by rsa_pkey_to_suitable_key_size() based on the public key. */
+        r = rsa_pkey_to_suitable_key_size(pkey, &decrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract RSA key size from X509 certificate.");
+
+        log_debug("Generating %zu bytes random key.", decrypted_key_size);
+
+        decrypted_key = malloc(decrypted_key_size);
+        if (!decrypted_key)
+                return log_oom();
+
+        r = crypto_random_bytes(decrypted_key, decrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate random key: %m");
+
+        r = rsa_encrypt_bytes(pkey, decrypted_key, decrypted_key_size, &encrypted_key, &encrypted_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to encrypt key: %m");
+
+        /* Add the token URI to the public part of the record. */
+        r = add_pkcs11_token_uri(v, uri);
+        if (r < 0)
+                return r;
+
+        /* Include the encrypted version of the random key we just generated in the privileged part of the record */
+        r = add_pkcs11_encrypted_key(
+                        v,
+                        uri,
+                        encrypted_key, encrypted_key_size,
+                        decrypted_key, decrypted_key_size);
+        if (r < 0)
+                return r;
+
+        /* If we acquired the PIN also include it in the secret section of the record, so that systemd-homed
+         * can use it if it needs to, given that it likely needs to decrypt the key again to pass to LUKS or
+         * fscrypt. */
+        r = identity_add_token_pin(v, pin);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
diff --git a/src/home/homectl-pkcs11.h b/src/home/homectl-pkcs11.h
new file mode 100644
index 0000000..5c30fee
--- /dev/null
+++ b/src/home/homectl-pkcs11.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+
+int identity_add_token_pin(JsonVariant **v, const char *pin);
+
+int identity_add_pkcs11_key_data(JsonVariant **v, const char *token_uri);
+
+int list_pkcs11_tokens(void);
+int find_pkcs11_token_auto(char **ret);
diff --git a/src/home/homectl-recovery-key.c b/src/home/homectl-recovery-key.c
new file mode 100644
index 0000000..bf18ae4
--- /dev/null
+++ b/src/home/homectl-recovery-key.c
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "errno-util.h"
+#include "glyph-util.h"
+#include "homectl-recovery-key.h"
+#include "libcrypt-util.h"
+#include "memory-util.h"
+#include "qrcode-util.h"
+#include "random-util.h"
+#include "recovery-key.h"
+#include "strv.h"
+#include "terminal-util.h"
+
+static int add_privileged(JsonVariant **v, const char *hashed) {
+        _cleanup_(json_variant_unrefp) JsonVariant *e = NULL, *w = NULL, *l = NULL;
+        int r;
+
+        assert(v);
+        assert(hashed);
+
+        /* Appends a "modhex64" entry carrying the hashed key to "recoveryKey" in the "privileged" section. */
+        r = json_build(&e, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("modhex64")),
+                                       JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRING(hashed))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to build recovery key JSON object: %m");
+
+        json_variant_sensitive(e);
+
+        w = json_variant_ref(json_variant_by_key(*v, "privileged"));
+        l = json_variant_ref(json_variant_by_key(w, "recoveryKey"));
+        r = json_variant_append_array(&l, e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to append recovery key: %m");
+
+        r = json_variant_set_field(&w, "recoveryKey", l);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set recovery key array: %m");
+
+        r = json_variant_set_field(v, "privileged", w);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update privileged field: %m");
+
+        return 0;
+}
+
+static int add_public(JsonVariant **v) {
+        _cleanup_strv_free_ char **key_types = NULL;
+        int r;
+
+        assert(v);
+
+        /* Appends "modhex64" to the record's "recoveryKeyType" string list. */
+        r = json_variant_strv(json_variant_by_key(*v, "recoveryKeyType"), &key_types);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse recovery key type list: %m");
+
+        r = strv_extend(&key_types, "modhex64");
+        if (r < 0)
+                return log_oom();
+
+        r = json_variant_set_field_strv(v, "recoveryKeyType", key_types);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update recovery key types: %m");
+        return 0;
+}
+
+static int add_secret(JsonVariant **v, const char *password) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL, *l = NULL;
+        _cleanup_strv_free_erase_ char **passwords = NULL;
+        int r;
+
+        assert(v);
+        assert(password);
+        /* Appends the password to the "password" string list inside the record's "secret" object. */
+        w = json_variant_ref(json_variant_by_key(*v, "secret"));
+        l = json_variant_ref(json_variant_by_key(w, "password"));
+
+        r = json_variant_strv(l, &passwords);
+        if (r < 0)
+                return log_error_errno(r, "Failed to convert password array: %m");
+
+        r = strv_extend(&passwords, password);
+        if (r < 0)
+                return log_oom();
+
+        l = json_variant_unref(l); /* drop the old array first, otherwise its reference leaks when 'l' is reused */
+        r = json_variant_new_array_strv(&l, passwords);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate new password array JSON: %m");
+        json_variant_sensitive(l); /* the array holds clear-text passwords */
+
+        r = json_variant_set_field(&w, "password", l);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update password field: %m");
+
+        r = json_variant_set_field(v, "secret", w);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update secret object: %m");
+
+        return 0;
+}
+
+int identity_add_recovery_key(JsonVariant **v) {
+        _cleanup_(erase_and_freep) char *password = NULL, *hashed = NULL;
+        int r;
+
+        assert(v);
+        /* Generates a recovery key, adds it to the privileged, public and secret parts of *v, then prints it. */
+        /* First, let's generate a secret key  */
+        r = make_recovery_key(&password);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate recovery key: %m");
+
+        /* Let's UNIX hash it */
+        r = hash_password(password, &hashed);
+        if (r < 0)
+                return log_error_errno(errno_or_else(EINVAL), "Failed to UNIX hash secret key: %m");
+
+        /* Let's now add the "privileged" version of the recovery key */
+        r = add_privileged(v, hashed);
+        if (r < 0)
+                return r;
+
+        /* Let's then add the public information about the recovery key */
+        r = add_public(v);
+        if (r < 0)
+                return r;
+
+        /* Finally, let's add the new key to the secret part, too */
+        r = add_secret(v, password);
+        if (r < 0)
+                return r;
+
+        /* We output the key itself with a trailing newline to stdout and the decoration around it to stderr
+         * instead. */
+        /* Each stream is flushed before the other one is written, presumably to keep the pieces in order. */
+        fflush(stdout);
+        fprintf(stderr,
+                "A secret recovery key has been generated for this account:\n\n"
+                "    %s%s%s",
+                emoji_enabled() ? special_glyph(SPECIAL_GLYPH_LOCK_AND_KEY) : "",
+                emoji_enabled() ? " " : "",
+                ansi_highlight());
+        fflush(stderr);
+
+        fputs(password, stdout);
+        fflush(stdout);
+
+        fputs(ansi_normal(), stderr);
+        fflush(stderr);
+
+        fputc('\n', stdout);
+        fflush(stdout);
+
+        fputs("\nPlease save this secret recovery key at a secure location. It may be used to\n"
+              "regain access to the account if the other configured access credentials have\n"
+              "been lost or forgotten. The recovery key may be entered in place of a password\n"
+              "whenever authentication is requested.\n", stderr);
+        fflush(stderr);
+        /* Showing the QR code is best effort: the result of print_qrcode() is ignored. */
+        (void) print_qrcode(stderr, "You may optionally scan the recovery key off screen", password);
+
+        return 0;
+}
diff --git a/src/home/homectl-recovery-key.h b/src/home/homectl-recovery-key.h
new file mode 100644
index 0000000..ab195f9
--- /dev/null
+++ b/src/home/homectl-recovery-key.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+
+int identity_add_recovery_key(JsonVariant **v);
diff --git a/src/home/homectl.c b/src/home/homectl.c
new file mode 100644
index 0000000..a6951c8
--- /dev/null
+++ b/src/home/homectl.c
@@ -0,0 +1,3875 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "sd-bus.h"
+
+#include "ask-password-api.h"
+#include "build.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-util.h"
+#include "dns-domain.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "home-util.h"
+#include "homectl-fido2.h"
+#include "homectl-pkcs11.h"
+#include "homectl-recovery-key.h"
+#include "libfido2-util.h"
+#include "locale-util.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "password-quality-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "pkcs11-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "spawn-polkit-agent.h"
+#include "terminal-util.h"
+#include "uid-alloc-range.h"
+#include "user-record.h"
+#include "user-record-password-quality.h"
+#include "user-record-show.h"
+#include "user-record-util.h"
+#include "user-util.h"
+#include "verbs.h"
+
+static PagerFlags arg_pager_flags = 0;
+static bool arg_legend = true;
+static bool arg_ask_password = true; /* handed to sd_bus_set_allow_interactive_authorization() in acquire_bus() */
+static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; /* transport used by acquire_bus() */
+static const char *arg_host = NULL; /* host argument passed to bus_connect_transport() in acquire_bus() */
+static const char *arg_identity = NULL;
+static JsonVariant *arg_identity_extra = NULL;
+static JsonVariant *arg_identity_extra_privileged = NULL;
+static JsonVariant *arg_identity_extra_this_machine = NULL;
+static JsonVariant *arg_identity_extra_rlimits = NULL;
+static char **arg_identity_filter = NULL; /* this one is also applied to 'privileged' and 'thisMachine' subobjects */
+static char **arg_identity_filter_rlimits = NULL;
+static uint64_t arg_disk_size = UINT64_MAX;
+static uint64_t arg_disk_size_relative = UINT64_MAX;
+static char **arg_pkcs11_token_uri = NULL;
+static char **arg_fido2_device = NULL;
+static Fido2EnrollFlags arg_fido2_lock_with = FIDO2ENROLL_PIN | FIDO2ENROLL_UP;
+#if HAVE_LIBFIDO2
+static int arg_fido2_cred_alg = COSE_ES256; /* default when built with libfido2 */
+#else
+static int arg_fido2_cred_alg = 0; /* placeholder when built without libfido2 */
+#endif
+static bool arg_recovery_key = false;
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF;
+static bool arg_and_resize = false;
+static bool arg_and_change_password = false;
+static enum {
+        EXPORT_FORMAT_FULL,          /* export the full record */
+        EXPORT_FORMAT_STRIPPED,      /* strip "state" + "binding", but leave signature in place */
+        EXPORT_FORMAT_MINIMAL,       /* also strip signature */
+} arg_export_format = EXPORT_FORMAT_FULL;
+static uint64_t arg_capability_bounding_set = UINT64_MAX;
+static uint64_t arg_capability_ambient_set = UINT64_MAX;
+/* Register destructors for the heap-allocated option values above: JSON variants are unref'ed, string lists freed. */
+STATIC_DESTRUCTOR_REGISTER(arg_identity_extra, json_variant_unrefp);
+STATIC_DESTRUCTOR_REGISTER(arg_identity_extra_this_machine, json_variant_unrefp);
+STATIC_DESTRUCTOR_REGISTER(arg_identity_extra_privileged, json_variant_unrefp);
+STATIC_DESTRUCTOR_REGISTER(arg_identity_extra_rlimits, json_variant_unrefp);
+STATIC_DESTRUCTOR_REGISTER(arg_identity_filter, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_identity_filter_rlimits, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_pkcs11_token_uri, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_fido2_device, strv_freep);
+/* Locator handed to bus_call_method(), see list_homes(); it has no initializer here. */
+static const BusLocator *bus_mgr;
+
+static bool identity_properties_specified(void) {
+        /* True if arg_identity is set or any of the extra objects, filters or token lists has content. */
+        return arg_identity ||
+                !(json_variant_is_blank_object(arg_identity_extra) &&
+                  json_variant_is_blank_object(arg_identity_extra_privileged) &&
+                  json_variant_is_blank_object(arg_identity_extra_this_machine) &&
+                  json_variant_is_blank_object(arg_identity_extra_rlimits) &&
+                  strv_isempty(arg_identity_filter) &&
+                  strv_isempty(arg_identity_filter_rlimits) &&
+                  strv_isempty(arg_pkcs11_token_uri) &&
+                  strv_isempty(arg_fido2_device));
+}
+
+static int acquire_bus(sd_bus **bus) {
+        int r;
+
+        assert(bus);
+
+        /* Connects *bus on first use; an already established connection is left untouched. */
+        if (!*bus) {
+                r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, bus);
+                if (r < 0)
+                        return bus_log_connect_error(r, arg_transport);
+
+                (void) sd_bus_set_allow_interactive_authorization(*bus, arg_ask_password);
+        }
+
+        return 0;
+}
+
+static int list_homes(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(table_unrefp) Table *table = NULL;
+        int r;
+        /* Calls the ListHomes bus method and prints the returned entries as a table. */
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        r = bus_call_method(bus, bus_mgr, "ListHomes", &error, &reply, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to list homes: %s", bus_error_message(&error, r));
+
+        table = table_new("name", "uid", "gid", "state", "realname", "home", "shell");
+        if (!table)
+                return log_oom();
+
+        r = sd_bus_message_enter_container(reply, 'a', "(susussso)");
+        if (r < 0)
+                return bus_log_parse_error(r);
+        /* Each array entry is a (susussso) struct; NULL is passed for the trailing object path, so it is not stored. */
+        for (;;) {
+                const char *name, *state, *realname, *home, *shell, *color;
+                TableCell *cell;
+                uint32_t uid, gid;
+
+                r = sd_bus_message_read(reply, "(susussso)", &name, &uid, &state, &gid, &realname, &home, &shell, NULL);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+                if (r == 0)
+                        break;
+
+                r = table_add_many(table,
+                                   TABLE_STRING, name,
+                                   TABLE_UID, uid,
+                                   TABLE_GID, gid);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                /* The state cell is added on its own so that its handle can be passed to table_set_color(). */
+                r = table_add_cell(table, &cell, TABLE_STRING, state);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                color = user_record_state_color(state);
+                if (color)
+                        (void) table_set_color(table, cell, color);
+
+                r = table_add_many(table,
+                                   TABLE_STRING, strna(empty_to_null(realname)),
+                                   TABLE_STRING, home,
+                                   TABLE_STRING, strna(empty_to_null(shell)));
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        r = sd_bus_message_exit_container(reply);
+        if (r < 0)
+                return bus_log_parse_error(r);
+        /* Print if the table has more than one row (the first presumably being the header) or JSON output is on. */
+        if (table_get_rows(table) > 1 || !FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                r = table_set_sort(table, (size_t) 0);
+                if (r < 0)
+                        return table_log_sort_error(r);
+
+                r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend);
+                if (r < 0)
+                        return r;
+        }
+
+        if (arg_legend && (arg_json_format_flags & JSON_FORMAT_OFF)) {
+                if (table_get_rows(table) > 1)
+                        printf("\n%zu home areas listed.\n", table_get_rows(table) - 1);
+                else
+                        printf("No home areas.\n");
+        }
+
+        return 0;
+}
+
+static int acquire_existing_password(
+                const char *user_name,
+                UserRecord *hr,
+                bool emphasize_current,
+                AskPasswordFlags flags) {
+        /* Obtains a password for 'user_name' and stores it in 'hr' via user_record_set_password(). */
+        _cleanup_strv_free_erase_ char **password = NULL;
+        _cleanup_(erase_and_freep) char *envpw = NULL;
+        _cleanup_free_ char *question = NULL;
+        int r;
+        /* Returns 1 if a password was stored, 0 if none could be acquired (-EUNATCH case below), < 0 on error. */
+        assert(user_name);
+        assert(hr);
+
+        r = getenv_steal_erase("PASSWORD", &envpw);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire password from environment: %m");
+        if (r > 0) {
+                /* People really shouldn't use environment variables for passing passwords. We support this
+                 * only for testing purposes, and do not document the behaviour, so that people won't
+                 * actually use this outside of testing. */
+
+                r = user_record_set_password(hr, STRV_MAKE(envpw), true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to store password: %m");
+
+                return 1; /* password came from $PASSWORD, ask_password_auto() is not called at all */
+        }
+
+        /* If this is not our own user, then don't use the password cache */
+        if (is_this_me(user_name) <= 0)
+                SET_FLAG(flags, ASK_PASSWORD_ACCEPT_CACHED|ASK_PASSWORD_PUSH_CACHE, false);
+        /* 'emphasize_current' only selects the wording of the prompt ("current password" vs. "password") */
+        if (asprintf(&question, emphasize_current ?
+                     "Please enter current password for user %s:" :
+                     "Please enter password for user %s:",
+                     user_name) < 0)
+                return log_oom();
+
+        r = ask_password_auto(question,
+                              /* icon= */ "user-home",
+                              NULL,
+                              /* key_name= */ "home-password",
+                              /* credential_name= */ "home.password",
+                              USEC_INFINITY,
+                              flags,
+                              &password);
+        if (r == -EUNATCH) { /* EUNATCH is returned if no password was found and asking interactively was
+                              * disabled via the flags. Not an error for us. */
+                log_debug_errno(r, "No passwords acquired.");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire password: %m");
+
+        r = user_record_set_password(hr, password, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to store password: %m");
+
+        return 1; /* password(s) returned by ask_password_auto() were stored in 'hr' */
+}
+
+static int acquire_recovery_key(
+                const char *user_name,
+                UserRecord *hr,
+                AskPasswordFlags flags) {
+        /* Obtains a recovery key for 'user_name' and stores it in 'hr' through user_record_set_password(). */
+        _cleanup_strv_free_erase_ char **recovery_key = NULL;
+        _cleanup_(erase_and_freep) char *envpw = NULL;
+        _cleanup_free_ char *question = NULL;
+        int r;
+        /* Returns 1 if a key was stored, 0 if none could be acquired (-EUNATCH case below), < 0 on error. */
+        assert(user_name);
+        assert(hr);
+        /* NOTE(review): this consumes $PASSWORD, the same variable acquire_existing_password() reads — confirm intended. */
+        r = getenv_steal_erase("PASSWORD", &envpw);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire password from environment: %m");
+        if (r > 0) {
+                /* People really shouldn't use environment variables for passing secrets. We support this
+                 * only for testing purposes, and do not document the behaviour, so that people won't
+                 * actually use this outside of testing. */
+
+                r = user_record_set_password(hr, STRV_MAKE(envpw), true); /* recovery keys are stored in the record exactly like regular passwords! */
+                if (r < 0)
+                        return log_error_errno(r, "Failed to store recovery key: %m");
+
+                return 1; /* value from the environment was stored, ask_password_auto() is not called */
+        }
+
+        /* If this is not our own user, then don't use the password cache */
+        if (is_this_me(user_name) <= 0)
+                SET_FLAG(flags, ASK_PASSWORD_ACCEPT_CACHED|ASK_PASSWORD_PUSH_CACHE, false);
+
+        if (asprintf(&question, "Please enter recovery key for user %s:", user_name) < 0)
+                return log_oom();
+
+        r = ask_password_auto(question,
+                              /* icon= */ "user-home",
+                              NULL,
+                              /* key_name= */ "home-recovery-key",
+                              /* credential_name= */ "home.recovery-key",
+                              USEC_INFINITY,
+                              flags,
+                              &recovery_key);
+        if (r == -EUNATCH) { /* EUNATCH is returned if no recovery key was found and asking interactively was
+                              * disabled via the flags. Not an error for us. */
+                log_debug_errno(r, "No recovery keys acquired.");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire recovery keys: %m");
+
+        r = user_record_set_password(hr, recovery_key, true); /* same setter as for regular passwords, see above */
+        if (r < 0)
+                return log_error_errno(r, "Failed to store recovery keys: %m");
+
+        return 1; /* key(s) returned by ask_password_auto() were stored in 'hr' */
+}
+
+static int acquire_token_pin(
+                const char *user_name,
+                UserRecord *hr,
+                AskPasswordFlags flags) {
+        /* Obtains a security token PIN for 'user_name' and stores it in 'hr' via user_record_set_token_pin(). */
+        _cleanup_strv_free_erase_ char **pin = NULL;
+        _cleanup_(erase_and_freep) char *envpin = NULL;
+        _cleanup_free_ char *question = NULL;
+        int r;
+        /* Returns 1 if a PIN was stored, 0 if none could be acquired (-EUNATCH case below), < 0 on error. */
+        assert(user_name);
+        assert(hr);
+        /* A PIN passed via $PIN is stored as-is, without calling ask_password_auto() */
+        r = getenv_steal_erase("PIN", &envpin);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire PIN from environment: %m");
+        if (r > 0) {
+                r = user_record_set_token_pin(hr, STRV_MAKE(envpin), false);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to store token PIN: %m");
+
+                return 1; /* PIN came from the environment */
+        }
+
+        /* If this is not our own user, then don't use the password cache */
+        if (is_this_me(user_name) <= 0)
+                SET_FLAG(flags, ASK_PASSWORD_ACCEPT_CACHED|ASK_PASSWORD_PUSH_CACHE, false);
+
+        if (asprintf(&question, "Please enter security token PIN for user %s:", user_name) < 0)
+                return log_oom();
+
+        r = ask_password_auto(
+                        question,
+                        /* icon= */ "user-home",
+                        NULL,
+                        /* key_name= */ "token-pin",
+                        /* credential_name= */ "home.token-pin",
+                        USEC_INFINITY,
+                        flags,
+                        &pin);
+        if (r == -EUNATCH) { /* EUNATCH is returned if no PIN was found and asking interactively was disabled
+                              * via the flags. Not an error for us. */
+                log_debug_errno(r, "No security token PINs acquired.");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire security token PIN: %m");
+
+        r = user_record_set_token_pin(hr, pin, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to store security token PIN: %m");
+
+        return 1; /* PIN(s) returned by ask_password_auto() were stored in 'hr' */
+}
+
+static int handle_generic_user_record_error(
+                const char *user_name,
+                UserRecord *hr,
+                const sd_bus_error *error,
+                int ret,
+                bool emphasize_current_password) {
+        int r;
+        /* Dispatches on the name of bus error 'error': either fails for good, or adds a secret/permission flag to 'hr'. */
+        assert(user_name);
+        assert(hr);
+        /* Returns 0 if 'hr' was handled without a fatal error (callers in this file then retry the call), < 0 otherwise. */
+        if (sd_bus_error_has_name(error, BUS_ERROR_HOME_ABSENT))
+                return log_error_errno(SYNTHETIC_ERRNO(EREMOTE),
+                                       "Home of user %s is currently absent, please plug in the necessary storage device or backing file system.", user_name);
+
+        else if (sd_bus_error_has_name(error, BUS_ERROR_AUTHENTICATION_LIMIT_HIT))
+                return log_error_errno(SYNTHETIC_ERRNO(ETOOMANYREFS),
+                                       "Too frequent login attempts for user %s, try again later.", user_name);
+
+        else if (sd_bus_error_has_name(error, BUS_ERROR_BAD_PASSWORD)) {
+                /* Only complain about a wrong password if 'hr' actually carried one */
+                if (!strv_isempty(hr->password))
+                        log_notice("Password incorrect or not sufficient, please try again.");
+
+                /* Don't consume cache entries or credentials here, we already tried that unsuccessfully. But
+                 * let's push what we acquire here into the cache */
+                r = acquire_existing_password(
+                                user_name,
+                                hr,
+                                emphasize_current_password,
+                                ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_NO_CREDENTIAL);
+                if (r < 0)
+                        return r;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_BAD_RECOVERY_KEY)) {
+                /* hr->password is checked here too: acquire_recovery_key() stores keys with user_record_set_password() */
+                if (!strv_isempty(hr->password))
+                        log_notice("Recovery key incorrect or not sufficient, please try again.");
+
+                /* Don't consume cache entries or credentials here, we already tried that unsuccessfully. But
+                 * let's push what we acquire here into the cache */
+                r = acquire_recovery_key(
+                                user_name,
+                                hr,
+                                ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_NO_CREDENTIAL);
+                if (r < 0)
+                        return r;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN)) {
+
+                if (strv_isempty(hr->password))
+                        log_notice("Security token not inserted, please enter password.");
+                else
+                        log_notice("Password incorrect or not sufficient, and configured security token not inserted, please try again.");
+                /* Same flags as in the BUS_ERROR_BAD_PASSWORD branch */
+                r = acquire_existing_password(
+                                user_name,
+                                hr,
+                                emphasize_current_password,
+                                ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_NO_CREDENTIAL);
+                if (r < 0)
+                        return r;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_PIN_NEEDED)) {
+
+                /* First time the PIN is requested, let's accept cached data, and allow using credential store */
+                r = acquire_token_pin(
+                                user_name,
+                                hr,
+                                ASK_PASSWORD_ACCEPT_CACHED | ASK_PASSWORD_PUSH_CACHE);
+                if (r < 0)
+                        return r;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_PROTECTED_AUTHENTICATION_PATH_NEEDED)) {
+                /* No secret is acquired here: notify the user, then set the permission flag in 'hr' */
+                log_notice("%s%sPlease authenticate physically on security token.",
+                           emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                           emoji_enabled() ? " " : "");
+
+                r = user_record_set_pkcs11_protected_authentication_path_permitted(hr, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set PKCS#11 protected authentication path permitted flag: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_USER_PRESENCE_NEEDED)) {
+
+                log_notice("%s%sPlease confirm presence on security token.",
+                           emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                           emoji_enabled() ? " " : "");
+
+                r = user_record_set_fido2_user_presence_permitted(hr, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set FIDO2 user presence permitted flag: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_USER_VERIFICATION_NEEDED)) {
+
+                log_notice("%s%sPlease verify user on security token.",
+                           emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                           emoji_enabled() ? " " : "");
+
+                r = user_record_set_fido2_user_verification_permitted(hr, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set FIDO2 user verification permitted flag: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_PIN_LOCKED))
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Security token PIN is locked, please unlock it first. (Hint: Removal and re-insertion might suffice.)");
+
+        else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_BAD_PIN)) {
+
+                log_notice("Security token PIN incorrect, please try again.");
+
+                /* If the previous PIN was wrong don't accept cached info anymore, but add to cache. Also, don't use the credential data */
+                r = acquire_token_pin(
+                                user_name,
+                                hr,
+                                ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_NO_CREDENTIAL);
+                if (r < 0)
+                        return r;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_BAD_PIN_FEW_TRIES_LEFT)) {
+
+                log_notice("Security token PIN incorrect, please try again (only a few tries left!).");
+                /* Same flags as in the BUS_ERROR_TOKEN_BAD_PIN branch */
+                r = acquire_token_pin(
+                                user_name,
+                                hr,
+                                ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_NO_CREDENTIAL);
+                if (r < 0)
+                        return r;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_BAD_PIN_ONE_TRY_LEFT)) {
+
+                log_notice("Security token PIN incorrect, please try again (only one try left!).");
+                /* Same flags as in the BUS_ERROR_TOKEN_BAD_PIN branch */
+                r = acquire_token_pin(
+                                user_name,
+                                hr,
+                                ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_NO_CREDENTIAL);
+                if (r < 0)
+                        return r;
+        } else /* Any other error name: log it together with the original error code 'ret' and return the logging result */
+                return log_error_errno(ret, "Operation on home %s failed: %s", user_name, bus_error_message(error, ret));
+        /* Reached only by the branches above that did not return: 'hr' was handled without a fatal error */
+        return 0;
+}
+
+static int acquire_passed_secrets(const char *user_name, UserRecord **ret) {
+        _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+        int r;
+
+        assert(ret);
+
+        /* Generates an initial secret object that contains passwords supplied via $PASSWORD, the password
+         * cache or the credentials subsystem, but excluding any interactive stuff. If nothing is passed,
+         * returns an empty secret object. */
+        /* On success the new record is moved into *ret (TAKE_PTR) and 0 is returned; < 0 on error. */
+        secret = user_record_new();
+        if (!secret)
+                return log_oom();
+        /* All three helpers get the same flags; a result of 0 ("nothing acquired") is accepted for each of them */
+        r = acquire_existing_password(
+                        user_name,
+                        secret,
+                        /* emphasize_current_password = */ false,
+                        ASK_PASSWORD_ACCEPT_CACHED | ASK_PASSWORD_NO_TTY | ASK_PASSWORD_NO_AGENT);
+        if (r < 0)
+                return r;
+
+        r = acquire_token_pin(
+                        user_name,
+                        secret,
+                        ASK_PASSWORD_ACCEPT_CACHED | ASK_PASSWORD_NO_TTY | ASK_PASSWORD_NO_AGENT);
+        if (r < 0)
+                return r;
+
+        r = acquire_recovery_key(
+                        user_name,
+                        secret,
+                        ASK_PASSWORD_ACCEPT_CACHED | ASK_PASSWORD_NO_TTY | ASK_PASSWORD_NO_AGENT);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(secret);
+        return 0;
+}
+
+static int activate_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r, ret = 0;
+        /* Calls the "ActivateHome" bus method once for every user name given in argv[1..]. */
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+        /* 'ret' keeps the first per-user failure; failures to acquire secrets or build the message return at once */
+        STRV_FOREACH(i, strv_skip(argv, 1)) {
+                _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+
+                r = acquire_passed_secrets(*i, &secret);
+                if (r < 0)
+                        return r;
+                /* Retry loop: each pass sends a fresh message carrying whatever 'secret' contains by then */
+                for (;;) {
+                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                        r = bus_message_new_method_call(bus, &m, bus_mgr, "ActivateHome");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "s", *i);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = bus_message_append_secret(m, secret);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                        if (r < 0) {
+                                r = handle_generic_user_record_error(*i, secret, &error, r, /* emphasize_current_password= */ false);
+                                if (r < 0) {
+                                        if (ret == 0)
+                                                ret = r;
+                                        /* Error handler gave up: stop retrying this user, go on with the next one */
+                                        break;
+                                }
+                        } else
+                                break; /* call succeeded */
+                }
+        }
+
+        return ret;
+}
+
+static int deactivate_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r, ret = 0;
+        /* Calls the "DeactivateHome" bus method once for every user name given in argv[1..]. */
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+        /* A failed call is logged and remembered in 'ret' (first failure wins); the remaining users are still processed */
+        STRV_FOREACH(i, strv_skip(argv, 1)) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "DeactivateHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "s", *i);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to deactivate user home: %s", bus_error_message(&error, r));
+                        if (ret == 0)
+                                ret = r;
+                }
+        }
+
+        return ret;
+}
+
+static void dump_home_record(UserRecord *hr) {
+        int r;
+        /* Prints 'hr' to stdout: via user_record_show() if JSON output is off, otherwise as a JSON dump. */
+        assert(hr);
+
+        if (hr->incomplete) {
+                fflush(stdout); /* presumably keeps earlier stdout output ahead of the warning — confirm */
+                log_warning("Warning: lacking rights to acquire privileged fields of user record of '%s', output incomplete.", hr->user_name);
+        }
+
+        if (arg_json_format_flags & JSON_FORMAT_OFF)
+                user_record_show(hr, true);
+        else {
+                _cleanup_(user_record_unrefp) UserRecord *stripped = NULL;
+                /* arg_export_format selects which clone mask is applied before dumping; any other value dumps 'hr' unchanged */
+                if (arg_export_format == EXPORT_FORMAT_STRIPPED)
+                        r = user_record_clone(hr, USER_RECORD_EXTRACT_EMBEDDED|USER_RECORD_PERMISSIVE, &stripped);
+                else if (arg_export_format == EXPORT_FORMAT_MINIMAL)
+                        r = user_record_clone(hr, USER_RECORD_EXTRACT_SIGNABLE|USER_RECORD_PERMISSIVE, &stripped);
+                else
+                        r = 0;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to strip user record, ignoring: %m");
+                if (stripped)
+                        hr = stripped;
+                /* If no clone was requested or none was produced, 'hr' still points at the caller's record here */
+                json_variant_dump(hr->json, arg_json_format_flags, stdout, NULL);
+        }
+}
+
+static char **mangle_user_list(char **list, char ***ret_allocated) {
+        _cleanup_free_ char *myself = NULL;
+        char **l;
+        /* Returns 'list' if it is non-empty, else a newly allocated one-entry list with getusername_malloc()'s result. */
+        if (!strv_isempty(list)) {
+                *ret_allocated = NULL; /* nothing was allocated, so there is nothing to hand over */
+                return list;
+        }
+        /* Both NULL returns below leave *ret_allocated untouched */
+        myself = getusername_malloc();
+        if (!myself)
+                return NULL;
+
+        l = new(char*, 2);
+        if (!l)
+                return NULL;
+
+        l[0] = TAKE_PTR(myself);
+        l[1] = NULL;
+        /* The same pointer is both stored in *ret_allocated and returned as the list to use */
+        *ret_allocated = l;
+        return l;
+}
+
+static int inspect_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_strv_free_ char **mangled_list = NULL;
+        int r, ret = 0;
+        char **items;
+        /* Fetches the user record of each user in argv[1..] (or of mangle_user_list()'s fallback entry) and dumps it. */
+        pager_open(arg_pager_flags);
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        items = mangle_user_list(strv_skip(argv, 1), &mangled_list);
+        if (!items)
+                return log_oom();
+        /* Per-item failures are remembered in 'ret' (first one wins) and the loop continues with the next item */
+        STRV_FOREACH(i, items) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+                const char *json;
+                int incomplete;
+                uid_t uid;
+                /* Arguments that parse as a UID are looked up by UID, all others are validated and looked up by name */
+                r = parse_uid(*i, &uid);
+                if (r < 0) {
+                        if (!valid_user_group_name(*i, 0)) {
+                                log_error("Invalid user name '%s'.", *i);
+                                if (ret == 0)
+                                        ret = -EINVAL;
+
+                                continue;
+                        }
+
+                        r = bus_call_method(bus, bus_mgr, "GetUserRecordByName", &error, &reply, "s", *i);
+                } else
+                        r = bus_call_method(bus, bus_mgr, "GetUserRecordByUID", &error, &reply, "u", (uint32_t) uid);
+
+                if (r < 0) {
+                        log_error_errno(r, "Failed to inspect home: %s", bus_error_message(&error, r));
+                        if (ret == 0)
+                                ret = r;
+
+                        continue;
+                }
+                /* Reply is read as (string, boolean, object path); NULL is passed for the object path, it is not used here */
+                r = sd_bus_message_read(reply, "sbo", &json, &incomplete, NULL);
+                if (r < 0) {
+                        bus_log_parse_error(r);
+                        if (ret == 0)
+                                ret = r;
+
+                        continue;
+                }
+
+                r = json_parse(json, JSON_PARSE_SENSITIVE, &v, NULL, NULL);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to parse JSON identity: %m");
+                        if (ret == 0)
+                                ret = r;
+
+                        continue;
+                }
+
+                hr = user_record_new();
+                if (!hr)
+                        return log_oom();
+                /* No message is logged here on failure; USER_RECORD_LOG is passed, presumably so the loader logs itself — confirm */
+                r = user_record_load(hr, v, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_LOG|USER_RECORD_PERMISSIVE);
+                if (r < 0) {
+                        if (ret == 0)
+                                ret = r;
+
+                        continue;
+                }
+                /* Pass on the flag from the reply; dump_home_record() prints a warning when it is set */
+                hr->incomplete = incomplete;
+                dump_home_record(hr);
+        }
+
+        return ret;
+}
+
+static int authenticate_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_strv_free_ char **mangled_list = NULL;
+        int r, ret = 0;
+        char **items;
+        /* Calls the "AuthenticateHome" bus method for each user in argv[1..] (or for mangle_user_list()'s fallback entry). */
+        items = mangle_user_list(strv_skip(argv, 1), &mangled_list);
+        if (!items)
+                return log_oom();
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+        /* 'ret' keeps the first per-user failure; failures to acquire secrets or build the message return at once */
+        STRV_FOREACH(i, items) {
+                _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+
+                r = acquire_passed_secrets(*i, &secret);
+                if (r < 0)
+                        return r;
+                /* Retry loop: each pass sends a fresh message carrying whatever 'secret' contains by then */
+                for (;;) {
+                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                        r = bus_message_new_method_call(bus, &m, bus_mgr, "AuthenticateHome");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "s", *i);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = bus_message_append_secret(m, secret);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                        if (r < 0) {
+                                r = handle_generic_user_record_error(*i, secret, &error, r, /* emphasize_current_password= */ false);
+                                if (r < 0) {
+                                        if (ret == 0)
+                                                ret = r;
+                                        /* Error handler gave up: stop retrying this user, go on with the next one */
+                                        break;
+                                }
+                        } else
+                                break; /* call succeeded */
+                }
+        }
+
+        return ret;
+}
+
+static int update_last_change(JsonVariant **v, bool with_password, bool override) {
+        JsonVariant *c;
+        usec_t n;
+        int r;
+        /* Stamps "lastChangeUSec" and, if 'with_password' is set, "lastPasswordChangeUSec" in *v with the current time. */
+        assert(v);
+        /* Returns 1 if lastPasswordChangeUSec was written, 0 if it was not, < 0 on error. */
+        n = now(CLOCK_REALTIME);
+
+        c = json_variant_by_key(*v, "lastChangeUSec");
+        if (c) {
+                uint64_t u;
+                /* Field already present: keep it unless 'override' is set */
+                if (!override)
+                        goto update_password;
+
+                if (!json_variant_is_unsigned(c))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "lastChangeUSec field is not an unsigned integer, refusing.");
+
+                u = json_variant_unsigned(c);
+                if (u >= n) /* stored value is not older than the current time: refuse */
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "lastChangeUSec is from the future, can't update.");
+        }
+
+        r = json_variant_set_field_unsigned(v, "lastChangeUSec", n);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update lastChangeUSec: %m");
+        /* Also jumped to directly when lastChangeUSec exists and 'override' is off */
+update_password:
+        if (!with_password)
+                return 0;
+
+        c = json_variant_by_key(*v, "lastPasswordChangeUSec");
+        if (c) {
+                uint64_t u;
+                /* Field already present: keep it unless 'override' is set */
+                if (!override)
+                        return 0;
+
+                if (!json_variant_is_unsigned(c))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "lastPasswordChangeUSec field is not an unsigned integer, refusing.");
+
+                u = json_variant_unsigned(c);
+                if (u >= n) /* stored value is not older than the current time: refuse */
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "lastPasswordChangeUSec is from the future, can't update.");
+        }
+
+        r = json_variant_set_field_unsigned(v, "lastPasswordChangeUSec", n);
+        if (r < 0)
+                return log_error_errno(r, "Failed to update lastPasswordChangeUSec: %m");
+
+        return 1;
+}
+
+static int apply_identity_changes(JsonVariant **_v) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        int r;
+
+        assert(_v);
+
+        v = json_variant_ref(*_v);
+
+        r = json_variant_filter(&v, arg_identity_filter);
+        if (r < 0)
+                return log_error_errno(r, "Failed to filter identity: %m");
+
+        r = json_variant_merge_object(&v, arg_identity_extra);
+        if (r < 0)
+                return log_error_errno(r, "Failed to merge identities: %m");
+
+        if (arg_identity_extra_this_machine || !strv_isempty(arg_identity_filter)) {
+                _cleanup_(json_variant_unrefp) JsonVariant *per_machine = NULL, *mmid = NULL;
+                sd_id128_t mid;
+
+                r = sd_id128_get_machine(&mid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire machine ID: %m");
+
+                r = json_variant_new_string(&mmid, SD_ID128_TO_STRING(mid));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate matchMachineId object: %m");
+
+                per_machine = json_variant_ref(json_variant_by_key(v, "perMachine"));
+                if (per_machine) {
+                        _cleanup_(json_variant_unrefp) JsonVariant *npm = NULL, *add = NULL;
+                        _cleanup_free_ JsonVariant **array = NULL;
+                        JsonVariant *z;
+                        size_t i = 0;
+
+                        if (!json_variant_is_array(per_machine))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "perMachine field is not an array, refusing.");
+
+                        array = new(JsonVariant*, json_variant_elements(per_machine) + 1);
+                        if (!array)
+                                return log_oom();
+
+                        JSON_VARIANT_ARRAY_FOREACH(z, per_machine) {
+                                JsonVariant *u;
+
+                                if (!json_variant_is_object(z))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "perMachine entry is not an object, refusing.");
+
+                                array[i++] = z;
+
+                                u = json_variant_by_key(z, "matchMachineId");
+                                if (!u)
+                                        continue;
+
+                                if (!json_variant_equal(u, mmid))
+                                        continue;
+
+                                r = json_variant_merge_object(&add, z);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to merge perMachine entry: %m");
+
+                                i--;
+                        }
+
+                        r = json_variant_filter(&add, arg_identity_filter);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to filter perMachine: %m");
+
+                        r = json_variant_merge_object(&add, arg_identity_extra_this_machine);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to merge in perMachine fields: %m");
+
+                        if (arg_identity_filter_rlimits || arg_identity_extra_rlimits) {
+                                _cleanup_(json_variant_unrefp) JsonVariant *rlv = NULL;
+
+                                rlv = json_variant_ref(json_variant_by_key(add, "resourceLimits"));
+
+                                r = json_variant_filter(&rlv, arg_identity_filter_rlimits);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to filter resource limits: %m");
+
+                                r = json_variant_merge_object(&rlv, arg_identity_extra_rlimits);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to set resource limits: %m");
+
+                                if (json_variant_is_blank_object(rlv)) {
+                                        r = json_variant_filter(&add, STRV_MAKE("resourceLimits"));
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to drop resource limits field from identity: %m");
+                                } else {
+                                        r = json_variant_set_field(&add, "resourceLimits", rlv);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to update resource limits of identity: %m");
+                                }
+                        }
+
+                        if (!json_variant_is_blank_object(add)) {
+                                r = json_variant_set_field(&add, "matchMachineId", mmid);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to set matchMachineId field: %m");
+
+                                array[i++] = add;
+                        }
+
+                        r = json_variant_new_array(&npm, array, i);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to allocate new perMachine array: %m");
+
+                        json_variant_unref(per_machine);
+                        per_machine = TAKE_PTR(npm);
+                } else {
+                        _cleanup_(json_variant_unrefp) JsonVariant *item = json_variant_ref(arg_identity_extra_this_machine);
+
+                        if (arg_identity_extra_rlimits) {
+                                r = json_variant_set_field(&item, "resourceLimits", arg_identity_extra_rlimits);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to update resource limits of identity: %m");
+                        }
+
+                        r = json_variant_set_field(&item, "matchMachineId", mmid);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set matchMachineId field: %m");
+
+                        r = json_variant_append_array(&per_machine, item);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to append to perMachine array: %m");
+                }
+
+                r = json_variant_set_field(&v, "perMachine", per_machine);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to update per machine record: %m");
+        }
+
+        if (arg_identity_extra_privileged || arg_identity_filter) {
+                _cleanup_(json_variant_unrefp) JsonVariant *privileged = NULL;
+
+                privileged = json_variant_ref(json_variant_by_key(v, "privileged"));
+
+                r = json_variant_filter(&privileged, arg_identity_filter);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to filter identity (privileged part): %m");
+
+                r = json_variant_merge_object(&privileged, arg_identity_extra_privileged);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to merge identities (privileged part): %m");
+
+                if (json_variant_is_blank_object(privileged)) {
+                        r = json_variant_filter(&v, STRV_MAKE("privileged"));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to drop privileged part from identity: %m");
+                } else {
+                        r = json_variant_set_field(&v, "privileged", privileged);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to update privileged part of identity: %m");
+                }
+        }
+
+        if (arg_identity_filter_rlimits) {
+                _cleanup_(json_variant_unrefp) JsonVariant *rlv = NULL;
+
+                rlv = json_variant_ref(json_variant_by_key(v, "resourceLimits"));
+
+                r = json_variant_filter(&rlv, arg_identity_filter_rlimits);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to filter resource limits: %m");
+
+                /* Note that we only filter resource limits here, but don't apply them. We do that in the perMachine section */
+
+                if (json_variant_is_blank_object(rlv)) {
+                        r = json_variant_filter(&v, STRV_MAKE("resourceLimits"));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to drop resource limits field from identity: %m");
+                } else {
+                        r = json_variant_set_field(&v, "resourceLimits", rlv);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to update resource limits of identity: %m");
+                }
+        }
+
+        json_variant_unref(*_v);
+        *_v = TAKE_PTR(v);
+
+        return 0;
+}
+
+static int add_disposition(JsonVariant **v) {
+        int k;
+
+        /* Defaults "disposition" to "regular": returns 1 if the field was added, 0 if one was already there. */
+        assert(v);
+
+        if (!json_variant_by_key(*v, "disposition")) {
+                k = json_variant_set_field_string(v, "disposition", "regular");
+                if (k < 0)
+                        return log_error_errno(k, "Failed to set disposition field: %m");
+                return 1;
+        }
+
+        return 0;
+}
+
+static int acquire_new_home_record(UserRecord **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        int r;
+        /* Assembles the record for a new home from arg_identity (if set) plus command line changes; fills *ret. */
+        assert(ret);
+
+        if (arg_identity) {
+                unsigned line = 0, column = 0; /* logged below, so never leave them indeterminate */
+                /* "-" selects stdin; "<stdin>" is then just the name handed to the parser (presumably a source label) */
+                r = json_parse_file(
+                                streq(arg_identity, "-") ? stdin : NULL,
+                                streq(arg_identity, "-") ? "<stdin>" : arg_identity, JSON_PARSE_SENSITIVE, &v, &line, &column);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse identity at %u:%u: %m", line, column);
+        }
+
+        r = apply_identity_changes(&v);
+        if (r < 0)
+                return r;
+
+        r = add_disposition(&v);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(i, arg_pkcs11_token_uri) {
+                r = identity_add_pkcs11_key_data(&v, *i);
+                if (r < 0)
+                        return r;
+        }
+
+        STRV_FOREACH(i, arg_fido2_device) {
+                r = identity_add_fido2_parameters(&v, *i, arg_fido2_lock_with, arg_fido2_cred_alg);
+                if (r < 0)
+                        return r;
+        }
+
+        if (arg_recovery_key) {
+                r = identity_add_recovery_key(&v);
+                if (r < 0)
+                        return r;
+        }
+
+        r = update_last_change(&v, true, false);
+        if (r < 0)
+                return r;
+
+        if (DEBUG_LOGGING)
+                json_variant_dump(v, JSON_FORMAT_PRETTY, NULL, NULL);
+
+        hr = user_record_new();
+        if (!hr)
+                return log_oom();
+
+        r = user_record_load(hr, v, USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_SECRET|USER_RECORD_ALLOW_PRIVILEGED|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_ALLOW_SIGNATURE|USER_RECORD_LOG|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(hr);
+        return 0;
+}
+
+static int acquire_new_password(
+                const char *user_name,
+                UserRecord *hr,
+                bool suggest,
+                char **ret) {
+
+        _cleanup_(erase_and_freep) char *envpw = NULL;
+        unsigned i = 5; /* pre-decremented at the top of the loop, i.e. at most four rounds of prompting */
+        int r;
+        /* Stores a new password in hr, from $NEWPASSWORD or by prompting twice; a non-NULL ret receives a copy. */
+        assert(user_name);
+        assert(hr);
+
+        r = getenv_steal_erase("NEWPASSWORD", &envpw);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire password from environment: %m");
+        if (r > 0) {
+                /* As above, this is not for use, just for testing */
+
+                r = user_record_set_password(hr, STRV_MAKE(envpw), /* prepend = */ true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to store password: %m");
+
+                if (ret)
+                        *ret = TAKE_PTR(envpw);
+
+                return 0;
+        }
+
+        if (suggest)
+                (void) suggest_passwords();
+
+        for (;;) {
+                _cleanup_strv_free_erase_ char **first = NULL, **second = NULL;
+                _cleanup_free_ char *question = NULL;
+
+                if (--i == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOKEY), "Too many attempts, giving up.");
+
+                if (asprintf(&question, "Please enter new password for user %s:", user_name) < 0)
+                        return log_oom();
+
+                r = ask_password_auto(
+                                question,
+                                /* icon= */ "user-home",
+                                NULL,
+                                /* key_name= */ "home-password",
+                                /* credential_name= */ "home.new-password",
+                                USEC_INFINITY,
+                                0, /* no caching, we want to collect a new password here after all */
+                                &first);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire password: %m");
+
+                question = mfree(question);
+                if (asprintf(&question, "Please enter new password for user %s (repeat):", user_name) < 0)
+                        return log_oom();
+
+                r = ask_password_auto(
+                                question,
+                                /* icon= */ "user-home",
+                                NULL,
+                                /* key_name= */ "home-password",
+                                /* credential_name= */ "home.new-password",
+                                USEC_INFINITY,
+                                0, /* no caching */
+                                &second);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire password: %m");
+
+                if (strv_equal(first, second)) {
+                        _cleanup_(erase_and_freep) char *copy = NULL;
+
+                        if (ret) {
+                                copy = strdup(first[0]);
+                                if (!copy)
+                                        return log_oom();
+                        }
+
+                        r = user_record_set_password(hr, first, /* prepend = */ true);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to store password: %m");
+
+                        if (ret)
+                                *ret = TAKE_PTR(copy);
+
+                        return 0;
+                }
+
+                log_error("Password didn't match, try again.");
+        }
+}
+
+static int create_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        int r;
+        /* Builds a new user record, makes sure it carries a password, then calls CreateHome until that succeeds. */
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+
+        if (argc >= 2) {
+                /* If a username was specified, use it */
+
+                if (valid_user_group_name(argv[1], 0))
+                        r = json_variant_set_field_string(&arg_identity_extra, "userName", argv[1]);
+                else {
+                        _cleanup_free_ char *un = NULL, *rr = NULL;
+
+                        /* Before we consider the user name invalid, let's check if we can split it? */
+                        r = split_user_name_realm(argv[1], &un, &rr);
+                        if (r < 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User name '%s' is not valid: %m", argv[1]);
+
+                        if (rr) {
+                                r = json_variant_set_field_string(&arg_identity_extra, "realm", rr);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to set realm field: %m");
+                        }
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "userName", un);
+                }
+                if (r < 0) /* covers the "userName" assignment of either branch above */
+                        return log_error_errno(r, "Failed to set userName field: %m");
+        } else {
+                /* If neither a username nor an identity have been specified we cannot operate. */
+                if (!arg_identity)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User name required.");
+        }
+
+        r = acquire_new_home_record(&hr);
+        if (r < 0)
+                return r;
+
+        /* If the JSON record carries no plain text password (besides the recovery key), then let's query it
+         * manually. */
+        if (strv_length(hr->password) <= arg_recovery_key) {
+
+                if (strv_isempty(hr->hashed_password)) {
+                        _cleanup_(erase_and_freep) char *new_password = NULL;
+
+                        /* No regular (i.e. non-PKCS#11) hashed passwords set in the record, let's fix that. */
+                        r = acquire_new_password(hr->user_name, hr, /* suggest = */ true, &new_password);
+                        if (r < 0)
+                                return r;
+
+                        r = user_record_make_hashed_password(hr, STRV_MAKE(new_password), /* extend = */ false);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to hash password: %m");
+                } else {
+                        /* There's a hash password set in the record, acquire the unhashed version of it. */
+                        r = acquire_existing_password(
+                                        hr->user_name,
+                                        hr,
+                                        /* emphasize_current= */ false,
+                                        ASK_PASSWORD_ACCEPT_CACHED | ASK_PASSWORD_PUSH_CACHE);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (hr->enforce_password_policy == 0) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                /* If password quality enforcement is disabled, let's at least warn client side */
+
+                r = user_record_check_password_quality(hr, hr, &error);
+                if (r < 0)
+                        log_warning_errno(r, "Specified password does not pass quality checks (%s), proceeding anyway.", bus_error_message(&error, r));
+        }
+        /* Retry loop: left via break once the call succeeds, or via return when an error is not recovered from. */
+        for (;;) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+                _cleanup_(erase_and_freep) char *formatted = NULL;
+                /* NOTE(review): serialized anew each round, presumably since the handlers below may change hr — confirm */
+                r = json_variant_format(hr->json, 0, &formatted);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to format user record: %m");
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "CreateHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                (void) sd_bus_message_sensitive(m);
+
+                r = sd_bus_message_append(m, "s", formatted);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r < 0) {
+                        if (sd_bus_error_has_name(&error, BUS_ERROR_LOW_PASSWORD_QUALITY)) {
+                                _cleanup_(erase_and_freep) char *new_password = NULL;
+
+                                log_error_errno(r, "%s", bus_error_message(&error, r));
+                                log_info("(Use --enforce-password-policy=no to turn off password quality checks for this account.)");
+
+                                r = acquire_new_password(hr->user_name, hr, /* suggest = */ false, &new_password);
+                                if (r < 0)
+                                        return r;
+
+                                r = user_record_make_hashed_password(hr, STRV_MAKE(new_password), /* extend = */ false);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to hash passwords: %m");
+                        } else {
+                                r = handle_generic_user_record_error(hr->user_name, hr, &error, r, false);
+                                if (r < 0)
+                                        return r;
+                        }
+                } else
+                        break; /* done */
+        }
+
+        return 0;
+}
+
+static int remove_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int first_error = 0, q;
+        /* One RemoveHome call per argument; a failed call is logged and the first such error is returned. */
+        q = acquire_bus(&bus);
+        if (q < 0)
+                return q;
+
+        (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+
+        STRV_FOREACH(name, strv_skip(argv, 1)) {
+                _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL;
+
+                q = bus_message_new_method_call(bus, &req, bus_mgr, "RemoveHome");
+                if (q < 0)
+                        return bus_log_create_error(q);
+
+                q = sd_bus_message_append(req, "s", *name);
+                if (q < 0)
+                        return bus_log_create_error(q);
+
+                q = sd_bus_call(bus, req, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &bus_error, NULL);
+                if (q >= 0)
+                        continue;
+                log_error_errno(q, "Failed to remove home: %s", bus_error_message(&bus_error, q));
+                if (first_error == 0)
+                        first_error = q;
+        }
+
+        return first_error;
+}
+
+static int acquire_updated_home_record(
+                sd_bus *bus,
+                const char *username,
+                UserRecord **ret) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *json = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        int r;
+        /* Takes the record from arg_identity or, failing that, from the manager; applies changes; fills *ret. */
+        assert(ret);
+
+        if (arg_identity) {
+                unsigned line = 0, column = 0; /* logged below, so never leave them indeterminate */
+                JsonVariant *un;
+                /* "-" selects stdin; "<stdin>" is then just the name handed to the parser (presumably a source label) */
+                r = json_parse_file(
+                                streq(arg_identity, "-") ? stdin : NULL,
+                                streq(arg_identity, "-") ? "<stdin>" : arg_identity, JSON_PARSE_SENSITIVE, &json, &line, &column);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse identity at %u:%u: %m", line, column);
+
+                un = json_variant_by_key(json, "userName");
+                if (un) {
+                        if (!json_variant_is_string(un) || (username && !streq(json_variant_string(un), username)))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User name specified on command line and in JSON record do not match.");
+                } else {
+                        if (!username)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No username specified.");
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "userName", username);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set userName field: %m");
+                }
+
+        } else {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+                int incomplete;
+                const char *text;
+                /* No record was passed in: fetch the current one by name and edit that instead */
+                if (!identity_properties_specified())
+                        return log_error_errno(SYNTHETIC_ERRNO(EALREADY), "No field to change specified.");
+
+                r = bus_call_method(bus, bus_mgr, "GetUserRecordByName", &error, &reply, "s", username);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire user home record: %s", bus_error_message(&error, r));
+
+                r = sd_bus_message_read(reply, "sbo", &text, &incomplete, NULL);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                if (incomplete)
+                        return log_error_errno(SYNTHETIC_ERRNO(EACCES), "Lacking rights to acquire user record including privileged metadata, can't update record.");
+
+                r = json_parse(text, JSON_PARSE_SENSITIVE, &json, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse JSON identity: %m");
+                /* text was read out of reply, hence the reply is only dropped once parsing is done */
+                reply = sd_bus_message_unref(reply);
+
+                r = json_variant_filter(&json, STRV_MAKE("binding", "status", "signature"));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to strip binding and status from record to update: %m");
+        }
+
+        r = apply_identity_changes(&json);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(i, arg_pkcs11_token_uri) {
+                r = identity_add_pkcs11_key_data(&json, *i);
+                if (r < 0)
+                        return r;
+        }
+
+        STRV_FOREACH(i, arg_fido2_device) {
+                r = identity_add_fido2_parameters(&json, *i, arg_fido2_lock_with, arg_fido2_cred_alg);
+                if (r < 0)
+                        return r;
+        }
+
+        /* If the user supplied a full record, then add in lastChange, but do not override. Otherwise always
+         * override. */
+        r = update_last_change(&json, arg_pkcs11_token_uri || arg_fido2_device, !arg_identity);
+        if (r < 0)
+                return r;
+
+        if (DEBUG_LOGGING)
+                json_variant_dump(json, JSON_FORMAT_PRETTY, NULL, NULL);
+
+        hr = user_record_new();
+        if (!hr)
+                return log_oom();
+
+        r = user_record_load(hr, json, USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_PRIVILEGED|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_ALLOW_SECRET|USER_RECORD_ALLOW_SIGNATURE|USER_RECORD_LOG|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(hr);
+        return 0;
+}
+
+static int home_record_reset_human_interaction_permission(UserRecord *hr) {
+        int rc;
+
+        assert(hr);
+
+        /* Callers run several operations back to back on the same record. Putting the three interaction
+         * permission flags back to -1 in between means that a needed interaction is reported to us once
+         * more, so that the user can be shown a proper message about it every time. */
+
+        rc = user_record_set_pkcs11_protected_authentication_path_permitted(hr, -1);
+        if (rc < 0)
+                return log_error_errno(rc, "Failed to reset PKCS#11 protected authentication path permission flag: %m");
+
+        rc = user_record_set_fido2_user_presence_permitted(hr, -1);
+        if (rc < 0)
+                return log_error_errno(rc, "Failed to reset FIDO2 user presence permission flag: %m");
+
+        rc = user_record_set_fido2_user_verification_permitted(hr, -1);
+        if (rc < 0)
+                return log_error_errno(rc, "Failed to reset FIDO2 user verification permission flag: %m");
+
+        return 0;
+}
+
+static int update_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL, *secret = NULL;
+        _cleanup_free_ char *buffer = NULL;
+        const char *username;
+        int r;
+        /* Sends the changed record via UpdateHome, then optionally calls ResizeHome and ChangePasswordHome. */
+        if (argc >= 2)
+                username = argv[1];
+        else if (!arg_identity) {
+                buffer = getusername_malloc();
+                if (!buffer)
+                        return log_oom();
+
+                username = buffer;
+        } else
+                username = NULL;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+
+        r = acquire_updated_home_record(bus, username, &hr);
+        if (r < 0)
+                return r;
+
+        /* Add in all secrets we can acquire cheaply */
+        r = acquire_passed_secrets(username, &secret);
+        if (r < 0)
+                return r;
+
+        r = user_record_merge_secret(hr, secret);
+        if (r < 0)
+                return r;
+
+        /* If we do multiple operations, let's output things more verbosely, since otherwise the repeated
+         * authentication might be confusing. */
+
+        if (arg_and_resize || arg_and_change_password)
+                log_info("Updating home directory.");
+
+        for (;;) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+                _cleanup_(erase_and_freep) char *formatted = NULL;
+                /* hr had secrets merged in above, hence the serialized copy is erased on release, as in create_home() */
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "UpdateHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = json_variant_format(hr->json, 0, &formatted);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to format user record: %m");
+
+                (void) sd_bus_message_sensitive(m);
+
+                r = sd_bus_message_append(m, "s", formatted);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r < 0) {
+                        if (arg_and_change_password &&
+                            sd_bus_error_has_name(&error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN))
+                                /* In the generic handler we'd ask for a password in this case, but when
+                                 * changing passwords that's not sufficient, as we need to acquire all keys
+                                 * first. */
+                                return log_error_errno(r, "Security token not inserted, refusing.");
+
+                        r = handle_generic_user_record_error(hr->user_name, hr, &error, r, false);
+                        if (r < 0)
+                                return r;
+                } else
+                        break;
+        }
+
+        if (arg_and_resize)
+                log_info("Resizing home.");
+
+        (void) home_record_reset_human_interaction_permission(hr);
+
+        /* Also sync down disk size to underlying LUKS/fscrypt/quota */
+        while (arg_and_resize) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "ResizeHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* Specify UINT64_MAX as size, in which case the underlying disk size will just be synced */
+                r = sd_bus_message_append(m, "st", hr->user_name, UINT64_MAX);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = bus_message_append_secret(m, hr);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r < 0) {
+                        if (arg_and_change_password &&
+                            sd_bus_error_has_name(&error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN))
+                                return log_error_errno(r, "Security token not inserted, refusing.");
+
+                        r = handle_generic_user_record_error(hr->user_name, hr, &error, r, false);
+                        if (r < 0)
+                                return r;
+                } else
+                        break;
+        }
+
+        if (arg_and_change_password)
+                log_info("Synchronizing passwords and encryption keys.");
+
+        (void) home_record_reset_human_interaction_permission(hr);
+
+        /* Also sync down passwords to underlying LUKS/fscrypt */
+        while (arg_and_change_password) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "ChangePasswordHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* Specify an empty new secret, in which case the underlying LUKS/fscrypt password will just be synced */
+                r = sd_bus_message_append(m, "ss", hr->user_name, "{}");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = bus_message_append_secret(m, hr);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r < 0) {
+                        if (sd_bus_error_has_name(&error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN))
+                                return log_error_errno(r, "Security token not inserted, refusing.");
+
+                        r = handle_generic_user_record_error(hr->user_name, hr, &error, r, false);
+                        if (r < 0)
+                                return r;
+                } else
+                        break;
+        }
+
+        return 0;
+}
+
+/* Changes the password of a home area: collects the currently valid secrets plus a new password and passes
+ * both to the ChangePasswordHome() bus method. The user is argv[1] if given, otherwise the name returned by
+ * getusername_malloc(). The call is repeated until it succeeds or a failure cannot be handled. */
+static int passwd_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(user_record_unrefp) UserRecord *old_secret = NULL, *new_secret = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_free_ char *buffer = NULL;
+        const char *username;
+        int r;
+
+        /* This verb changes the password only; anything else has to go through 'update'. */
+        if (arg_pkcs11_token_uri)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "To change the PKCS#11 security token use 'homectl update --pkcs11-token-uri=%s'.",
+                                       special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+        if (arg_fido2_device)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "To change the FIDO2 security token use 'homectl update --fido2-device=%s'.",
+                                       special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+        if (identity_properties_specified())
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "The 'passwd' verb does not permit changing other record properties at the same time.");
+
+        if (argc < 2) {
+                /* No user name on the command line, look one up ourselves */
+                buffer = getusername_malloc();
+                if (!buffer)
+                        return log_oom();
+
+                username = buffer;
+        } else
+                username = argv[1];
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+
+        r = acquire_passed_secrets(username, &old_secret);
+        if (r < 0)
+                return r;
+
+        new_secret = user_record_new();
+        if (!new_secret)
+                return log_oom();
+
+        r = acquire_new_password(username, new_secret, /* suggest = */ true, NULL);
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "ChangePasswordHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "s", username);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* The new secret goes first, the old one second */
+                r = bus_message_append_secret(m, new_secret);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = bus_message_append_secret(m, old_secret);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r >= 0)
+                        return 0;
+
+                /* In the generic handler we'd ask for a password in this case, but when changing passwords
+                 * that's not sufficient, as we need to acquire all keys first. */
+                if (sd_bus_error_has_name(&error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN))
+                        return log_error_errno(r, "Security token not inserted, refusing.");
+
+                if (sd_bus_error_has_name(&error, BUS_ERROR_LOW_PASSWORD_QUALITY)) {
+                        /* Show why the password was refused, then ask for another one (without suggesting
+                         * one this time) */
+                        log_error_errno(r, "%s", bus_error_message(&error, r));
+
+                        r = acquire_new_password(username, new_secret, /* suggest = */ false, NULL);
+                } else
+                        r = handle_generic_user_record_error(username, old_secret, &error, r, true);
+                if (r < 0)
+                        return r;
+        }
+}
+
+/* Parses a disk size parameter. The literals "min" and "max" map to 0 and UINT64_MAX-1 respectively, anything
+ * else is passed to parse_size() with base 1024. UINT64_MAX itself is refused, since it is reserved as the
+ * "don't change" marker. On success stores the size in *ret and returns 0, on failure returns the result of
+ * log_error_errno(). */
+static int parse_disk_size(const char *t, uint64_t *ret) {
+        uint64_t parsed;
+        int r;
+
+        assert(t);
+        assert(ret);
+
+        if (streq(t, "min")) {
+                *ret = 0;
+                return 0;
+        }
+
+        if (streq(t, "max")) {
+                /* Largest size that isn't the UINT64_MAX special marker */
+                *ret = UINT64_MAX-1;
+                return 0;
+        }
+
+        r = parse_size(t, 1024, &parsed);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse disk size parameter: %s", t);
+
+        if (parsed == UINT64_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Disk size out of range: %s", t);
+
+        *ret = parsed;
+        return 0;
+}
+
+/* Resizes the home area of the user named in argv[1] via the ResizeHome() bus method. The new size is taken
+ * from argv[2] and/or arg_disk_size (which have to agree if both are set); if neither is given UINT64_MAX is
+ * sent. Relative sizes are refused. The call is repeated for as long as handle_generic_user_record_error()
+ * manages to deal with the failure. */
+static int resize_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+        uint64_t ds = UINT64_MAX;
+        bool relative;
+        int r;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+
+        /* A relative size may come in via the option or via the positional argument */
+        relative = arg_disk_size_relative != UINT64_MAX ||
+                (argc > 2 && parse_permyriad(argv[2]) >= 0);
+        if (relative)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Relative disk size specification currently not supported when resizing.");
+
+        if (argc > 2) {
+                r = parse_disk_size(argv[2], &ds);
+                if (r < 0)
+                        return r;
+        }
+
+        if (arg_disk_size != UINT64_MAX) {
+                /* Size given as option too: only fine if the positional one is unset or identical */
+                if (!(ds == UINT64_MAX || ds == arg_disk_size))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Disk size specified twice and doesn't match, refusing.");
+
+                ds = arg_disk_size;
+        }
+
+        r = acquire_passed_secrets(argv[1], &secret);
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "ResizeHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "st", argv[1], ds);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = bus_message_append_secret(m, secret);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r >= 0)
+                        return 0;
+
+                /* Try again if the handler could deal with the failure */
+                r = handle_generic_user_record_error(argv[1], secret, &error, r, false);
+                if (r < 0)
+                        return r;
+        }
+}
+
+/* Calls the LockHome() bus method for every user named on the command line. A failed call is logged and does
+ * not stop processing of the remaining users; the first such error is returned at the end. Failure to set
+ * up the bus connection or a message aborts immediately. */
+static int lock_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r, ret = 0;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(name, strv_skip(argv, 1)) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "LockHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "s", *name);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r >= 0)
+                        continue;
+
+                log_error_errno(r, "Failed to lock home: %s", bus_error_message(&error, r));
+
+                /* Remember the first failure only */
+                if (ret == 0)
+                        ret = r;
+        }
+
+        return ret;
+}
+
+/* Calls the UnlockHome() bus method for every user named on the command line. Secrets are acquired separately
+ * for each user, and the call is repeated for as long as handle_generic_user_record_error() manages to deal
+ * with the failure. If it cannot, processing continues with the next user and the first such error is
+ * returned at the end. Failure to acquire the secrets or to build a message aborts immediately. */
+static int unlock_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r, ret = 0;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(i, strv_skip(argv, 1)) {
+                _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+
+                r = acquire_passed_secrets(*i, &secret);
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                        r = bus_message_new_method_call(bus, &m, bus_mgr, "UnlockHome");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "s", *i);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = bus_message_append_secret(m, secret);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                        if (r < 0) {
+                                /* Pass the user we are currently processing rather than argv[1], so that the
+                                 * error handling refers to the right account when several users are listed. */
+                                r = handle_generic_user_record_error(*i, secret, &error, r, false);
+                                if (r < 0) {
+                                        /* Remember the first failure only, then move on to the next user */
+                                        if (ret == 0)
+                                                ret = r;
+
+                                        break;
+                                }
+                        } else
+                                break;
+                }
+        }
+
+        return ret;
+}
+
+/* Runs a command with the home area of the user named in argv[1] acquired: calls AcquireHome(), forks off a
+ * child that changes into the home directory reported by GetHomeByName() and executes argv[2…] (or the shell
+ * returned by get_shell() if no command is given), waits for it, and finally calls ReleaseHome(). Returns
+ * the result of wait_for_terminate_and_check() for the child, or a negative error if one of the other steps
+ * failed. */
+static int with_home(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+        _cleanup_close_ int acquired_fd = -EBADF;
+        _cleanup_strv_free_ char **cmdline  = NULL;
+        const char *home;
+        int r, ret;
+        pid_t pid;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        if (argc < 3) {
+                _cleanup_free_ char *shell = NULL;
+
+                /* If no command is specified, spawn a shell */
+                r = get_shell(&shell);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire shell: %m");
+
+                cmdline = strv_new(shell);
+        } else
+                cmdline = strv_copy(argv + 2);
+        /* Covers allocation failure of either branch above */
+        if (!cmdline)
+                return log_oom();
+
+        r = acquire_passed_secrets(argv[1], &secret);
+        if (r < 0)
+                return r;
+
+        /* Repeat AcquireHome() for as long as handle_generic_user_record_error() manages to deal with the
+         * failure. Note that m, error and reply live at function scope here, hence they are reset by hand
+         * below. */
+        for (;;) {
+                r = bus_message_new_method_call(bus, &m, bus_mgr, "AcquireHome");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "s", argv[1]);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = bus_message_append_secret(m, secret);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "b", /* please_suspend = */ getenv_bool("SYSTEMD_PLEASE_SUSPEND_HOME") > 0);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, &reply);
+                /* Drop the request right away, so that m is NULL again for the next iteration and for the
+                 * ReleaseHome() call further down */
+                m = sd_bus_message_unref(m);
+                if (r < 0) {
+                        r = handle_generic_user_record_error(argv[1], secret, &error, r, false);
+                        if (r < 0)
+                                return r;
+
+                        /* Reset the error before it is passed to the next call */
+                        sd_bus_error_free(&error);
+                } else {
+                        int fd;
+
+                        r = sd_bus_message_read(reply, "h", &fd);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+
+                        /* Keep our own duplicate of the fd, since the reply is released right below.
+                         * NOTE(review): presumably the fd read above is owned by the reply message — confirm
+                         * against the sd-bus documentation. */
+                        acquired_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+                        if (acquired_fd < 0)
+                                return log_error_errno(errno, "Failed to duplicate acquired fd: %m");
+
+                        reply = sd_bus_message_unref(reply);
+                        break;
+                }
+        }
+
+        r = bus_call_method(bus, bus_mgr, "GetHomeByName", &error, &reply, "s", argv[1]);
+        if (r < 0)
+                return log_error_errno(r, "Failed to inspect home: %s", bus_error_message(&error, r));
+
+        /* Only the fifth field of the reply is of interest to us, it is used as the working directory of
+         * the child below. The string is not copied, and reply is not released before we return. */
+        r = sd_bus_message_read(reply, "usussso", NULL, NULL, NULL, NULL, &home, NULL, NULL);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        r = safe_fork("(with)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE|FORK_REOPEN_LOG, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child: change into the home directory and execute the command, exit with 255 if either
+                 * fails */
+                if (chdir(home) < 0) {
+                        log_error_errno(errno, "Failed to change to directory %s: %m", home);
+                        _exit(255);
+                }
+
+                execvp(cmdline[0], cmdline);
+                log_error_errno(errno, "Failed to execute %s: %m", cmdline[0]);
+                _exit(255);
+        }
+
+        /* Parent: the result of the child is what we return in the end, unless releasing fails below */
+        ret = wait_for_terminate_and_check(cmdline[0], pid, WAIT_LOG_ABNORMAL);
+
+        /* Close the fd that pings the home now. */
+        acquired_fd = safe_close(acquired_fd);
+
+        r = bus_message_new_method_call(bus, &m, bus_mgr, "ReleaseHome");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append(m, "s", argv[1]);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+        if (r < 0) {
+                /* A home that is still busy is not treated as a failure, we just mention it */
+                if (sd_bus_error_has_name(&error, BUS_ERROR_HOME_BUSY))
+                        log_notice("Not deactivating home directory of %s, as it is still used.", argv[1]);
+                else
+                        return log_error_errno(r, "Failed to release user home: %s", bus_error_message(&error, r));
+        }
+
+        return ret;
+}
+
+/* Issues the argument-less LockAllHomes() bus method call. Returns 0 on success, and the result of the
+ * respective logging helper on failure. */
+static int lock_all_homes(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *call = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        r = bus_message_new_method_call(bus, &call, bus_mgr, "LockAllHomes");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, call, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &bus_error, NULL);
+        if (r >= 0)
+                return 0;
+
+        return log_error_errno(r, "Failed to lock all homes: %s", bus_error_message(&bus_error, r));
+}
+
+/* Issues the argument-less DeactivateAllHomes() bus method call. Returns 0 on success, and the result of the
+ * respective logging helper on failure. */
+static int deactivate_all_homes(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *call = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        r = bus_message_new_method_call(bus, &call, bus_mgr, "DeactivateAllHomes");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, call, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &bus_error, NULL);
+        if (r >= 0)
+                return 0;
+
+        return log_error_errno(r, "Failed to deactivate all homes: %s", bus_error_message(&bus_error, r));
+}
+
+static int rebalance(int argc, char *argv[], void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        r = acquire_bus(&bus);
+        if (r < 0)
+                return r;
+
+        r = bus_message_new_method_call(bus, &m, bus_mgr, "Rebalance");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+        if (r < 0) {
+                if (sd_bus_error_has_name(&error, BUS_ERROR_REBALANCE_NOT_NEEDED))
+                        log_info("No homes needed rebalancing.");
+                else
+                        return log_error_errno(r, "Failed to rebalance: %s", bus_error_message(&error, r));
+        } else
+                log_info("Completed rebalancing.");
+
+        return 0;
+}
+
+/* Marks a field of the identity record for removal: the name is appended to arg_identity_filter, and the
+ * field is filtered out of the three arg_identity_extra* JSON objects. Returns 0 on success, and the result
+ * of the respective logging helper on failure. */
+static int drop_from_identity(const char *field) {
+        char **drop;
+        int r;
+
+        assert(field);
+
+        /* When an identity record is updated and some field shall be dropped, we need to remember what to
+         * remove from the old record */
+        r = strv_extend(&arg_identity_filter, field);
+        if (r < 0)
+                return log_oom();
+
+        /* The field might also have been assigned a new value earlier on the same command line, get rid of
+         * that too. The same single-entry list is used for all three objects. */
+        drop = STRV_MAKE(field);
+
+        r = json_variant_filter(&arg_identity_extra, drop);
+        if (r < 0)
+                return log_error_errno(r, "Failed to filter JSON identity data: %m");
+
+        r = json_variant_filter(&arg_identity_extra_this_machine, drop);
+        if (r < 0)
+                return log_error_errno(r, "Failed to filter JSON identity data: %m");
+
+        r = json_variant_filter(&arg_identity_extra_privileged, drop);
+        if (r < 0)
+                return log_error_errno(r, "Failed to filter JSON identity data: %m");
+
+        return 0;
+}
+
+/* Prints the usage text to standard output, after opening the pager according to arg_pager_flags. The text
+ * uses positional printf() arguments: %1$s is the program name, %2$s/%3$s bracket the summary line,
+ * %4$s/%5$s bracket each section heading, and %6$s is the man page reference generated by
+ * terminal_urlify_man(). Returns 0 on success, or the result of log_oom() if that reference could not be
+ * generated. */
+static int help(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        pager_open(arg_pager_flags);
+
+        r = terminal_urlify_man("homectl", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%1$s [OPTIONS...] COMMAND ...\n\n"
+               "%2$sCreate, manipulate or inspect home directories.%3$s\n"
+               "\n%4$sCommands:%5$s\n"
+               "  list                         List home areas\n"
+               "  activate USER…               Activate a home area\n"
+               "  deactivate USER…             Deactivate a home area\n"
+               "  inspect USER…                Inspect a home area\n"
+               "  authenticate USER…           Authenticate a home area\n"
+               "  create USER                  Create a home area\n"
+               "  remove USER…                 Remove a home area\n"
+               "  update USER                  Update a home area\n"
+               "  passwd USER                  Change password of a home area\n"
+               "  resize USER SIZE             Resize a home area\n"
+               "  lock USER…                   Temporarily lock an active home area\n"
+               "  unlock USER…                 Unlock a temporarily locked home area\n"
+               "  lock-all                     Lock all suitable home areas\n"
+               "  deactivate-all               Deactivate all active home areas\n"
+               "  rebalance                    Rebalance free space between home areas\n"
+               "  with USER [COMMAND…]         Run shell or command with access to a home area\n"
+               "\n%4$sOptions:%5$s\n"
+               "  -h --help                    Show this help\n"
+               "     --version                 Show package version\n"
+               "     --no-pager                Do not pipe output into a pager\n"
+               "     --no-legend               Do not show the headers and footers\n"
+               "     --no-ask-password         Do not ask for system passwords\n"
+               "  -H --host=[USER@]HOST        Operate on remote host\n"
+               "  -M --machine=CONTAINER       Operate on local container\n"
+               "     --identity=PATH           Read JSON identity from file\n"
+               "     --json=FORMAT             Output inspection data in JSON (takes one of\n"
+               "                               pretty, short, off)\n"
+               "  -j                           Equivalent to --json=pretty (on TTY) or\n"
+               "                               --json=short (otherwise)\n"
+               "     --export-format=          Strip JSON inspection data (full, stripped,\n"
+               "                               minimal)\n"
+               "  -E                           When specified once equals -j --export-format=\n"
+               "                               stripped, when specified twice equals\n"
+               "                               -j --export-format=minimal\n"
+               "\n%4$sGeneral User Record Properties:%5$s\n"
+               "  -c --real-name=REALNAME      Real name for user\n"
+               "     --realm=REALM             Realm to create user in\n"
+               "     --email-address=EMAIL     Email address for user\n"
+               "     --location=LOCATION       Set location of user on earth\n"
+               "     --icon-name=NAME          Icon name for user\n"
+               "  -d --home-dir=PATH           Home directory\n"
+               "  -u --uid=UID                 Numeric UID for user\n"
+               "  -G --member-of=GROUP         Add user to group\n"
+               "     --capability-bounding-set=CAPS\n"
+               "                               Bounding POSIX capability set\n"
+               "     --capability-ambient-set=CAPS\n"
+               "                               Ambient POSIX capability set\n"
+               "     --skel=PATH               Skeleton directory to use\n"
+               "     --shell=PATH              Shell for account\n"
+               "     --setenv=VARIABLE[=VALUE] Set an environment variable at log-in\n"
+               "     --timezone=TIMEZONE       Set a time-zone\n"
+               "     --language=LOCALE         Set preferred language\n"
+               "     --ssh-authorized-keys=KEYS\n"
+               "                               Specify SSH public keys\n"
+               "     --pkcs11-token-uri=URI    URI to PKCS#11 security token containing\n"
+               "                               private key and matching X.509 certificate\n"
+               "     --fido2-device=PATH       Path to FIDO2 hidraw device with hmac-secret\n"
+               "                               extension\n"
+               "     --fido2-with-client-pin=BOOL\n"
+               "                               Whether to require entering a PIN to unlock the\n"
+               "                               account\n"
+               "     --fido2-with-user-presence=BOOL\n"
+               "                               Whether to require user presence to unlock the\n"
+               "                               account\n"
+               "     --fido2-with-user-verification=BOOL\n"
+               "                               Whether to require user verification to unlock\n"
+               "                               the account\n"
+               "     --recovery-key=BOOL       Add a recovery key\n"
+               "\n%4$sAccount Management User Record Properties:%5$s\n"
+               "     --locked=BOOL             Set locked account state\n"
+               "     --not-before=TIMESTAMP    Do not allow logins before\n"
+               "     --not-after=TIMESTAMP     Do not allow logins after\n"
+               "     --rate-limit-interval=SECS\n"
+               "                               Login rate-limit interval in seconds\n"
+               "     --rate-limit-burst=NUMBER\n"
+               "                               Login rate-limit attempts per interval\n"
+               "\n%4$sPassword Policy User Record Properties:%5$s\n"
+               "     --password-hint=HINT      Set Password hint\n"
+               "     --enforce-password-policy=BOOL\n"
+               "                               Control whether to enforce system's password\n"
+               "                               policy for this user\n"
+               "  -P                           Same as --enforce-password-policy=no\n"
+               "     --password-change-now=BOOL\n"
+               "                               Require the password to be changed on next login\n"
+               "     --password-change-min=TIME\n"
+               "                               Require minimum time between password changes\n"
+               "     --password-change-max=TIME\n"
+               "                               Require maximum time between password changes\n"
+               "     --password-change-warn=TIME\n"
+               "                               How much time to warn before password expiry\n"
+               "     --password-change-inactive=TIME\n"
+               "                               How much time to block password after expiry\n"
+               "\n%4$sResource Management User Record Properties:%5$s\n"
+               "     --disk-size=BYTES         Size to assign the user on disk\n"
+               "     --access-mode=MODE        User home directory access mode\n"
+               "     --umask=MODE              Umask for user when logging in\n"
+               "     --nice=NICE               Nice level for user\n"
+               "     --rlimit=LIMIT=VALUE[:VALUE]\n"
+               "                               Set resource limits\n"
+               "     --tasks-max=MAX           Set maximum number of per-user tasks\n"
+               "     --memory-high=BYTES       Set high memory threshold in bytes\n"
+               "     --memory-max=BYTES        Set maximum memory limit\n"
+               "     --cpu-weight=WEIGHT       Set CPU weight\n"
+               "     --io-weight=WEIGHT        Set IO weight\n"
+               "\n%4$sStorage User Record Properties:%5$s\n"
+               "     --storage=STORAGE         Storage type to use (luks, fscrypt, directory,\n"
+               "                               subvolume, cifs)\n"
+               "     --image-path=PATH         Path to image file/directory\n"
+               "     --drop-caches=BOOL        Whether to automatically drop caches on logout\n"
+               "\n%4$sLUKS Storage User Record Properties:%5$s\n"
+               "     --fs-type=TYPE            File system type to use in case of luks\n"
+               "                               storage (btrfs, ext4, xfs)\n"
+               "     --luks-discard=BOOL       Whether to use 'discard' feature of file system\n"
+               "                               when activated (mounted)\n"
+               "     --luks-offline-discard=BOOL\n"
+               "                               Whether to trim file on logout\n"
+               "     --luks-cipher=CIPHER      Cipher to use for LUKS encryption\n"
+               "     --luks-cipher-mode=MODE   Cipher mode to use for LUKS encryption\n"
+               "     --luks-volume-key-size=BITS\n"
+               "                               Volume key size to use for LUKS encryption\n"
+               "     --luks-pbkdf-type=TYPE    Password-based Key Derivation Function to use\n"
+               "     --luks-pbkdf-hash-algorithm=ALGORITHM\n"
+               "                               PBKDF hash algorithm to use\n"
+               "     --luks-pbkdf-time-cost=SECS\n"
+               "                               Time cost for PBKDF in seconds\n"
+               "     --luks-pbkdf-memory-cost=BYTES\n"
+               "                               Memory cost for PBKDF in bytes\n"
+               "     --luks-pbkdf-parallel-threads=NUMBER\n"
+               "                               Number of parallel threads for PBKDF\n"
+               "     --luks-sector-size=BYTES\n"
+               "                               Sector size for LUKS encryption in bytes\n"
+               "     --luks-extra-mount-options=OPTIONS\n"
+               "                               LUKS extra mount options\n"
+               "     --auto-resize-mode=MODE   Automatically grow/shrink home on login/logout\n"
+               "     --rebalance-weight=WEIGHT Weight while rebalancing\n"
+               "\n%4$sMounting User Record Properties:%5$s\n"
+               "     --nosuid=BOOL             Control the 'nosuid' flag of the home mount\n"
+               "     --nodev=BOOL              Control the 'nodev' flag of the home mount\n"
+               "     --noexec=BOOL             Control the 'noexec' flag of the home mount\n"
+               "\n%4$sCIFS User Record Properties:%5$s\n"
+               "     --cifs-domain=DOMAIN      CIFS (Windows) domain\n"
+               "     --cifs-user-name=USER     CIFS (Windows) user name\n"
+               "     --cifs-service=SERVICE    CIFS (Windows) service to mount as home area\n"
+               "     --cifs-extra-mount-options=OPTIONS\n"
+               "                               CIFS (Windows) extra mount options\n"
+               "\n%4$sLogin Behaviour User Record Properties:%5$s\n"
+               "     --stop-delay=SECS         How long to leave user services running after\n"
+               "                               logout\n"
+               "     --kill-processes=BOOL     Whether to kill user processes when sessions\n"
+               "                               terminate\n"
+               "     --auto-login=BOOL         Try to log this user in automatically\n"
+               "\nSee the %6$s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               ansi_underline(),
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_NO_LEGEND,
+                ARG_NO_ASK_PASSWORD,
+                ARG_REALM,
+                ARG_EMAIL_ADDRESS,
+                ARG_DISK_SIZE,
+                ARG_ACCESS_MODE,
+                ARG_STORAGE,
+                ARG_FS_TYPE,
+                ARG_IMAGE_PATH,
+                ARG_UMASK,
+                ARG_LUKS_DISCARD,
+                ARG_LUKS_OFFLINE_DISCARD,
+                ARG_JSON,
+                ARG_SETENV,
+                ARG_TIMEZONE,
+                ARG_LANGUAGE,
+                ARG_LOCKED,
+                ARG_SSH_AUTHORIZED_KEYS,
+                ARG_LOCATION,
+                ARG_ICON_NAME,
+                ARG_PASSWORD_HINT,
+                ARG_NICE,
+                ARG_RLIMIT,
+                ARG_NOT_BEFORE,
+                ARG_NOT_AFTER,
+                ARG_LUKS_CIPHER,
+                ARG_LUKS_CIPHER_MODE,
+                ARG_LUKS_VOLUME_KEY_SIZE,
+                ARG_NOSUID,
+                ARG_NODEV,
+                ARG_NOEXEC,
+                ARG_CIFS_DOMAIN,
+                ARG_CIFS_USER_NAME,
+                ARG_CIFS_SERVICE,
+                ARG_CIFS_EXTRA_MOUNT_OPTIONS,
+                ARG_TASKS_MAX,
+                ARG_MEMORY_HIGH,
+                ARG_MEMORY_MAX,
+                ARG_CPU_WEIGHT,
+                ARG_IO_WEIGHT,
+                ARG_LUKS_PBKDF_TYPE,
+                ARG_LUKS_PBKDF_HASH_ALGORITHM,
+                ARG_LUKS_PBKDF_FORCE_ITERATIONS,
+                ARG_LUKS_PBKDF_TIME_COST,
+                ARG_LUKS_PBKDF_MEMORY_COST,
+                ARG_LUKS_PBKDF_PARALLEL_THREADS,
+                ARG_LUKS_SECTOR_SIZE,
+                ARG_RATE_LIMIT_INTERVAL,
+                ARG_RATE_LIMIT_BURST,
+                ARG_STOP_DELAY,
+                ARG_KILL_PROCESSES,
+                ARG_ENFORCE_PASSWORD_POLICY,
+                ARG_PASSWORD_CHANGE_NOW,
+                ARG_PASSWORD_CHANGE_MIN,
+                ARG_PASSWORD_CHANGE_MAX,
+                ARG_PASSWORD_CHANGE_WARN,
+                ARG_PASSWORD_CHANGE_INACTIVE,
+                ARG_EXPORT_FORMAT,
+                ARG_AUTO_LOGIN,
+                ARG_PKCS11_TOKEN_URI,
+                ARG_FIDO2_DEVICE,
+                ARG_FIDO2_WITH_PIN,
+                ARG_FIDO2_WITH_UP,
+                ARG_FIDO2_WITH_UV,
+                ARG_RECOVERY_KEY,
+                ARG_AND_RESIZE,
+                ARG_AND_CHANGE_PASSWORD,
+                ARG_DROP_CACHES,
+                ARG_LUKS_EXTRA_MOUNT_OPTIONS,
+                ARG_AUTO_RESIZE_MODE,
+                ARG_REBALANCE_WEIGHT,
+                ARG_FIDO2_CRED_ALG,
+                ARG_CAPABILITY_BOUNDING_SET,
+                ARG_CAPABILITY_AMBIENT_SET,
+        };
+
+        static const struct option options[] = {
+                { "help",                        no_argument,       NULL, 'h'                             },
+                { "version",                     no_argument,       NULL, ARG_VERSION                     },
+                { "no-pager",                    no_argument,       NULL, ARG_NO_PAGER                    },
+                { "no-legend",                   no_argument,       NULL, ARG_NO_LEGEND                   },
+                { "no-ask-password",             no_argument,       NULL, ARG_NO_ASK_PASSWORD             },
+                { "host",                        required_argument, NULL, 'H'                             },
+                { "machine",                     required_argument, NULL, 'M'                             },
+                { "identity",                    required_argument, NULL, 'I'                             },
+                { "real-name",                   required_argument, NULL, 'c'                             },
+                { "comment",                     required_argument, NULL, 'c'                             }, /* Compat alias to keep things in sync with useradd(8) */
+                { "realm",                       required_argument, NULL, ARG_REALM                       },
+                { "email-address",               required_argument, NULL, ARG_EMAIL_ADDRESS               },
+                { "location",                    required_argument, NULL, ARG_LOCATION                    },
+                { "password-hint",               required_argument, NULL, ARG_PASSWORD_HINT               },
+                { "icon-name",                   required_argument, NULL, ARG_ICON_NAME                   },
+                { "home-dir",                    required_argument, NULL, 'd'                             }, /* Compatible with useradd(8) */
+                { "uid",                         required_argument, NULL, 'u'                             }, /* Compatible with useradd(8) */
+                { "member-of",                   required_argument, NULL, 'G'                             },
+                { "groups",                      required_argument, NULL, 'G'                             }, /* Compat alias to keep things in sync with useradd(8) */
+                { "skel",                        required_argument, NULL, 'k'                             }, /* Compatible with useradd(8) */
+                { "shell",                       required_argument, NULL, 's'                             }, /* Compatible with useradd(8) */
+                { "setenv",                      required_argument, NULL, ARG_SETENV                      },
+                { "timezone",                    required_argument, NULL, ARG_TIMEZONE                    },
+                { "language",                    required_argument, NULL, ARG_LANGUAGE                    },
+                { "locked",                      required_argument, NULL, ARG_LOCKED                      },
+                { "not-before",                  required_argument, NULL, ARG_NOT_BEFORE                  },
+                { "not-after",                   required_argument, NULL, ARG_NOT_AFTER                   },
+                { "expiredate",                  required_argument, NULL, 'e'                             }, /* Compat alias to keep things in sync with useradd(8) */
+                { "ssh-authorized-keys",         required_argument, NULL, ARG_SSH_AUTHORIZED_KEYS         },
+                { "disk-size",                   required_argument, NULL, ARG_DISK_SIZE                   },
+                { "access-mode",                 required_argument, NULL, ARG_ACCESS_MODE                 },
+                { "umask",                       required_argument, NULL, ARG_UMASK                       },
+                { "nice",                        required_argument, NULL, ARG_NICE                        },
+                { "rlimit",                      required_argument, NULL, ARG_RLIMIT                      },
+                { "tasks-max",                   required_argument, NULL, ARG_TASKS_MAX                   },
+                { "memory-high",                 required_argument, NULL, ARG_MEMORY_HIGH                 },
+                { "memory-max",                  required_argument, NULL, ARG_MEMORY_MAX                  },
+                { "cpu-weight",                  required_argument, NULL, ARG_CPU_WEIGHT                  },
+                { "io-weight",                   required_argument, NULL, ARG_IO_WEIGHT                   },
+                { "storage",                     required_argument, NULL, ARG_STORAGE                     },
+                { "image-path",                  required_argument, NULL, ARG_IMAGE_PATH                  },
+                { "fs-type",                     required_argument, NULL, ARG_FS_TYPE                     },
+                { "luks-discard",                required_argument, NULL, ARG_LUKS_DISCARD                },
+                { "luks-offline-discard",        required_argument, NULL, ARG_LUKS_OFFLINE_DISCARD        },
+                { "luks-cipher",                 required_argument, NULL, ARG_LUKS_CIPHER                 },
+                { "luks-cipher-mode",            required_argument, NULL, ARG_LUKS_CIPHER_MODE            },
+                { "luks-volume-key-size",        required_argument, NULL, ARG_LUKS_VOLUME_KEY_SIZE        },
+                { "luks-pbkdf-type",             required_argument, NULL, ARG_LUKS_PBKDF_TYPE             },
+                { "luks-pbkdf-hash-algorithm",   required_argument, NULL, ARG_LUKS_PBKDF_HASH_ALGORITHM   },
+                { "luks-pbkdf-force-iterations", required_argument, NULL, ARG_LUKS_PBKDF_FORCE_ITERATIONS },
+                { "luks-pbkdf-time-cost",        required_argument, NULL, ARG_LUKS_PBKDF_TIME_COST        },
+                { "luks-pbkdf-memory-cost",      required_argument, NULL, ARG_LUKS_PBKDF_MEMORY_COST      },
+                { "luks-pbkdf-parallel-threads", required_argument, NULL, ARG_LUKS_PBKDF_PARALLEL_THREADS },
+                { "luks-sector-size",            required_argument, NULL, ARG_LUKS_SECTOR_SIZE            },
+                { "nosuid",                      required_argument, NULL, ARG_NOSUID                      },
+                { "nodev",                       required_argument, NULL, ARG_NODEV                       },
+                { "noexec",                      required_argument, NULL, ARG_NOEXEC                      },
+                { "cifs-user-name",              required_argument, NULL, ARG_CIFS_USER_NAME              },
+                { "cifs-domain",                 required_argument, NULL, ARG_CIFS_DOMAIN                 },
+                { "cifs-service",                required_argument, NULL, ARG_CIFS_SERVICE                },
+                { "cifs-extra-mount-options",    required_argument, NULL, ARG_CIFS_EXTRA_MOUNT_OPTIONS    },
+                { "rate-limit-interval",         required_argument, NULL, ARG_RATE_LIMIT_INTERVAL         },
+                { "rate-limit-burst",            required_argument, NULL, ARG_RATE_LIMIT_BURST            },
+                { "stop-delay",                  required_argument, NULL, ARG_STOP_DELAY                  },
+                { "kill-processes",              required_argument, NULL, ARG_KILL_PROCESSES              },
+                { "enforce-password-policy",     required_argument, NULL, ARG_ENFORCE_PASSWORD_POLICY     },
+                { "password-change-now",         required_argument, NULL, ARG_PASSWORD_CHANGE_NOW         },
+                { "password-change-min",         required_argument, NULL, ARG_PASSWORD_CHANGE_MIN         },
+                { "password-change-max",         required_argument, NULL, ARG_PASSWORD_CHANGE_MAX         },
+                { "password-change-warn",        required_argument, NULL, ARG_PASSWORD_CHANGE_WARN        },
+                { "password-change-inactive",    required_argument, NULL, ARG_PASSWORD_CHANGE_INACTIVE    },
+                { "auto-login",                  required_argument, NULL, ARG_AUTO_LOGIN                  },
+                { "json",                        required_argument, NULL, ARG_JSON                        },
+                { "export-format",               required_argument, NULL, ARG_EXPORT_FORMAT               },
+                { "pkcs11-token-uri",            required_argument, NULL, ARG_PKCS11_TOKEN_URI            },
+                { "fido2-credential-algorithm",  required_argument, NULL, ARG_FIDO2_CRED_ALG              },
+                { "fido2-device",                required_argument, NULL, ARG_FIDO2_DEVICE                },
+                { "fido2-with-client-pin",       required_argument, NULL, ARG_FIDO2_WITH_PIN              },
+                { "fido2-with-user-presence",    required_argument, NULL, ARG_FIDO2_WITH_UP               },
+                { "fido2-with-user-verification",required_argument, NULL, ARG_FIDO2_WITH_UV               },
+                { "recovery-key",                required_argument, NULL, ARG_RECOVERY_KEY                },
+                { "and-resize",                  required_argument, NULL, ARG_AND_RESIZE                  },
+                { "and-change-password",         required_argument, NULL, ARG_AND_CHANGE_PASSWORD         },
+                { "drop-caches",                 required_argument, NULL, ARG_DROP_CACHES                 },
+                { "luks-extra-mount-options",    required_argument, NULL, ARG_LUKS_EXTRA_MOUNT_OPTIONS    },
+                { "auto-resize-mode",            required_argument, NULL, ARG_AUTO_RESIZE_MODE            },
+                { "rebalance-weight",            required_argument, NULL, ARG_REBALANCE_WEIGHT            },
+                { "capability-bounding-set",     required_argument, NULL, ARG_CAPABILITY_BOUNDING_SET     },
+                { "capability-ambient-set",      required_argument, NULL, ARG_CAPABILITY_AMBIENT_SET      },
+                {}
+        };
+
+        int r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        for (;;) {
+                int c;
+
+                c = getopt_long(argc, argv, "hH:M:I:c:d:u:k:s:e:G:jPE", options, NULL);
+                if (c < 0)
+                        break;
+
+                switch (c) {
+
+                case 'h':
+                        return help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_NO_LEGEND:
+                        arg_legend = false;
+                        break;
+
+                case ARG_NO_ASK_PASSWORD:
+                        arg_ask_password = false;
+                        break;
+
+                case 'H':
+                        arg_transport = BUS_TRANSPORT_REMOTE;
+                        arg_host = optarg;
+                        break;
+
+                case 'M':
+                        arg_transport = BUS_TRANSPORT_MACHINE;
+                        arg_host = optarg;
+                        break;
+
+                case 'I':
+                        arg_identity = optarg;
+                        break;
+
+                case 'c':
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("realName");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        if (!valid_gecos(optarg))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Real name '%s' not a valid GECOS field.", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "realName", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set realName field: %m");
+
+                        break;
+
+                case 'd': {
+                        _cleanup_free_ char *hd = NULL;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("homeDirectory");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_path_argument(optarg, false, &hd);
+                        if (r < 0)
+                                return r;
+
+                        if (!valid_home(hd))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Home directory '%s' not valid.", hd);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "homeDirectory", hd);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set homeDirectory field: %m");
+
+                        break;
+                }
+
+                case ARG_REALM:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("realm");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = dns_name_is_valid(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to determine whether realm '%s' is a valid DNS domain: %m", optarg);
+                        if (r == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Realm '%s' is not a valid DNS domain: %m", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "realm", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set realm field: %m");
+                        break;
+
+                case ARG_EMAIL_ADDRESS:
+                case ARG_LOCATION:
+                case ARG_ICON_NAME:
+                case ARG_CIFS_USER_NAME:
+                case ARG_CIFS_DOMAIN:
+                case ARG_CIFS_EXTRA_MOUNT_OPTIONS:
+                case ARG_LUKS_EXTRA_MOUNT_OPTIONS: {
+
+                        const char *field =
+                                           c == ARG_EMAIL_ADDRESS ? "emailAddress" :
+                                                c == ARG_LOCATION ? "location" :
+                                               c == ARG_ICON_NAME ? "iconName" :
+                                          c == ARG_CIFS_USER_NAME ? "cifsUserName" :
+                                             c == ARG_CIFS_DOMAIN ? "cifsDomain" :
+                                c == ARG_CIFS_EXTRA_MOUNT_OPTIONS ? "cifsExtraMountOptions" :
+                                c == ARG_LUKS_EXTRA_MOUNT_OPTIONS ? "luksExtraMountOptions" :
+                                                                    NULL;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = json_variant_set_field_string(&arg_identity_extra, field, optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        break;
+                }
+
+                case ARG_CIFS_SERVICE:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("cifsService");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_cifs_service(optarg, NULL, NULL, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to validate CIFS service name: %s", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "cifsService", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set cifsService field: %m");
+
+                        break;
+
+                case ARG_PASSWORD_HINT:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("passwordHint");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = json_variant_set_field_string(&arg_identity_extra_privileged, "passwordHint", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set passwordHint field: %m");
+
+                        string_erase(optarg);
+                        break;
+
+                case ARG_NICE: {
+                        int nc;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("niceLevel");
+                                if (r < 0)
+                                        return r;
+                                break;
+                        }
+
+                        r = parse_nice(optarg, &nc);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse nice level: %s", optarg);
+
+                        r = json_variant_set_field_integer(&arg_identity_extra, "niceLevel", nc);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set niceLevel field: %m");
+
+                        break;
+                }
+
+                case ARG_RLIMIT: {
+                        _cleanup_(json_variant_unrefp) JsonVariant *jcur = NULL, *jmax = NULL;
+                        _cleanup_free_ char *field = NULL, *t = NULL;
+                        const char *eq;
+                        struct rlimit rl;
+                        int l;
+
+                        if (isempty(optarg)) {
+                                /* Remove all resource limits */
+
+                                r = drop_from_identity("resourceLimits");
+                                if (r < 0)
+                                        return r;
+
+                                arg_identity_filter_rlimits = strv_free(arg_identity_filter_rlimits);
+                                arg_identity_extra_rlimits = json_variant_unref(arg_identity_extra_rlimits);
+                                break;
+                        }
+
+                        eq = strchr(optarg, '=');
+                        if (!eq)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Can't parse resource limit assignment: %s", optarg);
+
+                        field = strndup(optarg, eq - optarg);
+                        if (!field)
+                                return log_oom();
+
+                        l = rlimit_from_string_harder(field);
+                        if (l < 0)
+                                return log_error_errno(l, "Unknown resource limit type: %s", field);
+
+                        if (isempty(eq + 1)) {
+                                /* Remove only the specific rlimit */
+
+                                r = strv_extend(&arg_identity_filter_rlimits, rlimit_to_string(l));
+                                if (r < 0)
+                                        return r;
+
+                                r = json_variant_filter(&arg_identity_extra_rlimits, STRV_MAKE(field));
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to filter JSON identity data: %m");
+
+                                break;
+                        }
+
+                        r = rlimit_parse(l, eq + 1, &rl);
+                        if (r < 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse resource limit value: %s", eq + 1);
+
+                        r = rl.rlim_cur == RLIM_INFINITY ? json_variant_new_null(&jcur) : json_variant_new_unsigned(&jcur, rl.rlim_cur);
+                        if (r < 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to allocate current integer: %m");
+
+                        r = rl.rlim_max == RLIM_INFINITY ? json_variant_new_null(&jmax) : json_variant_new_unsigned(&jmax, rl.rlim_max);
+                        if (r < 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to allocate maximum integer: %m");
+
+                        t = strjoin("RLIMIT_", rlimit_to_string(l));
+                        if (!t)
+                                return log_oom();
+
+                        r = json_variant_set_fieldb(
+                                        &arg_identity_extra_rlimits, t,
+                                        JSON_BUILD_OBJECT(
+                                                        JSON_BUILD_PAIR("cur", JSON_BUILD_VARIANT(jcur)),
+                                                        JSON_BUILD_PAIR("max", JSON_BUILD_VARIANT(jmax))));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", rlimit_to_string(l));
+
+                        break;
+                }
+
+                case 'u': {
+                        uid_t uid;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("uid");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_uid(optarg, &uid);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse UID '%s'.", optarg);
+
+                        if (uid_is_system(uid))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID " UID_FMT " is in system range, refusing.", uid);
+                        if (uid_is_dynamic(uid))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID " UID_FMT " is in dynamic range, refusing.", uid);
+                        if (uid == UID_NOBODY)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID " UID_FMT " is nobody UID, refusing.", uid);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, "uid", uid);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set realm field: %m");
+
+                        break;
+                }
+
+                case 'k':
+                case ARG_IMAGE_PATH: {
+                        const char *field = c == 'k' ? "skeletonDirectory" : "imagePath";
+                        _cleanup_free_ char *v = NULL;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_path_argument(optarg, false, &v);
+                        if (r < 0)
+                                return r;
+
+                        r = json_variant_set_field_string(&arg_identity_extra_this_machine, field, v);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", v);
+
+                        break;
+                }
+
+                case 's':
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("shell");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        if (!valid_shell(optarg))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Shell '%s' not valid.", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "shell", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set shell field: %m");
+
+                        break;
+
+                case ARG_SETENV: {
+                        _cleanup_free_ char **l = NULL;
+                        _cleanup_(json_variant_unrefp) JsonVariant *ne = NULL;
+                        JsonVariant *e;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("environment");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        e = json_variant_by_key(arg_identity_extra, "environment");
+                        if (e) {
+                                r = json_variant_strv(e, &l);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse JSON environment field: %m");
+                        }
+
+                        r = strv_env_replace_strdup_passthrough(&l, optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
+
+                        strv_sort(l);
+
+                        r = json_variant_new_array_strv(&ne, l);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to allocate environment list JSON: %m");
+
+                        r = json_variant_set_field(&arg_identity_extra, "environment", ne);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set environment list: %m");
+
+                        break;
+                }
+
+                case ARG_TIMEZONE:
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("timeZone");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        if (!timezone_is_valid(optarg, LOG_DEBUG))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Timezone '%s' is not valid.", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "timeZone", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set timezone field: %m");
+
+                        break;
+
+                case ARG_LANGUAGE:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("language");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        if (!locale_is_valid(optarg))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Locale '%s' is not valid.", optarg);
+
+                        if (locale_is_installed(optarg) <= 0)
+                                log_warning("Locale '%s' is not installed, accepting anyway.", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "preferredLanguage", optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set preferredLanguage field: %m");
+
+                        break;
+
+                case ARG_NOSUID:
+                case ARG_NODEV:
+                case ARG_NOEXEC:
+                case ARG_LOCKED:
+                case ARG_KILL_PROCESSES:
+                case ARG_ENFORCE_PASSWORD_POLICY:
+                case ARG_AUTO_LOGIN:
+                case ARG_PASSWORD_CHANGE_NOW: {
+                        const char *field =
+                                                 c == ARG_LOCKED ? "locked" :
+                                                 c == ARG_NOSUID ? "mountNoSuid" :
+                                                  c == ARG_NODEV ? "mountNoDevices" :
+                                                 c == ARG_NOEXEC ? "mountNoExecute" :
+                                         c == ARG_KILL_PROCESSES ? "killProcesses" :
+                                c == ARG_ENFORCE_PASSWORD_POLICY ? "enforcePasswordPolicy" :
+                                             c == ARG_AUTO_LOGIN ? "autoLogin" :
+                                    c == ARG_PASSWORD_CHANGE_NOW ? "passwordChangeNow" :
+                                                                   NULL;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s boolean: %m", field);
+
+                        r = json_variant_set_field_boolean(&arg_identity_extra, field, r > 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        break;
+                }
+
+                case 'P':
+                        r = json_variant_set_field_boolean(&arg_identity_extra, "enforcePasswordPolicy", false);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set enforcePasswordPolicy field: %m");
+
+                        break;
+
+                case ARG_DISK_SIZE:
+                        if (isempty(optarg)) {
+                                FOREACH_STRING(prop, "diskSize", "diskSizeRelative", "rebalanceWeight") {
+                                        r = drop_from_identity(prop);
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                arg_disk_size = arg_disk_size_relative = UINT64_MAX;
+                                break;
+                        }
+
+                        r = parse_permyriad(optarg);
+                        if (r < 0) {
+                                r = parse_disk_size(optarg, &arg_disk_size);
+                                if (r < 0)
+                                        return r;
+
+                                r = drop_from_identity("diskSizeRelative");
+                                if (r < 0)
+                                        return r;
+
+                                r = json_variant_set_field_unsigned(&arg_identity_extra_this_machine, "diskSize", arg_disk_size);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to set diskSize field: %m");
+
+                                arg_disk_size_relative = UINT64_MAX;
+                        } else {
+                                /* Normalize to UINT32_MAX == 100% */
+                                arg_disk_size_relative = UINT32_SCALE_FROM_PERMYRIAD(r);
+
+                                r = drop_from_identity("diskSize");
+                                if (r < 0)
+                                        return r;
+
+                                r = json_variant_set_field_unsigned(&arg_identity_extra_this_machine, "diskSizeRelative", arg_disk_size_relative);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to set diskSizeRelative field: %m");
+
+                                arg_disk_size = UINT64_MAX;
+                        }
+
+                        /* Automatically turn off the rebalance logic if user configured a size explicitly */
+                        r = json_variant_set_field_unsigned(&arg_identity_extra_this_machine, "rebalanceWeight", REBALANCE_WEIGHT_OFF);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set rebalanceWeight field: %m");
+
+                        break;
+
+                case ARG_ACCESS_MODE: {
+                        mode_t mode;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("accessMode");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_mode(optarg, &mode);
+                        if (r < 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Access mode '%s' not valid.", optarg);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, "accessMode", mode);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set access mode field: %m");
+
+                        break;
+                }
+
+                case ARG_LUKS_DISCARD:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("luksDiscard");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --luks-discard= parameter: %s", optarg);
+
+                        r = json_variant_set_field_boolean(&arg_identity_extra, "luksDiscard", r);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set discard field: %m");
+
+                        break;
+
+                case ARG_LUKS_OFFLINE_DISCARD:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("luksOfflineDiscard");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --luks-offline-discard= parameter: %s", optarg);
+
+                        r = json_variant_set_field_boolean(&arg_identity_extra, "luksOfflineDiscard", r);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set offline discard field: %m");
+
+                        break;
+
+                case ARG_LUKS_VOLUME_KEY_SIZE:
+                case ARG_LUKS_PBKDF_FORCE_ITERATIONS:
+                case ARG_LUKS_PBKDF_PARALLEL_THREADS:
+                case ARG_RATE_LIMIT_BURST: { /* options that map to a plain unsigned integer field */
+                        const char *field =
+                                       c == ARG_LUKS_VOLUME_KEY_SIZE ? "luksVolumeKeySize" :
+                                c == ARG_LUKS_PBKDF_FORCE_ITERATIONS ? "luksPbkdfForceIterations" :
+                                c == ARG_LUKS_PBKDF_PARALLEL_THREADS ? "luksPbkdfParallelThreads" :
+                                           c == ARG_RATE_LIMIT_BURST ? "rateLimitBurst" : NULL;
+                        unsigned n;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+                                break; /* field was reset; don't fall through and try to parse "" as a number */
+                        }
+
+                        r = safe_atou(optarg, &n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s parameter: %s", field, optarg);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, field, n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+                        break;
+                }
+
+                case ARG_LUKS_SECTOR_SIZE: {
+                        uint64_t ss;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("luksSectorSize");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_sector_size(optarg, &ss);
+                        if (r < 0)
+                                return r;
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, "luksSectorSize", ss);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set sector size field: %m");
+
+                        break;
+                }
+
+                case ARG_UMASK: {
+                        mode_t m;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("umask");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_mode(optarg, &m);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse umask: %m");
+
+                        r = json_variant_set_field_integer(&arg_identity_extra, "umask", m);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set umask field: %m");
+
+                        break;
+                }
+
+                case ARG_SSH_AUTHORIZED_KEYS: {
+                        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                        _cleanup_strv_free_ char **l = NULL, **add = NULL;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("sshAuthorizedKeys");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        if (optarg[0] == '@') {
+                                _cleanup_fclose_ FILE *f = NULL;
+
+                                /* If prefixed with '@' read from a file */
+
+                                f = fopen(optarg+1, "re");
+                                if (!f)
+                                        return log_error_errno(errno, "Failed to open '%s': %m", optarg+1);
+
+                                for (;;) {
+                                        _cleanup_free_ char *line = NULL;
+
+                                        r = read_line(f, LONG_LINE_MAX, &line);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to read from '%s': %m", optarg+1);
+                                        if (r == 0)
+                                                break;
+
+                                        if (isempty(line))
+                                                continue;
+
+                                        if (line[0] == '#')
+                                                continue;
+
+                                        r = strv_consume(&add, TAKE_PTR(line));
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+                        } else {
+                                /* Otherwise, assume it's a literal key. Let's do some superficial checks
+                                 * before accept it though. */
+
+                                if (string_has_cc(optarg, NULL))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Authorized key contains control characters, refusing.");
+                                if (optarg[0] == '#')
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified key is a comment?");
+
+                                add = strv_new(optarg);
+                                if (!add)
+                                        return log_oom();
+                        }
+
+                        v = json_variant_ref(json_variant_by_key(arg_identity_extra_privileged, "sshAuthorizedKeys"));
+                        if (v) {
+                                r = json_variant_strv(v, &l);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse SSH authorized keys list: %m");
+                        }
+
+                        r = strv_extend_strv(&l, add, true);
+                        if (r < 0)
+                                return log_oom();
+
+                        v = json_variant_unref(v);
+
+                        r = json_variant_new_array_strv(&v, l);
+                        if (r < 0)
+                                return log_oom();
+
+                        r = json_variant_set_field(&arg_identity_extra_privileged, "sshAuthorizedKeys", v);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set authorized keys: %m");
+
+                        break;
+                }
+
+                case ARG_NOT_BEFORE:
+                case ARG_NOT_AFTER:
+                case 'e': {
+                        const char *field;
+                        usec_t n;
+
+                        field =           c == ARG_NOT_BEFORE ? "notBeforeUSec" :
+                                IN_SET(c, ARG_NOT_AFTER, 'e') ? "notAfterUSec" : NULL;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        /* Note the minor discrepancy regarding -e parsing here: we support that for compat
+                         * reasons, and in the original useradd(8) implementation it accepts dates in the
+                         * format YYYY-MM-DD. Coincidentally, we accept dates formatted like that too, but
+                         * with greater precision. */
+                        r = parse_timestamp(optarg, &n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s parameter: %m", field);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, field, n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+                        break;
+                }
+
+                case ARG_PASSWORD_CHANGE_MIN:
+                case ARG_PASSWORD_CHANGE_MAX:
+                case ARG_PASSWORD_CHANGE_WARN:
+                case ARG_PASSWORD_CHANGE_INACTIVE: {
+                        const char *field;
+                        usec_t n;
+
+                        field =      c == ARG_PASSWORD_CHANGE_MIN ? "passwordChangeMinUSec" :
+                                     c == ARG_PASSWORD_CHANGE_MAX ? "passwordChangeMaxUSec" :
+                                    c == ARG_PASSWORD_CHANGE_WARN ? "passwordChangeWarnUSec" :
+                                c == ARG_PASSWORD_CHANGE_INACTIVE ? "passwordChangeInactiveUSec" :
+                                                                    NULL;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_sec(optarg, &n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s parameter: %m", field);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, field, n);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+                        break;
+                }
+
+                case ARG_STORAGE:
+                case ARG_FS_TYPE:
+                case ARG_LUKS_CIPHER:
+                case ARG_LUKS_CIPHER_MODE:
+                case ARG_LUKS_PBKDF_TYPE:
+                case ARG_LUKS_PBKDF_HASH_ALGORITHM: {
+
+                        const char *field =
+                                                  c == ARG_STORAGE ? "storage" :
+                                                  c == ARG_FS_TYPE ? "fileSystemType" :
+                                              c == ARG_LUKS_CIPHER ? "luksCipher" :
+                                         c == ARG_LUKS_CIPHER_MODE ? "luksCipherMode" :
+                                          c == ARG_LUKS_PBKDF_TYPE ? "luksPbkdfType" :
+                                c == ARG_LUKS_PBKDF_HASH_ALGORITHM ? "luksPbkdfHashAlgorithm" : NULL;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        if (!string_is_safe(optarg))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Parameter for %s field not valid: %s", field, optarg);
+
+                        r = json_variant_set_field_string(
+                                        IN_SET(c, ARG_STORAGE, ARG_FS_TYPE) ?
+                                        &arg_identity_extra_this_machine :
+                                        &arg_identity_extra, field, optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        break;
+                }
+
+                case ARG_LUKS_PBKDF_TIME_COST:
+                case ARG_RATE_LIMIT_INTERVAL:
+                case ARG_STOP_DELAY: {
+                        const char *field =
+                                c == ARG_LUKS_PBKDF_TIME_COST ? "luksPbkdfTimeCostUSec" :
+                                 c == ARG_RATE_LIMIT_INTERVAL ? "rateLimitIntervalUSec" :
+                                          c == ARG_STOP_DELAY ? "stopDelayUSec" :
+                                                                NULL;
+                        usec_t t;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = parse_sec(optarg, &t);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s field: %s", field, optarg);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, field, t);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        break;
+                }
+
+                case 'G': {
+                        const char *p = optarg;
+
+                        if (isempty(p)) {
+                                r = drop_from_identity("memberOf");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        for (;;) {
+                                _cleanup_(json_variant_unrefp) JsonVariant *mo = NULL;
+                                _cleanup_strv_free_ char **list = NULL;
+                                _cleanup_free_ char *word = NULL;
+
+                                r = extract_first_word(&p, &word, ",", 0);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse group list: %m");
+                                if (r == 0)
+                                        break;
+
+                                if (!valid_user_group_name(word, 0))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid group name %s.", word);
+
+                                mo = json_variant_ref(json_variant_by_key(arg_identity_extra, "memberOf"));
+
+                                r = json_variant_strv(mo, &list);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse group list: %m");
+
+                                r = strv_extend(&list, word);
+                                if (r < 0)
+                                        return log_oom();
+
+                                strv_sort(list);
+                                strv_uniq(list);
+
+                                mo = json_variant_unref(mo);
+                                r = json_variant_new_array_strv(&mo, list);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to create group list JSON: %m");
+
+                                r = json_variant_set_field(&arg_identity_extra, "memberOf", mo);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to update group list: %m");
+                        }
+
+                        break;
+                }
+
+                case ARG_TASKS_MAX: {
+                        uint64_t u;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("tasksMax");
+                                if (r < 0)
+                                        return r;
+                                break;
+                        }
+
+                        r = safe_atou64(optarg, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --tasks-max= parameter: %s", optarg);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, "tasksMax", u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set tasksMax field: %m");
+
+                        break;
+                }
+
+                case ARG_MEMORY_MAX:
+                case ARG_MEMORY_HIGH:
+                case ARG_LUKS_PBKDF_MEMORY_COST: {
+                        const char *field =
+                                            c == ARG_MEMORY_MAX ? "memoryMax" :
+                                           c == ARG_MEMORY_HIGH ? "memoryHigh" :
+                                c == ARG_LUKS_PBKDF_MEMORY_COST ? "luksPbkdfMemoryCost" : NULL;
+
+                        uint64_t u;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+                                break;
+                        }
+
+                        r = parse_size(optarg, 1024, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s parameter: %s", field, optarg);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra_this_machine, field, u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        break;
+                }
+
+                case ARG_CPU_WEIGHT:
+                case ARG_IO_WEIGHT: {
+                        const char *field = c == ARG_CPU_WEIGHT ? "cpuWeight" :
+                                            c == ARG_IO_WEIGHT ? "ioWeight" : NULL;
+                        uint64_t u;
+
+                        assert(field);
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+                                break;
+                        }
+
+                        r = safe_atou64(optarg, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --cpu-weight=/--io-weight= parameter: %s", optarg);
+
+                        if (!CGROUP_WEIGHT_IS_OK(u))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Weight %" PRIu64 " is out of valid weight range.", u);
+
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, field, u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        break;
+                }
+
+                case ARG_PKCS11_TOKEN_URI:
+                        if (streq(optarg, "list"))
+                                return pkcs11_list_tokens();
+
+                        /* If --pkcs11-token-uri= is specified we always drop everything old */
+                        FOREACH_STRING(p, "pkcs11TokenUri", "pkcs11EncryptedKey") {
+                                r = drop_from_identity(p);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        if (isempty(optarg)) {
+                                arg_pkcs11_token_uri = strv_free(arg_pkcs11_token_uri);
+                                break;
+                        }
+
+                        if (streq(optarg, "auto")) {
+                                _cleanup_free_ char *found = NULL;
+
+                                r = pkcs11_find_token_auto(&found);
+                                if (r < 0)
+                                        return r;
+                                r = strv_consume(&arg_pkcs11_token_uri, TAKE_PTR(found));
+                        } else {
+                                if (!pkcs11_uri_valid(optarg))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Not a valid PKCS#11 URI: %s", optarg);
+
+                                r = strv_extend(&arg_pkcs11_token_uri, optarg);
+                        }
+                        if (r < 0) /* r comes from strv_consume()/strv_extend() here; log instead of failing silently */
+                                return log_oom();
+
+                        strv_uniq(arg_pkcs11_token_uri);
+                        break;
+
+                case ARG_FIDO2_CRED_ALG:
+                        r = parse_fido2_algorithm(optarg, &arg_fido2_cred_alg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse COSE algorithm: %s", optarg);
+                        break;
+
+                case ARG_FIDO2_DEVICE:
+                        if (streq(optarg, "list"))
+                                return fido2_list_devices();
+
+                        FOREACH_STRING(p, "fido2HmacCredential", "fido2HmacSalt") { /* a new device selection replaces old credentials */
+                                r = drop_from_identity(p);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        if (isempty(optarg)) {
+                                arg_fido2_device = strv_free(arg_fido2_device);
+                                break;
+                        }
+
+                        if (streq(optarg, "auto")) {
+                                _cleanup_free_ char *found = NULL;
+
+                                r = fido2_find_device_auto(&found);
+                                if (r < 0)
+                                        return r;
+
+                                r = strv_consume(&arg_fido2_device, TAKE_PTR(found));
+                        } else
+                                r = strv_extend(&arg_fido2_device, optarg);
+                        if (r < 0) /* r comes from strv_consume()/strv_extend() here; log instead of failing silently */
+                                return log_oom();
+
+                        strv_uniq(arg_fido2_device);
+                        break;
+
+                case ARG_FIDO2_WITH_PIN:
+                        r = parse_boolean_argument("--fido2-with-client-pin=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_fido2_lock_with, FIDO2ENROLL_PIN, r);
+                        break;
+
+                case ARG_FIDO2_WITH_UP:
+                        r = parse_boolean_argument("--fido2-with-user-presence=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_fido2_lock_with, FIDO2ENROLL_UP, r);
+                        break;
+
+                case ARG_FIDO2_WITH_UV:
+                        r = parse_boolean_argument("--fido2-with-user-verification=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_fido2_lock_with, FIDO2ENROLL_UV, r);
+                        break;
+
+                case ARG_RECOVERY_KEY:
+                        r = parse_boolean(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --recovery-key= argument: %s", optarg);
+
+                        arg_recovery_key = r;
+
+                        FOREACH_STRING(p, "recoveryKey", "recoveryKeyType") {
+                                r = drop_from_identity(p);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        break;
+
+                case ARG_AUTO_RESIZE_MODE:
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("autoResizeMode");
+                                if (r < 0)
+                                        return r;
+
+                                break;
+                        }
+
+                        r = auto_resize_mode_from_string(optarg);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --auto-resize-mode= argument: %s", optarg);
+
+                        r = json_variant_set_field_string(&arg_identity_extra, "autoResizeMode", auto_resize_mode_to_string(r));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set autoResizeMode field: %m");
+
+                        break;
+
+                case ARG_REBALANCE_WEIGHT: {
+                        uint64_t u;
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("rebalanceWeight");
+                                if (r < 0)
+                                        return r;
+                                break;
+                        }
+
+                        if (streq(optarg, "off"))
+                                u = REBALANCE_WEIGHT_OFF;
+                        else {
+                                r = safe_atou64(optarg, &u);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse --rebalance-weight= argument: %s", optarg);
+
+                                if (u < REBALANCE_WEIGHT_MIN || u > REBALANCE_WEIGHT_MAX)
+                                        return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Rebalancing weight out of valid range %" PRIu64 "%s%" PRIu64 ": %s",
+                                                               REBALANCE_WEIGHT_MIN, special_glyph(SPECIAL_GLYPH_ELLIPSIS), REBALANCE_WEIGHT_MAX, optarg);
+                        }
+
+                        /* Drop from per machine stuff and everywhere */
+                        r = drop_from_identity("rebalanceWeight");
+                        if (r < 0)
+                                return r;
+
+                        /* Add to main identity */
+                        r = json_variant_set_field_unsigned(&arg_identity_extra, "rebalanceWeight", u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set rebalanceWeight field: %m");
+
+                        break;
+                }
+
+                case 'j':
+                        arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+
+                        break;
+
+                case 'E':
+                        if (arg_export_format == EXPORT_FORMAT_FULL)
+                                arg_export_format = EXPORT_FORMAT_STRIPPED;
+                        else if (arg_export_format == EXPORT_FORMAT_STRIPPED)
+                                arg_export_format = EXPORT_FORMAT_MINIMAL;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specifying -E more than twice is not supported.");
+
+                        arg_json_format_flags &= ~JSON_FORMAT_OFF;
+                        if (arg_json_format_flags == 0)
+                                arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO;
+                        break;
+
+                case ARG_EXPORT_FORMAT:
+                        if (streq(optarg, "full"))
+                                arg_export_format = EXPORT_FORMAT_FULL;
+                        else if (streq(optarg, "stripped"))
+                                arg_export_format = EXPORT_FORMAT_STRIPPED;
+                        else if (streq(optarg, "minimal"))
+                                arg_export_format = EXPORT_FORMAT_MINIMAL;
+                        else if (streq(optarg, "help")) {
+                                puts("full\n"
+                                     "stripped\n"
+                                     "minimal");
+                                return 0;
+                        } else /* refuse unknown values instead of silently keeping the previous format */
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown export format: %s", optarg);
+                        break;
+
+                case ARG_AND_RESIZE:
+                        arg_and_resize = true;
+                        break;
+
+                case ARG_AND_CHANGE_PASSWORD:
+                        arg_and_change_password = true;
+                        break;
+
+                case ARG_DROP_CACHES: {
+                        if (isempty(optarg)) {
+                                r = drop_from_identity("dropCaches");
+                                if (r < 0)
+                                        return r;
+                                break;
+                        }
+
+                        r = parse_boolean_argument("--drop-caches=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        r = json_variant_set_field_boolean(&arg_identity_extra, "dropCaches", r);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set drop caches field: %m");
+
+                        break;
+                }
+
+                case ARG_CAPABILITY_AMBIENT_SET:
+                case ARG_CAPABILITY_BOUNDING_SET: {
+                        _cleanup_strv_free_ char **l = NULL;
+                        bool subtract = false;
+                        uint64_t parsed, *which, updated;
+                        const char *p, *field;
+
+                        if (c == ARG_CAPABILITY_AMBIENT_SET) {
+                                which = &arg_capability_ambient_set;
+                                field = "capabilityAmbientSet";
+                        } else {
+                                assert(c == ARG_CAPABILITY_BOUNDING_SET);
+                                which = &arg_capability_bounding_set;
+                                field = "capabilityBoundingSet";
+                        }
+
+                        if (isempty(optarg)) {
+                                r = drop_from_identity(field);
+                                if (r < 0)
+                                        return r;
+
+                                *which = UINT64_MAX;
+                                break;
+                        }
+
+                        p = optarg;
+                        if (*p == '~') {
+                                subtract = true;
+                                p++;
+                        }
+
+                        r = capability_set_from_string(p, &parsed);
+                        if (r == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid capabilities in capability string '%s'.", p);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse capability string '%s': %m", p);
+
+                        if (*which == UINT64_MAX)
+                                updated = subtract ? all_capabilities() & ~parsed : parsed;
+                        else if (subtract)
+                                updated = *which & ~parsed;
+                        else
+                                updated = *which | parsed;
+
+                        if (capability_set_to_strv(updated, &l) < 0)
+                                return log_oom();
+
+                        r = json_variant_set_field_strv(&arg_identity_extra, field, l);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set %s field: %m", field);
+
+                        *which = updated;
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        if (!strv_isempty(arg_pkcs11_token_uri) || !strv_isempty(arg_fido2_device))
+                arg_and_change_password = true;
+
+        if (arg_disk_size != UINT64_MAX || arg_disk_size_relative != UINT64_MAX)
+                arg_and_resize = true;
+
+        return 1;
+}
+
+/* Picks the bus locator that homectl talks to and stores it in 'bus_mgr'.
+ *
+ * If $SYSTEMD_HOME_DEBUG_SUFFIX is set, a locator whose destination is "org.freedesktop.home1." followed by
+ * the suffix is used, otherwise the regular 'bus_home_mgr' locator. (The same env var is also understood by
+ * homed, so that it is relatively easily possible to invoke a second instance of homed for debug purposes
+ * and have homectl talk to it, without colliding with the host version. This is handy when operating from
+ * a homed-managed account.)
+ *
+ * Returns 0 on success, or the result of log_oom() if the destination string could not be allocated. */
+static int redirect_bus_mgr(void) {
+        static BusLocator debug_locator = {
+                .path = "/org/freedesktop/home1",
+                .interface = "org.freedesktop.home1.Manager",
+        };
+        const char *debug_suffix = getenv("SYSTEMD_HOME_DEBUG_SUFFIX");
+
+        /* Common case: no debug override requested. */
+        if (!debug_suffix) {
+                bus_mgr = bus_home_mgr;
+                return 0;
+        }
+
+        /* Yes, we leak this memory, but there's little point to collect this, given that we only do this in
+         * a debug environment, do it only once, and the string shall live for our entire process
+         * runtime. */
+        debug_locator.destination = strjoin("org.freedesktop.home1.", debug_suffix);
+        if (!debug_locator.destination)
+                return log_oom();
+
+        bus_mgr = &debug_locator;
+        return 0;
+}
+
+/* Program logic behind main(): sets up logging, selects the bus destination, parses the command line and
+ * finally dispatches to the implementation of the requested verb.
+ *
+ * Returns whatever failed first (negative), 0 if parse_argv() reports that nothing is left to do, or
+ * otherwise the result of dispatch_verb(). */
+static int run(int argc, char *argv[]) {
+        /* NOTE(review): columns look like verb name, minimum and maximum argument count, flags and handler —
+         * confirm against the Verb definition. */
+        static const Verb verbs[] = {
+                { "help",           VERB_ANY, VERB_ANY, 0,            help                 },
+                { "list",           VERB_ANY, 1,        VERB_DEFAULT, list_homes           },
+                { "activate",       2,        VERB_ANY, 0,            activate_home        },
+                { "deactivate",     2,        VERB_ANY, 0,            deactivate_home      },
+                { "inspect",        VERB_ANY, VERB_ANY, 0,            inspect_home         },
+                { "authenticate",   VERB_ANY, VERB_ANY, 0,            authenticate_home    },
+                { "create",         VERB_ANY, 2,        0,            create_home          },
+                { "remove",         2,        VERB_ANY, 0,            remove_home          },
+                { "update",         VERB_ANY, 2,        0,            update_home          },
+                { "passwd",         VERB_ANY, 2,        0,            passwd_home          },
+                { "resize",         2,        3,        0,            resize_home          },
+                { "lock",           2,        VERB_ANY, 0,            lock_home            },
+                { "unlock",         2,        VERB_ANY, 0,            unlock_home          },
+                { "with",           2,        VERB_ANY, 0,            with_home            },
+                { "lock-all",       VERB_ANY, 1,        0,            lock_all_homes       },
+                { "deactivate-all", VERB_ANY, 1,        0,            deactivate_all_homes },
+                { "rebalance",      VERB_ANY, 1,        0,            rebalance            },
+                {}
+        };
+        int status;
+
+        log_setup();
+
+        /* The bus destination has to be chosen before the command line is parsed; only parse if that
+         * worked. */
+        status = redirect_bus_mgr();
+        if (status >= 0)
+                status = parse_argv(argc, argv);
+
+        /* Negative: something failed. Zero: parse_argv() already handled the request. */
+        if (status <= 0)
+                return status;
+
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
diff --git a/src/home/homed-bus.c b/src/home/homed-bus.c
new file mode 100644
index 0000000..24b421a
--- /dev/null
+++ b/src/home/homed-bus.c
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "homed-bus.h"
+#include "strv.h"
+
+/* Reads one string from 'm', parses it as JSON, wraps the result in an object under the "secret" key and
+ * loads that object into a freshly allocated user record.
+ *
+ * On success returns 0 and hands the record over in *ret. On failure returns a negative value; of the
+ * failure paths below only the JSON parse error fills in 'error'. */
+int bus_message_read_secret(sd_bus_message *m, UserRecord **ret, sd_bus_error *error) {
+        _cleanup_(json_variant_unrefp) JsonVariant *parsed = NULL, *wrapped = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *record = NULL;
+        unsigned line = 0, column = 0;
+        const char *text;
+        int r;
+
+        assert(ret);
+
+        r = sd_bus_message_read(m, "s", &text);
+        if (r < 0)
+                return r;
+
+        /* NOTE(review): "%m" is normally expanded from errno, while the failure is reported via 'r' here —
+         * confirm the message shows the intended error text. */
+        r = json_parse(text, JSON_PARSE_SENSITIVE, &parsed, &line, &column);
+        if (r < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to parse JSON secret record at %u:%u: %m", line, column);
+
+        /* The message carries only the secret section, hence embed it into a minimal record object. */
+        r = json_build(&wrapped, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("secret", JSON_BUILD_VARIANT(parsed))));
+        if (r < 0)
+                return r;
+
+        record = user_record_new();
+        if (!record)
+                return -ENOMEM;
+
+        r = user_record_load(record, wrapped, USER_RECORD_REQUIRE_SECRET|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(record);
+        return 0;
+}
+
+/* Reads one string from 'm', parses it as JSON and loads it into a freshly allocated user record, applying
+ * the caller-supplied load 'flags'.
+ *
+ * On success returns 0 and hands the record over in *ret. JSON parse errors and records rejected by
+ * user_record_load() are both reported through 'error' as SD_BUS_ERROR_INVALID_ARGS; other failures are
+ * returned as plain negative values. */
+int bus_message_read_home_record(sd_bus_message *m, UserRecordLoadFlags flags, UserRecord **ret, sd_bus_error *error) {
+        _cleanup_(json_variant_unrefp) JsonVariant *parsed = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *record = NULL;
+        unsigned line = 0, column = 0;
+        const char *text;
+        int r;
+
+        assert(ret);
+
+        r = sd_bus_message_read(m, "s", &text);
+        if (r < 0)
+                return r;
+
+        r = json_parse(text, JSON_PARSE_SENSITIVE, &parsed, &line, &column);
+        if (r < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to parse JSON identity record at %u:%u: %m", line, column);
+
+        record = user_record_new();
+        if (!record)
+                return -ENOMEM;
+
+        /* The specific load error is not passed on; the client only learns that the record is invalid. */
+        r = user_record_load(record, parsed, flags);
+        if (r < 0)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "JSON data is not a valid identity record");
+
+        *ret = TAKE_PTR(record);
+        return 0;
+}
diff --git a/src/home/homed-bus.h b/src/home/homed-bus.h
new file mode 100644
index 0000000..977679b
--- /dev/null
+++ b/src/home/homed-bus.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "user-record.h"
+#include "json.h"
+
+/* Both helpers read a single string from the message, parse it as JSON and load it into a new UserRecord
+ * that is returned in *ret; they return 0 on success and a negative value on failure (see homed-bus.c).
+ * bus_message_read_secret() first wraps the parsed JSON under a "secret" key, while
+ * bus_message_read_home_record() loads it directly using the caller-supplied flags. */
+int bus_message_read_secret(sd_bus_message *m, UserRecord **ret, sd_bus_error *error);
+int bus_message_read_home_record(sd_bus_message *m, UserRecordLoadFlags flags, UserRecord **ret, sd_bus_error *error);
diff --git a/src/home/homed-conf.c b/src/home/homed-conf.c
new file mode 100644
index 0000000..ffa4bb3
--- /dev/null
+++ b/src/home/homed-conf.c
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "constants.h"
+#include "home-util.h"
+#include "homed-conf.h"
+
+/* Parses "homed.conf", restricted to the "Home" section, using the gperf-generated lookup table; 'm' is
+ * passed along as the userdata pointer. Returns the result of config_parse_config_file(). */
+int manager_parse_config_file(Manager *m) {
+        int r;
+
+        assert(m);
+
+        r = config_parse_config_file(
+                        "homed.conf",
+                        "Home\0",
+                        config_item_perf_lookup,
+                        homed_gperf_lookup,
+                        CONFIG_PARSE_WARN,
+                        m);
+        return r;
+}
+
+/* Generates config_parse_default_storage() (prototype in homed-conf.h). Presumably it converts the setting's
+ * string into a UserStorage value via the user_storage string table and logs the message given here on
+ * failure — confirm against DEFINE_CONFIG_PARSE_ENUM in conf-parser.h. */
+DEFINE_CONFIG_PARSE_ENUM(config_parse_default_storage, user_storage, UserStorage, "Failed to parse default storage setting");
+
+/* Config parser callback for the default file system type setting.
+ *
+ * 'data' points to the string to update. An empty value resets the string to NULL. A value that
+ * supported_fstype() does not accept is logged as a warning and ignored (returns 0, string left untouched).
+ * Otherwise the value is duplicated into the string; the result of free_and_strdup_warn() is returned. */
+int config_parse_default_file_system_type(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char **target = ASSERT_PTR(data);
+
+        assert(rvalue);
+
+        /* Empty assignment: drop any previously configured value. */
+        if (isempty(rvalue))
+                return free_and_strdup_warn(target, NULL);
+
+        if (!supported_fstype(rvalue)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Unsupported file system, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        return free_and_strdup_warn(target, rvalue);
+}
diff --git a/src/home/homed-conf.h b/src/home/homed-conf.h
new file mode 100644
index 0000000..1defaa9
--- /dev/null
+++ b/src/home/homed-conf.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "homed-manager.h"
+
+/* Parses homed.conf; the Manager is handed to the parser as userdata (see homed-conf.c). */
+int manager_parse_config_file(Manager *m);
+
+/* Lookup function generated by gperf from homed-gperf.gperf. */
+const struct ConfigPerfItem* homed_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
+
+/* Parser callbacks referenced by the gperf table for Home.DefaultStorage and Home.DefaultFileSystemType. */
+CONFIG_PARSER_PROTOTYPE(config_parse_default_storage);
+CONFIG_PARSER_PROTOTYPE(config_parse_default_file_system_type);
diff --git a/src/home/homed-gperf.gperf b/src/home/homed-gperf.gperf
new file mode 100644
index 0000000..39aca35
--- /dev/null
+++ b/src/home/homed-gperf.gperf
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+%{
+/* Silence -Wimplicit-fallthrough for this file; presumably the gperf-generated lookup code relies on switch
+ * fallthrough. */
+#if __GNUC__ >= 7
+_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
+#endif
+/* offsetof() is used in the keyword table below. */
+#include <stddef.h>
+#include "conf-parser.h"
+#include "homed-conf.h"
+%}
+struct ConfigPerfItem;
+%null_strings
+%language=ANSI-C
+%define slot-name section_and_lvalue
+%define hash-function-name homed_gperf_hash
+%define lookup-function-name homed_gperf_lookup
+%readonly-tables
+%omit-struct-type
+%struct-type
+%includes
+%%
+Home.DefaultStorage,        config_parse_default_storage,          0, offsetof(Manager, default_storage)
+Home.DefaultFileSystemType, config_parse_default_file_system_type, 0, offsetof(Manager, default_file_system_type)
diff --git a/src/home/homed-home-bus.c b/src/home/homed-home-bus.c
new file mode 100644
index 0000000..a47f4d8
--- /dev/null
+++ b/src/home/homed-home-bus.c
@@ -0,0 +1,926 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/capability.h>
+
+#include "bus-common-errors.h"
+#include "bus-polkit.h"
+#include "fd-util.h"
+#include "homed-bus.h"
+#include "homed-home-bus.h"
+#include "homed-home.h"
+#include "strv.h"
+#include "user-record-util.h"
+#include "user-util.h"
+
+/* Property getter: appends a "(suusss)" struct consisting of user name, UID, GID, real name, home directory
+ * and shell. If the home has no record attached, the GID is GID_INVALID and the three strings taken from
+ * the record are passed as NULL. */
+static int property_get_unix_record(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Home *home = ASSERT_PTR(userdata);
+        const char *real_name = NULL, *home_directory = NULL, *shell = NULL;
+        uint32_t gid = GID_INVALID;
+
+        assert(bus);
+        assert(reply);
+
+        if (home->record) {
+                gid = (uint32_t) user_record_gid(home->record);
+                real_name = user_record_real_name(home->record);
+                home_directory = user_record_home_directory(home->record);
+                shell = user_record_shell(home->record);
+        }
+
+        return sd_bus_message_append(
+                        reply, "(suusss)",
+                        home->user_name,
+                        (uint32_t) home->uid,
+                        gid,
+                        real_name,
+                        home_directory,
+                        shell);
+}
+
+/* Property getter: appends the home's current state, converted to its string name, as an "s" value. */
+static int property_get_state(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Home *home = ASSERT_PTR(userdata);
+        const char *state_name;
+
+        assert(bus);
+        assert(reply);
+
+        state_name = home_state_to_string(home_get_state(home));
+
+        return sd_bus_message_append(reply, "s", state_name);
+}
+
+/* Checks whether the sender of 'message' is trusted with respect to home 'h', i.e. whether its effective UID
+ * is either 0 or the UID of the home.
+ *
+ * Returns > 0 if trusted, 0 if not, -EINVAL if no message was passed, or another negative value if the
+ * sender's credentials could not be queried. */
+int bus_home_client_is_trusted(Home *h, sd_bus_message *message) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *sender = NULL;
+        uid_t sender_euid;
+        int r;
+
+        assert(h);
+
+        if (!message)
+                return -EINVAL;
+
+        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &sender);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_creds_get_euid(sender, &sender_euid);
+        if (r < 0)
+                return r;
+
+        if (sender_euid == 0)
+                return true;
+
+        return sender_euid == h->uid;
+}
+
+/* Formats the status-augmented user record of home 'h' as a JSON string for the sender of 'message'.
+ *
+ * The secret section is always stripped. The privileged section is allowed only if the sender is trusted
+ * (see bus_home_client_is_trusted()) and stripped otherwise; if trust cannot be determined the sender is
+ * treated as untrusted.
+ *
+ * On success returns 0, stores the formatted string in *ret and, if 'ret_incomplete' is non-NULL, the
+ * record's 'incomplete' flag there. Returns a negative value on failure. */
+int bus_home_get_record_json(
+                Home *h,
+                sd_bus_message *message,
+                char **ret,
+                bool *ret_incomplete) {
+
+        _cleanup_(user_record_unrefp) UserRecord *augmented = NULL;
+        UserRecordLoadFlags flags;
+        int r, trusted;
+
+        assert(h);
+        assert(ret);
+
+        trusted = bus_home_client_is_trusted(h, message);
+        if (trusted < 0) {
+                log_warning_errno(trusted, "Failed to determine whether client is trusted, assuming untrusted.");
+                trusted = false;
+        }
+
+        flags = USER_RECORD_REQUIRE_REGULAR|
+                USER_RECORD_ALLOW_PER_MACHINE|
+                USER_RECORD_ALLOW_BINDING|
+                USER_RECORD_STRIP_SECRET|
+                USER_RECORD_ALLOW_STATUS|
+                USER_RECORD_ALLOW_SIGNATURE|
+                USER_RECORD_PERMISSIVE;
+        flags |= trusted ? USER_RECORD_ALLOW_PRIVILEGED : USER_RECORD_STRIP_PRIVILEGED;
+
+        r = home_augment_status(h, flags, &augmented);
+        if (r < 0)
+                return r;
+
+        r = json_variant_format(augmented->json, 0, ret);
+        if (r < 0)
+                return r;
+
+        if (ret_incomplete)
+                *ret_incomplete = augmented->incomplete;
+
+        return 0;
+}
+
+/* Property getter: appends an "(sb)" struct holding the home's user record as a JSON string plus the
+ * record's 'incomplete' flag. The record is tailored to the sender of the message currently being
+ * processed on 'bus' (see bus_home_get_record_json()). */
+static int property_get_user_record(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *formatted = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        bool incomplete;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = bus_home_get_record_json(home, sd_bus_get_current_message(bus), &formatted, &incomplete);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_append(reply, "(sb)", formatted, incomplete);
+}
+
+/* Method handler: starts activation of the home with the secret record read from the message.
+ *
+ * Returns 1 once the operation was started and the message was stored so that it can be replied to later,
+ * or a negative value if reading the secret, starting the activation or storing the message failed. */
+int bus_home_method_activate(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = home_activate(home, credentials, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        /* The operation is now in process, keep track of this message so that we can later reply to it. */
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: starts deactivation of the home.
+ *
+ * Returns 1 once the operation was started and the message was stored as the home's current message, or a
+ * negative value if either step failed. */
+int bus_home_method_deactivate(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = home_deactivate(home, false, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: unregisters the home after a polkit check for "org.freedesktop.home1.remove-home".
+ *
+ * Returns 1 while the polkit check is still pending (the handler is then invoked again), the result of
+ * sending the method reply once the home was unregistered, or a negative value on failure. */
+int bus_home_method_unregister(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.remove-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &home->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_unregister(home, error);
+        if (r < 0)
+                return r;
+
+        assert(r > 0);
+
+        /* Note that home_unregister() destroyed the Home object here, so no more accesses */
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Method handler: creates the home with the secret record read from the message, after a polkit check for
+ * "org.freedesktop.home1.create-home".
+ *
+ * Returns 1 while the polkit check is pending (the handler is then invoked again) or once creation was
+ * started and the message was stored as the home's current message; a negative value on failure. */
+int bus_home_method_realize(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.create-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &home->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_create(home, credentials, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        /* Creation was requested explicitly on an existing object, hence clear the flag. */
+        home->unregister_on_failure = false;
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: removes the home after a polkit check for "org.freedesktop.home1.remove-home".
+ *
+ * If home_remove() finishes right away (positive return) the method reply is sent immediately; otherwise
+ * the message is stored as the home's current message and 1 is returned. Also returns 1 while the polkit
+ * check is pending, and a negative value on failure. */
+int bus_home_method_remove(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.remove-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &home->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_remove(home, error);
+        if (r < 0)
+                return r;
+        if (r > 0) /* Done already. Note that home_remove() destroyed the Home object here, so no more accesses */
+                return sd_bus_reply_method_return(message, NULL);
+
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: fixates the home with the secret record read from the message.
+ *
+ * Returns 1 once the operation was started and the message was stored as the home's current message, or a
+ * negative value if any step failed. */
+int bus_home_method_fixate(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = home_fixate(home, credentials, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: authenticates against the home with the secret record read from the message, after a
+ * polkit check for "org.freedesktop.home1.authenticate-home". Unlike most other handlers in this file the
+ * home's own UID (rather than UID_INVALID) is passed to the polkit check.
+ *
+ * Returns 1 while the polkit check is pending (the handler is then invoked again) or once the operation was
+ * started and the message was stored as the home's current message; a negative value on failure. */
+int bus_home_method_authenticate(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.authenticate-home",
+                        NULL,
+                        true,
+                        home->uid,
+                        &home->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_authenticate(home, credentials, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Applies the already parsed user record 'hr' to home 'h' on behalf of 'message'.
+ *
+ * The record is first checked with user_record_is_supported(), then a polkit check for
+ * "org.freedesktop.home1.update-home" is done, and finally the update is started.
+ *
+ * Returns 1 while the polkit check is pending (the handler is then invoked again) or once the update was
+ * started and the message was stored as the home's current message; a negative value on failure. */
+int bus_home_method_update_record(Home *h, sd_bus_message *message, UserRecord *hr, sd_bus_error *error) {
+        int r;
+
+        assert(h);
+        assert(message);
+        assert(hr);
+
+        r = user_record_is_supported(hr, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.update-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &h->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_update(h, hr, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!h->current_operation);
+
+        r = home_set_current_message(h, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: reads a full user record (regular section and secret required; privileged, per-machine
+ * and signature sections allowed) from the message and passes it to bus_home_method_update_record(). */
+int bus_home_method_update(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *record = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        UserRecordLoadFlags load_flags =
+                USER_RECORD_REQUIRE_REGULAR|
+                USER_RECORD_REQUIRE_SECRET|
+                USER_RECORD_ALLOW_PRIVILEGED|
+                USER_RECORD_ALLOW_PER_MACHINE|
+                USER_RECORD_ALLOW_SIGNATURE|
+                USER_RECORD_PERMISSIVE;
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_home_record(message, load_flags, &record, error);
+        if (r < 0)
+                return r;
+
+        return bus_home_method_update_record(home, message, record, error);
+}
+
+/* Method handler: reads a 64-bit size ("t") followed by a secret record from the message and, after a
+ * polkit check for "org.freedesktop.home1.resize-home", starts a non-automatic resize of the home.
+ *
+ * Returns 1 while the polkit check is pending (the handler is then invoked again) or once the resize was
+ * started and the message was stored as the home's current message; a negative value on failure. */
+int bus_home_method_resize(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        uint64_t requested_size;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "t", &requested_size);
+        if (r < 0)
+                return r;
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.resize-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &home->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_resize(home, requested_size, credentials, /* automatic= */ false, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: reads two secret records from the message — the new one first, then the old one — and,
+ * after a polkit check for "org.freedesktop.home1.passwd-home" (to which the home's own UID is passed),
+ * starts the password change.
+ *
+ * Returns 1 while the polkit check is pending (the handler is then invoked again) or once the operation was
+ * started and the message was stored as the home's current message; a negative value on failure. */
+int bus_home_method_change_password(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *new_secret = NULL, *old_secret = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        /* Order matters: the message carries the new secret before the old one. */
+        r = bus_message_read_secret(message, &new_secret, error);
+        if (r < 0)
+                return r;
+
+        r = bus_message_read_secret(message, &old_secret, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.passwd-home",
+                        NULL,
+                        true,
+                        home->uid,
+                        &home->manager->polkit_registry,
+                        error);
+        if (r <= 0)
+                return r < 0 ? r : 1; /* On zero: will call us back */
+
+        r = home_passwd(home, new_secret, old_secret, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: locks the home.
+ *
+ * If home_lock() completes right away (positive return) the method reply is sent immediately. Otherwise the
+ * message is stored so it can be replied to later and 1 is returned. Returns a negative value on
+ * failure. */
+int bus_home_method_lock(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = home_lock(home, error);
+        if (r < 0)
+                return r;
+        if (r > 0) /* Done */
+                return sd_bus_reply_method_return(message, NULL);
+
+        /* The operation is now in process, keep track of this message so that we can later reply to it. */
+        assert(!home->current_operation);
+
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: unlocks the home with the secret record read from the message.
+ *
+ * Returns 1 once the operation was started and the message was stored so that it can be replied to later,
+ * or a negative value if any step failed. */
+int bus_home_method_unlock(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        Home *home = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = home_unlock(home, credentials, error);
+        if (r < 0)
+                return r;
+
+        assert(r == 0);
+        assert(!home->current_operation);
+
+        /* The operation is now in process, keep track of this message so that we can later reply to it. */
+        r = home_set_current_message(home, message);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: reads a secret record and a boolean from the message, creates a FIFO for the home and
+ * queues an OPERATION_ACQUIRE operation that carries both the secret and the FIFO file descriptor.
+ *
+ * Returns 1 once the operation was scheduled. If the FIFO cannot be created an error reply is sent directly
+ * and the result of sending it is returned; other failures yield a negative value. */
+int bus_home_method_acquire(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *credentials = NULL;
+        _cleanup_(operation_unrefp) Operation *op = NULL;
+        _cleanup_close_ int fifo_fd = -EBADF;
+        int r, please_suspend;
+        Home *home = ASSERT_PTR(userdata);
+
+        assert(message);
+
+        r = bus_message_read_secret(message, &credentials, error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(message, "b", &please_suspend);
+        if (r < 0)
+                return r;
+
+        /* This operation might not be something we can execute immediately, hence queue it */
+        fifo_fd = home_create_fifo(home, please_suspend);
+        if (fifo_fd < 0)
+                return sd_bus_reply_method_errnof(message, fifo_fd, "Failed to allocate FIFO for %s: %m", home->user_name);
+
+        op = operation_new(OPERATION_ACQUIRE, message);
+        if (!op)
+                return -ENOMEM;
+
+        /* From here on the operation object owns the secret and the file descriptor. */
+        op->secret = TAKE_PTR(credentials);
+        op->send_fd = TAKE_FD(fifo_fd);
+
+        r = home_schedule_operation(home, op, error);
+        return r < 0 ? r : 1;
+}
+
+/* Method handler: reads a boolean from the message and, provided the home is in an active state, creates a
+ * FIFO for it and returns the file descriptor ("h") in the method reply.
+ *
+ * Homes that are absent, not active, locked, or otherwise busy are refused with the matching bus error. If
+ * the FIFO cannot be created an error reply is sent directly. */
+int bus_home_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_close_ int fifo_fd = -EBADF;
+        Home *home = ASSERT_PTR(userdata);
+        HomeState state;
+        int please_suspend, r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "b", &please_suspend);
+        if (r < 0)
+                return r;
+
+        state = home_get_state(home);
+
+        if (state == HOME_ABSENT)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", home->user_name);
+
+        if (state == HOME_UNFIXATED || state == HOME_INACTIVE || state == HOME_DIRTY)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_NOT_ACTIVE, "Home %s not active.", home->user_name);
+
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", home->user_name);
+
+        /* Any remaining state that is not an active one is treated as an operation in progress. */
+        if (!HOME_STATE_IS_ACTIVE(state))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", home->user_name);
+
+        fifo_fd = home_create_fifo(home, please_suspend);
+        if (fifo_fd < 0)
+                return sd_bus_reply_method_errnof(message, fifo_fd, "Failed to allocate FIFO for %s: %m", home->user_name);
+
+        return sd_bus_reply_method_return(message, "h", fifo_fd);
+}
+
+/* D-Bus handler for Release(): wraps the call in an OPERATION_RELEASE operation and hands it to
+ * home_schedule_operation(). Returns 1 once the operation was handed over, negative on error. */
+int bus_home_method_release(
+                sd_bus_message *message,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Home *h = ASSERT_PTR(userdata);
+        _cleanup_(operation_unrefp) Operation *op = NULL;
+        int r;
+
+        assert(message);
+
+        op = operation_new(OPERATION_RELEASE, message);
+        if (!op)
+                return -ENOMEM;
+
+        r = home_schedule_operation(h, op, error);
+        return r < 0 ? r : 1;
+}
+
+/* We map a uid_t as uint32_t bus property, let's ensure this is safe. */
+assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+
+/* Generates the bus object path of a home by encoding its user name below the
+ * "/org/freedesktop/home1/home" prefix. On success the path is stored in *ret; returns the result of
+ * sd_bus_path_encode(). */
+int bus_home_path(Home *h, char **ret) {
+        assert(h); /* h->user_name is dereferenced below */
+        assert(ret);
+
+        return sd_bus_path_encode("/org/freedesktop/home1/home", h->user_name, ret);
+}
+
+/* Object lookup callback for the home fallback vtable: decodes the last component of the object path and
+ * resolves it to a Home, treating it as a UID if it parses as one and as a user name otherwise. Returns 1
+ * and sets *found on a match, 0 if there is no such object. */
+static int bus_home_object_find(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                void *userdata,
+                void **found,
+                sd_bus_error *error) {
+
+        Manager *m = userdata;
+        _cleanup_free_ char *suffix = NULL;
+        Home *match;
+        uid_t uid;
+
+        if (sd_bus_path_decode(path, "/org/freedesktop/home1/home", &suffix) <= 0)
+                return 0;
+
+        match = parse_uid(suffix, &uid) < 0
+                ? hashmap_get(m->homes_by_name, suffix)
+                : hashmap_get(m->homes_by_uid, UID_TO_PTR(uid));
+        if (!match)
+                return 0;
+
+        *found = match;
+        return 1;
+}
+
+/* Node enumerator callback: builds a NULL-terminated string array with the bus path of every home
+ * registered in the manager's by-UID table and passes ownership of it to the caller via *nodes. */
+static int bus_home_node_enumerator(
+                sd_bus *bus,
+                const char *path,
+                void *userdata,
+                char ***nodes,
+                sd_bus_error *error) {
+
+        Manager *m = userdata;
+        _cleanup_strv_free_ char **paths = NULL;
+        char **slot;
+        Home *h;
+        int r;
+
+        assert(nodes);
+
+        /* One slot per home, plus the terminating NULL entry */
+        paths = new0(char*, hashmap_size(m->homes_by_uid) + 1);
+        if (!paths)
+                return -ENOMEM;
+
+        slot = paths;
+        HASHMAP_FOREACH(h, m->homes_by_uid) {
+                r = bus_home_path(h, slot);
+                if (r < 0)
+                        return r;
+
+                slot++;
+        }
+
+        *nodes = TAKE_PTR(paths);
+        return 1;
+}
+
+/* Bus vtable describing the properties and methods of a per-home object. Entries that carry secrets or
+ * user records as arguments are marked SD_BUS_VTABLE_SENSITIVE. */
+const sd_bus_vtable home_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+
+        /* Properties: the first two are read directly from Home fields, the others via getter callbacks */
+        SD_BUS_PROPERTY("UserName", "s",
+                        NULL, offsetof(Home, user_name),
+                        SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("UID", "u",
+                        NULL, offsetof(Home, uid),
+                        SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("UnixRecord", "(suusss)",
+                        property_get_unix_record, 0,
+                        SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("State", "s",
+                        property_get_state, 0,
+                        0),
+        SD_BUS_PROPERTY("UserRecord", "(sb)",
+                        property_get_user_record, 0,
+                        SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Methods */
+        SD_BUS_METHOD_WITH_ARGS("Activate",
+                                SD_BUS_ARGS("s", secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_activate,
+                                SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD("Deactivate", NULL, NULL, bus_home_method_deactivate, 0),
+        SD_BUS_METHOD("Unregister", NULL, NULL, bus_home_method_unregister, SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Realize",
+                                SD_BUS_ARGS("s", secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_realize,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        SD_BUS_METHOD("Remove", NULL, NULL, bus_home_method_remove, SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Fixate",
+                                SD_BUS_ARGS("s", secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_fixate,
+                                SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("Authenticate",
+                                SD_BUS_ARGS("s", secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_authenticate,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("Update",
+                                SD_BUS_ARGS("s", user_record),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_update,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("Resize",
+                                SD_BUS_ARGS("t", size, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_resize,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("ChangePassword",
+                                SD_BUS_ARGS("s", new_secret, "s", old_secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_change_password,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD("Lock", NULL, NULL, bus_home_method_lock, 0),
+        SD_BUS_METHOD_WITH_ARGS("Unlock",
+                                SD_BUS_ARGS("s", secret),
+                                SD_BUS_NO_RESULT,
+                                bus_home_method_unlock,
+                                SD_BUS_VTABLE_SENSITIVE),
+        /* Acquire and Ref are the two methods that return a file descriptor ("h") to the caller */
+        SD_BUS_METHOD_WITH_ARGS("Acquire",
+                                SD_BUS_ARGS("s", secret, "b", please_suspend),
+                                SD_BUS_RESULT("h", send_fd),
+                                bus_home_method_acquire,
+                                SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("Ref",
+                                SD_BUS_ARGS("b", please_suspend),
+                                SD_BUS_RESULT("h", send_fd),
+                                bus_home_method_ref,
+                                0),
+        SD_BUS_METHOD("Release", NULL, NULL, bus_home_method_release, 0),
+        SD_BUS_VTABLE_END
+};
+
+/* Object implementation for the "org.freedesktop.home1.Home" interface: home_vtable is installed as
+ * fallback vtable below "/org/freedesktop/home1/home", with bus_home_object_find() resolving object paths
+ * to Home objects and bus_home_node_enumerator() listing the existing ones. */
+const BusObjectImplementation home_object = {
+        "/org/freedesktop/home1/home",
+        "org.freedesktop.home1.Home",
+        .fallback_vtables = BUS_FALLBACK_VTABLES({home_vtable, bus_home_object_find}),
+        .node_enumerator = bus_home_node_enumerator,
+        .manager = true,
+};
+
+/* Deferred event handler installed by bus_home_emit_change(): announces the home on the bus. A home that
+ * was not announced yet is sent as a newly added object, an already announced one gets a properties
+ * changed signal. Failures are logged and otherwise ignored; always returns 0. */
+static int on_deferred_change(sd_event_source *s, void *userdata) {
+        Home *h = ASSERT_PTR(userdata);
+        _cleanup_free_ char *object_path = NULL;
+        int r;
+
+        /* This is a one-shot source: release it right away, so that the next change can install a new one */
+        h->deferred_change_event_source = sd_event_source_disable_unref(h->deferred_change_event_source);
+
+        r = bus_home_path(h, &object_path);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to generate home bus path, ignoring: %m");
+                return 0;
+        }
+
+        r = h->announced
+                ? sd_bus_emit_properties_changed_strv(h->manager->bus, object_path, "org.freedesktop.home1.Home", NULL)
+                : sd_bus_emit_object_added(h->manager->bus, object_path);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to send home change event, ignoring: %m");
+                return 0;
+        }
+
+        h->announced = true;
+        return 0;
+}
+
+/* Schedules a deferred bus notification about a change of this home. Returns 1 if a notification is (or
+ * already was) queued, 0 if there is no event loop or it is shutting down, negative on failure to
+ * allocate the event source. */
+int bus_home_emit_change(Home *h) {
+        sd_event *event;
+        int state, r;
+
+        assert(h);
+
+        /* Already queued? Then the pending notification covers this change too. */
+        if (h->deferred_change_event_source)
+                return 1;
+
+        event = h->manager->event;
+        if (!event)
+                return 0;
+
+        state = sd_event_get_state(event);
+        if (state == SD_EVENT_FINISHED || state == SD_EVENT_EXITING)
+                return 0;
+
+        r = sd_event_add_defer(event, &h->deferred_change_event_source, on_deferred_change, h);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate deferred change event source: %m");
+
+        r = sd_event_source_set_priority(h->deferred_change_event_source, SD_EVENT_PRIORITY_IDLE+5);
+        if (r < 0)
+                log_warning_errno(r, "Failed to tweak priority of event source, ignoring: %m");
+
+        (void) sd_event_source_set_description(h->deferred_change_event_source, "deferred-change-event");
+        return 1;
+}
+
+/* Tells bus clients that the home object went away. Only does something if the home was announced before
+ * and a manager with a bus connection exists. Returns 1 if the signal was sent, 0 if there was nothing to
+ * do, negative on error. */
+int bus_home_emit_remove(Home *h) {
+        _cleanup_free_ char *object_path = NULL;
+        int r;
+
+        assert(h);
+
+        if (!h->announced || !h->manager || !h->manager->bus)
+                return 0;
+
+        r = bus_home_path(h, &object_path);
+        if (r >= 0)
+                r = sd_bus_emit_object_removed(h->manager->bus, object_path);
+        if (r < 0)
+                return r;
+
+        h->announced = false;
+        return 1;
+}
diff --git a/src/home/homed-home-bus.h b/src/home/homed-home-bus.h
new file mode 100644
index 0000000..5522178
--- /dev/null
+++ b/src/home/homed-home-bus.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "bus-object.h"
+#include "homed-home.h"
+
+/* NOTE(review): the next two helpers are defined in homed-home-bus.c; presumably they check whether the
+ * sender of 'message' may see sensitive data of the home and render the user record as JSON — confirm
+ * against the implementation. */
+int bus_home_client_is_trusted(Home *h, sd_bus_message *message);
+int bus_home_get_record_json(Home *h, sd_bus_message *message, char **ret, bool *ret_incomplete);
+
+/* Method handlers of the org.freedesktop.home1.Home interface, 'userdata' is the Home object. The one
+ * exception is bus_home_method_update_record(), which takes the Home and a UserRecord directly and is not
+ * itself a vtable entry. */
+int bus_home_method_activate(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_deactivate(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_unregister(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_realize(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_remove(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_fixate(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_authenticate(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_update(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_update_record(Home *home, sd_bus_message *message, UserRecord *hr, sd_bus_error *error);
+int bus_home_method_resize(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_change_password(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_lock(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_unlock(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_acquire(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_home_method_release(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+/* Bus object implementation for per-home objects below /org/freedesktop/home1/home */
+extern const BusObjectImplementation home_object;
+
+/* Generates the bus object path for a home from its user name and stores it in *ret */
+int bus_home_path(Home *h, char **ret);
+
+/* bus_home_emit_change() queues a deferred change/added notification for the home,
+ * bus_home_emit_remove() sends the object removal signal if the home was announced before. */
+int bus_home_emit_change(Home *h);
+int bus_home_emit_remove(Home *h);
diff --git a/src/home/homed-home.c b/src/home/homed-home.c
new file mode 100644
index 0000000..37b3270
--- /dev/null
+++ b/src/home/homed-home.c
@@ -0,0 +1,3214 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_LINUX_MEMFD_H
+#include 
+#endif
+
+#include 
+#include 
+#include 
+
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "bus-common-errors.h"
+#include "bus-locator.h"
+#include "data-fd-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "filesystems.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "home-util.h"
+#include "homed-home-bus.h"
+#include "homed-home.h"
+#include "memfd-util.h"
+#include "missing_magic.h"
+#include "missing_mman.h"
+#include "missing_syscall.h"
+#include "mkdir.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "quota-util.h"
+#include "resize-fs.h"
+#include "set.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "uid-alloc-range.h"
+#include "user-record-password-quality.h"
+#include "user-record-sign.h"
+#include "user-record-util.h"
+#include "user-record.h"
+#include "user-util.h"
+
+/* Retry to deactivate home directories again and again every 15s until it works */
+#define RETRY_DEACTIVATE_USEC (15U * USEC_PER_SEC)
+
+#define HOME_USERS_MAX 500
+#define PENDING_OPERATIONS_MAX 100
+
+assert_cc(HOME_UID_MIN <= HOME_UID_MAX);
+assert_cc(HOME_USERS_MAX <= (HOME_UID_MAX - HOME_UID_MIN + 1));
+
+static int home_start_work(Home *h, const char *verb, UserRecord *hr, UserRecord *secret);
+
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(operation_hash_ops, void, trivial_hash_func, trivial_compare_func, Operation, operation_unref);
+
+/* Checks whether a user record qualifies as homed-managed user. Returns 0 if so, otherwise -EUNATCH (no
+ * user name), -EINVAL (unsuitable name, UID or realm), -EADDRNOTAVAIL (UID/GID in the system or dynamic
+ * range), -EBADSLT (GID differs from UID), or the error suitable_realm() returned. */
+static int suitable_home_record(UserRecord *hr) {
+        gid_t gid;
+        int r;
+
+        assert(hr);
+
+        if (!hr->user_name)
+                return -EUNATCH;
+
+        /* The rules for homed-managed users are stricter than those for user records in general, apply
+         * the stricter ones here. */
+        if (!suitable_user_name(hr->user_name) || !uid_is_valid(hr->uid))
+                return -EINVAL;
+
+        gid = user_record_gid(hr);
+
+        /* Neither UID nor GID may come from the system or dynamic ranges */
+        if (uid_is_system(hr->uid) || gid_is_system(gid) ||
+            uid_is_dynamic(hr->uid) || gid_is_dynamic(gid))
+                return -EADDRNOTAVAIL;
+
+        /* UID and GID have to be the same number */
+        if (gid != (gid_t) hr->uid)
+                return -EBADSLT;
+
+        /* The realm is optional, but if set it has to be suitable too */
+        if (!hr->realm)
+                return 0;
+
+        r = suitable_realm(hr->realm);
+        if (r < 0)
+                return r;
+
+        return r == 0 ? -EINVAL : 0;
+}
+
+/* Allocates a Home object for the user record 'hr' and registers it with the manager 'm' under its user
+ * name, its UID and — if 'sysfs' is non-NULL — its sysfs path.
+ *
+ * Returns 0 on success; the new object is passed back in *ret if ret is non-NULL, and stays registered in
+ * the manager's hashmaps either way. Fails with -EBUSY if the name, UID or sysfs path is already
+ * registered, with -EUSERS if HOME_USERS_MAX homes exist already, with -ENOMEM on allocation failure, or
+ * with whatever suitable_home_record(), hashmap_put() or user_record_clone() return. */
+int home_new(Manager *m, UserRecord *hr, const char *sysfs, Home **ret) {
+        _cleanup_(home_freep) Home *home = NULL;
+        _cleanup_free_ char *nm = NULL, *ns = NULL;
+        int r;
+
+        assert(m);
+        assert(hr);
+
+        r = suitable_home_record(hr);
+        if (r < 0)
+                return r;
+
+        /* Refuse duplicates in any of the three lookup tables */
+        if (hashmap_contains(m->homes_by_name, hr->user_name))
+                return -EBUSY;
+
+        if (hashmap_contains(m->homes_by_uid, UID_TO_PTR(hr->uid)))
+                return -EBUSY;
+
+        if (sysfs && hashmap_contains(m->homes_by_sysfs, sysfs))
+                return -EBUSY;
+
+        if (hashmap_size(m->homes_by_name) >= HOME_USERS_MAX)
+                return -EUSERS;
+
+        /* Copy the strings first, so that the object below can take ownership of them */
+        nm = strdup(hr->user_name);
+        if (!nm)
+                return -ENOMEM;
+
+        if (sysfs) {
+                ns = strdup(sysfs);
+                if (!ns)
+                        return -ENOMEM;
+        }
+
+        home = new(Home, 1);
+        if (!home)
+                return -ENOMEM;
+
+        *home = (Home) {
+                .manager = m,
+                .user_name = TAKE_PTR(nm),
+                .uid = hr->uid,
+                .state = _HOME_STATE_INVALID,
+                .worker_stdout_fd = -EBADF,
+                .sysfs = TAKE_PTR(ns),
+                .signed_locally = -1,
+                .pin_fd = -EBADF,
+                .luks_lock_fd = -EBADF,
+        };
+
+        /* From here on a failure runs home_free() via the cleanup handler, which also drops whatever
+         * hashmap entries were added so far. */
+        r = hashmap_put(m->homes_by_name, home->user_name, home);
+        if (r < 0)
+                return r;
+
+        r = hashmap_put(m->homes_by_uid, UID_TO_PTR(home->uid), home);
+        if (r < 0)
+                return r;
+
+        if (home->sysfs) {
+                r = hashmap_put(m->homes_by_sysfs, home->sysfs, home);
+                if (r < 0)
+                        return r;
+        }
+
+        r = user_record_clone(hr, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE, &home->record);
+        if (r < 0)
+                return r;
+
+        /* Notifications and rebalancing are best-effort, their results are ignored on purpose */
+        (void) bus_manager_emit_auto_login_changed(m);
+        (void) bus_home_emit_change(home);
+        (void) manager_schedule_rebalance(m, /* immediately= */ false);
+
+        /* Either way disarm the cleanup handler: the object remains referenced by the hashmaps */
+        if (ret)
+                *ret = TAKE_PTR(home);
+        else
+                TAKE_PTR(home);
+
+        return 0;
+}
+
+/* Destroys a Home object: announces its removal on the bus, unregisters it from all lookup tables of its
+ * manager, and releases its records, event sources, pending/current operations and file descriptors.
+ * Accepts NULL. Always returns NULL. */
+Home *home_free(Home *h) {
+
+        if (!h)
+                return NULL;
+
+        if (h->manager) {
+                /* Bus notifications are best-effort, hence the ignored results */
+                (void) bus_home_emit_remove(h);
+                (void) bus_manager_emit_auto_login_changed(h->manager);
+
+                /* hashmap_remove_value() is given both key and value, so only entries that actually
+                 * point to this object are dropped. */
+                if (h->user_name)
+                        (void) hashmap_remove_value(h->manager->homes_by_name, h->user_name, h);
+
+                if (uid_is_valid(h->uid))
+                        (void) hashmap_remove_value(h->manager->homes_by_uid, UID_TO_PTR(h->uid), h);
+
+                if (h->sysfs)
+                        (void) hashmap_remove_value(h->manager->homes_by_sysfs, h->sysfs, h);
+
+                if (h->worker_pid > 0)
+                        (void) hashmap_remove_value(h->manager->homes_by_worker_pid, PID_TO_PTR(h->worker_pid), h);
+
+                /* Don't leave a dangling pointer behind in the manager's GC state */
+                if (h->manager->gc_focus == h)
+                        h->manager->gc_focus = NULL;
+
+                (void) manager_schedule_rebalance(h->manager, /* immediately= */ false);
+        }
+
+        user_record_unref(h->record);
+        user_record_unref(h->secret);
+
+        h->worker_event_source = sd_event_source_disable_unref(h->worker_event_source);
+        safe_close(h->worker_stdout_fd);
+        /* The key strings are freed only after the hashmap entries referencing them were removed above */
+        free(h->user_name);
+        free(h->sysfs);
+
+        h->ref_event_source_please_suspend = sd_event_source_disable_unref(h->ref_event_source_please_suspend);
+        h->ref_event_source_dont_suspend = sd_event_source_disable_unref(h->ref_event_source_dont_suspend);
+
+        h->pending_operations = ordered_set_free(h->pending_operations);
+        h->pending_event_source = sd_event_source_disable_unref(h->pending_event_source);
+        h->deferred_change_event_source = sd_event_source_disable_unref(h->deferred_change_event_source);
+
+        h->current_operation = operation_unref(h->current_operation);
+
+        safe_close(h->pin_fd);
+        safe_close(h->luks_lock_fd);
+
+        h->retry_deactivate_event_source = sd_event_source_disable_unref(h->retry_deactivate_event_source);
+
+        return mfree(h);
+}
+
+/* Replaces the user record of a home with 'hr'.
+ *
+ * Returns 0 on success, also if the new record equals the current one (in which case nothing happens).
+ * Fails with -EREMCHG if the new record is not compatible with the current one, with -EINVAL if it lacks
+ * the regular section or carries a secret section, with -EBUSY if its UID belongs to a different home,
+ * or with whatever suitable_home_record() and the JSON/record helpers return. If the current record has
+ * a "status" section it is carried over into the new record. */
+int home_set_record(Home *h, UserRecord *hr) {
+        _cleanup_(user_record_unrefp) UserRecord *new_hr = NULL;
+        Home *other;
+        int r;
+
+        assert(h);
+        assert(h->user_name);
+        assert(h->record);
+        assert(hr);
+
+        if (user_record_equal(h->record, hr))
+                return 0;
+
+        r = suitable_home_record(hr);
+        if (r < 0)
+                return r;
+
+        if (!user_record_compatible(h->record, hr))
+                return -EREMCHG;
+
+        if (!FLAGS_SET(hr->mask, USER_RECORD_REGULAR) ||
+            FLAGS_SET(hr->mask, USER_RECORD_SECRET))
+                return -EINVAL;
+
+        if (FLAGS_SET(h->record->mask, USER_RECORD_STATUS)) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+                /* Hmm, the existing record has status fields? If so, copy them over */
+
+                v = json_variant_ref(hr->json);
+                r = json_variant_set_field(&v, "status", json_variant_by_key(h->record->json, "status"));
+                if (r < 0)
+                        return r;
+
+                new_hr = user_record_new();
+                if (!new_hr)
+                        return -ENOMEM;
+
+                r = user_record_load(new_hr, v, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE);
+                if (r < 0)
+                        return r;
+
+                /* Continue with the merged record; new_hr keeps it referenced until we return */
+                hr = new_hr;
+        }
+
+        /* The (possibly new) UID must not be taken by some other home */
+        other = hashmap_get(h->manager->homes_by_uid, UID_TO_PTR(hr->uid));
+        if (other && other != h)
+                return -EBUSY;
+
+        /* If the UID changed, move our entry in the by-UID table to the new key */
+        if (h->uid != hr->uid) {
+                r = hashmap_remove_and_replace(h->manager->homes_by_uid, UID_TO_PTR(h->uid), UID_TO_PTR(hr->uid), h);
+                if (r < 0)
+                        return r;
+        }
+
+        user_record_unref(h->record);
+        h->record = user_record_ref(hr);
+        h->uid = h->record->uid;
+
+        /* The updated record might have a different autologin setting, trigger a PropertiesChanged event for it */
+        (void) bus_manager_emit_auto_login_changed(h->manager);
+        (void) bus_home_emit_change(h);
+
+        return 0;
+}
+
+/* Writes the home's user record as pretty-printed JSON to "<home_record_dir()>/<user_name>.identity",
+ * creating the containing directories if necessary. The file is written atomically with mode 0600 and
+ * synced. Returns 0 on success, a negative errno-style value on failure. */
+int home_save_record(Home *h) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_free_ char *text = NULL;
+        const char *fn;
+        int r;
+
+        assert(h);
+
+        /* Normalization is best-effort: if it fails, the record is still written as it is. Log the
+         * cause so that the failure can be diagnosed. */
+        v = json_variant_ref(h->record->json);
+        r = json_variant_normalize(&v);
+        if (r < 0)
+                log_warning_errno(r, "User record could not be normalized, ignoring: %m");
+
+        r = json_variant_format(v, JSON_FORMAT_PRETTY|JSON_FORMAT_NEWLINE, &text);
+        if (r < 0)
+                return r;
+
+        /* Failures here (including "already exists") are ignored, the write below reports real problems */
+        (void) mkdir("/var/lib/systemd/", 0755);
+        (void) mkdir(home_record_dir(), 0700);
+
+        fn = strjoina(home_record_dir(), "/", h->user_name, ".identity");
+
+        r = write_string_file(fn, text, WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0600|WRITE_STRING_FILE_SYNC);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Removes the on-disk identity file of the home and its ".ref" file below /run/systemd/home/. Files that
+ * don't exist are not an error. Returns 0 on success, or -errno of the first unlink() that failed. */
+int home_unlink_record(Home *h) {
+        const char *identity_path, *ref_path;
+
+        assert(h);
+
+        identity_path = strjoina(home_record_dir(), "/", h->user_name, ".identity");
+        if (unlink(identity_path) < 0) {
+                if (errno != ENOENT)
+                        return -errno;
+        }
+
+        ref_path = strjoina("/run/systemd/home/", h->user_name, ".ref");
+        if (unlink(ref_path) < 0) {
+                if (errno != ENOENT)
+                        return -errno;
+        }
+
+        return 0;
+}
+
+/* Closes the fd pinning the home directory, if there is one */
+static void home_unpin(Home *h) {
+        assert(h);
+
+        if (h->pin_fd >= 0) {
+                h->pin_fd = safe_close(h->pin_fd);
+                log_debug("Successfully closed pin fd on home for %s.", h->user_name);
+        }
+}
+
+/* Opens the home directory and keeps the fd in h->pin_fd, unless it is pinned already. Failures are
+ * logged as warnings and otherwise ignored. */
+static void home_pin(Home *h) {
+        const char *dir;
+
+        assert(h);
+
+        /* Nothing to do if we hold a pin fd already */
+        if (h->pin_fd >= 0)
+                return;
+
+        dir = user_record_home_directory(h->record);
+        if (!dir) {
+                log_warning("No home directory path to pin for %s, ignoring.", h->user_name);
+                return;
+        }
+
+        h->pin_fd = open(dir, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        if (h->pin_fd >= 0) {
+                log_debug("Successfully pinned home directory '%s'.", dir);
+                return;
+        }
+
+        log_warning_errno(errno, "Couldn't open home directory '%s' for pinning, ignoring: %m", dir);
+}
+
+/* Pins or unpins the home directory depending on whether the given state calls for a pin. A negative
+ * 'state' means: query the current state of the home. */
+static void home_update_pin_fd(Home *h, HomeState state) {
+        assert(h);
+
+        if (state < 0)
+                state = home_get_state(h);
+
+        if (HOME_STATE_SHALL_PIN(state))
+                home_pin(h);
+        else
+                home_unpin(h);
+}
+
+/* Releases the lock fd on the LUKS backing file once the home is neither active, nor locked, nor busy
+ * with an operation. A negative 'state' means: query the current state of the home. */
+static void home_maybe_close_luks_lock_fd(Home *h, HomeState state) {
+        bool still_needed;
+
+        assert(h);
+
+        if (h->luks_lock_fd < 0)
+                return;
+
+        if (state < 0)
+                state = home_get_state(h);
+
+        /* The lock is held for as long as the home dir is in use or being operated on */
+        still_needed =
+                HOME_STATE_IS_EXECUTING_OPERATION(state) ||
+                HOME_STATE_IS_ACTIVE(state) ||
+                state == HOME_LOCKED;
+        if (still_needed)
+                return;
+
+        h->luks_lock_fd = safe_close(h->luks_lock_fd);
+        log_debug("Successfully closed LUKS backing file lock for %s.", h->user_name);
+}
+
+/* Drops the deactivation retry timer unless it is still needed */
+static void home_maybe_stop_retry_deactivate(Home *h, HomeState state) {
+        bool referenced;
+
+        assert(h);
+
+        /* The retry event source is released as soon as we won't need it anymore: when the home
+         * directory is deactivated already (i.e. we were successful), when an operation starts that
+         * indicates the home directory is going to be used or operated on again, or when the home is
+         * referenced again. */
+
+        referenced = h->ref_event_source_dont_suspend || h->ref_event_source_please_suspend;
+
+        if (referenced || !HOME_STATE_MAY_RETRY_DEACTIVATE(state))
+                h->retry_deactivate_event_source = sd_event_source_disable_unref(h->retry_deactivate_event_source);
+}
+
+static int home_deactivate_internal(Home *h, bool force, sd_bus_error *error);
+static void home_start_retry_deactivate(Home *h);
+
+/* Timer callback of the deactivation retry logic, see home_start_retry_deactivate(). Always returns 0. */
+static int home_on_retry_deactivate(sd_event_source *s, uint64_t usec, void *userdata) {
+        Home *h = ASSERT_PTR(userdata);
+        HomeState state;
+
+        assert(s);
+
+        /* The retry interval since the last deactivation attempt has elapsed, give it another try. */
+
+        h->retry_deactivate_event_source = sd_event_source_disable_unref(h->retry_deactivate_event_source);
+
+        state = home_get_state(h);
+        if (!HOME_STATE_MAY_RETRY_DEACTIVATE(state))
+                return 0;
+
+        if (!IN_SET(state, HOME_ACTIVE, HOME_LINGERING)) {
+                /* An operation is being executed (specifically, a deactivation might be running
+                 * already), hence simply requeue the timer, so that we retry later. */
+                home_start_retry_deactivate(h);
+                return 0;
+        }
+
+        log_info("Again trying to deactivate home directory.");
+
+        /* No operation is being executed, hence start deactivating now. Note that this restarts our
+         * timer, i.e. we are going to be called again if this doesn't work. */
+        (void) home_deactivate_internal(h, /* force= */ false, NULL);
+
+        return 0;
+}
+
+/* Arms the timer that retries deactivation RETRY_DEACTIVATE_USEC from now, unless it is armed already or
+ * the home is currently referenced. Failure to install the timer is logged and otherwise ignored. */
+static void home_start_retry_deactivate(Home *h) {
+        int r;
+
+        assert(h);
+        assert(h->manager);
+
+        /* Skip if the timer exists already, or if the home directory is in use right now */
+        if (h->retry_deactivate_event_source ||
+            h->ref_event_source_dont_suspend ||
+            h->ref_event_source_please_suspend)
+                return;
+
+        r = sd_event_add_time_relative(
+                        h->manager->event,
+                        &h->retry_deactivate_event_source,
+                        CLOCK_MONOTONIC,
+                        RETRY_DEACTIVATE_USEC,
+                        1*USEC_PER_MINUTE,
+                        home_on_retry_deactivate,
+                        h);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to install retry-deactivate event source, ignoring: %m");
+                return;
+        }
+
+        (void) sd_event_source_set_description(h->retry_deactivate_event_source, "retry-deactivate");
+}
+
+/* Sets the state field of a home, logs the resulting transition and updates everything that depends on
+ * the state: the pin fd, the LUKS lock fd and the deactivation retry timer. If the transition ends the
+ * execution of an operation, the pending operation queue is processed, rebalancing is rescheduled and
+ * the home is enqueued for GC. */
+static void home_set_state(Home *h, HomeState state) {
+        HomeState old_state, new_state;
+
+        assert(h);
+
+        old_state = home_get_state(h);
+        h->state = state;
+        new_state = home_get_state(h); /* Query the new state, since the 'state' variable might be set to -1,
+                                        * in which case we synthesize an high-level state on demand */
+
+        log_info("%s: changing state %s %s %s", h->user_name,
+                 home_state_to_string(old_state),
+                 special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                 home_state_to_string(new_state));
+
+        /* All helpers below get the effective new state, not the raw 'state' argument */
+        home_update_pin_fd(h, new_state);
+        home_maybe_close_luks_lock_fd(h, new_state);
+        home_maybe_stop_retry_deactivate(h, new_state);
+
+        if (HOME_STATE_IS_EXECUTING_OPERATION(old_state) && !HOME_STATE_IS_EXECUTING_OPERATION(new_state)) {
+                /* If we just finished executing some operation, process the queue of pending operations. And
+                 * enqueue it for GC too. */
+
+                home_schedule_operation(h, NULL, NULL);
+                manager_reschedule_rebalance(h->manager);
+                manager_enqueue_gc(h->manager, h);
+        }
+}
+
+/* Parses the JSON user record a worker wrote to the file behind '_fd'.
+ *
+ * Takes possession of '_fd' in all cases, i.e. it is closed when this function returns. Returns 0 and sets
+ * *ret to NULL if the file is empty, returns 1 and stores a new UserRecord in *ret if a record was
+ * loaded, and a negative errno-style value (already logged) on failure. */
+static int home_parse_worker_stdout(int _fd, UserRecord **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_close_ int fd = _fd; /* take possession, even on failure */
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        unsigned line, column;
+        struct stat st;
+        int r;
+
+        assert(ret);
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat stdout fd: %m");
+
+        assert(S_ISREG(st.st_mode));
+
+        if (st.st_size == 0) { /* empty record */
+                *ret = NULL;
+                return 0;
+        }
+
+        /* Rewind to the start of the file. Mind the argument order: the offset comes first, then the
+         * whence. */
+        if (lseek(fd, 0, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to seek to beginning of memfd: %m");
+
+        /* From here on the stream owns the fd */
+        f = take_fdopen(&fd, "r");
+        if (!f)
+                return log_error_errno(errno, "Failed to reopen memfd: %m");
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *text = NULL;
+
+                r = read_full_stream(f, &text, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read from client: %m");
+
+                log_debug("Got from worker: %s", text);
+                rewind(f); /* the parser below has to start at the beginning again */
+        }
+
+        r = json_parse_file(f, "stdout", JSON_PARSE_SENSITIVE, &v, &line, &column);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse identity at %u:%u: %m", line, column);
+
+        hr = user_record_new();
+        if (!hr)
+                return log_oom();
+
+        r = user_record_load(hr, v, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return log_error_errno(r, "Failed to load home record identity: %m");
+
+        *ret = TAKE_PTR(hr);
+        return 1;
+}
+
+/* Checks the signature of a user record via manager_verify_user_record() and maps the outcome.
+ *
+ * Returns 0 if the record is acceptable, with *ret_signed_locally telling whether it is signed by our own
+ * key exclusively. Returns a negative errno-style value if the record is unsigned, not signed by any
+ * known key, or verification failed; for the first two cases 'ret_error' is set to
+ * BUS_ERROR_BAD_SIGNATURE. */
+static int home_verify_user_record(Home *h, UserRecord *hr, bool *ret_signed_locally, sd_bus_error *ret_error) {
+        int verdict;
+
+        assert(h);
+        assert(hr);
+        assert(ret_signed_locally);
+
+        verdict = manager_verify_user_record(h->manager, hr);
+
+        if (verdict == USER_RECORD_SIGNED_EXCLUSIVE) {
+                log_info("Home %s is signed exclusively by our key, accepting.", hr->user_name);
+                *ret_signed_locally = true;
+                return 0;
+        }
+
+        if (verdict == USER_RECORD_SIGNED) {
+                log_info("Home %s is signed by our key (and others), accepting.", hr->user_name);
+                *ret_signed_locally = false;
+                return 0;
+        }
+
+        if (verdict == USER_RECORD_FOREIGN) {
+                log_info("Home %s is signed by foreign key we like, accepting.", hr->user_name);
+                *ret_signed_locally = false;
+                return 0;
+        }
+
+        if (verdict == USER_RECORD_UNSIGNED) {
+                sd_bus_error_setf(ret_error, BUS_ERROR_BAD_SIGNATURE, "User record %s is not signed at all, refusing.", hr->user_name);
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Home %s contains user record that is not signed at all, refusing.", hr->user_name);
+        }
+
+        if (verdict == -ENOKEY) {
+                sd_bus_error_setf(ret_error, BUS_ERROR_BAD_SIGNATURE, "User record %s is not signed by any known key, refusing.", hr->user_name);
+                return log_error_errno(verdict, "Home %s contains user record that is not signed by any known key, refusing.", hr->user_name);
+        }
+
+        /* Anything else has to be a plain error */
+        assert(verdict < 0);
+        return log_error_errno(verdict, "Failed to verify signature on user record for %s, refusing fixation: %m", hr->user_name);
+}
+
+static int convert_worker_errno(Home *h, int e, sd_bus_error *error) {
+        /* Converts the error numbers the worker process returned into somewhat sensible dbus errors.
+         *
+         * 'e' is expected to be a negative errno-style value, since all cases below match on negated
+         * errno constants. 'h' is only consulted for its user name, which is embedded in some of the
+         * messages. If a mapping exists, 'error' is filled in and the return value of
+         * sd_bus_error_set()/sd_bus_error_setf() is passed through. If no mapping exists, 'error' is left
+         * untouched and 0 is returned. All callers in this file discard the return value and only use
+         * the (possibly still unset) 'error'. */
+
+        switch (e) {
+
+        case -EMSGSIZE:
+                return sd_bus_error_set(error, BUS_ERROR_BAD_HOME_SIZE, "File systems of this type cannot be shrunk");
+        case -ETXTBSY:
+                return sd_bus_error_set(error, BUS_ERROR_BAD_HOME_SIZE, "File systems of this type can only be shrunk offline");
+        case -ERANGE:
+                return sd_bus_error_set(error, BUS_ERROR_BAD_HOME_SIZE, "File system size too small");
+        case -ENOLINK:
+                return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "System does not support selected storage backend");
+        case -EPROTONOSUPPORT:
+                return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "System does not support selected file system");
+        case -ENOTTY:
+                return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Operation not supported on storage backend");
+        case -ESOCKTNOSUPPORT:
+                return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Operation not supported on file system");
+        case -ENOKEY:
+                return sd_bus_error_setf(error, BUS_ERROR_BAD_PASSWORD, "Password for home %s is incorrect or not sufficient for authentication.", h->user_name);
+        case -EBADSLT:
+                return sd_bus_error_setf(error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN, "Password for home %s is incorrect or not sufficient, and configured security token not found either.", h->user_name);
+        case -EREMOTEIO:
+                return sd_bus_error_setf(error, BUS_ERROR_BAD_RECOVERY_KEY, "Recovery key for home %s is incorrect or not sufficient for authentication.", h->user_name);
+        case -ENOANO:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_PIN_NEEDED, "PIN for security token required.");
+        case -ERFKILL:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_PROTECTED_AUTHENTICATION_PATH_NEEDED, "Security token requires protected authentication path.");
+        case -EMEDIUMTYPE:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_USER_PRESENCE_NEEDED, "Security token requires presence confirmation.");
+        case -ENOCSI:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_USER_VERIFICATION_NEEDED, "Security token requires user verification.");
+        case -ENOSTR:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_ACTION_TIMEOUT, "Token action timeout. (User was supposed to verify presence or similar, by interacting with the token, and didn't do that in time.)");
+        case -EOWNERDEAD:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_PIN_LOCKED, "PIN of security token locked.");
+        case -ENOLCK:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_BAD_PIN, "Bad PIN of security token.");
+        case -ETOOMANYREFS:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_BAD_PIN_FEW_TRIES_LEFT, "Bad PIN of security token, and only a few tries left.");
+        case -EUCLEAN:
+                return sd_bus_error_set(error, BUS_ERROR_TOKEN_BAD_PIN_ONE_TRY_LEFT, "Bad PIN of security token, and only one try left.");
+        case -EBUSY:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "Home %s is currently being used, or an operation on home %s is currently being executed.", h->user_name, h->user_name);
+        case -ENOEXEC:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_NOT_ACTIVE, "Home %s is currently not active", h->user_name);
+        case -ENOSPC:
+                return sd_bus_error_setf(error, BUS_ERROR_NO_DISK_SPACE, "Not enough disk space for home %s", h->user_name);
+        case -EKEYREVOKED:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_CANT_AUTHENTICATE, "Home %s has no password or other authentication mechanism defined.", h->user_name);
+        case -EADDRINUSE:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_IN_USE, "Home %s is currently being used elsewhere.", h->user_name);
+        }
+
+        return 0; /* No dedicated mapping for this code: 'error' stays as the caller passed it in */
+}
+
+static void home_count_bad_authentication(Home *h, bool save) {
+        int r;
+
+        /* Bumps the bad authentication counter of the home's record and, if 'save' is set, writes the
+         * record back out. Failures are logged as warnings only, nothing is reported to the caller. */
+
+        assert(h);
+
+        r = user_record_bad_authentication(h->record);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to increase bad authentication counter, ignoring: %m");
+                return;
+        }
+
+        if (!save)
+                return;
+
+        r = home_save_record(h);
+        if (r < 0)
+                log_warning_errno(r, "Failed to write home record to disk, ignoring: %m");
+}
+
+static void home_fixate_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(user_record_unrefp) UserRecord *secret = NULL;
+        bool locally_signed;
+        int r;
+
+        /* Invoked when the worker that ran for one of the HOME_FIXATING* states has exited. 'ret' is the
+         * worker result, 'hr' the record it handed back (if any). On success the record is installed
+         * and, for the *_FOR_ACTIVATION/*_FOR_ACQUIRE variants, an "activate" worker is started right
+         * away with the secret we kept around. */
+
+        assert(h);
+        assert(IN_SET(h->state, HOME_FIXATING, HOME_FIXATING_FOR_ACTIVATION, HOME_FIXATING_FOR_ACQUIRE));
+
+        /* From here on we own the stashed secret, it is dropped when we leave this function */
+        secret = TAKE_PTR(h->secret);
+
+        if (ret < 0) {
+                if (ret == -ENOKEY)
+                        home_count_bad_authentication(h, false);
+
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Fixation failed: %m");
+                goto fail;
+        }
+
+        if (!hr) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Did not receive user record from worker process, fixation failed.");
+                goto fail;
+        }
+
+        r = home_verify_user_record(h, hr, &locally_signed, &error);
+        if (r < 0)
+                goto fail;
+
+        r = home_set_record(h, hr);
+        if (r < 0) {
+                log_error_errno(r, "Failed to update home record: %m");
+                goto fail;
+        }
+
+        h->signed_locally = locally_signed;
+
+        /* A plain fixation (one that is not followed by an activation) counts as good authentication */
+        if (h->state == HOME_FIXATING) {
+                r = user_record_good_authentication(h->record);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to increase good authentication counter, ignoring: %m");
+        }
+
+        r = home_save_record(h);
+        if (r < 0)
+                log_warning_errno(r, "Failed to write home record to disk, ignoring: %m");
+
+        if (IN_SET(h->state, HOME_FIXATING_FOR_ACTIVATION, HOME_FIXATING_FOR_ACQUIRE)) {
+
+                /* Fixation was only the first step, continue with the activation right away */
+                r = home_start_work(h, "activate", h->record, secret);
+                if (r >= 0) {
+                        home_set_state(h, h->state == HOME_FIXATING_FOR_ACTIVATION ? HOME_ACTIVATING : HOME_ACTIVATING_FOR_ACQUIRE);
+                        return;
+                }
+
+                h->current_operation = operation_result_unref(h->current_operation, r, NULL);
+                home_set_state(h, _HOME_STATE_INVALID);
+                return;
+        }
+
+        log_debug("Fixation of %s completed.", h->user_name);
+
+        h->current_operation = operation_result_unref(h->current_operation, 0, NULL);
+
+        /* Going back to the "invalid" state makes home_get_state() probe whether the image exists, so
+         * that it reports HOME_ABSENT vs. HOME_INACTIVE as appropriate. */
+        home_set_state(h, _HOME_STATE_INVALID);
+        (void) manager_schedule_rebalance(h->manager, /* immediately= */ false);
+        return;
+
+fail:
+        /* A failed fixation leaves the home in unfixated state */
+        h->current_operation = operation_result_unref(h->current_operation, r, &error);
+        home_set_state(h, HOME_UNFIXATED);
+}
+
+static void home_activate_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Invoked when the worker that ran for HOME_ACTIVATING/HOME_ACTIVATING_FOR_ACQUIRE has exited.
+         * 'ret' is the worker result, 'hr' the (optional) record it handed back. The outcome is passed
+         * on to the pending operation, the state is reset to "invalid", and on success an immediate
+         * rebalance is scheduled. */
+
+        assert(h);
+        assert(IN_SET(h->state, HOME_ACTIVATING, HOME_ACTIVATING_FOR_ACQUIRE));
+
+        if (ret < 0) {
+                if (ret == -ENOKEY)
+                        home_count_bad_authentication(h, true);
+
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Activation failed: %m");
+                goto finish;
+        }
+
+        if (hr) {
+                bool signed_locally;
+
+                r = home_verify_user_record(h, hr, &signed_locally, &error);
+                if (r < 0)
+                        goto finish;
+
+                r = home_set_record(h, hr);
+                if (r < 0) {
+                        /* This failure is reported as the operation result below, hence do not claim
+                         * in the log that we would ignore it. */
+                        log_error_errno(r, "Failed to update home record: %m");
+                        goto finish;
+                }
+
+                h->signed_locally = signed_locally;
+
+                /* The two steps below are best-effort bookkeeping, their failure does not fail the
+                 * activation. */
+                r = user_record_good_authentication(h->record);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to increase good authentication counter, ignoring: %m");
+
+                r = home_save_record(h);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to write home record to disk, ignoring: %m");
+        }
+
+        log_debug("Activation of %s completed.", h->user_name);
+        r = 0;
+
+finish:
+        h->current_operation = operation_result_unref(h->current_operation, r, &error);
+        home_set_state(h, _HOME_STATE_INVALID);
+
+        if (r >= 0)
+                (void) manager_schedule_rebalance(h->manager, /* immediately= */ true);
+}
+
+static void home_deactivate_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Invoked when the worker that ran for HOME_DEACTIVATING has exited. Passes the outcome on to
+         * the pending operation and, on success, schedules an immediate rebalance. */
+
+        assert(h);
+        assert(h->state == HOME_DEACTIVATING);
+        assert(!hr); /* Deactivation is not supposed to hand back a record */
+
+        if (ret < 0) {
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Deactivation of %s failed: %m", h->user_name);
+        } else {
+                log_debug("Deactivation of %s completed.", h->user_name);
+                r = 0;
+        }
+
+        h->current_operation = operation_result_unref(h->current_operation, r, &error);
+        home_set_state(h, _HOME_STATE_INVALID);
+
+        if (r >= 0)
+                (void) manager_schedule_rebalance(h->manager, /* immediately= */ true);
+}
+
+static void home_remove_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        Manager *manager;
+        int r;
+
+        /* Invoked when the worker that ran for HOME_REMOVING has exited. On success the stored record
+         * is unlinked and the Home object itself is freed. */
+
+        assert(h);
+        assert(h->state == HOME_REMOVING);
+        assert(!hr); /* Removal is not supposed to hand back a record */
+
+        /* Keep the manager pointer around, since 'h' is gone by the time we schedule the rebalance */
+        manager = h->manager;
+
+        /* -EALREADY from the worker is handled the same way as success */
+        if (ret < 0 && ret != -EALREADY) {
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Removing %s failed: %m", h->user_name);
+        } else {
+                /* Some storage types do not allow us to delete the actual data storage (such as LUKS
+                 * on partitions like USB sticks). Some of these locations are among those we normally
+                 * discover automatically in /home or via udev. Hence, if the image is still around
+                 * after removal, issue a rescan so that "unfixated" entries are rediscovered. */
+                if (!IN_SET(user_record_test_image_path(h->record), USER_TEST_UNDEFINED, USER_TEST_ABSENT))
+                        manager_enqueue_rescan(manager);
+
+                /* The image is off the disk now, drop our stored record as well */
+                r = home_unlink_record(h);
+                if (r < 0)
+                        log_error_errno(r, "Removing record file failed: %m");
+                else {
+                        log_debug("Removal of %s completed.", h->user_name);
+                        h->current_operation = operation_result_unref(h->current_operation, 0, NULL);
+
+                        /* And finally release the in-memory object */
+                        h = home_free(h);
+
+                        (void) manager_schedule_rebalance(manager, /* immediately= */ true);
+                        return;
+                }
+        }
+
+        /* Failure path: report the error and let the state be re-determined */
+        h->current_operation = operation_result_unref(h->current_operation, r, &error);
+        home_set_state(h, _HOME_STATE_INVALID);
+}
+
+static void home_create_finish(Home *h, int ret, UserRecord *hr) {
+        int r;
+
+        /* Invoked when the worker that ran for HOME_CREATING has exited. On failure the Home object
+         * may be unregistered again (if h->unregister_on_failure is set); on success the record is
+         * updated from what the worker returned (if anything) and saved. */
+
+        assert(h);
+        assert(h->state == HOME_CREATING);
+
+        if (ret < 0) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                (void) convert_worker_errno(h, ret, &error);
+                log_error_errno(ret, "Operation on %s failed: %m", h->user_name);
+                h->current_operation = operation_result_unref(h->current_operation, ret, &error);
+
+                if (!h->unregister_on_failure) {
+                        home_set_state(h, _HOME_STATE_INVALID);
+                        return;
+                }
+
+                /* Drop both the stored record and the in-memory object again */
+                (void) home_unlink_record(h);
+                h = home_free(h);
+                return;
+        }
+
+        if (hr) {
+                r = home_set_record(h, hr);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to update home record, ignoring: %m");
+        }
+
+        r = home_save_record(h);
+        if (r < 0)
+                log_warning_errno(r, "Failed to save record to disk, ignoring: %m");
+
+        log_debug("Creation of %s completed.", h->user_name);
+
+        h->current_operation = operation_result_unref(h->current_operation, 0, NULL);
+        home_set_state(h, _HOME_STATE_INVALID);
+
+        (void) manager_schedule_rebalance(h->manager, /* immediately= */ true);
+}
+
+static void home_change_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Invoked when a worker that ran for one of the update/resize/passwd states has exited. Takes
+         * over the record the worker returned (if any) and passes the outcome on to the pending
+         * operation. */
+
+        assert(h);
+
+        if (ret < 0) {
+                if (ret == -ENOKEY)
+                        home_count_bad_authentication(h, true);
+
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Change operation failed: %m");
+        } else {
+                if (hr) {
+                        r = home_set_record(h, hr);
+                        if (r >= 0) {
+                                r = user_record_good_authentication(h->record);
+                                if (r < 0)
+                                        log_warning_errno(r, "Failed to increase good authentication counter, ignoring: %m");
+
+                                r = home_save_record(h);
+                                if (r < 0)
+                                        log_warning_errno(r, "Failed to write home record to disk, ignoring: %m");
+                        } else
+                                log_warning_errno(r, "Failed to update home record, ignoring: %m");
+                }
+
+                log_debug("Change operation of %s completed.", h->user_name);
+                (void) manager_schedule_rebalance(h->manager, /* immediately= */ false);
+
+                /* The bookkeeping above is best-effort, the operation as such succeeded */
+                r = 0;
+        }
+
+        h->current_operation = operation_result_unref(h->current_operation, r, &error);
+        home_set_state(h, _HOME_STATE_INVALID);
+}
+
+static void home_locking_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Invoked when the worker that ran for HOME_LOCKING has exited. On success the home moves to
+         * HOME_LOCKED, otherwise the state is reset to "invalid". */
+
+        assert(h);
+        assert(h->state == HOME_LOCKING);
+
+        if (ret >= 0) {
+                log_debug("Locking operation of %s completed.", h->user_name);
+                h->current_operation = operation_result_unref(h->current_operation, 0, NULL);
+                home_set_state(h, HOME_LOCKED);
+                return;
+        }
+
+        (void) convert_worker_errno(h, ret, &error);
+        r = log_error_errno(ret, "Locking operation failed: %m");
+
+        /* It is fine if a specific home has no concept of locking: when we run as part of a
+         * LockAllHomes() operation, do not propagate that particular error. */
+        if (h->current_operation->type == OPERATION_LOCK_ALL && r == -ENOTTY)
+                h->current_operation = operation_result_unref(h->current_operation, 0, NULL);
+        else
+                h->current_operation = operation_result_unref(h->current_operation, r, &error);
+
+        home_set_state(h, _HOME_STATE_INVALID);
+}
+
+static void home_unlocking_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Invoked when the worker that ran for HOME_UNLOCKING/HOME_UNLOCKING_FOR_ACQUIRE has exited.
+         * On failure the home goes back to HOME_LOCKED and the error is reported to the pending
+         * operation. On success the good authentication counter is bumped, the record is saved (both
+         * best-effort) and success is reported. 'hr' is not used here. */
+
+        assert(h);
+        assert(IN_SET(h->state, HOME_UNLOCKING, HOME_UNLOCKING_FOR_ACQUIRE));
+
+        if (ret < 0) {
+                if (ret == -ENOKEY)
+                        home_count_bad_authentication(h, true);
+
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Unlocking operation failed: %m");
+
+                /* Revert to locked state */
+                home_set_state(h, HOME_LOCKED);
+                h->current_operation = operation_result_unref(h->current_operation, r, &error);
+                return;
+        }
+
+        r = user_record_good_authentication(h->record);
+        if (r < 0)
+                log_warning_errno(r, "Failed to increase good authentication counter, ignoring: %m");
+        else {
+                r = home_save_record(h);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to write home record to disk, ignoring: %m");
+        }
+
+        log_debug("Unlocking operation of %s completed.", h->user_name);
+
+        /* The unlock itself succeeded. The bookkeeping failures above are explicitly ignored, hence
+         * they must not leak into the operation result: report success, like the other completion
+         * handlers do. */
+        h->current_operation = operation_result_unref(h->current_operation, 0, NULL);
+        home_set_state(h, _HOME_STATE_INVALID);
+}
+
+static void home_authenticating_finish(Home *h, int ret, UserRecord *hr) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Invoked when the worker that ran for one of the HOME_AUTHENTICATING* states has exited.
+         * Takes over the record the worker returned (if any) and passes the outcome on to the pending
+         * operation. */
+
+        assert(h);
+        assert(IN_SET(h->state, HOME_AUTHENTICATING, HOME_AUTHENTICATING_WHILE_ACTIVE, HOME_AUTHENTICATING_FOR_ACQUIRE));
+
+        if (ret < 0) {
+                if (ret == -ENOKEY)
+                        home_count_bad_authentication(h, true);
+
+                (void) convert_worker_errno(h, ret, &error);
+                r = log_error_errno(ret, "Authentication failed: %m");
+        } else {
+                if (hr) {
+                        r = home_set_record(h, hr);
+                        if (r >= 0) {
+                                r = user_record_good_authentication(h->record);
+                                if (r < 0)
+                                        log_warning_errno(r, "Failed to increase good authentication counter, ignoring: %m");
+
+                                r = home_save_record(h);
+                                if (r < 0)
+                                        log_warning_errno(r, "Failed to write home record to disk, ignoring: %m");
+                        } else
+                                log_warning_errno(r, "Failed to update home record, ignoring: %m");
+                }
+
+                log_debug("Authentication of %s completed.", h->user_name);
+
+                /* The bookkeeping above is best-effort, the authentication as such succeeded */
+                r = 0;
+        }
+
+        h->current_operation = operation_result_unref(h->current_operation, r, &error);
+        home_set_state(h, _HOME_STATE_INVALID);
+}
+
+static int home_on_worker_process(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        Home *h = ASSERT_PTR(userdata);
+        int ret;
+
+        /* Child event callback registered by home_start_work(): the worker process of this home has
+         * exited. Works out the worker result, then dispatches to the completion handler that matches
+         * the state the home is currently in. Always returns 0. */
+
+        assert(s);
+        assert(si);
+
+        assert(h->worker_pid == si->si_pid);
+        assert(h->worker_event_source);
+        assert(h->worker_stdout_fd >= 0);
+
+        /* Unregister the worker before doing anything else */
+        (void) hashmap_remove_value(h->manager->homes_by_worker_pid, PID_TO_PTR(h->worker_pid), h);
+
+        h->worker_pid = 0;
+        h->worker_event_source = sd_event_source_disable_unref(h->worker_event_source);
+
+        if (si->si_code == CLD_EXITED && si->si_status == EXIT_SUCCESS)
+                /* Clean exit: whatever the worker wrote to its stdout is our result */
+                ret = home_parse_worker_stdout(TAKE_FD(h->worker_stdout_fd), &hr);
+        else if (si->si_code == CLD_EXITED) {
+                /* Prefer the error code received via sd_notify(), if there is one */
+                if (h->worker_error_code != 0)
+                        ret = log_debug_errno(h->worker_error_code, "Worker reported error code %s.", errno_to_name(h->worker_error_code));
+                else
+                        ret = log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Worker exited with exit code %i.", si->si_status);
+        } else {
+                assert(IN_SET(si->si_code, CLD_KILLED, CLD_DUMPED));
+                ret = log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Worker process died abnormally with signal %s.", signal_to_string(si->si_status));
+        }
+
+        /* Covers the paths above that did not hand the fd over to the parser */
+        h->worker_stdout_fd = safe_close(h->worker_stdout_fd);
+
+        switch (h->state) {
+
+        case HOME_FIXATING:
+        case HOME_FIXATING_FOR_ACTIVATION:
+        case HOME_FIXATING_FOR_ACQUIRE:
+                home_fixate_finish(h, ret, hr);
+                break;
+
+        case HOME_ACTIVATING:
+        case HOME_ACTIVATING_FOR_ACQUIRE:
+                home_activate_finish(h, ret, hr);
+                break;
+
+        case HOME_DEACTIVATING:
+                home_deactivate_finish(h, ret, hr);
+                break;
+
+        case HOME_LOCKING:
+                home_locking_finish(h, ret, hr);
+                break;
+
+        case HOME_UNLOCKING:
+        case HOME_UNLOCKING_FOR_ACQUIRE:
+                home_unlocking_finish(h, ret, hr);
+                break;
+
+        case HOME_CREATING:
+                home_create_finish(h, ret, hr);
+                break;
+
+        case HOME_REMOVING:
+                home_remove_finish(h, ret, hr);
+                break;
+
+        case HOME_UPDATING:
+        case HOME_UPDATING_WHILE_ACTIVE:
+        case HOME_RESIZING:
+        case HOME_RESIZING_WHILE_ACTIVE:
+        case HOME_PASSWD:
+        case HOME_PASSWD_WHILE_ACTIVE:
+                home_change_finish(h, ret, hr);
+                break;
+
+        case HOME_AUTHENTICATING:
+        case HOME_AUTHENTICATING_WHILE_ACTIVE:
+        case HOME_AUTHENTICATING_FOR_ACQUIRE:
+                home_authenticating_finish(h, ret, hr);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+static int home_start_work(Home *h, const char *verb, UserRecord *hr, UserRecord *secret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(erase_and_freep) char *formatted = NULL;
+        _cleanup_close_ int stdin_fd = -EBADF, stdout_fd = -EBADF;
+        pid_t pid = 0;
+        int r;
+
+        /* Forks off a worker process that executes 'verb' for this home. The JSON of 'hr', with the
+         * "secret" field of 'secret' merged in (if 'secret' is given), is passed to the worker on
+         * stdin; its stdout goes to a memfd that is picked up again in home_on_worker_process() once
+         * the worker has exited.
+         *
+         * Returns 0 on success, -EBUSY if a worker is already running for this home, -ENOKEY if
+         * 'secret' carries no "secret" field, or the error of whichever step failed. */
+
+        assert(h);
+        assert(verb);
+        assert(hr);
+
+        if (h->worker_pid != 0)
+                return -EBUSY;
+
+        assert(h->worker_stdout_fd < 0);
+        assert(!h->worker_event_source);
+
+        v = json_variant_ref(hr->json);
+
+        if (secret) {
+                JsonVariant *sub = NULL;
+
+                sub = json_variant_by_key(secret->json, "secret");
+                if (!sub)
+                        return -ENOKEY;
+
+                r = json_variant_set_field(&v, "secret", sub);
+                if (r < 0)
+                        return r;
+        }
+
+        r = json_variant_format(v, 0, &formatted);
+        if (r < 0)
+                return r;
+
+        stdin_fd = acquire_data_fd(formatted, strlen(formatted), 0);
+        if (stdin_fd < 0)
+                return stdin_fd;
+
+        /* NOTE(review): 'formatted' includes the "secret" field merged in above, hence this debug
+         * message can contain credentials. */
+        log_debug("Sending to worker: %s", formatted);
+
+        stdout_fd = memfd_create_wrapper("homework-stdout", MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+        if (stdout_fd < 0)
+                return stdout_fd;
+
+        r = safe_fork_full("(sd-homework)",
+                           (int[]) { stdin_fd, stdout_fd, STDERR_FILENO },
+                           NULL, 0,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG|FORK_REOPEN_LOG, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                _cleanup_free_ char *joined = NULL;
+                const char *homework, *suffix, *unix_path;
+
+                /* Child. Every failure in here has to end in _exit(): returning would make the child
+                 * continue in the caller's code path. */
+
+                suffix = getenv("SYSTEMD_HOME_DEBUG_SUFFIX");
+                if (suffix) {
+                        joined = strjoin("/run/systemd/home/notify.", suffix);
+                        if (!joined) {
+                                (void) log_oom();
+                                _exit(EXIT_FAILURE);
+                        }
+                        unix_path = joined;
+                } else
+                        unix_path = "/run/systemd/home/notify";
+
+                if (setenv("NOTIFY_SOCKET", unix_path, 1) < 0) {
+                        log_error_errno(errno, "Failed to set $NOTIFY_SOCKET: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                /* If we haven't locked the device yet, ask for a lock to be taken and be passed back to us via sd_notify(). */
+                if (setenv("SYSTEMD_LUKS_LOCK", one_zero(h->luks_lock_fd < 0), 1) < 0) {
+                        log_error_errno(errno, "Failed to set $SYSTEMD_LUKS_LOCK: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (h->manager->default_storage >= 0)
+                        if (setenv("SYSTEMD_HOME_DEFAULT_STORAGE", user_storage_to_string(h->manager->default_storage), 1) < 0) {
+                                log_error_errno(errno, "Failed to set $SYSTEMD_HOME_DEFAULT_STORAGE: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                if (h->manager->default_file_system_type)
+                        if (setenv("SYSTEMD_HOME_DEFAULT_FILE_SYSTEM_TYPE", h->manager->default_file_system_type, 1) < 0) {
+                                log_error_errno(errno, "Failed to set $SYSTEMD_HOME_DEFAULT_FILE_SYSTEM_TYPE: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                r = setenv_systemd_exec_pid(true);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to update $SYSTEMD_EXEC_PID, ignoring: %m");
+
+                /* Allow overriding the homework path via an environment variable, to make debugging
+                 * easier. */
+                homework = getenv("SYSTEMD_HOMEWORK_PATH") ?: SYSTEMD_HOMEWORK_PATH;
+
+                execl(homework, homework, verb, NULL);
+                log_error_errno(errno, "Failed to invoke %s: %m", homework);
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Parent. NOTE(review): the worker is already running at this point; if one of the two
+         * registration steps below fails we return without tracking or terminating it. */
+
+        r = sd_event_add_child(h->manager->event, &h->worker_event_source, pid, WEXITED, home_on_worker_process, h);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(h->worker_event_source, "worker");
+
+        r = hashmap_put(h->manager->homes_by_worker_pid, PID_TO_PTR(pid), h);
+        if (r < 0) {
+                h->worker_event_source = sd_event_source_disable_unref(h->worker_event_source);
+                return r;
+        }
+
+        /* Only now hand the stdout memfd over to the Home object; on all earlier exits it is closed
+         * by the _cleanup_close_ handler. */
+        h->worker_stdout_fd = TAKE_FD(stdout_fd);
+        h->worker_pid = pid;
+        h->worker_error_code = 0;
+
+        return 0;
+}
+
+static int home_ratelimit(Home *h, sd_bus_error *error) {
+        usec_t next_try, current;
+        int within_limit, r;
+
+        /* Applies the authentication rate limit stored in the home's record. Returns 0 if the attempt
+         * may proceed. If the limit is hit 'error' is set to BUS_ERROR_AUTHENTICATION_LIMIT_HIT and
+         * the result of setting it is returned. Failures of the rate limit check itself are returned
+         * as is. */
+
+        assert(h);
+
+        within_limit = user_record_ratelimit(h->record);
+        if (within_limit < 0)
+                return within_limit;
+
+        /* Write the updated record out, except for unfixated homes */
+        if (h->state != HOME_UNFIXATED) {
+                r = home_save_record(h);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to save updated record, ignoring: %m");
+        }
+
+        if (within_limit != 0)
+                return 0;
+
+        current = now(CLOCK_REALTIME);
+        next_try = user_record_ratelimit_next_try(h->record);
+
+        /* Tell the user how long to wait, if we know */
+        if (next_try != USEC_INFINITY && next_try > current)
+                return sd_bus_error_setf(error, BUS_ERROR_AUTHENTICATION_LIMIT_HIT,
+                                         "Too many login attempts, please try again in %s!",
+                                         FORMAT_TIMESPAN(next_try - current, USEC_PER_SEC));
+
+        return sd_bus_error_set(error, BUS_ERROR_AUTHENTICATION_LIMIT_HIT, "Too many login attempts, please try again later.");
+}
+
+static int home_fixate_internal(
+                Home *h,
+                UserRecord *secret,
+                HomeState for_state,
+                sd_bus_error *error) {
+
+        bool activation_follows;
+        int r;
+
+        /* Starts an "inspect" worker for the home and puts the home into 'for_state', which must be
+         * one of the HOME_FIXATING* states. 'error' is not touched here. */
+
+        assert(h);
+        assert(IN_SET(for_state, HOME_FIXATING, HOME_FIXATING_FOR_ACTIVATION, HOME_FIXATING_FOR_ACQUIRE));
+
+        activation_follows = IN_SET(for_state, HOME_FIXATING_FOR_ACTIVATION, HOME_FIXATING_FOR_ACQUIRE);
+
+        r = home_start_work(h, "inspect", h->record, secret);
+        if (r < 0)
+                return r;
+
+        if (activation_follows) {
+                /* Stash the secret, the activation that follows the fixation needs it again */
+                user_record_unref(h->secret);
+                h->secret = user_record_ref(secret);
+        }
+
+        home_set_state(h, for_state);
+        return 0;
+}
+
+int home_fixate(Home *h, UserRecord *secret, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        /* Starts fixation of the home. This is only possible from HOME_UNFIXATED state; in all other
+         * states a matching bus error is returned. */
+
+        assert(h);
+
+        state = home_get_state(h);
+
+        if (state == HOME_ABSENT)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", h->user_name);
+
+        if (IN_SET(state, HOME_INACTIVE, HOME_DIRTY, HOME_ACTIVE, HOME_LINGERING, HOME_LOCKED))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ALREADY_FIXATED, "Home %s is already fixated.", h->user_name);
+
+        /* Every remaining state other than "unfixated" means an operation is in progress */
+        if (state != HOME_UNFIXATED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        return home_fixate_internal(h, secret, HOME_FIXATING, error);
+}
+
+static int home_activate_internal(Home *h, UserRecord *secret, HomeState for_state, sd_bus_error *error) {
+        int r;
+
+        /* Starts an "activate" worker for the home and, if that worked, puts the home into
+         * 'for_state'. 'error' is not touched here. */
+
+        assert(h);
+        assert(IN_SET(for_state, HOME_ACTIVATING, HOME_ACTIVATING_FOR_ACQUIRE));
+
+        r = home_start_work(h, "activate", h->record, secret);
+        if (r >= 0) {
+                home_set_state(h, for_state);
+                return 0;
+        }
+
+        return r;
+}
+
+int home_activate(Home *h, UserRecord *secret, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        /* Starts activation of the home. Unfixated homes are fixated first (with activation following
+         * automatically), inactive and dirty ones are activated directly. All other states either
+         * yield a bus error or, for lingering homes, merely cancel the pending deactivation. */
+
+        assert(h);
+
+        state = home_get_state(h);
+        switch (state) {
+
+        case HOME_INACTIVE:
+        case HOME_DIRTY:
+                /* These are the states we can activate from directly, handled below */
+                break;
+
+        case HOME_UNFIXATED:
+                return home_fixate_internal(h, secret, HOME_FIXATING_FOR_ACTIVATION, error);
+
+        case HOME_LINGERING:
+                /* A lingering home is active but scheduled for deactivation. Since the user now
+                 * explicitly asks for it to be active, cancel that timer. */
+                h->retry_deactivate_event_source = sd_event_source_disable_unref(h->retry_deactivate_event_source);
+                return 0;
+
+        case HOME_ABSENT:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", h->user_name);
+
+        case HOME_ACTIVE:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ALREADY_ACTIVE, "Home %s is already active.", h->user_name);
+
+        case HOME_LOCKED:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+
+        default:
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+        }
+
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        return home_activate_internal(h, secret, HOME_ACTIVATING, error);
+}
+
+static int home_authenticate_internal(Home *h, UserRecord *secret, HomeState for_state, sd_bus_error *error) {
+        int r;
+
+        /* Starts the "inspect" operation with the supplied secret, and enters 'for_state' if that worked. */
+        assert(h);
+        assert(IN_SET(for_state, HOME_AUTHENTICATING, HOME_AUTHENTICATING_WHILE_ACTIVE, HOME_AUTHENTICATING_FOR_ACQUIRE));
+
+        r = home_start_work(h, "inspect", h->record, secret);
+        if (r >= 0)
+                home_set_state(h, for_state);
+
+        return r < 0 ? r : 0;
+}
+
+int home_authenticate(Home *h, UserRecord *secret, sd_bus_error *error) {
+        HomeState state, for_state;
+        int r;
+
+        assert(h);
+
+        /* Authentication is refused for absent and locked homes, and while another operation is being
+         * executed. In all remaining states it is started, subject to the rate limit. */
+        state = home_get_state(h);
+        if (state == HOME_ABSENT)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", h->user_name);
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+        if (!IN_SET(state, HOME_UNFIXATED, HOME_INACTIVE, HOME_DIRTY, HOME_ACTIVE, HOME_LINGERING))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        /* A separate state is used if the home is active while we authenticate */
+        if (HOME_STATE_IS_ACTIVE(state))
+                for_state = HOME_AUTHENTICATING_WHILE_ACTIVE;
+        else
+                for_state = HOME_AUTHENTICATING;
+
+        return home_authenticate_internal(h, secret, for_state, error);
+}
+
+static int home_deactivate_internal(Home *h, bool force, sd_bus_error *error) {
+        const char *verb = force ? "deactivate-force" : "deactivate";
+        int r;
+
+        assert(h);
+
+        /* Release our pin first, so that deactivation is possible at all */
+        home_unpin(h);
+
+        r = home_start_work(h, verb, h->record, NULL);
+        if (r >= 0) {
+                home_set_state(h, HOME_DEACTIVATING);
+                r = 0;
+        } else
+                /* The operation failed before it even started: reacquire the pin fd, if the state still dictates so */
+                home_update_pin_fd(h, _HOME_STATE_INVALID);
+
+        /* Whatever the outcome, start the timer that retries deactivation later. It is stopped again once
+         * we manage to deactivate the home directory, or once any other operation is started. */
+        home_start_retry_deactivate(h);
+        return r;
+}
+
+int home_deactivate(Home *h, bool force, sd_bus_error *error) {
+        HomeState state;
+
+        assert(h);
+
+        /* Only homes that are active or lingering can be deactivated; everything else is refused with
+         * an error that matches the state found. */
+        state = home_get_state(h);
+
+        if (IN_SET(state, HOME_UNFIXATED, HOME_ABSENT, HOME_INACTIVE, HOME_DIRTY))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_NOT_ACTIVE, "Home %s not active.", h->user_name);
+
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+
+        if (!IN_SET(state, HOME_ACTIVE, HOME_LINGERING))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        return home_deactivate_internal(h, force, error);
+}
+
+int home_create(Home *h, UserRecord *secret, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* A home can be created if it is absent, or if it is inactive and we cannot tell for sure that
+         * its image already exists. */
+        state = home_get_state(h);
+        if (state == HOME_INACTIVE) {
+
+                /* If no storage is defined we don't know what precisely to look for, hence HOME_INACTIVE is
+                 * OK in that case. Otherwise, let the image path test decide. */
+                if (h->record->storage >= 0) {
+                        int t;
+
+                        t = user_record_test_image_path(h->record);
+
+                        /* If the image path test isn't conclusive, let's go on, too */
+                        if (!IN_SET(t, USER_TEST_MAYBE, USER_TEST_UNDEFINED)) {
+                                if (IN_SET(t, -EBADF, -ENOTDIR))
+                                        return sd_bus_error_setf(error, BUS_ERROR_HOME_EXISTS, "Selected home image of user %s already exists or has wrong inode type.", h->user_name);
+
+                                return sd_bus_error_setf(error, BUS_ERROR_HOME_EXISTS, "Selected home image of user %s already exists.", h->user_name);
+                        }
+                }
+
+        } else if (IN_SET(state, HOME_UNFIXATED, HOME_DIRTY))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_EXISTS, "Home of user %s already exists.", h->user_name);
+        else if (state != HOME_ABSENT)
+                /* Active, lingering, locked, or in the middle of some operation */
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "Home %s is currently being used, or an operation on home %s is currently being executed.", h->user_name, h->user_name);
+
+        /* Validate the supplied password, unless that is turned off for the account */
+        if (h->record->enforce_password_policy != false) {
+                r = user_record_check_password_quality(h->record, secret, error);
+                if (r < 0)
+                        return r;
+        } else
+                log_debug("Password quality check turned off for account, skipping.");
+
+        /* Everything checks out, start the actual operation */
+        r = home_start_work(h, "create", h->record, secret);
+        if (r < 0)
+                return r;
+
+        home_set_state(h, HOME_CREATING);
+        return 0;
+}
+
+int home_remove(Home *h, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* The current state determines whether removal is permitted, and what it means */
+        state = home_get_state(h);
+
+        /* If the home directory is absent, then this is just like unregistering */
+        if (state == HOME_ABSENT)
+                return home_unregister(h, error);
+
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+
+        /* Removal is limited to unfixated, inactive and dirty homes. Active or lingering ones are in use,
+         * and in any other state an operation is currently being executed. */
+        if (!IN_SET(state, HOME_UNFIXATED, HOME_INACTIVE, HOME_DIRTY))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "Home %s is currently being used, or an operation on home %s is currently being executed.", h->user_name, h->user_name);
+
+        /* Start the "remove" operation, and track it via the state */
+        r = home_start_work(h, "remove", h->record, NULL);
+        if (r < 0)
+                return r;
+
+        home_set_state(h, HOME_REMOVING);
+        return 0;
+}
+
+static int user_record_extend_with_binding(UserRecord *hr, UserRecord *with_binding, UserRecordLoadFlags flags, UserRecord **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *merged = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *extended = NULL;
+        JsonVariant *b;
+        int r;
+
+        assert(hr);
+        assert(with_binding);
+        assert(ret);
+
+        /* Combines the JSON data of 'hr' with the "binding" field of 'with_binding' (if any) into a new record */
+        assert_se(merged = json_variant_ref(hr->json));
+
+        b = json_variant_by_key(with_binding->json, "binding");
+        if (b) {
+                r = json_variant_set_field(&merged, "binding", b);
+                if (r < 0)
+                        return r;
+        }
+
+        extended = user_record_new();
+        if (!extended)
+                return -ENOMEM;
+
+        r = user_record_load(extended, merged, flags);
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(extended);
+        return 0;
+}
+
+static int home_update_internal(
+                Home *h,
+                const char *verb,
+                UserRecord *hr,
+                UserRecord *secret,
+                sd_bus_error *error) {
+
+        _cleanup_(user_record_unrefp) UserRecord *extended = NULL, *extracted_secret = NULL, *signed_hr = NULL;
+        int r, cmp;
+
+        assert(h);
+        assert(verb);
+        assert(hr);
+
+        /* Common backend of home_update(), home_resize() and home_passwd(): validates the new record 'hr'
+         * against the current one, signs it if necessary, carries the existing binding over, and then
+         * starts the operation 'verb' with the result. Entering the new state is left to the caller. */
+
+        if (!user_record_compatible(hr, h->record))
+                return sd_bus_error_set(error, BUS_ERROR_HOME_RECORD_MISMATCH, "Updated user record is not compatible with existing one.");
+
+        /* Refuse downgrades */
+        cmp = user_record_compare_last_change(hr, h->record);
+        if (cmp < 0)
+                return sd_bus_error_set(error, BUS_ERROR_HOME_RECORD_DOWNGRADE, "Refusing to update to older home record.");
+
+        /* If the caller passed no secret, but the record's mask has the secret section set, extract the
+         * secret from the record itself */
+        if (!secret && FLAGS_SET(hr->mask, USER_RECORD_SECRET)) {
+                r = user_record_clone(hr, USER_RECORD_EXTRACT_SECRET|USER_RECORD_PERMISSIVE, &extracted_secret);
+                if (r < 0)
+                        return r;
+
+                secret = extracted_secret;
+        }
+
+        r = manager_verify_user_record(h->manager, hr);
+        if (r == USER_RECORD_UNSIGNED) {
+                /* If the existing record is not owned by us, don't accept an unsigned new record, i.e. only
+                 * implicitly sign new records that were previously signed by us too. */
+                if (h->signed_locally <= 0)
+                        return sd_bus_error_setf(error, BUS_ERROR_HOME_RECORD_SIGNED, "Home %s is signed and cannot be modified locally.", h->user_name);
+
+                /* The updated record is not signed, hence do so now */
+                r = manager_sign_user_record(h->manager, hr, &signed_hr, error);
+                if (r < 0)
+                        return r;
+
+                hr = signed_hr;
+
+        } else if (!IN_SET(r, USER_RECORD_SIGNED_EXCLUSIVE, USER_RECORD_SIGNED, USER_RECORD_FOREIGN))
+                /* Neither unsigned nor signed in one of the known ways, i.e. -ENOKEY or something else */
+                return r;
+
+        /* At this point the record has been signed, one way or another. Great! */
+
+        r = user_record_extend_with_binding(hr, h->record, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE, &extended);
+        if (r < 0)
+                return r;
+
+        if (cmp == 0) {
+                /* Different payload but same lastChangeUSec field? That's not cool! */
+
+                r = user_record_masked_equal(extended, h->record, USER_RECORD_REGULAR|USER_RECORD_PRIVILEGED|USER_RECORD_PER_MACHINE);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return sd_bus_error_set(error, BUS_ERROR_HOME_RECORD_MISMATCH, "Home record different but timestamp remained the same, refusing.");
+        }
+
+        r = home_start_work(h, verb, extended, secret);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+int home_update(Home *h, UserRecord *hr, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        assert(h);
+        assert(hr);
+
+        /* Updating requires a fixated, present and unlocked home on which no other operation is
+         * currently being executed. */
+        state = home_get_state(h);
+        if (state == HOME_UNFIXATED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_UNFIXATED, "Home %s has not been fixated yet.", h->user_name);
+        if (state == HOME_ABSENT)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", h->user_name);
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+        if (!IN_SET(state, HOME_INACTIVE, HOME_DIRTY, HOME_ACTIVE, HOME_LINGERING))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        r = home_update_internal(h, "update", hr, NULL, error);
+        if (r < 0)
+                return r;
+
+        /* The operation is running now; which state we enter depends on whether the home was active */
+        if (HOME_STATE_IS_ACTIVE(state))
+                home_set_state(h, HOME_UPDATING_WHILE_ACTIVE);
+        else
+                home_set_state(h, HOME_UPDATING);
+
+        return 0;
+}
+
+int home_resize(Home *h,
+                uint64_t disk_size,
+                UserRecord *secret,
+                bool automatic,
+                sd_bus_error *error) {
+
+        _cleanup_(user_record_unrefp) UserRecord *target = NULL;
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* Resizing requires a fixated, present and unlocked home on which no other operation is
+         * currently being executed. */
+        state = home_get_state(h);
+        if (state == HOME_UNFIXATED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_UNFIXATED, "Home %s has not been fixated yet.", h->user_name);
+        if (state == HOME_ABSENT)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", h->user_name);
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+        if (!IN_SET(state, HOME_INACTIVE, HOME_DIRTY, HOME_ACTIVE, HOME_LINGERING))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        /* If the user didn't specify any size explicitly and rebalancing is on, then the disk size is
+         * determined by automatic rebalancing and hence not user configured but determined by us and thus
+         * applied anyway. */
+        if (disk_size == UINT64_MAX && h->record->rebalance_weight != REBALANCE_WEIGHT_OFF)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Disk size is being determined by automatic disk space rebalancing.");
+
+        if (disk_size != UINT64_MAX && disk_size != h->record->disk_size) {
+                _cleanup_(user_record_unrefp) UserRecord *signed_target = NULL;
+
+                /* A new size was requested, hence prepare and sign a modified copy of the record. Don't
+                 * allow changing of records not signed only by us, though. */
+                if (h->signed_locally <= 0)
+                        return sd_bus_error_setf(error, BUS_ERROR_HOME_RECORD_SIGNED, "Home %s is signed and cannot be modified locally.", h->user_name);
+
+                r = user_record_clone(h->record, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE, &target);
+                if (r < 0)
+                        return r;
+
+                r = user_record_set_disk_size(target, disk_size);
+                if (r == -ERANGE)
+                        return sd_bus_error_setf(error, BUS_ERROR_BAD_HOME_SIZE, "Requested size for home %s out of acceptable range.", h->user_name);
+                if (r < 0)
+                        return r;
+
+                /* If user picked an explicit size, then turn off rebalancing, so that we don't undo what user chose */
+                r = user_record_set_rebalance_weight(target, REBALANCE_WEIGHT_OFF);
+                if (r < 0)
+                        return r;
+
+                r = user_record_update_last_changed(target, false);
+                if (r == -ECHRNG)
+                        return sd_bus_error_setf(error, BUS_ERROR_HOME_RECORD_MISMATCH, "Record last change time of %s is newer than current time, cannot update.", h->user_name);
+                if (r < 0)
+                        return r;
+
+                r = manager_sign_user_record(h->manager, target, &signed_target, error);
+                if (r < 0)
+                        return r;
+
+                /* From here on, continue with the signed version of the record */
+                user_record_unref(target);
+                target = TAKE_PTR(signed_target);
+        } else {
+                /* Shortcut if size is unspecified or matches the record: reuse the record as it is,
+                 * provided that it carries a size at all */
+                if (h->record->disk_size == UINT64_MAX)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "No disk size to resize to specified.");
+
+                target = user_record_ref(h->record);
+        }
+
+        r = home_update_internal(h, automatic ? "resize-auto" : "resize", target, secret, error);
+        if (r < 0)
+                return r;
+
+        home_set_state(h, HOME_STATE_IS_ACTIVE(state) ? HOME_RESIZING_WHILE_ACTIVE : HOME_RESIZING);
+        return 0;
+}
+
+static int home_may_change_password(
+                Home *h,
+                sd_bus_error *error) {
+
+        int r;
+
+        assert(h);
+
+        /* Returns 0 if the account's expiration settings permit a password change, an error otherwise */
+        r = user_record_test_password_change_required(h->record);
+        if (r >= 0)
+                return 0; /* not expired */
+        if (IN_SET(r, -EKEYREVOKED, -EOWNERDEAD, -EKEYEXPIRED, -ESTALE))
+                return 0; /* expired in some form, but changing is allowed */
+        if (IN_SET(r, -EKEYREJECTED, -EROFS))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Expiration settings of account %s do not allow changing of password.", h->user_name);
+        return log_error_errno(r, "Failed to test password expiry: %m");
+}
+
+int home_passwd(Home *h,
+                UserRecord *new_secret,
+                UserRecord *old_secret,
+                sd_bus_error *error) {
+
+        _cleanup_(user_record_unrefp) UserRecord *updated = NULL, *both_secrets = NULL, *signed_updated = NULL;
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* Changes the password of the home: prepares an updated, locally signed copy of the record and
+         * passes it, along with the old and the new secret, to the "passwd" operation. */
+
+        /* Don't allow changing of records not signed only by us */
+        if (h->signed_locally <= 0)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_RECORD_SIGNED, "Home %s is signed and cannot be modified locally.", h->user_name);
+
+        state = home_get_state(h);
+        if (state == HOME_UNFIXATED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_UNFIXATED, "Home %s has not been fixated yet.", h->user_name);
+        if (state == HOME_ABSENT)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_ABSENT, "Home %s is currently missing or not plugged in.", h->user_name);
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+        if (!IN_SET(state, HOME_INACTIVE, HOME_DIRTY, HOME_ACTIVE, HOME_LINGERING))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        r = home_may_change_password(h, error);
+        if (r < 0)
+                return r;
+
+        /* Work on a copy of the current record */
+        r = user_record_clone(h->record, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE, &updated);
+        if (r < 0)
+                return r;
+
+        /* Collect the old and the new secret in a single record */
+        both_secrets = user_record_new();
+        if (!both_secrets)
+                return -ENOMEM;
+
+        r = user_record_merge_secret(both_secrets, old_secret);
+        if (r < 0)
+                return r;
+
+        r = user_record_merge_secret(both_secrets, new_secret);
+        if (r < 0)
+                return r;
+
+        /* Update the password only if one is specified, otherwise let's just reuse the old password data.
+         * This is useful as a way to propagate updated user records into the LUKS backends properly. */
+        if (!strv_isempty(new_secret->password)) {
+                r = user_record_make_hashed_password(updated, new_secret->password, /* extend = */ false);
+                if (r < 0)
+                        return r;
+
+                r = user_record_set_password_change_now(updated, -1 /* remove */);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Refresh the last change information of the copy */
+        r = user_record_update_last_changed(updated, true);
+        if (r == -ECHRNG)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_RECORD_MISMATCH, "Record last change time of %s is newer than current time, cannot update.", h->user_name);
+        if (r < 0)
+                return r;
+
+        /* Sign the updated copy */
+        r = manager_sign_user_record(h->manager, updated, &signed_updated, error);
+        if (r < 0)
+                return r;
+
+        /* Validate the supplied passwords, unless that is turned off for the account */
+        if (updated->enforce_password_policy != false) {
+                r = user_record_check_password_quality(updated, both_secrets, error);
+                if (r < 0)
+                        return r;
+        } else
+                log_debug("Password quality check turned off for account, skipping.");
+
+        /* Hand the signed record and the merged secrets to the common update logic */
+        r = home_update_internal(h, "passwd", signed_updated, both_secrets, error);
+        if (r < 0)
+                return r;
+
+        home_set_state(h, HOME_STATE_IS_ACTIVE(state) ? HOME_PASSWD_WHILE_ACTIVE : HOME_PASSWD);
+        return 0;
+}
+
+int home_unregister(Home *h, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* Unregistering is refused for unfixated and locked homes, and for homes that are in use or
+         * have an operation being executed. */
+        state = home_get_state(h);
+        if (state == HOME_UNFIXATED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_UNFIXATED, "Home %s is not registered.", h->user_name);
+
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+
+        if (!IN_SET(state, HOME_ABSENT, HOME_INACTIVE, HOME_DIRTY))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "Home %s is currently being used, or an operation on home %s is currently being executed.", h->user_name, h->user_name);
+
+        /* First, unlink the record of the home */
+        r = home_unlink_record(h);
+        if (r < 0)
+                return r;
+
+        /* And destroy the whole entry. The caller needs to be prepared for that (this is the only case
+         * in which we return 1). */
+        h = home_free(h);
+        return 1;
+}
+
+int home_lock(Home *h, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* Locking is only permitted for homes that are active or lingering. For all other states an
+         * error matching the state is returned. */
+        state = home_get_state(h);
+
+        if (IN_SET(state, HOME_UNFIXATED, HOME_ABSENT, HOME_INACTIVE, HOME_DIRTY))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_NOT_ACTIVE, "Home %s is not active.", h->user_name);
+
+        if (state == HOME_LOCKED)
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_LOCKED, "Home %s is already locked.", h->user_name);
+
+        if (!IN_SET(state, HOME_ACTIVE, HOME_LINGERING))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        /* Start the "lock" operation, and remember that it is in progress */
+        r = home_start_work(h, "lock", h->record, NULL);
+        if (r < 0)
+                return r;
+
+        home_set_state(h, HOME_LOCKING);
+        return 0;
+}
+
+static int home_unlock_internal(Home *h, UserRecord *secret, HomeState for_state, sd_bus_error *error) {
+        int r;
+
+        /* Starts the "unlock" operation with the supplied secret, and enters 'for_state' if that worked. */
+        assert(h);
+        assert(for_state == HOME_UNLOCKING || for_state == HOME_UNLOCKING_FOR_ACQUIRE);
+
+        r = home_start_work(h, "unlock", h->record, secret);
+        if (r >= 0)
+                home_set_state(h, for_state);
+
+        return r < 0 ? r : 0;
+}
+
+int home_unlock(Home *h, UserRecord *secret, sd_bus_error *error) {
+        HomeState state;
+        int r;
+
+        assert(h);
+
+        /* Note that the rate limit is applied first here, i.e. before the state of the home is even
+         * looked at. */
+        r = home_ratelimit(h, error);
+        if (r < 0)
+                return r;
+
+        /* Only a locked home can be unlocked */
+        state = home_get_state(h);
+
+        if (IN_SET(state, HOME_UNFIXATED, HOME_ABSENT, HOME_INACTIVE, HOME_ACTIVE, HOME_LINGERING, HOME_DIRTY))
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_NOT_LOCKED, "Home %s is not locked.", h->user_name);
+
+        if (state != HOME_LOCKED)
+                /* Some other state, i.e. an operation is currently being executed */
+                return sd_bus_error_setf(error, BUS_ERROR_HOME_BUSY, "An operation on home %s is currently being executed.", h->user_name);
+
+        return home_unlock_internal(h, secret, HOME_UNLOCKING, error);
+}
+
+HomeState home_get_state(Home *h) {
+        assert(h);
+
+        /* An initialized state field always takes precedence. */
+        if (h->state >= 0)
+                return h->state;
+
+        /* A mounted home directory means active, or lingering if the deactivation retry source is set */
+        if (user_record_test_home_directory(h->record) == USER_TEST_MOUNTED) {
+                if (h->retry_deactivate_event_source)
+                        return HOME_LINGERING;
+                return HOME_ACTIVE;
+        }
+
+        switch (user_record_test_image_path(h->record)) {
+        case USER_TEST_ABSENT: /* the image is gone, report this as absent */
+                return HOME_ABSENT;
+        case USER_TEST_DIRTY:
+                return HOME_DIRTY;
+        default: /* all other cases are reported as "inactive" */
+                return HOME_INACTIVE;
+        }
+}
+
+void home_process_notify(Home *h, char **l, int fd) {
+        _cleanup_close_ int taken_fd = TAKE_FD(fd);
+        const char *e;
+        int error, r;
+
+        assert(h);
+
+        /* Handles a notification message for this home: SYSTEMD_LUKS_LOCK_FD= passes (or drops) the LUKS
+         * lock fd, otherwise ERRNO= is expected, which is stored as the worker's error code. */
+        e = strv_env_get(l, "SYSTEMD_LUKS_LOCK_FD");
+        if (e) {
+                r = parse_boolean(e);
+                if (r < 0)
+                        return (void) log_debug_errno(r, "Failed to parse SYSTEMD_LUKS_LOCK_FD value: %m");
+                if (r > 0) {
+                        /* No errno is associated with these protocol violations, hence no %m in the messages */
+                        if (taken_fd < 0)
+                                return (void) log_debug("Got notify message with SYSTEMD_LUKS_LOCK_FD=1 but no fd passed, ignoring.");
+
+                        close_and_replace(h->luks_lock_fd, taken_fd);
+                        log_debug("Successfully acquired LUKS lock fd from worker.");
+                        /* Immediately check if we actually want to keep it */
+                        home_maybe_close_luks_lock_fd(h, _HOME_STATE_INVALID);
+                } else {
+                        if (taken_fd >= 0)
+                                return (void) log_debug("Got notify message with SYSTEMD_LUKS_LOCK_FD=0 but fd passed, ignoring.");
+
+                        h->luks_lock_fd = safe_close(h->luks_lock_fd);
+                }
+
+                return;
+        }
+
+        e = strv_env_get(l, "ERRNO");
+        if (!e)
+                return (void) log_debug("Got notify message lacking both ERRNO= and SYSTEMD_LUKS_LOCK_FD= field, ignoring.");
+
+        r = safe_atoi(e, &error);
+        if (r < 0)
+                return (void) log_debug_errno(r, "Failed to parse received error number, ignoring: %s", e);
+        if (error <= 0)
+                return (void) log_debug("Error number is out of range: %i", error);
+
+        h->worker_error_code = error;
+}
+
+int home_killall(Home *h) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *unit = NULL;
+        int r;
+
+        assert(h);
+        /* Kills the processes of the home's user. Returns 0 if there is no valid UID, 1 after an attempt was made, negative on failure. */
+        if (!uid_is_valid(h->uid))
+                return 0; /* no valid UID assigned, hence nothing to kill */
+
+        assert(h->uid > 0); /* We never should be UID 0 */
+
+        /* Let's kill everything matching the specified UID, by means of a child process that takes on the user's identity first */
+        r = safe_fork("(sd-killer)",
+                      FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_WAIT|FORK_LOG|FORK_REOPEN_LOG,
+                      NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                gid_t gid;
+
+                /* Child: change GID, reset the auxiliary groups, then change UID, and finally kill. Any failure is logged and ends the child with EXIT_FAILURE. */
+
+                gid = user_record_gid(h->record);
+                if (setresgid(gid, gid, gid) < 0) {
+                        log_error_errno(errno, "Failed to change GID to " GID_FMT ": %m", gid);
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setgroups(0, NULL) < 0) {
+                        log_error_errno(errno, "Failed to reset auxiliary groups list: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setresuid(h->uid, h->uid, h->uid) < 0) {
+                        log_error_errno(errno, "Failed to change UID to " UID_FMT ": %m", h->uid);
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (kill(-1, SIGKILL) < 0) {
+                        log_error_errno(errno, "Failed to kill all processes of UID " UID_FMT ": %m", h->uid);
+                        _exit(EXIT_FAILURE);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Let's also kill everything in the user's slice, via a KillUnit bus call. A failure of that call is only logged. */
+        if (asprintf(&unit, "user-" UID_FMT ".slice", h->uid) < 0)
+                return log_oom();
+
+        r = bus_call_method(h->manager->bus, bus_systemd_mgr, "KillUnit", &error, NULL, "ssi", unit, "all", SIGKILL);
+        if (r < 0)
+                log_full_errno(sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_UNIT) ? LOG_DEBUG : LOG_WARNING,
+                               r, "Failed to kill login processes of user, ignoring: %s", bus_error_message(&error, r));
+
+        return 1; /* a kill attempt was made */
+}
+
+static int home_get_disk_status_luks(
+                Home *h,
+                HomeState state,
+                uint64_t *ret_disk_size,
+                uint64_t *ret_disk_usage,
+                uint64_t *ret_disk_free,
+                uint64_t *ret_disk_ceiling,
+                uint64_t *ret_disk_floor,
+                statfs_f_type_t *ret_fstype,
+                mode_t *ret_access_mode) {
+
+        uint64_t disk_size = UINT64_MAX, disk_usage = UINT64_MAX, disk_free = UINT64_MAX,
+                disk_ceiling = UINT64_MAX, disk_floor = UINT64_MAX,
+                stat_used = UINT64_MAX, fs_size = UINT64_MAX, header_size = 0;
+        mode_t access_mode = MODE_INVALID;
+        statfs_f_type_t fstype = 0;
+        struct statfs sfs;
+        struct stat st;
+        const char *hd;
+        int r;
+
+        assert(h);
+
+        if (state != HOME_ABSENT) {
+                const char *ip;
+
+                ip = user_record_image_path(h->record);
+                if (ip) {
+                        if (stat(ip, &st) < 0)
+                                log_debug_errno(errno, "Failed to stat() %s, ignoring: %m", ip);
+                        else if (S_ISREG(st.st_mode)) {
+                                _cleanup_free_ char *parent = NULL;
+
+                                disk_size = st.st_size;
+                                stat_used = st.st_blocks * 512;
+
+                                r = path_extract_directory(ip, &parent);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to extract parent directory from image path '%s': %m", ip);
+
+                                if (statfs(parent, &sfs) < 0)
+                                        log_debug_errno(errno, "Failed to statfs() %s, ignoring: %m", parent);
+                                else
+                                        disk_ceiling = stat_used + sfs.f_bsize * sfs.f_bavail;
+
+                        } else if (S_ISBLK(st.st_mode)) {
+                                _cleanup_free_ char *szbuf = NULL;
+                                char p[SYS_BLOCK_PATH_MAX("/size")];
+
+                                /* Let's read the size off sysfs, so that we don't have to open the device */
+                                xsprintf_sys_block_path(p, "/size", st.st_rdev);
+                                r = read_one_line_file(p, &szbuf);
+                                if (r < 0)
+                                        log_debug_errno(r, "Failed to read %s, ignoring: %m", p);
+                                else {
+                                        uint64_t sz;
+
+                                        r = safe_atou64(szbuf, &sz);
+                                        if (r < 0)
+                                                log_debug_errno(r, "Failed to parse %s, ignoring: %s", p, szbuf);
+                                        else
+                                                disk_size = sz * 512;
+                                }
+                        } else
+                                log_debug("Image path is not a block device or regular file, not able to acquire size.");
+                }
+        }
+
+        if (!HOME_STATE_IS_ACTIVE(state))
+                goto finish;
+
+        hd = user_record_home_directory(h->record);
+        if (!hd)
+                goto finish;
+
+        if (stat(hd, &st) < 0) {
+                log_debug_errno(errno, "Failed to stat() %s, ignoring: %m", hd);
+                goto finish;
+        }
+
+        r = stat_verify_directory(&st);
+        if (r < 0) {
+                log_debug_errno(r, "Home directory %s is not a directory, ignoring: %m", hd);
+                goto finish;
+        }
+
+        access_mode = st.st_mode & 07777;
+
+        if (statfs(hd, &sfs) < 0) {
+                log_debug_errno(errno, "Failed to statfs() %s, ignoring: %m", hd);
+                goto finish;
+        }
+
+        fstype = sfs.f_type;
+
+        disk_free = sfs.f_bsize * sfs.f_bavail;
+        fs_size = sfs.f_bsize * sfs.f_blocks;
+        if (disk_size != UINT64_MAX && disk_size > fs_size)
+                header_size = disk_size - fs_size;
+
+        /* We take a perspective from the user here (as opposed to from the host): the used disk space is the
+         * difference from the limit and what's free. This makes a difference if sparse mode is not used: in
+         * that case the image is pre-allocated and thus appears all used from the host PoV but is not used
+         * up at all yet from the user's PoV.
+         *
+         * That said, we use the stat() reported loopback file size as upper boundary: our footprint can
+         * never be larger than what we take up on the lowest layers. */
+
+        if (disk_size != UINT64_MAX && disk_size > disk_free) {
+                disk_usage = disk_size - disk_free;
+
+                if (stat_used != UINT64_MAX && disk_usage > stat_used)
+                        disk_usage = stat_used;
+        } else
+                disk_usage = stat_used;
+
+        /* If we have the magic, determine floor preferably by magic */
+        disk_floor = minimal_size_by_fs_magic(sfs.f_type) + header_size;
+
+finish:
+        /* If we don't know the magic, go by file system name */
+        if (disk_floor == UINT64_MAX)
+                disk_floor = minimal_size_by_fs_name(user_record_file_system_type(h->record));
+
+        if (ret_disk_size)
+                *ret_disk_size = disk_size;
+        if (ret_disk_usage)
+                *ret_disk_usage = disk_usage;
+        if (ret_disk_free)
+                *ret_disk_free = disk_free;
+        if (ret_disk_ceiling)
+                *ret_disk_ceiling = disk_ceiling;
+        if (ret_disk_floor)
+                *ret_disk_floor = disk_floor;
+        if (ret_fstype)
+                *ret_fstype = fstype;
+        if (ret_access_mode)
+                *ret_access_mode = access_mode;
+
+        return 0;
+}
+
+static int home_get_disk_status_directory(
+                Home *h,
+                HomeState state,
+                uint64_t *ret_disk_size,
+                uint64_t *ret_disk_usage,
+                uint64_t *ret_disk_free,
+                uint64_t *ret_disk_ceiling,
+                uint64_t *ret_disk_floor,
+                statfs_f_type_t *ret_fstype,
+                mode_t *ret_access_mode) {
+        /* Best-effort: probe failures are only debug-logged, unknown outputs keep their initial values; always returns 0. */
+        uint64_t disk_size = UINT64_MAX, disk_usage = UINT64_MAX, disk_free = UINT64_MAX,
+                disk_ceiling = UINT64_MAX, disk_floor = UINT64_MAX;
+        mode_t access_mode = MODE_INVALID;
+        statfs_f_type_t fstype = 0;
+        struct statfs sfs;
+        struct dqblk req;
+        const char *path = NULL;
+        int r;
+
+        assert(h);
+        /* While the home is active look at the home directory, otherwise (unless absent) at the image path */
+        if (HOME_STATE_IS_ACTIVE(state))
+                path = user_record_home_directory(h->record);
+
+        if (!path) {
+                if (state == HOME_ABSENT)
+                        goto finish;
+
+                path = user_record_image_path(h->record);
+        }
+
+        if (!path)
+                goto finish;
+        /* Start with file system wide numbers; the btrfs/quota data below may refine size and free space */
+        if (statfs(path, &sfs) < 0)
+                log_debug_errno(errno, "Failed to statfs() %s, ignoring: %m", path);
+        else {
+                disk_free = sfs.f_bsize * sfs.f_bavail;
+                disk_size = sfs.f_bsize * sfs.f_blocks;
+
+                /* We don't initialize disk_usage from statfs() data here, since the device is likely not used
+                 * by us alone, and disk_usage should only reflect our own use. */
+
+                fstype = sfs.f_type;
+        }
+        /* If the path is a btrfs subvolume, take usage (and a limit, if one is set) from its subtree quota */
+        if (IN_SET(h->record->storage, USER_CLASSIC, USER_DIRECTORY, USER_SUBVOLUME)) {
+
+                r = btrfs_is_subvol(path);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to determine whether %s is a btrfs subvolume: %m", path);
+                else if (r > 0) {
+                        BtrfsQuotaInfo qi;
+
+                        r = btrfs_subvol_get_subtree_quota(path, 0, &qi);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to query btrfs subtree quota, ignoring: %m");
+                        else {
+                                disk_usage = qi.referenced;
+                                /* Ceiling: what we use now plus what is free, but never more than the file system size */
+                                if (disk_free != UINT64_MAX) {
+                                        disk_ceiling = qi.referenced + disk_free;
+
+                                        if (disk_size != UINT64_MAX && disk_ceiling > disk_size)
+                                                disk_ceiling = disk_size;
+                                }
+                                /* referenced_max == UINT64_MAX is treated as "no limit"; otherwise it caps the size */
+                                if (qi.referenced_max != UINT64_MAX) {
+                                        if (disk_size != UINT64_MAX)
+                                                disk_size = MIN(qi.referenced_max, disk_size);
+                                        else
+                                                disk_size = qi.referenced_max;
+                                }
+                                /* Recompute free space relative to the (possibly capped) size, clamping at zero */
+                                if (disk_size != UINT64_MAX) {
+                                        if (disk_size > disk_usage)
+                                                disk_free = disk_size - disk_usage;
+                                        else
+                                                disk_free = 0;
+                                }
+                        }
+                        /* It is a subvolume: whether or not the quota query worked, skip the UID quota lookup */
+                        goto finish;
+                }
+        }
+        /* Otherwise consult the per-UID disk quota; other storage types only get the statfs() numbers */
+        if (IN_SET(h->record->storage, USER_CLASSIC, USER_DIRECTORY, USER_FSCRYPT)) {
+                r = quotactl_path(QCMD_FIXED(Q_GETQUOTA, USRQUOTA), path, h->uid, &req);
+                if (r < 0) {
+                        if (ERRNO_IS_NOT_SUPPORTED(r)) {
+                                log_debug_errno(r, "No UID quota support on %s.", path);
+                                goto finish;
+                        }
+
+                        if (r != -ESRCH) {
+                                log_debug_errno(r, "Failed to query disk quota for UID " UID_FMT ": %m", h->uid);
+                                goto finish;
+                        }
+
+                        disk_usage = 0; /* No record of this user? then nothing was used */
+                } else {
+                        if (FLAGS_SET(req.dqb_valid, QIF_SPACE) && disk_free != UINT64_MAX) {
+                                disk_ceiling = req.dqb_curspace + disk_free;
+
+                                if (disk_size != UINT64_MAX && disk_ceiling > disk_size)
+                                        disk_ceiling = disk_size;
+                        }
+
+                        if (FLAGS_SET(req.dqb_valid, QIF_BLIMITS)) {
+                                uint64_t q;
+
+                                /* Take the minimum of the quota and the available disk space here */
+                                q = req.dqb_bhardlimit * QIF_DQBLKSIZE;
+                                if (disk_size != UINT64_MAX)
+                                        disk_size = MIN(disk_size, q);
+                                else
+                                        disk_size = q;
+                        }
+                        if (FLAGS_SET(req.dqb_valid, QIF_SPACE)) {
+                                disk_usage = req.dqb_curspace;
+
+                                if (disk_size != UINT64_MAX) {
+                                        if (disk_size > disk_usage)
+                                                disk_free = disk_size - disk_usage;
+                                        else
+                                                disk_free = 0;
+                                }
+                        }
+                }
+        }
+        /* Note: disk_floor and access_mode are never determined here and are returned at their initial values */
+finish:
+        if (ret_disk_size)
+                *ret_disk_size = disk_size;
+        if (ret_disk_usage)
+                *ret_disk_usage = disk_usage;
+        if (ret_disk_free)
+                *ret_disk_free = disk_free;
+        if (ret_disk_ceiling)
+                *ret_disk_ceiling = disk_ceiling;
+        if (ret_disk_floor)
+                *ret_disk_floor = disk_floor;
+        if (ret_fstype)
+                *ret_fstype = fstype;
+        if (ret_access_mode)
+                *ret_access_mode = access_mode;
+
+        return 0;
+}
+
+static int home_get_disk_status_internal(
+                Home *h,
+                HomeState state,
+                uint64_t *ret_disk_size,
+                uint64_t *ret_disk_usage,
+                uint64_t *ret_disk_free,
+                uint64_t *ret_disk_ceiling,
+                uint64_t *ret_disk_floor,
+                statfs_f_type_t *ret_fstype,
+                mode_t *ret_access_mode) {
+
+        /* Routes the query to the storage-specific backend. For storage types without a backend every
+         * requested output is set to its "unknown" value and 0 is returned. */
+
+        assert(h);
+        assert(h->record);
+
+        if (h->record->storage == USER_LUKS)
+                return home_get_disk_status_luks(
+                                h, state,
+                                ret_disk_size, ret_disk_usage, ret_disk_free, ret_disk_ceiling, ret_disk_floor,
+                                ret_fstype, ret_access_mode);
+
+        if (IN_SET(h->record->storage, USER_CLASSIC, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT, USER_CIFS))
+                return home_get_disk_status_directory(
+                                h, state,
+                                ret_disk_size, ret_disk_usage, ret_disk_free, ret_disk_ceiling, ret_disk_floor,
+                                ret_fstype, ret_access_mode);
+
+        /* Anything else: we don't know */
+        if (ret_disk_size)
+                *ret_disk_size = UINT64_MAX;
+        if (ret_disk_usage)
+                *ret_disk_usage = UINT64_MAX;
+        if (ret_disk_free)
+                *ret_disk_free = UINT64_MAX;
+        if (ret_disk_ceiling)
+                *ret_disk_ceiling = UINT64_MAX;
+        if (ret_disk_floor)
+                *ret_disk_floor = UINT64_MAX;
+        if (ret_fstype)
+                *ret_fstype = 0;
+        if (ret_access_mode)
+                *ret_access_mode = MODE_INVALID;
+
+        return 0;
+}
+
+int home_get_disk_status(
+                Home *h,
+                uint64_t *ret_disk_size,
+                uint64_t *ret_disk_usage,
+                uint64_t *ret_disk_free,
+                uint64_t *ret_disk_ceiling,
+                uint64_t *ret_disk_floor,
+                statfs_f_type_t *ret_fstype,
+                mode_t *ret_access_mode) {
+
+        HomeState state;
+
+        /* Same as home_get_disk_status_internal(), evaluated for the state the home is in right now. */
+
+        assert(h);
+
+        state = home_get_state(h);
+
+        return home_get_disk_status_internal(
+                        h, state,
+                        ret_disk_size, ret_disk_usage, ret_disk_free, ret_disk_ceiling, ret_disk_floor,
+                        ret_fstype, ret_access_mode);
+}
+
+int home_augment_status(
+                Home *h,
+                UserRecordLoadFlags flags,
+                UserRecord **ret) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *record_json = NULL, *status_section = NULL, *machine_section = NULL, *fresh = NULL;
+        uint64_t disk_size = UINT64_MAX, disk_usage = UINT64_MAX, disk_free = UINT64_MAX, disk_ceiling = UINT64_MAX, disk_floor = UINT64_MAX;
+        _cleanup_(user_record_unrefp) UserRecord *augmented = NULL;
+        const char *fstype, *machine_id;
+        statfs_f_type_t magic;
+        mode_t access_mode;
+        HomeState state;
+        sd_id128_t id;
+        int r;
+
+        /* Builds a copy of the home's user record in which the "status" entry for the local machine ID is
+         * refreshed with the current state and disk numbers, and returns the loaded copy in *ret. */
+        assert(h);
+        assert(ret);
+        assert(!FLAGS_SET(flags, USER_RECORD_STRIP_STATUS)); /* we add status here, stripping it can't be on */
+
+        r = sd_id128_get_machine(&id);
+        if (r < 0)
+                return r;
+
+        machine_id = SD_ID128_TO_STRING(id);
+        state = home_get_state(h);
+
+        r = home_get_disk_status_internal(
+                        h, state,
+                        &disk_size, &disk_usage, &disk_free, &disk_ceiling, &disk_floor,
+                        &magic, &access_mode);
+        if (r < 0)
+                return r;
+
+        fstype = fs_type_to_string(magic);
+
+        /* Sanitize the boundaries: the floor is at least the current usage and at least the global
+         * minimum, the ceiling at most the global maximum. */
+        if (disk_floor == UINT64_MAX || (disk_usage != UINT64_MAX && disk_usage > disk_floor))
+                disk_floor = disk_usage;
+        if (disk_floor == UINT64_MAX || disk_floor < USER_DISK_SIZE_MIN)
+                disk_floor = USER_DISK_SIZE_MIN;
+        if (disk_ceiling == UINT64_MAX || disk_ceiling > USER_DISK_SIZE_MAX)
+                disk_ceiling = USER_DISK_SIZE_MAX;
+
+        r = json_build(&fresh,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("state", JSON_BUILD_STRING(home_state_to_string(state))),
+                                       JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.Home")),
+                                       JSON_BUILD_PAIR_CONDITION(disk_size != UINT64_MAX, "diskSize", JSON_BUILD_UNSIGNED(disk_size)),
+                                       JSON_BUILD_PAIR_CONDITION(disk_usage != UINT64_MAX, "diskUsage", JSON_BUILD_UNSIGNED(disk_usage)),
+                                       JSON_BUILD_PAIR_CONDITION(disk_free != UINT64_MAX, "diskFree", JSON_BUILD_UNSIGNED(disk_free)),
+                                       JSON_BUILD_PAIR_CONDITION(disk_ceiling != UINT64_MAX, "diskCeiling", JSON_BUILD_UNSIGNED(disk_ceiling)),
+                                       JSON_BUILD_PAIR_CONDITION(disk_floor != UINT64_MAX, "diskFloor", JSON_BUILD_UNSIGNED(disk_floor)),
+                                       JSON_BUILD_PAIR_CONDITION(h->signed_locally >= 0, "signedLocally", JSON_BUILD_BOOLEAN(h->signed_locally)),
+                                       JSON_BUILD_PAIR_CONDITION(fstype, "fileSystemType", JSON_BUILD_STRING(fstype)),
+                                       JSON_BUILD_PAIR_CONDITION(access_mode != MODE_INVALID, "accessMode", JSON_BUILD_UNSIGNED(access_mode))
+                       ));
+        if (r < 0)
+                return r;
+
+        /* Take the record's status object for this machine, filter the listed fields, merge in the fresh ones */
+        record_json = json_variant_ref(h->record->json);
+        status_section = json_variant_ref(json_variant_by_key(record_json, "status"));
+        machine_section = json_variant_ref(json_variant_by_key(status_section, machine_id));
+
+        r = json_variant_filter(&machine_section, STRV_MAKE("diskSize", "diskUsage", "diskFree", "diskCeiling", "diskFloor", "signedLocally"));
+        if (r < 0)
+                return r;
+
+        r = json_variant_merge_object(&machine_section, fresh);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&status_section, machine_id, machine_section);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&record_json, "status", status_section);
+        if (r < 0)
+                return r;
+
+        augmented = user_record_new();
+        if (!augmented)
+                return -ENOMEM;
+
+        r = user_record_load(augmented, record_json, flags);
+        if (r < 0)
+                return r;
+
+        /* Mark the copy incomplete if the source record has the privileged mask bit but the copy lacks it */
+        augmented->incomplete =
+                !FLAGS_SET(augmented->mask, USER_RECORD_PRIVILEGED) &&
+                FLAGS_SET(h->record->mask, USER_RECORD_PRIVILEGED);
+
+        *ret = TAKE_PTR(augmented);
+        return 0;
+}
+
+static int on_home_ref_eof(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Home *h = ASSERT_PTR(userdata);
+        _cleanup_(operation_unrefp) Operation *eof_op = NULL;
+
+        /* A reference FIFO source fired: drop it and, if no such source is left, queue a PIPE_EOF operation. */
+        assert(s);
+
+        if (s == h->ref_event_source_please_suspend)
+                h->ref_event_source_please_suspend = sd_event_source_disable_unref(h->ref_event_source_please_suspend);
+
+        if (s == h->ref_event_source_dont_suspend)
+                h->ref_event_source_dont_suspend = sd_event_source_disable_unref(h->ref_event_source_dont_suspend);
+
+        if (h->ref_event_source_please_suspend || h->ref_event_source_dont_suspend)
+                return 0; /* the other FIFO is still being watched */
+
+        log_info("Got notification that all sessions of user %s ended, deactivating automatically.", h->user_name);
+
+        eof_op = operation_new(OPERATION_PIPE_EOF, NULL);
+        if (eof_op)
+                home_schedule_operation(h, eof_op, NULL);
+        else
+                log_oom();
+
+        return 0;
+}
+
+int home_create_fifo(Home *h, bool please_suspend) {
+        _cleanup_close_ int ret_fd = -EBADF;
+        sd_event_source **ss;
+        const char *fn, *suffix;
+        int r;
+
+        /* Makes sure the reference FIFO's read side is watched; returns a new write-side fd or a negative errno. */
+        assert(h);
+
+        suffix = please_suspend ? ".please-suspend" : ".dont-suspend";
+        ss = please_suspend ? &h->ref_event_source_please_suspend : &h->ref_event_source_dont_suspend;
+
+        fn = strjoina("/run/systemd/home/", h->user_name, suffix);
+
+        if (!*ss) {
+                _cleanup_close_ int ref_fd = -EBADF;
+                _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL;
+
+                (void) mkdir("/run/systemd/home/", 0755);
+                if (mkfifo(fn, 0600) < 0 && errno != EEXIST)
+                        return log_error_errno(errno, "Failed to create FIFO %s: %m", fn);
+
+                ref_fd = open(fn, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
+                if (ref_fd < 0)
+                        return log_error_errno(errno, "Failed to open FIFO %s for reading: %m", fn);
+
+                /* Set the source up in a local and publish it in *ss only once fully configured: if a later
+                 * step fails it is released again, instead of lingering while ref_fd gets closed under it. */
+                r = sd_event_add_io(h->manager->event, &source, ref_fd, 0, on_home_ref_eof, h);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate reference FIFO event source: %m");
+
+                (void) sd_event_source_set_description(source, "acquire-ref");
+
+                r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_IDLE-1);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set priority of reference FIFO event source: %m");
+
+                r = sd_event_source_set_io_fd_own(source, true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to pass ownership of FIFO event fd to event source: %m");
+
+                TAKE_FD(ref_fd); /* the event source owns the fd from here on */
+                *ss = TAKE_PTR(source);
+        }
+
+        ret_fd = open(fn, O_WRONLY|O_CLOEXEC|O_NONBLOCK);
+        if (ret_fd < 0)
+                return log_error_errno(errno, "Failed to open FIFO %s for writing: %m", fn);
+
+        return TAKE_FD(ret_fd);
+}
+
+static int home_dispatch_acquire(Home *h, Operation *o) {
+        int (*handler)(Home *h, UserRecord *secret, HomeState for_state, sd_bus_error *error) = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        HomeState target;
+        int r;
+
+        /* Tries to start a queued "acquire" request. Returns 0 if the request has to stay pending, and 1
+         * once it has been answered or installed as the home's current operation. */
+        assert(h);
+        assert(o);
+        assert(o->type == OPERATION_ACQUIRE);
+
+        switch (home_get_state(h)) {
+        case HOME_UNFIXATED:
+                handler = home_fixate_internal;
+                target = HOME_FIXATING_FOR_ACQUIRE;
+                break;
+
+        case HOME_ABSENT:
+                r = sd_bus_error_setf(&error, BUS_ERROR_HOME_ABSENT,
+                                      "Home %s is currently missing or not plugged in.", h->user_name);
+                break;
+
+        case HOME_INACTIVE:
+        case HOME_DIRTY:
+                handler = home_activate_internal;
+                target = HOME_ACTIVATING_FOR_ACQUIRE;
+                break;
+
+        case HOME_ACTIVE:
+        case HOME_LINGERING:
+                handler = home_authenticate_internal;
+                target = HOME_AUTHENTICATING_FOR_ACQUIRE;
+                break;
+
+        case HOME_LOCKED:
+                handler = home_unlock_internal;
+                target = HOME_UNLOCKING_FOR_ACQUIRE;
+                break;
+
+        default:
+                /* Some other operation is executing right now, hence this one stays queued */
+                return 0;
+        }
+
+        if (handler) { /* not set for HOME_ABSENT, where r already carries the error */
+                assert(!h->current_operation);
+                r = home_ratelimit(h, &error);
+                if (r >= 0)
+                        r = handler(h, o->secret, target, &error);
+        }
+
+        if (r == 0) /* ongoing */
+                h->current_operation = operation_ref(o);
+        else /* failure or completed */
+                operation_result(o, r, &error);
+
+        return 1;
+}
+
+static int home_dispatch_release(Home *h, Operation *o) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Tries to execute a queued "release" request. Returns 0 if the request must remain queued, 1 if
+         * it was answered or installed as the home's current operation. */
+        assert(h);
+        assert(o);
+        assert(o->type == OPERATION_RELEASE);
+
+        if (!h->ref_event_source_dont_suspend && !h->ref_event_source_please_suspend) {
+                switch (home_get_state(h)) {
+
+                case HOME_ACTIVE:
+                case HOME_LINGERING:
+                        r = home_deactivate_internal(h, false, &error);
+                        break;
+
+                case HOME_LOCKED:
+                        r = sd_bus_error_setf(&error, BUS_ERROR_HOME_LOCKED, "Home %s is currently locked.", h->user_name);
+                        break;
+
+                case HOME_UNFIXATED:
+                case HOME_ABSENT:
+                case HOME_INACTIVE:
+                case HOME_DIRTY:
+                        r = 1; /* done */
+                        break;
+
+                default:
+                        /* Another operation is in progress, so this request stays in the queue */
+                        return 0;
+                }
+        } else
+                /* There is a reference again, so the release attempt is refused */
+                r = sd_bus_error_setf(&error, BUS_ERROR_HOME_BUSY, "Home %s is currently referenced.", h->user_name);
+
+        assert(!h->current_operation);
+
+        if (r == 0) /* ongoing */
+                h->current_operation = operation_ref(o);
+        else /* failure or completed */
+                operation_result(o, r, &error);
+
+        return 1;
+}
+
+static int home_dispatch_lock_all(Home *h, Operation *o) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r = 1; /* "done", unless we actually have to lock below */
+
+        /* Handles a queued OPERATION_LOCK_ALL request for this home. Returns 0 to keep the request
+         * queued, 1 once it has been answered or became the home's current operation. */
+
+        assert(h);
+        assert(o);
+        assert(o->type == OPERATION_LOCK_ALL);
+
+        switch (home_get_state(h)) {
+
+        case HOME_ACTIVE:
+        case HOME_LINGERING:
+                log_info("Locking home %s.", h->user_name);
+                r = home_lock(h, &error);
+                break;
+
+        case HOME_LOCKED:
+                log_info("Home %s is already locked.", h->user_name);
+                break;
+
+        case HOME_UNFIXATED:
+        case HOME_ABSENT:
+        case HOME_INACTIVE:
+        case HOME_DIRTY:
+                log_info("Home %s is not active, no locking necessary.", h->user_name);
+                break;
+
+        default:
+                /* A different operation is currently executing for this home, leave the request pending */
+                return 0;
+        }
+
+        assert(!h->current_operation);
+
+        if (r == 0) /* ongoing */
+                h->current_operation = operation_ref(o);
+        else /* failure or completed */
+                operation_result(o, r, &error);
+
+        return 1;
+}
+
+static int home_dispatch_deactivate_all(Home *h, Operation *o) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r = 1; /* "done", unless a deactivation is actually started below */
+
+        /* Handles a queued OPERATION_DEACTIVATE_ALL request for this home. Returns 0 to keep the request
+         * queued, 1 once it has been answered or became the home's current operation. */
+
+        assert(h);
+        assert(o);
+        assert(o->type == OPERATION_DEACTIVATE_ALL);
+
+        switch (home_get_state(h)) {
+
+        case HOME_ACTIVE:
+        case HOME_LINGERING:
+                log_info("Deactivating home %s.", h->user_name);
+                r = home_deactivate_internal(h, false, &error);
+                break;
+
+        case HOME_LOCKED:
+                log_info("Home %s is currently locked, not deactivating.", h->user_name);
+                break;
+
+        case HOME_UNFIXATED:
+        case HOME_ABSENT:
+        case HOME_INACTIVE:
+        case HOME_DIRTY:
+                log_info("Home %s is already deactivated.", h->user_name);
+                break;
+
+        default:
+                /* A different operation is currently executing for this home, leave the request pending */
+                return 0;
+        }
+
+        assert(!h->current_operation);
+
+        if (r == 0) /* ongoing */
+                h->current_operation = operation_ref(o);
+        else /* failure or completed */
+                operation_result(o, r, &error);
+
+        return 1;
+}
+
+static int home_dispatch_pipe_eof(Home *h, Operation *o) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Handles a queued OPERATION_PIPE_EOF (automatic deactivation) request. Returns 1 when the request
+         * is finished with (a deactivation failure is only logged), 0 when it has to stay queued. */
+        assert(h);
+        assert(o);
+        assert(o->type == OPERATION_PIPE_EOF);
+
+        if (h->ref_event_source_please_suspend || h->ref_event_source_dont_suspend)
+                return 1; /* Hmm, there's a reference again, let's cancel this */
+
+        switch (home_get_state(h)) {
+        case HOME_UNFIXATED:
+        case HOME_ABSENT:
+        case HOME_INACTIVE:
+        case HOME_DIRTY:
+                log_info("Home %s already deactivated, no automatic deactivation needed.", h->user_name);
+                break;
+
+        case HOME_DEACTIVATING:
+                log_info("Home %s is already being deactivated, automatic deactivation unnecessary.", h->user_name);
+                break;
+
+        case HOME_ACTIVE:
+        case HOME_LINGERING:
+                r = home_deactivate_internal(h, false, &error);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to deactivate %s, ignoring: %s", h->user_name, bus_error_message(&error, r));
+                break;
+
+        case HOME_LOCKED:
+        default:
+                /* If the device is locked or any operation is being executed, let's leave this pending */
+                return 0;
+        }
+
+        /* Note that we don't call operation_fail() or operation_success() here, because this kind of
+         * operation has no message associated with it, and thus there's no need to propagate success. */
+        assert(!o->message);
+        return 1;
+}
+
+static int home_dispatch_deactivate_force(Home *h, Operation *o) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        /* Handles a queued OPERATION_DEACTIVATE_FORCE request. Returns 0 if it has to stay queued,
+         * otherwise 1; a failing deactivation is merely logged. */
+        assert(h);
+        assert(o);
+        assert(o->type == OPERATION_DEACTIVATE_FORCE);
+
+        switch (home_get_state(h)) {
+
+        case HOME_ACTIVE:
+        case HOME_LINGERING:
+        case HOME_LOCKED:
+                r = home_deactivate_internal(h, true, &error);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to forcibly deactivate %s, ignoring: %s", h->user_name, bus_error_message(&error, r));
+                break;
+
+        case HOME_DEACTIVATING:
+                log_debug("Home %s is already being deactivated, forced deactivation due to unplug unnecessary.", h->user_name);
+                break;
+
+        case HOME_UNFIXATED:
+        case HOME_ABSENT:
+        case HOME_INACTIVE:
+        case HOME_DIRTY:
+                log_debug("Home %s already deactivated, no forced deactivation due to unplug needed.", h->user_name);
+                break;
+
+        default:
+                /* Something else is being executed at the moment, so this request remains pending */
+                return 0;
+        }
+
+        /* This kind of operation carries no message, hence there is no result to propagate to anyone */
+        assert(!o->message);
+        return 1;
+}
+
+static int on_pending(sd_event_source *s, void *userdata) {
+        static int (* const dispatchers[_OPERATION_MAX])(Home *h, Operation *o) = {
+                [OPERATION_ACQUIRE]          = home_dispatch_acquire,
+                [OPERATION_RELEASE]          = home_dispatch_release,
+                [OPERATION_LOCK_ALL]         = home_dispatch_lock_all,
+                [OPERATION_DEACTIVATE_ALL]   = home_dispatch_deactivate_all,
+                [OPERATION_PIPE_EOF]         = home_dispatch_pipe_eof,
+                [OPERATION_DEACTIVATE_FORCE] = home_dispatch_deactivate_force,
+        };
+        Home *h = ASSERT_PTR(userdata);
+        Operation *head;
+        int r;
+
+        /* Event callback: hands the first queued operation to its dispatcher. The source is left enabled
+         * whenever an operation was consumed, and is switched off otherwise. */
+
+        assert(s);
+
+        head = ordered_set_first(h->pending_operations);
+        if (head) {
+                assert(dispatchers[head->type]);
+                if (dispatchers[head->type](h, head) != 0) {
+                        /* The dispatcher is done with it: drop it from the queue, keep the source enabled */
+                        assert_se(ordered_set_remove(h->pending_operations, head) == head);
+                        operation_unref(head);
+                        return 0;
+                }
+        }
+
+        /* Either nothing is queued or the first entry has to wait, so turn this event source off */
+        r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable event source: %m");
+
+        /* Nothing more to dispatch right now, maybe this is a good time to trigger a rebalancing */
+        manager_reschedule_rebalance(h->manager);
+        return 0;
+}
+
+int home_schedule_operation(Home *h, Operation *o, sd_bus_error *error) {
+        int r;
+
+        /* Queues o (if non-NULL) and enables the deferred dispatcher; 0 on success, negative errno otherwise. */
+        assert(h);
+
+        if (o) {
+                if (ordered_set_size(h->pending_operations) >= PENDING_OPERATIONS_MAX)
+                        return sd_bus_error_set(error, BUS_ERROR_TOO_MANY_OPERATIONS, "Too many client operations requested");
+                /* NOTE(review): relies on ordered_set_ensure_put() returning 0 for an entry that is already queued */
+                r = ordered_set_ensure_put(&h->pending_operations, &operation_hash_ops, o);
+                if (r < 0)
+                        return r;
+                if (r > 0) /* newly queued: take the queue's reference; a second one would never be dropped */
+                        operation_ref(o);
+        }
+
+        if (!h->pending_event_source) {
+                r = sd_event_add_defer(h->manager->event, &h->pending_event_source, on_pending, h);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate pending defer event source: %m");
+
+                (void) sd_event_source_set_description(h->pending_event_source, "pending");
+                r = sd_event_source_set_priority(h->pending_event_source, SD_EVENT_PRIORITY_IDLE);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_event_source_set_enabled(h->pending_event_source, SD_EVENT_ON);
+        if (r < 0)
+                return log_error_errno(r, "Failed to trigger pending event source: %m");
+
+        return 0;
+}
+
+static int home_get_image_path_seat(Home *h, char **ret) {
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        const char *image, *seat_name;
+        struct stat st;
+        char *copy;
+        int r;
+
+        /* Looks up the ID_SEAT property (falling back to "seat0") of the block device a LUKS home's image path
+         * refers to, and stores a newly allocated copy in *ret. -ENXIO unless that path is below /dev/. */
+
+        assert(h);
+
+        if (user_record_storage(h->record) != USER_LUKS)
+                return -ENXIO;
+
+        image = user_record_image_path(h->record);
+        if (!image || !path_startswith(image, "/dev/"))
+                return -ENXIO;
+
+        if (stat(image, &st) < 0)
+                return -errno;
+        if (!S_ISBLK(st.st_mode))
+                return -ENOTBLK;
+
+        r = sd_device_new_from_stat_rdev(&dev, &st);
+        if (r < 0)
+                return r;
+
+        r = sd_device_get_property_value(dev, "ID_SEAT", &seat_name);
+        if (r < 0) {
+                if (r != -ENOENT)
+                        return r;
+                seat_name = "seat0"; /* no property means seat0 */
+        }
+
+        copy = strdup(seat_name);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+/* Collects the seats this home shall be automatically logged in on. Returns 1 and a NULL-terminated
+ * list in *ret_seats if there are any, 0 and NULL if there are none, and -ENOMEM on allocation failure. */
+int home_auto_login(Home *h, char ***ret_seats) {
+        _cleanup_free_ char *device_seat = NULL, *default_seat = NULL;
+        _cleanup_strv_free_ char **seats = NULL;
+        size_t n = 0;
+
+        assert(h);
+        assert(ret_seats);
+
+        /* Seat of the backing block device, if there is one; failures are deliberately ignored */
+        (void) home_get_image_path_seat(h, &device_seat);
+
+        /* For now, when the auto-login boolean is set for a user, let's make it mean "seat0". Eventually
+         * we can extend the concept and allow configuration of any kind of seat, but let's keep it simple
+         * initially, most likely the feature is interesting on single-user systems anyway, only.
+         *
+         * We filter out users marked for auto-login if we know for sure their home directory is absent. */
+        if (h->record->auto_login > 0 &&
+            !streq_ptr(device_seat, "seat0") &&
+            user_record_test_image_path(h->record) != USER_TEST_ABSENT) {
+                default_seat = strdup("seat0");
+                if (!default_seat)
+                        return -ENOMEM;
+        }
+
+        if (!device_seat && !default_seat) {
+                *ret_seats = NULL;
+                return 0;
+        }
+
+        /* At most two entries, plus the terminating NULL */
+        seats = new(char*, 3);
+        if (!seats)
+                return -ENOMEM;
+
+        if (device_seat)
+                seats[n++] = TAKE_PTR(device_seat);
+        if (default_seat)
+                seats[n++] = TAKE_PTR(default_seat);
+        seats[n] = NULL;
+
+        *ret_seats = TAKE_PTR(seats);
+        return 1;
+}
+
+/* Makes m the message of a new immediate current operation: 0 if m is NULL, 1 if set, else -EBUSY/-ENOMEM. */
+int home_set_current_message(Home *h, sd_bus_message *m) {
+        Operation *op;
+
+        assert(h);
+
+        if (!m)
+                return 0;
+        if (h->current_operation)
+                return -EBUSY;
+
+        op = operation_new(OPERATION_IMMEDIATE, m);
+        h->current_operation = op;
+        return op ? 1 : -ENOMEM;
+}
+
+/* Waits (with a 30s timeout) for the home's worker process, then forgets its PID. Returns 0 if there was no worker, 1 otherwise. */
+int home_wait_for_worker(Home *h) {
+        pid_t pid = ASSERT_PTR(h)->worker_pid;
+        int r;
+
+        if (pid <= 0)
+                return 0;
+
+        log_info("Worker process for home %s is still running while exiting. Waiting for it to finish.", h->user_name);
+
+        r = wait_for_terminate_with_timeout(pid, 30 * USEC_PER_SEC);
+        if (r == -ETIMEDOUT)
+                log_warning_errno(r, "Waiting for worker process for home %s timed out. Ignoring.", h->user_name);
+        else if (r < 0)
+                log_warning_errno(r, "Failed to wait for worker process for home %s. Ignoring.", h->user_name);
+
+        (void) hashmap_remove_value(h->manager->homes_by_worker_pid, PID_TO_PTR(pid), h);
+        h->worker_pid = 0;
+        return 1;
+}
+
+/* Determines if the home directory is a candidate for rebalancing.
+ *
+ * Two conditions have to hold:
+ *   1. the user record asks for rebalancing (user_record_shall_rebalance()), and
+ *   2. the current state of the home permits it (HOME_STATE_SHALL_REBALANCE()).
+ *
+ * The state is only queried if the first condition
+ * holds. */
+bool home_shall_rebalance(Home *h) {
+        assert(h);
+
+        if (!user_record_shall_rebalance(h->record))
+                return false;
+
+        return HOME_STATE_SHALL_REBALANCE(home_get_state(h));
+}
+
+/* A home is busy if an operation is being processed, if operations are queued for it, or if its state
+ * says an operation is executing. The state is only looked at if the first two checks found nothing. */
+bool home_is_busy(Home *h) {
+        bool has_work;
+
+        assert(h);
+
+        has_work = h->current_operation || !ordered_set_isempty(h->pending_operations);
+
+        return has_work || HOME_STATE_IS_EXECUTING_OPERATION(home_get_state(h));
+}
+
+static const char* const home_state_table[_HOME_STATE_MAX] = { /* Maps each HomeState value to its string name */
+        [HOME_UNFIXATED]                   = "unfixated",
+        [HOME_ABSENT]                      = "absent",
+        [HOME_INACTIVE]                    = "inactive",
+        [HOME_DIRTY]                       = "dirty",
+        [HOME_FIXATING]                    = "fixating",
+        [HOME_FIXATING_FOR_ACTIVATION]     = "fixating-for-activation",
+        [HOME_FIXATING_FOR_ACQUIRE]        = "fixating-for-acquire",
+        [HOME_ACTIVATING]                  = "activating",
+        [HOME_ACTIVATING_FOR_ACQUIRE]      = "activating-for-acquire",
+        [HOME_DEACTIVATING]                = "deactivating",
+        [HOME_ACTIVE]                      = "active",
+        [HOME_LINGERING]                   = "lingering",
+        [HOME_LOCKING]                     = "locking",
+        [HOME_LOCKED]                      = "locked",
+        [HOME_UNLOCKING]                   = "unlocking",
+        [HOME_UNLOCKING_FOR_ACQUIRE]       = "unlocking-for-acquire",
+        [HOME_CREATING]                    = "creating",
+        [HOME_REMOVING]                    = "removing",
+        [HOME_UPDATING]                    = "updating",
+        [HOME_UPDATING_WHILE_ACTIVE]       = "updating-while-active",
+        [HOME_RESIZING]                    = "resizing",
+        [HOME_RESIZING_WHILE_ACTIVE]       = "resizing-while-active",
+        [HOME_PASSWD]                      = "passwd",
+        [HOME_PASSWD_WHILE_ACTIVE]         = "passwd-while-active",
+        [HOME_AUTHENTICATING]              = "authenticating",
+        [HOME_AUTHENTICATING_WHILE_ACTIVE] = "authenticating-while-active",
+        [HOME_AUTHENTICATING_FOR_ACQUIRE]  = "authenticating-for-acquire",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(home_state, HomeState); /* presumably generates home_state_to_string()/home_state_from_string() as declared in homed-home.h — confirm */
diff --git a/src/home/homed-home.h b/src/home/homed-home.h
new file mode 100644
index 0000000..0f314aa
--- /dev/null
+++ b/src/home/homed-home.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Home Home;
+
+#include "homed-manager.h"
+#include "homed-operation.h"
+#include "list.h"
+#include "ordered-set.h"
+#include "stat-util.h"
+#include "user-record.h"
+
+typedef enum HomeState { /* The states a home can be in; each one has a string name in home_state_table[] */
+        HOME_UNFIXATED,               /* home exists, but local record does not */
+        HOME_ABSENT,                  /* local record exists, but home does not */
+        HOME_INACTIVE,                /* record and home exist, but the user is not logged in */
+        HOME_DIRTY,                   /* like HOME_INACTIVE, but the home directory wasn't cleanly deactivated */
+        HOME_FIXATING,                /* generating local record from home */
+        HOME_FIXATING_FOR_ACTIVATION, /* fixating in order to activate soon */
+        HOME_FIXATING_FOR_ACQUIRE,    /* fixating because Acquire() was called */
+        HOME_ACTIVATING,
+        HOME_ACTIVATING_FOR_ACQUIRE,  /* activating because Acquire() was called */
+        HOME_DEACTIVATING,
+        HOME_ACTIVE,                  /* logged in right now */
+        HOME_LINGERING,               /* not logged in anymore, but we didn't manage to deactivate (because some process keeps it busy?) but we'll keep trying */
+        HOME_LOCKING,
+        HOME_LOCKED,
+        HOME_UNLOCKING,
+        HOME_UNLOCKING_FOR_ACQUIRE,   /* unlocking because Acquire() was called */
+        HOME_CREATING,
+        HOME_REMOVING,
+        HOME_UPDATING,
+        HOME_UPDATING_WHILE_ACTIVE,
+        HOME_RESIZING,
+        HOME_RESIZING_WHILE_ACTIVE,
+        HOME_PASSWD,
+        HOME_PASSWD_WHILE_ACTIVE,
+        HOME_AUTHENTICATING,
+        HOME_AUTHENTICATING_WHILE_ACTIVE,
+        HOME_AUTHENTICATING_FOR_ACQUIRE,  /* authenticating because Acquire() was called */
+        _HOME_STATE_MAX,              /* number of real states; used as the size of home_state_table[] */
+        _HOME_STATE_INVALID = -EINVAL, /* negative sentinel, not a real state */
+} HomeState;
+
+/* Whether the state is one of those counted as active: HOME_ACTIVE, HOME_LINGERING, and the operation states listed below */
+static inline bool HOME_STATE_IS_ACTIVE(HomeState state) {
+        return state == HOME_ACTIVE ||
+               state == HOME_LINGERING ||
+               state == HOME_UPDATING_WHILE_ACTIVE ||
+               state == HOME_RESIZING_WHILE_ACTIVE ||
+               state == HOME_PASSWD_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_FOR_ACQUIRE;
+}
+
+/* Whether the state is one of the transitional ones, i.e. those that name an operation currently being executed */
+static inline bool HOME_STATE_IS_EXECUTING_OPERATION(HomeState state) {
+        return state == HOME_FIXATING ||
+               state == HOME_FIXATING_FOR_ACTIVATION ||
+               state == HOME_FIXATING_FOR_ACQUIRE ||
+               state == HOME_ACTIVATING ||
+               state == HOME_ACTIVATING_FOR_ACQUIRE ||
+               state == HOME_DEACTIVATING ||
+               state == HOME_LOCKING ||
+               state == HOME_UNLOCKING ||
+               state == HOME_UNLOCKING_FOR_ACQUIRE ||
+               state == HOME_CREATING ||
+               state == HOME_REMOVING ||
+               state == HOME_UPDATING ||
+               state == HOME_UPDATING_WHILE_ACTIVE ||
+               state == HOME_RESIZING ||
+               state == HOME_RESIZING_WHILE_ACTIVE ||
+               state == HOME_PASSWD ||
+               state == HOME_PASSWD_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING ||
+               state == HOME_AUTHENTICATING_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_FOR_ACQUIRE;
+}
+
+/* The states in which the home shall be pinned. Like HOME_STATE_IS_ACTIVE() – but HOME_LINGERING is
+ * missing! */
+static inline bool HOME_STATE_SHALL_PIN(HomeState state) {
+        return state == HOME_ACTIVE ||
+               state == HOME_UPDATING_WHILE_ACTIVE ||
+               state == HOME_RESIZING_WHILE_ACTIVE ||
+               state == HOME_PASSWD_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_FOR_ACQUIRE;
+}
+
+#define HOME_STATE_SHALL_REBALANCE(state) HOME_STATE_SHALL_PIN(state) /* rebalancing applies to exactly the states in which the home is pinned */
+
+/* Indicates when to leave the deactivate retry timer active: true for each of the states listed
+ * below, false for every other state. */
+static inline bool HOME_STATE_MAY_RETRY_DEACTIVATE(HomeState state) {
+        return state == HOME_ACTIVE ||
+               state == HOME_LINGERING ||
+               state == HOME_DEACTIVATING ||
+               state == HOME_LOCKING ||
+               state == HOME_UNLOCKING ||
+               state == HOME_UNLOCKING_FOR_ACQUIRE ||
+               state == HOME_UPDATING_WHILE_ACTIVE ||
+               state == HOME_RESIZING_WHILE_ACTIVE ||
+               state == HOME_PASSWD_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_WHILE_ACTIVE ||
+               state == HOME_AUTHENTICATING_FOR_ACQUIRE;
+}
+
+struct Home { /* Per-home runtime object; looked up via the manager's homes_by_name/homes_by_uid hashmaps */
+        Manager *manager;
+        char *user_name;
+        uid_t uid;
+
+        char *sysfs; /* When found via plugged in device, the sysfs path to it */
+
+        /* Note that the 'state' field is only set to a state while we are doing something (i.e. activating,
+         * deactivating, creating, removing, and such), or when the home is an "unfixated" one. When we are
+         * done with an operation we invalidate the state. This is a hint for home_get_state() to check the
+         * state on request as needed from the mount table and similar. */
+        HomeState state;
+        int signed_locally; /* signed only by us */
+
+        UserRecord *record;
+
+        pid_t worker_pid; /* PID of the worker process; <= 0 if none is running (see home_wait_for_worker()) */
+        int worker_stdout_fd;
+        sd_event_source *worker_event_source;
+        int worker_error_code;
+
+        /* The message we are currently processing, and thus need to reply to on completion */
+        Operation *current_operation;
+
+        /* Stores the raw, plaintext passwords, but only for short periods of time */
+        UserRecord *secret;
+
+        /* When we create a home area and that fails, we should possibly unregister the record altogether
+         * again, which is remembered in this boolean. */
+        bool unregister_on_failure;
+
+        /* The reading side of a FIFO stored in /run/systemd/home/, the writing side being used for reference
+         * counting. The reference count has dropped to zero as soon as we see EOF. This concept exists twice:
+         * once for clients that are fine if we suspend the home directory on system suspend, and once for clients
+         * that are not ok with that. This allows us to determine for each home whether there are any clients
+         * that support unsuspend. */
+        sd_event_source *ref_event_source_please_suspend;
+        sd_event_source *ref_event_source_dont_suspend;
+
+        /* Any pending operations we still need to execute. These are for operations we want to queue if we
+         * can't execute them right-away. */
+        OrderedSet *pending_operations;
+
+        /* A defer event source that processes pending acquire/release/eof events. We have a common
+         * dispatcher that processes all three kinds of events. */
+        sd_event_source *pending_event_source;
+
+        /* Did we send out a D-Bus notification about this entry? */
+        bool announced;
+
+        /* Used to coalesce bus PropertiesChanged events */
+        sd_event_source *deferred_change_event_source;
+
+        /* An fd to the top-level home directory we keep while logged in, to keep the dir busy */
+        int pin_fd;
+
+        /* A time event used to repeatedly try to unmount home dir after use if it didn't work on first try */
+        sd_event_source *retry_deactivate_event_source;
+
+        /* An fd that locks the backing file of LUKS home dirs with a BSD lock. */
+        int luks_lock_fd;
+
+        /* Space metrics during rebalancing */
+        uint64_t rebalance_size, rebalance_usage, rebalance_free, rebalance_min, rebalance_weight, rebalance_goal;
+
+        /* Whether a rebalance operation is pending */
+        bool rebalance_pending;
+};
+
+int home_new(Manager *m, UserRecord *hr, const char *sysfs, Home **ret);
+Home *home_free(Home *h); /* callers reset their pointer with the return value: h = home_free(h) */
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Home*, home_free); /* provides home_freep() for use with _cleanup_() */
+
+int home_set_record(Home *h, UserRecord *hr);
+int home_save_record(Home *h);
+int home_unlink_record(Home *h);
+
+int home_fixate(Home *h, UserRecord *secret, sd_bus_error *error);
+int home_activate(Home *h, UserRecord *secret, sd_bus_error *error);
+int home_authenticate(Home *h, UserRecord *secret, sd_bus_error *error);
+int home_deactivate(Home *h, bool force, sd_bus_error *error);
+int home_create(Home *h, UserRecord *secret, sd_bus_error *error);
+int home_remove(Home *h, sd_bus_error *error);
+int home_update(Home *h, UserRecord *new_record, sd_bus_error *error);
+int home_resize(Home *h, uint64_t disk_size, UserRecord *secret, bool automatic, sd_bus_error *error);
+int home_passwd(Home *h, UserRecord *new_secret, UserRecord *old_secret, sd_bus_error *error);
+int home_unregister(Home *h, sd_bus_error *error);
+int home_lock(Home *h, sd_bus_error *error);
+int home_unlock(Home *h, UserRecord *secret, sd_bus_error *error);
+
+HomeState home_get_state(Home *h);
+
+int home_get_disk_status(Home *h, uint64_t *ret_disk_size,uint64_t *ret_disk_usage, uint64_t *ret_disk_free, uint64_t *ret_disk_ceiling, uint64_t *ret_disk_floor, statfs_f_type_t *ret_fstype, mode_t *ret_access_mode);
+
+void home_process_notify(Home *h, char **l, int fd);
+
+int home_killall(Home *h);
+
+int home_augment_status(Home *h, UserRecordLoadFlags flags, UserRecord **ret);
+
+int home_create_fifo(Home *h, bool please_suspend);
+int home_schedule_operation(Home *h, Operation *o, sd_bus_error *error);
+
+int home_auto_login(Home *h, char ***ret_seats); /* 1 + seat list if there are seats, 0 + NULL if none, negative errno on failure */
+
+int home_set_current_message(Home *h, sd_bus_message *m); /* 0 if m is NULL, 1 if set, -EBUSY if an operation is already current, -ENOMEM on OOM */
+
+int home_wait_for_worker(Home *h); /* 0 if no worker is running, 1 after having waited for it */
+
+bool home_shall_rebalance(Home *h); /* true if both the user record and the current state permit rebalancing */
+
+bool home_is_busy(Home *h); /* true if an operation is current, pending, or indicated by the state */
+
+const char *home_state_to_string(HomeState state);
+HomeState home_state_from_string(const char *s);
diff --git a/src/home/homed-manager-bus.c b/src/home/homed-manager-bus.c
new file mode 100644
index 0000000..7cf5439
--- /dev/null
+++ b/src/home/homed-manager-bus.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-polkit.h"
+#include "format-util.h"
+#include "homed-bus.h"
+#include "homed-home-bus.h"
+#include "homed-manager-bus.h"
+#include "homed-manager.h"
+#include "strv.h"
+#include "user-record-sign.h"
+#include "user-record-util.h"
+#include "user-util.h"
+
+/* Property getter: fills reply with an array of (user name, seat, home bus path) structs, one for
+ * each seat home_auto_login() reports for each home known by name. */
+static int property_get_auto_login(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Manager *manager = ASSERT_PTR(userdata);
+        Home *home;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "(sso)");
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH(home, manager->homes_by_name) {
+                _cleanup_strv_free_ char **seat_list = NULL;
+                _cleanup_free_ char *object_path = NULL;
+
+                r = home_auto_login(home, &seat_list);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to determine whether home '%s' is candidate for auto-login, ignoring: %m", home->user_name);
+                if (r <= 0) /* lookup failed, or no seats for this home: skip it */
+                        continue;
+
+                r = bus_home_path(home, &object_path);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate home bus path: %m");
+
+                STRV_FOREACH(seat, seat_list) {
+                        r = sd_bus_message_append(reply, "(sso)", home->user_name, *seat, object_path);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Bus method handler: replies with UID, state, GID, real name, home directory, shell and bus path of
+ * the home known under the user name read from the message. */
+static int method_get_home_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        _cleanup_free_ char *object_path = NULL;
+        const char *name;
+        UserRecord *rec;
+        Home *home;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "s", &name);
+        if (r < 0)
+                return r;
+        if (!valid_user_group_name(name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User name %s is not valid", name);
+
+        home = hashmap_get(manager->homes_by_name, name);
+        if (!home)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_HOME, "No home for user %s known", name);
+
+        r = bus_home_path(home, &object_path);
+        if (r < 0)
+                return r;
+
+        rec = home->record; /* may be unset, in which case GID_INVALID and NULL strings are passed */
+        return sd_bus_reply_method_return(
+                        message, "usussso",
+                        (uint32_t) home->uid,
+                        home_state_to_string(home_get_state(home)),
+                        rec ? (uint32_t) user_record_gid(rec) : GID_INVALID,
+                        rec ? user_record_real_name(rec) : NULL,
+                        rec ? user_record_home_directory(rec) : NULL,
+                        rec ? user_record_shell(rec) : NULL,
+                        object_path);
+}
+
+/* Bus method handler: replies with user name, state, GID, real name, home directory, shell and bus
+ * path of the home known under the UID read from the message. */
+static int method_get_home_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        _cleanup_free_ char *object_path = NULL;
+        UserRecord *rec;
+        Home *home;
+        uint32_t uid;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "u", &uid);
+        if (r < 0)
+                return r;
+        if (!uid_is_valid(uid))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "UID " UID_FMT " is not valid", uid);
+
+        home = hashmap_get(manager->homes_by_uid, UID_TO_PTR(uid));
+        if (!home)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_HOME, "No home for UID " UID_FMT " known", uid);
+
+        /* Note that we don't use bus_home_path() here, but build the path manually, since if we are queried
+         * for a UID we should also generate the bus path with a UID, and bus_home_path() uses our more
+         * typical bus path by name. */
+        if (asprintf(&object_path, "/org/freedesktop/home1/home/" UID_FMT, home->uid) < 0)
+                return -ENOMEM;
+
+        rec = home->record; /* may be unset, in which case GID_INVALID and NULL strings are passed */
+        return sd_bus_reply_method_return(
+                        message, "ssussso",
+                        home->user_name,
+                        home_state_to_string(home_get_state(home)),
+                        rec ? (uint32_t) user_record_gid(rec) : GID_INVALID,
+                        rec ? user_record_real_name(rec) : NULL,
+                        rec ? user_record_home_directory(rec) : NULL,
+                        rec ? user_record_shell(rec) : NULL,
+                        object_path);
+}
+
+/* Bus method handler: replies with an array that carries one (name, UID, state, GID, real name, home
+ * directory, shell, bus path) struct for each home in the manager's by-UID table. */
+static int method_list_homes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *manager = ASSERT_PTR(userdata);
+        Home *home;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(susussso)");
+        if (r < 0)
+                return r;
+
+        /* Homes that have no record are listed with GID_INVALID and NULL strings */
+        HASHMAP_FOREACH(home, manager->homes_by_uid) {
+                _cleanup_free_ char *object_path = NULL;
+                UserRecord *rec = home->record;
+
+                r = bus_home_path(home, &object_path);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append(
+                                reply, "(susussso)",
+                                home->user_name,
+                                (uint32_t) home->uid,
+                                home_state_to_string(home_get_state(home)),
+                                rec ? (uint32_t) user_record_gid(rec) : GID_INVALID,
+                                rec ? user_record_real_name(rec) : NULL,
+                                rec ? user_record_home_directory(rec) : NULL,
+                                rec ? user_record_shell(rec) : NULL,
+                                object_path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Bus method handler: replies with the JSON user record, an "incomplete" flag and the bus path of the
+ * home known under the user name read from the message. */
+static int method_get_user_record_by_name(
+                sd_bus_message *message,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *record_json = NULL, *object_path = NULL;
+        Manager *manager = ASSERT_PTR(userdata);
+        const char *name;
+        bool incomplete;
+        Home *home;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "s", &name);
+        if (r < 0)
+                return r;
+        if (!valid_user_group_name(name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User name %s is not valid", name);
+
+        home = hashmap_get(manager->homes_by_name, name);
+        if (!home)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_HOME, "No home for user %s known", name);
+
+        /* The serialized record comes with a flag that is passed on to the caller as is */
+        r = bus_home_get_record_json(home, message, &record_json, &incomplete);
+        if (r < 0)
+                return r;
+
+        r = bus_home_path(home, &object_path);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(
+                        message, "sbo", record_json, incomplete, object_path);
+}
+
+/* Bus method handler: replies with the JSON user record, an "incomplete" flag and the UID-based bus
+ * path of the home known under the UID read from the message. */
+static int method_get_user_record_by_uid(
+                sd_bus_message *message,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *record_json = NULL, *object_path = NULL;
+        Manager *manager = ASSERT_PTR(userdata);
+        bool incomplete;
+        uint32_t uid;
+        Home *home;
+        int r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "u", &uid);
+        if (r < 0)
+                return r;
+        if (!uid_is_valid(uid))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "UID " UID_FMT " is not valid", uid);
+
+        home = hashmap_get(manager->homes_by_uid, UID_TO_PTR(uid));
+        if (!home)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_HOME, "No home for UID " UID_FMT " known", uid);
+
+        r = bus_home_get_record_json(home, message, &record_json, &incomplete);
+        if (r < 0)
+                return r;
+
+        /* Queried by UID, hence the bus path is built from the UID, too */
+        if (asprintf(&object_path, "/org/freedesktop/home1/home/" UID_FMT, home->uid) < 0)
+                return -ENOMEM;
+
+        return sd_bus_reply_method_return(
+                        message, "sbo", record_json, incomplete, object_path);
+}
+
+/* Reads a user name from the message, looks up the matching home and passes it as userdata to handler. */
+static int generic_home_method(
+                Manager *m,
+                sd_bus_message *message,
+                sd_bus_message_handler_t handler,
+                sd_bus_error *error) {
+
+        const char *name;
+        Home *home;
+        int r;
+
+        r = sd_bus_message_read(message, "s", &name);
+        if (r < 0)
+                return r;
+        if (!valid_user_group_name(name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "User name %s is not valid", name);
+
+        home = hashmap_get(m->homes_by_name, name);
+        if (!home)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_HOME, "No home for user %s known", name);
+
+        return handler(message, home, error);
+}
+
+static int method_activate_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_activate, error); /* resolve the home by the user name in the message, then dispatch to bus_home_method_activate() */
+}
+
+static int method_deactivate_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_deactivate, error); /* resolve the home by the user name in the message, then dispatch to bus_home_method_deactivate() */
+}
+
+static int validate_and_allocate_home(Manager *m, UserRecord *hr, Home **ret, sd_bus_error *error) { /* checks hr for name/UID conflicts and its signature, then allocates a Home for it in *ret */
+        _cleanup_(user_record_unrefp) UserRecord *signed_hr = NULL;
+        struct passwd *pw;
+        struct group *gr;
+        bool signed_locally;
+        Home *other;
+        int r;
+
+        assert(m);
+        assert(hr);
+        assert(ret);
+
+        r = user_record_is_supported(hr, error);
+        if (r < 0)
+                return r;
+
+        other = hashmap_get(m->homes_by_name, hr->user_name); /* the name must not be taken by a home we already know ... */
+        if (other)
+                return sd_bus_error_setf(error, BUS_ERROR_USER_NAME_EXISTS, "Specified user name %s exists already, refusing.", hr->user_name);
+
+        pw = getpwnam(hr->user_name); /* ... nor by any user in the NSS database ... */
+        if (pw)
+                return sd_bus_error_setf(error, BUS_ERROR_USER_NAME_EXISTS, "Specified user name %s exists in the NSS user database, refusing.", hr->user_name);
+
+        gr = getgrnam(hr->user_name); /* ... nor by a group of the same name */
+        if (gr)
+                return sd_bus_error_setf(error, BUS_ERROR_USER_NAME_EXISTS, "Specified user name %s conflicts with an NSS group by the same name, refusing.", hr->user_name);
+
+        r = manager_verify_user_record(m, hr);
+        switch (r) {
+
+        case USER_RECORD_UNSIGNED:
+                /* If the record is unsigned, then let's sign it with our own key */
+                r = manager_sign_user_record(m, hr, &signed_hr, error);
+                if (r < 0)
+                        return r;
+
+                hr = signed_hr; /* continue with the signed copy; signed_hr's cleanup handler drops our reference on return */
+                _fallthrough_;
+
+        case USER_RECORD_SIGNED_EXCLUSIVE:
+                signed_locally = true;
+                break;
+
+        case USER_RECORD_SIGNED:
+        case USER_RECORD_FOREIGN:
+                signed_locally = false;
+                break;
+
+        case -ENOKEY:
+                return sd_bus_error_setf(error, BUS_ERROR_BAD_SIGNATURE, "Specified user record for %s is signed by a key we don't recognize, refusing.", hr->user_name);
+
+        default: /* any other result is reported as a validation failure */
+                return sd_bus_error_set_errnof(error, r, "Failed to validate signature for '%s': %m", hr->user_name);
+        }
+
+        if (uid_is_valid(hr->uid)) { /* an explicitly requested UID must be free everywhere */
+                other = hashmap_get(m->homes_by_uid, UID_TO_PTR(hr->uid));
+                if (other)
+                        return sd_bus_error_setf(error, BUS_ERROR_UID_IN_USE, "Specified UID " UID_FMT " already in use by home %s, refusing.", hr->uid, other->user_name);
+
+                pw = getpwuid(hr->uid);
+                if (pw)
+                        return sd_bus_error_setf(error, BUS_ERROR_UID_IN_USE, "Specified UID " UID_FMT " already in use by NSS user %s, refusing.", hr->uid, pw->pw_name);
+
+                gr = getgrgid(hr->uid); /* the UID is checked against the GID space, too */
+                if (gr)
+                        return sd_bus_error_setf(error, BUS_ERROR_UID_IN_USE, "Specified UID " UID_FMT " already in use as GID by NSS group %s, refusing.", hr->uid, gr->gr_name);
+        } else { /* no UID given: have the manager add one to the record */
+                r = manager_augment_record_with_uid(m, hr);
+                if (r < 0)
+                        return sd_bus_error_set_errnof(error, r, "Failed to acquire UID for '%s': %m", hr->user_name);
+        }
+
+        r = home_new(m, hr, NULL, ret); /* NULL: no sysfs path */
+        if (r < 0)
+                return r;
+
+        (*ret)->signed_locally = signed_locally;
+        return r; /* home_new()'s non-negative result */
+}
+
+/* Bus method handler: reads a user record from the message, checks polkit authorization for the
+ * "org.freedesktop.home1.create-home" action, then allocates a Home for the record and saves the
+ * record. Replies with an empty method return on success. */
+static int method_register_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *record = NULL;
+        Manager *manager = ASSERT_PTR(userdata);
+        _cleanup_(home_freep) Home *home = NULL;
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_home_record(message, USER_RECORD_LOAD_EMBEDDED|USER_RECORD_PERMISSIVE, &record, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.create-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &manager->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (!r)
+                return 1; /* Will call us back */
+
+        r = validate_and_allocate_home(manager, record, &home, error);
+        if (r < 0)
+                return r;
+
+        r = home_save_record(home);
+        if (r < 0)
+                return r;
+
+        /* Success: disarm the cleanup handler, so that the Home object outlives this function */
+        TAKE_PTR(home);
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_unregister_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_unregister, error); /* resolve the home by the user name in the message, then dispatch to bus_home_method_unregister() */
+}
+
+static int method_create_home( /* Bus method handler: registers a new home for the record in the message and starts creating it */
+                sd_bus_message *message,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Home *h;
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_home_record(message, USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_SECRET|USER_RECORD_ALLOW_PRIVILEGED|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_ALLOW_SIGNATURE, &hr, error);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        message,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.home1.create-home",
+                        NULL,
+                        true,
+                        UID_INVALID,
+                        &m->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = validate_and_allocate_home(m, hr, &h, error); /* h is only initialized if this succeeds */
+        if (r < 0)
+                return r;
+
+        r = home_create(h, hr, error); /* hr is passed in the 'secret' parameter position of home_create() */
+        if (r < 0)
+                goto fail;
+
+        assert(r == 0);
+        h->unregister_on_failure = true; /* if creation fails later on, the record shall possibly be unregistered again (see struct Home) */
+        assert(!h->current_operation);
+
+        r = home_set_current_message(h, message); /* remember the message, so that it is replied to on completion */
+        if (r < 0)
+                return r; /* NOTE(review): unlike the fail path, h is neither unlinked nor freed here — confirm this is intended */
+
+        return 1; /* no reply is sent from here */
+
+fail: /* undo the registration made by validate_and_allocate_home() */
+        (void) home_unlink_record(h);
+        h = home_free(h);
+        return r;
+}
+
+/* D-Bus handler for RealizeHome() ("create $HOME for already registered JSON entry"): forwards to
+ * generic_home_method(), passing bus_home_method_realize as the handler. */
+static int method_realize_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_realize, error);
+}
+
+/* D-Bus handler for RemoveHome() ("remove the JSON record and remove $HOME"): forwards to
+ * generic_home_method(), passing bus_home_method_remove as the handler. */
+static int method_remove_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_remove, error);
+}
+
+/* D-Bus handler for FixateHome() ("investigate $HOME and propagate contained JSON record into our
+ * database"): forwards to generic_home_method(), passing bus_home_method_fixate as the handler. */
+static int method_fixate_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_fixate, error);
+}
+
+/* D-Bus handler for AuthenticateHome() ("just check credentials"): forwards to generic_home_method(),
+ * passing bus_home_method_authenticate as the handler. */
+static int method_authenticate_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_authenticate, error);
+}
+
+/* D-Bus handler for UpdateHome(): reads the user record carried by the message, looks up the home
+ * registered under the record's user name and passes both on to bus_home_method_update_record(). Fails
+ * with BUS_ERROR_NO_SUCH_HOME if no home of that name is known. */
+static int method_update_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Home *h;
+        int r;
+
+        assert(message);
+
+        r = bus_message_read_home_record(
+                        message,
+                        USER_RECORD_REQUIRE_REGULAR|
+                        USER_RECORD_ALLOW_SECRET|
+                        USER_RECORD_ALLOW_PRIVILEGED|
+                        USER_RECORD_ALLOW_PER_MACHINE|
+                        USER_RECORD_ALLOW_SIGNATURE|
+                        USER_RECORD_PERMISSIVE,
+                        &hr,
+                        error);
+        if (r < 0)
+                return r;
+
+        assert(hr->user_name);
+
+        h = hashmap_get(m->homes_by_name, hr->user_name);
+        if (h)
+                return bus_home_method_update_record(h, message, hr, error);
+
+        return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_HOME, "No home for user %s known", hr->user_name);
+}
+
+/* D-Bus handler for ResizeHome(): forwards to generic_home_method(), passing bus_home_method_resize as
+ * the handler. */
+static int method_resize_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_resize, error);
+}
+
+/* D-Bus handler for ChangePasswordHome(): forwards to generic_home_method(), passing
+ * bus_home_method_change_password as the handler. */
+static int method_change_password_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_change_password, error);
+}
+
+/* D-Bus handler for LockHome() ("prepare active home for system suspend: flush out passwords, suspend
+ * access"): forwards to generic_home_method(), passing bus_home_method_lock as the handler. */
+static int method_lock_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_lock, error);
+}
+
+/* D-Bus handler for UnlockHome() ("make $HOME usable after system resume again"): forwards to
+ * generic_home_method(), passing bus_home_method_unlock as the handler. */
+static int method_unlock_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_unlock, error);
+}
+
+/* D-Bus handler for AcquireHome(), one of the ref-counted activation methods: forwards to
+ * generic_home_method(), passing bus_home_method_acquire as the handler. */
+static int method_acquire_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_acquire, error);
+}
+
+/* D-Bus handler for RefHome(), one of the ref-counted activation methods: forwards to
+ * generic_home_method(), passing bus_home_method_ref as the handler. */
+static int method_ref_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_ref, error);
+}
+
+/* D-Bus handler for ReleaseHome(), one of the ref-counted activation methods: forwards to
+ * generic_home_method(), passing bus_home_method_release as the handler. */
+static int method_release_home(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        return generic_home_method(userdata, message, bus_home_method_release, error);
+}
+
+/* D-Bus handler for LockAllHomes(). This is called from logind when we are preparing for system
+ * suspend: a lock operation is enqueued for every suitable home, and the reply indicating completion is
+ * only sent once all of them have completed. Returns 1 if the reply is deferred, the result of
+ * sd_bus_reply_method_return() if nothing had to be enqueued, or a negative errno-style value. */
+static int method_lock_all_homes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(operation_unrefp) Operation *o = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Home *h;
+        int r;
+
+        HASHMAP_FOREACH(h, m->homes_by_name) {
+
+                /* Only homes that have at least one client referencing them that asked for "please
+                 * suspend", and no client that asked for "please do not suspend", are locked. */
+                if (h->ref_event_source_dont_suspend)
+                        continue;
+                if (!h->ref_event_source_please_suspend)
+                        continue;
+
+                /* The operation object is shared by all homes, and allocated when the first suitable
+                 * home shows up. */
+                if (!o) {
+                        o = operation_new(OPERATION_LOCK_ALL, message);
+                        if (!o)
+                                return -ENOMEM;
+                }
+
+                log_info("Automatically locking home of user %s.", h->user_name);
+
+                r = home_schedule_operation(h, o, error);
+                if (r < 0)
+                        return r;
+        }
+
+        /* No operation object means no home qualified: nothing to wait for, reply right away. */
+        if (!o)
+                return sd_bus_reply_method_return(message, NULL);
+
+        /* At least one lock operation was enqueued, let's leave here without a reply: it will be sent
+         * as soon as the last of the lock operations completed. */
+        return 1;
+}
+
+/* D-Bus handler for DeactivateAllHomes(). This is called from systemd-homed-activate.service's ExecStop=
+ * command to ensure that all home directories are shut down before the system goes down. Note that we
+ * don't do this from systemd-homed.service itself since we want to allow restarting of it without
+ * tearing down all home directories. Returns 1 if the reply is deferred, the result of
+ * sd_bus_reply_method_return() if there was no home to process, or a negative errno-style value. */
+static int method_deactivate_all_homes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        _cleanup_(operation_unrefp) Operation *o = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Home *h;
+        int r;
+
+        HASHMAP_FOREACH(h, m->homes_by_name) {
+
+                /* The operation object is shared by all homes, and allocated on the first iteration. */
+                if (!o) {
+                        o = operation_new(OPERATION_DEACTIVATE_ALL, message);
+                        if (!o)
+                                return -ENOMEM;
+                }
+
+                log_info("Automatically deactivating home of user %s.", h->user_name);
+
+                r = home_schedule_operation(h, o, error);
+                if (r < 0)
+                        return r;
+        }
+
+        /* No operation object means there was no home at all: reply right away. */
+        if (!o)
+                return sd_bus_reply_method_return(message, NULL);
+
+        /* At least one deactivation operation was enqueued, let's leave here without a reply: it will
+         * be sent as soon as the last of the deactivation operations completed. */
+        return 1;
+}
+
+/* D-Bus handler for Rebalance(): schedules an immediate rebalance. If manager_schedule_rebalance()
+ * returns 0 the caller gets a BUS_ERROR_REBALANCE_NOT_NEEDED error reply; otherwise the message is
+ * stored in m->rebalance_queued_method_calls and 1 is returned without replying. */
+static int method_rebalance(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        r = manager_schedule_rebalance(m, /* immediately= */ true);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return sd_bus_reply_method_errorf(message, BUS_ERROR_REBALANCE_NOT_NEEDED, "No home directories need rebalancing.");
+
+        /* Track the message so that it can be replied to once the rebalance is done. The set entry is
+         * backed by the extra reference taken below. */
+        r = set_ensure_put(&m->rebalance_queued_method_calls, &bus_message_hash_ops, message);
+        if (r < 0)
+                return log_error_errno(r, "Failed to track rebalance bus message: %m");
+
+        sd_bus_message_ref(message);
+        return 1;
+}
+
+/* Property and method table of the manager bus object (referenced by manager_object below). Each method
+ * entry lists the bus method name, its argument and result signatures, the method_*() handler in this
+ * file that implements it, and sd-bus vtable flags. Entries carrying secrets or user records are marked
+ * SD_BUS_VTABLE_SENSITIVE; see sd_bus_add_object_vtable(3) for the exact meaning of the flags. */
+static const sd_bus_vtable manager_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+
+        SD_BUS_PROPERTY("AutoLogin", "a(sso)", property_get_auto_login, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+
+        /* Look-up of a single home or user record by user name or UID, and listing of all homes */
+        SD_BUS_METHOD_WITH_ARGS("GetHomeByName",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_RESULT("u", uid,
+                                              "s", home_state,
+                                              "u", gid,
+                                              "s", real_name,
+                                              "s", home_directory,
+                                              "s", shell,
+                                              "o", bus_path),
+                                method_get_home_by_name,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetHomeByUID",
+                                SD_BUS_ARGS("u", uid),
+                                SD_BUS_RESULT("s", user_name,
+                                              "s", home_state,
+                                              "u", gid,
+                                              "s", real_name,
+                                              "s", home_directory,
+                                              "s", shell,
+                                              "o", bus_path),
+                                method_get_home_by_uid,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetUserRecordByName",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_RESULT("s", user_record, "b", incomplete, "o", bus_path),
+                                method_get_user_record_by_name,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("GetUserRecordByUID",
+                                SD_BUS_ARGS("u", uid),
+                                SD_BUS_RESULT("s", user_record, "b", incomplete, "o", bus_path),
+                                method_get_user_record_by_uid,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("ListHomes",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("a(susussso)", home_areas),
+                                method_list_homes,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* The following methods directly execute an operation on a home area, without ref-counting, queueing
+         * or anything, and are accessible through homectl. */
+        SD_BUS_METHOD_WITH_ARGS("ActivateHome",
+                                SD_BUS_ARGS("s", user_name, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                method_activate_home,
+                                SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("DeactivateHome",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_NO_RESULT,
+                                method_deactivate_home,
+                                0),
+
+        /* Add the JSON record to homed, but don't create actual $HOME */
+        SD_BUS_METHOD_WITH_ARGS("RegisterHome",
+                                SD_BUS_ARGS("s", user_record),
+                                SD_BUS_NO_RESULT,
+                                method_register_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* Remove the JSON record from homed, but don't remove actual $HOME */
+        SD_BUS_METHOD_WITH_ARGS("UnregisterHome",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_NO_RESULT,
+                                method_unregister_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* Add JSON record, and create $HOME for it */
+        SD_BUS_METHOD_WITH_ARGS("CreateHome",
+                                SD_BUS_ARGS("s", user_record),
+                                SD_BUS_NO_RESULT,
+                                method_create_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Create $HOME for already registered JSON entry */
+        SD_BUS_METHOD_WITH_ARGS("RealizeHome",
+                                SD_BUS_ARGS("s", user_name, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                method_realize_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Remove the JSON record and remove $HOME */
+        SD_BUS_METHOD_WITH_ARGS("RemoveHome",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_NO_RESULT,
+                                method_remove_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        /* Investigate $HOME and propagate contained JSON record into our database */
+        SD_BUS_METHOD_WITH_ARGS("FixateHome",
+                                SD_BUS_ARGS("s", user_name, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                method_fixate_home,
+                                SD_BUS_VTABLE_SENSITIVE),
+
+        /* Just check credentials */
+        SD_BUS_METHOD_WITH_ARGS("AuthenticateHome",
+                                SD_BUS_ARGS("s", user_name, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                method_authenticate_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Update the JSON record of existing user */
+        SD_BUS_METHOD_WITH_ARGS("UpdateHome",
+                                SD_BUS_ARGS("s", user_record),
+                                SD_BUS_NO_RESULT,
+                                method_update_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Resize a home; the size argument is a 64-bit unsigned integer ("t") */
+        SD_BUS_METHOD_WITH_ARGS("ResizeHome",
+                                SD_BUS_ARGS("s", user_name, "t", size, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                method_resize_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Change the password of a home; takes both the new and the old secret */
+        SD_BUS_METHOD_WITH_ARGS("ChangePasswordHome",
+                                SD_BUS_ARGS("s", user_name, "s", new_secret, "s", old_secret),
+                                SD_BUS_NO_RESULT,
+                                method_change_password_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+
+        /* Prepare active home for system suspend: flush out passwords, suspend access */
+        SD_BUS_METHOD_WITH_ARGS("LockHome",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_NO_RESULT,
+                                method_lock_home,
+                                0),
+
+        /* Make $HOME usable after system resume again */
+        SD_BUS_METHOD_WITH_ARGS("UnlockHome",
+                                SD_BUS_ARGS("s", user_name, "s", secret),
+                                SD_BUS_NO_RESULT,
+                                method_unlock_home,
+                                SD_BUS_VTABLE_SENSITIVE),
+
+        /* The following methods implement ref-counted activation, and are what the PAM module and "homectl
+         * with" use. In contrast to the methods above which fail if an operation is already being executed
+         * on a home directory, these ones will queue the request, and are thus more reliable. Moreover,
+         * they are a bit smarter: AcquireHome() will fixate, activate, unlock, or authenticate depending on
+         * the state of the home area, so that the end result is always the same (i.e. the home directory is
+         * accessible), and we always validate the specified passwords. RefHome() will not authenticate, and
+         * thus only works if the home area is already active. */
+        SD_BUS_METHOD_WITH_ARGS("AcquireHome",
+                                SD_BUS_ARGS("s", user_name, "s", secret, "b", please_suspend),
+                                SD_BUS_RESULT("h", send_fd),
+                                method_acquire_home,
+                                SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_SENSITIVE),
+        SD_BUS_METHOD_WITH_ARGS("RefHome",
+                                SD_BUS_ARGS("s", user_name, "b", please_suspend),
+                                SD_BUS_RESULT("h", send_fd),
+                                method_ref_home,
+                                0),
+        SD_BUS_METHOD_WITH_ARGS("ReleaseHome",
+                                SD_BUS_ARGS("s", user_name),
+                                SD_BUS_NO_RESULT,
+                                method_release_home,
+                                0),
+
+        /* Operations that act on all homes that allow it, plus the explicit rebalance request. None of
+         * them takes arguments or returns a result. */
+        SD_BUS_METHOD("LockAllHomes", NULL, NULL, method_lock_all_homes, 0),
+        SD_BUS_METHOD("DeactivateAllHomes", NULL, NULL, method_deactivate_all_homes, 0),
+        SD_BUS_METHOD("Rebalance", NULL, NULL, method_rebalance, 0),
+
+        SD_BUS_VTABLE_END
+};
+
+/* Description of the manager bus object: the "/org/freedesktop/home1" path and the
+ * "org.freedesktop.home1.Manager" interface name (the same pair that is used when emitting the AutoLogin
+ * property change further down), the vtable defined above, and home_object as child implementation. */
+const BusObjectImplementation manager_object = {
+        "/org/freedesktop/home1",
+        "org.freedesktop.home1.Manager",
+        .vtables = BUS_VTABLES(manager_vtable),
+        .children = BUS_IMPLEMENTATIONS(&home_object),
+};
+
+/* Event source callback installed by bus_manager_emit_auto_login_changed(): releases the defer event
+ * source and emits the PropertiesChanged signal for the "AutoLogin" property. A failure to emit is only
+ * logged; the callback always returns 0. */
+static int on_deferred_auto_login(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        int r;
+
+        /* Drop the event source first, so that the next change can queue a fresh one */
+        m->deferred_auto_login_event_source = sd_event_source_disable_unref(m->deferred_auto_login_event_source);
+
+        r = sd_bus_emit_properties_changed(
+                        m->bus,
+                        "/org/freedesktop/home1",
+                        "org.freedesktop.home1.Manager",
+                        "AutoLogin", NULL);
+        if (r >= 0)
+                return 0;
+
+        log_warning_errno(r, "Failed to send AutoLogin property change event, ignoring: %m");
+        return 0;
+}
+
+/* Queues a deferred notification that the "AutoLogin" property changed. Returns 0 if nothing was queued
+ * (a notification is already pending, there is no event loop, or the loop is finished or exiting), 1 if
+ * a new defer event source was installed, and a negative errno-style value if that failed. */
+int bus_manager_emit_auto_login_changed(Manager *m) {
+        int r, state;
+
+        assert(m);
+
+        /* Already pending, or no event loop to queue on? Then there is nothing to do. */
+        if (m->deferred_auto_login_event_source || !m->event)
+                return 0;
+
+        state = sd_event_get_state(m->event);
+        if (state == SD_EVENT_FINISHED || state == SD_EVENT_EXITING)
+                return 0;
+
+        r = sd_event_add_defer(m->event, &m->deferred_auto_login_event_source, on_deferred_auto_login, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate auto login event source: %m");
+
+        /* Priority and description are best-effort; a failure does not undo the queued notification */
+        r = sd_event_source_set_priority(m->deferred_auto_login_event_source, SD_EVENT_PRIORITY_IDLE+10);
+        if (r < 0)
+                log_warning_errno(r, "Failed to tweak priority of event source, ignoring: %m");
+
+        (void) sd_event_source_set_description(m->deferred_auto_login_event_source, "deferred-auto-login");
+        return 1;
+}
diff --git a/src/home/homed-manager-bus.h b/src/home/homed-manager-bus.h
new file mode 100644
index 0000000..7db29fa
--- /dev/null
+++ b/src/home/homed-manager-bus.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "bus-util.h"
+
+extern const BusObjectImplementation manager_object;
diff --git a/src/home/homed-manager.c b/src/home/homed-manager.c
new file mode 100644
index 0000000..c452531
--- /dev/null
+++ b/src/home/homed-manager.c
@@ -0,0 +1,2224 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-id128.h"
+
+#include "btrfs-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-log-control-api.h"
+#include "bus-polkit.h"
+#include "clean-ipc.h"
+#include "common-signal.h"
+#include "conf-files.h"
+#include "device-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "gpt.h"
+#include "home-util.h"
+#include "homed-conf.h"
+#include "homed-home-bus.h"
+#include "homed-home.h"
+#include "homed-manager-bus.h"
+#include "homed-manager.h"
+#include "homed-varlink.h"
+#include "io-util.h"
+#include "mkdir.h"
+#include "openssl-util.h"
+#include "process-util.h"
+#include "quota-util.h"
+#include "random-util.h"
+#include "resize-fs.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "sync-util.h"
+#include "tmpfile-util.h"
+#include "udev-util.h"
+#include "user-record-sign.h"
+#include "user-record-util.h"
+#include "user-record.h"
+#include "user-util.h"
+#include "varlink-io.systemd.UserDatabase.h"
+
+/* Where to look for private/public keys that are used to sign the user records. We are not using
+ * CONF_PATHS_NULSTR() here since we want to insert /var/lib/systemd/home/ in the middle. And we insert that
+ * since we want to auto-generate a persistent private/public key pair if we need to.
+ *
+ * The value is a single string literal in which every directory is followed by an embedded NUL byte;
+ * the literal's own terminating NUL ends the list. */
+#define KEY_PATHS_NULSTR                        \
+        "/etc/systemd/home/\0"                  \
+        "/run/systemd/home/\0"                  \
+        "/var/lib/systemd/home/\0"              \
+        "/usr/local/lib/systemd/home/\0"        \
+        "/usr/lib/systemd/home/\0"
+
+/* Returns true if the UID lies within the range HOME_UID_MIN…HOME_UID_MAX (both bounds inclusive). */
+static bool uid_is_home(uid_t uid) {
+        if (uid < HOME_UID_MIN)
+                return false;
+
+        return uid <= HOME_UID_MAX;
+}
+
+/* Takes a value generated randomly or by hashing and turns it into a UID in the right range: the value
+ * is cast to uid_t, reduced modulo the size of the range and offset by HOME_UID_MIN, so the result lies
+ * within HOME_UID_MIN…HOME_UID_MAX (both inclusive). */
+#define UID_CLAMP_INTO_HOME_RANGE(rnd) (((uid_t) (rnd) % (HOME_UID_MAX - HOME_UID_MIN + 1)) + HOME_UID_MIN)
+
+/* Hash operations for the four Home look-up tables created in manager_new(). The UID and worker PID
+ * tables use the trivial hash/compare functions, the name table compares strings, and the sysfs table
+ * compares paths. All four name home_free() as value destructor.
+ * NOTE(review): presumably home_free() removes the object from the other tables too, so that a Home
+ * listed in several tables is freed only once; confirm in homed-home.c. */
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(homes_by_uid_hash_ops, void, trivial_hash_func, trivial_compare_func, Home, home_free);
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(homes_by_name_hash_ops, char, string_hash_func, string_compare_func, Home, home_free);
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(homes_by_worker_pid_hash_ops, void, trivial_hash_func, trivial_compare_func, Home, home_free);
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(homes_by_sysfs_hash_ops, char, path_hash_func, path_compare, Home, home_free);
+
+static int on_home_inotify(sd_event_source *s, const struct inotify_event *event, void *userdata);
+static int manager_gc_images(Manager *m);
+static int manager_enumerate_images(Manager *m);
+static int manager_assess_image(Manager *m, int dir_fd, const char *dir_path, const char *dentry_name);
+static void manager_revalidate_image(Manager *m, Home *h);
+
+/* (Re-)establishes the inotify watch on the home root directory (get_home_root()) and updates
+ * m->scan_slash_home accordingly. Any previous watch is dropped first. Automatic scanning stays disabled
+ * if the directory cannot be statfs()ed, or if it is on a network file system or on autofs. All failures
+ * are logged and otherwise ignored; nothing is returned to the caller. */
+static void manager_watch_home(Manager *m) {
+        struct statfs sfs;
+        int r;
+
+        assert(m);
+
+        m->inotify_event_source = sd_event_source_disable_unref(m->inotify_event_source);
+        m->scan_slash_home = false;
+
+        if (statfs(get_home_root(), &sfs) < 0) {
+                log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+                               "Failed to statfs() %s directory, disabling automatic scanning: %m", get_home_root());
+                return;
+        }
+
+        if (is_network_fs(&sfs)) {
+                log_info("%s is a network file system, disabling automatic scanning.", get_home_root());
+                return;
+        }
+
+        if (is_fs_type(&sfs, AUTOFS_SUPER_MAGIC)) {
+                log_info("%s is on autofs, disabling automatic scanning.", get_home_root());
+                return;
+        }
+
+        m->scan_slash_home = true;
+
+        r = sd_event_add_inotify(m->event, &m->inotify_event_source, get_home_root(),
+                                 IN_CREATE|IN_CLOSE_WRITE|IN_DELETE_SELF|IN_MOVE_SELF|IN_ONLYDIR|IN_MOVED_TO|IN_MOVED_FROM|IN_DELETE,
+                                 on_home_inotify, m);
+        if (r < 0) {
+                /* Without a watch there is no event source to describe, and claiming that we are
+                 * watching the directory would be misleading, hence leave here. Scanning itself stays
+                 * enabled, exactly as before. */
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to create inotify watch on %s, ignoring: %m", get_home_root());
+                return;
+        }
+
+        (void) sd_event_source_set_description(m->inotify_event_source, "home-inotify");
+
+        log_info("Watching %s.", get_home_root());
+}
+
+/* inotify callback for the home root directory. Events that invalidate the watch or the whole view
+ * (queue overflow, directory moved/deleted/unmounted, watch removed) recreate the watch and trigger a
+ * full GC and re-enumeration. All other events are only considered if the entry name ends in ".homedir"
+ * (directories) or ".home" (everything else) and the part before the suffix is a suitable user name:
+ * newly appearing or modified entries are assessed, vanished or modified ones cause a known home of
+ * that name to be revalidated. Always returns 0, except on OOM. */
+static int on_home_inotify(sd_event_source *s, const struct inotify_event *event, void *userdata) {
+        _cleanup_free_ char *j = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *e, *n, *suffix;
+        uint32_t mask;
+
+        assert(event);
+
+        mask = event->mask;
+
+        if (mask & (IN_Q_OVERFLOW|IN_MOVE_SELF|IN_DELETE_SELF|IN_IGNORED|IN_UNMOUNT)) {
+
+                if (FLAGS_SET(mask, IN_Q_OVERFLOW))
+                        log_debug("%s inotify queue overflow, rescanning.", get_home_root());
+                else if (FLAGS_SET(mask, IN_MOVE_SELF))
+                        log_info("%s moved or renamed, recreating watch and rescanning.", get_home_root());
+                else if (FLAGS_SET(mask, IN_DELETE_SELF))
+                        log_info("%s deleted, recreating watch and rescanning.", get_home_root());
+                else if (FLAGS_SET(mask, IN_UNMOUNT))
+                        log_info("%s unmounted, recreating watch and rescanning.", get_home_root());
+                else if (FLAGS_SET(mask, IN_IGNORED))
+                        log_info("%s watch invalidated, recreating watch and rescanning.", get_home_root());
+
+                manager_watch_home(m);
+                (void) manager_gc_images(m);
+                (void) manager_enumerate_images(m);
+                (void) bus_manager_emit_auto_login_changed(m);
+                return 0;
+        }
+
+        /* Everything below is about a single directory entry: ignore events without a name, and names
+         * that don't match our expectations. */
+        if (isempty(event->name))
+                return 0;
+
+        suffix = FLAGS_SET(mask, IN_ISDIR) ? ".homedir" : ".home";
+        e = endswith(event->name, suffix);
+        if (!e)
+                return 0;
+
+        /* The part in front of the suffix is the user name (stack allocated copy) */
+        n = strndupa_safe(event->name, e - event->name);
+        if (!suitable_user_name(n))
+                return 0;
+
+        j = path_join(get_home_root(), event->name);
+        if (!j)
+                return log_oom();
+
+        /* Note that IN_CLOSE_WRITE is part of both masks below, i.e. triggers both steps */
+        if (mask & (IN_CREATE|IN_CLOSE_WRITE|IN_MOVED_TO)) {
+                if (FLAGS_SET(mask, IN_CREATE))
+                        log_debug("%s has been created, having a look.", j);
+                else if (FLAGS_SET(mask, IN_CLOSE_WRITE))
+                        log_debug("%s has been modified, having a look.", j);
+                else if (FLAGS_SET(mask, IN_MOVED_TO))
+                        log_debug("%s has been moved in, having a look.", j);
+
+                (void) manager_assess_image(m, -1, get_home_root(), event->name);
+                (void) bus_manager_emit_auto_login_changed(m);
+        }
+
+        if (mask & (IN_DELETE|IN_CLOSE_WRITE|IN_MOVED_FROM)) {
+                Home *h;
+
+                if (FLAGS_SET(mask, IN_DELETE))
+                        log_debug("%s has been deleted, revalidating.", j);
+                else if (FLAGS_SET(mask, IN_CLOSE_WRITE))
+                        log_debug("%s has been closed after writing, revalidating.", j);
+                else if (FLAGS_SET(mask, IN_MOVED_FROM))
+                        log_debug("%s has been moved away, revalidating.", j);
+
+                /* Only homes we already know about need revalidation */
+                h = hashmap_get(m->homes_by_name, n);
+                if (h) {
+                        manager_revalidate_image(m, h);
+                        (void) bus_manager_emit_auto_login_changed(m);
+                }
+        }
+
+        return 0;
+}
+
+/* Allocates and initializes a Manager object: parses the configuration file, acquires the default event
+ * loop, registers signal and memory pressure event sources, enables the event loop watchdog logic and
+ * creates the four (initially empty) Home look-up tables.
+ *
+ * On success the new object is stored in *ret and 0 is returned. On failure a negative errno-style
+ * value is returned, *ret is left untouched, and the partially initialized object is released through
+ * the _cleanup_(manager_freep) attribute on the local variable. */
+int manager_new(Manager **ret) {
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+
+        assert(ret);
+
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        /* All fields not listed here are zero-initialized by the compound literal */
+        *m = (Manager) {
+                .default_storage = _USER_STORAGE_INVALID,
+                .rebalance_interval_usec = 2 * USEC_PER_MINUTE, /* initially, rebalance every 2min */
+        };
+
+        r = manager_parse_config_file(m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_default(&m->event);
+        if (r < 0)
+                return r;
+
+        /* SIGINT and SIGTERM are registered without a callback of our own (NULL handler); see
+         * sd_event_add_signal(3) for what the event loop does in that case. */
+        r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        /* Memory pressure handling is optional: a failure is only logged (at debug level if it is
+         * unsupported, not permitted, or -EHOSTDOWN), never propagated. */
+        r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL);
+        if (r < 0)
+                log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to allocate memory pressure watch, ignoring: %m");
+
+        r = sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_set_watchdog(m->event, true);
+
+        /* The look-up tables, each with its own hash ops defined at the top of this file */
+        m->homes_by_uid = hashmap_new(&homes_by_uid_hash_ops);
+        if (!m->homes_by_uid)
+                return -ENOMEM;
+
+        m->homes_by_name = hashmap_new(&homes_by_name_hash_ops);
+        if (!m->homes_by_name)
+                return -ENOMEM;
+
+        m->homes_by_worker_pid = hashmap_new(&homes_by_worker_pid_hash_ops);
+        if (!m->homes_by_worker_pid)
+                return -ENOMEM;
+
+        m->homes_by_sysfs = hashmap_new(&homes_by_sysfs_hash_ops);
+        if (!m->homes_by_sysfs)
+                return -ENOMEM;
+
+        /* Success: hand ownership to the caller, which also disarms the cleanup handler */
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Releases a Manager object together with the resources referenced from it: bus connection, polkit
+ * registry, device monitor, event sources and event loop, the Home look-up tables, the signing keys, the
+ * Varlink server and the allocated strings. m must not be NULL. Returns the result of mfree(m)
+ * (presumably NULL, matching the "x = foo_free(x)" pattern used for the other objects below).
+ *
+ * NOTE(review): m->rebalance_queued_method_calls, which the Rebalance() bus method fills with referenced
+ * messages, is not released here; confirm that it is always emptied before teardown. */
+Manager* manager_free(Manager *m) {
+        Home *h;
+
+        assert(m);
+
+        /* First, wait for the worker of every home that is still listed in the worker PID table */
+        HASHMAP_FOREACH(h, m->homes_by_worker_pid)
+                (void) home_wait_for_worker(h);
+
+        m->bus = sd_bus_flush_close_unref(m->bus);
+        m->polkit_registry = bus_verify_polkit_async_registry_free(m->polkit_registry);
+
+        m->device_monitor = sd_device_monitor_unref(m->device_monitor);
+
+        m->inotify_event_source = sd_event_source_unref(m->inotify_event_source);
+        m->notify_socket_event_source = sd_event_source_unref(m->notify_socket_event_source);
+        m->deferred_rescan_event_source = sd_event_source_unref(m->deferred_rescan_event_source);
+        m->deferred_gc_event_source = sd_event_source_unref(m->deferred_gc_event_source);
+        m->deferred_auto_login_event_source = sd_event_source_unref(m->deferred_auto_login_event_source);
+        m->rebalance_event_source = sd_event_source_unref(m->rebalance_event_source);
+
+        m->event = sd_event_unref(m->event);
+
+        /* The hash ops of these tables name home_free() as value destructor, hence this is where the
+         * remaining Home objects go away. */
+        m->homes_by_uid = hashmap_free(m->homes_by_uid);
+        m->homes_by_name = hashmap_free(m->homes_by_name);
+        m->homes_by_worker_pid = hashmap_free(m->homes_by_worker_pid);
+        m->homes_by_sysfs = hashmap_free(m->homes_by_sysfs);
+
+        if (m->private_key)
+                EVP_PKEY_free(m->private_key);
+
+        hashmap_free(m->public_keys);
+
+        varlink_server_unref(m->varlink_server);
+        free(m->userdb_service);
+
+        free(m->default_file_system_type);
+
+        return mfree(m);
+}
+
+/* Checks the signature of a user record against the manager's keys. Returns:
+ *
+ *   USER_RECORD_UNSIGNED          the record carries no signature
+ *   USER_RECORD_SIGNED            signed by our own private key, but also by others
+ *   USER_RECORD_SIGNED_EXCLUSIVE  signed by our own private key, and nothing else
+ *   USER_RECORD_FOREIGN           signed by one of the public keys we accept, but not by our own key
+ *   -ENOKEY                       signed, but by none of the keys we know
+ *
+ * Any other value returned by user_record_has_signature() or user_record_verify() is passed through
+ * unmodified. */
+int manager_verify_user_record(Manager *m, UserRecord *hr) {
+        EVP_PKEY *pkey;
+        int r;
+
+        assert(m);
+        assert(hr);
+
+        /* Without any key there is nothing to verify against: all we can tell apart is signed
+         * (which we then have to refuse) and unsigned. */
+        if (!m->private_key && hashmap_isempty(m->public_keys)) {
+                r = user_record_has_signature(hr);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return -ENOKEY;
+
+                return USER_RECORD_UNSIGNED;
+        }
+
+        /* Is it our own? Every result except "not signed by this key" is final and propagated as is. */
+        if (m->private_key) {
+                r = user_record_verify(hr, m->private_key);
+                if (r != USER_RECORD_FOREIGN)
+                        return r;
+        }
+
+        HASHMAP_FOREACH(pkey, m->public_keys) {
+                r = user_record_verify(hr, pkey);
+
+                /* Not signed by this key, but let's see our other keys */
+                if (r == USER_RECORD_FOREIGN)
+                        continue;
+
+                /* Signed by a key we are happy with, but which is not our own */
+                if (r == USER_RECORD_SIGNED || r == USER_RECORD_SIGNED_EXCLUSIVE)
+                        return USER_RECORD_FOREIGN;
+
+                /* Not signed at all, or some other result: propagate */
+                return r;
+        }
+
+        return -ENOKEY;
+}
+
+/* Loads one on-disk identity record and registers (or refreshes) the Home object for it.
+ *
+ * 'name' is the user name derived from the file name; 'fname' is the directory entry to read, relative
+ * to 'dir_fd'. Zero-size files and blank JSON objects are unlinked. Records whose user name does not
+ * match 'name', that are unsigned, or that are signed by no accepted key are refused.
+ *
+ * Returns 1 if a home was added or updated, 0 if the entry was skipped or removed, and a negative
+ * errno-style value on failure. */
+static int manager_add_home_by_record(
+                Manager *m,
+                const char *name,
+                int dir_fd,
+                const char *fname) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+        unsigned line, column;
+        int r, is_signed;
+        struct stat st;
+        Home *h;
+
+        assert(m);
+        assert(name);
+        assert(fname);
+
+        if (fstatat(dir_fd, fname, &st, 0) < 0)
+                return log_error_errno(errno, "Failed to stat identity record %s: %m", fname);
+
+        if (!S_ISREG(st.st_mode)) {
+                log_debug("Identity record file %s is not a regular file, ignoring.", fname);
+                return 0;
+        }
+
+        if (st.st_size == 0)
+                goto unlink_this_file;
+
+        r = json_parse_file_at(NULL, dir_fd, fname, JSON_PARSE_SENSITIVE, &v, &line, &column);
+        if (r < 0) /* Report the failing position as <file>:<line>:<column> */
+                return log_error_errno(r, "Failed to parse identity record at %s:%u:%u: %m", fname, line, column);
+
+        if (json_variant_is_blank_object(v))
+                goto unlink_this_file;
+
+        hr = user_record_new();
+        if (!hr)
+                return log_oom();
+
+        r = user_record_load(hr, v, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_LOG|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        if (!streq_ptr(hr->user_name, name))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Identity's user name %s does not match file name %s, refusing.",
+                                       hr->user_name, name);
+
+        is_signed = manager_verify_user_record(m, hr);
+        switch (is_signed) {
+
+        case -ENOKEY:
+                return log_warning_errno(is_signed, "User record %s is not signed by any accepted key, ignoring.", fname);
+        case USER_RECORD_UNSIGNED:
+                return log_warning_errno(SYNTHETIC_ERRNO(EPERM), "User record %s is not signed at all, ignoring.", fname);
+        case USER_RECORD_SIGNED:
+                log_info("User record %s is signed by us (and others), accepting.", fname);
+                break;
+        case USER_RECORD_SIGNED_EXCLUSIVE:
+                log_info("User record %s is signed only by us, accepting.", fname);
+                break;
+        case USER_RECORD_FOREIGN:
+                log_info("User record %s is signed by registered key from others, accepting.", fname);
+                break;
+        default:
+                assert(is_signed < 0);
+                return log_error_errno(is_signed, "Failed to verify signature of user record in %s: %m", fname);
+        }
+
+        h = hashmap_get(m->homes_by_name, name);
+        if (h) {
+                r = home_set_record(h, hr);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to update home record for %s: %m", name);
+
+                /* If we acquired a record now for a previously unallocated entry, then reset the state. This
+                 * makes sure home_get_state() will check for the availability of the image file dynamically
+                 * in order to distinguish HOME_INACTIVE and HOME_ABSENT. */
+                if (h->state == HOME_UNFIXATED)
+                        h->state = _HOME_STATE_INVALID;
+        } else {
+                r = home_new(m, hr, NULL, &h);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate new home object: %m");
+
+                log_info("Added registered home for user %s.", hr->user_name);
+        }
+
+        /* Only entries we exclusively signed are writable to us, hence remember the result */
+        h->signed_locally = is_signed == USER_RECORD_SIGNED_EXCLUSIVE;
+
+        return 1;
+
+unlink_this_file:
+        /* If this is an empty file, then let's just remove it. An empty file is not useful in any case, and
+         * apparently xfs likes to leave empty files around when not unmounted cleanly (see
+         * https://github.com/systemd/systemd/issues/15178 for example). Note that we don't delete non-empty
+         * files even if they are invalid, because that's just too risky, we might delete data the user still
+         * needs. But empty files are never useful, hence let's just remove them. */
+
+        if (unlinkat(dir_fd, fname, 0) < 0)
+                return log_error_errno(errno, "Failed to remove empty user record file %s: %m", fname);
+
+        log_notice("Discovered empty user record file %s/%s, removed automatically.", home_record_dir(), fname);
+        return 0;
+}
+
+/* Walks the identity record directory and tries to register a home for every "<user>.identity" file
+ * whose prefix is a suitable user name. Failures for individual records are ignored. Returns 0, or the
+ * value produced by the logging helper if the directory cannot be opened or read, or on OOM. */
+static int manager_enumerate_records(Manager *m) {
+        _cleanup_closedir_ DIR *dir = NULL;
+
+        assert(m);
+
+        dir = opendir(home_record_dir());
+        if (!dir)
+                return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR, errno,
+                                      "Failed to open %s: %m", home_record_dir());
+
+        FOREACH_DIRENT(entry, dir, return log_error_errno(errno, "Failed to read record directory: %m")) {
+                _cleanup_free_ char *user_name = NULL;
+                const char *suffix;
+
+                /* Only files carrying the record suffix are of interest */
+                suffix = dirent_is_file(entry) ? endswith(entry->d_name, ".identity") : NULL;
+                if (!suffix)
+                        continue;
+
+                /* Everything in front of the suffix is the user name */
+                user_name = strndup(entry->d_name, suffix - entry->d_name);
+                if (!user_name)
+                        return log_oom();
+
+                if (suitable_user_name(user_name))
+                        (void) manager_add_home_by_record(m, user_name, dirfd(dir), entry->d_name);
+        }
+
+        return 0;
+}
+
+/* Checks whether the specified UID owns any files on the file system, but ignores any file system
+ * backing the specified file. The file is used when operating on home directories, where it's OK if
+ * the UID of them already owns files.
+ *
+ * Returns 1 if quota reports that the UID occupies space or inodes somewhere, 0 if nothing was found
+ * (including when quota could not be queried), and a negative value if stat() on 'exclude_quota_path'
+ * fails with anything but ENOENT. */
+static int search_quota(uid_t uid, const char *exclude_quota_path) {
+        struct stat exclude_st = {};
+        dev_t previous_devno = 0;
+        int r;
+
+        /* NOTE(review): the message below says "ignoring" but the error is returned to the caller —
+         * confirm which of the two is intended. */
+        if (exclude_quota_path && stat(exclude_quota_path, &exclude_st) < 0) {
+                if (errno != ENOENT)
+                        return log_warning_errno(errno, "Failed to stat %s, ignoring: %m", exclude_quota_path);
+        }
+
+        /* Check a few usual suspects where regular users might own files. Note that this is by no means
+         * comprehensive, but should cover most cases. Note that in an ideal world every user would be
+         * registered in NSS and avoid our own UID range, but for all other cases, it's a good idea to be
+         * paranoid and check quota if we can. */
+        FOREACH_STRING(where, get_home_root(), "/tmp/", "/var/", "/var/mail/", "/var/tmp/", "/var/spool/") {
+                struct dqblk req;
+                struct stat st;
+
+                if (stat(where, &st) < 0) {
+                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR, errno,
+                                       "Failed to stat %s, ignoring: %m", where);
+                        continue;
+                }
+
+                if (major(st.st_dev) == 0) {
+                        log_debug("Directory %s is not on a real block device, not checking quota for UID use.", where);
+                        continue;
+                }
+
+                if (st.st_dev == exclude_st.st_dev) { /* If an exclude path is specified, then ignore quota
+                                                       * reported on the same block device as that path. */
+                        log_debug("Directory %s is where the home directory is located, not checking quota for UID use.", where);
+                        continue;
+                }
+
+                if (st.st_dev == previous_devno) { /* Does this directory have the same devno as the previous
+                                                    * one we tested? If so, there's no point in testing this
+                                                    * again. */
+                        log_debug("Directory %s is on same device as previous tested directory, not checking quota for UID use a second time.", where);
+                        continue;
+                }
+
+                previous_devno = st.st_dev;
+
+                r = quotactl_devnum(QCMD_FIXED(Q_GETQUOTA, USRQUOTA), st.st_dev, uid, &req);
+                if (r < 0) {
+                        if (ERRNO_IS_NOT_SUPPORTED(r))
+                                log_debug_errno(r, "No UID quota support on %s, ignoring.", where);
+                        else if (ERRNO_IS_PRIVILEGE(r))
+                                log_debug_errno(r, "UID quota support for %s prohibited, ignoring.", where);
+                        else
+                                log_warning_errno(r, "Failed to query quota on %s, ignoring: %m", where);
+
+                        continue;
+                }
+
+                if ((FLAGS_SET(req.dqb_valid, QIF_SPACE) && req.dqb_curspace > 0) ||
+                    (FLAGS_SET(req.dqb_valid, QIF_INODES) && req.dqb_curinodes > 0)) {
+                        /* This is a regular result, not a failure: there's no error code to attach here,
+                         * and errno holds nothing meaningful at this point. */
+                        log_debug("Quota reports UID " UID_FMT " occupies disk space on %s.", uid, where);
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+
+/* Picks a UID from the home UID range that appears to be unused.
+ *
+ * Candidates are tried in three phases: first 'start_uid' (if it lies in the home range), then a UID
+ * derived from a hash of 'user_name' (if given), then random UIDs. A candidate is rejected if another
+ * home already uses it, if NSS knows a user or group with that ID, if it owns IPC objects, or if
+ * search_quota() reports anything other than 0 for it ('exclude_quota_path' is passed through).
+ *
+ * Returns 0 and stores the UID in '*ret' on success, -EBUSY once the retry budget is used up. */
+static int manager_acquire_uid(
+                Manager *m,
+                uid_t start_uid,
+                const char *user_name,
+                const char *exclude_quota_path,
+                uid_t *ret) {
+
+        /* Fixed key, so that the hashed candidate for a given user name is stable */
+        static const uint8_t hash_key[] = {
+                0xa3, 0xb8, 0x82, 0x69, 0x9a, 0x71, 0xf7, 0xa9,
+                0xe0, 0x7c, 0xf6, 0xf1, 0x21, 0x69, 0xd2, 0x1e
+        };
+
+        enum {
+                PHASE_SUGGESTED,
+                PHASE_HASHED,
+                PHASE_RANDOM
+        } phase = PHASE_SUGGESTED;
+
+        unsigned n_tries = 100;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        for (;;) {
+                struct passwd *pw;
+                struct group *gr;
+                uid_t candidate;
+                Home *other;
+
+                /* The counter is unsigned, hence test for zero explicitly. Note that skipped phases
+                 * consume an iteration too. */
+                if (--n_tries == 0)
+                        return -EBUSY;
+
+                switch (phase) {
+
+                case PHASE_SUGGESTED:
+                        phase = PHASE_HASHED;
+
+                        if (!uid_is_home(start_uid))
+                                continue;
+
+                        candidate = start_uid;
+                        break;
+
+                case PHASE_HASHED:
+                        phase = PHASE_RANDOM;
+
+                        if (!user_name)
+                                continue;
+
+                        candidate = UID_CLAMP_INTO_HOME_RANGE(siphash24(user_name, strlen(user_name), hash_key));
+                        break;
+
+                case PHASE_RANDOM:
+                        random_bytes(&candidate, sizeof(candidate));
+                        candidate = UID_CLAMP_INTO_HOME_RANGE(candidate);
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+
+                other = hashmap_get(m->homes_by_uid, UID_TO_PTR(candidate));
+                if (other) {
+                        log_debug("Candidate UID " UID_FMT " already used by another home directory (%s), let's try another.",
+                                  candidate, other->user_name);
+                        continue;
+                }
+
+                pw = getpwuid(candidate);
+                if (pw) {
+                        log_debug("Candidate UID " UID_FMT " already registered by another user in NSS (%s), let's try another.",
+                                  candidate, pw->pw_name);
+                        continue;
+                }
+
+                gr = getgrgid((gid_t) candidate);
+                if (gr) {
+                        log_debug("Candidate UID " UID_FMT " already registered by another group in NSS (%s), let's try another.",
+                                  candidate, gr->gr_name);
+                        continue;
+                }
+
+                r = search_ipc(candidate, (gid_t) candidate);
+                if (r < 0)
+                        continue;
+                if (r > 0) {
+                        /* A positive result is a "found something" indication, not an error code, hence
+                         * don't format it as one. */
+                        log_debug("Candidate UID " UID_FMT " already owns IPC objects, let's try another.",
+                                  candidate);
+                        continue;
+                }
+
+                r = search_quota(candidate, exclude_quota_path);
+                if (r != 0)
+                        continue;
+
+                *ret = candidate;
+                return 0;
+        }
+}
+
+/* Registers a home for a discovered image by synthesizing a user record for it.
+ *
+ * If a home of that name already exists it is only touched while still in HOME_UNFIXATED state and
+ * only if storage type and location (sysfs path, or image path if neither side has a sysfs path) match
+ * the new image. If no home of that name exists, the name must not be known to NSS as user or group.
+ *
+ * Returns 1 if a home was added or updated, 0 if the image was ignored, negative on failure. */
+static int manager_add_home_by_image(
+                Manager *m,
+                const char *user_name,
+                const char *realm,
+                const char *image_path,
+                const char *sysfs,
+                UserStorage storage,
+                uid_t start_uid) {
+
+        _cleanup_(user_record_unrefp) UserRecord *synthesized = NULL;
+        Home *home;
+        uid_t uid;
+        int r;
+
+        assert(m);
+        assert(user_name);
+        assert(image_path);
+        assert(storage >= 0);
+        assert(storage < _USER_STORAGE_MAX);
+
+        home = hashmap_get(m->homes_by_name, user_name);
+        if (!home) {
+                /* Check NSS, in case there's another user or group by this name */
+                if (getpwnam(user_name) || getgrnam(user_name)) {
+                        log_debug("Found an existing user or group by name '%s', ignoring image '%s'.", user_name, image_path);
+                        return 0;
+                }
+        } else {
+                bool matches = false;
+
+                if (home->state != HOME_UNFIXATED) {
+                        log_debug("Found an image for user %s which already has a record, skipping.", user_name);
+                        return 0; /* ignore images that synthesize a user we already have a record for */
+                }
+
+                if (user_record_storage(home->record) == storage) {
+                        if (home->sysfs && sysfs)
+                                matches = path_equal(home->sysfs, sysfs);
+                        else if (!home->sysfs && !sysfs) {
+                                const char *known_path = user_record_image_path(home->record);
+
+                                matches = known_path && path_equal(known_path, image_path);
+                        }
+                        /* Otherwise only one side has a sysfs path: not the same image. */
+                }
+
+                if (!matches) {
+                        log_debug("Found multiple images for user '%s', ignoring image '%s'.", user_name, image_path);
+                        return 0;
+                }
+        }
+
+        if (!home || !uid_is_valid(home->uid)) {
+                /* Only for the directory-style storages the image path is passed as quota exclusion */
+                r = manager_acquire_uid(m, start_uid, user_name,
+                                        IN_SET(storage, USER_SUBVOLUME, USER_DIRECTORY, USER_FSCRYPT) ? image_path : NULL,
+                                        &uid);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to acquire unused UID for %s: %m", user_name);
+        } else
+                uid = home->uid;
+
+        synthesized = user_record_new();
+        if (!synthesized)
+                return log_oom();
+
+        r = user_record_synthesize(synthesized, user_name, realm, image_path, storage, uid, (gid_t) uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to synthesize home record for %s (image %s): %m", user_name, image_path);
+
+        if (!home) {
+                r = home_new(m, synthesized, sysfs, &home);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate new home object: %m");
+
+                home->state = HOME_UNFIXATED;
+
+                log_info("Discovered new home for user %s through image %s.", user_name, image_path);
+        } else {
+                r = home_set_record(home, synthesized);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to update home record for %s: %m", user_name);
+        }
+
+        return 1;
+}
+
+/* Makes sure the specified user record carries a UID, acquiring a fresh one if necessary and adding it
+ * (with a matching GID) to the record as binding.
+ *
+ * Returns 0 if the record already had a valid UID, 1 if one was added, negative on failure. */
+int manager_augment_record_with_uid(
+                Manager *m,
+                UserRecord *hr) {
+
+        const char *quota_exclusion = NULL;
+        uid_t preferred = UID_INVALID, acquired;
+        int r;
+
+        assert(m);
+        assert(hr);
+
+        if (uid_is_valid(hr->uid))
+                return 0;
+
+        /* For these storage types, if the image already exists and is owned by a UID from the home
+         * range, suggest that UID and exclude the image's file system from the quota check. */
+        if (IN_SET(hr->storage, USER_CLASSIC, USER_SUBVOLUME, USER_DIRECTORY, USER_FSCRYPT)) {
+                const char *image = user_record_image_path(hr);
+                struct stat st;
+
+                if (image) {
+                        if (stat(image, &st) >= 0) {
+                                if (uid_is_home(st.st_uid)) {
+                                        preferred = st.st_uid;
+                                        quota_exclusion = image;
+                                }
+                        } else if (errno != ENOENT)
+                                log_warning_errno(errno, "Failed to stat(%s): %m", image);
+                }
+        }
+
+        r = manager_acquire_uid(m, preferred, hr->user_name, quota_exclusion, &acquired);
+        if (r < 0)
+                return r;
+
+        log_debug("Acquired new UID " UID_FMT " for %s.", acquired, hr->user_name);
+
+        r = user_record_add_binding(
+                        hr,
+                        _USER_STORAGE_INVALID,
+                        NULL,
+                        SD_ID128_NULL,
+                        SD_ID128_NULL,
+                        SD_ID128_NULL,
+                        NULL,
+                        NULL,
+                        UINT64_MAX,
+                        NULL,
+                        NULL,
+                        acquired,
+                        (gid_t) acquired);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Looks at one directory entry below the home root and, if it looks like a home image, registers it.
+ *
+ * Regular files named "*.home" are treated as LUKS images. Directories named "*.homedir" are treated
+ * as btrfs subvolume, fscrypt directory or plain directory, depending on what is detected. The part in
+ * front of the suffix is split into user name and realm. If 'dir_fd' is non-negative the entry is
+ * accessed relative to it, otherwise via the path joined from 'dir_path' and 'dentry_name'.
+ *
+ * Returns 0 if the entry is ignored, the result of manager_add_home_by_image() if it is picked up, and
+ * a negative value on failure. */
+static int manager_assess_image(
+                Manager *m,
+                int dir_fd,
+                const char *dir_path,
+                const char *dentry_name) {
+
+        char *luks_suffix, *directory_suffix;
+        _cleanup_free_ char *path = NULL;
+        struct stat st;
+        int r;
+
+        assert(m);
+        assert(dir_path);
+        assert(dentry_name);
+
+        luks_suffix = endswith(dentry_name, ".home");
+        if (luks_suffix)
+                directory_suffix = NULL;
+        else
+                directory_suffix = endswith(dentry_name, ".homedir");
+
+        /* Early filter out: by name */
+        if (!luks_suffix && !directory_suffix)
+                return 0;
+
+        path = path_join(dir_path, dentry_name);
+        if (!path)
+                return log_oom();
+
+        /* Follow symlinks here, to allow people to link in stuff to make them available locally. */
+        if (dir_fd >= 0)
+                r = fstatat(dir_fd, dentry_name, &st, 0);
+        else
+                r = stat(path, &st);
+        if (r < 0)
+                return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+                                      "Failed to stat() directory entry '%s', ignoring: %m", dentry_name);
+
+        if (S_ISREG(st.st_mode)) {
+                _cleanup_free_ char *n = NULL, *user_name = NULL, *realm = NULL;
+
+                if (!luks_suffix)
+                        return 0;
+
+                n = strndup(dentry_name, luks_suffix - dentry_name);
+                if (!n)
+                        return log_oom();
+
+                r = split_user_name_realm(n, &user_name, &realm);
+                if (r == -EINVAL) /* Not the right format: ignore */
+                        return 0;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to split image name into user name/realm: %m");
+
+                return manager_add_home_by_image(m, user_name, realm, path, NULL, USER_LUKS, UID_INVALID);
+        }
+
+        if (S_ISDIR(st.st_mode)) {
+                _cleanup_free_ char *n = NULL, *user_name = NULL, *realm = NULL;
+                _cleanup_close_ int fd = -EBADF;
+                UserStorage storage;
+
+                if (!directory_suffix)
+                        return 0;
+
+                n = strndup(dentry_name, directory_suffix - dentry_name);
+                if (!n)
+                        return log_oom();
+
+                r = split_user_name_realm(n, &user_name, &realm);
+                if (r == -EINVAL) /* Not the right format: ignore */
+                        return 0;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to split image name into user name/realm: %m");
+
+                if (dir_fd >= 0)
+                        fd = openat(dir_fd, dentry_name, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+                else
+                        fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+                if (fd < 0)
+                        return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+                                              "Failed to open directory '%s', ignoring: %m", path);
+
+                /* Refresh the stat data from the fd we actually opened; its owner is used as suggested
+                 * UID below. */
+                if (fstat(fd, &st) < 0)
+                        return log_warning_errno(errno, "Failed to fstat() %s, ignoring: %m", path);
+
+                assert(S_ISDIR(st.st_mode)); /* Must hold, we used O_DIRECTORY above */
+
+                r = btrfs_is_subvol_fd(fd);
+                if (r < 0) /* The error is carried in the return value here, not in errno */
+                        return log_warning_errno(r, "Failed to determine whether %s is a btrfs subvolume: %m", path);
+                if (r > 0)
+                        storage = USER_SUBVOLUME;
+                else {
+                        struct fscrypt_policy policy;
+
+                        if (ioctl(fd, FS_IOC_GET_ENCRYPTION_POLICY, &policy) < 0) {
+
+                                if (errno == ENODATA)
+                                        log_debug_errno(errno, "Determined %s is not fscrypt encrypted.", path);
+                                else if (ERRNO_IS_NOT_SUPPORTED(errno))
+                                        log_debug_errno(errno, "Determined %s is not fscrypt encrypted because kernel or file system doesn't support it.", path);
+                                else
+                                        log_debug_errno(errno, "FS_IOC_GET_ENCRYPTION_POLICY failed with unexpected error code on %s, ignoring: %m", path);
+
+                                /* In all failure cases fall back to treating this as plain directory */
+                                storage = USER_DIRECTORY;
+                        } else
+                                storage = USER_FSCRYPT;
+                }
+
+                return manager_add_home_by_image(m, user_name, realm, path, NULL, storage, st.st_uid);
+        }
+
+        return 0;
+}
+
+/* Scans the home root directory for home images and hands every directory entry to
+ * manager_assess_image(). Does nothing if m->scan_slash_home is off. Per-entry results are ignored.
+ * Returns 0, or the value produced by the logging helper if the directory cannot be opened or read. */
+int manager_enumerate_images(Manager *m) {
+        _cleanup_closedir_ DIR *d = NULL;
+
+        assert(m);
+
+        if (!m->scan_slash_home)
+                return 0;
+
+        d = opendir(get_home_root());
+        if (!d) /* A missing home root is only logged at debug level */
+                return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR, errno,
+                                      "Failed to open %s: %m", get_home_root());
+
+        FOREACH_DIRENT(de, d, return log_error_errno(errno, "Failed to read %s directory: %m", get_home_root()))
+                (void) manager_assess_image(m, dirfd(d), get_home_root(), de->d_name);
+
+        return 0;
+}
+
+/* Connects to the system bus, registers the manager's bus objects and the log control API, requests
+ * the service name (with $SYSTEMD_HOME_DEBUG_SUFFIX appended, if set) and attaches the connection to
+ * the manager's event loop. Returns 0 on success, negative on failure. */
+static int manager_connect_bus(Manager *m) {
+        _cleanup_free_ char *suffixed_name = NULL;
+        const char *debug_suffix, *name;
+        int r;
+
+        assert(m);
+        assert(!m->bus);
+
+        r = sd_bus_default_system(&m->bus);
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect to system bus: %m");
+
+        r = bus_add_implementation(m->bus, &manager_object, m);
+        if (r < 0)
+                return r;
+
+        r = bus_log_control_api_register(m->bus);
+        if (r < 0)
+                return r;
+
+        debug_suffix = getenv("SYSTEMD_HOME_DEBUG_SUFFIX");
+        if (!debug_suffix)
+                name = "org.freedesktop.home1";
+        else {
+                suffixed_name = strjoin("org.freedesktop.home1.", debug_suffix);
+                if (!suffixed_name)
+                        return log_oom();
+
+                name = suffixed_name;
+        }
+
+        r = sd_bus_request_name_async(m->bus, NULL, name, 0, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to request name: %m");
+
+        r = sd_bus_attach_event(m->bus, m->event, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach bus to event loop: %m");
+
+        (void) sd_bus_set_exit_on_disconnect(m->bus, true);
+
+        return 0;
+}
+
+/* Sets up the Varlink server implementing io.systemd.UserDatabase: allocates it, binds the three
+ * lookup methods, listens on a socket below /run/systemd/userdb/ and attaches it to the event loop.
+ * The socket's file name is remembered in m->userdb_service and exported via $SYSTEMD_BYPASS_USERDB.
+ * Returns 0 on success, negative on failure. */
+static int manager_bind_varlink(Manager *m) {
+        _cleanup_free_ char *p = NULL;
+        const char *suffix, *socket_path;
+        int r;
+
+        assert(m);
+        assert(!m->varlink_server);
+
+        r = varlink_server_new(&m->varlink_server, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate varlink server object: %m");
+
+        varlink_server_set_userdata(m->varlink_server, m);
+
+        r = varlink_server_add_interface(m->varlink_server, &vl_interface_io_systemd_UserDatabase);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add UserDatabase interface to varlink server: %m");
+
+        r = varlink_server_bind_method_many(
+                        m->varlink_server,
+                        "io.systemd.UserDatabase.GetUserRecord",  vl_method_get_user_record,
+                        "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+                        "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships);
+        if (r < 0)
+                return log_error_errno(r, "Failed to register varlink methods: %m");
+
+        (void) mkdir_p("/run/systemd/userdb", 0755);
+
+        /* To make things easier to debug, when working from a homed managed home directory, let's optionally
+         * use a different varlink socket name */
+        suffix = getenv("SYSTEMD_HOME_DEBUG_SUFFIX");
+        if (suffix) {
+                p = strjoin("/run/systemd/userdb/io.systemd.Home.", suffix);
+                if (!p)
+                        return log_oom();
+                socket_path = p;
+        } else
+                socket_path = "/run/systemd/userdb/io.systemd.Home";
+
+        r = varlink_server_listen_address(m->varlink_server, socket_path, 0666);
+        if (r < 0)
+                return log_error_errno(r, "Failed to bind to varlink socket: %m");
+
+        r = varlink_server_attach_event(m->varlink_server, m->event, SD_EVENT_PRIORITY_NORMAL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+        assert(!m->userdb_service);
+        r = path_extract_filename(socket_path, &m->userdb_service);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from socket path '%s': %m", socket_path);
+
+        /* Avoid recursion */
+        if (setenv("SYSTEMD_BYPASS_USERDB", m->userdb_service, 1) < 0) /* setenv() reports the cause in errno */
+                return log_error_errno(errno, "Failed to set $SYSTEMD_BYPASS_USERDB: %m");
+
+        return 0;
+}
+
+/* Reads one datagram from 'fd' without blocking, together with the sender's credentials and at most
+ * one passed file descriptor.
+ *
+ * On success returns 0 and stores in '*ret' a newly allocated, NUL terminated copy of the payload (the
+ * pointer is handed over to the caller), in '*ret_sender' the credentials received via SCM_CREDENTIALS
+ * (or UCRED_INVALID if there were none), and in '*ret_passed_fd' the fd received via SCM_RIGHTS (or a
+ * negative value if there was none). On failure returns a negative value; -EMSGSIZE if the datagram
+ * size differs from what was determined beforehand, or if the SCM_RIGHTS message doesn't carry exactly
+ * one fd. */
+static ssize_t read_datagram(
+                int fd,
+                struct ucred *ret_sender,
+                void **ret,
+                int *ret_passed_fd) {
+
+        /* Room for one credentials structure and one file descriptor */
+        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))) control;
+        _cleanup_free_ void *buffer = NULL;
+        _cleanup_close_ int passed_fd = -EBADF;
+        struct ucred *sender = NULL;
+        struct cmsghdr *cmsg;
+        struct msghdr mh;
+        struct iovec iov;
+        ssize_t n, m;
+
+        assert(fd >= 0);
+        assert(ret_sender);
+        assert(ret);
+        assert(ret_passed_fd);
+
+        n = next_datagram_size_fd(fd);
+        if (n < 0)
+                return n;
+
+        /* Two bytes more than the payload: one for the size check below, one for the trailing NUL */
+        buffer = malloc(n + 2);
+        if (!buffer)
+                return -ENOMEM;
+
+        /* Pass one extra byte, as a size check */
+        iov = IOVEC_MAKE(buffer, n + 1);
+
+        mh = (struct msghdr) {
+                .msg_iov = &iov,
+                .msg_iovlen = 1,
+                .msg_control = &control,
+                .msg_controllen = sizeof(control),
+        };
+
+        m = recvmsg_safe(fd, &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+        if (m < 0)
+                return m;
+
+        /* Ensure the size matches what we determined before */
+        if (m != n) {
+                cmsg_close_all(&mh); /* Don't leak any fds that came along with the datagram */
+                return -EMSGSIZE;
+        }
+
+        CMSG_FOREACH(cmsg, &mh) {
+                if (cmsg->cmsg_level == SOL_SOCKET &&
+                    cmsg->cmsg_type == SCM_CREDENTIALS &&
+                    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+                        assert(!sender);
+                        sender = CMSG_TYPED_DATA(cmsg, struct ucred);
+                }
+
+                if (cmsg->cmsg_level == SOL_SOCKET &&
+                    cmsg->cmsg_type == SCM_RIGHTS) {
+
+                        /* Accept exactly one fd, refuse anything else */
+                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+                                cmsg_close_all(&mh);
+                                return -EMSGSIZE;
+                        }
+
+                        assert(passed_fd < 0);
+                        passed_fd = *CMSG_TYPED_DATA(cmsg, int);
+                }
+        }
+
+        /* 'sender' points into the control buffer on our stack, hence copy the data out */
+        if (sender)
+                *ret_sender = *sender;
+        else
+                *ret_sender = (struct ucred) UCRED_INVALID;
+
+        *ret_passed_fd = TAKE_FD(passed_fd);
+
+        /* For safety reasons: let's always NUL terminate.  */
+        ((char*) buffer)[n] = 0;
+        *ret = TAKE_PTR(buffer);
+
+        return 0;
+}
+
+/* Event loop callback for the notify socket: reads one datagram, looks up the home whose worker
+ * process sent it, and passes the newline-separated payload plus any received fd on to
+ * home_process_notify(). Datagrams without a valid sender PID or from unknown processes are dropped. */
+static int on_notify_socket(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        _cleanup_strv_free_ char **lines = NULL;
+        _cleanup_free_ void *payload = NULL;
+        _cleanup_close_ int received_fd = -EBADF;
+        struct ucred peer = UCRED_INVALID;
+        Manager *m = ASSERT_PTR(userdata);
+        ssize_t k;
+        Home *home;
+
+        assert(s);
+
+        k = read_datagram(fd, &peer, &payload, &received_fd);
+        if (k < 0) {
+                /* Transient conditions are not worth complaining about */
+                if (!ERRNO_IS_TRANSIENT(k))
+                        return log_error_errno(k, "Failed to read notify datagram: %m");
+
+                return 0;
+        }
+
+        if (peer.pid <= 0) {
+                log_warning("Received notify datagram without valid sender PID, ignoring.");
+                return 0;
+        }
+
+        home = hashmap_get(m->homes_by_worker_pid, PID_TO_PTR(peer.pid));
+        if (!home) {
+                log_warning("Received notify datagram of unknown process, ignoring.");
+                return 0;
+        }
+
+        lines = strv_split(payload, "\n");
+        if (!lines)
+                return log_oom();
+
+        /* The received fd is handed over here */
+        home_process_notify(home, lines, TAKE_FD(received_fd));
+        return 0;
+}
+
+/* Creates the AF_UNIX datagram socket on which notifications are received, binds it to
+ * /run/systemd/home/notify (with $SYSTEMD_HOME_DEBUG_SUFFIX appended after a dot, if set), enables
+ * credential passing and registers it as IO event source of the manager's event loop.
+ * Returns the socket fd (now owned by the event source) on success, negative on failure. */
+static int manager_listen_notify(Manager *m) {
+        union sockaddr_union addr = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/home/notify",
+        };
+        _cleanup_close_ int sock = -EBADF;
+        const char *debug_suffix;
+        int r;
+
+        assert(m);
+        assert(!m->notify_socket_event_source);
+
+        debug_suffix = getenv("SYSTEMD_HOME_DEBUG_SUFFIX");
+        if (debug_suffix) {
+                _cleanup_free_ char *alt_path = NULL;
+
+                alt_path = strjoin("/run/systemd/home/notify.", debug_suffix);
+                if (!alt_path)
+                        return log_oom();
+
+                r = sockaddr_un_set_path(&addr.un, alt_path);
+                if (r < 0)
+                        return log_error_errno(r, "Socket path %s does not fit in sockaddr_un: %m", alt_path);
+        }
+
+        sock = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        if (sock < 0)
+                return log_error_errno(errno, "Failed to create listening socket: %m");
+
+        /* Make sure the directory exists and no stale socket is in the way */
+        (void) mkdir_parents(addr.un.sun_path, 0755);
+        (void) sockaddr_un_unlink(&addr.un);
+
+        if (bind(sock, &addr.sa, SOCKADDR_UN_LEN(addr.un)) < 0)
+                return log_error_errno(errno, "Failed to bind to socket: %m");
+
+        r = setsockopt_int(sock, SOL_SOCKET, SO_PASSCRED, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_io(m->event, &m->notify_socket_event_source, sock, EPOLLIN, on_notify_socket, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event source for notify socket: %m");
+
+        (void) sd_event_source_set_description(m->notify_socket_event_source, "notify-socket");
+
+        /* Make sure we process sd_notify() before SIGCHLD for any worker, so that we always know the error
+         * number of a client before it exits. */
+        r = sd_event_source_set_priority(m->notify_socket_event_source, SD_EVENT_PRIORITY_NORMAL - 5);
+        if (r < 0)
+                return log_error_errno(r, "Failed to alter priority of NOTIFY_SOCKET event source: %m");
+
+        r = sd_event_source_set_io_fd_own(m->notify_socket_event_source, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pass ownership of notify socket: %m");
+
+        /* The event source owns the fd from here on, hence disarm our own cleanup */
+        return TAKE_FD(sock);
+}
+
+/* Inspects a block device and, if it is a GPT partition of the user home type with a parsable
+ * "user@realm"-style partition name and a file system UUID, registers a LUKS home for it via
+ * manager_add_home_by_image(). Devices that don't qualify are skipped with a return value of 0; missing
+ * or unparsable properties yield a negative errno-style value. */
+static int manager_add_device(Manager *m, sd_device *d) {
+        _cleanup_free_ char *user_name = NULL, *realm = NULL, *node = NULL;
+        const char *table_type, *entry_type, *entry_name, *fs_uuid, *syspath;
+        sd_id128_t uuid;
+        int r;
+
+        assert(m);
+        assert(d);
+
+        r = sd_device_get_syspath(d, &syspath);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire sysfs path of device: %m");
+
+        /* Only partitions on GPT tables are of interest */
+        r = sd_device_get_property_value(d, "ID_PART_TABLE_TYPE", &table_type);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return 0;
+
+                return log_error_errno(r, "Failed to acquire ID_PART_TABLE_TYPE device property, ignoring: %m");
+        }
+
+        if (!streq(table_type, "gpt")) {
+                log_debug("Found partition (%s) on non-GPT table, ignoring.", syspath);
+                return 0;
+        }
+
+        /* ... and of those only the ones carrying the user home partition type */
+        r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &entry_type);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return 0;
+
+                return log_error_errno(r, "Failed to acquire ID_PART_ENTRY_TYPE device property, ignoring: %m");
+        }
+
+        if (sd_id128_string_equal(entry_type, SD_GPT_USER_HOME) <= 0) {
+                log_debug("Found partition (%s) we don't care about, ignoring.", syspath);
+                return 0;
+        }
+
+        /* The partition name encodes user name and realm */
+        r = sd_device_get_property_value(d, "ID_PART_ENTRY_NAME", &entry_name);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to acquire ID_PART_ENTRY_NAME device property, ignoring: %m");
+
+        r = split_user_name_realm(entry_name, &user_name, &realm);
+        if (r < 0) {
+                if (r == -EINVAL)
+                        return log_warning_errno(r, "Found partition with correct partition type but a non-parsable partition name '%s', ignoring.", entry_name);
+
+                return log_error_errno(r, "Failed to validate partition name '%s': %m", entry_name);
+        }
+
+        /* The image is referenced through its by-uuid symlink */
+        r = sd_device_get_property_value(d, "ID_FS_UUID", &fs_uuid);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to acquire ID_FS_UUID device property, ignoring: %m");
+
+        r = sd_id128_from_string(fs_uuid, &uuid);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse ID_FS_UUID field '%s', ignoring: %m", fs_uuid);
+
+        if (asprintf(&node, "/dev/disk/by-uuid/" SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(uuid)) < 0)
+                return log_oom();
+
+        return manager_add_home_by_image(m, user_name, realm, node, syspath, USER_LUKS, UID_INVALID);
+}
+
+/* Device monitor callback. For removed devices the home synthesized from that device (if any) is
+ * revalidated, otherwise a GC pass over all homes is enqueued. For all other events the device is fed to
+ * manager_add_device(). Always returns 0. */
+static int manager_on_device(sd_device_monitor *monitor, sd_device *d, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(d);
+
+        if (!device_for_action(d, SD_DEVICE_REMOVE))
+                (void) manager_add_device(m, d);
+        else {
+                const char *syspath;
+                Home *home;
+                int r;
+
+                r = sd_device_get_syspath(d, &syspath);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to acquire sysfs path from device: %m");
+                        return 0;
+                }
+
+                log_info("block device %s has been removed.", syspath);
+
+                /* If we know the home that was synthesized from this very device, check just that one.
+                 * If we don't, check all of them, but do so asynchronously. */
+                home = hashmap_get(m->homes_by_sysfs, syspath);
+                if (!home)
+                        manager_enqueue_gc(m, NULL);
+                else
+                        manager_revalidate_image(m, home);
+        }
+
+        (void) bus_manager_emit_auto_login_changed(m);
+        return 0;
+}
+
+/* Allocates the manager's device monitor, restricts it to the "block" subsystem, attaches it to the
+ * event loop and starts it with manager_on_device() as callback. Returns 0 on success, or a negative
+ * errno-style value (after logging) on failure. */
+static int manager_watch_devices(Manager *m) {
+        sd_device_monitor *monitor;
+        int r;
+
+        assert(m);
+        assert(!m->device_monitor);
+
+        r = sd_device_monitor_new(&m->device_monitor);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate device monitor: %m");
+
+        monitor = m->device_monitor;
+
+        r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to configure device monitor match: %m");
+
+        r = sd_device_monitor_attach_event(monitor, m->event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach device monitor to event loop: %m");
+
+        r = sd_device_monitor_start(monitor, manager_on_device, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start device monitor: %m");
+
+        return 0;
+}
+
+/* Enumerates all devices of the "block" subsystem and passes each one to manager_add_device(). Failures
+ * for individual devices are ignored. Returns 0 on success, or a negative errno-style value if the
+ * enumerator cannot be set up; since the callers visible in this file discard the return value, such
+ * failures are logged here so they do not go unnoticed. */
+static int manager_enumerate_devices(Manager *m) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        int r;
+
+        assert(m);
+
+        r = sd_device_enumerator_new(&e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate device enumerator: %m");
+
+        r = sd_device_enumerator_add_match_subsystem(e, "block", true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to configure device enumerator match: %m");
+
+        FOREACH_DEVICE(e, d)
+                (void) manager_add_device(m, d);
+
+        return 0;
+}
+
+/* Loads the local private key ("local.private", searched in KEY_PATHS_NULSTR) into m->private_key,
+ * replacing any key held so far. The file must be a regular file owned by UID 0 with no group/other
+ * permission bits set. Returns 1 if a key was loaded, 0 if no key file exists, or a negative errno-style
+ * value on failure. */
+static int manager_load_key_pair(Manager *m) {
+        _cleanup_fclose_ FILE *key_file = NULL;
+        struct stat info;
+        int r;
+
+        assert(m);
+
+        /* Drop whatever key we hold so far, it's about to be replaced */
+        if (m->private_key) {
+                EVP_PKEY_free(m->private_key);
+                m->private_key = NULL;
+        }
+
+        r = search_and_fopen_nulstr("local.private", "re", NULL, KEY_PATHS_NULSTR, &key_file, NULL);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return 0;
+
+                return log_error_errno(r, "Failed to read private key file: %m");
+        }
+
+        if (fstat(fileno(key_file), &info) < 0)
+                return log_error_errno(errno, "Failed to stat private key file: %m");
+
+        r = stat_verify_regular(&info);
+        if (r < 0)
+                return log_error_errno(r, "Private key file is not regular: %m");
+
+        /* Refuse keys that anyone but root may access */
+        if (!(info.st_uid == 0 && (info.st_mode & 0077) == 0))
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Private key file is readable by more than the root user");
+
+        m->private_key = PEM_read_PrivateKey(key_file, NULL, NULL, NULL);
+        if (!m->private_key)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to load private key pair");
+
+        log_info("Successfully loaded private key pair.");
+
+        return 1;
+}
+
+/* Generates a fresh Ed25519 key pair, stores it in m->private_key (replacing any key held so far) and
+ * writes it out to /var/lib/systemd/home/local.public and /var/lib/systemd/home/local.private. Both
+ * files are first written to temporary files and only renamed into place once both are complete.
+ *
+ * Returns 1 on success, or a negative errno-style value (after logging) on failure. */
+static int manager_generate_key_pair(Manager *m) {
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = NULL;
+        _cleanup_(unlink_and_freep) char *temp_public = NULL, *temp_private = NULL;
+        _cleanup_fclose_ FILE *fpublic = NULL, *fprivate = NULL;
+        int r;
+
+        assert(m);
+
+        if (m->private_key) {
+                EVP_PKEY_free(m->private_key);
+                m->private_key = NULL;
+        }
+
+        ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_ED25519, NULL);
+        if (!ctx)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to allocate Ed25519 key generation context.");
+
+        if (EVP_PKEY_keygen_init(ctx) <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to initialize Ed25519 key generation context.");
+
+        log_info("Generating key pair for signing local user identity records.");
+
+        if (EVP_PKEY_keygen(ctx, &m->private_key) <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to generate Ed25519 key pair");
+
+        log_info("Successfully created Ed25519 key pair.");
+
+        (void) mkdir_p("/var/lib/systemd/home", 0755);
+
+        /* Write out public key (note that we only do that as a help to the user, we don't make use of this ever */
+        r = fopen_temporary("/var/lib/systemd/home/local.public", &fpublic, &temp_public);
+        if (r < 0) /* The error is carried in the return value, errno is not reliable here */
+                return log_error_errno(r, "Failed to open key file for writing: %m");
+
+        if (PEM_write_PUBKEY(fpublic, m->private_key) <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to write public key.");
+
+        r = fflush_sync_and_check(fpublic);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write public key: %m");
+
+        fpublic = safe_fclose(fpublic);
+
+        /* Write out the private key (this actually writes out both private and public, OpenSSL is confusing) */
+        r = fopen_temporary("/var/lib/systemd/home/local.private", &fprivate, &temp_private);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open key file for writing: %m");
+
+        if (PEM_write_PrivateKey(fprivate, m->private_key, NULL, NULL, 0, NULL, 0) <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to write private key pair.");
+
+        r = fflush_sync_and_check(fprivate);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write private key: %m");
+
+        fprivate = safe_fclose(fprivate);
+
+        /* Both are written now, move them into place */
+
+        if (rename(temp_public, "/var/lib/systemd/home/local.public") < 0)
+                return log_error_errno(errno, "Failed to move public key file into place: %m");
+        temp_public = mfree(temp_public); /* renamed away, nothing left for the cleanup handler to unlink */
+
+        r = RET_NERRNO(rename(temp_private, "/var/lib/systemd/home/local.private"));
+        if (r < 0) {
+                (void) unlink("/var/lib/systemd/home/local.public"); /* try to remove the file we already created */
+                return log_error_errno(r, "Failed to move private key file into place: %m");
+        }
+        temp_private = mfree(temp_private);
+
+        r = fsync_path_at(AT_FDCWD, "/var/lib/systemd/home/");
+        if (r < 0)
+                log_warning_errno(r, "Failed to sync /var/lib/systemd/home/, ignoring: %m");
+
+        return 1;
+}
+
+/* Makes sure m->private_key is populated: uses the key already in memory if there is one, otherwise
+ * tries to load one from disk, and if none exists there generates a new one. Returns 1 if a key is
+ * available, or a negative errno-style value on failure. */
+int manager_acquire_key_pair(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (!m->private_key) {
+                /* Nothing in memory yet, see if there's a key on disk */
+                r = manager_load_key_pair(m);
+
+                /* Zero means there was no key file, in which case we create a fresh key. Anything else
+                 * (success or failure) is passed on as is. */
+                return r != 0 ? r : manager_generate_key_pair(m);
+        }
+
+        return 1;
+}
+
+/* Signs user record 'u' with the local private key (acquiring it first if necessary) and returns the
+ * result of user_record_sign() with the signed record in *ret. If no key is available, 'error' is set
+ * to BUS_ERROR_NO_PRIVATE_KEY. */
+int manager_sign_user_record(Manager *m, UserRecord *u, UserRecord **ret, sd_bus_error *error) {
+        int r;
+
+        assert(m);
+        assert(u);
+        assert(ret);
+
+        r = manager_acquire_key_pair(m);
+        if (r > 0)
+                return user_record_sign(u, m->private_key, ret);
+        if (r < 0)
+                return r;
+
+        return sd_bus_error_set(error, BUS_ERROR_NO_PRIVATE_KEY, "Can't sign without local key.");
+}
+
+/* Hash operations used for the m->public_keys hashmap: keys are strings (hashed and compared with
+ * string_hash_func/string_compare_func), values are EVP_PKEY objects. free() and EVP_PKEY_free() are
+ * passed as the key and value destructors respectively, so presumably the hashmap takes ownership of
+ * both once an entry is inserted. */
+DEFINE_PRIVATE_HASH_OPS_FULL(public_key_hash_ops, char, string_hash_func, string_compare_func, free, EVP_PKEY, EVP_PKEY_free);
+
+/* Loads a single PEM public key file and adds it to m->public_keys, keyed by the file name. The file
+ * "local.public" is skipped, and so are files that do not exist. The file must be a regular file owned
+ * by UID 0 and not writable by group or others. Returns 0 on success (or skip), or a negative
+ * errno-style value on failure. */
+static int manager_load_public_key_one(Manager *m, const char *path) {
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *key = NULL;
+        _cleanup_fclose_ FILE *key_file = NULL;
+        _cleanup_free_ char *name = NULL;
+        struct stat info;
+        int r;
+
+        assert(m);
+
+        r = path_extract_filename(path, &name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename of path '%s': %m", path);
+
+        /* The local public key is part of the private key file we load separately, no need to load it
+         * a second time. */
+        if (streq(name, "local.public"))
+                return 0;
+
+        key_file = fopen(path, "re");
+        if (!key_file)
+                return errno == ENOENT ? 0 :
+                        log_error_errno(errno, "Failed to open public key %s: %m", path);
+
+        if (fstat(fileno(key_file), &info) < 0)
+                return log_error_errno(errno, "Failed to stat public key %s: %m", path);
+
+        r = stat_verify_regular(&info);
+        if (r < 0)
+                return log_error_errno(r, "Public key file %s is not a regular file: %m", path);
+
+        /* Only trust keys nobody but root may modify */
+        if (!(info.st_uid == 0 && (info.st_mode & 0022) == 0))
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Public key file %s is writable by more than the root user, refusing.", path);
+
+        r = hashmap_ensure_allocated(&m->public_keys, &public_key_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        key = PEM_read_PUBKEY(key_file, &key, NULL, NULL);
+        if (!key)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse public key file %s.", path);
+
+        r = hashmap_put(m->public_keys, name, key);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add public key to set: %m");
+
+        /* Both key and name are referenced by the hashmap now, disarm our cleanup handlers. */
+        TAKE_PTR(name);
+        TAKE_PTR(key);
+
+        return 0;
+}
+
+/* Discards all public keys loaded so far, then loads every "*.public" file found in the directories
+ * listed in KEY_PATHS_NULSTR. Failures to load individual files are ignored. Returns 0 on success, or a
+ * negative errno-style value if the file list cannot be assembled. */
+static int manager_load_public_keys(Manager *m) {
+        _cleanup_strv_free_ char **key_files = NULL;
+        int r;
+
+        assert(m);
+
+        m->public_keys = hashmap_free(m->public_keys);
+
+        r = conf_files_list_nulstr(&key_files, ".public", NULL, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, KEY_PATHS_NULSTR);
+        if (r < 0)
+                return log_error_errno(r, "Failed to assemble list of public key directories: %m");
+
+        STRV_FOREACH(key_path, key_files)
+                (void) manager_load_public_key_one(m, *key_path);
+
+        return 0;
+}
+
+/* Brings the manager up. The first group of steps (notify socket, bus, Varlink, keys) is mandatory: the
+ * first failure among them aborts startup and is returned. The second group (watches and enumeration)
+ * is best effort, failures there are ignored. Returns 0 on success. */
+int manager_startup(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Mandatory steps: each one only runs if all previous ones succeeded */
+        r = manager_listen_notify(m);
+        if (r >= 0)
+                r = manager_connect_bus(m);
+        if (r >= 0)
+                r = manager_bind_varlink(m);
+        if (r >= 0)
+                r = manager_load_key_pair(m); /* only try to load it, don't generate any */
+        if (r >= 0)
+                r = manager_load_public_keys(m);
+        if (r < 0)
+                return r;
+
+        /* Best-effort steps */
+        manager_watch_home(m);
+        (void) manager_watch_devices(m);
+
+        (void) manager_enumerate_records(m);
+        (void) manager_enumerate_images(m);
+        (void) manager_enumerate_devices(m);
+
+        /* Let's clean up home directories whose devices got removed while we were not running */
+        (void) manager_enqueue_gc(m, NULL);
+
+        return 0;
+}
+
+/* Checks whether the image backing home 'h' is still around. An unfixated home whose image is gone (or
+ * whose image state cannot be determined) is freed. A home in a negative (i.e. not explicitly set) state
+ * whose home directory is mounted but whose image vanished gets its processes killed and a forced
+ * deactivation scheduled. Homes with a current or pending operation are left alone. */
+void manager_revalidate_image(Manager *m, Home *h) {
+        int r;
+
+        assert(m);
+        assert(h);
+
+        /* Don't interfere with homes that are busy */
+        if (h->current_operation || !ordered_set_isempty(h->pending_operations))
+                return;
+
+        if (h->state == HOME_UNFIXATED) {
+                r = user_record_test_image_path(h->record);
+                if (r >= 0 && r != USER_TEST_ABSENT)
+                        return; /* Image is still there, nothing to do */
+
+                if (r < 0)
+                        log_warning_errno(r, "Can't determine if image of %s exists, freeing unfixated user: %m", h->user_name);
+                else
+                        log_info("Image for %s disappeared, freeing unfixated user.", h->user_name);
+
+                home_free(h);
+                return;
+        }
+
+        if (h->state >= 0)
+                return;
+
+        r = user_record_test_home_directory(h->record);
+        if (r < 0) {
+                log_warning_errno(r, "Unable to determine state of home directory, ignoring: %m");
+                return;
+        }
+        if (r != USER_TEST_MOUNTED)
+                return;
+
+        r = user_record_test_image_path(h->record);
+        if (r < 0) {
+                log_warning_errno(r, "Unable to determine state of image path, ignoring: %m");
+                return;
+        }
+        if (r != USER_TEST_ABSENT)
+                return;
+
+        /* Wowza, the thing is mounted, but the device is gone? Act on it. */
+        _cleanup_(operation_unrefp) Operation *o = NULL;
+
+        log_notice("Backing image disappeared while home directory %s was mounted, unmounting it forcibly.", h->user_name);
+
+        r = home_killall(h);
+        if (r < 0)
+                log_warning_errno(r, "Failed to kill processes of user %s, ignoring: %m", h->user_name);
+
+        /* The deactivation is enqueued rather than executed right away: the home directory might
+         * currently already run some operation, and we can deactivate it only after that's complete. */
+        o = operation_new(OPERATION_DEACTIVATE_FORCE, NULL);
+        if (!o) {
+                log_oom();
+                return;
+        }
+
+        r = home_schedule_operation(h, o, NULL);
+        if (r < 0)
+                log_warning_errno(r, "Failed to enqueue forced home directory %s deactivation, ignoring: %m", h->user_name);
+}
+
+/* Revalidates the images of either the single home stored in m->gc_focus (which is reset in the
+ * process), or, if no focus is set, of all known homes. Always returns 0. */
+int manager_gc_images(Manager *m) {
+        Home *home;
+
+        assert_se(m);
+
+        if (!m->gc_focus) {
+                /* No focus, look at everything */
+                HASHMAP_FOREACH(home, m->homes_by_name)
+                        manager_revalidate_image(m, home);
+
+                return 0;
+        }
+
+        /* Look at one specific home only */
+        home = TAKE_PTR(m->gc_focus);
+        manager_revalidate_image(m, home);
+
+        return 0;
+}
+
+/* Deferred event callback: releases the one-shot rescan event source and re-enumerates devices and
+ * images. Results of the enumeration are not checked. Always returns 0. */
+static int on_deferred_rescan(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        /* This is a one-shot source, get rid of it so that a new rescan may be enqueued */
+        m->deferred_rescan_event_source = sd_event_source_disable_unref(m->deferred_rescan_event_source);
+
+        (void) manager_enumerate_devices(m);
+        (void) manager_enumerate_images(m);
+
+        return 0;
+}
+
+/* Schedules a deferred rescan of devices and images. Returns 1 if a rescan was newly enqueued, 0 if
+ * nothing was done (one is already queued, there is no event loop, or the loop is exiting/finished), or
+ * a negative errno-style value if the event source cannot be allocated. */
+int manager_enqueue_rescan(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Already queued, or no event loop to queue on? */
+        if (m->deferred_rescan_event_source || !m->event)
+                return 0;
+
+        /* Don't bother if the event loop is going down anyway */
+        if (IN_SET(sd_event_get_state(m->event), SD_EVENT_FINISHED, SD_EVENT_EXITING))
+                return 0;
+
+        r = sd_event_add_defer(m->event, &m->deferred_rescan_event_source, on_deferred_rescan, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate rescan event source: %m");
+
+        r = sd_event_source_set_priority(m->deferred_rescan_event_source, SD_EVENT_PRIORITY_IDLE+1);
+        if (r < 0)
+                log_warning_errno(r, "Failed to tweak priority of event source, ignoring: %m");
+
+        (void) sd_event_source_set_description(m->deferred_rescan_event_source, "deferred-rescan");
+
+        return 1;
+}
+
+/* Deferred event callback: releases the one-shot GC event source and runs manager_gc_images(), whose
+ * result is not checked. Always returns 0. */
+static int on_deferred_gc(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        /* This is a one-shot source, get rid of it so that a new GC run may be enqueued */
+        m->deferred_gc_event_source = sd_event_source_disable_unref(m->deferred_gc_event_source);
+
+        (void) manager_gc_images(m);
+
+        return 0;
+}
+
+/* Enqueues a deferred request to GC dead homes. With focus=NULL all homes will be scanned, otherwise
+ * only the specified one. If a request is already queued and its focus differs from the new one, the
+ * queued request is widened to cover all homes.
+ *
+ * Returns 1 if a request was newly enqueued, 0 if nothing was enqueued (no event loop, loop is
+ * exiting/finished, or a request was already pending), or a negative errno-style value if the event
+ * source cannot be allocated. */
+int manager_enqueue_gc(Manager *m, Home *focus) {
+        int r;
+
+        assert(m);
+
+        if (!m->event)
+                return 0;
+
+        if (IN_SET(sd_event_get_state(m->event), SD_EVENT_FINISHED, SD_EVENT_EXITING))
+                return 0;
+
+        if (m->deferred_gc_event_source) {
+                /* Something is queued already. If it's for a different focus than ours, drop the focus
+                 * so that everything is looked at. */
+                if (m->gc_focus != focus)
+                        m->gc_focus = NULL;
+
+                return 0;
+        }
+
+        /* Nothing queued so far: start out focused on what we have been asked for */
+        m->gc_focus = focus;
+
+        r = sd_event_add_defer(m->event, &m->deferred_gc_event_source, on_deferred_gc, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate GC event source: %m");
+
+        r = sd_event_source_set_priority(m->deferred_gc_event_source, SD_EVENT_PRIORITY_IDLE);
+        if (r < 0)
+                log_warning_errno(r, "Failed to tweak priority of event source, ignoring: %m");
+
+        (void) sd_event_source_set_description(m->deferred_gc_event_source, "deferred-gc");
+
+        return 1;
+}
+
+/* Returns true if rebalancing is pending or in progress (shrinking or growing), or if at least one home
+ * reports via home_shall_rebalance() that it shall be rebalanced. */
+static bool manager_shall_rebalance(Manager *m) {
+        Home *home;
+
+        assert(m);
+
+        switch (m->rebalance_state) {
+
+        case REBALANCE_PENDING:
+        case REBALANCE_SHRINKING:
+        case REBALANCE_GROWING:
+                return true;
+
+        default:
+                break;
+        }
+
+        HASHMAP_FOREACH(home, m->homes_by_name) {
+                if (!home_shall_rebalance(home))
+                        continue;
+
+                return true;
+        }
+
+        return false;
+}
+
+/* Comparison function for sorting an array of Home pointers: primary key is the rebalance weight of the
+ * record (ascending), secondary key is the user name (to make the order stable). Records with the
+ * highest weight come last, since space is distributed from the beginning and rounded down, hence later
+ * entries tend to get slightly more than earlier entries. */
+static int home_cmp(Home *const*a, Home *const*b) {
+        Home *x, *y;
+        int r;
+
+        assert(a);
+        assert(*a);
+        assert(b);
+        assert(*b);
+
+        x = *a;
+        y = *b;
+
+        r = CMP(user_record_rebalance_weight(x->record), user_record_rebalance_weight(y->record));
+        if (r == 0)
+                r = strcmp(x->user_name, y->user_name);
+
+        return r;
+}
+
+/* Calculates new target sizes for all homes that take part in rebalancing.
+ *
+ * For each candidate home the current size/usage/free space is queried, then the total free space (that
+ * of the backing file system plus that of all candidates) is distributed among the candidates in
+ * proportion to their rebalance weights. The result is stored in each home's rebalance_goal, and
+ * rebalance_pending is set for those homes whose change matches the current phase (shrinking/growing).
+ * As a side effect m->rebalance_interval_usec is updated, based on the smallest amount of free space
+ * assigned to any home.
+ *
+ * Returns the number of candidate homes if at least one of them would change in size by 5% or more, 0
+ * if there are no candidates or all changes are below that threshold, -EBUSY if a candidate home is
+ * busy, or another negative errno-style value on failure. */
+static int manager_rebalance_calculate(Manager *m) {
+        uint64_t weight_sum, free_sum, usage_sum = 0, min_free = UINT64_MAX;
+        _cleanup_free_ Home **array = NULL;
+        bool relevant = false;
+        struct statfs sfs;
+        int c = 0, r;
+        Home *h;
+
+        assert(m);
+
+        if (statfs(get_home_root(), &sfs) < 0)
+                return log_error_errno(errno, "Failed to statfs() /home: %m");
+
+        free_sum = (uint64_t) sfs.f_bsize * sfs.f_bavail; /* This much free space is available on the
+                                                           * underlying pool directory */
+
+        weight_sum = REBALANCE_WEIGHT_BACKING; /* Grant the underlying pool directory a fixed weight of 20
+                                                * (home dirs get 100 by default, i.e. 5x more). This weight
+                                                * is not configurable, the per-home weights are. */
+
+        HASHMAP_FOREACH(h, m->homes_by_name) {
+                statfs_f_type_t fstype;
+                h->rebalance_pending = false; /* First, reset the flag, we only want it to be true for the
+                                               * homes that qualify for rebalancing */
+
+                if (!home_shall_rebalance(h)) /* Only look at actual candidates */
+                        continue;
+
+                if (home_is_busy(h))
+                        return -EBUSY; /* Let's not rebalance if there's a busy home directory. */
+
+                r = home_get_disk_status(
+                                h,
+                                &h->rebalance_size,
+                                &h->rebalance_usage,
+                                &h->rebalance_free,
+                                NULL,
+                                NULL,
+                                &fstype,
+                                NULL);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to get free space of home '%s', ignoring: %m", h->user_name);
+                        continue;
+                }
+
+                /* Accumulate the sums, guarding each addition against overflow */
+                if (h->rebalance_free > UINT64_MAX - free_sum)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "Rebalance free overflow");
+                free_sum += h->rebalance_free;
+
+                if (h->rebalance_usage > UINT64_MAX - usage_sum)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "Rebalance usage overflow");
+                usage_sum += h->rebalance_usage;
+
+                h->rebalance_weight = user_record_rebalance_weight(h->record);
+                if (h->rebalance_weight > UINT64_MAX - weight_sum)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "Rebalance weight overflow");
+                weight_sum += h->rebalance_weight;
+
+                h->rebalance_min = minimal_size_by_fs_magic(fstype);
+
+                if (!GREEDY_REALLOC(array, c+1))
+                        return log_oom();
+
+                array[c++] = h;
+        }
+
+        if (c == 0) {
+                log_debug("No homes to rebalance.");
+                return 0;
+        }
+
+        assert(weight_sum > 0);
+
+        log_debug("Disk space usage by all home directories to rebalance: %s — available disk space: %s",
+                  FORMAT_BYTES(usage_sum), FORMAT_BYTES(free_sum));
+
+        /* Bring the home directories in a well-defined order, so that we distribute space in a reproducible
+         * way for the same parameters. */
+        typesafe_qsort(array, c, home_cmp);
+
+        for (int i = 0; i < c; i++) {
+                uint64_t new_free;
+                double d;
+
+                h = array[i];
+
+                assert(h->rebalance_free <= free_sum);
+                assert(h->rebalance_usage <= usage_sum);
+                assert(h->rebalance_weight <= weight_sum);
+
+                d = ((double) (free_sum / 4096) * (double) h->rebalance_weight) / (double) weight_sum; /* Calculate new space for this home in units of 4K */
+
+                /* Convert from units of 4K back to bytes */
+                if (d >= (double) (UINT64_MAX/4096))
+                        new_free = UINT64_MAX;
+                else
+                        new_free = (uint64_t) d * 4096;
+
+                /* Subtract the weight and assigned space from the sums now, to distribute the rounding noise
+                 * to the remaining home dirs */
+                free_sum = LESS_BY(free_sum, new_free);
+                weight_sum = LESS_BY(weight_sum, h->rebalance_weight);
+
+                /* Keep track of home directory with the least amount of space left: we want to schedule the
+                 * next rebalance more quickly if this is low. Note that this tracks the free space we
+                 * assign, not the total size of the home. */
+                if (new_free < min_free)
+                        min_free = new_free;
+
+                if (new_free > UINT64_MAX - h->rebalance_usage)
+                        h->rebalance_goal = UINT64_MAX-1; /* maximum size */
+                else {
+                        h->rebalance_goal = h->rebalance_usage + new_free;
+
+                        /* Never go below the minimal size for the file system type, if there is one */
+                        if (h->rebalance_min != UINT64_MAX && h->rebalance_goal < h->rebalance_min)
+                                h->rebalance_goal = h->rebalance_min;
+                }
+
+                /* Skip over this home if the state doesn't match the operation */
+                if ((m->rebalance_state == REBALANCE_SHRINKING && h->rebalance_goal > h->rebalance_size) ||
+                    (m->rebalance_state == REBALANCE_GROWING && h->rebalance_goal < h->rebalance_size))
+                        h->rebalance_pending = false;
+                else {
+                        log_debug("Rebalancing home directory '%s' %s %s %s.", h->user_name,
+                                  FORMAT_BYTES(h->rebalance_size),
+                                  special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                                  FORMAT_BYTES(h->rebalance_goal));
+                        h->rebalance_pending = true;
+                }
+
+                /* A change of 5% or more of the current size makes the whole run worth it */
+                if ((fabs((double) h->rebalance_size - (double) h->rebalance_goal) * 100 / (double) h->rebalance_size) >= 5.0)
+                        relevant = true;
+        }
+
+        /* Scale next rebalancing interval based on the least amount of space of any of the home
+         * directories. We pick a time in the range 1min … 15min, scaled by log2(min_free), so that:
+         * 10M → ~0.7min, 100M → ~2.7min, 1G → ~4.6min, 10G → ~6.5min, 100G ~8.4 */
+        m->rebalance_interval_usec = (usec_t) CLAMP((LESS_BY(log2(min_free), 22)*15*USEC_PER_MINUTE)/26,
+                                                    1 * USEC_PER_MINUTE,
+                                                    15 * USEC_PER_MINUTE);
+
+
+        log_debug("Rebalancing interval set to %s.", FORMAT_TIMESPAN(m->rebalance_interval_usec, USEC_PER_MSEC));
+
+        /* Let's suppress small resizes, growing/shrinking file systems isn't free after all */
+        if (!relevant) {
+                log_debug("Skipping rebalancing, since all calculated size changes are below ±5%%.");
+                return 0;
+        }
+
+        return c;
+}
+
+/* Issues a resize to rebalance_goal for every home marked rebalance_pending, clearing the mark as it
+ * goes. Failures are logged and otherwise ignored. Returns the number of homes for which home_resize()
+ * succeeded. */
+static int manager_rebalance_apply(Manager *m) {
+        int n_resized = 0;
+        Home *home;
+
+        assert(m);
+
+        HASHMAP_FOREACH(home, m->homes_by_name) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                int r;
+
+                if (!home->rebalance_pending)
+                        continue;
+
+                home->rebalance_pending = false;
+
+                r = home_resize(home, home->rebalance_goal, /* secret= */ NULL, /* automatic= */ true, &error);
+                if (r >= 0) {
+                        n_resized++;
+                        continue;
+                }
+
+                log_warning_errno(r, "Failed to resize home '%s' for rebalancing, ignoring: %s",
+                                  home->user_name, bus_error_message(&error, r));
+        }
+
+        return n_resized;
+}
+
+/* Sends an empty method reply to every message in m->rebalance_pending_method_calls, removing each from
+ * the set and dropping our reference to it. Failures to reply are logged at debug level and ignored. */
+static void manager_rebalance_reply_messages(Manager *m) {
+        sd_bus_message *call;
+
+        assert(m);
+
+        while ((call = set_steal_first(m->rebalance_pending_method_calls))) {
+                int r;
+
+                r = sd_bus_reply_method_return(call, NULL);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reply to rebalance method call, ignoring: %m");
+
+                /* We took the message out of the set, hence it's on us to release it */
+                sd_bus_message_unref(call);
+        }
+}
+
+static int manager_rebalance_now(Manager *m) {
+        RebalanceState busy_state; /* the state to revert to when operation fails if busy */
+        int r;
+
+        assert(m);
+
+        log_debug("Rebalancing now...");
+
+        /* We maintain a simple state engine here to keep track of what we are doing. We'll first shrink all
+         * homes that shall be shrunk and then grow all homes that shall be grown, so that they can take up
+         * the space now freed. Each loop iteration advances the state by one phase. */
+
+        for (;;) {
+                switch (m->rebalance_state) {
+
+                case REBALANCE_IDLE:
+                case REBALANCE_PENDING:
+                case REBALANCE_WAITING:
+                        /* First shrink large home dirs */
+                        m->rebalance_state = REBALANCE_SHRINKING;
+                        busy_state = REBALANCE_PENDING;
+
+                        /* We are initiating the next rebalancing cycle now, let's make the queued methods
+                         * calls the pending ones, and flush out any pending ones (which shouldn't exist at
+                         * this time anyway) */
+                        set_clear(m->rebalance_pending_method_calls);
+                        SWAP_TWO(m->rebalance_pending_method_calls, m->rebalance_queued_method_calls);
+
+                        log_debug("Shrinking phase..");
+                        break;
+
+                case REBALANCE_SHRINKING:
+                        /* Then grow small home dirs */
+                        m->rebalance_state = REBALANCE_GROWING;
+                        busy_state = REBALANCE_SHRINKING;
+                        log_debug("Growing phase..");
+                        break;
+
+                case REBALANCE_GROWING:
+                        /* Finally, we are done */
+                        log_info("Rebalancing complete.");
+                        m->rebalance_state = REBALANCE_IDLE;
+                        r = 0;
+                        goto finish;
+
+                case REBALANCE_OFF:
+                default:
+                        assert_not_reached();
+                }
+
+                r = manager_rebalance_calculate(m);
+                if (r == -EBUSY) {
+                        /* Calculations failed because one home directory is currently busy. Revert to a state that
+                         * tells us what to do next. */
+                        log_debug("Can't enter phase, busy.");
+                        m->rebalance_state = busy_state;
+                        return r; /* note: pending method calls are not replied to on this path */
+                }
+                if (r < 0)
+                        goto finish;
+                if (r == 0)
+                        continue; /* go to next step immediately, if there's nothing to do */
+
+                r = manager_rebalance_apply(m);
+                if (r < 0)
+                        goto finish;
+                if (r > 0)
+                        break; /* At least one resize operation is now pending, we are done for now */
+
+                /* If there was nothing to apply, go for next state right-away */
+        }
+
+        return 0; /* reached via the 'break' above: state remains SHRINKING or GROWING */
+
+finish:
+        /* Reset state and schedule next rebalance */
+        m->rebalance_state = REBALANCE_IDLE;
+        manager_rebalance_reply_messages(m);
+        (void) manager_schedule_rebalance(m, /* immediately= */ false);
+        return r; /* 0 if the cycle completed, negative errno if it was aborted */
+}
+
+static int on_rebalance_timer(sd_event_source *s, usec_t t, void *userdata) { /* timer callback installed by manager_schedule_rebalance() */
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+        assert(IN_SET(m->rebalance_state, REBALANCE_WAITING, REBALANCE_PENDING, REBALANCE_SHRINKING, REBALANCE_GROWING)); /* must not fire while OFF or IDLE */
+
+        (void) manager_rebalance_now(m); /* result deliberately ignored, we always report success to the caller */
+        return 0;
+}
+
+int manager_schedule_rebalance(Manager *m, bool immediately) {
+        int r;
+
+        assert(m);
+
+        /* Check if there are any records where rebalancing is requested, otherwise turn rebalancing off */
+        if (!manager_shall_rebalance(m)) {
+                log_debug("Not scheduling rebalancing, not needed.");
+                r = 0; /* report that we didn't schedule anything because nothing needed it */
+                goto turn_off;
+        }
+
+        if (immediately) {
+                /* If we are told to rebalance immediately, then mark a rebalance as pending (even if we are
+                 * already running one) */
+
+                if (m->rebalance_event_source) {
+                        r = sd_event_source_set_time(m->rebalance_event_source, 0);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to schedule immediate rebalancing: %m");
+                                goto turn_off;
+                        }
+
+                        r = sd_event_source_set_enabled(m->rebalance_event_source, SD_EVENT_ONESHOT);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to enable rebalancing event source: %m");
+                                goto turn_off;
+                        }
+                } else {
+                        r = sd_event_add_time(m->event, &m->rebalance_event_source, CLOCK_MONOTONIC, 0, USEC_PER_SEC, on_rebalance_timer, m);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to allocate rebalance event source: %m");
+                                goto turn_off;
+                        }
+
+                        r = sd_event_source_set_priority(m->rebalance_event_source, SD_EVENT_PRIORITY_IDLE + 10);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to set rebalance event source priority: %m");
+                                goto turn_off;
+                        }
+
+                        (void) sd_event_source_set_description(m->rebalance_event_source, "rebalance");
+
+                }
+
+                if (!IN_SET(m->rebalance_state, REBALANCE_PENDING, REBALANCE_SHRINKING, REBALANCE_GROWING))
+                        m->rebalance_state = REBALANCE_PENDING;
+
+                log_debug("Scheduled immediate rebalancing...");
+                return 1; /* report that we scheduled something */
+        }
+
+        /* If we are told to schedule a rebalancing eventually, then do so only if we are not executing
+         * anything yet. Also if we have something scheduled already, leave it in place */
+        if (!IN_SET(m->rebalance_state, REBALANCE_OFF, REBALANCE_IDLE))
+                return 1; /* report that there's already something scheduled */
+
+        if (m->rebalance_event_source) {
+                r = sd_event_source_set_time_relative(m->rebalance_event_source, m->rebalance_interval_usec);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to schedule rebalancing: %m"); /* deferred path, hence not "immediate" */
+                        goto turn_off;
+                }
+
+                r = sd_event_source_set_enabled(m->rebalance_event_source, SD_EVENT_ONESHOT);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable rebalancing event source: %m");
+                        goto turn_off;
+                }
+        } else {
+                r = sd_event_add_time_relative(m->event, &m->rebalance_event_source, CLOCK_MONOTONIC, m->rebalance_interval_usec, USEC_PER_SEC, on_rebalance_timer, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to allocate rebalance event source: %m");
+                        goto turn_off;
+                }
+
+                r = sd_event_source_set_priority(m->rebalance_event_source, SD_EVENT_PRIORITY_IDLE + 10);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set rebalance event source priority: %m");
+                        goto turn_off;
+                }
+
+                (void) sd_event_source_set_description(m->rebalance_event_source, "rebalance");
+        }
+
+        m->rebalance_state = REBALANCE_WAITING; /* We managed to enqueue a timer event, we now wait until it fires */
+        log_debug("Scheduled rebalancing in %s...", FORMAT_TIMESPAN(m->rebalance_interval_usec, 0));
+        return 1; /* report that we scheduled something */
+
+turn_off:
+        /* Drop the timer, mark rebalancing as off and answer all pending method calls; r is 0 or an error */
+        m->rebalance_event_source = sd_event_source_disable_unref(m->rebalance_event_source);
+        m->rebalance_state = REBALANCE_OFF;
+        manager_rebalance_reply_messages(m);
+        return r;
+}
+
+int manager_reschedule_rebalance(Manager *m) {
+        int q;
+
+        assert(m);
+
+        /* Re-arms a rebalance that is already pending or in progress so that it runs right away. Returns 0
+         * if there was nothing to re-arm, 1 if rescheduling did not fail, negative errno otherwise. */
+        if (IN_SET(m->rebalance_state, REBALANCE_PENDING, REBALANCE_SHRINKING, REBALANCE_GROWING)) {
+                q = manager_schedule_rebalance(m, /* immediately= */ true);
+                if (q < 0)
+                        return q;
+
+                return 1;
+        }
+        return 0;
+}
diff --git a/src/home/homed-manager.h b/src/home/homed-manager.h
new file mode 100644
index 0000000..20bbb4c
--- /dev/null
+++ b/src/home/homed-manager.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "sd-bus.h"
+#include "sd-device.h"
+#include "sd-event.h"
+
+typedef struct Manager Manager;
+
+#include "hashmap.h"
+#include "homed-home.h"
+#include "varlink.h"
+
+/* The LUKS free disk space rebalancing logic goes through this state machine */
+typedef enum RebalanceState {
+        REBALANCE_OFF,       /* No rebalancing enabled */
+        REBALANCE_IDLE,      /* Rebalancing enabled, but currently nothing scheduled */
+        REBALANCE_WAITING,   /* Rebalancing has been requested for a later point in time */
+        REBALANCE_PENDING,   /* Rebalancing has been requested and will be executed ASAP */
+        REBALANCE_SHRINKING, /* Rebalancing ongoing, and we are running all shrinking operations */
+        REBALANCE_GROWING,   /* Rebalancing ongoing, and we are running all growing operations */
+        _REBALANCE_STATE_MAX,
+        _REBALANCE_STATE_INVALID = -1,
+} RebalanceState;
+
+struct Manager {
+        sd_event *event;
+        sd_bus *bus;
+
+        Hashmap *polkit_registry;
+
+        Hashmap *homes_by_uid;
+        Hashmap *homes_by_name;
+        Hashmap *homes_by_worker_pid;
+        Hashmap *homes_by_sysfs;
+
+        bool scan_slash_home;
+        UserStorage default_storage;
+        char *default_file_system_type;
+
+        sd_event_source *inotify_event_source;
+
+        /* An event source we receive sd_notify() messages from our worker from */
+        sd_event_source *notify_socket_event_source;
+
+        sd_device_monitor *device_monitor;
+
+        sd_event_source *deferred_rescan_event_source;
+        sd_event_source *deferred_gc_event_source;
+        sd_event_source *deferred_auto_login_event_source;
+
+        sd_event_source *rebalance_event_source;
+
+        Home *gc_focus;
+
+        VarlinkServer *varlink_server;
+        char *userdb_service;
+
+        EVP_PKEY *private_key; /* actually a pair of private and public key */
+        Hashmap *public_keys; /* key name [char*] → public key [EVP_PKEY*] */
+
+        RebalanceState rebalance_state;
+        usec_t rebalance_interval_usec;
+
+        /* In order to allow synchronous rebalance requests via bus calls we maintain two pools of bus
+         * messages: 'rebalance_pending_method_calls' are the method calls we are currently operating on and
+         * running a rebalancing operation for. 'rebalance_queued_method_calls' are the method calls that
+         * have been queued since then and that we'll operate on once we complete the current run. */
+        Set *rebalance_pending_method_calls, *rebalance_queued_method_calls;
+};
+
+int manager_new(Manager **ret);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_startup(Manager *m);
+
+int manager_augment_record_with_uid(Manager *m, UserRecord *hr);
+
+int manager_enqueue_rescan(Manager *m);
+int manager_enqueue_gc(Manager *m, Home *focus);
+
+int manager_schedule_rebalance(Manager *m, bool immediately);
+int manager_reschedule_rebalance(Manager *m);
+
+int manager_verify_user_record(Manager *m, UserRecord *hr);
+
+int manager_acquire_key_pair(Manager *m);
+int manager_sign_user_record(Manager *m, UserRecord *u, UserRecord **ret, sd_bus_error *error);
+
+int bus_manager_emit_auto_login_changed(Manager *m);
diff --git a/src/home/homed-operation.c b/src/home/homed-operation.c
new file mode 100644
index 0000000..618e920
--- /dev/null
+++ b/src/home/homed-operation.c
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "fd-util.h"
+#include "homed-operation.h"
+
+Operation *operation_new(OperationType type, sd_bus_message *m) {
+        Operation *op;
+
+        assert(type >= 0);
+        assert(type < _OPERATION_MAX);
+
+        /* Allocates an operation holding one reference, plus its own reference on 'm'. NULL if allocation fails. */
+        op = new(Operation, 1);
+        if (!op)
+                return NULL;
+
+        *op = (Operation) {
+                .n_ref = 1,
+                .type = type,
+                .message = sd_bus_message_ref(m),
+                .send_fd = -EBADF, /* no fd attached yet */
+                .result = -1,      /* not completed yet */
+        };
+        return op;
+}
+
+static Operation *operation_free(Operation *o) {
+        int q;
+
+        if (!o)
+                return NULL;
+
+        /* Only reply if there is a message to reply to and a result was recorded (result < 0: none yet) */
+        if (o->message && o->result >= 0) {
+                if (o->result == 0) {
+                        /* Propagate failure, preferring the recorded bus error over the plain errno */
+                        if (sd_bus_error_is_set(&o->error))
+                                q = sd_bus_reply_method_error(o->message, &o->error);
+                        else
+                                q = sd_bus_reply_method_errnof(o->message, o->ret, "Failed to execute operation: %m");
+                } else {
+                        /* Propagate success, passing the fd along if we have one */
+                        if (o->send_fd >= 0)
+                                q = sd_bus_reply_method_return(o->message, "h", o->send_fd);
+                        else
+                                q = sd_bus_reply_method_return(o->message, NULL);
+                }
+
+                if (q < 0)
+                        log_warning_errno(q, "Failed to reply to %s method call, ignoring: %m", sd_bus_message_get_member(o->message));
+        }
+
+        sd_bus_message_unref(o->message);
+        user_record_unref(o->secret);
+        safe_close(o->send_fd);
+        sd_bus_error_free(&o->error);
+
+        return mfree(o);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(Operation, operation, operation_free);
+
+void operation_result(Operation *o, int ret, const sd_bus_error *error) {
+        assert(o);
+
+        /* Records the outcome: ret >= 0 marks success, otherwise the return value and bus error are stored */
+        if (ret < 0) {
+                o->ret = ret;
+                sd_bus_error_free(&o->error);
+                (void) sd_bus_error_copy(&o->error, error);
+
+                o->result = false;
+                return;
+        }
+        o->result = true;
+}
diff --git a/src/home/homed-operation.h b/src/home/homed-operation.h
new file mode 100644
index 0000000..004246a
--- /dev/null
+++ b/src/home/homed-operation.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "user-record.h"
+
+typedef enum OperationType {
+        OPERATION_ACQUIRE,           /* enqueued on AcquireHome() */
+        OPERATION_RELEASE,           /* enqueued on ReleaseHome() */
+        OPERATION_LOCK_ALL,          /* enqueued on LockAllHomes() */
+        OPERATION_DEACTIVATE_ALL,    /* enqueued on DeactivateAllHomes() */
+        OPERATION_PIPE_EOF,          /* enqueued when we see EOF on the per-home reference pipes */
+        OPERATION_DEACTIVATE_FORCE,  /* enqueued on hard $HOME unplug */
+        OPERATION_IMMEDIATE,         /* this is never enqueued, it's just a marker that we immediately started executing an operation without enqueuing anything first. */
+        _OPERATION_MAX,
+        _OPERATION_INVALID = -EINVAL,
+} OperationType;
+
+/* Encapsulates an operation on one or more home directories. This has two uses:
+ *
+ *     1) For queuing an operation when we need to execute one for some reason but there's already one being
+ *        executed.
+ *
+ *     2) When executing an operation without enqueuing it first (OPERATION_IMMEDIATE)
+ *
+ * Note that a single operation object can encapsulate operations on multiple home directories. This is used
+ * for the LockAllHomes() operation, which is one operation but applies to all homes at once. In case the
+ * operation applies to multiple homes the reference counter is increased once for each, and thus the
+ * operation is fully completed only after it reached zero again.
+ *
+ * The object (optionally) contains a reference of the D-Bus message triggering the operation, which is
+ * replied to when the operation is fully completed, i.e. when n_ref reaches zero.
+ */
+
+typedef struct Operation {
+        unsigned n_ref;          /* reference counter; the object is freed when it drops to zero */
+        OperationType type;
+        sd_bus_message *message; /* method call to reply to once completed, may be NULL */
+
+        UserRecord *secret;
+        int send_fd;   /* pipe fd for AcquireHome() which is taken already when we start the operation */
+
+        int result;    /* < 0 if not completed yet, == 0 on failure, > 0 on success */
+        sd_bus_error error; /* bus error stored by operation_result() on failure */
+        int ret;            /* negative return value stored by operation_result() on failure */
+} Operation;
+
+Operation *operation_new(OperationType type, sd_bus_message *m);
+Operation *operation_ref(Operation *operation);
+Operation *operation_unref(Operation *operation);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Operation*, operation_unref);
+
+void operation_result(Operation *o, int ret, const sd_bus_error *error);
+
+static inline Operation* operation_result_unref(Operation *o, int ret, const sd_bus_error *error) {
+        if (o) { /* NULL-safe: nothing to record or release otherwise */
+                operation_result(o, ret, error);
+                return operation_unref(o);
+        }
+        return NULL;
+}
diff --git a/src/home/homed-varlink.c b/src/home/homed-varlink.c
new file mode 100644
index 0000000..1cef25f
--- /dev/null
+++ b/src/home/homed-varlink.c
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "group-record.h"
+#include "homed-varlink.h"
+#include "strv.h"
+#include "user-record-util.h"
+#include "user-record.h"
+#include "user-util.h"
+#include "format-util.h"
+
+typedef struct LookupParameters {
+        const char *user_name;  /* "userName" parameter, NULL unless set by dispatch */
+        const char *group_name; /* "groupName" parameter, NULL unless set by dispatch */
+        union {                 /* uid and gid share storage; each method dispatches into only one of them */
+                uid_t uid;
+                gid_t gid;
+        };
+        const char *service;    /* "service" parameter, compared against the manager's userdb_service */
+} LookupParameters;
+
+static bool client_is_trusted(Varlink *link, Home *h) {
+        uid_t peer;
+        int q;
+
+        assert(link);
+        assert(h);
+
+        /* A peer is trusted if it is UID 0 or has the home's UID. Peers we cannot identify are not trusted. */
+        q = varlink_get_peer_uid(link, &peer);
+        if (q >= 0)
+                return peer == 0 || peer == h->uid;
+
+        log_debug_errno(q, "Unable to query peer UID, ignoring: %m");
+        return false;
+}
+
+static int build_user_json(Home *h, bool trusted, JsonVariant **ret) {
+        _cleanup_(user_record_unrefp) UserRecord *augmented = NULL;
+        UserRecordLoadFlags flags =
+                USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_ALLOW_BINDING|
+                USER_RECORD_STRIP_SECRET|USER_RECORD_ALLOW_STATUS|USER_RECORD_ALLOW_SIGNATURE|USER_RECORD_PERMISSIVE;
+        int q;
+
+        assert(h);
+        assert(ret);
+
+        /* USER_RECORD_STRIP_SECRET is always requested; whether the privileged section is allowed or
+         * stripped depends on whether the client is trusted. */
+        flags |= trusted ? USER_RECORD_ALLOW_PRIVILEGED : USER_RECORD_STRIP_PRIVILEGED;
+
+        q = home_augment_status(h, flags, &augmented);
+        if (q < 0)
+                return q;
+
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                          JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(augmented->json)),
+                                          JSON_BUILD_PAIR("incomplete", JSON_BUILD_BOOLEAN(augmented->incomplete))));
+}
+
+static bool home_user_match_lookup_parameters(LookupParameters *p, Home *h) {
+        bool name_ok, uid_ok;
+
+        assert(p);
+        assert(h);
+
+        /* A criterion that was not specified matches every home */
+        name_ok = !p->user_name || streq(p->user_name, h->user_name);
+        uid_ok = !uid_is_valid(p->uid) || h->uid == p->uid;
+
+        return name_ok && uid_ok;
+}
+
+int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "uid",            JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, uid),       0         },
+                { "userName",       JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE },
+                { "service",        JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),   0         },
+                {}
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .uid = UID_INVALID,
+        };
+        Manager *m = ASSERT_PTR(userdata);
+        bool trusted;
+        Home *h;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0) /* any non-zero result is handed straight back to the caller */
+                return r;
+
+        if (!streq_ptr(p.service, m->userdb_service)) /* request is addressed to a different service name */
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        if (uid_is_valid(p.uid)) /* a UID, if given, takes precedence over the name for the lookup */
+                h = hashmap_get(m->homes_by_uid, UID_TO_PTR(p.uid));
+        else if (p.user_name)
+                h = hashmap_get(m->homes_by_name, p.user_name);
+        else {
+
+                /* If neither UID nor name was specified, then dump all homes. Do so with varlink_notify()
+                 * for all entries but the last, so that clients can stream the results, and easily process
+                 * them piecemeal. */
+
+                HASHMAP_FOREACH(h, m->homes_by_name) {
+
+                        if (!home_user_match_lookup_parameters(&p, h))
+                                continue;
+
+                        if (v) {
+                                /* An entry set from the previous iteration? Then send it now */
+                                r = varlink_notify(link, v);
+                                if (r < 0)
+                                        return r;
+
+                                v = json_variant_unref(v);
+                        }
+
+                        trusted = client_is_trusted(link, h);
+
+                        r = build_user_json(h, trusted, &v);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!v) /* the loop produced no entry at all */
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                return varlink_reply(link, v); /* the last entry is sent as the final reply */
+        }
+
+        if (!h)
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+        if (!home_user_match_lookup_parameters(&p, h)) /* found by one criterion, but the other one disagrees */
+                return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        trusted = client_is_trusted(link, h);
+
+        r = build_user_json(h, trusted, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_reply(link, v);
+}
+
+static int build_group_json(Home *h, JsonVariant **ret) {
+        _cleanup_(group_record_unrefp) GroupRecord *g = NULL;
+        int q;
+
+        assert(h);
+        assert(ret);
+
+        /* Synthesize a group record from the home's user record and wrap its JSON in a "record" object */
+        g = group_record_new();
+        if (!g)
+                return -ENOMEM;
+
+        q = group_record_synthesize(g, h->record);
+        if (q < 0)
+                return q;
+
+        assert(!FLAGS_SET(g->mask, USER_RECORD_SECRET));
+        assert(!FLAGS_SET(g->mask, USER_RECORD_PRIVILEGED));
+
+        return json_build(ret,
+                          JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(g->json))));
+}
+
+static bool home_group_match_lookup_parameters(LookupParameters *p, Home *h) {
+        bool name_ok, gid_ok;
+
+        assert(p);
+        assert(h);
+
+        /* Group criteria are compared against the home's user name and UID; unset criteria match anything */
+        name_ok = !p->group_name || streq(h->user_name, p->group_name);
+        gid_ok = !gid_is_valid(p->gid) || h->uid == (uid_t) p->gid;
+
+        return name_ok && gid_ok;
+}
+
+int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "gid",       JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, gid),        0         },
+                { "groupName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+                { "service",   JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),    0         },
+                {}
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .gid = GID_INVALID,
+        };
+        Manager *m = ASSERT_PTR(userdata);
+        Home *h;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0) /* any non-zero result is handed straight back to the caller */
+                return r;
+
+        if (!streq_ptr(p.service, m->userdb_service)) /* request is addressed to a different service name */
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        if (gid_is_valid(p.gid)) /* groups are looked up in the per-UID table, using the GID as UID */
+                h = hashmap_get(m->homes_by_uid, UID_TO_PTR((uid_t) p.gid));
+        else if (p.group_name)
+                h = hashmap_get(m->homes_by_name, p.group_name);
+        else {
+
+                HASHMAP_FOREACH(h, m->homes_by_name) { /* no criteria: stream all, notify for all but the last */
+
+                        if (!home_group_match_lookup_parameters(&p, h))
+                                continue;
+
+                        if (v) { /* flush the entry built in the previous iteration */
+                                r = varlink_notify(link, v);
+                                if (r < 0)
+                                        return r;
+
+                                v = json_variant_unref(v);
+                        }
+
+                        r = build_group_json(h, &v);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!v) /* the loop produced no entry at all */
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                return varlink_reply(link, v); /* the last entry is sent as the final reply */
+        }
+
+        if (!h)
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+        if (!home_group_match_lookup_parameters(&p, h)) /* found by one criterion, but the other one disagrees */
+                return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        r = build_group_json(h, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_reply(link, v);
+}
+
+int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "userName",  JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name),  JSON_SAFE },
+                { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+                { "service",   JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service),    0         },
+                {}
+        };
+
+        Manager *m = ASSERT_PTR(userdata);
+        LookupParameters p = {};
+        Home *h;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0) /* any non-zero result is handed straight back to the caller */
+                return r;
+
+        if (!streq_ptr(p.service, m->userdb_service)) /* request is addressed to a different service name */
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        if (p.user_name) { /* case 1: user given, group optional */
+                const char *last = NULL;
+
+                h = hashmap_get(m->homes_by_name, p.user_name);
+                if (!h)
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                if (p.group_name) { /* both given: confirm or deny this single membership */
+                        if (!strv_contains(h->record->member_of, p.group_name))
+                                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(h->user_name)),
+                                                                      JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(p.group_name))));
+                }
+
+                STRV_FOREACH(i, h->record->member_of) { /* stream all groups of this user, notify for all but the last */
+                        if (last) {
+                                r = varlink_notifyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(h->user_name)),
+                                                                            JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last))));
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        last = *i;
+                }
+
+                if (last) /* the final entry goes out as the reply; if there was none we fall through to the error below */
+                        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(h->user_name)),
+                                                                      JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last))));
+
+        } else if (p.group_name) { /* case 2: only group given, stream all homes that are members of it */
+                const char *last = NULL;
+
+                HASHMAP_FOREACH(h, m->homes_by_name) {
+
+                        if (!strv_contains(h->record->member_of, p.group_name))
+                                continue;
+
+                        if (last) {
+                                r = varlink_notifyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(last)),
+                                                                            JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(p.group_name))));
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        last = h->user_name;
+                }
+
+                if (last)
+                        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(last)),
+                                                                      JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(p.group_name))));
+        } else { /* case 3: nothing given, stream every (user, group) pair of every home */
+                const char *last_user_name = NULL, *last_group_name = NULL;
+
+                HASHMAP_FOREACH(h, m->homes_by_name)
+                        STRV_FOREACH(j, h->record->member_of) {
+
+                                if (last_user_name) {
+                                        assert(last_group_name);
+
+                                        r = varlink_notifyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(last_user_name)),
+                                                                                    JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last_group_name))));
+
+                                        if (r < 0)
+                                                return r;
+                                }
+
+                                last_user_name = h->user_name;
+                                last_group_name = *j;
+                        }
+
+                if (last_user_name) {
+                        assert(last_group_name);
+                        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(last_user_name)),
+                                                                      JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last_group_name))));
+                }
+        }
+
+        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); /* reached when no membership was found in any case above */
+}
diff --git a/src/home/homed-varlink.h b/src/home/homed-varlink.h
new file mode 100644
index 0000000..2e404f0
--- /dev/null
+++ b/src/home/homed-varlink.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "homed-manager.h"
+
+int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
+int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
+int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
diff --git a/src/home/homed.c b/src/home/homed.c
new file mode 100644
index 0000000..04d9b56
--- /dev/null
+++ b/src/home/homed.c
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "bus-log-control-api.h"
+#include "daemon-util.h"
+#include "homed-manager.h"
+#include "homed-manager-bus.h"
+#include "log.h"
+#include "main-func.h"
+#include "service-util.h"
+#include "signal-util.h"
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(manager_freep) Manager *manager = NULL;
+        _unused_ _cleanup_(notify_on_cleanup) const char *stop_notification = NULL;
+        int ret;
+
+        log_setup();
+
+        /* A zero or negative result from argument parsing ends run() right here with that value. */
+        ret = service_parse_argv("systemd-homed.service",
+                                 "A service to create, remove, change or inspect home areas.",
+                                 BUS_IMPLEMENTATIONS(&manager_object, &log_control_object),
+                                 argc, argv);
+        if (ret <= 0)
+                return ret;
+
+        umask(0022);
+
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
+
+        ret = manager_new(&manager);
+        if (ret < 0)
+                return log_error_errno(ret, "Could not create manager: %m");
+
+        ret = manager_startup(manager);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to start up daemon: %m");
+
+        /* Held only so that its cleanup handler runs on return (hence _unused_); then run the event loop. */
+        stop_notification = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+        ret = sd_event_loop(manager->event);
+        if (ret < 0)
+                return log_error_errno(ret, "Event loop failed: %m");
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/home/homed.conf b/src/home/homed.conf
new file mode 100644
index 0000000..993122b
--- /dev/null
+++ b/src/home/homed.conf
@@ -0,0 +1,21 @@
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it under the
+#  terms of the GNU Lesser General Public License as published by the Free
+#  Software Foundation; either version 2.1 of the License, or (at your option)
+#  any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file (or a copy of it placed in
+# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
+# the /etc/systemd/homed.conf.d/ directory. The latter is generally
+# recommended. Defaults can be restored by simply deleting the main
+# configuration file and all drop-ins located in /etc/.
+#
+# Use 'systemd-analyze cat-config systemd/homed.conf' to display the full config.
+#
+# See homed.conf(5) for details.
+
+[Home]
+#DefaultStorage=
+#DefaultFileSystemType=btrfs
diff --git a/src/home/homework-cifs.c b/src/home/homework-cifs.c
new file mode 100644
index 0000000..19f1cd5
--- /dev/null
+++ b/src/home/homework-cifs.c
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#if WANT_LINUX_FS_H
+#include 
+#endif
+
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "homework-cifs.h"
+#include "homework-mount.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "process-util.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+
+/* Sets up the CIFS home of 'h', leaving an fd to its root in setup->root_fd. If
+ * HOME_SETUP_ALREADY_ACTIVATED is set the home directory is merely opened. Otherwise the share is mounted
+ * on HOME_RUNTIME_WORK_DIR, trying the record's passwords in turn (-ENOKEY if none works), and the directory
+ * part of the service is stored in setup->mount_suffix. Returns 0 or a negative errno-style value. */
+int home_setup_cifs(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup) {
+        _cleanup_free_ char *chost = NULL, *cservice = NULL, *cdir = NULL, *chost_and_service = NULL, *j = NULL;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_CIFS);
+        assert(setup);
+        assert(!setup->undo_mount);
+        assert(setup->root_fd < 0);
+
+        if (FLAGS_SET(flags, HOME_SETUP_ALREADY_ACTIVATED)) {
+                setup->root_fd = open(user_record_home_directory(h), O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                if (setup->root_fd < 0)
+                        return log_error_errno(errno, "Failed to open home directory: %m");
+
+                return 0;
+        }
+
+        if (!h->cifs_service)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks CIFS service, refusing.");
+
+        r = parse_cifs_service(h->cifs_service, &chost, &cservice, &cdir);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse CIFS service specification: %m");
+
+        /* Just the host and service part, without the directory */
+        chost_and_service = strjoin("//", chost, "/", cservice);
+        if (!chost_and_service)
+                return log_oom();
+
+        r = home_unshare_and_mkdir();
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(pw, h->password) {
+                _cleanup_(unlink_and_freep) char *p = NULL;
+                _cleanup_free_ char *options = NULL;
+                _cleanup_fclose_ FILE *f = NULL;
+                pid_t mount_pid;
+                int exit_status;
+                /* The credentials reach mount via a temporary file, referenced by the credentials= option below */
+                r = fopen_temporary_child(NULL, &f, &p);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create temporary credentials file: %m");
+
+                fprintf(f,
+                        "username=%s\n"
+                        "password=%s\n",
+                        user_record_cifs_user_name(h),
+                        *pw);
+
+                if (h->cifs_domain)
+                        fprintf(f, "domain=%s\n", h->cifs_domain);
+
+                r = fflush_and_check(f);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to write temporary credentials file: %m");
+
+                f = safe_fclose(f);
+                /* Zero-pad the modes to three digits, so that modes below 0100 still form a valid octal number */
+                if (asprintf(&options, "credentials=%s,uid=" UID_FMT ",forceuid,gid=" GID_FMT ",forcegid,file_mode=0%03o,dir_mode=0%03o",
+                             p, h->uid, user_record_gid(h), user_record_access_mode(h), user_record_access_mode(h)) < 0)
+                        return log_oom();
+
+                if (h->cifs_extra_mount_options)
+                        if (!strextend_with_separator(&options, ",", h->cifs_extra_mount_options))
+                                return log_oom();
+
+                r = safe_fork("(mount)", FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_STDOUT_TO_STDERR, &mount_pid);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* Child */
+                        execl("/bin/mount", "/bin/mount", "-n", "-t", "cifs",
+                              chost_and_service, HOME_RUNTIME_WORK_DIR,
+                              "-o", options, NULL);
+
+                        log_error_errno(errno, "Failed to execute mount: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                exit_status = wait_for_terminate_and_check("mount", mount_pid, WAIT_LOG_ABNORMAL|WAIT_LOG_NON_ZERO_EXIT_STATUS);
+                if (exit_status < 0)
+                        return exit_status;
+                if (exit_status == EXIT_SUCCESS) {
+                        setup->undo_mount = true;
+                        break;
+                }
+
+                if (pw[1]) /* only announce a retry if another password follows */
+                        log_info("CIFS mount failed with password #%zu, trying next password.", (size_t) (pw - h->password) + 1);
+        }
+
+        if (!setup->undo_mount)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOKEY),
+                                       "Failed to mount home directory, supplied password(s) possibly wrong.");
+
+        /* Adjust MS_SUID and similar flags */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, HOME_RUNTIME_WORK_DIR, NULL, MS_BIND|MS_REMOUNT|user_record_mount_flags(h), NULL);
+        if (r < 0)
+                return r;
+
+        if (cdir) {
+                j = path_join(HOME_RUNTIME_WORK_DIR, cdir);
+                if (!j)
+                        return log_oom();
+
+                if (FLAGS_SET(flags, HOME_SETUP_CIFS_MKDIR)) {
+                        setup->root_fd = open_mkdir_at(AT_FDCWD, j, O_CLOEXEC, 0700);
+                        if (setup->root_fd < 0)
+                                return log_error_errno(setup->root_fd, "Failed to create CIFS subdirectory: %m");
+                }
+        }
+        /* Unless the subdirectory was created (and thereby opened) just above, open the home's root now */
+        if (setup->root_fd < 0) {
+                setup->root_fd = open(j ?: HOME_RUNTIME_WORK_DIR, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                if (setup->root_fd < 0)
+                        return log_error_errno(errno, "Failed to open home directory: %m");
+        }
+
+        setup->mount_suffix = TAKE_PTR(cdir);
+        return 0;
+}
+
+/* Activates a CIFS home: sets it up via home_setup(), refreshes it via home_refresh(), then moves the
+ * mount to the home directory path of 'h'. On success passes back the record produced by home_refresh()
+ * in *ret_home and returns 1; a negative return value of any helper is propagated as is. */
+int home_activate_cifs(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home) {
+        _cleanup_(user_record_unrefp) UserRecord *refreshed = NULL, *from_header = NULL;
+        const char *hdo, *target;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_CIFS);
+        assert(setup);
+        assert(ret_home);
+
+        /* Keep a private copy of the path: the string in the home record object might change later on. */
+        assert_se(hdo = user_record_home_directory(h));
+        target = strdupa_safe(hdo);
+
+        /* NOTE(review): home_setup() is passed 0 here, while 'flags' only reaches home_refresh() — confirm. */
+        r = home_setup(h, 0, setup, cache, &from_header);
+        if (r < 0)
+                return r;
+
+        r = home_refresh(h, flags, setup, from_header, cache, NULL, &refreshed);
+        if (r < 0)
+                return r;
+
+        /* Give up our fd to the mount before relocating it. */
+        setup->root_fd = safe_close(setup->root_fd);
+        r = home_move_mount(setup->mount_suffix, target);
+        if (r < 0)
+                return r;
+
+        /* The mount sits at its final place now, hence there is nothing left to undo. */
+        setup->undo_mount = false;
+        setup->do_drop_caches = false;
+
+        log_info("Everything completed.");
+
+        *ret_home = TAKE_PTR(refreshed);
+        return 1;
+}
+
+/* Creates a new home on a CIFS share: mounts the share via home_setup_cifs(), insists that the home
+ * directory on it is empty, populates and syncs it, and passes back in *ret_home a clone of 'h' that
+ * carries a CIFS binding. Returns 0 on success, a negative errno-style value on failure. */
+int home_create_cifs(UserRecord *h, HomeSetup *setup, UserRecord **ret_home) {
+        _cleanup_(user_record_unrefp) UserRecord *bound = NULL;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_CIFS);
+        assert(setup);
+        assert(ret_home);
+
+        if (!h->cifs_service)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks CIFS service, refusing.");
+
+        /* A missing mount helper is reported as ENOLINK, any other trouble checking for it as is. */
+        if (access("/sbin/mount.cifs", F_OK) < 0) {
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Unable to detect whether /sbin/mount.cifs exists: %m");
+
+                return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), "/sbin/mount.cifs is missing.");
+        }
+
+        /* HOME_SETUP_CIFS_MKDIR: have the subdirectory named in the CIFS service created if necessary. */
+        r = home_setup_cifs(h, HOME_SETUP_CIFS_MKDIR, setup);
+        if (r < 0)
+                return r;
+
+        r = dir_is_empty_at(setup->root_fd, NULL, /* ignore_hidden_or_backup= */ false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to detect if CIFS directory is empty: %m");
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTEMPTY), "Selected CIFS directory not empty, refusing.");
+
+        r = home_populate(h, setup->root_fd);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup->root_fd, NULL);
+        if (r < 0)
+                return r;
+
+        r = user_record_clone(h, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE, &bound);
+        if (r < 0)
+                return log_error_errno(r, "Failed to clone record: %m");
+
+        /* Optional arguments stay at NULL / null ID / UINT64_MAX; the last one is the UID cast to gid_t. */
+        r = user_record_add_binding(
+                        bound, USER_CIFS,
+                        NULL,
+                        SD_ID128_NULL, SD_ID128_NULL, SD_ID128_NULL,
+                        NULL, NULL,
+                        UINT64_MAX,
+                        NULL, NULL,
+                        h->uid, (gid_t) h->uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add binding to record: %m");
+
+        log_info("Everything completed.");
+
+        *ret_home = TAKE_PTR(bound);
+        return 0;
+}
diff --git a/src/home/homework-cifs.h b/src/home/homework-cifs.h
new file mode 100644
index 0000000..af8c466
--- /dev/null
+++ b/src/home/homework-cifs.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "homework.h"
+#include "user-record.h"
+
+int home_setup_cifs(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup);
+
+int home_activate_cifs(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home);
+
+int home_create_cifs(UserRecord *h, HomeSetup *setup, UserRecord **ret_home);
diff --git a/src/home/homework-directory.c b/src/home/homework-directory.c
new file mode 100644
index 0000000..6870ae9
--- /dev/null
+++ b/src/home/homework-directory.c
@@ -0,0 +1,313 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "btrfs-util.h"
+#include "fd-util.h"
+#include "homework-directory.h"
+#include "homework-mount.h"
+#include "homework-quota.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+int home_setup_directory(UserRecord *h, HomeSetup *setup) {
+        const char *ip;
+        int r;
+        /* Bind mounts the record's image path onto HOME_RUNTIME_WORK_DIR and opens it as setup->root_fd. */
+        assert(h);
+        assert(IN_SET(user_record_storage(h), USER_DIRECTORY, USER_SUBVOLUME));
+        assert(setup);
+        assert(!setup->undo_mount);
+        assert(setup->root_fd < 0);
+
+        /* We'll bind mount the image directory to a new mount point where we'll start adjusting it. Only
+         * once that's complete we'll move the thing to its final place eventually. */
+        r = home_unshare_and_mkdir();
+        if (r < 0)
+                return r;
+
+        assert_se(ip = user_record_image_path(h));
+
+        r = mount_follow_verbose(LOG_ERR, ip, HOME_RUNTIME_WORK_DIR, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
+
+        setup->undo_mount = true; /* the bind mount exists now and is flagged for undoing */
+
+        /* Turn off any form of propagation for this */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, HOME_RUNTIME_WORK_DIR, NULL, MS_PRIVATE, NULL);
+        if (r < 0)
+                return r;
+
+        /* Adjust MS_SUID and similar flags */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, HOME_RUNTIME_WORK_DIR, NULL, MS_BIND|MS_REMOUNT|user_record_mount_flags(h), NULL);
+        if (r < 0)
+                return r;
+
+        setup->root_fd = open(HOME_RUNTIME_WORK_DIR, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+        if (setup->root_fd < 0)
+                return log_error_errno(errno, "Failed to open home directory: %m");
+        /* All failure paths returned early above; zero signals success. */
+        return 0;
+}
+
+/* Activates a home of the directory, subvolume or fscrypt storage kind.
+ *
+ * The home is set up via home_setup() and refreshed via home_refresh(), the resulting record is run
+ * through home_extend_embedded_identity(), and finally the mount is moved to the home directory path
+ * of 'h'. On success the record produced by home_refresh() is passed back in *ret_home and 0 is
+ * returned; a negative return value of any helper is propagated as is. */
+int home_activate_directory(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home) {
+        _cleanup_(user_record_unrefp) UserRecord *refreshed = NULL, *from_header = NULL;
+        const char *hdo, *target;
+        int r;
+
+        assert(h);
+        assert(IN_SET(user_record_storage(h), USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT));
+        assert(setup);
+        assert(ret_home);
+
+        /* Work on a private copy of the home directory path for the final move. */
+        assert_se(hdo = user_record_home_directory(h));
+        target = strdupa_safe(hdo);
+
+        r = home_setup(h, flags, setup, cache, &from_header);
+        if (r < 0)
+                return r;
+
+        r = home_refresh(h, flags, setup, from_header, cache, NULL, &refreshed);
+        if (r < 0)
+                return r;
+
+        r = home_extend_embedded_identity(refreshed, h, setup);
+        if (r < 0)
+                return r;
+
+        /* Everything is prepared. The fd to the private mount has to go before the mount is moved. */
+        setup->root_fd = safe_close(setup->root_fd);
+
+        r = home_move_mount(NULL, target);
+        if (r < 0)
+                return r;
+
+        /* The mount sits at its final place now, hence there is nothing left to undo. */
+        setup->undo_mount = false;
+        setup->do_drop_caches = false;
+
+        log_info("Everything completed.");
+
+        *ret_home = TAKE_PTR(refreshed);
+        return 0;
+}
+
+int home_create_directory_or_subvolume(UserRecord *h, HomeSetup *setup, UserRecord **ret_home) {
+        _cleanup_(rm_rf_subvolume_and_freep) char *temporary = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *new_home = NULL;
+        _cleanup_close_ int mount_fd = -EBADF;
+        _cleanup_free_ char *d = NULL;
+        bool is_subvolume = false;
+        const char *ip;
+        int r;
+        /* Creates a directory or btrfs subvolume home under a temporary name, populates it, and renames it to the image path. */
+        assert(h);
+        assert(IN_SET(user_record_storage(h), USER_DIRECTORY, USER_SUBVOLUME));
+        assert(setup);
+        assert(ret_home);
+
+        assert_se(ip = user_record_image_path(h));
+        /* Derive a temporary path from the image path; the result is renamed to the image path at the very end */
+        r = tempfn_random(ip, "homework", &d);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate temporary directory: %m");
+
+        (void) mkdir_parents(d, 0755); /* best effort, the result is deliberately ignored */
+
+        switch (user_record_storage(h)) {
+
+        case USER_SUBVOLUME:
+                WITH_UMASK(0077)
+                        r = btrfs_subvol_make(AT_FDCWD, d);
+
+                if (r >= 0) {
+                        log_info("Subvolume created.");
+                        is_subvolume = true;
+
+                        if (h->disk_size != UINT64_MAX) {
+
+                                /* Enable quota for the subvolume we just created. Note that failures are
+                                 * not fatal here, we only log about them at debug level. */
+                                r = btrfs_quota_enable(d, true);
+                                if (r < 0)
+                                        log_debug_errno(r, "Failed to enable quota on %s, ignoring: %m", d);
+
+                                r = btrfs_subvol_auto_qgroup(d, 0, false);
+                                if (r < 0)
+                                        log_debug_errno(r, "Failed to set up automatic quota group on %s, ignoring: %m", d);
+
+                                /* Actually configure the quota. We also ignore errors here, but we do log
+                                 * about them loudly, to keep things discoverable even though we don't
+                                 * consider lacking quota support in kernel fatal. */
+                                (void) home_update_quota_btrfs(h, d);
+                        }
+
+                        break;
+                }
+                if (r != -ENOTTY) /* -ENOTTY is taken as "no subvolume support" (see the message below), all else is fatal */
+                        return log_error_errno(r, "Failed to create temporary home directory subvolume %s: %m", d);
+
+                log_info("Creating subvolume %s is not supported, as file system does not support subvolumes. Falling back to regular directory.", d);
+                _fallthrough_;
+
+        case USER_DIRECTORY:
+
+                if (mkdir(d, 0700) < 0)
+                        return log_error_errno(errno, "Failed to create temporary home directory %s: %m", d);
+
+                (void) home_update_quota_classic(h, d);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        temporary = TAKE_PTR(d); /* Needs to be destroyed now */
+
+        /* Let's decouple namespaces now, so that we can possibly mount a UID map mount into
+         * /run/systemd/user-home-mount/ that no one will see but us. */
+        r = home_unshare_and_mkdir();
+        if (r < 0)
+                return r;
+
+        setup->root_fd = open(temporary, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+        if (setup->root_fd < 0)
+                return log_error_errno(errno, "Failed to open temporary home directory: %m");
+
+        /* Try to apply a UID shift, so that the directory is actually owned by "nobody", and is only mapped
+         * to the proper UID while active. — Well, that's at least the theory. Unfortunately, only btrfs does
+         * per-subvolume quota. The others do per-uid quota. Which means mapping all home directories to the
+         * same UID of "nobody" makes quota impossible. Hence unless we actually managed to create a btrfs
+         * subvolume for this user we'll map the user's UID to itself. Now you might ask: why bother mapping
+         * at all? It's because we want to restrict the UIDs used on the home directory: we leave all other
+         * UIDs of the homed UID range unmapped, thus making them unavailable to programs accessing the
+         * mount. */
+        r = home_shift_uid(setup->root_fd, HOME_RUNTIME_WORK_DIR, is_subvolume ? UID_NOBODY : h->uid, h->uid, &mount_fd);
+        if (r > 0)
+                setup->undo_mount = true; /* If uidmaps worked we have a mount to undo again */
+        /* NOTE(review): a negative result of home_shift_uid() is not acted upon here — confirm this is intended */
+        if (mount_fd >= 0) {
+                /* If we have established a new mount, then we can use that as new root fd to our home directory. */
+                safe_close(setup->root_fd); /* root_fd itself is overwritten right below */
+
+                setup->root_fd = fd_reopen(mount_fd, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+                if (setup->root_fd < 0)
+                        return log_error_errno(setup->root_fd, "Unable to convert mount fd into proper directory fd: %m");
+
+                mount_fd = safe_close(mount_fd);
+        }
+
+        r = home_populate(h, setup->root_fd);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup->root_fd, NULL);
+        if (r < 0)
+                return r;
+
+        r = user_record_clone(h, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE, &new_home);
+        if (r < 0)
+                return log_error_errno(r, "Failed to clone record: %m");
+
+        r = user_record_add_binding(
+                        new_home,
+                        user_record_storage(h),
+                        ip,
+                        SD_ID128_NULL,
+                        SD_ID128_NULL,
+                        SD_ID128_NULL,
+                        NULL,
+                        NULL,
+                        UINT64_MAX,
+                        NULL,
+                        NULL,
+                        h->uid,
+                        (gid_t) h->uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add binding to record: %m");
+
+        setup->root_fd = safe_close(setup->root_fd);
+
+        /* Unmount mapped mount before we move the dir into place */
+        r = home_setup_undo_mount(setup, LOG_ERR);
+        if (r < 0)
+                return r;
+
+        if (rename(temporary, ip) < 0)
+                return log_error_errno(errno, "Failed to rename %s to %s: %m", temporary, ip);
+        /* The directory lives at the image path now, so the temporary name must no longer be cleaned up */
+        temporary = mfree(temporary);
+
+        log_info("Everything completed.");
+        /* Success: pass the clone carrying the new binding to the caller */
+        *ret_home = TAKE_PTR(new_home);
+        return 0;
+}
+
+/* Resize operation for homes of the directory, subvolume or fscrypt storage kind. After setting up the
+ * home and loading its embedded identity, home_update_quota_auto() is invoked for 'h'; then the identity
+ * is stored back and the setup is wrapped up via home_setup_done(). On success the new record obtained
+ * while loading is passed back in *ret_home and 0 is returned; -ESOCKTNOSUPPORT reports a quota update
+ * that is not supported. */
+int home_resize_directory(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home) {
+        _cleanup_(user_record_unrefp) UserRecord *embedded_record = NULL, *result_record = NULL;
+        int r;
+
+        assert(h);
+        assert(setup);
+        assert(ret_home);
+        assert(IN_SET(user_record_storage(h), USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT));
+
+        r = home_setup(h, flags, setup, cache, NULL);
+        if (r < 0)
+                return r;
+
+        r = home_load_embedded_identity(h, setup->root_fd, NULL, USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL, cache, &embedded_record, &result_record);
+        if (r < 0)
+                return r;
+
+        r = home_maybe_shift_uid(h, flags, setup);
+        if (r < 0)
+                return r;
+
+        /* Every "not supported" flavour is folded into one error code, to make the condition recognizable. */
+        r = home_update_quota_auto(h, NULL);
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return -ESOCKTNOSUPPORT;
+        if (r < 0)
+                return r;
+
+        r = home_store_embedded_identity(result_record, setup->root_fd, h->uid, embedded_record);
+        if (r < 0)
+                return r;
+
+        r = home_extend_embedded_identity(result_record, h, setup);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup->root_fd, NULL);
+        if (r < 0)
+                return r;
+
+        r = home_setup_done(setup);
+        if (r < 0)
+                return r;
+
+        log_info("Everything completed.");
+
+        *ret_home = TAKE_PTR(result_record);
+        return 0;
+}
diff --git a/src/home/homework-directory.h b/src/home/homework-directory.h
new file mode 100644
index 0000000..fe03e5d
--- /dev/null
+++ b/src/home/homework-directory.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "homework.h"
+#include "user-record.h"
+
+int home_setup_directory(UserRecord *h, HomeSetup *setup);
+int home_activate_directory(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home);
+int home_create_directory_or_subvolume(UserRecord *h, HomeSetup *setup, UserRecord **ret_home);
+int home_resize_directory(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home);
diff --git a/src/home/homework-fido2.c b/src/home/homework-fido2.c
new file mode 100644
index 0000000..5c7cd52
--- /dev/null
+++ b/src/home/homework-fido2.c
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "hexdecoct.h"
+#include "homework-fido2.h"
+#include "libfido2-util.h"
+#include "memory-util.h"
+#include "strv.h"
+
+/* Obtains the HMAC secret belonging to 'salt' via fido2_use_hmac_hash() and passes it back, base64
+ * encoded, in *ret. The flags handed to that call are derived from the salt's up/uv/client_pin settings.
+ * Fails early if the salt asks for user presence (-EMEDIUMTYPE) or user verification (-ENOCSI) that 'h'
+ * does not permit, or for a PIN while 'secret' carries none (-ENOANO). Returns 0 on success, a negative
+ * errno-style value otherwise. */
+int fido2_use_token(UserRecord *h, UserRecord *secret, const Fido2HmacSalt *salt, char **ret) {
+        _cleanup_(erase_and_freep) void *hmac_value = NULL;
+        Fido2EnrollFlags enroll_flags = 0;
+        size_t hmac_len;
+        ssize_t encoded_len;
+        int r;
+
+        assert(h);
+        assert(secret);
+        assert(salt);
+        assert(ret);
+
+        /* Where the up/uv/clientPin settings used at enrollment are known, they are passed on for
+         * authentication, and an error is generated right away if interactivity of the requested kind is
+         * not allowed. Negative (i.e. unset) settings select the fallback flags, for compat with pre-248. */
+
+        /* User presence */
+        if (salt->up > 0 && h->fido2_user_presence_permitted <= 0)
+                return -EMEDIUMTYPE;
+        if (salt->up > 0)
+                enroll_flags |= FIDO2ENROLL_UP;
+        else if (salt->up < 0)
+                enroll_flags |= FIDO2ENROLL_UP_IF_NEEDED;
+
+        /* User verification */
+        if (salt->uv > 0 && h->fido2_user_verification_permitted <= 0)
+                return -ENOCSI;
+        if (salt->uv > 0)
+                enroll_flags |= FIDO2ENROLL_UV;
+        else if (salt->uv < 0)
+                enroll_flags |= FIDO2ENROLL_UV_OMIT;
+
+        /* Client PIN */
+        if (salt->client_pin > 0 && strv_isempty(secret->token_pin))
+                return -ENOANO;
+        if (salt->client_pin > 0)
+                enroll_flags |= FIDO2ENROLL_PIN;
+        else if (salt->client_pin < 0)
+                enroll_flags |= FIDO2ENROLL_PIN_IF_NEEDED;
+
+        r = fido2_use_hmac_hash(
+                        NULL,
+                        "io.systemd.home",
+                        salt->salt, salt->salt_size,
+                        salt->credential.id, salt->credential.size,
+                        secret->token_pin,
+                        enroll_flags,
+                        &hmac_value,
+                        &hmac_len);
+        if (r < 0)
+                return r;
+
+        /* Only the base64 rendering is handed out; the raw buffer stays with its cleanup handler. */
+        encoded_len = base64mem(hmac_value, hmac_len, ret);
+        if (encoded_len < 0)
+                return log_error_errno(encoded_len, "Failed to base64 encode HMAC secret: %m");
+
+        return 0;
+}
diff --git a/src/home/homework-fido2.h b/src/home/homework-fido2.h
new file mode 100644
index 0000000..a1dcba2
--- /dev/null
+++ b/src/home/homework-fido2.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "user-record.h"
+
+int fido2_use_token(UserRecord *h, UserRecord *secret, const Fido2HmacSalt *salt, char **ret);
diff --git a/src/home/homework-fscrypt.c b/src/home/homework-fscrypt.c
new file mode 100644
index 0000000..6aae1d2
--- /dev/null
+++ b/src/home/homework-fscrypt.c
@@ -0,0 +1,674 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "errno-util.h"
+#include "fd-util.h"
+#include "hexdecoct.h"
+#include "homework-fscrypt.h"
+#include "homework-mount.h"
+#include "homework-quota.h"
+#include "memory-util.h"
+#include "missing_keyctl.h"
+#include "missing_syscall.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "nulstr-util.h"
+#include "openssl-util.h"
+#include "parse-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "rm-rf.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+#include "xattr-util.h"
+
+/* Installs the volume key in the keyring 'where' as a key of type "logon" whose description is
+ * "fscrypt:" followed by the hex-encoded key descriptor. Returns 0 on success, and a negative
+ * errno-style value on failure, e.g. if the key is larger than struct fscrypt_key can hold. */
+static int fscrypt_upload_volume_key(
+                const uint8_t key_descriptor[static FS_KEY_DESCRIPTOR_SIZE],
+                const void *volume_key,
+                size_t volume_key_size,
+                key_serial_t where) {
+
+        _cleanup_free_ char *descriptor_hex = NULL;
+        struct fscrypt_key payload;
+        const char *key_name;
+        key_serial_t id;
+
+        assert(key_descriptor);
+        assert(volume_key);
+        assert(volume_key_size > 0);
+
+        if (volume_key_size > sizeof(payload.raw))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume key too long.");
+
+        descriptor_hex = hexmem(key_descriptor, FS_KEY_DESCRIPTOR_SIZE);
+        if (!descriptor_hex)
+                return log_oom();
+        key_name = strjoina("fscrypt:", descriptor_hex);
+
+        /* All fields other than the size start out zeroed; CLEANUP_ERASE() then covers the filled-in copy. */
+        payload = (struct fscrypt_key) { .size = volume_key_size };
+        memcpy(payload.raw, volume_key, volume_key_size);
+        CLEANUP_ERASE(payload);
+
+        /* Hand the key over to the kernel */
+        id = add_key("logon", key_name, &payload, sizeof(payload), where);
+        if (id < 0)
+                return log_error_errno(errno, "Failed to install master key in keyring: %m");
+
+        log_info("Uploaded encryption key to kernel.");
+
+        return 0;
+}
+
+/* Derives the key descriptor from the volume key via double SHA512, in order to be compatible with
+ * e4crypt: the leading FS_KEY_DESCRIPTOR_SIZE bytes of the second digest go to ret_key_descriptor. */
+static void calculate_key_descriptor(
+                const void *key,
+                size_t key_size,
+                uint8_t ret_key_descriptor[static FS_KEY_DESCRIPTOR_SIZE]) {
+
+        uint8_t hashed[512 / 8] = {};
+        uint8_t hashed2[512 / 8] = {};
+
+        /* The second digest must be long enough to cut the descriptor from. */
+        assert_cc(sizeof(hashed2) >= FS_KEY_DESCRIPTOR_SIZE);
+        assert_se(SHA512(key, key_size, hashed) == hashed);
+        assert_se(SHA512(hashed, sizeof(hashed), hashed2) == hashed2);
+        memcpy(ret_key_descriptor, hashed2, FS_KEY_DESCRIPTOR_SIZE);
+}
+
+/* Tries to unlock one key slot with one password.
+ *
+ * A key is derived from 'password' and 'salt' and used to decrypt 'encrypted'. The key descriptor of the
+ * decrypted data is then compared with 'match_key_descriptor'; only if the two agree the decrypted data is
+ * treated as the volume key and uploaded into the thread keyring.
+ *
+ * Returns 0 on success, -ENOANO (without logging) if the password does not fit this slot, and some other
+ * negative errno-style error if anything else fails. On success, if 'ret_decrypted' is non-NULL, ownership
+ * of the decrypted volume key is passed to the caller (who should erase and free it), and its size is stored
+ * in 'ret_decrypted_size' if that is non-NULL. */
+static int fscrypt_slot_try_one(
+                const char *password,
+                const void *salt, size_t salt_size,
+                const void *encrypted, size_t encrypted_size,
+                const uint8_t match_key_descriptor[static FS_KEY_DESCRIPTOR_SIZE],
+                void **ret_decrypted, size_t *ret_decrypted_size) {
+
+        _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL;
+        _cleanup_(erase_and_freep) void *decrypted = NULL;
+        uint8_t key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+        int decrypted_size_out1, decrypted_size_out2;
+        uint8_t derived[512 / 8] = {};
+        size_t decrypted_size;
+        const EVP_CIPHER *cc;
+        int r;
+
+        assert(password);
+        assert(salt);
+        assert(salt_size > 0);
+        assert(encrypted);
+        assert(encrypted_size > 0);
+        assert(match_key_descriptor);
+
+        /* Our construction is like this:
+         *
+         *   1. In each key slot we store a salt value plus the encrypted volume key
+         *
+         *   2. Unlocking is via calculating PBKDF2-HMAC-SHA512 of the supplied password (in combination with
+         *      the salt), then using the first 256 bit of the hash as key for decrypting the encrypted
+         *      volume key in AES256 counter mode.
+         *
+         *   3. Writing a password is similar: calculate PBKDF2-HMAC-SHA512 of the supplied password (in
+         *      combination with the salt), then encrypt the volume key in AES256 counter mode with the
+         *      resulting hash.
+         */
+
+        /* The derived key is sensitive, make sure it is wiped on every exit path */
+        CLEANUP_ERASE(derived);
+
+        if (PKCS5_PBKDF2_HMAC(
+                            password, strlen(password),
+                            salt, salt_size,
+                            0xFFFF, EVP_sha512(),
+                            sizeof(derived), derived) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "PBKDF2 failed");
+
+        context = EVP_CIPHER_CTX_new();
+        if (!context)
+                return log_oom();
+
+        /* We use AES256 in counter mode */
+        assert_se(cc = EVP_aes_256_ctr());
+
+        /* We only use the first half of the derived key */
+        assert(sizeof(derived) >= (size_t) EVP_CIPHER_key_length(cc));
+
+        if (EVP_DecryptInit_ex(context, cc, NULL, derived, NULL) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize decryption context.");
+
+        /* Allocate some extra room on top of the input size, so that the output always fits */
+        decrypted_size = encrypted_size + EVP_CIPHER_key_length(cc) * 2;
+        decrypted = malloc(decrypted_size);
+        if (!decrypted)
+                return log_oom();
+
+        if (EVP_DecryptUpdate(context, (uint8_t*) decrypted, &decrypted_size_out1, encrypted, encrypted_size) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to decrypt volume key.");
+
+        assert((size_t) decrypted_size_out1 <= decrypted_size);
+
+        /* Any trailing output goes right behind what EVP_DecryptUpdate() wrote into our buffer. Note that the
+         * pointer must be derived from the buffer, not from its size. */
+        if (EVP_DecryptFinal_ex(context, (uint8_t*) decrypted + decrypted_size_out1, &decrypted_size_out2) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finish decryption of volume key.");
+
+        assert((size_t) decrypted_size_out1 + (size_t) decrypted_size_out2 < decrypted_size);
+        decrypted_size = (size_t) decrypted_size_out1 + (size_t) decrypted_size_out2;
+
+        /* Check whether what we decrypted really is the volume key we are looking for */
+        calculate_key_descriptor(decrypted, decrypted_size, key_descriptor);
+
+        if (memcmp(key_descriptor, match_key_descriptor, FS_KEY_DESCRIPTOR_SIZE) != 0)
+                return -ENOANO; /* don't log here */
+
+        r = fscrypt_upload_volume_key(key_descriptor, decrypted, decrypted_size, KEY_SPEC_THREAD_KEYRING);
+        if (r < 0)
+                return r;
+
+        if (ret_decrypted)
+                *ret_decrypted = TAKE_PTR(decrypted);
+        if (ret_decrypted_size)
+                *ret_decrypted_size = decrypted_size;
+
+        return 0;
+}
+
+/* Runs fscrypt_slot_try_one() for each entry of the 'passwords' string list, in order. The first result that
+ * is not -ENOANO (i.e. either success or a real failure) is returned right away. If the list is exhausted
+ * without such a result, -ENOANO is returned. */
+static int fscrypt_slot_try_many(
+                char **passwords,
+                const void *salt, size_t salt_size,
+                const void *encrypted, size_t encrypted_size,
+                const uint8_t match_key_descriptor[static FS_KEY_DESCRIPTOR_SIZE],
+                void **ret_decrypted, size_t *ret_decrypted_size) {
+
+        STRV_FOREACH(pw, passwords) {
+                int k;
+
+                k = fscrypt_slot_try_one(
+                                *pw,
+                                salt, salt_size,
+                                encrypted, encrypted_size,
+                                match_key_descriptor,
+                                ret_decrypted, ret_decrypted_size);
+                if (k == -ENOANO) /* This password doesn't fit, try the next one */
+                        continue;
+
+                return k;
+        }
+
+        return -ENOANO;
+}
+
+/* Walks through all key slot xattrs set on setup->root_fd and tries to unlock each of them with the
+ * available passwords: first the PKCS#11 and FIDO2 passwords from 'cache', then the string list 'password'.
+ *
+ * Each slot xattr value has the form "<base64 salt>:<base64 encrypted volume key>". The volume key found is
+ * matched against setup->fscrypt_key_descriptor (see fscrypt_slot_try_one()).
+ *
+ * Returns 0 as soon as one slot could be unlocked, in which case the volume key and its size are returned in
+ * ret_volume_key/ret_volume_key_size. Returns -ENOKEY if no password fits any slot, and other negative
+ * errno-style errors on failure. */
+static int fscrypt_setup(
+                const PasswordCache *cache,
+                char **password,
+                HomeSetup *setup,
+                void **ret_volume_key,
+                size_t *ret_volume_key_size) {
+
+        _cleanup_free_ char *xattr_buf = NULL;
+        int r;
+
+        assert(setup);
+        assert(setup->root_fd >= 0);
+
+        r = flistxattr_malloc(setup->root_fd, &xattr_buf);
+        if (r < 0) /* The error is reported via the return value, errno might be stale by now */
+                return log_error_errno(r, "Failed to retrieve xattr list: %m");
+
+        NULSTR_FOREACH(xa, xattr_buf) {
+                _cleanup_free_ void *salt = NULL, *encrypted = NULL;
+                _cleanup_free_ char *value = NULL;
+                size_t salt_size, encrypted_size;
+                const char *nr, *e;
+                char **list;
+                int n;
+
+                /* Check if this xattr has the format 'trusted.fscrypt_slot<nr>' where '<nr>' is a 32-bit unsigned integer */
+                nr = startswith(xa, "trusted.fscrypt_slot");
+                if (!nr)
+                        continue;
+                if (safe_atou32(nr, NULL) < 0)
+                        continue;
+
+                n = fgetxattr_malloc(setup->root_fd, xa, &value);
+                if (n == -ENODATA) /* deleted by now? */
+                        continue;
+                if (n < 0)
+                        return log_error_errno(n, "Failed to read %s xattr: %m", xa);
+
+                /* Salt and encrypted key are separated by a colon */
+                e = memchr(value, ':', n);
+                if (!e)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "xattr %s lacks ':' separator: %m", xa);
+
+                r = unbase64mem(value, e - value, &salt, &salt_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to decode salt of %s: %m", xa);
+                r = unbase64mem(e+1, n - (e - value) - 1, &encrypted, &encrypted_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to decode encrypted key of %s: %m", xa);
+
+                /* fscrypt_slot_try_one() asserts that neither part is empty, hence refuse malformed slots
+                 * here with a proper error instead of aborting there. */
+                if (salt_size == 0 || encrypted_size == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "xattr %s carries an empty salt or encrypted key.", xa);
+
+                r = -ENOANO;
+                FOREACH_POINTER(list, cache->pkcs11_passwords, cache->fido2_passwords, password) {
+                        r = fscrypt_slot_try_many(
+                                        list,
+                                        salt, salt_size,
+                                        encrypted, encrypted_size,
+                                        setup->fscrypt_key_descriptor,
+                                        ret_volume_key, ret_volume_key_size);
+                        if (r != -ENOANO)
+                                break;
+                }
+                if (r < 0) {
+                        /* -ENOANO means: no password fits this slot, go on with the next slot */
+                        if (r != -ENOANO)
+                                return r;
+                } else
+                        return 0;
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(ENOKEY), "Failed to set up home directory with provided passwords.");
+}
+
+/* Sets up an fscrypt home directory for use:
+ *
+ *   1. Opens the image path of the user record and reads the encryption policy of the directory, storing its
+ *      master key descriptor in setup->fscrypt_key_descriptor.
+ *
+ *   2. Unlocks the volume key via fscrypt_setup(), using the passwords from 'cache' and h->password.
+ *
+ *   3. If the record carries a valid UID, forks off a child that drops to the user's GID/UID and uploads the
+ *      volume key into the user keyring.
+ *
+ *   4. Bind mounts the image path to HOME_RUNTIME_WORK_DIR, adjusts propagation and mount flags, and
+ *      replaces setup->root_fd by an fd referring to that mount point.
+ *
+ * Returns 0 on success, -ENOLINK if the file system does not support fscrypt, other negative errno-style
+ * errors otherwise. */
+int home_setup_fscrypt(
+                UserRecord *h,
+                HomeSetup *setup,
+                const PasswordCache *cache) {
+
+        _cleanup_(erase_and_freep) void *vk = NULL;
+        struct fscrypt_policy current_policy = {};
+        const char *image_path;
+        size_t vk_size = 0;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_FSCRYPT);
+        assert(setup);
+        assert(setup->root_fd < 0);
+
+        assert_se(image_path = user_record_image_path(h));
+
+        setup->root_fd = open(image_path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        if (setup->root_fd < 0)
+                return log_error_errno(errno, "Failed to open home directory: %m");
+
+        if (ioctl(setup->root_fd, FS_IOC_GET_ENCRYPTION_POLICY, &current_policy) < 0) {
+                if (errno == ENODATA)
+                        return log_error_errno(errno, "Home directory %s is not encrypted.", image_path);
+
+                if (!ERRNO_IS_NOT_SUPPORTED(errno))
+                        return log_error_errno(errno, "Failed to acquire encryption policy of %s: %m", image_path);
+
+                log_error_errno(errno, "File system does not support fscrypt: %m");
+                return -ENOLINK; /* make recognizable */
+        }
+
+        /* Remember which key the directory wants, the key slots are matched against this */
+        memcpy(setup->fscrypt_key_descriptor, current_policy.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE);
+
+        r = fscrypt_setup(cache, h->password, setup, &vk, &vk_size);
+        if (r < 0)
+                return r;
+
+        /* Also install the access key in the user's own keyring */
+
+        if (uid_is_valid(h->uid)) {
+                r = safe_fork("(sd-addkey)",
+                              FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_REOPEN_LOG,
+                              NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed install encryption key in user's keyring: %m");
+                if (r == 0) {
+                        /* Child: become the user, then upload the key. We never return from here. */
+                        gid_t target_gid = user_record_gid(h);
+
+                        if (setresgid(target_gid, target_gid, target_gid) < 0) {
+                                log_error_errno(errno, "Failed to change GID to " GID_FMT ": %m", target_gid);
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        if (setgroups(0, NULL) < 0) {
+                                log_error_errno(errno, "Failed to reset auxiliary groups list: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        if (setresuid(h->uid, h->uid, h->uid) < 0) {
+                                log_error_errno(errno, "Failed to change UID to " UID_FMT ": %m", h->uid);
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        r = fscrypt_upload_volume_key(
+                                        setup->fscrypt_key_descriptor,
+                                        vk,
+                                        vk_size,
+                                        KEY_SPEC_USER_KEYRING);
+
+                        _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+                }
+        }
+
+        /* We'll bind mount the image directory to a new mount point where we'll start adjusting it. Only
+         * once that's complete we'll move the thing to its final place eventually. */
+        r = home_unshare_and_mkdir();
+        if (r < 0)
+                return r;
+
+        r = mount_follow_verbose(LOG_ERR, image_path, HOME_RUNTIME_WORK_DIR, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
+
+        /* From here on there's a mount that has to be taken down again */
+        setup->undo_mount = true;
+
+        /* Turn off any form of propagation for this */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, HOME_RUNTIME_WORK_DIR, NULL, MS_PRIVATE, NULL);
+        if (r < 0)
+                return r;
+
+        /* Adjust MS_SUID and similar flags */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, HOME_RUNTIME_WORK_DIR, NULL, MS_BIND|MS_REMOUNT|user_record_mount_flags(h), NULL);
+        if (r < 0)
+                return r;
+
+        /* Swap the fd of the original directory for one referring to the new mount point */
+        safe_close(setup->root_fd);
+        setup->root_fd = open(HOME_RUNTIME_WORK_DIR, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+        if (setup->root_fd < 0)
+                return log_error_errno(errno, "Failed to open home directory: %m");
+
+        return 0;
+}
+
+/* Writes key slot number 'nr' for the directory referenced by 'root_fd'.
+ *
+ * A fresh random salt is generated, a key is derived from 'password' and that salt via PBKDF2-HMAC-SHA512,
+ * and the volume key is encrypted with it in AES256 counter mode (the counterpart of what
+ * fscrypt_slot_try_one() does). The result is stored in the xattr "trusted.fscrypt_slot<nr>" in the form
+ * "<base64 salt>:<base64 encrypted volume key>".
+ *
+ * Returns 0 on success, a negative errno-style error otherwise. */
+static int fscrypt_slot_set(
+                int root_fd,
+                const void *volume_key,
+                size_t volume_key_size,
+                const char *password,
+                uint32_t nr) {
+
+        _cleanup_free_ char *salt_base64 = NULL, *encrypted_base64 = NULL, *joined = NULL;
+        char label[STRLEN("trusted.fscrypt_slot") + DECIMAL_STR_MAX(nr) + 1];
+        _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL;
+        int r, encrypted_size_out1, encrypted_size_out2;
+        uint8_t salt[64], derived[512 / 8] = {};
+        _cleanup_free_ void *encrypted = NULL;
+        const EVP_CIPHER *cc;
+        size_t encrypted_size;
+        ssize_t ss;
+
+        r = crypto_random_bytes(salt, sizeof(salt));
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate salt: %m");
+
+        /* The derived key is sensitive, make sure it is wiped on every exit path */
+        CLEANUP_ERASE(derived);
+
+        if (PKCS5_PBKDF2_HMAC(
+                            password, strlen(password),
+                            salt, sizeof(salt),
+                            0xFFFF, EVP_sha512(),
+                            sizeof(derived), derived) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "PBKDF2 failed");
+
+        context = EVP_CIPHER_CTX_new();
+        if (!context)
+                return log_oom();
+
+        /* We use AES256 in counter mode */
+        assert_se(cc = EVP_aes_256_ctr());
+
+        /* We only use the first half of the derived key */
+        assert(sizeof(derived) >= (size_t) EVP_CIPHER_key_length(cc));
+
+        if (EVP_EncryptInit_ex(context, cc, NULL, derived, NULL) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize encryption context.");
+
+        /* Allocate some extra room on top of the input size, so that the output always fits */
+        encrypted_size = volume_key_size + EVP_CIPHER_key_length(cc) * 2;
+        encrypted = malloc(encrypted_size);
+        if (!encrypted)
+                return log_oom();
+
+        if (EVP_EncryptUpdate(context, (uint8_t*) encrypted, &encrypted_size_out1, volume_key, volume_key_size) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to encrypt volume key.");
+
+        assert((size_t) encrypted_size_out1 <= encrypted_size);
+
+        /* Any trailing output goes right behind what EVP_EncryptUpdate() wrote into our buffer. Note that the
+         * pointer must be derived from the buffer, not from its size. */
+        if (EVP_EncryptFinal_ex(context, (uint8_t*) encrypted + encrypted_size_out1, &encrypted_size_out2) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finish encryption of volume key.");
+
+        assert((size_t) encrypted_size_out1 + (size_t) encrypted_size_out2 < encrypted_size);
+        encrypted_size = (size_t) encrypted_size_out1 + (size_t) encrypted_size_out2;
+
+        ss = base64mem(salt, sizeof(salt), &salt_base64);
+        if (ss < 0)
+                return log_oom();
+
+        ss = base64mem(encrypted, encrypted_size, &encrypted_base64);
+        if (ss < 0)
+                return log_oom();
+
+        /* The slot format is: salt, colon, encrypted key, both parts in base64 */
+        joined = strjoin(salt_base64, ":", encrypted_base64);
+        if (!joined)
+                return log_oom();
+
+        xsprintf(label, "trusted.fscrypt_slot%" PRIu32, nr);
+        if (fsetxattr(root_fd, label, joined, strlen(joined), 0) < 0)
+                return log_error_errno(errno, "Failed to write xattr %s: %m", label);
+
+        log_info("Written key slot %s.", label);
+
+        return 0;
+}
+
+/* Creates a new fscrypt-encrypted home directory for the user record 'h'.
+ *
+ * The directory is first created under a temporary name next to the final image path, gets a freshly
+ * generated volume key and an encryption policy assigned, then one key slot is written for each entry of
+ * 'effective_passwords'. After populating the directory it is renamed to the final image path.
+ *
+ * On success 0 is returned and a clone of the record, extended by the binding to the new directory, is
+ * returned in 'ret_home'. On failure a negative errno-style error is returned (-ENOLINK if the file system
+ * does not support fscrypt), and the temporary directory is removed again by the cleanup handler. */
+int home_create_fscrypt(
+                UserRecord *h,
+                HomeSetup *setup,
+                char **effective_passwords,
+                UserRecord **ret_home) {
+
+        _cleanup_(rm_rf_physical_and_freep) char *temporary = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *new_home = NULL;
+        _cleanup_(erase_and_freep) void *volume_key = NULL;
+        _cleanup_close_ int mount_fd = -EBADF;
+        struct fscrypt_policy policy = {};
+        size_t volume_key_size = 512 / 8;
+        _cleanup_free_ char *d = NULL;
+        uint32_t nr = 0;
+        const char *ip;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_FSCRYPT);
+        assert(setup);
+        assert(ret_home);
+
+        assert_se(ip = user_record_image_path(h));
+
+        r = tempfn_random(ip, "homework", &d);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate temporary directory: %m");
+
+        (void) mkdir_parents(d, 0755);
+
+        if (mkdir(d, 0700) < 0)
+                return log_error_errno(errno, "Failed to create temporary home directory %s: %m", d);
+
+        /* From here on 'd' is NULL, the path lives on in 'temporary' only */
+        temporary = TAKE_PTR(d); /* Needs to be destroyed now */
+
+        r = home_unshare_and_mkdir();
+        if (r < 0)
+                return r;
+
+        setup->root_fd = open(temporary, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+        if (setup->root_fd < 0)
+                return log_error_errno(errno, "Failed to open temporary home directory: %m");
+
+        /* The new directory must not have a policy yet, i.e. we expect ENODATA here. If querying the policy
+         * succeeds, the directory inherited one from its parent, which we refuse. */
+        if (ioctl(setup->root_fd, FS_IOC_GET_ENCRYPTION_POLICY, &policy) < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno)) {
+                        log_error_errno(errno, "File system does not support fscrypt: %m");
+                        return -ENOLINK; /* make recognizable */
+                }
+                if (errno != ENODATA)
+                        return log_error_errno(errno, "Failed to get fscrypt policy of directory: %m");
+        } else
+                return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Parent of %s already encrypted, refusing.", temporary);
+
+        volume_key = malloc(volume_key_size);
+        if (!volume_key)
+                return log_oom();
+
+        r = crypto_random_bytes(volume_key, volume_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire volume key: %m");
+
+        log_info("Generated volume key of size %zu.", volume_key_size);
+
+        policy = (struct fscrypt_policy) {
+                .contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS,
+                .filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS,
+                .flags = FS_POLICY_FLAGS_PAD_32,
+        };
+
+        calculate_key_descriptor(volume_key, volume_key_size, policy.master_key_descriptor);
+
+        /* The key is uploaded before the policy referencing it is set */
+        r = fscrypt_upload_volume_key(policy.master_key_descriptor, volume_key, volume_key_size, KEY_SPEC_THREAD_KEYRING);
+        if (r < 0)
+                return r;
+
+        log_info("Uploaded volume key to kernel.");
+
+        if (ioctl(setup->root_fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy) < 0)
+                return log_error_errno(errno, "Failed to set fscrypt policy on directory: %m");
+
+        log_info("Encryption policy set.");
+
+        /* One key slot per password, numbered from zero */
+        STRV_FOREACH(i, effective_passwords) {
+                r = fscrypt_slot_set(setup->root_fd, volume_key, volume_key_size, *i, nr);
+                if (r < 0)
+                        return r;
+
+                nr++;
+        }
+
+        (void) home_update_quota_classic(h, temporary);
+
+        r = home_shift_uid(setup->root_fd, HOME_RUNTIME_WORK_DIR, h->uid, h->uid, &mount_fd);
+        if (r > 0)
+                setup->undo_mount = true; /* If uidmaps worked we have a mount to undo again */
+
+        if (mount_fd >= 0) {
+                /* If we have established a new mount, then we can use that as new root fd to our home directory. */
+                safe_close(setup->root_fd);
+
+                setup->root_fd = fd_reopen(mount_fd, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+                if (setup->root_fd < 0)
+                        return log_error_errno(setup->root_fd, "Unable to convert mount fd into proper directory fd: %m");
+
+                mount_fd = safe_close(mount_fd);
+        }
+
+        r = home_populate(h, setup->root_fd);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup->root_fd, NULL);
+        if (r < 0)
+                return r;
+
+        r = user_record_clone(h, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE, &new_home);
+        if (r < 0)
+                return log_error_errno(r, "Failed to clone record: %m");
+
+        r = user_record_add_binding(
+                        new_home,
+                        USER_FSCRYPT,
+                        ip,
+                        SD_ID128_NULL,
+                        SD_ID128_NULL,
+                        SD_ID128_NULL,
+                        NULL,
+                        NULL,
+                        UINT64_MAX,
+                        NULL,
+                        NULL,
+                        h->uid,
+                        (gid_t) h->uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add binding to record: %m");
+
+        setup->root_fd = safe_close(setup->root_fd);
+
+        r = home_setup_undo_mount(setup, LOG_ERR);
+        if (r < 0)
+                return r;
+
+        if (rename(temporary, ip) < 0)
+                return log_error_errno(errno, "Failed to rename %s to %s: %m", temporary, ip);
+
+        /* The directory is in its final place now, hence disarm the removal on cleanup */
+        temporary = mfree(temporary);
+
+        log_info("Everything completed.");
+
+        *ret_home = TAKE_PTR(new_home);
+        return 0;
+}
+
+/* Changes the passwords of an fscrypt home directory.
+ *
+ * The volume key is first recovered via fscrypt_setup(), using the passwords in 'cache' and h->password.
+ * Then one key slot is written per entry of 'effective_passwords', numbered from zero upwards, and finally
+ * all key slot xattrs with a number not covered by the new passwords are removed.
+ *
+ * Returns 0 on success, a negative errno-style error on failure. Failure to remove a stale key slot is only
+ * logged as warning and does not make the call fail. */
+int home_passwd_fscrypt(
+                UserRecord *h,
+                HomeSetup *setup,
+                const PasswordCache *cache,         /* the passwords acquired via PKCS#11/FIDO2 security tokens */
+                char **effective_passwords          /* new passwords */) {
+
+        _cleanup_(erase_and_freep) void *volume_key = NULL;
+        _cleanup_free_ char *xattr_buf = NULL;
+        size_t volume_key_size = 0;
+        uint32_t slot = 0;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_FSCRYPT);
+        assert(setup);
+
+        r = fscrypt_setup(
+                        cache,
+                        h->password,
+                        setup,
+                        &volume_key,
+                        &volume_key_size);
+        if (r < 0)
+                return r;
+
+        /* Write the new slots first, so that after this loop 'slot' is the number of slots in use */
+        STRV_FOREACH(p, effective_passwords) {
+                r = fscrypt_slot_set(setup->root_fd, volume_key, volume_key_size, *p, slot);
+                if (r < 0)
+                        return r;
+
+                slot++;
+        }
+
+        r = flistxattr_malloc(setup->root_fd, &xattr_buf);
+        if (r < 0) /* The error is reported via the return value, errno might be stale by now */
+                return log_error_errno(r, "Failed to retrieve xattr list: %m");
+
+        NULSTR_FOREACH(xa, xattr_buf) {
+                const char *nr;
+                uint32_t z;
+
+                /* Check if this xattr has the format 'trusted.fscrypt_slot<nr>' where '<nr>' is a 32-bit unsigned integer */
+                nr = startswith(xa, "trusted.fscrypt_slot");
+                if (!nr)
+                        continue;
+                if (safe_atou32(nr, &z) < 0)
+                        continue;
+
+                /* Slots below 'slot' were just rewritten above, keep them */
+                if (z < slot)
+                        continue;
+
+                if (fremovexattr(setup->root_fd, xa) < 0)
+                        if (errno != ENODATA)
+                                log_warning_errno(errno, "Failed to remove xattr %s: %m", xa);
+        }
+
+        return 0;
+}
diff --git a/src/home/homework-fscrypt.h b/src/home/homework-fscrypt.h
new file mode 100644
index 0000000..7c2d7aa
--- /dev/null
+++ b/src/home/homework-fscrypt.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "homework.h"
+#include "user-record.h"
+
+int home_setup_fscrypt(UserRecord *h, HomeSetup *setup, const PasswordCache *cache);
+
+int home_create_fscrypt(UserRecord *h, HomeSetup *setup, char **effective_passwords, UserRecord **ret_home);
+
+int home_passwd_fscrypt(UserRecord *h, HomeSetup *setup, const PasswordCache *cache, char **effective_passwords);
diff --git a/src/home/homework-luks.c b/src/home/homework-luks.c
new file mode 100644
index 0000000..5bd78a0
--- /dev/null
+++ b/src/home/homework-luks.c
@@ -0,0 +1,3925 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#if HAVE_VALGRIND_MEMCHECK_H
+#include 
+#endif
+
+#include "sd-daemon.h"
+#include "sd-device.h"
+#include "sd-event.h"
+#include "sd-id128.h"
+
+#include "blkid-util.h"
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "chattr-util.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "dm-util.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fdisk-util.h"
+#include "fileio.h"
+#include "filesystems.h"
+#include "fs-util.h"
+#include "fsck-util.h"
+#include "glyph-util.h"
+#include "gpt.h"
+#include "home-util.h"
+#include "homework-luks.h"
+#include "homework-mount.h"
+#include "io-util.h"
+#include "keyring-util.h"
+#include "memory-util.h"
+#include "missing_magic.h"
+#include "mkdir.h"
+#include "mkfs-util.h"
+#include "mount-util.h"
+#include "openssl-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "resize-fs.h"
+#include "strv.h"
+#include "sync-util.h"
+#include "tmpfile-util.h"
+#include "udev-util.h"
+#include "user-util.h"
+
+/* Rounds down to the nearest multiple of 4K (4096 bytes) by masking out the low bits. Given that newer
+ * hardware generally prefers 4K sectors, let's align our partitions to that too. In the worst case we'll
+ * waste 3.5K per partition that way, but I think I can live with that. */
+#define DISK_SIZE_ROUND_DOWN(x) ((x) & ~UINT64_C(4095))
+
+/* Rounds up to the nearest 4K boundary. Returns UINT64_MAX on overflow, i.e. if adding 4095 to the value
+ * would exceed UINT64_MAX. The argument is copied into a local uint64_t first, hence it is evaluated only
+ * once. */
+#define DISK_SIZE_ROUND_UP(x)                                           \
+        ({                                                              \
+                uint64_t _x = (x);                                      \
+                _x > UINT64_MAX - 4095U ? UINT64_MAX : (_x + 4095U) & ~UINT64_C(4095); \
+        })
+
+/* How much larger will the image on disk be than the fs inside it, i.e. the space we pay for the GPT and
+ * LUKS2 envelope, in bytes (18874368 = 18 * 1024 * 1024). (As measured on cryptsetup 2.4.1) */
+#define GPT_LUKS2_OVERHEAD UINT64_C(18874368)
+
+static int resize_image_loop(UserRecord *h, HomeSetup *setup, uint64_t old_image_size, uint64_t new_image_size, uint64_t *ret_image_size);
+
+/* Sets (b == true) or removes (b == false) the 'user.home-dirty' xattr on the regular file referenced by
+ * 'fd'. We use this to detect when a home directory was not properly unmounted.
+ *
+ * The file is synced after the xattr change, and when removing the xattr additionally before it. Returns 1
+ * if the xattr was actually added/removed, 0 if it already was in the requested state, and a negative
+ * errno-style error on failure. */
+int run_mark_dirty(int fd, bool b) {
+        bool changed;
+        int r;
+
+        assert(fd >= 0);
+
+        r = fd_verify_regular(fd);
+        if (r < 0)
+                return r;
+
+        if (!b) {
+                /* Make sure everything written so far hits the disk before we declare the image clean */
+                r = fsync_full(fd);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to synchronize image before marking it clean: %m");
+
+                if (fremovexattr(fd, "user.home-dirty") >= 0)
+                        changed = true;
+                else if (ERRNO_IS_XATTR_ABSENT(errno))
+                        changed = false; /* Was clean already */
+                else
+                        return log_debug_errno(errno, "Could not mark home directory as clean: %m");
+        } else {
+                char marker = '1';
+
+                if (fsetxattr(fd, "user.home-dirty", &marker, 1, XATTR_CREATE) >= 0)
+                        changed = true;
+                else if (errno == EEXIST)
+                        changed = false; /* Was dirty already */
+                else
+                        return log_debug_errno(errno, "Could not mark home directory as dirty: %m");
+        }
+
+        r = fsync_full(fd);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to synchronize dirty flag to disk: %m");
+
+        return changed;
+}
+
+/* Same as run_mark_dirty(), but takes a path: the file is opened read-write, passed to run_mark_dirty(),
+ * and closed again on return. */
+int run_mark_dirty_by_path(const char *path, bool b) {
+        _cleanup_close_ int image_fd = -EBADF;
+
+        assert(path);
+
+        image_fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY);
+        if (image_fd >= 0)
+                return run_mark_dirty(image_fd, b);
+
+        return log_debug_errno(errno, "Failed to open %s to mark dirty or clean: %m", path);
+}
+
+/* Probes the file system on 'fd' with libblkid and returns its type (as newly allocated string, to be freed
+ * by the caller) and its UUID.
+ *
+ * Returns 0 on success. Returns -ENOPKG if the probe result is ambiguous, nothing was found, or the type or
+ * UUID property is missing. Other negative errno-style errors are returned if probing itself or parsing the
+ * UUID fails. */
+static int probe_file_system_by_fd(
+                int fd,
+                char **ret_fstype,
+                sd_id128_t *ret_uuid) {
+
+        _cleanup_(blkid_free_probep) blkid_probe probe = NULL;
+        const char *type_value = NULL, *uuid_value = NULL;
+        _cleanup_free_ char *type_copy = NULL;
+        sd_id128_t parsed_uuid;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret_fstype);
+        assert(ret_uuid);
+
+        probe = blkid_new_probe();
+        if (!probe)
+                return -ENOMEM;
+
+        /* Reset errno before the call, so that errno_or_else() doesn't pick up a stale value */
+        errno = 0;
+        if (blkid_probe_set_device(probe, fd, 0, 0) != 0)
+                return errno_or_else(ENOMEM);
+
+        /* We are only interested in the superblock's type and UUID */
+        (void) blkid_probe_enable_superblocks(probe, 1);
+        (void) blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE|BLKID_SUBLKS_UUID);
+
+        errno = 0;
+        r = blkid_do_safeprobe(probe);
+        if (r == _BLKID_SAFEPROBE_ERROR)
+                return errno_or_else(EIO);
+        if (r == _BLKID_SAFEPROBE_AMBIGUOUS || r == _BLKID_SAFEPROBE_NOT_FOUND)
+                return -ENOPKG;
+
+        assert(r == _BLKID_SAFEPROBE_FOUND);
+
+        (void) blkid_probe_lookup_value(probe, "TYPE", &type_value, NULL);
+        if (!type_value)
+                return -ENOPKG;
+
+        (void) blkid_probe_lookup_value(probe, "UUID", &uuid_value, NULL);
+        if (!uuid_value)
+                return -ENOPKG;
+
+        r = sd_id128_from_string(uuid_value, &parsed_uuid);
+        if (r < 0)
+                return r;
+
+        /* Copy the type string, so that we can hand ownership to the caller */
+        type_copy = strdup(type_value);
+        if (!type_copy)
+                return -ENOMEM;
+
+        *ret_fstype = TAKE_PTR(type_copy);
+        *ret_uuid = parsed_uuid;
+
+        return 0;
+}
+
+/* Same as probe_file_system_by_fd(), but takes a path: the node is opened read-only (and non-blocking),
+ * probed, and closed again on return. */
+static int probe_file_system_by_path(const char *path, char **ret_fstype, sd_id128_t *ret_uuid) {
+        _cleanup_close_ int node_fd = -EBADF;
+
+        node_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+        if (node_fd >= 0)
+                return probe_file_system_by_fd(node_fd, ret_fstype, ret_uuid);
+
+        return negative_errno();
+}
+
+/* Queries the size of the block device referenced by 'fd' via BLKGETSIZE64 and stores it in 'ret'. Returns
+ * -ENOTBLK if the fd does not refer to a block device, and a negative errno-style error if fstat() or the
+ * ioctl fails. */
+static int block_get_size_by_fd(int fd, uint64_t *ret) {
+        struct stat info;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (fstat(fd, &info) < 0)
+                return -errno;
+
+        /* The ioctl is only meaningful on block devices, refuse anything else */
+        if (S_ISBLK(info.st_mode))
+                return RET_NERRNO(ioctl(fd, BLKGETSIZE64, ret));
+
+        return -ENOTBLK;
+}
+
+/* Same as block_get_size_by_fd(), but takes a path: the node is opened read-only (and non-blocking),
+ * queried, and closed again on return. */
+static int block_get_size_by_path(const char *path, uint64_t *ret) {
+        _cleanup_close_ int node_fd = -EBADF;
+
+        node_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+        if (node_fd >= 0)
+                return block_get_size_by_fd(node_fd, ret);
+
+        return -errno;
+}
+
+/* Runs "fsck -aTl" on the device node 'node' in a forked off child and evaluates its exit status.
+ *
+ * Returns 0 if no fsck is installed for 'fstype' (in which case nothing is run), 1 if the check was run and
+ * the file system is considered usable, -EIO if fsck reports that errors were left uncorrected or that the
+ * system should be rebooted, and other negative errno-style errors if forking or waiting fails. Any other
+ * non-zero fsck exit status is logged and ignored. */
+static int run_fsck(const char *node, const char *fstype) {
+        pid_t child;
+        int r;
+
+        assert(node);
+        assert(fstype);
+
+        r = fsck_exists_for_fstype(fstype);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if fsck for file system %s exists: %m", fstype);
+        if (r == 0) {
+                log_warning("No fsck for file system %s installed, ignoring.", fstype);
+                return 0;
+        }
+
+        r = safe_fork("(fsck)",
+                      FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_STDOUT_TO_STDERR|FORK_CLOSE_ALL_FDS,
+                      &child);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child: we only get past execlp() if executing fsck failed */
+                execlp("fsck", "fsck", "-aTl", node, NULL);
+                log_open();
+                log_error_errno(errno, "Failed to execute fsck: %m");
+                _exit(FSCK_OPERATIONAL_ERROR);
+        }
+
+        r = wait_for_terminate_and_check("fsck", child, WAIT_LOG_ABNORMAL);
+        if (r < 0)
+                return r;
+
+        /* "Errors corrected" alone is fine, any other bit set is reported */
+        if ((r & ~FSCK_ERROR_CORRECTED) != 0) {
+                bool fatal = (r & (FSCK_SYSTEM_SHOULD_REBOOT|FSCK_ERRORS_LEFT_UNCORRECTED)) != 0;
+
+                log_warning("fsck failed with exit status %i.", r);
+
+                if (fatal)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "File system is corrupted, refusing.");
+
+                log_warning("Ignoring fsck error.");
+        }
+
+        log_info("File system check completed.");
+
+        return 1;
+}
+
+/* Cleanup helper for key_serial_t variables, built around keyring_unlink(), with -1 as the "no key" value.
+ * NOTE(review): presumably this generates keyring_unlinkp() for use with _cleanup_() — confirm against the
+ * definition of DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(). */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(key_serial_t, keyring_unlink, -1);
+
+/* Uploads 'password' as key of type "user", named "homework-user-<username>", into the session keyring, but
+ * only if the record's auto-resize mode is AUTO_RESIZE_SHRINK_AND_GROW.
+ *
+ * Returns 1 if the key was uploaded, 0 if uploading was skipped, and a negative errno-style error on
+ * failure. If 'ret_key_serial' is non-NULL it is set to the serial of the new key, or to -1 if uploading was
+ * skipped. */
+static int upload_to_keyring(
+                UserRecord *h,
+                const char *password,
+                key_serial_t *ret_key_serial) {
+
+        _cleanup_free_ char *key_name = NULL;
+        key_serial_t new_serial = -1;
+        int uploaded = 0;
+
+        assert(h);
+        assert(password);
+
+        /* If auto-shrink-on-logout is turned on, we need to keep the key we used to unlock the LUKS volume
+         * around, since we'll need it when automatically resizing (since we can't ask the user there
+         * again). We do this by uploading it into the kernel keyring, specifically the "session" one. This
+         * is done under the assumption systemd-homed gets its private per-session keyring (i.e. default
+         * service behaviour, given that KeyringMode=private is the default). It will survive between our
+         * systemd-homework invocations that way.
+         *
+         * If auto-shrink-on-logout is disabled we'll skip this step, to be frugal with sensitive data. */
+
+        if (user_record_auto_resize_mode(h) == AUTO_RESIZE_SHRINK_AND_GROW) {
+                key_name = strjoin("homework-user-", h->user_name);
+                if (!key_name)
+                        return -ENOMEM;
+
+                new_serial = add_key("user", key_name, password, strlen(password), KEY_SPEC_SESSION_KEYRING);
+                if (new_serial == -1)
+                        return -errno;
+
+                uploaded = 1;
+        }
+
+        /* If nothing was uploaded (won't need it) the serial reported is -1 */
+        if (ret_key_serial)
+                *ret_key_serial = new_serial;
+
+        return uploaded;
+}
+
+static int luks_try_passwords(
+                UserRecord *h,
+                struct crypt_device *cd,
+                char **passwords,
+                void *volume_key,
+                size_t *volume_key_size,
+                key_serial_t *ret_key_serial) {
+
+        assert(h);
+        assert(cd);
+
+        /* Tries each candidate password in turn against any key slot. On the first match the volume key
+         * is left in 'volume_key', its size in '*volume_key_size', and 0 is returned. If no password
+         * fits (or the list is empty) -ENOKEY is returned. */
+
+        STRV_FOREACH(pp, passwords) {
+                size_t key_size = *volume_key_size; /* only copied back once a password worked */
+                int k;
+
+                k = sym_crypt_volume_key_get(cd, CRYPT_ANY_SLOT, volume_key, &key_size, *pp, strlen(*pp));
+                if (k < 0) {
+                        log_debug_errno(k, "Password %zu didn't work for unlocking LUKS superblock: %m", (size_t) (pp - passwords));
+                        continue;
+                }
+
+                *volume_key_size = key_size;
+
+                /* Uploading the working password to the keyring is only attempted if the caller asked
+                 * for the serial, and it is best-effort: failure is logged and otherwise ignored. */
+                if (!ret_key_serial)
+                        return 0;
+
+                k = upload_to_keyring(h, *pp, ret_key_serial);
+                if (k < 0) {
+                        log_debug_errno(k, "Failed to upload LUKS password to kernel keyring, ignoring: %m");
+                        *ret_key_serial = -1;
+                }
+
+                return 0;
+        }
+
+        return -ENOKEY;
+}
+
+static int luks_setup(
+                UserRecord *h,
+                const char *node,
+                const char *dm_name,
+                sd_id128_t uuid,
+                const char *cipher,
+                const char *cipher_mode,
+                uint64_t volume_key_size,
+                char **passwords,
+                const PasswordCache *cache,
+                bool discard,
+                struct crypt_device **ret,
+                sd_id128_t *ret_found_uuid,
+                void **ret_volume_key,
+                size_t *ret_volume_key_size,
+                key_serial_t *ret_key_serial) {
+        /* Validates the LUKS2 superblock on 'node', unlocks it with a password and activates it as 'dm_name'. */
+        _cleanup_(keyring_unlinkp) key_serial_t key_serial = -1;
+        _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL;
+        _cleanup_(erase_and_freep) void *vk = NULL;
+        sd_id128_t p;
+        size_t vks;
+        char **list;
+        int r;
+
+        assert(h);
+        assert(node);
+        assert(dm_name);
+        assert(ret);
+
+        r = sym_crypt_init(&cd, node);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate libcryptsetup context: %m");
+
+        cryptsetup_enable_logging(cd);
+
+        r = sym_crypt_load(cd, CRYPT_LUKS2, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to load LUKS superblock: %m");
+
+        r = sym_crypt_get_volume_key_size(cd);
+        if (r <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine LUKS volume key size");
+        vks = (size_t) r;
+
+        if (!sd_id128_is_null(uuid) || ret_found_uuid) { /* UUID is needed for the check, for the caller, or both */
+                const char *s;
+
+                s = sym_crypt_get_uuid(cd);
+                if (!s)
+                        return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock has no UUID.");
+
+                r = sd_id128_from_string(s, &p);
+                if (r < 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock has invalid UUID.");
+
+                /* Check that the UUID matches, if specified */
+                if (!sd_id128_is_null(uuid) &&
+                    !sd_id128_equal(uuid, p))
+                        return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock has wrong UUID.");
+        }
+
+        if (cipher && !streq_ptr(cipher, sym_crypt_get_cipher(cd))) /* NULL: cipher is not checked */
+                return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock declares wrong cipher.");
+
+        if (cipher_mode && !streq_ptr(cipher_mode, sym_crypt_get_cipher_mode(cd))) /* NULL: mode is not checked */
+                return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock declares wrong cipher mode.");
+
+        if (volume_key_size != UINT64_MAX && vks != volume_key_size) /* UINT64_MAX: key size is not checked */
+                return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock declares wrong volume key size.");
+
+        vk = malloc(vks);
+        if (!vk)
+                return log_oom();
+
+        r = -ENOKEY;
+        FOREACH_POINTER(list,
+                        cache ? cache->keyring_passswords : NULL,
+                        cache ? cache->pkcs11_passwords : NULL,
+                        cache ? cache->fido2_passwords : NULL,
+                        passwords) {
+                r = luks_try_passwords(h, cd, list, vk, &vks, ret_key_serial ? &key_serial : NULL); /* keyring upload only if the serial is wanted */
+                if (r != -ENOKEY) /* stop on success, and on any error other than "no password fit" */
+                        break;
+        }
+        if (r == -ENOKEY)
+                return log_error_errno(r, "No valid password for LUKS superblock.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock LUKS superblock: %m");
+
+        r = sym_crypt_activate_by_volume_key(
+                        cd,
+                        dm_name,
+                        vk, vks,
+                        discard ? CRYPT_ACTIVATE_ALLOW_DISCARDS : 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock LUKS superblock: %m");
+
+        log_info("Setting up LUKS device /dev/mapper/%s completed.", dm_name);
+
+        *ret = TAKE_PTR(cd); /* the context is handed to the caller, no longer freed here */
+
+        if (ret_found_uuid) /* Return the UUID actually found if the caller wants to know */
+                *ret_found_uuid = p;
+        if (ret_volume_key)
+                *ret_volume_key = TAKE_PTR(vk);
+        if (ret_volume_key_size)
+                *ret_volume_key_size = vks;
+        if (ret_key_serial)
+                *ret_key_serial = TAKE_KEY_SERIAL(key_serial);
+
+        return 0;
+}
+
+static int make_dm_names(UserRecord *h, HomeSetup *setup) {
+        assert(h);
+        assert(h->user_name);
+        assert(setup);
+
+        /* Derive the device mapper name from the user name, unless one is already set. */
+        if (!setup->dm_name)
+                setup->dm_name = strjoin("home-", h->user_name);
+        if (!setup->dm_name)
+                return log_oom();
+
+        /* Likewise the device node path, which lives below /dev/mapper/. */
+        if (!setup->dm_node)
+                setup->dm_node = path_join("/dev/mapper/", setup->dm_name);
+        if (!setup->dm_node)
+                return log_oom();
+
+        return 0;
+}
+
+static int acquire_open_luks_device(
+                UserRecord *h,
+                HomeSetup *setup,
+                bool graceful) {
+
+        _cleanup_(sym_crypt_freep) struct crypt_device *context = NULL;
+        int k;
+
+        assert(h);
+        assert(setup);
+        assert(!setup->crypt_device);
+
+        /* Attaches to the active DM device: 1 on success, 0 if 'graceful' and it is absent (or -EINVAL). */
+        k = dlopen_cryptsetup();
+        if (k >= 0)
+                k = make_dm_names(h, setup);
+        if (k < 0)
+                return k;
+
+        k = sym_crypt_init_by_name(&context, setup->dm_name);
+        if (k < 0) {
+                if (graceful && (ERRNO_IS_NEG_DEVICE_ABSENT(k) || k == -EINVAL))
+                        return 0;
+
+                return log_error_errno(k, "Failed to initialize cryptsetup context for %s: %m", setup->dm_name);
+        }
+
+        cryptsetup_enable_logging(context);
+        setup->crypt_device = TAKE_PTR(context);
+        return 1;
+}
+
+static int luks_open(
+                UserRecord *h,
+                HomeSetup *setup,
+                const PasswordCache *cache,
+                sd_id128_t *ret_found_uuid,
+                void **ret_volume_key,
+                size_t *ret_volume_key_size) {
+
+        _cleanup_(erase_and_freep) void *vk = NULL;
+        sd_id128_t p;
+        char **list;
+        size_t vks;
+        int r;
+
+        assert(h);
+        assert(setup);
+        assert(!setup->crypt_device);
+
+        /* Opens a LUKS device that is already set up. Re-validates the password while doing so (which also
+         * provides us with the volume key, which we want). All three ret_* parameters are optional. */
+
+        r = acquire_open_luks_device(h, setup, /* graceful= */ false); /* on success this sets setup->crypt_device */
+        if (r < 0)
+                return r;
+
+        r = sym_crypt_load(setup->crypt_device, CRYPT_LUKS2, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to load LUKS superblock: %m");
+
+        r = sym_crypt_get_volume_key_size(setup->crypt_device);
+        if (r <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine LUKS volume key size");
+        vks = (size_t) r;
+
+        if (ret_found_uuid) { /* the UUID is only read and parsed if the caller wants it */
+                const char *s;
+
+                s = sym_crypt_get_uuid(setup->crypt_device);
+                if (!s)
+                        return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock has no UUID.");
+
+                r = sd_id128_from_string(s, &p);
+                if (r < 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "LUKS superblock has invalid UUID.");
+        }
+
+        vk = malloc(vks);
+        if (!vk)
+                return log_oom();
+
+        r = -ENOKEY;
+        FOREACH_POINTER(list,
+                        cache ? cache->keyring_passswords : NULL,
+                        cache ? cache->pkcs11_passwords : NULL,
+                        cache ? cache->fido2_passwords : NULL,
+                        h->password) {
+                r = luks_try_passwords(h, setup->crypt_device, list, vk, &vks, NULL); /* NULL: no keyring upload here */
+                if (r != -ENOKEY) /* stop on success, and on any error other than "no password fit" */
+                        break;
+        }
+        if (r == -ENOKEY)
+                return log_error_errno(r, "No valid password for LUKS superblock.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock LUKS superblock: %m");
+
+        log_info("Discovered used LUKS device /dev/mapper/%s, and validated password.", setup->dm_name);
+
+        /* This is needed so that crypt_resize() can operate correctly for pre-existing LUKS devices. We need
+         * to tell libcryptsetup the volume key explicitly, so that it is in the kernel keyring. */
+        r = sym_crypt_activate_by_volume_key(setup->crypt_device, NULL, vk, vks, CRYPT_ACTIVATE_KEYRING_KEY);
+        if (r < 0)
+                return log_error_errno(r, "Failed to upload volume key again: %m");
+
+        log_info("Successfully re-activated LUKS device.");
+
+        if (ret_found_uuid)
+                *ret_found_uuid = p;
+        if (ret_volume_key)
+                *ret_volume_key = TAKE_PTR(vk); /* otherwise vk is erased and freed on return */
+        if (ret_volume_key_size)
+                *ret_volume_key_size = vks;
+
+        return 0;
+}
+
+static int fs_validate(
+                const char *dm_node,
+                sd_id128_t uuid,
+                char **ret_fstype,
+                sd_id128_t *ret_found_uuid) {
+
+        sd_id128_t probed_uuid = SD_ID128_NULL; /* avoid false maybe-uninitialized warning */
+        _cleanup_free_ char *probed_type = NULL;
+        bool uuid_ok;
+        int k;
+
+        assert(dm_node);
+        assert(ret_fstype);
+
+        /* Probes the file system on 'dm_node', and returns its type plus (optionally) the UUID found. */
+        k = probe_file_system_by_path(dm_node, &probed_type, &probed_uuid);
+        if (k < 0)
+                return log_error_errno(k, "Failed to probe file system: %m");
+
+        /* Only a restricted set of file systems is accepted: this guards against little tested kernel
+         * file systems, and the resize ioctls are only supported for these anyway. */
+        if (!supported_fstype(probed_type))
+                return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "Image contains unsupported file system: %s", strna(probed_type));
+
+        uuid_ok = sd_id128_is_null(uuid) || sd_id128_equal(uuid, probed_uuid); /* null means: don't care */
+        if (!uuid_ok)
+                return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "File system has wrong UUID.");
+
+        log_info("Probing file system completed (found %s).", probed_type);
+
+        if (ret_found_uuid) /* Hand back the UUID actually found, if the caller is interested */
+                *ret_found_uuid = probed_uuid;
+        *ret_fstype = TAKE_PTR(probed_type);
+        return 0;
+}
+
+static int luks_validate(
+                int fd,
+                const char *label,
+                sd_id128_t partition_uuid,
+                sd_id128_t *ret_partition_uuid,
+                uint64_t *ret_offset,
+                uint64_t *ret_size) {
+
+        _cleanup_(blkid_free_probep) blkid_probe b = NULL;
+        sd_id128_t found_partition_uuid = SD_ID128_NULL;
+        const char *fstype = NULL, *pttype = NULL;
+        blkid_loff_t offset = 0, size = 0;
+        blkid_partlist pl;
+        bool found = false;
+        int r, n;
+
+        assert(fd >= 0);
+        assert(label);
+        assert(ret_partition_uuid); /* written unconditionally on success below */
+        assert(ret_offset);
+        assert(ret_size);
+
+        /* Locates the LUKS payload inside the image: either the image is a LUKS volume as a whole, or it
+         * carries a GPT with exactly one user home partition named 'label' (and matching
+         * 'partition_uuid', if that is non-null). Returns -ENOPKG if no such unambiguous layout is found. */
+
+        b = blkid_new_probe();
+        if (!b)
+                return -ENOMEM;
+
+        errno = 0;
+        r = blkid_probe_set_device(b, fd, 0, 0);
+        if (r != 0)
+                return errno_or_else(ENOMEM);
+
+        (void) blkid_probe_enable_superblocks(b, 1);
+        (void) blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
+        (void) blkid_probe_enable_partitions(b, 1);
+        (void) blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
+
+        errno = 0;
+        r = blkid_do_safeprobe(b);
+        if (r == _BLKID_SAFEPROBE_ERROR)
+                return errno_or_else(EIO);
+        if (IN_SET(r, _BLKID_SAFEPROBE_AMBIGUOUS, _BLKID_SAFEPROBE_NOT_FOUND))
+                return -ENOPKG;
+
+        assert(r == _BLKID_SAFEPROBE_FOUND);
+
+        (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
+        if (streq_ptr(fstype, "crypto_LUKS")) {
+                /* Directly a LUKS image */
+                *ret_offset = 0;
+                *ret_size = UINT64_MAX; /* full disk */
+                *ret_partition_uuid = SD_ID128_NULL;
+                return 0;
+        } else if (fstype) /* some other superblock type: not something we can use */
+                return -ENOPKG;
+
+        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
+        if (!streq_ptr(pttype, "gpt"))
+                return -ENOPKG;
+
+        errno = 0;
+        pl = blkid_probe_get_partitions(b);
+        if (!pl)
+                return errno_or_else(ENOMEM);
+
+        errno = 0;
+        n = blkid_partlist_numof_partitions(pl);
+        if (n < 0)
+                return errno_or_else(EIO);
+
+        for (int i = 0; i < n; i++) {
+                sd_id128_t id = SD_ID128_NULL;
+                blkid_partition pp;
+
+                errno = 0;
+                pp = blkid_partlist_get_partition(pl, i);
+                if (!pp)
+                        return errno_or_else(EIO);
+
+                if (sd_id128_string_equal(blkid_partition_get_type_string(pp), SD_GPT_USER_HOME) <= 0)
+                        continue;
+
+                if (!streq_ptr(blkid_partition_get_name(pp), label))
+                        continue;
+
+                r = blkid_partition_get_uuid_id128(pp, &id);
+                if (r < 0) /* an unreadable UUID does not disqualify the partition, id stays null then */
+                        log_debug_errno(r, "Failed to read partition UUID, ignoring: %m");
+                else if (!sd_id128_is_null(partition_uuid) && !sd_id128_equal(id, partition_uuid))
+                        continue;
+
+                if (found) /* a second matching partition makes the image ambiguous */
+                        return -ENOPKG;
+
+                offset = blkid_partition_get_start(pp);
+                size = blkid_partition_get_size(pp);
+                found_partition_uuid = id;
+
+                found = true;
+        }
+
+        if (!found)
+                return -ENOPKG;
+
+        if (offset < 0 || (uint64_t) offset > UINT64_MAX / 512U)
+                return -EINVAL;
+        if (size <= 0 || (uint64_t) size > UINT64_MAX / 512U)
+                return -EINVAL;
+
+        *ret_offset = (uint64_t) offset * 512U; /* multiply as uint64_t, the type the range check above uses */
+        *ret_size = (uint64_t) size * 512U;
+        *ret_partition_uuid = found_partition_uuid;
+
+        return 0;
+}
+
+static int crypt_device_to_evp_cipher(struct crypt_device *cd, const EVP_CIPHER **ret) {
+        _cleanup_free_ char *cipher_name = NULL;
+        const char *cipher, *cipher_mode, *e;
+        size_t key_size, key_bits;
+        const EVP_CIPHER *cc;
+        int r;
+
+        assert(cd);
+
+        /* Let's find the right OpenSSL EVP_CIPHER object that matches the encryption settings of the LUKS
+         * device. On success the cipher is stored in *ret and 0 is returned. */
+
+        cipher = sym_crypt_get_cipher(cd);
+        if (!cipher)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot get cipher from LUKS device.");
+
+        cipher_mode = sym_crypt_get_cipher_mode(cd);
+        if (!cipher_mode)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot get cipher mode from LUKS device.");
+
+        e = strchr(cipher_mode, '-'); /* only the part of the mode string before the first '-' is used */
+        if (e)
+                cipher_mode = strndupa_safe(cipher_mode, e - cipher_mode);
+
+        r = sym_crypt_get_volume_key_size(cd);
+        if (r <= 0)
+                return log_error_errno(r < 0 ? r : SYNTHETIC_ERRNO(EINVAL), "Cannot get volume key size from LUKS device.");
+
+        key_size = r;
+        key_bits = key_size * 8;
+        if (streq(cipher_mode, "xts")) /* for XTS the cipher name carries half the volume key's bit count */
+                key_bits /= 2;
+
+        if (asprintf(&cipher_name, "%s-%zu-%s", cipher, key_bits, cipher_mode) < 0) /* <cipher>-<bits>-<mode> */
+                return log_oom();
+
+        cc = EVP_get_cipherbyname(cipher_name);
+        if (!cc)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Selected cipher mode '%s' not supported, can't encrypt JSON record.", cipher_name);
+
+        /* Verify that our key length calculations match what OpenSSL thinks */
+        r = EVP_CIPHER_key_length(cc);
+        if (r < 0 || (uint64_t) r != key_size)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Key size of selected cipher doesn't meet our expectations.");
+
+        *ret = cc;
+        return 0;
+}
+
+static int luks_validate_home_record(
+                struct crypt_device *cd,
+                UserRecord *h,
+                const void *volume_key,
+                PasswordCache *cache,
+                UserRecord **ret_luks_home_record) {
+        /* Decrypts the user record embedded in the LUKS2 header's "systemd-homed" token and authenticates it. */
+        int r;
+
+        assert(cd);
+        assert(h);
+
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *rr = NULL;
+                _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL;
+                _cleanup_(user_record_unrefp) UserRecord *lhr = NULL;
+                _cleanup_free_ void *encrypted = NULL, *iv = NULL;
+                size_t decrypted_size, encrypted_size, iv_size;
+                int decrypted_size_out1, decrypted_size_out2;
+                _cleanup_free_ char *decrypted = NULL;
+                const char *text, *type;
+                crypt_token_info state;
+                JsonVariant *jr, *jiv;
+                unsigned line, column;
+                const EVP_CIPHER *cc;
+
+                state = sym_crypt_token_status(cd, token, &type);
+                if (state == CRYPT_TOKEN_INACTIVE) /* First unconfigured token, give up */
+                        break;
+                if (IN_SET(state, CRYPT_TOKEN_INTERNAL, CRYPT_TOKEN_INTERNAL_UNKNOWN, CRYPT_TOKEN_EXTERNAL))
+                        continue; /* only tokens in state CRYPT_TOKEN_EXTERNAL_UNKNOWN are looked at */
+                if (state != CRYPT_TOKEN_EXTERNAL_UNKNOWN)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected token state of token %i: %i", token, (int) state);
+
+                if (!streq(type, "systemd-homed")) /* the token type format_luks_token_text() writes */
+                        continue;
+
+                r = sym_crypt_token_json_get(cd, token, &text);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read LUKS token %i: %m", token);
+
+                r = json_parse(text, JSON_PARSE_SENSITIVE, &v, &line, &column);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse LUKS token JSON data %u:%u: %m", line, column);
+
+                jr = json_variant_by_key(v, "record"); /* base64 of the encrypted user record */
+                if (!jr)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "LUKS token lacks 'record' field.");
+                jiv = json_variant_by_key(v, "iv"); /* base64 of the IV used for encrypting it */
+                if (!jiv)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "LUKS token lacks 'iv' field.");
+
+                r = json_variant_unbase64(jr, &encrypted, &encrypted_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to base64 decode record: %m");
+
+                r = json_variant_unbase64(jiv, &iv, &iv_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to base64 decode IV: %m");
+
+                r = crypt_device_to_evp_cipher(cd, &cc);
+                if (r < 0)
+                        return r;
+                if (iv_size > INT_MAX || EVP_CIPHER_iv_length(cc) != (int) iv_size)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "IV size doesn't match.");
+
+                context = EVP_CIPHER_CTX_new();
+                if (!context)
+                        return log_oom();
+
+                if (EVP_DecryptInit_ex(context, cc, NULL, volume_key, iv) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize decryption context.");
+
+                decrypted_size = encrypted_size + EVP_CIPHER_key_length(cc) * 2; /* ciphertext size plus slack */
+                decrypted = new(char, decrypted_size);
+                if (!decrypted)
+                        return log_oom();
+
+                if (EVP_DecryptUpdate(context, (uint8_t*) decrypted, &decrypted_size_out1, encrypted, encrypted_size) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to decrypt JSON record.");
+
+                assert((size_t) decrypted_size_out1 <= decrypted_size);
+
+                if (EVP_DecryptFinal_ex(context, (uint8_t*) decrypted + decrypted_size_out1, &decrypted_size_out2) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finish decryption of JSON record.");
+
+                assert((size_t) decrypted_size_out1 + (size_t) decrypted_size_out2 < decrypted_size); /* strictly less: room for the NUL below */
+                decrypted_size = (size_t) decrypted_size_out1 + (size_t) decrypted_size_out2;
+
+                if (memchr(decrypted, 0, decrypted_size))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Inner NUL byte in JSON record, refusing.");
+
+                decrypted[decrypted_size] = 0; /* terminate, so that it can be parsed as a string */
+
+                r = json_parse(decrypted, JSON_PARSE_SENSITIVE, &rr, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse decrypted JSON record, refusing.");
+
+                lhr = user_record_new();
+                if (!lhr)
+                        return log_oom();
+
+                r = user_record_load(lhr, rr, USER_RECORD_LOAD_EMBEDDED|USER_RECORD_PERMISSIVE);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse user record: %m");
+
+                if (!user_record_compatible(h, lhr))
+                        return log_error_errno(SYNTHETIC_ERRNO(EREMCHG), "LUKS home record not compatible with host record, refusing.");
+
+                r = user_record_authenticate(lhr, h, cache, /* strict_verify= */ true);
+                if (r < 0)
+                        return r;
+                assert(r > 0); /* Insist that a password was verified */
+
+                *ret_luks_home_record = TAKE_PTR(lhr);
+                return 0; /* the first "systemd-homed" token decides, later ones are not looked at */
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Couldn't find home record in LUKS2 header, refusing.");
+}
+
+static int format_luks_token_text(
+                struct crypt_device *cd,
+                UserRecord *hr,
+                const void *volume_key,
+                char **ret) {
+        /* Encrypts the JSON of 'hr' with the volume key and formats it as "systemd-homed" LUKS2 token JSON. */
+        int r, encrypted_size_out1 = 0, encrypted_size_out2 = 0, iv_size, key_size;
+        _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_free_ void *iv = NULL, *encrypted = NULL;
+        size_t text_length, encrypted_size;
+        _cleanup_free_ char *text = NULL;
+        const EVP_CIPHER *cc;
+
+        assert(cd);
+        assert(hr);
+        assert(volume_key);
+        assert(ret);
+
+        r = crypt_device_to_evp_cipher(cd, &cc);
+        if (r < 0)
+                return r;
+
+        key_size = EVP_CIPHER_key_length(cc);
+        iv_size = EVP_CIPHER_iv_length(cc);
+
+        if (iv_size > 0) { /* a fresh random IV is generated for every token written */
+                iv = malloc(iv_size);
+                if (!iv)
+                        return log_oom();
+
+                r = crypto_random_bytes(iv, iv_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to generate IV: %m");
+        }
+
+        context = EVP_CIPHER_CTX_new();
+        if (!context)
+                return log_oom();
+
+        if (EVP_EncryptInit_ex(context, cc, NULL, volume_key, iv) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize encryption context.");
+
+        r = json_variant_format(hr->json, 0, &text);
+        if (r < 0)
+                return log_error_errno(r, "Failed to format user record for LUKS: %m");
+
+        text_length = strlen(text);
+        if (text_length > (size_t) INT_MAX - 2 * (size_t) key_size) /* OpenSSL takes and returns sizes as int */
+                return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "User record too large to encrypt.");
+        encrypted_size = text_length + 2*key_size - 1;
+
+        encrypted = malloc(encrypted_size);
+        if (!encrypted)
+                return log_oom();
+
+        if (EVP_EncryptUpdate(context, encrypted, &encrypted_size_out1, (uint8_t*) text, text_length) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to encrypt JSON record.");
+        assert((size_t) encrypted_size_out1 <= encrypted_size);
+
+        if (EVP_EncryptFinal_ex(context, (uint8_t*) encrypted + encrypted_size_out1, &encrypted_size_out2) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finish encryption of JSON record.");
+        assert((size_t) encrypted_size_out1 + (size_t) encrypted_size_out2 <= encrypted_size);
+
+        r = json_build(&v,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("systemd-homed")),
+                                       JSON_BUILD_PAIR("keyslots", JSON_BUILD_EMPTY_ARRAY),
+                                       JSON_BUILD_PAIR("record", JSON_BUILD_BASE64(encrypted, encrypted_size_out1 + encrypted_size_out2)),
+                                       JSON_BUILD_PAIR("iv", JSON_BUILD_BASE64(iv, iv_size))));
+        if (r < 0)
+                return log_error_errno(r, "Failed to prepare LUKS JSON token object: %m");
+
+        r = json_variant_format(v, 0, ret);
+        if (r < 0)
+                return log_error_errno(r, "Failed to format encrypted user record for LUKS: %m");
+
+        return 0;
+}
+
+int home_store_header_identity_luks(
+                UserRecord *h,
+                HomeSetup *setup,
+                UserRecord *old_home) {
+
+        _cleanup_(user_record_unrefp) UserRecord *header_home = NULL;
+        _cleanup_free_ char *text = NULL;
+        int r;
+
+        assert(h);
+        assert(setup);
+
+        if (!setup->crypt_device) /* no LUKS context set up, hence no header to write to */
+                return 0;
+
+        assert(setup->volume_key);
+
+        /* Let's store the user's identity record in the LUKS2 "token" header data fields, in an encrypted
+         * fashion. Why that? If we'd rely on the record being embedded in the payload file system itself we
+         * would have to mount the file system before we can validate the JSON record, its signatures and
+         * whether it matches what we are looking for. However, kernel file system implementations are
+         * generally not ready to be used on untrusted media. Hence let's store the record independently of
+         * the file system, so that we can validate it first, and only then mount the file system. To keep
+         * things simple we use the same encryption settings for this record as for the file system itself. */
+
+        r = user_record_clone(h, USER_RECORD_EXTRACT_EMBEDDED|USER_RECORD_PERMISSIVE, &header_home);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine new header record: %m");
+
+        if (old_home && user_record_equal(old_home, header_home)) {
+                log_debug("Not updating header home record.");
+                return 0;
+        }
+
+        r = format_luks_token_text(setup->crypt_device, header_home, setup->volume_key, &text);
+        if (r < 0)
+                return r;
+
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token++) {
+                crypt_token_info state;
+                const char *type;
+
+                state = sym_crypt_token_status(setup->crypt_device, token, &type);
+                if (state == CRYPT_TOKEN_INACTIVE) /* First unconfigured token, we are done */
+                        break;
+                if (IN_SET(state, CRYPT_TOKEN_INTERNAL, CRYPT_TOKEN_INTERNAL_UNKNOWN, CRYPT_TOKEN_EXTERNAL))
+                        continue; /* Not ours */
+                if (state != CRYPT_TOKEN_EXTERNAL_UNKNOWN)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected token state of token %i: %i", token, (int) state);
+
+                if (!streq(type, "systemd-homed"))
+                        continue;
+
+                r = sym_crypt_token_json_set(setup->crypt_device, token, text);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set JSON token for slot %i: %m", token);
+
+                /* Now, let's free the text so that for all further matching tokens we call
+                 * crypt_token_json_set() with a NULL text in order to invalidate the tokens. */
+                text = mfree(text);
+        }
+
+        if (text) /* still set, i.e. no "systemd-homed" token was found and overwritten above */
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Didn't find any record token to update.");
+
+        log_info("Wrote LUKS header user record.");
+        return 1;
+}
+
+int run_fitrim(int root_fd) {
+        struct fstrim_range whole_fs = {
+                .len = UINT64_MAX,
+        };
+
+        /* Issues FITRIM on the given file system. When discarding is enabled this is done right after
+         * mounting, so that the setting takes effect on activation (and, optionally, again on logout).
+         * Returns 1 if trimmed, 0 if the file system cannot do it, negative errno otherwise. */
+
+        assert(root_fd >= 0);
+
+        if (ioctl(root_fd, FITRIM, &whole_fs) >= 0) {
+                log_info("Discarded unused %s.", FORMAT_BYTES(whole_fs.len));
+                return 1;
+        }
+
+        if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EBADF)
+                return log_warning_errno(errno, "Failed to invoke FITRIM, ignoring: %m");
+
+        log_debug_errno(errno, "File system does not support FITRIM, not trimming.");
+        return 0;
+}
+
+/* Makes sure the backing file behind backing_fd is fully allocated on disk.
+ *
+ * 'st' may carry already known stat data for the fd; when NULL the data is fetched here. Returns 1 if
+ * blocks were allocated, 0 if nothing was done (not a regular file, already fully allocated, or
+ * fallocate() unsupported), -ENOSPC if disk space ran out, and another negative errno-style value on
+ * other failures. */
+int run_fallocate(int backing_fd, const struct stat *st) {
+        struct stat local_st;
+
+        assert(backing_fd >= 0);
+
+        /* With discarding disabled the whole image is allocated before mounting, so that the setting
+         * is in effect from activation on. */
+
+        if (!st) {
+                if (fstat(backing_fd, &local_st) < 0)
+                        return log_error_errno(errno, "Failed to fstat(): %m");
+
+                st = &local_st;
+        }
+
+        /* Only regular files can be preallocated. */
+        if (!S_ISREG(st->st_mode))
+                return 0;
+
+        /* st_blocks counts 512 byte units, hence compare against the size expressed in the same unit. */
+        if (st->st_blocks >= DIV_ROUND_UP(st->st_size, 512)) {
+                log_info("Backing file is fully allocated already.");
+                return 0;
+        }
+
+        if (fallocate(backing_fd, FALLOC_FL_KEEP_SIZE, 0, st->st_size) >= 0) {
+                log_info("Allocated additional %s.",
+                         FORMAT_BYTES((DIV_ROUND_UP(st->st_size, 512) - st->st_blocks) * 512));
+                return 1;
+        }
+
+        if (ERRNO_IS_NOT_SUPPORTED(errno)) {
+                log_debug_errno(errno, "fallocate() not supported on file system, ignoring.");
+                return 0;
+        }
+
+        if (ERRNO_IS_DISK_SPACE(errno)) {
+                log_debug_errno(errno, "Not enough disk space to fully allocate home.");
+                return -ENOSPC; /* make recognizable */
+        }
+
+        return log_error_errno(errno, "Failed to allocate backing file blocks: %m");
+}
+
+/* Convenience wrapper: opens 'backing_path' read-write and hands the fd to run_fallocate(), letting it
+ * stat the file itself. The fd is closed again when this function returns. */
+int run_fallocate_by_path(const char *backing_path) {
+        _cleanup_close_ int fd = open(backing_path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+
+        if (fd < 0)
+                return log_error_errno(errno, "Failed to open '%s' for fallocate(): %m", backing_path);
+
+        return run_fallocate(fd, NULL);
+}
+
+/* Optionally takes an exclusive, non-blocking BSD lock on the image file and passes the fd to the
+ * parent.
+ *
+ * Locking only happens when $SYSTEMD_LUKS_LOCK parses as true; when it is unset or false this is a
+ * no-op returning 0. Returns -EADDRINUSE if the image is already locked elsewhere, another negative
+ * errno-style value on other failures, 0 otherwise. 'ip' is the image path, used for log messages
+ * only. */
+static int lock_image_fd(int image_fd, const char *ip) {
+        int r;
+
+        /* homed keeps the received fd open, so that no other homed instance (across the network or
+         * such) mounts the same file while the home is active. */
+
+        assert(image_fd >= 0);
+        assert(ip);
+
+        r = getenv_bool("SYSTEMD_LUKS_LOCK");
+        if (r == -ENXIO || r == 0) /* unset, or explicitly turned off */
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse $SYSTEMD_LUKS_LOCK environment variable: %m");
+
+        if (flock(image_fd, LOCK_EX|LOCK_NB) < 0) {
+
+                if (errno == EAGAIN) {
+                        log_error_errno(errno, "Image file '%s' already locked, can't use.", ip);
+                        return -EADDRINUSE; /* Make error recognizable */
+                }
+
+                return log_error_errno(errno, "Failed to lock image file '%s': %m", ip);
+        }
+
+        log_info("Successfully locked image file '%s'.", ip);
+
+        /* Hand the locked fd over to our parent, to keep safe while the home dir is active. Failure
+         * to do so is not fatal. */
+        r = sd_pid_notify_with_fds(0, false, "SYSTEMD_LUKS_LOCK_FD=1", &image_fd, 1);
+        if (r < 0)
+                log_warning_errno(r, "Failed to send LUKS lock fd to parent, ignoring: %m");
+
+        return 0;
+}
+
+/* Opens the image backing a LUKS home and returns the fd (ownership passes to the caller).
+ *
+ * The path is 'force_image_path' if given, otherwise the image path from the user record 'h'. Only
+ * regular files and block devices are accepted; regular files additionally go through
+ * lock_image_fd(). If 'ret_stat' is non-NULL it receives the fstat() data of the opened image.
+ * Returns a negative errno-style value on failure. */
+static int open_image_file(
+                UserRecord *h,
+                const char *force_image_path,
+                struct stat *ret_stat) {
+
+        _cleanup_close_ int fd = -EBADF;
+        struct stat image_st;
+        const char *path;
+        int r;
+
+        assert(h || force_image_path);
+
+        path = force_image_path ? force_image_path : user_record_image_path(h);
+
+        fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+        if (fd < 0)
+                return log_error_errno(errno, "Failed to open image file %s: %m", path);
+
+        if (fstat(fd, &image_st) < 0)
+                return log_error_errno(errno, "Failed to fstat() image file: %m");
+
+        if (S_ISREG(image_st.st_mode)) {
+                /* Only regular files are locked: locking block devices doesn't really make sense, as
+                 * it might interfere with udev's workings, and such locks aren't network propagated
+                 * anyway, hence not what we are after here. */
+                r = lock_image_fd(fd, path);
+                if (r < 0)
+                        return r;
+        } else if (!S_ISBLK(image_st.st_mode))
+                return log_error_errno(
+                                S_ISDIR(image_st.st_mode) ? SYNTHETIC_ERRNO(EISDIR) : SYNTHETIC_ERRNO(EBADFD),
+                                "Image file %s is not a regular file or block device: %m", path);
+
+        if (ret_stat)
+                *ret_stat = image_st;
+
+        return TAKE_FD(fd);
+}
+
+/* Sets up the LUKS storage of the home described by 'h' and records the result in 'setup'.
+ *
+ * Two modes, selected via 'flags':
+ *
+ *  - HOME_SETUP_ALREADY_ACTIVATED set: the home is expected to be active already. The LUKS device and
+ *    the block device below it are (re)opened where 'setup' doesn't reference them yet, the partition
+ *    offset/size is read back from the loopback device (or, for other block devices, from
+ *    BLKGETSIZE64 and sysfs), and the home directory of the record is opened as root_fd.
+ *
+ *  - otherwise: the image is validated, marked dirty, optionally fully allocated, attached to a
+ *    loopback device, unlocked, fsck'ed and mounted; the mounted directory is opened as root_fd.
+ *
+ * 'force_image_path' overrides the image path from the user record if non-NULL. If 'ret_luks_home' is
+ * non-NULL it receives the user record embedded in the LUKS header. Discovered UUIDs, the partition
+ * offset/size and (if one was acquired here) the volume key are stored in 'setup'.
+ *
+ * Returns 0 on success, a negative errno-style value on failure (-ENOLINK if loopback devices are
+ * not available). */
+int home_setup_luks(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                const char *force_image_path,
+                HomeSetup *setup,
+                PasswordCache *cache,
+                UserRecord **ret_luks_home) {
+
+        sd_id128_t found_partition_uuid, found_fs_uuid = SD_ID128_NULL, found_luks_uuid = SD_ID128_NULL;
+        _cleanup_(user_record_unrefp) UserRecord *luks_home = NULL;
+        _cleanup_(erase_and_freep) void *volume_key = NULL;
+        size_t volume_key_size = 0;
+        uint64_t offset, size;
+        struct stat st;
+        int r;
+
+        assert(h);
+        assert(setup);
+        assert(user_record_storage(h) == USER_LUKS);
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        r = make_dm_names(h, setup);
+        if (r < 0)
+                return r;
+
+        /* Reuse the image fd if it has already been opened by an earlier step */
+        if (setup->image_fd < 0) {
+                setup->image_fd = open_image_file(h, force_image_path, &st);
+                if (setup->image_fd < 0)
+                        return setup->image_fd;
+        } else if (fstat(setup->image_fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat image: %m");
+
+        if (FLAGS_SET(flags, HOME_SETUP_ALREADY_ACTIVATED)) {
+                struct loop_info64 info;
+                const char *n;
+
+                if (!setup->crypt_device) {
+                        r = luks_open(h,
+                                      setup,
+                                      cache,
+                                      &found_luks_uuid,
+                                      &volume_key,
+                                      &volume_key_size);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (ret_luks_home) {
+                        r = luks_validate_home_record(setup->crypt_device, h, volume_key, cache, &luks_home);
+                        if (r < 0)
+                                return r;
+                }
+
+                n = sym_crypt_get_device_name(setup->crypt_device);
+                if (!n)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine backing device for DM %s.", setup->dm_name);
+
+                if (!setup->loop) {
+                        r = loop_device_open_from_path(n, O_RDWR, LOCK_UN, &setup->loop);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to open loopback device %s: %m", n);
+                }
+
+                if (ioctl(setup->loop->fd, LOOP_GET_STATUS64, &info) < 0) {
+                        _cleanup_free_ char *sysfs = NULL;
+
+                        /* ENOTTY/EINVAL: the backing device is not a loopback device. Fall back to
+                         * generic block device queries. */
+                        if (!IN_SET(errno, ENOTTY, EINVAL))
+                                return log_error_errno(errno, "Failed to get block device metrics of %s: %m", n);
+
+                        /* Both calls below report their failure via errno, not via 'r'. */
+                        if (ioctl(setup->loop->fd, BLKGETSIZE64, &size) < 0)
+                                return log_error_errno(errno, "Failed to read block device size of %s: %m", n);
+
+                        if (fstat(setup->loop->fd, &st) < 0)
+                                return log_error_errno(errno, "Failed to stat block device %s: %m", n);
+                        assert(S_ISBLK(st.st_mode));
+
+                        if (asprintf(&sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
+                                return log_oom();
+
+                        if (access(sysfs, F_OK) < 0) {
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno, "Failed to determine whether %s exists: %m", sysfs);
+
+                                /* No "partition" attribute: whole block device, data starts at 0. */
+                                offset = 0;
+                        } else {
+                                _cleanup_free_ char *buffer = NULL, *start_path = NULL;
+
+                                /* Separate variable for the second path: writing into 'sysfs' again
+                                 * would leak the string allocated above. */
+                                if (asprintf(&start_path, "/sys/dev/block/" DEVNUM_FORMAT_STR "/start", DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
+                                        return log_oom();
+
+                                r = read_one_line_file(start_path, &buffer);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to read partition start offset: %m");
+
+                                r = safe_atou64(buffer, &offset);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse partition start offset: %m");
+
+                                /* The value read is in 512 byte units, make sure the conversion to
+                                 * bytes below cannot overflow. */
+                                if (offset > UINT64_MAX / 512U)
+                                        return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "Offset too large for 64 bit range, refusing.");
+
+                                offset *= 512U;
+                        }
+                } else {
+#if HAVE_VALGRIND_MEMCHECK_H
+                        VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
+#endif
+
+                        offset = info.lo_offset;
+                        size = info.lo_sizelimit;
+                }
+
+                found_partition_uuid = found_fs_uuid = SD_ID128_NULL;
+
+                log_info("Discovered used loopback device %s.", setup->loop->node);
+
+                if (setup->root_fd < 0) {
+                        setup->root_fd = open(user_record_home_directory(h), O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                        if (setup->root_fd < 0)
+                                return log_error_errno(errno, "Failed to open home directory: %m");
+                }
+        } else {
+                _cleanup_free_ char *fstype = NULL, *subdir = NULL;
+                const char *ip;
+
+                /* When we aren't reopening the home directory we are allocating it fresh, hence the relevant
+                 * objects can't be allocated yet. */
+                assert(setup->root_fd < 0);
+                assert(!setup->crypt_device);
+                assert(!setup->loop);
+
+                ip = force_image_path ?: user_record_image_path(h);
+
+                subdir = path_join(HOME_RUNTIME_WORK_DIR, user_record_user_name_and_realm(h));
+                if (!subdir)
+                        return log_oom();
+
+                r = luks_validate(setup->image_fd, user_record_user_name_and_realm(h), h->partition_uuid, &found_partition_uuid, &offset, &size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to validate disk label: %m");
+
+                /* Everything before this point left the image untouched. We are now starting to make
+                 * changes, hence mark the image dirty */
+                if (run_mark_dirty(setup->image_fd, true) > 0)
+                        setup->do_mark_clean = true;
+
+                if (!user_record_luks_discard(h)) {
+                        r = run_fallocate(setup->image_fd, &st);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = loop_device_make(
+                                setup->image_fd,
+                                O_RDWR,
+                                offset,
+                                size,
+                                h->luks_sector_size == UINT64_MAX ? UINT32_MAX : user_record_luks_sector_size(h), /* if sector size is not specified, select UINT32_MAX, i.e. auto-probe */
+                                /* loop_flags= */ 0,
+                                LOCK_UN,
+                                &setup->loop);
+                if (r == -ENOENT) {
+                        log_error_errno(r, "Loopback block device support is not available on this system.");
+                        return -ENOLINK; /* make recognizable */
+                }
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate loopback context: %m");
+
+                log_info("Setting up loopback device %s completed.", setup->loop->node ?: ip);
+
+                r = luks_setup(h,
+                               setup->loop->node ?: ip,
+                               setup->dm_name,
+                               h->luks_uuid,
+                               h->luks_cipher,
+                               h->luks_cipher_mode,
+                               h->luks_volume_key_size,
+                               h->password,
+                               cache,
+                               user_record_luks_discard(h) || user_record_luks_offline_discard(h),
+                               &setup->crypt_device,
+                               &found_luks_uuid,
+                               &volume_key,
+                               &volume_key_size,
+                               &setup->key_serial);
+                if (r < 0)
+                        return r;
+
+                setup->undo_dm = true;
+
+                if (ret_luks_home) {
+                        r = luks_validate_home_record(setup->crypt_device, h, volume_key, cache, &luks_home);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = fs_validate(setup->dm_node, h->file_system_uuid, &fstype, &found_fs_uuid);
+                if (r < 0)
+                        return r;
+
+                r = run_fsck(setup->dm_node, fstype);
+                if (r < 0)
+                        return r;
+
+                r = home_unshare_and_mount(setup->dm_node, fstype, user_record_luks_discard(h), user_record_mount_flags(h), h->luks_extra_mount_options);
+                if (r < 0)
+                        return r;
+
+                setup->undo_mount = true;
+
+                setup->root_fd = open(subdir, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                if (setup->root_fd < 0)
+                        return log_error_errno(errno, "Failed to open home directory: %m");
+
+                if (user_record_luks_discard(h))
+                        (void) run_fitrim(setup->root_fd);
+
+                /* Exactly one of the two offline actions is armed: trim if offline discard is on,
+                 * allocate otherwise. */
+                setup->do_offline_fallocate = !(setup->do_offline_fitrim = user_record_luks_offline_discard(h));
+        }
+
+        /* Only overwrite IDs in 'setup' that were actually discovered during this call. */
+        if (!sd_id128_is_null(found_partition_uuid))
+                setup->found_partition_uuid = found_partition_uuid;
+        if (!sd_id128_is_null(found_luks_uuid))
+                setup->found_luks_uuid = found_luks_uuid;
+        if (!sd_id128_is_null(found_fs_uuid))
+                setup->found_fs_uuid = found_fs_uuid;
+
+        setup->partition_offset = offset;
+        setup->partition_size = size;
+
+        if (volume_key) {
+                erase_and_free(setup->volume_key);
+                setup->volume_key = TAKE_PTR(volume_key);
+                setup->volume_key_size = volume_key_size;
+        }
+
+        if (ret_luks_home)
+                *ret_luks_home = TAKE_PTR(luks_home);
+
+        return 0;
+}
+
+/* Logs a one-line overview of the sizes involved: the host-side image, the encrypted volume, and the
+ * total and free space of the file system as derived from 'sfs' (block counts times f_frsize). */
+static void print_size_summary(uint64_t host_size, uint64_t encrypted_size, const struct statfs *sfs) {
+        uint64_t fragment_size, payload_size, free_size;
+
+        assert(sfs);
+
+        fragment_size = (uint64_t) sfs->f_frsize;
+        payload_size = (uint64_t) sfs->f_blocks * fragment_size;
+        free_size = (uint64_t) sfs->f_bfree * fragment_size;
+
+        log_info("Image size is %s, file system size is %s, file system payload size is %s, file system free is %s.",
+                 FORMAT_BYTES(host_size),
+                 FORMAT_BYTES(encrypted_size),
+                 FORMAT_BYTES(payload_size),
+                 FORMAT_BYTES(free_size));
+}
+
+/* Triggers a grow-only resize of an already set up home, if the user record asks for it.
+ *
+ * Nothing is done (return 0) unless the record's auto-resize mode is "grow" or "shrink-and-grow", and
+ * the mounted file system passes fs_can_online_shrink_and_grow(). Otherwise the result of
+ * home_resize_luks() is returned. */
+static int home_auto_grow_luks(
+                UserRecord *h,
+                HomeSetup *setup,
+                PasswordCache *cache) {
+
+        /* Resize the already activated home in place: never shrink, and skip the identity sync and
+         * the undo handling of a regular resize. */
+        const HomeSetupFlags resize_flags =
+                HOME_SETUP_ALREADY_ACTIVATED|
+                HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES|
+                HOME_SETUP_RESIZE_DONT_SHRINK|
+                HOME_SETUP_RESIZE_DONT_UNDO;
+        struct statfs fs_info;
+
+        assert(h);
+        assert(setup);
+
+        if (!IN_SET(user_record_auto_resize_mode(h), AUTO_RESIZE_GROW, AUTO_RESIZE_SHRINK_AND_GROW))
+                return 0;
+
+        assert(setup->root_fd >= 0);
+
+        if (fstatfs(setup->root_fd, &fs_info) < 0)
+                return log_error_errno(errno, "Failed to statfs home directory: %m");
+
+        if (!fs_can_online_shrink_and_grow(fs_info.f_type)) {
+                log_debug("Not auto-grow file system, since selected file system cannot do both online shrink and grow.");
+                return 0;
+        }
+
+        log_debug("Initiating auto-grow...");
+
+        return home_resize_luks(h, resize_flags, setup, cache, NULL);
+}
+
+/* Activates the LUKS home of user record 'h'.
+ *
+ * Refuses with EEXIST if home_get_state_luks() reports the device mapper device as already existing.
+ * Otherwise sets up the storage via home_setup_luks(), optionally grows it, refreshes the home and
+ * its embedded identity, and moves the mount to the home directory path of the record. On success
+ * the refreshed record is passed back in 'ret_home' and 1 is returned; failures return a negative
+ * errno-style value. */
+int home_activate_luks(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                HomeSetup *setup,
+                PasswordCache *cache,
+                UserRecord **ret_home) {
+
+        _cleanup_(user_record_unrefp) UserRecord *new_home = NULL, *luks_home_record = NULL;
+        uint64_t host_size, encrypted_size;
+        const char *hdo, *hd;
+        struct statfs sfs;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_LUKS);
+        assert(setup);
+        assert(ret_home);
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        assert_se(hdo = user_record_home_directory(h));
+        hd = strdupa_safe(hdo); /* copy the string out, since it might change later in the home record object */
+
+        r = home_get_state_luks(h, setup);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Device mapper device %s already exists, refusing.", setup->dm_node);
+
+        /* Fresh setup (flags=0, no forced image path); also fetches the record stored in the LUKS header. */
+        r = home_setup_luks(
+                        h,
+                        0,
+                        NULL,
+                        setup,
+                        cache,
+                        &luks_home_record);
+        if (r < 0)
+                return r;
+
+        r = home_auto_grow_luks(h, setup, cache);
+        if (r < 0)
+                return r;
+
+        /* Collect the sizes for the summary logged at the end. */
+        r = block_get_size_by_fd(setup->loop->fd, &host_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get loopback block device size: %m");
+
+        r = block_get_size_by_path(setup->dm_node, &encrypted_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get LUKS block device size: %m");
+
+        r = home_refresh(
+                        h,
+                        flags,
+                        setup,
+                        luks_home_record,
+                        cache,
+                        &sfs,
+                        &new_home);
+        if (r < 0)
+                return r;
+
+        r = home_extend_embedded_identity(new_home, h, setup);
+        if (r < 0)
+                return r;
+
+        /* Close our reference to the home directory before the mount is moved. */
+        setup->root_fd = safe_close(setup->root_fd);
+
+        r = home_move_mount(user_record_user_name_and_realm(h), hd);
+        if (r < 0)
+                return r;
+
+        /* From here on the pending undo/offline actions recorded in 'setup' are cleared step by step
+         * (NOTE(review): presumably so that releasing 'setup' leaves the activated home in place —
+         * confirm against the HomeSetup cleanup code). */
+        setup->undo_mount = false;
+        setup->do_offline_fitrim = false;
+
+        loop_device_relinquish(setup->loop);
+
+        /* Request deferred deactivation of the DM device; failure is only logged. */
+        r = sym_crypt_deactivate_by_name(NULL, setup->dm_name, CRYPT_DEACTIVATE_DEFERRED);
+        if (r < 0)
+                log_warning_errno(r, "Failed to relinquish DM device, ignoring: %m");
+
+        setup->undo_dm = false;
+        setup->do_offline_fallocate = false;
+        setup->do_mark_clean = false;
+        setup->do_drop_caches = false;
+        TAKE_KEY_SERIAL(setup->key_serial); /* Leave key in kernel keyring */
+
+        log_info("Activation completed.");
+
+        print_size_summary(host_size, encrypted_size, &sfs);
+
+        *ret_home = TAKE_PTR(new_home);
+        return 1;
+}
+
+/* Detaches the LUKS device of the home described by 'h'.
+ *
+ * Returns a positive value if this call detached the device, 0 if it was already gone, and a
+ * negative errno-style value if the cryptsetup context could not be set up or deactivation failed.
+ * Afterwards the backing file is fully allocated again (unless offline discard is on) and the image
+ * is marked clean; failures of these last steps are ignored. */
+int home_deactivate_luks(UserRecord *h, HomeSetup *setup) {
+        bool detached_by_us = false;
+        int r;
+
+        assert(h);
+        assert(setup);
+
+        /* Both the DM device and the loopback device are set to auto-detach, so strictly speaking no
+         * explicit detaching is needed. For the DM device we do it anyway, to avoid races: by
+         * detaching explicitly we know when the detaching is complete. The loopback device is left
+         * alone, since unlike the DM device it doesn't have a fixed name. */
+
+        if (!setup->crypt_device) {
+                r = acquire_open_luks_device(h, setup, /* graceful= */ true);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to initialize cryptsetup context for %s: %m", setup->dm_name);
+                if (r == 0)
+                        log_debug("LUKS device %s has already been detached.", setup->dm_name);
+        }
+
+        if (setup->crypt_device) {
+                log_info("Discovered used LUKS device %s.", setup->dm_node);
+
+                cryptsetup_enable_logging(setup->crypt_device);
+
+                r = sym_crypt_deactivate_by_name(setup->crypt_device, setup->dm_name, 0);
+                if (ERRNO_IS_NEG_DEVICE_ABSENT(r) || r == -EINVAL)
+                        log_debug_errno(r, "LUKS device %s is already detached.", setup->dm_node);
+                else {
+                        if (r < 0)
+                                return log_info_errno(r, "LUKS device %s couldn't be deactivated: %m", setup->dm_node);
+
+                        log_info("LUKS device detaching completed.");
+                        detached_by_us = true;
+                }
+        }
+
+        (void) wait_for_block_device_gone(setup, USEC_PER_SEC * 30);
+        setup->undo_dm = false;
+
+        if (!user_record_luks_offline_discard(h))
+                (void) run_fallocate_by_path(user_record_image_path(h));
+        else
+                log_debug("Not allocating on logout.");
+
+        (void) run_mark_dirty_by_path(user_record_image_path(h), false);
+        return detached_by_us;
+}
+
+/* Trims the mounted home file system (via setup->root_fd) if the user record has offline discard
+ * enabled. Trim failures are ignored; always returns 0. */
+int home_trim_luks(UserRecord *h, HomeSetup *setup) {
+        assert(h);
+        assert(setup);
+        assert(setup->root_fd >= 0);
+
+        if (user_record_luks_offline_discard(h))
+                (void) run_fitrim(setup->root_fd);
+        else
+                log_debug("Not trimming on logout.");
+
+        return 0;
+}
+
+/* Fills 'buffer' with the PBKDF parameters configured in user record 'hr' and returns 'buffer'.
+ *
+ * If the record carries no forced iteration count (UINT64_MAX), the time cost is set and
+ * benchmarking stays enabled; otherwise the forced iteration count is used together with
+ * CRYPT_PBKDF_NO_BENCHMARK. */
+static struct crypt_pbkdf_type* build_good_pbkdf(struct crypt_pbkdf_type *buffer, UserRecord *hr) {
+        assert(buffer);
+        assert(hr);
+
+        bool benchmark = user_record_luks_pbkdf_force_iterations(hr) == UINT64_MAX;
+
+        /* Settings common to both modes; fields not named here are zero. */
+        *buffer = (struct crypt_pbkdf_type) {
+                .hash = user_record_luks_pbkdf_hash_algorithm(hr),
+                .type = user_record_luks_pbkdf_type(hr),
+                .max_memory_kb = user_record_luks_pbkdf_memory_cost(hr) / 1024,
+                .parallel_threads = user_record_luks_pbkdf_parallel_threads(hr),
+        };
+
+        if (benchmark)
+                buffer->time_ms = user_record_luks_pbkdf_time_cost_usec(hr) / USEC_PER_MSEC;
+        else {
+                buffer->iterations = user_record_luks_pbkdf_force_iterations(hr);
+                buffer->flags = CRYPT_PBKDF_NO_BENCHMARK;
+        }
+
+        return buffer;
+}
+
+/* Fills 'buffer' with the cheapest PBKDF setup (PBKDF2, one iteration, 1ms) and returns 'buffer'.
+ * Only the hash algorithm is taken from user record 'hr'. */
+static struct crypt_pbkdf_type* build_minimal_pbkdf(struct crypt_pbkdf_type *buffer, UserRecord *hr) {
+        assert(buffer);
+        assert(hr);
+
+        /* Keys derived via PKCS#11 are generated randomly and are of high quality already, hence a
+         * minimal PBKDF is sufficient for them. */
+        const struct crypt_pbkdf_type minimal = {
+                .type = CRYPT_KDF_PBKDF2,
+                .hash = user_record_luks_pbkdf_hash_algorithm(hr),
+                .time_ms = 1,
+                .iterations = 1,
+        };
+
+        *buffer = minimal;
+        return buffer;
+}
+
+/* Formats 'node' as a LUKS2 volume, adds one keyslot per entry of 'effective_passwords', activates
+ * the volume under the name 'dm_name' and stores a reduced copy of user record 'hr' as a LUKS token.
+ *
+ * 'uuid' and 'label' are written into the LUKS2 header. Passwords that are contained in 'cache' get
+ * the minimal PBKDF, all others the PBKDF configured in 'hr'. 'discard' controls whether the volume
+ * is activated with CRYPT_ACTIVATE_ALLOW_DISCARDS. On success the libcryptsetup context is passed
+ * to the caller via 'ret' and 0 is returned; on failure a negative errno-style value is returned. */
+static int luks_format(
+                const char *node,
+                const char *dm_name,
+                sd_id128_t uuid,
+                const char *label,
+                const PasswordCache *cache,
+                char **effective_passwords,
+                bool discard,
+                UserRecord *hr,
+                struct crypt_device **ret) {
+
+        _cleanup_(user_record_unrefp) UserRecord *reduced = NULL;
+        _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL;
+        _cleanup_(erase_and_freep) void *volume_key = NULL;
+        struct crypt_pbkdf_type good_pbkdf, minimal_pbkdf;
+        _cleanup_free_ char *text = NULL;
+        size_t volume_key_size;
+        int slot = 0, r;
+
+        assert(node);
+        assert(dm_name);
+        assert(hr);
+        assert(ret);
+
+        r = sym_crypt_init(&cd, node);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate libcryptsetup context: %m");
+
+        cryptsetup_enable_logging(cd);
+
+        /* Normally we'd, just leave volume key generation to libcryptsetup. However, we can't, since we
+         * can't extract the volume key from the library again, but we need it in order to encrypt the JSON
+         * record. Hence, let's generate it on our own, so that we can keep track of it. */
+
+        volume_key_size = user_record_luks_volume_key_size(hr);
+        volume_key = malloc(volume_key_size);
+        if (!volume_key)
+                return log_oom();
+
+        r = crypto_random_bytes(volume_key, volume_key_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate volume key: %m");
+
+#if HAVE_CRYPT_SET_METADATA_SIZE
+        /* Increase the metadata space to 4M, the largest LUKS2 supports */
+        r = sym_crypt_set_metadata_size(cd, 4096U*1024U, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to change LUKS2 metadata size: %m");
+#endif
+
+        /* Prepare both PBKDF parameter sets up front; the loop below picks one per keyslot. */
+        build_good_pbkdf(&good_pbkdf, hr);
+        build_minimal_pbkdf(&minimal_pbkdf, hr);
+
+        r = sym_crypt_format(
+                        cd,
+                        CRYPT_LUKS2,
+                        user_record_luks_cipher(hr),
+                        user_record_luks_cipher_mode(hr),
+                        SD_ID128_TO_UUID_STRING(uuid),
+                        volume_key,
+                        volume_key_size,
+                        &(struct crypt_params_luks2) {
+                                .label = label,
+                                .subsystem = "systemd-home",
+                                .sector_size = user_record_luks_sector_size(hr),
+                                .pbkdf = &good_pbkdf,
+                        });
+        if (r < 0)
+                return log_error_errno(r, "Failed to format LUKS image: %m");
+
+        log_info("LUKS formatting completed.");
+
+        /* One keyslot per password, numbered consecutively from 0. */
+        STRV_FOREACH(pp, effective_passwords) {
+
+                if (password_cache_contains(cache, *pp)) { /* is this a fido2 or pkcs11 password? */
+                        log_debug("Using minimal PBKDF for slot %i", slot);
+                        r = sym_crypt_set_pbkdf_type(cd, &minimal_pbkdf);
+                } else {
+                        log_debug("Using good PBKDF for slot %i", slot);
+                        r = sym_crypt_set_pbkdf_type(cd, &good_pbkdf);
+                }
+                if (r < 0)
+                        return log_error_errno(r, "Failed to tweak PBKDF for slot %i: %m", slot);
+
+                r = sym_crypt_keyslot_add_by_volume_key(
+                                cd,
+                                slot,
+                                volume_key,
+                                volume_key_size,
+                                *pp,
+                                strlen(*pp));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set up LUKS password for slot %i: %m", slot);
+
+                log_info("Writing password to LUKS keyslot %i completed.", slot);
+                slot++;
+        }
+
+        r = sym_crypt_activate_by_volume_key(
+                        cd,
+                        dm_name,
+                        volume_key,
+                        volume_key_size,
+                        discard ? CRYPT_ACTIVATE_ALLOW_DISCARDS : 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to activate LUKS superblock: %m");
+
+        log_info("LUKS activation by volume key succeeded.");
+
+        /* Only the embedded part of the record goes into the LUKS header token. */
+        r = user_record_clone(hr, USER_RECORD_EXTRACT_EMBEDDED|USER_RECORD_PERMISSIVE, &reduced);
+        if (r < 0)
+                return log_error_errno(r, "Failed to prepare home record for LUKS: %m");
+
+        r = format_luks_token_text(cd, reduced, volume_key, &text);
+        if (r < 0)
+                return r;
+
+        r = sym_crypt_token_json_set(cd, CRYPT_ANY_TOKEN, text);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set LUKS JSON token: %m");
+
+        log_info("Writing user record as LUKS token completed.");
+
+        if (ret)
+                *ret = TAKE_PTR(cd);
+
+        return 0;
+}
+
+/* Creates a fresh GPT disk label on the image/device referenced by 'fd' and adds a single partition of
+ * type SD_GPT_USER_HOME_STR to it, named 'label' and carrying the partition UUID 'uuid'. The partition
+ * spans the usable area between the first and last usable LBA reported by libfdisk, with the start
+ * rounded up and the end rounded down via DISK_SIZE_ROUND_UP()/DISK_SIZE_ROUND_DOWN().
+ *
+ * On success returns 0 and stores the partition's byte offset in *ret_offset, its byte size in *ret_size
+ * and the UUID of the new disk label in *ret_disk_uuid. On failure a negative errno-style error is
+ * returned (after logging) and none of the return parameters are written.
+ *
+ * NOTE(review): LBAs are converted to bytes with a fixed factor of 512, independently of 'sector_size'
+ * — confirm this matches the units libfdisk reports for non-512 sector sizes. */
+static int make_partition_table(
+                int fd,
+                uint32_t sector_size,
+                const char *label,
+                sd_id128_t uuid,
+                uint64_t *ret_offset,
+                uint64_t *ret_size,
+                sd_id128_t *ret_disk_uuid) {
+
+        _cleanup_(fdisk_unref_partitionp) struct fdisk_partition *p = NULL, *q = NULL;
+        _cleanup_(fdisk_unref_parttypep) struct fdisk_parttype *t = NULL;
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL;
+        _cleanup_free_ char *disk_uuid_as_string = NULL;
+        uint64_t offset, size, first_lba, start, last_lba, end;
+        sd_id128_t disk_uuid;
+        int r;
+
+        assert(fd >= 0);
+        assert(label);
+        assert(ret_offset);
+        assert(ret_size);
+        assert(ret_disk_uuid); /* Written unconditionally on success, hence must be valid like the others */
+
+        t = fdisk_new_parttype();
+        if (!t)
+                return log_oom();
+
+        r = fdisk_parttype_set_typestr(t, SD_GPT_USER_HOME_STR);
+        if (r < 0)
+                return log_error_errno(r, "Failed to initialize partition type: %m");
+
+        r = fdisk_new_context_at(fd, /* path= */ NULL, /* read_only= */ false, sector_size, &c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open device: %m");
+
+        r = fdisk_create_disklabel(c, "gpt");
+        if (r < 0)
+                return log_error_errno(r, "Failed to create GPT disk label: %m");
+
+        p = fdisk_new_partition();
+        if (!p)
+                return log_oom();
+
+        r = fdisk_partition_set_type(p, t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set partition type: %m");
+
+        r = fdisk_partition_partno_follow_default(p, 1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to place partition at first free partition index: %m");
+
+        first_lba = fdisk_get_first_lba(c); /* Boundary where usable space starts */
+        assert(first_lba <= UINT64_MAX/512);
+        start = DISK_SIZE_ROUND_UP(first_lba * 512); /* Round up to multiple of 4K */
+
+        /* UINT64_MAX is the overflow marker of the rounding; check it before logging the value. */
+        if (start == UINT64_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Overflow while rounding up start LBA.");
+
+        log_debug("Starting partition at offset %" PRIu64, start);
+
+        last_lba = fdisk_get_last_lba(c); /* One sector before boundary where usable space ends */
+        assert(last_lba < UINT64_MAX/512);
+        end = DISK_SIZE_ROUND_DOWN((last_lba + 1) * 512); /* Round down to multiple of 4K */
+
+        if (end <= start)
+                return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Resulting partition size zero or negative.");
+
+        r = fdisk_partition_set_start(p, start / 512);
+        if (r < 0)
+                return log_error_errno(r, "Failed to place partition at offset %" PRIu64 ": %m", start);
+
+        r = fdisk_partition_set_size(p, (end - start) / 512);
+        if (r < 0)
+                return log_error_errno(r, "Failed to end partition at offset %" PRIu64 ": %m", end);
+
+        r = fdisk_partition_set_name(p, label);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set partition name: %m");
+
+        r = fdisk_partition_set_uuid(p, SD_ID128_TO_UUID_STRING(uuid));
+        if (r < 0)
+                return log_error_errno(r, "Failed to set partition UUID: %m");
+
+        r = fdisk_add_partition(c, p, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add partition: %m");
+
+        r = fdisk_write_disklabel(c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write disk label: %m");
+
+        r = fdisk_get_disklabel_id(c, &disk_uuid_as_string);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine disk label UUID: %m");
+
+        r = sd_id128_from_string(disk_uuid_as_string, &disk_uuid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse disk label UUID: %m");
+
+        /* Read the partition back, so that we report what was actually written rather than what we asked for. */
+        r = fdisk_get_partition(c, 0, &q);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read created partition metadata: %m");
+
+        assert(fdisk_partition_has_start(q));
+        offset = fdisk_partition_get_start(q);
+        if (offset > UINT64_MAX / 512U)
+                return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Partition offset too large.");
+
+        assert(fdisk_partition_has_size(q));
+        size = fdisk_partition_get_size(q);
+        if (size > UINT64_MAX / 512U)
+                return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Partition size too large.");
+
+        *ret_offset = offset * 512U;
+        *ret_size = size * 512U;
+        *ret_disk_uuid = disk_uuid;
+
+        return 0;
+}
+
+/* Returns true if 'host_size' is at least the minimal size minimal_size_by_fs_name() reports for
+ * 'fstype'. If that helper returns UINT64_MAX for the type, the answer is always false. */
+static bool supported_fs_size(const char *fstype, uint64_t host_size) {
+        uint64_t minimum = minimal_size_by_fs_name(fstype);
+
+        return minimum != UINT64_MAX && host_size >= minimum;
+}
+
+/* Blocks until 'path' exists, or until 45s have passed. Returns 0 once the path is there, -ETIMEDOUT if
+ * the deadline was hit, or another negative errno-style error if checking or watching failed.
+ *
+ * This is good to do since we return a /dev/disk/by-uuid/… link to our callers and they likely want to
+ * access it right away, hence we wait until udev has caught up with our changes and created the
+ * symlink. */
+static int wait_for_devlink(const char *path) {
+        _cleanup_close_ int inotify_fd = -EBADF;
+        usec_t deadline;
+        int r;
+
+        deadline = usec_add(now(CLOCK_MONOTONIC), 45 * USEC_PER_SEC);
+
+        for (;;) {
+                _cleanup_free_ char *dir = NULL;
+                usec_t t;
+
+                if (laccess(path, F_OK) >= 0)
+                        return 0; /* It's there, we are done */
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine whether %s exists: %m", path);
+
+                /* Not there yet. Allocate the inotify object lazily, the first time we actually have
+                 * to wait. */
+                if (inotify_fd < 0) {
+                        inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+                        if (inotify_fd < 0)
+                                return log_error_errno(errno, "Failed to allocate inotify fd: %m");
+                }
+
+                r = path_extract_directory(path, &dir);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract directory from device node path '%s': %m", path);
+
+                /* Watch the closest ancestor directory that already exists: start with the immediate
+                 * parent and walk upwards for as long as the directory is missing. */
+                for (;;) {
+                        _cleanup_free_ char *parent = NULL;
+
+                        log_info("Watching %s", dir);
+
+                        if (inotify_add_watch(inotify_fd, dir, IN_CREATE|IN_MOVED_TO|IN_ONLYDIR|IN_DELETE_SELF|IN_MOVE_SELF) >= 0)
+                                break;
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to add watch on %s: %m", dir);
+
+                        r = path_extract_directory(dir, &parent);
+                        if (r == -EADDRNOTAVAIL) /* No further parent, we reached the top */
+                                break;
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to extract directory from device node path '%s': %m", dir);
+
+                        free_and_replace(dir, parent);
+                }
+
+                t = now(CLOCK_MONOTONIC);
+                if (t >= deadline)
+                        return log_error_errno(SYNTHETIC_ERRNO(ETIMEDOUT), "Device link %s still hasn't shown up, giving up.", path);
+
+                r = fd_wait_for_event(inotify_fd, POLLIN, deadline - t);
+                if (ERRNO_IS_NEG_TRANSIENT(r))
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to watch inotify: %m");
+
+                /* We don't care what exactly happened, we simply recheck the path on the next iteration. */
+                (void) flush_fd(inotify_fd);
+        }
+}
+
+/* Determines the size to allocate for a new image file and stores it in *ret.
+ *
+ * The free space of the file system backing 'image_fd' (as reported by fstatfs(), rounded down) is the
+ * reference: an absolute size configured in 'h' is capped to it, a relative size is applied to it, and
+ * if neither is configured USER_DISK_SIZE_DEFAULT_PERCENT percent of it are used. The result is then
+ * raised to a lower boundary: the minimal size for 'fstype' plus GPT_LUKS2_OVERHEAD, but at least
+ * USER_DISK_SIZE_MIN.
+ *
+ * Returns 0 on success, a negative errno-style error otherwise (in which case *ret is not written). */
+static int calculate_initial_image_size(UserRecord *h, int image_fd, const char *fstype, uint64_t *ret) {
+        uint64_t available, minimum, size;
+        struct statfs sfs;
+
+        assert(h);
+        assert(image_fd >= 0);
+        assert(ret);
+
+        if (fstatfs(image_fd, &sfs) < 0)
+                return log_error_errno(errno, "statfs() on image failed: %m");
+
+        available = DISK_SIZE_ROUND_DOWN((uint64_t) sfs.f_bsize * sfs.f_bavail);
+
+        if (h->disk_size != UINT64_MAX)
+                /* Absolute size configured: take it, but never more than what is available */
+                size = MIN(DISK_SIZE_ROUND_DOWN(h->disk_size), available);
+        else if (h->disk_size_relative != UINT64_MAX) {
+                /* Relative size configured: a fraction of UINT32_MAX, applied to the available space */
+                size = DISK_SIZE_ROUND_DOWN((uint64_t) ((double) available * (double) CLAMP(h->disk_size_relative, 0U, UINT32_MAX) / (double) UINT32_MAX));
+
+                log_info("Sizing home to %" PRIu64 ".%01" PRIu64 "%% of available disk space, which is %s.",
+                         (h->disk_size_relative * 100) / UINT32_MAX,
+                         ((h->disk_size_relative * 1000) / UINT32_MAX) % 10,
+                         FORMAT_BYTES(size));
+        } else {
+                /* Nothing configured: use the default percentage */
+                if (available > UINT64_MAX / USER_DISK_SIZE_DEFAULT_PERCENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "Disk size too large.");
+
+                size = DISK_SIZE_ROUND_DOWN(available * USER_DISK_SIZE_DEFAULT_PERCENT / 100);
+
+                log_info("Sizing home to %u%% of available disk space, which is %s.",
+                         USER_DISK_SIZE_DEFAULT_PERCENT,
+                         FORMAT_BYTES(size));
+        }
+
+        minimum = minimal_size_by_fs_name(fstype);
+        if (minimum != UINT64_MAX) {
+                /* The file system needs room for the partition table and LUKS2 header on top */
+                assert(GPT_LUKS2_OVERHEAD < UINT64_MAX - minimum);
+                minimum += GPT_LUKS2_OVERHEAD;
+        }
+        if (minimum == UINT64_MAX || minimum < USER_DISK_SIZE_MIN)
+                minimum = USER_DISK_SIZE_MIN;
+
+        *ret = size < minimum ? minimum : size;
+        return 0;
+}
+
+/* Sizes the image file 'fd' to 'size' bytes. If user_record_luks_discard() is on for 'h' the file is
+ * merely extended with ftruncate(), otherwise the space is allocated with fallocate(), falling back to
+ * ftruncate() if the backing file system does not support allocation.
+ *
+ * Returns > 0 if the space was allocated, 0 if the file was only truncated to size, -ENOSPC if the
+ * disk is too full, or another negative errno-style error. */
+static int home_truncate(
+                UserRecord *h,
+                int fd,
+                uint64_t size) {
+
+        bool allocated;
+        int r;
+
+        assert(h);
+        assert(fd >= 0);
+
+        if (user_record_luks_discard(h)) {
+                r = ftruncate(fd, size);
+                allocated = false;
+        } else {
+                r = fallocate(fd, 0, 0, size);
+                allocated = true;
+
+                if (r < 0 && ERRNO_IS_NOT_SUPPORTED(errno)) {
+                        /* Some file systems do not support fallocate(), let's gracefully degrade
+                         * (ZFS, reiserfs, …) and fall back to truncation */
+                        log_notice_errno(errno, "Backing file system does not support fallocate(), falling back to ftruncate(), i.e. implicitly using non-discard mode.");
+
+                        r = ftruncate(fd, size);
+                        allocated = false;
+                }
+        }
+
+        if (r >= 0)
+                return allocated; /* == 0 if we merely truncated, > 0 if we really allocated */
+
+        if (ERRNO_IS_DISK_SPACE(errno)) {
+                log_debug_errno(errno, "Not enough disk space to allocate home of size %s.", FORMAT_BYTES(size));
+                return -ENOSPC; /* make recognizable */
+        }
+
+        return log_error_errno(errno, "Failed to truncate home image: %m");
+}
+
+/* Creates a new LUKS2 home directory for user record 'h'.
+ *
+ * The backing storage is either a block device (if the image path is below /dev/) or a regular image
+ * file, which is allocated under a temporary name and renamed into place at the very end. In both cases
+ * a GPT partition table with a single partition is written, the partition is attached via a loopback
+ * device, formatted as LUKS2, a file system is created inside, mounted and populated. Afterwards the
+ * mount, the DM device and the loopback device are released again and the image fd is closed.
+ *
+ * 'setup' receives the intermediate resources (image fd, temporary image path, loop device, crypt
+ * device, mount state, …) as they are acquired. On success 0 is returned and *ret_home is set to a
+ * clone of 'h' with the binding information of the new home added. On failure a negative errno-style
+ * error is returned; -ENOLINK specifically indicates that loopback block devices are not available. */
+int home_create_luks(
+                UserRecord *h,
+                HomeSetup *setup,
+                const PasswordCache *cache,
+                char **effective_passwords,
+                UserRecord **ret_home) {
+
+        _cleanup_free_ char *subdir = NULL, *disk_uuid_path = NULL;
+        uint64_t encrypted_size,
+                host_size = 0, partition_offset = 0, partition_size = 0; /* Unnecessary initialization to appease gcc */
+        _cleanup_(user_record_unrefp) UserRecord *new_home = NULL;
+        sd_id128_t partition_uuid, fs_uuid, luks_uuid, disk_uuid;
+        _cleanup_close_ int mount_fd = -EBADF;
+        const char *fstype, *ip;
+        struct statfs sfs;
+        int r;
+        _cleanup_strv_free_ char **extra_mkfs_options = NULL;
+
+        assert(h);
+        assert(h->storage < 0 || h->storage == USER_LUKS);
+        assert(setup);
+        assert(!setup->temporary_image_path);
+        assert(setup->image_fd < 0);
+        assert(ret_home);
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        assert_se(ip = user_record_image_path(h));
+
+        fstype = user_record_file_system_type(h);
+        if (!supported_fstype(fstype))
+                return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "Unsupported file system type: %s", fstype);
+
+        r = mkfs_exists(fstype);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if mkfs binary for %s exists: %m", fstype);
+        if (r == 0) {
+                if (h->file_system_type || streq(fstype, "ext4") || !supported_fstype("ext4"))
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkfs binary for file system type %s does not exist.", fstype);
+
+                /* If the record does not explicitly declare a file system to use, and the compiled-in
+                 * default does not actually exist, then do an automatic fallback onto ext4, as the baseline
+                 * fs of Linux. We won't search for a working fs type here beyond ext4, i.e. nothing fancier
+                 * than a single, conservative fallback to baseline. This should be useful in minimal
+                 * environments where mkfs.btrfs or so are not made available, but mkfs.ext4 as Linux' most
+                 * boring, most basic fs is. */
+                log_info("Formatting tool for compiled-in default file system %s not available, falling back to ext4 instead.", fstype);
+                fstype = "ext4";
+        }
+
+        /* For each of the three UUIDs: use the one configured in the record, or pick a random one. */
+        if (sd_id128_is_null(h->partition_uuid)) {
+                r = sd_id128_randomize(&partition_uuid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire partition UUID: %m");
+        } else
+                partition_uuid = h->partition_uuid;
+
+        if (sd_id128_is_null(h->luks_uuid)) {
+                r = sd_id128_randomize(&luks_uuid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire LUKS UUID: %m");
+        } else
+                luks_uuid = h->luks_uuid;
+
+        if (sd_id128_is_null(h->file_system_uuid)) {
+                r = sd_id128_randomize(&fs_uuid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire file system UUID: %m");
+        } else
+                fs_uuid = h->file_system_uuid;
+
+        r = make_dm_names(h, setup);
+        if (r < 0)
+                return r;
+
+        /* Refuse if a DM device of the name we want to use is already around. */
+        r = access(setup->dm_node, F_OK);
+        if (r < 0) {
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine whether %s exists: %m", setup->dm_node);
+        } else
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Device mapper device %s already exists, refusing.", setup->dm_node);
+
+        if (path_startswith(ip, "/dev/")) {
+                _cleanup_free_ char *sysfs = NULL;
+                uint64_t block_device_size;
+                struct stat st;
+
+                /* Let's place the home directory on a real device, i.e. a USB stick or such */
+
+                setup->image_fd = open_image_file(h, ip, &st);
+                if (setup->image_fd < 0)
+                        return setup->image_fd;
+
+                if (!S_ISBLK(st.st_mode))
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Device is not a block device, refusing.");
+
+                /* The "partition" attribute exists in sysfs only for partition block devices */
+                if (asprintf(&sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
+                        return log_oom();
+                if (access(sysfs, F_OK) < 0) {
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to check whether %s exists: %m", sysfs);
+                } else
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Operating on partitions is currently not supported, sorry. Please specify a top-level block device.");
+
+                if (flock(setup->image_fd, LOCK_EX) < 0) /* make sure udev doesn't read from it while we operate on the device */
+                        return log_error_errno(errno, "Failed to lock block device %s: %m", ip);
+
+                if (ioctl(setup->image_fd, BLKGETSIZE64, &block_device_size) < 0)
+                        return log_error_errno(errno, "Failed to read block device size: %m");
+
+                if (h->disk_size == UINT64_MAX) {
+
+                        /* If a relative disk size is requested, apply it relative to the block device size */
+                        if (h->disk_size_relative < UINT32_MAX) {
+                                /* Scale in floating point (like calculate_initial_image_size() does): the
+                                 * 64-bit integer product of the device size and the fraction would wrap
+                                 * around for any device larger than a few GiB. */
+                                host_size = CLAMP(DISK_SIZE_ROUND_DOWN((uint64_t) ((double) block_device_size * (double) h->disk_size_relative / (double) UINT32_MAX)),
+                                                  USER_DISK_SIZE_MIN, USER_DISK_SIZE_MAX);
+                        } else
+                                host_size = block_device_size; /* Otherwise, take the full device */
+
+                } else if (h->disk_size > block_device_size)
+                        return log_error_errno(SYNTHETIC_ERRNO(EMSGSIZE), "Selected disk size larger than backing block device, refusing.");
+                else
+                        host_size = DISK_SIZE_ROUND_DOWN(h->disk_size);
+
+                if (!supported_fs_size(fstype, LESS_BY(host_size, GPT_LUKS2_OVERHEAD)))
+                        return log_error_errno(SYNTHETIC_ERRNO(ERANGE),
+                                               "Selected file system size too small for %s.", fstype);
+
+                /* After creation we should reference this partition by its UUID instead of the block
+                 * device. That's preferable since the user might have specified a device node such as
+                 * /dev/sdb to us, which might look very different when replugged. */
+                if (asprintf(&disk_uuid_path, "/dev/disk/by-uuid/" SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(luks_uuid)) < 0)
+                        return log_oom();
+
+                if (user_record_luks_discard(h) || user_record_luks_offline_discard(h)) {
+                        /* If we want online or offline discard, discard once before we start using things. */
+
+                        if (ioctl(setup->image_fd, BLKDISCARD, (uint64_t[]) { 0, block_device_size }) < 0)
+                                log_full_errno(errno == EOPNOTSUPP ? LOG_DEBUG : LOG_WARNING, errno,
+                                               "Failed to issue full-device BLKDISCARD on device, ignoring: %m");
+                        else
+                                log_info("Full device discard completed.");
+                }
+        } else {
+                _cleanup_free_ char *t = NULL;
+
+                /* Let's place the home directory in a regular image file, created under a temporary
+                 * name first. */
+
+                r = mkdir_parents(ip, 0755);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create parent directory of %s: %m", ip);
+
+                r = tempfn_random(ip, "homework", &t);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to derive temporary file name for %s: %m", ip);
+
+                setup->image_fd = open(t, O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0600);
+                if (setup->image_fd < 0)
+                        return log_error_errno(errno, "Failed to create home image %s: %m", t);
+
+                setup->temporary_image_path = TAKE_PTR(t);
+
+                r = chattr_full(setup->image_fd, NULL, FS_NOCOW_FL|FS_NOCOMP_FL, FS_NOCOW_FL|FS_NOCOMP_FL, NULL, NULL, CHATTR_FALLBACK_BITWISE);
+                if (r < 0 && r != -ENOANO) /* ENOANO → some bits didn't work; which we skip logging about because chattr_full() already debug logs about those flags */
+                        log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                       "Failed to set file attributes on %s, ignoring: %m", setup->temporary_image_path);
+
+                r = calculate_initial_image_size(h, setup->image_fd, fstype, &host_size);
+                if (r < 0)
+                        return r;
+
+                r = resize_image_loop(h, setup, 0, host_size, &host_size);
+                if (r < 0)
+                        return r;
+
+                log_info("Allocating image file completed.");
+        }
+
+        r = make_partition_table(
+                        setup->image_fd,
+                        user_record_luks_sector_size(h),
+                        user_record_user_name_and_realm(h),
+                        partition_uuid,
+                        &partition_offset,
+                        &partition_size,
+                        &disk_uuid);
+        if (r < 0)
+                return r;
+
+        log_info("Writing of partition table completed.");
+
+        r = loop_device_make(
+                        setup->image_fd,
+                        O_RDWR,
+                        partition_offset,
+                        partition_size,
+                        user_record_luks_sector_size(h),
+                        0,
+                        LOCK_EX,
+                        &setup->loop);
+        if (r < 0) {
+                if (r == -ENOENT) { /* this means /dev/loop-control doesn't exist, i.e. we are in a container
+                                     * or similar and loopback block devices are not available, return a
+                                     * recognizable error in this case. */
+                        log_error_errno(r, "Loopback block device support is not available on this system.");
+                        return -ENOLINK; /* Make recognizable */
+                }
+
+                /* The temporary path is only set when operating on an image file, otherwise name the device. */
+                return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path ?: ip);
+        }
+
+        log_info("Setting up loopback device %s completed.", setup->loop->node ?: ip);
+
+        r = luks_format(setup->loop->node,
+                        setup->dm_name,
+                        luks_uuid,
+                        user_record_user_name_and_realm(h),
+                        cache,
+                        effective_passwords,
+                        user_record_luks_discard(h) || user_record_luks_offline_discard(h),
+                        h,
+                        &setup->crypt_device);
+        if (r < 0)
+                return r;
+
+        setup->undo_dm = true;
+
+        r = block_get_size_by_path(setup->dm_node, &encrypted_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get encrypted block device size: %m");
+
+        log_info("Setting up LUKS device %s completed.", setup->dm_node);
+
+        r = mkfs_options_from_env("HOME", fstype, &extra_mkfs_options);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine mkfs command line options for '%s': %m", fstype);
+
+        r = make_filesystem(setup->dm_node,
+                            fstype,
+                            user_record_user_name_and_realm(h),
+                            /* root = */ NULL,
+                            fs_uuid,
+                            user_record_luks_discard(h),
+                            /* quiet = */ true,
+                            /* sector_size = */ 0,
+                            extra_mkfs_options);
+        if (r < 0)
+                return r;
+
+        log_info("Formatting file system completed.");
+
+        r = home_unshare_and_mount(setup->dm_node, fstype, user_record_luks_discard(h), user_record_mount_flags(h), h->luks_extra_mount_options);
+        if (r < 0)
+                return r;
+
+        setup->undo_mount = true;
+
+        subdir = path_join(HOME_RUNTIME_WORK_DIR, user_record_user_name_and_realm(h));
+        if (!subdir)
+                return log_oom();
+
+        /* Prefer using a btrfs subvolume if we can, fall back to directory otherwise */
+        r = btrfs_subvol_make_fallback(AT_FDCWD, subdir, 0700);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create user directory in mounted image file: %m");
+
+        setup->root_fd = open(subdir, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+        if (setup->root_fd < 0)
+                return log_error_errno(errno, "Failed to open user directory in mounted image file: %m");
+
+        (void) home_shift_uid(setup->root_fd, NULL, UID_NOBODY, h->uid, &mount_fd);
+
+        if (mount_fd >= 0) {
+                /* If we have established a new mount, then we can use that as new root fd to our home directory. */
+                safe_close(setup->root_fd);
+
+                setup->root_fd = fd_reopen(mount_fd, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+                if (setup->root_fd < 0)
+                        return log_error_errno(setup->root_fd, "Unable to convert mount fd into proper directory fd: %m");
+
+                mount_fd = safe_close(mount_fd);
+        }
+
+        r = home_populate(h, setup->root_fd);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup->root_fd, &sfs);
+        if (r < 0)
+                return r;
+
+        r = user_record_clone(h, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_LOG|USER_RECORD_PERMISSIVE, &new_home);
+        if (r < 0)
+                return log_error_errno(r, "Failed to clone record: %m");
+
+        r = user_record_add_binding(
+                        new_home,
+                        USER_LUKS,
+                        disk_uuid_path ?: ip,
+                        partition_uuid,
+                        luks_uuid,
+                        fs_uuid,
+                        sym_crypt_get_cipher(setup->crypt_device),
+                        sym_crypt_get_cipher_mode(setup->crypt_device),
+                        luks_volume_key_size_convert(setup->crypt_device),
+                        fstype,
+                        NULL,
+                        h->uid,
+                        (gid_t) h->uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add binding to record: %m");
+
+        if (user_record_luks_offline_discard(h)) {
+                r = run_fitrim(setup->root_fd);
+                if (r < 0)
+                        return r;
+        }
+
+        setup->root_fd = safe_close(setup->root_fd);
+
+        r = home_setup_undo_mount(setup, LOG_ERR);
+        if (r < 0)
+                return r;
+
+        r = home_setup_undo_dm(setup, LOG_ERR);
+        if (r < 0)
+                return r;
+
+        setup->loop = loop_device_unref(setup->loop);
+
+        if (!user_record_luks_offline_discard(h)) {
+                r = run_fallocate(setup->image_fd, NULL /* refresh stat() data */);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Sync everything to disk before we move things into place under the final name. */
+        if (fsync(setup->image_fd) < 0)
+                return log_error_errno(errno, "Failed to synchronize image to disk: %m");
+
+        if (disk_uuid_path)
+                /* Reread partition table if this is a block device */
+                (void) ioctl(setup->image_fd, BLKRRPART, 0);
+        else {
+                assert(setup->temporary_image_path);
+
+                if (rename(setup->temporary_image_path, ip) < 0)
+                        return log_error_errno(errno, "Failed to rename image file: %m");
+
+                setup->temporary_image_path = mfree(setup->temporary_image_path);
+
+                /* If we operate on a file, sync the containing directory too. */
+                r = fsync_directory_of_file(setup->image_fd);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to synchronize directory of image file to disk: %m");
+
+                log_info("Moved image file into place.");
+        }
+
+        /* Let's close the image fd now. If we are operating on a real block device this will release the BSD
+         * lock that ensures udev doesn't interfere with what we are doing */
+        setup->image_fd = safe_close(setup->image_fd);
+
+        if (disk_uuid_path)
+                (void) wait_for_devlink(disk_uuid_path);
+
+        log_info("Creation completed.");
+
+        print_size_summary(host_size, encrypted_size, &sfs);
+
+        log_debug("GPT + LUKS2 overhead is %" PRIu64 " (expected %" PRIu64 ")", host_size - encrypted_size, GPT_LUKS2_OVERHEAD);
+
+        *ret_home = TAKE_PTR(new_home);
+        return 0;
+}
+
+/* Checks whether the device mapper node for the home of 'h' currently exists. The DM names are
+ * initialized in 'setup' via make_dm_names() as a side effect.
+ *
+ * Returns > 0 if the node exists, 0 if it doesn't, and a negative errno-style error on failure. */
+int home_get_state_luks(UserRecord *h, HomeSetup *setup) {
+        int r;
+
+        assert(h);
+        assert(setup);
+
+        r = make_dm_names(h, setup);
+        if (r < 0)
+                return r;
+
+        if (access(setup->dm_node, F_OK) >= 0)
+                return 1; /* Node is there */
+
+        if (errno == ENOENT)
+                return 0; /* Node is not there */
+
+        return log_error_errno(errno, "Failed to determine whether %s exists: %m", setup->dm_node);
+}
+
+enum { /* Non-negative results of can_resize_fs() */
+        CAN_RESIZE_ONLINE, /* Returned for every supported resize except shrinking ext4 */
+        CAN_RESIZE_OFFLINE, /* Returned when an ext4 file system is to be shrunk */
+};
+
+/* Checks whether the file system 'fd' refers to can be resized from 'old_size' to 'new_size' (both in
+ * bytes, both required to be multiples of 512).
+ *
+ * Returns CAN_RESIZE_ONLINE or CAN_RESIZE_OFFLINE if the resize is possible, otherwise a negative
+ * errno-style error (after logging):
+ *   -EINVAL           → a size is 0, UINT64_MAX or not a multiple of 512
+ *   -ERANGE           → 'new_size' is below the minimal size of the file system type
+ *   -EMSGSIZE         → shrinking was requested but the file system (xfs) cannot shrink
+ *   -ESOCKTNOSUPPORT  → the file system type is neither btrfs, xfs nor ext4 */
+static int can_resize_fs(int fd, uint64_t old_size, uint64_t new_size) {
+        struct statfs sfs;
+
+        assert(fd >= 0);
+
+        /* Filter out bogus requests early */
+        if (old_size == 0 || old_size == UINT64_MAX ||
+            new_size == 0 || new_size == UINT64_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid resize parameters.");
+
+        if ((old_size & 511) != 0 || (new_size & 511) != 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Resize parameters not multiple of 512.");
+
+        if (fstatfs(fd, &sfs) < 0)
+                return log_error_errno(errno, "Failed to fstatfs() file system: %m");
+
+        if (is_fs_type(&sfs, BTRFS_SUPER_MAGIC)) {
+
+                if (new_size < BTRFS_MINIMAL_SIZE)
+                        return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "New file system size too small for btrfs (needs to be 256M at least).");
+
+                /* btrfs can grow and shrink online */
+
+        } else if (is_fs_type(&sfs, XFS_SB_MAGIC)) {
+
+                if (new_size < XFS_MINIMAL_SIZE)
+                        return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "New file system size too small for xfs (needs to be 14M at least).");
+
+                /* XFS can grow, but not shrink */
+                if (new_size < old_size)
+                        return log_error_errno(SYNTHETIC_ERRNO(EMSGSIZE), "Shrinking this type of file system is not supported.");
+
+        } else if (is_fs_type(&sfs, EXT4_SUPER_MAGIC)) {
+
+                if (new_size < EXT4_MINIMAL_SIZE)
+                        return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "New file system size too small for ext4 (needs to be 1M at least).");
+
+                /* ext4 can grow online, and shrink offline */
+                if (new_size < old_size)
+                        return CAN_RESIZE_OFFLINE;
+
+        } else
+                return log_error_errno(SYNTHETIC_ERRNO(ESOCKTNOSUPPORT), "Resizing this type of file system is not supported.");
+
+        return CAN_RESIZE_ONLINE;
+}
+
+/* Resizes the ext4 file system on setup->dm_node to 'new_size' bytes while it is unmounted.
+ *
+ * If 'setup' has a root fd open and/or the file system mounted, these are closed/unmounted first. Then
+ * "e2fsck -fp" and "resize2fs" are run on the node, and finally whatever was released initially is
+ * re-established: the mount via home_mount_node() with 'discard', 'flags' and 'extra_mount_options',
+ * and the root fd by reopening HOME_RUNTIME_WORK_DIR.
+ *
+ * Returns 0 on success, a negative errno-style error otherwise. On failure the mount and root fd are
+ * not re-established here. */
+static int ext4_offline_resize_fs(
+                HomeSetup *setup,
+                uint64_t new_size,
+                bool discard,
+                unsigned long flags,
+                const char *extra_mount_options) {
+
+        bool reopen_root = false, remount = false;
+        _cleanup_free_ char *sectors_arg = NULL;
+        pid_t fsck_pid, resize_pid;
+        int fsck_status, r;
+
+        assert(setup);
+        assert(setup->dm_node);
+
+        /* Step 1: release the file system, remembering what we have to bring back later */
+        if (setup->root_fd >= 0) {
+                setup->root_fd = safe_close(setup->root_fd);
+                reopen_root = true;
+        }
+
+        if (setup->undo_mount) {
+                r = home_setup_undo_mount(setup, LOG_ERR);
+                if (r < 0)
+                        return r;
+
+                remount = true;
+        }
+
+        log_info("Temporary unmounting of file system completed.");
+
+        /* Step 2: force a file system check, since resize2fs insists on one having been done */
+        r = safe_fork("(e2fsck)",
+                      FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_STDOUT_TO_STDERR|FORK_CLOSE_ALL_FDS,
+                      &fsck_pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child */
+                execlp("e2fsck", "e2fsck", "-fp", setup->dm_node, NULL);
+                log_open();
+                log_error_errno(errno, "Failed to execute e2fsck: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        fsck_status = wait_for_terminate_and_check("e2fsck", fsck_pid, WAIT_LOG_ABNORMAL);
+        if (fsck_status < 0)
+                return fsck_status;
+        if (fsck_status & ~FSCK_ERROR_CORRECTED) {
+                /* Anything beyond "errors corrected" is worth a warning, but only some bits are fatal */
+                log_warning("e2fsck failed with exit status %i.", fsck_status);
+
+                if (fsck_status & (FSCK_SYSTEM_SHOULD_REBOOT|FSCK_ERRORS_LEFT_UNCORRECTED))
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "File system is corrupted, refusing.");
+
+                log_warning("Ignoring fsck error.");
+        }
+
+        log_info("Forced file system check completed.");
+
+        /* Step 3: do the resize. resize2fs doesn't take byte sizes, hence pass the size in 512 byte
+         * sectors (that's what the "s" suffix is for) */
+        if (asprintf(&sectors_arg, "%" PRIu64 "s", new_size / 512) < 0)
+                return log_oom();
+
+        r = safe_fork("(e2resize)",
+                      FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_STDOUT_TO_STDERR|FORK_CLOSE_ALL_FDS,
+                      &resize_pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child */
+                execlp("resize2fs", "resize2fs", setup->dm_node, sectors_arg, NULL);
+                log_open();
+                log_error_errno(errno, "Failed to execute resize2fs: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        log_info("Offline file system resize completed.");
+
+        /* Step 4: bring back the mount and the directory fd, if we took them away above */
+        if (remount) {
+                r = home_mount_node(setup->dm_node, "ext4", discard, flags, extra_mount_options);
+                if (r < 0)
+                        return r;
+
+                setup->undo_mount = true;
+        }
+
+        if (reopen_root) {
+                setup->root_fd = open(HOME_RUNTIME_WORK_DIR, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                if (setup->root_fd < 0)
+                        return log_error_errno(errno, "Failed to reopen file system: %m");
+        }
+
+        log_info("File system mounted again.");
+
+        return 0;
+}
+
+/* Locates the partition we are about to resize in the GPT partition table of the device referenced by 'fd'.
+ *
+ * 'partition_offset' and 'old_partition_size' are byte values and must be multiples of 512. A partition
+ * matches if its start and size (compared in units of 512 byte sectors) are equal to these two values.
+ *
+ * Return value:
+ *     0 → 'partition_offset' is zero, i.e. we assume a naked device without partition table. The three
+ *         return parameters are reset to SD_ID128_NULL/NULL.
+ *     1 → exactly one matching partition was found. '*ret_disk_uuid' is the parsed disk label ID,
+ *         '*ret_table' the partition table (ownership is passed to the caller) and '*ret_partition' the
+ *         matching entry. No extra reference is taken on the entry here, it is the pointer handed out by
+ *         the returned table.
+ *   < 0 → errno-style error: no GPT label (ENOMEDIUM), no match (ENOPKG), more than one match (ENOTUNIQ),
+ *         a used partition lacking start/size/end or some other used partition ending behind our start
+ *         offset (EINVAL), or a libfdisk failure. */
+static int prepare_resize_partition(
+                int fd,
+                uint64_t partition_offset,
+                uint64_t old_partition_size,
+                sd_id128_t *ret_disk_uuid,
+                struct fdisk_table **ret_table,
+                struct fdisk_partition **ret_partition) {
+
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL;
+        _cleanup_(fdisk_unref_tablep) struct fdisk_table *t = NULL;
+        _cleanup_free_ char *disk_uuid_as_string = NULL;
+        struct fdisk_partition *found = NULL;
+        sd_id128_t disk_uuid;
+        size_t n_partitions;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret_disk_uuid);
+        assert(ret_table);
+        assert(ret_partition); /* written unconditionally below, hence must be valid just like the other two */
+
+        assert((partition_offset & 511) == 0);
+        assert((old_partition_size & 511) == 0);
+        assert(UINT64_MAX - old_partition_size >= partition_offset);
+
+        if (partition_offset == 0) {
+                /* If the offset is at the beginning we assume no partition table, let's exit early. */
+                log_debug("Not rewriting partition table, operating on naked device.");
+                *ret_disk_uuid = SD_ID128_NULL;
+                *ret_table = NULL;
+                *ret_partition = NULL;
+                return 0;
+        }
+
+        r = fdisk_new_context_at(fd, /* path= */ NULL, /* read_only= */ false, UINT32_MAX, &c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open device: %m");
+
+        if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOMEDIUM), "Disk has no GPT partition table.");
+
+        r = fdisk_get_disklabel_id(c, &disk_uuid_as_string);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire disk UUID: %m");
+
+        r = sd_id128_from_string(disk_uuid_as_string, &disk_uuid);
+        if (r < 0)
+                return log_error_errno(r, "Failed parse disk UUID: %m");
+
+        r = fdisk_get_partitions(c, &t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire partition table: %m");
+
+        /* Walk the whole table: we need to make sure our partition exists exactly once, and that nothing
+         * else is placed behind it. */
+        n_partitions = fdisk_table_get_nents(t);
+        for (size_t i = 0; i < n_partitions; i++)  {
+                struct fdisk_partition *p;
+
+                p = fdisk_table_get_partition(t, i);
+                if (!p)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to read partition metadata: %m");
+
+                if (fdisk_partition_is_used(p) <= 0)
+                        continue;
+                if (fdisk_partition_has_start(p) <= 0 || fdisk_partition_has_size(p) <= 0 || fdisk_partition_has_end(p) <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Found partition without a size.");
+
+                if (fdisk_partition_get_start(p) == partition_offset / 512U &&
+                    fdisk_partition_get_size(p) == old_partition_size / 512U) {
+
+                        if (found)
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Partition found twice, refusing.");
+
+                        found = p;
+                } else if (fdisk_partition_get_end(p) > partition_offset / 512U)
+                        /* Some other partition ends behind the start of ours, hence ours can't be grown in place. */
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Can't extend, not last partition in image.");
+        }
+
+        if (!found)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "Failed to find matching partition to resize.");
+
+        *ret_disk_uuid = disk_uuid;
+        *ret_table = TAKE_PTR(t);
+        *ret_partition = found;
+
+        return 1;
+}
+
+/* Calculates the largest size (in bytes) the partition 'p' may be grown to on the device referenced by 'fd':
+ * the distance between the partition's start and the end of the usable area as reported by libfdisk, with
+ * both the end position and the resulting size passed through DISK_SIZE_ROUND_DOWN(). Sector numbers are
+ * converted to bytes with a fixed factor of 512. Returns 1 on success, negative errno-style value on
+ * failure. */
+static int get_maximum_partition_size(
+                int fd,
+                struct fdisk_partition *p,
+                uint64_t *ret_maximum_partition_size) {
+
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *ctx = NULL;
+        uint64_t first_sector, final_sector, begin_bytes, limit_bytes;
+        int r;
+
+        assert(fd >= 0);
+        assert(p);
+        assert(ret_maximum_partition_size);
+
+        /* A read-only context is sufficient, we only query the geometry here */
+        r = fdisk_new_context_at(fd, /* path= */ NULL, /* read_only= */ true, /* sector_size= */ UINT32_MAX, &ctx);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create fdisk context: %m");
+
+        first_sector = fdisk_partition_get_start(p);
+        assert(first_sector <= UINT64_MAX/512);
+
+        /* The last LBA is the final usable sector, i.e. the usable area ends one sector after it */
+        final_sector = fdisk_get_last_lba(ctx);
+        assert(final_sector < UINT64_MAX/512);
+
+        begin_bytes = first_sector * 512;
+        limit_bytes = DISK_SIZE_ROUND_DOWN((final_sector + 1) * 512);
+
+        if (limit_bytes < begin_bytes)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Last LBA is before partition start.");
+
+        *ret_maximum_partition_size = DISK_SIZE_ROUND_DOWN(limit_bytes - begin_bytes);
+        return 1;
+}
+
+/* Query callback installed into libfdisk via fdisk_set_ask(). 'userdata' must point to an sd_id128_t.
+ *
+ * Questions of type FDISK_ASKTYPE_STRING are answered with that ID formatted as UUID string. Any other
+ * question type is logged at debug level and otherwise ignored. Returns 0 on success, a negative
+ * errno-style value on failure. */
+static int ask_cb(struct fdisk_context *c, struct fdisk_ask *ask, void *userdata) {
+        assert(c);
+        assert(ask);
+
+        switch (fdisk_ask_get_type(ask)) {
+
+        case FDISK_ASKTYPE_STRING: {
+                _cleanup_free_ char *result = NULL;
+                int r;
+
+                assert(userdata);
+
+                result = new(char, SD_ID128_UUID_STRING_MAX);
+                if (!result)
+                        return log_oom();
+
+                r = fdisk_ask_string_set_result(ask, sd_id128_to_uuid_string(*(sd_id128_t*) userdata, result));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to pass disk UUID to libfdisk: %m");
+
+                /* Per the libfdisk documentation the library deallocates the result buffer once the question
+                 * has been processed, hence make sure we don't free it ourselves from here on. */
+                TAKE_PTR(result);
+                break;
+        }
+
+        default:
+                log_debug("Unexpected question from libfdisk, ignoring.");
+        }
+
+        return 0;
+}
+
+/* Writes a fresh GPT partition table to the device referenced by 'fd', in which partition 'p' (an entry of
+ * table 't') has the size 'new_partition_size' (in bytes, converted to 512 byte sectors).
+ *
+ * The first two sectors of the device are zeroed out, then a new GPT label is created, the entries of 't'
+ * are applied to it, the disk label ID is set to 'disk_uuids' (handed to libfdisk through ask_cb()) and the
+ * label is written.
+ *
+ * 't' and 'p' must either both be set or both be NULL. In the latter case there is no partition table to
+ * update and 0 is returned without touching the device. Returns 1 after the new table has been written,
+ * and a negative errno-style value on failure.
+ *
+ * Note that 'new_partition_size' is a uint64_t rather than a size_t: the callers pass 64-bit byte counts,
+ * which must not be truncated on architectures where size_t is 32 bits wide. */
+static int apply_resize_partition(
+                int fd,
+                sd_id128_t disk_uuids,
+                struct fdisk_table *t,
+                struct fdisk_partition *p,
+                uint64_t new_partition_size) {
+
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL;
+        _cleanup_free_ void *two_zero_lbas = NULL;
+        size_t wipe_size;
+        uint32_t ssz;
+        ssize_t n;
+        int r;
+
+        assert(fd >= 0);
+        assert(!t == !p);
+
+        if (!t) /* no partition table to apply, exit early */
+                return 0;
+
+        assert(p);
+
+        /* Before writing our partition patch the final size in */
+        r = fdisk_partition_size_explicit(p, 1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable explicit partition size: %m");
+
+        r = fdisk_partition_set_size(p, new_partition_size / 512U);
+        if (r < 0)
+                return log_error_errno(r, "Failed to change partition size: %m");
+
+        r = probe_sector_size(fd, &ssz);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine current sector size: %m");
+
+        /* Do the multiplication in size_t, and only once, so that buffer size, write size and the short
+         * write check below are guaranteed to agree. */
+        wipe_size = (size_t) ssz * 2;
+
+        two_zero_lbas = malloc0(wipe_size);
+        if (!two_zero_lbas)
+                return log_oom();
+
+        /* libfdisk appears to get confused by the existing PMBR. Let's explicitly flush it out. */
+        n = pwrite(fd, two_zero_lbas, wipe_size, 0);
+        if (n < 0)
+                return log_error_errno(errno, "Failed to wipe partition table: %m");
+        if ((size_t) n != wipe_size)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while wiping partition table.");
+
+        r = fdisk_new_context_at(fd, /* path= */ NULL, /* read_only= */ false, ssz, &c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open device: %m");
+
+        r = fdisk_create_disklabel(c, "gpt");
+        if (r < 0)
+                return log_error_errno(r, "Failed to create GPT disk label: %m");
+
+        r = fdisk_apply_table(c, t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to apply partition table: %m");
+
+        /* Install the callback that supplies the disk UUID before asking libfdisk to change the label ID */
+        r = fdisk_set_ask(c, ask_cb, &disk_uuids);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set libfdisk query function: %m");
+
+        r = fdisk_set_disklabel_id(c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to change disklabel ID: %m");
+
+        r = fdisk_write_disklabel(c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write disk label: %m");
+
+        return 1;
+}
+
+/* Never let the free space drop below 16M, so that logging in (which updates the user record) stays possible */
+#define HOME_MIN_FREE (16U*1024U*1024U)
+
+/* Calculates a lower bound (in bytes) for shrinking the file system referenced by 'fd': the space currently
+ * in use plus HOME_MIN_FREE, but no less than the minimal size known for the file system type and no less
+ * than USER_DISK_SIZE_MIN, rounded up via DISK_SIZE_ROUND_UP(). Returns 0 on success and a negative
+ * errno-style value on failure. */
+static int get_smallest_fs_size(int fd, uint64_t *ret) {
+        struct statfs fs_stats;
+        uint64_t floor_size, used_bytes;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        /* Flush pending writes first, so that the numbers we read below are accurate */
+        if (syncfs(fd) < 0)
+                return log_error_errno(errno, "Failed to synchronize home file system: %m");
+
+        if (fstatfs(fd, &fs_stats) < 0)
+                return log_error_errno(errno, "Failed to statfs() home file system: %m");
+
+        /* Lower limit imposed by the file system type, raised to our own general minimum if necessary */
+        floor_size = minimal_size_by_fs_magic(fs_stats.f_type);
+        if (floor_size == UINT64_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Don't know minimum file system size of file system type '%s' of home directory.", fs_type_to_string(fs_stats.f_type));
+
+        if (floor_size < USER_DISK_SIZE_MIN)
+                floor_size = USER_DISK_SIZE_MIN;
+
+        if (fs_stats.f_bfree > fs_stats.f_blocks)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Detected amount of free blocks is greater than the total amount of file system blocks. Refusing.");
+
+        /* Blocks in use, converted to bytes with an explicit overflow check */
+        used_bytes = fs_stats.f_blocks - fs_stats.f_bfree;
+        if (used_bytes > UINT64_MAX / fs_stats.f_bsize)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File system size out of range.");
+
+        used_bytes *= fs_stats.f_bsize;
+
+        /* Add the safety margin, saturating instead of wrapping around */
+        used_bytes = used_bytes > UINT64_MAX - HOME_MIN_FREE ? UINT64_MAX : used_bytes + HOME_MIN_FREE;
+
+        *ret = DISK_SIZE_ROUND_UP(MAX(used_bytes, floor_size));
+        return 0;
+}
+
+/* Calculates an upper bound (in bytes) for growing the image file referenced by 'fd', whose stat data is
+ * passed in 'st': the space the file already occupies plus the space still available on the backing file
+ * system, capped at USER_DISK_SIZE_MAX and rounded down via DISK_SIZE_ROUND_DOWN(). Refuses anything but
+ * regular files. Returns 0 on success and a negative errno-style value on failure. */
+static int get_largest_image_size(int fd, const struct stat *st, uint64_t *ret) {
+        struct statfs backing;
+        uint64_t occupied, spare, total;
+        int r;
+
+        assert(fd >= 0);
+        assert(st);
+        assert(ret);
+
+        r = stat_verify_regular(st);
+        if (r < 0)
+                return log_error_errno(r, "Image file is not a regular file, refusing: %m");
+
+        if (syncfs(fd) < 0)
+                return log_error_errno(errno, "Failed to synchronize file system backing image file: %m");
+
+        if (fstatfs(fd, &backing) < 0)
+                return log_error_errno(errno, "Failed to statfs() image file: %m");
+
+        occupied = (uint64_t) st->st_blocks * 512;                 /* what the image takes up already */
+        spare = (uint64_t) backing.f_bsize * backing.f_bavail;     /* what the backing fs could still provide */
+
+        /* Saturating addition of the two */
+        total = spare > UINT64_MAX - occupied ? UINT64_MAX : spare + occupied;
+
+        *ret = DISK_SIZE_ROUND_DOWN(MIN(total, USER_DISK_SIZE_MAX));
+        return 0;
+}
+
+/* Resizes the file system of the home directory from 'old_fs_size' towards 'new_fs_size' (both in bytes).
+ *
+ * If 'resize_type' is CAN_RESIZE_ONLINE the mounted file system behind setup->root_fd is resized via
+ * resize_fs(), otherwise ext4_offline_resize_fs() is used.
+ *
+ * When growing, exactly one attempt is made. When shrinking online and an attempt fails for lack of disk
+ * space, further sizes between the failed one and the last working one are tried by bisection.
+ *
+ * Returns 0 once the loop is done (even if only a size larger than 'new_fs_size' could be reached), and a
+ * negative errno-style value on failure. If 'ret_fs_size' is non-NULL it receives the last size that was
+ * applied successfully, which is 'old_fs_size' if no attempt succeeded. */
+static int resize_fs_loop(
+                UserRecord *h,
+                HomeSetup *setup,
+                int resize_type,
+                uint64_t old_fs_size,
+                uint64_t new_fs_size,
+                uint64_t *ret_fs_size) {
+
+        uint64_t current_fs_size;
+        unsigned n_iterations = 0;
+        int r;
+
+        assert(h);
+        assert(setup);
+        assert(setup->root_fd >= 0);
+
+        /* A bisection loop trying to find the closest size to what the user asked for. (Well, we bisect like
+         * this only when we *shrink* the fs — if we grow the fs there's no need to bisect.) */
+
+        /* lower_boundary: largest size known to be too small (initially the requested size),
+         * upper_boundary: smallest size known to work (initially the old size). */
+        current_fs_size = old_fs_size;
+        for (uint64_t lower_boundary = new_fs_size, upper_boundary = old_fs_size, try_fs_size = new_fs_size;;) {
+                bool worked;
+
+                n_iterations++;
+
+                /* Now resize the file system */
+                if (resize_type == CAN_RESIZE_ONLINE) {
+                        r = resize_fs(setup->root_fd, try_fs_size, NULL);
+                        if (r < 0) {
+                                if (!ERRNO_IS_DISK_SPACE(r) || new_fs_size > old_fs_size) /* Not a disk space issue? Not trying to shrink? */
+                                        return log_error_errno(r, "Failed to resize file system: %m");
+
+                                log_debug_errno(r, "Shrinking from %s to %s didn't work, not enough space for contained data.", FORMAT_BYTES(current_fs_size), FORMAT_BYTES(try_fs_size));
+                                worked = false;
+                        } else {
+                                log_debug("Successfully resized from %s to %s.", FORMAT_BYTES(current_fs_size), FORMAT_BYTES(try_fs_size));
+                                current_fs_size = try_fs_size;
+                                worked = true;
+                        }
+
+                        /* If we hit a disk space issue and are shrinking the fs, then maybe it helps to
+                         * increase the image size. */
+                } else {
+                        r = ext4_offline_resize_fs(setup, try_fs_size, user_record_luks_discard(h), user_record_mount_flags(h), h->luks_extra_mount_options);
+                        if (r < 0)
+                                return r;
+
+                        /* For now, when we fail to shrink an ext4 image we'll not try again via the
+                         * bisection logic. We might add that later, but given this involves shelling out
+                         * multiple programs, it's a bit too cumbersome for my taste. */
+
+                        worked = true;
+                        current_fs_size = try_fs_size;
+                }
+
+                if (new_fs_size > old_fs_size) /* If we are growing we are done after one iteration */
+                        break;
+
+                /* If we are shrinking then let's adjust our bisection boundaries and try again. */
+                if (worked)
+                        upper_boundary = MIN(upper_boundary, try_fs_size);
+                else
+                        lower_boundary = MAX(lower_boundary, try_fs_size);
+
+                /* Stop if nothing is left between the largest size known to fail and the smallest size
+                 * known to work. (This is also how we exit after a shrink that worked on first try.) */
+                if (lower_boundary >= upper_boundary) {
+                        log_debug("Image can't be shrunk further (range to try is empty).");
+                        break;
+                }
+
+                /* Let's find a new value to try half-way between the lower boundary and the upper boundary
+                 * to try now. If rounding moves it onto one of the boundaries there's nothing new to try. */
+                try_fs_size = DISK_SIZE_ROUND_DOWN(lower_boundary + (upper_boundary - lower_boundary) / 2);
+                if (try_fs_size <= lower_boundary || try_fs_size >= upper_boundary) {
+                        log_debug("Image can't be shrunk further (remaining range to try too small).");
+                        break;
+                }
+        }
+
+        log_debug("Bisection loop completed after %u iterations.", n_iterations);
+
+        if (ret_fs_size)
+                *ret_fs_size = current_fs_size;
+
+        return 0;
+}
+
+/* Resizes the image file behind setup->image_fd from 'old_image_size' towards 'new_image_size' (both in
+ * bytes) via home_truncate().
+ *
+ * When shrinking, or when home_truncate() reports plain truncation (return value 0), a single attempt is
+ * made. When growing via allocation and an attempt fails for lack of disk space, further sizes between the
+ * last working size and the failed one are tried by bisection.
+ *
+ * Returns 0 once the loop is done (even if only a size smaller than 'new_image_size' could be reached), and
+ * a negative errno-style value on failure. If 'ret_image_size' is non-NULL it receives the last size that
+ * was applied successfully, which is 'old_image_size' if no attempt succeeded. */
+static int resize_image_loop(
+                UserRecord *h,
+                HomeSetup *setup,
+                uint64_t old_image_size,
+                uint64_t new_image_size,
+                uint64_t *ret_image_size) {
+
+        /* low_mark: largest size known to work, high_mark: smallest size known to be too large */
+        uint64_t low_mark = old_image_size, high_mark = new_image_size;
+        uint64_t reached_size = old_image_size, attempt = new_image_size;
+        unsigned rounds = 0;
+        int r;
+
+        assert(h);
+        assert(setup);
+        assert(setup->image_fd >= 0);
+
+        for (;;) {
+                bool allocated;
+
+                rounds++;
+
+                r = home_truncate(h, setup->image_fd, attempt);
+                if (r == 0) {
+                        /* Success, but via truncation rather than allocation: bisecting is pointless in
+                         * that case, hence leave the loop right away. */
+                        log_debug("Resizing from %s to %s via truncation worked successfully.", FORMAT_BYTES(old_image_size), FORMAT_BYTES(attempt));
+                        reached_size = attempt;
+                        break;
+                }
+
+                allocated = r > 0;
+                if (allocated) {
+                        log_debug("Resizing from %s to %s via allocation worked successfully.", FORMAT_BYTES(reached_size), FORMAT_BYTES(attempt));
+                        reached_size = attempt;
+                } else {
+                        /* Only running out of disk space while growing is something we can recover from */
+                        if (!ERRNO_IS_DISK_SPACE(r) || new_image_size < old_image_size)
+                                return r;
+
+                        log_debug_errno(r, "Growing from %s to %s didn't work, not enough space on backing disk.", FORMAT_BYTES(reached_size), FORMAT_BYTES(attempt));
+                }
+
+                /* A shrink is complete after a single round */
+                if (new_image_size < old_image_size)
+                        break;
+
+                /* We are growing: narrow down the range that remains to be tried */
+                if (allocated)
+                        low_mark = MAX(low_mark, attempt);
+                else
+                        high_mark = MIN(high_mark, attempt);
+
+                if (low_mark >= high_mark) {
+                        log_debug("Image can't be grown further (range to try is empty).");
+                        break;
+                }
+
+                /* Next candidate: the (rounded down) middle of the remaining range */
+                attempt = DISK_SIZE_ROUND_DOWN(low_mark + (high_mark - low_mark) / 2);
+                if (attempt <= low_mark || attempt >= high_mark) {
+                        log_debug("Image can't be grown further (remaining range to try too small).");
+                        break;
+                }
+        }
+
+        log_debug("Bisection loop completed after %u iterations.", rounds);
+
+        if (ret_image_size)
+                *ret_image_size = reached_size;
+
+        return 0;
+}
+
+int home_resize_luks(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                HomeSetup *setup,
+                PasswordCache *cache,
+                UserRecord **ret_home) {
+
+        uint64_t old_image_size, new_image_size, old_fs_size, new_fs_size, crypto_offset, crypto_offset_bytes,
+                new_partition_size, smallest_fs_size, resized_fs_size;
+        _cleanup_(user_record_unrefp) UserRecord *header_home = NULL, *embedded_home = NULL, *new_home = NULL;
+        _cleanup_(fdisk_unref_tablep) struct fdisk_table *table = NULL;
+        struct fdisk_partition *partition = NULL;
+        _cleanup_close_ int opened_image_fd = -EBADF;
+        _cleanup_free_ char *whole_disk = NULL;
+        int r, resize_type, image_fd = -EBADF;
+        sd_id128_t disk_uuid;
+        const char *ip, *ipo;
+        struct statfs sfs;
+        struct stat st;
+        enum {
+                INTENTION_DONT_KNOW = 0,    /* These happen to match the return codes of CMP() */
+                INTENTION_SHRINK = -1,
+                INTENTION_GROW = 1,
+        } intention = INTENTION_DONT_KNOW;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_LUKS);
+        assert(setup);
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        assert_se(ipo = user_record_image_path(h));
+        ip = strdupa_safe(ipo); /* copy out since original might change later in home record object */
+
+        if (setup->image_fd < 0) {
+                setup->image_fd = open_image_file(h, NULL, &st);
+                if (setup->image_fd < 0)
+                        return setup->image_fd;
+        } else {
+                if (fstat(setup->image_fd, &st) < 0)
+                        return log_error_errno(errno, "Failed to stat image file %s: %m", ip);
+        }
+
+        image_fd = setup->image_fd;
+
+        if (S_ISBLK(st.st_mode)) {
+                dev_t parent;
+
+                r = block_get_whole_disk(st.st_rdev, &parent);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire whole block device for %s: %m", ip);
+                if (r > 0) {
+                        /* If we shall resize a file system on a partition device, then let's figure out the
+                         * whole disk device and operate on that instead, since we need to rewrite the
+                         * partition table to resize the partition. */
+
+                        log_info("Operating on partition device %s, using parent device.", ip);
+
+                        opened_image_fd = r = device_open_from_devnum(S_IFBLK, parent, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK, &whole_disk);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to open whole block device for %s: %m", ip);
+
+                        image_fd = opened_image_fd;
+
+                        if (fstat(image_fd, &st) < 0)
+                                return log_error_errno(errno, "Failed to stat whole block device %s: %m", whole_disk);
+                } else
+                        log_info("Operating on whole block device %s.", ip);
+
+                if (ioctl(image_fd, BLKGETSIZE64, &old_image_size) < 0)
+                        return log_error_errno(errno, "Failed to determine size of original block device: %m");
+
+                if (flock(image_fd, LOCK_EX) < 0) /* make sure udev doesn't read from it while we operate on the device */
+                        return log_error_errno(errno, "Failed to lock block device %s: %m", ip);
+
+                new_image_size = old_image_size; /* we can't resize physical block devices */
+        } else {
+                r = stat_verify_regular(&st);
+                if (r < 0)
+                        return log_error_errno(r, "Image %s is not a block device nor regular file: %m", ip);
+
+                old_image_size = st.st_size;
+
+                /* Note an asymmetry here: when we operate on loopback files the specified disk size we get we
+                 * apply onto the loopback file as a whole. When we operate on block devices we instead apply
+                 * to the partition itself only. */
+
+                if (FLAGS_SET(flags, HOME_SETUP_RESIZE_MINIMIZE)) {
+                        new_image_size = 0;
+                        intention = INTENTION_SHRINK;
+                } else {
+                        uint64_t new_image_size_rounded;
+
+                        new_image_size_rounded = DISK_SIZE_ROUND_DOWN(h->disk_size);
+
+                        if (old_image_size >= new_image_size_rounded && old_image_size <= h->disk_size) {
+                                /* If exact match, or a match after we rounded down, don't do a thing */
+                                log_info("Image size already matching, skipping operation.");
+                                return 0;
+                        }
+
+                        new_image_size = new_image_size_rounded;
+                        intention = CMP(new_image_size, old_image_size); /* Is this a shrink */
+                }
+        }
+
+        r = home_setup_luks(
+                        h,
+                        flags,
+                        whole_disk,
+                        setup,
+                        cache,
+                        FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES) ? NULL : &header_home);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES)) {
+                r = home_load_embedded_identity(h, setup->root_fd, header_home, USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL, cache, &embedded_home, &new_home);
+                if (r < 0)
+                        return r;
+        }
+
+        r = home_maybe_shift_uid(h, flags, setup);
+        if (r < 0)
+                return r;
+
+        log_info("offset = %" PRIu64 ", size = %" PRIu64 ", image = %" PRIu64, setup->partition_offset, setup->partition_size, old_image_size);
+
+        if ((UINT64_MAX - setup->partition_offset) < setup->partition_size ||
+            setup->partition_offset + setup->partition_size > old_image_size)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Old partition doesn't fit in backing storage, refusing.");
+
+        /* Get target partition information in here for new_partition_size calculation */
+        r = prepare_resize_partition(
+                        image_fd,
+                        setup->partition_offset,
+                        setup->partition_size,
+                        &disk_uuid,
+                        &table,
+                        &partition);
+        if (r < 0)
+                return r;
+
+        if (S_ISREG(st.st_mode)) {
+                uint64_t partition_table_extra, largest_size;
+
+                partition_table_extra = old_image_size - setup->partition_size;
+
+                r = get_largest_image_size(setup->image_fd, &st, &largest_size);
+                if (r < 0)
+                        return r;
+                if (new_image_size > largest_size)
+                        new_image_size = largest_size;
+
+                if (new_image_size < partition_table_extra)
+                        new_image_size = partition_table_extra;
+
+                new_partition_size = DISK_SIZE_ROUND_DOWN(new_image_size - partition_table_extra);
+        } else {
+                assert(S_ISBLK(st.st_mode));
+
+                if (FLAGS_SET(flags, HOME_SETUP_RESIZE_MINIMIZE)) {
+                        new_partition_size = 0;
+                        intention = INTENTION_SHRINK;
+                } else {
+                        uint64_t new_partition_size_rounded = DISK_SIZE_ROUND_DOWN(h->disk_size);
+
+                        if (h->disk_size == UINT64_MAX && partition) {
+                                r = get_maximum_partition_size(image_fd, partition, &new_partition_size_rounded);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        if (setup->partition_size >= new_partition_size_rounded &&
+                            setup->partition_size <= h->disk_size) {
+                                log_info("Partition size already matching, skipping operation.");
+                                return 0;
+                        }
+
+                        new_partition_size = new_partition_size_rounded;
+                        intention = CMP(new_partition_size, setup->partition_size);
+                }
+        }
+
+        if ((UINT64_MAX - setup->partition_offset) < new_partition_size ||
+            setup->partition_offset + new_partition_size > new_image_size)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "New partition doesn't fit into backing storage, refusing.");
+
+        crypto_offset = sym_crypt_get_data_offset(setup->crypt_device);
+        if (crypto_offset > UINT64_MAX/512U)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "LUKS2 data offset out of range, refusing.");
+        crypto_offset_bytes = (uint64_t) crypto_offset * 512U;
+        if (setup->partition_size <= crypto_offset_bytes)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Weird, old crypto payload offset doesn't actually fit in partition size?");
+
+        /* Make sure at least the LUKS header fit in */
+        if (new_partition_size <= crypto_offset_bytes) {
+                uint64_t add;
+
+                add = DISK_SIZE_ROUND_UP(crypto_offset_bytes) - new_partition_size;
+                new_partition_size += add;
+                if (S_ISREG(st.st_mode))
+                        new_image_size += add;
+        }
+
+        old_fs_size = setup->partition_size - crypto_offset_bytes;
+        new_fs_size = DISK_SIZE_ROUND_DOWN(new_partition_size - crypto_offset_bytes);
+
+        r = get_smallest_fs_size(setup->root_fd, &smallest_fs_size);
+        if (r < 0)
+                return r;
+
+        if (new_fs_size < smallest_fs_size) {
+                uint64_t add;
+
+                add = DISK_SIZE_ROUND_UP(smallest_fs_size) - new_fs_size;
+                new_fs_size += add;
+                new_partition_size += add;
+                if (S_ISREG(st.st_mode))
+                        new_image_size += add;
+        }
+
+        if (new_fs_size == old_fs_size) {
+                log_info("New file system size identical to old file system size, skipping operation.");
+                return 0;
+        }
+
+        if (FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_GROW) && new_fs_size > old_fs_size) {
+                log_info("New file system size would be larger than old, but shrinking requested, skipping operation.");
+                return 0;
+        }
+
+        if (FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_SHRINK) && new_fs_size < old_fs_size) {
+                log_info("New file system size would be smaller than old, but growing requested, skipping operation.");
+                return 0;
+        }
+
+        if (CMP(new_fs_size, old_fs_size) != intention) {
+                if (intention < 0)
+                        log_info("Shrink operation would enlarge file system, skipping operation.");
+                else {
+                        assert(intention > 0);
+                        log_info("Grow operation would shrink file system, skipping operation.");
+                }
+                return 0;
+        }
+
+        /* Before we start doing anything, let's figure out if we actually can */
+        resize_type = can_resize_fs(setup->root_fd, old_fs_size, new_fs_size);
+        if (resize_type < 0)
+                return resize_type;
+        if (resize_type == CAN_RESIZE_OFFLINE && FLAGS_SET(flags, HOME_SETUP_ALREADY_ACTIVATED))
+                return log_error_errno(SYNTHETIC_ERRNO(ETXTBSY), "File systems of this type can only be resized offline, but is currently online.");
+
+        log_info("Ready to resize image size %s %s %s, partition size %s %s %s, file system size %s %s %s.",
+                 FORMAT_BYTES(old_image_size),
+                 special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                 FORMAT_BYTES(new_image_size),
+                 FORMAT_BYTES(setup->partition_size),
+                 special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                 FORMAT_BYTES(new_partition_size),
+                 FORMAT_BYTES(old_fs_size),
+                 special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                 FORMAT_BYTES(new_fs_size));
+
+        if (new_fs_size > old_fs_size) { /* → Grow */
+
+                if (S_ISREG(st.st_mode)) {
+                        uint64_t resized_image_size;
+
+                        /* Grow file size */
+                        r = resize_image_loop(h, setup, old_image_size, new_image_size, &resized_image_size);
+                        if (r < 0)
+                                return r;
+
+                        if (resized_image_size == old_image_size) {
+                                log_info("Couldn't change image size.");
+                                return 0;
+                        }
+
+                        assert(resized_image_size > old_image_size);
+
+                        log_info("Growing of image file from %s to %s completed.", FORMAT_BYTES(old_image_size), FORMAT_BYTES(resized_image_size));
+
+                        if (resized_image_size < new_image_size) {
+                                uint64_t sub;
+
+                                /* If the growing we managed to do is smaller than what we wanted we need to
+                                 * adjust the partition/file system sizes we are going for, too */
+                                sub = new_image_size - resized_image_size;
+                                assert(new_partition_size >= sub);
+                                new_partition_size -= sub;
+                                assert(new_fs_size >= sub);
+                                new_fs_size -= sub;
+                        }
+
+                        new_image_size = resized_image_size;
+                } else {
+                        assert(S_ISBLK(st.st_mode));
+                        assert(new_image_size == old_image_size);
+                }
+
+                /* Make sure loopback device sees the new bigger size */
+                r = loop_device_refresh_size(setup->loop, UINT64_MAX, new_partition_size);
+                if (r == -ENOTTY)
+                        log_debug_errno(r, "Device is not a loopback device, not refreshing size.");
+                else if (r < 0)
+                        return log_error_errno(r, "Failed to refresh loopback device size: %m");
+                else
+                        log_info("Refreshing loop device size completed.");
+
+                r = apply_resize_partition(image_fd, disk_uuid, table, partition, new_partition_size);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        log_info("Growing of partition completed.");
+
+                if (S_ISBLK(st.st_mode) && ioctl(image_fd, BLKRRPART, 0) < 0)
+                        log_debug_errno(errno, "BLKRRPART failed on block device, ignoring: %m");
+
+                /* Tell LUKS about the new bigger size too */
+                r = sym_crypt_resize(setup->crypt_device, setup->dm_name, new_fs_size / 512U);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to grow LUKS device: %m");
+
+                log_info("LUKS device growing completed.");
+        } else {
+                /* → Shrink */
+
+                if (!FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES)) {
+                        r = home_store_embedded_identity(new_home, setup->root_fd, h->uid, embedded_home);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (S_ISREG(st.st_mode)) {
+                        if (user_record_luks_discard(h))
+                                /* Before we shrink, let's trim the file system, so that we need less space on disk during the shrinking */
+                                (void) run_fitrim(setup->root_fd);
+                        else {
+                                /* If discard is off, let's ensure all backing blocks are allocated, so that our resize operation doesn't fail half-way */
+                                r = run_fallocate(image_fd, &st);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        }
+
+        /* Now try to resize the file system. The requested size might not always be possible, in which case
+         * we'll try to get as close as we can get. The result is returned in 'resized_fs_size' */
+        r = resize_fs_loop(h, setup, resize_type, old_fs_size, new_fs_size, &resized_fs_size);
+        if (r < 0)
+                return r;
+
+        if (resized_fs_size == old_fs_size) {
+                log_info("Couldn't change file system size.");
+                return 0;
+        }
+
+        log_info("File system resizing from %s to %s completed.", FORMAT_BYTES(old_fs_size), FORMAT_BYTES(resized_fs_size));
+
+        if (resized_fs_size > new_fs_size) {
+                uint64_t add;
+
+                /* If the shrinking we managed to do is larger than what we wanted we need to adjust the partition/image sizes. */
+                add = resized_fs_size - new_fs_size;
+                new_partition_size += add;
+                if (S_ISREG(st.st_mode))
+                        new_image_size += add;
+        }
+
+        new_fs_size = resized_fs_size;
+
+        /* Immediately sync afterwards */
+        r = home_sync_and_statfs(setup->root_fd, NULL);
+        if (r < 0)
+                return r;
+
+        if (new_fs_size < old_fs_size) { /* → Shrink */
+
+                /* Shrink the LUKS device now, matching the new file system size */
+                r = sym_crypt_resize(setup->crypt_device, setup->dm_name, new_fs_size / 512);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to shrink LUKS device: %m");
+
+                log_info("LUKS device shrinking completed.");
+
+                /* Refresh the loop devices size */
+                r = loop_device_refresh_size(setup->loop, UINT64_MAX, new_partition_size);
+                if (r == -ENOTTY)
+                        log_debug_errno(r, "Device is not a loopback device, not refreshing size.");
+                else if (r < 0)
+                        return log_error_errno(r, "Failed to refresh loopback device size: %m");
+                else
+                        log_info("Refreshing loop device size completed.");
+
+                if (S_ISREG(st.st_mode)) {
+                        /* Shrink the image file */
+                        if (ftruncate(image_fd, new_image_size) < 0)
+                                return log_error_errno(errno, "Failed to shrink image file %s: %m", ip);
+
+                        log_info("Shrinking of image file completed.");
+                } else {
+                        assert(S_ISBLK(st.st_mode));
+                        assert(new_image_size == old_image_size);
+                }
+
+                r = apply_resize_partition(image_fd, disk_uuid, table, partition, new_partition_size);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        log_info("Shrinking of partition completed.");
+
+                if (S_ISBLK(st.st_mode) && ioctl(image_fd, BLKRRPART, 0) < 0)
+                        log_debug_errno(errno, "BLKRRPART failed on block device, ignoring: %m");
+
+        } else { /* → Grow */
+                if (!FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES)) {
+                        r = home_store_embedded_identity(new_home, setup->root_fd, h->uid, embedded_home);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (!FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES)) {
+                r = home_store_header_identity_luks(new_home, setup, header_home);
+                if (r < 0)
+                        return r;
+
+                r = home_extend_embedded_identity(new_home, h, setup);
+                if (r < 0)
+                        return r;
+        }
+
+        if (user_record_luks_discard(h))
+                (void) run_fitrim(setup->root_fd);
+
+        r = home_sync_and_statfs(setup->root_fd, &sfs);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(flags, HOME_SETUP_RESIZE_DONT_UNDO)) {
+                r = home_setup_done(setup);
+                if (r < 0)
+                        return r;
+        }
+
+        log_info("Resizing completed.");
+
+        print_size_summary(new_image_size, new_fs_size, &sfs);
+
+        if (ret_home)
+                *ret_home = TAKE_PTR(new_home);
+
+        return 0;
+}
+
+int home_passwd_luks(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                HomeSetup *setup,
+                const PasswordCache *cache, /* passwords already acquired, e.g. via PKCS#11/FIDO2 security tokens */
+                char **effective_passwords  /* the new passwords, in key slot order */) {
+        /* Re-keys the LUKS volume: slot N gets effective_passwords[N], all remaining slots are wiped. Returns 1 on success. */
+        struct crypt_pbkdf_type good_pbkdf, minimal_pbkdf;
+        _cleanup_(erase_and_freep) void *volume_key = NULL;
+        size_t volume_key_size, n_slots, n_passwords;
+        const char *luks_type;
+        char **candidates;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_LUKS);
+        assert(setup);
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        luks_type = sym_crypt_get_type(setup->crypt_device);
+        if (!luks_type)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine crypto device type.");
+
+        r = sym_crypt_keyslot_max(luks_type);
+        if (r <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine number of key slots.");
+        n_slots = r;
+
+        r = sym_crypt_get_volume_key_size(setup->crypt_device);
+        if (r <= 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine volume key size.");
+        volume_key_size = (size_t) r;
+
+        volume_key = malloc(volume_key_size);
+        if (!volume_key)
+                return log_oom();
+
+        /* Acquire the volume key: go through the password sources in order, stopping at the first result other than -ENOKEY. */
+        r = -ENOKEY;
+        FOREACH_POINTER(candidates,
+                        cache ? cache->keyring_passswords : NULL,
+                        cache ? cache->pkcs11_passwords : NULL,
+                        cache ? cache->fido2_passwords : NULL,
+                        h->password) {
+                r = luks_try_passwords(h, setup->crypt_device, candidates, volume_key, &volume_key_size, NULL);
+                if (r != -ENOKEY)
+                        break;
+        }
+        if (r == -ENOKEY)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOKEY), "Failed to unlock LUKS superblock with supplied passwords.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock LUKS superblock: %m");
+
+        build_good_pbkdf(&good_pbkdf, h);
+        build_minimal_pbkdf(&minimal_pbkdf, h);
+
+        n_passwords = strv_length(effective_passwords);
+
+        for (size_t slot = 0; slot < n_slots; slot++) {
+                const char *pw;
+                bool minimal;
+
+                r = sym_crypt_keyslot_destroy(setup->crypt_device, slot);
+                if (r < 0 && r != -ENOENT && r != -EINVAL) /* ENOENT/EINVAL are returned if the slot held no key to begin with */
+                        return log_error_errno(r, "Failed to destroy LUKS password: %m");
+
+                if (slot >= n_passwords) { /* No new password for this slot: it simply stays empty */
+                        if (r >= 0)
+                                log_info("Destroyed LUKS key slot %zu.", slot);
+                        continue;
+                }
+
+                pw = effective_passwords[slot];
+                minimal = password_cache_contains(cache, pw); /* Is this a FIDO2 or PKCS#11 password? */
+                if (minimal)
+                        log_debug("Using minimal PBKDF for slot %zu", slot);
+                else
+                        log_debug("Using good PBKDF for slot %zu", slot);
+                r = sym_crypt_set_pbkdf_type(setup->crypt_device, minimal ? &minimal_pbkdf : &good_pbkdf);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to tweak PBKDF for slot %zu: %m", slot);
+
+                r = sym_crypt_keyslot_add_by_volume_key(
+                                setup->crypt_device, slot,
+                                volume_key, volume_key_size,
+                                pw, strlen(pw));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set up LUKS password: %m");
+
+                log_info("Updated LUKS key slot %zu.", slot);
+
+                /* Make sure the copy in the keyring follows the password change, so that auto-rebalance
+                 * continues to work. This is only done when operating on an already activated home dir. */
+                if (slot == 0 && FLAGS_SET(flags, HOME_SETUP_ALREADY_ACTIVATED))
+                        upload_to_keyring(h, pw, NULL);
+        }
+
+        return 1;
+}
+
+int home_lock_luks(UserRecord *h, HomeSetup *setup) {
+        const char *home_dir;
+        int r;
+        /* Syncs the file system of the home directory, then suspends the dm-crypt device backing it. */
+        assert(h);
+        assert(setup);
+        assert(setup->root_fd < 0);
+        assert(!setup->crypt_device);
+
+        r = acquire_open_luks_device(h, setup, /* graceful= */ false);
+        if (r < 0)
+                return r;
+
+        log_info("Discovered used LUKS device %s.", setup->dm_node);
+
+        home_dir = user_record_home_directory(h);
+        assert_se(home_dir);
+        r = syncfs_path(AT_FDCWD, home_dir);
+        if (r < 0) /* Presumably not strictly necessary, but let's better be safe than sorry */
+                return log_error_errno(r, "Failed to synchronize file system %s: %m", home_dir);
+
+        log_info("File system synchronized.");
+
+        /* No explicit FIFREEZE here: it appears libcryptsetup/device-mapper already does that on its own for us */
+        r = sym_crypt_suspend(setup->crypt_device, setup->dm_name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to suspend cryptsetup device: %s: %m", setup->dm_node);
+
+        log_info("LUKS device suspended.");
+        return 0;
+}
+
+static int luks_try_resume(
+                struct crypt_device *cd,
+                const char *dm_name,
+                char **password) {
+
+        /* Feeds the passphrases listed in 'password' to libcryptsetup one by one. Returns 0 as soon as one
+         * of them resumes the device, and -ENOKEY if none of them did. */
+        assert(cd);
+        assert(dm_name);
+
+        STRV_FOREACH(candidate, password) {
+                int r;
+
+                r = sym_crypt_resume_by_passphrase(
+                                cd, dm_name, CRYPT_ANY_SLOT,
+                                *candidate, strlen(*candidate));
+                if (r < 0) {
+                        log_debug_errno(r, "Password %zu didn't work for resuming device: %m", (size_t) (candidate - password));
+                        continue;
+                }
+
+                log_info("Resumed LUKS device %s.", dm_name);
+                return 0;
+        }
+
+        return -ENOKEY;
+}
+
+int home_unlock_luks(UserRecord *h, HomeSetup *setup, const PasswordCache *cache) {
+        char **passwords;
+        int r;
+
+        assert(setup);
+        assert(!setup->crypt_device);
+        assert(h);
+
+        r = acquire_open_luks_device(h, setup, /* graceful= */ false);
+        if (r < 0)
+                return r;
+
+        log_info("Discovered used LUKS device %s.", setup->dm_node);
+        /* Resume with the token-derived passwords first, then those of the record; stop at the first result other than -ENOKEY. */
+        r = -ENOKEY;
+        FOREACH_POINTER(passwords,
+                        cache ? cache->pkcs11_passwords : NULL,
+                        cache ? cache->fido2_passwords : NULL,
+                        h->password) {
+                r = luks_try_resume(setup->crypt_device, setup->dm_name, passwords);
+                if (r != -ENOKEY)
+                        break;
+        }
+        if (r == -ENOKEY)
+                return log_error_errno(r, "No valid password for LUKS superblock.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to resume LUKS superblock: %m");
+
+        log_info("LUKS device resumed.");
+        return 0;
+}
+
+static int device_is_gone(HomeSetup *setup) {
+        _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+        struct stat st;
+        int r;
+
+        /* Returns > 0 if the device behind setup->dm_node is gone, 0 if it still exists, < 0 on unexpected failure. */
+        assert(setup);
+        if (!setup->dm_node) /* No node recorded, hence nothing that could still be around */
+                return true;
+
+        if (stat(setup->dm_node, &st) < 0) {
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to stat block device node %s: %m", setup->dm_node);
+
+                return true;
+        }
+
+        r = sd_device_new_from_stat_rdev(&d, &st);
+        if (r < 0) {
+                /* sd-device reports failures through its return value rather than errno, hence log 'r' here */
+                if (r != -ENODEV)
+                        return log_error_errno(r, "Failed to allocate device object from block device node %s: %m", setup->dm_node);
+                return true;
+        }
+
+        return false;
+}
+
+static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) {
+        HomeSetup *setup = ASSERT_PTR(userdata);
+        int gone;
+
+        /* Only removal events are of interest. The device object passed in is not looked at any further,
+         * we just check if our own device node still exists. */
+        if (!device_for_action(device, SD_DEVICE_REMOVE))
+                return 0;
+
+        gone = device_is_gone(setup);
+        if (gone < 0)
+                return gone;
+        if (gone == 0)
+                return 0;
+        /* Yay! The device has vanished, leave the event loop. */
+        (void) sd_event_exit(sd_device_monitor_get_event(monitor), 0);
+        return 0;
+}
+
+int wait_for_block_device_gone(HomeSetup *setup, usec_t timeout_usec) {
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *m = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        int r;
+
+        assert(setup);
+
+        /* So here's the thing: we enable "deferred deactivation" on our dm-crypt volumes. This means they
+         * are automatically torn down once not used anymore (i.e. once unmounted). Which is great. It also
+         * means that when we deactivate a home directory and try to tear down the volume that backs it, it
+         * possibly is already torn down or in the process of being torn down, since we race against the
+         * automatic tearing down. Which is fine, we handle errors from that. However, we lose the ability to
+         * naturally wait for the tear down operation to complete: if we are not the ones who tear down the
+         * device we are also not the ones who naturally block on that operation. Hence let's add some code
+         * to actively wait for the device to go away, via sd-device. We'll call this whenever tearing down a
+         * LUKS device, to ensure the device is really really gone before we proceed. Net effect: "homectl
+         * deactivate foo && homectl activate foo" will work reliably, i.e. deactivation immediately followed
+         * by activation will work. Also, by the time deactivation completes we can guarantee that all data
+         * is sync'ed down to the lowest block layer as all higher levels are fully and entirely
+         * destructed. */
+        /* Returns 0 if the device is gone (or no dm device is set), -ETIMEDOUT if it is still around, < 0 on other errors. */
+        if (!setup->dm_name)
+                return 0;
+
+        assert(setup->dm_node);
+        log_debug("Waiting until %s disappears.", setup->dm_node);
+
+        r = sd_event_new(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+
+        r = sd_device_monitor_new(&m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate device monitor: %m");
+
+        r = sd_device_monitor_filter_add_match_subsystem_devtype(m, "block", "disk");
+        if (r < 0)
+                return log_error_errno(r, "Failed to configure device monitor match: %m");
+
+        r = sd_device_monitor_attach_event(m, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach device monitor to event loop: %m");
+
+        r = sd_device_monitor_start(m, device_monitor_handler, setup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start device monitor: %m");
+        /* Only check now that the monitor is running, so that a removal happening in between cannot be missed. */
+        r = device_is_gone(setup);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                log_debug("%s has already disappeared before entering wait loop.", setup->dm_node);
+                return 0; /* gone already */
+        }
+
+        if (timeout_usec != USEC_INFINITY) { /* No callback is set: when the timer elapses the default handler exits the loop */
+                r = sd_event_add_time_relative(event, NULL, CLOCK_MONOTONIC, timeout_usec, 0, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add timer event: %m");
+        }
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        r = device_is_gone(setup);
+        if (r < 0)
+                return r;
+        if (r == 0) /* 'r' is zero here and hence no usable error code: report the expired wait explicitly instead */
+                return log_error_errno(SYNTHETIC_ERRNO(ETIMEDOUT), "Device %s still around.", setup->dm_node);
+
+        log_debug("Successfully waited until device %s disappeared.", setup->dm_node);
+        return 0;
+}
+
+int home_auto_shrink_luks(UserRecord *h, HomeSetup *setup, PasswordCache *cache) {
+        const HomeSetupFlags resize_flags =
+                HOME_SETUP_ALREADY_ACTIVATED|
+                HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES|
+                HOME_SETUP_RESIZE_MINIMIZE|
+                HOME_SETUP_RESIZE_DONT_GROW|
+                HOME_SETUP_RESIZE_DONT_UNDO;
+        struct statfs sfs;
+        int r;
+
+        assert(h);
+        assert(user_record_storage(h) == USER_LUKS);
+        assert(setup);
+        assert(setup->root_fd >= 0);
+
+        /* Returns 0 if auto-shrinking does not apply to this home, 1 after home_resize_luks() was run
+         * successfully, and a negative errno-style value on failure. */
+        if (user_record_auto_resize_mode(h) != AUTO_RESIZE_SHRINK_AND_GROW)
+                return 0;
+
+        if (fstatfs(setup->root_fd, &sfs) < 0)
+                return log_error_errno(errno, "Failed to statfs home directory: %m");
+
+        if (!fs_can_online_shrink_and_grow(sfs.f_type)) {
+                log_debug("Not auto-shrinking file system, since selected file system cannot do both online shrink and grow.");
+                return 0;
+        }
+
+        /* Ask for a minimizing resize of the already activated home that may never grow it. */
+        r = home_resize_luks(h, resize_flags, setup, cache, NULL);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
diff --git a/src/home/homework-luks.h b/src/home/homework-luks.h
new file mode 100644
index 0000000..0218de8
--- /dev/null
+++ b/src/home/homework-luks.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "cryptsetup-util.h"
+#include "homework.h"
+#include "user-record.h"
+
+int home_setup_luks(UserRecord *h, HomeSetupFlags flags, const char *force_image_path, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_luks_home);
+
+int home_activate_luks(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home);
+int home_deactivate_luks(UserRecord *h, HomeSetup *setup);
+int home_trim_luks(UserRecord *h, HomeSetup *setup);
+
+int home_store_header_identity_luks(UserRecord *h, HomeSetup *setup, UserRecord *old_home);
+
+int home_create_luks(UserRecord *h, HomeSetup *setup, const PasswordCache *cache, char **effective_passwords, UserRecord **ret_home);
+
+int home_get_state_luks(UserRecord *h, HomeSetup *setup);
+
+int home_resize_luks(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_home);
+/* Rewrites the LUKS key slots so that they hold effective_passwords[]; returns 1 on success, < 0 on failure */
+int home_passwd_luks(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, const PasswordCache *cache, char **effective_passwords);
+/* Suspend the dm-crypt device backing the home directory, and resume it again using one of the supplied passwords */
+int home_lock_luks(UserRecord *h, HomeSetup *setup);
+int home_unlock_luks(UserRecord *h, HomeSetup *setup, const PasswordCache *cache);
+/* Returns 0 if auto-shrinking does not apply, 1 after a minimizing resize was run, < 0 on failure */
+int home_auto_shrink_luks(UserRecord *h, HomeSetup *setup, PasswordCache *cache);
+
+static inline uint64_t luks_volume_key_size_convert(struct crypt_device *cd) {
+        int key_size;
+
+        assert(cd);
+
+        /* The volume key size is handed to us as "int". Widen it to uint64_t, which is what we usually use
+         * for byte sizes stored on disk. Anything that is not a positive size is reported as
+         * UINT64_MAX. */
+
+        key_size = sym_crypt_get_volume_key_size(cd);
+
+        return key_size > 0 ? (uint64_t) key_size : UINT64_MAX;
+}
+
+int run_fitrim(int root_fd);
+int run_fallocate(int backing_fd, const struct stat *st);
+int run_fallocate_by_path(const char *backing_path);
+int run_mark_dirty(int fd, bool b);
+int run_mark_dirty_by_path(const char *path, bool b);
+/* Blocks until the device node setup->dm_node has disappeared, or until timeout_usec has elapsed */
+int wait_for_block_device_gone(HomeSetup *setup, usec_t timeout_usec);
diff --git a/src/home/homework-mount.c b/src/home/homework-mount.c
new file mode 100644
index 0000000..28f09b9
--- /dev/null
+++ b/src/home/homework-mount.c
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sched.h>
+#include <sys/mount.h>
+#if WANT_LINUX_FS_H
+#include <linux/fs.h>
+#endif
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "glyph-util.h"
+#include "home-util.h"
+#include "homework-mount.h"
+#include "homework.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "user-util.h"
+
+static const char *mount_options_for_fstype(const char *fstype) {
+        const char *override;
+        char *envvar;
+
+        assert(fstype);
+
+        /* An environment variable takes precedence over our built-in defaults. Its name is
+         * "SYSTEMD_HOME_MOUNT_OPTIONS_" followed by the file system type, passed through ascii_strupper(). */
+        envvar = strjoina("SYSTEMD_HOME_MOUNT_OPTIONS_", fstype);
+        override = getenv(ascii_strupper(envvar));
+        if (override)
+                return override;
+
+        /* The built-in defaults for the file systems we know about. For everything else NULL is returned,
+         * i.e. there are no default options. */
+        return streq(fstype, "ext4") ? "noquota,user_xattr" :
+                streq(fstype, "xfs") ? "noquota" :
+                streq(fstype, "btrfs") ? "noacl,compress=zstd:1" :
+                NULL;
+}
+
+int home_mount_node(
+                const char *node,
+                const char *fstype,
+                bool discard,
+                unsigned long flags,
+                const char *extra_mount_options) {
+
+        _cleanup_free_ char *options = NULL;
+        const char *fs_defaults;
+        int r;
+
+        assert(node);
+        assert(fstype);
+
+        /* Put together the comma-separated option string: first the defaults for the file system type (if
+         * there are any), then the discard setting, and finally whatever the caller passed in addition. */
+        fs_defaults = mount_options_for_fstype(fstype);
+        if (fs_defaults && !strextend_with_separator(&options, ",", fs_defaults))
+                return log_oom();
+
+        if (!strextend_with_separator(&options, ",", discard ? "discard" : "nodiscard"))
+                return log_oom();
+
+        if (extra_mount_options && !strextend_with_separator(&options, ",", extra_mount_options))
+                return log_oom();
+
+        /* The file system is always mounted onto HOME_RUNTIME_WORK_DIR, and MS_RELATIME is always added to
+         * the mount flags the caller specified. */
+        r = mount_nofollow_verbose(LOG_ERR, node, HOME_RUNTIME_WORK_DIR, fstype, flags|MS_RELATIME, options);
+        if (r < 0)
+                return r;
+
+        log_info("Mounting file system completed.");
+        return 0;
+}
+
+int home_unshare_and_mkdir(void) {
+        int r;
+
+        /* Moves us into a new mount namespace and creates the work directory homes get mounted on. */
+        if (unshare(CLONE_NEWNS) < 0)
+                return log_error_errno(errno, "Couldn't unshare file system namespace: %m");
+
+        /* The work directory is located below /run, hence mark /run as MS_SLAVE in our new namespace. */
+        assert(path_startswith(HOME_RUNTIME_WORK_DIR, "/run"));
+        r = mount_nofollow_verbose(LOG_ERR, "/run", "/run", NULL, MS_SLAVE|MS_REC, NULL);
+        if (r < 0)
+                return r;
+        (void) mkdir_p(HOME_RUNTIME_WORK_DIR, 0700); /* A failure to create the directory is ignored here */
+        return 0;
+}
+
+int home_unshare_and_mount(
+                const char *node,
+                const char *fstype,
+                bool discard,
+                unsigned long flags,
+                const char *extra_mount_options) {
+
+        int r;
+
+        assert(fstype);
+        assert(node);
+
+        r = home_unshare_and_mkdir();
+        if (r < 0)
+                return r;
+
+        r = home_mount_node(node, fstype, discard, flags, extra_mount_options);
+        if (r < 0)
+                return r;
+
+        /* Now that the file system is mounted on the work directory, mark that mount MS_PRIVATE. */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, HOME_RUNTIME_WORK_DIR, NULL, MS_PRIVATE, NULL);
+        if (r >= 0)
+                return 0;
+
+        (void) umount_verbose(LOG_ERR, HOME_RUNTIME_WORK_DIR, UMOUNT_NOFOLLOW); /* Undo the mount again */
+        return r;
+}
+
+int home_move_mount(const char *mount_suffix, const char *target) {
+        _cleanup_free_ char *joined = NULL;
+        const char *source = HOME_RUNTIME_WORK_DIR;
+        int r;
+
+        assert(target);
+
+        /* By default the mount on the work directory itself is what ends up on 'target'. If 'mount_suffix'
+         * is set, only that subdirectory of the source mount is mounted there instead. */
+        if (mount_suffix) {
+                joined = path_join(HOME_RUNTIME_WORK_DIR, mount_suffix);
+                if (!joined)
+                        return log_oom();
+
+                source = joined;
+        }
+
+        (void) mkdir_p(target, 0700);
+
+        r = mount_nofollow_verbose(LOG_ERR, source, target, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
+
+        /* The bind mount is established, hence everything mounted on the work directory can go now. */
+        r = umount_recursive(HOME_RUNTIME_WORK_DIR, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unmount %s: %m", HOME_RUNTIME_WORK_DIR);
+
+        log_info("Moving to final mount point %s completed.", target);
+        return 0;
+}
+
+static int append_identity_range(char **text, uid_t start, uid_t next_start, uid_t exclude) {
+        /* Appends identity mapping lines covering the UIDs from 'start' up to and including 'next_start-1' to
+         * 'text', leaving out the UID 'exclude' in case it falls into that range. */
+        uid_t below, above;
+
+        assert(text);
+
+        if (start >= next_start) /* Nothing in the range? */
+                return 0;
+
+        if (exclude < start || exclude >= next_start) /* Nothing to leave out, a single line covers it all */
+                return strextendf(text, UID_FMT " " UID_FMT " " UID_FMT "\n", start, start, next_start - start);
+
+        /* 'exclude' is part of the range: figure out how many UIDs are left on either side of it */
+        below = exclude - start;
+        above = next_start - exclude - 1;
+        if (below == 0 && above == 0) /* The range consists of the excluded UID only */
+                return 0;
+        if (below == 0) /* Excluded UID is the first one of the range */
+                return strextendf(text, UID_FMT " " UID_FMT " " UID_FMT "\n", exclude + 1, exclude + 1, above);
+        if (above == 0) /* Excluded UID is the last one of the range */
+                return strextendf(text, UID_FMT " " UID_FMT " " UID_FMT "\n", start, start, below);
+
+        return strextendf(text,
+                          UID_FMT " " UID_FMT " " UID_FMT "\n" UID_FMT " " UID_FMT " " UID_FMT "\n",
+                          start, start, below, exclude + 1, exclude + 1, above);
+}
+
+static int make_home_userns(uid_t stored_uid, uid_t exposed_uid) {
+        _cleanup_free_ char *text = NULL;
+        _cleanup_close_ int userns_fd = -EBADF;
+        int r;
+
+        assert(uid_is_valid(stored_uid));
+        assert(uid_is_valid(exposed_uid));
+        /* Builds a UID/GID map translating stored_uid to exposed_uid and returns the fd of a user namespace using it. */
+        assert_cc(HOME_UID_MIN <= HOME_UID_MAX);
+        assert_cc(HOME_UID_MAX < UID_NOBODY);
+
+        /* Map everything below the homed UID range to itself (except for the UID we actually care about if
+         * it is inside this range) */
+        r = append_identity_range(&text, 0, HOME_UID_MIN, stored_uid);
+        if (r < 0)
+                return log_oom();
+
+        /* Now map the UID we are doing this for to the target UID. */
+        r = strextendf(&text, UID_FMT " " UID_FMT " " UID_FMT "\n", stored_uid, exposed_uid, 1u);
+        if (r < 0)
+                return log_oom();
+
+        /* Map everything above the homed UID range to itself (again, excluding the UID we actually care about if it is in that
+         * range). HOME_UID_MAX is still part of the homed range, hence start one above it. "nobody" itself stays excluded. */
+        r = append_identity_range(&text, HOME_UID_MAX + 1, UID_NOBODY, stored_uid);
+        if (r < 0)
+                return log_oom();
+
+        /* Also map the container range. People can use that to place containers owned by high UIDs in their
+         * home directories if they really want. We won't manage this UID range for them but pass it through
+         * 1:1, and it will lose its meaning once migrated between hosts. */
+        r = append_identity_range(&text, CONTAINER_UID_BASE_MIN, CONTAINER_UID_BASE_MAX+1, stored_uid);
+        if (r < 0)
+                return log_oom();
+
+        /* Map nspawn's mapped root UID as identity mapping so that people can run nspawn uidmap mounted
+         * containers off $HOME, if they want. */
+        r = strextendf(&text, UID_FMT " " UID_FMT " " UID_FMT "\n", UID_MAPPED_ROOT, UID_MAPPED_ROOT, 1u);
+        if (r < 0)
+                return log_oom();
+
+        /* Leave everything else unmapped, starting from UID_NOBODY itself. Specifically, this means the
+         * whole space outside of 16-bit remains unmapped */
+
+        log_debug("Creating userns with mapping:\n%s", text);
+
+        userns_fd = userns_acquire(text, text); /* same uid + gid mapping */
+        if (userns_fd < 0)
+                return log_error_errno(userns_fd, "Failed to allocate user namespace: %m");
+
+        return TAKE_FD(userns_fd);
+}
+
+int home_shift_uid(int dir_fd, const char *target, uid_t stored_uid, uid_t exposed_uid, int *ret_mount_fd) {
+        _cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF;
+        int r;
+
+        assert(dir_fd >= 0);
+        assert(uid_is_valid(stored_uid));
+        assert(uid_is_valid(exposed_uid));
+        /* Returns 1 if the idmapped mount was set up, 0 if unsupported (*ret_mount_fd is then -EBADF), else the logged failure. */
+        /* Let's try to set up a UID mapping for this directory. This is called when first creating a home
+         * directory or when activating it again. We do this as optimization only, to avoid having to
+         * recursively chown() things on each activation. If the kernel or file system doesn't support this
+         * scheme we'll handle this gracefully, and not do anything, so that the later recursive chown()ing
+         * then fixes up things for us. Note that the chown()ing is smart enough to skip things if they look
+         * alright already.
+         *
+         * Note that this always creates a new mount (i.e. we use OPEN_TREE_CLONE), since applying idmaps is
+         * not allowed once the mount is put in place. */
+
+        mount_fd = open_tree(dir_fd, "", AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+        if (mount_fd < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno)) {
+                        log_debug_errno(errno, "The open_tree() syscall is not supported, not setting up UID shift mount: %m");
+
+                        if (ret_mount_fd)
+                                *ret_mount_fd = -EBADF;
+
+                        return 0;
+                }
+
+                return log_error_errno(errno, "Failed to open tree of home directory: %m");
+        }
+
+        userns_fd = make_home_userns(stored_uid, exposed_uid);
+        if (userns_fd < 0)
+                return userns_fd;
+
+        /* Set the user namespace mapping attribute on the cloned mount point */
+        if (mount_setattr(mount_fd, "", AT_EMPTY_PATH,
+                          &(struct mount_attr) {
+                                  .attr_set = MOUNT_ATTR_IDMAP,
+                                  .userns_fd = userns_fd,
+                          }, MOUNT_ATTR_SIZE_VER0) < 0) {
+
+                if (ERRNO_IS_NOT_SUPPORTED(errno) || errno == EINVAL) { /* EINVAL is documented in mount_setattr(2) as fs doesn't support idmapping */
+                        log_debug_errno(errno, "UID/GID mapping for shifted mount not available, not setting it up: %m");
+
+                        if (ret_mount_fd)
+                                *ret_mount_fd = -EBADF;
+
+                        return 0;
+                }
+
+                return log_error_errno(errno, "Failed to apply UID/GID mapping: %m");
+        }
+
+        if (target) /* attach the idmapped clone at 'target' if one is given, otherwise on top of dir_fd itself */
+                r = move_mount(mount_fd, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH);
+        else
+                r = move_mount(mount_fd, "", dir_fd, "", MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_T_EMPTY_PATH);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to apply UID/GID map: %m");
+
+        log_debug("Applied uidmap mount to %s. Mapping is " UID_FMT " %s " UID_FMT ".",
+                  strna(target), stored_uid, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), exposed_uid);
+
+        if (ret_mount_fd)
+                *ret_mount_fd = TAKE_FD(mount_fd);
+
+        return 1;
+}
diff --git a/src/home/homework-mount.h b/src/home/homework-mount.h
new file mode 100644
index 0000000..255df26
--- /dev/null
+++ b/src/home/homework-mount.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+int home_mount_node(const char *node, const char *fstype, bool discard, unsigned long flags, const char *extra_mount_options);
+int home_unshare_and_mkdir(void);
+int home_unshare_and_mount(const char *node, const char *fstype, bool discard, unsigned long flags, const char *extra_mount_options);
+int home_move_mount(const char *user_name_and_realm, const char *target);
+int home_shift_uid(int dir_fd, const char *target, uid_t stored_uid, uid_t exposed_uid, int *ret_mount_fd);
diff --git a/src/home/homework-password-cache.c b/src/home/homework-password-cache.c
new file mode 100644
index 0000000..00a0f69
--- /dev/null
+++ b/src/home/homework-password-cache.c
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "homework-password-cache.h"
+#include "keyring-util.h"
+#include "missing_syscall.h"
+#include "user-record.h"
+
+void password_cache_free(PasswordCache *cache) {
+        if (!cache)
+                return;
+        /* Releases the three lists via strv_free_erase(); the PasswordCache object itself is not freed here. */
+        cache->pkcs11_passwords = strv_free_erase(cache->pkcs11_passwords);
+        cache->fido2_passwords = strv_free_erase(cache->fido2_passwords);
+        cache->keyring_passswords = strv_free_erase(cache->keyring_passswords);
+}
+
+void password_cache_load_keyring(UserRecord *h, PasswordCache *cache) {
+        _cleanup_(erase_and_freep) void *p = NULL;
+        _cleanup_free_ char *name = NULL;
+        char **strv;
+        key_serial_t serial;
+        size_t sz;
+        int r;
+
+        assert(h);
+        assert(cache);
+
+        /* Loads the password we need for automatic resizing from the kernel keyring. Failures are only logged. */
+
+        name = strjoin("homework-user-", h->user_name);
+        if (!name)
+                return (void) log_oom();
+
+        serial = request_key("user", name, NULL, 0);
+        if (serial == -1)
+                return (void) log_debug_errno(errno, "Failed to request key '%s', ignoring: %m", name);
+
+        r = keyring_read(serial, &p, &sz);
+        if (r < 0)
+                return (void) log_debug_errno(r, "Failed to read keyring key '%s', ignoring: %m", name);
+
+        if (memchr(p, 0, sz))
+                return (void) log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Cached password contains embedded NUL byte, ignoring.");
+
+        strv = new(char*, 2);
+        if (!strv)
+                return (void) log_oom();
+
+        strv[0] = TAKE_PTR(p); /* Note that keyring_read() will NUL terminate implicitly, hence we don't have
+                                * to NUL terminate manually here: it's a valid string. */
+        strv[1] = NULL;
+
+        strv_free_erase(cache->keyring_passswords); /* drop whatever list was cached before, the new one replaces it */
+        cache->keyring_passswords = strv;
+
+        log_debug("Successfully acquired home key from kernel keyring.");
+}
diff --git a/src/home/homework-password-cache.h b/src/home/homework-password-cache.h
new file mode 100644
index 0000000..fdfbcfe
--- /dev/null
+++ b/src/home/homework-password-cache.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "strv.h"
+#include "user-record.h"
+
+typedef struct PasswordCache {
+        /* Passwords acquired from the kernel keyring (sic: the field name is spelled with three 's') */
+        char **keyring_passswords;
+
+        /* Decoding passwords from security tokens is expensive and typically requires user interaction,
+         * hence cache any we already figured out. */
+        char **pkcs11_passwords;
+        char **fido2_passwords;
+} PasswordCache;
+
+void password_cache_free(PasswordCache *cache);
+
+static inline bool password_cache_contains(const PasswordCache *cache, const char *p) {
+        if (!cache)
+                return false;
+        /* A NULL cache contains nothing; otherwise p is looked up in each of the three lists with strv_contains(). */
+        return strv_contains(cache->pkcs11_passwords, p) ||
+                strv_contains(cache->fido2_passwords, p) ||
+                strv_contains(cache->keyring_passswords, p);
+}
+
+void password_cache_load_keyring(UserRecord *h, PasswordCache *cache);
diff --git a/src/home/homework-pkcs11.c b/src/home/homework-pkcs11.c
new file mode 100644
index 0000000..f371994
--- /dev/null
+++ b/src/home/homework-pkcs11.c
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "hexdecoct.h"
+#include "homework-pkcs11.h"
+#include "pkcs11-util.h"
+#include "strv.h"
+
+int pkcs11_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        _cleanup_(erase_and_freep) void *decrypted_key = NULL;
+        struct pkcs11_callback_data *data = ASSERT_PTR(userdata);
+        _cleanup_free_ char *token_label = NULL;
+        CK_TOKEN_INFO updated_token_info;
+        size_t decrypted_key_size;
+        CK_OBJECT_HANDLE object;
+        CK_RV rv;
+        int r;
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+        assert(uri);
+
+        /* Returns 1 once the decrypted key was converted with base64mem() into data->decrypted_password.
+         * Special error return values:
+         * -ENOANO       → if we need a PIN but have none
+         * -ERFKILL      → if a "protected authentication path" is needed but we have no OK to use it
+         * -EOWNERDEAD   → if the PIN is locked
+         * -ENOLCK       → if the supplied PIN is incorrect
+         * -ETOOMANYREFS → ditto, but only a few tries left
+         * -EUCLEAN      → ditto, but only a single try left
+         */
+
+        token_label = pkcs11_token_label(token_info);
+        if (!token_label)
+                return log_oom();
+
+        if (FLAGS_SET(token_info->flags, CKF_PROTECTED_AUTHENTICATION_PATH)) {
+
+                if (data->secret->pkcs11_protected_authentication_path_permitted <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(ERFKILL), "Security token requires authentication through protected authentication path.");
+
+                rv = m->C_Login(session, CKU_USER, NULL, 0);
+                if (rv != CKR_OK)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to log into security token '%s': %s", token_label, sym_p11_kit_strerror(rv));
+
+                log_info("Successfully logged into security token '%s' via protected authentication path.", token_label);
+                goto decrypt;
+        }
+
+        if (!FLAGS_SET(token_info->flags, CKF_LOGIN_REQUIRED)) {
+                log_info("No login into security token '%s' required.", token_label);
+                goto decrypt;
+        }
+
+        if (strv_isempty(data->secret->token_pin))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOANO), "Security token requires PIN.");
+
+        STRV_FOREACH(i, data->secret->token_pin) { /* try each supplied PIN; only "incorrect"/"bad length" results move on to the next */
+                rv = m->C_Login(session, CKU_USER, (CK_UTF8CHAR*) *i, strlen(*i));
+                if (rv == CKR_OK) {
+                        log_info("Successfully logged into security token '%s' with PIN.", token_label);
+                        goto decrypt;
+                }
+                if (rv == CKR_PIN_LOCKED)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOWNERDEAD), "PIN of security token is blocked. Please unblock it first.");
+                if (!IN_SET(rv, CKR_PIN_INCORRECT, CKR_PIN_LEN_RANGE))
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to log into security token '%s': %s", token_label, sym_p11_kit_strerror(rv));
+        }
+
+        rv = m->C_GetTokenInfo(slot_id, &updated_token_info); /* re-read the token flags to learn how many PIN tries remain */
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to acquire updated security token information for slot %lu: %s", slot_id, sym_p11_kit_strerror(rv));
+
+        if (FLAGS_SET(updated_token_info.flags, CKF_USER_PIN_FINAL_TRY))
+                return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "PIN of security token incorrect, only a single try left.");
+        if (FLAGS_SET(updated_token_info.flags, CKF_USER_PIN_COUNT_LOW))
+                return log_error_errno(SYNTHETIC_ERRNO(ETOOMANYREFS), "PIN of security token incorrect, only a few tries left.");
+
+        return log_error_errno(SYNTHETIC_ERRNO(ENOLCK), "PIN of security token incorrect.");
+
+decrypt:
+        r = pkcs11_token_find_private_key(m, session, uri, &object);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_decrypt_data(m, session, object, data->encrypted_key->data, data->encrypted_key->size, &decrypted_key, &decrypted_key_size);
+        if (r < 0)
+                return r;
+
+        if (base64mem(decrypted_key, decrypted_key_size, &data->decrypted_password) < 0)
+                return log_oom();
+
+        return 1;
+}
diff --git a/src/home/homework-pkcs11.h b/src/home/homework-pkcs11.h
new file mode 100644
index 0000000..c8674e0
--- /dev/null
+++ b/src/home/homework-pkcs11.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_P11KIT
+#include "memory-util.h"
+#include "user-record.h"
+#include "pkcs11-util.h"
+
+struct pkcs11_callback_data {
+        UserRecord *user_record;           /* set by the caller; not read by pkcs11_callback() itself */
+        UserRecord *secret;                /* input: supplies token_pin and the protected-authentication-path permission */
+        Pkcs11EncryptedKey *encrypted_key; /* input: its data/size is what the token is asked to decrypt */
+        char *decrypted_password;          /* output: filled in by pkcs11_callback() when it returns 1 */
+};
+
+static inline void pkcs11_callback_data_release(struct pkcs11_callback_data *data) {
+        erase_and_free(data->decrypted_password); /* only decrypted_password is released; the other members are left untouched */
+}
+
+int pkcs11_callback(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_SLOT_ID slot_id, const CK_SLOT_INFO *slot_info, const CK_TOKEN_INFO *token_info, P11KitUri *uri, void *userdata);
+#endif
diff --git a/src/home/homework-quota.c b/src/home/homework-quota.c
new file mode 100644
index 0000000..508c0c0
--- /dev/null
+++ b/src/home/homework-quota.c
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include 
+
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "errno-util.h"
+#include "format-util.h"
+#include "homework-quota.h"
+#include "missing_magic.h"
+#include "quota-util.h"
+#include "stat-util.h"
+#include "user-util.h"
+
+int home_update_quota_btrfs(UserRecord *h, const char *path) {
+        int r;
+
+        assert(h);
+        assert(path);
+        /* Applies h->disk_size as btrfs qgroup limit on 'path'. Nothing is done if disk_size is UINT64_MAX. */
+        if (h->disk_size == UINT64_MAX)
+                return 0;
+
+        /* If the user wants quota, enable it */
+        r = btrfs_quota_enable(path, true);
+        if (r == -ENOTTY)
+                return log_error_errno(r, "No btrfs quota support on subvolume %s.", path);
+        if (r < 0) /* any other failure: include the error cause in the message, like the sibling messages do */
+                return log_error_errno(r, "Failed to enable btrfs quota support on %s: %m", path);
+
+        r = btrfs_qgroup_set_limit(path, 0, h->disk_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set disk quota on subvolume %s: %m", path);
+
+        log_info("Set btrfs quota.");
+
+        return 0;
+}
+
+int home_update_quota_classic(UserRecord *h, const char *path) {
+        struct dqblk req;
+        dev_t devno;
+        int r;
+
+        assert(h);
+        assert(uid_is_valid(h->uid));
+        assert(path);
+        /* Sets the per-UID block quota of h->uid on the block device backing 'path' from h->disk_size. */
+        if (h->disk_size == UINT64_MAX)
+                return 0;
+
+        r = get_block_device(path, &devno);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine block device of %s: %m", path);
+        if (devno == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system %s not backed by a block device.", path);
+
+        r = quotactl_devnum(QCMD_FIXED(Q_GETQUOTA, USRQUOTA), devno, h->uid, &req);
+        if (r == -ESRCH) /* treated as "nothing to read back": start from an all-zero request */
+                zero(req);
+        else if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return log_error_errno(r, "No UID quota support on %s.", path);
+        else if (r < 0)
+                return log_error_errno(r, "Failed to query disk quota for UID " UID_FMT ": %m", h->uid);
+        else if (FLAGS_SET(req.dqb_valid, QIF_BLIMITS) && h->disk_size / QIF_DQBLKSIZE == req.dqb_bhardlimit) {
+                /* Shortcut things if everything is set up properly already */
+                log_info("Configured quota already matches the intended setting, not updating quota.");
+                return 0;
+        }
+        /* Only the block limits are marked valid for the update; soft and hard limit get the same value. */
+        req.dqb_valid = QIF_BLIMITS;
+        req.dqb_bsoftlimit = req.dqb_bhardlimit = h->disk_size / QIF_DQBLKSIZE;
+
+        r = quotactl_devnum(QCMD_FIXED(Q_SETQUOTA, USRQUOTA), devno, h->uid, &req);
+        if (r == -ESRCH)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "UID quota not available on %s.", path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set disk quota for UID " UID_FMT ": %m", h->uid);
+
+        log_info("Updated per-UID quota.");
+
+        return 0;
+}
+
+int home_update_quota_auto(UserRecord *h, const char *path) {
+        struct statfs sfs;
+        int r;
+
+        assert(h);
+        /* Picks the quota backend from the file system type of 'path' (or of the record's image path if NULL). */
+        if (h->disk_size == UINT64_MAX)
+                return 0;
+
+        if (!path) {
+                path = user_record_image_path(h);
+                if (!path)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Home record lacks image path.");
+        }
+
+        if (statfs(path, &sfs) < 0)
+                return log_error_errno(errno, "Failed to statfs() file system: %m");
+
+        if (is_fs_type(&sfs, XFS_SB_MAGIC) ||
+            is_fs_type(&sfs, EXT4_SUPER_MAGIC))
+                return home_update_quota_classic(h, path);
+
+        if (is_fs_type(&sfs, BTRFS_SUPER_MAGIC)) {
+
+                r = btrfs_is_subvol(path);
+                if (r < 0) /* the failure is carried in the return value, hence log 'r' rather than errno */
+                        return log_error_errno(r, "Failed to test if %s is a subvolume: %m", path);
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Directory %s is not a subvolume, cannot apply quota.", path);
+
+                return home_update_quota_btrfs(h, path);
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Type of directory %s not known, cannot apply quota.", path);
+}
diff --git a/src/home/homework-quota.h b/src/home/homework-quota.h
new file mode 100644
index 0000000..a21c9ba
--- /dev/null
+++ b/src/home/homework-quota.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "user-record.h"
+
+int home_update_quota_btrfs(UserRecord *h, const char *path);
+int home_update_quota_classic(UserRecord *h, const char *path);
+int home_update_quota_auto(UserRecord *h, const char *path);
diff --git a/src/home/homework.c b/src/home/homework.c
new file mode 100644
index 0000000..066483e
--- /dev/null
+++ b/src/home/homework.c
@@ -0,0 +1,1979 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "blockdev-util.h"
+#include "chown-recursive.h"
+#include "copy.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "filesystems.h"
+#include "fs-util.h"
+#include "home-util.h"
+#include "homework-cifs.h"
+#include "homework-directory.h"
+#include "homework-fido2.h"
+#include "homework-fscrypt.h"
+#include "homework-luks.h"
+#include "homework-mount.h"
+#include "homework-pkcs11.h"
+#include "homework.h"
+#include "libcrypt-util.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "missing_magic.h"
+#include "mount-util.h"
+#include "path-util.h"
+#include "recovery-key.h"
+#include "rm-rf.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "sync-util.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+#include "virt.h"
+
+/* Make sure a bad password always results in a 3s delay, no matter what */
+#define BAD_PASSWORD_DELAY_USEC (3 * USEC_PER_SEC)
+
+int user_record_authenticate(
+                UserRecord *h,
+                UserRecord *secret,
+                PasswordCache *cache,
+                bool strict_verify) {
+
+        bool need_password = false, need_recovery_key = false, need_token = false, need_pin = false,
+                need_protected_authentication_path_permitted = false, need_user_presence_permitted = false,
+                need_user_verification_permitted = false, pin_locked = false, pin_incorrect = false,
+                pin_incorrect_few_tries_left = false, pin_incorrect_one_try_left = false, token_action_timeout = false;
+        int r;
+
+        assert(h);
+        assert(secret);
+
+        /* Tries to authenticate a user record with the supplied secrets. i.e. checks whether at least one
+         * supplied plaintext passwords matches a hashed password field of the user record. Or if a
+         * configured PKCS#11 or FIDO2 token is around and can unlock the record.
+         *
+         * Note that the 'cache' parameter is both an input and output parameter: it contains lists of
+         * configured, decrypted PKCS#11/FIDO2 passwords. We typically have to call this function multiple
+         * times over the course of an operation (think: on login we authenticate the host user record, the
+         * record embedded in the LUKS record and the one embedded in $HOME). Hence we keep a list of
+         * passwords we already decrypted, so that we don't have to do the (slow and potentially interactive)
+         * PKCS#11/FIDO2 dance for the relevant token again and again. */
+        /* Returns 1 if a secret unlocked the record, 0 if it has no auth data and strict_verify is off, else an error. */
+        /* First, let's see if the supplied plain-text passwords work? */
+        r = user_record_test_password(h, secret);
+        if (r == -ENOKEY)
+                need_password = true;
+        else if (r == -ENXIO)
+                log_debug_errno(r, "User record has no hashed passwords, plaintext passwords not tested.");
+        else if (r < 0)
+                return log_error_errno(r, "Failed to validate password of record: %m");
+        else {
+                log_info("Provided password unlocks user record.");
+                return 1;
+        }
+
+        /* Similar, but test against the recovery keys */
+        r = user_record_test_recovery_key(h, secret);
+        if (r == -ENOKEY)
+                need_recovery_key = true;
+        else if (r == -ENXIO)
+                log_debug_errno(r, "User record has no recovery keys, plaintext passwords not tested against it.");
+        else if (r < 0)
+                return log_error_errno(r, "Failed to validate the recovery key of the record: %m");
+        else {
+                log_info("Provided password is a recovery key that unlocks the user record.");
+                return 1;
+        }
+
+        if (need_password && need_recovery_key)
+                log_info("None of the supplied plaintext passwords unlock the user record's hashed passwords or recovery keys.");
+        else if (need_password)
+                log_info("None of the supplied plaintext passwords unlock the user record's hashed passwords.");
+        else if (need_recovery_key) /* stay silent if the record has neither: nothing was tested in that case */
+                log_info("None of the supplied plaintext passwords unlock the user record's hashed recovery keys.");
+
+        /* Second, test cached PKCS#11 passwords */
+        for (size_t n = 0; n < h->n_pkcs11_encrypted_key; n++)
+                STRV_FOREACH(pp, cache->pkcs11_passwords) {
+                        r = test_password_one(h->pkcs11_encrypted_key[n].hashed_password, *pp);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to check supplied PKCS#11 password: %m");
+                        if (r > 0) {
+                                log_info("Previously acquired PKCS#11 password unlocks user record.");
+                                return 1;
+                        }
+                }
+
+        /* Third, test cached FIDO2 passwords */
+        for (size_t n = 0; n < h->n_fido2_hmac_salt; n++)
+                /* See if any of the previously calculated passwords work */
+                STRV_FOREACH(pp, cache->fido2_passwords) {
+                        r = test_password_one(h->fido2_hmac_salt[n].hashed_password, *pp);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to check supplied FIDO2 password: %m");
+                        if (r > 0) {
+                                log_info("Previously acquired FIDO2 password unlocks user record.");
+                                return 1;
+                        }
+                }
+
+        /* Fourth, let's see if any of the PKCS#11 security tokens are plugged in and help us */
+        for (size_t n = 0; n < h->n_pkcs11_encrypted_key; n++) {
+#if HAVE_P11KIT
+                _cleanup_(pkcs11_callback_data_release) struct pkcs11_callback_data data = {
+                        .user_record = h,
+                        .secret = secret,
+                        .encrypted_key = h->pkcs11_encrypted_key + n,
+                };
+
+                r = pkcs11_find_token(data.encrypted_key->uri, pkcs11_callback, &data);
+                switch (r) {
+                case -EAGAIN:
+                        need_token = true;
+                        break;
+                case -ENOANO:
+                        need_pin = true;
+                        break;
+                case -ERFKILL:
+                        need_protected_authentication_path_permitted = true;
+                        break;
+                case -EOWNERDEAD:
+                        pin_locked = true;
+                        break;
+                case -ENOLCK:
+                        pin_incorrect = true;
+                        break;
+                case -ETOOMANYREFS:
+                        pin_incorrect = pin_incorrect_few_tries_left = true;
+                        break;
+                case -EUCLEAN:
+                        pin_incorrect = pin_incorrect_few_tries_left = pin_incorrect_one_try_left = true;
+                        break;
+                default:
+                        if (r < 0)
+                                return r;
+
+                        r = test_password_one(data.encrypted_key->hashed_password, data.decrypted_password);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test PKCS#11 password: %m");
+                        if (r == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Configured PKCS#11 security token %s does not decrypt encrypted key correctly.", data.encrypted_key->uri);
+
+                        log_info("Decrypted password from PKCS#11 security token %s unlocks user record.", data.encrypted_key->uri);
+
+                        r = strv_extend(&cache->pkcs11_passwords, data.decrypted_password);
+                        if (r < 0)
+                                return log_oom();
+
+                        return 1;
+                }
+#else
+                need_token = true;
+                break;
+#endif
+        }
+
+        /* Fifth, let's see if any of the FIDO2 security tokens are plugged in and help us */
+        for (size_t n = 0; n < h->n_fido2_hmac_salt; n++) {
+#if HAVE_LIBFIDO2
+                _cleanup_(erase_and_freep) char *decrypted_password = NULL;
+
+                r = fido2_use_token(h, secret, h->fido2_hmac_salt + n, &decrypted_password);
+                switch (r) {
+                case -EAGAIN:
+                        need_token = true;
+                        break;
+                case -ENOANO:
+                        need_pin = true;
+                        break;
+                case -EOWNERDEAD:
+                        pin_locked = true;
+                        break;
+                case -ENOLCK:
+                        pin_incorrect = true;
+                        break;
+                case -EMEDIUMTYPE:
+                        need_user_presence_permitted = true;
+                        break;
+                case -ENOCSI:
+                        need_user_verification_permitted = true;
+                        break;
+                case -ENOSTR:
+                        token_action_timeout = true;
+                        break;
+                default:
+                        if (r < 0)
+                                return r;
+
+                        r = test_password_one(h->fido2_hmac_salt[n].hashed_password, decrypted_password);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test FIDO2 password: %m");
+                        if (r == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Configured FIDO2 security token does not decrypt encrypted key correctly.");
+
+                        log_info("Decrypted password from FIDO2 security token unlocks user record.");
+
+                        r = strv_extend(&cache->fido2_passwords, decrypted_password);
+                        if (r < 0)
+                                return log_oom();
+
+                        return 1;
+                }
+#else
+                need_token = true;
+                break;
+#endif
+        }
+
+        /* Ordered by "relevance", i.e. the most "important" or "interesting" error condition is returned. */
+        if (pin_incorrect_one_try_left)
+                return -EUCLEAN;
+        if (pin_incorrect_few_tries_left)
+                return -ETOOMANYREFS;
+        if (pin_incorrect)
+                return -ENOLCK;
+        if (pin_locked)
+                return -EOWNERDEAD;
+        if (token_action_timeout)
+                return -ENOSTR;
+        if (need_protected_authentication_path_permitted)
+                return -ERFKILL;
+        if (need_user_presence_permitted)
+                return -EMEDIUMTYPE;
+        if (need_user_verification_permitted)
+                return -ENOCSI;
+        if (need_pin)
+                return -ENOANO;
+        if (need_token)
+                return -EBADSLT;
+        if (need_password)
+                return -ENOKEY;
+        if (need_recovery_key)
+                return -EREMOTEIO;
+
+        /* Hmm, this means neither PKCS#11/FIDO2 nor classic hashed passwords or recovery keys were supplied,
+         * we cannot authenticate this reasonably */
+        if (strict_verify)
+                return log_debug_errno(SYNTHETIC_ERRNO(EKEYREVOKED),
+                                       "No hashed passwords, no recovery keys and no PKCS#11/FIDO2 tokens defined, cannot authenticate user record, refusing.");
+
+        /* If strict verification is off this means we are possibly in the case where we encountered an
+         * unfixated record, i.e. a synthetic one that accordingly lacks any authentication data. In this
+         * case, allow the authentication to pass for now, so that the second (or third) authentication level
+         * (the ones of the user record in the LUKS header or inside the home directory) will then catch
+         * invalid passwords. The second/third authentication always runs in strict verification mode. */
+        log_debug("No hashed passwords, no recovery keys and no PKCS#11 tokens defined in record, cannot authenticate user record. "
+                  "Deferring to embedded user record.");
+        return 0;
+}
+
+static void drop_caches_now(void) {
+        int k;
+
+        /* Asks the kernel to release file system caches right away. Writing "2" into
+         * /proc/sys/vm/drop_caches flushes dentries and inodes, but nothing beyond that. See
+         * https://docs.kernel.org/admin-guide/sysctl/vm.html for details. Failure is not fatal, it is
+         * only logged. */
+
+        k = write_string_file("/proc/sys/vm/drop_caches", "2\n", WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (k >= 0) {
+                log_debug("Dropped caches.");
+                return;
+        }
+
+        log_warning_errno(k, "Failed to drop caches, ignoring: %m");
+}
+
+int home_setup_undo_mount(HomeSetup *setup, int level) {
+        int k;
+
+        assert(setup);
+
+        /* Unmounts the mount tree below HOME_RUNTIME_WORK_DIR if we are the ones who set it up. Returns 0
+         * if there was nothing to undo, 1 if the tree was unmounted, and a negative error on failure. */
+
+        if (setup->undo_mount) {
+                k = umount_recursive(HOME_RUNTIME_WORK_DIR, 0);
+                if (k >= 0) {
+                        setup->undo_mount = false;
+                        return 1;
+                }
+
+                /* Only if a log level above debug is requested, then generate a message of our own here:
+                 * umount_recursive() does debug level logging anyway, no need to repeat that. */
+                if (level < LOG_DEBUG)
+                        return log_full_errno(level, k, "Failed to unmount mount tree below %s: %m", HOME_RUNTIME_WORK_DIR);
+
+                return k;
+        }
+
+        return 0;
+}
+
+int home_setup_undo_dm(HomeSetup *setup, int level) {
+        int k, undone = 0;
+
+        assert(setup);
+
+        /* Deactivates the LUKS device mapper device if we activated it, and releases the crypt device
+         * context. Returns 1 if a device was deactivated, 0 if there was none to deactivate, negative on
+         * failure (in which case the crypt device context is left in place). */
+
+        if (setup->undo_dm) {
+                assert(setup->crypt_device);
+                assert(setup->dm_name);
+
+                k = sym_crypt_deactivate_by_name(setup->crypt_device, setup->dm_name, 0);
+                if (k < 0)
+                        return log_full_errno(level, k, "Failed to deactivate LUKS device: %m");
+
+                /* The device might have been removed asynchronously already, by an early unmount via the
+                 * deferred remove logic. Give it some time to actually disappear. */
+                (void) wait_for_block_device_gone(setup, USEC_PER_SEC * 30);
+
+                setup->undo_dm = false;
+                undone = 1;
+        }
+
+        if (setup->crypt_device) {
+                sym_crypt_free(setup->crypt_device);
+                setup->crypt_device = NULL;
+        }
+
+        return undone;
+}
+
+int keyring_unlink(key_serial_t k) {
+
+        /* Unlinks the specified key from the session keyring, unless the serial is -1, i.e. already
+         * invalidated. Failures are logged at debug level and otherwise ignored. Always returns -1, the
+         * key_serial_t value for "invalid", so that callers can assign the result back to their
+         * variable. */
+
+        if (k != -1 &&
+            keyctl(KEYCTL_UNLINK, k, KEY_SPEC_SESSION_KEYRING, 0, 0) < 0)
+                log_debug_errno(errno, "Failed to unlink key from session kernel keyring, ignoring: %m");
+
+        return -1;
+}
+
+static int keyring_flush(UserRecord *h) {
+        _cleanup_free_ char *name = NULL;
+        long serial;
+
+        assert(h);
+
+        /* Looks up the per-user key ("homework-user-" followed by the user name) in the session keyring
+         * and unlinks it. Returns 0 once the key was found and the unlink was attempted, and a negative
+         * errno-style error if the name could not be allocated or the key could not be found. */
+
+        name = strjoin("homework-user-", h->user_name);
+        if (!name)
+                return log_oom();
+
+        serial = keyctl(KEYCTL_SEARCH, (unsigned long) KEY_SPEC_SESSION_KEYRING, (unsigned long) "user", (unsigned long) name, 0);
+        if (serial == -1)
+                return log_debug_errno(errno, "Failed to find kernel keyring entry for user, ignoring: %m");
+
+        /* keyring_unlink() always returns -1, the key_serial_t value for "invalid". That is not an error
+         * code, hence do not propagate it as our own result. Unlink failures are logged there. */
+        (void) keyring_unlink(serial);
+        return 0;
+}
+
+int home_setup_done(HomeSetup *setup) {
+        int r = 0, q;
+
+        assert(setup);
+
+        /* Tears down everything recorded in the HomeSetup object: syncs and closes the root fd, undoes
+         * the mount and the DM device, post-processes and closes the image fd, removes any temporary
+         * image file, unlinks the key and frees all remaining fields. Teardown continues even if a step
+         * fails. The return value is 0, or the error of the last failed step among those whose result is
+         * tracked (later failures overwrite earlier ones). */
+
+        if (setup->root_fd >= 0) {
+                if (setup->do_offline_fitrim) {
+                        q = run_fitrim(setup->root_fd);
+                        if (q < 0)
+                                r = q;
+                }
+
+                if (syncfs(setup->root_fd) < 0)
+                        log_debug_errno(errno, "Failed to synchronize home directory, ignoring: %m");
+
+                setup->root_fd = safe_close(setup->root_fd);
+        }
+
+        q = home_setup_undo_mount(setup, LOG_DEBUG);
+        if (q < 0)
+                r = q;
+
+        q = home_setup_undo_dm(setup, LOG_DEBUG);
+        if (q < 0)
+                r = q;
+
+        if (setup->image_fd >= 0) {
+                if (setup->do_offline_fallocate) {
+                        q = run_fallocate(setup->image_fd, NULL);
+                        if (q < 0)
+                                r = q;
+                }
+
+                if (setup->do_mark_clean) {
+                        q = run_mark_dirty(setup->image_fd, false);
+                        if (q < 0)
+                                r = q;
+                }
+
+                setup->image_fd = safe_close(setup->image_fd);
+        }
+
+        if (setup->temporary_image_path) {
+                if (unlink(setup->temporary_image_path) < 0)
+                        log_debug_errno(errno, "Failed to remove temporary image file '%s', ignoring: %m",
+                                        setup->temporary_image_path);
+
+                setup->temporary_image_path = mfree(setup->temporary_image_path);
+        }
+
+        setup->key_serial = keyring_unlink(setup->key_serial);
+
+        /* Reset the flags even if undoing failed above, so that a later invocation won't retry. */
+        setup->undo_mount = false;
+        setup->undo_dm = false;
+        setup->do_offline_fitrim = false;
+        setup->do_offline_fallocate = false;
+        setup->do_mark_clean = false;
+
+        setup->dm_name = mfree(setup->dm_name);
+        setup->dm_node = mfree(setup->dm_node);
+
+        setup->loop = loop_device_unref(setup->loop);
+
+        setup->volume_key = erase_and_free(setup->volume_key);
+        setup->volume_key_size = 0;
+
+        if (setup->do_drop_caches) {
+                /* Clear the flag like all the others above, so that invoking this function a second time
+                 * on the same object (for example explicitly first, and then once more as cleanup
+                 * handler) does not drop the caches twice. */
+                setup->do_drop_caches = false;
+                drop_caches_now();
+        }
+
+        setup->mount_suffix = mfree(setup->mount_suffix);
+
+        return r;
+}
+
+int home_setup(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                HomeSetup *setup,
+                PasswordCache *cache,
+                UserRecord **ret_header_home) {
+
+        int r;
+
+        /* Makes a home directory accessible (through the root_fd file descriptor, not by path!). The
+         * HomeSetup object passed in must not have been set up yet. */
+
+        assert(h);
+        assert(setup);
+        assert(!setup->loop);
+        assert(!setup->crypt_device);
+        assert(setup->root_fd < 0);
+        assert(!setup->undo_dm);
+        assert(!setup->undo_mount);
+
+        /* If we are the ones setting up the directory, we should also drop caches once we are done. A
+         * request to drop caches that was already recorded is never taken back here. */
+        if (!FLAGS_SET(flags, HOME_SETUP_ALREADY_ACTIVATED) &&
+            !setup->do_drop_caches &&
+            user_record_drop_caches(h))
+                setup->do_drop_caches = true;
+
+        switch (user_record_storage(h)) {
+
+        case USER_LUKS:
+                /* The LUKS backend receives ret_header_home directly, hence return right away. */
+                return home_setup_luks(h, flags, NULL, setup, cache, ret_header_home);
+
+        case USER_DIRECTORY:
+        case USER_SUBVOLUME:
+                r = home_setup_directory(h, setup);
+                break;
+
+        case USER_FSCRYPT:
+                r = home_setup_fscrypt(h, setup, cache);
+                break;
+
+        case USER_CIFS:
+                r = home_setup_cifs(h, flags, setup);
+                break;
+
+        default:
+                return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), "Processing home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+        }
+
+        /* None of the backends handled here produce a header record, hence return NULL for it on
+         * success. */
+        if (r >= 0 && ret_header_home)
+                *ret_header_home = NULL;
+
+        return r;
+}
+
+int home_sync_and_statfs(int root_fd, struct statfs *ret) {
+        assert(root_fd >= 0);
+
+        /* Syncs the file system to disk first, so that the disk space fstatfs() reports afterwards is
+         * accurate (matters for file systems such as btrfs where this is determined lazily). The statfs
+         * data is only queried if the caller passed a buffer for it. */
+
+        if (syncfs(root_fd) < 0)
+                return log_error_errno(errno, "Failed to synchronize file system: %m");
+
+        if (ret && fstatfs(root_fd, ret) < 0)
+                return log_error_errno(errno, "Failed to statfs() file system: %m");
+
+        log_info("Synchronized disk.");
+        return 0;
+}
+
+static int read_identity_file(int root_fd, JsonVariant **ret) {
+        _cleanup_fclose_ FILE *identity_file = NULL;
+        _cleanup_close_ int identity_fd = -EBADF;
+        /* Initialized explicitly: the parser is not guaranteed to fill these in if it fails before it
+         * starts processing any input, and we pass them to the error message below. */
+        unsigned line = 0, column = 0;
+        int r;
+
+        assert(root_fd >= 0);
+        assert(ret);
+
+        /* Opens the ".identity" file below root_fd (refusing symlinks and anything that is not a regular
+         * file), and parses it as JSON into *ret. Returns 0 on success, a negative error otherwise. */
+
+        identity_fd = openat(root_fd, ".identity", O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|O_NONBLOCK);
+        if (identity_fd < 0)
+                return log_error_errno(errno, "Failed to open .identity file in home directory: %m");
+
+        r = fd_verify_regular(identity_fd);
+        if (r < 0)
+                return log_error_errno(r, "Embedded identity file is not a regular file, refusing: %m");
+
+        identity_file = take_fdopen(&identity_fd, "r");
+        if (!identity_file)
+                return log_oom();
+
+        r = json_parse_file(identity_file, ".identity", JSON_PARSE_SENSITIVE, ret, &line, &column);
+        if (r < 0)
+                return log_error_errno(r, "[.identity:%u:%u] Failed to parse JSON data: %m", line, column);
+
+        log_info("Read embedded .identity file.");
+
+        return 0;
+}
+
+static int write_identity_file(int root_fd, JsonVariant *v, uid_t uid) {
+        _cleanup_(json_variant_unrefp) JsonVariant *normalized = NULL;
+        _cleanup_fclose_ FILE *identity_file = NULL;
+        _cleanup_close_ int identity_fd = -EBADF;
+        _cleanup_free_ char *fn = NULL;
+        int r;
+
+        assert(root_fd >= 0);
+        assert(v);
+
+        /* Writes the JSON record v as ".identity" file below root_fd. The data is written into a
+         * randomly named temporary file first, which is chown()ed to uid (both as owning user and group)
+         * and then renamed into place. If anything fails after the temporary file was created it is
+         * removed again. Returns 0 on success, a negative error otherwise. */
+
+        normalized = json_variant_ref(v);
+
+        /* Normalization failing is not fatal, we proceed with whatever the variable refers to then. */
+        r = json_variant_normalize(&normalized);
+        if (r < 0)
+                log_warning_errno(r, "Failed to normalize user record, ignoring: %m");
+
+        r = tempfn_random(".identity", NULL, &fn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate temporary file name for .identity file: %m");
+
+        identity_fd = openat(root_fd, fn, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0600);
+        if (identity_fd < 0)
+                return log_error_errno(errno, "Failed to create .identity file in home directory: %m");
+
+        identity_file = take_fdopen(&identity_fd, "w");
+        if (!identity_file) {
+                r = log_oom();
+                goto fail;
+        }
+
+        json_variant_dump(normalized, JSON_FORMAT_PRETTY, identity_file, NULL);
+
+        r = fflush_and_check(identity_file);
+        if (r < 0) {
+                log_error_errno(r, "Failed to write .identity file: %m");
+                goto fail;
+        }
+
+        if (fchown(fileno(identity_file), uid, uid) < 0) {
+                r = log_error_errno(errno, "Failed to change ownership of identity file: %m");
+                goto fail;
+        }
+
+        if (renameat(root_fd, fn, root_fd, ".identity") < 0) {
+                r = log_error_errno(errno, "Failed to move identity file into place: %m");
+                goto fail;
+        }
+
+        log_info("Wrote embedded .identity file.");
+
+        return 0;
+
+fail:
+        /* Don't leave the temporary file around */
+        (void) unlinkat(root_fd, fn, 0);
+        return r;
+}
+
+/* Loads the identity record embedded in the home directory (the .identity file below root_fd), checks that
+ * it is compatible with the host record h and that the supplied credentials unlock it, and then reconciles
+ * it with the host record and, if one is passed, the header record.
+ *
+ * On success returns 0, and hands out the loaded embedded record in *ret_embedded_home and the reconciled
+ * record in *ret_new_home (both are optional). On failure returns a negative error, among them -EREMCHG if
+ * the records do not match and -ESTALE if the reconciliation mode's freshness requirement is not met. */
+int home_load_embedded_identity(
+                UserRecord *h,
+                int root_fd,
+                UserRecord *header_home,
+                UserReconcileMode mode,
+                PasswordCache *cache,
+                UserRecord **ret_embedded_home,
+                UserRecord **ret_new_home) {
+
+        _cleanup_(user_record_unrefp) UserRecord *embedded_home = NULL, *intermediate_home = NULL, *new_home = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        int r;
+
+        assert(h);
+        assert(root_fd >= 0);
+
+        r = read_identity_file(root_fd, &v);
+        if (r < 0)
+                return r;
+
+        embedded_home = user_record_new();
+        if (!embedded_home)
+                return log_oom();
+
+        r = user_record_load(embedded_home, v, USER_RECORD_LOAD_EMBEDDED|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        if (!user_record_compatible(h, embedded_home))
+                return log_error_errno(SYNTHETIC_ERRNO(EREMCHG), "Embedded home record not compatible with host record, refusing.");
+
+        /* Insist that credentials the user supplies also unlocks any embedded records. */
+        r = user_record_authenticate(embedded_home, h, cache, /* strict_verify= */ true);
+        if (r < 0)
+                return r;
+        assert(r > 0); /* Insist that a password was verified */
+
+        /* At this point we have three records to deal with:
+         *
+         *      · The record we got passed from the host
+         *      · The record included in the LUKS header (only if LUKS is used)
+         *      · The record in the home directory itself (~/.identity)
+         *
+         *  Now we have to reconcile all three, and let the newest one win. */
+
+        if (header_home) {
+                /* Note we relax the requirements here. Instead of insisting that the host record is strictly
+                 * newer, let's also be OK if its equally new. If it is, we'll however insist that the
+                 * embedded record must be newer, so that we update at least one of the two. */
+
+                r = user_record_reconcile(h, header_home, mode == USER_RECONCILE_REQUIRE_NEWER ? USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL : mode, &intermediate_home);
+                if (r == -EREMCHG) /* this was supposed to be checked earlier already, but let's check this again */
+                        return log_error_errno(r, "Identity stored on host and in header don't match, refusing.");
+                if (r == -ESTALE)
+                        return log_error_errno(r, "Embedded identity record is newer than supplied record, refusing.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to reconcile host and header identities: %m");
+                if (r == USER_RECONCILE_EMBEDDED_WON)
+                        log_info("Reconciling header user identity completed (header version was newer).");
+                else if (r == USER_RECONCILE_HOST_WON) {
+                        log_info("Reconciling header user identity completed (host version was newer).");
+
+                        if (mode == USER_RECONCILE_REQUIRE_NEWER) /* Host version is newer than the header
+                                                                   * version, hence we'll update
+                                                                   * something. This means we can relax the
+                                                                   * requirements on the embedded
+                                                                   * identity. */
+                                mode = USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL;
+                } else {
+                        assert(r == USER_RECONCILE_IDENTICAL);
+                        log_info("Reconciling user identities completed (host and header version were identical).");
+                }
+
+                /* From here on, the result of the first reconciliation takes the role of the host record.
+                 * The reference is owned by intermediate_home and released on return. */
+                h = intermediate_home;
+        }
+
+        /* Second stage: reconcile the (possibly intermediate) host record with the embedded one, using the
+         * possibly relaxed mode. */
+        r = user_record_reconcile(h, embedded_home, mode, &new_home);
+        if (r == -EREMCHG)
+                return log_error_errno(r, "Identity stored on host and in home don't match, refusing.");
+        if (r == -ESTALE)
+                return log_error_errno(r, "Embedded identity record is equally new or newer than supplied record, refusing.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to reconcile host and embedded identities: %m");
+        if (r == USER_RECONCILE_EMBEDDED_WON)
+                log_info("Reconciling embedded user identity completed (embedded version was newer).");
+        else if (r == USER_RECONCILE_HOST_WON)
+                log_info("Reconciling embedded user identity completed (host version was newer).");
+        else {
+                assert(r == USER_RECONCILE_IDENTICAL);
+                log_info("Reconciling embedded user identity completed (host and embedded version were identical).");
+        }
+
+        /* Pass ownership of the records to the caller, where requested. */
+        if (ret_embedded_home)
+                *ret_embedded_home = TAKE_PTR(embedded_home);
+
+        if (ret_new_home)
+                *ret_new_home = TAKE_PTR(new_home);
+
+        return 0;
+}
+
+int home_store_embedded_identity(UserRecord *h, int root_fd, uid_t uid, UserRecord *old_home) {
+        _cleanup_(user_record_unrefp) UserRecord *stripped = NULL;
+        int k;
+
+        assert(h);
+        assert(root_fd >= 0);
+        assert(uid_is_valid(uid));
+
+        /* Extracts the embedded part of the record h and writes it as .identity file below root_fd,
+         * unless old_home is passed and already equals it. Returns 1 if the file was written, 0 if it
+         * was left as is, and a negative error on failure. */
+
+        k = user_record_clone(h, USER_RECORD_EXTRACT_EMBEDDED|USER_RECORD_PERMISSIVE, &stripped);
+        if (k < 0)
+                return log_error_errno(k, "Failed to determine new embedded record: %m");
+
+        if (!old_home || !user_record_equal(old_home, stripped)) {
+                /* The identity has changed (or there is nothing to compare against), let's update it in
+                 * the image.
+                 *
+                 * NOTE(review): the uid parameter is only validated above, the file ownership is taken
+                 * from h->uid. Confirm with the callers whether that is intended. */
+                k = write_identity_file(root_fd, stripped->json, h->uid);
+                if (k < 0)
+                        return k;
+
+                return 1;
+        }
+
+        log_debug("Not updating embedded home record.");
+        return 0;
+}
+
+static const char *file_system_type_fd(int fd) {
+        struct statfs sfs;
+
+        assert(fd >= 0);
+
+        /* Returns the name of the file system type the fd refers to, as determined by
+         * fs_type_to_string(), or NULL if fstatfs() fails (which is logged at debug level only). */
+
+        if (fstatfs(fd, &sfs) >= 0)
+                return fs_type_to_string(sfs.f_type);
+
+        log_debug_errno(errno, "Failed to statfs(): %m");
+        return NULL;
+}
+
+int home_extend_embedded_identity(UserRecord *h, UserRecord *used, HomeSetup *setup) {
+        const char *cipher = NULL, *cipher_mode = NULL;
+        uint64_t volume_key_size = UINT64_MAX;
+        int k;
+
+        assert(h);
+        assert(used);
+        assert(setup);
+
+        /* Adds a binding to the record h, describing the storage that was actually used: the parameters
+         * come from the record 'used' and from what was found during setup. */
+
+        /* The encryption parameters can only be queried if there is a crypt device context, otherwise
+         * they are passed on as unset. */
+        if (setup->crypt_device) {
+                cipher = sym_crypt_get_cipher(setup->crypt_device);
+                cipher_mode = sym_crypt_get_cipher_mode(setup->crypt_device);
+                volume_key_size = luks_volume_key_size_convert(setup->crypt_device);
+        }
+
+        k = user_record_add_binding(
+                        h,
+                        user_record_storage(used),
+                        user_record_image_path(used),
+                        setup->found_partition_uuid,
+                        setup->found_luks_uuid,
+                        setup->found_fs_uuid,
+                        cipher,
+                        cipher_mode,
+                        volume_key_size,
+                        file_system_type_fd(setup->root_fd),
+                        user_record_home_directory(used),
+                        used->uid,
+                        (gid_t) used->uid); /* the GID is always chosen identical to the UID here */
+        if (k < 0)
+                return log_error_errno(k, "Failed to update binding in record: %m");
+
+        return 0;
+}
+
+static int chown_recursive_directory(int root_fd, uid_t uid) {
+        int k;
+
+        assert(root_fd >= 0);
+        assert(uid_is_valid(uid));
+
+        /* Recursively changes ownership of the tree behind root_fd to the specified UID, using the same
+         * numeric value as GID. Logs whether anything had to be done. Returns 0 on success, negative on
+         * failure. */
+
+        k = fd_chown_recursive(root_fd, uid, (gid_t) uid, 0777);
+        if (k < 0)
+                return log_error_errno(k, "Failed to change ownership of files and directories: %m");
+
+        if (k > 0)
+                log_info("Recursive changing of ownership completed.");
+        else
+                log_info("Recursive changing of ownership not necessary, skipped.");
+
+        return 0;
+}
+
+int home_maybe_shift_uid(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                HomeSetup *setup) {
+
+        _cleanup_close_ int mount_fd = -EBADF;
+        struct stat st;
+
+        assert(h);
+        assert(setup);
+        assert(setup->root_fd >= 0);
+
+        /* Tries to apply a UID shift to the home directory mount, mapping the UID that currently owns
+         * the top-level directory to the UID of the record. If the shift cannot be applied this is not
+         * considered an error. */
+
+        /* A home directory that is already activated has the UID shift applied already. */
+        if (FLAGS_SET(flags, HOME_SETUP_ALREADY_ACTIVATED))
+                return 0;
+
+        if (fstat(setup->root_fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat() home directory: %m");
+
+        /* Shift the UIDs of this mount, which hopefully makes the later chowning unnecessary. (We prefer
+         * to do the UID mapping even if the UID already matches our goal UID, because we want to leave
+         * UIDs in the homed managed range unmapped.) */
+        (void) home_shift_uid(setup->root_fd, NULL, st.st_uid, h->uid, &mount_fd);
+
+        /* No mount fd means no shift took place, keep using the root fd we already have. */
+        if (mount_fd < 0)
+                return 0;
+
+        /* Otherwise we have a reference to the new mount now, which can be used like an O_PATH fd to
+         * the new dir. Replace our root fd by a proper O_DIRECTORY fd derived from it. */
+        safe_close(setup->root_fd);
+
+        setup->root_fd = fd_reopen(mount_fd, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        if (setup->root_fd < 0)
+                return log_error_errno(setup->root_fd, "Failed to convert mount fd into regular directory fd: %m");
+
+        return 0;
+}
+
+int home_refresh(
+                UserRecord *h,
+                HomeSetupFlags flags,
+                HomeSetup *setup,
+                UserRecord *header_home,
+                PasswordCache *cache,
+                struct statfs *ret_statfs,
+                UserRecord **ret_new_home) {
+
+        _cleanup_(user_record_unrefp) UserRecord *embedded_home = NULL, *new_home = NULL;
+        int r;
+
+        assert(h);
+        assert(setup);
+        assert(ret_new_home);
+
+        /* Does the identity work when activating a home directory: loads the identity from the $HOME
+         * directory, reconciles it with our idea, stores the result in the header and in $HOME again,
+         * chown()s everything and finally syncs. Each step only runs if all previous ones succeeded. */
+
+        r = home_load_embedded_identity(h, setup->root_fd, header_home, USER_RECONCILE_ANY, cache, &embedded_home, &new_home);
+        if (r >= 0)
+                r = home_maybe_shift_uid(h, flags, setup);
+        if (r >= 0)
+                r = home_store_header_identity_luks(new_home, setup, header_home);
+        if (r >= 0)
+                r = home_store_embedded_identity(new_home, setup->root_fd, h->uid, embedded_home);
+        if (r >= 0)
+                r = chown_recursive_directory(setup->root_fd, h->uid);
+        if (r >= 0)
+                r = home_sync_and_statfs(setup->root_fd, ret_statfs);
+        if (r < 0)
+                return r;
+
+        /* Hand the reconciled record over to the caller */
+        *ret_new_home = TAKE_PTR(new_home);
+        return 0;
+}
+
+static int home_activate(UserRecord *h, UserRecord **ret_home) {
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(user_record_unrefp) UserRecord *new_home = NULL;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        HomeSetupFlags flags = 0;
+        int r;
+
+        assert(h);
+
+        /* Activates the home directory described by the record, via the backend matching its storage
+         * type. Returns 1 and the updated record in *ret_home if the identity changed, 0 and NULL if it
+         * did not, and a negative error on failure. */
+
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks user name, refusing.");
+        if (!uid_is_valid(h->uid))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks UID, refusing.");
+        if (!IN_SET(user_record_storage(h), USER_LUKS, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT, USER_CIFS))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Activating home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+
+        /* First check the supplied credentials against the record itself, non-strictly. */
+        r = user_record_authenticate(h, h, &cache, /* strict_verify= */ false);
+        if (r < 0)
+                return r;
+
+        r = user_record_test_home_directory_and_warn(h);
+        if (r < 0)
+                return r;
+        if (r == USER_TEST_MOUNTED)
+                return log_error_errno(SYNTHETIC_ERRNO(EALREADY), "Home directory %s is already mounted, refusing.", user_record_home_directory(h));
+
+        r = user_record_test_image_path_and_warn(h);
+        if (r < 0)
+                return r;
+        if (r == USER_TEST_ABSENT)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Image path %s is missing, refusing.", user_record_image_path(h));
+
+        /* Dispatch to the backend. The storage type was validated above, hence anything else cannot
+         * show up here. */
+        switch (user_record_storage(h)) {
+
+        case USER_LUKS:
+                r = home_activate_luks(h, flags, &setup, &cache, &new_home);
+                break;
+
+        case USER_SUBVOLUME:
+        case USER_DIRECTORY:
+        case USER_FSCRYPT:
+                r = home_activate_directory(h, flags, &setup, &cache, &new_home);
+                break;
+
+        case USER_CIFS:
+                r = home_activate_cifs(h, flags, &setup, &cache, &new_home);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+        if (r < 0)
+                return r;
+
+        /* Note that the returned object might either be a reference to an updated version of the existing
+         * home object, or a reference to a newly allocated home object. The caller has to be able to deal
+         * with both, and consider the old object out-of-date. */
+        if (!user_record_equal(h, new_home)) {
+                *ret_home = TAKE_PTR(new_home);
+                return 1; /* identity updated */
+        }
+
+        *ret_home = NULL;
+        return 0; /* no identity change */
+}
+
+/* Deactivates the home directory described by the record: unmounts it if it is mounted, for LUKS storage
+ * also invokes the LUKS deactivation, and flushes the per-user key from the keyring. If 'force' is set the
+ * unmount calls are issued with MNT_FORCE|MNT_DETACH. Returns 0 on success, -ENOEXEC if there was nothing
+ * to deactivate, or another negative error on failure. */
+static int home_deactivate(UserRecord *h, bool force) {
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        bool done = false; /* whether we actually deactivated anything */
+        int r;
+
+        assert(h);
+
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record incomplete, refusing.");
+        if (!IN_SET(user_record_storage(h), USER_LUKS, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT, USER_CIFS))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Deactivating home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+
+        r = user_record_test_home_directory_and_warn(h);
+        if (r < 0)
+                return r;
+        if (r == USER_TEST_MOUNTED) {
+                /* Before we do anything, let's move the home mount away. */
+                r = home_unshare_and_mkdir();
+                if (r < 0)
+                        return r;
+
+                r = mount_nofollow_verbose(LOG_ERR, user_record_home_directory(h), HOME_RUNTIME_WORK_DIR, NULL, MS_BIND, NULL);
+                if (r < 0)
+                        return r;
+
+                setup.undo_mount = true; /* remember to unmount the new bind mount from HOME_RUNTIME_WORK_DIR */
+
+                /* Let's explicitly open the new root fs, using the moved path */
+                setup.root_fd = open(HOME_RUNTIME_WORK_DIR, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+                if (setup.root_fd < 0)
+                        return log_error_errno(errno, "Failed to open moved home directory: %m");
+
+                /* Now get rid of the home at its original place (we only keep the bind mount we created above) */
+                r = umount_verbose(LOG_ERR, user_record_home_directory(h), UMOUNT_NOFOLLOW | (force ? MNT_FORCE|MNT_DETACH : 0));
+                if (r < 0)
+                        return r;
+
+                if (user_record_storage(h) == USER_LUKS) {
+                        /* Automatically shrink on logout if that's enabled. To be able to shrink we need the
+                         * keys to the device. */
+                        password_cache_load_keyring(h, &cache);
+                        (void) home_trim_luks(h, &setup); /* best effort, failure is ignored */
+                }
+
+                /* Sync explicitly, so that the drop caches logic below can work as documented */
+                if (syncfs(setup.root_fd) < 0)
+                        log_debug_errno(errno, "Failed to synchronize home directory, ignoring: %m");
+                else
+                        log_info("Syncing completed.");
+
+                if (user_record_storage(h) == USER_LUKS)
+                        (void) home_auto_shrink_luks(h, &setup, &cache); /* best effort, failure is ignored */
+
+                /* Close the fd before unmounting the bind mount it refers to */
+                setup.root_fd = safe_close(setup.root_fd);
+
+                /* Now get rid of the bind mount, too */
+                r = umount_verbose(LOG_ERR, HOME_RUNTIME_WORK_DIR, UMOUNT_NOFOLLOW | (force ? MNT_FORCE|MNT_DETACH : 0));
+                if (r < 0)
+                        return r;
+
+                setup.undo_mount = false; /* Remember that the bind mount doesn't need to be unmounted anymore */
+
+                if (user_record_drop_caches(h))
+                        setup.do_drop_caches = true;
+
+                log_info("Unmounting completed.");
+                done = true;
+        } else
+                log_info("Directory %s is already unmounted.", user_record_home_directory(h));
+
+        if (user_record_storage(h) == USER_LUKS) {
+                r = home_deactivate_luks(h, &setup);
+                if (r < 0)
+                        return r;
+                if (r > 0) /* a positive result counts as "something was deactivated" */
+                        done = true;
+        }
+
+        /* Explicitly flush any per-user key from the keyring */
+        (void) keyring_flush(h);
+
+        if (!done)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOEXEC), "Home is not active.");
+
+        /* Drop the caches right here, and turn the flag off again first, so that the home_setup_done()
+         * cleanup handler does not drop them a second time when we return. */
+        if (setup.do_drop_caches) {
+                setup.do_drop_caches = false;
+                drop_caches_now();
+        }
+
+        log_info("Everything completed.");
+        return 0;
+}
+
+static int copy_skel(int root_fd, const char *skel) {
+        int k;
+
+        assert(root_fd >= 0);
+
+        /* Copies the contents of the skeleton directory into the directory root_fd refers to, merging
+         * with and replacing what is already there. A missing skeleton directory is not an error. */
+
+        k = copy_tree_at(AT_FDCWD, skel, root_fd, ".", UID_INVALID, GID_INVALID, COPY_MERGE|COPY_REPLACE, NULL, NULL);
+        if (k >= 0) {
+                log_info("Copying in %s completed.", skel);
+                return 0;
+        }
+
+        if (k == -ENOENT) {
+                log_info("Skeleton directory %s missing, ignoring.", skel);
+                return 0;
+        }
+
+        return log_error_errno(k, "Failed to copy in %s: %m", skel);
+}
+
+static int change_access_mode(int root_fd, mode_t m) {
+        assert(root_fd >= 0);
+
+        /* Applies the access mode to the directory the fd refers to, and logs the new mode in octal. */
+
+        if (fchmod(root_fd, m) >= 0) {
+                log_info("Changed top-level directory access mode to 0%o.", m);
+                return 0;
+        }
+
+        return log_error_errno(errno, "Failed to change access mode of top-level directory: %m");
+}
+
+int home_populate(UserRecord *h, int dir_fd) {
+        int r;
+
+        assert(h);
+        assert(dir_fd >= 0);
+
+        /* Fills a home directory with its initial contents: copies in the skeleton directory, writes the
+         * embedded identity, makes everything owned by the user and finally applies the access mode
+         * configured in the record. Each step only runs if all previous ones succeeded. Returns 0 on
+         * success, and the error of the first failing step otherwise. */
+
+        r = copy_skel(dir_fd, user_record_skeleton_directory(h));
+        if (r >= 0)
+                r = home_store_embedded_identity(h, dir_fd, h->uid, NULL);
+        if (r >= 0)
+                r = chown_recursive_directory(dir_fd, h->uid);
+        if (r >= 0)
+                r = change_access_mode(dir_fd, user_record_access_mode(h));
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int user_record_compile_effective_passwords(
+                UserRecord *h,
+                PasswordCache *cache,
+                char ***ret_effective_passwords) {
+        /* Verifies that every secret defined in h is matched by a supplied plaintext or by a token. */
+        _cleanup_strv_free_erase_ char **effective = NULL;
+        size_t n;
+        int r;
+        /* Returns 0 on success; the matched plaintexts go to *ret_effective_passwords if that is non-NULL. */
+        assert(h);
+        assert(cache);
+
+        /* We insist on at least one classic hashed password to be defined in addition to any PKCS#11 one, as
+         * a safe fallback, but also to simplify the password changing algorithm: there we require providing
+         * the old literal password only (and do not care for the old PKCS#11 token) */
+
+        if (strv_isempty(h->hashed_password))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "User record has no hashed passwords, refusing.");
+
+        /* Generates the list of plaintext passwords to propagate to LUKS/fscrypt devices, and checks whether
+         * we have a plaintext password for each hashed one. If we are missing one we'll fail, since we
+         * couldn't sync fscrypt/LUKS to the login account properly. */
+
+        STRV_FOREACH(i, h->hashed_password) {
+                bool found = false;
+
+                log_debug("Looking for plaintext password for: %s", *i);
+
+                /* Let's scan all provided plaintext passwords */
+                STRV_FOREACH(j, h->password) {
+                        r = test_password_one(*i, *j);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test plaintext password: %m");
+                        if (r > 0) {
+                                if (ret_effective_passwords) {
+                                        r = strv_extend(&effective, *j);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+
+                                log_debug("Found literal plaintext password.");
+                                found = true;
+                                break;
+                        }
+                }
+
+                if (!found)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOKEY), "Missing plaintext password for defined hashed password");
+        }
+        /* Recovery keys are looked for in the same list of supplied plaintext passwords. */
+        for (n = 0; n < h->n_recovery_key; n++) {
+                bool found = false;
+
+                log_debug("Looking for plaintext recovery key for: %s", h->recovery_key[n].hashed_password);
+
+                STRV_FOREACH(j, h->password) {
+                        _cleanup_(erase_and_freep) char *mangled = NULL;
+                        const char *p;
+                        /* "modhex64" keys are tested in normalized form; input that does not normalize is skipped. */
+                        if (streq(h->recovery_key[n].type, "modhex64")) {
+
+                                r = normalize_recovery_key(*j, &mangled);
+                                if (r == -EINVAL) /* Not properly formatted, probably a regular password. */
+                                        continue;
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to normalize recovery key: %m");
+
+                                p = mangled;
+                        } else
+                                p = *j;
+
+                        r = test_password_one(h->recovery_key[n].hashed_password, p);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test plaintext recovery key: %m");
+                        if (r > 0) {
+                                if (ret_effective_passwords) {
+                                        r = strv_extend(&effective, p);
+                                        if (r < 0)
+                                                return log_oom();
+                                }
+
+                                log_debug("Found plaintext recovery key.");
+                                found = true;
+                                break;
+                        }
+                }
+
+                if (!found)
+                        return log_error_errno(SYNTHETIC_ERRNO(EREMOTEIO), "Missing plaintext recovery key for defined recovery key");
+        }
+        /* PKCS#11 entries: without HAVE_P11KIT the first configured entry makes us return -EBADSLT. */
+        for (n = 0; n < h->n_pkcs11_encrypted_key; n++) {
+#if HAVE_P11KIT
+                _cleanup_(pkcs11_callback_data_release) struct pkcs11_callback_data data = {
+                        .user_record = h,
+                        .secret = h,
+                        .encrypted_key = h->pkcs11_encrypted_key + n,
+                };
+                /* -EAGAIN from the token lookup is turned into -EBADSLT for the caller. */
+                r = pkcs11_find_token(data.encrypted_key->uri, pkcs11_callback, &data);
+                if (r == -EAGAIN)
+                        return -EBADSLT;
+                if (r < 0)
+                        return r;
+
+                r = test_password_one(data.encrypted_key->hashed_password, data.decrypted_password);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to test PKCS#11 password: %m");
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Decrypted password from token is not correct, refusing.");
+
+                if (ret_effective_passwords) {
+                        r = strv_extend(&effective, data.decrypted_password);
+                        if (r < 0)
+                                return log_oom();
+                }
+                /* The cache is extended in any case, even if the caller did not ask for the effective list. */
+                r = strv_extend(&cache->pkcs11_passwords, data.decrypted_password);
+                if (r < 0)
+                        return log_oom();
+#else
+                return -EBADSLT;
+#endif
+        }
+        /* FIDO2 entries follow the same scheme, compiled in with HAVE_LIBFIDO2 only. */
+        for (n = 0; n < h->n_fido2_hmac_salt; n++) {
+#if HAVE_LIBFIDO2
+                _cleanup_(erase_and_freep) char *decrypted_password = NULL;
+
+                r = fido2_use_token(h, h, h->fido2_hmac_salt + n, &decrypted_password);
+                if (r < 0)
+                        return r;
+
+                r = test_password_one(h->fido2_hmac_salt[n].hashed_password, decrypted_password);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to test FIDO2 password: %m");
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Decrypted password from token is not correct, refusing.");
+
+                if (ret_effective_passwords) {
+                        r = strv_extend(&effective, decrypted_password);
+                        if (r < 0)
+                                return log_oom();
+                }
+
+                r = strv_extend(&cache->fido2_passwords, decrypted_password);
+                if (r < 0)
+                        return log_oom();
+#else
+                return -EBADSLT;
+#endif
+        }
+        /* Pass ownership of the collected list to the caller, if it was requested. */
+        if (ret_effective_passwords)
+                *ret_effective_passwords = TAKE_PTR(effective);
+
+        return 0;
+}
+
+static int determine_default_storage(UserStorage *ret) {
+        const char *env;
+        int r;
+
+        /* Picks the storage type for a new home when the record names none. Always stores a value in
+         * *ret and returns 0, except if container detection itself fails (negative error). */
+        assert(ret);
+
+        /* First choice: what homed passed down to us in the environment, provided it parses. */
+        env = getenv("SYSTEMD_HOME_DEFAULT_STORAGE");
+        if (env) {
+                UserStorage configured = user_storage_from_string(env);
+
+                if (configured >= 0) {
+                        log_info("Using configured default storage '%s'.", user_storage_to_string(configured));
+                        *ret = configured;
+                        return 0;
+                }
+
+                log_warning("$SYSTEMD_HOME_DEFAULT_STORAGE set to invalid storage type, ignoring: %s", env);
+        }
+
+        /* Second choice: LUKS, but not in a container (no loopback devices or LUKS/DM there), not if the
+         * home root is encrypted already (that would encrypt twice), and not if libcryptsetup cannot be
+         * loaded. Most operations assume "classic" storage by default; only home creation prefers LUKS. */
+        r = detect_container();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether we are in a container: %m");
+        if (r > 0)
+                log_info("Running in container, not using '%s' storage.", user_storage_to_string(USER_LUKS));
+        else {
+                r = path_is_encrypted(get_home_root());
+                if (r > 0)
+                        log_info("%s is encrypted, not using '%s' storage, in order to avoid double encryption.", get_home_root(), user_storage_to_string(USER_LUKS));
+                else {
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to determine if %s is encrypted, ignoring: %m", get_home_root());
+
+                        if (dlopen_cryptsetup() >= 0) {
+                                log_info("Using automatic default storage of '%s'.", user_storage_to_string(USER_LUKS));
+                                *ret = USER_LUKS;
+                                return 0;
+                        }
+
+                        log_info("Not using '%s' storage, since libcryptsetup could not be loaded.", user_storage_to_string(USER_LUKS));
+                }
+        }
+
+        /* Last resort: a subvolume if the home root is on btrfs, a plain directory in all other cases. */
+        r = path_is_fs_type(get_home_root(), BTRFS_SUPER_MAGIC);
+        if (r > 0) {
+                log_info("%s is on btrfs, using '%s' as storage.", get_home_root(), user_storage_to_string(USER_SUBVOLUME));
+                *ret = USER_SUBVOLUME;
+                return 0;
+        }
+        if (r < 0)
+                log_warning_errno(r, "Failed to determine file system of %s, ignoring: %m", get_home_root());
+
+        log_info("%s is on simple file system, using '%s' as storage.", get_home_root(), user_storage_to_string(USER_DIRECTORY));
+        *ret = USER_DIRECTORY;
+        return 0;
+}
+
+static int home_create(UserRecord *h, UserRecord **ret_home) {
+        _cleanup_strv_free_erase_ char **effective_passwords = NULL;
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(user_record_unrefp) UserRecord *new_home = NULL;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        UserStorage new_storage = _USER_STORAGE_INVALID;
+        const char *new_fs = NULL;
+        int r;
+
+        assert(h);
+        assert(ret_home); /* written to unconditionally on success */
+        /* Creates the home for h. Returns 1 plus a new record in *ret_home if the record changed, else 0. */
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks name, refusing.");
+        if (!uid_is_valid(h->uid))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks UID, refusing.");
+        /* Collect the plaintext secrets matching the record's hashed ones; used by LUKS and fscrypt below. */
+        r = user_record_compile_effective_passwords(h, &cache, &effective_passwords);
+        if (r < 0)
+                return r;
+
+        r = user_record_test_home_directory_and_warn(h);
+        if (r < 0)
+                return r;
+        if (r != USER_TEST_ABSENT)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Home directory %s already exists, refusing.", user_record_home_directory(h));
+        /* No storage type in the record: pick a default one. */
+        if (h->storage < 0) {
+                r = determine_default_storage(&new_storage);
+                if (r < 0)
+                        return r;
+        }
+        /* LUKS without an explicit file system type: take the default from the environment, if set. */
+        if ((h->storage == USER_LUKS ||
+             (h->storage < 0 && new_storage == USER_LUKS)) &&
+            !h->file_system_type)
+                new_fs = getenv("SYSTEMD_HOME_DEFAULT_FILE_SYSTEM_TYPE");
+        /* Store the defaults picked above in the record; either of the two may be the only one set. */
+        if (new_storage >= 0 || new_fs) {
+                r = user_record_add_binding(
+                                h,
+                                new_storage,
+                                NULL,
+                                SD_ID128_NULL,
+                                SD_ID128_NULL,
+                                SD_ID128_NULL,
+                                NULL,
+                                NULL,
+                                UINT64_MAX,
+                                new_fs,
+                                NULL,
+                                UID_INVALID,
+                                GID_INVALID);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add default storage and file system type to user record: %m");
+        }
+
+        r = user_record_test_image_path_and_warn(h);
+        if (r < 0)
+                return r;
+        if (!IN_SET(r, USER_TEST_ABSENT, USER_TEST_UNDEFINED, USER_TEST_MAYBE))
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Image path %s already exists, refusing.", user_record_image_path(h));
+
+        switch (user_record_storage(h)) {
+
+        case USER_LUKS:
+                r = home_create_luks(h, &setup, &cache, effective_passwords, &new_home);
+                break;
+
+        case USER_DIRECTORY:
+        case USER_SUBVOLUME:
+                r = home_create_directory_or_subvolume(h, &setup, &new_home);
+                break;
+
+        case USER_FSCRYPT:
+                r = home_create_fscrypt(h, &setup, effective_passwords, &new_home);
+                break;
+
+        case USER_CIFS:
+                r = home_create_cifs(h, &setup, &new_home);
+                break;
+
+        default:
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Creating home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+        }
+        if (r < 0)
+                return r;
+        /* Tell the caller whether creation changed the record (1, new record returned) or not (0, NULL). */
+        if (user_record_equal(h, new_home)) {
+                *ret_home = NULL;
+                return 0;
+        }
+
+        *ret_home = TAKE_PTR(new_home);
+        return 1;
+}
+
+static int home_remove(UserRecord *h) {
+        bool deleted = false;
+        const char *ip, *hd;
+        int r;
+
+        assert(h);
+        /* Removes the home of h. Returns 0 if anything was deleted, -EALREADY if nothing was there. */
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks user name, refusing.");
+        if (!IN_SET(user_record_storage(h), USER_LUKS, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT, USER_CIFS))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Removing home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+
+        hd = user_record_home_directory(h);
+
+        r = user_record_test_home_directory_and_warn(h);
+        if (r < 0)
+                return r;
+        if (r == USER_TEST_MOUNTED)
+                return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Directory %s is still mounted, refusing.", hd);
+
+        assert(hd);
+
+        r = user_record_test_image_path_and_warn(h);
+        if (r < 0)
+                return r;
+
+        ip = user_record_image_path(h);
+
+        switch (user_record_storage(h)) {
+
+        case USER_LUKS: {
+                struct stat st;
+
+                assert(ip);
+                /* Regular image files are unlinked, block devices are left alone, anything else is refused. */
+                if (stat(ip, &st) < 0) {
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to stat() %s: %m", ip);
+
+                } else {
+                        if (S_ISREG(st.st_mode)) {
+                                if (unlink(ip) < 0) {
+                                        if (errno != ENOENT)
+                                                return log_error_errno(errno, "Failed to remove %s: %m", ip);
+                                } else {
+                                        _cleanup_free_ char *parent = NULL;
+
+                                        deleted = true;
+                                        /* Best effort: sync the parent directory, failures are only logged. */
+                                        r = path_extract_directory(ip, &parent);
+                                        if (r < 0)
+                                                log_debug_errno(r, "Failed to determine parent directory of '%s', ignoring: %m", ip);
+                                        else {
+                                                r = fsync_path_at(AT_FDCWD, parent);
+                                                if (r < 0)
+                                                        log_debug_errno(r, "Failed to synchronize disk after deleting '%s', ignoring: %m", ip);
+                                        }
+                                }
+
+                        } else if (S_ISBLK(st.st_mode))
+                                log_info("Not removing file system on block device %s.", ip);
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Image file %s is neither block device, nor regular, refusing removal.", ip);
+                }
+
+                break;
+        }
+
+        case USER_SUBVOLUME:
+        case USER_DIRECTORY:
+        case USER_FSCRYPT:
+                assert(ip);
+
+                r = rm_rf(ip, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_SYNCFS);
+                if (r < 0) {
+                        if (r != -ENOENT)
+                                return log_warning_errno(r, "Failed to remove %s: %m", ip);
+                } else
+                        deleted = true;
+
+                /* If the image path and the home directory are the same invalidate the home directory, so
+                 * that we don't remove it anymore */
+                if (path_equal(ip, hd))
+                        hd = NULL;
+
+                break;
+
+        case USER_CIFS:
+                /* Nothing else to do here: we won't remove remote stuff. */
+                log_info("Not removing home directory on remote server.");
+                break;
+
+        default:
+                assert_not_reached();
+        }
+        /* Remove the home directory itself too; a failure other than ENOENT is fatal, not ignored. */
+        if (hd) {
+                if (rmdir(hd) < 0) {
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to remove %s: %m", hd);
+                } else
+                        deleted = true;
+        }
+
+        if (deleted) {
+                if (user_record_drop_caches(h))
+                        drop_caches_now();
+
+                log_info("Everything completed.");
+        } else
+                return log_notice_errno(SYNTHETIC_ERRNO(EALREADY),
+                                        "Nothing to remove.");
+
+        return 0;
+}
+
+static int home_validate_update(UserRecord *h, HomeSetup *setup, HomeSetupFlags *flags) {
+        bool mounted;
+        int r;
+
+        /* Sanity checks shared by the operations on an existing home. Returns > 0 if the home is mounted
+         * already, 0 if it is not, negative on error. If flags is non-NULL, HOME_SETUP_ALREADY_ACTIVATED
+         * in it is updated through SET_FLAG() to reflect the same fact. */
+        assert(h);
+        assert(setup);
+
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks user name, refusing.");
+        if (!uid_is_valid(h->uid))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record lacks UID, refusing.");
+        if (!IN_SET(user_record_storage(h), USER_LUKS, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT, USER_CIFS))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Processing home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+
+        r = user_record_test_home_directory_and_warn(h);
+        if (r < 0)
+                return r;
+        mounted = r == USER_TEST_MOUNTED;
+
+        r = user_record_test_image_path_and_warn(h);
+        if (r < 0)
+                return r;
+        if (r == USER_TEST_ABSENT)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Image path %s does not exist", user_record_image_path(h));
+
+        switch (user_record_storage(h)) {
+        case USER_LUKS:
+                /* What the LUKS code reports has to agree with what we saw of the mount. */
+                r = home_get_state_luks(h, setup);
+                if (r < 0)
+                        return r;
+                if (mounted != (r > 0))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Home mount incompletely set up.");
+                break;
+
+        case USER_CIFS:
+        case USER_FSCRYPT:
+        case USER_SUBVOLUME:
+        case USER_DIRECTORY:
+                break; /* no further state to cross-check */
+
+        default:
+                assert_not_reached();
+        }
+
+        if (flags)
+                SET_FLAG(*flags, HOME_SETUP_ALREADY_ACTIVATED, mounted);
+
+        return mounted; /* true if the home record is active already */
+}
+
+static int home_update(UserRecord *h, UserRecord **ret) {
+        _cleanup_(user_record_unrefp) UserRecord *updated = NULL, *from_header = NULL, *from_embedded = NULL;
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        HomeSetupFlags flags = 0;
+        int r;
+
+        /* Writes an updated user record into the home of h. Returns 0 and that record in *ret. */
+        assert(h);
+        assert(ret);
+
+        r = user_record_authenticate(h, h, &cache, /* strict_verify= */ true);
+        if (r < 0)
+                return r;
+        assert(r > 0); /* strict verification: a password must really have been checked */
+        /* Validate the record and learn whether the home is active already (noted in flags). */
+        r = home_validate_update(h, &setup, &flags);
+        if (r < 0)
+                return r;
+        /* Set up access to the home; from_header receives the record home_setup() hands back. */
+        r = home_setup(h, flags, &setup, &cache, &from_header);
+        if (r < 0)
+                return r;
+        /* Reconcile with the record embedded in the home, in USER_RECONCILE_REQUIRE_NEWER mode. */
+        r = home_load_embedded_identity(h, setup.root_fd, from_header, USER_RECONCILE_REQUIRE_NEWER, &cache, &from_embedded, &updated);
+        if (r < 0)
+                return r;
+
+        r = home_maybe_shift_uid(h, flags, &setup);
+        if (r < 0)
+                return r;
+        /* Store the result in both places, passing along what was found there before. */
+        r = home_store_header_identity_luks(updated, &setup, from_header);
+        if (r < 0)
+                return r;
+
+        r = home_store_embedded_identity(updated, setup.root_fd, h->uid, from_embedded);
+        if (r < 0)
+                return r;
+
+        r = home_extend_embedded_identity(updated, h, &setup);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup.root_fd, NULL);
+        if (r < 0)
+                return r;
+        /* Called explicitly (not only via _cleanup_) so that its error can be returned. */
+        r = home_setup_done(&setup);
+        if (r < 0)
+                return r;
+
+        log_info("Everything completed.");
+        *ret = TAKE_PTR(updated);
+        return 0;
+}
+
+static int home_resize(UserRecord *h, bool automatic, UserRecord **ret) {
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        HomeSetupFlags flags = 0;
+        UserStorage storage;
+        int r;
+
+        /* Resizes the home of h to h->disk_size, either on user request (automatic == false) or on our
+         * own initiative (automatic == true). The return value and *ret are whatever the storage
+         * specific resize function produces; errors before that point are returned as negative values. */
+
+        assert(h);
+        assert(ret);
+
+        if (h->disk_size == UINT64_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No target size specified, refusing.");
+
+        if (!automatic) {
+                /* In manual mode let's ensure the user is fully authenticated */
+                r = user_record_authenticate(h, h, &cache, /* strict_verify= */ true);
+                if (r < 0)
+                        return r;
+                assert(r > 0); /* Insist that a password was verified */
+        } else
+                /* In automatic mode we cannot ask the user for the password, hence load it from the kernel keyring */
+                password_cache_load_keyring(h, &cache);
+
+        r = home_validate_update(h, &setup, &flags);
+        if (r < 0)
+                return r;
+
+        /* Without the option to ask the user for reauthentication the identities cannot be validated,
+         * hence in automatic mode do not sync them at all. */
+        if (automatic)
+                flags |= HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES;
+
+        /* Dispatch on the storage type; CIFS and anything unknown cannot be resized. */
+        storage = user_record_storage(h);
+        if (storage == USER_LUKS)
+                return home_resize_luks(h, flags, &setup, &cache, ret);
+
+        if (IN_SET(storage, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT))
+                return home_resize_directory(h, flags, &setup, &cache, ret);
+
+        return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Resizing home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+}
+
+static int home_passwd(UserRecord *h, UserRecord **ret_home) {
+        _cleanup_(user_record_unrefp) UserRecord *from_header = NULL, *from_embedded = NULL, *updated = NULL;
+        _cleanup_strv_free_erase_ char **plaintext = NULL;
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        HomeSetupFlags flags = 0;
+        int r;
+
+        /* Changes the passwords of the home of h to the ones of the record. Returns 1 and the record
+         * that was stored in *ret_home on success, negative on failure. */
+        assert(h);
+        assert(ret_home);
+
+        if (!IN_SET(user_record_storage(h), USER_LUKS, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Changing password of home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+
+        /* Every secret of the record must be matched, see user_record_compile_effective_passwords(). */
+        r = user_record_compile_effective_passwords(h, &cache, &plaintext);
+        if (r < 0)
+                return r;
+
+        r = home_validate_update(h, &setup, &flags);
+        if (r < 0)
+                return r;
+
+        r = home_setup(h, flags, &setup, &cache, &from_header);
+        if (r < 0)
+                return r;
+
+        r = home_load_embedded_identity(h, setup.root_fd, from_header, USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL, &cache, &from_embedded, &updated);
+        if (r < 0)
+                return r;
+
+        r = home_maybe_shift_uid(h, flags, &setup);
+        if (r < 0)
+                return r;
+
+        /* Only LUKS and fscrypt have a storage-specific step; one error check serves both. */
+        switch (user_record_storage(h)) {
+        case USER_LUKS:
+                r = home_passwd_luks(h, flags, &setup, &cache, plaintext);
+                break;
+        case USER_FSCRYPT:
+                r = home_passwd_fscrypt(h, &setup, &cache, plaintext);
+                break;
+        default:
+                break; /* r is still >= 0 from the call above */
+        }
+        if (r < 0)
+                return r;
+
+        /* Store the result in both places, passing along what was found there before. */
+        r = home_store_header_identity_luks(updated, &setup, from_header);
+        if (r < 0)
+                return r;
+
+        r = home_store_embedded_identity(updated, setup.root_fd, h->uid, from_embedded);
+        if (r < 0)
+                return r;
+
+        r = home_extend_embedded_identity(updated, h, &setup);
+        if (r < 0)
+                return r;
+
+        r = home_sync_and_statfs(setup.root_fd, NULL);
+        if (r < 0)
+                return r;
+
+        /* Called explicitly (not only via _cleanup_) so that its error can be returned. */
+        r = home_setup_done(&setup);
+        if (r < 0)
+                return r;
+
+        log_info("Everything completed.");
+        *ret_home = TAKE_PTR(updated);
+        return 1;
+}
+
+static int home_inspect(UserRecord *h, UserRecord **ret_home) {
+        _cleanup_(user_record_unrefp) UserRecord *from_header = NULL, *result = NULL;
+        _cleanup_(home_setup_done) HomeSetup setup = HOME_SETUP_INIT;
+        _cleanup_(password_cache_free) PasswordCache cache = {};
+        HomeSetupFlags flags = 0;
+        int r;
+
+        /* Loads the user record kept in the home of h. Returns 1 and the record in *ret_home. */
+        assert(h);
+        assert(ret_home);
+
+        r = user_record_authenticate(h, h, &cache, /* strict_verify= */ false);
+        if (r < 0)
+                return r;
+        /* Validate the record and learn whether the home is active already (noted in flags). */
+        r = home_validate_update(h, &setup, &flags);
+        if (r < 0)
+                return r;
+        /* Set up access to the home; from_header receives the record home_setup() hands back. */
+        r = home_setup(h, flags, &setup, &cache, &from_header);
+        if (r < 0)
+                return r;
+        /* Reconcile with USER_RECONCILE_ANY; the embedded record itself is not requested (NULL). */
+        r = home_load_embedded_identity(h, setup.root_fd, from_header, USER_RECONCILE_ANY, &cache, NULL, &result);
+        if (r < 0)
+                return r;
+
+        r = home_extend_embedded_identity(result, h, &setup);
+        if (r < 0)
+                return r;
+        /* Called explicitly (not only via _cleanup_) so that its error can be returned. */
+        r = home_setup_done(&setup);
+        if (r < 0)
+                return r;
+
+        log_info("Everything completed.");
+        *ret_home = TAKE_PTR(result);
+        return 1;
+}
+
+static int home_lock(UserRecord *h) {
+        _cleanup_(home_setup_done) HomeSetup luks_setup = HOME_SETUP_INIT;
+        int state, r;
+
+        /* Locks the home of h. Returns 1 on success, negative on failure. Only LUKS storage is handled. */
+        assert(h);
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record incomplete, refusing.");
+        if (user_record_storage(h) != USER_LUKS)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Locking home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+        /* A home that is not mounted cannot be locked; errors of the test itself are passed on as-is. */
+        state = user_record_test_home_directory_and_warn(h);
+        if (state < 0)
+                return state;
+        if (state != USER_TEST_MOUNTED)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOEXEC), "Home directory of %s is not mounted, can't lock.", h->user_name);
+
+        r = home_lock_luks(h, &luks_setup);
+        if (r >= 0) {
+                log_info("Everything completed.");
+                r = 1;
+        }
+        return r;
+}
+
+static int home_unlock(UserRecord *h) {
+        _cleanup_(home_setup_done) HomeSetup luks_setup = HOME_SETUP_INIT;
+        _cleanup_(password_cache_free) PasswordCache secrets = {};
+        int r;
+
+        /* Unlocks the home of h. Returns 1 on success, negative on failure. Only LUKS storage is handled. */
+        assert(h);
+
+        if (!h->user_name)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "User record incomplete, refusing.");
+        if (user_record_storage(h) != USER_LUKS)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "Unlocking home directories of type '%s' currently not supported.", user_storage_to_string(user_record_storage(h)));
+
+        /* Deliberately no check whether $HOME is actually mounted: we want to avoid disk accesses on
+         * that mount until the device has been resumed. */
+
+        r = user_record_authenticate(h, h, &secrets, /* strict_verify= */ false);
+        if (r >= 0)
+                /* the cache that went through authentication is handed on to the LUKS code */
+                r = home_unlock_luks(h, &luks_setup, &secrets);
+        if (r < 0)
+                return r;
+
+        log_info("Everything completed.");
+        return 1;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(user_record_unrefp) UserRecord *home = NULL, *new_home = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_fclose_ FILE *opened_file = NULL;
+        unsigned line = 0, column = 0;
+        const char *json_path = NULL;
+        FILE *json_file;
+        usec_t start;
+        int r;
+
+        start = now(CLOCK_MONOTONIC);
+
+        log_setup();
+
+        cryptsetup_enable_logging(NULL);
+
+        umask(0022);
+
+        if (argc < 2 || argc > 3)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes one or two arguments.");
+
+        if (argc > 2) {
+                json_path = argv[2];
+
+                opened_file = fopen(json_path, "re");
+                if (!opened_file)
+                        return log_error_errno(errno, "Failed to open %s: %m", json_path);
+
+                json_file = opened_file;
+        } else {
+                json_path = "";
+                json_file = stdin;
+        }
+
+        r = json_parse_file(json_file, json_path, JSON_PARSE_SENSITIVE, &v, &line, &column);
+        if (r < 0)
+                return log_error_errno(r, "[%s:%u:%u] Failed to parse JSON data: %m", json_path, line, column);
+
+        home = user_record_new();
+        if (!home)
+                return log_oom();
+
+        r = user_record_load(home, v, USER_RECORD_LOAD_FULL|USER_RECORD_LOG|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        /* Well known return values of these operations, that systemd-homed knows and converts to proper D-Bus errors:
+         *
+         * EMSGSIZE        → file systems of this type cannot be shrunk
+         * ETXTBSY         → file systems of this type can only be shrunk offline
+         * ERANGE          → file system size too small
+         * ENOLINK         → system does not support selected storage backend
+         * EPROTONOSUPPORT → system does not support selected file system
+         * ENOTTY          → operation not supported on this storage
+         * ESOCKTNOSUPPORT → operation not supported on this file system
+         * ENOKEY          → password incorrect (or not sufficient, or not supplied)
+         * EREMOTEIO       → recovery key incorrect (or not sufficient, or not supplied — only if no passwords defined)
+         * EBADSLT         → similar, but PKCS#11 device is defined and might be able to provide password, if it was plugged in which it is not
+         * ENOANO          → suitable PKCS#11/FIDO2 device found, but PIN is missing to unlock it
+         * ERFKILL         → suitable PKCS#11 device found, but OK to ask for on-device interactive authentication not given
+         * EMEDIUMTYPE     → suitable FIDO2 device found, but OK to ask for user presence not given
+         * ENOCSI          → suitable FIDO2 device found, but OK to ask for user verification not given
+         * ENOSTR          → suitable FIDO2 device found, but user didn't react to action request on token quickly enough
+         * EOWNERDEAD      → suitable PKCS#11/FIDO2 device found, but its PIN is locked
+         * ENOLCK          → suitable PKCS#11/FIDO2 device found, but PIN incorrect
+         * ETOOMANYREFS    → suitable PKCS#11 device found, but PIN incorrect, and only few tries left
+         * EUCLEAN         → suitable PKCS#11 device found, but PIN incorrect, and only one try left
+         * EBUSY           → file system is currently active
+         * ENOEXEC         → file system is currently not active
+         * ENOSPC          → not enough disk space for operation
+         * EKEYREVOKED     → user record has no suitable hashed password or PKCS#11 entry, we cannot authenticate
+         * EADDRINUSE      → home image is already used elsewhere (lock taken)
+         */
+
+        if (streq(argv[1], "activate"))
+                r = home_activate(home, &new_home);
+        else if (streq(argv[1], "deactivate"))
+                r = home_deactivate(home, false);
+        else if (streq(argv[1], "deactivate-force"))
+                r = home_deactivate(home, true);
+        else if (streq(argv[1], "create"))
+                r = home_create(home, &new_home);
+        else if (streq(argv[1], "remove"))
+                r = home_remove(home);
+        else if (streq(argv[1], "update"))
+                r = home_update(home, &new_home);
+        else if (streq(argv[1], "resize")) /* Resize on user request */
+                r = home_resize(home, false, &new_home);
+        else if (streq(argv[1], "resize-auto")) /* Automatic resize */
+                r = home_resize(home, true, &new_home);
+        else if (streq(argv[1], "passwd"))
+                r = home_passwd(home, &new_home);
+        else if (streq(argv[1], "inspect"))
+                r = home_inspect(home, &new_home);
+        else if (streq(argv[1], "lock"))
+                r = home_lock(home);
+        else if (streq(argv[1], "unlock"))
+                r = home_unlock(home);
+        else
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown verb '%s'.", argv[1]);
+        if (IN_SET(r, -ENOKEY, -EREMOTEIO) && !strv_isempty(home->password) ) { /* There were passwords specified but they were incorrect */
+                usec_t end, n, d;
+
+                /* Make sure bad password replies always take at least 3s, and if longer multiples of 3s, so
+                 * that it's not clear how long we actually needed for our calculations. */
+                n = now(CLOCK_MONOTONIC);
+                assert(n >= start);
+
+                d = usec_sub_unsigned(n, start);
+                if (d > BAD_PASSWORD_DELAY_USEC)
+                        end = start + DIV_ROUND_UP(d, BAD_PASSWORD_DELAY_USEC) * BAD_PASSWORD_DELAY_USEC;
+                else
+                        end = start + BAD_PASSWORD_DELAY_USEC;
+
+                if (n < end)
+                        (void) usleep_safe(usec_sub_unsigned(end, n));
+        }
+        if (r < 0)
+                return r;
+
+        /* We always pass the new record back, regardless if it changed or not. This allows our caller to
+         * prepare a fresh record, send to us, and only if it works use it without having to keep a local
+         * copy. */
+        if (new_home)
+                json_variant_dump(new_home->json, JSON_FORMAT_NEWLINE, stdout, NULL);
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/home/homework.h b/src/home/homework.h
new file mode 100644
index 0000000..cef3f4e
--- /dev/null
+++ b/src/home/homework.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "sd-id128.h"
+
+#include "cryptsetup-util.h"
+#include "homework-password-cache.h"
+#include "loop-util.h"
+#include "missing_fs.h" /* for FS_KEY_DESCRIPTOR_SIZE, do not include linux/fs.h */
+#include "missing_keyctl.h"
+#include "missing_syscall.h"
+#include "user-record.h"
+#include "user-record-util.h"
+
+typedef struct HomeSetup { /* Working state passed to the home_setup*() family declared below; start from HOME_SETUP_INIT */
+        char *dm_name;   /* "home-" + presumably the user name (NOTE(review): suffix placeholder looks lost here — confirm) */
+        char *dm_node;   /* "/dev/mapper/home-" + presumably the user name (NOTE(review): confirm, as above) */
+
+        LoopDevice *loop;
+        struct crypt_device *crypt_device;
+        int root_fd;                  /* -EBADF in HOME_SETUP_INIT */
+        int image_fd;                 /* -EBADF in HOME_SETUP_INIT */
+        sd_id128_t found_partition_uuid;
+        sd_id128_t found_luks_uuid;
+        sd_id128_t found_fs_uuid;
+
+        uint8_t fscrypt_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+
+        void *volume_key;
+        size_t volume_key_size;       /* presumably the size of volume_key in bytes — TODO confirm */
+
+        key_serial_t key_serial;      /* -1 in HOME_SETUP_INIT */
+
+        bool undo_dm:1;
+        bool undo_mount:1;            /* Whether to unmount /run/systemd/user-home-mount */
+        bool do_offline_fitrim:1;
+        bool do_offline_fallocate:1;
+        bool do_mark_clean:1;
+        bool do_drop_caches:1;
+
+        uint64_t partition_offset;    /* UINT64_MAX in HOME_SETUP_INIT */
+        uint64_t partition_size;      /* UINT64_MAX in HOME_SETUP_INIT */
+
+        char *mount_suffix;           /* The directory to use as home dir is this path below /run/systemd/user-home-mount */
+
+        char *temporary_image_path;
+} HomeSetup;
+
+#define HOME_SETUP_INIT /* HomeSetup initializer; listed in struct field order, everything else is zeroed */ \
+        {                                               \
+                .root_fd = -EBADF,                      \
+                .image_fd = -EBADF,                     \
+                .key_serial = -1,                       \
+                .partition_offset = UINT64_MAX,         \
+                .partition_size = UINT64_MAX,           \
+        }
+
+/* Flags tweaking the operation of setting up a home directory. Each value is a distinct bit, so they can be OR-ed. */
+typedef enum HomeSetupFlags {
+        HOME_SETUP_ALREADY_ACTIVATED           = 1 << 0, /* Open an already activated home, rather than activate it afresh */
+
+        /* CIFS backend: */
+        HOME_SETUP_CIFS_MKDIR                  = 1 << 1, /* Create CIFS subdir when missing */
+
+        /* The remaining flags apply only to resize operations: */
+        HOME_SETUP_RESIZE_DONT_SYNC_IDENTITIES = 1 << 2, /* Don't sync identity records into home and LUKS header */
+        HOME_SETUP_RESIZE_MINIMIZE             = 1 << 3, /* Shrink to minimal size */
+        HOME_SETUP_RESIZE_DONT_GROW            = 1 << 4, /* If the resize would grow, gracefully terminate operation */
+        HOME_SETUP_RESIZE_DONT_SHRINK          = 1 << 5, /* If the resize would shrink, gracefully terminate operation */
+        HOME_SETUP_RESIZE_DONT_UNDO            = 1 << 6, /* Leave loopback/DM device context open after successful operation */
+} HomeSetupFlags;
+
+int home_setup_done(HomeSetup *setup); /* NOTE(review): presumably releases what home_setup() acquired — confirm in homework.c */
+
+int home_setup_undo_mount(HomeSetup *setup, int level); /* NOTE(review): "level" is presumably a log level — confirm */
+int home_setup_undo_dm(HomeSetup *setup, int level);    /* NOTE(review): same question about "level" */
+
+int keyring_unlink(key_serial_t k);
+
+int home_setup(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, PasswordCache *cache, UserRecord **ret_header_home);
+
+int home_refresh(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup, UserRecord *header_home, PasswordCache *cache, struct statfs *ret_statfs, UserRecord **ret_new_home);
+
+int home_maybe_shift_uid(UserRecord *h, HomeSetupFlags flags, HomeSetup *setup);
+int home_populate(UserRecord *h, int dir_fd);
+
+int home_load_embedded_identity(UserRecord *h, int root_fd, UserRecord *header_home, UserReconcileMode mode, PasswordCache *cache, UserRecord **ret_embedded_home, UserRecord **ret_new_home);
+int home_store_embedded_identity(UserRecord *h, int root_fd, uid_t uid, UserRecord *old_home);
+int home_extend_embedded_identity(UserRecord *h, UserRecord *used, HomeSetup *setup);
+
+int user_record_authenticate(UserRecord *h, UserRecord *secret, PasswordCache *cache, bool strict_verify);
+
+int home_sync_and_statfs(int root_fd, struct statfs *ret);
+
+#define HOME_RUNTIME_WORK_DIR "/run/systemd/user-home-mount" /* the path the HomeSetup.undo_mount and .mount_suffix comments refer to */
diff --git a/src/home/meson.build b/src/home/meson.build
new file mode 100644
index 0000000..09831de
--- /dev/null
+++ b/src/home/meson.build
@@ -0,0 +1,140 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+systemd_homework_sources = files( # 'sources' of the systemd-homework entry in executables
+        'home-util.c',
+        'homework-cifs.c',
+        'homework-directory.c',
+        'homework-fscrypt.c',
+        'homework-luks.c',
+        'homework-mount.c',
+        'homework-password-cache.c',
+        'homework-quota.c',
+        'homework.c',
+        'user-record-util.c',
+)
+
+if conf.get('HAVE_P11KIT') == 1 # PKCS#11 source is only added when HAVE_P11KIT is set
+        systemd_homework_sources += files('homework-pkcs11.c')
+endif
+if conf.get('HAVE_LIBFIDO2') == 1 # likewise, FIDO2 source only when HAVE_LIBFIDO2 is set
+        systemd_homework_sources += files('homework-fido2.c')
+endif
+
+systemd_homed_sources = files( # 'sources' of the systemd-homed entry in executables
+        'home-util.c',
+        'homed-bus.c',
+        'homed-conf.c',
+        'homed-home-bus.c',
+        'homed-home.c',
+        'homed-manager-bus.c',
+        'homed-manager.c',
+        'homed-operation.c',
+        'homed-varlink.c',
+        'homed.c',
+        'user-record-password-quality.c',
+        'user-record-sign.c',
+        'user-record-util.c',
+)
+
+homed_gperf_c = custom_target( # runs gperf on homed-gperf.gperf to generate homed-gperf.c
+        'homed_gperf.c',
+        input : 'homed-gperf.gperf',
+        output : 'homed-gperf.c',
+        command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@'])
+
+systemd_homed_sources += [homed_gperf_c] # the generated file is built as part of systemd-homed
+
+homectl_sources = files( # 'sources' of the homectl entry in executables
+        'home-util.c',
+        'homectl-fido2.c',
+        'homectl-pkcs11.c',
+        'homectl-recovery-key.c',
+        'homectl.c',
+        'user-record-password-quality.c',
+        'user-record-util.c',
+)
+
+pam_systemd_home_sources = files( # 'sources' of the pam_systemd_home entry in modules
+        'home-util.c',
+        'pam_systemd_home.c',
+        'user-record-util.c',
+)
+
+executables += [ # three entries, all conditioned on ENABLE_HOMED: systemd-homework, systemd-homed, homectl
+        libexec_template + {
+                'name' : 'systemd-homework',
+                'conditions' : ['ENABLE_HOMED'],
+                'sources' : systemd_homework_sources,
+                'link_with' : [
+                        libshared,
+                        libshared_fdisk
+                ],
+                'dependencies' : [
+                        libblkid,
+                        libcrypt,
+                        libfdisk,
+                        libopenssl,
+                        libp11kit_cflags,
+                        threads,
+                ],
+        },
+        libexec_template + {
+                'name' : 'systemd-homed',
+                'dbus' : true,
+                'conditions' : ['ENABLE_HOMED'],
+                'sources' : systemd_homed_sources, # includes the generated homed-gperf.c
+                'include_directories' : includes +
+                                        include_directories('.'),
+                'dependencies' : [
+                        libcrypt,
+                        libm,
+                        libopenssl,
+                        threads,
+                ],
+        },
+        executable_template + { # the only entry here marked 'public'
+                'name' : 'homectl',
+                'public' : true,
+                'conditions' : ['ENABLE_HOMED'],
+                'sources' : homectl_sources,
+                'dependencies' : [
+                        libcrypt,
+                        libdl,
+                        libopenssl,
+                        libp11kit_cflags,
+                        threads,
+                ],
+        },
+]
+
+modules += [ # the PAM module; conditioned on both ENABLE_HOMED and HAVE_PAM
+        pam_template + {
+                'name' : 'pam_systemd_home',
+                'conditions' : [
+                        'ENABLE_HOMED',
+                        'HAVE_PAM',
+                ],
+                'sources' : pam_systemd_home_sources,
+                'dependencies' : [
+                        libcrypt,
+                        libpam_misc,
+                        libpam,
+                        threads,
+                ],
+                'version-script' : meson.current_source_dir() / 'pam_systemd_home.sym', # symbol file next to this meson.build
+        },
+]
+
+if conf.get('ENABLE_HOMED') == 1 # D-Bus policy, D-Bus service file and polkit policy are installed only with homed enabled
+        install_data('org.freedesktop.home1.conf',
+                     install_dir : dbuspolicydir)
+        install_data('org.freedesktop.home1.service',
+                     install_dir : dbussystemservicedir)
+        install_data('org.freedesktop.home1.policy',
+                     install_dir : polkitpolicydir)
+
+        if install_sysconfdir_samples # homed.conf is additionally gated on install_sysconfdir_samples
+                install_data('homed.conf',
+                             install_dir : pkgconfigfiledir)
+        endif
+endif
diff --git a/src/home/org.freedesktop.home1.conf b/src/home/org.freedesktop.home1.conf
new file mode 100644
index 0000000..5af1a68
--- /dev/null
+++ b/src/home/org.freedesktop.home1.conf
@@ -0,0 +1,201 @@
+ 
+
+
+
+
+
+
+        
+                
+                
+                
+        
+
+        
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+        
+
+
diff --git a/src/home/org.freedesktop.home1.policy b/src/home/org.freedesktop.home1.policy
new file mode 100644
index 0000000..a337b32
--- /dev/null
+++ b/src/home/org.freedesktop.home1.policy
@@ -0,0 +1,72 @@
+ 
+
+
+
+
+
+
+        The systemd Project
+        https://systemd.io
+
+        
+                Create a home area
+                Authentication is required to create a user's home area.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Remove a home area
+                Authentication is required to remove a user's home area.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Check credentials of a home area
+                Authentication is required to check credentials against a user's home area.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Update a home area
+                Authentication is required to update a user's home area.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Resize a home area
+                Authentication is required to resize a user's home area.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Change password of a home area
+                Authentication is required to change the password of a user's home area.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+
diff --git a/src/home/org.freedesktop.home1.service b/src/home/org.freedesktop.home1.service
new file mode 100644
index 0000000..fb03914
--- /dev/null
+++ b/src/home/org.freedesktop.home1.service
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+[D-BUS Service]
+Name=org.freedesktop.home1
+Exec=/bin/false
+User=root
+SystemdService=dbus-org.freedesktop.home1.service
diff --git a/src/home/pam_systemd_home.c b/src/home/pam_systemd_home.c
new file mode 100644
index 0000000..ba8d8f6
--- /dev/null
+++ b/src/home/pam_systemd_home.c
@@ -0,0 +1,1064 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "sd-bus.h"
+
+#include "bus-common-errors.h"
+#include "bus-locator.h"
+#include "bus-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "home-util.h"
+#include "locale-util.h"
+#include "memory-util.h"
+#include "pam-util.h"
+#include "parse-util.h"
+#include "strv.h"
+#include "user-record-util.h"
+#include "user-record.h"
+#include "user-util.h"
+
+static int parse_argv(
+                pam_handle_t *handle,
+                int argc, const char **argv,
+                bool *please_suspend,
+                bool *debug) {
+
+        /* Parses "suspend=BOOL", "debug" and "debug=BOOL" into the (optional) outputs; always returns 0. */
+        assert(argc >= 0);
+        assert(argc == 0 || argv);
+
+        for (int i = 0; i < argc; i++) {
+                const char *arg = argv[i], *value;
+
+                if ((value = startswith(arg, "suspend="))) {
+                        int b = parse_boolean(value);
+                        if (b < 0)
+                                pam_syslog(handle, LOG_WARNING, "Failed to parse suspend= argument, ignoring: %s", value);
+                        else if (please_suspend)
+                                *please_suspend = b;
+                        continue;
+                }
+                if (streq(arg, "debug")) {
+                        if (debug)
+                                *debug = true;
+                        continue;
+                }
+                if (!(value = startswith(arg, "debug="))) { /* none of the known arguments */
+                        pam_syslog(handle, LOG_WARNING, "Unknown parameter '%s', ignoring", arg);
+                        continue;
+                }
+                int b = parse_boolean(value);
+                if (b < 0)
+                        pam_syslog(handle, LOG_WARNING, "Failed to parse debug= argument, ignoring: %s", value);
+                else if (debug)
+                        *debug = b;
+        }
+
+        return 0;
+}
+
+static int parse_env(
+                pam_handle_t *handle,
+                bool *please_suspend) {
+
+        /* Picks up the suspend setting from $SYSTEMD_HOME_SUSPEND, in addition to the PAM command line.
+         * That makes it easy to declare the features of a display manager in code rather than in
+         * configuration. The PAM environment is consulted first; the process environment block is the
+         * fallback, so that people can control this from the outside of our process. A value that is
+         * not a valid boolean is logged and ignored. Always returns 0. */
+
+        const char *value = pam_getenv(handle, "SYSTEMD_HOME_SUSPEND");
+        if (!value)
+                value = secure_getenv("SYSTEMD_HOME_SUSPEND");
+        /* Set in neither place? Then leave whatever the caller has configured so far untouched. */
+        if (!value)
+                return 0;
+
+        int b = parse_boolean(value);
+        if (b < 0) {
+                pam_syslog(handle, LOG_WARNING, "Failed to parse $SYSTEMD_HOME_SUSPEND argument, ignoring: %s", value);
+                return 0;
+        }
+
+        if (please_suspend)
+                *please_suspend = b;
+
+        return 0;
+}
+
+/* Looks up the user record of 'username' (or, if NULL, of the user of the PAM handle) via systemd-homed and
+ * caches its JSON in the PAM context. Returns PAM_SUCCESS (storing the record in *ret_record, if non-NULL),
+ * PAM_USER_UNKNOWN if this is no user managed by homed, or the error code of the step that failed. */
+static int acquire_user_record(
+                pam_handle_t *handle,
+                const char *username,
+                bool debug,
+                UserRecord **ret_record,
+                PamBusData **bus_data) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *ur = NULL;
+        _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL;
+        _cleanup_free_ char *homed_field = NULL;
+        const char *json = NULL;
+        int r;
+
+        assert(handle);
+
+        if (!username) {
+                r = pam_get_user(handle, &username, NULL);
+                if (r != PAM_SUCCESS)
+                        return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get user name: @PAMERR@");
+
+                if (isempty(username))
+                        return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR, "User name not set.");
+        }
+
+        /* Let's bypass all IPC complexity for the two user names we know for sure we don't manage, and for
+         * user names we don't consider valid. */
+        if (STR_IN_SET(username, "root", NOBODY_USER_NAME) || !valid_user_group_name(username, 0))
+                return PAM_USER_UNKNOWN;
+
+        /* We cache the user record in the PAM context. We use a field name that includes the username, since
+         * clients might change the user name associated with a PAM context underneath us. Notably, 'sudo'
+         * creates a single PAM context and first authenticates it with the user set to the originating user,
+         * then updates the user for the destination user and issues the session stack with the same PAM
+         * context. We thus must be prepared that the user record changes between calls and we keep any
+         * caching separate. */
+        homed_field = strjoin("systemd-home-user-record-", username);
+        if (!homed_field)
+                return pam_log_oom(handle);
+
+        /* Let's use the cache, so that we can share it between the session and the authentication hooks */
+        r = pam_get_data(handle, homed_field, (const void**) &json);
+        if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA))
+                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM user record data: @PAMERR@");
+        if (r == PAM_SUCCESS && json) {
+                /* We determined earlier that this is not a homed user? Then exit early. (We use -1 as
+                 * negative cache indicator) */
+                if (json == POINTER_MAX)
+                        return PAM_USER_UNKNOWN;
+        } else {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_free_ char *generic_field = NULL, *json_copy = NULL;
+
+                r = pam_acquire_bus_connection(handle, "pam-systemd-home", &bus, bus_data);
+                if (r != PAM_SUCCESS)
+                        return r;
+
+                r = bus_call_method(bus, bus_home_mgr, "GetUserRecordByName", &error, &reply, "s", username);
+                if (r < 0) {
+                        if (bus_error_is_unknown_service(&error)) {
+                                pam_debug_syslog(handle, debug, "systemd-homed is not available: %s",
+                                                 bus_error_message(&error, r));
+                                goto user_unknown;
+                        }
+
+                        if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_HOME)) {
+                                pam_debug_syslog(handle, debug, "Not a user managed by systemd-homed: %s",
+                                                 bus_error_message(&error, r));
+                                goto user_unknown;
+                        }
+
+                        pam_syslog(handle, LOG_ERR, "Failed to query user record: %s", bus_error_message(&error, r));
+                        return PAM_SERVICE_ERR;
+                }
+
+                r = sd_bus_message_read(reply, "sbo", &json, NULL, NULL);
+                if (r < 0)
+                        return pam_bus_log_parse_error(handle, r);
+
+                /* First copy: for the homed-specific data field, i.e. where we know the user record is from
+                 * homed. Once pam_set_data() succeeded the PAM context owns it (cleanup: pam_cleanup_free). */
+                json_copy = strdup(json);
+                if (!json_copy)
+                        return pam_log_oom(handle);
+
+                r = pam_set_data(handle, homed_field, json_copy, pam_cleanup_free);
+                if (r != PAM_SUCCESS)
+                        return pam_syslog_pam_error(handle, LOG_ERR, r,
+                                                    "Failed to set PAM user record data '%s': @PAMERR@", homed_field);
+
+                /* Take a second copy: for the generic data field, the one which we share with
+                 * pam_systemd. While we insist on only reusing homed records, pam_systemd is fine with homed
+                 * and non-homed user records. (Overwriting json_copy is fine: PAM owns the first copy now.) */
+                json_copy = strdup(json);
+                if (!json_copy)
+                        return pam_log_oom(handle);
+
+                generic_field = strjoin("systemd-user-record-", username);
+                if (!generic_field)
+                        return pam_log_oom(handle);
+
+                r = pam_set_data(handle, generic_field, json_copy, pam_cleanup_free);
+                if (r != PAM_SUCCESS)
+                        return pam_syslog_pam_error(handle, LOG_ERR, r,
+                                                    "Failed to set PAM user record data '%s': @PAMERR@", generic_field);
+
+                TAKE_PTR(json_copy);
+        }
+
+        r = json_parse(json, JSON_PARSE_SENSITIVE, &v, NULL, NULL);
+        if (r < 0)
+                return pam_syslog_errno(handle, LOG_ERR, r, "Failed to parse JSON user record: %m");
+
+        ur = user_record_new();
+        if (!ur)
+                return pam_log_oom(handle);
+
+        r = user_record_load(ur, v, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return pam_syslog_errno(handle, LOG_ERR, r, "Failed to load user record: %m");
+
+        /* Safety check if cached record actually matches what we are looking for */
+        if (!streq_ptr(username, ur->user_name))
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR,
+                                            "Acquired user record does not match user name.");
+
+        if (ret_record)
+                *ret_record = TAKE_PTR(ur);
+
+        return PAM_SUCCESS;
+
+user_unknown:
+        /* Cache this, so that we don't check again */
+        r = pam_set_data(handle, homed_field, POINTER_MAX, NULL);
+        if (r != PAM_SUCCESS)
+                pam_syslog_pam_error(handle, LOG_ERR, r,
+                                     "Failed to set PAM user record data '%s' to invalid, ignoring: @PAMERR@",
+                                     homed_field);
+
+        return PAM_USER_UNKNOWN;
+}
+
+static int release_user_record(pam_handle_t *handle, const char *username) {
+        _cleanup_free_ char *homed_key = NULL, *generic_key = NULL;
+        int homed_rc, generic_rc;
+
+        /* Resets both per-user PAM data fields to NULL: the homed-specific one first, then the generic
+         * one. A real failure on the first field takes precedence over the result for the second. */
+        assert(handle);
+        assert(username);
+
+        homed_key = strjoin("systemd-home-user-record-", username);
+        if (!homed_key)
+                return pam_log_oom(handle);
+        homed_rc = pam_set_data(handle, homed_key, NULL, NULL);
+        if (homed_rc != PAM_SUCCESS)
+                pam_syslog_pam_error(handle, LOG_ERR, homed_rc, "Failed to release PAM user record data '%s': @PAMERR@", homed_key);
+
+        generic_key = strjoin("systemd-user-record-", username);
+        if (!generic_key)
+                return pam_log_oom(handle);
+        generic_rc = pam_set_data(handle, generic_key, NULL, NULL);
+        if (generic_rc != PAM_SUCCESS)
+                pam_syslog_pam_error(handle, LOG_ERR, generic_rc, "Failed to release PAM user record data '%s': @PAMERR@", generic_key);
+
+        if (!IN_SET(homed_rc, PAM_SUCCESS, PAM_NO_MODULE_DATA))
+                return homed_rc;
+        return generic_rc;
+}
+
+static void cleanup_home_fd(pam_handle_t *handle, void *data, int error_status) {
+        (void) safe_close(PTR_TO_FD(data)); /* 'data' encodes an fd; signature matches a pam_set_data() cleanup callback */
+}
+
+static int handle_generic_user_record_error(
+                pam_handle_t *handle,
+                const char *user_name,
+                UserRecord *secret,
+                int ret,
+                const sd_bus_error *error,
+                bool debug) {
+
+        assert(user_name);
+        assert(error);
+
+        int r;
+
+        /* Logs about all errors, except for PAM_CONV_ERR, i.e. when requesting more info failed. */
+
+        if (sd_bus_error_has_name(error, BUS_ERROR_HOME_ABSENT)) {
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL,
+                                  _("Home of user %s is currently absent, please plug in the necessary storage device or backing file system."), user_name);
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_PERM_DENIED,
+                                            "Failed to acquire home for user %s: %s", user_name, bus_error_message(error, ret));
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_AUTHENTICATION_LIMIT_HIT)) {
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Too frequent login attempts for user %s, try again later."), user_name);
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_MAXTRIES,
+                                            "Failed to acquire home for user %s: %s", user_name, bus_error_message(error, ret));
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_BAD_PASSWORD)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                /* This didn't work? Ask for an (additional?) password */
+
+                if (strv_isempty(secret->password))
+                        r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Password: "));
+                else {
+                        (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password incorrect or not sufficient for authentication of user %s."), user_name);
+                        r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Sorry, try again: "));
+                }
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "Password request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = user_record_set_password(secret, STRV_MAKE(newp), true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store password: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_BAD_RECOVERY_KEY)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                /* Hmm, homed asks for recovery key (because no regular password is defined maybe)? Provide it. */
+
+                if (strv_isempty(secret->password))
+                        r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Recovery key: "));
+                else {
+                        (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password/recovery key incorrect or not sufficient for authentication of user %s."), user_name);
+                        r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Sorry, reenter recovery key: "));
+                }
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "Recovery key request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = user_record_set_password(secret, STRV_MAKE(newp), true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store recovery key: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                if (strv_isempty(secret->password)) {
+                        (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Security token of user %s not inserted."), user_name);
+                        r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Try again with password: "));
+                } else {
+                        (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password incorrect or not sufficient, and configured security token of user %s not inserted."), user_name);
+                        r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Try again with password: "));
+                }
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "Password request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+
+                r = user_record_set_password(secret, STRV_MAKE(newp), true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store password: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_PIN_NEEDED)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Security token PIN: "));
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "PIN request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = user_record_set_token_pin(secret, STRV_MAKE(newp), false);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store PIN: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_PROTECTED_AUTHENTICATION_PATH_NEEDED)) {
+
+                assert(secret);
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Please authenticate physically on security token of user %s."), user_name);
+
+                r = user_record_set_pkcs11_protected_authentication_path_permitted(secret, true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r,
+                                                "Failed to set PKCS#11 protected authentication path permitted flag: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_USER_PRESENCE_NEEDED)) {
+
+                assert(secret);
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Please confirm presence on security token of user %s."), user_name);
+
+                r = user_record_set_fido2_user_presence_permitted(secret, true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r,
+                                                "Failed to set FIDO2 user presence permitted flag: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_USER_VERIFICATION_NEEDED)) {
+
+                assert(secret);
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Please verify user on security token of user %s."), user_name);
+
+                r = user_record_set_fido2_user_verification_permitted(secret, true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r,
+                                                "Failed to set FIDO2 user verification permitted flag: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_PIN_LOCKED)) {
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Security token PIN is locked, please unlock it first. (Hint: Removal and re-insertion might suffice.)"));
+                return PAM_SERVICE_ERR;
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_BAD_PIN)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Security token PIN incorrect for user %s."), user_name);
+                r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Sorry, retry security token PIN: "));
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "PIN request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = user_record_set_token_pin(secret, STRV_MAKE(newp), false);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store PIN: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_BAD_PIN_FEW_TRIES_LEFT)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Security token PIN of user %s incorrect (only a few tries left!)"), user_name);
+                r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Sorry, retry security token PIN: "));
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "PIN request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = user_record_set_token_pin(secret, STRV_MAKE(newp), false);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store PIN: %m");
+
+        } else if (sd_bus_error_has_name(error, BUS_ERROR_TOKEN_BAD_PIN_ONE_TRY_LEFT)) {
+                _cleanup_(erase_and_freep) char *newp = NULL;
+
+                assert(secret);
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Security token PIN of user %s incorrect (only one try left!)"), user_name);
+                r = pam_prompt(handle, PAM_PROMPT_ECHO_OFF, &newp, _("Sorry, retry security token PIN: "));
+                if (r != PAM_SUCCESS)
+                        return PAM_CONV_ERR; /* no logging here */
+
+                if (isempty(newp)) {
+                        pam_debug_syslog(handle, debug, "PIN request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = user_record_set_token_pin(secret, STRV_MAKE(newp), false);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store PIN: %m");
+
+        } else
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR,
+                                            "Failed to acquire home for user %s: %s", user_name, bus_error_message(error, ret));
+
+        return PAM_SUCCESS;
+}
+
+static int acquire_home(
+                pam_handle_t *handle,
+                bool please_authenticate,
+                bool please_suspend,
+                bool debug,
+                PamBusData **bus_data) {
+
+        _cleanup_(user_record_unrefp) UserRecord *ur = NULL, *secret = NULL;
+        bool do_auth = please_authenticate, home_not_active = false, home_locked = false;
+        _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL;
+        _cleanup_close_ int acquired_fd = -EBADF;
+        _cleanup_free_ char *fd_field = NULL;
+        const void *home_fd_ptr = NULL;
+        const char *username = NULL;
+        unsigned n_attempts = 0;
+        int r;
+
+        assert(handle);
+
+        /* This acquires a reference to a home directory in one of two ways: if please_authenticate is true,
+         * then we'll call AcquireHome() after asking the user for a password. Otherwise it tries to call
+         * RefHome() and if that fails queries the user for a password and uses AcquireHome().
+         *
+         * The idea is that the PAM authentication hook sets please_authenticate and thus always
+         * authenticates, while the other PAM hooks unset it so that they can get a ref of their own without
+         * authentication if possible, but with authentication if necessary.
+         *
+         * please_suspend is passed on to homed unchanged. bus_data, which may be NULL, is handed through to
+         * pam_acquire_bus_connection() and acquire_user_record(). Returns PAM_SUCCESS once the reference fd
+         * is stored as PAM data (or already was stored by an earlier call), a PAM error code otherwise. */
+
+        r = pam_get_user(handle, &username, NULL);
+        if (r != PAM_SUCCESS)
+                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get user name: @PAMERR@");
+
+        if (isempty(username))
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR, "User name not set.");
+
+        /* If we already have acquired the fd, let's shortcut this */
+        fd_field = strjoin("systemd-home-fd-", username);
+        if (!fd_field)
+                return pam_log_oom(handle);
+
+        r = pam_get_data(handle, fd_field, &home_fd_ptr);
+        if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA))
+                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to retrieve PAM home reference fd: @PAMERR@");
+        if (r == PAM_SUCCESS && PTR_TO_FD(home_fd_ptr) >= 0)
+                return PAM_SUCCESS;
+
+        r = pam_acquire_bus_connection(handle, "pam-systemd-home", &bus, bus_data);
+        if (r != PAM_SUCCESS)
+                return r;
+
+        r = acquire_user_record(handle, username, debug, &ur, bus_data);
+        if (r != PAM_SUCCESS)
+                return r;
+
+        /* Implement our own retry loop here instead of relying on the PAM client's one. That's because it
+         * might happen that the record we stored on the host does not match the encryption password of
+         * the LUKS image in case the image was used in a different system where the password was
+         * changed. In that case it will happen that the LUKS password and the host password are
+         * different, and we handle that by collecting and passing multiple passwords in that case. Hence we
+         * treat bad passwords as a request to collect one more password and pass the new and all previously
+         * used passwords again. */
+
+        for (;;) {
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                if (do_auth && !secret) {
+                        const char *cached_password = NULL;
+
+                        secret = user_record_new();
+                        if (!secret)
+                                return pam_log_oom(handle);
+
+                        /* If there's already a cached password, use it. But if not let's authenticate
+                         * without anything, maybe some other authentication mechanism systemd-homed
+                         * implements (such as PKCS#11) allows us to authenticate without anything else. */
+                        r = pam_get_item(handle, PAM_AUTHTOK, (const void**) &cached_password);
+                        if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS))
+                                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get cached password: @PAMERR@");
+
+                        if (!isempty(cached_password)) {
+                                r = user_record_set_password(secret, STRV_MAKE(cached_password), true);
+                                if (r < 0)
+                                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store password: %m");
+                        }
+                }
+
+                r = bus_message_new_method_call(bus, &m, bus_home_mgr, do_auth ? "AcquireHome" : "RefHome");
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                r = sd_bus_message_append(m, "s", ur->user_name);
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                if (do_auth) {
+                        r = bus_message_append_secret(m, secret);
+                        if (r < 0)
+                                return pam_bus_log_create_error(handle, r);
+                }
+
+                r = sd_bus_message_append(m, "b", please_suspend);
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, &reply);
+                if (r < 0) {
+                        if (sd_bus_error_has_name(&error, BUS_ERROR_HOME_NOT_ACTIVE))
+                                /* Only on RefHome(): We can't access the home directory currently, unless
+                                 * it's unlocked with a password. Hence, let's try this again, this time with
+                                 * authentication. */
+                                home_not_active = true;
+                        else if (sd_bus_error_has_name(&error, BUS_ERROR_HOME_LOCKED))
+                                home_locked = true; /* Similar */
+                        else {
+                                r = handle_generic_user_record_error(handle, ur->user_name, secret, r, &error, debug);
+                                if (r == PAM_CONV_ERR) {
+                                        /* Password/PIN prompts will fail in certain environments, for example when
+                                         * we are called from OpenSSH's account or session hooks, or in systemd's
+                                         * per-service PAM logic. In that case, print a friendly message and accept
+                                         * failure. */
+
+                                        if (home_not_active)
+                                                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Home of user %s is currently not active, please log in locally first."), ur->user_name);
+                                        if (home_locked)
+                                                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Home of user %s is currently locked, please unlock locally first."), ur->user_name);
+
+                                        if (please_authenticate || debug)
+                                                pam_syslog(handle, please_authenticate ? LOG_ERR : LOG_DEBUG, "Failed to prompt for password/PIN.");
+
+                                        return home_not_active || home_locked ? PAM_PERM_DENIED : PAM_CONV_ERR;
+                                }
+                                if (r != PAM_SUCCESS)
+                                        return r;
+                        }
+
+                } else {
+                        int fd;
+
+                        r = sd_bus_message_read(reply, "h", &fd);
+                        if (r < 0)
+                                return pam_bus_log_parse_error(handle, r);
+
+                        acquired_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+                        if (acquired_fd < 0)
+                                return pam_syslog_errno(handle, LOG_ERR, errno, "Failed to duplicate acquired fd: %m");
+                        break;
+                }
+
+                if (++n_attempts >= 5) {
+                        (void) pam_prompt(handle, PAM_ERROR_MSG, NULL,
+                                          _("Too many unsuccessful login attempts for user %s, refusing."), ur->user_name);
+                        return pam_syslog_pam_error(handle, LOG_ERR, PAM_MAXTRIES,
+                                                    "Failed to acquire home for user %s: %s", ur->user_name, bus_error_message(&error, r));
+                }
+
+                /* Try again, this time with authentication if we didn't do that before. */
+                do_auth = true;
+        }
+
+        /* Later PAM modules may need the auth token, but only during pam_authenticate. */
+        if (please_authenticate && !strv_isempty(secret->password)) {
+                r = pam_set_item(handle, PAM_AUTHTOK, *secret->password);
+                if (r != PAM_SUCCESS)
+                        return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set PAM auth token: @PAMERR@");
+        }
+
+        r = pam_set_data(handle, fd_field, FD_TO_PTR(acquired_fd), cleanup_home_fd);
+        if (r != PAM_SUCCESS)
+                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set PAM home reference fd: @PAMERR@");
+        TAKE_FD(acquired_fd); /* The fd now belongs to the PAM data entry, with cleanup_home_fd registered for it */
+
+        if (do_auth) {
+                /* We likely just activated the home directory, let's flush out the user record, since a
+                 * newer embedded user record might have been acquired from the activation. */
+
+                r = release_user_record(handle, ur->user_name);
+                if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA))
+                        return r;
+        }
+
+        pam_syslog(handle, LOG_NOTICE, "Home for user %s successfully acquired.", ur->user_name);
+        return PAM_SUCCESS;
+}
+
+static int release_home_fd(pam_handle_t *handle, const char *username) {
+        _cleanup_free_ char *key = NULL;
+        const void *stored = NULL;
+        int rc;
+
+        assert(handle);
+        assert(username);
+
+        /* The home reference fd lives in the PAM data under a per-user key, see acquire_home(). */
+        key = strjoin("systemd-home-fd-", username);
+        if (!key)
+                return pam_log_oom(handle);
+
+        rc = pam_get_data(handle, key, &stored);
+        if (!IN_SET(rc, PAM_SUCCESS, PAM_NO_MODULE_DATA))
+                return pam_syslog_pam_error(handle, LOG_ERR, rc, "Failed to retrieve PAM home reference fd: @PAMERR@");
+        if (rc == PAM_NO_MODULE_DATA || PTR_TO_FD(stored) < 0) /* No entry, or an entry without a valid fd */
+                return PAM_NO_MODULE_DATA;
+
+        rc = pam_set_data(handle, key, NULL, NULL); /* Overwrite the entry with NULL to drop our reference */
+        if (rc != PAM_SUCCESS)
+                return pam_syslog_pam_error(handle, LOG_ERR, rc, "Failed to release PAM home reference fd: @PAMERR@");
+        return PAM_SUCCESS;
+}
+
+_public_ PAM_EXTERN int pam_sm_authenticate(
+                pam_handle_t *handle,
+                int flags,
+                int argc, const char **argv) {
+
+        /* Authentication hook: unlike the other hooks this one always asks acquire_home() to authenticate. */
+        bool suspend_please = false, debug = false;
+        int rc;
+
+        /* Environment first, module arguments second; a failure of either is reported as PAM_AUTH_ERR. */
+        if (parse_env(handle, &suspend_please) < 0 ||
+            parse_argv(handle, argc, argv, &suspend_please, &debug) < 0)
+                return PAM_AUTH_ERR;
+
+        pam_debug_syslog(handle, debug, "pam-systemd-homed authenticating");
+
+        rc = acquire_home(handle, /* please_authenticate= */ true, suspend_please, debug, /* bus_data= */ NULL);
+
+        return rc;
+}
+
+_public_ PAM_EXTERN int pam_sm_setcred(pam_handle_t *pamh, int flags, int argc, const char **argv) {
+        return PAM_SUCCESS; /* Credential hook is a no-op here: all arguments are ignored, success is reported */
+}
+
+_public_ PAM_EXTERN int pam_sm_open_session(
+                pam_handle_t *handle,
+                int flags,
+                int argc, const char **argv) {
+
+        /* Session-open hook: takes a reference on the home directory (authenticating only if that is
+         * necessary) and announces it through $SYSTEMD_HOME and $SYSTEMD_HOME_SUSPEND.
+         *
+         * The session might live for quite a long time during which we are not going to process the bus
+         * connection, hence the connection is released again once this function exits, rather than
+         * waiting for the daemon to kick us off because we are not processing anything. */
+        _cleanup_(pam_bus_data_disconnectp) PamBusData *bus_data = NULL;
+        bool suspend_please = false, debug = false;
+        const char *suspend_env;
+        int rc;
+
+        if (parse_env(handle, &suspend_please) < 0 ||
+            parse_argv(handle, argc, argv, &suspend_please, &debug) < 0)
+                return PAM_SESSION_ERR;
+
+        pam_debug_syslog(handle, debug, "pam-systemd-homed session start");
+
+        rc = acquire_home(handle, /* please_authenticate = */ false, suspend_please, debug, &bus_data);
+        if (rc == PAM_USER_UNKNOWN)
+                return PAM_SUCCESS; /* Not one of the users we manage, which is nothing to complain about */
+        if (rc != PAM_SUCCESS)
+                return rc;
+
+        rc = pam_putenv(handle, "SYSTEMD_HOME=1");
+        if (rc != PAM_SUCCESS)
+                return pam_syslog_pam_error(handle, LOG_ERR, rc,
+                                            "Failed to set PAM environment variable $SYSTEMD_HOME: @PAMERR@");
+
+        suspend_env = suspend_please ? "SYSTEMD_HOME_SUSPEND=1" : "SYSTEMD_HOME_SUSPEND=0";
+        rc = pam_putenv(handle, suspend_env);
+        if (rc != PAM_SUCCESS)
+                return pam_syslog_pam_error(handle, LOG_ERR, rc,
+                                            "Failed to set PAM environment variable $SYSTEMD_HOME_SUSPEND: @PAMERR@");
+
+        return PAM_SUCCESS;
+}
+
+_public_ PAM_EXTERN int pam_sm_close_session(
+                pam_handle_t *handle,
+                int flags,
+                int argc, const char **argv) {
+
+        /* Session-close hook: gives up the home reference fd stored for the user and then asks homed via
+         * ReleaseHome() to release the home. If no fd was ever stored there is nothing to do. */
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *msg = NULL;
+        _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL;
+        const char *username = NULL;
+        bool debug = false;
+        int rc;
+
+        if (parse_argv(handle, argc, argv, NULL, &debug) < 0)
+                return PAM_SESSION_ERR;
+
+        pam_debug_syslog(handle, debug, "pam-systemd-homed session end");
+
+        rc = pam_get_user(handle, &username, NULL);
+        if (rc != PAM_SUCCESS)
+                return pam_syslog_pam_error(handle, LOG_ERR, rc, "Failed to get user name: @PAMERR@");
+
+        if (isempty(username))
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR, "User name not set.");
+
+        /* Our own reference to the homed session has to go first, explicitly, so that the ReleaseHome()
+         * call below is able to do its thing. */
+        rc = release_home_fd(handle, username);
+        if (rc == PAM_NO_MODULE_DATA)
+                return PAM_SUCCESS; /* We never acquired an fd, hence nothing to release */
+        if (rc != PAM_SUCCESS)
+                return rc;
+
+        rc = pam_acquire_bus_connection(handle, "pam-systemd-home", &bus, NULL);
+        if (rc != PAM_SUCCESS)
+                return rc;
+
+        /* Creating the message and appending the user name report failure in the same way */
+        rc = bus_message_new_method_call(bus, &msg, bus_home_mgr, "ReleaseHome");
+        if (rc >= 0)
+                rc = sd_bus_message_append(msg, "s", username);
+        if (rc < 0)
+                return pam_bus_log_create_error(handle, rc);
+
+        rc = sd_bus_call(bus, msg, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &bus_error, NULL);
+        if (rc >= 0)
+                return PAM_SUCCESS;
+
+        /* A home that is busy, i.e. still in use, is only worth a notice; anything else is a failure. */
+        if (!sd_bus_error_has_name(&bus_error, BUS_ERROR_HOME_BUSY))
+                return pam_syslog_pam_error(handle, LOG_ERR, PAM_SESSION_ERR,
+                                            "Failed to release user home: %s", bus_error_message(&bus_error, rc));
+
+        pam_syslog(handle, LOG_NOTICE, "Not deactivating home directory of %s, as it is still used.", username);
+
+        return PAM_SUCCESS;
+}
+
+_public_ PAM_EXTERN int pam_sm_acct_mgmt(
+                pam_handle_t *handle,
+                int flags,
+                int argc,
+                const char **argv) {
+
+        /* Account management hook. After taking a home reference (authenticating only if that is
+         * necessary) the user record is checked three times over: whether it is blocked or outside of its
+         * validity period, whether the login rate limit is exceeded, and whether the password needs to be
+         * changed. Every refusal resulting from these checks is explained to the user through a PAM error
+         * message before the matching PAM error code is returned. */
+        _cleanup_(user_record_unrefp) UserRecord *record = NULL;
+        bool please_suspend = false, debug = false;
+        usec_t next_try;
+        int rc;
+
+        if (parse_env(handle, &please_suspend) < 0 ||
+            parse_argv(handle, argc, argv, &please_suspend, &debug) < 0)
+                return PAM_AUTH_ERR;
+
+        pam_debug_syslog(handle, debug, "pam-systemd-homed account management");
+
+        rc = acquire_home(handle, /* please_authenticate = */ false, please_suspend, debug, NULL);
+        if (rc != PAM_SUCCESS)
+                return rc;
+
+        rc = acquire_user_record(handle, NULL, debug, &record, NULL);
+        if (rc != PAM_SUCCESS)
+                return rc;
+
+        /* Check 1: is the record blocked, not valid yet, or not valid anymore? */
+        rc = user_record_test_blocked(record);
+        switch (rc) {
+
+        case -ENOLCK:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("User record is blocked, prohibiting access."));
+                return PAM_ACCT_EXPIRED;
+
+        case -EL2HLT:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("User record is not valid yet, prohibiting access."));
+                return PAM_ACCT_EXPIRED;
+
+        case -EL3HLT:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("User record is not valid anymore, prohibiting access."));
+                return PAM_ACCT_EXPIRED;
+
+        case -ESTALE:
+                pam_syslog(handle, LOG_WARNING, "User record for '%s' is newer than current system time, assuming incorrect system clock, allowing access.", record->user_name);
+                break;
+
+        default:
+                if (rc >= 0)
+                        break;
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("User record not valid, prohibiting access."));
+                return PAM_ACCT_EXPIRED;
+        }
+
+        /* Check 2: rate limit. The clock is only read if a next-try time other than USEC_INFINITY is reported. */
+        next_try = user_record_ratelimit_next_try(record);
+        if (next_try != USEC_INFINITY) {
+                usec_t current = now(CLOCK_REALTIME);
+
+                if (current < next_try) {
+                        (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Too many logins, try again in %s."),
+                                          FORMAT_TIMESPAN(next_try - current, USEC_PER_SEC));
+                        return PAM_MAXTRIES;
+                }
+        }
+
+        /* Check 3: does the password have to be changed, or is it about to expire? */
+        rc = user_record_test_password_change_required(record);
+        switch (rc) {
+
+        case -EKEYREVOKED:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password change required."));
+                return PAM_NEW_AUTHTOK_REQD;
+
+        case -EOWNERDEAD:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password expired, change required."));
+                return PAM_NEW_AUTHTOK_REQD;
+
+        /* Strictly speaking only the password has expired here, and authentication via PKCS#11 or
+         * similar might still be acceptable, but this fine distinction is not made for now. */
+        case -EKEYREJECTED:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password is expired, but can't change, refusing login."));
+                return PAM_AUTHTOK_EXPIRED;
+
+        case -EKEYEXPIRED:
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("Password will expire soon, please change."));
+                break;
+
+        case -ESTALE:
+                /* The system clock is likely wrong: log that, but carry on */
+                pam_syslog(handle, LOG_WARNING, "Couldn't check if password change is required, last change is in the future, system clock likely wrong.");
+                break;
+
+        case -EROFS:
+                /* Not a problem: the password could not be changed if we wanted to, but it need not be */
+                break;
+
+        default:
+                if (rc >= 0)
+                        break;
+
+                (void) pam_prompt(handle, PAM_ERROR_MSG, NULL, _("User record not valid, prohibiting access."));
+                return PAM_AUTHTOK_EXPIRED;
+        }
+
+        return PAM_SUCCESS;
+}
+
+_public_ PAM_EXTERN int pam_sm_chauthtok(
+                pam_handle_t *handle,
+                int flags,
+                int argc,
+                const char **argv) {
+
+        /* Password change hook. Collects the new password (from PAM_AUTHTOK if it is cached there, by
+         * prompting and verifying otherwise), returns PAM_SUCCESS right away if only PAM_PRELIM_CHECK was
+         * requested, and otherwise calls homed's ChangePasswordHome() with the new and the old secret.
+         * Errors that can be resolved interactively are passed to handle_generic_user_record_error() and
+         * the call is retried, five attempts at most; after that PAM_MAXTRIES is returned. Other failures
+         * are returned as PAM error codes. */
+        _cleanup_(user_record_unrefp) UserRecord *ur = NULL, *old_secret = NULL, *new_secret = NULL;
+        const char *old_password = NULL, *new_password = NULL;
+        _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL;
+        unsigned n_attempts = 0;
+        bool debug = false;
+        int r;
+
+        if (parse_argv(handle, argc, argv, NULL, &debug) < 0)
+                return PAM_AUTH_ERR;
+
+        pam_debug_syslog(handle, debug, "pam-systemd-homed password change");
+
+        r = pam_acquire_bus_connection(handle, "pam-systemd-home", &bus, NULL);
+        if (r != PAM_SUCCESS)
+                return r;
+
+        r = acquire_user_record(handle, NULL, debug, &ur, NULL);
+        if (r != PAM_SUCCESS)
+                return r;
+
+        /* Start with cached credentials */
+        r = pam_get_item(handle, PAM_OLDAUTHTOK, (const void**) &old_password);
+        if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS))
+                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get old password: @PAMERR@");
+
+        r = pam_get_item(handle, PAM_AUTHTOK, (const void**) &new_password);
+        if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS))
+                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get cached password: @PAMERR@");
+
+        if (isempty(new_password)) {
+                /* No, it's not cached, then let's ask for the password and its verification, and cache
+                 * it. */
+
+                r = pam_get_authtok_noverify(handle, &new_password, "New password: ");
+                if (r != PAM_SUCCESS)
+                        return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get new password: @PAMERR@");
+
+                if (isempty(new_password)) {
+                        pam_debug_syslog(handle, debug, "Password request aborted.");
+                        return PAM_AUTHTOK_ERR;
+                }
+
+                r = pam_get_authtok_verify(handle, &new_password, "new password: "); /* Lower case, since PAM prefixes 'Repeat' */
+                if (r != PAM_SUCCESS)
+                        return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get password again: @PAMERR@");
+
+                // FIXME: pam_pwquality will ask for the password a third time. It really shouldn't do
+                // that, and instead assume the password was already verified once when it is found to be
+                // cached already. needs to be fixed in pam_pwquality
+        }
+
+        /* Now everything is cached and checked, let's exit from the preliminary check */
+        if (FLAGS_SET(flags, PAM_PRELIM_CHECK))
+                return PAM_SUCCESS;
+
+        old_secret = user_record_new();
+        if (!old_secret)
+                return pam_log_oom(handle);
+
+        if (!isempty(old_password)) {
+                r = user_record_set_password(old_secret, STRV_MAKE(old_password), true);
+                if (r < 0)
+                        return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store old password: %m");
+        }
+
+        new_secret = user_record_new();
+        if (!new_secret)
+                return pam_log_oom(handle);
+
+        r = user_record_set_password(new_secret, STRV_MAKE(new_password), true);
+        if (r < 0)
+                return pam_syslog_errno(handle, LOG_ERR, r, "Failed to store new password: %m");
+
+        for (;;) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+
+                r = bus_message_new_method_call(bus, &m, bus_home_mgr, "ChangePasswordHome");
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                r = sd_bus_message_append(m, "s", ur->user_name);
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                r = bus_message_append_secret(m, new_secret);
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                r = bus_message_append_secret(m, old_secret);
+                if (r < 0)
+                        return pam_bus_log_create_error(handle, r);
+
+                r = sd_bus_call(bus, m, HOME_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL);
+                if (r < 0) {
+                        r = handle_generic_user_record_error(handle, ur->user_name, old_secret, r, &error, debug);
+                        if (r == PAM_CONV_ERR)
+                                return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to prompt for password/PIN.");
+                        if (r != PAM_SUCCESS)
+                                return r;
+                } else
+                        return pam_syslog_pam_error(handle, LOG_NOTICE, PAM_SUCCESS, "Successfully changed password for user %s.", ur->user_name);
+
+                if (++n_attempts >= 5)
+                        break;
+
+                /* Try again */
+        }
+
+        return pam_syslog_pam_error(handle, LOG_NOTICE, PAM_MAXTRIES, "Failed to change password for user %s: @PAMERR@", ur->user_name);
+}
diff --git a/src/home/pam_systemd_home.sym b/src/home/pam_systemd_home.sym
new file mode 100644
index 0000000..293c06f
--- /dev/null
+++ b/src/home/pam_systemd_home.sym
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+{
+global:
+        pam_sm_authenticate;
+        pam_sm_setcred;
+        pam_sm_open_session;
+        pam_sm_close_session;
+        pam_sm_acct_mgmt;
+        pam_sm_chauthtok;
+local: *;
+};
diff --git a/src/home/user-record-password-quality.c b/src/home/user-record-password-quality.c
new file mode 100644
index 0000000..38f4acb
--- /dev/null
+++ b/src/home/user-record-password-quality.c
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-common-errors.h"
+#include "errno-util.h"
+#include "home-util.h"
+#include "libcrypt-util.h"
+#include "password-quality-util.h"
+#include "strv.h"
+#include "user-record-password-quality.h"
+#include "user-record-util.h"
+
+#if HAVE_PASSWDQC || HAVE_PWQUALITY
+
+int user_record_check_password_quality(
+                UserRecord *hr,
+                UserRecord *secret,
+                sd_bus_error *error) {
+        /* Returns 1 if no new password in 'secret' was rejected, 0 if the check reports "not supported". */
+        _cleanup_free_ char *auxerror = NULL;
+        int r;
+
+        assert(hr);
+        assert(secret);
+
+        /* This is a bit more complex than one might think at first. check_password_quality() would like to know the
+         * old password to make security checks. We support arbitrary numbers of passwords however, hence we
+         * call the function once for each combination of old and new password. */
+
+        /* Iterate through all new passwords */
+        STRV_FOREACH(pp, secret->password) {
+                bool called = false; /* Becomes true once *pp was checked against at least one old password */
+
+                r = test_password_many(hr->hashed_password, *pp);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* This is an old password as it isn't listed in the hashedPassword field, skip it */
+                        continue;
+
+                /* Check this password against all old passwords */
+                STRV_FOREACH(old, secret->password) {
+
+                        if (streq(*pp, *old))
+                                continue;
+
+                        r = test_password_many(hr->hashed_password, *old);
+                        if (r < 0)
+                                return r;
+                        if (r > 0) /* This is a new password, not suitable as old password */
+                                continue;
+
+                        r = check_password_quality(*pp, *old, hr->user_name, &auxerror);
+                        if (r <= 0)
+                                goto error;
+
+                        called = true;
+                }
+
+                if (called)
+                        continue;
+
+                /* If there are no old passwords, let's call check_password_quality() without any. */
+                r = check_password_quality(*pp, /* old */ NULL, hr->user_name, &auxerror);
+                if (r <= 0)
+                        goto error;
+        }
+        return 1;
+
+error:
+        if (r == 0) /* The check ran and rejected the password: report that via the bus error */
+                return sd_bus_error_setf(error, BUS_ERROR_LOW_PASSWORD_QUALITY,
+                                         "Password too weak: %s", auxerror);
+        if (ERRNO_IS_NOT_SUPPORTED(r)) /* Not treated as a failure, the caller just learns nothing was checked */
+                return 0;
+        return log_debug_errno(r, "Failed to check password quality: %m");
+}
+
+#else
+
+int user_record_check_password_quality(
+                UserRecord *hr,
+                UserRecord *secret,
+                sd_bus_error *error) {
+        /* Variant compiled when neither HAVE_PASSWDQC nor HAVE_PWQUALITY is set: nothing is checked. */
+        return 0;
+}
+
+#endif
diff --git a/src/home/user-record-password-quality.h b/src/home/user-record-password-quality.h
new file mode 100644
index 0000000..c7d6ec6
--- /dev/null
+++ b/src/home/user-record-password-quality.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "user-record.h"
+
+int user_record_check_password_quality(UserRecord *hr, UserRecord *secret, sd_bus_error *error);
diff --git a/src/home/user-record-sign.c b/src/home/user-record-sign.c
new file mode 100644
index 0000000..dd099a0
--- /dev/null
+++ b/src/home/user-record-sign.c
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <openssl/pem.h>
+
+#include "fd-util.h"
+#include "fileio.h"
+#include "memstream-util.h"
+#include "openssl-util.h"
+#include "user-record-sign.h"
+
+static int user_record_signable_json(UserRecord *ur, char **ret) {
+        _cleanup_(user_record_unrefp) UserRecord *stripped = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *canonical = NULL;
+        int r;
+
+        assert(ur);
+        assert(ret);
+
+        /* Clone with the secret, binding, status and signature sections stripped, then format the normalized JSON */
+        r = user_record_clone(ur, USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_PRIVILEGED|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_STRIP_SECRET|USER_RECORD_STRIP_BINDING|USER_RECORD_STRIP_STATUS|USER_RECORD_STRIP_SIGNATURE|USER_RECORD_PERMISSIVE, &stripped);
+        if (r < 0)
+                return r;
+
+        canonical = json_variant_ref(stripped->json);
+        r = json_variant_normalize(&canonical);
+        if (r >= 0)
+                r = json_variant_format(canonical, 0, ret);
+
+        return r;
+}
+
+int user_record_sign(UserRecord *ur, EVP_PKEY *private_key, UserRecord **ret) {
+        _cleanup_(memstream_done) MemStream m = {};
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *signed_ur = NULL;
+        _cleanup_free_ char *text = NULL, *key = NULL;
+        _cleanup_free_ void *signature = NULL;
+        size_t signature_size = 0;
+        FILE *f;
+        int r;
+
+        assert(ur);
+        assert(private_key);
+        assert(ret);
+        /* Signs 'ur' with 'private_key'. On success returns 0 and stores a new, signed record in *ret. */
+        r = user_record_signable_json(ur, &text); /* The text the signature is calculated over */
+        if (r < 0)
+                return r;
+
+        r = digest_and_sign(/* md= */ NULL, private_key, text, SIZE_MAX, &signature, &signature_size);
+        if (r < 0)
+                return r;
+
+        f = memstream_init(&m);
+        if (!f)
+                return -ENOMEM;
+
+        if (PEM_write_PUBKEY(f, private_key) <= 0) /* Collect the PEM encoded public key in the memory stream */
+                return -EIO;
+
+        r = memstream_finalize(&m, &key, NULL);
+        if (r < 0)
+                return r;
+
+        v = json_variant_ref(ur->json); /* The signature is attached to the full record, not the stripped text */
+
+        r = json_variant_set_fieldb(
+                        &v,
+                        "signature",
+                        JSON_BUILD_ARRAY(
+                                        JSON_BUILD_OBJECT(JSON_BUILD_PAIR("data", JSON_BUILD_BASE64(signature, signature_size)),
+                                                          JSON_BUILD_PAIR("key", JSON_BUILD_STRING(key)))));
+        if (r < 0)
+                return r;
+
+        if (DEBUG_LOGGING)
+                json_variant_dump(v, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR_AUTO, NULL, NULL);
+
+        signed_ur = user_record_new();
+        if (!signed_ur)
+                return log_oom();
+
+        r = user_record_load(signed_ur, v, USER_RECORD_LOAD_FULL|USER_RECORD_PERMISSIVE);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(signed_ur);
+        return 0;
+}
+
+int user_record_verify(UserRecord *ur, EVP_PKEY *public_key) {
+        _cleanup_free_ char *text = NULL;
+        unsigned n_good = 0, n_bad = 0; /* Signature entries that did / did not verify with 'public_key' */
+        JsonVariant *array, *e;
+        int r;
+
+        assert(ur);
+        assert(public_key);
+        /* Verifies each "signature" entry with 'public_key'; returns USER_RECORD_UNSIGNED/_SIGNED_EXCLUSIVE/_SIGNED/_FOREIGN. */
+        array = json_variant_by_key(ur->json, "signature");
+        if (!array)
+                return USER_RECORD_UNSIGNED;
+
+        if (!json_variant_is_array(array))
+                return -EINVAL;
+
+        if (json_variant_elements(array) == 0)
+                return USER_RECORD_UNSIGNED;
+
+        r = user_record_signable_json(ur, &text); /* The same text user_record_sign() calculates signatures over */
+        if (r < 0)
+                return r;
+
+        JSON_VARIANT_ARRAY_FOREACH(e, array) {
+                _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *md_ctx = NULL;
+                _cleanup_free_ void *signature = NULL;
+                size_t signature_size = 0;
+                JsonVariant *data;
+
+                if (!json_variant_is_object(e))
+                        return -EINVAL;
+
+                data = json_variant_by_key(e, "data"); /* The base64 encoded signature itself */
+                if (!data)
+                        return -EINVAL;
+
+                r = json_variant_unbase64(data, &signature, &signature_size);
+                if (r < 0)
+                        return r;
+
+                md_ctx = EVP_MD_CTX_new();
+                if (!md_ctx)
+                        return -ENOMEM;
+
+                if (EVP_DigestVerifyInit(md_ctx, NULL, NULL, NULL, public_key) <= 0)
+                        return -EIO;
+
+                if (EVP_DigestVerify(md_ctx, signature, signature_size, (uint8_t*) text, strlen(text)) <= 0) {
+                        n_bad ++; /* Does not verify with our key: counted, but not treated as an error */
+                        continue;
+                }
+
+                n_good ++;
+        }
+
+        return n_good > 0 ? (n_bad == 0 ? USER_RECORD_SIGNED_EXCLUSIVE : USER_RECORD_SIGNED) :
+                (n_bad == 0 ? USER_RECORD_UNSIGNED : USER_RECORD_FOREIGN);
+}
+
+int user_record_has_signature(UserRecord *ur) {
+        /* Tells whether the record carries at least one entry in its "signature" array. Returns -EINVAL if
+         * that field exists but is not an array. The signatures themselves are not verified here. */
+        JsonVariant *signatures = json_variant_by_key(ur->json, "signature");
+
+        if (!signatures)
+                return false;
+        if (!json_variant_is_array(signatures))
+                return -EINVAL;
+
+        return json_variant_elements(signatures) > 0;
+}
diff --git a/src/home/user-record-sign.h b/src/home/user-record-sign.h
new file mode 100644
index 0000000..87c6813
--- /dev/null
+++ b/src/home/user-record-sign.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <openssl/evp.h>
+
+#include "user-record.h"
+
+int user_record_sign(UserRecord *ur, EVP_PKEY *private_key, UserRecord **ret);
+
+enum { /* Results of user_record_verify(), relative to the key that was passed in */
+        USER_RECORD_UNSIGNED,           /* user record has no signature */
+        USER_RECORD_SIGNED_EXCLUSIVE,   /* user record has only a signature by our own key */
+        USER_RECORD_SIGNED,             /* user record is signed by us, but by others too */
+        USER_RECORD_FOREIGN,            /* user record is not signed by us, but by others */
+};
+
+int user_record_verify(UserRecord *ur, EVP_PKEY *public_key);
+
+int user_record_has_signature(UserRecord *ur);
diff --git a/src/home/user-record-util.c b/src/home/user-record-util.c
new file mode 100644
index 0000000..089cbb1
--- /dev/null
+++ b/src/home/user-record-util.c
@@ -0,0 +1,1512 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/xattr.h>
+
+#include "errno-util.h"
+#include "home-util.h"
+#include "id128-util.h"
+#include "libcrypt-util.h"
+#include "memory-util.h"
+#include "recovery-key.h"
+#include "mountpoint-util.h"
+#include "path-util.h"
+#include "stat-util.h"
+#include "user-record-util.h"
+#include "user-util.h"
+
+int user_record_synthesize(
+                UserRecord *h,
+                const char *user_name,
+                const char *realm,
+                const char *image_path,
+                UserStorage storage,
+                uid_t uid,
+                gid_t gid) {
+
+        _cleanup_free_ char *hd = NULL, *un = NULL, *ip = NULL, *rr = NULL, *user_name_and_realm = NULL;
+        sd_id128_t mid;
+        int r;
+
+        assert(h);
+        assert(user_name);
+        assert(image_path);
+        assert(IN_SET(storage, USER_LUKS, USER_SUBVOLUME, USER_FSCRYPT, USER_DIRECTORY));
+        assert(uid_is_valid(uid));
+        assert(gid_is_valid(gid));
+
+        /* Fill in a home record from just a username and an image path. 'realm' is optional and may be NULL. */
+        /* Returns 0 on success, -EBUSY if the record already carries JSON data, -EINVAL on unsuitable names or paths. */
+        if (h->json)
+                return -EBUSY;
+
+        if (!suitable_user_name(user_name))
+                return -EINVAL;
+
+        if (realm) {
+                r = suitable_realm(realm);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -EINVAL;
+        }
+
+        if (!suitable_image_path(image_path))
+                return -EINVAL;
+
+        r = sd_id128_get_machine(&mid); /* The binding section is keyed by the local machine ID */
+        if (r < 0)
+                return r;
+
+        un = strdup(user_name);
+        if (!un)
+                return -ENOMEM;
+
+        if (realm) {
+                rr = strdup(realm);
+                if (!rr)
+                        return -ENOMEM;
+
+                user_name_and_realm = strjoin(user_name, "@", realm);
+                if (!user_name_and_realm)
+                        return -ENOMEM;
+        }
+
+        ip = strdup(image_path);
+        if (!ip)
+                return -ENOMEM;
+
+        hd = path_join(get_home_root(), user_name);
+        if (!hd)
+                return -ENOMEM;
+
+        r = json_build(&h->json,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)),
+                                       JSON_BUILD_PAIR_CONDITION(!!rr, "realm", JSON_BUILD_STRING(realm)),
+                                       JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("regular")),
+                                       JSON_BUILD_PAIR("binding", JSON_BUILD_OBJECT(
+                                                                       JSON_BUILD_PAIR(SD_ID128_TO_STRING(mid), JSON_BUILD_OBJECT(
+                                                                                                       JSON_BUILD_PAIR("imagePath", JSON_BUILD_STRING(image_path)),
+                                                                                                       JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_STRING(hd)),
+                                                                                                       JSON_BUILD_PAIR("storage", JSON_BUILD_STRING(user_storage_to_string(storage))),
+                                                                                                       JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)),
+                                                                                                       JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid))))))));
+        if (r < 0)
+                return r;
+
+        free_and_replace(h->user_name, un);
+        free_and_replace(h->realm, rr);
+        free_and_replace(h->user_name_and_realm_auto, user_name_and_realm);
+        free_and_replace(h->image_path, ip);
+        free_and_replace(h->home_directory, hd);
+        h->storage = storage;
+        h->uid = uid;
+        h->gid = gid; /* Mirror the "gid" written to the binding above, so that fields and JSON agree */
+        h->mask = USER_RECORD_REGULAR|USER_RECORD_BINDING;
+        return 0;
+}
+
+int group_record_synthesize(GroupRecord *g, UserRecord *h) {
+        _cleanup_free_ char *un = NULL, *rr = NULL, *group_name_and_realm = NULL, *description = NULL;
+        sd_id128_t mid;
+        int r;
+
+        assert(g);
+        assert(h);
+        /* Fills in 'g' as primary group of user 'h': name, realm, GID and disposition are taken from the user record. */
+        if (g->json) /* Refuse to overwrite a record that already carries JSON data */
+                return -EBUSY;
+
+        r = sd_id128_get_machine(&mid);
+        if (r < 0)
+                return r;
+
+        un = strdup(h->user_name);
+        if (!un)
+                return -ENOMEM;
+
+        if (h->realm) {
+                rr = strdup(h->realm);
+                if (!rr)
+                        return -ENOMEM;
+
+                group_name_and_realm = strjoin(un, "@", rr);
+                if (!group_name_and_realm)
+                        return -ENOMEM;
+        }
+
+        description = strjoin("Primary Group of User ", un);
+        if (!description)
+                return -ENOMEM;
+
+        r = json_build(&g->json,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(un)),
+                                       JSON_BUILD_PAIR_CONDITION(!!rr, "realm", JSON_BUILD_STRING(rr)),
+                                       JSON_BUILD_PAIR("description", JSON_BUILD_STRING(description)),
+                                       JSON_BUILD_PAIR("binding", JSON_BUILD_OBJECT(
+                                                                       JSON_BUILD_PAIR(SD_ID128_TO_STRING(mid), JSON_BUILD_OBJECT(
+                                                                                                       JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(user_record_gid(h))))))),
+                                       JSON_BUILD_PAIR_CONDITION(h->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(user_record_disposition(h)))),
+                                       JSON_BUILD_PAIR("status", JSON_BUILD_OBJECT(
+                                                                       JSON_BUILD_PAIR(SD_ID128_TO_STRING(mid), JSON_BUILD_OBJECT(
+                                                                                                       JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.Home"))))))));
+        if (r < 0)
+                return r;
+
+        free_and_replace(g->group_name, un); /* From here on nothing can fail, update the parsed fields */
+        free_and_replace(g->realm, rr);
+        free_and_replace(g->group_name_and_realm_auto, group_name_and_realm);
+        g->gid = user_record_gid(h);
+        g->disposition = h->disposition;
+
+        g->mask = USER_RECORD_REGULAR|USER_RECORD_BINDING;
+        return 0;
+}
+
+int user_record_reconcile(
+                UserRecord *host,
+                UserRecord *embedded,
+                UserReconcileMode mode,
+                UserRecord **ret) {
+
+        int r, result;
+
+        /* Reconciles the identity record stored on the host with the one embedded in a $HOME
+         * directory. Returns the following error codes:
+         *
+         *     -EINVAL: one of the records not valid
+         *     -EREMCHG: identity records are not about the same user
+         *     -ESTALE: embedded identity record is equally new or newer than supplied record
+         * On success returns USER_RECONCILE_HOST_WON, USER_RECONCILE_IDENTICAL or USER_RECONCILE_EMBEDDED_WON and
+         * stores in *ret the new record to use, which is either the embedded record updated with the host
+         * binding or the host record. In both cases the secret data is stripped. */
+
+        assert(host);
+        assert(embedded);
+
+        /* Make sure both records are initialized */
+        if (!host->json || !embedded->json)
+                return -EINVAL;
+
+        /* Ensure these records actually contain user data */
+        if (!(embedded->mask & host->mask & USER_RECORD_REGULAR))
+                return -EINVAL;
+
+        /* Make sure the user name and realm matches */
+        if (!user_record_compatible(host, embedded))
+                return -EREMCHG;
+
+        /* Embedded identities may not contain secrets or binding info */
+        if ((embedded->mask & (USER_RECORD_SECRET|USER_RECORD_BINDING)) != 0)
+                return -EINVAL;
+
+        /* The embedded record checked out, let's now figure out which of the two identities we'll consider
+         * in effect from now on. We do this by checking the last change timestamp, and in doubt always let
+         * the embedded data win. */
+        if (host->last_change_usec != UINT64_MAX &&
+            (embedded->last_change_usec == UINT64_MAX || host->last_change_usec > embedded->last_change_usec))
+
+                /* The host version is definitely newer, either because it has a version at all and the
+                 * embedded version doesn't or because it is numerically newer. */
+                result = USER_RECONCILE_HOST_WON;
+
+        else if (host->last_change_usec == embedded->last_change_usec) {
+
+                /* The nominal version number of the host and the embedded identity is the same. If so, let's
+                 * verify that, and tell the caller if we are ignoring embedded data. */
+
+                r = user_record_masked_equal(host, embedded, USER_RECORD_REGULAR|USER_RECORD_PRIVILEGED|USER_RECORD_PER_MACHINE);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        if (mode == USER_RECONCILE_REQUIRE_NEWER)
+                                return -ESTALE;
+
+                        result = USER_RECONCILE_IDENTICAL;
+                } else
+                        result = USER_RECONCILE_HOST_WON;
+        } else {
+                _cleanup_(json_variant_unrefp) JsonVariant *extended = NULL;
+                _cleanup_(user_record_unrefp) UserRecord *merged = NULL;
+                JsonVariant *e;
+
+                /* The embedded version is newer, hence the host copy satisfies neither of the REQUIRE modes */
+
+                if (IN_SET(mode, USER_RECONCILE_REQUIRE_NEWER, USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL))
+                        return -ESTALE;
+
+                /* Copy in the binding data */
+                extended = json_variant_ref(embedded->json);
+
+                e = json_variant_by_key(host->json, "binding");
+                if (e) {
+                        r = json_variant_set_field(&extended, "binding", e);
+                        if (r < 0)
+                                return r;
+                }
+
+                merged = user_record_new();
+                if (!merged)
+                        return -ENOMEM;
+
+                r = user_record_load(merged, extended, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE);
+                if (r < 0)
+                        return r;
+
+                *ret = TAKE_PTR(merged);
+                return USER_RECONCILE_EMBEDDED_WON; /* update */
+        }
+
+        /* Strip out secrets */
+        r = user_record_clone(host, USER_RECORD_LOAD_MASK_SECRET|USER_RECORD_PERMISSIVE, ret);
+        if (r < 0)
+                return r;
+
+        return result;
+}
+
+int user_record_add_binding(
+                UserRecord *h,
+                UserStorage storage,
+                const char *image_path,
+                sd_id128_t partition_uuid,
+                sd_id128_t luks_uuid,
+                sd_id128_t fs_uuid,
+                const char *luks_cipher,
+                const char *luks_cipher_mode,
+                uint64_t luks_volume_key_size,
+                const char *file_system_type,
+                const char *home_directory,
+                uid_t uid,
+                gid_t gid) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *new_binding_entry = NULL, *binding = NULL;
+        _cleanup_free_ char *ip = NULL, *hd = NULL, *ip_auto = NULL, *lc = NULL, *lcm = NULL, *fst = NULL;
+        sd_id128_t mid;
+        int r;
+
+        assert(h);
+        /* Merges the given fields into the local machine's entry of the "binding" section. Returns 1 on success. */
+        if (!h->json)
+                return -EUNATCH;
+
+        r = sd_id128_get_machine(&mid);
+        if (r < 0)
+                return r;
+
+        if (image_path) {
+                ip = strdup(image_path);
+                if (!ip)
+                        return -ENOMEM;
+        } else if (!h->image_path && storage >= 0) {
+                r = user_record_build_image_path(storage, user_record_user_name_and_realm(h), &ip_auto);
+                if (r < 0)
+                        return r;
+        }
+
+        if (home_directory) {
+                hd = strdup(home_directory);
+                if (!hd)
+                        return -ENOMEM;
+        }
+
+        if (file_system_type) {
+                fst = strdup(file_system_type);
+                if (!fst)
+                        return -ENOMEM;
+        }
+
+        if (luks_cipher) {
+                lc = strdup(luks_cipher);
+                if (!lc)
+                        return -ENOMEM;
+        }
+
+        if (luks_cipher_mode) {
+                lcm = strdup(luks_cipher_mode);
+                if (!lcm)
+                        return -ENOMEM;
+        }
+
+        r = json_build(&new_binding_entry, /* Only parameters that are actually set end up in the new entry */
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR_CONDITION(!!image_path, "imagePath", JSON_BUILD_STRING(image_path)),
+                                       JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(partition_uuid), "partitionUuid", JSON_BUILD_STRING(SD_ID128_TO_UUID_STRING(partition_uuid))),
+                                       JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(luks_uuid), "luksUuid", JSON_BUILD_STRING(SD_ID128_TO_UUID_STRING(luks_uuid))),
+                                       JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(fs_uuid), "fileSystemUuid", JSON_BUILD_STRING(SD_ID128_TO_UUID_STRING(fs_uuid))),
+                                       JSON_BUILD_PAIR_CONDITION(!!luks_cipher, "luksCipher", JSON_BUILD_STRING(luks_cipher)),
+                                       JSON_BUILD_PAIR_CONDITION(!!luks_cipher_mode, "luksCipherMode", JSON_BUILD_STRING(luks_cipher_mode)),
+                                       JSON_BUILD_PAIR_CONDITION(luks_volume_key_size != UINT64_MAX, "luksVolumeKeySize", JSON_BUILD_UNSIGNED(luks_volume_key_size)),
+                                       JSON_BUILD_PAIR_CONDITION(!!file_system_type, "fileSystemType", JSON_BUILD_STRING(file_system_type)),
+                                       JSON_BUILD_PAIR_CONDITION(!!home_directory, "homeDirectory", JSON_BUILD_STRING(home_directory)),
+                                       JSON_BUILD_PAIR_CONDITION(uid_is_valid(uid), "uid", JSON_BUILD_UNSIGNED(uid)),
+                                       JSON_BUILD_PAIR_CONDITION(gid_is_valid(gid), "gid", JSON_BUILD_UNSIGNED(gid)),
+                                       JSON_BUILD_PAIR_CONDITION(storage >= 0, "storage", JSON_BUILD_STRING(user_storage_to_string(storage)))));
+        if (r < 0)
+                return r;
+
+        binding = json_variant_ref(json_variant_by_key(h->json, "binding"));
+        if (binding) {
+                _cleanup_(json_variant_unrefp) JsonVariant *be = NULL;
+
+                /* Merge the new entry with an old one, if that exists */
+                be = json_variant_ref(json_variant_by_key(binding, SD_ID128_TO_STRING(mid)));
+                if (be) {
+                        r = json_variant_merge_object(&be, new_binding_entry);
+                        if (r < 0)
+                                return r;
+
+                        json_variant_unref(new_binding_entry);
+                        new_binding_entry = TAKE_PTR(be);
+                }
+        }
+
+        r = json_variant_set_field(&binding, SD_ID128_TO_STRING(mid), new_binding_entry);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&h->json, "binding", binding);
+        if (r < 0)
+                return r;
+
+        if (storage >= 0) /* The JSON is updated, now mirror the same changes in the parsed fields */
+                h->storage = storage;
+
+        if (ip)
+                free_and_replace(h->image_path, ip);
+        if (ip_auto)
+                free_and_replace(h->image_path_auto, ip_auto);
+
+        if (!sd_id128_is_null(partition_uuid))
+                h->partition_uuid = partition_uuid;
+
+        if (!sd_id128_is_null(luks_uuid))
+                h->luks_uuid = luks_uuid;
+
+        if (!sd_id128_is_null(fs_uuid))
+                h->file_system_uuid = fs_uuid;
+
+        if (lc)
+                free_and_replace(h->luks_cipher, lc);
+        if (lcm)
+                free_and_replace(h->luks_cipher_mode, lcm);
+        if (luks_volume_key_size != UINT64_MAX)
+                h->luks_volume_key_size = luks_volume_key_size;
+
+        if (fst)
+                free_and_replace(h->file_system_type, fst);
+        if (hd)
+                free_and_replace(h->home_directory, hd);
+
+        if (uid_is_valid(uid))
+                h->uid = uid;
+        if (gid_is_valid(gid))
+                h->gid = gid;
+
+        h->mask |= USER_RECORD_BINDING;
+        return 1;
+}
+
+int user_record_test_home_directory(UserRecord *h) {
+        const char *hd;
+        int r;
+
+        assert(h);
+
+        /* Returns one of USER_TEST_ABSENT, USER_TEST_MOUNTED, USER_TEST_EXISTS on success */
+        /* Fails with -ENXIO without a home directory, -ENOTDIR if it is no directory, -EBUSY if populated but unmounted */
+        hd = user_record_home_directory(h);
+        if (!hd)
+                return -ENXIO;
+
+        r = is_dir(hd, false);
+        if (r == -ENOENT)
+                return USER_TEST_ABSENT;
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -ENOTDIR;
+
+        r = path_is_mount_point(hd, NULL, 0);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return USER_TEST_MOUNTED;
+
+        /* If the image path and the home directory are identical, then it's OK if the directory is
+         * populated. */
+        if (IN_SET(user_record_storage(h), USER_CLASSIC, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT)) {
+                const char *ip;
+
+                ip = user_record_image_path(h);
+                if (ip && path_equal(ip, hd))
+                        return USER_TEST_EXISTS;
+        }
+
+        /* Otherwise it's not OK */
+        r = dir_is_empty(hd, /* ignore_hidden_or_backup= */ false);
+        if (r < 0)
+                return r;
+        if (r == 0) /* Neither a mount point nor the image itself, yet there are files in it */
+                return -EBUSY;
+
+        return USER_TEST_EXISTS;
+}
+
+int user_record_test_home_directory_and_warn(UserRecord *h) {
+        int state;
+
+        assert(h);
+        /* Like user_record_test_home_directory(), but logs an error message for every failure */
+        state = user_record_test_home_directory(h);
+        if (state >= 0)
+                return state;
+        if (state == -ENXIO)
+                return log_error_errno(state, "User record lacks home directory, refusing.");
+        if (state == -ENOTDIR)
+                return log_error_errno(state, "Home directory %s is not a directory, refusing.", user_record_home_directory(h));
+        if (state == -EBUSY)
+                return log_error_errno(state, "Home directory %s exists, is not mounted but populated, refusing.", user_record_home_directory(h));
+
+        return log_error_errno(state, "Failed to test whether the home directory %s exists: %m", user_record_home_directory(h));
+}
+
+int user_record_test_image_path(UserRecord *h) {
+        const char *ip;
+        struct stat st;
+
+        assert(h);
+        /* Returns USER_TEST_UNDEFINED, USER_TEST_ABSENT, USER_TEST_EXISTS, USER_TEST_DIRTY or USER_TEST_MAYBE on success */
+        if (user_record_storage(h) == USER_CIFS)
+                return USER_TEST_UNDEFINED;
+
+        ip = user_record_image_path(h);
+        if (!ip)
+                return -ENXIO;
+
+        if (stat(ip, &st) < 0) {
+                if (errno == ENOENT)
+                        return USER_TEST_ABSENT;
+
+                return -errno;
+        }
+
+        switch (user_record_storage(h)) {
+
+        case USER_LUKS:
+                if (S_ISREG(st.st_mode)) {
+                        ssize_t n;
+                        char x[2];
+
+                        n = getxattr(ip, "user.home-dirty", x, sizeof(x));
+                        if (n < 0) {
+                                if (!ERRNO_IS_XATTR_ABSENT(errno))
+                                        log_debug_errno(errno, "Unable to read dirty xattr off image file, ignoring: %m");
+
+                        } else if (n == 1 && x[0] == '1') /* The xattr holds the single character "1" */
+                                return USER_TEST_DIRTY;
+
+                        return USER_TEST_EXISTS;
+                }
+
+                if (S_ISBLK(st.st_mode)) {
+                        /* For block devices we can't really be sure if the device referenced actually is the
+                         * fs we look for or some other file system (think: what does /dev/sdb1 refer
+                         * to?). Hence, let's return USER_TEST_MAYBE as an ambiguous return value for these
+                         * case, except if the device path used is one of the paths that is based on a
+                         * filesystem or partition UUID or label, because in those cases we can be sure we
+                         * are referring to the right device. */
+
+                        if (PATH_STARTSWITH_SET(ip,
+                                                "/dev/disk/by-uuid/",
+                                                "/dev/disk/by-partuuid/",
+                                                "/dev/disk/by-partlabel/",
+                                                "/dev/disk/by-label/"))
+                                return USER_TEST_EXISTS;
+
+                        return USER_TEST_MAYBE;
+                }
+
+                return -EBADFD; /* Neither a regular file nor a block device */
+
+        case USER_CLASSIC:
+        case USER_DIRECTORY:
+        case USER_SUBVOLUME:
+        case USER_FSCRYPT:
+                if (S_ISDIR(st.st_mode))
+                        return USER_TEST_EXISTS;
+
+                return -ENOTDIR;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+int user_record_test_image_path_and_warn(UserRecord *h) {
+        int state;
+
+        assert(h);
+        /* Like user_record_test_image_path(), but logs an error message for every failure */
+        state = user_record_test_image_path(h);
+        if (state >= 0)
+                return state;
+        if (state == -ENXIO)
+                return log_error_errno(state, "User record lacks image path, refusing.");
+        if (state == -EBADFD)
+                return log_error_errno(state, "Image path %s is not a regular file or block device, refusing.", user_record_image_path(h));
+        if (state == -ENOTDIR)
+                return log_error_errno(state, "Image path %s is not a directory, refusing.", user_record_image_path(h));
+
+        return log_error_errno(state, "Failed to test whether image path %s exists: %m", user_record_image_path(h));
+}
+
+int user_record_test_password(UserRecord *h, UserRecord *secret) {
+        assert(h);
+
+        /* Tries every plaintext password of 'secret' against the hashed passwords of 'h'. Returns 0 on the
+         * first match, -ENXIO if 'h' has no hashed passwords at all, -ENOKEY if none of the passwords
+         * matched, and passes on any error test_password_many() reports. */
+
+        if (strv_isempty(h->hashed_password))
+                return -ENXIO;
+
+        STRV_FOREACH(candidate, secret->password) {
+                int q;
+
+                q = test_password_many(h->hashed_password, *candidate);
+                if (q != 0) /* Either a match (> 0) or a failure (< 0), in both cases we are done */
+                        return q < 0 ? q : 0;
+        }
+
+        return -ENOKEY;
+}
+
+int user_record_test_recovery_key(UserRecord *h, UserRecord *secret) {
+        assert(h);
+
+        /* Tries every plaintext password of 'secret' against every hashed recovery key of 'h'. Returns 0
+         * on the first match, -ENXIO if 'h' has no recovery keys at all and -ENOKEY if nothing matched.
+         * Errors of normalize_recovery_key() (except -EINVAL) and test_password_one() are passed on. */
+
+        if (h->n_recovery_key == 0)
+                return -ENXIO;
+
+        STRV_FOREACH(pw, secret->password) {
+                for (size_t k = 0; k < h->n_recovery_key; k++) {
+                        _cleanup_(erase_and_freep) char *normalized = NULL;
+                        const char *candidate = *pw; /* Unknown recovery key types are tested as is */
+                        int q;
+
+                        if (streq(h->recovery_key[k].type, "modhex64")) {
+                                /* For modhex64 recovery keys test the normalized form of the passphrase
+                                 * instead, to make things more robust: that way the password becomes
+                                 * case insensitive and the dashes become optional. */
+
+                                q = normalize_recovery_key(*pw, &normalized);
+                                if (q == -EINVAL)
+                                        continue; /* Not a valid modhex64 passphrase, don't bother */
+                                if (q < 0)
+                                        return q;
+
+                                candidate = normalized;
+                        }
+
+                        q = test_password_one(h->recovery_key[k].hashed_password, candidate);
+                        if (q < 0)
+                                return q;
+                        if (q > 0)
+                                return 0;
+                }
+        }
+
+        return -ENOKEY;
+}
+
+int user_record_set_disk_size(UserRecord *h, uint64_t disk_size) {
+        _cleanup_(json_variant_unrefp) JsonVariant *new_per_machine = NULL, *midv = NULL, *midav = NULL, *ne = NULL;
+        _cleanup_free_ JsonVariant **array = NULL;
+        size_t idx = SIZE_MAX, n;
+        JsonVariant *per_machine;
+        sd_id128_t mid;
+        int r;
+
+        assert(h);
+        /* Stores 'disk_size' as "diskSize" in the "perMachine" entry that matches the local machine ID exactly */
+        if (!h->json)
+                return -EUNATCH;
+        /* Prepare the machine ID both as string and as single-element array, to compare against below */
+        r = sd_id128_get_machine(&mid);
+        if (r < 0)
+                return r;
+
+        r = json_variant_new_string(&midv, SD_ID128_TO_STRING(mid));
+        if (r < 0)
+                return r;
+
+        r = json_variant_new_array(&midav, (JsonVariant*[]) { midv }, 1);
+        if (r < 0)
+                return r;
+
+        per_machine = json_variant_by_key(h->json, "perMachine");
+        if (per_machine) {
+                size_t i;
+
+                if (!json_variant_is_array(per_machine))
+                        return -EINVAL;
+
+                n = json_variant_elements(per_machine);
+                /* Reserve one slot more than there are entries, in case we need to append a new one */
+                array = new(JsonVariant*, n + 1);
+                if (!array)
+                        return -ENOMEM;
+
+                for (i = 0; i < n; i++) {
+                        JsonVariant *m;
+
+                        array[i] = json_variant_by_index(per_machine, i);
+
+                        if (!json_variant_is_object(array[i]))
+                                return -EINVAL;
+
+                        m = json_variant_by_key(array[i], "matchMachineId");
+                        if (!m) {
+                                /* No machineId field? Let's ignore this, but invalidate what we found so far */
+                                idx = SIZE_MAX;
+                                continue;
+                        }
+
+                        if (json_variant_equal(m, midv) ||
+                            json_variant_equal(m, midav)) {
+                                /* Matches exactly what we are looking for. Let's use this */
+                                idx = i;
+                                continue;
+                        }
+
+                        r = per_machine_id_match(m, JSON_PERMISSIVE);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                /* Also matches what we are looking for, but with a broader match. In this
+                                 * case let's ignore this entry, and add a new specific one to the end. */
+                                idx = SIZE_MAX;
+                }
+
+                if (idx == SIZE_MAX)
+                        idx = n++; /* Nothing suitable found, place new entry at end */
+                else
+                        ne = json_variant_ref(array[idx]);
+
+        } else {
+                array = new(JsonVariant*, 1);
+                if (!array)
+                        return -ENOMEM;
+
+                idx = 0;
+                n = 1;
+        }
+        /* No existing entry to reuse? Then begin a new one, matching the local machine ID only */
+        if (!ne) {
+                r = json_variant_set_field(&ne, "matchMachineId", midav);
+                if (r < 0)
+                        return r;
+        }
+
+        r = json_variant_set_field_unsigned(&ne, "diskSize", disk_size);
+        if (r < 0)
+                return r;
+        /* 'array' is only freed on cleanup (no reference is taken when filling it), 'ne' is unref'ed separately */
+        assert(idx < n);
+        array[idx] = ne;
+
+        r = json_variant_new_array(&new_per_machine, array, n);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&h->json, "perMachine", new_per_machine);
+        if (r < 0)
+                return r;
+        /* The JSON data is updated, now bring the parsed fields in sync */
+        h->disk_size = disk_size;
+        h->mask |= USER_RECORD_PER_MACHINE;
+        return 0;
+}
+
+int user_record_update_last_changed(UserRecord *h, bool with_password) {
+        _cleanup_(json_variant_unrefp) JsonVariant *updated = NULL;
+        usec_t ts;
+        int q;
+
+        assert(h);
+
+        /* Stamps the record with the current time as "lastChangeUSec" and, if 'with_password' is set, also
+         * as "lastPasswordChangeUSec". Returns -EUNATCH without JSON data, -ECHRNG if time did not advance. */
+
+        if (!h->json)
+                return -EUNATCH;
+
+        ts = now(CLOCK_REALTIME);
+        if ((h->last_change_usec != UINT64_MAX && h->last_change_usec >= ts) ||
+            (h->last_password_change_usec != UINT64_MAX && h->last_password_change_usec >= ts))
+                return -ECHRNG; /* refuse downgrading */
+
+        /* Operate on our own reference, h->json is only replaced once all updates went through */
+        updated = json_variant_ref(h->json);
+
+        q = json_variant_set_field_unsigned(&updated, "lastChangeUSec", ts);
+        if (q < 0)
+                return q;
+
+        if (with_password) {
+                q = json_variant_set_field_unsigned(&updated, "lastPasswordChangeUSec", ts);
+                if (q < 0)
+                        return q;
+        }
+
+        json_variant_unref(h->json);
+        h->json = TAKE_PTR(updated);
+
+        h->last_change_usec = ts;
+        if (with_password)
+                h->last_password_change_usec = ts;
+        h->mask |= USER_RECORD_REGULAR;
+        return 0;
+}
+
+int user_record_make_hashed_password(UserRecord *h, char **secret, bool extend) {
+        _cleanup_(json_variant_unrefp) JsonVariant *priv = NULL;
+        _cleanup_strv_free_ char **np = NULL;
+        int r;
+
+        assert(h);
+        assert(secret);
+
+        /* Initializes the hashed password list from the specified plaintext passwords. If 'extend' is set the
+         * new hashes are added to the ones already stored in 'h' instead of replacing them. */
+
+        if (extend) {
+                np = strv_copy(h->hashed_password);
+                if (!np)
+                        return -ENOMEM;
+
+                strv_uniq(np);
+        }
+
+        STRV_FOREACH(i, secret) {
+                _cleanup_(erase_and_freep) char *hashed = NULL;
+
+                r = hash_password(*i, &hashed);
+                if (r < 0)
+                        return r;
+
+                r = strv_consume(&np, TAKE_PTR(hashed));
+                if (r < 0)
+                        return r;
+        }
+
+        priv = json_variant_ref(json_variant_by_key(h->json, "privileged"));
+
+        if (strv_isempty(np))
+                r = json_variant_filter(&priv, STRV_MAKE("hashedPassword"));
+        else {
+                _cleanup_(json_variant_unrefp) JsonVariant *new_array = NULL;
+
+                r = json_variant_new_array_strv(&new_array, np);
+                if (r < 0)
+                        return r;
+
+                r = json_variant_set_field(&priv, "hashedPassword", new_array);
+        }
+        if (r < 0) /* Covers both branches, a failing json_variant_filter() must not go unnoticed */
+                return r;
+
+        r = json_variant_set_field(&h->json, "privileged", priv);
+        if (r < 0)
+                return r;
+
+        strv_free_and_replace(h->hashed_password, np);
+        SET_FLAG(h->mask, USER_RECORD_PRIVILEGED, !json_variant_is_blank_object(priv));
+        return 0;
+}
+
+int user_record_set_hashed_password(UserRecord *h, char **hashed_password) {
+        _cleanup_(json_variant_unrefp) JsonVariant *section = NULL;
+        _cleanup_strv_free_ char **deduped = NULL;
+        int q;
+
+        assert(h);
+
+        /* Replaces the hashed passwords of 'h' by a deduplicated copy of 'hashed_password', both in the
+         * "privileged" section of the JSON data and in h->hashed_password. An empty list drops the field. */
+        section = json_variant_ref(json_variant_by_key(h->json, "privileged"));
+
+        if (!strv_isempty(hashed_password)) {
+                _cleanup_(json_variant_unrefp) JsonVariant *list = NULL;
+
+                deduped = strv_copy(hashed_password);
+                if (!deduped)
+                        return -ENOMEM;
+
+                strv_uniq(deduped);
+
+                q = json_variant_new_array_strv(&list, deduped);
+                if (q < 0)
+                        return q;
+
+                q = json_variant_set_field(&section, "hashedPassword", list);
+        } else
+                q = json_variant_filter(&section, STRV_MAKE("hashedPassword"));
+        if (q < 0)
+                return q;
+
+        q = json_variant_set_field(&h->json, "privileged", section);
+        if (q < 0)
+                return q;
+
+        strv_free_and_replace(h->hashed_password, deduped);
+        SET_FLAG(h->mask, USER_RECORD_PRIVILEGED, !json_variant_is_blank_object(section));
+        return 0;
+}
+
+int user_record_set_password(UserRecord *h, char **password, bool prepend) {
+        _cleanup_(json_variant_unrefp) JsonVariant *section = NULL;
+        _cleanup_strv_free_erase_ char **merged = NULL;
+        int q;
+
+        assert(h);
+
+        /* Installs 'password' as list of plaintext passwords of 'h', both in the "secret" section of the
+         * JSON data and in h->password. If 'prepend' is set the passwords stored so far are retained,
+         * placed after the new ones. Duplicates are removed. Returns early, without touching the record,
+         * if the resulting list is the same as the one already in place. */
+
+        /* When replacing, an identical list means there is nothing to do */
+        if (!prepend && strv_equal(h->password, password))
+                return 0;
+
+        merged = strv_copy(password);
+        if (!merged)
+                return -ENOMEM;
+
+        if (prepend) {
+                q = strv_extend_strv(&merged, h->password, true);
+                if (q < 0)
+                        return q;
+        }
+
+        strv_uniq(merged);
+
+        /* When prepending, only the merged list tells us whether anything changes */
+        if (prepend && strv_equal(h->password, merged))
+                return 0;
+
+        section = json_variant_ref(json_variant_by_key(h->json, "secret"));
+
+        if (!strv_isempty(merged)) {
+                _cleanup_(json_variant_unrefp) JsonVariant *list = NULL;
+
+                q = json_variant_new_array_strv(&list, merged);
+                if (q < 0)
+                        return q;
+
+                json_variant_sensitive(list);
+
+                q = json_variant_set_field(&section, "password", list);
+        } else
+                q = json_variant_filter(&section, STRV_MAKE("password"));
+        if (q < 0)
+                return q;
+
+        /* Mark the section as a whole as sensitive, not just the list inside */
+        json_variant_sensitive(section);
+
+        q = json_variant_set_field(&h->json, "secret", section);
+        if (q < 0)
+                return q;
+
+        strv_free_and_replace(h->password, merged);
+
+        SET_FLAG(h->mask, USER_RECORD_SECRET, !json_variant_is_blank_object(section));
+        return 0;
+}
+
+int user_record_set_token_pin(UserRecord *h, char **pin, bool prepend) {
+        _cleanup_(json_variant_unrefp) JsonVariant *section = NULL;
+        _cleanup_strv_free_erase_ char **merged = NULL;
+        int q;
+
+        assert(h);
+
+        /* Installs 'pin' as list of token PINs of 'h', both in the "secret" section of the JSON data and
+         * in h->token_pin. If 'prepend' is set the PINs stored so far are retained, placed after the new
+         * ones. Duplicates are removed. Returns early, without touching the record, if the resulting
+         * list is the same as the one already in place. */
+
+        /* When replacing, an identical list means there is nothing to do */
+        if (!prepend && strv_equal(h->token_pin, pin))
+                return 0;
+
+        merged = strv_copy(pin);
+        if (!merged)
+                return -ENOMEM;
+
+        if (prepend) {
+                q = strv_extend_strv(&merged, h->token_pin, true);
+                if (q < 0)
+                        return q;
+        }
+
+        strv_uniq(merged);
+
+        /* When prepending, only the merged list tells us whether anything changes */
+        if (prepend && strv_equal(h->token_pin, merged))
+                return 0;
+
+        section = json_variant_ref(json_variant_by_key(h->json, "secret"));
+
+        if (!strv_isempty(merged)) {
+                _cleanup_(json_variant_unrefp) JsonVariant *list = NULL;
+
+                q = json_variant_new_array_strv(&list, merged);
+                if (q < 0)
+                        return q;
+
+                json_variant_sensitive(list);
+
+                q = json_variant_set_field(&section, "tokenPin", list);
+        } else
+                q = json_variant_filter(&section, STRV_MAKE("tokenPin"));
+        if (q < 0)
+                return q;
+
+        /* Mark the section as a whole as sensitive, not just the list inside */
+        json_variant_sensitive(section);
+
+        q = json_variant_set_field(&h->json, "secret", section);
+        if (q < 0)
+                return q;
+
+        strv_free_and_replace(h->token_pin, merged);
+
+        SET_FLAG(h->mask, USER_RECORD_SECRET, !json_variant_is_blank_object(section));
+        return 0;
+}
+
+int user_record_set_pkcs11_protected_authentication_path_permitted(UserRecord *h, int b) {
+        _cleanup_(json_variant_unrefp) JsonVariant *section = NULL;
+        bool blank;
+        int q;
+
+        assert(h);
+
+        /* Tri-state setter: b < 0 removes the field from the "secret" section, otherwise it is set to 'b' */
+        section = json_variant_ref(json_variant_by_key(h->json, "secret"));
+        q = b >= 0 ?
+                json_variant_set_field_boolean(&section, "pkcs11ProtectedAuthenticationPathPermitted", b) :
+                json_variant_filter(&section, STRV_MAKE("pkcs11ProtectedAuthenticationPathPermitted"));
+        if (q < 0)
+                return q;
+
+        /* Do not leave an empty "secret" section behind, remove it from the record instead */
+        blank = json_variant_is_blank_object(section);
+        if (!blank) {
+                json_variant_sensitive(section);
+
+                q = json_variant_set_field(&h->json, "secret", section);
+        } else
+                q = json_variant_filter(&h->json, STRV_MAKE("secret"));
+        if (q < 0)
+                return q;
+
+        h->pkcs11_protected_authentication_path_permitted = b;
+        SET_FLAG(h->mask, USER_RECORD_SECRET, !blank);
+        return 0;
+}
+
+int user_record_set_fido2_user_presence_permitted(UserRecord *h, int b) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+        int r;
+
+        assert(h);
+
+        /* Tri-state setter for "fido2UserPresencePermitted" in the "secret" section: b < 0 removes the
+         * field, otherwise it is set to 'b'. A "secret" section left empty is removed from the record. */
+        w = json_variant_ref(json_variant_by_key(h->json, "secret"));
+        r = b < 0 ? json_variant_filter(&w, STRV_MAKE("fido2UserPresencePermitted")) :
+                    json_variant_set_field_boolean(&w, "fido2UserPresencePermitted", b);
+        if (r < 0)
+                return r;
+
+        if (json_variant_is_blank_object(w))
+                r = json_variant_filter(&h->json, STRV_MAKE("secret"));
+        else {
+                json_variant_sensitive(w); /* Same as the other setters that write the "secret" section */
+                r = json_variant_set_field(&h->json, "secret", w);
+        }
+        if (r < 0)
+                return r;
+
+        h->fido2_user_presence_permitted = b;
+        SET_FLAG(h->mask, USER_RECORD_SECRET, !json_variant_is_blank_object(w));
+        return 0;
+}
+
+int user_record_set_fido2_user_verification_permitted(UserRecord *h, int b) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+        int r;
+
+        assert(h);
+
+        /* Tri-state setter for "fido2UserVerificationPermitted" in the "secret" section: b < 0 removes the
+         * field, otherwise it is set to 'b'. A "secret" section left empty is removed from the record. */
+        w = json_variant_ref(json_variant_by_key(h->json, "secret"));
+        r = b < 0 ? json_variant_filter(&w, STRV_MAKE("fido2UserVerificationPermitted")) :
+                    json_variant_set_field_boolean(&w, "fido2UserVerificationPermitted", b);
+        if (r < 0)
+                return r;
+
+        if (json_variant_is_blank_object(w))
+                r = json_variant_filter(&h->json, STRV_MAKE("secret"));
+        else {
+                json_variant_sensitive(w); /* Same as the other setters that write the "secret" section */
+                r = json_variant_set_field(&h->json, "secret", w);
+        }
+        if (r < 0)
+                return r;
+
+        h->fido2_user_verification_permitted = b;
+        SET_FLAG(h->mask, USER_RECORD_SECRET, !json_variant_is_blank_object(w));
+        return 0;
+}
+
+static bool per_machine_entry_empty(JsonVariant *v) {
+        _unused_ JsonVariant *value;
+        const char *key;
+
+        /* An entry counts as empty if it has no fields besides "matchMachineId" and "matchHostname" */
+        JSON_VARIANT_OBJECT_FOREACH(key, value, v)
+                if (!streq(key, "matchMachineId") && !streq(key, "matchHostname"))
+                        return false;
+        return true;
+}
+
+int user_record_set_password_change_now(UserRecord *h, int b) {
+        _cleanup_(json_variant_unrefp) JsonVariant *updated = NULL;
+        JsonVariant *entries;
+        int q;
+
+        assert(h);
+
+        /* Tri-state setter for "passwordChangeNow": b < 0 removes the field from the top level of the
+         * record, otherwise it is set to 'b'. In either case the field is removed from all "perMachine"
+         * entries, and entries left with nothing but match fields are dropped altogether. */
+
+        updated = json_variant_ref(h->json);
+
+        q = b >= 0 ?
+                json_variant_set_field_boolean(&updated, "passwordChangeNow", b) :
+                json_variant_filter(&updated, STRV_MAKE("passwordChangeNow"));
+        if (q < 0)
+                return q;
+
+        entries = json_variant_by_key(updated, "perMachine");
+        if (entries) {
+                _cleanup_(json_variant_unrefp) JsonVariant *kept = NULL;
+                JsonVariant *entry;
+                bool none_left;
+
+                JSON_VARIANT_ARRAY_FOREACH(entry, entries) {
+                        _cleanup_(json_variant_unrefp) JsonVariant *stripped = NULL;
+
+                        if (!json_variant_is_object(entry))
+                                return -EINVAL;
+
+                        stripped = json_variant_ref(entry);
+
+                        q = json_variant_filter(&stripped, STRV_MAKE("passwordChangeNow"));
+                        if (q < 0)
+                                return q;
+
+                        if (!per_machine_entry_empty(stripped)) {
+                                q = json_variant_append_array(&kept, stripped);
+                                if (q < 0)
+                                        return q;
+                        }
+                }
+
+                none_left = json_variant_is_blank_array(kept);
+                q = none_left ?
+                        json_variant_filter(&updated, STRV_MAKE("perMachine")) :
+                        json_variant_set_field(&updated, "perMachine", kept);
+                if (q < 0)
+                        return q;
+
+                SET_FLAG(h->mask, USER_RECORD_PER_MACHINE, !none_left);
+        }
+
+        json_variant_unref(h->json);
+        h->json = TAKE_PTR(updated);
+        h->password_change_now = b;
+        return 0;
+}
+
+int user_record_merge_secret(UserRecord *h, UserRecord *secret) {
+        int r;
+
+        assert(h);
+        assert(secret);
+
+        /* Merges the secrets from 'secret' into 'h': passwords and token PINs are prepended to the ones
+         * 'h' already carries, and each of the three tri-state permission flags is copied over only if
+         * it is set (i.e. >= 0) in 'secret'. Returns 0 on success, the first error encountered otherwise. */
+
+        r = user_record_set_password(h, secret->password, true);
+        if (r < 0)
+                return r;
+
+        r = user_record_set_token_pin(h, secret->token_pin, true);
+        if (r < 0)
+                return r;
+
+        if (secret->pkcs11_protected_authentication_path_permitted >= 0) {
+                r = user_record_set_pkcs11_protected_authentication_path_permitted(
+                                h, secret->pkcs11_protected_authentication_path_permitted);
+                if (r < 0)
+                        return r;
+        }
+
+        if (secret->fido2_user_presence_permitted >= 0) {
+                r = user_record_set_fido2_user_presence_permitted(
+                                h, secret->fido2_user_presence_permitted);
+                if (r < 0)
+                        return r;
+        }
+
+        if (secret->fido2_user_verification_permitted >= 0) {
+                r = user_record_set_fido2_user_verification_permitted(
+                                h, secret->fido2_user_verification_permitted);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int user_record_good_authentication(UserRecord *h) {
+        _cleanup_(json_variant_unrefp) JsonVariant *root = NULL, *status = NULL, *machine = NULL;
+        uint64_t count, ts;
+        sd_id128_t mid;
+        int q;
+
+        assert(h);
+
+        /* Records a successful authentication: bumps "goodAuthenticationCounter" and refreshes
+         * "lastGoodAuthenticationUSec" in the "status" object keyed by the local machine ID, both in the
+         * JSON data and in the parsed fields of 'h'. */
+
+        if (h->good_authentication_counter == UINT64_MAX)
+                count = 1; /* NOTE(review): UINT64_MAX presumably means "unset" - confirm */
+        else if (h->good_authentication_counter == UINT64_MAX-1)
+                count = h->good_authentication_counter; /* saturate */
+        else
+                count = h->good_authentication_counter + 1;
+
+        ts = now(CLOCK_REALTIME);
+
+        q = sd_id128_get_machine(&mid);
+        if (q < 0)
+                return q;
+
+        /* Walk down from the record via "status" to the object of our machine ... */
+        root = json_variant_ref(h->json);
+        status = json_variant_ref(json_variant_by_key(root, "status"));
+        machine = json_variant_ref(json_variant_by_key(status, SD_ID128_TO_STRING(mid)));
+
+        q = json_variant_set_field_unsigned(&machine, "goodAuthenticationCounter", count);
+        if (q < 0)
+                return q;
+
+        q = json_variant_set_field_unsigned(&machine, "lastGoodAuthenticationUSec", ts);
+        if (q < 0)
+                return q;
+
+        /* ... and put the updated objects back in place on the way up */
+        q = json_variant_set_field(&status, SD_ID128_TO_STRING(mid), machine);
+        if (q < 0)
+                return q;
+
+        q = json_variant_set_field(&root, "status", status);
+        if (q < 0)
+                return q;
+
+        json_variant_unref(h->json);
+        h->json = TAKE_PTR(root);
+
+        h->good_authentication_counter = count;
+        h->last_good_authentication_usec = ts;
+        h->mask |= USER_RECORD_STATUS;
+        return 0;
+}
+
+int user_record_bad_authentication(UserRecord *h) {
+        _cleanup_(json_variant_unrefp) JsonVariant *root = NULL, *status = NULL, *machine = NULL;
+        uint64_t count, ts;
+        sd_id128_t mid;
+        int q;
+
+        assert(h);
+
+        /* Records a failed authentication: bumps "badAuthenticationCounter" and refreshes
+         * "lastBadAuthenticationUSec" in the "status" object keyed by the local machine ID, both in the
+         * JSON data and in the parsed fields of 'h'. */
+
+        if (h->bad_authentication_counter == UINT64_MAX)
+                count = 1; /* NOTE(review): UINT64_MAX presumably means "unset" - confirm */
+        else if (h->bad_authentication_counter == UINT64_MAX-1)
+                count = h->bad_authentication_counter; /* saturate */
+        else
+                count = h->bad_authentication_counter + 1;
+
+        ts = now(CLOCK_REALTIME);
+
+        q = sd_id128_get_machine(&mid);
+        if (q < 0)
+                return q;
+
+        /* Walk down from the record via "status" to the object of our machine ... */
+        root = json_variant_ref(h->json);
+        status = json_variant_ref(json_variant_by_key(root, "status"));
+        machine = json_variant_ref(json_variant_by_key(status, SD_ID128_TO_STRING(mid)));
+
+        q = json_variant_set_field_unsigned(&machine, "badAuthenticationCounter", count);
+        if (q < 0)
+                return q;
+
+        q = json_variant_set_field_unsigned(&machine, "lastBadAuthenticationUSec", ts);
+        if (q < 0)
+                return q;
+
+        /* ... and put the updated objects back in place on the way up */
+        q = json_variant_set_field(&status, SD_ID128_TO_STRING(mid), machine);
+        if (q < 0)
+                return q;
+
+        q = json_variant_set_field(&root, "status", status);
+        if (q < 0)
+                return q;
+
+        json_variant_unref(h->json);
+        h->json = TAKE_PTR(root);
+
+        h->bad_authentication_counter = count;
+        h->last_bad_authentication_usec = ts;
+        h->mask |= USER_RECORD_STATUS;
+        return 0;
+}
+
+int user_record_ratelimit(UserRecord *h) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL, *z = NULL;
+        usec_t usec, new_ratelimit_begin_usec;
+        uint64_t new_ratelimit_count; /* A plain counter, not a timestamp */
+        sd_id128_t mid;
+        int r;
+
+        assert(h);
+
+        /* Accounts for one more attempt in the rate limit state kept in the local machine's "status" object.
+         * Returns 1 if the attempt is within the limit, 0 if the limit is hit (nothing is changed then). */
+        usec = now(CLOCK_REALTIME);
+        if (h->ratelimit_begin_usec != UINT64_MAX && h->ratelimit_begin_usec > usec) {
+                /* Hmm, start-time is after the current time? If so, the RTC most likely doesn't work. */
+                new_ratelimit_begin_usec = usec;
+                new_ratelimit_count = 1;
+                log_debug("Rate limit timestamp is in the future, assuming incorrect system clock, resetting limit.");
+        } else if (h->ratelimit_begin_usec == UINT64_MAX ||
+                 usec_add(h->ratelimit_begin_usec, user_record_ratelimit_interval_usec(h)) <= usec) {
+                /* Fresh start */
+                new_ratelimit_begin_usec = usec;
+                new_ratelimit_count = 1;
+        } else if (h->ratelimit_count < user_record_ratelimit_burst(h)) {
+                /* Count up */
+                new_ratelimit_begin_usec = h->ratelimit_begin_usec;
+                new_ratelimit_count = h->ratelimit_count + 1;
+        } else
+                return 0; /* Limit hit */
+
+        r = sd_id128_get_machine(&mid);
+        if (r < 0)
+                return r;
+
+        v = json_variant_ref(h->json);
+        w = json_variant_ref(json_variant_by_key(v, "status"));
+        z = json_variant_ref(json_variant_by_key(w, SD_ID128_TO_STRING(mid)));
+
+        r = json_variant_set_field_unsigned(&z, "rateLimitBeginUSec", new_ratelimit_begin_usec);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field_unsigned(&z, "rateLimitCount", new_ratelimit_count);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&w, SD_ID128_TO_STRING(mid), z);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&v, "status", w);
+        if (r < 0)
+                return r;
+
+        json_variant_unref(h->json);
+        h->json = TAKE_PTR(v);
+
+        h->ratelimit_begin_usec = new_ratelimit_begin_usec;
+        h->ratelimit_count = new_ratelimit_count;
+        h->mask |= USER_RECORD_STATUS;
+        return 1;
+}
+
+int user_record_is_supported(UserRecord *hr, sd_bus_error *error) {
+        const char *why;
+
+        assert(hr);
+        /* Returns 0 if the record can be managed here, otherwise the result of setting 'error' */
+        if (hr->disposition >= 0 && hr->disposition != USER_REGULAR)
+                why = "Cannot manage anything but regular users.";
+        else if (hr->storage >= 0 && !IN_SET(hr->storage, USER_LUKS, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT, USER_CIFS))
+                why = "User record has storage type this service cannot manage.";
+        else if (gid_is_valid(hr->gid) && hr->uid != (uid_t) hr->gid)
+                why = "User record has to have matching UID/GID fields.";
+        else if (hr->service && !streq(hr->service, "io.systemd.Home"))
+                why = "Not accepted with service not matching io.systemd.Home.";
+        else
+                return 0;
+        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, why);
+}
+
+bool user_record_shall_rebalance(UserRecord *h) {
+        const char *image;
+
+        assert(h);
+
+        /* Rebalancing applies only if it is not turned off for this user and the home uses LUKS storage */
+        if (user_record_rebalance_weight(h) == REBALANCE_WEIGHT_OFF || user_record_storage(h) != USER_LUKS)
+                return false;
+
+        /* The home root is the only pool we rebalance in, hence the image has to be located below it */
+        image = user_record_image_path(h);
+
+        return !!path_startswith(image, get_home_root());
+}
+
+int user_record_set_rebalance_weight(UserRecord *h, uint64_t weight) {
+        _cleanup_(json_variant_unrefp) JsonVariant *new_per_machine_array = NULL, *machine_id_variant = NULL,
+                *machine_id_array = NULL, *per_machine_entry = NULL;
+        _cleanup_free_ JsonVariant **array = NULL;
+        size_t idx = SIZE_MAX, n;
+        JsonVariant *per_machine;
+        sd_id128_t mid;
+        int r;
+
+        assert(h);
+        /* Stores 'weight' as "rebalanceWeight" in the "perMachine" entry that matches the local machine ID exactly */
+        if (!h->json)
+                return -EUNATCH;
+        /* Prepare the machine ID both as single variant and as one-element array, to compare against below */
+        r = sd_id128_get_machine(&mid);
+        if (r < 0)
+                return r;
+
+        r = json_variant_new_id128(&machine_id_variant, mid);
+        if (r < 0)
+                return r;
+
+        r = json_variant_new_array(&machine_id_array, (JsonVariant*[]) { machine_id_variant }, 1);
+        if (r < 0)
+                return r;
+
+        per_machine = json_variant_by_key(h->json, "perMachine");
+        if (per_machine) {
+                if (!json_variant_is_array(per_machine))
+                        return -EINVAL;
+
+                n = json_variant_elements(per_machine);
+                /* Reserve one slot more than there are entries, in case we need to append a new one */
+                array = new(JsonVariant*, n + 1);
+                if (!array)
+                        return -ENOMEM;
+
+                for (size_t i = 0; i < n; i++) {
+                        JsonVariant *m;
+
+                        array[i] = json_variant_by_index(per_machine, i);
+
+                        if (!json_variant_is_object(array[i]))
+                                return -EINVAL;
+
+                        m = json_variant_by_key(array[i], "matchMachineId");
+                        if (!m) {
+                                /* No machineId field? Let's ignore this, but invalidate what we found so far */
+                                idx = SIZE_MAX;
+                                continue;
+                        }
+
+                        if (json_variant_equal(m, machine_id_variant) ||
+                            json_variant_equal(m, machine_id_array)) {
+                                /* Matches exactly what we are looking for. Let's use this */
+                                idx = i;
+                                continue;
+                        }
+
+                        r = per_machine_id_match(m, JSON_PERMISSIVE);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                /* Also matches what we are looking for, but with a broader match. In this
+                                 * case let's ignore this entry, and add a new specific one to the end. */
+                                idx = SIZE_MAX;
+                }
+
+                if (idx == SIZE_MAX)
+                        idx = n++; /* Nothing suitable found, place new entry at end */
+                else
+                        per_machine_entry = json_variant_ref(array[idx]);
+
+        } else {
+                array = new(JsonVariant*, 1);
+                if (!array)
+                        return -ENOMEM;
+
+                idx = 0;
+                n = 1;
+        }
+        /* No existing entry to reuse? Then begin a new one, matching the local machine ID only */
+        if (!per_machine_entry) {
+                r = json_variant_set_field(&per_machine_entry, "matchMachineId", machine_id_array);
+                if (r < 0)
+                        return r;
+        }
+
+        if (weight == REBALANCE_WEIGHT_UNSET)
+                r = json_variant_set_field(&per_machine_entry, "rebalanceWeight", NULL); /* set explicitly to NULL (so that the perMachine setting we are setting here can override the global setting) */
+        else
+                r = json_variant_set_field_unsigned(&per_machine_entry, "rebalanceWeight", weight);
+        if (r < 0)
+                return r;
+        /* 'array' is only freed on cleanup (no reference is taken when filling it), the entry is unref'ed separately */
+        assert(idx < n);
+        array[idx] = per_machine_entry;
+
+        r = json_variant_new_array(&new_per_machine_array, array, n);
+        if (r < 0)
+                return r;
+
+        r = json_variant_set_field(&h->json, "perMachine", new_per_machine_array);
+        if (r < 0)
+                return r;
+        /* The JSON data is updated, now bring the parsed fields in sync */
+        h->rebalance_weight = weight;
+        h->mask |= USER_RECORD_PER_MACHINE;
+        return 0;
+}
diff --git a/src/home/user-record-util.h b/src/home/user-record-util.h
new file mode 100644
index 0000000..508e2bd
--- /dev/null
+++ b/src/home/user-record-util.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "user-record.h"
+#include "group-record.h"
+
+int user_record_synthesize(UserRecord *h, const char *user_name, const char *realm, const char *image_path, UserStorage storage, uid_t uid, gid_t gid);
+int group_record_synthesize(GroupRecord *g, UserRecord *u);
+
+typedef enum UserReconcileMode { /* passed as 'mode' to user_record_reconcile() */
+        USER_RECONCILE_ANY,                    /* NOTE(review): presumably no version ordering is required — confirm */
+        USER_RECONCILE_REQUIRE_NEWER,          /* host version must be newer than embedded version */
+        USER_RECONCILE_REQUIRE_NEWER_OR_EQUAL, /* similar, but may also be equal */
+        _USER_RECONCILE_MODE_MAX,              /* one past the last valid mode */
+        _USER_RECONCILE_MODE_INVALID = -EINVAL, /* negative sentinel, distinct from every valid mode */
+} UserReconcileMode;
+
+enum { /* return values, all non-negative; NOTE(review): going by the prefix these belong to user_record_reconcile() — confirm */
+        USER_RECONCILE_HOST_WON,
+        USER_RECONCILE_EMBEDDED_WON,
+        USER_RECONCILE_IDENTICAL,
+};
+
+int user_record_reconcile(UserRecord *host, UserRecord *embedded, UserReconcileMode mode, UserRecord **ret);
+int user_record_add_binding(UserRecord *h, UserStorage storage, const char *image_path, sd_id128_t partition_uuid, sd_id128_t luks_uuid, sd_id128_t fs_uuid, const char *luks_cipher, const char *luks_cipher_mode, uint64_t luks_volume_key_size, const char *file_system_type, const char *home_directory, uid_t uid, gid_t gid);
+
+/* Results of user_record_test_home_directory() and user_record_test_image_path() declared below. */
+enum {
+        USER_TEST_UNDEFINED, /* Returned by user_record_test_image_path() if the storage type knows no image paths */
+        USER_TEST_ABSENT,    /* NOTE(review): presumably the tested object does not exist — confirm */
+        USER_TEST_EXISTS,    /* NOTE(review): presumably the tested object exists — confirm */
+        USER_TEST_DIRTY,     /* Only applies to user_record_test_image_path(), when the image exists but is marked dirty */
+        USER_TEST_MOUNTED,   /* Only applies to user_record_test_home_directory(), when the home directory exists. */
+        USER_TEST_MAYBE,     /* Only applies to LUKS devices: block device exists, but we don't know if it's the right one */
+};
+
+int user_record_test_home_directory(UserRecord *h);
+int user_record_test_home_directory_and_warn(UserRecord *h);
+int user_record_test_image_path(UserRecord *h);
+int user_record_test_image_path_and_warn(UserRecord *h);
+
+int user_record_test_password(UserRecord *h, UserRecord *secret);
+int user_record_test_recovery_key(UserRecord *h, UserRecord *secret);
+
+int user_record_update_last_changed(UserRecord *h, bool with_password);
+int user_record_set_disk_size(UserRecord *h, uint64_t disk_size);
+int user_record_set_password(UserRecord *h, char **password, bool prepend);
+int user_record_make_hashed_password(UserRecord *h, char **password, bool extend);
+int user_record_set_hashed_password(UserRecord *h, char **hashed_password);
+int user_record_set_token_pin(UserRecord *h, char **pin, bool prepend);
+int user_record_set_pkcs11_protected_authentication_path_permitted(UserRecord *h, int b);
+int user_record_set_fido2_user_presence_permitted(UserRecord *h, int b);
+int user_record_set_fido2_user_verification_permitted(UserRecord *h, int b);
+int user_record_set_password_change_now(UserRecord *h, int b);
+int user_record_merge_secret(UserRecord *h, UserRecord *secret);
+int user_record_good_authentication(UserRecord *h);
+int user_record_bad_authentication(UserRecord *h);
+int user_record_ratelimit(UserRecord *h);
+
+int user_record_is_supported(UserRecord *hr, sd_bus_error *error);
+
+bool user_record_shall_rebalance(UserRecord *h);
+int user_record_set_rebalance_weight(UserRecord *h, uint64_t weight); /* stores "rebalanceWeight" in a "perMachine" entry of h->json (JSON null for REBALANCE_WEIGHT_UNSET) and updates h->rebalance_weight; returns 0 or a negative error */
diff --git a/src/hostname/hostnamectl.c b/src/hostname/hostnamectl.c
new file mode 100644
index 0000000..14fc160
--- /dev/null
+++ b/src/hostname/hostnamectl.c
@@ -0,0 +1,740 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-bus.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "build.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-map-properties.h"
+#include "format-table.h"
+#include "hostname-setup.h"
+#include "hostname-util.h"
+#include "json.h"
+#include "main-func.h"
+#include "parse-argument.h"
+#include "pretty-print.h"
+#include "spawn-polkit-agent.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+static bool arg_ask_password = true;                            /* cleared by --no-ask-password */
+static BusTransport arg_transport = BUS_TRANSPORT_LOCAL;        /* switched to remote by -H/--host, to machine by -M/--machine */
+static char *arg_host = NULL;                                   /* argument of -H/-M (assigned from optarg); NULL if neither was given */
+static bool arg_transient = false;                              /* --transient */
+static bool arg_pretty = false;                                 /* --pretty */
+static bool arg_static = false;                                 /* --static */
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; /* set by --json=; JSON_FORMAT_OFF selects the human-readable output */
+
+typedef struct StatusInfo { /* Properties fetched by show_all_names() and rendered by print_status_info() */
+        const char *hostname;         /* "Hostname"; shown as transient hostname */
+        const char *static_hostname;  /* "StaticHostname" */
+        const char *pretty_hostname;  /* "PrettyHostname" */
+        const char *icon_name;        /* "IconName" */
+        const char *chassis;          /* "Chassis" */
+        const char *deployment;       /* "Deployment" */
+        const char *location;         /* "Location" */
+        const char *kernel_name;      /* "KernelName" */
+        const char *kernel_release;   /* "KernelRelease" */
+        const char *os_pretty_name;   /* "OperatingSystemPrettyName" */
+        const char *os_cpe_name;      /* "OperatingSystemCPEName" */
+        usec_t os_support_end;        /* "OperatingSystemSupportEnd"; USEC_INFINITY is treated as unset when printing */
+        const char *virtualization;   /* "Virtualization", queried from org.freedesktop.systemd1 */
+        const char *architecture;     /* "Architecture", queried from org.freedesktop.systemd1 */
+        const char *home_url;         /* "HomeURL"; passed via TABLE_SET_URL together with the operating system name */
+        const char *hardware_vendor;  /* "HardwareVendor" */
+        const char *hardware_model;   /* "HardwareModel" */
+        const char *firmware_version; /* "FirmwareVersion" */
+        usec_t firmware_date;         /* "FirmwareDate"; shown only if timestamp_is_set() */
+        sd_id128_t machine_id;        /* "MachineID"; a null ID is not shown */
+        sd_id128_t boot_id;           /* "BootID"; a null ID is not shown */
+} StatusInfo;
+
+/* Returns a decorative UTF-8 glyph for the given chassis type, or NULL if the type is NULL or unknown. */
+static const char* chassis_string_to_glyph(const char *chassis) {
+        static const struct { const char *name, *glyph; } table[] = {
+                { "laptop",    u8"💻" }, /* Personal Computer */
+                { "desktop",   u8"🖥️" }, /* Desktop Computer */
+                { "server",    u8"🖳" }, /* Old Personal Computer */
+                { "tablet",    u8"具" }, /* Ideograph tool, implement; draw up, write, looks vaguely tabletty */
+                { "watch",     u8"⌚" }, /* Watch */
+                { "handset",   u8"🕻" }, /* Left Hand Telephone Receiver */
+                { "vm",        u8"🖴" }, /* Hard disk */
+                { "container", u8"☐" }, /* Ballot Box */
+                {}
+        };
+
+        for (size_t k = 0; table[k].name; k++)
+                if (streq_ptr(chassis, table[k].name))
+                        return table[k].glyph;
+        return NULL;
+}
+
+/* Picks the highlight color for the OS support row, given the current time 'n' and the end of support 'eol'. */
+static const char *os_support_end_color(usec_t n, usec_t eol) {
+        usec_t remaining;
+
+        /* Support already over: red. Less than a month remaining: yellow. More than a year remaining:
+         * green. Anything in between: NULL, i.e. no special color. */
+
+        if (eol <= n)
+                return ANSI_HIGHLIGHT_RED;
+
+        remaining = eol - n;
+
+        return remaining < USEC_PER_MONTH ? ANSI_HIGHLIGHT_YELLOW :
+               remaining > USEC_PER_YEAR ? ANSI_HIGHLIGHT_GREEN :
+               NULL;
+}
+
+static int print_status_info(StatusInfo *i) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        TableCell *cell;
+        int r;
+
+        assert(i);
+        /* Builds a table with one TABLE_FIELD row per property and prints it. Apart from the static hostname, rows with empty or unset values are skipped. */
+        table = table_new_vertical();
+        if (!table)
+                return log_oom();
+
+        assert_se(cell = table_get_cell(table, 0, 0));
+        (void) table_set_ellipsize_percent(table, cell, 100); /* best effort, result deliberately ignored */
+
+        table_set_ersatz_string(table, TABLE_ERSATZ_UNSET);
+
+        r = table_add_many(table,
+                           TABLE_FIELD, "Static hostname",
+                           TABLE_STRING, i->static_hostname);
+        if (r < 0)
+                return table_log_add_error(r);
+
+        if (!isempty(i->pretty_hostname) &&
+            !streq_ptr(i->pretty_hostname, i->static_hostname)) { /* skip if it merely repeats the static hostname */
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Pretty hostname",
+                                   TABLE_STRING, i->pretty_hostname);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->hostname) &&
+            !streq_ptr(i->hostname, i->static_hostname)) { /* skip if it merely repeats the static hostname */
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Transient hostname",
+                                   TABLE_STRING, i->hostname);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->icon_name)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Icon name",
+                                   TABLE_STRING, i->icon_name);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->chassis)) {
+                /* Possibly add a pretty symbol. Let's not bother with non-unicode fallbacks, because this is
+                 * just a prettification and we can't really express this with ASCII anyway. */
+                const char *v = chassis_string_to_glyph(i->chassis);
+                if (v)
+                        v = strjoina(i->chassis, " ", v); /* "<chassis> <glyph>" */
+
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Chassis",
+                                   TABLE_STRING, v ?: i->chassis);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->deployment)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Deployment",
+                                   TABLE_STRING, i->deployment);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->location)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Location",
+                                   TABLE_STRING, i->location);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!sd_id128_is_null(i->machine_id)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Machine ID",
+                                   TABLE_ID128, i->machine_id);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!sd_id128_is_null(i->boot_id)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Boot ID",
+                                   TABLE_ID128, i->boot_id);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->virtualization)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Virtualization",
+                                   TABLE_STRING, i->virtualization);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->os_pretty_name)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Operating System",
+                                   TABLE_STRING, i->os_pretty_name,
+                                   TABLE_SET_URL, i->home_url);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->os_cpe_name)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "CPE OS Name",
+                                   TABLE_STRING, i->os_cpe_name);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (i->os_support_end != USEC_INFINITY) { /* USEC_INFINITY means no end of support is known */
+                usec_t n = now(CLOCK_REALTIME);
+                /* Two rows: the end date itself, plus the distance to it (time remaining if it lies in the future, time since expiry otherwise). */
+                r = table_add_many(table,
+                                   TABLE_FIELD, "OS Support End",
+                                   TABLE_TIMESTAMP_DATE, i->os_support_end,
+                                   TABLE_FIELD, n < i->os_support_end ? "OS Support Remaining" : "OS Support Expired",
+                                   TABLE_TIMESPAN_DAY, n < i->os_support_end ? i->os_support_end - n : n - i->os_support_end,
+                                   TABLE_SET_COLOR, os_support_end_color(n, i->os_support_end));
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->kernel_name) && !isempty(i->kernel_release)) { /* only shown if both parts are known */
+                const char *v;
+
+                v = strjoina(i->kernel_name, " ", i->kernel_release);
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Kernel",
+                                   TABLE_STRING, v);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->architecture)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Architecture",
+                                   TABLE_STRING, i->architecture);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->hardware_vendor)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Hardware Vendor",
+                                   TABLE_STRING, i->hardware_vendor);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->hardware_model)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Hardware Model",
+                                   TABLE_STRING, i->hardware_model);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->firmware_version)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Firmware Version",
+                                   TABLE_STRING, i->firmware_version);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (timestamp_is_set(i->firmware_date)) {
+                usec_t n = now(CLOCK_REALTIME);
+
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Firmware Date",
+                                   TABLE_TIMESTAMP_DATE, i->firmware_date);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                if (i->firmware_date < n) { /* an age is only shown for dates in the past; yellow once older than two years */
+                        r = table_add_many(table,
+                                           TABLE_FIELD, "Firmware Age",
+                                           TABLE_TIMESPAN_DAY, n - i->firmware_date,
+                                           TABLE_SET_COLOR, n - i->firmware_date > USEC_PER_YEAR*2 ? ANSI_HIGHLIGHT_YELLOW : NULL);
+                        if (r < 0)
+                                return table_log_add_error(r);
+                }
+        }
+
+        r = table_print(table, NULL);
+        if (r < 0)
+                return table_log_print_error(r);
+
+        return 0;
+}
+
+/* Reads the string property 'attr' from hostnamed. Prints it if 'ret' is NULL, else returns a copy in *ret. */
+static int get_one_name(sd_bus *bus, const char* attr, char **ret) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        const char *value;
+        char *copy;
+        int r;
+
+        assert(bus);
+        assert(attr);
+
+        r = bus_get_property(bus, bus_hostname, attr, &error, &reply, "s");
+        if (r < 0)
+                return log_error_errno(r, "Could not get property: %s", bus_error_message(&error, r));
+
+        r = sd_bus_message_read(reply, "s", &value);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        if (!ret) {
+                /* No output parameter given: the caller wants the value on stdout. */
+                printf("%s\n", value);
+                return 0;
+        }
+
+        /* 'reply' is released when we return, hence hand out a duplicate the caller owns. */
+        copy = strdup(value);
+        if (!copy)
+                return log_oom();
+        *ret = copy;
+        return 0;
+}
+
+/* Queries all properties shown by "hostnamectl status" from hostnamed and the service manager, then prints them. */
+static int show_all_names(sd_bus *bus) {
+        StatusInfo info = {
+                /* An older hostnamed may not send OperatingSystemSupportEnd at all, in which case this
+                 * field keeps its initial value. print_status_info() only treats USEC_INFINITY as "not
+                 * set", hence don't start out with zero, which would be shown as a bogus expired date. */
+                .os_support_end = USEC_INFINITY,
+        };
+
+        static const struct bus_properties_map hostname_map[]  = {
+                { "Hostname",                  "s",  NULL,          offsetof(StatusInfo, hostname)         },
+                { "StaticHostname",            "s",  NULL,          offsetof(StatusInfo, static_hostname)  },
+                { "PrettyHostname",            "s",  NULL,          offsetof(StatusInfo, pretty_hostname)  },
+                { "IconName",                  "s",  NULL,          offsetof(StatusInfo, icon_name)        },
+                { "Chassis",                   "s",  NULL,          offsetof(StatusInfo, chassis)          },
+                { "Deployment",                "s",  NULL,          offsetof(StatusInfo, deployment)       },
+                { "Location",                  "s",  NULL,          offsetof(StatusInfo, location)         },
+                { "KernelName",                "s",  NULL,          offsetof(StatusInfo, kernel_name)      },
+                { "KernelRelease",             "s",  NULL,          offsetof(StatusInfo, kernel_release)   },
+                { "OperatingSystemPrettyName", "s",  NULL,          offsetof(StatusInfo, os_pretty_name)   },
+                { "OperatingSystemCPEName",    "s",  NULL,          offsetof(StatusInfo, os_cpe_name)      },
+                { "OperatingSystemSupportEnd", "t",  NULL,          offsetof(StatusInfo, os_support_end)   },
+                { "HomeURL",                   "s",  NULL,          offsetof(StatusInfo, home_url)         },
+                { "HardwareVendor",            "s",  NULL,          offsetof(StatusInfo, hardware_vendor)  },
+                { "HardwareModel",             "s",  NULL,          offsetof(StatusInfo, hardware_model)   },
+                { "FirmwareVersion",           "s",  NULL,          offsetof(StatusInfo, firmware_version) },
+                { "FirmwareDate",              "t",  NULL,          offsetof(StatusInfo, firmware_date)    },
+                { "MachineID",                 "ay", bus_map_id128, offsetof(StatusInfo, machine_id)       },
+                { "BootID",                    "ay", bus_map_id128, offsetof(StatusInfo, boot_id)          },
+                {}
+        }, manager_map[] = {
+                { "Virtualization",            "s",  NULL,          offsetof(StatusInfo, virtualization)   },
+                { "Architecture",              "s",  NULL,          offsetof(StatusInfo, architecture)     },
+                {}
+        };
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *host_message = NULL, *manager_message = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        int r;
+
+        r = bus_map_all_properties(bus,
+                                   "org.freedesktop.hostname1",
+                                   "/org/freedesktop/hostname1",
+                                   hostname_map, 0, &error, &host_message, &info);
+        if (r < 0)
+                return log_error_errno(r, "Failed to query system properties: %s", bus_error_message(&error, r));
+
+        r = bus_map_all_properties(bus,
+                                   "org.freedesktop.systemd1",
+                                   "/org/freedesktop/systemd1",
+                                   manager_map, 0, &error, &manager_message, &info);
+        if (r < 0)
+                return log_error_errno(r, "Failed to query system properties: %s", bus_error_message(&error, r));
+
+        /* For older version of hostnamed: if it did not report the machine or boot ID, fall back to
+         * querying the IDs directly. This is skipped when a host or container was specified via
+         * -H/-M, and failures are ignored, in which case the respective row is simply not shown. */
+        if (!arg_host) {
+                if (sd_id128_is_null(info.machine_id))
+                        (void) sd_id128_get_machine(&info.machine_id);
+                if (sd_id128_is_null(info.boot_id))
+                        (void) sd_id128_get_boot(&info.boot_id);
+        }
+
+        return print_status_info(&info);
+}
+
+/* Prints the one hostname selected by --pretty or --static; the "Hostname" property if neither is given. */
+static int get_hostname_based_on_flag(sd_bus *bus) {
+        if (arg_static + arg_pretty + arg_transient > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot query more than one name type at a time");
+
+        if (arg_pretty)
+                return get_one_name(bus, "PrettyHostname", NULL);
+        if (arg_static)
+                return get_one_name(bus, "StaticHostname", NULL);
+        return get_one_name(bus, "Hostname", NULL);
+}
+
+/* Verb "status": in JSON mode dump what hostnamed's Describe() method returns, otherwise print names. */
+static int show_status(int argc, char **argv, void *userdata) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        const char *description = NULL;
+        sd_bus *bus = userdata;
+        int r;
+
+        if (arg_json_format_flags == JSON_FORMAT_OFF) {
+                /* Human-readable output: a single name if a type was selected, else the full table. */
+                if (arg_pretty || arg_static || arg_transient)
+                        return get_hostname_based_on_flag(bus);
+                return show_all_names(bus);
+        }
+
+        r = bus_call_method(bus, bus_hostname, "Describe", &error, &reply, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Could not get description: %s", bus_error_message(&error, r));
+
+        r = sd_bus_message_read(reply, "s", &description);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        r = json_parse(description, 0, &v, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse JSON: %m");
+
+        json_variant_dump(v, arg_json_format_flags, NULL, NULL);
+        return 0;
+}
+
+
+/* Invokes the hostnamed setter 'method' with 'value' and logs a failure, naming 'target' in the message. If
+ * 'error' is NULL a local error object is used, otherwise the caller can inspect the bus error afterwards. */
+static int set_simple_string_internal(sd_bus *bus, sd_bus_error *error, const char *target, const char *method, const char *value) {
+        _cleanup_(sd_bus_error_free) sd_bus_error local_error = SD_BUS_ERROR_NULL;
+        sd_bus_error *e = error ?: &local_error;
+        int r;
+
+        polkit_agent_open_if_enabled(arg_transport, arg_ask_password);
+
+        r = bus_call_method(bus, bus_hostname, method, e, NULL, "sb", value, arg_ask_password);
+        if (r >= 0)
+                return 0;
+
+        return log_error_errno(r, "Could not set %s: %s", target, bus_error_message(e, r));
+}
+
+static int set_simple_string(sd_bus *bus, const char *target, const char *method, const char *value) {
+        return set_simple_string_internal(bus, NULL, target, method, value); /* NULL: this caller does not inspect the bus error */
+}
+
+static int set_hostname(int argc, char **argv, void *userdata) {
+        _cleanup_free_ char *h = NULL;
+        const char *hostname = argv[1];
+        sd_bus *bus = userdata;
+        bool implicit = false, show_hint = false;
+        int r, ret = 0;
+        /* Sets the pretty, static and/or transient hostname to argv[1], as selected by --pretty/--static/--transient. */
+        if (!arg_pretty && !arg_static && !arg_transient)
+                arg_pretty = arg_static = arg_transient = implicit = true; /* no flag given: set all three */
+
+        if (!implicit && !arg_static && arg_transient) {
+                _cleanup_free_ char *source = NULL;
+
+                r = get_one_name(bus, "HostnameSource", &source);
+                if (r < 0)
+                        return r;
+
+                if (hostname_source_from_string(source) == HOSTNAME_STATIC)
+                        log_info("Hint: static hostname is already set, so the specified transient hostname will not be used.");
+        }
+
+        if (arg_pretty) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                const char *p;
+
+                /* If the passed hostname is already valid, then assume the user doesn't know anything about pretty
+                 * hostnames, so let's unset the pretty hostname, and just set the passed hostname as static/dynamic
+                 * hostname. */
+                if (implicit && hostname_is_valid(hostname, VALID_HOSTNAME_TRAILING_DOT))
+                        p = ""; /* No pretty hostname (as it is redundant), just a static one */
+                else
+                        p = hostname; /* Use the passed name as pretty hostname */
+
+                r = set_simple_string_internal(bus, &error, "pretty hostname", "SetPrettyHostname", p);
+                if (r < 0) {
+                        if (implicit &&
+                            sd_bus_error_has_names(&error,
+                                                   BUS_ERROR_FILE_IS_PROTECTED,
+                                                   BUS_ERROR_READ_ONLY_FILESYSTEM)) {
+                                show_hint = true;
+                                ret = r; /* remember the failure, but still try the remaining name types */
+                        } else
+                                return r;
+                }
+
+                /* Now that we set the pretty hostname, let's clean up the parameter and use that as static
+                 * hostname. If the hostname was already valid as static hostname, this will only chop off the trailing
+                 * dot if there is one. If it was not valid, then it will be made fully valid by truncating, dropping
+                 * multiple dots, and dropping weird chars. Note that we clean the name up only if we also are
+                 * supposed to set the pretty name. If the pretty name is not being set we assume the user knows what
+                 * they are doing and pass the name as-is. */
+                h = strdup(hostname);
+                if (!h)
+                        return log_oom();
+
+                hostname = hostname_cleanup(h); /* Use the cleaned up name as static hostname */
+        }
+
+        if (arg_static) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                r = set_simple_string_internal(bus, &error, "static hostname", "SetStaticHostname", hostname);
+                if (r < 0) {
+                        if (implicit &&
+                            sd_bus_error_has_names(&error,
+                                                   BUS_ERROR_FILE_IS_PROTECTED,
+                                                   BUS_ERROR_READ_ONLY_FILESYSTEM)) {
+                                show_hint = true;
+                                ret = r; /* remember the failure, but still try the transient hostname */
+                        } else
+                                return r;
+                }
+        }
+
+        if (arg_transient) {
+                r = set_simple_string(bus, "transient hostname", "SetHostname", hostname);
+                if (r < 0)
+                        return r;
+        }
+
+        if (show_hint)
+                log_info("Hint: use --transient option when /etc/machine-info or /etc/hostname cannot be modified (e.g. located in read-only filesystem).");
+
+        return ret; /* 0, or the error of a tolerated pretty/static failure recorded above */
+}
+
+/* Verb handler: without a NAME argument print the selected hostname, otherwise set it. */
+static int get_or_set_hostname(int argc, char **argv, void *userdata) {
+        return argc != 1 ? set_hostname(argc, argv, userdata) : get_hostname_based_on_flag(userdata);
+}
+
+/* Verb handler: without a NAME argument print the "IconName" property, otherwise call SetIconName. */
+static int get_or_set_icon_name(int argc, char **argv, void *userdata) {
+        return argc != 1 ? set_simple_string(userdata, "icon", "SetIconName", argv[1]) : get_one_name(userdata, "IconName", NULL);
+}
+
+/* Verb handler: without a NAME argument print the "Chassis" property, otherwise call SetChassis. */
+static int get_or_set_chassis(int argc, char **argv, void *userdata) {
+        return argc != 1 ? set_simple_string(userdata, "chassis", "SetChassis", argv[1]) : get_one_name(userdata, "Chassis", NULL);
+}
+
+/* Verb handler: without a NAME argument print the "Deployment" property, otherwise call SetDeployment. */
+static int get_or_set_deployment(int argc, char **argv, void *userdata) {
+        return argc != 1 ? set_simple_string(userdata, "deployment", "SetDeployment", argv[1]) : get_one_name(userdata, "Deployment", NULL);
+}
+
+/* Verb handler: without a NAME argument print the "Location" property, otherwise call SetLocation. */
+static int get_or_set_location(int argc, char **argv, void *userdata) {
+        return argc != 1 ? set_simple_string(userdata, "location", "SetLocation", argv[1]) : get_one_name(userdata, "Location", NULL);
+}
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout; 'link' receives the man page reference used in the last line. */
+        r = terminal_urlify_man("hostnamectl", "1", &link);
+        if (r < 0)
+                return log_oom(); /* any failure of terminal_urlify_man() is reported as out-of-memory */
+
+        printf("%s [OPTIONS...] COMMAND ...\n\n"
+               "%sQuery or change system hostname.%s\n"
+               "\nCommands:\n"
+               "  status                 Show current hostname settings\n"
+               "  hostname [NAME]        Get/set system hostname\n"
+               "  icon-name [NAME]       Get/set icon name for host\n"
+               "  chassis [NAME]         Get/set chassis type for host\n"
+               "  deployment [NAME]      Get/set deployment environment for host\n"
+               "  location [NAME]        Get/set location for host\n"
+               "\nOptions:\n"
+               "  -h --help              Show this help\n"
+               "     --version           Show package version\n"
+               "     --no-ask-password   Do not prompt for password\n"
+               "  -H --host=[USER@]HOST  Operate on remote host\n"
+               "  -M --machine=CONTAINER Operate on local container\n"
+               "     --transient         Only set transient hostname\n"
+               "     --static            Only set static hostname\n"
+               "     --pretty            Only set pretty hostname\n"
+               "     --json=pretty|short|off\n"
+               "                         Generate JSON output\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(), /* the two ANSI sequences bracket the one-line summary */
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int verb_help(int argc, char **argv, void *userdata) {
+        return help(); /* handler of the "help" verb; all verb arguments are ignored */
+}
+
+/* Parses the command line into the arg_* variables. Returns 1 once all options were consumed; otherwise
+ * whatever help(), version() or parse_json_argument() returned, or -EINVAL for an unknown option. */
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_ASK_PASSWORD,
+                ARG_TRANSIENT,
+                ARG_STATIC,
+                ARG_PRETTY,
+                ARG_JSON,
+        };
+
+        static const struct option options[] = {
+                { "help",            no_argument,       NULL, 'h'                 },
+                { "version",         no_argument,       NULL, ARG_VERSION         },
+                { "transient",       no_argument,       NULL, ARG_TRANSIENT       },
+                { "static",          no_argument,       NULL, ARG_STATIC          },
+                { "pretty",          no_argument,       NULL, ARG_PRETTY          },
+                { "host",            required_argument, NULL, 'H'                 },
+                { "machine",         required_argument, NULL, 'M'                 },
+                { "no-ask-password", no_argument,       NULL, ARG_NO_ASK_PASSWORD },
+                { "json",            required_argument, NULL, ARG_JSON            },
+                {}
+        };
+
+        int opt, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        for (;;) {
+                opt = getopt_long(argc, argv, "hH:M:", options, NULL);
+                if (opt < 0)
+                        break;
+
+                switch (opt) {
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case 'H':
+                case 'M':
+                        arg_transport = opt == 'H' ? BUS_TRANSPORT_REMOTE : BUS_TRANSPORT_MACHINE;
+                        arg_host = optarg;
+                        break;
+
+                case ARG_TRANSIENT:
+                        arg_transient = true;
+                        break;
+
+                case ARG_PRETTY:
+                        arg_pretty = true;
+                        break;
+
+                case ARG_STATIC:
+                        arg_static = true;
+                        break;
+
+                case ARG_NO_ASK_PASSWORD:
+                        arg_ask_password = false;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        return 1;
+}
+
+/* Looks up the verb given on the command line in the table below and runs its handler through
+ * dispatch_verb(), handing the bus connection along as the final dispatch_verb() argument. */
+static int hostnamectl_main(sd_bus *bus, int argc, char *argv[]) {
+
+        /* The "set-*" spellings are obsolete aliases that share a handler with the matching modern verb. */
+        static const Verb verb_table[] = {
+                { "status",         VERB_ANY, 1,        VERB_DEFAULT, show_status           },
+                { "hostname",       VERB_ANY, 2,        0,            get_or_set_hostname   },
+                { "set-hostname",   2,        2,        0,            get_or_set_hostname   }, /* obsolete */
+                { "icon-name",      VERB_ANY, 2,        0,            get_or_set_icon_name  },
+                { "set-icon-name",  2,        2,        0,            get_or_set_icon_name  }, /* obsolete */
+                { "chassis",        VERB_ANY, 2,        0,            get_or_set_chassis    },
+                { "set-chassis",    2,        2,        0,            get_or_set_chassis    }, /* obsolete */
+                { "deployment",     VERB_ANY, 2,        0,            get_or_set_deployment },
+                { "set-deployment", 2,        2,        0,            get_or_set_deployment }, /* obsolete */
+                { "location",       VERB_ANY, 2,        0,            get_or_set_location   },
+                { "set-location",   2,        2,        0,            get_or_set_location   }, /* obsolete */
+                { "help",           VERB_ANY, VERB_ANY, 0,            verb_help             }, /* Not documented, but supported since it is created. */
+                {}
+        };
+
+        return dispatch_verb(argc, argv, verb_table, bus);
+}
+
+/* Sets up locale and logging, parses the command line, connects to the bus selected by the transport
+ * options and then hands over to hostnamectl_main(). A parse_argv() result <= 0 is returned as-is
+ * without connecting to the bus. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *connection = NULL;
+        int ret;
+
+        setlocale(LC_ALL, "");
+        log_setup();
+
+        ret = parse_argv(argc, argv);
+        if (ret <= 0)
+                return ret;
+
+        ret = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, &connection);
+        if (ret >= 0)
+                return hostnamectl_main(connection, argc, argv);
+
+        return bus_log_connect_error(ret, arg_transport);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/hostname/hostnamed.c b/src/hostname/hostnamed.c
new file mode 100644
index 0000000..fc7a97f
--- /dev/null
+++ b/src/hostname/hostnamed.c
@@ -0,0 +1,1633 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-log-control-api.h"
+#include "bus-polkit.h"
+#include "constants.h"
+#include "env-file-label.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "hostname-setup.h"
+#include "hostname-util.h"
+#include "id128-util.h"
+#include "json.h"
+#include "main-func.h"
+#include "missing_capability.h"
+#include "nscd-flush.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "sd-device.h"
+#include "selinux-util.h"
+#include "service-util.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "user-util.h"
+#include "virt.h"
+
+/* Character set a deployment string is checked against in valid_deployment(). */
+#define VALID_DEPLOYMENT_CHARS (DIGITS LETTERS "-.:")
+
+/* Properties we cache are indexed by an enum, to make invalidation easy and systematic (as we can iterate
+ * through them all, and they are uniformly strings). Each value serves both as an index into
+ * Context.data[] and as a bit position in the mask accepted by context_reset(). */
+typedef enum {
+        /* Read from /etc/hostname */
+        PROP_STATIC_HOSTNAME,
+
+        /* Read from /etc/machine-info */
+        PROP_PRETTY_HOSTNAME,
+        PROP_ICON_NAME,
+        PROP_CHASSIS,
+        PROP_DEPLOYMENT,
+        PROP_LOCATION,
+        PROP_HARDWARE_VENDOR,
+        PROP_HARDWARE_MODEL,
+
+        /* Read from /etc/os-release (or /usr/lib/os-release) */
+        PROP_OS_PRETTY_NAME,
+        PROP_OS_CPE_NAME,
+        PROP_OS_HOME_URL,
+        PROP_OS_SUPPORT_END,
+        /* Number of cached properties; this is the size of Context.data[] */
+        _PROP_MAX,
+        _PROP_INVALID = -EINVAL,
+} HostProperty;
+
+/* Shared state of the functions in this file: the cached property strings plus the information needed
+ * to decide when they have to be re-read from disk. */
+typedef struct Context {
+        /* Cached property values, indexed by HostProperty; released through context_reset() */
+        char *data[_PROP_MAX];
+
+        /* Filled in by context_determine_hostname_source() while still negative, and updated by
+         * context_update_kernel_hostname() */
+        HostnameSource hostname_source;
+
+        /* stat() data recorded when the respective file was last parsed; a fresh stat() is compared
+         * against these to skip re-reading files that did not change. */
+        struct stat etc_hostname_stat;
+        struct stat etc_os_release_stat;
+        struct stat etc_machine_info_stat;
+
+        /* Released with bus_verify_polkit_async_registry_free() in context_destroy() */
+        Hashmap *polkit_registry;
+} Context;
+
+/* Drops every cached property whose bit (1 << property index) is set in 'mask': the string is handed to
+ * mfree() and the slot is overwritten with what mfree() returns. */
+static void context_reset(Context *c, uint64_t mask) {
+        assert(c);
+
+        for (int prop = 0; prop < _PROP_MAX; prop++)
+                if (FLAGS_SET(mask, UINT64_C(1) << prop))
+                        c->data[prop] = mfree(c->data[prop]);
+}
+
+/* Releases everything the context owns: all cached property strings and the polkit registry. */
+static void context_destroy(Context *c) {
+        assert(c);
+
+        /* An all-ones mask selects every property slot */
+        context_reset(c, UINT64_MAX);
+
+        bus_verify_polkit_async_registry_free(c->polkit_registry);
+}
+
+/* Refreshes the cached static hostname from /etc/hostname.
+ *
+ * Nothing is done if stat() succeeds and stat_inode_unmodified() reports no change relative to the stat
+ * data remembered from the previous read. Otherwise the cached value is dropped and the file is read
+ * again; read failures other than -ENOENT are logged as warnings and otherwise ignored. */
+static void context_read_etc_hostname(Context *c) {
+        struct stat current_stat = {};
+        int r;
+
+        assert(c);
+
+        if (stat("/etc/hostname", &current_stat) >= 0 &&
+            stat_inode_unmodified(&c->etc_hostname_stat, &current_stat))
+                return;
+
+        context_reset(c, UINT64_C(1) << PROP_STATIC_HOSTNAME);
+
+        r = read_etc_hostname(NULL, &c->data[PROP_STATIC_HOSTNAME]);
+        if (r < 0 && r != -ENOENT)
+                log_warning_errno(r, "Failed to read /etc/hostname, ignoring: %m");
+
+        /* Remember what we saw (all zeroes if stat() failed), so that the next call can detect changes */
+        c->etc_hostname_stat = current_stat;
+}
+
+/* Refreshes the cached /etc/machine-info fields (pretty hostname, icon name, chassis, deployment,
+ * location, hardware vendor and hardware model).
+ *
+ * Nothing is done if stat() succeeds and stat_inode_unmodified() reports no change relative to the stat
+ * data remembered from the previous read. Otherwise all seven cached values are dropped and the file is
+ * parsed again; parse failures other than -ENOENT are logged as warnings and otherwise ignored. */
+static void context_read_machine_info(Context *c) {
+        struct stat current_stat = {};
+        int r;
+
+        assert(c);
+
+        if (stat("/etc/machine-info", &current_stat) >= 0 &&
+            stat_inode_unmodified(&c->etc_machine_info_stat, &current_stat))
+                return;
+
+        context_reset(c,
+                      (UINT64_C(1) << PROP_PRETTY_HOSTNAME) |
+                      (UINT64_C(1) << PROP_ICON_NAME) |
+                      (UINT64_C(1) << PROP_CHASSIS) |
+                      (UINT64_C(1) << PROP_DEPLOYMENT) |
+                      (UINT64_C(1) << PROP_LOCATION) |
+                      (UINT64_C(1) << PROP_HARDWARE_VENDOR) |
+                      (UINT64_C(1) << PROP_HARDWARE_MODEL));
+
+        r = parse_env_file(NULL, "/etc/machine-info",
+                           "PRETTY_HOSTNAME", &c->data[PROP_PRETTY_HOSTNAME],
+                           "ICON_NAME", &c->data[PROP_ICON_NAME],
+                           "CHASSIS", &c->data[PROP_CHASSIS],
+                           "DEPLOYMENT", &c->data[PROP_DEPLOYMENT],
+                           "LOCATION", &c->data[PROP_LOCATION],
+                           "HARDWARE_VENDOR", &c->data[PROP_HARDWARE_VENDOR],
+                           "HARDWARE_MODEL", &c->data[PROP_HARDWARE_MODEL]);
+        if (r < 0 && r != -ENOENT)
+                log_warning_errno(r, "Failed to read /etc/machine-info, ignoring: %m");
+
+        /* Remember what we saw (all zeroes if stat() failed), so that the next call can detect changes */
+        c->etc_machine_info_stat = current_stat;
+}
+
+/* Refreshes the cached os-release fields (pretty name, CPE name, home URL and support end).
+ *
+ * /etc/os-release is stat()ed first and /usr/lib/os-release only if that fails. Nothing is done if one
+ * of them can be stat()ed and stat_inode_unmodified() reports no change relative to the stat data
+ * remembered from the previous read. Otherwise the four cached values are dropped and the file is parsed
+ * again; parse failures other than -ENOENT are logged as warnings and otherwise ignored. */
+static void context_read_os_release(Context *c) {
+        _cleanup_free_ char *os_name = NULL, *os_pretty_name = NULL;
+        struct stat current_stat = {};
+        int r;
+
+        assert(c);
+
+        if ((stat("/etc/os-release", &current_stat) >= 0 ||
+             stat("/usr/lib/os-release", &current_stat) >= 0) &&
+            stat_inode_unmodified(&c->etc_os_release_stat, &current_stat))
+                return;
+
+        context_reset(c,
+                      (UINT64_C(1) << PROP_OS_PRETTY_NAME) |
+                      (UINT64_C(1) << PROP_OS_CPE_NAME) |
+                      (UINT64_C(1) << PROP_OS_HOME_URL) |
+                      (UINT64_C(1) << PROP_OS_SUPPORT_END));
+
+        r = parse_os_release(NULL,
+                             "PRETTY_NAME", &os_pretty_name,
+                             "NAME",        &os_name,
+                             "CPE_NAME",    &c->data[PROP_OS_CPE_NAME],
+                             "HOME_URL",    &c->data[PROP_OS_HOME_URL],
+                             "SUPPORT_END", &c->data[PROP_OS_SUPPORT_END]);
+        if (r < 0 && r != -ENOENT)
+                log_warning_errno(r, "Failed to read os-release file, ignoring: %m");
+
+        /* The cached pretty name is whatever os_release_pretty_name() picks from PRETTY_NAME and NAME */
+        if (free_and_strdup(&c->data[PROP_OS_PRETTY_NAME], os_release_pretty_name(os_pretty_name, os_name)) < 0)
+                log_oom();
+
+        /* Remember what we saw (all zeroes if stat() failed), so that the next call can detect changes */
+        c->etc_os_release_stat = current_stat;
+}
+
+/* Decides whether DMI hardware data shall be consulted. A value of $SYSTEMD_HOSTNAME_FORCE_DMI that
+ * getenv_bool() can parse wins; otherwise DMI is used unless detect_container() reports a container. */
+static bool use_dmi_data(void) {
+        int override;
+
+        override = getenv_bool("SYSTEMD_HOSTNAME_FORCE_DMI");
+        if (override < 0) {
+                /* -ENXIO is not worth a log message (presumably it means the variable is not set) */
+                if (override != -ENXIO)
+                        log_debug_errno(override, "Failed to parse $SYSTEMD_HOSTNAME_FORCE_DMI, ignoring: %m");
+
+                if (detect_container() <= 0)
+                        return true;
+
+                log_debug("Running in a container, not using DMI hardware data.");
+                return false;
+        }
+
+        log_debug("Honouring $SYSTEMD_HOSTNAME_FORCE_DMI override: %s", yes_no(override));
+        return override;
+}
+
+/* Queries a property of the /sys/class/dmi/id device. 'database_key' is tried first (if non-NULL), then
+ * 'regular_key' (if non-NULL and nothing was found yet).
+ *
+ * Returns 1 if a value was found, 0 if not, -ENOENT if use_dmi_data() says no, -ENOMEM if the copy could
+ * not be allocated, or the error from opening the device. If 'ret' is non-NULL it receives a newly
+ * allocated copy of the value, or NULL if none was found. */
+static int get_dmi_data(const char *database_key, const char *regular_key, char **ret) {
+        _cleanup_(sd_device_unrefp) sd_device *dmi = NULL;
+        const char *value = NULL;
+        int r;
+
+        if (!use_dmi_data())
+                return -ENOENT;
+
+        r = sd_device_new_from_syspath(&dmi, "/sys/class/dmi/id");
+        if (r < 0)
+                return log_debug_errno(r, "Failed to open /sys/class/dmi/id device, ignoring: %m");
+
+        if (database_key)
+                (void) sd_device_get_property_value(dmi, database_key, &value);
+        if (!value && regular_key)
+                (void) sd_device_get_property_value(dmi, regular_key, &value);
+
+        if (ret) {
+                char *copy = NULL;
+
+                /* 'value' is not ours, hence hand out a private copy */
+                if (value) {
+                        copy = strdup(value);
+                        if (!copy)
+                                return -ENOMEM;
+                }
+
+                *ret = copy;
+        }
+
+        return value ? 1 : 0;
+}
+
+/* Looks up the hardware vendor via get_dmi_data(), trying "ID_VENDOR_FROM_DATABASE" before "ID_VENDOR". */
+static int get_hardware_vendor(char **ret) {
+        static const char database_key[] = "ID_VENDOR_FROM_DATABASE";
+        static const char regular_key[] = "ID_VENDOR";
+
+        return get_dmi_data(database_key, regular_key, ret);
+}
+
+/* Looks up the hardware model via get_dmi_data(), trying "ID_MODEL_FROM_DATABASE" before "ID_MODEL". */
+static int get_hardware_model(char **ret) {
+        static const char database_key[] = "ID_MODEL_FROM_DATABASE";
+        static const char regular_key[] = "ID_MODEL";
+
+        return get_dmi_data(database_key, regular_key, ret);
+}
+
+/* Reads the sysfs attribute 'sysattr' of the /sys/class/dmi/id device.
+ *
+ * Returns 1 if the attribute has a non-empty value, 0 if not, -ENOENT if use_dmi_data() says no, -ENOMEM
+ * if the copy could not be allocated, or the error from opening the device. If 'ret' is non-NULL it
+ * receives a newly allocated copy of the value, or NULL if there is none. */
+static int get_hardware_firmware_data(const char *sysattr, char **ret) {
+        _cleanup_(sd_device_unrefp) sd_device *dmi = NULL;
+        _cleanup_free_ char *copy = NULL;
+        const char *value = NULL;
+        bool found;
+        int r;
+
+        assert(sysattr);
+
+        if (!use_dmi_data())
+                return -ENOENT;
+
+        r = sd_device_new_from_syspath(&dmi, "/sys/class/dmi/id");
+        if (r < 0)
+                return log_debug_errno(r, "Failed to open /sys/class/dmi/id device, ignoring: %m");
+
+        (void) sd_device_get_sysattr_value(dmi, sysattr, &value);
+
+        found = !isempty(value);
+        if (found) {
+                copy = strdup(value);
+                if (!copy)
+                        return -ENOMEM;
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(copy);
+
+        return found;
+}
+
+/* Returns the "product_serial" firmware attribute; if that is missing, empty or cannot be read, the
+ * "board_serial" attribute is tried instead and its result is returned. */
+static int get_hardware_serial(char **ret) {
+        int r = get_hardware_firmware_data("product_serial", ret);
+
+        if (r > 0)
+                return r;
+
+        return get_hardware_firmware_data("board_serial", ret);
+}
+
+/* Fetches the "bios_version" attribute through get_hardware_firmware_data(). */
+static int get_firmware_version(char **ret) {
+        static const char sysattr[] = "bios_version";
+
+        return get_hardware_firmware_data(sysattr, ret);
+}
+
+/* Fetches the "bios_vendor" attribute through get_hardware_firmware_data(). */
+static int get_firmware_vendor(char **ret) {
+        static const char sysattr[] = "bios_vendor";
+
+        return get_hardware_firmware_data(sysattr, ret);
+}
+
+/* Reads the "bios_date" attribute, which is parsed as "<month>/<day>/<year>", and converts it to
+ * microseconds since the epoch (UTC, midnight).
+ *
+ * Returns 0 on success; if the attribute has no value, *ret is set to USEC_INFINITY and 0 is returned.
+ * Returns -EINVAL if the string does not consist of exactly three '/'-separated fields, if a field is
+ * out of range (month 1..12, day 1..31, year 1970..INT_MAX), or if the date does not exist in the
+ * calendar. Other negative values are errors passed up from the helpers or from timegm(). */
+static int get_firmware_date(usec_t *ret) {
+        _cleanup_free_ char *raw = NULL, *month_str = NULL, *day_str = NULL, *year_str = NULL;
+        const unsigned parse_flags = 10 | SAFE_ATO_REFUSE_PLUS_MINUS | SAFE_ATO_REFUSE_LEADING_WHITESPACE;
+        int want_mday, want_mon, want_year;
+        unsigned month, day, year;
+        const char *cursor;
+        struct tm tm;
+        time_t t;
+        int r;
+
+        assert(ret);
+
+        r = get_hardware_firmware_data("bios_date", &raw);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                *ret = USEC_INFINITY;
+                return 0;
+        }
+
+        cursor = raw;
+        r = extract_many_words(&cursor, "/", EXTRACT_DONT_COALESCE_SEPARATORS, &month_str, &day_str, &year_str, NULL);
+        if (r < 0)
+                return r;
+        /* Fewer than three fields read, or something left over in the string? */
+        if (r != 3 || !isempty(cursor))
+                return -EINVAL;
+
+        r = safe_atou_full(month_str, parse_flags, &month);
+        if (r < 0)
+                return r;
+        if (month < 1 || month > 12)
+                return -EINVAL;
+
+        r = safe_atou_full(day_str, parse_flags, &day);
+        if (r < 0)
+                return r;
+        if (day < 1 || day > 31)
+                return -EINVAL;
+
+        r = safe_atou_full(year_str, parse_flags, &year);
+        if (r < 0)
+                return r;
+        if (year < 1970 || year > (unsigned) INT_MAX)
+                return -EINVAL;
+
+        /* struct tm counts months from 0 and years from 1900 */
+        want_mday = (int) day;
+        want_mon = (int) month - 1;
+        want_year = (int) year - 1900;
+
+        tm = (struct tm) {
+                .tm_mday = want_mday,
+                .tm_mon = want_mon,
+                .tm_year = want_year,
+        };
+
+        t = timegm(&tm);
+        if (t == (time_t) -1)
+                return -errno;
+
+        /* timegm() normalizes its argument; if a field changed, the date did not exist (e.g. "30th of feb") */
+        if (tm.tm_mday != want_mday || tm.tm_mon != want_mon || tm.tm_year != want_year)
+                return -EINVAL;
+
+        *ret = (usec_t) t * USEC_PER_SEC;
+
+        return 0;
+}
+
+/* Looks 'chassis' up in the list of chassis names known to this file and returns what nulstr_get()
+ * yields for it; fallback_chassis() treats a NULL result as "not a valid chassis type". */
+static const char* valid_chassis(const char *chassis) {
+        static const char known_chassis[] =
+                "vm\0"
+                "container\0"
+                "desktop\0"
+                "laptop\0"
+                "convertible\0"
+                "server\0"
+                "tablet\0"
+                "handset\0"
+                "watch\0"
+                "embedded\0";
+
+        assert(chassis);
+
+        return nulstr_get(known_chassis, chassis);
+}
+
+/* Tells whether 'deployment' passes the in_charset() check against VALID_DEPLOYMENT_CHARS. */
+static bool valid_deployment(const char *deployment) {
+        bool acceptable;
+
+        assert(deployment);
+
+        acceptable = in_charset(deployment, VALID_DEPLOYMENT_CHARS);
+        return acceptable;
+}
+
+/* Guesses a chassis type for when none is configured. The sources are tried in this order, and the
+ * first one that gives a usable answer wins:
+ *
+ *   1. virtualization detection ("vm" or "container")
+ *   2. the DMI chassis type in /sys/class/dmi/id/chassis_type
+ *   3. the ACPI PM profile in /sys/firmware/acpi/pm_profile
+ *   4. the devicetree chassis type in /proc/device-tree/chassis-type
+ *
+ * Failures to read or parse a source are logged at debug level and the next source is tried. Returns a
+ * constant string (never to be freed by the caller), or NULL if no source gave an answer. */
+static const char* fallback_chassis(void) {
+        const char *chassis;
+        _cleanup_free_ char *type = NULL;
+        Virtualization v;
+        unsigned t;
+        int r;
+
+        v = detect_virtualization();
+        if (v < 0)
+                log_debug_errno(v, "Failed to detect virtualization, ignoring: %m");
+        else if (VIRTUALIZATION_IS_VM(v))
+                return "vm";
+        else if (VIRTUALIZATION_IS_CONTAINER(v))
+                return "container";
+
+        r = read_one_line_file("/sys/class/dmi/id/chassis_type", &type);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read DMI chassis type, ignoring: %m");
+                goto try_acpi;
+        }
+
+        r = safe_atou(type, &t);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to parse DMI chassis type \"%s\", ignoring: %m", type);
+                goto try_acpi;
+        }
+
+        /* We only list the really obvious cases here. The DMI data is unreliable enough, so let's not do any
+         * additional guesswork on top of that.
+         *
+         * See the SMBIOS Specification 3.5.0 section 7.4.1 for details about the values listed here:
+         *
+         * https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.5.0.pdf
+         */
+
+        switch (t) {
+
+        case 0x03: /* Desktop */
+        case 0x04: /* Low Profile Desktop */
+        case 0x06: /* Mini Tower */
+        case 0x07: /* Tower */
+        case 0x0D: /* All in one (i.e. PC built into monitor) */
+        case 0x23: /* Mini PC */
+        case 0x24: /* Stick PC */
+                return "desktop";
+
+        case 0x8: /* Portable */
+        case 0x9: /* Laptop */
+        case 0xA: /* Notebook */
+        case 0xE: /* Sub Notebook */
+                return "laptop";
+
+        case 0xB: /* Hand Held */
+                return "handset";
+
+        case 0x11: /* Main Server Chassis */
+        case 0x1C: /* Blade */
+        case 0x1D: /* Blade Enclosure */
+                return "server";
+
+        case 0x1E: /* Tablet */
+                return "tablet";
+
+        case 0x1F: /* Convertible */
+        case 0x20: /* Detachable */
+                return "convertible";
+
+        case 0x21: /* IoT Gateway */
+        case 0x22: /* Embedded PC */
+                return "embedded";
+
+        default:
+                log_debug("Unhandled DMI chassis type 0x%02x, ignoring.", t);
+        }
+
+try_acpi:
+        /* The buffer is reused for the next source, hence release the previous contents first */
+        type = mfree(type);
+        r = read_one_line_file("/sys/firmware/acpi/pm_profile", &type);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read ACPI PM profile, ignoring: %m");
+                goto try_devicetree;
+        }
+
+        r = safe_atou(type, &t);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to parse ACPI PM profile \"%s\", ignoring: %m", type);
+                goto try_devicetree;
+        }
+
+        /* We only list the really obvious cases here as the ACPI data is not really super reliable.
+         *
+         * See the ACPI 5.0 Spec Section 5.2.9.1 for details:
+         *
+         * http://www.acpi.info/DOWNLOADS/ACPIspec50.pdf
+         */
+
+        switch (t) {
+
+        case 1: /* Desktop */
+        case 3: /* Workstation */
+        case 6: /* Appliance PC */
+                return "desktop";
+
+        case 2: /* Mobile */
+                return "laptop";
+
+        case 4: /* Enterprise Server */
+        case 5: /* SOHO Server */
+        case 7: /* Performance Server */
+                return "server";
+
+        case 8: /* Tablet */
+                return "tablet";
+
+        default:
+                log_debug("Unhandled ACPI PM profile 0x%02x, ignoring.", t);
+        }
+
+try_devicetree:
+        type = mfree(type);
+        r = read_one_line_file("/proc/device-tree/chassis-type", &type);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read device-tree chassis type, ignoring: %m");
+                return NULL;
+        }
+
+        /* Note that the Devicetree specification uses the very same vocabulary
+         * of chassis types as we do, hence we do not need to translate these types:
+         *
+         * https://github.com/devicetree-org/devicetree-specification/blob/master/source/chapter3-devicenodes.rst */
+        chassis = valid_chassis(type);
+        if (!chassis)
+                log_debug("Invalid device-tree chassis type \"%s\", ignoring.", type);
+        return chassis;
+}
+
+/* Determines the chassis type as a newly allocated string: the cached value if it is non-empty, else
+ * the "ID_CHASSIS" DMI property, else the guess of fallback_chassis(). Returns NULL if none of them
+ * gives a result or if strdup() fails. */
+static char* context_get_chassis(Context *c) {
+        const char *guess;
+        char *from_dmi;
+
+        assert(c);
+
+        if (!isempty(c->data[PROP_CHASSIS]))
+                return strdup(c->data[PROP_CHASSIS]);
+
+        /* 'from_dmi' is only looked at when get_dmi_data() reports that it found a value */
+        if (get_dmi_data("ID_CHASSIS", NULL, &from_dmi) > 0)
+                return from_dmi;
+
+        guess = fallback_chassis();
+        return guess ? strdup(guess) : NULL;
+}
+
+/* Builds the icon name to use when none is configured: "computer-" followed by the chassis type, or
+ * plain "computer" if context_get_chassis() returns nothing. The result is newly allocated. */
+static char* context_fallback_icon_name(Context *c) {
+        _cleanup_free_ char *chassis = NULL;
+
+        assert(c);
+
+        chassis = context_get_chassis(c);
+
+        return chassis ? strjoin("computer-", chassis) : strdup("computer");
+}
+
+/* Picks the hostname to apply (cached static hostname first, then 'transient_hn', then the default
+ * hostname), sets it with sethostname_idempotent() and records which source it came from.
+ *
+ * Returns 0 if nothing changed, 1 if the hostname or its recorded source changed, negative on error. */
+static int context_update_kernel_hostname(
+                Context *c,
+                const char *transient_hn) {
+
+        _cleanup_free_ char *default_hn = NULL;
+        HostnameSource source;
+        const char *name;
+        int r;
+
+        assert(c);
+
+        if (c->data[PROP_STATIC_HOSTNAME]) {
+                /* /etc/hostname has the highest preference ... */
+                name = c->data[PROP_STATIC_HOSTNAME];
+                source = HOSTNAME_STATIC;
+        } else if (transient_hn) {
+                /* ... the transient hostname, (ie: DHCP) comes next ... */
+                name = transient_hn;
+                source = HOSTNAME_TRANSIENT;
+        } else {
+                /* ... and the ultimate fallback */
+                default_hn = get_default_hostname();
+                if (!default_hn)
+                        return log_oom();
+
+                name = default_hn;
+                source = HOSTNAME_DEFAULT;
+        }
+
+        r = sethostname_idempotent(name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set hostname: %m");
+
+        /* A changed source counts as a change even if the name itself stayed the same */
+        if (c->hostname_source != source) {
+                c->hostname_source = source;
+                r = 1;
+        }
+
+        (void) nscd_flush_cache(STRV_MAKE("hosts"));
+
+        if (r != 0) {
+                log_info("Hostname set to <%s> (%s)", name, hostname_source_to_string(source));
+
+                hostname_update_source_hint(name, source);
+        } else
+                log_debug("Hostname was already set to <%s>.", name);
+
+        return r; /* 0 if no change, 1 if something was done  */
+}
+
+/* Cleanup helper for _cleanup_(): zeroes the struct stat that *p points to. Does nothing if *p is NULL,
+ * which is how callers disarm it (via TAKE_PTR()) on success. */
+static void unset_statp(struct stat **p) {
+        if (*p)
+                **p = (struct stat) {};
+}
+
+/* Writes the cached static hostname to /etc/hostname, or removes the file if the cached value is empty
+ * (a file that is already missing is fine). Returns 0 on success, negative on failure; on failure the
+ * remembered stat data of /etc/hostname is zeroed. */
+static int context_write_data_static_hostname(Context *c) {
+        _cleanup_(unset_statp) struct stat *invalidate = NULL;
+        int r;
+
+        assert(c);
+
+        /* Make sure that if we fail here, we invalidate the cached information, since it was updated
+         * already, even if we can't make it hit the disk. */
+        invalidate = &c->etc_hostname_stat;
+
+        if (isempty(c->data[PROP_STATIC_HOSTNAME])) {
+                if (unlink("/etc/hostname") < 0 && errno != ENOENT)
+                        return -errno;
+        } else {
+                r = write_string_file_atomic_label("/etc/hostname", c->data[PROP_STATIC_HOSTNAME]);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Success: disarm the cleanup handler so that the stat data is left alone */
+        TAKE_PTR(invalidate);
+        return 0;
+}
+
+/* Merges the cached pretty hostname, icon name, chassis, deployment and location into the current
+ * contents of /etc/machine-info and writes the result back. If the resulting list is empty the file is
+ * removed instead (a file that is already missing is fine). Returns 0 on success, negative on failure;
+ * on failure the remembered stat data of /etc/machine-info is zeroed. */
+static int context_write_data_machine_info(Context *c) {
+        /* Only the properties handled by the loop below have an entry */
+        static const char * const env_key[_PROP_MAX] = {
+                [PROP_PRETTY_HOSTNAME] = "PRETTY_HOSTNAME",
+                [PROP_ICON_NAME] = "ICON_NAME",
+                [PROP_CHASSIS] = "CHASSIS",
+                [PROP_DEPLOYMENT] = "DEPLOYMENT",
+                [PROP_LOCATION] = "LOCATION",
+        };
+        _cleanup_(unset_statp) struct stat *invalidate = NULL;
+        _cleanup_strv_free_ char **env = NULL;
+        int r;
+
+        assert(c);
+
+        /* Make sure that if we fail here, we invalidate the cached information, since it was updated
+         * already, even if we can't make it hit the disk. */
+        invalidate = &c->etc_machine_info_stat;
+
+        /* Start from what is on disk; a missing file simply means we start from an empty list */
+        r = load_env_file(NULL, "/etc/machine-info", &env);
+        if (r < 0 && r != -ENOENT)
+                return r;
+
+        for (int p = PROP_PRETTY_HOSTNAME; p <= PROP_LOCATION; p++) {
+                assert(env_key[p]);
+
+                /* Empty cached values are passed on as NULL */
+                r = strv_env_assign(&env, env_key[p], empty_to_null(c->data[p]));
+                if (r < 0)
+                        return r;
+        }
+
+        if (strv_isempty(env)) {
+                if (unlink("/etc/machine-info") < 0 && errno != ENOENT)
+                        return -errno;
+        } else {
+                r = write_env_file_label(AT_FDCWD, "/etc/machine-info", NULL, env);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Success: disarm the cleanup handler so that the stat data is left alone */
+        TAKE_PTR(invalidate);
+        return 0;
+}
+
+/* Shared implementation of the hardware vendor/model property getters: appends the value cached from
+ * /etc/machine-info to 'reply', or, if that is empty, whatever 'getter' manages to find out. If
+ * 'getter' yields nothing either, the (empty) cached value is appended. */
+static int property_get_hardware_property(
+                sd_bus_message *reply,
+                Context *c,
+                HostProperty prop,
+                int (*getter)(char **)) {
+
+        _cleanup_free_ char *from_dmi = NULL;
+        const char *value;
+
+        assert(reply);
+        assert(c);
+        assert(IN_SET(prop, PROP_HARDWARE_VENDOR, PROP_HARDWARE_MODEL));
+        assert(getter);
+
+        context_read_machine_info(c);
+
+        value = c->data[prop];
+        if (isempty(value)) {
+                (void) getter(&from_dmi);
+                if (from_dmi)
+                        value = from_dmi;
+        }
+
+        return sd_bus_message_append(reply, "s", value);
+}
+
+/* sd-bus property getter: hardware vendor, see property_get_hardware_property(). */
+static int property_get_hardware_vendor(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Context *c = userdata;
+
+        return property_get_hardware_property(reply, c, PROP_HARDWARE_VENDOR, get_hardware_vendor);
+}
+
+/* sd-bus property getter: hardware model, see property_get_hardware_property(). */
+static int property_get_hardware_model(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Context *c = userdata;
+
+        return property_get_hardware_property(reply, c, PROP_HARDWARE_MODEL, get_hardware_model);
+}
+
+/* sd-bus property getter: appends the firmware version as a string. Lookup failures are ignored, in
+ * which case NULL is passed to sd_bus_message_append(). */
+static int property_get_firmware_version(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *version = NULL;
+
+        (void) get_firmware_version(&version);
+
+        return sd_bus_message_append(reply, "s", version);
+}
+
+/* sd-bus property getter: appends the firmware vendor as a string. Lookup failures are ignored, in
+ * which case NULL is passed to sd_bus_message_append(). */
+static int property_get_firmware_vendor(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *vendor = NULL;
+
+        (void) get_firmware_vendor(&vendor);
+
+        return sd_bus_message_append(reply, "s", vendor);
+}
+
+/* sd-bus property getter: appends the firmware date as a "t" value. Lookup failures are ignored, in
+ * which case the initial USEC_INFINITY is what gets appended. */
+static int property_get_firmware_date(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        usec_t date = USEC_INFINITY;
+
+        (void) get_firmware_date(&date);
+
+        return sd_bus_message_append(reply, "t", date);
+}
+
+/* sd-bus property getter: appends the hostname reported by gethostname_strict(). If that fails with
+ * -ENXIO the default hostname is appended instead; any other failure is returned to the caller. */
+static int property_get_hostname(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *current = NULL;
+        int r;
+
+        r = gethostname_strict(&current);
+        if (r == -ENXIO) {
+                current = get_default_hostname();
+                if (!current)
+                        return -ENOMEM;
+        } else if (r < 0)
+                return r;
+
+        return sd_bus_message_append(reply, "s", current);
+}
+
+/* sd-bus property getter: appends the static hostname, after refreshing the cache from /etc/hostname
+ * if needed. */
+static int property_get_static_hostname(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Context *c = ASSERT_PTR(userdata);
+        const char *static_hn;
+
+        context_read_etc_hostname(c);
+
+        /* Only look at the cache after the refresh, since the refresh may replace the string */
+        static_hn = c->data[PROP_STATIC_HOSTNAME];
+
+        return sd_bus_message_append(reply, "s", static_hn);
+}
+
+/* sd-bus property getter: appends the string returned by get_default_hostname(); a NULL result is
+ * treated as an out-of-memory condition. */
+static int property_get_default_hostname(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *fallback = get_default_hostname();
+
+        if (!fallback)
+                return log_oom();
+
+        return sd_bus_message_append(reply, "s", fallback);
+}
+
+/* Works out where the current hostname came from, unless c->hostname_source is already set (>= 0):
+ * static if it equals the cached static hostname, default if it equals the contents of
+ * /run/systemd/default-hostname, transient otherwise. */
+static void context_determine_hostname_source(Context *c) {
+        _cleanup_free_ char *current = NULL, *recorded_default = NULL;
+        int r;
+
+        assert(c);
+
+        if (c->hostname_source >= 0)
+                return;
+
+        (void) gethostname_full(GET_HOSTNAME_ALLOW_LOCALHOST, &current);
+
+        if (streq_ptr(current, c->data[PROP_STATIC_HOSTNAME])) {
+                c->hostname_source = HOSTNAME_STATIC;
+                return;
+        }
+
+        /* If the hostname was not set by us, try to figure out where it came from. If we set it to
+         * the default hostname, the file will tell us. We compare the string because it is possible
+         * that the hostname was set by an older version that had a different fallback, in the initrd
+         * or before we reexecuted. */
+
+        r = read_one_line_file("/run/systemd/default-hostname", &recorded_default);
+        if (r < 0 && r != -ENOENT)
+                log_warning_errno(r, "Failed to read /run/systemd/default-hostname, ignoring: %m");
+
+        c->hostname_source = streq_ptr(recorded_default, current) ? HOSTNAME_DEFAULT : HOSTNAME_TRANSIENT;
+}
+
+/* sd-bus property getter: appends the hostname source as a string, after refreshing the static hostname
+ * cache and determining the source if that has not happened yet. */
+static int property_get_hostname_source(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Context *c = ASSERT_PTR(userdata);
+        const char *source;
+
+        /* The static hostname has to be current before it is compared against the running hostname */
+        context_read_etc_hostname(c);
+        context_determine_hostname_source(c);
+
+        source = hostname_source_to_string(c->hostname_source);
+
+        return sd_bus_message_append(reply, "s", source);
+}
+
+/* sd-bus property getter for string fields cached from /etc/machine-info. 'userdata' points at the
+ * char* slot to report; the cache is refreshed before the slot is read. */
+static int property_get_machine_info_field(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        char **field = userdata;
+        sd_bus_slot *slot;
+        Context *c;
+
+        /* Two pointers are needed here: (a) the Context that holds the whole cache, so that it can be
+         * refreshed, and (b) the individual field inside it that shall be reported. sd-bus passes (b) as
+         * 'userdata': that is the pointer supplied at vtable registration plus the per-property offset
+         * from the vtable. To get (a), i.e. the registration pointer without the offset, ask sd-bus for
+         * the current slot (which encapsulates the vtable registration) and read the userdata pointer
+         * directly off it. */
+        assert_se(slot = sd_bus_get_current_slot(bus));
+        assert_se(c = sd_bus_slot_get_userdata(slot));
+
+        context_read_machine_info(c);
+
+        /* Dereference only now, since the refresh above may have replaced the string */
+        return sd_bus_message_append(reply, "s", *field);
+}
+
+/* sd-bus property getter for string fields cached from os-release. 'userdata' points at the char* slot
+ * to report; the cache is refreshed before the slot is read. */
+static int property_get_os_release_field(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        char **field = userdata;
+        sd_bus_slot *slot;
+        Context *c;
+
+        /* 'userdata' already has the per-property offset applied, hence fetch the Context itself from
+         * the userdata pointer of the current slot. */
+        assert_se(slot = sd_bus_get_current_slot(bus));
+        assert_se(c = sd_bus_slot_get_userdata(slot));
+
+        context_read_os_release(c);
+
+        /* Dereference only now, since the refresh above may have replaced the string */
+        return sd_bus_message_append(reply, "s", *field);
+}
+
+/* Property getter for "OperatingSystemSupportEnd" (type 't'). Sends USEC_INFINITY when the context has
+ * no PROP_OS_SUPPORT_END string; the return value of os_release_support_ended() is deliberately
+ * ignored. */
+static int property_get_os_support_end(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        usec_t support_end = USEC_INFINITY;
+        Context *ctx = userdata;
+        const char *raw;
+
+        context_read_os_release(ctx);
+
+        raw = ctx->data[PROP_OS_SUPPORT_END];
+        if (raw)
+                (void) os_release_support_ended(raw, /* quiet= */ false, &support_end);
+
+        return sd_bus_message_append(reply, "t", support_end);
+}
+
+/* Property getter for "IconName": sends the icon name stored in the context, or, if that one is unset
+ * or empty, the name produced by context_fallback_icon_name(). Returns -ENOMEM if no fallback name
+ * could be produced. */
+static int property_get_icon_name(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *fallback = NULL;
+        Context *ctx = userdata;
+
+        context_read_machine_info(ctx);
+
+        /* An explicitly configured icon name always wins. */
+        if (!isempty(ctx->data[PROP_ICON_NAME]))
+                return sd_bus_message_append(reply, "s", ctx->data[PROP_ICON_NAME]);
+
+        fallback = context_fallback_icon_name(ctx);
+        if (!fallback)
+                return -ENOMEM;
+
+        return sd_bus_message_append(reply, "s", fallback);
+}
+
+/* Property getter for "Chassis": sends whatever context_get_chassis() returns for the context, after
+ * the machine info has been read. The returned string is freed when this function leaves. */
+static int property_get_chassis(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Context *ctx = userdata;
+
+        context_read_machine_info(ctx);
+
+        _cleanup_free_ char *value = context_get_chassis(ctx);
+
+        return sd_bus_message_append(reply, "s", value);
+}
+
+/* Property getter for the kernel name/release/version strings. Here 'userdata' is not a pointer to
+ * data but a byte offset into struct utsname (the vtable registers these properties with
+ * offsetof(struct utsname, …) and SD_BUS_VTABLE_ABSOLUTE_OFFSET), which is applied to a freshly filled
+ * in structure. */
+static int property_get_uname_field(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        size_t member_offset = PTR_TO_SIZE(userdata);
+        struct utsname uts;
+
+        assert_se(uname(&uts) >= 0);
+
+        return sd_bus_message_append(reply, "s", (char*) &uts + member_offset);
+}
+
+/* Property getter for "MachineID": obtains the ID via sd_id128_get_machine() and lets
+ * bus_property_get_id128() serialize it. A failure to obtain the ID is passed to the caller as is. */
+static int property_get_machine_id(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        sd_id128_t machine_id;
+        int r = sd_id128_get_machine(&machine_id);
+
+        if (r < 0)
+                return r;
+
+        return bus_property_get_id128(bus, path, interface, property, reply, &machine_id, error);
+}
+
+/* Property getter for "BootID": obtains the ID via sd_id128_get_boot() and lets
+ * bus_property_get_id128() serialize it. A failure to obtain the ID is passed to the caller as is. */
+static int property_get_boot_id(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        sd_id128_t boot_id;
+        int r = sd_id128_get_boot(&boot_id);
+
+        if (r < 0)
+                return r;
+
+        return bus_property_get_id128(bus, path, interface, property, reply, &boot_id, error);
+}
+
+/* Bus method SetHostname(s hostname, b interactive). Validates the requested name, checks the
+ * "org.freedesktop.hostname1.set-hostname" polkit action and then hands the name to
+ * context_update_kernel_hostname(). An empty name is turned into NULL before it is passed on.
+ *
+ * Returns 1 while the polkit authorization is still pending, a negative errno-style value (with 'error'
+ * set where applicable) on failure, and otherwise the result of sending the empty method reply. */
+static int method_set_hostname(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        Context *c = ASSERT_PTR(userdata);
+        const char *name;
+        int interactive, r;
+
+        assert(m);
+
+        r = sd_bus_message_read(m, "sb", &name, &interactive);
+        if (r < 0)
+                return r;
+
+        name = empty_to_null(name);
+
+        /* We always go through with the procedure below without comparing to the current hostname, because
+         * we might want to adjust hostname source information even if the actual hostname is unchanged. */
+
+        if (name && !hostname_is_valid(name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid hostname '%s'", name);
+
+        context_read_etc_hostname(c);
+
+        r = bus_verify_polkit_async(
+                        m,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.hostname1.set-hostname",
+                        NULL,
+                        interactive,
+                        UID_INVALID,
+                        &c->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = context_update_kernel_hostname(c, name);
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to set hostname: %m");
+        else if (r > 0)
+                /* NOTE(review): a positive result presumably means that something actually changed —
+                 * confirm against context_update_kernel_hostname(). Failure to emit the signal is
+                 * ignored. */
+                (void) sd_bus_emit_properties_changed(sd_bus_message_get_bus(m),
+                                                      "/org/freedesktop/hostname1", "org.freedesktop.hostname1",
+                                                      "Hostname", "HostnameSource", NULL);
+
+        return sd_bus_reply_method_return(m, NULL);
+}
+
+/* Bus method SetStaticHostname(s hostname, b interactive). If the requested name differs from the one
+ * cached in the context, it is validated, authorized via the
+ * "org.freedesktop.hostname1.set-static-hostname" polkit action, stored in the context, written out via
+ * context_write_data_static_hostname() and finally the kernel hostname is updated.
+ *
+ * Returns 1 while the polkit authorization is still pending, a negative errno-style value (with 'error'
+ * set where applicable) on failure, and otherwise the result of sending the empty method reply. */
+static int method_set_static_hostname(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        Context *c = ASSERT_PTR(userdata);
+        const char *name;
+        int interactive;
+        int r;
+
+        assert(m);
+
+        r = sd_bus_message_read(m, "sb", &name, &interactive);
+        if (r < 0)
+                return r;
+
+        name = empty_to_null(name);
+
+        context_read_etc_hostname(c);
+
+        /* Unchanged value: reply right away, without validation or authorization. */
+        if (streq_ptr(name, c->data[PROP_STATIC_HOSTNAME]))
+                return sd_bus_reply_method_return(m, NULL);
+
+        if (name && !hostname_is_valid(name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid static hostname '%s'", name);
+
+        r = bus_verify_polkit_async(
+                        m,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.hostname1.set-static-hostname",
+                        NULL,
+                        interactive,
+                        UID_INVALID,
+                        &c->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = free_and_strdup_warn(&c->data[PROP_STATIC_HOSTNAME], name);
+        if (r < 0)
+                return r;
+
+        r = context_write_data_static_hostname(c);
+        if (r < 0) {
+                log_error_errno(r, "Failed to write static hostname: %m");
+                /* Map the well-known failure causes to dedicated bus errors. */
+                if (ERRNO_IS_PRIVILEGE(r))
+                        return sd_bus_error_set(error, BUS_ERROR_FILE_IS_PROTECTED, "Not allowed to update /etc/hostname.");
+                if (r == -EROFS)
+                        return sd_bus_error_set(error, BUS_ERROR_READ_ONLY_FILESYSTEM, "/etc/hostname is in a read-only filesystem.");
+                return sd_bus_error_set_errnof(error, r, "Failed to set static hostname: %m");
+        }
+
+        /* No explicit name is passed here, unlike in method_set_hostname(). */
+        r = context_update_kernel_hostname(c, NULL);
+        if (r < 0) {
+                log_error_errno(r, "Failed to set hostname: %m");
+                return sd_bus_error_set_errnof(error, r, "Failed to set hostname: %m");
+        }
+
+        (void) sd_bus_emit_properties_changed(sd_bus_message_get_bus(m),
+                                              "/org/freedesktop/hostname1", "org.freedesktop.hostname1",
+                                              "StaticHostname", "Hostname", "HostnameSource", NULL);
+
+        return sd_bus_reply_method_return(m, NULL);
+}
+
+/* Common implementation behind the bus methods that set the pretty hostname, icon name, chassis,
+ * deployment and location. Reads the "sb" arguments from 'm', validates the new value for the field
+ * selected by 'prop', checks the matching polkit action, stores the value in the context, writes the
+ * machine info out and emits a change notification for the affected property.
+ *
+ * 'cb' is part of the signature but is not used by this function. Returns 1 while the polkit
+ * authorization is still pending, a negative errno-style value (with 'error' set where applicable) on
+ * failure, and otherwise the result of sending the empty method reply. */
+static int set_machine_info(Context *c, sd_bus_message *m, int prop, sd_bus_message_handler_t cb, sd_bus_error *error) {
+        const char *value, *action, *description, *bus_property;
+        int interactive, r;
+
+        assert(c);
+        assert(m);
+
+        r = sd_bus_message_read(m, "sb", &value, &interactive);
+        if (r < 0)
+                return r;
+
+        /* From here on an unset value is always represented as NULL. */
+        value = empty_to_null(value);
+
+        context_read_machine_info(c);
+
+        /* Unchanged value: reply right away, without validation or authorization. */
+        if (streq_ptr(value, c->data[prop]))
+                return sd_bus_reply_method_return(m, NULL);
+
+        if (value)
+                switch (prop) {
+
+                case PROP_ICON_NAME:
+                        /* The icon name might ultimately be used as file name, so better be safe than
+                         * sorry */
+                        if (!filename_is_valid(value))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid icon name '%s'", value);
+                        break;
+
+                case PROP_PRETTY_HOSTNAME:
+                        if (string_has_cc(value, NULL))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid pretty hostname '%s'", value);
+                        break;
+
+                case PROP_CHASSIS:
+                        if (!valid_chassis(value))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid chassis '%s'", value);
+                        break;
+
+                case PROP_DEPLOYMENT:
+                        if (!valid_deployment(value))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid deployment '%s'", value);
+                        break;
+
+                case PROP_LOCATION:
+                        if (string_has_cc(value, NULL))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid location '%s'", value);
+                        break;
+
+                default:
+                        break;
+                }
+
+        /* Since the pretty hostname should always be changed at the same time as the static one, use the
+         * same policy action for both... */
+        if (prop == PROP_PRETTY_HOSTNAME)
+                action = "org.freedesktop.hostname1.set-static-hostname";
+        else
+                action = "org.freedesktop.hostname1.set-machine-info";
+
+        r = bus_verify_polkit_async(
+                        m,
+                        CAP_SYS_ADMIN,
+                        action,
+                        NULL,
+                        interactive,
+                        UID_INVALID,
+                        &c->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = free_and_strdup_warn(&c->data[prop], value);
+        if (r < 0)
+                return r;
+
+        r = context_write_data_machine_info(c);
+        if (r < 0) {
+                log_error_errno(r, "Failed to write machine info: %m");
+
+                /* Map the well-known failure causes to dedicated bus errors. */
+                if (ERRNO_IS_PRIVILEGE(r))
+                        return sd_bus_error_set(error, BUS_ERROR_FILE_IS_PROTECTED, "Not allowed to update /etc/machine-info.");
+                if (r == -EROFS)
+                        return sd_bus_error_set(error, BUS_ERROR_READ_ONLY_FILESYSTEM, "/etc/machine-info is in a read-only filesystem.");
+
+                return sd_bus_error_set_errnof(error, r, "Failed to write machine info: %m");
+        }
+
+        /* Pick the human readable and the bus name of the field; anything unlisted counts as icon name. */
+        switch (prop) {
+
+        case PROP_PRETTY_HOSTNAME:
+                description = "pretty hostname";
+                bus_property = "PrettyHostname";
+                break;
+
+        case PROP_DEPLOYMENT:
+                description = "deployment";
+                bus_property = "Deployment";
+                break;
+
+        case PROP_LOCATION:
+                description = "location";
+                bus_property = "Location";
+                break;
+
+        case PROP_CHASSIS:
+                description = "chassis";
+                bus_property = "Chassis";
+                break;
+
+        default:
+                description = "icon name";
+                bus_property = "IconName";
+                break;
+        }
+
+        log_info("Changed %s to '%s'", description, strna(c->data[prop]));
+
+        (void) sd_bus_emit_properties_changed(
+                        sd_bus_message_get_bus(m),
+                        "/org/freedesktop/hostname1",
+                        "org.freedesktop.hostname1",
+                        bus_property, NULL);
+
+        return sd_bus_reply_method_return(m, NULL);
+}
+
+/* Bus method SetPrettyHostname(): forwards to set_machine_info() for PROP_PRETTY_HOSTNAME. */
+static int method_set_pretty_hostname(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        return set_machine_info(userdata, m, PROP_PRETTY_HOSTNAME, method_set_pretty_hostname, error);
+}
+
+/* Bus method SetIconName(): forwards to set_machine_info() for PROP_ICON_NAME. */
+static int method_set_icon_name(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        return set_machine_info(userdata, m, PROP_ICON_NAME, method_set_icon_name, error);
+}
+
+/* Bus method SetChassis(): forwards to set_machine_info() for PROP_CHASSIS. */
+static int method_set_chassis(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        return set_machine_info(userdata, m, PROP_CHASSIS, method_set_chassis, error);
+}
+
+/* Bus method SetDeployment(): forwards to set_machine_info() for PROP_DEPLOYMENT. */
+static int method_set_deployment(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        return set_machine_info(userdata, m, PROP_DEPLOYMENT, method_set_deployment, error);
+}
+
+/* Bus method SetLocation(): forwards to set_machine_info() for PROP_LOCATION. */
+static int method_set_location(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        return set_machine_info(userdata, m, PROP_LOCATION, method_set_location, error);
+}
+
+/* Bus method GetProductUUID(b interactive): after the "org.freedesktop.hostname1.get-product-uuid"
+ * polkit action has been granted, replies with the 16 bytes obtained from id128_get_product() as a byte
+ * array. If the UUID cannot be obtained the reason is logged and BUS_ERROR_NO_PRODUCT_UUID is
+ * returned. Returns 1 while the polkit authorization is still pending. */
+static int method_get_product_uuid(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *response = NULL;
+        Context *ctx = ASSERT_PTR(userdata);
+        sd_id128_t product_uuid;
+        int allow_interaction, r;
+
+        assert(m);
+
+        r = sd_bus_message_read(m, "b", &allow_interaction);
+        if (r < 0)
+                return r;
+
+        r = bus_verify_polkit_async(
+                        m,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.hostname1.get-product-uuid",
+                        NULL,
+                        allow_interaction,
+                        UID_INVALID,
+                        &ctx->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = id128_get_product(&product_uuid);
+        if (r < 0) {
+                /* The cause only goes to the log, at a level that depends on the kind of failure; the
+                 * client always gets the same bus error. */
+                if (r == -EADDRNOTAVAIL)
+                        log_debug_errno(r, "DMI product UUID is all 0x00 or all 0xFF, ignoring.");
+                else {
+                        int level = r == -ENOENT ? LOG_DEBUG : LOG_WARNING;
+
+                        log_full_errno(level, r,
+                                       "Failed to read product UUID, ignoring: %m");
+                }
+
+                return sd_bus_error_set(error, BUS_ERROR_NO_PRODUCT_UUID,
+                                        "Failed to read product UUID from firmware.");
+        }
+
+        r = sd_bus_message_new_method_return(m, &response);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_array(response, 'y', product_uuid.bytes, sizeof(product_uuid.bytes));
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, response, NULL);
+}
+
+/* Bus method GetHardwareSerial(): after the "org.freedesktop.hostname1.get-hardware-serial" polkit
+ * action has been granted (always requested non-interactively), replies with the string obtained from
+ * get_hardware_serial(). A failure of get_hardware_serial() is returned as is. Returns 1 while the
+ * polkit authorization is still pending. */
+static int method_get_hardware_serial(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *response = NULL;
+        _cleanup_free_ char *serial_number = NULL;
+        Context *ctx = ASSERT_PTR(userdata);
+        int r;
+
+        assert(m);
+
+        r = bus_verify_polkit_async(
+                        m,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.hostname1.get-hardware-serial",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &ctx->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        r = get_hardware_serial(&serial_number);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(m, &response);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(response, "s", serial_number);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, response, NULL);
+}
+
+/* Bus method Describe(): replies with a single string that contains a JSON object describing the
+ * host (hostnames, icon name, chassis, deployment, location, kernel, OS, hardware and firmware
+ * information, machine and boot ID).
+ *
+ * The "org.freedesktop.hostname1.get-description" polkit action is checked non-interactively. A denied
+ * or failed check does not fail the call; it only means that the product UUID and the hardware serial
+ * are left out (reported as JSON null). Returns 1 while the polkit authorization is still pending and a
+ * negative errno-style value on failure. */
+static int method_describe(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        _cleanup_free_ char *hn = NULL, *dhn = NULL, *in = NULL, *text = NULL,
+                *chassis = NULL, *vendor = NULL, *model = NULL, *serial = NULL, *firmware_version = NULL,
+                *firmware_vendor = NULL;
+        usec_t firmware_date = USEC_INFINITY, eol = USEC_INFINITY;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        sd_id128_t machine_id, boot_id, product_uuid = SD_ID128_NULL;
+        Context *c = ASSERT_PTR(userdata);
+        bool privileged;
+        struct utsname u;
+        int r;
+
+        assert(m);
+
+        r = bus_verify_polkit_async(
+                        m,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.hostname1.get-description",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &c->polkit_registry,
+                        error);
+        if (r == 0)
+                return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+        /* We ignore all authentication errors here, since most data is unprivileged, the exceptions being
+         * the product UUID and the hardware serial, which we'll check explicitly. */
+        privileged = r > 0;
+
+        context_read_etc_hostname(c);
+        context_read_machine_info(c);
+        context_read_os_release(c);
+        context_determine_hostname_source(c);
+
+        r = gethostname_strict(&hn);
+        if (r < 0) {
+                if (r != -ENXIO)
+                        return log_error_errno(r, "Failed to read local host name: %m");
+
+                /* -ENXIO is not treated as an error, the default hostname is reported instead. */
+                hn = get_default_hostname();
+                if (!hn)
+                        return log_oom();
+        }
+
+        dhn = get_default_hostname();
+        if (!dhn)
+                return log_oom();
+
+        if (isempty(c->data[PROP_ICON_NAME])) {
+                /* Same policy as in property_get_icon_name(): not getting a fallback name is an
+                 * out-of-memory condition, which must not be papered over by reporting a null icon
+                 * name. */
+                in = context_fallback_icon_name(c);
+                if (!in)
+                        return log_oom();
+        }
+
+        chassis = context_get_chassis(c);
+
+        assert_se(uname(&u) >= 0);
+
+        /* Vendor and model are only looked up if the context carries no value for them. */
+        if (isempty(c->data[PROP_HARDWARE_VENDOR]))
+                (void) get_hardware_vendor(&vendor);
+        if (isempty(c->data[PROP_HARDWARE_MODEL]))
+                (void) get_hardware_model(&model);
+
+        if (privileged) {
+                /* The product UUID and hardware serial are only available to privileged clients */
+                (void) id128_get_product(&product_uuid);
+                (void) get_hardware_serial(&serial);
+        }
+        (void) get_firmware_version(&firmware_version);
+        (void) get_firmware_vendor(&firmware_vendor);
+        (void) get_firmware_date(&firmware_date);
+
+        if (c->data[PROP_OS_SUPPORT_END])
+                (void) os_release_support_ended(c->data[PROP_OS_SUPPORT_END], /* quiet= */ false, &eol);
+
+        r = sd_id128_get_machine(&machine_id);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get machine ID: %m");
+
+        r = sd_id128_get_boot(&boot_id);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get boot ID: %m");
+
+        r = json_build(&v, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("Hostname", JSON_BUILD_STRING(hn)),
+                                       JSON_BUILD_PAIR("StaticHostname", JSON_BUILD_STRING(c->data[PROP_STATIC_HOSTNAME])),
+                                       JSON_BUILD_PAIR("PrettyHostname", JSON_BUILD_STRING(c->data[PROP_PRETTY_HOSTNAME])),
+                                       JSON_BUILD_PAIR("DefaultHostname", JSON_BUILD_STRING(dhn)),
+                                       JSON_BUILD_PAIR("HostnameSource", JSON_BUILD_STRING(hostname_source_to_string(c->hostname_source))),
+                                       JSON_BUILD_PAIR("IconName", JSON_BUILD_STRING(in ?: c->data[PROP_ICON_NAME])),
+                                       JSON_BUILD_PAIR("Chassis", JSON_BUILD_STRING(chassis)),
+                                       JSON_BUILD_PAIR("Deployment", JSON_BUILD_STRING(c->data[PROP_DEPLOYMENT])),
+                                       JSON_BUILD_PAIR("Location", JSON_BUILD_STRING(c->data[PROP_LOCATION])),
+                                       JSON_BUILD_PAIR("KernelName", JSON_BUILD_STRING(u.sysname)),
+                                       JSON_BUILD_PAIR("KernelRelease", JSON_BUILD_STRING(u.release)),
+                                       JSON_BUILD_PAIR("KernelVersion", JSON_BUILD_STRING(u.version)),
+                                       JSON_BUILD_PAIR("OperatingSystemPrettyName", JSON_BUILD_STRING(c->data[PROP_OS_PRETTY_NAME])),
+                                       JSON_BUILD_PAIR("OperatingSystemCPEName", JSON_BUILD_STRING(c->data[PROP_OS_CPE_NAME])),
+                                       JSON_BUILD_PAIR("OperatingSystemHomeURL", JSON_BUILD_STRING(c->data[PROP_OS_HOME_URL])),
+                                       JSON_BUILD_PAIR_FINITE_USEC("OperatingSystemSupportEnd", eol),
+                                       JSON_BUILD_PAIR("HardwareVendor", JSON_BUILD_STRING(vendor ?: c->data[PROP_HARDWARE_VENDOR])),
+                                       JSON_BUILD_PAIR("HardwareModel", JSON_BUILD_STRING(model ?: c->data[PROP_HARDWARE_MODEL])),
+                                       JSON_BUILD_PAIR("HardwareSerial", JSON_BUILD_STRING(serial)),
+                                       JSON_BUILD_PAIR("FirmwareVersion", JSON_BUILD_STRING(firmware_version)),
+                                       JSON_BUILD_PAIR("FirmwareVendor", JSON_BUILD_STRING(firmware_vendor)),
+                                       JSON_BUILD_PAIR_FINITE_USEC("FirmwareDate", firmware_date),
+                                       JSON_BUILD_PAIR_ID128("MachineID", machine_id),
+                                       JSON_BUILD_PAIR_ID128("BootID", boot_id),
+                                       JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(product_uuid), "ProductUUID", JSON_BUILD_ID128(product_uuid)),
+                                       JSON_BUILD_PAIR_CONDITION(sd_id128_is_null(product_uuid), "ProductUUID", JSON_BUILD_NULL)));
+
+        if (r < 0)
+                return log_error_errno(r, "Failed to build JSON data: %m");
+
+        r = json_variant_format(v, 0, &text);
+        if (r < 0)
+                return log_error_errno(r, "Failed to format JSON data: %m");
+
+        r = sd_bus_message_new_method_return(m, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(reply, "s", text);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* The D-Bus interface exported by this service: read-only properties first, then the methods.
+ *
+ * Properties registered with an offsetof(Context, data) based offset hand a pointer to the respective
+ * string field to their getter. The kernel properties instead pass an absolute offset into struct
+ * utsname (SD_BUS_VTABLE_ABSOLUTE_OFFSET), which property_get_uname_field() applies itself. All methods
+ * are flagged SD_BUS_VTABLE_UNPRIVILEGED; the handlers perform their own polkit checks. */
+static const sd_bus_vtable hostname_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        SD_BUS_PROPERTY("Hostname", "s", property_get_hostname, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("StaticHostname", "s", property_get_static_hostname, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("PrettyHostname", "s", property_get_machine_info_field, offsetof(Context, data) + sizeof(char*) * PROP_PRETTY_HOSTNAME, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("DefaultHostname", "s", property_get_default_hostname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("HostnameSource", "s", property_get_hostname_source, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("IconName", "s", property_get_icon_name, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Chassis", "s", property_get_chassis, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Deployment", "s", property_get_machine_info_field, offsetof(Context, data) + sizeof(char*) * PROP_DEPLOYMENT, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("Location", "s", property_get_machine_info_field, offsetof(Context, data) + sizeof(char*) * PROP_LOCATION, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("KernelName", "s", property_get_uname_field, offsetof(struct utsname, sysname), SD_BUS_VTABLE_ABSOLUTE_OFFSET|SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KernelRelease", "s", property_get_uname_field, offsetof(struct utsname, release), SD_BUS_VTABLE_ABSOLUTE_OFFSET|SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("KernelVersion", "s", property_get_uname_field, offsetof(struct utsname, version), SD_BUS_VTABLE_ABSOLUTE_OFFSET|SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OperatingSystemPrettyName", "s", property_get_os_release_field, offsetof(Context, data) + sizeof(char*) * PROP_OS_PRETTY_NAME, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OperatingSystemCPEName", "s", property_get_os_release_field, offsetof(Context, data) + sizeof(char*) * PROP_OS_CPE_NAME, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("OperatingSystemSupportEnd", "t", property_get_os_support_end, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        /* Note that this property is called "HomeURL" on the bus, while Describe() reports the same
+         * field as "OperatingSystemHomeURL". */
+        SD_BUS_PROPERTY("HomeURL", "s", property_get_os_release_field, offsetof(Context, data) + sizeof(char*) * PROP_OS_HOME_URL, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("HardwareVendor", "s", property_get_hardware_vendor, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("HardwareModel", "s", property_get_hardware_model, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FirmwareVersion", "s", property_get_firmware_version, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FirmwareVendor", "s", property_get_firmware_vendor, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FirmwareDate", "t", property_get_firmware_date, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MachineID", "ay", property_get_machine_id, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("BootID", "ay", property_get_boot_id, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+
+        /* Setters: each takes the new value plus the polkit "interactive" flag and returns nothing. */
+        SD_BUS_METHOD_WITH_ARGS("SetHostname",
+                                SD_BUS_ARGS("s", hostname, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_hostname,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetStaticHostname",
+                                SD_BUS_ARGS("s", hostname, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_static_hostname,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetPrettyHostname",
+                                SD_BUS_ARGS("s", hostname, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_pretty_hostname,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetIconName",
+                                SD_BUS_ARGS("s", icon, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_icon_name,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetChassis",
+                                SD_BUS_ARGS("s", chassis, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_chassis,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetDeployment",
+                                SD_BUS_ARGS("s", deployment, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_deployment,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("SetLocation",
+                                SD_BUS_ARGS("s", location, "b", interactive),
+                                SD_BUS_NO_RESULT,
+                                method_set_location,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        /* Getters for data that is not exposed as a property. */
+        SD_BUS_METHOD_WITH_ARGS("GetProductUUID",
+                                SD_BUS_ARGS("b", interactive),
+                                SD_BUS_RESULT("ay", uuid),
+                                method_get_product_uuid,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("GetHardwareSerial",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("s", serial),
+                                method_get_hardware_serial,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("Describe",
+                                SD_BUS_NO_ARGS,
+                                SD_BUS_RESULT("s", json),
+                                method_describe,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
+        SD_BUS_VTABLE_END,
+};
+
+/* Ties hostname_vtable to the object path and interface name it is exported under. Used both for
+ * registering the object on the bus (connect_bus()) and for command line parsing (run()). */
+static const BusObjectImplementation manager_object = {
+        "/org/freedesktop/hostname1",
+        "org.freedesktop.hostname1",
+        .vtables = BUS_VTABLES(hostname_vtable),
+};
+
+/* Sets up the system bus connection of the service: registers manager_object (with 'c' as userdata)
+ * and the log control API, asynchronously requests the "org.freedesktop.hostname1" name and attaches
+ * the connection to 'event'. On success the connection is handed to the caller via 'ret' and 0 is
+ * returned; on failure a negative errno-style value is returned and the connection is flushed, closed
+ * and released again. */
+static int connect_bus(Context *c, sd_event *event, sd_bus **ret) {
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL;
+        int r;
+
+        assert(c);
+        assert(event);
+        assert(ret);
+
+        r = sd_bus_default_system(&system_bus);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get system bus connection: %m");
+
+        /* Failures of the two registrations are passed up without an additional log message here. */
+        r = bus_add_implementation(system_bus, &manager_object, c);
+        if (r >= 0)
+                r = bus_log_control_api_register(system_bus);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_request_name_async(system_bus, NULL, "org.freedesktop.hostname1", 0, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to request name: %m");
+
+        r = sd_bus_attach_event(system_bus, event, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach bus to event loop: %m");
+
+        *ret = TAKE_PTR(system_bus);
+        return 0;
+}
+
+/* Entry point of the service: parses the command line, sets up signal handling, the event loop and
+ * the bus connection, and then runs the event loop via bus_event_loop_with_idle() with
+ * DEFAULT_EXIT_USEC as idle timeout. Returns 0 on success and a negative errno-style value on
+ * failure. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(context_destroy) Context context = {
+                .hostname_source = _HOSTNAME_INVALID, /* appropriate value will be set later */
+        };
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        log_setup();
+
+        /* A result of zero ends the program right away, with success. NOTE(review): presumably this is
+         * the case where an informational option was handled — confirm against service_parse_argv(). */
+        r = service_parse_argv("systemd-hostnamed.service",
+                               "Manage the system hostname and related metadata.",
+                               BUS_IMPLEMENTATIONS(&manager_object,
+                                                   &log_control_object),
+                               argc, argv);
+        if (r <= 0)
+                return r;
+
+        umask(0022);
+
+        r = mac_init();
+        if (r < 0)
+                return r;
+
+        /* Block SIGTERM and SIGINT; both are registered with the event loop further down. */
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
+
+        r = sd_event_default(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+
+        /* Best effort: a failure to enable the watchdog is ignored. */
+        (void) sd_event_set_watchdog(event, true);
+
+        r = sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to install SIGINT handler: %m");
+
+        r = sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to install SIGTERM handler: %m");
+
+        r = connect_bus(&context, event, &bus);
+        if (r < 0)
+                return r;
+
+        r = bus_event_loop_with_idle(event, bus, "org.freedesktop.hostname1", DEFAULT_EXIT_USEC, NULL, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        return 0;
+}
+
+/* NOTE(review): presumably expands to the program's main(), which invokes run() — confirm against the
+ * macro definition. */
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/hostname/meson.build b/src/hostname/meson.build
new file mode 100644
index 0000000..27c9b84
--- /dev/null
+++ b/src/hostname/meson.build
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-hostnamed',
+                'dbus' : true,
+                'conditions' : ['ENABLE_HOSTNAMED'],
+                'sources' : files('hostnamed.c'),
+        },
+        executable_template + {
+                'name' : 'hostnamectl',
+                'public' : true,
+                'conditions' : ['ENABLE_HOSTNAMED'],
+                'sources' : files('hostnamectl.c'),
+        },
+]
+
+if conf.get('ENABLE_HOSTNAMED') == 1
+        install_data('org.freedesktop.hostname1.conf',
+                     install_dir : dbuspolicydir)
+        install_data('org.freedesktop.hostname1.service',
+                     install_dir : dbussystemservicedir)
+        install_data('org.freedesktop.hostname1.policy',
+                     install_dir : polkitpolicydir)
+endif
diff --git a/src/hostname/org.freedesktop.hostname1.conf b/src/hostname/org.freedesktop.hostname1.conf
new file mode 100644
index 0000000..5b70625
--- /dev/null
+++ b/src/hostname/org.freedesktop.hostname1.conf
@@ -0,0 +1,29 @@
+ 
+
+
+
+
+
+
+        
+                
+                
+                
+        
+
+        
+                
+                
+        
+
+
diff --git a/src/hostname/org.freedesktop.hostname1.policy b/src/hostname/org.freedesktop.hostname1.policy
new file mode 100644
index 0000000..bfc36d1
--- /dev/null
+++ b/src/hostname/org.freedesktop.hostname1.policy
@@ -0,0 +1,80 @@
+ 
+
+
+
+
+
+
+        The systemd Project
+        https://systemd.io
+
+        
+                Set hostname
+                Authentication is required to set the local hostname.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Set static hostname
+                Authentication is required to set the statically configured local hostname, as well as the pretty hostname.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+                org.freedesktop.hostname1.set-hostname org.freedesktop.hostname1.set-machine-info
+        
+
+        
+                Set machine information
+                Authentication is required to set local machine information.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Get product UUID
+                Authentication is required to get product UUID.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Get hardware serial number
+                Authentication is required to get hardware serial number.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+        
+                Get system description
+                Authentication is required to get system description.
+                
+                        auth_admin_keep
+                        auth_admin_keep
+                        auth_admin_keep
+                
+        
+
+
diff --git a/src/hostname/org.freedesktop.hostname1.service b/src/hostname/org.freedesktop.hostname1.service
new file mode 100644
index 0000000..1d6b9c8
--- /dev/null
+++ b/src/hostname/org.freedesktop.hostname1.service
@@ -0,0 +1,12 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.hostname1
+Exec=/bin/false
+User=root
+SystemdService=dbus-org.freedesktop.hostname1.service
diff --git a/src/hwdb/hwdb.c b/src/hwdb/hwdb.c
new file mode 100644
index 0000000..4287b1f
--- /dev/null
+++ b/src/hwdb/hwdb.c
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+
+#include "sd-hwdb.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "hwdb-util.h"
+#include "main-func.h"
+#include "pretty-print.h"
+#include "selinux-util.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+static const char *arg_hwdb_bin_dir = NULL; /* set to UDEVLIBEXECDIR by --usr, NULL otherwise */
+static const char *arg_root = NULL; /* -r/--root=; points at optarg, the string is not copied */
+static bool arg_strict = false; /* -s/--strict */
+
+static int verb_query(int argc, char *argv[], void *userdata) { /* "query" verb: look up argv[1] via hwdb_query() */
+        return hwdb_query(argv[1], arg_root); /* the verb table entry ("query", 2, 2) presumably guarantees that argv[1] exists */
+}
+
+static int verb_update(int argc, char *argv[], void *userdata) { /* "update" verb: run hwdb_update() with the parsed options */
+        return hwdb_update(arg_root, arg_hwdb_bin_dir, arg_strict, false); /* NOTE(review): meaning of the final "false" is not visible here */
+}
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Print the usage text of systemd-hwdb to stdout. Returns 0, or the result of log_oom() if the man page link cannot be built. */
+        r = terminal_urlify_man("systemd-hwdb", "8", &link);
+        if (r < 0)
+                return log_oom();
+        /* The arguments fill, in order: program name, highlight on, highlight off, man page link. */
+        printf("%s [OPTIONS...] COMMAND ...\n\n"
+               "%sUpdate or query the hardware database.%s\n"
+               "\nCommands:\n"
+               "  update          Update the hwdb database\n"
+               "  query MODALIAS  Query database and print result\n"
+               "\nOptions:\n"
+               "  -h --help       Show this help\n"
+               "     --version    Show package version\n"
+               "  -s --strict     When updating, return non-zero exit value on any parsing error\n"
+               "     --usr        Generate in " UDEVLIBEXECDIR " instead of /etc/udev\n"
+               "  -r --root=PATH  Alternative root path in the filesystem\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(),
+               ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_VERSION = 0x100, /* codes of long-only options start above the single-character range */
+                ARG_USR,
+        };
+
+        static const struct option options[] = {
+                { "help",     no_argument,       NULL, 'h'         },
+                { "version",  no_argument,       NULL, ARG_VERSION },
+                { "usr",      no_argument,       NULL, ARG_USR     },
+                { "strict",   no_argument,       NULL, 's'         },
+                { "root",     required_argument, NULL, 'r'         },
+                {}
+        };
+
+        int c;
+        /* Parse the options into the arg_* variables. Returns 1 if a verb shall be run, the result of help()/version() if one of those was requested, or -EINVAL if getopt_long() rejected the command line. */
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "sr:h", options, NULL)) >= 0)
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_USR:
+                        arg_hwdb_bin_dir = UDEVLIBEXECDIR;
+                        break;
+
+                case 's':
+                        arg_strict = true;
+                        break;
+
+                case 'r':
+                        arg_root = optarg; /* kept by reference, not duplicated */
+                        break;
+
+                case '?': /* getopt_long() reports rejected options this way */
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1;
+}
+
+static int hwdb_main(int argc, char *argv[]) {
+        static const Verb verbs[] = { /* NOTE(review): columns look like name, min args, max args, flags, handler; confirm against verbs.h */
+                { "update", 1, 1, 0, verb_update },
+                { "query",  2, 2, 0, verb_query  },
+                {},
+        };
+        /* Hand over to dispatch_verb() with the table above; no userdata is passed to the handlers. */
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+static int run(int argc, char *argv[]) {
+        int r;
+        /* Program logic wrapped by DEFINE_MAIN_FUNCTION(): set up logging, parse the options, call mac_init(), then dispatch the verb. */
+        log_parse_environment();
+        log_open();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0) /* parse_argv() returns 1 only when a verb shall be run */
+                return r;
+
+        r = mac_init();
+        if (r < 0)
+                return r;
+
+        return hwdb_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/hwdb/meson.build b/src/hwdb/meson.build
new file mode 100644
index 0000000..385ed85
--- /dev/null
+++ b/src/hwdb/meson.build
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-hwdb',
+                'public' : true,
+                'conditions' : ['ENABLE_HWDB'],
+                'sources' : files('hwdb.c'),
+                'link_with' : udev_link_with,
+                'install_rpath' : udev_rpath,
+        },
+]
diff --git a/src/id128/id128.c b/src/id128/id128.c
new file mode 100644
index 0000000..d726ab7
--- /dev/null
+++ b/src/id128/id128.c
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "build.h"
+#include "gpt.h"
+#include "id128-print.h"
+#include "main-func.h"
+#include "pretty-print.h"
+#include "strv.h"
+#include "format-table.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+static Id128PrettyPrintMode arg_mode = ID128_PRINT_ID128; /* output format; changed by -p/--pretty, -u/--uuid and -P/--value */
+static sd_id128_t arg_app = {}; /* application ID from -a/--app-specific=; all zeros means "not set" */
+static bool arg_value = false; /* -P/--value: print only the ID itself */
+
+static int verb_new(int argc, char **argv, void *userdata) { /* "new" verb */
+        return id128_print_new(arg_mode); /* the output format follows arg_mode */
+}
+
+static int verb_machine_id(int argc, char **argv, void *userdata) {
+        /* Prints the machine ID, or the ID derived from it for the application given via --app-specific=. */
+        bool app_specific = !sd_id128_is_null(arg_app);
+        sd_id128_t machine_id;
+        int ret;
+
+        ret = app_specific ? sd_id128_get_machine_app_specific(arg_app, &machine_id)
+                           : sd_id128_get_machine(&machine_id);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to get %smachine-ID: %m",
+                                       app_specific ? "app-specific " : "");
+
+        return id128_pretty_print(machine_id, arg_mode);
+}
+
+static int verb_boot_id(int argc, char **argv, void *userdata) {
+        /* Prints the boot ID, or the ID derived from it for the application given via --app-specific=. */
+        bool app_specific = !sd_id128_is_null(arg_app);
+        sd_id128_t boot_id;
+        int ret;
+
+        ret = app_specific ? sd_id128_get_boot_app_specific(arg_app, &boot_id)
+                           : sd_id128_get_boot(&boot_id);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to get %sboot-ID: %m",
+                                       app_specific ? "app-specific " : "");
+
+        return id128_pretty_print(boot_id, arg_mode);
+}
+
+static int verb_invocation_id(int argc, char **argv, void *userdata) {
+        sd_id128_t invocation_id;
+        int ret;
+
+        /* This verb cannot be combined with --app-specific=, hence refuse the combination up front. */
+        if (!sd_id128_is_null(arg_app))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Verb \"invocation-id\" cannot be combined with --app-specific=.");
+        ret = sd_id128_get_invocation(&invocation_id);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to get invocation-ID: %m");
+
+        return id128_pretty_print(invocation_id, arg_mode);
+}
+
+static int show_one(Table **table, const char *name, sd_id128_t uuid, bool first) {
+        sd_id128_t u;
+        int r;
+        /* Output one named ID in the selected mode: as a code sample (pretty), as a bare value (--value), or as a row added to *table, which is allocated on first use. */
+        assert(table);
+        /* With --app-specific= set, the ID derived from uuid and arg_app is shown instead of uuid itself. */
+        if (sd_id128_is_null(arg_app))
+                u = uuid;
+        else
+                assert_se(sd_id128_get_app_specific(uuid, arg_app, &u) == 0);
+
+        if (arg_mode == ID128_PRINT_PRETTY) {
+                _cleanup_free_ char *id = NULL;
+
+                id = strreplace(name, "-", "_");
+                if (!id)
+                        return log_oom();
+                /* Together with the replacement above this yields an upper-case name without dashes. */
+                ascii_strupper(id);
+
+                r = id128_pretty_print_sample(id, u);
+                if (r < 0)
+                        return r;
+                if (!first) /* NOTE(review): the empty line is printed after every sample except the first one */
+                        puts("");
+                return 0;
+        }
+
+        if (arg_value)
+                return id128_pretty_print(u, arg_mode);
+        /* Table mode: create the two-column table lazily. */
+        if (!*table) {
+                *table = table_new("name", "id");
+                if (!*table)
+                        return log_oom();
+                table_set_width(*table, 0);
+        }
+
+        return table_add_many(*table,
+                              TABLE_STRING, name,
+                              arg_mode == ID128_PRINT_ID128 ? TABLE_ID128 : TABLE_UUID,
+                              u);
+}
+
+static int verb_show(int argc, char **argv, void *userdata) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        int r;
+        /* "show" verb: without arguments every entry of gpt_partition_type_table is shown, otherwise each argument, which may be a UUID or a partition type name. */
+        argv = strv_skip(argv, 1);
+        if (strv_isempty(argv))
+                for (const GptPartitionType *e = gpt_partition_type_table; e->name; e++) {
+                        r = show_one(&table, e->name, e->uuid, e == gpt_partition_type_table);
+                        if (r < 0)
+                                return r;
+                }
+        else
+                STRV_FOREACH(p, argv) {
+                        sd_id128_t uuid;
+                        bool have_uuid;
+                        const char *id;
+
+                        /* Check if the argument is an actual UUID first */
+                        have_uuid = sd_id128_from_string(*p, &uuid) >= 0;
+                        /* A UUID for which no partition type name is found is labelled "XYZ". */
+                        if (have_uuid)
+                                id = gpt_partition_type_uuid_to_string(uuid) ?: "XYZ";
+                        else {
+                                GptPartitionType type;
+
+                                r = gpt_partition_type_from_string(*p, &type);
+                                if (r < 0)
+                                        return log_error_errno(r, "Unknown identifier \"%s\".", *p);
+
+                                uuid = type.uuid;
+                                id = *p;
+                        }
+
+                        r = show_one(&table, id, uuid, p == argv);
+                        if (r < 0)
+                                return r;
+                }
+        /* show_one() creates the table only in table mode, hence it may still be NULL here. */
+        if (table) {
+                r = table_print(table, NULL);
+                if (r < 0)
+                        return table_log_print_error(r);
+        }
+
+        return 0;
+}
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Print the usage text to stdout; every option accepted by parse_argv() is listed, including --version. Returns 0, or the result of log_oom(). */
+        r = terminal_urlify_man("systemd-id128", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...] COMMAND\n\n"
+               "%sGenerate and print 128-bit identifiers.%s\n"
+               "\nCommands:\n"
+               "  new                     Generate a new ID\n"
+               "  machine-id              Print the ID of current machine\n"
+               "  boot-id                 Print the ID of current boot\n"
+               "  invocation-id           Print the ID of current invocation\n"
+               "  show [NAME|UUID]        Print one or more UUIDs\n"
+               "  help                    Show this help\n"
+               "\nOptions:\n"
+               "  -h --help               Show this help\n"
+               "     --version            Show package version\n"
+               "  -p --pretty             Generate samples of program code\n"
+               "  -P --value              Only print the value\n"
+               "  -a --app-specific=ID    Generate app-specific IDs\n"
+               "  -u --uuid               Output in UUID format\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               ansi_highlight(), ansi_normal(),
+               link);
+
+        return 0;
+}
+
+static int verb_help(int argc, char **argv, void *userdata) { /* "help" verb: same output as -h/--help */
+        return help();
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_VERSION = 0x100, /* code of the long-only option, above the single-character range */
+        };
+
+        static const struct option options[] = {
+                { "help",         no_argument,       NULL, 'h'              },
+                { "version",      no_argument,       NULL, ARG_VERSION      },
+                { "pretty",       no_argument,       NULL, 'p'              },
+                { "value",        no_argument,       NULL, 'P'              },
+                { "app-specific", required_argument, NULL, 'a'              },
+                { "uuid",         no_argument,       NULL, 'u'              },
+                {},
+        };
+
+        int c, r;
+        /* Parse the options into arg_mode, arg_app and arg_value. Returns 1 if a verb shall be run, the result of help()/version() if requested, or a negative errno-style value on invalid input. */
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hpa:uP", options, NULL)) >= 0)
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+                /* -p and -P override each other: whichever comes last on the command line wins. */
+                case 'p':
+                        arg_mode = ID128_PRINT_PRETTY;
+                        arg_value = false;
+                        break;
+
+                case 'P':
+                        arg_value = true;
+                        if (arg_mode == ID128_PRINT_PRETTY)
+                                arg_mode = ID128_PRINT_ID128;
+                        break;
+
+                case 'a':
+                        r = id128_from_string_nonzero(optarg, &arg_app);
+                        if (r == -ENXIO) /* judging by the message, the result for an all-zero ID; confirm in id128_from_string_nonzero() */
+                                return log_error_errno(r, "Application ID cannot be all zeros.");
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse \"%s\" as application-ID: %m", optarg);
+                        break;
+
+                case 'u':
+                        arg_mode = ID128_PRINT_UUID;
+                        break;
+
+                case '?': /* getopt_long() reports rejected options this way */
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1;
+}
+
+static int id128_main(int argc, char *argv[]) {
+        static const Verb verbs[] = { /* NOTE(review): columns look like name, min args, max args, flags, handler; confirm against verbs.h */
+                { "new",            VERB_ANY, 1,        0,  verb_new           },
+                { "machine-id",     VERB_ANY, 1,        0,  verb_machine_id    },
+                { "boot-id",        VERB_ANY, 1,        0,  verb_boot_id       },
+                { "invocation-id",  VERB_ANY, 1,        0,  verb_invocation_id },
+                { "show",           VERB_ANY, VERB_ANY, 0,  verb_show          },
+                { "help",           VERB_ANY, VERB_ANY, 0,  verb_help          },
+                {}
+        };
+        /* Hand over to dispatch_verb() with the table above; no userdata is passed to the handlers. */
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+static int run(int argc, char *argv[]) {
+        int r;
+        /* Program logic wrapped by DEFINE_MAIN_FUNCTION(): set up logging, parse the options, then dispatch the verb. */
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0) /* parse_argv() returns 1 only when a verb shall be run */
+                return r;
+
+        return id128_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/id128/meson.build b/src/id128/meson.build
new file mode 100644
index 0000000..13798ae
--- /dev/null
+++ b/src/id128/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        executable_template + {
+                'name' : 'systemd-id128',
+                'public' : true,
+                'sources' : files('id128.c'),
+        },
+]
diff --git a/src/import/curl-util.c b/src/import/curl-util.c
new file mode 100644
index 0000000..94f718d
--- /dev/null
+++ b/src/import/curl-util.c
@@ -0,0 +1,384 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "alloc-util.h"
+#include "curl-util.h"
+#include "fd-util.h"
+#include "locale-util.h"
+#include "string-util.h"
+#include "version.h"
+
+static void curl_glue_check_finished(CurlGlue *g) {
+        int n_queued = 0; /* filled in by curl_multi_info_read(), not evaluated */
+        CURLMsg *m;
+
+        assert(g);
+
+        /* Fetch at most one message from the multi handle per call; anything else that is
+         * queued is left for later invocations. */
+        m = curl_multi_info_read(g->curl, &n_queued);
+        if (!m)
+                return;
+
+        /* Only a completed transfer is reported, and only if a callback is installed. */
+        if (m->msg == CURLMSG_DONE && g->on_finished)
+                g->on_finished(g, m->easy_handle, m->data.result);
+}
+
+static int curl_glue_on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        CurlGlue *g = ASSERT_PTR(userdata);
+        int ev_bitmask = 0, n_running = 0;
+
+        assert(s);
+
+        /* IO event source callback: forward socket activity on fd to curl. curl_multi_socket_action() takes
+         * a CURL_CSELECT_* bitmask (the CURL_POLL_* constants are meant for the socket callback only). */
+        if (revents & EPOLLIN)
+                ev_bitmask |= CURL_CSELECT_IN;
+        if (revents & EPOLLOUT)
+                ev_bitmask |= CURL_CSELECT_OUT;
+
+        /* If neither readable nor writable was reported, a bitmask of 0 is passed on. */
+        if (curl_multi_socket_action(g->curl, fd, ev_bitmask, &n_running) != CURLM_OK)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Failed to propagate IO event.");
+
+        /* The activity may have completed a transfer; report it to the owner. */
+        curl_glue_check_finished(g);
+        return 0;
+}
+
+static int curl_glue_socket_callback(CURL *curl, curl_socket_t s, int action, void *userdata, void *socketp) {
+        sd_event_source *io = socketp;
+        CurlGlue *g = ASSERT_PTR(userdata);
+        uint32_t events = 0;
+        int r;
+
+        assert(curl);
+
+        /* Socket callback of the multi handle: keeps one IO event source per curl socket. The source is
+         * tracked in g->ios (keyed by fd) and as curl's per-socket pointer. Returns 0 on success, -1 on error. */
+        if (action == CURL_POLL_REMOVE) {
+                if (io) {
+                        sd_event_source_disable_unref(io);
+                        hashmap_remove(g->ios, FD_TO_PTR(s));
+                }
+                return 0;
+        }
+
+        r = hashmap_ensure_allocated(&g->ios, &trivial_hash_ops);
+        if (r < 0) {
+                log_oom();
+                return -1;
+        }
+
+        if (action == CURL_POLL_IN)
+                events = EPOLLIN;
+        else if (action == CURL_POLL_OUT)
+                events = EPOLLOUT;
+        else if (action == CURL_POLL_INOUT)
+                events = EPOLLIN|EPOLLOUT;
+
+        if (io) {
+                if (sd_event_source_set_io_events(io, events) < 0)
+                        return -1;
+                if (sd_event_source_set_enabled(io, SD_EVENT_ON) < 0)
+                        return -1;
+        } else {
+                if (sd_event_add_io(g->event, &io, s, events, curl_glue_on_io, g) < 0)
+                        return -1;
+                (void) sd_event_source_set_description(io, "curl-io");
+                /* Every failure below drops the new source again and leaves no pointer to it behind. */
+                r = hashmap_put(g->ios, FD_TO_PTR(s), io);
+                if (r < 0) {
+                        log_oom();
+                        sd_event_source_unref(io);
+                        return -1;
+                }
+                if (curl_multi_assign(g->curl, s, io) != CURLM_OK) {
+                        hashmap_remove(g->ios, FD_TO_PTR(s));
+                        sd_event_source_unref(io);
+                        return -1;
+                }
+        }
+
+        return 0;
+}
+
+static int curl_glue_on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
+        CurlGlue *glue = ASSERT_PTR(userdata);
+        int n_running = 0;
+        CURLMcode code;
+
+        assert(s);
+        /* Timer source fired: tell curl that its timeout elapsed, then report a finished transfer, if any. */
+        code = curl_multi_socket_action(glue->curl, CURL_SOCKET_TIMEOUT, 0, &n_running);
+        if (code != CURLM_OK)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to propagate timeout.");
+        curl_glue_check_finished(glue);
+        return 0;
+}
+
+static int curl_glue_timer_callback(CURLM *curl, long timeout_ms, void *userdata) {
+        CurlGlue *glue = ASSERT_PTR(userdata);
+        usec_t delay;
+
+        assert(curl);
+
+        /* Timer callback of the multi handle: mirrors the timeout curl asks for in glue->timer.
+         * Returns 0 on success, -1 if the event source could not be configured. */
+        /* A negative timeout switches the timer off, if there is one. */
+        if (timeout_ms < 0) {
+                if (!glue->timer)
+                        return 0;
+                return sd_event_source_set_enabled(glue->timer, SD_EVENT_OFF) < 0 ? -1 : 0;
+        }
+
+        /* Milliseconds to microseconds, plus USEC_PER_MSEC - 1 on top. */
+        delay = (usec_t) timeout_ms * USEC_PER_MSEC + USEC_PER_MSEC - 1;
+
+        if (!glue->timer) {
+                /* First request: allocate the timer source. */
+                if (sd_event_add_time_relative(glue->event, &glue->timer, CLOCK_BOOTTIME, delay, 0, curl_glue_on_timer, glue) < 0)
+                        return -1;
+                (void) sd_event_source_set_description(glue->timer, "curl-timer");
+                return 0;
+        }
+
+        /* Re-arm the existing source as a one-shot timer. */
+        if (sd_event_source_set_time_relative(glue->timer, delay) < 0)
+                return -1;
+
+        return sd_event_source_set_enabled(glue->timer, SD_EVENT_ONESHOT) < 0 ? -1 : 0;
+}
+
+CurlGlue *curl_glue_unref(CurlGlue *g) {
+        /* Releases a CurlGlue object and the resources it holds. NULL is accepted and returned as is. */
+        if (!g)
+                return NULL;
+
+        /* The multi handle is cleaned up first, before the IO event sources are released. */
+        if (g->curl)
+                curl_multi_cleanup(g->curl);
+
+        /* Drop whatever IO event sources are still registered, then the map itself. */
+        for (sd_event_source *src; (src = hashmap_steal_first(g->ios)); )
+                sd_event_source_unref(src);
+        hashmap_free(g->ios);
+
+        sd_event_source_unref(g->timer);
+        sd_event_unref(g->event);
+        return mfree(g);
+}
+
+int curl_glue_new(CurlGlue **glue, sd_event *event) {
+        _cleanup_(curl_glue_unrefp) CurlGlue *g = NULL;
+        _cleanup_(curl_multi_cleanupp) CURLM *c = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+        int r;
+        /* Allocate a CurlGlue that connects a curl multi handle to an sd-event loop. If event is NULL the default event loop is used. On success the object is stored in *glue and 0 is returned. */
+        if (event)
+                e = sd_event_ref(event);
+        else {
+                r = sd_event_default(&e);
+                if (r < 0)
+                        return r;
+        }
+
+        c = curl_multi_init();
+        if (!c)
+                return -ENOMEM;
+
+        g = new(CurlGlue, 1);
+        if (!g)
+                return -ENOMEM;
+        /* From here on g holds the event loop reference and the multi handle. */
+        *g = (CurlGlue) {
+                .event = TAKE_PTR(e),
+                .curl = TAKE_PTR(c),
+        };
+        /* Register the socket and timer callbacks with the multi handle, with g as their userdata. */
+        if (curl_multi_setopt(g->curl, CURLMOPT_SOCKETDATA, g) != CURLM_OK)
+                return -EINVAL;
+
+        if (curl_multi_setopt(g->curl, CURLMOPT_SOCKETFUNCTION, curl_glue_socket_callback) != CURLM_OK)
+                return -EINVAL;
+
+        if (curl_multi_setopt(g->curl, CURLMOPT_TIMERDATA, g) != CURLM_OK)
+                return -EINVAL;
+
+        if (curl_multi_setopt(g->curl, CURLMOPT_TIMERFUNCTION, curl_glue_timer_callback) != CURLM_OK)
+                return -EINVAL;
+
+        *glue = TAKE_PTR(g);
+
+        return 0;
+}
+
+int curl_glue_make(CURL **ret, const char *url, void *userdata) {
+        _cleanup_(curl_easy_cleanupp) CURL *c = NULL;
+        const char *useragent;
+        /* Create a curl easy handle for url with the options set below; userdata is stored as the handle's CURLOPT_PRIVATE pointer. On success the handle is returned in *ret. */
+        assert(ret);
+        assert(url);
+
+        c = curl_easy_init();
+        if (!c)
+                return -ENOMEM;
+
+        if (DEBUG_LOGGING)
+                (void) curl_easy_setopt(c, CURLOPT_VERBOSE, 1L);
+
+        if (curl_easy_setopt(c, CURLOPT_URL, url) != CURLE_OK)
+                return -EIO;
+
+        if (curl_easy_setopt(c, CURLOPT_PRIVATE, userdata) != CURLE_OK)
+                return -EIO;
+        /* The user agent is "<program name>/<GIT_VERSION>". */
+        useragent = strjoina(program_invocation_short_name, "/" GIT_VERSION);
+        if (curl_easy_setopt(c, CURLOPT_USERAGENT, useragent) != CURLE_OK)
+                return -EIO;
+
+        if (curl_easy_setopt(c, CURLOPT_FOLLOWLOCATION, 1L) != CURLE_OK)
+                return -EIO;
+
+        if (curl_easy_setopt(c, CURLOPT_NOSIGNAL, 1L) != CURLE_OK)
+                return -EIO;
+        /* Low-speed limit: 60 (time) and 30 (limit); per libcurl's CURLOPT_LOW_SPEED_* documentation these are seconds and bytes per second. */
+        if (curl_easy_setopt(c, CURLOPT_LOW_SPEED_TIME, 60L) != CURLE_OK)
+                return -EIO;
+
+        if (curl_easy_setopt(c, CURLOPT_LOW_SPEED_LIMIT, 30L) != CURLE_OK)
+                return -EIO;
+        /* Allow only HTTP, HTTPS and FILE; libcurl 7.85.0 and newer take the list as a string. */
+#if LIBCURL_VERSION_NUM >= 0x075500 /* libcurl 7.85.0 */
+        if (curl_easy_setopt(c, CURLOPT_PROTOCOLS_STR, "HTTP,HTTPS,FILE") != CURLE_OK)
+#else
+        if (curl_easy_setopt(c, CURLOPT_PROTOCOLS, CURLPROTO_HTTP|CURLPROTO_HTTPS|CURLPROTO_FILE) != CURLE_OK)
+#endif
+                return -EIO;
+
+        *ret = TAKE_PTR(c);
+        return 0;
+}
+
+int curl_glue_add(CurlGlue *g, CURL *c) {
+        CURLMcode code;
+
+        assert(g);
+        assert(c);
+
+        code = curl_multi_add_handle(g->curl, c); /* attach the easy handle to the glue's multi handle */
+        return code == CURLM_OK ? 0 : -EIO;
+}
+
+void curl_glue_remove_and_free(CurlGlue *g, CURL *c) {
+        assert(g);
+
+        /* Detach c from the multi handle (if that still exists) and destroy it. A NULL c is a no-op. */
+        if (c) {
+                if (g->curl)
+                        curl_multi_remove_handle(g->curl, c);
+
+                curl_easy_cleanup(c);
+        }
+}
+
+struct curl_slist *curl_slist_new(const char *first, ...) {
+        struct curl_slist *list;
+        const char *item;
+        va_list args;
+
+        /* Build a curl string list from a NULL-terminated argument list. Returns NULL if first is
+         * NULL or if appending fails; in the latter case the partial list is freed again. */
+
+        if (!first)
+                return NULL;
+
+        list = curl_slist_append(NULL, first);
+        if (!list)
+                return NULL;
+
+        va_start(args, first);
+
+        while ((item = va_arg(args, const char*))) {
+                struct curl_slist *grown;
+
+                grown = curl_slist_append(list, item);
+                if (!grown) {
+                        /* The list built so far is still ours, release it. */
+                        curl_slist_free_all(list);
+                        list = NULL;
+                        break;
+                }
+
+                list = grown;
+        }
+
+        va_end(args);
+        return list;
+}
+
+int curl_header_strdup(const void *contents, size_t sz, const char *field, char **value) {
+        const char *start, *end;
+        char *copy;
+
+        /* If contents (sz bytes) starts with field, ignoring case, copy the rest with surrounding
+         * whitespace stripped into *value and return 1. Returns 0 if the field does not match or the
+         * rest contains a NUL byte, -ENOMEM if the allocation fails. */
+        start = memory_startswith_no_case(contents, sz, field);
+        if (!start)
+                return 0;
+
+        end = (const char*) contents + sz;
+        if (memchr(start, 0, end - start))
+                return 0;
+
+        /* Strip whitespace from the front ... */
+        for (; start < end && strchr(WHITESPACE, start[0]); start++)
+                ;
+
+        /* ... and from the back */
+        for (; end > start && strchr(WHITESPACE, end[-1]); end--)
+                ;
+
+        copy = strndup(start, end - start);
+        if (!copy)
+                return -ENOMEM;
+
+        *value = copy;
+        return 1;
+}
+
+int curl_parse_http_time(const char *t, usec_t *ret) {
+        _cleanup_(freelocalep) locale_t loc = (locale_t) 0;
+        struct tm tm = {}; /* strptime() only stores the fields it parses, so start from a defined state */
+        const char *e;
+        time_t v;
+
+        assert(t);
+        assert(ret);
+        /* Parse an HTTP date (three formats, names matched in the "C" locale) into usec since the epoch, UTC.
+         * Returns 0, -EINVAL if t cannot be parsed or lies before the epoch, or -errno from newlocale(). */
+        loc = newlocale(LC_TIME_MASK, "C", (locale_t) 0);
+        if (loc == (locale_t) 0)
+                return -errno;
+
+        /* RFC822 */
+        e = strptime_l(t, "%a, %d %b %Y %H:%M:%S %Z", &tm, loc);
+        if (!e || *e != 0)
+                /* RFC 850 */
+                e = strptime_l(t, "%A, %d-%b-%y %H:%M:%S %Z", &tm, loc);
+        if (!e || *e != 0)
+                /* ANSI C */
+                e = strptime_l(t, "%a %b %d %H:%M:%S %Y", &tm, loc);
+        if (!e || *e != 0)
+                return -EINVAL;
+        /* Covers timegm()'s error value (-1) and any pre-1970 time, which the cast below would turn into a bogus value. */
+        v = timegm(&tm);
+        if (v < 0)
+                return -EINVAL;
+        *ret = (usec_t) v * USEC_PER_SEC;
+        return 0;
+}
diff --git a/src/import/curl-util.h b/src/import/curl-util.h
new file mode 100644
index 0000000..6b4f992
--- /dev/null
+++ b/src/import/curl-util.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <curl/curl.h>
+#include <sys/types.h>
+
+#include "sd-event.h"
+
+#include "hashmap.h"
+#include "time-util.h"
+
+typedef struct CurlGlue CurlGlue;
+
+struct CurlGlue {
+        sd_event *event; /* event loop the IO and timer sources are attached to; a reference is held */
+        CURLM *curl; /* the curl multi handle */
+        sd_event_source *timer; /* timer source for the timeouts curl requests; allocated on first use */
+        Hashmap *ios; /* fd -> IO event source, one entry per socket curl asked to have watched */
+        /* Called with the easy handle and its result code when a transfer is reported as done; may be NULL. */
+        void (*on_finished)(CurlGlue *g, CURL *curl, CURLcode code);
+        void *userdata; /* NOTE(review): not accessed in curl-util.c, presumably for use by the owner of the glue */
+};
+
+int curl_glue_new(CurlGlue **glue, sd_event *event);
+CurlGlue* curl_glue_unref(CurlGlue *glue);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(CurlGlue*, curl_glue_unref);
+
+int curl_glue_make(CURL **ret, const char *url, void *userdata);
+int curl_glue_add(CurlGlue *g, CURL *c);
+void curl_glue_remove_and_free(CurlGlue *g, CURL *c);
+
+struct curl_slist *curl_slist_new(const char *first, ...) _sentinel_;
+int curl_header_strdup(const void *contents, size_t sz, const char *field, char **value);
+int curl_parse_http_time(const char *t, usec_t *ret);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(CURL*, curl_easy_cleanup, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(CURLM*, curl_multi_cleanup, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct curl_slist*, curl_slist_free_all, NULL);
diff --git a/src/import/export-raw.c b/src/import/export-raw.c
new file mode 100644
index 0000000..f425396
--- /dev/null
+++ b/src/import/export-raw.c
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/sendfile.h>
+
+#include "sd-daemon.h"
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "copy.h"
+#include "export-raw.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "import-common.h"
+#include "missing_fcntl.h"
+#include "ratelimit.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+
+#define COPY_BUFFER_SIZE (16*1024) /* chunk size for each sendfile() call and for the read() staging buffer */
+
+struct RawExport {
+        sd_event *event; /* event loop the output event source is attached to */
+
+        RawExportFinished on_finished; /* completion callback; when NULL the event loop is asked to exit instead */
+        void *userdata; /* passed back to on_finished */
+
+        char *path; /* private copy of the source path */
+
+        int input_fd; /* fd being read: a reflink snapshot if one could be taken, else the source file; closed on unref */
+        int output_fd; /* caller-supplied destination fd; not closed by raw_export_unref() */
+
+        ImportCompress compress; /* compressor state */
+
+        sd_event_source *output_event_source; /* I/O (EPOLLOUT) or defer source that drives raw_export_process() */
+
+        void *buffer; /* encoded data not yet written to output_fd */
+        size_t buffer_size; /* number of pending bytes in buffer */
+        size_t buffer_allocated; /* presumably the capacity of buffer; maintained by the import_compress*() calls */
+
+        uint64_t written_compressed; /* bytes written to output_fd so far */
+        uint64_t written_uncompressed; /* bytes consumed from input_fd so far */
+
+        unsigned last_percent; /* last progress value reported; UINT_MAX until the first report */
+        RateLimit progress_ratelimit; /* throttles progress reports */
+
+        struct stat st; /* fstat() of the source file; st_size is the progress denominator */
+
+        bool eof; /* read() on input_fd returned 0 */
+        bool tried_reflink; /* the reflink fast path was attempted and failed */
+        bool tried_sendfile; /* the sendfile fast path failed with an error other than EAGAIN */
+};
+
+/* Releases a RawExport and what it owns; NULL is accepted. The output fd is not closed here. */
+RawExport *raw_export_unref(RawExport *e) {
+        if (e) {
+                /* Same teardown order as ever: event source, compressor, event loop, input fd */
+                sd_event_source_unref(e->output_event_source);
+                import_compress_free(&e->compress);
+                sd_event_unref(e->event);
+                safe_close(e->input_fd);
+
+                free(e->buffer);
+                free(e->path);
+                return mfree(e);
+        }
+
+        return NULL;
+}
+
+/* Allocates a RawExport bound to 'event', or to the default event loop if 'event' is NULL. Stores it
+ * in *ret and returns 0 on success; returns -ENOMEM or the sd_event_default() error otherwise. */
+int raw_export_new(
+                RawExport **ret,
+                sd_event *event,
+                RawExportFinished on_finished,
+                void *userdata) {
+
+        _cleanup_(raw_export_unrefp) RawExport *re = NULL;
+
+        assert(ret);
+
+        re = new(RawExport, 1);
+        if (!re)
+                return -ENOMEM;
+
+        /* Both fds start out invalid; UINT_MAX never matches a real percentage, so the first value is not a duplicate */
+        *re = (RawExport) {
+                .on_finished = on_finished,
+                .userdata = userdata,
+                .input_fd = -EBADF,
+                .output_fd = -EBADF,
+                .progress_ratelimit = { 100 * USEC_PER_MSEC, 1 },
+                .last_percent = UINT_MAX,
+        };
+
+        if (!event) {
+                int r = sd_event_default(&re->event);
+                if (r < 0)
+                        return r;
+        } else
+                re->event = sd_event_ref(event);
+
+        *ret = TAKE_PTR(re);
+        return 0;
+}
+
+/* Reports export progress (notification plus log line) relative to the size of the source file. */
+static void raw_export_report_progress(RawExport *e) {
+        uint64_t done, total;
+        unsigned pct;
+
+        assert(e);
+
+        done = e->written_uncompressed;
+        total = (uint64_t) e->st.st_size;
+
+        /* The >= test also rules out a division by zero for an empty file */
+        pct = done >= total ? 100 : (unsigned) ((done * UINT64_C(100)) / total);
+
+        if (pct == e->last_percent || !ratelimit_below(&e->progress_ratelimit))
+                return; /* unchanged value (checked first), or rate limit hit */
+
+        sd_notifyf(false, "X_IMPORT_PROGRESS=%u%%", pct);
+        log_info("Exported %u%%.", pct);
+        e->last_percent = pct;
+}
+
+static int raw_export_process(RawExport *e) {
+        ssize_t l;
+        int r;
+        /* Performs one step of the export per event-loop wakeup. Always returns 0; the outcome is reported at "finish". */
+        assert(e);
+
+        if (!e->tried_reflink && e->compress.type == IMPORT_COMPRESS_UNCOMPRESSED) {
+
+                /* If we shall take an uncompressed snapshot we can
+                 * reflink source to destination directly. Let's see
+                 * if this works. */
+
+                r = reflink(e->input_fd, e->output_fd);
+                if (r >= 0) { /* the whole file was cloned in one go, nothing left to copy */
+                        r = 0;
+                        goto finish;
+                }
+
+                e->tried_reflink = true; /* don't attempt this again on later wakeups */
+        }
+
+        if (!e->tried_sendfile && e->compress.type == IMPORT_COMPRESS_UNCOMPRESSED) {
+                /* Second choice for uncompressed exports: sendfile(), at most COPY_BUFFER_SIZE bytes per wakeup */
+                l = sendfile(e->output_fd, e->input_fd, NULL, COPY_BUFFER_SIZE);
+                if (l < 0) {
+                        if (errno == EAGAIN)
+                                return 0; /* output not ready yet, try again on the next wakeup */
+
+                        e->tried_sendfile = true; /* any other error: use read()/write() below from now on */
+                } else if (l == 0) { /* nothing more to transfer: we are done */
+                        r = 0;
+                        goto finish;
+                } else {
+                        e->written_uncompressed += l;
+                        e->written_compressed += l;
+
+                        raw_export_report_progress(e);
+
+                        return 0;
+                }
+        }
+        /* Generic path: read and encode input until there is something to write, or finish once EOF was seen and all output is flushed */
+        while (e->buffer_size <= 0) {
+                uint8_t input[COPY_BUFFER_SIZE];
+
+                if (e->eof) {
+                        r = 0;
+                        goto finish;
+                }
+
+                l = read(e->input_fd, input, sizeof(input));
+                if (l < 0) {
+                        r = log_error_errno(errno, "Failed to read raw file: %m");
+                        goto finish;
+                }
+
+                if (l == 0) {
+                        e->eof = true; /* end of input: let the compressor emit what it still holds */
+                        r = import_compress_finish(&e->compress, &e->buffer, &e->buffer_size, &e->buffer_allocated);
+                } else {
+                        e->written_uncompressed += l;
+                        r = import_compress(&e->compress, input, l, &e->buffer, &e->buffer_size, &e->buffer_allocated);
+                }
+                if (r < 0) {
+                        r = log_error_errno(r, "Failed to encode: %m");
+                        goto finish;
+                }
+        }
+
+        l = write(e->output_fd, e->buffer, e->buffer_size);
+        if (l < 0) {
+                if (errno == EAGAIN)
+                        return 0; /* keep the buffer and retry on the next wakeup */
+
+                r = log_error_errno(errno, "Failed to write output file: %m");
+                goto finish;
+        }
+        /* A short write is fine: move the unwritten tail to the front of the buffer */
+        assert((size_t) l <= e->buffer_size);
+        memmove(e->buffer, (uint8_t*) e->buffer + l, e->buffer_size - l);
+        e->buffer_size -= l;
+        e->written_compressed += l;
+
+        raw_export_report_progress(e);
+
+        return 0;
+
+finish:
+        if (r >= 0) { /* on success, copy timestamps and xattrs to the output; failures here are ignored */
+                (void) copy_times(e->input_fd, e->output_fd, COPY_CRTIME);
+                (void) copy_xattr(e->input_fd, NULL, e->output_fd, NULL, 0);
+        }
+
+        if (e->on_finished) /* hand the result to the owner, or else make the event loop exit with it */
+                e->on_finished(e, r, e->userdata);
+        else
+                sd_event_exit(e->event, r);
+
+        return 0;
+}
+
+/* I/O event callback: advances the RawExport passed as 'userdata' by one step. */
+static int raw_export_on_output(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        /* 's', 'fd' and 'revents' are not needed */
+        return raw_export_process(userdata);
+}
+
+/* Defer event callback: advances the RawExport passed as 'userdata' by one step. */
+static int raw_export_on_defer(sd_event_source *s, void *userdata) {
+        /* 's' is not needed */
+        return raw_export_process(userdata);
+}
+
+/* Reflinks 'fd' into a fresh nameless (or already unlinked) file; returns its fd, or a negative errno-style value. */
+static int reflink_snapshot(int fd, const char *path) {
+        _cleanup_close_ int snapshot_fd = -EBADF;
+        int r;
+
+        snapshot_fd = open_parent(path, O_TMPFILE|O_CLOEXEC|O_RDWR, 0600);
+        if (snapshot_fd < 0) {
+                /* O_TMPFILE didn't work out: create a randomly named file instead and unlink it right away */
+                _cleanup_free_ char *tmp = NULL;
+
+                r = tempfn_random(path, NULL, &tmp);
+                if (r < 0)
+                        return r;
+
+                snapshot_fd = open(tmp, O_CLOEXEC|O_CREAT|O_NOCTTY|O_RDWR, 0600);
+                if (snapshot_fd < 0)
+                        return -errno;
+                (void) unlink(tmp);
+        }
+
+        r = reflink(fd, snapshot_fd);
+        if (r < 0)
+                return r;
+
+        return TAKE_FD(snapshot_fd);
+}
+
+int raw_export_start(RawExport *e, const char *path, int fd, ImportCompressType compress) {
+        _cleanup_close_ int sfd = -EBADF, tfd = -EBADF;
+        int r;
+        /* Begins exporting the regular file 'path' to 'fd', encoded as 'compress'. Returns >= 0 on success, negative errno-style on failure. */
+        assert(e);
+        assert(path);
+        assert(fd >= 0);
+        assert(compress < _IMPORT_COMPRESS_TYPE_MAX);
+        assert(compress != IMPORT_COMPRESS_UNKNOWN);
+
+        if (e->output_fd >= 0)
+                return -EBUSY; /* this object already has an export running */
+
+        r = fd_nonblock(fd, true); /* writes are driven from the event loop and must not block */
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&e->path, path);
+        if (r < 0)
+                return r;
+
+        sfd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (sfd < 0)
+                return -errno;
+
+        if (fstat(sfd, &e->st) < 0)
+                return -errno;
+        r = stat_verify_regular(&e->st);
+        if (r < 0)
+                return r;
+
+        /* Try to take a reflink snapshot of the file, if we can, to make the export atomic */
+        tfd = reflink_snapshot(sfd, path);
+        if (tfd >= 0)
+                e->input_fd = TAKE_FD(tfd);
+        else
+                e->input_fd = TAKE_FD(sfd); /* no snapshot possible: read the source file directly */
+
+        r = import_compress_init(&e->compress, compress);
+        if (r < 0)
+                return r;
+        /* Prefer an EPOLLOUT watch on the output fd; if adding it is refused with EPERM, use an always-enabled defer source instead */
+        r = sd_event_add_io(e->event, &e->output_event_source, fd, EPOLLOUT, raw_export_on_output, e);
+        if (r == -EPERM) {
+                r = sd_event_add_defer(e->event, &e->output_event_source, raw_export_on_defer, e);
+                if (r < 0)
+                        return r;
+
+                r = sd_event_source_set_enabled(e->output_event_source, SD_EVENT_ON);
+        }
+        if (r < 0)
+                return r;
+
+        e->output_fd = fd; /* marks the export as started; raw_export_unref() does not close this fd */
+        return r;
+}
diff --git a/src/import/export-raw.h b/src/import/export-raw.h
new file mode 100644
index 0000000..27009e4
--- /dev/null
+++ b/src/import/export-raw.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "import-compress.h"
+#include "macro.h"
+
+typedef struct RawExport RawExport;
+/* Completion callback: 'error' is 0 on success or a negative errno-style value otherwise. */
+typedef void (*RawExportFinished)(RawExport *export, int error, void *userdata);
+
+int raw_export_new(RawExport **export, sd_event *event, RawExportFinished on_finished, void *userdata); /* event == NULL selects the default event loop */
+RawExport* raw_export_unref(RawExport *export);
+/* Generates raw_export_unrefp() for use with _cleanup_(). */
+DEFINE_TRIVIAL_CLEANUP_FUNC(RawExport*, raw_export_unref);
+
+int raw_export_start(RawExport *export, const char *path, int fd, ImportCompressType compress); /* 'fd' is the destination; -EBUSY if already started */
diff --git a/src/import/export-tar.c b/src/import/export-tar.c
new file mode 100644
index 0000000..9e92bad
--- /dev/null
+++ b/src/import/export-tar.c
@@ -0,0 +1,326 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-daemon.h"
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "export-tar.h"
+#include "fd-util.h"
+#include "import-common.h"
+#include "process-util.h"
+#include "ratelimit.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+
+#define COPY_BUFFER_SIZE (16*1024) /* chunk size for each splice() call and for the read() staging buffer */
+
+struct TarExport {
+        sd_event *event; /* event loop the output event source is attached to */
+
+        TarExportFinished on_finished; /* completion callback; when NULL the event loop is asked to exit instead */
+        void *userdata; /* passed back to on_finished */
+
+        char *path; /* private copy of the source directory path */
+        char *temp_path; /* path of the temporary snapshot, if one was made; the subvolume is removed on unref */
+
+        int output_fd; /* caller-supplied destination fd; not closed by tar_export_unref() */
+        int tar_fd; /* fd returned by import_fork_tar_c(); the tar stream is read from it */
+
+        ImportCompress compress; /* compressor state */
+
+        sd_event_source *output_event_source; /* I/O (EPOLLOUT) or defer source that drives tar_export_process() */
+
+        void *buffer; /* encoded data not yet written to output_fd */
+        size_t buffer_size; /* number of pending bytes in buffer */
+        size_t buffer_allocated; /* presumably the capacity of buffer; maintained by the import_compress*() calls */
+
+        uint64_t written_compressed; /* bytes written to output_fd so far */
+        uint64_t written_uncompressed; /* bytes taken from tar_fd so far */
+
+        pid_t tar_pid; /* tar child; waited for in tar_export_finish(), passed to sigkill_wait() on unref if still > 1 */
+
+        struct stat st; /* fstat() of the source directory */
+        uint64_t quota_referenced; /* "referenced" value of the btrfs subtree quota, UINT64_MAX if unknown; progress denominator */
+
+        unsigned last_percent; /* last progress value reported; UINT_MAX until the first report */
+        RateLimit progress_ratelimit; /* throttles progress reports */
+
+        bool eof; /* read() on tar_fd returned 0 */
+        bool tried_splice; /* the splice fast path failed with an error other than EAGAIN */
+};
+
+/* Releases a TarExport: calls sigkill_wait() on a remaining tar child, removes the temporary snapshot
+ * subvolume and frees everything else. NULL is accepted. The output fd is not closed here. */
+TarExport *tar_export_unref(TarExport *e) {
+        if (e) {
+                sd_event_source_unref(e->output_event_source);
+                /* PIDs 0 and 1 are never signalled */
+                if (e->tar_pid > 1)
+                        sigkill_wait(e->tar_pid);
+
+                if (e->temp_path)
+                        (void) btrfs_subvol_remove(e->temp_path, BTRFS_REMOVE_QUOTA);
+                free(e->temp_path);
+
+                import_compress_free(&e->compress);
+                sd_event_unref(e->event);
+                safe_close(e->tar_fd);
+
+                free(e->buffer);
+                free(e->path);
+                return mfree(e);
+        }
+
+        return NULL;
+}
+
+/* Allocates a TarExport bound to 'event', or to the default event loop if 'event' is NULL. Stores it
+ * in *ret and returns 0 on success; returns -ENOMEM or the sd_event_default() error otherwise. */
+int tar_export_new(
+                TarExport **ret,
+                sd_event *event,
+                TarExportFinished on_finished,
+                void *userdata) {
+
+        _cleanup_(tar_export_unrefp) TarExport *te = NULL;
+
+        assert(ret);
+
+        te = new(TarExport, 1);
+        if (!te)
+                return -ENOMEM;
+
+        /* Both fds start out invalid; UINT64_MAX as quota means "unknown", UINT_MAX as percentage "none reported yet" */
+        *te = (TarExport) {
+                .on_finished = on_finished,
+                .userdata = userdata,
+                .tar_fd = -EBADF,
+                .output_fd = -EBADF,
+                .quota_referenced = UINT64_MAX,
+                .progress_ratelimit = { 100 * USEC_PER_MSEC, 1 },
+                .last_percent = UINT_MAX,
+        };
+
+        if (!event) {
+                int r = sd_event_default(&te->event);
+                if (r < 0)
+                        return r;
+        } else
+                te->event = sd_event_ref(event);
+
+        *ret = TAKE_PTR(te);
+        return 0;
+}
+
+/* Reports export progress (notification plus log line) relative to the referenced quota value. */
+static void tar_export_report_progress(TarExport *e) {
+        uint64_t done, total;
+        unsigned pct;
+
+        assert(e);
+
+        /* Without quota information there is no total to measure against */
+        total = e->quota_referenced;
+        if (total == UINT64_MAX)
+                return;
+
+        /* The >= test also rules out a division by zero when the total is 0 */
+        done = e->written_uncompressed;
+        pct = done >= total ? 100 : (unsigned) ((done * UINT64_C(100)) / total);
+
+        /* An unchanged value is dropped before ratelimit_below() is even consulted */
+        if (pct == e->last_percent || !ratelimit_below(&e->progress_ratelimit))
+                return;
+
+        sd_notifyf(false, "X_IMPORT_PROGRESS=%u%%", pct);
+        log_info("Exported %u%%.", pct);
+        e->last_percent = pct;
+}
+
+/* Waits for the tar child (if there is one) and closes tar_fd. Returns 0 on success, the negative
+ * error from waiting, or -EPROTO if tar exited with a status other than EXIT_SUCCESS. */
+static int tar_export_finish(TarExport *e) {
+        assert(e);
+        assert(e->tar_fd >= 0);
+
+        if (e->tar_pid > 0) {
+                int status = wait_for_terminate_and_check("tar", TAKE_PID(e->tar_pid), WAIT_LOG);
+
+                if (status < 0)
+                        return status;
+                if (status != EXIT_SUCCESS)
+                        return -EPROTO;
+        }
+
+        e->tar_fd = safe_close(e->tar_fd);
+        return 0;
+}
+
+static int tar_export_process(TarExport *e) {
+        ssize_t l;
+        int r;
+        /* Performs one step of the export per event-loop wakeup. Always returns 0; the outcome is reported at "finish". */
+        assert(e);
+
+        if (!e->tried_splice && e->compress.type == IMPORT_COMPRESS_UNCOMPRESSED) {
+                /* Fast path for uncompressed exports: splice() from tar to the output, at most COPY_BUFFER_SIZE bytes per wakeup */
+                l = splice(e->tar_fd, NULL, e->output_fd, NULL, COPY_BUFFER_SIZE, 0);
+                if (l < 0) {
+                        if (errno == EAGAIN)
+                                return 0; /* nothing can be moved right now, try again on the next wakeup */
+
+                        e->tried_splice = true; /* any other error: use read()/write() below from now on */
+                } else if (l == 0) { /* tar's stream ended: collect the child and report the result */
+                        r = tar_export_finish(e);
+                        goto finish;
+                } else {
+                        e->written_uncompressed += l;
+                        e->written_compressed += l;
+
+                        tar_export_report_progress(e);
+
+                        return 0;
+                }
+        }
+        /* Generic path: read and encode tar output until there is something to write, or finish once EOF was seen and all output is flushed */
+        while (e->buffer_size <= 0) {
+                uint8_t input[COPY_BUFFER_SIZE];
+
+                if (e->eof) {
+                        r = tar_export_finish(e);
+                        goto finish;
+                }
+
+                l = read(e->tar_fd, input, sizeof(input));
+                if (l < 0) {
+                        r = log_error_errno(errno, "Failed to read tar file: %m");
+                        goto finish;
+                }
+
+                if (l == 0) {
+                        e->eof = true; /* end of input: let the compressor emit what it still holds */
+                        r = import_compress_finish(&e->compress, &e->buffer, &e->buffer_size, &e->buffer_allocated);
+                } else {
+                        e->written_uncompressed += l;
+                        r = import_compress(&e->compress, input, l, &e->buffer, &e->buffer_size, &e->buffer_allocated);
+                }
+                if (r < 0) {
+                        r = log_error_errno(r, "Failed to encode: %m");
+                        goto finish;
+                }
+        }
+
+        l = write(e->output_fd, e->buffer, e->buffer_size);
+        if (l < 0) {
+                if (errno == EAGAIN)
+                        return 0; /* keep the buffer and retry on the next wakeup */
+
+                r = log_error_errno(errno, "Failed to write output file: %m");
+                goto finish;
+        }
+        /* A short write is fine: move the unwritten tail to the front of the buffer */
+        assert((size_t) l <= e->buffer_size);
+        memmove(e->buffer, (uint8_t*) e->buffer + l, e->buffer_size - l);
+        e->buffer_size -= l;
+        e->written_compressed += l;
+
+        tar_export_report_progress(e);
+
+        return 0;
+
+finish:
+        if (e->on_finished) /* hand the result to the owner, or else make the event loop exit with it */
+                e->on_finished(e, r, e->userdata);
+        else
+                sd_event_exit(e->event, r);
+
+        return 0;
+}
+
+/* I/O event callback: advances the TarExport passed as 'userdata' by one step. */
+static int tar_export_on_output(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        /* 's', 'fd' and 'revents' are not needed */
+        return tar_export_process(userdata);
+}
+
+/* Defer event callback: advances the TarExport passed as 'userdata' by one step. */
+static int tar_export_on_defer(sd_event_source *s, void *userdata) {
+        /* 's' is not needed */
+        return tar_export_process(userdata);
+}
+
+int tar_export_start(TarExport *e, const char *path, int fd, ImportCompressType compress) {
+        _cleanup_close_ int sfd = -EBADF;
+        int r;
+        /* Begins exporting the directory 'path' as a tar stream to 'fd', encoded as 'compress'. Returns >= 0 on success, negative errno-style on failure. */
+        assert(e);
+        assert(path);
+        assert(fd >= 0);
+        assert(compress < _IMPORT_COMPRESS_TYPE_MAX);
+        assert(compress != IMPORT_COMPRESS_UNKNOWN);
+
+        if (e->output_fd >= 0)
+                return -EBUSY; /* this object already has an export running */
+
+        sfd = open(path, O_DIRECTORY|O_RDONLY|O_NOCTTY|O_CLOEXEC);
+        if (sfd < 0)
+                return -errno;
+
+        if (fstat(sfd, &e->st) < 0)
+                return -errno;
+
+        r = fd_nonblock(fd, true); /* writes are driven from the event loop and must not block */
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&e->path, path);
+        if (r < 0)
+                return r;
+
+        e->quota_referenced = UINT64_MAX; /* "unknown" until the quota query below succeeds */
+
+        if (btrfs_might_be_subvol(&e->st)) {
+                BtrfsQuotaInfo q;
+
+                r = btrfs_subvol_get_subtree_quota_fd(sfd, 0, &q);
+                if (r >= 0) /* failure is tolerated: progress is then simply not reported */
+                        e->quota_referenced = q.referenced;
+
+                e->temp_path = mfree(e->temp_path);
+
+                r = tempfn_random(path, NULL, &e->temp_path);
+                if (r < 0)
+                        return r;
+
+                /* Let's try to make a snapshot, if we can, so that the export is atomic */
+                r = btrfs_subvol_snapshot_at(sfd, NULL, AT_FDCWD, e->temp_path, BTRFS_SNAPSHOT_READ_ONLY|BTRFS_SNAPSHOT_RECURSIVE);
+                if (r < 0) {
+                        log_debug_errno(r, "Couldn't create snapshot %s of %s, not exporting atomically: %m", e->temp_path, path);
+                        e->temp_path = mfree(e->temp_path); /* NULL makes the tar invocation below use 'path' itself */
+                }
+        }
+
+        r = import_compress_init(&e->compress, compress);
+        if (r < 0)
+                return r;
+        /* Prefer an EPOLLOUT watch on the output fd; if adding it is refused with EPERM, use an always-enabled defer source instead */
+        r = sd_event_add_io(e->event, &e->output_event_source, fd, EPOLLOUT, tar_export_on_output, e);
+        if (r == -EPERM) {
+                r = sd_event_add_defer(e->event, &e->output_event_source, tar_export_on_defer, e);
+                if (r < 0)
+                        return r;
+
+                r = sd_event_source_set_enabled(e->output_event_source, SD_EVENT_ON);
+        }
+        if (r < 0)
+                return r;
+        /* Start tar last, on the snapshot if there is one and on the original path otherwise */
+        e->tar_fd = import_fork_tar_c(e->temp_path ?: e->path, &e->tar_pid);
+        if (e->tar_fd < 0) {
+                e->output_event_source = sd_event_source_unref(e->output_event_source); /* undo the registration made above */
+                return e->tar_fd;
+        }
+
+        e->output_fd = fd; /* marks the export as started; tar_export_unref() does not close this fd */
+        return r;
+}
diff --git a/src/import/export-tar.h b/src/import/export-tar.h
new file mode 100644
index 0000000..3b55d12
--- /dev/null
+++ b/src/import/export-tar.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "import-compress.h"
+#include "macro.h"
+
+typedef struct TarExport TarExport;
+/* Completion callback: 'error' is 0 on success or a negative errno-style value otherwise. */
+typedef void (*TarExportFinished)(TarExport *export, int error, void *userdata);
+
+int tar_export_new(TarExport **export, sd_event *event, TarExportFinished on_finished, void *userdata); /* event == NULL selects the default event loop */
+TarExport* tar_export_unref(TarExport *export);
+/* Generates tar_export_unrefp() for use with _cleanup_(). */
+DEFINE_TRIVIAL_CLEANUP_FUNC(TarExport*, tar_export_unref);
+
+int tar_export_start(TarExport *export, const char *path, int fd, ImportCompressType compress); /* 'fd' is the destination; -EBUSY if already started */
diff --git a/src/import/export.c b/src/import/export.c
new file mode 100644
index 0000000..7e941a2
--- /dev/null
+++ b/src/import/export.c
@@ -0,0 +1,303 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <locale.h>
+
+#include "sd-event.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "discover-image.h"
+#include "export-raw.h"
+#include "export-tar.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-util.h"
+#include "main-func.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+static ImportCompressType arg_compress = IMPORT_COMPRESS_UNKNOWN;
+
+/* Derives arg_compress from the suffix of 'p', unless a format was already selected. A NULL 'p'
+ * or an unrecognized suffix selects IMPORT_COMPRESS_UNCOMPRESSED. */
+static void determine_compression_from_filename(const char *p) {
+        ImportCompressType guess = IMPORT_COMPRESS_UNCOMPRESSED;
+
+        if (arg_compress != IMPORT_COMPRESS_UNKNOWN)
+                return;
+
+        if (p) {
+                if (endswith(p, ".xz"))
+                        guess = IMPORT_COMPRESS_XZ;
+                else if (endswith(p, ".gz"))
+                        guess = IMPORT_COMPRESS_GZIP;
+                else if (endswith(p, ".bz2"))
+                        guess = IMPORT_COMPRESS_BZIP2;
+        }
+
+        arg_compress = guess;
+}
+
+static int interrupt_signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        log_notice("Transfer aborted."); /* installed for SIGTERM and SIGINT; 'si' and 'userdata' are unused */
+        (void) sd_event_exit(sd_event_source_get_event(s), EINTR); /* EINTR becomes the loop's exit code */
+        return 0;
+}
+
+static void on_tar_finished(TarExport *export, int error, void *userdata) {
+        sd_event *event = userdata;
+        assert(export);
+
+        if (error == 0)
+                log_info("Operation completed successfully.");
+
+        sd_event_exit(event, abs(error));
+}
+
+static int export_tar(int argc, char *argv[], void *userdata) {
+        _cleanup_(tar_export_unrefp) TarExport *export = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(image_unrefp) Image *image = NULL;
+        const char *path = NULL, *local = NULL;
+        _cleanup_close_ int open_fd = -EBADF;
+        int r, fd;
+        /* Verb "tar NAME [FILE]": exports NAME as a tar stream into FILE, or to stdout when no usable FILE is given. */
+        if (hostname_is_valid(argv[1], 0)) { /* a valid hostname is looked up as a machine image, anything else is used as a path */
+                r = image_find(IMAGE_MACHINE, argv[1], NULL, &image);
+                if (r == -ENOENT)
+                        return log_error_errno(r, "Machine image %s not found.", argv[1]);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to look for machine %s: %m", argv[1]);
+
+                local = image->path;
+        } else
+                local = argv[1];
+
+        if (argc >= 3)
+                path = argv[2];
+        path = empty_or_dash_to_null(path);
+
+        determine_compression_from_filename(path); /* leaves arg_compress alone if --format= already set it */
+
+        if (path) {
+                open_fd = open(path, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC|O_NOCTTY, 0666);
+                if (open_fd < 0)
+                        return log_error_errno(errno, "Failed to open tar image for export: %m");
+
+                fd = open_fd; /* open_fd stays owned by this function and is closed on return */
+
+                log_info("Exporting '%s', saving to '%s' with compression '%s'.", local, path, import_compress_type_to_string(arg_compress));
+        } else {
+                _cleanup_free_ char *pretty = NULL;
+
+                fd = STDOUT_FILENO;
+
+                (void) fd_get_path(fd, &pretty); /* only used for the log message; strna() covers failure */
+                log_info("Exporting '%s', saving to '%s' with compression '%s'.", local, strna(pretty), import_compress_type_to_string(arg_compress));
+        }
+
+        r = sd_event_default(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+        /* Block SIGTERM/SIGINT and pick them up through the event loop instead, so that an interrupt makes the loop exit */
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
+        (void) sd_event_add_signal(event, NULL, SIGTERM, interrupt_signal_handler,  NULL);
+        (void) sd_event_add_signal(event, NULL, SIGINT, interrupt_signal_handler, NULL);
+
+        r = tar_export_new(&export, event, on_tar_finished, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate exporter: %m");
+
+        r = tar_export_start(export, local, fd, arg_compress);
+        if (r < 0)
+                return log_error_errno(r, "Failed to export image: %m");
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        log_info("Exiting.");
+        return -r; /* r >= 0 here; presumably the code given to sd_event_exit() (|error| or EINTR), handed back negated */
+}
+
+static void on_raw_finished(RawExport *export, int error, void *userdata) {
+        sd_event *event = userdata;
+        assert(export);
+
+        if (error == 0)
+                log_info("Operation completed successfully.");
+
+        sd_event_exit(event, abs(error));
+}
+
+static int export_raw(int argc, char *argv[], void *userdata) {
+        _cleanup_(raw_export_unrefp) RawExport *export = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(image_unrefp) Image *image = NULL;
+        const char *path = NULL, *local = NULL;
+        _cleanup_close_ int open_fd = -EBADF;
+        int r, fd;
+        /* Verb "raw NAME [FILE]": exports NAME as a raw image into FILE, or to stdout when no usable FILE is given. */
+        if (hostname_is_valid(argv[1], 0)) { /* a valid hostname is looked up as a machine image, anything else is used as a path */
+                r = image_find(IMAGE_MACHINE, argv[1], NULL, &image);
+                if (r == -ENOENT)
+                        return log_error_errno(r, "Machine image %s not found.", argv[1]);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to look for machine %s: %m", argv[1]);
+
+                local = image->path;
+        } else
+                local = argv[1];
+
+        if (argc >= 3)
+                path = argv[2];
+        path = empty_or_dash_to_null(path);
+
+        determine_compression_from_filename(path); /* leaves arg_compress alone if --format= already set it */
+
+        if (path) {
+                open_fd = open(path, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC|O_NOCTTY, 0666);
+                if (open_fd < 0)
+                        return log_error_errno(errno, "Failed to open raw image for export: %m");
+
+                fd = open_fd; /* open_fd stays owned by this function and is closed on return */
+
+                log_info("Exporting '%s', saving to '%s' with compression '%s'.", local, path, import_compress_type_to_string(arg_compress));
+        } else {
+                _cleanup_free_ char *pretty = NULL;
+
+                fd = STDOUT_FILENO;
+
+                (void) fd_get_path(fd, &pretty); /* only used for the log message; strna() covers failure */
+                log_info("Exporting '%s', saving to '%s' with compression '%s'.", local, strna(pretty), import_compress_type_to_string(arg_compress));
+        }
+
+        r = sd_event_default(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+        /* Block SIGTERM/SIGINT and pick them up through the event loop instead, so that an interrupt makes the loop exit */
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
+        (void) sd_event_add_signal(event, NULL, SIGTERM, interrupt_signal_handler,  NULL);
+        (void) sd_event_add_signal(event, NULL, SIGINT, interrupt_signal_handler, NULL);
+
+        r = raw_export_new(&export, event, on_raw_finished, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate exporter: %m");
+
+        r = raw_export_start(export, local, fd, arg_compress);
+        if (r < 0)
+                return log_error_errno(r, "Failed to export image: %m");
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        log_info("Exiting.");
+        return -r; /* r >= 0 here; presumably the code given to sd_event_exit() (|error| or EINTR), handed back negated */
+}
+
+static int help(int argc, char *argv[], void *userdata) {
+        printf("%1$s [OPTIONS...] {COMMAND} ...\n"
+               "\n%4$sExport container or virtual machine images.%5$s\n"
+               "\n%2$sCommands:%3$s\n"
+               "  tar NAME [FILE]              Export a TAR image\n"
+               "  raw NAME [FILE]              Export a RAW image\n"
+               "\n%2$sOptions:%3$s\n"
+               "  -h --help                    Show this help\n"
+               "     --version                 Show package version\n"
+               "     --format=FORMAT           Select format\n\n",
+               program_invocation_short_name, /* %1$s */
+               ansi_underline(),              /* %2$s: opens the section headings */
+               ansi_normal(),                 /* %3$s: closes the section headings */
+               ansi_highlight(),              /* %4$s: opens the summary line */
+               ansi_normal());                /* %5$s: closes the summary line */
+        /* Serves both as the "help" verb and for -h/--help; none of the parameters are read. */
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        /* Parses the options. Returns 1 to go on with the verb, the result of help()/version() for -h/--version, < 0 on error. */
+        enum {
+                ARG_VERSION = 0x100, /* starts above the char range, so it cannot collide with a short option */
+                ARG_FORMAT,
+        };
+
+        static const struct option options[] = {
+                { "help",    no_argument,       NULL, 'h'         },
+                { "version", no_argument,       NULL, ARG_VERSION },
+                { "format",  required_argument, NULL, ARG_FORMAT  },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_FORMAT: /* an explicit format takes precedence over guessing from the file name */
+                        if (streq(optarg, "uncompressed"))
+                                arg_compress = IMPORT_COMPRESS_UNCOMPRESSED;
+                        else if (streq(optarg, "xz"))
+                                arg_compress = IMPORT_COMPRESS_XZ;
+                        else if (streq(optarg, "gzip"))
+                                arg_compress = IMPORT_COMPRESS_GZIP;
+                        else if (streq(optarg, "bzip2"))
+                                arg_compress = IMPORT_COMPRESS_BZIP2;
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Unknown format: %s", optarg);
+                        break;
+
+                case '?': /* unknown option or missing argument, as flagged by getopt_long() */
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1; /* options are fine, proceed with the verb */
+}
+
+static int export_main(int argc, char *argv[]) {
+        static const Verb verbs[] = { /* name, then presumably min/max argument count (verb included), flags, handler — confirm in verbs.h */
+                { "help", VERB_ANY, VERB_ANY, 0, help       },
+                { "tar",  2,        3,        0, export_tar }, /* tar NAME [FILE] */
+                { "raw",  2,        3,        0, export_raw }, /* raw NAME [FILE] */
+                {}
+        };
+
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+/* Sets up locale and logging, parses the options and then dispatches the requested verb. */
+static int run(int argc, char *argv[]) {
+        int parsed;
+
+        setlocale(LC_ALL, "");
+        log_parse_environment();
+        log_open();
+
+        parsed = parse_argv(argc, argv);
+        if (parsed <= 0) /* an error, or an informational option such as --help was already served */
+                return parsed;
+
+        (void) ignore_signals(SIGPIPE);
+        return export_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/import/import-common.c b/src/import/import-common.c
new file mode 100644
index 0000000..319aa07
--- /dev/null
+++ b/src/import/import-common.c
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "capability-util.h"
+#include "chattr-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "os-util.h"
+#include "process-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "tmpfile-util.h"
+
+int import_fork_tar_x(const char *path, pid_t *ret) {
+        _cleanup_close_pair_ int tar_pipe[2] = EBADF_PAIR;
+        pid_t child;
+        bool selinux;
+        int r;
+
+        /* Forks off tar to unpack an archive into 'path'. Returns the write end of the pipe connected
+         * to the child (tar is told to read the archive from "-") and stores the child's PID in *ret.
+         * A negative value is returned if setting up the pipe or forking fails. */
+
+        assert(path);
+        assert(ret);
+
+        if (pipe2(tar_pipe, O_CLOEXEC) < 0)
+                return log_error_errno(errno, "Failed to create pipe for tar: %m");
+
+        selinux = mac_selinux_use();
+
+        r = safe_fork_full("(tar)",
+                           (int[]) { tar_pipe[0], -EBADF, STDERR_FILENO },
+                           NULL, 0,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG, &child);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                uint64_t keep_caps =
+                        (1ULL << CAP_CHOWN) | (1ULL << CAP_FOWNER) | (1ULL << CAP_FSETID) |
+                        (1ULL << CAP_MKNOD) | (1ULL << CAP_SETFCAP) | (1ULL << CAP_DAC_OVERRIDE);
+
+                const char *tar_argv[] = {
+                        "tar",
+                        "--ignore-zeros",
+                        "--numeric-owner",
+                        "-C", path,
+                        "-pxf",
+                        "-",
+                        "--xattrs",
+                        "--xattrs-include=*",
+                        selinux ? "--selinux" : "--no-selinux",
+                        NULL
+                };
+
+                /* Child: confinement below is best-effort, failures only produce a warning. */
+
+                if (unshare(CLONE_NEWNET) < 0)
+                        log_warning_errno(errno, "Failed to lock tar into network namespace, ignoring: %m");
+
+                r = capability_bounding_set_drop(keep_caps, true);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to drop capabilities, ignoring: %m");
+
+                /* Prefer "gtar" over "tar": upstream only tests against GNU tar. Some distributions
+                 * ship a different implementation under the name "tar" (in particular one that does
+                 * not understand the same command line switches) while offering the GNU one as
+                 * "gtar". Trying "gtar" first gives the same behaviour/implementation everywhere.
+                 * execvp() only returns on failure, in which case plain "tar" is tried next, and if
+                 * that fails too the error is logged and the child exits. */
+                execvp("gtar", (char* const*) tar_argv);
+                execvp("tar", (char* const*) tar_argv);
+
+                log_error_errno(errno, "Failed to execute tar: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        *ret = child;
+
+        return TAKE_FD(tar_pipe[1]);
+}
+
+int import_fork_tar_c(const char *path, pid_t *ret) {
+        _cleanup_close_pair_ int pipefd[2] = EBADF_PAIR;
+        bool use_selinux;
+        pid_t pid;
+        int r;
+
+        /* Forks off tar to pack up 'path'. Returns the read end of the pipe tar writes the archive to and
+         * stores the child's PID in *ret; returns a negative value if creating the pipe or forking fails. */
+        assert(path);
+        assert(ret);
+
+        if (pipe2(pipefd, O_CLOEXEC) < 0)
+                return log_error_errno(errno, "Failed to create pipe for tar: %m");
+
+        use_selinux = mac_selinux_use();
+
+        r = safe_fork_full("(tar)",
+                           (int[]) { -EBADF, pipefd[1], STDERR_FILENO },
+                           NULL, 0,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                const char *cmdline[] = {
+                        "tar",
+                        "-C", path,
+                        "-c",
+                        "--xattrs",
+                        "--xattrs-include=*",
+                        use_selinux ? "--selinux" : "--no-selinux",
+                        ".",
+                        NULL
+                };
+
+                uint64_t retain = (1ULL << CAP_DAC_OVERRIDE); /* passed to capability_bounding_set_drop() below */
+
+                /* Child: the confinement below is best-effort, failures are only warned about (as in import_fork_tar_x()) */
+                if (unshare(CLONE_NEWNET) < 0)
+                        log_warning_errno(errno, "Failed to lock tar into network namespace, ignoring: %m");
+
+                r = capability_bounding_set_drop(retain, true);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to drop capabilities, ignoring: %m");
+
+                execvp("gtar", (char* const*) cmdline);
+                execvp("tar", (char* const*) cmdline);
+                log_error_errno(errno, "Failed to execute tar: %m"); /* only reached if neither exec worked */
+                _exit(EXIT_FAILURE);
+        }
+
+        *ret = pid;
+
+        return TAKE_FD(pipefd[0]);
+}
+
+int import_mangle_os_tree(const char *path) {
+        _cleanup_free_ char *child = NULL, *t = NULL, *joined = NULL;
+        _cleanup_closedir_ DIR *d = NULL, *cd = NULL;
+        struct dirent *dent;
+        struct stat st;
+        int r;
+
+        assert(path);
+
+        /* Some tarballs contain a single top-level directory that contains the actual OS directory tree. Try to
+         * recognize this, and move the tree one level up. Returns 0 if done or nothing to do, negative on error. */
+
+        r = path_is_os_tree(path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether '%s' is an OS tree: %m", path);
+        if (r > 0) {
+                log_debug("Directory tree '%s' is a valid OS tree.", path);
+                return 0;
+        }
+
+        log_debug("Directory tree '%s' is not recognizable as OS tree, checking whether to rearrange it.", path);
+
+        d = opendir(path);
+        if (!d) /* opendir() reports the reason via errno; 'r' is 0 at this point */
+                return log_error_errno(errno, "Failed to open directory '%s': %m", path);
+
+        errno = 0;
+        dent = readdir_no_dot(d);
+        if (!dent) {
+                if (errno != 0)
+                        return log_error_errno(errno, "Failed to iterate through directory '%s': %m", path);
+
+                log_debug("Directory '%s' is empty, leaving it as it is.", path);
+                return 0;
+        }
+
+        child = strdup(dent->d_name);
+        if (!child)
+                return log_oom();
+
+        errno = 0;
+        dent = readdir_no_dot(d);
+        if (dent) {
+                log_debug("Directory '%s' does not look like an OS tree, and has multiple children, leaving as it is.", path);
+                return 0;
+        }
+        /* NULL with errno set means reading failed, as opposed to having reached the end of the directory. */
+        if (errno != 0)
+                return log_error_errno(errno, "Failed to iterate through directory '%s': %m", path);
+
+        if (fstatat(dirfd(d), child, &st, AT_SYMLINK_NOFOLLOW) < 0)
+                return log_debug_errno(errno, "Failed to stat file '%s/%s': %m", path, child);
+        r = stat_verify_directory(&st);
+        if (r < 0) {
+                log_debug_errno(r, "Child '%s' of directory '%s' is not a directory, leaving things as they are.", child, path);
+                return 0;
+        }
+
+        joined = path_join(path, child);
+        if (!joined)
+                return log_oom();
+        r = path_is_os_tree(joined);
+        if (r == -ENOTDIR) {
+                log_debug("Directory '%s' does not look like an OS tree, and contains a single regular file only, leaving as it is.", path);
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether '%s' is an OS tree: %m", joined);
+        if (r == 0) {
+                log_debug("Neither '%s' nor '%s' is a valid OS tree, leaving them as they are.", path, joined);
+                return 0;
+        }
+
+        /* Nice, we have checked now:
+         *
+         * 1. The top-level directory does not qualify as OS tree
+         * 2. The top-level directory only contains one item
+         * 3. That item is a directory
+         * 4. And that directory qualifies as OS tree
+         *
+         * Let's now rearrange things, moving everything in the inner directory one level up */
+
+        cd = xopendirat(dirfd(d), child, O_NOFOLLOW);
+        if (!cd)
+                return log_error_errno(errno, "Can't open directory '%s': %m", joined);
+
+        log_info("Rearranging '%s', moving OS tree one directory up.", joined);
+
+        /* Let's rename the child to an unguessable name so that we can be sure all files contained in it can be
+         * safely moved up and won't collide with the name. */
+        r = tempfn_random(child, NULL, &t);
+        if (r < 0)
+                return log_oom();
+        r = rename_noreplace(dirfd(d), child, dirfd(d), t);
+        if (r < 0)
+                return log_error_errno(r, "Unable to rename '%s' to '%s/%s': %m", joined, path, t);
+
+        FOREACH_DIRENT_ALL(de, cd, return log_error_errno(errno, "Failed to iterate through directory '%s': %m", joined)) {
+                if (dot_or_dot_dot(de->d_name))
+                        continue;
+
+                r = rename_noreplace(dirfd(cd), de->d_name, dirfd(d), de->d_name);
+                if (r < 0)
+                        return log_error_errno(r, "Unable to move '%s/%s/%s' to '%s/%s': %m", path, t, de->d_name, path, de->d_name);
+        }
+
+        if (unlinkat(dirfd(d), t, AT_REMOVEDIR) < 0)
+                return log_error_errno(errno, "Failed to remove temporary directory '%s/%s': %m", path, t);
+
+        /* futimens() returns -1 and reports the reason via errno, hence log errno rather than its return value. */
+        if (futimens(dirfd(d), (struct timespec[2]) { st.st_atim, st.st_mtim }) < 0)
+                log_debug_errno(errno, "Failed to adjust top-level timestamps '%s', ignoring: %m", path);
+
+        r = fchmod_and_chown(dirfd(d), st.st_mode, st.st_uid, st.st_gid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to adjust top-level directory mode/ownership '%s': %m", path);
+
+        log_info("Successfully rearranged OS tree.");
+
+        return 0;
+}
+
+bool import_validate_local(const char *name, ImportFlags flags) {
+        bool direct;
+
+        /* Image names normally have to be valid hostnames. With IMPORT_DIRECT the rule is relaxed and
+         * any valid path name is accepted instead. */
+
+        direct = FLAGS_SET(flags, IMPORT_DIRECT);
+
+        return direct ? path_is_valid(name) : hostname_is_valid(name, 0);
+}
+
+static int interrupt_signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        log_notice("Transfer aborted."); /* handler installed for SIGTERM and SIGINT by import_allocate_event_with_signals() */
+        sd_event_exit(sd_event_source_get_event(s), EINTR); /* ask the event loop owning 's' to exit with code EINTR */
+        return 0;
+}
+
+int import_allocate_event_with_signals(sd_event **ret) {
+        _cleanup_(sd_event_unrefp) sd_event *loop = NULL;
+        int error;
+
+        assert(ret);
+
+        /* Get an event loop via sd_event_default(), block SIGTERM/SIGINT and route both to the abort handler. */
+        error = sd_event_default(&loop);
+        if (error < 0)
+                return log_error_errno(error, "Failed to allocate event loop: %m");
+
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
+        (void) sd_event_add_signal(loop, NULL, SIGTERM, interrupt_signal_handler, NULL);
+        (void) sd_event_add_signal(loop, NULL, SIGINT, interrupt_signal_handler, NULL);
+        *ret = TAKE_PTR(loop);
+        return 0;
+}
diff --git a/src/import/import-common.h b/src/import/import-common.h
new file mode 100644
index 0000000..97fc16d
--- /dev/null
+++ b/src/import/import-common.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "sd-event.h"
+
+typedef enum ImportFlags {
+        IMPORT_FORCE          = 1 << 0, /* replace existing image */
+        IMPORT_READ_ONLY      = 1 << 1, /* make generated image read-only */
+        IMPORT_BTRFS_SUBVOL   = 1 << 2, /* tar: preferably create images as btrfs subvols */
+        IMPORT_BTRFS_QUOTA    = 1 << 3, /* tar: set up btrfs quota for new subvolume as child of parent subvolume */
+        IMPORT_CONVERT_QCOW2  = 1 << 4, /* raw: if we detect a qcow2 image, unpack it */
+        IMPORT_DIRECT         = 1 << 5, /* import without rename games; import_validate_local() then accepts any valid path */
+        IMPORT_SYNC           = 1 << 6, /* fsync() right before we are done */
+        /* Subsets of the above: all flags except the raw-only one, and all flags except the two tar-only ones */
+        IMPORT_FLAGS_MASK_TAR = IMPORT_FORCE|IMPORT_READ_ONLY|IMPORT_BTRFS_SUBVOL|IMPORT_BTRFS_QUOTA|IMPORT_DIRECT|IMPORT_SYNC,
+        IMPORT_FLAGS_MASK_RAW = IMPORT_FORCE|IMPORT_READ_ONLY|IMPORT_CONVERT_QCOW2|IMPORT_DIRECT|IMPORT_SYNC,
+} ImportFlags;
+
+int import_fork_tar_c(const char *path, pid_t *ret); /* forks "tar -c" on 'path'; returns the read end of the pipe to the child */
+int import_fork_tar_x(const char *path, pid_t *ret); /* forks "tar -x" into 'path'; returns the write end of the pipe to the child */
+/* Moves an OS tree found in the single top-level subdirectory of 'path' one level up */
+int import_mangle_os_tree(const char *path);
+/* Requires a valid hostname, or any valid path name if IMPORT_DIRECT is set */
+bool import_validate_local(const char *name, ImportFlags flags);
+/* Event loop whose SIGTERM/SIGINT handlers log "Transfer aborted." and make the loop exit */
+int import_allocate_event_with_signals(sd_event **ret);
diff --git a/src/import/import-compress.c b/src/import/import-compress.c
new file mode 100644
index 0000000..28cf6f8
--- /dev/null
+++ b/src/import/import-compress.c
@@ -0,0 +1,478 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "import-compress.h"
+#include "string-table.h"
+
+void import_compress_free(ImportCompress *c) {
+        assert(c);
+
+        /* Tear down whichever (de)compressor state is active, then mark the context as unused. */
+        switch (c->type) {
+        case IMPORT_COMPRESS_XZ:
+                lzma_end(&c->xz);
+                break;
+        case IMPORT_COMPRESS_GZIP:
+                (void) (c->encoding ? deflateEnd(&c->gzip) : inflateEnd(&c->gzip));
+                break;
+#if HAVE_BZIP2
+        case IMPORT_COMPRESS_BZIP2:
+                (void) (c->encoding ? BZ2_bzCompressEnd(&c->bzip2) : BZ2_bzDecompressEnd(&c->bzip2));
+                break;
+#endif
+        default:
+                break;
+        }
+        c->type = IMPORT_COMPRESS_UNKNOWN;
+}
+
+int import_uncompress_detect(ImportCompress *c, const void *data, size_t size) {
+        static const uint8_t xz_signature[] = {
+                0xfd, '7', 'z', 'X', 'Z', 0x00
+        };
+        static const uint8_t gzip_signature[] = {
+                0x1f, 0x8b
+        };
+        static const uint8_t bzip2_signature[] = {
+                'B', 'Z', 'h'
+        };
+        /* The signatures above are compared against the first bytes of 'data' to pick the decoder. */
+        int r;
+
+        assert(c);
+        /* Returns 1 once the type is settled, 0 if 'data' is still too short to decide, -EIO if decoder setup fails. */
+        if (c->type != IMPORT_COMPRESS_UNKNOWN)
+                return 1;
+        /* Decide only once at least as many bytes as the longest signature are available */
+        if (size < MAX3(sizeof(xz_signature),
+                        sizeof(gzip_signature),
+                        sizeof(bzip2_signature)))
+                return 0;
+
+        assert(data);
+
+        if (memcmp(data, xz_signature, sizeof(xz_signature)) == 0) {
+                lzma_ret xzr;
+
+                xzr = lzma_stream_decoder(&c->xz, UINT64_MAX, LZMA_TELL_UNSUPPORTED_CHECK | LZMA_CONCATENATED);
+                if (xzr != LZMA_OK)
+                        return -EIO;
+
+                c->type = IMPORT_COMPRESS_XZ;
+
+        } else if (memcmp(data, gzip_signature, sizeof(gzip_signature)) == 0) {
+                r = inflateInit2(&c->gzip, 15+16); /* NOTE(review): 15+16 presumably selects gzip framing per the zlib manual - confirm */
+                if (r != Z_OK)
+                        return -EIO;
+
+                c->type = IMPORT_COMPRESS_GZIP;
+
+#if HAVE_BZIP2
+        } else if (memcmp(data, bzip2_signature, sizeof(bzip2_signature)) == 0) {
+                r = BZ2_bzDecompressInit(&c->bzip2, 0, 0);
+                if (r != BZ_OK)
+                        return -EIO;
+
+                c->type = IMPORT_COMPRESS_BZIP2;
+#endif
+        } else
+                c->type = IMPORT_COMPRESS_UNCOMPRESSED;
+        /* Data without a recognized signature (bzip2 included when built without HAVE_BZIP2) is passed through as is */
+        c->encoding = false;
+
+        return 1;
+}
+
+void import_uncompress_force_off(ImportCompress *c) {
+        assert(c);
+        /* Pin the context to pass-through decoding; import_uncompress_detect() leaves a known type alone. */
+        c->encoding = false;
+        c->type = IMPORT_COMPRESS_UNCOMPRESSED;
+}
+
+int import_uncompress(ImportCompress *c, const void *data, size_t size, ImportCompressCallback callback, void *userdata) {
+        int r;
+
+        assert(c);
+        assert(callback);
+        /* Decodes 'size' bytes at 'data' and passes the output to 'callback' in pieces. Returns 1 on success,
+         * 0 while the format cannot be determined yet, and a negative errno-style value on failure. */
+        r = import_uncompress_detect(c, data, size);
+        if (r <= 0)
+                return r;
+
+        if (c->encoding)
+                return -EINVAL;
+
+        if (size <= 0)
+                return 1;
+
+        assert(data);
+
+        switch (c->type) {
+        case IMPORT_COMPRESS_UNCOMPRESSED:
+                r = callback(data, size, userdata);
+                if (r < 0)
+                        return r;
+                break;
+
+        case IMPORT_COMPRESS_XZ:
+                c->xz.next_in = data;
+                c->xz.avail_in = size;
+
+                while (c->xz.avail_in > 0) {
+                        uint8_t buffer[16 * 1024];
+                        lzma_ret lzr;
+
+                        c->xz.next_out = buffer;
+                        c->xz.avail_out = sizeof(buffer);
+
+                        lzr = lzma_code(&c->xz, LZMA_RUN);
+                        if (!IN_SET(lzr, LZMA_OK, LZMA_STREAM_END))
+                                return -EIO;
+
+                        if (c->xz.avail_out < sizeof(buffer)) {
+                                r = callback(buffer, sizeof(buffer) - c->xz.avail_out, userdata);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+                break;
+
+        case IMPORT_COMPRESS_GZIP:
+                c->gzip.next_in = (void*) data;
+                c->gzip.avail_in = size;
+
+                while (c->gzip.avail_in > 0) {
+                        uint8_t buffer[16 * 1024];
+
+                        c->gzip.next_out = buffer;
+                        c->gzip.avail_out = sizeof(buffer);
+
+                        r = inflate(&c->gzip, Z_NO_FLUSH);
+                        if (!IN_SET(r, Z_OK, Z_STREAM_END))
+                                return -EIO;
+                        /* After the stream has ended inflate() consumes no further input: bail out on
+                         * leftover data that yields no output, instead of looping on it forever. */
+                        if (r == Z_STREAM_END && c->gzip.avail_in > 0 && c->gzip.avail_out == sizeof(buffer))
+                                return -EIO;
+
+                        if (c->gzip.avail_out < sizeof(buffer)) {
+                                r = callback(buffer, sizeof(buffer) - c->gzip.avail_out, userdata);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+                break;
+
+#if HAVE_BZIP2
+        case IMPORT_COMPRESS_BZIP2:
+                c->bzip2.next_in = (void*) data;
+                c->bzip2.avail_in = size;
+
+                while (c->bzip2.avail_in > 0) {
+                        uint8_t buffer[16 * 1024];
+
+                        c->bzip2.next_out = (char*) buffer;
+                        c->bzip2.avail_out = sizeof(buffer);
+
+                        r = BZ2_bzDecompress(&c->bzip2);
+                        if (!IN_SET(r, BZ_OK, BZ_STREAM_END))
+                                return -EIO;
+
+                        if (c->bzip2.avail_out < sizeof(buffer)) {
+                                r = callback(buffer, sizeof(buffer) - c->bzip2.avail_out, userdata);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+                break;
+#endif
+
+        default:
+                assert_not_reached();
+        }
+
+        return 1;
+}
+
+int import_compress_init(ImportCompress *c, ImportCompressType t) {
+        lzma_ret lzr;
+        int rc;
+
+        assert(c);
+
+        /* Sets up 'c' for compressing into format 't'. Returns 0 on success, -EIO if the compression
+         * library refuses to initialize, and -EOPNOTSUPP for types that have no case below. */
+
+        switch (t) {
+
+        case IMPORT_COMPRESS_UNCOMPRESSED:
+                /* Nothing to initialize */
+                break;
+
+        case IMPORT_COMPRESS_XZ:
+                lzr = lzma_easy_encoder(&c->xz, LZMA_PRESET_DEFAULT, LZMA_CHECK_CRC64);
+                if (lzr != LZMA_OK)
+                        return -EIO;
+
+                break;
+
+        case IMPORT_COMPRESS_GZIP:
+                rc = deflateInit2(&c->gzip, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 15 + 16, 8, Z_DEFAULT_STRATEGY);
+                if (rc != Z_OK)
+                        return -EIO;
+
+                break;
+
+#if HAVE_BZIP2
+        case IMPORT_COMPRESS_BZIP2:
+                rc = BZ2_bzCompressInit(&c->bzip2, 9, 0, 0);
+                if (rc != BZ_OK)
+                        return -EIO;
+
+                break;
+#endif
+
+        default:
+                return -EOPNOTSUPP;
+        }
+
+        /* Every case that gets here has set up the encoder matching 't'. */
+        c->type = t;
+        c->encoding = true;
+        return 0;
+}
+
+static int enlarge_buffer(void **buffer, size_t *buffer_size, size_t *buffer_allocated) {
+        size_t l = MAX(16*1024U, (*buffer_size * 2));
+        void *p;
+
+        /* Returns 0 if there is still free room, 1 if the buffer was grown, -ENOMEM on failure. */
+        if (*buffer_allocated > *buffer_size)
+                return 0;
+        if (*buffer_size > (size_t) -1 / 2) /* the doubling above wrapped around; growing is impossible */
+                return -ENOMEM;
+
+        p = realloc(*buffer, l);
+        if (!p)
+                return -ENOMEM;
+        *buffer = p;
+        *buffer_allocated = l;
+        return 1;
+}
+
+int import_compress(ImportCompress *c, const void *data, size_t size, void **buffer, size_t *buffer_size, size_t *buffer_allocated) {
+        int r;
+        /* Compresses 'size' bytes at 'data' into *buffer. Returns 0 on success, negative errno-style value on failure. */
+        assert(c);
+        assert(buffer);
+        assert(buffer_size);
+        assert(buffer_allocated);
+
+        if (!c->encoding)
+                return -EINVAL;
+
+        if (size <= 0)
+                return 0;
+
+        assert(data);
+        /* Output of this call starts at offset 0, i.e. it replaces what earlier calls left in the buffer */
+        *buffer_size = 0;
+
+        switch (c->type) {
+
+        case IMPORT_COMPRESS_XZ:
+                /* Loop until the encoder has consumed all input, growing the output buffer when it is full */
+                c->xz.next_in = data;
+                c->xz.avail_in = size;
+
+                while (c->xz.avail_in > 0) {
+                        lzma_ret lzr;
+
+                        r = enlarge_buffer(buffer, buffer_size, buffer_allocated);
+                        if (r < 0)
+                                return r;
+
+                        c->xz.next_out = (uint8_t*) *buffer + *buffer_size;
+                        c->xz.avail_out = *buffer_allocated - *buffer_size;
+
+                        lzr = lzma_code(&c->xz, LZMA_RUN);
+                        if (lzr != LZMA_OK)
+                                return -EIO;
+                        /* Space offered minus space left over = bytes produced in this round */
+                        *buffer_size += (*buffer_allocated - *buffer_size) - c->xz.avail_out;
+                }
+
+                break;
+
+        case IMPORT_COMPRESS_GZIP:
+
+                c->gzip.next_in = (void*) data;
+                c->gzip.avail_in = size;
+
+                while (c->gzip.avail_in > 0) {
+                        r = enlarge_buffer(buffer, buffer_size, buffer_allocated);
+                        if (r < 0)
+                                return r;
+
+                        c->gzip.next_out = (uint8_t*) *buffer + *buffer_size;
+                        c->gzip.avail_out = *buffer_allocated - *buffer_size;
+
+                        r = deflate(&c->gzip, Z_NO_FLUSH);
+                        if (r != Z_OK)
+                                return -EIO;
+
+                        *buffer_size += (*buffer_allocated - *buffer_size) - c->gzip.avail_out;
+                }
+
+                break;
+
+#if HAVE_BZIP2
+        case IMPORT_COMPRESS_BZIP2:
+
+                c->bzip2.next_in = (void*) data;
+                c->bzip2.avail_in = size;
+
+                while (c->bzip2.avail_in > 0) {
+                        r = enlarge_buffer(buffer, buffer_size, buffer_allocated);
+                        if (r < 0)
+                                return r;
+
+                        c->bzip2.next_out = (void*) ((uint8_t*) *buffer + *buffer_size);
+                        c->bzip2.avail_out = *buffer_allocated - *buffer_size;
+
+                        r = BZ2_bzCompress(&c->bzip2, BZ_RUN);
+                        if (r != BZ_RUN_OK)
+                                return -EIO;
+
+                        *buffer_size += (*buffer_allocated - *buffer_size) - c->bzip2.avail_out;
+                }
+
+                break;
+#endif
+
+        case IMPORT_COMPRESS_UNCOMPRESSED:
+                /* No encoder involved: copy the input, growing the buffer to exactly 'size' bytes if it is smaller */
+                if (*buffer_allocated < size) {
+                        void *p;
+
+                        p = realloc(*buffer, size);
+                        if (!p)
+                                return -ENOMEM;
+
+                        *buffer = p;
+                        *buffer_allocated = size;
+                }
+
+                memcpy(*buffer, data, size);
+                *buffer_size = size;
+                break;
+
+        default:
+                return -EOPNOTSUPP;
+        }
+
+        return 0;
+}
+
+int import_compress_finish(ImportCompress *c, void **buffer, size_t *buffer_size, size_t *buffer_allocated) {
+        int r;
+        /* Flushes what the encoder still holds into *buffer. Returns 0 on success, negative errno-style value on failure. */
+        assert(c);
+        assert(buffer);
+        assert(buffer_size);
+        assert(buffer_allocated);
+
+        if (!c->encoding)
+                return -EINVAL;
+        /* As in import_compress(), the output of this call starts at offset 0 of the buffer */
+        *buffer_size = 0;
+
+        switch (c->type) {
+
+        case IMPORT_COMPRESS_XZ: {
+                lzma_ret lzr;
+                /* No new input: keep calling the encoder with LZMA_FINISH until it reports the end of the stream */
+                c->xz.avail_in = 0;
+
+                do {
+                        r = enlarge_buffer(buffer, buffer_size, buffer_allocated);
+                        if (r < 0)
+                                return r;
+
+                        c->xz.next_out = (uint8_t*) *buffer + *buffer_size;
+                        c->xz.avail_out = *buffer_allocated - *buffer_size;
+
+                        lzr = lzma_code(&c->xz, LZMA_FINISH);
+                        if (!IN_SET(lzr, LZMA_OK, LZMA_STREAM_END))
+                                return -EIO;
+
+                        *buffer_size += (*buffer_allocated - *buffer_size) - c->xz.avail_out;
+                } while (lzr != LZMA_STREAM_END);
+
+                break;
+        }
+
+        case IMPORT_COMPRESS_GZIP:
+                c->gzip.avail_in = 0;
+
+                do {
+                        r = enlarge_buffer(buffer, buffer_size, buffer_allocated);
+                        if (r < 0)
+                                return r;
+
+                        c->gzip.next_out = (uint8_t*) *buffer + *buffer_size;
+                        c->gzip.avail_out = *buffer_allocated - *buffer_size;
+
+                        r = deflate(&c->gzip, Z_FINISH);
+                        if (!IN_SET(r, Z_OK, Z_STREAM_END))
+                                return -EIO;
+
+                        *buffer_size += (*buffer_allocated - *buffer_size) - c->gzip.avail_out;
+                } while (r != Z_STREAM_END);
+
+                break;
+
+#if HAVE_BZIP2
+        case IMPORT_COMPRESS_BZIP2:
+                c->bzip2.avail_in = 0;
+
+                do {
+                        r = enlarge_buffer(buffer, buffer_size, buffer_allocated);
+                        if (r < 0)
+                                return r;
+
+                        c->bzip2.next_out = (void*) ((uint8_t*) *buffer + *buffer_size);
+                        c->bzip2.avail_out = *buffer_allocated - *buffer_size;
+
+                        r = BZ2_bzCompress(&c->bzip2, BZ_FINISH);
+                        if (!IN_SET(r, BZ_FINISH_OK, BZ_STREAM_END))
+                                return -EIO;
+
+                        *buffer_size += (*buffer_allocated - *buffer_size) - c->bzip2.avail_out;
+                } while (r != BZ_STREAM_END);
+
+                break;
+#endif
+
+        case IMPORT_COMPRESS_UNCOMPRESSED:
+                break; /* nothing is buffered in this mode, so there is nothing to flush */
+
+        default:
+                return -EOPNOTSUPP;
+        }
+
+        return 0;
+}
+
+static const char* const import_compress_type_table[_IMPORT_COMPRESS_TYPE_MAX] = {
+        [IMPORT_COMPRESS_UNKNOWN] = "unknown",
+        [IMPORT_COMPRESS_UNCOMPRESSED] = "uncompressed",
+        [IMPORT_COMPRESS_XZ] = "xz",
+        [IMPORT_COMPRESS_GZIP] = "gzip",
+#if HAVE_BZIP2
+        [IMPORT_COMPRESS_BZIP2] = "bzip2",
+#endif
+}; /* without HAVE_BZIP2 the IMPORT_COMPRESS_BZIP2 slot has no initializer and hence is NULL */
+/* NOTE(review): macro defined outside this chunk; presumably generates the import_compress_type_to_string()/_from_string() pair - confirm */
+DEFINE_STRING_TABLE_LOOKUP(import_compress_type, ImportCompressType);
diff --git a/src/import/import-compress.h b/src/import/import-compress.h
new file mode 100644
index 0000000..0a42103
--- /dev/null
+++ b/src/import/import-compress.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_BZIP2
+#include 
+#endif
+#include 
+#include 
+#include 
+
+#include "macro.h"
+
+typedef enum ImportCompressType {
+        IMPORT_COMPRESS_UNKNOWN,      /* not determined yet; import_compress_free() resets the type to this */
+        IMPORT_COMPRESS_UNCOMPRESSED, /* data is passed through as is */
+        IMPORT_COMPRESS_XZ,
+        IMPORT_COMPRESS_GZIP,
+        IMPORT_COMPRESS_BZIP2,        /* always declared, but only handled when built with HAVE_BZIP2 */
+        _IMPORT_COMPRESS_TYPE_MAX,    /* number of types; sizes the name table in import-compress.c */
+        _IMPORT_COMPRESS_TYPE_INVALID = -EINVAL,
+} ImportCompressType;
+
+typedef struct ImportCompress {
+        ImportCompressType type; /* tells which member of the union below is in use */
+        bool encoding;           /* true: set up for compressing; false: set up for decompressing */
+        union {
+                lzma_stream xz;
+                z_stream gzip;
+#if HAVE_BZIP2
+                bz_stream bzip2;
+#endif
+        };
+} ImportCompress;
+
+typedef int (*ImportCompressCallback)(const void *data, size_t size, void *userdata); /* receives decoded data from import_uncompress(); a negative return aborts it */
+
+void import_compress_free(ImportCompress *c); /* ends the active (de)compressor and resets the type to unknown */
+/* Decompression: the format is detected from the first bytes of the data unless detection is forced off */
+int import_uncompress_detect(ImportCompress *c, const void *data, size_t size);
+void import_uncompress_force_off(ImportCompress *c);
+int import_uncompress(ImportCompress *c, const void *data, size_t size, ImportCompressCallback callback, void *userdata);
+/* Compression: output goes to a caller-provided buffer that is grown with realloc() as needed */
+int import_compress_init(ImportCompress *c, ImportCompressType t);
+int import_compress(ImportCompress *c, const void *data, size_t size, void **buffer, size_t *buffer_size, size_t *buffer_allocated);
+int import_compress_finish(ImportCompress *c, void **buffer, size_t *buffer_size, size_t *buffer_allocated);
+/* Mapping between types and their names as listed in the table in import-compress.c */
+const char* import_compress_type_to_string(ImportCompressType t) _const_;
+ImportCompressType import_compress_type_from_string(const char *s) _pure_;
diff --git a/src/import/import-fs.c b/src/import/import-fs.c
new file mode 100644
index 0000000..fd79c8f
--- /dev/null
+++ b/src/import/import-fs.c
@@ -0,0 +1,392 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "build.h"
+#include "btrfs-util.h"
+#include "discover-image.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "import-util.h"
+#include "install-file.h"
+#include "main-func.h"
+#include "mkdir-label.h"
+#include "parse-argument.h"
+#include "ratelimit.h"
+#include "rm-rf.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "verbs.h"
+
+static bool arg_force = false; /* NOTE(review): the arg_* settings are presumably filled in by parse_argv(), outside this chunk - confirm */
+static bool arg_read_only = false;
+static bool arg_btrfs_subvol = true;
+static bool arg_btrfs_quota = true;
+static bool arg_sync = true;
+static bool arg_direct = false; /* import_fs(): when set, the local name is taken as a path (absolute, or relative to arg_image_root) */
+static const char *arg_image_root = "/var/lib/machines"; /* import_fs(): prefix for non-absolute local paths when arg_direct is set */
+
+typedef struct ProgressInfo {
+        RateLimit limit;        /* consulted by progress_show() via ratelimit_below() */
+        char *path;             /* copy of the path last passed to progress_path(); freed by progress_info_free() */
+        uint64_t size;          /* bytes counted by progress_bytes() since the last progress_path() call */
+        bool started;           /* set once progress_show() got past the rate limit for the first time */
+        bool logged_incomplete; /* set once the "file list ... is incomplete" notice has been logged */
+} ProgressInfo;
+
+static void progress_info_free(ProgressInfo *p) {
+        free(p->path); /* only the path string is released here, not the ProgressInfo itself */
+}
+
+static void progress_show(ProgressInfo *p) {
+        bool first_call;
+
+        assert(p);
+        /* Rate limit: do nothing unless ratelimit_below() lets this call through. */
+        if (!ratelimit_below(&p->limit))
+                return;
+
+        /* The first message that passes the rate limit is swallowed; output starts with the next one. */
+        first_call = !p->started;
+        p->started = true;
+        if (first_call)
+                return;
+
+        /* Point out once, ahead of the first report, that the file list is not complete. */
+        if (!p->logged_incomplete) {
+                p->logged_incomplete = true;
+                log_notice("(Note: file list shown below is incomplete, and is intended as sporadic progress report only.)");
+        }
+        if (p->size != 0)
+                log_info("Copying tree, currently at '%s' (@%s)...", p->path, FORMAT_BYTES(p->size));
+        else
+                log_info("Copying tree, currently at '%s'...", p->path);
+}
+
+static int progress_path(const char *path, const struct stat *st, void *userdata) {
+        ProgressInfo *info = ASSERT_PTR(userdata);
+        int error;
+
+        /* Remember the path now being processed and restart the byte count reported next to it. */
+        error = free_and_strdup(&info->path, path);
+        if (error >= 0) {
+                info->size = 0;
+                progress_show(info);
+                return 0;
+        }
+        return error;
+}
+
+static int progress_bytes(uint64_t nbytes, void *userdata) {
+        ProgressInfo *info = ASSERT_PTR(userdata);
+
+        /* Account for 'nbytes' more bytes of the current path, then possibly print a progress line. */
+        assert(info->size != UINT64_MAX);
+        info->size = info->size + nbytes;
+
+        progress_show(info);
+        return 0;
+}
+
+static int import_fs(int argc, char *argv[], void *userdata) {
+        _cleanup_(rm_rf_subvolume_and_freep) char *temp_path = NULL;
+        _cleanup_(progress_info_free) ProgressInfo progress = {};
+        _cleanup_free_ char *l = NULL, *final_path = NULL;
+        const char *path = NULL, *local = NULL, *dest = NULL;
+        _cleanup_close_ int open_fd = -EBADF;
+        int r, fd;
+
+        if (argc >= 2)
+                path = empty_or_dash_to_null(argv[1]);
+
+        if (argc >= 3)
+                local = empty_or_dash_to_null(argv[2]);
+        else if (path) {
+                r = path_extract_filename(path, &l);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+
+                local = l;
+        }
+
+        if (arg_direct) {
+                if (!local)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No local path specified.");
+
+                if (path_is_absolute(local))
+                        final_path = strdup(local);
+                else
+                        final_path = path_join(arg_image_root, local);
+                if (!final_path)
+                        return log_oom();
+
+                if (!path_is_valid(final_path))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Local path name '%s' is not valid.", final_path);
+        } else {
+                if (local) {
+                        if (!hostname_is_valid(local, 0))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Local image name '%s' is not valid.", local);
+                } else
+                        local = "imported";
+
+                final_path = path_join(arg_image_root, local);
+                if (!final_path)
+                        return log_oom();
+
+                if (!arg_force) {
+                        r = image_find(IMAGE_MACHINE, local, NULL, NULL);
+                        if (r < 0) {
+                                if (r != -ENOENT)
+                                        return log_error_errno(r, "Failed to check whether image '%s' exists: %m", local);
+                        } else
+                                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                                       "Image '%s' already exists.", local);
+                }
+        }
+
+        if (path) {
+                open_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+                if (open_fd < 0)
+                        return log_error_errno(errno, "Failed to open directory to import: %m");
+
+                fd = open_fd;
+
+                log_info("Importing '%s', saving as '%s'.", path, local);
+        } else {
+                _cleanup_free_ char *pretty = NULL;
+
+                fd = STDIN_FILENO; /* no source path given, import the directory passed as stdin */
+
+                (void) fd_get_path(fd, &pretty);
+                log_info("Importing '%s', saving as '%s'.", strempty(pretty), local);
+        }
+
+        if (!arg_sync)
+                log_info("File system synchronization on completion is off.");
+
+        if (arg_direct) {
+                if (arg_force)
+                        (void) rm_rf(final_path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
+
+                dest = final_path;
+        } else {
+                r = tempfn_random(final_path, NULL, &temp_path);
+                if (r < 0)
+                        return log_oom();
+
+                dest = temp_path;
+        }
+
+        (void) mkdir_parents_label(dest, 0700);
+
+        progress.limit = (const RateLimit) { 200*USEC_PER_MSEC, 1 };
+
+        {
+                BLOCK_SIGNALS(SIGINT, SIGTERM);
+
+                if (arg_btrfs_subvol)
+                        r = btrfs_subvol_snapshot_at_full(
+                                        fd, NULL,
+                                        AT_FDCWD, dest,
+                                        BTRFS_SNAPSHOT_FALLBACK_COPY|
+                                        BTRFS_SNAPSHOT_FALLBACK_DIRECTORY|
+                                        BTRFS_SNAPSHOT_RECURSIVE|
+                                        BTRFS_SNAPSHOT_SIGINT|
+                                        BTRFS_SNAPSHOT_SIGTERM,
+                                        progress_path,
+                                        progress_bytes,
+                                        &progress);
+                else
+                        r = copy_directory_at_full(
+                                        fd, NULL,
+                                        AT_FDCWD, dest,
+                                        COPY_REFLINK|
+                                        COPY_SAME_MOUNT|
+                                        COPY_HARDLINKS|
+                                        COPY_SIGINT|
+                                        COPY_SIGTERM|
+                                        (arg_direct ? COPY_MERGE_EMPTY : 0),
+                                        progress_path,
+                                        progress_bytes,
+                                        &progress);
+                if (r == -EINTR) /* SIGINT/SIGTERM hit */
+                        return log_error_errno(r, "Copy cancelled.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to copy directory: %m");
+        }
+
+        r = import_mangle_os_tree(dest);
+        if (r < 0)
+                return r;
+
+        if (arg_btrfs_quota) {
+                if (!arg_direct)
+                        (void) import_assign_pool_quota_and_warn(arg_image_root);
+                (void) import_assign_pool_quota_and_warn(dest);
+        }
+
+        r = install_file(AT_FDCWD, dest,
+                         AT_FDCWD, arg_direct ? NULL : final_path, /* pass NULL as target in case of direct
+                                                                    * mode since file is already in place */
+                         (arg_force ? INSTALL_REPLACE : 0) |
+                         (arg_read_only ? INSTALL_READ_ONLY : 0) |
+                         (arg_sync ? INSTALL_SYNCFS : 0));
+        if (r < 0)
+                return log_error_errno(r, "Failed to install directory as '%s': %m", final_path);
+
+        temp_path = mfree(temp_path); /* success: disarm the _cleanup_ handler for the temporary path */
+
+        log_info("Directory '%s' successfully installed. Exiting.", final_path);
+        return 0;
+}
+
+static int help(int argc, char *argv[], void *userdata) {
+
+        printf("%1$s [OPTIONS...] {COMMAND} ...\n"
+               "\n%4$sImport container images from file system directories.%5$s\n"
+               "\n%2$sCommands:%3$s\n"
+               "  run DIRECTORY [NAME]        Import a directory\n"
+               "\n%2$sOptions:%3$s\n"
+               "  -h --help                   Show this help\n"
+               "     --version                Show package version\n"
+               "     --force                  Force creation of image\n"
+               "     --image-root=PATH        Image root directory\n"
+               "     --read-only              Create a read-only image\n"
+               "     --direct                 Import directly to specified directory\n"
+               "     --btrfs-subvol=BOOL      Controls whether to create a btrfs subvolume\n"
+               "                              instead of a directory\n"
+               "     --btrfs-quota=BOOL       Controls whether to set up quota for btrfs\n"
+               "                              subvolume\n"
+               "     --sync=BOOL              Controls whether to sync() before completing\n",
+               program_invocation_short_name,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_FORCE,
+                ARG_IMAGE_ROOT,
+                ARG_READ_ONLY,
+                ARG_DIRECT,
+                ARG_BTRFS_SUBVOL,
+                ARG_BTRFS_QUOTA,
+                ARG_SYNC,
+        };
+
+        static const struct option options[] = {
+                { "help",            no_argument,       NULL, 'h'                 },
+                { "version",         no_argument,       NULL, ARG_VERSION         },
+                { "force",           no_argument,       NULL, ARG_FORCE           },
+                { "image-root",      required_argument, NULL, ARG_IMAGE_ROOT      },
+                { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
+                { "direct",          no_argument,       NULL, ARG_DIRECT          },
+                { "btrfs-subvol",    required_argument, NULL, ARG_BTRFS_SUBVOL    },
+                { "btrfs-quota",     required_argument, NULL, ARG_BTRFS_QUOTA     },
+                { "sync",            required_argument, NULL, ARG_SYNC            },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_FORCE:
+                        arg_force = true;
+                        break;
+
+                case ARG_IMAGE_ROOT:
+                        arg_image_root = optarg;
+                        break;
+
+                case ARG_READ_ONLY:
+                        arg_read_only = true;
+                        break;
+
+                case ARG_DIRECT:
+                        arg_direct = true;
+                        break;
+
+                case ARG_BTRFS_SUBVOL:
+                        r = parse_boolean_argument("--btrfs-subvol=", optarg, &arg_btrfs_subvol);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_BTRFS_QUOTA:
+                        r = parse_boolean_argument("--btrfs-quota=", optarg, &arg_btrfs_quota);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case ARG_SYNC:
+                        r = parse_boolean_argument("--sync=", optarg, &arg_sync);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1;
+}
+
+static int import_fs_main(int argc, char *argv[]) {
+
+        static const Verb verbs[] = {
+                { "help", VERB_ANY, VERB_ANY, 0, help      },
+                { "run",  2,        3,        0, import_fs },
+                {}
+        };
+
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+static int run(int argc, char *argv[]) {
+        int r;
+
+        setlocale(LC_ALL, "");
+        log_parse_environment();
+        log_open();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        return import_fs_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/import/import-pubring.gpg b/src/import/import-pubring.gpg
new file mode 100644
index 0000000..be27776
Binary files /dev/null and b/src/import/import-pubring.gpg differ
diff --git a/src/import/import-raw.c b/src/import/import-raw.c
new file mode 100644
index 0000000..f7ed163
--- /dev/null
+++ b/src/import/import-raw.c
@@ -0,0 +1,529 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "sd-daemon.h"
+#include "sd-event.h"
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "copy.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "import-compress.h"
+#include "import-raw.h"
+#include "install-file.h"
+#include "io-util.h"
+#include "machine-pool.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "qcow2-util.h"
+#include "ratelimit.h"
+#include "rm-rf.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+
+struct RawImport {
+        sd_event *event;
+
+        char *image_root;
+
+        RawImportFinished on_finished;
+        void *userdata;
+
+        char *local;
+        ImportFlags flags;
+
+        char *temp_path;
+        char *final_path;
+
+        int input_fd;
+        int output_fd;
+
+        ImportCompress compress;
+
+        sd_event_source *input_event_source;
+
+        uint8_t buffer[16*1024];
+        size_t buffer_size;
+
+        uint64_t written_compressed;
+        uint64_t written_uncompressed;
+
+        struct stat input_stat;
+        struct stat output_stat;
+
+        unsigned last_percent;
+        RateLimit progress_ratelimit;
+
+        uint64_t offset;
+        uint64_t size_max;
+};
+
+RawImport* raw_import_unref(RawImport *i) {
+        if (!i)
+                return NULL;
+
+        sd_event_source_unref(i->input_event_source);
+
+        unlink_and_free(i->temp_path);
+
+        import_compress_free(&i->compress);
+
+        sd_event_unref(i->event);
+
+        safe_close(i->output_fd);
+
+        free(i->final_path);
+        free(i->image_root);
+        free(i->local);
+        return mfree(i);
+}
+
+int raw_import_new(
+                RawImport **ret,
+                sd_event *event,
+                const char *image_root,
+                RawImportFinished on_finished,
+                void *userdata) {
+
+        _cleanup_(raw_import_unrefp) RawImport *i = NULL;
+        _cleanup_free_ char *root = NULL;
+        int r;
+
+        assert(ret);
+
+        root = strdup(image_root ?: "/var/lib/machines");
+        if (!root)
+                return -ENOMEM;
+
+        i = new(RawImport, 1);
+        if (!i)
+                return -ENOMEM;
+
+        *i = (RawImport) {
+                .input_fd = -EBADF,
+                .output_fd = -EBADF,
+                .on_finished = on_finished,
+                .userdata = userdata,
+                .last_percent = UINT_MAX,
+                .image_root = TAKE_PTR(root),
+                .progress_ratelimit = { 100 * USEC_PER_MSEC, 1 },
+                .offset = UINT64_MAX,
+                .size_max = UINT64_MAX,
+        };
+
+        if (event)
+                i->event = sd_event_ref(event);
+        else {
+                r = sd_event_default(&i->event);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = TAKE_PTR(i);
+        return 0;
+}
+
+static void raw_import_report_progress(RawImport *i) {
+        unsigned percent;
+        assert(i);
+
+        /* We have no size information, unless the source is a regular file */
+        if (!S_ISREG(i->input_stat.st_mode))
+                return;
+
+        if (i->written_compressed >= (uint64_t) i->input_stat.st_size)
+                percent = 100;
+        else
+                percent = (unsigned) ((i->written_compressed * UINT64_C(100)) / (uint64_t) i->input_stat.st_size);
+
+        if (percent == i->last_percent)
+                return;
+
+        if (!ratelimit_below(&i->progress_ratelimit))
+                return;
+
+        sd_notifyf(false, "X_IMPORT_PROGRESS=%u%%", percent);
+        log_info("Imported %u%%.", percent);
+
+        i->last_percent = percent;
+}
+
+static int raw_import_maybe_convert_qcow2(RawImport *i) {
+        _cleanup_close_ int converted_fd = -EBADF;
+        _cleanup_(unlink_and_freep) char *t = NULL;
+        _cleanup_free_ char *f = NULL;
+        int r;
+
+        assert(i);
+
+        /* Do QCOW2 conversion if enabled and not in direct mode */
+        if ((i->flags & (IMPORT_CONVERT_QCOW2|IMPORT_DIRECT)) != IMPORT_CONVERT_QCOW2)
+                return 0;
+
+        assert(i->final_path);
+
+        r = qcow2_detect(i->output_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to detect whether this is a QCOW2 image: %m");
+        if (r == 0)
+                return 0;
+
+        /* This is a QCOW2 image, let's convert it */
+        r = tempfn_random(i->final_path, NULL, &f);
+        if (r < 0)
+                return log_oom();
+
+        converted_fd = open(f, O_RDWR|O_CREAT|O_EXCL|O_NOCTTY|O_CLOEXEC, 0664);
+        if (converted_fd < 0)
+                return log_error_errno(errno, "Failed to create %s: %m", f);
+
+        t = TAKE_PTR(f);
+
+        (void) import_set_nocow_and_log(converted_fd, t);
+
+        log_info("Unpacking QCOW2 file.");
+
+        r = qcow2_convert(i->output_fd, converted_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to convert qcow2 image: %m");
+
+        unlink_and_free(i->temp_path);
+        i->temp_path = TAKE_PTR(t);
+        close_and_replace(i->output_fd, converted_fd);
+
+        return 1;
+}
+
+static int raw_import_finish(RawImport *i) {
+        int r;
+
+        assert(i);
+        assert(i->output_fd >= 0);
+
+        /* Nothing of what is below applies to block devices */
+        if (S_ISBLK(i->output_stat.st_mode)) {
+
+                if (i->flags & IMPORT_SYNC) {
+                        if (fsync(i->output_fd) < 0)
+                                return log_error_errno(errno, "Failed to synchronize block device: %m");
+                }
+
+                return 0;
+        }
+
+        assert(S_ISREG(i->output_stat.st_mode));
+
+        /* If an offset is specified we only are supposed to affect part of an existing output file or block
+         * device, thus don't manipulate file properties in that case */
+
+        if (i->offset == UINT64_MAX) {
+                /* In case this was a sparse file, make sure the file size is right */
+                if (i->written_uncompressed > 0) {
+                        if (ftruncate(i->output_fd, i->written_uncompressed) < 0)
+                                return log_error_errno(errno, "Failed to truncate file: %m");
+                }
+
+                r = raw_import_maybe_convert_qcow2(i);
+                if (r < 0)
+                        return r;
+
+                if (S_ISREG(i->input_stat.st_mode)) {
+                        (void) copy_times(i->input_fd, i->output_fd, COPY_CRTIME);
+                        (void) copy_xattr(i->input_fd, NULL, i->output_fd, NULL, 0);
+                }
+        }
+
+        r = install_file(AT_FDCWD, i->temp_path ?: i->local,
+                         AT_FDCWD, i->final_path,
+                         (i->flags & IMPORT_FORCE ? INSTALL_REPLACE : 0) |
+                         (i->flags & IMPORT_READ_ONLY ? INSTALL_READ_ONLY : 0) |
+                         (i->flags & IMPORT_SYNC ? INSTALL_FSYNC_FULL : 0));
+        if (r < 0)
+                return log_error_errno(r, "Failed to move image into place: %m");
+
+        i->temp_path = mfree(i->temp_path);
+
+        log_info("Wrote %s.", FORMAT_BYTES(i->written_uncompressed));
+
+        return 0;
+}
+
+static int raw_import_open_disk(RawImport *i) {
+        int r;
+
+        assert(i);
+        assert(i->local);
+        assert(!i->final_path);
+        assert(!i->temp_path);
+        assert(i->output_fd < 0);
+
+        if (i->flags & IMPORT_DIRECT) {
+                (void) mkdir_parents_label(i->local, 0700);
+
+                /* In direct mode we just open/create the local path and truncate it (like shell >
+                 * redirection would do it) — except if an offset was passed. In that case we are
+                 * supposed to operate on a section of the file only, i.e. we work on some existing
+                 * thing and are not the sole thing stored in the file, hence we neither truncate
+                 * nor create. */
+
+                i->output_fd = open(i->local, O_RDWR|O_NOCTTY|O_CLOEXEC|(i->offset == UINT64_MAX ? O_TRUNC|O_CREAT : 0), 0664);
+                if (i->output_fd < 0)
+                        return log_error_errno(errno, "Failed to open destination '%s': %m", i->local);
+
+                if (i->offset == UINT64_MAX)
+                        (void) import_set_nocow_and_log(i->output_fd, i->local);
+        } else {
+                i->final_path = strjoin(i->image_root, "/", i->local, ".raw");
+                if (!i->final_path)
+                        return log_oom();
+
+                r = tempfn_random(i->final_path, NULL, &i->temp_path);
+                if (r < 0)
+                        return log_oom();
+
+                (void) mkdir_parents_label(i->temp_path, 0700);
+
+                i->output_fd = open(i->temp_path, O_RDWR|O_CREAT|O_EXCL|O_NOCTTY|O_CLOEXEC, 0664);
+                if (i->output_fd < 0)
+                        return log_error_errno(errno, "Failed to open destination '%s': %m", i->temp_path);
+
+                (void) import_set_nocow_and_log(i->output_fd, i->temp_path);
+        }
+
+        if (fstat(i->output_fd, &i->output_stat) < 0)
+                return log_error_errno(errno, "Failed to stat() output file: %m");
+
+        if (!S_ISREG(i->output_stat.st_mode) && !S_ISBLK(i->output_stat.st_mode))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADFD),
+                                       "Target file is not a regular file or block device");
+
+        if (i->offset != UINT64_MAX) {
+                if (lseek(i->output_fd, i->offset, SEEK_SET) < 0)
+                        return log_error_errno(errno, "Failed to seek to offset: %m");
+        }
+
+        return 0;
+}
+
+static int raw_import_try_reflink(RawImport *i) {
+        off_t p;
+        int r;
+
+        assert(i);
+        assert(i->input_fd >= 0);
+        assert(i->output_fd >= 0);
+
+        if (i->compress.type != IMPORT_COMPRESS_UNCOMPRESSED)
+                return 0;
+
+        if (i->offset != UINT64_MAX || i->size_max != UINT64_MAX)
+                return 0;
+
+        if (!S_ISREG(i->input_stat.st_mode) || !S_ISREG(i->output_stat.st_mode))
+                return 0;
+
+        p = lseek(i->input_fd, 0, SEEK_CUR);
+        if (p < 0)
+                return log_error_errno(errno, "Failed to read file offset of input file: %m");
+
+        /* Let's only try a btrfs reflink, if we are reading from the beginning of the file */
+        if ((uint64_t) p != (uint64_t) i->buffer_size)
+                return 0;
+
+        r = reflink(i->input_fd, i->output_fd);
+        if (r >= 0)
+                return 1;
+
+        log_debug_errno(r, "Couldn't establish reflink, using copy: %m");
+        return 0;
+}
+
+static int raw_import_write(const void *p, size_t sz, void *userdata) {
+        RawImport *i = ASSERT_PTR(userdata);
+        bool too_much = false;
+        int r;
+
+        assert(p);
+        assert(sz > 0);
+
+        if (i->written_uncompressed >= UINT64_MAX - sz)
+                return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "File too large, overflow");
+
+        if (i->size_max != UINT64_MAX) {
+                if (i->written_uncompressed >= i->size_max) {
+                        too_much = true;
+                        goto finish;
+                }
+
+                if (i->written_uncompressed + sz > i->size_max) {
+                        too_much = true;
+                        sz = i->size_max - i->written_uncompressed; /* since we have the data in memory
+                                                                     * already, we might as well write it to
+                                                                     * disk to the max */
+                }
+        }
+
+        /* Generate sparse file if we created/truncated the file */
+        if (S_ISREG(i->output_stat.st_mode) && i->offset == UINT64_MAX) {
+                ssize_t n;
+
+                n = sparse_write(i->output_fd, p, sz, 64);
+                if (n < 0)
+                        return log_error_errno((int) n, "Failed to write file: %m");
+                if ((size_t) n < sz)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write");
+        } else {
+                r = loop_write(i->output_fd, p, sz);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to write file: %m");
+        }
+
+        i->written_uncompressed += sz;
+
+finish:
+        if (too_much)
+                return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "File too large");
+
+        return 0;
+}
+
+static int raw_import_process(RawImport *i) {
+        ssize_t l;
+        int r;
+
+        assert(i);
+        assert(i->buffer_size < sizeof(i->buffer));
+
+        l = read(i->input_fd, i->buffer + i->buffer_size, sizeof(i->buffer) - i->buffer_size);
+        if (l < 0) {
+                if (errno == EAGAIN)
+                        return 0;
+
+                r = log_error_errno(errno, "Failed to read input file: %m");
+                goto finish;
+        }
+
+        i->buffer_size += l;
+
+        if (i->compress.type == IMPORT_COMPRESS_UNKNOWN) {
+
+                if (l == 0) { /* EOF */
+                        log_debug("File too short to be compressed, as no compression signature fits in, thus assuming uncompressed.");
+                        import_uncompress_force_off(&i->compress);
+                } else {
+                        r = import_uncompress_detect(&i->compress, i->buffer, i->buffer_size);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to detect file compression: %m");
+                                goto finish;
+                        }
+                        if (r == 0) /* Need more data */
+                                return 0;
+                }
+
+                r = raw_import_open_disk(i);
+                if (r < 0)
+                        goto finish;
+
+                r = raw_import_try_reflink(i);
+                if (r < 0)
+                        goto finish;
+                if (r > 0)
+                        goto complete;
+        }
+
+        r = import_uncompress(&i->compress, i->buffer, i->buffer_size, raw_import_write, i);
+        if (r < 0) {
+                log_error_errno(r, "Failed to decode and write: %m");
+                goto finish;
+        }
+
+        i->written_compressed += i->buffer_size;
+        i->buffer_size = 0;
+
+        if (l == 0) /* EOF */
+                goto complete;
+
+        raw_import_report_progress(i);
+
+        return 0;
+
+complete:
+        r = raw_import_finish(i);
+
+finish:
+        if (i->on_finished)
+                i->on_finished(i, r, i->userdata);
+        else
+                sd_event_exit(i->event, r);
+
+        return 0;
+}
+
+static int raw_import_on_input(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        RawImport *i = userdata;
+
+        return raw_import_process(i);
+}
+
+static int raw_import_on_defer(sd_event_source *s, void *userdata) {
+        RawImport *i = userdata;
+
+        return raw_import_process(i);
+}
+
+int raw_import_start(
+                RawImport *i,
+                int fd,
+                const char *local,
+                uint64_t offset,
+                uint64_t size_max,
+                ImportFlags flags) {
+        int r;
+
+        assert(i);
+        assert(fd >= 0);
+        assert(local);
+        assert(!(flags & ~IMPORT_FLAGS_MASK_RAW));
+        assert(offset == UINT64_MAX || FLAGS_SET(flags, IMPORT_DIRECT));
+
+        if (!import_validate_local(local, flags))
+                return -EINVAL;
+
+        if (i->input_fd >= 0)
+                return -EBUSY;
+
+        r = fd_nonblock(fd, true);
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&i->local, local);
+        if (r < 0)
+                return r;
+
+        i->flags = flags;
+        i->offset = offset;
+        i->size_max = size_max;
+
+        if (fstat(fd, &i->input_stat) < 0)
+                return -errno;
+
+        r = sd_event_add_io(i->event, &i->input_event_source, fd, EPOLLIN, raw_import_on_input, i);
+        if (r == -EPERM) {
+                /* This fd does not support epoll, for example because it is a regular file. Busy read in that case */
+                r = sd_event_add_defer(i->event, &i->input_event_source, raw_import_on_defer, i);
+                if (r < 0)
+                        return r;
+
+                r = sd_event_source_set_enabled(i->input_event_source, SD_EVENT_ON);
+        }
+        if (r < 0)
+                return r;
+
+        i->input_fd = fd;
+        return 0;
+}
diff --git a/src/import/import-raw.h b/src/import/import-raw.h
new file mode 100644
index 0000000..63384eb
--- /dev/null
+++ b/src/import/import-raw.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "import-common.h"
+#include "import-util.h"
+#include "macro.h"
+
+typedef struct RawImport RawImport;
+
+typedef void (*RawImportFinished)(RawImport *import, int error, void *userdata);
+
+int raw_import_new(RawImport **import, sd_event *event, const char *image_root, RawImportFinished on_finished, void *userdata);
+RawImport* raw_import_unref(RawImport *import);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(RawImport*, raw_import_unref);
+
+int raw_import_start(RawImport *i, int fd, const char *local, uint64_t offset, uint64_t size_max, ImportFlags flags);
diff --git a/src/import/import-tar.c b/src/import/import-tar.c
new file mode 100644
index 0000000..9020270
--- /dev/null
+++ b/src/import/import-tar.c
@@ -0,0 +1,380 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+
+#include "sd-daemon.h"
+#include "sd-event.h"
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "copy.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "import-compress.h"
+#include "import-tar.h"
+#include "install-file.h"
+#include "io-util.h"
+#include "machine-pool.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "qcow2-util.h"
+#include "ratelimit.h"
+#include "rm-rf.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+
+struct TarImport {
+        sd_event *event;
+
+        char *image_root;
+
+        TarImportFinished on_finished;
+        void *userdata;
+
+        char *local;
+        ImportFlags flags;
+
+        char *temp_path;
+        char *final_path;
+
+        int input_fd;
+        int tar_fd;
+
+        ImportCompress compress;
+
+        sd_event_source *input_event_source;
+
+        uint8_t buffer[16*1024];
+        size_t buffer_size;
+
+        uint64_t written_compressed;
+        uint64_t written_uncompressed;
+
+        struct stat input_stat;
+
+        pid_t tar_pid;
+
+        unsigned last_percent;
+        RateLimit progress_ratelimit;
+};
+
+TarImport* tar_import_unref(TarImport *i) {
+        if (!i)
+                return NULL;
+
+        sd_event_source_unref(i->input_event_source);
+
+        if (i->tar_pid > 1)
+                sigkill_wait(i->tar_pid);
+
+        rm_rf_subvolume_and_free(i->temp_path);
+
+        import_compress_free(&i->compress);
+
+        sd_event_unref(i->event);
+
+        safe_close(i->tar_fd);
+
+        free(i->final_path);
+        free(i->image_root);
+        free(i->local);
+        return mfree(i);
+}
+
+/* Allocates a new TarImport object and stores it in *ret.
+ *
+ * event       event loop to attach to; if NULL the default event loop is used
+ * image_root  directory to place images in; if NULL "/var/lib/machines" is used
+ * on_finished completion callback (may be NULL), invoked with userdata
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error of sd_event_default(). */
+int tar_import_new(
+                TarImport **ret,
+                sd_event *event,
+                const char *image_root,
+                TarImportFinished on_finished,
+                void *userdata) {
+
+        _cleanup_(tar_import_unrefp) TarImport *i = NULL;
+        _cleanup_free_ char *root = NULL;
+        int r;
+
+        assert(ret);
+
+        root = strdup(image_root ?: "/var/lib/machines");
+        if (root == NULL)
+                return -ENOMEM;
+
+        i = new(TarImport, 1);
+        if (i == NULL)
+                return -ENOMEM;
+
+        /* Fully initialize the object before anything can fail, so that the cleanup handler is safe */
+        *i = (TarImport) {
+                .image_root = TAKE_PTR(root),
+                .on_finished = on_finished,
+                .userdata = userdata,
+                .input_fd = -EBADF,
+                .tar_fd = -EBADF,
+                .last_percent = UINT_MAX,
+                .progress_ratelimit = { 100 * USEC_PER_MSEC, 1 },
+        };
+
+        if (!event) {
+                r = sd_event_default(&i->event);
+                if (r < 0)
+                        return r;
+        } else
+                i->event = sd_event_ref(event);
+
+        *ret = TAKE_PTR(i);
+        return 0;
+}
+
+/* Sends a progress notification and logs the percentage of input consumed so far. Nothing is reported if the
+ * input is not a regular file, if the percentage did not change, or if the rate limit was hit. */
+static void tar_import_report_progress(TarImport *i) {
+        uint64_t total;
+        unsigned percent;
+
+        assert(i);
+
+        /* Without a regular file as source there is no size to compare against */
+        if (!S_ISREG(i->input_stat.st_mode))
+                return;
+
+        total = (uint64_t) i->input_stat.st_size;
+
+        percent = i->written_compressed >= total
+                ? 100
+                : (unsigned) ((i->written_compressed * UINT64_C(100)) / total);
+
+        /* The rate limit is only consulted when there is something new to report */
+        if (percent == i->last_percent || !ratelimit_below(&i->progress_ratelimit))
+                return;
+
+        sd_notifyf(false, "X_IMPORT_PROGRESS=%u%%", percent);
+        log_info("Imported %u%%.", percent);
+
+        i->last_percent = percent;
+}
+
+/* Completes an import after all input was written: closes the fd to tar, waits for tar to exit, post-processes
+ * the extracted tree and moves it to its final location.
+ *
+ * Returns 0 on success, -EPROTO if tar exited with a non-zero status, or another negative errno-style
+ * error. */
+static int tar_import_finish(TarImport *i) {
+        const char *tree;
+        int r;
+
+        assert(i);
+        assert(i->tar_fd >= 0);
+
+        /* Close the fd to tar before waiting for the process */
+        i->tar_fd = safe_close(i->tar_fd);
+
+        if (i->tar_pid > 0) {
+                r = wait_for_terminate_and_check("tar", TAKE_PID(i->tar_pid), WAIT_LOG);
+                if (r < 0)
+                        return r;
+                if (r != EXIT_SUCCESS)
+                        return -EPROTO;
+        }
+
+        /* The temporary path is set unless we extracted directly to the destination */
+        tree = i->temp_path ?: i->local;
+        assert_se(tree);
+
+        r = import_mangle_os_tree(tree);
+        if (r < 0)
+                return r;
+
+        r = install_file(
+                        AT_FDCWD, tree,
+                        AT_FDCWD, i->final_path,
+                        ((i->flags & IMPORT_FORCE) ? INSTALL_REPLACE : 0) |
+                        ((i->flags & IMPORT_READ_ONLY) ? INSTALL_READ_ONLY : 0) |
+                        ((i->flags & IMPORT_SYNC) ? INSTALL_SYNCFS : 0));
+        if (r < 0)
+                return log_error_errno(r, "Failed to move '%s' into place: %m", i->final_path ?: i->local);
+
+        /* Nothing left to clean up at the temporary location */
+        i->temp_path = mfree(i->temp_path);
+
+        return 0;
+}
+
+/* Prepares the extraction directory (or btrfs subvolume) and forks off tar to extract into it. In
+ * IMPORT_DIRECT mode the tree is extracted right at i->local, otherwise at a temporary path next to the
+ * final path below the image root.
+ *
+ * Returns 0 on success, a negative errno-style error otherwise. */
+static int tar_import_fork_tar(TarImport *i) {
+        const char *target, *pool = NULL;
+        bool direct;
+        int r;
+
+        assert(i);
+        assert(i->local);
+        assert(!i->final_path);
+        assert(!i->temp_path);
+        assert(i->tar_fd < 0);
+
+        direct = (i->flags & IMPORT_DIRECT) != 0;
+
+        if (direct)
+                target = i->local;
+        else {
+                i->final_path = path_join(i->image_root, i->local);
+                if (!i->final_path)
+                        return log_oom();
+
+                r = tempfn_random(i->final_path, NULL, &i->temp_path);
+                if (r < 0)
+                        return log_oom();
+
+                target = i->temp_path;
+                pool = i->image_root;
+        }
+
+        assert(target);
+
+        (void) mkdir_parents_label(target, 0700);
+
+        if (FLAGS_SET(i->flags, IMPORT_DIRECT|IMPORT_FORCE))
+                (void) rm_rf(target, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
+
+        r = (i->flags & IMPORT_BTRFS_SUBVOL)
+                ? btrfs_subvol_make_fallback(AT_FDCWD, target, 0755)
+                : RET_NERRNO(mkdir(target, 0755));
+
+        /* EEXIST is OK if in direct mode, but not otherwise, because in that case our temporary path
+         * collided */
+        if (r == -EEXIST && direct)
+                r = 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to create directory/subvolume %s: %m", target);
+
+        if (r > 0 && (i->flags & IMPORT_BTRFS_QUOTA)) { /* actually btrfs subvol */
+                if (!direct)
+                        (void) import_assign_pool_quota_and_warn(pool);
+                (void) import_assign_pool_quota_and_warn(target);
+        }
+
+        i->tar_fd = import_fork_tar_x(target, &i->tar_pid);
+        return i->tar_fd < 0 ? i->tar_fd : 0;
+}
+
+/* Write callback handed to import_uncompress(): forwards sz bytes at p to tar and accounts for them.
+ * Returns 0 on success or the negative error of loop_write(). */
+static int tar_import_write(const void *p, size_t sz, void *userdata) {
+        TarImport *i = userdata;
+        int r = loop_write(i->tar_fd, p, sz);
+
+        if (r < 0)
+                return r;
+
+        i->written_uncompressed += sz;
+        return 0;
+}
+
+/* Performs one read/decode/write step on the input. Until the compression format is known, data is only
+ * accumulated in the buffer; once it is known tar is forked off and the data is decoded into it. On EOF or
+ * on error the completion callback is invoked, or the event loop is exited if there is none.
+ *
+ * Always returns 0; failures are reported through the callback or the event loop exit code. */
+static int tar_import_process(TarImport *i) {
+        ssize_t n;
+        bool eof;
+        int r;
+
+        assert(i);
+        assert(i->buffer_size < sizeof(i->buffer));
+
+        n = read(i->input_fd, i->buffer + i->buffer_size, sizeof(i->buffer) - i->buffer_size);
+        if (n < 0) {
+                /* Nothing to read right now, wait for the next invocation */
+                if (errno == EAGAIN)
+                        return 0;
+
+                r = log_error_errno(errno, "Failed to read input file: %m");
+                goto finish;
+        }
+
+        eof = n == 0;
+        i->buffer_size += n;
+
+        if (i->compress.type == IMPORT_COMPRESS_UNKNOWN) {
+
+                if (!eof) {
+                        r = import_uncompress_detect(&i->compress, i->buffer, i->buffer_size);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to detect file compression: %m");
+                                goto finish;
+                        }
+                        if (r == 0) /* Need more data */
+                                return 0;
+                } else {
+                        log_debug("File too short to be compressed, as no compression signature fits in, thus assuming uncompressed.");
+                        import_uncompress_force_off(&i->compress);
+                }
+
+                /* The format is settled now, hence start the extractor */
+                r = tar_import_fork_tar(i);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = import_uncompress(&i->compress, i->buffer, i->buffer_size, tar_import_write, i);
+        if (r < 0) {
+                log_error_errno(r, "Failed to decode and write: %m");
+                goto finish;
+        }
+
+        /* The whole buffer was handed to the decoder */
+        i->written_compressed += i->buffer_size;
+        i->buffer_size = 0;
+
+        if (eof) {
+                r = tar_import_finish(i);
+                goto finish;
+        }
+
+        tar_import_report_progress(i);
+        return 0;
+
+finish:
+        if (i->on_finished)
+                i->on_finished(i, r, i->userdata);
+        else
+                sd_event_exit(i->event, r);
+
+        return 0;
+}
+
+/* IO event handler for the input fd: runs one processing step on the TarImport passed as userdata. */
+static int tar_import_on_input(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        return tar_import_process(userdata);
+}
+
+/* Defer event handler, used when the input fd cannot be polled: runs one processing step on the TarImport
+ * passed as userdata. */
+static int tar_import_on_defer(sd_event_source *s, void *userdata) {
+        return tar_import_process(userdata);
+}
+
+/* Starts importing the tar stream readable from fd as 'local'. The fd is switched to non-blocking mode and
+ * is watched via the event loop; it is not closed by the import object.
+ *
+ * Returns 0 on success, -EINVAL if 'local' does not validate, -EBUSY if an import was already started on
+ * this object, or another negative errno-style error. */
+int tar_import_start(TarImport *i, int fd, const char *local, ImportFlags flags) {
+        int r;
+
+        assert(i);
+        assert(fd >= 0);
+        assert(local);
+        assert((flags & ~IMPORT_FLAGS_MASK_TAR) == 0);
+
+        if (!import_validate_local(local, flags))
+                return -EINVAL;
+
+        /* input_fd is only set once an import was started successfully */
+        if (i->input_fd >= 0)
+                return -EBUSY;
+
+        r = fd_nonblock(fd, true);
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&i->local, local);
+        if (r < 0)
+                return r;
+
+        i->flags = flags;
+
+        if (fstat(fd, &i->input_stat) < 0)
+                return -errno;
+
+        r = sd_event_add_io(i->event, &i->input_event_source, fd, EPOLLIN, tar_import_on_input, i);
+        if (r == -EPERM) {
+                /* This fd does not support epoll, for example because it is a regular file. Busy read in
+                 * that case */
+                r = sd_event_add_defer(i->event, &i->input_event_source, tar_import_on_defer, i);
+                if (r >= 0)
+                        r = sd_event_source_set_enabled(i->input_event_source, SD_EVENT_ON);
+        }
+        if (r < 0)
+                return r;
+
+        i->input_fd = fd;
+        return 0;
+}
diff --git a/src/import/import-tar.h b/src/import/import-tar.h
new file mode 100644
index 0000000..63b0bd4
--- /dev/null
+++ b/src/import/import-tar.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "import-common.h"
+#include "import-util.h"
+#include "macro.h"
+
+typedef struct TarImport TarImport; /* opaque to users of this header */
+/* Completion callback: receives the import object, the result (0 on success) and the userdata pointer. */
+typedef void (*TarImportFinished)(TarImport *import, int error, void *userdata);
+/* tar_import_new() allocates an importer, tar_import_unref() releases it (NULL is accepted). */
+int tar_import_new(TarImport **import, sd_event *event, const char *image_root, TarImportFinished on_finished, void *userdata);
+TarImport* tar_import_unref(TarImport *import);
+/* Provides tar_import_unrefp() for use with _cleanup_(). */
+DEFINE_TRIVIAL_CLEANUP_FUNC(TarImport*, tar_import_unref);
+/* Starts importing the tar stream readable from fd under the name or path 'local'. */
+int tar_import_start(TarImport *import, int fd, const char *local, ImportFlags flags);
diff --git a/src/import/import.c b/src/import/import.c
new file mode 100644
index 0000000..a81617d
--- /dev/null
+++ b/src/import/import.c
@@ -0,0 +1,492 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "sd-event.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "discover-image.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-raw.h"
+#include "import-tar.h"
+#include "import-util.h"
+#include "io-util.h"
+#include "main-func.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+static const char *arg_image_root = "/var/lib/machines"; /* --image-root= */
+static ImportFlags arg_import_flags = IMPORT_BTRFS_SUBVOL | IMPORT_BTRFS_QUOTA | IMPORT_CONVERT_QCOW2 | IMPORT_SYNC; /* defaults; adjusted by parse_env() and parse_argv() */
+static uint64_t arg_offset = UINT64_MAX, arg_size_max = UINT64_MAX; /* --offset=, --size-max=; UINT64_MAX means unset */
+
+/* Validates the local target name and returns a newly allocated copy in *ret.
+ *
+ * In --direct mode 'local' is a path: it is required, made absolute below the image root if it is relative,
+ * and must be a valid path. Otherwise 'local' is an image name: it defaults to "imported", must be a valid
+ * hostname, and unless --force is set no image of that name may exist yet.
+ *
+ * Returns 0 on success, a negative errno-style error otherwise (already logged). */
+static int normalize_local(const char *local, char **ret) {
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        assert(ret);
+
+        if (arg_import_flags & IMPORT_DIRECT) {
+
+                if (!local)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No local path specified.");
+
+                if (!path_is_absolute(local))  {
+                        copy = path_join(arg_image_root, local);
+                        if (!copy)
+                                return log_oom();
+
+                        local = copy;
+                }
+
+                if (!path_is_valid(local))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Local path name '%s' is not valid.", local);
+        } else {
+                if (!local)
+                        local = "imported";
+                else if (!hostname_is_valid(local, 0))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Local image name '%s' is not valid.",
+                                               local);
+
+                if (!FLAGS_SET(arg_import_flags, IMPORT_FORCE)) {
+                        r = image_find(IMAGE_MACHINE, local, NULL, NULL);
+                        if (r >= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                                       "Image '%s' already exists.",
+                                                       local);
+                        if (r != -ENOENT)
+                                return log_error_errno(r, "Failed to check whether image '%s' exists: %m", local);
+                }
+        }
+
+        /* Unless the joined path is returned, hand out a copy of the name */
+        if (!copy) {
+                copy = strdup(local);
+                if (!copy)
+                        return log_oom();
+        }
+
+        *ret = TAKE_PTR(copy);
+        return 0;
+}
+
+/* Determines the fd to import from and logs what is about to happen. If 'path' is set the file is opened and
+ * the new fd is stored in *ret_open_fd, so that the caller can close it; otherwise standard input is used and
+ * *ret_open_fd is set to -EBADF.
+ *
+ * Returns the fd to read from, or a negative errno-style error (already logged). */
+static int open_source(const char *path, const char *local, int *ret_open_fd) {
+        _cleanup_close_ int open_fd = -EBADF;
+        _cleanup_free_ char *pretty = NULL;
+        const char *source;
+        int retval;
+
+        assert(local);
+        assert(ret_open_fd);
+
+        if (path) {
+                open_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+                if (open_fd < 0)
+                        return log_error_errno(errno, "Failed to open source file '%s': %m", path);
+
+                retval = open_fd;
+                source = path;
+        } else {
+                retval = STDIN_FILENO;
+
+                /* Best effort only: fall back to an empty name if the path cannot be determined */
+                (void) fd_get_path(STDIN_FILENO, &pretty);
+                source = strempty(pretty);
+        }
+
+        if (arg_offset != UINT64_MAX)
+                log_info("Importing '%s', saving at offset %" PRIu64 " in '%s'.", source, arg_offset, local);
+        else
+                log_info("Importing '%s', saving as '%s'.", source, local);
+
+        *ret_open_fd = TAKE_FD(open_fd);
+        return retval;
+}
+
+/* Completion callback of the tar importer: logs success and makes the event loop passed as userdata exit
+ * with the absolute value of the error as exit code. */
+static void on_tar_finished(TarImport *import, int error, void *userdata) {
+        sd_event *loop = userdata;
+
+        assert(import);
+
+        if (!error)
+                log_info("Operation completed successfully.");
+
+        sd_event_exit(loop, abs(error));
+}
+
+/* Implements the "tar" verb.
+ *
+ * argv[1] is the source file and argv[2] the optional local name; both are passed through
+ * empty_or_dash_to_null(). If no local name is given but a source path is, the name is derived from the file
+ * name of the source with tar suffixes stripped.
+ *
+ * Returns the negated exit code of the event loop (0 on success), or a negative errno-style error if setting
+ * up the import failed. */
+static int import_tar(int argc, char *argv[], void *userdata) {
+        _cleanup_(tar_import_unrefp) TarImport *import = NULL;
+        _cleanup_free_ char *ll = NULL, *normalized = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        const char *path = NULL, *local = NULL;
+        _cleanup_close_ int open_fd = -EBADF;
+        int r, fd;
+
+        if (argc >= 2)
+                path = empty_or_dash_to_null(argv[1]);
+
+        if (argc >= 3)
+                local = empty_or_dash_to_null(argv[2]);
+        else if (path) {
+                _cleanup_free_ char *l = NULL;
+
+                r = path_extract_filename(path, &l);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+
+                r = tar_strip_suffixes(l, &ll);
+                if (r < 0)
+                        return log_oom();
+
+                local = ll;
+        }
+
+        r = normalize_local(local, &normalized);
+        if (r < 0)
+                return r;
+
+        /* open_source() reports failure through its return value. Propagate that, not 'r', which still
+         * holds the success result of normalize_local() at this point. */
+        fd = open_source(path, normalized, &open_fd);
+        if (fd < 0)
+                return fd;
+
+        r = import_allocate_event_with_signals(&event);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(arg_import_flags, IMPORT_SYNC))
+                log_info("File system synchronization on completion is off.");
+
+        r = tar_import_new(&import, event, arg_image_root, on_tar_finished, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate importer: %m");
+
+        r = tar_import_start(
+                        import,
+                        fd,
+                        normalized,
+                        arg_import_flags & IMPORT_FLAGS_MASK_TAR);
+        if (r < 0)
+                return log_error_errno(r, "Failed to import image: %m");
+
+        /* The loop's exit code is the absolute error value set by on_tar_finished() */
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        log_info("Exiting.");
+        return -r;
+}
+
+/* Completion callback of the raw importer: logs success and makes the event loop passed as userdata exit
+ * with the absolute value of the error as exit code. */
+static void on_raw_finished(RawImport *import, int error, void *userdata) {
+        sd_event *loop = userdata;
+
+        assert(import);
+
+        if (!error)
+                log_info("Operation completed successfully.");
+
+        sd_event_exit(loop, abs(error));
+}
+
+/* Implements the "raw" verb.
+ *
+ * argv[1] is the source file and argv[2] the optional local name; both are passed through
+ * empty_or_dash_to_null(). If no local name is given but a source path is, the name is derived from the file
+ * name of the source with raw suffixes stripped.
+ *
+ * Returns the negated exit code of the event loop (0 on success), or a negative errno-style error if setting
+ * up the import failed. */
+static int import_raw(int argc, char *argv[], void *userdata) {
+        _cleanup_(raw_import_unrefp) RawImport *import = NULL;
+        _cleanup_free_ char *stripped = NULL, *normalized = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        const char *path = NULL, *local = NULL;
+        _cleanup_close_ int open_fd = -EBADF;
+        int r, fd;
+
+        if (argc >= 2)
+                path = empty_or_dash_to_null(argv[1]);
+
+        if (argc >= 3)
+                local = empty_or_dash_to_null(argv[2]);
+        else if (path) {
+                _cleanup_free_ char *filename = NULL;
+
+                /* No name given, derive one from the source file name */
+                r = path_extract_filename(path, &filename);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+
+                r = raw_strip_suffixes(filename, &stripped);
+                if (r < 0)
+                        return log_oom();
+
+                local = stripped;
+        }
+
+        r = normalize_local(local, &normalized);
+        if (r < 0)
+                return r;
+
+        fd = open_source(path, normalized, &open_fd);
+        if (fd < 0)
+                return fd;
+
+        r = import_allocate_event_with_signals(&event);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(arg_import_flags, IMPORT_SYNC))
+                log_info("File system synchronization on completion is off.");
+
+        r = raw_import_new(&import, event, arg_image_root, on_raw_finished, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate importer: %m");
+
+        r = raw_import_start(
+                        import,
+                        fd,
+                        normalized,
+                        arg_offset,
+                        arg_size_max,
+                        arg_import_flags & IMPORT_FLAGS_MASK_RAW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to import image: %m");
+
+        /* The loop's exit code is the absolute error value set by on_raw_finished() */
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        log_info("Exiting.");
+        return -r;
+}
+
+/* Prints the usage text to standard output. Also usable as the "help" verb. Always returns 0. */
+static int help(int argc, char *argv[], void *userdata) {
+        const char *underline = ansi_underline(),
+                *highlight = ansi_highlight(),
+                *normal = ansi_normal();
+
+        /* Positional arguments: 1 = program name, 2/3 = underline on/off, 4/5 = highlight on/off */
+        printf("%1$s [OPTIONS...] {COMMAND} ...\n"
+               "\n%4$sImport container or virtual machine images.%5$s\n"
+               "\n%2$sCommands:%3$s\n"
+               "  tar FILE [NAME]             Import a TAR image\n"
+               "  raw FILE [NAME]             Import a RAW image\n"
+               "\n%2$sOptions:%3$s\n"
+               "  -h --help                   Show this help\n"
+               "     --version                Show package version\n"
+               "     --force                  Force creation of image\n"
+               "     --image-root=PATH        Image root directory\n"
+               "     --read-only              Create a read-only image\n"
+               "     --direct                 Import directly to specified file\n"
+               "     --btrfs-subvol=BOOL      Controls whether to create a btrfs subvolume\n"
+               "                              instead of a directory\n"
+               "     --btrfs-quota=BOOL       Controls whether to set up quota for btrfs\n"
+               "                              subvolume\n"
+               "     --convert-qcow2=BOOL     Controls whether to convert QCOW2 images to\n"
+               "                              regular disk images\n"
+               "     --sync=BOOL              Controls whether to sync() before completing\n"
+               "     --offset=BYTES           Offset to seek to in destination\n"
+               "     --size-max=BYTES         Maximum number of bytes to write to destination\n",
+               program_invocation_short_name,
+               underline,
+               normal,
+               highlight,
+               normal);
+
+        return 0;
+}
+
+/* Parses the command line options into arg_image_root, arg_import_flags, arg_offset and arg_size_max.
+ *
+ * Returns 1 if the program shall continue with verb dispatching, the result of help() (0) or version() if
+ * one of those was requested, or a negative errno-style error on invalid options. */
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_FORCE,
+                ARG_IMAGE_ROOT,
+                ARG_READ_ONLY,
+                ARG_DIRECT,
+                ARG_BTRFS_SUBVOL,
+                ARG_BTRFS_QUOTA,
+                ARG_CONVERT_QCOW2,
+                ARG_SYNC,
+                ARG_OFFSET,
+                ARG_SIZE_MAX,
+        };
+
+        static const struct option options[] = {
+                { "help",            no_argument,       NULL, 'h'                 },
+                { "version",         no_argument,       NULL, ARG_VERSION         },
+                { "force",           no_argument,       NULL, ARG_FORCE           },
+                { "image-root",      required_argument, NULL, ARG_IMAGE_ROOT      },
+                { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
+                { "direct",          no_argument,       NULL, ARG_DIRECT          },
+                { "btrfs-subvol",    required_argument, NULL, ARG_BTRFS_SUBVOL    },
+                { "btrfs-quota",     required_argument, NULL, ARG_BTRFS_QUOTA     },
+                { "convert-qcow2",   required_argument, NULL, ARG_CONVERT_QCOW2   },
+                { "sync",            required_argument, NULL, ARG_SYNC            },
+                { "offset",          required_argument, NULL, ARG_OFFSET          },
+                { "size-max",        required_argument, NULL, ARG_SIZE_MAX        },
+                {}
+        };
+
+        int r, c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_FORCE:
+                        arg_import_flags |= IMPORT_FORCE;
+                        break;
+
+                case ARG_IMAGE_ROOT:
+                        arg_image_root = optarg;
+                        break;
+
+                case ARG_READ_ONLY:
+                        arg_import_flags |= IMPORT_READ_ONLY;
+                        break;
+
+                case ARG_DIRECT:
+                        arg_import_flags |= IMPORT_DIRECT;
+                        break;
+
+                case ARG_BTRFS_SUBVOL:
+                        r = parse_boolean_argument("--btrfs-subvol=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_import_flags, IMPORT_BTRFS_SUBVOL, r);
+                        break;
+
+                case ARG_BTRFS_QUOTA:
+                        r = parse_boolean_argument("--btrfs-quota=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_import_flags, IMPORT_BTRFS_QUOTA, r);
+                        break;
+
+                case ARG_CONVERT_QCOW2:
+                        r = parse_boolean_argument("--convert-qcow2=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_import_flags, IMPORT_CONVERT_QCOW2, r);
+                        break;
+
+                case ARG_SYNC:
+                        r = parse_boolean_argument("--sync=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_import_flags, IMPORT_SYNC, r);
+                        break;
+
+                case ARG_OFFSET: {
+                        uint64_t u;
+
+                        r = safe_atou64(optarg, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --offset= argument: %s", optarg);
+                        if (!FILE_SIZE_VALID(u))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Argument to --offset= switch too large: %s", optarg);
+
+                        arg_offset = u;
+                        break;
+                }
+
+                case ARG_SIZE_MAX: {
+                        uint64_t u;
+
+                        r = parse_size(optarg, 1024, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --size-max= argument: %s", optarg);
+                        if (!FILE_SIZE_VALID(u))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Argument to --size-max= switch too large: %s", optarg);
+
+                        arg_size_max = u;
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        /* Make sure offset+size is still in the valid range if both set. The first check guards the
+         * addition in the second one against wrapping around. */
+        if (arg_offset != UINT64_MAX && arg_size_max != UINT64_MAX &&
+            ((arg_size_max > (UINT64_MAX - arg_offset)) ||
+             !FILE_SIZE_VALID(arg_offset + arg_size_max)))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File offset and maximum size out of range.");
+
+        if (arg_offset != UINT64_MAX && !FLAGS_SET(arg_import_flags, IMPORT_DIRECT))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File offset only supported in --direct mode.");
+
+        return 1;
+}
+
+/* Dispatches to the verb given on the command line. "tar" and "raw" take one or two arguments besides the
+ * verb itself. Returns whatever dispatch_verb() returns. */
+static int import_main(int argc, char *argv[]) {
+        static const Verb import_verbs[] = {
+                { "help", VERB_ANY, VERB_ANY, 0, help       },
+                { "tar",  2,        3,        0, import_tar },
+                { "raw",  2,        3,        0, import_raw },
+                {}
+        };
+
+        return dispatch_verb(argc, argv, import_verbs, NULL);
+}
+
+/* Applies boolean environment variables to arg_import_flags. A variable for which getenv_bool() returns
+ * -ENXIO is skipped silently; any other parse failure is logged as a warning and otherwise ignored. */
+static void parse_env(void) {
+        int r;
+
+        /* These relatively low-level settings are also controllable via environment variables, so that
+         * users can set them for systemd-import if they want to tweak its behaviour. */
+
+        r = getenv_bool("SYSTEMD_IMPORT_BTRFS_SUBVOL");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_IMPORT_BTRFS_SUBVOL: %m");
+        } else
+                SET_FLAG(arg_import_flags, IMPORT_BTRFS_SUBVOL, r);
+
+        r = getenv_bool("SYSTEMD_IMPORT_BTRFS_QUOTA");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_IMPORT_BTRFS_QUOTA: %m");
+        } else
+                SET_FLAG(arg_import_flags, IMPORT_BTRFS_QUOTA, r);
+
+        r = getenv_bool("SYSTEMD_IMPORT_SYNC");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_IMPORT_SYNC: %m");
+        } else
+                SET_FLAG(arg_import_flags, IMPORT_SYNC, r);
+}
+
+/* Program entry point proper: sets up locale and logging, reads settings from the environment first and the
+ * command line second, then dispatches the verb. Returns early with the result of parse_argv() if that is
+ * zero or negative. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        setlocale(LC_ALL, "");
+        log_parse_environment();
+        log_open();
+
+        /* Environment first, so that command line options are applied on top */
+        parse_env();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        (void) ignore_signals(SIGPIPE);
+
+        return import_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/import/importd.c b/src/import/importd.c
new file mode 100644
index 0000000..e1a1ddc
--- /dev/null
+++ b/src/import/importd.c
@@ -0,0 +1,1422 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-log-control-api.h"
+#include "bus-polkit.h"
+#include "common-signal.h"
+#include "constants.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "float.h"
+#include "hostname-util.h"
+#include "import-util.h"
+#include "machine-pool.h"
+#include "main-func.h"
+#include "missing_capability.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "service-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "user-util.h"
+#include "web-util.h"
+
+typedef struct Transfer Transfer;
+typedef struct Manager Manager;
+
+typedef enum TransferType { /* kind of operation a Transfer performs; string names are in transfer_type_table */
+        TRANSFER_IMPORT_TAR, /* "import-tar" */
+        TRANSFER_IMPORT_RAW, /* "import-raw" */
+        TRANSFER_IMPORT_FS, /* "import-fs" */
+        TRANSFER_EXPORT_TAR, /* "export-tar" */
+        TRANSFER_EXPORT_RAW, /* "export-raw" */
+        TRANSFER_PULL_TAR, /* "pull-tar" */
+        TRANSFER_PULL_RAW, /* "pull-raw" */
+        _TRANSFER_TYPE_MAX, /* number of valid types; sizes transfer_type_table */
+        _TRANSFER_TYPE_INVALID = -EINVAL, /* type of a Transfer freshly created by transfer_new() */
+} TransferType;
+
+struct Transfer { /* One transfer registered with the Manager */
+        Manager *manager; /* set once the transfer was added to manager->transfers */
+
+        uint32_t id; /* key of this transfer in manager->transfers */
+        char *object_path; /* "/org/freedesktop/import1/transfer/_" followed by the id */
+
+        TransferType type; /* _TRANSFER_TYPE_INVALID until assigned */
+        ImportVerify verify; /* _IMPORT_VERIFY_INVALID until assigned */
+
+        char *remote; /* owned string, freed in transfer_unref() */
+        char *local; /* owned string, freed in transfer_unref() */
+        bool force_local;
+        bool read_only;
+
+        char *format; /* owned string, freed in transfer_unref() */
+
+        pid_t pid; /* killed via sigkill_wait() in transfer_unref() if greater than 1 */
+
+        int log_fd; /* closed in transfer_unref() */
+
+        char log_message[LINE_MAX]; /* buffered log data that was not sent out yet */
+        size_t log_message_size; /* number of bytes used in log_message */
+
+        sd_event_source *pid_event_source;
+        sd_event_source *log_event_source;
+
+        unsigned n_canceled;
+        unsigned progress_percent; /* initialized to UINT_MAX, which transfer_percent_as_double() maps to -DBL_MAX */
+
+        int stdin_fd; /* closed in transfer_unref() */
+        int stdout_fd; /* closed in transfer_unref() */
+};
+
+struct Manager {
+        sd_event *event;
+        sd_bus *bus; /* bus the per-transfer signals are emitted on */
+
+        uint32_t current_transfer_id; /* id handed out to the most recently created transfer */
+        Hashmap *transfers; /* maps transfer id to Transfer; transfer_new() refuses more than TRANSFERS_MAX entries */
+
+        Hashmap *polkit_registry;
+
+        int notify_fd;
+
+        sd_event_source *notify_event_source;
+
+        bool use_btrfs_subvol;
+        bool use_btrfs_quota;
+};
+
+#define TRANSFERS_MAX 64 /* transfer_new() fails with -E2BIG once this many transfers are registered */
+/* String name of each TransferType, indexed by the enum value. */
+static const char* const transfer_type_table[_TRANSFER_TYPE_MAX] = {
+        [TRANSFER_IMPORT_TAR] = "import-tar",
+        [TRANSFER_IMPORT_RAW] = "import-raw",
+        [TRANSFER_IMPORT_FS] = "import-fs",
+        [TRANSFER_EXPORT_TAR] = "export-tar",
+        [TRANSFER_EXPORT_RAW] = "export-raw",
+        [TRANSFER_PULL_TAR] = "pull-tar",
+        [TRANSFER_PULL_RAW] = "pull-raw",
+};
+/* NOTE(review): presumably generates a file-local transfer_type_to_string() from the table above; confirm against string-table.h. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(transfer_type, TransferType);
+
+/* Releases a Transfer object: removes it from its manager's table, drops its event sources, frees its
+ * strings, kills the associated process and closes its fds. NULL is accepted. The result of mfree() is
+ * returned so that callers can reset their pointer in one statement. */
+static Transfer *transfer_unref(Transfer *t) {
+        if (t == NULL)
+                return NULL;
+
+        /* The manager pointer is only set once the transfer was registered */
+        if (t->manager)
+                hashmap_remove(t->manager->transfers, UINT32_TO_PTR(t->id));
+
+        sd_event_source_unref(t->pid_event_source);
+        sd_event_source_unref(t->log_event_source);
+
+        free(t->object_path);
+        free(t->format);
+        free(t->local);
+        free(t->remote);
+
+        /* Only PIDs greater than 1 are ever killed */
+        if (t->pid > 1)
+                sigkill_wait(t->pid);
+
+        safe_close(t->log_fd);
+        safe_close(t->stdin_fd);
+        safe_close(t->stdout_fd);
+
+        return mfree(t);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Transfer*, transfer_unref);
+
+/* Allocates a new Transfer, assigns it the next transfer id and registers it with the manager.
+ *
+ * Returns 0 on success and stores the object in *ret; returns -E2BIG if TRANSFERS_MAX transfers are already
+ * registered, -ENOMEM on allocation failure, or the error of hashmap_ensure_put(). */
+static int transfer_new(Manager *m, Transfer **ret) {
+        _cleanup_(transfer_unrefp) Transfer *t = NULL;
+        uint32_t id;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        if (hashmap_size(m->transfers) >= TRANSFERS_MAX)
+                return -E2BIG;
+
+        t = new(Transfer, 1);
+        if (t == NULL)
+                return -ENOMEM;
+
+        *t = (Transfer) {
+                .type = _TRANSFER_TYPE_INVALID,
+                .verify = _IMPORT_VERIFY_INVALID,
+                .progress_percent= UINT_MAX,
+                .log_fd = -EBADF,
+                .stdin_fd = -EBADF,
+                .stdout_fd = -EBADF,
+        };
+
+        id = m->current_transfer_id + 1;
+
+        if (asprintf(&t->object_path, "/org/freedesktop/import1/transfer/_%" PRIu32, id) < 0)
+                return -ENOMEM;
+
+        r = hashmap_ensure_put(&m->transfers, &trivial_hash_ops, UINT32_TO_PTR(id), t);
+        if (r < 0)
+                return r;
+
+        /* Only now that the transfer is registered, consume the id and link the transfer to its manager.
+         * transfer_unref() uses the manager pointer to decide whether to remove the table entry. */
+        m->current_transfer_id = id;
+        t->id = id;
+        t->manager = m;
+
+        *ret = TAKE_PTR(t);
+        return 0;
+}
+
+/* Returns the transfer's progress as a fraction (stored percentage divided by 100), or -DBL_MAX
+ * while no progress value has been recorded yet (progress_percent still at UINT_MAX). */
+static double transfer_percent_as_double(Transfer *t) {
+        assert(t);
+
+        return t->progress_percent == UINT_MAX
+                ? -DBL_MAX
+                : (double) t->progress_percent / 100.0;
+}
+
+/* Logs a single line of worker output locally and forwards it as "LogMessage" signal on the
+ * transfer's bus object. Failure to emit the signal is logged and otherwise ignored. */
+static void transfer_send_log_line(Transfer *t, const char *line) {
+        int priority = LOG_INFO;
+        int r;
+
+        assert(t);
+        assert(line);
+
+        /* Receives pointers to both 'line' and 'priority', i.e. may adjust either of them */
+        syslog_parse_priority(&line, &priority, true);
+
+        log_full(priority, "(transfer%" PRIu32 ") %s", t->id, line);
+
+        r = sd_bus_emit_signal(
+                        t->manager->bus,
+                        t->object_path,
+                        "org.freedesktop.import1.Transfer",
+                        "LogMessage",
+                        "us",
+                        priority,
+                        line);
+        if (r >= 0)
+                return;
+
+        log_warning_errno(r, "Cannot emit log message signal, ignoring: %m");
+}
+
+/* Splits the buffered worker output in t->log_message into lines and sends each non-empty line
+ * out via transfer_send_log_line().
+ *
+ * A line ends at the first NUL or newline byte. Data without a terminator is kept in the buffer
+ * until more arrives, unless 'flush' is true or the buffer is completely full (in which case no
+ * further data could be received, so the partial line is sent as-is).
+ *
+ * Sending is best-effort: if a line cannot be duplicated it is dropped from the buffer anyway,
+ * so that this function never fails and never stalls. */
+static void transfer_send_logs(Transfer *t, bool flush) {
+        assert(t);
+
+        while (t->log_message_size > 0) {
+                _cleanup_free_ char *n = NULL;
+                char *a, *b, *e;
+
+                /* Always look for a terminator first, even if the buffer is full: a single read
+                 * may have delivered several complete lines at once, and those must not be
+                 * merged into one message. */
+                a = memchr(t->log_message, 0, t->log_message_size);
+                b = memchr(t->log_message, '\n', t->log_message_size);
+
+                if (a && b)
+                        e = a < b ? a : b;
+                else
+                        e = a ? a : b;
+
+                if (!e) {
+                        /* Unterminated data: wait for the rest, unless there is no room left to
+                         * receive it or the caller wants everything out now. */
+                        if (!flush && t->log_message_size < sizeof(t->log_message))
+                                return;
+
+                        e = t->log_message + t->log_message_size;
+                }
+
+                n = strndup(t->log_message, e - t->log_message);
+
+                /* Skip over NUL and newlines */
+                while (e < t->log_message + t->log_message_size && IN_SET(*e, 0, '\n'))
+                        e++;
+
+                /* Drop the consumed part and move only the still-valid remainder to the front */
+                t->log_message_size -= e - t->log_message;
+                memmove(t->log_message, e, t->log_message_size);
+
+                if (!n) {
+                        log_oom();
+                        continue;
+                }
+
+                if (isempty(n))
+                        continue;
+
+                transfer_send_log_line(t, n);
+        }
+}
+
+/* Concludes a transfer: flushes all buffered log output, announces "TransferRemoved" on the
+ * manager object with a result of "done", "canceled" or "failed", and destroys the transfer.
+ * Always returns 0; a failure to emit the signal is only logged. 't' is invalid afterwards. */
+static int transfer_finalize(Transfer *t, bool success) {
+        const char *result;
+        int r;
+
+        assert(t);
+
+        transfer_send_logs(t, true);
+
+        if (success)
+                result = "done";
+        else if (t->n_canceled > 0)
+                result = "canceled";
+        else
+                result = "failed";
+
+        r = sd_bus_emit_signal(
+                        t->manager->bus,
+                        "/org/freedesktop/import1",
+                        "org.freedesktop.import1.Manager",
+                        "TransferRemoved",
+                        "uos",
+                        t->id,
+                        t->object_path,
+                        result);
+        if (r < 0)
+                log_error_errno(r, "Cannot emit message: %m");
+
+        transfer_unref(t);
+        return 0;
+}
+
+/* Asks the worker process of a transfer to stop. The first three requests send SIGTERM, any
+ * later one sends SIGKILL. The request counter is only bumped if signalling succeeded.
+ * Returns 0 on success or the error of kill_and_sigcont(). */
+static int transfer_cancel(Transfer *t) {
+        int sig, r;
+
+        assert(t);
+
+        if (t->n_canceled < 3)
+                sig = SIGTERM;
+        else
+                sig = SIGKILL;
+
+        r = kill_and_sigcont(t->pid, sig);
+        if (r < 0)
+                return r;
+
+        t->n_canceled++;
+        return 0;
+}
+
+/* Child event handler: invoked when the worker process of a transfer terminated. Logs how it
+ * ended, forgets the PID and finalizes the transfer; only a regular exit with status 0 counts
+ * as success. */
+static int transfer_on_pid(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        Transfer *t = ASSERT_PTR(userdata);
+        bool success = false;
+
+        assert(s);
+
+        switch (si->si_code) {
+
+        case CLD_EXITED:
+                success = si->si_status == 0;
+                if (success)
+                        log_debug("Transfer process succeeded.");
+                else
+                        log_error("Transfer process failed with exit code %i.", si->si_status);
+                break;
+
+        case CLD_KILLED:
+        case CLD_DUMPED:
+                log_error("Transfer process terminated by signal %s.", signal_to_string(si->si_status));
+                break;
+
+        default:
+                log_error("Transfer process failed due to unknown reason.");
+        }
+
+        /* The process is gone, make sure transfer_unref() does not try to kill it anymore */
+        t->pid = 0;
+
+        return transfer_finalize(t, success);
+}
+
+/* I/O handler for the log pipe of a transfer: appends whatever can be read to the transfer's
+ * log buffer and sends out all complete lines. Always returns 0. */
+static int transfer_on_log(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Transfer *t = ASSERT_PTR(userdata);
+        size_t space;
+        ssize_t n_read;
+
+        assert(s);
+
+        space = sizeof(t->log_message) - t->log_message_size;
+
+        n_read = read(fd, t->log_message + t->log_message_size, space);
+        if (n_read > 0) {
+                t->log_message_size += n_read;
+                transfer_send_logs(t, false);
+                return 0;
+        }
+
+        if (n_read < 0)
+                log_error_errno(errno, "Failed to read log message: %m");
+
+        /* EOF or read error: stop watching the pipe. Nothing else is done here; the transfer is
+         * wrapped up once the SIGCHLD for the worker arrives. */
+        t->log_event_source = sd_event_source_unref(t->log_event_source);
+        return 0;
+}
+
+/* Forks off the worker process for a transfer and hooks it up to the event loop.
+ *
+ * The child builds the command line for the helper binary matching t->type and executes it. The
+ * parent keeps the read end of a pipe as t->log_fd, watches it and the child's exit, and finally
+ * announces the transfer with a "TransferNew" signal.
+ *
+ * Returns 0 on success, a negative errno-style error otherwise. On failure in the parent the
+ * already forked child is not cleaned up here; t->pid stays set for whoever destroys 't'. */
+static int transfer_start(Transfer *t) {
+        _cleanup_close_pair_ int pipefd[2] = EBADF_PAIR;
+        int r;
+
+        assert(t);
+        assert(t->pid <= 0);
+
+        if (pipe2(pipefd, O_CLOEXEC) < 0)
+                return -errno;
+
+        /* The fd triple handed to the fork is: t->stdin_fd, then t->stdout_fd (or the write end of
+         * the log pipe if unset), then the write end of the log pipe again.
+         * NOTE(review): with FORK_REARRANGE_STDIO these presumably become the child's
+         * stdin/stdout/stderr — confirm against safe_fork_full(). */
+        r = safe_fork_full("(sd-transfer)",
+                           (int[]) { t->stdin_fd, t->stdout_fd < 0 ? pipefd[1] : t->stdout_fd, pipefd[1] },
+                           NULL, 0,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO, &t->pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* The slots are filled front to back via 'k', so optional arguments that are left
+                 * out shift everything after them forward. The comments below describe the layout
+                 * when every optional argument is present. */
+                const char *cmd[] = {
+                        NULL, /* systemd-import, systemd-import-fs, systemd-export or systemd-pull */
+                        NULL, /* tar, raw or run */
+                        NULL, /* maybe --verify */
+                        NULL, /* if so: the verification mode */
+                        NULL, /* maybe --force */
+                        NULL, /* maybe --read-only */
+                        NULL, /* maybe --format */
+                        NULL, /* if so: the actual format */
+                        NULL, /* remote or "-" (not for exports) */
+                        NULL, /* maybe local */
+                        NULL, /* terminating NULL */
+                        NULL  /* spare */
+                };
+                unsigned k = 0;
+
+                /* Child */
+
+                if (setenv("SYSTEMD_LOG_TARGET", "console-prefixed", 1) < 0 ||
+                    setenv("NOTIFY_SOCKET", "/run/systemd/import/notify", 1) < 0) {
+                        log_error_errno(errno, "setenv() failed: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = setenv_systemd_exec_pid(true);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to update $SYSTEMD_EXEC_PID, ignoring: %m");
+
+                /* First argument: the helper binary */
+                switch (t->type) {
+
+                case TRANSFER_IMPORT_TAR:
+                case TRANSFER_IMPORT_RAW:
+                        cmd[k++] = SYSTEMD_IMPORT_PATH;
+                        break;
+
+                case TRANSFER_IMPORT_FS:
+                        cmd[k++] = SYSTEMD_IMPORT_FS_PATH;
+                        break;
+
+                case TRANSFER_EXPORT_TAR:
+                case TRANSFER_EXPORT_RAW:
+                        cmd[k++] = SYSTEMD_EXPORT_PATH;
+                        break;
+
+                case TRANSFER_PULL_TAR:
+                case TRANSFER_PULL_RAW:
+                        cmd[k++] = SYSTEMD_PULL_PATH;
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+
+                /* Second argument: the verb of the helper */
+                switch (t->type) {
+
+                case TRANSFER_IMPORT_TAR:
+                case TRANSFER_EXPORT_TAR:
+                case TRANSFER_PULL_TAR:
+                        cmd[k++] = "tar";
+                        break;
+
+                case TRANSFER_IMPORT_RAW:
+                case TRANSFER_EXPORT_RAW:
+                case TRANSFER_PULL_RAW:
+                        cmd[k++] = "raw";
+                        break;
+
+                case TRANSFER_IMPORT_FS:
+                        cmd[k++] = "run";
+                        break;
+
+                default:
+                        break;
+                }
+
+                if (t->verify != _IMPORT_VERIFY_INVALID) {
+                        cmd[k++] = "--verify";
+                        cmd[k++] = import_verify_to_string(t->verify);
+                }
+
+                if (t->force_local)
+                        cmd[k++] = "--force";
+                if (t->read_only)
+                        cmd[k++] = "--read-only";
+
+                if (t->format) {
+                        cmd[k++] = "--format";
+                        cmd[k++] = t->format;
+                }
+
+                /* Everything but exports takes a source argument; "-" is passed if there is no
+                 * remote */
+                if (!IN_SET(t->type, TRANSFER_EXPORT_TAR, TRANSFER_EXPORT_RAW)) {
+                        if (t->remote)
+                                cmd[k++] = t->remote;
+                        else
+                                cmd[k++] = "-";
+                }
+
+                if (t->local)
+                        cmd[k++] = t->local;
+                cmd[k] = NULL;
+
+                /* Only returns on failure */
+                execv(cmd[0], (char * const *) cmd);
+                log_error_errno(errno, "Failed to execute %s tool: %m", cmd[0]);
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Parent: keep only the read end of the log pipe */
+        pipefd[1] = safe_close(pipefd[1]);
+        t->log_fd = TAKE_FD(pipefd[0]);
+
+        /* The child has its own copy of the input fd now */
+        t->stdin_fd = safe_close(t->stdin_fd);
+
+        r = sd_event_add_child(t->manager->event, &t->pid_event_source,
+                               t->pid, WEXITED, transfer_on_pid, t);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_io(t->manager->event, &t->log_event_source,
+                            t->log_fd, EPOLLIN, transfer_on_log, t);
+        if (r < 0)
+                return r;
+
+        /* Make sure we always process log output before SIGCHLD */
+        r = sd_event_source_set_priority(t->log_event_source, SD_EVENT_PRIORITY_NORMAL -5);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_emit_signal(
+                        t->manager->bus,
+                        "/org/freedesktop/import1",
+                        "org.freedesktop.import1.Manager",
+                        "TransferNew",
+                        "uo",
+                        t->id,
+                        t->object_path);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Destroys a Manager together with all transfers still registered with it. Accepts NULL, in
+ * which case NULL is returned; otherwise returns the result of mfree(). */
+static Manager *manager_unref(Manager *m) {
+        if (!m)
+                return NULL;
+
+        sd_event_source_unref(m->notify_event_source);
+        safe_close(m->notify_fd);
+
+        /* transfer_unref() takes a registered transfer out of the hashmap, hence keep picking
+         * the first entry until none is left */
+        for (;;) {
+                Transfer *first = hashmap_first(m->transfers);
+
+                if (!first)
+                        break;
+
+                transfer_unref(first);
+        }
+
+        hashmap_free(m->transfers);
+
+        bus_verify_polkit_async_registry_free(m->polkit_registry);
+
+        m->bus = sd_bus_flush_close_unref(m->bus);
+        sd_event_unref(m->event);
+
+        return mfree(m);
+}
+
+/* Provides manager_unrefp() for use with _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unref);
+
+/* I/O handler for the notification socket. Receives one datagram, matches the sender's PID
+ * (taken from the SCM_CREDENTIALS control message) against the worker PIDs of the known
+ * transfers and, if the payload contains an "X_IMPORT_PROGRESS=" line, stores the parsed
+ * percentage in the matching transfer.
+ *
+ * Truncated datagrams, datagrams without credentials, datagrams from unknown senders and
+ * unparsable values are logged and ignored (return 0). Only a non-transient receive error is
+ * returned to the event loop. */
+static int manager_on_notify(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+
+        /* One extra byte so that the payload can be NUL-terminated below */
+        char buf[NOTIFY_BUFFER_MAX+1];
+        struct iovec iovec = {
+                .iov_base = buf,
+                .iov_len = sizeof(buf)-1,
+        };
+        /* Room for the sender credentials plus up to NOTIFY_FD_MAX passed file descriptors */
+        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
+                         CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
+        struct msghdr msghdr = {
+                .msg_iov = &iovec,
+                .msg_iovlen = 1,
+                .msg_control = &control,
+                .msg_controllen = sizeof(control),
+        };
+        struct ucred *ucred;
+        Manager *m = userdata;
+        Transfer *t;
+        ssize_t n;
+        char *p;
+        int r;
+
+        n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+        if (n < 0) {
+                if (ERRNO_IS_TRANSIENT(n))
+                        return 0;
+                return (int) n;
+        }
+
+        /* NOTE(review): presumably closes any file descriptors that came along with the
+         * datagram, none of which are used here — confirm against cmsg_close_all() */
+        cmsg_close_all(&msghdr);
+
+        if (msghdr.msg_flags & MSG_TRUNC) {
+                log_warning("Got overly long notification datagram, ignoring.");
+                return 0;
+        }
+
+        ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
+        if (!ucred || ucred->pid <= 0) {
+                log_warning("Got notification datagram lacking credential information, ignoring.");
+                return 0;
+        }
+
+        /* Look for the transfer whose worker sent this.
+         * NOTE(review): the check below relies on HASHMAP_FOREACH() leaving 't' NULL when the
+         * loop runs to completion without a match — confirm against the macro. */
+        HASHMAP_FOREACH(t, m->transfers)
+                if (ucred->pid == t->pid)
+                        break;
+
+        if (!t) {
+                log_warning("Got notification datagram from unexpected peer, ignoring.");
+                return 0;
+        }
+
+        /* n is at most sizeof(buf)-1, see iov_len above */
+        buf[n] = 0;
+
+        p = find_line_startswith(buf, "X_IMPORT_PROGRESS=");
+        if (!p)
+                return 0;
+
+        truncate_nl(p);
+
+        r = parse_percent(p);
+        if (r < 0) {
+                log_warning("Got invalid percent value '%s', ignoring.", p);
+                return 0;
+        }
+
+        t->progress_percent = (unsigned) r;
+
+        log_debug("Got percentage from client: %u%%", t->progress_percent);
+        return 0;
+}
+
+/* Allocates and sets up a Manager: acquires the default event loop (with watchdog support and
+ * handlers for SIGINT, SIGTERM and SIGRTMIN+18), connects to the system bus, and binds the
+ * datagram socket /run/systemd/import/notify on which worker notifications are received by
+ * manager_on_notify().
+ *
+ * On success stores the object in *ret and returns 0. On failure returns a negative errno-style
+ * error; everything set up so far is released again via manager_unref(). */
+static int manager_new(Manager **ret) {
+        _cleanup_(manager_unrefp) Manager *m = NULL;
+        static const union sockaddr_union sa = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/import/notify",
+        };
+        int r;
+
+        assert(ret);
+
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        *m = (Manager) {
+                /* Must not be left at 0: manager_unref() closes notify_fd on every early failure
+                 * below, and fd 0 is a valid descriptor (stdin) that is not ours to close. */
+                .notify_fd = -EBADF,
+                .use_btrfs_subvol = true,
+                .use_btrfs_quota = true,
+        };
+
+        r = sd_event_default(&m->event);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_set_watchdog(m->event, true);
+
+        r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
+        if (r < 0)
+                return r;
+
+        /* Memory pressure handling is optional, carry on without it */
+        r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to allocate memory pressure event source, ignoring: %m");
+
+        r = sd_bus_default_system(&m->bus);
+        if (r < 0)
+                return r;
+
+        m->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        if (m->notify_fd < 0)
+                return -errno;
+
+        /* Best effort: make sure the directory exists and no stale socket is in the way. If
+         * either fails, bind() below reports the actual problem. */
+        (void) mkdir_parents_label(sa.un.sun_path, 0755);
+        (void) sockaddr_un_unlink(&sa.un);
+
+        if (bind(m->notify_fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0)
+                return -errno;
+
+        /* manager_on_notify() identifies the sending worker by the credentials of the datagram */
+        r = setsockopt_int(m->notify_fd, SOL_SOCKET, SO_PASSCRED, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_io(m->event, &m->notify_event_source,
+                            m->notify_fd, EPOLLIN, manager_on_notify, m);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(m);
+
+        return 0;
+}
+
+/* Looks for a registered transfer of the given type whose remote string equals 'remote'
+ * (compared with streq_ptr()). Returns the first match, or NULL if there is none. */
+static Transfer *manager_find(Manager *m, TransferType type, const char *remote) {
+        Transfer *candidate;
+
+        assert(m);
+        assert(type >= 0);
+        assert(type < _TRANSFER_TYPE_MAX);
+
+        HASHMAP_FOREACH(candidate, m->transfers) {
+                if (candidate->type != type)
+                        continue;
+
+                if (!streq_ptr(candidate->remote, remote))
+                        continue;
+
+                return candidate;
+        }
+
+        return NULL;
+}
+
+/* Bus method handler shared by ImportTar() and ImportRaw(); which of the two was called is
+ * derived from the member name of the message.
+ *
+ * After polkit authorization, reads (fd, local name, force, read-only), requires the fd to refer
+ * to a regular file or FIFO and the local name to be a valid hostname, then creates and starts a
+ * transfer that reads from a duplicate of the fd. Replies with the transfer id and object path. */
+static int method_import_tar_or_raw(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        _cleanup_(transfer_unrefp) Transfer *transfer = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *local_name, *path;
+        int source_fd, force, read_only;
+        TransferType type;
+        struct stat st;
+        uint32_t id;
+        int r;
+
+        assert(msg);
+
+        r = bus_verify_polkit_async(
+                        msg,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.import1.import",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &m->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = sd_bus_message_read(msg, "hsbb", &source_fd, &local_name, &force, &read_only);
+        if (r < 0)
+                return r;
+
+        if (fstat(source_fd, &st) < 0)
+                return -errno;
+
+        /* Only regular files and FIFOs are accepted as input */
+        if (!S_ISREG(st.st_mode) && !S_ISFIFO(st.st_mode))
+                return -EINVAL;
+
+        if (!hostname_is_valid(local_name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Local name %s is invalid", local_name);
+
+        r = setup_machine_directory(error, m->use_btrfs_subvol, m->use_btrfs_quota);
+        if (r < 0)
+                return r;
+
+        if (streq_ptr(sd_bus_message_get_member(msg), "ImportTar"))
+                type = TRANSFER_IMPORT_TAR;
+        else
+                type = TRANSFER_IMPORT_RAW;
+
+        r = transfer_new(m, &transfer);
+        if (r < 0)
+                return r;
+
+        transfer->type = type;
+        transfer->force_local = force;
+        transfer->read_only = read_only;
+
+        transfer->local = strdup(local_name);
+        if (!transfer->local)
+                return -ENOMEM;
+
+        /* Keep our own copy of the fd, numbered 3 or higher */
+        transfer->stdin_fd = fcntl(source_fd, F_DUPFD_CLOEXEC, 3);
+        if (transfer->stdin_fd < 0)
+                return -errno;
+
+        r = transfer_start(transfer);
+        if (r < 0)
+                return r;
+
+        /* The transfer stays registered with the manager from here on; disarm the cleanup
+         * handler so it is not destroyed on return */
+        path = transfer->object_path;
+        id = transfer->id;
+        transfer = NULL;
+
+        return sd_bus_reply_method_return(msg, "uo", id, path);
+}
+
+/* Bus method handler for importing a directory tree.
+ *
+ * After polkit authorization, reads (fd, local name, force, read-only), verifies the fd with
+ * fd_verify_directory() and the local name with hostname_is_valid(), then creates and starts a
+ * TRANSFER_IMPORT_FS transfer that gets a duplicate of the fd as input. Replies with the transfer
+ * id and object path. */
+static int method_import_fs(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        _cleanup_(transfer_unrefp) Transfer *transfer = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *local_name, *path;
+        int dir_fd, force, read_only;
+        uint32_t id;
+        int r;
+
+        assert(msg);
+
+        r = bus_verify_polkit_async(
+                        msg,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.import1.import",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &m->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = sd_bus_message_read(msg, "hsbb", &dir_fd, &local_name, &force, &read_only);
+        if (r < 0)
+                return r;
+
+        r = fd_verify_directory(dir_fd);
+        if (r < 0)
+                return r;
+
+        if (!hostname_is_valid(local_name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Local name %s is invalid", local_name);
+
+        r = setup_machine_directory(error, m->use_btrfs_subvol, m->use_btrfs_quota);
+        if (r < 0)
+                return r;
+
+        r = transfer_new(m, &transfer);
+        if (r < 0)
+                return r;
+
+        transfer->type = TRANSFER_IMPORT_FS;
+        transfer->force_local = force;
+        transfer->read_only = read_only;
+
+        transfer->local = strdup(local_name);
+        if (!transfer->local)
+                return -ENOMEM;
+
+        /* Keep our own copy of the fd, numbered 3 or higher */
+        transfer->stdin_fd = fcntl(dir_fd, F_DUPFD_CLOEXEC, 3);
+        if (transfer->stdin_fd < 0)
+                return -errno;
+
+        r = transfer_start(transfer);
+        if (r < 0)
+                return r;
+
+        /* The transfer stays registered with the manager from here on; disarm the cleanup
+         * handler so it is not destroyed on return */
+        path = transfer->object_path;
+        id = transfer->id;
+        transfer = NULL;
+
+        return sd_bus_reply_method_return(msg, "uo", id, path);
+}
+
+/* Bus method handler for exports; TRANSFER_EXPORT_TAR is chosen if the member name of the message
+ * is "ExportTar", TRANSFER_EXPORT_RAW otherwise.
+ *
+ * After polkit authorization, reads (local name, fd, format), requires a valid local name and an
+ * fd referring to a regular file or FIFO, then creates and starts a transfer that writes to a
+ * duplicate of the fd. An empty format string means "no explicit format". Replies with the
+ * transfer id and object path. */
+static int method_export_tar_or_raw(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        _cleanup_(transfer_unrefp) Transfer *transfer = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *local_name, *format, *path;
+        TransferType type;
+        struct stat st;
+        int target_fd;
+        uint32_t id;
+        int r;
+
+        assert(msg);
+
+        r = bus_verify_polkit_async(
+                        msg,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.import1.export",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &m->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = sd_bus_message_read(msg, "shs", &local_name, &target_fd, &format);
+        if (r < 0)
+                return r;
+
+        if (!hostname_is_valid(local_name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Local name %s is invalid", local_name);
+
+        if (fstat(target_fd, &st) < 0)
+                return -errno;
+
+        /* Only regular files and FIFOs are accepted as output */
+        if (!S_ISREG(st.st_mode) && !S_ISFIFO(st.st_mode))
+                return -EINVAL;
+
+        if (streq_ptr(sd_bus_message_get_member(msg), "ExportTar"))
+                type = TRANSFER_EXPORT_TAR;
+        else
+                type = TRANSFER_EXPORT_RAW;
+
+        r = transfer_new(m, &transfer);
+        if (r < 0)
+                return r;
+
+        transfer->type = type;
+
+        if (!isempty(format)) {
+                transfer->format = strdup(format);
+                if (!transfer->format)
+                        return -ENOMEM;
+        }
+
+        transfer->local = strdup(local_name);
+        if (!transfer->local)
+                return -ENOMEM;
+
+        /* Keep our own copy of the fd, numbered 3 or higher */
+        transfer->stdout_fd = fcntl(target_fd, F_DUPFD_CLOEXEC, 3);
+        if (transfer->stdout_fd < 0)
+                return -errno;
+
+        r = transfer_start(transfer);
+        if (r < 0)
+                return r;
+
+        /* The transfer stays registered with the manager from here on; disarm the cleanup
+         * handler so it is not destroyed on return */
+        path = transfer->object_path;
+        id = transfer->id;
+        transfer = NULL;
+
+        return sd_bus_reply_method_return(msg, "uo", id, path);
+}
+
+/* Bus method handler for downloads; TRANSFER_PULL_TAR is chosen if the member name of the message
+ * is "PullTar", TRANSFER_PULL_RAW otherwise.
+ *
+ * After polkit authorization, reads (remote URL, local name, verification mode, force). The URL
+ * must be a valid http or file URL. The local name is optional (empty means none) but must be a
+ * valid hostname if given. An empty verification mode selects IMPORT_VERIFY_SIGNATURE. A second
+ * transfer of the same type for the same URL is refused. Replies with the transfer id and object
+ * path. */
+static int method_pull_tar_or_raw(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        _cleanup_(transfer_unrefp) Transfer *transfer = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        const char *remote, *local_name, *verify, *path;
+        ImportVerify mode;
+        TransferType type;
+        uint32_t id;
+        int force;
+        int r;
+
+        assert(msg);
+
+        r = bus_verify_polkit_async(
+                        msg,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.import1.pull",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &m->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = sd_bus_message_read(msg, "sssb", &remote, &local_name, &verify, &force);
+        if (r < 0)
+                return r;
+
+        if (!http_url_is_valid(remote) && !file_url_is_valid(remote))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "URL %s is invalid", remote);
+
+        if (isempty(local_name))
+                local_name = NULL;
+        else if (!hostname_is_valid(local_name, 0))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Local name %s is invalid", local_name);
+
+        mode = isempty(verify) ? IMPORT_VERIFY_SIGNATURE : import_verify_from_string(verify);
+        if (mode < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         "Unknown verification mode %s", verify);
+
+        r = setup_machine_directory(error, m->use_btrfs_subvol, m->use_btrfs_quota);
+        if (r < 0)
+                return r;
+
+        if (streq_ptr(sd_bus_message_get_member(msg), "PullTar"))
+                type = TRANSFER_PULL_TAR;
+        else
+                type = TRANSFER_PULL_RAW;
+
+        if (manager_find(m, type, remote))
+                return sd_bus_error_setf(error, BUS_ERROR_TRANSFER_IN_PROGRESS,
+                                         "Transfer for %s already in progress.", remote);
+
+        r = transfer_new(m, &transfer);
+        if (r < 0)
+                return r;
+
+        transfer->type = type;
+        transfer->verify = mode;
+        transfer->force_local = force;
+
+        transfer->remote = strdup(remote);
+        if (!transfer->remote)
+                return -ENOMEM;
+
+        if (local_name) {
+                transfer->local = strdup(local_name);
+                if (!transfer->local)
+                        return -ENOMEM;
+        }
+
+        r = transfer_start(transfer);
+        if (r < 0)
+                return r;
+
+        /* The transfer stays registered with the manager from here on; disarm the cleanup
+         * handler so it is not destroyed on return */
+        path = transfer->object_path;
+        id = transfer->id;
+        transfer = NULL;
+
+        return sd_bus_reply_method_return(msg, "uo", id, path);
+}
+
+/* Bus method handler that replies with an array of (id, type name, remote, local, progress,
+ * object path) structures, one per registered transfer. */
+static int method_list_transfers(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+        Transfer *t;
+        int r;
+
+        assert(msg);
+
+        r = sd_bus_message_new_method_return(msg, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(usssdo)");
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH(t, m->transfers) {
+                const char *type_name = transfer_type_to_string(t->type);
+                double progress = transfer_percent_as_double(t);
+
+                r = sd_bus_message_append(
+                                reply,
+                                "(usssdo)",
+                                t->id,
+                                type_name,
+                                t->remote,
+                                t->local,
+                                progress,
+                                t->object_path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(reply);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Handler for the Cancel() method of a transfer object: after polkit authorization, signals the
+ * transfer's worker via transfer_cancel() and sends an empty reply. */
+static int method_cancel(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        Transfer *transfer = ASSERT_PTR(userdata);
+        int r;
+
+        assert(msg);
+
+        r = bus_verify_polkit_async(
+                        msg,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.import1.pull",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &transfer->manager->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = transfer_cancel(transfer);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(msg, NULL);
+}
+
+/* Manager-level bus method handler that cancels a transfer by id: after polkit authorization,
+ * reads the id, rejects 0 as invalid, looks the transfer up and signals its worker via
+ * transfer_cancel(). Sends an empty reply on success. */
+static int method_cancel_transfer(sd_bus_message *msg, void *userdata, sd_bus_error *error) {
+        Manager *m = ASSERT_PTR(userdata);
+        Transfer *transfer;
+        uint32_t id;
+        int r;
+
+        assert(msg);
+
+        r = bus_verify_polkit_async(
+                        msg,
+                        CAP_SYS_ADMIN,
+                        "org.freedesktop.import1.pull",
+                        NULL,
+                        false,
+                        UID_INVALID,
+                        &m->polkit_registry,
+                        error);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return 1; /* Will call us back */
+
+        r = sd_bus_message_read(msg, "u", &id);
+        if (r < 0)
+                return r;
+
+        /* Zero is never a valid transfer id */
+        if (id == 0)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid transfer id");
+
+        transfer = hashmap_get(m->transfers, UINT32_TO_PTR(id));
+        if (!transfer)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_TRANSFER, "No transfer by id %" PRIu32, id);
+
+        r = transfer_cancel(transfer);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(msg, NULL);
+}
+
+/* Property getter for "Progress": appends the value of transfer_percent_as_double() as a double
+ * to the reply. */
+static int property_get_progress(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        Transfer *t = ASSERT_PTR(userdata);
+        double progress;
+
+        assert(bus);
+        assert(reply);
+
+        progress = transfer_percent_as_double(t);
+
+        return sd_bus_message_append(reply, "d", progress);
+}
+
+/* Property getters for the enum-typed fields of a Transfer; both properties carry the D-Bus type
+ * "s" in transfer_vtable. */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, transfer_type, TransferType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_verify, import_verify, ImportVerify);
+
+/* Object lookup callback for transfer objects: maps a bus path of the form
+ * "/org/freedesktop/import1/transfer/_<id>" to the registered Transfer with that id.
+ *
+ * Returns 1 and stores the object in *found on a match; returns 0 if the path has a different
+ * prefix, the id does not parse, is zero, or no such transfer exists. */
+static int transfer_object_find(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                void *userdata,
+                void **found,
+                sd_bus_error *error) {
+
+        Manager *m = ASSERT_PTR(userdata);
+        const char *suffix;
+        Transfer *transfer;
+        uint32_t id;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        suffix = startswith(path, "/org/freedesktop/import1/transfer/_");
+        if (!suffix)
+                return 0;
+
+        if (safe_atou32(suffix, &id) < 0)
+                return 0;
+
+        if (id == 0)
+                return 0;
+
+        transfer = hashmap_get(m->transfers, UINT32_TO_PTR(id));
+        if (!transfer)
+                return 0;
+
+        *found = transfer;
+        return 1;
+}
+
+/* Node enumerator callback: returns in *nodes a newly allocated, NULL-terminated array holding a
+ * copy of the object path of every registered transfer. Returns 1 on success, -ENOMEM if an
+ * allocation fails. */
+static int transfer_node_enumerator(
+                sd_bus *bus,
+                const char *path,
+                void *userdata,
+                char ***nodes,
+                sd_bus_error *error) {
+
+        _cleanup_strv_free_ char **paths = NULL;
+        Manager *m = userdata;
+        Transfer *t;
+        size_t n = 0;
+
+        /* One extra, zeroed slot serves as the terminating NULL */
+        paths = new0(char*, hashmap_size(m->transfers) + 1);
+        if (!paths)
+                return -ENOMEM;
+
+        HASHMAP_FOREACH(t, m->transfers) {
+                char *copy;
+
+                copy = strdup(t->object_path);
+                if (!copy)
+                        return -ENOMEM;
+
+                paths[n++] = copy;
+        }
+
+        *nodes = TAKE_PTR(paths);
+
+        return 1;
+}
+
+/* Bus interface of a single transfer object: read-only properties describing the transfer, the
+ * Cancel() method and the LogMessage signal emitted by transfer_send_log_line(). */
+static const sd_bus_vtable transfer_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+
+        /* Everything but "Progress" is flagged constant */
+        SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Transfer, id), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Local", "s", NULL, offsetof(Transfer, local), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Remote", "s", NULL, offsetof(Transfer, remote), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Transfer, type), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Verify", "s", property_get_verify, offsetof(Transfer, verify), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0),
+
+        /* Marked unprivileged; method_cancel() does its own polkit check */
+        SD_BUS_METHOD("Cancel", NULL, NULL, method_cancel, SD_BUS_VTABLE_UNPRIVILEGED),
+
+        SD_BUS_SIGNAL_WITH_NAMES("LogMessage",
+                                 "us",
+                                 SD_BUS_PARAM(priority)
+                                 SD_BUS_PARAM(line),
+                                 0),
+
+        SD_BUS_VTABLE_END,
+};
+
+static const BusObjectImplementation transfer_object = { /* bus objects of the individual transfers */
+        "/org/freedesktop/import1/transfer",
+        "org.freedesktop.import1.Transfer",
+        .fallback_vtables = BUS_FALLBACK_VTABLES({transfer_vtable, transfer_object_find}), /* vtable plus lookup callback */
+        .node_enumerator = transfer_node_enumerator, /* fills in the object paths of all current transfers */
+};
+
+static const sd_bus_vtable manager_vtable[] = {
+        SD_BUS_VTABLE_START(0),
+        /* Methods of the manager object. All but the last two return a transfer id and object path ("uo"). */
+        SD_BUS_METHOD_WITH_NAMES("ImportTar",
+                                 "hsbb",
+                                 SD_BUS_PARAM(fd)
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(force)
+                                 SD_BUS_PARAM(read_only),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_import_tar_or_raw, /* handler shared with ImportRaw */
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("ImportRaw",
+                                 "hsbb",
+                                 SD_BUS_PARAM(fd)
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(force)
+                                 SD_BUS_PARAM(read_only),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_import_tar_or_raw,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("ImportFileSystem",
+                                 "hsbb",
+                                 SD_BUS_PARAM(fd)
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(force)
+                                 SD_BUS_PARAM(read_only),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_import_fs,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("ExportTar",
+                                 "shs",
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(fd)
+                                 SD_BUS_PARAM(format),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_export_tar_or_raw, /* handler shared with ExportRaw */
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("ExportRaw",
+                                 "shs",
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(fd)
+                                 SD_BUS_PARAM(format),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_export_tar_or_raw,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("PullTar",
+                                 "sssb",
+                                 SD_BUS_PARAM(url)
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(verify_mode)
+                                 SD_BUS_PARAM(force),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_pull_tar_or_raw, /* handler shared with PullRaw */
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("PullRaw",
+                                 "sssb",
+                                 SD_BUS_PARAM(url)
+                                 SD_BUS_PARAM(local_name)
+                                 SD_BUS_PARAM(verify_mode)
+                                 SD_BUS_PARAM(force),
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 method_pull_tar_or_raw,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("ListTransfers",
+                                 NULL,, /* no input arguments: NULL signature, and the names argument is left empty */
+                                 "a(usssdo)",
+                                 SD_BUS_PARAM(transfers),
+                                 method_list_transfers,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_NAMES("CancelTransfer",
+                                 "u",
+                                 SD_BUS_PARAM(transfer_id),
+                                 NULL,, /* no return values: NULL signature, and the names argument is left empty */
+                                 method_cancel_transfer,
+                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        /* Signals */
+        SD_BUS_SIGNAL_WITH_NAMES("TransferNew",
+                                 "uo",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path),
+                                 0),
+        SD_BUS_SIGNAL_WITH_NAMES("TransferRemoved",
+                                 "uos",
+                                 SD_BUS_PARAM(transfer_id)
+                                 SD_BUS_PARAM(transfer_path)
+                                 SD_BUS_PARAM(result),
+                                 0),
+
+        SD_BUS_VTABLE_END,
+};
+
+static const BusObjectImplementation manager_object = { /* handed to bus_add_implementation() and service_parse_argv() */
+        "/org/freedesktop/import1",
+        "org.freedesktop.import1.Manager",
+        .vtables = BUS_VTABLES(manager_vtable),
+        .children = BUS_IMPLEMENTATIONS(&transfer_object), /* child implementation: the per-transfer objects */
+};
+
+static int manager_add_bus_objects(Manager *m) {
+        int ret;
+
+        assert(m);
+
+        /* Export manager_object first, then the log control API; failures here are returned unlogged. */
+        ret = bus_add_implementation(m->bus, &manager_object, m);
+        if (ret >= 0)
+                ret = bus_log_control_api_register(m->bus);
+        if (ret < 0)
+                return ret;
+
+        /* Request the well-known bus name asynchronously, then hook the bus into the event loop. */
+        ret = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.import1", 0, NULL, NULL);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to request name: %m");
+
+        ret = sd_bus_attach_event(m->bus, m->event, 0);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to attach bus to event loop: %m");
+
+        /* Everything is registered. */
+        return 0;
+}
+
+static bool manager_check_idle(void *userdata) {
+        /* Idle callback given to bus_event_loop_with_idle(): true once no transfers remain. */
+        Manager *manager = userdata;
+        return hashmap_isempty(manager->transfers);
+}
+
+static int manager_run(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Runs the event loop; manager_check_idle() and DEFAULT_EXIT_USEC are passed for idle handling. */
+        r = bus_event_loop_with_idle(m->event, m->bus,
+                                     "org.freedesktop.import1", DEFAULT_EXIT_USEC,
+                                     manager_check_idle, m);
+        return r;
+}
+
+static void manager_parse_env(Manager *m) {
+        int value;
+
+        assert(m);
+
+        /* Mirrors src/import/{import,pull}.c: these low-level knobs may be overridden through environment
+         * variables, e.g. set for systemd-importd.service. -ENXIO is skipped silently (presumably it means
+         * the variable is unset); any other failure is logged as a warning and otherwise ignored. */
+
+        value = getenv_bool("SYSTEMD_IMPORT_BTRFS_SUBVOL");
+        if (value < 0 && value != -ENXIO)
+                log_warning_errno(value, "Failed to parse $SYSTEMD_IMPORT_BTRFS_SUBVOL: %m");
+        else if (value >= 0)
+                m->use_btrfs_subvol = value;
+
+        value = getenv_bool("SYSTEMD_IMPORT_BTRFS_QUOTA");
+        if (value < 0 && value != -ENXIO)
+                log_warning_errno(value, "Failed to parse $SYSTEMD_IMPORT_BTRFS_QUOTA: %m");
+        else if (value >= 0)
+                m->use_btrfs_quota = value;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(manager_unrefp) Manager *manager = NULL;
+        int ret;
+
+        log_setup();
+
+        /* A result <= 0 from argument parsing ends the program right here with that value. */
+        ret = service_parse_argv(
+                        "systemd-importd.service",
+                        "VM and container image import and export service.",
+                        BUS_IMPLEMENTATIONS(&manager_object, &log_control_object),
+                        argc, argv);
+        if (ret <= 0)
+                return ret;
+
+        umask(0022);
+        /* Block these signals before the manager is allocated and the event loop runs. */
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
+
+        ret = manager_new(&manager);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to allocate manager object: %m");
+
+        manager_parse_env(manager);
+
+        ret = manager_add_bus_objects(manager);
+        if (ret < 0)
+                return ret;
+
+        ret = manager_run(manager);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to run event loop: %m");
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/import/meson.build b/src/import/meson.build
new file mode 100644
index 0000000..3f0acf8
--- /dev/null
+++ b/src/import/meson.build
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+systemd_importd_sources = files(
+        'importd.c',
+)
+
+systemd_pull_sources = files(
+        'pull.c',
+        'pull-raw.c',
+        'pull-tar.c',
+        'pull-job.c',
+        'pull-common.c',
+        'curl-util.c',
+)
+
+systemd_import_sources = files(
+        'import.c',
+        'import-raw.c',
+        'import-tar.c',
+)
+
+systemd_import_fs_sources = files(
+        'import-fs.c',
+)
+
+systemd_export_sources = files(
+        'export.c',
+        'export-tar.c',
+        'export-raw.c',
+)
+
+importd_common_sources = files(
+        'import-common.c',
+        'import-compress.c',
+        'qcow2-util.c',
+)
+# Code shared by the pull/import/import-fs/export tools; they link it in through common_libs below.
+lib_import_common = static_library(
+        'import-common',
+        sources : importd_common_sources,
+        include_directories : includes,
+        dependencies : [
+                libbzip2,
+                libxz,
+                libz,
+                userspace,
+        ],
+        build_by_default : false)
+
+common_libs = [
+        lib_import_common,
+        libshared,
+]
+
+common_deps = [
+        libbzip2,
+        libcurl,
+        libxz,
+        libz,
+]
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-importd',
+                'dbus' : true,
+                'conditions' : ['ENABLE_IMPORTD'],
+                'sources' : systemd_importd_sources, # the daemon is built from importd.c only, without common_libs
+                'dependencies' : threads,
+        },
+        libexec_template + {
+                'name' : 'systemd-pull',
+                'public' : true,
+                'conditions' : ['ENABLE_IMPORTD'],
+                'sources' : systemd_pull_sources,
+                'link_with' : common_libs,
+                'dependencies' : common_deps + [
+                        lib_openssl_or_gcrypt,
+                ],
+        },
+        libexec_template + {
+                'name' : 'systemd-import',
+                'public' : true,
+                'conditions' : ['ENABLE_IMPORTD'],
+                'sources' : systemd_import_sources,
+                'link_with' : common_libs,
+                'dependencies' : common_deps,
+        },
+        libexec_template + {
+                'name' : 'systemd-import-fs',
+                'public' : true,
+                'conditions' : ['ENABLE_IMPORTD'],
+                'sources' : systemd_import_fs_sources,
+                'link_with' : common_libs,
+        },
+        libexec_template + {
+                'name' : 'systemd-export',
+                'public' : true,
+                'conditions' : ['ENABLE_IMPORTD'],
+                'sources' : systemd_export_sources,
+                'link_with' : common_libs,
+                'dependencies' : common_deps,
+        },
+        test_template + {
+                'sources' : files(
+                        'test-qcow2.c',
+                        'qcow2-util.c',
+                ),
+                'dependencies' : libz,
+                'conditions' : ['HAVE_ZLIB'],
+                'type' : 'manual', # NOTE(review): presumably keeps this test out of automatic runs — confirm
+        },
+]
+# The bus policy, bus activation file, polkit policy and keyring are installed only if importd is enabled.
+if conf.get('ENABLE_IMPORTD') == 1
+        install_data('org.freedesktop.import1.conf',
+                     install_dir : dbuspolicydir)
+        install_data('org.freedesktop.import1.service',
+                     install_dir : dbussystemservicedir)
+        install_data('org.freedesktop.import1.policy',
+                     install_dir : polkitpolicydir)
+
+        install_data('import-pubring.gpg',
+                     install_dir : libexecdir)
+        # TODO: shouldn't this be in pkgdatadir?
+endif
diff --git a/src/import/org.freedesktop.import1.conf b/src/import/org.freedesktop.import1.conf
new file mode 100644
index 0000000..d252ff6
--- /dev/null
+++ b/src/import/org.freedesktop.import1.conf
@@ -0,0 +1,84 @@
+ 
+
+
+
+
+
+
+        
+                
+                
+                
+        
+
+        
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+
+                
+        
+
+
diff --git a/src/import/org.freedesktop.import1.policy b/src/import/org.freedesktop.import1.policy
new file mode 100644
index 0000000..88e436d
--- /dev/null
+++ b/src/import/org.freedesktop.import1.policy
@@ -0,0 +1,51 @@
+ 
+
+
+
+
+
+
+        The systemd Project
+        https://systemd.io
+
+        
+                Import a VM or container image
+                Authentication is required to import a VM or container image
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+        
+                Export a VM or container image
+                Authentication is required to export a VM or container image
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+        
+                Download a VM or container image
+                Authentication is required to download a VM or container image
+                
+                        auth_admin
+                        auth_admin
+                        auth_admin_keep
+                
+        
+
+
diff --git a/src/import/org.freedesktop.import1.service b/src/import/org.freedesktop.import1.service
new file mode 100644
index 0000000..4fe921f
--- /dev/null
+++ b/src/import/org.freedesktop.import1.service
@@ -0,0 +1,14 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.import1
+Exec=/bin/false
+User=root
+SystemdService=dbus-org.freedesktop.import1.service
diff --git a/src/import/pull-common.c b/src/import/pull-common.c
new file mode 100644
index 0000000..5e1ea20
--- /dev/null
+++ b/src/import/pull-common.c
@@ -0,0 +1,669 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/prctl.h>
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "capability-util.h"
+#include "copy.h"
+#include "dirent-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "hostname-util.h"
+#include "io-util.h"
+#include "memory-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "pull-common.h"
+#include "pull-job.h"
+#include "rm-rf.h"
+#include "signal-util.h"
+#include "siphash24.h"
+#include "string-util.h"
+#include "strv.h"
+#include "web-util.h"
+
+#define FILENAME_ESCAPE "/.#\"\'" /* characters passed to xescape() when URLs and ETags become part of a file name */
+#define HASH_URL_THRESHOLD_LENGTH (_POSIX_PATH_MAX - 16) /* paths this long or longer use a URL hash; 16 bytes stay free for tempfn_random() */
+
+int pull_find_old_etags(
+                const char *url,
+                const char *image_root,
+                int dt,
+                const char *prefix,
+                const char *suffix,
+                char ***etags) {
+        /* Collects the ETags embedded in 'image_root' entry names of the form <prefix><url>.<etag><suffix>. */
+        int r;
+        /* Returns 0 and stores a string list (possibly NULL) in *etags, or returns a negative error. */
+        assert(url);
+        assert(etags);
+
+        if (!image_root)
+                image_root = "/var/lib/machines";
+
+        _cleanup_free_ char *escaped_url = xescape(url, FILENAME_ESCAPE);
+        if (!escaped_url)
+                return -ENOMEM;
+
+        _cleanup_closedir_ DIR *d = opendir(image_root);
+        if (!d) {
+                if (errno == ENOENT) { /* a missing directory is not an error: report an empty result */
+                        *etags = NULL;
+                        return 0;
+                }
+
+                return -errno;
+        }
+
+        _cleanup_strv_free_ char **ans = NULL;
+
+        FOREACH_DIRENT_ALL(de, d, return -errno) {
+                _cleanup_free_ char *u = NULL;
+                const char *a, *b;
+                /* 'a' advances past prefix, escaped URL and "."; 'b' marks the end of the ETag part. */
+                if (de->d_type != DT_UNKNOWN &&
+                    de->d_type != dt)
+                        continue; /* entry type is known and is not the requested one */
+
+                if (prefix) {
+                        a = startswith(de->d_name, prefix);
+                        if (!a)
+                                continue;
+                } else
+                        a = de->d_name;
+
+                a = startswith(a, escaped_url);
+                if (!a)
+                        continue;
+
+                a = startswith(a, ".");
+                if (!a)
+                        continue;
+
+                if (suffix) {
+                        b = endswith(de->d_name, suffix);
+                        if (!b)
+                                continue;
+                } else
+                        b = strchr(de->d_name, 0);
+
+                if (a >= b) /* nothing left between the "." and the suffix */
+                        continue;
+                /* Undo the escaping of the ETag part between 'a' and 'b'. */
+                ssize_t l = cunescape_length(a, b - a, 0, &u);
+                if (l < 0) {
+                        assert(l >= INT8_MIN);
+                        return l;
+                }
+
+                if (!http_etag_is_valid(u))
+                        continue;
+
+                r = strv_consume(&ans, TAKE_PTR(u));
+                if (r < 0)
+                        return r;
+        }
+
+        *etags = TAKE_PTR(ans);
+
+        return 0;
+}
+
+static int hash_url(const char *url, char **ret) {
+        /* Fixed key, so that the same URL always yields the same hash string. */
+        static const sd_id128_t hash_key = SD_ID128_ARRAY(df,89,16,87,01,cc,42,30,98,ab,4a,19,a6,a5,63,4f);
+        uint64_t digest;
+
+        assert(url);
+        /* Format the 64-bit siphash24() value of the URL as hex into a newly allocated string. */
+        digest = siphash24(url, strlen(url), hash_key.bytes);
+        if (asprintf(ret, "%"PRIx64, digest) < 0)
+                return -ENOMEM;
+        return 0;
+}
+
+int pull_make_path(const char *url, const char *etag, const char *image_root, const char *prefix, const char *suffix, char **ret) {
+        _cleanup_free_ char *url_part = NULL, *etag_part = NULL, *url_hash = NULL;
+        const char *root, *dot;
+        char *result;
+        int r;
+
+        assert(url);
+        assert(ret);
+
+        /* Composes "<image_root>/<prefix><escaped url>[.<escaped etag>]<suffix>" as a newly allocated
+         * string. On success it is stored in *ret and 0 is returned; on failure *ret is left alone. */
+        root = image_root ? image_root : "/var/lib/machines";
+
+        url_part = xescape(url, FILENAME_ESCAPE);
+        if (!url_part)
+                return -ENOMEM;
+
+        if (etag) {
+                etag_part = xescape(etag, FILENAME_ESCAPE);
+                if (!etag_part)
+                        return -ENOMEM;
+        }
+
+        /* The dot separator is only emitted when there is an ETag to append. */
+        dot = etag_part ? "." : "";
+
+        result = strjoin(root, "/", strempty(prefix), url_part, dot, strempty(etag_part), strempty(suffix));
+        if (!result)
+                return -ENOMEM;
+
+        /* An escaped URL may exceed what is allowed for a file name. In that case the URL is replaced
+         * by its hash. The threshold leaves 16 bytes of headroom, as callers may later run the path
+         * through tempfn_random(), which adds that many bytes to the name. */
+        if (strlen(result) >= HASH_URL_THRESHOLD_LENGTH) {
+                free(result);
+
+                r = hash_url(url, &url_hash);
+                if (r < 0)
+                        return r;
+
+                result = strjoin(root, "/", strempty(prefix), url_hash, dot, strempty(etag_part), strempty(suffix));
+                if (!result)
+                        return -ENOMEM;
+        }
+        *ret = result;
+        return 0;
+}
+
+int pull_make_auxiliary_job(
+                PullJob **ret,
+                const char *url,
+                int (*strip_suffixes)(const char *name, char **ret),
+                const char *suffix,
+                ImportVerify verify,
+                CurlGlue *glue,
+                PullJobOpenDisk on_open_disk,
+                PullJobFinished on_finished,
+                void *userdata) {
+
+        _cleanup_free_ char *filename = NULL, *stem = NULL, *aux_url = NULL;
+        _cleanup_(pull_job_unrefp) PullJob *aux_job = NULL;
+        const char *aux_name;
+        int r;
+
+        assert(ret);
+        assert(url);
+        assert(strip_suffixes);
+        assert(glue);
+
+        /* Build the auxiliary URL: strip the suffixes off the last component of 'url', then append 'suffix'. */
+        r = import_url_last_component(url, &filename);
+        if (r < 0)
+                return r;
+
+        r = strip_suffixes(filename, &stem);
+        if (r < 0)
+                return r;
+
+        aux_name = strjoina(stem, suffix);
+        r = import_url_change_last_component(url, aux_name, &aux_url);
+        if (r < 0)
+                return r;
+
+        r = pull_job_new(&aux_job, aux_url, glue, userdata);
+        if (r < 0)
+                return r;
+
+        aux_job->on_open_disk = on_open_disk;
+        aux_job->on_finished = on_finished;
+        aux_job->uncompressed_max = 1ULL * 1024ULL * 1024ULL; /* both size limits are set to 1 MiB */
+        aux_job->compressed_max = aux_job->uncompressed_max;
+        aux_job->calc_checksum = IN_SET(verify, IMPORT_VERIFY_CHECKSUM, IMPORT_VERIFY_SIGNATURE);
+        *ret = TAKE_PTR(aux_job);
+        return 0;
+}
+
+static bool is_checksum_file(const char *fn) {
+        /* Checksum files we grok: the name "SHA256SUMS" itself, or anything ending in ".sha256".
+         * A NULL name never qualifies. */
+        if (fn && streq(fn, "SHA256SUMS"))
+                return true;
+
+        return fn && endswith(fn, ".sha256");
+}
+
+static bool is_signature_file(const char *fn) {
+        /* Signature files we grok: "SHA256SUMS.gpg", or anything ending in ".sha256" (reminder:
+         * suse-style .sha256 files are inline signed). A NULL name never qualifies. */
+        if (!fn)
+                return false;
+        if (endswith(fn, ".sha256"))
+                return true;
+        return streq(fn, "SHA256SUMS.gpg");
+}
+
+int pull_make_verification_jobs(
+                PullJob **ret_checksum_job,
+                PullJob **ret_signature_job,
+                ImportVerify verify,
+                const char *checksum, /* set if literal checksum verification is requested, in which case 'verify' is set to _IMPORT_VERIFY_INVALID */
+                const char *url,
+                CurlGlue *glue,
+                PullJobFinished on_finished,
+                void *userdata) {
+        /* Creates the jobs that fetch the checksum and signature files for 'url'; either result may be NULL. */
+        _cleanup_(pull_job_unrefp) PullJob *checksum_job = NULL, *signature_job = NULL;
+        _cleanup_free_ char *fn = NULL;
+        int r;
+
+        assert(ret_checksum_job);
+        assert(ret_signature_job);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify < _IMPORT_VERIFY_MAX);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify >= 0);
+        assert((verify < 0) || !checksum);
+        assert(url);
+        assert(glue);
+
+        /* If verification is turned off, or if the checksum to validate is already specified we don't need
+         * to download a checksum file or signature, hence shortcut things */
+        if (verify == IMPORT_VERIFY_NO || checksum) {
+                *ret_checksum_job = *ret_signature_job = NULL;
+                return 0;
+        }
+
+        r = import_url_last_component(url, &fn);
+        if (r < 0 && r != -EADDRNOTAVAIL) /* EADDRNOTAVAIL means there was no last component, which is OK for
+                                           * us, we'll just assume it's not a checksum/signature file */
+                return r;
+
+        /* Acquire the checksum file if verification or signature verification is requested and the main file
+         * to acquire isn't a checksum or signature file anyway */
+        if (verify != IMPORT_VERIFY_NO && !is_checksum_file(fn) && !is_signature_file(fn)) {
+                _cleanup_free_ char *checksum_url = NULL;
+                const char *suffixed = NULL;
+
+                /* Queue jobs for the checksum file for the image. */
+
+                if (fn)
+                        suffixed = strjoina(fn, ".sha256"); /* Start with the suse-style checksum (if there's a base filename) */
+                else
+                        suffixed = "SHA256SUMS";
+
+                r = import_url_change_last_component(url, suffixed, &checksum_url);
+                if (r < 0)
+                        return r;
+
+                r = pull_job_new(&checksum_job, checksum_url, glue, userdata);
+                if (r < 0)
+                        return r;
+                /* Cap both the compressed and the uncompressed size of the checksum file at 1 MiB. */
+                checksum_job->on_finished = on_finished;
+                checksum_job->uncompressed_max = checksum_job->compressed_max = 1ULL * 1024ULL * 1024ULL;
+                checksum_job->on_not_found = pull_job_restart_with_sha256sum; /* if this fails, look for ubuntu-style checksum */
+        }
+
+        if (verify == IMPORT_VERIFY_SIGNATURE && !is_signature_file(fn)) {
+                _cleanup_free_ char *signature_url = NULL;
+
+                /* Queue job for the SHA256SUMS.gpg file for the image. */
+                r = import_url_change_last_component(url, "SHA256SUMS.gpg", &signature_url);
+                if (r < 0)
+                        return r;
+
+                r = pull_job_new(&signature_job, signature_url, glue, userdata);
+                if (r < 0)
+                        return r;
+                /* Same 1 MiB cap as for the checksum file. */
+                signature_job->on_finished = on_finished;
+                signature_job->uncompressed_max = signature_job->compressed_max = 1ULL * 1024ULL * 1024ULL;
+        }
+
+        *ret_checksum_job = TAKE_PTR(checksum_job);
+        *ret_signature_job = TAKE_PTR(signature_job);
+        return 0;
+}
+
+static int verify_one(PullJob *checksum_job, PullJob *job) {
+        _cleanup_free_ char *fn = NULL;
+        const char *line, *p;
+        int r;
+        /* Checks that job's checksum is listed for its file name in the payload of checksum_job. */
+        assert(checksum_job);
+        /* Returns 1 if verified, 0 if there was nothing to verify; failures are returned via log_error_errno(). */
+        if (!job)
+                return 0;
+
+        assert(IN_SET(job->state, PULL_JOB_DONE, PULL_JOB_FAILED));
+
+        /* Don't verify the checksum if we didn't actually successfully download something new */
+        if (job->state != PULL_JOB_DONE)
+                return 0;
+        if (job->error != 0)
+                return 0;
+        if (job->etag_exists)
+                return 0;
+
+        assert(job->calc_checksum);
+        assert(job->checksum);
+
+        r = import_url_last_component(job->url, &fn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from URL '%s': %m", job->url);
+
+        if (!filename_is_valid(fn))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                       "Cannot verify checksum, could not determine server-side file name.");
+
+        if (is_checksum_file(fn) || is_signature_file(fn)) /* We cannot verify checksum files or signature files with a checksum file */
+                return log_error_errno(SYNTHETIC_ERRNO(ELOOP),
+                                       "Cannot verify checksum/signature files via themselves.");
+        /* Look for "<checksum> *<name>\n" first, and for "<checksum>  <name>\n" if that is not found. */
+        line = strjoina(job->checksum, " *", fn, "\n"); /* string for binary mode */
+        p = memmem_safe(checksum_job->payload,
+                        checksum_job->payload_size,
+                        line,
+                        strlen(line));
+        if (!p) {
+                line = strjoina(job->checksum, "  ", fn, "\n"); /* string for text mode */
+                p = memmem_safe(checksum_job->payload,
+                                checksum_job->payload_size,
+                                line,
+                                strlen(line));
+        }
+
+        /* Only counts if found at beginning of a line */
+        if (!p || (p != (char*) checksum_job->payload && p[-1] != '\n'))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                       "DOWNLOAD INVALID: Checksum of %s file did not check out, file has been tampered with.", fn);
+
+        log_info("SHA256 checksum of %s is valid.", job->url);
+        return 1;
+}
+
+static int verify_gpg(
+                const void *payload, size_t payload_size,
+                const void *signature, size_t signature_size) {
+        /* Pipes 'payload' into "gpg --verify"; a non-empty signature is handed over in a temporary file. */
+        _cleanup_close_pair_ int gpg_pipe[2] = EBADF_PAIR;
+        char sig_file_path[] = "/tmp/sigXXXXXX", gpg_home[] = "/tmp/gpghomeXXXXXX";
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        bool gpg_home_created = false;
+        int r;
+        /* Returns 0 if gpg exited successfully, -EBADMSG if it did not, and other negative values on errors. */
+        assert(payload || payload_size == 0);
+        assert(signature || signature_size == 0);
+
+        r = pipe2(gpg_pipe, O_CLOEXEC);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to create pipe for gpg: %m");
+
+        if (signature_size > 0) {
+                _cleanup_close_ int sig_file = -EBADF;
+
+                sig_file = mkostemp(sig_file_path, O_RDWR);
+                if (sig_file < 0) /* nothing was created yet, so a plain return is fine here */
+                        return log_error_errno(errno, "Failed to create temporary file: %m");
+
+                r = loop_write(sig_file, signature, signature_size);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to write to temporary file: %m");
+                        goto finish;
+                }
+        }
+
+        if (!mkdtemp(gpg_home)) {
+                r = log_error_errno(errno, "Failed to create temporary home for gpg: %m");
+                goto finish;
+        }
+
+        gpg_home_created = true;
+
+        r = safe_fork_full("(gpg)",
+                           (int[]) { gpg_pipe[0], -EBADF, STDERR_FILENO },
+                           NULL, 0,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE,
+                           &pid);
+        if (r < 0)
+                goto finish; /* don't leave the temporary signature file and gpg home behind */
+        if (r == 0) {
+                const char *cmd[] = {
+                        "gpg",
+                        "--no-options",
+                        "--no-default-keyring",
+                        "--no-auto-key-locate",
+                        "--no-auto-check-trustdb",
+                        "--batch",
+                        "--trust-model=always",
+                        NULL, /* --homedir=  */
+                        NULL, /* --keyring= */
+                        NULL, /* --verify */
+                        NULL, /* signature file */
+                        NULL, /* dash */
+                        NULL  /* trailing NULL */
+                };
+                size_t k = ELEMENTSOF(cmd) - 6; /* index of the first of the six NULL slots above */
+
+                /* Child */
+
+                cmd[k++] = strjoina("--homedir=", gpg_home);
+
+                /* We add the user keyring only to the command line arguments, if it's around since gpg fails
+                 * otherwise. */
+                if (access(USER_KEYRING_PATH, F_OK) >= 0)
+                        cmd[k++] = "--keyring=" USER_KEYRING_PATH;
+                else
+                        cmd[k++] = "--keyring=" VENDOR_KEYRING_PATH;
+
+                cmd[k++] = "--verify";
+                if (signature) { /* NOTE(review): the file is only created if signature_size > 0 — confirm callers */
+                        cmd[k++] = sig_file_path;
+                        cmd[k++] = "-";
+                        cmd[k++] = NULL;
+                }
+
+                execvp("gpg2", (char * const *) cmd);
+                execvp("gpg", (char * const *) cmd); /* only reached if executing gpg2 failed */
+                log_error_errno(errno, "Failed to execute gpg: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        gpg_pipe[0] = safe_close(gpg_pipe[0]);
+
+        r = loop_write(gpg_pipe[1], payload, payload_size);
+        if (r < 0) {
+                log_error_errno(r, "Failed to write to pipe: %m");
+                goto finish;
+        }
+
+        gpg_pipe[1] = safe_close(gpg_pipe[1]); /* close the write end before waiting for gpg to exit */
+
+        r = wait_for_terminate_and_check("gpg", TAKE_PID(pid), WAIT_LOG_ABNORMAL);
+        if (r < 0)
+                goto finish;
+        if (r != EXIT_SUCCESS)
+                r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                    "DOWNLOAD INVALID: Signature verification failed.");
+        else {
+                log_info("Signature verification succeeded.");
+                r = 0;
+        }
+
+finish:
+        if (signature_size > 0)
+                (void) unlink(sig_file_path);
+
+        if (gpg_home_created)
+                (void) rm_rf(gpg_home, REMOVE_ROOT|REMOVE_PHYSICAL);
+
+        return r;
+}
+
+int pull_verify(ImportVerify verify,
+                const char *checksum, /* Verify with literal checksum */
+                PullJob *main_job,
+                PullJob *checksum_job,
+                PullJob *signature_job,
+                PullJob *settings_job,
+                PullJob *roothash_job,
+                PullJob *roothash_signature_job,
+                PullJob *verity_job) {
+        /* Validates finished downloads. Returns 0 if all is fine or nothing is to be checked, else negative errno. */
+        _cleanup_free_ char *fn = NULL;
+        VerificationStyle style;
+        PullJob *verify_job;
+        int r;
+
+        assert(verify == _IMPORT_VERIFY_INVALID || verify < _IMPORT_VERIFY_MAX);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify >= 0);
+        assert((verify < 0) || !checksum); /* a literal checksum is only permitted with a negative 'verify' */
+        assert(main_job);
+        assert(main_job->state == PULL_JOB_DONE);
+
+        if (verify == IMPORT_VERIFY_NO) /* verification turned off */
+                return 0;
+
+        if (checksum) {
+                /* Verification by literal checksum */
+                assert(!checksum_job);
+                assert(!signature_job);
+                assert(!settings_job);
+                assert(!roothash_job);
+                assert(!roothash_signature_job);
+                assert(!verity_job);
+
+                assert(main_job->calc_checksum);
+                assert(main_job->checksum);
+                /* main_job->checksum is the hex digest calculated during the download */
+                if (!strcaseeq(checksum, main_job->checksum))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                               "DOWNLOAD INVALID: Checksum of %s file did not check out, file has been tampered with.",
+                                               main_job->url);
+
+                return 0;
+        }
+        /* No literal checksum: verify via downloaded checksum/signature files, going by the file name in the URL */
+        r = import_url_last_component(main_job->url, &fn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from URL '%s': %m", main_job->url);
+
+        if (is_signature_file(fn))
+                return log_error_errno(SYNTHETIC_ERRNO(ELOOP),
+                                       "Main download is a signature file, can't verify it.");
+
+        if (is_checksum_file(fn)) {
+                log_debug("Main download is a checksum file, can't validate its checksum with itself, skipping.");
+                verify_job = main_job;
+        } else {
+                PullJob *j;
+                assert(main_job->calc_checksum);
+                assert(main_job->checksum);
+                assert(checksum_job);
+                assert(checksum_job->state == PULL_JOB_DONE);
+
+                if (!checksum_job->payload || checksum_job->payload_size <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                               "Checksum is empty, cannot verify.");
+                /* NOTE(review): the optional jobs are presumably NULL when unused; verify_one() must cope — confirm */
+                FOREACH_POINTER(j, main_job, settings_job, roothash_job, roothash_signature_job, verity_job) {
+                        r = verify_one(checksum_job, j);
+                        if (r < 0)
+                                return r;
+                }
+
+                verify_job = checksum_job;
+        }
+        /* The checksum stage is done. Unless signature verification was requested as well, so are we. */
+        if (verify != IMPORT_VERIFY_SIGNATURE)
+                return 0;
+
+        assert(verify_job);
+
+        r = verification_style_from_url(verify_job->url, &style);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine verification style from URL '%s': %m", verify_job->url);
+        /* Per-directory style: detached signature from signature_job. Otherwise gpg gets the payload alone. */
+        if (style == VERIFICATION_PER_DIRECTORY) {
+                assert(signature_job);
+                assert(signature_job->state == PULL_JOB_DONE);
+
+                if (!signature_job->payload || signature_job->payload_size <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                               "Signature is empty, cannot verify.");
+
+                return verify_gpg(verify_job->payload, verify_job->payload_size, signature_job->payload, signature_job->payload_size);
+        } else
+                return verify_gpg(verify_job->payload, verify_job->payload_size, NULL, 0);
+}
+
+int verification_style_from_url(const char *url, VerificationStyle *ret) {
+        _cleanup_free_ char *filename = NULL;
+        VerificationStyle detected;
+        int q;
+
+        /* Maps the last URL component to a verification scheme: a file called "SHA256SUMS" is per-directory,
+         * a ".sha256" suffix is per-file. Anything else fails with -EINVAL and leaves *ret untouched. */
+
+        assert(url);
+        assert(ret);
+
+        q = import_url_last_component(url, &filename);
+        if (q < 0)
+                return q;
+
+        if (streq(filename, "SHA256SUMS"))
+                detected = VERIFICATION_PER_DIRECTORY;
+        else if (endswith(filename, ".sha256"))
+                detected = VERIFICATION_PER_FILE;
+        else
+                return -EINVAL;
+
+        *ret = detected;
+        return 0;
+}
+
+int pull_job_restart_with_sha256sum(PullJob *j, char **ret) {
+        VerificationStyle current;
+        int k;
+
+        /* Generic PullJobNotFound handler: for a ".sha256" URL, hands back the sibling SHA256SUMS URL via
+         * 'ret' and returns 1; returns 0 if the URL already is SHA256SUMS; negative errno on failure. */
+
+        assert(j);
+
+        k = verification_style_from_url(j->url, &current);
+        if (k < 0)
+                return log_error_errno(k, "Failed to determine verification style of URL '%s': %m", j->url);
+
+        if (current == VERIFICATION_PER_DIRECTORY)
+                return 0; /* no further fallback to offer */
+
+        assert(current == VERIFICATION_PER_FILE); /* only the .sha256 style is left */
+        log_debug("Got 404 for %s, now trying to get SHA256SUMS instead.", j->url);
+
+        k = import_url_change_last_component(j->url, "SHA256SUMS", ret);
+        if (k < 0)
+                return log_error_errno(k, "Failed to replace SHA256SUMS suffix: %m");
+
+        return 1;
+}
+
+bool pull_validate_local(const char *name, PullFlags flags) {
+        /* With PULL_DIRECT the local name is checked as a path, otherwise as a hostname. */
+        bool direct = FLAGS_SET(flags, PULL_DIRECT);
+
+        return direct ? path_is_valid(name)
+                      : hostname_is_valid(name, 0);
+}
+
+int pull_url_needs_checksum(const char *url) {
+        _cleanup_free_ char *leaf = NULL;
+        int q;
+
+        /* Tells whether the resource behind 'url' has to be validated against a hash value: true for
+         * regular files, false for checksum and signature files (SHA256SUMS, gpg signatures and the
+         * like), since those are validated by a tool such as gpg. Negative errno on failure. */
+
+        q = import_url_last_component(url, &leaf);
+        if (q < 0) /* NOTE(review): a missing last component (-EADDRNOTAVAIL) maps to false; confirm intent */
+                return q == -EADDRNOTAVAIL ? false : q;
+
+        if (is_checksum_file(leaf) || is_signature_file(leaf))
+                return false;
+        return true;
+}
diff --git a/src/import/pull-common.h b/src/import/pull-common.h
new file mode 100644
index 0000000..475613a
--- /dev/null
+++ b/src/import/pull-common.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "import-util.h"
+#include "pull-job.h"
+
+typedef enum PullFlags { /* bit flags, to be OR-ed together */
+        PULL_FORCE              = 1 << 0, /* replace existing image */
+        PULL_READ_ONLY          = 1 << 1, /* make generated image read-only */
+        PULL_SETTINGS           = 1 << 2, /* download .nspawn settings file */
+        PULL_ROOTHASH           = 1 << 3, /* only for raw: download .roothash file for verity */
+        PULL_ROOTHASH_SIGNATURE = 1 << 4, /* only for raw: download .roothash.p7s file for verity */
+        PULL_VERITY             = 1 << 5, /* only for raw: download .verity file for verity */
+        PULL_BTRFS_SUBVOL       = 1 << 6, /* tar: preferably create images as btrfs subvols */
+        PULL_BTRFS_QUOTA        = 1 << 7, /* tar: set up btrfs quota for new subvolume as child of parent subvolume */
+        PULL_CONVERT_QCOW2      = 1 << 8, /* raw: if we detect a qcow2 image, unpack it */
+        PULL_DIRECT             = 1 << 9, /* download without rename games */
+        PULL_SYNC               = 1 << 10, /* fsync() right before we are done */
+
+        /* The sets of flags supported when pulling tar and raw images, respectively */
+        PULL_FLAGS_MASK_TAR     = PULL_FORCE|PULL_READ_ONLY|PULL_SETTINGS|PULL_BTRFS_SUBVOL|PULL_BTRFS_QUOTA|PULL_DIRECT|PULL_SYNC,
+        PULL_FLAGS_MASK_RAW     = PULL_FORCE|PULL_READ_ONLY|PULL_SETTINGS|PULL_ROOTHASH|PULL_ROOTHASH_SIGNATURE|PULL_VERITY|PULL_CONVERT_QCOW2|PULL_DIRECT|PULL_SYNC,
+} PullFlags;
+
+int pull_find_old_etags(const char *url, const char *root, int dt, const char *prefix, const char *suffix, char ***etags);
+
+int pull_make_path(const char *url, const char *etag, const char *image_root, const char *prefix, const char *suffix, char **ret);
+
+int pull_make_auxiliary_job(PullJob **ret, const char *url, int (*strip_suffixes)(const char *name, char **ret), const char *suffix, ImportVerify verify, CurlGlue *glue, PullJobOpenDisk on_open_disk, PullJobFinished on_finished, void *userdata);
+int pull_make_verification_jobs(PullJob **ret_checksum_job, PullJob **ret_signature_job, ImportVerify verify, const char *checksum, const char *url, CurlGlue *glue, PullJobFinished on_finished, void *userdata);
+
+int pull_verify(ImportVerify verify, const char *checksum, PullJob *main_job, PullJob *checksum_job, PullJob *signature_job, PullJob *settings_job, PullJob *roothash_job, PullJob *roothash_signature_job, PullJob *verity_job);
+
+typedef enum VerificationStyle {
+        VERIFICATION_PER_FILE,      /* SuSE-style ".sha256" files with inline gpg signature */
+        VERIFICATION_PER_DIRECTORY, /* Ubuntu-style SHA256SUMS files with detached SHA256SUMS.gpg signatures */
+        _VERIFICATION_STYLE_MAX,
+        _VERIFICATION_STYLE_INVALID = -EINVAL,
+} VerificationStyle;
+
+int verification_style_from_url(const char *url, VerificationStyle *style); /* -EINVAL if the URL matches neither style */
+/* Returns 1 and the SHA256SUMS URL in *ret for a ".sha256" URL, 0 if the URL already is SHA256SUMS, < 0 on error */
+int pull_job_restart_with_sha256sum(PullJob *job, char **ret);
+/* Validates 'name' as a path if PULL_DIRECT is set in 'flags', and as a hostname otherwise */
+bool pull_validate_local(const char *name, PullFlags flags);
+/* > 0 if the file behind 'url' needs a hash check, 0 for checksum and signature files, < 0 on error */
+int pull_url_needs_checksum(const char *url);
diff --git a/src/import/pull-job.c b/src/import/pull-job.c
new file mode 100644
index 0000000..bed7e64
--- /dev/null
+++ b/src/import/pull-job.c
@@ -0,0 +1,784 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "gcrypt-util.h"
+#include "hexdecoct.h"
+#include "import-util.h"
+#include "io-util.h"
+#include "machine-pool.h"
+#include "parse-util.h"
+#include "pull-common.h"
+#include "pull-job.h"
+#include "string-util.h"
+#include "strv.h"
+#include "sync-util.h"
+#include "xattr-util.h"
+
+void pull_job_close_disk_fd(PullJob *j) {
+        /* Forgets the job's disk fd; it is closed first only if j->close_disk_fd is set. NULL is a NOP. */
+        if (j) {
+                if (j->close_disk_fd)
+                        (void) safe_close(j->disk_fd);
+
+                j->disk_fd = -EBADF;
+        }
+}
+
+PullJob* pull_job_unref(PullJob *j) {
+        /* Releases the job and all resources it references. NULL is accepted and returned as is. */
+        if (!j)
+                return NULL;
+
+        pull_job_close_disk_fd(j);
+        curl_glue_remove_and_free(j->glue, j->curl);
+        curl_slist_free_all(j->request_header);
+        import_compress_free(&j->compress);
+
+        if (j->checksum_ctx) { /* the hash backend is a build-time choice */
+#if PREFER_OPENSSL
+                EVP_MD_CTX_free(j->checksum_ctx);
+#else
+                gcry_md_close(j->checksum_ctx);
+#endif
+        }
+
+        free(j->checksum);
+        free(j->payload);
+        strv_free(j->old_etags);
+        free(j->etag);
+        free(j->url);
+
+        return mfree(j);
+}
+
+static void pull_job_finish(PullJob *j, int ret) {
+        /* Finalizes the job once: DONE if ret == 0, else FAILED (j->error = ret); then calls on_finished. */
+        assert(j);
+
+        if (j->state == PULL_JOB_DONE || j->state == PULL_JOB_FAILED)
+                return; /* already finalized, do not notify twice */
+
+        if (ret != 0) {
+                j->error = ret;
+                j->state = PULL_JOB_FAILED;
+        } else {
+                log_info("Download of %s complete.", j->url);
+                j->progress_percent = 100;
+                j->state = PULL_JOB_DONE;
+        }
+        if (j->on_finished)
+                j->on_finished(j);
+}
+
+static int pull_job_restart(PullJob *j, const char *new_url) {
+        int k;
+
+        /* Points the job at 'new_url', throws away everything recorded for the previous attempt and starts
+         * over via pull_job_begin(). Returns 0 on success, negative errno on failure. */
+
+        assert(j);
+        assert(new_url);
+
+        k = free_and_strdup(&j->url, new_url);
+        if (k < 0)
+                return k;
+
+        /* Back to pristine bookkeeping */
+        j->state = PULL_JOB_INIT;
+        j->error = 0;
+        j->mtime = 0;
+        j->etag_exists = false;
+        j->content_length = UINT64_MAX;
+        j->written_compressed = j->written_uncompressed = 0;
+        j->payload_size = 0;
+
+        /* Release what the previous transfer left behind */
+        j->payload = mfree(j->payload);
+        j->etag = mfree(j->etag);
+        j->checksum = mfree(j->checksum);
+
+        curl_glue_remove_and_free(j->glue, j->curl);
+        j->curl = NULL;
+        curl_slist_free_all(j->request_header);
+        j->request_header = NULL;
+        import_compress_free(&j->compress);
+
+        if (j->checksum_ctx) {
+#if PREFER_OPENSSL
+                EVP_MD_CTX_free(j->checksum_ctx);
+#else
+                gcry_md_close(j->checksum_ctx);
+#endif
+                j->checksum_ctx = NULL;
+        }
+
+        k = pull_job_begin(j);
+        return k < 0 ? k : 0;
+}
+
+void pull_job_curl_on_finished(CurlGlue *g, CURL *curl, CURLcode result) {
+        PullJob *j = NULL;
+        char *scheme = NULL;
+        CURLcode code;
+        int r;
+        /* Evaluates a completed transfer: status checks first, then checksum and disk file finalization. */
+        if (curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&j) != CURLE_OK)
+                return;
+
+        if (!j || IN_SET(j->state, PULL_JOB_DONE, PULL_JOB_FAILED))
+                return; /* no job attached to the handle, or it reached a final state before */
+
+        if (result != CURLE_OK) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Transfer failed: %s", curl_easy_strerror(result));
+                goto finish;
+        }
+
+        code = curl_easy_getinfo(curl, CURLINFO_SCHEME, &scheme);
+        if (code != CURLE_OK || !scheme) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve URL scheme.");
+                goto finish;
+        }
+        /* Response codes are only looked at for HTTP(S) transfers */
+        if (STRCASE_IN_SET(scheme, "HTTP", "HTTPS")) {
+                long status;
+
+                code = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
+                if (code != CURLE_OK) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve response code: %s", curl_easy_strerror(code));
+                        goto finish;
+                }
+
+                if (status == 304) {
+                        log_info("Image already downloaded. Skipping download.");
+                        j->etag_exists = true;
+                        r = 0;
+                        goto finish;
+                } else if (status >= 300) {
+
+                        if (status == 404 && j->on_not_found) {
+                                _cleanup_free_ char *new_url = NULL;
+
+                                /* This resource wasn't found, but the implementor wants to maybe let us know a new URL, query for it. */
+                                r = j->on_not_found(j, &new_url);
+                                if (r < 0)
+                                        goto finish;
+
+                                if (r > 0) { /* A new url to use */
+                                        assert(new_url);
+
+                                        r = pull_job_restart(j, new_url);
+                                        if (r < 0)
+                                                goto finish;
+                                        /* j->curl is presumably the fresh handle now; code 0 = no response yet, so just wait */
+                                        code = curl_easy_getinfo(j->curl, CURLINFO_RESPONSE_CODE, &status);
+                                        if (code != CURLE_OK) {
+                                                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve response code: %s", curl_easy_strerror(code));
+                                                goto finish;
+                                        }
+
+                                        if (status == 0)
+                                                return;
+                                }
+                        }
+
+                        r = log_error_errno(
+                                        status == 404 ? SYNTHETIC_ERRNO(ENOMEDIUM) : SYNTHETIC_ERRNO(EIO), /* Make the most common error recognizable */
+                                        "HTTP request to %s failed with code %li.", j->url, status);
+                        goto finish;
+                } else if (status < 200) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO), "HTTP request to %s finished with unexpected code %li.", j->url, status);
+                        goto finish;
+                }
+        }
+        /* Not RUNNING yet presumably means the stream ended before its type could be detected — confirm */
+        if (j->state != PULL_JOB_RUNNING) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Premature connection termination.");
+                goto finish;
+        }
+        /* If a content length is known, exactly that many (compressed) bytes must have arrived */
+        if (j->content_length != UINT64_MAX &&
+            j->content_length != j->written_compressed) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Download truncated.");
+                goto finish;
+        }
+        /* Finalize the digest, if one was calculated, and keep it as string in j->checksum */
+        if (j->checksum_ctx) {
+                unsigned checksum_len;
+#if PREFER_OPENSSL
+                uint8_t k[EVP_MAX_MD_SIZE];
+
+                r = EVP_DigestFinal_ex(j->checksum_ctx, k, &checksum_len);
+                if (r == 0) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to get checksum.");
+                        goto finish;
+                }
+                assert(checksum_len <= sizeof k);
+#else
+                const uint8_t *k;
+
+                k = gcry_md_read(j->checksum_ctx, GCRY_MD_SHA256);
+                if (!k) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to get checksum.");
+                        goto finish;
+                }
+
+                checksum_len = gcry_md_get_algo_dlen(GCRY_MD_SHA256);
+#endif
+
+                j->checksum = hexmem(k, checksum_len);
+                if (!j->checksum) {
+                        r = log_oom();
+                        goto finish;
+                }
+
+                log_debug("SHA256 of %s is %s.", j->url, j->checksum);
+        }
+
+        /* Do a couple of finishing disk operations, but only if we are the sole owner of the file (i.e. no
+         * offset is specified, which indicates we only own the file partially) */
+
+        if (j->disk_fd >= 0) {
+
+                if (S_ISREG(j->disk_stat.st_mode)) {
+
+                        if (j->offset == UINT64_MAX) {
+
+                                if (j->written_compressed > 0) {
+                                        /* Make sure the file size is right, in case the file was sparse and
+                                         * we just moved to the last part. */
+                                        if (ftruncate(j->disk_fd, j->written_uncompressed) < 0) {
+                                                r = log_error_errno(errno, "Failed to truncate file: %m");
+                                                goto finish;
+                                        }
+                                }
+                                /* Best effort: record ETag and URL of the source in extended attributes */
+                                if (j->etag)
+                                        (void) fsetxattr(j->disk_fd, "user.source_etag", j->etag, strlen(j->etag), 0);
+                                if (j->url)
+                                        (void) fsetxattr(j->disk_fd, "user.source_url", j->url, strlen(j->url), 0);
+                                /* Best effort as well: apply j->mtime, if one was recorded, to the file's timestamps */
+                                if (j->mtime != 0) {
+                                        struct timespec ut;
+
+                                        timespec_store(&ut, j->mtime);
+
+                                        if (futimens(j->disk_fd, (struct timespec[]) { ut, ut }) < 0)
+                                                log_debug_errno(errno, "Failed to adjust atime/mtime of created image, ignoring: %m");
+
+                                        r = fd_setcrtime(j->disk_fd, j->mtime);
+                                        if (r < 0)
+                                                log_debug_errno(r, "Failed to adjust crtime of created image, ignoring: %m");
+                                }
+                        }
+
+                        if (j->sync) {
+                                r = fsync_full(j->disk_fd);
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to synchronize file to disk: %m");
+                                        goto finish;
+                                }
+                        }
+
+                } else if (S_ISBLK(j->disk_stat.st_mode) && j->sync) {
+
+                        if (fsync(j->disk_fd) < 0) {
+                                r = log_error_errno(errno, "Failed to synchronize block device: %m");
+                                goto finish;
+                        }
+                }
+        }
+
+        log_info("Acquired %s.", FORMAT_BYTES(j->written_uncompressed));
+
+        r = 0;
+
+finish:
+        pull_job_finish(j, r);
+}
+
+static int pull_job_write_uncompressed(const void *p, size_t sz, void *userdata) {
+        PullJob *j = ASSERT_PTR(userdata);
+        bool clipped;
+        int r;
+
+        /* Sink for decompressed data: stores the chunk on disk and/or in the in-memory payload while
+         * enforcing j->uncompressed_max. A chunk that crosses the limit is still stored up to the limit
+         * (the data is in memory anyway), but the call then fails with EFBIG. Returns 0 on success. */
+
+        assert(p);
+        assert(sz > 0);
+
+        if (j->written_uncompressed > UINT64_MAX - sz)
+                return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "File too large, overflow");
+
+        if (j->written_uncompressed >= j->uncompressed_max) /* limit reached before, take nothing more */
+                return log_error_errno(SYNTHETIC_ERRNO(EFBIG), "File overly large, refusing.");
+
+        clipped = j->written_uncompressed + sz > j->uncompressed_max;
+        if (clipped)
+                sz = j->uncompressed_max - j->written_uncompressed;
+
+        /* Only a regular file without a target offset gets the sparse treatment */
+        if (j->disk_fd >= 0) {
+                bool sparse = S_ISREG(j->disk_stat.st_mode) && j->offset == UINT64_MAX;
+
+                if (!sparse) {
+                        r = loop_write(j->disk_fd, p, sz);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to write file: %m");
+                } else {
+                        ssize_t n = sparse_write(j->disk_fd, p, sz, 64);
+
+                        if (n < 0)
+                                return log_error_errno((int) n, "Failed to write file: %m");
+                        if ((size_t) n < sz)
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write");
+                }
+        }
+
+        /* Keep a copy in memory if there is no file to write to, or if force_memory asks for it */
+        if (j->force_memory || j->disk_fd < 0) {
+                if (!GREEDY_REALLOC(j->payload, j->payload_size + sz))
+                        return log_oom();
+
+                memcpy(j->payload + j->payload_size, p, sz);
+                j->payload_size += sz;
+        }
+
+        j->written_uncompressed += sz;
+
+        if (clipped)
+                return log_error_errno(SYNTHETIC_ERRNO(EFBIG), "File overly large, refusing.");
+
+        return 0;
+}
+
+static int pull_job_write_compressed(PullJob *j, void *p, size_t sz) {
+        int q;
+
+        /* Takes one chunk of the raw (possibly compressed) stream: enforces the size limits, feeds the chunk
+         * into the running checksum (if any) and passes it through the decompressor, which delivers its
+         * output to pull_job_write_uncompressed(). Returns 0 on success, negative errno otherwise. */
+
+        assert(j);
+        assert(p);
+
+        if (sz == 0)
+                return 0;
+
+        if (j->written_compressed + sz < j->written_compressed) /* wrapped around */
+                return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "File too large, overflow");
+
+        if (j->written_compressed + sz > j->compressed_max)
+                return log_error_errno(SYNTHETIC_ERRNO(EFBIG), "File overly large, refusing.");
+
+        if (j->content_length != UINT64_MAX && j->written_compressed + sz > j->content_length)
+                return log_error_errno(SYNTHETIC_ERRNO(EFBIG), "Content length incorrect.");
+
+        if (j->checksum_ctx) {
+#if PREFER_OPENSSL
+                if (EVP_DigestUpdate(j->checksum_ctx, p, sz) == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Could not hash chunk.");
+#else
+                gcry_md_write(j->checksum_ctx, p, sz);
+#endif
+        }
+
+        q = import_uncompress(&j->compress, p, sz, pull_job_write_uncompressed, j);
+        if (q < 0)
+                return q;
+
+        j->written_compressed += sz;
+
+        return 0;
+}
+
+static int pull_job_open_disk(PullJob *j) {
+        int q;
+
+        /* Gets the job ready to receive payload: runs the on_open_disk hook (if set), stats disk_fd (if
+         * valid) into disk_stat, seeks to j->offset unless that is UINT64_MAX, and sets up the SHA256
+         * context if calc_checksum is set. Returns 0 on success, negative errno on failure. */
+
+        assert(j);
+
+        if (j->on_open_disk) {
+                q = j->on_open_disk(j);
+                if (q < 0)
+                        return q;
+        }
+
+        if (j->disk_fd >= 0) {
+                if (fstat(j->disk_fd, &j->disk_stat) < 0)
+                        return log_error_errno(errno, "Failed to stat disk file: %m");
+
+                if (j->offset != UINT64_MAX && lseek(j->disk_fd, j->offset, SEEK_SET) < 0)
+                        return log_error_errno(errno, "Failed to seek on file descriptor: %m");
+        }
+
+        if (!j->calc_checksum)
+                return 0;
+
+#if PREFER_OPENSSL
+        j->checksum_ctx = EVP_MD_CTX_new();
+        if (!j->checksum_ctx)
+                return log_oom();
+
+        if (EVP_DigestInit_ex(j->checksum_ctx, EVP_sha256(), NULL) == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to initialize hash context.");
+#else
+        initialize_libgcrypt(false);
+
+        if (gcry_md_open(&j->checksum_ctx, GCRY_MD_SHA256, 0) != 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to initialize hash context.");
+#endif
+
+        return 0;
+}
+
+static int pull_job_detect_compression(PullJob *j) {
+        _cleanup_free_ uint8_t *head = NULL;
+        size_t head_size;
+        int q;
+
+        /* Runs compression detection on what has been buffered in j->payload so far. Returns 0 right away
+         * while the detector reports 0 (presumably: not enough data to decide yet). Once it reports
+         * success, the disk is opened, the job switches to PULL_JOB_RUNNING and the buffered bytes are
+         * replayed through the regular write path. Returns 0 on success, negative errno on failure. */
+
+        assert(j);
+
+        q = import_uncompress_detect(&j->compress, j->payload, j->payload_size);
+        if (q < 0)
+                return log_error_errno(q, "Failed to initialize compressor: %m");
+        if (q == 0)
+                return 0;
+
+        log_debug("Stream is compressed: %s", import_compress_type_to_string(j->compress.type));
+
+        q = pull_job_open_disk(j);
+        if (q < 0)
+                return q;
+
+        /* Detach the buffered bytes from the job before replaying them, so that the write path starts out
+         * with an empty payload buffer; 'head' is freed automatically when we leave. */
+        head = j->payload;
+        head_size = j->payload_size;
+        j->payload = NULL;
+        j->payload_size = 0;
+
+        j->state = PULL_JOB_RUNNING;
+
+        return pull_job_write_compressed(j, head, head_size);
+}
+
+static size_t pull_job_write_callback(void *contents, size_t size, size_t nmemb, void *userdata) {
+        PullJob *j = ASSERT_PTR(userdata);
+        size_t total = size * nmemb;
+        int q;
+
+        /* Receives a chunk of body data (presumably installed as curl's write callback, the signature
+         * matches — confirm at the registration site). Returns the number of bytes taken on success; on
+         * failure the job is finished with the error and 0 is returned. */
+
+        assert(contents);
+
+        switch (j->state) {
+
+        case PULL_JOB_ANALYZING:
+                /* The kind of stream is not settled yet: collect the data in the payload buffer and let the
+                 * compression detection have another look at it */
+                if (!GREEDY_REALLOC(j->payload, j->payload_size + total)) {
+                        q = log_oom();
+                        goto fail;
+                }
+
+                memcpy(j->payload + j->payload_size, contents, total);
+                j->payload_size += total;
+
+                q = pull_job_detect_compression(j);
+                break;
+
+        case PULL_JOB_RUNNING:
+                q = pull_job_write_compressed(j, contents, total);
+                break;
+
+        case PULL_JOB_DONE:
+        case PULL_JOB_FAILED:
+                q = -ESTALE; /* data for a job that is over already */
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        if (q < 0)
+                goto fail;
+
+        return total;
+
+fail:
+        pull_job_finish(j, q);
+        return 0;
+}
+
+static bool http_status_ok(long status) {
+        /* 2xx is success. Takes the 'long' HTTP code as read via CURLINFO_RESPONSE_CODE, not a CURLcode. */
+        return status >= 200 && status <= 299;
+}
+
+static bool http_status_etag_exists(long status) {
+        /* 304 is special: our etag logic triggers it. 'status' is the 'long' HTTP code, not a CURLcode. */
+        return status == 304;
+}
+
+static size_t pull_job_header_callback(void *contents, size_t size, size_t nmemb, void *userdata) {
+        _cleanup_free_ char *length = NULL, *last_modified = NULL, *etag = NULL;
+        size_t sz = size * nmemb;
+        PullJob *j = ASSERT_PTR(userdata);
+        CURLcode code;
+        long status;
+        int r;
+        /* Handles one response header per call: ETag, Content-Length, Last-Modified; the rest goes to on_header. */
+        assert(contents);
+
+        if (IN_SET(j->state, PULL_JOB_DONE, PULL_JOB_FAILED)) {
+                r = -ESTALE;
+                goto fail;
+        }
+
+        assert(j->state == PULL_JOB_ANALYZING);
+
+        code = curl_easy_getinfo(j->curl, CURLINFO_RESPONSE_CODE, &status);
+        if (code != CURLE_OK) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve response code: %s", curl_easy_strerror(code));
+                goto fail;
+        }
+
+        if (http_status_ok(status) || http_status_etag_exists(status)) {
+                /* Check Etag on OK and etag exists responses. */
+
+                r = curl_header_strdup(contents, sz, "ETag:", &etag);
+                if (r < 0) {
+                        log_oom();
+                        goto fail;
+                }
+                if (r > 0) {
+                        free_and_replace(j->etag, etag);
+                        /* An ETag listed in old_etags means this version was downloaded before: finish successfully */
+                        if (strv_contains(j->old_etags, j->etag)) {
+                                log_info("Image already downloaded. Skipping download. (%s)", j->etag);
+                                j->etag_exists = true;
+                                pull_job_finish(j, 0);
+                                return sz;
+                        }
+
+                        return sz;
+                }
+        }
+
+        if (!http_status_ok(status)) /* Let's ignore the rest here, these requests are probably redirects and
+                                      * stuff where the headers aren't interesting to us */
+                return sz;
+
+        r = curl_header_strdup(contents, sz, "Content-Length:", &length);
+        if (r < 0) {
+                log_oom();
+                goto fail;
+        }
+        if (r > 0) {
+                (void) safe_atou64(length, &j->content_length); /* the parse result is deliberately ignored */
+
+                if (j->content_length != UINT64_MAX) {
+                        if (j->content_length > j->compressed_max) {
+                                r = log_error_errno(SYNTHETIC_ERRNO(EFBIG), "Content too large.");
+                                goto fail;
+                        }
+
+                        log_info("Downloading %s for %s.", FORMAT_BYTES(j->content_length), j->url);
+                }
+
+                return sz;
+        }
+
+        r = curl_header_strdup(contents, sz, "Last-Modified:", &last_modified);
+        if (r < 0) {
+                log_oom();
+                goto fail;
+        }
+        if (r > 0) {
+                (void) curl_parse_http_time(last_modified, &j->mtime);
+                return sz;
+        }
+        /* Not one of ours: offer the header to the optional on_header hook */
+        if (j->on_header) {
+                r = j->on_header(j, contents, sz);
+                if (r < 0)
+                        goto fail;
+        }
+
+        return sz;
+
+fail:
+        pull_job_finish(j, r);
+        return 0;
+}
+
+static int pull_job_progress_callback(void *userdata, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow) {
+        /* Transfer progress hook registered via CURLOPT_XFERINFOFUNCTION: logs how far the download got, at
+         * most about once per second and only when the integer percentage changed. Always returns 0. */
+        PullJob *job = ASSERT_PTR(userdata);
+        unsigned pct;
+        usec_t ts, elapsed;
+
+        if (dltotal <= 0) /* without a known total there is nothing to compute */
+                return 0;
+
+        pct = ((100 * dlnow) / dltotal);
+        ts = now(CLOCK_MONOTONIC);
+
+        /* Rate limit the output, and stay quiet once everything has been received */
+        if (ts <= job->last_status_usec + USEC_PER_SEC || pct == job->progress_percent || dlnow >= dltotal)
+                return 0;
+
+        elapsed = ts - job->start_usec;
+        if (elapsed > USEC_PER_SEC && dlnow > 0) {
+                /* Extrapolate the remaining time linearly from the average rate so far */
+                usec_t remaining = (usec_t) (((double) elapsed * (double) dltotal) / dlnow) - elapsed;
+
+                log_info("Got %u%% of %s. %s left at %s/s.",
+                         pct,
+                         job->url,
+                         FORMAT_TIMESPAN(remaining, USEC_PER_SEC),
+                         FORMAT_BYTES((uint64_t) ((double) dlnow / ((double) elapsed / (double) USEC_PER_SEC))));
+        } else
+                log_info("Got %u%% of %s.", pct, job->url);
+
+        job->progress_percent = pct;
+        job->last_status_usec = ts;
+
+        if (job->on_progress)
+                job->on_progress(job);
+
+        return 0;
+}
+
+/* Allocates a job for downloading 'url' and returns it in *ret. The new job is in PULL_JOB_INIT state,
+ * has no disk file attached yet and has both size limits set to 64GB. The URL string is duplicated,
+ * while 'glue' and 'userdata' are stored as passed. Returns 0 on success and -ENOMEM if an allocation
+ * fails. */
+int pull_job_new(PullJob **ret, const char *url, CurlGlue *glue, void *userdata) {
+        _cleanup_(pull_job_unrefp) PullJob *job = NULL;
+        _cleanup_free_ char *url_copy = NULL;
+
+        assert(url);
+        assert(glue);
+        assert(ret);
+
+        url_copy = strdup(url);
+        if (!url_copy)
+                return -ENOMEM;
+
+        job = new(PullJob, 1);
+        if (!job)
+                return -ENOMEM;
+
+        /* Everything not listed explicitly here is zero-initialized by the compound literal */
+        *job = (PullJob) {
+                .url = TAKE_PTR(url_copy),
+                .glue = glue,
+                .userdata = userdata,
+                .state = PULL_JOB_INIT,
+                .start_usec = now(CLOCK_MONOTONIC),
+                .disk_fd = -EBADF,
+                .close_disk_fd = true,
+                .sync = true,
+                .offset = UINT64_MAX,
+                .content_length = UINT64_MAX,
+                .compressed_max = 64LLU * 1024LLU * 1024LLU * 1024LLU, /* 64GB safety limit */
+                .uncompressed_max = 64LLU * 1024LLU * 1024LLU * 1024LLU, /* 64GB safety limit */
+        };
+
+        *ret = TAKE_PTR(job);
+
+        return 0;
+}
+
+/* Starts the transfer of a job that is still in PULL_JOB_INIT state: creates the curl handle for j->url,
+ * installs the request headers plus the write/header/progress callbacks, and registers the handle with
+ * the job's CurlGlue. On success the job moves on to PULL_JOB_ANALYZING.
+ *
+ * Returns 0 on success, -EBUSY if the job was started before, -ENOMEM if an allocation fails, -EIO if
+ * a curl option cannot be set, or the negative error of curl_glue_make()/curl_glue_add(). */
+int pull_job_begin(PullJob *j) {
+        int r;
+
+        assert(j);
+
+        if (j->state != PULL_JOB_INIT)
+                return -EBUSY;
+
+        r = curl_glue_make(&j->curl, j->url, j);
+        if (r < 0)
+                return r;
+
+        if (!strv_isempty(j->old_etags)) {
+                /* Make the request conditional on the versions we already know about */
+                _cleanup_free_ char *joined = NULL, *line = NULL;
+                struct curl_slist *list;
+
+                joined = strv_join(j->old_etags, ", ");
+                if (!joined)
+                        return -ENOMEM;
+
+                line = strjoin("If-None-Match: ", joined);
+                if (!line)
+                        return -ENOMEM;
+
+                /* Either start a new header list, or extend the one that was configured already */
+                list = j->request_header ? curl_slist_append(j->request_header, line)
+                                         : curl_slist_new(line, NULL);
+                if (!list)
+                        return -ENOMEM;
+
+                j->request_header = list;
+        }
+
+        if (j->request_header &&
+            curl_easy_setopt(j->curl, CURLOPT_HTTPHEADER, j->request_header) != CURLE_OK)
+                return -EIO;
+
+        /* Payload data is handed to pull_job_write_callback() */
+        if (curl_easy_setopt(j->curl, CURLOPT_WRITEFUNCTION, pull_job_write_callback) != CURLE_OK ||
+            curl_easy_setopt(j->curl, CURLOPT_WRITEDATA, j) != CURLE_OK)
+                return -EIO;
+
+        /* Response header lines are handed to pull_job_header_callback() */
+        if (curl_easy_setopt(j->curl, CURLOPT_HEADERFUNCTION, pull_job_header_callback) != CURLE_OK ||
+            curl_easy_setopt(j->curl, CURLOPT_HEADERDATA, j) != CURLE_OK)
+                return -EIO;
+
+        /* Progress updates are handed to pull_job_progress_callback() */
+        if (curl_easy_setopt(j->curl, CURLOPT_XFERINFOFUNCTION, pull_job_progress_callback) != CURLE_OK ||
+            curl_easy_setopt(j->curl, CURLOPT_XFERINFODATA, j) != CURLE_OK)
+                return -EIO;
+
+        /* Progress reporting is off by default in curl. The option takes a long, and since it is passed
+         * through varargs the literal must really have that type. */
+        if (curl_easy_setopt(j->curl, CURLOPT_NOPROGRESS, 0L) != CURLE_OK)
+                return -EIO;
+
+        r = curl_glue_add(j->glue, j->curl);
+        if (r < 0)
+                return r;
+
+        j->state = PULL_JOB_ANALYZING;
+
+        return 0;
+}
diff --git a/src/import/pull-job.h b/src/import/pull-job.h
new file mode 100644
index 0000000..7a98b0f
--- /dev/null
+++ b/src/import/pull-job.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/stat.h>
+
+#include "curl-util.h"
+#include "import-compress.h"
+#include "macro.h"
+#include "openssl-util.h"
+#include "pull-common.h"
+
+typedef struct PullJob PullJob;
+
+typedef void (*PullJobFinished)(PullJob *job); /* NOTE(review): presumably invoked when a job completes or fails — confirm in pull-job.c */
+typedef int (*PullJobOpenDisk)(PullJob *job); /* e.g. raw_pull_job_on_open_disk_raw() in pull-raw.c has this signature and sets up job->disk_fd */
+typedef int (*PullJobHeader)(PullJob *job, const char *header, size_t sz); /* gets response header data the job did not consume itself; a negative return is passed on to pull_job_finish() */
+typedef void (*PullJobProgress)(PullJob *job); /* called after the job logged a progress update */
+typedef int (*PullJobNotFound)(PullJob *job, char **ret_new_url); /* NOTE(review): presumably lets the owner supply an alternative URL — confirm in pull-job.c */
+
+typedef enum PullJobState {
+        PULL_JOB_INIT,      /* State of a new job; pull_job_begin() refuses to start a job in any other state */
+        PULL_JOB_ANALYZING, /* Still reading into ->payload, to figure out what we have; entered by pull_job_begin() */
+        PULL_JOB_RUNNING,   /* Writing to destination */
+        PULL_JOB_DONE,
+        PULL_JOB_FAILED,
+        _PULL_JOB_STATE_MAX,
+        _PULL_JOB_STATE_INVALID = -EINVAL,
+} PullJobState;
+/* Evaluates to true if the job's state is PULL_JOB_DONE or PULL_JOB_FAILED */
+#define PULL_JOB_IS_COMPLETE(j) (IN_SET((j)->state, PULL_JOB_DONE, PULL_JOB_FAILED))
+
+struct PullJob {
+        PullJobState state;
+        int error; /* 0 after pull_job_new(); a non-zero value is treated as failure by raw_pull_job_on_finished() */
+
+        char *url; /* private copy of the URL passed to pull_job_new() */
+
+        void *userdata; /* opaque pointer passed to pull_job_new() */
+        PullJobFinished on_finished;
+        PullJobOpenDisk on_open_disk;
+        PullJobHeader on_header; /* optional; gets the response headers not handled by the job itself */
+        PullJobProgress on_progress; /* optional; called after each logged progress update */
+        PullJobNotFound on_not_found;
+
+        CurlGlue *glue; /* as passed to pull_job_new() */
+        CURL *curl; /* created by pull_job_begin() */
+        struct curl_slist *request_header; /* extra request headers, installed as CURLOPT_HTTPHEADER */
+
+        char *etag; /* taken from the ETag response header, if one was seen */
+        char **old_etags; /* ETags already known; sent as If-None-Match, and compared against ->etag */
+        bool etag_exists; /* set when the server's ETag is listed in ->old_etags, i.e. the download is skipped */
+
+        uint64_t content_length; /* from the Content-Length response header; UINT64_MAX initially */
+        uint64_t written_compressed;
+        uint64_t written_uncompressed;
+        uint64_t offset; /* UINT64_MAX initially */
+
+        uint64_t uncompressed_max; /* 64GB by default */
+        uint64_t compressed_max; /* 64GB by default; a larger Content-Length fails the job with EFBIG */
+
+        uint8_t *payload;
+        size_t payload_size;
+
+        int disk_fd; /* -EBADF initially */
+        bool close_disk_fd; /* true by default; pull-raw.c clears it when the data goes to stdout */
+        struct stat disk_stat;
+
+        usec_t mtime; /* parsed from the Last-Modified response header */
+
+        ImportCompress compress;
+
+        unsigned progress_percent; /* percentage last logged by the progress callback */
+        usec_t start_usec; /* CLOCK_MONOTONIC time at which the job object was created */
+        usec_t last_status_usec; /* CLOCK_MONOTONIC time of the last progress log message */
+
+        bool calc_checksum;
+        hash_context_t checksum_ctx;
+
+        char *checksum;
+        bool sync; /* true by default */
+        bool force_memory;
+};
+
+int pull_job_new(PullJob **job, const char *url, CurlGlue *glue, void *userdata); /* allocates a job in PULL_JOB_INIT state; returns 0 or -ENOMEM */
+PullJob* pull_job_unref(PullJob *job);
+
+int pull_job_begin(PullJob *j); /* starts the transfer; -EBUSY unless the job is in PULL_JOB_INIT state */
+
+void pull_job_curl_on_finished(CurlGlue *g, CURL *curl, CURLcode result); /* installed as the CurlGlue's on_finished hook by raw_pull_new() */
+
+void pull_job_close_disk_fd(PullJob *j);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(PullJob*, pull_job_unref); /* provides pull_job_unrefp() for use with _cleanup_() */
diff --git a/src/import/pull-raw.c b/src/import/pull-raw.c
new file mode 100644
index 0000000..66c3f65
--- /dev/null
+++ b/src/import/pull-raw.c
@@ -0,0 +1,983 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "sd-daemon.h"
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "copy.h"
+#include "curl-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "import-util.h"
+#include "install-file.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "pull-common.h"
+#include "pull-job.h"
+#include "pull-raw.h"
+#include "qcow2-util.h"
+#include "rm-rf.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "utf8.h"
+#include "web-util.h"
+
+typedef enum RawProgress {
+        RAW_DOWNLOADING, /* reported as 0..80%, combined from the progress of the individual jobs */
+        RAW_VERIFYING,   /* reported as 80% */
+        RAW_UNPACKING,   /* reported as 85% */
+        RAW_FINALIZING,  /* reported as 90% */
+        RAW_COPYING,     /* reported as 95% */
+} RawProgress;
+
+struct RawPull {
+        sd_event *event; /* event loop; the one passed to raw_pull_new(), or the default one */
+        CurlGlue *glue; /* created in raw_pull_new(), with pull_job_curl_on_finished() as its on_finished hook */
+
+        PullFlags flags;
+        ImportVerify verify; /* passed to pull_verify() */
+        char *image_root; /* defaults to /var/lib/machines */
+
+        uint64_t offset; /* UINT64_MAX by default; with any other value a PULL_DIRECT target is opened without O_CREAT|O_TRUNC */
+
+        PullJob *raw_job; /* the main download; its failure fails the whole pull */
+        PullJob *checksum_job; /* optional; its failure fails the whole pull */
+        PullJob *signature_job; /* optional; its failure only matters for VERIFICATION_PER_DIRECTORY */
+        PullJob *settings_job; /* optional; failure is only logged */
+        PullJob *roothash_job; /* optional; failure is only logged */
+        PullJob *roothash_signature_job; /* optional; failure is only logged */
+        PullJob *verity_job; /* optional; failure is only logged */
+
+        RawPullFinished on_finished; /* gets the overall result; if unset the event loop is exited with it instead */
+        void *userdata; /* passed to on_finished */
+
+        char *local; /* In PULL_DIRECT mode the path we are supposed to place things in, otherwise the
+                      * machine name of the final copy we make */
+
+        char *final_path; /* where the downloaded image is installed when not in PULL_DIRECT mode */
+        char *temp_path; /* temporary download file; unlinked in raw_pull_unref() if still set */
+
+        char *settings_path;
+        char *settings_temp_path; /* unlinked in raw_pull_unref() if still set */
+
+        char *roothash_path;
+        char *roothash_temp_path; /* unlinked in raw_pull_unref() if still set */
+
+        char *roothash_signature_path;
+        char *roothash_signature_temp_path; /* unlinked in raw_pull_unref() if still set */
+
+        char *verity_path;
+        char *verity_temp_path; /* unlinked in raw_pull_unref() if still set */
+
+        char *checksum; /* passed to pull_verify() */
+};
+
+/* Destroys a RawPull object: releases all jobs (before the curl glue they were created with), the event
+ * loop reference, any temporary files still around and all strings. Accepts NULL; always returns NULL. */
+RawPull* raw_pull_unref(RawPull *i) {
+        PullJob *job;
+
+        if (!i)
+                return NULL;
+
+        FOREACH_POINTER(job, i->raw_job, i->checksum_job, i->signature_job, i->settings_job,
+                        i->roothash_job, i->roothash_signature_job, i->verity_job)
+                pull_job_unref(job);
+
+        curl_glue_unref(i->glue);
+        sd_event_unref(i->event);
+
+        unlink_and_free(i->temp_path);
+        unlink_and_free(i->settings_temp_path);
+        unlink_and_free(i->roothash_temp_path);
+        unlink_and_free(i->roothash_signature_temp_path);
+        unlink_and_free(i->verity_temp_path);
+
+        free(i->checksum);
+        free(i->local);
+        free(i->image_root);
+        free(i->final_path);
+        free(i->settings_path);
+        free(i->roothash_path);
+        free(i->roothash_signature_path);
+        free(i->verity_path);
+
+        return mfree(i);
+}
+
+/* Creates a RawPull object and returns it in *ret. A NULL 'event' selects the default event loop, a
+ * NULL 'image_root' selects /var/lib/machines. Returns 0 on success, a negative error otherwise. */
+int raw_pull_new(
+                RawPull **ret,
+                sd_event *event,
+                const char *image_root,
+                RawPullFinished on_finished,
+                void *userdata) {
+
+        _cleanup_(curl_glue_unrefp) CurlGlue *glue = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *loop = NULL;
+        _cleanup_(raw_pull_unrefp) RawPull *pull = NULL;
+        _cleanup_free_ char *root_copy = NULL;
+        int r;
+
+        assert(ret);
+
+        root_copy = strdup(image_root ?: "/var/lib/machines");
+        if (!root_copy)
+                return -ENOMEM;
+
+        if (!event) {
+                r = sd_event_default(&loop);
+                if (r < 0)
+                        return r;
+        } else
+                loop = sd_event_ref(event);
+
+        r = curl_glue_new(&glue, loop);
+        if (r < 0)
+                return r;
+
+        pull = new(RawPull, 1);
+        if (!pull)
+                return -ENOMEM;
+
+        *pull = (RawPull) {
+                .event = TAKE_PTR(loop),
+                .glue = TAKE_PTR(glue),
+                .image_root = TAKE_PTR(root_copy),
+                .offset = UINT64_MAX,
+                .on_finished = on_finished,
+                .userdata = userdata,
+        };
+
+        pull->glue->on_finished = pull_job_curl_on_finished;
+        pull->glue->userdata = pull;
+
+        *ret = TAKE_PTR(pull);
+        return 0;
+}
+
+/* Reports the overall progress of the pull via sd_notifyf() (X_IMPORT_PROGRESS=) and the debug log. The
+ * download phase covers 0..80%: each auxiliary job that exists contributes up to 5 points (the verity
+ * job up to 10) and the main image gets whatever remains of the 80. The later phases are reported as
+ * fixed values. */
+static void raw_pull_report_progress(RawPull *i, RawProgress p) {
+        unsigned percent;
+
+        assert(i);
+
+        switch (p) {
+
+        case RAW_DOWNLOADING: {
+                /* The auxiliary downloads, and the number of percent points reserved for each of them */
+                const struct {
+                        const PullJob *job;
+                        unsigned share;
+                } aux[] = {
+                        { i->checksum_job,           5  },
+                        { i->signature_job,          5  },
+                        { i->settings_job,           5  },
+                        { i->roothash_job,           5  },
+                        { i->roothash_signature_job, 5  },
+                        { i->verity_job,             10 },
+                };
+                unsigned remain = 80;
+
+                percent = 0;
+
+                for (size_t k = 0; k < sizeof(aux) / sizeof(aux[0]); k++) {
+                        /* Jobs that do not exist do not take anything away from the main download */
+                        if (!aux[k].job)
+                                continue;
+
+                        percent += aux[k].job->progress_percent * aux[k].share / 100;
+                        remain -= aux[k].share;
+                }
+
+                /* The main download is scaled to whatever share is left */
+                if (i->raw_job)
+                        percent += i->raw_job->progress_percent * remain / 100;
+
+                break;
+        }
+
+        /* All phases after the download are reported as fixed values */
+        case RAW_VERIFYING:
+                percent = 80;
+                break;
+
+        case RAW_UNPACKING:
+                percent = 85;
+                break;
+
+        case RAW_FINALIZING:
+                percent = 90;
+                break;
+
+        case RAW_COPYING:
+                percent = 95;
+                break;
+
+        default:
+                /* Every value of the enum is handled above */
+                assert_not_reached();
+        }
+
+        /* The same number goes out as notification message and to the debug log */
+        sd_notifyf(false, "X_IMPORT_PROGRESS=%u%%", percent);
+        log_debug("Combined progress %u%%", percent);
+}
+
+/* If PULL_CONVERT_QCOW2 is set and the downloaded file turns out to be a QCOW2 image, converts it into a
+ * new temporary file named after the final path and makes that file the job's disk file. Returns 1 if
+ * a conversion took place, 0 if nothing needed to be done, negative on error. */
+static int raw_pull_maybe_convert_qcow2(RawPull *i) {
+        _cleanup_(unlink_and_freep) char *created = NULL;
+        _cleanup_close_ int out_fd = -EBADF;
+        _cleanup_free_ char *candidate = NULL;
+        int r;
+
+        assert(i);
+        assert(i->raw_job);
+        assert(!FLAGS_SET(i->flags, PULL_DIRECT));
+
+        if (!FLAGS_SET(i->flags, PULL_CONVERT_QCOW2))
+                return 0;
+
+        assert(i->final_path);
+        assert(i->raw_job->close_disk_fd);
+
+        r = qcow2_detect(i->raw_job->disk_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to detect whether this is a QCOW2 image: %m");
+        if (r == 0) /* not QCOW2, keep the file as it is */
+                return 0;
+
+        r = tempfn_random(i->final_path, NULL, &candidate);
+        if (r < 0)
+                return log_oom();
+
+        out_fd = open(candidate, O_RDWR|O_CREAT|O_EXCL|O_NOCTTY|O_CLOEXEC, 0664);
+        if (out_fd < 0)
+                return log_error_errno(errno, "Failed to create %s: %m", candidate);
+
+        /* The file exists now, from here on it needs to be unlinked again if we fail */
+        created = TAKE_PTR(candidate);
+        (void) import_set_nocow_and_log(out_fd, created);
+        log_info("Unpacking QCOW2 file.");
+
+        r = qcow2_convert(i->raw_job->disk_fd, out_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to convert qcow2 image: %m");
+
+        unlink_and_free(i->temp_path);
+        i->temp_path = TAKE_PTR(created);
+        close_and_replace(i->raw_job->disk_fd, out_fd);
+        return 1;
+}
+
+/* Fills in *field with the path for the downloaded file carrying the specified suffix, derived from the
+ * URL and ETag of the main job. Returns 0 if *field was set already, 1 if it was generated now, and the
+ * result of log_oom() if pull_make_path() fails. */
+static int raw_pull_determine_path(
+                RawPull *i,
+                const char *suffix,
+                char **field /* input + output (!) */) {
+
+        assert(i);
+        assert(field);
+
+        if (!*field) {
+                assert(i->raw_job);
+                if (pull_make_path(i->raw_job->url, i->raw_job->etag, i->image_root, ".raw-", suffix, field) < 0)
+                        return log_oom();
+
+                return 1;
+        }
+        return 0;
+}
+
+/* Copies a downloaded auxiliary file to <image_root>/<local><suffix>. Copy failures are only logged and
+ * still yield 0; a negative value is returned only if the source path cannot be determined. */
+static int raw_pull_copy_auxiliary_file(
+                RawPull *i,
+                const char *suffix,
+                char **path /* input + output (!) */) {
+
+        const char *target;
+        int r;
+
+        assert(i);
+        assert(suffix);
+        assert(path);
+
+        r = raw_pull_determine_path(i, suffix, path);
+        if (r < 0)
+                return r;
+
+        target = strjoina(i->image_root, "/", i->local, suffix);
+
+        r = copy_file_atomic(
+                        *path, target, 0644,
+                        COPY_REFLINK |
+                        (FLAGS_SET(i->flags, PULL_FORCE) ? COPY_REPLACE : 0) |
+                        (FLAGS_SET(i->flags, PULL_SYNC) ? COPY_FSYNC_FULL : 0));
+        if (r >= 0)
+                log_info("Created new file %s.", target);
+        else if (r == -EEXIST)
+                log_warning_errno(r, "File %s already exists, not replacing.", target);
+        else if (r == -ENOENT)
+                log_debug_errno(r, "Skipping creation of auxiliary file, since none was found.");
+        else
+                log_warning_errno(r, "Failed to copy file %s, ignoring: %m", target);
+
+        return 0;
+}
+
+/* Creates the local, writable copy of the downloaded image as <image_root>/<local>.raw, plus copies of
+ * the auxiliary files selected via the PULL_SETTINGS/PULL_ROOTHASH/PULL_ROOTHASH_SIGNATURE/PULL_VERITY
+ * flags. Does nothing if no local name was configured. Must not be called in PULL_DIRECT mode. Returns
+ * 0 on success and a negative errno-style value on failure. */
+static int raw_pull_make_local_copy(RawPull *i) {
+        _cleanup_(unlink_and_freep) char *tp = NULL;
+        _cleanup_free_ char *f = NULL;
+        _cleanup_close_ int dfd = -EBADF;
+        const char *p;
+        int r;
+
+        assert(i);
+        assert(i->raw_job);
+        assert(!FLAGS_SET(i->flags, PULL_DIRECT));
+
+        if (!i->local)
+                return 0;
+
+        if (i->raw_job->etag_exists) {
+                /* We have downloaded this one previously, reopen it */
+                assert(i->raw_job->disk_fd < 0);
+
+                i->raw_job->disk_fd = open(i->final_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+                if (i->raw_job->disk_fd < 0)
+                        return log_error_errno(errno, "Failed to open vendor image: %m");
+        } else {
+                /* We freshly downloaded the image, use it */
+                assert(i->raw_job->disk_fd >= 0);
+                assert(i->offset == UINT64_MAX);
+
+                /* Rewind to the start; lseek() takes the offset first and the whence second */
+                if (lseek(i->raw_job->disk_fd, 0, SEEK_SET) < 0)
+                        return log_error_errno(errno, "Failed to seek to beginning of vendor image: %m");
+        }
+
+        p = strjoina(i->image_root, "/", i->local, ".raw");
+
+        r = tempfn_random(p, NULL, &f);
+        if (r < 0)
+                return log_oom();
+
+        dfd = open(f, O_WRONLY|O_CREAT|O_EXCL|O_NOCTTY|O_CLOEXEC, 0664);
+        if (dfd < 0)
+                return log_error_errno(errno, "Failed to create writable copy of image: %m");
+
+        tp = TAKE_PTR(f);
+
+        /* Turn off COW writing. This should greatly improve performance on COW file systems like btrfs,
+         * since it reduces fragmentation caused by not allowing in-place writes. */
+        (void) import_set_nocow_and_log(dfd, tp);
+
+        r = copy_bytes(i->raw_job->disk_fd, dfd, UINT64_MAX, COPY_REFLINK);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make writable copy of image: %m");
+
+        (void) copy_times(i->raw_job->disk_fd, dfd, COPY_CRTIME);
+        (void) copy_xattr(i->raw_job->disk_fd, NULL, dfd, NULL, 0);
+
+        dfd = safe_close(dfd);
+
+        r = install_file(AT_FDCWD, tp,
+                         AT_FDCWD, p,
+                         (i->flags & PULL_FORCE ? INSTALL_REPLACE : 0) |
+                         (i->flags & PULL_READ_ONLY ? INSTALL_READ_ONLY : 0) |
+                         (i->flags & PULL_SYNC ? INSTALL_FSYNC_FULL : 0));
+        if (r < 0)
+                return log_error_errno(r, "Failed to move local image into place '%s': %m", p);
+
+        tp = mfree(tp);
+
+        log_info("Created new local image '%s'.", i->local);
+
+        /* Now copy over those auxiliary files that were asked for via the flags */
+        const struct {
+                PullFlags flag;
+                const char *suffix;
+                char **path;
+        } aux[] = {
+                { PULL_SETTINGS,           ".nspawn",       &i->settings_path           },
+                { PULL_ROOTHASH,           ".roothash",     &i->roothash_path           },
+                { PULL_ROOTHASH_SIGNATURE, ".roothash.p7s", &i->roothash_signature_path },
+                { PULL_VERITY,             ".verity",       &i->verity_path             },
+        };
+
+        for (size_t k = 0; k < sizeof(aux) / sizeof(aux[0]); k++) {
+                if (!FLAGS_SET(i->flags, aux[k].flag))
+                        continue;
+
+                r = raw_pull_copy_auxiliary_file(i, aux[k].suffix, aux[k].path);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Returns true once the main job and all auxiliary jobs that exist are in a completed state. */
+static bool raw_pull_is_done(RawPull *i) {
+        PullJob *job;
+
+        assert(i);
+        assert(i->raw_job);
+
+        /* Only the main job is asserted to exist, any of the others may be NULL */
+        FOREACH_POINTER(job,
+                        i->raw_job,
+                        i->checksum_job,
+                        i->signature_job,
+                        i->settings_job,
+                        i->roothash_job,
+                        i->roothash_signature_job,
+                        i->verity_job)
+                if (job && !PULL_JOB_IS_COMPLETE(job))
+                        return false;
+
+        return true;
+}
+
+/* Moves a downloaded auxiliary file from its temporary location to its final place, installing it
+ * with INSTALL_READ_ONLY. *path is regenerated first, and *temp_path is released afterwards, since at
+ * that point the temporary name is gone. Returns 1 on success and a negative errno-style value on
+ * failure. */
+static int raw_pull_rename_auxiliary_file(RawPull *i, const char *suffix, char **temp_path, char **path) {
+        int r;
+
+        assert(i);
+        assert(path);
+        assert(temp_path);
+        assert(*temp_path);
+        assert(suffix);
+
+        /* The ETag of the file might only be known by now, hence drop any name determined earlier and
+         * derive it anew, so that the ETag becomes part of the file name if we can */
+        *path = mfree(*path);
+        r = raw_pull_determine_path(i, suffix, path);
+        if (r < 0)
+                return r;
+
+        r = install_file(
+                        AT_FDCWD, *temp_path,
+                        AT_FDCWD, *path,
+                        INSTALL_READ_ONLY | (i->flags & PULL_SYNC ? INSTALL_FSYNC_FULL : 0));
+        if (r < 0)
+                return log_error_errno(r, "Failed to move '%s' into place: %m", *path);
+
+        /* The file was moved, nothing is left under the temporary name */
+        *temp_path = mfree(*temp_path);
+
+        return 1;
+}
+
+static void raw_pull_job_on_finished(PullJob *j) {
+        /* Completion handler for the jobs of a RawPull. A failed main or checksum job ends the pull right
+         * away. Otherwise nothing happens until all jobs are complete; then the download is verified and
+         * finalized, and the result is passed to ->on_finished, or used to exit the event loop. */
+        RawPull *i;
+        PullJob *jj;
+        int r;
+
+        assert(j);
+        assert(j->userdata);
+        i = j->userdata;
+
+        if (j->error != 0) {
+                /* Only the main job and the checksum job are fatal if they fail. The other fails are just
+                 * "decoration", that we'll download if we can. The signature job isn't fatal here because we
+                 * might not actually need it in case Suse style signatures are used, that are inline in the
+                 * checksum file. */
+                if (j == i->raw_job) {
+                        if (IN_SET(j->error, -ENOMEDIUM, ENOMEDIUM)) /* HTTP 404, in either sign convention */
+                                r = log_error_errno(j->error, "Failed to retrieve image file. (Wrong URL?)");
+                        else
+                                r = log_error_errno(j->error, "Failed to retrieve image file.");
+                        goto finish;
+                } else if (j == i->checksum_job) {
+                        r = log_error_errno(j->error, "Failed to retrieve SHA256 checksum, cannot verify. (Try --verify=no?)");
+                        goto finish;
+                } else if (j == i->signature_job)
+                        log_debug_errno(j->error, "Signature job for %s failed, proceeding for now.", j->url);
+                else if (j == i->settings_job)
+                        log_info_errno(j->error, "Settings file could not be retrieved, proceeding without.");
+                else if (j == i->roothash_job)
+                        log_info_errno(j->error, "Root hash file could not be retrieved, proceeding without.");
+                else if (j == i->roothash_signature_job)
+                        log_info_errno(j->error, "Root hash signature file could not be retrieved, proceeding without.");
+                else if (j == i->verity_job)
+                        log_info_errno(j->error, "Verity integrity file could not be retrieved, proceeding without.");
+                else
+                        assert_not_reached();
+        }
+
+        /* This is invoked if either the download completed successfully, or the download was skipped because
+         * we already have the etag. In this case ->etag_exists is true. We only do something when we got
+         * all files */
+
+        if (!raw_pull_is_done(i))
+                return;
+
+        if (i->signature_job && i->signature_job->error != 0) {
+                VerificationStyle style;
+                PullJob *verify_job;
+
+                /* The signature job failed. Let's see if we actually need it */
+
+                verify_job = i->checksum_job ?: i->raw_job; /* if the checksum job doesn't exist this must be
+                                                             * because the main job is the checksum file
+                                                             * itself */
+
+                assert(verify_job);
+
+                r = verification_style_from_url(verify_job->url, &style);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to determine verification style from checksum URL: %m");
+                        goto finish;
+                }
+
+                if (style == VERIFICATION_PER_DIRECTORY) { /* A failed signature file download only matters
+                                                            * in per-directory verification mode, since only
+                                                            * then the signature is detached, and thus a file
+                                                            * of its own. */
+                        r = log_error_errno(i->signature_job->error,
+                                            "Failed to retrieve signature file, cannot verify. (Try --verify=no?)");
+                        goto finish;
+                }
+        }
+
+        /* Let's close these auxiliary files now, we don't need access to them anymore. */
+        FOREACH_POINTER(jj, i->settings_job, i->roothash_job, i->roothash_signature_job, i->verity_job)
+                pull_job_close_disk_fd(jj);
+
+        if (!i->raw_job->etag_exists) {
+                raw_pull_report_progress(i, RAW_VERIFYING);
+
+                r = pull_verify(i->verify,
+                                i->checksum,
+                                i->raw_job,
+                                i->checksum_job,
+                                i->signature_job,
+                                i->settings_job,
+                                i->roothash_job,
+                                i->roothash_signature_job,
+                                i->verity_job);
+                if (r < 0)
+                        goto finish;
+        }
+
+        if (i->flags & PULL_DIRECT) {
+                assert(!i->settings_job);
+                assert(!i->roothash_job);
+                assert(!i->roothash_signature_job);
+                assert(!i->verity_job);
+
+                raw_pull_report_progress(i, RAW_FINALIZING);
+
+                if (i->local) {
+                        r = install_file(AT_FDCWD, i->local,
+                                         AT_FDCWD, NULL,
+                                         ((i->flags & PULL_READ_ONLY) && i->offset == UINT64_MAX ? INSTALL_READ_ONLY : 0) |
+                                         (i->flags & PULL_SYNC ? INSTALL_FSYNC_FULL : 0));
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to finalize raw file to '%s': %m", i->local);
+                                goto finish;
+                        }
+                }
+        } else {
+                r = raw_pull_determine_path(i, ".raw", &i->final_path);
+                if (r < 0)
+                        goto finish;
+
+                if (!i->raw_job->etag_exists) {
+                        /* This is a new download, verify it, and move it into place */
+
+                        assert(i->temp_path);
+                        assert(i->final_path);
+
+                        raw_pull_report_progress(i, RAW_UNPACKING);
+
+                        r = raw_pull_maybe_convert_qcow2(i);
+                        if (r < 0)
+                                goto finish;
+
+                        raw_pull_report_progress(i, RAW_FINALIZING);
+
+                        r = install_file(AT_FDCWD, i->temp_path,
+                                         AT_FDCWD, i->final_path,
+                                         INSTALL_READ_ONLY|
+                                         (i->flags & PULL_SYNC ? INSTALL_FSYNC_FULL : 0));
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to move raw file to '%s': %m", i->final_path);
+                                goto finish;
+                        }
+
+                        i->temp_path = mfree(i->temp_path);
+
+                        if (i->settings_job &&
+                            i->settings_job->error == 0) {
+                                r = raw_pull_rename_auxiliary_file(i, ".nspawn", &i->settings_temp_path, &i->settings_path);
+                                if (r < 0)
+                                        goto finish;
+                        }
+
+                        if (i->roothash_job &&
+                            i->roothash_job->error == 0) {
+                                r = raw_pull_rename_auxiliary_file(i, ".roothash", &i->roothash_temp_path, &i->roothash_path);
+                                if (r < 0)
+                                        goto finish;
+                        }
+
+                        if (i->roothash_signature_job &&
+                            i->roothash_signature_job->error == 0) {
+                                r = raw_pull_rename_auxiliary_file(i, ".roothash.p7s", &i->roothash_signature_temp_path, &i->roothash_signature_path);
+                                if (r < 0)
+                                        goto finish;
+                        }
+
+                        if (i->verity_job &&
+                            i->verity_job->error == 0) {
+                                r = raw_pull_rename_auxiliary_file(i, ".verity", &i->verity_temp_path, &i->verity_path);
+                                if (r < 0)
+                                        goto finish;
+                        }
+                }
+
+                raw_pull_report_progress(i, RAW_COPYING);
+
+                r = raw_pull_make_local_copy(i);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = 0;
+
+finish:
+        if (i->on_finished)
+                i->on_finished(i, r, i->userdata);
+        else
+                sd_event_exit(i->event, r);
+}
+
+/* Common implementation of the on_open_disk callbacks that download into a temporary file below the
+ * image root. If *temp_path is not set yet, a random child name of i->image_root (tagged with 'extra')
+ * is generated and stored there. The file is then created exclusively and its fd is stored in
+ * j->disk_fd. Must not be used in PULL_DIRECT mode. Returns 0 on success, a negative error otherwise. */
+static int raw_pull_job_on_open_disk_generic(
+                RawPull *i,
+                PullJob *j,
+                const char *extra,
+                char **temp_path /* input + output */) {
+
+        assert(i);
+        assert(j);
+        assert(extra);
+        assert(temp_path);
+
+        assert(!FLAGS_SET(i->flags, PULL_DIRECT));
+
+        /* Only pick a name if the caller did not hand us one already */
+        if (!*temp_path && tempfn_random_child(i->image_root, extra, temp_path) < 0)
+                return log_oom();
+
+        /* Best effort: if this fails, the open() below reports the problem */
+        (void) mkdir_parents_label(*temp_path, 0700);
+
+        j->disk_fd = open(*temp_path, O_RDWR|O_CREAT|O_EXCL|O_NOCTTY|O_CLOEXEC, 0664);
+        if (j->disk_fd >= 0)
+                return 0;
+
+        return log_error_errno(errno, "Failed to create %s: %m", *temp_path);
+}
+
+/* on_open_disk callback of the main raw image job: decides where the downloaded image data is written.
+ * In PULL_DIRECT mode that is stdout (no local name given) or the local file itself; otherwise it is a
+ * temporary file below the image root. Returns 0 on success, a negative error otherwise. */
+static int raw_pull_job_on_open_disk_raw(PullJob *j) {
+        RawPull *pull;
+        int open_flags, r;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        assert(pull->raw_job == j);
+        assert(j->disk_fd < 0);
+
+        if (!(pull->flags & PULL_DIRECT)) {
+                /* Regular mode: download into a temporary file first */
+                r = raw_pull_job_on_open_disk_generic(pull, j, "raw", &pull->temp_path);
+                if (r < 0)
+                        return r;
+
+                assert(pull->offset == UINT64_MAX);
+                (void) import_set_nocow_and_log(j->disk_fd, pull->temp_path);
+
+                return 0;
+        }
+
+        if (!pull->local) {
+                /* If no local name specified, the pull job will write its data to stdout */
+                j->disk_fd = STDOUT_FILENO;
+                j->close_disk_fd = false;
+                return 0;
+        }
+
+        (void) mkdir_parents_label(pull->local, 0700);
+
+        /* Without an explicit offset the file is created/truncated; with one, it is opened as is
+         * (no O_CREAT, hence it has to exist already). */
+        open_flags = O_RDWR|O_NOCTTY|O_CLOEXEC;
+        if (pull->offset == UINT64_MAX)
+                open_flags |= O_TRUNC|O_CREAT;
+
+        j->disk_fd = open(pull->local, open_flags, 0664);
+        if (j->disk_fd < 0)
+                return log_error_errno(errno, "Failed to open destination '%s': %m", pull->local);
+
+        if (pull->offset == UINT64_MAX)
+                (void) import_set_nocow_and_log(j->disk_fd, pull->local);
+
+        return 0;
+}
+
+/* on_open_disk callback of the settings job: opens a temporary file for the downloaded settings data,
+ * remembering its name in settings_temp_path. */
+static int raw_pull_job_on_open_disk_settings(PullJob *j) {
+        RawPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        assert(pull->settings_job == j);
+
+        return raw_pull_job_on_open_disk_generic(pull, j, "settings", &pull->settings_temp_path);
+}
+
+/* on_open_disk callback of the roothash job: opens a temporary file for the downloaded root hash,
+ * remembering its name in roothash_temp_path. */
+static int raw_pull_job_on_open_disk_roothash(PullJob *j) {
+        RawPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        assert(pull->roothash_job == j);
+
+        return raw_pull_job_on_open_disk_generic(pull, j, "roothash", &pull->roothash_temp_path);
+}
+
+/* on_open_disk callback of the roothash signature job: opens a temporary file for the downloaded
+ * signature, remembering its name in roothash_signature_temp_path. */
+static int raw_pull_job_on_open_disk_roothash_signature(PullJob *j) {
+        RawPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        assert(pull->roothash_signature_job == j);
+
+        return raw_pull_job_on_open_disk_generic(pull, j, "roothash.p7s", &pull->roothash_signature_temp_path);
+}
+
+/* on_open_disk callback of the verity job: opens a temporary file for the downloaded verity data,
+ * remembering its name in verity_temp_path. */
+static int raw_pull_job_on_open_disk_verity(PullJob *j) {
+        RawPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        assert(pull->verity_job == j);
+
+        return raw_pull_job_on_open_disk_generic(pull, j, "verity", &pull->verity_temp_path);
+}
+
+/* on_progress callback installed on every job of a raw pull: any progress of an individual job
+ * triggers a refresh of the combined progress report for the download stage. */
+static void raw_pull_job_on_progress(PullJob *j) {
+        RawPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        raw_pull_report_progress(pull, RAW_DOWNLOADING);
+}
+
+/* Starts downloading a raw disk image.
+ *
+ * i        - pull object; only one download may be started on it (-EBUSY if a raw job already exists).
+ * url      - source; must pass http_url_is_valid() or file_url_is_valid(), otherwise -EINVAL.
+ * local    - optional local name, checked with pull_validate_local() (-EINVAL on failure).
+ * offset   - UINT64_MAX for "none"; any other value is only allowed together with PULL_DIRECT.
+ * size_max - UINT64_MAX for "no limit"; otherwise becomes the raw job's uncompressed_max.
+ * flags    - subset of PULL_FLAGS_MASK_RAW; the auxiliary file flags (PULL_SETTINGS, PULL_ROOTHASH,
+ *            PULL_ROOTHASH_SIGNATURE, PULL_VERITY) may be combined neither with PULL_DIRECT nor with an
+ *            explicit checksum.
+ * verify   - verification mode; must be negative (_IMPORT_VERIFY_INVALID) if a checksum is given.
+ * checksum - optional expected checksum of the image.
+ *
+ * Returns 0 once all jobs have been started, a negative error otherwise. */
+int raw_pull_start(
+                RawPull *i,
+                const char *url,
+                const char *local,
+                uint64_t offset,
+                uint64_t size_max,
+                PullFlags flags,
+                ImportVerify verify,
+                const char *checksum) {
+
+        PullJob *j;
+        int r;
+
+        assert(i);
+        assert(url);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify < _IMPORT_VERIFY_MAX);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify >= 0);
+        assert((verify < 0) || !checksum);
+        assert(!(flags & ~PULL_FLAGS_MASK_RAW));
+        assert(offset == UINT64_MAX || FLAGS_SET(flags, PULL_DIRECT));
+        assert(!(flags & (PULL_SETTINGS|PULL_ROOTHASH|PULL_ROOTHASH_SIGNATURE|PULL_VERITY)) || !(flags & PULL_DIRECT));
+        assert(!(flags & (PULL_SETTINGS|PULL_ROOTHASH|PULL_ROOTHASH_SIGNATURE|PULL_VERITY)) || !checksum);
+
+        if (!http_url_is_valid(url) && !file_url_is_valid(url))
+                return -EINVAL;
+
+        if (local && !pull_validate_local(local, flags))
+                return -EINVAL;
+
+        /* A raw job that already exists means a download was started on this object before */
+        if (i->raw_job)
+                return -EBUSY;
+
+        /* Keep private copies of the strings; the callbacks use them after this function returned */
+        r = free_and_strdup(&i->local, local);
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&i->checksum, checksum);
+        if (r < 0)
+                return r;
+
+        i->flags = flags;
+        i->verify = verify;
+
+        /* Queue job for the image itself */
+        r = pull_job_new(&i->raw_job, url, i->glue, i);
+        if (r < 0)
+                return r;
+
+        i->raw_job->on_finished = raw_pull_job_on_finished;
+        i->raw_job->on_open_disk = raw_pull_job_on_open_disk_raw;
+
+        if (checksum)
+                i->raw_job->calc_checksum = true;
+        else if (verify != IMPORT_VERIFY_NO) {
+                /* Calculate checksum of the main download unless the users asks for a SHA256SUM file or its
+                 * signature, which we let gpg verify instead. */
+
+                r = pull_url_needs_checksum(url);
+                if (r < 0)
+                        return r;
+
+                i->raw_job->calc_checksum = r;
+                i->raw_job->force_memory = true; /* make sure this is both written to disk if that's
+                                                  * requested and into memory, since we need to verify it */
+        }
+
+        /* UINT64_MAX means "not specified" for both values, in which case the job's defaults stay */
+        if (size_max != UINT64_MAX)
+                i->raw_job->uncompressed_max = size_max;
+        if (offset != UINT64_MAX)
+                i->raw_job->offset = i->offset = offset;
+
+        /* Old etags are only looked up below the image root when not in direct mode */
+        if (!FLAGS_SET(flags, PULL_DIRECT)) {
+                r = pull_find_old_etags(url, i->image_root, DT_REG, ".raw-", ".raw", &i->raw_job->old_etags);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Set up the checksum/signature jobs as needed for the selected verification mode */
+        r = pull_make_verification_jobs(
+                        &i->checksum_job,
+                        &i->signature_job,
+                        verify,
+                        i->checksum,
+                        url,
+                        i->glue,
+                        raw_pull_job_on_finished,
+                        i);
+        if (r < 0)
+                return r;
+
+        /* One optional auxiliary job per requested side file; they all share the finish handler */
+        if (FLAGS_SET(flags, PULL_SETTINGS)) {
+                r = pull_make_auxiliary_job(
+                                &i->settings_job,
+                                url,
+                                raw_strip_suffixes,
+                                ".nspawn",
+                                verify,
+                                i->glue,
+                                raw_pull_job_on_open_disk_settings,
+                                raw_pull_job_on_finished,
+                                i);
+                if (r < 0)
+                        return r;
+        }
+
+        if (FLAGS_SET(flags, PULL_ROOTHASH)) {
+                r = pull_make_auxiliary_job(
+                                &i->roothash_job,
+                                url,
+                                raw_strip_suffixes,
+                                ".roothash",
+                                verify,
+                                i->glue,
+                                raw_pull_job_on_open_disk_roothash,
+                                raw_pull_job_on_finished,
+                                i);
+                if (r < 0)
+                        return r;
+        }
+
+        if (FLAGS_SET(flags, PULL_ROOTHASH_SIGNATURE)) {
+                r = pull_make_auxiliary_job(
+                                &i->roothash_signature_job,
+                                url,
+                                raw_strip_suffixes,
+                                ".roothash.p7s",
+                                verify,
+                                i->glue,
+                                raw_pull_job_on_open_disk_roothash_signature,
+                                raw_pull_job_on_finished,
+                                i);
+                if (r < 0)
+                        return r;
+        }
+
+        if (FLAGS_SET(flags, PULL_VERITY)) {
+                r = pull_make_auxiliary_job(
+                                &i->verity_job,
+                                url,
+                                raw_strip_suffixes,
+                                ".verity",
+                                verify,
+                                i->glue,
+                                raw_pull_job_on_open_disk_verity,
+                                raw_pull_job_on_finished,
+                                i);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Finally start every job that was set up above; jobs that were not requested are NULL */
+        FOREACH_POINTER(j,
+                        i->raw_job,
+                        i->checksum_job,
+                        i->signature_job,
+                        i->settings_job,
+                        i->roothash_job,
+                        i->roothash_signature_job,
+                        i->verity_job) {
+
+                if (!j)
+                        continue;
+
+                j->on_progress = raw_pull_job_on_progress;
+                j->sync = FLAGS_SET(flags, PULL_SYNC);
+
+                r = pull_job_begin(j);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
diff --git a/src/import/pull-raw.h b/src/import/pull-raw.h
new file mode 100644
index 0000000..b39e4e2
--- /dev/null
+++ b/src/import/pull-raw.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "import-util.h"
+#include "macro.h"
+#include "pull-common.h"
+
+/* State object of a raw disk image download. Only declared here; the struct is defined elsewhere. */
+typedef struct RawPull RawPull;
+
+/* Completion callback of a raw pull. 'error' is 0 on success or a negative error code; 'userdata' is
+ * presumably the pointer that was handed to raw_pull_new() (confirm against its implementation). */
+typedef void (*RawPullFinished)(RawPull *pull, int error, void *userdata);
+
+/* Allocates a new pull object in *pull and releases it again, respectively. */
+int raw_pull_new(RawPull **pull, sd_event *event, const char *image_root, RawPullFinished on_finished, void *userdata);
+RawPull* raw_pull_unref(RawPull *pull);
+
+/* NOTE(review): expected to provide the raw_pull_unrefp() helper for use with _cleanup_() - confirm
+ * against the macro definition. */
+DEFINE_TRIVIAL_CLEANUP_FUNC(RawPull*, raw_pull_unref);
+
+/* Starts the download of 'url'. 'offset' and 'size_max' take UINT64_MAX for "not specified". Returns 0
+ * on success, a negative error code otherwise. */
+int raw_pull_start(RawPull *pull, const char *url, const char *local, uint64_t offset, uint64_t size_max, PullFlags flags, ImportVerify verify, const char *checksum);
diff --git a/src/import/pull-tar.c b/src/import/pull-tar.c
new file mode 100644
index 0000000..c32fc29
--- /dev/null
+++ b/src/import/pull-tar.c
@@ -0,0 +1,677 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+
+#include "sd-daemon.h"
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "copy.h"
+#include "curl-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "import-util.h"
+#include "install-file.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "pull-common.h"
+#include "pull-job.h"
+#include "pull-tar.h"
+#include "rm-rf.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+#include "utf8.h"
+#include "web-util.h"
+
+/* Stages of a tar pull, as understood by tar_pull_report_progress(), which maps each stage to an
+ * overall percentage. */
+typedef enum TarProgress {
+        TAR_DOWNLOADING,        /* 0..85%, derived from the progress of the individual jobs */
+        TAR_VERIFYING,          /* reported as 85% */
+        TAR_FINALIZING,         /* reported as 90% */
+        TAR_COPYING,            /* reported as 95% */
+} TarProgress;
+
+/* State of one tar image download, created by tar_pull_new() and released by tar_pull_unref(). */
+struct TarPull {
+        sd_event *event;                /* event loop, referenced in tar_pull_new() */
+        CurlGlue *glue;                 /* curl glue object all jobs are created with */
+
+        PullFlags flags;                /* PULL_* flags passed to tar_pull_start() */
+        ImportVerify verify;            /* verification mode passed to tar_pull_start() */
+        char *image_root;               /* image directory; defaults to /var/lib/machines */
+
+        /* The individual download jobs. Only tar_job always exists once the pull was started. */
+        PullJob *tar_job;
+        PullJob *checksum_job;
+        PullJob *signature_job;
+        PullJob *settings_job;
+
+        TarPullFinished on_finished;    /* completion callback; if unset the event loop is exited instead */
+        void *userdata;                 /* passed back to on_finished */
+
+        char *local;                    /* copy of the local name passed to tar_pull_start(), may be NULL */
+
+        pid_t tar_pid;                  /* tar child process forked when the tar job opens its disk fd */
+
+        char *final_path;               /* final location of the downloaded tree below image_root */
+        char *temp_path;                /* temporary extraction directory, removed on unref if still set */
+
+        char *settings_path;            /* final location of the downloaded .nspawn file */
+        char *settings_temp_path;       /* temporary settings file, unlinked on unref if still set */
+
+        char *checksum;                 /* copy of the checksum passed to tar_pull_start(), may be NULL */
+};
+
+/* Destroys a TarPull object: kills a still running tar child, drops all jobs and references, removes
+ * leftover temporary files and frees all memory. Accepts NULL. Always returns NULL. */
+TarPull* tar_pull_unref(TarPull *i) {
+        if (i) {
+                /* Get rid of the tar child first, if there still is one */
+                if (i->tar_pid > 1)
+                        sigkill_wait(i->tar_pid);
+
+                pull_job_unref(i->tar_job);
+                pull_job_unref(i->checksum_job);
+                pull_job_unref(i->signature_job);
+                pull_job_unref(i->settings_job);
+
+                curl_glue_unref(i->glue);
+                sd_event_unref(i->event);
+
+                /* Temporary artifacts that were never moved into place are removed from disk */
+                rm_rf_subvolume_and_free(i->temp_path);
+                unlink_and_free(i->settings_temp_path);
+
+                /* Plain strings */
+                free(i->checksum);
+                free(i->local);
+                free(i->image_root);
+                free(i->settings_path);
+                free(i->final_path);
+
+                free(i);
+        }
+
+        return NULL;
+}
+
+/* Allocates a new TarPull object and returns it in *ret.
+ *
+ * event       - event loop to use; if NULL the default event loop is acquired.
+ * image_root  - image directory; if NULL "/var/lib/machines" is used.
+ * on_finished - completion callback, stored together with userdata.
+ *
+ * Returns 0 on success, -ENOMEM or the error of the failing setup call otherwise. */
+int tar_pull_new(
+                TarPull **ret,
+                sd_event *event,
+                const char *image_root,
+                TarPullFinished on_finished,
+                void *userdata) {
+
+        _cleanup_(curl_glue_unrefp) CurlGlue *glue = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *ev = NULL;
+        _cleanup_(tar_pull_unrefp) TarPull *pull = NULL;
+        _cleanup_free_ char *root_copy = NULL;
+        int r;
+
+        assert(ret);
+
+        root_copy = strdup(image_root ? image_root : "/var/lib/machines");
+        if (!root_copy)
+                return -ENOMEM;
+
+        if (!event) {
+                r = sd_event_default(&ev);
+                if (r < 0)
+                        return r;
+        } else
+                ev = sd_event_ref(event);
+
+        r = curl_glue_new(&glue, ev);
+        if (r < 0)
+                return r;
+
+        pull = new(TarPull, 1);
+        if (!pull)
+                return -ENOMEM;
+
+        /* Ownership of the prepared pieces moves into the object, all other fields start out zeroed */
+        *pull = (TarPull) {
+                .on_finished = on_finished,
+                .userdata = userdata,
+                .image_root = TAKE_PTR(root_copy),
+                .event = TAKE_PTR(ev),
+                .glue = TAKE_PTR(glue),
+        };
+
+        /* Route curl completion notifications to the generic pull job handler */
+        pull->glue->on_finished = pull_job_curl_on_finished;
+        pull->glue->userdata = pull;
+
+        *ret = TAKE_PTR(pull);
+
+        return 0;
+}
+
+/* Translates the current stage of the pull into an overall percentage and publishes it through
+ * sd_notifyf() and the debug log. */
+static void tar_pull_report_progress(TarPull *i, TarProgress p) {
+        unsigned percent;
+
+        assert(i);
+
+        switch (p) {
+
+        case TAR_DOWNLOADING: {
+                /* The download stage covers 0..85%. Every auxiliary job that exists gets a 5% slice,
+                 * the tar download itself gets whatever remains. */
+                unsigned tar_share = 85;
+                PullJob *aux;
+
+                percent = 0;
+
+                FOREACH_POINTER(aux,
+                                i->checksum_job,
+                                i->signature_job,
+                                i->settings_job) {
+
+                        if (!aux)
+                                continue;
+
+                        percent += aux->progress_percent * 5 / 100;
+                        tar_share -= 5;
+                }
+
+                if (i->tar_job)
+                        percent += i->tar_job->progress_percent * tar_share / 100;
+
+                break;
+        }
+
+        /* The remaining stages are reported as fixed values */
+        case TAR_VERIFYING:
+                percent = 85;
+                break;
+
+        case TAR_FINALIZING:
+                percent = 90;
+                break;
+
+        case TAR_COPYING:
+                percent = 95;
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        sd_notifyf(false, "X_IMPORT_PROGRESS=%u%%", percent);
+        log_debug("Combined progress %u%%", percent);
+}
+
+/* Makes sure *field contains the path derived from the tar job's URL and etag (plus 'suffix') below
+ * the image root. Returns 0 if *field was set already and is left untouched, 1 if it was generated
+ * now, and a negative error if generating it failed. */
+static int tar_pull_determine_path(
+                TarPull *i,
+                const char *suffix,
+                char **field /* input + output (!) */) {
+
+        assert(i);
+        assert(field);
+
+        if (*field)
+                return 0; /* already known, keep it */
+
+        assert(i->tar_job);
+
+        if (pull_make_path(i->tar_job->url, i->tar_job->etag, i->image_root, ".tar-", suffix, field) < 0)
+                return log_oom();
+
+        return 1;
+}
+
+/* Creates the local image (and optionally the local settings file) from the downloaded tree at
+ * i->final_path. Does nothing if no local name was requested. Failures to copy the settings file are
+ * only logged. Returns 0 on success, a negative error otherwise. */
+static int tar_pull_make_local_copy(TarPull *i) {
+        _cleanup_(rm_rf_subvolume_and_freep) char *staging = NULL;
+        const char *target, *local_settings;
+        int r;
+
+        assert(i);
+        assert(i->tar_job);
+
+        if (!i->local)
+                return 0;
+
+        assert(i->final_path);
+
+        target = prefix_roota(i->image_root, i->local);
+
+        /* Build the copy under a temporary name next to the target, then install it in one step */
+        r = tempfn_random(target, NULL, &staging);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate temporary filename for %s: %m", target);
+
+        if (!(i->flags & PULL_BTRFS_SUBVOL))
+                r = copy_tree(i->final_path, staging, UID_INVALID, GID_INVALID, COPY_REFLINK|COPY_HARDLINKS, NULL, NULL);
+        else
+                r = btrfs_subvol_snapshot_at(
+                                AT_FDCWD, i->final_path,
+                                AT_FDCWD, staging,
+                                (i->flags & PULL_BTRFS_QUOTA ? BTRFS_SNAPSHOT_QUOTA : 0)|
+                                BTRFS_SNAPSHOT_FALLBACK_COPY|
+                                BTRFS_SNAPSHOT_FALLBACK_DIRECTORY|
+                                BTRFS_SNAPSHOT_RECURSIVE);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create local image: %m");
+
+        r = install_file(AT_FDCWD, staging,
+                         AT_FDCWD, target,
+                         (i->flags & PULL_FORCE ? INSTALL_REPLACE : 0) |
+                         (i->flags & PULL_READ_ONLY ? INSTALL_READ_ONLY : 0) |
+                         (i->flags & PULL_SYNC ? INSTALL_SYNCFS : 0));
+        if (r < 0)
+                return log_error_errno(r, "Failed to install local image '%s': %m", target);
+
+        /* The staging name is gone now, hence disarm the cleanup handler */
+        staging = mfree(staging);
+
+        log_info("Created new local image '%s'.", i->local);
+
+        if (!FLAGS_SET(i->flags, PULL_SETTINGS))
+                return 0;
+
+        assert(i->settings_job);
+
+        r = tar_pull_determine_path(i, ".nspawn", &i->settings_path);
+        if (r < 0)
+                return r;
+
+        local_settings = strjoina(i->image_root, "/", i->local, ".nspawn");
+
+        r = copy_file_atomic(
+                        i->settings_path,
+                        local_settings,
+                        0664,
+                        COPY_REFLINK |
+                        (FLAGS_SET(i->flags, PULL_FORCE) ? COPY_REPLACE : 0) |
+                        (FLAGS_SET(i->flags, PULL_SYNC) ? COPY_FSYNC_FULL : 0));
+        if (r >= 0)
+                log_info("Created new settings file %s.", local_settings);
+        else if (r == -EEXIST)
+                log_warning_errno(r, "Settings file %s already exists, not replacing.", local_settings);
+        else if (r == -ENOENT)
+                log_debug_errno(r, "Skipping creation of settings file, since none was found.");
+        else
+                log_warning_errno(r, "Failed to copy settings files %s, ignoring: %m", local_settings);
+
+        return 0;
+}
+
+/* Returns true once the tar job and every auxiliary job that exists have completed. */
+static bool tar_pull_is_done(TarPull *i) {
+        assert(i);
+        assert(i->tar_job);
+
+        return PULL_JOB_IS_COMPLETE(i->tar_job) &&
+                (!i->checksum_job || PULL_JOB_IS_COMPLETE(i->checksum_job)) &&
+                (!i->signature_job || PULL_JOB_IS_COMPLETE(i->signature_job)) &&
+                (!i->settings_job || PULL_JOB_IS_COMPLETE(i->settings_job));
+}
+
+/* on_finished callback shared by all jobs of a tar pull (tar, checksum, signature, settings).
+ *
+ * A failed tar or checksum download aborts the pull right away; a failed signature or settings
+ * download is only logged at this point. Once all jobs are complete, this waits for the tar child
+ * process, verifies a fresh download, moves it into place (or finalizes the directly written target in
+ * PULL_DIRECT mode), creates the local copy if requested, and finally reports the result through
+ * i->on_finished, or via sd_event_exit() if no callback is set. */
+static void tar_pull_job_on_finished(PullJob *j) {
+        TarPull *i;
+        int r;
+
+        assert(j);
+        assert(j->userdata);
+
+        i = j->userdata;
+
+        if (j->error != 0) {
+                if (j == i->tar_job) {
+                        if (j->error == ENOMEDIUM) /* HTTP 404 */
+                                r = log_error_errno(j->error, "Failed to retrieve image file. (Wrong URL?)");
+                        else
+                                r = log_error_errno(j->error, "Failed to retrieve image file.");
+                        goto finish;
+                } else if (j == i->checksum_job) {
+                        r = log_error_errno(j->error, "Failed to retrieve SHA256 checksum, cannot verify. (Try --verify=no?)");
+                        goto finish;
+                } else if (j == i->signature_job)
+                        log_debug_errno(j->error, "Signature job for %s failed, proceeding for now.", j->url);
+                else if (j == i->settings_job)
+                        log_info_errno(j->error, "Settings file could not be retrieved, proceeding without.");
+                else
+                        /* Only the four jobs above are ever set up with this callback. Note that
+                         * assert() on a string literal would always succeed, hence use the proper
+                         * macro here. */
+                        assert_not_reached();
+        }
+
+        /* This is invoked if either the download completed successfully, or the download was skipped because
+         * we already have the etag. */
+
+        if (!tar_pull_is_done(i))
+                return;
+
+        if (i->signature_job && i->signature_job->error != 0) {
+                VerificationStyle style;
+
+                assert(i->checksum_job);
+
+                r = verification_style_from_url(i->checksum_job->url, &style);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to determine verification style from checksum URL: %m");
+                        goto finish;
+                }
+
+                if (style == VERIFICATION_PER_DIRECTORY) { /* A failed signature file download only matters
+                                                            * in per-directory verification mode, since only
+                                                            * then the signature is detached, and thus a file
+                                                            * of its own. */
+                        r = log_error_errno(i->signature_job->error,
+                                            "Failed to retrieve signature file, cannot verify. (Try --verify=no?)");
+                        goto finish;
+                }
+        }
+
+        /* Close our ends of the output fds before waiting for the tar child below */
+        pull_job_close_disk_fd(i->tar_job);
+        pull_job_close_disk_fd(i->settings_job);
+
+        if (i->tar_pid > 0) {
+                r = wait_for_terminate_and_check("tar", TAKE_PID(i->tar_pid), WAIT_LOG);
+                if (r < 0)
+                        goto finish;
+                if (r != EXIT_SUCCESS) {
+                        r = -EIO;
+                        goto finish;
+                }
+        }
+
+        if (!i->tar_job->etag_exists) {
+                /* This is a new download, verify it, and move it into place */
+
+                tar_pull_report_progress(i, TAR_VERIFYING);
+
+                r = pull_verify(i->verify,
+                                i->checksum,
+                                i->tar_job,
+                                i->checksum_job,
+                                i->signature_job,
+                                i->settings_job,
+                                /* roothash_job = */ NULL,
+                                /* roothash_signature_job = */ NULL,
+                                /* verity_job = */ NULL);
+                if (r < 0)
+                        goto finish;
+        }
+
+        if (i->flags & PULL_DIRECT) {
+                /* Direct mode: the tree was extracted right into i->local, finalize it in place */
+                assert(!i->settings_job);
+                assert(i->local);
+                assert(!i->temp_path);
+
+                tar_pull_report_progress(i, TAR_FINALIZING);
+
+                r = import_mangle_os_tree(i->local);
+                if (r < 0)
+                        goto finish;
+
+                /* Each conditional needs its own parentheses: '|' binds tighter than '?:', so without
+                 * them the sync flag would be dropped whenever PULL_READ_ONLY is set. */
+                r = install_file(
+                                AT_FDCWD, i->local,
+                                AT_FDCWD, NULL,
+                                (i->flags & PULL_READ_ONLY ? INSTALL_READ_ONLY : 0) |
+                                (i->flags & PULL_SYNC ? INSTALL_SYNCFS : 0));
+                if (r < 0) {
+                        log_error_errno(r, "Failed to finalize '%s': %m", i->local);
+                        goto finish;
+                }
+        } else {
+                r = tar_pull_determine_path(i, NULL, &i->final_path);
+                if (r < 0)
+                        goto finish;
+
+                if (!i->tar_job->etag_exists) {
+                        /* This is a new download (already verified above), move it into place */
+
+                        assert(i->temp_path);
+                        assert(i->final_path);
+
+                        tar_pull_report_progress(i, TAR_FINALIZING);
+
+                        r = import_mangle_os_tree(i->temp_path);
+                        if (r < 0)
+                                goto finish;
+
+                        r = install_file(
+                                        AT_FDCWD, i->temp_path,
+                                        AT_FDCWD, i->final_path,
+                                        INSTALL_READ_ONLY|
+                                        (i->flags & PULL_SYNC ? INSTALL_SYNCFS : 0));
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to rename to final image name to %s: %m", i->final_path);
+                                goto finish;
+                        }
+
+                        /* The temporary path is gone now, make sure tar_pull_unref() won't remove anything */
+                        i->temp_path = mfree(i->temp_path);
+
+                        if (i->settings_job &&
+                            i->settings_job->error == 0) {
+
+                                /* Also move the settings file into place, if it exists. Note that we do so only if we also
+                                 * moved the tar file in place, to keep things strictly in sync. */
+                                assert(i->settings_temp_path);
+
+                                /* Regenerate final name for this auxiliary file, we might know the etag of the file now, and
+                                 * we should incorporate it in the file name if we can */
+                                i->settings_path = mfree(i->settings_path);
+
+                                r = tar_pull_determine_path(i, ".nspawn", &i->settings_path);
+                                if (r < 0)
+                                        goto finish;
+
+                                r = install_file(
+                                                AT_FDCWD, i->settings_temp_path,
+                                                AT_FDCWD, i->settings_path,
+                                                INSTALL_READ_ONLY|
+                                                (i->flags & PULL_SYNC ? INSTALL_FSYNC_FULL : 0));
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to rename settings file to %s: %m", i->settings_path);
+                                        goto finish;
+                                }
+
+                                i->settings_temp_path = mfree(i->settings_temp_path);
+                        }
+                }
+
+                tar_pull_report_progress(i, TAR_COPYING);
+
+                r = tar_pull_make_local_copy(i);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = 0;
+
+finish:
+        /* Report the overall result: to the owner's callback if there is one, otherwise by exiting
+         * the event loop with it. */
+        if (i->on_finished)
+                i->on_finished(i, r, i->userdata);
+        else
+                sd_event_exit(i->event, r);
+}
+
+/* on_open_disk callback of the tar job: prepares the target directory (or btrfs subvolume) and forks
+ * off a tar process via import_fork_tar_x() that extracts into it. The fd returned by that call
+ * becomes the job's disk_fd, the child's PID is stored in i->tar_pid. Returns 0 on success, a negative
+ * error otherwise. */
+static int tar_pull_job_on_open_disk_tar(PullJob *j) {
+        const char *where;
+        TarPull *i;
+        int r;
+
+        assert(j);
+        assert(j->userdata);
+
+        i = j->userdata;
+        assert(i->tar_job == j);
+        assert(i->tar_pid <= 0);
+
+        /* Direct mode extracts right into the local target, otherwise into a temporary directory
+         * below the image root (whose name is generated only once). */
+        if (i->flags & PULL_DIRECT)
+                where = i->local;
+        else {
+                if (!i->temp_path) {
+                        r = tempfn_random_child(i->image_root, "tar", &i->temp_path);
+                        if (r < 0)
+                                return log_oom();
+                }
+
+                where = i->temp_path;
+        }
+
+        (void) mkdir_parents_label(where, 0700);
+
+        /* NOTE(review): FLAGS_SET() is given two flags here, presumably requiring both direct mode and
+         * force before an existing target is removed - confirm against the macro definition. */
+        if (FLAGS_SET(i->flags, PULL_DIRECT|PULL_FORCE))
+                (void) rm_rf(where, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
+
+        if (i->flags & PULL_BTRFS_SUBVOL)
+                r = btrfs_subvol_make_fallback(AT_FDCWD, where, 0755);
+        else
+                r = RET_NERRNO(mkdir(where, 0755));
+        if (r == -EEXIST && (i->flags & PULL_DIRECT)) /* EEXIST is OK if in direct mode, but not otherwise,
+                                                       * because in that case our temporary path collided */
+                r = 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to create directory/subvolume %s: %m", where);
+        if (r > 0 && (i->flags & PULL_BTRFS_QUOTA)) { /* actually btrfs subvol */
+                if (!(i->flags & PULL_DIRECT))
+                        (void) import_assign_pool_quota_and_warn(i->image_root);
+                (void) import_assign_pool_quota_and_warn(where);
+        }
+
+        /* A negative value is stored in disk_fd on failure, and returned as the error */
+        j->disk_fd = import_fork_tar_x(where, &i->tar_pid);
+        if (j->disk_fd < 0)
+                return j->disk_fd;
+
+        return 0;
+}
+
+/* on_open_disk callback of the settings job: creates a temporary file below the image root for the
+ * downloaded settings data and stores its fd in j->disk_fd. The name is generated only once and kept
+ * in settings_temp_path. Returns 0 on success, a negative error otherwise. */
+static int tar_pull_job_on_open_disk_settings(PullJob *j) {
+        TarPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        assert(pull->settings_job == j);
+
+        if (!pull->settings_temp_path &&
+            tempfn_random_child(pull->image_root, "settings", &pull->settings_temp_path) < 0)
+                return log_oom();
+
+        /* Best effort: if this fails, the open() below reports the problem */
+        (void) mkdir_parents_label(pull->settings_temp_path, 0700);
+
+        j->disk_fd = open(pull->settings_temp_path, O_RDWR|O_CREAT|O_EXCL|O_NOCTTY|O_CLOEXEC, 0664);
+        if (j->disk_fd >= 0)
+                return 0;
+
+        return log_error_errno(errno, "Failed to create %s: %m", pull->settings_temp_path);
+}
+
+/* on_progress callback installed on every job of a tar pull: any progress of an individual job
+ * triggers a refresh of the combined progress report for the download stage. */
+static void tar_pull_job_on_progress(PullJob *j) {
+        TarPull *pull;
+
+        assert(j);
+        assert(j->userdata);
+
+        pull = j->userdata;
+        tar_pull_report_progress(pull, TAR_DOWNLOADING);
+}
+
+/* Starts downloading a tar image.
+ *
+ * i        - pull object; only one download may be started on it (-EBUSY if a tar job already exists).
+ * url      - source; must pass http_url_is_valid() or file_url_is_valid(), otherwise -EINVAL.
+ * local    - optional local name, checked with pull_validate_local() (-EINVAL on failure).
+ * flags    - subset of PULL_FLAGS_MASK_TAR; PULL_SETTINGS may be combined neither with PULL_DIRECT
+ *            nor with an explicit checksum.
+ * verify   - verification mode; must be negative (_IMPORT_VERIFY_INVALID) if a checksum is given.
+ * checksum - optional expected checksum of the image.
+ *
+ * Returns 0 once all jobs have been started, a negative error otherwise. */
+int tar_pull_start(
+                TarPull *i,
+                const char *url,
+                const char *local,
+                PullFlags flags,
+                ImportVerify verify,
+                const char *checksum) {
+
+        PullJob *j;
+        int r;
+
+        assert(i);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify < _IMPORT_VERIFY_MAX);
+        assert(verify == _IMPORT_VERIFY_INVALID || verify >= 0);
+        assert((verify < 0) || !checksum);
+        assert(!(flags & ~PULL_FLAGS_MASK_TAR));
+        assert(!(flags & PULL_SETTINGS) || !(flags & PULL_DIRECT));
+        assert(!(flags & PULL_SETTINGS) || !checksum);
+
+        if (!http_url_is_valid(url) && !file_url_is_valid(url))
+                return -EINVAL;
+
+        if (local && !pull_validate_local(local, flags))
+                return -EINVAL;
+
+        /* A tar job that already exists means a download was started on this object before */
+        if (i->tar_job)
+                return -EBUSY;
+
+        /* Keep private copies of the strings; the callbacks use them after this function returned */
+        r = free_and_strdup(&i->local, local);
+        if (r < 0)
+                return r;
+
+        r = free_and_strdup(&i->checksum, checksum);
+        if (r < 0)
+                return r;
+
+        i->flags = flags;
+        i->verify = verify;
+
+        /* Set up download job for TAR file */
+        r = pull_job_new(&i->tar_job, url, i->glue, i);
+        if (r < 0)
+                return r;
+
+        i->tar_job->on_finished = tar_pull_job_on_finished;
+        i->tar_job->on_open_disk = tar_pull_job_on_open_disk_tar;
+        /* Hash the download if either an explicit checksum or checksum/signature verification is requested */
+        i->tar_job->calc_checksum = checksum || IN_SET(verify, IMPORT_VERIFY_CHECKSUM, IMPORT_VERIFY_SIGNATURE);
+
+        /* Old etags are only looked up below the image root when not in direct mode */
+        if (!FLAGS_SET(flags, PULL_DIRECT)) {
+                r = pull_find_old_etags(url, i->image_root, DT_DIR, ".tar-", NULL, &i->tar_job->old_etags);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Set up download of checksum/signature files */
+        r = pull_make_verification_jobs(
+                        &i->checksum_job,
+                        &i->signature_job,
+                        verify,
+                        checksum,
+                        url,
+                        i->glue,
+                        tar_pull_job_on_finished,
+                        i);
+        if (r < 0)
+                return r;
+
+        /* Set up download job for the settings file (.nspawn) */
+        if (FLAGS_SET(flags, PULL_SETTINGS)) {
+                r = pull_make_auxiliary_job(
+                                &i->settings_job,
+                                url,
+                                tar_strip_suffixes,
+                                ".nspawn",
+                                verify,
+                                i->glue,
+                                tar_pull_job_on_open_disk_settings,
+                                tar_pull_job_on_finished,
+                                i);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Finally start every job that was set up above; jobs that were not requested are NULL */
+        FOREACH_POINTER(j,
+                        i->tar_job,
+                        i->checksum_job,
+                        i->signature_job,
+                        i->settings_job) {
+
+                if (!j)
+                        continue;
+
+                j->on_progress = tar_pull_job_on_progress;
+                j->sync = FLAGS_SET(flags, PULL_SYNC);
+
+                r = pull_job_begin(j);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
diff --git a/src/import/pull-tar.h b/src/import/pull-tar.h
new file mode 100644
index 0000000..e54c01c
--- /dev/null
+++ b/src/import/pull-tar.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "import-util.h"
+#include "macro.h"
+#include "pull-common.h"
+
+/* Opaque handle for one tar image pull operation; the structure itself is not defined in this header. */
+typedef struct TarPull TarPull;
+
+/* Signature of the completion callback: receives the pull object, an error value and the userdata
+ * pointer that was passed to tar_pull_new(). */
+typedef void (*TarPullFinished)(TarPull *pull, int error, void *userdata);
+
+/* Allocates a pull object in *pull / drops a reference to it. */
+int tar_pull_new(TarPull **pull, sd_event *event, const char *image_root, TarPullFinished on_finished, void *userdata);
+TarPull* tar_pull_unref(TarPull *pull);
+
+/* Provides tar_pull_unrefp() for use with _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC(TarPull*, tar_pull_unref);
+
+/* Begins downloading the tar image at 'url'; 'checksum' is an optional literal checksum string. */
+int tar_pull_start(TarPull *pull, const char *url, const char *local, PullFlags flags, ImportVerify verify, const char *checksum);
diff --git a/src/import/pull.c b/src/import/pull.c
new file mode 100644
index 0000000..38821b5
--- /dev/null
+++ b/src/import/pull.c
@@ -0,0 +1,556 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <locale.h>
+
+#include "sd-event.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "discover-image.h"
+#include "env-util.h"
+#include "hexdecoct.h"
+#include "hostname-util.h"
+#include "import-common.h"
+#include "import-util.h"
+#include "io-util.h"
+#include "main-func.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "pull-raw.h"
+#include "pull-tar.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "verbs.h"
+#include "web-util.h"
+
+/* Settings with their defaults; parse_argv() and, for some of the flags, parse_env() override them. */
+static const char *arg_image_root = "/var/lib/machines";
+static ImportVerify arg_verify = IMPORT_VERIFY_SIGNATURE;
+static PullFlags arg_pull_flags = PULL_SETTINGS | PULL_ROOTHASH | PULL_ROOTHASH_SIGNATURE | PULL_VERITY | PULL_BTRFS_SUBVOL | PULL_BTRFS_QUOTA | PULL_CONVERT_QCOW2 | PULL_SYNC;
+/* UINT64_MAX means "not specified" for both of these */
+static uint64_t arg_offset = UINT64_MAX, arg_size_max = UINT64_MAX;
+/* Literal SHA256 hash passed via --verify=, stored as lowercase hex; NULL if none was given */
+static char *arg_checksum = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_checksum, freep);
+
+/* Validates the local target of a pull and returns a newly allocated copy of it in *ret (NULL if there
+ * is no local target).
+ *
+ * In direct mode the target is a file path, with relative paths being joined onto the image root. In
+ * regular mode it is an image name that has to pass hostname validation and, unless PULL_FORCE is set,
+ * must not name an image that image_find() already locates.
+ *
+ * Logs what is about to be pulled. Returns 0 on success or a negative errno-style value. */
+static int normalize_local(const char *local, const char *url, char **ret) {
+        _cleanup_free_ char *target = NULL;
+        int r;
+
+        if (arg_pull_flags & PULL_DIRECT) {
+
+                if (local) {
+                        /* Relative paths are interpreted relative to the image root */
+                        if (!path_is_absolute(local)) {
+                                target = path_join(arg_image_root, local);
+                                if (!target)
+                                        return log_oom();
+
+                                local = target;
+                        }
+
+                        if (!path_is_valid(local))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Local path name '%s' is not valid.", local);
+                } else
+                        log_debug("Writing downloaded data to STDOUT.");
+
+        } else if (local) {
+
+                if (!hostname_is_valid(local, 0))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Local image name '%s' is not valid.",
+                                               local);
+
+                if (!FLAGS_SET(arg_pull_flags, PULL_FORCE)) {
+                        /* Without --force, refuse to replace an image that is already there */
+                        r = image_find(IMAGE_MACHINE, local, NULL, NULL);
+                        if (r >= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                                       "Image '%s' already exists.",
+                                                       local);
+                        if (r != -ENOENT)
+                                return log_error_errno(r, "Failed to check whether image '%s' exists: %m", local);
+                }
+        }
+
+        /* Hand out an allocated copy, unless the path join above already created one */
+        if (local && !target) {
+                target = strdup(local);
+                if (!target)
+                        return log_oom();
+        }
+
+        if (!target)
+                log_info("Pulling '%s'.", url);
+        else if (arg_offset == UINT64_MAX)
+                log_info("Pulling '%s', saving as '%s'.", url, target);
+        else
+                log_info("Pulling '%s', saving at offset %" PRIu64 " in '%s'.", url, arg_offset, target);
+
+        *ret = TAKE_PTR(target);
+        return 0;
+}
+
+/* Completion callback for tar pulls: announces success, then asks the event loop passed in as userdata
+ * to exit, using the absolute value of the error as exit code. */
+static void on_tar_finished(TarPull *pull, int error, void *userdata) {
+        sd_event *loop = userdata;
+
+        assert(pull);
+
+        if (!error)
+                log_info("Operation completed successfully.");
+
+        sd_event_exit(loop, abs(error));
+}
+
+/* Implements the "tar" verb: downloads the tar image at the URL in argv[1] and stores it under the
+ * local name given in argv[2], or under a name derived from the URL if there is no second argument.
+ *
+ * Returns a negative errno-style value if setup fails; otherwise the negated result of the event loop. */
+static int pull_tar(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *ll = NULL, *normalized = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(tar_pull_unrefp) TarPull *pull = NULL;
+        const char *url, *local;
+        int r;
+
+        url = argv[1];
+        if (!http_url_is_valid(url) && !file_url_is_valid(url))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "URL '%s' is not valid.", url);
+
+        if (argc >= 3)
+                local = empty_or_dash_to_null(argv[2]);
+        else {
+                _cleanup_free_ char *l = NULL;
+
+                /* No local name given: use the final URL component with the tar suffixes stripped */
+                r = import_url_last_component(url, &l);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get final component of URL: %m");
+
+                r = tar_strip_suffixes(l, &ll);
+                if (r < 0)
+                        return log_oom();
+
+                local = ll;
+        }
+
+        /* Unlike raw images, tar images always need a local target */
+        if (!local && FLAGS_SET(arg_pull_flags, PULL_DIRECT))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Pulling tar images to STDOUT is not supported.");
+
+        r = normalize_local(local, url, &normalized);
+        if (r < 0)
+                return r;
+
+        r = import_allocate_event_with_signals(&event);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(arg_pull_flags, PULL_SYNC))
+                log_info("File system synchronization on completion is off.");
+
+        /* The event loop doubles as userdata so that on_tar_finished() can terminate it */
+        r = tar_pull_new(&pull, event, arg_image_root, on_tar_finished, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate puller: %m");
+
+        r = tar_pull_start(
+                        pull,
+                        url,
+                        normalized,
+                        arg_pull_flags & PULL_FLAGS_MASK_TAR,
+                        arg_verify,
+                        arg_checksum);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pull image: %m");
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        /* on_tar_finished() passes abs(error) to sd_event_exit(); negate the loop result so that the
+         * caller sees 0 or a negative value again */
+        log_info("Exiting.");
+        return -r;
+}
+
+static void on_raw_finished(RawPull *pull, int error, void *userdata) {
+        sd_event *event = userdata;
+        assert(pull);
+
+        if (error == 0)
+                log_info("Operation completed successfully.");
+
+        sd_event_exit(event, abs(error));
+}
+
+/* Implements the "raw" verb: downloads the raw disk image at the URL in argv[1]. The local target is
+ * taken from argv[2] if present and otherwise derived from the URL.
+ *
+ * Returns a negative errno-style value if setup fails; otherwise the negated result of the event loop. */
+static int pull_raw(int argc, char *argv[], void *userdata) {
+        _cleanup_free_ char *stripped = NULL, *normalized = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(raw_pull_unrefp) RawPull *pull = NULL;
+        const char *url, *local;
+        int r;
+
+        url = argv[1];
+        if (!http_url_is_valid(url) && !file_url_is_valid(url))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "URL '%s' is not valid.", url);
+
+        if (argc < 3) {
+                _cleanup_free_ char *last = NULL;
+
+                /* No local name given: use the final URL component with the raw suffixes stripped */
+                r = import_url_last_component(url, &last);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get final component of URL: %m");
+
+                r = raw_strip_suffixes(last, &stripped);
+                if (r < 0)
+                        return log_oom();
+
+                local = stripped;
+        } else
+                local = empty_or_dash_to_null(argv[2]);
+
+        r = normalize_local(local, url, &normalized);
+        if (r < 0)
+                return r;
+
+        r = import_allocate_event_with_signals(&event);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(arg_pull_flags, PULL_SYNC))
+                log_info("File system synchronization on completion is off.");
+
+        /* The event loop doubles as userdata so that on_raw_finished() can terminate it */
+        r = raw_pull_new(&pull, event, arg_image_root, on_raw_finished, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate puller: %m");
+
+        r = raw_pull_start(
+                        pull,
+                        url,
+                        normalized,
+                        arg_offset,
+                        arg_size_max,
+                        arg_pull_flags & PULL_FLAGS_MASK_RAW,
+                        arg_verify,
+                        arg_checksum);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pull image: %m");
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        log_info("Exiting.");
+
+        /* on_raw_finished() passes abs(error) to sd_event_exit(); turn it back into 0 or a negative value */
+        return -r;
+}
+
+/* Prints the usage text to stdout. Serves both as the "help" verb and as the handler for -h/--help, hence
+ * the verb-style signature; none of the parameters are used. Always returns 0.
+ *
+ * The positional printf arguments are: %1$s program name, %2$s/%3$s underline on/off for the section
+ * titles, %4$s/%5$s highlight on/off for the summary line. */
+static int help(int argc, char *argv[], void *userdata) {
+
+        printf("%1$s [OPTIONS...] {COMMAND} ...\n"
+               "\n%4$sDownload container or virtual machine images.%5$s\n"
+               "\n%2$sCommands:%3$s\n"
+               "  tar URL [NAME]              Download a TAR image\n"
+               "  raw URL [NAME]              Download a RAW image\n"
+               "\n%2$sOptions:%3$s\n"
+               "  -h --help                   Show this help\n"
+               "     --version                Show package version\n"
+               "     --force                  Force creation of image\n"
+               "     --verify=MODE            Verify downloaded image, one of: 'no',\n"
+               "                              'checksum', 'signature' or literal SHA256 hash\n"
+               "     --settings=BOOL          Download settings file with image\n"
+               "     --roothash=BOOL          Download root hash file with image\n"
+               "     --roothash-signature=BOOL\n"
+               "                              Download root hash signature file with image\n"
+               "     --verity=BOOL            Download verity file with image\n"
+               /* No blank line here: the option list continues */
+               "     --image-root=PATH        Image root directory\n"
+               "     --read-only              Create a read-only image\n"
+               "     --direct                 Download directly to specified file\n"
+               "     --btrfs-subvol=BOOL      Controls whether to create a btrfs subvolume\n"
+               "                              instead of a directory\n"
+               "     --btrfs-quota=BOOL       Controls whether to set up quota for btrfs\n"
+               "                              subvolume\n"
+               "     --convert-qcow2=BOOL     Controls whether to convert QCOW2 images to\n"
+               "                              regular disk images\n"
+               "     --sync=BOOL              Controls whether to sync() before completing\n"
+               "     --offset=BYTES           Offset to seek to in destination\n"
+               "     --size-max=BYTES         Maximum number of bytes to write to destination\n",
+               program_invocation_short_name,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+/* Parses the command line options into the arg_* variables.
+ *
+ * Returns 1 if the program shall go on and dispatch the verb, 0 if everything has been done already
+ * (help text shown), or a negative errno-style value if the command line is invalid. For --version the
+ * result of version() is passed through. */
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_FORCE,
+                ARG_IMAGE_ROOT,
+                ARG_VERIFY,
+                ARG_SETTINGS,
+                ARG_ROOTHASH,
+                ARG_ROOTHASH_SIGNATURE,
+                ARG_VERITY,
+                ARG_READ_ONLY,
+                ARG_DIRECT,
+                ARG_BTRFS_SUBVOL,
+                ARG_BTRFS_QUOTA,
+                ARG_CONVERT_QCOW2,
+                ARG_SYNC,
+                ARG_OFFSET,
+                ARG_SIZE_MAX,
+        };
+
+        static const struct option options[] = {
+                { "help",               no_argument,       NULL, 'h'                    },
+                { "version",            no_argument,       NULL, ARG_VERSION            },
+                { "force",              no_argument,       NULL, ARG_FORCE              },
+                { "image-root",         required_argument, NULL, ARG_IMAGE_ROOT         },
+                { "verify",             required_argument, NULL, ARG_VERIFY             },
+                { "settings",           required_argument, NULL, ARG_SETTINGS           },
+                { "roothash",           required_argument, NULL, ARG_ROOTHASH           },
+                { "roothash-signature", required_argument, NULL, ARG_ROOTHASH_SIGNATURE },
+                { "verity",             required_argument, NULL, ARG_VERITY             },
+                { "read-only",          no_argument,       NULL, ARG_READ_ONLY          },
+                { "direct",             no_argument,       NULL, ARG_DIRECT             },
+                { "btrfs-subvol",       required_argument, NULL, ARG_BTRFS_SUBVOL       },
+                { "btrfs-quota",        required_argument, NULL, ARG_BTRFS_QUOTA        },
+                { "convert-qcow2",      required_argument, NULL, ARG_CONVERT_QCOW2      },
+                { "sync",               required_argument, NULL, ARG_SYNC               },
+                { "offset",             required_argument, NULL, ARG_OFFSET             },
+                { "size-max",           required_argument, NULL, ARG_SIZE_MAX           },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help(0, NULL, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_FORCE:
+                        arg_pull_flags |= PULL_FORCE;
+                        break;
+
+                case ARG_IMAGE_ROOT:
+                        arg_image_root = optarg;
+                        break;
+
+                case ARG_VERIFY: {
+                        ImportVerify v;
+
+                        v = import_verify_from_string(optarg);
+                        if (v < 0) {
+                                _cleanup_free_ void *h = NULL;
+                                char *hh;
+                                size_t n;
+
+                                /* If this is not a valid verification mode, maybe it's a literally specified
+                                 * SHA256 hash? We can handle that too... */
+
+                                r = unhexmem(optarg, (size_t) -1, &h, &n);
+                                if (r < 0 || n == 0)
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Invalid verification setting: %s", optarg);
+                                if (n != 32)
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "64 hex character SHA256 hash required when specifying explicit checksum, %zu specified", n * 2);
+
+                                hh = hexmem(h, n); /* bring into canonical (lowercase) form */
+                                if (!hh)
+                                        return log_oom();
+
+                                /* A literal checksum covers the image only, hence turn off the download of
+                                 * all associated files and the mode-based verification */
+                                free_and_replace(arg_checksum, hh);
+                                arg_pull_flags &= ~(PULL_SETTINGS|PULL_ROOTHASH|PULL_ROOTHASH_SIGNATURE|PULL_VERITY);
+                                arg_verify = _IMPORT_VERIFY_INVALID;
+                        } else {
+                                /* A named mode overrides a literal checksum from an earlier --verify=. If the
+                                 * checksum were kept, a valid mode and a checksum would be passed on
+                                 * together, a combination the pull code asserts against. */
+                                free(arg_checksum);
+                                arg_checksum = NULL;
+
+                                arg_verify = v;
+                        }
+
+                        break;
+                }
+
+                case ARG_SETTINGS:
+                        r = parse_boolean_argument("--settings=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_SETTINGS, r);
+                        break;
+
+                case ARG_ROOTHASH:
+                        r = parse_boolean_argument("--roothash=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_ROOTHASH, r);
+
+                        /* If we were asked to turn off the root hash, implicitly also turn off the root hash signature */
+                        if (!r)
+                                SET_FLAG(arg_pull_flags, PULL_ROOTHASH_SIGNATURE, false);
+                        break;
+
+                case ARG_ROOTHASH_SIGNATURE:
+                        r = parse_boolean_argument("--roothash-signature=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_ROOTHASH_SIGNATURE, r);
+                        break;
+
+                case ARG_VERITY:
+                        r = parse_boolean_argument("--verity=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_VERITY, r);
+                        break;
+
+                case ARG_READ_ONLY:
+                        arg_pull_flags |= PULL_READ_ONLY;
+                        break;
+
+                case ARG_DIRECT:
+                        /* Direct mode writes a single file, hence no associated files are downloaded */
+                        arg_pull_flags |= PULL_DIRECT;
+                        arg_pull_flags &= ~(PULL_SETTINGS|PULL_ROOTHASH|PULL_ROOTHASH_SIGNATURE|PULL_VERITY);
+                        break;
+
+                case ARG_BTRFS_SUBVOL:
+                        r = parse_boolean_argument("--btrfs-subvol=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_BTRFS_SUBVOL, r);
+                        break;
+
+                case ARG_BTRFS_QUOTA:
+                        r = parse_boolean_argument("--btrfs-quota=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_BTRFS_QUOTA, r);
+                        break;
+
+                case ARG_CONVERT_QCOW2:
+                        r = parse_boolean_argument("--convert-qcow2=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_CONVERT_QCOW2, r);
+                        break;
+
+                case ARG_SYNC:
+                        r = parse_boolean_argument("--sync=", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        SET_FLAG(arg_pull_flags, PULL_SYNC, r);
+                        break;
+
+                case ARG_OFFSET: {
+                        uint64_t u;
+
+                        r = safe_atou64(optarg, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --offset= argument: %s", optarg);
+                        if (!FILE_SIZE_VALID(u))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Argument to --offset= switch too large: %s", optarg);
+
+                        arg_offset = u;
+                        break;
+                }
+
+                case ARG_SIZE_MAX: {
+                        uint64_t u;
+
+                        r = parse_size(optarg, 1024, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --size-max= argument: %s", optarg);
+                        if (!FILE_SIZE_VALID(u))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Argument to --size-max= switch too large: %s", optarg);
+
+                        arg_size_max = u;
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        /* Make sure offset+size is still in the valid range if both set. The first comparison guards the
+         * addition in the second one against wrapping around. */
+        if (arg_offset != UINT64_MAX && arg_size_max != UINT64_MAX &&
+            ((arg_size_max > (UINT64_MAX - arg_offset)) ||
+             !FILE_SIZE_VALID(arg_offset + arg_size_max)))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File offset and maximum size out of range.");
+
+        if (arg_offset != UINT64_MAX && !FLAGS_SET(arg_pull_flags, PULL_DIRECT))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File offset only supported in --direct mode.");
+
+        /* Catches associated files being turned back on after a literal checksum was specified */
+        if (arg_checksum && (arg_pull_flags & (PULL_SETTINGS|PULL_ROOTHASH|PULL_ROOTHASH_SIGNATURE|PULL_VERITY)) != 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Literal checksum verification only supported if no associated files are downloaded.");
+
+        return 1;
+}
+
+/* Applies overrides from the environment to arg_pull_flags. These low-level knobs are controllable via
+ * environment variables so that they can be set for systemd-importd.service.
+ *
+ * For each variable a successfully parsed boolean sets or clears the matching flag. A result of -ENXIO
+ * is skipped silently (presumably "variable not set" -- confirm against getenv_bool()); any other error
+ * is only warned about. */
+static void parse_env(void) {
+        int r;
+
+        r = getenv_bool("SYSTEMD_IMPORT_BTRFS_SUBVOL");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_IMPORT_BTRFS_SUBVOL: %m");
+        } else
+                SET_FLAG(arg_pull_flags, PULL_BTRFS_SUBVOL, r);
+
+        r = getenv_bool("SYSTEMD_IMPORT_BTRFS_QUOTA");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_IMPORT_BTRFS_QUOTA: %m");
+        } else
+                SET_FLAG(arg_pull_flags, PULL_BTRFS_QUOTA, r);
+
+        r = getenv_bool("SYSTEMD_IMPORT_SYNC");
+        if (r < 0) {
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse $SYSTEMD_IMPORT_SYNC: %m");
+        } else
+                SET_FLAG(arg_pull_flags, PULL_SYNC, r);
+}
+
+/* Dispatches the command verb found on the command line to its implementation and returns that
+ * implementation's result. */
+static int pull_main(int argc, char *argv[]) {
+        static const Verb dispatch_table[] = {
+                { "help", VERB_ANY, VERB_ANY, 0, help     },
+                { "tar",  2,        3,        0, pull_tar },
+                { "raw",  2,        3,        0, pull_raw },
+                {}
+        };
+
+        return dispatch_verb(argc, argv, dispatch_table, NULL);
+}
+
+/* Program entry point behind DEFINE_MAIN_FUNCTION(): sets up locale and logging, applies environment
+ * and command line settings, then dispatches the verb. A parse_argv() result of zero or less ends the
+ * program right away with that value. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        setlocale(LC_ALL, "");
+        log_parse_environment();
+        log_open();
+
+        /* Environment first, so that command line switches take precedence */
+        parse_env();
+
+        r = parse_argv(argc, argv);
+        if (r > 0) {
+                (void) ignore_signals(SIGPIPE);
+
+                return pull_main(argc, argv);
+        }
+
+        return r;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/import/qcow2-util.c b/src/import/qcow2-util.c
new file mode 100644
index 0000000..c70656b
--- /dev/null
+++ b/src/import/qcow2-util.c
@@ -0,0 +1,333 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <zlib.h>
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "qcow2-util.h"
+#include "sparse-endian.h"
+
+/* Magic number at the start of the header: the bytes 'Q', 'F', 'I', 0xfb */
+#define QCOW2_MAGIC 0x514649fb
+
+/* Flag bits found in table entries, evaluated by normalize_offset() */
+#define QCOW2_COPIED (1ULL << 63)
+#define QCOW2_COMPRESSED (1ULL << 62)
+#define QCOW2_ZERO (1ULL << 0)
+
+/* Image header as it is read from offset 0 of the file. All fields are stored big-endian and are
+ * accessed through the HEADER_*() helpers defined after this structure. */
+typedef struct _packed_ Header {
+      be32_t magic;
+      be32_t version;
+
+      be64_t backing_file_offset;
+      be32_t backing_file_size;
+
+      /* log2 of the cluster size */
+      be32_t cluster_bits;
+      be64_t size;
+      be32_t crypt_method;
+
+      /* Number of entries in the L1 table, and where in the file the table starts */
+      be32_t l1_size;
+      be64_t l1_table_offset;
+
+      be64_t refcount_table_offset;
+      be32_t refcount_table_clusters;
+
+      be32_t nb_snapshots;
+      be64_t snapshots_offset;
+
+      /* The remainder is only present on QCOW3 */
+      be64_t incompatible_features;
+      be64_t compatible_features;
+      be64_t autoclear_features;
+
+      be32_t refcount_order;
+      be32_t header_length;
+} Header;
+
+/* Accessors that convert header fields to host byte order, plus a few values derived from them */
+#define HEADER_MAGIC(header) be32toh((header)->magic)
+#define HEADER_VERSION(header) be32toh((header)->version)
+#define HEADER_CLUSTER_BITS(header) be32toh((header)->cluster_bits)
+/* Cluster size in bytes */
+#define HEADER_CLUSTER_SIZE(header) (1ULL << HEADER_CLUSTER_BITS(header))
+/* log2 of the number of 8-byte entries per cluster */
+#define HEADER_L2_BITS(header) (HEADER_CLUSTER_BITS(header) - 3)
+#define HEADER_SIZE(header) be64toh((header)->size)
+#define HEADER_CRYPT_METHOD(header) be32toh((header)->crypt_method)
+#define HEADER_L1_SIZE(header) be32toh((header)->l1_size)
+/* Number of 64-bit entries that fit into a single cluster */
+#define HEADER_L2_SIZE(header) (HEADER_CLUSTER_SIZE(header)/sizeof(uint64_t))
+#define HEADER_L1_TABLE_OFFSET(header) be64toh((header)->l1_table_offset)
+
+/* Returns the header length in bytes. Versions before 3 have no header_length field, so for those the
+ * length is the offset of the first version 3 field. */
+static uint32_t HEADER_HEADER_LENGTH(const Header *h) {
+        return HEADER_VERSION(h) >= 3
+                ? be32toh(h->header_length)
+                : offsetof(Header, incompatible_features);
+}
+
+/* Copies 'cluster_size' bytes from offset 'soffset' of sfd to offset 'doffset' of dfd.
+ *
+ * A reflink is attempted first and its non-negative result is returned as is. If it fails, the data is
+ * copied through 'buffer', which has to provide room for 'cluster_size' bytes.
+ *
+ * On the copy path this returns 0 on success, -errno if pread()/pwrite() fails, or -EIO on short I/O. */
+static int copy_cluster(
+                int sfd, uint64_t soffset,
+                int dfd, uint64_t doffset,
+                uint64_t cluster_size,
+                void *buffer) {
+
+        ssize_t n;
+        int r;
+
+        r = reflink_range(sfd, soffset, dfd, doffset, cluster_size);
+        if (r < 0) {
+                /* Reflinking did not work, copy the data manually */
+                n = pread(sfd, buffer, cluster_size, soffset);
+                if (n < 0)
+                        return -errno;
+                if ((uint64_t) n != cluster_size)
+                        return -EIO;
+
+                n = pwrite(dfd, buffer, cluster_size, doffset);
+                if (n < 0)
+                        return -errno;
+                if ((uint64_t) n != cluster_size)
+                        return -EIO;
+
+                return 0;
+        }
+
+        return r;
+}
+
+/* Reads 'compressed_size' bytes from offset 'soffset' of sfd, inflates them, and writes the result,
+ * which must be exactly 'cluster_size' bytes long, to offset 'doffset' of dfd.
+ *
+ * 'buffer1' is scratch space for the compressed input; it is only used if the input fits into
+ * 'cluster_size' bytes, otherwise a temporary buffer is allocated. 'buffer2' receives the inflated
+ * data and has to provide room for 'cluster_size' bytes.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary buffer cannot be allocated, -errno if pread()/pwrite()
+ * fails, or -EIO on short I/O, on decompression errors or if the inflated size is not 'cluster_size'. */
+static int decompress_cluster(
+                int sfd, uint64_t soffset,
+                int dfd, uint64_t doffset,
+                uint64_t compressed_size,
+                uint64_t cluster_size,
+                void *buffer1,
+                void *buffer2) {
+
+        _cleanup_free_ void *large_buffer = NULL;
+        z_stream s = {};
+        uint64_t sz;
+        ssize_t l;
+        int r;
+
+        if (compressed_size > cluster_size) {
+                /* The usual cluster buffer doesn't suffice, let's
+                 * allocate a larger one, temporarily */
+
+                large_buffer = malloc(compressed_size);
+                if (!large_buffer)
+                        return -ENOMEM;
+
+                buffer1 = large_buffer;
+        }
+
+        l = pread(sfd, buffer1, compressed_size, soffset);
+        if (l < 0)
+                return -errno;
+        if ((uint64_t) l != compressed_size)
+                return -EIO;
+
+        s.next_in = buffer1;
+        s.avail_in = compressed_size;
+        s.next_out = buffer2;
+        s.avail_out = cluster_size;
+
+        /* Negative window bits make zlib expect a raw deflate stream, i.e. one without zlib header */
+        r = inflateInit2(&s, -12);
+        if (r != Z_OK)
+                return -EIO;
+
+        /* Inflate everything in one go, and determine how much output was produced before the stream
+         * state is released */
+        r = inflate(&s, Z_FINISH);
+        sz = (uint8_t*) s.next_out - (uint8_t*) buffer2;
+        inflateEnd(&s);
+        if (r != Z_STREAM_END || sz != cluster_size)
+                return -EIO;
+
+        l = pwrite(dfd, buffer2, cluster_size, doffset);
+        if (l < 0)
+                return -errno;
+        if ((uint64_t) l != cluster_size)
+                return -EIO;
+
+        return 0;
+}
+
+/* Decodes one table entry.
+ *
+ * 'p' is the entry as read from the image, i.e. still big-endian. On success *ret receives the file
+ * offset the entry points to, with the flag bits removed; it is 0 for entries marked as zero.
+ *
+ * 'compressed' may be NULL, in which case entries flagged as compressed are refused with -EOPNOTSUPP.
+ * Otherwise it receives whether the entry is flagged as compressed. 'compressed_size' may be NULL as
+ * well; if it is not, it receives the size in bytes computed from the entry (0 for uncompressed
+ * entries). As in the original code, 'compressed_size' is left untouched when 'compressed' is NULL.
+ *
+ * Returns > 0 if the entry points to data, 0 if it is a hole, or a negative errno-style value. */
+static int normalize_offset(
+                const Header *header,
+                uint64_t p,
+                uint64_t *ret,
+                bool *compressed,
+                uint64_t *compressed_size) {
+
+        uint64_t q;
+
+        q = be64toh(p);
+
+        if (q & QCOW2_COMPRESSED) {
+                uint64_t sz, csize_shift, csize_mask;
+
+                if (!compressed)
+                        return -EOPNOTSUPP;
+
+                /* Below the two flag bits the entry holds a sector count that is (cluster_bits - 8) bits
+                 * wide, followed by the offset in the remaining low bits */
+                csize_shift = 64 - 2 - (HEADER_CLUSTER_BITS(header) - 8);
+                csize_mask = (1ULL << (HEADER_CLUSTER_BITS(header) - 8)) - 1;
+                sz = (((q >> csize_shift) & csize_mask) + 1) * 512 - (q & 511);
+                q &= ((1ULL << csize_shift) - 1);
+
+                if (compressed_size)
+                        *compressed_size = sz;
+
+                *compressed = true;
+
+        } else {
+                if (compressed)  {
+                        *compressed = false;
+
+                        /* The size is optional here just like in the compressed case */
+                        if (compressed_size)
+                                *compressed_size = 0;
+                }
+
+                if (q & QCOW2_ZERO) {
+                        /* We make no distinction between zero blocks and holes */
+                        *ret = 0;
+                        return 0;
+                }
+
+                q &= ~QCOW2_COPIED;
+        }
+
+        *ret = q;
+        return q > 0;  /* returns positive if not a hole */
+}
+
+/* Checks whether a header read from an image is one this converter can handle.
+ *
+ * Returns 0 if so, -EBADMSG if the magic is wrong or a field is out of range, and -EOPNOTSUPP for
+ * unsupported versions, encrypted images and images with incompatible feature bits set. */
+static int verify_header(const Header *header) {
+        assert(header);
+
+        if (HEADER_MAGIC(header) != QCOW2_MAGIC)
+                return -EBADMSG;
+
+        if (!IN_SET(HEADER_VERSION(header), 2, 3))
+                return -EOPNOTSUPP;
+
+        if (HEADER_CRYPT_METHOD(header) != 0)
+                return -EOPNOTSUPP;
+
+        if (HEADER_CLUSTER_BITS(header) < 9) /* 512 bytes */
+                return -EBADMSG;
+
+        if (HEADER_CLUSTER_BITS(header) > 21) /* 2MB */
+                return -EBADMSG;
+
+        /* The image size has to be a multiple of the cluster size */
+        if (HEADER_SIZE(header) % HEADER_CLUSTER_SIZE(header) != 0)
+                return -EBADMSG;
+
+        if (HEADER_L1_SIZE(header) > 32*1024*1024) /* 32M entries; l1_size counts entries, not bytes */
+                return -EBADMSG;
+
+        if (HEADER_VERSION(header) == 3) {
+
+                /* Comparing against zero needs no byte order conversion */
+                if (header->incompatible_features != 0)
+                        return -EOPNOTSUPP;
+
+                if (HEADER_HEADER_LENGTH(header) < sizeof(Header))
+                        return -EBADMSG;
+        }
+
+        return 0;
+}
+
+int qcow2_convert(int qcow2_fd, int raw_fd) {
+        _cleanup_free_ void *buffer1 = NULL, *buffer2 = NULL;
+        _cleanup_free_ be64_t *l1_table = NULL, *l2_table = NULL;
+        uint64_t sz, i;
+        Header header;
+        ssize_t l;
+        int r;
+        /* Unpacks the qcow2 image read from qcow2_fd into raw_fd. Returns 0 on success, a negative errno-style error otherwise. */
+        l = pread(qcow2_fd, &header, sizeof(header), 0);
+        if (l < 0)
+                return -errno;
+        if (l != sizeof(header))
+                return -EIO;
+
+        r = verify_header(&header);
+        if (r < 0)
+                return r;
+
+        l1_table = new(be64_t, HEADER_L1_SIZE(&header));
+        if (!l1_table)
+                return -ENOMEM;
+
+        l2_table = malloc(HEADER_CLUSTER_SIZE(&header));
+        if (!l2_table)
+                return -ENOMEM;
+
+        buffer1 = malloc(HEADER_CLUSTER_SIZE(&header));
+        if (!buffer1)
+                return -ENOMEM;
+
+        buffer2 = malloc(HEADER_CLUSTER_SIZE(&header));
+        if (!buffer2)
+                return -ENOMEM;
+
+        /* Empty the file if it exists, we rely on zero bits */
+        if (ftruncate(raw_fd, 0) < 0)
+                return -errno;
+
+        if (ftruncate(raw_fd, HEADER_SIZE(&header)) < 0)
+                return -errno;
+
+        sz = sizeof(uint64_t) * HEADER_L1_SIZE(&header);
+        l = pread(qcow2_fd, l1_table, sz, HEADER_L1_TABLE_OFFSET(&header));
+        if (l < 0)
+                return -errno;
+        if ((uint64_t) l != sz)
+                return -EIO;
+
+        for (i = 0; i < HEADER_L1_SIZE(&header); i ++) {
+                uint64_t l2_begin, j;
+                /* Each L1 entry refers to one L2 table; an entry that normalizes to 0 is a hole and is skipped. */
+                r = normalize_offset(&header, l1_table[i], &l2_begin, NULL, NULL);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        continue;
+
+                l = pread(qcow2_fd, l2_table, HEADER_CLUSTER_SIZE(&header), l2_begin);
+                if (l < 0)
+                        return -errno;
+                if ((uint64_t) l != HEADER_CLUSTER_SIZE(&header))
+                        return -EIO;
+
+                for (j = 0; j < HEADER_L2_SIZE(&header); j++) {
+                        uint64_t data_begin, p, compressed_size;
+                        bool compressed;
+                        /* p is the position derived from the (L1, L2) indices; it is handed to the helpers together with raw_fd. */
+                        p = ((i << HEADER_L2_BITS(&header)) + j) << HEADER_CLUSTER_BITS(&header);
+
+                        r = normalize_offset(&header, l2_table[j], &data_begin, &compressed, &compressed_size);
+                        if (r < 0)
+                                return r;
+                        if (r == 0) /* hole or zero cluster: the freshly truncated output already reads as zeroes there */
+                                continue;
+
+                        if (compressed)
+                                r = decompress_cluster(
+                                                qcow2_fd, data_begin,
+                                                raw_fd, p,
+                                                compressed_size, HEADER_CLUSTER_SIZE(&header),
+                                                buffer1, buffer2);
+                        else
+                                r = copy_cluster(
+                                                qcow2_fd, data_begin,
+                                                raw_fd, p,
+                                                HEADER_CLUSTER_SIZE(&header), buffer1);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+int qcow2_detect(int fd) {
+        be32_t magic;
+        ssize_t n;
+
+        /* Peeks at the first four bytes of fd: 1 if they are the qcow2 magic, 0 if not, negative on read failure. */
+        n = pread(fd, &magic, sizeof(magic), 0);
+        if (n < 0)
+                return -errno;
+        if (n != sizeof(magic))
+                return -EIO;
+        return magic == htobe32(QCOW2_MAGIC);
+}
diff --git a/src/import/qcow2-util.h b/src/import/qcow2-util.h
new file mode 100644
index 0000000..f17c159
--- /dev/null
+++ b/src/import/qcow2-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int qcow2_detect(int fd);
+int qcow2_convert(int qcow2_fd, int raw_fd);
diff --git a/src/import/test-qcow2.c b/src/import/test-qcow2.c
new file mode 100644
index 0000000..8893207
--- /dev/null
+++ b/src/import/test-qcow2.c
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "fd-util.h"
+#include "log.h"
+#include "qcow2-util.h"
+#include "tests.h"
+
+int main(int argc, char *argv[]) {
+        _cleanup_close_ int sfd = -EBADF, dfd = -EBADF;
+        int r;
+        /* Manual test helper: argv[1] is the qcow2 source file, argv[2] the destination the converted image is written to. */
+        test_setup_logging(LOG_DEBUG);
+
+        if (argc != 3) {
+                log_error("Needs two arguments.");
+                return EXIT_FAILURE;
+        }
+
+        sfd = open(argv[1], O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (sfd < 0) {
+                log_error_errno(errno, "Can't open source file: %m");
+                return EXIT_FAILURE;
+        }
+
+        dfd = open(argv[2], O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, 0666);
+        if (dfd < 0) {
+                log_error_errno(errno, "Can't open destination file: %m");
+                return EXIT_FAILURE;
+        }
+
+        r = qcow2_convert(sfd, dfd);
+        if (r < 0) {
+                log_error_errno(r, "Failed to unpack: %m");
+                return EXIT_FAILURE;
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/src/initctl/initctl.c b/src/initctl/initctl.c
new file mode 100644
index 0000000..d1b7c30
--- /dev/null
+++ b/src/initctl/initctl.c
@@ -0,0 +1,355 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "sd-bus.h"
+#include "sd-daemon.h"
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-util.h"
+#include "constants.h"
+#include "daemon-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "initreq.h"
+#include "list.h"
+#include "log.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "process-util.h"
+#include "reboot-util.h"
+#include "special.h"
+
+#define SERVER_FD_MAX 16
+#define TIMEOUT_MSEC ((int) (DEFAULT_EXIT_USEC/USEC_PER_MSEC))
+
+typedef struct Fifo Fifo;
+
+typedef struct Server {
+        int epoll_fd; /* epoll instance the FIFO fds are registered with */
+
+        LIST_HEAD(Fifo, fifos); /* all Fifo objects linked to this server */
+        unsigned n_fifos; /* number of entries in 'fifos' */
+
+        sd_bus *bus; /* connection used to issue the StartUnit calls */
+
+        bool quit; /* when set, the main loop in run() stops */
+} Server;
+
+struct Fifo {
+        Server *server; /* server this fifo is linked to; NULL while not yet linked */
+
+        int fd; /* file descriptor the requests are read from */
+
+        struct init_request buffer; /* request being assembled, possibly over several read() calls */
+        size_t bytes_read; /* number of bytes of 'buffer' filled so far */
+
+        LIST_FIELDS(Fifo, fifo);
+};
+
+static const char *translate_runlevel(int runlevel, bool *isolate) {
+        static const struct {
+                const int runlevel;
+                const char *special;
+                bool isolate;
+        } table[] = {
+                { '0', SPECIAL_POWEROFF_TARGET,   false },
+                { '1', SPECIAL_RESCUE_TARGET,     true  },
+                { 's', SPECIAL_RESCUE_TARGET,     true  },
+                { 'S', SPECIAL_RESCUE_TARGET,     true  },
+                { '2', SPECIAL_MULTI_USER_TARGET, true  },
+                { '3', SPECIAL_MULTI_USER_TARGET, true  },
+                { '4', SPECIAL_MULTI_USER_TARGET, true  },
+                { '5', SPECIAL_GRAPHICAL_TARGET,  true  },
+                { '6', SPECIAL_REBOOT_TARGET,     false },
+        };
+        /* Maps a runlevel character to a target unit name and stores in *isolate whether it is to be isolated. */
+        assert(isolate);
+
+        for (size_t i = 0; i < ELEMENTSOF(table); i++)
+                if (table[i].runlevel == runlevel) {
+                        *isolate = table[i].isolate;
+                        if (runlevel == '6' && kexec_loaded()) /* runlevel 6 maps to the kexec target when kexec_loaded() is true */
+                                return SPECIAL_KEXEC_TARGET;
+                        return table[i].special;
+                }
+
+        return NULL; /* unknown runlevel; *isolate is left untouched */
+}
+
+static int change_runlevel(Server *s, int runlevel) {
+        const char *target;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        const char *mode;
+        bool isolate = false;
+        int r;
+        /* Asks the manager over the bus to start the target for 'runlevel'. Unknown runlevels are logged and ignored (returns 0). */
+        assert(s);
+
+        target = translate_runlevel(runlevel, &isolate);
+        if (!target) {
+                log_warning("Got request for unknown runlevel %c, ignoring.", runlevel);
+                return 0;
+        }
+
+        if (isolate)
+                mode = "isolate";
+        else
+                mode = "replace-irreversibly";
+
+        log_debug("Requesting %s/start/%s", target, mode);
+
+        r = bus_call_method(s->bus, bus_systemd_mgr, "StartUnit", &error, NULL, "ss", target, mode);
+        if (r < 0)
+                return log_error_errno(r, "Failed to change runlevel: %s", bus_error_message(&error, r));
+
+        return 0;
+}
+
+static void request_process(Server *s, const struct init_request *req) {
+        assert(s);
+        assert(req);
+        /* Dispatches one complete request. Only runlevel requests are acted upon; everything else is logged and dropped. */
+        if (req->magic != INIT_MAGIC) {
+                log_error("Got initctl request with invalid magic. Ignoring.");
+                return;
+        }
+
+        switch (req->cmd) {
+
+        case INIT_CMD_RUNLVL:
+                if (!isprint(req->runlevel))
+                        log_error("Got invalid runlevel. Ignoring.");
+                else
+                        switch (req->runlevel) {
+
+                        /* we are async anyway, so just use kill for reexec/reload */
+                        case 'u':
+                        case 'U':
+                                if (kill(1, SIGTERM) < 0)
+                                        log_error_errno(errno, "kill() failed: %m");
+
+                                /* The bus connection will be
+                                 * terminated if PID 1 is reexecuted,
+                                 * hence let's just exit here, and
+                                 * rely on that we'll be restarted on
+                                 * the next request */
+                                s->quit = true;
+                                break;
+
+                        case 'q':
+                        case 'Q':
+                                if (kill(1, SIGHUP) < 0)
+                                        log_error_errno(errno, "kill() failed: %m");
+                                break;
+
+                        default: /* any other printable runlevel goes through the table lookup; failures are deliberately ignored */
+                                (void) change_runlevel(s, req->runlevel);
+                        }
+                return;
+
+        case INIT_CMD_POWERFAIL:
+        case INIT_CMD_POWERFAILNOW:
+        case INIT_CMD_POWEROK:
+                log_warning("Received UPS/power initctl request. This is not implemented in systemd. Upgrade your UPS daemon!");
+                return;
+
+        case INIT_CMD_CHANGECONS:
+                log_warning("Received console change initctl request. This is not implemented in systemd.");
+                return;
+
+        case INIT_CMD_SETENV:
+        case INIT_CMD_UNSETENV:
+                log_warning("Received environment initctl request. This is not implemented in systemd.");
+                return;
+
+        default:
+                log_warning("Received unknown initctl request. Ignoring.");
+                return;
+        }
+}
+
+static int fifo_process(Fifo *f) {
+        ssize_t l;
+
+        assert(f);
+        /* Preset errno: if read() returns 0 (EOF) without touching errno, the failure below is reported as EIO. */
+        errno = EIO;
+        l = read(f->fd,
+                 ((uint8_t*) &f->buffer) + f->bytes_read,
+                 sizeof(f->buffer) - f->bytes_read);
+        if (l <= 0) {
+                if (errno == EAGAIN)
+                        return 0;
+
+                return log_warning_errno(errno, "Failed to read from fifo: %m");
+        }
+
+        f->bytes_read += l;
+        assert(f->bytes_read <= sizeof(f->buffer));
+        /* A request may arrive in pieces; it is only dispatched once the whole struct has been read. */
+        if (f->bytes_read == sizeof(f->buffer)) {
+                request_process(f->server, &f->buffer);
+                f->bytes_read = 0;
+        }
+
+        return 0;
+}
+
+static Fifo* fifo_free(Fifo *f) {
+        if (!f)
+                return NULL;
+        /* Unlink from the owning server first (if linked), then drop the fd from epoll, close it and free f. */
+        if (f->server) {
+                assert(f->server->n_fifos > 0);
+                f->server->n_fifos--;
+                LIST_REMOVE(fifo, f->server->fifos, f);
+        }
+
+        if (f->fd >= 0) { /* NOTE: fd 0 counts as valid here, so a zero-initialized Fifo would have fd 0 closed */
+                if (f->server)
+                        (void) epoll_ctl(f->server->epoll_fd, EPOLL_CTL_DEL, f->fd, NULL);
+
+                safe_close(f->fd);
+        }
+
+        return mfree(f);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(Fifo*, fifo_free);
+
+static void server_done(Server *s) {
+        assert(s);
+        /* Releases everything the server holds: all fifos, the epoll fd and the bus connection. */
+        while (s->fifos) /* fifo_free() unlinks the entry from s->fifos, so the list shrinks each round */
+                fifo_free(s->fifos);
+
+        s->epoll_fd = safe_close(s->epoll_fd);
+        s->bus = sd_bus_flush_close_unref(s->bus);
+}
+
+static int server_init(Server *s, unsigned n_sockets) {
+        int r;
+
+        /* Registers the n_sockets passed FIFO fds with a new epoll object and connects to the bus. Returns 0 or a negative error.
+         * This function will leave s partially initialized on failure. Caller needs to clean up. */
+        assert(s);
+        assert(n_sockets > 0);
+
+        s->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+        if (s->epoll_fd < 0)
+                return log_error_errno(errno, "Failed to create epoll object: %m");
+
+        for (unsigned i = 0; i < n_sockets; i++) {
+                _cleanup_(fifo_freep) Fifo *f = NULL;
+                int fd = SD_LISTEN_FDS_START + i;
+
+                r = sd_is_fifo(fd, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine file descriptor type: %m");
+                if (!r)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Wrong file descriptor type.");
+
+                f = new0(Fifo, 1);
+                if (!f)
+                        return log_oom();
+                f->fd = -EBADF; /* new0() leaves fd == 0; mark it invalid so fifo_free() won't close fd 0 if we bail out below */
+                struct epoll_event ev = {
+                        .events = EPOLLIN,
+                        .data.ptr = f,
+                };
+
+                if (epoll_ctl(s->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
+                        return log_error_errno(errno, "Failed to add fifo fd to epoll object: %m");
+
+                f->fd = fd;
+                f->server = s;
+                LIST_PREPEND(fifo, s->fifos, TAKE_PTR(f)); /* the list owns the Fifo from here on */
+                s->n_fifos++;
+        }
+
+        r = bus_connect_system_systemd(&s->bus);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get D-Bus connection: %m");
+
+        return 0;
+}
+
+static int process_event(Server *s, struct epoll_event *ev) {
+        int r;
+        _cleanup_(fifo_freep) Fifo *f = NULL;
+        /* Handles one epoll event. Returns 0 on success, a negative error if the event or the fifo read was bad. */
+        assert(s);
+        assert(ev);
+
+        if (!(ev->events & EPOLLIN))
+                return log_info_errno(SYNTHETIC_ERRNO(EIO),
+                                      "Got invalid event from epoll. (3)");
+        /* While f is held by the cleanup variable, an error return below releases the Fifo via fifo_free(). */
+        f = (Fifo*) ev->data.ptr;
+        r = fifo_process(f);
+        if (r < 0)
+                return log_info_errno(r, "Got error on fifo: %m");
+
+        TAKE_PTR(f); /* success: disarm the cleanup so the Fifo stays alive and registered */
+
+        return 0;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(server_done) Server server = { .epoll_fd = -EBADF };
+        _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL;
+        int r, n;
+        /* Serves requests from the passed FIFOs until asked to quit or until no event arrives within TIMEOUT_MSEC. */
+        if (argc > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "This program does not take arguments.");
+
+        log_setup();
+
+        umask(0022);
+
+        n = sd_listen_fds(true);
+        if (n < 0) /* sd_listen_fds() reports failure through its negative return value, not through errno */
+                return log_error_errno(n,
+                                       "Failed to read listening file descriptors from environment: %m");
+
+        if (n <= 0 || n > SERVER_FD_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "No or too many file descriptors passed.");
+
+        r = server_init(&server, (unsigned) n);
+        if (r < 0)
+                return r;
+
+        notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+
+        while (!server.quit) {
+                struct epoll_event event;
+                int k;
+
+                k = epoll_wait(server.epoll_fd, &event, 1, TIMEOUT_MSEC);
+                if (k < 0) {
+                        if (errno == EINTR)
+                                continue;
+                        return log_error_errno(errno, "epoll_wait() failed: %m");
+                }
+                if (k == 0) /* timeout without any request: exit cleanly */
+                        break;
+
+                r = process_event(&server, &event);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/initctl/meson.build b/src/initctl/meson.build
new file mode 100644
index 0000000..c9fddc9
--- /dev/null
+++ b/src/initctl/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-initctl',
+                'conditions' : ['HAVE_SYSV_COMPAT'],
+                'sources' : files('initctl.c'),
+        },
+]
diff --git a/src/integritysetup/integrity-util.c b/src/integritysetup/integrity-util.c
new file mode 100644
index 0000000..c29d4fc
--- /dev/null
+++ b/src/integritysetup/integrity-util.c
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include "integrity-util.h"
+
+#include "extract-word.h"
+#include "fileio.h"
+#include "path-util.h"
+#include "percent-util.h"
+
+
+static int supported_integrity_algorithm(char *user_supplied) {
+        if (STR_IN_SET(user_supplied, "crc32", "crc32c", "sha1", "sha256", "hmac-sha256"))
+                return 0; /* one of the accepted algorithm names */
+        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unsupported integrity algorithm (%s)", user_supplied);
+}
+
+int parse_integrity_options(
+                const char *options,
+                uint32_t *ret_activate_flags,
+                int *ret_percent,
+                usec_t *ret_commit_time,
+                char **ret_data_device,
+                char **ret_integrity_alg) {
+        int r;
+        /* Parses the comma-separated option string. Every ret_* pointer may be NULL, in which case the option is only validated. */
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                char *val;
+
+                r = extract_first_word(&options, &word, ",", EXTRACT_DONT_COALESCE_SEPARATORS | EXTRACT_UNESCAPE_SEPARATORS);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse options: %m");
+                if (r == 0)
+                        break;
+                else if (streq(word, "allow-discards")) {
+                        if (ret_activate_flags)
+                                *ret_activate_flags |= CRYPT_ACTIVATE_ALLOW_DISCARDS;
+                } else if ((val = startswith(word, "mode="))) {
+                        if (streq(val, "journal")) {
+                                if (ret_activate_flags)
+                                        *ret_activate_flags &= ~(CRYPT_ACTIVATE_NO_JOURNAL | CRYPT_ACTIVATE_NO_JOURNAL_BITMAP);
+                        } else if (streq(val, "bitmap")) {
+                                if (ret_activate_flags) {
+                                        *ret_activate_flags &= ~CRYPT_ACTIVATE_NO_JOURNAL;
+                                        *ret_activate_flags |= CRYPT_ACTIVATE_NO_JOURNAL_BITMAP;
+                                }
+                        } else if (streq(val, "direct")) {
+                                if (ret_activate_flags) {
+                                        *ret_activate_flags |= CRYPT_ACTIVATE_NO_JOURNAL;
+                                        *ret_activate_flags &= ~CRYPT_ACTIVATE_NO_JOURNAL_BITMAP;
+                                }
+                        } else
+                                log_warning("Encountered unknown mode option '%s', ignoring.", val);
+                } else if ((val = startswith(word, "journal-watermark="))) {
+                        r = parse_percent(val);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse journal-watermark value or value out of range (%s)", val);
+                        if (ret_percent)
+                                *ret_percent = r;
+                } else if ((val = startswith(word, "journal-commit-time="))) {
+                        usec_t tmp_commit_time;
+                        r = parse_sec(val, &tmp_commit_time);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse journal-commit-time value (%s)", val);
+                        if (ret_commit_time)
+                                *ret_commit_time = tmp_commit_time;
+                } else if ((val = startswith(word, "data-device="))) {
+                        if (ret_data_device) {
+                                r = free_and_strdup(ret_data_device, val);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                } else if ((val = startswith(word, "integrity-algorithm="))) {
+                        r = supported_integrity_algorithm(val);
+                        if (r < 0)
+                                return r;
+                        if (ret_integrity_alg) {
+                                r = free_and_strdup(ret_integrity_alg, val);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                } else
+                        log_warning("Encountered unknown option '%s', ignoring.", word);
+        }
+
+        return r; /* always 0 here: the loop is only left through the r == 0 break above */
+}
diff --git a/src/integritysetup/integrity-util.h b/src/integritysetup/integrity-util.h
new file mode 100644
index 0000000..b27975c
--- /dev/null
+++ b/src/integritysetup/integrity-util.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include 
+
+#include "cryptsetup-util.h"
+#include "time-util.h"
+
+
+int parse_integrity_options(
+                const char *options,
+                uint32_t *ret_activate_flags,
+                int *ret_percent,
+                usec_t *ret_commit_time,
+                char **ret_data_device,
+                char **ret_integrity_alg);
+
+#define DM_HMAC_256 "hmac(sha256)"
+#define DM_MAX_KEY_SIZE 4096            /* Maximum size of key allowed for dm-integrity */
diff --git a/src/integritysetup/integritysetup-generator.c b/src/integritysetup/integritysetup-generator.c
new file mode 100644
index 0000000..72b8905
--- /dev/null
+++ b/src/integritysetup/integritysetup-generator.c
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fstab-util.h"
+#include "generator.h"
+#include "hexdecoct.h"
+#include "id128-util.h"
+#include "integrity-util.h"
+#include "main-func.h"
+#include "mkdir.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "unit-name.h"
+
+static const char *arg_dest = NULL;
+static const char *arg_integritytab = NULL;
+static char *arg_options = NULL;
+STATIC_DESTRUCTOR_REGISTER(arg_options, freep);
+
+static int create_disk(
+                const char *name,
+                const char *device,
+                const char *key_file,
+                const char *options) {
+
+        _cleanup_free_ char *n = NULL, *dd = NULL, *e = NULL, *name_escaped = NULL, *key_file_escaped = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+        char *dmname = NULL;
+        /* Writes the systemd-integritysetup@ service unit for one integritytab entry and hooks it up with symlinks. */
+        assert(name);
+        assert(device);
+
+        name_escaped = specifier_escape(name);
+        if (!name_escaped)
+                return log_oom();
+
+        e = unit_name_escape(name);
+        if (!e)
+                return log_oom();
+
+        r = unit_name_build("systemd-integritysetup", e, ".service", &n);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = unit_name_from_path(device, ".device", &dd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate unit name: %m");
+
+        r = generator_open_unit_file(arg_dest, NULL, n, &f);
+        if (r < 0)
+                return r;
+
+        if (key_file) {
+                if (!path_is_absolute(key_file))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "key file not absolute file path %s", key_file);
+
+                key_file_escaped = specifier_escape(key_file);
+                if (!key_file_escaped)
+                        return log_oom();
+        }
+
+        if (options) { /* all output pointers are NULL: this call only validates the option string */
+                r = parse_integrity_options(options, NULL, NULL, NULL, NULL, NULL);
+                if (r < 0)
+                        return r;
+        }
+
+        fprintf(f,
+                "[Unit]\n"
+                "Description=Integrity Setup for %%I\n"
+                "Documentation=man:integritytab(5) man:systemd-integritysetup-generator(8) man:systemd-integritysetup@.service(8)\n"
+                "SourcePath=%s\n"
+                "DefaultDependencies=no\n"
+                "IgnoreOnIsolate=true\n"
+                "After=integritysetup-pre.target systemd-udevd-kernel.socket\n"
+                "Before=blockdev@dev-mapper-%%i.target\n"
+                "Wants=blockdev@dev-mapper-%%i.target\n"
+                "Conflicts=umount.target\n"
+                "Before=integritysetup.target\n"
+                "BindsTo=%s\n"
+                "After=%s\n"
+                "Before=umount.target\n",
+                arg_integritytab,
+                dd, dd);
+        /* NOTE(review): unlike name and key_file, device and options are written below without specifier_escape() - confirm. */
+        fprintf(f,
+                "\n"
+                "[Service]\n"
+                "Type=oneshot\n"
+                "RemainAfterExit=yes\n"
+                "TimeoutSec=infinity\n"
+                "ExecStart=" LIBEXECDIR "/systemd-integritysetup attach '%s' '%s' '%s' '%s'\n"
+                "ExecStop=" LIBEXECDIR "/systemd-integritysetup detach '%s'\n",
+                name_escaped, device, empty_to_dash(key_file_escaped), empty_to_dash(options),
+                name_escaped);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit file %s: %m", n);
+
+        r = generator_add_symlink(arg_dest, "integritysetup.target", "requires", n);
+        if (r < 0)
+                return r;
+
+        dmname = strjoina("dev-mapper-", e, ".device");
+        return generator_add_symlink(arg_dest, dmname, "requires", n);
+}
+
+static int add_integritytab_devices(void) {
+        _cleanup_fclose_ FILE *f = NULL;
+        unsigned integritytab_line = 0;
+        int r;
+        /* Reads arg_integritytab and creates a unit per valid entry. A missing or unopenable table is not treated as fatal. */
+        r = fopen_unlocked(arg_integritytab, "re", &f);
+        if (r < 0) {
+                if (r != -ENOENT) /* the error is carried in r, as with the other helpers here; errno may be stale */
+                        log_error_errno(r, "Failed to open %s: %m", arg_integritytab);
+                return 0;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL, *name = NULL, *device_id = NULL, *device_path = NULL, *key_file = NULL, *options = NULL;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read %s: %m", arg_integritytab);
+                if (r == 0)
+                        break;
+
+                integritytab_line++;
+
+                if (IN_SET(line[0], 0, '#')) /* skip empty lines and comments */
+                        continue;
+
+                /* The key file and the options are optional */
+                r = sscanf(line, "%ms %ms %ms %ms", &name, &device_id, &key_file, &options);
+                if (!IN_SET(r, 2, 3, 4)) {
+                        log_error("Failed to parse %s:%u, ignoring.", arg_integritytab, integritytab_line);
+                        continue;
+                }
+
+                device_path = fstab_node_to_udev_node(device_id);
+                if (!device_path) {
+                        log_error("Failed to find device %s:%u, ignoring.", device_id, integritytab_line);
+                        continue;
+                }
+
+                r = create_disk(name, device_path, empty_or_dash_to_null(key_file), empty_or_dash_to_null(options));
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int run(const char *dest, const char *dest_early, const char *dest_late) {
+        assert_se(arg_dest = dest); /* deliberate assignment: stores dest and checks that it is non-NULL */
+        /* The table location can be overridden through $SYSTEMD_INTEGRITYTAB; dest_early/dest_late are not used here. */
+        arg_integritytab = getenv("SYSTEMD_INTEGRITYTAB") ?: "/etc/integritytab";
+
+        return add_integritytab_devices();
+}
+
+DEFINE_MAIN_GENERATOR_FUNCTION(run);
diff --git a/src/integritysetup/integritysetup.c b/src/integritysetup/integritysetup.c
new file mode 100644
index 0000000..a602886
--- /dev/null
+++ b/src/integritysetup/integritysetup.c
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include 
+#include 
+#include 
+
+#include "alloc-util.h"
+#include "cryptsetup-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "integrity-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+
+static uint32_t arg_activate_flags;
+static int arg_percent;
+static usec_t arg_commit_time;
+static char *arg_existing_data_device;
+static char *arg_integrity_algorithm;
+
+STATIC_DESTRUCTOR_REGISTER(arg_existing_data_device, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_integrity_algorithm, freep);
+
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+        /* Prints the usage text to stdout. Returns 0, or the log_oom() result if the man page link cannot be built. */
+        r = terminal_urlify_man("systemd-integritysetup@.service", "8", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s attach VOLUME DEVICE [HMAC_KEY_FILE|-] [OPTIONS]\n"
+               "%s detach VOLUME\n\n"
+               "Attach or detach an integrity protected block device.\n"
+               "\nSee the %s for details.\n",
+               program_invocation_short_name,
+               program_invocation_short_name,
+               link);
+
+        return 0;
+}
+
+static int load_key_file(
+                const char *key_file,
+                void **ret_key_file_contents,
+                size_t *ret_key_file_size) {
+        int r;
+        _cleanup_(erase_and_freep) char *tmp_key_file_contents = NULL;
+        size_t tmp_key_file_size;
+        /* Reads the key from the absolute path key_file, capped at DM_MAX_KEY_SIZE. Returns 0 or a negative error. */
+        if (!path_is_absolute(key_file))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "key file not absolute path: %s", key_file);
+
+        r = read_full_file_full(
+                        AT_FDCWD, key_file, UINT64_MAX, DM_MAX_KEY_SIZE,
+                        READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET|READ_FULL_FILE_FAIL_WHEN_LARGER,
+                        NULL,
+                        &tmp_key_file_contents, &tmp_key_file_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to process key file: %m");
+        /* The key is handed out only if both output pointers are given; otherwise the cleanup handler disposes of it. */
+        if (ret_key_file_contents && ret_key_file_size) {
+                *ret_key_file_contents = TAKE_PTR(tmp_key_file_contents);
+                *ret_key_file_size = tmp_key_file_size;
+        }
+
+        return 0;
+}
+
+static const char *integrity_algorithm_select(const void *key_file_buf) {
+        /* Precedence: an explicitly configured algorithm wins, with the user-facing
+         * "hmac-sha256" spelling mapped to DM_HMAC_256. Without one, a supplied key
+         * selects DM_HMAC_256, and with neither the result is "crc32c". The accepted
+         * names are meant to match what integritysetup uses. */
+        if (!arg_integrity_algorithm)
+                return key_file_buf ? DM_HMAC_256 : "crc32c";
+        if (streq("hmac-sha256", arg_integrity_algorithm))
+                return DM_HMAC_256;
+        return arg_integrity_algorithm;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(crypt_freep) struct crypt_device *cd = NULL;
+        char *command, *name;
+        int r;
+
+        /* Usage: attach NAME DEVICE [KEY_FILE [OPTIONS]] | detach NAME */
+        if (argv_looks_like_help(argc, argv))
+                return help();
+
+        if (argc < 3)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program requires at least two arguments.");
+
+        command = argv[1];
+        name = argv[2];
+
+        log_setup();
+        cryptsetup_enable_logging(NULL);
+        umask(0022);
+
+        if (streq(command, "detach")) {
+                if (argc > 3)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "detach has a maximum of two arguments.");
+
+                if (!filename_is_valid(name))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume name '%s' is not valid.", name);
+
+                r = crypt_init_by_name(&cd, name);
+                if (r < 0) {
+                        if (r != -ENODEV)
+                                return log_error_errno(r, "crypt_init_by_name() failed: %m");
+                        /* -ENODEV is not an error here: the volume is already inactive. */
+                        log_info("Volume %s already inactive.", name);
+                        return 0;
+                }
+
+                cryptsetup_enable_logging(cd);
+
+                r = crypt_deactivate(cd, name);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to deactivate: %m");
+
+                return 0;
+        }
+
+        if (!streq(command, "attach"))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown verb %s.", command);
+
+        /* attach name device optional_key_file optional_options */
+        _cleanup_(erase_and_freep) void *key = NULL;
+        const char *dev, *key_path, *opts;
+        crypt_status_info state;
+        size_t key_size = 0;
+
+        if (argc < 4)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "attach requires at least three arguments.");
+
+        if (argc > 6)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "attach has a maximum of five arguments.");
+
+        dev = argv[3];
+        key_path = mangle_none(argc > 4 ? argv[4] : NULL);
+        opts = mangle_none(argc > 5 ? argv[5] : NULL);
+
+        if (!filename_is_valid(name))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume name '%s' is not valid.", name);
+
+        if (key_path) {
+                r = load_key_file(key_path, &key, &key_size);
+                if (r < 0)
+                        return r;
+        }
+
+        if (opts) {
+                r = parse_integrity_options(opts, &arg_activate_flags, &arg_percent,
+                                            &arg_commit_time, &arg_existing_data_device, &arg_integrity_algorithm);
+                if (r < 0)
+                        return r;
+        }
+
+        r = crypt_init(&cd, dev);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open integrity device %s: %m", dev);
+
+        cryptsetup_enable_logging(cd);
+
+        state = crypt_status(cd, name);
+        if (state == CRYPT_ACTIVE || state == CRYPT_BUSY) {
+                log_info("Volume %s already active.", name);
+                return 0;
+        }
+
+        /* NOTE(review): commit time is divided by USEC_PER_SEC (rounded up); confirm the unit libcryptsetup expects. */
+        struct crypt_params_integrity params = {
+                .journal_watermark = arg_percent,
+                .journal_commit_time = DIV_ROUND_UP(arg_commit_time, USEC_PER_SEC),
+                .integrity = integrity_algorithm_select(key),
+        };
+        r = crypt_load(cd, CRYPT_INTEGRITY, &params);
+        if (r < 0)
+                return log_error_errno(r, "Failed to load integrity superblock: %m");
+
+        if (!isempty(arg_existing_data_device)) {
+                r = crypt_set_data_device(cd, arg_existing_data_device);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add separate data device: %m");
+        }
+
+        r = crypt_activate_by_volume_key(cd, name, key, key_size, arg_activate_flags);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set up integrity device: %m");
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/integritysetup/meson.build b/src/integritysetup/meson.build
new file mode 100644
index 0000000..6b9d78d
--- /dev/null
+++ b/src/integritysetup/meson.build
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [  # appends two entries: systemd-integritysetup and systemd-integritysetup-generator
+        libexec_template + {  # entry 1: overrides layered on libexec_template (defined outside this file)
+                'name' : 'systemd-integritysetup',
+                'conditions' : ['HAVE_LIBCRYPTSETUP'],  # same condition as the generator entry below
+                'sources' : files(
+                        'integrity-util.c',  # listed by both entries
+                        'integritysetup.c',
+                ),
+                'dependencies' : libcryptsetup,  # only this entry declares a dependency
+        },
+        generator_template + {  # entry 2: overrides layered on generator_template (defined outside this file)
+                'name' : 'systemd-integritysetup-generator',
+                'conditions' : ['HAVE_LIBCRYPTSETUP'],
+                'sources' : files(
+                        'integrity-util.c',
+                        'integritysetup-generator.c',
+                ),  # NOTE(review): no 'dependencies' key here, unlike entry 1 -- presumably intentional; confirm
+        },
+]
diff --git a/src/journal-remote/browse.html b/src/journal-remote/browse.html
new file mode 100644
index 0000000..4fe2cd8
--- /dev/null
+++ b/src/journal-remote/browse.html
@@ -0,0 +1,548 @@
+
+
+
+
+        Journal
+        
+        
+
+
+
+        
+
+        

+ +
+
+
+
+
+
+ +
+ +      + Only current boot +
+ +
+ +
+ +
+ + + + +      + + +
+ +
+ g: First Page      + ←, k, BACKSPACE: Previous Page      + →, j, SPACE: Next Page      + G: Last Page      + +: More entries      + -: Fewer entries +
+ + + + diff --git a/src/journal-remote/fuzz-journal-remote.c b/src/journal-remote/fuzz-journal-remote.c new file mode 100644 index 0000000..557100b --- /dev/null +++ b/src/journal-remote/fuzz-journal-remote.c @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fuzz.h" + +#include + +#include "sd-journal.h" + +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "journal-remote.h" +#include "logs-show.h" +#include "memfd-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "strv.h" +#include "tmpfile-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_close_ int fdin_close = -EBADF, fdout = -EBADF; + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_(unlink_and_freep) char *name = NULL; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_(journal_remote_server_destroy) RemoteServer s = {}; + void *mem; + int fdin, r; + + if (outside_size_range(size, 3, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(mkdtemp_malloc("/tmp/fuzz-journal-remote-XXXXXX", &tmp) >= 0); + assert_se(name = path_join(tmp, "fuzz-journal-remote.XXXXXX.journal")); + + fdin = fdin_close = memfd_new_and_map("fuzz-journal-remote", size, &mem); + if (fdin < 0) + return log_error_errno(fdin, "memfd_new_and_map() failed: %m"); + + memcpy(mem, data, size); + assert_se(munmap(mem, size) == 0); + + fdout = mkostemps(name, STRLEN(".journal"), O_CLOEXEC); + if (fdout < 0) + return log_error_errno(errno, "mkostemps() failed: %m"); + + /* In */ + + r = journal_remote_server_init(&s, name, JOURNAL_WRITE_SPLIT_NONE, 0); + if (r < 0) { + assert_se(IN_SET(r, -ENOMEM, -EMFILE, -ENFILE)); + return r; + } + + r = journal_remote_add_source(&s, fdin, (char*) "fuzz-data", false); + if (r < 0) + return r; + TAKE_FD(fdin_close); + assert(r > 0); + + while (s.active) + assert_se(journal_remote_handle_raw_source(NULL, fdin, 0, &s) >= 0); + + assert_se(close(fdin) < 0 && 
errno == EBADF); /* Check that the fd is closed already */ + + /* Out */ + + r = sd_journal_open_files(&j, (const char**) STRV_MAKE(name), 0); + if (r < 0) { + log_error_errno(r, "sd_journal_open_files([\"%s\"]) failed: %m", name); + assert_se(IN_SET(r, -ENOMEM, -EMFILE, -ENFILE, -ENODATA)); + return r; + } + + _cleanup_fclose_ FILE *dev_null = NULL; + if (getenv_bool("SYSTEMD_FUZZ_OUTPUT") <= 0) { + dev_null = fopen("/dev/null", "we"); + if (!dev_null) + return log_error_errno(errno, "fopen(\"/dev/null\") failed: %m"); + } + + for (OutputMode mode = 0; mode < _OUTPUT_MODE_MAX; mode++) { + if (!dev_null) + log_info("/* %s */", output_mode_to_string(mode)); + r = show_journal(dev_null ?: stdout, j, mode, 0, 0, -1, 0, NULL); + assert_se(r >= 0); + + r = sd_journal_seek_head(j); + assert_se(r >= 0); + } + + return 0; +} diff --git a/src/journal-remote/fuzz-journal-remote.options b/src/journal-remote/fuzz-journal-remote.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/journal-remote/fuzz-journal-remote.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/journal-remote/journal-gatewayd.c b/src/journal-remote/journal-gatewayd.c new file mode 100644 index 0000000..0919471 --- /dev/null +++ b/src/journal-remote/journal-gatewayd.c @@ -0,0 +1,1092 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-daemon.h" +#include "sd-journal.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "glob-util.h" +#include "hostname-util.h" +#include "log.h" +#include "logs-show.h" +#include "main-func.h" +#include "memory-util.h" +#include "microhttpd-util.h" +#include "os-util.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "sigbus.h" +#include "signal-util.h" +#include "tmpfile-util.h" + 
+#define JOURNAL_WAIT_TIMEOUT (10*USEC_PER_SEC) + +static char *arg_key_pem = NULL; +static char *arg_cert_pem = NULL; +static char *arg_trust_pem = NULL; +static bool arg_merge = false; +static int arg_journal_type = 0; +static const char *arg_directory = NULL; +static char **arg_file = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_key_pem, erase_and_freep); +STATIC_DESTRUCTOR_REGISTER(arg_cert_pem, freep); +STATIC_DESTRUCTOR_REGISTER(arg_trust_pem, freep); + +typedef struct RequestMeta { + sd_journal *journal; + + OutputMode mode; + + char *cursor; + int64_t n_skip; + uint64_t n_entries; + bool n_entries_set; + + FILE *tmp; + uint64_t delta, size; + + int argument_parse_error; + + bool follow; + bool discrete; +} RequestMeta; + +static const char* const mime_types[_OUTPUT_MODE_MAX] = { + [OUTPUT_SHORT] = "text/plain", + [OUTPUT_JSON] = "application/json", + [OUTPUT_JSON_SSE] = "text/event-stream", + [OUTPUT_JSON_SEQ] = "application/json-seq", + [OUTPUT_EXPORT] = "application/vnd.fdo.journal", +}; + +static RequestMeta *request_meta(void **connection_cls) { + RequestMeta *m; + + assert(connection_cls); + if (*connection_cls) + return *connection_cls; + + m = new0(RequestMeta, 1); + if (!m) + return NULL; + + *connection_cls = m; + return m; +} + +static void request_meta_free( + void *cls, + struct MHD_Connection *connection, + void **connection_cls, + enum MHD_RequestTerminationCode toe) { + + RequestMeta *m = *connection_cls; + + if (!m) + return; + + sd_journal_close(m->journal); + + safe_fclose(m->tmp); + + free(m->cursor); + free(m); +} + +static int open_journal(RequestMeta *m) { + assert(m); + + if (m->journal) + return 0; + + if (arg_directory) + return sd_journal_open_directory(&m->journal, arg_directory, arg_journal_type); + else if (arg_file) + return sd_journal_open_files(&m->journal, (const char**) arg_file, 0); + else + return sd_journal_open(&m->journal, (arg_merge ? 
0 : SD_JOURNAL_LOCAL_ONLY) | arg_journal_type); +} + +static int request_meta_ensure_tmp(RequestMeta *m) { + assert(m); + + if (m->tmp) + rewind(m->tmp); + else { + _cleanup_close_ int fd = -EBADF; + + fd = open_tmpfile_unlinkable("/tmp", O_RDWR|O_CLOEXEC); + if (fd < 0) + return fd; + + m->tmp = take_fdopen(&fd, "w+"); + if (!m->tmp) + return -errno; + } + + return 0; +} + +static ssize_t request_reader_entries( + void *cls, + uint64_t pos, + char *buf, + size_t max) { + + RequestMeta *m = ASSERT_PTR(cls); + dual_timestamp previous_ts = DUAL_TIMESTAMP_NULL; + sd_id128_t previous_boot_id = SD_ID128_NULL; + int r; + size_t n, k; + + assert(buf); + assert(max > 0); + assert(pos >= m->delta); + + pos -= m->delta; + + while (pos >= m->size) { + off_t sz; + + /* End of this entry, so let's serialize the next + * one */ + + if (m->n_entries_set && + m->n_entries <= 0) + return MHD_CONTENT_READER_END_OF_STREAM; + + if (m->n_skip < 0) + r = sd_journal_previous_skip(m->journal, (uint64_t) -m->n_skip + 1); + else if (m->n_skip > 0) + r = sd_journal_next_skip(m->journal, (uint64_t) m->n_skip + 1); + else + r = sd_journal_next(m->journal); + + if (r < 0) { + log_error_errno(r, "Failed to advance journal pointer: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } else if (r == 0) { + + if (m->follow) { + r = sd_journal_wait(m->journal, (uint64_t) JOURNAL_WAIT_TIMEOUT); + if (r < 0) { + log_error_errno(r, "Couldn't wait for journal event: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + if (r == SD_JOURNAL_NOP) + break; + + continue; + } + + return MHD_CONTENT_READER_END_OF_STREAM; + } + + if (m->discrete) { + assert(m->cursor); + + r = sd_journal_test_cursor(m->journal, m->cursor); + if (r < 0) { + log_error_errno(r, "Failed to test cursor: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + if (r == 0) + return MHD_CONTENT_READER_END_OF_STREAM; + } + + pos -= m->size; + m->delta += m->size; + + if (m->n_entries_set) + m->n_entries -= 1; + + m->n_skip = 0; + + r 
= request_meta_ensure_tmp(m); + if (r < 0) { + log_error_errno(r, "Failed to create temporary file: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + r = show_journal_entry(m->tmp, m->journal, m->mode, 0, OUTPUT_FULL_WIDTH, + NULL, NULL, NULL, &previous_ts, &previous_boot_id); + if (r < 0) { + log_error_errno(r, "Failed to serialize item: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + sz = ftello(m->tmp); + if (sz < 0) { + log_error_errno(errno, "Failed to retrieve file position: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + m->size = (uint64_t) sz; + } + + if (m->tmp == NULL && m->follow) + return 0; + + if (fseeko(m->tmp, pos, SEEK_SET) < 0) { + log_error_errno(errno, "Failed to seek to position: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + n = m->size - pos; + if (n < 1) + return 0; + if (n > max) + n = max; + + errno = 0; + k = fread(buf, 1, n, m->tmp); + if (k != n) { + log_error("Failed to read from file: %s", STRERROR_OR_EOF(errno)); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + return (ssize_t) k; +} + +static int request_parse_accept( + RequestMeta *m, + struct MHD_Connection *connection) { + + const char *header; + + assert(m); + assert(connection); + + header = MHD_lookup_connection_value(connection, MHD_HEADER_KIND, "Accept"); + if (!header) + return 0; + + if (streq(header, mime_types[OUTPUT_JSON])) + m->mode = OUTPUT_JSON; + else if (streq(header, mime_types[OUTPUT_JSON_SSE])) + m->mode = OUTPUT_JSON_SSE; + else if (streq(header, mime_types[OUTPUT_JSON_SEQ])) + m->mode = OUTPUT_JSON_SEQ; + else if (streq(header, mime_types[OUTPUT_EXPORT])) + m->mode = OUTPUT_EXPORT; + else + m->mode = OUTPUT_SHORT; + + return 0; +} + +static int request_parse_range( + RequestMeta *m, + struct MHD_Connection *connection) { + + const char *range, *colon, *colon2; + int r; + + assert(m); + assert(connection); + + range = MHD_lookup_connection_value(connection, MHD_HEADER_KIND, "Range"); + if (!range) + return 0; + + if 
(!startswith(range, "entries=")) + return 0; + + range += 8; + range += strspn(range, WHITESPACE); + + colon = strchr(range, ':'); + if (!colon) + m->cursor = strdup(range); + else { + const char *p; + + colon2 = strchr(colon + 1, ':'); + if (colon2) { + _cleanup_free_ char *t = NULL; + + t = strndup(colon + 1, colon2 - colon - 1); + if (!t) + return -ENOMEM; + + r = safe_atoi64(t, &m->n_skip); + if (r < 0) + return r; + } + + p = (colon2 ?: colon) + 1; + if (*p) { + r = safe_atou64(p, &m->n_entries); + if (r < 0) + return r; + + if (m->n_entries <= 0) + return -EINVAL; + + m->n_entries_set = true; + } + + m->cursor = strndup(range, colon - range); + } + + if (!m->cursor) + return -ENOMEM; + + m->cursor[strcspn(m->cursor, WHITESPACE)] = 0; + if (isempty(m->cursor)) + m->cursor = mfree(m->cursor); + + return 0; +} + +static mhd_result request_parse_arguments_iterator( + void *cls, + enum MHD_ValueKind kind, + const char *key, + const char *value) { + + RequestMeta *m = ASSERT_PTR(cls); + _cleanup_free_ char *p = NULL; + int r; + + if (isempty(key)) { + m->argument_parse_error = -EINVAL; + return MHD_NO; + } + + if (streq(key, "follow")) { + if (isempty(value)) { + m->follow = true; + return MHD_YES; + } + + r = parse_boolean(value); + if (r < 0) { + m->argument_parse_error = r; + return MHD_NO; + } + + m->follow = r; + return MHD_YES; + } + + if (streq(key, "discrete")) { + if (isempty(value)) { + m->discrete = true; + return MHD_YES; + } + + r = parse_boolean(value); + if (r < 0) { + m->argument_parse_error = r; + return MHD_NO; + } + + m->discrete = r; + return MHD_YES; + } + + if (streq(key, "boot")) { + if (isempty(value)) + r = true; + else { + r = parse_boolean(value); + if (r < 0) { + m->argument_parse_error = r; + return MHD_NO; + } + } + + if (r) { + char match[9 + 32 + 1] = "_BOOT_ID="; + sd_id128_t bid; + + r = sd_id128_get_boot(&bid); + if (r < 0) { + log_error_errno(r, "Failed to get boot ID: %m"); + return MHD_NO; + } + + sd_id128_to_string(bid, match 
+ 9); + r = sd_journal_add_match(m->journal, match, sizeof(match)-1); + if (r < 0) { + m->argument_parse_error = r; + return MHD_NO; + } + } + + return MHD_YES; + } + + p = strjoin(key, "=", strempty(value)); + if (!p) { + m->argument_parse_error = log_oom(); + return MHD_NO; + } + + r = sd_journal_add_match(m->journal, p, 0); + if (r < 0) { + m->argument_parse_error = r; + return MHD_NO; + } + + return MHD_YES; +} + +static int request_parse_arguments( + RequestMeta *m, + struct MHD_Connection *connection) { + + assert(m); + assert(connection); + + m->argument_parse_error = 0; + MHD_get_connection_values(connection, MHD_GET_ARGUMENT_KIND, request_parse_arguments_iterator, m); + + return m->argument_parse_error; +} + +static int request_handler_entries( + struct MHD_Connection *connection, + void *connection_cls) { + + _cleanup_(MHD_destroy_responsep) struct MHD_Response *response = NULL; + RequestMeta *m = ASSERT_PTR(connection_cls); + int r; + + assert(connection); + + r = open_journal(m); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to open journal: %m"); + + if (request_parse_accept(m, connection) < 0) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, "Failed to parse Accept header."); + + if (request_parse_range(m, connection) < 0) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, "Failed to parse Range header."); + + if (request_parse_arguments(m, connection) < 0) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, "Failed to parse URL arguments."); + + if (m->discrete) { + if (!m->cursor) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, "Discrete seeks require a cursor specification."); + + m->n_entries = 1; + m->n_entries_set = true; + } + + if (m->cursor) + r = sd_journal_seek_cursor(m->journal, m->cursor); + else if (m->n_skip >= 0) + r = sd_journal_seek_head(m->journal); + else if (m->n_skip < 0) + r = sd_journal_seek_tail(m->journal); + if (r < 0) + return mhd_respond(connection, 
MHD_HTTP_BAD_REQUEST, "Failed to seek in journal."); + + response = MHD_create_response_from_callback(MHD_SIZE_UNKNOWN, 4*1024, request_reader_entries, m, NULL); + if (!response) + return respond_oom(connection); + + if (MHD_add_response_header(response, "Content-Type", mime_types[m->mode]) == MHD_NO) + return respond_oom(connection); + + return MHD_queue_response(connection, MHD_HTTP_OK, response); +} + +static int output_field(FILE *f, OutputMode m, const char *d, size_t l) { + const char *eq; + size_t j; + + eq = memchr(d, '=', l); + if (!eq) + return -EINVAL; + + j = l - (eq - d + 1); + + if (m == OUTPUT_JSON) { + fprintf(f, "{ \"%.*s\" : ", (int) (eq - d), d); + json_escape(f, eq+1, j, OUTPUT_FULL_WIDTH); + fputs(" }\n", f); + } else { + fwrite(eq+1, 1, j, f); + fputc('\n', f); + } + + return 0; +} + +static ssize_t request_reader_fields( + void *cls, + uint64_t pos, + char *buf, + size_t max) { + + RequestMeta *m = ASSERT_PTR(cls); + int r; + size_t n, k; + + assert(buf); + assert(max > 0); + assert(pos >= m->delta); + + pos -= m->delta; + + while (pos >= m->size) { + off_t sz; + const void *d; + size_t l; + + /* End of this field, so let's serialize the next + * one */ + + r = sd_journal_enumerate_unique(m->journal, &d, &l); + if (r < 0) { + log_error_errno(r, "Failed to advance field index: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } else if (r == 0) + return MHD_CONTENT_READER_END_OF_STREAM; + + pos -= m->size; + m->delta += m->size; + + r = request_meta_ensure_tmp(m); + if (r < 0) { + log_error_errno(r, "Failed to create temporary file: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + r = output_field(m->tmp, m->mode, d, l); + if (r < 0) { + log_error_errno(r, "Failed to serialize item: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + sz = ftello(m->tmp); + if (sz < 0) { + log_error_errno(errno, "Failed to retrieve file position: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + m->size = (uint64_t) sz; + } + + if 
(fseeko(m->tmp, pos, SEEK_SET) < 0) { + log_error_errno(errno, "Failed to seek to position: %m"); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + n = m->size - pos; + if (n > max) + n = max; + + errno = 0; + k = fread(buf, 1, n, m->tmp); + if (k != n) { + log_error("Failed to read from file: %s", STRERROR_OR_EOF(errno)); + return MHD_CONTENT_READER_END_WITH_ERROR; + } + + return (ssize_t) k; +} + +static int request_handler_fields( + struct MHD_Connection *connection, + const char *field, + void *connection_cls) { + + _cleanup_(MHD_destroy_responsep) struct MHD_Response *response = NULL; + RequestMeta *m = ASSERT_PTR(connection_cls); + int r; + + assert(connection); + + r = open_journal(m); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to open journal: %m"); + + if (request_parse_accept(m, connection) < 0) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, "Failed to parse Accept header."); + + r = sd_journal_query_unique(m->journal, field); + if (r < 0) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, "Failed to query unique fields."); + + response = MHD_create_response_from_callback(MHD_SIZE_UNKNOWN, 4*1024, request_reader_fields, m, NULL); + if (!response) + return respond_oom(connection); + + if (MHD_add_response_header(response, "Content-Type", mime_types[m->mode == OUTPUT_JSON ? 
OUTPUT_JSON : OUTPUT_SHORT]) == MHD_NO) + return respond_oom(connection); + + return MHD_queue_response(connection, MHD_HTTP_OK, response); +} + +static int request_handler_redirect( + struct MHD_Connection *connection, + const char *target) { + + _cleanup_free_ char *page = NULL; + _cleanup_(MHD_destroy_responsep) struct MHD_Response *response = NULL; + + assert(connection); + assert(target); + + if (asprintf(&page, "Please continue to the journal browser.", target) < 0) + return respond_oom(connection); + + response = MHD_create_response_from_buffer(strlen(page), page, MHD_RESPMEM_MUST_FREE); + if (!response) + return respond_oom(connection); + TAKE_PTR(page); + + if (MHD_add_response_header(response, "Content-Type", "text/html") == MHD_NO || + MHD_add_response_header(response, "Location", target) == MHD_NO) + return respond_oom(connection); + + return MHD_queue_response(connection, MHD_HTTP_MOVED_PERMANENTLY, response); +} + +static int request_handler_file( + struct MHD_Connection *connection, + const char *path, + const char *mime_type) { + + _cleanup_(MHD_destroy_responsep) struct MHD_Response *response = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + + assert(connection); + assert(path); + assert(mime_type); + + fd = open(path, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return mhd_respondf(connection, errno, MHD_HTTP_NOT_FOUND, "Failed to open file %s: %m", path); + + if (fstat(fd, &st) < 0) + return mhd_respondf(connection, errno, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to stat file: %m"); + + response = MHD_create_response_from_fd_at_offset64(st.st_size, fd, 0); + if (!response) + return respond_oom(connection); + TAKE_FD(fd); + + if (MHD_add_response_header(response, "Content-Type", mime_type) == MHD_NO) + return respond_oom(connection); + + return MHD_queue_response(connection, MHD_HTTP_OK, response); +} + +static int get_virtualization(char **v) { + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + char *b = NULL; + int r; + + r = 
sd_bus_default_system(&bus); + if (r < 0) + return r; + + r = bus_get_property_string(bus, bus_systemd_mgr, "Virtualization", NULL, &b); + if (r < 0) + return r; + + if (isempty(b)) { + free(b); + *v = NULL; + return 0; + } + + *v = b; + return 1; +} + +static int request_handler_machine( + struct MHD_Connection *connection, + void *connection_cls) { + + _cleanup_(MHD_destroy_responsep) struct MHD_Response *response = NULL; + RequestMeta *m = ASSERT_PTR(connection_cls); + int r; + _cleanup_free_ char* hostname = NULL, *pretty_name = NULL, *os_name = NULL; + uint64_t cutoff_from = 0, cutoff_to = 0, usage = 0; + sd_id128_t mid, bid; + _cleanup_free_ char *v = NULL, *json = NULL; + + assert(connection); + + r = open_journal(m); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to open journal: %m"); + + r = sd_id128_get_machine(&mid); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to determine machine ID: %m"); + + r = sd_id128_get_boot(&bid); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to determine boot ID: %m"); + + hostname = gethostname_malloc(); + if (!hostname) + return respond_oom(connection); + + r = sd_journal_get_usage(m->journal, &usage); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to determine disk usage: %m"); + + r = sd_journal_get_cutoff_realtime_usec(m->journal, &cutoff_from, &cutoff_to); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "Failed to determine disk usage: %m"); + + (void) parse_os_release( + NULL, + "PRETTY_NAME", &pretty_name, + "NAME=", &os_name); + (void) get_virtualization(&v); + + r = asprintf(&json, + "{ \"machine_id\" : \"" SD_ID128_FORMAT_STR "\"," + "\"boot_id\" : \"" SD_ID128_FORMAT_STR "\"," + "\"hostname\" : \"%s\"," + "\"os_pretty_name\" : \"%s\"," + "\"virtualization\" : \"%s\"," + "\"usage\" : \"%"PRIu64"\"," + 
"\"cutoff_from_realtime\" : \"%"PRIu64"\"," + "\"cutoff_to_realtime\" : \"%"PRIu64"\" }\n", + SD_ID128_FORMAT_VAL(mid), + SD_ID128_FORMAT_VAL(bid), + hostname_cleanup(hostname), + os_release_pretty_name(pretty_name, os_name), + v ? v : "bare", + usage, + cutoff_from, + cutoff_to); + if (r < 0) + return respond_oom(connection); + + response = MHD_create_response_from_buffer(strlen(json), json, MHD_RESPMEM_MUST_FREE); + if (!response) + return respond_oom(connection); + TAKE_PTR(json); + + if (MHD_add_response_header(response, "Content-Type", "application/json") == MHD_NO) + return respond_oom(connection); + + return MHD_queue_response(connection, MHD_HTTP_OK, response); +} + +static mhd_result request_handler( + void *cls, + struct MHD_Connection *connection, + const char *url, + const char *method, + const char *version, + const char *upload_data, + size_t *upload_data_size, + void **connection_cls) { + int r, code; + + assert(connection); + assert(connection_cls); + assert(url); + assert(method); + + if (!streq(method, "GET")) + return mhd_respond(connection, MHD_HTTP_NOT_ACCEPTABLE, "Unsupported method."); + + if (!*connection_cls) { + if (!request_meta(connection_cls)) + return respond_oom(connection); + return MHD_YES; + } + + if (arg_trust_pem) { + r = check_permissions(connection, &code, NULL); + if (r < 0) + return code; + } + + if (streq(url, "/")) + return request_handler_redirect(connection, "/browse"); + + if (streq(url, "/entries")) + return request_handler_entries(connection, *connection_cls); + + if (startswith(url, "/fields/")) + return request_handler_fields(connection, url + 8, *connection_cls); + + if (streq(url, "/browse")) + return request_handler_file(connection, DOCUMENT_ROOT "/browse.html", "text/html"); + + if (streq(url, "/machine")) + return request_handler_machine(connection, *connection_cls); + + return mhd_respond(connection, MHD_HTTP_NOT_FOUND, "Not found."); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + 
+ r = terminal_urlify_man("systemd-journal-gatewayd.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] ...\n\n" + "HTTP server for journal events.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --cert=CERT.PEM Server certificate in PEM format\n" + " --key=KEY.PEM Server key in PEM format\n" + " --trust=CERT.PEM Certificate authority certificate in PEM format\n" + " --system Serve system journal\n" + " --user Serve the user journal for the current user\n" + " -m --merge Serve all available journals\n" + " -D --directory=PATH Serve journal files in directory\n" + " --file=PATH Serve this journal file\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_KEY, + ARG_CERT, + ARG_TRUST, + ARG_USER, + ARG_SYSTEM, + ARG_MERGE, + ARG_FILE, + }; + + int r, c; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "key", required_argument, NULL, ARG_KEY }, + { "cert", required_argument, NULL, ARG_CERT }, + { "trust", required_argument, NULL, ARG_TRUST }, + { "user", no_argument, NULL, ARG_USER }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "merge", no_argument, NULL, 'm' }, + { "directory", required_argument, NULL, 'D' }, + { "file", required_argument, NULL, ARG_FILE }, + {} + }; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hD:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_KEY: + if (arg_key_pem) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Key file specified twice"); + r = read_full_file_full( + AT_FDCWD, optarg, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &arg_key_pem, NULL); + if (r < 0) + return 
log_error_errno(r, "Failed to read key file: %m"); + assert(arg_key_pem); + break; + + case ARG_CERT: + if (arg_cert_pem) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Certificate file specified twice"); + r = read_full_file_full( + AT_FDCWD, optarg, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &arg_cert_pem, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read certificate file: %m"); + assert(arg_cert_pem); + break; + + case ARG_TRUST: +#if HAVE_GNUTLS + if (arg_trust_pem) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "CA certificate file specified twice"); + r = read_full_file_full( + AT_FDCWD, optarg, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &arg_trust_pem, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read CA certificate file: %m"); + assert(arg_trust_pem); + break; +#else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --trust= is not available."); +#endif + + case ARG_SYSTEM: + arg_journal_type |= SD_JOURNAL_SYSTEM; + break; + + case ARG_USER: + arg_journal_type |= SD_JOURNAL_CURRENT_USER; + break; + + case 'm': + arg_merge = true; + break; + + case 'D': + arg_directory = optarg; + break; + + case ARG_FILE: + r = glob_extend(&arg_file, optarg, GLOB_NOCHECK); + if (r < 0) + return log_error_errno(r, "Failed to add paths: %m"); + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program does not take arguments."); + + if (!!arg_key_pem != !!arg_cert_pem) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Certificate and key files must be specified together"); + + if (arg_trust_pem && !arg_key_pem) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "CA certificate can only be used with certificate file"); + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(MHD_stop_daemonp) struct MHD_Daemon *d = NULL; + static const struct sigaction 
sigterm = { + .sa_handler = nop_signal_handler, + .sa_flags = SA_RESTART, + }; + struct MHD_OptionItem opts[] = { + { MHD_OPTION_EXTERNAL_LOGGER, + (intptr_t) microhttpd_logger, NULL }, + { MHD_OPTION_NOTIFY_COMPLETED, + (intptr_t) request_meta_free, NULL }, + { MHD_OPTION_END, 0, NULL }, + { MHD_OPTION_END, 0, NULL }, + { MHD_OPTION_END, 0, NULL }, + { MHD_OPTION_END, 0, NULL }, + { MHD_OPTION_END, 0, NULL }, + }; + int opts_pos = 2; + + /* We force MHD_USE_ITC here, in order to make sure + * libmicrohttpd doesn't use shutdown() on our listening + * socket, which would break socket re-activation. See + * + * https://lists.gnu.org/archive/html/libmicrohttpd/2015-09/msg00014.html + * https://github.com/systemd/systemd/pull/1286 + */ + + int flags = + MHD_USE_DEBUG | + MHD_USE_DUAL_STACK | + MHD_USE_ITC | + MHD_USE_POLL_INTERNAL_THREAD | + MHD_USE_THREAD_PER_CONNECTION; + int r, n; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + sigbus_install(); + assert_se(sigaction(SIGTERM, &sigterm, NULL) >= 0); + + r = setup_gnutls_logger(NULL); + if (r < 0) + return r; + + n = sd_listen_fds(1); + if (n < 0) + return log_error_errno(n, "Failed to determine passed sockets: %m"); + if (n > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Can't listen on more than one socket."); + + if (n == 1) + opts[opts_pos++] = (struct MHD_OptionItem) + { MHD_OPTION_LISTEN_SOCKET, SD_LISTEN_FDS_START }; + + if (arg_key_pem) { + assert(arg_cert_pem); + opts[opts_pos++] = (struct MHD_OptionItem) + { MHD_OPTION_HTTPS_MEM_KEY, 0, arg_key_pem }; + opts[opts_pos++] = (struct MHD_OptionItem) + { MHD_OPTION_HTTPS_MEM_CERT, 0, arg_cert_pem }; + flags |= MHD_USE_TLS; + } + + if (arg_trust_pem) { + assert(flags & MHD_USE_TLS); + opts[opts_pos++] = (struct MHD_OptionItem) + { MHD_OPTION_HTTPS_MEM_TRUST, 0, arg_trust_pem }; + } + + d = MHD_start_daemon(flags, 19531, + NULL, NULL, + request_handler, NULL, + MHD_OPTION_ARRAY, opts, + MHD_OPTION_END); + if (!d) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to start daemon!"); + + pause(); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal-remote/journal-remote-main.c b/src/journal-remote/journal-remote-main.c new file mode 100644 index 0000000..da0f20d --- /dev/null +++ b/src/journal-remote/journal-remote-main.c @@ -0,0 +1,1161 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-daemon.h" + +#include "build.h" +#include "conf-parser.h" +#include "constants.h" +#include "daemon-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "journal-remote-write.h" +#include "journal-remote.h" +#include "main-func.h" +#include "memory-util.h" +#include "parse-argument.h" +#include "parse-helpers.h" +#include "pretty-print.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "strv.h" + +#define PRIV_KEY_FILE CERTIFICATE_ROOT "/private/journal-remote.pem" +#define CERT_FILE CERTIFICATE_ROOT "/certs/journal-remote.pem" +#define TRUST_FILE CERTIFICATE_ROOT "/ca/trusted.pem" + +static const char* arg_url = NULL; +static const char* arg_getter = NULL; +static const char* arg_listen_raw = NULL; +static const char* arg_listen_http = NULL; +static const char* arg_listen_https = NULL; +static char** arg_files = NULL; /* Do not free this. 
*/ +static bool arg_compress = true; +static bool arg_seal = false; +static int http_socket = -1, https_socket = -1; +static char** arg_gnutls_log = NULL; + +static JournalWriteSplitMode arg_split_mode = _JOURNAL_WRITE_SPLIT_INVALID; +static char *arg_output = NULL; + +static char *arg_key = NULL; +static char *arg_cert = NULL; +static char *arg_trust = NULL; +#if HAVE_GNUTLS +static bool arg_trust_all = false; +#else +static bool arg_trust_all = true; +#endif + +static uint64_t arg_max_use = UINT64_MAX; +static uint64_t arg_max_size = UINT64_MAX; +static uint64_t arg_n_max_files = UINT64_MAX; +static uint64_t arg_keep_free = UINT64_MAX; + +STATIC_DESTRUCTOR_REGISTER(arg_gnutls_log, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_key, freep); +STATIC_DESTRUCTOR_REGISTER(arg_cert, freep); +STATIC_DESTRUCTOR_REGISTER(arg_trust, freep); +STATIC_DESTRUCTOR_REGISTER(arg_output, freep); + +static const char* const journal_write_split_mode_table[_JOURNAL_WRITE_SPLIT_MAX] = { + [JOURNAL_WRITE_SPLIT_NONE] = "none", + [JOURNAL_WRITE_SPLIT_HOST] = "host", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(journal_write_split_mode, JournalWriteSplitMode); +static DEFINE_CONFIG_PARSE_ENUM(config_parse_write_split_mode, + journal_write_split_mode, + JournalWriteSplitMode, + "Failed to parse split mode setting"); + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +static int spawn_child(const char* child, char** argv) { + pid_t child_pid; + int fd[2], r; + + if (pipe(fd) < 0) + return log_error_errno(errno, "Failed to create pager pipe: %m"); + + r = safe_fork_full("(remote)", + (int[]) {STDIN_FILENO, fd[1], STDERR_FILENO }, + NULL, 0, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, &child_pid); + if (r < 0) { + safe_close_pair(fd); + return r; + } + + 
/* In the child */ + if (r == 0) { + execvp(child, argv); + log_error_errno(errno, "Failed to exec child %s: %m", child); + _exit(EXIT_FAILURE); + } + + safe_close(fd[1]); + + r = fd_nonblock(fd[0], true); + if (r < 0) + log_warning_errno(errno, "Failed to set child pipe to non-blocking: %m"); + + return fd[0]; +} + +static int spawn_curl(const char* url) { + char **argv = STRV_MAKE("curl", + "-HAccept: application/vnd.fdo.journal", + "--silent", + "--show-error", + url); + int r; + + r = spawn_child("curl", argv); + if (r < 0) + log_error_errno(r, "Failed to spawn curl: %m"); + return r; +} + +static int spawn_getter(const char *getter) { + int r; + _cleanup_strv_free_ char **words = NULL; + + assert(getter); + r = strv_split_full(&words, getter, WHITESPACE, EXTRACT_UNQUOTE); + if (r < 0) + return log_error_errno(r, "Failed to split getter option: %m"); + + r = spawn_child(words[0], words); + if (r < 0) + log_error_errno(r, "Failed to spawn getter %s: %m", getter); + + return r; +} + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +static int null_timer_event_handler(sd_event_source *s, + uint64_t usec, + void *userdata); +static int dispatch_http_event(sd_event_source *event, + int fd, + uint32_t revents, + void *userdata); + +static int request_meta(void **connection_cls, int fd, char *hostname) { + RemoteSource *source; + Writer *writer; + int r; + + assert(connection_cls); + if (*connection_cls) + return 0; + + r = journal_remote_get_writer(journal_remote_server_global, hostname, &writer); + if (r < 0) + return log_warning_errno(r, "Failed to get writer for source %s: %m", + hostname); + + source = source_new(fd, true, hostname, writer); + if (!source) { + writer_unref(writer); + return log_oom(); + } + + log_debug("Added RemoteSource as connection metadata %p", source); + + 
*connection_cls = source; + return 0; +} + +static void request_meta_free(void *cls, + struct MHD_Connection *connection, + void **connection_cls, + enum MHD_RequestTerminationCode toe) { + RemoteSource *s; + + assert(connection_cls); + s = *connection_cls; + + if (s) { + log_debug("Cleaning up connection metadata %p", s); + source_free(s); + *connection_cls = NULL; + } +} + +static int process_http_upload( + struct MHD_Connection *connection, + const char *upload_data, + size_t *upload_data_size, + RemoteSource *source) { + + bool finished = false; + size_t remaining; + int r; + + assert(source); + + log_trace("%s: connection %p, %zu bytes", + __func__, connection, *upload_data_size); + + if (*upload_data_size) { + log_trace("Received %zu bytes", *upload_data_size); + + r = journal_importer_push_data(&source->importer, + upload_data, *upload_data_size); + if (r < 0) + return mhd_respond_oom(connection); + + *upload_data_size = 0; + } else + finished = true; + + for (;;) { + r = process_source(source, journal_remote_server_global->file_flags); + if (r == -EAGAIN) + break; + if (r < 0) { + if (r == -ENOBUFS) + log_warning_errno(r, "Entry is above the maximum of %u, aborting connection %p.", + DATA_SIZE_MAX, connection); + else if (r == -E2BIG) + log_warning_errno(r, "Entry with more fields than the maximum of %u, aborting connection %p.", + ENTRY_FIELD_COUNT_MAX, connection); + else + log_warning_errno(r, "Failed to process data, aborting connection %p: %m", + connection); + return MHD_NO; + } + } + + if (!finished) + return MHD_YES; + + /* The upload is finished */ + + remaining = journal_importer_bytes_remaining(&source->importer); + if (remaining > 0) { + log_warning("Premature EOF byte. %zu bytes lost.", remaining); + return mhd_respondf(connection, + 0, MHD_HTTP_EXPECTATION_FAILED, + "Premature EOF. 
%zu bytes of trailing data not processed.", + remaining); + } + + return mhd_respond(connection, MHD_HTTP_ACCEPTED, "OK."); +}; + +static mhd_result request_handler( + void *cls, + struct MHD_Connection *connection, + const char *url, + const char *method, + const char *version, + const char *upload_data, + size_t *upload_data_size, + void **connection_cls) { + + const char *header; + int r, code, fd; + _cleanup_free_ char *hostname = NULL; + bool chunked = false; + + assert(connection); + assert(connection_cls); + assert(url); + assert(method); + + log_trace("Handling a connection %s %s %s", method, url, version); + + if (*connection_cls) + return process_http_upload(connection, + upload_data, upload_data_size, + *connection_cls); + + if (!streq(method, "POST")) + return mhd_respond(connection, MHD_HTTP_NOT_ACCEPTABLE, "Unsupported method."); + + if (!streq(url, "/upload")) + return mhd_respond(connection, MHD_HTTP_NOT_FOUND, "Not found."); + + header = MHD_lookup_connection_value(connection, MHD_HEADER_KIND, "Content-Type"); + if (!header || !streq(header, "application/vnd.fdo.journal")) + return mhd_respond(connection, MHD_HTTP_UNSUPPORTED_MEDIA_TYPE, + "Content-Type: application/vnd.fdo.journal is required."); + + header = MHD_lookup_connection_value(connection, MHD_HEADER_KIND, "Transfer-Encoding"); + if (header) { + if (!strcaseeq(header, "chunked")) + return mhd_respondf(connection, 0, MHD_HTTP_BAD_REQUEST, + "Unsupported Transfer-Encoding type: %s", header); + + chunked = true; + } + + header = MHD_lookup_connection_value(connection, MHD_HEADER_KIND, "Content-Length"); + if (header) { + size_t len; + + if (chunked) + return mhd_respond(connection, MHD_HTTP_BAD_REQUEST, + "Content-Length not allowed when Transfer-Encoding type is 'chunked'"); + + r = safe_atozu(header, &len); + if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_LENGTH_REQUIRED, + "Content-Length: %s cannot be parsed: %m", header); + + if (len > ENTRY_SIZE_MAX) + /* When serialized, an 
entry of maximum size might be slightly larger, + * so this does not correspond exactly to the limit in journald. Oh well. + */ + return mhd_respondf(connection, 0, MHD_HTTP_CONTENT_TOO_LARGE, + "Payload larger than maximum size of %u bytes", ENTRY_SIZE_MAX); + } + + { + const union MHD_ConnectionInfo *ci; + + ci = MHD_get_connection_info(connection, + MHD_CONNECTION_INFO_CONNECTION_FD); + if (!ci) { + log_error("MHD_get_connection_info failed: cannot get remote fd"); + return mhd_respond(connection, MHD_HTTP_INTERNAL_SERVER_ERROR, + "Cannot check remote address."); + } + + fd = ci->connect_fd; + assert(fd >= 0); + } + + if (journal_remote_server_global->check_trust) { + r = check_permissions(connection, &code, &hostname); + if (r < 0) + return code; + } else { + r = getpeername_pretty(fd, false, &hostname); + if (r < 0) + return mhd_respond(connection, MHD_HTTP_INTERNAL_SERVER_ERROR, + "Cannot check remote hostname."); + } + + assert(hostname); + + r = request_meta(connection_cls, fd, hostname); + if (r == -ENOMEM) + return respond_oom(connection); + else if (r < 0) + return mhd_respondf(connection, r, MHD_HTTP_INTERNAL_SERVER_ERROR, "%m"); + + hostname = NULL; + return MHD_YES; +} + +static int setup_microhttpd_server(RemoteServer *s, + int fd, + const char *key, + const char *cert, + const char *trust) { + struct MHD_OptionItem opts[] = { + { MHD_OPTION_EXTERNAL_LOGGER, (intptr_t) microhttpd_logger}, + { MHD_OPTION_NOTIFY_COMPLETED, (intptr_t) request_meta_free}, + { MHD_OPTION_LISTEN_SOCKET, fd}, + { MHD_OPTION_CONNECTION_MEMORY_LIMIT, 128*1024}, + { MHD_OPTION_END}, + { MHD_OPTION_END}, + { MHD_OPTION_END}, + { MHD_OPTION_END}, + { MHD_OPTION_END}}; + int opts_pos = 4; + int flags = + MHD_USE_DEBUG | + MHD_USE_DUAL_STACK | + MHD_USE_EPOLL | + MHD_USE_ITC; + + _cleanup_(MHDDaemonWrapper_freep) MHDDaemonWrapper *d = NULL; + const union MHD_DaemonInfo *info; + int r, epoll_fd; + + assert(fd >= 0); + + r = fd_nonblock(fd, true); + if (r < 0) + return 
log_error_errno(r, "Failed to make fd:%d nonblocking: %m", fd); + +/* MHD_OPTION_STRICT_FOR_CLIENT is introduced in microhttpd 0.9.54, + * and MHD_USE_PEDANTIC_CHECKS will be deprecated in future. + * If MHD_USE_PEDANTIC_CHECKS is '#define'd, then it is deprecated + * and we should use MHD_OPTION_STRICT_FOR_CLIENT. On the other hand, + * if MHD_USE_PEDANTIC_CHECKS is not '#define'd, then it is not + * deprecated yet and there exists an enum element with the same name. + * So we can safely use it. */ +#ifdef MHD_USE_PEDANTIC_CHECKS + opts[opts_pos++] = (struct MHD_OptionItem) + {MHD_OPTION_STRICT_FOR_CLIENT, 1}; +#else + flags |= MHD_USE_PEDANTIC_CHECKS; +#endif + + if (key) { + assert(cert); + + opts[opts_pos++] = (struct MHD_OptionItem) + {MHD_OPTION_HTTPS_MEM_KEY, 0, (char*) key}; + opts[opts_pos++] = (struct MHD_OptionItem) + {MHD_OPTION_HTTPS_MEM_CERT, 0, (char*) cert}; + + flags |= MHD_USE_TLS; + + if (trust) + opts[opts_pos++] = (struct MHD_OptionItem) + {MHD_OPTION_HTTPS_MEM_TRUST, 0, (char*) trust}; + } + + d = new(MHDDaemonWrapper, 1); + if (!d) + return log_oom(); + + d->fd = (uint64_t) fd; + + d->daemon = MHD_start_daemon(flags, 0, + NULL, NULL, + request_handler, NULL, + MHD_OPTION_ARRAY, opts, + MHD_OPTION_END); + if (!d->daemon) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to start μhttp daemon"); + + log_debug("Started MHD %s daemon on fd:%d (wrapper @ %p)", + key ? 
"HTTPS" : "HTTP", fd, d); + + info = MHD_get_daemon_info(d->daemon, MHD_DAEMON_INFO_EPOLL_FD_LINUX_ONLY); + if (!info) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "μhttp returned NULL daemon info"); + + epoll_fd = info->listen_fd; + if (epoll_fd < 0) + return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "μhttp epoll fd is invalid"); + + r = sd_event_add_io(s->events, &d->io_event, + epoll_fd, EPOLLIN, + dispatch_http_event, d); + if (r < 0) + return log_error_errno(r, "Failed to add event callback: %m"); + + r = sd_event_source_set_description(d->io_event, "io_event"); + if (r < 0) + return log_error_errno(r, "Failed to set source name: %m"); + + r = sd_event_add_time(s->events, &d->timer_event, + CLOCK_MONOTONIC, UINT64_MAX, 0, + null_timer_event_handler, d); + if (r < 0) + return log_error_errno(r, "Failed to add timer_event: %m"); + + r = sd_event_source_set_description(d->timer_event, "timer_event"); + if (r < 0) + return log_error_errno(r, "Failed to set source name: %m"); + + r = hashmap_ensure_put(&s->daemons, &uint64_hash_ops, &d->fd, d); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to add daemon to hashmap: %m"); + + TAKE_PTR(d); + s->active++; + return 0; +} + +static int setup_microhttpd_socket(RemoteServer *s, + const char *address, + const char *key, + const char *cert, + const char *trust) { + int fd; + + fd = make_socket_fd(LOG_DEBUG, address, SOCK_STREAM, SOCK_CLOEXEC); + if (fd < 0) + return fd; + + return setup_microhttpd_server(s, fd, key, cert, trust); +} + +static int null_timer_event_handler(sd_event_source *timer_event, + uint64_t usec, + void *userdata) { + return dispatch_http_event(timer_event, 0, 0, userdata); +} + +static int dispatch_http_event(sd_event_source *event, + int fd, + uint32_t revents, + void *userdata) { + MHDDaemonWrapper *d = ASSERT_PTR(userdata); + int r; + MHD_UNSIGNED_LONG_LONG timeout = ULLONG_MAX; + + r = MHD_run(d->daemon); + if (r == MHD_NO) + // FIXME: unregister 
daemon + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "MHD_run failed!"); + if (MHD_get_timeout(d->daemon, &timeout) == MHD_NO) + timeout = ULLONG_MAX; + + r = sd_event_source_set_time(d->timer_event, timeout); + if (r < 0) { + log_warning_errno(r, "Unable to set event loop timeout: %m, this may result in indefinite blocking!"); + return 1; + } + + r = sd_event_source_set_enabled(d->timer_event, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Unable to enable timer_event: %m, this may result in indefinite blocking!"); + + return 1; /* work to do */ +} + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +static int setup_signals(RemoteServer *s) { + int r; + + assert(s); + + assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, -1) >= 0); + + r = sd_event_add_signal(s->events, &s->sigterm_event, SIGTERM, NULL, s); + if (r < 0) + return r; + + r = sd_event_add_signal(s->events, &s->sigint_event, SIGINT, NULL, s); + if (r < 0) + return r; + + return 0; +} + +static int setup_raw_socket(RemoteServer *s, const char *address) { + int fd; + + fd = make_socket_fd(LOG_INFO, address, SOCK_STREAM, SOCK_CLOEXEC); + if (fd < 0) + return fd; + + return journal_remote_add_raw_socket(s, fd); +} + +static int create_remoteserver( + RemoteServer *s, + const char* key, + const char* cert, + const char* trust) { + + int r, n, fd; + + r = journal_remote_server_init( + s, + arg_output, + arg_split_mode, + (arg_compress ? JOURNAL_COMPRESS : 0) | + (arg_seal ? 
JOURNAL_SEAL : 0)); + if (r < 0) + return r; + + r = setup_signals(s); + if (r < 0) + return log_error_errno(r, "Failed to set up signals: %m"); + + n = sd_listen_fds(true); + if (n < 0) + return log_error_errno(n, "Failed to read listening file descriptors from environment: %m"); + else + log_debug("Received %d descriptors", n); + + if (MAX(http_socket, https_socket) >= SD_LISTEN_FDS_START + n) + return log_error_errno(SYNTHETIC_ERRNO(EBADFD), + "Received fewer sockets than expected"); + + for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) { + if (sd_is_socket(fd, AF_UNSPEC, 0, true)) { + log_debug("Received a listening socket (fd:%d)", fd); + + if (fd == http_socket) + r = setup_microhttpd_server(s, fd, NULL, NULL, NULL); + else if (fd == https_socket) + r = setup_microhttpd_server(s, fd, key, cert, trust); + else + r = journal_remote_add_raw_socket(s, fd); + } else if (sd_is_socket(fd, AF_UNSPEC, 0, false)) { + char *hostname; + + r = getpeername_pretty(fd, false, &hostname); + if (r < 0) + return log_error_errno(r, "Failed to retrieve remote name: %m"); + + log_debug("Received a connection socket (fd:%d) from %s", fd, hostname); + + r = journal_remote_add_source(s, fd, hostname, true); + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown socket passed on fd:%d", fd); + + if (r < 0) + return log_error_errno(r, "Failed to register socket (fd:%d): %m", fd); + } + + if (arg_getter) { + log_info("Spawning getter %s...", arg_getter); + fd = spawn_getter(arg_getter); + if (fd < 0) + return fd; + + r = journal_remote_add_source(s, fd, (char*) arg_output, false); + if (r < 0) + return r; + } + + if (arg_url) { + const char *url, *hostname; + + if (!strstr(arg_url, "/entries")) { + if (endswith(arg_url, "/")) + url = strjoina(arg_url, "entries"); + else + url = strjoina(arg_url, "/entries"); + } else + url = strdupa_safe(arg_url); + + log_info("Spawning curl %s...", url); + fd = spawn_curl(url); + if (fd < 0) + return fd; + + hostname = 
STARTSWITH_SET(arg_url, "https://", "http://"); + if (!hostname) + hostname = arg_url; + + hostname = strndupa_safe(hostname, strcspn(hostname, "/:")); + + r = journal_remote_add_source(s, fd, (char *) hostname, false); + if (r < 0) + return r; + } + + if (arg_listen_raw) { + log_debug("Listening on a socket..."); + r = setup_raw_socket(s, arg_listen_raw); + if (r < 0) + return r; + } + + if (arg_listen_http) { + r = setup_microhttpd_socket(s, arg_listen_http, NULL, NULL, NULL); + if (r < 0) + return r; + } + + if (arg_listen_https) { + r = setup_microhttpd_socket(s, arg_listen_https, key, cert, trust); + if (r < 0) + return r; + } + + STRV_FOREACH(file, arg_files) { + const char *output_name; + + if (streq(*file, "-")) { + log_debug("Using standard input as source."); + + fd = STDIN_FILENO; + output_name = "stdin"; + } else { + log_debug("Reading file %s...", *file); + + fd = open(*file, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", *file); + output_name = *file; + } + + r = journal_remote_add_source(s, fd, (char*) output_name, false); + if (r < 0) + return r; + } + + if (s->active == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Zero sources specified"); + + if (arg_split_mode == JOURNAL_WRITE_SPLIT_NONE) { + /* In this case we know what the writer will be + called, so we can create it and verify that we can + create output as expected. */ + r = journal_remote_get_writer(s, NULL, &s->_single_writer); + if (r < 0) + return log_warning_errno(r, "Failed to get writer: %m"); + } + + return 0; +} + +static int negative_fd(const char *spec) { + /* Return a non-positive number as its inverse, -EINVAL otherwise. 
*/ + + int fd, r; + + r = safe_atoi(spec, &fd); + if (r < 0) + return r; + + if (fd > 0) + return -EINVAL; + else + return -fd; +} + +static int parse_config(void) { + const ConfigTableItem items[] = { + { "Remote", "Seal", config_parse_bool, 0, &arg_seal }, + { "Remote", "SplitMode", config_parse_write_split_mode, 0, &arg_split_mode }, + { "Remote", "ServerKeyFile", config_parse_path, 0, &arg_key }, + { "Remote", "ServerCertificateFile", config_parse_path, 0, &arg_cert }, + { "Remote", "TrustedCertificateFile", config_parse_path_or_ignore, 0, &arg_trust }, + { "Remote", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use }, + { "Remote", "MaxFileSize", config_parse_iec_uint64, 0, &arg_max_size }, + { "Remote", "MaxFiles", config_parse_uint64, 0, &arg_n_max_files }, + { "Remote", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free }, + {} + }; + + return config_parse_config_file("journal-remote.conf", "Remote\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, NULL); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-journal-remote.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
{FILE|-}...\n\n" + "Write external journal events to journal file(s).\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --url=URL Read events from systemd-journal-gatewayd at URL\n" + " --getter=COMMAND Read events from the output of COMMAND\n" + " --listen-raw=ADDR Listen for connections at ADDR\n" + " --listen-http=ADDR Listen for HTTP connections at ADDR\n" + " --listen-https=ADDR Listen for HTTPS connections at ADDR\n" + " -o --output=FILE|DIR Write output to FILE or DIR/external-*.journal\n" + " --compress[=BOOL] Use compression in the output journal (default: yes)\n" + " --seal[=BOOL] Use event sealing (default: no)\n" + " --key=FILENAME SSL key in PEM format (default:\n" + " \"" PRIV_KEY_FILE "\")\n" + " --cert=FILENAME SSL certificate in PEM format (default:\n" + " \"" CERT_FILE "\")\n" + " --trust=FILENAME|all SSL CA certificate or disable checking (default:\n" + " \"" TRUST_FILE "\")\n" + " --gnutls-log=CATEGORY...\n" + " Specify a list of gnutls logging categories\n" + " --split-mode=none|host How many output files to create\n" + "\nNote: file descriptors from sd_listen_fds() will be consumed, too.\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_URL, + ARG_LISTEN_RAW, + ARG_LISTEN_HTTP, + ARG_LISTEN_HTTPS, + ARG_GETTER, + ARG_SPLIT_MODE, + ARG_COMPRESS, + ARG_SEAL, + ARG_KEY, + ARG_CERT, + ARG_TRUST, + ARG_GNUTLS_LOG, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "url", required_argument, NULL, ARG_URL }, + { "getter", required_argument, NULL, ARG_GETTER }, + { "listen-raw", required_argument, NULL, ARG_LISTEN_RAW }, + { "listen-http", required_argument, NULL, ARG_LISTEN_HTTP }, + { "listen-https", required_argument, NULL, ARG_LISTEN_HTTPS }, + { "output", required_argument, NULL, 'o' }, + { 
"split-mode", required_argument, NULL, ARG_SPLIT_MODE }, + { "compress", optional_argument, NULL, ARG_COMPRESS }, + { "seal", optional_argument, NULL, ARG_SEAL }, + { "key", required_argument, NULL, ARG_KEY }, + { "cert", required_argument, NULL, ARG_CERT }, + { "trust", required_argument, NULL, ARG_TRUST }, + { "gnutls-log", required_argument, NULL, ARG_GNUTLS_LOG }, + {} + }; + + int c, r; + bool type_a, type_b; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "ho:", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_URL: + if (arg_url) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot currently set more than one --url="); + + arg_url = optarg; + break; + + case ARG_GETTER: + if (arg_getter) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot currently use --getter= more than once"); + + arg_getter = optarg; + break; + + case ARG_LISTEN_RAW: + if (arg_listen_raw) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot currently use --listen-raw= more than once"); + + arg_listen_raw = optarg; + break; + + case ARG_LISTEN_HTTP: + if (arg_listen_http || http_socket >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot currently use --listen-http= more than once"); + + r = negative_fd(optarg); + if (r >= 0) + http_socket = r; + else + arg_listen_http = optarg; + break; + + case ARG_LISTEN_HTTPS: + if (arg_listen_https || https_socket >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot currently use --listen-https= more than once"); + + r = negative_fd(optarg); + if (r >= 0) + https_socket = r; + else + arg_listen_https = optarg; + + break; + + case ARG_KEY: + if (arg_key) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Key file specified twice"); + + arg_key = strdup(optarg); + if (!arg_key) + return log_oom(); + + break; + + case ARG_CERT: + if (arg_cert) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + 
"Certificate file specified twice"); + + arg_cert = strdup(optarg); + if (!arg_cert) + return log_oom(); + + break; + + case ARG_TRUST: +#if HAVE_GNUTLS + if (arg_trust) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use --trust more= than once"); + + arg_trust = strdup(optarg); + if (!arg_trust) + return log_oom(); +#else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --trust= is not available."); +#endif + break; + + case 'o': + if (arg_output) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use --output=/-o more than once"); + + r = parse_path_argument(optarg, /* suppress_root = */ false, &arg_output); + if (r < 0) + return r; + break; + + case ARG_SPLIT_MODE: + arg_split_mode = journal_write_split_mode_from_string(optarg); + if (arg_split_mode == _JOURNAL_WRITE_SPLIT_INVALID) + return log_error_errno(arg_split_mode, "Invalid split mode: %s", optarg); + break; + + case ARG_COMPRESS: + r = parse_boolean_argument("--compress", optarg, &arg_compress); + if (r < 0) + return r; + break; + + case ARG_SEAL: + r = parse_boolean_argument("--seal", optarg, &arg_seal); + if (r < 0) + return r; + break; + + case ARG_GNUTLS_LOG: +#if HAVE_GNUTLS + for (const char* p = optarg;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse --gnutls-log= argument: %m"); + if (r == 0) + break; + + if (strv_push(&arg_gnutls_log, word) < 0) + return log_oom(); + + word = NULL; + } + break; +#else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --gnutls-log= is not available."); +#endif + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + arg_files = argv + optind; + + type_a = arg_getter || !strv_isempty(arg_files); + type_b = arg_url + || arg_listen_raw + || arg_listen_http || arg_listen_https + || sd_listen_fds(false) > 0; + if (type_a && type_b) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot 
use file input or --getter= with " + "--listen-...= or socket activation."); + if (type_a) { + if (!arg_output) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --output= must be specified with file input or --getter=."); + + if (!IN_SET(arg_split_mode, JOURNAL_WRITE_SPLIT_NONE, _JOURNAL_WRITE_SPLIT_INVALID)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "For active sources, only --split-mode=none is allowed."); + + arg_split_mode = JOURNAL_WRITE_SPLIT_NONE; + } + + if (arg_split_mode == _JOURNAL_WRITE_SPLIT_INVALID) + arg_split_mode = JOURNAL_WRITE_SPLIT_HOST; + + if (arg_split_mode == JOURNAL_WRITE_SPLIT_NONE && arg_output) { + if (is_dir(arg_output, true) > 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "For SplitMode=none, output must be a file."); + if (!endswith(arg_output, ".journal")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "For SplitMode=none, output file name must end with .journal."); + } + + if (arg_split_mode == JOURNAL_WRITE_SPLIT_HOST + && arg_output && is_dir(arg_output, true) <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "For SplitMode=host, output must be a directory."); + + if (STRPTR_IN_SET(arg_trust, "-", "all")) { + arg_trust_all = true; + arg_trust = mfree(arg_trust); + } + + log_debug("Full config: SplitMode=%s Key=%s Cert=%s Trust=%s", + journal_write_split_mode_to_string(arg_split_mode), + strna(arg_key), + strna(arg_cert), + strna(arg_trust)); + + return 1 /* work to do */; +} + +static int load_certificates(char **key, char **cert, char **trust) { + int r; + + r = read_full_file_full( + AT_FDCWD, arg_key ?: PRIV_KEY_FILE, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET, + NULL, + key, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read key from file '%s': %m", + arg_key ?: PRIV_KEY_FILE); + + r = read_full_file_full( + AT_FDCWD, arg_cert ?: CERT_FILE, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_CONNECT_SOCKET, + NULL, + 
cert, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read certificate from file '%s': %m", + arg_cert ?: CERT_FILE); + + if (arg_trust_all) + log_info("Certificate checking disabled."); + else { + r = read_full_file_full( + AT_FDCWD, arg_trust ?: TRUST_FILE, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_CONNECT_SOCKET, + NULL, + trust, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read CA certificate file '%s': %m", + arg_trust ?: TRUST_FILE); + } + + if ((arg_listen_raw || arg_listen_http) && *trust) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --trust= makes all non-HTTPS connections untrusted."); + + return 0; +} + +static int run(int argc, char **argv) { + _cleanup_(journal_remote_server_destroy) RemoteServer s = {}; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = NULL; + _cleanup_(erase_and_freep) char *key = NULL; + _cleanup_free_ char *cert = NULL, *trust = NULL; + int r; + + log_show_color(true); + log_parse_environment(); + + /* The journal merging logic potentially needs a lot of fds. 
*/ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + sigbus_install(); + + r = parse_config(); + if (r < 0) + return r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_listen_http || arg_listen_https) { + r = setup_gnutls_logger(arg_gnutls_log); + if (r < 0) + return r; + } + + if (arg_listen_https || https_socket >= 0) { + r = load_certificates(&key, &cert, &trust); + if (r < 0) + return r; + + s.check_trust = !arg_trust_all; + } + + journal_reset_metrics(&s.metrics); + s.metrics.max_use = arg_max_use; + s.metrics.max_size = arg_max_size; + s.metrics.keep_free = arg_keep_free; + s.metrics.n_max_files = arg_n_max_files; + + r = create_remoteserver(&s, key, cert, trust); + if (r < 0) + return r; + + r = sd_event_set_watchdog(s.events, true); + if (r < 0) + return log_error_errno(r, "Failed to enable watchdog: %m"); + + log_debug("Watchdog is %sd.", enable_disable(r > 0)); + + log_debug("%s running as pid "PID_FMT, + program_invocation_short_name, getpid_cached()); + + notify_message = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + while (s.active) { + r = sd_event_get_state(s.events); + if (r < 0) + return r; + if (r == SD_EVENT_FINISHED) + break; + + r = sd_event_run(s.events, -1); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + } + + notify_message = NULL; + (void) sd_notifyf(false, + "STOPPING=1\n" + "STATUS=Shutting down after writing %" PRIu64 " entries...", s.event_count); + + log_info("Finishing after writing %" PRIu64 " entries", s.event_count); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal-remote/journal-remote-parse.c b/src/journal-remote/journal-remote-parse.c new file mode 100644 index 0000000..e23012c --- /dev/null +++ b/src/journal-remote/journal-remote-parse.c @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "journal-remote-parse.h" +#include "parse-util.h" +#include "string-util.h" + +void 
source_free(RemoteSource *source) { + if (!source) + return; + + journal_importer_cleanup(&source->importer); + + log_debug("Writer ref count %u", source->writer->n_ref); + writer_unref(source->writer); + + sd_event_source_unref(source->event); + sd_event_source_unref(source->buffer_event); + + free(source); +} + +/** + * Initialize zero-filled source with given values. On success, takes + * ownership of fd, name, and writer, otherwise does not touch them. + */ +RemoteSource* source_new(int fd, bool passive_fd, char *name, Writer *writer) { + RemoteSource *source; + + log_debug("Creating source for %sfd:%d (%s)", + passive_fd ? "passive " : "", fd, name); + + assert(fd >= 0); + + source = new0(RemoteSource, 1); + if (!source) + return NULL; + + source->importer = JOURNAL_IMPORTER_MAKE(fd); + source->importer.passive_fd = passive_fd; + source->importer.name = name; + + source->writer = writer; + + return source; +} + +int process_source(RemoteSource *source, JournalFileFlags file_flags) { + int r; + + assert(source); + assert(source->writer); + + r = journal_importer_process_data(&source->importer); + if (r <= 0) + return r; + + /* We have a full event */ + log_trace("Received full event from source@%p fd:%d (%s)", + source, source->importer.fd, source->importer.name); + + if (source->importer.iovw.count == 0) { + log_warning("Entry with no payload, skipping"); + goto freeing; + } + + assert(source->importer.iovw.iovec); + + r = writer_write(source->writer, + &source->importer.iovw, + &source->importer.ts, + &source->importer.boot_id, + file_flags); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_warning_errno(r, "Entry is invalid, ignoring."); + r = 0; + } else if (r < 0) + log_error_errno(r, "Failed to write entry of %zu bytes: %m", + iovw_size(&source->importer.iovw)); + else + r = 1; + + freeing: + journal_importer_drop_iovw(&source->importer); + return r; +} diff --git a/src/journal-remote/journal-remote-parse.h b/src/journal-remote/journal-remote-parse.h 
new file mode 100644 index 0000000..703035b --- /dev/null +++ b/src/journal-remote/journal-remote-parse.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" + +#include "journal-importer.h" +#include "journal-remote-write.h" + +typedef struct RemoteSource { + JournalImporter importer; + + Writer *writer; + + sd_event_source *event; + sd_event_source *buffer_event; +} RemoteSource; + +RemoteSource* source_new(int fd, bool passive_fd, char *name, Writer *writer); +void source_free(RemoteSource *source); +int process_source(RemoteSource *source, JournalFileFlags file_flags); diff --git a/src/journal-remote/journal-remote-write.c b/src/journal-remote/journal-remote-write.c new file mode 100644 index 0000000..d794277 --- /dev/null +++ b/src/journal-remote/journal-remote-write.c @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "journal-file-util.h" +#include "journal-remote.h" +#include "path-util.h" +#include "stat-util.h" + +static int do_rotate(JournalFile **f, MMapCache *m, JournalFileFlags file_flags) { + int r; + + r = journal_file_rotate(f, m, file_flags, UINT64_MAX, NULL); + if (r < 0) { + if (*f) + log_error_errno(r, "Failed to rotate %s: %m", (*f)->path); + else + log_error_errno(r, "Failed to create rotated journal: %m"); + } + + return r; +} + +int writer_new(RemoteServer *server, Writer **ret) { + _cleanup_(writer_unrefp) Writer *w = NULL; + int r; + + assert(server); + assert(ret); + + w = new(Writer, 1); + if (!w) + return -ENOMEM; + + *w = (Writer) { + .n_ref = 1, + .metrics = server->metrics, + .server = server, + }; + + w->mmap = mmap_cache_new(); + if (!w->mmap) + return -ENOMEM; + + if (is_dir(server->output, /* follow = */ true) > 0) { + w->output = strdup(server->output); + if (!w->output) + return -ENOMEM; + } else { + r = path_extract_directory(server->output, &w->output); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(w); + 
return 0; +} + +static Writer* writer_free(Writer *w) { + if (!w) + return NULL; + + if (w->journal) { + log_debug("Closing journal file %s.", w->journal->path); + journal_file_offline_close(w->journal); + } + + if (w->server && w->hashmap_key) + hashmap_remove(w->server->writers, w->hashmap_key); + + free(w->hashmap_key); + + if (w->mmap) + mmap_cache_unref(w->mmap); + + free(w->output); + + return mfree(w); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(Writer, writer, writer_free); + +int writer_write(Writer *w, + const struct iovec_wrapper *iovw, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + JournalFileFlags file_flags) { + int r; + + assert(w); + assert(!iovw_isempty(iovw)); + + if (journal_file_rotate_suggested(w->journal, 0, LOG_DEBUG)) { + log_info("%s: Journal header limits reached or header out-of-date, rotating", + w->journal->path); + r = do_rotate(&w->journal, w->mmap, file_flags); + if (r < 0) + return r; + r = journal_directory_vacuum(w->output, w->metrics.max_use, w->metrics.n_max_files, 0, NULL, /* verbose = */ true); + if (r < 0) + return r; + } + + r = journal_file_append_entry( + w->journal, + ts, + boot_id, + iovw->iovec, + iovw->count, + &w->seqnum, + /* seqnum_id= */ NULL, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); + if (r >= 0) { + if (w->server) + w->server->event_count += 1; + return 0; + } else if (r == -EBADMSG) + return r; + + log_debug_errno(r, "%s: Write failed, rotating: %m", w->journal->path); + r = do_rotate(&w->journal, w->mmap, file_flags); + if (r < 0) + return r; + else + log_debug("%s: Successfully rotated journal", w->journal->path); + r = journal_directory_vacuum(w->output, w->metrics.max_use, w->metrics.n_max_files, 0, NULL, /* verbose = */ true); + if (r < 0) + return r; + + log_debug("Retrying write."); + r = journal_file_append_entry( + w->journal, + ts, + boot_id, + iovw->iovec, iovw->count, + &w->seqnum, + /* seqnum_id= */ NULL, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); + if (r < 0) + return r; + 
/* State for one output journal. Reference-counted through writer_ref()/writer_unref(). */
typedef struct Writer {
        JournalFile *journal;   /* journal file that entries are appended to; closed in writer_free() */
        JournalMetrics metrics; /* initialized from server->metrics in writer_new() */
        char *output; /* directory where we write, for vacuuming */

        MMapCache *mmap;        /* allocated with mmap_cache_new() in writer_new() */
        RemoteServer *server;   /* server this writer was created for; its event_count is bumped per written entry */
        char *hashmap_key;      /* if set, the key this writer is removed from server->writers with in writer_free() */

        uint64_t seqnum;        /* passed by pointer to journal_file_append_entry() */

        unsigned n_ref;         /* reference count; starts at 1 in writer_new() */
} Writer;
"strv.h" + +#define REMOTE_JOURNAL_PATH "/var/log/journal/remote" + +#define filename_escape(s) xescape((s), "/ ") + +#if HAVE_MICROHTTPD +MHDDaemonWrapper *MHDDaemonWrapper_free(MHDDaemonWrapper *d) { + if (!d) + return NULL; + + if (d->daemon) + MHD_stop_daemon(d->daemon); + sd_event_source_unref(d->io_event); + sd_event_source_unref(d->timer_event); + + return mfree(d); +} +#endif + +static int open_output(RemoteServer *s, Writer *w, const char* host) { + _cleanup_free_ char *_filename = NULL; + const char *filename; + int r; + + assert(s); + assert(w); + + switch (s->split_mode) { + case JOURNAL_WRITE_SPLIT_NONE: + filename = s->output; + break; + + case JOURNAL_WRITE_SPLIT_HOST: { + _cleanup_free_ char *name = NULL; + + assert(host); + + name = filename_escape(host); + if (!name) + return log_oom(); + + r = asprintf(&_filename, "%s/remote-%s.journal", s->output, name); + if (r < 0) + return log_oom(); + + filename = _filename; + break; + } + + default: + assert_not_reached(); + } + + r = journal_file_open_reliably( + filename, + O_RDWR|O_CREAT, + s->file_flags, + 0640, + UINT64_MAX, + &w->metrics, + w->mmap, + NULL, + &w->journal); + if (r < 0) + return log_error_errno(r, "Failed to open output journal %s: %m", filename); + + log_debug("Opened output file %s", w->journal->path); + return 0; +} + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +static int init_writer_hashmap(RemoteServer *s) { + static const struct hash_ops* const hash_ops[] = { + [JOURNAL_WRITE_SPLIT_NONE] = NULL, + [JOURNAL_WRITE_SPLIT_HOST] = &string_hash_ops, + }; + + assert(s); + assert(s->split_mode >= 0 && s->split_mode < (int) ELEMENTSOF(hash_ops)); + + s->writers = hashmap_new(hash_ops[s->split_mode]); + if (!s->writers) + return log_oom(); + + return 0; +} + +int journal_remote_get_writer(RemoteServer *s, const 
char *host, Writer **writer) { + _cleanup_(writer_unrefp) Writer *w = NULL; + const void *key; + int r; + + assert(s); + assert(writer); + + switch (s->split_mode) { + case JOURNAL_WRITE_SPLIT_NONE: + key = "one and only"; + break; + + case JOURNAL_WRITE_SPLIT_HOST: + assert(host); + key = host; + break; + + default: + assert_not_reached(); + } + + w = hashmap_get(s->writers, key); + if (w) + writer_ref(w); + else { + r = writer_new(s, &w); + if (r < 0) + return r; + + if (s->split_mode == JOURNAL_WRITE_SPLIT_HOST) { + w->hashmap_key = strdup(key); + if (!w->hashmap_key) + return -ENOMEM; + } + + r = open_output(s, w, host); + if (r < 0) + return r; + + r = hashmap_put(s->writers, w->hashmap_key ?: key, w); + if (r < 0) + return r; + } + + *writer = TAKE_PTR(w); + return 0; +} + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +/* This should go away as soon as μhttpd allows state to be passed around. */ +RemoteServer *journal_remote_server_global; + +static int dispatch_raw_source_event(sd_event_source *event, + int fd, + uint32_t revents, + void *userdata); +static int dispatch_raw_source_until_block(sd_event_source *event, + void *userdata); +static int dispatch_blocking_source_event(sd_event_source *event, + void *userdata); +static int dispatch_raw_connection_event(sd_event_source *event, + int fd, + uint32_t revents, + void *userdata); + +static int get_source_for_fd(RemoteServer *s, + int fd, char *name, RemoteSource **source) { + Writer *writer; + int r; + + /* This takes ownership of name, but only on success. 
*/ + + assert(s); + assert(fd >= 0); + assert(source); + + if (!GREEDY_REALLOC0(s->sources, fd + 1)) + return log_oom(); + + r = journal_remote_get_writer(s, name, &writer); + if (r < 0) + return log_warning_errno(r, "Failed to get writer for source %s: %m", + name); + + if (!s->sources[fd]) { + s->sources[fd] = source_new(fd, false, name, writer); + if (!s->sources[fd]) { + writer_unref(writer); + return log_oom(); + } + + s->active++; + } + + *source = s->sources[fd]; + return 0; +} + +static int remove_source(RemoteServer *s, int fd) { + RemoteSource *source; + + assert(s); + assert(fd >= 0 && fd < (ssize_t) MALLOC_ELEMENTSOF(s->sources)); + + source = s->sources[fd]; + if (source) { + /* this closes fd too */ + source_free(source); + s->sources[fd] = NULL; + s->active--; + } + + return 0; +} + +int journal_remote_add_source(RemoteServer *s, int fd, char* name, bool own_name) { + RemoteSource *source = NULL; + int r; + + /* This takes ownership of name, even on failure, if own_name is true. */ + + assert(s); + assert(fd >= 0); + assert(name); + + if (!own_name) { + name = strdup(name); + if (!name) + return log_oom(); + } + + r = get_source_for_fd(s, fd, name, &source); + if (r < 0) { + log_error_errno(r, "Failed to create source for fd:%d (%s): %m", + fd, name); + free(name); + return r; + } + + r = sd_event_add_io(s->events, &source->event, + fd, EPOLLIN|EPOLLRDHUP|EPOLLPRI, + dispatch_raw_source_event, source); + if (r == 0) { + /* Add additional source for buffer processing. It will be + * enabled later. 
*/ + r = sd_event_add_defer(s->events, &source->buffer_event, + dispatch_raw_source_until_block, source); + if (r == 0) + r = sd_event_source_set_enabled(source->buffer_event, SD_EVENT_OFF); + } else if (r == -EPERM) { + log_debug("Falling back to sd_event_add_defer for fd:%d (%s)", fd, name); + r = sd_event_add_defer(s->events, &source->event, + dispatch_blocking_source_event, source); + if (r == 0) + r = sd_event_source_set_enabled(source->event, SD_EVENT_ON); + } + if (r < 0) { + log_error_errno(r, "Failed to register event source for fd:%d: %m", + fd); + goto error; + } + + r = sd_event_source_set_description(source->event, name); + if (r < 0) { + log_error_errno(r, "Failed to set source name for fd:%d: %m", fd); + goto error; + } + + return 1; /* work to do */ + + error: + remove_source(s, fd); + return r; +} + +int journal_remote_add_raw_socket(RemoteServer *s, int fd) { + _unused_ _cleanup_close_ int fd_ = fd; + char name[STRLEN("raw-socket-") + DECIMAL_STR_MAX(int) + 1]; + int r; + + assert(s); + assert(fd >= 0); + + r = sd_event_add_io(s->events, &s->listen_event, + fd, EPOLLIN, + dispatch_raw_connection_event, s); + if (r < 0) + return r; + + xsprintf(name, "raw-socket-%d", fd); + + r = sd_event_source_set_description(s->listen_event, name); + if (r < 0) + return r; + + TAKE_FD(fd_); + s->active++; + return 0; +} + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +int journal_remote_server_init( + RemoteServer *s, + const char *output, + JournalWriteSplitMode split_mode, + JournalFileFlags file_flags) { + + int r; + + assert(s); + + assert(journal_remote_server_global == NULL); + journal_remote_server_global = s; + + s->split_mode = split_mode; + s->file_flags = file_flags; + + if (output) + s->output = output; + else if (split_mode == JOURNAL_WRITE_SPLIT_NONE) + s->output = 
REMOTE_JOURNAL_PATH "/remote.journal"; + else if (split_mode == JOURNAL_WRITE_SPLIT_HOST) + s->output = REMOTE_JOURNAL_PATH; + else + assert_not_reached(); + + r = sd_event_default(&s->events); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = init_writer_hashmap(s); + if (r < 0) + return r; + + return 0; +} + +void journal_remote_server_destroy(RemoteServer *s) { + size_t i; + + if (!s) + return; + +#if HAVE_MICROHTTPD + hashmap_free_with_destructor(s->daemons, MHDDaemonWrapper_free); +#endif + + for (i = 0; i < MALLOC_ELEMENTSOF(s->sources); i++) + remove_source(s, i); + free(s->sources); + + writer_unref(s->_single_writer); + hashmap_free(s->writers); + + sd_event_source_unref(s->sigterm_event); + sd_event_source_unref(s->sigint_event); + sd_event_source_unref(s->listen_event); + sd_event_unref(s->events); + + if (s == journal_remote_server_global) + journal_remote_server_global = NULL; + + /* fds that we're listening on remain open... */ +} + +/********************************************************************** + ********************************************************************** + **********************************************************************/ + +int journal_remote_handle_raw_source( + sd_event_source *event, + int fd, + uint32_t revents, + RemoteServer *s) { + + RemoteSource *source; + int r; + + /* Returns 1 if there might be more data pending, + * 0 if data is currently exhausted, negative on error. + */ + + assert(s); + assert(fd >= 0 && fd < (ssize_t) MALLOC_ELEMENTSOF(s->sources)); + source = s->sources[fd]; + assert(source->importer.fd == fd); + + r = process_source(source, s->file_flags); + if (journal_importer_eof(&source->importer)) { + size_t remaining; + + log_debug("EOF reached with source %s (fd=%d)", + source->importer.name, source->importer.fd); + + remaining = journal_importer_bytes_remaining(&source->importer); + if (remaining > 0) + log_notice("Premature EOF. 
%zu bytes lost.", remaining); + remove_source(s, source->importer.fd); + log_debug("%zu active sources remaining", s->active); + return 0; + } else if (r == -E2BIG) { + log_notice("Entry with too many fields, skipped"); + return 1; + } else if (r == -ENOBUFS) { + log_notice("Entry too big, skipped"); + return 1; + } else if (r == -EAGAIN) { + return 0; + } else if (r < 0) { + log_debug_errno(r, "Closing connection: %m"); + remove_source(s, fd); + return 0; + } else + return 1; +} + +static int dispatch_raw_source_until_block(sd_event_source *event, + void *userdata) { + RemoteSource *source = ASSERT_PTR(userdata); + int r; + + assert(event); + + /* Make sure event stays around even if source is destroyed */ + sd_event_source_ref(event); + + r = journal_remote_handle_raw_source(event, source->importer.fd, EPOLLIN, journal_remote_server_global); + if (r != 1) { + int k; + + /* No more data for now */ + k = sd_event_source_set_enabled(event, SD_EVENT_OFF); + if (k < 0) + r = k; + } + + sd_event_source_unref(event); + + return r; +} + +static int dispatch_raw_source_event(sd_event_source *event, + int fd, + uint32_t revents, + void *userdata) { + RemoteSource *source = ASSERT_PTR(userdata); + int r; + + assert(source->event); + assert(source->buffer_event); + + r = journal_remote_handle_raw_source(event, fd, EPOLLIN, journal_remote_server_global); + if (r == 1) { + int k; + + /* Might have more data. We need to rerun the handler + * until we are sure the buffer is exhausted. 
*/ + k = sd_event_source_set_enabled(source->buffer_event, SD_EVENT_ON); + if (k < 0) + r = k; + } + + return r; +} + +static int dispatch_blocking_source_event(sd_event_source *event, + void *userdata) { + RemoteSource *source = ASSERT_PTR(userdata); + + return journal_remote_handle_raw_source(event, source->importer.fd, EPOLLIN, journal_remote_server_global); +} + +static int accept_connection( + const char* type, + int fd, + SocketAddress *addr, + char **hostname) { + + _cleanup_close_ int fd2 = -EBADF; + int r; + + assert(addr); + assert(hostname); + + log_debug("Accepting new %s connection on fd:%d", type, fd); + fd2 = accept4(fd, &addr->sockaddr.sa, &addr->size, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (fd2 < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return -EAGAIN; + + return log_error_errno(errno, "accept() on fd:%d failed: %m", fd); + } + + switch (socket_address_family(addr)) { + case AF_INET: + case AF_INET6: { + _cleanup_free_ char *a = NULL; + char *b; + + r = socket_address_print(addr, &a); + if (r < 0) + return log_error_errno(r, "socket_address_print(): %m"); + + r = socknameinfo_pretty(&addr->sockaddr, addr->size, &b); + if (r < 0) + return log_error_errno(r, "Resolving hostname failed: %m"); + + log_debug("Accepted %s %s connection from %s", + type, + af_to_ipv4_ipv6(socket_address_family(addr)), + a); + + *hostname = b; + return TAKE_FD(fd2); + } + + default: + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Rejected %s connection with unsupported family %d", + type, socket_address_family(addr)); + } +} + +static int dispatch_raw_connection_event( + sd_event_source *event, + int fd, + uint32_t revents, + void *userdata) { + + RemoteServer *s = ASSERT_PTR(userdata); + int fd2; + SocketAddress addr = { + .size = sizeof(union sockaddr_union), + .type = SOCK_STREAM, + }; + char *hostname = NULL; + + fd2 = accept_connection("raw", fd, &addr, &hostname); + if (fd2 == -EAGAIN) + return 0; + if (fd2 < 0) + return fd2; + + return journal_remote_add_source(s, 
fd2, hostname, true); +} diff --git a/src/journal-remote/journal-remote.conf.in b/src/journal-remote/journal-remote.conf.in new file mode 100644 index 0000000..e517569 --- /dev/null +++ b/src/journal-remote/journal-remote.conf.in @@ -0,0 +1,28 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/journal-remote.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/journal-remote.conf' to display the full config. +# +# See journal-remote.conf(5) for details. 
/* State of one journal-remote server instance. */
struct RemoteServer {
        RemoteSource **sources; /* table of sources indexed by file descriptor; grown on demand */
        size_t active;          /* incremented for each added source and listening raw socket, decremented when a source is removed */

        sd_event *events;       /* event loop, obtained with sd_event_default() in journal_remote_server_init() */
        sd_event_source *sigterm_event, *sigint_event, *listen_event; /* unreferenced in journal_remote_server_destroy() */

        Hashmap *writers;       /* writers looked up by journal_remote_get_writer(); hash ops depend on split_mode */
        Writer *_single_writer; /* unreferenced in journal_remote_server_destroy() */
        uint64_t event_count;   /* incremented by writer_write() for every entry written */

#if HAVE_MICROHTTPD
        Hashmap *daemons;       /* MHDDaemonWrapper objects, freed with MHDDaemonWrapper_free() on destroy */
#endif
        const char *output; /* either the output file or directory */

        JournalWriteSplitMode split_mode; /* set in journal_remote_server_init() */
        JournalFileFlags file_flags;      /* set in journal_remote_server_init(); passed on when opening journal files */
        bool check_trust;
        JournalMetrics metrics;           /* copied into each Writer by writer_new() */
};
journal_remote_handle_raw_source( + sd_event_source *event, + int fd, + uint32_t revents, + RemoteServer *s); + +void journal_remote_server_destroy(RemoteServer *s); diff --git a/src/journal-remote/journal-upload-journal.c b/src/journal-remote/journal-upload-journal.c new file mode 100644 index 0000000..8206ca8 --- /dev/null +++ b/src/journal-remote/journal-upload-journal.c @@ -0,0 +1,409 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "journal-upload.h" +#include "log.h" +#include "string-util.h" +#include "utf8.h" + +/** + * Write up to size bytes to buf. Return negative on error, and number of + * bytes written otherwise. The last case is a kind of an error too. + */ +static ssize_t write_entry(char *buf, size_t size, Uploader *u) { + int r; + size_t pos = 0; + + assert(size <= SSIZE_MAX); + + for (;;) { + + switch (u->entry_state) { + case ENTRY_CURSOR: { + u->current_cursor = mfree(u->current_cursor); + + r = sd_journal_get_cursor(u->journal, &u->current_cursor); + if (r < 0) + return log_error_errno(r, "Failed to get cursor: %m"); + + r = snprintf(buf + pos, size - pos, + "__CURSOR=%s\n", u->current_cursor); + assert(r >= 0); + if ((size_t) r > size - pos) + /* not enough space */ + return pos; + + u->entry_state++; + + if (pos + r == size) { + /* exactly one character short, but we don't need it */ + buf[size - 1] = '\n'; + return size; + } + + pos += r; + } + _fallthrough_; + case ENTRY_REALTIME: { + usec_t realtime; + + r = sd_journal_get_realtime_usec(u->journal, &realtime); + if (r < 0) + return log_error_errno(r, "Failed to get realtime timestamp: %m"); + + r = snprintf(buf + pos, size - pos, + "__REALTIME_TIMESTAMP="USEC_FMT"\n", realtime); + assert(r >= 0); + if ((size_t) r > size - pos) + /* not enough space */ + return pos; + + u->entry_state++; + + if (r + pos == size) { + /* exactly one character short, but we don't need it */ + buf[size - 1] = '\n'; + return 
size; + } + + pos += r; + } + _fallthrough_; + case ENTRY_MONOTONIC: { + usec_t monotonic; + sd_id128_t boot_id; + + r = sd_journal_get_monotonic_usec(u->journal, &monotonic, &boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get monotonic timestamp: %m"); + + r = snprintf(buf + pos, size - pos, + "__MONOTONIC_TIMESTAMP="USEC_FMT"\n", monotonic); + assert(r >= 0); + if ((size_t) r > size - pos) + /* not enough space */ + return pos; + + u->entry_state++; + + if (r + pos == size) { + /* exactly one character short, but we don't need it */ + buf[size - 1] = '\n'; + return size; + } + + pos += r; + } + _fallthrough_; + case ENTRY_BOOT_ID: { + sd_id128_t boot_id; + + r = sd_journal_get_monotonic_usec(u->journal, NULL, &boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get monotonic timestamp: %m"); + + r = snprintf(buf + pos, size - pos, + "_BOOT_ID=%s\n", SD_ID128_TO_STRING(boot_id)); + assert(r >= 0); + if ((size_t) r > size - pos) + /* not enough space */ + return pos; + + u->entry_state++; + + if (r + pos == size) { + /* exactly one character short, but we don't need it */ + buf[size - 1] = '\n'; + return size; + } + + pos += r; + } + _fallthrough_; + case ENTRY_NEW_FIELD: { + u->field_pos = 0; + + r = sd_journal_enumerate_data(u->journal, + &u->field_data, + &u->field_length); + if (r < 0) + return log_error_errno(r, "Failed to move to next field in entry: %m"); + else if (r == 0) { + u->entry_state = ENTRY_OUTRO; + continue; + } + + /* We already printed the boot id from the data in + * the header, hence let's suppress it here */ + if (memory_startswith(u->field_data, u->field_length, "_BOOT_ID=")) + continue; + + if (!utf8_is_printable_newline(u->field_data, u->field_length, false)) { + u->entry_state = ENTRY_BINARY_FIELD_START; + continue; + } + + u->entry_state++; + } + _fallthrough_; + case ENTRY_TEXT_FIELD: + case ENTRY_BINARY_FIELD: { + bool done; + size_t tocopy; + + done = size - pos > u->field_length - u->field_pos; + if (done) + 
tocopy = u->field_length - u->field_pos; + else + tocopy = size - pos; + + memcpy(buf + pos, + (char*) u->field_data + u->field_pos, + tocopy); + + if (done) { + buf[pos + tocopy] = '\n'; + pos += tocopy + 1; + u->entry_state = ENTRY_NEW_FIELD; + continue; + } else { + u->field_pos += tocopy; + return size; + } + } + + case ENTRY_BINARY_FIELD_START: { + const char *c; + size_t len; + + c = memchr(u->field_data, '=', u->field_length); + if (!c || c == u->field_data) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid field."); + + len = c - (const char*)u->field_data; + + /* need space for label + '\n' */ + if (size - pos < len + 1) + return pos; + + memcpy(buf + pos, u->field_data, len); + buf[pos + len] = '\n'; + pos += len + 1; + + u->field_pos = len + 1; + u->entry_state++; + } + _fallthrough_; + case ENTRY_BINARY_FIELD_SIZE: { + uint64_t le64; + + /* need space for uint64_t */ + if (size - pos < 8) + return pos; + + le64 = htole64(u->field_length - u->field_pos); + memcpy(buf + pos, &le64, 8); + pos += 8; + + u->entry_state++; + continue; + } + + case ENTRY_OUTRO: + /* need space for '\n' */ + if (size - pos < 1) + return pos; + + buf[pos++] = '\n'; + u->entry_state++; + u->entries_sent++; + + return pos; + + default: + assert_not_reached(); + } + } + assert_not_reached(); +} + +static void check_update_watchdog(Uploader *u) { + usec_t after; + usec_t elapsed_time; + + if (u->watchdog_usec <= 0) + return; + + after = now(CLOCK_MONOTONIC); + elapsed_time = usec_sub_unsigned(after, u->watchdog_timestamp); + if (elapsed_time > u->watchdog_usec / 2) { + log_debug("Update watchdog timer"); + sd_notify(false, "WATCHDOG=1"); + u->watchdog_timestamp = after; + } +} + +static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void *userp) { + Uploader *u = ASSERT_PTR(userp); + int r; + sd_journal *j; + size_t filled = 0; + ssize_t w; + + assert(nmemb <= SSIZE_MAX / size); + + check_update_watchdog(u); + + j = u->journal; + + while (j && filled 
< size * nmemb) { + if (u->entry_state == ENTRY_DONE) { + r = sd_journal_next(j); + if (r < 0) { + log_error_errno(r, "Failed to move to next entry in journal: %m"); + return CURL_READFUNC_ABORT; + } else if (r == 0) { + if (u->input_event) + log_debug("No more entries, waiting for journal."); + else { + log_info("No more entries, closing journal."); + close_journal_input(u); + } + + u->uploading = false; + + break; + } + + u->entry_state = ENTRY_CURSOR; + } + + w = write_entry((char*)buf + filled, size * nmemb - filled, u); + if (w < 0) + return CURL_READFUNC_ABORT; + filled += w; + + if (filled == 0) { + log_error("Buffer space is too small to write entry."); + return CURL_READFUNC_ABORT; + } else if (u->entry_state != ENTRY_DONE) + /* This means that all available space was used up */ + break; + + log_debug("Entry %zu (%s) has been uploaded.", + u->entries_sent, u->current_cursor); + } + + return filled; +} + +void close_journal_input(Uploader *u) { + assert(u); + + if (u->journal) { + log_debug("Closing journal input."); + + sd_journal_close(u->journal); + u->journal = NULL; + } + u->timeout = 0; +} + +static int process_journal_input(Uploader *u, int skip) { + int r; + + if (u->uploading) + return 0; + + r = sd_journal_next_skip(u->journal, skip); + if (r < 0) + return log_error_errno(r, "Failed to skip to next entry: %m"); + else if (r < skip) + return 0; + + /* have data */ + u->entry_state = ENTRY_CURSOR; + return start_upload(u, journal_input_callback, u); +} + +int check_journal_input(Uploader *u) { + if (u->input_event) { + int r; + + r = sd_journal_process(u->journal); + if (r < 0) { + log_error_errno(r, "Failed to process journal: %m"); + close_journal_input(u); + return r; + } + + if (r == SD_JOURNAL_NOP) + return 0; + } + + return process_journal_input(u, 1); +} + +static int dispatch_journal_input(sd_event_source *event, + int fd, + uint32_t revents, + void *userp) { + Uploader *u = ASSERT_PTR(userp); + + if (u->uploading) + return 0; + + 
log_debug("Detected journal input, checking for new data."); + return check_journal_input(u); +} + +int open_journal_for_upload(Uploader *u, + sd_journal *j, + const char *cursor, + bool after_cursor, + bool follow) { + int fd, r, events; + + u->journal = j; + + sd_journal_set_data_threshold(j, 0); + + if (follow) { + fd = sd_journal_get_fd(j); + if (fd < 0) + return log_error_errno(fd, "sd_journal_get_fd failed: %m"); + + events = sd_journal_get_events(j); + + r = sd_journal_reliable_fd(j); + assert(r >= 0); + if (r > 0) + u->timeout = -1; + else + u->timeout = JOURNAL_UPLOAD_POLL_TIMEOUT; + + r = sd_event_add_io(u->events, &u->input_event, + fd, events, dispatch_journal_input, u); + if (r < 0) + return log_error_errno(r, "Failed to register input event: %m"); + + log_debug("Listening for journal events on fd:%d, timeout %d", + fd, u->timeout == UINT64_MAX ? -1 : (int) u->timeout); + } else + log_debug("Not listening for journal events."); + + if (cursor) { + r = sd_journal_seek_cursor(j, cursor); + if (r < 0) + return log_error_errno(r, "Failed to seek to cursor %s: %m", + cursor); + } + + return process_journal_input(u, !!after_cursor); +} diff --git a/src/journal-remote/journal-upload.c b/src/journal-remote/journal-upload.c new file mode 100644 index 0000000..db74355 --- /dev/null +++ b/src/journal-remote/journal-upload.c @@ -0,0 +1,877 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "build.h" +#include "conf-parser.h" +#include "constants.h" +#include "daemon-util.h" +#include "env-file.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "journal-upload.h" +#include "journal-util.h" +#include "log.h" +#include "main-func.h" +#include "mkdir.h" +#include "parse-argument.h" +#include "parse-helpers.h" +#include "pretty-print.h" +#include "process-util.h" 
+#include "rlimit-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "version.h" + +#define PRIV_KEY_FILE CERTIFICATE_ROOT "/private/journal-upload.pem" +#define CERT_FILE CERTIFICATE_ROOT "/certs/journal-upload.pem" +#define TRUST_FILE CERTIFICATE_ROOT "/ca/trusted.pem" +#define DEFAULT_PORT 19532 + +static const char* arg_url = NULL; +static const char *arg_key = NULL; +static const char *arg_cert = NULL; +static const char *arg_trust = NULL; +static const char *arg_directory = NULL; +static char **arg_file = NULL; +static const char *arg_cursor = NULL; +static bool arg_after_cursor = false; +static int arg_journal_type = 0; +static int arg_namespace_flags = 0; +static const char *arg_machine = NULL; +static const char *arg_namespace = NULL; +static bool arg_merge = false; +static int arg_follow = -1; +static const char *arg_save_state = NULL; +static usec_t arg_network_timeout_usec = USEC_INFINITY; + +static void close_fd_input(Uploader *u); + +#define SERVER_ANSWER_KEEP 2048 + +#define STATE_FILE "/var/lib/systemd/journal-upload/state" + +#define easy_setopt(curl, opt, value, level, cmd) \ + do { \ + code = curl_easy_setopt(curl, opt, value); \ + if (code) { \ + log_full(level, \ + "curl_easy_setopt " #opt " failed: %s", \ + curl_easy_strerror(code)); \ + cmd; \ + } \ + } while (0) + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(CURL*, curl_easy_cleanup, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct curl_slist*, curl_slist_free_all, NULL); + +static size_t output_callback(char *buf, + size_t size, + size_t nmemb, + void *userp) { + Uploader *u = ASSERT_PTR(userp); + + log_debug("The server answers (%zu bytes): %.*s", + size*nmemb, (int)(size*nmemb), buf); + + if (nmemb && !u->answer) { + u->answer = strndup(buf, size*nmemb); + if (!u->answer) + log_warning("Failed to store server answer (%zu bytes): out of memory", size*nmemb); + } + + return size * nmemb; +} + +static int 
check_cursor_updating(Uploader *u) { + _cleanup_free_ char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + if (!u->state_file) + return 0; + + r = mkdir_parents(u->state_file, 0755); + if (r < 0) + return log_error_errno(r, "Cannot create parent directory of state file %s: %m", + u->state_file); + + r = fopen_temporary(u->state_file, &f, &temp_path); + if (r < 0) + return log_error_errno(r, "Cannot save state to %s: %m", + u->state_file); + (void) unlink(temp_path); + + return 0; +} + +static int update_cursor_state(Uploader *u) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + if (!u->state_file || !u->last_cursor) + return 0; + + r = fopen_temporary(u->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + fprintf(f, + "# This is private data. Do not parse.\n" + "LAST_CURSOR=%s\n", + u->last_cursor); + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, u->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + return 0; + +fail: + (void) unlink(u->state_file); + + return log_error_errno(r, "Failed to save state %s: %m", u->state_file); +} + +static int load_cursor_state(Uploader *u) { + int r; + + if (!u->state_file) + return 0; + + r = parse_env_file(NULL, u->state_file, "LAST_CURSOR", &u->last_cursor); + if (r == -ENOENT) + log_debug("State file %s is not present.", u->state_file); + else if (r < 0) + return log_error_errno(r, "Failed to read state file %s: %m", + u->state_file); + else + log_debug("Last cursor was %s", u->last_cursor); + + return 0; +} + +int start_upload(Uploader *u, + size_t (*input_callback)(void *ptr, + size_t size, + size_t nmemb, + void *userdata), + void *data) { + CURLcode code; + + assert(u); + assert(input_callback); + + if (!u->header) { + _cleanup_(curl_slist_free_allp) struct curl_slist *h = NULL; + struct curl_slist *l; + + h = curl_slist_append(NULL, "Content-Type: application/vnd.fdo.journal"); 
+ if (!h) + return log_oom(); + + l = curl_slist_append(h, "Transfer-Encoding: chunked"); + if (!l) + return log_oom(); + h = l; + + l = curl_slist_append(h, "Accept: text/plain"); + if (!l) + return log_oom(); + h = l; + + u->header = TAKE_PTR(h); + } + + if (!u->easy) { + _cleanup_(curl_easy_cleanupp) CURL *curl = NULL; + + curl = curl_easy_init(); + if (!curl) + return log_error_errno(SYNTHETIC_ERRNO(ENOSR), + "Call to curl_easy_init failed."); + + /* If configured, set a timeout for the curl operation. */ + if (arg_network_timeout_usec != USEC_INFINITY) + easy_setopt(curl, CURLOPT_TIMEOUT, + (long) DIV_ROUND_UP(arg_network_timeout_usec, USEC_PER_SEC), + LOG_ERR, return -EXFULL); + + /* tell it to POST to the URL */ + easy_setopt(curl, CURLOPT_POST, 1L, + LOG_ERR, return -EXFULL); + + easy_setopt(curl, CURLOPT_ERRORBUFFER, u->error, + LOG_ERR, return -EXFULL); + + /* set where to write to */ + easy_setopt(curl, CURLOPT_WRITEFUNCTION, output_callback, + LOG_ERR, return -EXFULL); + + easy_setopt(curl, CURLOPT_WRITEDATA, data, + LOG_ERR, return -EXFULL); + + /* set where to read from */ + easy_setopt(curl, CURLOPT_READFUNCTION, input_callback, + LOG_ERR, return -EXFULL); + + easy_setopt(curl, CURLOPT_READDATA, data, + LOG_ERR, return -EXFULL); + + /* use our special own mime type and chunked transfer */ + easy_setopt(curl, CURLOPT_HTTPHEADER, u->header, + LOG_ERR, return -EXFULL); + + if (DEBUG_LOGGING) + /* enable verbose for easier tracing */ + easy_setopt(curl, CURLOPT_VERBOSE, 1L, LOG_WARNING, ); + + easy_setopt(curl, CURLOPT_USERAGENT, + "systemd-journal-upload " GIT_VERSION, + LOG_WARNING, ); + + if (!streq_ptr(arg_key, "-") && (arg_key || startswith(u->url, "https://"))) { + easy_setopt(curl, CURLOPT_SSLKEY, arg_key ?: PRIV_KEY_FILE, + LOG_ERR, return -EXFULL); + easy_setopt(curl, CURLOPT_SSLCERT, arg_cert ?: CERT_FILE, + LOG_ERR, return -EXFULL); + } + + if (STRPTR_IN_SET(arg_trust, "-", "all")) + easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0, + LOG_ERR, 
return -EUCLEAN); + else if (arg_trust || startswith(u->url, "https://")) + easy_setopt(curl, CURLOPT_CAINFO, arg_trust ?: TRUST_FILE, + LOG_ERR, return -EXFULL); + + if (arg_key || arg_trust) + easy_setopt(curl, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1, + LOG_WARNING, ); + + u->easy = TAKE_PTR(curl); + } else { + /* truncate the potential old error message */ + u->error[0] = '\0'; + + u->answer = mfree(u->answer); + } + + /* upload to this place */ + code = curl_easy_setopt(u->easy, CURLOPT_URL, u->url); + if (code) + return log_error_errno(SYNTHETIC_ERRNO(EXFULL), + "curl_easy_setopt CURLOPT_URL failed: %s", + curl_easy_strerror(code)); + + u->uploading = true; + + return 0; +} + +static size_t fd_input_callback(void *buf, size_t size, size_t nmemb, void *userp) { + Uploader *u = ASSERT_PTR(userp); + ssize_t n; + + assert(nmemb < SSIZE_MAX / size); + + if (u->input < 0) + return 0; + + assert(!size_multiply_overflow(size, nmemb)); + + n = read(u->input, buf, size * nmemb); + log_debug("%s: allowed %zu, read %zd", __func__, size*nmemb, n); + if (n > 0) + return n; + + u->uploading = false; + if (n < 0) { + log_error_errno(errno, "Aborting transfer after read error on input: %m."); + return CURL_READFUNC_ABORT; + } + + log_debug("Reached EOF"); + close_fd_input(u); + return 0; +} + +static void close_fd_input(Uploader *u) { + assert(u); + + u->input = safe_close(u->input); + u->timeout = 0; +} + +static int dispatch_fd_input(sd_event_source *event, + int fd, + uint32_t revents, + void *userp) { + Uploader *u = ASSERT_PTR(userp); + + assert(fd >= 0); + + if (revents & EPOLLHUP) { + log_debug("Received HUP"); + close_fd_input(u); + return 0; + } + + if (!(revents & EPOLLIN)) { + log_warning("Unexpected poll event %"PRIu32".", revents); + return -EINVAL; + } + + if (u->uploading) { + log_warning("dispatch_fd_input called when uploading, ignoring."); + return 0; + } + + return start_upload(u, fd_input_callback, u); +} + +static int open_file_for_upload(Uploader *u, 
const char *filename) { + int fd, r = 0; + + if (streq(filename, "-")) + fd = STDIN_FILENO; + else { + fd = open(filename, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", filename); + } + + u->input = fd; + + if (arg_follow != 0) { + r = sd_event_add_io(u->events, &u->input_event, + fd, EPOLLIN, dispatch_fd_input, u); + if (r < 0) { + if (r != -EPERM || arg_follow > 0) + return log_error_errno(r, "Failed to register input event: %m"); + + /* Normal files should just be consumed without polling. */ + r = start_upload(u, fd_input_callback, u); + } + } + + return r; +} + +static int dispatch_sigterm(sd_event_source *event, + const struct signalfd_siginfo *si, + void *userdata) { + Uploader *u = ASSERT_PTR(userdata); + + log_received_signal(LOG_INFO, si); + + close_fd_input(u); + close_journal_input(u); + + sd_event_exit(u->events, 0); + return 0; +} + +static int setup_signals(Uploader *u) { + int r; + + assert(u); + + assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, -1) >= 0); + + r = sd_event_add_signal(u->events, &u->sigterm_event, SIGTERM, dispatch_sigterm, u); + if (r < 0) + return r; + + r = sd_event_add_signal(u->events, &u->sigint_event, SIGINT, dispatch_sigterm, u); + if (r < 0) + return r; + + return 0; +} + +static int setup_uploader(Uploader *u, const char *url, const char *state_file) { + int r; + const char *host, *proto = ""; + + assert(u); + assert(url); + + *u = (Uploader) { + .input = -1, + }; + + host = STARTSWITH_SET(url, "http://", "https://"); + if (!host) { + host = url; + proto = "https://"; + } + + if (strchr(host, ':')) + u->url = strjoin(proto, url, "/upload"); + else { + char *t; + size_t x; + + t = strdupa_safe(url); + x = strlen(t); + while (x > 0 && t[x - 1] == '/') + t[x - 1] = '\0'; + + u->url = strjoin(proto, t, ":" STRINGIFY(DEFAULT_PORT), "/upload"); + } + if (!u->url) + return log_oom(); + + u->state_file = state_file; + + r = sd_event_default(&u->events); + if (r < 
0) + return log_error_errno(r, "sd_event_default failed: %m"); + + r = setup_signals(u); + if (r < 0) + return log_error_errno(r, "Failed to set up signals: %m"); + + (void) sd_watchdog_enabled(false, &u->watchdog_usec); + + return load_cursor_state(u); +} + /* Release what the Uploader holds: the curl handle and header list, the answer, cursor and url strings, the input event source and input fd, the journal input, both signal event sources and the event loop reference. u->state_file is not freed here. */ +static void destroy_uploader(Uploader *u) { + assert(u); + + curl_easy_cleanup(u->easy); + curl_slist_free_all(u->header); + free(u->answer); + + free(u->last_cursor); + free(u->current_cursor); + + free(u->url); + + u->input_event = sd_event_source_unref(u->input_event); + + close_fd_input(u); + close_journal_input(u); + + sd_event_source_unref(u->sigterm_event); + sd_event_source_unref(u->sigint_event); + sd_event_unref(u->events); +} + /* Record a watchdog timestamp, run the transfer with curl_easy_perform() and check the HTTP status. Returns -EIO when the transfer fails or the status is outside 200..299, and -EUCLEAN when the status cannot be retrieved. On success free_and_replace() is applied to last_cursor/current_cursor and the result of update_cursor_state() is returned. */ +static int perform_upload(Uploader *u) { + CURLcode code; + long status; + + assert(u); + + u->watchdog_timestamp = now(CLOCK_MONOTONIC); + code = curl_easy_perform(u->easy); + if (code) { + /* prefer the message curl wrote into the CURLOPT_ERRORBUFFER buffer, if any */ + if (u->error[0]) + log_error("Upload to %s failed: %.*s", + u->url, (int) sizeof(u->error), u->error); + else + log_error("Upload to %s failed: %s", + u->url, curl_easy_strerror(code)); + return -EIO; + } + + code = curl_easy_getinfo(u->easy, CURLINFO_RESPONSE_CODE, &status); + if (code) + return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), + "Failed to retrieve response code: %s", + curl_easy_strerror(code)); + + if (status >= 300) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Upload to %s failed with code %ld: %s", + u->url, status, strna(u->answer)); + else if (status < 200) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Upload to %s finished with unexpected code %ld: %s", + u->url, status, strna(u->answer)); + else + log_debug("Upload finished successfully with code %ld: %s", + status, strna(u->answer)); + + free_and_replace(u->last_cursor, u->current_cursor); + + return update_cursor_state(u); +} + +static int parse_config(void) { + const ConfigTableItem items[] = { + { "Upload", "URL", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_url }, + { "Upload", "ServerKeyFile", 
config_parse_path_or_ignore, 0, &arg_key }, + { "Upload", "ServerCertificateFile", config_parse_path_or_ignore, 0, &arg_cert }, + { "Upload", "TrustedCertificateFile", config_parse_path_or_ignore, 0, &arg_trust }, + { "Upload", "NetworkTimeoutSec", config_parse_sec, 0, &arg_network_timeout_usec }, + {} + }; + + return config_parse_config_file("journal-upload.conf", "Upload\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, NULL); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-journal-upload.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s -u URL {FILE|-}...\n\n" + "Upload journal events to a remote server.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -u --url=URL Upload to this address (default port " + STRINGIFY(DEFAULT_PORT) ")\n" + " --key=FILENAME Specify key in PEM format (default:\n" + " \"" PRIV_KEY_FILE "\")\n" + " --cert=FILENAME Specify certificate in PEM format (default:\n" + " \"" CERT_FILE "\")\n" + " --trust=FILENAME|all Specify CA certificate or disable checking (default:\n" + " \"" TRUST_FILE "\")\n" + " --system Use the system journal\n" + " --user Use the user journal for the current user\n" + " -m --merge Use all available journals\n" + " -M --machine=CONTAINER Operate on local container\n" + " --namespace=NAMESPACE Use journal files from namespace\n" + " -D --directory=PATH Use journal files from directory\n" + " --file=PATH Use this journal file\n" + " --cursor=CURSOR Start at the specified cursor\n" + " --after-cursor=CURSOR Start after the specified cursor\n" + " --follow[=BOOL] Do [not] wait for input\n" + " --save-state[=FILE] Save uploaded cursors (default \n" + " " STATE_FILE ")\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_KEY, + ARG_CERT, + ARG_TRUST, + ARG_USER, + 
ARG_SYSTEM, + ARG_FILE, + ARG_CURSOR, + ARG_AFTER_CURSOR, + ARG_FOLLOW, + ARG_SAVE_STATE, + ARG_NAMESPACE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "url", required_argument, NULL, 'u' }, + { "key", required_argument, NULL, ARG_KEY }, + { "cert", required_argument, NULL, ARG_CERT }, + { "trust", required_argument, NULL, ARG_TRUST }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "user", no_argument, NULL, ARG_USER }, + { "merge", no_argument, NULL, 'm' }, + { "machine", required_argument, NULL, 'M' }, + { "namespace", required_argument, NULL, ARG_NAMESPACE }, + { "directory", required_argument, NULL, 'D' }, + { "file", required_argument, NULL, ARG_FILE }, + { "cursor", required_argument, NULL, ARG_CURSOR }, + { "after-cursor", required_argument, NULL, ARG_AFTER_CURSOR }, + { "follow", optional_argument, NULL, ARG_FOLLOW }, + { "save-state", optional_argument, NULL, ARG_SAVE_STATE }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + opterr = 0; + + while ((c = getopt_long(argc, argv, "hu:mM:D:", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'u': + if (arg_url) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --url="); + + arg_url = optarg; + break; + + case ARG_KEY: + if (arg_key) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --key="); + + arg_key = optarg; + break; + + case ARG_CERT: + if (arg_cert) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --cert="); + + arg_cert = optarg; + break; + + case ARG_TRUST: + if (arg_trust) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --trust="); + + arg_trust = optarg; + break; + + case ARG_SYSTEM: + arg_journal_type |= SD_JOURNAL_SYSTEM; + break; + + case ARG_USER: + arg_journal_type |= SD_JOURNAL_CURRENT_USER; + break; 
+ + case 'm': + arg_merge = true; + break; + + case 'M': + if (arg_machine) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --machine=/-M"); + + arg_machine = optarg; + break; + + case ARG_NAMESPACE: + if (streq(optarg, "*")) { + arg_namespace_flags = SD_JOURNAL_ALL_NAMESPACES; + arg_namespace = NULL; + } else if (startswith(optarg, "+")) { + arg_namespace_flags = SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE; + arg_namespace = optarg + 1; + } else if (isempty(optarg)) { + arg_namespace_flags = 0; + arg_namespace = NULL; + } else { + arg_namespace_flags = 0; + arg_namespace = optarg; + } + + break; + + case 'D': + if (arg_directory) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --directory=/-D"); + + arg_directory = optarg; + break; + + case ARG_FILE: + r = glob_extend(&arg_file, optarg, GLOB_NOCHECK); + if (r < 0) + return log_error_errno(r, "Failed to add paths: %m"); + break; + + case ARG_CURSOR: + if (arg_cursor) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --cursor=/--after-cursor="); + + arg_cursor = optarg; + break; + + case ARG_AFTER_CURSOR: + if (arg_cursor) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot use more than one --cursor=/--after-cursor="); + + arg_cursor = optarg; + arg_after_cursor = true; + break; + + case ARG_FOLLOW: + r = parse_boolean_argument("--follow", optarg, NULL); + if (r < 0) + return r; + arg_follow = r; + break; + + case ARG_SAVE_STATE: + arg_save_state = optarg ?: STATE_FILE; + break; + + case '?': + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown option %s.", + argv[optind - 1]); + + case ':': + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Missing argument to %s.", + argv[optind - 1]); + + default: + assert_not_reached(); + } + + if (!arg_url) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Required --url=/-u option missing."); + + if (!!arg_key != !!arg_cert) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + 
"Options --key= and --cert= must be used together."); + + if (optind < argc && (arg_directory || arg_file || arg_machine || arg_journal_type)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Input arguments make no sense with journal input."); + + return 1; +} + /* Open the journal selected on the command line: a directory, a set of files, a machine, or otherwise the journal of the configured namespace. Logs on failure and returns the result of the sd-journal call. */ +static int open_journal(sd_journal **j) { + int r; + + assert(j); + + if (arg_directory) + r = sd_journal_open_directory(j, arg_directory, arg_journal_type); + else if (arg_file) + r = sd_journal_open_files(j, (const char**) arg_file, 0); + else if (arg_machine) + r = journal_open_machine(j, arg_machine); + else + r = sd_journal_open_namespace(j, arg_namespace, + (arg_merge ? 0 : SD_JOURNAL_LOCAL_ONLY) | arg_namespace_flags | arg_journal_type); + if (r < 0) + log_error_errno(r, "Failed to open %s: %m", + arg_directory ?: (arg_file ? "files" : "journal")); + return r; +} + /* Main routine: parse configuration and command line, set up the uploader, then feed either the journal or the files named on the command line to the upload loop. */ +static int run(int argc, char **argv) { + _cleanup_(destroy_uploader) Uploader u = { .input = -1 }; /* destroy_uploader() also runs on returns taken before setup_uploader() has initialized u; with a zeroed .input, close_fd_input() would close fd 0 */ + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = NULL; + bool use_journal; + int r; + + log_show_color(true); + log_parse_environment(); + + /* The journal merging logic potentially needs a lot of fds. */ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + r = parse_config(); + if (r < 0) + return r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + sigbus_install(); + + r = setup_uploader(&u, arg_url, arg_save_state); + if (r < 0) + return r; + + sd_event_set_watchdog(u.events, true); + + r = check_cursor_updating(&u); + if (r < 0) + return r; + + log_debug("%s running as pid "PID_FMT, + program_invocation_short_name, getpid_cached()); + + use_journal = optind >= argc; + if (use_journal) { + sd_journal *j; + r = open_journal(&j); + if (r < 0) + return r; + r = open_journal_for_upload(&u, j, + arg_cursor ?: u.last_cursor, + arg_cursor ? 
arg_after_cursor : true, + arg_follow != 0); + if (r < 0) + return r; + } + + notify_message = notify_start("READY=1\n" + "STATUS=Processing input...", + NOTIFY_STOPPING); + + for (;;) { + r = sd_event_get_state(u.events); + if (r < 0) + return r; + if (r == SD_EVENT_FINISHED) + return 0; + + if (use_journal) { + if (!u.journal) + return 0; + + r = check_journal_input(&u); + } else if (u.input < 0 && !use_journal) { + if (optind >= argc) + return 0; + + log_debug("Using %s as input.", argv[optind]); + r = open_file_for_upload(&u, argv[optind++]); + } + if (r < 0) + return r; + + if (u.uploading) { + r = perform_upload(&u); + if (r < 0) + return r; + } + + r = sd_event_run(u.events, u.timeout); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + } +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal-remote/journal-upload.conf.in b/src/journal-remote/journal-upload.conf.in new file mode 100644 index 0000000..41c53b6 --- /dev/null +++ b/src/journal-remote/journal-upload.conf.in @@ -0,0 +1,23 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/journal-upload.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/journal-upload.conf' to display the full config. +# +# See journal-upload.conf(5) for details. 
+ +[Upload] +# URL= +# ServerKeyFile={{CERTIFICATE_ROOT}}/private/journal-upload.pem +# ServerCertificateFile={{CERTIFICATE_ROOT}}/certs/journal-upload.pem +# TrustedCertificateFile={{CERTIFICATE_ROOT}}/ca/trusted.pem diff --git a/src/journal-remote/journal-upload.h b/src/journal-remote/journal-upload.h new file mode 100644 index 0000000..9ff5a7b --- /dev/null +++ b/src/journal-remote/journal-upload.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include + +#include "sd-event.h" +#include "sd-journal.h" + +#include "time-util.h" + +typedef enum { + ENTRY_CURSOR = 0, /* Nothing actually written yet. */ + ENTRY_REALTIME, + ENTRY_MONOTONIC, + ENTRY_BOOT_ID, + ENTRY_NEW_FIELD, /* In between fields. */ + ENTRY_TEXT_FIELD, /* In the middle of a text field. */ + ENTRY_BINARY_FIELD_START, /* Writing the name of a binary field. */ + ENTRY_BINARY_FIELD_SIZE, /* Writing the size of a binary field. */ + ENTRY_BINARY_FIELD, /* In the middle of a binary field. */ + ENTRY_OUTRO, /* Writing '\n' */ + ENTRY_DONE, /* Need to move to a new field. 
*/ +} entry_state; + +typedef struct Uploader { + sd_event *events; + sd_event_source *sigint_event, *sigterm_event; + + char *url; + CURL *easy; + bool uploading; + char error[CURL_ERROR_SIZE]; + struct curl_slist *header; + char *answer; + + sd_event_source *input_event; + uint64_t timeout; + + /* fd stuff */ + int input; + + /* journal stuff */ + sd_journal* journal; + + entry_state entry_state; + const void *field_data; + size_t field_pos, field_length; + + /* general metrics */ + const char *state_file; + + size_t entries_sent; + char *last_cursor, *current_cursor; + usec_t watchdog_timestamp; + usec_t watchdog_usec; +} Uploader; + +#define JOURNAL_UPLOAD_POLL_TIMEOUT (10 * USEC_PER_SEC) + +int start_upload(Uploader *u, + size_t (*input_callback)(void *ptr, + size_t size, + size_t nmemb, + void *userdata), + void *data); + +int open_journal_for_upload(Uploader *u, + sd_journal *j, + const char *cursor, + bool after_cursor, + bool follow); +void close_journal_input(Uploader *u); +int check_journal_input(Uploader *u); diff --git a/src/journal-remote/log-generator.py b/src/journal-remote/log-generator.py new file mode 100755 index 0000000..2843afb --- /dev/null +++ b/src/journal-remote/log-generator.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-2.1-or-later + +import sys +import argparse + +PARSER = argparse.ArgumentParser() +PARSER.add_argument('n', type=int) +PARSER.add_argument('--dots', action='store_true') +PARSER.add_argument('-m', '--message-size', type=int, default=200) +PARSER.add_argument('-d', '--data-size', type=int, default=4000) +PARSER.add_argument('--data-type', choices={'random', 'simple'}) +OPTIONS = PARSER.parse_args() + +template = """\ +__CURSOR=s=6863c726210b4560b7048889d8ada5c5;i=3e931;b=f446871715504074bf7049ef0718fa93;m={m:x};t=4fd05c +__REALTIME_TIMESTAMP={realtime_ts} +__MONOTONIC_TIMESTAMP={monotonic_ts} +_BOOT_ID=f446871715504074bf7049ef0718fa93 +_TRANSPORT=syslog +PRIORITY={priority} 
+SYSLOG_FACILITY={facility} +SYSLOG_IDENTIFIER=/USR/SBIN/CRON +MESSAGE={message} +_UID=0 +_GID=0 +_MACHINE_ID=69121ca41d12c1b69a7960174c27b618 +_HOSTNAME=hostname +SYSLOG_PID=25721 +_PID=25721 +_SOURCE_REALTIME_TIMESTAMP={source_realtime_ts} +DATA={data} +""" + +priority = 3 +facility = 6 + +src = open('/dev/urandom', 'rb') + +bytes = 0 +counter = 0 + +for i in range(OPTIONS.n): + message = src.read(OPTIONS.message_size) + message = repr(message)[2:-1] + + if OPTIONS.data_type == 'random': + data = repr(src.read(OPTIONS.data_size)) + else: + # keep the pattern non-repeating so we get a different blob every time + data = '{:0{}}'.format(counter, OPTIONS.data_size) + counter += 1 + + entry = template.format(m=0x198603b12d7 + i, + realtime_ts=1404101101501873 + i, + monotonic_ts=1753961140951 + i, + source_realtime_ts=1404101101483516 + i, + priority=priority, + facility=facility, + message=message, + data=data) + + bytes += len(entry) + + print(entry) + + if OPTIONS.dots: + print('.', file=sys.stderr, end='', flush=True) + +if OPTIONS.dots: + print(file=sys.stderr) +print('Wrote {} bytes'.format(bytes), file=sys.stderr) diff --git a/src/journal-remote/meson.build b/src/journal-remote/meson.build new file mode 100644 index 0000000..964a251 --- /dev/null +++ b/src/journal-remote/meson.build @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_journal_upload_sources = files( + 'journal-upload-journal.c', + 'journal-upload.c', +) + +libsystemd_journal_remote_sources = files( + 'journal-remote-parse.c', + 'journal-remote-write.c', + 'journal-remote.c', +) + +if conf.get('HAVE_MICROHTTPD') == 1 + libsystemd_journal_remote_sources += files( + 'microhttpd-util.c', + ) +endif + +libsystemd_journal_remote = static_library( + 'systemd-journal-remote', + libsystemd_journal_remote_sources, + include_directories : includes, + dependencies : [libgnutls, + liblz4, + libmicrohttpd, + libxz, + threads, + userspace], + build_by_default : false) + 
+systemd_journal_remote_sources = files('journal-remote-main.c') + +systemd_journal_gatewayd_sources = files( + 'journal-gatewayd.c', + 'microhttpd-util.c', +) + +common_deps = [ + libgnutls, + liblz4, + libxz, + libzstd, + threads, +] + +executables += [ + libexec_template + { + 'name' : 'systemd-journal-upload', + 'public' : true, + 'conditions' : [ + 'ENABLE_REMOTE', + 'HAVE_LIBCURL', + ], + 'sources' : systemd_journal_upload_sources, + 'dependencies' : common_deps + [libcurl], + }, + libexec_template + { + 'name' : 'systemd-journal-remote', + 'public' : true, + 'conditions' : [ + 'ENABLE_REMOTE', + 'HAVE_MICROHTTPD', + ], + 'sources' : systemd_journal_remote_sources, + 'link_with' : [ + libshared, + libsystemd_journal_remote, + ], + 'dependencies' : common_deps + [libmicrohttpd], + }, + libexec_template + { + 'name' : 'systemd-journal-gatewayd', + 'public' : true, + 'conditions' : [ + 'ENABLE_REMOTE', + 'HAVE_MICROHTTPD', + ], + 'sources' : systemd_journal_gatewayd_sources, + 'dependencies' : common_deps + [libmicrohttpd], + }, + fuzz_template + { + 'sources' : files('fuzz-journal-remote.c'), + 'link_with' : [ + libshared, + libsystemd_journal_remote, + ], + }, +] + +in_files = [ + ['journal-upload.conf', + conf.get('ENABLE_REMOTE') == 1 and conf.get('HAVE_LIBCURL') == 1 and install_sysconfdir_samples], + ['journal-remote.conf', + conf.get('ENABLE_REMOTE') == 1 and conf.get('HAVE_MICROHTTPD') == 1 and install_sysconfdir_samples]] + +foreach tuple : in_files + file = tuple[0] + custom_target( + file, + input : file + '.in', + output: file, + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : tuple[1], + install_dir : pkgconfigfiledir) +endforeach + +if conf.get('ENABLE_REMOTE') == 1 and conf.get('HAVE_MICROHTTPD') == 1 + install_data('browse.html', + install_dir : pkgdatadir / 'gatewayd') + + if get_option('create-log-dirs') + install_emptydir('/var/log/journal/remote', + install_mode : 'rwxr-xr-x') + endif +endif diff --git 
a/src/journal-remote/microhttpd-util.c b/src/journal-remote/microhttpd-util.c new file mode 100644 index 0000000..c1e35b7 --- /dev/null +++ b/src/journal-remote/microhttpd-util.c @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#if HAVE_GNUTLS +#include +#include +#endif + +#include "alloc-util.h" +#include "log.h" +#include "macro.h" +#include "microhttpd-util.h" +#include "string-util.h" +#include "strv.h" + +void microhttpd_logger(void *arg, const char *fmt, va_list ap) { + char *f; + + f = strjoina("microhttpd: ", fmt); + + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_INFO, 0, NULL, 0, NULL, f, ap); + REENABLE_WARNING; +} + +int mhd_respond_internal( + struct MHD_Connection *connection, + enum MHD_RequestTerminationCode code, + const char *buffer, + size_t size, + enum MHD_ResponseMemoryMode mode) { + + assert(connection); + + _cleanup_(MHD_destroy_responsep) struct MHD_Response *response + = MHD_create_response_from_buffer(size, (char*) buffer, mode); + if (!response) + return MHD_NO; + + log_debug("Queueing response %u: %s", code, buffer); + if (MHD_add_response_header(response, "Content-Type", "text/plain") == MHD_NO) + return MHD_NO; + return MHD_queue_response(connection, code, response); +} + +int mhd_respond_oom(struct MHD_Connection *connection) { + return mhd_respond(connection, MHD_HTTP_SERVICE_UNAVAILABLE, "Out of memory."); +} + +int mhd_respondf_internal( + struct MHD_Connection *connection, + int error, + enum MHD_RequestTerminationCode code, + const char *format, ...) 
{ + + char *m; + int r; + va_list ap; + + assert(connection); + assert(format); + + if (error < 0) + error = -error; + errno = -error; + va_start(ap, format); + r = vasprintf(&m, format, ap); + va_end(ap); + + if (r < 0) + return respond_oom(connection); + + return mhd_respond_internal(connection, code, m, r, MHD_RESPMEM_MUST_FREE); +} + +#if HAVE_GNUTLS + +static struct { + const char *const names[4]; + int level; + bool enabled; +} gnutls_log_map[] = { + { {"0"}, LOG_DEBUG }, + { {"1", "audit"}, LOG_WARNING, true}, /* gnutls session audit */ + { {"2", "assert"}, LOG_DEBUG }, /* gnutls assert log */ + { {"3", "hsk", "ext"}, LOG_DEBUG }, /* gnutls handshake log */ + { {"4", "rec"}, LOG_DEBUG }, /* gnutls record log */ + { {"5", "dtls"}, LOG_DEBUG }, /* gnutls DTLS log */ + { {"6", "buf"}, LOG_DEBUG }, + { {"7", "write", "read"}, LOG_DEBUG }, + { {"8"}, LOG_DEBUG }, + { {"9", "enc", "int"}, LOG_DEBUG }, +}; + +static void log_func_gnutls(int level, const char *message) { + assert_se(message); + + if (0 <= level && level < (int) ELEMENTSOF(gnutls_log_map)) { + if (gnutls_log_map[level].enabled) + log_internal(gnutls_log_map[level].level, 0, NULL, 0, NULL, "gnutls %d/%s: %s", level, gnutls_log_map[level].names[1], message); + } else { + log_debug("Received GNUTLS message with unknown level %d.", level); + log_internal(LOG_DEBUG, 0, NULL, 0, NULL, "gnutls: %s", message); + } +} + +static void log_reset_gnutls_level(void) { + int i; + + for (i = ELEMENTSOF(gnutls_log_map) - 1; i >= 0; i--) + if (gnutls_log_map[i].enabled) { + log_debug("Setting gnutls log level to %d", i); + gnutls_global_set_log_level(i); + break; + } +} + +static int log_enable_gnutls_category(const char *cat) { + unsigned i; + + if (streq(cat, "all")) { + for (i = 0; i < ELEMENTSOF(gnutls_log_map); i++) + gnutls_log_map[i].enabled = true; + log_reset_gnutls_level(); + return 0; + } else + for (i = 0; i < ELEMENTSOF(gnutls_log_map); i++) + if (strv_contains((char**)gnutls_log_map[i].names, cat)) { + 
gnutls_log_map[i].enabled = true; + log_reset_gnutls_level(); + return 0; + } + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No such log category: %s", cat); +} + +int setup_gnutls_logger(char **categories) { + int r; + + gnutls_global_set_log_function(log_func_gnutls); + + if (categories) + STRV_FOREACH(cat, categories) { + r = log_enable_gnutls_category(*cat); + if (r < 0) + return r; + } + else + log_reset_gnutls_level(); + + return 0; +} + +static int verify_cert_authorized(gnutls_session_t session) { + unsigned status; + gnutls_certificate_type_t type; + gnutls_datum_t out; + int r; + + r = gnutls_certificate_verify_peers2(session, &status); + if (r < 0) + return log_error_errno(r, "gnutls_certificate_verify_peers2 failed: %m"); + + type = gnutls_certificate_type_get(session); + r = gnutls_certificate_verification_status_print(status, type, &out, 0); + if (r < 0) + return log_error_errno(r, "gnutls_certificate_verification_status_print failed: %m"); + + log_debug("Certificate status: %s", out.data); + gnutls_free(out.data); + + return status == 0 ? 
0 : -EPERM; +} + /* Import the first certificate of the peer's chain (DER) into a newly initialized certificate object stored in *client_cert. Returns 0 on success, -EINVAL when no chain is available, or the negative value reported by gnutls when init or import fails. */ +static int get_client_cert(gnutls_session_t session, gnutls_x509_crt_t *client_cert) { + const gnutls_datum_t *pcert; + unsigned listsize; + gnutls_x509_crt_t cert; + int r; + + assert(session); + assert(client_cert); + + pcert = gnutls_certificate_get_peers(session, &listsize); + if (!pcert || !listsize) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to retrieve certificate chain"); + + r = gnutls_x509_crt_init(&cert); + if (r < 0) { + log_error("Failed to initialize client certificate"); + return r; + } + + /* Note that by passing values between 0 and listsize here, you + can get access to the CA's certs */ + r = gnutls_x509_crt_import(cert, &pcert[0], GNUTLS_X509_FMT_DER); + if (r < 0) { + log_error("Failed to import client certificate"); + gnutls_x509_crt_deinit(cert); + return r; + } + + *client_cert = cert; + return 0; +} + /* Store the certificate's distinguished name in a newly allocated string in *buf, which must be NULL on entry. On failure *buf is left NULL and a negative value is returned. */ +static int get_auth_dn(gnutls_x509_crt_t client_cert, char **buf) { + size_t len = 0; + int r; + + assert(buf); + assert(*buf == NULL); + + /* first call with a NULL buffer only queries the required length */ + r = gnutls_x509_crt_get_dn(client_cert, NULL, &len); + if (r != GNUTLS_E_SHORT_MEMORY_BUFFER) { + log_error("gnutls_x509_crt_get_dn failed"); + return r; + } + + *buf = malloc(len); + if (!*buf) + return log_oom(); + + /* the call that fills the buffer can fail as well; do not hand out uninitialized memory as the peer's name */ + r = gnutls_x509_crt_get_dn(client_cert, *buf, &len); + if (r < 0) { + log_error("gnutls_x509_crt_get_dn failed"); + *buf = mfree(*buf); + return r; + } + + return 0; +} + +static void gnutls_x509_crt_deinitp(gnutls_x509_crt_t *p) { + gnutls_x509_crt_deinit(*p); +} + +int check_permissions(struct MHD_Connection *connection, int *code, char **hostname) { + const union MHD_ConnectionInfo *ci; + gnutls_session_t session; + _cleanup_(gnutls_x509_crt_deinitp) gnutls_x509_crt_t client_cert = NULL; + _cleanup_free_ char *buf = NULL; + int r; + + assert(connection); + assert(code); + + *code = 0; + + ci = MHD_get_connection_info(connection, + MHD_CONNECTION_INFO_GNUTLS_SESSION); + if (!ci) { + log_error("MHD_get_connection_info failed: session is unencrypted"); + *code = mhd_respond(connection, MHD_HTTP_FORBIDDEN, + "Encrypted connection is required"); + return 
-EPERM; + } + session = ci->tls_session; + assert(session); + + r = get_client_cert(session, &client_cert); + if (r < 0) { + *code = mhd_respond(connection, MHD_HTTP_UNAUTHORIZED, + "Authorization through certificate is required"); + return -EPERM; + } + + r = get_auth_dn(client_cert, &buf); + if (r < 0) { + *code = mhd_respond(connection, MHD_HTTP_UNAUTHORIZED, + "Failed to determine distinguished name from certificate"); + return -EPERM; + } + + log_debug("Connection from %s", buf); + + if (hostname) + *hostname = TAKE_PTR(buf); + + r = verify_cert_authorized(session); + if (r < 0) { + log_warning("Client is not authorized"); + *code = mhd_respond(connection, MHD_HTTP_UNAUTHORIZED, + "Client certificate not signed by recognized authority"); + } + return r; +} + +#else +_noreturn_ int check_permissions(struct MHD_Connection *connection, int *code, char **hostname) { + assert_not_reached(); +} + +int setup_gnutls_logger(char **categories) { + if (categories) + log_notice("Ignoring specified gnutls logging categories — gnutls not available."); + return 0; +} +#endif diff --git a/src/journal-remote/microhttpd-util.h b/src/journal-remote/microhttpd-util.h new file mode 100644 index 0000000..309c39a --- /dev/null +++ b/src/journal-remote/microhttpd-util.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" + +/* Those defines are added when options are renamed. If the old names + * are not '#define'd, then they are not deprecated yet and there are + * enum elements with the same name. Hence let's check for the *old* name, + * and define the new name by the value of the old name. 
*/ + +/* Renamed in μhttpd 0.9.51 */ +#ifndef MHD_USE_PIPE_FOR_SHUTDOWN +# define MHD_USE_ITC MHD_USE_PIPE_FOR_SHUTDOWN +#endif + +/* Renamed in μhttpd 0.9.52 */ +#ifndef MHD_USE_EPOLL_LINUX_ONLY +# define MHD_USE_EPOLL MHD_USE_EPOLL_LINUX_ONLY +#endif + +/* Renamed in μhttpd 0.9.52 */ +#ifndef MHD_USE_SSL +# define MHD_USE_TLS MHD_USE_SSL +#endif + +/* Renamed in μhttpd 0.9.53 */ +#ifndef MHD_USE_POLL_INTERNALLY +# define MHD_USE_POLL_INTERNAL_THREAD MHD_USE_POLL_INTERNALLY +#endif + +/* Both the old and new names are defines, check for the new one. */ + +/* Compatibility with libmicrohttpd < 0.9.38 */ +#ifndef MHD_HTTP_NOT_ACCEPTABLE +# define MHD_HTTP_NOT_ACCEPTABLE MHD_HTTP_METHOD_NOT_ACCEPTABLE +#endif + +/* Renamed in μhttpd 0.9.74 (8c644fc1f4d498ea489add8d40a68f5d3e5899fa) */ +#ifndef MHD_HTTP_CONTENT_TOO_LARGE +# ifdef MHD_HTTP_PAYLOAD_TOO_LARGE +# define MHD_HTTP_CONTENT_TOO_LARGE MHD_HTTP_PAYLOAD_TOO_LARGE /* 0.9.53 or newer */ +# else +# define MHD_HTTP_CONTENT_TOO_LARGE MHD_HTTP_REQUEST_ENTITY_TOO_LARGE +# endif +#endif + +#if MHD_VERSION < 0x00094203 +# define MHD_create_response_from_fd_at_offset64 MHD_create_response_from_fd_at_offset +#endif + +#if MHD_VERSION >= 0x00097002 +# define mhd_result enum MHD_Result +#else +# define mhd_result int +#endif + +void microhttpd_logger(void *arg, const char *fmt, va_list ap) _printf_(2, 0); + +/* respond_oom() must be usable with return, hence this form. 
*/ +#define respond_oom(connection) log_oom(), mhd_respond_oom(connection) + +int mhd_respond_internal( + struct MHD_Connection *connection, + enum MHD_RequestTerminationCode code, + const char *buffer, + size_t size, + enum MHD_ResponseMemoryMode mode); + +#define mhd_respond(connection, code, message) \ + mhd_respond_internal( \ + connection, code, \ + message "\n", \ + strlen(message) + 1, \ + MHD_RESPMEM_PERSISTENT) + +int mhd_respond_oom(struct MHD_Connection *connection); + +int mhd_respondf_internal( + struct MHD_Connection *connection, + int error, + enum MHD_RequestTerminationCode code, + const char *format, ...) _printf_(4,5); + +#define mhd_respondf(connection, error, code, format, ...) \ + mhd_respondf_internal( \ + connection, error, code, \ + format "\n", \ + ##__VA_ARGS__) + +int check_permissions(struct MHD_Connection *connection, int *code, char **hostname); + +/* Set gnutls internal logging function to a callback which uses our + * own logging framework. + * + * gnutls categories are additionally filtered by our internal log + * level, so it should be set fairly high to capture all potentially + * interesting events without overwhelming detail. 
+ */ +int setup_gnutls_logger(char **categories); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct MHD_Daemon*, MHD_stop_daemon, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct MHD_Response*, MHD_destroy_response, NULL); diff --git a/src/journal/bsod.c b/src/journal/bsod.c new file mode 100644 index 0000000..a88cb66 --- /dev/null +++ b/src/journal/bsod.c @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-id128.h" +#include "sd-journal.h" + +#include "alloc-util.h" +#include "build.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "log.h" +#include "logs-show.h" +#include "main-func.h" +#include "pretty-print.h" +#include "qrcode-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include "sysctl-util.h" +#include "terminal-util.h" + +static bool arg_continuous = false; + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-bsod", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s\n\n" + "%sFilter the journal to fetch the first message from the\n" + "current boot with an emergency log level and displays it\n" + "as a string and a QR code.\n\n%s" + " -h --help Show this help\n" + " --version Show package version\n" + " -c --continuous Make systemd-bsod wait continuously\n" + " for changes in the journal\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int acquire_first_emergency_log_message(char **ret) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_free_ char *message = NULL; + const void *d; + size_t l; + int r; + + assert(ret); + + r = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); + if (r < 0) + return log_error_errno(r, "Failed to open journal: %m"); + + r = add_match_this_boot(j, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to add boot ID filter: %m"); + + r = 
sd_journal_add_match(j, "_UID=0", 0); + if (r < 0) + return log_warning_errno(r, "Failed to add User ID filter: %m"); + + assert_cc(0 == LOG_EMERG); + r = sd_journal_add_match(j, "PRIORITY=0", 0); + if (r < 0) + return log_warning_errno(r, "Failed to add Emergency filter: %m"); + + r = sd_journal_seek_head(j); + if (r < 0) + return log_error_errno(r, "Failed to seek to start of journal: %m"); + + for (;;) { + r = sd_journal_next(j); + if (r < 0) + return log_error_errno(r, "Failed to read next journal entry: %m"); + if (r > 0) + break; + + if (!arg_continuous) { + log_debug("No emergency level entries in the journal"); + *ret = NULL; + return 0; + } + + r = sd_journal_wait(j, UINT64_MAX); + if (r < 0) + return log_error_errno(r, "Failed to wait for changes: %m"); + } + + r = sd_journal_get_data(j, "MESSAGE", &d, &l); + if (r < 0) + return log_error_errno(r, "Failed to read journal message: %m"); + + message = memdup_suffix0((const char*)d + STRLEN("MESSAGE="), l - STRLEN("MESSAGE=")); + if (!message) + return log_oom(); + + *ret = TAKE_PTR(message); + + return 0; +} + +static int find_next_free_vt(int fd, int *ret_free_vt, int *ret_original_vt) { + struct vt_stat terminal_status; + + assert(fd >= 0); + assert(ret_free_vt); + assert(ret_original_vt); + + if (ioctl(fd, VT_GETSTATE, &terminal_status) < 0) + return -errno; + + for (size_t i = 0; i < sizeof(terminal_status.v_state) * 8; i++) + if ((terminal_status.v_state & (1 << i)) == 0) { + *ret_free_vt = i; + *ret_original_vt = terminal_status.v_active; + return 0; + } + + return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), "No free VT found: %m"); +} + +static int display_emergency_message_fullscreen(const char *message) { + int r, ret = 0, free_vt = 0, original_vt = 0; + unsigned qr_code_start_row = 1, qr_code_start_column = 1; + char tty[STRLEN("/dev/tty") + DECIMAL_STR_MAX(int) + 1]; + _cleanup_close_ int fd = -EBADF; + _cleanup_fclose_ FILE *stream = NULL; + char read_character_buffer = '\0'; + struct winsize w = { 
+ .ws_col = 80, + .ws_row = 25, + }; + + assert(message); + + fd = open_terminal("/dev/tty1", O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(fd, "Failed to open tty1: %m"); + + r = find_next_free_vt(fd, &free_vt, &original_vt); + if (r < 0) + return log_error_errno(r, "Failed to find a free VT: %m"); + + xsprintf(tty, "/dev/tty%d", free_vt + 1); + + r = open_terminal(tty, O_RDWR|O_NOCTTY|O_CLOEXEC); + if (r < 0) + return log_error_errno(fd, "Failed to open tty: %m"); + + close_and_replace(fd, r); + + if (ioctl(fd, TIOCGWINSZ, &w) < 0) + log_warning_errno(errno, "Failed to fetch tty size, ignoring: %m"); + + if (ioctl(fd, VT_ACTIVATE, free_vt + 1) < 0) + return log_error_errno(errno, "Failed to activate tty: %m"); + + r = loop_write(fd, ANSI_BACKGROUND_BLUE ANSI_HOME_CLEAR, SIZE_MAX); + if (r < 0) + log_warning_errno(r, "Failed to clear terminal, ignoring: %m"); + + r = set_terminal_cursor_position(fd, 2, 4); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + + r = loop_write(fd, "The current boot has failed!", SIZE_MAX); + if (r < 0) { + ret = log_warning_errno(r, "Failed to write to terminal: %m"); + goto cleanup; + } + + qr_code_start_row = w.ws_row * 3U / 5U; + qr_code_start_column = w.ws_col * 3U / 4U; + r = set_terminal_cursor_position(fd, 4, 4); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + + r = loop_write(fd, message, SIZE_MAX); + if (r < 0) { + ret = log_warning_errno(r, "Failed to write emergency message to terminal: %m"); + goto cleanup; + } + + r = fdopen_independent(fd, "r+", &stream); + if (r < 0) { + ret = log_error_errno(errno, "Failed to open output file: %m"); + goto cleanup; + } + + r = print_qrcode_full(stream, "Scan the QR code", message, qr_code_start_row, qr_code_start_column, w.ws_col, w.ws_row); + if (r < 0) + log_warning_errno(r, "QR code could not be printed, ignoring: %m"); + + r = set_terminal_cursor_position(fd, w.ws_row - 
1, w.ws_col * 2U / 5U); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + + r = loop_write(fd, "Press any key to exit...", SIZE_MAX); + if (r < 0) { + ret = log_warning_errno(r, "Failed to write to terminal: %m"); + goto cleanup; + } + + r = read_one_char(stream, &read_character_buffer, USEC_INFINITY, NULL); + if (r < 0 && r != -EINTR) + ret = log_error_errno(r, "Failed to read character: %m"); + +cleanup: + if (ioctl(fd, VT_ACTIVATE, original_vt) < 0) + return log_error_errno(errno, "Failed to switch back to original VT: %m"); + + return ret; +} + +static int parse_argv(int argc, char * argv[]) { + + enum { + ARG_VERSION = 0x100, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "continuous", no_argument, NULL, 'c' }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hc", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'c': + arg_continuous = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s takes no argument.", + program_invocation_short_name); + return 1; +} + +static int run(int argc, char *argv[]) { + /* Don't use SA_RESTART here, as we don't want to restart syscalls on signal + * to get out of read_one_char() when needed */ + static const struct sigaction nop_sigaction = { + .sa_handler = nop_signal_handler, + .sa_flags = 0, + }; + _cleanup_free_ char *message = NULL; + int r; + + log_open(); + log_parse_environment(); + + sigbus_install(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = acquire_first_emergency_log_message(&message); + if (r < 0) + return log_error_errno(r, "Failed to acquire first emergency log message: %m"); + + if (!message) { + log_debug("No 
emergency-level entries"); + return 0; + } + + assert_se(sigaction_many(&nop_sigaction, SIGTERM, SIGINT) >= 0); + + r = display_emergency_message_fullscreen((const char*) message); + if (r < 0) + return log_error_errno(r, "Failed to display emergency message on terminal: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal/cat.c b/src/journal/cat.c new file mode 100644 index 0000000..609ddba --- /dev/null +++ b/src/journal/cat.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "build.h" +#include "fd-util.h" +#include "format-util.h" +#include "main-func.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "string-util.h" +#include "syslog-util.h" +#include "terminal-util.h" + +static const char *arg_identifier = NULL; +static int arg_priority = LOG_INFO; +static int arg_stderr_priority = -1; +static bool arg_level_prefix = true; + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-cat", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
COMMAND ...\n" + "\n%sExecute process with stdout/stderr connected to the journal.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -t --identifier=STRING Set syslog identifier\n" + " -p --priority=PRIORITY Set priority value (0..7)\n" + " --stderr-priority=PRIORITY Set priority value (0..7) used for stderr\n" + " --level-prefix=BOOL Control whether level prefix shall be parsed\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_STDERR_PRIORITY, + ARG_LEVEL_PREFIX + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "identifier", required_argument, NULL, 't' }, + { "priority", required_argument, NULL, 'p' }, + { "stderr-priority", required_argument, NULL, ARG_STDERR_PRIORITY }, + { "level-prefix", required_argument, NULL, ARG_LEVEL_PREFIX }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). 
*/ + optind = 0; + while ((c = getopt_long(argc, argv, "+ht:p:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + help(); + return 0; + + case ARG_VERSION: + return version(); + + case 't': + if (isempty(optarg)) + arg_identifier = NULL; + else + arg_identifier = optarg; + break; + + case 'p': + arg_priority = log_level_from_string(optarg); + if (arg_priority < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse priority value."); + break; + + case ARG_STDERR_PRIORITY: + arg_stderr_priority = log_level_from_string(optarg); + if (arg_stderr_priority < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse stderr priority value."); + break; + + case ARG_LEVEL_PREFIX: + r = parse_boolean_argument("--level-prefix=", optarg, &arg_level_prefix); + if (r < 0) + return r; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_close_ int outfd = -EBADF, errfd = -EBADF, saved_stderr = -EBADF; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + outfd = sd_journal_stream_fd(arg_identifier, arg_priority, arg_level_prefix); + if (outfd < 0) + return log_error_errno(outfd, "Failed to create stream fd: %m"); + + if (arg_stderr_priority >= 0 && arg_stderr_priority != arg_priority) { + errfd = sd_journal_stream_fd(arg_identifier, arg_stderr_priority, arg_level_prefix); + if (errfd < 0) + return log_error_errno(errfd, "Failed to create stream fd: %m"); + } + + saved_stderr = fcntl(STDERR_FILENO, F_DUPFD_CLOEXEC, 3); + + r = rearrange_stdio(STDIN_FILENO, outfd, errfd < 0 ? outfd : errfd); /* Invalidates fd on success + error! 
*/ + TAKE_FD(outfd); + TAKE_FD(errfd); + if (r < 0) + return log_error_errno(r, "Failed to rearrange stdout/stderr: %m"); + + if (argc <= optind) + (void) execl("/bin/cat", "/bin/cat", NULL); + else { + _cleanup_free_ char *s = NULL; + struct stat st; + + if (fstat(STDERR_FILENO, &st) < 0) + return log_error_errno(errno, + "Failed to fstat(%s): %m", + FORMAT_PROC_FD_PATH(STDERR_FILENO)); + + if (asprintf(&s, DEV_FMT ":" INO_FMT, (dev_t)st.st_dev, st.st_ino) < 0) + return log_oom(); + + if (setenv("JOURNAL_STREAM", s, /* overwrite = */ true) < 0) + return log_error_errno(errno, "Failed to set environment variable JOURNAL_STREAM: %m"); + + (void) execvp(argv[optind], argv + optind); + } + r = -errno; + + /* Let's try to restore a working stderr, so we can print the error message */ + if (saved_stderr >= 0) + (void) dup3(saved_stderr, STDERR_FILENO, 0); + + return log_error_errno(r, "Failed to execute process: %m"); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal/fuzz-journald-audit.c b/src/journal/fuzz-journald-audit.c new file mode 100644 index 0000000..9bf7d01 --- /dev/null +++ b/src/journal/fuzz-journald-audit.c @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fuzz.h" +#include "fuzz-journald.h" +#include "journald-audit.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + Server s; + + fuzz_setup_logging(); + + dummy_server_init(&s, data, size); + process_audit_string(&s, 0, s.buffer, size); + server_done(&s); + + return 0; +} diff --git a/src/journal/fuzz-journald-kmsg.c b/src/journal/fuzz-journald-kmsg.c new file mode 100644 index 0000000..104a9b3 --- /dev/null +++ b/src/journal/fuzz-journald-kmsg.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fuzz.h" +#include "fuzz-journald.h" +#include "journald-kmsg.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + Server s; + + if (size == 0) + return 0; + + fuzz_setup_logging(); + + dummy_server_init(&s, 
data, size); + dev_kmsg_record(&s, s.buffer, size); + server_done(&s); + + return 0; +} diff --git a/src/journal/fuzz-journald-native-fd.c b/src/journal/fuzz-journald-native-fd.c new file mode 100644 index 0000000..110eb7f --- /dev/null +++ b/src/journal/fuzz-journald-native-fd.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fs-util.h" +#include "fuzz-journald.h" +#include "fuzz.h" +#include "journald-native.h" +#include "memfd-util.h" +#include "process-util.h" +#include "tmpfile-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + Server s; + _cleanup_close_ int sealed_fd = -EBADF, unsealed_fd = -EBADF; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/fuzz-journald-native-fd.XXXXXX"; + char *label = NULL; + size_t label_len = 0; + struct ucred ucred; + struct timeval *tv = NULL; + + fuzz_setup_logging(); + + dummy_server_init(&s, NULL, 0); + + sealed_fd = memfd_new_and_seal(NULL, data, size); + assert_se(sealed_fd >= 0); + ucred = (struct ucred) { + .pid = getpid_cached(), + .uid = geteuid(), + .gid = getegid(), + }; + server_process_native_file(&s, sealed_fd, &ucred, tv, label, label_len); + + unsealed_fd = mkostemp_safe(name); + assert_se(unsealed_fd >= 0); + assert_se(write(unsealed_fd, data, size) == (ssize_t) size); + assert_se(lseek(unsealed_fd, 0, SEEK_SET) == 0); + server_process_native_file(&s, unsealed_fd, &ucred, tv, label, label_len); + + server_done(&s); + + return 0; +} diff --git a/src/journal/fuzz-journald-native.c b/src/journal/fuzz-journald-native.c new file mode 100644 index 0000000..6738d26 --- /dev/null +++ b/src/journal/fuzz-journald-native.c @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fuzz.h" +#include "fuzz-journald.h" +#include "journald-native.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + fuzz_setup_logging(); + + fuzz_journald_processing_function(data, size, server_process_native_message); + return 
0; +} diff --git a/src/journal/fuzz-journald-stream.c b/src/journal/fuzz-journald-stream.c new file mode 100644 index 0000000..6b2055f --- /dev/null +++ b/src/journal/fuzz-journald-stream.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "fd-util.h" +#include "fuzz.h" +#include "fuzz-journald.h" +#include "journald-stream.h" + +static int stream_fds[2] = EBADF_PAIR; + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + Server s; + StdoutStream *stream; + int v; + + if (outside_size_range(size, 1, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0, stream_fds) >= 0); + dummy_server_init(&s, NULL, 0); + assert_se(stdout_stream_install(&s, stream_fds[0], &stream) >= 0); + assert_se(write(stream_fds[1], data, size) == (ssize_t) size); + while (ioctl(stream_fds[0], SIOCINQ, &v) == 0 && v) + sd_event_run(s.event, UINT64_MAX); + if (s.n_stdout_streams) + stdout_stream_destroy(stream); + server_done(&s); + stream_fds[1] = safe_close(stream_fds[1]); + + return 0; +} diff --git a/src/journal/fuzz-journald-stream.options b/src/journal/fuzz-journald-stream.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/journal/fuzz-journald-stream.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/journal/fuzz-journald-syslog.c b/src/journal/fuzz-journald-syslog.c new file mode 100644 index 0000000..b3e2d3c --- /dev/null +++ b/src/journal/fuzz-journald-syslog.c @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fuzz.h" +#include "fuzz-journald.h" +#include "journald-syslog.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + fuzz_setup_logging(); + + fuzz_journald_processing_function(data, size, server_process_syslog_message); + return 0; +} diff --git a/src/journal/fuzz-journald.c b/src/journal/fuzz-journald.c new file mode 100644 index 0000000..c96fad5 
--- /dev/null +++ b/src/journal/fuzz-journald.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fuzz-journald.h" +#include "journald-server.h" +#include "sd-event.h" + +void dummy_server_init(Server *s, const uint8_t *buffer, size_t size) { + *s = (Server) { + .syslog_fd = -EBADF, + .native_fd = -EBADF, + .stdout_fd = -EBADF, + .dev_kmsg_fd = -EBADF, + .audit_fd = -EBADF, + .hostname_fd = -EBADF, + .notify_fd = -EBADF, + .storage = STORAGE_NONE, + .line_max = 64, + }; + assert_se(sd_event_default(&s->event) >= 0); + + if (buffer) { + s->buffer = memdup_suffix0(buffer, size); + assert_se(s->buffer); + } +} + +void fuzz_journald_processing_function( + const uint8_t *data, + size_t size, + void (*f)(Server *s, const char *buf, size_t raw_len, const struct ucred *ucred, const struct timeval *tv, const char *label, size_t label_len) + ) { + Server s; + char *label = NULL; + size_t label_len = 0; + struct ucred *ucred = NULL; + struct timeval *tv = NULL; + + if (size == 0) + return; + + dummy_server_init(&s, data, size); + (*f)(&s, s.buffer, size, ucred, tv, label, label_len); + server_done(&s); +} diff --git a/src/journal/fuzz-journald.h b/src/journal/fuzz-journald.h new file mode 100644 index 0000000..4abb100 --- /dev/null +++ b/src/journal/fuzz-journald.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +void dummy_server_init(Server *s, const uint8_t *buffer, size_t size); + +void fuzz_journald_processing_function( + const uint8_t *data, + size_t size, + void (*f)(Server *s, const char *buf, size_t raw_len, const struct ucred *ucred, const struct timeval *tv, const char *label, size_t label_len) +); diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c new file mode 100644 index 0000000..7f3dcd5 --- /dev/null +++ b/src/journal/journalctl.c @@ -0,0 +1,2631 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-journal.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "catalog.h" +#include "chase.h" +#include "chattr-util.h" +#include "constants.h" +#include "devnum-util.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "format-util.h" +#include "fs-util.h" +#include "fsprg.h" +#include "glob-util.h" +#include "hostname-util.h" +#include "id128-print.h" +#include "io-util.h" +#include "journal-def.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "journal-vacuum.h" +#include "journal-verify.h" +#include "locale-util.h" +#include "log.h" +#include "logs-show.h" +#include "main-func.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "missing_sched.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pcre2-util.h" +#include "pretty-print.h" +#include "qrcode-util.h" +#include "random-util.h" +#include "rlimit-util.h" +#include "set.h" +#include "sigbus.h" +#include "signal-util.h" +#include "static-destruct.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "varlink.h" + +#define DEFAULT_FSS_INTERVAL_USEC (15*USEC_PER_MINUTE) +#define PROCESS_INOTIFY_INTERVAL 1024 /* Every 1,024 messages processed */ + +enum { + /* Special values for arg_lines */ + ARG_LINES_DEFAULT = -2, + ARG_LINES_ALL = -1, +}; + +static OutputMode arg_output = OUTPUT_SHORT; +static JsonFormatFlags arg_json_format_flags = 
JSON_FORMAT_OFF; +static bool arg_utc = false; +static bool arg_follow = false; +static bool arg_full = true; +static bool arg_all = false; +static PagerFlags arg_pager_flags = 0; +static int arg_lines = ARG_LINES_DEFAULT; +static bool arg_lines_oldest = false; +static bool arg_no_tail = false; +static bool arg_truncate_newline = false; +static bool arg_quiet = false; +static bool arg_merge = false; +static bool arg_boot = false; +static sd_id128_t arg_boot_id = {}; +static int arg_boot_offset = 0; +static bool arg_dmesg = false; +static bool arg_no_hostname = false; +static const char *arg_cursor = NULL; +static const char *arg_cursor_file = NULL; +static const char *arg_after_cursor = NULL; +static bool arg_show_cursor = false; +static const char *arg_directory = NULL; +static char **arg_file = NULL; +static bool arg_file_stdin = false; +static int arg_priorities = 0xFF; +static Set *arg_facilities = NULL; +static char *arg_verify_key = NULL; +#if HAVE_GCRYPT +static usec_t arg_interval = DEFAULT_FSS_INTERVAL_USEC; +static bool arg_force = false; +#endif +static usec_t arg_since = 0, arg_until = 0; +static bool arg_since_set = false, arg_until_set = false; +static char **arg_syslog_identifier = NULL; +static char **arg_system_units = NULL; +static char **arg_user_units = NULL; +static const char *arg_field = NULL; +static bool arg_catalog = false; +static bool arg_reverse = false; +static int arg_journal_type = 0; +static int arg_namespace_flags = 0; +static char *arg_root = NULL; +static char *arg_image = NULL; +static const char *arg_machine = NULL; +static const char *arg_namespace = NULL; +static uint64_t arg_vacuum_size = 0; +static uint64_t arg_vacuum_n_files = 0; +static usec_t arg_vacuum_time = 0; +static Set *arg_output_fields = NULL; +static const char *arg_pattern = NULL; +static pcre2_code *arg_compiled_pattern = NULL; +static PatternCompileCase arg_case = PATTERN_COMPILE_CASE_AUTO; +ImagePolicy *arg_image_policy = NULL; + 
+STATIC_DESTRUCTOR_REGISTER(arg_file, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_facilities, set_freep); +STATIC_DESTRUCTOR_REGISTER(arg_verify_key, freep); +STATIC_DESTRUCTOR_REGISTER(arg_syslog_identifier, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_system_units, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_user_units, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_output_fields, set_freep); +STATIC_DESTRUCTOR_REGISTER(arg_compiled_pattern, pattern_freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +static enum { + ACTION_SHOW, + ACTION_NEW_ID128, + ACTION_PRINT_HEADER, + ACTION_SETUP_KEYS, + ACTION_VERIFY, + ACTION_DISK_USAGE, + ACTION_LIST_CATALOG, + ACTION_DUMP_CATALOG, + ACTION_UPDATE_CATALOG, + ACTION_LIST_BOOTS, + ACTION_FLUSH, + ACTION_RELINQUISH_VAR, + ACTION_SYNC, + ACTION_ROTATE, + ACTION_VACUUM, + ACTION_ROTATE_AND_VACUUM, + ACTION_LIST_FIELDS, + ACTION_LIST_FIELD_NAMES, +} arg_action = ACTION_SHOW; + +static int add_matches_for_device(sd_journal *j, const char *devpath) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + sd_device *d = NULL; + struct stat st; + int r; + + assert(j); + assert(devpath); + + if (!path_startswith(devpath, "/dev/")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Devpath does not start with /dev/"); + + if (stat(devpath, &st) < 0) + return log_error_errno(errno, "Couldn't stat file: %m"); + + r = sd_device_new_from_stat_rdev(&device, &st); + if (r < 0) + return log_error_errno(r, "Failed to get device from devnum " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(st.st_rdev)); + + for (d = device; d; ) { + _cleanup_free_ char *match = NULL; + const char *subsys, *sysname, *devnode; + sd_device *parent; + + r = sd_device_get_subsystem(d, &subsys); + if (r < 0) + goto get_parent; + + r = sd_device_get_sysname(d, &sysname); + if (r < 0) + goto get_parent; + + match = strjoin("_KERNEL_DEVICE=+", 
subsys, ":", sysname); + if (!match) + return log_oom(); + + r = sd_journal_add_match(j, match, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + if (sd_device_get_devname(d, &devnode) >= 0) { + _cleanup_free_ char *match1 = NULL; + + r = stat(devnode, &st); + if (r < 0) + return log_error_errno(r, "Failed to stat() device node \"%s\": %m", devnode); + + r = asprintf(&match1, "_KERNEL_DEVICE=%c" DEVNUM_FORMAT_STR, S_ISBLK(st.st_mode) ? 'b' : 'c', DEVNUM_FORMAT_VAL(st.st_rdev)); + if (r < 0) + return log_oom(); + + r = sd_journal_add_match(j, match1, 0); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + } + +get_parent: + if (sd_device_get_parent(d, &parent) < 0) + break; + + d = parent; + } + + r = add_match_this_boot(j, arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to add match for the current boot: %m"); + + return 0; +} + +static char *format_timestamp_maybe_utc(char *buf, size_t l, usec_t t) { + + if (arg_utc) + return format_timestamp_style(buf, l, t, TIMESTAMP_UTC); + + return format_timestamp(buf, l, t); +} + +static int parse_boot_descriptor(const char *x, sd_id128_t *boot_id, int *offset) { + sd_id128_t id = SD_ID128_NULL; + int off = 0, r; + + if (streq(x, "all")) { + *boot_id = SD_ID128_NULL; + *offset = 0; + return 0; + } else if (strlen(x) >= SD_ID128_STRING_MAX - 1) { + char *t; + + t = strndupa_safe(x, SD_ID128_STRING_MAX - 1); + r = sd_id128_from_string(t, &id); + if (r >= 0) + x += SD_ID128_STRING_MAX - 1; + + if (!IN_SET(*x, 0, '-', '+')) + return -EINVAL; + + if (*x != 0) { + r = safe_atoi(x, &off); + if (r < 0) + return r; + } + } else { + r = safe_atoi(x, &off); + if (r < 0) + return r; + } + + if (boot_id) + *boot_id = id; + + if (offset) + *offset = off; + + return 1; +} + +static int parse_lines(const char *arg, bool graceful) { + const char *l; + int n, r; + + assert(arg || graceful); + + if (!arg) + goto default_noarg; + + if (streq(arg, "all")) { + arg_lines = ARG_LINES_ALL; 
+ return 1; + } + + l = startswith(arg, "+"); + + r = safe_atoi(l ?: arg, &n); + if (r < 0 || n < 0) { + if (graceful) + goto default_noarg; + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse --lines='%s'.", arg); + } + + arg_lines = n; + arg_lines_oldest = l; + + return 1; + +default_noarg: + arg_lines = 10; + arg_lines_oldest = false; + return 0; +} + +static bool arg_lines_needs_seek_end(void) { + return arg_lines >= 0 && !arg_lines_oldest; +} + +static int help_facilities(void) { + if (!arg_quiet) + puts("Available facilities:"); + + for (int i = 0; i < LOG_NFACILITIES; i++) { + _cleanup_free_ char *t = NULL; + + if (log_facility_unshifted_to_string_alloc(i, &t)) + return log_oom(); + puts(t); + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("journalctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] [MATCHES...]\n\n" + "%5$sQuery the journal.%6$s\n\n" + "%3$sSource Options:%4$s\n" + " --system Show the system journal\n" + " --user Show the user journal for the current user\n" + " -M --machine=CONTAINER Operate on local container\n" + " -m --merge Show entries from all available journals\n" + " -D --directory=PATH Show journal files from directory\n" + " --file=PATH Show journal file\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + " --namespace=NAMESPACE Show journal data from specified journal namespace\n" + "\n%3$sFiltering Options:%4$s\n" + " -S --since=DATE Show entries not older than the specified date\n" + " -U --until=DATE Show entries not newer than the specified date\n" + " -c --cursor=CURSOR Show entries starting at the specified cursor\n" + " --after-cursor=CURSOR Show entries after the specified cursor\n" + " --cursor-file=FILE Show entries after cursor in 
FILE and update FILE\n" + " -b --boot[=ID] Show current boot or the specified boot\n" + " -u --unit=UNIT Show logs from the specified unit\n" + " --user-unit=UNIT Show logs from the specified user unit\n" + " -t --identifier=STRING Show entries with the specified syslog identifier\n" + " -p --priority=RANGE Show entries with the specified priority\n" + " --facility=FACILITY... Show entries with the specified facilities\n" + " -g --grep=PATTERN Show entries with MESSAGE matching PATTERN\n" + " --case-sensitive[=BOOL] Force case sensitive or insensitive matching\n" + " -k --dmesg Show kernel message log from the current boot\n" + "\n%3$sOutput Control Options:%4$s\n" + " -o --output=STRING Change journal output mode (short, short-precise,\n" + " short-iso, short-iso-precise, short-full,\n" + " short-monotonic, short-unix, verbose, export,\n" + " json, json-pretty, json-sse, json-seq, cat,\n" + " with-unit)\n" + " --output-fields=LIST Select fields to print in verbose/export/json modes\n" + " -n --lines[=[+]INTEGER] Number of journal entries to show\n" + " -r --reverse Show the newest entries first\n" + " --show-cursor Print the cursor after all the entries\n" + " --utc Express time in Coordinated Universal Time (UTC)\n" + " -x --catalog Add message explanations where available\n" + " --no-hostname Suppress output of hostname field\n" + " --no-full Ellipsize fields\n" + " -a --all Show all fields, including long and unprintable\n" + " -f --follow Follow the journal\n" + " --no-tail Show all lines, even in follow mode\n" + " --truncate-newline Truncate entries by first newline character\n" + " -q --quiet Do not show info messages and privilege warning\n" + "\n%3$sPager Control Options:%4$s\n" + " --no-pager Do not pipe output into a pager\n" + " -e --pager-end Immediately jump to the end in the pager\n" + "\n%3$sForward Secure Sealing (FSS) Options:%4$s\n" + " --interval=TIME Time interval for changing the FSS sealing key\n" + " --verify-key=KEY Specify FSS 
verification key\n" + " --force Override of the FSS key pair with --setup-keys\n" + "\n%3$sCommands:%4$s\n" + " -h --help Show this help text\n" + " --version Show package version\n" + " -N --fields List all field names currently used\n" + " -F --field=FIELD List all values that a specified field takes\n" + " --list-boots Show terse information about recorded boots\n" + " --disk-usage Show total disk usage of all journal files\n" + " --vacuum-size=BYTES Reduce disk usage below specified size\n" + " --vacuum-files=INT Leave only the specified number of journal files\n" + " --vacuum-time=TIME Remove journal files older than specified time\n" + " --verify Verify journal file consistency\n" + " --sync Synchronize unwritten journal messages to disk\n" + " --relinquish-var Stop logging to disk, log to temporary file system\n" + " --smart-relinquish-var Similar, but NOP if log directory is on root mount\n" + " --flush Flush all journal data from /run into /var\n" + " --rotate Request immediate rotation of the journal files\n" + " --header Show journal header information\n" + " --list-catalog Show all message IDs in the catalog\n" + " --dump-catalog Show entries in the message catalog\n" + " --update-catalog Update the message catalog database\n" + " --setup-keys Generate a new FSS key pair\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_FULL, + ARG_NO_TAIL, + ARG_NEW_ID128, + ARG_THIS_BOOT, + ARG_LIST_BOOTS, + ARG_USER, + ARG_SYSTEM, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_HEADER, + ARG_FACILITY, + ARG_SETUP_KEYS, + ARG_FILE, + ARG_INTERVAL, + ARG_VERIFY, + ARG_VERIFY_KEY, + ARG_DISK_USAGE, + ARG_AFTER_CURSOR, + ARG_CURSOR_FILE, + ARG_SHOW_CURSOR, + ARG_USER_UNIT, + ARG_LIST_CATALOG, + ARG_DUMP_CATALOG, + ARG_UPDATE_CATALOG, + 
ARG_FORCE, + ARG_CASE_SENSITIVE, + ARG_UTC, + ARG_SYNC, + ARG_FLUSH, + ARG_RELINQUISH_VAR, + ARG_SMART_RELINQUISH_VAR, + ARG_ROTATE, + ARG_TRUNCATE_NEWLINE, + ARG_VACUUM_SIZE, + ARG_VACUUM_FILES, + ARG_VACUUM_TIME, + ARG_NO_HOSTNAME, + ARG_OUTPUT_FIELDS, + ARG_NAMESPACE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version" , no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "pager-end", no_argument, NULL, 'e' }, + { "follow", no_argument, NULL, 'f' }, + { "force", no_argument, NULL, ARG_FORCE }, + { "output", required_argument, NULL, 'o' }, + { "all", no_argument, NULL, 'a' }, + { "full", no_argument, NULL, 'l' }, + { "no-full", no_argument, NULL, ARG_NO_FULL }, + { "lines", optional_argument, NULL, 'n' }, + { "truncate-newline", no_argument, NULL, ARG_TRUNCATE_NEWLINE }, + { "no-tail", no_argument, NULL, ARG_NO_TAIL }, + { "new-id128", no_argument, NULL, ARG_NEW_ID128 }, /* deprecated */ + { "quiet", no_argument, NULL, 'q' }, + { "merge", no_argument, NULL, 'm' }, + { "this-boot", no_argument, NULL, ARG_THIS_BOOT }, /* deprecated */ + { "boot", optional_argument, NULL, 'b' }, + { "list-boots", no_argument, NULL, ARG_LIST_BOOTS }, + { "dmesg", no_argument, NULL, 'k' }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "user", no_argument, NULL, ARG_USER }, + { "directory", required_argument, NULL, 'D' }, + { "file", required_argument, NULL, ARG_FILE }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "header", no_argument, NULL, ARG_HEADER }, + { "identifier", required_argument, NULL, 't' }, + { "priority", required_argument, NULL, 'p' }, + { "facility", required_argument, NULL, ARG_FACILITY }, + { "grep", required_argument, NULL, 'g' }, + { "case-sensitive", optional_argument, NULL, ARG_CASE_SENSITIVE }, + { "setup-keys", no_argument, NULL, ARG_SETUP_KEYS }, 
+ { "interval", required_argument, NULL, ARG_INTERVAL }, + { "verify", no_argument, NULL, ARG_VERIFY }, + { "verify-key", required_argument, NULL, ARG_VERIFY_KEY }, + { "disk-usage", no_argument, NULL, ARG_DISK_USAGE }, + { "cursor", required_argument, NULL, 'c' }, + { "cursor-file", required_argument, NULL, ARG_CURSOR_FILE }, + { "after-cursor", required_argument, NULL, ARG_AFTER_CURSOR }, + { "show-cursor", no_argument, NULL, ARG_SHOW_CURSOR }, + { "since", required_argument, NULL, 'S' }, + { "until", required_argument, NULL, 'U' }, + { "unit", required_argument, NULL, 'u' }, + { "user-unit", required_argument, NULL, ARG_USER_UNIT }, + { "field", required_argument, NULL, 'F' }, + { "fields", no_argument, NULL, 'N' }, + { "catalog", no_argument, NULL, 'x' }, + { "list-catalog", no_argument, NULL, ARG_LIST_CATALOG }, + { "dump-catalog", no_argument, NULL, ARG_DUMP_CATALOG }, + { "update-catalog", no_argument, NULL, ARG_UPDATE_CATALOG }, + { "reverse", no_argument, NULL, 'r' }, + { "machine", required_argument, NULL, 'M' }, + { "utc", no_argument, NULL, ARG_UTC }, + { "flush", no_argument, NULL, ARG_FLUSH }, + { "relinquish-var", no_argument, NULL, ARG_RELINQUISH_VAR }, + { "smart-relinquish-var", no_argument, NULL, ARG_SMART_RELINQUISH_VAR }, + { "sync", no_argument, NULL, ARG_SYNC }, + { "rotate", no_argument, NULL, ARG_ROTATE }, + { "vacuum-size", required_argument, NULL, ARG_VACUUM_SIZE }, + { "vacuum-files", required_argument, NULL, ARG_VACUUM_FILES }, + { "vacuum-time", required_argument, NULL, ARG_VACUUM_TIME }, + { "no-hostname", no_argument, NULL, ARG_NO_HOSTNAME }, + { "output-fields", required_argument, NULL, ARG_OUTPUT_FIELDS }, + { "namespace", required_argument, NULL, ARG_NAMESPACE }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hefo:aln::qmb::kD:p:g:c:S:U:t:u:NF:xrM:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 
ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case 'e': + arg_pager_flags |= PAGER_JUMP_TO_END; + + if (arg_lines == ARG_LINES_DEFAULT) + arg_lines = 1000; + + arg_boot = true; + + break; + + case 'f': + arg_follow = true; + break; + + case 'o': + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(output_mode, OutputMode, _OUTPUT_MODE_MAX); + return 0; + } + + arg_output = output_mode_from_string(optarg); + if (arg_output < 0) + return log_error_errno(arg_output, "Unknown output format '%s'.", optarg); + + if (IN_SET(arg_output, OUTPUT_EXPORT, OUTPUT_JSON, OUTPUT_JSON_PRETTY, OUTPUT_JSON_SSE, OUTPUT_JSON_SEQ, OUTPUT_CAT)) + arg_quiet = true; + + if (OUTPUT_MODE_IS_JSON(arg_output)) + arg_json_format_flags = output_mode_to_json_format_flags(arg_output) | JSON_FORMAT_COLOR_AUTO; + else + arg_json_format_flags = JSON_FORMAT_OFF; + + break; + + case 'l': + arg_full = true; + break; + + case ARG_NO_FULL: + arg_full = false; + break; + + case 'a': + arg_all = true; + break; + + case 'n': + r = parse_lines(optarg ?: argv[optind], !optarg); + if (r < 0) + return r; + if (r > 0 && !optarg) + optind++; + + break; + + case ARG_NO_TAIL: + arg_no_tail = true; + break; + + case ARG_TRUNCATE_NEWLINE: + arg_truncate_newline = true; + break; + + case ARG_NEW_ID128: + arg_action = ACTION_NEW_ID128; + break; + + case 'q': + arg_quiet = true; + break; + + case 'm': + arg_merge = true; + break; + + case ARG_THIS_BOOT: + arg_boot = true; + arg_boot_id = SD_ID128_NULL; + arg_boot_offset = 0; + break; + + case 'b': + arg_boot = true; + arg_boot_id = SD_ID128_NULL; + arg_boot_offset = 0; + + if (optarg) { + r = parse_boot_descriptor(optarg, &arg_boot_id, &arg_boot_offset); + if (r < 0) + return log_error_errno(r, "Failed to parse boot descriptor '%s'", optarg); + + arg_boot = r; + + /* Hmm, no argument? Maybe the next + * word on the command line is + * supposed to be the argument? Let's + * see if there is one and is parsable + * as a boot descriptor... 
*/ + } else if (optind < argc) { + r = parse_boot_descriptor(argv[optind], &arg_boot_id, &arg_boot_offset); + if (r >= 0) { + arg_boot = r; + optind++; + } + } + break; + + case ARG_LIST_BOOTS: + arg_action = ACTION_LIST_BOOTS; + break; + + case 'k': + arg_boot = arg_dmesg = true; + break; + + case ARG_SYSTEM: + arg_journal_type |= SD_JOURNAL_SYSTEM; + break; + + case ARG_USER: + arg_journal_type |= SD_JOURNAL_CURRENT_USER; + break; + + case 'M': + arg_machine = optarg; + break; + + case ARG_NAMESPACE: + if (streq(optarg, "*")) { + arg_namespace_flags = SD_JOURNAL_ALL_NAMESPACES; + arg_namespace = NULL; + } else if (startswith(optarg, "+")) { + arg_namespace_flags = SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE; + arg_namespace = optarg + 1; + } else if (isempty(optarg)) { + arg_namespace_flags = 0; + arg_namespace = NULL; + } else { + arg_namespace_flags = 0; + arg_namespace = optarg; + } + + break; + + case 'D': + arg_directory = optarg; + break; + + case ARG_FILE: + if (streq(optarg, "-")) + /* An undocumented feature: we can read journal files from STDIN. We don't document + * this though, since after all we only support this for mmap-able, seekable files, and + * not for example pipes which are probably the primary use case for reading things from + * STDIN. To avoid confusion we hence don't document this feature. 
*/ + arg_file_stdin = true; + else { + r = glob_extend(&arg_file, optarg, GLOB_NOCHECK); + if (r < 0) + return log_error_errno(r, "Failed to add paths: %m"); + } + break; + + case ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ true, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case 'c': + arg_cursor = optarg; + break; + + case ARG_CURSOR_FILE: + arg_cursor_file = optarg; + break; + + case ARG_AFTER_CURSOR: + arg_after_cursor = optarg; + break; + + case ARG_SHOW_CURSOR: + arg_show_cursor = true; + break; + + case ARG_HEADER: + arg_action = ACTION_PRINT_HEADER; + break; + + case ARG_VERIFY: + arg_action = ACTION_VERIFY; + break; + + case ARG_DISK_USAGE: + arg_action = ACTION_DISK_USAGE; + break; + + case ARG_VACUUM_SIZE: + r = parse_size(optarg, 1024, &arg_vacuum_size); + if (r < 0) + return log_error_errno(r, "Failed to parse vacuum size: %s", optarg); + + arg_action = arg_action == ACTION_ROTATE ? ACTION_ROTATE_AND_VACUUM : ACTION_VACUUM; + break; + + case ARG_VACUUM_FILES: + r = safe_atou64(optarg, &arg_vacuum_n_files); + if (r < 0) + return log_error_errno(r, "Failed to parse vacuum files: %s", optarg); + + arg_action = arg_action == ACTION_ROTATE ? ACTION_ROTATE_AND_VACUUM : ACTION_VACUUM; + break; + + case ARG_VACUUM_TIME: + r = parse_sec(optarg, &arg_vacuum_time); + if (r < 0) + return log_error_errno(r, "Failed to parse vacuum time: %s", optarg); + + arg_action = arg_action == ACTION_ROTATE ? 
ACTION_ROTATE_AND_VACUUM : ACTION_VACUUM; + break; + +#if HAVE_GCRYPT + case ARG_FORCE: + arg_force = true; + break; + + case ARG_SETUP_KEYS: + arg_action = ACTION_SETUP_KEYS; + break; + + case ARG_VERIFY_KEY: + r = free_and_strdup(&arg_verify_key, optarg); + if (r < 0) + return r; + /* Use memset not explicit_bzero() or similar so this doesn't look confusing + * in ps or htop output. */ + memset(optarg, 'x', strlen(optarg)); + + arg_action = ACTION_VERIFY; + arg_merge = false; + break; + + case ARG_INTERVAL: + r = parse_sec(optarg, &arg_interval); + if (r < 0 || arg_interval <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse sealing key change interval: %s", optarg); + break; +#else + case ARG_SETUP_KEYS: + case ARG_VERIFY_KEY: + case ARG_INTERVAL: + case ARG_FORCE: + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Compiled without forward-secure sealing support."); +#endif + + case 'p': { + const char *dots; + + dots = strstr(optarg, ".."); + if (dots) { + _cleanup_free_ char *a = NULL; + int from, to, i; + + /* a range */ + a = strndup(optarg, dots - optarg); + if (!a) + return log_oom(); + + from = log_level_from_string(a); + to = log_level_from_string(dots + 2); + + if (from < 0 || to < 0) + return log_error_errno(from < 0 ? 
from : to, + "Failed to parse log level range %s", optarg); + + arg_priorities = 0; + + if (from < to) { + for (i = from; i <= to; i++) + arg_priorities |= 1 << i; + } else { + for (i = to; i <= from; i++) + arg_priorities |= 1 << i; + } + + } else { + int p, i; + + p = log_level_from_string(optarg); + if (p < 0) + return log_error_errno(p, "Unknown log level %s", optarg); + + arg_priorities = 0; + + for (i = 0; i <= p; i++) + arg_priorities |= 1 << i; + } + + break; + } + + case ARG_FACILITY: { + const char *p; + + for (p = optarg;;) { + _cleanup_free_ char *fac = NULL; + int num; + + r = extract_first_word(&p, &fac, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse facilities: %s", optarg); + if (r == 0) + break; + + if (streq(fac, "help")) { + help_facilities(); + return 0; + } + + num = log_facility_unshifted_from_string(fac); + if (num < 0) + return log_error_errno(num, "Bad --facility= argument \"%s\".", fac); + + if (set_ensure_put(&arg_facilities, NULL, INT_TO_PTR(num)) < 0) + return log_oom(); + } + + break; + } + + case 'g': + arg_pattern = optarg; + break; + + case ARG_CASE_SENSITIVE: + if (optarg) { + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Bad --case-sensitive= argument \"%s\": %m", optarg); + arg_case = r ? 
PATTERN_COMPILE_CASE_SENSITIVE : PATTERN_COMPILE_CASE_INSENSITIVE; + } else + arg_case = PATTERN_COMPILE_CASE_SENSITIVE; + + break; + + case 'S': + r = parse_timestamp(optarg, &arg_since); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse timestamp: %s", optarg); + arg_since_set = true; + break; + + case 'U': + r = parse_timestamp(optarg, &arg_until); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse timestamp: %s", optarg); + arg_until_set = true; + break; + + case 't': + r = strv_extend(&arg_syslog_identifier, optarg); + if (r < 0) + return log_oom(); + break; + + case 'u': + r = strv_extend(&arg_system_units, optarg); + if (r < 0) + return log_oom(); + break; + + case ARG_USER_UNIT: + r = strv_extend(&arg_user_units, optarg); + if (r < 0) + return log_oom(); + break; + + case 'F': + arg_action = ACTION_LIST_FIELDS; + arg_field = optarg; + break; + + case 'N': + arg_action = ACTION_LIST_FIELD_NAMES; + break; + + case ARG_NO_HOSTNAME: + arg_no_hostname = true; + break; + + case 'x': + arg_catalog = true; + break; + + case ARG_LIST_CATALOG: + arg_action = ACTION_LIST_CATALOG; + break; + + case ARG_DUMP_CATALOG: + arg_action = ACTION_DUMP_CATALOG; + break; + + case ARG_UPDATE_CATALOG: + arg_action = ACTION_UPDATE_CATALOG; + break; + + case 'r': + arg_reverse = true; + break; + + case ARG_UTC: + arg_utc = true; + break; + + case ARG_FLUSH: + arg_action = ACTION_FLUSH; + break; + + case ARG_SMART_RELINQUISH_VAR: { + int root_mnt_id, log_mnt_id; + + /* Try to be smart about relinquishing access to /var/log/journal/ during shutdown: + * if it's on the same mount as the root file system there's no point in + * relinquishing access and we can leave journald write to it until the very last + * moment. 
*/ + + r = path_get_mnt_id("/", &root_mnt_id); + if (r < 0) + log_debug_errno(r, "Failed to get root mount ID, ignoring: %m"); + else { + r = path_get_mnt_id("/var/log/journal/", &log_mnt_id); + if (r < 0) + log_debug_errno(r, "Failed to get journal directory mount ID, ignoring: %m"); + else if (root_mnt_id == log_mnt_id) { + log_debug("/var/log/journal/ is on root file system, not relinquishing access to /var."); + return 0; + } else + log_debug("/var/log/journal/ is not on the root file system, relinquishing access to it."); + } + + _fallthrough_; + } + + case ARG_RELINQUISH_VAR: + arg_action = ACTION_RELINQUISH_VAR; + break; + + case ARG_ROTATE: + arg_action = arg_action == ACTION_VACUUM ? ACTION_ROTATE_AND_VACUUM : ACTION_ROTATE; + break; + + case ARG_SYNC: + arg_action = ACTION_SYNC; + break; + + case ARG_OUTPUT_FIELDS: { + _cleanup_strv_free_ char **v = NULL; + + v = strv_split(optarg, ","); + if (!v) + return log_oom(); + + r = set_put_strdupv(&arg_output_fields, v); + if (r < 0) + return log_oom(); + + break; + } + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_no_tail) + arg_lines = ARG_LINES_ALL; + + if (arg_follow && !arg_since_set && arg_lines == ARG_LINES_DEFAULT) + arg_lines = 10; + + if (arg_follow && !arg_merge && !arg_boot) { + arg_boot = true; + arg_boot_id = SD_ID128_NULL; + arg_boot_offset = 0; + } + + if (!!arg_directory + !!arg_file + !!arg_machine + !!arg_root + !!arg_image > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Please specify at most one of -D/--directory=, --file=, -M/--machine=, --root=, --image=."); + + if (arg_since_set && arg_until_set && arg_since > arg_until) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--since= must be before --until=."); + + if (!!arg_cursor + !!arg_after_cursor + !!arg_since_set > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Please specify only one of --since=, --cursor=, and --after-cursor=."); + + if (arg_follow && arg_reverse) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Please specify either --reverse or --follow, not both."); + + if (arg_lines >= 0 && arg_lines_oldest && (arg_reverse || arg_follow)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--lines=+N is unsupported when --reverse or --follow is specified."); + + if (!IN_SET(arg_action, ACTION_SHOW, ACTION_DUMP_CATALOG, ACTION_LIST_CATALOG) && optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Extraneous arguments starting with '%s'", + argv[optind]); + + if ((arg_boot || arg_action == ACTION_LIST_BOOTS) && arg_merge) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Using --boot or --list-boots with --merge is not supported."); + + if (!strv_isempty(arg_system_units) && arg_journal_type == SD_JOURNAL_CURRENT_USER) { + /* Specifying --user and --unit= at the same time makes no sense (as the former excludes the user + * journal, but the latter excludes the system journal, thus resulting in empty output). Let's be nice + * to users, and automatically turn --unit= into --user-unit= if combined with --user. */ + r = strv_extend_strv(&arg_user_units, arg_system_units, true); + if (r < 0) + return r; + + arg_system_units = strv_free(arg_system_units); + } + + if (arg_pattern) { + r = pattern_compile_and_log(arg_pattern, arg_case, &arg_compiled_pattern); + if (r < 0) + return r; + + /* When --grep is used along with --lines without '+', i.e. when we start from the end of the + * journal, we don't know how many lines we can print. So we search backwards and count until + * enough lines have been printed or we hit the head. + * An exception is that --follow might set arg_lines, so let's not imply --reverse + * if that is specified. 
*/ + if (arg_lines_needs_seek_end() && !arg_follow) + arg_reverse = true; + } + + return 1; +} + +static int add_matches(sd_journal *j, char **args) { + bool have_term = false; + /* have_term tells whether a match was added since the start or since the last plus-sign separator. */ + assert(j); + + STRV_FOREACH(i, args) { + int r; + + if (streq(*i, "+")) { + if (!have_term) + break; + r = sd_journal_add_disjunction(j); + have_term = false; + + } else if (path_is_absolute(*i)) { + _cleanup_free_ char *p = NULL, *t = NULL, *t2 = NULL, *interpreter = NULL; + struct stat st; + /* Absolute paths become _COMM=/_EXE= matches or device matches, depending on the file type. */ + r = chase(*i, NULL, CHASE_TRAIL_SLASH, &p, NULL); + if (r < 0) + return log_error_errno(r, "Couldn't canonicalize path: %m"); + + if (lstat(p, &st) < 0) + return log_error_errno(errno, "Couldn't stat file: %m"); + + if (S_ISREG(st.st_mode) && (0111 & st.st_mode)) { + if (executable_is_script(p, &interpreter) > 0) { + _cleanup_free_ char *comm = NULL; + + r = path_extract_filename(p, &comm); + if (r < 0) + return log_error_errno(r, "Failed to extract filename of '%s': %m", p); + + t = strjoin("_COMM=", strshorten(comm, TASK_COMM_LEN-1)); + if (!t) + return log_oom(); + + /* Append _EXE only if the interpreter is not a link. + Otherwise, it might be outdated often. 
*/ + if (lstat(interpreter, &st) == 0 && !S_ISLNK(st.st_mode)) { + t2 = strjoin("_EXE=", interpreter); + if (!t2) + return log_oom(); + } + } else { + t = strjoin("_EXE=", p); + if (!t) + return log_oom(); + } + + r = sd_journal_add_match(j, t, 0); + /* t2 is only set for scripts whose interpreter is not a symlink. */ + if (r >=0 && t2) + r = sd_journal_add_match(j, t2, 0); + + } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + r = add_matches_for_device(j, p); + if (r < 0) + return r; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "File is neither a device node, nor regular file, nor executable: %s", + *i); + + have_term = true; + } else { + r = sd_journal_add_match(j, *i, 0); + have_term = true; + } + + if (r < 0) + return log_error_errno(r, "Failed to add match '%s': %m", *i); + } + /* A leading, doubled or trailing plus-sign separator ends up here with have_term unset. */ + if (!strv_isempty(args) && !have_term) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "\"+\" can only be used between terms"); + + return 0; +} + +static int list_boots(sd_journal *j) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ BootId *boots = NULL; + size_t n_boots; + int r; + /* Prints one table row per boot returned by journal_get_boots(). */ + assert(j); + + r = journal_get_boots(j, &boots, &n_boots); + if (r < 0) + return log_error_errno(r, "Failed to determine boots: %m"); + if (r == 0) + return 0; + + table = table_new("idx", "boot id", "first entry", "last entry"); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + r = table_set_json_field_name(table, 0, "index"); + if (r < 0) + return log_error_errno(r, "Failed to set JSON field name of column 0: %m"); + + (void) table_set_sort(table, (size_t) 0); + (void) table_set_reverse(table, 0, arg_reverse); + /* The index column ends at 0: the last array element gets 0, the ones before it negative values. */ + FOREACH_ARRAY(i, boots, n_boots) { + r = table_add_many(table, + TABLE_INT, (int)(i - boots) - (int) n_boots + 1, + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_ID128, i->id, + TABLE_TIMESTAMP, i->first_usec, + TABLE_TIMESTAMP, i->last_usec); + if (r < 0) + return table_log_add_error(r); + } + + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, !arg_quiet); + if 
(r < 0) + return table_log_print_error(r); + + return 0; +} + +static int add_boot(sd_journal *j) { + int r; + /* Restricts the query to a single boot if arg_boot is set, otherwise does nothing. */ + assert(j); + + if (!arg_boot) + return 0; + + /* Take a shortcut and use the current boot_id, which we can do very quickly. + * We can do this only when the logs are coming from the current machine, + * so take the slow path if log location is specified. */ + if (arg_boot_offset == 0 && sd_id128_is_null(arg_boot_id) && + !arg_directory && !arg_file && !arg_root) + return add_match_this_boot(j, arg_machine); + /* Slow path: look the boot up by offset if no boot ID was given, else check that the given ID exists. */ + if (sd_id128_is_null(arg_boot_id)) { + r = journal_find_boot_by_offset(j, arg_boot_offset, &arg_boot_id); + if (r < 0) + return log_error_errno(r, "Failed to find journal entry from the specified boot offset (%+i): %m", + arg_boot_offset); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), + "No journal boot entry found from the specified boot offset (%+i).", + arg_boot_offset); + } else { + r = journal_find_boot_by_id(j, arg_boot_id); + if (r < 0) + return log_error_errno(r, "Failed to find journal entry from the specified boot ID (%s): %m", + SD_ID128_TO_STRING(arg_boot_id)); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), + "No journal boot entry found from the specified boot ID (%s).", + SD_ID128_TO_STRING(arg_boot_id)); + } + + r = add_match_boot_id(j, arg_boot_id); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +static int add_dmesg(sd_journal *j) { + int r; + assert(j); + /* Restricts the query to entries with _TRANSPORT=kernel if arg_dmesg is set, otherwise does nothing. */ + if (!arg_dmesg) + return 0; + + r = sd_journal_add_match(j, "_TRANSPORT=kernel", + STRLEN("_TRANSPORT=kernel")); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +static int get_possible_units( + sd_journal *j, + const char *fields, + char 
**patterns, + Set **units) { + /* Collects in *units those values of the listed fields that match one of the glob patterns. */ + _cleanup_set_free_free_ Set *found = NULL; + int r; + + found = set_new(&string_hash_ops); + if (!found) + return -ENOMEM; + + NULSTR_FOREACH(field, fields) { + const void *data; + size_t size; + + r = sd_journal_query_unique(j, field); + if (r < 0) + return r; + + SD_JOURNAL_FOREACH_UNIQUE(j, data, size) { + char *eq; + size_t prefix; + _cleanup_free_ char *u = NULL; + /* Skip everything up to and including the first equal sign, so that u holds only what follows it. */ + eq = memchr(data, '=', size); + if (eq) + prefix = eq - (char*) data + 1; + else + prefix = 0; + + u = strndup((char*) data + prefix, size - prefix); + if (!u) + return -ENOMEM; + + STRV_FOREACH(pattern, patterns) + if (fnmatch(*pattern, u, FNM_NOESCAPE) == 0) { + log_debug("Matched %s with pattern %s=%s", u, field, *pattern); + + r = set_consume(found, u); + u = NULL; + if (r < 0 && r != -EEXIST) + return r; + + break; + } + } + } + + *units = TAKE_PTR(found); + + return 0; +} + +/* This list is supposed to return the superset of unit names + * possibly matched by rules added with add_matches_for_unit... */ +#define SYSTEM_UNITS \ + "_SYSTEMD_UNIT\0" \ + "COREDUMP_UNIT\0" \ + "UNIT\0" \ + "OBJECT_SYSTEMD_UNIT\0" \ + "_SYSTEMD_SLICE\0" + +/* ... and add_matches_for_user_unit */ +#define USER_UNITS \ + "_SYSTEMD_USER_UNIT\0" \ + "USER_UNIT\0" \ + "COREDUMP_USER_UNIT\0" \ + "OBJECT_SYSTEMD_USER_UNIT\0" \ + "_SYSTEMD_USER_SLICE\0" + +static int add_units(sd_journal *j) { + _cleanup_strv_free_ char **patterns = NULL; + int r, count = 0; + /* count is the number of unit matches added; glob patterns are expanded via get_possible_units(). */ + assert(j); + + STRV_FOREACH(i, arg_system_units) { + _cleanup_free_ char *u = NULL; + + r = unit_name_mangle(*i, UNIT_NAME_MANGLE_GLOB | (arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN), &u); + if (r < 0) + return r; + + if (string_is_glob(u)) { + r = strv_push(&patterns, u); + if (r < 0) + return r; + u = NULL; + } else { + r = add_matches_for_unit(j, u); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + /* Expand the glob patterns collected above against the fields listed in SYSTEM_UNITS. */ + if (!strv_isempty(patterns)) { + _cleanup_set_free_free_ Set *units = NULL; + char *u; + + r = get_possible_units(j, SYSTEM_UNITS, patterns, &units); + if (r < 0) + return r; + + SET_FOREACH(u, units) { + r = add_matches_for_unit(j, u); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + patterns = strv_free(patterns); + /* Same procedure again for arg_user_units, reusing the patterns list that was just freed. */ + STRV_FOREACH(i, arg_user_units) { + _cleanup_free_ char *u = NULL; + + r = unit_name_mangle(*i, UNIT_NAME_MANGLE_GLOB | (arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN), &u); + if (r < 0) + return r; + + if (string_is_glob(u)) { + r = strv_push(&patterns, u); + if (r < 0) + return r; + u = NULL; + } else { + r = add_matches_for_user_unit(j, u, getuid()); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + if (!strv_isempty(patterns)) { + _cleanup_set_free_free_ Set *units = NULL; + char *u; + + r = get_possible_units(j, USER_UNITS, patterns, &units); + if (r < 0) + return r; + + SET_FOREACH(u, units) { + r = add_matches_for_user_unit(j, u, getuid()); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + count++; + } + } + + /* Complain if the user request matches but nothing whatsoever was + * found, since otherwise everything would be matched. 
*/ + if (!(strv_isempty(arg_system_units) && strv_isempty(arg_user_units)) && count == 0) + return -ENODATA; + + r = sd_journal_add_conjunction(j); + if (r < 0) + return r; + + return 0; +} + +static int add_priorities(sd_journal *j) { + char match[] = "PRIORITY=0"; + int i, r; + assert(j); + /* If all of the low eight bits of arg_priorities are set, no match is added at all. */ + if (arg_priorities == 0xFF) + return 0; + + for (i = LOG_EMERG; i <= LOG_DEBUG; i++) + if (arg_priorities & (1 << i)) { + match[sizeof(match)-2] = '0' + i; + /* sizeof(match)-2 is the index of the trailing digit of the PRIORITY=0 template. */ + r = sd_journal_add_match(j, match, strlen(match)); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + } + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +static int add_facilities(sd_journal *j) { + void *p; + int r; + /* Adds one SYSLOG_FACILITY=N match per entry of arg_facilities. */ + SET_FOREACH(p, arg_facilities) { + char match[STRLEN("SYSLOG_FACILITY=") + DECIMAL_STR_MAX(int)]; + + xsprintf(match, "SYSLOG_FACILITY=%d", PTR_TO_INT(p)); + + r = sd_journal_add_match(j, match, strlen(match)); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + } + + return 0; +} + +static int add_syslog_identifier(sd_journal *j) { + int r; + + assert(j); + /* Adds a SYSLOG_IDENTIFIER= match per entry of arg_syslog_identifier, each followed by a disjunction. */ + STRV_FOREACH(i, arg_syslog_identifier) { + _cleanup_free_ char *u = NULL; + + u = strjoin("SYSLOG_IDENTIFIER=", *i); + if (!u) + return -ENOMEM; + r = sd_journal_add_match(j, u, 0); + if (r < 0) + return r; + r = sd_journal_add_disjunction(j); + if (r < 0) + return r; + } + + r = sd_journal_add_conjunction(j); + if (r < 0) + return r; + + return 0; +} + +#if HAVE_GCRYPT +static int format_journal_url( + const void *seed, + size_t seed_size, + uint64_t start, + uint64_t interval, + const char *hn, + sd_id128_t machine, + bool full, + char **ret_url) { + /* Builds the key string in a memory stream; full additionally enables the scheme prefix and the machine/hostname suffix. */ + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + assert(seed); + assert(seed_size > 0); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + if (full) + fputs("fss://", f); + + for (size_t i = 0; i < seed_size; i++) { + if (i > 0 && i % 3 == 0) + fputc('-', 
f); + fprintf(f, "%02x", ((uint8_t*) seed)[i]); + } + + fprintf(f, "/%"PRIx64"-%"PRIx64, start, interval); + + if (full) { + fprintf(f, "?machine=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(machine)); + if (hn) + fprintf(f, ";hostname=%s", hn); + } + + return memstream_finalize(&m, ret_url, NULL); +} +#endif + +static int setup_keys(void) { +#if HAVE_GCRYPT + size_t mpk_size, seed_size, state_size; + _cleanup_(unlink_and_freep) char *k = NULL; + _cleanup_free_ char *p = NULL; + uint8_t *mpk, *seed, *state; + _cleanup_close_ int fd = -EBADF; + sd_id128_t machine, boot; + struct stat st; + uint64_t n; + int r; + /* The key file is created below /var/log/journal, hence that directory has to exist. */ + r = stat("/var/log/journal", &st); + if (r < 0 && !IN_SET(errno, ENOENT, ENOTDIR)) + return log_error_errno(errno, "stat(\"%s\") failed: %m", "/var/log/journal"); + + if (r < 0 || !S_ISDIR(st.st_mode)) { + log_error("%s is not a directory, must be using persistent logging for FSS.", + "/var/log/journal"); + return r < 0 ? -errno : -ENOTDIR; + } + + r = sd_id128_get_machine(&machine); + if (r < 0) + return log_error_errno(r, "Failed to get machine ID: %m"); + + r = sd_id128_get_boot(&boot); + if (r < 0) + return log_error_errno(r, "Failed to get boot ID: %m"); + + if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fss", + SD_ID128_FORMAT_VAL(machine)) < 0) + return log_oom(); + + if (arg_force) { + r = unlink(p); + if (r < 0 && errno != ENOENT) + return log_error_errno(errno, "unlink(\"%s\") failed: %m", p); + } else if (access(p, F_OK) >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "Sealing key file %s exists already. 
Use --force to recreate.", p); + + if (asprintf(&k, "/var/log/journal/" SD_ID128_FORMAT_STR "/fss.tmp.XXXXXX", + SD_ID128_FORMAT_VAL(machine)) < 0) + return log_oom(); + /* Buffers for mpk, seed and state, all obtained via alloca_safe(). */ + mpk_size = FSPRG_mskinbytes(FSPRG_RECOMMENDED_SECPAR); + mpk = alloca_safe(mpk_size); + + seed_size = FSPRG_RECOMMENDED_SEEDLEN; + seed = alloca_safe(seed_size); + + state_size = FSPRG_stateinbytes(FSPRG_RECOMMENDED_SECPAR); + state = alloca_safe(state_size); + + log_info("Generating seed..."); + r = crypto_random_bytes(seed, seed_size); + if (r < 0) + return log_error_errno(r, "Failed to acquire random seed: %m"); + + log_info("Generating key pair..."); + FSPRG_GenMK(NULL, mpk, seed, seed_size, FSPRG_RECOMMENDED_SECPAR); + + log_info("Generating sealing key..."); + FSPRG_GenState0(state, mpk, seed, seed_size); + + assert(arg_interval > 0); + /* n is the current realtime clock expressed in units of arg_interval. */ + n = now(CLOCK_REALTIME); + n /= arg_interval; + /* Write to the temporary file k first; it is renamed to its final name p once complete. */ + safe_close(fd); + fd = mkostemp_safe(k); + if (fd < 0) + return log_error_errno(fd, "Failed to open %s: %m", k); + + r = chattr_secret(fd, CHATTR_WARN_UNSUPPORTED_FLAGS); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) ? 
LOG_DEBUG : LOG_WARNING, + r, "Failed to set file attributes on '%s', ignoring: %m", k); + + struct FSSHeader h = { + .signature = { 'K', 'S', 'H', 'H', 'R', 'H', 'L', 'P' }, + .machine_id = machine, + .boot_id = boot, + .header_size = htole64(sizeof(h)), + .start_usec = htole64(n * arg_interval), + .interval_usec = htole64(arg_interval), + .fsprg_secpar = htole16(FSPRG_RECOMMENDED_SECPAR), + .fsprg_state_size = htole64(state_size), + }; + + r = loop_write(fd, &h, sizeof(h)); + if (r < 0) + return log_error_errno(r, "Failed to write header: %m"); + + r = loop_write(fd, state, state_size); + if (r < 0) + return log_error_errno(r, "Failed to write state: %m"); + + if (rename(k, p) < 0) + return log_error_errno(errno, "Failed to link file: %m"); + /* k was renamed to p; drop it so that the unlink_and_freep cleanup has nothing left to act on. */ + k = mfree(k); + + _cleanup_free_ char *hn = NULL, *key = NULL; + + r = format_journal_url(seed, seed_size, n, arg_interval, hn, machine, false, &key); + if (r < 0) + return r; + /* The explanatory text goes to stderr and only on a TTY; the key itself is always written to stdout below. */ + if (on_tty()) { + hn = gethostname_malloc(); + if (hn) + hostname_cleanup(hn); + + fprintf(stderr, + "\nNew keys have been generated for host %s%s" SD_ID128_FORMAT_STR ".\n" + "\n" + "The %ssecret sealing key%s has been written to the following local file.\n" + "This key file is automatically updated when the sealing key is advanced.\n" + "It should not be used on multiple hosts.\n" + "\n" + "\t%s\n" + "\n" + "The sealing key is automatically changed every %s.\n" + "\n" + "Please write down the following %ssecret verification key%s. It should be stored\n" + "in a safe location and should not be saved locally on disk.\n" + "\n\t%s", + strempty(hn), hn ? 
"/" : "", + SD_ID128_FORMAT_VAL(machine), + ansi_highlight(), ansi_normal(), + p, + FORMAT_TIMESPAN(arg_interval, 0), + ansi_highlight(), ansi_normal(), + ansi_highlight_red()); + fflush(stderr); + } + + puts(key); + + if (on_tty()) { + fprintf(stderr, "%s", ansi_normal()); +#if HAVE_QRENCODE + _cleanup_free_ char *url = NULL; + r = format_journal_url(seed, seed_size, n, arg_interval, hn, machine, true, &url); + if (r < 0) + return r; + + (void) print_qrcode(stderr, + "To transfer the verification key to your phone scan the QR code below", + url); +#endif + } + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Forward-secure sealing not available."); +#endif +} + +static int verify(sd_journal *j, bool verbose) { + int r = 0; + JournalFile *f; + /* Verifies each file in j->files. r is overwritten whenever a file fails and stays 0 otherwise. */ + assert(j); + + log_show_color(true); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + int k; + usec_t first = 0, validated = 0, last = 0; + +#if HAVE_GCRYPT + if (!arg_verify_key && JOURNAL_HEADER_SEALED(f->header)) + log_notice("Journal file %s has sealing enabled but verification key has not been passed using --verify-key=.", f->path); +#endif + + k = journal_file_verify(f, arg_verify_key, &first, &validated, &last, verbose); + if (k == -EINVAL) + /* If the key was invalid give up right-away. */ + return k; + else if (k < 0) + r = log_warning_errno(k, "FAIL: %s (%m)", f->path); + else { + char a[FORMAT_TIMESTAMP_MAX], b[FORMAT_TIMESTAMP_MAX]; + log_full(verbose ? LOG_INFO : LOG_DEBUG, "PASS: %s", f->path); + + if (arg_verify_key && JOURNAL_HEADER_SEALED(f->header)) { + if (validated > 0) { + log_full(verbose ? LOG_INFO : LOG_DEBUG, + "=> Validated from %s to %s, final %s entries not sealed.", + format_timestamp_maybe_utc(a, sizeof(a), first), + format_timestamp_maybe_utc(b, sizeof(b), validated), + FORMAT_TIMESPAN(last > validated ? last - validated : 0, 0)); + } else if (last > 0) + log_full(verbose ? 
LOG_INFO : LOG_DEBUG, + "=> No sealing yet, %s of entries not sealed.", + FORMAT_TIMESPAN(last - first, 0)); + else + log_full(verbose ? LOG_INFO : LOG_DEBUG, + "=> No sealing yet, no entries in file."); + } + } + } + + return r; +} + +static int simple_varlink_call(const char *option, const char *method) { + _cleanup_(varlink_flush_close_unrefp) Varlink *link = NULL; + const char *error, *fn; + int r; + /* Calls the given Varlink method, passing NULL parameters, on the journal socket; option is only used in the error message. */ + if (arg_machine) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "%s is not supported in conjunction with --machine=.", option); + /* The socket path depends on whether arg_namespace is set. */ + fn = arg_namespace ? + strjoina("/run/systemd/journal.", arg_namespace, "/io.systemd.journal") : + "/run/systemd/journal/io.systemd.journal"; + + r = varlink_connect_address(&link, fn); + if (r < 0) + return log_error_errno(r, "Failed to connect to %s: %m", fn); + + (void) varlink_set_description(link, "journal"); + (void) varlink_set_relative_timeout(link, USEC_INFINITY); + + r = varlink_call(link, method, NULL, NULL, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to execute varlink call: %m"); + if (error) + return log_error_errno(SYNTHETIC_ERRNO(ENOANO), + "Failed to execute varlink call: %s", error); + + return 0; +} + +static int flush_to_var(void) { + if (access("/run/systemd/journal/flushed", F_OK) >= 0) + return 0; /* Already flushed, no need to contact journald */ + if (errno != ENOENT) + return log_error_errno(errno, "Unable to check for existence of /run/systemd/journal/flushed: %m"); + /* The flag file does not exist yet, so request the flush. */ + return simple_varlink_call("--flush", "io.systemd.Journal.FlushToVar"); +} + +static int relinquish_var(void) { + return simple_varlink_call("--relinquish-var/--smart-relinquish-var", "io.systemd.Journal.RelinquishVar"); +} + +static int rotate(void) { + return simple_varlink_call("--rotate", "io.systemd.Journal.Rotate"); +} + +static int sync_journal(void) { + return simple_varlink_call("--sync", "io.systemd.Journal.Synchronize"); +} + +static int action_list_fields(sd_journal *j) { + const void *data; + size_t 
size; + int r, n_shown = 0; + + assert(arg_field); + + r = sd_journal_set_data_threshold(j, 0); + if (r < 0) + return log_error_errno(r, "Failed to unset data size threshold: %m"); + + r = sd_journal_query_unique(j, arg_field); + if (r < 0) + return log_error_errno(r, "Failed to query unique data objects: %m"); + + SD_JOURNAL_FOREACH_UNIQUE(j, data, size) { + const void *eq; + /* Stop once arg_lines values have been printed, if a non-negative limit is set. */ + if (arg_lines >= 0 && n_shown >= arg_lines) + break; + + eq = memchr(data, '=', size); + if (eq) + printf("%.*s\n", (int) (size - ((const uint8_t*) eq - (const uint8_t*) data + 1)), (const char*) eq + 1); + else + printf("%.*s\n", (int) size, (const char*) data); + + n_shown++; + } + + return 0; +} + +static int update_cursor(sd_journal *j) { + _cleanup_free_ char *cursor = NULL; + int r; + + assert(j); + /* Prints and/or stores the current cursor of j, depending on arg_show_cursor and arg_cursor_file. */ + if (!arg_show_cursor && !arg_cursor_file) + return 0; + + r = sd_journal_get_cursor(j, &cursor); + if (r == -EADDRNOTAVAIL) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to get cursor: %m"); + + if (arg_show_cursor) + printf("-- cursor: %s\n", cursor); + + if (arg_cursor_file) { + r = write_string_file(arg_cursor_file, cursor, WRITE_STRING_FILE_CREATE | WRITE_STRING_FILE_ATOMIC); + if (r < 0) + return log_error_errno(r, "Failed to write new cursor to %s: %m", arg_cursor_file); + } + + return 0; +} + +typedef struct Context { + sd_journal *journal; + bool need_seek; /* when set, show() advances by one entry before reading */ + bool since_seeked; /* set by show() after seeking to arg_since or seeing an entry that is not older than it */ + bool ellipsized; + bool previous_boot_id_valid; + sd_id128_t previous_boot_id; /* boot ID of the entry show() handled last, used for the boot separator line */ + sd_id128_t previous_boot_id_output; + dual_timestamp previous_ts_output; +} Context; + +static int show(Context *c) { + sd_journal *j; + int r, n_shown = 0; + /* Outputs entries starting at the current position and returns the number shown, or a negative error. */ + assert(c); + + j = ASSERT_PTR(c->journal); + + while (arg_lines < 0 || n_shown < arg_lines || arg_follow) { + int flags; + size_t highlight[2] = {}; + + if (c->need_seek) { + r = sd_journal_step_one(j, !arg_reverse); + if (r < 0) + return log_error_errno(r, "Failed to iterate through journal: %m"); + if (r == 0) + break; + } + + if (arg_until_set && 
!arg_reverse && (arg_lines < 0 || arg_since_set)) { + /* If --lines= is set, we usually rely on the n_shown to tell us + * when to stop. However, if --since= is set too, we may end up + * having less than --lines= to output. In this case let's also + * check if the entry is in range. */ + + usec_t usec; + + r = sd_journal_get_realtime_usec(j, &usec); + if (r < 0) + return log_error_errno(r, "Failed to determine timestamp: %m"); + if (usec > arg_until) + break; + } + + if (arg_since_set && (arg_reverse || !c->since_seeked)) { + usec_t usec; + + r = sd_journal_get_realtime_usec(j, &usec); + if (r < 0) + return log_error_errno(r, "Failed to determine timestamp: %m"); + + if (usec < arg_since) { + if (arg_reverse) + break; /* Reached the earliest entry */ + + /* arg_lines >= 0 (!since_seeked): + * We jumped arg_lines back and it seems to be too much */ + r = sd_journal_seek_realtime_usec(j, arg_since); + if (r < 0) + return log_error_errno(r, "Failed to seek to date: %m"); + c->since_seeked = true; + + c->need_seek = true; + continue; + } + c->since_seeked = true; /* We're surely within the range of --since now */ + } + + if (!arg_merge && !arg_quiet) { + sd_id128_t boot_id; + /* Print a separator line whenever the boot ID differs from that of the previous entry. */ + r = sd_journal_get_monotonic_usec(j, NULL, &boot_id); + if (r >= 0) { + if (c->previous_boot_id_valid && + !sd_id128_equal(boot_id, c->previous_boot_id)) + printf("%s-- Boot "SD_ID128_FORMAT_STR" --%s\n", + ansi_highlight(), SD_ID128_FORMAT_VAL(boot_id), ansi_normal()); + + c->previous_boot_id = boot_id; + c->previous_boot_id_valid = true; + } + } + + if (arg_compiled_pattern) { + const void *message; + size_t len; + /* Entries without a MESSAGE field, or whose message does not match the pattern, are skipped. */ + r = sd_journal_get_data(j, "MESSAGE", &message, &len); + if (r < 0) { + if (r == -ENOENT) { + c->need_seek = true; + continue; + } + + return log_error_errno(r, "Failed to get MESSAGE field: %m"); + } + + assert_se(message = startswith(message, "MESSAGE=")); + + r = pattern_matches_and_log(arg_compiled_pattern, message, + len - strlen("MESSAGE="), highlight); + if (r < 0) + 
return r; + if (r == 0) { + c->need_seek = true; + continue; + } + } + /* Assemble the output flags: each arg_* switch is multiplied with its OUTPUT_* bit. */ + flags = + arg_all * OUTPUT_SHOW_ALL | + arg_full * OUTPUT_FULL_WIDTH | + colors_enabled() * OUTPUT_COLOR | + arg_catalog * OUTPUT_CATALOG | + arg_utc * OUTPUT_UTC | + arg_truncate_newline * OUTPUT_TRUNCATE_NEWLINE | + arg_no_hostname * OUTPUT_NO_HOSTNAME; + + r = show_journal_entry(stdout, j, arg_output, 0, flags, + arg_output_fields, highlight, &c->ellipsized, + &c->previous_ts_output, &c->previous_boot_id_output); + c->need_seek = true; + if (r == -EADDRNOTAVAIL) + break; + if (r < 0) + return r; + + n_shown++; + + /* If journalctl takes a long time to process messages, and during that time journal file + * rotation occurs, a journalctl client will keep those rotated files open until it calls + * sd_journal_process(), which typically happens as a result of calling sd_journal_wait() below + * in the "following" case. By periodically calling sd_journal_process() during the processing + * loop we shrink the window of time a client instance has open file descriptors for rotated + * (deleted) journal files. 
*/ + if ((n_shown % PROCESS_INOTIFY_INTERVAL) == 0) { + r = sd_journal_process(j); + if (r < 0) + return log_error_errno(r, "Failed to process inotify events: %m"); + } + } + + return n_shown; +} + +static int show_and_fflush(Context *c, sd_event_source *s) { + int r; + /* Runs show() and flushes stdout; a failure of show() is turned into an exit request for the event loop. */ + assert(c); + assert(s); + + r = show(c); + if (r < 0) + return sd_event_exit(sd_event_source_get_event(s), r); + + fflush(stdout); + return 0; +} + +static int on_journal_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Context *c = ASSERT_PTR(userdata); + int r; + /* IO callback installed by setup_event(): processes pending journal events, then shows new entries. */ + assert(s); + + r = sd_journal_process(c->journal); + if (r < 0) { + log_error_errno(r, "Failed to process journal events: %m"); + return sd_event_exit(sd_event_source_get_event(s), r); + } + + return show_and_fflush(c, s); +} + +static int on_first_event(sd_event_source *s, void *userdata) { + return show_and_fflush(userdata, s); +} + +static int on_signal(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + assert(s); + assert(si); + assert(IN_SET(si->ssi_signo, SIGTERM, SIGINT)); + /* The signal number is handed to sd_event_exit() as the exit code of the loop. */ + return sd_event_exit(sd_event_source_get_event(s), si->ssi_signo); +} + +static int setup_event(Context *c, int fd, sd_event **ret) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + int r; + /* Sets up the event loop: signal handlers, a watch on fd, a hangup watch on stdout and optionally a deferred first run. */ + assert(arg_follow); + assert(c); + assert(fd >= 0); + assert(ret); + + r = sd_event_default(&e); + if (r < 0) + return log_error_errno(r, "Failed to allocate sd_event object: %m"); + + (void) sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_signal, NULL); + (void) sd_event_add_signal(e, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, on_signal, NULL); + + r = sd_event_add_io(e, NULL, fd, EPOLLIN, &on_journal_event, c); + if (r < 0) + return log_error_errno(r, "Failed to add io event source for journal: %m"); + + /* Also keeps an eye on STDOUT, and exits as soon as we see a POLLHUP on that, i.e. when it is closed. 
*/ + r = sd_event_add_io(e, NULL, STDOUT_FILENO, EPOLLHUP|EPOLLERR, NULL, INT_TO_PTR(-ECANCELED)); + if (r == -EPERM) + /* Installing an epoll watch on a regular file doesn't work and fails with EPERM. Which is + * totally OK, handle it gracefully. epoll_ctl() documents EPERM as the error returned when + * the specified fd doesn't support epoll, hence it's safe to check for that. */ + log_debug_errno(r, "Unable to install EPOLLHUP watch on stderr, not watching for hangups."); + else if (r < 0) + return log_error_errno(r, "Failed to add io event source for stdout: %m"); + + if (arg_lines != 0 || arg_since_set) { + r = sd_event_add_defer(e, NULL, on_first_event, c); + if (r < 0) + return log_error_errno(r, "Failed to add defer event source: %m"); + } + + *ret = TAKE_PTR(e); + return 0; +} + +static int run(int argc, char *argv[]) { + bool need_seek = false, since_seeked = false, after_cursor = false; + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_free_ char *cursor_from_file = NULL; + const char *cursor = NULL; + int n_shown, r, poll_fd = -EBADF; + + setlocale(LC_ALL, ""); + log_setup(); + + /* Increase max number of open files if we can, we might needs this when browsing journal files, which might be + * split up into many files. */ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_VALIDATE_OS | + DISSECT_IMAGE_RELAX_VAR_CHECK | + (arg_action == ACTION_UPDATE_CATALOG ? 
DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS : DISSECT_IMAGE_READ_ONLY), + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } + + signal(SIGWINCH, columns_lines_cache_reset); + sigbus_install(); + + switch (arg_action) { + + case ACTION_NEW_ID128: + return id128_print_new(ID128_PRINT_PRETTY); + + case ACTION_SETUP_KEYS: + return setup_keys(); + + case ACTION_LIST_CATALOG: + case ACTION_DUMP_CATALOG: + case ACTION_UPDATE_CATALOG: { + _cleanup_free_ char *database = NULL; + + database = path_join(arg_root, secure_getenv("SYSTEMD_CATALOG") ?: CATALOG_DATABASE); + if (!database) + return log_oom(); + + if (arg_action == ACTION_UPDATE_CATALOG) { + const char *e; + + e = secure_getenv("SYSTEMD_CATALOG_SOURCES"); + + r = catalog_update( + database, + arg_root, + e ? (const char* const*) STRV_MAKE(e) : catalog_file_dirs); + if (r < 0) + return log_error_errno(r, "Failed to update catalog: %m"); + } else { + bool oneline = arg_action == ACTION_LIST_CATALOG; + + pager_open(arg_pager_flags); + + if (optind < argc) + r = catalog_list_items(stdout, database, oneline, argv + optind); + else + r = catalog_list(stdout, database, oneline); + if (r < 0) + return log_error_errno(r, "Failed to list catalog: %m"); + } + + return 0; + } + + case ACTION_FLUSH: + return flush_to_var(); + + case ACTION_RELINQUISH_VAR: + return relinquish_var(); + + case ACTION_SYNC: + return sync_journal(); + + case ACTION_ROTATE: + return rotate(); + + case ACTION_SHOW: + case ACTION_PRINT_HEADER: + case ACTION_VERIFY: + case ACTION_DISK_USAGE: + case ACTION_LIST_BOOTS: + case ACTION_VACUUM: + case ACTION_ROTATE_AND_VACUUM: + case ACTION_LIST_FIELDS: + case ACTION_LIST_FIELD_NAMES: + /* These ones require access to the journal files, continue below.
*/ + break; + + default: + assert_not_reached(); + } + + if (arg_directory) + r = sd_journal_open_directory(&j, arg_directory, arg_journal_type); + else if (arg_root) + r = sd_journal_open_directory(&j, arg_root, arg_journal_type | SD_JOURNAL_OS_ROOT); + else if (arg_file_stdin) + r = sd_journal_open_files_fd(&j, (int[]) { STDIN_FILENO }, 1, 0); + else if (arg_file) + r = sd_journal_open_files(&j, (const char**) arg_file, 0); + else if (arg_machine) + r = journal_open_machine(&j, arg_machine); + else + r = sd_journal_open_namespace( + &j, + arg_namespace, + (arg_merge ? 0 : SD_JOURNAL_LOCAL_ONLY) | + arg_namespace_flags | arg_journal_type); + if (r < 0) + return log_error_errno(r, "Failed to open %s: %m", arg_directory ?: arg_file ? "files" : "journal"); + + r = journal_access_check_and_warn(j, arg_quiet, + !(arg_journal_type == SD_JOURNAL_CURRENT_USER || arg_user_units)); + if (r < 0) + return r; + + switch (arg_action) { + + case ACTION_NEW_ID128: + case ACTION_SETUP_KEYS: + case ACTION_LIST_CATALOG: + case ACTION_DUMP_CATALOG: + case ACTION_UPDATE_CATALOG: + case ACTION_FLUSH: + case ACTION_SYNC: + case ACTION_ROTATE: + assert_not_reached(); + + case ACTION_PRINT_HEADER: + journal_print_header(j); + return 0; + + case ACTION_VERIFY: + return verify(j, !arg_quiet); + + case ACTION_DISK_USAGE: { + uint64_t bytes = 0; + + r = sd_journal_get_usage(j, &bytes); + if (r < 0) + return r; + + printf("Archived and active journals take up %s in the file system.\n", + FORMAT_BYTES(bytes)); + + return 0; + } + + case ACTION_LIST_BOOTS: + return list_boots(j); + + case ACTION_ROTATE_AND_VACUUM: + + r = rotate(); + if (r < 0) + return r; + + _fallthrough_; + + case ACTION_VACUUM: { + Directory *d; + int ret = 0; + + HASHMAP_FOREACH(d, j->directories_by_path) { + r = journal_directory_vacuum(d->path, arg_vacuum_size, arg_vacuum_n_files, arg_vacuum_time, NULL, !arg_quiet); + if (r < 0) { + log_error_errno(r, "Failed to vacuum %s: %m", d->path); + if (ret >= 0) + ret = r; + } + } 
+ + return ret; + } + + case ACTION_LIST_FIELD_NAMES: { + const char *field; + + SD_JOURNAL_FOREACH_FIELD(j, field) + printf("%s\n", field); + + return 0; + } + + case ACTION_SHOW: + case ACTION_LIST_FIELDS: + break; + + default: + assert_not_reached(); + } + + if (arg_boot_offset != 0 && + sd_journal_has_runtime_files(j) > 0 && + sd_journal_has_persistent_files(j) == 0) { + log_info("Specifying boot ID or boot offset has no effect, no persistent journal was found."); + + if (arg_action == ACTION_SHOW && arg_compiled_pattern) + return -ENOENT; + + return 0; + } + /* add_boot() must be called first! + * It may need to seek the journal to find parent boot IDs. */ + r = add_boot(j); + if (r < 0) + return r; + + r = add_dmesg(j); + if (r < 0) + return r; + + r = add_units(j); + if (r < 0) + return log_error_errno(r, "Failed to add filter for units: %m"); + + r = add_syslog_identifier(j); + if (r < 0) + return log_error_errno(r, "Failed to add filter for syslog identifiers: %m"); + + r = add_priorities(j); + if (r < 0) + return r; + + r = add_facilities(j); + if (r < 0) + return r; + + r = add_matches(j, argv + optind); + if (r < 0) + return r; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *filter = NULL; + + filter = journal_make_match_string(j); + if (!filter) + return log_oom(); + + log_debug("Journal filter: %s", filter); + } + + if (arg_action == ACTION_LIST_FIELDS) + return action_list_fields(j); + + /* Opening the fd now means the first sd_journal_wait() will actually wait */ + if (arg_follow) { + poll_fd = sd_journal_get_fd(j); + if (poll_fd == -EMFILE) { + log_warning_errno(poll_fd, "Insufficient watch descriptors available. 
Reverting to -n."); + arg_follow = false; + } else if (poll_fd == -EMEDIUMTYPE) + return log_error_errno(poll_fd, "The --follow switch is not supported in conjunction with reading from STDIN."); + else if (poll_fd < 0) + return log_error_errno(poll_fd, "Failed to get journal fd: %m"); + } + + if (arg_cursor || arg_after_cursor || arg_cursor_file) { + cursor = arg_cursor ?: arg_after_cursor; + + if (arg_cursor_file) { + r = read_one_line_file(arg_cursor_file, &cursor_from_file); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to read cursor file %s: %m", arg_cursor_file); + + if (r > 0) { + cursor = cursor_from_file; + after_cursor = true; + } + } else + after_cursor = arg_after_cursor; + } + + if (cursor) { + r = sd_journal_seek_cursor(j, cursor); + if (r < 0) + return log_error_errno(r, "Failed to seek to cursor: %m"); + + r = sd_journal_step_one(j, !arg_reverse); + if (r < 0) + return log_error_errno(r, "Failed to iterate through journal: %m"); + + if (after_cursor && r > 0) { + /* With --after-cursor=/--cursor-file= we want to skip the first entry only if it's + * the entry the cursor is pointing at, otherwise, if some journal filters are used, + * we might skip the first entry of the filter match, which leads to unexpectedly + * missing journal entries. */ + int k; + + k = sd_journal_test_cursor(j, cursor); + if (k < 0) + return log_error_errno(k, "Failed to test cursor against current entry: %m"); + if (k > 0) + /* Current entry matches the one our cursor is pointing at, so let's try + * to advance the next entry. */ + r = sd_journal_step_one(j, !arg_reverse); + } + + if (r == 0) { + /* We couldn't find the next entry after the cursor. */ + if (arg_follow) + need_seek = true; + else + arg_lines = 0; + } + } else if (arg_until_set && (arg_reverse || arg_lines_needs_seek_end())) { + /* If both --until and any of --reverse and --lines=N is specified, things get + * a little tricky. We seek to the place of --until first. 
If only --reverse or + * --reverse and --lines is specified, we search backwards and let the output + * counter handle --lines for us. If only --lines is used, we just jump backwards + * arg_lines and search afterwards from there. */ + + r = sd_journal_seek_realtime_usec(j, arg_until); + if (r < 0) + return log_error_errno(r, "Failed to seek to date: %m"); + + if (arg_reverse) + r = sd_journal_previous(j); + else /* arg_lines_needs_seek_end */ + r = sd_journal_previous_skip(j, arg_lines); + + } else if (arg_reverse) { + r = sd_journal_seek_tail(j); + if (r < 0) + return log_error_errno(r, "Failed to seek to tail: %m"); + + r = sd_journal_previous(j); + + } else if (arg_lines_needs_seek_end()) { + r = sd_journal_seek_tail(j); + if (r < 0) + return log_error_errno(r, "Failed to seek to tail: %m"); + + r = sd_journal_previous_skip(j, arg_lines); + + } else if (arg_since_set) { + /* This is placed after arg_reverse and arg_lines. If --since is used without + * both, we seek to the place of --since and search afterwards from there. + * If used with --reverse or --lines, we seek to the tail first and check if + * the entry is within the range of --since later. 
*/ + + r = sd_journal_seek_realtime_usec(j, arg_since); + if (r < 0) + return log_error_errno(r, "Failed to seek to date: %m"); + since_seeked = true; + + r = sd_journal_next(j); + + } else { + r = sd_journal_seek_head(j); + if (r < 0) + return log_error_errno(r, "Failed to seek to head: %m"); + + r = sd_journal_next(j); + } + if (r < 0) + return log_error_errno(r, "Failed to iterate through journal: %m"); + if (r == 0) + need_seek = true; + + if (!arg_follow) + pager_open(arg_pager_flags); + + if (!arg_quiet && (arg_lines != 0 || arg_follow) && DEBUG_LOGGING) { + usec_t start, end; + char start_buf[FORMAT_TIMESTAMP_MAX], end_buf[FORMAT_TIMESTAMP_MAX]; + + r = sd_journal_get_cutoff_realtime_usec(j, &start, &end); + if (r < 0) + return log_error_errno(r, "Failed to get cutoff: %m"); + if (r > 0) { + if (arg_follow) + printf("-- Journal begins at %s. --\n", + format_timestamp_maybe_utc(start_buf, sizeof(start_buf), start)); + else + printf("-- Journal begins at %s, ends at %s. --\n", + format_timestamp_maybe_utc(start_buf, sizeof(start_buf), start), + format_timestamp_maybe_utc(end_buf, sizeof(end_buf), end)); + } + } + + Context c = { + .journal = j, + .need_seek = need_seek, + .since_seeked = since_seeked, + }; + + if (arg_follow) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + int sig; + + assert(poll_fd >= 0); + + r = setup_event(&c, poll_fd, &e); + if (r < 0) + return r; + + r = sd_event_loop(e); + if (r < 0) + return r; + sig = r; + + /* unref signal event sources. */ + e = sd_event_unref(e); + + r = update_cursor(j); + if (r < 0) + return r; + + /* re-send the original signal. 
*/ + assert(SIGNAL_VALID(sig)); + if (raise(sig) < 0) + log_error("Failed to raise the original signal SIG%s, ignoring: %m", signal_to_string(sig)); + + return 0; + } + + r = show(&c); + if (r < 0) + return r; + n_shown = r; + + if (n_shown == 0 && !arg_quiet) + printf("-- No entries --\n"); + + r = update_cursor(j); + if (r < 0) + return r; + + if (arg_compiled_pattern && n_shown == 0) + /* --grep was used, no error was thrown, but the pattern didn't + * match anything. Let's mimic grep's behavior here and return + * a non-zero exit code, so journalctl --grep can be used + * in scripts and such */ + return -ENOENT; + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/journal/journald-audit.c b/src/journal/journald-audit.c new file mode 100644 index 0000000..bddfe76 --- /dev/null +++ b/src/journal/journald-audit.c @@ -0,0 +1,556 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "audit-type.h" +#include "errno-util.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "iovec-util.h" +#include "journal-internal.h" +#include "journald-audit.h" +#include "missing_audit.h" +#include "string-util.h" + +typedef struct MapField { + const char *audit_field; + const char *journal_field; + int (*map)(const char *field, const char **p, struct iovec *iovec, size_t *n); +} MapField; + +static int map_simple_field( + const char *field, + const char **p, + struct iovec *iovec, + size_t *n) { + + _cleanup_free_ char *c = NULL; + size_t l = 0; + const char *e; + + assert(field); + assert(p); + assert(iovec); + assert(n); + + l = strlen(field); + c = malloc(l + 1); + if (!c) + return -ENOMEM; + + memcpy(c, field, l); + for (e = *p; !IN_SET(*e, 0, ' '); e++) { + if (!GREEDY_REALLOC(c, l+2)) + return -ENOMEM; + + c[l++] = *e; + } + + c[l] = 0; + + iovec[(*n)++] = IOVEC_MAKE(c, l); + + *p = e; + c = NULL; + + return 1; +} + +static int map_string_field_internal( + const char *field, + const char **p, + struct iovec 
*iovec, + size_t *n, + bool filter_printable) { + + _cleanup_free_ char *c = NULL; + const char *s, *e; + size_t l; + + assert(field); + assert(p); + assert(iovec); + assert(n); + + /* The kernel formats string fields in one of two formats. */ + + if (**p == '"') { + /* Normal quoted syntax */ + s = *p + 1; + e = strchr(s, '"'); + if (!e) + return 0; + + l = strlen(field) + (e - s); + c = malloc(l+1); + if (!c) + return -ENOMEM; + + *((char*) mempcpy(stpcpy(c, field), s, e - s)) = 0; + + e += 1; + + } else if (unhexchar(**p) >= 0) { + /* Hexadecimal escaping */ + l = strlen(field); + c = malloc(l + 2); + if (!c) + return -ENOMEM; + + memcpy(c, field, l); + for (e = *p; !IN_SET(*e, 0, ' '); e += 2) { + int a, b; + uint8_t x; + + a = unhexchar(e[0]); + if (a < 0) + return 0; + + b = unhexchar(e[1]); + if (b < 0) + return 0; + + x = ((uint8_t) a << 4 | (uint8_t) b); + + if (filter_printable && x < (uint8_t) ' ') + x = (uint8_t) ' '; + + if (!GREEDY_REALLOC(c, l+2)) + return -ENOMEM; + + c[l++] = (char) x; + } + + c[l] = 0; + } else + return 0; + + iovec[(*n)++] = IOVEC_MAKE(c, l); + + *p = e; + c = NULL; + + return 1; +} + +static int map_string_field(const char *field, const char **p, struct iovec *iovec, size_t *n) { + return map_string_field_internal(field, p, iovec, n, false); +} + +static int map_string_field_printable(const char *field, const char **p, struct iovec *iovec, size_t *n) { + return map_string_field_internal(field, p, iovec, n, true); +} + +static int map_generic_field( + const char *prefix, + const char **p, + struct iovec *iovec, + size_t *n) { + + const char *e, *f; + char *c, *t; + int r; + + /* Implements fallback mappings for all fields we don't know */ + + for (e = *p; e < *p + 16; e++) { + + if (IN_SET(*e, 0, ' ')) + return 0; + + if (*e == '=') + break; + + if (!(ascii_isalpha(*e) || + ascii_isdigit(*e) || + IN_SET(*e, '_', '-'))) + return 0; + } + + if (e <= *p || e >= *p + 16) + return 0; + + c = newa(char, strlen(prefix) + (e - *p) + 2); 
+ + t = stpcpy(c, prefix); + for (f = *p; f < e; f++) { + char x; + + if (*f >= 'a' && *f <= 'z') + x = (*f - 'a') + 'A'; /* uppercase */ + else if (*f == '-') + x = '_'; /* dashes → underscores */ + else + x = *f; + + *(t++) = x; + } + strcpy(t, "="); + + e++; + + r = map_simple_field(c, &e, iovec, n); + if (r < 0) + return r; + + *p = e; + return r; +} + +/* Kernel fields are those occurring in the audit string before + * msg='. All of these fields are trusted, hence carry the "_" prefix. + * We try to translate the fields we know into our native names. The + * others are generically mapped to _AUDIT_FIELD_XYZ= */ +static const MapField map_fields_kernel[] = { + + /* First, we map certain well-known audit fields into native + * well-known fields */ + { "pid=", "_PID=", map_simple_field }, + { "ppid=", "_PPID=", map_simple_field }, + { "uid=", "_UID=", map_simple_field }, + { "euid=", "_EUID=", map_simple_field }, + { "fsuid=", "_FSUID=", map_simple_field }, + { "gid=", "_GID=", map_simple_field }, + { "egid=", "_EGID=", map_simple_field }, + { "fsgid=", "_FSGID=", map_simple_field }, + { "tty=", "_TTY=", map_simple_field }, + { "ses=", "_AUDIT_SESSION=", map_simple_field }, + { "auid=", "_AUDIT_LOGINUID=", map_simple_field }, + { "subj=", "_SELINUX_CONTEXT=", map_simple_field }, + { "comm=", "_COMM=", map_string_field }, + { "exe=", "_EXE=", map_string_field }, + { "proctitle=", "_CMDLINE=", map_string_field_printable }, + + /* Some fields don't map to native well-known fields. However, + * we know that they are string fields, hence let's undo + * string field escaping for them, though we stick to the + * generic field names. */ + { "path=", "_AUDIT_FIELD_PATH=", map_string_field }, + { "dev=", "_AUDIT_FIELD_DEV=", map_string_field }, + { "name=", "_AUDIT_FIELD_NAME=", map_string_field }, + {} +}; + +/* Userspace fields are those occurring in the audit string after + * msg='. All of these fields are untrusted, hence carry no "_" + * prefix.
We map the fields we don't know to AUDIT_FIELD_XYZ= */ +static const MapField map_fields_userspace[] = { + { "cwd=", "AUDIT_FIELD_CWD=", map_string_field }, + { "cmd=", "AUDIT_FIELD_CMD=", map_string_field }, + { "acct=", "AUDIT_FIELD_ACCT=", map_string_field }, + { "exe=", "AUDIT_FIELD_EXE=", map_string_field }, + { "comm=", "AUDIT_FIELD_COMM=", map_string_field }, + {} +}; + +static int map_all_fields( + const char *p, + const MapField map_fields[], + const char *prefix, + bool handle_msg, + struct iovec *iovec, + size_t *n, + size_t m) { + + int r; + + assert(p); + assert(iovec); + assert(n); + + for (;;) { + bool mapped = false; + const MapField *mf; + const char *v; + + if (*n >= m) { + log_debug( + "More fields in audit message than audit field limit (%i), skipping remaining fields", + N_IOVEC_AUDIT_FIELDS); + return 0; + } + + p += strspn(p, WHITESPACE); + + if (*p == 0) + return 0; + + if (handle_msg) { + v = startswith(p, "msg='"); + if (v) { + _cleanup_free_ char *c = NULL; + const char *e; + + /* Userspace message. It's enclosed in + simple quotation marks, is not + escaped, but the last field in the + line, hence let's remove the + quotation mark, and apply the + userspace mapping instead of the + kernel mapping. 
*/ + + e = endswith(v, "'"); + if (!e) + return 0; /* don't continue splitting up if the final quotation mark is missing */ + + c = strndup(v, e - v); + if (!c) + return -ENOMEM; + + return map_all_fields(c, map_fields_userspace, "AUDIT_FIELD_", false, iovec, n, m); + } + } + + /* Try to map the kernel fields to our own names */ + for (mf = map_fields; mf->audit_field; mf++) { + v = startswith(p, mf->audit_field); + if (!v) + continue; + + r = mf->map(mf->journal_field, &v, iovec, n); + if (r < 0) + return log_debug_errno(r, "Failed to parse audit array: %m"); + + if (r > 0) { + mapped = true; + p = v; + break; + } + } + + if (!mapped) { + r = map_generic_field(prefix, &p, iovec, n); + if (r < 0) + return log_debug_errno(r, "Failed to parse audit array: %m"); + + if (r == 0) + /* Couldn't process as generic field, let's just skip over it */ + p += strcspn(p, WHITESPACE); + } + } +} + +void process_audit_string(Server *s, int type, const char *data, size_t size) { + size_t n = 0, z; + uint64_t seconds, msec, id; + const char *p, *type_name; + char id_field[sizeof("_AUDIT_ID=") + DECIMAL_STR_MAX(uint64_t)], + type_field[sizeof("_AUDIT_TYPE=") + DECIMAL_STR_MAX(int)], + source_time_field[sizeof("_SOURCE_REALTIME_TIMESTAMP=") + DECIMAL_STR_MAX(usec_t)]; + struct iovec iovec[N_IOVEC_META_FIELDS + 8 + N_IOVEC_AUDIT_FIELDS]; + char *m, *type_field_name; + int k; + + assert(s); + + if (size <= 0) + return; + + if (!data) + return; + + /* Note that the input buffer is NUL terminated, but let's + * check whether there is a spurious NUL byte */ + if (memchr(data, 0, size)) + return; + + p = startswith(data, "audit"); + if (!p) + return; + + k = 0; + if (sscanf(p, "(%" PRIu64 ".%" PRIu64 ":%" PRIu64 "):%n", + &seconds, + &msec, + &id, + &k) != 3 || k == 0) + return; + + p += k; + p += strspn(p, WHITESPACE); + + if (isempty(p)) + return; + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=audit"); + + sprintf(source_time_field, "_SOURCE_REALTIME_TIMESTAMP=%" PRIu64, + (usec_t) 
seconds * USEC_PER_SEC + (usec_t) msec * USEC_PER_MSEC); + iovec[n++] = IOVEC_MAKE_STRING(source_time_field); + + sprintf(type_field, "_AUDIT_TYPE=%i", type); + iovec[n++] = IOVEC_MAKE_STRING(type_field); + + sprintf(id_field, "_AUDIT_ID=%" PRIu64, id); + iovec[n++] = IOVEC_MAKE_STRING(id_field); + + assert_cc(4 == LOG_FAC(LOG_AUTH)); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=4"); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=audit"); + + type_name = audit_type_name_alloca(type); + + type_field_name = strjoina("_AUDIT_TYPE_NAME=", type_name); + iovec[n++] = IOVEC_MAKE_STRING(type_field_name); + + m = strjoina("MESSAGE=", type_name, " ", p); + iovec[n++] = IOVEC_MAKE_STRING(m); + + z = n; + + map_all_fields(p, map_fields_kernel, "_AUDIT_FIELD_", true, iovec, &n, n + N_IOVEC_AUDIT_FIELDS); + + server_dispatch_message(s, iovec, n, ELEMENTSOF(iovec), NULL, NULL, LOG_NOTICE, 0); + + /* free() all entries that map_all_fields() added. All others + * are allocated on the stack or are constant. 
*/ + + for (; z < n; z++) + free(iovec[z].iov_base); +} + +void server_process_audit_message( + Server *s, + const void *buffer, + size_t buffer_size, + const struct ucred *ucred, + const union sockaddr_union *sa, + socklen_t salen) { + + const struct nlmsghdr *nl = buffer; + + assert(s); + + if (buffer_size < ALIGN(sizeof(struct nlmsghdr))) + return; + + assert(buffer); + + /* Filter out fake data */ + if (!sa || + salen != sizeof(struct sockaddr_nl) || + sa->nl.nl_family != AF_NETLINK || + sa->nl.nl_pid != 0) { + log_debug("Audit netlink message from invalid sender."); + return; + } + + if (!ucred || ucred->pid != 0) { + log_debug("Audit netlink message with invalid credentials."); + return; + } + + if (!NLMSG_OK(nl, buffer_size)) { + log_ratelimit_error(JOURNAL_LOG_RATELIMIT, "Audit netlink message truncated."); + return; + } + + /* Ignore special Netlink messages */ + if (IN_SET(nl->nlmsg_type, NLMSG_NOOP, NLMSG_ERROR)) + return; + + /* Except AUDIT_USER, all messages below AUDIT_FIRST_USER_MSG are control messages, let's ignore those */ + if (nl->nlmsg_type < AUDIT_FIRST_USER_MSG && nl->nlmsg_type != AUDIT_USER) + return; + + process_audit_string(s, nl->nlmsg_type, NLMSG_DATA(nl), nl->nlmsg_len - ALIGN(sizeof(struct nlmsghdr))); +} + +static int enable_audit(int fd, bool b) { + struct { + union { + struct nlmsghdr header; + uint8_t header_space[NLMSG_HDRLEN]; + }; + struct audit_status body; + } _packed_ request = { + .header.nlmsg_len = NLMSG_LENGTH(sizeof(struct audit_status)), + .header.nlmsg_type = AUDIT_SET, + .header.nlmsg_flags = NLM_F_REQUEST, + .header.nlmsg_seq = 1, + .header.nlmsg_pid = 0, + .body.mask = AUDIT_STATUS_ENABLED, + .body.enabled = b, + }; + union sockaddr_union sa = { + .nl.nl_family = AF_NETLINK, + .nl.nl_pid = 0, + }; + struct iovec iovec = { + .iov_base = &request, + .iov_len = NLMSG_LENGTH(sizeof(struct audit_status)), + }; + struct msghdr mh = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_name = &sa.sa, + .msg_namelen = 
sizeof(sa.nl), + }; + + ssize_t n; + + n = sendmsg(fd, &mh, MSG_NOSIGNAL); + if (n < 0) + return -errno; + if (n != NLMSG_LENGTH(sizeof(struct audit_status))) + return -EIO; + + /* We don't wait for the result here, we can't do anything + * about it anyway */ + + return 0; +} + +int server_open_audit(Server *s) { + int r; + + if (s->audit_fd < 0) { + static const union sockaddr_union sa = { + .nl.nl_family = AF_NETLINK, + .nl.nl_pid = 0, + .nl.nl_groups = AUDIT_NLGRP_READLOG, + }; + + s->audit_fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_AUDIT); + if (s->audit_fd < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + log_debug("Audit not supported in the kernel."); + else + log_warning_errno(errno, "Failed to create audit socket, ignoring: %m"); + + return 0; + } + + if (bind(s->audit_fd, &sa.sa, sizeof(sa.nl)) < 0) { + log_warning_errno(errno, + "Failed to join audit multicast group. " + "The kernel is probably too old or multicast reading is not supported. " + "Ignoring: %m"); + s->audit_fd = safe_close(s->audit_fd); + return 0; + } + } else + (void) fd_nonblock(s->audit_fd, true); + + r = setsockopt_int(s->audit_fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "Failed to set SO_PASSCRED on audit socket: %m"); + + r = sd_event_add_io(s->event, &s->audit_event_source, s->audit_fd, EPOLLIN, server_process_datagram, s); + if (r < 0) + return log_error_errno(r, "Failed to add audit fd to event loop: %m"); + + if (s->set_audit >= 0) { + /* We are listening now, try to enable audit if configured so */ + r = enable_audit(s->audit_fd, s->set_audit); + if (r < 0) + log_warning_errno(r, "Failed to issue audit enable call: %m"); + else if (s->set_audit > 0) + log_debug("Auditing in kernel turned on."); + else + log_debug("Auditing in kernel turned off."); + } + + return 0; +} diff --git a/src/journal/journald-audit.h b/src/journal/journald-audit.h new file mode 100644 index 0000000..79f3da9 --- /dev/null +++ 
b/src/journal/journald-audit.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" +#include "socket-util.h" + +void server_process_audit_message(Server *s, const void *buffer, size_t buffer_size, const struct ucred *ucred, const union sockaddr_union *sa, socklen_t salen); + +void process_audit_string(Server *s, int type, const char *data, size_t size); + +int server_open_audit(Server *s); diff --git a/src/journal/journald-client.c b/src/journal/journald-client.c new file mode 100644 index 0000000..25970c1 --- /dev/null +++ b/src/journal/journald-client.c @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "cgroup-util.h" +#include "errno-util.h" +#include "journald-client.h" +#include "nulstr-util.h" +#include "pcre2-util.h" + +/* This consumes both `allow_list` and `deny_list` arguments. Hence, those arguments are not owned by the + * caller anymore and should not be freed. */ +static void client_set_filtering_patterns(ClientContext *c, Set *allow_list, Set *deny_list) { + assert(c); + + set_free_and_replace(c->log_filter_allowed_patterns, allow_list); + set_free_and_replace(c->log_filter_denied_patterns, deny_list); +} + +static int client_parse_log_filter_nulstr(const char *nulstr, size_t len, Set **ret) { + _cleanup_set_free_ Set *s = NULL; + _cleanup_strv_free_ char **patterns_strv = NULL; + int r; + + assert(nulstr); + assert(ret); + + patterns_strv = strv_parse_nulstr(nulstr, len); + if (!patterns_strv) + return log_oom_debug(); + + STRV_FOREACH(pattern, patterns_strv) { + _cleanup_(pattern_freep) pcre2_code *compiled_pattern = NULL; + + r = pattern_compile_and_log(*pattern, 0, &compiled_pattern); + if (r < 0) + return r; + + r = set_ensure_consume(&s, &pcre2_code_hash_ops_free, TAKE_PTR(compiled_pattern)); + if (r < 0) + return log_debug_errno(r, "Failed to insert regex into set: %m"); + } + + *ret = TAKE_PTR(s); + + return 0; +} + +int 
client_context_read_log_filter_patterns(ClientContext *c, const char *cgroup) { + char *deny_list_xattr, *xattr_end; + _cleanup_free_ char *xattr = NULL, *unit_cgroup = NULL; + _cleanup_set_free_ Set *allow_list = NULL, *deny_list = NULL; + int r; + + assert(c); + + r = cg_path_get_unit_path(cgroup, &unit_cgroup); + if (r < 0) + return log_debug_errno(r, "Failed to get the unit's cgroup path for %s: %m", cgroup); + + r = cg_get_xattr_malloc(unit_cgroup, "user.journald_log_filter_patterns", &xattr); + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) { + client_set_filtering_patterns(c, NULL, NULL); + return 0; + } else if (r < 0) + return log_debug_errno(r, "Failed to get user.journald_log_filter_patterns xattr for %s: %m", unit_cgroup); + + xattr_end = xattr + r; + + /* We expect '0xff' to be present in the attribute, even if one of the lists is empty. We expect the + * following: + * - Allow list, but no deny list: 0xXX, ...., 0xff + * - No allow list, but deny list: 0xff, 0xXX, .... + * - Allow list, and deny list: 0xXX, ...., 0xff, 0xXX, .... + * This is because the allowed and denied pattern lists are two nulstrs joined together with '0xff'. + * Neither the allowed nor the denied nulstr has a terminating NUL character. + * + * We do not expect both the allow list and deny list to be empty, as this condition is tested + * before writing to xattr. */ + deny_list_xattr = memchr(xattr, (char)0xff, r); + if (!deny_list_xattr) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Missing delimiter in cgroup user.journald_log_filter_patterns attribute: %m"); + + r = client_parse_log_filter_nulstr(xattr, deny_list_xattr - xattr, &allow_list); + if (r < 0) + return r; + + /* Use 'deny_list_xattr + 1' to skip '0xff'.
*/ + ++deny_list_xattr; + r = client_parse_log_filter_nulstr(deny_list_xattr, xattr_end - deny_list_xattr, &deny_list); + if (r < 0) + return r; + + client_set_filtering_patterns(c, TAKE_PTR(allow_list), TAKE_PTR(deny_list)); + + return 0; +} + +int client_context_check_keep_log(ClientContext *c, const char *message, size_t len) { + pcre2_code *regex; + + if (!c || !message) + return true; + + SET_FOREACH(regex, c->log_filter_denied_patterns) + if (pattern_matches_and_log(regex, message, len, NULL) > 0) + return false; + + SET_FOREACH(regex, c->log_filter_allowed_patterns) + if (pattern_matches_and_log(regex, message, len, NULL) > 0) + return true; + + return set_isempty(c->log_filter_allowed_patterns); +} diff --git a/src/journal/journald-client.h b/src/journal/journald-client.h new file mode 100644 index 0000000..629cd41 --- /dev/null +++ b/src/journal/journald-client.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-context.h" + +int client_context_read_log_filter_patterns(ClientContext *c, const char *cgroup); +int client_context_check_keep_log(ClientContext *c, const char *message, size_t len); diff --git a/src/journal/journald-console.c b/src/journal/journald-console.c new file mode 100644 index 0000000..dfa4096 --- /dev/null +++ b/src/journal/journald-console.c @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "iovec-util.h" +#include "journald-console.h" +#include "journald-server.h" +#include "parse-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "terminal-util.h" + +static bool prefix_timestamp(void) { + + static int cached_printk_time = -1; + + if (_unlikely_(cached_printk_time < 0)) { + _cleanup_free_ char *p = NULL; + + cached_printk_time = + read_one_line_file("/sys/module/printk/parameters/time", &p) >= 0 + && 
parse_boolean(p) > 0; + } + + return cached_printk_time; +} + +void server_forward_console( + Server *s, + int priority, + const char *identifier, + const char *message, + const struct ucred *ucred) { + + struct iovec iovec[7]; /* at most: 1 timestamp + 2 identifier/PID + 4 message parts */ + struct timespec ts; + char tbuf[STRLEN("[] ") + DECIMAL_STR_MAX(ts.tv_sec) + DECIMAL_STR_MAX(ts.tv_nsec)-3 + 1]; + char header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t)]; + _cleanup_free_ char *ident_buf = NULL; + _cleanup_close_ int fd = -EBADF; + const char *tty, *color_on = "", *color_off = ""; + int n = 0; + + assert(s); + assert(message); + + if (LOG_PRI(priority) > s->max_level_console) + return; + + /* First: timestamp */ + if (prefix_timestamp()) { + assert_se(clock_gettime(CLOCK_MONOTONIC, &ts) == 0); + xsprintf(tbuf, "[%5"PRI_TIME".%06"PRI_NSEC"] ", + ts.tv_sec, + (nsec_t)ts.tv_nsec / 1000); + + iovec[n++] = IOVEC_MAKE_STRING(tbuf); + } + + /* Second: identifier and PID */ + if (ucred) { + if (!identifier) { + (void) pid_get_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); + + if (identifier) + iovec[n++] = IOVEC_MAKE_STRING(identifier); + + iovec[n++] = IOVEC_MAKE_STRING(header_pid); + } else if (identifier) { + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + + get_log_colors(LOG_PRI(priority), &color_on, &color_off, NULL); + + /* Third: message, wrapped in the color sequences looked up above */ + iovec[n++] = IOVEC_MAKE_STRING(color_on); + iovec[n++] = IOVEC_MAKE_STRING(message); + iovec[n++] = IOVEC_MAKE_STRING(color_off); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + tty = s->tty_path ?: "/dev/console"; + + /* Before you ask: yes, on purpose we open/close the console for each log line we write individually. This is a + * good strategy to avoid journald getting killed by the kernel's SAK concept (it doesn't fix this entirely, + * but minimizes the time window the kernel might end up killing journald due to SAK).
It also makes things + * easier for us so that we don't have to recover from hangups and suchlike triggered on the console. */ + + fd = open_terminal(tty, O_WRONLY|O_NOCTTY|O_CLOEXEC); + if (fd < 0) { + log_debug_errno(fd, "Failed to open %s for logging: %m", tty); + return; + } + + if (writev(fd, iovec, n) < 0) + log_debug_errno(errno, "Failed to write to %s for logging: %m", tty); +} diff --git a/src/journal/journald-console.h b/src/journal/journald-console.h new file mode 100644 index 0000000..0a26f9c --- /dev/null +++ b/src/journal/journald-console.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +void server_forward_console(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred); diff --git a/src/journal/journald-context.c b/src/journal/journald-context.c new file mode 100644 index 0000000..f5f6ec5 --- /dev/null +++ b/src/journal/journald-context.c @@ -0,0 +1,799 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_SELINUX +#include +#endif + +#include "alloc-util.h" +#include "audit-util.h" +#include "cgroup-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "journald-client.h" +#include "journald-context.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "string-util.h" +#include "syslog-util.h" +#include "unaligned.h" +#include "user-util.h" + +/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc + * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we + * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the + * log entry was originally created. 
We hence just increase the "window of inaccuracy" a bit. + * + * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed + * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s + * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are + * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not + * flushed out). Data newer than 1s is used immediately without refresh. + * + * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry + * as long as their socket is connected. Note that cache entries are shared between different transports. That means a + * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols. + * + * Caching metadata like this has two major benefits: + * + * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood. + * + * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache + * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating + * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a + * stream connection. This should improve cases where a service process logs immediately before exiting and we + * previously had trouble associating the log message with the service. + * + * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of + * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older + * and sometimes slightly newer than what was current at the log event).
+ */ + +/* We refresh every 1s */ +#define REFRESH_USEC (1*USEC_PER_SEC) + +/* Data older than 5s we flush out */ +#define MAX_USEC (5*USEC_PER_SEC) + +/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in + * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream + * clients itself is limited.) */ +#define CACHE_MAX_FALLBACK 128U +#define CACHE_MAX_MAX (16*1024U) +#define CACHE_MAX_MIN 64U + +static size_t cache_max(void) { + static size_t cached = -1; + + if (cached == SIZE_MAX) { + uint64_t mem_total; + int r; + + r = procfs_memory_get(&mem_total, NULL); + if (r < 0) { + log_warning_errno(r, "Cannot query /proc/meminfo for MemTotal: %m"); + cached = CACHE_MAX_FALLBACK; + } else + /* Cache entries are usually a few kB, but the process cmdline is controlled by the + * user and can be up to _SC_ARG_MAX, usually 2MB. Let's say that approximately up to + * 1/8th of memory may be used by the cache. + * + * In the common case, this formula gives 64 cache entries for each GB of RAM. 
+ */ + cached = CLAMP(mem_total / 8 / sc_arg_max(), CACHE_MAX_MIN, CACHE_MAX_MAX); + } + + return cached; +} + +static int client_context_compare(const void *a, const void *b) { + const ClientContext *x = a, *y = b; + int r; + + r = CMP(x->timestamp, y->timestamp); + if (r != 0) + return r; + + return CMP(x->pid, y->pid); +} + +static int client_context_new(Server *s, pid_t pid, ClientContext **ret) { + _cleanup_free_ ClientContext *c = NULL; + int r; + + assert(s); + assert(pid_is_valid(pid)); + assert(ret); + + r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare); + if (r < 0) + return r; + + c = new(ClientContext, 1); + if (!c) + return -ENOMEM; + + *c = (ClientContext) { + .pid = pid, + .uid = UID_INVALID, + .gid = GID_INVALID, + .auditid = AUDIT_SESSION_INVALID, + .loginuid = UID_INVALID, + .owner_uid = UID_INVALID, + .lru_index = PRIOQ_IDX_NULL, + .timestamp = USEC_INFINITY, + .extra_fields_mtime = NSEC_INFINITY, + .log_level_max = -1, + .log_ratelimit_interval = s->ratelimit_interval, + .log_ratelimit_burst = s->ratelimit_burst, + }; + + r = hashmap_ensure_put(&s->client_contexts, NULL, PID_TO_PTR(pid), c); + if (r < 0) + return r; + + *ret = TAKE_PTR(c); + return 0; +} + +static void client_context_reset(Server *s, ClientContext *c) { + assert(s); + assert(c); + + c->timestamp = USEC_INFINITY; + + c->uid = UID_INVALID; + c->gid = GID_INVALID; + + c->comm = mfree(c->comm); + c->exe = mfree(c->exe); + c->cmdline = mfree(c->cmdline); + c->capeff = mfree(c->capeff); + + c->auditid = AUDIT_SESSION_INVALID; + c->loginuid = UID_INVALID; + + c->cgroup = mfree(c->cgroup); + c->session = mfree(c->session); + c->owner_uid = UID_INVALID; + c->unit = mfree(c->unit); + c->user_unit = mfree(c->user_unit); + c->slice = mfree(c->slice); + c->user_slice = mfree(c->user_slice); + + c->invocation_id = SD_ID128_NULL; + + c->label = mfree(c->label); + c->label_size = 0; + + c->extra_fields_iovec = mfree(c->extra_fields_iovec); + c->extra_fields_n_iovec = 
0; + c->extra_fields_data = mfree(c->extra_fields_data); + c->extra_fields_mtime = NSEC_INFINITY; + + c->log_level_max = -1; + + c->log_ratelimit_interval = s->ratelimit_interval; + c->log_ratelimit_burst = s->ratelimit_burst; + + c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns); + c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns); +} + +static ClientContext* client_context_free(Server *s, ClientContext *c) { + assert(s); + + if (!c) + return NULL; + + assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c); + + if (c->in_lru) + assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0); + + client_context_reset(s, c); + + return mfree(c); +} + +static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) { + assert(c); + assert(pid_is_valid(c->pid)); + + /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */ + if (ucred && uid_is_valid(ucred->uid)) + c->uid = ucred->uid; + else + (void) pid_get_uid(c->pid, &c->uid); + + if (ucred && gid_is_valid(ucred->gid)) + c->gid = ucred->gid; + else + (void) get_process_gid(c->pid, &c->gid); +} + +static void client_context_read_basic(ClientContext *c) { + char *t; + + assert(c); + assert(pid_is_valid(c->pid)); + + if (pid_get_comm(c->pid, &t) >= 0) + free_and_replace(c->comm, t); + + if (get_process_exe(c->pid, &t) >= 0) + free_and_replace(c->exe, t); + + if (pid_get_cmdline(c->pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE, &t) >= 0) + free_and_replace(c->cmdline, t); + + if (get_process_capeff(c->pid, &t) >= 0) + free_and_replace(c->capeff, t); +} + +static int client_context_read_label( + ClientContext *c, + const char *label, size_t label_size) { + + assert(c); + assert(pid_is_valid(c->pid)); + assert(label_size == 0 || label); + + if (label_size > 0) { + char *l; + + /* If we got an SELinux label passed in it counts. 
*/ + + l = newdup_suffix0(char, label, label_size); + if (!l) + return -ENOMEM; + + free_and_replace(c->label, l); + c->label_size = label_size; + } +#if HAVE_SELINUX + else { + char *con; + + /* If we got no SELinux label passed in, let's try to acquire one */ + + if (getpidcon(c->pid, &con) >= 0 && con) { + free_and_replace(c->label, con); + c->label_size = strlen(c->label); + } + } +#endif + + return 0; +} + +static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) { + _cleanup_free_ char *t = NULL; + int r; + + assert(c); + + /* Try to acquire the current cgroup path */ + r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t); + if (r < 0 || empty_or_root(t)) { + /* We use the unit ID passed in as fallback if we have nothing cached yet and cg_pid_get_path_shifted() + * failed or process is running in a root cgroup. Zombie processes are automatically migrated to root cgroup + * on cgroup v1 and we want to be able to map log messages from them too. */ + if (unit_id && !c->unit) { + c->unit = strdup(unit_id); + if (c->unit) + return 0; + } + + return r; + } + + (void) client_context_read_log_filter_patterns(c, t); + + /* Let's shortcut this if the cgroup path didn't change */ + if (streq_ptr(c->cgroup, t)) + return 0; + + free_and_replace(c->cgroup, t); + + (void) cg_path_get_session(c->cgroup, &t); + free_and_replace(c->session, t); + + if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0) + c->owner_uid = UID_INVALID; + + (void) cg_path_get_unit(c->cgroup, &t); + free_and_replace(c->unit, t); + + (void) cg_path_get_user_unit(c->cgroup, &t); + free_and_replace(c->user_unit, t); + + (void) cg_path_get_slice(c->cgroup, &t); + free_and_replace(c->slice, t); + + (void) cg_path_get_user_slice(c->cgroup, &t); + free_and_replace(c->user_slice, t); + + return 0; +} + +static int client_context_read_invocation_id( + Server *s, + ClientContext *c) { + + _cleanup_free_ char *p = NULL, *value = NULL; + int r; + + assert(s); + assert(c); + + 
/* Read the invocation ID of a unit off a unit. + * PID 1 stores it in a per-unit symlink in /run/systemd/units/ + * User managers store it in a per-unit symlink under /run/user//systemd/units/ */ + + if (!c->unit) + return 0; + + if (c->user_unit) { + r = asprintf(&p, "/run/user/" UID_FMT "/systemd/units/invocation:%s", c->owner_uid, c->user_unit); + if (r < 0) + return r; + } else { + p = strjoin("/run/systemd/units/invocation:", c->unit); + if (!p) + return -ENOMEM; + } + + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + return sd_id128_from_string(value, &c->invocation_id); +} + +static int client_context_read_log_level_max( + Server *s, + ClientContext *c) { + + _cleanup_free_ char *value = NULL; + const char *p; + int r, ll; + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-level-max:", c->unit); + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + ll = log_level_from_string(value); + if (ll < 0) + return ll; + + c->log_level_max = ll; + return 0; +} + +static int client_context_read_extra_fields( + Server *s, + ClientContext *c) { + + _cleanup_free_ struct iovec *iovec = NULL; + size_t size = 0, n_iovec = 0, left; + _cleanup_free_ void *data = NULL; + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + const char *p; + uint8_t *q; + int r; + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-extra-fields:", c->unit); + + if (c->extra_fields_mtime != NSEC_INFINITY) { + if (stat(p, &st) < 0) { + if (errno == ENOENT) + return 0; + + return -errno; + } + + if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime) + return 0; + } + + f = fopen(p, "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return -errno; + } + + if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new + * one, that matches the stuff we are reading */ + return -errno; + + r = read_full_stream(f, (char**) &data, &size); + if (r < 0) + return r; + + q = data, left = 
size; + while (left > 0) { + uint8_t *field, *eq; + uint64_t v, n; + + if (left < sizeof(uint64_t)) + return -EBADMSG; + + v = unaligned_read_le64(q); + if (v < 2) + return -EBADMSG; + + n = sizeof(uint64_t) + v; + if (left < n) + return -EBADMSG; + + field = q + sizeof(uint64_t); + + eq = memchr(field, '=', v); + if (!eq) + return -EBADMSG; + + if (!journal_field_valid((const char *) field, eq - field, false)) + return -EBADMSG; + + if (!GREEDY_REALLOC(iovec, n_iovec+1)) + return -ENOMEM; + + iovec[n_iovec++] = IOVEC_MAKE(field, v); + + left -= n, q += n; + } + + free(c->extra_fields_iovec); + free(c->extra_fields_data); + + c->extra_fields_iovec = TAKE_PTR(iovec); + c->extra_fields_n_iovec = n_iovec; + c->extra_fields_data = TAKE_PTR(data); + c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim); + + return 0; +} + +static int client_context_read_log_ratelimit_interval(ClientContext *c) { + _cleanup_free_ char *value = NULL; + const char *p; + int r; + + assert(c); + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-interval:", c->unit); + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + return safe_atou64(value, &c->log_ratelimit_interval); +} + +static int client_context_read_log_ratelimit_burst(ClientContext *c) { + _cleanup_free_ char *value = NULL; + const char *p; + int r; + + assert(c); + + if (!c->unit) + return 0; + + p = strjoina("/run/systemd/units/log-rate-limit-burst:", c->unit); + r = readlink_malloc(p, &value); + if (r < 0) + return r; + + return safe_atou(value, &c->log_ratelimit_burst); +} + +static void client_context_really_refresh( + Server *s, + ClientContext *c, + const struct ucred *ucred, + const char *label, size_t label_size, + const char *unit_id, + usec_t timestamp) { + + assert(s); + assert(c); + assert(pid_is_valid(c->pid)); + + if (timestamp == USEC_INFINITY) + timestamp = now(CLOCK_MONOTONIC); + + client_context_read_uid_gid(c, ucred); + client_context_read_basic(c); + (void) 
client_context_read_label(c, label, label_size); + + (void) audit_session_from_pid(c->pid, &c->auditid); + (void) audit_loginuid_from_pid(c->pid, &c->loginuid); + + (void) client_context_read_cgroup(s, c, unit_id); + (void) client_context_read_invocation_id(s, c); + (void) client_context_read_log_level_max(s, c); + (void) client_context_read_extra_fields(s, c); + (void) client_context_read_log_ratelimit_interval(c); + (void) client_context_read_log_ratelimit_burst(c); + + c->timestamp = timestamp; + + if (c->in_lru) { + assert(c->n_ref == 0); + prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index); + } +} + +void client_context_maybe_refresh( + Server *s, + ClientContext *c, + const struct ucred *ucred, + const char *label, size_t label_size, + const char *unit_id, + usec_t timestamp) { + + assert(s); + assert(c); + + if (timestamp == USEC_INFINITY) + timestamp = now(CLOCK_MONOTONIC); + + /* No cached data so far? Let's fill it up */ + if (c->timestamp == USEC_INFINITY) + goto refresh; + + /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out + * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. 
*/ + if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) { + client_context_reset(s, c); + goto refresh; + } + + /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */ + if (c->timestamp + REFRESH_USEC < timestamp) + goto refresh; + + /* If the data passed along doesn't match the cached data we also do a refresh */ + if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid) + goto refresh; + + if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid) + goto refresh; + + if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0)) + goto refresh; + + return; + +refresh: + client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp); +} + +static void client_context_try_shrink_to(Server *s, size_t limit) { + ClientContext *c; + usec_t t; + + assert(s); + + /* Flush any cache entries for PIDs that have already moved on. Don't do this + * too often, since it's a slow process. */ + t = now(CLOCK_MONOTONIC); + if (s->last_cache_pid_flush + MAX_USEC < t) { + unsigned n = prioq_size(s->client_contexts_lru), idx = 0; + + /* We do a number of iterations based on the initial size of the prioq. When we remove an + * item, a new item is moved into its places, and items to the right might be reshuffled. + */ + for (unsigned i = 0; i < n; i++) { + c = prioq_peek_by_index(s->client_contexts_lru, idx); + + assert(c->n_ref == 0); + + if (pid_is_unwaited(c->pid) == 0) + client_context_free(s, c); + else + idx ++; + } + + s->last_cache_pid_flush = t; + } + + /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without + * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of + * cache entries may very well grow beyond the limit, if all entries stored remain pinned. 
*/ + + while (hashmap_size(s->client_contexts) > limit) { + c = prioq_pop(s->client_contexts_lru); + if (!c) + break; /* All remaining entries are pinned, give up */ + + assert(c->in_lru); + assert(c->n_ref == 0); + + c->in_lru = false; + + client_context_free(s, c); + } +} + +void client_context_flush_regular(Server *s) { + client_context_try_shrink_to(s, 0); +} + +void client_context_flush_all(Server *s) { + assert(s); + + /* Flush out all remaining entries. This assumes all references are already dropped. */ + + s->my_context = client_context_release(s, s->my_context); + s->pid1_context = client_context_release(s, s->pid1_context); + + client_context_flush_regular(s); + + assert(prioq_size(s->client_contexts_lru) == 0); + assert(hashmap_size(s->client_contexts) == 0); + + s->client_contexts_lru = prioq_free(s->client_contexts_lru); + s->client_contexts = hashmap_free(s->client_contexts); +} + +static int client_context_get_internal( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + bool add_ref, + ClientContext **ret) { + + ClientContext *c; + int r; + + assert(s); + assert(ret); + + if (!pid_is_valid(pid)) + return -EINVAL; + + c = hashmap_get(s->client_contexts, PID_TO_PTR(pid)); + if (c) { + + if (add_ref) { + if (c->in_lru) { + /* The entry wasn't pinned so far, let's remove it from the LRU list then */ + assert(c->n_ref == 0); + assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0); + c->in_lru = false; + } + + c->n_ref++; + } + + client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY); + + *ret = c; + return 0; + } + + client_context_try_shrink_to(s, cache_max()-1); + + r = client_context_new(s, pid, &c); + if (r < 0) + return r; + + if (add_ref) + c->n_ref++; + else { + r = prioq_put(s->client_contexts_lru, c, &c->lru_index); + if (r < 0) { + client_context_free(s, c); + return r; + } + + c->in_lru = true; + } + + 
client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY); + + *ret = c; + return 0; +} + +int client_context_get( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret) { + + return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret); +} + +int client_context_acquire( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret) { + + return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret); +}; + +ClientContext *client_context_release(Server *s, ClientContext *c) { + assert(s); + + if (!c) + return NULL; + + assert(c->n_ref > 0); + assert(!c->in_lru); + + c->n_ref--; + if (c->n_ref > 0) + return NULL; + + /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it + * right-away */ + + if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0) + client_context_free(s, c); + else + c->in_lru = true; + + return NULL; +} + +void client_context_acquire_default(Server *s) { + int r; + + assert(s); + + /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to + * generate driver messages. */ + + if (!s->my_context) { + struct ucred ucred = { + .pid = getpid_cached(), + .uid = getuid(), + .gid = getgid(), + }; + + r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to acquire our own context, ignoring: %m"); + } + + if (!s->namespace && !s->pid1_context) { + /* Acquire PID1's context, but only if we are in non-namespaced mode, since PID 1 is only + * going to log to the non-namespaced journal instance. 
*/ + + r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to acquire PID1's context, ignoring: %m"); + + } +} diff --git a/src/journal/journald-context.h b/src/journal/journald-context.h new file mode 100644 index 0000000..6e0d9f6 --- /dev/null +++ b/src/journal/journald-context.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-id128.h" + +#include "set.h" +#include "time-util.h" + +typedef struct ClientContext ClientContext; + +#include "journald-server.h" + +struct ClientContext { + unsigned n_ref; + unsigned lru_index; + usec_t timestamp; + bool in_lru; + + pid_t pid; + uid_t uid; + gid_t gid; + + char *comm; + char *exe; + char *cmdline; + char *capeff; + + uint32_t auditid; + uid_t loginuid; + + char *cgroup; + char *session; + uid_t owner_uid; + + char *unit; + char *user_unit; + + char *slice; + char *user_slice; + + sd_id128_t invocation_id; + + char *label; + size_t label_size; + + int log_level_max; + + struct iovec *extra_fields_iovec; + size_t extra_fields_n_iovec; + void *extra_fields_data; + nsec_t extra_fields_mtime; + + usec_t log_ratelimit_interval; + unsigned log_ratelimit_burst; + + Set *log_filter_allowed_patterns; + Set *log_filter_denied_patterns; +}; + +int client_context_get( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret); + +int client_context_acquire( + Server *s, + pid_t pid, + const struct ucred *ucred, + const char *label, size_t label_len, + const char *unit_id, + ClientContext **ret); + +ClientContext* client_context_release(Server *s, ClientContext *c); + +void client_context_maybe_refresh( + Server *s, + ClientContext *c, + const struct ucred *ucred, + const char *label, size_t label_size, + const char *unit_id, + usec_t tstamp); + +void 
client_context_acquire_default(Server *s); +void client_context_flush_all(Server *s); +void client_context_flush_regular(Server *s); + +static inline size_t client_context_extra_fields_n_iovec(const ClientContext *c) { + return c ? c->extra_fields_n_iovec : 0; +} + +static inline bool client_context_test_priority(const ClientContext *c, int priority) { + if (!c) + return true; + + if (c->log_level_max < 0) + return true; + + return LOG_PRI(priority) <= c->log_level_max; +} diff --git a/src/journal/journald-gperf.gperf b/src/journal/journald-gperf.gperf new file mode 100644 index 0000000..9076597 --- /dev/null +++ b/src/journal/journald-gperf.gperf @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include +#include "conf-parser.h" +#include "journald-server.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name journald_gperf_hash +%define lookup-function-name journald_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Journal.Storage, config_parse_storage, 0, offsetof(Server, storage) +Journal.Compress, config_parse_compress, 0, offsetof(Server, compress) +Journal.Seal, config_parse_bool, 0, offsetof(Server, seal) +Journal.ReadKMsg, config_parse_bool, 0, offsetof(Server, read_kmsg) +Journal.Audit, config_parse_tristate, 0, offsetof(Server, set_audit) +Journal.SyncIntervalSec, config_parse_sec, 0, offsetof(Server, sync_interval_usec) +# The following is a legacy name for compatibility +Journal.RateLimitInterval, config_parse_sec, 0, offsetof(Server, ratelimit_interval) +Journal.RateLimitIntervalSec,config_parse_sec, 0, offsetof(Server, ratelimit_interval) +Journal.RateLimitBurst, config_parse_unsigned, 0, offsetof(Server, ratelimit_burst) +Journal.SystemMaxUse, config_parse_iec_uint64, 0, offsetof(Server, system_storage.metrics.max_use) 
+Journal.SystemMaxFileSize, config_parse_iec_uint64, 0, offsetof(Server, system_storage.metrics.max_size) +Journal.SystemKeepFree, config_parse_iec_uint64, 0, offsetof(Server, system_storage.metrics.keep_free) +Journal.SystemMaxFiles, config_parse_uint64, 0, offsetof(Server, system_storage.metrics.n_max_files) +Journal.RuntimeMaxUse, config_parse_iec_uint64, 0, offsetof(Server, runtime_storage.metrics.max_use) +Journal.RuntimeMaxFileSize, config_parse_iec_uint64, 0, offsetof(Server, runtime_storage.metrics.max_size) +Journal.RuntimeKeepFree, config_parse_iec_uint64, 0, offsetof(Server, runtime_storage.metrics.keep_free) +Journal.RuntimeMaxFiles, config_parse_uint64, 0, offsetof(Server, runtime_storage.metrics.n_max_files) +Journal.MaxRetentionSec, config_parse_sec, 0, offsetof(Server, max_retention_usec) +Journal.MaxFileSec, config_parse_sec, 0, offsetof(Server, max_file_usec) +Journal.ForwardToSyslog, config_parse_bool, 0, offsetof(Server, forward_to_syslog) +Journal.ForwardToKMsg, config_parse_bool, 0, offsetof(Server, forward_to_kmsg) +Journal.ForwardToConsole, config_parse_bool, 0, offsetof(Server, forward_to_console) +Journal.ForwardToWall, config_parse_bool, 0, offsetof(Server, forward_to_wall) +Journal.TTYPath, config_parse_path, 0, offsetof(Server, tty_path) +Journal.MaxLevelStore, config_parse_log_level, 0, offsetof(Server, max_level_store) +Journal.MaxLevelSyslog, config_parse_log_level, 0, offsetof(Server, max_level_syslog) +Journal.MaxLevelKMsg, config_parse_log_level, 0, offsetof(Server, max_level_kmsg) +Journal.MaxLevelConsole, config_parse_log_level, 0, offsetof(Server, max_level_console) +Journal.MaxLevelWall, config_parse_log_level, 0, offsetof(Server, max_level_wall) +Journal.SplitMode, config_parse_split_mode, 0, offsetof(Server, split_mode) +Journal.LineMax, config_parse_line_max, 0, offsetof(Server, line_max) diff --git a/src/journal/journald-kmsg.c b/src/journal/journald-kmsg.c new file mode 100644 index 0000000..28d4880 --- /dev/null +++ 
b/src/journal/journald-kmsg.c @@ -0,0 +1,441 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-device.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "journal-internal.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "log.h" +#include "parse-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "string-util.h" + +void server_forward_kmsg( + Server *s, + int priority, + const char *identifier, + const char *message, + const struct ucred *ucred) { + + _cleanup_free_ char *ident_buf = NULL; + struct iovec iovec[5]; + char header_priority[DECIMAL_STR_MAX(priority) + 3], + header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t) + 1]; + size_t n = 0; + + assert(s); + assert(priority >= 0); + assert(priority <= 999); + assert(message); + + if (_unlikely_(LOG_PRI(priority) > s->max_level_kmsg)) + return; + + if (_unlikely_(s->dev_kmsg_fd < 0)) + return; + + /* Never allow messages with kernel facility to be written to + * kmsg, regardless where the data comes from. 
*/ + priority = syslog_fixup_facility(priority); + + /* First: priority field */ + xsprintf(header_priority, "<%i>", priority); + iovec[n++] = IOVEC_MAKE_STRING(header_priority); + + /* Second: identifier and PID */ + if (ucred) { + if (!identifier) { + (void) pid_get_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + if (identifier) + iovec[n++] = IOVEC_MAKE_STRING(identifier); + + xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); + iovec[n++] = IOVEC_MAKE_STRING(header_pid); + } else if (identifier) { + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + + /* Fourth: message */ + iovec[n++] = IOVEC_MAKE_STRING(message); + iovec[n++] = IOVEC_MAKE_STRING("\n"); + + if (writev(s->dev_kmsg_fd, iovec, n) < 0) + log_debug_errno(errno, "Failed to write to /dev/kmsg for logging, ignoring: %m"); +} + +static bool is_us(const char *identifier, const char *pid) { + pid_t pid_num; + + if (!identifier || !pid) + return false; + + if (parse_pid(pid, &pid_num) < 0) + return false; + + return pid_num == getpid_cached() && + streq(identifier, program_invocation_short_name); +} + +void dev_kmsg_record(Server *s, char *p, size_t l) { + + _cleanup_free_ char *message = NULL, *syslog_priority = NULL, *syslog_pid = NULL, *syslog_facility = NULL, *syslog_identifier = NULL, *source_time = NULL, *identifier = NULL, *pid = NULL; + struct iovec iovec[N_IOVEC_META_FIELDS + 7 + N_IOVEC_KERNEL_FIELDS + 2 + N_IOVEC_UDEV_FIELDS]; + char *kernel_device = NULL; + unsigned long long usec; + size_t n = 0, z = 0, j; + int priority, r; + char *e, *f, *k; + uint64_t serial; + size_t pl; + int saved_log_max_level = INT_MAX; + ClientContext *c = NULL; + + assert(s); + assert(p); + + if (l <= 0) + return; + + e = memchr(p, ',', l); + if (!e) + return; + *e = 0; + + r = safe_atoi(p, &priority); + if (r < 0 || priority < 0 || priority > 999) + return; + + if (s->forward_to_kmsg && LOG_FAC(priority) != LOG_KERN) + return; + + l -= (e - p) + 1; + p = e + 1; 
+ e = memchr(p, ',', l); + if (!e) + return; + *e = 0; + + r = safe_atou64(p, &serial); + if (r < 0) + return; + + if (s->kernel_seqnum) { + /* We already read this one? */ + if (serial < *s->kernel_seqnum) + return; + + /* Did we lose any? */ + if (serial > *s->kernel_seqnum) + server_driver_message(s, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_MISSED_STR, + LOG_MESSAGE("Missed %"PRIu64" kernel messages", + serial - *s->kernel_seqnum), + NULL); + + /* Make sure we never read this one again. Note that + * we always store the next message serial we expect + * here, simply because this makes handling the first + * message with serial 0 easy. */ + *s->kernel_seqnum = serial + 1; + } + + l -= (e - p) + 1; + p = e + 1; + f = memchr(p, ';', l); + if (!f) + return; + /* Kernel 3.6 has the flags field, kernel 3.5 lacks that */ + e = memchr(p, ',', l); + if (!e || f < e) + e = f; + *e = 0; + + r = safe_atollu(p, &usec); + if (r < 0) + return; + + l -= (f - p) + 1; + p = f + 1; + e = memchr(p, '\n', l); + if (!e) + return; + *e = 0; + + pl = e - p; + l -= (e - p) + 1; + k = e + 1; + + for (j = 0; l > 0 && j < N_IOVEC_KERNEL_FIELDS; j++) { + char *m; + /* Metadata fields attached */ + + if (*k != ' ') + break; + + k++, l--; + + e = memchr(k, '\n', l); + if (!e) + goto finish; + + *e = 0; + + if (cunescape_length_with_prefix(k, e - k, "_KERNEL_", UNESCAPE_RELAX, &m) < 0) + break; + + if (startswith(m, "_KERNEL_DEVICE=")) + kernel_device = m + 15; + + iovec[n++] = IOVEC_MAKE_STRING(m); + z++; + + l -= (e - k) + 1; + k = e + 1; + } + + if (kernel_device) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + + if (sd_device_new_from_device_id(&d, kernel_device) >= 0) { + const char *g; + char *b; + + if (sd_device_get_devname(d, &g) >= 0) { + b = strjoin("_UDEV_DEVNODE=", g); + if (b) { + iovec[n++] = IOVEC_MAKE_STRING(b); + z++; + } + } + + if (sd_device_get_sysname(d, &g) >= 0) { + b = strjoin("_UDEV_SYSNAME=", g); + if (b) { + iovec[n++] = IOVEC_MAKE_STRING(b); + z++; + } + } + + 
j = 0; + FOREACH_DEVICE_DEVLINK(d, link) { + + if (j >= N_IOVEC_UDEV_FIELDS) + break; + + b = strjoin("_UDEV_DEVLINK=", link); + if (b) { + iovec[n++] = IOVEC_MAKE_STRING(b); + z++; + } + + j++; + } + } + } + + if (asprintf(&source_time, "_SOURCE_MONOTONIC_TIMESTAMP=%llu", usec) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(source_time); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=kernel"); + + if (asprintf(&syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); + + if (asprintf(&syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); + + if (LOG_FAC(priority) == LOG_KERN) + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=kernel"); + else { + pl -= syslog_parse_identifier((const char**) &p, &identifier, &pid); + + /* Avoid logging any new messages when we're processing messages generated by ourselves via + * log_info() and friends to avoid infinite loops. */ + if (is_us(identifier, pid)) { + if (!ratelimit_below(&s->kmsg_own_ratelimit)) + return; /* NOTE(review): this bare return skips the finish label, so the z heap strings already placed in iovec[] look leaked on this path - confirm whether goto finish is intended. */ + + saved_log_max_level = log_get_max_level(); + c = s->my_context; + log_set_max_level(LOG_NULL); + } + + if (identifier) { + syslog_identifier = strjoin("SYSLOG_IDENTIFIER=", identifier); + if (syslog_identifier) + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); + } + + if (pid) { + syslog_pid = strjoin("SYSLOG_PID=", pid); + if (syslog_pid) + iovec[n++] = IOVEC_MAKE_STRING(syslog_pid); + } + } + + if (cunescape_length_with_prefix(p, pl, "MESSAGE=", UNESCAPE_RELAX, &message) >= 0) + iovec[n++] = IOVEC_MAKE_STRING(message); + + server_dispatch_message(s, iovec, n, ELEMENTSOF(iovec), c, NULL, priority, 0); + + if (saved_log_max_level != INT_MAX) + log_set_max_level(saved_log_max_level); + +finish: + for (j = 0; j < z; j++) + free(iovec[j].iov_base); +} + +static int server_read_dev_kmsg(Server *s) { + char buffer[8192+1]; /* the kernel-side limit per record is 8K currently */ + ssize_t l; + + assert(s); + 
assert(s->dev_kmsg_fd >= 0); + /* Performs one read() on /dev/kmsg and hands the data to dev_kmsg_record(). Returns 1 if data was processed, 0 if nothing was read or the error is ignorable, otherwise the result of log_ratelimit_error_errno(). */ + l = read(s->dev_kmsg_fd, buffer, sizeof(buffer) - 1); + if (l == 0) + return 0; + if (l < 0) { + /* Old kernels which don't allow reading from /dev/kmsg return EINVAL when we try. So handle + * this cleanly, but don't try to ever read from it again. */ + if (errno == EINVAL) { + s->dev_kmsg_event_source = sd_event_source_unref(s->dev_kmsg_event_source); + s->dev_kmsg_readable = false; + return 0; + } + + if (ERRNO_IS_TRANSIENT(errno) || errno == EPIPE) + return 0; + + return log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, "Failed to read from /dev/kmsg: %m"); + } + + dev_kmsg_record(s, buffer, l); + return 1; +} + +int server_flush_dev_kmsg(Server *s) { + int r; + + assert(s); + /* Drains /dev/kmsg by reading until server_read_dev_kmsg() returns 0; a negative result aborts the loop and is propagated. Does nothing if the device is not open or not readable. */ + if (s->dev_kmsg_fd < 0) + return 0; + + if (!s->dev_kmsg_readable) + return 0; + + log_debug("Flushing /dev/kmsg..."); + + for (;;) { + r = server_read_dev_kmsg(s); + if (r < 0) + return r; + + if (r == 0) + break; + } + + return 0; +} + +static int dispatch_dev_kmsg(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(es); + assert(fd == s->dev_kmsg_fd); + /* I/O callback for the /dev/kmsg fd: unexpected event bits are only logged, a read is attempted in every case. */ + if (revents & EPOLLERR) + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, + "/dev/kmsg buffer overrun, some messages lost."); + + if (!(revents & EPOLLIN)) + log_error("Got invalid event from epoll for /dev/kmsg: %"PRIx32, revents); + + return server_read_dev_kmsg(s); +} + +int server_open_dev_kmsg(Server *s) { + mode_t mode; + int r; + + assert(s); + /* Opens /dev/kmsg read-write when read_kmsg is set, write-only otherwise. NOTE(review): mode holds open(2) flags rather than a file mode, so int would be the conventional type - confirm before changing. */ + if (s->read_kmsg) + mode = O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY; + else + mode = O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY; + + s->dev_kmsg_fd = open("/dev/kmsg", mode); + if (s->dev_kmsg_fd < 0) { + log_full_errno(errno == ENOENT ? 
LOG_DEBUG : LOG_WARNING, + errno, "Failed to open /dev/kmsg, ignoring: %m"); + return 0; + } + /* When kernel messages are not to be read, keep the fd but do not register an event source. */ + if (!s->read_kmsg) + return 0; + + r = sd_event_add_io(s->event, &s->dev_kmsg_event_source, s->dev_kmsg_fd, EPOLLIN, dispatch_dev_kmsg, s); + if (r == -EPERM) { /* This will fail with EPERM on older kernels where /dev/kmsg is not readable. */ + r = 0; + goto finish; + } + if (r < 0) { + log_error_errno(r, "Failed to add /dev/kmsg fd to event loop: %m"); + goto finish; + } + + r = sd_event_source_set_priority(s->dev_kmsg_event_source, SD_EVENT_PRIORITY_IMPORTANT+10); + if (r < 0) { + log_error_errno(r, "Failed to adjust priority of kmsg event source: %m"); + goto finish; + } + + s->dev_kmsg_readable = true; + return 0; + +finish: /* Reached for the EPERM case (r == 0) and for real errors: release the event source and close the fd again. */ + s->dev_kmsg_event_source = sd_event_source_unref(s->dev_kmsg_event_source); + s->dev_kmsg_fd = safe_close(s->dev_kmsg_fd); + return r; +} + +int server_open_kernel_seqnum(Server *s) { + int r; + + assert(s); + + /* We store the seqnum we last read in an mmapped file. That way we can just use it like a variable, + * but it is persistent and automatically flushed at reboot. 
*/ + + if (!s->dev_kmsg_readable) + return 0; + /* Asks server_map_seqnum_file() for a uint64_t-sized kernel-seqnum file; the resulting pointer is stored in s->kernel_seqnum. */ + r = server_map_seqnum_file(s, "kernel-seqnum", sizeof(uint64_t), (void**) &s->kernel_seqnum); + if (r < 0) + return log_error_errno(r, "Failed to map kernel seqnum file: %m"); + + return 0; +} diff --git a/src/journal/journald-kmsg.h b/src/journal/journald-kmsg.h new file mode 100644 index 0000000..bd288c5 --- /dev/null +++ b/src/journal/journald-kmsg.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +int server_open_dev_kmsg(Server *s); +int server_flush_dev_kmsg(Server *s); + +void server_forward_kmsg(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred); + +int server_open_kernel_seqnum(Server *s); + +void dev_kmsg_record(Server *s, char *p, size_t l); /* Handles one raw /dev/kmsg record of l bytes; the buffer at p is modified in place while parsing. */ diff --git a/src/journal/journald-native.c b/src/journal/journald-native.c new file mode 100644 index 0000000..315ec0b --- /dev/null +++ b/src/journal/journald-native.c @@ -0,0 +1,536 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "journal-importer.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "journald-client.h" +#include "journald-console.h" +#include "journald-kmsg.h" +#include "journald-native.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "journald-wall.h" +#include "memfd-util.h" +#include "memory-util.h" +#include "missing_fcntl.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "unaligned.h" + +static bool allow_object_pid(const struct ucred *ucred) { /* OBJECT_PID= is only honoured when credentials are present and carry UID 0. */ + return ucred && ucred->uid == 0; +} + /* Inspects one FIELD=value item of l bytes and updates *priority, *identifier, *message and *object_pid from the fields it recognizes; all other fields are ignored. */ +static void server_process_entry_meta( + const char *p, size_t l, + const struct ucred *ucred, + int *priority, + 
char **identifier, + char **message, + pid_t *object_pid) { + + /* We need to determine the priority of this entry for the rate limiting logic */ + + if (l == 10 && + startswith(p, "PRIORITY=") && + p[9] >= '0' && p[9] <= '9') + *priority = (*priority & LOG_FACMASK) | (p[9] - '0'); + + else if (l == 17 && + startswith(p, "SYSLOG_FACILITY=") && + p[16] >= '0' && p[16] <= '9') + *priority = (*priority & LOG_PRIMASK) | ((p[16] - '0') << 3); + + else if (l == 18 && + startswith(p, "SYSLOG_FACILITY=") && + p[16] >= '0' && p[16] <= '9' && + p[17] >= '0' && p[17] <= '9') + *priority = (*priority & LOG_PRIMASK) | (((p[16] - '0')*10 + (p[17] - '0')) << 3); + + else if (l >= 19 && + startswith(p, "SYSLOG_IDENTIFIER=")) { + char *t; + + t = memdup_suffix0(p + 18, l - 18); + if (t) + free_and_replace(*identifier, t); + + } else if (l >= 8 && + startswith(p, "MESSAGE=")) { + char *t; + + t = memdup_suffix0(p + 8, l - 8); + if (t) + free_and_replace(*message, t); + + } else if (l > STRLEN("OBJECT_PID=") && + l < STRLEN("OBJECT_PID=") + DECIMAL_STR_MAX(pid_t) && + startswith(p, "OBJECT_PID=") && + allow_object_pid(ucred)) { + char buf[DECIMAL_STR_MAX(pid_t)]; + memcpy(buf, p + STRLEN("OBJECT_PID="), + l - STRLEN("OBJECT_PID=")); + buf[l-STRLEN("OBJECT_PID=")] = '\0'; + + (void) parse_pid(buf, object_pid); + } +} + +static int server_process_entry( + Server *s, + const void *buffer, size_t *remaining, + ClientContext *context, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, size_t label_len) { + + /* Process a single entry from a native message. Returns 0 if nothing special happened and the message + * processing should continue, and a negative or positive value otherwise. + * + * Note that *remaining is altered on both success and failure. 
*/ + + size_t n = 0, j, tn = SIZE_MAX, entry_size = 0; + char *identifier = NULL, *message = NULL; + struct iovec *iovec = NULL; + int priority = LOG_INFO; + pid_t object_pid = 0; + const char *p; + int r = 1; + + p = buffer; + + while (*remaining > 0) { + const char *e, *q; + + e = memchr(p, '\n', *remaining); + + if (!e) { + /* Trailing noise, let's ignore it, and flush what we collected */ + log_debug("Received message with trailing noise, ignoring."); + break; /* finish processing of the message */ + } + + if (e == p) { + /* Entry separator */ + *remaining -= 1; + break; + } + + if (IN_SET(*p, '.', '#')) { + /* Ignore control commands for now, and comments too. */ + *remaining -= (e - p) + 1; + p = e + 1; + continue; + } + + /* A property follows */ + if (n > ENTRY_FIELD_COUNT_MAX) { + log_debug("Received an entry that has more than " STRINGIFY(ENTRY_FIELD_COUNT_MAX) " fields, ignoring entry."); + goto finish; + } + + /* n existing properties, 1 new, +1 for _TRANSPORT */ + if (!GREEDY_REALLOC(iovec, + n + 2 + + N_IOVEC_META_FIELDS + N_IOVEC_OBJECT_FIELDS + + client_context_extra_fields_n_iovec(context))) { + r = log_oom(); + goto finish; + } + + q = memchr(p, '=', e - p); + if (q) { + if (journal_field_valid(p, q - p, false)) { + size_t l; + + l = e - p; + if (l > DATA_SIZE_MAX) { + log_debug("Received text block of %zu bytes is too large, ignoring entry.", l); + goto finish; + } + + if (entry_size + l + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */ + log_debug("Entry is too big (%zu bytes after processing %zu entries), ignoring entry.", + entry_size + l, n + 1); + goto finish; + } + + /* If the field name starts with an underscore, skip the variable, since that indicates + * a trusted field */ + iovec[n++] = IOVEC_MAKE((char*) p, l); + entry_size += l; + + server_process_entry_meta(p, l, ucred, + &priority, + &identifier, + &message, + &object_pid); + } + + *remaining -= (e - p) + 1; + p = e + 1; + continue; + } else { + uint64_t l, total; + char 
*k;
+
+                        /* No '=' before the newline: this is a binary field, serialized as the field name,
+                         * '\n', a 64-bit little-endian payload size, the payload, and a closing '\n'. First
+                         * make sure the size word and the closing newline can be there at all. */
+                        if (*remaining < e - p + 1 + sizeof(uint64_t) + 1) {
+                                log_debug("Failed to parse message, ignoring.");
+                                break;
+                        }
+
+                        l = unaligned_read_le64(e + 1);
+                        if (l > DATA_SIZE_MAX) {
+                                log_debug("Received binary data block of %"PRIu64" bytes is too large, ignoring entry.", l);
+                                goto finish;
+                        }
+
+                        total = (e - p) + 1 + l;
+                        if (entry_size + total + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */
+                                log_debug("Entry is too big (%"PRIu64" bytes after processing %zu fields), ignoring.",
+                                          entry_size + total, n + 1);
+                                goto finish;
+                        }
+
+                        /* Now that the payload size is known and bounded, check that the payload and its
+                         * terminating newline are really contained in the buffer. */
+                        if ((uint64_t) *remaining < e - p + 1 + sizeof(uint64_t) + l + 1 ||
+                            e[1+sizeof(uint64_t)+l] != '\n') {
+                                log_debug("Failed to parse message, ignoring.");
+                                break;
+                        }
+
+                        k = malloc(total);
+                        if (!k) {
+                                log_oom();
+                                break;
+                        }
+
+                        /* Rebuild the field in the "NAME=payload" form used for text fields. */
+                        memcpy(k, p, e - p);
+                        k[e - p] = '=';
+                        memcpy(k + (e - p) + 1, e + 1 + sizeof(uint64_t), l);
+
+                        if (journal_field_valid(p, e - p, false)) {
+                                iovec[n] = IOVEC_MAKE(k, (e - p) + 1 + l);
+                                entry_size += iovec[n].iov_len;
+                                n++;
+
+                                server_process_entry_meta(k, (e - p) + 1 + l, ucred,
+                                                          &priority,
+                                                          &identifier,
+                                                          &message,
+                                                          &object_pid);
+                        } else
+                                free(k);
+
+                        *remaining -= (e - p) + 1 + sizeof(uint64_t) + l + 1;
+                        p = e + 1 + sizeof(uint64_t) + l + 1;
+                }
+        }
+
+        if (n <= 0)
+                goto finish;
+
+        /* Remember the slot of the static _TRANSPORT= string, the cleanup loop must not free() it. */
+        tn = n++;
+        iovec[tn] = IOVEC_MAKE_STRING("_TRANSPORT=journal");
+        entry_size += STRLEN("_TRANSPORT=journal");
+
+        if (entry_size + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */
+                log_debug("Entry is too big with %zu properties and %zu bytes, ignoring.", n, entry_size);
+                goto finish;
+        }
+
+        r = 0; /* Success, we read the message. */
+
+        if (!client_context_test_priority(context, priority))
+                goto finish;
+
+        if (message) {
+                /* Ensure message is not NULL, otherwise strlen(message) would crash. This check needs to
+                 * be here until server_process_entry() is able to process messages containing \0 characters,
+                 * as we would have access to the actual size of message.
*/ + r = client_context_check_keep_log(context, message, strlen(message)); + if (r <= 0) + goto finish; + + if (s->forward_to_syslog) + server_forward_syslog(s, syslog_fixup_facility(priority), identifier, message, ucred, tv); + + if (s->forward_to_kmsg) + server_forward_kmsg(s, priority, identifier, message, ucred); + + if (s->forward_to_console) + server_forward_console(s, priority, identifier, message, ucred); + + if (s->forward_to_wall) + server_forward_wall(s, priority, identifier, message, ucred); + } + + server_dispatch_message(s, iovec, n, MALLOC_ELEMENTSOF(iovec), context, tv, priority, object_pid); + +finish: + for (j = 0; j < n; j++) { + if (j == tn) + continue; + + if (iovec[j].iov_base < buffer || + (const char*) iovec[j].iov_base >= p + *remaining) + free(iovec[j].iov_base); + } + + free(iovec); + free(identifier); + free(message); + + return r; +} + +void server_process_native_message( + Server *s, + const char *buffer, size_t buffer_size, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, size_t label_len) { + + size_t remaining = buffer_size; + ClientContext *context = NULL; + int r; + + assert(s); + assert(buffer || buffer_size == 0); + + if (ucred && pid_is_valid(ucred->pid)) { + r = client_context_get(s, ucred->pid, ucred, label, label_len, NULL, &context); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to retrieve credentials for PID " PID_FMT ", ignoring: %m", + ucred->pid); + } + + do { + r = server_process_entry(s, + (const uint8_t*) buffer + (buffer_size - remaining), &remaining, + context, ucred, tv, label, label_len); + } while (r == 0); +} + +void server_process_native_file( + Server *s, + int fd, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, size_t label_len) { + + struct stat st; + bool sealed; + int r; + + /* Data is in the passed fd, probably it didn't fit in a datagram. 
*/
+
+        assert(s);
+        assert(fd >= 0);
+
+        /* Only non-empty regular files are accepted. */
+        if (fstat(fd, &st) < 0) {
+                log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, "Failed to stat passed file, ignoring: %m");
+                return;
+        }
+
+        r = stat_verify_regular(&st);
+        if (r < 0) {
+                log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "File passed is not regular, ignoring: %m");
+                return;
+        }
+
+        if (st.st_size <= 0)
+                return;
+
+        int flags = fcntl(fd, F_GETFL);
+        if (flags < 0) {
+                log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, "Failed to get flags of passed file, ignoring: %m");
+                return;
+        }
+
+        /* Nothing but the access mode and the large-file flag may be set on the fd. Note that no system
+         * call failed on this path, hence the message carries no %m. */
+        if ((flags & ~(O_ACCMODE|RAW_O_LARGEFILE)) != 0) {
+                log_ratelimit_error(JOURNAL_LOG_RATELIMIT, "Unexpected flags of passed memory fd, ignoring message.");
+                return;
+        }
+
+        /* If it's a memfd, check if it is sealed. If so, we can just mmap it and use it, and do not need to
+         * copy the data out. */
+        sealed = memfd_get_sealed(fd) > 0;
+
+        if (!sealed && (!ucred || ucred->uid != 0)) {
+                _cleanup_free_ char *k = NULL;
+                const char *e;
+
+                /* If this is not a sealed memfd, and the peer is unknown or
+                 * unprivileged, then verify the path. */
+
+                r = fd_get_path(fd, &k);
+                if (r < 0) {
+                        log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT,
+                                                  "readlink(/proc/self/fd/%i) failed: %m", fd);
+                        return;
+                }
+
+                /* The file must sit directly in one of the allowed directories: e is the part of the path
+                 * after the matched prefix and has to be a plain file name. */
+                e = PATH_STARTSWITH_SET(k, "/dev/shm/", "/tmp/", "/var/tmp/");
+                if (!e) {
+                        log_ratelimit_error(JOURNAL_LOG_RATELIMIT,
+                                            "Received file outside of allowed directories. Refusing.");
+                        return;
+                }
+
+                if (!filename_is_valid(e)) {
+                        log_ratelimit_error(JOURNAL_LOG_RATELIMIT,
+                                            "Received file in subdirectory of allowed directories. Refusing.");
+                        return;
+                }
+        }
+
+        /* When !sealed, set a lower memory limit. We have to read the file, effectively doubling memory
+         * use. */
+        if (st.st_size > ENTRY_SIZE_MAX / (sealed ? 1 : 2)) {
+                log_ratelimit_error(JOURNAL_LOG_RATELIMIT,
+                                    "File passed too large (%"PRIu64" bytes). 
Ignoring.", + (uint64_t) st.st_size); + return; + } + + if (sealed) { + void *p; + size_t ps; + + /* The file is sealed, we can just map it and use it. */ + + ps = PAGE_ALIGN(st.st_size); + assert(ps < SIZE_MAX); + p = mmap(NULL, ps, PROT_READ, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) { + log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to map memfd, ignoring: %m"); + return; + } + + server_process_native_message(s, p, st.st_size, ucred, tv, label, label_len); + assert_se(munmap(p, ps) >= 0); + } else { + _cleanup_free_ void *p = NULL; + struct statvfs vfs; + ssize_t n; + + if (fstatvfs(fd, &vfs) < 0) { + log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to stat file system of passed file, not processing it: %m"); + return; + } + + /* Refuse operating on file systems that have + * mandatory locking enabled, see: + * + * https://github.com/systemd/systemd/issues/1822 + */ + if (vfs.f_flag & ST_MANDLOCK) { + log_ratelimit_error(JOURNAL_LOG_RATELIMIT, + "Received file descriptor from file system with mandatory locking enabled, not processing it."); + return; + } + + /* Make the fd non-blocking. On regular files this has + * the effect of bypassing mandatory locking. Of + * course, this should normally not be necessary given + * the check above, but let's better be safe than + * sorry, after all NFS is pretty confusing regarding + * file system flags, and we better don't trust it, + * and so is SMB. */ + r = fd_nonblock(fd, true); + if (r < 0) { + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to make fd non-blocking, not processing it: %m"); + return; + } + + /* The file is not sealed, we can't map the file here, since + * clients might then truncate it and trigger a SIGBUS for + * us. So let's stupidly read it. 
*/ + + p = malloc(st.st_size); + if (!p) { + log_oom(); + return; + } + + n = pread(fd, p, st.st_size, 0); + if (n < 0) + log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to read file, ignoring: %m"); + else if (n > 0) + server_process_native_message(s, p, n, ucred, tv, label, label_len); + } +} + +int server_open_native_socket(Server *s, const char *native_socket) { + int r; + + assert(s); + assert(native_socket); + + if (s->native_fd < 0) { + union sockaddr_union sa; + size_t sa_len; + + r = sockaddr_un_set_path(&sa.un, native_socket); + if (r < 0) + return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", native_socket); + sa_len = r; + + s->native_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->native_fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + (void) sockaddr_un_unlink(&sa.un); + + r = bind(s->native_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + (void) chmod(sa.un.sun_path, 0666); + } else + (void) fd_nonblock(s->native_fd, true); + + r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + if (mac_selinux_use()) { + r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_warning_errno(r, "SO_PASSSEC failed: %m"); + } + + r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return log_error_errno(r, "SO_TIMESTAMP failed: %m"); + + r = sd_event_add_io(s->event, &s->native_event_source, s->native_fd, EPOLLIN, server_process_datagram, s); + if (r < 0) + return log_error_errno(r, "Failed to add native server fd to event loop: %m"); + + r = sd_event_source_set_priority(s->native_event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust native event source priority: %m"); + + return 0; +} diff --git a/src/journal/journald-native.h 
b/src/journal/journald-native.h
new file mode 100644
index 0000000..7bbaaed
--- /dev/null
+++ b/src/journal/journald-native.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "journald-server.h"
+
+void server_process_native_message(
+                Server *s,
+                const char *buffer,
+                size_t buffer_size,
+                const struct ucred *ucred,
+                const struct timeval *tv,
+                const char *label,
+                size_t label_len);
+
+void server_process_native_file(
+                Server *s,
+                int fd,
+                const struct ucred *ucred,
+                const struct timeval *tv,
+                const char *label,
+                size_t label_len);
+
+int server_open_native_socket(Server *s, const char *native_socket);
diff --git a/src/journal/journald-rate-limit.c b/src/journal/journald-rate-limit.c
new file mode 100644
index 0000000..1028e38
--- /dev/null
+++ b/src/journal/journald-rate-limit.c
@@ -0,0 +1,256 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+
+#include "alloc-util.h"
+#include "hashmap.h"
+#include "journald-rate-limit.h"
+#include "list.h"
+#include "logarithm.h"
+#include "random-util.h"
+#include "string-util.h"
+#include "time-util.h"
+
+#define POOLS_MAX 5
+#define BUCKETS_MAX 127
+#define GROUPS_MAX 2047
+
+/* Maps a syslog priority to the index of the pool it is accounted in. Several priorities share a pool
+ * (EMERG/ALERT/CRIT use pool 0, NOTICE/INFO use pool 3), the highest index used here is POOLS_MAX-1. */
+static const int priority_map[] = {
+        [LOG_EMERG] = 0,
+        [LOG_ALERT] = 0,
+        [LOG_CRIT] = 0,
+        [LOG_ERR] = 1,
+        [LOG_WARNING] = 2,
+        [LOG_NOTICE] = 3,
+        [LOG_INFO] = 3,
+        [LOG_DEBUG] = 4
+};
+
+typedef struct JournalRateLimitPool JournalRateLimitPool;
+typedef struct JournalRateLimitGroup JournalRateLimitGroup;
+
+/* Accounting state of one pool: 'begin' is the timestamp at which the current window was started (0 if
+ * the pool was never used), 'num' the messages let through in it, 'suppressed' the messages dropped. */
+struct JournalRateLimitPool {
+        usec_t begin;
+        unsigned num;
+        unsigned suppressed;
+};
+
+/* One rate limit group, looked up by its 'id' string. Groups are linked into a hash bucket of the parent
+ * and into the parent's LRU list. */
+struct JournalRateLimitGroup {
+        JournalRateLimit *parent;
+
+        char *id;
+
+        /* Interval is stored to keep track of when the group expires */
+        usec_t interval;
+
+        JournalRateLimitPool pools[POOLS_MAX];
+        uint64_t hash;
+
+        LIST_FIELDS(JournalRateLimitGroup, bucket);
+        LIST_FIELDS(JournalRateLimitGroup, lru);
+};
+
+struct JournalRateLimit {
+
+        JournalRateLimitGroup* buckets[BUCKETS_MAX];
+        JournalRateLimitGroup *lru, *lru_tail;
+
+        unsigned n_groups;
+
+        uint8_t hash_key[16];
+};
+
+/* Allocates an empty rate limiter with a random hash key. Returns NULL if the allocation fails. */
+JournalRateLimit *journal_ratelimit_new(void) {
+        JournalRateLimit *r;
+
+        r = new0(JournalRateLimit, 1);
+        if (!r)
+                return NULL;
+
+        random_bytes(r->hash_key, sizeof(r->hash_key));
+
+        return r;
+}
+
+/* Frees a group. If it is already attached to a parent it is first unlinked from the parent's LRU list
+ * and hash bucket, and the parent's group counter is decremented. */
+static void journal_ratelimit_group_free(JournalRateLimitGroup *g) {
+        assert(g);
+
+        if (g->parent) {
+                assert(g->parent->n_groups > 0);
+
+                if (g->parent->lru_tail == g)
+                        g->parent->lru_tail = g->lru_prev;
+
+                LIST_REMOVE(lru, g->parent->lru, g);
+                LIST_REMOVE(bucket, g->parent->buckets[g->hash % BUCKETS_MAX], g);
+
+                g->parent->n_groups--;
+        }
+
+        free(g->id);
+        free(g);
+}
+
+/* Frees the rate limiter together with all of its groups. r must not be NULL. */
+void journal_ratelimit_free(JournalRateLimit *r) {
+        assert(r);
+
+        while (r->lru)
+                journal_ratelimit_group_free(r->lru);
+
+        free(r);
+}
+
+/* Returns true if the window of every pool of the group ended before ts. */
+static bool journal_ratelimit_group_expired(JournalRateLimitGroup *g, usec_t ts) {
+        unsigned i;
+
+        assert(g);
+
+        /* Use usec_add() rather than a plain '+', so that a very large interval cannot wrap around and
+         * make a group that is still in use look expired. */
+        for (i = 0; i < POOLS_MAX; i++)
+                if (usec_add(g->pools[i].begin, g->interval) >= ts)
+                        return false;
+
+        return true;
+}
+
+static void journal_ratelimit_vacuum(JournalRateLimit *r, usec_t ts) {
+        assert(r);
+
+        /* Makes room for at least one new item, but drop all expired items too. */
+
+        while (r->n_groups >= GROUPS_MAX ||
+               (r->lru_tail && journal_ratelimit_group_expired(r->lru_tail, ts)))
+                journal_ratelimit_group_free(r->lru_tail);
+}
+
+/* Creates a new group for 'id' and links it into r, vacuuming old groups first. Returns NULL if memory
+ * is short, in which case r is left without the new group. */
+static JournalRateLimitGroup* journal_ratelimit_group_new(JournalRateLimit *r, const char *id, usec_t interval, usec_t ts) {
+        JournalRateLimitGroup *g;
+
+        assert(r);
+        assert(id);
+
+        g = new0(JournalRateLimitGroup, 1);
+        if (!g)
+                return NULL;
+
+        g->id = strdup(id);
+        if (!g->id)
+                goto fail;
+
+        g->hash = siphash24_string(g->id, r->hash_key);
+
+        g->interval = interval;
+
+        journal_ratelimit_vacuum(r, ts);
+
+        LIST_PREPEND(bucket, r->buckets[g->hash % BUCKETS_MAX], g);
+        LIST_PREPEND(lru, r->lru, g);
+        if (!g->lru_next)
+                r->lru_tail = g;
+        r->n_groups++;
+
+        g->parent = r;
+        return g;
+
+fail:
+        /* g->parent is still unset here, hence this only releases the memory. */
+        journal_ratelimit_group_free(g);
+        return NULL;
+}
+
+static unsigned burst_modulate(unsigned burst, uint64_t available) {
+        unsigned k;
+
+        /* Modulates the burst rate a bit with the amount of available
+         * disk space */
+
+        k = log2u64(available);
+
+        /* 1MB */
+        if (k <= 20)
+                return burst;
+
+        burst = (burst * (k-16)) / 4;
+
+        /*
+         * Example:
+         *
+         *      <= 1MB = rate * 1
+         *        16MB = rate * 2
+         *       256MB = rate * 3
+         *         4GB = rate * 4
+         *        64GB = rate * 5
+         *         1TB = rate * 6
+         */
+
+        return burst;
+}
+
+int journal_ratelimit_test(JournalRateLimit *r, const char *id, usec_t rl_interval, unsigned rl_burst, int priority, uint64_t available) {
+        JournalRateLimitGroup *g, *found = NULL;
+        JournalRateLimitPool *p;
+        unsigned burst;
+        uint64_t h;
+        usec_t ts;
+
+        assert(id);
+
+        /* Returns:
+         *
+         * 0     → the log message shall be suppressed,
+         * 1 + n → the log message shall be permitted, and n messages were dropped from the peer before
+         * < 0   → error
+         */
+
+        if (!r)
+                return 1;
+
+        ts = now(CLOCK_MONOTONIC);
+
+        h = siphash24_string(id, r->hash_key);
+        g = r->buckets[h % BUCKETS_MAX];
+
+        LIST_FOREACH(bucket, i, g)
+                if (streq(i->id, id)) {
+                        found = i;
+                        break;
+                }
+
+        if (!found) {
+                found = journal_ratelimit_group_new(r, id, rl_interval, ts);
+                if (!found)
+                        return -ENOMEM;
+        } else
+                found->interval = rl_interval;
+
+        /* A zero interval or burst turns rate limiting off. */
+        if (rl_interval == 0 || rl_burst == 0)
+                return 1;
+
+        burst = burst_modulate(rl_burst, available);
+
+        /* priority is used as an index unchecked, it has to be one of the LOG_EMERG…LOG_DEBUG values
+         * listed in priority_map[]. */
+        p = &found->pools[priority_map[priority]];
+
+        /* First message ever accounted in this pool: open a window. */
+        if (p->begin <= 0) {
+                p->suppressed = 0;
+                p->num = 1;
+                p->begin = ts;
+                return 1;
+        }
+
+        /* The previous window is over: open a new one and report how many messages were dropped in the
+         * old one. usec_add() keeps the sum from wrapping around for very large intervals. */
+        if (usec_add(p->begin, rl_interval) < ts) {
+                unsigned s;
+
+                s = p->suppressed;
+                p->suppressed = 0;
+                p->num = 1;
+                p->begin = ts;
+
+                return 1 + s;
+        }
+
+        if (p->num < burst) {
+                p->num++;
+                return 1;
+        }
+
+        p->suppressed++;
+        return 0;
+}
diff --git a/src/journal/journald-rate-limit.h b/src/journal/journald-rate-limit.h
new file mode 100644
index 0000000..8def60f
--- /dev/null
+++ b/src/journal/journald-rate-limit.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "time-util.h"
+
+typedef struct JournalRateLimit JournalRateLimit;
+
+JournalRateLimit *journal_ratelimit_new(void);
+void journal_ratelimit_free(JournalRateLimit *r);
+int journal_ratelimit_test(JournalRateLimit *r, const char *id, usec_t rl_interval, unsigned rl_burst, int priority, uint64_t available);
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
new file mode 100644
index 0000000..1c3a2a0
--- /dev/null
+++ b/src/journal/journald-server.c
@@ -0,0 +1,2914 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_SELINUX
+#include
+#endif
+#include
+#include
+#include
+#include
+#include
+
+#include "sd-daemon.h"
+#include "sd-journal.h"
+#include "sd-messages.h"
+
+#include "acl-util.h"
+#include "alloc-util.h"
+#include "audit-util.h"
+#include "cgroup-util.h"
+#include "conf-parser.h"
+#include "dirent-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hashmap.h"
+#include "hostname-util.h"
+#include "id128-util.h"
+#include "initrd-util.h"
+#include "iovec-util.h"
+#include "journal-authenticate.h" +#include "journal-file-util.h" +#include "journal-internal.h" +#include "journal-vacuum.h" +#include "journald-audit.h" +#include "journald-context.h" +#include "journald-kmsg.h" +#include "journald-native.h" +#include "journald-rate-limit.h" +#include "journald-server.h" +#include "journald-stream.h" +#include "journald-syslog.h" +#include "log.h" +#include "missing_audit.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "syslog-util.h" +#include "uid-alloc-range.h" +#include "user-util.h" +#include "varlink-io.systemd.Journal.h" + +#define USER_JOURNALS_MAX 1024 + +#define DEFAULT_SYNC_INTERVAL_USEC (5*USEC_PER_MINUTE) +#define DEFAULT_RATE_LIMIT_INTERVAL (30*USEC_PER_SEC) +#define DEFAULT_RATE_LIMIT_BURST 10000 +#define DEFAULT_MAX_FILE_USEC USEC_PER_MONTH + +#define DEFAULT_KMSG_OWN_INTERVAL (5 * USEC_PER_SEC) +#define DEFAULT_KMSG_OWN_BURST 50 + +#define RECHECK_SPACE_USEC (30*USEC_PER_SEC) + +#define NOTIFY_SNDBUF_SIZE (8*1024*1024) + +/* The period to insert between posting changes for coalescing */ +#define POST_CHANGE_TIMER_INTERVAL_USEC (250*USEC_PER_MSEC) + +/* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room + * for a bit of additional metadata. 
*/
+#define DEFAULT_LINE_MAX (48*1024)
+
+#define DEFERRED_CLOSES_MAX (4096)
+
+#define IDLE_TIMEOUT_USEC (30*USEC_PER_SEC)
+
+#define FAILED_TO_WRITE_ENTRY_RATELIMIT ((const RateLimit) { .interval = 1 * USEC_PER_SEC, .burst = 1 })
+
+/* Determines how much disk space the journal files in 'path' use and how much space is free on the
+ * file system backing it.
+ *
+ * *ret_used is the sum of the allocated blocks of all regular files in the directory whose name ends
+ * in ".journal" or ".journal~", *ret_free is the block size times the number of available blocks as
+ * reported by fstatvfs().
+ *
+ * Returns 0 on success. If the directory cannot be opened or queried, the failure is logged and the
+ * result of the logging call is returned (callers check for a negative value). */
+static int server_determine_path_usage(
+                Server *s,
+                const char *path,
+                uint64_t *ret_used,
+                uint64_t *ret_free) {
+
+        _cleanup_closedir_ DIR *d = NULL;
+        struct statvfs ss;
+
+        assert(s);
+        assert(path);
+        assert(ret_used);
+        assert(ret_free);
+
+        d = opendir(path);
+        if (!d)
+                return log_ratelimit_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR,
+                                                errno, JOURNAL_LOG_RATELIMIT, "Failed to open %s: %m", path);
+
+        if (fstatvfs(dirfd(d), &ss) < 0)
+                return log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT,
+                                                 "Failed to fstatvfs(%s): %m", path);
+
+        /* Widen before multiplying, like the st_blocks computation below, so that the product cannot be
+         * truncated where the statvfs fields are narrower than 64 bit. */
+        *ret_free = (uint64_t) ss.f_bsize * ss.f_bavail;
+        *ret_used = 0;
+        FOREACH_DIRENT_ALL(de, d, break) {
+                struct stat st;
+
+                if (!endswith(de->d_name, ".journal") &&
+                    !endswith(de->d_name, ".journal~"))
+                        continue;
+
+                if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) {
+                        log_debug_errno(errno, "Failed to stat %s/%s, ignoring: %m", path, de->d_name);
+                        continue;
+                }
+
+                if (!S_ISREG(st.st_mode))
+                        continue;
+
+                *ret_used += (uint64_t) st.st_blocks * 512UL;
+        }
+
+        return 0;
+}
+
+/* Clears the cached space information. The timestamp is reset along with it, so the next
+ * cache_space_refresh() call queries the file system again. */
+static void cache_space_invalidate(JournalStorageSpace *space) {
+        zero(*space);
+}
+
+/* Refreshes the cached space information of 'storage', unless it was refreshed less than
+ * RECHECK_SPACE_USEC ago. Returns 0 if the cache was still fresh, 1 if it was updated, and a negative
+ * value if the disk usage could not be determined. */
+static int cache_space_refresh(Server *s, JournalStorage *storage) {
+        JournalStorageSpace *space;
+        JournalMetrics *metrics;
+        uint64_t vfs_used, vfs_avail, avail;
+        usec_t ts;
+        int r;
+
+        assert(s);
+
+        metrics = &storage->metrics;
+        space = &storage->space;
+
+        ts = now(CLOCK_MONOTONIC);
+
+        if (space->timestamp != 0 && usec_add(space->timestamp, RECHECK_SPACE_USEC) > ts)
+                return 0;
+
+        r = server_determine_path_usage(s, storage->path, &vfs_used, &vfs_avail);
+        if (r < 0)
+                return r;
+
+        space->vfs_used = vfs_used;
+        space->vfs_available = vfs_avail;
+
+        avail = LESS_BY(vfs_avail, metrics->keep_free);
+
+
space->limit = CLAMP(vfs_used + avail, metrics->min_use, metrics->max_use); + space->available = LESS_BY(space->limit, vfs_used); + space->timestamp = ts; + return 1; +} + +static void patch_min_use(JournalStorage *storage) { + assert(storage); + + /* Let's bump the min_use limit to the current usage on disk. We do + * this when starting up and first opening the journal files. This way + * sudden spikes in disk usage will not cause journald to vacuum files + * without bounds. Note that this means that only a restart of journald + * will make it reset this value. */ + + storage->metrics.min_use = MAX(storage->metrics.min_use, storage->space.vfs_used); +} + +static JournalStorage* server_current_storage(Server *s) { + assert(s); + + return s->system_journal ? &s->system_storage : &s->runtime_storage; +} + +static int server_determine_space(Server *s, uint64_t *available, uint64_t *limit) { + JournalStorage *js; + int r; + + assert(s); + + js = server_current_storage(s); + + r = cache_space_refresh(s, js); + if (r >= 0) { + if (available) + *available = js->space.available; + if (limit) + *limit = js->space.limit; + } + return r; +} + +void server_space_usage_message(Server *s, JournalStorage *storage) { + assert(s); + + if (!storage) + storage = server_current_storage(s); + + if (cache_space_refresh(s, storage) < 0) + return; + + const JournalMetrics *metrics = &storage->metrics; + + server_driver_message(s, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_USAGE_STR, + LOG_MESSAGE("%s (%s) is %s, max %s, %s free.", + storage->name, storage->path, + FORMAT_BYTES(storage->space.vfs_used), + FORMAT_BYTES(storage->space.limit), + FORMAT_BYTES(storage->space.available)), + "JOURNAL_NAME=%s", storage->name, + "JOURNAL_PATH=%s", storage->path, + "CURRENT_USE=%"PRIu64, storage->space.vfs_used, + "CURRENT_USE_PRETTY=%s", FORMAT_BYTES(storage->space.vfs_used), + "MAX_USE=%"PRIu64, metrics->max_use, + "MAX_USE_PRETTY=%s", FORMAT_BYTES(metrics->max_use), + "DISK_KEEP_FREE=%"PRIu64, 
metrics->keep_free, + "DISK_KEEP_FREE_PRETTY=%s", FORMAT_BYTES(metrics->keep_free), + "DISK_AVAILABLE=%"PRIu64, storage->space.vfs_available, + "DISK_AVAILABLE_PRETTY=%s", FORMAT_BYTES(storage->space.vfs_available), + "LIMIT=%"PRIu64, storage->space.limit, + "LIMIT_PRETTY=%s", FORMAT_BYTES(storage->space.limit), + "AVAILABLE=%"PRIu64, storage->space.available, + "AVAILABLE_PRETTY=%s", FORMAT_BYTES(storage->space.available), + NULL); +} + +static void server_add_acls(JournalFile *f, uid_t uid) { + assert(f); + +#if HAVE_ACL + int r; + + if (uid_for_system_journal(uid)) + return; + + r = fd_add_uid_acl_permission(f->fd, uid, ACL_READ); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to set ACL on %s, ignoring: %m", f->path); +#endif +} + +static int server_open_journal( + Server *s, + bool reliably, + const char *fname, + int open_flags, + bool seal, + JournalMetrics *metrics, + JournalFile **ret) { + + _cleanup_(journal_file_offline_closep) JournalFile *f = NULL; + JournalFileFlags file_flags; + int r; + + assert(s); + assert(fname); + assert(ret); + + file_flags = + (s->compress.enabled ? JOURNAL_COMPRESS : 0) | + (seal ? 
JOURNAL_SEAL : 0) | + JOURNAL_STRICT_ORDER; + + set_clear_with_destructor(s->deferred_closes, journal_file_offline_close); + + if (reliably) + r = journal_file_open_reliably( + fname, + open_flags, + file_flags, + 0640, + s->compress.threshold_bytes, + metrics, + s->mmap, + /* template= */ NULL, + &f); + else + r = journal_file_open( + /* fd= */ -1, + fname, + open_flags, + file_flags, + 0640, + s->compress.threshold_bytes, + metrics, + s->mmap, + /* template= */ NULL, + &f); + if (r < 0) + return r; + + r = journal_file_enable_post_change_timer(f, s->event, POST_CHANGE_TIMER_INTERVAL_USEC); + if (r < 0) + return r; + + *ret = TAKE_PTR(f); + return r; +} + +static bool server_flushed_flag_is_set(Server *s) { + const char *fn; + + assert(s); + + /* We don't support the "flushing" concept for namespace instances, we assume them to always have + * access to /var */ + if (s->namespace) + return true; + + fn = strjoina(s->runtime_directory, "/flushed"); + return access(fn, F_OK) >= 0; +} + +static int server_system_journal_open( + Server *s, + bool flush_requested, + bool relinquish_requested) { + + const char *fn; + int r = 0; + + if (!s->system_journal && + IN_SET(s->storage, STORAGE_PERSISTENT, STORAGE_AUTO) && + (flush_requested || server_flushed_flag_is_set(s)) && + !relinquish_requested) { + + /* If in auto mode: first try to create the machine path, but not the prefix. 
+ * + * If in persistent mode: create /var/log/journal and the machine path */ + + if (s->storage == STORAGE_PERSISTENT) + (void) mkdir_parents(s->system_storage.path, 0755); + + (void) mkdir(s->system_storage.path, 0755); + + fn = strjoina(s->system_storage.path, "/system.journal"); + r = server_open_journal( + s, + /* reliably= */ true, + fn, + O_RDWR|O_CREAT, + s->seal, + &s->system_storage.metrics, + &s->system_journal); + if (r >= 0) { + server_add_acls(s->system_journal, 0); + (void) cache_space_refresh(s, &s->system_storage); + patch_min_use(&s->system_storage); + } else { + if (!IN_SET(r, -ENOENT, -EROFS)) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to open system journal: %m"); + + r = 0; + } + + /* If the runtime journal is open, and we're post-flush, we're recovering from a failed + * system journal rotate (ENOSPC) for which the runtime journal was reopened. + * + * Perform an implicit flush to var, leaving the runtime journal closed, now that the system + * journal is back. + */ + if (!flush_requested) + (void) server_flush_to_var(s, true); + } + + if (!s->runtime_journal && + (s->storage != STORAGE_NONE)) { + + fn = strjoina(s->runtime_storage.path, "/system.journal"); + + if (!s->system_journal || relinquish_requested) { + + /* OK, we really need the runtime journal, so create it if necessary. 
*/ + + (void) mkdir_parents(s->runtime_storage.path, 0755); + (void) mkdir(s->runtime_storage.path, 0750); + + r = server_open_journal( + s, + /* reliably= */ true, + fn, + O_RDWR|O_CREAT, + /* seal= */ false, + &s->runtime_storage.metrics, + &s->runtime_journal); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to open runtime journal: %m"); + + } else if (!server_flushed_flag_is_set(s)) { + /* Try to open the runtime journal, but only if it already exists, so that we can + * flush it into the system journal */ + + r = server_open_journal( + s, + /* reliably= */ false, + fn, + O_RDWR, + /* seal= */ false, + &s->runtime_storage.metrics, + &s->runtime_journal); + if (r < 0) { + if (r != -ENOENT) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to open runtime journal: %m"); + + r = 0; + } + } + + if (s->runtime_journal) { + server_add_acls(s->runtime_journal, 0); + (void) cache_space_refresh(s, &s->runtime_storage); + patch_min_use(&s->runtime_storage); + } + } + + return r; +} + +static int server_find_user_journal(Server *s, uid_t uid, JournalFile **ret) { + _cleanup_(journal_file_offline_closep) JournalFile *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(!uid_for_system_journal(uid)); + + f = ordered_hashmap_get(s->user_journals, UID_TO_PTR(uid)); + if (f) + goto found; + + if (asprintf(&p, "%s/user-" UID_FMT ".journal", s->system_storage.path, uid) < 0) + return log_oom(); + + /* Too many open? 
Then let's close one (or more) */
+        while (ordered_hashmap_size(s->user_journals) >= USER_JOURNALS_MAX) {
+                JournalFile *first;
+
+                assert_se(first = ordered_hashmap_steal_first(s->user_journals));
+                (void) journal_file_offline_close(first);
+        }
+
+        r = server_open_journal(
+                        s,
+                        /* reliably= */ true,
+                        p,
+                        O_RDWR|O_CREAT,
+                        s->seal,
+                        &s->system_storage.metrics,
+                        &f);
+        if (r < 0)
+                return r;
+
+        /* Remember the open file, keyed by UID, so that later lookups find it in the hashmap. */
+        r = ordered_hashmap_put(s->user_journals, UID_TO_PTR(uid), f);
+        if (r < 0)
+                return r;
+
+        server_add_acls(f, uid);
+
+found:
+        /* Hand the file to the caller and disarm the cleanup handler attached to f. */
+        *ret = TAKE_PTR(f);
+        return 0;
+}
+
+/* Returns the journal file an entry of the specified UID shall be written to: the runtime journal if
+ * it is open, otherwise (in auto/persistent storage mode only) the per-user journal or the system
+ * journal. May return NULL. */
+static JournalFile* server_find_journal(Server *s, uid_t uid) {
+        int r;
+
+        assert(s);
+
+        /* A rotate that fails to create the new journal (ENOSPC) leaves the rotated journal as NULL.  Unless
+         * we revisit opening, even after space is made available we'll continue to return NULL indefinitely.
+         *
+         * system_journal_open() is a noop if the journals are already open, so we can just call it here to
+         * recover from failed rotates (or anything else that's left the journals as NULL).
+         *
+         * Fixes https://github.com/systemd/systemd/issues/3968 */
+        (void) server_system_journal_open(s, /* flush_requested= */ false, /* relinquish_requested= */ false);
+
+        /* We split up user logs only on /var, not on /run. If the runtime file is open, we write to it
+         * exclusively, in order to guarantee proper order as soon as we flush /run to /var and close the
+         * runtime file. */
+
+        if (s->runtime_journal)
+                return s->runtime_journal;
+
+        /* If we are not in persistent mode, then we need to return NULL immediately rather than opening a
+         * persistent journal of any sort.
+ * + * Fixes https://github.com/systemd/systemd/issues/20390 */ + if (!IN_SET(s->storage, STORAGE_AUTO, STORAGE_PERSISTENT)) + return NULL; + + if (!uid_for_system_journal(uid)) { + JournalFile *f = NULL; + + r = server_find_user_journal(s, uid, &f); + if (r >= 0) + return ASSERT_PTR(f); + + log_warning_errno(r, "Failed to open user journal file, falling back to system journal: %m"); + } + + return s->system_journal; +} + +static int server_do_rotate( + Server *s, + JournalFile **f, + const char* name, + bool seal, + uint32_t uid) { + + JournalFileFlags file_flags; + int r; + + assert(s); + + if (!*f) + return -EINVAL; + + file_flags = + (s->compress.enabled ? JOURNAL_COMPRESS : 0)| + (seal ? JOURNAL_SEAL : 0) | + JOURNAL_STRICT_ORDER; + + r = journal_file_rotate(f, s->mmap, file_flags, s->compress.threshold_bytes, s->deferred_closes); + if (r < 0) { + if (*f) + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to rotate %s: %m", (*f)->path); + else + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to create new %s journal: %m", name); + } + + server_add_acls(*f, uid); + return r; +} + +static void server_process_deferred_closes(Server *s) { + JournalFile *f; + + /* Perform any deferred closes which aren't still offlining. */ + SET_FOREACH(f, s->deferred_closes) { + if (journal_file_is_offlining(f)) + continue; + + (void) set_remove(s->deferred_closes, f); + (void) journal_file_offline_close(f); + } +} + +static void server_vacuum_deferred_closes(Server *s) { + assert(s); + + /* Make some room in the deferred closes list, so that it doesn't grow without bounds */ + if (set_size(s->deferred_closes) < DEFERRED_CLOSES_MAX) + return; + + /* Let's first remove all journal files that might already have completed closing */ + server_process_deferred_closes(s); + + /* And now, let's close some more until we reach the limit again. 
*/ + while (set_size(s->deferred_closes) >= DEFERRED_CLOSES_MAX) { + JournalFile *f; + + assert_se(f = set_steal_first(s->deferred_closes)); + journal_file_offline_close(f); + } +} + +static int server_archive_offline_user_journals(Server *s) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(s); + + d = opendir(s->system_storage.path); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to open %s: %m", s->system_storage.path); + } + + for (;;) { + _cleanup_free_ char *full = NULL; + _cleanup_close_ int fd = -EBADF; + struct dirent *de; + JournalFile *f; + uid_t uid; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + log_ratelimit_warning_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to enumerate %s, ignoring: %m", + s->system_storage.path); + break; + } + + r = journal_file_parse_uid_from_filename(de->d_name, &uid); + if (r < 0) { + /* Don't warn if the file is not an online or offline user journal. */ + if (r != -EREMOTE) + log_warning_errno(r, "Failed to parse UID from file name '%s', ignoring: %m", de->d_name); + continue; + } + + /* Already rotated in the above loop? i.e. is it an open user journal? */ + if (ordered_hashmap_contains(s->user_journals, UID_TO_PTR(uid))) + continue; + + full = path_join(s->system_storage.path, de->d_name); + if (!full) + return log_oom(); + + fd = openat(dirfd(d), de->d_name, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|O_NONBLOCK); + if (fd < 0) { + log_ratelimit_full_errno(IN_SET(errno, ELOOP, ENOENT) ? LOG_DEBUG : LOG_WARNING, + errno, JOURNAL_LOG_RATELIMIT, + "Failed to open journal file '%s' for rotation: %m", full); + continue; + } + + /* Make some room in the set of deferred close()s */ + server_vacuum_deferred_closes(s); + + /* Open the file briefly, so that we can archive it */ + r = journal_file_open( + fd, + full, + O_RDWR, + (s->compress.enabled ? JOURNAL_COMPRESS : 0) | + (s->seal ? 
JOURNAL_SEAL : 0), /* strict order does not matter here */ + 0640, + s->compress.threshold_bytes, + &s->system_storage.metrics, + s->mmap, + /* template= */ NULL, + &f); + if (r < 0) { + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to read journal file %s for rotation, trying to move it out of the way: %m", + full); + + r = journal_file_dispose(dirfd(d), de->d_name); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to move %s out of the way, ignoring: %m", + full); + else + log_debug("Successfully moved %s out of the way.", full); + + continue; + } + + TAKE_FD(fd); /* Donated to journal_file_open() */ + + journal_file_write_final_tag(f); + r = journal_file_archive(f, NULL); + if (r < 0) + log_debug_errno(r, "Failed to archive journal file '%s', ignoring: %m", full); + + journal_file_initiate_close(TAKE_PTR(f), s->deferred_closes); + } + + return 0; +} + +void server_rotate(Server *s) { + JournalFile *f; + void *k; + int r; + + log_debug("Rotating..."); + + /* First, rotate the system journal (either in its runtime flavour or in its persistent flavour) */ + (void) server_do_rotate(s, &s->runtime_journal, "runtime", /* seal= */ false, /* uid= */ 0); + (void) server_do_rotate(s, &s->system_journal, "system", s->seal, /* uid= */ 0); + + /* Then, rotate all user journals we have open (keeping them open) */ + ORDERED_HASHMAP_FOREACH_KEY(f, k, s->user_journals) { + r = server_do_rotate(s, &f, "user", s->seal, PTR_TO_UID(k)); + if (r >= 0) + ordered_hashmap_replace(s->user_journals, k, f); + else if (!f) + /* Old file has been closed and deallocated */ + ordered_hashmap_remove(s->user_journals, k); + } + + /* Finally, also rotate all user journals we currently do not have open. (But do so only if we + * actually have access to /var, i.e. are not in the log-to-runtime-journal mode). 
*/ + if (!s->runtime_journal) + (void) server_archive_offline_user_journals(s); + + server_process_deferred_closes(s); +} + +void server_sync(Server *s) { + JournalFile *f; + int r; + + if (s->system_journal) { + r = journal_file_set_offline(s->system_journal, false); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to sync system journal, ignoring: %m"); + } + + ORDERED_HASHMAP_FOREACH(f, s->user_journals) { + r = journal_file_set_offline(f, false); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to sync user journal, ignoring: %m"); + } + + if (s->sync_event_source) { + r = sd_event_source_set_enabled(s->sync_event_source, SD_EVENT_OFF); + if (r < 0) + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to disable sync timer source: %m"); + } + + s->sync_scheduled = false; +} + +static void server_do_vacuum(Server *s, JournalStorage *storage, bool verbose) { + + int r; + + assert(s); + assert(storage); + + (void) cache_space_refresh(s, storage); + + if (verbose) + server_space_usage_message(s, storage); + + r = journal_directory_vacuum(storage->path, storage->space.limit, + storage->metrics.n_max_files, s->max_retention_usec, + &s->oldest_file_usec, verbose); + if (r < 0 && r != -ENOENT) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to vacuum %s, ignoring: %m", storage->path); + + cache_space_invalidate(&storage->space); +} + +void server_vacuum(Server *s, bool verbose) { + assert(s); + + log_debug("Vacuuming..."); + + s->oldest_file_usec = 0; + + if (s->system_journal) + server_do_vacuum(s, &s->system_storage, verbose); + if (s->runtime_journal) + server_do_vacuum(s, &s->runtime_storage, verbose); +} + +static void server_cache_machine_id(Server *s) { + sd_id128_t id; + int r; + + assert(s); + + r = sd_id128_get_machine(&id); + if (r < 0) + return; + + sd_id128_to_string(id, stpcpy(s->machine_id_field, "_MACHINE_ID=")); +} + +static void server_cache_boot_id(Server *s) { + 
sd_id128_t id; + int r; + + assert(s); + + r = sd_id128_get_boot(&id); + if (r < 0) + return; + + sd_id128_to_string(id, stpcpy(s->boot_id_field, "_BOOT_ID=")); +} + +static void server_cache_hostname(Server *s) { + _cleanup_free_ char *t = NULL; + char *x; + + assert(s); + + t = gethostname_malloc(); + if (!t) + return; + + x = strjoin("_HOSTNAME=", t); + if (!x) + return; + + free_and_replace(s->hostname_field, x); +} + +static bool shall_try_append_again(JournalFile *f, int r) { + switch (r) { + + case -E2BIG: /* Hit configured limit */ + case -EFBIG: /* Hit fs limit */ + case -EDQUOT: /* Quota limit hit */ + case -ENOSPC: /* Disk full */ + log_debug_errno(r, "%s: Allocation limit reached, rotating.", f->path); + return true; + + case -EROFS: /* Read-only file system */ + /* When appending an entry fails if shall_try_append_again returns true, the journal is + * rotated. If the FS is read-only, rotation will fail and s->system_journal will be set to + * NULL. After that, when find_journal will try to open the journal since s->system_journal + * will be NULL, it will open the runtime journal. 
*/ + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Read-only file system, rotating.", f->path); + return true; + + case -EIO: /* I/O error of some kind (mmap) */ + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "%s: IO error, rotating.", f->path); + return true; + + case -EHOSTDOWN: /* Other machine */ + log_ratelimit_info_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Journal file from other machine, rotating.", f->path); + return true; + + case -EBUSY: /* Unclean shutdown */ + log_ratelimit_info_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Unclean shutdown, rotating.", f->path); + return true; + + case -EPROTONOSUPPORT: /* Unsupported feature */ + log_ratelimit_info_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Unsupported feature, rotating.", f->path); + return true; + + case -EBADMSG: /* Corrupted */ + case -ENODATA: /* Truncated */ + case -ESHUTDOWN: /* Already archived */ + case -EADDRNOTAVAIL: /* Referenced object offset out of bounds */ + log_ratelimit_info_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Journal file corrupted, rotating.", f->path); + return true; + + case -EIDRM: /* Journal file has been deleted */ + log_ratelimit_info_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Journal file has been deleted, rotating.", f->path); + return true; + + case -EREMCHG: /* Wallclock time (CLOCK_REALTIME) jumped backwards relative to last journal entry */ + log_ratelimit_info_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Realtime clock jumped backwards relative to last journal entry, rotating.", f->path); + return true; + + case -ENOTNAM: /* Monotonic time (CLOCK_MONOTONIC) jumped backwards relative to last journal entry with the same boot ID */ + log_ratelimit_info_errno( + r, + JOURNAL_LOG_RATELIMIT, + "%s: Monotonic clock jumped backwards relative to last journal entry with the same boot ID, rotating.", + f->path); + return true; + + case -EILSEQ: /* seqnum ID last used in the file doesn't match the one we'd passed when writing an entry to it */ + log_ratelimit_info_errno(r, 
JOURNAL_LOG_RATELIMIT, "%s: Journal file uses a different sequence number ID, rotating.", f->path); + return true; + + case -EAFNOSUPPORT: + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Underlying file system does not support memory mapping or another required file system feature.", f->path); + return false; + + default: + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "%s: Unexpected error while writing to journal file: %m", f->path); + return false; + } +} + +static void server_write_to_journal( + Server *s, + uid_t uid, + const struct iovec *iovec, + size_t n, + int priority) { + + bool vacuumed = false, rotate = false; + struct dual_timestamp ts; + JournalFile *f; + int r; + + assert(s); + assert(iovec); + assert(n > 0); + + /* Get the closest, linearized time we have for this log event from the event loop. (Note that we do not use + * the source time, and not even the time the event was originally seen, but instead simply the time we started + * processing it, as we want strictly linear ordering in what we write out.) */ + assert_se(sd_event_now(s->event, CLOCK_REALTIME, &ts.realtime) >= 0); + assert_se(sd_event_now(s->event, CLOCK_MONOTONIC, &ts.monotonic) >= 0); + + if (ts.realtime < s->last_realtime_clock) { + /* When the time jumps backwards, let's immediately rotate. Of course, this should not happen during + * regular operation. However, when it does happen, then we should make sure that we start fresh files + * to ensure that the entries in the journal files are strictly ordered by time, in order to ensure + * bisection works correctly. 
*/ + + log_ratelimit_info(JOURNAL_LOG_RATELIMIT, "Time jumped backwards, rotating."); + rotate = true; + } else { + + f = server_find_journal(s, uid); + if (!f) + return; + + if (journal_file_rotate_suggested(f, s->max_file_usec, LOG_DEBUG)) { + log_debug("%s: Journal header limits reached or header out-of-date, rotating.", + f->path); + rotate = true; + } + } + + if (rotate) { + server_rotate(s); + server_vacuum(s, false); + vacuumed = true; + + f = server_find_journal(s, uid); + if (!f) + return; + } + + s->last_realtime_clock = ts.realtime; + + r = journal_file_append_entry( + f, + &ts, + /* boot_id= */ NULL, + iovec, n, + &s->seqnum->seqnum, + &s->seqnum->id, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); + if (r >= 0) { + server_schedule_sync(s, priority); + return; + } + + log_debug_errno(r, "Failed to write entry to %s (%zu items, %zu bytes): %m", f->path, n, iovec_total_size(iovec, n)); + + if (!shall_try_append_again(f, r)) + return; + if (vacuumed) { + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Suppressing rotation, as we already rotated immediately before write attempt. 
Giving up."); + return; + } + + server_rotate(s); + server_vacuum(s, false); + + f = server_find_journal(s, uid); + if (!f) + return; + + log_debug_errno(r, "Retrying write."); + r = journal_file_append_entry( + f, + &ts, + /* boot_id= */ NULL, + iovec, n, + &s->seqnum->seqnum, + &s->seqnum->id, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); + if (r < 0) + log_ratelimit_error_errno(r, FAILED_TO_WRITE_ENTRY_RATELIMIT, + "Failed to write entry to %s (%zu items, %zu bytes) despite vacuuming, ignoring: %m", + f->path, n, iovec_total_size(iovec, n)); + else + server_schedule_sync(s, priority); +} + +#define IOVEC_ADD_NUMERIC_FIELD(iovec, n, value, type, isset, format, field) \ + if (isset(value)) { \ + char *k; \ + k = newa(char, STRLEN(field "=") + DECIMAL_STR_MAX(type) + 1); \ + sprintf(k, field "=" format, value); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } + +#define IOVEC_ADD_STRING_FIELD(iovec, n, value, field) \ + if (!isempty(value)) { \ + char *k; \ + k = strjoina(field "=", value); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } + +#define IOVEC_ADD_ID128_FIELD(iovec, n, value, field) \ + if (!sd_id128_is_null(value)) { \ + char *k; \ + k = newa(char, STRLEN(field "=") + SD_ID128_STRING_MAX); \ + sd_id128_to_string(value, stpcpy(k, field "=")); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } + +#define IOVEC_ADD_SIZED_FIELD(iovec, n, value, value_size, field) \ + if (value_size > 0) { \ + char *k; \ + k = newa(char, STRLEN(field "=") + value_size + 1); \ + *((char*) mempcpy(stpcpy(k, field "="), value, value_size)) = 0; \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ + } \ + +static void server_dispatch_message_real( + Server *s, + struct iovec *iovec, size_t n, size_t m, + const ClientContext *c, + const struct timeval *tv, + int priority, + pid_t object_pid) { + + char source_time[sizeof("_SOURCE_REALTIME_TIMESTAMP=") + DECIMAL_STR_MAX(usec_t)]; + _unused_ _cleanup_free_ char *cmdline1 = NULL, *cmdline2 = NULL; + uid_t journal_uid; + ClientContext *o; + + 
assert(s); + assert(iovec); + assert(n > 0); + assert(n + + N_IOVEC_META_FIELDS + + (pid_is_valid(object_pid) ? N_IOVEC_OBJECT_FIELDS : 0) + + client_context_extra_fields_n_iovec(c) <= m); + + if (c) { + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->pid, pid_t, pid_is_valid, PID_FMT, "_PID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->uid, uid_t, uid_is_valid, UID_FMT, "_UID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->gid, gid_t, gid_is_valid, GID_FMT, "_GID"); + + IOVEC_ADD_STRING_FIELD(iovec, n, c->comm, "_COMM"); /* At most TASK_COMM_LENGTH (16 bytes) */ + IOVEC_ADD_STRING_FIELD(iovec, n, c->exe, "_EXE"); /* A path, so at most PATH_MAX (4096 bytes) */ + + if (c->cmdline) + /* At most _SC_ARG_MAX (2MB usually), which is too much to put on stack. + * Let's use a heap allocation for this one. */ + cmdline1 = set_iovec_string_field(iovec, &n, "_CMDLINE=", c->cmdline); + + IOVEC_ADD_STRING_FIELD(iovec, n, c->capeff, "_CAP_EFFECTIVE"); /* Read from /proc/.../status */ + IOVEC_ADD_SIZED_FIELD(iovec, n, c->label, c->label_size, "_SELINUX_CONTEXT"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->auditid, uint32_t, audit_session_is_valid, "%" PRIu32, "_AUDIT_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->loginuid, uid_t, uid_is_valid, UID_FMT, "_AUDIT_LOGINUID"); + + IOVEC_ADD_STRING_FIELD(iovec, n, c->cgroup, "_SYSTEMD_CGROUP"); /* A path */ + IOVEC_ADD_STRING_FIELD(iovec, n, c->session, "_SYSTEMD_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->owner_uid, uid_t, uid_is_valid, UID_FMT, "_SYSTEMD_OWNER_UID"); + IOVEC_ADD_STRING_FIELD(iovec, n, c->unit, "_SYSTEMD_UNIT"); /* Unit names are bounded by UNIT_NAME_MAX */ + IOVEC_ADD_STRING_FIELD(iovec, n, c->user_unit, "_SYSTEMD_USER_UNIT"); + IOVEC_ADD_STRING_FIELD(iovec, n, c->slice, "_SYSTEMD_SLICE"); + IOVEC_ADD_STRING_FIELD(iovec, n, c->user_slice, "_SYSTEMD_USER_SLICE"); + + IOVEC_ADD_ID128_FIELD(iovec, n, c->invocation_id, "_SYSTEMD_INVOCATION_ID"); + + if (c->extra_fields_n_iovec > 0) { + memcpy(iovec + n, c->extra_fields_iovec, 
c->extra_fields_n_iovec * sizeof(struct iovec)); + n += c->extra_fields_n_iovec; + } + } + + assert(n <= m); + + if (pid_is_valid(object_pid) && client_context_get(s, object_pid, NULL, NULL, 0, NULL, &o) >= 0) { + + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->pid, pid_t, pid_is_valid, PID_FMT, "OBJECT_PID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->uid, uid_t, uid_is_valid, UID_FMT, "OBJECT_UID"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->gid, gid_t, gid_is_valid, GID_FMT, "OBJECT_GID"); + + /* See above for size limits, only ->cmdline may be large, so use a heap allocation for it. */ + IOVEC_ADD_STRING_FIELD(iovec, n, o->comm, "OBJECT_COMM"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->exe, "OBJECT_EXE"); + if (o->cmdline) + cmdline2 = set_iovec_string_field(iovec, &n, "OBJECT_CMDLINE=", o->cmdline); + + IOVEC_ADD_STRING_FIELD(iovec, n, o->capeff, "OBJECT_CAP_EFFECTIVE"); + IOVEC_ADD_SIZED_FIELD(iovec, n, o->label, o->label_size, "OBJECT_SELINUX_CONTEXT"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->auditid, uint32_t, audit_session_is_valid, "%" PRIu32, "OBJECT_AUDIT_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->loginuid, uid_t, uid_is_valid, UID_FMT, "OBJECT_AUDIT_LOGINUID"); + + IOVEC_ADD_STRING_FIELD(iovec, n, o->cgroup, "OBJECT_SYSTEMD_CGROUP"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->session, "OBJECT_SYSTEMD_SESSION"); + IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->owner_uid, uid_t, uid_is_valid, UID_FMT, "OBJECT_SYSTEMD_OWNER_UID"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->unit, "OBJECT_SYSTEMD_UNIT"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->user_unit, "OBJECT_SYSTEMD_USER_UNIT"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->slice, "OBJECT_SYSTEMD_SLICE"); + IOVEC_ADD_STRING_FIELD(iovec, n, o->user_slice, "OBJECT_SYSTEMD_USER_SLICE"); + + IOVEC_ADD_ID128_FIELD(iovec, n, o->invocation_id, "OBJECT_SYSTEMD_INVOCATION_ID"); /* no trailing "=" here: the macro appends it itself */ + } + + assert(n <= m); + + if (tv) { + sprintf(source_time, "_SOURCE_REALTIME_TIMESTAMP=" USEC_FMT, timeval_load(tv)); + iovec[n++] = IOVEC_MAKE_STRING(source_time); + } + + /* 
Note that strictly speaking storing the boot id here is + * redundant since the entry includes this in-line + * anyway. However, we need this indexed, too. */ + if (!isempty(s->boot_id_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->boot_id_field); + + if (!isempty(s->machine_id_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->machine_id_field); + + if (!isempty(s->hostname_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->hostname_field); + + if (!isempty(s->namespace_field)) + iovec[n++] = IOVEC_MAKE_STRING(s->namespace_field); + + iovec[n++] = in_initrd() ? IOVEC_MAKE_STRING("_RUNTIME_SCOPE=initrd") : IOVEC_MAKE_STRING("_RUNTIME_SCOPE=system"); + assert(n <= m); + + if (s->split_mode == SPLIT_UID && c && uid_is_valid(c->uid)) + /* Split up strictly by (non-root) UID */ + journal_uid = c->uid; + else if (s->split_mode == SPLIT_LOGIN && c && c->uid > 0 && uid_is_valid(c->owner_uid)) + /* Split up by login UIDs. We do this only if the + * realuid is not root, in order not to accidentally + * leak privileged information to the user that is + * logged by a privileged process that is part of an + * unprivileged session. */ + journal_uid = c->owner_uid; + else + journal_uid = 0; + + server_write_to_journal(s, journal_uid, iovec, n, priority); +} + +void server_driver_message(Server *s, pid_t object_pid, const char *message_id, const char *format, ...) 
{ + + struct iovec *iovec; + size_t n = 0, k, m; + va_list ap; + int r; + + assert(s); + assert(format); + + m = N_IOVEC_META_FIELDS + 5 + N_IOVEC_PAYLOAD_FIELDS + client_context_extra_fields_n_iovec(s->my_context) + N_IOVEC_OBJECT_FIELDS; + iovec = newa(struct iovec, m); + + assert_cc(3 == LOG_FAC(LOG_DAEMON)); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=3"); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=systemd-journald"); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=driver"); + assert_cc(6 == LOG_INFO); + iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=6"); + + if (message_id) + iovec[n++] = IOVEC_MAKE_STRING(message_id); + k = n; + + va_start(ap, format); + r = log_format_iovec(iovec, m, &n, false, 0, format, ap); + /* Error handling below */ + va_end(ap); + + if (r >= 0) + server_dispatch_message_real(s, iovec, n, m, s->my_context, /* tv= */ NULL, LOG_INFO, object_pid); + + while (k < n) + free(iovec[k++].iov_base); + + if (r < 0) { + /* We failed to format the message. Emit a warning instead. */ + char buf[LINE_MAX]; + + errno = -r; + xsprintf(buf, "MESSAGE=Entry printing failed: %m"); + + n = 3; + iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=4"); + iovec[n++] = IOVEC_MAKE_STRING(buf); + server_dispatch_message_real(s, iovec, n, m, s->my_context, /* tv= */ NULL, LOG_INFO, object_pid); + } +} + +void server_dispatch_message( + Server *s, + struct iovec *iovec, size_t n, size_t m, + ClientContext *c, + const struct timeval *tv, + int priority, + pid_t object_pid) { + + uint64_t available = 0; + int rl; + + assert(s); + assert(iovec || n == 0); + + if (n == 0) + return; + + if (LOG_PRI(priority) > s->max_level_store) + return; + + /* Stop early in case the information will not be stored + * in a journal. 
*/ + if (s->storage == STORAGE_NONE) + return; + + if (c && c->unit) { + (void) server_determine_space(s, &available, /* limit= */ NULL); + + rl = journal_ratelimit_test(s->ratelimit, c->unit, c->log_ratelimit_interval, c->log_ratelimit_burst, priority & LOG_PRIMASK, available); + if (rl == 0) + return; + + /* Write a suppression message if we suppressed something */ + if (rl > 1) + server_driver_message(s, c->pid, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_DROPPED_STR, + LOG_MESSAGE("Suppressed %i messages from %s", rl - 1, c->unit), + "N_DROPPED=%i", rl - 1, + NULL); + } + + server_dispatch_message_real(s, iovec, n, m, c, tv, priority, object_pid); +} + +int server_flush_to_var(Server *s, bool require_flag_file) { + sd_journal *j = NULL; + const char *fn; + unsigned n = 0; + usec_t start; + int r, k; + + assert(s); + + if (!IN_SET(s->storage, STORAGE_AUTO, STORAGE_PERSISTENT)) + return 0; + + if (s->namespace) /* Flushing concept does not exist for namespace instances */ + return 0; + + if (!s->runtime_journal) /* Nothing to flush? 
*/ + return 0; + + if (require_flag_file && !server_flushed_flag_is_set(s)) + return 0; + + (void) server_system_journal_open(s, /* flush_requested=*/ true, /* relinquish_requested= */ false); + + if (!s->system_journal) + return 0; + + log_debug("Flushing to %s...", s->system_storage.path); + + start = now(CLOCK_MONOTONIC); + + r = sd_journal_open(&j, SD_JOURNAL_RUNTIME_ONLY); + if (r < 0) + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to read runtime journal: %m"); + + sd_journal_set_data_threshold(j, 0); + + SD_JOURNAL_FOREACH(j) { + Object *o = NULL; + JournalFile *f; + + f = j->current_file; + assert(f && f->current_offset > 0); + + n++; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) { + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Can't read entry: %m"); + goto finish; + } + + r = journal_file_copy_entry( + f, + s->system_journal, + o, + f->current_offset, + &s->seqnum->seqnum, + &s->seqnum->id); + if (r >= 0) + continue; + + if (!shall_try_append_again(s->system_journal, r)) { + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Can't write entry: %m"); + goto finish; + } + + log_ratelimit_info(JOURNAL_LOG_RATELIMIT, "Rotating system journal."); + + server_rotate(s); + server_vacuum(s, false); + + if (!s->system_journal) { + log_ratelimit_notice(JOURNAL_LOG_RATELIMIT, + "Didn't flush runtime journal since rotation of system journal wasn't successful."); + r = -EIO; + goto finish; + } + + log_debug("Retrying write."); + r = journal_file_copy_entry( + f, + s->system_journal, + o, + f->current_offset, + &s->seqnum->seqnum, + &s->seqnum->id); + if (r < 0) { + log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Can't write entry: %m"); + goto finish; + } + } + + r = 0; + +finish: + if (s->system_journal) + journal_file_post_change(s->system_journal); + + s->runtime_journal = journal_file_offline_close(s->runtime_journal); + + if (r >= 0) + (void) rm_rf(s->runtime_storage.path, 
REMOVE_ROOT); + + sd_journal_close(j); + + server_driver_message(s, 0, NULL, + LOG_MESSAGE("Time spent on flushing to %s is %s for %u entries.", + s->system_storage.path, + FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0), + n), + NULL); + + fn = strjoina(s->runtime_directory, "/flushed"); + k = touch(fn); + if (k < 0) + log_ratelimit_warning_errno(k, JOURNAL_LOG_RATELIMIT, + "Failed to touch %s, ignoring: %m", fn); + + server_refresh_idle_timer(s); + return r; +} + +static int server_relinquish_var(Server *s) { + const char *fn; + assert(s); + + if (s->storage == STORAGE_NONE) + return 0; + + if (s->namespace) /* Concept does not exist for namespaced instances */ + return -EOPNOTSUPP; + + if (s->runtime_journal && !s->system_journal) + return 0; + + log_debug("Relinquishing %s...", s->system_storage.path); + + (void) server_system_journal_open(s, /* flush_requested */ false, /* relinquish_requested=*/ true); + + s->system_journal = journal_file_offline_close(s->system_journal); + ordered_hashmap_clear_with_destructor(s->user_journals, journal_file_offline_close); + set_clear_with_destructor(s->deferred_closes, journal_file_offline_close); + + fn = strjoina(s->runtime_directory, "/flushed"); + if (unlink(fn) < 0 && errno != ENOENT) + log_ratelimit_warning_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to unlink %s, ignoring: %m", fn); + + server_refresh_idle_timer(s); + return 0; +} + +int server_process_datagram( + sd_event_source *es, + int fd, + uint32_t revents, + void *userdata) { + + size_t label_len = 0, m; + Server *s = ASSERT_PTR(userdata); + struct ucred *ucred = NULL; + struct timeval tv_buf, *tv = NULL; + struct cmsghdr *cmsg; + char *label = NULL; + struct iovec iovec; + ssize_t n; + int *fds = NULL, v = 0; + size_t n_fds = 0; + + /* We use NAME_MAX space for the SELinux label here. 
The kernel currently enforces no limit, but + * according to suggestions from the SELinux people this will change and it will probably be + * identical to NAME_MAX. For now we use that, but this should be updated one day when the final + * limit is known. + * + * Here, we need to explicitly initialize the buffer with zero, as glibc has a bug in + * __convert_scm_timestamps(), which assumes the buffer is initialized. See #20741. */ + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE_TIMEVAL + + CMSG_SPACE(sizeof(int)) + /* fd */ + CMSG_SPACE(NAME_MAX) /* selinux label */) control = {}; + + union sockaddr_union sa = {}; + + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_name = &sa, + .msg_namelen = sizeof(sa), + }; + + assert(fd == s->native_fd || fd == s->syslog_fd || fd == s->audit_fd); + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Got invalid event from epoll for datagram fd: %" PRIx32, + revents); + + /* Try to get the right size, if we can. (Not all sockets support SIOCINQ, hence we just try, but don't rely on + * it.) */ + (void) ioctl(fd, SIOCINQ, &v); + + /* Fix it up, if it is too small. We use the same fixed value as auditd here. Awful! 
*/ + m = PAGE_ALIGN(MAX3((size_t) v + 1, + (size_t) LINE_MAX, + ALIGN(sizeof(struct nlmsghdr)) + ALIGN((size_t) MAX_AUDIT_MESSAGE_LENGTH)) + 1); + + if (!GREEDY_REALLOC(s->buffer, m)) + return log_oom(); + + iovec = IOVEC_MAKE(s->buffer, MALLOC_ELEMENTSOF(s->buffer) - 1); /* Leave room for trailing NUL we add later */ + + n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (n < 0) { + if (ERRNO_IS_TRANSIENT(n)) + return 0; + if (n == -EXFULL) { + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, + "Got message with truncated control data (too many fds sent?), ignoring."); + return 0; + } + return log_ratelimit_error_errno(n, JOURNAL_LOG_RATELIMIT, "recvmsg() failed: %m"); + } + + CMSG_FOREACH(cmsg, &msghdr) + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + assert(!ucred); + ucred = CMSG_TYPED_DATA(cmsg, struct ucred); + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_SECURITY) { + assert(!label); + label = CMSG_TYPED_DATA(cmsg, char); + label_len = cmsg->cmsg_len - CMSG_LEN(0); + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_TIMESTAMP && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct timeval))) { + assert(!tv); + tv = memcpy(&tv_buf, CMSG_DATA(cmsg), sizeof(struct timeval)); + } else if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + assert(!fds); + fds = CMSG_TYPED_DATA(cmsg, int); + n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + } + + /* And a trailing NUL, just in case */ + s->buffer[n] = 0; + + if (fd == s->syslog_fd) { + if (n > 0 && n_fds == 0) + server_process_syslog_message(s, s->buffer, n, ucred, tv, label, label_len); + else if (n_fds > 0) + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, + "Got file descriptors via syslog socket. 
Ignoring."); + + } else if (fd == s->native_fd) { + if (n > 0 && n_fds == 0) + server_process_native_message(s, s->buffer, n, ucred, tv, label, label_len); + else if (n == 0 && n_fds == 1) + server_process_native_file(s, fds[0], ucred, tv, label, label_len); + else if (n_fds > 0) + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, + "Got too many file descriptors via native socket. Ignoring."); + + } else { + assert(fd == s->audit_fd); + + if (n > 0 && n_fds == 0) + server_process_audit_message(s, s->buffer, n, ucred, &sa, msghdr.msg_namelen); + else if (n_fds > 0) + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, + "Got file descriptors via audit socket. Ignoring."); + } + + close_many(fds, n_fds); + + server_refresh_idle_timer(s); + return 0; +} + +static void server_full_flush(Server *s) { + assert(s); + + (void) server_flush_to_var(s, false); + server_sync(s); + server_vacuum(s, false); + + server_space_usage_message(s, NULL); + + server_refresh_idle_timer(s); +} + +static int dispatch_sigusr1(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + if (s->namespace) { + log_error("Received SIGUSR1 signal from PID %u, but flushing runtime journals not supported for namespaced instances.", si->ssi_pid); + return 0; + } + + log_info("Received SIGUSR1 signal from PID %u, as request to flush runtime journal.", si->ssi_pid); + server_full_flush(s); + + return 0; +} + +static void server_full_rotate(Server *s) { + const char *fn; + int r; + + assert(s); + + server_rotate(s); + server_vacuum(s, true); + + if (s->system_journal) + patch_min_use(&s->system_storage); + if (s->runtime_journal) + patch_min_use(&s->runtime_storage); + + /* Let clients know when the most recent rotation happened. 
*/ + fn = strjoina(s->runtime_directory, "/rotated"); + r = write_timestamp_file_atomic(fn, now(CLOCK_MONOTONIC)); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to write %s, ignoring: %m", fn); +} + /* SIGUSR2 handler: logs the request and performs a full rotation. Always returns 0. */ +static int dispatch_sigusr2(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + log_info("Received SIGUSR2 signal from PID %u, as request to rotate journal, rotating.", si->ssi_pid); + server_full_rotate(s); + + return 0; +} + /* Handler registered for both SIGTERM and SIGINT: logs the signal, disables its own event source and queues the two exit event sources described in the body. Always returns 0. */ +static int dispatch_sigterm(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *news = NULL; + Server *s = ASSERT_PTR(userdata); + int r; + + log_received_signal(LOG_INFO, si); + + (void) sd_event_source_set_enabled(es, SD_EVENT_OFF); /* Make sure this handler is called at most once */ + + /* So on one hand we want to ensure that SIGTERMs are definitely handled in appropriate, bounded + * time. On the other hand we want that everything pending is first comprehensively processed and + * written to disk. These goals are incompatible, hence we try to find a middle ground: we'll process + * SIGTERM with high priority, but from the handler (this one right here) we'll install two new event + * sources: one low priority idle one that will issue the exit once everything else is processed (and + * which is hopefully the regular, clean codepath); and one high priority timer that acts as safety + * net: if our idle handler isn't run within 10s, we'll exit anyway. + * + * TLDR: we'll exit either when everything is processed, or after 10s max, depending on what happens + * first. 
+ * + * Note that exiting before the idle event is hit doesn't typically mean that we lose any data, as + * messages will remain queued in the sockets they came in from, and thus can be processed when we + * start up next – unless we are going down for the final system shutdown, in which case everything + * is lost. */ + + r = sd_event_add_defer(s->event, &news, NULL, NULL); /* NULL handler means → exit when triggered */ + if (r < 0) { + log_error_errno(r, "Failed to allocate exit idle event handler: %m"); + goto fail; + } + + (void) sd_event_source_set_description(news, "exit-idle"); + + /* Run everything relevant before this. */ + r = sd_event_source_set_priority(news, SD_EVENT_PRIORITY_NORMAL+20); + if (r < 0) { + log_error_errno(r, "Failed to adjust priority of exit idle event handler: %m"); + goto fail; + } + + /* Give up ownership, so that this event source is freed automatically when the event loop is freed. */ + r = sd_event_source_set_floating(news, true); + if (r < 0) { + log_error_errno(r, "Failed to make exit idle event handler floating: %m"); + goto fail; + } + + news = sd_event_source_unref(news); + /* Second source: a 10s relative timer, again with a NULL handler, set up the same way as the idle source above. */ + r = sd_event_add_time_relative(s->event, &news, CLOCK_MONOTONIC, 10 * USEC_PER_SEC, 0, NULL, NULL); + if (r < 0) { + log_error_errno(r, "Failed to allocate exit timeout event handler: %m"); + goto fail; + } + + (void) sd_event_source_set_description(news, "exit-timeout"); + + r = sd_event_source_set_priority(news, SD_EVENT_PRIORITY_IMPORTANT-20); /* This is a safety net, with highest priority */ + if (r < 0) { + log_error_errno(r, "Failed to adjust priority of exit timeout event handler: %m"); + goto fail; + } + + r = sd_event_source_set_floating(news, true); + if (r < 0) { + log_error_errno(r, "Failed to make exit timeout event handler floating: %m"); + goto fail; + } + + news = sd_event_source_unref(news); + + log_debug("Exit event sources are now pending."); + return 0; + /* Every setup failure above lands here: request event loop exit immediately. The handler still returns 0. */ +fail: + sd_event_exit(s->event, 0); + return 0; +} + /* server_full_sync(): syncs the journals, then writes a timestamp file named synced into the runtime directory; a failed write is only warned about. */ +static void 
server_full_sync(Server *s) { + const char *fn; + int r; + + assert(s); + + server_sync(s); + + /* Let clients know when the most recent sync happened. */ + fn = strjoina(s->runtime_directory, "/synced"); + r = write_timestamp_file_atomic(fn, now(CLOCK_MONOTONIC)); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to write %s, ignoring: %m", fn); + + return; +} + /* Handler for the sync request signal: logs at debug level and performs a full sync. Always returns 0. */ +static int dispatch_sigrtmin1(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + log_debug("Received SIGRTMIN1 signal from PID %u, as request to sync.", si->ssi_pid); + server_full_sync(s); + + return 0; +} + /* Calls sigprocmask_many() with SIG_SETMASK for the signals handled here (presumably blocking them so they are delivered through the event sources; confirm against sigprocmask_many()), then registers one event source per signal. Returns 0, or the first negative sd-event error. */ +static int server_setup_signals(Server *s) { + int r; + + assert(s); + + assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, SIGUSR1, SIGUSR2, SIGRTMIN+1, SIGRTMIN+18, -1) >= 0); + + r = sd_event_add_signal(s->event, &s->sigusr1_event_source, SIGUSR1, dispatch_sigusr1, s); + if (r < 0) + return r; + + r = sd_event_add_signal(s->event, &s->sigusr2_event_source, SIGUSR2, dispatch_sigusr2, s); + if (r < 0) + return r; + + r = sd_event_add_signal(s->event, &s->sigterm_event_source, SIGTERM, dispatch_sigterm, s); + if (r < 0) + return r; + + /* Let's process SIGTERM early, so that we definitely react to it */ + r = sd_event_source_set_priority(s->sigterm_event_source, SD_EVENT_PRIORITY_IMPORTANT-10); + if (r < 0) + return r; + + /* When journald is invoked on the terminal (when debugging), it's useful if C-c is handled + * equivalent to SIGTERM. */ + r = sd_event_add_signal(s->event, &s->sigint_event_source, SIGINT, dispatch_sigterm, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->sigint_event_source, SD_EVENT_PRIORITY_IMPORTANT-10); + if (r < 0) + return r; + + /* SIGRTMIN+1 causes an immediate sync. We process this very late, so that everything else queued at + * this point is really written to disk. 
Clients can watch /run/systemd/journal/synced with inotify + * until its mtime changes to see when a sync happened. */ + r = sd_event_add_signal(s->event, &s->sigrtmin1_event_source, SIGRTMIN+1, dispatch_sigrtmin1, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->sigrtmin1_event_source, SD_EVENT_PRIORITY_NORMAL+15); + if (r < 0) + return r; + /* This source is not stored in the Server (NULL is passed for the return pointer) and its userdata is the sigrtmin18_info member rather than the Server itself. */ + r = sd_event_add_signal(s->event, NULL, SIGRTMIN+18, sigrtmin18_handler, &s->sigrtmin18_info); + if (r < 0) + return r; + + return 0; +} + /* Kernel command line callback: applies the systemd.journald.* switches to the Server passed as data. Values that fail to parse are warned about and ignored; every visible path returns 0. */ +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + Server *s = ASSERT_PTR(data); + int r; + + if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_syslog")) { + /* For the boolean forward_to_* switches a key given without a value means true. */ + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to syslog switch \"%s\". Ignoring.", value); + else + s->forward_to_syslog = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_kmsg")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to kmsg switch \"%s\". Ignoring.", value); + else + s->forward_to_kmsg = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_console")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to console switch \"%s\". Ignoring.", value); + else + s->forward_to_console = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_wall")) { + + r = value ? parse_boolean(value) : true; + if (r < 0) + log_warning("Failed to parse forward to wall switch \"%s\". Ignoring.", value); + else + s->forward_to_wall = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_console")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level console value \"%s\". 
Ignoring.", value); + else + s->max_level_console = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_store")) { + /* Unlike the boolean switches, the max_level_* options return 0 without changing anything when proc_cmdline_value_missing() reports a missing value. */ + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level store value \"%s\". Ignoring.", value); + else + s->max_level_store = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_syslog")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level syslog value \"%s\". Ignoring.", value); + else + s->max_level_syslog = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_kmsg")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level kmsg value \"%s\". Ignoring.", value); + else + s->max_level_kmsg = r; + + } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_wall")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r < 0) + log_warning("Failed to parse max level wall value \"%s\". Ignoring.", value); + else + s->max_level_wall = r; + /* Any other key starting with the systemd.journald prefix is reported as unknown; keys outside that prefix are silently skipped. */ + } else if (startswith(key, "systemd.journald")) + log_warning("Unknown journald kernel command line option \"%s\". 
Ignoring.", key); + + /* do not warn about state here, since probably systemd already did */ + return 0; +} + /* Parses the Journal section of journald.conf, or of journald@NAMESPACE.conf when the Server has a namespace, into the Server. Returns the result of config_parse_config_file(). */ +static int server_parse_config_file(Server *s) { + const char *conf_file = "journald.conf"; + + assert(s); + + if (s->namespace) + conf_file = strjoina("journald@", s->namespace, ".conf"); + + return config_parse_config_file(conf_file, "Journal\0", + config_item_perf_lookup, journald_gperf_lookup, + CONFIG_PARSE_WARN, s); +} + /* Timer callback of the sync event source created by server_schedule_sync(): syncs and returns 0. */ +static int server_dispatch_sync(sd_event_source *es, usec_t t, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + server_sync(s); + return 0; +} + /* Syncs immediately when priority is LOG_CRIT or more severe, or when there is no event loop or it has finished. Otherwise, unless a sync is already scheduled, arms the sync timer for sync_interval_usec (creating it on first use); a zero interval schedules nothing. Returns 0, or a negative sd-event error. */ +int server_schedule_sync(Server *s, int priority) { + int r; + + assert(s); + + if (priority <= LOG_CRIT) { + /* Immediately sync to disk when this is of priority CRIT, ALERT, EMERG */ + server_sync(s); + return 0; + } + + if (!s->event || sd_event_get_state(s->event) == SD_EVENT_FINISHED) { + /* Shutting down the server? Let's sync immediately. */ + server_sync(s); + return 0; + } + + if (s->sync_scheduled) + return 0; + + if (s->sync_interval_usec > 0) { + + if (!s->sync_event_source) { + r = sd_event_add_time_relative( + s->event, + &s->sync_event_source, + CLOCK_MONOTONIC, + s->sync_interval_usec, 0, + server_dispatch_sync, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->sync_event_source, SD_EVENT_PRIORITY_IMPORTANT); + } else { + r = sd_event_source_set_time_relative(s->sync_event_source, s->sync_interval_usec); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s->sync_event_source, SD_EVENT_ONESHOT); + } /* r now holds the result of the last call of whichever branch ran */ + if (r < 0) + return r; + + s->sync_scheduled = true; + } + + return 0; +} + /* IO callback on the hostname fd: refreshes the cached hostname and returns 0. */ +static int dispatch_hostname_change(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + server_cache_hostname(s); + return 0; +} + /* Opens /proc/sys/kernel/hostname and registers the fd with the event loop, with dispatch_hostname_change() as callback. An -EPERM from the registration is tolerated: the fd is closed again and 0 is returned. Other failures are logged and returned. */ +static int server_open_hostname(Server *s) { + int r; + + assert(s); + + s->hostname_fd = open("/proc/sys/kernel/hostname", + O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if 
(s->hostname_fd < 0) + return log_error_errno(errno, "Failed to open /proc/sys/kernel/hostname: %m"); + /* The fd is registered with an event mask of 0. */ + r = sd_event_add_io(s->event, &s->hostname_event_source, s->hostname_fd, 0, dispatch_hostname_change, s); + if (r < 0) { + /* kernels prior to 3.2 don't support polling this file. Ignore + * the failure. */ + if (r == -EPERM) { + log_warning_errno(r, "Failed to register hostname fd in event loop, ignoring: %m"); + s->hostname_fd = safe_close(s->hostname_fd); + return 0; + } + + return log_error_errno(r, "Failed to register hostname fd in event loop: %m"); + } + + r = sd_event_source_set_priority(s->hostname_event_source, SD_EVENT_PRIORITY_IMPORTANT-10); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of hostname event source: %m"); + + return 0; +} + /* IO callback on the notify socket (registered for EPOLLOUT by server_connect_notify()): sends at most one pending notification per invocation and switches its own event source off once nothing is left to send. */ +static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = ASSERT_PTR(userdata); + int r; + + assert(s->notify_event_source == es); + assert(s->notify_fd == fd); + + /* The $NOTIFY_SOCKET is writable again, now send exactly one + * message on it. Either it's the watchdog event, the initial + * READY=1 event or an stdout stream event. If there's nothing + * to write anymore, turn our event source off. The next time + * there's something to send it will be turned on again. 
*/ + /* Order of precedence: the initial READY=1 message, then a pending watchdog ping, then one queued stdout stream notification. */ + if (!s->sent_notify_ready) { + static const char p[] = "READY=1\n" + "STATUS=Processing requests..."; + + if (send(s->notify_fd, p, strlen(p), MSG_DONTWAIT) < 0) { + if (errno == EAGAIN) + return 0; + /* On EAGAIN the source was left enabled and the state flag untouched, so the same message is attempted again on the next dispatch. */ + return log_error_errno(errno, "Failed to send READY=1 notification message: %m"); + } + + s->sent_notify_ready = true; + log_debug("Sent READY=1 notification."); + + } else if (s->send_watchdog) { + static const char p[] = "WATCHDOG=1"; + + if (send(s->notify_fd, p, strlen(p), MSG_DONTWAIT) < 0) { + if (errno == EAGAIN) + return 0; + + return log_error_errno(errno, "Failed to send WATCHDOG=1 notification message: %m"); + } + + s->send_watchdog = false; + log_debug("Sent WATCHDOG=1 notification."); + + } else if (s->stdout_streams_notify_queue) + /* Dispatch one stream notification event */ + stdout_stream_send_notify(s->stdout_streams_notify_queue); + + /* Leave us enabled if there's still more to do. */ + if (s->send_watchdog || s->stdout_streams_notify_queue) + return 0; + + /* There was nothing to do anymore, let's turn ourselves off. 
*/ + r = sd_event_source_set_enabled(es, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to turn off notify event source: %m"); + + return 0; +} + /* Watchdog timer callback: flags a pending watchdog ping, turns the notify event source on so that dispatch_notify_event() sends it, then re-arms this timer at the passed time plus half the watchdog period. */ +static int dispatch_watchdog(sd_event_source *es, uint64_t usec, void *userdata) { + Server *s = ASSERT_PTR(userdata); + int r; + + s->send_watchdog = true; + + r = sd_event_source_set_enabled(s->notify_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to turn on notify event source: %m"); + /* A failure to enable the notify source is only warned about; the timer is re-armed regardless. */ + r = sd_event_source_set_time(s->watchdog_event_source, usec + s->watchdog_usec / 2); + if (r < 0) + return log_error_errno(r, "Failed to restart watchdog event source: %m"); + + r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to enable watchdog event source: %m"); + + return 0; +} + /* Connects a non-blocking datagram socket to the address in $NOTIFY_SOCKET and watches it for EPOLLOUT; additionally installs the watchdog timer when sd_watchdog_enabled() returns a positive value. Returns 0 without doing anything if the variable is unset; failures are logged and returned. */ +static int server_connect_notify(Server *s) { + union sockaddr_union sa; + socklen_t sa_len; + const char *e; + int r; + + assert(s); + assert(s->notify_fd < 0); + assert(!s->notify_event_source); + + /* + * So here's the problem: we'd like to send notification messages to PID 1, but we cannot do that via + * sd_notify(), since that's synchronous, and we might end up blocking on it. Specifically: given + * that PID 1 might block on dbus-daemon during IPC, and dbus-daemon is logging to us, and might + * hence block on us, we might end up in a deadlock if we block on sending PID 1 notification + * messages — by generating a full blocking circle. To avoid this, let's create a non-blocking + * socket, and connect it to the notification socket, and then wait for POLLOUT before we send + * anything. This should efficiently avoid any deadlocks, as we'll never block on PID 1, hence PID 1 + * can safely block on dbus-daemon which can safely block on us again. + * + * Don't think that this issue is real? 
It is, see: https://github.com/systemd/systemd/issues/1505 + */ + + e = getenv("NOTIFY_SOCKET"); + if (!e) + return 0; + + r = sockaddr_un_set_path(&sa.un, e); + if (r < 0) + return log_error_errno(r, "NOTIFY_SOCKET set to invalid value '%s': %m", e); + sa_len = r; + /* The non-negative result of sockaddr_un_set_path() was kept in sa_len and is passed to connect() below as the address length. */ + s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->notify_fd < 0) + return log_error_errno(errno, "Failed to create notify socket: %m"); + + (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE); + /* The result of fd_inc_sndbuf() is deliberately ignored. */ + r = connect(s->notify_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "Failed to connect to notify socket: %m"); + + r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s); + if (r < 0) + return log_error_errno(r, "Failed to watch notification socket: %m"); + + if (sd_watchdog_enabled(false, &s->watchdog_usec) > 0) { + s->send_watchdog = true; + /* The timer is set to half the watchdog period, with a quarter of the period passed as the accuracy argument. */ + r = sd_event_add_time_relative(s->event, &s->watchdog_event_source, CLOCK_MONOTONIC, s->watchdog_usec/2, s->watchdog_usec/4, dispatch_watchdog, s); + if (r < 0) + return log_error_errno(r, "Failed to add watchdog time event: %m"); + } + + /* This should fire pretty soon, which we'll use to send the READY=1 event. */ + + return 0; +} + /* Deferred event source callback set up by vl_method_synchronize(): performs the full sync, clears the floating flag of its own event source and sends the varlink reply. */ +static int synchronize_second_half(sd_event_source *event_source, void *userdata) { + Varlink *link = ASSERT_PTR(userdata); + Server *s; + int r; + + assert_se(s = varlink_get_userdata(link)); + + /* This is the "second half" of the Synchronize() varlink method. This function is called as deferred + * event source at a low priority to ensure the synchronization completes after all queued log + * messages are processed. */ + server_full_sync(s); + + /* Let's get rid of the event source now, by marking it as non-floating again. It then has no ref + * anymore and is immediately destroyed after we return from this function, i.e. from this event + * source handler at the end. 
*/ + r = sd_event_source_set_floating(event_source, false); + if (r < 0) + return log_error_errno(r, "Failed to mark event source as non-floating: %m"); + + return varlink_reply(link, NULL); +} + /* Destroy callback of the deferred sync event source: drops the Varlink reference that vl_method_synchronize() took. */ +static void synchronize_destroy(void *userdata) { + varlink_unref(userdata); +} + /* Synchronize varlink method: rejects calls that carry parameters, then queues synchronize_second_half() as a deferred event source with the link as userdata. The reply is sent from that callback, not from this function. */ +static int vl_method_synchronize(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(sd_event_source_unrefp) sd_event_source *event_source = NULL; + Server *s = ASSERT_PTR(userdata); + int r; + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + log_info("Received client request to sync journal."); + + /* We don't do the main work now, but instead enqueue a deferred event loop job which will do + * it. That job is scheduled at low priority, so that we return from this method call only after all + * queued but not processed log messages are written to disk, so that this method call returning can + * be used as nice synchronization point. */ + r = sd_event_add_defer(s->event, &event_source, synchronize_second_half, link); + if (r < 0) + return log_error_errno(r, "Failed to allocate defer event source: %m"); + + r = sd_event_source_set_destroy_callback(event_source, synchronize_destroy); + if (r < 0) + return log_error_errno(r, "Failed to set event source destroy callback: %m"); + + varlink_ref(link); /* The varlink object is now left to the destroy callback to unref */ + + r = sd_event_source_set_priority(event_source, SD_EVENT_PRIORITY_NORMAL+15); + if (r < 0) + return log_error_errno(r, "Failed to set defer event source priority: %m"); + + /* Give up ownership of this event source. It will now be destroyed along with event loop itself, + * unless it destroys itself earlier. 
*/ + r = sd_event_source_set_floating(event_source, true); + if (r < 0) + return log_error_errno(r, "Failed to mark event source as floating: %m"); + + (void) sd_event_source_set_description(event_source, "deferred-sync"); + /* No reply is sent from here; synchronize_second_half() replies when it runs. */ + return 0; +} + /* Rotate varlink method: rejects calls that carry parameters, performs a full rotation and replies. */ +static int vl_method_rotate(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + log_info("Received client request to rotate journal, rotating."); + server_full_rotate(s); + + return varlink_reply(link, NULL); +} + /* FlushToVar varlink method: rejects calls that carry parameters, answers namespaced instances with the NotSupportedByNamespaces error, otherwise performs a full flush and replies. */ +static int vl_method_flush_to_var(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + if (s->namespace) + return varlink_error(link, "io.systemd.Journal.NotSupportedByNamespaces", NULL); + + log_info("Received client request to flush runtime journal."); + server_full_flush(s); + + return varlink_reply(link, NULL); +} + /* RelinquishVar varlink method: same parameter and namespace checks as FlushToVar, then calls server_relinquish_var() and replies. */ +static int vl_method_relinquish_var(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + if (s->namespace) + return varlink_error(link, "io.systemd.Journal.NotSupportedByNamespaces", NULL); + + log_info("Received client request to relinquish %s access.", s->system_storage.path); + server_relinquish_var(s); + + return varlink_reply(link, NULL); +} + /* Varlink connect callback: re-evaluates the idle timer, ignoring the result. Always returns 0. */ +static int vl_connect(VarlinkServer *server, Varlink *link, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(server); + assert(link); + + (void) server_start_or_stop_idle_timer(s); /* maybe we are no longer idle */ + + return 0; +} + /* vl_disconnect(): varlink disconnect callback, re-evaluates the idle timer and ignores the result. */ +static void 
vl_disconnect(VarlinkServer *server, Varlink *link, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(server); + assert(link); + + (void) server_start_or_stop_idle_timer(s); /* maybe we are idle now */ +} + /* Creates the varlink server with the Server as userdata, adds the Journal interface, binds its four methods and the connect/disconnect callbacks, starts listening and attaches the server to the event loop. Returns 0 or a negative error. */ +static int server_open_varlink(Server *s, const char *socket, int fd) { + int r; + + assert(s); + + r = varlink_server_new(&s->varlink_server, VARLINK_SERVER_ROOT_ONLY|VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return r; + + varlink_server_set_userdata(s->varlink_server, s); + + r = varlink_server_add_interface(s->varlink_server, &vl_interface_io_systemd_Journal); + if (r < 0) + return log_error_errno(r, "Failed to add Journal interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + s->varlink_server, + "io.systemd.Journal.Synchronize", vl_method_synchronize, + "io.systemd.Journal.Rotate", vl_method_rotate, + "io.systemd.Journal.FlushToVar", vl_method_flush_to_var, + "io.systemd.Journal.RelinquishVar", vl_method_relinquish_var); + if (r < 0) + return r; + + r = varlink_server_bind_connect(s->varlink_server, vl_connect); + if (r < 0) + return r; + + r = varlink_server_bind_disconnect(s->varlink_server, vl_disconnect); + if (r < 0) + return r; + /* A negative fd means the caller passed no socket: listen on the given path with mode 0600. Otherwise listen on the passed fd. */ + if (fd < 0) + r = varlink_server_listen_address(s->varlink_server, socket, 0600); + else + r = varlink_server_listen_fd(s->varlink_server, fd); + if (r < 0) + return r; + + r = varlink_server_attach_event(s->varlink_server, s->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return r; + + return 0; +} + /* Opens (creating it if needed) the file fname below the runtime directory, allocates size bytes for it with posix_fallocate_loop() and maps them shared and read-write. On success stores the mapping in *ret and returns 0; otherwise returns a negative errno-style value. fn and fd are declared with cleanup attributes (_cleanup_free_, _cleanup_close_). */ +int server_map_seqnum_file( + Server *s, + const char *fname, + size_t size, + void **ret) { + + _cleanup_free_ char *fn = NULL; + _cleanup_close_ int fd = -EBADF; + uint64_t *p; + int r; + + assert(s); + assert(fname); + assert(size > 0); + assert(ret); + + fn = path_join(s->runtime_directory, fname); + if (!fn) + return -ENOMEM; + + fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644); + if (fd < 0) + return -errno; + + r = posix_fallocate_loop(fd, 0, size); + if 
(r < 0) + return r; + + p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + return -errno; + + *ret = p; + return 0; +} + /* Unmaps a mapping of the given size. A NULL pointer is accepted and ignored; the munmap() result is checked with assert_se(). */ +void server_unmap_seqnum_file(void *p, size_t size) { + assert(size > 0); + + if (!p) + return; + + assert_se(munmap(p, size) >= 0); +} + /* Tells whether this instance counts as idle: true only for a namespaced instance whose retention limit does not exceed IDLE_TIMEOUT_USEC and which has neither varlink connections nor stdout streams. */ +static bool server_is_idle(Server *s) { + assert(s); + + /* The server for the main namespace is never idle */ + if (!s->namespace) + return false; + + /* If a retention maximum is set larger than the idle time we need to be running to enforce it, hence + * turn off the idle logic. */ + if (s->max_retention_usec > IDLE_TIMEOUT_USEC) + return false; + + /* We aren't idle if we have a varlink client */ + if (varlink_server_current_connections(s->varlink_server) > 0) + return false; + + /* If we have stdout streams we aren't idle */ + if (s->n_stdout_streams > 0) + return false; + + return true; +} + /* Idle timer callback: requests event loop exit with code 0. */ +static int server_idle_handler(sd_event_source *source, uint64_t usec, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + assert(source); + + log_debug("Server is idle, exiting."); + sd_event_exit(s->event, 0); + return 0; +} + /* Brings the idle timer in line with server_is_idle(): if not idle, the timer is disabled and released and 0 is returned; if idle, an IDLE_TIMEOUT_USEC timer is created unless one already exists and 1 is returned. Failures are logged and returned as negative values. */ +int server_start_or_stop_idle_timer(Server *s) { + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + + if (!server_is_idle(s)) { + s->idle_event_source = sd_event_source_disable_unref(s->idle_event_source); + return 0; + } + + if (s->idle_event_source) + return 1; + + r = sd_event_add_time_relative(s->event, &source, CLOCK_MONOTONIC, IDLE_TIMEOUT_USEC, 0, server_idle_handler, s); + if (r < 0) + return log_error_errno(r, "Failed to allocate idle timer: %m"); + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to set idle timer priority: %m"); + + (void) sd_event_source_set_description(source, "idle-timer"); + + s->idle_event_source = TAKE_PTR(source); + return 1; +} + /* Re-arms an existing idle timer to IDLE_TIMEOUT_USEC from now. Returns 1 if refreshed, 0 if there is no idle timer, negative on failure. */ +int server_refresh_idle_timer(Server *s) { + int r; + + assert(s); + + if 
(!s->idle_event_source) + return 0; + + r = sd_event_source_set_time_relative(s->idle_event_source, IDLE_TIMEOUT_USEC); + if (r < 0) + return log_error_errno(r, "Failed to refresh idle timer: %m"); + + return 1; +} + /* Stores a copy of the namespace name and the matching _NAMESPACE= field string in the Server. Returns 0 if namespace is NULL, 1 if it was set, and a negative value for an invalid name or on allocation failure. */ +static int server_set_namespace(Server *s, const char *namespace) { + assert(s); + + if (!namespace) + return 0; + + if (!log_namespace_name_valid(namespace)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified namespace name not valid, refusing: %s", namespace); + + s->namespace = strdup(namespace); + if (!s->namespace) + return log_oom(); + + s->namespace_field = strjoin("_NAMESPACE=", namespace); + if (!s->namespace_field) + return log_oom(); + + return 1; +} + /* Memory pressure callback: flushes the client context cache, takes every entry out of user_journals and closes it, then calls sd_event_trim_memory(). Always returns 0. */ +static int server_memory_pressure(sd_event_source *es, void *userdata) { + Server *s = ASSERT_PTR(userdata); + + log_info("Under memory pressure, flushing caches."); + + /* Flush the cached info we might have about client processes */ + client_context_flush_regular(s); + + /* Let's also close all user files (but keep the system/runtime one open) */ + for (;;) { + JournalFile *first = ordered_hashmap_steal_first(s->user_journals); + + if (!first) + break; + + (void) journal_file_offline_close(first); + } + + sd_event_trim_memory(); + + return 0; +} + /* Installs server_memory_pressure() as memory pressure event source. A failure is only logged (at debug level for not-supported, privilege and EHOSTDOWN errors, at notice level otherwise) and the function still returns 0. */ +static int server_setup_memory_pressure(Server *s) { + int r; + + assert(s); + + r = sd_event_add_memory_pressure(s->event, NULL, server_memory_pressure, s); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? 
LOG_DEBUG : LOG_NOTICE, r, + "Failed to install memory pressure event source, ignoring: %m"); + + return 0; +} + +int server_init(Server *s, const char *namespace) { + const char *native_socket, *syslog_socket, *stdout_socket, *varlink_socket, *e; + _cleanup_fdset_free_ FDSet *fds = NULL; + int n, r, fd, varlink_fd = -EBADF; + bool no_sockets; + + assert(s); + + *s = (Server) { + .syslog_fd = -EBADF, + .native_fd = -EBADF, + .stdout_fd = -EBADF, + .dev_kmsg_fd = -EBADF, + .audit_fd = -EBADF, + .hostname_fd = -EBADF, + .notify_fd = -EBADF, + + .compress.enabled = true, + .compress.threshold_bytes = UINT64_MAX, + .seal = true, + + .set_audit = true, + + .watchdog_usec = USEC_INFINITY, + + .sync_interval_usec = DEFAULT_SYNC_INTERVAL_USEC, + .sync_scheduled = false, + + .ratelimit_interval = DEFAULT_RATE_LIMIT_INTERVAL, + .ratelimit_burst = DEFAULT_RATE_LIMIT_BURST, + + .forward_to_wall = true, + + .max_file_usec = DEFAULT_MAX_FILE_USEC, + + .max_level_store = LOG_DEBUG, + .max_level_syslog = LOG_DEBUG, + .max_level_kmsg = LOG_NOTICE, + .max_level_console = LOG_INFO, + .max_level_wall = LOG_EMERG, + + .line_max = DEFAULT_LINE_MAX, + + .runtime_storage.name = "Runtime Journal", + .system_storage.name = "System Journal", + + .kmsg_own_ratelimit = { + .interval = DEFAULT_KMSG_OWN_INTERVAL, + .burst = DEFAULT_KMSG_OWN_BURST, + }, + + .sigrtmin18_info.memory_pressure_handler = server_memory_pressure, + .sigrtmin18_info.memory_pressure_userdata = s, + }; + + r = server_set_namespace(s, namespace); + if (r < 0) + return r; + + /* By default, only read from /dev/kmsg if are the main namespace */ + s->read_kmsg = !s->namespace; + s->storage = s->namespace ? 
STORAGE_PERSISTENT : STORAGE_AUTO; + + journal_reset_metrics(&s->system_storage.metrics); + journal_reset_metrics(&s->runtime_storage.metrics); + + server_parse_config_file(s); + + if (!s->namespace) { + /* Parse kernel command line, but only if we are not a namespace instance */ + r = proc_cmdline_parse(parse_proc_cmdline_item, s, PROC_CMDLINE_STRIP_RD_PREFIX); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + } + + if (!!s->ratelimit_interval != !!s->ratelimit_burst) { /* One set to 0 and the other not? */ + log_debug("Setting both rate limit interval and burst from "USEC_FMT",%u to 0,0", + s->ratelimit_interval, s->ratelimit_burst); + s->ratelimit_interval = s->ratelimit_burst = 0; + } + + e = getenv("RUNTIME_DIRECTORY"); + if (e) + s->runtime_directory = strdup(e); + else if (s->namespace) + s->runtime_directory = strjoin("/run/systemd/journal.", s->namespace); + else + s->runtime_directory = strdup("/run/systemd/journal"); + if (!s->runtime_directory) + return log_oom(); + + (void) mkdir_p(s->runtime_directory, 0755); + + s->user_journals = ordered_hashmap_new(NULL); + if (!s->user_journals) + return log_oom(); + + s->mmap = mmap_cache_new(); + if (!s->mmap) + return log_oom(); + + s->deferred_closes = set_new(NULL); + if (!s->deferred_closes) + return log_oom(); + + r = sd_event_default(&s->event); + if (r < 0) + return log_error_errno(r, "Failed to create event loop: %m"); + + n = sd_listen_fds(true); + if (n < 0) + return log_error_errno(n, "Failed to read listening file descriptors from environment: %m"); + + native_socket = strjoina(s->runtime_directory, "/socket"); + stdout_socket = strjoina(s->runtime_directory, "/stdout"); + syslog_socket = strjoina(s->runtime_directory, "/dev-log"); + varlink_socket = strjoina(s->runtime_directory, "/io.systemd.journal"); + + for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) { + + if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, native_socket, 0) > 0) { + + if 
(s->native_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many native sockets passed."); + + s->native_fd = fd; + + } else if (sd_is_socket_unix(fd, SOCK_STREAM, 1, stdout_socket, 0) > 0) { + + if (s->stdout_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many stdout sockets passed."); + + s->stdout_fd = fd; + + } else if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, syslog_socket, 0) > 0) { + + if (s->syslog_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many /dev/log sockets passed."); + + s->syslog_fd = fd; + + } else if (sd_is_socket_unix(fd, SOCK_STREAM, 1, varlink_socket, 0) > 0) { + + if (varlink_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many varlink sockets passed."); + + varlink_fd = fd; + } else if (sd_is_socket(fd, AF_NETLINK, SOCK_RAW, -1) > 0) { + + if (s->audit_fd >= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many audit sockets passed."); + + s->audit_fd = fd; + + } else { + + if (!fds) { + fds = fdset_new(); + if (!fds) + return log_oom(); + } + + r = fdset_put(fds, fd); + if (r < 0) + return log_oom(); + } + } + + /* Try to restore streams, but don't bother if this fails */ + (void) server_restore_streams(s, fds); + + if (fdset_size(fds) > 0) { + log_warning("%u unknown file descriptors passed, closing.", fdset_size(fds)); + fds = fdset_free(fds); + } + + no_sockets = s->native_fd < 0 && s->stdout_fd < 0 && s->syslog_fd < 0 && s->audit_fd < 0 && varlink_fd < 0; + + /* always open stdout, syslog, native, and kmsg sockets */ + + /* systemd-journald.socket: /run/systemd/journal/stdout */ + r = server_open_stdout_socket(s, stdout_socket); + if (r < 0) + return r; + + /* systemd-journald-dev-log.socket: /run/systemd/journal/dev-log */ + r = server_open_syslog_socket(s, syslog_socket); + if (r < 0) + return r; + + /* systemd-journald.socket: /run/systemd/journal/socket */ + r = server_open_native_socket(s, native_socket); + if (r < 0) + return r; + + /* /dev/kmsg */ 
+ r = server_open_dev_kmsg(s); + if (r < 0) + return r; + + /* Unless we got *some* sockets and not audit, open audit socket */ + if (s->audit_fd >= 0 || no_sockets) { + log_info("Collecting audit messages is enabled."); + + r = server_open_audit(s); + if (r < 0) + return r; + } else + log_info("Collecting audit messages is disabled."); + + r = server_open_varlink(s, varlink_socket, varlink_fd); + if (r < 0) + return r; + + r = server_map_seqnum_file(s, "seqnum", sizeof(SeqnumData), (void**) &s->seqnum); + if (r < 0) + return log_error_errno(r, "Failed to map main seqnum file: %m"); + + r = server_open_kernel_seqnum(s); + if (r < 0) + return r; + + r = server_open_hostname(s); + if (r < 0) + return r; + + r = server_setup_signals(s); + if (r < 0) + return r; + + r = server_setup_memory_pressure(s); + if (r < 0) + return r; + + s->ratelimit = journal_ratelimit_new(); + if (!s->ratelimit) + return log_oom(); + + r = cg_get_root_path(&s->cgroup_root); + if (r < 0) + return log_error_errno(r, "Failed to acquire cgroup root path: %m"); + + server_cache_hostname(s); + server_cache_boot_id(s); + server_cache_machine_id(s); + + if (s->namespace) + s->runtime_storage.path = strjoin("/run/log/journal/", SERVER_MACHINE_ID(s), ".", s->namespace); + else + s->runtime_storage.path = strjoin("/run/log/journal/", SERVER_MACHINE_ID(s)); + if (!s->runtime_storage.path) + return log_oom(); + + e = getenv("LOGS_DIRECTORY"); + if (e) + s->system_storage.path = strdup(e); + else if (s->namespace) + s->system_storage.path = strjoin("/var/log/journal/", SERVER_MACHINE_ID(s), ".", s->namespace); + else + s->system_storage.path = strjoin("/var/log/journal/", SERVER_MACHINE_ID(s)); + if (!s->system_storage.path) + return log_oom(); + + (void) server_connect_notify(s); + + (void) client_context_acquire_default(s); + + r = server_system_journal_open(s, /* flush_requested= */ false, /* relinquish_requested= */ false); + if (r < 0) + return r; + + server_start_or_stop_idle_timer(s); + return 0; 
+} + +void server_maybe_append_tags(Server *s) { +#if HAVE_GCRYPT + JournalFile *f; + usec_t n; + + n = now(CLOCK_REALTIME); + + if (s->system_journal) + journal_file_maybe_append_tag(s->system_journal, n); + + ORDERED_HASHMAP_FOREACH(f, s->user_journals) + journal_file_maybe_append_tag(f, n); +#endif +} + +void server_done(Server *s) { + assert(s); + + free(s->namespace); + free(s->namespace_field); + + set_free_with_destructor(s->deferred_closes, journal_file_offline_close); + + while (s->stdout_streams) + stdout_stream_free(s->stdout_streams); + + client_context_flush_all(s); + + (void) journal_file_offline_close(s->system_journal); + (void) journal_file_offline_close(s->runtime_journal); + + ordered_hashmap_free_with_destructor(s->user_journals, journal_file_offline_close); + + varlink_server_unref(s->varlink_server); + + sd_event_source_unref(s->syslog_event_source); + sd_event_source_unref(s->native_event_source); + sd_event_source_unref(s->stdout_event_source); + sd_event_source_unref(s->dev_kmsg_event_source); + sd_event_source_unref(s->audit_event_source); + sd_event_source_unref(s->sync_event_source); + sd_event_source_unref(s->sigusr1_event_source); + sd_event_source_unref(s->sigusr2_event_source); + sd_event_source_unref(s->sigterm_event_source); + sd_event_source_unref(s->sigint_event_source); + sd_event_source_unref(s->sigrtmin1_event_source); + sd_event_source_unref(s->hostname_event_source); + sd_event_source_unref(s->notify_event_source); + sd_event_source_unref(s->watchdog_event_source); + sd_event_source_unref(s->idle_event_source); + sd_event_unref(s->event); + + safe_close(s->syslog_fd); + safe_close(s->native_fd); + safe_close(s->stdout_fd); + safe_close(s->dev_kmsg_fd); + safe_close(s->audit_fd); + safe_close(s->hostname_fd); + safe_close(s->notify_fd); + + if (s->ratelimit) + journal_ratelimit_free(s->ratelimit); + + server_unmap_seqnum_file(s->seqnum, sizeof(*s->seqnum)); + server_unmap_seqnum_file(s->kernel_seqnum, 
sizeof(*s->kernel_seqnum)); + + free(s->buffer); + free(s->tty_path); + free(s->cgroup_root); + free(s->hostname_field); + free(s->runtime_storage.path); + free(s->system_storage.path); + free(s->runtime_directory); + + mmap_cache_unref(s->mmap); +} + +static const char* const storage_table[_STORAGE_MAX] = { + [STORAGE_AUTO] = "auto", + [STORAGE_VOLATILE] = "volatile", + [STORAGE_PERSISTENT] = "persistent", + [STORAGE_NONE] = "none" +}; + +DEFINE_STRING_TABLE_LOOKUP(storage, Storage); +DEFINE_CONFIG_PARSE_ENUM(config_parse_storage, storage, Storage, "Failed to parse storage setting"); + +static const char* const split_mode_table[_SPLIT_MAX] = { + [SPLIT_LOGIN] = "login", + [SPLIT_UID] = "uid", + [SPLIT_NONE] = "none", +}; + +DEFINE_STRING_TABLE_LOOKUP(split_mode, SplitMode); +DEFINE_CONFIG_PARSE_ENUM(config_parse_split_mode, split_mode, SplitMode, "Failed to parse split mode setting"); + +int config_parse_line_max( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + size_t *sz = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + /* Empty assignment means default */ + *sz = DEFAULT_LINE_MAX; + else { + uint64_t v; + + r = parse_size(rvalue, 1024, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse LineMax= value, ignoring: %s", rvalue); + return 0; + } + + if (v < 79) { + /* Why specify 79 here as minimum line length? Simply, because the most common traditional + * terminal size is 80ch, and it might make sense to break one character before the natural + * line break would occur on that. */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "LineMax= too small, clamping to 79: %s", rvalue); + *sz = 79; + } else if (v > (uint64_t) (SSIZE_MAX-1)) { + /* So, why specify SSIZE_MAX-1 here? 
Because that's one below the largest size value read() + * can return, and we need one extra byte for the trailing NUL byte. Of course IRL such large + * memory allocations will fail anyway, hence this limit is mostly theoretical anyway, as we'll + * fail much earlier anyway. */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "LineMax= too large, clamping to %" PRIu64 ": %s", (uint64_t) (SSIZE_MAX-1), rvalue); + *sz = SSIZE_MAX-1; + } else + *sz = (size_t) v; + } + + return 0; +} +/* Parses Compress=. Empty enables compression with threshold UINT64_MAX; "1" and "0" are accepted with a warning as on/off; other booleans switch compression on/off; anything else is parsed as a size (base 1024) that becomes the threshold and enables compression. Unparsable values are logged and ignored. Always returns 0. */ +int config_parse_compress( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + JournalCompressOptions* compress = ASSERT_PTR(data); /* data is the JournalCompressOptions to fill in; assert like config_parse_line_max() does */ + int r; + + if (isempty(rvalue)) { + compress->enabled = true; + compress->threshold_bytes = UINT64_MAX; + } else if (streq(rvalue, "1")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Compress= ambiguously specified as 1, enabling compression with default threshold"); + compress->enabled = true; + } else if (streq(rvalue, "0")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Compress= ambiguously specified as 0, disabling compression"); + compress->enabled = false; + } else { + r = parse_boolean(rvalue); + if (r < 0) { + r = parse_size(rvalue, 1024, &compress->threshold_bytes); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse Compress= value, ignoring: %s", rvalue); + else + compress->enabled = true; + } else + compress->enabled = r; + } + + return 0; +} diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h new file mode 100644 index 0000000..2a17676 --- /dev/null +++ b/src/journal/journald-server.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-event.h" + +typedef struct Server Server; + +#include "common-signal.h" +#include "conf-parser.h" +#include "hashmap.h" +#include 
"journal-file.h" +#include "journald-context.h" +#include "journald-rate-limit.h" +#include "journald-stream.h" +#include "list.h" +#include "prioq.h" +#include "ratelimit.h" +#include "time-util.h" +#include "varlink.h" + +typedef enum Storage { + STORAGE_AUTO, + STORAGE_VOLATILE, + STORAGE_PERSISTENT, + STORAGE_NONE, + _STORAGE_MAX, + _STORAGE_INVALID = -EINVAL, +} Storage; + +typedef enum SplitMode { + SPLIT_UID, + SPLIT_LOGIN, /* deprecated */ + SPLIT_NONE, + _SPLIT_MAX, + _SPLIT_INVALID = -EINVAL, +} SplitMode; + +typedef struct JournalCompressOptions { + bool enabled; + uint64_t threshold_bytes; +} JournalCompressOptions; + +typedef struct JournalStorageSpace { + usec_t timestamp; + + uint64_t available; + uint64_t limit; + + uint64_t vfs_used; /* space used by journal files */ + uint64_t vfs_available; +} JournalStorageSpace; + +typedef struct JournalStorage { + const char *name; + char *path; + + JournalMetrics metrics; + JournalStorageSpace space; +} JournalStorage; + +/* This structure will be kept in $RUNTIME_DIRECTORY/seqnum and is mapped by journald, and is used to + * maintain the sequence number counter with its seqnum ID */ +typedef struct SeqnumData { + sd_id128_t id; + uint64_t seqnum; +} SeqnumData; + +struct Server { + char *namespace; + + int syslog_fd; + int native_fd; + int stdout_fd; + int dev_kmsg_fd; + int audit_fd; + int hostname_fd; + int notify_fd; + + sd_event *event; + + sd_event_source *syslog_event_source; + sd_event_source *native_event_source; + sd_event_source *stdout_event_source; + sd_event_source *dev_kmsg_event_source; + sd_event_source *audit_event_source; + sd_event_source *sync_event_source; + sd_event_source *sigusr1_event_source; + sd_event_source *sigusr2_event_source; + sd_event_source *sigterm_event_source; + sd_event_source *sigint_event_source; + sd_event_source *sigrtmin1_event_source; + sd_event_source *hostname_event_source; + sd_event_source *notify_event_source; + sd_event_source *watchdog_event_source; + 
sd_event_source *idle_event_source; + struct sigrtmin18_info sigrtmin18_info; + + JournalFile *runtime_journal; + JournalFile *system_journal; + OrderedHashmap *user_journals; + + SeqnumData *seqnum; + + char *buffer; + + JournalRateLimit *ratelimit; + usec_t sync_interval_usec; + usec_t ratelimit_interval; + unsigned ratelimit_burst; + + JournalStorage runtime_storage; + JournalStorage system_storage; + + JournalCompressOptions compress; + bool seal; + bool read_kmsg; + int set_audit; + + bool forward_to_kmsg; + bool forward_to_syslog; + bool forward_to_console; + bool forward_to_wall; + + unsigned n_forward_syslog_missed; + usec_t last_warn_forward_syslog_missed; + + usec_t max_retention_usec; + usec_t max_file_usec; + usec_t oldest_file_usec; + + LIST_HEAD(StdoutStream, stdout_streams); + LIST_HEAD(StdoutStream, stdout_streams_notify_queue); + unsigned n_stdout_streams; + + char *tty_path; + + int max_level_store; + int max_level_syslog; + int max_level_kmsg; + int max_level_console; + int max_level_wall; + + Storage storage; + SplitMode split_mode; + + MMapCache *mmap; + + Set *deferred_closes; + + uint64_t *kernel_seqnum; + bool dev_kmsg_readable:1; + RateLimit kmsg_own_ratelimit; + + bool send_watchdog:1; + bool sent_notify_ready:1; + bool sync_scheduled:1; + + char machine_id_field[sizeof("_MACHINE_ID=") + 32]; + char boot_id_field[sizeof("_BOOT_ID=") + 32]; + char *hostname_field; + char *namespace_field; + char *runtime_directory; + + /* Cached cgroup root, so that we don't have to query that all the time */ + char *cgroup_root; + + usec_t watchdog_usec; + + usec_t last_realtime_clock; + + size_t line_max; + + /* Caching of client metadata */ + Hashmap *client_contexts; + Prioq *client_contexts_lru; + + usec_t last_cache_pid_flush; + + ClientContext *my_context; /* the context of journald itself */ + ClientContext *pid1_context; /* the context of PID 1 */ + + VarlinkServer *varlink_server; +}; + +#define SERVER_MACHINE_ID(s) ((s)->machine_id_field + 
STRLEN("_MACHINE_ID=")) + +/* Extra fields for any log messages */ +#define N_IOVEC_META_FIELDS 24 + +/* Extra fields for log messages that contain OBJECT_PID= (i.e. log about another process) */ +#define N_IOVEC_OBJECT_FIELDS 18 + +/* Maximum number of fields we'll add in for driver (i.e. internal) messages */ +#define N_IOVEC_PAYLOAD_FIELDS 16 + +/* kmsg: Maximum number of extra fields we'll import from the kernel's /dev/kmsg */ +#define N_IOVEC_KERNEL_FIELDS 64 + +/* kmsg: Maximum number of extra fields we'll import from udev's devices */ +#define N_IOVEC_UDEV_FIELDS 32 + +/* audit: Maximum number of extra fields we'll import from audit messages */ +#define N_IOVEC_AUDIT_FIELDS 64 + +void server_dispatch_message(Server *s, struct iovec *iovec, size_t n, size_t m, ClientContext *c, const struct timeval *tv, int priority, pid_t object_pid); +void server_driver_message(Server *s, pid_t object_pid, const char *message_id, const char *format, ...) _sentinel_ _printf_(4,0); + +/* gperf lookup function */ +const struct ConfigPerfItem* journald_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +CONFIG_PARSER_PROTOTYPE(config_parse_storage); +CONFIG_PARSER_PROTOTYPE(config_parse_line_max); +CONFIG_PARSER_PROTOTYPE(config_parse_compress); + +const char *storage_to_string(Storage s) _const_; +Storage storage_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_split_mode); + +const char *split_mode_to_string(SplitMode s) _const_; +SplitMode split_mode_from_string(const char *s) _pure_; + +int server_init(Server *s, const char *namespace); +void server_done(Server *s); +void server_sync(Server *s); +void server_vacuum(Server *s, bool verbose); +void server_rotate(Server *s); +int server_schedule_sync(Server *s, int priority); +int server_flush_to_var(Server *s, bool require_flag_file); +void server_maybe_append_tags(Server *s); +int server_process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata); +void 
server_space_usage_message(Server *s, JournalStorage *storage); + +int server_start_or_stop_idle_timer(Server *s); +int server_refresh_idle_timer(Server *s); + +int server_map_seqnum_file(Server *s, const char *fname, size_t size, void **ret); +void server_unmap_seqnum_file(void *p, size_t size); diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c new file mode 100644 index 0000000..81a0e68 --- /dev/null +++ b/src/journal/journald-stream.c @@ -0,0 +1,1004 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#if HAVE_SELINUX +#include +#endif + +#include "sd-daemon.h" +#include "sd-event.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "env-file.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "journal-internal.h" +#include "journald-client.h" +#include "journald-console.h" +#include "journald-context.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-stream.h" +#include "journald-syslog.h" +#include "journald-wall.h" +#include "mkdir.h" +#include "parse-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "syslog-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "user-util.h" + +#define STDOUT_STREAMS_MAX 4096 + +/* During the "setup" protocol phase of the stream logic let's define a different maximum line length than + * during the actual operational phase. We want to allow users to specify very short line lengths after all, + * but the unit name we embed in the setup protocol might be longer than that. 
Hence, during the setup phase + * let's enforce a line length matching the maximum unit name length (255) */ +#define STDOUT_STREAM_SETUP_PROTOCOL_LINE_MAX (UNIT_NAME_MAX-1U) + +typedef enum StdoutStreamState { + STDOUT_STREAM_IDENTIFIER, + STDOUT_STREAM_UNIT_ID, + STDOUT_STREAM_PRIORITY, + STDOUT_STREAM_LEVEL_PREFIX, + STDOUT_STREAM_FORWARD_TO_SYSLOG, + STDOUT_STREAM_FORWARD_TO_KMSG, + STDOUT_STREAM_FORWARD_TO_CONSOLE, + STDOUT_STREAM_RUNNING, +} StdoutStreamState; + +/* The different types of log record terminators: a real \n was read, a NUL character was read, the maximum line length + * was reached, the end of the stream was reached, or the PID of the sender changed while a line was still pending */ + +typedef enum LineBreak { + LINE_BREAK_NEWLINE, + LINE_BREAK_NUL, + LINE_BREAK_LINE_MAX, + LINE_BREAK_EOF, + LINE_BREAK_PID_CHANGE, + _LINE_BREAK_MAX, + _LINE_BREAK_INVALID = -EINVAL, +} LineBreak; + +struct StdoutStream { + Server *server; + StdoutStreamState state; + + int fd; + + struct ucred ucred; + char *label; + char *identifier; + char *unit_id; + int priority; + bool level_prefix:1; + bool forward_to_syslog:1; + bool forward_to_kmsg:1; + bool forward_to_console:1; + + bool fdstore:1; + bool in_notify_queue:1; + + char *buffer; + size_t length; + + sd_event_source *event_source; + + char *state_file; + + ClientContext *context; + + LIST_FIELDS(StdoutStream, stdout_stream); + LIST_FIELDS(StdoutStream, stdout_stream_notify_queue); + + char id_field[STRLEN("_STREAM_ID=") + SD_ID128_STRING_MAX]; +}; + +StdoutStream* stdout_stream_free(StdoutStream *s) { + if (!s) + return NULL; + + if (s->server) { + if (s->context) + client_context_release(s->server, s->context); + + assert(s->server->n_stdout_streams > 0); + s->server->n_stdout_streams--; + LIST_REMOVE(stdout_stream, s->server->stdout_streams, s); + + if (s->in_notify_queue) + LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + + (void) server_start_or_stop_idle_timer(s->server); /* Maybe we are idle now? 
*/ + } + + sd_event_source_disable_unref(s->event_source); + safe_close(s->fd); + free(s->label); + free(s->identifier); + free(s->unit_id); + free(s->state_file); + free(s->buffer); + + return mfree(s); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(StdoutStream*, stdout_stream_free); + +void stdout_stream_destroy(StdoutStream *s) { + if (!s) + return; + + if (s->state_file) + (void) unlink(s->state_file); + + stdout_stream_free(s); +} + +static int stdout_stream_save(StdoutStream *s) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(s); + + if (s->state != STDOUT_STREAM_RUNNING) + return 0; + + if (!s->state_file) { + struct stat st; + + r = fstat(s->fd, &st); + if (r < 0) + return log_ratelimit_warning_errno(errno, JOURNAL_LOG_RATELIMIT, + "Failed to stat connected stream: %m"); + + /* We use device and inode numbers as identifier for the stream */ + r = asprintf(&s->state_file, "%s/streams/%lu:%lu", s->server->runtime_directory, (unsigned long) st.st_dev, (unsigned long) st.st_ino); + if (r < 0) + return log_oom(); + } + + (void) mkdir_parents(s->state_file, 0755); + + r = fopen_temporary(s->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + fprintf(f, + "# This is private data. 
Do not parse\n" + "PRIORITY=%i\n" + "LEVEL_PREFIX=%i\n" + "FORWARD_TO_SYSLOG=%i\n" + "FORWARD_TO_KMSG=%i\n" + "FORWARD_TO_CONSOLE=%i\n" + "STREAM_ID=%s\n", + s->priority, + s->level_prefix, + s->forward_to_syslog, + s->forward_to_kmsg, + s->forward_to_console, + s->id_field + STRLEN("_STREAM_ID=")); + + if (!isempty(s->identifier)) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->identifier); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "IDENTIFIER=%s\n", escaped); + } + + if (!isempty(s->unit_id)) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->unit_id); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "UNIT=%s\n", escaped); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, s->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + + if (!s->fdstore && !s->in_notify_queue) { + LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + s->in_notify_queue = true; + + if (s->server->notify_event_source) { + r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to enable notify event source: %m"); + } + } + + return 0; + +fail: + (void) unlink(s->state_file); + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to save stream data %s: %m", s->state_file); +} + +static int stdout_stream_log( + StdoutStream *s, + const char *p, + LineBreak line_break) { + + struct iovec *iovec; + int priority; + char syslog_priority[] = "PRIORITY=\0"; + char syslog_facility[STRLEN("SYSLOG_FACILITY=") + DECIMAL_STR_MAX(int) + 1]; + _cleanup_free_ char *message = NULL, *syslog_identifier = NULL; + size_t n = 0, m; + int r; + + assert(s); + assert(p); + + assert(line_break >= 0); + assert(line_break < _LINE_BREAK_MAX); + + if (s->context) + (void) client_context_maybe_refresh(s->server, s->context, NULL, NULL, 
0, NULL, USEC_INFINITY); + else if (pid_is_valid(s->ucred.pid)) { + r = client_context_acquire(s->server, s->ucred.pid, &s->ucred, s->label, strlen_ptr(s->label), s->unit_id, &s->context); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to acquire client context, ignoring: %m"); + } + + priority = s->priority; + + if (s->level_prefix) + syslog_parse_priority(&p, &priority, false); + + if (!client_context_test_priority(s->context, priority)) + return 0; + + if (isempty(p)) + return 0; + + r = client_context_check_keep_log(s->context, p, strlen(p)); + if (r <= 0) + return r; + + if (s->forward_to_syslog || s->server->forward_to_syslog) + server_forward_syslog(s->server, syslog_fixup_facility(priority), s->identifier, p, &s->ucred, NULL); + + if (s->forward_to_kmsg || s->server->forward_to_kmsg) + server_forward_kmsg(s->server, priority, s->identifier, p, &s->ucred); + + if (s->forward_to_console || s->server->forward_to_console) + server_forward_console(s->server, priority, s->identifier, p, &s->ucred); + + if (s->server->forward_to_wall) + server_forward_wall(s->server, priority, s->identifier, p, &s->ucred); + + m = N_IOVEC_META_FIELDS + 7 + client_context_extra_fields_n_iovec(s->context); + iovec = newa(struct iovec, m); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=stdout"); + iovec[n++] = IOVEC_MAKE_STRING(s->id_field); + + syslog_priority[STRLEN("PRIORITY=")] = '0' + LOG_PRI(priority); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); + + if (priority & LOG_FACMASK) { + xsprintf(syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); + } + + if (s->identifier) { + syslog_identifier = strjoin("SYSLOG_IDENTIFIER=", s->identifier); + if (syslog_identifier) + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); + } + + static const char * const line_break_field_table[_LINE_BREAK_MAX] = { + [LINE_BREAK_NEWLINE] = NULL, /* Do not add field if traditional newline */ + [LINE_BREAK_NUL] 
= "_LINE_BREAK=nul", + [LINE_BREAK_LINE_MAX] = "_LINE_BREAK=line-max", + [LINE_BREAK_EOF] = "_LINE_BREAK=eof", + [LINE_BREAK_PID_CHANGE] = "_LINE_BREAK=pid-change", + }; + + const char *c = line_break_field_table[line_break]; + + /* If this log message was generated due to an uncommon line break then mention this in the log + * entry */ + if (c) + iovec[n++] = IOVEC_MAKE_STRING(c); + + message = strjoin("MESSAGE=", p); + if (message) + iovec[n++] = IOVEC_MAKE_STRING(message); + + server_dispatch_message(s->server, iovec, n, m, s->context, NULL, priority, 0); + return 0; +} + +static int syslog_parse_priority_and_facility(const char *s) { + int prio, r; + + /* Parses both facility and priority in one value, i.e. is different from log_level_from_string() + * which only parses the priority and refuses any facility value */ + + r = safe_atoi(s, &prio); + if (r < 0) + return r; + + if (prio < 0 || prio > 999) + return -ERANGE; + + return prio; +} + +static int stdout_stream_line(StdoutStream *s, char *p, LineBreak line_break) { + char *orig; + int r; + + assert(s); + assert(p); + + orig = p; + p = strstrip(p); + + /* line breaks by NUL, line max length or EOF are not permissible during the negotiation part of the protocol */ + if (line_break != LINE_BREAK_NEWLINE && s->state != STDOUT_STREAM_RUNNING) + return log_ratelimit_warning_errno(SYNTHETIC_ERRNO(EINVAL), JOURNAL_LOG_RATELIMIT, + "Control protocol line not properly terminated."); + + switch (s->state) { + + case STDOUT_STREAM_IDENTIFIER: + if (!isempty(p)) { + s->identifier = strdup(p); + if (!s->identifier) + return log_oom(); + } + + s->state = STDOUT_STREAM_UNIT_ID; + return 0; + + case STDOUT_STREAM_UNIT_ID: + if (s->ucred.uid == 0 && + unit_name_is_valid(p, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + + s->unit_id = strdup(p); + if (!s->unit_id) + return log_oom(); + } + + s->state = STDOUT_STREAM_PRIORITY; + return 0; + + case STDOUT_STREAM_PRIORITY: { + int priority; + + priority = 
syslog_parse_priority_and_facility(p); + if (priority < 0) + return log_ratelimit_warning_errno(priority, JOURNAL_LOG_RATELIMIT, + "Failed to parse log priority line: %m"); + + s->priority = priority; + s->state = STDOUT_STREAM_LEVEL_PREFIX; + return 0; + } + + case STDOUT_STREAM_LEVEL_PREFIX: + r = parse_boolean(p); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to parse level prefix line: %m"); + + s->level_prefix = r; + s->state = STDOUT_STREAM_FORWARD_TO_SYSLOG; + return 0; + + case STDOUT_STREAM_FORWARD_TO_SYSLOG: + r = parse_boolean(p); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to parse forward to syslog line: %m"); + + s->forward_to_syslog = r; + s->state = STDOUT_STREAM_FORWARD_TO_KMSG; + return 0; + + case STDOUT_STREAM_FORWARD_TO_KMSG: + r = parse_boolean(p); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to parse copy to kmsg line: %m"); + + s->forward_to_kmsg = r; + s->state = STDOUT_STREAM_FORWARD_TO_CONSOLE; + return 0; + + case STDOUT_STREAM_FORWARD_TO_CONSOLE: + r = parse_boolean(p); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to parse copy to console line."); + + s->forward_to_console = r; + s->state = STDOUT_STREAM_RUNNING; + + /* Try to save the stream, so that journald can be restarted and we can recover */ + (void) stdout_stream_save(s); + return 0; + + case STDOUT_STREAM_RUNNING: + return stdout_stream_log(s, orig, line_break); + } + + assert_not_reached(); +} + +static int stdout_stream_found( + StdoutStream *s, + char *p, + size_t l, + LineBreak line_break) { + + char saved; + int r; + + assert(s); + assert(p); + + /* Let's NUL terminate the specified buffer for this call, and revert back afterwards */ + saved = p[l]; + p[l] = 0; + r = stdout_stream_line(s, p, line_break); + p[l] = saved; + + return r; +} + +static size_t stdout_stream_line_max(StdoutStream *s) { + assert(s); + + 
/* During the "setup" phase of our protocol, let's ensure we use a line length where a full unit name + * can fit in */ + if (s->state != STDOUT_STREAM_RUNNING) + return STDOUT_STREAM_SETUP_PROTOCOL_LINE_MAX; + + /* After the protocol's "setup" phase is complete, let's use whatever the user configured */ + return s->server->line_max; +} + +static int stdout_stream_scan( + StdoutStream *s, + char *p, + size_t remaining, + LineBreak force_flush, + size_t *ret_consumed) { + + size_t consumed = 0; + int r; + + assert(s); + assert(p); + + + for (;;) { + LineBreak line_break; + size_t skip, found; + char *end1, *end2; + size_t tmp_remaining, line_max; + + line_max = stdout_stream_line_max(s); + tmp_remaining = MIN(remaining, line_max); + + end1 = memchr(p, '\n', tmp_remaining); + end2 = memchr(p, 0, end1 ? (size_t) (end1 - p) : tmp_remaining); + + if (end2) { + /* We found a NUL terminator */ + found = end2 - p; + skip = found + 1; + line_break = LINE_BREAK_NUL; + } else if (end1) { + /* We found a \n terminator */ + found = end1 - p; + skip = found + 1; + line_break = LINE_BREAK_NEWLINE; + } else if (remaining >= line_max) { + /* Force a line break after the maximum line length */ + found = skip = line_max; + line_break = LINE_BREAK_LINE_MAX; + } else + break; + + r = stdout_stream_found(s, p, found, line_break); + if (r < 0) + return r; + + p += skip; + consumed += skip; + remaining -= skip; + } + + if (force_flush >= 0 && remaining > 0) { + r = stdout_stream_found(s, p, remaining, force_flush); + if (r < 0) + return r; + + consumed += remaining; + } + + if (ret_consumed) + *ret_consumed = consumed; + + return 0; +} + +static int stdout_stream_process(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + size_t limit, consumed, allocated; + StdoutStream *s = ASSERT_PTR(userdata); + struct ucred *ucred; + struct iovec iovec; + ssize_t l; + char *p; + int r; + + struct msghdr msghdr = { + .msg_iov 
= &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + if ((revents|EPOLLIN|EPOLLHUP) != (EPOLLIN|EPOLLHUP)) { + log_error("Got invalid event from epoll for stdout stream: %"PRIx32, revents); + goto terminate; + } + + /* If the buffer is almost full, add room for another 1K */ + allocated = MALLOC_ELEMENTSOF(s->buffer); + if (s->length + 512 >= allocated) { + if (!GREEDY_REALLOC(s->buffer, s->length + 1 + 1024)) { + log_oom(); + goto terminate; + } + + allocated = MALLOC_ELEMENTSOF(s->buffer); + } + + /* Try to make use of the allocated buffer in full, but never read more than the configured line size. Also, + * always leave room for a terminating NUL we might need to add. */ + limit = MIN(allocated - 1, MAX(s->server->line_max, STDOUT_STREAM_SETUP_PROTOCOL_LINE_MAX)); + assert(s->length <= limit); + iovec = IOVEC_MAKE(s->buffer + s->length, limit - s->length); + + l = recvmsg(s->fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + log_ratelimit_warning_errno(errno, JOURNAL_LOG_RATELIMIT, "Failed to read from stream: %m"); + goto terminate; + } + cmsg_close_all(&msghdr); + + if (l == 0) { + (void) stdout_stream_scan(s, s->buffer, s->length, /* force_flush = */ LINE_BREAK_EOF, NULL); + goto terminate; + } + + /* Invalidate the context if the PID of the sender changed. This happens when a forked process + * inherits stdout/stderr from a parent. In this case getpeercred() returns the ucred of the parent, + * which can be invalid if the parent has exited in the meantime. 
*/ + ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (ucred && ucred->pid != s->ucred.pid) { + /* Force out any previously half-written lines from a different process, before we switch to + * the new ucred structure for everything we just added */ + r = stdout_stream_scan(s, s->buffer, s->length, /* force_flush = */ LINE_BREAK_PID_CHANGE, NULL); + if (r < 0) + goto terminate; + + s->context = client_context_release(s->server, s->context); + + p = s->buffer + s->length; + } else { + p = s->buffer; + l += s->length; + } + + /* Always copy in the new credentials */ + if (ucred) + s->ucred = *ucred; + + r = stdout_stream_scan(s, p, l, _LINE_BREAK_INVALID, &consumed); + if (r < 0) + goto terminate; + + /* Move what wasn't consumed to the front of the buffer */ + assert(consumed <= (size_t) l); + s->length = l - consumed; + memmove(s->buffer, p + consumed, s->length); + + return 1; + +terminate: + stdout_stream_destroy(s); + return 0; +} + +int stdout_stream_install(Server *s, int fd, StdoutStream **ret) { + _cleanup_(stdout_stream_freep) StdoutStream *stream = NULL; + sd_id128_t id; + int r; + + assert(s); + assert(fd >= 0); + + r = sd_id128_randomize(&id); + if (r < 0) + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to generate stream ID: %m"); + + stream = new(StdoutStream, 1); + if (!stream) + return log_oom(); + + *stream = (StdoutStream) { + .fd = -EBADF, + .priority = LOG_INFO, + .ucred = UCRED_INVALID, + }; + + xsprintf(stream->id_field, "_STREAM_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)); + + r = getpeercred(fd, &stream->ucred); + if (r < 0) + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to determine peer credentials: %m"); + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + if (mac_selinux_use()) { + r = getpeersec(fd, &stream->label); + if (r < 0 && r != -EOPNOTSUPP) + (void) 
log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to determine peer security context: %m"); + } + + (void) shutdown(fd, SHUT_WR); + + r = sd_event_add_io(s->event, &stream->event_source, fd, EPOLLIN, stdout_stream_process, stream); + if (r < 0) + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to add stream to event loop: %m"); + + r = sd_event_source_set_priority(stream->event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to adjust stdout event source priority: %m"); + + stream->fd = fd; + + stream->server = s; + LIST_PREPEND(stdout_stream, s->stdout_streams, stream); + s->n_stdout_streams++; + + (void) server_start_or_stop_idle_timer(s); /* Maybe no longer idle? */ + + if (ret) + *ret = stream; + + TAKE_PTR(stream); + return 0; +} + +static int stdout_stream_new(sd_event_source *es, int listen_fd, uint32_t revents, void *userdata) { + _cleanup_close_ int fd = -EBADF; + Server *s = ASSERT_PTR(userdata); + int r; + + if (revents != EPOLLIN) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Got invalid event from epoll for stdout server fd: %" PRIx32, + revents); + + fd = accept4(s->stdout_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (fd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return log_ratelimit_error_errno(errno, JOURNAL_LOG_RATELIMIT, "Failed to accept stdout connection: %m"); + } + + if (s->n_stdout_streams >= STDOUT_STREAMS_MAX) { + struct ucred u = UCRED_INVALID; + + (void) getpeercred(fd, &u); + + /* By closing fd here we make sure that the client won't wait too long for journald to + * gather all the data it adds to the error message to find out that the connection has + * just been refused. 
+ */ + fd = safe_close(fd); + + server_driver_message(s, u.pid, NULL, LOG_MESSAGE("Too many stdout streams, refusing connection."), NULL); + return 0; + } + + r = stdout_stream_install(s, fd, NULL); + if (r < 0) + return r; + + TAKE_FD(fd); + return 0; +} + +static int stdout_stream_load(StdoutStream *stream, const char *fname) { + _cleanup_free_ char + *priority = NULL, + *level_prefix = NULL, + *forward_to_syslog = NULL, + *forward_to_kmsg = NULL, + *forward_to_console = NULL, + *stream_id = NULL; + int r; + + assert(stream); + assert(fname); + + if (!stream->state_file) { + stream->state_file = path_join(stream->server->runtime_directory, "streams", fname); + if (!stream->state_file) + return log_oom(); + } + + r = parse_env_file(NULL, stream->state_file, + "PRIORITY", &priority, + "LEVEL_PREFIX", &level_prefix, + "FORWARD_TO_SYSLOG", &forward_to_syslog, + "FORWARD_TO_KMSG", &forward_to_kmsg, + "FORWARD_TO_CONSOLE", &forward_to_console, + "IDENTIFIER", &stream->identifier, + "UNIT", &stream->unit_id, + "STREAM_ID", &stream_id); + if (r < 0) + return log_error_errno(r, "Failed to read: %s", stream->state_file); + + if (priority) { + int p; + + p = syslog_parse_priority_and_facility(priority); + if (p >= 0) + stream->priority = p; + } + + if (level_prefix) { + r = parse_boolean(level_prefix); + if (r >= 0) + stream->level_prefix = r; + } + + if (forward_to_syslog) { + r = parse_boolean(forward_to_syslog); + if (r >= 0) + stream->forward_to_syslog = r; + } + + if (forward_to_kmsg) { + r = parse_boolean(forward_to_kmsg); + if (r >= 0) + stream->forward_to_kmsg = r; + } + + if (forward_to_console) { + r = parse_boolean(forward_to_console); + if (r >= 0) + stream->forward_to_console = r; + } + + if (stream_id) { + sd_id128_t id; + + r = sd_id128_from_string(stream_id, &id); + if (r >= 0) + xsprintf(stream->id_field, "_STREAM_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)); + } + + return 0; +} + +static int stdout_stream_restore(Server *s, const char *fname, int 
fd) { + StdoutStream *stream; + int r; + + assert(s); + assert(fname); + assert(fd >= 0); + + if (s->n_stdout_streams >= STDOUT_STREAMS_MAX) { + log_warning("Too many stdout streams, refusing restoring of stream."); + return -ENOBUFS; + } + + r = stdout_stream_install(s, fd, &stream); + if (r < 0) + return r; + + stream->state = STDOUT_STREAM_RUNNING; + stream->fdstore = true; + + /* Ignore all parsing errors */ + (void) stdout_stream_load(stream, fname); + + return 0; +} + +int server_restore_streams(Server *s, FDSet *fds) { + _cleanup_closedir_ DIR *d = NULL; + const char *path; + int r; + + path = strjoina(s->runtime_directory, "/streams"); + d = opendir(path); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to enumerate %s: %m", path); + } + + FOREACH_DIRENT(de, d, goto fail) { + unsigned long st_dev, st_ino; + bool found = false; + int fd; + + if (sscanf(de->d_name, "%lu:%lu", &st_dev, &st_ino) != 2) + continue; + + FDSET_FOREACH(fd, fds) { + struct stat st; + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", de->d_name); + + if (S_ISSOCK(st.st_mode) && st.st_dev == st_dev && st.st_ino == st_ino) { + found = true; + break; + } + } + + if (!found) { + /* No file descriptor? 
Then let's delete the state file */ + log_debug("Cannot restore stream file %s", de->d_name); + if (unlinkat(dirfd(d), de->d_name, 0) < 0) + log_warning_errno(errno, "Failed to remove %s/%s: %m", path, de->d_name); + continue; + } + + fdset_remove(fds, fd); + + r = stdout_stream_restore(s, de->d_name, fd); + if (r < 0) + safe_close(fd); + } + + return 0; + +fail: + return log_error_errno(errno, "Failed to read streams directory: %m"); +} + +int server_open_stdout_socket(Server *s, const char *stdout_socket) { + int r; + + assert(s); + assert(stdout_socket); + + if (s->stdout_fd < 0) { + union sockaddr_union sa; + socklen_t sa_len; + + r = sockaddr_un_set_path(&sa.un, stdout_socket); + if (r < 0) + return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", stdout_socket); + sa_len = r; + + s->stdout_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->stdout_fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + (void) sockaddr_un_unlink(&sa.un); + + r = bind(s->stdout_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + (void) chmod(sa.un.sun_path, 0666); + + if (listen(s->stdout_fd, SOMAXCONN_DELUXE) < 0) + return log_error_errno(errno, "listen(%s) failed: %m", sa.un.sun_path); + } else + (void) fd_nonblock(s->stdout_fd, true); + + r = sd_event_add_io(s->event, &s->stdout_event_source, s->stdout_fd, EPOLLIN, stdout_stream_new, s); + if (r < 0) + return log_error_errno(r, "Failed to add stdout server fd to event source: %m"); + + r = sd_event_source_set_priority(s->stdout_event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of stdout server event source: %m"); + + return 0; +} + +void stdout_stream_send_notify(StdoutStream *s) { + struct iovec iovec = { + .iov_base = (char*) "FDSTORE=1", + .iov_len = STRLEN("FDSTORE=1"), + }; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + 
}; + struct cmsghdr *cmsg; + ssize_t l; + + assert(s); + assert(!s->fdstore); + assert(s->in_notify_queue); + assert(s->server); + assert(s->server->notify_fd >= 0); + + /* Store the connection fd in PID 1, so that we get it passed + * in again on next start */ + + msghdr.msg_controllen = CMSG_SPACE(sizeof(int)); + msghdr.msg_control = alloca0(msghdr.msg_controllen); + + cmsg = CMSG_FIRSTHDR(&msghdr); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int)); + + l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL); + if (l < 0) { + if (errno == EAGAIN) + return; + + log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m"); + } else { + log_debug("Successfully sent stream file descriptor to service manager."); + s->fdstore = 1; + } + + LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + s->in_notify_queue = false; + +} diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h new file mode 100644 index 0000000..7b756c0 --- /dev/null +++ b/src/journal/journald-stream.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct StdoutStream StdoutStream; + +#include "fdset.h" +#include "journald-server.h" + +int server_open_stdout_socket(Server *s, const char *stdout_socket); +int server_restore_streams(Server *s, FDSet *fds); + +StdoutStream* stdout_stream_free(StdoutStream *s); +int stdout_stream_install(Server *s, int fd, StdoutStream **ret); +void stdout_stream_destroy(StdoutStream *s); +void stdout_stream_send_notify(StdoutStream *s); diff --git a/src/journal/journald-syslog.c b/src/journal/journald-syslog.c new file mode 100644 index 0000000..f6accb5 --- /dev/null +++ b/src/journal/journald-syslog.c @@ -0,0 +1,533 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-messages.h" + 
+#include "alloc-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "iovec-util.h" +#include "journal-internal.h" +#include "journald-client.h" +#include "journald-console.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "journald-wall.h" +#include "process-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "syslog-util.h" + +/* Warn at most once every 30s if we missed syslog messages */ +#define WARN_FORWARD_SYSLOG_MISSED_USEC (30 * USEC_PER_SEC) + +static void forward_syslog_iovec( + Server *s, + const struct iovec *iovec, + unsigned n_iovec, + const struct ucred *ucred, + const struct timeval *tv) { + + union sockaddr_union sa; + + struct msghdr msghdr = { + .msg_iov = (struct iovec *) iovec, + .msg_iovlen = n_iovec, + }; + struct cmsghdr *cmsg; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + const char *j; + int r; + + assert(s); + assert(iovec); + assert(n_iovec > 0); + + j = strjoina(s->runtime_directory, "/syslog"); + r = sockaddr_un_set_path(&sa.un, j); + if (r < 0) { + log_debug_errno(r, "Forwarding socket path %s too long for AF_UNIX, not forwarding: %m", j); + return; + } + + msghdr.msg_name = &sa.sa; + msghdr.msg_namelen = r; + + if (ucred) { + zero(control); + msghdr.msg_control = &control; + msghdr.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msghdr); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred)); + msghdr.msg_controllen = cmsg->cmsg_len; + } + + /* Forward the syslog message we received via /dev/log to /run/systemd/syslog. Unfortunately we + * currently can't set the SO_TIMESTAMP auxiliary data, and hence we don't. */ + + if (sendmsg(s->syslog_fd, &msghdr, MSG_NOSIGNAL) >= 0) + return; + + /* The socket is full? 
I guess the syslog implementation is + * too slow, and we shouldn't wait for that... */ + if (errno == EAGAIN) { + s->n_forward_syslog_missed++; + return; + } + + if (ucred && IN_SET(errno, ESRCH, EPERM)) { + struct ucred u; + + /* Hmm, presumably the sender process vanished + * by now, or we don't have CAP_SYS_ADMIN, so + * let's fix it as well as we can, and retry */ + + u = *ucred; + u.pid = getpid_cached(); + memcpy(CMSG_DATA(cmsg), &u, sizeof(struct ucred)); + + if (sendmsg(s->syslog_fd, &msghdr, MSG_NOSIGNAL) >= 0) + return; + + if (errno == EAGAIN) { + s->n_forward_syslog_missed++; + return; + } + } + + if (errno != ENOENT) + log_debug_errno(errno, "Failed to forward syslog message: %m"); +} + +static void forward_syslog_raw(Server *s, int priority, const char *buffer, size_t buffer_len, const struct ucred *ucred, const struct timeval *tv) { + struct iovec iovec; + + assert(s); + assert(buffer); + + if (LOG_PRI(priority) > s->max_level_syslog) + return; + + iovec = IOVEC_MAKE((char *) buffer, buffer_len); + forward_syslog_iovec(s, &iovec, 1, ucred, tv); +} + +void server_forward_syslog(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred, const struct timeval *tv) { + struct iovec iovec[5]; + char header_priority[DECIMAL_STR_MAX(priority) + 3], header_time[64], + header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t) + 1]; + int n = 0; + time_t t; + struct tm tm; + _cleanup_free_ char *ident_buf = NULL; + + assert(s); + assert(priority >= 0); + assert(priority <= 999); + assert(message); + + if (LOG_PRI(priority) > s->max_level_syslog) + return; + + /* First: priority field */ + xsprintf(header_priority, "<%i>", priority); + iovec[n++] = IOVEC_MAKE_STRING(header_priority); + + /* Second: timestamp */ + t = tv ? 
tv->tv_sec : ((time_t) (now(CLOCK_REALTIME) / USEC_PER_SEC)); + if (!localtime_r(&t, &tm)) + return; + if (strftime(header_time, sizeof(header_time), "%h %e %T ", &tm) <= 0) + return; + iovec[n++] = IOVEC_MAKE_STRING(header_time); + + /* Third: identifier and PID */ + if (ucred) { + if (!identifier) { + (void) pid_get_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); + + if (identifier) + iovec[n++] = IOVEC_MAKE_STRING(identifier); + + iovec[n++] = IOVEC_MAKE_STRING(header_pid); + } else if (identifier) { + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); + } + + /* Fourth: message */ + iovec[n++] = IOVEC_MAKE_STRING(message); + + forward_syslog_iovec(s, iovec, n, ucred, tv); +} + +int syslog_fixup_facility(int priority) { + + if ((priority & LOG_FACMASK) == 0) + return (priority & LOG_PRIMASK) | LOG_USER; + + return priority; +} + +size_t syslog_parse_identifier(const char **buf, char **identifier, char **pid) { + const char *p; + char *t; + size_t l, e; + + assert(buf); + assert(identifier); + assert(pid); + + p = *buf; + + p += strspn(p, WHITESPACE); + l = strcspn(p, WHITESPACE); + + if (l <= 0 || + p[l-1] != ':') + return 0; + + e = l; + l--; + + if (l > 0 && p[l-1] == ']') { + size_t k = l-1; + + for (;;) { + + if (p[k] == '[') { + t = strndup(p+k+1, l-k-2); + if (t) + *pid = t; + + l = k; + break; + } + + if (k == 0) + break; + + k--; + } + } + + t = strndup(p, l); + if (t) + *identifier = t; + + /* Single space is used as separator */ + if (p[e] != '\0' && strchr(WHITESPACE, p[e])) + e++; + + l = (p - *buf) + e; + *buf = p + e; + return l; +} + +static int syslog_skip_timestamp(const char **buf) { + enum { + LETTER, + SPACE, + NUMBER, + SPACE_OR_NUMBER, + COLON + } sequence[] = { + LETTER, LETTER, LETTER, + SPACE, + SPACE_OR_NUMBER, NUMBER, + SPACE, + SPACE_OR_NUMBER, NUMBER, + COLON, + SPACE_OR_NUMBER, NUMBER, + COLON, + SPACE_OR_NUMBER, NUMBER, + SPACE + }; 
+ + const char *p, *t; + unsigned i; + + assert(buf); + assert(*buf); + + for (i = 0, p = *buf; i < ELEMENTSOF(sequence); i++, p++) { + if (!*p) + return 0; + + switch (sequence[i]) { + + case SPACE: + if (*p != ' ') + return 0; + break; + + case SPACE_OR_NUMBER: + if (*p == ' ') + break; + + _fallthrough_; + case NUMBER: + if (!ascii_isdigit(*p)) + return 0; + + break; + + case LETTER: + if (!ascii_isalpha(*p)) + return 0; + + break; + + case COLON: + if (*p != ':') + return 0; + break; + + } + } + + t = *buf; + *buf = p; + return p - t; +} + +void server_process_syslog_message( + Server *s, + const char *buf, + size_t raw_len, + const struct ucred *ucred, + const struct timeval *tv, + const char *label, + size_t label_len) { + + char *t, syslog_priority[sizeof("PRIORITY=") + DECIMAL_STR_MAX(int)], + syslog_facility[sizeof("SYSLOG_FACILITY=") + DECIMAL_STR_MAX(int)]; + const char *msg, *syslog_ts, *a; + _cleanup_free_ char *identifier = NULL, *pid = NULL, + *dummy = NULL, *msg_msg = NULL, *msg_raw = NULL; + int priority = LOG_USER | LOG_INFO, r; + ClientContext *context = NULL; + struct iovec *iovec; + size_t n = 0, m, i, leading_ws, syslog_ts_len; + bool store_raw; + + assert(s); + assert(buf); + /* The message cannot be empty. */ + assert(raw_len > 0); + /* The buffer is NUL-terminated and can be used as a string. raw_len is the length + * without the terminating NUL byte, the buffer is actually one bigger. 
*/ + assert(buf[raw_len] == '\0'); + + if (ucred && pid_is_valid(ucred->pid)) { + r = client_context_get(s, ucred->pid, ucred, label, label_len, NULL, &context); + if (r < 0) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to retrieve credentials for PID " PID_FMT ", ignoring: %m", + ucred->pid); + } + + /* We are creating a copy of the message because we want to forward the original message + verbatim to the legacy syslog implementation */ + for (i = raw_len; i > 0; i--) + if (!strchr(WHITESPACE, buf[i-1])) + break; + + leading_ws = strspn(buf, WHITESPACE); + + if (i == 0) + /* The message contains only whitespace */ + msg = buf + raw_len; + else if (i == raw_len) + /* Nice! No need to strip anything on the end, let's optimize this a bit */ + msg = buf + leading_ws; + else { + msg = dummy = new(char, i - leading_ws + 1); + if (!dummy) { + log_oom(); + return; + } + + memcpy(dummy, buf + leading_ws, i - leading_ws); + dummy[i - leading_ws] = 0; + } + + /* We will add the SYSLOG_RAW= field when we stripped anything + * _or_ if the input message contained NUL bytes. 
*/ + store_raw = msg != buf || strlen(msg) != raw_len; + + syslog_parse_priority(&msg, &priority, true); + + if (!client_context_test_priority(context, priority)) + return; + + if (client_context_check_keep_log(context, msg, strlen(msg)) <= 0) + return; + + syslog_ts = msg; + syslog_ts_len = syslog_skip_timestamp(&msg); + if (syslog_ts_len == 0) + /* We failed to parse the full timestamp, store the raw message too */ + store_raw = true; + + syslog_parse_identifier(&msg, &identifier, &pid); + + if (s->forward_to_syslog) + forward_syslog_raw(s, priority, buf, raw_len, ucred, tv); + + if (s->forward_to_kmsg) + server_forward_kmsg(s, priority, identifier, msg, ucred); + + if (s->forward_to_console) + server_forward_console(s, priority, identifier, msg, ucred); + + if (s->forward_to_wall) + server_forward_wall(s, priority, identifier, msg, ucred); + + m = N_IOVEC_META_FIELDS + 8 + client_context_extra_fields_n_iovec(context); + iovec = newa(struct iovec, m); + + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=syslog"); + + xsprintf(syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); + + if (priority & LOG_FACMASK) { + xsprintf(syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); + } + + if (identifier) { + a = strjoina("SYSLOG_IDENTIFIER=", identifier); + iovec[n++] = IOVEC_MAKE_STRING(a); + } + + if (pid) { + a = strjoina("SYSLOG_PID=", pid); + iovec[n++] = IOVEC_MAKE_STRING(a); + } + + if (syslog_ts_len > 0) { + const size_t hlen = STRLEN("SYSLOG_TIMESTAMP="); + + t = newa(char, hlen + syslog_ts_len); + memcpy(t, "SYSLOG_TIMESTAMP=", hlen); + memcpy(t + hlen, syslog_ts, syslog_ts_len); + + iovec[n++] = IOVEC_MAKE(t, hlen + syslog_ts_len); + } + + msg_msg = strjoin("MESSAGE=", msg); + if (!msg_msg) { + log_oom(); + return; + } + iovec[n++] = IOVEC_MAKE_STRING(msg_msg); + + if (store_raw) { + const size_t hlen = STRLEN("SYSLOG_RAW="); + + msg_raw = new(char, hlen 
+ raw_len); + if (!msg_raw) { + log_oom(); + return; + } + + memcpy(msg_raw, "SYSLOG_RAW=", hlen); + memcpy(msg_raw + hlen, buf, raw_len); + + iovec[n++] = IOVEC_MAKE(msg_raw, hlen + raw_len); + } + + server_dispatch_message(s, iovec, n, m, context, tv, priority, 0); +} + +int server_open_syslog_socket(Server *s, const char *syslog_socket) { + int r; + + assert(s); + assert(syslog_socket); + + if (s->syslog_fd < 0) { + union sockaddr_union sa; + socklen_t sa_len; + + r = sockaddr_un_set_path(&sa.un, syslog_socket); + if (r < 0) + return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", syslog_socket); + sa_len = r; + + s->syslog_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->syslog_fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + (void) sockaddr_un_unlink(&sa.un); + + r = bind(s->syslog_fd, &sa.sa, sa_len); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + (void) chmod(sa.un.sun_path, 0666); + } else + (void) fd_nonblock(s->syslog_fd, true); + + r = setsockopt_int(s->syslog_fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + if (mac_selinux_use()) { + r = setsockopt_int(s->syslog_fd, SOL_SOCKET, SO_PASSSEC, true); + if (r < 0) + log_warning_errno(r, "SO_PASSSEC failed: %m"); + } + + r = setsockopt_int(s->syslog_fd, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return log_error_errno(r, "SO_TIMESTAMP failed: %m"); + + r = sd_event_add_io(s->event, &s->syslog_event_source, s->syslog_fd, EPOLLIN, server_process_datagram, s); + if (r < 0) + return log_error_errno(r, "Failed to add syslog server fd to event loop: %m"); + + r = sd_event_source_set_priority(s->syslog_event_source, SD_EVENT_PRIORITY_NORMAL+5); + if (r < 0) + return log_error_errno(r, "Failed to adjust syslog event source priority: %m"); + + return 0; +} + +void server_maybe_warn_forward_syslog_missed(Server *s) { + usec_t n; + + assert(s); 
+ + if (s->n_forward_syslog_missed <= 0) + return; + + n = now(CLOCK_MONOTONIC); + if (s->last_warn_forward_syslog_missed + WARN_FORWARD_SYSLOG_MISSED_USEC > n) + return; + + server_driver_message(s, 0, + "MESSAGE_ID=" SD_MESSAGE_FORWARD_SYSLOG_MISSED_STR, + LOG_MESSAGE("Forwarding to syslog missed %u messages.", + s->n_forward_syslog_missed), + NULL); + + s->n_forward_syslog_missed = 0; + s->last_warn_forward_syslog_missed = n; +} diff --git a/src/journal/journald-syslog.h b/src/journal/journald-syslog.h new file mode 100644 index 0000000..3bc3ffd --- /dev/null +++ b/src/journal/journald-syslog.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journald-server.h" + +int syslog_fixup_facility(int priority) _const_; + +size_t syslog_parse_identifier(const char **buf, char **identifier, char **pid); + +void server_forward_syslog(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred, const struct timeval *tv); + +void server_process_syslog_message(Server *s, const char *buf, size_t buf_len, const struct ucred *ucred, const struct timeval *tv, const char *label, size_t label_len); +int server_open_syslog_socket(Server *s, const char *syslog_socket); + +void server_maybe_warn_forward_syslog_missed(Server *s); diff --git a/src/journal/journald-wall.c b/src/journal/journald-wall.c new file mode 100644 index 0000000..79eaac1 --- /dev/null +++ b/src/journal/journald-wall.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "format-util.h" +#include "journald-server.h" +#include "journald-wall.h" +#include "process-util.h" +#include "string-util.h" +#include "wall.h" + +void server_forward_wall( + Server *s, + int priority, + const char *identifier, + const char *message, + const struct ucred *ucred) { + + _cleanup_free_ char *ident_buf = NULL, *l_buf = NULL; + const char *l; + int r; + + assert(s); + assert(message); + + if 
(LOG_PRI(priority) > s->max_level_wall) + return; + + if (ucred) { + if (!identifier) { + (void) pid_get_comm(ucred->pid, &ident_buf); + identifier = ident_buf; + } + + if (asprintf(&l_buf, "%s["PID_FMT"]: %s", strempty(identifier), ucred->pid, message) < 0) { + log_oom(); + return; + } + + l = l_buf; + + } else if (identifier) { + + l = l_buf = strjoin(identifier, ": ", message); + if (!l_buf) { + log_oom(); + return; + } + } else + l = message; + + r = wall(l, "systemd-journald", NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send wall message: %m"); +} diff --git a/src/journal/journald-wall.h b/src/journal/journald-wall.h new file mode 100644 index 0000000..3f98c35 --- /dev/null +++ b/src/journal/journald-wall.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "journald-server.h" + +void server_forward_wall(Server *s, int priority, const char *identifier, const char *message, const struct ucred *ucred); diff --git a/src/journal/journald.c b/src/journal/journald.c new file mode 100644 index 0000000..94aad05 --- /dev/null +++ b/src/journal/journald.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "format-util.h" +#include "journal-authenticate.h" +#include "journald-kmsg.h" +#include "journald-server.h" +#include "journald-syslog.h" +#include "process-util.h" +#include "sigbus.h" + +int main(int argc, char *argv[]) { + const char *namespace; + LogTarget log_target; + Server server; + int r; + + if (argc > 2) { + log_error("This program takes one or no arguments."); + return EXIT_FAILURE; + } + + namespace = argc > 1 ? empty_to_null(argv[1]) : NULL; + + log_set_facility(LOG_SYSLOG); + + if (namespace) + /* If we run for a log namespace, then we ourselves can log to the main journald. 
*/ + log_setup(); + else { + /* So here's the deal if we run as the main journald: we can't be considered as regular + * daemon when it comes to logging hence LOG_TARGET_AUTO won't do the right thing for + * us. Hence explicitly log to the console if we're started from a console or to kmsg + * otherwise. */ + log_target = isatty(STDERR_FILENO) > 0 ? LOG_TARGET_CONSOLE : LOG_TARGET_KMSG; + + log_set_prohibit_ipc(true); /* better safe than sorry */ + log_set_target(log_target); + log_parse_environment(); + log_open(); + } + + umask(0022); + + sigbus_install(); + + r = server_init(&server, namespace); + if (r < 0) + goto finish; + + server_vacuum(&server, false); + server_flush_to_var(&server, true); + server_flush_dev_kmsg(&server); + + if (server.namespace) + log_debug("systemd-journald running as PID "PID_FMT" for namespace '%s'.", getpid_cached(), server.namespace); + else + log_debug("systemd-journald running as PID "PID_FMT" for the system.", getpid_cached()); + + server_driver_message(&server, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_START_STR, + LOG_MESSAGE("Journal started"), + NULL); + + /* Make sure to send the usage message *after* flushing the + * journal so entries from the runtime journals are ordered + * before this message. See #4190 for some details. */ + server_space_usage_message(&server, NULL); + + for (;;) { + usec_t t = USEC_INFINITY, n; + + r = sd_event_get_state(server.event); + if (r < 0) { + log_error_errno(r, "Failed to get event loop state: %m"); + goto finish; + } + if (r == SD_EVENT_FINISHED) + break; + + n = now(CLOCK_REALTIME); + + if (server.max_retention_usec > 0 && server.oldest_file_usec > 0) { + + /* The retention time is reached, so let's vacuum! 
*/ + if (server.oldest_file_usec + server.max_retention_usec < n) { + log_info("Retention time reached, rotating."); + server_rotate(&server); + server_vacuum(&server, false); + continue; + } + + /* Calculate when to rotate the next time */ + t = server.oldest_file_usec + server.max_retention_usec - n; + } + +#if HAVE_GCRYPT + if (server.system_journal) { + usec_t u; + + if (journal_file_next_evolve_usec(server.system_journal, &u)) { + if (n >= u) + t = 0; + else + t = MIN(t, u - n); + } + } +#endif + + r = sd_event_run(server.event, t); + if (r < 0) { + log_error_errno(r, "Failed to run event loop: %m"); + goto finish; + } + + server_maybe_append_tags(&server); + server_maybe_warn_forward_syslog_missed(&server); + } + + if (server.namespace) + log_debug("systemd-journald stopped as PID "PID_FMT" for namespace '%s'.", getpid_cached(), server.namespace); + else + log_debug("systemd-journald stopped as PID "PID_FMT" for the system.", getpid_cached()); + + server_driver_message(&server, 0, + "MESSAGE_ID=" SD_MESSAGE_JOURNAL_STOP_STR, + LOG_MESSAGE("Journal stopped"), + NULL); + +finish: + server_done(&server); + + return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/journal/journald.conf b/src/journal/journald.conf new file mode 100644 index 0000000..7b9e232 --- /dev/null +++ b/src/journal/journald.conf @@ -0,0 +1,49 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/journald.conf.d/ directory. The latter is generally +# recommended. 
Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/journald.conf' to display the full config. +# +# See journald.conf(5) for details. + +[Journal] +#Storage=auto +#Compress=yes +#Seal=yes +#SplitMode=uid +#SyncIntervalSec=5m +#RateLimitIntervalSec=30s +#RateLimitBurst=10000 +#SystemMaxUse= +#SystemKeepFree= +#SystemMaxFileSize= +#SystemMaxFiles=100 +#RuntimeMaxUse= +#RuntimeKeepFree= +#RuntimeMaxFileSize= +#RuntimeMaxFiles=100 +#MaxRetentionSec= +#MaxFileSec=1month +#ForwardToSyslog=no +#ForwardToKMsg=no +#ForwardToConsole=no +#ForwardToWall=yes +#TTYPath=/dev/console +#MaxLevelStore=debug +#MaxLevelSyslog=debug +#MaxLevelKMsg=notice +#MaxLevelConsole=info +#MaxLevelWall=emerg +#LineMax=48K +#ReadKMsg=yes +#Audit=yes diff --git a/src/journal/meson.build b/src/journal/meson.build new file mode 100644 index 0000000..36600bf --- /dev/null +++ b/src/journal/meson.build @@ -0,0 +1,182 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +sources = files( + 'journald-audit.c', + 'journald-client.c', + 'journald-console.c', + 'journald-context.c', + 'journald-kmsg.c', + 'journald-native.c', + 'journald-rate-limit.c', + 'journald-server.c', + 'journald-stream.c', + 'journald-syslog.c', + 'journald-wall.c', +) + +sources += custom_target( + 'journald-gperf.c', + input : 'journald-gperf.gperf', + output : 'journald-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +libjournal_core = static_library( + 'journal-core', + sources, + include_directories : includes, + dependencies: [threads, + userspace], + build_by_default : false) + +if get_option('link-journalctl-shared') + journalctl_link_with = [libshared] +else + journalctl_link_with = [ + libbasic_gcrypt, + libshared_static, + libsystemd_static, + ] +endif + +journal_test_template = test_template + { + 'link_with' : [ + libjournal_core, + libshared, + ], +} + +journal_fuzz_template = fuzz_template + 
{ + 'link_with' : [ + libjournal_core, + libshared, + ], + 'dependencies' : libselinux, +} + +executables += [ + libexec_template + { + 'name' : 'systemd-journald', + 'sources' : files('journald.c'), + 'link_with' : [ + libjournal_core, + libshared, + ], + 'dependencies' : [ + liblz4, + libselinux, + libxz, + libzstd, + threads, + ], + }, + libexec_template + { + 'name' : 'systemd-bsod', + 'public' : true, + 'conditions' : ['HAVE_QRENCODE'], + 'sources' : files('bsod.c'), + 'link_with' : libshared, + 'dependencies' : libqrencode, + }, + executable_template + { + 'name' : 'systemd-cat', + 'public' : true, + 'sources' : files('cat.c'), + 'link_with' : [ + libjournal_core, + libshared, + ], + 'dependencies' : threads, + }, + executable_template + { + 'name' : 'journalctl', + 'public' : true, + 'sources' : files('journalctl.c'), + 'link_with' : journalctl_link_with, + 'dependencies' : [ + libdl, + liblz4, + libxz, + libzstd, + threads, + ], + }, + journal_test_template + { + 'sources' : files('test-journald-config.c'), + 'dependencies' : [ + liblz4, + libselinux, + libxz, + ], + }, + journal_test_template + { + 'sources' : files('test-journald-syslog.c'), + 'dependencies' : [ + liblz4, + libselinux, + libxz, + threads, + ], + }, + journal_test_template + { + 'sources' : files('test-journald-tables.c'), + 'dependencies' : [ + libselinux, + ], + }, + journal_fuzz_template + { + 'sources' : files( + 'fuzz-journald-audit.c', + 'fuzz-journald.c', + ), + }, + journal_fuzz_template + { + 'sources' : files( + 'fuzz-journald-kmsg.c', + 'fuzz-journald.c', + ), + }, + journal_fuzz_template + { + 'sources' : files( + 'fuzz-journald-native.c', + 'fuzz-journald.c', + ), + }, + journal_fuzz_template + { + 'sources' : files( + 'fuzz-journald-native-fd.c', + 'fuzz-journald.c', + ), + }, + journal_fuzz_template + { + 'sources' : files( + 'fuzz-journald-stream.c', + 'fuzz-journald.c', + ), + }, + journal_fuzz_template + { + 'sources' : files( + 'fuzz-journald-syslog.c', + 
'fuzz-journald.c', + ), + }, +] + +if install_sysconfdir_samples + install_data('journald.conf', + install_dir : pkgconfigfiledir) +endif + +if get_option('create-log-dirs') + install_emptydir('/var/log/journal', + install_mode : 'rwxr-xr-x') + if get_option('adm-group') + meson.add_install_script( + sh, '-c', + 'setfacl -nm g:adm:rx,d:g:adm:rx $DESTDIR/var/log/journal || :') + endif + if get_option('wheel-group') + meson.add_install_script( + sh, '-c', + 'setfacl -nm g:wheel:rx,d:g:wheel:rx $DESTDIR/var/log/journal || :') + endif +endif diff --git a/src/journal/test-journald-config.c b/src/journal/test-journald-config.c new file mode 100644 index 0000000..1a6c531 --- /dev/null +++ b/src/journal/test-journald-config.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "journald-server.h" +#include "tests.h" + +#define _COMPRESS_PARSE_CHECK(str, enab, thresh, varname) \ + do { \ + JournalCompressOptions varname = {true, 111}; \ + config_parse_compress("", "", 0, "", 0, "", 0, str, \ + &varname, NULL); \ + assert_se((enab) == varname.enabled); \ + if (varname.enabled) \ + assert_se((thresh) == varname.threshold_bytes); \ + } while (0) + +#define COMPRESS_PARSE_CHECK(str, enabled, threshold) \ + _COMPRESS_PARSE_CHECK(str, enabled, threshold, conf##__COUNTER__) + +TEST(config_compress) { + COMPRESS_PARSE_CHECK("yes", true, 111); + COMPRESS_PARSE_CHECK("no", false, 111); + COMPRESS_PARSE_CHECK("y", true, 111); + COMPRESS_PARSE_CHECK("n", false, 111); + COMPRESS_PARSE_CHECK("true", true, 111); + COMPRESS_PARSE_CHECK("false", false, 111); + COMPRESS_PARSE_CHECK("t", true, 111); + COMPRESS_PARSE_CHECK("f", false, 111); + COMPRESS_PARSE_CHECK("on", true, 111); + COMPRESS_PARSE_CHECK("off", false, 111); + + /* Weird size/bool overlapping case. We preserve backward compatibility instead of assuming these are byte + * counts. 
*/ + COMPRESS_PARSE_CHECK("1", true, 111); + COMPRESS_PARSE_CHECK("0", false, 111); + + /* IEC sizing */ + COMPRESS_PARSE_CHECK("1B", true, 1); + COMPRESS_PARSE_CHECK("1K", true, 1024); + COMPRESS_PARSE_CHECK("1M", true, 1024 * 1024); + COMPRESS_PARSE_CHECK("1G", true, 1024 * 1024 * 1024); + + /* Invalid Case */ + COMPRESS_PARSE_CHECK("-1", true, 111); + COMPRESS_PARSE_CHECK("blah blah", true, 111); + COMPRESS_PARSE_CHECK("", true, UINT64_MAX); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/journal/test-journald-syslog.c b/src/journal/test-journald-syslog.c new file mode 100644 index 0000000..84cfcef --- /dev/null +++ b/src/journal/test-journald-syslog.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "journald-syslog.h" +#include "macro.h" +#include "string-util.h" +#include "syslog-util.h" +#include "tests.h" + +static void test_syslog_parse_identifier_one(const char *str, + const char *ident, const char *pid, const char *rest, int ret) { + const char *buf = str; + _cleanup_free_ char *ident2 = NULL, *pid2 = NULL; + int ret2; + + ret2 = syslog_parse_identifier(&buf, &ident2, &pid2); + + assert_se(ret == ret2); + assert_se(ident == ident2 || streq_ptr(ident, ident2)); + assert_se(pid == pid2 || streq_ptr(pid, pid2)); + assert_se(streq(buf, rest)); +} + +static void test_syslog_parse_priority_one(const char *str, bool with_facility, int priority, int ret) { + int priority2 = 0, ret2; + + ret2 = syslog_parse_priority(&str, &priority2, with_facility); + + assert_se(ret == ret2); + if (ret2 == 1) + assert_se(priority == priority2); +} + +TEST(syslog_parse_identifier) { + test_syslog_parse_identifier_one("pidu[111]: xxx", "pidu", "111", "xxx", 11); + test_syslog_parse_identifier_one("pidu: xxx", "pidu", NULL, "xxx", 6); + test_syslog_parse_identifier_one("pidu: xxx", "pidu", NULL, " xxx", 6); + test_syslog_parse_identifier_one("pidu xxx", NULL, NULL, "pidu xxx", 0); + test_syslog_parse_identifier_one(" pidu 
xxx", NULL, NULL, " pidu xxx", 0); + test_syslog_parse_identifier_one("", NULL, NULL, "", 0); + test_syslog_parse_identifier_one(" ", NULL, NULL, " ", 0); + test_syslog_parse_identifier_one(":", "", NULL, "", 1); + test_syslog_parse_identifier_one(": ", "", NULL, " ", 2); + test_syslog_parse_identifier_one(" :", "", NULL, "", 2); + test_syslog_parse_identifier_one(" pidu:", "pidu", NULL, "", 8); + test_syslog_parse_identifier_one("pidu:", "pidu", NULL, "", 5); + test_syslog_parse_identifier_one("pidu: ", "pidu", NULL, "", 6); + test_syslog_parse_identifier_one("pidu : ", NULL, NULL, "pidu : ", 0); +} + +TEST(syslog_parse_priority) { + test_syslog_parse_priority_one("", false, 0, 0); + test_syslog_parse_priority_one("<>", false, 0, 0); + test_syslog_parse_priority_one("<>aaa", false, 0, 0); + test_syslog_parse_priority_one("", false, 0, 0); + test_syslog_parse_priority_one("aaa", false, 0, 0); + test_syslog_parse_priority_one(" ", false, 0, 0); + test_syslog_parse_priority_one(" aaa", false, 0, 0); + test_syslog_parse_priority_one(" aaa", false, 0, 0); + test_syslog_parse_priority_one(" <1>", false, 0, 0); + test_syslog_parse_priority_one("<1>", false, 1, 1); + test_syslog_parse_priority_one("<7>", false, 7, 1); + test_syslog_parse_priority_one("<8>", false, 0, 0); + test_syslog_parse_priority_one("<9>", true, 9, 1); + test_syslog_parse_priority_one("<22>", true, 22, 1); + test_syslog_parse_priority_one("<111>", false, 0, 0); + test_syslog_parse_priority_one("<111>", true, 111, 1); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/journal/test-journald-tables.c b/src/journal/test-journald-tables.c new file mode 100644 index 0000000..d726c3f --- /dev/null +++ b/src/journal/test-journald-tables.c @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "journald-server.h" +#include "test-tables.h" +#include "tests.h" + +int main(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + test_table(split_mode, SPLIT); + test_table(storage, 
STORAGE); + + return EXIT_SUCCESS; +} diff --git a/src/kernel-install/50-depmod.install b/src/kernel-install/50-depmod.install new file mode 100755 index 0000000..88f858f --- /dev/null +++ b/src/kernel-install/50-depmod.install @@ -0,0 +1,53 @@ +#!/bin/sh +# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*- +# ex: ts=8 sw=4 sts=4 et filetype=sh +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# systemd is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with systemd; If not, see . 
+
+set -e
+
+# kernel-install plugin: keeps the depmod-generated module index files in
+# /lib/modules/$KERNEL_VERSION in sync with kernel installation and removal.
+#
+# $1: command ("add" or "remove"; anything else is ignored)
+# $2: kernel version
+COMMAND="${1:?}"
+KERNEL_VERSION="${2:?}"
+
+# Verbosity level. Default to 0 so that the numeric tests below do not print a
+# shell error when the variable is unset or empty (e.g. when the plugin is
+# invoked by hand).
+VERBOSE="${KERNEL_INSTALL_VERBOSE:-0}"
+
+# Nothing to do if the module directory cannot be written to.
+[ -w "/lib/modules" ] || exit 0
+
+case "$COMMAND" in
+    add)
+        # Skip silently if this kernel ships no modules or depmod is unavailable.
+        [ -d "/lib/modules/$KERNEL_VERSION/kernel" ] || exit 0
+        command -v depmod >/dev/null || exit 0
+        [ "$VERBOSE" -gt 0 ] && echo "+depmod -a $KERNEL_VERSION"
+        exec depmod -a "$KERNEL_VERSION"
+        ;;
+    remove)
+        [ "$VERBOSE" -gt 0 ] && \
+            echo "Removing /lib/modules/${KERNEL_VERSION}/modules.dep and associated files"
+        exec rm -f \
+            "/lib/modules/$KERNEL_VERSION/modules.alias" \
+            "/lib/modules/$KERNEL_VERSION/modules.alias.bin" \
+            "/lib/modules/$KERNEL_VERSION/modules.builtin.bin" \
+            "/lib/modules/$KERNEL_VERSION/modules.builtin.alias.bin" \
+            "/lib/modules/$KERNEL_VERSION/modules.dep" \
+            "/lib/modules/$KERNEL_VERSION/modules.dep.bin" \
+            "/lib/modules/$KERNEL_VERSION/modules.devname" \
+            "/lib/modules/$KERNEL_VERSION/modules.softdep" \
+            "/lib/modules/$KERNEL_VERSION/modules.symbols" \
+            "/lib/modules/$KERNEL_VERSION/modules.symbols.bin"
+        ;;
+    *)
+        exit 0
+        ;;
+esac
diff --git a/src/kernel-install/60-ukify.install.in b/src/kernel-install/60-ukify.install.in
new file mode 100755
index 0000000..be1e21b
--- /dev/null
+++ b/src/kernel-install/60-ukify.install.in
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# -*- mode: python-mode -*-
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+#
+# systemd is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+# +# You should have received a copy of the GNU Lesser General Public License +# along with systemd; If not, see . + +# pylint: disable=import-outside-toplevel,consider-using-with,disable=redefined-builtin + +import argparse +import os +import runpy +import shlex +from shutil import which +from pathlib import Path +from typing import Optional + +__version__ = '{{PROJECT_VERSION}} ({{GIT_VERSION}})' + +try: + VERBOSE = int(os.environ['KERNEL_INSTALL_VERBOSE']) > 0 +except (KeyError, ValueError): + VERBOSE = False + +# Override location of ukify and the boot stub for testing and debugging. +UKIFY = os.getenv('KERNEL_INSTALL_UKIFY', which('ukify')) +BOOT_STUB = os.getenv('KERNEL_INSTALL_BOOT_STUB') + + +def shell_join(cmd): + # TODO: drop in favour of shlex.join once shlex.join supports pathlib.Path. + return ' '.join(shlex.quote(str(x)) for x in cmd) + +def log(*args, **kwargs): + if VERBOSE: + print(*args, **kwargs) + +def path_is_readable(p: Path, dir=False) -> None: + """Verify access to a file or directory.""" + try: + p.open().close() + except IsADirectoryError: + if dir: + return + raise + +def mandatory_variable(name): + try: + return os.environ[name] + except KeyError as e: + raise KeyError(f'${name} must be set in the environment') from e + +def parse_args(args=None): + p = argparse.ArgumentParser( + description='kernel-install plugin to build a Unified Kernel Image', + allow_abbrev=False, + usage='60-ukify.install COMMAND KERNEL_VERSION ENTRY_DIR KERNEL_IMAGE INITRD…', + ) + + # Suppress printing of usage synopsis on errors + p.error = lambda message: p.exit(2, f'{p.prog}: error: {message}\n') + + p.add_argument('command', + metavar='COMMAND', + help="The action to perform. 
Only 'add' is supported.") + p.add_argument('kernel_version', + metavar='KERNEL_VERSION', + help='Kernel version string') + p.add_argument('entry_dir', + metavar='ENTRY_DIR', + type=Path, + nargs='?', + help='Type#1 entry directory (ignored)') + p.add_argument('kernel_image', + metavar='KERNEL_IMAGE', + type=Path, + nargs='?', + help='Kernel binary') + p.add_argument('initrd', + metavar='INITRD…', + type=Path, + nargs='*', + help='Initrd files') + p.add_argument('--version', + action='version', + version=f'systemd {__version__}') + + opts = p.parse_args(args) + + if opts.command == 'add': + opts.staging_area = Path(mandatory_variable('KERNEL_INSTALL_STAGING_AREA')) + path_is_readable(opts.staging_area, dir=True) + + opts.entry_token = mandatory_variable('KERNEL_INSTALL_ENTRY_TOKEN') + opts.machine_id = mandatory_variable('KERNEL_INSTALL_MACHINE_ID') + + return opts + +def we_are_wanted() -> bool: + KERNEL_INSTALL_LAYOUT = os.getenv('KERNEL_INSTALL_LAYOUT') + + if KERNEL_INSTALL_LAYOUT != 'uki': + log(f'{KERNEL_INSTALL_LAYOUT=}, quitting.') + return False + + KERNEL_INSTALL_UKI_GENERATOR = os.getenv('KERNEL_INSTALL_UKI_GENERATOR') or 'ukify' + + if KERNEL_INSTALL_UKI_GENERATOR != 'ukify': + log(f'{KERNEL_INSTALL_UKI_GENERATOR=}, quitting.') + return False + + log('KERNEL_INSTALL_LAYOUT and KERNEL_INSTALL_UKI_GENERATOR are good') + return True + + +def input_file_location( + filename: str, + *search_directories: str) -> Optional[Path]: + + if root := os.getenv('KERNEL_INSTALL_CONF_ROOT'): + search_directories = (root,) + elif not search_directories: + # This is the default search path. 
+ search_directories = ('/etc/kernel', + '/usr/lib/kernel') + + for dir in search_directories: + p = Path(dir) / filename + if p.exists(): + return p + return None + + +def uki_conf_location() -> Optional[Path]: + return input_file_location('uki.conf', + '/etc/kernel') + + +def devicetree_config_location() -> Optional[Path]: + return input_file_location('devicetree') + + +def devicetree_file_location(opts) -> Optional[Path]: + # This mirrors the logic in 90-loaderentry.install. Keep in sync. + configfile = devicetree_config_location() + if configfile is None: + return None + + devicetree = configfile.read_text().strip() + if not devicetree: + raise ValueError(f'{configfile!r} is empty') + + path = input_file_location( + devicetree, + f'/usr/lib/firmware/{opts.kernel_version}/device-tree', + f'/usr/lib/linux-image-{opts.kernel_version}', + f'/usr/lib/modules/{opts.kernel_version}/dtb', + ) + if path is None: + raise FileNotFoundError(f'DeviceTree file {devicetree} not found') + return path + + +def kernel_cmdline_base() -> list[str]: + path = input_file_location('cmdline') + if path: + return path.read_text().split() + + # If we read /proc/cmdline, we need to do some additional filtering. + options = Path('/proc/cmdline').read_text().split() + return [opt for opt in options + if not opt.startswith(('BOOT_IMAGE=', 'initrd='))] + + +def kernel_cmdline(opts) -> str: + options = kernel_cmdline_base() + + # If the boot entries are named after the machine ID, then suffix the kernel + # command line with the machine ID we use, so that the machine ID remains + # stable, even during factory reset, in the initrd (where the system's machine + # ID is not directly accessible yet), and if the root file system is volatile. 
+ if (opts.entry_token == opts.machine_id and + not any(opt.startswith('systemd.machine_id=') for opt in options)): + options += [f'systemd.machine_id={opts.machine_id}'] + + # TODO: we unconditionally set the cmdline here, ignoring the setting in + # the config file. Should we not do that? + + # Prepend a space so that '@' does not get misinterpreted + return ' ' + ' '.join(options) + + +def initrd_list(opts) -> list[Path]: + microcode = sorted(opts.staging_area.glob('microcode*')) + initrd = sorted(opts.staging_area.glob('initrd*')) + + # Order taken from 90-loaderentry.install + return [*microcode, *opts.initrd, *initrd] + + +def call_ukify(opts): + # Punish me harder. + # We want this: + # ukify = importlib.machinery.SourceFileLoader('ukify', UKIFY).load_module() + # but it throws a DeprecationWarning. + # https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path + # https://github.com/python/cpython/issues/65635 + # offer "explanations", but to actually load a python file without a .py extension, + # the "solution" is 4+ incomprehensible lines. + # The solution with runpy gives a dictionary, which isn't great, but will do. + ukify = runpy.run_path(UKIFY, run_name='ukify') + + # Create "empty" namespace. We want to override just a few settings, so it + # doesn't make sense to configure everything. We pretend to parse an empty + # argument set to prepopulate the namespace with the defaults. + opts2 = ukify['create_parser']().parse_args(['build']) + + opts2.config = uki_conf_location() + opts2.uname = opts.kernel_version + opts2.linux = opts.kernel_image + opts2.initrd = initrd_list(opts) + # Note that 'uki.efi' is the name required by 90-uki-copy.install. 
+ opts2.output = opts.staging_area / 'uki.efi' + + if devicetree := devicetree_file_location(opts): + opts2.devicetree = devicetree + + opts2.cmdline = kernel_cmdline(opts) + if BOOT_STUB: + opts2.stub = BOOT_STUB + + # opts2.summary = True + + ukify['apply_config'](opts2) + ukify['finalize_options'](opts2) + ukify['check_inputs'](opts2) + ukify['make_uki'](opts2) + + log(f'{opts2.output} has been created') + + +def main(): + opts = parse_args() + if opts.command != 'add': + return + if not we_are_wanted(): + return + + call_ukify(opts) + + +if __name__ == '__main__': + main() diff --git a/src/kernel-install/90-loaderentry.install.in b/src/kernel-install/90-loaderentry.install.in new file mode 100755 index 0000000..a52dd81 --- /dev/null +++ b/src/kernel-install/90-loaderentry.install.in @@ -0,0 +1,210 @@ +#!/bin/sh +# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*- +# ex: ts=8 sw=4 sts=4 et filetype=sh +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# systemd is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with systemd; If not, see . 
+ +set -e + +COMMAND="${1:?}" +KERNEL_VERSION="${2:?}" +ENTRY_DIR_ABS="${3:?}" +KERNEL_IMAGE="$4" +INITRD_OPTIONS_SHIFT=4 + +[ "$KERNEL_INSTALL_LAYOUT" = "bls" ] || exit 0 + +MACHINE_ID="${KERNEL_INSTALL_MACHINE_ID:?}" +ENTRY_TOKEN="${KERNEL_INSTALL_ENTRY_TOKEN:?}" +BOOT_ROOT="${KERNEL_INSTALL_BOOT_ROOT:?}" + +[ -n "$BOOT_MNT" ] || BOOT_MNT="$(stat -c %m "$BOOT_ROOT")" +if [ "$BOOT_MNT" = '/' ]; then + ENTRY_DIR="$ENTRY_DIR_ABS" +else + ENTRY_DIR="${ENTRY_DIR_ABS#"$BOOT_MNT"}" +fi + +KERNEL_DEST="$ENTRY_DIR_ABS/linux" +KERNEL_ENTRY="$ENTRY_DIR/linux" +LOADER_ENTRY="$BOOT_ROOT/loader/entries/$ENTRY_TOKEN-$KERNEL_VERSION.conf" + +case "$COMMAND" in + remove) + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && \ + echo "Removing ${LOADER_ENTRY%.conf}*.conf" + exec rm -f \ + "$LOADER_ENTRY" \ + "${LOADER_ENTRY%.conf}"*".conf" + ;; + add) + ;; + *) + exit 0 + ;; +esac + +if [ -f /etc/os-release ]; then + # shellcheck source=/dev/null + . /etc/os-release +elif [ -f /usr/lib/os-release ]; then + # shellcheck source=/dev/null + . /usr/lib/os-release +fi + +[ -n "$PRETTY_NAME" ] || PRETTY_NAME="Linux $KERNEL_VERSION" + +SORT_KEY="$IMAGE_ID" +[ -z "$SORT_KEY" ] && SORT_KEY="$ID" + +if [ -n "$KERNEL_INSTALL_CONF_ROOT" ]; then + if [ -f "$KERNEL_INSTALL_CONF_ROOT/cmdline" ]; then + BOOT_OPTIONS="$(tr -s "$IFS" ' ' <"$KERNEL_INSTALL_CONF_ROOT/cmdline")" + fi +elif [ -f /etc/kernel/cmdline ]; then + BOOT_OPTIONS="$(tr -s "$IFS" ' ' &2 + exit 1 + fi + LOADER_ENTRY="${LOADER_ENTRY%.conf}+$TRIES.conf" +fi + +if ! [ -d "$ENTRY_DIR_ABS" ]; then + echo "Error: entry directory '$ENTRY_DIR_ABS' does not exist" >&2 + exit 1 +fi + +install -m 0644 "$KERNEL_IMAGE" "$KERNEL_DEST" || { + echo "Error: could not copy '$KERNEL_IMAGE' to '$KERNEL_DEST'." 
>&2 + exit 1 +} +chown root:root "$KERNEL_DEST" || : + +if [ -n "$KERNEL_INSTALL_CONF_ROOT" ]; then + if [ -f "$KERNEL_INSTALL_CONF_ROOT/devicetree" ]; then + read -r DEVICETREE <"$KERNEL_INSTALL_CONF_ROOT/devicetree" + fi +elif [ -f /etc/kernel/devicetree ]; then + read -r DEVICETREE &2 + exit 1 + } + + DEVICETREE_DEST="$ENTRY_DIR_ABS/${DEVICETREE##*/}" + DEVICETREE_ENTRY="$ENTRY_DIR/${DEVICETREE##*/}" + + install -m 0644 "$DEVICETREE_SRC" "$DEVICETREE_DEST" || { + echo "Error: could not copy '$DEVICETREE_SRC' to '$DEVICETREE_DEST'." >&2 + exit 1 + } + chown root:root "$DEVICETREE_DEST" || : +fi + +shift "$INITRD_OPTIONS_SHIFT" +# All files listed as arguments, and staged files starting with "initrd" are installed as initrds. +for initrd in "${KERNEL_INSTALL_STAGING_AREA}"/microcode* "${@}" "${KERNEL_INSTALL_STAGING_AREA}"/initrd*; do + [ -f "$initrd" ] || { + case "$initrd" in + "${KERNEL_INSTALL_STAGING_AREA}/initrd*" | "${KERNEL_INSTALL_STAGING_AREA}/microcode*") + continue ;; + esac + echo "Error: '$initrd' is not a file." >&2 + exit 1 + } + + initrd_basename="${initrd##*/}" + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "Installing $ENTRY_DIR_ABS/$initrd_basename" + install -m 0644 "$initrd" "$ENTRY_DIR_ABS/$initrd_basename" || { + echo "Error: could not copy '$initrd' to '$ENTRY_DIR_ABS/$initrd_basename'." >&2 + exit 1 + } + chown root:root "$ENTRY_DIR_ABS/$initrd_basename" || : +done + +mkdir -p "${LOADER_ENTRY%/*}" || { + echo "Error: could not create loader entry directory '${LOADER_ENTRY%/*}'." 
>&2 + exit 1 +} + +[ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "Creating $LOADER_ENTRY" +{ + echo "# Boot Loader Specification type#1 entry" + echo "# File created by $0 (systemd {{GIT_VERSION}})" + echo "title $PRETTY_NAME" + echo "version $KERNEL_VERSION" + if [ "$ENTRY_TOKEN" = "$MACHINE_ID" ]; then + # See similar logic above for the systemd.machine_id= kernel command line option + echo "machine-id $MACHINE_ID" + fi + [ -n "$SORT_KEY" ] && echo "sort-key $SORT_KEY" + echo "options $BOOT_OPTIONS" + echo "linux $KERNEL_ENTRY" + [ -n "$DEVICETREE_ENTRY" ] && echo "devicetree $DEVICETREE_ENTRY" + + have_initrd= + for initrd in "${KERNEL_INSTALL_STAGING_AREA}"/microcode* "${@}" "${KERNEL_INSTALL_STAGING_AREA}"/initrd*; do + [ -f "$initrd" ] || continue + echo "initrd $ENTRY_DIR/${initrd##*/}" + have_initrd=yes + done + + # Try "initrd", generated by dracut in its kernel-install hook, if no initrds were supplied + [ -z "$have_initrd" ] && [ -f "$ENTRY_DIR_ABS/initrd" ] && echo "initrd $ENTRY_DIR/initrd" + : +} >"$LOADER_ENTRY" || { + echo "Error: could not create loader entry '$LOADER_ENTRY'." >&2 + exit 1 +} +exit 0 diff --git a/src/kernel-install/90-uki-copy.install b/src/kernel-install/90-uki-copy.install new file mode 100755 index 0000000..c66c097 --- /dev/null +++ b/src/kernel-install/90-uki-copy.install @@ -0,0 +1,97 @@ +#!/bin/sh +# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*- +# ex: ts=8 sw=4 sts=4 et filetype=sh +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# systemd is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with systemd; If not, see . + +set -e + +COMMAND="${1:?}" +KERNEL_VERSION="${2:?}" +# shellcheck disable=SC2034 +ENTRY_DIR_ABS="$3" +KERNEL_IMAGE="$4" + +[ "$KERNEL_INSTALL_LAYOUT" = "uki" ] || exit 0 + +ENTRY_TOKEN="$KERNEL_INSTALL_ENTRY_TOKEN" +BOOT_ROOT="$KERNEL_INSTALL_BOOT_ROOT" + +UKI_DIR="$BOOT_ROOT/EFI/Linux" + +case "$COMMAND" in + remove) + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && \ + echo "Removing $UKI_DIR/$ENTRY_TOKEN-$KERNEL_VERSION*.efi" + exec rm -f \ + "$UKI_DIR/$ENTRY_TOKEN-$KERNEL_VERSION.efi" \ + "$UKI_DIR/$ENTRY_TOKEN-$KERNEL_VERSION+"*".efi" + ;; + add) + ;; + *) + exit 0 + ;; +esac + +if ! [ -d "$UKI_DIR" ]; then + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "creating $UKI_DIR" + mkdir -p "$UKI_DIR" +fi + +TRIES_FILE="${KERNEL_INSTALL_CONF_ROOT:-/etc/kernel}/tries" + +if [ -f "$TRIES_FILE" ]; then + read -r TRIES <"$TRIES_FILE" + if ! echo "$TRIES" | grep -q '^[0-9][0-9]*$'; then + echo "$TRIES_FILE does not contain an integer." >&2 + exit 1 + fi + UKI_FILE="$UKI_DIR/$ENTRY_TOKEN-$KERNEL_VERSION+$TRIES.efi" +else + UKI_FILE="$UKI_DIR/$ENTRY_TOKEN-$KERNEL_VERSION.efi" +fi + +# If there is a UKI named uki.efi on the staging area use that, if not use what +# was passed in as $KERNEL_IMAGE but insist it has a .efi extension +if [ -f "$KERNEL_INSTALL_STAGING_AREA/uki.efi" ]; then + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "Installing $KERNEL_INSTALL_STAGING_AREA/uki.efi as $UKI_FILE" + install -m 0644 "$KERNEL_INSTALL_STAGING_AREA/uki.efi" "$UKI_FILE" || { + echo "Error: could not copy '$KERNEL_INSTALL_STAGING_AREA/uki.efi' to '$UKI_FILE'." >&2 + exit 1 + } +elif [ -n "$KERNEL_IMAGE" ]; then + [ -f "$KERNEL_IMAGE" ] || { + echo "Error: UKI '$KERNEL_IMAGE' not a file." >&2 + exit 1 + } + [ "$KERNEL_IMAGE" != "${KERNEL_IMAGE%*.efi}.efi" ] && { + echo "Error: $KERNEL_IMAGE is missing .efi suffix." 
>&2 + exit 1 + } + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "Installing $KERNEL_IMAGE as $UKI_FILE" + install -m 0644 "$KERNEL_IMAGE" "$UKI_FILE" || { + echo "Error: could not copy '$KERNEL_IMAGE' to '$UKI_FILE'." >&2 + exit 1 + } +else + [ "$KERNEL_INSTALL_VERBOSE" -gt 0 ] && echo "No UKI available. Nothing to do." + exit 0 +fi +chown root:root "$UKI_FILE" || : + +exit 0 diff --git a/src/kernel-install/install.conf b/src/kernel-install/install.conf new file mode 100644 index 0000000..4520c01 --- /dev/null +++ b/src/kernel-install/install.conf @@ -0,0 +1,12 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# See kernel-install(8) for details. + +#layout=bls|other|... +#initrd_generator=dracut|... +#uki_generator=ukify|... diff --git a/src/kernel-install/kernel-install.c b/src/kernel-install/kernel-install.c new file mode 100644 index 0000000..14ae1a8 --- /dev/null +++ b/src/kernel-install/kernel-install.c @@ -0,0 +1,1743 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "boot-entry.h" +#include "build.h" +#include "chase.h" +#include "conf-files.h" +#include "dirent-util.h" +#include "env-file.h" +#include "env-util.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "find-esp.h" +#include "format-table.h" +#include "fs-util.h" +#include "id128-util.h" +#include "image-policy.h" +#include "kernel-image.h" +#include "main-func.h" +#include "mkdir.h" +#include "mount-util.h" +#include "parse-argument.h" +#include "path-util.h" +#include "pretty-print.h" +#include "recurse-dir.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "verbs.h" + 
+static bool arg_verbose = false;
+static char *arg_esp_path = NULL;
+static char *arg_xbootldr_path = NULL;
+static int arg_make_entry_directory = -1; /* tristate */
+static PagerFlags arg_pager_flags = 0;
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF;
+static char *arg_root = NULL;
+static char *arg_image = NULL;
+static ImagePolicy *arg_image_policy = NULL;
+static bool arg_legend = true;
+
+STATIC_DESTRUCTOR_REGISTER(arg_esp_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_xbootldr_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
+
+typedef enum Action {
+        ACTION_ADD,
+        ACTION_REMOVE,
+        ACTION_INSPECT,
+        _ACTION_MAX,
+        _ACTION_INVALID = -EINVAL,
+} Action;
+
+typedef enum Layout {
+        LAYOUT_AUTO,
+        LAYOUT_UKI,
+        LAYOUT_BLS,
+        LAYOUT_OTHER,
+        _LAYOUT_MAX,
+        _LAYOUT_INVALID = -EINVAL,
+} Layout;
+
+static const char * const layout_table[_LAYOUT_MAX] = {
+        [LAYOUT_AUTO] = "auto",
+        [LAYOUT_UKI] = "uki",
+        [LAYOUT_BLS] = "bls",
+        [LAYOUT_OTHER] = "other",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(layout, Layout);
+
+/* State of one kernel-install invocation. All pointer members are owned by the context and are
+ * released, together with rfd, by context_done(). */
+typedef struct Context {
+        int rfd;
+        Action action;
+        sd_id128_t machine_id;
+        bool machine_id_is_random;
+        KernelImageType kernel_image_type;
+        Layout layout;
+        char *layout_other;
+        char *conf_root;
+        char *boot_root;
+        BootEntryTokenType entry_token_type;
+        char *entry_token;
+        char *entry_dir;
+        char *version;
+        char *kernel;
+        char **initrds;
+        char *initrd_generator;
+        char *uki_generator;
+        char *staging_area;
+        char **plugins;
+        char **argv;
+        char **envp;
+} Context;
+
+/* An empty context: no file descriptor (rfd is -EBADF), every other member zero/NULL. */
+#define CONTEXT_NULL (Context) { .rfd = -EBADF }
+
+/* Releases everything owned by the context: all strings and string vectors, and the rfd file
+ * descriptor. The Context object itself is not freed. */
+static void context_done(Context *c) {
+        assert(c);
+
+        free(c->layout_other);
+        free(c->conf_root);
+        free(c->boot_root);
+        free(c->entry_token);
+        free(c->entry_dir);
+        free(c->version);
+        free(c->kernel);
+        strv_free(c->initrds);
+        free(c->initrd_generator);
+        free(c->uki_generator);
+        /* For ACTION_INSPECT only the path string is released; for every other action the path is
+         * handed to rm_rf_physical_and_free() (which, going by its name, also removes the staging
+         * directory from disk). */
+        if (c->action == ACTION_INSPECT)
+                free(c->staging_area);
+        else
+                rm_rf_physical_and_free(c->staging_area);
+        strv_free(c->plugins);
+        strv_free(c->argv);
+        strv_free(c->envp);
+
+        safe_close(c->rfd);
+}
+
+/* Makes a deep copy of 'source' in '*ret': scalar members are copied by value, strings and string
+ * vectors are duplicated, and a valid rfd is reopened so that the copy has its own descriptor.
+ *
+ * Returns 0 on success. On failure the negative error code of the failing helper is returned, the
+ * partially built copy is released by the _cleanup_ handler and '*ret' is left untouched. */
+static int context_copy(const Context *source, Context *ret) {
+        int r;
+
+        assert(source);
+        assert(ret);
+        assert(source->rfd >= 0 || source->rfd == AT_FDCWD);
+
+        _cleanup_(context_done) Context copy = (Context) {
+                .rfd = AT_FDCWD,
+                .action = source->action,
+                .machine_id = source->machine_id,
+                .machine_id_is_random = source->machine_id_is_random,
+                .kernel_image_type = source->kernel_image_type,
+                .layout = source->layout,
+                .entry_token_type = source->entry_token_type,
+        };
+
+        /* AT_FDCWD is not a real descriptor and is kept as is; anything else gets reopened. */
+        if (source->rfd >= 0) {
+                copy.rfd = fd_reopen(source->rfd, O_CLOEXEC|O_DIRECTORY|O_PATH);
+                if (copy.rfd < 0)
+                        return copy.rfd;
+        }
+
+        r = strdup_or_null(source->layout_other, &copy.layout_other);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->conf_root, &copy.conf_root);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->boot_root, &copy.boot_root);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->entry_token, &copy.entry_token);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->entry_dir, &copy.entry_dir);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->version, &copy.version);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->kernel, &copy.kernel);
+        if (r < 0)
+                return r;
+        r = strv_copy_unless_empty(source->initrds, &copy.initrds);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->initrd_generator, &copy.initrd_generator);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->uki_generator, &copy.uki_generator);
+        if (r < 0)
+                return r;
+        r = strdup_or_null(source->staging_area, &copy.staging_area);
+        if (r < 0)
+                return r;
+        r = strv_copy_unless_empty(source->plugins, &copy.plugins);
+        if (r < 0)
+                return r;
+        r = strv_copy_unless_empty(source->argv, &copy.argv);
+        if (r < 0)
+                return r;
+        r = strv_copy_unless_empty(source->envp, &copy.envp);
+        if (r < 0)
+                return r;
+
+        /* Hand ownership over to the caller and reset the local, so that the _cleanup_ handler does
+         * not release what '*ret' now owns. */
+        *ret = copy;
+        copy = CONTEXT_NULL;
+
+        return 0;
+}
+
+static int context_open_root(Context *c) { + int r; + + assert(c); + assert(c->rfd < 0); + + if (isempty(arg_root)) + return 0; + + r = path_is_root(arg_root); + if (r < 0) + return log_error_errno(r, "Failed to determine if '%s' is the root directory: %m", arg_root); + if (r > 0) + return 0; + + c->rfd = open(empty_to_root(arg_root), O_CLOEXEC | O_DIRECTORY | O_PATH); + if (c->rfd < 0) + return log_error_errno(errno, "Failed to open root directory '%s': %m", empty_to_root(arg_root)); + + return 0; +} + +static const char* context_get_layout(const Context *c) { + assert(c); + assert(c->layout >= 0); + + return c->layout_other ?: layout_to_string(c->layout); +} + +static int context_set_layout(Context *c, const char *s, const char *source) { + Layout t; + + assert(c); + assert(source); + + if (c->layout >= 0 || !s) + return 0; + + assert(!c->layout_other); + + t = layout_from_string(s); + if (t >= 0) + c->layout = t; + else if (isempty(s)) + c->layout = LAYOUT_AUTO; + else { + c->layout_other = strdup(s); + if (!c->layout_other) + return log_oom(); + + c->layout = LAYOUT_OTHER; + } + + log_debug("layout=%s set via %s", context_get_layout(c), source); + return 1; +} + +static int context_set_machine_id(Context *c, const char *s, const char *source) { + int r; + + assert(c); + assert(source); + + if (!sd_id128_is_null(c->machine_id) || !s) + return 0; + + r = sd_id128_from_string(s, &c->machine_id); + if (r < 0) + return log_warning_errno(r, "Failed to parse machine ID specified via %s, ignoring.", source); + + if (sd_id128_is_null(c->machine_id)) + return 0; + + log_debug("MACHINE_ID=%s set via %s.", SD_ID128_TO_STRING(c->machine_id), source); + return 1; +} + +static int context_set_string(const char *s, const char *source, const char *name, char **dest) { + char *p; + + assert(source); + assert(name); + assert(dest); + + if (*dest || !s) + return 0; + + p = strdup(s); + if (!p) + return log_oom(); + + log_debug("%s (%s) set via %s.", name, p, source); + + *dest = 
p; + return 1; +} + +static int context_set_initrd_generator(Context *c, const char *s, const char *source) { + assert(c); + return context_set_string(s, source, "INITRD_GENERATOR", &c->initrd_generator); +} + +static int context_set_uki_generator(Context *c, const char *s, const char *source) { + assert(c); + return context_set_string(s, source, "UKI_GENERATOR", &c->uki_generator); +} + +static int context_set_version(Context *c, const char *s) { + assert(c); + + if (s && !filename_is_valid(s)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid version specified: %s", s); + + return context_set_string(s, "command line", "kernel version", &c->version); +} + +static int context_set_path(Context *c, const char *s, const char *source, const char *name, char **dest) { + char *p; + int r; + + assert(c); + assert(source); + assert(name); + assert(dest); + + if (*dest || !s) + return 0; + + if (c->rfd >= 0) { + r = chaseat(c->rfd, s, CHASE_AT_RESOLVE_IN_ROOT, &p, /* ret_fd = */ NULL); + if (r < 0) + return log_warning_errno(r, "Failed to chase path %s for %s specified via %s, ignoring: %m", + s, name, source); + } else { + r = path_make_absolute_cwd(s, &p); + if (r < 0) + return log_warning_errno(r, "Failed to make path '%s' for %s specified via %s absolute, ignoring: %m", + s, name, source); + } + + log_debug("%s (%s) set via %s.", name, p, source); + + *dest = p; + return 1; +} + +static int context_set_boot_root(Context *c, const char *s, const char *source) { + assert(c); + return context_set_path(c, s, source, "BOOT_ROOT", &c->boot_root); +} + +static int context_set_conf_root(Context *c, const char *s, const char *source) { + assert(c); + return context_set_path(c, s, source, "CONF_ROOT", &c->conf_root); +} + +static int context_set_kernel(Context *c, const char *s) { + assert(c); + return context_set_path(c, s, "command line", "kernel image file", &c->kernel); +} + +static int context_set_path_strv(Context *c, char* const* strv, const char *source, const 
char *name, char ***dest) { + _cleanup_strv_free_ char **w = NULL; + int r; + + assert(c); + assert(source); + assert(name); + assert(dest); + + if (*dest) + return 0; + + STRV_FOREACH(s, strv) { + char *p; + + if (c->rfd >= 0) { + r = chaseat(c->rfd, *s, CHASE_AT_RESOLVE_IN_ROOT, &p, /* ret_fd = */ NULL); + if (r < 0) + return log_warning_errno(r, "Failed to chase path %s for %s specified via %s: %m", + *s, name, source); + } else { + r = path_make_absolute_cwd(*s, &p); + if (r < 0) + return log_warning_errno(r, "Failed to make path '%s' for %s specified via %s absolute, ignoring: %m", + *s, name, source); + } + r = strv_consume(&w, p); + if (r < 0) + return log_oom(); + } + + if (strv_isempty(w)) + return 0; + + log_debug("%s set via %s", name, source); + + *dest = TAKE_PTR(w); + return 1; +} + +static int context_set_plugins(Context *c, const char *s, const char *source) { + _cleanup_strv_free_ char **v = NULL; + + assert(c); + + if (c->plugins || !s) + return 0; + + v = strv_split(s, NULL); + if (!v) + return log_oom(); + + return context_set_path_strv(c, v, source, "plugins", &c->plugins); +} + +static int context_set_initrds(Context *c, char* const* strv) { + assert(c); + return context_set_path_strv(c, strv, "command line", "initrds", &c->initrds); +} + +static int context_load_environment(Context *c) { + assert(c); + + (void) context_set_machine_id(c, getenv("MACHINE_ID"), "environment"); + (void) context_set_boot_root(c, getenv("BOOT_ROOT"), "environment"); + (void) context_set_conf_root(c, getenv("KERNEL_INSTALL_CONF_ROOT"), "environment"); + (void) context_set_plugins(c, getenv("KERNEL_INSTALL_PLUGINS"), "environment"); + return 0; +} + +static int context_ensure_conf_root(Context *c) { + int r; + + assert(c); + + if (c->conf_root) + return 0; + + r = chaseat(c->rfd, "/etc/kernel", CHASE_AT_RESOLVE_IN_ROOT, &c->conf_root, /* ret_fd = */ NULL); + if (r < 0) + log_debug_errno(r, "Failed to chase /etc/kernel, ignoring: %m"); + + return 0; +} + +static int 
context_load_install_conf_one(Context *c, const char *path) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char + *conf = NULL, *machine_id = NULL, *boot_root = NULL, *layout = NULL, + *initrd_generator = NULL, *uki_generator = NULL; + int r; + + assert(c); + assert(path); + + conf = path_join(path, "install.conf"); + if (!conf) + return log_oom(); + + r = chase_and_fopenat_unlocked(c->rfd, conf, CHASE_AT_RESOLVE_IN_ROOT, "re", NULL, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to chase %s: %m", conf); + + log_debug("Loading %s…", conf); + + r = parse_env_file(f, conf, + "MACHINE_ID", &machine_id, + "BOOT_ROOT", &boot_root, + "layout", &layout, + "initrd_generator", &initrd_generator, + "uki_generator", &uki_generator); + if (r < 0) + return log_error_errno(r, "Failed to parse '%s': %m", conf); + + (void) context_set_machine_id(c, machine_id, conf); + (void) context_set_boot_root(c, boot_root, conf); + (void) context_set_layout(c, layout, conf); + (void) context_set_initrd_generator(c, initrd_generator, conf); + (void) context_set_uki_generator(c, uki_generator, conf); + + log_debug("Loaded %s.", conf); + return 1; +} + +static int context_load_install_conf(Context *c) { + int r; + + assert(c); + + if (c->conf_root) { + r = context_load_install_conf_one(c, c->conf_root); + if (r != 0) + return r; + } + + STRV_FOREACH(p, STRV_MAKE("/etc/kernel", "/usr/lib/kernel")) { + r = context_load_install_conf_one(c, *p); + if (r != 0) + return r; + } + + return 0; +} + +static int context_load_machine_info(Context *c) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *machine_id = NULL, *layout = NULL; + static const char *path = "/etc/machine-info"; + int r; + + assert(c); + + /* If the user configured an explicit machine ID in /etc/machine-info to use for our purpose, we'll + * use that instead (for compatibility). */ + + if (!sd_id128_is_null(c->machine_id) && c->layout >= 0) + return 0; + + /* For testing. 
To make not read host's /etc/machine-info. */ + r = getenv_bool("KERNEL_INSTALL_READ_MACHINE_INFO"); + if (r < 0 && r != -ENXIO) + log_warning_errno(r, "Failed to read $KERNEL_INSTALL_READ_MACHINE_INFO, assuming yes: %m"); + if (r == 0) { + log_debug("Skipping to read /etc/machine-info."); + return 0; + } + + r = chase_and_fopenat_unlocked(c->rfd, path, CHASE_AT_RESOLVE_IN_ROOT, "re", NULL, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to chase %s: %m", path); + + log_debug("Loading %s…", path); + + r = parse_env_file(f, path, + "KERNEL_INSTALL_MACHINE_ID", &machine_id, + "KERNEL_INSTALL_LAYOUT", &layout); + if (r < 0) + return log_error_errno(r, "Failed to parse '%s': %m", path); + + (void) context_set_machine_id(c, machine_id, path); + (void) context_set_layout(c, layout, path); + return 0; +} + +static int context_load_machine_id(Context *c) { + int r; + + assert(c); + + r = id128_get_machine_at(c->rfd, &c->machine_id); + if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to load machine ID from /etc/machine-id: %m"); + + log_debug("MACHINE_ID=%s set via /etc/machine-id.", SD_ID128_TO_STRING(c->machine_id)); + return 1; /* loaded */ +} + +static int context_ensure_machine_id(Context *c) { + int r; + + assert(c); + + if (!sd_id128_is_null(c->machine_id)) + return 0; + + /* If /etc/machine-id is initialized we'll use it. */ + r = context_load_machine_id(c); + if (r != 0) + return r; + + /* Otherwise we'll use a freshly generated one. 
*/ + r = sd_id128_randomize(&c->machine_id); + if (r < 0) + return log_error_errno(r, "Failed to generate random ID: %m"); + + c->machine_id_is_random = true; + log_debug("New machine ID '%s' generated.", SD_ID128_TO_STRING(c->machine_id)); + return 0; +} + +static int context_acquire_xbootldr(Context *c) { + int r; + + assert(c); + assert(!c->boot_root); + + r = find_xbootldr_and_warn_at( + /* rfd = */ c->rfd, + /* path = */ arg_xbootldr_path, + /* unprivileged_mode= */ -1, + /* ret_path = */ &c->boot_root, + /* ret_uuid = */ NULL, + /* ret_devid = */ NULL); + if (r == -ENOKEY) { + log_debug_errno(r, "Couldn't find an XBOOTLDR partition."); + return 0; + } + if (r == -EACCES && geteuid() != 0) + return log_error_errno(r, "Failed to determine XBOOTLDR partition: %m"); + if (r < 0) + return r; + + log_debug("Using XBOOTLDR partition at %s as $BOOT_ROOT.", c->boot_root); + return 1; /* found */ +} + +static int context_acquire_esp(Context *c) { + int r; + + assert(c); + assert(!c->boot_root); + + r = find_esp_and_warn_at( + /* rfd = */ c->rfd, + /* path = */ arg_esp_path, + /* unprivileged_mode= */ -1, + /* ret_path = */ &c->boot_root, + /* ret_part = */ NULL, + /* ret_pstart = */ NULL, + /* ret_psize = */ NULL, + /* ret_uuid = */ NULL, + /* ret_devid = */ NULL); + if (r == -ENOKEY) { + log_debug_errno(r, "Couldn't find EFI system partition, ignoring."); + return 0; + } + if (r == -EACCES && geteuid() != 0) + return log_error_errno(r, "Failed to determine EFI system partition: %m"); + if (r < 0) + return r; + + log_debug("Using EFI System Partition at %s as $BOOT_ROOT.", c->boot_root); + return 1; /* found */ +} + +static int context_ensure_boot_root(Context *c) { + int r; + + assert(c); + + /* If BOOT_ROOT is specified via environment or install.conf, then use it. */ + if (c->boot_root) + return 0; + + /* Otherwise, use XBOOTLDR partition, if mounted. 
*/ + r = context_acquire_xbootldr(c); + if (r != 0) + return r; + + /* Otherwise, use EFI system partition, if mounted. */ + r = context_acquire_esp(c); + if (r != 0) + return r; + + /* If all else fails, use /boot. */ + if (c->rfd >= 0) { + r = chaseat(c->rfd, "/boot", CHASE_AT_RESOLVE_IN_ROOT, &c->boot_root, /* ret_fd = */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to chase '/boot': %m"); + } else { + c->boot_root = strdup("/boot"); + if (!c->boot_root) + return log_oom(); + } + + log_debug("KERNEL_INSTALL_BOOT_ROOT autodetection yielded no candidates, using \"%s\".", c->boot_root); + return 0; +} + +static int context_ensure_entry_token(Context *c) { + int r; + + assert(c); + + /* Now that we determined the machine ID to use, let's determine the "token" for the boot loader + * entry to generate. We use that for naming the directory below $BOOT where we want to place the + * kernel/initrd and related resources, as well for naming the .conf boot loader spec entry. + * Typically this is just the machine ID, but it can be anything else, too, if we are told so. 
*/ + + r = boot_entry_token_ensure_at( + c->rfd, + c->conf_root, + c->machine_id, + c->machine_id_is_random, + &c->entry_token_type, + &c->entry_token); + if (r < 0) + return r; + + log_debug("Using entry token: %s", c->entry_token); + return 0; +} + +static int context_load_plugins(Context *c) { + int r; + + assert(c); + + if (c->plugins) + return 0; + + r = conf_files_list_strv_at( + &c->plugins, + ".install", + c->rfd, + CONF_FILES_EXECUTABLE | CONF_FILES_REGULAR | CONF_FILES_FILTER_MASKED, + STRV_MAKE_CONST("/etc/kernel/install.d", "/usr/lib/kernel/install.d")); + if (r < 0) + return log_error_errno(r, "Failed to find plugins: %m"); + + return 0; +} + +static int context_init(Context *c) { + int r; + + assert(c); + + r = context_open_root(c); + if (r < 0) + return r; + + r = context_load_environment(c); + if (r < 0) + return r; + + r = context_ensure_conf_root(c); + if (r < 0) + return r; + + r = context_load_install_conf(c); + if (r < 0) + return r; + + r = context_load_machine_info(c); + if (r < 0) + return r; + + r = context_ensure_machine_id(c); + if (r < 0) + return r; + + r = context_ensure_boot_root(c); + if (r < 0) + return r; + + r = context_ensure_entry_token(c); + if (r < 0) + return r; + + r = context_load_plugins(c); + if (r < 0) + return r; + + return 0; +} + +static int context_inspect_kernel(Context *c) { + assert(c); + + if (!c->kernel) + return 0; + + return inspect_kernel(c->rfd, c->kernel, &c->kernel_image_type, NULL, NULL, NULL); +} + +static int context_ensure_layout(Context *c) { + int r; + + assert(c); + assert(c->boot_root); + assert(c->entry_token); + + if (c->layout >= 0 && c->layout != LAYOUT_AUTO) + return 0; + + /* No layout configured by the administrator. Let's try to figure it out automatically from metadata + * already contained in $BOOT_ROOT. 
*/ + + if (c->kernel_image_type == KERNEL_IMAGE_TYPE_UKI) { + c->layout = LAYOUT_UKI; + log_debug("Kernel image type is %s, using layout=%s.", + kernel_image_type_to_string(c->kernel_image_type), layout_to_string(c->layout)); + return 0; + } + + _cleanup_free_ char *srel_path = path_join(c->boot_root, "loader/entries.srel"); + if (!srel_path) + return log_oom(); + + _cleanup_free_ char *srel = NULL; + r = read_one_line_file_at(c->rfd, srel_path, &srel); + if (r >= 0) { + if (streq(srel, "type1")) + /* The loader/entries.srel file clearly indicates that the installed boot loader + * implements the proper standard upstream boot loader spec for Type #1 entries. + * Let's default to that, then. */ + c->layout = LAYOUT_BLS; + else + /* The loader/entries.srel file indicates some other spec is implemented and owns the + * /loader/entries/ directory. Since we have no idea what that means, let's stay away + * from it by default. */ + c->layout = LAYOUT_OTHER; + + log_debug("%s with '%s' found, using layout=%s.", srel_path, srel, layout_to_string(c->layout)); + return 0; + } else if (r != -ENOENT) + return log_error_errno(r, "Failed to read %s: %m", srel_path); + + _cleanup_free_ char *entry_token_path = path_join(c->boot_root, c->entry_token); + if (!entry_token_path) + return log_oom(); + + r = is_dir_full(c->rfd, entry_token_path, /* follow = */ false); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to check if '%s' is a directory: %m", entry_token_path); + if (r > 0) { + /* If the metadata in $BOOT_ROOT doesn't tell us anything, then check if the entry token + * directory already exists. If so, let's assume it's the standard boot loader spec, too. */ + c->layout = LAYOUT_BLS; + log_debug("%s exists, using layout=%s.", entry_token_path, layout_to_string(c->layout)); + return 0; + } + + /* There's no metadata in $BOOT_ROOT, and apparently no entry token directory installed? Then we + * really don't know anything. 
*/ + c->layout = LAYOUT_OTHER; + log_debug("Entry-token directory not found, using layout=%s.", layout_to_string(c->layout)); + return 0; +} + +static int context_set_up_staging_area(Context *c) { + static const char *template = "/tmp/kernel-install.staging.XXXXXX"; + int r; + + assert(c); + + if (c->staging_area) + return 0; + + if (c->action == ACTION_INSPECT) { + /* This is only used for display. The directory will not be created. */ + c->staging_area = strdup(template); + if (!c->staging_area) + return log_oom(); + } else { + r = mkdtemp_malloc(template, &c->staging_area); + if (r < 0) + return log_error_errno(r, "Failed to create staging area: %m"); + } + + return 0; +} + +static int context_build_entry_dir(Context *c) { + assert(c); + assert(c->boot_root); + assert(c->entry_token); + assert(c->version || c->action == ACTION_INSPECT); + + if (c->entry_dir) + return 0; + + c->entry_dir = path_join(c->boot_root, c->entry_token, c->version ?: "KERNEL_VERSION"); + if (!c->entry_dir) + return log_oom(); + + log_debug("Using ENTRY_DIR=%s", c->entry_dir); + return 0; +} + +static bool context_should_make_entry_dir(Context *c) { + assert(c); + + /* Compatibility with earlier versions that used the presence of $BOOT_ROOT/$ENTRY_TOKEN to signal to + * 00-entry-directory to create $ENTRY_DIR to serve as the indication to use or to not use the BLS */ + + if (arg_make_entry_directory < 0) + return c->layout == LAYOUT_BLS; + + return arg_make_entry_directory; +} + +static int context_make_entry_dir(Context *c) { + _cleanup_close_ int fd = -EBADF; + + assert(c); + assert(c->entry_dir); + + if (c->action != ACTION_ADD) + return 0; + + if (!context_should_make_entry_dir(c)) + return 0; + + log_debug("mkdir -p %s", c->entry_dir); + fd = chase_and_openat(c->rfd, c->entry_dir, CHASE_AT_RESOLVE_IN_ROOT | CHASE_MKDIR_0755, + O_CLOEXEC | O_CREAT | O_DIRECTORY | O_PATH, NULL); + if (fd < 0) + return log_error_errno(fd, "Failed to make directory '%s': %m", c->entry_dir); + + return 0; 
+} + +static int context_remove_entry_dir(Context *c) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + assert(c); + assert(c->entry_dir); + + if (c->action != ACTION_REMOVE) + return 0; + + if (!context_should_make_entry_dir(c)) + return 0; + + log_debug("rm -rf %s", c->entry_dir); + fd = chase_and_openat(c->rfd, c->entry_dir, CHASE_AT_RESOLVE_IN_ROOT, O_CLOEXEC | O_DIRECTORY, &p); + if (fd < 0) { + if (IN_SET(fd, -ENOTDIR, -ENOENT)) + return 0; + return log_debug_errno(fd, "Failed to chase and open %s, ignoring: %m", c->entry_dir); + } + + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "Failed to stat %s: %m", p); + + r = rm_rf_children(TAKE_FD(fd), REMOVE_PHYSICAL|REMOVE_MISSING_OK|REMOVE_CHMOD, &st); + if (r < 0) + log_debug_errno(r, "Failed to remove children of %s, ignoring: %m", p); + + if (unlinkat(c->rfd, p, AT_REMOVEDIR) < 0) + log_debug_errno(errno, "Failed to remove %s, ignoring: %m", p); + + return 0; +} + +static int context_build_arguments(Context *c) { + _cleanup_strv_free_ char **a = NULL; + const char *verb; + int r; + + assert(c); + assert(c->entry_dir); + + if (c->argv) + return 0; + + switch (c->action) { + case ACTION_ADD: + assert(c->version); + assert(c->kernel); + verb = "add"; + break; + + case ACTION_REMOVE: + assert(c->version); + assert(!c->kernel); + assert(!c->initrds); + verb = "remove"; + break; + + case ACTION_INSPECT: + verb = "add|remove"; + break; + + default: + assert_not_reached(); + } + + a = strv_new("dummy-arg", /* to make strv_free() works for this variable. 
*/ + verb, + c->version ?: "KERNEL_VERSION", + c->entry_dir); + if (!a) + return log_oom(); + + if (c->action == ACTION_ADD) { + r = strv_extend(&a, c->kernel); + if (r < 0) + return log_oom(); + + r = strv_extend_strv(&a, c->initrds, /* filter_duplicates = */ false); + if (r < 0) + return log_oom(); + + } else if (c->action == ACTION_INSPECT) { + r = strv_extend(&a, c->kernel ?: "[KERNEL_IMAGE]"); + if (r < 0) + return log_oom(); + + r = strv_extend(&a, "[INITRD...]"); + if (r < 0) + return log_oom(); + } + + c->argv = TAKE_PTR(a); + return 0; +} + +static int context_build_environment(Context *c) { + _cleanup_strv_free_ char **e = NULL; + int r; + + assert(c); + + if (c->envp) + return 0; + + r = strv_env_assign_many(&e, + "LC_COLLATE", SYSTEMD_DEFAULT_LOCALE, + "KERNEL_INSTALL_VERBOSE", one_zero(arg_verbose), + "KERNEL_INSTALL_IMAGE_TYPE", kernel_image_type_to_string(c->kernel_image_type), + "KERNEL_INSTALL_MACHINE_ID", SD_ID128_TO_STRING(c->machine_id), + "KERNEL_INSTALL_ENTRY_TOKEN", c->entry_token, + "KERNEL_INSTALL_BOOT_ROOT", c->boot_root, + "KERNEL_INSTALL_LAYOUT", context_get_layout(c), + "KERNEL_INSTALL_INITRD_GENERATOR", strempty(c->initrd_generator), + "KERNEL_INSTALL_UKI_GENERATOR", strempty(c->uki_generator), + "KERNEL_INSTALL_STAGING_AREA", c->staging_area); + if (r < 0) + return log_error_errno(r, "Failed to build environment variables for plugins: %m"); + + c->envp = TAKE_PTR(e); + return 0; +} + +static int context_prepare_execution(Context *c) { + int r; + + assert(c); + + r = context_inspect_kernel(c); + if (r < 0) + return r; + + r = context_ensure_layout(c); + if (r < 0) + return r; + + r = context_set_up_staging_area(c); + if (r < 0) + return r; + + r = context_build_entry_dir(c); + if (r < 0) + return r; + + r = context_build_arguments(c); + if (r < 0) + return r; + + r = context_build_environment(c); + if (r < 0) + return r; + + return 0; +} + +static int context_execute(Context *c) { + int r, ret; + + assert(c); + + r = 
context_make_entry_dir(c); + if (r < 0) + return r; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *x = strv_join_full(c->plugins, "", "\n ", /* escape_separator = */ false); + log_debug("Using plugins: %s", strna(x)); + + _cleanup_free_ char *y = strv_join_full(c->envp, "", "\n ", /* escape_separator = */ false); + log_debug("Plugin environment: %s", strna(y)); + + _cleanup_free_ char *z = strv_join(strv_skip(c->argv, 1), " "); + log_debug("Plugin arguments: %s", strna(z)); + } + + ret = execute_strv( + /* name = */ NULL, + c->plugins, + /* root = */ NULL, + USEC_INFINITY, + /* callbacks = */ NULL, + /* callback_args = */ NULL, + c->argv, + c->envp, + EXEC_DIR_SKIP_REMAINING); + + r = context_remove_entry_dir(c); + if (r < 0) + return r; + + /* This returns 0 on success, positive exit code on plugin failure, negative errno on other failures. */ + return ret; +} + +static bool bypass(void) { + int r; + + r = getenv_bool("KERNEL_INSTALL_BYPASS"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $KERNEL_INSTALL_BYPASS, assuming no."); + if (r <= 0) + return false; + + log_debug("$KERNEL_INSTALL_BYPASS is enabled, skipping execution."); + return true; +} + +static int do_add( + Context *c, + const char *version, + const char *kernel, + char **initrds) { + + int r; + + assert(c); + assert(version); + assert(kernel); + + r = context_set_version(c, version); + if (r < 0) + return r; + + r = context_set_kernel(c, kernel); + if (r < 0) + return r; + + r = context_set_initrds(c, initrds); + if (r < 0) + return r; + + r = context_prepare_execution(c); + if (r < 0) + return r; + + return context_execute(c); +} + +static int kernel_from_version(const char *version, char **ret_kernel) { + _cleanup_free_ char *vmlinuz = NULL; + int r; + + assert(version); + + vmlinuz = path_join("/usr/lib/modules/", version, "/vmlinuz"); + if (!vmlinuz) + return log_oom(); + + r = laccess(vmlinuz, F_OK); + if (r < 0) { + if (r == -ENOENT) + return log_error_errno(r, "Kernel 
image not installed to '%s', requiring manual kernel image path specification.", vmlinuz); + + return log_error_errno(r, "Failed to determine if kernel image is installed to '%s': %m", vmlinuz); + } + + *ret_kernel = TAKE_PTR(vmlinuz); + return 0; +} + +static int verb_add(int argc, char *argv[], void *userdata) { + Context *c = ASSERT_PTR(userdata); + _cleanup_free_ char *vmlinuz = NULL; + const char *version, *kernel; + char **initrds; + struct utsname un; + int r; + + assert(argv); + + if (arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "'add' does not support --root=."); + + if (bypass()) + return 0; + + c->action = ACTION_ADD; + + /* We use the same order of arguments that "inspect" introduced, i.e. if only on argument is + * specified we take it as the kernel path, not the version, i.e. it's the first argument that is + * optional, not the 2nd. */ + version = argc > 2 ? empty_or_dash_to_null(argv[1]) : NULL; + kernel = argc > 2 ? empty_or_dash_to_null(argv[2]) : + (argc > 1 ? 
empty_or_dash_to_null(argv[1]) : NULL); + initrds = strv_skip(argv, 3); + + if (!version) { + assert_se(uname(&un) >= 0); + version = un.release; + } + + if (!kernel) { + r = kernel_from_version(version, &vmlinuz); + if (r < 0) + return r; + + kernel = vmlinuz; + } + + return do_add(c, version, kernel, initrds); +} + +static int verb_add_all(int argc, char *argv[], void *userdata) { + Context *c = ASSERT_PTR(userdata); + _cleanup_close_ int fd = -EBADF; + size_t n = 0; + int ret = 0, r; + + assert(argv); + + if (bypass()) + return 0; + + c->action = ACTION_ADD; + + fd = open("/usr/lib/modules", O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(fd, "Failed to open /usr/lib/modules/: %m"); + + _cleanup_free_ DirectoryEntries *de = NULL; + r = readdir_all(fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + return log_error_errno(r, "Failed to numerate /usr/lib/modules/ contents: %m"); + + FOREACH_ARRAY(d, de->entries, de->n_entries) { + + _cleanup_free_ char *j = path_join("/usr/lib/modules/", (*d)->d_name); + if (!j) + return log_oom(); + + r = dirent_ensure_type(fd, *d); + if (r < 0) { + if (r != -ENOENT) /* don't log if just gone by now */ + log_debug_errno(r, "Failed to check if '%s' is a directory, ignoring: %m", j); + continue; + } + + if ((*d)->d_type != DT_DIR) + continue; + + _cleanup_free_ char *fn = path_join((*d)->d_name, "vmlinuz"); + if (!fn) + return log_oom(); + + if (faccessat(fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if '/usr/lib/modules/%s/vmlinuz' exists, ignoring: %m", (*d)->d_name); + + log_notice("Not adding version '%s', because kernel image not found.", (*d)->d_name); + continue; + } + + _cleanup_(context_done) Context copy = CONTEXT_NULL; + + r = context_copy(c, ©); + if (r < 0) + return log_error_errno(r, "Failed to copy execution context: %m"); + + _cleanup_free_ char *full = path_join("/usr/lib/modules/", fn); + if (!full) + return 
log_oom(); + + r = do_add(©, + /* version= */ (*d)->d_name, + /* kernel= */ full, + /* initrds= */ NULL); + if (r == 0) + n++; + else if (ret == 0) + ret = r; + } + + if (n > 0) + log_debug("Installed %zu kernel(s).", n); + else if (ret == 0) + ret = log_error_errno(SYNTHETIC_ERRNO(ENOENT), "No kernels to install found."); + + return ret; +} + +static int run_as_installkernel(int argc, char *argv[], Context *c) { + /* kernel's install.sh invokes us as + * /sbin/installkernel + * We ignore the last two arguments. */ + if (optind + 2 > argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "'installkernel' command requires at least two arguments."); + + return verb_add(3, STRV_MAKE("add", argv[optind], argv[optind+1]), c); +} + +static int verb_remove(int argc, char *argv[], void *userdata) { + Context *c = ASSERT_PTR(userdata); + int r; + + assert(argc >= 2); + assert(argv); + + if (arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "'remove' does not support --root=."); + + if (argc > 2) + log_debug("Too many arguments specified. 'kernel-install remove' takes only kernel version. 
" + "Ignoring residual arguments."); + + if (bypass()) + return 0; + + c->action = ACTION_REMOVE; + + /* Note, we do not automatically derive the kernel version to remove from uname() here (unlike we do + * it for the "add" verb), since we don't want to make it too easy to uninstall your running + * kernel, as a safety precaution */ + + r = context_set_version(c, argv[1]); + if (r < 0) + return r; + + r = context_prepare_execution(c); + if (r < 0) + return r; + + return context_execute(c); +} + +static int verb_inspect(int argc, char *argv[], void *userdata) { + Context *c = ASSERT_PTR(userdata); + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_free_ char *vmlinuz = NULL; + const char *version, *kernel; + char **initrds; + struct utsname un; + int r; + + c->action = ACTION_INSPECT; + + /* When only a single parameter is specified 'inspect' it's the kernel image path, and not the kernel + * version. i.e. it's the first argument that is optional, not the 2nd. That's a bit unfortunate, but + * we keep the behaviour for compatibility. If users want to specify only the version (and have the + * kernel image path derived automatically), then they may specify an empty string or "dash" as + * kernel image path. */ + version = argc > 2 ? empty_or_dash_to_null(argv[1]) : NULL; + kernel = argc > 2 ? empty_or_dash_to_null(argv[2]) : + (argc > 1 ? 
empty_or_dash_to_null(argv[1]) : NULL); + initrds = strv_skip(argv, 3); + + if (!version && !arg_root) { + assert_se(uname(&un) >= 0); + version = un.release; + } + + if (!kernel && version) { + r = kernel_from_version(version, &vmlinuz); + if (r < 0) + return r; + + kernel = vmlinuz; + } + + r = context_set_version(c, version); + if (r < 0) + return r; + + r = context_set_kernel(c, kernel); + if (r < 0) + return r; + + r = context_set_initrds(c, initrds); + if (r < 0) + return r; + + r = context_prepare_execution(c); + if (r < 0) + return r; + + t = table_new_vertical(); + if (!t) + return log_oom(); + + r = table_add_many(t, + TABLE_FIELD, "Machine ID", + TABLE_ID128, c->machine_id, + TABLE_FIELD, "Kernel Image Type", + TABLE_STRING, kernel_image_type_to_string(c->kernel_image_type), + TABLE_FIELD, "Layout", + TABLE_STRING, context_get_layout(c), + TABLE_FIELD, "Boot Root", + TABLE_STRING, c->boot_root, + TABLE_FIELD, "Entry Token Type", + TABLE_STRING, boot_entry_token_type_to_string(c->entry_token_type), + TABLE_FIELD, "Entry Token", + TABLE_STRING, c->entry_token, + TABLE_FIELD, "Entry Directory", + TABLE_STRING, c->entry_dir, + TABLE_FIELD, "Kernel Version", + TABLE_STRING, c->version, + TABLE_FIELD, "Kernel", + TABLE_STRING, c->kernel, + TABLE_FIELD, "Initrds", + TABLE_STRV, c->initrds, + TABLE_FIELD, "Initrd Generator", + TABLE_STRING, c->initrd_generator, + TABLE_FIELD, "UKI Generator", + TABLE_STRING, c->uki_generator, + TABLE_FIELD, "Plugins", + TABLE_STRV, c->plugins, + TABLE_FIELD, "Plugin Environment", + TABLE_STRV, c->envp); + if (r < 0) + return table_log_add_error(r); + + if (arg_json_format_flags & JSON_FORMAT_OFF) { + r = table_add_many(t, + TABLE_FIELD, "Plugin Arguments", + TABLE_STRV, strv_skip(c->argv, 1)); + if (r < 0) + return table_log_add_error(r); + } + + table_set_ersatz_string(t, TABLE_ERSATZ_UNSET); + + for (size_t row = 1; row < table_get_rows(t); row++) { + _cleanup_free_ char *name = NULL; + + name = strdup(table_get_at(t, row, 
0)); + if (!name) + return log_oom(); + + r = table_set_json_field_name(t, row - 1, delete_chars(name, " ")); + if (r < 0) + return log_error_errno(r, "Failed to set JSON field name: %m"); + } + + return table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, /* show_header= */ false); +} + +static int verb_list(int argc, char *argv[], void *userdata) { + _cleanup_close_ int fd = -EBADF; + int r; + + fd = open("/usr/lib/modules", O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(fd, "Failed to open /usr/lib/modules/: %m"); + + _cleanup_free_ DirectoryEntries *de = NULL; + r = readdir_all(fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + return log_error_errno(r, "Failed to numerate /usr/lib/modules/ contents: %m"); + + _cleanup_(table_unrefp) Table *table = NULL; + table = table_new("version", "has kernel", "path"); + if (!table) + return log_oom(); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + table_set_align_percent(table, table_get_cell(table, 0, 1), 100); + + FOREACH_ARRAY(d, de->entries, de->n_entries) { + + _cleanup_free_ char *j = path_join("/usr/lib/modules/", (*d)->d_name); + if (!j) + return log_oom(); + + r = dirent_ensure_type(fd, *d); + if (r < 0) { + if (r != -ENOENT) /* don't log if just gone by now */ + log_debug_errno(r, "Failed to check if '%s' is a directory, ignoring: %m", j); + continue; + } + + if ((*d)->d_type != DT_DIR) + continue; + + _cleanup_free_ char *fn = path_join((*d)->d_name, "vmlinuz"); + if (!fn) + return log_oom(); + + bool exists; + if (faccessat(fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if '/usr/lib/modules/%s/vmlinuz' exists, ignoring: %m", (*d)->d_name); + + exists = false; + } else + exists = true; + + r = table_add_many(table, + TABLE_STRING, (*d)->d_name, + TABLE_BOOLEAN_CHECKMARK, exists, + TABLE_SET_COLOR, ansi_highlight_green_red(exists), + TABLE_PATH, j); + if (r < 0) + return 
table_log_add_error(r); + } + + return table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("kernel-install", "8", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND ...\n\n" + "%5$sAdd and remove kernel and initrd images to and from /boot/%6$s\n" + "\n%3$sUsage:%4$s\n" + " kernel-install [OPTIONS...] add [[[KERNEL-VERSION] KERNEL-IMAGE] [INITRD ...]]\n" + " kernel-install [OPTIONS...] add-all\n" + " kernel-install [OPTIONS...] remove KERNEL-VERSION\n" + " kernel-install [OPTIONS...] inspect [[[KERNEL-VERSION] KERNEL-IMAGE]\n" + " [INITRD ...]]\n" + " kernel-install [OPTIONS...] list\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -v --verbose Increase verbosity\n" + " --esp-path=PATH Path to the EFI System Partition (ESP)\n" + " --boot-path=PATH Path to the $BOOT partition\n" + " --make-entry-directory=yes|no|auto\n" + " Create $BOOT/ENTRY-TOKEN/ directory\n" + " --entry-token=machine-id|os-id|os-image-id|auto|literal:…\n" + " Entry token to use for this installation\n" + " --no-pager Do not pipe inspect output into a pager\n" + " --json=pretty|short|off Generate JSON output\n" + " --no-legend Do not show the headers and footers\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + "\n" + "This program may also be invoked as 'installkernel':\n" + " installkernel [OPTIONS...] 
VERSION VMLINUZ [MAP] [INSTALLATION-DIR]\n" + "(The optional arguments are passed by kernel build system, but ignored.)\n" + "\n" + "See the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[], Context *c) { + enum { + ARG_VERSION = 0x100, + ARG_NO_LEGEND, + ARG_ESP_PATH, + ARG_BOOT_PATH, + ARG_MAKE_ENTRY_DIRECTORY, + ARG_ENTRY_TOKEN, + ARG_NO_PAGER, + ARG_JSON, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + }; + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "verbose", no_argument, NULL, 'v' }, + { "esp-path", required_argument, NULL, ARG_ESP_PATH }, + { "boot-path", required_argument, NULL, ARG_BOOT_PATH }, + { "make-entry-directory", required_argument, NULL, ARG_MAKE_ENTRY_DIRECTORY }, + { "entry-token", required_argument, NULL, ARG_ENTRY_TOKEN }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "json", required_argument, NULL, ARG_JSON }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + {} + }; + int t, r; + + assert(argc >= 0); + assert(argv); + assert(c); + + while ((t = getopt_long(argc, argv, "hv", options, NULL)) >= 0) + switch (t) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case 'v': + log_set_max_level(LOG_DEBUG); + arg_verbose = true; + break; + + case ARG_ESP_PATH: + r = parse_path_argument(optarg, /* suppress_root = */ false, &arg_esp_path); + if (r < 0) + return log_oom(); + break; + + case ARG_BOOT_PATH: + r = parse_path_argument(optarg, /* suppress_root = */ false, &arg_xbootldr_path); + if (r < 0) + return log_oom(); + break; + + case 
ARG_MAKE_ENTRY_DIRECTORY: + if (streq(optarg, "auto")) + arg_make_entry_directory = -1; + else { + r = parse_boolean_argument("--make-entry-directory=", optarg, NULL); + if (r < 0) + return r; + + arg_make_entry_directory = r; + } + break; + + case ARG_ENTRY_TOKEN: + r = parse_boot_entry_token_type(optarg, &c->entry_token_type, &c->entry_token); + if (r < 0) + return r; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r < 0) + return r; + break; + + case ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_image && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + + return 1; +} + +static int run(int argc, char* argv[]) { + static const Verb verbs[] = { + { "add", 1, VERB_ANY, 0, verb_add }, + { "add-all", 1, 1, 0, verb_add_all }, + { "remove", 2, VERB_ANY, 0, verb_remove }, + { "inspect", 1, VERB_ANY, VERB_DEFAULT, verb_inspect }, + { "list", 1, 1, 0, verb_list }, + {} + }; + _cleanup_(context_done) Context c = { + .rfd = AT_FDCWD, + .action = _ACTION_INVALID, + .kernel_image_type = KERNEL_IMAGE_TYPE_UNKNOWN, + .layout = _LAYOUT_INVALID, + .entry_token_type = BOOT_ENTRY_TOKEN_AUTO, + }; + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + int r; + + log_setup(); + + r = parse_argv(argc, argv, &c); + if (r <= 0) + return r; + + if (arg_image) { + assert(!arg_root); + + r = 
mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_VALIDATE_OS, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } + + r = context_init(&c); + if (r < 0) + return r; + + if (invoked_as(argv, "installkernel")) + return run_as_installkernel(argc, argv, &c); + + return dispatch_verb(argc, argv, verbs, &c); +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/kernel-install/meson.build b/src/kernel-install/meson.build new file mode 100644 index 0000000..7f61fcc --- /dev/null +++ b/src/kernel-install/meson.build @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +want_kernel_install = conf.get('ENABLE_KERNEL_INSTALL') == 1 + +executables += [ + executable_template + { + 'name' : 'kernel-install', + 'public' : true, + 'conditions' : ['ENABLE_KERNEL_INSTALL'], + 'sources' : files('kernel-install.c'), + }, +] + +ukify_install = custom_target( + '60-ukify.install', + input : '60-ukify.install.in', + output : '60-ukify.install', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : want_kernel_install and want_ukify, + install_mode : 'rwxr-xr-x', + install_dir : kernelinstalldir) + +loaderentry_install = custom_target( + '90-loaderentry.install', + input : '90-loaderentry.install.in', + output : '90-loaderentry.install', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : want_kernel_install, + install_mode : 'rwxr-xr-x', + install_dir : kernelinstalldir) + +uki_copy_install = files('90-uki-copy.install') + +kernel_install_files = uki_copy_install + files( + '50-depmod.install', +) + +if want_kernel_install + install_data(kernel_install_files, + install_mode : 'rwxr-xr-x', + install_dir : kernelinstalldir) + + install_data('install.conf', + install_dir : kerneldir) + + if 
install_sysconfdir + install_emptydir(sysconfdir / 'kernel/install.d') + endif +endif + +test_kernel_install_sh = find_program('test-kernel-install.sh') diff --git a/src/kernel-install/test-kernel-install.sh b/src/kernel-install/test-kernel-install.sh new file mode 100755 index 0000000..338d811 --- /dev/null +++ b/src/kernel-install/test-kernel-install.sh @@ -0,0 +1,333 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +# shellcheck disable=SC2235 +set -eux +set -o pipefail + +export SYSTEMD_LOG_LEVEL=debug + +kernel_install="${1:?}" +loaderentry_install="${2:?}" +uki_copy_install="${3:?}" +ukify="${4:-}" +ukify_install="${5:-}" +boot_stub="${6:-}" +if [[ -d "${PROJECT_BUILD_ROOT:-}" ]]; then + bootctl="${PROJECT_BUILD_ROOT}/bootctl" +else + bootctl= +fi + +D="$(mktemp --tmpdir --directory "test-kernel-install.XXXXXXXXXX")" + +# shellcheck disable=SC2064 +trap "rm -rf '$D'" EXIT INT QUIT PIPE +mkdir -p "$D/boot" +mkdir -p "$D/efi" +mkdir -p "$D/sources" + +echo 'buzy image' >"$D/sources/linux" +echo 'the initrd' >"$D/sources/initrd" +echo 'the-token' >"$D/sources/entry-token" +echo 'opt1 opt2' >"$D/sources/cmdline" + +cat >"$D/sources/install.conf" <"$D/sources/devicetree" +mkdir "$D/sources/subdir" +echo 'DTBDTBDTBDTB' >"$D/sources/subdir/whatever.dtb" + +export KERNEL_INSTALL_CONF_ROOT="$D/sources" +# We "install" multiple plugins, but control which ones will be active via install.conf. 
+export KERNEL_INSTALL_PLUGINS="${ukify_install} ${loaderentry_install} ${uki_copy_install}" +export BOOT_ROOT="$D/boot" +export BOOT_MNT="$D/boot" +export MACHINE_ID='3e0484f3634a418b8e6a39e8828b03e3' +export KERNEL_INSTALL_UKIFY="$ukify" +export KERNEL_INSTALL_BOOT_STUB="$boot_stub" +export KERNEL_INSTALL_READ_MACHINE_INFO="no" +export KERNEL_INSTALL_BYPASS="no" + +# Test type#1 installation +"$kernel_install" -v add 1.1.1 "$D/sources/linux" "$D/sources/initrd" + +entry="$BOOT_ROOT/loader/entries/the-token-1.1.1.conf" +test -f "$entry" +grep -qE '^title ' "$entry" +grep -qE '^version +1.1.1' "$entry" +grep -qE '^options +opt1 opt2' "$entry" +grep -qE '^linux .*/the-token/1.1.1/linux' "$entry" +grep -qE '^initrd .*/the-token/1.1.1/initrd' "$entry" +grep -qE '^devicetree .*/the-token/1.1.1/whatever.dtb' "$entry" + +grep -qE 'image' "$BOOT_ROOT/the-token/1.1.1/linux" +grep -qE 'initrd' "$BOOT_ROOT/the-token/1.1.1/initrd" +grep -qE 'DTBDTB' "$BOOT_ROOT/the-token/1.1.1/whatever.dtb" + +test -f /usr/lib/modules/"$(uname -r)"/vmlinuz && "$kernel_install" inspect +"$kernel_install" inspect "$D/sources/linux" + +"$kernel_install" -v remove 1.1.1 +test ! -e "$entry" +test ! -e "$BOOT_ROOT/the-token/1.1.1/linux" +test ! -e "$BOOT_ROOT/the-token/1.1.1/initrd" + +# Test again with too many arguments for 'remove' command. See #28448. +"$kernel_install" -v add 1.1.1 "$D/sources/linux" "$D/sources/initrd" + +test -f "$entry" +test -f "$BOOT_ROOT/the-token/1.1.1/linux" +test -f "$BOOT_ROOT/the-token/1.1.1/initrd" + +"$kernel_install" -v remove 1.1.1 hoge foo bar +test ! -e "$entry" +test ! -e "$BOOT_ROOT/the-token/1.1.1/linux" +test ! 
-e "$BOOT_ROOT/the-token/1.1.1/initrd" + +# Invoke kernel-install as installkernel +ln -s --relative -v "$kernel_install" "$D/sources/installkernel" +"$D/sources/installkernel" -v 1.1.2 "$D/sources/linux" System.map /somedirignored + +entry="$BOOT_ROOT/loader/entries/the-token-1.1.2.conf" +test -f "$entry" +grep -qE '^title ' "$entry" +grep -qE '^version +1.1.2' "$entry" +grep -qE '^options +opt1 opt2' "$entry" +grep -qE '^linux .*/the-token/1.1.2/linux' "$entry" +( ! grep -qE '^initrd' "$entry" ) +grep -qE '^devicetree .*/the-token/1.1.2/whatever.dtb' "$entry" + +grep -qE 'image' "$BOOT_ROOT/the-token/1.1.2/linux" +test ! -e "$BOOT_ROOT/the-token/1.1.2/initrd" +grep -qE 'DTBDTB' "$BOOT_ROOT/the-token/1.1.2/whatever.dtb" + +# Check installation with boot counting +echo '56' >"$D/sources/tries" + +"$kernel_install" -v add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +entry="$BOOT_ROOT/loader/entries/the-token-1.1.1+56.conf" +test -f "$entry" +grep -qE '^title ' "$entry" +grep -qE '^version +1.1.1' "$entry" +grep -qE '^options +opt1 opt2' "$entry" +grep -qE '^linux .*/the-token/1.1.1/linux' "$entry" +grep -qE '^initrd .*/the-token/1.1.1/initrd' "$entry" + +grep -qE 'image' "$BOOT_ROOT/the-token/1.1.1/linux" +grep -qE 'initrd' "$BOOT_ROOT/the-token/1.1.1/initrd" + +# Install UKI +if [ -f "$ukify" ]; then + cat >>"$D/sources/install.conf" < "$BOOT_ROOT/the-token/1.1.2/initrd" + "$bootctl" --root="$D" cleanup + test ! -e "$BOOT_ROOT/the-token/1.1.2/initrd" + test -e "$BOOT_ROOT/the-token/1.1.2/linux" + test -e "$BOOT_ROOT/the-token/1.1.1/linux" + test -e "$BOOT_ROOT/the-token/1.1.1/initrd" + + # now remove duplicated entry and make sure files are left over + "$bootctl" --root="$D" unlink "${e2##*/}" + test -e "$BOOT_ROOT/the-token/1.1.1/linux" + test -e "$BOOT_ROOT/the-token/1.1.1/initrd" + test -e "$entry" + test ! 
-e "$e2" + # remove last entry referencing those files + entry_id="${entry##*/}" + entry_id="${entry_id%+*}.conf" + "$bootctl" --root="$D" unlink "$entry_id" + test ! -e "$entry" + test ! -e "$BOOT_ROOT/the-token/1.1.1/linux" + test ! -e "$BOOT_ROOT/the-token/1.1.1/initrd" +fi + +########################################### +# tests for --make-entry-directory= +########################################### + +# disable all dropins +cat >"$D/00-skip.install" <"$D/sources/install.conf" < layout is other. +"$kernel_install" -v add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test ! -e "$BOOT_ROOT/the-token/1.1.1" +"$kernel_install" -v remove 1.1.1 +test ! -e "$BOOT_ROOT/the-token/1.1.1" + +# 1.2 token directory exists -> layout is BLS +mkdir -p "$BOOT_ROOT/the-token" +"$kernel_install" -v add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test -d "$BOOT_ROOT/the-token/1.1.1" +"$kernel_install" -v remove 1.1.1 +test ! -e "$BOOT_ROOT/the-token/1.1.1" +rmdir "$BOOT_ROOT/the-token" + +# 2. --make-entry-directory=yes +# 2.1 token directory does not exist -> layout is other. +"$kernel_install" -v --make-entry-directory=yes add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test -d "$BOOT_ROOT/the-token/1.1.1" +"$kernel_install" -v --make-entry-directory=yes remove 1.1.1 +test ! -e "$BOOT_ROOT/the-token/1.1.1" +test -d "$BOOT_ROOT/the-token" + +# 2.2 token directory exists -> layout is BLS +mkdir -p "$BOOT_ROOT/the-token" +"$kernel_install" -v --make-entry-directory=yes add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test -d "$BOOT_ROOT/the-token/1.1.1" +"$kernel_install" -v --make-entry-directory=yes remove 1.1.1 +test ! -e "$BOOT_ROOT/the-token/1.1.1" +test -d "$BOOT_ROOT/the-token" +rmdir "$BOOT_ROOT/the-token" + +# 3. --make-entry-directory=no +# 3.1 token directory does not exist -> layout is other. +"$kernel_install" -v --make-entry-directory=no add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test ! 
-e "$BOOT_ROOT/the-token/1.1.1" +"$kernel_install" -v --make-entry-directory=no remove 1.1.1 +test ! -e "$BOOT_ROOT/the-token/1.1.1" + +# 3.2 token directory exists -> layout is BLS +mkdir -p "$BOOT_ROOT/the-token" +"$kernel_install" -v --make-entry-directory=no add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test ! -e "$BOOT_ROOT/the-token/1.1.1" +"$kernel_install" -v --make-entry-directory=no remove 1.1.1 +test ! -e "$BOOT_ROOT/the-token/1.1.1" +test -d "$BOOT_ROOT/the-token" +rmdir "$BOOT_ROOT/the-token" + +########################################### +# tests for --entry-token= +########################################### +"$kernel_install" -v --make-entry-directory=yes --entry-token=machine-id add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test -d "$BOOT_ROOT/$MACHINE_ID/1.1.1" +"$kernel_install" -v --make-entry-directory=yes --entry-token=machine-id remove 1.1.1 +test ! -e "$BOOT_ROOT/$MACHINE_ID/1.1.1" +test -d "$BOOT_ROOT/$MACHINE_ID" +rmdir "$BOOT_ROOT/$MACHINE_ID" + +"$kernel_install" -v --make-entry-directory=yes --entry-token=literal:hoge add 1.1.1 "$D/sources/linux" "$D/sources/initrd" +test -d "$BOOT_ROOT/hoge/1.1.1" +"$kernel_install" -v --make-entry-directory=yes --entry-token=literal:hoge remove 1.1.1 +test ! 
-e "$BOOT_ROOT/hoge/1.1.1" +test -d "$BOOT_ROOT/hoge" +rmdir "$BOOT_ROOT/hoge" + +########################################### +# tests for --json= +########################################### +output="$("$kernel_install" -v --json=pretty inspect 1.1.1 "$D/sources/linux")" + +diff -u <(echo "$output") - <"$D/00-plugin-skip" <"$D/10-plugin-fail" < +#include +#include + +#include "arp-util.h" +#include "ether-addr-util.h" +#include "fd-util.h" +#include "in-addr-util.h" +#include "unaligned.h" + +int arp_update_filter(int fd, const struct in_addr *a, const struct ether_addr *mac) { + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0), /* A <- packet length */ + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, sizeof(struct ether_arp), 1, 0), /* packet >= arp packet ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hrd)), /* A <- header */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPHRD_ETHER, 1, 0), /* header == ethernet ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pro)), /* A <- protocol */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 1, 0), /* protocol == IP ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hln)), /* A <- hardware address length */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct ether_addr), 1, 0), /* length == sizeof(ether_addr)? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pln)), /* A <- protocol address length */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct in_addr), 1, 0), /* length == sizeof(in_addr) ? 
*/ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_op)), /* A <- operation */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REQUEST, 2, 0), /* protocol == request ? */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REPLY, 1, 0), /* protocol == reply ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + /* Sender Hardware Address must be different from our own */ + BPF_STMT(BPF_LDX + BPF_IMM, unaligned_read_be32(&mac->ether_addr_octet[0])), /* X <- 4 bytes of client's MAC */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_sha)), /* A <- 4 bytes of SHA */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_X, 0, 0, 4), /* A == X ? */ + BPF_STMT(BPF_LDX + BPF_IMM, unaligned_read_be16(&mac->ether_addr_octet[4])), /* X <- remainder of client's MAC */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, arp_sha) + 4), /* A <- remainder of SHA */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_X, 0, 0, 1), /* A == X ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + /* Sender Protocol Address or Target Protocol Address must be equal to the one we care about */ + BPF_STMT(BPF_LDX + BPF_IMM, htobe32(a->s_addr)), /* X <- clients IP */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_spa)), /* A <- SPA */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_X, 0, 0, 1), /* A == X ? */ + BPF_STMT(BPF_RET + BPF_K, UINT32_MAX), /* accept */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_tpa)), /* A <- TPA */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_X, 0, 0, 1), /* A == 0 ? 
*/ + BPF_STMT(BPF_RET + BPF_K, UINT32_MAX), /* accept */ + BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ + }; + struct sock_fprog fprog = { + .len = ELEMENTSOF(filter), + .filter = (struct sock_filter*) filter, + }; + + assert(fd >= 0); + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog)) < 0) + return -errno; + + return 0; +} + +int arp_network_bind_raw_socket(int ifindex, const struct in_addr *a, const struct ether_addr *mac) { + union sockaddr_union link = { + .ll.sll_family = AF_PACKET, + .ll.sll_protocol = htobe16(ETH_P_ARP), + .ll.sll_ifindex = ifindex, + .ll.sll_halen = ETH_ALEN, + .ll.sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(ifindex > 0); + assert(mac); + + s = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (s < 0) + return -errno; + + r = arp_update_filter(s, a, mac); + if (r < 0) + return r; + + if (bind(s, &link.sa, sizeof(link.ll)) < 0) + return -errno; + + return TAKE_FD(s); +} + +int arp_send_packet( + int fd, + int ifindex, + const struct in_addr *pa, + const struct ether_addr *ha, + bool announce) { + + union sockaddr_union link = { + .ll.sll_family = AF_PACKET, + .ll.sll_protocol = htobe16(ETH_P_ARP), + .ll.sll_ifindex = ifindex, + .ll.sll_halen = ETH_ALEN, + .ll.sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + }; + struct ether_arp arp = { + .ea_hdr.ar_hrd = htobe16(ARPHRD_ETHER), /* HTYPE */ + .ea_hdr.ar_pro = htobe16(ETHERTYPE_IP), /* PTYPE */ + .ea_hdr.ar_hln = ETH_ALEN, /* HLEN */ + .ea_hdr.ar_pln = sizeof(struct in_addr), /* PLEN */ + .ea_hdr.ar_op = htobe16(ARPOP_REQUEST), /* REQUEST */ + }; + ssize_t n; + + assert(fd >= 0); + assert(ifindex > 0); + assert(pa); + assert(in4_addr_is_set(pa)); + assert(ha); + assert(!ether_addr_is_null(ha)); + + memcpy(&arp.arp_sha, ha, ETH_ALEN); + memcpy(&arp.arp_tpa, pa, sizeof(struct in_addr)); + + if (announce) + memcpy(&arp.arp_spa, pa, sizeof(struct in_addr)); + + n = sendto(fd, &arp, 
sizeof(struct ether_arp), 0, &link.sa, sizeof(link.ll)); + if (n < 0) + return -errno; + if (n != sizeof(struct ether_arp)) + return -EIO; + + return 0; +} diff --git a/src/libsystemd-network/arp-util.h b/src/libsystemd-network/arp-util.h new file mode 100644 index 0000000..b66a81b --- /dev/null +++ b/src/libsystemd-network/arp-util.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2014 Axis Communications AB. All rights reserved. +***/ + +#include +#include + +#include "socket-util.h" +#include "sparse-endian.h" + +int arp_update_filter(int fd, const struct in_addr *a, const struct ether_addr *mac); +int arp_network_bind_raw_socket(int ifindex, const struct in_addr *a, const struct ether_addr *mac); + +int arp_send_packet( + int fd, + int ifindex, + const struct in_addr *pa, + const struct ether_addr *ha, + bool announce); +static inline int arp_send_probe( + int fd, + int ifindex, + const struct in_addr *pa, + const struct ether_addr *ha) { + return arp_send_packet(fd, ifindex, pa, ha, false); +} +static inline int arp_send_announcement( + int fd, + int ifindex, + const struct in_addr *pa, + const struct ether_addr *ha) { + return arp_send_packet(fd, ifindex, pa, ha, true); +} diff --git a/src/libsystemd-network/dhcp-client-internal.h b/src/libsystemd-network/dhcp-client-internal.h new file mode 100644 index 0000000..28ce80c --- /dev/null +++ b/src/libsystemd-network/dhcp-client-internal.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-dhcp-client.h" + +#include "macro.h" +#include "network-common.h" + +typedef enum DHCPState { + DHCP_STATE_STOPPED, + DHCP_STATE_INIT, + DHCP_STATE_SELECTING, + DHCP_STATE_INIT_REBOOT, + DHCP_STATE_REBOOTING, + DHCP_STATE_REQUESTING, + DHCP_STATE_BOUND, + DHCP_STATE_RENEWING, + DHCP_STATE_REBINDING, + _DHCP_STATE_MAX, + _DHCP_STATE_INVALID = -EINVAL, +} DHCPState; + +const char *dhcp_state_to_string(DHCPState s) 
_const_; + +typedef struct sd_dhcp_client sd_dhcp_client; + +int dhcp_client_set_state_callback( + sd_dhcp_client *client, + sd_dhcp_client_callback_t cb, + void *userdata); +int dhcp_client_get_state(sd_dhcp_client *client); + +/* If we are invoking callbacks of a dhcp-client, ensure unreffing the + * client from the callback doesn't destroy the object we are working + * on */ +#define DHCP_CLIENT_DONT_DESTROY(client) \ + _cleanup_(sd_dhcp_client_unrefp) _unused_ sd_dhcp_client *_dont_destroy_##client = sd_dhcp_client_ref(client) + +#define log_dhcp_client_errno(client, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "DHCPv4 client: ", \ + sd_dhcp_client, client, \ + error, fmt, ##__VA_ARGS__) +#define log_dhcp_client(client, fmt, ...) \ + log_interface_prefix_full_errno_zerook( \ + "DHCPv4 client: ", \ + sd_dhcp_client, client, \ + 0, fmt, ##__VA_ARGS__) diff --git a/src/libsystemd-network/dhcp-identifier.c b/src/libsystemd-network/dhcp-identifier.c new file mode 100644 index 0000000..f65cdbe --- /dev/null +++ b/src/libsystemd-network/dhcp-identifier.c @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "dhcp-identifier.h" +#include "netif-util.h" +#include "network-common.h" +#include "siphash24.h" +#include "sparse-endian.h" +#include "string-table.h" + +#define HASH_KEY SD_ID128_MAKE(80,11,8c,c2,fe,4a,03,ee,3e,d6,0c,6f,36,39,14,09) +#define APPLICATION_ID SD_ID128_MAKE(a5,0a,d1,12,bf,60,45,77,a2,fb,74,1a,b1,95,5b,03) +#define USEC_2000 ((usec_t) 946684800000000) /* 2000-01-01 00:00:00 UTC */ + +static const char * const duid_type_table[_DUID_TYPE_MAX] = { + [DUID_TYPE_LLT] = "DUID-LLT", + [DUID_TYPE_EN] = "DUID-EN/Vendor", + [DUID_TYPE_LL] = "DUID-LL", + [DUID_TYPE_UUID] = "UUID", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(duid_type, DUIDType); + +int dhcp_identifier_set_duid_llt( + const struct hw_addr_data *hw_addr, + uint16_t arp_type, + usec_t t, + struct duid *ret_duid, + size_t 
*ret_len) { + + uint16_t time_from_2000y; + + assert(hw_addr); + assert(ret_duid); + assert(ret_len); + + if (hw_addr->length == 0) + return -EOPNOTSUPP; + + if (arp_type == ARPHRD_ETHER) + assert_return(hw_addr->length == ETH_ALEN, -EINVAL); + else if (arp_type == ARPHRD_INFINIBAND) + assert_return(hw_addr->length == INFINIBAND_ALEN, -EINVAL); + else + return -EOPNOTSUPP; + + if (t < USEC_2000) + time_from_2000y = 0; + else + time_from_2000y = (uint16_t) (((t - USEC_2000) / USEC_PER_SEC) & 0xffffffff); + + unaligned_write_be16(&ret_duid->type, DUID_TYPE_LLT); + unaligned_write_be16(&ret_duid->llt.htype, arp_type); + unaligned_write_be32(&ret_duid->llt.time, time_from_2000y); + memcpy(ret_duid->llt.haddr, hw_addr->bytes, hw_addr->length); + + *ret_len = offsetof(struct duid, llt.haddr) + hw_addr->length; + + return 0; +} + +int dhcp_identifier_set_duid_ll( + const struct hw_addr_data *hw_addr, + uint16_t arp_type, + struct duid *ret_duid, + size_t *ret_len) { + + assert(hw_addr); + assert(ret_duid); + assert(ret_len); + + if (hw_addr->length == 0) + return -EOPNOTSUPP; + + if (arp_type == ARPHRD_ETHER) + assert_return(hw_addr->length == ETH_ALEN, -EINVAL); + else if (arp_type == ARPHRD_INFINIBAND) + assert_return(hw_addr->length == INFINIBAND_ALEN, -EINVAL); + else + return -EOPNOTSUPP; + + unaligned_write_be16(&ret_duid->type, DUID_TYPE_LL); + unaligned_write_be16(&ret_duid->ll.htype, arp_type); + memcpy(ret_duid->ll.haddr, hw_addr->bytes, hw_addr->length); + + *ret_len = offsetof(struct duid, ll.haddr) + hw_addr->length; + + return 0; +} + +int dhcp_identifier_set_duid_en(struct duid *ret_duid, size_t *ret_len) { + sd_id128_t machine_id; + bool test_mode; + uint64_t hash; + int r; + + assert(ret_duid); + assert(ret_len); + + test_mode = network_test_mode_enabled(); + + if (!test_mode) { + r = sd_id128_get_machine(&machine_id); + if (r < 0) + return r; + } else + /* For tests, especially for fuzzers, reproducibility is important. 
+ * Hence, use a static and constant machine ID. + * See 9216fddc5a8ac2742e6cfa7660f95c20ca4f2193. */ + machine_id = SD_ID128_MAKE(01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f, 10); + + unaligned_write_be16(&ret_duid->type, DUID_TYPE_EN); + unaligned_write_be32(&ret_duid->en.pen, SYSTEMD_PEN); + + /* a bit of snake-oil perhaps, but no need to expose the machine-id + * directly; duid->en.id might not be aligned, so we need to copy */ + hash = htole64(siphash24(&machine_id, sizeof(machine_id), HASH_KEY.bytes)); + memcpy(ret_duid->en.id, &hash, sizeof(hash)); + + *ret_len = offsetof(struct duid, en.id) + sizeof(hash); + + if (test_mode) + assert_se(memcmp(ret_duid, (const uint8_t[]) { 0x00, 0x02, 0x00, 0x00, 0xab, 0x11, 0x61, 0x77, 0x40, 0xde, 0x13, 0x42, 0xc3, 0xa2 }, *ret_len) == 0); + + return 0; +} + +int dhcp_identifier_set_duid_uuid(struct duid *ret_duid, size_t *ret_len) { + sd_id128_t machine_id; + int r; + + assert(ret_duid); + assert(ret_len); + + r = sd_id128_get_machine_app_specific(APPLICATION_ID, &machine_id); + if (r < 0) + return r; + + unaligned_write_be16(&ret_duid->type, DUID_TYPE_UUID); + memcpy(&ret_duid->uuid.uuid, &machine_id, sizeof(machine_id)); + + *ret_len = offsetof(struct duid, uuid.uuid) + sizeof(machine_id); + + return 0; +} + +int dhcp_identifier_set_duid_raw( + DUIDType duid_type, + const uint8_t *buf, + size_t buf_len, + struct duid *ret_duid, + size_t *ret_len) { + + assert(buf || buf_len == 0); + assert(ret_duid); + assert(ret_len); + + if (duid_type < 0 || duid_type > UINT16_MAX) + return -EINVAL; + + if (buf_len > MAX_DUID_DATA_LEN) + return -EINVAL; + + unaligned_write_be16(&ret_duid->type, duid_type); + memcpy_safe(ret_duid->raw.data, buf, buf_len); + + *ret_len = offsetof(struct duid, raw.data) + buf_len; + return 0; +} + +int dhcp_identifier_set_iaid( + sd_device *dev, + const struct hw_addr_data *hw_addr, + bool legacy_unstable_byteorder, + void *ret) { + + const char *name = NULL; + uint32_t id32; + uint64_t 
id; + + assert(hw_addr); + assert(ret); + + if (dev) + name = net_get_persistent_name(dev); + if (name) + id = siphash24(name, strlen(name), HASH_KEY.bytes); + else + /* fall back to MAC address if no predictable name available */ + id = siphash24(hw_addr->bytes, hw_addr->length, HASH_KEY.bytes); + + id32 = (id & 0xffffffff) ^ (id >> 32); + + if (legacy_unstable_byteorder) + /* for historical reasons (a bug), the bits were swapped and thus + * the result was endianness dependent. Preserve that behavior. */ + id32 = bswap_32(id32); + else + /* the fixed behavior returns a stable byte order. Since LE is expected + * to be more common, swap the bytes on LE to give the same as legacy + * behavior. */ + id32 = be32toh(id32); + + unaligned_write_ne32(ret, id32); + return 0; +} diff --git a/src/libsystemd-network/dhcp-identifier.h b/src/libsystemd-network/dhcp-identifier.h new file mode 100644 index 0000000..96db588 --- /dev/null +++ b/src/libsystemd-network/dhcp-identifier.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-device.h" +#include "sd-id128.h" + +#include "ether-addr-util.h" +#include "macro.h" +#include "sparse-endian.h" +#include "time-util.h" +#include "unaligned.h" + +#define SYSTEMD_PEN 43793 + +typedef enum DUIDType { + DUID_TYPE_LLT = 1, + DUID_TYPE_EN = 2, + DUID_TYPE_LL = 3, + DUID_TYPE_UUID = 4, + _DUID_TYPE_MAX, + _DUID_TYPE_INVALID = -EINVAL, + _DUID_TYPE_FORCE_U16 = UINT16_MAX, +} DUIDType; + +/* RFC 8415 section 11.1: + * A DUID consists of a 2-octet type code represented in network byte order, followed by a variable number of + * octets that make up the actual identifier. The length of the DUID (not including the type code) is at + * least 1 octet and at most 128 octets. 
/* https://tools.ietf.org/html/rfc3315#section-9.1 */
/* Wire representation of a DHCP Unique Identifier: a 2-octet type code in network byte order
 * followed by type-specific data. All members are packed so the layout matches the byte stream.
 * The 'raw' member is the largest one and thereby sizes the union (MAX_DUID_DATA_LEN bytes), which
 * also provides the storage behind the flexible 'haddr'/'id' arrays of the other variants. */
struct duid {
        be16_t type; /* one of the DUID_TYPE_* values, big-endian */
        union {
                struct {
                        /* DUID_TYPE_LLT */
                        be16_t htype;    /* ARP hardware type */
                        be32_t time;     /* seconds since 2000-01-01 00:00:00 UTC */
                        uint8_t haddr[]; /* link-layer address */
                } _packed_ llt;
                struct {
                        /* DUID_TYPE_EN */
                        be32_t pen;   /* enterprise number, e.g. SYSTEMD_PEN */
                        uint8_t id[]; /* enterprise-specific identifier */
                } _packed_ en;
                struct {
                        /* DUID_TYPE_LL */
                        be16_t htype;    /* ARP hardware type */
                        uint8_t haddr[]; /* link-layer address */
                } _packed_ ll;
                struct {
                        /* DUID_TYPE_UUID */
                        sd_id128_t uuid;
                } _packed_ uuid;
                struct {
                        /* Untyped view of the data following the type code */
                        uint8_t data[MAX_DUID_DATA_LEN];
                } _packed_ raw;
        };
} _packed_;
+***/ + +#include "sd-dhcp-client.h" + +#include "alloc-util.h" +#include "dhcp-option.h" +#include "list.h" +#include "time-util.h" + +struct sd_dhcp_route { + struct in_addr dst_addr; + struct in_addr gw_addr; + unsigned char dst_prefixlen; +}; + +struct sd_dhcp_raw_option { + LIST_FIELDS(struct sd_dhcp_raw_option, options); + + uint8_t tag; + uint8_t length; + void *data; +}; + +struct sd_dhcp_lease { + unsigned n_ref; + + /* each 0 if unset */ + usec_t t1; + usec_t t2; + usec_t lifetime; + triple_timestamp timestamp; + usec_t ipv6_only_preferred_usec; + + /* each 0 if unset */ + be32_t address; + be32_t server_address; + be32_t next_server; + + bool have_subnet_mask; + be32_t subnet_mask; + + bool have_broadcast; + be32_t broadcast; + + struct in_addr *router; + size_t router_size; + + bool rapid_commit; + + DHCPServerData servers[_SD_DHCP_LEASE_SERVER_TYPE_MAX]; + + struct sd_dhcp_route *static_routes; + size_t n_static_routes; + struct sd_dhcp_route *classless_routes; + size_t n_classless_routes; + + uint16_t mtu; /* 0 if unset */ + + char *domainname; + char **search_domains; + char *hostname; + char *root_path; + char *captive_portal; + + void *client_id; + size_t client_id_len; + + void *vendor_specific; + size_t vendor_specific_len; + + char *timezone; + + uint8_t sixrd_ipv4masklen; + uint8_t sixrd_prefixlen; + struct in6_addr sixrd_prefix; + struct in_addr *sixrd_br_addresses; + size_t sixrd_n_br_addresses; + + LIST_HEAD(struct sd_dhcp_raw_option, private_options); +}; + +int dhcp_lease_new(sd_dhcp_lease **ret); + +int dhcp_lease_parse_options(uint8_t code, uint8_t len, const void *option, void *userdata); +int dhcp_lease_parse_search_domains(const uint8_t *option, size_t len, char ***domains); +int dhcp_lease_insert_private_option(sd_dhcp_lease *lease, uint8_t tag, const void *data, uint8_t len); + +void dhcp_lease_set_timestamp(sd_dhcp_lease *lease, const triple_timestamp *timestamp); +int dhcp_lease_set_default_subnet_mask(sd_dhcp_lease *lease); +int 
/* Opens an AF_PACKET/SOCK_DGRAM socket for receiving DHCP replies before the interface has an
 * IP address, and binds it to 'ifindex'.
 *
 * A classic BPF program is attached that drops everything except packets which
 *   - are at least sizeof(DHCPPacket) bytes long,
 *   - are unfragmented IPv4/UDP addressed to UDP port 'port',
 *   - carry op == BOOTREPLY, htype == arp_type, xid == 'xid' and hlen == hw_addr->length,
 *   - (only when hlen == ETH_ALEN) have chaddr equal to hw_addr->bytes,
 *   - carry the DHCP magic cookie.
 * All offsets are taken from the fixed DHCPPacket layout via offsetof().
 *
 * PACKET_AUXDATA and SO_TIMESTAMP are enabled on the socket, and SO_PRIORITY is set to
 * 'so_priority' when 'so_priority_set' is true.
 *
 * On return '*link' holds the link-layer address (built from 'bcast_addr') the socket was bound
 * to; dhcp_network_send_raw_socket() passes the same structure to sendto().
 *
 * Returns the new file descriptor on success, or a negative errno-style value on failure. The
 * descriptor is closed automatically on every error path (_cleanup_close_). */
static int _bind_raw_socket(
                int ifindex,
                union sockaddr_union *link,
                uint32_t xid,
                const struct hw_addr_data *hw_addr,
                const struct hw_addr_data *bcast_addr,
                uint16_t arp_type,
                uint16_t port,
                bool so_priority_set,
                int so_priority) {

        assert(ifindex > 0);
        assert(link);
        assert(hw_addr);
        assert(bcast_addr);
        assert(IN_SET(arp_type, ARPHRD_ETHER, ARPHRD_INFINIBAND));

        /* The caller has to hand in addresses whose lengths match the link type. */
        switch (arp_type) {
        case ARPHRD_ETHER:
                assert(hw_addr->length == ETH_ALEN);
                assert(bcast_addr->length == ETH_ALEN);
                break;
        case ARPHRD_INFINIBAND:
                assert(hw_addr->length == 0);
                assert(bcast_addr->length == INFINIBAND_ALEN);
                break;
        default:
                assert_not_reached();
        }

        /* Every test below has the shape "load, compare, return 0": the compare jumps over the
         * following "ignore" instruction when the check passes. */
        struct sock_filter filter[] = {
                BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0),                                 /* A <- packet length */
                BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, sizeof(DHCPPacket), 1, 0),         /* packet >= DHCPPacket ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(DHCPPacket, ip.protocol)), /* A <- IP protocol */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 1, 0),                /* IP protocol == UDP ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(DHCPPacket, ip.frag_off)), /* A <- Flags */
                BPF_STMT(BPF_ALU + BPF_AND + BPF_K, 0x20),                             /* A <- A & 0x20 (More Fragments bit) */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 1, 0),                          /* A == 0 ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(DHCPPacket, ip.frag_off)), /* A <- Flags + Fragment offset */
                BPF_STMT(BPF_ALU + BPF_AND + BPF_K, 0x1fff),                           /* A <- A & 0x1fff (Fragment offset) */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 1, 0),                          /* A == 0 ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(DHCPPacket, udp.dest)),    /* A <- UDP destination port */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, port, 1, 0),                       /* UDP destination port == 'port' ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(DHCPPacket, dhcp.op)),     /* A <- DHCP op */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, BOOTREPLY, 1, 0),                  /* op == BOOTREPLY ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(DHCPPacket, dhcp.htype)),  /* A <- DHCP hardware type (htype) */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arp_type, 1, 0),                   /* htype == arp_type ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(DHCPPacket, dhcp.xid)),    /* A <- transaction ID (dhcp.xid) */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, xid, 1, 0),                        /* transaction ID == xid ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */
                BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(DHCPPacket, dhcp.hlen)),   /* A <- MAC address length */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, (uint8_t) hw_addr->length, 1, 0),  /* address length == hw_addr->length ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                          /* ignore */

                /* We only support MAC address length to be either 0 or 6 (ETH_ALEN). Optionally
                 * compare chaddr for ETH_ALEN bytes. When the length is not ETH_ALEN the jump below
                 * skips the next 8 instructions, i.e. the whole chaddr comparison, and lands on the
                 * magic cookie check. Keep that offset in sync when editing this section. */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETH_ALEN, 0, 8),                         /* A (the MAC address length) == ETH_ALEN ? */
                BPF_STMT(BPF_LDX + BPF_IMM, unaligned_read_be32(hw_addr->bytes)),            /* X <- 4 bytes of client's MAC */
                BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(DHCPPacket, dhcp.chaddr)),       /* A <- 4 bytes of MAC from dhcp.chaddr */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_X, 0, 1, 0),                                /* A == X ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                                /* ignore */
                BPF_STMT(BPF_LDX + BPF_IMM, unaligned_read_be16(hw_addr->bytes + 4)),        /* X <- remainder of client's MAC */
                BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(DHCPPacket, dhcp.chaddr) + 4),   /* A <- remainder of MAC from dhcp.chaddr */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_X, 0, 1, 0),                                /* A == X ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                                /* ignore */

                BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(DHCPPacket, dhcp.magic)),        /* A <- DHCP magic cookie */
                BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DHCP_MAGIC_COOKIE, 1, 0),                /* cookie == DHCP magic cookie ? */
                BPF_STMT(BPF_RET + BPF_K, 0),                                                /* ignore */
                BPF_STMT(BPF_RET + BPF_K, UINT32_MAX),                                       /* accept */
        };
        struct sock_fprog fprog = {
                .len = ELEMENTSOF(filter),
                .filter = filter
        };
        _cleanup_close_ int s = -EBADF;
        int r;

        s = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
        if (s < 0)
                return -errno;

        r = setsockopt_int(s, SOL_PACKET, PACKET_AUXDATA, true);
        if (r < 0)
                return r;

        /* Plain setsockopt() reports failure through errno, unlike the setsockopt_int() calls
         * around it whose return value is passed on directly. */
        r = setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
        if (r < 0)
                return -errno;

        r = setsockopt_int(s, SOL_SOCKET, SO_TIMESTAMP, true);
        if (r < 0)
                return r;

        if (so_priority_set) {
                r = setsockopt_int(s, SOL_SOCKET, SO_PRIORITY, so_priority);
                if (r < 0)
                        return r;
        }

        link->ll = (struct sockaddr_ll) {
                .sll_family = AF_PACKET,
                .sll_protocol = htobe16(ETH_P_IP),
                .sll_ifindex = ifindex,
                .sll_hatype = htobe16(arp_type),
                .sll_halen = bcast_addr->length,
        };
        /* We may overflow link->ll. link->ll_buffer ensures we have enough space. */
        memcpy(link->ll.sll_addr, bcast_addr->bytes, bcast_addr->length);

        r = bind(s, &link->sa, SOCKADDR_LL_LEN(link->ll));
        if (r < 0)
                return -errno;

        /* Success: hand ownership of the descriptor to the caller. */
        return TAKE_FD(s);
}
bcast_addr : &default_ib_bcast, + arp_type, port, so_priority_set, so_priority); + default: + return -EINVAL; + } +} + +int dhcp_network_bind_udp_socket(int ifindex, be32_t address, uint16_t port, int ip_service_type) { + union sockaddr_union src = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(port), + .in.sin_addr.s_addr = address, + }; + _cleanup_close_ int s = -EBADF; + int r; + + s = socket(AF_INET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (s < 0) + return -errno; + + if (ip_service_type >= 0) + r = setsockopt_int(s, IPPROTO_IP, IP_TOS, ip_service_type); + else + r = setsockopt_int(s, IPPROTO_IP, IP_TOS, IPTOS_CLASS_CS6); + if (r < 0) + return r; + + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return r; + + r = setsockopt_int(s, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return r; + + if (ifindex > 0) { + r = socket_bind_to_ifindex(s, ifindex); + if (r < 0) + return r; + } + + if (port == DHCP_PORT_SERVER) { + r = setsockopt_int(s, SOL_SOCKET, SO_BROADCAST, true); + if (r < 0) + return r; + if (address == INADDR_ANY) { + /* IP_PKTINFO filter should not be applied when packets are + allowed to enter/leave through the interface other than + DHCP server sits on(BindToInterface option). */ + r = setsockopt_int(s, IPPROTO_IP, IP_PKTINFO, true); + if (r < 0) + return r; + } + } else { + r = setsockopt_int(s, IPPROTO_IP, IP_FREEBIND, true); + if (r < 0) + return r; + } + + if (bind(s, &src.sa, sizeof(src.in)) < 0) + return -errno; + + return TAKE_FD(s); +} + +int dhcp_network_send_raw_socket( + int s, + const union sockaddr_union *link, + const void *packet, + size_t len) { + + /* Do not add assert(s >= 0) here, as this is called in fuzz-dhcp-server, and in that case this + * function should fail with negative errno. 
*/ + + assert(link); + assert(packet); + assert(len > 0); + + if (sendto(s, packet, len, 0, &link->sa, SOCKADDR_LL_LEN(link->ll)) < 0) + return -errno; + + return 0; +} + +int dhcp_network_send_udp_socket( + int s, + be32_t address, + uint16_t port, + const void *packet, + size_t len) { + + union sockaddr_union dest = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(port), + .in.sin_addr.s_addr = address, + }; + + assert(s >= 0); + assert(packet); + assert(len > 0); + + if (sendto(s, packet, len, 0, &dest.sa, sizeof(dest.in)) < 0) + return -errno; + + return 0; +} diff --git a/src/libsystemd-network/dhcp-network.h b/src/libsystemd-network/dhcp-network.h new file mode 100644 index 0000000..eb9dab4 --- /dev/null +++ b/src/libsystemd-network/dhcp-network.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "ether-addr-util.h" +#include "socket-util.h" + +int dhcp_network_bind_raw_socket( + int ifindex, + union sockaddr_union *link, + uint32_t xid, + const struct hw_addr_data *hw_addr, + const struct hw_addr_data *bcast_addr, + uint16_t arp_type, + uint16_t port, + bool so_priority_set, + int so_priority); +int dhcp_network_bind_udp_socket( + int ifindex, + be32_t address, + uint16_t port, + int ip_service_type); +int dhcp_network_send_raw_socket( + int s, + const union sockaddr_union *link, + const void *packet, + size_t len); +int dhcp_network_send_udp_socket( + int s, + be32_t address, + uint16_t port, + const void *packet, + size_t len); diff --git a/src/libsystemd-network/dhcp-option.c b/src/libsystemd-network/dhcp-option.c new file mode 100644 index 0000000..5e216c5 --- /dev/null +++ b/src/libsystemd-network/dhcp-option.c @@ -0,0 +1,461 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved. 
+***/ + +#include +#include +#include + +#include "alloc-util.h" +#include "dhcp-option.h" +#include "dhcp-server-internal.h" +#include "memory-util.h" +#include "ordered-set.h" +#include "strv.h" +#include "utf8.h" + +/* Append type-length value structure to the options buffer */ +static int dhcp_option_append_tlv(uint8_t options[], size_t size, size_t *offset, uint8_t code, size_t optlen, const void *optval) { + assert(options); + assert(size > 0); + assert(offset); + assert(optlen <= UINT8_MAX); + assert(*offset < size); + + if (*offset + 2 + optlen > size) + return -ENOBUFS; + + options[*offset] = code; + options[*offset + 1] = optlen; + + memcpy_safe(&options[*offset + 2], optval, optlen); + *offset += 2 + optlen; + return 0; +} + +static int option_append(uint8_t options[], size_t size, size_t *offset, + uint8_t code, size_t optlen, const void *optval) { + assert(options); + assert(size > 0); + assert(offset); + + int r; + + if (code != SD_DHCP_OPTION_END) + /* always make sure there is space for an END option */ + size--; + + switch (code) { + + case SD_DHCP_OPTION_PAD: + case SD_DHCP_OPTION_END: + if (*offset + 1 > size) + return -ENOBUFS; + + options[*offset] = code; + *offset += 1; + break; + + case SD_DHCP_OPTION_USER_CLASS: { + size_t total = 0; + + if (strv_isempty((char **) optval)) + return -EINVAL; + + STRV_FOREACH(s, (const char* const*) optval) { + size_t len = strlen(*s); + + if (len > 255 || len == 0) + return -EINVAL; + + total += 1 + len; + } + + if (*offset + 2 + total > size) + return -ENOBUFS; + + options[*offset] = code; + options[*offset + 1] = total; + *offset += 2; + + STRV_FOREACH(s, (const char* const*) optval) { + size_t len = strlen(*s); + + options[*offset] = len; + memcpy(&options[*offset + 1], *s, len); + *offset += 1 + len; + } + + break; + } + case SD_DHCP_OPTION_SIP_SERVER: + if (*offset + 3 + optlen > size) + return -ENOBUFS; + + options[*offset] = code; + options[*offset + 1] = optlen + 1; + options[*offset + 2] = 1; + + 
memcpy_safe(&options[*offset + 3], optval, optlen); + *offset += 3 + optlen; + + break; + case SD_DHCP_OPTION_VENDOR_SPECIFIC: { + OrderedSet *s = (OrderedSet *) optval; + struct sd_dhcp_option *p; + size_t l = 0; + + ORDERED_SET_FOREACH(p, s) + l += p->length + 2; + + if (*offset + l + 2 > size) + return -ENOBUFS; + + options[*offset] = code; + options[*offset + 1] = l; + *offset += 2; + + ORDERED_SET_FOREACH(p, s) { + r = dhcp_option_append_tlv(options, size, offset, p->option, p->length, p->data); + if (r < 0) + return r; + } + break; + } + case SD_DHCP_OPTION_RELAY_AGENT_INFORMATION: { + sd_dhcp_server *server = (sd_dhcp_server *) optval; + size_t current_offset = *offset + 2; + + if (server->agent_circuit_id) { + r = dhcp_option_append_tlv(options, size, ¤t_offset, SD_DHCP_RELAY_AGENT_CIRCUIT_ID, + strlen(server->agent_circuit_id), server->agent_circuit_id); + if (r < 0) + return r; + } + if (server->agent_remote_id) { + r = dhcp_option_append_tlv(options, size, ¤t_offset, SD_DHCP_RELAY_AGENT_REMOTE_ID, + strlen(server->agent_remote_id), server->agent_remote_id); + if (r < 0) + return r; + } + + options[*offset] = code; + options[*offset + 1] = current_offset - *offset - 2; + assert(current_offset - *offset - 2 <= UINT8_MAX); + *offset = current_offset; + break; + } + default: + return dhcp_option_append_tlv(options, size, offset, code, optlen, optval); + } + return 0; +} + +static int option_length(uint8_t *options, size_t length, size_t offset) { + assert(options); + assert(offset < length); + + if (IN_SET(options[offset], SD_DHCP_OPTION_PAD, SD_DHCP_OPTION_END)) + return 1; + if (length < offset + 2) + return -ENOBUFS; + + /* validating that buffer is long enough */ + if (length < offset + 2 + options[offset + 1]) + return -ENOBUFS; + + return options[offset + 1] + 2; +} + +int dhcp_option_find_option(uint8_t *options, size_t length, uint8_t code, size_t *ret_offset) { + int r; + + assert(options); + assert(ret_offset); + + for (size_t offset = 0; offset < 
/* Removes the first option with code 'option_code' from the option area 'options' of 'length'
 * bytes by moving everything that follows it forward.
 *
 * Returns the length of the option area after removal, or the negative value returned by
 * dhcp_option_find_option() when the option cannot be located. */
int dhcp_option_remove_option(uint8_t *options, size_t length, uint8_t option_code) {
        size_t at;
        int n;

        assert(options);

        /* 'n' is the full encoded size of the option found at 'at'. */
        n = dhcp_option_find_option(options, length, option_code, &at);
        if (n < 0)
                return n;

        uint8_t *dst = options + at;
        const uint8_t *src = dst + n;

        /* Source and destination overlap, hence memmove(). */
        memmove(dst, src, length - at - n);

        return length - n;
}
+ */ + + if (*offset < size) { + /* still space in the options array */ + r = option_append(message->options, size, offset, code, optlen, optval); + if (r >= 0) + return 0; + else if (r == -ENOBUFS && (use_file || use_sname)) { + /* did not fit, but we have more buffers to try + close the options array and move the offset to its end */ + r = option_append(message->options, size, offset, SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + *offset = size; + } else + return r; + } + + if (use_file) { + size_t file_offset = *offset - size; + + if (file_offset < sizeof(message->file)) { + /* still space in the 'file' array */ + r = option_append(message->file, sizeof(message->file), &file_offset, code, optlen, optval); + if (r >= 0) { + *offset = size + file_offset; + return 0; + } else if (r == -ENOBUFS && use_sname) { + /* did not fit, but we have more buffers to try + close the file array and move the offset to its end */ + r = option_append(message->file, sizeof(message->file), &file_offset, SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + *offset = size + sizeof(message->file); + } else + return r; + } + } + + if (use_sname) { + size_t sname_offset = *offset - size - use_file*sizeof(message->file); + + if (sname_offset < sizeof(message->sname)) { + /* still space in the 'sname' array */ + r = option_append(message->sname, sizeof(message->sname), &sname_offset, code, optlen, optval); + if (r >= 0) { + *offset = size + use_file*sizeof(message->file) + sname_offset; + return 0; + } else + /* no space, or other error, give up */ + return r; + } + } + + return -ENOBUFS; +} + +static int parse_options(const uint8_t options[], size_t buflen, uint8_t *overload, + uint8_t *message_type, char **error_message, dhcp_option_callback_t cb, + void *userdata) { + uint8_t code, len; + const uint8_t *option; + size_t offset = 0; + int r; + + while (offset < buflen) { + code = options[offset ++]; + + switch (code) { + case SD_DHCP_OPTION_PAD: + continue; + + case 
SD_DHCP_OPTION_END: + return 0; + } + + if (buflen < offset + 1) + return -ENOBUFS; + + len = options[offset ++]; + + if (buflen < offset + len) + return -EINVAL; + + option = &options[offset]; + + switch (code) { + case SD_DHCP_OPTION_MESSAGE_TYPE: + if (len != 1) + return -EINVAL; + + if (message_type) + *message_type = *option; + + break; + + case SD_DHCP_OPTION_ERROR_MESSAGE: + if (len == 0) + return -EINVAL; + + if (error_message) { + _cleanup_free_ char *string = NULL; + + r = make_cstring((const char*) option, len, MAKE_CSTRING_ALLOW_TRAILING_NUL, &string); + if (r < 0) + return r; + + if (!ascii_is_valid(string)) + return -EINVAL; + + free_and_replace(*error_message, string); + } + + break; + case SD_DHCP_OPTION_OVERLOAD: + if (len != 1) + return -EINVAL; + + if (overload) + *overload = *option; + + break; + + default: + if (cb) + cb(code, len, option, userdata); + + break; + } + + offset += len; + } + + if (offset < buflen) + return -EINVAL; + + return 0; +} + +int dhcp_option_parse(DHCPMessage *message, size_t len, dhcp_option_callback_t cb, void *userdata, char **ret_error_message) { + _cleanup_free_ char *error_message = NULL; + uint8_t overload = 0; + uint8_t message_type = 0; + int r; + + if (!message) + return -EINVAL; + + if (len < sizeof(DHCPMessage)) + return -EINVAL; + + len -= sizeof(DHCPMessage); + + r = parse_options(message->options, len, &overload, &message_type, &error_message, cb, userdata); + if (r < 0) + return r; + + if (overload & DHCP_OVERLOAD_FILE) { + r = parse_options(message->file, sizeof(message->file), NULL, &message_type, &error_message, cb, userdata); + if (r < 0) + return r; + } + + if (overload & DHCP_OVERLOAD_SNAME) { + r = parse_options(message->sname, sizeof(message->sname), NULL, &message_type, &error_message, cb, userdata); + if (r < 0) + return r; + } + + if (message_type == 0) + return -ENOMSG; + + if (ret_error_message && IN_SET(message_type, DHCP_NAK, DHCP_DECLINE)) + *ret_error_message = TAKE_PTR(error_message); + 
+ return message_type; +} + +int dhcp_option_parse_string(const uint8_t *option, size_t len, char **ret) { + int r; + + assert(option); + assert(ret); + + if (len <= 0) + *ret = mfree(*ret); + else { + char *string; + + /* + * One trailing NUL byte is OK, we don't mind. See: + * https://github.com/systemd/systemd/issues/1337 + */ + r = make_cstring((const char *) option, len, MAKE_CSTRING_ALLOW_TRAILING_NUL, &string); + if (r < 0) + return r; + + free_and_replace(*ret, string); + } + + return 0; +} + +static sd_dhcp_option* dhcp_option_free(sd_dhcp_option *i) { + if (!i) + return NULL; + + free(i->data); + return mfree(i); +} + +int sd_dhcp_option_new(uint8_t option, const void *data, size_t length, sd_dhcp_option **ret) { + assert_return(ret, -EINVAL); + assert_return(length == 0 || data, -EINVAL); + + _cleanup_free_ void *q = memdup(data, length); + if (!q) + return -ENOMEM; + + sd_dhcp_option *p = new(sd_dhcp_option, 1); + if (!p) + return -ENOMEM; + + *p = (sd_dhcp_option) { + .n_ref = 1, + .option = option, + .length = length, + .data = TAKE_PTR(q), + }; + + *ret = TAKE_PTR(p); + return 0; +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_dhcp_option, sd_dhcp_option, dhcp_option_free); +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + dhcp_option_hash_ops, + void, + trivial_hash_func, + trivial_compare_func, + sd_dhcp_option, + sd_dhcp_option_unref); diff --git a/src/libsystemd-network/dhcp-option.h b/src/libsystemd-network/dhcp-option.h new file mode 100644 index 0000000..425f5b5 --- /dev/null +++ b/src/libsystemd-network/dhcp-option.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-dhcp-option.h" + +#include "dhcp-protocol.h" +#include "hash-funcs.h" + +struct sd_dhcp_option { + unsigned n_ref; + + uint8_t option; + void *data; + size_t length; +}; + +extern const struct hash_ops dhcp_option_hash_ops; + +typedef struct DHCPServerData { + struct in_addr *addr; + size_t size; +} DHCPServerData; + +int 
dhcp_option_append( + DHCPMessage *message, + size_t size, + size_t *offset, + uint8_t overload, + uint8_t code, + size_t optlen, + const void *optval); +int dhcp_option_find_option(uint8_t *options, size_t length, uint8_t wanted_code, size_t *ret_offset); +int dhcp_option_remove_option(uint8_t *options, size_t buflen, uint8_t option_code); + +typedef int (*dhcp_option_callback_t)(uint8_t code, uint8_t len, const void *option, void *userdata); + +int dhcp_option_parse( + DHCPMessage *message, + size_t len, + dhcp_option_callback_t cb, + void *userdata, + char **ret_error_message); + +int dhcp_option_parse_string(const uint8_t *option, size_t len, char **ret); diff --git a/src/libsystemd-network/dhcp-packet.c b/src/libsystemd-network/dhcp-packet.c new file mode 100644 index 0000000..75b1d7e --- /dev/null +++ b/src/libsystemd-network/dhcp-packet.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include +#include + +#include "dhcp-option.h" +#include "dhcp-packet.h" +#include "memory-util.h" + +#define DHCP_CLIENT_MIN_OPTIONS_SIZE 312 + +int dhcp_message_init( + DHCPMessage *message, + uint8_t op, + uint32_t xid, + uint8_t type, + uint16_t arp_type, + uint8_t hlen, + const uint8_t *chaddr, + size_t optlen, + size_t *optoffset) { + + size_t offset = 0; + int r; + + assert(IN_SET(op, BOOTREQUEST, BOOTREPLY)); + assert(chaddr || hlen == 0); + + message->op = op; + message->htype = arp_type; + + /* RFC2131 section 4.1.1: + The client MUST include its hardware address in the ’chaddr’ field, if + necessary for delivery of DHCP reply messages. + + RFC 4390 section 2.1: + A DHCP client, when working over an IPoIB interface, MUST follow the + following rules: + "htype" (hardware address type) MUST be 32 [ARPPARAM]. + "hlen" (hardware address length) MUST be 0. + "chaddr" (client hardware address) field MUST be zeroed. 
/* Computes the Internet checksum (RFC 1071) over the 'len' bytes at 'buf'.
 *
 * The data is summed in 64-bit chunks with end-around carry and then folded down to 16 bits; a
 * trailing partial chunk is zero-padded. Chunks are fetched with memcpy() rather than through a
 * uint64_t pointer, so 'buf' may have any alignment (callers pass pointers into packed structs).
 *
 * Returns the one's complement of the folded sum, in the same byte order as the input data: the
 * value can be stored into the packet as is, and running the function over data that already
 * contains a correct checksum yields 0. */
uint16_t dhcp_packet_checksum(uint8_t *buf, size_t len) {
        uint64_t sum = 0;

        /* See RFC1071 */

        while (len >= sizeof(uint64_t)) {
                uint64_t word;

                memcpy(&word, buf, sizeof(word));

                sum += word;
                if (sum < word)
                        /* wrap around in one's complement */
                        sum++;

                buf += sizeof(word);
                len -= sizeof(word);
        }

        if (len > 0) {
                /* If the buffer length is not a multiple of 64 bits, we need
                   to zero-pad the last few bytes and add them in */
                uint64_t tail = 0;

                memcpy(&tail, buf, len);

                sum += tail;
                if (sum < tail)
                        /* wrap around */
                        sum++;
        }

        /* Fold the 64-bit accumulator into 16 bits. */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t) ~sum;
}
/* Sanity-checks the IPv4 and UDP headers of a raw DHCP packet of 'len' bytes.
 *
 * Checks, in order: IP version, IHL, that the IP header and the UDP datagram (per udp.len) fit
 * into 'len', that the protocol is UDP, that the UDP destination port equals 'port', and the IP
 * header checksum. The UDP checksum is verified only if 'checksum' is true and the packet carries
 * a non-zero udp.check.
 *
 * Side effect: verifying the UDP checksum overwrites packet->ip.check and packet->ip.ttl.
 *
 * Returns 0 if nothing objectionable was found. Note that a packet shorter than
 * sizeof(DHCPPacket) also yields 0 without any further inspection. Every rejection is logged at
 * debug level and returns the result of log_debug_errno(SYNTHETIC_ERRNO(EINVAL), ...), presumably
 * -EINVAL. */
int dhcp_packet_verify_headers(DHCPPacket *packet, size_t len, bool checksum, uint16_t port) {
        size_t hdrlen;

        assert(packet);

        /* NOTE(review): too-short input is reported as 0, not as an error; callers presumably
         * reject such packets by length themselves, confirm. */
        if (len < sizeof(DHCPPacket))
                return 0;

        /* IP */

        if (packet->ip.version != IPVERSION)
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: not IPv4");

        if (packet->ip.ihl < 5)
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: IPv4 IHL (%i words) invalid",
                                       packet->ip.ihl);

        /* IHL counts 32-bit words. Given the ihl >= 5 check above, the "smaller than minimum"
         * branch below cannot trigger; it is kept as a safety net. */
        hdrlen = packet->ip.ihl * 4;
        if (hdrlen < 20)
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: IPv4 IHL (%zu bytes) smaller than minimum (20 bytes)",
                                       hdrlen);

        if (len < hdrlen)
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: packet (%zu bytes) smaller than expected (%zu) by IP header",
                                       len, hdrlen);

        /* UDP */

        if (packet->ip.protocol != IPPROTO_UDP)
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: not UDP");

        if (len < hdrlen + be16toh(packet->udp.len))
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: packet (%zu bytes) smaller than expected (%zu) by UDP header",
                                       len, hdrlen + be16toh(packet->udp.len));

        if (be16toh(packet->udp.dest) != port)
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: to port %u, which is not the DHCP client port (%u)",
                                       be16toh(packet->udp.dest), port);

        /* checksums - computing these is relatively expensive, so only do it
           if all the other checks have passed
         */

        /* A header that includes a correct checksum sums up to 0. */
        if (dhcp_packet_checksum((uint8_t*)&packet->ip, hdrlen))
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "ignoring packet: invalid IP checksum");

        if (checksum && packet->udp.check) {
                /* Same trick as in dhcp_packet_append_ip_headers(): the tail of the IP header,
                 * starting at ip.ttl, is reused as the pseudo-header for the UDP checksum, with
                 * ip.check holding the UDP length and ip.ttl zeroed. The summed region is that
                 * 12-byte tail plus the whole UDP datagram. */
                packet->ip.check = packet->udp.len;
                packet->ip.ttl = 0;

                if (dhcp_packet_checksum((uint8_t*)&packet->ip.ttl,
                                  be16toh(packet->udp.len) + 12))
                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "ignoring packet: invalid UDP checksum");
        }

        return 0;
}
a/src/libsystemd-network/dhcp-packet.h b/src/libsystemd-network/dhcp-packet.h new file mode 100644 index 0000000..751321b --- /dev/null +++ b/src/libsystemd-network/dhcp-packet.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "dhcp-protocol.h" + +int dhcp_message_init( + DHCPMessage *message, + uint8_t op, + uint32_t xid, + uint8_t type, + uint16_t arp_type, + uint8_t hlen, + const uint8_t *chaddr, + size_t optlen, + size_t *optoffset); + +uint16_t dhcp_packet_checksum(uint8_t *buf, size_t len); + +void dhcp_packet_append_ip_headers( + DHCPPacket *packet, + be32_t source_addr, + uint16_t source, + be32_t destination_addr, + uint16_t destination, + uint16_t len, + int ip_service_type); + +int dhcp_packet_verify_headers(DHCPPacket *packet, size_t len, bool checksum, uint16_t port); diff --git a/src/libsystemd-network/dhcp-protocol.h b/src/libsystemd-network/dhcp-protocol.h new file mode 100644 index 0000000..d7bb203 --- /dev/null +++ b/src/libsystemd-network/dhcp-protocol.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2013 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include + +#include "sd-dhcp-protocol.h" + +#include "macro.h" +#include "sparse-endian.h" +#include "time-util.h" + +/* RFC 8925 - IPv6-Only Preferred Option for DHCPv4 3.4. + * MIN_V6ONLY_WAIT: The lower boundary for V6ONLY_WAIT. 
/* On-the-wire layout of a DHCP message: a fixed header followed by a variable-length option
 * area. The struct is packed, and 'options' is a flexible array member, so sizeof(DHCPMessage)
 * is the size of the fixed part only.
 *
 * Multi-byte fields use the be16_t/be32_t types, i.e. they are kept in network byte order. */
struct DHCPMessage {
        uint8_t op;          /* BOOTREQUEST or BOOTREPLY */
        uint8_t htype;       /* hardware type; dhcp_message_init() stores the ARPHRD_* type here */
        uint8_t hlen;        /* number of bytes of 'chaddr' in use */
        uint8_t hops;
        be32_t xid;          /* transaction ID */
        be16_t secs;
        be16_t flags;
        be32_t ciaddr;
        be32_t yiaddr;
        be32_t siaddr;
        be32_t giaddr;
        uint8_t chaddr[16];  /* hardware address, 'hlen' bytes */
        uint8_t sname[64];   /* may carry options when the overload option has DHCP_OVERLOAD_SNAME set */
        uint8_t file[128];   /* may carry options when the overload option has DHCP_OVERLOAD_FILE set */
        be32_t magic;        /* DHCP_MAGIC_COOKIE */
        uint8_t options[];   /* option area, parsed by dhcp_option_parse() */
} _packed_;
{
+        DHCP_FQDN_FLAG_S = (1 << 0),
+        DHCP_FQDN_FLAG_O = (1 << 1),
+        DHCP_FQDN_FLAG_E = (1 << 2),
+        DHCP_FQDN_FLAG_N = (1 << 3),
+};
diff --git a/src/libsystemd-network/dhcp-server-internal.h b/src/libsystemd-network/dhcp-server-internal.h
new file mode 100644
index 0000000..da9e56b
--- /dev/null
+++ b/src/libsystemd-network/dhcp-server-internal.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2013 Intel Corporation. All rights reserved.
+***/
+
+#include "sd-dhcp-server.h"
+#include "sd-event.h"
+
+#include "dhcp-option.h"
+#include "network-common.h"
+#include "ordered-set.h"
+#include "time-util.h"
+/* Data kinds a raw option value can be declared as: 8/16/32-bit unsigned integer, string,
+ * IPv4 address or IPv6 address. NOTE(review): the consumer of this enum is not visible here. */
+typedef enum DHCPRawOption {
+        DHCP_RAW_OPTION_DATA_UINT8,
+        DHCP_RAW_OPTION_DATA_UINT16,
+        DHCP_RAW_OPTION_DATA_UINT32,
+        DHCP_RAW_OPTION_DATA_STRING,
+        DHCP_RAW_OPTION_DATA_IPV4ADDRESS,
+        DHCP_RAW_OPTION_DATA_IPV6ADDRESS,
+        _DHCP_RAW_OPTION_DATA_MAX,
+        _DHCP_RAW_OPTION_DATA_INVALID,
+} DHCPRawOption;
+/* A client identifier: a data pointer plus its length (presumably in bytes; confirm). */
+typedef struct DHCPClientId {
+        size_t length;
+        uint8_t *data;
+} DHCPClientId;
+/* One lease record. It points back at its server and carries the client id, the
+ * hardware address information, the leased address, the gateway, the expiration
+ * time and the hostname. */
+typedef struct DHCPLease {
+        sd_dhcp_server *server;
+
+        DHCPClientId client_id;
+
+        uint8_t htype; /* e.g. ARPHRD_ETHER */
+        uint8_t hlen;  /* e.g. ETH_ALEN */
+        be32_t address;
+        be32_t gateway;
+        uint8_t chaddr[16];
+        usec_t expiration;
+        char *hostname;
+} DHCPLease;
+/* State of one DHCPv4 server object (the log macros at the end of this header label
+ * it "DHCPv4 server"). The blank-line separated groups are, in order: reference count;
+ * event loop sources and file descriptors; interface and address pool; timezone;
+ * server lists and boot parameters; extra/vendor options; router; lease tables
+ * (bound and static, each keyed by client id and by address); lease timing;
+ * user callback; relay target; relay agent ids. */
+struct sd_dhcp_server {
+        unsigned n_ref;
+
+        sd_event *event;
+        int event_priority;
+        sd_event_source *receive_message;
+        sd_event_source *receive_broadcast;
+        int fd;
+        int fd_raw;
+        int fd_broadcast;
+
+        int ifindex;
+        char *ifname;
+        bool bind_to_interface;
+        be32_t address;
+        be32_t netmask;
+        be32_t subnet;
+        uint32_t pool_offset;
+        uint32_t pool_size;
+
+        char *timezone;
+
+        DHCPServerData servers[_SD_DHCP_LEASE_SERVER_TYPE_MAX];
+        struct in_addr boot_server_address;
+        char *boot_server_name;
+        char *boot_filename;
+
+        OrderedSet *extra_options;
+        OrderedSet *vendor_options;
+
+        bool emit_router;
+        struct in_addr router_address;
+
+        Hashmap *bound_leases_by_client_id;
+        Hashmap *bound_leases_by_address;
+        Hashmap *static_leases_by_client_id;
+        Hashmap *static_leases_by_address;
+
+        usec_t max_lease_time;
+        usec_t default_lease_time;
+        usec_t ipv6_only_preferred_usec;
+        bool rapid_commit;
+
+        sd_dhcp_server_callback_t callback;
+        void *callback_userdata;
+
+        struct in_addr relay_target;
+
+        char *agent_circuit_id;
+        char *agent_remote_id;
+};
+/* A received message together with the values taken from its options. */
+typedef struct DHCPRequest {
+        /* received message */
+        DHCPMessage *message;
+
+        /* options */
+        DHCPClientId client_id;
+        size_t max_optlen;
+        be32_t server_id;
+        be32_t requested_ip;
+        usec_t lifetime;
+        const uint8_t *agent_info_option;
+        char *hostname;
+        const uint8_t *parameter_request_list;
+        size_t parameter_request_list_len;
+        bool rapid_commit;
+        triple_timestamp timestamp;
+} DHCPRequest;
+
+extern const struct hash_ops dhcp_lease_hash_ops;
+
+int dhcp_server_handle_message(sd_dhcp_server *server, DHCPMessage *message,
+                               size_t length, const triple_timestamp *timestamp);
+int dhcp_server_send_packet(sd_dhcp_server *server,
+                            DHCPRequest *req, DHCPPacket *packet,
+                            int type, size_t optoffset);
+/* Hash and compare helpers for DHCPClientId keys. */
+void client_id_hash_func(const DHCPClientId *p, struct siphash *state);
+int client_id_compare_func(const DHCPClientId *a, const DHCPClientId *b);
+
+DHCPLease *dhcp_lease_free(DHCPLease *lease);
+DEFINE_TRIVIAL_CLEANUP_FUNC(DHCPLease*, dhcp_lease_free);
+/* Logging helpers. Both prefix the message with "DHCPv4 server: ". log_dhcp_server()
+ * takes no error argument and passes 0 to the _zerook variant. */
+#define log_dhcp_server_errno(server, error, fmt, ...)          \
+        log_interface_prefix_full_errno(                        \
+                "DHCPv4 server: ",                              \
+                sd_dhcp_server, server,                         \
+                error, fmt, ##__VA_ARGS__)
+#define log_dhcp_server(server, fmt, ...)                       \
+        log_interface_prefix_full_errno_zerook(                 \
+                "DHCPv4 server: ",                              \
+                sd_dhcp_server, server,                         \
+                0, fmt, ##__VA_ARGS__)
diff --git a/src/libsystemd-network/dhcp6-client-internal.h b/src/libsystemd-network/dhcp6-client-internal.h
new file mode 100644
index 0000000..6c17f57
--- /dev/null
+++ b/src/libsystemd-network/dhcp6-client-internal.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-dhcp6-client.h"
+/* Internal accessors for the client state: register a state callback, read the state. */
+int dhcp6_client_set_state_callback(
+                sd_dhcp6_client *client,
+                sd_dhcp6_client_callback_t cb,
+                void *userdata);
+int dhcp6_client_get_state(sd_dhcp6_client *client);
diff --git a/src/libsystemd-network/dhcp6-internal.h b/src/libsystemd-network/dhcp6-internal.h
new file mode 100644
index 0000000..e5b3b13
--- /dev/null
+++ b/src/libsystemd-network/dhcp6-internal.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2014-2015 Intel Corporation. All rights reserved. 
+***/
+
+#include
+#include
+
+#include "sd-event.h"
+#include "sd-dhcp6-client.h"
+
+#include "dhcp-identifier.h"
+#include "dhcp6-client-internal.h"
+#include "dhcp6-option.h"
+#include "dhcp6-protocol.h"
+#include "ether-addr-util.h"
+#include "hashmap.h"
+#include "macro.h"
+#include "network-common.h"
+#include "ordered-set.h"
+#include "sparse-endian.h"
+#include "time-util.h"
+
+/* What to request from the server: addresses (IA_NA) and/or prefixes (IA_PD).
+ * The values are distinct bits, so they can be OR-ed together. */
+typedef enum DHCP6RequestIA {
+        DHCP6_REQUEST_IA_NA = 1 << 0,
+        DHCP6_REQUEST_IA_TA = 1 << 1, /* currently not used */
+        DHCP6_REQUEST_IA_PD = 1 << 2,
+} DHCP6RequestIA;
+/* State of one DHCPv6 client object (the log macros below label it "DHCPv6 client").
+ * The blank-line separated groups are, in order: reference count; interface; local and
+ * hardware address; event loop sources, priority and socket fd; device; protocol state,
+ * transaction and retransmission bookkeeping; identity (IAs, DUID) and the options to
+ * send or request; the current lease; user and state callbacks. */
+struct sd_dhcp6_client {
+        unsigned n_ref;
+
+        int ifindex;
+        char *ifname;
+
+        struct in6_addr local_address;
+        struct hw_addr_data hw_addr;
+        uint16_t arp_type;
+
+        sd_event *event;
+        sd_event_source *receive_message;
+        sd_event_source *timeout_resend;
+        sd_event_source *timeout_expire;
+        sd_event_source *timeout_t1;
+        sd_event_source *timeout_t2;
+        int event_priority;
+        int fd;
+
+        sd_device *dev;
+
+        DHCP6State state;
+        bool information_request;
+        usec_t information_request_time_usec;
+        usec_t information_refresh_time_usec;
+        be32_t transaction_id;
+        usec_t transaction_start;
+        usec_t retransmit_time;
+        uint8_t retransmit_count;
+
+        bool iaid_set;
+        DHCP6IA ia_na;
+        DHCP6IA ia_pd;
+        DHCP6RequestIA request_ia;
+        struct duid duid;
+        size_t duid_len;
+        be16_t *req_opts;
+        size_t n_req_opts;
+        char *fqdn;
+        char *mudurl;
+        char **user_class;
+        char **vendor_class;
+        OrderedHashmap *extra_options;
+        OrderedSet *vendor_options;
+        bool rapid_commit;
+
+        struct sd_dhcp6_lease *lease;
+
+        sd_dhcp6_client_callback_t callback;
+        void *userdata;
+        sd_dhcp6_client_callback_t state_callback;
+        void *state_userdata;
+        bool send_release;
+};
+/* UDP socket helpers, implemented in dhcp6-network.c. */
+int dhcp6_network_bind_udp_socket(int ifindex, struct in6_addr *address);
+int dhcp6_network_send_udp_socket(int s, struct in6_addr *address,
+                                  const void *packet, size_t len);
+
+int dhcp6_client_send_message(sd_dhcp6_client *client);
+int dhcp6_client_set_transaction_id(sd_dhcp6_client *client, uint32_t transaction_id);
+/* Logging helpers. Both prefix the message with "DHCPv6 client: ". log_dhcp6_client()
+ * takes no error argument and passes 0 to the _zerook variant. */
+#define log_dhcp6_client_errno(client, error, fmt, ...)         \
+        log_interface_prefix_full_errno(                        \
+                "DHCPv6 client: ",                              \
+                sd_dhcp6_client, client,                        \
+                error, fmt, ##__VA_ARGS__)
+#define log_dhcp6_client(client, fmt, ...)                      \
+        log_interface_prefix_full_errno_zerook(                 \
+                "DHCPv6 client: ",                              \
+                sd_dhcp6_client, client,                        \
+                0, fmt, ##__VA_ARGS__)
diff --git a/src/libsystemd-network/dhcp6-lease-internal.h b/src/libsystemd-network/dhcp6-lease-internal.h
new file mode 100644
index 0000000..e76a108
--- /dev/null
+++ b/src/libsystemd-network/dhcp6-lease-internal.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2014-2015 Intel Corporation. All rights reserved.
+***/
+
+#include
+
+#include "sd-dhcp6-lease.h"
+
+#include "dhcp6-option.h"
+#include "dhcp6-protocol.h"
+#include "macro.h"
+#include "set.h"
+#include "time-util.h"
+/* A DHCPv6 lease. The blank-line separated groups are, in order: reference count;
+ * client/server identifiers, preference, rapid-commit flag, timestamp, lifetimes and
+ * server address; the two identity associations; iterator cursors over their
+ * addresses and prefixes; the remaining per-lease data (DNS, domains, NTP, SNTP,
+ * FQDN, captive portal, vendor options). */
+struct sd_dhcp6_lease {
+        unsigned n_ref;
+
+        uint8_t *clientid;
+        size_t clientid_len;
+        uint8_t *serverid;
+        size_t serverid_len;
+        uint8_t preference;
+        bool rapid_commit;
+        triple_timestamp timestamp;
+        usec_t lifetime_t1;
+        usec_t lifetime_t2;
+        usec_t lifetime_valid;
+        struct in6_addr server_address;
+
+        DHCP6IA *ia_na; /* Identity association non-temporary addresses */
+        DHCP6IA *ia_pd; /* Identity association prefix delegation */
+
+        DHCP6Address *addr_iter;
+        DHCP6Address *prefix_iter;
+
+        struct in6_addr *dns;
+        size_t dns_count;
+        char **domains;
+        struct in6_addr *ntp;
+        size_t ntp_count;
+        char **ntp_fqdn;
+        struct in6_addr *sntp;
+        size_t sntp_count;
+        char *fqdn;
+        char *captive_portal;
+        struct sd_dhcp6_option **sorted_vendor_options;
+        Set *vendor_options;
+};
+/* Setters and getters for the identifier, preference and rapid-commit fields. */
+int dhcp6_lease_set_clientid(sd_dhcp6_lease *lease, const uint8_t *id, size_t len);
+int dhcp6_lease_get_clientid(sd_dhcp6_lease *lease, uint8_t **ret_id, size_t *ret_len);
+int dhcp6_lease_set_serverid(sd_dhcp6_lease *lease, const uint8_t *id, size_t len);
+int dhcp6_lease_get_serverid(sd_dhcp6_lease *lease, uint8_t **ret_id, size_t *ret_len);
+int dhcp6_lease_set_preference(sd_dhcp6_lease *lease, uint8_t preference);
+int dhcp6_lease_get_preference(sd_dhcp6_lease *lease, uint8_t *ret);
+int dhcp6_lease_set_rapid_commit(sd_dhcp6_lease *lease);
+int dhcp6_lease_get_rapid_commit(sd_dhcp6_lease *lease, bool *ret);
+/* Each of these takes the raw value of one received option (optval/optlen). */
+int dhcp6_lease_add_dns(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen);
+int dhcp6_lease_add_domains(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen);
+int dhcp6_lease_add_ntp(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen);
+int dhcp6_lease_add_sntp(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen);
+int dhcp6_lease_set_fqdn(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen);
+int dhcp6_lease_set_captive_portal(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen);
+
+int dhcp6_lease_new(sd_dhcp6_lease **ret);
+int dhcp6_lease_new_from_message(
+                sd_dhcp6_client *client,
+                const DHCP6Message *message,
+                size_t len,
+                const triple_timestamp *timestamp,
+                const struct in6_addr *server_address,
+                sd_dhcp6_lease **ret);
+/* Loop helpers: reset the lease's address (or PD prefix) iterator, then run the loop
+ * body for as long as the reset/next call returns a value > 0. The loop variable is an
+ * int with a generated unique name that only holds that return value. */
+#define _FOREACH_DHCP6_ADDRESS(lease, it)                               \
+        for (int it = sd_dhcp6_lease_address_iterator_reset(lease);     \
+             it > 0;                                                    \
+             it = sd_dhcp6_lease_address_iterator_next(lease))
+#define FOREACH_DHCP6_ADDRESS(lease)                                    \
+        _FOREACH_DHCP6_ADDRESS(lease, UNIQ_T(i, UNIQ))
+
+#define _FOREACH_DHCP6_PD_PREFIX(lease, it)                             \
+        for (int it = sd_dhcp6_lease_pd_iterator_reset(lease);          \
+             it > 0;                                                    \
+             it = sd_dhcp6_lease_pd_iterator_next(lease))
+#define FOREACH_DHCP6_PD_PREFIX(lease)                                  \
+        _FOREACH_DHCP6_PD_PREFIX(lease, UNIQ_T(i, UNIQ))
diff --git a/src/libsystemd-network/dhcp6-network.c b/src/libsystemd-network/dhcp6-network.c
new file mode 100644
index 0000000..a3e4e19
--- /dev/null
+++ b/src/libsystemd-network/dhcp6-network.c
@@ -0,0 +1,78 @@
+/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dhcp6-internal.h" +#include "dhcp6-protocol.h" +#include "fd-util.h" +#include "socket-util.h" + +int dhcp6_network_bind_udp_socket(int ifindex, struct in6_addr *local_address) { + union sockaddr_union src = { + .in6.sin6_family = AF_INET6, + .in6.sin6_port = htobe16(DHCP6_PORT_CLIENT), + .in6.sin6_scope_id = ifindex, + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(ifindex > 0); + assert(local_address); + + src.in6.sin6_addr = *local_address; + + s = socket(AF_INET6, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, IPPROTO_UDP); + if (s < 0) + return -errno; + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_V6ONLY, true); + if (r < 0) + return r; + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, false); + if (r < 0) + return r; + + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return r; + + r = setsockopt_int(s, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return r; + + r = bind(s, &src.sa, sizeof(src.in6)); + if (r < 0) + return -errno; + + return TAKE_FD(s); +} + +int dhcp6_network_send_udp_socket(int s, struct in6_addr *server_address, + const void *packet, size_t len) { + union sockaddr_union dest = { + .in6.sin6_family = AF_INET6, + .in6.sin6_port = htobe16(DHCP6_PORT_SERVER), + }; + int r; + + assert(server_address); + + memcpy(&dest.in6.sin6_addr, server_address, sizeof(dest.in6.sin6_addr)); + + r = sendto(s, packet, len, 0, &dest.sa, sizeof(dest.in6)); + if (r < 0) + return -errno; + + return 0; +} diff --git a/src/libsystemd-network/dhcp6-option.c b/src/libsystemd-network/dhcp6-option.c new file mode 100644 index 0000000..83f40f3 --- /dev/null +++ b/src/libsystemd-network/dhcp6-option.c @@ -0,0 +1,979 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014-2015 Intel Corporation. 
All rights reserved.
+***/
+
+#include
+#include
+
+#include "sd-dhcp6-client.h"
+
+#include "alloc-util.h"
+#include "dhcp6-internal.h"
+#include "dhcp6-option.h"
+#include "dhcp6-protocol.h"
+#include "dns-domain.h"
+#include "escape.h"
+#include "memory-util.h"
+#include "network-common.h"
+#include "strv.h"
+#include "unaligned.h"
+
+#define DHCP6_OPTION_IA_NA_LEN (sizeof(struct ia_na))
+#define DHCP6_OPTION_IA_PD_LEN (sizeof(struct ia_pd))
+#define DHCP6_OPTION_IA_TA_LEN (sizeof(struct ia_ta))
+/* Returns true if the given option code is one the client may request, false otherwise.
+ * Codes that are not listed in the switch below are treated as not requestable. */
+bool dhcp6_option_can_request(uint16_t option) {
+        /* See Client ORO field in
+         * https://www.iana.org/assignments/dhcpv6-parameters/dhcpv6-parameters.xhtml#dhcpv6-parameters-2 */
+
+        switch (option) {
+        case SD_DHCP6_OPTION_CLIENTID:
+        case SD_DHCP6_OPTION_SERVERID:
+        case SD_DHCP6_OPTION_IA_NA:
+        case SD_DHCP6_OPTION_IA_TA:
+        case SD_DHCP6_OPTION_IAADDR:
+        case SD_DHCP6_OPTION_ORO:
+        case SD_DHCP6_OPTION_PREFERENCE:
+        case SD_DHCP6_OPTION_ELAPSED_TIME:
+        case SD_DHCP6_OPTION_RELAY_MSG:
+        case SD_DHCP6_OPTION_AUTH:
+        case SD_DHCP6_OPTION_UNICAST:
+        case SD_DHCP6_OPTION_STATUS_CODE:
+        case SD_DHCP6_OPTION_RAPID_COMMIT:
+        case SD_DHCP6_OPTION_USER_CLASS:
+        case SD_DHCP6_OPTION_VENDOR_CLASS:
+                return false;
+        case SD_DHCP6_OPTION_VENDOR_OPTS:
+                return true;
+        case SD_DHCP6_OPTION_INTERFACE_ID:
+        case SD_DHCP6_OPTION_RECONF_MSG:
+        case SD_DHCP6_OPTION_RECONF_ACCEPT:
+                return false;
+        case SD_DHCP6_OPTION_SIP_SERVER_DOMAIN_NAME:
+        case SD_DHCP6_OPTION_SIP_SERVER_ADDRESS:
+        case SD_DHCP6_OPTION_DNS_SERVER:
+        case SD_DHCP6_OPTION_DOMAIN:
+                return true;
+        case SD_DHCP6_OPTION_IA_PD:
+        case SD_DHCP6_OPTION_IA_PD_PREFIX:
+                return false;
+        case SD_DHCP6_OPTION_NIS_SERVER:
+        case SD_DHCP6_OPTION_NISP_SERVER:
+        case SD_DHCP6_OPTION_NIS_DOMAIN_NAME:
+        case SD_DHCP6_OPTION_NISP_DOMAIN_NAME:
+        case SD_DHCP6_OPTION_SNTP_SERVER:
+                return true;
+        case SD_DHCP6_OPTION_INFORMATION_REFRESH_TIME:
+                return false; /* This is automatically set when sending INFORMATION_REQUEST message. */
+        case SD_DHCP6_OPTION_BCMCS_SERVER_D:
+        case SD_DHCP6_OPTION_BCMCS_SERVER_A:
+        case SD_DHCP6_OPTION_GEOCONF_CIVIC:
+                return true;
+        case SD_DHCP6_OPTION_REMOTE_ID:
+        case SD_DHCP6_OPTION_SUBSCRIBER_ID:
+                return false;
+        case SD_DHCP6_OPTION_CLIENT_FQDN:
+        case SD_DHCP6_OPTION_PANA_AGENT:
+        case SD_DHCP6_OPTION_POSIX_TIMEZONE:
+        case SD_DHCP6_OPTION_TZDB_TIMEZONE:
+                return true;
+        case SD_DHCP6_OPTION_ERO:
+        case SD_DHCP6_OPTION_LQ_QUERY:
+        case SD_DHCP6_OPTION_CLIENT_DATA:
+        case SD_DHCP6_OPTION_CLT_TIME:
+        case SD_DHCP6_OPTION_LQ_RELAY_DATA:
+        case SD_DHCP6_OPTION_LQ_CLIENT_LINK:
+                return false;
+        case SD_DHCP6_OPTION_MIP6_HNIDF:
+        case SD_DHCP6_OPTION_MIP6_VDINF:
+        case SD_DHCP6_OPTION_V6_LOST:
+        case SD_DHCP6_OPTION_CAPWAP_AC_V6:
+                return true;
+        case SD_DHCP6_OPTION_RELAY_ID:
+                return false;
+        case SD_DHCP6_OPTION_IPV6_ADDRESS_MOS:
+        case SD_DHCP6_OPTION_IPV6_FQDN_MOS:
+        case SD_DHCP6_OPTION_NTP_SERVER:
+        case SD_DHCP6_OPTION_V6_ACCESS_DOMAIN:
+        case SD_DHCP6_OPTION_SIP_UA_CS_LIST:
+        case SD_DHCP6_OPTION_BOOTFILE_URL:
+        case SD_DHCP6_OPTION_BOOTFILE_PARAM:
+                return true;
+        case SD_DHCP6_OPTION_CLIENT_ARCH_TYPE:
+                return false;
+        case SD_DHCP6_OPTION_NII:
+        case SD_DHCP6_OPTION_GEOLOCATION:
+        case SD_DHCP6_OPTION_AFTR_NAME:
+        case SD_DHCP6_OPTION_ERP_LOCAL_DOMAIN_NAME:
+                return true;
+        case SD_DHCP6_OPTION_RSOO:
+                return false;
+        case SD_DHCP6_OPTION_PD_EXCLUDE:
+                return true;
+        case SD_DHCP6_OPTION_VSS:
+                return false;
+        case SD_DHCP6_OPTION_MIP6_IDINF:
+        case SD_DHCP6_OPTION_MIP6_UDINF:
+        case SD_DHCP6_OPTION_MIP6_HNP:
+        case SD_DHCP6_OPTION_MIP6_HAA:
+        case SD_DHCP6_OPTION_MIP6_HAF:
+        case SD_DHCP6_OPTION_RDNSS_SELECTION:
+        case SD_DHCP6_OPTION_KRB_PRINCIPAL_NAME:
+        case SD_DHCP6_OPTION_KRB_REALM_NAME:
+        case SD_DHCP6_OPTION_KRB_DEFAULT_REALM_NAME:
+        case SD_DHCP6_OPTION_KRB_KDC:
+                return true;
+        case SD_DHCP6_OPTION_CLIENT_LINKLAYER_ADDR:
+        case SD_DHCP6_OPTION_LINK_ADDRESS:
+        case SD_DHCP6_OPTION_RADIUS:
+        case SD_DHCP6_OPTION_SOL_MAX_RT: /* Automatically set when sending SOLICIT message. */
+        case SD_DHCP6_OPTION_INF_MAX_RT: /* Automatically set when sending INFORMATION_REQUEST message. */
+                return false;
+        case SD_DHCP6_OPTION_ADDRSEL:
+        case SD_DHCP6_OPTION_ADDRSEL_TABLE:
+        case SD_DHCP6_OPTION_V6_PCP_SERVER:
+                return true;
+        case SD_DHCP6_OPTION_DHCPV4_MSG:
+                return false;
+        case SD_DHCP6_OPTION_DHCP4_O_DHCP6_SERVER:
+                return true;
+        case SD_DHCP6_OPTION_S46_RULE:
+                return false;
+        case SD_DHCP6_OPTION_S46_BR:
+                return true;
+        case SD_DHCP6_OPTION_S46_DMR:
+        case SD_DHCP6_OPTION_S46_V4V6BIND:
+        case SD_DHCP6_OPTION_S46_PORTPARAMS:
+                return false;
+        case SD_DHCP6_OPTION_S46_CONT_MAPE:
+        case SD_DHCP6_OPTION_S46_CONT_MAPT:
+        case SD_DHCP6_OPTION_S46_CONT_LW:
+        case SD_DHCP6_OPTION_4RD:
+        case SD_DHCP6_OPTION_4RD_MAP_RULE:
+        case SD_DHCP6_OPTION_4RD_NON_MAP_RULE:
+                return true;
+        case SD_DHCP6_OPTION_LQ_BASE_TIME:
+        case SD_DHCP6_OPTION_LQ_START_TIME:
+        case SD_DHCP6_OPTION_LQ_END_TIME:
+                return false;
+        case SD_DHCP6_OPTION_CAPTIVE_PORTAL:
+        case SD_DHCP6_OPTION_MPL_PARAMETERS:
+                return true;
+        case SD_DHCP6_OPTION_ANI_ATT:
+        case SD_DHCP6_OPTION_ANI_NETWORK_NAME:
+        case SD_DHCP6_OPTION_ANI_AP_NAME:
+        case SD_DHCP6_OPTION_ANI_AP_BSSID:
+        case SD_DHCP6_OPTION_ANI_OPERATOR_ID:
+        case SD_DHCP6_OPTION_ANI_OPERATOR_REALM:
+                return false;
+        case SD_DHCP6_OPTION_S46_PRIORITY:
+                return true;
+        case SD_DHCP6_OPTION_MUD_URL_V6:
+                return false;
+        case SD_DHCP6_OPTION_V6_PREFIX64:
+                return true;
+        case SD_DHCP6_OPTION_F_BINDING_STATUS:
+        case SD_DHCP6_OPTION_F_CONNECT_FLAGS:
+        case SD_DHCP6_OPTION_F_DNS_REMOVAL_INFO:
+        case SD_DHCP6_OPTION_F_DNS_HOST_NAME:
+        case SD_DHCP6_OPTION_F_DNS_ZONE_NAME:
+        case SD_DHCP6_OPTION_F_DNS_FLAGS:
+        case SD_DHCP6_OPTION_F_EXPIRATION_TIME:
+        case SD_DHCP6_OPTION_F_MAX_UNACKED_BNDUPD:
+        case SD_DHCP6_OPTION_F_MCLT:
+        case SD_DHCP6_OPTION_F_PARTNER_LIFETIME:
+        case SD_DHCP6_OPTION_F_PARTNER_LIFETIME_SENT:
+        case SD_DHCP6_OPTION_F_PARTNER_DOWN_TIME:
+        case SD_DHCP6_OPTION_F_PARTNER_RAW_CLT_TIME:
+        case SD_DHCP6_OPTION_F_PROTOCOL_VERSION:
+        case SD_DHCP6_OPTION_F_KEEPALIVE_TIME:
+        case SD_DHCP6_OPTION_F_RECONFIGURE_DATA:
+        case SD_DHCP6_OPTION_F_RELATIONSHIP_NAME:
+        case SD_DHCP6_OPTION_F_SERVER_FLAGS:
+        case SD_DHCP6_OPTION_F_SERVER_STATE:
+        case SD_DHCP6_OPTION_F_START_TIME_OF_STATE:
+        case SD_DHCP6_OPTION_F_STATE_EXPIRATION_TIME:
+        case SD_DHCP6_OPTION_RELAY_PORT:
+                return false;
+        case SD_DHCP6_OPTION_V6_SZTP_REDIRECT:
+        case SD_DHCP6_OPTION_S46_BIND_IPV6_PREFIX:
+                return true;
+        case SD_DHCP6_OPTION_IA_LL:
+        case SD_DHCP6_OPTION_LLADDR:
+        case SD_DHCP6_OPTION_SLAP_QUAD:
+                return false;
+        case SD_DHCP6_OPTION_V6_DOTS_RI:
+        case SD_DHCP6_OPTION_V6_DOTS_ADDRESS:
+        case SD_DHCP6_OPTION_IPV6_ADDRESS_ANDSF:
+                return true;
+        default:
+                return false;
+        }
+}
+/* Writes an option header (code and length, both big-endian 16 bit) at *buf + *offset and
+ * advances *offset past the header only. The buffer is resized so that it can also hold
+ * the optlen bytes of payload the caller is expected to write next.
+ * Returns 0 on success, -ENOBUFS if optlen does not fit in 16 bits or the new size would
+ * overflow size_t, -ENOMEM if the reallocation fails. */
+static int option_append_hdr(uint8_t **buf, size_t *offset, uint16_t optcode, size_t optlen) {
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+
+        if (optlen > 0xffff)
+                return -ENOBUFS;
+
+        if (optlen + offsetof(DHCP6Option, data) > SIZE_MAX - *offset)
+                return -ENOBUFS;
+
+        if (!GREEDY_REALLOC(*buf, *offset + optlen + offsetof(DHCP6Option, data)))
+                return -ENOMEM;
+
+        unaligned_write_be16(*buf + *offset + offsetof(DHCP6Option, code), optcode);
+        unaligned_write_be16(*buf + *offset + offsetof(DHCP6Option, len), optlen);
+
+        *offset += offsetof(DHCP6Option, data);
+        return 0;
+}
+/* Appends one complete option (header followed by optlen bytes from optval) to *buf at
+ * *offset and advances *offset past it. optval may be NULL only if optlen is 0.
+ * Returns 0 on success or the negative error from option_append_hdr(). */
+int dhcp6_option_append(
+                uint8_t **buf,
+                size_t *offset,
+                uint16_t code,
+                size_t optlen,
+                const void *optval) {
+
+        int r;
+
+        assert(optval || optlen == 0);
+
+        r = option_append_hdr(buf, offset, code, optlen);
+        if (r < 0)
+                return r;
+
+        memcpy_safe(*buf + *offset, optval, optlen);
+        *offset += optlen;
+
+        return 0;
+}
+/* Appends one SD_DHCP6_OPTION_VENDOR_OPTS option per entry of vendor_options. The
+ * payload of each is: enterprise identifier (4 bytes, big-endian), sub-option code
+ * (2 bytes), sub-option length (2 bytes), then the entry's data.
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * dhcp6_option_append(). */
+int dhcp6_option_append_vendor_option(uint8_t **buf, size_t *offset, OrderedSet *vendor_options) {
+        sd_dhcp6_option *options;
+        int r;
+
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+
+        ORDERED_SET_FOREACH(options, vendor_options) {
+                _cleanup_free_ uint8_t *p = NULL;
+                size_t total;
+
+                total = 4 + 2 + 2 + options->length; /* enterprise id + code + length + data */
+
+                p = malloc(total);
+                if (!p)
+                        return -ENOMEM;
+
+                unaligned_write_be32(p, options->enterprise_identifier);
+                unaligned_write_be16(p + 4, options->option);
+                unaligned_write_be16(p + 6, options->length);
+                memcpy(p + 8, options->data, options->length);
+
+                r = dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_VENDOR_OPTS, total, p);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+/* Appends an SD_DHCP6_OPTION_IAADDR option that carries only the address. */
+static int option_append_ia_address(uint8_t **buf, size_t *offset, const struct iaaddr *address) {
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+        assert(address);
+
+        /* Do not send the lifetimes: only the address is copied, the preferred and valid
+         * lifetime fields of the option stay zero. */
+        const struct iaaddr a = {
+                .address = address->address,
+        };
+
+        return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_IAADDR, sizeof(struct iaaddr), &a);
+}
+/* Appends an SD_DHCP6_OPTION_IA_PD_PREFIX option that carries only the prefix length
+ * and the address. Returns -EINVAL if the prefix length is 0. */
+static int option_append_pd_prefix(uint8_t **buf, size_t *offset, const struct iapdprefix *prefix) {
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+        assert(prefix);
+
+        if (prefix->prefixlen == 0)
+                return -EINVAL;
+
+        /* Do not send the lifetimes: only the prefix length and the address are copied, the
+         * preferred and valid lifetime fields of the option stay zero. */
+        const struct iapdprefix p = {
+                .prefixlen = prefix->prefixlen,
+                .address = prefix->address,
+        };
+
+        return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_IA_PD_PREFIX, sizeof(struct iapdprefix), &p);
+}
+/* Appends an IA_NA, IA_TA or IA_PD option for ia. The option payload (IA header plus one
+ * address or prefix sub-option per entry of ia->addresses) is first built in a temporary
+ * buffer and then appended to *buf as a single option of type ia->type.
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error of a sub-option. */
+int dhcp6_option_append_ia(uint8_t **buf, size_t *offset, const DHCP6IA *ia) {
+        _cleanup_free_ uint8_t *data = NULL;
+        struct ia_header header;
+        size_t len;
+        int r;
+
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+        assert(ia);
+
+        /* The client should not set T1 and T2 in the IA options it sends, hence only the
+         * IAID is copied below. See RFC 8415 and issue #18090. */
+
+        switch (ia->type) {
+        case SD_DHCP6_OPTION_IA_NA:
+        case SD_DHCP6_OPTION_IA_PD:
+                len = sizeof(struct ia_header);
+                header = (struct ia_header) {
+                        .id = ia->header.id,
+                };
+                break;
+
+        case SD_DHCP6_OPTION_IA_TA:
+                len = sizeof(header.id); /* IA_TA does not have lifetime. */
+                header = (struct ia_header) {
+                        .id = ia->header.id,
+                };
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        if (!GREEDY_REALLOC(data, len))
+                return -ENOMEM;
+
+        memcpy(data, &header, len);
+
+        LIST_FOREACH(addresses, addr, ia->addresses) {
+                if (ia->type == SD_DHCP6_OPTION_IA_PD)
+                        r = option_append_pd_prefix(&data, &len, &addr->iapdprefix);
+                else
+                        r = option_append_ia_address(&data, &len, &addr->iaaddr);
+                if (r < 0)
+                        return r;
+        }
+
+        return dhcp6_option_append(buf, offset, ia->type, len, data);
+}
+/* Appends an SD_DHCP6_OPTION_CLIENT_FQDN option: one flags byte followed by fqdn in DNS
+ * wire format. Appends nothing and returns 0 if fqdn is NULL or empty. Also returns
+ * without appending when dns_name_to_wire_format() returns 0 or an error, passing that
+ * value on. */
+int dhcp6_option_append_fqdn(uint8_t **buf, size_t *offset, const char *fqdn) {
+        uint8_t buffer[1 + DNS_WIRE_FORMAT_HOSTNAME_MAX];
+        int r;
+
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+
+        if (isempty(fqdn))
+                return 0;
+
+        buffer[0] = DHCP6_FQDN_FLAG_S; /* Request server to perform AAAA RR DNS updates */
+
+        /* Store domain name after flags field */
+        r = dns_name_to_wire_format(fqdn, buffer + 1, sizeof(buffer) - 1, false);
+        if (r <= 0)
+                return r;
+
+        /*
+         * According to RFC 4704, chapter 4.2 only add terminating zero-length
+         * label in case a FQDN is provided. Since dns_name_to_wire_format
+         * always adds terminating zero-length label remove if only a hostname
+         * is provided.
+         */
+        if (dns_name_is_single_label(fqdn))
+                r--;
+
+        return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_CLIENT_FQDN, 1 + r, buffer);
+}
+/* Appends an SD_DHCP6_OPTION_USER_CLASS option. Each string of user_class is encoded
+ * as a 2-byte big-endian length followed by the string bytes, without a terminating
+ * NUL. Appends nothing and returns 0 if the list is empty. Returns -EINVAL if a string
+ * is empty or longer than UINT16_MAX, -ENOMEM on allocation failure. */
+int dhcp6_option_append_user_class(uint8_t **buf, size_t *offset, char * const *user_class) {
+        _cleanup_free_ uint8_t *p = NULL;
+        size_t n = 0;
+
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+
+        if (strv_isempty(user_class))
+                return 0;
+
+        STRV_FOREACH(s, user_class) {
+                size_t len = strlen(*s);
+
+                if (len > UINT16_MAX || len == 0)
+                        return -EINVAL;
+
+                if (!GREEDY_REALLOC(p, n + len + 2))
+                        return -ENOMEM;
+
+                unaligned_write_be16(p + n, len);
+                memcpy(p + n + 2, *s, len);
+                n += len + 2;
+        }
+
+        return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_USER_CLASS, n, p);
+}
+/* Appends an SD_DHCP6_OPTION_VENDOR_CLASS option. The payload starts with SYSTEMD_PEN as
+ * a 4-byte big-endian enterprise identifier; the strings of vendor_class follow, encoded
+ * the same way as in dhcp6_option_append_user_class(). Same return values as there. */
+int dhcp6_option_append_vendor_class(uint8_t **buf, size_t *offset, char * const *vendor_class) {
+        _cleanup_free_ uint8_t *p = NULL;
+        size_t n = 0;
+
+        assert(buf);
+        assert(*buf);
+        assert(offset);
+
+        if (strv_isempty(vendor_class))
+                return 0;
+
+        if (!GREEDY_REALLOC(p, sizeof(be32_t)))
+                return -ENOMEM;
+
+        /* Enterprise Identifier */
+        unaligned_write_be32(p, SYSTEMD_PEN);
+        n += sizeof(be32_t);
+
+        STRV_FOREACH(s, vendor_class) {
+                size_t len = strlen(*s);
+
+                if (len > UINT16_MAX || len == 0)
+                        return -EINVAL;
+
+                if (!GREEDY_REALLOC(p, n + len + 2))
+                        return -ENOMEM;
+
+                unaligned_write_be16(p + n, len);
+                memcpy(p + n + 2, *s, len);
+                n += len + 2;
+        }
+
+        return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_VENDOR_CLASS, n, p);
+}
+/* Parses the option that starts at buf + *offset. On success the option code, the
+ * payload length and a pointer to the payload (NULL if the length is 0) are stored in
+ * the ret_* arguments, *offset is advanced past the option and 0 is returned.
+ * Returns -EBADMSG if the header or the announced payload does not fit in buflen. */
+int dhcp6_option_parse(
+                const uint8_t *buf,
+                size_t buflen,
+                size_t *offset,
+                uint16_t *ret_option_code,
+                size_t *ret_option_data_len,
+                const uint8_t **ret_option_data) {
+
+        size_t len;
+
+        assert(buf);
+        assert(offset);
+        assert(ret_option_code);
+        assert(ret_option_data_len);
+        assert(ret_option_data);
+
+        if (buflen < offsetof(DHCP6Option, data))
+                return -EBADMSG;
+
+        if (*offset > buflen - offsetof(DHCP6Option, data))
+                return -EBADMSG;
+
+        len = unaligned_read_be16(buf + *offset + offsetof(DHCP6Option, len));
+
+        if (len > buflen - offsetof(DHCP6Option, data) - *offset)
+                return -EBADMSG;
+
+        *ret_option_code = unaligned_read_be16(buf + *offset + offsetof(DHCP6Option, code));
+        *ret_option_data_len = len;
+        *ret_option_data = len == 0 ? NULL : buf + *offset + offsetof(DHCP6Option, data);
+        *offset += offsetof(DHCP6Option, data) + len;
+
+        return 0;
+}
+/* Parses the payload of a status code option. Returns the status code (the first two
+ * bytes, big-endian; zero or positive), -EBADMSG if the payload is shorter than two
+ * bytes, or -ENOMEM. If ret_status_message is non-NULL it receives a newly allocated
+ * string: the escaped message text from the option, to which the string for the status
+ * code is added with ": " as separator when dhcp6_message_status_to_string() knows
+ * the code. */
+int dhcp6_option_parse_status(const uint8_t *data, size_t data_len, char **ret_status_message) {
+        DHCP6Status status;
+
+        assert(data || data_len == 0);
+
+        if (data_len < sizeof(uint16_t))
+                return -EBADMSG;
+
+        status = unaligned_read_be16(data);
+
+        if (ret_status_message) {
+                _cleanup_free_ char *msg = NULL;
+                const char *s;
+
+                /* The status message MUST NOT be null-terminated. See section 21.13 of RFC8415.
+                 * Let's escape unsafe characters for safety. */
+                msg = cescape_length((const char*) (data + sizeof(uint16_t)), data_len - sizeof(uint16_t));
+                if (!msg)
+                        return -ENOMEM;
+
+                s = dhcp6_message_status_to_string(status);
+                if (s && !strextend_with_separator(&msg, ": ", s))
+                        return -ENOMEM;
+
+                *ret_status_message = TAKE_PTR(msg);
+        }
+
+        return status;
+}
+
+/* parse a string from dhcp option field. 
*ret must be initialized */ +int dhcp6_option_parse_string(const uint8_t *data, size_t data_len, char **ret) { + _cleanup_free_ char *string = NULL; + int r; + + assert(data || data_len == 0); + assert(ret); + + if (data_len <= 0) { + *ret = mfree(*ret); + return 0; + } + + r = make_cstring((const char *) data, data_len, MAKE_CSTRING_REFUSE_TRAILING_NUL, &string); + if (r < 0) + return r; + + return free_and_replace(*ret, string); +} + +static int dhcp6_option_parse_ia_options(sd_dhcp6_client *client, const uint8_t *buf, size_t buflen) { + int r; + + assert(buf || buflen == 0); + + for (size_t offset = 0; offset < buflen;) { + const uint8_t *data; + size_t data_len; + uint16_t code; + + r = dhcp6_option_parse(buf, buflen, &offset, &code, &data_len, &data); + if (r < 0) + return r; + + switch (code) { + case SD_DHCP6_OPTION_STATUS_CODE: { + _cleanup_free_ char *msg = NULL; + + r = dhcp6_option_parse_status(data, data_len, &msg); + if (r == -ENOMEM) + return r; + if (r > 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA address or PD prefix option with non-zero status%s%s", + isempty(msg) ? "." : ": ", strempty(msg)); + if (r < 0) + /* Let's log but ignore the invalid status option. 
*/ + log_dhcp6_client_errno(client, r, + "Received an IA address or PD prefix option with an invalid status sub option, ignoring: %m"); + break; + } + default: + log_dhcp6_client(client, "Received an unknown sub option %u in IA address or PD prefix, ignoring.", code); + } + } + + return 0; +} + +static int dhcp6_option_parse_ia_address(sd_dhcp6_client *client, DHCP6IA *ia, const uint8_t *data, size_t len) { + _cleanup_free_ DHCP6Address *a = NULL; + usec_t lt_valid, lt_pref; + int r; + + assert(ia); + assert(data || len == 0); + + if (!IN_SET(ia->type, SD_DHCP6_OPTION_IA_NA, SD_DHCP6_OPTION_IA_TA)) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA address sub-option in an invalid option, ignoring."); + + if (len < sizeof(struct iaaddr)) + return -EBADMSG; + + a = new(DHCP6Address, 1); + if (!a) + return -ENOMEM; + + memcpy(&a->iaaddr, data, sizeof(struct iaaddr)); + + lt_valid = be32_sec_to_usec(a->iaaddr.lifetime_valid, /* max_as_infinity = */ true); + lt_pref = be32_sec_to_usec(a->iaaddr.lifetime_preferred, /* max_as_infinity = */ true); + + if (lt_valid == 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA address with zero valid lifetime, ignoring."); + if (lt_pref > lt_valid) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA address with preferred lifetime %s " + "larger than valid lifetime %s, ignoring.", + FORMAT_TIMESPAN(lt_pref, USEC_PER_SEC), + FORMAT_TIMESPAN(lt_valid, USEC_PER_SEC)); + + if (len > sizeof(struct iaaddr)) { + r = dhcp6_option_parse_ia_options(client, data + sizeof(struct iaaddr), len - sizeof(struct iaaddr)); + if (r < 0) + return r; + } + + LIST_PREPEND(addresses, ia->addresses, TAKE_PTR(a)); + return 0; +} + +static int dhcp6_option_parse_ia_pdprefix(sd_dhcp6_client *client, DHCP6IA *ia, const uint8_t *data, size_t len) { + _cleanup_free_ DHCP6Address *a = NULL; + usec_t lt_valid, lt_pref; + int r; + + assert(ia); + assert(data || len == 
0); + + if (ia->type != SD_DHCP6_OPTION_IA_PD) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an PD prefix sub-option in an invalid option, ignoring"); + + if (len < sizeof(struct iapdprefix)) + return -EBADMSG; + + a = new(DHCP6Address, 1); + if (!a) + return -ENOMEM; + + memcpy(&a->iapdprefix, data, sizeof(struct iapdprefix)); + + lt_valid = be32_sec_to_usec(a->iapdprefix.lifetime_valid, /* max_as_infinity = */ true); + lt_pref = be32_sec_to_usec(a->iapdprefix.lifetime_preferred, /* max_as_infinity = */ true); + + if (lt_valid == 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received a PD prefix with zero valid lifetime, ignoring."); + if (lt_pref > lt_valid) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received a PD prefix with preferred lifetime %s " + "larger than valid lifetime %s, ignoring.", + FORMAT_TIMESPAN(lt_pref, USEC_PER_SEC), + FORMAT_TIMESPAN(lt_valid, USEC_PER_SEC)); + + if (len > sizeof(struct iapdprefix)) { + r = dhcp6_option_parse_ia_options(client, data + sizeof(struct iapdprefix), len - sizeof(struct iapdprefix)); + if (r < 0) + return r; + } + + LIST_PREPEND(addresses, ia->addresses, TAKE_PTR(a)); + return 0; +} + +int dhcp6_option_parse_ia( + sd_dhcp6_client *client, + be32_t iaid, + uint16_t option_code, + size_t option_data_len, + const uint8_t *option_data, + DHCP6IA **ret) { + + _cleanup_(dhcp6_ia_freep) DHCP6IA *ia = NULL; + usec_t lt_t1, lt_t2; + size_t header_len; + int r; + + assert(IN_SET(option_code, SD_DHCP6_OPTION_IA_NA, SD_DHCP6_OPTION_IA_TA, SD_DHCP6_OPTION_IA_PD)); + assert(option_data || option_data_len == 0); + assert(ret); + + /* This will return the following: + * -ENOMEM: memory allocation error, + * -ENOANO: unmatching IAID, + * -EINVAL: non-zero status code, or invalid lifetime, + * -EBADMSG: invalid message format, + * -ENODATA: no valid address or PD prefix, + * 0: success. 
*/ + + switch (option_code) { + case SD_DHCP6_OPTION_IA_NA: + case SD_DHCP6_OPTION_IA_PD: + header_len = sizeof(struct ia_header); + break; + + case SD_DHCP6_OPTION_IA_TA: + header_len = sizeof(be32_t); /* IA_TA does not have lifetime. */ + break; + + default: + assert_not_reached(); + } + + if (option_data_len < header_len) + return -EBADMSG; + + ia = new(DHCP6IA, 1); + if (!ia) + return -ENOMEM; + + *ia = (DHCP6IA) { + .type = option_code, + }; + memcpy(&ia->header, option_data, header_len); + + /* According to RFC8415, IAs which do not match the client's IAID should be ignored, + * but not necessary to ignore or refuse the whole message. */ + if (ia->header.id != iaid) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(ENOANO), + "Received an IA option with a different IAID " + "from the one chosen by the client, ignoring."); + + /* It is not necessary to check if the lifetime_t2 is zero here, as in that case it will be updated later. */ + lt_t1 = be32_sec_to_usec(ia->header.lifetime_t1, /* max_as_infinity = */ true); + lt_t2 = be32_sec_to_usec(ia->header.lifetime_t2, /* max_as_infinity = */ true); + + if (lt_t1 > lt_t2) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA option with T1 %s > T2 %s, ignoring.", + FORMAT_TIMESPAN(lt_t1, USEC_PER_SEC), + FORMAT_TIMESPAN(lt_t2, USEC_PER_SEC)); + if (lt_t1 == 0 && lt_t2 > 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA option with zero T1 and non-zero T2 (%s), ignoring.", + FORMAT_TIMESPAN(lt_t2, USEC_PER_SEC)); + + for (size_t offset = header_len; offset < option_data_len;) { + const uint8_t *subdata; + size_t subdata_len; + uint16_t subopt; + + r = dhcp6_option_parse(option_data, option_data_len, &offset, &subopt, &subdata_len, &subdata); + if (r < 0) + return r; + + switch (subopt) { + case SD_DHCP6_OPTION_IAADDR: { + r = dhcp6_option_parse_ia_address(client, ia, subdata, subdata_len); + if (r == -ENOMEM) + return r; + + /* Ignore 
non-critical errors in the sub-option. */ + break; + } + case SD_DHCP6_OPTION_IA_PD_PREFIX: { + r = dhcp6_option_parse_ia_pdprefix(client, ia, subdata, subdata_len); + if (r == -ENOMEM) + return r; + + /* Ignore non-critical errors in the sub-option. */ + break; + } + case SD_DHCP6_OPTION_STATUS_CODE: { + _cleanup_free_ char *msg = NULL; + + r = dhcp6_option_parse_status(subdata, subdata_len, &msg); + if (r == -ENOMEM) + return r; + if (r > 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received an IA option with non-zero status%s%s", + isempty(msg) ? "." : ": ", strempty(msg)); + if (r < 0) + log_dhcp6_client_errno(client, r, + "Received an IA option with an invalid status sub option, ignoring: %m"); + break; + } + default: + log_dhcp6_client(client, "Received an IA option with an unknown sub-option %u, ignoring", subopt); + } + } + + if (!ia->addresses) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(ENODATA), + "Received an IA option without valid IA addresses or PD prefixes, ignoring."); + + *ret = TAKE_PTR(ia); + return 0; +} + +int dhcp6_option_parse_addresses( + const uint8_t *optval, + size_t optlen, + struct in6_addr **addrs, + size_t *count) { + + assert(optval || optlen == 0); + assert(addrs); + assert(count); + + if (optlen == 0 || optlen % sizeof(struct in6_addr) != 0) + return -EBADMSG; + + if (!GREEDY_REALLOC(*addrs, *count + optlen / sizeof(struct in6_addr))) + return -ENOMEM; + + memcpy(*addrs + *count, optval, optlen); + *count += optlen / sizeof(struct in6_addr); + + return 0; +} + +static int parse_domain(const uint8_t **data, size_t *len, char **ret) { + _cleanup_free_ char *domain = NULL; + const uint8_t *optval; + size_t optlen, n = 0; + int r; + + assert(data); + assert(len); + assert(*data || *len == 0); + assert(ret); + + optval = *data; + optlen = *len; + + if (optlen <= 1) + return -ENODATA; + + for (;;) { + const char *label; + uint8_t c; + + if (optlen == 0) + break; + + c = *optval; + optval++; + 
optlen--; + + if (c == 0) + /* End label */ + break; + if (c > 63) + return -EBADMSG; + if (c > optlen) + return -EMSGSIZE; + + /* Literal label */ + label = (const char*) optval; + optval += c; + optlen -= c; + + if (!GREEDY_REALLOC(domain, n + (n != 0) + DNS_LABEL_ESCAPED_MAX)) + return -ENOMEM; + + if (n != 0) + domain[n++] = '.'; + + r = dns_label_escape(label, c, domain + n, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + n += r; + } + + if (n > 0) { + if (!GREEDY_REALLOC(domain, n + 1)) + return -ENOMEM; + + domain[n] = '\0'; + } + + *ret = TAKE_PTR(domain); + *data = optval; + *len = optlen; + + return n; +} + +int dhcp6_option_parse_domainname(const uint8_t *optval, size_t optlen, char **ret) { + _cleanup_free_ char *domain = NULL; + int r; + + assert(optval || optlen == 0); + assert(ret); + + r = parse_domain(&optval, &optlen, &domain); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + if (optlen != 0) + return -EINVAL; + + *ret = TAKE_PTR(domain); + return 0; +} + +int dhcp6_option_parse_domainname_list(const uint8_t *optval, size_t optlen, char ***ret) { + _cleanup_strv_free_ char **names = NULL; + int r; + + assert(optval || optlen == 0); + assert(ret); + + if (optlen <= 1) + return -ENODATA; + if (optval[optlen - 1] != '\0') + return -EINVAL; + + while (optlen > 0) { + _cleanup_free_ char *name = NULL; + + r = parse_domain(&optval, &optlen, &name); + if (r < 0) + return r; + if (r == 0) + continue; + + r = strv_consume(&names, TAKE_PTR(name)); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(names); + return 0; +} + +static sd_dhcp6_option* dhcp6_option_free(sd_dhcp6_option *i) { + if (!i) + return NULL; + + free(i->data); + return mfree(i); +} + +int sd_dhcp6_option_new(uint16_t option, const void *data, size_t length, uint32_t enterprise_identifier, sd_dhcp6_option **ret) { + assert_return(ret, -EINVAL); + assert_return(length == 0 || data, -EINVAL); + + _cleanup_free_ void *q = memdup(data, length); + if (!q) + return -ENOMEM; + + 
/* A reference-counted DHCPv6 option with an owned copy of its payload.
 * Objects are created by sd_dhcp6_option_new() with n_ref set to 1; the free
 * function releases data before releasing the object itself. */
typedef struct sd_dhcp6_option {
        unsigned n_ref;                 /* reference counter, starts at 1 */

        uint32_t enterprise_identifier; /* presumably the vendor enterprise number used for vendor options — TODO confirm */
        uint16_t option;                /* option code as passed to sd_dhcp6_option_new() */
        void *data;                     /* heap copy of the payload, owned by this object */
        size_t length;                  /* size of data in bytes */
} sd_dhcp6_option;
DHCP6IA; + +void dhcp6_ia_clear_addresses(DHCP6IA *ia); +DHCP6IA *dhcp6_ia_free(DHCP6IA *ia); +DEFINE_TRIVIAL_CLEANUP_FUNC(DHCP6IA*, dhcp6_ia_free); + +bool dhcp6_option_can_request(uint16_t option); + +int dhcp6_option_append(uint8_t **buf, size_t *offset, uint16_t code, + size_t optlen, const void *optval); +int dhcp6_option_append_ia(uint8_t **buf, size_t *offset, const DHCP6IA *ia); +int dhcp6_option_append_fqdn(uint8_t **buf, size_t *offset, const char *fqdn); +int dhcp6_option_append_user_class(uint8_t **buf, size_t *offset, char * const *user_class); +int dhcp6_option_append_vendor_class(uint8_t **buf, size_t *offset, char * const *vendor_class); +int dhcp6_option_append_vendor_option(uint8_t **buf, size_t *offset, OrderedSet *vendor_options); + +int dhcp6_option_parse( + const uint8_t *buf, + size_t buflen, + size_t *offset, + uint16_t *ret_option_code, + size_t *ret_option_data_len, + const uint8_t **ret_option_data); +int dhcp6_option_parse_status(const uint8_t *data, size_t data_len, char **ret_status_message); +int dhcp6_option_parse_string(const uint8_t *data, size_t data_len, char **ret); +int dhcp6_option_parse_ia( + sd_dhcp6_client *client, + be32_t iaid, + uint16_t option_code, + size_t option_data_len, + const uint8_t *option_data, + DHCP6IA **ret); +int dhcp6_option_parse_addresses( + const uint8_t *optval, + size_t optlen, + struct in6_addr **addrs, + size_t *count); +int dhcp6_option_parse_domainname_list(const uint8_t *optval, size_t optlen, char ***ret); +int dhcp6_option_parse_domainname(const uint8_t *optval, size_t optlen, char **ret); diff --git a/src/libsystemd-network/dhcp6-protocol.c b/src/libsystemd-network/dhcp6-protocol.c new file mode 100644 index 0000000..be0f651 --- /dev/null +++ b/src/libsystemd-network/dhcp6-protocol.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dhcp6-protocol.h" +#include "string-table.h" + +static const char * const dhcp6_state_table[_DHCP6_STATE_MAX] = { + 
[DHCP6_STATE_STOPPED] = "stopped", + [DHCP6_STATE_INFORMATION_REQUEST] = "information-request", + [DHCP6_STATE_SOLICITATION] = "solicitation", + [DHCP6_STATE_REQUEST] = "request", + [DHCP6_STATE_BOUND] = "bound", + [DHCP6_STATE_RENEW] = "renew", + [DHCP6_STATE_REBIND] = "rebind", + [DHCP6_STATE_STOPPING] = "stopping", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(dhcp6_state, DHCP6State); + +static const char * const dhcp6_message_type_table[_DHCP6_MESSAGE_TYPE_MAX] = { + [DHCP6_MESSAGE_SOLICIT] = "Solicit", + [DHCP6_MESSAGE_ADVERTISE] = "Advertise", + [DHCP6_MESSAGE_REQUEST] = "Request", + [DHCP6_MESSAGE_CONFIRM] = "Confirm", + [DHCP6_MESSAGE_RENEW] = "Renew", + [DHCP6_MESSAGE_REBIND] = "Rebind", + [DHCP6_MESSAGE_REPLY] = "Reply", + [DHCP6_MESSAGE_RELEASE] = "Release", + [DHCP6_MESSAGE_DECLINE] = "Decline", + [DHCP6_MESSAGE_RECONFIGURE] = "Reconfigure", + [DHCP6_MESSAGE_INFORMATION_REQUEST] = "Information Request", + [DHCP6_MESSAGE_RELAY_FORWARD] = "Relay Forward", + [DHCP6_MESSAGE_RELAY_REPLY] = "Relay Reply", + [DHCP6_MESSAGE_LEASE_QUERY] = "Lease Query", + [DHCP6_MESSAGE_LEASE_QUERY_REPLY] = "Lease Query Reply", + [DHCP6_MESSAGE_LEASE_QUERY_DONE] = "Lease Query Done", + [DHCP6_MESSAGE_LEASE_QUERY_DATA] = "Lease Query Data", + [DHCP6_MESSAGE_RECONFIGURE_REQUEST] = "Reconfigure Request", + [DHCP6_MESSAGE_RECONFIGURE_REPLY] = "Reconfigure Reply", + [DHCP6_MESSAGE_DHCPV4_QUERY] = "DHCPv4 Query", + [DHCP6_MESSAGE_DHCPV4_RESPONSE] = "DHCPv4 Response", + [DHCP6_MESSAGE_ACTIVE_LEASE_QUERY] = "Active Lease Query", + [DHCP6_MESSAGE_START_TLS] = "Start TLS", + [DHCP6_MESSAGE_BINDING_UPDATE] = "Binding Update", + [DHCP6_MESSAGE_BINDING_REPLY] = "Binding Reply", + [DHCP6_MESSAGE_POOL_REQUEST] = "Pool Request", + [DHCP6_MESSAGE_POOL_RESPONSE] = "Pool Response", + [DHCP6_MESSAGE_UPDATE_REQUEST] = "Update Request", + [DHCP6_MESSAGE_UPDATE_REQUEST_ALL] = "Update Request All", + [DHCP6_MESSAGE_UPDATE_DONE] = "Update Done", + [DHCP6_MESSAGE_CONNECT] = "Connect", + 
[DHCP6_MESSAGE_CONNECT_REPLY] = "Connect Reply", + [DHCP6_MESSAGE_DISCONNECT] = "Disconnect", + [DHCP6_MESSAGE_STATE] = "State", + [DHCP6_MESSAGE_CONTACT] = "Contact", +}; + +DEFINE_STRING_TABLE_LOOKUP(dhcp6_message_type, DHCP6MessageType); + +static const char * const dhcp6_message_status_table[_DHCP6_STATUS_MAX] = { + [DHCP6_STATUS_SUCCESS] = "Success", + [DHCP6_STATUS_UNSPEC_FAIL] = "Unspecified failure", + [DHCP6_STATUS_NO_ADDRS_AVAIL] = "No addresses available", + [DHCP6_STATUS_NO_BINDING] = "Binding unavailable", + [DHCP6_STATUS_NOT_ON_LINK] = "Not on link", + [DHCP6_STATUS_USE_MULTICAST] = "Use multicast", + [DHCP6_STATUS_NO_PREFIX_AVAIL] = "No prefix available", + [DHCP6_STATUS_UNKNOWN_QUERY_TYPE] = "Unknown query type", + [DHCP6_STATUS_MALFORMED_QUERY] = "Malformed query", + [DHCP6_STATUS_NOT_CONFIGURED] = "Not configured", + [DHCP6_STATUS_NOT_ALLOWED] = "Not allowed", + [DHCP6_STATUS_QUERY_TERMINATED] = "Query terminated", + [DHCP6_STATUS_DATA_MISSING] = "Data missing", + [DHCP6_STATUS_CATCHUP_COMPLETE] = "Catch up complete", + [DHCP6_STATUS_NOT_SUPPORTED] = "Not supported", + [DHCP6_STATUS_TLS_CONNECTION_REFUSED] = "TLS connection refused", + [DHCP6_STATUS_ADDRESS_IN_USE] = "Address in use", + [DHCP6_STATUS_CONFIGURATION_CONFLICT] = "Configuration conflict", + [DHCP6_STATUS_MISSING_BINDING_INFORMATION] = "Missing binding information", + [DHCP6_STATUS_OUTDATED_BINDING_INFORMATION] = "Outdated binding information", + [DHCP6_STATUS_SERVER_SHUTTING_DOWN] = "Server shutting down", + [DHCP6_STATUS_DNS_UPDATE_NOT_SUPPORTED] = "DNS update not supported", + [DHCP6_STATUS_EXCESSIVE_TIME_SKEW] = "Excessive time skew", +}; + +DEFINE_STRING_TABLE_LOOKUP(dhcp6_message_status, DHCP6Status); + +int dhcp6_message_status_to_errno(DHCP6Status s) { + switch (s) { + case DHCP6_STATUS_SUCCESS: + return 0; + case DHCP6_STATUS_NO_BINDING: + return -EADDRNOTAVAIL; + default: + return -EINVAL; + } +} diff --git a/src/libsystemd-network/dhcp6-protocol.h 
/* Number of bytes left for the DHCPv6 message in a 1280-byte packet once the IPv6 and
 * UDP headers are subtracted (1280 is presumably the IPv6 minimum link MTU — confirm).
 *
 * The expansion is wrapped in parentheses so the macro behaves like a single value
 * inside larger expressions, e.g. 2 * DHCP6_MIN_OPTIONS_SIZE or x - DHCP6_MIN_OPTIONS_SIZE;
 * without them operator precedence would silently change the result. */
#define DHCP6_MIN_OPTIONS_SIZE \
        (1280 - sizeof(struct ip6_hdr) - sizeof(struct udphdr))
DHCP6_MESSAGE_REQUEST = 3, /* RFC 8415 */ + DHCP6_MESSAGE_CONFIRM = 4, /* RFC 8415 */ + DHCP6_MESSAGE_RENEW = 5, /* RFC 8415 */ + DHCP6_MESSAGE_REBIND = 6, /* RFC 8415 */ + DHCP6_MESSAGE_REPLY = 7, /* RFC 8415 */ + DHCP6_MESSAGE_RELEASE = 8, /* RFC 8415 */ + DHCP6_MESSAGE_DECLINE = 9, /* RFC 8415 */ + DHCP6_MESSAGE_RECONFIGURE = 10, /* RFC 8415 */ + DHCP6_MESSAGE_INFORMATION_REQUEST = 11, /* RFC 8415 */ + DHCP6_MESSAGE_RELAY_FORWARD = 12, /* RFC 8415 */ + DHCP6_MESSAGE_RELAY_REPLY = 13, /* RFC 8415 */ + DHCP6_MESSAGE_LEASE_QUERY = 14, /* RFC 5007 */ + DHCP6_MESSAGE_LEASE_QUERY_REPLY = 15, /* RFC 5007 */ + DHCP6_MESSAGE_LEASE_QUERY_DONE = 16, /* RFC 5460 */ + DHCP6_MESSAGE_LEASE_QUERY_DATA = 17, /* RFC 5460 */ + DHCP6_MESSAGE_RECONFIGURE_REQUEST = 18, /* RFC 6977 */ + DHCP6_MESSAGE_RECONFIGURE_REPLY = 19, /* RFC 6977 */ + DHCP6_MESSAGE_DHCPV4_QUERY = 20, /* RFC 7341 */ + DHCP6_MESSAGE_DHCPV4_RESPONSE = 21, /* RFC 7341 */ + DHCP6_MESSAGE_ACTIVE_LEASE_QUERY = 22, /* RFC 7653 */ + DHCP6_MESSAGE_START_TLS = 23, /* RFC 7653 */ + DHCP6_MESSAGE_BINDING_UPDATE = 24, /* RFC 8156 */ + DHCP6_MESSAGE_BINDING_REPLY = 25, /* RFC 8156 */ + DHCP6_MESSAGE_POOL_REQUEST = 26, /* RFC 8156 */ + DHCP6_MESSAGE_POOL_RESPONSE = 27, /* RFC 8156 */ + DHCP6_MESSAGE_UPDATE_REQUEST = 28, /* RFC 8156 */ + DHCP6_MESSAGE_UPDATE_REQUEST_ALL = 29, /* RFC 8156 */ + DHCP6_MESSAGE_UPDATE_DONE = 30, /* RFC 8156 */ + DHCP6_MESSAGE_CONNECT = 31, /* RFC 8156 */ + DHCP6_MESSAGE_CONNECT_REPLY = 32, /* RFC 8156 */ + DHCP6_MESSAGE_DISCONNECT = 33, /* RFC 8156 */ + DHCP6_MESSAGE_STATE = 34, /* RFC 8156 */ + DHCP6_MESSAGE_CONTACT = 35, /* RFC 8156 */ + _DHCP6_MESSAGE_TYPE_MAX, + _DHCP6_MESSAGE_TYPE_INVALID = -EINVAL, +} DHCP6MessageType; + +typedef enum DHCP6NTPSubOption { + DHCP6_NTP_SUBOPTION_SRV_ADDR = 1, + DHCP6_NTP_SUBOPTION_MC_ADDR = 2, + DHCP6_NTP_SUBOPTION_SRV_FQDN = 3, + _DHCP6_NTP_SUBOPTION_MAX, + _DHCP6_NTP_SUBOPTION_INVALID = -EINVAL, +} DHCP6NTPSubOption; + +/* + * RFC 8415, RFC 5007 and RFC 7653 
status codes: + * https://www.iana.org/assignments/dhcpv6-parameters/dhcpv6-parameters.xhtml#dhcpv6-parameters-5 + */ +typedef enum DHCP6Status { + DHCP6_STATUS_SUCCESS = 0, + DHCP6_STATUS_UNSPEC_FAIL = 1, + DHCP6_STATUS_NO_ADDRS_AVAIL = 2, + DHCP6_STATUS_NO_BINDING = 3, + DHCP6_STATUS_NOT_ON_LINK = 4, + DHCP6_STATUS_USE_MULTICAST = 5, + DHCP6_STATUS_NO_PREFIX_AVAIL = 6, + DHCP6_STATUS_UNKNOWN_QUERY_TYPE = 7, + DHCP6_STATUS_MALFORMED_QUERY = 8, + DHCP6_STATUS_NOT_CONFIGURED = 9, + DHCP6_STATUS_NOT_ALLOWED = 10, + DHCP6_STATUS_QUERY_TERMINATED = 11, + DHCP6_STATUS_DATA_MISSING = 12, + DHCP6_STATUS_CATCHUP_COMPLETE = 13, + DHCP6_STATUS_NOT_SUPPORTED = 14, + DHCP6_STATUS_TLS_CONNECTION_REFUSED = 15, + DHCP6_STATUS_ADDRESS_IN_USE = 16, + DHCP6_STATUS_CONFIGURATION_CONFLICT = 17, + DHCP6_STATUS_MISSING_BINDING_INFORMATION = 18, + DHCP6_STATUS_OUTDATED_BINDING_INFORMATION = 19, + DHCP6_STATUS_SERVER_SHUTTING_DOWN = 20, + DHCP6_STATUS_DNS_UPDATE_NOT_SUPPORTED = 21, + DHCP6_STATUS_EXCESSIVE_TIME_SKEW = 22, + _DHCP6_STATUS_MAX, + _DHCP6_STATUS_INVALID = -EINVAL, +} DHCP6Status; + +typedef enum DHCP6FQDNFlag { + DHCP6_FQDN_FLAG_S = 1 << 0, + DHCP6_FQDN_FLAG_O = 1 << 1, + DHCP6_FQDN_FLAG_N = 1 << 2, +} DHCP6FQDNFlag; + +const char *dhcp6_state_to_string(DHCP6State s) _const_; +const char *dhcp6_message_type_to_string(DHCP6MessageType s) _const_; +DHCP6MessageType dhcp6_message_type_from_string(const char *s) _pure_; +const char *dhcp6_message_status_to_string(DHCP6Status s) _const_; +DHCP6Status dhcp6_message_status_from_string(const char *s) _pure_; +int dhcp6_message_status_to_errno(DHCP6Status s); diff --git a/src/libsystemd-network/fuzz-dhcp-client.c b/src/libsystemd-network/fuzz-dhcp-client.c new file mode 100644 index 0000000..384972f --- /dev/null +++ b/src/libsystemd-network/fuzz-dhcp-client.c @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-dhcp-client.c" + +#include "alloc-util.h" +#include 
/* Fuzzing stand-in for the raw-socket binder: every argument is ignored and a plain
 * non-blocking, close-on-exec AF_INET datagram socket is handed back instead.
 *
 * Returns the new file descriptor, or -errno if socket() fails. */
int dhcp_network_bind_raw_socket(
                int ifindex,
                union sockaddr_union *link,
                uint32_t id,
                const struct hw_addr_data *hw_addr,
                const struct hw_addr_data *bcast_addr,
                uint16_t arp_type,
                uint16_t port,
                bool so_priority_set,
                int so_priority) {

        int s = socket(AF_INET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);

        return s < 0 ? -errno : s;
}
/* libFuzzer entry point for the DHCP server relay path.
 *
 * Builds a server with a 10.0.0.1/24 pool, a relay target of 192.168.5.1 and relay agent
 * information, then passes the fuzz input to dhcp_server_relay_message(). Inputs shorter
 * than a DHCPMessage header are skipped. Always returns 0. */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
        _cleanup_(sd_dhcp_server_unrefp) sd_dhcp_server *server = NULL;
        /* 10 << 24 | 1 in network byte order, i.e. 10.0.0.1 */
        struct in_addr address = {.s_addr = htobe32(UINT32_C(10) << 24 | UINT32_C(1))};
        union in_addr_union relay_address;
        _cleanup_free_ uint8_t *message = NULL;

        /* The relay code is handed size - sizeof(DHCPMessage) below, so shorter inputs
         * would underflow that subtraction. */
        if (size < sizeof(DHCPMessage))
                return 0;

        fuzz_setup_logging();

        assert_se(sd_dhcp_server_new(&server, 1) >= 0);
        assert_se(sd_dhcp_server_attach_event(server, NULL, 0) >= 0);
        assert_se(sd_dhcp_server_configure_pool(server, &address, 24, 0, 0) >= 0);
        assert_se(in_addr_from_string(AF_INET, "192.168.5.1", &relay_address) >= 0);
        assert_se(sd_dhcp_server_set_relay_target(server, &relay_address.in) >= 0);
        assert_se(sd_dhcp_server_set_bind_to_interface(server, false) >= 0);
        assert_se(sd_dhcp_server_set_relay_agent_information(server, "string:sample_circuit_id", "string:sample_remote_id") >= 0);

        /* Copy the input into a larger buffer so the relay code has room to append data;
         * the extra 2 bytes are presumably the option code/length header — TODO confirm. */
        size_t buflen = size;
        buflen += relay_agent_information_length(server->agent_circuit_id, server->agent_remote_id) + 2;
        assert_se(message = malloc(buflen));
        memcpy(message, data, size);

        /* Give the server a harmless descriptor to use. */
        server->fd = open("/dev/null", O_RDWR|O_CLOEXEC|O_NOCTTY);
        assert_se(server->fd >= 0);

        /* The result is irrelevant; only crashes and sanitizer reports matter here. */
        (void) dhcp_server_relay_message(server, (DHCPMessage *) message, size - sizeof(DHCPMessage), buflen);
        return 0;
}
/* Registers a synthetic bound lease for 10.0.0.<i> with the server under test.
 *
 * The lease uses the two-byte client ID { 2, i }, server_address as its gateway and an
 * expiration of UINT64_MAX. It is inserted into both lease tables of the server
 * (by client ID and by address).
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negative error returned
 * by hashmap_ensure_put(). */
static int add_lease(sd_dhcp_server *server, const struct in_addr *server_address, uint8_t i) {
        /* presumably frees the lease on every early return — confirm against dhcp_lease_free() */
        _cleanup_(dhcp_lease_freep) DHCPLease *lease = NULL;
        int r;

        assert(server);

        lease = new(DHCPLease, 1);
        if (!lease)
                return -ENOMEM;

        *lease = (DHCPLease) {
                /* 10 << 24 | i in network byte order, i.e. 10.0.0.<i> */
                .address = htobe32(UINT32_C(10) << 24 | i),
                .chaddr = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
                .expiration = UINT64_MAX,
                .gateway = server_address->s_addr,
                .hlen = ETH_ALEN,
                .htype = ARPHRD_ETHER,

                .client_id.length = 2,
        };

        lease->client_id.data = new(uint8_t, lease->client_id.length);
        if (!lease->client_id.data)
                return -ENOMEM;

        lease->client_id.data[0] = 2;
        lease->client_id.data[1] = i;

        lease->server = server; /* This must be set just before hashmap_put(). */

        r = hashmap_ensure_put(&server->bound_leases_by_client_id, &dhcp_lease_hash_ops, &lease->client_id, lease);
        if (r < 0)
                return r;

        r = hashmap_ensure_put(&server->bound_leases_by_address, NULL, UINT32_TO_PTR(lease->address), lease);
        if (r < 0)
                return r;

        /* The lease is now referenced by the hashmaps; disarm the cleanup handler. */
        TAKE_PTR(lease);
        return 0;
}
/* Drives one fuzzing round of the DHCPv6 client in the given state.
 *
 * The client is started and forced into `state`, the fuzz input is written to the peer
 * end of the test socket pair (test_dhcp_fd[1]), and one event loop iteration is run so
 * the client processes it. Any resulting state change is checked against the transitions
 * allowed for the starting state, then the client is stopped and the peer end closed.
 *
 * client: client under test, already attached to an event loop
 * data:   fuzz input, interpreted as a DHCPv6 message
 * size:   length of data in bytes
 * state:  state the client is placed in before the input is delivered */
static void fuzz_client(sd_dhcp6_client *client, const uint8_t *data, size_t size, DHCP6State state) {
        assert_se(sd_dhcp6_client_set_information_request(client, state == DHCP6_STATE_INFORMATION_REQUEST) >= 0);
        assert_se(sd_dhcp6_client_start(client) >= 0);

        client->state = state;

        /* Adopt the transaction ID of the fuzz input; presumably so the client does not
         * discard the message for a transaction ID mismatch — confirm against the receive path. */
        if (size >= sizeof(DHCP6Message))
                assert_se(dhcp6_client_set_transaction_id(client, ((const DHCP6Message *) data)->transaction_id) == 0);

        /* These states do not require a lease to send a message. */
        if (IN_SET(client->state, DHCP6_STATE_INFORMATION_REQUEST, DHCP6_STATE_SOLICITATION))
                assert_se(dhcp6_client_send_message(client) >= 0);

        assert_se(write(test_dhcp_fd[1], data, size) == (ssize_t) size);

        assert_se(sd_event_run(sd_dhcp6_client_get_event(client), UINT64_MAX) > 0);

        /* Check the state transition. */
        if (client->state != state)
                switch (state) {
                case DHCP6_STATE_INFORMATION_REQUEST:
                        assert_se(client->state == DHCP6_STATE_STOPPED);
                        break;
                case DHCP6_STATE_SOLICITATION:
                        assert_se(IN_SET(client->state, DHCP6_STATE_REQUEST, DHCP6_STATE_BOUND));
                        break;
                case DHCP6_STATE_REQUEST:
                        assert_se(IN_SET(client->state, DHCP6_STATE_BOUND, DHCP6_STATE_SOLICITATION));
                        break;
                default:
                        assert_not_reached();
                }

        /* Send a message if the client has a lease. The result is deliberately not checked. */
        if (state != DHCP6_STATE_INFORMATION_REQUEST && sd_dhcp6_client_get_lease(client, NULL) >= 0) {
                client->state = DHCP6_STATE_REQUEST;
                dhcp6_client_send_message(client);
        }

        assert_se(sd_dhcp6_client_stop(client) >= 0);

        /* Close the peer end and reset the slot for the next round. */
        test_dhcp_fd[1] = safe_close(test_dhcp_fd[1]);
}
*/ + assert_se(sd_dhcp6_client_set_fqdn(client, "example.com") == 1); + assert_se(sd_dhcp6_client_set_request_mud_url(client, "https://www.example.com/mudfile.json") >= 0); + assert_se(sd_dhcp6_client_set_request_user_class(client, STRV_MAKE("u1", "u2", "u3")) >= 0); + assert_se(sd_dhcp6_client_set_request_vendor_class(client, STRV_MAKE("v1", "v2", "v3")) >= 0); + assert_se(sd_dhcp6_client_set_prefix_delegation_hint(client, 48, &hint) >= 0); + assert_se(sd_dhcp6_option_new(123, v1_data, strlen(v1_data), 12345, &v1) >= 0); + assert_se(sd_dhcp6_option_new(456, v2_data, strlen(v2_data), 45678, &v2) >= 0); + assert_se(sd_dhcp6_client_add_vendor_option(client, v1) >= 0); + assert_se(sd_dhcp6_client_add_vendor_option(client, v2) >= 0); + + fuzz_client(client, data, size, DHCP6_STATE_INFORMATION_REQUEST); + fuzz_client(client, data, size, DHCP6_STATE_SOLICITATION); + + /* If size is zero, then the resend timer will be triggered at first, + * but in the REQUEST state the client must have a lease. 
*/ + if (size == 0) + return 0; + + fuzz_client(client, data, size, DHCP6_STATE_REQUEST); + + return 0; +} diff --git a/src/libsystemd-network/fuzz-dhcp6-client.options b/src/libsystemd-network/fuzz-dhcp6-client.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/libsystemd-network/fuzz-dhcp6-client.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/libsystemd-network/fuzz-lldp-rx.c b/src/libsystemd-network/fuzz-lldp-rx.c new file mode 100644 index 0000000..844957c --- /dev/null +++ b/src/libsystemd-network/fuzz-lldp-rx.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-event.h" +#include "sd-lldp-rx.h" + +#include "fd-util.h" +#include "fuzz.h" +#include "lldp-network.h" + +static int test_fd[2] = EBADF_PAIR; + +int lldp_network_bind_raw_socket(int ifindex) { + if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) < 0) + return -errno; + + return test_fd[0]; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_lldp_rx_unrefp) sd_lldp_rx *lldp_rx = NULL; + + if (outside_size_range(size, 0, 2048)) + return 0; + + fuzz_setup_logging(); + + assert_se(sd_event_new(&e) == 0); + assert_se(sd_lldp_rx_new(&lldp_rx) >= 0); + assert_se(sd_lldp_rx_set_ifindex(lldp_rx, 42) >= 0); + assert_se(sd_lldp_rx_attach_event(lldp_rx, e, 0) >= 0); + assert_se(sd_lldp_rx_start(lldp_rx) >= 0); + + assert_se(write(test_fd[1], data, size) == (ssize_t) size); + assert_se(sd_event_run(e, 0) >= 0); + + assert_se(sd_lldp_rx_stop(lldp_rx) >= 0); + assert_se(sd_lldp_rx_detach_event(lldp_rx) >= 0); + test_fd[1] = safe_close(test_fd[1]); + + return 0; +} diff --git a/src/libsystemd-network/fuzz-lldp-rx.options b/src/libsystemd-network/fuzz-lldp-rx.options new file mode 100644 index 0000000..60bd9b0 --- /dev/null +++ b/src/libsystemd-network/fuzz-lldp-rx.options @@ -0,0 +1,2 @@ +[libfuzzer] 
+max_len = 2048 diff --git a/src/libsystemd-network/fuzz-ndisc-rs.c b/src/libsystemd-network/fuzz-ndisc-rs.c new file mode 100644 index 0000000..a89e2b0 --- /dev/null +++ b/src/libsystemd-network/fuzz-ndisc-rs.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-ndisc.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "fuzz.h" +#include "icmp6-util-unix.h" +#include "ndisc-internal.h" +#include "socket-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + struct ether_addr mac_addr = { + .ether_addr_octet = {'A', 'B', 'C', '1', '2', '3'} + }; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_ndisc_unrefp) sd_ndisc *nd = NULL; + + if (outside_size_range(size, 0, 2048)) + return 0; + + fuzz_setup_logging(); + + assert_se(sd_event_new(&e) >= 0); + assert_se(sd_ndisc_new(&nd) >= 0); + assert_se(sd_ndisc_attach_event(nd, e, 0) >= 0); + assert_se(sd_ndisc_set_ifindex(nd, 42) >= 0); + assert_se(sd_ndisc_set_mac(nd, &mac_addr) >= 0); + assert_se(sd_ndisc_start(nd) >= 0); + assert_se(write(test_fd[1], data, size) == (ssize_t) size); + (void) sd_event_run(e, UINT64_MAX); + assert_se(sd_ndisc_stop(nd) >= 0); + close(test_fd[1]); + + return 0; +} diff --git a/src/libsystemd-network/fuzz-ndisc-rs.options b/src/libsystemd-network/fuzz-ndisc-rs.options new file mode 100644 index 0000000..60bd9b0 --- /dev/null +++ b/src/libsystemd-network/fuzz-ndisc-rs.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 2048 diff --git a/src/libsystemd-network/icmp6-util-unix.c b/src/libsystemd-network/icmp6-util-unix.c new file mode 100644 index 0000000..01edb85 --- /dev/null +++ b/src/libsystemd-network/icmp6-util-unix.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "icmp6-util-unix.h" + +send_ra_t send_ra_function = NULL; +int test_fd[2] = EBADF_PAIR; + +static struct in6_addr dummy_link_local = { + .s6_addr = { + 
0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x12, 0x34, 0x56, 0xff, 0xfe, 0x78, 0x9a, 0xbc, + }, +}; + +int icmp6_bind_router_solicitation(int ifindex) { + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) < 0) + return -errno; + + return test_fd[0]; +} + +int icmp6_bind_router_advertisement(int ifindex) { + return test_fd[1]; +} + +int icmp6_send_router_solicitation(int s, const struct ether_addr *ether_addr) { + if (!send_ra_function) + return 0; + + return send_ra_function(0); +} + +int icmp6_receive( + int fd, + void *iov_base, + size_t iov_len, + struct in6_addr *ret_sender, + triple_timestamp *ret_timestamp) { + + assert_se(read (fd, iov_base, iov_len) == (ssize_t) iov_len); + + if (ret_timestamp) + triple_timestamp_now(ret_timestamp); + + if (ret_sender) + *ret_sender = dummy_link_local; + + return 0; +} diff --git a/src/libsystemd-network/icmp6-util-unix.h b/src/libsystemd-network/icmp6-util-unix.h new file mode 100644 index 0000000..a9cb05a --- /dev/null +++ b/src/libsystemd-network/icmp6-util-unix.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "icmp6-util.h" + +typedef int (*send_ra_t)(uint8_t flags); + +extern send_ra_t send_ra_function; +extern int test_fd[2]; diff --git a/src/libsystemd-network/icmp6-util.c b/src/libsystemd-network/icmp6-util.c new file mode 100644 index 0000000..72c20ba --- /dev/null +++ b/src/libsystemd-network/icmp6-util.c @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. 
+***/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fd-util.h" +#include "icmp6-util.h" +#include "in-addr-util.h" +#include "iovec-util.h" +#include "network-common.h" +#include "socket-util.h" + /* Address bytes of ff02::2 */ +#define IN6ADDR_ALL_ROUTERS_MULTICAST_INIT \ + { { { 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 } } } + /* Address bytes of ff02::1 */ +#define IN6ADDR_ALL_NODES_MULTICAST_INIT \ + { { { 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 } } } + /* Creates a non-blocking, close-on-exec raw ICMPv6 socket, installs the given ICMP6 filter, joins the multicast group described by mreq, applies the socket options below and binds the socket to the interface in mreq. Returns the new file descriptor on success or a negative errno-style value on failure. */ +static int icmp6_bind_router_message(const struct icmp6_filter *filter, + const struct ipv6_mreq *mreq) { + _cleanup_close_ int s = -EBADF; + int ifindex, r; + + assert(filter); + assert(mreq); + + ifindex = mreq->ipv6mr_interface; /* read only after the asserts above have checked mreq */ + s = socket(AF_INET6, SOCK_RAW | SOCK_CLOEXEC | SOCK_NONBLOCK, IPPROTO_ICMPV6); + if (s < 0) + return -errno; + + if (setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, filter, sizeof(*filter)) < 0) + return -errno; + + if (setsockopt(s, IPPROTO_IPV6, IPV6_ADD_MEMBERSHIP, mreq, sizeof(*mreq)) < 0) + return -errno; + + /* RFC 3315, section 6.7, bullet point 2 may indicate that an + IPV6_PKTINFO socket option also applies for ICMPv6 multicast. 
+ Empirical experiments indicate otherwise and therefore an + IPV6_MULTICAST_IF socket option is used here instead */ + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_IF, ifindex); + if (r < 0) + return r; + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, false); + if (r < 0) + return r; + /* Outgoing packets use hop limit 255; icmp6_receive() rejects received packets whose reported hop limit differs from 255 */ + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, 255); + if (r < 0) + return r; + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, 255); + if (r < 0) + return r; + /* Request hop limit and timestamp control messages, both consumed by icmp6_receive() */ + r = setsockopt_int(s, SOL_IPV6, IPV6_RECVHOPLIMIT, true); + if (r < 0) + return r; + + r = setsockopt_int(s, SOL_SOCKET, SO_TIMESTAMP, true); + if (r < 0) + return r; + + r = socket_bind_to_ifindex(s, ifindex); + if (r < 0) + return r; + + return TAKE_FD(s); +} + /* Opens a socket on ifindex that joins the all-nodes multicast group and passes only Router Advertisement messages. Returns the file descriptor or a negative errno-style value. */ +int icmp6_bind_router_solicitation(int ifindex) { + struct icmp6_filter filter = {}; + struct ipv6_mreq mreq = { + .ipv6mr_multiaddr = IN6ADDR_ALL_NODES_MULTICAST_INIT, + .ipv6mr_interface = ifindex, + }; + + ICMP6_FILTER_SETBLOCKALL(&filter); + ICMP6_FILTER_SETPASS(ND_ROUTER_ADVERT, &filter); + + return icmp6_bind_router_message(&filter, &mreq); +} + /* Opens a socket on ifindex that joins the all-routers multicast group and passes only Router Solicitation messages. Returns the file descriptor or a negative errno-style value. */ +int icmp6_bind_router_advertisement(int ifindex) { + struct icmp6_filter filter = {}; + struct ipv6_mreq mreq = { + .ipv6mr_multiaddr = IN6ADDR_ALL_ROUTERS_MULTICAST_INIT, + .ipv6mr_interface = ifindex, + }; + + ICMP6_FILTER_SETBLOCKALL(&filter); + ICMP6_FILTER_SETPASS(ND_ROUTER_SOLICIT, &filter); + + return icmp6_bind_router_message(&filter, &mreq); +} + /* Sends a Router Solicitation over socket s to the all-routers multicast address, carrying ether_addr in a source link-layer address option. Returns 0 on success or -errno if sendmsg() fails. */ +int icmp6_send_router_solicitation(int s, const struct ether_addr *ether_addr) { + struct sockaddr_in6 dst = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ALL_ROUTERS_MULTICAST_INIT, + }; + struct { + struct nd_router_solicit rs; + struct nd_opt_hdr rs_opt; + struct ether_addr rs_opt_mac; + } _packed_ rs = { + .rs.nd_rs_type = ND_ROUTER_SOLICIT, + .rs_opt.nd_opt_type = ND_OPT_SOURCE_LINKADDR, + .rs_opt.nd_opt_len = 1, + }; + struct iovec iov = { + .iov_base = &rs, + .iov_len = sizeof(rs), + }; + struct msghdr msg = { 
+ .msg_name = &dst, + .msg_namelen = sizeof(dst), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + assert(s >= 0); + assert(ether_addr); + + rs.rs_opt_mac = *ether_addr; + + if (sendmsg(s, &msg, 0) < 0) + return -errno; + + return 0; +} + /* Receives one datagram of exactly size bytes from fd into buffer. Returns 0 on success, the error from recvmsg_safe(), -EINVAL if the received length differs from size, -EADDRNOTAVAIL if an IPv6 sender is neither link-local nor the null address, -EPFNOSUPPORT for any other non-empty sender address, or -EMULTIHOP if a hop limit was reported and it is not 255. ret_sender and ret_timestamp are optional outputs. */ +int icmp6_receive( + int fd, + void *buffer, + size_t size, + struct in6_addr *ret_sender, + triple_timestamp *ret_timestamp) { + + /* This needs to be initialized with zero. See #20741. */ + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int)) + /* ttl */ + CMSG_SPACE_TIMEVAL) control = {}; + struct iovec iov = {}; + union sockaddr_union sa = {}; + struct msghdr msg = { + .msg_name = &sa.sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct in6_addr addr = {}; + ssize_t len; + + iov = IOVEC_MAKE(buffer, size); + + len = recvmsg_safe(fd, &msg, MSG_DONTWAIT); + if (len < 0) + return (int) len; + + if ((size_t) len != size) + return -EINVAL; + + if (msg.msg_namelen == sizeof(struct sockaddr_in6) && + sa.in6.sin6_family == AF_INET6) { + + addr = sa.in6.sin6_addr; + if (!in6_addr_is_link_local(&addr) && !in6_addr_is_null(&addr)) + return -EADDRNOTAVAIL; + + } else if (msg.msg_namelen > 0) + return -EPFNOSUPPORT; + + /* namelen == 0 only happens when running the test-suite over a socketpair */ + + assert(!(msg.msg_flags & MSG_TRUNC)); + + int *hops = CMSG_FIND_DATA(&msg, SOL_IPV6, IPV6_HOPLIMIT, int); + if (hops && *hops != 255) + return -EMULTIHOP; + + if (ret_timestamp) + triple_timestamp_from_cmsg(ret_timestamp, &msg); + if (ret_sender) + *ret_sender = addr; + return 0; +} diff --git a/src/libsystemd-network/icmp6-util.h b/src/libsystemd-network/icmp6-util.h new file mode 100644 index 0000000..0a9ecb4 --- /dev/null +++ b/src/libsystemd-network/icmp6-util.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2014-2015 Intel Corporation. All rights reserved. 
+***/ + +#include + +#include "time-util.h" + /* Address bytes of ff02::2 */ +#define IN6ADDR_ALL_ROUTERS_MULTICAST_INIT \ + { { { 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 } } } + /* Address bytes of ff02::1 */ +#define IN6ADDR_ALL_NODES_MULTICAST_INIT \ + { { { 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 } } } + /* Socket helpers for ICMPv6 router solicitation and advertisement messages */ +int icmp6_bind_router_solicitation(int ifindex); +int icmp6_bind_router_advertisement(int ifindex); +int icmp6_send_router_solicitation(int s, const struct ether_addr *ether_addr); +int icmp6_receive( + int fd, + void *buffer, + size_t size, + struct in6_addr *ret_sender, + triple_timestamp *ret_timestamp); diff --git a/src/libsystemd-network/lldp-neighbor.c b/src/libsystemd-network/lldp-neighbor.c new file mode 100644 index 0000000..af61c9b --- /dev/null +++ b/src/libsystemd-network/lldp-neighbor.c @@ -0,0 +1,795 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "escape.h" +#include "ether-addr-util.h" +#include "hexdecoct.h" +#include "in-addr-util.h" +#include "lldp-neighbor.h" +#include "memory-util.h" +#include "missing_network.h" +#include "unaligned.h" + /* Hashes a neighbor identifier: feeds the chassis ID bytes, the chassis ID size, the port ID bytes and the port ID size into the siphash state. */ +static void lldp_neighbor_id_hash_func(const LLDPNeighborID *id, struct siphash *state) { + assert(id); + assert(state); + + siphash24_compress(id->chassis_id, id->chassis_id_size, state); + siphash24_compress(&id->chassis_id_size, sizeof(id->chassis_id_size), state); + siphash24_compress(id->port_id, id->port_id_size, state); + siphash24_compress(&id->port_id_size, sizeof(id->port_id_size), state); +} + /* Orders two identifiers by chassis ID first and, only when those compare equal, by port ID. */ +int lldp_neighbor_id_compare_func(const LLDPNeighborID *x, const LLDPNeighborID *y) { + assert(x); + assert(y); + + return memcmp_nn(x->chassis_id, x->chassis_id_size, y->chassis_id, y->chassis_id_size) + ?: memcmp_nn(x->port_id, x->port_id_size, y->port_id, y->port_id_size); +} + /* Hash ops keyed by LLDPNeighborID with sd_lldp_neighbor values; lldp_neighbor_unlink is passed as the value destructor. */ +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + lldp_neighbor_hash_ops, + LLDPNeighborID, + lldp_neighbor_id_hash_func, + 
lldp_neighbor_id_compare_func, + sd_lldp_neighbor, + lldp_neighbor_unlink); + /* Comparator for the expiry queue: orders neighbors by their until value. */ +int lldp_neighbor_prioq_compare_func(const void *a, const void *b) { + const sd_lldp_neighbor *x = a, *y = b; + + assert(x); + assert(y); + + return CMP(x->until, y->until); +} + /* Takes one reference and returns n. NULL is passed through. The object must already hold a reference or be linked to an sd_lldp_rx object. */ +sd_lldp_neighbor *sd_lldp_neighbor_ref(sd_lldp_neighbor *n) { + if (!n) + return NULL; + + assert(n->n_ref > 0 || n->lldp_rx); + n->n_ref++; + + return n; +} + /* Frees all heap-allocated members and then the object itself. Accepts NULL. Returns the result of mfree(). */ +static sd_lldp_neighbor *lldp_neighbor_free(sd_lldp_neighbor *n) { + if (!n) + return NULL; + + free(n->id.port_id); + free(n->id.chassis_id); + free(n->port_description); + free(n->system_name); + free(n->system_description); + free(n->mud_url); + free(n->chassis_id_as_string); + free(n->port_id_as_string); + return mfree(n); +} + +sd_lldp_neighbor *sd_lldp_neighbor_unref(sd_lldp_neighbor *n) { + + /* Drops one reference from the neighbor. Note that the object is not freed unless it is already unlinked from + * the sd_lldp object. */ + + if (!n) + return NULL; + + assert(n->n_ref > 0); + n->n_ref--; + + if (n->n_ref <= 0 && !n->lldp_rx) + lldp_neighbor_free(n); + + return NULL; +} + +sd_lldp_neighbor *lldp_neighbor_unlink(sd_lldp_neighbor *n) { + + /* Removes the neighbor object from the LLDP object, and frees it if it also has no other reference. */ + + if (!n) + return NULL; + + if (!n->lldp_rx) + return NULL; + + /* Only remove the neighbor object from the hash table if it's in there, don't complain if it isn't. This is + * because we are used as destructor call for hashmap_clear() and thus sometimes are called to de-register + * ourselves from the hashtable and sometimes are called after we already are de-registered. 
*/ + + (void) hashmap_remove_value(n->lldp_rx->neighbor_by_id, &n->id, n); + + assert_se(prioq_remove(n->lldp_rx->neighbor_by_expiry, n, &n->prioq_idx) >= 0); + + n->lldp_rx = NULL; + + if (n->n_ref <= 0) + lldp_neighbor_free(n); + + return NULL; +} + /* Allocates a neighbor via malloc0() with raw_size extra bytes behind the aligned struct for the raw packet. Returns NULL if the total size would overflow or the allocation fails. The new object starts with n_ref = 1. */ +sd_lldp_neighbor *lldp_neighbor_new(size_t raw_size) { + sd_lldp_neighbor *n; + + if (raw_size > SIZE_MAX - ALIGN(sizeof(sd_lldp_neighbor))) + return NULL; + + n = malloc0(ALIGN(sizeof(sd_lldp_neighbor)) + raw_size); + if (!n) + return NULL; + + n->raw_size = raw_size; + n->n_ref = 1; + + return n; +} + /* Stores an escaped copy of the n bytes at q in *s. Returns 1 if a string was stored, 0 if the field was ignored (already set, empty after stripping trailing NULs, or containing an inner NUL), and the result of log_oom_debug() if escaping fails to allocate. */ +static int parse_string(sd_lldp_rx *lldp_rx, char **s, const void *q, size_t n) { + const char *p = q; + char *k; + + assert(s); + assert(p || n == 0); + + if (*s) { + log_lldp_rx(lldp_rx, "Found duplicate string, ignoring field."); + return 0; + } + + /* Strip trailing NULs, just to be nice */ + while (n > 0 && p[n-1] == 0) + n--; + + if (n <= 0) /* Ignore empty strings */ + return 0; + + /* Look for inner NULs */ + if (memchr(p, 0, n)) { + log_lldp_rx(lldp_rx, "Found inner NUL in string, ignoring field."); + return 0; + } + + /* Let's escape weird chars, for security reasons */ + k = cescape_length(p, n); + if (!k) + return log_oom_debug(); + + free_and_replace(*s, k); + + return 1; +} + /* Validates the Ethernet header of the raw packet stored behind n and walks its TLVs, filling in the parsed fields of n. Returns 0 once an end marker was seen and chassis ID, port ID and TTL are all present. Malformed input returns the result of log_lldp_rx_errno() called with EBADMSG. */ +int lldp_neighbor_parse(sd_lldp_neighbor *n) { + struct ether_header h; + const uint8_t *p; + size_t left; + int r; + + assert(n); + + if (n->raw_size < sizeof(struct ether_header)) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Received truncated packet, ignoring."); + + memcpy(&h, LLDP_NEIGHBOR_RAW(n), sizeof(h)); + + if (h.ether_type != htobe16(ETHERTYPE_LLDP)) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Received packet with wrong type, ignoring."); + /* Accepted destination addresses: 01:80:c2:00:00:00, 01:80:c2:00:00:03 and 01:80:c2:00:00:0e */ + if (h.ether_dhost[0] != 0x01 || + h.ether_dhost[1] != 0x80 || + h.ether_dhost[2] != 0xc2 || + h.ether_dhost[3] != 0x00 || + h.ether_dhost[4] != 0x00 || + !IN_SET(h.ether_dhost[5], 0x00, 0x03, 0x0e)) + return 
log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Received packet with wrong destination address, ignoring."); + + memcpy(&n->source_address, h.ether_shost, sizeof(struct ether_addr)); + memcpy(&n->destination_address, h.ether_dhost, sizeof(struct ether_addr)); + + p = (const uint8_t*) LLDP_NEIGHBOR_RAW(n) + sizeof(struct ether_header); + left = n->raw_size - sizeof(struct ether_header); + /* The loop only exits through the end marker TLV or an error return */ + for (;;) { + uint8_t type; + uint16_t length; + + if (left < 2) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "TLV lacks header, ignoring."); + /* TLV header: the upper 7 bits of the first byte are the type, its lowest bit plus the second byte form the 9 bit length */ + type = p[0] >> 1; + length = p[1] + (((uint16_t) (p[0] & 1)) << 8); + p += 2, left -= 2; + + if (left < length) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "TLV truncated, ignoring datagram."); + /* TLV types without a case below are skipped */ + switch (type) { + + case SD_LLDP_TYPE_END: + if (length != 0) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "End marker TLV not zero-sized, ignoring datagram."); + + /* Note that after processing the SD_LLDP_TYPE_END left could still be > 0 + * as the message may contain padding (see IEEE 802.1AB-2016, sec. 
8.5.12) */ + + goto end_marker; + + case SD_LLDP_TYPE_CHASSIS_ID: + if (length < 2 || length > 256) + /* includes the chassis subtype, hence one extra byte */ + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Chassis ID field size out of range, ignoring datagram."); + + if (n->id.chassis_id) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Duplicate chassis ID field, ignoring datagram."); + + n->id.chassis_id = memdup(p, length); + if (!n->id.chassis_id) + return log_oom_debug(); + + n->id.chassis_id_size = length; + break; + + case SD_LLDP_TYPE_PORT_ID: + if (length < 2 || length > 256) + /* includes the port subtype, hence one extra byte */ + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Port ID field size out of range, ignoring datagram."); + + if (n->id.port_id) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Duplicate port ID field, ignoring datagram."); + + n->id.port_id = memdup(p, length); + if (!n->id.port_id) + return log_oom_debug(); + + n->id.port_id_size = length; + break; + + case SD_LLDP_TYPE_TTL: + if (length != 2) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "TTL field has wrong size, ignoring datagram."); + + if (n->has_ttl) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Duplicate TTL field, ignoring datagram."); + + n->ttl = unaligned_read_be16(p); + n->has_ttl = true; + break; + + case SD_LLDP_TYPE_PORT_DESCRIPTION: + r = parse_string(n->lldp_rx, &n->port_description, p, length); + if (r < 0) + return r; + break; + + case SD_LLDP_TYPE_SYSTEM_NAME: + r = parse_string(n->lldp_rx, &n->system_name, p, length); + if (r < 0) + return r; + break; + + case SD_LLDP_TYPE_SYSTEM_DESCRIPTION: + r = parse_string(n->lldp_rx, &n->system_description, p, length); + if (r < 0) + return r; + break; + + case SD_LLDP_TYPE_SYSTEM_CAPABILITIES: + if (length != 4) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "System capabilities 
field has wrong size."); + + n->system_capabilities = unaligned_read_be16(p); + n->enabled_capabilities = unaligned_read_be16(p + 2); + n->has_capabilities = true; + break; + + case SD_LLDP_TYPE_PRIVATE: + if (length < 4) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "Found private TLV that is too short, ignoring."); + + /* RFC 8520: MUD URL */ + if (memcmp(p, SD_LLDP_OUI_IANA_MUD, sizeof(SD_LLDP_OUI_IANA_MUD)) == 0) { + r = parse_string(n->lldp_rx, &n->mud_url, p + sizeof(SD_LLDP_OUI_IANA_MUD), + length - sizeof(SD_LLDP_OUI_IANA_MUD)); + if (r < 0) + return r; + } + break; + } + + p += length, left -= length; + } + +end_marker: + if (!n->id.chassis_id || !n->id.port_id || !n->has_ttl) + return log_lldp_rx_errno(n->lldp_rx, SYNTHETIC_ERRNO(EBADMSG), + "One or more mandatory TLV missing in datagram. Ignoring."); + /* Position the TLV iterator on the first TLV */ + n->rindex = sizeof(struct ether_header); + + return 0; +} + /* Computes the expiry time (until) from the TTL, or sets it to 0 when the TTL is 0, and reshuffles the neighbor in the expiry queue if it is linked to an sd_lldp_rx object. */ +void lldp_neighbor_start_ttl(sd_lldp_neighbor *n) { + assert(n); + + if (n->ttl > 0) { + usec_t base; + + /* Use the packet's timestamp if there is one known */ + base = triple_timestamp_by_clock(&n->timestamp, CLOCK_BOOTTIME); + if (!timestamp_is_set(base)) + base = now(CLOCK_BOOTTIME); /* Otherwise, take the current time */ + + n->until = usec_add(base, n->ttl * USEC_PER_SEC); + } else + n->until = 0; + + if (n->lldp_rx) + prioq_reshuffle(n->lldp_rx->neighbor_by_expiry, n, &n->prioq_idx); +} + /* Two neighbors are equal when they are the same pointer (including both NULL) or their raw packets have the same size and bytes. */ +bool lldp_neighbor_equal(const sd_lldp_neighbor *a, const sd_lldp_neighbor *b) { + if (a == b) + return true; + + if (!a || !b) + return false; + + if (a->raw_size != b->raw_size) + return false; + + return memcmp(LLDP_NEIGHBOR_RAW(a), LLDP_NEIGHBOR_RAW(b), a->raw_size) == 0; +} + /* Copies the stored source MAC address to *address. Returns 0, or -EINVAL for NULL arguments. */ +int sd_lldp_neighbor_get_source_address(sd_lldp_neighbor *n, struct ether_addr* address) { + assert_return(n, -EINVAL); + assert_return(address, -EINVAL); + + *address = n->source_address; + return 0; +} + /* Copies the stored destination MAC address to *address. Returns 0, or -EINVAL for NULL arguments. */ +int sd_lldp_neighbor_get_destination_address(sd_lldp_neighbor *n, struct ether_addr* address) { + 
assert_return(n, -EINVAL); + assert_return(address, -EINVAL); + + *address = n->destination_address; + return 0; +} + /* Returns a pointer to the raw packet stored behind n and its size. The memory is not copied. */ +int sd_lldp_neighbor_get_raw(sd_lldp_neighbor *n, const void **ret, size_t *size) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(size, -EINVAL); + + *ret = LLDP_NEIGHBOR_RAW(n); + *size = n->raw_size; + + return 0; +} + /* Splits the chassis ID into its first byte (returned in *type) and the remaining bytes (returned in *ret and *size, pointing into the object). */ +int sd_lldp_neighbor_get_chassis_id(sd_lldp_neighbor *n, uint8_t *type, const void **ret, size_t *size) { + assert_return(n, -EINVAL); + assert_return(type, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(size, -EINVAL); + + assert(n->id.chassis_id_size > 0); + + *type = *(uint8_t*) n->id.chassis_id; + *ret = (uint8_t*) n->id.chassis_id + 1; + *size = n->id.chassis_id_size - 1; + + return 0; +} + /* Formats an ID of exactly 7 bytes (one leading byte followed by the address bytes) as an Ethernet address string. Returns 1 and sets *ret on success, 0 if sz is not 7, -ENOMEM if the allocation fails. */ +static int format_mac_address(const void *data, size_t sz, char **ret) { + struct ether_addr a; + char *k; + + assert(data || sz <= 0); + + if (sz != 7) + return 0; + + memcpy(&a, (uint8_t*) data + 1, sizeof(a)); + + k = new(char, ETHER_ADDR_TO_STRING_MAX); + if (!k) + return -ENOMEM; + + *ret = ether_addr_to_string(&a, k); + return 1; +} + /* Formats an ID that carries an IPv4 address (sz 6, second byte 1) or an IPv6 address (sz 18, second byte 2); the address starts at offset 2. Returns 1 on success, 0 if neither layout matches, or the negative error from in_addr_to_string(). */ +static int format_network_address(const void *data, size_t sz, char **ret) { + union in_addr_union a; + int family, r; + + if (sz == 6 && ((uint8_t*) data)[1] == 1) { + memcpy(&a.in, (uint8_t*) data + 2, sizeof(a.in)); + family = AF_INET; + } else if (sz == 18 && ((uint8_t*) data)[1] == 2) { + memcpy(&a.in6, (uint8_t*) data + 2, sizeof(a.in6)); + family = AF_INET6; + } else + return 0; + + r = in_addr_to_string(family, &a, ret); + if (r < 0) + return r; + return 1; +} + /* Returns a printable form of the chassis ID. The string is built on first use, cached in n->chassis_id_as_string and owned by the object. Subtypes that cannot be formatted specifically fall back to a hex dump of the whole ID. */ +int sd_lldp_neighbor_get_chassis_id_as_string(sd_lldp_neighbor *n, const char **ret) { + char *k; + int r; + + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (n->chassis_id_as_string) { + *ret = n->chassis_id_as_string; + return 0; + } + + assert(n->id.chassis_id_size > 0); + + switch (*(uint8_t*) n->id.chassis_id) { + + case SD_LLDP_CHASSIS_SUBTYPE_CHASSIS_COMPONENT: + case 
SD_LLDP_CHASSIS_SUBTYPE_INTERFACE_ALIAS: + case SD_LLDP_CHASSIS_SUBTYPE_PORT_COMPONENT: + case SD_LLDP_CHASSIS_SUBTYPE_INTERFACE_NAME: + case SD_LLDP_CHASSIS_SUBTYPE_LOCALLY_ASSIGNED: + k = cescape_length((char*) n->id.chassis_id + 1, n->id.chassis_id_size - 1); + if (!k) + return -ENOMEM; + + goto done; + + case SD_LLDP_CHASSIS_SUBTYPE_MAC_ADDRESS: + r = format_mac_address(n->id.chassis_id, n->id.chassis_id_size, &k); + if (r < 0) + return r; + if (r > 0) + goto done; + + break; + + case SD_LLDP_CHASSIS_SUBTYPE_NETWORK_ADDRESS: + r = format_network_address(n->id.chassis_id, n->id.chassis_id_size, &k); + if (r < 0) + return r; + if (r > 0) + goto done; + + break; + } + + /* Generic fallback */ + k = hexmem(n->id.chassis_id, n->id.chassis_id_size); + if (!k) + return -ENOMEM; + +done: + *ret = n->chassis_id_as_string = k; + return 0; +} + /* Splits the port ID into its first byte (returned in *type) and the remaining bytes (returned in *ret and *size, pointing into the object). */ +int sd_lldp_neighbor_get_port_id(sd_lldp_neighbor *n, uint8_t *type, const void **ret, size_t *size) { + assert_return(n, -EINVAL); + assert_return(type, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(size, -EINVAL); + + assert(n->id.port_id_size > 0); + + *type = *(uint8_t*) n->id.port_id; + *ret = (uint8_t*) n->id.port_id + 1; + *size = n->id.port_id_size - 1; + + return 0; +} + /* Returns a printable form of the port ID. The string is built on first use, cached in n->port_id_as_string and owned by the object. Subtypes that cannot be formatted specifically fall back to a hex dump of the whole ID. */ +int sd_lldp_neighbor_get_port_id_as_string(sd_lldp_neighbor *n, const char **ret) { + char *k; + int r; + + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (n->port_id_as_string) { + *ret = n->port_id_as_string; + return 0; + } + + assert(n->id.port_id_size > 0); + + switch (*(uint8_t*) n->id.port_id) { + + case SD_LLDP_PORT_SUBTYPE_INTERFACE_ALIAS: + case SD_LLDP_PORT_SUBTYPE_PORT_COMPONENT: + case SD_LLDP_PORT_SUBTYPE_INTERFACE_NAME: + case SD_LLDP_PORT_SUBTYPE_LOCALLY_ASSIGNED: + k = cescape_length((char*) n->id.port_id + 1, n->id.port_id_size - 1); + if (!k) + return -ENOMEM; + + goto done; + + case SD_LLDP_PORT_SUBTYPE_MAC_ADDRESS: + r = format_mac_address(n->id.port_id, n->id.port_id_size, &k); + if (r < 0) + return r; + 
if (r > 0) + goto done; + + break; + + case SD_LLDP_PORT_SUBTYPE_NETWORK_ADDRESS: + r = format_network_address(n->id.port_id, n->id.port_id_size, &k); + if (r < 0) + return r; + if (r > 0) + goto done; + + break; + } + + /* Generic fallback */ + k = hexmem(n->id.port_id, n->id.port_id_size); + if (!k) + return -ENOMEM; + +done: + *ret = n->port_id_as_string = k; + return 0; +} + /* Returns the stored TTL value in *ret_sec. */ +int sd_lldp_neighbor_get_ttl(sd_lldp_neighbor *n, uint16_t *ret_sec) { + assert_return(n, -EINVAL); + assert_return(ret_sec, -EINVAL); + + *ret_sec = n->ttl; + return 0; +} + /* The string getters below return a pointer owned by the object, or -ENODATA if the field was not set. */ +int sd_lldp_neighbor_get_system_name(sd_lldp_neighbor *n, const char **ret) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (!n->system_name) + return -ENODATA; + + *ret = n->system_name; + return 0; +} + +int sd_lldp_neighbor_get_system_description(sd_lldp_neighbor *n, const char **ret) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (!n->system_description) + return -ENODATA; + + *ret = n->system_description; + return 0; +} + +int sd_lldp_neighbor_get_port_description(sd_lldp_neighbor *n, const char **ret) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (!n->port_description) + return -ENODATA; + + *ret = n->port_description; + return 0; +} + +int sd_lldp_neighbor_get_mud_url(sd_lldp_neighbor *n, const char **ret) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (!n->mud_url) + return -ENODATA; + + *ret = n->mud_url; + return 0; +} + /* The two capability getters return -ENODATA unless has_capabilities is set. */ +int sd_lldp_neighbor_get_system_capabilities(sd_lldp_neighbor *n, uint16_t *ret) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (!n->has_capabilities) + return -ENODATA; + + *ret = n->system_capabilities; + return 0; +} + +int sd_lldp_neighbor_get_enabled_capabilities(sd_lldp_neighbor *n, uint16_t *ret) { + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + + if (!n->has_capabilities) + return -ENODATA; + + *ret = n->enabled_capabilities; + return 0; +} + /* Creates a neighbor object from a copy of raw_size bytes at raw and parses it. On success the new object is stored in *ret; on failure *ret is left untouched and a negative value is returned. */ +int 
sd_lldp_neighbor_from_raw(sd_lldp_neighbor **ret, const void *raw, size_t raw_size) { + _cleanup_(sd_lldp_neighbor_unrefp) sd_lldp_neighbor *n = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(raw || raw_size <= 0, -EINVAL); + + n = lldp_neighbor_new(raw_size); + if (!n) + return -ENOMEM; + + memcpy_safe(LLDP_NEIGHBOR_RAW(n), raw, raw_size); + + r = lldp_neighbor_parse(n); + if (r < 0) + return r; + + *ret = TAKE_PTR(n); + + return r; +} + /* Resets the TLV iterator to the position right after the Ethernet header. Returns nonzero if there is data at that position. */ +int sd_lldp_neighbor_tlv_rewind(sd_lldp_neighbor *n) { + assert_return(n, -EINVAL); + + assert(n->raw_size >= sizeof(struct ether_header)); + n->rindex = sizeof(struct ether_header); + + return n->rindex < n->raw_size; +} + /* Advances the iterator past the current TLV. Returns -ESPIPE when already at the end, -EBADMSG if the current TLV does not fit into the packet, otherwise nonzero if more data follows and 0 if the end was reached. */ +int sd_lldp_neighbor_tlv_next(sd_lldp_neighbor *n) { + size_t length; + + assert_return(n, -EINVAL); + + if (n->rindex == n->raw_size) /* EOF */ + return -ESPIPE; + + if (n->rindex + 2 > n->raw_size) /* Truncated message */ + return -EBADMSG; + + length = LLDP_NEIGHBOR_TLV_LENGTH(n); + if (n->rindex + 2 + length > n->raw_size) + return -EBADMSG; + + n->rindex += 2 + length; + return n->rindex < n->raw_size; +} + /* Stores the type of the TLV at the iterator position in *type. Returns -ESPIPE at the end of the packet and -EBADMSG if the TLV header is truncated. */ +int sd_lldp_neighbor_tlv_get_type(sd_lldp_neighbor *n, uint8_t *type) { + assert_return(n, -EINVAL); + assert_return(type, -EINVAL); + + if (n->rindex == n->raw_size) /* EOF */ + return -ESPIPE; + + if (n->rindex + 2 > n->raw_size) + return -EBADMSG; + + *type = LLDP_NEIGHBOR_TLV_TYPE(n); + return 0; +} + /* Returns 1 if the current TLV has the given type, 0 if not, negative on error. */ +int sd_lldp_neighbor_tlv_is_type(sd_lldp_neighbor *n, uint8_t type) { + uint8_t k; + int r; + + assert_return(n, -EINVAL); + + r = sd_lldp_neighbor_tlv_get_type(n, &k); + if (r < 0) + return r; + + return type == k; +} + /* For a TLV of type SD_LLDP_TYPE_PRIVATE, copies the first three payload bytes to oui and the fourth to *subtype. Returns -ENXIO if the current TLV has a different type and -EBADMSG if it is shorter than 4 bytes or does not fit into the packet. */ +int sd_lldp_neighbor_tlv_get_oui(sd_lldp_neighbor *n, uint8_t oui[_SD_ARRAY_STATIC 3], uint8_t *subtype) { + const uint8_t *d; + size_t length; + int r; + + assert_return(n, -EINVAL); + assert_return(oui, -EINVAL); + assert_return(subtype, -EINVAL); + + r = sd_lldp_neighbor_tlv_is_type(n, SD_LLDP_TYPE_PRIVATE); + if (r < 0) + return r; + if (r == 0) + return 
-ENXIO; + + length = LLDP_NEIGHBOR_TLV_LENGTH(n); + if (length < 4) + return -EBADMSG; + + if (n->rindex + 2 + length > n->raw_size) + return -EBADMSG; + + d = LLDP_NEIGHBOR_TLV_DATA(n); + memcpy(oui, d, 3); + *subtype = d[3]; + + return 0; +} + /* Returns nonzero if the current TLV is a private TLV whose OUI and subtype match the arguments, 0 if it does not match or is not a private TLV, negative on other errors. */ +int sd_lldp_neighbor_tlv_is_oui(sd_lldp_neighbor *n, const uint8_t oui[_SD_ARRAY_STATIC 3], uint8_t subtype) { + uint8_t k[3], st; + int r; + + r = sd_lldp_neighbor_tlv_get_oui(n, k, &st); + if (r == -ENXIO) + return 0; + if (r < 0) + return r; + + return memcmp(k, oui, 3) == 0 && st == subtype; +} + /* Returns a pointer into the raw packet at the current TLV together with its total size. Returns -EBADMSG if the TLV does not fit into the packet, which includes the case of the iterator being at the end. */ +int sd_lldp_neighbor_tlv_get_raw(sd_lldp_neighbor *n, const void **ret, size_t *size) { + size_t length; + + assert_return(n, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(size, -EINVAL); + + /* Note that this returns the full TLV, including the TLV header */ + + if (n->rindex + 2 > n->raw_size) + return -EBADMSG; + + length = LLDP_NEIGHBOR_TLV_LENGTH(n); + if (n->rindex + 2 + length > n->raw_size) + return -EBADMSG; + + *ret = (uint8_t*) LLDP_NEIGHBOR_RAW(n) + n->rindex; + *size = length + 2; + + return 0; +} + /* Returns the stored timestamp converted to the requested clock. Returns -EOPNOTSUPP if the clock is rejected by the two checks below and -ENODATA if no timestamp is set. */ +int sd_lldp_neighbor_get_timestamp(sd_lldp_neighbor *n, clockid_t clock, uint64_t *ret) { + assert_return(n, -EINVAL); + assert_return(TRIPLE_TIMESTAMP_HAS_CLOCK(clock), -EOPNOTSUPP); + assert_return(clock_supported(clock), -EOPNOTSUPP); + assert_return(ret, -EINVAL); + + if (!triple_timestamp_is_set(&n->timestamp)) + return -ENODATA; + + *ret = triple_timestamp_by_clock(&n->timestamp, clock); + return 0; +} diff --git a/src/libsystemd-network/lldp-neighbor.h b/src/libsystemd-network/lldp-neighbor.h new file mode 100644 index 0000000..016286b --- /dev/null +++ b/src/libsystemd-network/lldp-neighbor.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-lldp-rx.h" + +#include "hash-funcs.h" +#include "lldp-rx-internal.h" +#include "time-util.h" + +typedef struct LLDPNeighborID { + /* The spec calls this an "MSAP identifier" */ + 
void *chassis_id; + size_t chassis_id_size; + + void *port_id; + size_t port_id_size; +} LLDPNeighborID; + +struct sd_lldp_neighbor { + /* Neighbor objects stay around as long as they are linked into an "sd_lldp_rx" object or n_ref > 0. */ + sd_lldp_rx *lldp_rx; + unsigned n_ref; + + triple_timestamp timestamp; + /* Expiry time and index used with the neighbor_by_expiry priority queue */ + usec_t until; + unsigned prioq_idx; + + struct ether_addr source_address; + struct ether_addr destination_address; + + LLDPNeighborID id; + + /* The raw packet size. The data is appended to the object, accessible via LLDP_NEIGHBOR_RAW() */ + size_t raw_size; + + /* The current read index for the iterative TLV interface */ + size_t rindex; + + /* And a couple of fields parsed out. */ + bool has_ttl:1; + bool has_capabilities:1; + bool has_port_vlan_id:1; + + uint16_t ttl; + + uint16_t system_capabilities; + uint16_t enabled_capabilities; + + char *port_description; + char *system_name; + char *system_description; + char *mud_url; + + uint16_t port_vlan_id; + /* Lazily built printable forms of the IDs */ + char *chassis_id_as_string; + char *port_id_as_string; +}; + /* Start of the raw packet, which is stored directly behind the aligned struct */ +static inline void *LLDP_NEIGHBOR_RAW(const sd_lldp_neighbor *n) { + return (uint8_t*) n + ALIGN(sizeof(sd_lldp_neighbor)); +} + /* Type of the TLV at rindex: the upper 7 bits of its first byte. Performs no bounds check. */ +static inline uint8_t LLDP_NEIGHBOR_TLV_TYPE(const sd_lldp_neighbor *n) { + return ((uint8_t*) LLDP_NEIGHBOR_RAW(n))[n->rindex] >> 1; +} + /* Payload length of the TLV at rindex: lowest bit of the first byte as bit 8 plus the second byte. Performs no bounds check. */ +static inline size_t LLDP_NEIGHBOR_TLV_LENGTH(const sd_lldp_neighbor *n) { + uint8_t *p; + + p = (uint8_t*) LLDP_NEIGHBOR_RAW(n) + n->rindex; + return p[1] + (((size_t) (p[0] & 1)) << 8); +} + /* Payload of the TLV at rindex, i.e. the bytes after its 2 byte header. Performs no bounds check. */ +static inline void* LLDP_NEIGHBOR_TLV_DATA(const sd_lldp_neighbor *n) { + return ((uint8_t*) LLDP_NEIGHBOR_RAW(n)) + n->rindex + 2; +} + +extern const struct hash_ops lldp_neighbor_hash_ops; +int lldp_neighbor_id_compare_func(const LLDPNeighborID *x, const LLDPNeighborID *y); +int lldp_neighbor_prioq_compare_func(const void *a, const void *b); + +sd_lldp_neighbor *lldp_neighbor_unlink(sd_lldp_neighbor *n); +sd_lldp_neighbor *lldp_neighbor_new(size_t raw_size); +int 
lldp_neighbor_parse(sd_lldp_neighbor *n); +void lldp_neighbor_start_ttl(sd_lldp_neighbor *n); +bool lldp_neighbor_equal(const sd_lldp_neighbor *a, const sd_lldp_neighbor *b); diff --git a/src/libsystemd-network/lldp-network.c b/src/libsystemd-network/lldp-network.c new file mode 100644 index 0000000..598669f --- /dev/null +++ b/src/libsystemd-network/lldp-network.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "lldp-network.h" +#include "missing_network.h" +#include "socket-util.h" + +int lldp_network_bind_raw_socket(int ifindex) { + static const struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ethhdr, h_dest)), /* A <- 4 bytes of destination MAC */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x0180c200, 1, 0), /* A != 01:80:c2:00 */ + BPF_STMT(BPF_RET + BPF_K, 0), /* drop packet */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ethhdr, h_dest) + 4), /* A <- remaining 2 bytes of destination MAC */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x0000, 3, 0), /* A != 00:00 */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x0003, 2, 0), /* A != 00:03 */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x000e, 1, 0), /* A != 00:0e */ + BPF_STMT(BPF_RET + BPF_K, 0), /* drop packet */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ethhdr, h_proto)), /* A <- protocol */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_LLDP, 1, 0), /* A != ETHERTYPE_LLDP */ + BPF_STMT(BPF_RET + BPF_K, 0), /* drop packet */ + BPF_STMT(BPF_RET + BPF_K, UINT32_MAX), /* accept packet */ + }; + static const struct sock_fprog fprog = { + .len = ELEMENTSOF(filter), + .filter = (struct sock_filter*) filter, + }; + struct packet_mreq mreq = { + .mr_ifindex = ifindex, + .mr_type = PACKET_MR_MULTICAST, + .mr_alen = ETH_ALEN, + .mr_address = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x00 } + }; + union sockaddr_union saddrll = { + .ll.sll_family = AF_PACKET, + .ll.sll_ifindex = ifindex, + }; + _cleanup_close_ int fd = 
-EBADF; + + assert(ifindex > 0); + + fd = socket(AF_PACKET, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, + htobe16(ETHERTYPE_LLDP)); + if (fd < 0) + return -errno; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog)) < 0) + return -errno; + + /* customer bridge */ + if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) + return -errno; + + /* non TPMR bridge */ + mreq.mr_address[ETH_ALEN - 1] = 0x03; + if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) + return -errno; + + /* nearest bridge */ + mreq.mr_address[ETH_ALEN - 1] = 0x0E; + if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) + return -errno; + + if (bind(fd, &saddrll.sa, sizeof(saddrll.ll)) < 0) + return -errno; + + return TAKE_FD(fd); +} diff --git a/src/libsystemd-network/lldp-network.h b/src/libsystemd-network/lldp-network.h new file mode 100644 index 0000000..bc69b32 --- /dev/null +++ b/src/libsystemd-network/lldp-network.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" + +int lldp_network_bind_raw_socket(int ifindex); diff --git a/src/libsystemd-network/lldp-rx-internal.h b/src/libsystemd-network/lldp-rx-internal.h new file mode 100644 index 0000000..83d0bc4 --- /dev/null +++ b/src/libsystemd-network/lldp-rx-internal.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" +#include "sd-lldp-rx.h" + +#include "hashmap.h" +#include "network-common.h" +#include "prioq.h" + +struct sd_lldp_rx { + unsigned n_ref; + + int ifindex; + char *ifname; + int fd; + + sd_event *event; + int64_t event_priority; + sd_event_source *io_event_source; + sd_event_source *timer_event_source; + + Prioq *neighbor_by_expiry; + Hashmap *neighbor_by_id; + + uint64_t neighbors_max; + + sd_lldp_rx_callback_t callback; + void *userdata; + + uint16_t capability_mask; + + struct ether_addr filter_address; +}; + +const char* 
lldp_rx_event_to_string(sd_lldp_rx_event_t e) _const_; +sd_lldp_rx_event_t lldp_rx_event_from_string(const char *s) _pure_; + +#define log_lldp_rx_errno(lldp_rx, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "LLDP Rx: ", \ + sd_lldp_rx, lldp_rx, \ + error, fmt, ##__VA_ARGS__) +#define log_lldp_rx(lldp_rx, fmt, ...) \ + log_interface_prefix_full_errno_zerook( \ + "LLDP Rx: ", \ + sd_lldp_rx, lldp_rx, \ + 0, fmt, ##__VA_ARGS__) diff --git a/src/libsystemd-network/meson.build b/src/libsystemd-network/meson.build new file mode 100644 index 0000000..93186e2 --- /dev/null +++ b/src/libsystemd-network/meson.build @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +sources = files( + 'arp-util.c', + 'dhcp-identifier.c', + 'dhcp-network.c', + 'dhcp-option.c', + 'dhcp-packet.c', + 'dhcp6-network.c', + 'dhcp6-option.c', + 'dhcp6-protocol.c', + 'icmp6-util.c', + 'lldp-neighbor.c', + 'lldp-network.c', + 'ndisc-protocol.c', + 'ndisc-router.c', + 'network-common.c', + 'network-internal.c', + 'sd-dhcp-client.c', + 'sd-dhcp-lease.c', + 'sd-dhcp-server.c', + 'sd-dhcp6-client.c', + 'sd-dhcp6-lease.c', + 'sd-ipv4acd.c', + 'sd-ipv4ll.c', + 'sd-lldp-rx.c', + 'sd-lldp-tx.c', + 'sd-ndisc.c', + 'sd-radv.c', +) + +libsystemd_network = static_library( + 'systemd-network', + sources, + include_directories : includes, + dependencies : userspace, + build_by_default : false) + +libsystemd_network_includes = [includes, include_directories('.')] + +############################################################ + +network_test_template = test_template + { + 'link_with' : [ + libshared, + libsystemd_network, + ], + 'suite' : 'network', +} + +network_fuzz_template = fuzz_template + { + 'link_with' : [ + libshared, + libsystemd_network, + ], +} + +executables += [ + network_test_template + { + 'sources' : files('test-acd.c'), + 'type' : 'manual', + }, + network_test_template + { + 'sources' : files('test-dhcp-client.c'), + }, + network_test_template + { + 'sources' : 
files('test-dhcp-option.c'), + }, + network_test_template + { + 'sources' : files('test-dhcp-server.c'), + }, + network_test_template + { + 'sources' : files('test-dhcp6-client.c'), + }, + network_test_template + { + 'sources' : files('test-ipv4ll-manual.c'), + 'type' : 'manual', + }, + network_test_template + { + 'sources' : files('test-ipv4ll.c'), + }, + network_test_template + { + 'sources' : files('test-lldp-rx.c'), + }, + network_test_template + { + 'sources' : files( + 'test-ndisc-ra.c', + 'icmp6-util-unix.c', + ), + }, + network_test_template + { + 'sources' : files( + 'test-ndisc-rs.c', + 'icmp6-util-unix.c', + ), + }, + network_test_template + { + 'sources' : files('test-sd-dhcp-lease.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-dhcp-client.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-dhcp6-client.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-dhcp-server.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-dhcp-server-relay.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-lldp-rx.c'), + }, + network_fuzz_template + { + 'sources' : files( + 'fuzz-ndisc-rs.c', + 'icmp6-util-unix.c', + ), + }, +] diff --git a/src/libsystemd-network/ndisc-internal.h b/src/libsystemd-network/ndisc-internal.h new file mode 100644 index 0000000..615de0d --- /dev/null +++ b/src/libsystemd-network/ndisc-internal.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2014 Intel Corporation. All rights reserved. 
+***/ + +#include "sd-ndisc.h" + +#include "network-common.h" +#include "time-util.h" + +#define NDISC_ROUTER_SOLICITATION_INTERVAL (4U * USEC_PER_SEC) +#define NDISC_MAX_ROUTER_SOLICITATION_INTERVAL (3600U * USEC_PER_SEC) +#define NDISC_MAX_ROUTER_SOLICITATIONS 3U + +struct sd_ndisc { + unsigned n_ref; + + int ifindex; + char *ifname; + int fd; + + sd_event *event; + int event_priority; + + struct ether_addr mac_addr; + + sd_event_source *recv_event_source; + sd_event_source *timeout_event_source; + sd_event_source *timeout_no_ra; + + usec_t retransmit_time; + + sd_ndisc_callback_t callback; + void *userdata; +}; + +const char* ndisc_event_to_string(sd_ndisc_event_t e) _const_; +sd_ndisc_event_t ndisc_event_from_string(const char *s) _pure_; + +#define log_ndisc_errno(ndisc, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "NDISC: ", \ + sd_ndisc, ndisc, \ + error, fmt, ##__VA_ARGS__) +#define log_ndisc(ndisc, fmt, ...) \ + log_interface_prefix_full_errno_zerook( \ + "NDISC: ", \ + sd_ndisc, ndisc, \ + 0, fmt, ##__VA_ARGS__) diff --git a/src/libsystemd-network/ndisc-protocol.c b/src/libsystemd-network/ndisc-protocol.c new file mode 100644 index 0000000..fae4a58 --- /dev/null +++ b/src/libsystemd-network/ndisc-protocol.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "ndisc-protocol.h" + +static const uint8_t prefix_length_code_to_prefix_length[_PREFIX_LENGTH_CODE_MAX] = { + [PREFIX_LENGTH_CODE_96] = 96, + [PREFIX_LENGTH_CODE_64] = 64, + [PREFIX_LENGTH_CODE_56] = 56, + [PREFIX_LENGTH_CODE_48] = 48, + [PREFIX_LENGTH_CODE_40] = 40, + [PREFIX_LENGTH_CODE_32] = 32, +}; + +int pref64_plc_to_prefix_length(uint16_t plc, uint8_t *ret) { + plc &= PREF64_PLC_MASK; + if (plc >= _PREFIX_LENGTH_CODE_MAX) + return -EINVAL; + + if (ret) + *ret = prefix_length_code_to_prefix_length[plc]; + return 0; +} + +int pref64_prefix_length_to_plc(uint8_t prefixlen, uint8_t *ret) { + assert(ret); + + for (size_t i = 0; i < 
ELEMENTSOF(prefix_length_code_to_prefix_length); i++) + if (prefix_length_code_to_prefix_length[i] == prefixlen) { + *ret = i; + return 0; + } + + return -EINVAL; +} diff --git a/src/libsystemd-network/ndisc-protocol.h b/src/libsystemd-network/ndisc-protocol.h new file mode 100644 index 0000000..8e403e3 --- /dev/null +++ b/src/libsystemd-network/ndisc-protocol.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "time-util.h" + +/* RFC 8781: PREF64 or (NAT64 prefix) */ +#define PREF64_SCALED_LIFETIME_MASK 0xfff8 +#define PREF64_PLC_MASK 0x0007 +#define PREF64_MAX_LIFETIME_USEC (65528 * USEC_PER_SEC) + +typedef enum PrefixLengthCode { + PREFIX_LENGTH_CODE_96, + PREFIX_LENGTH_CODE_64, + PREFIX_LENGTH_CODE_56, + PREFIX_LENGTH_CODE_48, + PREFIX_LENGTH_CODE_40, + PREFIX_LENGTH_CODE_32, + _PREFIX_LENGTH_CODE_MAX, + _PREFIX_LENGTH_CODE_INVALID = -EINVAL, +} PrefixLengthCode; + +/* rfc8781: section 4 - Scaled Lifetime: 13-bit unsigned integer. PREFIX_LEN (Prefix Length Code): 3-bit unsigned integer */ +struct nd_opt_prefix64_info { + uint8_t type; + uint8_t length; + uint16_t lifetime_and_plc; + uint8_t prefix[12]; +} __attribute__((__packed__)); + +int pref64_plc_to_prefix_length(uint16_t plc, uint8_t *ret); +int pref64_prefix_length_to_plc(uint8_t prefixlen, uint8_t *ret); diff --git a/src/libsystemd-network/ndisc-router.c b/src/libsystemd-network/ndisc-router.c new file mode 100644 index 0000000..5162df7 --- /dev/null +++ b/src/libsystemd-network/ndisc-router.c @@ -0,0 +1,913 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. 
+***/ + +#include + +#include "sd-ndisc.h" + +#include "alloc-util.h" +#include "dns-domain.h" +#include "hostname-util.h" +#include "memory-util.h" +#include "missing_network.h" +#include "ndisc-internal.h" +#include "ndisc-protocol.h" +#include "ndisc-router.h" +#include "strv.h" + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_ndisc_router, sd_ndisc_router, mfree); + +sd_ndisc_router *ndisc_router_new(size_t raw_size) { + sd_ndisc_router *rt; + + if (raw_size > SIZE_MAX - ALIGN(sizeof(sd_ndisc_router))) + return NULL; + + rt = malloc0(ALIGN(sizeof(sd_ndisc_router)) + raw_size); + if (!rt) + return NULL; + + rt->raw_size = raw_size; + rt->n_ref = 1; + + return rt; +} + +int sd_ndisc_router_get_address(sd_ndisc_router *rt, struct in6_addr *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + if (in6_addr_is_null(&rt->address)) + return -ENODATA; + + *ret = rt->address; + return 0; +} + +int sd_ndisc_router_get_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret) { + assert_return(rt, -EINVAL); + assert_return(TRIPLE_TIMESTAMP_HAS_CLOCK(clock), -EOPNOTSUPP); + assert_return(clock_supported(clock), -EOPNOTSUPP); + assert_return(ret, -EINVAL); + + if (!triple_timestamp_is_set(&rt->timestamp)) + return -ENODATA; + + *ret = triple_timestamp_by_clock(&rt->timestamp, clock); + return 0; +} + +#define DEFINE_GET_TIMESTAMP(name) \ + int sd_ndisc_router_##name##_timestamp( \ + sd_ndisc_router *rt, \ + clockid_t clock, \ + uint64_t *ret) { \ + \ + usec_t s, t; \ + int r; \ + \ + assert_return(rt, -EINVAL); \ + assert_return(ret, -EINVAL); \ + \ + r = sd_ndisc_router_##name(rt, &s); \ + if (r < 0) \ + return r; \ + \ + r = sd_ndisc_router_get_timestamp(rt, clock, &t); \ + if (r < 0) \ + return r; \ + \ + *ret = time_span_to_stamp(s, t); \ + return 0; \ + } + +DEFINE_GET_TIMESTAMP(get_lifetime); +DEFINE_GET_TIMESTAMP(prefix_get_valid_lifetime); +DEFINE_GET_TIMESTAMP(prefix_get_preferred_lifetime); +DEFINE_GET_TIMESTAMP(route_get_lifetime); 
+DEFINE_GET_TIMESTAMP(rdnss_get_lifetime); +DEFINE_GET_TIMESTAMP(dnssl_get_lifetime); +DEFINE_GET_TIMESTAMP(prefix64_get_lifetime); + +int sd_ndisc_router_get_raw(sd_ndisc_router *rt, const void **ret, size_t *ret_size) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(ret_size, -EINVAL); + + *ret = NDISC_ROUTER_RAW(rt); + *ret_size = rt->raw_size; + + return 0; +} + +static bool pref64_option_verify(const struct nd_opt_prefix64_info *p, size_t length) { + uint16_t lifetime_and_plc; + + assert(p); + + if (length != sizeof(struct nd_opt_prefix64_info)) + return false; + + lifetime_and_plc = be16toh(p->lifetime_and_plc); + if (pref64_plc_to_prefix_length(lifetime_and_plc, NULL) < 0) + return false; + + return true; +} + +int ndisc_router_parse(sd_ndisc *nd, sd_ndisc_router *rt) { + struct nd_router_advert *a; + const uint8_t *p; + bool has_mtu = false, has_flag_extension = false; + size_t left; + + assert(rt); + + if (rt->raw_size < sizeof(struct nd_router_advert)) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Too small to be a router advertisement, ignoring."); + + /* Router advertisement packets are neatly aligned to 64-bit boundaries, hence we can access them directly */ + a = NDISC_ROUTER_RAW(rt); + + if (a->nd_ra_type != ND_ROUTER_ADVERT) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Received ND packet that is not a router advertisement, ignoring."); + + if (a->nd_ra_code != 0) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Received ND packet with wrong RA code, ignoring."); + + rt->hop_limit = a->nd_ra_curhoplimit; + rt->flags = a->nd_ra_flags_reserved; /* the first 8 bits */ + rt->lifetime_usec = be16_sec_to_usec(a->nd_ra_router_lifetime, /* max_as_infinity = */ false); + rt->icmp6_ratelimit_usec = be32_msec_to_usec(a->nd_ra_retransmit, /* max_as_infinity = */ false); + + rt->preference = (rt->flags >> 3) & 3; + if (!IN_SET(rt->preference, SD_NDISC_PREFERENCE_LOW, SD_NDISC_PREFERENCE_HIGH)) + 
rt->preference = SD_NDISC_PREFERENCE_MEDIUM; + + p = (const uint8_t*) NDISC_ROUTER_RAW(rt) + sizeof(struct nd_router_advert); + left = rt->raw_size - sizeof(struct nd_router_advert); + + for (;;) { + uint8_t type; + size_t length; + + if (left == 0) + break; + + if (left < 2) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Option lacks header, ignoring datagram."); + + type = p[0]; + length = p[1] * 8; + + if (length == 0) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Zero-length option, ignoring datagram."); + if (left < length) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Option truncated, ignoring datagram."); + + switch (type) { + + case SD_NDISC_OPTION_PREFIX_INFORMATION: + + if (length != 4*8) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Prefix option of invalid size, ignoring datagram."); + + if (p[2] > 128) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Bad prefix length, ignoring datagram."); + + break; + + case SD_NDISC_OPTION_MTU: { + uint32_t m; + + if (has_mtu) { + log_ndisc(nd, "MTU option specified twice, ignoring."); + break; + } + + if (length != 8) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "MTU option of invalid size, ignoring datagram."); + + m = be32toh(*(uint32_t*) (p + 4)); + if (m >= IPV6_MIN_MTU) /* ignore invalidly small MTUs */ + rt->mtu = m; + + has_mtu = true; + break; + } + + case SD_NDISC_OPTION_ROUTE_INFORMATION: + if (length < 1*8 || length > 3*8) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Route information option of invalid size, ignoring datagram."); + + if (p[2] > 128) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Bad route prefix length, ignoring datagram."); + + break; + + case SD_NDISC_OPTION_RDNSS: + if (length < 3*8 || (length % (2*8)) != 1*8) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), "RDNSS option has invalid size."); + + break; + + case SD_NDISC_OPTION_FLAGS_EXTENSION: + + if (has_flag_extension) { + 
log_ndisc(nd, "Flags extension option specified twice, ignoring."); + break; + } + + if (length < 1*8) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "Flags extension option has invalid size."); + + /* Add in the additional flags bits */ + rt->flags |= + ((uint64_t) p[2] << 8) | + ((uint64_t) p[3] << 16) | + ((uint64_t) p[4] << 24) | + ((uint64_t) p[5] << 32) | + ((uint64_t) p[6] << 40) | + ((uint64_t) p[7] << 48); + + has_flag_extension = true; + break; + + case SD_NDISC_OPTION_DNSSL: + if (length < 2*8) + return log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "DNSSL option has invalid size."); + + break; + case SD_NDISC_OPTION_PREF64: { + if (!pref64_option_verify((struct nd_opt_prefix64_info *) p, length)) + log_ndisc_errno(nd, SYNTHETIC_ERRNO(EBADMSG), + "PREF64 prefix has invalid prefix length."); + break; + }} + + p += length, left -= length; + } + + rt->rindex = sizeof(struct nd_router_advert); + return 0; +} + +int sd_ndisc_router_get_hop_limit(sd_ndisc_router *rt, uint8_t *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = rt->hop_limit; + return 0; +} + +int sd_ndisc_router_get_icmp6_ratelimit(sd_ndisc_router *rt, uint64_t *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = rt->icmp6_ratelimit_usec; + return 0; +} + +int sd_ndisc_router_get_flags(sd_ndisc_router *rt, uint64_t *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = rt->flags; + return 0; +} + +int sd_ndisc_router_get_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = rt->lifetime_usec; + return 0; +} + +int sd_ndisc_router_get_preference(sd_ndisc_router *rt, unsigned *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = rt->preference; + return 0; +} + +int sd_ndisc_router_get_mtu(sd_ndisc_router *rt, uint32_t *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + if (rt->mtu <= 0) + return 
-ENODATA; + + *ret = rt->mtu; + return 0; +} + +int sd_ndisc_router_option_rewind(sd_ndisc_router *rt) { + assert_return(rt, -EINVAL); + + assert(rt->raw_size >= sizeof(struct nd_router_advert)); + rt->rindex = sizeof(struct nd_router_advert); + + return rt->rindex < rt->raw_size; +} + +int sd_ndisc_router_option_next(sd_ndisc_router *rt) { + size_t length; + + assert_return(rt, -EINVAL); + + if (rt->rindex == rt->raw_size) /* EOF */ + return -ESPIPE; + + if (rt->rindex + 2 > rt->raw_size) /* Truncated message */ + return -EBADMSG; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (rt->rindex + length > rt->raw_size) + return -EBADMSG; + + rt->rindex += length; + return rt->rindex < rt->raw_size; +} + +int sd_ndisc_router_option_get_type(sd_ndisc_router *rt, uint8_t *ret) { + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + if (rt->rindex == rt->raw_size) /* EOF */ + return -ESPIPE; + + if (rt->rindex + 2 > rt->raw_size) /* Truncated message */ + return -EBADMSG; + + *ret = NDISC_ROUTER_OPTION_TYPE(rt); + return 0; +} + +int sd_ndisc_router_option_is_type(sd_ndisc_router *rt, uint8_t type) { + uint8_t k; + int r; + + assert_return(rt, -EINVAL); + + r = sd_ndisc_router_option_get_type(rt, &k); + if (r < 0) + return r; + + return type == k; +} + +int sd_ndisc_router_option_get_raw(sd_ndisc_router *rt, const void **ret, size_t *ret_size) { + size_t length; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(ret_size, -EINVAL); + + /* Note that this returns the full option, including the option header */ + + if (rt->rindex + 2 > rt->raw_size) + return -EBADMSG; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (rt->rindex + length > rt->raw_size) + return -EBADMSG; + + *ret = (uint8_t*) NDISC_ROUTER_RAW(rt) + rt->rindex; + *ret_size = length; + + return 0; +} + +static int get_prefix_info(sd_ndisc_router *rt, struct nd_opt_prefix_info **ret) { + struct nd_opt_prefix_info *ri; + size_t length; + int r; + + assert(rt); + 
assert(ret); + + r = sd_ndisc_router_option_is_type(rt, SD_NDISC_OPTION_PREFIX_INFORMATION); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (length != sizeof(struct nd_opt_prefix_info)) + return -EBADMSG; + + ri = (struct nd_opt_prefix_info*) ((uint8_t*) NDISC_ROUTER_RAW(rt) + rt->rindex); + if (ri->nd_opt_pi_prefix_len > 128) + return -EBADMSG; + + *ret = ri; + return 0; +} + +int sd_ndisc_router_prefix_get_valid_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + struct nd_opt_prefix_info *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_prefix_info(rt, &ri); + if (r < 0) + return r; + + *ret = be32_sec_to_usec(ri->nd_opt_pi_valid_time, /* max_as_infinity = */ true); + return 0; +} + +int sd_ndisc_router_prefix_get_preferred_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + struct nd_opt_prefix_info *pi; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_prefix_info(rt, &pi); + if (r < 0) + return r; + + *ret = be32_sec_to_usec(pi->nd_opt_pi_preferred_time, /* max_as_infinity = */ true); + return 0; +} + +int sd_ndisc_router_prefix_get_flags(sd_ndisc_router *rt, uint8_t *ret) { + struct nd_opt_prefix_info *pi; + uint8_t flags; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_prefix_info(rt, &pi); + if (r < 0) + return r; + + flags = pi->nd_opt_pi_flags_reserved; + + if ((flags & ND_OPT_PI_FLAG_AUTO) && (pi->nd_opt_pi_prefix_len != 64)) { + log_ndisc(NULL, "Invalid prefix length, ignoring prefix for stateless autoconfiguration."); + flags &= ~ND_OPT_PI_FLAG_AUTO; + } + + *ret = flags; + return 0; +} + +int sd_ndisc_router_prefix_get_address(sd_ndisc_router *rt, struct in6_addr *ret) { + struct nd_opt_prefix_info *pi; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_prefix_info(rt, &pi); + if (r < 0) + return r; + + *ret = pi->nd_opt_pi_prefix; + return 0; +} + +int 
sd_ndisc_router_prefix_get_prefixlen(sd_ndisc_router *rt, unsigned *ret) { + struct nd_opt_prefix_info *pi; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_prefix_info(rt, &pi); + if (r < 0) + return r; + + if (pi->nd_opt_pi_prefix_len > 128) + return -EBADMSG; + + *ret = pi->nd_opt_pi_prefix_len; + return 0; +} + +static int get_route_info(sd_ndisc_router *rt, uint8_t **ret) { + uint8_t *ri; + size_t length; + int r; + + assert(rt); + assert(ret); + + r = sd_ndisc_router_option_is_type(rt, SD_NDISC_OPTION_ROUTE_INFORMATION); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (length < 1*8 || length > 3*8) + return -EBADMSG; + + ri = (uint8_t*) NDISC_ROUTER_RAW(rt) + rt->rindex; + + if (ri[2] > 128) + return -EBADMSG; + + *ret = ri; + return 0; +} + +int sd_ndisc_router_route_get_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_route_info(rt, &ri); + if (r < 0) + return r; + + *ret = unaligned_be32_sec_to_usec(ri + 4, /* max_as_infinity = */ true); + return 0; +} + +int sd_ndisc_router_route_get_address(sd_ndisc_router *rt, struct in6_addr *ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_route_info(rt, &ri); + if (r < 0) + return r; + + zero(*ret); + memcpy(ret, ri + 8, NDISC_ROUTER_OPTION_LENGTH(rt) - 8); + + return 0; +} + +int sd_ndisc_router_route_get_prefixlen(sd_ndisc_router *rt, unsigned *ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_route_info(rt, &ri); + if (r < 0) + return r; + + *ret = ri[2]; + return 0; +} + +int sd_ndisc_router_route_get_preference(sd_ndisc_router *rt, unsigned *ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_route_info(rt, &ri); + if (r < 0) + return r; + + if 
(!IN_SET((ri[3] >> 3) & 3, SD_NDISC_PREFERENCE_LOW, SD_NDISC_PREFERENCE_MEDIUM, SD_NDISC_PREFERENCE_HIGH)) + return -EOPNOTSUPP; + + *ret = (ri[3] >> 3) & 3; + return 0; +} + +static int get_rdnss_info(sd_ndisc_router *rt, uint8_t **ret) { + size_t length; + int r; + + assert(rt); + assert(ret); + + r = sd_ndisc_router_option_is_type(rt, SD_NDISC_OPTION_RDNSS); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (length < 3*8 || (length % (2*8)) != 1*8) + return -EBADMSG; + + *ret = (uint8_t*) NDISC_ROUTER_RAW(rt) + rt->rindex; + return 0; +} + +int sd_ndisc_router_rdnss_get_addresses(sd_ndisc_router *rt, const struct in6_addr **ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_rdnss_info(rt, &ri); + if (r < 0) + return r; + + *ret = (const struct in6_addr*) (ri + 8); + return (NDISC_ROUTER_OPTION_LENGTH(rt) - 8) / 16; +} + +int sd_ndisc_router_rdnss_get_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_rdnss_info(rt, &ri); + if (r < 0) + return r; + + *ret = unaligned_be32_sec_to_usec(ri + 4, /* max_as_infinity = */ true); + return 0; +} + +static int get_dnssl_info(sd_ndisc_router *rt, uint8_t **ret) { + size_t length; + int r; + + assert(rt); + assert(ret); + + r = sd_ndisc_router_option_is_type(rt, SD_NDISC_OPTION_DNSSL); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (length < 2*8) + return -EBADMSG; + + *ret = (uint8_t*) NDISC_ROUTER_RAW(rt) + rt->rindex; + return 0; +} + +int sd_ndisc_router_dnssl_get_domains(sd_ndisc_router *rt, char ***ret) { + _cleanup_strv_free_ char **l = NULL; + _cleanup_free_ char *e = NULL; + size_t n = 0, left; + uint8_t *ri, *p; + bool first = true; + int r; + unsigned k = 0; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = 
get_dnssl_info(rt, &ri); + if (r < 0) + return r; + + p = ri + 8; + left = NDISC_ROUTER_OPTION_LENGTH(rt) - 8; + + for (;;) { + if (left == 0) { + + if (n > 0) /* Not properly NUL terminated */ + return -EBADMSG; + + break; + } + + if (*p == 0) { + /* Found NUL termination */ + + if (n > 0) { + _cleanup_free_ char *normalized = NULL; + + e[n] = 0; + r = dns_name_normalize(e, 0, &normalized); + if (r < 0) + return r; + + /* Ignore the root domain name or "localhost" and friends */ + if (!is_localhost(normalized) && + !dns_name_is_root(normalized)) { + + if (strv_push(&l, normalized) < 0) + return -ENOMEM; + + normalized = NULL; + k++; + } + } + + n = 0; + first = true; + p++, left--; + continue; + } + + /* Check for compression (which is not allowed) */ + if (*p > 63) + return -EBADMSG; + + if (1U + *p + 1U > left) + return -EBADMSG; + + if (!GREEDY_REALLOC(e, n + !first + DNS_LABEL_ESCAPED_MAX + 1U)) + return -ENOMEM; + + if (first) + first = false; + else + e[n++] = '.'; + + r = dns_label_escape((char*) p+1, *p, e + n, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + n += r; + + left -= 1 + *p; + p += 1 + *p; + } + + if (strv_isempty(l)) { + *ret = NULL; + return 0; + } + + *ret = TAKE_PTR(l); + + return k; +} + +int sd_ndisc_router_dnssl_get_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + uint8_t *ri; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_dnssl_info(rt, &ri); + if (r < 0) + return r; + + *ret = unaligned_be32_sec_to_usec(ri + 4, /* max_as_infinity = */ true); + return 0; +} + +int sd_ndisc_router_captive_portal_get_uri(sd_ndisc_router *rt, const char **ret, size_t *ret_size) { + int r; + const char *nd_opt_captive_portal; + size_t length; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(ret_size, -EINVAL); + + r = sd_ndisc_router_option_is_type(rt, SD_NDISC_OPTION_CAPTIVE_PORTAL); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + r = sd_ndisc_router_option_get_raw(rt, 
(void *)&nd_opt_captive_portal, &length); + if (r < 0) + return r; + + /* The length field has units of 8 octets */ + assert(length % 8 == 0); + if (length == 0) + return -EBADMSG; + + /* Check that the message is not truncated by an embedded NUL. + * NUL padding to a multiple of 8 is expected. */ + size_t size = strnlen(nd_opt_captive_portal + 2, length - 2); + if (DIV_ROUND_UP(size + 2, 8) != length / 8) + return -EBADMSG; + + /* Let's not return an empty buffer */ + if (size == 0) { + *ret = NULL; + *ret_size = 0; + return 0; + } + + *ret = nd_opt_captive_portal + 2; + *ret_size = size; + + return 0; +} + +static int get_pref64_prefix_info(sd_ndisc_router *rt, struct nd_opt_prefix64_info **ret) { + struct nd_opt_prefix64_info *ri; + size_t length; + int r; + + assert(rt); + assert(ret); + + r = sd_ndisc_router_option_is_type(rt, SD_NDISC_OPTION_PREF64); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + length = NDISC_ROUTER_OPTION_LENGTH(rt); + if (length != sizeof(struct nd_opt_prefix64_info)) + return -EBADMSG; + + ri = (struct nd_opt_prefix64_info *) ((uint8_t*) NDISC_ROUTER_RAW(rt) + rt->rindex); + if (!pref64_option_verify(ri, length)) + return -EBADMSG; + + *ret = ri; + return 0; +} + +int sd_ndisc_router_prefix64_get_prefix(sd_ndisc_router *rt, struct in6_addr *ret) { + struct nd_opt_prefix64_info *pi; + struct in6_addr a = {}; + unsigned prefixlen; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_pref64_prefix_info(rt, &pi); + if (r < 0) + return r; + + r = sd_ndisc_router_prefix64_get_prefixlen(rt, &prefixlen); + if (r < 0) + return r; + + memcpy(&a, pi->prefix, sizeof(pi->prefix)); + in6_addr_mask(&a, prefixlen); + /* extra safety check for refusing malformed prefix. 
*/ + if (memcmp(&a, pi->prefix, sizeof(pi->prefix)) != 0) + return -EBADMSG; + + *ret = a; + return 0; +} + +int sd_ndisc_router_prefix64_get_prefixlen(sd_ndisc_router *rt, unsigned *ret) { + struct nd_opt_prefix64_info *pi; + uint16_t lifetime_prefix_len; + uint8_t prefix_len; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_pref64_prefix_info(rt, &pi); + if (r < 0) + return r; + + lifetime_prefix_len = be16toh(pi->lifetime_and_plc); + pref64_plc_to_prefix_length(lifetime_prefix_len, &prefix_len); + + *ret = prefix_len; + return 0; +} + +int sd_ndisc_router_prefix64_get_lifetime(sd_ndisc_router *rt, uint64_t *ret) { + struct nd_opt_prefix64_info *pi; + uint16_t lifetime_prefix_len; + int r; + + assert_return(rt, -EINVAL); + assert_return(ret, -EINVAL); + + r = get_pref64_prefix_info(rt, &pi); + if (r < 0) + return r; + + lifetime_prefix_len = be16toh(pi->lifetime_and_plc); + + *ret = (lifetime_prefix_len & PREF64_SCALED_LIFETIME_MASK) * USEC_PER_SEC; + return 0; +} diff --git a/src/libsystemd-network/ndisc-router.h b/src/libsystemd-network/ndisc-router.h new file mode 100644 index 0000000..0a55e1a --- /dev/null +++ b/src/libsystemd-network/ndisc-router.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2014 Intel Corporation. All rights reserved. +***/ + +#include "sd-ndisc.h" + +#include "time-util.h" + +struct sd_ndisc_router { + unsigned n_ref; + + triple_timestamp timestamp; + struct in6_addr address; + + /* The raw packet size. 
The data is appended to the object, accessible via NDISC_ROUTER_RAW() */ + size_t raw_size; + + /* The current read index for the iterative option interface */ + size_t rindex; + + uint64_t flags; + unsigned preference; + uint64_t lifetime_usec; + + uint8_t hop_limit; + uint32_t mtu; + uint64_t icmp6_ratelimit_usec; +}; + +static inline void* NDISC_ROUTER_RAW(const sd_ndisc_router *rt) { + return (uint8_t*) rt + ALIGN(sizeof(sd_ndisc_router)); +} + +static inline void *NDISC_ROUTER_OPTION_DATA(const sd_ndisc_router *rt) { + return ((uint8_t*) NDISC_ROUTER_RAW(rt)) + rt->rindex; +} + +static inline uint8_t NDISC_ROUTER_OPTION_TYPE(const sd_ndisc_router *rt) { + return ((uint8_t*) NDISC_ROUTER_OPTION_DATA(rt))[0]; +} +static inline size_t NDISC_ROUTER_OPTION_LENGTH(const sd_ndisc_router *rt) { + return ((uint8_t*) NDISC_ROUTER_OPTION_DATA(rt))[1] * 8; +} + +sd_ndisc_router *ndisc_router_new(size_t raw_size); +int ndisc_router_parse(sd_ndisc *nd, sd_ndisc_router *rt); diff --git a/src/libsystemd-network/network-common.c b/src/libsystemd-network/network-common.c new file mode 100644 index 0000000..b639e9c --- /dev/null +++ b/src/libsystemd-network/network-common.c @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "env-util.h" +#include "format-util.h" +#include "network-common.h" +#include "socket-util.h" +#include "unaligned.h" + +int get_ifname(int ifindex, char **ifname) { + assert(ifname); + + /* This sets ifname only when it is not set yet. 
*/ + + if (*ifname) + return 0; + + return format_ifname_alloc(ifindex, ifname); +} + +usec_t unaligned_be32_sec_to_usec(const void *p, bool max_as_infinity) { + uint32_t s = unaligned_read_be32(ASSERT_PTR(p)); + + if (s == UINT32_MAX && max_as_infinity) + return USEC_INFINITY; + + return s * USEC_PER_SEC; +} + +usec_t be32_sec_to_usec(be32_t t, bool max_as_infinity) { + uint32_t s = be32toh(t); + + if (s == UINT32_MAX && max_as_infinity) + return USEC_INFINITY; + + return s * USEC_PER_SEC; +} + +usec_t be32_msec_to_usec(be32_t t, bool max_as_infinity) { + uint32_t s = be32toh(t); + + if (s == UINT32_MAX && max_as_infinity) + return USEC_INFINITY; + + return s * USEC_PER_MSEC; +} + +usec_t be16_sec_to_usec(be16_t t, bool max_as_infinity) { + uint16_t s = be16toh(t); + + if (s == UINT16_MAX && max_as_infinity) + return USEC_INFINITY; + + return s * USEC_PER_SEC; +} + +be32_t usec_to_be32_sec(usec_t t) { + if (t == USEC_INFINITY) + /* For some settings, e.g. the lifetime of an address, UINT32_MAX is handled as infinity, so let's + * map USEC_INFINITY to UINT32_MAX. */ + return htobe32(UINT32_MAX); + + if (t >= (UINT32_MAX - 1) * USEC_PER_SEC) + /* Finite but too large. Let's use the largest (or off-by-one from the largest) finite value. 
*/ + return htobe32(UINT32_MAX - 1); + + return htobe32((uint32_t) DIV_ROUND_UP(t, USEC_PER_SEC)); +} + +be32_t usec_to_be32_msec(usec_t t) { + if (t == USEC_INFINITY) + return htobe32(UINT32_MAX); + + if (t >= (UINT32_MAX - 1) * USEC_PER_MSEC) + return htobe32(UINT32_MAX - 1); + + return htobe32((uint32_t) DIV_ROUND_UP(t, USEC_PER_MSEC)); +} + +be16_t usec_to_be16_sec(usec_t t) { + if (t == USEC_INFINITY) + return htobe16(UINT16_MAX); + + if (t >= (UINT16_MAX - 1) * USEC_PER_SEC) + return htobe16(UINT16_MAX - 1); + + return htobe16((uint16_t) DIV_ROUND_UP(t, USEC_PER_SEC)); +} + +usec_t time_span_to_stamp(usec_t span, usec_t base) { + /* Typically, 0 lifetime (timespan) indicates the corresponding configuration (address or so) must be + * dropped. So, when the timespan is zero, here we return 0 rather than 'base'. This makes the caller + * easily understand that the configuration needs to be dropped immediately. */ + if (span == 0) + return 0; + + return usec_add(base, span); +} + +bool network_test_mode_enabled(void) { + static int test_mode = -1; + int r; + + if (test_mode < 0) { + r = getenv_bool("SYSTEMD_NETWORK_TEST_MODE"); + if (r < 0) { + if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_NETWORK_TEST_MODE environment variable, ignoring: %m"); + + test_mode = false; + } else + test_mode = r; + } + + return test_mode; +} + +triple_timestamp* triple_timestamp_from_cmsg(triple_timestamp *t, struct msghdr *mh) { + assert(t); + assert(mh); + + struct timeval *tv = CMSG_FIND_AND_COPY_DATA(mh, SOL_SOCKET, SCM_TIMESTAMP, struct timeval); + if (tv) + return triple_timestamp_from_realtime(t, timeval_load(tv)); + + return triple_timestamp_now(t); +} diff --git a/src/libsystemd-network/network-common.h b/src/libsystemd-network/network-common.h new file mode 100644 index 0000000..1750f18 --- /dev/null +++ b/src/libsystemd-network/network-common.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include 
"log-link.h" +#include "sparse-endian.h" +#include "time-util.h" + +#define log_interface_prefix_full_errno_zerook(prefix, type, val, error, fmt, ...) \ + ({ \ + int _e = (error); \ + if (DEBUG_LOGGING) { \ + const char *_n = NULL; \ + type *_v = (val); \ + \ + if (_v) \ + (void) type##_get_ifname(_v, &_n); \ + log_interface_full_errno_zerook( \ + _n, LOG_DEBUG, _e, prefix fmt, \ + ##__VA_ARGS__); \ + } \ + -ERRNO_VALUE(_e); \ + }) + +#define log_interface_prefix_full_errno(prefix, type, val, error, fmt, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_interface_prefix_full_errno_zerook( \ + prefix, type, val, _error, fmt, ##__VA_ARGS__); \ + }) + +int get_ifname(int ifindex, char **ifname); + +usec_t unaligned_be32_sec_to_usec(const void *p, bool max_as_infinity); +usec_t be32_sec_to_usec(be32_t t, bool max_as_infinity); +usec_t be32_msec_to_usec(be32_t t, bool max_as_infinity); +usec_t be16_sec_to_usec(be16_t t, bool max_as_infinity); +be32_t usec_to_be32_sec(usec_t t); +be32_t usec_to_be32_msec(usec_t t); +be16_t usec_to_be16_sec(usec_t t); +usec_t time_span_to_stamp(usec_t span, usec_t base); + +bool network_test_mode_enabled(void); + +triple_timestamp* triple_timestamp_from_cmsg(triple_timestamp *t, struct msghdr *mh); +#define TRIPLE_TIMESTAMP_FROM_CMSG(mh) \ + triple_timestamp_from_cmsg(&(triple_timestamp) {}, mh) diff --git a/src/libsystemd-network/network-internal.c b/src/libsystemd-network/network-internal.c new file mode 100644 index 0000000..c8aa021 --- /dev/null +++ b/src/libsystemd-network/network-internal.c @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-ndisc.h" + +#include "alloc-util.h" +#include "dhcp-lease-internal.h" +#include "extract-word.h" +#include "hexdecoct.h" +#include "in-addr-util.h" +#include "log.h" +#include "network-internal.h" +#include "parse-util.h" + +size_t serialize_in_addrs(FILE *f, + const struct in_addr *addresses, + size_t size, 
+ bool *with_leading_space, + bool (*predicate)(const struct in_addr *addr)) { + assert(f); + assert(addresses); + + size_t count = 0; + bool _space = false; + if (!with_leading_space) + with_leading_space = &_space; + + for (size_t i = 0; i < size; i++) { + if (predicate && !predicate(&addresses[i])) + continue; + + if (*with_leading_space) + fputc(' ', f); + fputs(IN4_ADDR_TO_STRING(&addresses[i]), f); + count++; + *with_leading_space = true; + } + + return count; +} + +int deserialize_in_addrs(struct in_addr **ret, const char *string) { + _cleanup_free_ struct in_addr *addresses = NULL; + int size = 0; + + assert(ret); + assert(string); + + for (;;) { + _cleanup_free_ char *word = NULL; + struct in_addr *new_addresses; + int r; + + r = extract_first_word(&string, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + new_addresses = reallocarray(addresses, size + 1, sizeof(struct in_addr)); + if (!new_addresses) + return -ENOMEM; + else + addresses = new_addresses; + + r = inet_pton(AF_INET, word, &(addresses[size])); + if (r <= 0) + continue; + + size++; + } + + *ret = size > 0 ? 
TAKE_PTR(addresses) : NULL; + + return size; +} + +void serialize_in6_addrs(FILE *f, const struct in6_addr *addresses, size_t size, bool *with_leading_space) { + assert(f); + assert(addresses); + assert(size); + + bool _space = false; + if (!with_leading_space) + with_leading_space = &_space; + + for (size_t i = 0; i < size; i++) { + if (*with_leading_space) + fputc(' ', f); + fputs(IN6_ADDR_TO_STRING(&addresses[i]), f); + *with_leading_space = true; + } +} + +int deserialize_in6_addrs(struct in6_addr **ret, const char *string) { + _cleanup_free_ struct in6_addr *addresses = NULL; + int size = 0; + + assert(ret); + assert(string); + + for (;;) { + _cleanup_free_ char *word = NULL; + struct in6_addr *new_addresses; + int r; + + r = extract_first_word(&string, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + new_addresses = reallocarray(addresses, size + 1, sizeof(struct in6_addr)); + if (!new_addresses) + return -ENOMEM; + else + addresses = new_addresses; + + r = inet_pton(AF_INET6, word, &(addresses[size])); + if (r <= 0) + continue; + + size++; + } + + *ret = TAKE_PTR(addresses); + + return size; +} + +void serialize_dhcp_routes(FILE *f, const char *key, sd_dhcp_route **routes, size_t size) { + assert(f); + assert(key); + assert(routes); + assert(size); + + fprintf(f, "%s=", key); + + for (size_t i = 0; i < size; i++) { + struct in_addr dest, gw; + uint8_t length; + + assert_se(sd_dhcp_route_get_destination(routes[i], &dest) >= 0); + assert_se(sd_dhcp_route_get_gateway(routes[i], &gw) >= 0); + assert_se(sd_dhcp_route_get_destination_prefix_length(routes[i], &length) >= 0); + + fprintf(f, "%s,%s%s", + IN4_ADDR_PREFIX_TO_STRING(&dest, length), + IN4_ADDR_TO_STRING(&gw), + i < size - 1 ? 
" ": ""); + } + + fputs("\n", f); +} + +int deserialize_dhcp_routes(struct sd_dhcp_route **ret, size_t *ret_size, const char *string) { + _cleanup_free_ struct sd_dhcp_route *routes = NULL; + size_t size = 0; + + assert(ret); + assert(ret_size); + assert(string); + + /* WORD FORMAT: dst_ip/dst_prefixlen,gw_ip */ + for (;;) { + _cleanup_free_ char *word = NULL; + char *tok, *tok_end; + unsigned n; + int r; + + r = extract_first_word(&string, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + if (!GREEDY_REALLOC(routes, size + 1)) + return -ENOMEM; + + tok = word; + + /* get the subnet */ + tok_end = strchr(tok, '/'); + if (!tok_end) + continue; + *tok_end = '\0'; + + r = inet_aton(tok, &routes[size].dst_addr); + if (r == 0) + continue; + + tok = tok_end + 1; + + /* get the prefixlen */ + tok_end = strchr(tok, ','); + if (!tok_end) + continue; + + *tok_end = '\0'; + + r = safe_atou(tok, &n); + if (r < 0 || n > 32) + continue; + + routes[size].dst_prefixlen = (uint8_t) n; + tok = tok_end + 1; + + /* get the gateway */ + r = inet_aton(tok, &routes[size].gw_addr); + if (r == 0) + continue; + + size++; + } + + *ret_size = size; + *ret = TAKE_PTR(routes); + + return 0; +} + +int serialize_dhcp_option(FILE *f, const char *key, const void *data, size_t size) { + _cleanup_free_ char *hex_buf = NULL; + + assert(f); + assert(key); + assert(data); + + hex_buf = hexmem(data, size); + if (!hex_buf) + return -ENOMEM; + + fprintf(f, "%s=%s\n", key, hex_buf); + + return 0; +} diff --git a/src/libsystemd-network/network-internal.h b/src/libsystemd-network/network-internal.h new file mode 100644 index 0000000..5aa225e --- /dev/null +++ b/src/libsystemd-network/network-internal.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-dhcp-lease.h" + +size_t serialize_in_addrs(FILE *f, + const struct in_addr *addresses, + size_t size, + bool *with_leading_space, + bool (*predicate)(const struct in_addr 
*addr)); +int deserialize_in_addrs(struct in_addr **addresses, const char *string); +void serialize_in6_addrs(FILE *f, const struct in6_addr *addresses, + size_t size, + bool *with_leading_space); +int deserialize_in6_addrs(struct in6_addr **addresses, const char *string); + +/* don't include "dhcp-lease-internal.h" as it causes conflicts between netinet/ip.h and linux/ip.h */ +struct sd_dhcp_route; +struct sd_dhcp_lease; + +void serialize_dhcp_routes(FILE *f, const char *key, struct sd_dhcp_route **routes, size_t size); +int deserialize_dhcp_routes(struct sd_dhcp_route **ret, size_t *ret_size, const char *string); + +/* It is not necessary to add deserialize_dhcp_option(). Use unhexmem() instead. */ +int serialize_dhcp_option(FILE *f, const char *key, const void *data, size_t size); + +int dhcp_lease_save(sd_dhcp_lease *lease, const char *lease_file); +int dhcp_lease_load(sd_dhcp_lease **ret, const char *lease_file); diff --git a/src/libsystemd-network/radv-internal.h b/src/libsystemd-network/radv-internal.h new file mode 100644 index 0000000..d6cec90 --- /dev/null +++ b/src/libsystemd-network/radv-internal.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2017 Intel Corporation. All rights reserved. +***/ + +#include + +#include "sd-radv.h" + +#include "list.h" +#include "ndisc-protocol.h" +#include "network-common.h" +#include "sparse-endian.h" +#include "time-util.h" + +/* RFC 4861 section 6.2.1. + * MaxRtrAdvInterval + * The maximum time allowed between sending unsolicited multicast Router Advertisements from the + * interface, in seconds. MUST be no less than 4 seconds and no greater than 1800 seconds. + * Default: 600 seconds */ +#define RADV_MIN_MAX_TIMEOUT_USEC (4 * USEC_PER_SEC) +#define RADV_MAX_MAX_TIMEOUT_USEC (1800 * USEC_PER_SEC) +#define RADV_DEFAULT_MAX_TIMEOUT_USEC (600 * USEC_PER_SEC) +/* RFC 4861 section 6.2.1. 
+ * MinRtrAdvInterval + * The minimum time allowed between sending unsolicited multicast Router Advertisements from the + * interface, in seconds. MUST be no less than 3 seconds and no greater than .75 * MaxRtrAdvInterval. + * Default: 0.33 * MaxRtrAdvInterval If MaxRtrAdvInterval >= 9 seconds; otherwise, the Default is + * MaxRtrAdvInterval (Note, this should be a typo. We use 0.75 * MaxRtrAdvInterval). */ +#define RADV_MIN_MIN_TIMEOUT_USEC (3 * USEC_PER_SEC) +/* RFC 4861 section 6.2.4. + * AdvDefaultLifetime + * The value to be placed in the Router Lifetime field of Router Advertisements sent from the interface, + * in seconds. MUST be either zero or between MaxRtrAdvInterval and 9000 seconds. A value of zero + * indicates that the router is not to be used as a default router. These limits may be overridden by + * specific documents that describe how IPv6 operates over different link layers. For instance, in a + * point-to-point link the peers may have enough information about the number and status of devices at + * the other end so that advertisements are needed less frequently. + * Default: 3 * MaxRtrAdvInterval */ +#define RADV_MIN_ROUTER_LIFETIME_USEC RADV_MIN_MAX_TIMEOUT_USEC /* lower bound is tied to the smallest allowed MaxRtrAdvInterval; zero is handled separately by users of this macro (see the RFC text above) */ +#define RADV_MAX_ROUTER_LIFETIME_USEC (9000 * USEC_PER_SEC) +#define RADV_DEFAULT_ROUTER_LIFETIME_USEC (3 * RADV_DEFAULT_MAX_TIMEOUT_USEC) +/* RFC 4861 section 4.2. + * Retrans Timer + * 32-bit unsigned integer. The time, in milliseconds. */ +#define RADV_MAX_RETRANSMIT_USEC (UINT32_MAX * USEC_PER_MSEC) /* largest value of the 32-bit millisecond field, expressed as a usec timespan */ +/* draft-ietf-6man-slaac-renum-02 section 4.1.1. + * AdvPreferredLifetime: max(AdvDefaultLifetime, 3 * MaxRtrAdvInterval) + * AdvValidLifetime: 2 * AdvPreferredLifetime */ +#define RADV_DEFAULT_PREFERRED_LIFETIME_USEC CONST_MAX(RADV_DEFAULT_ROUTER_LIFETIME_USEC, 3 * RADV_DEFAULT_MAX_TIMEOUT_USEC) /* with the defaults above both arguments expand to 3 * RADV_DEFAULT_MAX_TIMEOUT_USEC */ +#define RADV_DEFAULT_VALID_LIFETIME_USEC (2 * RADV_DEFAULT_PREFERRED_LIFETIME_USEC) +/* RFC 4861 section 10. 
+ * MAX_INITIAL_RTR_ADVERT_INTERVAL 16 seconds + * MAX_INITIAL_RTR_ADVERTISEMENTS 3 transmissions + * MAX_FINAL_RTR_ADVERTISEMENTS 3 transmissions + * MIN_DELAY_BETWEEN_RAS 3 seconds + * MAX_RA_DELAY_TIME .5 seconds */ +#define RADV_MAX_INITIAL_RTR_ADVERT_INTERVAL_USEC (16 * USEC_PER_SEC) +#define RADV_MAX_INITIAL_RTR_ADVERTISEMENTS 3 /* a count of transmissions, not a timespan */ +#define RADV_MAX_FINAL_RTR_ADVERTISEMENTS 3 /* a count of transmissions, not a timespan */ +#define RADV_MIN_DELAY_BETWEEN_RAS 3 /* NOTE(review): bare number (seconds per the RFC list above), unlike the ..._USEC macros next to it - confirm users multiply by USEC_PER_SEC */ +#define RADV_MAX_RA_DELAY_TIME_USEC (500 * USEC_PER_MSEC) +/* From RFC 8781 section 4.1 + * By default, the value of the Scaled Lifetime field SHOULD be set to the lesser of 3 x MaxRtrAdvInterval divided by 8, or 8191. The macro below is the unscaled timespan (3 x the default MaxRtrAdvInterval) in usec. */ +#define RADV_PREF64_DEFAULT_LIFETIME_USEC (3 * RADV_DEFAULT_MAX_TIMEOUT_USEC) + +#define RADV_RDNSS_MAX_LIFETIME_USEC (UINT32_MAX * USEC_PER_SEC) +#define RADV_DNSSL_MAX_LIFETIME_USEC (UINT32_MAX * USEC_PER_SEC) +/* rfc6275 7.4 Neighbor Discovery Home Agent Lifetime. + * The default value is the same as the Router Lifetime. + * The maximum value corresponds to 18.2 hours. 0 MUST NOT be used. 
*/ +#define RADV_HOME_AGENT_MAX_LIFETIME_USEC (UINT16_MAX * USEC_PER_SEC) /* UINT16_MAX seconds, expressed as a usec timespan */ + +#define RADV_OPT_ROUTE_INFORMATION 24 +#define RADV_OPT_RDNSS 25 +#define RADV_OPT_DNSSL 31 +/* Pref64 option type (RFC8781, section 4) */ +#define RADV_OPT_PREF64 38 + +enum RAdvState { + RADV_STATE_IDLE = 0, + RADV_STATE_ADVERTISING = 1, +}; +typedef enum RAdvState RAdvState; + +struct sd_radv_opt_dns { /* packed 8-byte header; struct sd_radv below keeps its rdnss and dnssl buffers as pointers to this type */ + uint8_t type; + uint8_t length; + uint16_t reserved; + be32_t lifetime; /* stored big-endian (be32_t) */ +} _packed_; + +struct sd_radv { + unsigned n_ref; + RAdvState state; + + int ifindex; + char *ifname; + + sd_event *event; + int event_priority; + + struct ether_addr mac_addr; + uint8_t hop_limit; + uint8_t flags; + uint32_t mtu; + usec_t retransmit_usec; + usec_t lifetime_usec; /* timespan */ + + int fd; + unsigned ra_sent; + sd_event_source *recv_event_source; + sd_event_source *timeout_event_source; + + unsigned n_prefixes; /* presumably the element count of the list declared on the next line - TODO confirm against the list users */ + LIST_HEAD(sd_radv_prefix, prefixes); + + unsigned n_route_prefixes; + LIST_HEAD(sd_radv_route_prefix, route_prefixes); + + unsigned n_pref64_prefixes; + LIST_HEAD(sd_radv_pref64_prefix, pref64_prefixes); + + size_t n_rdnss; + struct sd_radv_opt_dns *rdnss; + struct sd_radv_opt_dns *dnssl; + + /* Mobile IPv6 extension: Home Agent Info. */ + struct nd_opt_home_agent_info home_agent; +}; + +#define radv_prefix_opt__contents { \ + uint8_t type; \ + uint8_t length; \ + uint8_t prefixlen; \ + uint8_t flags; \ + be32_t lifetime_valid; \ + be32_t lifetime_preferred; \ + uint32_t reserved; \ + struct in6_addr in6_addr; \ +} + +struct radv_prefix_opt radv_prefix_opt__contents; /* plain (non-_packed_) struct whose body is the field-list macro defined just above */ + +/* We need the opt substructure to be packed, because we use it in send(). But + * if we use _packed_, this means that the structure cannot be used directly in + * normal code in general, because the fields might not be properly aligned. + * But in this particular case, the structure is defined in a way that gives + * proper alignment, even without the explicit _packed_ attribute. 
To appease + * the compiler we use the "unpacked" structure, but we also verify that + * structure contains no holes, so offsets are the same when _packed_ is used. + */ +struct radv_prefix_opt__packed radv_prefix_opt__contents _packed_; +assert_cc(sizeof(struct radv_prefix_opt) == sizeof(struct radv_prefix_opt__packed)); + +struct sd_radv_prefix { + unsigned n_ref; + + struct radv_prefix_opt opt; + + LIST_FIELDS(struct sd_radv_prefix, prefix); + + /* These are timespans, NOT points in time. */ + usec_t lifetime_valid_usec; + usec_t lifetime_preferred_usec; + /* These are points in time specified with clock_boottime_or_monotonic(), NOT timespans. */ + usec_t valid_until; + usec_t preferred_until; +}; + +#define radv_route_prefix_opt__contents { \ + uint8_t type; \ + uint8_t length; \ + uint8_t prefixlen; \ + uint8_t flags_reserved; \ + be32_t lifetime; \ + struct in6_addr in6_addr; \ +} + +struct radv_route_prefix_opt radv_route_prefix_opt__contents; + +struct radv_route_prefix_opt__packed radv_route_prefix_opt__contents _packed_; +assert_cc(sizeof(struct radv_route_prefix_opt) == sizeof(struct radv_route_prefix_opt__packed)); + +struct sd_radv_route_prefix { + unsigned n_ref; + + struct radv_route_prefix_opt opt; + + LIST_FIELDS(struct sd_radv_route_prefix, prefix); + + /* This is a timespan, NOT a point in time. */ + usec_t lifetime_usec; + /* This is a point in time specified with clock_boottime_or_monotonic(), NOT a timespan. */ + usec_t valid_until; +}; + +struct sd_radv_pref64_prefix { + unsigned n_ref; + + struct nd_opt_prefix64_info opt; + + struct in6_addr in6_addr; + uint8_t prefixlen; + + usec_t lifetime_usec; + + LIST_FIELDS(struct sd_radv_pref64_prefix, prefix); +}; + +#define log_radv_errno(radv, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "RADV: ", \ + sd_radv, radv, \ + error, fmt, ##__VA_ARGS__) +#define log_radv(radv, fmt, ...) 
\ + log_interface_prefix_full_errno_zerook( \ + "RADV: ", \ + sd_radv, radv, \ + 0, fmt, ##__VA_ARGS__) diff --git a/src/libsystemd-network/sd-dhcp-client.c b/src/libsystemd-network/sd-dhcp-client.c new file mode 100644 index 0000000..24bcd74 --- /dev/null +++ b/src/libsystemd-network/sd-dhcp-client.c @@ -0,0 +1,2568 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-dhcp-client.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "dhcp-client-internal.h" +#include "dhcp-identifier.h" +#include "dhcp-lease-internal.h" +#include "dhcp-network.h" +#include "dhcp-option.h" +#include "dhcp-packet.h" +#include "dns-domain.h" +#include "ether-addr-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "hostname-util.h" +#include "iovec-util.h" +#include "memory-util.h" +#include "network-common.h" +#include "random-util.h" +#include "set.h" +#include "sort-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "utf8.h" +#include "web-util.h" + +#define MAX_CLIENT_ID_LEN (sizeof(uint32_t) + MAX_DUID_LEN) /* Arbitrary limit */ +#define MAX_MAC_ADDR_LEN CONST_MAX(INFINIBAND_ALEN, ETH_ALEN) + +#define RESTART_AFTER_NAK_MIN_USEC (1 * USEC_PER_SEC) +#define RESTART_AFTER_NAK_MAX_USEC (30 * USEC_PER_MINUTE) + +#define TRANSIENT_FAILURE_ATTEMPTS 3 /* Arbitrary limit: how many attempts are considered enough to report + * transient failure. 
*/ + +typedef struct sd_dhcp_client_id { + uint8_t type; + union { + struct { + /* 0: Generic (non-LL) (RFC 2132) */ + uint8_t data[MAX_CLIENT_ID_LEN]; + } _packed_ gen; + struct { + /* 1: Ethernet Link-Layer (RFC 2132) */ + uint8_t haddr[ETH_ALEN]; + } _packed_ eth; + struct { + /* 2 - 254: ARP/Link-Layer (RFC 2132) */ + uint8_t haddr[0]; + } _packed_ ll; + struct { + /* 255: Node-specific (RFC 4361) */ + be32_t iaid; + struct duid duid; + } _packed_ ns; + struct { + uint8_t data[MAX_CLIENT_ID_LEN]; + } _packed_ raw; + }; +} _packed_ sd_dhcp_client_id; + +struct sd_dhcp_client { + unsigned n_ref; + + DHCPState state; + sd_event *event; + int event_priority; + sd_event_source *timeout_resend; + + int ifindex; + char *ifname; + + sd_device *dev; + + int fd; + uint16_t port; + union sockaddr_union link; + sd_event_source *receive_message; + bool request_broadcast; + Set *req_opts; + bool anonymize; + bool rapid_commit; + be32_t last_addr; + struct hw_addr_data hw_addr; + struct hw_addr_data bcast_addr; + uint16_t arp_type; + sd_dhcp_client_id client_id; + size_t client_id_len; + char *hostname; + char *vendor_class_identifier; + char *mudurl; + char **user_class; + uint32_t mtu; + usec_t fallback_lease_lifetime; + uint32_t xid; + usec_t start_time; + usec_t t1_time; + usec_t t2_time; + usec_t expire_time; + uint64_t discover_attempt; + uint64_t request_attempt; + uint64_t max_discover_attempts; + uint64_t max_request_attempts; + OrderedHashmap *extra_options; + OrderedHashmap *vendor_options; + sd_event_source *timeout_t1; + sd_event_source *timeout_t2; + sd_event_source *timeout_expire; + sd_event_source *timeout_ipv6_only_mode; + sd_dhcp_client_callback_t callback; + void *userdata; + sd_dhcp_client_callback_t state_callback; + void *state_userdata; + sd_dhcp_lease *lease; + usec_t start_delay; + int ip_service_type; + int socket_priority; + bool socket_priority_set; + bool ipv6_acquired; +}; + +static const uint8_t default_req_opts[] = { + 
SD_DHCP_OPTION_SUBNET_MASK, + SD_DHCP_OPTION_ROUTER, + SD_DHCP_OPTION_HOST_NAME, + SD_DHCP_OPTION_DOMAIN_NAME, + SD_DHCP_OPTION_DOMAIN_NAME_SERVER, +}; + +/* RFC7844 section 3: + MAY contain the Parameter Request List option. + RFC7844 section 3.6: + The client intending to protect its privacy SHOULD only request a + minimal number of options in the PRL and SHOULD also randomly shuffle + the ordering of option codes in the PRL. If this random ordering + cannot be implemented, the client MAY order the option codes in the + PRL by option code number (lowest to highest). +*/ +/* NOTE: using PRL options that Windows 10 RFC7844 implementation uses */ +static const uint8_t default_req_opts_anonymize[] = { + SD_DHCP_OPTION_SUBNET_MASK, /* 1 */ + SD_DHCP_OPTION_ROUTER, /* 3 */ + SD_DHCP_OPTION_DOMAIN_NAME_SERVER, /* 6 */ + SD_DHCP_OPTION_DOMAIN_NAME, /* 15 */ + SD_DHCP_OPTION_ROUTER_DISCOVERY, /* 31 */ + SD_DHCP_OPTION_STATIC_ROUTE, /* 33 */ + SD_DHCP_OPTION_VENDOR_SPECIFIC, /* 43 */ + SD_DHCP_OPTION_NETBIOS_NAME_SERVER, /* 44 */ + SD_DHCP_OPTION_NETBIOS_NODE_TYPE, /* 46 */ + SD_DHCP_OPTION_NETBIOS_SCOPE, /* 47 */ + SD_DHCP_OPTION_CLASSLESS_STATIC_ROUTE, /* 121 */ + SD_DHCP_OPTION_PRIVATE_CLASSLESS_STATIC_ROUTE, /* 249 */ + SD_DHCP_OPTION_PRIVATE_PROXY_AUTODISCOVERY, /* 252 */ +}; + +static int client_receive_message_raw( + sd_event_source *s, + int fd, + uint32_t revents, + void *userdata); +static int client_receive_message_udp( + sd_event_source *s, + int fd, + uint32_t revents, + void *userdata); +static void client_stop(sd_dhcp_client *client, int error); +static int client_restart(sd_dhcp_client *client); + +int sd_dhcp_client_id_to_string(const void *data, size_t len, char **ret) { + const sd_dhcp_client_id *client_id = data; + _cleanup_free_ char *t = NULL; + int r = 0; + + assert_return(data, -EINVAL); + assert_return(len >= 1, -EINVAL); + assert_return(ret, -EINVAL); + + len -= 1; + if (len > MAX_CLIENT_ID_LEN) + return -EINVAL; + + switch (client_id->type) { + 
case 0: + if (utf8_is_printable((char *) client_id->gen.data, len)) + r = asprintf(&t, "%.*s", (int) len, client_id->gen.data); + else + r = asprintf(&t, "DATA"); + break; + case 1: + if (len == sizeof_field(sd_dhcp_client_id, eth)) + r = asprintf(&t, "%02x:%02x:%02x:%02x:%02x:%02x", + client_id->eth.haddr[0], + client_id->eth.haddr[1], + client_id->eth.haddr[2], + client_id->eth.haddr[3], + client_id->eth.haddr[4], + client_id->eth.haddr[5]); + else + r = asprintf(&t, "ETHER"); + break; + case 2 ... 254: + r = asprintf(&t, "ARP/LL"); + break; + case 255: + if (len < sizeof(uint32_t)) + r = asprintf(&t, "IAID/DUID"); + else { + uint32_t iaid = be32toh(client_id->ns.iaid); + /* TODO: check and stringify DUID */ + r = asprintf(&t, "IAID:0x%x/DUID", iaid); + } + break; + } + if (r < 0) + return -ENOMEM; + + *ret = TAKE_PTR(t); + return 0; +} + +int dhcp_client_set_state_callback( + sd_dhcp_client *client, + sd_dhcp_client_callback_t cb, + void *userdata) { + + assert_return(client, -EINVAL); + + client->state_callback = cb; + client->state_userdata = userdata; + + return 0; +} + +int sd_dhcp_client_set_callback( + sd_dhcp_client *client, + sd_dhcp_client_callback_t cb, + void *userdata) { + + assert_return(client, -EINVAL); + + client->callback = cb; + client->userdata = userdata; + + return 0; +} + +int sd_dhcp_client_set_request_broadcast(sd_dhcp_client *client, int broadcast) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + client->request_broadcast = broadcast; + + return 0; +} + +int sd_dhcp_client_set_request_option(sd_dhcp_client *client, uint8_t option) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + switch (option) { + + case SD_DHCP_OPTION_PAD: + case SD_DHCP_OPTION_OVERLOAD: + case SD_DHCP_OPTION_MESSAGE_TYPE: + case SD_DHCP_OPTION_PARAMETER_REQUEST_LIST: + case SD_DHCP_OPTION_END: + return -EINVAL; + + default: + break; + } + + return 
set_ensure_put(&client->req_opts, NULL, UINT8_TO_PTR(option)); +} + +static int client_request_contains(sd_dhcp_client *client, uint8_t option) { + assert(client); + + return set_contains(client->req_opts, UINT8_TO_PTR(option)); +} + +int sd_dhcp_client_set_request_address( + sd_dhcp_client *client, + const struct in_addr *last_addr) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + if (last_addr) + client->last_addr = last_addr->s_addr; + else + client->last_addr = INADDR_ANY; + + return 0; +} + +int sd_dhcp_client_set_ifindex(sd_dhcp_client *client, int ifindex) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(ifindex > 0, -EINVAL); + + client->ifindex = ifindex; + return 0; +} + +int sd_dhcp_client_set_ifname(sd_dhcp_client *client, const char *ifname) { + assert_return(client, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&client->ifname, ifname); +} + +int sd_dhcp_client_get_ifname(sd_dhcp_client *client, const char **ret) { + int r; + + assert_return(client, -EINVAL); + + r = get_ifname(client->ifindex, &client->ifname); + if (r < 0) + return r; + + if (ret) + *ret = client->ifname; + + return 0; +} + +int sd_dhcp_client_set_mac( + sd_dhcp_client *client, + const uint8_t *hw_addr, + const uint8_t *bcast_addr, + size_t addr_len, + uint16_t arp_type) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(IN_SET(arp_type, ARPHRD_ETHER, ARPHRD_INFINIBAND), -EINVAL); + assert_return(hw_addr, -EINVAL); + assert_return(addr_len == (arp_type == ARPHRD_ETHER ? ETH_ALEN : INFINIBAND_ALEN), -EINVAL); + + client->arp_type = arp_type; + hw_addr_set(&client->hw_addr, hw_addr, addr_len); + hw_addr_set(&client->bcast_addr, bcast_addr, bcast_addr ? 
addr_len : 0); + + return 0; +} + +int sd_dhcp_client_get_client_id( + sd_dhcp_client *client, + uint8_t *ret_type, + const uint8_t **ret_data, + size_t *ret_data_len) { + + assert_return(client, -EINVAL); + + if (client->client_id_len > 0) { + if (client->client_id_len <= offsetof(sd_dhcp_client_id, raw.data)) + return -EINVAL; + + if (ret_type) + *ret_type = client->client_id.type; + if (ret_data) + *ret_data = client->client_id.raw.data; + if (ret_data_len) + *ret_data_len = client->client_id_len - offsetof(sd_dhcp_client_id, raw.data); + return 1; + } + + if (ret_type) + *ret_type = 0; + if (ret_data) + *ret_data = NULL; + if (ret_data_len) + *ret_data_len = 0; + + return 0; +} + +int sd_dhcp_client_set_client_id( + sd_dhcp_client *client, + uint8_t type, + const uint8_t *data, + size_t data_len) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(data, -EINVAL); + assert_return(data_len > 0 && data_len <= MAX_CLIENT_ID_LEN, -EINVAL); + + /* For hardware types, log debug message about unexpected data length. + * + * Note that infiniband's INFINIBAND_ALEN is 20 bytes long, but only + * the last 8 bytes of the address are stable and suitable to put into + * the client-id. The caller is advised to account for that. */ + if ((type == ARPHRD_ETHER && data_len != ETH_ALEN) || + (type == ARPHRD_INFINIBAND && data_len != 8)) + log_dhcp_client(client, + "Changing client ID to hardware type %u with unexpected address length %zu", + type, data_len); + + client->client_id.type = type; + memcpy(&client->client_id.raw.data, data, data_len); + client->client_id_len = data_len + sizeof (client->client_id.type); + + return 0; +} + +/** + * Sets IAID and DUID. If duid is non-null, the DUID is set to duid_type + duid + * without further modification. Otherwise, if duid_type is supported, DUID + * is set based on that type. Otherwise, an error is returned. 
+ */ +/* NOTE(review): the header comment above looks stale. This helper only zeroes the client ID, sets its type to 255 and fills in the IAID: the caller's value in big-endian order when iaid_set, otherwise whatever dhcp_identifier_set_iaid() produces. The DUID is filled in by the sd_dhcp_client_set_iaid_duid_*() callers. */ static int dhcp_client_set_iaid( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid) { + + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + /* discard any previously configured client ID */ zero(client->client_id); + client->client_id.type = 255; + + if (iaid_set) + client->client_id.ns.iaid = htobe32(iaid); + else { + r = dhcp_identifier_set_iaid(client->dev, &client->hw_addr, + /* legacy_unstable_byteorder = */ true, + &client->client_id.ns.iaid); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to set IAID: %m"); + } + + return 0; +} + +/* Sets the IAID, then a DUID-LLT built from the hardware address, ARP type and llt_time. client_id_len ends up as type + IAID + DUID length. */ int sd_dhcp_client_set_iaid_duid_llt( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid, + usec_t llt_time) { + + size_t len; + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + r = dhcp_client_set_iaid(client, iaid_set, iaid); + if (r < 0) + return r; + + r = dhcp_identifier_set_duid_llt(&client->hw_addr, client->arp_type, llt_time, &client->client_id.ns.duid, &len); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to set DUID-LLT: %m"); + + client->client_id_len = sizeof(client->client_id.type) + sizeof(client->client_id.ns.iaid) + len; + + return 0; +} + +/* Sets the IAID, then a DUID-LL built from the hardware address and ARP type. client_id_len ends up as type + IAID + DUID length. */ int sd_dhcp_client_set_iaid_duid_ll( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid) { + + size_t len; + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + r = dhcp_client_set_iaid(client, iaid_set, iaid); + if (r < 0) + return r; + + r = dhcp_identifier_set_duid_ll(&client->hw_addr, client->arp_type, &client->client_id.ns.duid, &len); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to set DUID-LL: %m"); + + client->client_id_len = sizeof(client->client_id.type) + sizeof(client->client_id.ns.iaid) + len; + + return 0; +} + +/* Sets the IAID, then a DUID-EN obtained from dhcp_identifier_set_duid_en(). */ int sd_dhcp_client_set_iaid_duid_en( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid) { + + size_t len; + int r; + + assert_return(client, 
-EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + r = dhcp_client_set_iaid(client, iaid_set, iaid); + if (r < 0) + return r; + + r = dhcp_identifier_set_duid_en(&client->client_id.ns.duid, &len); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to set DUID-EN: %m"); + + client->client_id_len = sizeof(client->client_id.type) + sizeof(client->client_id.ns.iaid) + len; + + return 0; +} + +int sd_dhcp_client_set_iaid_duid_uuid( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid) { + + size_t len; + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + r = dhcp_client_set_iaid(client, iaid_set, iaid); + if (r < 0) + return r; + + r = dhcp_identifier_set_duid_uuid(&client->client_id.ns.duid, &len); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to set DUID-UUID: %m"); + + client->client_id_len = sizeof(client->client_id.type) + sizeof(client->client_id.ns.iaid) + len; + + return 0; +} + +int sd_dhcp_client_set_iaid_duid_raw( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid, + uint16_t duid_type, + const uint8_t *duid, + size_t duid_len) { + + size_t len; + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(duid || duid_len == 0, -EINVAL); + + r = dhcp_client_set_iaid(client, iaid_set, iaid); + if (r < 0) + return r; + + r = dhcp_identifier_set_duid_raw(duid_type, duid, duid_len, &client->client_id.ns.duid, &len); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to set DUID: %m"); + + client->client_id_len = sizeof(client->client_id.type) + sizeof(client->client_id.ns.iaid) + len; + + return 0; +} + +int sd_dhcp_client_set_rapid_commit(sd_dhcp_client *client, bool rapid_commit) { + assert_return(client, -EINVAL); + + client->rapid_commit = !client->anonymize && rapid_commit; + return 0; +} + +int sd_dhcp_client_set_hostname( + sd_dhcp_client *client, + const char 
*hostname) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + /* Make sure hostnames qualify as DNS and as Linux hostnames */ + if (hostname && + !(hostname_is_valid(hostname, 0) && dns_name_is_valid(hostname) > 0)) + return -EINVAL; + + return free_and_strdup(&client->hostname, hostname); +} + +/* Stores a copy of the vendor class identifier string; no validation is done on vci here. */ int sd_dhcp_client_set_vendor_class_identifier( + sd_dhcp_client *client, + const char *vci) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + return free_and_strdup(&client->vendor_class_identifier, vci); +} + +/* Stores a copy of the MUD URL. It must be non-NULL, at most 255 bytes long and accepted by http_url_is_valid(). */ int sd_dhcp_client_set_mud_url( + sd_dhcp_client *client, + const char *mudurl) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(mudurl, -EINVAL); + assert_return(strlen(mudurl) <= 255, -EINVAL); + assert_return(http_url_is_valid(mudurl), -EINVAL); + + return free_and_strdup(&client->mudurl, mudurl); +} + +/* Replaces the user class list with a copy of user_class. The list must be non-empty and every entry must be 1 to 255 bytes long; all entries are checked before anything is copied. */ int sd_dhcp_client_set_user_class( + sd_dhcp_client *client, + char * const *user_class) { + + char **s = NULL; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(!strv_isempty(user_class), -EINVAL); + + STRV_FOREACH(p, user_class) { + size_t n = strlen(*p); + + if (n > 255 || n == 0) + return -EINVAL; + } + + s = strv_copy(user_class); + if (!s) + return -ENOMEM; + + return strv_free_and_replace(client->user_class, s); +} + +/* Stores the client-side port number in client->port. */ int sd_dhcp_client_set_client_port( + sd_dhcp_client *client, + uint16_t port) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + client->port = port; + + return 0; +} + +/* Sets the MTU; values below DHCP_MIN_PACKET_SIZE are rejected with -ERANGE. Deliberately allowed while the client is running. */ int sd_dhcp_client_set_mtu(sd_dhcp_client *client, uint32_t mtu) { + assert_return(client, -EINVAL); + assert_return(mtu >= DHCP_MIN_PACKET_SIZE, -ERANGE); + + /* MTU may be changed by the acquired lease. Hence, we cannot require that the client is stopped here. 
+ * Please do not add assertion for !sd_dhcp_client_is_running(client) here. */ + + client->mtu = mtu; + + return 0; +} + +/* Sets max_discover_attempts, the limit the resend timer compares discover_attempt against. */ int sd_dhcp_client_set_max_attempts(sd_dhcp_client *client, uint64_t max_attempts) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + client->max_discover_attempts = max_attempts; + + return 0; +} + +/* Registers an extra option keyed by its option code and takes a reference on it. Returns 0 on success or the error from ordered_hashmap_ensure_put(). */ int sd_dhcp_client_add_option(sd_dhcp_client *client, sd_dhcp_option *v) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(v, -EINVAL); + + r = ordered_hashmap_ensure_put(&client->extra_options, &dhcp_option_hash_ops, UINT_TO_PTR(v->option), v); + if (r < 0) + return r; + + /* the map now holds the option, so keep it alive */ sd_dhcp_option_ref(v); + return 0; +} + +/* Registers a vendor option, using the option object itself as the key, and takes a reference on it. Returns 1 on success (sd_dhcp_client_add_option() returns 0). */ int sd_dhcp_client_add_vendor_option(sd_dhcp_client *client, sd_dhcp_option *v) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(v, -EINVAL); + + r = ordered_hashmap_ensure_allocated(&client->vendor_options, &dhcp_option_hash_ops); + if (r < 0) + /* NOTE(review): the actual error in r is replaced by -ENOMEM here */ return -ENOMEM; + + r = ordered_hashmap_put(client->vendor_options, v, v); + if (r < 0) + return r; + + sd_dhcp_option_ref(v); + + return 1; +} + +/* Hands back the current lease pointer without taking a reference, or returns -EADDRNOTAVAIL when there is no lease. ret may be NULL to merely test for a lease. */ int sd_dhcp_client_get_lease(sd_dhcp_client *client, sd_dhcp_lease **ret) { + assert_return(client, -EINVAL); + + if (!client->lease) + return -EADDRNOTAVAIL; + + if (ret) + *ret = client->lease; + + return 0; +} + +/* Stores the IP service type in client->ip_service_type; the value is not range-checked here. */ int sd_dhcp_client_set_service_type(sd_dhcp_client *client, int type) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + client->ip_service_type = type; + + return 0; +} + +/* Stores the socket priority and marks it as explicitly set. */ int sd_dhcp_client_set_socket_priority(sd_dhcp_client *client, int socket_priority) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + client->socket_priority_set = true; + client->socket_priority = socket_priority; + + return 0; +} + +int 
sd_dhcp_client_set_fallback_lease_lifetime(sd_dhcp_client *client, uint64_t fallback_lease_lifetime) { + /* Stores a non-zero fallback lease lifetime. The static assertion below guarantees the uint64_t argument fits the usec_t field. */ assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + assert_return(fallback_lease_lifetime > 0, -EINVAL); + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + client->fallback_lease_lifetime = fallback_lease_lifetime; + + return 0; +} + +/* Switches to a new state, logs the transition and invokes the state callback if one is set. Does nothing when the state is unchanged. */ static void client_set_state(sd_dhcp_client *client, DHCPState state) { + assert(client); + + if (client->state == state) + return; + + log_dhcp_client(client, "State changed: %s -> %s", + dhcp_state_to_string(client->state), dhcp_state_to_string(state)); + + client->state = state; + + if (client->state_callback) + client->state_callback(client, state, client->state_userdata); +} + +/* Returns the current state value, or -EINVAL for a NULL client. */ int dhcp_client_get_state(sd_dhcp_client *client) { + assert_return(client, -EINVAL); + + return client->state; +} + +/* Forwards an event to the user callback and returns its result; returns 0 when no callback is installed. */ static int client_notify(sd_dhcp_client *client, int event) { + assert(client); + + if (client->callback) + return client->callback(client, event, client->userdata); + + return 0; +} + +/* Brings the client back to DHCP_STATE_STOPPED: drops the receive event source, closes the socket, disables all timers, clears the attempt counters and xid, and releases the lease. */ static int client_initialize(sd_dhcp_client *client) { + assert_return(client, -EINVAL); + + client->receive_message = sd_event_source_disable_unref(client->receive_message); + + client->fd = safe_close(client->fd); + + (void) event_source_disable(client->timeout_resend); + (void) event_source_disable(client->timeout_t1); + (void) event_source_disable(client->timeout_t2); + (void) event_source_disable(client->timeout_expire); + (void) event_source_disable(client->timeout_ipv6_only_mode); + + client->discover_attempt = 0; + client->request_attempt = 0; + + client_set_state(client, DHCP_STATE_STOPPED); + client->xid = 0; + + client->lease = sd_dhcp_lease_unref(client->lease); + + return 0; +} + +/* Logs why the client is stopping (errno-style when error < 0), notifies the callback with 'error', then resets the client via client_initialize(). */ static void client_stop(sd_dhcp_client *client, int error) { + assert(client); + + if (error < 0) + log_dhcp_client_errno(client, error, "STOPPED: %m"); + else if (error == SD_DHCP_CLIENT_EVENT_STOP) + 
log_dhcp_client(client, "STOPPED"); + else + log_dhcp_client(client, "STOPPED: Unknown event"); + + /* the callback is notified before the client state is torn down */ client_notify(client, error); + + client_initialize(client); +} + +/* RFC2131 section 4.1: + * retransmission delays should include -1 to +1 sec of random 'fuzz'. */ +#define RFC2131_RANDOM_FUZZ \ + ((int64_t)(random_u64() % (2 * USEC_PER_SEC)) - (int64_t)USEC_PER_SEC) + +/* RFC2131 section 4.1: + * for retransmission delays, timeout should start at 4s then double + * each attempt with max of 64s, with -1 to +1 sec of random 'fuzz' added. + * This assumes the first call will be using attempt 1. */ +static usec_t client_compute_request_timeout(usec_t now, uint64_t attempt) { + /* base delay is 2^(attempt + 1) seconds, with the exponent capped at 6 (64 seconds) */ usec_t timeout = (UINT64_C(1) << MIN(attempt + 1, UINT64_C(6))) * USEC_PER_SEC; + + /* the fuzz is signed, so it can shift the deadline in either direction */ return usec_sub_signed(usec_add(now, timeout), RFC2131_RANDOM_FUZZ); +} + +/* RFC2131 section 4.4.5: + * T1 defaults to (0.5 * duration_of_lease). + * T2 defaults to (0.875 * duration_of_lease). */ +#define T1_DEFAULT(lifetime) ((lifetime) / 2) +#define T2_DEFAULT(lifetime) (((lifetime) * 7) / 8) + +/* RFC2131 section 4.4.5: + * the client SHOULD wait one-half of the remaining time until T2 (in RENEWING state) + * and one-half of the remaining lease time (in REBINDING state), down to a minimum + * of 60 seconds. + * Note that while the default T1/T2 initial times do have random 'fuzz' applied, + * the RFC sec 4.4.5 does not mention adding any fuzz to retries. 
*/ +static usec_t client_compute_reacquisition_timeout(usec_t now, usec_t expire) { + /* half of the time left until 'expire', but never less than 60 seconds from now */ return now + MAX(usec_sub_unsigned(expire, now) / 2, 60 * USEC_PER_SEC); +} + +/* Three-way comparison of two bytes; used as the typesafe_qsort() comparator for the parameter request list. */ static int cmp_uint8(const uint8_t *a, const uint8_t *b) { + return CMP(*a, *b); +} + +/* Allocates a DHCPPacket with DHCP_MIN_OPTIONS_SIZE bytes of option space and fills in what all client messages share: the DHCP header, 'secs', the broadcast flag, the client identifier, the parameter request list and the maximum message size. On success the packet is handed to the caller in *ret, with the option buffer length in *_optlen and the current option offset in *_optoffset. */ static int client_message_init( + sd_dhcp_client *client, + DHCPPacket **ret, + uint8_t type, + size_t *_optlen, + size_t *_optoffset) { + + _cleanup_free_ DHCPPacket *packet = NULL; + size_t optlen, optoffset, size; + usec_t time_now; + uint16_t secs; + int r; + + assert(client); + assert(client->start_time); + assert(ret); + assert(_optlen); + assert(_optoffset); + assert(IN_SET(type, DHCP_DISCOVER, DHCP_REQUEST, DHCP_RELEASE, DHCP_DECLINE)); + + optlen = DHCP_MIN_OPTIONS_SIZE; + size = sizeof(DHCPPacket) + optlen; + + packet = malloc0(size); + if (!packet) + return -ENOMEM; + + r = dhcp_message_init(&packet->dhcp, BOOTREQUEST, client->xid, type, + client->arp_type, client->hw_addr.length, client->hw_addr.bytes, + optlen, &optoffset); + if (r < 0) + return r; + + /* Although 'secs' field is a SHOULD in RFC 2131, certain DHCP servers + refuse to issue a DHCP lease if 'secs' is set to zero */ + r = sd_event_now(client->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + return r; + assert(time_now >= client->start_time); + + /* seconds between sending first and last DISCOVER + * must always be strictly positive to deal with broken servers */ + /* NOTE(review): 'secs' is uint16_t, so the "?: 1" guard is applied before the value is truncated; an elapsed time that is a multiple of 65536 seconds would still become 0. Confirm whether clamping is wanted. */ secs = ((time_now - client->start_time) / USEC_PER_SEC) ?: 1; + packet->dhcp.secs = htobe16(secs); + + /* RFC2131 section 4.1 + A client that cannot receive unicast IP datagrams until its protocol + software has been configured with an IP address SHOULD set the + BROADCAST bit in the 'flags' field to 1 in any DHCPDISCOVER or + DHCPREQUEST messages that client sends. The BROADCAST bit will + provide a hint to the DHCP server and BOOTP relay agent to broadcast + any messages to the client on the client's subnet. 
+ + Note: some interfaces needs this to be enabled, but some networks + needs this to be disabled as broadcasts are filteretd, so this + needs to be configurable */ + /* the broadcast bit is set when explicitly requested, and always for non-Ethernet ARP types */ if (client->request_broadcast || client->arp_type != ARPHRD_ETHER) + packet->dhcp.flags = htobe16(0x8000); + + /* Some DHCP servers will refuse to issue a DHCP lease if the Client + Identifier option is not set */ + r = dhcp_option_append(&packet->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_CLIENT_IDENTIFIER, + client->client_id_len, + &client->client_id); + if (r < 0) + return r; + + /* RFC2131 section 3.5: + in its initial DHCPDISCOVER or DHCPREQUEST message, a + client may provide the server with a list of specific + parameters the client is interested in. If the client + includes a list of parameters in a DHCPDISCOVER message, + it MUST include that list in any subsequent DHCPREQUEST + messages. + */ + + /* RFC7844 section 3: + MAY contain the Parameter Request List option. */ + /* NOTE: in case that there would be an option to do not send + * any PRL at all, the size should be checked before sending */ + if (!set_isempty(client->req_opts) && type != DHCP_RELEASE) { + _cleanup_free_ uint8_t *opts = NULL; + size_t n_opts, i = 0; + void *val; + + n_opts = set_size(client->req_opts); + opts = new(uint8_t, n_opts); + if (!opts) + return -ENOMEM; + + /* flatten the set of requested option codes into a byte array */ SET_FOREACH(val, client->req_opts) + opts[i++] = PTR_TO_UINT8(val); + assert(i == n_opts); + + /* For anonymizing the request, let's sort the options. */ + typesafe_qsort(opts, n_opts, cmp_uint8); + + r = dhcp_option_append(&packet->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_PARAMETER_REQUEST_LIST, + n_opts, opts); + if (r < 0) + return r; + } + + /* RFC2131 section 3.5: + The client SHOULD include the ’maximum DHCP message size’ option to + let the server know how large the server may make its DHCP messages. 
+ + Note (from ConnMan): Some DHCP servers will send bigger DHCP packets + than the defined default size unless the Maximum Message Size option + is explicitly set + + RFC3442 "Requirements to Avoid Sizing Constraints": + Because a full routing table can be quite large, the standard 576 + octet maximum size for a DHCP message may be too short to contain + some legitimate Classless Static Route options. Because of this, + clients implementing the Classless Static Route option SHOULD send a + Maximum DHCP Message Size [4] option if the DHCP client's TCP/IP + stack is capable of receiving larger IP datagrams. In this case, the + client SHOULD set the value of this option to at least the MTU of the + interface that the client is configuring. The client MAY set the + value of this option higher, up to the size of the largest UDP packet + it is prepared to accept. (Note that the value specified in the + Maximum DHCP Message Size option is the total maximum packet size, + including IP and UDP headers.) + */ + /* RFC7844 section 3: + SHOULD NOT contain any other option. 
*/ + if (!client->anonymize && IN_SET(type, DHCP_DISCOVER, DHCP_REQUEST)) { + /* advertised size is client->mtu minus DHCP_IP_UDP_SIZE, capped at UINT16_MAX so it fits the 2-byte option */ be16_t max_size = htobe16(MIN(client->mtu - DHCP_IP_UDP_SIZE, (uint32_t) UINT16_MAX)); + r = dhcp_option_append(&packet->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_MAXIMUM_MESSAGE_SIZE, + 2, &max_size); + if (r < 0) + return r; + } + + *_optlen = optlen; + *_optoffset = optoffset; + /* hand the packet to the caller so the cleanup attribute does not free it */ *ret = TAKE_PTR(packet); + + return 0; +} + +/* Appends an FQDN option made of three flag/RCODE bytes followed by the name in DNS wire format. If dns_name_to_wire_format() returns 0 or an error, nothing is appended and that value is returned as is. */ static int client_append_fqdn_option( + DHCPMessage *message, + size_t optlen, + size_t *optoffset, + const char *fqdn) { + + uint8_t buffer[3 + DHCP_MAX_FQDN_LENGTH]; + int r; + + buffer[0] = DHCP_FQDN_FLAG_S | /* Request server to perform A RR DNS updates */ + DHCP_FQDN_FLAG_E; /* Canonical wire format */ + buffer[1] = 0; /* RCODE1 (deprecated) */ + buffer[2] = 0; /* RCODE2 (deprecated) */ + + r = dns_name_to_wire_format(fqdn, buffer + 3, sizeof(buffer) - 3, false); + if (r > 0) + /* a positive r is used as the encoded name length; the 3 header bytes are added on top */ r = dhcp_option_append(message, optlen, optoffset, 0, + SD_DHCP_OPTION_FQDN, 3 + r, buffer); + + return r; +} + +/* Fills in the IP/UDP headers (source INADDR_ANY and client->port, destination INADDR_BROADCAST and DHCP_PORT_SERVER) and sends the packet through the raw socket. */ static int dhcp_client_send_raw( + sd_dhcp_client *client, + DHCPPacket *packet, + size_t len) { + + dhcp_packet_append_ip_headers(packet, INADDR_ANY, client->port, + INADDR_BROADCAST, DHCP_PORT_SERVER, len, client->ip_service_type); + + return dhcp_network_send_raw_socket(client->fd, &client->link, + packet, len); +} + +/* Appends the options shared by DISCOVER and REQUEST messages: host name or FQDN, vendor class identifier, MUD URL, user class, caller-supplied extra options and vendor-specific options. Each is emitted only if configured; the first append failure is returned. */ static int client_append_common_discover_request_options(sd_dhcp_client *client, DHCPPacket *packet, size_t *optoffset, size_t optlen) { + sd_dhcp_option *j; + int r; + + assert(client); + + if (client->hostname) { + /* According to RFC 4702 "clients that send the Client FQDN option in + their messages MUST NOT also send the Host Name option". Just send + one of the two depending on the hostname type. 
+ */ + if (dns_name_is_single_label(client->hostname)) { + /* it is unclear from RFC 2131 if client should send hostname in + DHCPDISCOVER but dhclient does and so we do as well + */ + r = dhcp_option_append(&packet->dhcp, optlen, optoffset, 0, + SD_DHCP_OPTION_HOST_NAME, + strlen(client->hostname), client->hostname); + } else + r = client_append_fqdn_option(&packet->dhcp, optlen, optoffset, + client->hostname); + if (r < 0) + return r; + } + + if (client->vendor_class_identifier) { + r = dhcp_option_append(&packet->dhcp, optlen, optoffset, 0, + SD_DHCP_OPTION_VENDOR_CLASS_IDENTIFIER, + strlen(client->vendor_class_identifier), + client->vendor_class_identifier); + if (r < 0) + return r; + } + + if (client->mudurl) { + r = dhcp_option_append(&packet->dhcp, optlen, optoffset, 0, + SD_DHCP_OPTION_MUD_URL, + strlen(client->mudurl), + client->mudurl); + if (r < 0) + return r; + } + + if (client->user_class) { + /* the string vector itself and its entry count are passed here, not a byte buffer and byte length; presumably dhcp_option_append() special-cases this option code (verify) */ r = dhcp_option_append(&packet->dhcp, optlen, optoffset, 0, + SD_DHCP_OPTION_USER_CLASS, + strv_length(client->user_class), + client->user_class); + if (r < 0) + return r; + } + + ORDERED_HASHMAP_FOREACH(j, client->extra_options) { + r = dhcp_option_append(&packet->dhcp, optlen, optoffset, 0, + j->option, j->length, j->data); + if (r < 0) + return r; + } + + if (!ordered_hashmap_isempty(client->vendor_options)) { + /* likewise the hashmap and its entry count are passed; presumably handled specially for this option code (verify) */ r = dhcp_option_append( + &packet->dhcp, optlen, optoffset, 0, + SD_DHCP_OPTION_VENDOR_SPECIFIC, + ordered_hashmap_size(client->vendor_options), client->vendor_options); + if (r < 0) + return r; + } + + + return 0; +} + +/* Builds a DHCPDISCOVER and sends it through the raw socket. Asserts that the client is in the INIT or SELECTING state. */ static int client_send_discover(sd_dhcp_client *client) { + _cleanup_free_ DHCPPacket *discover = NULL; + size_t optoffset, optlen; + int r; + + assert(client); + assert(IN_SET(client->state, DHCP_STATE_INIT, DHCP_STATE_SELECTING)); + + r = client_message_init(client, &discover, DHCP_DISCOVER, + &optlen, &optoffset); + if (r < 0) + return r; + + /* the client may suggest values for the network address + and lease time in the 
DHCPDISCOVER message. The client may include + the ’requested IP address’ option to suggest that a particular IP + address be assigned, and may include the ’IP address lease time’ + option to suggest the lease time it would like. + */ + /* RFC7844 section 3: + SHOULD NOT contain any other option. */ + /* the previously used address is suggested only outside anonymize mode */ if (!client->anonymize && client->last_addr != INADDR_ANY) { + r = dhcp_option_append(&discover->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_REQUESTED_IP_ADDRESS, + 4, &client->last_addr); + if (r < 0) + return r; + } + + if (client->rapid_commit) { + /* Rapid Commit is a zero-length option */ r = dhcp_option_append(&discover->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_RAPID_COMMIT, 0, NULL); + if (r < 0) + return r; + } + + r = client_append_common_discover_request_options(client, discover, &optoffset, optlen); + if (r < 0) + return r; + + r = dhcp_option_append(&discover->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + /* We currently ignore: + The client SHOULD wait a random time between one and ten seconds to + desynchronize the use of DHCP at startup. + */ + /* only the options actually written (optoffset bytes) are sent, not the whole option buffer */ r = dhcp_client_send_raw(client, discover, sizeof(DHCPPacket) + optoffset); + if (r < 0) + return r; + + log_dhcp_client(client, "DISCOVER"); + + return 0; +} + +/* Builds a DHCPREQUEST whose contents depend on the current state and sends it: over the UDP socket to the lease's server when RENEWING, otherwise through the raw socket. Returns -EINVAL in states for which no REQUEST is defined. */ static int client_send_request(sd_dhcp_client *client) { + _cleanup_free_ DHCPPacket *request = NULL; + size_t optoffset, optlen; + int r; + + assert(client); + + r = client_message_init(client, &request, DHCP_REQUEST, &optlen, &optoffset); + if (r < 0) + return r; + + switch (client->state) { + /* See RFC2131 section 4.3.2 (note that there is a typo in the RFC, + SELECTING should be REQUESTING) + */ + + case DHCP_STATE_REQUESTING: + /* Client inserts the address of the selected server in ’server + identifier’, ’ciaddr’ MUST be zero, ’requested IP address’ MUST be + filled in with the yiaddr value from the chosen DHCPOFFER. 
+ */ + + r = dhcp_option_append(&request->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_SERVER_IDENTIFIER, + 4, &client->lease->server_address); + if (r < 0) + return r; + + r = dhcp_option_append(&request->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_REQUESTED_IP_ADDRESS, + 4, &client->lease->address); + if (r < 0) + return r; + + break; + + case DHCP_STATE_INIT_REBOOT: + /* ’server identifier’ MUST NOT be filled in, ’requested IP address’ + option MUST be filled in with client’s notion of its previously + assigned address. ’ciaddr’ MUST be zero. + */ + r = dhcp_option_append(&request->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_REQUESTED_IP_ADDRESS, + 4, &client->last_addr); + if (r < 0) + return r; + break; + + case DHCP_STATE_RENEWING: + /* ’server identifier’ MUST NOT be filled in, ’requested IP address’ + option MUST NOT be filled in, ’ciaddr’ MUST be filled in with + client’s IP address. + */ + /* fall through: RENEWING has no statements of its own and shares the ciaddr assignment of the REBINDING case */ + case DHCP_STATE_REBINDING: + /* ’server identifier’ MUST NOT be filled in, ’requested IP address’ + option MUST NOT be filled in, ’ciaddr’ MUST be filled in with + client’s IP address. + + This message MUST be broadcast to the 0xffffffff IP broadcast address. 
+ */ + request->dhcp.ciaddr = client->lease->address; + + break; + + case DHCP_STATE_INIT: + case DHCP_STATE_SELECTING: + case DHCP_STATE_REBOOTING: + case DHCP_STATE_BOUND: + case DHCP_STATE_STOPPED: + default: + /* no REQUEST is sent from these states */ return -EINVAL; + } + + r = client_append_common_discover_request_options(client, request, &optoffset, optlen); + if (r < 0) + return r; + + r = dhcp_option_append(&request->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + /* RENEWING sends only the DHCP message over the UDP socket to the lease's server; every other state sends the full packet through the raw socket */ if (client->state == DHCP_STATE_RENEWING) + r = dhcp_network_send_udp_socket(client->fd, + client->lease->server_address, + DHCP_PORT_SERVER, + &request->dhcp, + sizeof(DHCPMessage) + optoffset); + else + r = dhcp_client_send_raw(client, request, sizeof(DHCPPacket) + optoffset); + if (r < 0) + return r; + + /* log which flavour of REQUEST was just sent */ switch (client->state) { + + case DHCP_STATE_REQUESTING: + log_dhcp_client(client, "REQUEST (requesting)"); + break; + + case DHCP_STATE_INIT_REBOOT: + log_dhcp_client(client, "REQUEST (init-reboot)"); + break; + + case DHCP_STATE_RENEWING: + log_dhcp_client(client, "REQUEST (renewing)"); + break; + + case DHCP_STATE_REBINDING: + log_dhcp_client(client, "REQUEST (rebinding)"); + break; + + default: + log_dhcp_client(client, "REQUEST (invalid)"); + break; + } + + return 0; +} + +/* forward declaration: client_timeout_resend() restarts the client from the REBOOTING state */ static int client_start(sd_dhcp_client *client); + +/* Timer callback that drives (re)transmission. It computes the next timeout for the current state, re-arms the resend timer and then sends the DISCOVER or REQUEST that fits the state. Every return path yields 0: failures stop or restart the client instead of being propagated to the event loop. */ static int client_timeout_resend( + sd_event_source *s, + uint64_t usec, + void *userdata) { + + sd_dhcp_client *client = ASSERT_PTR(userdata); + DHCP_CLIENT_DONT_DESTROY(client); + usec_t time_now, next_timeout; + int r; + + assert(s); + assert(client->event); + + r = sd_event_now(client->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + goto error; + + switch (client->state) { + + case DHCP_STATE_RENEWING: + next_timeout = client_compute_reacquisition_timeout(time_now, client->t2_time); + break; + + case DHCP_STATE_REBINDING: + next_timeout = client_compute_reacquisition_timeout(time_now, client->expire_time); + break; + + case DHCP_STATE_REBOOTING: + /* start over 
as we did not receive a timely ack or nak */ + r = client_initialize(client); + if (r < 0) + goto error; + + r = client_start(client); + if (r < 0) + goto error; + + log_dhcp_client(client, "REBOOTED"); + return 0; + + case DHCP_STATE_INIT: + case DHCP_STATE_INIT_REBOOT: + case DHCP_STATE_SELECTING: + /* NOTE(review): r still holds the non-negative result of sd_event_now() when jumping to the error label from here */ if (client->discover_attempt >= client->max_discover_attempts) + goto error; + + client->discover_attempt++; + next_timeout = client_compute_request_timeout(time_now, client->discover_attempt); + break; + case DHCP_STATE_REQUESTING: + case DHCP_STATE_BOUND: + if (client->request_attempt >= client->max_request_attempts) + goto error; + + client->request_attempt++; + next_timeout = client_compute_request_timeout(time_now, client->request_attempt); + break; + + case DHCP_STATE_STOPPED: + r = -EINVAL; + goto error; + + default: + assert_not_reached(); + } + + /* re-arm the resend timer first, then send the message for the current state */ r = event_reset_time(client->event, &client->timeout_resend, + CLOCK_BOOTTIME, + next_timeout, 10 * USEC_PER_MSEC, + client_timeout_resend, client, + client->event_priority, "dhcp4-resend-timer", true); + if (r < 0) + goto error; + + switch (client->state) { + case DHCP_STATE_INIT: + r = client_send_discover(client); + if (r >= 0) { + client_set_state(client, DHCP_STATE_SELECTING); + client->discover_attempt = 0; + } else if (client->discover_attempt >= client->max_discover_attempts) + goto error; + break; + + case DHCP_STATE_SELECTING: + /* a send failure is tolerated until the attempt limit is reached; the re-armed timer retries */ r = client_send_discover(client); + if (r < 0 && client->discover_attempt >= client->max_discover_attempts) + goto error; + break; + + case DHCP_STATE_INIT_REBOOT: + case DHCP_STATE_REQUESTING: + case DHCP_STATE_RENEWING: + case DHCP_STATE_REBINDING: + r = client_send_request(client); + if (r < 0 && client->request_attempt >= client->max_request_attempts) + goto error; + + if (client->state == DHCP_STATE_INIT_REBOOT) + client_set_state(client, DHCP_STATE_REBOOTING); + break; + + case DHCP_STATE_REBOOTING: + case DHCP_STATE_BOUND: + break; + + case DHCP_STATE_STOPPED: + default: + r 
= -EINVAL; + goto error; + } + + /* report a transient failure once enough DISCOVER attempts have accumulated */ if (client->discover_attempt >= TRANSIENT_FAILURE_ATTEMPTS) + client_notify(client, SD_DHCP_CLIENT_EVENT_TRANSIENT_FAILURE); + + return 0; + +error: + /* Avoid REQUEST infinite loop. Per RFC 2131 section 3.1.5: if the client receives + neither a DHCPACK or a DHCPNAK message after employing the retransmission algorithm, + the client reverts to INIT state and restarts the initialization process */ + if (client->request_attempt >= client->max_request_attempts) { + log_dhcp_client(client, "Max REQUEST attempts reached. Restarting..."); + client_restart(client); + return 0; + } + client_stop(client, r); + + /* Errors were dealt with when stopping the client, don't spill + errors into the event loop handler */ + return 0; +} + +/* Registers io_callback as the EPOLLIN handler for client->fd, then applies the client's event priority and a description to the new source. On any failure the client is stopped; the function itself always returns 0. */ static int client_initialize_io_events( + sd_dhcp_client *client, + sd_event_io_handler_t io_callback) { + + int r; + + assert(client); + assert(client->event); + + r = sd_event_add_io(client->event, &client->receive_message, + client->fd, EPOLLIN, io_callback, + client); + if (r < 0) + goto error; + + r = sd_event_source_set_priority(client->receive_message, + client->event_priority); + if (r < 0) + goto error; + + r = sd_event_source_set_description(client->receive_message, "dhcp4-receive-message"); + if (r < 0) + goto error; + +error: + /* the success path also reaches this label, hence the r < 0 check */ if (r < 0) + client_stop(client, r); + + return 0; +} + +/* Disables the IPv6-only-mode timer and arms the resend timer: at now + start_delay when a delay is configured, otherwise at time value 0. On failure the client is stopped; the function itself always returns 0. */ static int client_initialize_time_events(sd_dhcp_client *client) { + usec_t usec = 0; + int r; + + assert(client); + assert(client->event); + + (void) event_source_disable(client->timeout_ipv6_only_mode); + + if (client->start_delay > 0) { + assert_se(sd_event_now(client->event, CLOCK_BOOTTIME, &usec) >= 0); + usec = usec_add(usec, client->start_delay); + } + + r = event_reset_time(client->event, &client->timeout_resend, + CLOCK_BOOTTIME, + usec, 0, + client_timeout_resend, client, + client->event_priority, "dhcp4-resend-timer", true); + if (r < 0) + client_stop(client, r); + + return 0; + +} + +static int 
client_initialize_events(sd_dhcp_client *client, sd_event_io_handler_t io_callback) { + /* Sets up the I/O event source and then the resend timer. Both helpers' return values are ignored and 0 is returned. */ client_initialize_io_events(client, io_callback); + client_initialize_time_events(client); + + return 0; +} + +/* Starts the client, honouring client->start_delay: picks a random xid, binds the raw socket, records the start time, moves a STOPPED client to INIT and sets up the event sources. The client must have an event loop and a valid ifindex, no open socket and xid 0, and be in the STOPPED or INIT_REBOOT state. */ static int client_start_delayed(sd_dhcp_client *client) { + int r; + + assert_return(client, -EINVAL); + assert_return(client->event, -EINVAL); + assert_return(client->ifindex > 0, -EINVAL); + assert_return(client->fd < 0, -EBUSY); + assert_return(client->xid == 0, -EINVAL); + assert_return(IN_SET(client->state, DHCP_STATE_STOPPED, DHCP_STATE_INIT_REBOOT), -EBUSY); + + client->xid = random_u32(); + + r = dhcp_network_bind_raw_socket(client->ifindex, &client->link, client->xid, + &client->hw_addr, &client->bcast_addr, + client->arp_type, client->port, + client->socket_priority_set, client->socket_priority); + if (r < 0) { + client_stop(client, r); + return r; + } + client->fd = r; + + client->start_time = now(CLOCK_BOOTTIME); + + /* assigned directly, so neither the state-change log message nor the state callback of client_set_state() is triggered */ if (client->state == DHCP_STATE_STOPPED) + client->state = DHCP_STATE_INIT; + + return client_initialize_events(client, client_receive_message_raw); +} + +/* Starts the client immediately by clearing start_delay first. */ static int client_start(sd_dhcp_client *client) { + client->start_delay = 0; + return client_start_delayed(client); +} + +/* Lease-expiry timer callback: logs and notifies SD_DHCP_CLIENT_EVENT_EXPIRED, then resets and restarts the client unless the callback left it in the STOPPED state. */ static int client_timeout_expire(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp_client *client = userdata; + DHCP_CLIENT_DONT_DESTROY(client); + + log_dhcp_client(client, "EXPIRED"); + + client_notify(client, SD_DHCP_CLIENT_EVENT_EXPIRED); + + /* lease was lost, start over if not freed or stopped in callback */ + if (client->state != DHCP_STATE_STOPPED) { + client_initialize(client); + client_start(client); + } + + return 0; +} + +/* T2 timer callback: drops the current receive source and socket, switches to REBINDING, resets the attempt counters and binds a fresh raw socket before setting the event sources up again. */ static int client_timeout_t2(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp_client *client = ASSERT_PTR(userdata); + DHCP_CLIENT_DONT_DESTROY(client); + int r; + + client->receive_message = sd_event_source_disable_unref(client->receive_message); + client->fd = safe_close(client->fd); + + client_set_state(client, 
DHCP_STATE_REBINDING); + client->discover_attempt = 0; + client->request_attempt = 0; + + r = dhcp_network_bind_raw_socket(client->ifindex, &client->link, client->xid, + &client->hw_addr, &client->bcast_addr, + client->arp_type, client->port, + client->socket_priority_set, client->socket_priority); + if (r < 0) { + /* the client has been stopped; 0 is returned so the error does not reach the event loop */ client_stop(client, r); + return 0; + } + client->fd = r; + + return client_initialize_events(client, client_receive_message_raw); +} + +/* T1 timer callback: enters RENEWING when a lease is held, otherwise INIT_REBOOT unless the client is already in INIT; then resets the attempt counters and re-arms the resend timer. */ static int client_timeout_t1(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp_client *client = userdata; + DHCP_CLIENT_DONT_DESTROY(client); + + if (client->lease) + client_set_state(client, DHCP_STATE_RENEWING); + else if (client->state != DHCP_STATE_INIT) + client_set_state(client, DHCP_STATE_INIT_REBOOT); + client->discover_attempt = 0; + client->request_attempt = 0; + + return client_initialize_time_events(client); +} + +static int client_parse_message( + sd_dhcp_client *client, + DHCPMessage *message, + size_t len, + sd_dhcp_lease **ret) { + + _cleanup_(sd_dhcp_lease_unrefp) sd_dhcp_lease *lease = NULL; + _cleanup_free_ char *error_message = NULL; + int r; + + assert(client); + assert(message); + assert(ret); + + r = dhcp_lease_new(&lease); + if (r < 0) + return r; + + if (client->client_id_len > 0) { + r = dhcp_lease_set_client_id(lease, + (uint8_t *) &client->client_id, + client->client_id_len); + if (r < 0) + return r; + } + + r = dhcp_option_parse(message, len, dhcp_lease_parse_options, lease, &error_message); + if (r < 0) + return log_dhcp_client_errno(client, r, "Failed to parse DHCP options, ignoring: %m"); + + switch (client->state) { + case DHCP_STATE_SELECTING: + if (r == DHCP_ACK) { + if (!client->rapid_commit) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "received unexpected ACK, ignoring."); + if (!lease->rapid_commit) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "received rapid ACK without Rapid Commit option, ignoring."); + } else if (r == 
DHCP_OFFER) { + if (lease->rapid_commit) { + /* Some RFC incompliant servers provides an OFFER with a rapid commit option. + * See https://github.com/systemd/systemd/issues/29904. + * Let's support such servers gracefully. */ + log_dhcp_client(client, "received OFFER with Rapid Commit option, ignoring."); + lease->rapid_commit = false; + } + if (lease->lifetime == 0 && client->fallback_lease_lifetime > 0) + lease->lifetime = client->fallback_lease_lifetime; + } else + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "received unexpected message, ignoring."); + + break; + + case DHCP_STATE_REBOOTING: + case DHCP_STATE_REQUESTING: + case DHCP_STATE_RENEWING: + case DHCP_STATE_REBINDING: + if (r == DHCP_NAK) { + if (client->lease && client->lease->server_address != lease->server_address) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "NAK from unexpected server, ignoring: %s", + strna(error_message)); + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "NAK: %s", strna(error_message)); + } + if (r != DHCP_ACK) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "received message was not an ACK, ignoring."); + break; + + default: + assert_not_reached(); + } + + lease->next_server = message->siaddr; + lease->address = message->yiaddr; + + if (lease->address == 0 || + lease->server_address == 0 || + lease->lifetime == 0) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "received lease lacks address, server address or lease lifetime, ignoring."); + + r = dhcp_lease_set_default_subnet_mask(lease); + if (r < 0) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(ENOMSG), + "received lease lacks subnet mask, and a fallback one cannot be generated, ignoring."); + + /* RFC 8925 section 3.2 + * If the client did not include the IPv6-Only Preferred option code in the Parameter Request List in + * the DHCPDISCOVER or DHCPREQUEST message, it MUST ignore the IPv6-Only Preferred option in any + 
* messages received from the server. */ + if (lease->ipv6_only_preferred_usec > 0 && + !client_request_contains(client, SD_DHCP_OPTION_IPV6_ONLY_PREFERRED)) { + log_dhcp_client(client, "Received message with unrequested IPv6-only preferred option, ignoring the option."); + lease->ipv6_only_preferred_usec = 0; + } + + *ret = TAKE_PTR(lease); + return 0; +} + +static int client_handle_offer_or_rapid_ack(sd_dhcp_client *client, DHCPMessage *message, size_t len, const triple_timestamp *timestamp) { + _cleanup_(sd_dhcp_lease_unrefp) sd_dhcp_lease *lease = NULL; + int r; + + assert(client); + assert(message); + + r = client_parse_message(client, message, len, &lease); + if (r < 0) + return r; + + dhcp_lease_set_timestamp(lease, timestamp); + + dhcp_lease_unref_and_replace(client->lease, lease); + + if (client->lease->rapid_commit) { + log_dhcp_client(client, "ACK"); + return SD_DHCP_CLIENT_EVENT_IP_ACQUIRE; + } + + if (client_notify(client, SD_DHCP_CLIENT_EVENT_SELECTING) < 0) + return -ENOMSG; + + log_dhcp_client(client, "OFFER"); + return 0; +} + +static int client_enter_requesting_now(sd_dhcp_client *client) { + assert(client); + + client_set_state(client, DHCP_STATE_REQUESTING); + client->discover_attempt = 0; + client->request_attempt = 0; + + return event_reset_time(client->event, &client->timeout_resend, + CLOCK_BOOTTIME, 0, 0, + client_timeout_resend, client, + client->event_priority, "dhcp4-resend-timer", + /* force_reset = */ true); +} + +static int client_enter_requesting_delayed(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp_client *client = ASSERT_PTR(userdata); + DHCP_CLIENT_DONT_DESTROY(client); + int r; + + r = client_enter_requesting_now(client); + if (r < 0) + client_stop(client, r); + + return 0; +} + +static int client_enter_requesting(sd_dhcp_client *client) { + assert(client); + assert(client->lease); + + (void) event_source_disable(client->timeout_resend); + + if (client->lease->ipv6_only_preferred_usec > 0) { + if 
(client->ipv6_acquired) { + log_dhcp_client(client, + "Received an OFFER with IPv6-only preferred option, and the host already acquired IPv6 connectivity, stopping DHCPv4 client."); + return sd_dhcp_client_stop(client); + } + + log_dhcp_client(client, + "Received an OFFER with IPv6-only preferred option, delaying to send REQUEST with %s.", + FORMAT_TIMESPAN(client->lease->ipv6_only_preferred_usec, USEC_PER_SEC)); + + return event_reset_time_relative(client->event, &client->timeout_ipv6_only_mode, + CLOCK_BOOTTIME, + client->lease->ipv6_only_preferred_usec, 0, + client_enter_requesting_delayed, client, + client->event_priority, "dhcp4-ipv6-only-mode-timer", + /* force_reset = */ true); + } + + return client_enter_requesting_now(client); +} + +static int client_handle_forcerenew(sd_dhcp_client *client, DHCPMessage *force, size_t len) { + int r; + + r = dhcp_option_parse(force, len, NULL, NULL, NULL); + if (r != DHCP_FORCERENEW) + return -ENOMSG; + +#if 0 + log_dhcp_client(client, "FORCERENEW"); + return 0; +#else + /* FIXME: Ignore FORCERENEW requests until we implement RFC3118 (Authentication for DHCP + * Messages) and/or RFC6704 (Forcerenew Nonce Authentication), as unauthenticated FORCERENEW + * requests causes a security issue (TALOS-2020-1142, CVE-2020-13529). 
/* Arms the lifetime, T2 and T1 timers for the lease currently held in client->lease.
 *
 * For an infinite lifetime all three timers are disabled and nothing is armed. Otherwise t1/t2
 * are sanitized in place in the lease, converted to CLOCK_BOOTTIME deadlines, fuzzed, and the
 * timers are armed in the order lifetime -> T2 -> T1. As soon as one deadline is found to lie
 * in the past, the earlier timers are not armed and 0 is returned.
 *
 * Returns 0 on success, or the negative error of the first failing helper. */
static int client_set_lease_timeouts(sd_dhcp_client *client) {
        usec_t time_now;
        int r;

        assert(client);
        assert(client->event);
        assert(client->lease);
        assert(client->lease->lifetime > 0);
        assert(triple_timestamp_is_set(&client->lease->timestamp));

        /* don't set timers for infinite leases */
        if (client->lease->lifetime == USEC_INFINITY) {
                (void) event_source_disable(client->timeout_t1);
                (void) event_source_disable(client->timeout_t2);
                (void) event_source_disable(client->timeout_expire);

                return 0;
        }

        r = sd_event_now(client->event, CLOCK_BOOTTIME, &time_now);
        if (r < 0)
                return r;

        /* verify that 0 < t2 < lifetime */
        if (client->lease->t2 == 0 || client->lease->t2 >= client->lease->lifetime)
                client->lease->t2 = T2_DEFAULT(client->lease->lifetime);
        /* verify that 0 < t1 < t2 (t2 was validated against the lifetime just above) */
        if (client->lease->t1 == 0 || client->lease->t1 >= client->lease->t2)
                client->lease->t1 = T1_DEFAULT(client->lease->lifetime);
        /* now, if t1 >= t2, t1 *must* be T1_DEFAULT, since the previous check
         * could not evaluate to false if t1 >= t2; so setting t2 to T2_DEFAULT
         * guarantees t1 < t2. */
        if (client->lease->t1 >= client->lease->t2)
                client->lease->t2 = T2_DEFAULT(client->lease->lifetime);

        assert(client->lease->t1 > 0);
        assert(client->lease->t1 < client->lease->t2);
        assert(client->lease->t2 < client->lease->lifetime);

        /* Convert the relative lease values into absolute CLOCK_BOOTTIME deadlines. */
        r = sd_dhcp_lease_get_lifetime_timestamp(client->lease, CLOCK_BOOTTIME, &client->expire_time);
        if (r < 0)
                return r;
        r = sd_dhcp_lease_get_t1_timestamp(client->lease, CLOCK_BOOTTIME, &client->t1_time);
        if (r < 0)
                return r;
        r = sd_dhcp_lease_get_t2_timestamp(client->lease, CLOCK_BOOTTIME, &client->t2_time);
        if (r < 0)
                return r;

        /* RFC2131 section 4.4.5:
         * Times T1 and T2 SHOULD be chosen with some random "fuzz".
         * Since the RFC doesn't specify here the exact 'fuzz' to use,
         * we use the range from section 4.1: -1 to +1 sec. */
        client->t1_time = usec_sub_signed(client->t1_time, RFC2131_RANDOM_FUZZ);
        client->t2_time = usec_sub_signed(client->t2_time, RFC2131_RANDOM_FUZZ);

        /* after fuzzing, ensure t2 is still >= t1 */
        client->t2_time = MAX(client->t1_time, client->t2_time);

        /* arm lifetime timeout */
        r = event_reset_time(client->event, &client->timeout_expire,
                             CLOCK_BOOTTIME,
                             client->expire_time, 10 * USEC_PER_MSEC,
                             client_timeout_expire, client,
                             client->event_priority, "dhcp4-lifetime", true);
        if (r < 0)
                return r;

        /* don't arm earlier timeouts if this has already expired */
        if (client->expire_time <= time_now)
                return 0;

        log_dhcp_client(client, "lease expires in %s",
                        FORMAT_TIMESPAN(client->expire_time - time_now, USEC_PER_SEC));

        /* arm T2 timeout */
        r = event_reset_time(client->event, &client->timeout_t2,
                             CLOCK_BOOTTIME,
                             client->t2_time, 10 * USEC_PER_MSEC,
                             client_timeout_t2, client,
                             client->event_priority, "dhcp4-t2-timeout", true);
        if (r < 0)
                return r;

        /* don't arm earlier timeout if this has already expired */
        if (client->t2_time <= time_now)
                return 0;

        log_dhcp_client(client, "T2 expires in %s",
                        FORMAT_TIMESPAN(client->t2_time - time_now, USEC_PER_SEC));

        /* arm T1 timeout */
        r = event_reset_time(client->event, &client->timeout_t1,
                             CLOCK_BOOTTIME,
                             client->t1_time, 10 * USEC_PER_MSEC,
                             client_timeout_t1, client,
                             client->event_priority, "dhcp4-t1-timer", true);
        if (r < 0)
                return r;

        /* The T1 timer is armed even if its deadline already passed; only the log line is
         * skipped in that case. */
        if (client->t1_time > time_now)
                log_dhcp_client(client, "T1 expires in %s",
                                FORMAT_TIMESPAN(client->t1_time - time_now, USEC_PER_SEC));

        return 0;
}
client->last_addr = client->lease->address; + + r = client_set_lease_timeouts(client); + if (r < 0) + log_dhcp_client_errno(client, r, "could not set lease timeouts: %m"); + + r = dhcp_network_bind_udp_socket(client->ifindex, client->lease->address, client->port, client->ip_service_type); + if (r < 0) + return log_dhcp_client_errno(client, r, "could not bind UDP socket: %m"); + + client->receive_message = sd_event_source_disable_unref(client->receive_message); + close_and_replace(client->fd, r); + client_initialize_io_events(client, client_receive_message_udp); + + client_notify(client, notify_event); + + return 0; +} + +static int client_enter_bound_delayed(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp_client *client = ASSERT_PTR(userdata); + DHCP_CLIENT_DONT_DESTROY(client); + int r; + + r = client_enter_bound_now(client, SD_DHCP_CLIENT_EVENT_IP_ACQUIRE); + if (r < 0) + client_stop(client, r); + + return 0; +} + +static int client_enter_bound(sd_dhcp_client *client, int notify_event) { + assert(client); + assert(client->lease); + + client->start_delay = 0; + (void) event_source_disable(client->timeout_resend); + + /* RFC 8925 section 3.2 + * If the client is in the INIT-REBOOT state, it SHOULD stop the DHCPv4 configuration process or + * disable the IPv4 stack completely for V6ONLY_WAIT seconds or until the network attachment event, + * whichever happens first. + * + * In the below, the condition uses REBOOTING, instead of INIT-REBOOT, as the client state has + * already transitioned from INIT-REBOOT to REBOOTING after sending a DHCPREQUEST message. 
*/ + if (client->state == DHCP_STATE_REBOOTING && client->lease->ipv6_only_preferred_usec > 0) { + if (client->ipv6_acquired) { + log_dhcp_client(client, + "Received an ACK with IPv6-only preferred option, and the host already acquired IPv6 connectivity, stopping DHCPv4 client."); + return sd_dhcp_client_stop(client); + } + + log_dhcp_client(client, + "Received an ACK with IPv6-only preferred option, delaying to enter bound state with %s.", + FORMAT_TIMESPAN(client->lease->ipv6_only_preferred_usec, USEC_PER_SEC)); + + return event_reset_time_relative(client->event, &client->timeout_ipv6_only_mode, + CLOCK_BOOTTIME, + client->lease->ipv6_only_preferred_usec, 0, + client_enter_bound_delayed, client, + client->event_priority, "dhcp4-ipv6-only-mode", + /* force_reset = */ true); + } + + return client_enter_bound_now(client, notify_event); +} + +static int client_restart(sd_dhcp_client *client) { + int r; + assert(client); + + client_notify(client, SD_DHCP_CLIENT_EVENT_EXPIRED); + + r = client_initialize(client); + if (r < 0) + return r; + + r = client_start_delayed(client); + if (r < 0) + return r; + + log_dhcp_client(client, "REBOOT in %s", FORMAT_TIMESPAN(client->start_delay, USEC_PER_SEC)); + + client->start_delay = CLAMP(client->start_delay * 2, + RESTART_AFTER_NAK_MIN_USEC, RESTART_AFTER_NAK_MAX_USEC); + return 0; +} + +static int client_verify_message_header(sd_dhcp_client *client, DHCPMessage *message, size_t len) { + const uint8_t *expected_chaddr = NULL; + uint8_t expected_hlen = 0; + + assert(client); + assert(message); + + if (len < sizeof(DHCPMessage)) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(EBADMSG), + "Too small to be a DHCP message, ignoring."); + + if (be32toh(message->magic) != DHCP_MAGIC_COOKIE) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(EBADMSG), + "Not a DHCP message, ignoring."); + + if (message->op != BOOTREPLY) + return log_dhcp_client_errno(client, SYNTHETIC_ERRNO(EBADMSG), + "Not a BOOTREPLY message, ignoring."); + + 
/* Dispatches one received DHCP message according to the current client state.
 *
 * Messages failing client_verify_message_header() are dropped (returns 0). For each state the
 * matching handler is called; its result is then mapped as follows:
 *   - errors matching ERRNO_IS_NEG_RESOURCE() are returned to the caller,
 *   - -EADDRNOTAVAIL restarts the client via client_restart(),
 *   - any other negative value means the message is ignored (returns 0),
 *   - a non-negative value continues the state machine (requesting / bound / renew). */
static int client_handle_message(sd_dhcp_client *client, DHCPMessage *message, size_t len, const triple_timestamp *timestamp) {
        DHCP_CLIENT_DONT_DESTROY(client);
        int r;

        assert(client);
        assert(message);
        assert(timestamp);

        if (client_verify_message_header(client, message, len) < 0)
                return 0;

        switch (client->state) {
        case DHCP_STATE_SELECTING:

                r = client_handle_offer_or_rapid_ack(client, message, len, timestamp);
                if (ERRNO_IS_NEG_RESOURCE(r))
                        return r;
                if (r == -EADDRNOTAVAIL)
                        /* got a rapid NAK, let's restart the client */
                        return client_restart(client);
                if (r < 0)
                        return 0; /* invalid message, let's ignore it */

                if (client->lease->rapid_commit)
                        /* got a successful rapid commit */
                        return client_enter_bound(client, r);

                return client_enter_requesting(client);

        case DHCP_STATE_REBOOTING:
        case DHCP_STATE_REQUESTING:
        case DHCP_STATE_RENEWING:
        case DHCP_STATE_REBINDING:

                /* On success r carries the SD_DHCP_CLIENT_EVENT_* value chosen by
                 * client_handle_ack(), which is forwarded as the notify event. */
                r = client_handle_ack(client, message, len, timestamp);
                if (ERRNO_IS_NEG_RESOURCE(r))
                        return r;
                if (r == -EADDRNOTAVAIL)
                        /* got a NAK, let's restart the client */
                        return client_restart(client);
                if (r < 0)
                        return 0; /* invalid message, let's ignore it */

                return client_enter_bound(client, r);

        case DHCP_STATE_BOUND:
                r = client_handle_forcerenew(client, message, len);
                if (ERRNO_IS_NEG_RESOURCE(r))
                        return r;
                if (r < 0)
                        return 0; /* invalid message, let's ignore it */

                /* An accepted FORCERENEW is handled like an expired T1 timer. */
                return client_timeout_t1(NULL, 0, client);

        case DHCP_STATE_INIT:
        case DHCP_STATE_INIT_REBOOT:
                log_dhcp_client(client, "Unexpectedly receive message without sending any requests, ignoring.");
                return 0;

        default:
                assert_not_reached();
        }

        return 0;
}
/* I/O event callback for the raw socket.
 *
 * Reads one datagram, verifies its headers with dhcp_packet_verify_headers(), and passes the
 * embedded DHCP message to client_handle_message(). Receive errors and invalid packets are
 * logged and/or ignored (returns 0). If client_handle_message() fails, the client is stopped
 * with that error. The only negative return value is -ENOMEM on allocation failure. */
static int client_receive_message_raw(
                sd_event_source *s,
                int fd,
                uint32_t revents,
                void *userdata) {

        sd_dhcp_client *client = ASSERT_PTR(userdata);
        _cleanup_free_ DHCPPacket *packet = NULL;
        /* This needs to be initialized with zero. See #20741. */
        CMSG_BUFFER_TYPE(CMSG_SPACE_TIMEVAL +
                         CMSG_SPACE(sizeof(struct tpacket_auxdata))) control = {};
        struct iovec iov = {};
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };
        /* Checksum verification is requested unless the auxdata below says otherwise. */
        bool checksum = true;
        ssize_t buflen, len;
        int r;

        assert(s);

        /* Size the receive buffer after the pending datagram. */
        buflen = next_datagram_size_fd(fd);
        if (ERRNO_IS_NEG_TRANSIENT(buflen) || ERRNO_IS_NEG_DISCONNECT(buflen))
                return 0;
        if (buflen < 0) {
                log_dhcp_client_errno(client, buflen, "Failed to determine datagram size to read, ignoring: %m");
                return 0;
        }

        packet = malloc0(buflen);
        if (!packet)
                return -ENOMEM;

        iov = IOVEC_MAKE(packet, buflen);

        len = recvmsg_safe(fd, &msg, 0);
        if (ERRNO_IS_NEG_TRANSIENT(len) || ERRNO_IS_NEG_DISCONNECT(len))
                return 0;
        if (len < 0) {
                log_dhcp_client_errno(client, len, "Could not receive message from raw socket, ignoring: %m");
                return 0;
        }

        /* If PACKET_AUXDATA is present and flags the checksum as not ready, do not ask for it
         * to be verified. */
        struct tpacket_auxdata *aux = CMSG_FIND_DATA(&msg, SOL_PACKET, PACKET_AUXDATA, struct tpacket_auxdata);
        if (aux)
                checksum = !(aux->tp_status & TP_STATUS_CSUMNOTREADY);

        if (dhcp_packet_verify_headers(packet, len, checksum, client->port) < 0)
                return 0;

        /* Only the DHCP payload (packet->dhcp) is handed on, so drop the header size. */
        len -= DHCP_IP_UDP_SIZE;

        log_dhcp_client(client, "Received message from RAW socket, processing.");
        r = client_handle_message(client, &packet->dhcp, len, TRIPLE_TIMESTAMP_FROM_CMSG(&msg));
        if (r < 0)
                client_stop(client, r);

        return 0;
}
(!client) + return 0; + + return client->state != DHCP_STATE_STOPPED; +} + +int sd_dhcp_client_start(sd_dhcp_client *client) { + int r; + + assert_return(client, -EINVAL); + + /* Note, do not reset the flag in client_initialize(), as it is also called on expire. */ + client->ipv6_acquired = false; + + r = client_initialize(client); + if (r < 0) + return r; + + /* If no client identifier exists, construct an RFC 4361-compliant one */ + if (client->client_id_len == 0) { + r = sd_dhcp_client_set_iaid_duid_en(client, /* iaid_set = */ false, /* iaid = */ 0); + if (r < 0) + return r; + } + + /* RFC7844 section 3.3: + SHOULD perform a complete four-way handshake, starting with a + DHCPDISCOVER, to obtain a new address lease. If the client can + ascertain that this is exactly the same network to which it was + previously connected, and if the link-layer address did not change, + the client MAY issue a DHCPREQUEST to try to reclaim the current + address. */ + if (client->last_addr && !client->anonymize) + client_set_state(client, DHCP_STATE_INIT_REBOOT); + + r = client_start(client); + if (r >= 0) + log_dhcp_client(client, "STARTED on ifindex %i", client->ifindex); + + return r; +} + +int sd_dhcp_client_send_release(sd_dhcp_client *client) { + assert_return(client, -EINVAL); + assert_return(sd_dhcp_client_is_running(client), -ESTALE); + assert_return(client->lease, -EUNATCH); + + _cleanup_free_ DHCPPacket *release = NULL; + size_t optoffset, optlen; + int r; + + r = client_message_init(client, &release, DHCP_RELEASE, &optlen, &optoffset); + if (r < 0) + return r; + + /* Fill up release IP and MAC */ + release->dhcp.ciaddr = client->lease->address; + memcpy(&release->dhcp.chaddr, client->hw_addr.bytes, client->hw_addr.length); + + r = dhcp_option_append(&release->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + r = dhcp_network_send_udp_socket(client->fd, + client->lease->server_address, + DHCP_PORT_SERVER, + &release->dhcp, + 
sizeof(DHCPMessage) + optoffset); + if (r < 0) + return r; + + log_dhcp_client(client, "RELEASE"); + + return 0; +} + +int sd_dhcp_client_send_decline(sd_dhcp_client *client) { + assert_return(client, -EINVAL); + assert_return(sd_dhcp_client_is_running(client), -ESTALE); + assert_return(client->lease, -EUNATCH); + + _cleanup_free_ DHCPPacket *release = NULL; + size_t optoffset, optlen; + int r; + + r = client_message_init(client, &release, DHCP_DECLINE, &optlen, &optoffset); + if (r < 0) + return r; + + release->dhcp.ciaddr = client->lease->address; + memcpy(&release->dhcp.chaddr, client->hw_addr.bytes, client->hw_addr.length); + + r = dhcp_option_append(&release->dhcp, optlen, &optoffset, 0, + SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + r = dhcp_network_send_udp_socket(client->fd, + client->lease->server_address, + DHCP_PORT_SERVER, + &release->dhcp, + sizeof(DHCPMessage) + optoffset); + if (r < 0) + return r; + + log_dhcp_client(client, "DECLINE"); + + client_stop(client, SD_DHCP_CLIENT_EVENT_STOP); + + if (client->state != DHCP_STATE_STOPPED) { + r = sd_dhcp_client_start(client); + if (r < 0) + return r; + } + + return 0; +} + +int sd_dhcp_client_stop(sd_dhcp_client *client) { + if (!client) + return 0; + + DHCP_CLIENT_DONT_DESTROY(client); + + client_stop(client, SD_DHCP_CLIENT_EVENT_STOP); + + return 0; +} + +int sd_dhcp_client_set_ipv6_connectivity(sd_dhcp_client *client, int have) { + if (!client) + return 0; + + /* We have already received a message with IPv6-Only preferred option, and are waiting for IPv6 + * connectivity or timeout, let's stop the client. */ + if (have && sd_event_source_get_enabled(client->timeout_ipv6_only_mode, NULL) > 0) + return sd_dhcp_client_stop(client); + + /* Otherwise, save that the host already has IPv6 connectivity. 
*/ + client->ipv6_acquired = have; + return 0; +} + +int sd_dhcp_client_interrupt_ipv6_only_mode(sd_dhcp_client *client) { + assert_return(client, -EINVAL); + assert_return(sd_dhcp_client_is_running(client), -ESTALE); + assert_return(client->fd >= 0, -EINVAL); + + if (sd_event_source_get_enabled(client->timeout_ipv6_only_mode, NULL) <= 0) + return 0; + + client_initialize(client); + return client_start(client); +} + +int sd_dhcp_client_attach_event(sd_dhcp_client *client, sd_event *event, int64_t priority) { + int r; + + assert_return(client, -EINVAL); + assert_return(!client->event, -EBUSY); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + if (event) + client->event = sd_event_ref(event); + else { + r = sd_event_default(&client->event); + if (r < 0) + return 0; + } + + client->event_priority = priority; + + return 0; +} + +int sd_dhcp_client_detach_event(sd_dhcp_client *client) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp_client_is_running(client), -EBUSY); + + client->event = sd_event_unref(client->event); + + return 0; +} + +sd_event *sd_dhcp_client_get_event(sd_dhcp_client *client) { + assert_return(client, NULL); + + return client->event; +} + +int sd_dhcp_client_attach_device(sd_dhcp_client *client, sd_device *dev) { + assert_return(client, -EINVAL); + + return device_unref_and_replace(client->dev, dev); +} + +static sd_dhcp_client *dhcp_client_free(sd_dhcp_client *client) { + if (!client) + return NULL; + + log_dhcp_client(client, "FREE"); + + client_initialize(client); + + client->timeout_resend = sd_event_source_unref(client->timeout_resend); + client->timeout_t1 = sd_event_source_unref(client->timeout_t1); + client->timeout_t2 = sd_event_source_unref(client->timeout_t2); + client->timeout_expire = sd_event_source_unref(client->timeout_expire); + + sd_dhcp_client_detach_event(client); + + sd_device_unref(client->dev); + + set_free(client->req_opts); + free(client->hostname); + free(client->vendor_class_identifier); + 
free(client->mudurl); + client->user_class = strv_free(client->user_class); + ordered_hashmap_free(client->extra_options); + ordered_hashmap_free(client->vendor_options); + free(client->ifname); + return mfree(client); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_dhcp_client, sd_dhcp_client, dhcp_client_free); + +int sd_dhcp_client_new(sd_dhcp_client **ret, int anonymize) { + const uint8_t *opts; + size_t n_opts; + int r; + + assert_return(ret, -EINVAL); + + _cleanup_(sd_dhcp_client_unrefp) sd_dhcp_client *client = new(sd_dhcp_client, 1); + if (!client) + return -ENOMEM; + + *client = (sd_dhcp_client) { + .n_ref = 1, + .state = DHCP_STATE_STOPPED, + .ifindex = -1, + .fd = -EBADF, + .mtu = DHCP_MIN_PACKET_SIZE, + .port = DHCP_PORT_CLIENT, + .anonymize = !!anonymize, + .max_discover_attempts = UINT64_MAX, + .max_request_attempts = 5, + .ip_service_type = -1, + }; + /* NOTE: this could be moved to a function. */ + if (anonymize) { + n_opts = ELEMENTSOF(default_req_opts_anonymize); + opts = default_req_opts_anonymize; + } else { + n_opts = ELEMENTSOF(default_req_opts); + opts = default_req_opts; + } + + for (size_t i = 0; i < n_opts; i++) { + r = sd_dhcp_client_set_request_option(client, opts[i]); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(client); + + return 0; +} + +static const char* const dhcp_state_table[_DHCP_STATE_MAX] = { + [DHCP_STATE_STOPPED] = "stopped", + [DHCP_STATE_INIT] = "initialization", + [DHCP_STATE_SELECTING] = "selecting", + [DHCP_STATE_INIT_REBOOT] = "init-reboot", + [DHCP_STATE_REBOOTING] = "rebooting", + [DHCP_STATE_REQUESTING] = "requesting", + [DHCP_STATE_BOUND] = "bound", + [DHCP_STATE_RENEWING] = "renewing", + [DHCP_STATE_REBINDING] = "rebinding", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(dhcp_state, DHCPState); diff --git a/src/libsystemd-network/sd-dhcp-lease.c b/src/libsystemd-network/sd-dhcp-lease.c new file mode 100644 index 0000000..4e3be98 --- /dev/null +++ b/src/libsystemd-network/sd-dhcp-lease.c @@ -0,0 +1,1607 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include +#include +#include +#include + +#include "sd-dhcp-lease.h" + +#include "alloc-util.h" +#include "dhcp-lease-internal.h" +#include "dhcp-option.h" +#include "dns-domain.h" +#include "env-file.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "in-addr-util.h" +#include "network-common.h" +#include "network-internal.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "unaligned.h" + +void dhcp_lease_set_timestamp(sd_dhcp_lease *lease, const triple_timestamp *timestamp) { + assert(lease); + + if (timestamp && triple_timestamp_is_set(timestamp)) + lease->timestamp = *timestamp; + else + triple_timestamp_now(&lease->timestamp); +} + +int sd_dhcp_lease_get_timestamp(sd_dhcp_lease *lease, clockid_t clock, uint64_t *ret) { + assert_return(lease, -EINVAL); + assert_return(TRIPLE_TIMESTAMP_HAS_CLOCK(clock), -EOPNOTSUPP); + assert_return(clock_supported(clock), -EOPNOTSUPP); + assert_return(ret, -EINVAL); + + if (!triple_timestamp_is_set(&lease->timestamp)) + return -ENODATA; + + *ret = triple_timestamp_by_clock(&lease->timestamp, clock); + return 0; +} + +int sd_dhcp_lease_get_address(sd_dhcp_lease *lease, struct in_addr *addr) { + assert_return(lease, -EINVAL); + assert_return(addr, -EINVAL); + + if (lease->address == 0) + return -ENODATA; + + addr->s_addr = lease->address; + return 0; +} + +int sd_dhcp_lease_get_broadcast(sd_dhcp_lease *lease, struct in_addr *addr) { + assert_return(lease, -EINVAL); + assert_return(addr, -EINVAL); + + if (!lease->have_broadcast) + return -ENODATA; + + addr->s_addr = lease->broadcast; + return 0; +} + +int sd_dhcp_lease_get_lifetime(sd_dhcp_lease *lease, uint64_t *ret) { + assert_return(lease, -EINVAL); + 
/* Generates sd_dhcp_lease_get_<name>_timestamp(lease, clock, ret).
 *
 * The generated function fetches the relative value through sd_dhcp_lease_get_<name>() and the
 * lease timestamp for 'clock' through sd_dhcp_lease_get_timestamp(), and stores their sum
 * (computed with usec_add()) in *ret.
 *
 * Returns 0 on success, -EINVAL if 'ret' is NULL, or the negative error returned by either
 * getter. */
#define DEFINE_GET_TIMESTAMP(name)                                      \
        int sd_dhcp_lease_get_##name##_timestamp(                       \
                        sd_dhcp_lease *lease,                           \
                        clockid_t clock,                                \
                        uint64_t *ret) {                                \
                                                                        \
                usec_t t, timestamp;                                    \
                int r;                                                  \
                                                                        \
                assert_return(ret, -EINVAL);                            \
                                                                        \
                r = sd_dhcp_lease_get_##name(lease, &t);                \
                if (r < 0)                                              \
                        return r;                                       \
                                                                        \
                r = sd_dhcp_lease_get_timestamp(lease, clock, &timestamp); \
                if (r < 0)                                              \
                        return r;                                       \
                                                                        \
                *ret = usec_add(t, timestamp);                          \
                return 0;                                               \
        }
SD_DHCP_LEASE_NTP, addr); +} +int sd_dhcp_lease_get_sip(sd_dhcp_lease *lease, const struct in_addr **addr) { + return sd_dhcp_lease_get_servers(lease, SD_DHCP_LEASE_SIP, addr); +} +int sd_dhcp_lease_get_pop3(sd_dhcp_lease *lease, const struct in_addr **addr) { + return sd_dhcp_lease_get_servers(lease, SD_DHCP_LEASE_POP3, addr); +} +int sd_dhcp_lease_get_smtp(sd_dhcp_lease *lease, const struct in_addr **addr) { + return sd_dhcp_lease_get_servers(lease, SD_DHCP_LEASE_SMTP, addr); +} +int sd_dhcp_lease_get_lpr(sd_dhcp_lease *lease, const struct in_addr **addr) { + return sd_dhcp_lease_get_servers(lease, SD_DHCP_LEASE_LPR, addr); +} + /* The string getters below hand out the pointer stored in the lease (no copy is made) and return -ENODATA while the field is NULL. */ +int sd_dhcp_lease_get_domainname(sd_dhcp_lease *lease, const char **domainname) { + assert_return(lease, -EINVAL); + assert_return(domainname, -EINVAL); + + if (!lease->domainname) + return -ENODATA; + + *domainname = lease->domainname; + return 0; +} + +int sd_dhcp_lease_get_hostname(sd_dhcp_lease *lease, const char **hostname) { + assert_return(lease, -EINVAL); + assert_return(hostname, -EINVAL); + + if (!lease->hostname) + return -ENODATA; + + *hostname = lease->hostname; + return 0; +} + +int sd_dhcp_lease_get_root_path(sd_dhcp_lease *lease, const char **root_path) { + assert_return(lease, -EINVAL); + assert_return(root_path, -EINVAL); + + if (!lease->root_path) + return -ENODATA; + + *root_path = lease->root_path; + return 0; +} + +int sd_dhcp_lease_get_captive_portal(sd_dhcp_lease *lease, const char **ret) { + assert_return(lease, -EINVAL); + assert_return(ret, -EINVAL); + + if (!lease->captive_portal) + return -ENODATA; + + *ret = lease->captive_portal; + return 0; +} + /* Returns the number of router addresses and points *addr at the array stored in the lease; -ENODATA if there are none. */ +int sd_dhcp_lease_get_router(sd_dhcp_lease *lease, const struct in_addr **addr) { + assert_return(lease, -EINVAL); + assert_return(addr, -EINVAL); + + if (lease->router_size <= 0) + return -ENODATA; + + *addr = lease->router; + return (int) lease->router_size; +} + /* Copies the subnet mask into *addr; -ENODATA unless have_subnet_mask is set. */ +int sd_dhcp_lease_get_netmask(sd_dhcp_lease *lease, struct in_addr *addr) { + assert_return(lease,
-EINVAL); + assert_return(addr, -EINVAL); + + if (!lease->have_subnet_mask) + return -ENODATA; + + addr->s_addr = lease->subnet_mask; + return 0; +} + /* Derives the network prefix: the lease address masked with the prefix length computed from the netmask. Both output pointers are optional; errors of the address and netmask getters are passed through. */ +int sd_dhcp_lease_get_prefix(sd_dhcp_lease *lease, struct in_addr *ret_prefix, uint8_t *ret_prefixlen) { + struct in_addr address, netmask; + uint8_t prefixlen; + int r; + + assert_return(lease, -EINVAL); + + r = sd_dhcp_lease_get_address(lease, &address); + if (r < 0) + return r; + + r = sd_dhcp_lease_get_netmask(lease, &netmask); + if (r < 0) + return r; + + prefixlen = in4_addr_netmask_to_prefixlen(&netmask); + + r = in4_addr_mask(&address, prefixlen); + if (r < 0) + return r; + + if (ret_prefix) + *ret_prefix = address; + if (ret_prefixlen) + *ret_prefixlen = prefixlen; + return 0; +} + /* A server address of zero is treated as not set and yields -ENODATA. */ +int sd_dhcp_lease_get_server_identifier(sd_dhcp_lease *lease, struct in_addr *addr) { + assert_return(lease, -EINVAL); + assert_return(addr, -EINVAL); + + if (lease->server_address == 0) + return -ENODATA; + + addr->s_addr = lease->server_address; + return 0; +} + /* A next-server address of zero is treated as not set and yields -ENODATA. */ +int sd_dhcp_lease_get_next_server(sd_dhcp_lease *lease, struct in_addr *addr) { + assert_return(lease, -EINVAL); + assert_return(addr, -EINVAL); + + if (lease->next_server == 0) + return -ENODATA; + + addr->s_addr = lease->next_server; + return 0; +} + +/* + * The returned routes array must be freed by the caller. + * Route objects have the same lifetime of the lease and must not be freed.
+ */ +static int dhcp_lease_get_routes(sd_dhcp_route *routes, size_t n_routes, sd_dhcp_route ***ret) { + assert(routes || n_routes == 0); + + if (n_routes <= 0) + return -ENODATA; + /* ret may be NULL when the caller only wants the count; otherwise build an array of pointers into routes. */ + if (ret) { + sd_dhcp_route **buf; + + buf = new(sd_dhcp_route*, n_routes); + if (!buf) + return -ENOMEM; + + for (size_t i = 0; i < n_routes; i++) + buf[i] = &routes[i]; + + *ret = buf; + } + + return (int) n_routes; +} + /* Returns the number of static routes, see dhcp_lease_get_routes() for the ownership of *ret. */ +int sd_dhcp_lease_get_static_routes(sd_dhcp_lease *lease, sd_dhcp_route ***ret) { + assert_return(lease, -EINVAL); + + return dhcp_lease_get_routes(lease->static_routes, lease->n_static_routes, ret); +} + /* Same as above for the classless routes. */ +int sd_dhcp_lease_get_classless_routes(sd_dhcp_lease *lease, sd_dhcp_route ***ret) { + assert_return(lease, -EINVAL); + + return dhcp_lease_get_routes(lease->classless_routes, lease->n_classless_routes, ret); +} + /* Exposes the search domain strv stored in the lease (not copied) and returns its length; -ENODATA when it is empty. */ +int sd_dhcp_lease_get_search_domains(sd_dhcp_lease *lease, char ***domains) { + size_t r; + + assert_return(lease, -EINVAL); + assert_return(domains, -EINVAL); + + r = strv_length(lease->search_domains); + if (r > 0) { + *domains = lease->search_domains; + return (int) r; + } + + return -ENODATA; +} + /* Copies the stored 6rd fields into whichever output pointers are non-NULL; -ENODATA while no border relay address is stored. */ +int sd_dhcp_lease_get_6rd( + sd_dhcp_lease *lease, + uint8_t *ret_ipv4masklen, + uint8_t *ret_prefixlen, + struct in6_addr *ret_prefix, + const struct in_addr **ret_br_addresses, + size_t *ret_n_br_addresses) { + + assert_return(lease, -EINVAL); + + if (lease->sixrd_n_br_addresses <= 0) + return -ENODATA; + + if (ret_ipv4masklen) + *ret_ipv4masklen = lease->sixrd_ipv4masklen; + if (ret_prefixlen) + *ret_prefixlen = lease->sixrd_prefixlen; + if (ret_prefix) + *ret_prefix = lease->sixrd_prefix; + if (ret_br_addresses) + *ret_br_addresses = lease->sixrd_br_addresses; + if (ret_n_br_addresses) + *ret_n_br_addresses = lease->sixrd_n_br_addresses; + + return 0; +} + /* Boolean test that also tolerates a NULL lease. */ +int sd_dhcp_lease_has_6rd(sd_dhcp_lease *lease) { + return lease && lease->sixrd_n_br_addresses > 0; +} + +int sd_dhcp_lease_get_vendor_specific(sd_dhcp_lease *lease, const void **data, size_t
*data_len) { + assert_return(lease, -EINVAL); + assert_return(data, -EINVAL); + assert_return(data_len, -EINVAL); + + if (lease->vendor_specific_len <= 0) + return -ENODATA; + + *data = lease->vendor_specific; + *data_len = lease->vendor_specific_len; + return 0; +} + /* Releases every allocation reachable from the lease, including the private option list, then the lease itself. */ +static sd_dhcp_lease *dhcp_lease_free(sd_dhcp_lease *lease) { + struct sd_dhcp_raw_option *option; + + assert(lease); + + while ((option = LIST_POP(options, lease->private_options))) { + free(option->data); + free(option); + } + + free(lease->root_path); + free(lease->router); + free(lease->timezone); + free(lease->hostname); + free(lease->domainname); + free(lease->captive_portal); + + for (sd_dhcp_lease_server_type_t i = 0; i < _SD_DHCP_LEASE_SERVER_TYPE_MAX; i++) + free(lease->servers[i].addr); + + free(lease->static_routes); + free(lease->classless_routes); + free(lease->client_id); + free(lease->vendor_specific); + strv_free(lease->search_domains); + free(lease->sixrd_br_addresses); + return mfree(lease); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_dhcp_lease, sd_dhcp_lease, dhcp_lease_free); + /* Accepts exactly four bytes and converts them with unaligned_be32_sec_to_usec(). */ +static int lease_parse_be32_seconds(const uint8_t *option, size_t len, bool max_as_infinity, usec_t *ret) { + assert(option); + assert(ret); + + if (len != 4) + return -EINVAL; + + *ret = unaligned_be32_sec_to_usec(option, max_as_infinity); + return 0; +} + /* Reads a two byte big endian value and raises it to min when it is smaller. */ +static int lease_parse_u16(const uint8_t *option, size_t len, uint16_t *ret, uint16_t min) { + assert(option); + assert(ret); + + if (len != 2) + return -EINVAL; + + *ret = unaligned_read_be16((be16_t*) option); + if (*ret < min) + *ret = min; + + return 0; +} + /* Copies four bytes verbatim, i.e. the value stays in network byte order. */ +static int lease_parse_be32(const uint8_t *option, size_t len, be32_t *ret) { + assert(option); + assert(ret); + + if (len != 4) + return -EINVAL; + + memcpy(ret, option, 4); + return 0; +} + /* Parses a string option as a DNS name, normalizes it and rejects localhost and the root domain. When dhcp_option_parse_string() yields no string, *ret is cleared. */ +static int lease_parse_domain(const uint8_t *option, size_t len, char **ret) { + _cleanup_free_ char *name = NULL, *normalized = NULL; + int r; + + assert(option); + assert(ret); + + r =
dhcp_option_parse_string(option, len, &name); + if (r < 0) + return r; + if (!name) { + *ret = mfree(*ret); + return 0; + } + + r = dns_name_normalize(name, 0, &normalized); + if (r < 0) + return r; + + if (is_localhost(normalized)) + return -EINVAL; + + if (dns_name_is_root(normalized)) + return -EINVAL; + + free_and_replace(*ret, normalized); + + return 0; +} + /* Stores the option as a string after checking that it only uses characters from URI_VALID. */ +static int lease_parse_captive_portal(const uint8_t *option, size_t len, char **ret) { + _cleanup_free_ char *uri = NULL; + int r; + + assert(option); + assert(ret); + + r = dhcp_option_parse_string(option, len, &uri); + if (r < 0) + return r; + if (uri && !in_charset(uri, URI_VALID)) + return -EINVAL; + + return free_and_replace(*ret, uri); +} + /* Duplicates len / 4 IPv4 addresses into *ret and stores the count in *n_ret; a zero length option clears the previous value. */ +static int lease_parse_in_addrs(const uint8_t *option, size_t len, struct in_addr **ret, size_t *n_ret) { + assert(option || len == 0); + assert(ret); + assert(n_ret); + + if (len <= 0) { + *ret = mfree(*ret); + *n_ret = 0; + } else { + size_t n_addresses; + struct in_addr *addresses; + + if (len % 4 != 0) + return -EINVAL; + + n_addresses = len / 4; + + addresses = newdup(struct in_addr, option, n_addresses); + if (!addresses) + return -ENOMEM; + + free_and_replace(*ret, addresses); + *n_ret = n_addresses; + } + + return 0; +} + +static int lease_parse_sip_server(const uint8_t *option, size_t len, struct in_addr **ret, size_t *n_ret) { + assert(option || len == 0); + assert(ret); + assert(n_ret); + + if (len <= 0) + return -EINVAL; + + /* The SIP record is like the other, regular server records, but prefixed with a single "encoding" + * byte that is either 0 or 1. We only support it to be 1 for now.
Let's drop it and parse it like + * the other fields */ + + if (option[0] != 1) { /* We only support IP address encoding for now */ + *ret = mfree(*ret); + *n_ret = 0; + return 0; + } + + return lease_parse_in_addrs(option + 1, len - 1, ret, n_ret); +} + /* Parses destination and gateway address pairs (8 bytes each). The prefix length comes from in4_addr_default_prefixlen(); entries for which that fails are skipped. */ +static int lease_parse_static_routes(sd_dhcp_lease *lease, const uint8_t *option, size_t len) { + int r; + + assert(lease); + assert(option || len <= 0); + + if (len % 8 != 0) + return -EINVAL; + + while (len >= 8) { + struct in_addr dst, gw; + uint8_t prefixlen; + + assert_se(lease_parse_be32(option, 4, &dst.s_addr) >= 0); + option += 4; + + assert_se(lease_parse_be32(option, 4, &gw.s_addr) >= 0); + option += 4; + + len -= 8; + + r = in4_addr_default_prefixlen(&dst, &prefixlen); + if (r < 0) { + log_debug("sd-dhcp-lease: cannot determine class of received static route, ignoring."); + continue; + } + + (void) in4_addr_mask(&dst, prefixlen); + + if (!GREEDY_REALLOC(lease->static_routes, lease->n_static_routes + 1)) + return -ENOMEM; + + lease->static_routes[lease->n_static_routes++] = (struct sd_dhcp_route) { + .dst_addr = dst, + .gw_addr = gw, + .dst_prefixlen = prefixlen, + }; + } + + return 0; +} + +/* parses RFC3442 Classless Static Route Option */ +static int lease_parse_classless_routes(sd_dhcp_lease *lease, const uint8_t *option, size_t len) { + assert(lease); + assert(option || len <= 0); + + /* option format: (subnet-mask-width significant-subnet-octets gateway-ip) */ + + while (len > 0) { + uint8_t prefixlen, dst_octets; + struct in_addr dst = {}, gw; + + prefixlen = *option; + option++; + len--; + + dst_octets = DIV_ROUND_UP(prefixlen, 8); + + /* can't have more than 4 octets in IPv4 */ + if (dst_octets > 4 || len < dst_octets) + return -EINVAL; + /* Only the significant octets are transmitted; the rest of dst stays zero from its initializer. */ + memcpy(&dst, option, dst_octets); + option += dst_octets; + len -= dst_octets; + + if (len < 4) + return -EINVAL; + + assert_se(lease_parse_be32(option, 4, &gw.s_addr) >= 0); + option += 4; + len -= 4; + + if (!GREEDY_REALLOC(lease->classless_routes,
lease->n_classless_routes + 1)) + return -ENOMEM; + + lease->classless_routes[lease->n_classless_routes++] = (struct sd_dhcp_route) { + .dst_addr = dst, + .gw_addr = gw, + .dst_prefixlen = prefixlen, + }; + } + + return 0; +} + /* Validates a 6rd option and stores it in the lease; a second 6rd option for the same lease is rejected. */ +static int lease_parse_6rd(sd_dhcp_lease *lease, const uint8_t *option, size_t len) { + uint8_t ipv4masklen, prefixlen; + struct in6_addr prefix; + _cleanup_free_ struct in_addr *br_addresses = NULL; + size_t n_br_addresses; + + assert(lease); + assert(option); + + /* See RFC 5969 Section 7.1.1 */ + + if (lease->sixrd_n_br_addresses > 0) + /* Multiple 6rd option?? */ + return -EINVAL; + + /* option-length: The length of the DHCP option in octets (22 octets with one BR IPv4 address). */ + if (len < 2 + sizeof(struct in6_addr) + sizeof(struct in_addr) || + (len - 2 - sizeof(struct in6_addr)) % sizeof(struct in_addr) != 0) + return -EINVAL; + + /* IPv4MaskLen: The number of high-order bits that are identical across all CE IPv4 addresses + * within a given 6rd domain. This may be any value between 0 and 32. Any value + * greater than 32 is invalid. */ + ipv4masklen = option[0]; + if (ipv4masklen > 32) + return -EINVAL; + + /* 6rdPrefixLen: The IPv6 prefix length of the SP's 6rd IPv6 prefix in number of bits. For the + * purpose of bounds checking by DHCP option processing, the sum of + * (32 - IPv4MaskLen) + 6rdPrefixLen MUST be less than or equal to 128. */ + prefixlen = option[1]; + if (32 - ipv4masklen + prefixlen > 128) + return -EINVAL; + + /* 6rdPrefix: The service provider's 6rd IPv6 prefix represented as a 16-octet IPv6 address. + * The bits in the prefix after the 6rdPrefixlen number of bits are reserved and + * MUST be initialized to zero by the sender and ignored by the receiver. */ + memcpy(&prefix, option + 2, sizeof(struct in6_addr)); + (void) in6_addr_mask(&prefix, prefixlen); + + /* 6rdBRIPv4Address: One or more IPv4 addresses of the 6rd Border Relays for a given 6rd domain.
*/ + n_br_addresses = (len - 2 - sizeof(struct in6_addr)) / sizeof(struct in_addr); + br_addresses = newdup(struct in_addr, option + 2 + sizeof(struct in6_addr), n_br_addresses); + if (!br_addresses) + return -ENOMEM; + + lease->sixrd_ipv4masklen = ipv4masklen; + lease->sixrd_prefixlen = prefixlen; + lease->sixrd_prefix = prefix; + lease->sixrd_br_addresses = TAKE_PTR(br_addresses); + lease->sixrd_n_br_addresses = n_br_addresses; + + return 0; +} + /* Option callback: stores each recognized DHCP option in the lease passed via userdata. Most parse failures are only logged at debug level and the option is skipped; allocation failures for vendor and private options are returned. */ +int dhcp_lease_parse_options(uint8_t code, uint8_t len, const void *option, void *userdata) { + sd_dhcp_lease *lease = ASSERT_PTR(userdata); + int r; + + switch (code) { + + case SD_DHCP_OPTION_IP_ADDRESS_LEASE_TIME: + r = lease_parse_be32_seconds(option, len, /* max_as_infinity = */ true, &lease->lifetime); + if (r < 0) + log_debug_errno(r, "Failed to parse lease time, ignoring: %m"); + + break; + + case SD_DHCP_OPTION_SERVER_IDENTIFIER: + r = lease_parse_be32(option, len, &lease->server_address); + if (r < 0) + log_debug_errno(r, "Failed to parse server identifier, ignoring: %m"); + + break; + + case SD_DHCP_OPTION_SUBNET_MASK: + r = lease_parse_be32(option, len, &lease->subnet_mask); + if (r < 0) + log_debug_errno(r, "Failed to parse subnet mask, ignoring: %m"); + else + lease->have_subnet_mask = true; + break; + + case SD_DHCP_OPTION_BROADCAST: + r = lease_parse_be32(option, len, &lease->broadcast); + if (r < 0) + log_debug_errno(r, "Failed to parse broadcast address, ignoring: %m"); + else + lease->have_broadcast = true; + break; + + case SD_DHCP_OPTION_ROUTER: + r = lease_parse_in_addrs(option, len, &lease->router, &lease->router_size); + if (r < 0) + log_debug_errno(r, "Failed to parse router addresses, ignoring: %m"); + break; + + case SD_DHCP_OPTION_RAPID_COMMIT: + if (len > 0) + log_debug("Invalid DHCP Rapid Commit option, ignoring."); + lease->rapid_commit = true; + break; + + case SD_DHCP_OPTION_DOMAIN_NAME_SERVER: + r = lease_parse_in_addrs(option, len, &lease->servers[SD_DHCP_LEASE_DNS].addr,
&lease->servers[SD_DHCP_LEASE_DNS].size); + if (r < 0) + log_debug_errno(r, "Failed to parse DNS server, ignoring: %m"); + break; + + case SD_DHCP_OPTION_NTP_SERVER: + r = lease_parse_in_addrs(option, len, &lease->servers[SD_DHCP_LEASE_NTP].addr, &lease->servers[SD_DHCP_LEASE_NTP].size); + if (r < 0) + log_debug_errno(r, "Failed to parse NTP server, ignoring: %m"); + break; + + case SD_DHCP_OPTION_SIP_SERVER: + r = lease_parse_sip_server(option, len, &lease->servers[SD_DHCP_LEASE_SIP].addr, &lease->servers[SD_DHCP_LEASE_SIP].size); + if (r < 0) + log_debug_errno(r, "Failed to parse SIP server, ignoring: %m"); + break; + + case SD_DHCP_OPTION_POP3_SERVER: + r = lease_parse_in_addrs(option, len, &lease->servers[SD_DHCP_LEASE_POP3].addr, &lease->servers[SD_DHCP_LEASE_POP3].size); + if (r < 0) + log_debug_errno(r, "Failed to parse POP3 server, ignoring: %m"); + break; + + case SD_DHCP_OPTION_SMTP_SERVER: + r = lease_parse_in_addrs(option, len, &lease->servers[SD_DHCP_LEASE_SMTP].addr, &lease->servers[SD_DHCP_LEASE_SMTP].size); + if (r < 0) + log_debug_errno(r, "Failed to parse SMTP server, ignoring: %m"); + break; + + case SD_DHCP_OPTION_LPR_SERVER: + r = lease_parse_in_addrs(option, len, &lease->servers[SD_DHCP_LEASE_LPR].addr, &lease->servers[SD_DHCP_LEASE_LPR].size); + if (r < 0) + log_debug_errno(r, "Failed to parse LPR server, ignoring: %m"); + break; + + case SD_DHCP_OPTION_DHCP_CAPTIVE_PORTAL: + r = lease_parse_captive_portal(option, len, &lease->captive_portal); + if (r < 0) + log_debug_errno(r, "Failed to parse captive portal, ignoring: %m"); + break; + + case SD_DHCP_OPTION_STATIC_ROUTE: + r = lease_parse_static_routes(lease, option, len); + if (r < 0) + log_debug_errno(r, "Failed to parse static routes, ignoring: %m"); + break; + + case SD_DHCP_OPTION_MTU_INTERFACE: + r = lease_parse_u16(option, len, &lease->mtu, 68); + if (r < 0) + log_debug_errno(r, "Failed to parse MTU, ignoring: %m"); + /* Only clamp a value that was actually parsed: a malformed option must leave lease->mtu untouched instead of forcing an unset MTU to the minimum. */ + else if (lease->mtu < DHCP_MIN_PACKET_SIZE) {
+ log_debug("MTU value of %" PRIu16 " too small. Using default MTU value of %d instead.", lease->mtu, DHCP_MIN_PACKET_SIZE); + lease->mtu = DHCP_MIN_PACKET_SIZE; + } + + break; + + case SD_DHCP_OPTION_DOMAIN_NAME: + r = lease_parse_domain(option, len, &lease->domainname); + if (r < 0) { + log_debug_errno(r, "Failed to parse domain name, ignoring: %m"); + return 0; + } + + break; + + case SD_DHCP_OPTION_DOMAIN_SEARCH: + r = dhcp_lease_parse_search_domains(option, len, &lease->search_domains); + if (r < 0) + log_debug_errno(r, "Failed to parse Domain Search List, ignoring: %m"); + break; + + case SD_DHCP_OPTION_HOST_NAME: + r = lease_parse_domain(option, len, &lease->hostname); + if (r < 0) { + log_debug_errno(r, "Failed to parse hostname, ignoring: %m"); + return 0; + } + + break; + + case SD_DHCP_OPTION_ROOT_PATH: + r = dhcp_option_parse_string(option, len, &lease->root_path); + if (r < 0) + log_debug_errno(r, "Failed to parse root path, ignoring: %m"); + break; + + case SD_DHCP_OPTION_RENEWAL_TIME: + r = lease_parse_be32_seconds(option, len, /* max_as_infinity = */ true, &lease->t1); + if (r < 0) + log_debug_errno(r, "Failed to parse T1 time, ignoring: %m"); + break; + + case SD_DHCP_OPTION_REBINDING_TIME: + r = lease_parse_be32_seconds(option, len, /* max_as_infinity = */ true, &lease->t2); + if (r < 0) + log_debug_errno(r, "Failed to parse T2 time, ignoring: %m"); + break; + + case SD_DHCP_OPTION_CLASSLESS_STATIC_ROUTE: + r = lease_parse_classless_routes(lease, option, len); + if (r < 0) + log_debug_errno(r, "Failed to parse classless routes, ignoring: %m"); + break; + + case SD_DHCP_OPTION_TZDB_TIMEZONE: { + _cleanup_free_ char *tz = NULL; + + r = dhcp_option_parse_string(option, len, &tz); + if (r < 0) { + log_debug_errno(r, "Failed to parse timezone option, ignoring: %m"); + return 0; + } + + if (!timezone_is_valid(tz, LOG_DEBUG)) { + log_debug("Timezone is not valid, ignoring."); + return 0; + } + + free_and_replace(lease->timezone, tz); + + break; + } + + case
SD_DHCP_OPTION_VENDOR_SPECIFIC: + /* An empty option clears previously stored vendor data; otherwise the payload is duplicated. */ + if (len <= 0) + lease->vendor_specific = mfree(lease->vendor_specific); + else { + void *p; + + p = memdup(option, len); + if (!p) + return -ENOMEM; + + free_and_replace(lease->vendor_specific, p); + } + + lease->vendor_specific_len = len; + break; + + case SD_DHCP_OPTION_6RD: + r = lease_parse_6rd(lease, option, len); + if (r < 0) + log_debug_errno(r, "Failed to parse 6rd option, ignoring: %m"); + break; + + case SD_DHCP_OPTION_IPV6_ONLY_PREFERRED: + r = lease_parse_be32_seconds(option, len, /* max_as_infinity = */ false, &lease->ipv6_only_preferred_usec); + if (r < 0) + log_debug_errno(r, "Failed to parse IPv6 only preferred option, ignoring: %m"); + + else if (lease->ipv6_only_preferred_usec < MIN_V6ONLY_WAIT_USEC && + !network_test_mode_enabled()) + lease->ipv6_only_preferred_usec = MIN_V6ONLY_WAIT_USEC; + break; + /* Options in the private range are stored as raw data; unlike most cases a failure here is returned to the caller. */ + case SD_DHCP_OPTION_PRIVATE_BASE ... SD_DHCP_OPTION_PRIVATE_LAST: + r = dhcp_lease_insert_private_option(lease, code, option, len); + if (r < 0) + return r; + + break; + + default: + log_debug("Ignoring DHCP option %"PRIu8" while parsing.", code); + break; + } + + return 0; +} + +/* Parses compressed domain names.
*/ +int dhcp_lease_parse_search_domains(const uint8_t *option, size_t len, char ***domains) { + _cleanup_strv_free_ char **names = NULL; + size_t pos = 0, cnt = 0; + int r; + + assert(domains); + assert(option || len == 0); + + if (len == 0) + return -EBADMSG; + /* Each outer iteration decodes one name; the inner loop walks its labels and compression pointers until the terminating zero byte. */ + while (pos < len) { + _cleanup_free_ char *name = NULL; + size_t n = 0; + size_t jump_barrier = pos, next_chunk = 0; + bool first = true; + + for (;;) { + uint8_t c; + c = option[pos++]; + + if (c == 0) { + /* End of name */ + break; + } else if (c <= 63) { + const char *label; + + /* Literal label */ + label = (const char*) (option + pos); + pos += c; + if (pos >= len) + return -EBADMSG; + + if (!GREEDY_REALLOC(name, n + !first + DNS_LABEL_ESCAPED_MAX)) + return -ENOMEM; + + if (first) + first = false; + else + name[n++] = '.'; + + r = dns_label_escape(label, c, name + n, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + n += r; + } else if (FLAGS_SET(c, 0xc0)) { + /* Pointer */ + + uint8_t d; + uint16_t ptr; + + if (pos >= len) + return -EBADMSG; + + d = option[pos++]; + ptr = (uint16_t) (c & ~0xc0) << 8 | (uint16_t) d; + + /* Jumps are limited to a "prior occurrence" (RFC-1035 4.1.4) */ + if (ptr >= jump_barrier) + return -EBADMSG; + jump_barrier = ptr; + + /* Save current location so we don't end up re-parsing what's parsed so far.
*/ + if (next_chunk == 0) + next_chunk = pos; + + pos = ptr; + } else + return -EBADMSG; + } + + if (!GREEDY_REALLOC(name, n + 1)) + return -ENOMEM; + name[n] = 0; + + r = strv_extend(&names, name); + if (r < 0) + return r; + + cnt++; + + if (next_chunk != 0) + pos = next_chunk; + } + + strv_free_and_replace(*domains, names); + + return cnt; +} + /* Stores a copy of the option data in the private option list, keeping the list ordered by tag; an option whose tag is already present is ignored. */ +int dhcp_lease_insert_private_option(sd_dhcp_lease *lease, uint8_t tag, const void *data, uint8_t len) { + struct sd_dhcp_raw_option *option, *before = NULL; + + assert(lease); + + LIST_FOREACH(options, cur, lease->private_options) { + if (tag < cur->tag) { + before = cur; + break; + } + if (tag == cur->tag) { + log_debug("Ignoring duplicate option, tagged %i.", tag); + return 0; + } + } + + option = new(struct sd_dhcp_raw_option, 1); + if (!option) + return -ENOMEM; + + option->tag = tag; + option->length = len; + option->data = memdup(data, len); + if (!option->data) { + free(option); + return -ENOMEM; + } + + LIST_INSERT_BEFORE(options, lease->private_options, before, option); + return 0; +} + /* Allocates a zeroed lease with a reference count of one. */ +int dhcp_lease_new(sd_dhcp_lease **ret) { + sd_dhcp_lease *lease; + + lease = new0(sd_dhcp_lease, 1); + if (!lease) + return -ENOMEM; + + lease->n_ref = 1; + + *ret = lease; + return 0; +} + /* Serializes the lease as KEY=value lines into a temporary file which then replaces lease_file. Fields whose getter fails are left out. */ +int dhcp_lease_save(sd_dhcp_lease *lease, const char *lease_file) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + struct in_addr address; + const struct in_addr *addresses; + const void *client_id, *data; + size_t client_id_len, data_len; + const char *string; + uint16_t mtu; + _cleanup_free_ sd_dhcp_route **routes = NULL; + char **search_domains; + usec_t t; + int r; + + assert(lease); + assert(lease_file); + + r = fopen_temporary(lease_file, &f, &temp_path); + if (r < 0) + return r; + + (void) fchmod(fileno(f), 0644); + + fprintf(f,
+ "# This is private data. Do not parse.\n"); + + r = sd_dhcp_lease_get_address(lease, &address); + if (r >= 0) + fprintf(f, "ADDRESS=%s\n", IN4_ADDR_TO_STRING(&address)); + + r = sd_dhcp_lease_get_netmask(lease, &address); + if (r >= 0) + fprintf(f, "NETMASK=%s\n", IN4_ADDR_TO_STRING(&address)); + + r = sd_dhcp_lease_get_router(lease, &addresses); + if (r > 0) { + fputs("ROUTER=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_server_identifier(lease, &address); + if (r >= 0) + fprintf(f, "SERVER_ADDRESS=%s\n", IN4_ADDR_TO_STRING(&address)); + + r = sd_dhcp_lease_get_next_server(lease, &address); + if (r >= 0) + fprintf(f, "NEXT_SERVER=%s\n", IN4_ADDR_TO_STRING(&address)); + + r = sd_dhcp_lease_get_broadcast(lease, &address); + if (r >= 0) + fprintf(f, "BROADCAST=%s\n", IN4_ADDR_TO_STRING(&address)); + + r = sd_dhcp_lease_get_mtu(lease, &mtu); + if (r >= 0) + fprintf(f, "MTU=%" PRIu16 "\n", mtu); + + r = sd_dhcp_lease_get_t1(lease, &t); + if (r >= 0) + fprintf(f, "T1=%s\n", FORMAT_TIMESPAN(t, USEC_PER_SEC)); + + r = sd_dhcp_lease_get_t2(lease, &t); + if (r >= 0) + fprintf(f, "T2=%s\n", FORMAT_TIMESPAN(t, USEC_PER_SEC)); + + r = sd_dhcp_lease_get_lifetime(lease, &t); + if (r >= 0) + fprintf(f, "LIFETIME=%s\n", FORMAT_TIMESPAN(t, USEC_PER_SEC)); + + r = sd_dhcp_lease_get_dns(lease, &addresses); + if (r > 0) { + fputs("DNS=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_ntp(lease, &addresses); + if (r > 0) { + fputs("NTP=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_sip(lease, &addresses); + if (r > 0) { + fputs("SIP=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + /* dhcp_lease_load() reads the POP3, SMTP and LPR keys, so write them as well; otherwise these servers are lost when a saved lease is loaded again. */ + r = sd_dhcp_lease_get_pop3(lease, &addresses); + if (r > 0) { + fputs("POP3=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_smtp(lease, &addresses); + if (r > 0) { + fputs("SMTP=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_lpr(lease, &addresses); + if (r > 0) { + fputs("LPR=", f); + serialize_in_addrs(f, addresses, r, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_domainname(lease, &string); + if (r >= 0) + fprintf(f, "DOMAINNAME=%s\n", string); + + r = sd_dhcp_lease_get_search_domains(lease, &search_domains); + if (r > 0) { + fputs("DOMAIN_SEARCH_LIST=", f); +
fputstrv(f, search_domains, NULL, NULL); + fputc('\n', f); + } + + r = sd_dhcp_lease_get_hostname(lease, &string); + if (r >= 0) + fprintf(f, "HOSTNAME=%s\n", string); + + r = sd_dhcp_lease_get_root_path(lease, &string); + if (r >= 0) + fprintf(f, "ROOT_PATH=%s\n", string); + + r = sd_dhcp_lease_get_static_routes(lease, &routes); + if (r > 0) + serialize_dhcp_routes(f, "STATIC_ROUTES", routes, r); + /* The pointer array returned by the getter is owned by us; release it before the variable is reused. */ + routes = mfree(routes); + r = sd_dhcp_lease_get_classless_routes(lease, &routes); + if (r > 0) + serialize_dhcp_routes(f, "CLASSLESS_ROUTES", routes, r); + + r = sd_dhcp_lease_get_timezone(lease, &string); + if (r >= 0) + fprintf(f, "TIMEZONE=%s\n", string); + /* Binary fields are written as hex strings. */ + r = sd_dhcp_lease_get_client_id(lease, &client_id, &client_id_len); + if (r >= 0) { + _cleanup_free_ char *client_id_hex = NULL; + + client_id_hex = hexmem(client_id, client_id_len); + if (!client_id_hex) + return -ENOMEM; + fprintf(f, "CLIENTID=%s\n", client_id_hex); + } + + r = sd_dhcp_lease_get_vendor_specific(lease, &data, &data_len); + if (r >= 0) { + _cleanup_free_ char *option_hex = NULL; + + option_hex = hexmem(data, data_len); + if (!option_hex) + return -ENOMEM; + fprintf(f, "VENDOR_SPECIFIC=%s\n", option_hex); + } + /* One OPTION_<tag> entry per stored private option. */ + LIST_FOREACH(options, option, lease->private_options) { + char key[STRLEN("OPTION_000")+1]; + + xsprintf(key, "OPTION_%" PRIu8, option->tag); + r = serialize_dhcp_option(f, key, option->data, option->length); + if (r < 0) + return r; + } + + r = fflush_and_check(f); + if (r < 0) + return r; + + r = conservative_rename(temp_path, lease_file); + if (r < 0) + return r; + /* The rename succeeded: clear temp_path so the unlink_and_freep cleanup has nothing left to act on. */ + temp_path = mfree(temp_path); + + return 0; +} + /* Frees the fixed size array of private option strings allocated by dhcp_lease_load(), one slot per tag in the private range. */ +static char **private_options_free(char **options) { + if (!options) + return NULL; + + free_many_charp(options, SD_DHCP_OPTION_PRIVATE_LAST - SD_DHCP_OPTION_PRIVATE_BASE + 1); + + return mfree(options); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(char**, private_options_free); + /* Reads a lease file written by dhcp_lease_save() into a newly allocated lease. */ +int dhcp_lease_load(sd_dhcp_lease **ret, const char *lease_file) { + _cleanup_(sd_dhcp_lease_unrefp)
sd_dhcp_lease *lease = NULL; + _cleanup_free_ char + *address = NULL, + *router = NULL, + *netmask = NULL, + *server_address = NULL, + *next_server = NULL, + *broadcast = NULL, + *dns = NULL, + *ntp = NULL, + *sip = NULL, + *pop3 = NULL, + *smtp = NULL, + *lpr = NULL, + *mtu = NULL, + *static_routes = NULL, + *classless_routes = NULL, + *domains = NULL, + *client_id_hex = NULL, + *vendor_specific_hex = NULL, + *lifetime = NULL, + *t1 = NULL, + *t2 = NULL; + _cleanup_(private_options_freep) char **options = NULL; + + int r, i; + + assert(lease_file); + assert(ret); + + r = dhcp_lease_new(&lease); + if (r < 0) + return r; + /* One string slot per private option tag, SD_DHCP_OPTION_PRIVATE_BASE through SD_DHCP_OPTION_PRIVATE_LAST. */ + options = new0(char*, SD_DHCP_OPTION_PRIVATE_LAST - SD_DHCP_OPTION_PRIVATE_BASE + 1); + if (!options) + return -ENOMEM; + /* Plain string fields are read straight into the lease; everything else goes to temporaries that are converted below. */ + r = parse_env_file(NULL, lease_file, + "ADDRESS", &address, + "ROUTER", &router, + "NETMASK", &netmask, + "SERVER_ADDRESS", &server_address, + "NEXT_SERVER", &next_server, + "BROADCAST", &broadcast, + "DNS", &dns, + "NTP", &ntp, + "SIP", &sip, + "POP3", &pop3, + "SMTP", &smtp, + "LPR", &lpr, + "MTU", &mtu, + "DOMAINNAME", &lease->domainname, + "HOSTNAME", &lease->hostname, + "DOMAIN_SEARCH_LIST", &domains, + "ROOT_PATH", &lease->root_path, + "STATIC_ROUTES", &static_routes, + "CLASSLESS_ROUTES", &classless_routes, + "CLIENTID", &client_id_hex, + "TIMEZONE", &lease->timezone, + "VENDOR_SPECIFIC", &vendor_specific_hex, + "LIFETIME", &lifetime, + "T1", &t1, + "T2", &t2, + "OPTION_224", &options[0], + "OPTION_225", &options[1], + "OPTION_226", &options[2], + "OPTION_227", &options[3], + "OPTION_228", &options[4], + "OPTION_229", &options[5], + "OPTION_230", &options[6], + "OPTION_231", &options[7], + "OPTION_232", &options[8], + "OPTION_233", &options[9], + "OPTION_234", &options[10], + "OPTION_235", &options[11], + "OPTION_236", &options[12], + "OPTION_237", &options[13], + "OPTION_238", &options[14], + "OPTION_239", &options[15], + "OPTION_240", &options[16], + "OPTION_241", &options[17], + "OPTION_242", &options[18], +
"OPTION_243", &options[19], + "OPTION_244", &options[20], + "OPTION_245", &options[21], + "OPTION_246", &options[22], + "OPTION_247", &options[23], + "OPTION_248", &options[24], + "OPTION_249", &options[25], + "OPTION_250", &options[26], + "OPTION_251", &options[27], + "OPTION_252", &options[28], + "OPTION_253", &options[29], + "OPTION_254", &options[30]); + if (r < 0) + return r; + /* From here on a field that fails to parse is logged at debug level and skipped instead of failing the whole load. */ + if (address) { + r = inet_pton(AF_INET, address, &lease->address); + if (r <= 0) + log_debug("Failed to parse address %s, ignoring.", address); + } + + if (router) { + r = deserialize_in_addrs(&lease->router, router); + if (r < 0) + log_debug_errno(r, "Failed to deserialize router addresses %s, ignoring: %m", router); + else + lease->router_size = r; + } + + if (netmask) { + r = inet_pton(AF_INET, netmask, &lease->subnet_mask); + if (r <= 0) + log_debug("Failed to parse netmask %s, ignoring.", netmask); + else + lease->have_subnet_mask = true; + } + + if (server_address) { + r = inet_pton(AF_INET, server_address, &lease->server_address); + if (r <= 0) + log_debug("Failed to parse server address %s, ignoring.", server_address); + } + + if (next_server) { + r = inet_pton(AF_INET, next_server, &lease->next_server); + if (r <= 0) + log_debug("Failed to parse next server %s, ignoring.", next_server); + } + + if (broadcast) { + r = inet_pton(AF_INET, broadcast, &lease->broadcast); + if (r <= 0) + log_debug("Failed to parse broadcast address %s, ignoring.", broadcast); + else + lease->have_broadcast = true; + } + /* For the server lists the non-negative return value of deserialize_in_addrs() is stored as the element count. */ + if (dns) { + r = deserialize_in_addrs(&lease->servers[SD_DHCP_LEASE_DNS].addr, dns); + if (r < 0) + log_debug_errno(r, "Failed to deserialize DNS servers %s, ignoring: %m", dns); + else + lease->servers[SD_DHCP_LEASE_DNS].size = r; + } + + if (ntp) { + r = deserialize_in_addrs(&lease->servers[SD_DHCP_LEASE_NTP].addr, ntp); + if (r < 0) + log_debug_errno(r, "Failed to deserialize NTP servers %s, ignoring: %m", ntp); + else + lease->servers[SD_DHCP_LEASE_NTP].size = r; + } + + if
(sip) { + r = deserialize_in_addrs(&lease->servers[SD_DHCP_LEASE_SIP].addr, sip); + if (r < 0) + log_debug_errno(r, "Failed to deserialize SIP servers %s, ignoring: %m", sip); + else + lease->servers[SD_DHCP_LEASE_SIP].size = r; + } + + if (pop3) { + r = deserialize_in_addrs(&lease->servers[SD_DHCP_LEASE_POP3].addr, pop3); + if (r < 0) + log_debug_errno(r, "Failed to deserialize POP3 server %s, ignoring: %m", pop3); + else + lease->servers[SD_DHCP_LEASE_POP3].size = r; + } + + if (smtp) { + r = deserialize_in_addrs(&lease->servers[SD_DHCP_LEASE_SMTP].addr, smtp); + if (r < 0) + log_debug_errno(r, "Failed to deserialize SMTP server %s, ignoring: %m", smtp); + else + lease->servers[SD_DHCP_LEASE_SMTP].size = r; + } + + if (lpr) { + r = deserialize_in_addrs(&lease->servers[SD_DHCP_LEASE_LPR].addr, lpr); + if (r < 0) + log_debug_errno(r, "Failed to deserialize LPR server %s, ignoring: %m", lpr); + else + lease->servers[SD_DHCP_LEASE_LPR].size = r; + } + + if (mtu) { + r = safe_atou16(mtu, &lease->mtu); + if (r < 0) + log_debug_errno(r, "Failed to parse MTU %s, ignoring: %m", mtu); + } + /* The search list is split on spaces; an empty result leaves lease->search_domains unset. */ + if (domains) { + _cleanup_strv_free_ char **a = NULL; + a = strv_split(domains, " "); + if (!a) + return -ENOMEM; + + if (!strv_isempty(a)) + lease->search_domains = TAKE_PTR(a); + } + + if (static_routes) { + r = deserialize_dhcp_routes( + &lease->static_routes, + &lease->n_static_routes, + static_routes); + if (r < 0) + log_debug_errno(r, "Failed to parse DHCP static routes %s, ignoring: %m", static_routes); + } + + if (classless_routes) { + r = deserialize_dhcp_routes( + &lease->classless_routes, + &lease->n_classless_routes, + classless_routes); + if (r < 0) + log_debug_errno(r, "Failed to parse DHCP classless routes %s, ignoring: %m", classless_routes); + } + /* LIFETIME, T1 and T2 are time span strings and are read back with parse_sec(). */ + if (lifetime) { + r = parse_sec(lifetime, &lease->lifetime); + if (r < 0) + log_debug_errno(r, "Failed to parse lifetime %s, ignoring: %m", lifetime); + } + + if (t1) { + r = parse_sec(t1, &lease->t1); + if (r < 0) +
log_debug_errno(r, "Failed to parse T1 %s, ignoring: %m", t1); + } + + if (t2) { + r = parse_sec(t2, &lease->t2); + if (r < 0) + log_debug_errno(r, "Failed to parse T2 %s, ignoring: %m", t2); + } + + if (client_id_hex) { + r = unhexmem(client_id_hex, SIZE_MAX, &lease->client_id, &lease->client_id_len); + if (r < 0) + log_debug_errno(r, "Failed to parse client ID %s, ignoring: %m", client_id_hex); + } + + if (vendor_specific_hex) { + r = unhexmem(vendor_specific_hex, SIZE_MAX, &lease->vendor_specific, &lease->vendor_specific_len); + if (r < 0) + log_debug_errno(r, "Failed to parse vendor specific data %s, ignoring: %m", vendor_specific_hex); + } + + for (i = 0; i <= SD_DHCP_OPTION_PRIVATE_LAST - SD_DHCP_OPTION_PRIVATE_BASE; i++) { + _cleanup_free_ void *data = NULL; + size_t len; + + if (!options[i]) + continue; + + r = unhexmem(options[i], SIZE_MAX, &data, &len); + if (r < 0) { + log_debug_errno(r, "Failed to parse private DHCP option %s, ignoring: %m", options[i]); + continue; + } + + r = dhcp_lease_insert_private_option(lease, SD_DHCP_OPTION_PRIVATE_BASE + i, data, len); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(lease); + + return 0; +} + +int dhcp_lease_set_default_subnet_mask(sd_dhcp_lease *lease) { /* Derives lease->subnet_mask from lease->address when no mask has been recorded yet; returns 0 or a negative errno. */ + struct in_addr address, mask; + int r; + + assert(lease); + + if (lease->have_subnet_mask) /* a mask is already recorded, nothing to do */ + return 0; + + if (lease->address == 0) /* no address to derive a mask from */ + return -ENODATA; + + address.s_addr = lease->address; + + /* fall back to the default subnet masks based on address class */ + r = in4_addr_default_subnet_mask(&address, &mask); + if (r < 0) + return r; + + lease->subnet_mask = mask.s_addr; + lease->have_subnet_mask = true; + + return 0; +} + +int sd_dhcp_lease_get_client_id(sd_dhcp_lease *lease, const void **client_id, size_t *client_id_len) { /* Hands out a pointer into the lease (no copy is made); -ENODATA if no client ID is stored. */ + assert_return(lease, -EINVAL); + assert_return(client_id, -EINVAL); + assert_return(client_id_len, -EINVAL); + + if (!lease->client_id) + return -ENODATA; + + *client_id = lease->client_id; + *client_id_len = lease->client_id_len; +
+ return 0; +} + +int dhcp_lease_set_client_id(sd_dhcp_lease *lease, const void *client_id, size_t client_id_len) { /* Replaces the stored client ID with a private copy of the given bytes; a zero length clears it. Returns 0, -EINVAL or -ENOMEM. */ + assert_return(lease, -EINVAL); + assert_return(client_id || client_id_len <= 0, -EINVAL); + + if (client_id_len <= 0) + lease->client_id = mfree(lease->client_id); + else { + void *p; + + p = memdup(client_id, client_id_len); + if (!p) /* on allocation failure the previously stored ID and length stay untouched */ + return -ENOMEM; + + free_and_replace(lease->client_id, p); + } + + lease->client_id_len = client_id_len; /* assigned on both paths, so clearing the ID also resets the length to zero */ + return 0; +} + +int sd_dhcp_lease_get_timezone(sd_dhcp_lease *lease, const char **tz) { /* Returns a pointer to the string held by the lease, not a copy; -ENODATA if none is stored. */ + assert_return(lease, -EINVAL); + assert_return(tz, -EINVAL); + + if (!lease->timezone) + return -ENODATA; + + *tz = lease->timezone; + return 0; +} + +int sd_dhcp_route_get_destination(sd_dhcp_route *route, struct in_addr *destination) { /* Copies the route's destination address into *destination. */ + assert_return(route, -EINVAL); + assert_return(destination, -EINVAL); + + *destination = route->dst_addr; + return 0; +} + +int sd_dhcp_route_get_destination_prefix_length(sd_dhcp_route *route, uint8_t *length) { /* Copies the route's destination prefix length into *length. */ + assert_return(route, -EINVAL); + assert_return(length, -EINVAL); + + *length = route->dst_prefixlen; + return 0; +} + +int sd_dhcp_route_get_gateway(sd_dhcp_route *route, struct in_addr *gateway) { /* Copies the route's gateway address into *gateway. */ + assert_return(route, -EINVAL); + assert_return(gateway, -EINVAL); + + *gateway = route->gw_addr; + return 0; +} diff --git a/src/libsystemd-network/sd-dhcp-server.c b/src/libsystemd-network/sd-dhcp-server.c new file mode 100644 index 0000000..fcc5b74 --- /dev/null +++ b/src/libsystemd-network/sd-dhcp-server.c @@ -0,0 +1,1792 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved.
+***/ + +#include +#include + +#include "sd-dhcp-server.h" +#include "sd-id128.h" + +#include "alloc-util.h" +#include "dhcp-network.h" +#include "dhcp-option.h" +#include "dhcp-packet.h" +#include "dhcp-server-internal.h" +#include "dns-domain.h" +#include "fd-util.h" +#include "in-addr-util.h" +#include "iovec-util.h" +#include "memory-util.h" +#include "network-common.h" +#include "ordered-set.h" +#include "siphash24.h" +#include "string-util.h" +#include "unaligned.h" +#include "utf8.h" + +#define DHCP_DEFAULT_LEASE_TIME_USEC USEC_PER_HOUR +#define DHCP_MAX_LEASE_TIME_USEC (USEC_PER_HOUR*12) + +DHCPLease *dhcp_lease_free(DHCPLease *lease) { /* Unlinks the lease from all four lease maps of its server (if it is attached to one), then frees it; always returns NULL. */ + if (!lease) + return NULL; + + if (lease->server) { + hashmap_remove_value(lease->server->bound_leases_by_address, UINT32_TO_PTR(lease->address), lease); + hashmap_remove_value(lease->server->bound_leases_by_client_id, &lease->client_id, lease); + hashmap_remove_value(lease->server->static_leases_by_address, UINT32_TO_PTR(lease->address), lease); + hashmap_remove_value(lease->server->static_leases_by_client_id, &lease->client_id, lease); + } + + free(lease->client_id.data); + free(lease->hostname); + return mfree(lease); +} + +/* configures the server's address and subnet, and optionally the pool's size and offset into the subnet + * the whole pool must fit into the subnet, and may not contain the first (any) nor last (broadcast) address + * moreover, the server's own address may be in the pool, and is in that case reserved in order not to + * accidentally hand it out */ +int sd_dhcp_server_configure_pool( + sd_dhcp_server *server, + const struct in_addr *address, + unsigned char prefixlen, + uint32_t offset, + uint32_t size) { + + struct in_addr netmask_addr; + be32_t netmask; + uint32_t server_off, broadcast_off, size_max; + + assert_return(server, -EINVAL); + assert_return(address, -EINVAL); + assert_return(address->s_addr != INADDR_ANY, -EINVAL); + assert_return(prefixlen <= 32, -ERANGE); + +
assert_se(in4_addr_prefixlen_to_netmask(&netmask_addr, prefixlen)); + netmask = netmask_addr.s_addr; + + server_off = be32toh(address->s_addr & ~netmask); /* host part of the server address, in host byte order */ + broadcast_off = be32toh(~netmask); /* host part with all bits set, i.e. the offset of the broadcast address */ + + /* the server address cannot be the subnet address */ + assert_return(server_off != 0, -ERANGE); + + /* nor the broadcast address */ + assert_return(server_off != broadcast_off, -ERANGE); + + /* 0 offset means we should set a default, we skip the first (subnet) address + and take the next one */ + if (offset == 0) + offset = 1; + + /* the pool must start before the broadcast address, otherwise the unsigned subtraction below wraps around */ + assert_return(offset < broadcast_off, -ERANGE); + size_max = broadcast_off - offset; /* addresses from the offset up to, but excluding, the last (broadcast) address */ + + /* The pool must contain at least one address */ + assert_return(size_max >= 1, -ERANGE); + + if (size != 0) + assert_return(size <= size_max, -ERANGE); + else + size = size_max; /* a size of 0 selects the largest pool that fits */ + + if (server->address != address->s_addr || server->netmask != netmask || server->pool_size != size || server->pool_offset != offset) { + + server->pool_offset = offset; + server->pool_size = size; + + server->address = address->s_addr; + server->netmask = netmask; + server->subnet = address->s_addr & netmask; + + /* Drop any leases associated with the old address range */ + hashmap_clear(server->bound_leases_by_address); + hashmap_clear(server->bound_leases_by_client_id); + + if (server->callback) + server->callback(server, SD_DHCP_SERVER_EVENT_LEASE_CHANGED, server->callback_userdata); + } + + return 0; +} + +int sd_dhcp_server_is_running(sd_dhcp_server *server) { /* True while a receive_message event source exists; a NULL server counts as not running. */ + if (!server) + return false; + + return !!server->receive_message; +} + +int sd_dhcp_server_is_in_relay_mode(sd_dhcp_server *server) { /* Relay mode is in effect whenever relay_target is set. */ + assert_return(server, -EINVAL); + + return in4_addr_is_set(&server->relay_target); +} + +void client_id_hash_func(const DHCPClientId *id, struct siphash *state) { + assert(id); + assert(id->length > 0); + assert(id->data); + + siphash24_compress(&id->length, sizeof(id->length), state); +
siphash24_compress(id->data, id->length, state); +} + +int client_id_compare_func(const DHCPClientId *a, const DHCPClientId *b) { /* Orders client IDs by length first and by content second. */ + int r; + + assert(a->length > 0); + assert(a->data); + assert(b->length > 0); + assert(b->data); + + r = CMP(a->length, b->length); + if (r != 0) + return r; + + return memcmp(a->data, b->data, a->length); /* lengths are equal here, so a->length bounds both buffers */ +} + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + dhcp_lease_hash_ops, + DHCPClientId, + client_id_hash_func, + client_id_compare_func, + DHCPLease, + dhcp_lease_free); + +static sd_dhcp_server *dhcp_server_free(sd_dhcp_server *server) { /* Stops the server, releases everything it owns and frees the object itself; returns NULL. */ + assert(server); + + sd_dhcp_server_stop(server); + + sd_event_unref(server->event); + + free(server->boot_server_name); + free(server->boot_filename); + free(server->timezone); + + for (sd_dhcp_lease_server_type_t i = 0; i < _SD_DHCP_LEASE_SERVER_TYPE_MAX; i++) + free(server->servers[i].addr); + + server->bound_leases_by_address = hashmap_free(server->bound_leases_by_address); + server->bound_leases_by_client_id = hashmap_free(server->bound_leases_by_client_id); + server->static_leases_by_address = hashmap_free(server->static_leases_by_address); + server->static_leases_by_client_id = hashmap_free(server->static_leases_by_client_id); + + ordered_set_free(server->extra_options); + ordered_set_free(server->vendor_options); + + free(server->agent_circuit_id); + free(server->agent_remote_id); + + free(server->ifname); + return mfree(server); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_dhcp_server, sd_dhcp_server, dhcp_server_free); + +int sd_dhcp_server_new(sd_dhcp_server **ret, int ifindex) { /* Allocates a server object for the given interface index, with one reference and default settings. */ + _cleanup_(sd_dhcp_server_unrefp) sd_dhcp_server *server = NULL; + + assert_return(ret, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + + server = new(sd_dhcp_server, 1); + if (!server) + return -ENOMEM; + + *server = (sd_dhcp_server) { + .n_ref = 1, + .fd_raw = -EBADF, + .fd = -EBADF, + .fd_broadcast = -EBADF, + .address = htobe32(INADDR_ANY), + .netmask = htobe32(INADDR_ANY), + .ifindex = ifindex, +
.bind_to_interface = true, + .default_lease_time = DHCP_DEFAULT_LEASE_TIME_USEC, + .max_lease_time = DHCP_MAX_LEASE_TIME_USEC, + .rapid_commit = true, + }; + + *ret = TAKE_PTR(server); + + return 0; +} + +int sd_dhcp_server_set_ifname(sd_dhcp_server *server, const char *ifname) { /* Stores a copy of ifname; names rejected by ifname_valid_full() yield -EINVAL. */ + assert_return(server, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&server->ifname, ifname); +} + +int sd_dhcp_server_get_ifname(sd_dhcp_server *server, const char **ret) { /* Returns a pointer to the name held by the server, not a copy; ret may be NULL. */ + int r; + + assert_return(server, -EINVAL); + + r = get_ifname(server->ifindex, &server->ifname); /* NOTE(review): presumably resolves the name from ifindex when none is cached yet; confirm against get_ifname() */ + if (r < 0) + return r; + + if (ret) + *ret = server->ifname; + + return 0; +} + +int sd_dhcp_server_attach_event(sd_dhcp_server *server, sd_event *event, int64_t priority) { /* Attaches the given event loop, or the default one when event is NULL; -EBUSY if a loop is already attached. */ + int r; + + assert_return(server, -EINVAL); + assert_return(!server->event, -EBUSY); + + if (event) + server->event = sd_event_ref(event); + else { + r = sd_event_default(&server->event); + if (r < 0) + return r; + } + + server->event_priority = priority; + + return 0; +} + +int sd_dhcp_server_detach_event(sd_dhcp_server *server) { /* Drops the server's reference to its event loop. */ + assert_return(server, -EINVAL); + + server->event = sd_event_unref(server->event); + + return 0; +} + +sd_event *sd_dhcp_server_get_event(sd_dhcp_server *server) { /* Returns the attached event loop without taking a new reference. */ + assert_return(server, NULL); + + return server->event; +} + +int sd_dhcp_server_set_boot_server_address(sd_dhcp_server *server, const struct in_addr *address) { /* A NULL address resets the stored boot server address to all zeroes. */ + assert_return(server, -EINVAL); + + if (address) + server->boot_server_address = *address; + else + server->boot_server_address = (struct in_addr) {}; + + return 0; +} + +int sd_dhcp_server_set_boot_server_name(sd_dhcp_server *server, const char *name) { /* A non-NULL name must pass dns_name_is_valid(); NULL is handed to free_and_strdup() unchecked. */ + int r; + + assert_return(server, -EINVAL); + + if (name) { + r = dns_name_is_valid(name); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + } + + return free_and_strdup(&server->boot_server_name, name); +} + +int
sd_dhcp_server_set_boot_filename(sd_dhcp_server *server, const char *filename) { /* A non-NULL filename must pass string_is_safe() and ascii_is_valid(); NULL is handed to free_and_strdup() unchecked. */ + assert_return(server, -EINVAL); + + if (filename && (!string_is_safe(filename) || !ascii_is_valid(filename))) + return -EINVAL; + + return free_and_strdup(&server->boot_filename, filename); +} + +int sd_dhcp_server_stop(sd_dhcp_server *server) { /* Releases the event sources and closes the three sockets; accepts NULL and servers that are not running. */ + bool running; + + if (!server) + return 0; + + running = sd_dhcp_server_is_running(server); /* sampled before the event sources are released, so the log line below is accurate */ + + server->receive_message = sd_event_source_disable_unref(server->receive_message); + server->receive_broadcast = sd_event_source_disable_unref(server->receive_broadcast); + + server->fd_raw = safe_close(server->fd_raw); + server->fd = safe_close(server->fd); + server->fd_broadcast = safe_close(server->fd_broadcast); + + if (running) + log_dhcp_server(server, "STOPPED"); + + return 0; +} + +static bool dhcp_request_contains(DHCPRequest *req, uint8_t option) { /* True if the option code occurs in the request's parameter request list. */ + assert(req); + + if (!req->parameter_request_list) + return false; + + return memchr(req->parameter_request_list, option, req->parameter_request_list_len); +} + +static int dhcp_server_send_unicast_raw( + sd_dhcp_server *server, + uint8_t hlen, + const uint8_t *chaddr, + DHCPPacket *packet, + size_t len) { + + assert(server); /* checked before the initializer below reads server->ifindex */ + assert(server->ifindex > 0); + assert(server->address != 0); + assert(hlen > 0); + assert(chaddr); + assert(packet); + assert(len > sizeof(DHCPPacket)); + + union sockaddr_union link = { + .ll.sll_family = AF_PACKET, + .ll.sll_protocol = htobe16(ETH_P_IP), + .ll.sll_ifindex = server->ifindex, + .ll.sll_halen = hlen, + }; + + memcpy(link.ll.sll_addr, chaddr, hlen); /* NOTE(review): assumes hlen bytes fit behind sll_addr inside sockaddr_union; confirm */ + + if (len > UINT16_MAX) + return -EOVERFLOW; + + dhcp_packet_append_ip_headers(packet, server->address, DHCP_PORT_SERVER, + packet->dhcp.yiaddr, + DHCP_PORT_CLIENT, len, -1); + + return dhcp_network_send_raw_socket(server->fd_raw, &link, packet, len); +} + +static int dhcp_server_send_udp(sd_dhcp_server *server, be32_t destination, + uint16_t destination_port, + DHCPMessage *message, size_t len) { + union
sockaddr_union dest = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(destination_port), + .in.sin_addr.s_addr = destination, + }; + struct iovec iov = { + .iov_base = message, + .iov_len = len, + }; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct in_pktinfo))) control = {}; + struct msghdr msg = { + .msg_name = &dest, + .msg_namelen = sizeof(dest.in), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + struct cmsghdr *cmsg; + struct in_pktinfo *pktinfo; + + assert(server); + assert(server->fd >= 0); + assert(message); + assert(len >= sizeof(DHCPMessage)); + + if (server->bind_to_interface) { /* the control message is attached only when bind_to_interface is set */ + msg.msg_control = &control; + msg.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msg); + assert(cmsg); + + cmsg->cmsg_level = IPPROTO_IP; + cmsg->cmsg_type = IP_PKTINFO; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); + + /* we attach source interface and address info to the message + rather than binding the socket. This will be mostly useful + when we gain support for arbitrary number of server addresses + */ + pktinfo = CMSG_TYPED_DATA(cmsg, struct in_pktinfo); + assert(pktinfo); + + pktinfo->ipi_ifindex = server->ifindex; + pktinfo->ipi_spec_dst.s_addr = server->address; + } + + if (sendmsg(server->fd, &msg, 0) < 0) + return -errno; + + return 0; +} + +static bool requested_broadcast(DHCPMessage *message) { /* Tests the 0x8000 bit of the flags field, comparing in network byte order. */ + assert(message); + return message->flags & htobe16(0x8000); +} + +static int dhcp_server_send( + sd_dhcp_server *server, + uint8_t hlen, + const uint8_t *chaddr, + be32_t destination, + uint16_t destination_port, + DHCPPacket *packet, + size_t optoffset, + bool l2_broadcast) { /* Picks the transport: UDP to destination if one is given, else UDP to the broadcast address when l2_broadcast is set, else a raw unicast frame. */ + + if (destination != INADDR_ANY) + return dhcp_server_send_udp(server, destination, + destination_port, &packet->dhcp, + sizeof(DHCPMessage) + optoffset); + else if (l2_broadcast) + return dhcp_server_send_udp(server, INADDR_BROADCAST, + destination_port, &packet->dhcp, + sizeof(DHCPMessage) + optoffset); + else + /* we cannot send UDP packet to specific MAC address when the
+ address is not yet configured, so must fall back to raw + packets */ + return dhcp_server_send_unicast_raw(server, hlen, chaddr, packet, + sizeof(DHCPPacket) + optoffset); +} + +int dhcp_server_send_packet(sd_dhcp_server *server, + DHCPRequest *req, DHCPPacket *packet, + int type, size_t optoffset) { /* Appends the server identifier, the request's relay agent information (if present and if it fits) and the END option, then picks the destination and sends the packet. */ + be32_t destination = INADDR_ANY; + uint16_t destination_port = DHCP_PORT_CLIENT; + int r; + + assert(server); + assert(req); + assert(req->max_optlen > 0); + assert(req->message); + assert(optoffset <= req->max_optlen); + assert(packet); + + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &optoffset, 0, + SD_DHCP_OPTION_SERVER_IDENTIFIER, + 4, &server->address); + if (r < 0) + return r; + + if (req->agent_info_option) { + size_t opt_full_length = *(req->agent_info_option + 1) + 2; /* value of the second byte plus two; presumably payload length plus the code and length bytes, confirm */ + /* there must be space left for SD_DHCP_OPTION_END */ + if (optoffset + opt_full_length < req->max_optlen) { + memcpy(packet->dhcp.options + optoffset, req->agent_info_option, opt_full_length); + optoffset += opt_full_length; + } + } + + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &optoffset, 0, + SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + /* RFC 2131 Section 4.1 + + If the ’giaddr’ field in a DHCP message from a client is non-zero, + the server sends any return messages to the ’DHCP server’ port on the + BOOTP relay agent whose address appears in ’giaddr’. If the ’giaddr’ + field is zero and the ’ciaddr’ field is nonzero, then the server + unicasts DHCPOFFER and DHCPACK messages to the address in ’ciaddr’. + If ’giaddr’ is zero and ’ciaddr’ is zero, and the broadcast bit is + set, then the server broadcasts DHCPOFFER and DHCPACK messages to + 0xffffffff. If the broadcast bit is not set and ’giaddr’ is zero and + ’ciaddr’ is zero, then the server unicasts DHCPOFFER and DHCPACK + messages to the client’s hardware address and ’yiaddr’ address. In + all cases, when ’giaddr’ is zero, the server broadcasts any DHCPNAK + messages to 0xffffffff.
+ + Section 4.3.2 + + If ’giaddr’ is set in the DHCPREQUEST message, the client is on a + different subnet. The server MUST set the broadcast bit in the + DHCPNAK, so that the relay agent will broadcast the DHCPNAK to the + client, because the client may not have a correct network address + or subnet mask, and the client may not be answering ARP requests. + */ + if (req->message->giaddr != 0) { + destination = req->message->giaddr; + destination_port = DHCP_PORT_SERVER; + if (type == DHCP_NAK) + packet->dhcp.flags = htobe16(0x8000); + } else if (req->message->ciaddr != 0 && type != DHCP_NAK) + destination = req->message->ciaddr; + + bool l2_broadcast = requested_broadcast(req->message) || type == DHCP_NAK; /* NAKs are always treated as broadcast */ + return dhcp_server_send(server, req->message->hlen, req->message->chaddr, + destination, destination_port, packet, optoffset, l2_broadcast); +} + +static int server_message_init( + sd_dhcp_server *server, + DHCPPacket **ret, + uint8_t type, + size_t *ret_optoffset, + DHCPRequest *req) { /* Allocates a zeroed reply packet with room for req->max_optlen option bytes and initializes it as a BOOTREPLY of the given type; the caller owns *ret. */ + + _cleanup_free_ DHCPPacket *packet = NULL; + size_t optoffset = 0; + int r; + + assert(server); + assert(ret); + assert(ret_optoffset); + assert(IN_SET(type, DHCP_OFFER, DHCP_ACK, DHCP_NAK)); + assert(req); + + packet = malloc0(sizeof(DHCPPacket) + req->max_optlen); + if (!packet) + return -ENOMEM; + + r = dhcp_message_init(&packet->dhcp, BOOTREPLY, + be32toh(req->message->xid), type, + req->message->htype, req->message->hlen, req->message->chaddr, + req->max_optlen, &optoffset); + if (r < 0) + return r; + + packet->dhcp.flags = req->message->flags; /* flags and giaddr are copied from the request */ + packet->dhcp.giaddr = req->message->giaddr; + + *ret_optoffset = optoffset; + *ret = TAKE_PTR(packet); + + return 0; +} + +static int server_send_offer_or_ack( + sd_dhcp_server *server, + DHCPRequest *req, + be32_t address, + uint8_t type) { + + static const uint8_t option_map[_SD_DHCP_LEASE_SERVER_TYPE_MAX] = { /* DHCP option code used for each server list type */ + [SD_DHCP_LEASE_DNS] = SD_DHCP_OPTION_DOMAIN_NAME_SERVER, + [SD_DHCP_LEASE_NTP] = SD_DHCP_OPTION_NTP_SERVER, +
[SD_DHCP_LEASE_SIP] = SD_DHCP_OPTION_SIP_SERVER, + [SD_DHCP_LEASE_POP3] = SD_DHCP_OPTION_POP3_SERVER, + [SD_DHCP_LEASE_SMTP] = SD_DHCP_OPTION_SMTP_SERVER, + [SD_DHCP_LEASE_LPR] = SD_DHCP_OPTION_LPR_SERVER, + }; + + _cleanup_free_ DHCPPacket *packet = NULL; + sd_dhcp_option *j; + be32_t lease_time; + size_t offset; + int r; + + assert(server); + assert(req); + assert(IN_SET(type, DHCP_OFFER, DHCP_ACK)); + + r = server_message_init(server, &packet, type, &offset, req); + if (r < 0) + return r; + + packet->dhcp.yiaddr = address; /* the address being offered or acknowledged */ + packet->dhcp.siaddr = server->boot_server_address.s_addr; + + lease_time = usec_to_be32_sec(req->lifetime); /* converted up front because the option payload is passed by pointer */ + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_IP_ADDRESS_LEASE_TIME, 4, + &lease_time); + if (r < 0) + return r; + + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_SUBNET_MASK, 4, &server->netmask); + if (r < 0) + return r; + + if (server->emit_router) { /* advertises router_address when it is set, otherwise the server's own address */ + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_ROUTER, 4, + in4_addr_is_set(&server->router_address) ?
+ &server->router_address.s_addr : + &server->address); + if (r < 0) + return r; + } + + if (server->boot_server_name) { + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_BOOT_SERVER_NAME, + strlen(server->boot_server_name), server->boot_server_name); /* length excludes the trailing NUL */ + if (r < 0) + return r; + } + + if (server->boot_filename) { + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_BOOT_FILENAME, + strlen(server->boot_filename), server->boot_filename); + if (r < 0) + return r; + } + + for (sd_dhcp_lease_server_type_t k = 0; k < _SD_DHCP_LEASE_SERVER_TYPE_MAX; k++) { + if (server->servers[k].size <= 0) /* skip server types with no addresses configured */ + continue; + + r = dhcp_option_append( + &packet->dhcp, req->max_optlen, &offset, 0, + option_map[k], + sizeof(struct in_addr) * server->servers[k].size, + server->servers[k].addr); + if (r < 0) + return r; + } + + if (server->timezone) { + r = dhcp_option_append( + &packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_TZDB_TIMEZONE, + strlen(server->timezone), server->timezone); + if (r < 0) + return r; + } + + /* RFC 8925 section 3.3. DHCPv4 Server Behavior + * The server MUST NOT include the IPv6-Only Preferred option in the DHCPOFFER or DHCPACK message if + * the option was not present in the Parameter Request List sent by the client.
*/ + if (dhcp_request_contains(req, SD_DHCP_OPTION_IPV6_ONLY_PREFERRED) && + server->ipv6_only_preferred_usec > 0) { + be32_t sec = usec_to_be32_sec(server->ipv6_only_preferred_usec); + + r = dhcp_option_append( + &packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_IPV6_ONLY_PREFERRED, + sizeof(sec), &sec); + if (r < 0) + return r; + } + + ORDERED_SET_FOREACH(j, server->extra_options) { + r = dhcp_option_append(&packet->dhcp, req->max_optlen, &offset, 0, + j->option, j->length, j->data); + if (r < 0) + return r; + } + + if (!ordered_set_isempty(server->vendor_options)) { /* NOTE(review): the set itself and its element count are passed as data and length; presumably dhcp_option_append() special-cases this option code, confirm */ + r = dhcp_option_append( + &packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_VENDOR_SPECIFIC, + ordered_set_size(server->vendor_options), server->vendor_options); + if (r < 0) + return r; + } + + if (server->rapid_commit && req->rapid_commit && type == DHCP_ACK) { /* added only to ACKs, and only when both the server setting and the request have rapid commit set */ + r = dhcp_option_append( + &packet->dhcp, req->max_optlen, &offset, 0, + SD_DHCP_OPTION_RAPID_COMMIT, + 0, NULL); + if (r < 0) + return r; + } + + return dhcp_server_send_packet(server, req, packet, type, offset); +} + +static int server_send_nak_or_ignore(sd_dhcp_server *server, bool init_reboot, DHCPRequest *req) { /* Returns DHCP_NAK after sending a NAK, 0 when the request is ignored, or a negative errno. */ + _cleanup_free_ DHCPPacket *packet = NULL; + size_t offset; + int r; + + /* When a request is refused, RFC 2131, section 4.3.2 mentioned we should send NAK when the + * client is in INITREBOOT. If the client is in other state, there is nothing mentioned in the + * RFC whether we should send NAK or not. Hence, let's silently ignore the request.
*/ + + if (!init_reboot) + return 0; + + r = server_message_init(server, &packet, DHCP_NAK, &offset, req); + if (r < 0) + return log_dhcp_server_errno(server, r, "Failed to create NAK message: %m"); + + r = dhcp_server_send_packet(server, req, packet, DHCP_NAK, offset); + if (r < 0) + return log_dhcp_server_errno(server, r, "Could not send NAK message: %m"); + + log_dhcp_server(server, "NAK (0x%x)", be32toh(req->message->xid)); + return DHCP_NAK; +} + +static int server_send_forcerenew( + sd_dhcp_server *server, + be32_t address, + be32_t gateway, /* not referenced in the function body */ + uint8_t htype, + uint8_t hlen, + const uint8_t *chaddr) { /* Builds a DHCP_FORCERENEW reply with xid 0 that carries only the END option and sends it by UDP to the client port of the given address. */ + + _cleanup_free_ DHCPPacket *packet = NULL; + size_t optoffset = 0; + int r; + + assert(server); + assert(address != INADDR_ANY); + assert(chaddr); + + packet = malloc0(sizeof(DHCPPacket) + DHCP_MIN_OPTIONS_SIZE); + if (!packet) + return -ENOMEM; + + r = dhcp_message_init(&packet->dhcp, BOOTREPLY, 0, + DHCP_FORCERENEW, htype, hlen, chaddr, + DHCP_MIN_OPTIONS_SIZE, &optoffset); + if (r < 0) + return r; + + r = dhcp_option_append(&packet->dhcp, DHCP_MIN_OPTIONS_SIZE, + &optoffset, 0, SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + + return dhcp_server_send_udp(server, address, DHCP_PORT_CLIENT, + &packet->dhcp, + sizeof(DHCPMessage) + optoffset); +} + +static int parse_request(uint8_t code, uint8_t len, const void *option, void *userdata) { /* Option callback: records the options handled below in the DHCPRequest passed as userdata; fixed-size options of unexpected length are skipped. */ + DHCPRequest *req = ASSERT_PTR(userdata); + int r; + + switch (code) { + case SD_DHCP_OPTION_IP_ADDRESS_LEASE_TIME: + if (len == 4) + req->lifetime = unaligned_be32_sec_to_usec(option, /* max_as_infinity = */ true); + + break; + case SD_DHCP_OPTION_REQUESTED_IP_ADDRESS: + if (len == 4) + memcpy(&req->requested_ip, option, sizeof(be32_t)); + + break; + case SD_DHCP_OPTION_SERVER_IDENTIFIER: + if (len == 4) + memcpy(&req->server_id, option, sizeof(be32_t)); + + break; + case SD_DHCP_OPTION_CLIENT_IDENTIFIER: + if (len >= 2) { /* shorter client identifiers are ignored */ + uint8_t *data; + + data = memdup(option, len); + if (!data) + return -ENOMEM; + +
free_and_replace(req->client_id.data, data); + req->client_id.length = len; + } + + break; + case SD_DHCP_OPTION_MAXIMUM_MESSAGE_SIZE: + + if (len == 2 && unaligned_read_be16(option) >= sizeof(DHCPPacket)) /* values smaller than a bare DHCPPacket are ignored */ + req->max_optlen = unaligned_read_be16(option) - sizeof(DHCPPacket); + + break; + case SD_DHCP_OPTION_RELAY_AGENT_INFORMATION: + req->agent_info_option = (uint8_t*)option - 2; /* two bytes before the payload, presumably the option's code byte (confirm); a pointer into the caller's buffer, not a copy */ + + break; + case SD_DHCP_OPTION_HOST_NAME: + r = dhcp_option_parse_string(option, len, &req->hostname); + if (r < 0) { + log_debug_errno(r, "Failed to parse hostname, ignoring: %m"); + return 0; + } + + break; + case SD_DHCP_OPTION_PARAMETER_REQUEST_LIST: + req->parameter_request_list = option; /* stored as a pointer, not copied */ + req->parameter_request_list_len = len; + break; + + case SD_DHCP_OPTION_RAPID_COMMIT: + req->rapid_commit = true; + break; + } + + return 0; +} + +static DHCPRequest* dhcp_request_free(DHCPRequest *req) { /* Frees the request together with its client ID buffer and hostname; no other pointer stored in it is freed. */ + if (!req) + return NULL; + + free(req->client_id.data); + free(req->hostname); + return mfree(req); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(DHCPRequest*, dhcp_request_free); + +static int ensure_sane_request(sd_dhcp_server *server, DHCPRequest *req, DHCPMessage *message) { /* Validates the hardware address and fills in defaults for the client ID, the option space and the lifetime. */ + assert(req); + assert(message); + + req->message = message; + + if (message->hlen > sizeof(message->chaddr)) + return -EBADMSG; + + /* set client id based on MAC address if client did not send an explicit one */ + if (!req->client_id.data) { + uint8_t *data; + + if (message->hlen == 0) + return -EBADMSG; + + data = new0(uint8_t, message->hlen + 1); + if (!data) + return -ENOMEM; + + data[0] = 0x01; + memcpy(data + 1, message->chaddr, message->hlen); + + req->client_id.length = message->hlen + 1; + req->client_id.data = data; + } + + if (message->hlen == 0 || memeqzero(message->chaddr, message->hlen)) { + /* See RFC2131 section 4.1.1. + * hlen and chaddr may not be set for non-ethernet interface. + * Let's try to retrieve it from the client ID.
*/ + + if (!req->client_id.data) /* NOTE(review): client_id.data was filled in above whenever it was missing, so this check looks unreachable; confirm */ + return -EBADMSG; + + if (req->client_id.length <= 1 || req->client_id.length > sizeof(message->chaddr) + 1) + return -EBADMSG; + + if (req->client_id.data[0] != 0x01) + return -EBADMSG; + + message->hlen = req->client_id.length - 1; + memcpy(message->chaddr, req->client_id.data + 1, message->hlen); + } + + if (req->max_optlen < DHCP_MIN_OPTIONS_SIZE) + req->max_optlen = DHCP_MIN_OPTIONS_SIZE; + + if (req->lifetime <= 0) /* no usable lifetime requested: use the server default, but at least one second */ + req->lifetime = MAX(USEC_PER_SEC, server->default_lease_time); + + if (server->max_lease_time > 0 && req->lifetime > server->max_lease_time) /* a max_lease_time of 0 disables the cap */ + req->lifetime = server->max_lease_time; + + return 0; +} + +static void request_set_timestamp(DHCPRequest *req, const triple_timestamp *timestamp) { /* Uses the supplied timestamp when it is set, otherwise the current time. */ + assert(req); + + if (timestamp && triple_timestamp_is_set(timestamp)) + req->timestamp = *timestamp; + else + triple_timestamp_now(&req->timestamp); +} + +static int request_get_lifetime_timestamp(DHCPRequest *req, clockid_t clock, usec_t *ret) { /* Computes, on the given clock, the request timestamp plus the lifetime; -ENODATA if either is unset. */ + assert(req); + assert(TRIPLE_TIMESTAMP_HAS_CLOCK(clock)); + assert(clock_supported(clock)); + assert(ret); + + if (req->lifetime <= 0) + return -ENODATA; + + if (!triple_timestamp_is_set(&req->timestamp)) + return -ENODATA; + + *ret = usec_add(triple_timestamp_by_clock(&req->timestamp, clock), req->lifetime); + return 0; +} + +static bool address_is_in_pool(sd_dhcp_server *server, be32_t address) { /* The server's own address and statically leased addresses never count as part of the pool. */ + assert(server); + + if (server->pool_size == 0) + return false; + + if (address == server->address) + return false; + + if (be32toh(address) < (be32toh(server->subnet) | server->pool_offset) || + be32toh(address) >= (be32toh(server->subnet) | (server->pool_offset + server->pool_size))) + return false; + + if (hashmap_contains(server->static_leases_by_address, UINT32_TO_PTR(address))) + return false; + + return true; +} + +static int append_agent_information_option(sd_dhcp_server *server, DHCPMessage *message, size_t opt_length, size_t size) { + int r; + size_t offset; + +
assert(server); + assert(message); + + r = dhcp_option_find_option(message->options, opt_length, SD_DHCP_OPTION_END, &offset); /* locate the END option; the new option is appended from that offset */ + if (r < 0) + return r; + + r = dhcp_option_append(message, size, &offset, 0, SD_DHCP_OPTION_RELAY_AGENT_INFORMATION, 0, server); /* NOTE(review): the server object is passed as option data; presumably dhcp_option_append() builds the payload from it, confirm */ + if (r < 0) + return r; + + r = dhcp_option_append(message, size, &offset, 0, SD_DHCP_OPTION_END, 0, NULL); + if (r < 0) + return r; + return offset; /* on success the result is the offset after the appended options */ +} + +static int dhcp_server_relay_message(sd_dhcp_server *server, DHCPMessage *message, size_t opt_length, size_t buflen) { /* Relay mode: BOOTREQUESTs are forwarded to relay_target, BOOTREPLYs are passed on towards the client; anything else yields -EBADMSG. */ + _cleanup_free_ DHCPPacket *packet = NULL; + int r; + + assert(server); + assert(message); + assert(sd_dhcp_server_is_in_relay_mode(server)); + + if (message->hlen == 0 || message->hlen > sizeof(message->chaddr) || memeqzero(message->chaddr, message->hlen)) + return log_dhcp_server_errno(server, SYNTHETIC_ERRNO(EBADMSG), + "(relay agent) received message without/invalid hardware address, discarding."); + + if (message->op == BOOTREQUEST) { + log_dhcp_server(server, "(relay agent) BOOTREQUEST (0x%x)", be32toh(message->xid)); + if (message->hops >= 16) /* requests whose hop count has reached 16 are dropped */ + return -ETIME; + message->hops++; + + /* https://tools.ietf.org/html/rfc1542#section-4.1.1 */ + if (message->giaddr == 0) + message->giaddr = server->address; + + if (server->agent_circuit_id || server->agent_remote_id) { + r = append_agent_information_option(server, message, opt_length, buflen - sizeof(DHCPMessage)); + if (r < 0) + return log_dhcp_server_errno(server, r, "could not append relay option: %m"); + opt_length = r; + } + + return dhcp_server_send_udp(server, server->relay_target.s_addr, DHCP_PORT_SERVER, message, sizeof(DHCPMessage) + opt_length); + } else if (message->op == BOOTREPLY) { + log_dhcp_server(server, "(relay agent) BOOTREPLY (0x%x)", be32toh(message->xid)); + if (message->giaddr != server->address) /* only replies whose giaddr equals this server's address are relayed */ + return log_dhcp_server_errno(server, SYNTHETIC_ERRNO(EBADMSG), + "(relay agent) BOOTREPLY giaddr mismatch, discarding"); + + int message_type = dhcp_option_parse(message,
sizeof(DHCPMessage) + opt_length, NULL, NULL, NULL); + if (message_type < 0) + return message_type; + + packet = malloc0(sizeof(DHCPPacket) + opt_length); + if (!packet) + return -ENOMEM; + memcpy(&packet->dhcp, message, sizeof(DHCPMessage) + opt_length); + + r = dhcp_option_remove_option(packet->dhcp.options, opt_length, SD_DHCP_OPTION_RELAY_AGENT_INFORMATION); + if (r > 0) /* a positive result is taken as the new options length; other results leave opt_length unchanged */ + opt_length = r; + + bool l2_broadcast = requested_broadcast(message) || message_type == DHCP_NAK; + const be32_t destination = message_type == DHCP_NAK ? INADDR_ANY : message->ciaddr; + return dhcp_server_send(server, message->hlen, message->chaddr, destination, DHCP_PORT_CLIENT, packet, opt_length, l2_broadcast); + } + return -EBADMSG; +} + +static int prepare_new_lease(DHCPLease **ret_lease, be32_t address, DHCPRequest *req, usec_t expiration) { /* Builds a lease holding its own copies of the request's client ID and hostname; the lease's server field is left unset. */ + _cleanup_(dhcp_lease_freep) DHCPLease *lease = NULL; + + assert(ret_lease); + assert(address != 0); + assert(req); + assert(expiration != 0); + + lease = new(DHCPLease, 1); + if (!lease) + return -ENOMEM; + + *lease = (DHCPLease) { + .address = address, + .client_id.length = req->client_id.length, + .htype = req->message->htype, + .hlen = req->message->hlen, + .gateway = req->message->giaddr, + .expiration = expiration, + }; + lease->client_id.data = memdup(req->client_id.data, req->client_id.length); + if (!lease->client_id.data) + return -ENOMEM; + + memcpy(lease->chaddr, req->message->chaddr, req->message->hlen); + + if (req->hostname) { + lease->hostname = strdup(req->hostname); + if (!lease->hostname) + return -ENOMEM; + } + + *ret_lease = TAKE_PTR(lease); + + return 0; +} + +static int server_ack_request(sd_dhcp_server *server, DHCPRequest *req, DHCPLease *existing_lease, be32_t address) { /* Refreshes the expiration of an existing lease or registers a new one, then sends the ACK; returns DHCP_ACK on success. */ + usec_t expiration; + int r; + + assert(server); + assert(req); + assert(address != 0); + + r = request_get_lifetime_timestamp(req, CLOCK_BOOTTIME, &expiration); + if (r < 0) + return r; + + if (existing_lease) { + assert(existing_lease->server); +
assert(existing_lease->address == address); + existing_lease->expiration = expiration; + + } else { + _cleanup_(dhcp_lease_freep) DHCPLease *lease = NULL; + + r = prepare_new_lease(&lease, address, req, expiration); + if (r < 0) + return log_dhcp_server_errno(server, r, "Failed to create new lease: %m"); + + lease->server = server; /* This must be set just before hashmap_put(). */ + + r = hashmap_ensure_put(&server->bound_leases_by_client_id, &dhcp_lease_hash_ops, &lease->client_id, lease); + if (r < 0) + return log_dhcp_server_errno(server, r, "Could not save lease: %m"); + + r = hashmap_ensure_put(&server->bound_leases_by_address, NULL, UINT32_TO_PTR(lease->address), lease); + if (r < 0) + return log_dhcp_server_errno(server, r, "Could not save lease: %m"); + + TAKE_PTR(lease); + } + + r = server_send_offer_or_ack(server, req, address, DHCP_ACK); + if (r < 0) + return log_dhcp_server_errno(server, r, "Could not send ACK: %m"); + + log_dhcp_server(server, "ACK (0x%x)", be32toh(req->message->xid)); + + if (server->callback) + server->callback(server, SD_DHCP_SERVER_EVENT_LEASE_CHANGED, server->callback_userdata); + + return DHCP_ACK; +} + +static int dhcp_server_cleanup_expired_leases(sd_dhcp_server *server) { + DHCPLease *lease; + usec_t time_now; + int r; + + assert(server); + + r = sd_event_now(server->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + return r; + + HASHMAP_FOREACH(lease, server->bound_leases_by_client_id) + if (lease->expiration < time_now) { + log_dhcp_server(server, "CLEAN (0x%x)", be32toh(lease->address)); + dhcp_lease_free(lease); + } + + return 0; +} + +static bool address_available(sd_dhcp_server *server, be32_t address) { + assert(server); + + if (hashmap_contains(server->bound_leases_by_address, UINT32_TO_PTR(address)) || + hashmap_contains(server->static_leases_by_address, UINT32_TO_PTR(address)) || + address == server->address) + return false; + + return true; +} + +static int server_get_static_lease(sd_dhcp_server *server, const 
DHCPRequest *req, DHCPLease **ret) { + DHCPLease *static_lease; + _cleanup_free_ uint8_t *data = NULL; + + assert(server); + assert(req); + assert(ret); + + static_lease = hashmap_get(server->static_leases_by_client_id, &req->client_id); + if (static_lease) { + *ret = static_lease; + return 0; + } + + /* when no lease is found based on the client id fall back to chaddr */ + data = new(uint8_t, req->message->hlen + 1); + if (!data) + return -ENOMEM; + + /* set client id type to 1: Ethernet Link-Layer (RFC 2132) */ + data[0] = 0x01; + memcpy(data + 1, req->message->chaddr, req->message->hlen); + + static_lease = hashmap_get(server->static_leases_by_client_id, + &(DHCPClientId) { + .length = req->message->hlen + 1, + .data = data, + }); + + *ret = static_lease; + + return 0; +} + +#define HASH_KEY SD_ID128_MAKE(0d,1d,fe,bd,f1,24,bd,b3,47,f1,dd,6e,73,21,93,30) + +int dhcp_server_handle_message(sd_dhcp_server *server, DHCPMessage *message, size_t length, const triple_timestamp *timestamp) { + _cleanup_(dhcp_request_freep) DHCPRequest *req = NULL; + _cleanup_free_ char *error_message = NULL; + DHCPLease *existing_lease, *static_lease; + int type, r; + + assert(server); + assert(message); + + if (message->op != BOOTREQUEST) + return 0; + + req = new0(DHCPRequest, 1); + if (!req) + return -ENOMEM; + + type = dhcp_option_parse(message, length, parse_request, req, &error_message); + if (type < 0) + return type; + + r = ensure_sane_request(server, req, message); + if (r < 0) + return r; + + request_set_timestamp(req, timestamp); + + r = dhcp_server_cleanup_expired_leases(server); + if (r < 0) + return r; + + existing_lease = hashmap_get(server->bound_leases_by_client_id, &req->client_id); + r = server_get_static_lease(server, req, &static_lease); + if (r < 0) + return r; + + switch (type) { + + case DHCP_DISCOVER: { + be32_t address = INADDR_ANY; + + log_dhcp_server(server, "DISCOVER (0x%x)", be32toh(req->message->xid)); + + if (server->pool_size == 0) + /* no pool allocated 
*/ + return 0; + + /* for now pick a random free address from the pool */ + if (static_lease) + address = static_lease->address; + else if (existing_lease) + address = existing_lease->address; + else { + struct siphash state; + uint64_t hash; + + /* even with no persistence of leases, we try to offer the same client + the same IP address. we do this by using the hash of the client id + as the offset into the pool of leases when finding the next free one */ + + siphash24_init(&state, HASH_KEY.bytes); + client_id_hash_func(&req->client_id, &state); + hash = htole64(siphash24_finalize(&state)); + + for (unsigned i = 0; i < server->pool_size; i++) { + be32_t tmp_address; + + tmp_address = server->subnet | htobe32(server->pool_offset + (hash + i) % server->pool_size); + if (address_available(server, tmp_address)) { + address = tmp_address; + break; + } + } + } + + if (address == INADDR_ANY) + /* no free addresses left */ + return 0; + + if (server->rapid_commit && req->rapid_commit) + return server_ack_request(server, req, existing_lease, address); + + r = server_send_offer_or_ack(server, req, address, DHCP_OFFER); + if (r < 0) + /* this only fails on critical errors */ + return log_dhcp_server_errno(server, r, "Could not send offer: %m"); + + log_dhcp_server(server, "OFFER (0x%x)", be32toh(req->message->xid)); + return DHCP_OFFER; + } + case DHCP_DECLINE: + log_dhcp_server(server, "DECLINE (0x%x): %s", be32toh(req->message->xid), strna(error_message)); + + /* TODO: make sure we don't offer this address again */ + + return 1; + + case DHCP_REQUEST: { + be32_t address; + bool init_reboot = false; + + /* see RFC 2131, section 4.3.2 */ + + if (req->server_id != 0) { + log_dhcp_server(server, "REQUEST (selecting) (0x%x)", + be32toh(req->message->xid)); + + /* SELECTING */ + if (req->server_id != server->address) + /* client did not pick us */ + return 0; + + if (req->message->ciaddr != 0) + /* this MUST be zero */ + return 0; + + if (req->requested_ip == 0) + /* this must 
be filled in with the yiaddr + from the chosen OFFER */ + return 0; + + address = req->requested_ip; + } else if (req->requested_ip != 0) { + log_dhcp_server(server, "REQUEST (init-reboot) (0x%x)", + be32toh(req->message->xid)); + + /* INIT-REBOOT */ + if (req->message->ciaddr != 0) + /* this MUST be zero */ + return 0; + + /* TODO: check more carefully if IP is correct */ + address = req->requested_ip; + init_reboot = true; + } else { + log_dhcp_server(server, "REQUEST (rebinding/renewing) (0x%x)", + be32toh(req->message->xid)); + + /* REBINDING / RENEWING */ + if (req->message->ciaddr == 0) + /* this MUST be filled in with clients IP address */ + return 0; + + address = req->message->ciaddr; + } + + /* Silently ignore Rapid Commit option in REQUEST message. */ + req->rapid_commit = false; + + /* disallow our own address */ + if (address == server->address) + return 0; + + if (static_lease) { + /* Found a static lease for the client ID. */ + + if (static_lease->address != address) + /* The client requested an address which is different from the static lease. Refuse. */ + return server_send_nak_or_ignore(server, init_reboot, req); + + return server_ack_request(server, req, existing_lease, address); + } + + if (address_is_in_pool(server, address)) { + /* The requested address is in the pool. */ + + if (existing_lease && existing_lease->address != address) + /* We previously assigned an address, but the client requested another one. Refuse. 
/* Computes the number of bytes the given relay agent sub-options occupy: for each string that is not
 * NULL, its length plus two extra bytes (presumably the sub-option code and length octets — confirm
 * against the option encoder). Returns 0 when both arguments are NULL. */
static size_t relay_agent_information_length(const char* agent_circuit_id, const char* agent_remote_id) {
        const size_t circuit_part = agent_circuit_id ? strlen(agent_circuit_id) + 2 : 0;
        const size_t remote_part = agent_remote_id ? strlen(agent_remote_id) + 2 : 0;

        return circuit_part + remote_part;
}
*/ + CMSG_BUFFER_TYPE(CMSG_SPACE_TIMEVAL + + CMSG_SPACE(sizeof(struct in_pktinfo))) control = {}; + sd_dhcp_server *server = ASSERT_PTR(userdata); + struct iovec iov = {}; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + ssize_t datagram_size, len; + int r; + + datagram_size = next_datagram_size_fd(fd); + if (ERRNO_IS_NEG_TRANSIENT(datagram_size) || ERRNO_IS_NEG_DISCONNECT(datagram_size)) + return 0; + if (datagram_size < 0) { + log_dhcp_server_errno(server, datagram_size, "Failed to determine datagram size to read, ignoring: %m"); + return 0; + } + + size_t buflen = datagram_size; + if (sd_dhcp_server_is_in_relay_mode(server)) + /* Preallocate the additional size for DHCP Relay Agent Information Option if needed */ + buflen += relay_agent_information_length(server->agent_circuit_id, server->agent_remote_id) + 2; + + message = malloc(buflen); + if (!message) + return -ENOMEM; + + iov = IOVEC_MAKE(message, datagram_size); + + len = recvmsg_safe(fd, &msg, 0); + if (ERRNO_IS_NEG_TRANSIENT(len) || ERRNO_IS_NEG_DISCONNECT(len)) + return 0; + if (len < 0) { + log_dhcp_server_errno(server, len, "Could not receive message, ignoring: %m"); + return 0; + } + + if ((size_t) len < sizeof(DHCPMessage)) + return 0; + + /* TODO figure out if this can be done as a filter on the socket, like for IPv6 */ + struct in_pktinfo *info = CMSG_FIND_DATA(&msg, IPPROTO_IP, IP_PKTINFO, struct in_pktinfo); + if (info && info->ipi_ifindex != server->ifindex) + return 0; + + if (sd_dhcp_server_is_in_relay_mode(server)) { + r = dhcp_server_relay_message(server, message, len - sizeof(DHCPMessage), buflen); + if (r < 0) + log_dhcp_server_errno(server, r, "Couldn't relay message, ignoring: %m"); + } else { + r = dhcp_server_handle_message(server, message, (size_t) len, TRIPLE_TIMESTAMP_FROM_CMSG(&msg)); + if (r < 0) + log_dhcp_server_errno(server, r, "Couldn't process incoming message, ignoring: %m"); + } + return 0; 
+} + +static void dhcp_server_update_lease_servers(sd_dhcp_server *server) { + assert(server); + assert(server->address != 0); + + /* Convert null address -> server address */ + + for (sd_dhcp_lease_server_type_t k = 0; k < _SD_DHCP_LEASE_SERVER_TYPE_MAX; k++) + for (size_t i = 0; i < server->servers[k].size; i++) + if (in4_addr_is_null(&server->servers[k].addr[i])) + server->servers[k].addr[i].s_addr = server->address; +} + +int sd_dhcp_server_start(sd_dhcp_server *server) { + int r; + + assert_return(server, -EINVAL); + assert_return(server->event, -EINVAL); + + if (sd_dhcp_server_is_running(server)) + return 0; + + assert_return(!server->receive_message, -EBUSY); + assert_return(server->fd_raw < 0, -EBUSY); + assert_return(server->fd < 0, -EBUSY); + assert_return(server->address != htobe32(INADDR_ANY), -EUNATCH); + + dhcp_server_update_lease_servers(server); + + r = socket(AF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (r < 0) { + r = -errno; + goto on_error; + } + server->fd_raw = r; + + if (server->bind_to_interface) + r = dhcp_network_bind_udp_socket(server->ifindex, INADDR_ANY, DHCP_PORT_SERVER, -1); + else + r = dhcp_network_bind_udp_socket(0, server->address, DHCP_PORT_SERVER, -1); + if (r < 0) + goto on_error; + server->fd = r; + + r = sd_event_add_io(server->event, &server->receive_message, + server->fd, EPOLLIN, + server_receive_message, server); + if (r < 0) + goto on_error; + + r = sd_event_source_set_priority(server->receive_message, + server->event_priority); + if (r < 0) + goto on_error; + + if (!server->bind_to_interface) { + r = dhcp_network_bind_udp_socket(server->ifindex, INADDR_BROADCAST, DHCP_PORT_SERVER, -1); + if (r < 0) + goto on_error; + + server->fd_broadcast = r; + + r = sd_event_add_io(server->event, &server->receive_broadcast, + server->fd_broadcast, EPOLLIN, + server_receive_message, server); + if (r < 0) + goto on_error; + + r = sd_event_source_set_priority(server->receive_broadcast, + server->event_priority); + if 
(r < 0) + goto on_error; + } + + log_dhcp_server(server, "STARTED"); + + return 0; + +on_error: + sd_dhcp_server_stop(server); + return r; +} + +int sd_dhcp_server_forcerenew(sd_dhcp_server *server) { + DHCPLease *lease; + int r = 0; + + assert_return(server, -EINVAL); + + log_dhcp_server(server, "FORCERENEW"); + + HASHMAP_FOREACH(lease, server->bound_leases_by_client_id) + RET_GATHER(r, + server_send_forcerenew(server, lease->address, lease->gateway, + lease->htype, lease->hlen, lease->chaddr)); + return r; +} + +int sd_dhcp_server_set_bind_to_interface(sd_dhcp_server *server, int enabled) { + assert_return(server, -EINVAL); + assert_return(!sd_dhcp_server_is_running(server), -EBUSY); + + if (!!enabled == server->bind_to_interface) + return 0; + + server->bind_to_interface = enabled; + + return 1; +} + +int sd_dhcp_server_set_timezone(sd_dhcp_server *server, const char *tz) { + int r; + + assert_return(server, -EINVAL); + assert_return(timezone_is_valid(tz, LOG_DEBUG), -EINVAL); + + if (streq_ptr(tz, server->timezone)) + return 0; + + r = free_and_strdup(&server->timezone, tz); + if (r < 0) + return r; + + return 1; +} + +int sd_dhcp_server_set_max_lease_time(sd_dhcp_server *server, uint64_t t) { + assert_return(server, -EINVAL); + + server->max_lease_time = t; + return 0; +} + +int sd_dhcp_server_set_default_lease_time(sd_dhcp_server *server, uint64_t t) { + assert_return(server, -EINVAL); + + server->default_lease_time = t; + return 0; +} + +int sd_dhcp_server_set_ipv6_only_preferred_usec(sd_dhcp_server *server, uint64_t t) { + assert_return(server, -EINVAL); + + /* When 0 is set, disables the IPv6 only mode. */ + + /* Refuse too short timespan unless test mode is enabled. 
*/ + if (t > 0 && t < MIN_V6ONLY_WAIT_USEC && !network_test_mode_enabled()) + return -EINVAL; + + server->ipv6_only_preferred_usec = t; + return 0; +} + +int sd_dhcp_server_set_rapid_commit(sd_dhcp_server *server, int enabled) { + assert_return(server, -EINVAL); + + server->rapid_commit = enabled; + return 0; +} + +int sd_dhcp_server_set_servers( + sd_dhcp_server *server, + sd_dhcp_lease_server_type_t what, + const struct in_addr addresses[], + size_t n_addresses) { + + struct in_addr *c = NULL; + + assert_return(server, -EINVAL); + assert_return(!sd_dhcp_server_is_running(server), -EBUSY); + assert_return(addresses || n_addresses == 0, -EINVAL); + assert_return(what >= 0, -EINVAL); + assert_return(what < _SD_DHCP_LEASE_SERVER_TYPE_MAX, -EINVAL); + + if (server->servers[what].size == n_addresses && + memcmp(server->servers[what].addr, addresses, sizeof(struct in_addr) * n_addresses) == 0) + return 0; + + if (n_addresses > 0) { + c = newdup(struct in_addr, addresses, n_addresses); + if (!c) + return -ENOMEM; + } + + free_and_replace(server->servers[what].addr, c); + server->servers[what].size = n_addresses; + return 1; +} + +int sd_dhcp_server_set_dns(sd_dhcp_server *server, const struct in_addr dns[], size_t n) { + return sd_dhcp_server_set_servers(server, SD_DHCP_LEASE_DNS, dns, n); +} +int sd_dhcp_server_set_ntp(sd_dhcp_server *server, const struct in_addr ntp[], size_t n) { + return sd_dhcp_server_set_servers(server, SD_DHCP_LEASE_NTP, ntp, n); +} +int sd_dhcp_server_set_sip(sd_dhcp_server *server, const struct in_addr sip[], size_t n) { + return sd_dhcp_server_set_servers(server, SD_DHCP_LEASE_SIP, sip, n); +} +int sd_dhcp_server_set_pop3(sd_dhcp_server *server, const struct in_addr pop3[], size_t n) { + return sd_dhcp_server_set_servers(server, SD_DHCP_LEASE_POP3, pop3, n); +} +int sd_dhcp_server_set_smtp(sd_dhcp_server *server, const struct in_addr smtp[], size_t n) { + return sd_dhcp_server_set_servers(server, SD_DHCP_LEASE_SMTP, smtp, n); +} +int 
/* Sets the address DHCP requests are relayed to.
 *
 * Refused with -EBUSY while the server is running and with -EINVAL if server or address is NULL.
 * Returns 1 if the stored target changed and 0 if it already had the requested value. */
int sd_dhcp_server_set_relay_target(sd_dhcp_server *server, const struct in_addr *address) {
        assert_return(server, -EINVAL);
        /* address is dereferenced below, hence reject NULL like the other setters do for their
         * pointer arguments instead of crashing. */
        assert_return(address, -EINVAL);
        assert_return(!sd_dhcp_server_is_running(server), -EBUSY);

        if (memcmp(address, &server->relay_target, sizeof(struct in_addr)) == 0)
                return 0;

        server->relay_target = *address;
        return 1;
}
assert_return(server, -EINVAL); + + if (relay_agent_information_length(agent_circuit_id, agent_remote_id) > UINT8_MAX) + return -ENOBUFS; + + if (agent_circuit_id) { + circuit_id_dup = strdup(agent_circuit_id); + if (!circuit_id_dup) + return -ENOMEM; + } + + if (agent_remote_id) { + remote_id_dup = strdup(agent_remote_id); + if (!remote_id_dup) + return -ENOMEM; + } + + free_and_replace(server->agent_circuit_id, circuit_id_dup); + free_and_replace(server->agent_remote_id, remote_id_dup); + return 0; +} + +int sd_dhcp_server_set_static_lease( + sd_dhcp_server *server, + const struct in_addr *address, + uint8_t *client_id, + size_t client_id_size) { + + _cleanup_(dhcp_lease_freep) DHCPLease *lease = NULL; + int r; + + assert_return(server, -EINVAL); + assert_return(client_id, -EINVAL); + assert_return(client_id_size > 0, -EINVAL); + assert_return(!sd_dhcp_server_is_running(server), -EBUSY); + + /* Static lease with an empty or omitted address is a valid entry, + * the server removes any static lease with the specified mac address. */ + if (!address || address->s_addr == 0) { + DHCPClientId c; + + c = (DHCPClientId) { + .length = client_id_size, + .data = client_id, + }; + + dhcp_lease_free(hashmap_get(server->static_leases_by_client_id, &c)); + return 0; + } + + lease = new(DHCPLease, 1); + if (!lease) + return -ENOMEM; + + *lease = (DHCPLease) { + .address = address->s_addr, + .client_id.length = client_id_size, + }; + lease->client_id.data = memdup(client_id, client_id_size); + if (!lease->client_id.data) + return -ENOMEM; + + lease->server = server; /* This must be set just before hashmap_put(). 
*/ + + r = hashmap_ensure_put(&server->static_leases_by_client_id, &dhcp_lease_hash_ops, &lease->client_id, lease); + if (r < 0) + return r; + r = hashmap_ensure_put(&server->static_leases_by_address, NULL, UINT32_TO_PTR(lease->address), lease); + if (r < 0) + return r; + + TAKE_PTR(lease); + return 0; +} diff --git a/src/libsystemd-network/sd-dhcp6-client.c b/src/libsystemd-network/sd-dhcp6-client.c new file mode 100644 index 0000000..c20367d --- /dev/null +++ b/src/libsystemd-network/sd-dhcp6-client.c @@ -0,0 +1,1594 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014-2015 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include +#include + +#include "sd-dhcp6-client.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "dhcp-identifier.h" +#include "dhcp6-internal.h" +#include "dhcp6-lease-internal.h" +#include "dns-domain.h" +#include "event-util.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "in-addr-util.h" +#include "iovec-util.h" +#include "random-util.h" +#include "socket-util.h" +#include "sort-util.h" +#include "strv.h" +#include "web-util.h" + +#define DHCP6_CLIENT_DONT_DESTROY(client) \ + _cleanup_(sd_dhcp6_client_unrefp) _unused_ sd_dhcp6_client *_dont_destroy_##client = sd_dhcp6_client_ref(client) + +static int client_start_transaction(sd_dhcp6_client *client, DHCP6State state); + +int sd_dhcp6_client_set_callback( + sd_dhcp6_client *client, + sd_dhcp6_client_callback_t cb, + void *userdata) { + + assert_return(client, -EINVAL); + + client->callback = cb; + client->userdata = userdata; + + return 0; +} + +int dhcp6_client_set_state_callback( + sd_dhcp6_client *client, + sd_dhcp6_client_callback_t cb, + void *userdata) { + + assert_return(client, -EINVAL); + + client->state_callback = cb; + client->state_userdata = userdata; + + return 0; +} + +int sd_dhcp6_client_set_ifindex(sd_dhcp6_client *client, int ifindex) { + assert_return(client, -EINVAL); + 
assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(ifindex > 0, -EINVAL); + + client->ifindex = ifindex; + return 0; +} + +int sd_dhcp6_client_set_ifname(sd_dhcp6_client *client, const char *ifname) { + assert_return(client, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&client->ifname, ifname); +} + +int sd_dhcp6_client_get_ifname(sd_dhcp6_client *client, const char **ret) { + int r; + + assert_return(client, -EINVAL); + + r = get_ifname(client->ifindex, &client->ifname); + if (r < 0) + return r; + + if (ret) + *ret = client->ifname; + + return 0; +} + +int sd_dhcp6_client_set_local_address( + sd_dhcp6_client *client, + const struct in6_addr *local_address) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(local_address, -EINVAL); + assert_return(in6_addr_is_link_local(local_address) > 0, -EINVAL); + + client->local_address = *local_address; + + return 0; +} + +int sd_dhcp6_client_set_mac( + sd_dhcp6_client *client, + const uint8_t *addr, + size_t addr_len, + uint16_t arp_type) { + + assert_return(client, -EINVAL); + assert_return(addr, -EINVAL); + assert_return(addr_len <= sizeof(client->hw_addr.bytes), -EINVAL); + + /* Unlike the other setters, it is OK to set a new MAC address while the client is running, + * as the MAC address is used only when setting DUID or IAID. 
*/ + + if (arp_type == ARPHRD_ETHER) + assert_return(addr_len == ETH_ALEN, -EINVAL); + else if (arp_type == ARPHRD_INFINIBAND) + assert_return(addr_len == INFINIBAND_ALEN, -EINVAL); + else { + client->arp_type = ARPHRD_NONE; + client->hw_addr.length = 0; + return 0; + } + + client->arp_type = arp_type; + hw_addr_set(&client->hw_addr, addr, addr_len); + + return 0; +} + +int sd_dhcp6_client_set_prefix_delegation_hint( + sd_dhcp6_client *client, + uint8_t prefixlen, + const struct in6_addr *pd_prefix) { + + _cleanup_free_ DHCP6Address *prefix = NULL; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + if (!pd_prefix) { + /* clear previous assignments. */ + dhcp6_ia_clear_addresses(&client->ia_pd); + return 0; + } + + assert_return(prefixlen > 0 && prefixlen <= 128, -EINVAL); + + prefix = new(DHCP6Address, 1); + if (!prefix) + return -ENOMEM; + + *prefix = (DHCP6Address) { + .iapdprefix.address = *pd_prefix, + .iapdprefix.prefixlen = prefixlen, + }; + + LIST_PREPEND(addresses, client->ia_pd.addresses, TAKE_PTR(prefix)); + return 1; +} + +int sd_dhcp6_client_add_vendor_option(sd_dhcp6_client *client, sd_dhcp6_option *v) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + if (!v) { + /* Clear the previous assignments. */ + ordered_set_clear(client->vendor_options); + return 0; + } + + r = ordered_set_ensure_put(&client->vendor_options, &dhcp6_option_hash_ops, v); + if (r < 0) + return r; + + sd_dhcp6_option_ref(v); + + return 1; +} + +static int client_ensure_duid(sd_dhcp6_client *client) { + assert(client); + + if (client->duid_len != 0) + return 0; + + return dhcp_identifier_set_duid_en(&client->duid, &client->duid_len); +} + +/** + * Sets DUID. If duid is non-null, the DUID is set to duid_type + duid + * without further modification. Otherwise, if duid_type is supported, DUID + * is set based on that type. Otherwise, an error is returned. 
+ */ +int sd_dhcp6_client_set_duid_llt(sd_dhcp6_client *client, uint64_t llt_time) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + r = dhcp_identifier_set_duid_llt(&client->hw_addr, client->arp_type, llt_time, &client->duid, &client->duid_len); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set DUID-LLT: %m"); + + return 0; +} + +int sd_dhcp6_client_set_duid_ll(sd_dhcp6_client *client) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + r = dhcp_identifier_set_duid_ll(&client->hw_addr, client->arp_type, &client->duid, &client->duid_len); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set DUID-LL: %m"); + + return 0; +} + +int sd_dhcp6_client_set_duid_en(sd_dhcp6_client *client) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + r = dhcp_identifier_set_duid_en(&client->duid, &client->duid_len); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set DUID-EN: %m"); + + return 0; +} + +int sd_dhcp6_client_set_duid_uuid(sd_dhcp6_client *client) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + r = dhcp_identifier_set_duid_uuid(&client->duid, &client->duid_len); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set DUID-UUID: %m"); + + return 0; +} + +int sd_dhcp6_client_set_duid_raw(sd_dhcp6_client *client, uint16_t duid_type, const uint8_t *duid, size_t duid_len) { + int r; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(duid || duid_len == 0, -EINVAL); + + r = dhcp_identifier_set_duid_raw(duid_type, duid, duid_len, &client->duid, &client->duid_len); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set DUID: %m"); + + return 0; +} + +int sd_dhcp6_client_duid_as_string( 
+ sd_dhcp6_client *client, + char **duid) { + _cleanup_free_ char *p = NULL, *s = NULL, *t = NULL; + const char *v; + int r; + + assert_return(client, -EINVAL); + assert_return(client->duid_len > offsetof(struct duid, raw.data), -ENODATA); + assert_return(duid, -EINVAL); + + v = duid_type_to_string(be16toh(client->duid.type)); + if (v) { + s = strdup(v); + if (!s) + return -ENOMEM; + } else { + r = asprintf(&s, "%0x", client->duid.type); + if (r < 0) + return -ENOMEM; + } + + t = hexmem(client->duid.raw.data, client->duid_len - offsetof(struct duid, raw.data)); + if (!t) + return -ENOMEM; + + p = strjoin(s, ":", t); + if (!p) + return -ENOMEM; + + *duid = TAKE_PTR(p); + + return 0; +} + +int sd_dhcp6_client_set_iaid(sd_dhcp6_client *client, uint32_t iaid) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + client->ia_na.header.id = htobe32(iaid); + client->ia_pd.header.id = htobe32(iaid); + client->iaid_set = true; + + return 0; +} + +static int client_ensure_iaid(sd_dhcp6_client *client) { + int r; + uint32_t iaid; + + assert(client); + + if (client->iaid_set) + return 0; + + r = dhcp_identifier_set_iaid(client->dev, &client->hw_addr, + /* legacy_unstable_byteorder = */ true, + &iaid); + if (r < 0) + return r; + + client->ia_na.header.id = iaid; + client->ia_pd.header.id = iaid; + client->iaid_set = true; + + return 0; +} + +int sd_dhcp6_client_get_iaid(sd_dhcp6_client *client, uint32_t *iaid) { + assert_return(client, -EINVAL); + assert_return(iaid, -EINVAL); + + if (!client->iaid_set) + return -ENODATA; + + *iaid = be32toh(client->ia_na.header.id); + + return 0; +} + +int sd_dhcp6_client_set_fqdn( + sd_dhcp6_client *client, + const char *fqdn) { + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + /* Make sure FQDN qualifies as DNS and as Linux hostname */ + if (fqdn && + !(hostname_is_valid(fqdn, 0) && dns_name_is_valid(fqdn) > 0)) + return -EINVAL; + + return 
free_and_strdup(&client->fqdn, fqdn); +} + +int sd_dhcp6_client_set_information_request(sd_dhcp6_client *client, int enabled) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + client->information_request = enabled; + + return 0; +} + +int sd_dhcp6_client_get_information_request(sd_dhcp6_client *client, int *enabled) { + assert_return(client, -EINVAL); + assert_return(enabled, -EINVAL); + + *enabled = client->information_request; + + return 0; +} + +static int be16_compare_func(const be16_t *a, const be16_t *b) { + return CMP(be16toh(*a), be16toh(*b)); +} + +int sd_dhcp6_client_set_request_option(sd_dhcp6_client *client, uint16_t option) { + be16_t opt; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + if (!dhcp6_option_can_request(option)) + return -EINVAL; + + opt = htobe16(option); + if (typesafe_bsearch(&opt, client->req_opts, client->n_req_opts, be16_compare_func)) + return -EEXIST; + + if (!GREEDY_REALLOC(client->req_opts, client->n_req_opts + 1)) + return -ENOMEM; + + client->req_opts[client->n_req_opts++] = opt; + + /* Sort immediately to make the above binary search will work for the next time. 
*/ + typesafe_qsort(client->req_opts, client->n_req_opts, be16_compare_func); + return 0; +} + +int sd_dhcp6_client_set_request_mud_url(sd_dhcp6_client *client, const char *mudurl) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(mudurl, -EINVAL); + assert_return(strlen(mudurl) <= UINT8_MAX, -EINVAL); + assert_return(http_url_is_valid(mudurl), -EINVAL); + + return free_and_strdup(&client->mudurl, mudurl); +} + +int sd_dhcp6_client_set_request_user_class(sd_dhcp6_client *client, char * const *user_class) { + char **s; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(!strv_isempty(user_class), -EINVAL); + + STRV_FOREACH(p, user_class) { + size_t len = strlen(*p); + + if (len > UINT16_MAX || len == 0) + return -EINVAL; + } + + s = strv_copy(user_class); + if (!s) + return -ENOMEM; + + return strv_free_and_replace(client->user_class, s); +} + +int sd_dhcp6_client_set_request_vendor_class(sd_dhcp6_client *client, char * const *vendor_class) { + char **s; + + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(!strv_isempty(vendor_class), -EINVAL); + + STRV_FOREACH(p, vendor_class) { + size_t len = strlen(*p); + + if (len > UINT16_MAX || len == 0) + return -EINVAL; + } + + s = strv_copy(vendor_class); + if (!s) + return -ENOMEM; + + return strv_free_and_replace(client->vendor_class, s); +} + +int sd_dhcp6_client_get_prefix_delegation(sd_dhcp6_client *client, int *delegation) { + assert_return(client, -EINVAL); + assert_return(delegation, -EINVAL); + + *delegation = FLAGS_SET(client->request_ia, DHCP6_REQUEST_IA_PD); + + return 0; +} + +int sd_dhcp6_client_set_prefix_delegation(sd_dhcp6_client *client, int delegation) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + SET_FLAG(client->request_ia, DHCP6_REQUEST_IA_PD, delegation); + + 
return 0; +} + +int sd_dhcp6_client_get_address_request(sd_dhcp6_client *client, int *request) { + assert_return(client, -EINVAL); + assert_return(request, -EINVAL); + + *request = FLAGS_SET(client->request_ia, DHCP6_REQUEST_IA_NA); + + return 0; +} + +int sd_dhcp6_client_set_address_request(sd_dhcp6_client *client, int request) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + SET_FLAG(client->request_ia, DHCP6_REQUEST_IA_NA, request); + + return 0; +} + +int dhcp6_client_set_transaction_id(sd_dhcp6_client *client, uint32_t transaction_id) { + assert(client); + assert_se(network_test_mode_enabled()); + + /* This is for tests or fuzzers. */ + + client->transaction_id = transaction_id & htobe32(0x00ffffff); + + return 0; +} + +int sd_dhcp6_client_set_rapid_commit(sd_dhcp6_client *client, int enable) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + client->rapid_commit = enable; + return 0; +} + +int sd_dhcp6_client_set_send_release(sd_dhcp6_client *client, int enable) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + client->send_release = enable; + return 0; +} + +int sd_dhcp6_client_get_lease(sd_dhcp6_client *client, sd_dhcp6_lease **ret) { + assert_return(client, -EINVAL); + + if (!client->lease) + return -ENOMSG; + + if (ret) + *ret = client->lease; + + return 0; +} + +int sd_dhcp6_client_add_option(sd_dhcp6_client *client, sd_dhcp6_option *v) { + int r; + + assert_return(client, -EINVAL); + assert_return(v, -EINVAL); + + r = ordered_hashmap_ensure_put(&client->extra_options, &dhcp6_option_hash_ops, UINT_TO_PTR(v->option), v); + if (r < 0) + return r; + + sd_dhcp6_option_ref(v); + return 0; +} + +static void client_set_state(sd_dhcp6_client *client, DHCP6State state) { + assert(client); + + if (client->state == state) + return; + + log_dhcp6_client(client, "State changed: %s -> %s", + 
dhcp6_state_to_string(client->state), dhcp6_state_to_string(state)); + + client->state = state; + + if (client->state_callback) + client->state_callback(client, state, client->state_userdata); +} + +int dhcp6_client_get_state(sd_dhcp6_client *client) { + assert_return(client, -EINVAL); + + return client->state; +} + +static void client_notify(sd_dhcp6_client *client, int event) { + assert(client); + + if (client->callback) + client->callback(client, event, client->userdata); +} + +static void client_cleanup(sd_dhcp6_client *client) { + assert(client); + + client->lease = sd_dhcp6_lease_unref(client->lease); + + /* Reset IRT here. Otherwise, we cannot restart the client in the information requesting mode, + * even though the lease is freed below. */ + client->information_request_time_usec = 0; + client->information_refresh_time_usec = 0; + + (void) event_source_disable(client->receive_message); + (void) event_source_disable(client->timeout_resend); + (void) event_source_disable(client->timeout_expire); + (void) event_source_disable(client->timeout_t1); + (void) event_source_disable(client->timeout_t2); + + client_set_state(client, DHCP6_STATE_STOPPED); +} + +static void client_stop(sd_dhcp6_client *client, int error) { + DHCP6_CLIENT_DONT_DESTROY(client); + + assert(client); + + client_notify(client, error); + + client_cleanup(client); +} + +static int client_append_common_options_in_managed_mode( + sd_dhcp6_client *client, + uint8_t **buf, + size_t *offset, + const DHCP6IA *ia_na, + const DHCP6IA *ia_pd) { + + int r; + + assert(client); + assert(IN_SET(client->state, + DHCP6_STATE_SOLICITATION, + DHCP6_STATE_REQUEST, + DHCP6_STATE_RENEW, + DHCP6_STATE_REBIND, + DHCP6_STATE_STOPPING)); + assert(buf); + assert(*buf); + assert(offset); + + if (FLAGS_SET(client->request_ia, DHCP6_REQUEST_IA_NA) && ia_na) { + r = dhcp6_option_append_ia(buf, offset, ia_na); + if (r < 0) + return r; + } + + if (FLAGS_SET(client->request_ia, DHCP6_REQUEST_IA_PD) && ia_pd) { + r = 
dhcp6_option_append_ia(buf, offset, ia_pd); + if (r < 0) + return r; + } + + if (client->state != DHCP6_STATE_STOPPING) { + r = dhcp6_option_append_fqdn(buf, offset, client->fqdn); + if (r < 0) + return r; + } + + r = dhcp6_option_append_user_class(buf, offset, client->user_class); + if (r < 0) + return r; + + r = dhcp6_option_append_vendor_class(buf, offset, client->vendor_class); + if (r < 0) + return r; + + r = dhcp6_option_append_vendor_option(buf, offset, client->vendor_options); + if (r < 0) + return r; + + return 0; +} + +static DHCP6MessageType client_message_type_from_state(sd_dhcp6_client *client) { + assert(client); + + switch (client->state) { + case DHCP6_STATE_INFORMATION_REQUEST: + return DHCP6_MESSAGE_INFORMATION_REQUEST; + case DHCP6_STATE_SOLICITATION: + return DHCP6_MESSAGE_SOLICIT; + case DHCP6_STATE_REQUEST: + return DHCP6_MESSAGE_REQUEST; + case DHCP6_STATE_RENEW: + return DHCP6_MESSAGE_RENEW; + case DHCP6_STATE_REBIND: + return DHCP6_MESSAGE_REBIND; + case DHCP6_STATE_STOPPING: + return DHCP6_MESSAGE_RELEASE; + default: + assert_not_reached(); + } +} + +static int client_append_oro(sd_dhcp6_client *client, uint8_t **buf, size_t *offset) { + _cleanup_free_ be16_t *p = NULL; + be16_t *req_opts; + size_t n; + + assert(client); + assert(buf); + assert(*buf); + assert(offset); + + switch (client->state) { + case DHCP6_STATE_INFORMATION_REQUEST: + n = client->n_req_opts; + p = new(be16_t, n + 2); + if (!p) + return -ENOMEM; + + memcpy_safe(p, client->req_opts, n * sizeof(be16_t)); + p[n++] = htobe16(SD_DHCP6_OPTION_INFORMATION_REFRESH_TIME); /* RFC 8415 section 21.23 */ + p[n++] = htobe16(SD_DHCP6_OPTION_INF_MAX_RT); /* RFC 8415 section 21.25 */ + + typesafe_qsort(p, n, be16_compare_func); + req_opts = p; + break; + + case DHCP6_STATE_SOLICITATION: + n = client->n_req_opts; + p = new(be16_t, n + 1); + if (!p) + return -ENOMEM; + + memcpy_safe(p, client->req_opts, n * sizeof(be16_t)); + p[n++] = htobe16(SD_DHCP6_OPTION_SOL_MAX_RT); /* RFC 8415 
section 21.24 */ + + typesafe_qsort(p, n, be16_compare_func); + req_opts = p; + break; + + case DHCP6_STATE_STOPPING: + return 0; + + default: + n = client->n_req_opts; + req_opts = client->req_opts; + } + + if (n == 0) + return 0; + + return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_ORO, n * sizeof(be16_t), req_opts); +} + +static int client_append_mudurl(sd_dhcp6_client *client, uint8_t **buf, size_t *offset) { + assert(client); + assert(buf); + assert(*buf); + assert(offset); + + if (!client->mudurl) + return 0; + + if (client->state == DHCP6_STATE_STOPPING) + return 0; + + return dhcp6_option_append(buf, offset, SD_DHCP6_OPTION_MUD_URL_V6, + strlen(client->mudurl), client->mudurl); +} + +int dhcp6_client_send_message(sd_dhcp6_client *client) { + _cleanup_free_ uint8_t *buf = NULL; + struct in6_addr all_servers = + IN6ADDR_ALL_DHCP6_RELAY_AGENTS_AND_SERVERS_INIT; + struct sd_dhcp6_option *j; + usec_t elapsed_usec, time_now; + be16_t elapsed_time; + DHCP6Message *message; + size_t offset; + int r; + + assert(client); + assert(client->event); + + r = sd_event_now(client->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + return r; + + if (!GREEDY_REALLOC0(buf, offsetof(DHCP6Message, options))) + return -ENOMEM; + + message = (DHCP6Message*) buf; + message->transaction_id = client->transaction_id; + message->type = client_message_type_from_state(client); + offset = offsetof(DHCP6Message, options); + + switch (client->state) { + case DHCP6_STATE_INFORMATION_REQUEST: + break; + + case DHCP6_STATE_SOLICITATION: + if (client->rapid_commit) { + r = dhcp6_option_append(&buf, &offset, SD_DHCP6_OPTION_RAPID_COMMIT, 0, NULL); + if (r < 0) + return r; + } + + r = client_append_common_options_in_managed_mode(client, &buf, &offset, + &client->ia_na, &client->ia_pd); + if (r < 0) + return r; + break; + + case DHCP6_STATE_REQUEST: + case DHCP6_STATE_RENEW: + case DHCP6_STATE_STOPPING: + r = dhcp6_option_append(&buf, &offset, SD_DHCP6_OPTION_SERVERID, + 
client->lease->serverid_len, + client->lease->serverid); + if (r < 0) + return r; + + _fallthrough_; + case DHCP6_STATE_REBIND: + + assert(client->lease); + + r = client_append_common_options_in_managed_mode(client, &buf, &offset, + client->lease->ia_na, client->lease->ia_pd); + if (r < 0) + return r; + break; + + case DHCP6_STATE_BOUND: + case DHCP6_STATE_STOPPED: + default: + assert_not_reached(); + } + + r = client_append_mudurl(client, &buf, &offset); + if (r < 0) + return r; + + r = client_append_oro(client, &buf, &offset); + if (r < 0) + return r; + + assert(client->duid_len > 0); + r = dhcp6_option_append(&buf, &offset, SD_DHCP6_OPTION_CLIENTID, + client->duid_len, &client->duid); + if (r < 0) + return r; + + ORDERED_HASHMAP_FOREACH(j, client->extra_options) { + r = dhcp6_option_append(&buf, &offset, j->option, j->length, j->data); + if (r < 0) + return r; + } + + /* RFC 8415 Section 21.9. + * A client MUST include an Elapsed Time option in messages to indicate how long the client has + * been trying to complete a DHCP message exchange. 
*/ + elapsed_usec = MIN(usec_sub_unsigned(time_now, client->transaction_start) / USEC_PER_MSEC / 10, (usec_t) UINT16_MAX); + elapsed_time = htobe16(elapsed_usec); + r = dhcp6_option_append(&buf, &offset, SD_DHCP6_OPTION_ELAPSED_TIME, sizeof(elapsed_time), &elapsed_time); + if (r < 0) + return r; + + r = dhcp6_network_send_udp_socket(client->fd, &all_servers, buf, offset); + if (r < 0) + return r; + + log_dhcp6_client(client, "Sent %s", + dhcp6_message_type_to_string(client_message_type_from_state(client))); + return 0; +} + +static usec_t client_timeout_compute_random(usec_t val) { + return usec_sub_unsigned(val, random_u64_range(val / 10)); +} + +static int client_timeout_resend(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp6_client *client = ASSERT_PTR(userdata); + usec_t init_retransmit_time, max_retransmit_time; + int r; + + assert(client->event); + + switch (client->state) { + case DHCP6_STATE_INFORMATION_REQUEST: + init_retransmit_time = DHCP6_INF_TIMEOUT; + max_retransmit_time = DHCP6_INF_MAX_RT; + break; + + case DHCP6_STATE_SOLICITATION: + + if (client->retransmit_count > 0 && client->lease) { + (void) client_start_transaction(client, DHCP6_STATE_REQUEST); + return 0; + } + + init_retransmit_time = DHCP6_SOL_TIMEOUT; + max_retransmit_time = DHCP6_SOL_MAX_RT; + break; + + case DHCP6_STATE_REQUEST: + + if (client->retransmit_count >= DHCP6_REQ_MAX_RC) { + client_stop(client, SD_DHCP6_CLIENT_EVENT_RETRANS_MAX); + return 0; + } + + init_retransmit_time = DHCP6_REQ_TIMEOUT; + max_retransmit_time = DHCP6_REQ_MAX_RT; + break; + + case DHCP6_STATE_RENEW: + init_retransmit_time = DHCP6_REN_TIMEOUT; + max_retransmit_time = DHCP6_REN_MAX_RT; + + /* RFC 3315, section 18.1.3. says max retransmit duration will + be the remaining time until T2. 
Instead of setting MRD, + wait for T2 to trigger with the same end result */ + break; + + case DHCP6_STATE_REBIND: + init_retransmit_time = DHCP6_REB_TIMEOUT; + max_retransmit_time = DHCP6_REB_MAX_RT; + + /* Also, instead of setting MRD, the expire timer is already set in client_enter_bound_state(). */ + break; + + case DHCP6_STATE_STOPPED: + case DHCP6_STATE_STOPPING: + case DHCP6_STATE_BOUND: + default: + assert_not_reached(); + } + + r = dhcp6_client_send_message(client); + if (r >= 0) + client->retransmit_count++; + + if (client->retransmit_time == 0) { + client->retransmit_time = client_timeout_compute_random(init_retransmit_time); + + if (client->state == DHCP6_STATE_SOLICITATION) + client->retransmit_time += init_retransmit_time / 10; + + } else if (client->retransmit_time > max_retransmit_time / 2) + client->retransmit_time = client_timeout_compute_random(max_retransmit_time); + else + client->retransmit_time += client_timeout_compute_random(client->retransmit_time); + + log_dhcp6_client(client, "Next retransmission in %s", + FORMAT_TIMESPAN(client->retransmit_time, USEC_PER_SEC)); + + r = event_reset_time_relative(client->event, &client->timeout_resend, + CLOCK_BOOTTIME, + client->retransmit_time, 10 * USEC_PER_MSEC, + client_timeout_resend, client, + client->event_priority, "dhcp6-resend-timer", true); + if (r < 0) + client_stop(client, r); + + return 0; +} + +static int client_start_transaction(sd_dhcp6_client *client, DHCP6State state) { + int r; + + assert(client); + assert(client->event); + + switch (state) { + case DHCP6_STATE_INFORMATION_REQUEST: + case DHCP6_STATE_SOLICITATION: + assert(client->state == DHCP6_STATE_STOPPED); + break; + case DHCP6_STATE_REQUEST: + assert(client->state == DHCP6_STATE_SOLICITATION); + break; + case DHCP6_STATE_RENEW: + assert(client->state == DHCP6_STATE_BOUND); + break; + case DHCP6_STATE_REBIND: + assert(IN_SET(client->state, DHCP6_STATE_BOUND, DHCP6_STATE_RENEW)); + break; + case DHCP6_STATE_STOPPED: + case 
DHCP6_STATE_STOPPING: + case DHCP6_STATE_BOUND: + default: + assert_not_reached(); + } + + client_set_state(client, state); + + client->retransmit_time = 0; + client->retransmit_count = 0; + client->transaction_id = random_u32() & htobe32(0x00ffffff); + + r = sd_event_now(client->event, CLOCK_BOOTTIME, &client->transaction_start); + if (r < 0) + goto error; + + r = event_reset_time(client->event, &client->timeout_resend, + CLOCK_BOOTTIME, + 0, 0, + client_timeout_resend, client, + client->event_priority, "dhcp6-resend-timeout", true); + if (r < 0) + goto error; + + r = sd_event_source_set_enabled(client->receive_message, SD_EVENT_ON); + if (r < 0) + goto error; + + return 0; + +error: + client_stop(client, r); + return r; +} + +static int client_timeout_expire(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp6_client *client = ASSERT_PTR(userdata); + DHCP6_CLIENT_DONT_DESTROY(client); + DHCP6State state; + + (void) event_source_disable(client->timeout_expire); + (void) event_source_disable(client->timeout_t2); + (void) event_source_disable(client->timeout_t1); + + state = client->state; + + client_stop(client, SD_DHCP6_CLIENT_EVENT_RESEND_EXPIRE); + + /* RFC 3315, section 18.1.4., says that "...the client may choose to + use a Solicit message to locate a new DHCP server..." 
*/ + if (state == DHCP6_STATE_REBIND) + (void) client_start_transaction(client, DHCP6_STATE_SOLICITATION); + + return 0; +} + +static int client_timeout_t2(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp6_client *client = ASSERT_PTR(userdata); + + (void) event_source_disable(client->timeout_t2); + (void) event_source_disable(client->timeout_t1); + + log_dhcp6_client(client, "Timeout T2"); + + (void) client_start_transaction(client, DHCP6_STATE_REBIND); + + return 0; +} + +static int client_timeout_t1(sd_event_source *s, uint64_t usec, void *userdata) { + sd_dhcp6_client *client = ASSERT_PTR(userdata); + + (void) event_source_disable(client->timeout_t1); + + log_dhcp6_client(client, "Timeout T1"); + + (void) client_start_transaction(client, DHCP6_STATE_RENEW); + + return 0; +} + +static int client_enter_bound_state(sd_dhcp6_client *client) { + usec_t lifetime_t1, lifetime_t2, lifetime_valid; + int r; + + assert(client); + assert(client->lease); + assert(IN_SET(client->state, + DHCP6_STATE_SOLICITATION, + DHCP6_STATE_REQUEST, + DHCP6_STATE_RENEW, + DHCP6_STATE_REBIND)); + + (void) event_source_disable(client->receive_message); + (void) event_source_disable(client->timeout_resend); + + r = sd_dhcp6_lease_get_t1(client->lease, &lifetime_t1); + if (r < 0) + goto error; + + r = sd_dhcp6_lease_get_t2(client->lease, &lifetime_t2); + if (r < 0) + goto error; + + r = sd_dhcp6_lease_get_valid_lifetime(client->lease, &lifetime_valid); + if (r < 0) + goto error; + + lifetime_t2 = client_timeout_compute_random(lifetime_t2); + lifetime_t1 = client_timeout_compute_random(MIN(lifetime_t1, lifetime_t2)); + + if (lifetime_t1 == USEC_INFINITY) { + log_dhcp6_client(client, "Infinite T1"); + event_source_disable(client->timeout_t1); + } else { + log_dhcp6_client(client, "T1 expires in %s", FORMAT_TIMESPAN(lifetime_t1, USEC_PER_SEC)); + r = event_reset_time_relative(client->event, &client->timeout_t1, + CLOCK_BOOTTIME, + lifetime_t1, 10 * USEC_PER_SEC, + client_timeout_t1, 
client, + client->event_priority, "dhcp6-t1-timeout", true); + if (r < 0) + goto error; + } + + if (lifetime_t2 == USEC_INFINITY) { + log_dhcp6_client(client, "Infinite T2"); + event_source_disable(client->timeout_t2); + } else { + log_dhcp6_client(client, "T2 expires in %s", FORMAT_TIMESPAN(lifetime_t2, USEC_PER_SEC)); + r = event_reset_time_relative(client->event, &client->timeout_t2, + CLOCK_BOOTTIME, + lifetime_t2, 10 * USEC_PER_SEC, + client_timeout_t2, client, + client->event_priority, "dhcp6-t2-timeout", true); + if (r < 0) + goto error; + } + + if (lifetime_valid == USEC_INFINITY) { + log_dhcp6_client(client, "Infinite valid lifetime"); + event_source_disable(client->timeout_expire); + } else { + log_dhcp6_client(client, "Valid lifetime expires in %s", FORMAT_TIMESPAN(lifetime_valid, USEC_PER_SEC)); + + r = event_reset_time_relative(client->event, &client->timeout_expire, + CLOCK_BOOTTIME, + lifetime_valid, USEC_PER_SEC, + client_timeout_expire, client, + client->event_priority, "dhcp6-lease-expire", true); + if (r < 0) + goto error; + } + + client_set_state(client, DHCP6_STATE_BOUND); + client_notify(client, SD_DHCP6_CLIENT_EVENT_IP_ACQUIRE); + return 0; + +error: + client_stop(client, r); + return r; +} + +static int log_invalid_message_type(sd_dhcp6_client *client, const DHCP6Message *message) { + const char *type_str; + + assert(client); + assert(message); + + type_str = dhcp6_message_type_to_string(message->type); + if (type_str) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received unexpected %s message, ignoring.", type_str); + else + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received unsupported message type %u, ignoring.", message->type); +} + +static int client_process_information( + sd_dhcp6_client *client, + DHCP6Message *message, + size_t len, + const triple_timestamp *timestamp, + const struct in6_addr *server_address) { + + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease = NULL; + int r; + 
+ assert(client); + assert(message); + + if (message->type != DHCP6_MESSAGE_REPLY) + return log_invalid_message_type(client, message); + + r = dhcp6_lease_new_from_message(client, message, len, timestamp, server_address, &lease); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to process received reply message, ignoring: %m"); + + log_dhcp6_client(client, "Processed %s message", dhcp6_message_type_to_string(message->type)); + + sd_dhcp6_lease_unref(client->lease); + client->lease = TAKE_PTR(lease); + + /* Do not call client_stop() here, as it frees the acquired lease. */ + (void) event_source_disable(client->receive_message); + (void) event_source_disable(client->timeout_resend); + client_set_state(client, DHCP6_STATE_STOPPED); + + client_notify(client, SD_DHCP6_CLIENT_EVENT_INFORMATION_REQUEST); + return 0; +} + +static int client_process_reply( + sd_dhcp6_client *client, + DHCP6Message *message, + size_t len, + const triple_timestamp *timestamp, + const struct in6_addr *server_address) { + + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease = NULL; + int r; + + assert(client); + assert(message); + + if (message->type != DHCP6_MESSAGE_REPLY) + return log_invalid_message_type(client, message); + + r = dhcp6_lease_new_from_message(client, message, len, timestamp, server_address, &lease); + if (r == -EADDRNOTAVAIL) { + + /* If NoBinding status code is received, we cannot request the address anymore. + * Let's restart transaction from the beginning. */ + + if (client->state == DHCP6_STATE_REQUEST) + /* The lease is not acquired yet, hence it is not necessary to notify the restart. */ + client_cleanup(client); + else + /* We need to notify the previous lease was expired. 
*/ + client_stop(client, SD_DHCP6_CLIENT_EVENT_RESEND_EXPIRE); + + return client_start_transaction(client, DHCP6_STATE_SOLICITATION); + } + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to process received reply message, ignoring: %m"); + + log_dhcp6_client(client, "Processed %s message", dhcp6_message_type_to_string(message->type)); + + sd_dhcp6_lease_unref(client->lease); + client->lease = TAKE_PTR(lease); + + return client_enter_bound_state(client); +} + +static int client_process_advertise_or_rapid_commit_reply( + sd_dhcp6_client *client, + DHCP6Message *message, + size_t len, + const triple_timestamp *timestamp, + const struct in6_addr *server_address) { + + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease = NULL; + uint8_t pref_advertise, pref_lease = 0; + int r; + + assert(client); + assert(message); + + if (!IN_SET(message->type, DHCP6_MESSAGE_ADVERTISE, DHCP6_MESSAGE_REPLY)) + return log_invalid_message_type(client, message); + + r = dhcp6_lease_new_from_message(client, message, len, timestamp, server_address, &lease); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to process received %s message, ignoring: %m", + dhcp6_message_type_to_string(message->type)); + + if (message->type == DHCP6_MESSAGE_REPLY) { + bool rapid_commit; + + if (!client->rapid_commit) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received unexpected reply message, even we sent a solicit message without the rapid commit option, ignoring."); + + r = dhcp6_lease_get_rapid_commit(lease, &rapid_commit); + if (r < 0) + return r; + + if (!rapid_commit) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received reply message without rapid commit flag, ignoring."); + + log_dhcp6_client(client, "Processed %s message", dhcp6_message_type_to_string(message->type)); + + sd_dhcp6_lease_unref(client->lease); + client->lease = TAKE_PTR(lease); + + return client_enter_bound_state(client); + } + + r = 
dhcp6_lease_get_preference(lease, &pref_advertise); + if (r < 0) + return r; + + if (client->lease) { + r = dhcp6_lease_get_preference(client->lease, &pref_lease); + if (r < 0) + return r; + } + + log_dhcp6_client(client, "Processed %s message", dhcp6_message_type_to_string(message->type)); + + if (!client->lease || pref_advertise > pref_lease) { + /* If this is the first advertise message or has higher preference, then save the lease. */ + sd_dhcp6_lease_unref(client->lease); + client->lease = TAKE_PTR(lease); + } + + if (pref_advertise == 255 || client->retransmit_count > 1) + (void) client_start_transaction(client, DHCP6_STATE_REQUEST); + + return 0; +} + +static int client_receive_message( + sd_event_source *s, + int fd, uint32_t + revents, + void *userdata) { + + sd_dhcp6_client *client = ASSERT_PTR(userdata); + DHCP6_CLIENT_DONT_DESTROY(client); + /* This needs to be initialized with zero. See #20741. */ + CMSG_BUFFER_TYPE(CMSG_SPACE_TIMEVAL) control = {}; + struct iovec iov; + union sockaddr_union sa = {}; + struct msghdr msg = { + .msg_name = &sa.sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + triple_timestamp t; + _cleanup_free_ DHCP6Message *message = NULL; + struct in6_addr *server_address = NULL; + ssize_t buflen, len; + + buflen = next_datagram_size_fd(fd); + if (ERRNO_IS_NEG_TRANSIENT(buflen) || ERRNO_IS_NEG_DISCONNECT(buflen)) + return 0; + if (buflen < 0) { + log_dhcp6_client_errno(client, buflen, "Failed to determine datagram size to read, ignoring: %m"); + return 0; + } + + message = malloc(buflen); + if (!message) + return -ENOMEM; + + iov = IOVEC_MAKE(message, buflen); + + len = recvmsg_safe(fd, &msg, MSG_DONTWAIT); + if (ERRNO_IS_NEG_TRANSIENT(len) || ERRNO_IS_NEG_DISCONNECT(len)) + return 0; + if (len < 0) { + log_dhcp6_client_errno(client, len, "Could not receive message from UDP socket, ignoring: %m"); + return 0; + } + if ((size_t) len < 
sizeof(DHCP6Message)) { + log_dhcp6_client(client, "Too small to be DHCP6 message: ignoring"); + return 0; + } + + /* msg_namelen == 0 happens when running the test-suite over a socketpair */ + if (msg.msg_namelen > 0) { + if (msg.msg_namelen != sizeof(struct sockaddr_in6) || sa.in6.sin6_family != AF_INET6) { + log_dhcp6_client(client, "Received message from invalid source, ignoring."); + return 0; + } + + server_address = &sa.in6.sin6_addr; + } + + triple_timestamp_from_cmsg(&t, &msg); + + if (client->transaction_id != (message->transaction_id & htobe32(0x00ffffff))) + return 0; + + switch (client->state) { + case DHCP6_STATE_INFORMATION_REQUEST: + if (client_process_information(client, message, len, &t, server_address) < 0) + return 0; + break; + + case DHCP6_STATE_SOLICITATION: + if (client_process_advertise_or_rapid_commit_reply(client, message, len, &t, server_address) < 0) + return 0; + break; + + case DHCP6_STATE_REQUEST: + case DHCP6_STATE_RENEW: + case DHCP6_STATE_REBIND: + if (client_process_reply(client, message, len, &t, server_address) < 0) + return 0; + break; + + case DHCP6_STATE_BOUND: + case DHCP6_STATE_STOPPED: + case DHCP6_STATE_STOPPING: + default: + assert_not_reached(); + } + + return 0; +} + +static int client_send_release(sd_dhcp6_client *client) { + sd_dhcp6_lease *lease; + + assert(client); + + if (!client->send_release) + return 0; + + if (sd_dhcp6_client_get_lease(client, &lease) < 0) + return 0; + + if (!lease->ia_na && !lease->ia_pd) + return 0; + + client_set_state(client, DHCP6_STATE_STOPPING); + return dhcp6_client_send_message(client); +} + +int sd_dhcp6_client_stop(sd_dhcp6_client *client) { + int r; + + if (!client) + return 0; + + /* Intentionally ignoring failure to send DHCP6 release. The DHCPv6 client + * engine is about to release its UDP socket unconditionally. 
*/ + r = client_send_release(client); + if (r < 0) + log_dhcp6_client_errno(client, r, + "Failed to send DHCP6 release message, ignoring: %m"); + + client_stop(client, SD_DHCP6_CLIENT_EVENT_STOP); + + client->receive_message = sd_event_source_unref(client->receive_message); + client->fd = safe_close(client->fd); + + return 0; +} + +int sd_dhcp6_client_is_running(sd_dhcp6_client *client) { + assert_return(client, -EINVAL); + + return client->state != DHCP6_STATE_STOPPED; +} + +int sd_dhcp6_client_start(sd_dhcp6_client *client) { + DHCP6State state = DHCP6_STATE_SOLICITATION; + int r; + + assert_return(client, -EINVAL); + assert_return(client->event, -EINVAL); + assert_return(client->ifindex > 0, -EINVAL); + assert_return(in6_addr_is_link_local(&client->local_address) > 0, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + assert_return(client->information_request || client->request_ia != 0, -EINVAL); + + /* Even if the client is in the STOPPED state, the lease acquired in the previous information + * request may be stored. 
*/ + client->lease = sd_dhcp6_lease_unref(client->lease); + + r = client_ensure_iaid(client); + if (r < 0) + return r; + + r = client_ensure_duid(client); + if (r < 0) + return r; + + if (client->fd < 0) { + r = dhcp6_network_bind_udp_socket(client->ifindex, &client->local_address); + if (r < 0) + return log_dhcp6_client_errno(client, r, + "Failed to bind to UDP socket at address %s: %m", + IN6_ADDR_TO_STRING(&client->local_address)); + + client->fd = r; + } + + if (!client->receive_message) { + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_io(client->event, &s, client->fd, EPOLLIN, client_receive_message, client); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s, client->event_priority); + if (r < 0) + return r; + + r = sd_event_source_set_description(s, "dhcp6-receive-message"); + if (r < 0) + return r; + + client->receive_message = TAKE_PTR(s); + } + + if (client->information_request) { + usec_t t = now(CLOCK_MONOTONIC); + + if (t < usec_add(client->information_request_time_usec, client->information_refresh_time_usec)) + return 0; + + client->information_request_time_usec = t; + state = DHCP6_STATE_INFORMATION_REQUEST; + } + + log_dhcp6_client(client, "Starting in %s mode", + client->information_request ? 
"Information request" : "Solicit"); + + return client_start_transaction(client, state); +} + +int sd_dhcp6_client_attach_event(sd_dhcp6_client *client, sd_event *event, int64_t priority) { + int r; + + assert_return(client, -EINVAL); + assert_return(!client->event, -EBUSY); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + if (event) + client->event = sd_event_ref(event); + else { + r = sd_event_default(&client->event); + if (r < 0) + return r; /* no event loop could be acquired: propagate the error instead of reporting success */ + } + + client->event_priority = priority; + + return 0; +} + +int sd_dhcp6_client_detach_event(sd_dhcp6_client *client) { + assert_return(client, -EINVAL); + assert_return(!sd_dhcp6_client_is_running(client), -EBUSY); + + client->event = sd_event_unref(client->event); + + return 0; +} + +sd_event *sd_dhcp6_client_get_event(sd_dhcp6_client *client) { + assert_return(client, NULL); + + return client->event; +} + +int sd_dhcp6_client_attach_device(sd_dhcp6_client *client, sd_device *dev) { + assert_return(client, -EINVAL); + + return device_unref_and_replace(client->dev, dev); +} + +static sd_dhcp6_client *dhcp6_client_free(sd_dhcp6_client *client) { + if (!client) + return NULL; + + sd_dhcp6_lease_unref(client->lease); + + sd_event_source_disable_unref(client->receive_message); + sd_event_source_disable_unref(client->timeout_resend); + sd_event_source_disable_unref(client->timeout_expire); + sd_event_source_disable_unref(client->timeout_t1); + sd_event_source_disable_unref(client->timeout_t2); + sd_event_unref(client->event); + + client->fd = safe_close(client->fd); + + sd_device_unref(client->dev); + + free(client->req_opts); + free(client->fqdn); + free(client->mudurl); + dhcp6_ia_clear_addresses(&client->ia_pd); + ordered_hashmap_free(client->extra_options); + ordered_set_free(client->vendor_options); + strv_free(client->user_class); + strv_free(client->vendor_class); + free(client->ifname); + + return mfree(client); +} + 
+DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_dhcp6_client, sd_dhcp6_client, dhcp6_client_free); + 
+int sd_dhcp6_client_new(sd_dhcp6_client **ret) { + _cleanup_(sd_dhcp6_client_unrefp) sd_dhcp6_client *client = NULL; + + assert_return(ret, -EINVAL); + + client = new(sd_dhcp6_client, 1); + if (!client) + return -ENOMEM; + + *client = (sd_dhcp6_client) { + .n_ref = 1, + .ia_na.type = SD_DHCP6_OPTION_IA_NA, + .ia_pd.type = SD_DHCP6_OPTION_IA_PD, + .ifindex = -1, + .request_ia = DHCP6_REQUEST_IA_NA | DHCP6_REQUEST_IA_PD, + .fd = -EBADF, + .rapid_commit = true, + }; + + *ret = TAKE_PTR(client); + + return 0; +} diff --git a/src/libsystemd-network/sd-dhcp6-lease.c b/src/libsystemd-network/sd-dhcp6-lease.c new file mode 100644 index 0000000..674248b --- /dev/null +++ b/src/libsystemd-network/sd-dhcp6-lease.c @@ -0,0 +1,964 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014-2015 Intel Corporation. All rights reserved. +***/ + +#include + +#include "alloc-util.h" +#include "dhcp6-internal.h" +#include "dhcp6-lease-internal.h" +#include "network-common.h" +#include "strv.h" + +#define IRT_DEFAULT (1 * USEC_PER_DAY) +#define IRT_MINIMUM (600 * USEC_PER_SEC) + +static void dhcp6_lease_set_timestamp(sd_dhcp6_lease *lease, const triple_timestamp *timestamp) { + assert(lease); + + if (timestamp && triple_timestamp_is_set(timestamp)) + lease->timestamp = *timestamp; + else + triple_timestamp_now(&lease->timestamp); +} + +int sd_dhcp6_lease_get_timestamp(sd_dhcp6_lease *lease, clockid_t clock, uint64_t *ret) { + assert_return(lease, -EINVAL); + assert_return(TRIPLE_TIMESTAMP_HAS_CLOCK(clock), -EOPNOTSUPP); + assert_return(clock_supported(clock), -EOPNOTSUPP); + assert_return(ret, -EINVAL); + + if (!triple_timestamp_is_set(&lease->timestamp)) + return -ENODATA; + + *ret = triple_timestamp_by_clock(&lease->timestamp, clock); + return 0; +} + +static void dhcp6_lease_set_lifetime(sd_dhcp6_lease *lease) { + usec_t t1 = USEC_INFINITY, t2 = USEC_INFINITY, min_valid_lt = USEC_INFINITY; + + assert(lease); + assert(lease->ia_na || lease->ia_pd); + + if 
(lease->ia_na) { + t1 = MIN(t1, be32_sec_to_usec(lease->ia_na->header.lifetime_t1, /* max_as_infinity = */ true)); + t2 = MIN(t2, be32_sec_to_usec(lease->ia_na->header.lifetime_t2, /* max_as_infinity = */ true)); + + LIST_FOREACH(addresses, a, lease->ia_na->addresses) + min_valid_lt = MIN(min_valid_lt, be32_sec_to_usec(a->iaaddr.lifetime_valid, /* max_as_infinity = */ true)); + } + + if (lease->ia_pd) { + t1 = MIN(t1, be32_sec_to_usec(lease->ia_pd->header.lifetime_t1, /* max_as_infinity = */ true)); + t2 = MIN(t2, be32_sec_to_usec(lease->ia_pd->header.lifetime_t2, /* max_as_infinity = */ true)); + + LIST_FOREACH(addresses, a, lease->ia_pd->addresses) + min_valid_lt = MIN(min_valid_lt, be32_sec_to_usec(a->iapdprefix.lifetime_valid, /* max_as_infinity = */ true)); + } + + if (t2 == 0 || t2 > min_valid_lt) { + /* If T2 is zero or longer than the minimum valid lifetime of the addresses or prefixes, + * then adjust lifetime with it. */ + t1 = min_valid_lt / 2; + t2 = min_valid_lt / 10 * 8; + } + + lease->lifetime_valid = min_valid_lt; + lease->lifetime_t1 = t1; + lease->lifetime_t2 = t2; +} + +#define DEFINE_GET_TIME_FUNCTIONS(name, val) \ + int sd_dhcp6_lease_get_##name( \ + sd_dhcp6_lease *lease, \ + uint64_t *ret) { \ + \ + assert_return(lease, -EINVAL); \ + \ + if (!lease->ia_na && !lease->ia_pd) \ + return -ENODATA; \ + \ + if (ret) \ + *ret = lease->val; \ + return 0; \ + } \ + \ + int sd_dhcp6_lease_get_##name##_timestamp( \ + sd_dhcp6_lease *lease, \ + clockid_t clock, \ + uint64_t *ret) { \ + \ + usec_t s, t; \ + int r; \ + \ + assert_return(lease, -EINVAL); \ + \ + r = sd_dhcp6_lease_get_##name(lease, &s); \ + if (r < 0) \ + return r; \ + \ + r = sd_dhcp6_lease_get_timestamp(lease, clock, &t); \ + if (r < 0) \ + return r; \ + \ + if (ret) \ + *ret = time_span_to_stamp(s, t); \ + return 0; \ + } + +DEFINE_GET_TIME_FUNCTIONS(t1, lifetime_t1); +DEFINE_GET_TIME_FUNCTIONS(t2, lifetime_t2); /* reads the T2 field filled in by dhcp6_lease_set_lifetime() */ 
+DEFINE_GET_TIME_FUNCTIONS(valid_lifetime, lifetime_valid); + +static void 
dhcp6_lease_set_server_address(sd_dhcp6_lease *lease, const struct in6_addr *server_address) { + assert(lease); + + if (server_address) + lease->server_address = *server_address; + else + lease->server_address = (struct in6_addr) {}; +} + +int sd_dhcp6_lease_get_server_address(sd_dhcp6_lease *lease, struct in6_addr *ret) { + assert_return(lease, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = lease->server_address; + return 0; +} + +void dhcp6_ia_clear_addresses(DHCP6IA *ia) { + assert(ia); + + LIST_FOREACH(addresses, a, ia->addresses) + free(a); + + ia->addresses = NULL; +} + +DHCP6IA *dhcp6_ia_free(DHCP6IA *ia) { + if (!ia) + return NULL; + + dhcp6_ia_clear_addresses(ia); + + return mfree(ia); +} + +int dhcp6_lease_set_clientid(sd_dhcp6_lease *lease, const uint8_t *id, size_t len) { + uint8_t *clientid = NULL; + + assert(lease); + assert(id || len == 0); + + if (len > 0) { + clientid = memdup(id, len); + if (!clientid) + return -ENOMEM; + } + + free_and_replace(lease->clientid, clientid); + lease->clientid_len = len; + + return 0; +} + +int dhcp6_lease_get_clientid(sd_dhcp6_lease *lease, uint8_t **ret_id, size_t *ret_len) { + assert(lease); + + if (!lease->clientid) + return -ENODATA; + + if (ret_id) + *ret_id = lease->clientid; + if (ret_len) + *ret_len = lease->clientid_len; + + return 0; +} + +int dhcp6_lease_set_serverid(sd_dhcp6_lease *lease, const uint8_t *id, size_t len) { + uint8_t *serverid = NULL; + + assert(lease); + assert(id || len == 0); + + if (len > 0) { + serverid = memdup(id, len); + if (!serverid) + return -ENOMEM; + } + + free_and_replace(lease->serverid, serverid); + lease->serverid_len = len; + + return 0; +} + +int dhcp6_lease_get_serverid(sd_dhcp6_lease *lease, uint8_t **ret_id, size_t *ret_len) { + assert(lease); + + if (!lease->serverid) + return -ENODATA; + + if (ret_id) + *ret_id = lease->serverid; + if (ret_len) + *ret_len = lease->serverid_len; + return 0; +} + +int dhcp6_lease_set_preference(sd_dhcp6_lease *lease, uint8_t 
preference) { + assert(lease); + + lease->preference = preference; + return 0; +} + +int dhcp6_lease_get_preference(sd_dhcp6_lease *lease, uint8_t *ret) { + assert(lease); + assert(ret); + + *ret = lease->preference; + return 0; +} + +int dhcp6_lease_set_rapid_commit(sd_dhcp6_lease *lease) { + assert(lease); + + lease->rapid_commit = true; + return 0; +} + +int dhcp6_lease_get_rapid_commit(sd_dhcp6_lease *lease, bool *ret) { + assert(lease); + assert(ret); + + *ret = lease->rapid_commit; + return 0; +} + +int sd_dhcp6_lease_get_address(sd_dhcp6_lease *lease, struct in6_addr *ret) { + assert_return(lease, -EINVAL); + + if (!lease->addr_iter) + return -ENODATA; + + if (ret) + *ret = lease->addr_iter->iaaddr.address; + return 0; +} + +int sd_dhcp6_lease_get_address_lifetime( + sd_dhcp6_lease *lease, + usec_t *ret_lifetime_preferred, + usec_t *ret_lifetime_valid) { + + const struct iaaddr *a; + + assert_return(lease, -EINVAL); + + if (!lease->addr_iter) + return -ENODATA; + + a = &lease->addr_iter->iaaddr; + + if (ret_lifetime_preferred) + *ret_lifetime_preferred = be32_sec_to_usec(a->lifetime_preferred, /* max_as_infinity = */ true); + if (ret_lifetime_valid) + *ret_lifetime_valid = be32_sec_to_usec(a->lifetime_valid, /* max_as_infinity = */ true); + return 0; +} + +int sd_dhcp6_lease_address_iterator_reset(sd_dhcp6_lease *lease) { + if (!lease) + return false; + + lease->addr_iter = lease->ia_na ? 
lease->ia_na->addresses : NULL; + return !!lease->addr_iter; +} + +int sd_dhcp6_lease_address_iterator_next(sd_dhcp6_lease *lease) { + if (!lease || !lease->addr_iter) + return false; + + lease->addr_iter = lease->addr_iter->addresses_next; + return !!lease->addr_iter; +} + +int sd_dhcp6_lease_has_address(sd_dhcp6_lease *lease) { + return lease && lease->ia_na; +} + +int sd_dhcp6_lease_get_pd_prefix( + sd_dhcp6_lease *lease, + struct in6_addr *ret_prefix, + uint8_t *ret_prefix_len) { + + const struct iapdprefix *a; + + assert_return(lease, -EINVAL); + + if (!lease->prefix_iter) + return -ENODATA; + + a = &lease->prefix_iter->iapdprefix; + + if (ret_prefix) + *ret_prefix = a->address; + if (ret_prefix_len) + *ret_prefix_len = a->prefixlen; + return 0; +} + +int sd_dhcp6_lease_get_pd_lifetime( + sd_dhcp6_lease *lease, + uint64_t *ret_lifetime_preferred, + uint64_t *ret_lifetime_valid) { + + const struct iapdprefix *a; + + assert_return(lease, -EINVAL); + + if (!lease->prefix_iter) + return -ENODATA; + + a = &lease->prefix_iter->iapdprefix; + + if (ret_lifetime_preferred) + *ret_lifetime_preferred = be32_sec_to_usec(a->lifetime_preferred, /* max_as_infinity = */ true); + if (ret_lifetime_valid) + *ret_lifetime_valid = be32_sec_to_usec(a->lifetime_valid, /* max_as_infinity = */ true); + return 0; +} + +int sd_dhcp6_lease_pd_iterator_reset(sd_dhcp6_lease *lease) { + if (!lease) + return false; + + lease->prefix_iter = lease->ia_pd ? 
lease->ia_pd->addresses : NULL; + return !!lease->prefix_iter; +} + +int sd_dhcp6_lease_pd_iterator_next(sd_dhcp6_lease *lease) { + if (!lease || !lease->prefix_iter) + return false; + + lease->prefix_iter = lease->prefix_iter->addresses_next; + return !!lease->prefix_iter; +} + +#define DEFINE_GET_TIMESTAMP2(name) \ + int sd_dhcp6_lease_get_##name##_lifetime_timestamp( \ + sd_dhcp6_lease *lease, \ + clockid_t clock, \ + uint64_t *ret_lifetime_preferred, \ + uint64_t *ret_lifetime_valid) { \ + \ + usec_t t, p, v; \ + int r; \ + \ + assert_return(lease, -EINVAL); \ + \ + r = sd_dhcp6_lease_get_##name##_lifetime( \ + lease, \ + ret_lifetime_preferred ? &p : NULL, \ + ret_lifetime_valid ? &v : NULL); \ + if (r < 0) \ + return r; \ + \ + r = sd_dhcp6_lease_get_timestamp(lease, clock, &t); \ + if (r < 0) \ + return r; \ + \ + if (ret_lifetime_preferred) \ + *ret_lifetime_preferred = time_span_to_stamp(p, t); \ + if (ret_lifetime_valid) \ + *ret_lifetime_valid = time_span_to_stamp(v, t); \ + return 0; \ + } + +DEFINE_GET_TIMESTAMP2(address); +DEFINE_GET_TIMESTAMP2(pd); + +int sd_dhcp6_lease_has_pd_prefix(sd_dhcp6_lease *lease) { + return lease && lease->ia_pd; +} + +int dhcp6_lease_add_dns(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + assert(lease); + assert(optval || optlen == 0); + + if (optlen == 0) + return 0; + + return dhcp6_option_parse_addresses(optval, optlen, &lease->dns, &lease->dns_count); +} + +int sd_dhcp6_lease_get_dns(sd_dhcp6_lease *lease, const struct in6_addr **ret) { + assert_return(lease, -EINVAL); + + if (!lease->dns) + return -ENODATA; + + if (ret) + *ret = lease->dns; + + return lease->dns_count; +} + +int dhcp6_lease_add_domains(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + _cleanup_strv_free_ char **domains = NULL; + int r; + + assert(lease); + assert(optval || optlen == 0); + + if (optlen == 0) + return 0; + + r = dhcp6_option_parse_domainname_list(optval, optlen, &domains); + if (r < 0) + return r; + + 
return strv_extend_strv(&lease->domains, domains, true); +} + +int sd_dhcp6_lease_get_domains(sd_dhcp6_lease *lease, char ***ret) { + assert_return(lease, -EINVAL); + assert_return(ret, -EINVAL); + + if (!lease->domains) + return -ENODATA; + + *ret = lease->domains; + return strv_length(lease->domains); +} + +int dhcp6_lease_add_ntp(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + int r; + + assert(lease); + assert(optval || optlen == 0); + + for (size_t offset = 0; offset < optlen;) { + const uint8_t *subval; + size_t sublen; + uint16_t subopt; + + r = dhcp6_option_parse(optval, optlen, &offset, &subopt, &sublen, &subval); + if (r < 0) + return r; + + switch (subopt) { + case DHCP6_NTP_SUBOPTION_SRV_ADDR: + case DHCP6_NTP_SUBOPTION_MC_ADDR: + if (sublen != 16) + return -EINVAL; + + r = dhcp6_option_parse_addresses(subval, sublen, &lease->ntp, &lease->ntp_count); + if (r < 0) + return r; + + break; + + case DHCP6_NTP_SUBOPTION_SRV_FQDN: { + _cleanup_free_ char *server = NULL; + + r = dhcp6_option_parse_domainname(subval, sublen, &server); + if (r < 0) + return r; + + if (strv_contains(lease->ntp_fqdn, server)) + continue; + + r = strv_consume(&lease->ntp_fqdn, TAKE_PTR(server)); + if (r < 0) + return r; + + break; + }} + } + + return 0; +} + +int dhcp6_lease_add_sntp(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + assert(lease); + assert(optval || optlen == 0); + + if (optlen == 0) + return 0; + + /* SNTP option is defined in RFC4075, and deprecated by RFC5908. */ + return dhcp6_option_parse_addresses(optval, optlen, &lease->sntp, &lease->sntp_count); +} + +int sd_dhcp6_lease_get_ntp_addrs(sd_dhcp6_lease *lease, const struct in6_addr **ret) { + assert_return(lease, -EINVAL); + + if (lease->ntp) { + if (ret) + *ret = lease->ntp; + return lease->ntp_count; + } + + if (lease->sntp && !lease->ntp_fqdn) { + /* Fallback to the deprecated SNTP option. 
*/ + if (ret) + *ret = lease->sntp; + return lease->sntp_count; + } + + return -ENODATA; +} + +int sd_dhcp6_lease_get_ntp_fqdn(sd_dhcp6_lease *lease, char ***ret) { + assert_return(lease, -EINVAL); + + if (!lease->ntp_fqdn) + return -ENODATA; + + if (ret) + *ret = lease->ntp_fqdn; + return strv_length(lease->ntp_fqdn); +} + +int dhcp6_lease_set_fqdn(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + char *fqdn; + int r; + + assert(lease); + assert(optval || optlen == 0); + + if (optlen == 0) + return 0; + + if (optlen < 2) + return -ENODATA; + + /* Ignore the flags field, it doesn't carry any useful + information for clients. */ + r = dhcp6_option_parse_domainname(optval + 1, optlen - 1, &fqdn); + if (r < 0) + return r; + + return free_and_replace(lease->fqdn, fqdn); +} + +int sd_dhcp6_lease_get_fqdn(sd_dhcp6_lease *lease, const char **ret) { + assert_return(lease, -EINVAL); + assert_return(ret, -EINVAL); + + if (!lease->fqdn) + return -ENODATA; + + *ret = lease->fqdn; + return 0; +} + +int dhcp6_lease_set_captive_portal(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + _cleanup_free_ char *uri = NULL; + int r; + + assert(lease); + assert(optval || optlen == 0); + + r = dhcp6_option_parse_string(optval, optlen, &uri); + if (r < 0) + return r; + + if (uri && !in_charset(uri, URI_VALID)) + return -EINVAL; + + return free_and_replace(lease->captive_portal, uri); +} + +int sd_dhcp6_lease_get_captive_portal(sd_dhcp6_lease *lease, const char **ret) { + assert_return(lease, -EINVAL); + assert_return(ret, -EINVAL); + + if (!lease->captive_portal) + return -ENODATA; + + *ret = lease->captive_portal; + return 0; +} + +int sd_dhcp6_lease_get_vendor_options(sd_dhcp6_lease *lease, sd_dhcp6_option ***ret) { + int r; + + assert_return(lease, -EINVAL); + + if (set_isempty(lease->vendor_options)) + return -ENODATA; + + if (ret) { + if (!lease->sorted_vendor_options) { + r = set_dump_sorted(lease->vendor_options, (void***) &lease->sorted_vendor_options, 
NULL); + if (r < 0) + return r; + } + + *ret = lease->sorted_vendor_options; + } + + return set_size(lease->vendor_options); +} + +static int dhcp6_lease_insert_vendor_option( + sd_dhcp6_lease *lease, + uint16_t option_code, + const void *data, + size_t len, + uint32_t enterprise_id) { + + _cleanup_(sd_dhcp6_option_unrefp) sd_dhcp6_option *option = NULL; + + assert(lease); + + option = new(sd_dhcp6_option, 1); + if (!option) + return -ENOMEM; + + *option = (sd_dhcp6_option) { + .n_ref = 1, + .enterprise_identifier = enterprise_id, + .option = option_code, + .length = len, + }; + option->data = memdup_suffix0(data, len); + if (!option->data) + return -ENOMEM; + + return set_ensure_consume(&lease->vendor_options, &dhcp6_option_hash_ops, TAKE_PTR(option)); +} + +static int dhcp6_lease_add_vendor_option(sd_dhcp6_lease *lease, const uint8_t *optval, size_t optlen) { + int r; + uint32_t enterprise_id; + + assert(lease); + assert(optval || optlen == 0); + + if (optlen < sizeof(be32_t)) + return -EBADMSG; + + enterprise_id = unaligned_read_be32(optval); + + for (size_t offset = 4; offset < optlen;) { + const uint8_t *subval; + size_t sublen; + uint16_t subopt; + + r = dhcp6_option_parse(optval, optlen, &offset, &subopt, &sublen, &subval); + if (r < 0) + return r; + + r = dhcp6_lease_insert_vendor_option(lease, subopt, subval, sublen, enterprise_id); + if (r < 0) + return r; + } + return 0; +} + +static int dhcp6_lease_parse_message( + sd_dhcp6_client *client, + sd_dhcp6_lease *lease, + const DHCP6Message *message, + size_t len) { + + usec_t irt = IRT_DEFAULT; + int r; + + assert(client); + assert(lease); + assert(message); + assert(len >= sizeof(DHCP6Message)); + + len -= sizeof(DHCP6Message); + for (size_t offset = 0; offset < len;) { + uint16_t optcode; + size_t optlen; + const uint8_t *optval; + + if (len - offset < offsetof(DHCP6Option, data)) { + log_dhcp6_client(client, "Ignoring %zu invalid byte(s) at the end of the packet", len - offset); + break; + } + + r = 
dhcp6_option_parse(message->options, len, &offset, &optcode, &optlen, &optval); + if (r < 0) + return log_dhcp6_client_errno(client, r, + "Failed to parse option header at offset %zu of total length %zu: %m", + offset, len); + + switch (optcode) { + case SD_DHCP6_OPTION_CLIENTID: + if (dhcp6_lease_get_clientid(lease, NULL, NULL) >= 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), "%s contains multiple client IDs", + dhcp6_message_type_to_string(message->type)); + + r = dhcp6_lease_set_clientid(lease, optval, optlen); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set client ID: %m"); + + break; + + case SD_DHCP6_OPTION_SERVERID: + if (dhcp6_lease_get_serverid(lease, NULL, NULL) >= 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), "%s contains multiple server IDs", + dhcp6_message_type_to_string(message->type)); + + r = dhcp6_lease_set_serverid(lease, optval, optlen); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set server ID: %m"); + + break; + + case SD_DHCP6_OPTION_PREFERENCE: + if (optlen != 1) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), "Received invalid length for preference."); + + r = dhcp6_lease_set_preference(lease, optval[0]); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set preference: %m"); + + break; + + case SD_DHCP6_OPTION_STATUS_CODE: { + _cleanup_free_ char *msg = NULL; + + r = dhcp6_option_parse_status(optval, optlen, &msg); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to parse status code: %m"); + if (r > 0) + return log_dhcp6_client_errno(client, dhcp6_message_status_to_errno(r), + "Received %s message with non-zero status%s%s", + dhcp6_message_type_to_string(message->type), + isempty(msg) ? "." 
: ": ", strempty(msg)); + break; + } + case SD_DHCP6_OPTION_IA_NA: { + _cleanup_(dhcp6_ia_freep) DHCP6IA *ia = NULL; + + if (client->state == DHCP6_STATE_INFORMATION_REQUEST) { + log_dhcp6_client(client, "Ignoring IA NA option in information requesting mode."); + break; + } + + r = dhcp6_option_parse_ia(client, client->ia_na.header.id, optcode, optlen, optval, &ia); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0) { + log_dhcp6_client_errno(client, r, "Failed to parse IA_NA option, ignoring: %m"); + continue; + } + + if (lease->ia_na) { + log_dhcp6_client(client, "Received duplicate matching IA_NA option, ignoring."); + continue; + } + + dhcp6_ia_free(lease->ia_na); + lease->ia_na = TAKE_PTR(ia); + break; + } + case SD_DHCP6_OPTION_IA_PD: { + _cleanup_(dhcp6_ia_freep) DHCP6IA *ia = NULL; + + if (client->state == DHCP6_STATE_INFORMATION_REQUEST) { + log_dhcp6_client(client, "Ignoring IA PD option in information requesting mode."); + break; + } + + r = dhcp6_option_parse_ia(client, client->ia_pd.header.id, optcode, optlen, optval, &ia); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0) { + log_dhcp6_client_errno(client, r, "Failed to parse IA_PD option, ignoring: %m"); + continue; + } + + if (lease->ia_pd) { + log_dhcp6_client(client, "Received duplicate matching IA_PD option, ignoring."); + continue; + } + + dhcp6_ia_free(lease->ia_pd); + lease->ia_pd = TAKE_PTR(ia); + break; + } + case SD_DHCP6_OPTION_RAPID_COMMIT: + if (optlen != 0) + log_dhcp6_client(client, "Received rapid commit option with an invalid length (%zu), ignoring.", optlen); + + r = dhcp6_lease_set_rapid_commit(lease); + if (r < 0) + return log_dhcp6_client_errno(client, r, "Failed to set rapid commit flag: %m"); + + break; + + case SD_DHCP6_OPTION_DNS_SERVER: + r = dhcp6_lease_add_dns(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse DNS server option, ignoring: %m"); + + break; + + case SD_DHCP6_OPTION_DOMAIN: + r = 
dhcp6_lease_add_domains(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse domain list option, ignoring: %m"); + + break; + + case SD_DHCP6_OPTION_NTP_SERVER: + r = dhcp6_lease_add_ntp(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse NTP server option, ignoring: %m"); + + break; + + case SD_DHCP6_OPTION_SNTP_SERVER: + r = dhcp6_lease_add_sntp(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse SNTP server option, ignoring: %m"); + + break; + + case SD_DHCP6_OPTION_CAPTIVE_PORTAL: + r = dhcp6_lease_set_captive_portal(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse captive portal option, ignoring: %m"); + break; + + case SD_DHCP6_OPTION_CLIENT_FQDN: + r = dhcp6_lease_set_fqdn(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse FQDN option, ignoring: %m"); + + break; + + case SD_DHCP6_OPTION_INFORMATION_REFRESH_TIME: + if (optlen != 4) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "Received information refresh time option with an invalid length (%zu).", optlen); + + irt = unaligned_be32_sec_to_usec(optval, /* max_as_infinity = */ false); + break; + + case SD_DHCP6_OPTION_VENDOR_OPTS: + r = dhcp6_lease_add_vendor_option(lease, optval, optlen); + if (r < 0) + log_dhcp6_client_errno(client, r, "Failed to parse vendor option, ignoring: %m"); + + break; + } + } + + uint8_t *clientid; + size_t clientid_len; + if (dhcp6_lease_get_clientid(lease, &clientid, &clientid_len) < 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "%s message does not contain client ID. Ignoring.", + dhcp6_message_type_to_string(message->type)); + + if (memcmp_nn(clientid, clientid_len, &client->duid, client->duid_len) != 0) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "The client ID in %s message does not match. 
Ignoring.", + dhcp6_message_type_to_string(message->type)); + + if (client->state == DHCP6_STATE_INFORMATION_REQUEST) { + client->information_refresh_time_usec = MAX(irt, IRT_MINIMUM); + log_dhcp6_client(client, "New information request will be refused in %s.", + FORMAT_TIMESPAN(client->information_refresh_time_usec, USEC_PER_SEC)); + + } else { + r = dhcp6_lease_get_serverid(lease, NULL, NULL); + if (r < 0) + return log_dhcp6_client_errno(client, r, "%s has no server id", + dhcp6_message_type_to_string(message->type)); + + if (!lease->ia_na && !lease->ia_pd) + return log_dhcp6_client_errno(client, SYNTHETIC_ERRNO(EINVAL), + "No IA_PD prefix or IA_NA address received. Ignoring."); + + dhcp6_lease_set_lifetime(lease); + } + + return 0; +} + +static sd_dhcp6_lease *dhcp6_lease_free(sd_dhcp6_lease *lease) { + if (!lease) + return NULL; + + set_free(lease->vendor_options); + free(lease->sorted_vendor_options); + free(lease->clientid); + free(lease->serverid); + dhcp6_ia_free(lease->ia_na); + dhcp6_ia_free(lease->ia_pd); + free(lease->dns); + free(lease->fqdn); + free(lease->captive_portal); + strv_free(lease->domains); + free(lease->ntp); + strv_free(lease->ntp_fqdn); + free(lease->sntp); + + return mfree(lease); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_dhcp6_lease, sd_dhcp6_lease, dhcp6_lease_free); + +int dhcp6_lease_new(sd_dhcp6_lease **ret) { + sd_dhcp6_lease *lease; + + assert(ret); + + lease = new(sd_dhcp6_lease, 1); + if (!lease) + return -ENOMEM; + + *lease = (sd_dhcp6_lease) { + .n_ref = 1, + }; + + *ret = lease; + return 0; +} + +int dhcp6_lease_new_from_message( + sd_dhcp6_client *client, + const DHCP6Message *message, + size_t len, + const triple_timestamp *timestamp, + const struct in6_addr *server_address, + sd_dhcp6_lease **ret) { + + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease = NULL; + int r; + + assert(client); + assert(message); + assert(len >= sizeof(DHCP6Message)); + assert(ret); + + r = dhcp6_lease_new(&lease); + if (r < 0) + return r; + 
+ dhcp6_lease_set_timestamp(lease, timestamp); + dhcp6_lease_set_server_address(lease, server_address); + + r = dhcp6_lease_parse_message(client, lease, message, len); + if (r < 0) + return r; + + *ret = TAKE_PTR(lease); + return 0; +} diff --git a/src/libsystemd-network/sd-ipv4acd.c b/src/libsystemd-network/sd-ipv4acd.c new file mode 100644 index 0000000..0cc37a6 --- /dev/null +++ b/src/libsystemd-network/sd-ipv4acd.c @@ -0,0 +1,617 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Axis Communications AB. All rights reserved. +***/ + +#include +#include +#include +#include +#include + +#include "sd-ipv4acd.h" + +#include "alloc-util.h" +#include "arp-util.h" +#include "ether-addr-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "in-addr-util.h" +#include "memory-util.h" +#include "network-common.h" +#include "random-util.h" +#include "siphash24.h" +#include "string-table.h" +#include "string-util.h" +#include "time-util.h" + +/* Constants from the RFC */ +#define PROBE_WAIT_USEC (1U * USEC_PER_SEC) +#define PROBE_NUM 3U +#define PROBE_MIN_USEC (1U * USEC_PER_SEC) +#define PROBE_MAX_USEC (2U * USEC_PER_SEC) +#define ANNOUNCE_WAIT_USEC (2U * USEC_PER_SEC) +#define ANNOUNCE_NUM 2U +#define ANNOUNCE_INTERVAL_USEC (2U * USEC_PER_SEC) +#define MAX_CONFLICTS 10U +#define RATE_LIMIT_INTERVAL_USEC (60U * USEC_PER_SEC) +#define DEFEND_INTERVAL_USEC (10U * USEC_PER_SEC) + +typedef enum IPv4ACDState { + IPV4ACD_STATE_INIT, + IPV4ACD_STATE_STARTED, + IPV4ACD_STATE_WAITING_PROBE, + IPV4ACD_STATE_PROBING, + IPV4ACD_STATE_WAITING_ANNOUNCE, + IPV4ACD_STATE_ANNOUNCING, + IPV4ACD_STATE_RUNNING, + _IPV4ACD_STATE_MAX, + _IPV4ACD_STATE_INVALID = -EINVAL, +} IPv4ACDState; + +struct sd_ipv4acd { + unsigned n_ref; + + IPv4ACDState state; + int ifindex; + int fd; + + char *ifname; + unsigned n_iteration; + unsigned n_conflict; + + sd_event_source *receive_message_event_source; + sd_event_source *timer_event_source; + + usec_t defend_window; + 
struct in_addr address; + + /* External */ + struct ether_addr mac_addr; + + sd_event *event; + int event_priority; + sd_ipv4acd_callback_t callback; + void *userdata; + sd_ipv4acd_check_mac_callback_t check_mac_callback; + void *check_mac_userdata; +}; +/* Logging helpers: both forward to the interface-prefixed log macros with the prefix "IPv4ACD: "; log_ipv4acd() passes 0 as the error code. */ +#define log_ipv4acd_errno(acd, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "IPv4ACD: ", \ + sd_ipv4acd, acd, \ + error, fmt, ##__VA_ARGS__) +#define log_ipv4acd(acd, fmt, ...) \ + log_interface_prefix_full_errno_zerook( \ + "IPv4ACD: ", \ + sd_ipv4acd, acd, \ + 0, fmt, ##__VA_ARGS__) +/* Printable name of each state, used by the transition log line in ipv4acd_set_state(). */ +static const char * const ipv4acd_state_table[_IPV4ACD_STATE_MAX] = { + [IPV4ACD_STATE_INIT] = "init", + [IPV4ACD_STATE_STARTED] = "started", + [IPV4ACD_STATE_WAITING_PROBE] = "waiting-probe", + [IPV4ACD_STATE_PROBING] = "probing", + [IPV4ACD_STATE_WAITING_ANNOUNCE] = "waiting-announce", + [IPV4ACD_STATE_ANNOUNCING] = "announcing", + [IPV4ACD_STATE_RUNNING] = "running", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(ipv4acd_state, IPv4ACDState); +/* Move to state st. Re-entering the current state with reset_counter false only increments n_iteration; a real transition, or reset_counter true, stores st and zeroes n_iteration. Real transitions are logged. */ +static void ipv4acd_set_state(sd_ipv4acd *acd, IPv4ACDState st, bool reset_counter) { + assert(acd); + assert(st < _IPV4ACD_STATE_MAX); + + if (st != acd->state) + log_ipv4acd(acd, "%s -> %s", ipv4acd_state_to_string(acd->state), ipv4acd_state_to_string(st)); + + if (st == acd->state && !reset_counter) + acd->n_iteration++; + else { + acd->state = st; + acd->n_iteration = 0; + } +} +/* Bring the object back to INIT: disable the timer source (it is kept for reuse), drop the receive source, close the socket. */ +static void ipv4acd_reset(sd_ipv4acd *acd) { + assert(acd); + + (void) event_source_disable(acd->timer_event_source); + acd->receive_message_event_source = sd_event_source_disable_unref(acd->receive_message_event_source); + + acd->fd = safe_close(acd->fd); + + ipv4acd_set_state(acd, IPV4ACD_STATE_INIT, true); +} +/* Destructor handed to the ref/unref generator below: resets the object, releases the timer source, the event loop reference and the interface name, then frees the object. */ +static sd_ipv4acd *ipv4acd_free(sd_ipv4acd *acd) { + assert(acd); + + ipv4acd_reset(acd); + sd_event_source_unref(acd->timer_event_source); + sd_ipv4acd_detach_event(acd); + free(acd->ifname); + return mfree(acd); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_ipv4acd, sd_ipv4acd, 
ipv4acd_free); +/* Allocate an ACD object with one reference, in the INIT state, with no interface (ifindex -1) and no socket (fd -EBADF). Returns 0 and stores the object in *ret, or -ENOMEM on allocation failure. */ +int sd_ipv4acd_new(sd_ipv4acd **ret) { + _cleanup_(sd_ipv4acd_unrefp) sd_ipv4acd *acd = NULL; + + assert_return(ret, -EINVAL); + + acd = new(sd_ipv4acd, 1); + if (!acd) + return -ENOMEM; + + *acd = (sd_ipv4acd) { + .n_ref = 1, + .state = IPV4ACD_STATE_INIT, + .ifindex = -1, + .fd = -EBADF, + }; + + *ret = TAKE_PTR(acd); + + return 0; +} +/* Deliver an event to the user callback, if one is registered. */ +static void ipv4acd_client_notify(sd_ipv4acd *acd, int event) { + assert(acd); + + if (!acd->callback) + return; + + acd->callback(acd, event, acd->userdata); +} +/* Stop conflict detection. The object is always reset; the "STOPPED" log line and the SD_IPV4ACD_EVENT_STOP notification are only produced when it was not already in INIT. A NULL acd is a no-op. Always returns 0. */ +int sd_ipv4acd_stop(sd_ipv4acd *acd) { + IPv4ACDState old_state; + + if (!acd) + return 0; + + old_state = acd->state; + + ipv4acd_reset(acd); + + if (old_state == IPV4ACD_STATE_INIT) + return 0; + + log_ipv4acd(acd, "STOPPED"); + + ipv4acd_client_notify(acd, SD_IPV4ACD_EVENT_STOP); + + return 0; +} + +static int ipv4acd_on_timeout(sd_event_source *s, uint64_t usec, void *userdata); +/* Arm the timer source for the event loop's current CLOCK_BOOTTIME time plus usec, plus a random extra delay strictly below random_usec when random_usec is non-zero. Returns the result of event_reset_time(). */ +static int ipv4acd_set_next_wakeup(sd_ipv4acd *acd, usec_t usec, usec_t random_usec) { + usec_t next_timeout, time_now; + + assert(acd); + + next_timeout = usec; + + if (random_usec > 0) + next_timeout += (usec_t) random_u64() % random_usec; + + assert_se(sd_event_now(acd->event, CLOCK_BOOTTIME, &time_now) >= 0); + + return event_reset_time(acd->event, &acd->timer_event_source, + CLOCK_BOOTTIME, + time_now + next_timeout, 0, + ipv4acd_on_timeout, acd, + acd->event_priority, "ipv4acd-timer", true); +} +/* Timer handler that drives the state machine: STARTED schedules the first probe (after an extra rate-limit delay once MAX_CONFLICTS conflicts were seen), the probing states send ARP probes, the announcing states send ARP announcements and finally settle in RUNNING. A failure stops the machine; the handler itself always returns 0. */ +static int ipv4acd_on_timeout(sd_event_source *s, uint64_t usec, void *userdata) { + sd_ipv4acd *acd = ASSERT_PTR(userdata); + int r = 0; + + switch (acd->state) { + + case IPV4ACD_STATE_STARTED: + acd->defend_window = 0; + + ipv4acd_set_state(acd, IPV4ACD_STATE_WAITING_PROBE, true); + + if (acd->n_conflict >= MAX_CONFLICTS) { + log_ipv4acd(acd, "Max conflicts reached, delaying by %s", + FORMAT_TIMESPAN(RATE_LIMIT_INTERVAL_USEC, 0)); + r = ipv4acd_set_next_wakeup(acd, RATE_LIMIT_INTERVAL_USEC, PROBE_WAIT_USEC); + } else + r = ipv4acd_set_next_wakeup(acd, 0, 
PROBE_WAIT_USEC); + if (r < 0) + goto fail; + + break; + + case IPV4ACD_STATE_WAITING_PROBE: + case IPV4ACD_STATE_PROBING: + /* Send a probe */ + r = arp_send_probe(acd->fd, acd->ifindex, &acd->address, &acd->mac_addr); + if (r < 0) { + log_ipv4acd_errno(acd, r, "Failed to send ARP probe: %m"); + goto fail; + } + + log_ipv4acd(acd, "Probing "IPV4_ADDRESS_FMT_STR, IPV4_ADDRESS_FMT_VAL(acd->address)); +/* While more probes are due, stay in PROBING and wait a random time between PROBE_MIN_USEC and PROBE_MAX_USEC; after the last probe wait ANNOUNCE_WAIT_USEC before announcing. */ + if (acd->n_iteration < PROBE_NUM - 2) { + ipv4acd_set_state(acd, IPV4ACD_STATE_PROBING, false); + + r = ipv4acd_set_next_wakeup(acd, PROBE_MIN_USEC, (PROBE_MAX_USEC-PROBE_MIN_USEC)); + if (r < 0) + goto fail; + } else { + ipv4acd_set_state(acd, IPV4ACD_STATE_WAITING_ANNOUNCE, true); + + r = ipv4acd_set_next_wakeup(acd, ANNOUNCE_WAIT_USEC, 0); + if (r < 0) + goto fail; + } + + break; +/* Once enough announcements went out, settle in RUNNING without re-arming the timer; otherwise fall through and announce again. */ + case IPV4ACD_STATE_ANNOUNCING: + if (acd->n_iteration >= ANNOUNCE_NUM - 1) { + ipv4acd_set_state(acd, IPV4ACD_STATE_RUNNING, false); + break; + } + + _fallthrough_; + case IPV4ACD_STATE_WAITING_ANNOUNCE: + /* Send announcement packet */ + r = arp_send_announcement(acd->fd, acd->ifindex, &acd->address, &acd->mac_addr); + if (r < 0) { + log_ipv4acd_errno(acd, r, "Failed to send ARP announcement: %m"); + goto fail; + } + + log_ipv4acd(acd, "Announcing "IPV4_ADDRESS_FMT_STR, IPV4_ADDRESS_FMT_VAL(acd->address)); + + ipv4acd_set_state(acd, IPV4ACD_STATE_ANNOUNCING, false); + + r = ipv4acd_set_next_wakeup(acd, ANNOUNCE_INTERVAL_USEC, 0); + if (r < 0) + goto fail; +/* n_iteration is 0 only right after the transition into ANNOUNCING, i.e. for the first announcement: clear the conflict counter and report the address as bound. */ + if (acd->n_iteration == 0) { + acd->n_conflict = 0; + ipv4acd_client_notify(acd, SD_IPV4ACD_EVENT_BIND); + } + + break; + + default: + assert_not_reached(); + } + + return 0; +/* Errors are handled by stopping the state machine; the handler still returns 0. */ +fail: + sd_ipv4acd_stop(acd); + return 0; +} +/* Decide whether a received ARP packet conflicts with our address. With announced true only a matching sender IP counts; with announced false an ARP probe from a foreign MAC that targets our address counts as well. */ +static bool ipv4acd_arp_conflict(sd_ipv4acd *acd, const struct ether_arp *arp, bool announced) { + assert(acd); + assert(arp); + + /* RFC 5227 section 2.1.1. 
+ * "the host receives any ARP packet (Request *or* Reply) on the interface where the probe is + * being performed, where the packet's 'sender IP address' is the address being probed for, + * then the host MUST treat this address as being in use by some other host" */ + if (memcmp(arp->arp_spa, &acd->address, sizeof(struct in_addr)) == 0) + return true; + + if (announced) + /* the TPA matched instead of SPA, this is not a conflict */ + return false; + + /* "any ARP Probe where the packet's 'target IP address' is the address being probed for, and + * the packet's 'sender hardware address' is not the hardware address of any of the host's + * interfaces, then the host SHOULD similarly treat this as an address conflict" */ + if (arp->ea_hdr.ar_op != htobe16(ARPOP_REQUEST)) + return false; /* not ARP Request, ignoring. */ + if (memeqzero(arp->arp_spa, sizeof(struct in_addr)) == 0) + return false; /* not ARP Probe, ignoring. */ + if (memcmp(arp->arp_tpa, &acd->address, sizeof(struct in_addr)) != 0) + return false; /* target IP address does not match, BPF code is broken? */ +/* Let the optional callback decide whether the sender MAC is one of ours; a positive result suppresses the conflict, zero or a negative result does not. */ + if (acd->check_mac_callback && + acd->check_mac_callback(acd, (const struct ether_addr*) arp->arp_sha, acd->check_mac_userdata) > 0) + /* sender hardware is one of the host's interfaces, ignoring. */ + return false; + + return true; /* conflict! 
*/ +} +/* Handle a detected conflict: count it, log it, reset the object to INIT and notify the user with SD_IPV4ACD_EVENT_CONFLICT. */ +static void ipv4acd_on_conflict(sd_ipv4acd *acd) { + assert(acd); + + acd->n_conflict++; + + log_ipv4acd(acd, "Conflict on "IPV4_ADDRESS_FMT_STR" (%u)", IPV4_ADDRESS_FMT_VAL(acd->address), acd->n_conflict); + + ipv4acd_reset(acd); + ipv4acd_client_notify(acd, SD_IPV4ACD_EVENT_CONFLICT); +} +/* I/O handler for the ARP socket: read one packet and, depending on the state, defend the address or report a conflict. Transient and disconnect errors and packets of unexpected size are ignored; other read or send failures stop the state machine. Always returns 0. */ +static int ipv4acd_on_packet( + sd_event_source *s, + int fd, + uint32_t revents, + void *userdata) { + + sd_ipv4acd *acd = ASSERT_PTR(userdata); + struct ether_arp packet; + ssize_t n; + int r; + + assert(s); + assert(fd >= 0); + + n = recv(fd, &packet, sizeof(struct ether_arp), 0); + if (n < 0) { + if (ERRNO_IS_TRANSIENT(errno) || ERRNO_IS_DISCONNECT(errno)) + return 0; + + log_ipv4acd_errno(acd, errno, "Failed to read ARP packet: %m"); + goto fail; + } + if ((size_t) n != sizeof(struct ether_arp)) { + log_ipv4acd(acd, "Ignoring too short ARP packet."); + return 0; + } + + switch (acd->state) { + + case IPV4ACD_STATE_ANNOUNCING: + case IPV4ACD_STATE_RUNNING: + + if (ipv4acd_arp_conflict(acd, &packet, true)) { + usec_t ts; + + assert_se(sd_event_now(acd->event, CLOCK_BOOTTIME, &ts) >= 0); +/* Past the defend deadline: re-announce and open a new window of DEFEND_INTERVAL_USEC. A conflict seen before the deadline gives the address up instead. */ + /* Defend address */ + if (ts > acd->defend_window) { + acd->defend_window = ts + DEFEND_INTERVAL_USEC; + r = arp_send_announcement(acd->fd, acd->ifindex, &acd->address, &acd->mac_addr); + if (r < 0) { + log_ipv4acd_errno(acd, r, "Failed to send ARP announcement: %m"); + goto fail; + } + + log_ipv4acd(acd, "Defending "IPV4_ADDRESS_FMT_STR, IPV4_ADDRESS_FMT_VAL(acd->address)); + + } else + ipv4acd_on_conflict(acd); + } + break; + + case IPV4ACD_STATE_WAITING_PROBE: + case IPV4ACD_STATE_PROBING: + case IPV4ACD_STATE_WAITING_ANNOUNCE: + if (ipv4acd_arp_conflict(acd, &packet, false)) + ipv4acd_on_conflict(acd); + break; +/* NOTE(review): IPV4ACD_STATE_STARTED is not handled although sd_ipv4acd_start() installs this I/O source before the first timer tick; presumably no packet can be dispatched in that window - confirm against sd-event ordering. */ + default: + assert_not_reached(); + } + + return 0; + +fail: + sd_ipv4acd_stop(acd); + return 0; +} +/* Set the interface index to operate on; requires a positive index and a stopped (INIT) object, otherwise -EINVAL or -EBUSY is returned. */ +int sd_ipv4acd_set_ifindex(sd_ipv4acd *acd, int ifindex) { + assert_return(acd, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + assert_return(acd->state == 
IPV4ACD_STATE_INIT, -EBUSY); + + acd->ifindex = ifindex; + + return 0; +} +/* Return the configured interface index, or -EINVAL for a NULL object. */ +int sd_ipv4acd_get_ifindex(sd_ipv4acd *acd) { + if (!acd) + return -EINVAL; + + return acd->ifindex; +} +/* Store a copy of the interface name after validating it with ifname_valid_full(); an invalid name yields -EINVAL. */ +int sd_ipv4acd_set_ifname(sd_ipv4acd *acd, const char *ifname) { + assert_return(acd, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&acd->ifname, ifname); +} +/* Resolve the interface name through get_ifname() into acd->ifname and, if ret is given, return a pointer to that string, which stays owned by acd. Presumably get_ifname() keeps an already set name - confirm in network-common. */ +int sd_ipv4acd_get_ifname(sd_ipv4acd *acd, const char **ret) { + int r; + + assert_return(acd, -EINVAL); + + r = get_ifname(acd->ifindex, &acd->ifname); + if (r < 0) + return r; + + if (ret) + *ret = acd->ifname; + + return 0; +} +/* Set our hardware address. When detection is running the ARP socket filter is updated as well; if that fails the object is reset to INIT and the error returned. */ +int sd_ipv4acd_set_mac(sd_ipv4acd *acd, const struct ether_addr *addr) { + int r; + + assert_return(acd, -EINVAL); + assert_return(addr, -EINVAL); + assert_return(!ether_addr_is_null(addr), -EINVAL); + + acd->mac_addr = *addr; + + if (!sd_ipv4acd_is_running(acd)) + return 0; + + assert(acd->fd >= 0); + r = arp_update_filter(acd->fd, &acd->address, &acd->mac_addr); + if (r < 0) { + ipv4acd_reset(acd); + return r; + } + + return 0; +} +/* Drop the reference to the attached event loop. */ +int sd_ipv4acd_detach_event(sd_ipv4acd *acd) { + assert_return(acd, -EINVAL); + + acd->event = sd_event_unref(acd->event); + + return 0; +} +/* Attach an event loop (the default loop when event is NULL) and remember the priority for event sources created later. Fails with -EBUSY if a loop is already attached. */ +int sd_ipv4acd_attach_event(sd_ipv4acd *acd, sd_event *event, int64_t priority) { + int r; + + assert_return(acd, -EINVAL); + assert_return(!acd->event, -EBUSY); + + if (event) + acd->event = sd_event_ref(event); + else { + r = sd_event_default(&acd->event); + if (r < 0) + return r; + } + + acd->event_priority = priority; + + return 0; +} +/* Register the callback that receives STOP, BIND and CONFLICT events, together with its userdata. */ +int sd_ipv4acd_set_callback(sd_ipv4acd *acd, sd_ipv4acd_callback_t cb, void *userdata) { + assert_return(acd, -EINVAL); + + acd->callback = cb; + acd->userdata = userdata; + + return 0; +} +/* Register the callback that ipv4acd_arp_conflict() consults to learn whether a sender MAC belongs to this host. */ +int sd_ipv4acd_set_check_mac_callback(sd_ipv4acd *acd, sd_ipv4acd_check_mac_callback_t cb, void *userdata) { + assert_return(acd, -EINVAL); + + acd->check_mac_callback = cb; + 
acd->check_mac_userdata = userdata; + return 0; +} +/* Set the address to check. Setting the address already configured is a no-op. When detection is running, the socket filter is updated, the timer is armed to fire immediately and the state machine restarts from STARTED; on failure the object is reset to INIT. */ +int sd_ipv4acd_set_address(sd_ipv4acd *acd, const struct in_addr *address) { + int r; + + assert_return(acd, -EINVAL); + assert_return(address, -EINVAL); + assert_return(in4_addr_is_set(address), -EINVAL); + + if (in4_addr_equal(&acd->address, address)) + return 0; + + acd->address = *address; + + if (!sd_ipv4acd_is_running(acd)) + return 0; + + assert(acd->fd >= 0); + r = arp_update_filter(acd->fd, &acd->address, &acd->mac_addr); + if (r < 0) + goto fail; + + r = ipv4acd_set_next_wakeup(acd, 0, 0); + if (r < 0) + goto fail; + + ipv4acd_set_state(acd, IPV4ACD_STATE_STARTED, true); + return 0; + +fail: + ipv4acd_reset(acd); + return r; +} +/* Copy the configured address into *address. */ +int sd_ipv4acd_get_address(sd_ipv4acd *acd, struct in_addr *address) { + assert_return(acd, -EINVAL); + assert_return(address, -EINVAL); + + *address = acd->address; + + return 0; +} +/* True in every state other than INIT; false for a NULL object. */ +int sd_ipv4acd_is_running(sd_ipv4acd *acd) { + assert_return(acd, false); + + return acd->state != IPV4ACD_STATE_INIT; +} +/* True once the address has been announced, i.e. in the ANNOUNCING and RUNNING states. */ +int sd_ipv4acd_is_bound(sd_ipv4acd *acd) { + assert_return(acd, false); + + return IN_SET(acd->state, IPV4ACD_STATE_ANNOUNCING, IPV4ACD_STATE_RUNNING); +} +/* Start conflict detection. Requires an attached event loop, a positive ifindex, a set address and MAC, and the INIT state. Binds the raw ARP socket, optionally clears the conflict counter, installs the receive source and arms the timer to fire immediately. Any failure after the socket was bound resets the object to INIT. */ +int sd_ipv4acd_start(sd_ipv4acd *acd, bool reset_conflicts) { + int r; + + assert_return(acd, -EINVAL); + assert_return(acd->event, -EINVAL); + assert_return(acd->ifindex > 0, -EINVAL); + assert_return(in4_addr_is_set(&acd->address), -EINVAL); + assert_return(!ether_addr_is_null(&acd->mac_addr), -EINVAL); + assert_return(acd->state == IPV4ACD_STATE_INIT, -EBUSY); + + r = arp_network_bind_raw_socket(acd->ifindex, &acd->address, &acd->mac_addr); + if (r < 0) + return r; + + close_and_replace(acd->fd, r); + + if (reset_conflicts) + acd->n_conflict = 0; + + r = sd_event_add_io(acd->event, &acd->receive_message_event_source, acd->fd, EPOLLIN, ipv4acd_on_packet, acd); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(acd->receive_message_event_source, acd->event_priority); + if (r < 0) + goto fail; + + (void) 
sd_event_source_set_description(acd->receive_message_event_source, "ipv4acd-receive-message"); + + r = ipv4acd_set_next_wakeup(acd, 0, 0); + if (r < 0) + goto fail; + + ipv4acd_set_state(acd, IPV4ACD_STATE_STARTED, true); + return 0; + +fail: + ipv4acd_reset(acd); + return r; +} diff --git a/src/libsystemd-network/sd-ipv4ll.c b/src/libsystemd-network/sd-ipv4ll.c new file mode 100644 index 0000000..a29279e --- /dev/null +++ b/src/libsystemd-network/sd-ipv4ll.c @@ -0,0 +1,365 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Axis Communications AB. All rights reserved. +***/ + +#include +#include +#include +#include + +#include "sd-id128.h" +#include "sd-ipv4acd.h" +#include "sd-ipv4ll.h" + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "in-addr-util.h" +#include "network-common.h" +#include "random-util.h" +#include "siphash24.h" +#include "sparse-endian.h" +#include "string-util.h" + +#define IPV4LL_NETWORK UINT32_C(0xA9FE0000) +#define IPV4LL_NETMASK UINT32_C(0xFFFF0000) + +#define IPV4LL_DONT_DESTROY(ll) \ + _cleanup_(sd_ipv4ll_unrefp) _unused_ sd_ipv4ll *_dont_destroy_##ll = sd_ipv4ll_ref(ll) + +struct sd_ipv4ll { + unsigned n_ref; + + sd_ipv4acd *acd; + + be32_t address; /* the address pushed to ACD */ + struct ether_addr mac; + + struct { + le64_t value; + le64_t generation; + } seed; + bool seed_set; + + /* External */ + be32_t claimed_address; + + sd_ipv4ll_callback_t callback; + void *userdata; + + sd_ipv4ll_check_mac_callback_t check_mac_callback; + void *check_mac_userdata; +}; + +#define log_ipv4ll_errno(ll, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "IPv4LL: ", \ + sd_ipv4ll, ll, \ + error, fmt, ##__VA_ARGS__) +#define log_ipv4ll(ll, fmt, ...) 
\ + log_interface_prefix_full_errno_zerook( \ + "IPv4LL: ", \ + sd_ipv4ll, ll, \ + 0, fmt, ##__VA_ARGS__) + +static void ipv4ll_on_acd(sd_ipv4acd *acd, int event, void *userdata); +static int ipv4ll_check_mac(sd_ipv4acd *acd, const struct ether_addr *mac, void *userdata); +/* Destructor handed to the ref/unref generator below: drops the reference on the embedded ACD object and frees ll. */ +static sd_ipv4ll *ipv4ll_free(sd_ipv4ll *ll) { + assert(ll); + + sd_ipv4acd_unref(ll->acd); + return mfree(ll); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_ipv4ll, sd_ipv4ll, ipv4ll_free); +/* Allocate a zeroed IPv4LL object with one reference, create its ACD object and route the ACD event and MAC-check callbacks to the handlers in this file. On failure the partially built object is released by its cleanup attribute. */ +int sd_ipv4ll_new(sd_ipv4ll **ret) { + _cleanup_(sd_ipv4ll_unrefp) sd_ipv4ll *ll = NULL; + int r; + + assert_return(ret, -EINVAL); + + ll = new0(sd_ipv4ll, 1); + if (!ll) + return -ENOMEM; + + ll->n_ref = 1; + + r = sd_ipv4acd_new(&ll->acd); + if (r < 0) + return r; + + r = sd_ipv4acd_set_callback(ll->acd, ipv4ll_on_acd, ll); + if (r < 0) + return r; + + r = sd_ipv4acd_set_check_mac_callback(ll->acd, ipv4ll_check_mac, ll); + if (r < 0) + return r; + + *ret = TAKE_PTR(ll); + + return 0; +} +/* Stop the underlying ACD object; a NULL ll is a no-op. */ +int sd_ipv4ll_stop(sd_ipv4ll *ll) { + if (!ll) + return 0; + + return sd_ipv4acd_stop(ll->acd); +} +/* Forward the interface index to ACD; refused with -EBUSY while running. */ +int sd_ipv4ll_set_ifindex(sd_ipv4ll *ll, int ifindex) { + assert_return(ll, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + assert_return(sd_ipv4ll_is_running(ll) == 0, -EBUSY); + + return sd_ipv4acd_set_ifindex(ll->acd, ifindex); +} +/* Return the interface index stored in the ACD object, or -EINVAL for a NULL ll. */ +int sd_ipv4ll_get_ifindex(sd_ipv4ll *ll) { + if (!ll) + return -EINVAL; + + return sd_ipv4acd_get_ifindex(ll->acd); +} +/* Forward the interface name to the ACD object. */ +int sd_ipv4ll_set_ifname(sd_ipv4ll *ll, const char *ifname) { + assert_return(ll, -EINVAL); + assert_return(ifname, -EINVAL); + + return sd_ipv4acd_set_ifname(ll->acd, ifname); +} +/* Fetch the interface name from the ACD object. */ +int sd_ipv4ll_get_ifname(sd_ipv4ll *ll, const char **ret) { + assert_return(ll, -EINVAL); + + return sd_ipv4acd_get_ifname(ll->acd, ret); +} +/* Set our hardware address on the ACD object first; the local copy, which seeds address selection, is only updated when that succeeded. */ +int sd_ipv4ll_set_mac(sd_ipv4ll *ll, const struct ether_addr *addr) { + int r; + + assert_return(ll, -EINVAL); + assert_return(addr, -EINVAL); + assert_return(!ether_addr_is_null(addr), -EINVAL); + + r = sd_ipv4acd_set_mac(ll->acd, addr); + if (r < 
0) + return r; + + ll->mac = *addr; + return 0; +} +/* Detach the ACD object from its event loop. */ +int sd_ipv4ll_detach_event(sd_ipv4ll *ll) { + assert_return(ll, -EINVAL); + + return sd_ipv4acd_detach_event(ll->acd); +} +/* Attach the ACD object to an event loop with the given priority. */ +int sd_ipv4ll_attach_event(sd_ipv4ll *ll, sd_event *event, int64_t priority) { + assert_return(ll, -EINVAL); + + return sd_ipv4acd_attach_event(ll->acd, event, priority); +} +/* Register the callback that receives SD_IPV4LL_EVENT_* notifications. */ +int sd_ipv4ll_set_callback(sd_ipv4ll *ll, sd_ipv4ll_callback_t cb, void *userdata) { + assert_return(ll, -EINVAL); + + ll->callback = cb; + ll->userdata = userdata; + + return 0; +} +/* Register the callback that ipv4ll_check_mac() forwards MAC ownership queries to. */ +int sd_ipv4ll_set_check_mac_callback(sd_ipv4ll *ll, sd_ipv4ll_check_mac_callback_t cb, void *userdata) { + assert_return(ll, -EINVAL); + + ll->check_mac_callback = cb; + ll->check_mac_userdata = userdata; + + return 0; +} +/* Return the claimed address, or -ENOENT while no address is claimed (claimed_address is 0). */ +int sd_ipv4ll_get_address(sd_ipv4ll *ll, struct in_addr *address) { + assert_return(ll, -EINVAL); + assert_return(address, -EINVAL); + + if (ll->claimed_address == 0) + return -ENOENT; + + address->s_addr = ll->claimed_address; + + return 0; +} +/* Set an explicit seed for address selection, stored little-endian; refused with -EBUSY while running. */ +int sd_ipv4ll_set_address_seed(sd_ipv4ll *ll, uint64_t seed) { + assert_return(ll, -EINVAL); + assert_return(sd_ipv4ll_is_running(ll) == 0, -EBUSY); + + ll->seed.value = htole64(seed); + ll->seed_set = true; + + return 0; +} +/* Running state is that of the underlying ACD object; false for a NULL ll. */ +int sd_ipv4ll_is_running(sd_ipv4ll *ll) { + assert_return(ll, false); + + return sd_ipv4acd_is_running(ll->acd); +} +/* Use the given address as candidate. It must pass in4_addr_is_link_local_dynamic(); it is pushed to ACD first and only remembered locally when that succeeded. */ +int sd_ipv4ll_set_address(sd_ipv4ll *ll, const struct in_addr *address) { + int r; + + assert_return(ll, -EINVAL); + assert_return(address, -EINVAL); + assert_return(in4_addr_is_link_local_dynamic(address), -EINVAL); + + r = sd_ipv4acd_set_address(ll->acd, address); + if (r < 0) + return r; + + ll->address = address->s_addr; + + return 0; +} + +#define PICK_HASH_KEY SD_ID128_MAKE(15,ac,82,a6,d6,3f,49,78,98,77,5d,0c,69,02,94,0b) +/* Derive a candidate address by hashing the seed (value plus generation) with siphash24. The generation is advanced on every attempt, so repeated calls yield different candidates. */ +static int ipv4ll_pick_address(sd_ipv4ll *ll) { + be32_t addr; + + assert(ll); + + do { + uint64_t h; + + h = siphash24(&ll->seed, sizeof(ll->seed), PICK_HASH_KEY.bytes); + + /* Increase the generation 
counter by one */ + ll->seed.generation = htole64(le64toh(ll->seed.generation) + 1); +/* The low 16 bits of the hash become the host part inside the link-local network. Candidates equal to the current address, or whose third octet is 0 or 255, are rejected and the loop tries again. */ + addr = htobe32((h & UINT32_C(0x0000FFFF)) | IPV4LL_NETWORK); + } while (addr == ll->address || + IN_SET(be32toh(addr) & 0x0000FF00U, 0x0000U, 0xFF00U)); + + log_ipv4ll(ll, "Picked new IP address %s.", IN4_ADDR_TO_STRING((const struct in_addr*) &addr)); + + return sd_ipv4ll_set_address(ll, &(struct in_addr) { addr }); +} + +#define MAC_HASH_KEY SD_ID128_MAKE(df,04,22,98,3f,ad,14,52,f9,87,2e,d1,9c,70,e2,f2) +/* Common start path. Derives the seed from the MAC unless one was set explicitly, optionally resets the generation counter, picks an address if none is set, and starts ACD; reset_generation is also passed to ACD as its reset_conflicts flag. Returns 1 on success or a negative error. */ +static int ipv4ll_start_internal(sd_ipv4ll *ll, bool reset_generation) { + int r; + bool picked_address = false; + + assert_return(ll, -EINVAL); + assert_return(!ether_addr_is_null(&ll->mac), -EINVAL); + + /* If no random seed is set, generate some from the MAC address */ + if (!ll->seed_set) + ll->seed.value = htole64(siphash24(ll->mac.ether_addr_octet, ETH_ALEN, MAC_HASH_KEY.bytes)); + + if (reset_generation) + ll->seed.generation = 0; + + if (ll->address == 0) { + r = ipv4ll_pick_address(ll); + if (r < 0) + return r; + + picked_address = true; + } + + r = sd_ipv4acd_start(ll->acd, reset_generation); + if (r < 0) { + + /* We couldn't start? If so, let's forget the picked address again, the user might make a change and + * retry, and we want the new data to take effect when picking an address. 
*/ + if (picked_address) + ll->address = 0; + + return r; + } + + return 1; +} + +int sd_ipv4ll_start(sd_ipv4ll *ll) { + assert_return(ll, -EINVAL); + + if (sd_ipv4ll_is_running(ll)) + return 0; + + return ipv4ll_start_internal(ll, true); +} + +int sd_ipv4ll_restart(sd_ipv4ll *ll) { + ll->address = 0; + + return ipv4ll_start_internal(ll, false); +} + +static void ipv4ll_client_notify(sd_ipv4ll *ll, int event) { + assert(ll); + + if (ll->callback) + ll->callback(ll, event, ll->userdata); +} + +void ipv4ll_on_acd(sd_ipv4acd *acd, int event, void *userdata) { + sd_ipv4ll *ll = ASSERT_PTR(userdata); + IPV4LL_DONT_DESTROY(ll); + int r; + + assert(acd); + + switch (event) { + + case SD_IPV4ACD_EVENT_STOP: + ipv4ll_client_notify(ll, SD_IPV4LL_EVENT_STOP); + ll->claimed_address = 0; + break; + + case SD_IPV4ACD_EVENT_BIND: + ll->claimed_address = ll->address; + ipv4ll_client_notify(ll, SD_IPV4LL_EVENT_BIND); + break; + + case SD_IPV4ACD_EVENT_CONFLICT: + /* if an address was already bound we must call up to the + user to handle this, otherwise we just try again */ + if (ll->claimed_address != 0) { + ipv4ll_client_notify(ll, SD_IPV4LL_EVENT_CONFLICT); + + ll->claimed_address = 0; + } else { + r = sd_ipv4ll_restart(ll); + if (r < 0) + goto error; + } + + break; + + default: + assert_not_reached(); + } + + return; + +error: + ipv4ll_client_notify(ll, SD_IPV4LL_EVENT_STOP); +} + +static int ipv4ll_check_mac(sd_ipv4acd *acd, const struct ether_addr *mac, void *userdata) { + sd_ipv4ll *ll = ASSERT_PTR(userdata); + + if (ll->check_mac_callback) + return ll->check_mac_callback(ll, mac, ll->check_mac_userdata); + + return 0; +} diff --git a/src/libsystemd-network/sd-lldp-rx.c b/src/libsystemd-network/sd-lldp-rx.c new file mode 100644 index 0000000..2fc9a55 --- /dev/null +++ b/src/libsystemd-network/sd-lldp-rx.c @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-lldp-rx.h" + +#include "alloc-util.h" +#include 
"ether-addr-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "lldp-neighbor.h" +#include "lldp-network.h" +#include "lldp-rx-internal.h" +#include "memory-util.h" +#include "network-common.h" +#include "socket-util.h" +#include "sort-util.h" +#include "string-table.h" +/* Default for neighbors_max, applied by sd_lldp_rx_new(). */ +#define LLDP_DEFAULT_NEIGHBORS_MAX 128U +/* Printable name of each receive event, used in the log lines of lldp_rx_callback(). */ +static const char * const lldp_rx_event_table[_SD_LLDP_RX_EVENT_MAX] = { + [SD_LLDP_RX_EVENT_ADDED] = "added", + [SD_LLDP_RX_EVENT_REMOVED] = "removed", + [SD_LLDP_RX_EVENT_UPDATED] = "updated", + [SD_LLDP_RX_EVENT_REFRESHED] = "refreshed", +}; + +DEFINE_STRING_TABLE_LOOKUP(lldp_rx_event, sd_lldp_rx_event_t); +/* Empty the neighbor table by clearing the by-id hashmap. */ +static void lldp_rx_flush_neighbors(sd_lldp_rx *lldp_rx) { + assert(lldp_rx); + + hashmap_clear(lldp_rx->neighbor_by_id); +} +/* Log the event and pass it to the user callback; without a callback only the log line is produced. */ +static void lldp_rx_callback(sd_lldp_rx *lldp_rx, sd_lldp_rx_event_t event, sd_lldp_neighbor *n) { + assert(lldp_rx); + assert(event >= 0 && event < _SD_LLDP_RX_EVENT_MAX); + + if (!lldp_rx->callback) + return (void) log_lldp_rx(lldp_rx, "Received '%s' event.", lldp_rx_event_to_string(event)); + + log_lldp_rx(lldp_rx, "Invoking callback for '%s' event.", lldp_rx_event_to_string(event)); + lldp_rx->callback(lldp_rx, event, n, lldp_rx->userdata); +} +/* Evict neighbors, taking them from the head of the expiry queue, until there is room for extra more entries and the head is no longer expired. Each eviction raises a REMOVED event. Returns whether anything was removed. */ +static int lldp_rx_make_space(sd_lldp_rx *lldp_rx, size_t extra) { + usec_t t = USEC_INFINITY; + bool changed = false; + + assert(lldp_rx); + + /* Remove all entries that are past their TTL, and more until at least the specified number of extra entries + * are free. 
*/ + + for (;;) { + _cleanup_(sd_lldp_neighbor_unrefp) sd_lldp_neighbor *n = NULL; + + n = prioq_peek(lldp_rx->neighbor_by_expiry); + if (!n) + break; +/* Keep the neighbor referenced while it is unlinked and reported; the cleanup attribute drops this reference at the end of the iteration. */ + sd_lldp_neighbor_ref(n); + + if (hashmap_size(lldp_rx->neighbor_by_id) > LESS_BY(lldp_rx->neighbors_max, extra)) + goto remove_one; +/* The current time is fetched once, on first use. */ + if (t == USEC_INFINITY) + t = now(CLOCK_BOOTTIME); + + if (n->until > t) + break; + + remove_one: + lldp_neighbor_unlink(n); + lldp_rx_callback(lldp_rx, SD_LLDP_RX_EVENT_REMOVED, n); + changed = true; + } + + return changed; +} +/* Decide whether a freshly parsed neighbor should be stored: it needs a non-zero TTL, must not come from the filter address, and must advertise a capability selected by capability_mask unless it carries no capability field. */ +static bool lldp_rx_keep_neighbor(sd_lldp_rx *lldp_rx, sd_lldp_neighbor *n) { + assert(lldp_rx); + assert(n); + + /* Don't keep data with a zero TTL */ + if (n->ttl <= 0) + return false; + + /* Filter out data from the filter address */ + if (!ether_addr_is_null(&lldp_rx->filter_address) && + ether_addr_equal(&lldp_rx->filter_address, &n->source_address)) + return false; + + /* Only add if the neighbor has a capability we are interested in. Note that we also store all neighbors with + * no caps field set. */ + if (n->has_capabilities && + (n->enabled_capabilities & lldp_rx->capability_mask) == 0) + return false; + + /* Keep everything else */ + return true; +} + +static int lldp_rx_start_timer(sd_lldp_rx *lldp_rx, sd_lldp_neighbor *neighbor); +/* Merge a parsed neighbor into the table. An existing entry with the same id is removed (REMOVED) when the new data should not be kept, refreshed (REFRESHED) when the data is equal, or replaced (UPDATED) when it changed; an unknown neighbor is added (ADDED). Returns 1 when n was stored, 0 when it was not, or a negative error. */ +static int lldp_rx_add_neighbor(sd_lldp_rx *lldp_rx, sd_lldp_neighbor *n) { + _cleanup_(sd_lldp_neighbor_unrefp) sd_lldp_neighbor *old = NULL; + bool keep; + int r; + + assert(lldp_rx); + assert(n); + assert(!n->lldp_rx); + + keep = lldp_rx_keep_neighbor(lldp_rx, n); + + /* First retrieve the old entry for this MSAP */ + old = hashmap_get(lldp_rx->neighbor_by_id, &n->id); + if (old) { + sd_lldp_neighbor_ref(old); + + if (!keep) { + lldp_neighbor_unlink(old); + lldp_rx_callback(lldp_rx, SD_LLDP_RX_EVENT_REMOVED, old); + return 0; + } + + if (lldp_neighbor_equal(n, old)) { + /* Is this equal, then restart the TTL counter, but don't do anything else. 
*/ + old->timestamp = n->timestamp; + lldp_rx_start_timer(lldp_rx, old); + lldp_rx_callback(lldp_rx, SD_LLDP_RX_EVENT_REFRESHED, old); + return 0; + } + + /* Data changed, remove the old entry, and add a new one */ + lldp_neighbor_unlink(old); + + } else if (!keep) + return 0; + + /* Then, make room for at least one new neighbor */ + lldp_rx_make_space(lldp_rx, 1); +/* Insert into both indexes; if the expiry queue insert fails, the hashmap insert is rolled back so the two stay consistent. */ + r = hashmap_ensure_put(&lldp_rx->neighbor_by_id, &lldp_neighbor_hash_ops, &n->id, n); + if (r < 0) + goto finish; + + r = prioq_ensure_put(&lldp_rx->neighbor_by_expiry, lldp_neighbor_prioq_compare_func, n, &n->prioq_idx); + if (r < 0) { + assert_se(hashmap_remove(lldp_rx->neighbor_by_id, &n->id) == n); + goto finish; + } + + n->lldp_rx = lldp_rx; + + lldp_rx_start_timer(lldp_rx, n); + lldp_rx_callback(lldp_rx, old ? SD_LLDP_RX_EVENT_UPDATED : SD_LLDP_RX_EVENT_ADDED, n); + + return 1; +/* Insert failed after the old entry was already unlinked: report that entry as removed. */ +finish: + if (old) + lldp_rx_callback(lldp_rx, SD_LLDP_RX_EVENT_REMOVED, old); + + return r; +} +/* Parse a received datagram and merge it into the neighbor table. Parse errors are returned as is; a failure to add is logged and returned. Returns 0 on success. */ +static int lldp_rx_handle_datagram(sd_lldp_rx *lldp_rx, sd_lldp_neighbor *n) { + int r; + + assert(lldp_rx); + assert(n); + + r = lldp_neighbor_parse(n); + if (r < 0) + return r; + + r = lldp_rx_add_neighbor(lldp_rx, n); + if (r < 0) + return log_lldp_rx_errno(lldp_rx, r, "Failed to add datagram. 
Ignoring."); + + log_lldp_rx(lldp_rx, "Successfully processed LLDP datagram."); + return 0; +} +/* I/O handler for the LLDP socket: size the next datagram, read it into a new neighbor object, stamp it with the kernel receive time when available (current time otherwise) and hand it to lldp_rx_handle_datagram(). All errors are logged or ignored; the handler always returns 0. */ +static int lldp_rx_receive_datagram(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(sd_lldp_neighbor_unrefp) sd_lldp_neighbor *n = NULL; + ssize_t space, length; + sd_lldp_rx *lldp_rx = ASSERT_PTR(userdata); + struct timespec ts; + + assert(fd >= 0); + + space = next_datagram_size_fd(fd); + if (ERRNO_IS_NEG_TRANSIENT(space) || ERRNO_IS_NEG_DISCONNECT(space)) + return 0; + if (space < 0) { + log_lldp_rx_errno(lldp_rx, space, "Failed to determine datagram size to read, ignoring: %m"); + return 0; + } + + n = lldp_neighbor_new(space); + if (!n) { + log_oom_debug(); + return 0; + } + + length = recv(fd, LLDP_NEIGHBOR_RAW(n), n->raw_size, MSG_DONTWAIT); + if (length < 0) { + if (ERRNO_IS_TRANSIENT(errno) || ERRNO_IS_DISCONNECT(errno)) + return 0; + + log_lldp_rx_errno(lldp_rx, errno, "Failed to read LLDP datagram, ignoring: %m"); + return 0; + } + + if ((size_t) length != n->raw_size) { + log_lldp_rx(lldp_rx, "Packet size mismatch, ignoring"); + return 0; + } + + /* Try to get the timestamp of this packet if it is known */ + if (ioctl(fd, SIOCGSTAMPNS, &ts) >= 0) + triple_timestamp_from_realtime(&n->timestamp, timespec_load(&ts)); + else + triple_timestamp_now(&n->timestamp); + + (void) lldp_rx_handle_datagram(lldp_rx, n); + return 0; +} +/* Stop receiving: disable the timer source, drop the I/O source and close the socket. The neighbor table is left untouched. */ +static void lldp_rx_reset(sd_lldp_rx *lldp_rx) { + assert(lldp_rx); + + (void) event_source_disable(lldp_rx->timer_event_source); + lldp_rx->io_event_source = sd_event_source_disable_unref(lldp_rx->io_event_source); + lldp_rx->fd = safe_close(lldp_rx->fd); +} +/* Running means a socket is open (fd >= 0); false for a NULL object. */ +int sd_lldp_rx_is_running(sd_lldp_rx *lldp_rx) { + if (!lldp_rx) + return false; + + return lldp_rx->fd >= 0; +} +/* Start receiving on the configured interface. Requires an attached event loop and a positive ifindex. Returns 0 when already running, 1 when started, or a negative error. */ +int sd_lldp_rx_start(sd_lldp_rx *lldp_rx) { + int r; + + assert_return(lldp_rx, -EINVAL); + assert_return(lldp_rx->event, -EINVAL); + assert_return(lldp_rx->ifindex > 0, -EINVAL); + + if (sd_lldp_rx_is_running(lldp_rx)) + return 0; + 
assert(!lldp_rx->io_event_source); +/* On failure the negative error is left in fd; sd_lldp_rx_is_running() treats any negative fd as not running. */ + lldp_rx->fd = lldp_network_bind_raw_socket(lldp_rx->ifindex); + if (lldp_rx->fd < 0) + return lldp_rx->fd; + + r = sd_event_add_io(lldp_rx->event, &lldp_rx->io_event_source, lldp_rx->fd, EPOLLIN, lldp_rx_receive_datagram, lldp_rx); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(lldp_rx->io_event_source, lldp_rx->event_priority); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(lldp_rx->io_event_source, "lldp-rx-io"); + + log_lldp_rx(lldp_rx, "Started LLDP client"); + return 1; + +fail: + lldp_rx_reset(lldp_rx); + return r; +} +/* Stop receiving and forget all neighbors. Returns 0 when it was not running (including a NULL object), 1 when it was stopped. */ +int sd_lldp_rx_stop(sd_lldp_rx *lldp_rx) { + if (!sd_lldp_rx_is_running(lldp_rx)) + return 0; + + log_lldp_rx(lldp_rx, "Stopping LLDP client"); + + lldp_rx_reset(lldp_rx); + lldp_rx_flush_neighbors(lldp_rx); + + return 1; +} +/* Attach an event loop (the default loop when event is NULL) and remember the priority for the event sources. Refused with -EBUSY while running or when a loop is already attached. */ +int sd_lldp_rx_attach_event(sd_lldp_rx *lldp_rx, sd_event *event, int64_t priority) { + int r; + + assert_return(lldp_rx, -EINVAL); + assert_return(!sd_lldp_rx_is_running(lldp_rx), -EBUSY); + assert_return(!lldp_rx->event, -EBUSY); + + if (event) + lldp_rx->event = sd_event_ref(event); + else { + r = sd_event_default(&lldp_rx->event); + if (r < 0) + return r; + } + + lldp_rx->event_priority = priority; + + return 0; +} +/* Release both event sources and the event loop reference; refused with -EBUSY while running. */ +int sd_lldp_rx_detach_event(sd_lldp_rx *lldp_rx) { + assert_return(lldp_rx, -EINVAL); + assert_return(!sd_lldp_rx_is_running(lldp_rx), -EBUSY); + + lldp_rx->io_event_source = sd_event_source_disable_unref(lldp_rx->io_event_source); + lldp_rx->timer_event_source = sd_event_source_disable_unref(lldp_rx->timer_event_source); + lldp_rx->event = sd_event_unref(lldp_rx->event); + return 0; +} +/* Return the attached event loop without taking a reference, or NULL for a NULL object. */ +sd_event* sd_lldp_rx_get_event(sd_lldp_rx *lldp_rx) { + assert_return(lldp_rx, NULL); + + return lldp_rx->event; +} +/* Register the callback that receives neighbor events, together with its userdata. */ +int sd_lldp_rx_set_callback(sd_lldp_rx *lldp_rx, sd_lldp_rx_callback_t cb, void *userdata) { + assert_return(lldp_rx, -EINVAL); + + lldp_rx->callback = cb; + lldp_rx->userdata = userdata; + + return 0; +} + +int 
sd_lldp_rx_set_ifindex(sd_lldp_rx *lldp_rx, int ifindex) { + assert_return(lldp_rx, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + assert_return(!sd_lldp_rx_is_running(lldp_rx), -EBUSY); + + lldp_rx->ifindex = ifindex; + return 0; +} +/* Store a copy of the interface name after validating it with ifname_valid_full(); an invalid name yields -EINVAL. */ +int sd_lldp_rx_set_ifname(sd_lldp_rx *lldp_rx, const char *ifname) { + assert_return(lldp_rx, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&lldp_rx->ifname, ifname); +} +/* Resolve the interface name through get_ifname() into lldp_rx->ifname and, if ret is given, return a pointer to that string, which stays owned by the object. */ +int sd_lldp_rx_get_ifname(sd_lldp_rx *lldp_rx, const char **ret) { + int r; + + assert_return(lldp_rx, -EINVAL); + + r = get_ifname(lldp_rx->ifindex, &lldp_rx->ifname); + if (r < 0) + return r; + + if (ret) + *ret = lldp_rx->ifname; + + return 0; +} +/* Destructor handed to the ref/unref generator below. The object is reset first, so that sd_lldp_rx_detach_event() does not refuse with -EBUSY; then the neighbors, both containers and the interface name are released. */ +static sd_lldp_rx *lldp_rx_free(sd_lldp_rx *lldp_rx) { + if (!lldp_rx) + return NULL; + + lldp_rx_reset(lldp_rx); + + sd_lldp_rx_detach_event(lldp_rx); + + lldp_rx_flush_neighbors(lldp_rx); + + hashmap_free(lldp_rx->neighbor_by_id); + prioq_free(lldp_rx->neighbor_by_expiry); + free(lldp_rx->ifname); + return mfree(lldp_rx); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_lldp_rx, sd_lldp_rx, lldp_rx_free); +/* Allocate a receiver with one reference, no socket (fd -EBADF), the default neighbor limit and a capability mask that selects every capability bit. Returns 0, or -ENOMEM on allocation failure. */ +int sd_lldp_rx_new(sd_lldp_rx **ret) { + _cleanup_(sd_lldp_rx_unrefp) sd_lldp_rx *lldp_rx = NULL; + + assert_return(ret, -EINVAL); + + lldp_rx = new(sd_lldp_rx, 1); + if (!lldp_rx) + return -ENOMEM; + + *lldp_rx = (sd_lldp_rx) { + .n_ref = 1, + .fd = -EBADF, + .neighbors_max = LLDP_DEFAULT_NEIGHBORS_MAX, + .capability_mask = UINT16_MAX, + }; + + *ret = TAKE_PTR(lldp_rx); + return 0; +} +/* Timer handler: evict expired neighbors and re-arm the timer for the next expiry. Failures are logged and ignored; always returns 0. */ +static int on_timer_event(sd_event_source *s, uint64_t usec, void *userdata) { + sd_lldp_rx *lldp_rx = userdata; + int r; + + r = lldp_rx_make_space(lldp_rx, 0); + if (r < 0) { + log_lldp_rx_errno(lldp_rx, r, "Failed to make space, ignoring: %m"); + return 0; + } + + r = lldp_rx_start_timer(lldp_rx, NULL); + if (r < 0) { + log_lldp_rx_errno(lldp_rx, r, "Failed to restart timer, ignoring: %m"); + return 0; + } + + return 
0; +} + /* Optionally starts the TTL of the given neighbor, then arms the timer for the 'until' time of the entry at the head of neighbor_by_expiry, or disables the timer when the queue is empty. */ +static int lldp_rx_start_timer(sd_lldp_rx *lldp_rx, sd_lldp_neighbor *neighbor) { + sd_lldp_neighbor *n; + + assert(lldp_rx); + assert(lldp_rx->event); + + if (neighbor) + lldp_neighbor_start_ttl(neighbor); + + n = prioq_peek(lldp_rx->neighbor_by_expiry); + if (!n) + return event_source_disable(lldp_rx->timer_event_source); + + return event_reset_time(lldp_rx->event, &lldp_rx->timer_event_source, + CLOCK_BOOTTIME, + n->until, 0, + on_timer_event, lldp_rx, + lldp_rx->event_priority, "lldp-rx-timer", true); +} + /* Sort comparator over pointers to neighbors; compares the neighbor IDs. */ +static int neighbor_compare_func(sd_lldp_neighbor * const *a, sd_lldp_neighbor * const *b) { + assert(a); + assert(b); + assert(*a); + assert(*b); + + return lldp_neighbor_id_compare_func(&(*a)->id, &(*b)->id); +} + /* Returns via ret a newly allocated array of referenced neighbors, sorted with neighbor_compare_func(), and the number of entries as return value. With no neighbors, *ret is set to NULL and 0 is returned. */ +int sd_lldp_rx_get_neighbors(sd_lldp_rx *lldp_rx, sd_lldp_neighbor ***ret) { + _cleanup_free_ sd_lldp_neighbor **l = NULL; + sd_lldp_neighbor *n; + int k = 0; + + assert_return(lldp_rx, -EINVAL); + assert_return(ret, -EINVAL); + + if (hashmap_isempty(lldp_rx->neighbor_by_id)) { /* Special shortcut */ + *ret = NULL; + return 0; + } + + l = new0(sd_lldp_neighbor*, hashmap_size(lldp_rx->neighbor_by_id)); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(n, lldp_rx->neighbor_by_id) + l[k++] = sd_lldp_neighbor_ref(n); + + assert((size_t) k == hashmap_size(lldp_rx->neighbor_by_id)); + + /* Return things in a stable order */ + typesafe_qsort(l, k, neighbor_compare_func); + *ret = TAKE_PTR(l); + + return k; +} + /* Sets the maximum number of neighbors (must be > 0) and immediately calls lldp_rx_make_space(); the result of that call is not checked. */ +int sd_lldp_rx_set_neighbors_max(sd_lldp_rx *lldp_rx, uint64_t m) { + assert_return(lldp_rx, -EINVAL); + assert_return(m > 0, -EINVAL); + + lldp_rx->neighbors_max = m; + lldp_rx_make_space(lldp_rx, 0); + + return 0; +} + /* Stores the capability mask; a mask of 0 is rejected. */ +int sd_lldp_rx_match_capabilities(sd_lldp_rx *lldp_rx, uint16_t mask) { + assert_return(lldp_rx, -EINVAL); + assert_return(mask != 0, -EINVAL); + + lldp_rx->capability_mask = mask; + + return 0; +} + /* Sets the single address to filter out, or clears it when addr is NULL. */ +int sd_lldp_rx_set_filter_address(sd_lldp_rx *lldp_rx, const struct ether_addr *addr) { + assert_return(lldp_rx,
-EINVAL); + + /* In order to deal nicely with bridges that send back our own packets, allow one address to be filtered, so + * that our own can be filtered out here. */ + + if (addr) + lldp_rx->filter_address = *addr; + else + zero(lldp_rx->filter_address); + + return 0; +} diff --git a/src/libsystemd-network/sd-lldp-tx.c b/src/libsystemd-network/sd-lldp-tx.c new file mode 100644 index 0000000..2b822af --- /dev/null +++ b/src/libsystemd-network/sd-lldp-tx.c @@ -0,0 +1,628 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-event.h" +#include "sd-id128.h" +#include "sd-lldp-tx.h" + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "fd-util.h" +#include "hostname-util.h" +#include "network-common.h" +#include "random-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "time-util.h" +#include "unaligned.h" +#include "web-util.h" + +/* The LLDP spec calls this "txFastInit", see 9.2.5.19 */ +#define LLDP_FAST_TX_INIT 4U + +/* The LLDP spec calls this "msgTxHold", see 9.2.5.6 */ +#define LLDP_TX_HOLD 4U + +/* The jitter range to add, see 9.2.2. */ +#define LLDP_TX_JITTER_USEC (400U * USEC_PER_MSEC) + +/* The LLDP spec calls this msgTxInterval, but we subtract half the jitter off it. */ +#define LLDP_TX_INTERVAL_USEC (30U * USEC_PER_SEC - LLDP_TX_JITTER_USEC / 2) + +/* The LLDP spec calls this msgFastTx, but we subtract half the jitter off it. 
*/ +#define LLDP_FAST_TX_INTERVAL_USEC (1U * USEC_PER_SEC - LLDP_TX_JITTER_USEC / 2) + /* TTL to advertise, in seconds: LLDP_TX_INTERVAL_USEC * LLDP_TX_HOLD plus one microsecond, rounded up to whole seconds. */ +#define LLDP_TX_TTL ((uint16_t) DIV_ROUND_UP(LLDP_TX_INTERVAL_USEC * LLDP_TX_HOLD + 1, USEC_PER_SEC)) + /* Destination MAC address for each multicast mode, indexed by the mode value. */ +static const struct ether_addr lldp_multicast_addr[_SD_LLDP_MULTICAST_MODE_MAX] = { + [SD_LLDP_MULTICAST_MODE_NEAREST_BRIDGE] = {{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x0e }}, + [SD_LLDP_MULTICAST_MODE_NON_TPMR_BRIDGE] = {{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03 }}, + [SD_LLDP_MULTICAST_MODE_CUSTOMER_BRIDGE] = {{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }}, +}; + /* State of one LLDP transmitter. */ +struct sd_lldp_tx { + unsigned n_ref; + + int ifindex; + char *ifname; + + sd_event *event; + int64_t event_priority; + sd_event_source *timer_event_source; + + unsigned fast_tx; /* number of upcoming transmissions that still use the fast interval */ + + sd_lldp_multicast_mode_t mode; + struct ether_addr hwaddr; + + char *port_description; + char *hostname; + char *pretty_hostname; + char *mud_url; + uint16_t supported_capabilities; + uint16_t enabled_capabilities; +}; + +#define log_lldp_tx_errno(lldp_tx, error, fmt, ...) \ + log_interface_prefix_full_errno( \ + "LLDP Tx: ", \ + sd_lldp_tx, lldp_tx, \ + error, fmt, ##__VA_ARGS__) +#define log_lldp_tx(lldp_tx, fmt, ...)
\ + log_interface_prefix_full_errno_zerook( \ + "LLDP Tx: ", \ + sd_lldp_tx, lldp_tx, \ + 0, fmt, ##__VA_ARGS__) + /* Free function passed to DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC() below: detaches the event loop and releases all owned strings and the object itself. NULL is accepted. */ +static sd_lldp_tx *lldp_tx_free(sd_lldp_tx *lldp_tx) { + if (!lldp_tx) + return NULL; + + sd_lldp_tx_detach_event(lldp_tx); + + free(lldp_tx->port_description); + free(lldp_tx->hostname); + free(lldp_tx->pretty_hostname); + free(lldp_tx->mud_url); + + free(lldp_tx->ifname); + return mfree(lldp_tx); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_lldp_tx, sd_lldp_tx, lldp_tx_free); + /* Allocates a transmitter holding one reference; the multicast mode starts out invalid and has to be set before sd_lldp_tx_start(). Returns 0 on success, -ENOMEM on allocation failure. */ +int sd_lldp_tx_new(sd_lldp_tx **ret) { + _cleanup_(sd_lldp_tx_unrefp) sd_lldp_tx *lldp_tx = NULL; + + assert_return(ret, -EINVAL); + + lldp_tx = new(sd_lldp_tx, 1); + if (!lldp_tx) + return -ENOMEM; + + *lldp_tx = (sd_lldp_tx) { + .n_ref = 1, + .mode = _SD_LLDP_MULTICAST_MODE_INVALID, + }; + + *ret = TAKE_PTR(lldp_tx); + return 0; +} + /* Sets the interface index, which must be positive. */ +int sd_lldp_tx_set_ifindex(sd_lldp_tx *lldp_tx, int ifindex) { + assert_return(lldp_tx, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + + lldp_tx->ifindex = ifindex; + return 0; +} + /* Stores a copy of the interface name after validating it with ifname_valid_full(). */ +int sd_lldp_tx_set_ifname(sd_lldp_tx *lldp_tx, const char *ifname) { + assert_return(lldp_tx, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&lldp_tx->ifname, ifname); +} + /* Calls get_ifname() with the stored ifindex and &lldp_tx->ifname; on success hands the stored pointer out through ret when ret is non-NULL. */ +int sd_lldp_tx_get_ifname(sd_lldp_tx *lldp_tx, const char **ret) { + int r; + + assert_return(lldp_tx, -EINVAL); + + r = get_ifname(lldp_tx->ifindex, &lldp_tx->ifname); + if (r < 0) + return r; + + if (ret) + *ret = lldp_tx->ifname; + + return 0; +} + /* Selects the multicast mode; values outside [0, _SD_LLDP_MULTICAST_MODE_MAX) are rejected. */ +int sd_lldp_tx_set_multicast_mode(sd_lldp_tx *lldp_tx, sd_lldp_multicast_mode_t mode) { + assert_return(lldp_tx, -EINVAL); + assert_return(mode >= 0 && mode < _SD_LLDP_MULTICAST_MODE_MAX, -EINVAL); + + lldp_tx->mode = mode; + return 0; +} + /* Sets the source MAC address. A NULL pointer and the null address are both rejected with -EINVAL; the NULL check comes first so that hwaddr is never dereferenced when it is NULL. */ +int sd_lldp_tx_set_hwaddr(sd_lldp_tx *lldp_tx, const struct ether_addr *hwaddr) { + assert_return(lldp_tx, -EINVAL); + assert_return(hwaddr, -EINVAL); + assert_return(!ether_addr_is_null(hwaddr), -EINVAL); + + lldp_tx->hwaddr =
*hwaddr; + return 0; +} + /* Stores the supported and enabled capability bits; every enabled bit must also be set in supported. */ +int sd_lldp_tx_set_capabilities(sd_lldp_tx *lldp_tx, uint16_t supported, uint16_t enabled) { + assert_return(lldp_tx, -EINVAL); + assert_return((enabled & ~supported) == 0, -EINVAL); + + lldp_tx->supported_capabilities = supported; + lldp_tx->enabled_capabilities = enabled; + return 0; +} + +int sd_lldp_tx_set_port_description(sd_lldp_tx *lldp_tx, const char *port_description) { + assert_return(lldp_tx, -EINVAL); + + /* An empty string unsets the previously set port description. The value has to fit the 9-bit TLV length field. */ + if (strlen_ptr(port_description) >= 512) + return -EINVAL; + + return free_and_strdup(&lldp_tx->port_description, empty_to_null(port_description)); +} + +int sd_lldp_tx_set_hostname(sd_lldp_tx *lldp_tx, const char *hostname) { + assert_return(lldp_tx, -EINVAL); + + /* An empty string unsets the previously set hostname. */ + if (!isempty(hostname)) { + assert_cc(HOST_NAME_MAX < 512); + + if (!hostname_is_valid(hostname, 0)) + return -EINVAL; + } + + return free_and_strdup(&lldp_tx->hostname, empty_to_null(hostname)); +} + +int sd_lldp_tx_set_pretty_hostname(sd_lldp_tx *lldp_tx, const char *pretty_hostname) { + assert_return(lldp_tx, -EINVAL); + + /* An empty string unsets the previously set pretty hostname. The value has to fit the 9-bit TLV length field. */ + if (strlen_ptr(pretty_hostname) >= 512) + return -EINVAL; + + return free_and_strdup(&lldp_tx->pretty_hostname, empty_to_null(pretty_hostname)); +} + +int sd_lldp_tx_set_mud_url(sd_lldp_tx *lldp_tx, const char *mud_url) { + assert_return(lldp_tx, -EINVAL); + + /* An empty string unsets the previously set MUD URL. */ + if (!isempty(mud_url)) { + /* Although the maximum length of a TLV value is 511, the MUD URL must be shorter than 256. + * See RFC 8520.
*/ + if (strlen(mud_url) >= 256) + return -EINVAL; + + if (!http_url_is_valid(mud_url)) + return -EINVAL; + } + + return free_and_strdup(&lldp_tx->mud_url, empty_to_null(mud_url)); +} + /* Size of the buffer to allocate for one packet: the Ethernet header plus, for every TLV, its 2-byte header and its payload size. hostname and pretty_hostname are the strings that will actually be sent. */ +static size_t lldp_tx_calculate_maximum_packet_size(sd_lldp_tx *lldp_tx, const char *hostname, const char *pretty_hostname) { + assert(lldp_tx); + assert(lldp_tx->ifindex > 0); + + return sizeof(struct ether_header) + + /* Chassis ID */ + 2 + 1 + (SD_ID128_STRING_MAX - 1) + + /* Port ID */ + 2 + 1 + strlen_ptr(lldp_tx->ifname) + + /* TTL */ + 2 + 2 + + /* Port description */ + 2 + strlen_ptr(lldp_tx->port_description) + + /* System name */ + 2 + strlen_ptr(hostname) + + /* System description */ + 2 + strlen_ptr(pretty_hostname) + + /* MUD URL */ + 2 + sizeof(SD_LLDP_OUI_IANA_MUD) + strlen_ptr(lldp_tx->mud_url) + + /* System Capabilities */ + 2 + 4 + + /* End */ + 2; +} + /* Writes the two-byte TLV header at *offset and advances *offset by 2. Returns -EINVAL if type or data_len does not fit its field, -ENOBUFS if the header plus data_len bytes would not fit into the packet. */ +static int packet_append_tlv_header(uint8_t *packet, size_t packet_size, size_t *offset, uint8_t type, size_t data_len) { + assert(packet); + assert(offset); + + /* + * +--------+--------+-------------- + * |TLV Type| len | value + * |(7 bits)|(9 bits)|(0-511 octets) + * +--------+--------+-------------- + * where: + * + * len = indicates the length of value + */ + + /* The type field is 7-bits. */ + if (type >= 128) + return -EINVAL; + + /* The data length field is 9-bits.
*/ + if (data_len >= 512) + return -EINVAL; + + if (packet_size < 2 + data_len) + return -ENOBUFS; + + if (*offset > packet_size - 2 - data_len) + return -ENOBUFS; + /* First octet: type in the upper 7 bits, bit 8 of the length in the lowest bit. Second octet: low 8 bits of the length. */ + packet[(*offset)++] = (type << 1) | !!(data_len >> 8); + packet[(*offset)++] = data_len & (size_t) UINT8_MAX; + + return 0; +} + /* Appends one TLV whose value is prefix (prefix_len bytes) followed by str without its NUL terminator. If isempty(str) holds, nothing is written and 0 is returned. */ +static int packet_append_prefixed_string( + uint8_t *packet, + size_t packet_size, + size_t *offset, + uint8_t type, + size_t prefix_len, + const void *prefix, + const char *str) { + + size_t len; + int r; + + assert(packet); + assert(offset); + assert(prefix_len == 0 || prefix); + + if (isempty(str)) + return 0; + + len = strlen(str); + + /* Check for overflow */ + if (len > SIZE_MAX - prefix_len) + return -ENOBUFS; + + r = packet_append_tlv_header(packet, packet_size, offset, type, prefix_len + len); + if (r < 0) + return r; + + memcpy_safe(packet + *offset, prefix, prefix_len); + *offset += prefix_len; + + memcpy(packet + *offset, str, len); + *offset += len; + + return 0; +} + /* Same as packet_append_prefixed_string(), without a prefix. */ +static int packet_append_string( + uint8_t *packet, + size_t packet_size, + size_t *offset, + uint8_t type, + const char *str) { + + return packet_append_prefixed_string(packet, packet_size, offset, type, 0, NULL, str); +} + /* Builds the complete frame (Ethernet header, Chassis ID, Port ID, TTL, the optional string TLVs, System Capabilities, End) in a newly allocated buffer and returns it via ret_packet, with the number of bytes actually used in ret_packet_size. */ +static int lldp_tx_create_packet(sd_lldp_tx *lldp_tx, size_t *ret_packet_size, uint8_t **ret_packet) { + _cleanup_free_ char *hostname = NULL, *pretty_hostname = NULL; + _cleanup_free_ uint8_t *packet = NULL; + struct ether_header *header; + size_t packet_size, offset; + sd_id128_t machine_id; + int r; + + assert(lldp_tx); + assert(lldp_tx->ifindex > 0); + assert(ret_packet_size); + assert(ret_packet); + + /* If ifname is not set yet, set ifname from ifindex.
*/ + r = sd_lldp_tx_get_ifname(lldp_tx, NULL); + if (r < 0) + return r; + + r = sd_id128_get_machine(&machine_id); + if (r < 0) + return r; + /* Fall back to the system's names when none were configured; lookup failures are deliberately ignored. */ + if (!lldp_tx->hostname) + (void) gethostname_strict(&hostname); + if (!lldp_tx->pretty_hostname) + (void) get_pretty_hostname(&pretty_hostname); + + packet_size = lldp_tx_calculate_maximum_packet_size(lldp_tx, + lldp_tx->hostname ?: hostname, + lldp_tx->pretty_hostname ?: pretty_hostname); + + packet = new(uint8_t, packet_size); + if (!packet) + return -ENOMEM; + + header = (struct ether_header*) packet; + header->ether_type = htobe16(ETHERTYPE_LLDP); + memcpy(header->ether_dhost, lldp_multicast_addr + lldp_tx->mode, ETH_ALEN); + memcpy(header->ether_shost, &lldp_tx->hwaddr, ETH_ALEN); + + offset = sizeof(struct ether_header); + + /* The three mandatory TLVs must appear first, in this specific order: + * 1. Chassis ID + * 2. Port ID + * 3. Time To Live + */ + + r = packet_append_prefixed_string(packet, packet_size, &offset, SD_LLDP_TYPE_CHASSIS_ID, + 1, (const uint8_t[]) { SD_LLDP_CHASSIS_SUBTYPE_LOCALLY_ASSIGNED }, + SD_ID128_TO_STRING(machine_id)); + if (r < 0) + return r; + + r = packet_append_prefixed_string(packet, packet_size, &offset, SD_LLDP_TYPE_PORT_ID, + 1, (const uint8_t[]) { SD_LLDP_PORT_SUBTYPE_INTERFACE_NAME }, + lldp_tx->ifname); + if (r < 0) + return r; + + r = packet_append_tlv_header(packet, packet_size, &offset, SD_LLDP_TYPE_TTL, 2); + if (r < 0) + return r; + + unaligned_write_be16(packet + offset, LLDP_TX_TTL); + offset += 2; + + /* Optional TLVs follow, in no specific order: */ + + r = packet_append_string(packet, packet_size, &offset, SD_LLDP_TYPE_PORT_DESCRIPTION, + lldp_tx->port_description); + if (r < 0) + return r; + + r = packet_append_string(packet, packet_size, &offset, SD_LLDP_TYPE_SYSTEM_NAME, + lldp_tx->hostname ?: hostname); + if (r < 0) + return r; + + r = packet_append_string(packet, packet_size, &offset, SD_LLDP_TYPE_SYSTEM_DESCRIPTION, + lldp_tx->pretty_hostname ?: pretty_hostname); +
if (r < 0) + return r; + + /* See section 12 of RFC 8520. + * +--------+--------+----------+---------+-------------- + * |TLV Type| len | OUI |subtype | MUDString + * | =127 | |= 00 00 5E| = 1 | + * |(7 bits)|(9 bits)|(3 octets)|(1 octet)|(1-255 octets) + * +--------+--------+----------+---------+-------------- + * where: + * + * o TLV Type = 127 indicates a vendor-specific TLV + * o len = indicates the TLV string length + * o OUI = 00 00 5E is the organizationally unique identifier of IANA + * o subtype = 1 (as assigned by IANA for the MUDstring) + * o MUDstring = the length MUST NOT exceed 255 octets + */ + r = packet_append_prefixed_string(packet, packet_size, &offset, SD_LLDP_TYPE_PRIVATE, + sizeof(SD_LLDP_OUI_IANA_MUD), SD_LLDP_OUI_IANA_MUD, + lldp_tx->mud_url); + if (r < 0) + return r; + + r = packet_append_tlv_header(packet, packet_size, &offset, SD_LLDP_TYPE_SYSTEM_CAPABILITIES, 4); + if (r < 0) + return r; + + unaligned_write_be16(packet + offset, lldp_tx->supported_capabilities); + offset += 2; + unaligned_write_be16(packet + offset, lldp_tx->enabled_capabilities); + offset += 2; + + r = packet_append_tlv_header(packet, packet_size, &offset, SD_LLDP_TYPE_END, 0); + if (r < 0) + return r; + + *ret_packet_size = offset; + *ret_packet = TAKE_PTR(packet); + return 0; +} + /* Sends the frame to the multicast address of the configured mode through a freshly opened AF_PACKET socket, which is closed again on return. A short write is reported as -EIO. */ +static int lldp_tx_send_packet(sd_lldp_tx *lldp_tx, size_t packet_size, const uint8_t *packet) { + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + ssize_t l; + + assert(lldp_tx); + assert(lldp_tx->ifindex > 0); + assert(packet_size > sizeof(struct ether_header)); + assert(packet); + + sa = (union sockaddr_union) { + .ll.sll_family = AF_PACKET, + .ll.sll_protocol = htobe16(ETHERTYPE_LLDP), + .ll.sll_ifindex = lldp_tx->ifindex, + .ll.sll_halen = ETH_ALEN, + }; + memcpy(sa.ll.sll_addr, lldp_multicast_addr + lldp_tx->mode, ETH_ALEN); + + fd = socket(AF_PACKET, SOCK_RAW | SOCK_CLOEXEC, IPPROTO_RAW); + if (fd < 0) + return -errno; + + l = sendto(fd, packet, packet_size,
MSG_NOSIGNAL, &sa.sa, sizeof(sa.ll)); + if (l < 0) + return -errno; + + if ((size_t) l != packet_size) + return -EIO; + + return 0; +} + /* Builds one packet and sends it; the packet buffer is freed on return. */ +static int lldp_tx_send(sd_lldp_tx *lldp_tx) { + _cleanup_free_ uint8_t *packet = NULL; + size_t packet_size = 0; /* avoid false maybe-uninitialized warning */ + int r; + + assert(lldp_tx); + + r = lldp_tx_create_packet(lldp_tx, &packet_size, &packet); + if (r < 0) + return r; + + return lldp_tx_send_packet(lldp_tx, packet_size, packet); +} + /* Attaches the transmitter to the given event loop, or to the default one when event is NULL, and records the priority used for the timer source. Fails with -EBUSY when an event loop is already attached. */ +int sd_lldp_tx_attach_event(sd_lldp_tx *lldp_tx, sd_event *event, int64_t priority) { + int r; + + assert_return(lldp_tx, -EINVAL); + assert_return(!lldp_tx->event, -EBUSY); + + if (event) + lldp_tx->event = sd_event_ref(event); + else { + r = sd_event_default(&lldp_tx->event); + if (r < 0) + return r; + } + + lldp_tx->event_priority = priority; + + return 0; +} + /* Drops the timer event source and the event loop reference. */ +int sd_lldp_tx_detach_event(sd_lldp_tx *lldp_tx) { + assert_return(lldp_tx, -EINVAL); + + lldp_tx->timer_event_source = sd_event_source_disable_unref(lldp_tx->timer_event_source); + lldp_tx->event = sd_event_unref(lldp_tx->event); + return 0; +} + /* Delay until the next transmission: the fast interval while fast_tx > 0, the regular interval otherwise, plus a random jitter below LLDP_TX_JITTER_USEC. */ +static usec_t lldp_tx_get_delay(sd_lldp_tx *lldp_tx) { + assert(lldp_tx); + + return usec_add(lldp_tx->fast_tx > 0 ?
LLDP_FAST_TX_INTERVAL_USEC : LLDP_TX_INTERVAL_USEC, + (usec_t) random_u64() % LLDP_TX_JITTER_USEC); +} + /* Re-arms the existing timer source as a one-shot timer with a fresh delay. */ +static int lldp_tx_reset_timer(sd_lldp_tx *lldp_tx) { + usec_t delay; + int r; + + assert(lldp_tx); + assert(lldp_tx->timer_event_source); + + delay = lldp_tx_get_delay(lldp_tx); + + r = sd_event_source_set_time_relative(lldp_tx->timer_event_source, delay); + if (r < 0) + return r; + + return sd_event_source_set_enabled(lldp_tx->timer_event_source, SD_EVENT_ONESHOT); +} + /* Timer callback: sends one packet, uses up one fast transmission if any are left, and re-arms the timer. Errors are only logged; 0 is always returned. */ +static int on_timer_event(sd_event_source *s, uint64_t usec, void *userdata) { + sd_lldp_tx *lldp_tx = ASSERT_PTR(userdata); + int r; + + r = lldp_tx_send(lldp_tx); + if (r < 0) + log_lldp_tx_errno(lldp_tx, r, "Failed to send packet, ignoring: %m"); + + if (lldp_tx->fast_tx > 0) + lldp_tx->fast_tx--; + + r = lldp_tx_reset_timer(lldp_tx); + if (r < 0) + log_lldp_tx_errno(lldp_tx, r, "Failed to reset timer: %m"); + + return 0; +} + /* Returns non-zero only if a timer source exists and is currently enabled as one-shot. */ +int sd_lldp_tx_is_running(sd_lldp_tx *lldp_tx) { + int enabled; + + if (!lldp_tx) + return 0; + + if (!lldp_tx->timer_event_source) + return 0; + + if (sd_event_source_get_enabled(lldp_tx->timer_event_source, &enabled) < 0) + return 0; + + return enabled == SD_EVENT_ONESHOT; +} + /* Disables the timer source; returns 1 if there was one, 0 otherwise. */ +int sd_lldp_tx_stop(sd_lldp_tx *lldp_tx) { + if (!lldp_tx) + return 0; + + if (!lldp_tx->timer_event_source) + return 0; + + (void) sd_event_source_set_enabled(lldp_tx->timer_event_source, SD_EVENT_OFF); + + return 1; +} +/* Starts periodic transmission, beginning with LLDP_FAST_TX_INIT fast transmissions. Returns 0 without doing anything when already running. */ int sd_lldp_tx_start(sd_lldp_tx *lldp_tx) { + usec_t delay; + int r; + + assert_return(lldp_tx, -EINVAL); + assert_return(lldp_tx->event, -EINVAL); + assert_return(lldp_tx->ifindex > 0, -EINVAL); + assert_return(lldp_tx->mode >= 0 && lldp_tx->mode < _SD_LLDP_MULTICAST_MODE_MAX, -EINVAL); + assert_return(!ether_addr_is_null(&lldp_tx->hwaddr), -EINVAL); + + if (sd_lldp_tx_is_running(lldp_tx)) + return 0; + + lldp_tx->fast_tx = LLDP_FAST_TX_INIT; + /* A timer source left over from an earlier start/stop cycle is reused rather than recreated. */ + if (lldp_tx->timer_event_source) { + r = lldp_tx_reset_timer(lldp_tx); + if (r < 0) + return log_lldp_tx_errno(lldp_tx, r,
"Failed to re-enable timer: %m"); + + return 0; + } + + delay = lldp_tx_get_delay(lldp_tx); + + r = sd_event_add_time_relative(lldp_tx->event, &lldp_tx->timer_event_source, + CLOCK_BOOTTIME, delay, 0, + on_timer_event, lldp_tx); + if (r < 0) + return r; + + (void) sd_event_source_set_description(lldp_tx->timer_event_source, "lldp-tx-timer"); + (void) sd_event_source_set_priority(lldp_tx->timer_event_source, lldp_tx->event_priority); + + return 0; +} diff --git a/src/libsystemd-network/sd-ndisc.c b/src/libsystemd-network/sd-ndisc.c new file mode 100644 index 0000000..1beed5d --- /dev/null +++ b/src/libsystemd-network/sd-ndisc.c @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. +***/ + +#include +#include + +#include "sd-ndisc.h" + +#include "alloc-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "icmp6-util.h" +#include "in-addr-util.h" +#include "memory-util.h" +#include "ndisc-internal.h" +#include "ndisc-router.h" +#include "network-common.h" +#include "random-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" + +#define NDISC_TIMEOUT_NO_RA_USEC (NDISC_ROUTER_SOLICITATION_INTERVAL * NDISC_MAX_ROUTER_SOLICITATIONS) + /* Names of the sd_ndisc_event_t values, used by the string table lookup defined below. */ +static const char * const ndisc_event_table[_SD_NDISC_EVENT_MAX] = { + [SD_NDISC_EVENT_TIMEOUT] = "timeout", + [SD_NDISC_EVENT_ROUTER] = "router", +}; + +DEFINE_STRING_TABLE_LOOKUP(ndisc_event, sd_ndisc_event_t); + /* Logs the event and, if a callback is registered, invokes it with the router object (rt is NULL for the timeout event) and the registered userdata. */ +static void ndisc_callback(sd_ndisc *ndisc, sd_ndisc_event_t event, sd_ndisc_router *rt) { + assert(ndisc); + assert(event >= 0 && event < _SD_NDISC_EVENT_MAX); + + if (!ndisc->callback) + return (void) log_ndisc(ndisc, "Received '%s' event.", ndisc_event_to_string(event)); + + log_ndisc(ndisc, "Invoking callback for '%s' event.", ndisc_event_to_string(event)); + ndisc->callback(ndisc, event, rt, ndisc->userdata); +} + /* Stores the event callback and its userdata; a NULL callback means events are only logged. */ +int sd_ndisc_set_callback( + sd_ndisc *nd, + sd_ndisc_callback_t callback, + void
*userdata) { + + assert_return(nd, -EINVAL); + + nd->callback = callback; + nd->userdata = userdata; + + return 0; +} + +int sd_ndisc_set_ifindex(sd_ndisc *nd, int ifindex) { + assert_return(nd, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + assert_return(nd->fd < 0, -EBUSY); + + nd->ifindex = ifindex; + return 0; +} + +int sd_ndisc_set_ifname(sd_ndisc *nd, const char *ifname) { + assert_return(nd, -EINVAL); + assert_return(ifname, -EINVAL); + + if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + return free_and_strdup(&nd->ifname, ifname); +} + +int sd_ndisc_get_ifname(sd_ndisc *nd, const char **ret) { + int r; + + assert_return(nd, -EINVAL); + + r = get_ifname(nd->ifindex, &nd->ifname); + if (r < 0) + return r; + + if (ret) + *ret = nd->ifname; + + return 0; +} + +int sd_ndisc_set_mac(sd_ndisc *nd, const struct ether_addr *mac_addr) { + assert_return(nd, -EINVAL); + + if (mac_addr) + nd->mac_addr = *mac_addr; + else + zero(nd->mac_addr); + + return 0; +} + +int sd_ndisc_attach_event(sd_ndisc *nd, sd_event *event, int64_t priority) { + int r; + + assert_return(nd, -EINVAL); + assert_return(nd->fd < 0, -EBUSY); + assert_return(!nd->event, -EBUSY); + + if (event) + nd->event = sd_event_ref(event); + else { + r = sd_event_default(&nd->event); + if (r < 0) + return 0; + } + + nd->event_priority = priority; + + return 0; +} + +int sd_ndisc_detach_event(sd_ndisc *nd) { + + assert_return(nd, -EINVAL); + assert_return(nd->fd < 0, -EBUSY); + + nd->event = sd_event_unref(nd->event); + return 0; +} + +sd_event *sd_ndisc_get_event(sd_ndisc *nd) { + assert_return(nd, NULL); + + return nd->event; +} + +static void ndisc_reset(sd_ndisc *nd) { + assert(nd); + + (void) event_source_disable(nd->timeout_event_source); + (void) event_source_disable(nd->timeout_no_ra); + nd->retransmit_time = 0; + nd->recv_event_source = sd_event_source_disable_unref(nd->recv_event_source); + nd->fd = safe_close(nd->fd); +} + +static sd_ndisc *ndisc_free(sd_ndisc *nd) { 
+ assert(nd); + + ndisc_reset(nd); + + sd_event_source_unref(nd->timeout_event_source); + sd_event_source_unref(nd->timeout_no_ra); + sd_ndisc_detach_event(nd); + + free(nd->ifname); + return mfree(nd); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_ndisc, sd_ndisc, ndisc_free); + +int sd_ndisc_new(sd_ndisc **ret) { + _cleanup_(sd_ndisc_unrefp) sd_ndisc *nd = NULL; + + assert_return(ret, -EINVAL); + + nd = new(sd_ndisc, 1); + if (!nd) + return -ENOMEM; + + *nd = (sd_ndisc) { + .n_ref = 1, + .fd = -EBADF, + }; + + *ret = TAKE_PTR(nd); + + return 0; +} + +static int ndisc_handle_datagram(sd_ndisc *nd, sd_ndisc_router *rt) { + int r; + + assert(nd); + assert(rt); + + r = ndisc_router_parse(nd, rt); + if (r < 0) + return r; + + log_ndisc(nd, "Received Router Advertisement: flags %s preference %s lifetime %s", + rt->flags & ND_RA_FLAG_MANAGED ? "MANAGED" : rt->flags & ND_RA_FLAG_OTHER ? "OTHER" : "none", + rt->preference == SD_NDISC_PREFERENCE_HIGH ? "high" : rt->preference == SD_NDISC_PREFERENCE_LOW ? 
"low" : "medium", + FORMAT_TIMESPAN(rt->lifetime_usec, USEC_PER_SEC)); + + ndisc_callback(nd, SD_NDISC_EVENT_ROUTER, rt); + return 0; +} + +static int ndisc_recv(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(sd_ndisc_router_unrefp) sd_ndisc_router *rt = NULL; + sd_ndisc *nd = ASSERT_PTR(userdata); + ssize_t buflen; + int r; + + assert(s); + assert(nd->event); + + buflen = next_datagram_size_fd(fd); + if (ERRNO_IS_NEG_TRANSIENT(buflen) || ERRNO_IS_NEG_DISCONNECT(buflen)) + return 0; + if (buflen < 0) { + log_ndisc_errno(nd, buflen, "Failed to determine datagram size to read, ignoring: %m"); + return 0; + } + + rt = ndisc_router_new(buflen); + if (!rt) + return -ENOMEM; + + r = icmp6_receive(fd, NDISC_ROUTER_RAW(rt), rt->raw_size, &rt->address, &rt->timestamp); + if (ERRNO_IS_NEG_TRANSIENT(r) || ERRNO_IS_NEG_DISCONNECT(r)) + return 0; + if (r < 0) + switch (r) { + case -EADDRNOTAVAIL: + log_ndisc(nd, "Received RA from neither link-local nor null address. Ignoring."); + return 0; + + case -EMULTIHOP: + log_ndisc(nd, "Received RA with invalid hop limit. Ignoring."); + return 0; + + case -EPFNOSUPPORT: + log_ndisc(nd, "Received invalid source address from ICMPv6 socket. Ignoring."); + return 0; + + default: + log_ndisc_errno(nd, r, "Unexpected error while reading from ICMPv6, ignoring: %m"); + return 0; + } + + /* The function icmp6_receive() accepts the null source address, but RFC 4861 Section 6.1.2 states + * that hosts MUST discard messages with the null source address. */ + if (in6_addr_is_null(&rt->address)) + log_ndisc(nd, "Received RA from null address. 
Ignoring."); + + (void) event_source_disable(nd->timeout_event_source); + (void) ndisc_handle_datagram(nd, rt); + return 0; +} + +static usec_t ndisc_timeout_compute_random(usec_t val) { + /* compute a time that is random within ±10% of the given value */ + return val - val / 10 + + (random_u64() % (2 * USEC_PER_SEC)) * val / 10 / USEC_PER_SEC; +} + +static int ndisc_timeout(sd_event_source *s, uint64_t usec, void *userdata) { + sd_ndisc *nd = ASSERT_PTR(userdata); + usec_t time_now; + int r; + + assert(s); + assert(nd->event); + + assert_se(sd_event_now(nd->event, CLOCK_BOOTTIME, &time_now) >= 0); + + if (!nd->retransmit_time) + nd->retransmit_time = ndisc_timeout_compute_random(NDISC_ROUTER_SOLICITATION_INTERVAL); + else { + if (nd->retransmit_time > NDISC_MAX_ROUTER_SOLICITATION_INTERVAL / 2) + nd->retransmit_time = ndisc_timeout_compute_random(NDISC_MAX_ROUTER_SOLICITATION_INTERVAL); + else + nd->retransmit_time += ndisc_timeout_compute_random(nd->retransmit_time); + } + + r = event_reset_time(nd->event, &nd->timeout_event_source, + CLOCK_BOOTTIME, + time_now + nd->retransmit_time, 10 * USEC_PER_MSEC, + ndisc_timeout, nd, + nd->event_priority, "ndisc-timeout-no-ra", true); + if (r < 0) + goto fail; + + r = icmp6_send_router_solicitation(nd->fd, &nd->mac_addr); + if (r < 0) + log_ndisc_errno(nd, r, "Failed to send Router Solicitation, next solicitation in %s, ignoring: %m", + FORMAT_TIMESPAN(nd->retransmit_time, USEC_PER_SEC)); + else + log_ndisc(nd, "Sent Router Solicitation, next solicitation in %s", + FORMAT_TIMESPAN(nd->retransmit_time, USEC_PER_SEC)); + + return 0; + +fail: + (void) sd_ndisc_stop(nd); + return 0; +} + +static int ndisc_timeout_no_ra(sd_event_source *s, uint64_t usec, void *userdata) { + sd_ndisc *nd = ASSERT_PTR(userdata); + + assert(s); + + log_ndisc(nd, "No RA received before link confirmation timeout"); + + (void) event_source_disable(nd->timeout_no_ra); + ndisc_callback(nd, SD_NDISC_EVENT_TIMEOUT, NULL); + + return 0; +} + +int 
sd_ndisc_stop(sd_ndisc *nd) { /* Returns 1 if a client with an open socket was reset, 0 if there was nothing to stop. */ + if (!nd) + return 0; + + if (nd->fd < 0) + return 0; + + log_ndisc(nd, "Stopping IPv6 Router Solicitation client"); + + ndisc_reset(nd); + return 1; +} + /* Opens the router solicitation socket, registers the receive handler and arms both timers. Returns 1 when started and 0 when a socket is already open. A failure to open the socket is returned directly; later failures reset the client via ndisc_reset() before returning the error. */ +int sd_ndisc_start(sd_ndisc *nd) { + int r; + usec_t time_now; + + assert_return(nd, -EINVAL); + assert_return(nd->event, -EINVAL); + assert_return(nd->ifindex > 0, -EINVAL); + + if (nd->fd >= 0) + return 0; + + assert(!nd->recv_event_source); + + r = sd_event_now(nd->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + goto fail; + + nd->fd = icmp6_bind_router_solicitation(nd->ifindex); + if (nd->fd < 0) + return nd->fd; + + r = sd_event_add_io(nd->event, &nd->recv_event_source, nd->fd, EPOLLIN, ndisc_recv, nd); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(nd->recv_event_source, nd->event_priority); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(nd->recv_event_source, "ndisc-receive-message"); + + r = event_reset_time(nd->event, &nd->timeout_event_source, + CLOCK_BOOTTIME, + time_now + USEC_PER_SEC / 2, 1 * USEC_PER_SEC, /* See RFC 8415 sec. 18.2.1 */ + ndisc_timeout, nd, + nd->event_priority, "ndisc-timeout", true); + if (r < 0) + goto fail; + + r = event_reset_time(nd->event, &nd->timeout_no_ra, + CLOCK_BOOTTIME, + time_now + NDISC_TIMEOUT_NO_RA_USEC, 10 * USEC_PER_MSEC, + ndisc_timeout_no_ra, nd, + nd->event_priority, "ndisc-timeout-no-ra", true); + if (r < 0) + goto fail; + + log_ndisc(nd, "Started IPv6 Router Solicitation client"); + return 1; + +fail: + ndisc_reset(nd); + return r; +} diff --git a/src/libsystemd-network/sd-radv.c b/src/libsystemd-network/sd-radv.c new file mode 100644 index 0000000..97d306c --- /dev/null +++ b/src/libsystemd-network/sd-radv.c @@ -0,0 +1,1161 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2017 Intel Corporation. All rights reserved.
+***/ + +#include +#include +#include + +#include "sd-radv.h" + +#include "alloc-util.h" +#include "dns-domain.h" +#include "ether-addr-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "icmp6-util.h" +#include "in-addr-util.h" +#include "iovec-util.h" +#include "macro.h" +#include "memory-util.h" +#include "network-common.h" +#include "radv-internal.h" +#include "random-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "unaligned.h" + +int sd_radv_new(sd_radv **ret) { + _cleanup_(sd_radv_unrefp) sd_radv *ra = NULL; + + assert_return(ret, -EINVAL); + + ra = new(sd_radv, 1); + if (!ra) + return -ENOMEM; + + *ra = (sd_radv) { + .n_ref = 1, + .fd = -EBADF, + .lifetime_usec = RADV_DEFAULT_ROUTER_LIFETIME_USEC, + }; + + *ret = TAKE_PTR(ra); + + return 0; +} + +int sd_radv_attach_event(sd_radv *ra, sd_event *event, int64_t priority) { + int r; + + assert_return(ra, -EINVAL); + assert_return(!ra->event, -EBUSY); + + if (event) + ra->event = sd_event_ref(event); + else { + r = sd_event_default(&ra->event); + if (r < 0) + return 0; + } + + ra->event_priority = priority; + + return 0; +} + +int sd_radv_detach_event(sd_radv *ra) { + + assert_return(ra, -EINVAL); + + ra->event = sd_event_unref(ra->event); + return 0; +} + +sd_event *sd_radv_get_event(sd_radv *ra) { + assert_return(ra, NULL); + + return ra->event; +} + +int sd_radv_is_running(sd_radv *ra) { + assert_return(ra, false); + + return ra->state != RADV_STATE_IDLE; +} + +static void radv_reset(sd_radv *ra) { + assert(ra); + + (void) event_source_disable(ra->timeout_event_source); + + ra->recv_event_source = sd_event_source_disable_unref(ra->recv_event_source); + + ra->ra_sent = 0; +} + +static sd_radv *radv_free(sd_radv *ra) { + if (!ra) + return NULL; + + LIST_CLEAR(prefix, ra->prefixes, sd_radv_prefix_unref); + LIST_CLEAR(prefix, ra->route_prefixes, sd_radv_route_prefix_unref); + LIST_CLEAR(prefix, ra->pref64_prefixes, sd_radv_pref64_prefix_unref); + + 
free(ra->rdnss); + free(ra->dnssl); + + radv_reset(ra); + + sd_event_source_unref(ra->timeout_event_source); + sd_radv_detach_event(ra); + + ra->fd = safe_close(ra->fd); + free(ra->ifname); + + return mfree(ra); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_radv, sd_radv, radv_free); + +static bool router_lifetime_is_valid(usec_t lifetime_usec) { + return lifetime_usec == 0 || + (lifetime_usec >= RADV_MIN_ROUTER_LIFETIME_USEC && + lifetime_usec <= RADV_MAX_ROUTER_LIFETIME_USEC); +} + +static int radv_send(sd_radv *ra, const struct in6_addr *dst, usec_t lifetime_usec) { + struct sockaddr_in6 dst_addr = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ALL_NODES_MULTICAST_INIT, + }; + struct nd_router_advert adv = {}; + struct { + struct nd_opt_hdr opthdr; + struct ether_addr slladdr; + } _packed_ opt_mac = { + .opthdr = { + .nd_opt_type = ND_OPT_SOURCE_LINKADDR, + .nd_opt_len = (sizeof(struct nd_opt_hdr) + + sizeof(struct ether_addr) - 1) /8 + 1, + }, + }; + struct nd_opt_mtu opt_mtu = { + .nd_opt_mtu_type = ND_OPT_MTU, + .nd_opt_mtu_len = 1, + }; + /* Reserve iov space for RA header, linkaddr, MTU, N prefixes, N routes, N pref64 prefixes, RDNSS, + * DNSSL, and home agent. 
*/ + struct iovec iov[6 + ra->n_prefixes + ra->n_route_prefixes + ra->n_pref64_prefixes]; + struct msghdr msg = { + .msg_name = &dst_addr, + .msg_namelen = sizeof(dst_addr), + .msg_iov = iov, + }; + usec_t time_now; + int r; + + assert(ra); + assert(router_lifetime_is_valid(lifetime_usec)); + + r = sd_event_now(ra->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + return r; + + if (dst && in6_addr_is_set(dst)) + dst_addr.sin6_addr = *dst; + + adv.nd_ra_type = ND_ROUTER_ADVERT; + adv.nd_ra_curhoplimit = ra->hop_limit; + adv.nd_ra_retransmit = usec_to_be32_msec(ra->retransmit_usec); + adv.nd_ra_flags_reserved = ra->flags; + assert_cc(RADV_MAX_ROUTER_LIFETIME_USEC <= UINT16_MAX * USEC_PER_SEC); + adv.nd_ra_router_lifetime = usec_to_be16_sec(lifetime_usec); + iov[msg.msg_iovlen++] = IOVEC_MAKE(&adv, sizeof(adv)); + + /* MAC address is optional, either because the link does not use L2 + addresses or load sharing is desired. See RFC 4861, Section 4.2 */ + if (!ether_addr_is_null(&ra->mac_addr)) { + opt_mac.slladdr = ra->mac_addr; + iov[msg.msg_iovlen++] = IOVEC_MAKE(&opt_mac, sizeof(opt_mac)); + } + + if (ra->mtu > 0) { + opt_mtu.nd_opt_mtu_mtu = htobe32(ra->mtu); + iov[msg.msg_iovlen++] = IOVEC_MAKE(&opt_mtu, sizeof(opt_mtu)); + } + + LIST_FOREACH(prefix, p, ra->prefixes) { + usec_t lifetime_valid_usec, lifetime_preferred_usec; + + lifetime_valid_usec = MIN(usec_sub_unsigned(p->valid_until, time_now), + p->lifetime_valid_usec); + + lifetime_preferred_usec = MIN3(usec_sub_unsigned(p->preferred_until, time_now), + p->lifetime_preferred_usec, + lifetime_valid_usec); + + p->opt.lifetime_valid = usec_to_be32_sec(lifetime_valid_usec); + p->opt.lifetime_preferred = usec_to_be32_sec(lifetime_preferred_usec); + + iov[msg.msg_iovlen++] = IOVEC_MAKE(&p->opt, sizeof(p->opt)); + } + + LIST_FOREACH(prefix, rt, ra->route_prefixes) { + rt->opt.lifetime = usec_to_be32_sec(MIN(usec_sub_unsigned(rt->valid_until, time_now), + rt->lifetime_usec)); + + iov[msg.msg_iovlen++] = 
IOVEC_MAKE(&rt->opt, sizeof(rt->opt)); + } + + LIST_FOREACH(prefix, p, ra->pref64_prefixes) + iov[msg.msg_iovlen++] = IOVEC_MAKE(&p->opt, sizeof(p->opt)); + + if (ra->rdnss) + iov[msg.msg_iovlen++] = IOVEC_MAKE(ra->rdnss, ra->rdnss->length * 8); + + if (ra->dnssl) + iov[msg.msg_iovlen++] = IOVEC_MAKE(ra->dnssl, ra->dnssl->length * 8); + + if (FLAGS_SET(ra->flags, ND_RA_FLAG_HOME_AGENT)) { + ra->home_agent.nd_opt_home_agent_info_type = ND_OPT_HOME_AGENT_INFO; + ra->home_agent.nd_opt_home_agent_info_len = 1; + + /* 0 means to place the current Router Lifetime value */ + if (ra->home_agent.nd_opt_home_agent_info_lifetime == 0) + ra->home_agent.nd_opt_home_agent_info_lifetime = adv.nd_ra_router_lifetime; + + iov[msg.msg_iovlen++] = IOVEC_MAKE(&ra->home_agent, sizeof(ra->home_agent)); + } + + if (sendmsg(ra->fd, &msg, 0) < 0) + return -errno; + + return 0; +}
+
+/* I/O callback registered for the ICMPv6 socket. Reads one datagram and answers it with a Router
+ * Advertisement sent to the datagram's source address. Receive errors are logged and ignored (the
+ * callback returns 0); the only error returned is -ENOMEM when the receive buffer cannot be
+ * allocated. */
+static int radv_recv(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        sd_radv *ra = ASSERT_PTR(userdata);
+        struct in6_addr src;
+        triple_timestamp timestamp;
+        int r;
+
+        assert(s);
+        assert(ra->event);
+
+        ssize_t buflen = next_datagram_size_fd(fd);
+        if (ERRNO_IS_NEG_TRANSIENT(buflen) || ERRNO_IS_NEG_DISCONNECT(buflen))
+                return 0;
+        if (buflen < 0) {
+                log_radv_errno(ra, buflen, "Failed to determine datagram size to read, ignoring: %m");
+                return 0;
+        }
+
+        _cleanup_free_ char *buf = new0(char, buflen);
+        if (!buf)
+                return -ENOMEM;
+
+        /* 'timestamp' is passed as an output argument only; it is not used afterwards. */
+        r = icmp6_receive(fd, buf, buflen, &src, &timestamp);
+        if (ERRNO_IS_NEG_TRANSIENT(r) || ERRNO_IS_NEG_DISCONNECT(r))
+                return 0;
+        if (r < 0)
+                switch (r) {
+                case -EADDRNOTAVAIL:
+                        log_radv(ra, "Received RS from neither link-local nor null address. Ignoring");
+                        return 0;
+
+                case -EMULTIHOP:
+                        log_radv(ra, "Received RS with invalid hop limit. Ignoring.");
+                        return 0;
+
+                case -EPFNOSUPPORT:
+                        log_radv(ra, "Received invalid source address from ICMPv6 socket. Ignoring.");
+                        return 0;
+
+                default:
+                        log_radv_errno(ra, r, "Unexpected error receiving from ICMPv6 socket, ignoring: %m");
+                        return 0;
+                }
+
+        if ((size_t) buflen < sizeof(struct nd_router_solicit)) {
+                log_radv(ra, "Too short packet received, ignoring");
+                return 0;
+        }
+
+        /* TODO: if the sender address is null, check that the message does not have the source link-layer
+         * address option. See RFC 4861 Section 6.1.1. */
+
+        const char *addr = IN6_ADDR_TO_STRING(&src);
+
+        r = radv_send(ra, &src, ra->lifetime_usec);
+        if (r < 0)
+                log_radv_errno(ra, r, "Unable to send solicited Router Advertisement to %s, ignoring: %m", addr);
+        else
+                log_radv(ra, "Sent solicited Router Advertisement to %s", addr);
+
+        return 0;
+}
+
+static int radv_timeout(sd_event_source *s, uint64_t usec, void *userdata) { + usec_t min_timeout, max_timeout, time_now, timeout; + sd_radv *ra = ASSERT_PTR(userdata); + int r; + + assert(s); + assert(ra->event); + assert(router_lifetime_is_valid(ra->lifetime_usec)); + + r = sd_event_now(ra->event, CLOCK_BOOTTIME, &time_now); + if (r < 0) + goto fail; + + r = radv_send(ra, NULL, ra->lifetime_usec); + if (r < 0) + log_radv_errno(ra, r, "Unable to send Router Advertisement, ignoring: %m"); + + /* RFC 4861, Section 6.2.4, sending initial Router Advertisements */ + if (ra->ra_sent < RADV_MAX_INITIAL_RTR_ADVERTISEMENTS) + max_timeout = RADV_MAX_INITIAL_RTR_ADVERT_INTERVAL_USEC; + else + max_timeout = RADV_DEFAULT_MAX_TIMEOUT_USEC; + + /* RFC 4861, Section 6.2.1, lifetime must be at least MaxRtrAdvInterval, + * so lower the interval here */ + if (ra->lifetime_usec > 0) + max_timeout = MIN(max_timeout, ra->lifetime_usec); + + if (max_timeout >= 9 * USEC_PER_SEC) + min_timeout = max_timeout / 3; + else + min_timeout = max_timeout * 3 / 4; + + /* RFC 4861, Section 6.2.1. + * MaxRtrAdvInterval MUST be no less than 4 seconds and no greater than 1800 seconds. + * MinRtrAdvInterval MUST be no less than 3 seconds and no greater than .75 * MaxRtrAdvInterval.
*/ + assert(max_timeout >= RADV_MIN_MAX_TIMEOUT_USEC); + assert(max_timeout <= RADV_MAX_MAX_TIMEOUT_USEC); + assert(min_timeout >= RADV_MIN_MIN_TIMEOUT_USEC); + assert(min_timeout <= max_timeout * 3 / 4); + + timeout = min_timeout + random_u64_range(max_timeout - min_timeout); + log_radv(ra, "Next Router Advertisement in %s", FORMAT_TIMESPAN(timeout, USEC_PER_SEC)); + + r = event_reset_time(ra->event, &ra->timeout_event_source, + CLOCK_BOOTTIME, + usec_add(time_now, timeout), MSEC_PER_SEC, + radv_timeout, ra, + ra->event_priority, "radv-timeout", true); + if (r < 0) + goto fail; + + ra->ra_sent++; + + return 0; + +fail: + sd_radv_stop(ra); + + return 0; +}
+
+/* Stops advertising: sends one final Router Advertisement with router lifetime 0, resets the event
+ * sources, closes the socket and returns the object to RADV_STATE_IDLE. A NULL or already idle object
+ * is left alone. Always returns 0; a failure to send the final advertisement is only logged. */
+int sd_radv_stop(sd_radv *ra) {
+        int r;
+
+        if (!ra)
+                return 0;
+
+        if (ra->state == RADV_STATE_IDLE)
+                return 0;
+
+        log_radv(ra, "Stopping IPv6 Router Advertisement daemon");
+
+        /* RFC 4861, Section 6.2.5, send at least one Router Advertisement
+           with zero lifetime */
+        r = radv_send(ra, NULL, 0);
+        if (r < 0)
+                log_radv_errno(ra, r, "Unable to send last Router Advertisement with router lifetime set to zero, ignoring: %m");
+
+        radv_reset(ra);
+        ra->fd = safe_close(ra->fd);
+        ra->state = RADV_STATE_IDLE;
+
+        return 0;
+}
+
+/* Starts advertising on ra->ifindex. Arms the timeout event source with an expiry of 0 and
+ * radv_timeout() as callback, opens the socket through icmp6_bind_router_advertisement() and registers
+ * radv_recv() for EPOLLIN on it, then switches to RADV_STATE_ADVERTISING.
+ *
+ * Returns 0 on success or if the object is not idle, -EINVAL if 'ra' is NULL, no event loop is
+ * attached or no ifindex is set, and the error of the failing step otherwise. On failure the event
+ * sources are reset and the socket is closed, so the object is back in its pre-start state. */
+int sd_radv_start(sd_radv *ra) {
+        int r;
+
+        assert_return(ra, -EINVAL);
+        assert_return(ra->event, -EINVAL);
+        assert_return(ra->ifindex > 0, -EINVAL);
+
+        if (ra->state != RADV_STATE_IDLE)
+                return 0;
+
+        r = event_reset_time(ra->event, &ra->timeout_event_source,
+                             CLOCK_BOOTTIME,
+                             0, 0,
+                             radv_timeout, ra,
+                             ra->event_priority, "radv-timeout", true);
+        if (r < 0)
+                goto fail;
+
+        r = icmp6_bind_router_advertisement(ra->ifindex);
+        if (r < 0)
+                goto fail;
+
+        ra->fd = r;
+
+        r = sd_event_add_io(ra->event, &ra->recv_event_source, ra->fd, EPOLLIN, radv_recv, ra);
+        if (r < 0)
+                goto fail;
+
+        r = sd_event_source_set_priority(ra->recv_event_source, ra->event_priority);
+        if (r < 0)
+                goto fail;
+
+        (void) sd_event_source_set_description(ra->recv_event_source, "radv-receive-message");
+
+        ra->state = RADV_STATE_ADVERTISING;
+
+        log_radv(ra, "Started IPv6 Router Advertisement daemon");
+
+        return 0;
+
+fail:
+        radv_reset(ra);
+        /* The state stays idle on failure, so sd_radv_stop() would return early without closing the
+         * socket and a later sd_radv_start() would overwrite ra->fd. Close it here, after the receive
+         * event source has been released, in the same order sd_radv_stop() uses. */
+        ra->fd = safe_close(ra->fd);
+
+        return r;
+}
+
+/* Sets the interface index to advertise on. Only allowed while idle (-EBUSY otherwise); 'ifindex'
+ * must be positive. */
+int sd_radv_set_ifindex(sd_radv *ra, int ifindex) {
+        assert_return(ra, -EINVAL);
+        assert_return(ifindex > 0, -EINVAL);
+
+        if (ra->state != RADV_STATE_IDLE)
+                return -EBUSY;
+
+        ra->ifindex = ifindex;
+
+        return 0;
+}
+
+/* Stores a copy of 'ifname' in ra->ifname after validating it with ifname_valid_full(). Returns the
+ * result of free_and_strdup(), or -EINVAL for a NULL or invalid name. */
+int sd_radv_set_ifname(sd_radv *ra, const char *ifname) {
+        assert_return(ra, -EINVAL);
+        assert_return(ifname, -EINVAL);
+
+        if (!ifname_valid_full(ifname, IFNAME_VALID_ALTERNATIVE))
+                return -EINVAL;
+
+        return free_and_strdup(&ra->ifname, ifname);
+}
+
+/* Obtains the interface name through get_ifname(ra->ifindex, &ra->ifname) and, if 'ret' is non-NULL,
+ * stores a pointer to ra->ifname in it; the string remains owned by 'ra'. */
+int sd_radv_get_ifname(sd_radv *ra, const char **ret) {
+        int r;
+
+        assert_return(ra, -EINVAL);
+
+        r = get_ifname(ra->ifindex, &ra->ifname);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                *ret = ra->ifname;
+
+        return 0;
+}
+
+/* Sets the link-layer address advertised in the source link-layer address option; NULL clears it,
+ * which makes radv_send() omit the option. Only allowed while idle (-EBUSY otherwise). */
+int sd_radv_set_mac(sd_radv *ra, const struct ether_addr *mac_addr) {
+        assert_return(ra, -EINVAL);
+
+        if (ra->state != RADV_STATE_IDLE)
+                return -EBUSY;
+
+        if (mac_addr)
+                ra->mac_addr = *mac_addr;
+        else
+                zero(ra->mac_addr);
+
+        return 0;
+}
+
+/* Sets the MTU to advertise. Values below 1280 are rejected with -EINVAL. Unlike most setters here
+ * this one does not check the running state. */
+int sd_radv_set_mtu(sd_radv *ra, uint32_t mtu) {
+        assert_return(ra, -EINVAL);
+        assert_return(mtu >= 1280, -EINVAL);
+
+        ra->mtu = mtu;
+
+        return 0;
+}
+
+/* Sets the hop limit placed in the advertisement header. Only allowed while idle (-EBUSY otherwise). */
+int sd_radv_set_hop_limit(sd_radv *ra, uint8_t hop_limit) {
+        assert_return(ra, -EINVAL);
+
+        if (ra->state != RADV_STATE_IDLE)
+                return -EBUSY;
+
+        ra->hop_limit = hop_limit;
+
+        return 0;
+}
+
+/* Sets the retransmit timer in microseconds. Only allowed while idle (-EBUSY otherwise); values above
+ * RADV_MAX_RETRANSMIT_USEC are rejected with -EINVAL. */
+int sd_radv_set_retransmit(sd_radv *ra, uint64_t usec) {
+        assert_return(ra, -EINVAL);
+
+        if (ra->state != RADV_STATE_IDLE)
+                return -EBUSY;
+
+        if (usec > RADV_MAX_RETRANSMIT_USEC)
+                return -EINVAL;
+
+        ra->retransmit_usec = usec;
+        return 0;
+}
+
+int sd_radv_set_router_lifetime(sd_radv *ra, uint64_t usec) { + assert_return(ra, -EINVAL); + + if (ra->state != RADV_STATE_IDLE) + return -EBUSY; + + if (!router_lifetime_is_valid(usec)) + return -EINVAL; + + /* RFC 4191, Section 2.2, "...If the Router Lifetime is zero, the
preference value MUST be set + * to (00) by the sender..." */ + if (usec == 0 && + (ra->flags & (0x3 << 3)) != (SD_NDISC_PREFERENCE_MEDIUM << 3)) + return -EINVAL; + + ra->lifetime_usec = usec; + return 0; +} + +int sd_radv_set_managed_information(sd_radv *ra, int managed) { + assert_return(ra, -EINVAL); + + if (ra->state != RADV_STATE_IDLE) + return -EBUSY; + + SET_FLAG(ra->flags, ND_RA_FLAG_MANAGED, managed); + + return 0; +} + +int sd_radv_set_other_information(sd_radv *ra, int other) { + assert_return(ra, -EINVAL); + + if (ra->state != RADV_STATE_IDLE) + return -EBUSY; + + SET_FLAG(ra->flags, ND_RA_FLAG_OTHER, other); + + return 0; +} + +int sd_radv_set_preference(sd_radv *ra, unsigned preference) { + assert_return(ra, -EINVAL); + assert_return(IN_SET(preference, + SD_NDISC_PREFERENCE_LOW, + SD_NDISC_PREFERENCE_MEDIUM, + SD_NDISC_PREFERENCE_HIGH), -EINVAL); + + /* RFC 4191, Section 2.2, "...If the Router Lifetime is zero, the preference value MUST be set + * to (00) by the sender..." 
*/ + if (ra->lifetime_usec == 0 && preference != SD_NDISC_PREFERENCE_MEDIUM) + return -EINVAL; + + ra->flags = (ra->flags & ~(0x3 << 3)) | (preference << 3); + + return 0; +} + +int sd_radv_set_home_agent_information(sd_radv *ra, int home_agent) { + assert_return(ra, -EINVAL); + + if (ra->state != RADV_STATE_IDLE) + return -EBUSY; + + SET_FLAG(ra->flags, ND_RA_FLAG_HOME_AGENT, home_agent); + + return 0; +} + +int sd_radv_set_home_agent_preference(sd_radv *ra, uint16_t preference) { + assert_return(ra, -EINVAL); + + if (ra->state != RADV_STATE_IDLE) + return -EBUSY; + + ra->home_agent.nd_opt_home_agent_info_preference = htobe16(preference); + + return 0; +} + +int sd_radv_set_home_agent_lifetime(sd_radv *ra, uint64_t lifetime_usec) { + assert_return(ra, -EINVAL); + + if (ra->state != RADV_STATE_IDLE) + return -EBUSY; + + if (lifetime_usec > RADV_HOME_AGENT_MAX_LIFETIME_USEC) + return -EINVAL; + + ra->home_agent.nd_opt_home_agent_info_lifetime = usec_to_be16_sec(lifetime_usec); + return 0; +} + +int sd_radv_add_prefix(sd_radv *ra, sd_radv_prefix *p) { + sd_radv_prefix *found = NULL; + int r; + + assert_return(ra, -EINVAL); + assert_return(p, -EINVAL); + + /* Refuse prefixes that don't have a prefix set */ + if (in6_addr_is_null(&p->opt.in6_addr)) + return -ENOEXEC; + + const char *addr_p = IN6_ADDR_PREFIX_TO_STRING(&p->opt.in6_addr, p->opt.prefixlen); + + LIST_FOREACH(prefix, cur, ra->prefixes) { + r = in_addr_prefix_intersect(AF_INET6, + (const union in_addr_union*) &cur->opt.in6_addr, + cur->opt.prefixlen, + (const union in_addr_union*) &p->opt.in6_addr, + p->opt.prefixlen); + if (r < 0) + return r; + if (r == 0) + continue; + + if (cur->opt.prefixlen == p->opt.prefixlen) { + found = cur; + break; + } + + return log_radv_errno(ra, SYNTHETIC_ERRNO(EEXIST), + "IPv6 prefix %s conflicts with %s, ignoring.", + addr_p, + IN6_ADDR_PREFIX_TO_STRING(&cur->opt.in6_addr, cur->opt.prefixlen)); + } + + if (found) { + /* p and cur may be equivalent. 
First increment the reference counter. */ + sd_radv_prefix_ref(p); + + /* Then, remove the old entry. */ + LIST_REMOVE(prefix, ra->prefixes, found); + sd_radv_prefix_unref(found); + + /* Finally, add the new entry. */ + LIST_APPEND(prefix, ra->prefixes, p); + + log_radv(ra, "Updated/replaced IPv6 prefix %s (preferred: %s, valid: %s)", + addr_p, + FORMAT_TIMESPAN(p->lifetime_preferred_usec, USEC_PER_SEC), + FORMAT_TIMESPAN(p->lifetime_valid_usec, USEC_PER_SEC)); + } else { + /* The prefix is new. Let's simply add it. */ + + sd_radv_prefix_ref(p); + LIST_APPEND(prefix, ra->prefixes, p); + ra->n_prefixes++; + + log_radv(ra, "Added prefix %s", addr_p); + } + + if (ra->state == RADV_STATE_IDLE) + return 0; + + if (ra->ra_sent == 0) + return 0; + + /* If RAs have already been sent, send an RA immediately to announce the newly-added prefix */ + r = radv_send(ra, NULL, ra->lifetime_usec); + if (r < 0) + log_radv_errno(ra, r, "Unable to send Router Advertisement for added prefix %s, ignoring: %m", addr_p); + else + log_radv(ra, "Sent Router Advertisement for added/updated prefix %s.", addr_p); + + return 0; +} + +void sd_radv_remove_prefix( + sd_radv *ra, + const struct in6_addr *prefix, + unsigned char prefixlen) { + + if (!ra) + return; + + if (!prefix) + return; + + LIST_FOREACH(prefix, cur, ra->prefixes) { + if (prefixlen != cur->opt.prefixlen) + continue; + + if (!in6_addr_equal(prefix, &cur->opt.in6_addr)) + continue; + + LIST_REMOVE(prefix, ra->prefixes, cur); + ra->n_prefixes--; + sd_radv_prefix_unref(cur); + return; + } +} + +int sd_radv_add_route_prefix(sd_radv *ra, sd_radv_route_prefix *p) { + sd_radv_route_prefix *found = NULL; + int r; + + assert_return(ra, -EINVAL); + assert_return(p, -EINVAL); + + const char *addr_p = IN6_ADDR_PREFIX_TO_STRING(&p->opt.in6_addr, p->opt.prefixlen); + + LIST_FOREACH(prefix, cur, ra->route_prefixes) { + r = in_addr_prefix_intersect(AF_INET6, + (const union in_addr_union*) &cur->opt.in6_addr, + cur->opt.prefixlen, + (const union 
in_addr_union*) &p->opt.in6_addr, + p->opt.prefixlen); + if (r < 0) + return r; + if (r == 0) + continue; + + if (cur->opt.prefixlen == p->opt.prefixlen) { + found = cur; + break; + } + + return log_radv_errno(ra, SYNTHETIC_ERRNO(EEXIST), + "IPv6 route prefix %s conflicts with %s, ignoring.", + addr_p, + IN6_ADDR_PREFIX_TO_STRING(&cur->opt.in6_addr, cur->opt.prefixlen)); + } + + if (found) { + /* p and cur may be equivalent. First increment the reference counter. */ + sd_radv_route_prefix_ref(p); + + /* Then, remove the old entry. */ + LIST_REMOVE(prefix, ra->route_prefixes, found); + sd_radv_route_prefix_unref(found); + + /* Finally, add the new entry. */ + LIST_APPEND(prefix, ra->route_prefixes, p); + + log_radv(ra, "Updated/replaced IPv6 route prefix %s (lifetime: %s)", + strna(addr_p), + FORMAT_TIMESPAN(p->lifetime_usec, USEC_PER_SEC)); + } else { + /* The route prefix is new. Let's simply add it. */ + + sd_radv_route_prefix_ref(p); + LIST_APPEND(prefix, ra->route_prefixes, p); + ra->n_route_prefixes++; + + log_radv(ra, "Added route prefix %s", strna(addr_p)); + } + + if (ra->state == RADV_STATE_IDLE) + return 0; + + if (ra->ra_sent == 0) + return 0; + + /* If RAs have already been sent, send an RA immediately to announce the newly-added route prefix */ + r = radv_send(ra, NULL, ra->lifetime_usec); + if (r < 0) + log_radv_errno(ra, r, "Unable to send Router Advertisement for added route prefix %s, ignoring: %m", + strna(addr_p)); + else + log_radv(ra, "Sent Router Advertisement for added route prefix %s.", strna(addr_p)); + + return 0; +} + +int sd_radv_add_pref64_prefix(sd_radv *ra, sd_radv_pref64_prefix *p) { + sd_radv_pref64_prefix *found = NULL; + int r; + + assert_return(ra, -EINVAL); + assert_return(p, -EINVAL); + + const char *addr_p = IN6_ADDR_PREFIX_TO_STRING(&p->in6_addr, p->prefixlen); + + LIST_FOREACH(prefix, cur, ra->pref64_prefixes) { + r = in_addr_prefix_intersect(AF_INET6, + (const union in_addr_union*) &cur->in6_addr, + cur->prefixlen, + (const 
union in_addr_union*) &p->in6_addr, + p->prefixlen); + if (r < 0) + return r; + if (r == 0) + continue; + + if (cur->prefixlen == p->prefixlen) { + found = cur; + break; + } + + return log_radv_errno(ra, SYNTHETIC_ERRNO(EEXIST), + "IPv6 PREF64 prefix %s conflicts with %s, ignoring.", + addr_p, + IN6_ADDR_PREFIX_TO_STRING(&cur->in6_addr, cur->prefixlen)); + } + + if (found) { + /* p and cur may be equivalent. First increment the reference counter. */ + sd_radv_pref64_prefix_ref(p); + + /* Then, remove the old entry. */ + LIST_REMOVE(prefix, ra->pref64_prefixes, found); + sd_radv_pref64_prefix_unref(found); + + /* Finally, add the new entry. */ + LIST_APPEND(prefix, ra->pref64_prefixes, p); + + log_radv(ra, "Updated/replaced IPv6 PREF64 prefix %s (lifetime: %s)", + strna(addr_p), + FORMAT_TIMESPAN(p->lifetime_usec, USEC_PER_SEC)); + } else { + /* The route prefix is new. Let's simply add it. */ + + sd_radv_pref64_prefix_ref(p); + LIST_APPEND(prefix, ra->pref64_prefixes, p); + ra->n_pref64_prefixes++; + + log_radv(ra, "Added PREF64 prefix %s", strna(addr_p)); + } + + if (ra->state == RADV_STATE_IDLE) + return 0; + + if (ra->ra_sent == 0) + return 0; + + /* If RAs have already been sent, send an RA immediately to announce the newly-added route prefix */ + r = radv_send(ra, NULL, ra->lifetime_usec); + if (r < 0) + log_radv_errno(ra, r, "Unable to send Router Advertisement for added PREF64 prefix %s, ignoring: %m", + strna(addr_p)); + else + log_radv(ra, "Sent Router Advertisement for added PREF64 prefix %s.", strna(addr_p)); + + return 0; +} + +int sd_radv_set_rdnss( + sd_radv *ra, + uint64_t lifetime_usec, + const struct in6_addr *dns, + size_t n_dns) { + + _cleanup_free_ struct sd_radv_opt_dns *opt_rdnss = NULL; + size_t len; + + assert_return(ra, -EINVAL); + assert_return(n_dns < 128, -EINVAL); + + if (lifetime_usec > RADV_RDNSS_MAX_LIFETIME_USEC) + return -EINVAL; + + if (!dns || n_dns == 0) { + ra->rdnss = mfree(ra->rdnss); + ra->n_rdnss = 0; + + return 0; + } + + 
len = sizeof(struct sd_radv_opt_dns) + sizeof(struct in6_addr) * n_dns; + + opt_rdnss = malloc0(len); + if (!opt_rdnss) + return -ENOMEM; + + opt_rdnss->type = RADV_OPT_RDNSS; + opt_rdnss->length = len / 8; + opt_rdnss->lifetime = usec_to_be32_sec(lifetime_usec); + + memcpy(opt_rdnss + 1, dns, n_dns * sizeof(struct in6_addr)); + + free_and_replace(ra->rdnss, opt_rdnss); + + ra->n_rdnss = n_dns; + + return 0; +} + +int sd_radv_set_dnssl( + sd_radv *ra, + uint64_t lifetime_usec, + char **search_list) { + + _cleanup_free_ struct sd_radv_opt_dns *opt_dnssl = NULL; + size_t len = 0; + uint8_t *p; + + assert_return(ra, -EINVAL); + + if (lifetime_usec > RADV_DNSSL_MAX_LIFETIME_USEC) + return -EINVAL; + + if (strv_isempty(search_list)) { + ra->dnssl = mfree(ra->dnssl); + return 0; + } + + STRV_FOREACH(s, search_list) + len += strlen(*s) + 2; + + len = (sizeof(struct sd_radv_opt_dns) + len + 7) & ~0x7; + + opt_dnssl = malloc0(len); + if (!opt_dnssl) + return -ENOMEM; + + opt_dnssl->type = RADV_OPT_DNSSL; + opt_dnssl->length = len / 8; + opt_dnssl->lifetime = usec_to_be32_sec(lifetime_usec); + + p = (uint8_t *)(opt_dnssl + 1); + len -= sizeof(struct sd_radv_opt_dns); + + STRV_FOREACH(s, search_list) { + int r; + + r = dns_name_to_wire_format(*s, p, len, false); + if (r < 0) + return r; + + if (len < (size_t)r) + return -ENOBUFS; + + p += r; + len -= r; + } + + free_and_replace(ra->dnssl, opt_dnssl); + + return 0; +} + +int sd_radv_prefix_new(sd_radv_prefix **ret) { + sd_radv_prefix *p; + + assert_return(ret, -EINVAL); + + p = new(sd_radv_prefix, 1); + if (!p) + return -ENOMEM; + + *p = (sd_radv_prefix) { + .n_ref = 1, + + .opt.type = ND_OPT_PREFIX_INFORMATION, + .opt.length = (sizeof(p->opt) - 1)/8 + 1, + .opt.prefixlen = 64, + + /* RFC 4861, Section 6.2.1 */ + .opt.flags = ND_OPT_PI_FLAG_ONLINK|ND_OPT_PI_FLAG_AUTO, + + .lifetime_valid_usec = RADV_DEFAULT_VALID_LIFETIME_USEC, + .lifetime_preferred_usec = RADV_DEFAULT_PREFERRED_LIFETIME_USEC, + .valid_until = USEC_INFINITY, + 
.preferred_until = USEC_INFINITY, + }; + + *ret = p; + return 0; +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_radv_prefix, sd_radv_prefix, mfree); + +int sd_radv_prefix_set_prefix( + sd_radv_prefix *p, + const struct in6_addr *in6_addr, + unsigned char prefixlen) { + + assert_return(p, -EINVAL); + assert_return(in6_addr, -EINVAL); + + if (prefixlen < 3 || prefixlen > 128) + return -EINVAL; + + if (prefixlen > 64) + /* unusual but allowed, log it */ + log_radv(NULL, "Unusual prefix length %d greater than 64", prefixlen); + + p->opt.in6_addr = *in6_addr; + p->opt.prefixlen = prefixlen; + + return 0; +} + +int sd_radv_prefix_get_prefix( + sd_radv_prefix *p, + struct in6_addr *ret_in6_addr, + unsigned char *ret_prefixlen) { + + assert_return(p, -EINVAL); + assert_return(ret_in6_addr, -EINVAL); + assert_return(ret_prefixlen, -EINVAL); + + *ret_in6_addr = p->opt.in6_addr; + *ret_prefixlen = p->opt.prefixlen; + + return 0; +} + +int sd_radv_prefix_set_onlink(sd_radv_prefix *p, int onlink) { + assert_return(p, -EINVAL); + + SET_FLAG(p->opt.flags, ND_OPT_PI_FLAG_ONLINK, onlink); + + return 0; +} + +int sd_radv_prefix_set_address_autoconfiguration(sd_radv_prefix *p, int address_autoconfiguration) { + assert_return(p, -EINVAL); + + SET_FLAG(p->opt.flags, ND_OPT_PI_FLAG_AUTO, address_autoconfiguration); + + return 0; +} + +int sd_radv_prefix_set_valid_lifetime(sd_radv_prefix *p, uint64_t lifetime_usec, uint64_t valid_until) { + assert_return(p, -EINVAL); + + p->lifetime_valid_usec = lifetime_usec; + p->valid_until = valid_until; + + return 0; +} + +int sd_radv_prefix_set_preferred_lifetime(sd_radv_prefix *p, uint64_t lifetime_usec, uint64_t valid_until) { + assert_return(p, -EINVAL); + + p->lifetime_preferred_usec = lifetime_usec; + p->preferred_until = valid_until; + + return 0; +} + +int sd_radv_route_prefix_new(sd_radv_route_prefix **ret) { + sd_radv_route_prefix *p; + + assert_return(ret, -EINVAL); + + p = new(sd_radv_route_prefix, 1); + if (!p) + return -ENOMEM; + + *p = 
(sd_radv_route_prefix) { + .n_ref = 1, + + .opt.type = RADV_OPT_ROUTE_INFORMATION, + .opt.length = DIV_ROUND_UP(sizeof(p->opt), 8), + .opt.prefixlen = 64, + + .lifetime_usec = RADV_DEFAULT_VALID_LIFETIME_USEC, + .valid_until = USEC_INFINITY, + }; + + *ret = p; + return 0; +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_radv_route_prefix, sd_radv_route_prefix, mfree); + +int sd_radv_route_prefix_set_prefix( + sd_radv_route_prefix *p, + const struct in6_addr *in6_addr, + unsigned char prefixlen) { + + assert_return(p, -EINVAL); + assert_return(in6_addr, -EINVAL); + + if (prefixlen > 128) + return -EINVAL; + + if (prefixlen > 64) + /* unusual but allowed, log it */ + log_radv(NULL, "Unusual prefix length %u greater than 64", prefixlen); + + p->opt.in6_addr = *in6_addr; + p->opt.prefixlen = prefixlen; + + return 0; +} + +int sd_radv_route_prefix_set_lifetime(sd_radv_route_prefix *p, uint64_t lifetime_usec, uint64_t valid_until) { + assert_return(p, -EINVAL); + + p->lifetime_usec = lifetime_usec; + p->valid_until = valid_until; + + return 0; +} + +int sd_radv_pref64_prefix_new(sd_radv_pref64_prefix **ret) { + sd_radv_pref64_prefix *p; + + assert_return(ret, -EINVAL); + + p = new(sd_radv_pref64_prefix, 1); + if (!p) + return -ENOMEM; + + *p = (sd_radv_pref64_prefix) { + .n_ref = 1, + + .opt.type = RADV_OPT_PREF64, + .opt.length = 2, + }; + + *ret = p; + return 0; +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_radv_pref64_prefix, sd_radv_pref64_prefix, mfree); + +int sd_radv_pref64_prefix_set_prefix( + sd_radv_pref64_prefix *p, + const struct in6_addr *prefix, + uint8_t prefixlen, + uint64_t lifetime_usec) { + + uint16_t pref64_lifetime; + uint8_t prefixlen_code; + int r; + + assert_return(p, -EINVAL); + assert_return(prefix, -EINVAL); + + r = pref64_prefix_length_to_plc(prefixlen, &prefixlen_code); + if (r < 0) + return log_radv_errno(NULL, r, + "Unsupported PREF64 prefix length %u. 
Valid lengths are 32, 40, 48, 56, 64 and 96", prefixlen); + + if (lifetime_usec > PREF64_MAX_LIFETIME_USEC) + return -EINVAL; + + /* RFC 8781 - 4.1 rounding up lifetime to multiply of 8 */ + pref64_lifetime = DIV_ROUND_UP(lifetime_usec, 8 * USEC_PER_SEC) << 3; + pref64_lifetime |= prefixlen_code; + + unaligned_write_be16(&p->opt.lifetime_and_plc, pref64_lifetime); + memcpy(&p->opt.prefix, prefix, sizeof(p->opt.prefix)); + + p->in6_addr = *prefix; + p->prefixlen = prefixlen; + + return 0; +} diff --git a/src/libsystemd-network/test-acd.c b/src/libsystemd-network/test-acd.c new file mode 100644 index 0000000..4b5ad70 --- /dev/null +++ b/src/libsystemd-network/test-acd.c @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include +#include + +#include "sd-event.h" +#include "sd-ipv4acd.h" +#include "sd-netlink.h" + +#include "in-addr-util.h" +#include "tests.h" + +static void acd_handler(sd_ipv4acd *acd, int event, void *userdata) { + assert_se(acd); + + switch (event) { + case SD_IPV4ACD_EVENT_BIND: + log_info("bound"); + break; + case SD_IPV4ACD_EVENT_CONFLICT: + log_info("conflict"); + break; + case SD_IPV4ACD_EVENT_STOP: + log_error("the client was stopped"); + break; + default: + assert_not_reached(); + } +} + +static int client_run(int ifindex, const struct in_addr *pa, const struct ether_addr *ha, sd_event *e) { + sd_ipv4acd *acd; + + assert_se(sd_ipv4acd_new(&acd) >= 0); + assert_se(sd_ipv4acd_attach_event(acd, e, 0) >= 0); + + assert_se(sd_ipv4acd_set_ifindex(acd, ifindex) >= 0); + assert_se(sd_ipv4acd_set_mac(acd, ha) >= 0); + assert_se(sd_ipv4acd_set_address(acd, pa) >= 0); + assert_se(sd_ipv4acd_set_callback(acd, acd_handler, NULL) >= 0); + + log_info("starting IPv4ACD client"); + + assert_se(sd_ipv4acd_start(acd, true) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + assert_se(!sd_ipv4acd_unref(acd)); + + return EXIT_SUCCESS; +} + +static int test_acd(const char *ifname, const char *address) { + 
_cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL; + union in_addr_union pa; + struct ether_addr ha; + int ifindex; + + assert_se(in_addr_from_string(AF_INET, address, &pa) >= 0); + + assert_se(sd_event_new(&e) >= 0); + + assert_se(sd_netlink_open(&rtnl) >= 0); + assert_se(sd_netlink_attach_event(rtnl, e, 0) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, 0) >= 0); + assert_se(sd_netlink_message_append_string(m, IFLA_IFNAME, ifname) >= 0); + assert_se(sd_netlink_call(rtnl, m, 0, &reply) >= 0); + + assert_se(sd_rtnl_message_link_get_ifindex(reply, &ifindex) >= 0); + assert_se(sd_netlink_message_read_ether_addr(reply, IFLA_ADDRESS, &ha) >= 0); + + client_run(ifindex, &pa.in, &ha, e); + + return EXIT_SUCCESS; +}
+
+/* Manual test entry point: expects an interface name and an IPv4 address, else prints the usage. */
+int main(int argc, char *argv[]) {
+        test_setup_logging(LOG_DEBUG);
+
+        if (argc != 3) {
+                log_error("This program takes two arguments.\n"
+                          "\t %s <ifname> <IPv4 address>", program_invocation_short_name);
+                return EXIT_FAILURE;
+        }
+        return test_acd(argv[1], argv[2]);
+}
diff --git a/src/libsystemd-network/test-dhcp-client.c b/src/libsystemd-network/test-dhcp-client.c new file mode 100644 index 0000000..e3f148d --- /dev/null +++ b/src/libsystemd-network/test-dhcp-client.c @@ -0,0 +1,562 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved.
+***/ + +#include +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +# include +#endif + +#include "sd-dhcp-client.h" +#include "sd-event.h" + +#include "alloc-util.h" +#include "dhcp-identifier.h" +#include "dhcp-network.h" +#include "dhcp-option.h" +#include "dhcp-packet.h" +#include "ether-addr-util.h" +#include "fd-util.h" +#include "random-util.h" +#include "tests.h" + +static struct hw_addr_data hw_addr = { + .length = ETH_ALEN, + .ether = {{ 'A', 'B', 'C', '1', '2', '3' }}, +}, bcast_addr = { + .length = ETH_ALEN, + .ether = {{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }}, +}; +typedef int (*test_callback_recv_t)(size_t size, DHCPMessage *dhcp); + +static bool verbose = true; +static int test_fd[2]; +static test_callback_recv_t callback_recv; +static be32_t xid; + +static void test_request_basic(sd_event *e) { + int r; + + sd_dhcp_client *client; + + if (verbose) + printf("* %s\n", __func__); + + /* Initialize client without Anonymize settings. */ + r = sd_dhcp_client_new(&client, false); + + assert_se(r >= 0); + assert_se(client); + + r = sd_dhcp_client_attach_event(client, e, 0); + assert_se(r >= 0); + + assert_se(sd_dhcp_client_set_request_option(NULL, 0) == -EINVAL); + assert_se(sd_dhcp_client_set_request_address(NULL, NULL) == -EINVAL); + assert_se(sd_dhcp_client_set_ifindex(NULL, 0) == -EINVAL); + + assert_se(sd_dhcp_client_set_ifindex(client, 15) == 0); + assert_se(sd_dhcp_client_set_ifindex(client, -42) == -EINVAL); + assert_se(sd_dhcp_client_set_ifindex(client, -1) == -EINVAL); + assert_se(sd_dhcp_client_set_ifindex(client, 0) == -EINVAL); + assert_se(sd_dhcp_client_set_ifindex(client, 1) == 0); + + assert_se(sd_dhcp_client_set_hostname(client, "host") == 1); + assert_se(sd_dhcp_client_set_hostname(client, "host.domain") == 1); + assert_se(sd_dhcp_client_set_hostname(client, NULL) == 1); + assert_se(sd_dhcp_client_set_hostname(client, "~host") == -EINVAL); + assert_se(sd_dhcp_client_set_hostname(client, "~host.domain") == 
-EINVAL); + + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_SUBNET_MASK) == 0); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_ROUTER) == 0); + /* This PRL option is not set when using Anonymize, but in this test + * Anonymize settings are not being used. */ + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_HOST_NAME) == 0); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_DOMAIN_NAME) == 0); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_DOMAIN_NAME_SERVER) == 0); + + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_PAD) == -EINVAL); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_END) == -EINVAL); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_MESSAGE_TYPE) == -EINVAL); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_OVERLOAD) == -EINVAL); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_PARAMETER_REQUEST_LIST) == -EINVAL); + + /* RFC7844: option 33 (SD_DHCP_OPTION_STATIC_ROUTE) is set in the + * default PRL when using Anonymize, so it is changed to other option + * that is not set by default, to check that it was set successfully. + * Options not set by default (using or not anonymize) are option 17 + * (SD_DHCP_OPTION_ROOT_PATH) and 42 (SD_DHCP_OPTION_NTP_SERVER) */ + assert_se(sd_dhcp_client_set_request_option(client, 17) == 1); + assert_se(sd_dhcp_client_set_request_option(client, 17) == 0); + assert_se(sd_dhcp_client_set_request_option(client, 42) == 1); + assert_se(sd_dhcp_client_set_request_option(client, 17) == 0); + + sd_dhcp_client_unref(client); +} + +static void test_request_anonymize(sd_event *e) { + int r; + + sd_dhcp_client *client; + + if (verbose) + printf("* %s\n", __func__); + + /* Initialize client with Anonymize settings. 
*/ + r = sd_dhcp_client_new(&client, true); + + assert_se(r >= 0); + assert_se(client); + + r = sd_dhcp_client_attach_event(client, e, 0); + assert_se(r >= 0); + + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_NETBIOS_NAME_SERVER) == 0); + /* This PRL (parameter request list) option is not set by default when using Anonymize, so requesting it here adds it (returns 1) */ + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_HOST_NAME) == 1); + assert_se(sd_dhcp_client_set_request_option(client, SD_DHCP_OPTION_PARAMETER_REQUEST_LIST) == -EINVAL); + + /* RFC7844: option 101 (SD_DHCP_OPTION_NEW_TZDB_TIMEZONE) is not set in the + * default PRL when using Anonymize, so the first request adds it (1) and the second finds it already set (0). */ + assert_se(sd_dhcp_client_set_request_option(client, 101) == 1); + assert_se(sd_dhcp_client_set_request_option(client, 101) == 0); + + sd_dhcp_client_unref(client); +} + +static void test_checksum(void) { + uint8_t buf[20] = { + 0x45, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, + 0x40, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff + }; + + if (verbose) + printf("* %s\n", __func__); + + assert_se(dhcp_packet_checksum((uint8_t*)&buf, 20) == be16toh(0x78ae)); +} + +static void test_dhcp_identifier_set_iaid(void) { + uint32_t iaid_legacy; + be32_t iaid; + + assert_se(dhcp_identifier_set_iaid(NULL, &hw_addr, /* legacy = */ true, &iaid_legacy) >= 0); + assert_se(dhcp_identifier_set_iaid(NULL, &hw_addr, /* legacy = */ false, &iaid) >= 0); + + /* we expect, that the MAC address was hashed. The legacy value is in native + * endianness. 
*/ + assert_se(iaid_legacy == 0x8dde4ba8u); + assert_se(iaid == htole32(0x8dde4ba8u)); +#if __BYTE_ORDER == __LITTLE_ENDIAN + assert_se(iaid == iaid_legacy); +#else + assert_se(iaid == bswap_32(iaid_legacy)); +#endif +} + +static int check_options(uint8_t code, uint8_t len, const void *option, void *userdata) { + switch (code) { + case SD_DHCP_OPTION_CLIENT_IDENTIFIER: + { + uint32_t iaid; + struct duid duid; + size_t duid_len; + + assert_se(dhcp_identifier_set_duid_en(&duid, &duid_len) >= 0); + assert_se(dhcp_identifier_set_iaid(NULL, &hw_addr, /* legacy = */ true, &iaid) >= 0); + + assert_se(len == sizeof(uint8_t) + sizeof(uint32_t) + duid_len); + assert_se(len == 19); + assert_se(((uint8_t*) option)[0] == 0xff); + + assert_se(memcmp((uint8_t*) option + 1, &iaid, sizeof(iaid)) == 0); + assert_se(memcmp((uint8_t*) option + 5, &duid, duid_len) == 0); + break; + } + + default: + break; + } + + return 0; +} + +int dhcp_network_send_raw_socket(int s, const union sockaddr_union *link, const void *packet, size_t len) { + size_t size; + _cleanup_free_ DHCPPacket *discover = NULL; + uint16_t ip_check, udp_check; + + assert_se(s >= 0); + assert_se(packet); + + size = sizeof(DHCPPacket); + assert_se(len > size); + + discover = memdup(packet, len); + + assert_se(discover->ip.ttl == IPDEFTTL); + assert_se(discover->ip.protocol == IPPROTO_UDP); + assert_se(discover->ip.saddr == INADDR_ANY); + assert_se(discover->ip.daddr == INADDR_BROADCAST); + assert_se(discover->udp.source == be16toh(DHCP_PORT_CLIENT)); + assert_se(discover->udp.dest == be16toh(DHCP_PORT_SERVER)); + + ip_check = discover->ip.check; + + discover->ip.ttl = 0; + discover->ip.check = discover->udp.len; + + udp_check = ~dhcp_packet_checksum((uint8_t*)&discover->ip.ttl, len - 8); + assert_se(udp_check == 0xffff); + + discover->ip.ttl = IPDEFTTL; + discover->ip.check = ip_check; + + ip_check = ~dhcp_packet_checksum((uint8_t*)&discover->ip, sizeof(discover->ip)); + assert_se(ip_check == 0xffff); + + 
assert_se(discover->dhcp.xid); + assert_se(memcmp(discover->dhcp.chaddr, hw_addr.bytes, hw_addr.length) == 0); + + size = len - sizeof(struct iphdr) - sizeof(struct udphdr); + + assert_se(callback_recv); + callback_recv(size, &discover->dhcp); + + return 575; +} + +int dhcp_network_bind_raw_socket( + int ifindex, + union sockaddr_union *link, + uint32_t id, + const struct hw_addr_data *_hw_addr, + const struct hw_addr_data *_bcast_addr, + uint16_t arp_type, + uint16_t port, + bool so_priority_set, + int so_priority) { + + if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) < 0) + return -errno; + + return test_fd[0]; +} + +int dhcp_network_bind_udp_socket(int ifindex, be32_t address, uint16_t port, int ip_service_type) { + int fd; + + fd = socket(AF_INET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + return fd; +} + +int dhcp_network_send_udp_socket(int s, be32_t address, uint16_t port, const void *packet, size_t len) { + return 0; +} + +static int test_discover_message_verify(size_t size, struct DHCPMessage *dhcp) { + int res; + + res = dhcp_option_parse(dhcp, size, check_options, NULL, NULL); + assert_se(res == DHCP_DISCOVER); + + if (verbose) + printf(" recv DHCP Discover 0x%08x\n", be32toh(dhcp->xid)); + + return 0; +} + +static void test_discover_message(sd_event *e) { + sd_dhcp_client *client; + int res, r; + + if (verbose) + printf("* %s\n", __func__); + + r = sd_dhcp_client_new(&client, false); + assert_se(r >= 0); + assert_se(client); + + r = sd_dhcp_client_attach_event(client, e, 0); + assert_se(r >= 0); + + assert_se(sd_dhcp_client_set_ifindex(client, 42) >= 0); + assert_se(sd_dhcp_client_set_mac(client, hw_addr.bytes, bcast_addr.bytes, hw_addr.length, ARPHRD_ETHER) >= 0); + + assert_se(sd_dhcp_client_set_request_option(client, 248) >= 0); + + callback_recv = test_discover_message_verify; + + res = sd_dhcp_client_start(client); + + assert_se(IN_SET(res, 0, -EINPROGRESS)); + + sd_event_run(e, 
UINT64_MAX); + + sd_dhcp_client_stop(client); + sd_dhcp_client_unref(client); + + test_fd[1] = safe_close(test_fd[1]); + + callback_recv = NULL; +} + +static uint8_t test_addr_acq_offer[] = { + 0x45, 0x10, 0x01, 0x48, 0x00, 0x00, 0x00, 0x00, + 0x80, 0x11, 0xb3, 0x84, 0xc0, 0xa8, 0x02, 0x01, + 0xc0, 0xa8, 0x02, 0xbf, 0x00, 0x43, 0x00, 0x44, + 0x01, 0x34, 0x00, 0x00, 0x02, 0x01, 0x06, 0x00, + 0x6f, 0x95, 0x2f, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0xa8, 0x02, 0xbf, + 0xc0, 0xa8, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x63, 0x82, 0x53, 0x63, 0x35, 0x01, 0x02, 0x36, + 0x04, 0xc0, 0xa8, 0x02, 0x01, 0x33, 0x04, 0x00, + 0x00, 0x02, 0x58, 0x01, 0x04, 0xff, 0xff, 0xff, + 0x00, 
0x2a, 0x04, 0xc0, 0xa8, 0x02, 0x01, 0x0f, + 0x09, 0x6c, 0x61, 0x62, 0x2e, 0x69, 0x6e, 0x74, + 0x72, 0x61, 0x03, 0x04, 0xc0, 0xa8, 0x02, 0x01, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static uint8_t test_addr_acq_ack[] = { + 0x45, 0x10, 0x01, 0x48, 0x00, 0x00, 0x00, 0x00, + 0x80, 0x11, 0xb3, 0x84, 0xc0, 0xa8, 0x02, 0x01, + 0xc0, 0xa8, 0x02, 0xbf, 0x00, 0x43, 0x00, 0x44, + 0x01, 0x34, 0x00, 0x00, 0x02, 0x01, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0xa8, 0x02, 0xbf, + 0xc0, 0xa8, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x63, 0x82, 0x53, 0x63, 0x35, 0x01, 0x05, 0x36, + 0x04, 
0xc0, 0xa8, 0x02, 0x01, 0x33, 0x04, 0x00, + 0x00, 0x02, 0x58, 0x01, 0x04, 0xff, 0xff, 0xff, + 0x00, 0x2a, 0x04, 0xc0, 0xa8, 0x02, 0x01, 0x0f, + 0x09, 0x6c, 0x61, 0x62, 0x2e, 0x69, 0x6e, 0x74, + 0x72, 0x61, 0x03, 0x04, 0xc0, 0xa8, 0x02, 0x01, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static int test_addr_acq_acquired(sd_dhcp_client *client, int event, + void *userdata) { + sd_event *e = userdata; + sd_dhcp_lease *lease; + struct in_addr addr; + const struct in_addr *addrs; + + assert_se(client); + assert_se(IN_SET(event, SD_DHCP_CLIENT_EVENT_IP_ACQUIRE, SD_DHCP_CLIENT_EVENT_SELECTING)); + + assert_se(sd_dhcp_client_get_lease(client, &lease) >= 0); + assert_se(lease); + + assert_se(sd_dhcp_lease_get_address(lease, &addr) >= 0); + assert_se(memcmp(&addr.s_addr, &test_addr_acq_ack[44], + sizeof(addr.s_addr)) == 0); + + assert_se(sd_dhcp_lease_get_netmask(lease, &addr) >= 0); + assert_se(memcmp(&addr.s_addr, &test_addr_acq_ack[285], + sizeof(addr.s_addr)) == 0); + + assert_se(sd_dhcp_lease_get_router(lease, &addrs) == 1); + assert_se(memcmp(&addrs[0].s_addr, &test_addr_acq_ack[308], + sizeof(addrs[0].s_addr)) == 0); + + if (verbose) + printf(" DHCP address acquired\n"); + + sd_event_exit(e, 0); + + return 0; +} + +static int test_addr_acq_recv_request(size_t size, DHCPMessage *request) { + uint16_t udp_check = 0; + uint8_t *msg_bytes = (uint8_t *)request; + int res; + + res = dhcp_option_parse(request, size, check_options, NULL, NULL); + assert_se(res == DHCP_REQUEST); + assert_se(xid == request->xid); + + assert_se(msg_bytes[size - 1] == SD_DHCP_OPTION_END); + + if (verbose) + printf(" recv DHCP Request 0x%08x\n", be32toh(xid)); + + memcpy(&test_addr_acq_ack[26], &udp_check, sizeof(udp_check)); + memcpy(&test_addr_acq_ack[32], &xid, sizeof(xid)); + memcpy(&test_addr_acq_ack[56], hw_addr.bytes, hw_addr.length); + + callback_recv = NULL; + + res = write(test_fd[1], test_addr_acq_ack, + 
sizeof(test_addr_acq_ack)); + assert_se(res == sizeof(test_addr_acq_ack)); + + if (verbose) + printf(" send DHCP Ack\n"); + + return 0; +}; + +static int test_addr_acq_recv_discover(size_t size, DHCPMessage *discover) { + uint16_t udp_check = 0; + uint8_t *msg_bytes = (uint8_t *)discover; + int res; + + res = dhcp_option_parse(discover, size, check_options, NULL, NULL); + assert_se(res == DHCP_DISCOVER); + + assert_se(msg_bytes[size - 1] == SD_DHCP_OPTION_END); + + xid = discover->xid; + + if (verbose) + printf(" recv DHCP Discover 0x%08x\n", be32toh(xid)); + + memcpy(&test_addr_acq_offer[26], &udp_check, sizeof(udp_check)); + memcpy(&test_addr_acq_offer[32], &xid, sizeof(xid)); + memcpy(&test_addr_acq_offer[56], hw_addr.bytes, hw_addr.length); + + callback_recv = test_addr_acq_recv_request; + + res = write(test_fd[1], test_addr_acq_offer, + sizeof(test_addr_acq_offer)); + assert_se(res == sizeof(test_addr_acq_offer)); + + if (verbose) + printf(" sent DHCP Offer\n"); + + return 0; +} + +static void test_addr_acq(sd_event *e) { + sd_dhcp_client *client; + int res, r; + + if (verbose) + printf("* %s\n", __func__); + + r = sd_dhcp_client_new(&client, false); + assert_se(r >= 0); + assert_se(client); + + r = sd_dhcp_client_attach_event(client, e, 0); + assert_se(r >= 0); + + assert_se(sd_dhcp_client_set_ifindex(client, 42) >= 0); + assert_se(sd_dhcp_client_set_mac(client, hw_addr.bytes, bcast_addr.bytes, hw_addr.length, ARPHRD_ETHER) >= 0); + + assert_se(sd_dhcp_client_set_callback(client, test_addr_acq_acquired, e) >= 0); + + callback_recv = test_addr_acq_recv_discover; + + assert_se(sd_event_add_time_relative(e, NULL, CLOCK_BOOTTIME, + 2 * USEC_PER_SEC, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)) >= 0); + + res = sd_dhcp_client_start(client); + assert_se(IN_SET(res, 0, -EINPROGRESS)); + + assert_se(sd_event_loop(e) >= 0); + + assert_se(sd_dhcp_client_set_callback(client, NULL, NULL) >= 0); + assert_se(sd_dhcp_client_stop(client) >= 0); + sd_dhcp_client_unref(client); + + 
test_fd[1] = safe_close(test_fd[1]); + + callback_recv = NULL; + xid = 0; +} + +int main(int argc, char *argv[]) { + _cleanup_(sd_event_unrefp) sd_event *e; + + assert_se(setenv("SYSTEMD_NETWORK_TEST_MODE", "1", 1) >= 0); + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_event_new(&e) >= 0); + + test_request_basic(e); + test_request_anonymize(e); + test_checksum(); + test_dhcp_identifier_set_iaid(); + + test_discover_message(e); + test_addr_acq(e); + +#if HAVE_VALGRIND_VALGRIND_H + /* Make sure the async_close thread has finished. Otherwise + * valgrind would report some of the pthread_* structures + * as not cleaned up properly. */ + if (RUNNING_ON_VALGRIND) + sleep(1); +#endif + + return 0; +} diff --git a/src/libsystemd-network/test-dhcp-option.c b/src/libsystemd-network/test-dhcp-option.c new file mode 100644 index 0000000..bcd46e4 --- /dev/null +++ b/src/libsystemd-network/test-dhcp-option.c @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dhcp-option.h" +#include "dhcp-packet.h" +#include "ether-addr-util.h" +#include "macro.h" +#include "memory-util.h" +#include "tests.h" + +struct option_desc { + uint8_t sname[64]; + int snamelen; + uint8_t file[128]; + int filelen; + uint8_t options[128]; + int len; + bool success; + int filepos; + int snamepos; + int pos; +}; + +static bool verbose = false; + +static struct option_desc option_tests[] = { + { {}, 0, {}, 0, { 42, 5, 65, 66, 67, 68, 69 }, 7, false, }, + { {}, 0, {}, 0, { 42, 5, 65, 66, 67, 68, 69, 0, 0, + SD_DHCP_OPTION_MESSAGE_TYPE, 1, DHCP_ACK }, 12, true, }, + { {}, 0, {}, 0, { 8, 255, 70, 71, 72 }, 5, false, }, + { {}, 0, {}, 0, { 0x35, 0x01, 0x05, 0x36, 0x04, 0x01, 0x00, 0xa8, + 0xc0, 0x33, 0x04, 0x00, 0x01, 0x51, 0x80, 0x01, + 0x04, 0xff, 0xff, 0xff, 0x00, 0x03, 0x04, 0xc0, + 0xa8, 0x00, 0x01, 0x06, 0x04, 0xc0, 0xa8, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }, + 40, true, }, + { {}, 0, {}, 
0, { SD_DHCP_OPTION_MESSAGE_TYPE, 1, DHCP_OFFER, + 42, 3, 0, 0, 0 }, 8, true, }, + { {}, 0, {}, 0, { 42, 2, 1, 2, 44 }, 5, false, }, + + { {}, 0, + { 222, 3, 1, 2, 3, SD_DHCP_OPTION_MESSAGE_TYPE, 1, DHCP_NAK }, 8, + { SD_DHCP_OPTION_OVERLOAD, 1, DHCP_OVERLOAD_FILE }, 3, true, }, + + { { 1, 4, 1, 2, 3, 4, SD_DHCP_OPTION_MESSAGE_TYPE, 1, DHCP_ACK }, 9, + { 222, 3, 1, 2, 3 }, 5, + { SD_DHCP_OPTION_OVERLOAD, 1, + DHCP_OVERLOAD_FILE|DHCP_OVERLOAD_SNAME }, 3, true, }, +}; + +static const char *dhcp_type(int type) { + switch (type) { + case DHCP_DISCOVER: + return "DHCPDISCOVER"; + case DHCP_OFFER: + return "DHCPOFFER"; + case DHCP_REQUEST: + return "DHCPREQUEST"; + case DHCP_DECLINE: + return "DHCPDECLINE"; + case DHCP_ACK: + return "DHCPACK"; + case DHCP_NAK: + return "DHCPNAK"; + case DHCP_RELEASE: + return "DHCPRELEASE"; + default: + return "unknown"; + } +} + +static void test_invalid_buffer_length(void) { + DHCPMessage message; + + assert_se(dhcp_option_parse(&message, 0, NULL, NULL, NULL) == -EINVAL); + assert_se(dhcp_option_parse(&message, sizeof(DHCPMessage) - 1, NULL, NULL, NULL) == -EINVAL); +} + +static void test_message_init(void) { + _cleanup_free_ DHCPMessage *message = NULL; + size_t optlen = 4, optoffset; + size_t len = sizeof(DHCPMessage) + optlen; + uint8_t *magic; + + message = malloc0(len); + + assert_se(dhcp_message_init(message, BOOTREQUEST, 0x12345678, + DHCP_DISCOVER, ARPHRD_ETHER, ETH_ALEN, (uint8_t[16]){}, + optlen, &optoffset) >= 0); + + assert_se(message->xid == htobe32(0x12345678)); + assert_se(message->op == BOOTREQUEST); + + magic = (uint8_t*)&message->magic; + + assert_se(magic[0] == 99); + assert_se(magic[1] == 130); + assert_se(magic[2] == 83); + assert_se(magic[3] == 99); + + assert_se(dhcp_option_parse(message, len, NULL, NULL, NULL) >= 0); +} + +static DHCPMessage *create_message(uint8_t *options, uint16_t optlen, + uint8_t *file, uint8_t filelen, + uint8_t *sname, uint8_t snamelen) { + DHCPMessage *message; + size_t len = 
sizeof(DHCPMessage) + optlen; + + message = malloc0(len); + assert_se(message); + + memcpy_safe(&message->options, options, optlen); + memcpy_safe(&message->file, file, filelen); + memcpy_safe(&message->sname, sname, snamelen); + + return message; +} + +static void test_ignore_opts(uint8_t *descoption, int *descpos, int *desclen) { + assert_se(*descpos >= 0); + + while (*descpos < *desclen) { + switch (descoption[*descpos]) { + case SD_DHCP_OPTION_PAD: + *descpos += 1; + break; + + case SD_DHCP_OPTION_MESSAGE_TYPE: + case SD_DHCP_OPTION_OVERLOAD: + *descpos += 3; + break; + + default: + return; + } + } +} + +static int test_options_cb(uint8_t code, uint8_t len, const void *option, void *userdata) { + struct option_desc *desc = userdata; + uint8_t *descoption = NULL; + int *desclen = NULL, *descpos = NULL; + uint8_t optcode = 0; + uint8_t optlen = 0; + + assert_se((!desc && !code && !len) || desc); + + if (!desc) + return -EINVAL; + + assert_se(code != SD_DHCP_OPTION_PAD); + assert_se(code != SD_DHCP_OPTION_END); + assert_se(code != SD_DHCP_OPTION_MESSAGE_TYPE); + assert_se(code != SD_DHCP_OPTION_OVERLOAD); + + while (desc->pos >= 0 || desc->filepos >= 0 || desc->snamepos >= 0) { + + if (desc->pos >= 0) { + descoption = &desc->options[0]; + desclen = &desc->len; + descpos = &desc->pos; + } else if (desc->filepos >= 0) { + descoption = &desc->file[0]; + desclen = &desc->filelen; + descpos = &desc->filepos; + } else if (desc->snamepos >= 0) { + descoption = &desc->sname[0]; + desclen = &desc->snamelen; + descpos = &desc->snamepos; + } + + assert_se(descoption && desclen && descpos); + + if (*desclen) + test_ignore_opts(descoption, descpos, desclen); + + if (*descpos < *desclen) + break; + + if (*descpos == *desclen) + *descpos = -1; + } + + assert_se(descpos); + assert_se(*descpos != -1); + + optcode = descoption[*descpos]; + optlen = descoption[*descpos + 1]; + + if (verbose) + printf("DHCP code %2d(%2d) len %2d(%2d) ", code, optcode, + len, optlen); + + 
assert_se(code == optcode); + assert_se(len == optlen); + + for (unsigned i = 0; i < len; i++) { + if (verbose) + printf("0x%02x(0x%02x) ", + ((uint8_t*) option)[i], + descoption[*descpos + 2 + i]); + + assert_se(((uint8_t*) option)[i] == descoption[*descpos + 2 + i]); + } + + if (verbose) + printf("\n"); + + *descpos += optlen + 2; + + test_ignore_opts(descoption, descpos, desclen); + + if (desc->pos != -1 && desc->pos == desc->len) + desc->pos = -1; + + if (desc->filepos != -1 && desc->filepos == desc->filelen) + desc->filepos = -1; + + if (desc->snamepos != -1 && desc->snamepos == desc->snamelen) + desc->snamepos = -1; + + return 0; +} + +static void test_options(struct option_desc *desc) { + uint8_t *options = NULL; + uint8_t *file = NULL; + uint8_t *sname = NULL; + int optlen = 0; + int filelen = 0; + int snamelen = 0; + int buflen = 0; + _cleanup_free_ DHCPMessage *message = NULL; + int res; + + if (desc) { + file = &desc->file[0]; + filelen = desc->filelen; + if (!filelen) + desc->filepos = -1; + + sname = &desc->sname[0]; + snamelen = desc->snamelen; + if (!snamelen) + desc->snamepos = -1; + + options = &desc->options[0]; + optlen = desc->len; + desc->pos = 0; + } + message = create_message(options, optlen, file, filelen, + sname, snamelen); + + buflen = sizeof(DHCPMessage) + optlen; + + if (!desc) { + assert_se((res = dhcp_option_parse(message, buflen, test_options_cb, NULL, NULL)) == -ENOMSG); + } else if (desc->success) { + assert_se((res = dhcp_option_parse(message, buflen, test_options_cb, desc, NULL)) >= 0); + assert_se(desc->pos == -1 && desc->filepos == -1 && desc->snamepos == -1); + } else + assert_se((res = dhcp_option_parse(message, buflen, test_options_cb, desc, NULL)) < 0); + + if (verbose) + printf("DHCP type %s\n", dhcp_type(res)); +} + +static void test_option_removal(struct option_desc *desc) { + _cleanup_free_ DHCPMessage *message = create_message(&desc->options[0], desc->len, NULL, 0, NULL, 0); + + assert_se(dhcp_option_parse(message, 
sizeof(DHCPMessage) + desc->len, NULL, NULL, NULL) >= 0); + assert_se((desc->len = dhcp_option_remove_option(message->options, desc->len, SD_DHCP_OPTION_MESSAGE_TYPE)) >= 0); + assert_se(dhcp_option_parse(message, sizeof(DHCPMessage) + desc->len, NULL, NULL, NULL) < 0); +} + +static uint8_t the_options[64] = { + 'A', 'B', 'C', 'D', + 160, 2, 0x11, 0x12, + 0, + 31, 8, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0, + 55, 3, 0x51, 0x52, 0x53, + 17, 7, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 255 +}; + +static void test_option_set(void) { + _cleanup_free_ DHCPMessage *result = NULL; + size_t offset = 0, len, pos; + + result = malloc0(sizeof(DHCPMessage) + 11); + assert_se(result); + + result->options[0] = 'A'; + result->options[1] = 'B'; + result->options[2] = 'C'; + result->options[3] = 'D'; + + assert_se(dhcp_option_append(result, 0, &offset, 0, SD_DHCP_OPTION_PAD, + 0, NULL) == -ENOBUFS); + assert_se(offset == 0); + + offset = 4; + assert_se(dhcp_option_append(result, 5, &offset, 0, SD_DHCP_OPTION_PAD, + 0, NULL) == -ENOBUFS); + assert_se(offset == 4); + assert_se(dhcp_option_append(result, 6, &offset, 0, SD_DHCP_OPTION_PAD, + 0, NULL) >= 0); + assert_se(offset == 5); + + offset = pos = 4; + len = 11; + while (pos < len && the_options[pos] != SD_DHCP_OPTION_END) { + assert_se(dhcp_option_append(result, len, &offset, DHCP_OVERLOAD_SNAME, + the_options[pos], + the_options[pos + 1], + &the_options[pos + 2]) >= 0); + + if (the_options[pos] == SD_DHCP_OPTION_PAD) + pos++; + else + pos += 2 + the_options[pos + 1]; + + if (pos < len) + assert_se(offset == pos); + } + + for (unsigned i = 0; i < 9; i++) { + if (verbose) + printf("%2u: 0x%02x(0x%02x) (options)\n", i, result->options[i], + the_options[i]); + assert_se(result->options[i] == the_options[i]); + } + + if (verbose) + printf("%2d: 0x%02x(0x%02x) (options)\n", 9, result->options[9], + (unsigned) SD_DHCP_OPTION_END); + + assert_se(result->options[9] == SD_DHCP_OPTION_END); + + if (verbose) + printf("%2d: 
0x%02x(0x%02x) (options)\n", 10, result->options[10], + (unsigned) SD_DHCP_OPTION_PAD); + + assert_se(result->options[10] == SD_DHCP_OPTION_PAD); + + for (unsigned i = 0; i < pos - 8; i++) { + if (verbose) + printf("%2u: 0x%02x(0x%02x) (sname)\n", i, result->sname[i], + the_options[i + 9]); + assert_se(result->sname[i] == the_options[i + 9]); + } + + if (verbose) + printf ("\n"); +} + +int main(int argc, char *argv[]) { + test_setup_logging(LOG_DEBUG); + + test_invalid_buffer_length(); + test_message_init(); + + test_options(NULL); + + for (unsigned i = 0; i < ELEMENTSOF(option_tests); i++) + test_options(&option_tests[i]); + + test_option_set(); + + for (unsigned i = 0; i < ELEMENTSOF(option_tests); i++) { + struct option_desc *desc = &option_tests[i]; + if (!desc->success || desc->snamelen > 0 || desc->filelen > 0) + continue; + test_option_removal(desc); + } + + return 0; +} diff --git a/src/libsystemd-network/test-dhcp-server.c b/src/libsystemd-network/test-dhcp-server.c new file mode 100644 index 0000000..b2e6034 --- /dev/null +++ b/src/libsystemd-network/test-dhcp-server.c @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation. All rights reserved. 
+***/ + +#include +#include + +#include "sd-dhcp-server.h" +#include "sd-event.h" + +#include "dhcp-server-internal.h" +#include "tests.h" + +static void test_pool(struct in_addr *address, unsigned size, int ret) { + _cleanup_(sd_dhcp_server_unrefp) sd_dhcp_server *server = NULL; + + assert_se(sd_dhcp_server_new(&server, 1) >= 0); + + assert_se(sd_dhcp_server_configure_pool(server, address, 8, 0, size) == ret); +} + +static int test_basic(bool bind_to_interface) { + _cleanup_(sd_dhcp_server_unrefp) sd_dhcp_server *server = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + struct in_addr address_lo = { + .s_addr = htobe32(INADDR_LOOPBACK), + }; + struct in_addr address_any = { + .s_addr = htobe32(INADDR_ANY), + }; + int r; + + log_debug("/* %s(bind_to_interface=%s) */", __func__, yes_no(bind_to_interface)); + + assert_se(sd_event_new(&event) >= 0); + + /* attach to loopback interface */ + assert_se(sd_dhcp_server_new(&server, 1) >= 0); + assert_se(server); + server->bind_to_interface = bind_to_interface; + + assert_se(sd_dhcp_server_attach_event(server, event, 0) >= 0); + assert_se(sd_dhcp_server_attach_event(server, event, 0) == -EBUSY); + assert_se(sd_dhcp_server_get_event(server) == event); + assert_se(sd_dhcp_server_detach_event(server) >= 0); + assert_se(!sd_dhcp_server_get_event(server)); + assert_se(sd_dhcp_server_attach_event(server, NULL, 0) >= 0); + assert_se(sd_dhcp_server_attach_event(server, NULL, 0) == -EBUSY); + + assert_se(sd_dhcp_server_ref(server) == server); + assert_se(!sd_dhcp_server_unref(server)); + + assert_se(sd_dhcp_server_start(server) == -EUNATCH); + + assert_se(sd_dhcp_server_configure_pool(server, &address_any, 28, 0, 0) == -EINVAL); + assert_se(sd_dhcp_server_configure_pool(server, &address_lo, 38, 0, 0) == -ERANGE); + assert_se(sd_dhcp_server_configure_pool(server, &address_lo, 8, 0, 0) >= 0); + assert_se(sd_dhcp_server_configure_pool(server, &address_lo, 8, 0, 0) >= 0); + + test_pool(&address_any, 1, -EINVAL); + 
test_pool(&address_lo, 1, 0); + + r = sd_dhcp_server_start(server); + if (r == -EPERM) + return r; + assert_se(r >= 0); + + assert_se(sd_dhcp_server_start(server) >= 0); + assert_se(sd_dhcp_server_stop(server) >= 0); + assert_se(sd_dhcp_server_stop(server) >= 0); + assert_se(sd_dhcp_server_start(server) >= 0); + + return 0; +} + +static void test_message_handler(void) { + _cleanup_(sd_dhcp_server_unrefp) sd_dhcp_server *server = NULL; + struct { + DHCPMessage message; + struct { + uint8_t code; + uint8_t length; + uint8_t type; + } _packed_ option_type; + struct { + uint8_t code; + uint8_t length; + be32_t address; + } _packed_ option_requested_ip; + struct { + uint8_t code; + uint8_t length; + be32_t address; + } _packed_ option_server_id; + struct { + uint8_t code; + uint8_t length; + uint8_t id[7]; + } _packed_ option_client_id; + struct { + uint8_t code; + uint8_t length; + uint8_t hostname[6]; + } _packed_ option_hostname; + uint8_t end; + } _packed_ test = { + .message.op = BOOTREQUEST, + .message.htype = ARPHRD_ETHER, + .message.hlen = ETHER_ADDR_LEN, + .message.xid = htobe32(0x12345678), + .message.chaddr = { 'A', 'B', 'C', 'D', 'E', 'F' }, + .option_type.code = SD_DHCP_OPTION_MESSAGE_TYPE, + .option_type.length = 1, + .option_type.type = DHCP_DISCOVER, + .option_hostname.code = SD_DHCP_OPTION_HOST_NAME, + .option_hostname.length = 6, + .option_hostname.hostname = { 'T', 'E', 'S', 'T', 'H', 'N' }, + .end = SD_DHCP_OPTION_END, + }; + struct in_addr address_lo = { + .s_addr = htobe32(INADDR_LOOPBACK), + }; + struct in_addr static_lease_address = { + .s_addr = htobe32(INADDR_LOOPBACK + 42), + }; + static uint8_t static_lease_client_id[7] = {0x01, 'A', 'B', 'C', 'D', 'E', 'G' }; + + log_debug("/* %s */", __func__); + + assert_se(sd_dhcp_server_new(&server, 1) >= 0); + assert_se(sd_dhcp_server_configure_pool(server, &address_lo, 8, 0, 0) >= 0); + assert_se(sd_dhcp_server_set_static_lease(server, &static_lease_address, static_lease_client_id, + 
ELEMENTSOF(static_lease_client_id)) >= 0); + assert_se(sd_dhcp_server_attach_event(server, NULL, 0) >= 0); + assert_se(sd_dhcp_server_start(server) >= 0); + + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + + test.end = 0; + /* TODO: the trailing SD_DHCP_OPTION_END byte was just overwritten with 0, so the options are no longer terminated by an END option; shouldn't this fail? */ + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + test.end = SD_DHCP_OPTION_END; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + + test.option_type.code = 0; + test.option_type.length = 0; + test.option_type.type = 0; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == -ENOMSG); + test.option_type.code = SD_DHCP_OPTION_MESSAGE_TYPE; + test.option_type.length = 1; + test.option_type.type = DHCP_DISCOVER; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + + test.message.op = 0; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == 0); + test.message.op = BOOTREQUEST; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + + test.message.htype = 0; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + test.message.htype = ARPHRD_ETHER; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + + test.message.hlen = 0; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == -EBADMSG); + test.message.hlen = ETHER_ADDR_LEN; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_OFFER); + + test.option_type.type = DHCP_REQUEST; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == 0); + test.option_requested_ip.code = SD_DHCP_OPTION_REQUESTED_IP_ADDRESS; + 
test.option_requested_ip.length = 4; + test.option_requested_ip.address = htobe32(0x12345678); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_NAK); + test.option_server_id.code = SD_DHCP_OPTION_SERVER_IDENTIFIER; + test.option_server_id.length = 4; + test.option_server_id.address = htobe32(INADDR_LOOPBACK); + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 3); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_ACK); + + test.option_server_id.address = htobe32(0x12345678); + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 3); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == 0); + test.option_server_id.address = htobe32(INADDR_LOOPBACK); + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 4); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == 0); + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 3); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_ACK); + + test.option_client_id.code = SD_DHCP_OPTION_CLIENT_IDENTIFIER; + test.option_client_id.length = 7; + test.option_client_id.id[0] = 0x01; + test.option_client_id.id[1] = 'A'; + test.option_client_id.id[2] = 'B'; + test.option_client_id.id[3] = 'C'; + test.option_client_id.id[4] = 'D'; + test.option_client_id.id[5] = 'E'; + test.option_client_id.id[6] = 'F'; + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_ACK); + + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 30); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == 0); + + /* request address reserved for static lease (unmatching client ID) */ + test.option_client_id.id[6] = 'H'; + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 42); + assert_se(dhcp_server_handle_message(server, 
(DHCPMessage*)&test, sizeof(test), NULL) == 0); + + /* request unmatching address */ + test.option_client_id.id[6] = 'G'; + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 41); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == 0); + + /* request matching address */ + test.option_client_id.id[6] = 'G'; + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 42); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_ACK); + + /* try again */ + test.option_client_id.id[6] = 'G'; + test.option_requested_ip.address = htobe32(INADDR_LOOPBACK + 42); + assert_se(dhcp_server_handle_message(server, (DHCPMessage*)&test, sizeof(test), NULL) == DHCP_ACK); +} + +static uint64_t client_id_hash_helper(DHCPClientId *id, uint8_t key[HASH_KEY_SIZE]) { + struct siphash state; + + siphash24_init(&state, key); + client_id_hash_func(id, &state); + + return htole64(siphash24_finalize(&state)); +} + +static void test_client_id_hash(void) { + DHCPClientId a = { + .length = 4, + }, b = { + .length = 4, + }; + uint8_t hash_key[HASH_KEY_SIZE] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + }; + + log_debug("/* %s */", __func__); + + a.data = (uint8_t*)strdup("abcd"); + b.data = (uint8_t*)strdup("abcd"); + + assert_se(client_id_compare_func(&a, &b) == 0); + assert_se(client_id_hash_helper(&a, hash_key) == client_id_hash_helper(&b, hash_key)); + a.length = 3; + assert_se(client_id_compare_func(&a, &b) != 0); + a.length = 4; + assert_se(client_id_compare_func(&a, &b) == 0); + assert_se(client_id_hash_helper(&a, hash_key) == client_id_hash_helper(&b, hash_key)); + + b.length = 3; + assert_se(client_id_compare_func(&a, &b) != 0); + b.length = 4; + assert_se(client_id_compare_func(&a, &b) == 0); + assert_se(client_id_hash_helper(&a, hash_key) == client_id_hash_helper(&b, hash_key)); + + free(b.data); + b.data = (uint8_t*)strdup("abce"); + 
assert_se(client_id_compare_func(&a, &b) != 0); + + free(a.data); + free(b.data); +} + +static void test_static_lease(void) { + _cleanup_(sd_dhcp_server_unrefp) sd_dhcp_server *server = NULL; + + log_debug("/* %s */", __func__); + + assert_se(sd_dhcp_server_new(&server, 1) >= 0); + + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x01020304 }, + (uint8_t*) &(uint32_t) { 0x01020304 }, sizeof(uint32_t)) >= 0); + /* Duplicated entry. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x01020304 }, + (uint8_t*) &(uint32_t) { 0x01020304 }, sizeof(uint32_t)) == -EEXIST); + /* Address is conflicted. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x01020304 }, + (uint8_t*) &(uint32_t) { 0x01020305 }, sizeof(uint32_t)) == -EEXIST); + /* Client ID is conflicted. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x01020305 }, + (uint8_t*) &(uint32_t) { 0x01020304 }, sizeof(uint32_t)) == -EEXIST); + + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x01020305 }, + (uint8_t*) &(uint32_t) { 0x01020305 }, sizeof(uint32_t)) >= 0); + /* Remove the previous entry. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x00000000 }, + (uint8_t*) &(uint32_t) { 0x01020305 }, sizeof(uint32_t)) >= 0); + /* Then, set a different address. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x01020306 }, + (uint8_t*) &(uint32_t) { 0x01020305 }, sizeof(uint32_t)) >= 0); + /* Remove again. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x00000000 }, + (uint8_t*) &(uint32_t) { 0x01020305 }, sizeof(uint32_t)) >= 0); + /* Try to remove non-existent entry. 
*/ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x00000000 }, + (uint8_t*) &(uint32_t) { 0x01020305 }, sizeof(uint32_t)) >= 0); + /* Try to remove an entry for a client ID that was never registered in this test. */ + assert_se(sd_dhcp_server_set_static_lease(server, &(struct in_addr) { .s_addr = 0x00000000 }, + (uint8_t*) &(uint32_t) { 0x01020306 }, sizeof(uint32_t)) >= 0); +} + +int main(int argc, char *argv[]) { + int r; + + test_setup_logging(LOG_DEBUG); + + test_client_id_hash(); + test_static_lease(); + + r = test_basic(true); + if (r < 0) + return log_tests_skipped_errno(r, "cannot start dhcp server(bound to interface)"); + + r = test_basic(false); + if (r < 0) + return log_tests_skipped_errno(r, "cannot start dhcp server(non-bound to interface)"); + + test_message_handler(); + + return 0; +} diff --git a/src/libsystemd-network/test-dhcp6-client.c b/src/libsystemd-network/test-dhcp6-client.c new file mode 100644 index 0000000..ae3cdb8 --- /dev/null +++ b/src/libsystemd-network/test-dhcp6-client.c @@ -0,0 +1,1127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. 
+***/ + +#include +#include +#include +#include +#include +#include + +#include "sd-dhcp6-client.h" +#include "sd-event.h" + +#include "dhcp-identifier.h" +#include "dhcp6-internal.h" +#include "dhcp6-lease-internal.h" +#include "dhcp6-protocol.h" +#include "fd-util.h" +#include "macro.h" +#include "memory-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "time-util.h" + +#define DHCP6_CLIENT_EVENT_TEST_ADVERTISED 77 +#define IA_ID_BYTES \ + 0x0e, 0xcf, 0xa3, 0x7d +#define IA_NA_ADDRESS1_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x78, 0xee, 0x1c, 0xf3, 0x09, 0x3c, 0x55, 0xad +#define IA_NA_ADDRESS2_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x78, 0xee, 0x1c, 0xf3, 0x09, 0x3c, 0x55, 0xae +#define IA_PD_PREFIX1_BYTES \ + 0x2a, 0x02, 0x81, 0x0d, 0x98, 0x80, 0x37, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +#define IA_PD_PREFIX2_BYTES \ + 0x2a, 0x02, 0x81, 0x0d, 0x98, 0x80, 0x37, 0xc1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +#define DNS1_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 +#define DNS2_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 +#define SNTP1_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 +#define SNTP2_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04 +#define NTP1_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05 +#define NTP2_BYTES \ + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06 +#define CLIENT_ID_BYTES \ + 0x00, 0x02, 0x00, 0x00, 0xab, 0x11, 0x61, 0x77, 0x40, 0xde, 0x13, 0x42, 0xc3, 0xa2 +#define SERVER_ID_BYTES \ + 0x00, 0x01, 0x00, 0x01, 0x19, 0x40, 0x5c, 0x53, 0x78, 0x2b, 0xcb, 0xb3, 0x6d, 0x53 +#define 
VENDOR_SUBOPTION_BYTES \ + 0x01 + +static const struct in6_addr local_address = + { { { 0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, } } }; +static const struct in6_addr mcast_address = + IN6ADDR_ALL_DHCP6_RELAY_AGENTS_AND_SERVERS_INIT; +static const struct in6_addr ia_na_address1 = { { { IA_NA_ADDRESS1_BYTES } } }; +static const struct in6_addr ia_na_address2 = { { { IA_NA_ADDRESS2_BYTES } } }; +static const struct in6_addr ia_pd_prefix1 = { { { IA_PD_PREFIX1_BYTES } } }; +static const struct in6_addr ia_pd_prefix2 = { { { IA_PD_PREFIX2_BYTES } } }; +static const struct in6_addr dns1 = { { { DNS1_BYTES } } }; +static const struct in6_addr dns2 = { { { DNS2_BYTES } } }; +static const struct in6_addr sntp1 = { { { SNTP1_BYTES } } }; +static const struct in6_addr sntp2 = { { { SNTP2_BYTES } } }; +static const struct in6_addr ntp1 = { { { NTP1_BYTES } } }; +static const struct in6_addr ntp2 = { { { NTP2_BYTES } } }; +static const uint8_t client_id[] = { CLIENT_ID_BYTES }; +static const uint8_t server_id[] = { SERVER_ID_BYTES }; +static uint8_t vendor_suboption_data[] = { VENDOR_SUBOPTION_BYTES }; +static const struct ether_addr mac = { + .ether_addr_octet = { 'A', 'B', 'C', '1', '2', '3' }, +}; +static int test_fd[2] = EBADF_PAIR; +static sd_dhcp6_option vendor_suboption = { + .n_ref = 1, + .enterprise_identifier = 32, + .option = 247, + .data = vendor_suboption_data, + .length = 1, +}; +static int test_ifindex = 42; +static unsigned test_client_sent_message_count = 0; +static sd_dhcp6_client *client_ref = NULL; + +TEST(client_basic) { + _cleanup_(sd_dhcp6_client_unrefp) sd_dhcp6_client *client = NULL; + int v; + + assert_se(sd_dhcp6_client_new(&client) >= 0); + assert_se(client); + + assert_se(sd_dhcp6_client_set_ifindex(client, 15) == 0); + assert_se(sd_dhcp6_client_set_ifindex(client, 42) >= 0); + + assert_se(sd_dhcp6_client_set_mac(client, mac.ether_addr_octet, sizeof(mac), ARPHRD_ETHER) >= 0); + + 
assert_se(sd_dhcp6_client_set_fqdn(client, "host") == 1); + assert_se(sd_dhcp6_client_set_fqdn(client, "host.domain") == 1); + assert_se(sd_dhcp6_client_set_fqdn(client, NULL) == 1); + assert_se(sd_dhcp6_client_set_fqdn(client, "~host") == -EINVAL); + assert_se(sd_dhcp6_client_set_fqdn(client, "~host.domain") == -EINVAL); + + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_CLIENTID) == -EINVAL); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_DNS_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_NTP_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_SNTP_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_VENDOR_OPTS) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_DOMAIN) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, 10) == -EINVAL); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_NIS_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_NISP_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_NIS_SERVER) == -EEXIST); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_NISP_SERVER) == -EEXIST); + + assert_se(sd_dhcp6_client_set_information_request(client, 1) >= 0); + v = 0; + assert_se(sd_dhcp6_client_get_information_request(client, &v) >= 0); + assert_se(v); + assert_se(sd_dhcp6_client_set_information_request(client, 0) >= 0); + v = 42; + assert_se(sd_dhcp6_client_get_information_request(client, &v) >= 0); + assert_se(v == 0); + + v = 0; + assert_se(sd_dhcp6_client_get_address_request(client, &v) >= 0); + assert_se(v); + v = 0; + assert_se(sd_dhcp6_client_set_address_request(client, 1) >= 0); + assert_se(sd_dhcp6_client_get_address_request(client, &v) >= 0); + assert_se(v); + v = 42; + assert_se(sd_dhcp6_client_set_address_request(client, 1) >= 0); + 
assert_se(sd_dhcp6_client_get_address_request(client, &v) >= 0); + assert_se(v); + + assert_se(sd_dhcp6_client_set_address_request(client, 1) >= 0); + assert_se(sd_dhcp6_client_set_prefix_delegation(client, 1) >= 0); + v = 0; + assert_se(sd_dhcp6_client_get_address_request(client, &v) >= 0); + assert_se(v); + v = 0; + assert_se(sd_dhcp6_client_get_prefix_delegation(client, &v) >= 0); + assert_se(v); + + assert_se(sd_dhcp6_client_set_callback(client, NULL, NULL) >= 0); + + assert_se(sd_dhcp6_client_detach_event(client) >= 0); +} + +TEST(parse_domain) { + _cleanup_free_ char *domain = NULL; + _cleanup_strv_free_ char **list = NULL; + uint8_t *data; + + data = (uint8_t []) { 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 3, 'c', 'o', 'm', 0 }; + assert_se(dhcp6_option_parse_domainname(data, 13, &domain) >= 0); + assert_se(domain); + assert_se(streq(domain, "example.com")); + domain = mfree(domain); + + data = (uint8_t []) { 4, 't', 'e', 's', 't' }; + assert_se(dhcp6_option_parse_domainname(data, 5, &domain) >= 0); + assert_se(domain); + assert_se(streq(domain, "test")); + domain = mfree(domain); + + data = (uint8_t []) { 0 }; + assert_se(dhcp6_option_parse_domainname(data, 1, &domain) < 0); + + data = (uint8_t []) { 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 3, 'c', 'o', 'm', 0, + 6, 'f', 'o', 'o', 'b', 'a', 'r', 0 }; + assert_se(dhcp6_option_parse_domainname_list(data, 21, &list) >= 0); + assert_se(list); + assert_se(streq(list[0], "example.com")); + assert_se(streq(list[1], "foobar")); + assert_se(!list[2]); + list = strv_free(list); + + data = (uint8_t []) { 1, 'a', 0, 20, 'b', 'c' }; + assert_se(dhcp6_option_parse_domainname_list(data, 6, &list) < 0); + + data = (uint8_t []) { 0 , 0 }; + assert_se(dhcp6_option_parse_domainname_list(data, 2, &list) < 0); +} + +TEST(option) { + static const uint8_t packet[] = { + 'F', 'O', 'O', 'H', 'O', 'G', 'E', + 0x00, SD_DHCP6_OPTION_ORO, 0x00, 0x07, + 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 0x00, SD_DHCP6_OPTION_VENDOR_CLASS, 0x00, 0x09, + '1', 
'2', '3', '4', '5', '6', '7', '8', '9', + 'B', 'A', 'R', + }; + static const uint8_t result[] = { + 'F', 'O', 'O', 'H', 'O', 'G', 'E', + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 'B', 'A', 'R', + }; + _cleanup_free_ uint8_t *buf = NULL; + size_t offset, pos, optlen; + const uint8_t *optval; + uint16_t optcode; + + assert_se(sizeof(packet) == sizeof(result)); + + offset = 0; + assert_se(dhcp6_option_parse(packet, 0, &offset, &optcode, &optlen, &optval) == -EBADMSG); + + offset = 3; + assert_se(dhcp6_option_parse(packet, 0, &offset, &optcode, &optlen, &optval) == -EBADMSG); + + /* Tests for reading unaligned data. */ + assert_se(buf = new(uint8_t, sizeof(packet))); + for (size_t i = 0; i <= 7; i++) { + memcpy(buf, packet + i, sizeof(packet) - i); + offset = 7 - i; + assert_se(dhcp6_option_parse(buf, sizeof(packet), &offset, &optcode, &optlen, &optval) >= 0); + + assert_se(optcode == SD_DHCP6_OPTION_ORO); + assert_se(optlen == 7); + assert_se(optval == buf + 11 - i); + } + + offset = 7; + assert_se(dhcp6_option_parse(packet, sizeof(packet), &offset, &optcode, &optlen, &optval) >= 0); + + assert_se(optcode == SD_DHCP6_OPTION_ORO); + assert_se(optlen == 7); + assert_se(optval == packet + 11); + + free(buf); + assert_se(buf = memdup(result, sizeof(result))); + pos = 7; + assert_se(dhcp6_option_append(&buf, &pos, optcode, optlen, optval) >= 0); + + assert_se(dhcp6_option_parse(packet, sizeof(packet), &offset, &optcode, &optlen, &optval) >= 0); + + assert_se(optcode == SD_DHCP6_OPTION_VENDOR_CLASS); + assert_se(optlen == 9); + assert_se(optval == packet + 22); + + assert_se(dhcp6_option_append(&buf, &pos, optcode, optlen, optval) >= 0); + + assert_se(memcmp(packet, buf, sizeof(packet)) == 0); +} + +TEST(option_status) { + uint8_t option1[] = { + /* IA NA */ + 0x00, 0x03, 0x00, 0x12, 0x1a, 0x1d, 0x1a, 0x1d, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x02, 0x00, 0x02, + /* status 
option */ + 0x00, 0x0d, 0x00, 0x02, 0x00, 0x01, + }; + static const uint8_t option2[] = { + /* IA NA */ + 0x00, 0x03, 0x00, 0x2e, 0x1a, 0x1d, 0x1a, 0x1d, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x02, 0x00, 0x02, + /* IA Addr */ + 0x00, 0x05, 0x00, 0x1e, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, + 0x01, 0x02, 0x03, 0x04, 0x0a, 0x0b, 0x0c, 0x0d, + /* IA address status option */ + 0x00, 0x0d, 0x00, 0x02, 0x00, 0x01, + }; + static const uint8_t option3[] = { + /* IA NA */ + 0x00, 0x03, 0x00, 0x34, 0x1a, 0x1d, 0x1a, 0x1d, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x02, 0x00, 0x02, + /* IA Addr */ + 0x00, 0x05, 0x00, 0x24, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, + 0x01, 0x02, 0x03, 0x04, 0x0a, 0x0b, 0x0c, 0x0d, + /* IA address status option */ + 0x00, 0x0d, 0x00, 0x08, 0x00, 0x00, 'f', 'o', + 'o', 'b', 'a', 'r', + }; + static const uint8_t option4[] = { + /* IA PD */ + 0x00, 0x19, 0x00, 0x2f, 0x1a, 0x1d, 0x1a, 0x1d, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x02, 0x00, 0x02, + /* IA PD Prefix */ + 0x00, 0x1a, 0x00, 0x1f, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x80, 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, + 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, + /* PD prefix status option */ + 0x00, 0x0d, 0x00, 0x02, 0x00, 0x00, + }; + static const uint8_t option5[] = { + /* IA PD */ + 0x00, 0x19, 0x00, 0x52, 0x1a, 0x1d, 0x1a, 0x1d, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x02, 0x00, 0x02, + /* IA PD Prefix #1 */ + 0x00, 0x1a, 0x00, 0x1f, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x80, 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, + 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, + /* PD prefix status option */ + 0x00, 0x0d, 0x00, 0x02, 0x00, 0x00, + /* IA PD Prefix #2: prefix length 0x80, prefix 2001:db8:c001:d00d:: */ + 0x00, 0x1a, 0x00, 0x1f, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x80, 0x20, 0x01, 0x0d, 0xb8, 0xc0, 0x01, 0xd0, + 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, + /* PD 
prefix status option */ + 0x00, 0x0d, 0x00, 0x02, 0x00, 0x00, + }; + _cleanup_(dhcp6_ia_freep) DHCP6IA *ia = NULL; + DHCP6Option *option; + be32_t iaid; + int r; + + memcpy(&iaid, option1 + 4, sizeof(iaid)); + + option = (DHCP6Option*) option1; + assert_se(sizeof(option1) == sizeof(DHCP6Option) + be16toh(option->len)); + + r = dhcp6_option_parse_ia(NULL, 0, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r == -ENOANO); + + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r == -EINVAL); + + option->len = htobe16(17); + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r == -EBADMSG); + + option->len = htobe16(sizeof(DHCP6Option)); + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r == -EBADMSG); + + option = (DHCP6Option*) option2; + assert_se(sizeof(option2) == sizeof(DHCP6Option) + be16toh(option->len)); + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r == -ENODATA); + + option = (DHCP6Option*) option3; + assert_se(sizeof(option3) == sizeof(DHCP6Option) + be16toh(option->len)); + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r >= 0); + assert_se(ia); + assert_se(ia->addresses); + ia = dhcp6_ia_free(ia); + + option = (DHCP6Option*) option4; + assert_se(sizeof(option4) == sizeof(DHCP6Option) + be16toh(option->len)); + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r >= 0); + assert_se(ia); + assert_se(ia->addresses); + assert_se(memcmp(&ia->header.id, &option4[4], 4) == 0); + assert_se(memcmp(&ia->header.lifetime_t1, &option4[8], 4) == 0); + assert_se(memcmp(&ia->header.lifetime_t2, &option4[12], 4) == 0); + ia = dhcp6_ia_free(ia); + + 
option = (DHCP6Option*) option5; + assert_se(sizeof(option5) == sizeof(DHCP6Option) + be16toh(option->len)); + r = dhcp6_option_parse_ia(NULL, iaid, be16toh(option->code), be16toh(option->len), option->data, &ia); + assert_se(r >= 0); + assert_se(ia); + assert_se(ia->addresses); + ia = dhcp6_ia_free(ia); +} + +TEST(client_parse_message_issue_22099) { + static const uint8_t msg[] = { + /* Message type */ + DHCP6_MESSAGE_REPLY, + /* Transaction ID */ + 0x7c, 0x4c, 0x16, + /* Rapid commit */ + 0x00, SD_DHCP6_OPTION_RAPID_COMMIT, 0x00, 0x00, + /* NTP servers */ + 0x00, SD_DHCP6_OPTION_NTP_SERVER, 0x00, 0x14, + /* NTP server (broken sub option and sub option length) */ + 0x01, 0x00, 0x10, 0x00, + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x15, 0xc8, 0xff, 0xfe, 0xef, 0x1e, 0x4e, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + 0x00, 0x02, /* DUID-EN */ + 0x00, 0x00, 0xab, 0x11, /* pen */ + 0x5c, 0x6b, 0x90, 0xec, 0xda, 0x95, 0x15, 0x45, /* id */ + /* Server ID */ + 0x00, SD_DHCP6_OPTION_SERVERID, 0x00, 0x0a, + 0x00, 0x03, /* DUID-LL */ + 0x00, 0x01, /* htype */ + 0xdc, 0x15, 0xc8, 0xef, 0x1e, 0x4e, /* haddr */ + /* preference */ + 0x00, SD_DHCP6_OPTION_PREFERENCE, 0x00, 0x01, + 0x00, + /* DNS servers */ + 0x00, SD_DHCP6_OPTION_DNS_SERVER, 0x00, 0x10, + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x15, 0xc8, 0xff, 0xfe, 0xef, 0x1e, 0x4e, + /* v6 pcp server */ + 0x00, SD_DHCP6_OPTION_V6_PCP_SERVER, 0x00, 0x10, + 0x2a, 0x02, 0x81, 0x0d, 0x98, 0x80, 0x37, 0x00, 0xde, 0x15, 0xc8, 0xff, 0xfe, 0xef, 0x1e, 0x4e, + /* IA_NA */ + 0x00, SD_DHCP6_OPTION_IA_NA, 0x00, 0x28, + 0xcc, 0x59, 0x11, 0x7b, /* iaid */ + 0x00, 0x00, 0x07, 0x08, /* lifetime T1 */ + 0x00, 0x00, 0x0b, 0x40, /* lifetime T2 */ + /* IA_NA (iaaddr suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + 0x2a, 0x02, 0x81, 0x0d, 0x98, 0x80, 0x37, 0x00, 0x6a, 0x05, 0xca, 0xff, 0xfe, 0xf1, 0x51, 0x53, /* address */ + 0x00, 0x00, 0x0e, 0x10, /* preferred lifetime */ + 0x00, 0x00, 
0x1c, 0x20, /* valid lifetime */ + /* IA_PD */ + 0x00, SD_DHCP6_OPTION_IA_PD, 0x00, 0x29, + 0xcc, 0x59, 0x11, 0x7b, /* iaid */ + 0x00, 0x00, 0x07, 0x08, /* lifetime T1 */ + 0x00, 0x00, 0x0b, 0x40, /* lifetime T2 */ + /* IA_PD (iaprefix suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x0e, 0x10, /* preferred lifetime */ + 0x00, 0x00, 0x1c, 0x20, /* valid lifetime */ + 0x3a, /* prefixlen */ + 0x2a, 0x02, 0x81, 0x0d, 0x98, 0x80, 0x37, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* prefix */ + }; + static const uint8_t duid[] = { + 0x00, 0x00, 0xab, 0x11, 0x5c, 0x6b, 0x90, 0xec, 0xda, 0x95, 0x15, 0x45, + }; + _cleanup_(sd_dhcp6_client_unrefp) sd_dhcp6_client *client = NULL; + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease = NULL; + + assert_se(sd_dhcp6_client_new(&client) >= 0); + assert_se(sd_dhcp6_client_set_iaid(client, 0xcc59117b) >= 0); + assert_se(sd_dhcp6_client_set_duid_raw(client, 2, duid, sizeof(duid)) >= 0); + + assert_se(dhcp6_lease_new_from_message(client, (const DHCP6Message*) msg, sizeof(msg), NULL, NULL, &lease) >= 0); +} + +TEST(client_parse_message_issue_24002) { + static const uint8_t msg[] = { + /* Message Type */ + 0x07, + /* Transaction ID */ + 0x0e, 0xa5, 0x7c, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + 0x00, 0x02, /* DUID-EN */ + 0x00, 0x00, 0xab, 0x11, /* pen */ + 0x5c, 0x6b, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, /* id */ + /* Server ID */ + 0x00, 0x02, 0x00, 0x1a, + 0x00, 0x02, 0x00, 0x00, 0x05, 0x83, 0x30, 0x63, 0x3a, 0x38, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, + 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, + /* IA_PD */ + 0x00, 0x19, 0x00, 0x29, + 0xaa, 0xbb, 0xcc, 0xdd, /* iaid */ + 0x00, 0x00, 0x03, 0x84, /* lifetime (T1) */ + 0x00, 0x00, 0x05, 0xa0, /* lifetime (T2) */ + /* IA_PD (iaprefix suboption) */ + 0x00, 0x1a, 0x00, 0x19, + 0x00, 0x00, 0x07, 0x08, /* preferred lifetime */ + 0x00, 0x00, 0x38, 0x40, /* valid lifetime */ + 0x38, /* prefixlen */ + 0x20, 
0x03, 0x00, 0xff, 0xaa, 0xbb, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* prefix */ + /* Rapid commit */ + 0x00, 0x0e, 0x00, 0x00, + /* Trailing invalid byte at the end. See issue #28183. */ + 00, + }; + static const uint8_t duid[] = { + 0x00, 0x00, 0xab, 0x11, 0x5c, 0x6b, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, + }; + _cleanup_(sd_dhcp6_client_unrefp) sd_dhcp6_client *client = NULL; + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease = NULL; + + assert_se(sd_dhcp6_client_new(&client) >= 0); + assert_se(sd_dhcp6_client_set_iaid(client, 0xaabbccdd) >= 0); + assert_se(sd_dhcp6_client_set_duid_raw(client, 2, duid, sizeof(duid)) >= 0); + + assert_se(dhcp6_lease_new_from_message(client, (const DHCP6Message*) msg, sizeof(msg), NULL, NULL, &lease) >= 0); +} + +static const uint8_t msg_information_request[] = { + /* Message type */ + DHCP6_MESSAGE_INFORMATION_REQUEST, + /* Transaction ID */ + 0x0f, 0xb4, 0xe5, + /* MUD URL */ + /* ORO */ + 0x00, SD_DHCP6_OPTION_ORO, 0x00, 0x0c, + 0x00, SD_DHCP6_OPTION_DNS_SERVER, + 0x00, SD_DHCP6_OPTION_DOMAIN, + 0x00, SD_DHCP6_OPTION_SNTP_SERVER, + 0x00, SD_DHCP6_OPTION_INFORMATION_REFRESH_TIME, + 0x00, SD_DHCP6_OPTION_NTP_SERVER, + 0x00, SD_DHCP6_OPTION_INF_MAX_RT, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + CLIENT_ID_BYTES, + /* Extra options */ + /* Elapsed time */ + 0x00, SD_DHCP6_OPTION_ELAPSED_TIME, 0x00, 0x02, + 0x00, 0x00, +}; + +static const uint8_t msg_solicit[] = { + /* Message type */ + DHCP6_MESSAGE_SOLICIT, + /* Transaction ID */ + 0x0f, 0xb4, 0xe5, + /* Rapid commit */ + 0x00, SD_DHCP6_OPTION_RAPID_COMMIT, 0x00, 0x00, + /* IA_NA */ + 0x00, SD_DHCP6_OPTION_IA_NA, 0x00, 0x0c, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x00, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x00, /* lifetime T2 */ + /* IA_PD */ + 0x00, SD_DHCP6_OPTION_IA_PD, 0x00, 0x0c, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x00, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x00, /* lifetime T2 */ + /* Client FQDN */ + 0x00, 
SD_DHCP6_OPTION_CLIENT_FQDN, 0x00, 0x11, + DHCP6_FQDN_FLAG_S, + 0x04, 'h', 'o', 's', 't', 0x03, 'l', 'a', 'b', 0x05, 'i', 'n', 't', 'r', 'a', 0x00, + /* User Class */ + /* Vendor Class */ + /* Vendor Options */ + /* MUD URL */ + /* ORO */ + 0x00, SD_DHCP6_OPTION_ORO, 0x00, 0x0a, + 0x00, SD_DHCP6_OPTION_DNS_SERVER, + 0x00, SD_DHCP6_OPTION_DOMAIN, + 0x00, SD_DHCP6_OPTION_SNTP_SERVER, + 0x00, SD_DHCP6_OPTION_NTP_SERVER, + 0x00, SD_DHCP6_OPTION_SOL_MAX_RT, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + CLIENT_ID_BYTES, + /* Extra options */ + /* Elapsed time */ + 0x00, SD_DHCP6_OPTION_ELAPSED_TIME, 0x00, 0x02, + 0x00, 0x00, +}; + +static const uint8_t msg_request[] = { + /* Message type */ + DHCP6_MESSAGE_REQUEST, + /* Transaction ID */ + 0x00, 0x00, 0x00, + /* Server ID */ + 0x00, SD_DHCP6_OPTION_SERVERID, 0x00, 0x0e, + SERVER_ID_BYTES, + /* IA_NA */ + 0x00, SD_DHCP6_OPTION_IA_NA, 0x00, 0x44, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x00, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x00, /* lifetime T2 */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS1_BYTES, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS2_BYTES, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + /* IA_PD */ + 0x00, SD_DHCP6_OPTION_IA_PD, 0x00, 0x46, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x00, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x00, /* lifetime T2 */ + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX1_BYTES, + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid 
lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX2_BYTES, + /* Client FQDN */ + 0x00, SD_DHCP6_OPTION_CLIENT_FQDN, 0x00, 0x11, + DHCP6_FQDN_FLAG_S, + 0x04, 'h', 'o', 's', 't', 0x03, 'l', 'a', 'b', 0x05, 'i', 'n', 't', 'r', 'a', 0x00, + /* User Class */ + /* Vendor Class */ + /* Vendor Options */ + /* MUD URL */ + /* ORO */ + 0x00, SD_DHCP6_OPTION_ORO, 0x00, 0x08, + 0x00, SD_DHCP6_OPTION_DNS_SERVER, + 0x00, SD_DHCP6_OPTION_DOMAIN, + 0x00, SD_DHCP6_OPTION_SNTP_SERVER, + 0x00, SD_DHCP6_OPTION_NTP_SERVER, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + CLIENT_ID_BYTES, + /* Extra options */ + /* Elapsed time */ + 0x00, SD_DHCP6_OPTION_ELAPSED_TIME, 0x00, 0x02, + 0x00, 0x00, +}; + +/* RFC 3315 section 18.1.6. The DHCP6 Release message must include: + - transaction id + - server identifier + - client identifier + - all released IA with addresses included + - elapsed time (required for all messages). + All other options aren't required. */ +static const uint8_t msg_release[] = { + /* Message type */ + DHCP6_MESSAGE_RELEASE, + /* Transaction ID */ + 0x00, 0x00, 0x00, + /* Server ID */ + 0x00, SD_DHCP6_OPTION_SERVERID, 0x00, 0x0e, + SERVER_ID_BYTES, + /* IA_NA */ + 0x00, SD_DHCP6_OPTION_IA_NA, 0x00, 0x44, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x00, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x00, /* lifetime T2 */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS1_BYTES, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS2_BYTES, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + /* IA_PD */ + 0x00, SD_DHCP6_OPTION_IA_PD, 0x00, 0x46, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x00, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x00, /* lifetime T2 */ + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 
0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX1_BYTES, + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x00, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0x00, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX2_BYTES, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + CLIENT_ID_BYTES, + /* Extra options */ + /* Elapsed time */ + 0x00, SD_DHCP6_OPTION_ELAPSED_TIME, 0x00, 0x02, + 0x00, 0x00, +}; + +static const uint8_t msg_reply[] = { + /* Message type */ + DHCP6_MESSAGE_REPLY, + /* Transaction ID */ + 0x0f, 0xb4, 0xe5, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + CLIENT_ID_BYTES, + /* Server ID */ + 0x00, SD_DHCP6_OPTION_SERVERID, 0x00, 0x0e, + SERVER_ID_BYTES, + /* Rapid commit */ + 0x00, SD_DHCP6_OPTION_RAPID_COMMIT, 0x00, 0x01, + 0x00, + /* IA_NA */ + 0x00, SD_DHCP6_OPTION_IA_NA, 0x00, 0x66, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x50, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x78, /* lifetime T2 */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS2_BYTES, + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS1_BYTES, + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + /* IA_NA (status code suboption) */ + 0x00, SD_DHCP6_OPTION_STATUS_CODE, 0x00, 0x1e, + 0x00, 0x00, /* status code */ + 0x41, 0x6c, 0x6c, 0x20, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x65, 0x73, 0x20, 0x77, 0x65, + 0x72, 0x65, 0x20, 0x61, 0x73, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x2e, /* status message */ + /* IA_PD */ + 0x00, SD_DHCP6_OPTION_IA_PD, 0x00, 0x46, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x50, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x78, /* lifetime T2 */ + /* IA_PD (IA_PD_PREFIX suboption) */ 
+ 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX2_BYTES, + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX1_BYTES, + /* DNS servers */ + 0x00, SD_DHCP6_OPTION_DNS_SERVER, 0x00, 0x20, + DNS1_BYTES, + DNS2_BYTES, + /* SNTP servers */ + 0x00, SD_DHCP6_OPTION_SNTP_SERVER, 0x00, 0x20, + SNTP1_BYTES, + SNTP2_BYTES, + /* NTP servers */ + 0x00, SD_DHCP6_OPTION_NTP_SERVER, 0x00, 0x37, + /* NTP server (address suboption) */ + 0x00, DHCP6_NTP_SUBOPTION_SRV_ADDR, 0x00, 0x10, + NTP1_BYTES, + /* NTP server (address suboption) */ + 0x00, DHCP6_NTP_SUBOPTION_SRV_ADDR, 0x00, 0x10, + NTP2_BYTES, + /* NTP server (fqdn suboption) */ + 0x00, DHCP6_NTP_SUBOPTION_SRV_FQDN, 0x00, 0x0b, + 0x03, 'n', 't', 'p', 0x05, 'i', 'n', 't', 'r', 'a', 0x00, + /* Domain list */ + 0x00, SD_DHCP6_OPTION_DOMAIN, 0x00, 0x0b, + 0x03, 'l', 'a', 'b', 0x05, 'i', 'n', 't', 'r', 'a', 0x00, + /* Client FQDN */ + 0x00, SD_DHCP6_OPTION_CLIENT_FQDN, 0x00, 0x12, + 0x01, 0x06, 'c', 'l', 'i', 'e', 'n', 't', 0x03, 'l', 'a', 'b', 0x05, 'i', 'n', 't', 'r', 'a', + /* Vendor specific options */ + 0x00, SD_DHCP6_OPTION_VENDOR_OPTS, 0x00, 0x09, + 0x00, 0x00, 0x00, 0x20, 0x00, 0xf7, 0x00, 0x01, VENDOR_SUBOPTION_BYTES, +}; + +static const uint8_t msg_advertise[] = { + /* Message type */ + DHCP6_MESSAGE_ADVERTISE, + /* Transaction ID */ + 0x0f, 0xb4, 0xe5, + /* Client ID */ + 0x00, SD_DHCP6_OPTION_CLIENTID, 0x00, 0x0e, + CLIENT_ID_BYTES, + /* Server ID */ + 0x00, SD_DHCP6_OPTION_SERVERID, 0x00, 0x0e, + SERVER_ID_BYTES, + /* Preference */ + 0x00, SD_DHCP6_OPTION_PREFERENCE, 0x00, 0x01, + 0xff, + /* IA_NA */ + 0x00, SD_DHCP6_OPTION_IA_NA, 0x00, 0x7a, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x50, /* lifetime T1 */ + 0x00, 0x00, 
0x00, 0x78, /* lifetime T2 */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS2_BYTES, /* address */ + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + /* IA_NA (IAADDR suboption) */ + 0x00, SD_DHCP6_OPTION_IAADDR, 0x00, 0x18, + IA_NA_ADDRESS1_BYTES, /* address */ + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + /* IA_NA (status code suboption) */ + 0x00, SD_DHCP6_OPTION_STATUS_CODE, 0x00, 0x32, + 0x00, 0x00, /* status code */ + 0x41, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x28, 0x65, 0x73, 0x29, 0x20, 0x72, 0x65, 0x6e, 0x65, + 0x77, 0x65, 0x64, 0x2e, 0x20, 0x47, 0x72, 0x65, 0x65, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x66, + 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x6c, 0x61, 0x6e, 0x65, 0x74, 0x20, 0x45, 0x61, 0x72, 0x74, 0x68, /* status message */ + /* IA_PD */ + 0x00, SD_DHCP6_OPTION_IA_PD, 0x00, 0x46, + IA_ID_BYTES, + 0x00, 0x00, 0x00, 0x50, /* lifetime T1 */ + 0x00, 0x00, 0x00, 0x78, /* lifetime T2 */ + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX2_BYTES, + /* IA_PD (IA_PD_PREFIX suboption) */ + 0x00, SD_DHCP6_OPTION_IA_PD_PREFIX, 0x00, 0x19, + 0x00, 0x00, 0x00, 0x96, /* preferred lifetime */ + 0x00, 0x00, 0x00, 0xb4, /* valid lifetime */ + 0x40, /* prefixlen */ + IA_PD_PREFIX1_BYTES, + /* DNS servers */ + 0x00, SD_DHCP6_OPTION_DNS_SERVER, 0x00, 0x20, + DNS1_BYTES, + DNS2_BYTES, + /* SNTP servers */ + 0x00, SD_DHCP6_OPTION_SNTP_SERVER, 0x00, 0x20, + SNTP1_BYTES, + SNTP2_BYTES, + /* NTP servers */ + 0x00, SD_DHCP6_OPTION_NTP_SERVER, 0x00, 0x37, + /* NTP server (address suboption) */ + 0x00, DHCP6_NTP_SUBOPTION_SRV_ADDR, 0x00, 0x10, + NTP1_BYTES, + /* NTP server (address suboption) */ + 0x00, DHCP6_NTP_SUBOPTION_SRV_ADDR, 0x00, 0x10, + NTP2_BYTES, + /* NTP 
server (fqdn suboption) */ + 0x00, DHCP6_NTP_SUBOPTION_SRV_FQDN, 0x00, 0x0b, + 0x03, 'n', 't', 'p', 0x05, 'i', 'n', 't', 'r', 'a', 0x00, + /* Domain list */ + 0x00, SD_DHCP6_OPTION_DOMAIN, 0x00, 0x0b, + 0x03, 'l', 'a', 'b', 0x05, 'i', 'n', 't', 'r', 'a', 0x00, + /* Client FQDN */ + 0x00, SD_DHCP6_OPTION_CLIENT_FQDN, 0x00, 0x12, + 0x01, 0x06, 'c', 'l', 'i', 'e', 'n', 't', 0x03, 'l', 'a', 'b', 0x05, 'i', 'n', 't', 'r', 'a', + /* Vendor specific options */ + 0x00, SD_DHCP6_OPTION_VENDOR_OPTS, 0x00, 0x09, + 0x00, 0x00, 0x00, 0x20, 0x00, 0xf7, 0x00, 0x01, VENDOR_SUBOPTION_BYTES, +}; + +static void test_client_verify_information_request(const DHCP6Message *msg, size_t len) { + log_debug("/* %s */", __func__); + + assert_se(len == sizeof(msg_information_request)); + /* The elapsed time value is not deterministic. Skip it. */ + assert_se(memcmp(msg, msg_information_request, len - sizeof(be16_t)) == 0); +} + +static void test_client_verify_solicit(const DHCP6Message *msg, size_t len) { + log_debug("/* %s */", __func__); + + assert_se(len == sizeof(msg_solicit)); + /* The elapsed time value is not deterministic. Skip it. */ + assert_se(memcmp(msg, msg_solicit, len - sizeof(be16_t)) == 0); +} + +static void test_client_verify_release(const DHCP6Message *msg, size_t len) { + log_debug("/* %s */", __func__); + + assert_se(len == sizeof(msg_release)); + assert_se(msg->type == DHCP6_MESSAGE_RELEASE); + /* The transaction ID and elapsed time value are not deterministic. Skip them. */ + assert_se(memcmp(msg->options, msg_release + offsetof(DHCP6Message, options), + len - offsetof(DHCP6Message, options) - sizeof(be16_t)) == 0); +} + +static void test_client_verify_request(const DHCP6Message *msg, size_t len) { + log_debug("/* %s */", __func__); + + assert_se(len == sizeof(msg_request)); + assert_se(msg->type == DHCP6_MESSAGE_REQUEST); + /* The transaction ID and elapsed time value are not deterministic. Skip them. 
*/ + assert_se(memcmp(msg->options, msg_request + offsetof(DHCP6Message, options), + len - offsetof(DHCP6Message, options) - sizeof(be16_t)) == 0); +} + +static void test_lease_common(sd_dhcp6_client *client) { /* Checks the lease fields asserted for every reply: client ID, domain list, FQDN, DNS, NTP (addresses and FQDN), SNTP and the vendor suboption. */ + sd_dhcp6_lease *lease; + sd_dhcp6_option **suboption; + const struct in6_addr *addrs; + const char *str; + char **strv; + uint8_t *id; + size_t len; + + assert_se(sd_dhcp6_client_get_lease(client, &lease) >= 0); + + assert_se(dhcp6_lease_get_clientid(lease, &id, &len) >= 0); + assert_se(memcmp_nn(id, len, client_id, sizeof(client_id)) == 0); + + assert_se(sd_dhcp6_lease_get_domains(lease, &strv) == 1); + assert_se(streq(strv[0], "lab.intra")); + assert_se(!strv[1]); + + assert_se(sd_dhcp6_lease_get_fqdn(lease, &str) >= 0); + assert_se(streq(str, "client.lab.intra")); + + assert_se(sd_dhcp6_lease_get_dns(lease, &addrs) == 2); + assert_se(in6_addr_equal(&addrs[0], &dns1)); + assert_se(in6_addr_equal(&addrs[1], &dns2)); + + assert_se(sd_dhcp6_lease_get_ntp_addrs(lease, &addrs) == 2); + assert_se(in6_addr_equal(&addrs[0], &ntp1)); + assert_se(in6_addr_equal(&addrs[1], &ntp2)); + + assert_se(sd_dhcp6_lease_get_ntp_fqdn(lease, &strv) == 1); + assert_se(streq(strv[0], "ntp.intra")); + assert_se(!strv[1]); + + assert_se(lease->sntp_count == 2); + assert_se(in6_addr_equal(&lease->sntp[0], &sntp1)); + assert_se(in6_addr_equal(&lease->sntp[1], &sntp2)); + + assert_se(sd_dhcp6_lease_get_vendor_options(lease, &suboption) > 0); + assert_se((*suboption)->enterprise_identifier == vendor_suboption.enterprise_identifier); + assert_se((*suboption)->option == vendor_suboption.option); + assert_se(*(uint8_t*)(*suboption)->data == *(uint8_t*)vendor_suboption.data); +} + +static void test_lease_managed(sd_dhcp6_client *client) { /* Checks the server ID, the two IA_NA addresses and the two IA_PD prefixes (each with 150 s preferred / 180 s valid lifetime), then the common fields via test_lease_common(). */ + sd_dhcp6_lease *lease; + struct in6_addr addr; + usec_t lt_pref, lt_valid; + uint8_t *id, prefixlen; + size_t len; + + assert_se(sd_dhcp6_client_get_lease(client, &lease) >= 0); + + assert_se(dhcp6_lease_get_serverid(lease, &id, &len) >= 0); + 
assert_se(memcmp_nn(id, len, server_id, sizeof(server_id)) == 0); + + assert_se(sd_dhcp6_lease_has_address(lease)); + assert_se(sd_dhcp6_lease_has_pd_prefix(lease)); + + for (unsigned i = 0; i < 2; i++) { /* two passes, so the second one proves that the iterator reset calls rewind to the first entry */ + assert_se(sd_dhcp6_lease_address_iterator_reset(lease)); + assert_se(sd_dhcp6_lease_get_address(lease, &addr) >= 0); + assert_se(sd_dhcp6_lease_get_address_lifetime(lease, &lt_pref, &lt_valid) >= 0); + assert_se(in6_addr_equal(&addr, &ia_na_address1)); + assert_se(lt_pref == 150 * USEC_PER_SEC); + assert_se(lt_valid == 180 * USEC_PER_SEC); + assert_se(sd_dhcp6_lease_address_iterator_next(lease)); + assert_se(sd_dhcp6_lease_get_address(lease, &addr) >= 0); + assert_se(sd_dhcp6_lease_get_address_lifetime(lease, &lt_pref, &lt_valid) >= 0); + assert_se(in6_addr_equal(&addr, &ia_na_address2)); + assert_se(lt_pref == 150 * USEC_PER_SEC); + assert_se(lt_valid == 180 * USEC_PER_SEC); + assert_se(!sd_dhcp6_lease_address_iterator_next(lease)); + + assert_se(sd_dhcp6_lease_pd_iterator_reset(lease)); + assert_se(sd_dhcp6_lease_get_pd_prefix(lease, &addr, &prefixlen) >= 0); + assert_se(sd_dhcp6_lease_get_pd_lifetime(lease, &lt_pref, &lt_valid) >= 0); + assert_se(in6_addr_equal(&addr, &ia_pd_prefix1)); + assert_se(prefixlen == 64); + assert_se(lt_pref == 150 * USEC_PER_SEC); + assert_se(lt_valid == 180 * USEC_PER_SEC); + assert_se(sd_dhcp6_lease_pd_iterator_next(lease)); + assert_se(sd_dhcp6_lease_get_pd_prefix(lease, &addr, &prefixlen) >= 0); + assert_se(sd_dhcp6_lease_get_pd_lifetime(lease, &lt_pref, &lt_valid) >= 0); + assert_se(in6_addr_equal(&addr, &ia_pd_prefix2)); + assert_se(prefixlen == 64); + assert_se(lt_pref == 150 * USEC_PER_SEC); + assert_se(lt_valid == 180 * USEC_PER_SEC); + assert_se(!sd_dhcp6_lease_pd_iterator_next(lease)); + } + + test_lease_common(client); +} + +static void test_client_callback(sd_dhcp6_client *client, int event, void *userdata) { + switch (event) { + case SD_DHCP6_CLIENT_EVENT_STOP: + log_debug("/* %s (event=stop) */", __func__); + return; + + case 
SD_DHCP6_CLIENT_EVENT_INFORMATION_REQUEST: + log_debug("/* %s (event=information-request) */", __func__); + + assert_se(test_client_sent_message_count == 1); + + test_lease_common(client); + + assert_se(sd_dhcp6_client_set_information_request(client, false) >= 0); + assert_se(sd_dhcp6_client_start(client) >= 0); + assert_se(dhcp6_client_set_transaction_id(client, ((const DHCP6Message*) msg_advertise)->transaction_id) >= 0); + break; + + case SD_DHCP6_CLIENT_EVENT_IP_ACQUIRE: + log_debug("/* %s (event=ip-acquire) */", __func__); + + assert_se(IN_SET(test_client_sent_message_count, 3, 5)); + + test_lease_managed(client); + + switch (test_client_sent_message_count) { + case 3: + assert_se(sd_dhcp6_client_stop(client) >= 0); + assert_se(sd_dhcp6_client_start(client) >= 0); + assert_se(dhcp6_client_set_transaction_id(client, ((const DHCP6Message*) msg_reply)->transaction_id) >= 0); + break; + + case 5: + assert_se(sd_event_exit(sd_dhcp6_client_get_event(client), 0) >= 0); + break; + + default: + assert_not_reached(); + } + + break; + + case DHCP6_CLIENT_EVENT_TEST_ADVERTISED: { + sd_dhcp6_lease *lease; + uint8_t preference; + + log_debug("/* %s (event=test-advertised) */", __func__); + + assert_se(test_client_sent_message_count == 2); + + test_lease_managed(client); + + assert_se(sd_dhcp6_client_get_lease(client, &lease) >= 0); + assert_se(dhcp6_lease_get_preference(lease, &preference) >= 0); + assert_se(preference == 0xff); + + assert_se(dhcp6_client_set_transaction_id(client, ((const DHCP6Message*) msg_reply)->transaction_id) >= 0); + break; + } + default: + assert_not_reached(); + } +} + +int dhcp6_network_send_udp_socket(int s, struct in6_addr *a, const void *packet, size_t len) { + log_debug("/* %s(count=%u) */", __func__, test_client_sent_message_count); + + assert_se(a); + assert_se(in6_addr_equal(a, &mcast_address)); + assert_se(packet); + assert_se(len >= sizeof(DHCP6Message)); + + switch (test_client_sent_message_count) { + case 0: + 
test_client_verify_information_request(packet, len); + assert_se(write(test_fd[1], msg_reply, sizeof(msg_reply)) == sizeof(msg_reply)); + break; + + case 1: + test_client_verify_solicit(packet, len); + assert_se(write(test_fd[1], msg_advertise, sizeof(msg_advertise)) == sizeof(msg_advertise)); + break; + + case 2: + test_client_callback(client_ref, DHCP6_CLIENT_EVENT_TEST_ADVERTISED, NULL); + test_client_verify_request(packet, len); + assert_se(write(test_fd[1], msg_reply, sizeof(msg_reply)) == sizeof(msg_reply)); + break; + + case 3: + test_client_verify_release(packet, len); + /* when stopping, dhcp6 client doesn't wait for release server reply */ + assert_se(write(test_fd[1], msg_reply, sizeof(msg_reply)) == sizeof(msg_reply)); + break; + + case 4: + test_client_verify_solicit(packet, len); + assert_se(write(test_fd[1], msg_reply, sizeof(msg_reply)) == sizeof(msg_reply)); + break; + + default: + assert_not_reached(); + } + + test_client_sent_message_count++; + return len; +} + +int dhcp6_network_bind_udp_socket(int ifindex, struct in6_addr *a) { + assert_se(ifindex == test_ifindex); + assert_se(a); + assert_se(in6_addr_equal(a, &local_address)); + + assert_se(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) >= 0); + return TAKE_FD(test_fd[0]); +} + +TEST(dhcp6_client) { + _cleanup_(sd_dhcp6_client_unrefp) sd_dhcp6_client *client = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + + assert_se(sd_event_new(&e) >= 0); + assert_se(sd_event_add_time_relative(e, NULL, CLOCK_BOOTTIME, + 2 * USEC_PER_SEC, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)) >= 0); + + assert_se(sd_dhcp6_client_new(&client) >= 0); + assert_se(sd_dhcp6_client_attach_event(client, e, 0) >= 0); + assert_se(sd_dhcp6_client_set_ifindex(client, test_ifindex) == 0); + assert_se(sd_dhcp6_client_set_local_address(client, &local_address) >= 0); + assert_se(sd_dhcp6_client_set_fqdn(client, "host.lab.intra") >= 0); + assert_se(sd_dhcp6_client_set_iaid(client, 
unaligned_read_be32((uint8_t[]) { IA_ID_BYTES })) >= 0); + assert_se(sd_dhcp6_client_set_send_release(client, true) >= 0); + + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_DNS_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_DOMAIN) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_NTP_SERVER) >= 0); + assert_se(sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_SNTP_SERVER) >= 0); + + assert_se(sd_dhcp6_client_set_information_request(client, true) >= 0); + assert_se(sd_dhcp6_client_set_callback(client, test_client_callback, NULL) >= 0); + + assert_se(sd_dhcp6_client_start(client) >= 0); + + assert_se(dhcp6_client_set_transaction_id(client, ((const DHCP6Message*) msg_reply)->transaction_id) >= 0); + + assert_se(client_ref = sd_dhcp6_client_ref(client)); + + assert_se(sd_event_loop(e) >= 0); + + assert_se(test_client_sent_message_count == 5); + + assert_se(!sd_dhcp6_client_unref(client_ref)); + test_fd[1] = safe_close(test_fd[1]); +} + +static int intro(void) { + assert_se(setenv("SYSTEMD_NETWORK_TEST_MODE", "1", 1) >= 0); + return 0; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/libsystemd-network/test-ipv4ll-manual.c b/src/libsystemd-network/test-ipv4ll-manual.c new file mode 100644 index 0000000..5dc6b10 --- /dev/null +++ b/src/libsystemd-network/test-ipv4ll-manual.c @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-event.h" +#include "sd-ipv4ll.h" +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "in-addr-util.h" +#include "parse-util.h" +#include "string-util.h" +#include "tests.h" + +static void ll_handler(sd_ipv4ll *ll, int event, void *userdata) { + assert_se(ll); + + struct in_addr addr; + const char *pretty = sd_ipv4ll_get_address(ll, &addr) >= 0 ? 
IN4_ADDR_TO_STRING(&addr) : NULL; + + switch (event) { + case SD_IPV4LL_EVENT_BIND: + log_info("bound %s", strna(pretty)); + break; + case SD_IPV4LL_EVENT_CONFLICT: + log_info("conflict on %s", strna(pretty)); + break; + case SD_IPV4LL_EVENT_STOP: + log_error("the client was stopped with address %s", strna(pretty)); + break; + default: + assert_not_reached(); + } +} + +static int client_run(int ifindex, const char *seed_str, const struct in_addr *start_address, const struct ether_addr *ha, sd_event *e) { + sd_ipv4ll *ll; + + assert_se(sd_ipv4ll_new(&ll) >= 0); + assert_se(sd_ipv4ll_attach_event(ll, e, 0) >= 0); + + assert_se(sd_ipv4ll_set_ifindex(ll, ifindex) >= 0); + assert_se(sd_ipv4ll_set_mac(ll, ha) >= 0); + assert_se(sd_ipv4ll_set_callback(ll, ll_handler, NULL) >= 0); + + if (seed_str) { + unsigned seed; + + assert_se(safe_atou(seed_str, &seed) >= 0); + + assert_se(sd_ipv4ll_set_address_seed(ll, seed) >= 0); + } + + if (start_address && in4_addr_is_set(start_address)) + assert_se(sd_ipv4ll_set_address(ll, start_address) >= 0); + + log_info("starting IPv4LL client"); + + assert_se(sd_ipv4ll_start(ll) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + assert_se(!sd_ipv4ll_unref(ll)); + + return EXIT_SUCCESS; +} + +static int test_ll(const char *ifname, const char *seed, const struct in_addr *start_address) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL; + struct ether_addr ha; + int ifindex; + + assert_se(sd_event_new(&e) >= 0); + + assert_se(sd_netlink_open(&rtnl) >= 0); + assert_se(sd_netlink_attach_event(rtnl, e, 0) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, 0) >= 0); + assert_se(sd_netlink_message_append_string(m, IFLA_IFNAME, ifname) >= 0); + assert_se(sd_netlink_call(rtnl, m, 0, &reply) >= 0); + + assert_se(sd_rtnl_message_link_get_ifindex(reply, &ifindex) >= 0); + 
assert_se(sd_netlink_message_read_ether_addr(reply, IFLA_ADDRESS, &ha) >= 0); + + client_run(ifindex, seed, start_address, &ha, e); + + return EXIT_SUCCESS; +} + +int main(int argc, char *argv[]) { + test_setup_logging(LOG_DEBUG); + + if (argc == 2) + return test_ll(argv[1], NULL, NULL); + else if (argc == 3) { + int r; + union in_addr_union a; + + r = in_addr_from_string(AF_INET, argv[2], &a); + if (r < 0) + return test_ll(argv[1], argv[2], NULL); + else + return test_ll(argv[1], NULL, &a.in); + } else { + log_error("This program takes one or two arguments.\n" + "\t %s [|]", program_invocation_short_name); + return EXIT_FAILURE; + } +} diff --git a/src/libsystemd-network/test-ipv4ll.c b/src/libsystemd-network/test-ipv4ll.c new file mode 100644 index 0000000..bb42930 --- /dev/null +++ b/src/libsystemd-network/test-ipv4ll.c @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Axis Communications AB. All rights reserved. +***/ + +#include +#include +#include +#include +#include +#include + +#include "sd-ipv4ll.h" + +#include "arp-util.h" +#include "fd-util.h" +#include "socket-util.h" +#include "tests.h" + +static bool verbose = false; +static bool extended = false; +static int test_fd[2]; + +static int basic_request_handler_bind = 0; +static int basic_request_handler_stop = 0; +static void* basic_request_handler_userdata = (void*) 0xCABCAB; + +static void basic_request_handler(sd_ipv4ll *ll, int event, void *userdata) { + assert_se(userdata == basic_request_handler_userdata); + + switch (event) { + case SD_IPV4LL_EVENT_STOP: + basic_request_handler_stop = 1; + break; + case SD_IPV4LL_EVENT_BIND: + basic_request_handler_bind = 1; + break; + default: + assert_se(0); + break; + } +} + +int arp_send_packet( + int fd, + int ifindex, + const struct in_addr *pa, + const struct ether_addr *ha, + bool announce) { + + struct ether_arp ea = {}; + + assert_se(fd >= 0); + assert_se(ifindex > 0); + assert_se(pa); + assert_se(ha); + + if 
(send(fd, &ea, sizeof(struct ether_arp), 0) < 0) + return -errno; + + return 0; +} + +int arp_update_filter(int fd, const struct in_addr *a, const struct ether_addr *eth_mac) { + return 0; +} + +int arp_network_bind_raw_socket(int ifindex, const struct in_addr *a, const struct ether_addr *eth_mac) { + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) < 0) + return -errno; + + return test_fd[0]; +} + +static void test_public_api_setters(sd_event *e) { + struct in_addr address = {}; + uint64_t seed = 0; + sd_ipv4ll *ll; + struct ether_addr mac_addr = { + .ether_addr_octet = {'A', 'B', 'C', '1', '2', '3'}}; + + if (verbose) + printf("* %s\n", __func__); + + assert_se(sd_ipv4ll_new(&ll) == 0); + assert_se(ll); + + assert_se(sd_ipv4ll_attach_event(NULL, NULL, 0) == -EINVAL); + assert_se(sd_ipv4ll_attach_event(ll, e, 0) == 0); + assert_se(sd_ipv4ll_attach_event(ll, e, 0) == -EBUSY); + + assert_se(sd_ipv4ll_set_callback(NULL, NULL, NULL) == -EINVAL); + assert_se(sd_ipv4ll_set_callback(ll, NULL, NULL) == 0); + + assert_se(sd_ipv4ll_set_address(ll, &address) == -EINVAL); + address.s_addr |= htobe32(169U << 24 | 254U << 16); + assert_se(sd_ipv4ll_set_address(ll, &address) == -EINVAL); + address.s_addr |= htobe32(0x00FF); + assert_se(sd_ipv4ll_set_address(ll, &address) == -EINVAL); + address.s_addr |= htobe32(0xF000); + assert_se(sd_ipv4ll_set_address(ll, &address) == 0); + address.s_addr |= htobe32(0x0F00); + assert_se(sd_ipv4ll_set_address(ll, &address) == -EINVAL); + + assert_se(sd_ipv4ll_set_address_seed(NULL, seed) == -EINVAL); + assert_se(sd_ipv4ll_set_address_seed(ll, seed) == 0); + + assert_se(sd_ipv4ll_set_mac(NULL, NULL) == -EINVAL); + assert_se(sd_ipv4ll_set_mac(ll, NULL) == -EINVAL); + assert_se(sd_ipv4ll_set_mac(ll, &mac_addr) == 0); + + assert_se(sd_ipv4ll_set_ifindex(NULL, -1) == -EINVAL); + assert_se(sd_ipv4ll_set_ifindex(ll, -1) == -EINVAL); + assert_se(sd_ipv4ll_set_ifindex(ll, -99) == -EINVAL); + 
assert_se(sd_ipv4ll_set_ifindex(ll, 1) == 0); + + assert_se(sd_ipv4ll_ref(ll) == ll); + assert_se(sd_ipv4ll_unref(ll) == NULL); + + /* Cleanup */ + assert_se(sd_ipv4ll_unref(ll) == NULL); +} + +static void test_basic_request(sd_event *e, const struct in_addr *start_address) { + + sd_ipv4ll *ll; + struct ether_arp arp; + struct ether_addr mac_addr = { + .ether_addr_octet = {'A', 'B', 'C', '1', '2', '3'}}; + + if (verbose) + printf("* %s\n", __func__); + + assert_se(sd_ipv4ll_new(&ll) == 0); + if (in4_addr_is_set(start_address)) + assert_se(sd_ipv4ll_set_address(ll, start_address) >= 0); + assert_se(sd_ipv4ll_start(ll) == -EINVAL); + + assert_se(sd_ipv4ll_attach_event(ll, e, 0) == 0); + assert_se(sd_ipv4ll_start(ll) == -EINVAL); + + assert_se(sd_ipv4ll_set_mac(ll, &mac_addr) == 0); + assert_se(sd_ipv4ll_start(ll) == -EINVAL); + + assert_se(sd_ipv4ll_set_callback(ll, basic_request_handler, + basic_request_handler_userdata) == 0); + assert_se(sd_ipv4ll_start(ll) == -EINVAL); + + assert_se(sd_ipv4ll_set_ifindex(ll, 1) == 0); + assert_se(sd_ipv4ll_start(ll) == 1); + + sd_event_run(e, UINT64_MAX); + assert_se(sd_ipv4ll_start(ll) == 0); + + assert_se(sd_ipv4ll_is_running(ll)); + + /* PROBE */ + sd_event_run(e, UINT64_MAX); + assert_se(recv(test_fd[1], &arp, sizeof(struct ether_arp), 0) == sizeof(struct ether_arp)); + + if (extended) { + /* PROBE */ + sd_event_run(e, UINT64_MAX); + assert_se(recv(test_fd[1], &arp, sizeof(struct ether_arp), 0) == sizeof(struct ether_arp)); + + /* PROBE */ + sd_event_run(e, UINT64_MAX); + assert_se(recv(test_fd[1], &arp, sizeof(struct ether_arp), 0) == sizeof(struct ether_arp)); + + sd_event_run(e, UINT64_MAX); + assert_se(basic_request_handler_bind == 1); + + if (in4_addr_is_set(start_address)) { + struct in_addr address; + + assert_se(sd_ipv4ll_get_address(ll, &address) >= 0); + assert_se(start_address->s_addr == address.s_addr); + } + } + + sd_ipv4ll_stop(ll); + assert_se(basic_request_handler_stop == 1); + + /* Cleanup */ + 
assert_se(sd_ipv4ll_unref(ll) == NULL); + safe_close(test_fd[1]); +} + +int main(int argc, char *argv[]) { + struct in_addr start_address = {}; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_event_new(&e) >= 0); + + test_public_api_setters(e); + test_basic_request(e, &start_address); + + basic_request_handler_bind = 0; + basic_request_handler_stop = 0; + start_address.s_addr = htobe32(169U << 24 | 254U << 16 | 1U << 8 | 2U); + test_basic_request(e, &start_address); + + return 0; +} diff --git a/src/libsystemd-network/test-lldp-rx.c b/src/libsystemd-network/test-lldp-rx.c new file mode 100644 index 0000000..feb53b5 --- /dev/null +++ b/src/libsystemd-network/test-lldp-rx.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-event.h" +#include "sd-lldp-rx.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "lldp-network.h" +#include "macro.h" +#include "string-util.h" +#include "tests.h" + +#define TEST_LLDP_PORT "em1" +#define TEST_LLDP_TYPE_SYSTEM_NAME "systemd-lldp" +#define TEST_LLDP_TYPE_SYSTEM_DESC "systemd-lldp-desc" + +static int test_fd[2] = EBADF_PAIR; +static int lldp_rx_handler_calls; + +int lldp_network_bind_raw_socket(int ifindex) { + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) < 0) + return -errno; + + return test_fd[0]; +} + +static void lldp_rx_handler(sd_lldp_rx *lldp_rx, sd_lldp_rx_event_t event, sd_lldp_neighbor *n, void *userdata) { + lldp_rx_handler_calls++; +} + +static int start_lldp_rx(sd_lldp_rx **lldp_rx, sd_event *e, sd_lldp_rx_callback_t cb, void *cb_data) { + int r; + + r = sd_lldp_rx_new(lldp_rx); + if (r < 0) + return r; + + r = sd_lldp_rx_set_ifindex(*lldp_rx, 42); + if (r < 0) + return r; + + r = sd_lldp_rx_set_callback(*lldp_rx, cb, cb_data); + if (r < 0) + return r; + + r = sd_lldp_rx_attach_event(*lldp_rx, e, 0); + if (r < 0) + return r; + + r = 
sd_lldp_rx_start(*lldp_rx); + if (r < 0) + return r; + + return 0; +} + +static int stop_lldp_rx(sd_lldp_rx *lldp_rx) { + int r; + + r = sd_lldp_rx_stop(lldp_rx); + if (r < 0) + return r; + + r = sd_lldp_rx_detach_event(lldp_rx); + if (r < 0) + return r; + + sd_lldp_rx_unref(lldp_rx); + safe_close(test_fd[1]); + + return 0; +} + +static void test_receive_basic_packet(sd_event *e) { + + static const uint8_t frame[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x07, 0x04, 0x00, 0x01, 0x02, /* Chassis: MAC, 00:01:02:03:04:05 */ + 0x03, 0x04, 0x05, + 0x04, 0x04, 0x05, 0x31, 0x2f, 0x33, /* Port: interface name, "1/3" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + /* LLDP optional TLVs */ + 0x08, 0x04, 0x50, 0x6f, 0x72, 0x74, /* Port Description: "Port" */ + 0x0a, 0x03, 0x53, 0x59, 0x53, /* System Name: "SYS" */ + 0x0c, 0x04, 0x66, 0x6f, 0x6f, 0x00, /* System Description: "foo" (NULL-terminated) */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + + sd_lldp_rx *lldp_rx; + sd_lldp_neighbor **neighbors; + uint8_t type; + const void *data; + uint16_t ttl; + size_t length; + const char *str; + + lldp_rx_handler_calls = 0; + assert_se(start_lldp_rx(&lldp_rx, e, lldp_rx_handler, NULL) == 0); + + assert_se(write(test_fd[1], frame, sizeof(frame)) == sizeof(frame)); + sd_event_run(e, 0); + assert_se(lldp_rx_handler_calls == 1); + assert_se(sd_lldp_rx_get_neighbors(lldp_rx, &neighbors) == 1); + + assert_se(sd_lldp_neighbor_get_chassis_id(neighbors[0], &type, &data, &length) == 0); + assert_se(type == SD_LLDP_CHASSIS_SUBTYPE_MAC_ADDRESS); + assert_se(length == ETH_ALEN); + assert_se(!memcmp(data, "\x00\x01\x02\x03\x04\x05", ETH_ALEN)); + + assert_se(sd_lldp_neighbor_get_port_id(neighbors[0], &type, &data, &length) == 0); + assert_se(type == SD_LLDP_PORT_SUBTYPE_INTERFACE_NAME); + assert_se(length == 3); + 
assert_se(!memcmp(data, "1/3", 3)); + + assert_se(sd_lldp_neighbor_get_port_description(neighbors[0], &str) == 0); + assert_se(streq(str, "Port")); + + assert_se(sd_lldp_neighbor_get_system_name(neighbors[0], &str) == 0); + assert_se(streq(str, "SYS")); + + assert_se(sd_lldp_neighbor_get_system_description(neighbors[0], &str) == 0); + assert_se(streq(str, "foo")); + + assert_se(sd_lldp_neighbor_get_ttl(neighbors[0], &ttl) == 0); + assert_se(ttl == 120); + + sd_lldp_neighbor_unref(neighbors[0]); + free(neighbors); + + assert_se(stop_lldp_rx(lldp_rx) == 0); +} + +static void test_receive_incomplete_packet(sd_event *e) { + sd_lldp_rx *lldp_rx; + sd_lldp_neighbor **neighbors; + uint8_t frame[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x07, 0x04, 0x00, 0x01, 0x02, /* Chassis: MAC, 00:01:02:03:04:05 */ + 0x03, 0x04, 0x05, + 0x04, 0x04, 0x05, 0x31, 0x2f, 0x33, /* Port: interface name, "1/3" */ + /* Missing TTL */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + + lldp_rx_handler_calls = 0; + assert_se(start_lldp_rx(&lldp_rx, e, lldp_rx_handler, NULL) == 0); + + assert_se(write(test_fd[1], frame, sizeof(frame)) == sizeof(frame)); + sd_event_run(e, 0); + assert_se(lldp_rx_handler_calls == 0); + assert_se(sd_lldp_rx_get_neighbors(lldp_rx, &neighbors) == 0); + + assert_se(stop_lldp_rx(lldp_rx) == 0); +} + +static void test_receive_oui_packet(sd_event *e) { + sd_lldp_rx *lldp_rx; + sd_lldp_neighbor **neighbors; + uint8_t frame[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x07, 0x04, 0x00, 0x01, 0x02, /* Chassis: MAC, 00:01:02:03:04:05 */ + 0x03, 0x04, 0x05, + 0x04, 0x04, 0x05, 0x31, 0x2f, 0x33, /* Port TLV: interface name, "1/3" */ + 0x06, 0x02, 0x00, 0x78, 
/* TTL: 120 seconds */ + /* LLDP optional TLVs */ + 0xfe, 0x06, 0x00, 0x80, 0xc2, 0x01, /* Port VLAN ID: 0x1234 */ + 0x12, 0x34, + 0xfe, 0x07, 0x00, 0x80, 0xc2, 0x02, /* Port and protocol: flag 1, PPVID 0x7788 */ + 0x01, 0x77, 0x88, + 0xfe, 0x0d, 0x00, 0x80, 0xc2, 0x03, /* VLAN Name: ID 0x1234, name "Vlan51" */ + 0x12, 0x34, 0x06, 0x56, 0x6c, 0x61, + 0x6e, 0x35, 0x31, + 0xfe, 0x06, 0x00, 0x80, 0xc2, 0x06, /* Management VID: 0x0102 */ + 0x01, 0x02, + 0xfe, 0x09, 0x00, 0x80, 0xc2, 0x07, /* Link aggregation: status 1, ID 0x00140012 */ + 0x01, 0x00, 0x14, 0x00, 0x12, + 0xfe, 0x07, 0x00, 0x12, 0x0f, 0x02, /* 802.3 Power via MDI: PSE, MDI enabled */ + 0x07, 0x01, 0x00, + 0x00, 0x00 /* End of LLDPDU */ + }; + + lldp_rx_handler_calls = 0; + assert_se(start_lldp_rx(&lldp_rx, e, lldp_rx_handler, NULL) == 0); + + assert_se(write(test_fd[1], frame, sizeof(frame)) == sizeof(frame)); + sd_event_run(e, 0); + assert_se(lldp_rx_handler_calls == 1); + assert_se(sd_lldp_rx_get_neighbors(lldp_rx, &neighbors) == 1); + + assert_se(sd_lldp_neighbor_tlv_rewind(neighbors[0]) >= 0); + assert_se(sd_lldp_neighbor_tlv_is_type(neighbors[0], SD_LLDP_TYPE_CHASSIS_ID) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_type(neighbors[0], SD_LLDP_TYPE_PORT_ID) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_type(neighbors[0], SD_LLDP_TYPE_TTL) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_oui(neighbors[0], SD_LLDP_OUI_802_1, SD_LLDP_OUI_802_1_SUBTYPE_PORT_VLAN_ID) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_oui(neighbors[0], SD_LLDP_OUI_802_1, SD_LLDP_OUI_802_1_SUBTYPE_PORT_PROTOCOL_VLAN_ID) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_oui(neighbors[0], SD_LLDP_OUI_802_1, SD_LLDP_OUI_802_1_SUBTYPE_VLAN_NAME) > 0); + 
assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_oui(neighbors[0], SD_LLDP_OUI_802_1, SD_LLDP_OUI_802_1_SUBTYPE_MANAGEMENT_VID) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_oui(neighbors[0], SD_LLDP_OUI_802_1, SD_LLDP_OUI_802_1_SUBTYPE_LINK_AGGREGATION) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_oui(neighbors[0], SD_LLDP_OUI_802_3, SD_LLDP_OUI_802_3_SUBTYPE_POWER_VIA_MDI) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) > 0); + assert_se(sd_lldp_neighbor_tlv_is_type(neighbors[0], SD_LLDP_TYPE_END) > 0); + assert_se(sd_lldp_neighbor_tlv_next(neighbors[0]) == 0); + + sd_lldp_neighbor_unref(neighbors[0]); + free(neighbors); + + assert_se(stop_lldp_rx(lldp_rx) == 0); +} + +static void test_multiple_neighbors_sorted(sd_event *e) { /* Injects six frames that differ only in chassis ID and port ID, then checks that sd_lldp_rx_get_neighbors() returns them in the order given by expected[]. */ + + static const uint8_t frame1[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x04, 0x01, '1', '/', '2', /* Chassis component: "1/2" */ + 0x04, 0x04, 0x02, '2', '/', '3', /* Port component: "2/3" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + static const uint8_t frame2[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x04, 0x01, '2', '/', '1', /* Chassis component: "2/1" */ + 0x04, 0x04, 0x02, '1', '/', '3', /* Port component: "1/3" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + static const uint8_t frame3[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* 
Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x05, 0x01, '2', '/', '1', '0', /* Chassis component: "2/10" */ + 0x04, 0x04, 0x02, '1', '/', '0', /* Port component: "1/0" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + static const uint8_t frame4[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x05, 0x01, '2', '/', '1', '9', /* Chassis component: "2/19" */ + 0x04, 0x04, 0x02, '1', '/', '0', /* Port component: "1/0" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + static const uint8_t frame5[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x04, 0x01, '1', '/', '2', /* Chassis component: "1/2" */ + 0x04, 0x05, 0x02, '2', '/', '1', '0', /* Port component: "2/10" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + static const uint8_t frame6[] = { + /* Ethernet header */ + 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03, /* Destination MAC */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* Source MAC */ + 0x88, 0xcc, /* Ethertype */ + /* LLDP mandatory TLVs */ + 0x02, 0x04, 0x01, '1', '/', '2', /* Chassis component: "1/2" */ + 0x04, 0x05, 0x02, '2', '/', '3', '9', /* Port component: "2/39" */ + 0x06, 0x02, 0x00, 0x78, /* TTL: 120 seconds */ + 0x00, 0x00 /* End Of LLDPDU */ + }; + static const char* expected[] = { + /* ordered pairs of Chassis+Port */ + "1/2", "2/10", + "1/2", "2/3", + "1/2", "2/39", + "2/1", "1/3", + "2/10", "1/0", + "2/19", "1/0", + }; + + sd_lldp_rx *lldp_rx; + sd_lldp_neighbor **neighbors; + int i; + uint8_t type; + const void *data; + size_t length, expected_length; + uint16_t ttl; + + lldp_rx_handler_calls = 0; + 
assert_se(start_lldp_rx(&lldp_rx, e, lldp_rx_handler, NULL) == 0); + + assert_se(write(test_fd[1], frame1, sizeof(frame1)) == sizeof(frame1)); + sd_event_run(e, 0); + assert_se(write(test_fd[1], frame2, sizeof(frame2)) == sizeof(frame2)); + sd_event_run(e, 0); + assert_se(write(test_fd[1], frame3, sizeof(frame3)) == sizeof(frame3)); + sd_event_run(e, 0); + assert_se(write(test_fd[1], frame4, sizeof(frame4)) == sizeof(frame4)); + sd_event_run(e, 0); + assert_se(write(test_fd[1], frame5, sizeof(frame5)) == sizeof(frame5)); + sd_event_run(e, 0); + assert_se(write(test_fd[1], frame6, sizeof(frame6)) == sizeof(frame6)); + sd_event_run(e, 0); + assert_se(lldp_rx_handler_calls == 6); /* one callback per injected frame */ + + assert_se(sd_lldp_rx_get_neighbors(lldp_rx, &neighbors) == 6); + + for (i = 0; i < 6; i++) { /* expected[2 * i] is the chassis ID and expected[2 * i + 1] the port ID of the i-th returned neighbor */ + assert_se(sd_lldp_neighbor_get_chassis_id(neighbors[i], &type, &data, &length) == 0); + assert_se(type == SD_LLDP_CHASSIS_SUBTYPE_CHASSIS_COMPONENT); + expected_length = strlen(expected[2 * i]); + assert_se(length == expected_length); + assert_se(memcmp(data, expected[2 * i], expected_length) == 0); + + assert_se(sd_lldp_neighbor_get_port_id(neighbors[i], &type, &data, &length) == 0); + assert_se(type == SD_LLDP_PORT_SUBTYPE_PORT_COMPONENT); + expected_length = strlen(expected[2 * i + 1]); + assert_se(length == expected_length); + assert_se(memcmp(data, expected[2 * i + 1], expected_length) == 0); + + assert_se(sd_lldp_neighbor_get_ttl(neighbors[i], &ttl) == 0); + assert_se(ttl == 120); + } + + for (i = 0; i < 6; i++) + sd_lldp_neighbor_unref(neighbors[i]); + free(neighbors); + + assert_se(stop_lldp_rx(lldp_rx) == 0); +} + +int main(int argc, char *argv[]) { /* Runs the four LLDP reception tests in sequence on one shared event loop. */ + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + + test_setup_logging(LOG_DEBUG); + + /* LLDP reception tests */ + assert_se(sd_event_new(&e) == 0); + test_receive_basic_packet(e); + test_receive_incomplete_packet(e); + test_receive_oui_packet(e); + test_multiple_neighbors_sorted(e); + + return 0; +} diff --git 
a/src/libsystemd-network/test-ndisc-ra.c b/src/libsystemd-network/test-ndisc-ra.c new file mode 100644 index 0000000..23abe78 --- /dev/null +++ b/src/libsystemd-network/test-ndisc-ra.c @@ -0,0 +1,376 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2017 Intel Corporation. All rights reserved. +***/ + +#include +#include +#include + +#include "sd-radv.h" + +#include "alloc-util.h" +#include "hexdecoct.h" +#include "icmp6-util-unix.h" +#include "socket-util.h" +#include "strv.h" +#include "tests.h" + +static struct ether_addr mac_addr = { + .ether_addr_octet = { 0x78, 0x2b, 0xcb, 0xb3, 0x6d, 0x53 } +}; + +static uint8_t advertisement[] = { + /* ICMPv6 Router Advertisement, no checksum */ + 0x86, 0x00, 0x00, 0x00, 0x40, 0xc0, 0x00, 0xb4, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* Source Link Layer Address Option */ + 0x01, 0x01, 0x78, 0x2b, 0xcb, 0xb3, 0x6d, 0x53, + /* Prefix Information Option */ + 0x03, 0x04, 0x40, 0xc0, 0x00, 0x00, 0x01, 0xf4, + 0x00, 0x00, 0x01, 0xb8, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* Prefix Information Option */ + 0x03, 0x04, 0x40, 0xc0, 0x00, 0x00, 0x0e, 0x10, + 0x00, 0x00, 0x07, 0x08, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x0d, 0xb8, 0x0b, 0x16, 0xd0, 0x0d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* Prefix Information Option */ + 0x03, 0x04, 0x30, 0xc0, 0x00, 0x00, 0x0e, 0x10, + 0x00, 0x00, 0x07, 0x08, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x0d, 0xb8, 0xc0, 0x01, 0x0d, 0xad, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* Recursive DNS Server Option */ + 0x19, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + /* DNS Search List Option */ + 0x1f, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, + 0x03, 0x6c, 0x61, 0x62, 0x05, 0x69, 0x6e, 0x74, + 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static bool 
test_stopped; +static struct { + struct in6_addr address; + unsigned char prefixlen; + uint32_t valid; + uint32_t preferred; + bool successful; +} prefix[] = { + { { { { 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 64, + 500, 440, true }, + { { { { 0x20, 0x01, 0x0d, 0xb8, 0x0b, 0x16, 0xd0, 0x0d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 64, + /* indicate default valid and preferred lifetimes for the test code */ + 0, 0, true }, + { { { { 0x20, 0x01, 0x0d, 0xb8, 0x0b, 0x16, 0xd0, 0x0d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 58, + 0, 0, + /* indicate that this prefix already exists */ + false }, + { { { { 0x20, 0x01, 0x0d, 0xb8, 0x0b, 0x16, 0xd0, 0x0d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 120, + 0, 0, + /* indicate that this prefix already exists */ + false }, + { { { { 0x20, 0x01, 0x0d, 0xb8, 0x0b, 0x16, 0xd0, 0x0d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 12, + 0, 0, + /* indicate that this prefix already exists */ + false }, + { { { { 0x20, 0x01, 0x0d, 0xb8, 0xc0, 0x01, 0x0d, 0xad, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 48, + 0, 0, true }, + { { { { 0x20, 0x01, 0x0d, 0xb8, 0xc0, 0x01, 0x0d, 0xad, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } }, 60, + 0, 0, + /* indicate that this prefix already exists */ + false }, +}; + +static const struct in6_addr test_rdnss = { { { 0x20, 0x01, 0x0d, 0xb8, + 0xde, 0xad, 0xbe, 0xef, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01 } } }; +static const char *test_dnssl[] = { "lab.intra", + NULL }; + +TEST(radv_prefix) { + sd_radv_prefix *p; + + assert_se(sd_radv_prefix_new(&p) >= 0); + + assert_se(sd_radv_prefix_set_onlink(NULL, true) < 0); + assert_se(sd_radv_prefix_set_onlink(p, true) >= 0); + assert_se(sd_radv_prefix_set_onlink(p, false) >= 0); + + assert_se(sd_radv_prefix_set_address_autoconfiguration(NULL, true) < 0); + assert_se(sd_radv_prefix_set_address_autoconfiguration(p, 
true) >= 0); + assert_se(sd_radv_prefix_set_address_autoconfiguration(p, false) >= 0); + + assert_se(sd_radv_prefix_set_valid_lifetime(NULL, 1, 1) < 0); + assert_se(sd_radv_prefix_set_valid_lifetime(p, 0, 0) >= 0); + assert_se(sd_radv_prefix_set_valid_lifetime(p, 300 * USEC_PER_SEC, USEC_INFINITY) >= 0); + assert_se(sd_radv_prefix_set_valid_lifetime(p, 300 * USEC_PER_SEC, USEC_PER_YEAR) >= 0); + + assert_se(sd_radv_prefix_set_preferred_lifetime(NULL, 1, 1) < 0); + assert_se(sd_radv_prefix_set_preferred_lifetime(p, 0, 0) >= 0); + assert_se(sd_radv_prefix_set_preferred_lifetime(p, 300 * USEC_PER_SEC, USEC_INFINITY) >= 0); + assert_se(sd_radv_prefix_set_preferred_lifetime(p, 300 * USEC_PER_SEC, USEC_PER_YEAR) >= 0); + + assert_se(sd_radv_prefix_set_prefix(NULL, NULL, 0) < 0); + assert_se(sd_radv_prefix_set_prefix(p, NULL, 0) < 0); + + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 64) >= 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 0) < 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 1) < 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 2) < 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 3) >= 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 125) >= 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 128) >= 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 129) < 0); + assert_se(sd_radv_prefix_set_prefix(p, &prefix[0].address, 255) < 0); + + assert_se(!sd_radv_prefix_unref(p)); +} + +TEST(radv_route_prefix) { + sd_radv_route_prefix *p; + + assert_se(sd_radv_route_prefix_new(&p) >= 0); + + assert_se(sd_radv_route_prefix_set_lifetime(NULL, 1, 1) < 0); + assert_se(sd_radv_route_prefix_set_lifetime(p, 0, 0) >= 0); + assert_se(sd_radv_route_prefix_set_lifetime(p, 300 * USEC_PER_SEC, USEC_INFINITY) >= 0); + assert_se(sd_radv_route_prefix_set_lifetime(p, 300 * USEC_PER_SEC, USEC_PER_YEAR) >= 0); + + assert_se(sd_radv_route_prefix_set_prefix(NULL, 
NULL, 0) < 0); + assert_se(sd_radv_route_prefix_set_prefix(p, NULL, 0) < 0); + + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 64) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 0) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 1) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 2) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 3) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 125) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 128) >= 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 129) < 0); + assert_se(sd_radv_route_prefix_set_prefix(p, &prefix[0].address, 255) < 0); + + assert_se(!sd_radv_route_prefix_unref(p)); +} + +TEST(radv_pref64_prefix) { + sd_radv_pref64_prefix *p; + + assert_se(sd_radv_pref64_prefix_new(&p) >= 0); + + assert_se(sd_radv_pref64_prefix_set_prefix(NULL, NULL, 0, 0) < 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, NULL, 0, 0) < 0); + + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 32, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 40, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 48, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 56, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 64, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 96, 300 * USEC_PER_SEC) >= 0); + + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 80, 300 * USEC_PER_SEC) < 0); + assert_se(sd_radv_pref64_prefix_set_prefix(p, &prefix[0].address, 80, USEC_PER_DAY) < 0); + + assert_se(!sd_radv_pref64_prefix_unref(p)); +} + +TEST(radv) { + sd_radv *ra; + + assert_se(sd_radv_new(&ra) >= 0); + assert_se(ra); 
+ + assert_se(sd_radv_set_ifindex(NULL, 0) < 0); + assert_se(sd_radv_set_ifindex(ra, 0) < 0); + assert_se(sd_radv_set_ifindex(ra, -1) < 0); + assert_se(sd_radv_set_ifindex(ra, -2) < 0); + assert_se(sd_radv_set_ifindex(ra, 42) >= 0); + + assert_se(sd_radv_set_mac(NULL, NULL) < 0); + assert_se(sd_radv_set_mac(ra, NULL) >= 0); + assert_se(sd_radv_set_mac(ra, &mac_addr) >= 0); + + assert_se(sd_radv_set_mtu(NULL, 0) < 0); + assert_se(sd_radv_set_mtu(ra, 0) < 0); + assert_se(sd_radv_set_mtu(ra, 1279) < 0); + assert_se(sd_radv_set_mtu(ra, 1280) >= 0); + assert_se(sd_radv_set_mtu(ra, ~0) >= 0); + + assert_se(sd_radv_set_hop_limit(NULL, 0) < 0); + assert_se(sd_radv_set_hop_limit(ra, 0) >= 0); + assert_se(sd_radv_set_hop_limit(ra, ~0) >= 0); + + assert_se(sd_radv_set_router_lifetime(NULL, 0) < 0); + assert_se(sd_radv_set_router_lifetime(ra, 0) >= 0); + assert_se(sd_radv_set_router_lifetime(ra, USEC_INFINITY) < 0); + assert_se(sd_radv_set_router_lifetime(ra, USEC_PER_YEAR) < 0); + assert_se(sd_radv_set_router_lifetime(ra, 300 * USEC_PER_SEC) >= 0); + + assert_se(sd_radv_set_preference(NULL, 0) < 0); + assert_se(sd_radv_set_preference(ra, SD_NDISC_PREFERENCE_LOW) >= 0); + assert_se(sd_radv_set_preference(ra, SD_NDISC_PREFERENCE_MEDIUM) >= 0); + assert_se(sd_radv_set_preference(ra, SD_NDISC_PREFERENCE_HIGH) >= 0); + assert_se(sd_radv_set_preference(ra, ~0) < 0); + + assert_se(sd_radv_set_preference(ra, SD_NDISC_PREFERENCE_HIGH) >= 0); + assert_se(sd_radv_set_router_lifetime(ra, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_set_router_lifetime(ra, 0) < 0); + assert_se(sd_radv_set_preference(ra, SD_NDISC_PREFERENCE_MEDIUM) >= 0); + assert_se(sd_radv_set_router_lifetime(ra, 0) >= 0); + + assert_se(sd_radv_set_managed_information(NULL, true) < 0); + assert_se(sd_radv_set_managed_information(ra, true) >= 0); + assert_se(sd_radv_set_managed_information(ra, false) >= 0); + + assert_se(sd_radv_set_other_information(NULL, true) < 0); + assert_se(sd_radv_set_other_information(ra, true) 
>= 0); + assert_se(sd_radv_set_other_information(ra, false) >= 0); + + assert_se(sd_radv_set_retransmit(NULL, 10 * USEC_PER_MSEC) < 0); + assert_se(sd_radv_set_retransmit(ra, 10 * USEC_PER_MSEC) >= 0); + assert_se(sd_radv_set_retransmit(ra, 0) >= 0); + assert_se(sd_radv_set_retransmit(ra, usec_add(UINT32_MAX * USEC_PER_MSEC, USEC_PER_MSEC)) < 0); + + assert_se(sd_radv_set_rdnss(NULL, 0, NULL, 0) < 0); + assert_se(sd_radv_set_rdnss(ra, 0, NULL, 0) >= 0); + assert_se(sd_radv_set_rdnss(ra, 0, NULL, 128) < 0); + assert_se(sd_radv_set_rdnss(ra, 600 * USEC_PER_SEC, &test_rdnss, 0) >= 0); + assert_se(sd_radv_set_rdnss(ra, 600 * USEC_PER_SEC, &test_rdnss, 1) >= 0); + assert_se(sd_radv_set_rdnss(ra, 0, &test_rdnss, 1) >= 0); + assert_se(sd_radv_set_rdnss(ra, 0, NULL, 0) >= 0); + + assert_se(sd_radv_set_dnssl(ra, 0, NULL) >= 0); + assert_se(sd_radv_set_dnssl(ra, 600 * USEC_PER_SEC, NULL) >= 0); + assert_se(sd_radv_set_dnssl(ra, 0, (char **)test_dnssl) >= 0); + assert_se(sd_radv_set_dnssl(ra, 600 * USEC_PER_SEC, (char **)test_dnssl) >= 0); + + assert_se(sd_radv_set_home_agent_information(NULL, true) < 0); + assert_se(sd_radv_set_home_agent_information(ra, true) >= 0); + assert_se(sd_radv_set_home_agent_information(ra, false) >= 0); + + assert_se(sd_radv_set_home_agent_preference(NULL, 10) < 0); + assert_se(sd_radv_set_home_agent_preference(ra, 10) >= 0); + assert_se(sd_radv_set_home_agent_preference(ra, 0) >= 0); + + assert_se(sd_radv_set_home_agent_lifetime(NULL, 300 * USEC_PER_SEC) < 0); + assert_se(sd_radv_set_home_agent_lifetime(ra, 300 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_set_home_agent_lifetime(ra, 0) >= 0); + assert_se(sd_radv_set_home_agent_lifetime(ra, USEC_PER_DAY) < 0); + + ra = sd_radv_unref(ra); + assert_se(!ra); +} + +static int radv_recv(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + sd_radv *ra = userdata; + unsigned char buf[168]; + size_t i; + + assert_se(read(test_fd[0], &buf, sizeof(buf)) == sizeof(buf)); + + /* router lifetime 
must be zero when test is stopped */ + if (test_stopped) { + advertisement[6] = 0x00; + advertisement[7] = 0x00; + } + + printf ("Received Router Advertisement with lifetime %i\n", + (advertisement[6] << 8) + advertisement[7]); + + /* test only up to buf size, rest is not yet implemented */ + for (i = 0; i < sizeof(buf); i++) { + if (!(i % 8)) + printf("%3zu: ", i); + + printf("0x%02x", buf[i]); + + assert_se(buf[i] == advertisement[i]); + + if ((i + 1) % 8) + printf(", "); + else + printf("\n"); + } + + if (test_stopped) { + sd_event *e; + + e = sd_radv_get_event(ra); + sd_event_exit(e, 0); + + return 0; + } + + assert_se(sd_radv_stop(ra) >= 0); + test_stopped = true; + + return 0; +} + +TEST(ra) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *recv_router_advertisement = NULL; + _cleanup_(sd_radv_unrefp) sd_radv *ra = NULL; + + assert_se(socketpair(AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, test_fd) >= 0); + + assert_se(sd_event_new(&e) >= 0); + + assert_se(sd_radv_new(&ra) >= 0); + assert_se(ra); + + assert_se(sd_radv_attach_event(ra, e, 0) >= 0); + + assert_se(sd_radv_set_ifindex(ra, 42) >= 0); + assert_se(sd_radv_set_mac(ra, &mac_addr) >= 0); + assert_se(sd_radv_set_router_lifetime(ra, 180 * USEC_PER_SEC) >= 0); + assert_se(sd_radv_set_hop_limit(ra, 64) >= 0); + assert_se(sd_radv_set_managed_information(ra, true) >= 0); + assert_se(sd_radv_set_other_information(ra, true) >= 0); + assert_se(sd_radv_set_rdnss(ra, 60 * USEC_PER_SEC, &test_rdnss, 1) >= 0); + assert_se(sd_radv_set_dnssl(ra, 60 * USEC_PER_SEC, (char **)test_dnssl) >= 0); + + for (unsigned i = 0; i < ELEMENTSOF(prefix); i++) { + sd_radv_prefix *p; + + printf("Test prefix %u\n", i); + assert_se(sd_radv_prefix_new(&p) >= 0); + + assert_se(sd_radv_prefix_set_prefix(p, &prefix[i].address, + prefix[i].prefixlen) >= 0); + if (prefix[i].valid > 0) + assert_se(sd_radv_prefix_set_valid_lifetime(p, prefix[i].valid * USEC_PER_SEC, 
USEC_INFINITY) >= 0); + if (prefix[i].preferred > 0) + assert_se(sd_radv_prefix_set_preferred_lifetime(p, prefix[i].preferred * USEC_PER_SEC, USEC_INFINITY) >= 0); + + assert_se((sd_radv_add_prefix(ra, p) >= 0) == prefix[i].successful); + /* Adding the same prefix a second time must give the same outcome as the first call: it succeeds if and only if the first sd_radv_add_prefix() succeeded. */ + assert_se((sd_radv_add_prefix(ra, p) >= 0) == prefix[i].successful); + + p = sd_radv_prefix_unref(p); + assert_se(!p); + } + + assert_se(sd_event_add_io(e, &recv_router_advertisement, test_fd[0], EPOLLIN, radv_recv, ra) >= 0); + assert_se(sd_event_source_set_io_fd_own(recv_router_advertisement, true) >= 0); + + assert_se(sd_event_add_time_relative(e, NULL, CLOCK_BOOTTIME, + 2 * USEC_PER_SEC, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)) >= 0); + + assert_se(sd_radv_start(ra) >= 0); + + assert_se(sd_event_loop(e) >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/libsystemd-network/test-ndisc-rs.c b/src/libsystemd-network/test-ndisc-rs.c new file mode 100644 index 0000000..d94cc1c --- /dev/null +++ b/src/libsystemd-network/test-ndisc-rs.c @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved.
+***/ + +#include +#include +#include + +#include "sd-ndisc.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "icmp6-util-unix.h" +#include "socket-util.h" +#include "strv.h" +#include "ndisc-internal.h" +#include "tests.h" + +static struct ether_addr mac_addr = { + .ether_addr_octet = {'A', 'B', 'C', '1', '2', '3'} +}; + +static bool verbose = false; +static sd_ndisc *test_timeout_nd; + +static void router_dump(sd_ndisc_router *rt) { + struct in6_addr addr; + uint8_t hop_limit; + usec_t t, lifetime; + uint64_t flags; + uint32_t mtu; + unsigned preference; + int r; + + assert_se(rt); + + log_info("--"); + assert_se(sd_ndisc_router_get_address(rt, &addr) >= 0); + log_info("Sender: %s", IN6_ADDR_TO_STRING(&addr)); + + assert_se(sd_ndisc_router_get_timestamp(rt, CLOCK_REALTIME, &t) >= 0); + log_info("Timestamp: %s", FORMAT_TIMESTAMP(t)); + + assert_se(sd_ndisc_router_get_timestamp(rt, CLOCK_MONOTONIC, &t) >= 0); + log_info("Monotonic: %" PRIu64, t); + + if (sd_ndisc_router_get_hop_limit(rt, &hop_limit) < 0) + log_info("No hop limit set"); + else + log_info("Hop limit: %u", hop_limit); + + assert_se(sd_ndisc_router_get_flags(rt, &flags) >= 0); + log_info("Flags: <%s|%s>", + flags & ND_RA_FLAG_OTHER ? "OTHER" : "", + flags & ND_RA_FLAG_MANAGED ? "MANAGED" : ""); + + assert_se(sd_ndisc_router_get_preference(rt, &preference) >= 0); + log_info("Preference: %s", + preference == SD_NDISC_PREFERENCE_LOW ? "low" : + preference == SD_NDISC_PREFERENCE_HIGH ? 
"high" : "medium"); + + assert_se(sd_ndisc_router_get_lifetime(rt, &lifetime) >= 0); + assert_se(sd_ndisc_router_get_lifetime_timestamp(rt, CLOCK_REALTIME, &t) >= 0); + log_info("Lifetime: %s (%s)", FORMAT_TIMESPAN(lifetime, USEC_PER_SEC), FORMAT_TIMESTAMP(t)); + + if (sd_ndisc_router_get_mtu(rt, &mtu) < 0) + log_info("No MTU set"); + else + log_info("MTU: %" PRIu32, mtu); + + r = sd_ndisc_router_option_rewind(rt); + for (;;) { + uint8_t type; + + assert_se(r >= 0); + + if (r == 0) + break; + + assert_se(sd_ndisc_router_option_get_type(rt, &type) >= 0); + + log_info(">> Option %u", type); + + switch (type) { + + case SD_NDISC_OPTION_SOURCE_LL_ADDRESS: + case SD_NDISC_OPTION_TARGET_LL_ADDRESS: { + _cleanup_free_ char *c = NULL; + const void *p; + size_t n; + + assert_se(sd_ndisc_router_option_get_raw(rt, &p, &n) >= 0); + assert_se(n > 2); + assert_se(c = hexmem((uint8_t*) p + 2, n - 2)); + + log_info("Address: %s", c); + break; + } + + case SD_NDISC_OPTION_PREFIX_INFORMATION: { + unsigned prefix_len; + uint8_t pfl; + struct in6_addr a; + + assert_se(sd_ndisc_router_prefix_get_valid_lifetime(rt, &lifetime) >= 0); + assert_se(sd_ndisc_router_prefix_get_valid_lifetime_timestamp(rt, CLOCK_REALTIME, &t) >= 0); + log_info("Valid Lifetime: %s (%s)", FORMAT_TIMESPAN(lifetime, USEC_PER_SEC), FORMAT_TIMESTAMP(t)); + + assert_se(sd_ndisc_router_prefix_get_preferred_lifetime(rt, &lifetime) >= 0); + assert_se(sd_ndisc_router_prefix_get_preferred_lifetime_timestamp(rt, CLOCK_REALTIME, &t) >= 0); + log_info("Preferred Lifetime: %s (%s)", FORMAT_TIMESPAN(lifetime, USEC_PER_SEC), FORMAT_TIMESTAMP(t)); + + assert_se(sd_ndisc_router_prefix_get_flags(rt, &pfl) >= 0); + log_info("Flags: <%s|%s>", + pfl & ND_OPT_PI_FLAG_ONLINK ? "ONLINK" : "", + pfl & ND_OPT_PI_FLAG_AUTO ? 
"AUTO" : ""); + + assert_se(sd_ndisc_router_prefix_get_prefixlen(rt, &prefix_len) >= 0); + log_info("Prefix Length: %u", prefix_len); + + assert_se(sd_ndisc_router_prefix_get_address(rt, &a) >= 0); + log_info("Prefix: %s", IN6_ADDR_TO_STRING(&a)); + + break; + } + + case SD_NDISC_OPTION_RDNSS: { + const struct in6_addr *a; + int n, i; + + n = sd_ndisc_router_rdnss_get_addresses(rt, &a); + assert_se(n > 0); + + for (i = 0; i < n; i++) + log_info("DNS: %s", IN6_ADDR_TO_STRING(a + i)); + + assert_se(sd_ndisc_router_rdnss_get_lifetime(rt, &lifetime) >= 0); + assert_se(sd_ndisc_router_rdnss_get_lifetime_timestamp(rt, CLOCK_REALTIME, &t) >= 0); + log_info("Lifetime: %s (%s)", FORMAT_TIMESPAN(lifetime, USEC_PER_SEC), FORMAT_TIMESTAMP(t)); + break; + } + + case SD_NDISC_OPTION_DNSSL: { + _cleanup_strv_free_ char **l = NULL; + int n, i; + + n = sd_ndisc_router_dnssl_get_domains(rt, &l); + assert_se(n > 0); + + for (i = 0; i < n; i++) + log_info("Domain: %s", l[i]); + + assert_se(sd_ndisc_router_dnssl_get_lifetime(rt, &lifetime) >= 0); + assert_se(sd_ndisc_router_dnssl_get_lifetime_timestamp(rt, CLOCK_REALTIME, &t) >= 0); + log_info("Lifetime: %s (%s)", FORMAT_TIMESPAN(lifetime, USEC_PER_SEC), FORMAT_TIMESTAMP(t)); + break; + }} + + r = sd_ndisc_router_option_next(rt); + } +} + +static int send_ra(uint8_t flags) { + uint8_t advertisement[] = { + 0x86, 0x00, 0xde, 0x83, 0x40, 0xc0, 0x00, 0xb4, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x04, 0x40, 0xc0, 0x00, 0x00, 0x01, 0xf4, + 0x00, 0x00, 0x01, 0xb8, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, + 0x20, 0x01, 0x0d, 0xb8, 0xde, 0xad, 0xbe, 0xef, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x1f, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, + 0x03, 0x6c, 0x61, 0x62, 0x05, 0x69, 0x6e, 0x74, + 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x78, 0x2b, 0xcb, 0xb3, 0x6d, 
0x53, + }; + + advertisement[5] = flags; + + assert_se(write(test_fd[1], advertisement, sizeof(advertisement)) == + sizeof(advertisement)); + + if (verbose) + printf(" sent RA with flag 0x%02x\n", flags); + + return 0; +} + +static void test_callback(sd_ndisc *nd, sd_ndisc_event_t event, sd_ndisc_router *rt, void *userdata) { + sd_event *e = userdata; + static unsigned idx = 0; + uint64_t flags_array[] = { + 0, + 0, + 0, + ND_RA_FLAG_OTHER, + ND_RA_FLAG_MANAGED + }; + uint64_t flags; + + assert_se(nd); + + if (event != SD_NDISC_EVENT_ROUTER) + return; + + router_dump(rt); + + assert_se(sd_ndisc_router_get_flags(rt, &flags) >= 0); + assert_se(flags == flags_array[idx]); + idx++; + + if (verbose) + printf(" got event 0x%02" PRIx64 "\n", flags); + + if (idx < ELEMENTSOF(flags_array)) { + send_ra(flags_array[idx]); + return; + } + + sd_event_exit(e, 0); +} + +TEST(rs) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_ndisc_unrefp) sd_ndisc *nd = NULL; + + send_ra_function = send_ra; + + assert_se(sd_event_new(&e) >= 0); + + assert_se(sd_ndisc_new(&nd) >= 0); + assert_se(nd); + + assert_se(sd_ndisc_attach_event(nd, e, 0) >= 0); + + assert_se(sd_ndisc_set_ifindex(nd, 42) >= 0); + assert_se(sd_ndisc_set_mac(nd, &mac_addr) >= 0); + assert_se(sd_ndisc_set_callback(nd, test_callback, e) >= 0); + + assert_se(sd_event_add_time_relative(e, NULL, CLOCK_BOOTTIME, + 30 * USEC_PER_SEC, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)) >= 0); + + assert_se(sd_ndisc_stop(nd) >= 0); + assert_se(sd_ndisc_start(nd) >= 0); + assert_se(sd_ndisc_start(nd) >= 0); + assert_se(sd_ndisc_stop(nd) >= 0); + test_fd[1] = safe_close(test_fd[1]); + + assert_se(sd_ndisc_start(nd) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + test_fd[1] = safe_close(test_fd[1]); +} + +static int test_timeout_value(uint8_t flags) { + static int count = 0; + static usec_t last = 0; + sd_ndisc *nd = test_timeout_nd; + usec_t min, max; + + assert_se(nd); + assert_se(nd->event); + + if (++count >= 20) + 
sd_event_exit(nd->event, 0); + + if (last == 0) { + /* initial RT = IRT + RAND*IRT */ + min = NDISC_ROUTER_SOLICITATION_INTERVAL - + NDISC_ROUTER_SOLICITATION_INTERVAL / 10; + max = NDISC_ROUTER_SOLICITATION_INTERVAL + + NDISC_ROUTER_SOLICITATION_INTERVAL / 10; + } else { + /* next RT = 2*RTprev + RAND*RTprev */ + min = 2 * last - last / 10; + max = 2 * last + last / 10; + } + + /* final RT > MRT */ + if (last * 2 > NDISC_MAX_ROUTER_SOLICITATION_INTERVAL) { + min = NDISC_MAX_ROUTER_SOLICITATION_INTERVAL - + NDISC_MAX_ROUTER_SOLICITATION_INTERVAL / 10; + max = NDISC_MAX_ROUTER_SOLICITATION_INTERVAL + + NDISC_MAX_ROUTER_SOLICITATION_INTERVAL / 10; + } + + log_info("backoff timeout interval %2d %s%s <= %s <= %s", + count, + last * 2 > NDISC_MAX_ROUTER_SOLICITATION_INTERVAL ? "(max) ": "", + FORMAT_TIMESPAN(min, USEC_PER_MSEC), + FORMAT_TIMESPAN(nd->retransmit_time, USEC_PER_MSEC), + FORMAT_TIMESPAN(max, USEC_PER_MSEC)); + + assert_se(min <= nd->retransmit_time); + assert_se(max >= nd->retransmit_time); + + last = nd->retransmit_time; + + assert_se(sd_event_source_set_time(nd->timeout_event_source, 0) >= 0); + + return 0; +} + +TEST(timeout) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_ndisc_unrefp) sd_ndisc *nd = NULL; + + send_ra_function = test_timeout_value; + + assert_se(sd_event_new(&e) >= 0); + + assert_se(sd_ndisc_new(&nd) >= 0); + assert_se(nd); + + test_timeout_nd = nd; + + assert_se(sd_ndisc_attach_event(nd, e, 0) >= 0); + + assert_se(sd_ndisc_set_ifindex(nd, 42) >= 0); + assert_se(sd_ndisc_set_mac(nd, &mac_addr) >= 0); + + assert_se(sd_event_add_time_relative(e, NULL, CLOCK_BOOTTIME, + 30 * USEC_PER_SEC, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)) >= 0); + + assert_se(sd_ndisc_start(nd) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + test_fd[1] = safe_close(test_fd[1]); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/libsystemd-network/test-sd-dhcp-lease.c b/src/libsystemd-network/test-sd-dhcp-lease.c new file mode 100644 index 
0000000..910b622 --- /dev/null +++ b/src/libsystemd-network/test-sd-dhcp-lease.c @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "dhcp-lease-internal.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +/* According to RFC1035 section 4.1.4, a domain name in a message can be either: + * - a sequence of labels ending in a zero octet + * - a pointer + * - a sequence of labels ending with a pointer + */ +TEST(dhcp_lease_parse_search_domains_basic) { + int r; + _cleanup_strv_free_ char **domains = NULL; + static const uint8_t optionbuf[] = { + 0x03, 'F', 'O', 'O', 0x03, 'B', 'A', 'R', 0x00, + 0x04, 'A', 'B', 'C', 'D', 0x03, 'E', 'F', 'G', 0x00, + }; + + r = dhcp_lease_parse_search_domains(optionbuf, sizeof(optionbuf), &domains); + assert_se(r == 2); + assert_se(streq(domains[0], "FOO.BAR")); + assert_se(streq(domains[1], "ABCD.EFG")); +} + +TEST(dhcp_lease_parse_search_domains_ptr) { + int r; + _cleanup_strv_free_ char **domains = NULL; + static const uint8_t optionbuf[] = { + 0x03, 'F', 'O', 'O', 0x00, 0xC0, 0x00, + }; + + r = dhcp_lease_parse_search_domains(optionbuf, sizeof(optionbuf), &domains); + assert_se(r == 2); + assert_se(streq(domains[0], "FOO")); + assert_se(streq(domains[1], "FOO")); +} + +TEST(dhcp_lease_parse_search_domains_labels_and_ptr) { + int r; + _cleanup_strv_free_ char **domains = NULL; + static const uint8_t optionbuf[] = { + 0x03, 'F', 'O', 'O', 0x03, 'B', 'A', 'R', 0x00, + 0x03, 'A', 'B', 'C', 0xC0, 0x04, + }; + + r = dhcp_lease_parse_search_domains(optionbuf, sizeof(optionbuf), &domains); + assert_se(r == 2); + assert_se(streq(domains[0], "FOO.BAR")); + assert_se(streq(domains[1], "ABC.BAR")); +} + +/* Tests for exceptions. 
*/ + +TEST(dhcp_lease_parse_search_domains_no_data) { + _cleanup_strv_free_ char **domains = NULL; + static const uint8_t optionbuf[3] = {0, 0, 0}; + + assert_se(dhcp_lease_parse_search_domains(NULL, 0, &domains) == -EBADMSG); + assert_se(dhcp_lease_parse_search_domains(optionbuf, 0, &domains) == -EBADMSG); +} + +TEST(dhcp_lease_parse_search_domains_loops) { + _cleanup_strv_free_ char **domains = NULL; + static const uint8_t optionbuf[] = { + 0x03, 'F', 'O', 'O', 0x00, 0x03, 'B', 'A', 'R', 0xC0, 0x06, + }; + + assert_se(dhcp_lease_parse_search_domains(optionbuf, sizeof(optionbuf), &domains) == -EBADMSG); +} + +TEST(dhcp_lease_parse_search_domains_wrong_len) { + _cleanup_strv_free_ char **domains = NULL; + static const uint8_t optionbuf[] = { + 0x03, 'F', 'O', 'O', 0x03, 'B', 'A', 'R', 0x00, + 0x04, 'A', 'B', 'C', 'D', 0x03, 'E', 'F', 'G', 0x00, + }; + + assert_se(dhcp_lease_parse_search_domains(optionbuf, sizeof(optionbuf) - 5, &domains) == -EBADMSG); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/libsystemd.pc.in b/src/libsystemd/libsystemd.pc.in new file mode 100644 index 0000000..3a43ef6 --- /dev/null +++ b/src/libsystemd/libsystemd.pc.in @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +prefix={{PREFIX}} +exec_prefix={{PREFIX}} +libdir={{LIBDIR}} +includedir={{INCLUDE_DIR}} + +Name: systemd +Description: systemd Library +URL: {{PROJECT_URL}} +Version: {{PROJECT_VERSION}} +Libs: -L${libdir} -lsystemd +Cflags: -I${includedir} diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym new file mode 100644 index 0000000..4113920 --- /dev/null +++ b/src/libsystemd/libsystemd.sym @@ -0,0 +1,836 @@ +/*** + SPDX-License-Identifier: LGPL-2.1-or-later + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +***/ + +LIBSYSTEMD_209 { +global: + /* sd-journal */ + sd_journal_print; + sd_journal_printv; + sd_journal_send; + sd_journal_sendv; + sd_journal_stream_fd; + sd_journal_open; + sd_journal_close; + sd_journal_previous; + sd_journal_next; + sd_journal_previous_skip; + sd_journal_next_skip; + sd_journal_get_realtime_usec; + sd_journal_get_monotonic_usec; + sd_journal_get_data; + sd_journal_enumerate_data; + sd_journal_restart_data; + sd_journal_add_match; + sd_journal_flush_matches; + sd_journal_seek_head; + sd_journal_seek_tail; + sd_journal_seek_monotonic_usec; + sd_journal_seek_realtime_usec; + sd_journal_seek_cursor; + sd_journal_get_cursor; + sd_journal_get_fd; + sd_journal_process; + sd_journal_print_with_location; + sd_journal_printv_with_location; + sd_journal_send_with_location; + sd_journal_sendv_with_location; + sd_journal_get_cutoff_realtime_usec; + sd_journal_get_cutoff_monotonic_usec; + sd_journal_wait; + sd_journal_open_directory; + sd_journal_add_disjunction; + sd_journal_perror; + sd_journal_perror_with_location; + sd_journal_get_usage; + sd_journal_test_cursor; + sd_journal_query_unique; + sd_journal_enumerate_unique; + sd_journal_restart_unique; + sd_journal_get_catalog; + sd_journal_get_catalog_for_message_id; + 
sd_journal_set_data_threshold; + sd_journal_get_data_threshold; + sd_journal_reliable_fd; + sd_journal_get_events; + sd_journal_get_timeout; + sd_journal_add_conjunction; + sd_journal_open_files; + sd_journal_open_container; + + /* sd-daemon */ + sd_booted; + sd_is_fifo; + sd_is_mq; + sd_is_socket; + sd_is_socket_inet; + sd_is_socket_unix; + sd_is_special; + sd_listen_fds; + sd_notify; + sd_notifyf; + sd_watchdog_enabled; + + /* sd-id128 */ + sd_id128_to_string; + sd_id128_from_string; + sd_id128_randomize; + sd_id128_get_machine; + sd_id128_get_boot; + + /* sd-login */ + sd_get_seats; + sd_get_sessions; + sd_get_uids; + sd_login_monitor_flush; + sd_login_monitor_get_fd; + sd_login_monitor_new; + sd_login_monitor_unref; + sd_pid_get_owner_uid; + sd_pid_get_session; + sd_seat_can_multi_session; + sd_seat_get_active; + sd_seat_get_sessions; + sd_session_get_seat; + sd_session_get_uid; + sd_session_is_active; + sd_uid_get_seats; + sd_uid_get_sessions; + sd_uid_get_state; + sd_uid_is_on_seat; + sd_pid_get_unit; + sd_session_get_service; + sd_session_get_type; + sd_session_get_class; + sd_session_get_display; + sd_session_get_state; + sd_seat_can_tty; + sd_seat_can_graphical; + sd_session_get_tty; + sd_login_monitor_get_events; + sd_login_monitor_get_timeout; + sd_pid_get_user_unit; + sd_pid_get_machine_name; + sd_get_machine_names; + sd_pid_get_slice; + sd_session_get_vt; + sd_session_is_remote; + sd_session_get_remote_user; + sd_session_get_remote_host; +local: + *; +}; + +LIBSYSTEMD_211 { +global: + sd_machine_get_class; + sd_peer_get_session; + sd_peer_get_owner_uid; + sd_peer_get_unit; + sd_peer_get_user_unit; + sd_peer_get_machine_name; + sd_peer_get_slice; +} LIBSYSTEMD_209; + +LIBSYSTEMD_213 { +global: + sd_uid_get_display; +} LIBSYSTEMD_211; + +LIBSYSTEMD_214 { +global: + sd_pid_notify; + sd_pid_notifyf; +} LIBSYSTEMD_213; + +LIBSYSTEMD_216 { +global: + sd_machine_get_ifindices; +} LIBSYSTEMD_214; + +LIBSYSTEMD_217 { +global: + sd_session_get_desktop; +} 
LIBSYSTEMD_216; + +LIBSYSTEMD_219 { +global: + sd_pid_notify_with_fds; +} LIBSYSTEMD_217; + +LIBSYSTEMD_220 { +global: + sd_pid_get_user_slice; + sd_peer_get_user_slice; +} LIBSYSTEMD_219; + +LIBSYSTEMD_221 { +global: + /* sd-bus */ + sd_bus_default; + sd_bus_default_user; + sd_bus_default_system; + sd_bus_open; + sd_bus_open_user; + sd_bus_open_system; + sd_bus_open_system_remote; + sd_bus_open_system_machine; + sd_bus_new; + sd_bus_set_address; + sd_bus_set_fd; + sd_bus_set_exec; + sd_bus_get_address; + sd_bus_set_bus_client; + sd_bus_is_bus_client; + sd_bus_set_server; + sd_bus_is_server; + sd_bus_set_anonymous; + sd_bus_is_anonymous; + sd_bus_set_trusted; + sd_bus_is_trusted; + sd_bus_set_monitor; + sd_bus_is_monitor; + sd_bus_set_description; + sd_bus_get_description; + sd_bus_negotiate_creds; + sd_bus_negotiate_timestamp; + sd_bus_negotiate_fds; + sd_bus_can_send; + sd_bus_get_creds_mask; + sd_bus_set_allow_interactive_authorization; + sd_bus_get_allow_interactive_authorization; + sd_bus_start; + sd_bus_close; + sd_bus_try_close; + sd_bus_ref; + sd_bus_unref; + sd_bus_is_open; + sd_bus_get_bus_id; + sd_bus_get_scope; + sd_bus_get_tid; + sd_bus_get_owner_creds; + sd_bus_send; + sd_bus_send_to; + sd_bus_call; + sd_bus_call_async; + sd_bus_get_fd; + sd_bus_get_events; + sd_bus_get_timeout; + sd_bus_process; + sd_bus_process_priority; + sd_bus_wait; + sd_bus_flush; + sd_bus_get_current_slot; + sd_bus_get_current_message; + sd_bus_get_current_handler; + sd_bus_get_current_userdata; + sd_bus_attach_event; + sd_bus_detach_event; + sd_bus_get_event; + sd_bus_add_filter; + sd_bus_add_match; + sd_bus_add_object; + sd_bus_add_fallback; + sd_bus_add_object_vtable; + sd_bus_add_fallback_vtable; + sd_bus_add_node_enumerator; + sd_bus_add_object_manager; + sd_bus_slot_ref; + sd_bus_slot_unref; + sd_bus_slot_get_bus; + sd_bus_slot_get_userdata; + sd_bus_slot_set_userdata; + sd_bus_slot_get_description; + sd_bus_slot_set_description; + sd_bus_slot_get_current_message; + 
sd_bus_slot_get_current_handler; + sd_bus_slot_get_current_userdata; + sd_bus_message_new_signal; + sd_bus_message_new_method_call; + sd_bus_message_new_method_return; + sd_bus_message_new_method_error; + sd_bus_message_new_method_errorf; + sd_bus_message_new_method_errno; + sd_bus_message_new_method_errnof; + sd_bus_message_ref; + sd_bus_message_unref; + sd_bus_message_get_type; + sd_bus_message_get_cookie; + sd_bus_message_get_reply_cookie; + sd_bus_message_get_priority; + sd_bus_message_get_expect_reply; + sd_bus_message_get_auto_start; + sd_bus_message_get_allow_interactive_authorization; + sd_bus_message_get_signature; + sd_bus_message_get_path; + sd_bus_message_get_interface; + sd_bus_message_get_member; + sd_bus_message_get_destination; + sd_bus_message_get_sender; + sd_bus_message_get_error; + sd_bus_message_get_errno; + sd_bus_message_get_monotonic_usec; + sd_bus_message_get_realtime_usec; + sd_bus_message_get_seqnum; + sd_bus_message_get_bus; + sd_bus_message_get_creds; + sd_bus_message_is_signal; + sd_bus_message_is_method_call; + sd_bus_message_is_method_error; + sd_bus_message_is_empty; + sd_bus_message_has_signature; + sd_bus_message_set_expect_reply; + sd_bus_message_set_auto_start; + sd_bus_message_set_allow_interactive_authorization; + sd_bus_message_set_destination; + sd_bus_message_set_priority; + sd_bus_message_append; + sd_bus_message_append_basic; + sd_bus_message_append_array; + sd_bus_message_append_array_space; + sd_bus_message_append_array_iovec; + sd_bus_message_append_array_memfd; + sd_bus_message_append_string_space; + sd_bus_message_append_string_iovec; + sd_bus_message_append_string_memfd; + sd_bus_message_append_strv; + sd_bus_message_open_container; + sd_bus_message_close_container; + sd_bus_message_copy; + sd_bus_message_read; + sd_bus_message_read_basic; + sd_bus_message_read_array; + sd_bus_message_read_strv; + sd_bus_message_skip; + sd_bus_message_enter_container; + sd_bus_message_exit_container; + sd_bus_message_peek_type; + 
sd_bus_message_verify_type; + sd_bus_message_at_end; + sd_bus_message_rewind; + sd_bus_get_unique_name; + sd_bus_request_name; + sd_bus_release_name; + sd_bus_list_names; + sd_bus_get_name_creds; + sd_bus_get_name_machine_id; + sd_bus_call_method; + sd_bus_call_method_async; + sd_bus_get_property; + sd_bus_get_property_trivial; + sd_bus_get_property_string; + sd_bus_get_property_strv; + sd_bus_set_property; + sd_bus_reply_method_return; + sd_bus_reply_method_error; + sd_bus_reply_method_errorf; + sd_bus_reply_method_errno; + sd_bus_reply_method_errnof; + sd_bus_emit_signal; + sd_bus_emit_properties_changed_strv; + sd_bus_emit_properties_changed; + sd_bus_emit_interfaces_added_strv; + sd_bus_emit_interfaces_added; + sd_bus_emit_interfaces_removed_strv; + sd_bus_emit_interfaces_removed; + sd_bus_query_sender_creds; + sd_bus_query_sender_privilege; + sd_bus_creds_new_from_pid; + sd_bus_creds_ref; + sd_bus_creds_unref; + sd_bus_creds_get_mask; + sd_bus_creds_get_augmented_mask; + sd_bus_creds_get_pid; + sd_bus_creds_get_ppid; + sd_bus_creds_get_tid; + sd_bus_creds_get_uid; + sd_bus_creds_get_euid; + sd_bus_creds_get_suid; + sd_bus_creds_get_fsuid; + sd_bus_creds_get_gid; + sd_bus_creds_get_egid; + sd_bus_creds_get_sgid; + sd_bus_creds_get_fsgid; + sd_bus_creds_get_supplementary_gids; + sd_bus_creds_get_comm; + sd_bus_creds_get_tid_comm; + sd_bus_creds_get_exe; + sd_bus_creds_get_cmdline; + sd_bus_creds_get_cgroup; + sd_bus_creds_get_unit; + sd_bus_creds_get_slice; + sd_bus_creds_get_user_unit; + sd_bus_creds_get_user_slice; + sd_bus_creds_get_session; + sd_bus_creds_get_owner_uid; + sd_bus_creds_has_effective_cap; + sd_bus_creds_has_permitted_cap; + sd_bus_creds_has_inheritable_cap; + sd_bus_creds_has_bounding_cap; + sd_bus_creds_get_selinux_context; + sd_bus_creds_get_audit_session_id; + sd_bus_creds_get_audit_login_uid; + sd_bus_creds_get_tty; + sd_bus_creds_get_unique_name; + sd_bus_creds_get_well_known_names; + sd_bus_creds_get_description; + sd_bus_error_free; + 
sd_bus_error_set; + sd_bus_error_setf; + sd_bus_error_set_const; + sd_bus_error_set_errno; + sd_bus_error_set_errnof; + sd_bus_error_set_errnofv; + sd_bus_error_get_errno; + sd_bus_error_copy; + sd_bus_error_is_set; + sd_bus_error_has_name; + sd_bus_error_add_map; + sd_bus_path_encode; + sd_bus_path_decode; + sd_bus_track_new; + sd_bus_track_ref; + sd_bus_track_unref; + sd_bus_track_get_bus; + sd_bus_track_get_userdata; + sd_bus_track_set_userdata; + sd_bus_track_add_sender; + sd_bus_track_remove_sender; + sd_bus_track_add_name; + sd_bus_track_remove_name; + sd_bus_track_count; + sd_bus_track_contains; + sd_bus_track_first; + sd_bus_track_next; + + /* sd-event */ + sd_event_default; + sd_event_new; + sd_event_ref; + sd_event_unref; + sd_event_add_io; + sd_event_add_time; + sd_event_add_signal; + sd_event_add_child; + sd_event_add_defer; + sd_event_add_post; + sd_event_add_exit; + sd_event_prepare; + sd_event_wait; + sd_event_dispatch; + sd_event_run; + sd_event_loop; + sd_event_exit; + sd_event_now; + sd_event_get_fd; + sd_event_get_state; + sd_event_get_tid; + sd_event_get_exit_code; + sd_event_set_watchdog; + sd_event_get_watchdog; + sd_event_source_ref; + sd_event_source_unref; + sd_event_source_get_event; + sd_event_source_get_userdata; + sd_event_source_set_userdata; + sd_event_source_set_description; + sd_event_source_get_description; + sd_event_source_set_prepare; + sd_event_source_get_pending; + sd_event_source_get_priority; + sd_event_source_set_priority; + sd_event_source_get_enabled; + sd_event_source_set_enabled; + sd_event_source_get_io_fd; + sd_event_source_set_io_fd; + sd_event_source_get_io_events; + sd_event_source_set_io_events; + sd_event_source_get_io_revents; + sd_event_source_get_time; + sd_event_source_set_time; + sd_event_source_set_time_accuracy; + sd_event_source_get_time_accuracy; + sd_event_source_get_time_clock; + sd_event_source_get_signal; + sd_event_source_get_child_pid; +} LIBSYSTEMD_220; + +LIBSYSTEMD_222 { +global: + /* sd-bus */ 
+ sd_bus_emit_object_added; + sd_bus_emit_object_removed; + sd_bus_flush_close_unref; +} LIBSYSTEMD_221; + +LIBSYSTEMD_226 { +global: + sd_pid_get_cgroup; + sd_peer_get_cgroup; +} LIBSYSTEMD_222; + +LIBSYSTEMD_227 { +global: + sd_bus_default_flush_close; + sd_bus_path_decode_many; + sd_bus_path_encode_many; + sd_listen_fds_with_names; +} LIBSYSTEMD_226; + +LIBSYSTEMD_229 { +global: + sd_journal_has_runtime_files; + sd_journal_has_persistent_files; + sd_journal_enumerate_fields; + sd_journal_restart_fields; +} LIBSYSTEMD_227; + +LIBSYSTEMD_230 { +global: + sd_journal_open_directory_fd; + sd_journal_open_files_fd; +} LIBSYSTEMD_229; + +LIBSYSTEMD_231 { +global: + sd_event_get_iteration; +} LIBSYSTEMD_230; + +LIBSYSTEMD_232 { +global: + sd_bus_track_set_recursive; + sd_bus_track_get_recursive; + sd_bus_track_count_name; + sd_bus_track_count_sender; + sd_bus_set_exit_on_disconnect; + sd_bus_get_exit_on_disconnect; + sd_id128_get_invocation; +} LIBSYSTEMD_231; + +LIBSYSTEMD_233 { +global: + sd_id128_get_machine_app_specific; + sd_is_socket_sockaddr; +} LIBSYSTEMD_232; + +LIBSYSTEMD_234 { +global: + sd_bus_message_appendv; +} LIBSYSTEMD_233; + +LIBSYSTEMD_236 { +global: + sd_bus_message_new; + sd_bus_message_seal; +} LIBSYSTEMD_234; + +LIBSYSTEMD_237 { +global: + sd_bus_set_watch_bind; + sd_bus_get_watch_bind; + sd_bus_request_name_async; + sd_bus_release_name_async; + sd_bus_add_match_async; + sd_bus_match_signal; + sd_bus_match_signal_async; + sd_bus_is_ready; + sd_bus_set_connected_signal; + sd_bus_get_connected_signal; + sd_bus_set_sender; + sd_bus_get_sender; + sd_bus_message_set_sender; + sd_event_source_get_io_fd_own; + sd_event_source_set_io_fd_own; +} LIBSYSTEMD_236; + +LIBSYSTEMD_238 { +global: + sd_bus_get_n_queued_read; + sd_bus_get_n_queued_write; +} LIBSYSTEMD_237; + +LIBSYSTEMD_239 { +global: + sd_bus_open_with_description; + sd_bus_open_user_with_description; + sd_bus_open_system_with_description; + sd_bus_slot_get_floating; + sd_bus_slot_set_floating; + 
sd_bus_slot_get_destroy_callback; + sd_bus_slot_set_destroy_callback; + sd_bus_track_get_destroy_callback; + sd_bus_track_set_destroy_callback; + sd_event_add_inotify; + sd_event_source_get_inotify_mask; + sd_event_source_set_destroy_callback; + sd_event_source_get_destroy_callback; +} LIBSYSTEMD_238; + +LIBSYSTEMD_240 { +global: + sd_bus_message_readv; + sd_bus_set_method_call_timeout; + sd_bus_get_method_call_timeout; + + sd_bus_error_move; + + sd_bus_set_close_on_exit; + sd_bus_get_close_on_exit; + + sd_device_ref; + sd_device_unref; + + sd_device_new_from_syspath; + sd_device_new_from_devnum; + sd_device_new_from_subsystem_sysname; + sd_device_new_from_device_id; + + sd_device_get_parent; + sd_device_get_parent_with_subsystem_devtype; + + sd_device_get_syspath; + sd_device_get_subsystem; + sd_device_get_devtype; + sd_device_get_devnum; + sd_device_get_ifindex; + sd_device_get_driver; + sd_device_get_devpath; + sd_device_get_devname; + sd_device_get_sysname; + sd_device_get_sysnum; + + sd_device_get_is_initialized; + sd_device_get_usec_since_initialized; + + sd_device_get_tag_first; + sd_device_get_tag_next; + sd_device_get_devlink_first; + sd_device_get_devlink_next; + sd_device_get_property_first; + sd_device_get_property_next; + sd_device_get_sysattr_first; + sd_device_get_sysattr_next; + + sd_device_has_tag; + sd_device_get_property_value; + sd_device_get_sysattr_value; + + sd_device_set_sysattr_value; + + sd_device_enumerator_new; + sd_device_enumerator_ref; + sd_device_enumerator_unref; + + sd_device_enumerator_get_device_first; + sd_device_enumerator_get_device_next; + sd_device_enumerator_get_subsystem_first; + sd_device_enumerator_get_subsystem_next; + + sd_device_enumerator_add_match_subsystem; + sd_device_enumerator_add_match_sysattr; + sd_device_enumerator_add_match_property; + sd_device_enumerator_add_match_sysname; + sd_device_enumerator_add_match_tag; + sd_device_enumerator_add_match_parent; + sd_device_enumerator_allow_uninitialized; + + 
sd_hwdb_ref; + sd_hwdb_unref; + + sd_hwdb_new; + + sd_hwdb_get; + + sd_hwdb_seek; + sd_hwdb_enumerate; + + sd_id128_get_boot_app_specific; + + sd_device_monitor_new; + sd_device_monitor_ref; + sd_device_monitor_unref; + + sd_device_monitor_set_receive_buffer_size; + sd_device_monitor_attach_event; + sd_device_monitor_detach_event; + sd_device_monitor_get_event; + sd_device_monitor_get_event_source; + sd_device_monitor_start; + sd_device_monitor_stop; + + sd_device_monitor_filter_add_match_subsystem_devtype; + sd_device_monitor_filter_add_match_tag; + sd_device_monitor_filter_update; + sd_device_monitor_filter_remove; + + sd_event_source_get_floating; + sd_event_source_set_floating; +} LIBSYSTEMD_239; + +LIBSYSTEMD_241 { +global: + sd_bus_close_unref; +} LIBSYSTEMD_240; + +LIBSYSTEMD_243 { +global: + sd_bus_object_vtable_format; + sd_event_source_disable_unref; +} LIBSYSTEMD_241; + +LIBSYSTEMD_245 { +global: + sd_bus_enqueue_for_read; + sd_bus_message_dump; + sd_bus_message_sensitive; + sd_event_add_child_pidfd; + sd_event_source_get_child_pidfd; + sd_event_source_get_child_pidfd_own; + sd_event_source_set_child_pidfd_own; + sd_event_source_get_child_process_own; + sd_event_source_set_child_process_own; + sd_event_source_send_child_signal; + sd_journal_open_namespace; +} LIBSYSTEMD_243; + +LIBSYSTEMD_246 { +global: + sd_bus_interface_name_is_valid; + sd_bus_service_name_is_valid; + sd_bus_member_name_is_valid; + sd_bus_object_path_is_valid; + + sd_bus_call_methodv; + sd_bus_call_method_asyncv; + sd_bus_emit_signalv; + sd_bus_reply_method_errnofv; + sd_bus_reply_method_errorfv; + sd_bus_reply_method_returnv; + sd_bus_set_propertyv; + + sd_path_lookup; + sd_path_lookup_strv; + + sd_notify_barrier; + + sd_journal_enumerate_available_data; + sd_journal_enumerate_available_unique; +} LIBSYSTEMD_245; + +LIBSYSTEMD_247 { +global: + sd_event_add_time_relative; + sd_event_source_set_time_relative; + sd_event_source_get_exit_on_failure; + sd_event_source_set_exit_on_failure; 
+ + sd_bus_error_has_names_sentinel; + + sd_device_get_current_tag_first; + sd_device_get_current_tag_next; + sd_device_has_current_tag; + sd_device_set_sysattr_valuef; +} LIBSYSTEMD_246; + +LIBSYSTEMD_248 { +global: + sd_bus_open_user_machine; + sd_bus_message_send; + + sd_event_source_set_ratelimit; + sd_event_source_get_ratelimit; + sd_event_source_is_ratelimited; + + sd_device_get_action; + sd_device_get_seqnum; + sd_device_new_from_stat_rdev; + sd_device_trigger; +} LIBSYSTEMD_247; + +LIBSYSTEMD_249 { +global: + sd_device_monitor_filter_add_match_sysattr; + sd_device_monitor_filter_add_match_parent; + sd_device_get_usec_initialized; + sd_device_trigger_with_uuid; + sd_device_get_trigger_uuid; + sd_device_new_from_ifname; + sd_device_new_from_ifindex; +} LIBSYSTEMD_248; + +LIBSYSTEMD_250 { +global: + sd_device_get_diskseq; + sd_event_add_inotify_fd; + sd_event_source_set_ratelimit_expire_callback; +} LIBSYSTEMD_249; + +LIBSYSTEMD_251 { +global: + sd_id128_to_uuid_string; + sd_device_new_from_devname; + sd_device_new_from_path; + sd_device_open; + sd_device_enumerator_add_nomatch_sysname; +} LIBSYSTEMD_250; + +LIBSYSTEMD_252 { +global: + sd_bus_message_read_strv_extend; + sd_bus_error_setfv; + + sd_device_new_child; + sd_device_get_child_first; + sd_device_get_child_next; + sd_device_monitor_set_description; + sd_device_monitor_get_description; + + sd_event_set_signal_exit; + + sd_id128_string_equal; + + sd_hwdb_new_from_path; +} LIBSYSTEMD_251; + +LIBSYSTEMD_253 { +global: + sd_bus_emit_signal_to; + sd_bus_emit_signal_tov; + sd_bus_message_new_signal_to; + sd_pidfd_get_cgroup; + sd_pidfd_get_machine_name; + sd_pidfd_get_owner_uid; + sd_pidfd_get_session; + sd_pidfd_get_slice; + sd_pidfd_get_unit; + sd_pidfd_get_user_slice; + sd_pidfd_get_user_unit; +} LIBSYSTEMD_252; + +LIBSYSTEMD_254 { +global: + sd_journal_get_seqnum; + sd_session_get_username; + sd_session_get_start_time; + sd_uid_get_login_time; + sd_pid_notifyf_with_fds; + sd_event_add_memory_pressure; + 
sd_event_source_set_memory_pressure_type; + sd_event_source_set_memory_pressure_period; + sd_event_trim_memory; + sd_pid_notify_barrier; + sd_event_source_leave_ratelimit; + sd_journal_step_one; + sd_session_get_leader; +} LIBSYSTEMD_253; + +LIBSYSTEMD_255 { +global: + sd_id128_get_app_specific; + sd_device_enumerator_add_match_property_required; +} LIBSYSTEMD_254; diff --git a/src/libsystemd/meson.build b/src/libsystemd/meson.build new file mode 100644 index 0000000..5d18f97 --- /dev/null +++ b/src/libsystemd/meson.build @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +sd_journal_sources = files( + 'sd-journal/audit-type.c', + 'sd-journal/catalog.c', + 'sd-journal/journal-file.c', + 'sd-journal/journal-send.c', + 'sd-journal/journal-vacuum.c', + 'sd-journal/journal-verify.c', + 'sd-journal/lookup3.c', + 'sd-journal/mmap-cache.c', + 'sd-journal/sd-journal.c', +) + +if conf.get('HAVE_GCRYPT') == 1 + sd_journal_sources += files( + 'sd-journal/fsprg.c', + 'sd-journal/journal-authenticate.c', + ) +endif + +audit_type_includes = [config_h, + missing_audit_h, + 'linux/audit.h'] +if conf.get('HAVE_AUDIT') == 1 + audit_type_includes += 'libaudit.h' +endif + +generate_audit_type_list = find_program('sd-journal/generate-audit_type-list.sh') +audit_type_list_txt = custom_target( + 'audit_type-list.txt', + output : 'audit_type-list.txt', + command : [generate_audit_type_list, cpp] + audit_type_includes, + capture : true) + +audit_type_to_name = custom_target( + 'audit_type-to-name.h', + input : ['sd-journal/audit_type-to-name.awk', audit_type_list_txt], + output : 'audit_type-to-name.h', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +sd_journal_sources += [audit_type_to_name] + +############################################################ + +id128_sources = files( + 'sd-id128/id128-util.c', + 'sd-id128/sd-id128.c', +) + +############################################################ + +sd_daemon_sources = files('sd-daemon/sd-daemon.c') + 
+############################################################ + +sd_event_sources = files( + 'sd-event/event-util.c', + 'sd-event/sd-event.c', +) + +############################################################ + +sd_login_sources = files('sd-login/sd-login.c') + +############################################################ + +libsystemd_sources = files( + 'sd-bus/bus-common-errors.c', + 'sd-bus/bus-container.c', + 'sd-bus/bus-control.c', + 'sd-bus/bus-convenience.c', + 'sd-bus/bus-creds.c', + 'sd-bus/bus-dump.c', + 'sd-bus/bus-error.c', + 'sd-bus/bus-internal.c', + 'sd-bus/bus-introspect.c', + 'sd-bus/bus-kernel.c', + 'sd-bus/bus-match.c', + 'sd-bus/bus-message.c', + 'sd-bus/bus-objects.c', + 'sd-bus/bus-signature.c', + 'sd-bus/bus-slot.c', + 'sd-bus/bus-socket.c', + 'sd-bus/bus-track.c', + 'sd-bus/bus-type.c', + 'sd-bus/sd-bus.c', + 'sd-device/device-enumerator.c', + 'sd-device/device-filter.c', + 'sd-device/device-monitor.c', + 'sd-device/device-private.c', + 'sd-device/device-util.c', + 'sd-device/sd-device.c', + 'sd-hwdb/sd-hwdb.c', + 'sd-netlink/netlink-genl.c', + 'sd-netlink/netlink-message-nfnl.c', + 'sd-netlink/netlink-message-rtnl.c', + 'sd-netlink/netlink-message.c', + 'sd-netlink/netlink-slot.c', + 'sd-netlink/netlink-socket.c', + 'sd-netlink/netlink-types-genl.c', + 'sd-netlink/netlink-types-nfnl.c', + 'sd-netlink/netlink-types-rtnl.c', + 'sd-netlink/netlink-types.c', + 'sd-netlink/netlink-util.c', + 'sd-netlink/sd-netlink.c', + 'sd-network/network-util.c', + 'sd-network/sd-network.c', + 'sd-path/sd-path.c', + 'sd-resolve/sd-resolve.c', +) + sd_journal_sources + id128_sources + sd_daemon_sources + sd_event_sources + sd_login_sources + +libsystemd_c_args = ['-fvisibility=default'] + +libsystemd_static = static_library( + 'systemd_static', + libsystemd_sources, + include_directories : libsystemd_includes, + c_args : libsystemd_c_args, + link_with : [libbasic, + libbasic_compress], + dependencies : [threads, + librt, + userspace], + build_by_default : 
false) + +libsystemd_dir_path = meson.current_source_dir() + +libsystemd_sym = files('libsystemd.sym') +libsystemd_sym_path = libsystemd_dir_path / 'libsystemd.sym' + +static_libsystemd = get_option('static-libsystemd') +static_libsystemd_pic = static_libsystemd == 'true' or static_libsystemd == 'pic' + +libsystemd_pc = custom_target( + 'libsystemd.pc', + input : 'libsystemd.pc.in', + output : 'libsystemd.pc', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : pkgconfiglibdir != 'no', + install_tag : 'devel', + install_dir : pkgconfiglibdir) + +############################################################ + +simple_tests += files( + 'sd-journal/test-audit-type.c', + 'sd-journal/test-catalog.c', + 'sd-journal/test-journal-file.c', + 'sd-journal/test-journal-init.c', + 'sd-journal/test-journal-match.c', + 'sd-journal/test-journal-send.c', + 'sd-journal/test-mmap-cache.c', +) + +libsystemd_tests += [ + { + 'sources' : files('sd-journal/test-journal-enum.c'), + 'timeout' : 360, + }, +] + +############################################################ + +simple_tests += files( + 'sd-bus/test-bus-creds.c', + 'sd-bus/test-bus-introspect.c', + 'sd-bus/test-bus-match.c', + 'sd-bus/test-bus-vtable.c', + 'sd-device/test-device-util.c', + 'sd-device/test-sd-device-monitor.c', + 'sd-device/test-sd-device.c', + 'sd-event/test-event.c', + 'sd-journal/test-journal-flush.c', + 'sd-journal/test-journal-interleaving.c', + 'sd-journal/test-journal-stream.c', + 'sd-journal/test-journal.c', + 'sd-login/test-login.c', + 'sd-netlink/test-netlink.c', +) + +libsystemd_tests += [ + { + 'sources' : files('sd-bus/test-bus-address.c'), + 'dependencies' : threads + }, + { + 'sources' : files('sd-bus/test-bus-benchmark.c'), + 'dependencies' : threads, + 'type' : 'manual', + }, + { + 'sources' : files('sd-bus/test-bus-chat.c'), + 'dependencies' : threads, + }, + { + 'sources' : files('sd-bus/test-bus-cleanup.c'), + 'dependencies' : [threads, libseccomp], + }, + { + 'sources' : 
files('sd-bus/test-bus-marshal.c'), + 'dependencies' : [ + libdbus, + libgio, + libglib, + libgobject, + libm, + threads, + ], + }, + { + 'sources' : files('sd-bus/test-bus-objects.c'), + 'dependencies' : threads, + }, + { + 'sources' : files('sd-bus/test-bus-peersockaddr.c'), + 'dependencies' : threads, + }, + { + 'sources' : files('sd-bus/test-bus-queue-ref-cycle.c'), + 'dependencies' : threads, + }, + { + 'sources' : files('sd-bus/test-bus-server.c'), + 'dependencies' : threads, + }, + { + 'sources' : files('sd-bus/test-bus-signature.c'), + 'dependencies' : threads, + }, + { + 'sources' : files('sd-bus/test-bus-track.c'), + 'dependencies' : libseccomp, + }, + { + 'sources' : files('sd-bus/test-bus-watch-bind.c'), + 'dependencies' : threads, + 'timeout' : 120, + }, + { + 'sources' : files('sd-journal/test-journal-append.c'), + 'type' : 'manual', + }, + { + 'sources' : files('sd-journal/test-journal-verify.c'), + 'timeout' : 90, + }, + { + 'sources' : files('sd-resolve/test-resolve.c'), + 'dependencies' : threads, + 'timeout' : 120, + }, +] + +if cxx_cmd != '' + simple_tests += files('sd-bus/test-bus-vtable-cc.cc') +endif + +############################################################ + +simple_fuzzers += files( + 'sd-bus/fuzz-bus-match.c', + 'sd-bus/fuzz-bus-message.c', +) diff --git a/src/libsystemd/sd-bus/bus-common-errors.c b/src/libsystemd/sd-bus/bus-common-errors.c new file mode 100644 index 0000000..df26fd7 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-common-errors.c @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> + +#include "sd-bus.h" + +#include "bus-common-errors.h" +#include "bus-error.h" + +BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map bus_common_errors[] = { + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_UNIT, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_UNIT_FOR_PID, ESRCH), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_UNIT_EXISTS, EEXIST), + 
SD_BUS_ERROR_MAP(BUS_ERROR_LOAD_FAILED, EIO), + SD_BUS_ERROR_MAP(BUS_ERROR_BAD_UNIT_SETTING, ENOEXEC), + SD_BUS_ERROR_MAP(BUS_ERROR_JOB_FAILED, EREMOTEIO), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_JOB, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_NOT_SUBSCRIBED, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_ALREADY_SUBSCRIBED, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_ONLY_BY_DEPENDENCY, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_TRANSACTION_JOBS_CONFLICTING, EDEADLK), + SD_BUS_ERROR_MAP(BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC, EDEADLK), + SD_BUS_ERROR_MAP(BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE, EDEADLK), + SD_BUS_ERROR_MAP(BUS_ERROR_UNIT_MASKED, ERFKILL), + SD_BUS_ERROR_MAP(BUS_ERROR_UNIT_GENERATED, EADDRNOTAVAIL), + SD_BUS_ERROR_MAP(BUS_ERROR_UNIT_LINKED, ELOOP), + SD_BUS_ERROR_MAP(BUS_ERROR_JOB_TYPE_NOT_APPLICABLE, EBADR), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_ISOLATION, EPERM), + SD_BUS_ERROR_MAP(BUS_ERROR_SHUTTING_DOWN, ECANCELED), + SD_BUS_ERROR_MAP(BUS_ERROR_SCOPE_NOT_RUNNING, EHOSTDOWN), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_DYNAMIC_USER, ESRCH), + SD_BUS_ERROR_MAP(BUS_ERROR_NOT_REFERENCED, EUNATCH), + SD_BUS_ERROR_MAP(BUS_ERROR_DISK_FULL, ENOSPC), + SD_BUS_ERROR_MAP(BUS_ERROR_FILE_DESCRIPTOR_STORE_DISABLED, + EHOSTDOWN), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_MACHINE, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_IMAGE, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_MACHINE_FOR_PID, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_MACHINE_EXISTS, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_PRIVATE_NETWORKING, ENOSYS), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_USER_MAPPING, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_GROUP_MAPPING, ENXIO), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_PORTABLE_IMAGE, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_BAD_PORTABLE_IMAGE_TYPE, EMEDIUMTYPE), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_SESSION, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SESSION_FOR_PID, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_USER, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_USER_FOR_PID, ENXIO), + 
SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_SEAT, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_SESSION_NOT_ON_SEAT, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_NOT_IN_CONTROL, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_DEVICE_IS_TAKEN, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_DEVICE_NOT_TAKEN, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_OPERATION_IN_PROGRESS, EINPROGRESS), + SD_BUS_ERROR_MAP(BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, EOPNOTSUPP), + SD_BUS_ERROR_MAP(BUS_ERROR_SESSION_BUSY, EBUSY), + SD_BUS_ERROR_MAP(BUS_ERROR_NOT_YOUR_DEVICE, EPERM), + + SD_BUS_ERROR_MAP(BUS_ERROR_AUTOMATIC_TIME_SYNC_ENABLED, EALREADY), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_NTP_SUPPORT, EOPNOTSUPP), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_PROCESS, ESRCH), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_NAME_SERVERS, ESRCH), + SD_BUS_ERROR_MAP(BUS_ERROR_INVALID_REPLY, EINVAL), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_RR, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_CNAME_LOOP, EDEADLK), + SD_BUS_ERROR_MAP(BUS_ERROR_ABORTED, ECANCELED), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_SERVICE, EUNATCH), + SD_BUS_ERROR_MAP(BUS_ERROR_DNSSEC_FAILED, EHOSTUNREACH), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_TRUST_ANCHOR, EHOSTUNREACH), + SD_BUS_ERROR_MAP(BUS_ERROR_RR_TYPE_UNSUPPORTED, EOPNOTSUPP), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_LINK, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_LINK_BUSY, EBUSY), + SD_BUS_ERROR_MAP(BUS_ERROR_NETWORK_DOWN, ENETDOWN), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SOURCE, ESRCH), + SD_BUS_ERROR_MAP(BUS_ERROR_STUB_LOOP, ELOOP), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_DNSSD_SERVICE, ENOENT), + SD_BUS_ERROR_MAP(BUS_ERROR_DNSSD_SERVICE_EXISTS, EEXIST), + + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_FORMERR, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_SERVFAIL, EHOSTDOWN), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_NXDOMAIN, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_NOTIMP, ENOSYS), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_REFUSED, EACCES), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_YXDOMAIN, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_YRRSET, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_NXRRSET, ENOENT), + 
SD_BUS_ERROR_MAP(BUS_ERROR_DNS_NOTAUTH, EACCES), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_NOTZONE, EREMOTE), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADVERS, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADKEY, EKEYREJECTED), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADTIME, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADMODE, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADNAME, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADALG, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADTRUNC, EBADMSG), + SD_BUS_ERROR_MAP(BUS_ERROR_DNS_BADCOOKIE, EBADR), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_TRANSFER, ENXIO), + SD_BUS_ERROR_MAP(BUS_ERROR_TRANSFER_IN_PROGRESS, EBUSY), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_PRODUCT_UUID, EOPNOTSUPP), + SD_BUS_ERROR_MAP(BUS_ERROR_FILE_IS_PROTECTED, EACCES), + SD_BUS_ERROR_MAP(BUS_ERROR_READ_ONLY_FILESYSTEM, EROFS), + + SD_BUS_ERROR_MAP(BUS_ERROR_SPEED_METER_INACTIVE, EOPNOTSUPP), + SD_BUS_ERROR_MAP(BUS_ERROR_UNMANAGED_INTERFACE, EOPNOTSUPP), + + SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_HOME, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_UID_IN_USE, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_USER_NAME_EXISTS, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_EXISTS, EEXIST), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_ALREADY_ACTIVE, EALREADY), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_ALREADY_FIXATED, EALREADY), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_UNFIXATED, EADDRNOTAVAIL), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_NOT_ACTIVE, EALREADY), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_ABSENT, EREMOTE), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_BUSY, EBUSY), + SD_BUS_ERROR_MAP(BUS_ERROR_BAD_PASSWORD, ENOKEY), + SD_BUS_ERROR_MAP(BUS_ERROR_LOW_PASSWORD_QUALITY, EUCLEAN), + SD_BUS_ERROR_MAP(BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN, EBADSLT), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_PIN_NEEDED, ENOANO), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_PROTECTED_AUTHENTICATION_PATH_NEEDED, ERFKILL), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_USER_PRESENCE_NEEDED, EMEDIUMTYPE), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_USER_VERIFICATION_NEEDED, ENOCSI), + 
SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_ACTION_TIMEOUT, ENOSTR), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_PIN_LOCKED, EOWNERDEAD), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_BAD_PIN, ENOLCK), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_BAD_PIN_FEW_TRIES_LEFT, ETOOMANYREFS), + SD_BUS_ERROR_MAP(BUS_ERROR_TOKEN_BAD_PIN_ONE_TRY_LEFT, EUCLEAN), + SD_BUS_ERROR_MAP(BUS_ERROR_BAD_SIGNATURE, EKEYREJECTED), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_RECORD_MISMATCH, EUCLEAN), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_RECORD_DOWNGRADE, ESTALE), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_RECORD_SIGNED, EROFS), + SD_BUS_ERROR_MAP(BUS_ERROR_BAD_HOME_SIZE, ERANGE), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_PRIVATE_KEY, ENOPKG), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_LOCKED, ENOEXEC), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_NOT_LOCKED, ENOEXEC), + SD_BUS_ERROR_MAP(BUS_ERROR_TOO_MANY_OPERATIONS, ENOBUFS), + SD_BUS_ERROR_MAP(BUS_ERROR_AUTHENTICATION_LIMIT_HIT, ETOOMANYREFS), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_CANT_AUTHENTICATE, EKEYREVOKED), + SD_BUS_ERROR_MAP(BUS_ERROR_HOME_IN_USE, EADDRINUSE), + SD_BUS_ERROR_MAP(BUS_ERROR_REBALANCE_NOT_NEEDED, EALREADY), + + SD_BUS_ERROR_MAP_END +}; diff --git a/src/libsystemd/sd-bus/bus-common-errors.h b/src/libsystemd/sd-bus/bus-common-errors.h new file mode 100644 index 0000000..3a0eef4 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-common-errors.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-error.h" + +#define BUS_ERROR_NO_SUCH_UNIT "org.freedesktop.systemd1.NoSuchUnit" +#define BUS_ERROR_NO_UNIT_FOR_PID "org.freedesktop.systemd1.NoUnitForPID" +#define BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID "org.freedesktop.systemd1.NoUnitForInvocationID" +#define BUS_ERROR_UNIT_EXISTS "org.freedesktop.systemd1.UnitExists" +#define BUS_ERROR_LOAD_FAILED "org.freedesktop.systemd1.LoadFailed" +#define BUS_ERROR_BAD_UNIT_SETTING "org.freedesktop.systemd1.BadUnitSetting" +#define BUS_ERROR_JOB_FAILED "org.freedesktop.systemd1.JobFailed" +#define BUS_ERROR_NO_SUCH_JOB 
"org.freedesktop.systemd1.NoSuchJob" +#define BUS_ERROR_NOT_SUBSCRIBED "org.freedesktop.systemd1.NotSubscribed" +#define BUS_ERROR_ALREADY_SUBSCRIBED "org.freedesktop.systemd1.AlreadySubscribed" +#define BUS_ERROR_ONLY_BY_DEPENDENCY "org.freedesktop.systemd1.OnlyByDependency" +#define BUS_ERROR_TRANSACTION_JOBS_CONFLICTING "org.freedesktop.systemd1.TransactionJobsConflicting" +#define BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC "org.freedesktop.systemd1.TransactionOrderIsCyclic" +#define BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE "org.freedesktop.systemd1.TransactionIsDestructive" +#define BUS_ERROR_UNIT_MASKED "org.freedesktop.systemd1.UnitMasked" +#define BUS_ERROR_UNIT_GENERATED "org.freedesktop.systemd1.UnitGenerated" +#define BUS_ERROR_UNIT_LINKED "org.freedesktop.systemd1.UnitLinked" +#define BUS_ERROR_UNIT_BAD_PATH "org.freedesktop.systemd1.UnitBadPath" +#define BUS_ERROR_JOB_TYPE_NOT_APPLICABLE "org.freedesktop.systemd1.JobTypeNotApplicable" +#define BUS_ERROR_NO_ISOLATION "org.freedesktop.systemd1.NoIsolation" +#define BUS_ERROR_SHUTTING_DOWN "org.freedesktop.systemd1.ShuttingDown" +#define BUS_ERROR_SCOPE_NOT_RUNNING "org.freedesktop.systemd1.ScopeNotRunning" +#define BUS_ERROR_NO_SUCH_DYNAMIC_USER "org.freedesktop.systemd1.NoSuchDynamicUser" +#define BUS_ERROR_NOT_REFERENCED "org.freedesktop.systemd1.NotReferenced" +#define BUS_ERROR_DISK_FULL "org.freedesktop.systemd1.DiskFull" +#define BUS_ERROR_NOTHING_TO_CLEAN "org.freedesktop.systemd1.NothingToClean" +#define BUS_ERROR_UNIT_BUSY "org.freedesktop.systemd1.UnitBusy" +#define BUS_ERROR_UNIT_INACTIVE "org.freedesktop.systemd1.UnitInactive" +#define BUS_ERROR_FREEZE_CANCELLED "org.freedesktop.systemd1.FreezeCancelled" +#define BUS_ERROR_FILE_DESCRIPTOR_STORE_DISABLED \ + "org.freedesktop.systemd1.FileDescriptorStoreDisabled" + +#define BUS_ERROR_NO_SUCH_MACHINE "org.freedesktop.machine1.NoSuchMachine" +#define BUS_ERROR_NO_SUCH_IMAGE "org.freedesktop.machine1.NoSuchImage" +#define BUS_ERROR_NO_MACHINE_FOR_PID 
"org.freedesktop.machine1.NoMachineForPID" +#define BUS_ERROR_MACHINE_EXISTS "org.freedesktop.machine1.MachineExists" +#define BUS_ERROR_NO_PRIVATE_NETWORKING "org.freedesktop.machine1.NoPrivateNetworking" +#define BUS_ERROR_NO_SUCH_USER_MAPPING "org.freedesktop.machine1.NoSuchUserMapping" +#define BUS_ERROR_NO_SUCH_GROUP_MAPPING "org.freedesktop.machine1.NoSuchGroupMapping" + +#define BUS_ERROR_NO_SUCH_PORTABLE_IMAGE "org.freedesktop.portable1.NoSuchImage" +#define BUS_ERROR_BAD_PORTABLE_IMAGE_TYPE "org.freedesktop.portable1.BadImageType" + +#define BUS_ERROR_NO_SUCH_SESSION "org.freedesktop.login1.NoSuchSession" +#define BUS_ERROR_NO_SESSION_FOR_PID "org.freedesktop.login1.NoSessionForPID" +#define BUS_ERROR_NO_SUCH_USER "org.freedesktop.login1.NoSuchUser" +#define BUS_ERROR_NO_USER_FOR_PID "org.freedesktop.login1.NoUserForPID" +#define BUS_ERROR_NO_SUCH_SEAT "org.freedesktop.login1.NoSuchSeat" +#define BUS_ERROR_SESSION_NOT_ON_SEAT "org.freedesktop.login1.SessionNotOnSeat" +#define BUS_ERROR_NOT_IN_CONTROL "org.freedesktop.login1.NotInControl" +#define BUS_ERROR_DEVICE_IS_TAKEN "org.freedesktop.login1.DeviceIsTaken" +#define BUS_ERROR_DEVICE_NOT_TAKEN "org.freedesktop.login1.DeviceNotTaken" +#define BUS_ERROR_OPERATION_IN_PROGRESS "org.freedesktop.login1.OperationInProgress" +#define BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED "org.freedesktop.login1.SleepVerbNotSupported" +#define BUS_ERROR_SESSION_BUSY "org.freedesktop.login1.SessionBusy" +#define BUS_ERROR_NOT_YOUR_DEVICE "org.freedesktop.login1.NotYourDevice" + +#define BUS_ERROR_AUTOMATIC_TIME_SYNC_ENABLED "org.freedesktop.timedate1.AutomaticTimeSyncEnabled" +#define BUS_ERROR_NO_NTP_SUPPORT "org.freedesktop.timedate1.NoNTPSupport" + +#define BUS_ERROR_NO_SUCH_PROCESS "org.freedesktop.systemd1.NoSuchProcess" + +#define BUS_ERROR_NO_NAME_SERVERS "org.freedesktop.resolve1.NoNameServers" +#define BUS_ERROR_INVALID_REPLY "org.freedesktop.resolve1.InvalidReply" +#define BUS_ERROR_NO_SUCH_RR 
"org.freedesktop.resolve1.NoSuchRR" +#define BUS_ERROR_CNAME_LOOP "org.freedesktop.resolve1.CNameLoop" +#define BUS_ERROR_ABORTED "org.freedesktop.resolve1.Aborted" +#define BUS_ERROR_NO_SUCH_SERVICE "org.freedesktop.resolve1.NoSuchService" +#define BUS_ERROR_DNSSEC_FAILED "org.freedesktop.resolve1.DnssecFailed" +#define BUS_ERROR_NO_TRUST_ANCHOR "org.freedesktop.resolve1.NoTrustAnchor" +#define BUS_ERROR_RR_TYPE_UNSUPPORTED "org.freedesktop.resolve1.ResourceRecordTypeUnsupported" +#define BUS_ERROR_NO_SUCH_LINK "org.freedesktop.resolve1.NoSuchLink" +#define BUS_ERROR_LINK_BUSY "org.freedesktop.resolve1.LinkBusy" +#define BUS_ERROR_NETWORK_DOWN "org.freedesktop.resolve1.NetworkDown" +#define BUS_ERROR_NO_SOURCE "org.freedesktop.resolve1.NoSource" +#define BUS_ERROR_STUB_LOOP "org.freedesktop.resolve1.StubLoop" +#define BUS_ERROR_NO_SUCH_DNSSD_SERVICE "org.freedesktop.resolve1.NoSuchDnssdService" +#define BUS_ERROR_DNSSD_SERVICE_EXISTS "org.freedesktop.resolve1.DnssdServiceExists" + +#define _BUS_ERROR_DNS "org.freedesktop.resolve1.DnsError." 
+#define BUS_ERROR_DNS_FORMERR _BUS_ERROR_DNS "FORMERR" +#define BUS_ERROR_DNS_SERVFAIL _BUS_ERROR_DNS "SERVFAIL" +#define BUS_ERROR_DNS_NXDOMAIN _BUS_ERROR_DNS "NXDOMAIN" +#define BUS_ERROR_DNS_NOTIMP _BUS_ERROR_DNS "NOTIMP" +#define BUS_ERROR_DNS_REFUSED _BUS_ERROR_DNS "REFUSED" +#define BUS_ERROR_DNS_YXDOMAIN _BUS_ERROR_DNS "YXDOMAIN" +#define BUS_ERROR_DNS_YRRSET _BUS_ERROR_DNS "YRRSET" +#define BUS_ERROR_DNS_NXRRSET _BUS_ERROR_DNS "NXRRSET" +#define BUS_ERROR_DNS_NOTAUTH _BUS_ERROR_DNS "NOTAUTH" +#define BUS_ERROR_DNS_NOTZONE _BUS_ERROR_DNS "NOTZONE" +#define BUS_ERROR_DNS_BADVERS _BUS_ERROR_DNS "BADVERS" +#define BUS_ERROR_DNS_BADKEY _BUS_ERROR_DNS "BADKEY" +#define BUS_ERROR_DNS_BADTIME _BUS_ERROR_DNS "BADTIME" +#define BUS_ERROR_DNS_BADMODE _BUS_ERROR_DNS "BADMODE" +#define BUS_ERROR_DNS_BADNAME _BUS_ERROR_DNS "BADNAME" +#define BUS_ERROR_DNS_BADALG _BUS_ERROR_DNS "BADALG" +#define BUS_ERROR_DNS_BADTRUNC _BUS_ERROR_DNS "BADTRUNC" +#define BUS_ERROR_DNS_BADCOOKIE _BUS_ERROR_DNS "BADCOOKIE" + +#define BUS_ERROR_NO_SUCH_TRANSFER "org.freedesktop.import1.NoSuchTransfer" +#define BUS_ERROR_TRANSFER_IN_PROGRESS "org.freedesktop.import1.TransferInProgress" + +#define BUS_ERROR_NO_PRODUCT_UUID "org.freedesktop.hostname1.NoProductUUID" +#define BUS_ERROR_FILE_IS_PROTECTED "org.freedesktop.hostname1.FileIsProtected" +#define BUS_ERROR_READ_ONLY_FILESYSTEM "org.freedesktop.hostname1.ReadOnlyFilesystem" + +#define BUS_ERROR_SPEED_METER_INACTIVE "org.freedesktop.network1.SpeedMeterInactive" +#define BUS_ERROR_UNMANAGED_INTERFACE "org.freedesktop.network1.UnmanagedInterface" + +#define BUS_ERROR_NO_SUCH_HOME "org.freedesktop.home1.NoSuchHome" +#define BUS_ERROR_UID_IN_USE "org.freedesktop.home1.UIDInUse" +#define BUS_ERROR_USER_NAME_EXISTS "org.freedesktop.home1.UserNameExists" +#define BUS_ERROR_HOME_EXISTS "org.freedesktop.home1.HomeExists" +#define BUS_ERROR_HOME_ALREADY_ACTIVE "org.freedesktop.home1.HomeAlreadyActive" +#define BUS_ERROR_HOME_ALREADY_FIXATED 
"org.freedesktop.home1.HomeAlreadyFixated" +#define BUS_ERROR_HOME_UNFIXATED "org.freedesktop.home1.HomeUnfixated" +#define BUS_ERROR_HOME_NOT_ACTIVE "org.freedesktop.home1.HomeNotActive" +#define BUS_ERROR_HOME_ABSENT "org.freedesktop.home1.HomeAbsent" +#define BUS_ERROR_HOME_BUSY "org.freedesktop.home1.HomeBusy" +#define BUS_ERROR_BAD_PASSWORD "org.freedesktop.home1.BadPassword" +#define BUS_ERROR_BAD_RECOVERY_KEY "org.freedesktop.home1.BadRecoveryKey" +#define BUS_ERROR_LOW_PASSWORD_QUALITY "org.freedesktop.home1.LowPasswordQuality" +#define BUS_ERROR_BAD_PASSWORD_AND_NO_TOKEN "org.freedesktop.home1.BadPasswordAndNoToken" +#define BUS_ERROR_TOKEN_PIN_NEEDED "org.freedesktop.home1.TokenPinNeeded" +#define BUS_ERROR_TOKEN_PROTECTED_AUTHENTICATION_PATH_NEEDED \ + "org.freedesktop.home1.TokenProtectedAuthenticationPathNeeded" +#define BUS_ERROR_TOKEN_USER_PRESENCE_NEEDED "org.freedesktop.home1.TokenUserPresenceNeeded" +#define BUS_ERROR_TOKEN_USER_VERIFICATION_NEEDED \ + "org.freedesktop.home1.TokenUserVerificationNeeded" +#define BUS_ERROR_TOKEN_ACTION_TIMEOUT "org.freedesktop.home1.TokenActionTimeout" +#define BUS_ERROR_TOKEN_PIN_LOCKED "org.freedesktop.home1.TokenPinLocked" +#define BUS_ERROR_TOKEN_BAD_PIN "org.freedesktop.home1.BadPin" +#define BUS_ERROR_TOKEN_BAD_PIN_FEW_TRIES_LEFT "org.freedesktop.home1.BadPinFewTriesLeft" +#define BUS_ERROR_TOKEN_BAD_PIN_ONE_TRY_LEFT "org.freedesktop.home1.BadPinOneTryLeft" +#define BUS_ERROR_BAD_SIGNATURE "org.freedesktop.home1.BadSignature" +#define BUS_ERROR_HOME_RECORD_MISMATCH "org.freedesktop.home1.RecordMismatch" +#define BUS_ERROR_HOME_RECORD_DOWNGRADE "org.freedesktop.home1.RecordDowngrade" +#define BUS_ERROR_HOME_RECORD_SIGNED "org.freedesktop.home1.RecordSigned" +#define BUS_ERROR_BAD_HOME_SIZE "org.freedesktop.home1.BadHomeSize" +#define BUS_ERROR_NO_PRIVATE_KEY "org.freedesktop.home1.NoPrivateKey" +#define BUS_ERROR_HOME_LOCKED "org.freedesktop.home1.HomeLocked" +#define BUS_ERROR_HOME_NOT_LOCKED 
"org.freedesktop.home1.HomeNotLocked" +#define BUS_ERROR_NO_DISK_SPACE "org.freedesktop.home1.NoDiskSpace" +#define BUS_ERROR_TOO_MANY_OPERATIONS "org.freedesktop.home1.TooManyOperations" +#define BUS_ERROR_AUTHENTICATION_LIMIT_HIT "org.freedesktop.home1.AuthenticationLimitHit" +#define BUS_ERROR_HOME_CANT_AUTHENTICATE "org.freedesktop.home1.HomeCantAuthenticate" +#define BUS_ERROR_HOME_IN_USE "org.freedesktop.home1.HomeInUse" +#define BUS_ERROR_REBALANCE_NOT_NEEDED "org.freedesktop.home1.RebalanceNotNeeded" + +BUS_ERROR_MAP_ELF_USE(bus_common_errors); diff --git a/src/libsystemd/sd-bus/bus-container.c b/src/libsystemd/sd-bus/bus-container.c new file mode 100644 index 0000000..4146a6e --- /dev/null +++ b/src/libsystemd/sd-bus/bus-container.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "bus-container.h" +#include "bus-internal.h" +#include "bus-socket.h" +#include "fd-util.h" +#include "namespace-util.h" +#include "process-util.h" +#include "string-util.h" + +int bus_container_connect_socket(sd_bus *b) { + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF; + int r, error_buf = 0; + pid_t child; + ssize_t n; + + assert(b); + assert(b->input_fd < 0); + assert(b->output_fd < 0); + assert(b->nspid > 0 || b->machine); + + if (b->nspid <= 0) { + log_debug("sd-bus: connecting bus%s%s to machine %s...", + b->description ? " " : "", strempty(b->description), b->machine); + + r = container_get_leader(b->machine, &b->nspid); + if (r < 0) + return r; + } else + log_debug("sd-bus: connecting bus%s%s to namespace of PID "PID_FMT"...", + b->description ? 
" " : "", strempty(b->description), b->nspid); + + r = namespace_open(b->nspid, &pidnsfd, &mntnsfd, NULL, &usernsfd, &rootfd); + if (r < 0) + return log_debug_errno(r, "Failed to open namespace of PID "PID_FMT": %m", b->nspid); + + b->input_fd = socket(b->sockaddr.sa.sa_family, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (b->input_fd < 0) + return log_debug_errno(errno, "Failed to create a socket: %m"); + + b->input_fd = fd_move_above_stdio(b->input_fd); + + b->output_fd = b->input_fd; + + bus_socket_setup(b); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_debug_errno(errno, "Failed to create a socket pair: %m"); + + r = namespace_fork("(sd-buscntrns)", "(sd-buscntr)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + pidnsfd, mntnsfd, -1, usernsfd, rootfd, &child); + if (r < 0) + return log_debug_errno(r, "Failed to create namespace for (sd-buscntr): %m"); + if (r == 0) { + pair[0] = safe_close(pair[0]); + + r = connect(b->input_fd, &b->sockaddr.sa, b->sockaddr_size); + if (r < 0) { + /* Try to send error up */ + error_buf = errno; + (void) write(pair[1], &error_buf, sizeof(error_buf)); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + r = wait_for_terminate_and_check("(sd-buscntrns)", child, 0); + if (r < 0) + return r; + bool nonzero_exit_status = r != EXIT_SUCCESS; + + n = read(pair[0], &error_buf, sizeof(error_buf)); + if (n < 0) + return log_debug_errno(errno, "Failed to read error status from (sd-buscntr): %m"); + + if (n > 0) { + if (n != sizeof(error_buf)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Read error status of unexpected length %zd from (sd-buscntr): %m", n); + + if (error_buf < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Got unexpected error status from (sd-buscntr): %m"); + + if (error_buf == EINPROGRESS) + return 1; + + if (error_buf > 0) + return log_debug_errno(error_buf, "(sd-buscntr) failed to connect to D-Bus socket: %m"); + } + + 
if (nonzero_exit_status) + return -EPROTO; + + return bus_socket_start_auth(b); +} diff --git a/src/libsystemd/sd-bus/bus-container.h b/src/libsystemd/sd-bus/bus-container.h new file mode 100644 index 0000000..cb503a5 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-container.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +int bus_container_connect_socket(sd_bus *b); diff --git a/src/libsystemd/sd-bus/bus-control.c b/src/libsystemd/sd-bus/bus-control.c new file mode 100644 index 0000000..1355e41 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-control.c @@ -0,0 +1,1038 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_VALGRIND_MEMCHECK_H +#include +#endif + +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-control.h" +#include "bus-internal.h" +#include "bus-message.h" +#include "capability-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +_public_ int sd_bus_get_unique_name(sd_bus *bus, const char **unique) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(unique, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!bus->bus_client) + return -EINVAL; + + r = bus_ensure_running(bus); + if (r < 0) + return r; + + *unique = bus->unique_name; + return 0; +} + +static int validate_request_name_parameters( + sd_bus *bus, + const char *name, + uint64_t flags, + uint32_t *ret_param) { + + uint32_t param = 0; + + assert(bus); + assert(name); + assert(ret_param); + + assert_return(!(flags & ~(SD_BUS_NAME_ALLOW_REPLACEMENT|SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_QUEUE)), -EINVAL); + assert_return(service_name_is_valid(name), -EINVAL); + assert_return(name[0] != ':', -EINVAL); + + if (!bus->bus_client) + return -EINVAL; + + /* Don't allow requesting the special driver and local names */ + if (STR_IN_SET(name,
"org.freedesktop.DBus", "org.freedesktop.DBus.Local")) + return -EINVAL; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (flags & SD_BUS_NAME_ALLOW_REPLACEMENT) + param |= BUS_NAME_ALLOW_REPLACEMENT; + if (flags & SD_BUS_NAME_REPLACE_EXISTING) + param |= BUS_NAME_REPLACE_EXISTING; + if (!(flags & SD_BUS_NAME_QUEUE)) + param |= BUS_NAME_DO_NOT_QUEUE; + + *ret_param = param; + + return 0; +} + +_public_ int sd_bus_request_name( + sd_bus *bus, + const char *name, + uint64_t flags) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + uint32_t ret, param = 0; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(name, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + r = validate_request_name_parameters(bus, name, flags, &param); + if (r < 0) + return r; + + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "RequestName", + NULL, + &reply, + "su", + name, + param); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "u", &ret); + if (r < 0) + return r; + + switch (ret) { + + case BUS_NAME_ALREADY_OWNER: + return -EALREADY; + + case BUS_NAME_EXISTS: + return -EEXIST; + + case BUS_NAME_IN_QUEUE: + return 0; + + case BUS_NAME_PRIMARY_OWNER: + return 1; + } + + return -EIO; +} + +static int default_request_name_handler( + sd_bus_message *m, + void *userdata, + sd_bus_error *ret_error) { + + uint32_t ret; + int r; + + assert(m); + + if (sd_bus_message_is_method_error(m, NULL)) { + log_debug_errno(sd_bus_message_get_errno(m), + "Unable to request name, failing connection: %s", + sd_bus_message_get_error(m)->message); + + bus_enter_closing(sd_bus_message_get_bus(m)); + return 1; + } + + r = sd_bus_message_read(m, "u", &ret); + if (r < 0) + return r; + + switch (ret) { + + case BUS_NAME_ALREADY_OWNER: + log_debug("Already owner of requested service name, ignoring."); + return 1; + + case BUS_NAME_IN_QUEUE: +
log_debug("In queue for requested service name."); + return 1; + + case BUS_NAME_PRIMARY_OWNER: + log_debug("Successfully acquired requested service name."); + return 1; + + case BUS_NAME_EXISTS: + log_debug("Requested service name already owned, failing connection."); + bus_enter_closing(sd_bus_message_get_bus(m)); + return 1; + } + + log_debug("Unexpected response from RequestName(), failing connection."); + bus_enter_closing(sd_bus_message_get_bus(m)); + return 1; +} + +_public_ int sd_bus_request_name_async( + sd_bus *bus, + sd_bus_slot **ret_slot, + const char *name, + uint64_t flags, + sd_bus_message_handler_t callback, + void *userdata) { + + uint32_t param = 0; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(name, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + r = validate_request_name_parameters(bus, name, flags, &param); + if (r < 0) + return r; + + return sd_bus_call_method_async( + bus, + ret_slot, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "RequestName", + callback ?: default_request_name_handler, + userdata, + "su", + name, + param); +} + +static int validate_release_name_parameters( + sd_bus *bus, + const char *name) { + + assert(bus); + assert(name); + + assert_return(service_name_is_valid(name), -EINVAL); + assert_return(name[0] != ':', -EINVAL); + + if (!bus->bus_client) + return -EINVAL; + + /* Don't allow releasing the special driver and local names */ + if (STR_IN_SET(name, "org.freedesktop.DBus", "org.freedesktop.DBus.Local")) + return -EINVAL; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + return 0; +} + +_public_ int sd_bus_release_name( + sd_bus *bus, + const char *name) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + uint32_t ret; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(name, -EINVAL); + assert_return(!bus_origin_changed(bus),
-ECHILD); + + r = validate_release_name_parameters(bus, name); + if (r < 0) + return r; + + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "ReleaseName", + NULL, + &reply, + "s", + name); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "u", &ret); + if (r < 0) + return r; + + switch (ret) { + + case BUS_NAME_NON_EXISTENT: + return -ESRCH; + + case BUS_NAME_NOT_OWNER: + return -EADDRINUSE; + + case BUS_NAME_RELEASED: + return 0; + } + + return -EIO; +} + +static int default_release_name_handler( + sd_bus_message *m, + void *userdata, + sd_bus_error *ret_error) { + + uint32_t ret; + int r; + + assert(m); + + if (sd_bus_message_is_method_error(m, NULL)) { + log_debug_errno(sd_bus_message_get_errno(m), + "Unable to release name, failing connection: %s", + sd_bus_message_get_error(m)->message); + + bus_enter_closing(sd_bus_message_get_bus(m)); + return 1; + } + + r = sd_bus_message_read(m, "u", &ret); + if (r < 0) + return r; + + switch (ret) { + + case BUS_NAME_NON_EXISTENT: + log_debug("Name asked to release is not taken currently, ignoring."); + return 1; + + case BUS_NAME_NOT_OWNER: + log_debug("Name asked to release is owned by somebody else, ignoring."); + return 1; + + case BUS_NAME_RELEASED: + log_debug("Name successfully released."); + return 1; + } + + log_debug("Unexpected response from ReleaseName(), failing connection."); + bus_enter_closing(sd_bus_message_get_bus(m)); + return 1; +} + +_public_ int sd_bus_release_name_async( + sd_bus *bus, + sd_bus_slot **ret_slot, + const char *name, + sd_bus_message_handler_t callback, + void *userdata) { + + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(name, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + r = validate_release_name_parameters(bus, name); + if (r < 0) + return r; + + return sd_bus_call_method_async( + bus, + ret_slot, + "org.freedesktop.DBus", + 
"/org/freedesktop/DBus", + "org.freedesktop.DBus", + "ReleaseName", + callback ?: default_release_name_handler, + userdata, + "s", + name); +} + +_public_ int sd_bus_list_names(sd_bus *bus, char ***acquired, char ***activatable) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_strv_free_ char **x = NULL, **y = NULL; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(acquired || activatable, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!bus->bus_client) + return -EINVAL; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (acquired) { + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "ListNames", + NULL, + &reply, + NULL); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(reply, &x); + if (r < 0) + return r; + + reply = sd_bus_message_unref(reply); + } + + if (activatable) { + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "ListActivatableNames", + NULL, + &reply, + NULL); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(reply, &y); + if (r < 0) + return r; + + *activatable = TAKE_PTR(y); + } + + if (acquired) + *acquired = TAKE_PTR(x); + + return 0; +} + +_public_ int sd_bus_get_name_creds( + sd_bus *bus, + const char *name, + uint64_t mask, + sd_bus_creds **creds) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply_unique = NULL, *reply = NULL; + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL; + const char *unique; + pid_t pid = 0; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(name, -EINVAL); + assert_return((mask & ~SD_BUS_CREDS_AUGMENT) <= _SD_BUS_CREDS_ALL, -EOPNOTSUPP); + assert_return(mask == 0 || creds, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + assert_return(service_name_is_valid(name), -EINVAL); + 
+ if (!bus->bus_client) + return -EINVAL; + + /* Turn off augmenting if this isn't a local connection. If the connection is not local, then /proc is not + * going to match. */ + if (!bus->is_local) + mask &= ~SD_BUS_CREDS_AUGMENT; + + if (streq(name, "org.freedesktop.DBus.Local")) + return -EINVAL; + + if (streq(name, "org.freedesktop.DBus")) + return sd_bus_get_owner_creds(bus, mask, creds); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + /* If the name is unique anyway, we can use it directly */ + unique = name[0] == ':' ? name : NULL; + + /* Only query the owner if the caller wants to know it and the name is not unique anyway, or if the caller just + * wants to check whether a name exists */ + if ((FLAGS_SET(mask, SD_BUS_CREDS_UNIQUE_NAME) && !unique) || mask == 0) { + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetNameOwner", + NULL, + &reply_unique, + "s", + name); + if (r < 0) + return r; + + r = sd_bus_message_read(reply_unique, "s", &unique); + if (r < 0) + return r; + } + + if (mask != 0) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool need_pid, need_uid, need_selinux, need_separate_calls; + + c = bus_creds_new(); + if (!c) + return -ENOMEM; + + if ((mask & SD_BUS_CREDS_UNIQUE_NAME) && unique) { + c->unique_name = strdup(unique); + if (!c->unique_name) + return -ENOMEM; + + c->mask |= SD_BUS_CREDS_UNIQUE_NAME; + } + + need_pid = (mask & SD_BUS_CREDS_PID) || + ((mask & SD_BUS_CREDS_AUGMENT) && + (mask & (SD_BUS_CREDS_UID|SD_BUS_CREDS_SUID|SD_BUS_CREDS_FSUID| + SD_BUS_CREDS_GID|SD_BUS_CREDS_EGID|SD_BUS_CREDS_SGID|SD_BUS_CREDS_FSGID| + SD_BUS_CREDS_SUPPLEMENTARY_GIDS| + SD_BUS_CREDS_COMM|SD_BUS_CREDS_EXE|SD_BUS_CREDS_CMDLINE| + SD_BUS_CREDS_CGROUP|SD_BUS_CREDS_UNIT|SD_BUS_CREDS_USER_UNIT|SD_BUS_CREDS_SLICE|SD_BUS_CREDS_SESSION|SD_BUS_CREDS_OWNER_UID| + 
SD_BUS_CREDS_EFFECTIVE_CAPS|SD_BUS_CREDS_PERMITTED_CAPS|SD_BUS_CREDS_INHERITABLE_CAPS|SD_BUS_CREDS_BOUNDING_CAPS| + SD_BUS_CREDS_SELINUX_CONTEXT| + SD_BUS_CREDS_AUDIT_SESSION_ID|SD_BUS_CREDS_AUDIT_LOGIN_UID))); + need_uid = mask & SD_BUS_CREDS_EUID; + need_selinux = mask & SD_BUS_CREDS_SELINUX_CONTEXT; + + if (need_pid + need_uid + need_selinux > 1) { + + /* If we need more than one of the credentials, then use GetConnectionCredentials() */ + + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionCredentials", + &error, + &reply, + "s", + unique ?: name); + + if (r < 0) { + + if (!sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)) + return r; + + /* If we got an unknown method error, fall back to the individual calls... */ + need_separate_calls = true; + sd_bus_error_free(&error); + + } else { + need_separate_calls = false; + + r = sd_bus_message_enter_container(reply, 'a', "{sv}"); + if (r < 0) + return r; + + for (;;) { + const char *m; + + r = sd_bus_message_enter_container(reply, 'e', "sv"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(reply, "s", &m); + if (r < 0) + return r; + + if (need_uid && streq(m, "UnixUserID")) { + uint32_t u; + + r = sd_bus_message_read(reply, "v", "u", &u); + if (r < 0) + return r; + + c->euid = u; + c->mask |= SD_BUS_CREDS_EUID; + + } else if (need_pid && streq(m, "ProcessID")) { + uint32_t p; + + r = sd_bus_message_read(reply, "v", "u", &p); + if (r < 0) + return r; + + pid = p; + if (mask & SD_BUS_CREDS_PID) { + c->pid = p; + c->mask |= SD_BUS_CREDS_PID; + } + + } else if (need_selinux && streq(m, "LinuxSecurityLabel")) { + const void *p = NULL; + size_t sz = 0; + + r = sd_bus_message_enter_container(reply, 'v', "ay"); + if (r < 0) + return r; + + r = sd_bus_message_read_array(reply, 'y', &p, &sz); + if (r < 0) + return r; + + r = free_and_strndup(&c->label, p, sz); + if (r < 0) + return r; + + c->mask |= 
SD_BUS_CREDS_SELINUX_CONTEXT; + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + } else { + r = sd_bus_message_skip(reply, "v"); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + if (need_pid && pid == 0) + return -EPROTO; + } + + } else /* When we only need a single field, then let's use separate calls */ + need_separate_calls = true; + + if (need_separate_calls) { + if (need_pid) { + uint32_t u; + + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionUnixProcessID", + NULL, + &reply, + "s", + unique ?: name); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "u", &u); + if (r < 0) + return r; + + pid = u; + if (mask & SD_BUS_CREDS_PID) { + c->pid = u; + c->mask |= SD_BUS_CREDS_PID; + } + + reply = sd_bus_message_unref(reply); + } + + if (need_uid) { + uint32_t u; + + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionUnixUser", + NULL, + &reply, + "s", + unique ?: name); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "u", &u); + if (r < 0) + return r; + + c->euid = u; + c->mask |= SD_BUS_CREDS_EUID; + + reply = sd_bus_message_unref(reply); + } + + if (need_selinux) { + const void *p = NULL; + size_t sz = 0; + + r = sd_bus_call_method( + bus, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionSELinuxSecurityContext", + &error, + &reply, + "s", + unique ?: name); + if (r < 0) { + if (!sd_bus_error_has_name(&error, SD_BUS_ERROR_SELINUX_SECURITY_CONTEXT_UNKNOWN)) + return r; + + /* no data is fine */ + } else { + r = sd_bus_message_read_array(reply, 'y', &p, &sz); + if (r < 0) + return r; + + c->label = memdup_suffix0(p, sz); + if (!c->label) + return -ENOMEM; + + c->mask |= SD_BUS_CREDS_SELINUX_CONTEXT; + 
} + } + } + + r = bus_creds_add_more(c, mask, pid, 0); + if (r < 0 && r != -ESRCH) /* Return the error, but ignore ESRCH which just means the process is already gone */ + return r; + } + + if (creds) + *creds = TAKE_PTR(c); + + return 0; +} + +static int parse_sockaddr_string(const char *t, char **ret_comm, char **ret_description) { + _cleanup_free_ char *comm = NULL, *description = NULL; + const char *e, *sl; + + assert(t); + assert(ret_comm); + assert(ret_description); + + e = strstrafter(t, "/bus/"); + if (!e) { + log_debug("Didn't find /bus/ substring in peer socket address, ignoring."); + goto not_found; + } + + sl = strchr(e, '/'); + if (!sl) { + log_debug("Didn't find / substring after /bus/ in peer socket address, ignoring."); + goto not_found; + } + + if (sl - e > 0) { + comm = strndup(e, sl - e); + if (!comm) + return -ENOMEM; + } + + sl++; + if (!isempty(sl)) { + description = strdup(sl); + if (!description) + return -ENOMEM; + } + + *ret_comm = TAKE_PTR(comm); + *ret_description = TAKE_PTR(description); + return 0; + +not_found: + *ret_comm = *ret_description = NULL; + return 0; +} + +_public_ int sd_bus_get_owner_creds(sd_bus *bus, uint64_t mask, sd_bus_creds **ret) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL; + bool do_label, do_groups, do_sockaddr_peer; + pid_t pid = 0; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return((mask & ~SD_BUS_CREDS_AUGMENT) <= _SD_BUS_CREDS_ALL, -EOPNOTSUPP); + assert_return(ret, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (!bus->is_local) + mask &= ~SD_BUS_CREDS_AUGMENT; + + do_label = bus->label && (mask & SD_BUS_CREDS_SELINUX_CONTEXT); + do_groups = bus->n_groups != SIZE_MAX && (mask & SD_BUS_CREDS_SUPPLEMENTARY_GIDS); + do_sockaddr_peer = bus->sockaddr_size_peer >= offsetof(struct sockaddr_un, sun_path) + 1 && + bus->sockaddr_peer.sa.sa_family == AF_UNIX && + 
bus->sockaddr_peer.un.sun_path[0] == 0; + + /* Avoid allocating anything if we have no chance of returning useful data */ + if (!bus->ucred_valid && !do_label && !do_groups && !do_sockaddr_peer) + return -ENODATA; + + c = bus_creds_new(); + if (!c) + return -ENOMEM; + + if (bus->ucred_valid) { + if (pid_is_valid(bus->ucred.pid)) { + pid = c->pid = bus->ucred.pid; + c->mask |= SD_BUS_CREDS_PID & mask; + } + + if (uid_is_valid(bus->ucred.uid)) { + c->euid = bus->ucred.uid; + c->mask |= SD_BUS_CREDS_EUID & mask; + } + + if (gid_is_valid(bus->ucred.gid)) { + c->egid = bus->ucred.gid; + c->mask |= SD_BUS_CREDS_EGID & mask; + } + } + + if (do_label) { + c->label = strdup(bus->label); + if (!c->label) + return -ENOMEM; + + c->mask |= SD_BUS_CREDS_SELINUX_CONTEXT; + } + + if (do_groups) { + c->supplementary_gids = newdup(gid_t, bus->groups, bus->n_groups); + if (!c->supplementary_gids) + return -ENOMEM; + + c->n_supplementary_gids = bus->n_groups; + + c->mask |= SD_BUS_CREDS_SUPPLEMENTARY_GIDS; + } + + if (do_sockaddr_peer) { + _cleanup_free_ char *t = NULL; + + assert(bus->sockaddr_size_peer >= offsetof(struct sockaddr_un, sun_path) + 1); + assert(bus->sockaddr_peer.sa.sa_family == AF_UNIX); + assert(bus->sockaddr_peer.un.sun_path[0] == 0); + + /* So this is an abstract namespace socket, good. 
Now let's find the data we are interested in */ + r = make_cstring(bus->sockaddr_peer.un.sun_path + 1, + bus->sockaddr_size_peer - offsetof(struct sockaddr_un, sun_path) - 1, + MAKE_CSTRING_ALLOW_TRAILING_NUL, + &t); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Can't extract string from peer socket address, ignoring: %m"); + else { + r = parse_sockaddr_string(t, &c->comm, &c->description); + if (r < 0) + return r; + + if (c->comm) + c->mask |= SD_BUS_CREDS_COMM & mask; + + if (c->description) + c->mask |= SD_BUS_CREDS_DESCRIPTION & mask; + } + } + + r = bus_creds_add_more(c, mask, pid, 0); + if (r < 0 && r != -ESRCH) /* If the process vanished, then don't complain, just return what we got */ + return r; + + *ret = TAKE_PTR(c); + + return 0; +} + +#define append_eavesdrop(bus, m) \ + ((bus)->is_monitor \ + ? (isempty(m) ? "eavesdrop='true'" : strjoina((m), ",eavesdrop='true'")) \ + : (m)) + +int bus_add_match_internal( + sd_bus *bus, + const char *match, + uint64_t timeout_usec, + uint64_t *ret_counter) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + const char *e; + int r; + + assert(bus); + + if (!bus->bus_client) + return -EINVAL; + + e = append_eavesdrop(bus, match); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "AddMatch"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "s", e); + if (r < 0) + return r; + + r = sd_bus_call( + bus, + m, + timeout_usec, + NULL, + &reply); + if (r < 0) + return r; + + /* If the caller asked for it, return the read counter of the reply */ + if (ret_counter) + *ret_counter = reply->read_counter; + + return r; +} + +int bus_add_match_internal_async( + sd_bus *bus, + sd_bus_slot **ret_slot, + const char *match, + sd_bus_message_handler_t callback, + void *userdata, + uint64_t timeout_usec) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + const char *e; + int r; + + 
assert(bus); + + if (!bus->bus_client) + return -EINVAL; + + e = append_eavesdrop(bus, match); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "AddMatch"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "s", e); + if (r < 0) + return r; + + return sd_bus_call_async( + bus, + ret_slot, + m, + callback, + userdata, + timeout_usec); +} + +int bus_remove_match_internal( + sd_bus *bus, + const char *match) { + + const char *e; + + assert(bus); + assert(match); + + if (!bus->bus_client) + return -EINVAL; + + e = append_eavesdrop(bus, match); + + /* Fire and forget */ + + return sd_bus_call_method_async( + bus, + NULL, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "RemoveMatch", + NULL, + NULL, + "s", + e); +} + +_public_ int sd_bus_get_name_machine_id(sd_bus *bus, const char *name, sd_id128_t *machine) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; + const char *mid; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(name, -EINVAL); + assert_return(machine, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + assert_return(service_name_is_valid(name), -EINVAL); + + if (!bus->bus_client) + return -EINVAL; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (streq_ptr(name, bus->unique_name)) + return sd_id128_get_machine(machine); + + r = sd_bus_message_new_method_call( + bus, + &m, + name, + "/", + "org.freedesktop.DBus.Peer", + "GetMachineId"); + if (r < 0) + return r; + + r = sd_bus_message_set_auto_start(m, false); + if (r < 0) + return r; + + r = sd_bus_call(bus, m, 0, NULL, &reply); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "s", &mid); + if (r < 0) + return r; + + return sd_id128_from_string(mid, machine); +} diff --git a/src/libsystemd/sd-bus/bus-control.h b/src/libsystemd/sd-bus/bus-control.h new file mode 
100644 index 0000000..1cd4fb8 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-control.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +int bus_add_match_internal(sd_bus *bus, const char *match, uint64_t timeout_usec, uint64_t *ret_counter); +int bus_add_match_internal_async(sd_bus *bus, sd_bus_slot **ret, const char *match, sd_bus_message_handler_t callback, void *userdata, uint64_t timeout_usec); + +int bus_remove_match_internal(sd_bus *bus, const char *match); diff --git a/src/libsystemd/sd-bus/bus-convenience.c b/src/libsystemd/sd-bus/bus-convenience.c new file mode 100644 index 0000000..989e577 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-convenience.c @@ -0,0 +1,824 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "bus-internal.h" +#include "bus-message.h" +#include "bus-signature.h" +#include "bus-type.h" +#include "string-util.h" + +_public_ int sd_bus_message_send(sd_bus_message *reply) { + assert_return(reply, -EINVAL); + assert_return(reply->bus, -EINVAL); + assert_return(!bus_origin_changed(reply->bus), -ECHILD); + + return sd_bus_send(reply->bus, reply, NULL); +} + +_public_ int sd_bus_emit_signal_tov( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + const char *types, va_list ap) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + r = sd_bus_message_new_signal_to(bus, &m, destination, path, interface, member); + if (r < 0) + return r; + + if (!isempty(types)) { + r = sd_bus_message_appendv(m, types, ap); + if (r < 0) + return r; + } + + return sd_bus_send(bus, m, NULL); +} + +_public_ int sd_bus_emit_signal_to( + sd_bus *bus, + const char *destination, + const char *path, + const char 
*interface, + const char *member, + const char *types, ...) { + + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_emit_signal_tov(bus, destination, path, interface, member, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_emit_signalv( + sd_bus *bus, + const char *path, + const char *interface, + const char *member, + const char *types, va_list ap) { + + return sd_bus_emit_signal_tov(bus, NULL, path, interface, member, types, ap); +} + +_public_ int sd_bus_emit_signal( + sd_bus *bus, + const char *path, + const char *interface, + const char *member, + const char *types, ...) { + + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_emit_signalv(bus, path, interface, member, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_call_method_asyncv( + sd_bus *bus, + sd_bus_slot **slot, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_message_handler_t callback, + void *userdata, + const char *types, va_list ap) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + r = sd_bus_message_new_method_call(bus, &m, destination, path, interface, member); + if (r < 0) + return r; + + if (!isempty(types)) { + r = sd_bus_message_appendv(m, types, ap); + if (r < 0) + return r; + } + + return sd_bus_call_async(bus, slot, m, callback, userdata, 0); +} + +_public_ int sd_bus_call_method_async( + sd_bus *bus, + sd_bus_slot **slot, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_message_handler_t callback, + void *userdata, + const char *types, ...) 
{ + + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_call_method_asyncv(bus, slot, destination, path, interface, member, callback, userdata, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_call_methodv( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + sd_bus_message **reply, + const char *types, va_list ap) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + bus_assert_return(bus, -EINVAL, error); + bus_assert_return(bus = bus_resolve(bus), -ENOPKG, error); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = sd_bus_message_new_method_call(bus, &m, destination, path, interface, member); + if (r < 0) + goto fail; + + if (!isempty(types)) { + r = sd_bus_message_appendv(m, types, ap); + if (r < 0) + goto fail; + } + + return sd_bus_call(bus, m, 0, error, reply); + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_call_method( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + sd_bus_message **reply, + const char *types, ...) 
{ + + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_call_methodv(bus, destination, path, interface, member, error, reply, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_reply_method_returnv( + sd_bus_message *call, + const char *types, va_list ap) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + if (call->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED) + return 0; + + r = sd_bus_message_new_method_return(call, &m); + if (r < 0) + return r; + + if (!isempty(types)) { + r = sd_bus_message_appendv(m, types, ap); + if (r < 0) + return r; + } + + return sd_bus_message_send(m); +} + +_public_ int sd_bus_reply_method_return( + sd_bus_message *call, + const char *types, ...) 
{ + + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_reply_method_returnv(call, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_reply_method_error( + sd_bus_message *call, + const sd_bus_error *e) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(sd_bus_error_is_set(e), -EINVAL); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + if (call->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED) + return 0; + + r = sd_bus_message_new_method_error(call, &m, e); + if (r < 0) + return r; + + return sd_bus_message_send(m); +} + +_public_ int sd_bus_reply_method_errorfv( + sd_bus_message *call, + const char *name, + const char *format, + va_list ap) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + if (call->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED) + return 0; + + sd_bus_error_setfv(&error, name, format, ap); + + return sd_bus_reply_method_error(call, &error); +} + +_public_ int sd_bus_reply_method_errorf( + sd_bus_message *call, + const char *name, + const char *format, + ...) 
{ + + va_list ap; + int r; + + va_start(ap, format); + r = sd_bus_reply_method_errorfv(call, name, format, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_reply_method_errno( + sd_bus_message *call, + int error, + const sd_bus_error *p) { + + _cleanup_(sd_bus_error_free) sd_bus_error berror = SD_BUS_ERROR_NULL; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + if (call->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED) + return 0; + + if (sd_bus_error_is_set(p)) + return sd_bus_reply_method_error(call, p); + + sd_bus_error_set_errno(&berror, error); + + return sd_bus_reply_method_error(call, &berror); +} + +_public_ int sd_bus_reply_method_errnofv( + sd_bus_message *call, + int error, + const char *format, + va_list ap) { + + _cleanup_(sd_bus_error_free) sd_bus_error berror = SD_BUS_ERROR_NULL; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + if (call->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED) + return 0; + + sd_bus_error_set_errnofv(&berror, error, format, ap); + + return sd_bus_reply_method_error(call, &berror); +} + +_public_ int sd_bus_reply_method_errnof( + sd_bus_message *call, + int error, + const char *format, + ...) 
{ + + va_list ap; + int r; + + va_start(ap, format); + r = sd_bus_reply_method_errnofv(call, error, format, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_get_property( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + sd_bus_message **reply, + const char *type) { + + sd_bus_message *rep = NULL; + int r; + + bus_assert_return(bus, -EINVAL, error); + bus_assert_return(bus = bus_resolve(bus), -ENOPKG, error); + bus_assert_return(isempty(interface) || interface_name_is_valid(interface), -EINVAL, error); + bus_assert_return(member_name_is_valid(member), -EINVAL, error); + bus_assert_return(reply, -EINVAL, error); + bus_assert_return(signature_is_single(type, false), -EINVAL, error); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = sd_bus_call_method(bus, destination, path, + "org.freedesktop.DBus.Properties", "Get", + error, &rep, + "ss", strempty(interface), member); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(rep, 'v', type); + if (r < 0) { + sd_bus_message_unref(rep); + goto fail; + } + + *reply = rep; + return 0; + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_get_property_trivial( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + char type, void *ptr) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + bus_assert_return(bus, -EINVAL, error); + bus_assert_return(bus = bus_resolve(bus), -ENOPKG, error); + bus_assert_return(isempty(interface) || interface_name_is_valid(interface), -EINVAL, error); + bus_assert_return(member_name_is_valid(member), -EINVAL, error); + bus_assert_return(bus_type_is_trivial(type), -EINVAL, error); + bus_assert_return(ptr, -EINVAL, error); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, 
error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = sd_bus_call_method(bus, destination, path, "org.freedesktop.DBus.Properties", "Get", error, &reply, "ss", strempty(interface), member); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(reply, 'v', CHAR_TO_STR(type)); + if (r < 0) + goto fail; + + r = sd_bus_message_read_basic(reply, type, ptr); + if (r < 0) + goto fail; + + return 0; + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_get_property_string( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + char **ret) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *s; + char *n; + int r; + + bus_assert_return(bus, -EINVAL, error); + bus_assert_return(bus = bus_resolve(bus), -ENOPKG, error); + bus_assert_return(isempty(interface) || interface_name_is_valid(interface), -EINVAL, error); + bus_assert_return(member_name_is_valid(member), -EINVAL, error); + bus_assert_return(ret, -EINVAL, error); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = sd_bus_call_method(bus, destination, path, "org.freedesktop.DBus.Properties", "Get", error, &reply, "ss", strempty(interface), member); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(reply, 'v', "s"); + if (r < 0) + goto fail; + + r = sd_bus_message_read_basic(reply, 's', &s); + if (r < 0) + goto fail; + + n = strdup(s); + if (!n) { + r = -ENOMEM; + goto fail; + } + + *ret = n; + return 0; + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_get_property_strv( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + char ***ret) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + bus_assert_return(bus, 
-EINVAL, error); + bus_assert_return(bus = bus_resolve(bus), -ENOPKG, error); + bus_assert_return(isempty(interface) || interface_name_is_valid(interface), -EINVAL, error); + bus_assert_return(member_name_is_valid(member), -EINVAL, error); + bus_assert_return(ret, -EINVAL, error); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = sd_bus_call_method(bus, destination, path, "org.freedesktop.DBus.Properties", "Get", error, &reply, "ss", strempty(interface), member); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(reply, 'v', NULL); + if (r < 0) + goto fail; + + r = sd_bus_message_read_strv(reply, ret); + if (r < 0) + goto fail; + + return 0; + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_set_propertyv( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + const char *type, va_list ap) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + bus_assert_return(bus, -EINVAL, error); + bus_assert_return(bus = bus_resolve(bus), -ENOPKG, error); + bus_assert_return(isempty(interface) || interface_name_is_valid(interface), -EINVAL, error); + bus_assert_return(member_name_is_valid(member), -EINVAL, error); + bus_assert_return(signature_is_single(type, false), -EINVAL, error); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = sd_bus_message_new_method_call(bus, &m, destination, path, "org.freedesktop.DBus.Properties", "Set"); + if (r < 0) + goto fail; + + r = sd_bus_message_append(m, "ss", strempty(interface), member); + if (r < 0) + goto fail; + + r = sd_bus_message_open_container(m, 'v', type); + if (r < 0) + goto fail; + + r = sd_bus_message_appendv(m, type, ap); + if (r < 0) + goto fail; + + r = sd_bus_message_close_container(m); + if (r < 0) + goto 
fail; + + return sd_bus_call(bus, m, 0, error, NULL); + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_set_property( + sd_bus *bus, + const char *destination, + const char *path, + const char *interface, + const char *member, + sd_bus_error *error, + const char *type, ...) { + + va_list ap; + int r; + + va_start(ap, type); + r = sd_bus_set_propertyv(bus, destination, path, interface, member, error, type, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_query_sender_creds(sd_bus_message *call, uint64_t mask, sd_bus_creds **ret) { + sd_bus_creds *c; + int r; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + assert_return(ret, -EINVAL); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + c = sd_bus_message_get_creds(call); + + /* All data we need? */ + if (c && (mask & ~SD_BUS_CREDS_AUGMENT & ~c->mask) == 0) { + *ret = sd_bus_creds_ref(c); + return 0; + } + + /* No data passed? Or not enough data passed to retrieve the missing bits? */ + if (!c || !(c->mask & SD_BUS_CREDS_PID)) { + /* We couldn't read anything from the call, let's try + * to get it from the sender or peer. */ + + if (call->sender) + /* There's a sender, but the creds are missing. */ + return sd_bus_get_name_creds(call->bus, call->sender, mask, ret); + else + /* There's no sender. For direct connections + * the credentials of the AF_UNIX peer matter, + * which may be queried via sd_bus_get_owner_creds(). */ + return sd_bus_get_owner_creds(call->bus, mask, ret); + } + + r = bus_creds_extend_by_pid(c, mask, ret); + if (r == -ESRCH) { + /* Process doesn't exist anymore? 
propagate the few things we have */ + *ret = sd_bus_creds_ref(c); + return 0; + } + + return r; +} + +_public_ int sd_bus_query_sender_privilege(sd_bus_message *call, int capability) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + uid_t our_uid; + bool know_caps = false; + int r; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->bus, -EINVAL); + assert_return(!bus_origin_changed(call->bus), -ECHILD); + + if (!BUS_IS_OPEN(call->bus->state)) + return -ENOTCONN; + + if (capability >= 0) { + + r = sd_bus_query_sender_creds(call, SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS, &creds); + if (r < 0) + return r; + + /* We cannot use augmented caps for authorization, + * since then data is acquired raceful from + * /proc. This can never actually happen, but let's + * better be safe than sorry, and do an extra check + * here. */ + assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_EFFECTIVE_CAPS) == 0, -EPERM); + + r = sd_bus_creds_has_effective_cap(creds, capability); + if (r > 0) + return 1; + if (r == 0) + know_caps = true; + } else { + r = sd_bus_query_sender_creds(call, SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + } + + /* Now, check the UID, but only if the capability check wasn't + * sufficient */ + our_uid = getuid(); + if (our_uid != 0 || !know_caps || capability < 0) { + uid_t sender_uid; + + /* We cannot use augmented uid/euid for authorization, + * since then data is acquired raceful from + * /proc. This can never actually happen, but let's + * better be safe than sorry, and do an extra check + * here. */ + assert_return((sd_bus_creds_get_augmented_mask(creds) & (SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID)) == 0, -EPERM); + + /* Try to use the EUID, if we have it. 
*/ + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + r = sd_bus_creds_get_uid(creds, &sender_uid); + + if (r >= 0) { + /* Sender has same UID as us, then let's grant access */ + if (sender_uid == our_uid) + return 1; + + /* Sender is root, we are not root. */ + if (our_uid != 0 && sender_uid == 0) + return 1; + } + } + + return 0; +} + +#define make_expression(sender, path, interface, member) \ + strjoina( \ + "type='signal'", \ + sender ? ",sender='" : "", \ + sender ?: "", \ + sender ? "'" : "", \ + path ? ",path='" : "", \ + path ?: "", \ + path ? "'" : "", \ + interface ? ",interface='" : "", \ + interface ?: "", \ + interface ? "'" : "", \ + member ? ",member='" : "", \ + member ?: "", \ + member ? "'" : "" \ + ) + +_public_ int sd_bus_match_signal( + sd_bus *bus, + sd_bus_slot **ret, + const char *sender, + const char *path, + const char *interface, + const char *member, + sd_bus_message_handler_t callback, + void *userdata) { + + const char *expression; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + assert_return(!sender || service_name_is_valid(sender), -EINVAL); + assert_return(!path || object_path_is_valid(path), -EINVAL); + assert_return(!interface || interface_name_is_valid(interface), -EINVAL); + assert_return(!member || member_name_is_valid(member), -EINVAL); + + expression = make_expression(sender, path, interface, member); + + return sd_bus_add_match(bus, ret, expression, callback, userdata); +} + +_public_ int sd_bus_match_signal_async( + sd_bus *bus, + sd_bus_slot **ret, + const char *sender, + const char *path, + const char *interface, + const char *member, + sd_bus_message_handler_t callback, + sd_bus_message_handler_t install_callback, + void *userdata) { + + const char *expression; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + assert_return(!sender || 
service_name_is_valid(sender), -EINVAL); + assert_return(!path || object_path_is_valid(path), -EINVAL); + assert_return(!interface || interface_name_is_valid(interface), -EINVAL); + assert_return(!member || member_name_is_valid(member), -EINVAL); + + expression = make_expression(sender, path, interface, member); + + return sd_bus_add_match_async(bus, ret, expression, callback, install_callback, userdata); +} diff --git a/src/libsystemd/sd-bus/bus-creds.c b/src/libsystemd/sd-bus/bus-creds.c new file mode 100644 index 0000000..c6d8caa --- /dev/null +++ b/src/libsystemd/sd-bus/bus-creds.c @@ -0,0 +1,1337 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "audit-util.h" +#include "bus-creds.h" +#include "bus-label.h" +#include "bus-message.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "hexdecoct.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "user-util.h" + +enum { + CAP_OFFSET_INHERITABLE = 0, + CAP_OFFSET_PERMITTED = 1, + CAP_OFFSET_EFFECTIVE = 2, + CAP_OFFSET_BOUNDING = 3 +}; + +void bus_creds_done(sd_bus_creds *c) { + assert(c); + + /* For internal bus cred structures that are allocated by + * something else */ + + free(c->session); + free(c->unit); + free(c->user_unit); + free(c->slice); + free(c->user_slice); + free(c->unescaped_description); + free(c->supplementary_gids); + free(c->tty); + + free(c->well_known_names); /* note that this is an strv, but + * we only free the array, not the + * strings the array points to. The + * full strv we only free if + * c->allocated is set, see + * below. 
*/ + + strv_free(c->cmdline_array); +} + +_public_ sd_bus_creds *sd_bus_creds_ref(sd_bus_creds *c) { + + if (!c) + return NULL; + + if (c->allocated) { + assert(c->n_ref > 0); + c->n_ref++; + } else { + sd_bus_message *m; + + /* If this is an embedded creds structure, then + * forward ref counting to the message */ + m = container_of(c, sd_bus_message, creds); + sd_bus_message_ref(m); + } + + return c; +} + +_public_ sd_bus_creds *sd_bus_creds_unref(sd_bus_creds *c) { + + if (!c) + return NULL; + + if (c->allocated) { + assert(c->n_ref > 0); + c->n_ref--; + + if (c->n_ref == 0) { + free(c->comm); + free(c->tid_comm); + free(c->exe); + free(c->cmdline); + free(c->cgroup); + free(c->capability); + free(c->label); + free(c->unique_name); + free(c->cgroup_root); + free(c->description); + + c->supplementary_gids = mfree(c->supplementary_gids); + + c->well_known_names = strv_free(c->well_known_names); + + bus_creds_done(c); + + free(c); + } + } else { + sd_bus_message *m; + + m = container_of(c, sd_bus_message, creds); + sd_bus_message_unref(m); + } + + return NULL; +} + +_public_ uint64_t sd_bus_creds_get_mask(const sd_bus_creds *c) { + assert_return(c, 0); + + return c->mask; +} + +_public_ uint64_t sd_bus_creds_get_augmented_mask(const sd_bus_creds *c) { + assert_return(c, 0); + + return c->augmented; +} + +sd_bus_creds* bus_creds_new(void) { + sd_bus_creds *c; + + c = new0(sd_bus_creds, 1); + if (!c) + return NULL; + + c->allocated = true; + c->n_ref = 1; + return c; +} + +_public_ int sd_bus_creds_new_from_pid(sd_bus_creds **ret, pid_t pid, uint64_t mask) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL; + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(mask <= _SD_BUS_CREDS_ALL, -EOPNOTSUPP); + assert_return(ret, -EINVAL); + + if (pid == 0) + pid = getpid_cached(); + + c = bus_creds_new(); + if (!c) + return -ENOMEM; + + r = bus_creds_add_more(c, mask | SD_BUS_CREDS_AUGMENT, pid, 0); + if (r < 0) + return r; + + /* Check if the process existed 
at all, in case we haven't + * figured that out already */ + r = pid_is_alive(pid); + if (r < 0) + return r; + if (r == 0) + return -ESRCH; + + *ret = TAKE_PTR(c); + return 0; +} + +_public_ int sd_bus_creds_get_uid(sd_bus_creds *c, uid_t *uid) { + assert_return(c, -EINVAL); + assert_return(uid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_UID)) + return -ENODATA; + + *uid = c->uid; + return 0; +} + +_public_ int sd_bus_creds_get_euid(sd_bus_creds *c, uid_t *euid) { + assert_return(c, -EINVAL); + assert_return(euid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_EUID)) + return -ENODATA; + + *euid = c->euid; + return 0; +} + +_public_ int sd_bus_creds_get_suid(sd_bus_creds *c, uid_t *suid) { + assert_return(c, -EINVAL); + assert_return(suid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_SUID)) + return -ENODATA; + + *suid = c->suid; + return 0; +} + +_public_ int sd_bus_creds_get_fsuid(sd_bus_creds *c, uid_t *fsuid) { + assert_return(c, -EINVAL); + assert_return(fsuid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_FSUID)) + return -ENODATA; + + *fsuid = c->fsuid; + return 0; +} + +_public_ int sd_bus_creds_get_gid(sd_bus_creds *c, gid_t *gid) { + assert_return(c, -EINVAL); + assert_return(gid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_GID)) + return -ENODATA; + + *gid = c->gid; + return 0; +} + +_public_ int sd_bus_creds_get_egid(sd_bus_creds *c, gid_t *egid) { + assert_return(c, -EINVAL); + assert_return(egid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_EGID)) + return -ENODATA; + + *egid = c->egid; + return 0; +} + +_public_ int sd_bus_creds_get_sgid(sd_bus_creds *c, gid_t *sgid) { + assert_return(c, -EINVAL); + assert_return(sgid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_SGID)) + return -ENODATA; + + *sgid = c->sgid; + return 0; +} + +_public_ int sd_bus_creds_get_fsgid(sd_bus_creds *c, gid_t *fsgid) { + assert_return(c, -EINVAL); + assert_return(fsgid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_FSGID)) + return -ENODATA; + + *fsgid = c->fsgid; + return 0; +} + +_public_ 
int sd_bus_creds_get_supplementary_gids(sd_bus_creds *c, const gid_t **gids) { + assert_return(c, -EINVAL); + assert_return(gids, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_SUPPLEMENTARY_GIDS)) + return -ENODATA; + + *gids = c->supplementary_gids; + return (int) c->n_supplementary_gids; +} + +_public_ int sd_bus_creds_get_pid(sd_bus_creds *c, pid_t *pid) { + assert_return(c, -EINVAL); + assert_return(pid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_PID)) + return -ENODATA; + + assert(c->pid > 0); + *pid = c->pid; + return 0; +} + +_public_ int sd_bus_creds_get_ppid(sd_bus_creds *c, pid_t *ppid) { + assert_return(c, -EINVAL); + assert_return(ppid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_PPID)) + return -ENODATA; + + /* PID 1 has no parent process. Let's distinguish the case of + * not knowing and not having a parent process by the returned + * error code. */ + if (c->ppid == 0) + return -ENXIO; + + *ppid = c->ppid; + return 0; +} + +_public_ int sd_bus_creds_get_tid(sd_bus_creds *c, pid_t *tid) { + assert_return(c, -EINVAL); + assert_return(tid, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_TID)) + return -ENODATA; + + assert(c->tid > 0); + *tid = c->tid; + return 0; +} + +_public_ int sd_bus_creds_get_selinux_context(sd_bus_creds *c, const char **ret) { + assert_return(c, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_SELINUX_CONTEXT)) + return -ENODATA; + + assert(c->label); + *ret = c->label; + return 0; +} + +_public_ int sd_bus_creds_get_comm(sd_bus_creds *c, const char **ret) { + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_COMM)) + return -ENODATA; + + assert(c->comm); + *ret = c->comm; + return 0; +} + +_public_ int sd_bus_creds_get_tid_comm(sd_bus_creds *c, const char **ret) { + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_TID_COMM)) + return -ENODATA; + + assert(c->tid_comm); + *ret = c->tid_comm; + return 0; +} + +_public_ int sd_bus_creds_get_exe(sd_bus_creds *c, const 
char **ret) { + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_EXE)) + return -ENODATA; + + if (!c->exe) + return -ENXIO; + + *ret = c->exe; + return 0; +} + +_public_ int sd_bus_creds_get_cgroup(sd_bus_creds *c, const char **ret) { + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_CGROUP)) + return -ENODATA; + + assert(c->cgroup); + *ret = c->cgroup; + return 0; +} + +_public_ int sd_bus_creds_get_unit(sd_bus_creds *c, const char **ret) { + int r; + + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_UNIT)) + return -ENODATA; + + assert(c->cgroup); + + if (!c->unit) { + const char *shifted; + + r = cg_shift_path(c->cgroup, c->cgroup_root, &shifted); + if (r < 0) + return r; + + r = cg_path_get_unit(shifted, (char**) &c->unit); + if (r < 0) + return r; + } + + *ret = c->unit; + return 0; +} + +_public_ int sd_bus_creds_get_user_unit(sd_bus_creds *c, const char **ret) { + int r; + + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_USER_UNIT)) + return -ENODATA; + + assert(c->cgroup); + + if (!c->user_unit) { + const char *shifted; + + r = cg_shift_path(c->cgroup, c->cgroup_root, &shifted); + if (r < 0) + return r; + + r = cg_path_get_user_unit(shifted, (char**) &c->user_unit); + if (r < 0) + return r; + } + + *ret = c->user_unit; + return 0; +} + +_public_ int sd_bus_creds_get_slice(sd_bus_creds *c, const char **ret) { + int r; + + assert_return(c, -EINVAL); + assert_return(ret, -EINVAL); + + if (!(c->mask & SD_BUS_CREDS_SLICE)) + return -ENODATA; + + assert(c->cgroup); + + if (!c->slice) { + const char *shifted; + + r = cg_shift_path(c->cgroup, c->cgroup_root, &shifted); + if (r < 0) + return r; + + r = cg_path_get_slice(shifted, (char**) &c->slice); + if (r < 0) + return r; + } + + *ret = c->slice; + return 0; +} + +_public_ int sd_bus_creds_get_user_slice(sd_bus_creds *c, const char **ret) { + 
int r;
+
+        assert_return(c, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_USER_SLICE))
+                return -ENODATA;
+
+        assert(c->cgroup);
+        /* Resolve the user slice from the cgroup path on first use and cache it in c->user_slice. */
+        if (!c->user_slice) {
+                const char *shifted;
+
+                r = cg_shift_path(c->cgroup, c->cgroup_root, &shifted);
+                if (r < 0)
+                        return r;
+
+                r = cg_path_get_user_slice(shifted, (char**) &c->user_slice);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = c->user_slice;
+        return 0;
+}
+/* Points *ret at the session name derived from the cgroup path. The name is computed on the first
+ * call via cg_shift_path() + cg_path_get_session() and cached in c->session.
+ * Returns -ENODATA if SD_BUS_CREDS_SESSION is not set in c->mask, or the negative error of either helper. */
+_public_ int sd_bus_creds_get_session(sd_bus_creds *c, const char **ret) {
+        int r;
+
+        assert_return(c, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_SESSION))
+                return -ENODATA;
+
+        assert(c->cgroup);
+
+        if (!c->session) {
+                const char *shifted;
+
+                r = cg_shift_path(c->cgroup, c->cgroup_root, &shifted);
+                if (r < 0)
+                        return r;
+
+                r = cg_path_get_session(shifted, (char**) &c->session);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = c->session;
+        return 0;
+}
+/* Resolves the owner UID from the cgroup path with cg_path_get_owner_uid() and returns that call's
+ * result directly. Unlike the unit/slice/session getters, nothing is cached on c.
+ * Returns -ENODATA if SD_BUS_CREDS_OWNER_UID is not set in c->mask. */
+_public_ int sd_bus_creds_get_owner_uid(sd_bus_creds *c, uid_t *uid) {
+        const char *shifted;
+        int r;
+
+        assert_return(c, -EINVAL);
+        assert_return(uid, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_OWNER_UID))
+                return -ENODATA;
+
+        assert(c->cgroup);
+
+        r = cg_shift_path(c->cgroup, c->cgroup_root, &shifted);
+        if (r < 0)
+                return r;
+
+        return cg_path_get_owner_uid(shifted, uid);
+}
+/* Points *cmdline at the command line as a string vector. The vector is built from the raw
+ * c->cmdline buffer (c->cmdline_size bytes) with strv_parse_nulstr() on the first call and cached
+ * in c->cmdline_array. Returns -ENODATA if SD_BUS_CREDS_CMDLINE is not set in c->mask, -ENXIO if
+ * no command line is stored, -ENOMEM if the vector cannot be built, 0 on success. */
+_public_ int sd_bus_creds_get_cmdline(sd_bus_creds *c, char ***cmdline) {
+        assert_return(c, -EINVAL);
+        /* Validate the output pointer like every sibling getter does, rather than dereferencing NULL below. */
+        assert_return(cmdline, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_CMDLINE))
+                return -ENODATA;
+
+        if (!c->cmdline)
+                return -ENXIO;
+
+        if (!c->cmdline_array) {
+                c->cmdline_array = strv_parse_nulstr(c->cmdline, c->cmdline_size);
+                if (!c->cmdline_array)
+                        return -ENOMEM;
+        }
+
+        *cmdline = c->cmdline_array;
+        return 0;
+}
+/* Stores c->audit_session_id in *sessionid. Returns -ENODATA if SD_BUS_CREDS_AUDIT_SESSION_ID is
+ * not set in c->mask and -ENXIO if the stored id fails audit_session_is_valid(). */
+_public_ int sd_bus_creds_get_audit_session_id(sd_bus_creds *c, uint32_t *sessionid) {
+        assert_return(c, -EINVAL);
+        assert_return(sessionid, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_AUDIT_SESSION_ID))
+                return -ENODATA;
+
+        if (!audit_session_is_valid(c->audit_session_id))
+                return
-ENXIO;
+
+        *sessionid = c->audit_session_id;
+        return 0;
+}
+/* Stores c->audit_login_uid in *uid. Returns -ENODATA if SD_BUS_CREDS_AUDIT_LOGIN_UID is not set
+ * in c->mask, -ENXIO if the stored UID fails uid_is_valid(), 0 on success. */
+_public_ int sd_bus_creds_get_audit_login_uid(sd_bus_creds *c, uid_t *uid) {
+        assert_return(c, -EINVAL);
+        assert_return(uid, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_AUDIT_LOGIN_UID))
+                return -ENODATA;
+
+        if (!uid_is_valid(c->audit_login_uid))
+                return -ENXIO;
+
+        *uid = c->audit_login_uid;
+        return 0;
+}
+/* Points *ret at c->tty (no copy). Returns -ENODATA if SD_BUS_CREDS_TTY is not set in c->mask,
+ * -ENXIO if the bit is set but c->tty is NULL, 0 on success. */
+_public_ int sd_bus_creds_get_tty(sd_bus_creds *c, const char **ret) {
+        assert_return(c, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_TTY))
+                return -ENODATA;
+
+        if (!c->tty)
+                return -ENXIO;
+
+        *ret = c->tty;
+        return 0;
+}
+/* Points *unique_name at c->unique_name (no copy). Returns -ENODATA if SD_BUS_CREDS_UNIQUE_NAME
+ * is not set in c->mask, 0 on success. */
+_public_ int sd_bus_creds_get_unique_name(sd_bus_creds *c, const char **unique_name) {
+        assert_return(c, -EINVAL);
+        assert_return(unique_name, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_UNIQUE_NAME))
+                return -ENODATA;
+
+        *unique_name = c->unique_name;
+        return 0;
+}
+/* Points *well_known_names at a string vector of well-known names. When one of the
+ * well_known_names_driver / well_known_names_local flags is set, the result is a function-local
+ * static const array with its constness cast away, so the result must be treated as read-only.
+ * Returns -ENODATA if SD_BUS_CREDS_WELL_KNOWN_NAMES is not set in c->mask, 0 on success. */
+_public_ int sd_bus_creds_get_well_known_names(sd_bus_creds *c, char ***well_known_names) {
+        assert_return(c, -EINVAL);
+        assert_return(well_known_names, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_WELL_KNOWN_NAMES))
+                return -ENODATA;
+
+        /* As a special hack we return the bus driver as well-known
+         * names list when this is requested. */
+        if (c->well_known_names_driver) {
+                static const char* const wkn[] = {
+                        "org.freedesktop.DBus",
+                        NULL
+                };
+
+                *well_known_names = (char**) wkn;
+                return 0;
+        }
+        /* Same special case for the local pseudo-peer. */
+        if (c->well_known_names_local) {
+                static const char* const wkn[] = {
+                        "org.freedesktop.DBus.Local",
+                        NULL
+                };
+
+                *well_known_names = (char**) wkn;
+                return 0;
+        }
+
+        *well_known_names = c->well_known_names;
+        return 0;
+}
+/* Points *ret at the unescaped description. c->description is passed through bus_label_unescape()
+ * on the first call and the result is cached in c->unescaped_description.
+ * Returns -ENODATA if SD_BUS_CREDS_DESCRIPTION is not set in c->mask, -ENOMEM if unescaping
+ * yields NULL, 0 on success. */
+_public_ int sd_bus_creds_get_description(sd_bus_creds *c, const char **ret) {
+        assert_return(c, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_DESCRIPTION))
+                return -ENODATA;
+
+        assert(c->description);
+
+        if (!c->unescaped_description) {
+                c->unescaped_description = bus_label_unescape(c->description);
+                if (!c->unescaped_description)
+                        return -ENOMEM;
+        }
+
+        *ret = c->unescaped_description;
+        return 0;
+}
+/* Tests one capability bit in c->capability. 'offset' selects the capability set (callers pass a
+ * CAP_OFFSET_* constant); each set occupies 'sz' consecutive 32-bit words. Returns 0 for
+ * capabilities above cap_last_cap(), otherwise 1 if the bit is set and 0 if it is clear. */
+static int has_cap(sd_bus_creds *c, size_t offset, int capability) {
+        size_t sz;
+
+        assert(c);
+        assert(capability >= 0);
+        assert(c->capability);
+
+        unsigned lc = cap_last_cap();
+
+        if ((unsigned) capability > lc)
+                return 0;
+
+        /* If the last cap is 63, then there are 64 caps defined, and we need 2 entries à 32-bit hence. *
+         * If the last cap is 64, then there are 65 caps defined, and we need 3 entries à 32-bit hence. */
+        sz = DIV_ROUND_UP(lc+1, 32LU);
+
+        return !!(c->capability[offset * sz + CAP_TO_INDEX((uint32_t) capability)] & CAP_TO_MASK_CORRECTED((uint32_t) capability));
+}
+/* Returns has_cap() for the effective set, or -ENODATA if SD_BUS_CREDS_EFFECTIVE_CAPS is not set in c->mask. */
+_public_ int sd_bus_creds_has_effective_cap(sd_bus_creds *c, int capability) {
+        assert_return(c, -EINVAL);
+        assert_return(capability >= 0, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_EFFECTIVE_CAPS))
+                return -ENODATA;
+
+        return has_cap(c, CAP_OFFSET_EFFECTIVE, capability);
+}
+/* Returns has_cap() for the permitted set, or -ENODATA if SD_BUS_CREDS_PERMITTED_CAPS is not set in c->mask. */
+_public_ int sd_bus_creds_has_permitted_cap(sd_bus_creds *c, int capability) {
+        assert_return(c, -EINVAL);
+        assert_return(capability >= 0, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_PERMITTED_CAPS))
+                return -ENODATA;
+
+        return has_cap(c, CAP_OFFSET_PERMITTED, capability);
+}
+/* Returns has_cap() for the inheritable set, or -ENODATA if SD_BUS_CREDS_INHERITABLE_CAPS is not set in c->mask. */
+_public_ int sd_bus_creds_has_inheritable_cap(sd_bus_creds *c, int capability) {
+        assert_return(c, -EINVAL);
+        assert_return(capability >= 0, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_INHERITABLE_CAPS))
+                return -ENODATA;
+
+        return has_cap(c, CAP_OFFSET_INHERITABLE, capability);
+}
+/* Returns has_cap() for the bounding set, or -ENODATA if SD_BUS_CREDS_BOUNDING_CAPS is not set in c->mask. */
+_public_ int sd_bus_creds_has_bounding_cap(sd_bus_creds *c, int capability) {
+        assert_return(c, -EINVAL);
+        assert_return(capability >= 0, -EINVAL);
+
+        if (!(c->mask & SD_BUS_CREDS_BOUNDING_CAPS))
+                return -ENODATA;
+
+        return has_cap(c, CAP_OFFSET_BOUNDING, capability);
+}
+/* Parses a hexadecimal capability mask from 'p' into capability set 'offset' of c->capability.
+ * Leading whitespace is skipped; the rest must be a multiple of 8 hex digits (one 32-bit word per
+ * 8 digits) and at most 'max' words long, otherwise -EINVAL is returned. c->capability is
+ * allocated on first use with room for 4 sets of 'max' words (-ENOMEM on failure). Returns 0 on success. */
+static int parse_caps(sd_bus_creds *c, unsigned offset, const char *p) {
+        size_t sz, max;
+        unsigned i, j;
+
+        assert(c);
+        assert(p);
+
+        max = DIV_ROUND_UP(cap_last_cap()+1, 32U);
+        p += strspn(p, WHITESPACE);
+
+        sz = strlen(p);
+        if (sz % 8 != 0)
+                return -EINVAL;
+
+        sz /= 8;
+        if (sz > max)
+                return -EINVAL;
+
+        if (!c->capability) {
+                c->capability = new0(uint32_t, max * 4);
+                if (!c->capability)
+                        return -ENOMEM;
+        }
+
+        for (i = 0; i < sz; i ++) {
+                uint32_t v = 0;
+                /* Accumulate 8 hex digits into one 32-bit word, most significant digit first. */
+                for (j = 0; j < 8; ++j) {
+                        int t;
+
+                        t = unhexchar(*p++);
+                        if (t < 0)
+                                return -EINVAL;
+
+                        v = (v << 4) | t;
+                }
+                /* The first word in the string lands at the highest index, i.e. the text is most-significant-word first. */
+                c->capability[offset * max + (sz - i - 1)] = v;
+        }
+
+        return 0;
+}
+
+int
bus_creds_add_more(sd_bus_creds *c, uint64_t mask, pid_t pid, pid_t tid) { + uint64_t missing; + int r; + + assert(c); + assert(c->allocated); + + if (!(mask & SD_BUS_CREDS_AUGMENT)) + return 0; + + /* Try to retrieve PID from creds if it wasn't passed to us */ + if (pid > 0) { + c->pid = pid; + c->mask |= SD_BUS_CREDS_PID; + } else if (c->mask & SD_BUS_CREDS_PID) + pid = c->pid; + else + /* Without pid we cannot do much... */ + return 0; + + /* Try to retrieve TID from creds if it wasn't passed to us */ + if (tid <= 0 && (c->mask & SD_BUS_CREDS_TID)) + tid = c->tid; + + /* Calculate what we shall and can add */ + missing = mask & ~(c->mask|SD_BUS_CREDS_PID|SD_BUS_CREDS_TID|SD_BUS_CREDS_UNIQUE_NAME|SD_BUS_CREDS_WELL_KNOWN_NAMES|SD_BUS_CREDS_DESCRIPTION|SD_BUS_CREDS_AUGMENT); + if (missing == 0) + return 0; + + if (tid > 0) { + c->tid = tid; + c->mask |= SD_BUS_CREDS_TID; + } + + if (missing & (SD_BUS_CREDS_PPID | + SD_BUS_CREDS_UID | SD_BUS_CREDS_EUID | SD_BUS_CREDS_SUID | SD_BUS_CREDS_FSUID | + SD_BUS_CREDS_GID | SD_BUS_CREDS_EGID | SD_BUS_CREDS_SGID | SD_BUS_CREDS_FSGID | + SD_BUS_CREDS_SUPPLEMENTARY_GIDS | + SD_BUS_CREDS_EFFECTIVE_CAPS | SD_BUS_CREDS_INHERITABLE_CAPS | + SD_BUS_CREDS_PERMITTED_CAPS | SD_BUS_CREDS_BOUNDING_CAPS)) { + + _cleanup_fclose_ FILE *f = NULL; + const char *p; + + p = procfs_file_alloca(pid, "status"); + + f = fopen(p, "re"); + if (!f) { + if (errno == ENOENT) + return -ESRCH; + else if (!ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else { + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + if (missing & SD_BUS_CREDS_PPID) { + p = startswith(line, "PPid:"); + if (p) { + p += strspn(p, WHITESPACE); + + /* Explicitly check for PPID 0 (which is the case for PID 1) */ + if (!streq(p, "0")) { + r = parse_pid(p, &c->ppid); + if (r < 0) + return r; + + } else + c->ppid = 0; + + c->mask |= SD_BUS_CREDS_PPID; + continue; + } + } + + if (missing & 
(SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_SUID|SD_BUS_CREDS_FSUID)) { + p = startswith(line, "Uid:"); + if (p) { + unsigned long uid, euid, suid, fsuid; + + p += strspn(p, WHITESPACE); + if (sscanf(p, "%lu %lu %lu %lu", &uid, &euid, &suid, &fsuid) != 4) + return -EIO; + + if (missing & SD_BUS_CREDS_UID) + c->uid = (uid_t) uid; + if (missing & SD_BUS_CREDS_EUID) + c->euid = (uid_t) euid; + if (missing & SD_BUS_CREDS_SUID) + c->suid = (uid_t) suid; + if (missing & SD_BUS_CREDS_FSUID) + c->fsuid = (uid_t) fsuid; + + c->mask |= missing & (SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_SUID|SD_BUS_CREDS_FSUID); + continue; + } + } + + if (missing & (SD_BUS_CREDS_GID|SD_BUS_CREDS_EGID|SD_BUS_CREDS_SGID|SD_BUS_CREDS_FSGID)) { + p = startswith(line, "Gid:"); + if (p) { + unsigned long gid, egid, sgid, fsgid; + + p += strspn(p, WHITESPACE); + if (sscanf(p, "%lu %lu %lu %lu", &gid, &egid, &sgid, &fsgid) != 4) + return -EIO; + + if (missing & SD_BUS_CREDS_GID) + c->gid = (gid_t) gid; + if (missing & SD_BUS_CREDS_EGID) + c->egid = (gid_t) egid; + if (missing & SD_BUS_CREDS_SGID) + c->sgid = (gid_t) sgid; + if (missing & SD_BUS_CREDS_FSGID) + c->fsgid = (gid_t) fsgid; + + c->mask |= missing & (SD_BUS_CREDS_GID|SD_BUS_CREDS_EGID|SD_BUS_CREDS_SGID|SD_BUS_CREDS_FSGID); + continue; + } + } + + if (missing & SD_BUS_CREDS_SUPPLEMENTARY_GIDS) { + p = startswith(line, "Groups:"); + if (p) { + for (;;) { + unsigned long g; + int n = 0; + + p += strspn(p, WHITESPACE); + if (*p == 0) + break; + + if (sscanf(p, "%lu%n", &g, &n) != 1) + return -EIO; + + if (!GREEDY_REALLOC(c->supplementary_gids, c->n_supplementary_gids+1)) + return -ENOMEM; + + c->supplementary_gids[c->n_supplementary_gids++] = (gid_t) g; + p += n; + } + + c->mask |= SD_BUS_CREDS_SUPPLEMENTARY_GIDS; + continue; + } + } + + if (missing & SD_BUS_CREDS_EFFECTIVE_CAPS) { + p = startswith(line, "CapEff:"); + if (p) { + r = parse_caps(c, CAP_OFFSET_EFFECTIVE, p); + if (r < 0) + return r; + + c->mask |= 
SD_BUS_CREDS_EFFECTIVE_CAPS; + continue; + } + } + + if (missing & SD_BUS_CREDS_PERMITTED_CAPS) { + p = startswith(line, "CapPrm:"); + if (p) { + r = parse_caps(c, CAP_OFFSET_PERMITTED, p); + if (r < 0) + return r; + + c->mask |= SD_BUS_CREDS_PERMITTED_CAPS; + continue; + } + } + + if (missing & SD_BUS_CREDS_INHERITABLE_CAPS) { + p = startswith(line, "CapInh:"); + if (p) { + r = parse_caps(c, CAP_OFFSET_INHERITABLE, p); + if (r < 0) + return r; + + c->mask |= SD_BUS_CREDS_INHERITABLE_CAPS; + continue; + } + } + + if (missing & SD_BUS_CREDS_BOUNDING_CAPS) { + p = startswith(line, "CapBnd:"); + if (p) { + r = parse_caps(c, CAP_OFFSET_BOUNDING, p); + if (r < 0) + return r; + + c->mask |= SD_BUS_CREDS_BOUNDING_CAPS; + continue; + } + } + } + } + } + + if (missing & SD_BUS_CREDS_SELINUX_CONTEXT) { + const char *p; + + p = procfs_file_alloca(pid, "attr/current"); + r = read_one_line_file(p, &c->label); + if (r < 0) { + if (!IN_SET(r, -ENOENT, -EINVAL, -EPERM, -EACCES)) + return r; + } else + c->mask |= SD_BUS_CREDS_SELINUX_CONTEXT; + } + + if (missing & SD_BUS_CREDS_COMM) { + r = pid_get_comm(pid, &c->comm); + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) + return r; + } else + c->mask |= SD_BUS_CREDS_COMM; + } + + if (missing & SD_BUS_CREDS_EXE) { + r = get_process_exe(pid, &c->exe); + if (r == -ESRCH) { + /* Unfortunately we cannot really distinguish + * the case here where the process does not + * exist, and /proc/$PID/exe being unreadable + * because $PID is a kernel thread. Hence, + * assume it is a kernel thread, and rely on + * that this case is caught with a later + * call. 
*/ + c->exe = NULL; + c->mask |= SD_BUS_CREDS_EXE; + } else if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) + return r; + } else + c->mask |= SD_BUS_CREDS_EXE; + } + + if (missing & SD_BUS_CREDS_CMDLINE) { + const char *p; + + p = procfs_file_alloca(pid, "cmdline"); + r = read_full_virtual_file(p, &c->cmdline, &c->cmdline_size); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) + return r; + } else { + if (c->cmdline_size == 0) + c->cmdline = mfree(c->cmdline); + + c->mask |= SD_BUS_CREDS_CMDLINE; + } + } + + if (tid > 0 && (missing & SD_BUS_CREDS_TID_COMM)) { + _cleanup_free_ char *p = NULL; + + if (asprintf(&p, "/proc/"PID_FMT"/task/"PID_FMT"/comm", pid, tid) < 0) + return -ENOMEM; + + r = read_one_line_file(p, &c->tid_comm); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) + return r; + } else + c->mask |= SD_BUS_CREDS_TID_COMM; + } + + if (missing & (SD_BUS_CREDS_CGROUP|SD_BUS_CREDS_UNIT|SD_BUS_CREDS_USER_UNIT|SD_BUS_CREDS_SLICE|SD_BUS_CREDS_USER_SLICE|SD_BUS_CREDS_SESSION|SD_BUS_CREDS_OWNER_UID)) { + + if (!c->cgroup) { + r = cg_pid_get_path(NULL, pid, &c->cgroup); + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) + return r; + } + } + + if (!c->cgroup_root) { + r = cg_get_root_path(&c->cgroup_root); + if (r < 0) + return r; + } + + if (c->cgroup) + c->mask |= missing & (SD_BUS_CREDS_CGROUP|SD_BUS_CREDS_UNIT|SD_BUS_CREDS_USER_UNIT|SD_BUS_CREDS_SLICE|SD_BUS_CREDS_USER_SLICE|SD_BUS_CREDS_SESSION|SD_BUS_CREDS_OWNER_UID); + } + + if (missing & SD_BUS_CREDS_AUDIT_SESSION_ID) { + r = audit_session_from_pid(pid, &c->audit_session_id); + if (r == -ENODATA) { + /* ENODATA means: no audit session id assigned */ + c->audit_session_id = AUDIT_SESSION_INVALID; + c->mask |= SD_BUS_CREDS_AUDIT_SESSION_ID; + } else if (r < 0) { + if (!IN_SET(r, -EOPNOTSUPP, -ENOENT, -EPERM, -EACCES)) + return r; + } else + c->mask |= SD_BUS_CREDS_AUDIT_SESSION_ID; + } + + if (missing & SD_BUS_CREDS_AUDIT_LOGIN_UID) { + r = 
audit_loginuid_from_pid(pid, &c->audit_login_uid); + if (r == -ENODATA) { + /* ENODATA means: no audit login uid assigned */ + c->audit_login_uid = UID_INVALID; + c->mask |= SD_BUS_CREDS_AUDIT_LOGIN_UID; + } else if (r < 0) { + if (!IN_SET(r, -EOPNOTSUPP, -ENOENT, -EPERM, -EACCES)) + return r; + } else + c->mask |= SD_BUS_CREDS_AUDIT_LOGIN_UID; + } + + if (missing & SD_BUS_CREDS_TTY) { + r = get_ctty(pid, NULL, &c->tty); + if (r == -ENXIO) { + /* ENXIO means: process has no controlling TTY */ + c->tty = NULL; + c->mask |= SD_BUS_CREDS_TTY; + } else if (r < 0) { + if (!IN_SET(r, -EPERM, -EACCES, -ENOENT)) + return r; + } else + c->mask |= SD_BUS_CREDS_TTY; + } + + /* In case only the exe path was to be read we cannot distinguish the case where the exe path was + * unreadable because the process was a kernel thread, or when the process didn't exist at + * all. Hence, let's do a final check, to be sure. */ + r = pid_is_alive(pid); + if (r < 0) + return r; + if (r == 0) + return -ESRCH; + + if (tid > 0 && tid != pid && pid_is_unwaited(tid) == 0) + return -ESRCH; + + c->augmented = missing & c->mask; + + return 0; +} + +int bus_creds_extend_by_pid(sd_bus_creds *c, uint64_t mask, sd_bus_creds **ret) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *n = NULL; + int r; + + assert(c); + assert(ret); + + if ((mask & ~c->mask) == 0 || (!(mask & SD_BUS_CREDS_AUGMENT))) { + /* There's already all data we need, or augmentation + * wasn't turned on. 
*/ + + *ret = sd_bus_creds_ref(c); + return 0; + } + + n = bus_creds_new(); + if (!n) + return -ENOMEM; + + /* Copy the original data over */ + + if (c->mask & mask & SD_BUS_CREDS_PID) { + n->pid = c->pid; + n->mask |= SD_BUS_CREDS_PID; + } + + if (c->mask & mask & SD_BUS_CREDS_TID) { + n->tid = c->tid; + n->mask |= SD_BUS_CREDS_TID; + } + + if (c->mask & mask & SD_BUS_CREDS_PPID) { + n->ppid = c->ppid; + n->mask |= SD_BUS_CREDS_PPID; + } + + if (c->mask & mask & SD_BUS_CREDS_UID) { + n->uid = c->uid; + n->mask |= SD_BUS_CREDS_UID; + } + + if (c->mask & mask & SD_BUS_CREDS_EUID) { + n->euid = c->euid; + n->mask |= SD_BUS_CREDS_EUID; + } + + if (c->mask & mask & SD_BUS_CREDS_SUID) { + n->suid = c->suid; + n->mask |= SD_BUS_CREDS_SUID; + } + + if (c->mask & mask & SD_BUS_CREDS_FSUID) { + n->fsuid = c->fsuid; + n->mask |= SD_BUS_CREDS_FSUID; + } + + if (c->mask & mask & SD_BUS_CREDS_GID) { + n->gid = c->gid; + n->mask |= SD_BUS_CREDS_GID; + } + + if (c->mask & mask & SD_BUS_CREDS_EGID) { + n->egid = c->egid; + n->mask |= SD_BUS_CREDS_EGID; + } + + if (c->mask & mask & SD_BUS_CREDS_SGID) { + n->sgid = c->sgid; + n->mask |= SD_BUS_CREDS_SGID; + } + + if (c->mask & mask & SD_BUS_CREDS_FSGID) { + n->fsgid = c->fsgid; + n->mask |= SD_BUS_CREDS_FSGID; + } + + if (c->mask & mask & SD_BUS_CREDS_SUPPLEMENTARY_GIDS) { + if (c->supplementary_gids) { + n->supplementary_gids = newdup(gid_t, c->supplementary_gids, c->n_supplementary_gids); + if (!n->supplementary_gids) + return -ENOMEM; + n->n_supplementary_gids = c->n_supplementary_gids; + } else { + n->supplementary_gids = NULL; + n->n_supplementary_gids = 0; + } + + n->mask |= SD_BUS_CREDS_SUPPLEMENTARY_GIDS; + } + + if (c->mask & mask & SD_BUS_CREDS_COMM) { + assert(c->comm); + + n->comm = strdup(c->comm); + if (!n->comm) + return -ENOMEM; + + n->mask |= SD_BUS_CREDS_COMM; + } + + if (c->mask & mask & SD_BUS_CREDS_TID_COMM) { + assert(c->tid_comm); + + n->tid_comm = strdup(c->tid_comm); + if (!n->tid_comm) + return -ENOMEM; + + 
n->mask |= SD_BUS_CREDS_TID_COMM; + } + + if (c->mask & mask & SD_BUS_CREDS_EXE) { + if (c->exe) { + n->exe = strdup(c->exe); + if (!n->exe) + return -ENOMEM; + } else + n->exe = NULL; + + n->mask |= SD_BUS_CREDS_EXE; + } + + if (c->mask & mask & SD_BUS_CREDS_CMDLINE) { + if (c->cmdline) { + n->cmdline = memdup(c->cmdline, c->cmdline_size); + if (!n->cmdline) + return -ENOMEM; + + n->cmdline_size = c->cmdline_size; + } else { + n->cmdline = NULL; + n->cmdline_size = 0; + } + + n->mask |= SD_BUS_CREDS_CMDLINE; + } + + if (c->mask & mask & (SD_BUS_CREDS_CGROUP|SD_BUS_CREDS_SESSION|SD_BUS_CREDS_UNIT|SD_BUS_CREDS_USER_UNIT|SD_BUS_CREDS_SLICE|SD_BUS_CREDS_USER_SLICE|SD_BUS_CREDS_OWNER_UID)) { + assert(c->cgroup); + + n->cgroup = strdup(c->cgroup); + if (!n->cgroup) + return -ENOMEM; + + n->cgroup_root = strdup(c->cgroup_root); + if (!n->cgroup_root) + return -ENOMEM; + + n->mask |= mask & (SD_BUS_CREDS_CGROUP|SD_BUS_CREDS_SESSION|SD_BUS_CREDS_UNIT|SD_BUS_CREDS_USER_UNIT|SD_BUS_CREDS_SLICE|SD_BUS_CREDS_USER_SLICE|SD_BUS_CREDS_OWNER_UID); + } + + if (c->mask & mask & (SD_BUS_CREDS_EFFECTIVE_CAPS|SD_BUS_CREDS_PERMITTED_CAPS|SD_BUS_CREDS_INHERITABLE_CAPS|SD_BUS_CREDS_BOUNDING_CAPS)) { + assert(c->capability); + + n->capability = memdup(c->capability, DIV_ROUND_UP(cap_last_cap()+1, 32U) * 4 * 4); + if (!n->capability) + return -ENOMEM; + + n->mask |= c->mask & mask & (SD_BUS_CREDS_EFFECTIVE_CAPS|SD_BUS_CREDS_PERMITTED_CAPS|SD_BUS_CREDS_INHERITABLE_CAPS|SD_BUS_CREDS_BOUNDING_CAPS); + } + + if (c->mask & mask & SD_BUS_CREDS_SELINUX_CONTEXT) { + assert(c->label); + + n->label = strdup(c->label); + if (!n->label) + return -ENOMEM; + n->mask |= SD_BUS_CREDS_SELINUX_CONTEXT; + } + + if (c->mask & mask & SD_BUS_CREDS_AUDIT_SESSION_ID) { + n->audit_session_id = c->audit_session_id; + n->mask |= SD_BUS_CREDS_AUDIT_SESSION_ID; + } + if (c->mask & mask & SD_BUS_CREDS_AUDIT_LOGIN_UID) { + n->audit_login_uid = c->audit_login_uid; + n->mask |= SD_BUS_CREDS_AUDIT_LOGIN_UID; + } + + if 
(c->mask & mask & SD_BUS_CREDS_TTY) { + if (c->tty) { + n->tty = strdup(c->tty); + if (!n->tty) + return -ENOMEM; + } else + n->tty = NULL; + n->mask |= SD_BUS_CREDS_TTY; + } + + if (c->mask & mask & SD_BUS_CREDS_UNIQUE_NAME) { + assert(c->unique_name); + + n->unique_name = strdup(c->unique_name); + if (!n->unique_name) + return -ENOMEM; + n->mask |= SD_BUS_CREDS_UNIQUE_NAME; + } + + if (c->mask & mask & SD_BUS_CREDS_WELL_KNOWN_NAMES) { + if (strv_isempty(c->well_known_names)) + n->well_known_names = NULL; + else { + n->well_known_names = strv_copy(c->well_known_names); + if (!n->well_known_names) + return -ENOMEM; + } + n->well_known_names_driver = c->well_known_names_driver; + n->well_known_names_local = c->well_known_names_local; + n->mask |= SD_BUS_CREDS_WELL_KNOWN_NAMES; + } + + if (c->mask & mask & SD_BUS_CREDS_DESCRIPTION) { + assert(c->description); + n->description = strdup(c->description); + if (!n->description) + return -ENOMEM; + n->mask |= SD_BUS_CREDS_DESCRIPTION; + } + + n->augmented = c->augmented & n->mask; + + /* Get more data */ + + r = bus_creds_add_more(n, mask, 0, 0); + if (r < 0) + return r; + + *ret = TAKE_PTR(n); + + return 0; +} diff --git a/src/libsystemd/sd-bus/bus-creds.h b/src/libsystemd/sd-bus/bus-creds.h new file mode 100644 index 0000000..7806d9e --- /dev/null +++ b/src/libsystemd/sd-bus/bus-creds.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +struct sd_bus_creds { + bool allocated; + unsigned n_ref; + + uint64_t mask; + uint64_t augmented; + + uid_t uid; + uid_t euid; + uid_t suid; + uid_t fsuid; + gid_t gid; + gid_t egid; + gid_t sgid; + gid_t fsgid; + + gid_t *supplementary_gids; + unsigned n_supplementary_gids; + + pid_t ppid; + pid_t pid; + pid_t tid; + + char *comm; + char *tid_comm; + char *exe; + + char *cmdline; + size_t cmdline_size; + char **cmdline_array; + + char *cgroup; + char *session; + char *unit; + char *user_unit; + char *slice; + char 
*user_slice; + + char *tty; + + uint32_t *capability; + + uint32_t audit_session_id; + uid_t audit_login_uid; + + char *label; + + char *unique_name; + + char **well_known_names; + bool well_known_names_driver:1; + bool well_known_names_local:1; + + char *cgroup_root; + + char *description, *unescaped_description; +}; + +sd_bus_creds* bus_creds_new(void); + +void bus_creds_done(sd_bus_creds *c); + +int bus_creds_add_more(sd_bus_creds *c, uint64_t mask, pid_t pid, pid_t tid); + +int bus_creds_extend_by_pid(sd_bus_creds *c, uint64_t mask, sd_bus_creds **ret); diff --git a/src/libsystemd/sd-bus/bus-dump.c b/src/libsystemd/sd-bus/bus-dump.c new file mode 100644 index 0000000..6d24f3b --- /dev/null +++ b/src/libsystemd/sd-bus/bus-dump.c @@ -0,0 +1,649 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-dump.h" +#include "bus-internal.h" +#include "bus-message.h" +#include "bus-type.h" +#include "cap-list.h" +#include "capability-util.h" +#include "fileio.h" +#include "format-util.h" +#include "glyph-util.h" +#include "macro.h" +#include "pcapng.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +static char *indent(unsigned level, uint64_t flags) { + char *p; + unsigned n, i = 0; + + n = 0; + + if (flags & SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY && level > 0) + level -= 1; + + if (flags & SD_BUS_MESSAGE_DUMP_WITH_HEADER) + n += 2; + + p = new(char, n + level*8 + 1); + if (!p) + return NULL; + + if (flags & SD_BUS_MESSAGE_DUMP_WITH_HEADER) { + p[i++] = ' '; + p[i++] = ' '; + } + + memset(p + i, ' ', level*8); + p[i + level*8] = 0; + + return p; +} + +_public_ int sd_bus_message_dump(sd_bus_message *m, FILE *f, uint64_t flags) { + unsigned level = 1; + int r; + + assert_return(m, -EINVAL); + assert_return((flags & ~_SD_BUS_MESSAGE_DUMP_KNOWN_FLAGS) == 0, -EINVAL); + + if (!f) + f = stdout; + + if (flags & SD_BUS_MESSAGE_DUMP_WITH_HEADER) { + usec_t ts = m->realtime; + + if (ts == 0) + ts = 
now(CLOCK_REALTIME); + + fprintf(f, + "%s%s%s Type=%s%s%s Endian=%c Flags=%u Version=%u", + m->header->type == SD_BUS_MESSAGE_METHOD_ERROR ? ansi_highlight_red() : + m->header->type == SD_BUS_MESSAGE_METHOD_RETURN ? ansi_highlight_green() : + m->header->type != SD_BUS_MESSAGE_SIGNAL ? ansi_highlight() : "", + special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET), + ansi_normal(), + + ansi_highlight(), + bus_message_type_to_string(m->header->type) ?: "(unknown)", + ansi_normal(), + + m->header->endian, + m->header->flags, + m->header->version); + + /* Display synthetic message serial number in a more readable + * format than UINT32_MAX */ + if (BUS_MESSAGE_COOKIE(m) == UINT32_MAX) + fprintf(f, " Cookie=-1"); + else + fprintf(f, " Cookie=%" PRIu64, BUS_MESSAGE_COOKIE(m)); + + if (m->reply_cookie != 0) + fprintf(f, " ReplyCookie=%" PRIu64, m->reply_cookie); + + fprintf(f, " Timestamp=\"%s\"\n", strna(FORMAT_TIMESTAMP_STYLE(ts, TIMESTAMP_US_UTC))); + + if (m->sender) + fprintf(f, " Sender=%s%s%s", ansi_highlight(), m->sender, ansi_normal()); + if (m->destination) + fprintf(f, " Destination=%s%s%s", ansi_highlight(), m->destination, ansi_normal()); + if (m->path) + fprintf(f, " Path=%s%s%s", ansi_highlight(), m->path, ansi_normal()); + if (m->interface) + fprintf(f, " Interface=%s%s%s", ansi_highlight(), m->interface, ansi_normal()); + if (m->member) + fprintf(f, " Member=%s%s%s", ansi_highlight(), m->member, ansi_normal()); + + if (m->sender || m->destination || m->path || m->interface || m->member) + fputs("\n", f); + + if (sd_bus_error_is_set(&m->error)) + fprintf(f, + " ErrorName=%s%s%s" + " ErrorMessage=%s\"%s\"%s\n", + ansi_highlight_red(), strna(m->error.name), ansi_normal(), + ansi_highlight_red(), strna(m->error.message), ansi_normal()); + + if (m->monotonic != 0) + fprintf(f, " Monotonic="USEC_FMT, m->monotonic); + if (m->realtime != 0) + fprintf(f, " Realtime="USEC_FMT, m->realtime); + if (m->seqnum != 0) + fprintf(f, " SequenceNumber=%"PRIu64, m->seqnum); + + if 
(m->monotonic != 0 || m->realtime != 0 || m->seqnum != 0) + fputs("\n", f); + + bus_creds_dump(&m->creds, f, true); + } + + r = sd_bus_message_rewind(m, !(flags & SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY)); + if (r < 0) + return log_error_errno(r, "Failed to rewind: %m"); + + if (!(flags & SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY)) { + _cleanup_free_ char *prefix = NULL; + + prefix = indent(0, flags); + if (!prefix) + return log_oom(); + + fprintf(f, "%sMESSAGE \"%s\" {\n", prefix, strempty(m->root_container.signature)); + } + + for (;;) { + _cleanup_free_ char *prefix = NULL; + const char *contents = NULL; + char type; + union { + uint8_t u8; + uint16_t u16; + int16_t s16; + uint32_t u32; + int32_t s32; + uint64_t u64; + int64_t s64; + double d64; + const char *string; + int i; + } basic; + + r = sd_bus_message_peek_type(m, &type, &contents); + if (r < 0) + return log_error_errno(r, "Failed to peek type: %m"); + + if (r == 0) { + if (level <= 1) + break; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return log_error_errno(r, "Failed to exit container: %m"); + + level--; + + prefix = indent(level, flags); + if (!prefix) + return log_oom(); + + fprintf(f, "%s};\n", prefix); + continue; + } + + prefix = indent(level, flags); + if (!prefix) + return log_oom(); + + if (bus_type_is_container(type) > 0) { + r = sd_bus_message_enter_container(m, type, contents); + if (r < 0) + return log_error_errno(r, "Failed to enter container: %m"); + + if (type == SD_BUS_TYPE_ARRAY) + fprintf(f, "%sARRAY \"%s\" {\n", prefix, contents); + else if (type == SD_BUS_TYPE_VARIANT) + fprintf(f, "%sVARIANT \"%s\" {\n", prefix, contents); + else if (type == SD_BUS_TYPE_STRUCT) + fprintf(f, "%sSTRUCT \"%s\" {\n", prefix, contents); + else if (type == SD_BUS_TYPE_DICT_ENTRY) + fprintf(f, "%sDICT_ENTRY \"%s\" {\n", prefix, contents); + + level++; + + continue; + } + + r = sd_bus_message_read_basic(m, type, &basic); + if (r < 0) + return log_error_errno(r, "Failed to get basic: %m"); + + assert(r > 0); 
+ + switch (type) { + + case SD_BUS_TYPE_BYTE: + fprintf(f, "%sBYTE %s%u%s;\n", prefix, ansi_highlight(), basic.u8, ansi_normal()); + break; + + case SD_BUS_TYPE_BOOLEAN: + fprintf(f, "%sBOOLEAN %s%s%s;\n", prefix, ansi_highlight(), true_false(basic.i), ansi_normal()); + break; + + case SD_BUS_TYPE_INT16: + fprintf(f, "%sINT16 %s%i%s;\n", prefix, ansi_highlight(), basic.s16, ansi_normal()); + break; + + case SD_BUS_TYPE_UINT16: + fprintf(f, "%sUINT16 %s%u%s;\n", prefix, ansi_highlight(), basic.u16, ansi_normal()); + break; + + case SD_BUS_TYPE_INT32: + fprintf(f, "%sINT32 %s%i%s;\n", prefix, ansi_highlight(), basic.s32, ansi_normal()); + break; + + case SD_BUS_TYPE_UINT32: + fprintf(f, "%sUINT32 %s%u%s;\n", prefix, ansi_highlight(), basic.u32, ansi_normal()); + break; + + case SD_BUS_TYPE_INT64: + fprintf(f, "%sINT64 %s%"PRIi64"%s;\n", prefix, ansi_highlight(), basic.s64, ansi_normal()); + break; + + case SD_BUS_TYPE_UINT64: + fprintf(f, "%sUINT64 %s%"PRIu64"%s;\n", prefix, ansi_highlight(), basic.u64, ansi_normal()); + break; + + case SD_BUS_TYPE_DOUBLE: + fprintf(f, "%sDOUBLE %s%g%s;\n", prefix, ansi_highlight(), basic.d64, ansi_normal()); + break; + + case SD_BUS_TYPE_STRING: + fprintf(f, "%sSTRING \"%s%s%s\";\n", prefix, ansi_highlight(), basic.string, ansi_normal()); + break; + + case SD_BUS_TYPE_OBJECT_PATH: + fprintf(f, "%sOBJECT_PATH \"%s%s%s\";\n", prefix, ansi_highlight(), basic.string, ansi_normal()); + break; + + case SD_BUS_TYPE_SIGNATURE: + fprintf(f, "%sSIGNATURE \"%s%s%s\";\n", prefix, ansi_highlight(), basic.string, ansi_normal()); + break; + + case SD_BUS_TYPE_UNIX_FD: + fprintf(f, "%sUNIX_FD %s%i%s;\n", prefix, ansi_highlight(), basic.i, ansi_normal()); + break; + + default: + assert_not_reached(); + } + } + + if (!(flags & SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY)) { + _cleanup_free_ char *prefix = NULL; + + prefix = indent(0, flags); + if (!prefix) + return log_oom(); + + fprintf(f, "%s};\n\n", prefix); + } + + return 0; +} + +static void 
dump_capabilities( + sd_bus_creds *c, + FILE *f, + const char *name, + bool terse, + int (*has)(sd_bus_creds *c, int capability)) { + + unsigned long i, last_cap; + unsigned n = 0; + int r; + + assert(c); + assert(f); + assert(name); + assert(has); + + i = 0; + r = has(c, i); + if (r < 0) + return; + + fprintf(f, "%s%s=%s", terse ? " " : "", name, terse ? "" : ansi_highlight()); + last_cap = cap_last_cap(); + + for (;;) { + if (r > 0) { + + if (n > 0) + fputc(' ', f); + if (n % 4 == 3) + fprintf(f, terse ? "\n " : "\n "); + + fprintf(f, "%s", strna(capability_to_name(i))); + n++; + } + + i++; + + if (i > last_cap) + break; + + r = has(c, i); + } + + fputs("\n", f); + + if (!terse) + fputs(ansi_normal(), f); +} +/* Prints the fields of c that are available to f, or to stdout if f is NULL. In terse mode every field gets a whitespace prefix and related fields share a line; otherwise each value is wrapped in ansi_highlight()/ansi_normal() and ends its own line. Always returns 0. */ +int bus_creds_dump(sd_bus_creds *c, FILE *f, bool terse) { + uid_t owner, audit_loginuid; + uint32_t audit_sessionid; + char **cmdline = NULL, **well_known = NULL; + const char *prefix, *color, *suffix, *s; + int r, q, v, w, z; + + assert(c); +/* A NULL stream selects standard output */ + if (!f) + f = stdout; +/* Choose the decoration that surrounds every value */ + if (terse) { + prefix = " "; + suffix = ""; + color = ""; + } else { + const char *off; + + prefix = ""; + color = ansi_highlight(); + + off = ansi_normal(); + suffix = strjoina(off, "\n"); + } +/* Process and terminal identification; a PPID of 0 is shown as n/a */ + if (c->mask & SD_BUS_CREDS_PID) + fprintf(f, "%sPID=%s"PID_FMT"%s", prefix, color, c->pid, suffix); + if (c->mask & SD_BUS_CREDS_TID) + fprintf(f, "%sTID=%s"PID_FMT"%s", prefix, color, c->tid, suffix); + if (c->mask & SD_BUS_CREDS_PPID) { + if (c->ppid == 0) + fprintf(f, "%sPPID=%sn/a%s", prefix, color, suffix); + else + fprintf(f, "%sPPID=%s"PID_FMT"%s", prefix, color, c->ppid, suffix); + } + if (c->mask & SD_BUS_CREDS_TTY) + fprintf(f, "%sTTY=%s%s%s", prefix, color, strna(c->tty), suffix); +/* Terse fields of one group share a line, so end that line here */ + if (terse && ((c->mask & (SD_BUS_CREDS_PID|SD_BUS_CREDS_TID|SD_BUS_CREDS_PPID|SD_BUS_CREDS_TTY)))) + fputs("\n", f); +/* User and group IDs */ + if (c->mask & SD_BUS_CREDS_UID) + fprintf(f, "%sUID=%s"UID_FMT"%s", prefix, color, c->uid, suffix); + if (c->mask & SD_BUS_CREDS_EUID) + fprintf(f, "%sEUID=%s"UID_FMT"%s", prefix,
color, c->euid, suffix); + if (c->mask & SD_BUS_CREDS_SUID) + fprintf(f, "%sSUID=%s"UID_FMT"%s", prefix, color, c->suid, suffix); + if (c->mask & SD_BUS_CREDS_FSUID) + fprintf(f, "%sFSUID=%s"UID_FMT"%s", prefix, color, c->fsuid, suffix); + r = sd_bus_creds_get_owner_uid(c, &owner); + if (r >= 0) + fprintf(f, "%sOwnerUID=%s"UID_FMT"%s", prefix, color, owner, suffix); + if (c->mask & SD_BUS_CREDS_GID) + fprintf(f, "%sGID=%s"GID_FMT"%s", prefix, color, c->gid, suffix); + if (c->mask & SD_BUS_CREDS_EGID) + fprintf(f, "%sEGID=%s"GID_FMT"%s", prefix, color, c->egid, suffix); + if (c->mask & SD_BUS_CREDS_SGID) + fprintf(f, "%sSGID=%s"GID_FMT"%s", prefix, color, c->sgid, suffix); + if (c->mask & SD_BUS_CREDS_FSGID) + fprintf(f, "%sFSGID=%s"GID_FMT"%s", prefix, color, c->fsgid, suffix); +/* Supplementary GIDs form one space-separated list */ + if (c->mask & SD_BUS_CREDS_SUPPLEMENTARY_GIDS) { + fprintf(f, "%sSupplementaryGIDs=%s", prefix, color); + for (unsigned i = 0; i < c->n_supplementary_gids; i++) + fprintf(f, "%s" GID_FMT, i > 0 ? " " : "", c->supplementary_gids[i]); + fprintf(f, "%s", suffix); + } +/* r still holds the result of sd_bus_creds_get_owner_uid() from above */ + if (terse && ((c->mask & (SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_SUID|SD_BUS_CREDS_FSUID| + SD_BUS_CREDS_GID|SD_BUS_CREDS_EGID|SD_BUS_CREDS_SGID|SD_BUS_CREDS_FSGID| + SD_BUS_CREDS_SUPPLEMENTARY_GIDS)) || r >= 0)) + fputs("\n", f); +/* Comm, TIDComm and Exe */ + if (c->mask & SD_BUS_CREDS_COMM) + fprintf(f, "%sComm=%s%s%s", prefix, color, c->comm, suffix); + if (c->mask & SD_BUS_CREDS_TID_COMM) + fprintf(f, "%sTIDComm=%s%s%s", prefix, color, c->tid_comm, suffix); + if (c->mask & SD_BUS_CREDS_EXE) + fprintf(f, "%sExe=%s%s%s", prefix, color, strna(c->exe), suffix); + + if (terse && (c->mask & (SD_BUS_CREDS_EXE|SD_BUS_CREDS_COMM|SD_BUS_CREDS_TID_COMM))) + fputs("\n", f); +/* Command line, arguments separated by spaces; errors other than -ENODATA print n/a */ + r = sd_bus_creds_get_cmdline(c, &cmdline); + if (r >= 0) { + fprintf(f, "%sCommandLine=%s", prefix, color); + STRV_FOREACH(i, cmdline) { + if (i != cmdline) + fputc(' ', f); + + fputs(*i, f); + } + + fprintf(f, "%s", suffix); + } else if (r != -ENODATA) + fprintf(f,
"%sCommandLine=%sn/a%s", prefix, color, suffix); +/* Label (SD_BUS_CREDS_SELINUX_CONTEXT) and description */ + if (c->mask & SD_BUS_CREDS_SELINUX_CONTEXT) + fprintf(f, "%sLabel=%s%s%s", prefix, color, c->label, suffix); + if (c->mask & SD_BUS_CREDS_DESCRIPTION) + fprintf(f, "%sDescription=%s%s%s", prefix, color, c->description, suffix); + + if (terse && (c->mask & (SD_BUS_CREDS_SELINUX_CONTEXT|SD_BUS_CREDS_DESCRIPTION))) + fputs("\n", f); +/* Control group and the values of the getters below. Each result is kept in its own variable (r, v, q, w, z) because the terse line break tests all of them; s is reset before every call so that strna() receives NULL when a getter fails with something other than -ENODATA. */ + if (c->mask & SD_BUS_CREDS_CGROUP) + fprintf(f, "%sCGroup=%s%s%s", prefix, color, c->cgroup, suffix); + s = NULL; + r = sd_bus_creds_get_unit(c, &s); + if (r != -ENODATA) + fprintf(f, "%sUnit=%s%s%s", prefix, color, strna(s), suffix); + s = NULL; + v = sd_bus_creds_get_slice(c, &s); + if (v != -ENODATA) + fprintf(f, "%sSlice=%s%s%s", prefix, color, strna(s), suffix); + s = NULL; + q = sd_bus_creds_get_user_unit(c, &s); + if (q != -ENODATA) + fprintf(f, "%sUserUnit=%s%s%s", prefix, color, strna(s), suffix); + s = NULL; + w = sd_bus_creds_get_user_slice(c, &s); + if (w != -ENODATA) + fprintf(f, "%sUserSlice=%s%s%s", prefix, color, strna(s), suffix); + s = NULL; + z = sd_bus_creds_get_session(c, &s); + if (z != -ENODATA) + fprintf(f, "%sSession=%s%s%s", prefix, color, strna(s), suffix); + + if (terse && ((c->mask & SD_BUS_CREDS_CGROUP) || r != -ENODATA || q != -ENODATA || v != -ENODATA || w != -ENODATA || z != -ENODATA)) + fputs("\n", f); +/* Audit login UID and session ID; errors other than -ENODATA print n/a */ + r = sd_bus_creds_get_audit_login_uid(c, &audit_loginuid); + if (r >= 0) + fprintf(f, "%sAuditLoginUID=%s"UID_FMT"%s", prefix, color, audit_loginuid, suffix); + else if (r != -ENODATA) + fprintf(f, "%sAuditLoginUID=%sn/a%s", prefix, color, suffix); + q = sd_bus_creds_get_audit_session_id(c, &audit_sessionid); + if (q >= 0) + fprintf(f, "%sAuditSessionID=%s%"PRIu32"%s", prefix, color, audit_sessionid, suffix); + else if (q != -ENODATA) + fprintf(f, "%sAuditSessionID=%sn/a%s", prefix, color, suffix); + + if (terse && (r != -ENODATA || q != -ENODATA)) + fputs("\n", f); +/* Bus names */ + if (c->mask & SD_BUS_CREDS_UNIQUE_NAME) + fprintf(f, "%sUniqueName=%s%s%s",
prefix, color, c->unique_name, suffix); +/* Well-known names as one space-separated list */ + if (sd_bus_creds_get_well_known_names(c, &well_known) >= 0) { + fprintf(f, "%sWellKnownNames=%s", prefix, color); + STRV_FOREACH(i, well_known) { + if (i != well_known) + fputc(' ', f); + + fputs(*i, f); + } + + fprintf(f, "%s", suffix); + } + + if (terse && (c->mask & SD_BUS_CREDS_UNIQUE_NAME || well_known)) + fputc('\n', f); +/* Finally the four capability sets */ + dump_capabilities(c, f, "EffectiveCapabilities", terse, sd_bus_creds_has_effective_cap); + dump_capabilities(c, f, "PermittedCapabilities", terse, sd_bus_creds_has_permitted_cap); + dump_capabilities(c, f, "InheritableCapabilities", terse, sd_bus_creds_has_inheritable_cap); + dump_capabilities(c, f, "BoundingCapabilities", terse, sd_bus_creds_has_bounding_cap); + + return 0; +} +/* Returns len plus the size of struct pcapng_option, passed through ALIGN4(). */ +static uint16_t pcapng_optlen(size_t len) { + return ALIGN4(len + sizeof(struct pcapng_option)); +} +/* Writes one option to f: a struct pcapng_option carrying code and len, then len bytes of data, then NUL bytes up to ALIGN4(len). With len == 0 only the option header is written. */ +static void pcapng_putopt(FILE *f, uint16_t code, const void *data, size_t len) { + struct pcapng_option opt = { + .code = code, + .length = len, + }; + + assert(f); + assert((uint16_t) len == len); + assert(data || len == 0); + + fwrite(&opt, 1, sizeof(opt), f); + if (len > 0) { + size_t pad = ALIGN4(len) - len; + + fwrite(data, 1, len, f); + + assert(pad < sizeof(uint32_t)); + while (pad-- > 0) + fputc('\0', f); + } +} +/* Writes the section header block to f: the fixed header, an option for each of os and app that is non-NULL, the end-of-options marker and a trailing copy of the block length. */ +static void pcapng_section_header(FILE *f, const char *os, const char *app) { + uint32_t len; + + assert(f); + + /* determine length of section header and options */ + len = sizeof(struct pcapng_section); + if (os) + len += pcapng_optlen(strlen(os)); + if (app) + len += pcapng_optlen(strlen(app)); + len += pcapng_optlen(0); /* OPT_END */ + len += sizeof(uint32_t); /* trailer length */ + + struct pcapng_section hdr = { + .block_type = PCAPNG_SECTION_BLOCK, + .block_length = len, + .byte_order_magic = PCAPNG_BYTE_ORDER_MAGIC, + .major_version = PCAPNG_MAJOR_VERS, + .minor_version = PCAPNG_MINOR_VERS, + .section_length = UINT64_MAX, + }; + + fwrite(&hdr, 1, sizeof(hdr), f); + if (os) +
pcapng_putopt(f, PCAPNG_SHB_OS, os, strlen(os)); + if (app) + pcapng_putopt(f, PCAPNG_SHB_USERAPPL, app, strlen(app)); + pcapng_putopt(f, PCAPNG_OPT_END, NULL, 0); + fwrite(&len, 1, sizeof(uint32_t), f); +} + +/* Writes the interface block to f, followed by a trailing copy of its length. Only a single instance of the D-Bus pseudo interface exists, and it carries no options. */ +static void pcapng_interface_header(FILE *f, size_t snaplen) { + uint32_t len; + + assert(f); + assert(snaplen > 0); + assert((size_t) (uint32_t) snaplen == snaplen); + + /* no options (yet) */ + len = sizeof(struct pcapng_interface_block) + sizeof(uint32_t); + struct pcapng_interface_block hdr = { + .block_type = PCAPNG_INTERFACE_BLOCK, + .block_length = len, + .link_type = 231, /* D-Bus */ + .snap_len = snaplen, + }; + + fwrite(&hdr, 1, sizeof(hdr), f); + fwrite(&len, 1, sizeof(uint32_t), f); +} +/* Writes the section header and the interface header to f (stdout if f is NULL) and returns the result of fflush_and_check(). */ +int bus_pcap_header(size_t snaplen, const char *os, const char *info, FILE *f) { + if (!f) + f = stdout; + + pcapng_section_header(f, os, info); + pcapng_interface_header(f, snaplen); + return fflush_and_check(f); +} +/* Writes message m to f (stdout if f is NULL) as one enhanced packet block. At most snaplen bytes of the message are written, followed by NUL padding up to ALIGN4(); the recorded original length is the full message size. Returns the result of fflush_and_check(). */ +int bus_message_pcap_frame(sd_bus_message *m, size_t snaplen, FILE *f) { + struct bus_body_part *part; + size_t msglen, caplen, pad; + uint32_t length; + uint64_t ts; + unsigned i; + size_t w; + + if (!f) + f = stdout; + + assert(m); + assert(snaplen > 0); + assert((size_t) (uint32_t) snaplen == snaplen); +/* Use the current time if m->realtime is zero */ + ts = m->realtime ?: now(CLOCK_REALTIME); + msglen = BUS_MESSAGE_SIZE(m); + caplen = MIN(msglen, snaplen); + pad = ALIGN4(caplen) - caplen; + + /* packet block has no options */ + length = sizeof(struct pcapng_enhance_packet_block) + + caplen + pad + sizeof(uint32_t); + + struct pcapng_enhance_packet_block epb = { + .block_type = PCAPNG_ENHANCED_PACKET_BLOCK, + .block_length = length, + .interface_id = 0, + .timestamp_hi = (uint32_t)(ts >> 32), + .timestamp_lo = (uint32_t)ts, + .original_length = msglen, + .capture_length = caplen, + }; + + /* write the pcapng enhanced packet block header */ + fwrite(&epb, 1, sizeof(epb), f); + + /* write the dbus header */ + w =
MIN(BUS_MESSAGE_BODY_BEGIN(m), snaplen); + fwrite(m->header, 1, w, f); + snaplen -= w; + + /* write the dbus body; snaplen now holds the number of bytes that may still be written */ + MESSAGE_FOREACH_PART(part, i, m) { + if (snaplen <= 0) + break; + + w = MIN(part->size, snaplen); + fwrite(part->data, 1, w, f); + snaplen -= w; + } +/* NUL padding up to ALIGN4(caplen), as computed above */ + while (pad-- > 0) + fputc('\0', f); + + /* trailing block length */ + fwrite(&length, 1, sizeof(uint32_t), f); + + return fflush_and_check(f); +} diff --git a/src/libsystemd/sd-bus/bus-dump.h b/src/libsystemd/sd-bus/bus-dump.h new file mode 100644 index 0000000..e7470ba --- /dev/null +++ b/src/libsystemd/sd-bus/bus-dump.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-bus.h" +/* Credential dump to a stdio stream */ +int bus_creds_dump(sd_bus_creds *c, FILE *f, bool terse); +/* pcapng output: file headers, then one frame per message */ +int bus_pcap_header(size_t snaplen, const char *os, const char *app, FILE *f); +int bus_message_pcap_frame(sd_bus_message *m, size_t snaplen, FILE *f); diff --git a/src/libsystemd/sd-bus/bus-error.c b/src/libsystemd/sd-bus/bus-error.c new file mode 100644 index 0000000..77b2e1a --- /dev/null +++ b/src/libsystemd/sd-bus/bus-error.c @@ -0,0 +1,628 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "errno-list.h" +#include "errno-util.h" +#include "string-util.h" +#include "strv.h" +/* Pairs each standard bus error name with an errno value; the table ends with SD_BUS_ERROR_MAP_END. */ +BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map bus_standard_errors[] = { + SD_BUS_ERROR_MAP(SD_BUS_ERROR_FAILED, EACCES), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_NO_MEMORY, ENOMEM), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_SERVICE_UNKNOWN, EHOSTUNREACH), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_NAME_HAS_NO_OWNER, ENXIO), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_NO_REPLY, ETIMEDOUT), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_IO_ERROR, EIO), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_BAD_ADDRESS, EADDRNOTAVAIL), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_NOT_SUPPORTED, EOPNOTSUPP), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_LIMITS_EXCEEDED, ENOBUFS), +
SD_BUS_ERROR_MAP(SD_BUS_ERROR_ACCESS_DENIED, EACCES), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_AUTH_FAILED, EACCES), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_NO_SERVER, EHOSTDOWN), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_TIMEOUT, ETIMEDOUT), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_NO_NETWORK, ENONET), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_ADDRESS_IN_USE, EADDRINUSE), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_DISCONNECTED, ECONNRESET), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_INVALID_ARGS, EINVAL), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_FILE_NOT_FOUND, ENOENT), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_FILE_EXISTS, EEXIST), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_UNKNOWN_METHOD, EBADR), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_UNKNOWN_OBJECT, EBADR), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_UNKNOWN_INTERFACE, EBADR), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_UNKNOWN_PROPERTY, EBADR), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_PROPERTY_READ_ONLY, EROFS), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, ESRCH), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_INVALID_SIGNATURE, EINVAL), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_INCONSISTENT_MESSAGE, EBADMSG), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_TIMED_OUT, ETIMEDOUT), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_MATCH_RULE_NOT_FOUND, ENOENT), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_MATCH_RULE_INVALID, EINVAL), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_INTERACTIVE_AUTHORIZATION_REQUIRED, EACCES), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_INVALID_FILE_CONTENT, EINVAL), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_SELINUX_SECURITY_CONTEXT_UNKNOWN, ESRCH), + SD_BUS_ERROR_MAP(SD_BUS_ERROR_OBJECT_PATH_IN_USE, EBUSY), + SD_BUS_ERROR_MAP_END +}; + +/* GCC maps this magically to the beginning and end of the BUS_ERROR_MAP section */ +extern const sd_bus_error_map __start_SYSTEMD_BUS_ERROR_MAP[]; +extern const sd_bus_error_map __stop_SYSTEMD_BUS_ERROR_MAP[]; + +/* Additional maps registered with sd_bus_error_add_map() are in this + * NULL terminated array */ +static const sd_bus_error_map **additional_error_maps = NULL; +/* Translates a bus error name into a positive errno value. A NULL name yields EINVAL; names starting with "System.Error." are resolved with errno_from_name(); everything else is looked up in additional_error_maps and then in the ELF section maps. Names that cannot be resolved yield EIO. */ +static int bus_error_name_to_errno(const char *name) { + const sd_bus_error_map **map, *m; + const char
*p; + int r; + + if (!name) + return EINVAL; +/* The remainder after this prefix is handed to errno_from_name() */ + p = startswith(name, "System.Error."); + if (p) { + r = errno_from_name(p); + if (r < 0) + return EIO; + + return r; + } +/* Maps in additional_error_maps are searched before the ELF section */ + if (additional_error_maps) + for (map = additional_error_maps; *map; map++) + for (m = *map;; m++) { + /* For additional error maps the end marker is actually the end marker */ + if (m->code == BUS_ERROR_MAP_END_MARKER) + break; + + if (streq(m->name, name)) { + assert(m->code > 0); + return m->code; + } + } + + m = ALIGN_PTR(__start_SYSTEMD_BUS_ERROR_MAP); + while (m < __stop_SYSTEMD_BUS_ERROR_MAP) { + /* For magic ELF error maps, the end marker might + * appear in the middle of things, since multiple maps + * might appear in the same section. Hence, let's skip + * over it, but realign the pointer to the next 8 byte + * boundary, which is the selected alignment for the + * arrays. */ + if (m->code == BUS_ERROR_MAP_END_MARKER) { + m = ALIGN_PTR(m + 1); + continue; + } + + if (streq(m->name, name)) { + assert(m->code > 0); + return m->code; + } + + m++; + } + + return EIO; +} +/* Returns a constant sd_bus_error for the errno values listed in the switch below (the sign of error is ignored), or SD_BUS_ERROR_NULL for any other value. */ +static sd_bus_error errno_to_bus_error_const(int error) { + + if (error < 0) + error = -error; + + switch (error) { + + case ENOMEM: + return BUS_ERROR_OOM; + + case EPERM: + case EACCES: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_ACCESS_DENIED, "Access denied"); + + case EINVAL: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_INVALID_ARGS, "Invalid argument"); + + case ESRCH: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "No such process"); + + case ENOENT: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_FILE_NOT_FOUND, "File not found"); + + case EEXIST: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_FILE_EXISTS, "File exists"); + + case ETIMEDOUT: + case ETIME: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_TIMEOUT, "Timed out"); + + case EIO: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_IO_ERROR, "Input/output error"); + + case ENETRESET: + case ECONNABORTED: + case ECONNRESET: + return
SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_DISCONNECTED, "Disconnected"); + + case EOPNOTSUPP: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_NOT_SUPPORTED, "Not supported"); + + case EADDRNOTAVAIL: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_BAD_ADDRESS, "Address not available"); + + case ENOBUFS: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_LIMITS_EXCEEDED, "Limits exceeded"); + + case EADDRINUSE: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_ADDRESS_IN_USE, "Address in use"); + + case EBADMSG: + return SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_INCONSISTENT_MESSAGE, "Inconsistent message"); + } + + return SD_BUS_ERROR_NULL; +} +/* Joins "System.Error." and the name errno_to_name() gives for error (sign ignored). Returns 1 with the strjoin() result in *ret, 0 with *ret untouched if there is no such name, or -ENOMEM if strjoin() fails. */ +static int errno_to_bus_error_name_new(int error, char **ret) { + const char *name; + char *n; + + if (error < 0) + error = -error; + + name = errno_to_name(error); + if (!name) + return 0; + + n = strjoin("System.Error.", name); + if (!n) + return -ENOMEM; + + *ret = n; + return 1; +} +/* True if e is non-NULL and has a name, a message or a non-zero _need_free. */ +bool bus_error_is_dirty(sd_bus_error *e) { + if (!e) + return false; + + return e->name || e->message || e->_need_free != 0; +} +/* Frees name and message if _need_free > 0, then resets *e to SD_BUS_ERROR_NULL. A NULL e is accepted. */ +_public_ void sd_bus_error_free(sd_bus_error *e) { + if (!e) + return; + + if (e->_need_free > 0) { + free((void*) e->name); + free((void*) e->message); + } + + *e = SD_BUS_ERROR_NULL; +} +/* Stores strdup() copies of name and message in e (if e is non-NULL) and returns the negated result of bus_error_name_to_errno(name). Returns 0 if name is NULL, -EINVAL through assert_return() if e is dirty, and -ENOMEM with *e set to BUS_ERROR_OOM if the name cannot be copied. A failed copy of message is not reported. */ +_public_ int sd_bus_error_set(sd_bus_error *e, const char *name, const char *message) { + int r; + + if (!name) + return 0; + + if (e) { + assert_return(!bus_error_is_dirty(e), -EINVAL); + + e->name = strdup(name); + if (!e->name) { + *e = BUS_ERROR_OOM; + return -ENOMEM; + } + + if (message) + e->message = strdup(message); + + e->_need_free = 1; + } + + r = bus_error_name_to_errno(name); + assert(r > 0); + return -r; +} +/* Same contract as sd_bus_error_set(), except that the message is produced by vasprintf() from format and ap. */ +_public_ int sd_bus_error_setfv(sd_bus_error *e, const char *name, const char *format, va_list ap) { + int r; + + if (!name) + return 0; + + if (e) { + assert_return(!bus_error_is_dirty(e), -EINVAL); + + e->name = strdup(name); + if (!e->name) { + *e = BUS_ERROR_OOM; + return -ENOMEM; + } + + if (format) { + _cleanup_free_
char *mesg = NULL; + + /* If we hit OOM on formatting the pretty message, we ignore + * this, since we at least managed to write the error name */ + + if (vasprintf(&mesg, format, ap) >= 0) + e->message = TAKE_PTR(mesg); + } + + e->_need_free = 1; + } + + r = bus_error_name_to_errno(name); + assert(r > 0); + return -r; +} + +_public_ int sd_bus_error_setf(sd_bus_error *e, const char *name, const char *format, ...) { + int r; + + if (format) { + va_list ap; + + va_start(ap, format); + r = sd_bus_error_setfv(e, name, format, ap); + assert(!name || r < 0); + va_end(ap); + + return r; + } + + r = sd_bus_error_set(e, name, NULL); + assert(!name || r < 0); + return r; +} + +_public_ int sd_bus_error_copy(sd_bus_error *dest, const sd_bus_error *e) { + + if (!sd_bus_error_is_set(e)) + return 0; + if (!dest) + goto finish; + + assert_return(!bus_error_is_dirty(dest), -EINVAL); + + /* + * _need_free < 0 indicates that the error is temporarily const, needs deep copying + * _need_free == 0 indicates that the error is perpetually const, needs no deep copying + * _need_free > 0 indicates that the error is fully dynamic, needs deep copying + */ + + if (e->_need_free == 0) + *dest = *e; + else { + dest->name = strdup(e->name); + if (!dest->name) { + *dest = BUS_ERROR_OOM; + return -ENOMEM; + } + + if (e->message) + dest->message = strdup(e->message); + + dest->_need_free = 1; + } + +finish: + return -bus_error_name_to_errno(e->name); +} + +_public_ int sd_bus_error_move(sd_bus_error *dest, sd_bus_error *e) { + int r; + + if (!sd_bus_error_is_set(e)) { + + if (dest) + *dest = SD_BUS_ERROR_NULL; + + return 0; + } + + r = -bus_error_name_to_errno(e->name); + + if (dest) { + *dest = *e; + *e = SD_BUS_ERROR_NULL; + } else + sd_bus_error_free(e); + + return r; +} + +_public_ int sd_bus_error_set_const(sd_bus_error *e, const char *name, const char *message) { + if (!name) + return 0; + if (!e) + goto finish; + + assert_return(!bus_error_is_dirty(e), -EINVAL); + + *e = 
SD_BUS_ERROR_MAKE_CONST(name, message); + +finish: + return -bus_error_name_to_errno(name); +} + +_public_ int sd_bus_error_is_set(const sd_bus_error *e) { + if (!e) + return 0; + + return !!e->name; +} + +_public_ int sd_bus_error_has_name(const sd_bus_error *e, const char *name) { + if (!e) + return 0; + + return streq_ptr(e->name, name); +} + +_public_ int sd_bus_error_has_names_sentinel(const sd_bus_error *e, ...) { + if (!e || !e->name) + return 0; + + va_list ap; + const char *p; + + va_start(ap, e); + while ((p = va_arg(ap, const char *))) + if (streq(p, e->name)) + break; + va_end(ap); + return !!p; +} + +_public_ int sd_bus_error_get_errno(const sd_bus_error* e) { + if (!e || !e->name) + return 0; + + return bus_error_name_to_errno(e->name); +} + +static void bus_error_strerror(sd_bus_error *e, int error) { + size_t k = 64; + char *m; + + assert(e); + + for (;;) { + char *x; + + m = new(char, k); + if (!m) + return; + + errno = 0; + x = strerror_r(error, m, k); + if (errno == ERANGE || strlen(x) >= k - 1) { + free(m); + k *= 2; + continue; + } + + if (errno) { + free(m); + return; + } + + if (x == m) { + if (e->_need_free > 0) { + /* Error is already dynamic, let's just update the message */ + free((char*) e->message); + e->message = x; + + } else { + char *t; + /* Error was const so far, let's make it dynamic, if we can */ + + t = strdup(e->name); + if (!t) { + free(m); + return; + } + + e->_need_free = 1; + e->name = t; + e->message = x; + } + } else { + free(m); + + if (e->_need_free > 0) { + char *t; + + /* Error is dynamic, let's hence make the message also dynamic */ + t = strdup(x); + if (!t) + return; + + free((char*) e->message); + e->message = t; + } else { + /* Error is const, hence we can just override */ + e->message = x; + } + } + + return; + } +} + +_public_ int sd_bus_error_set_errno(sd_bus_error *e, int error) { + + if (error < 0) + error = -error; + + if (!e) + return -error; + if (error == 0) + return 0; + + 
assert_return(!bus_error_is_dirty(e), -EINVAL); + + /* First, try a const translation */ + *e = errno_to_bus_error_const(error); + + if (!sd_bus_error_is_set(e)) { + int k; + + /* If that didn't work, try a dynamic one. */ + + k = errno_to_bus_error_name_new(error, (char**) &e->name); + if (k > 0) + e->_need_free = 1; + else if (k < 0) { + *e = BUS_ERROR_OOM; + return -error; + } else + *e = BUS_ERROR_FAILED; + } + + /* Now, fill in the message from strerror_r() if we can */ + bus_error_strerror(e, error); + return -error; +} + +_public_ int sd_bus_error_set_errnofv(sd_bus_error *e, int error, const char *format, va_list ap) { + PROTECT_ERRNO; + + if (error < 0) + error = -error; + + if (!e) + return -error; + if (error == 0) + return 0; + + assert_return(!bus_error_is_dirty(e), -EINVAL); + + /* First, try a const translation */ + *e = errno_to_bus_error_const(error); + + if (!sd_bus_error_is_set(e)) { + int k; + + /* If that didn't work, try a dynamic one */ + + k = errno_to_bus_error_name_new(error, (char**) &e->name); + if (k > 0) + e->_need_free = 1; + else if (k < 0) { + *e = BUS_ERROR_OOM; + return -ENOMEM; + } else + *e = BUS_ERROR_FAILED; + } + + if (format) { + _cleanup_free_ char *m = NULL; + + /* Then, let's try to fill in the supplied message */ + + errno = error; /* Make sure that %m resolves to the specified error */ + if (vasprintf(&m, format, ap) < 0) + goto fail; + + if (e->_need_free <= 0) { + char *t; + + t = strdup(e->name); + if (!t) + goto fail; + + e->_need_free = 1; + e->name = t; + } + + e->message = TAKE_PTR(m); + return -error; + } + +fail: + /* If that didn't work, use strerror_r() for the message */ + bus_error_strerror(e, error); + return -error; +} + +_public_ int sd_bus_error_set_errnof(sd_bus_error *e, int error, const char *format, ...) 
{ + int r; + + if (error < 0) + error = -error; + + if (!e) + return -error; + if (error == 0) + return 0; + + assert_return(!bus_error_is_dirty(e), -EINVAL); + + if (format) { + va_list ap; + + va_start(ap, format); + r = sd_bus_error_set_errnofv(e, error, format, ap); + va_end(ap); + + return r; + } + + return sd_bus_error_set_errno(e, error); +} + +const char* _bus_error_message(const sd_bus_error *e, int error, char buf[static ERRNO_BUF_LEN]) { + /* Sometimes, the D-Bus server is a little bit too verbose with + * its error messages, so let's override them here */ + if (sd_bus_error_has_name(e, SD_BUS_ERROR_ACCESS_DENIED)) + return "Access denied"; + + if (e && e->message) + return e->message; + + return strerror_r(abs(error), buf, ERRNO_BUF_LEN); +} + +static bool map_ok(const sd_bus_error_map *map) { + for (; map->code != BUS_ERROR_MAP_END_MARKER; map++) + if (!map->name || map->code <= 0) + return false; + return true; +} + +_public_ int sd_bus_error_add_map(const sd_bus_error_map *map) { + const sd_bus_error_map **maps = NULL; + unsigned n = 0; + + assert_return(map, -EINVAL); + assert_return(map_ok(map), -EINVAL); + + if (additional_error_maps) + for (; additional_error_maps[n] != NULL; n++) + if (additional_error_maps[n] == map) + return 0; + + maps = reallocarray(additional_error_maps, n + 2, sizeof(struct sd_bus_error_map*)); + if (!maps) + return -ENOMEM; + + maps[n] = map; + maps[n+1] = NULL; + + additional_error_maps = maps; + return 1; +} diff --git a/src/libsystemd/sd-bus/bus-error.h b/src/libsystemd/sd-bus/bus-error.h new file mode 100644 index 0000000..c8768c9 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-error.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +#include "errno-util.h" +#include "macro.h" + +bool bus_error_is_dirty(sd_bus_error *e); + +const char* _bus_error_message(const sd_bus_error *e, int error, char buf[static ERRNO_BUF_LEN]); + +/* Note: the lifetime of the 
compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define bus_error_message(e, error) _bus_error_message(e, error, (char[ERRNO_BUF_LEN]){}) + +#define BUS_ERROR_OOM SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_NO_MEMORY, "Out of memory") +#define BUS_ERROR_FAILED SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_FAILED, "Operation failed") + +/* + * There are two ways to register error maps with the error translation + * logic: by using BUS_ERROR_MAP_ELF_REGISTER, which however only + * works when linked into the same ELF module, or via + * sd_bus_error_add_map() which is the official, external API, that + * works from any module. + * + * Note that BUS_ERROR_MAP_ELF_REGISTER has to be used as decorator in + * the bus error table, and BUS_ERROR_MAP_ELF_USE has to be used at + * least once per compilation unit (i.e. per library), to ensure that + * the error map is really added to the final binary. + * + * In addition, set the retain attribute so that the section cannot be + * discarded by ld --gc-sections -z start-stop-gc. Older compilers would + * warn for the unknown attribute, so just disable -Wattributes. + */ + +#define BUS_ERROR_MAP_ELF_REGISTER \ + _Pragma("GCC diagnostic ignored \"-Wattributes\"") \ + _section_("SYSTEMD_BUS_ERROR_MAP") \ + _used_ \ + _retain_ \ + _alignptr_ \ + _variable_no_sanitize_address_ + +#define BUS_ERROR_MAP_ELF_USE(errors) \ + extern const sd_bus_error_map errors[]; \ + _used_ \ + static const sd_bus_error_map * const CONCATENATE(errors ## _copy_, __COUNTER__) = errors; + +/* We use something exotic as end marker, to ensure people build the + * maps using the macros. 
*/ +#define BUS_ERROR_MAP_END_MARKER -'x' + +BUS_ERROR_MAP_ELF_USE(bus_standard_errors); diff --git a/src/libsystemd/sd-bus/bus-internal.c b/src/libsystemd/sd-bus/bus-internal.c new file mode 100644 index 0000000..a249b84 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-internal.c @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-message.h" +#include "escape.h" +#include "hexdecoct.h" +#include "string-util.h" + +bool object_path_is_valid(const char *p) { + const char *q; + bool slash; + + if (!p) + return false; + + if (p[0] != '/') + return false; + + if (p[1] == 0) + return true; + + for (slash = true, q = p+1; *q; q++) + if (*q == '/') { + if (slash) + return false; + + slash = true; + } else { + bool good; + + good = ascii_isalpha(*q) || + ascii_isdigit(*q) || + *q == '_'; + + if (!good) + return false; + + slash = false; + } + + if (slash) + return false; + + return (q - p) <= BUS_PATH_SIZE_MAX; +} + +char* object_path_startswith(const char *a, const char *b) { + const char *p; + + if (!object_path_is_valid(a) || + !object_path_is_valid(b)) + return NULL; + + if (streq(b, "/")) + return (char*) a + 1; + + p = startswith(a, b); + if (!p) + return NULL; + + if (*p == 0) + return (char*) p; + + if (*p == '/') + return (char*) p + 1; + + return NULL; +} + +bool interface_name_is_valid(const char *p) { + const char *q; + bool dot, found_dot = false; + + if (isempty(p)) + return false; + + for (dot = true, q = p; *q; q++) + if (*q == '.') { + if (dot) + return false; + + found_dot = dot = true; + } else { + bool good; + + good = + ascii_isalpha(*q) || + (!dot && ascii_isdigit(*q)) || + *q == '_'; + + if (!good) { + if (DEBUG_LOGGING) { + _cleanup_free_ char *iface = cescape(p); + log_debug("The interface %s is invalid as it contains special character", strnull(iface)); + } + return false; + } + + dot = false; + } + + if (q - p > SD_BUS_MAXIMUM_NAME_LENGTH) + return false; + + if 
(dot) + return false; + + if (!found_dot) + return false; + + return true; +} + +bool service_name_is_valid(const char *p) { + const char *q; + bool dot, found_dot = false, unique; + + if (isempty(p)) + return false; + + unique = p[0] == ':'; + + for (dot = true, q = unique ? p+1 : p; *q; q++) + if (*q == '.') { + if (dot) + return false; + + found_dot = dot = true; + } else { + bool good; + + good = + ascii_isalpha(*q) || + ((!dot || unique) && ascii_isdigit(*q)) || + IN_SET(*q, '_', '-'); + + if (!good) + return false; + + dot = false; + } + + if (q - p > SD_BUS_MAXIMUM_NAME_LENGTH) + return false; + + if (dot) + return false; + + if (!found_dot) + return false; + + return true; +} + +bool member_name_is_valid(const char *p) { + const char *q; + + if (isempty(p)) + return false; + + for (q = p; *q; q++) { + bool good; + + good = + ascii_isalpha(*q) || + ascii_isdigit(*q) || + *q == '_'; + + if (!good) + return false; + } + + if (q - p > SD_BUS_MAXIMUM_NAME_LENGTH) + return false; + + return true; +} + +/* + * Complex pattern match + * This checks whether @a is a 'complex-prefix' of @b, or @b is a + * 'complex-prefix' of @a, based on strings that consist of labels with @c as + * separator. This function returns true if: + * - both strings are equal + * - either is a prefix of the other and ends with @c + * The second rule makes sure that either string needs to be fully included in + * the other, and the string which is considered the prefix needs to end with a + * separator. 
+ */ +static bool complex_pattern_check(char c, const char *a, const char *b) { + bool separator = false; + + if (!a && !b) + return true; + + if (!a || !b) + return false; + + for (;;) { + if (*a != *b) + return (separator && (*a == 0 || *b == 0)); + + if (*a == 0) + return true; + + separator = *a == c; + + a++, b++; + } +} + +bool namespace_complex_pattern(const char *pattern, const char *value) { + return complex_pattern_check('.', pattern, value); +} + +bool path_complex_pattern(const char *pattern, const char *value) { + return complex_pattern_check('/', pattern, value); +} + +/* + * Simple pattern match + * This checks whether @a is a 'simple-prefix' of @b, based on strings that + * consist of labels with @c as separator. This function returns true, if: + * - if @a and @b are equal + * - if @a is a prefix of @b, and the first following character in @b (or the + * last character in @a) is @c + * The second rule basically makes sure that if @a is a prefix of @b, then @b + * must follow with a new label separated by @c. It cannot extend the label. 
+ */ +static bool simple_pattern_check(char c, const char *a, const char *b) { + bool separator = false; + + if (!a && !b) + return true; + + if (!a || !b) + return false; + + for (;;) { + if (*a != *b) + return *a == 0 && (*b == c || separator); + + if (*a == 0) + return true; + + separator = *a == c; + + a++, b++; + } +} + +bool namespace_simple_pattern(const char *pattern, const char *value) { + return simple_pattern_check('.', pattern, value); +} + +bool path_simple_pattern(const char *pattern, const char *value) { + return simple_pattern_check('/', pattern, value); +} + +int bus_message_type_from_string(const char *s, uint8_t *u) { + if (streq(s, "signal")) + *u = SD_BUS_MESSAGE_SIGNAL; + else if (streq(s, "method_call")) + *u = SD_BUS_MESSAGE_METHOD_CALL; + else if (streq(s, "error")) + *u = SD_BUS_MESSAGE_METHOD_ERROR; + else if (streq(s, "method_return")) + *u = SD_BUS_MESSAGE_METHOD_RETURN; + else + return -EINVAL; + + return 0; +} + +const char *bus_message_type_to_string(uint8_t u) { + if (u == SD_BUS_MESSAGE_SIGNAL) + return "signal"; + else if (u == SD_BUS_MESSAGE_METHOD_CALL) + return "method_call"; + else if (u == SD_BUS_MESSAGE_METHOD_ERROR) + return "error"; + else if (u == SD_BUS_MESSAGE_METHOD_RETURN) + return "method_return"; + else + return NULL; +} + +char *bus_address_escape(const char *v) { + const char *a; + char *r, *b; + + r = new(char, strlen(v)*3+1); + if (!r) + return NULL; + + for (a = v, b = r; *a; a++) { + + if (ascii_isdigit(*a) || + ascii_isalpha(*a) || + strchr("_-/.", *a)) + *(b++) = *a; + else { + *(b++) = '%'; + *(b++) = hexchar(*a >> 4); + *(b++) = hexchar(*a & 0xF); + } + } + + *b = 0; + return r; +} + +int bus_maybe_reply_error(sd_bus_message *m, int r, sd_bus_error *error) { + assert(m); + + if (sd_bus_error_is_set(error) || r < 0) { + if (m->header->type == SD_BUS_MESSAGE_METHOD_CALL) + sd_bus_reply_method_errno(m, r, error); + } else + return r; + + log_debug("Failed to process message type=%s sender=%s destination=%s 
path=%s interface=%s member=%s cookie=%" PRIu64 " reply_cookie=%" PRIu64 " signature=%s error-name=%s error-message=%s: %s", + bus_message_type_to_string(m->header->type), + strna(sd_bus_message_get_sender(m)), + strna(sd_bus_message_get_destination(m)), + strna(sd_bus_message_get_path(m)), + strna(sd_bus_message_get_interface(m)), + strna(sd_bus_message_get_member(m)), + BUS_MESSAGE_COOKIE(m), + m->reply_cookie, + strna(m->root_container.signature), + strna(m->error.name), + strna(m->error.message), + bus_error_message(error, r)); + + return 1; +} diff --git a/src/libsystemd/sd-bus/bus-internal.h b/src/libsystemd/sd-bus/bus-internal.h new file mode 100644 index 0000000..098a518 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-internal.h @@ -0,0 +1,427 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +#include "bus-error.h" +#include "bus-kernel.h" +#include "bus-match.h" +#include "constants.h" +#include "hashmap.h" +#include "list.h" +#include "prioq.h" +#include "runtime-scope.h" +#include "socket-util.h" +#include "time-util.h" + +/* Note that we use the new /run prefix here (instead of /var/run) since we require them to be aliases and + * that way we become independent of /var being mounted */ +#define DEFAULT_SYSTEM_BUS_ADDRESS "unix:path=/run/dbus/system_bus_socket" +#define DEFAULT_USER_BUS_ADDRESS_FMT "unix:path=%s/bus" + +struct reply_callback { + sd_bus_message_handler_t callback; + usec_t timeout_usec; /* this is a relative timeout until we reach the BUS_HELLO state, and an absolute one right after */ + uint64_t cookie; + unsigned prioq_idx; +}; + +struct filter_callback { + sd_bus_message_handler_t callback; + + unsigned last_iteration; + + LIST_FIELDS(struct filter_callback, callbacks); +}; + +struct match_callback { + sd_bus_message_handler_t callback; + sd_bus_message_handler_t install_callback; + + sd_bus_slot *install_slot; /* The AddMatch() call */ + + unsigned last_iteration; + + /* Don't 
dispatch this slot with messages that arrived in any iteration before or at this + * one. We use this to ensure that matches don't apply "retroactively" and confuse the caller: + * only messages received after the match was installed will be considered. */ + uint64_t after; + + char *match_string; + + struct bus_match_node *match_node; +}; + +struct node { + char *path; + struct node *parent; + LIST_HEAD(struct node, child); + LIST_FIELDS(struct node, siblings); + + LIST_HEAD(struct node_callback, callbacks); + LIST_HEAD(struct node_vtable, vtables); + LIST_HEAD(struct node_enumerator, enumerators); + LIST_HEAD(struct node_object_manager, object_managers); +}; + +struct node_callback { + struct node *node; + + bool is_fallback:1; + unsigned last_iteration; + + sd_bus_message_handler_t callback; + + LIST_FIELDS(struct node_callback, callbacks); +}; + +struct node_enumerator { + struct node *node; + + sd_bus_node_enumerator_t callback; + + unsigned last_iteration; + + LIST_FIELDS(struct node_enumerator, enumerators); +}; + +struct node_object_manager { + struct node *node; + + LIST_FIELDS(struct node_object_manager, object_managers); +}; + +struct node_vtable { + struct node *node; + + bool is_fallback:1; + unsigned last_iteration; + + char *interface; + const sd_bus_vtable *vtable; + sd_bus_object_find_t find; + + LIST_FIELDS(struct node_vtable, vtables); +}; + +struct vtable_member { + const char *path; + const char *interface; + const char *member; + struct node_vtable *parent; + unsigned last_iteration; + const sd_bus_vtable *vtable; +}; + +typedef enum BusSlotType { + BUS_REPLY_CALLBACK, + BUS_FILTER_CALLBACK, + BUS_MATCH_CALLBACK, + BUS_NODE_CALLBACK, + BUS_NODE_ENUMERATOR, + BUS_NODE_VTABLE, + BUS_NODE_OBJECT_MANAGER, + _BUS_SLOT_INVALID = -EINVAL, +} BusSlotType; + +struct sd_bus_slot { + unsigned n_ref; + BusSlotType type:8; + + /* Slots can be "floating" or not. 
If they are not floating (the usual case) then they reference the + * bus object they are associated with. This means the bus object stays allocated at least as long as + * there is a slot around associated with it. If it is floating, then the slot's lifecycle is bound + * to the lifecycle of the bus: it will be disconnected from the bus when the bus is destroyed, and + * keeping the slot reffed hence won't mean the bus stays reffed too. Internally this means the + * reference direction is reversed: floating slot objects are referenced by the bus object, and not + * vice versa. */ + bool floating; + bool match_added; + + sd_bus *bus; + void *userdata; + sd_bus_destroy_t destroy_callback; + + char *description; + + LIST_FIELDS(sd_bus_slot, slots); + + union { + struct reply_callback reply_callback; + struct filter_callback filter_callback; + struct match_callback match_callback; + struct node_callback node_callback; + struct node_enumerator node_enumerator; + struct node_object_manager node_object_manager; + struct node_vtable node_vtable; + }; +}; + +enum bus_state { + BUS_UNSET, + BUS_WATCH_BIND, /* waiting for the socket to appear via inotify */ + BUS_OPENING, /* the kernel's connect() is still not ready */ + BUS_AUTHENTICATING, /* we are currently in the "SASL" authorization phase of dbus */ + BUS_HELLO, /* we are waiting for the Hello() response */ + BUS_RUNNING, + BUS_CLOSING, + BUS_CLOSED, + _BUS_STATE_MAX, +}; + +static inline bool BUS_IS_OPEN(enum bus_state state) { + return state > BUS_UNSET && state < BUS_CLOSING; +} + +enum bus_auth { + _BUS_AUTH_INVALID, + BUS_AUTH_EXTERNAL, + BUS_AUTH_ANONYMOUS +}; + +struct sd_bus { + unsigned n_ref; + + enum bus_state state; + int input_fd, output_fd; + int inotify_fd; + int message_version; + int message_endian; + + bool can_fds:1; + bool bus_client:1; + bool ucred_valid:1; + bool is_server:1; + bool anonymous_auth:1; + bool prefer_readv:1; + bool prefer_writev:1; + bool match_callbacks_modified:1; + bool 
filter_callbacks_modified:1; + bool nodes_modified:1; + bool trusted:1; + bool manual_peer_interface:1; + bool allow_interactive_authorization:1; + bool exit_on_disconnect:1; + bool exited:1; + bool exit_triggered:1; + bool is_local:1; + bool watch_bind:1; + bool is_monitor:1; + bool accept_fd:1; + bool attach_timestamp:1; + bool connected_signal:1; + bool close_on_exit:1; + + RuntimeScope runtime_scope; + + signed int use_memfd:2; + + void *rbuffer; + size_t rbuffer_size; + + sd_bus_message **rqueue; + size_t rqueue_size; + + sd_bus_message **wqueue; + size_t wqueue_size; + size_t windex; + + uint64_t cookie; + uint64_t read_counter; /* A counter for each incoming msg */ + + char *unique_name; + uint64_t unique_id; + + struct bus_match_node match_callbacks; + Prioq *reply_callbacks_prioq; + OrderedHashmap *reply_callbacks; + LIST_HEAD(struct filter_callback, filter_callbacks); + + Hashmap *nodes; + Hashmap *vtable_methods; + Hashmap *vtable_properties; + + union sockaddr_union sockaddr; + socklen_t sockaddr_size; + + pid_t nspid; + char *machine; + + sd_id128_t server_id; + + char *address; + unsigned address_index; + + int last_connect_error; + + enum bus_auth auth; + unsigned auth_index; + struct iovec auth_iovec[3]; + size_t auth_rbegin; + char *auth_buffer; + usec_t auth_timeout; + + struct ucred ucred; + char *label; + gid_t *groups; + size_t n_groups; + union sockaddr_union sockaddr_peer; + socklen_t sockaddr_size_peer; + + uint64_t creds_mask; + + int *fds; + size_t n_fds; + + char *exec_path; + char **exec_argv; + + /* We do locking around the memfd cache, since we want to + * allow people to process a sd_bus_message in a different + * thread than it was generated on and free it there. Since + * adding something to the memfd cache might happen when a + * message is released, we hence need to protect this bit with + * a mutex. 
*/ + pthread_mutex_t memfd_cache_mutex; + struct memfd_cache memfd_cache[MEMFD_CACHE_MAX]; + unsigned n_memfd_cache; + + uint64_t origin_id; + pid_t busexec_pid; + + unsigned iteration_counter; + + sd_event_source *input_io_event_source; + sd_event_source *output_io_event_source; + sd_event_source *time_event_source; + sd_event_source *quit_event_source; + sd_event_source *inotify_event_source; + sd_event *event; + int event_priority; + + pid_t tid; + + sd_bus_message *current_message; + sd_bus_slot *current_slot; + sd_bus_message_handler_t current_handler; + void *current_userdata; + + sd_bus **default_bus_ptr; + + char *description; + char *patch_sender; + + sd_bus_track *track_queue; + + LIST_HEAD(sd_bus_slot, slots); + LIST_HEAD(sd_bus_track, tracks); + + int *inotify_watches; + size_t n_inotify_watches; + + /* zero means use value specified by $SYSTEMD_BUS_TIMEOUT= environment variable or built-in default */ + usec_t method_call_timeout; +}; + +/* For method calls we timeout at 25s, like in the D-Bus reference implementation */ +#define BUS_DEFAULT_TIMEOUT ((usec_t) (25 * USEC_PER_SEC)) + +/* For the authentication phase we grant 90s, to provide extra room during boot, when RNGs and such are not filled up + * with enough entropy yet and might delay the boot */ +#define BUS_AUTH_TIMEOUT ((usec_t) DEFAULT_TIMEOUT_USEC) + +#define BUS_WQUEUE_MAX (384*1024) +#define BUS_RQUEUE_MAX (384*1024) + +#define BUS_MESSAGE_SIZE_MAX (128*1024*1024) +#define BUS_AUTH_SIZE_MAX (64*1024) +/* Note that the D-Bus specification states that bus paths shall have no size limit. We enforce here one + * anyway, since truly unbounded strings are a security problem. The limit we pick is relatively large however, + * to not clash unnecessarily with real-life applications. 
*/ +#define BUS_PATH_SIZE_MAX (64*1024) + +#define BUS_CONTAINER_DEPTH 128 + +/* Defined by the specification as maximum size of an array in bytes */ +#define BUS_ARRAY_MAX_SIZE 67108864 + +#define BUS_FDS_MAX 1024 + +#define BUS_EXEC_ARGV_MAX 256 + +bool interface_name_is_valid(const char *p) _pure_; +bool service_name_is_valid(const char *p) _pure_; +bool member_name_is_valid(const char *p) _pure_; +bool object_path_is_valid(const char *p) _pure_; + +char *object_path_startswith(const char *a, const char *b) _pure_; + +bool namespace_complex_pattern(const char *pattern, const char *value) _pure_; +bool path_complex_pattern(const char *pattern, const char *value) _pure_; + +bool namespace_simple_pattern(const char *pattern, const char *value) _pure_; +bool path_simple_pattern(const char *pattern, const char *value) _pure_; + +int bus_message_type_from_string(const char *s, uint8_t *u); +const char *bus_message_type_to_string(uint8_t u) _pure_; + +#define error_name_is_valid interface_name_is_valid + +sd_bus *bus_resolve(sd_bus *bus); + +int bus_ensure_running(sd_bus *bus); +int bus_start_running(sd_bus *bus); +int bus_next_address(sd_bus *bus); + +int bus_seal_synthetic_message(sd_bus *b, sd_bus_message *m); + +int bus_rqueue_make_room(sd_bus *bus); + +bool bus_origin_changed(sd_bus *bus); + +char *bus_address_escape(const char *v); + +int bus_attach_io_events(sd_bus *b); +int bus_attach_inotify_event(sd_bus *b); + +void bus_close_inotify_fd(sd_bus *b); +void bus_close_io_fds(sd_bus *b); + +int bus_add_match_full( + sd_bus *bus, + sd_bus_slot **slot, + bool asynchronous, + const char *match, + sd_bus_message_handler_t callback, + sd_bus_message_handler_t install_callback, + void *userdata, + uint64_t timeout_usec); + +#define OBJECT_PATH_FOREACH_PREFIX(prefix, path) \ + for (char *_slash = ({ strcpy((prefix), (path)); streq((prefix), "/") ? 
NULL : strrchr((prefix), '/'); }) ; \ + _slash && ((_slash[(_slash) == (prefix)] = 0), true); \ + _slash = streq((prefix), "/") ? NULL : strrchr((prefix), '/')) + +/* If we are invoking callbacks of a bus object, ensure unreffing the + * bus from the callback doesn't destroy the object we are working on */ +#define BUS_DONT_DESTROY(bus) \ + _cleanup_(sd_bus_unrefp) _unused_ sd_bus *_dont_destroy_##bus = sd_bus_ref(bus) + +int bus_set_address_system(sd_bus *bus); +int bus_set_address_user(sd_bus *bus); +int bus_set_address_system_remote(sd_bus *b, const char *host); +int bus_set_address_machine(sd_bus *b, RuntimeScope runtime_scope, const char *machine); + +int bus_maybe_reply_error(sd_bus_message *m, int r, sd_bus_error *error); + +#define bus_assert_return(expr, r, error) \ + do { \ + if (!assert_log(expr, #expr)) \ + return sd_bus_error_set_errno(error, r); \ + } while (false) + +void bus_enter_closing(sd_bus *bus); + +void bus_set_state(sd_bus *bus, enum bus_state state); diff --git a/src/libsystemd/sd-bus/bus-introspect.c b/src/libsystemd/sd-bus/bus-introspect.c new file mode 100644 index 0000000..84c8774 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-introspect.c @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-internal.h" +#include "bus-introspect.h" +#include "bus-objects.h" +#include "bus-protocol.h" +#include "bus-signature.h" +#include "fd-util.h" +#include "fileio.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "string-util.h" + +#define BUS_INTROSPECT_DOCTYPE \ + "\n" + +#define BUS_INTROSPECT_INTERFACE_PEER \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" + +#define BUS_INTROSPECT_INTERFACE_INTROSPECTABLE \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" + +#define BUS_INTROSPECT_INTERFACE_PROPERTIES \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" 
\ + " \n" \ + " \n" + +#define BUS_INTROSPECT_INTERFACE_OBJECT_MANAGER \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" \ + " \n" + +int introspect_begin(struct introspect *i, bool trusted) { /* Resets *i so that only the trusted flag is set, opens its memory stream and writes the document prologue; returns -ENOMEM if the stream cannot be opened, 0 otherwise. */ + FILE *f; + + assert(i); + + *i = (struct introspect) { + .trusted = trusted, + }; + + f = memstream_init(&i->m); + if (!f) + return -ENOMEM; + + fputs(BUS_INTROSPECT_DOCTYPE + "\n", f); + + return 0; +} + +int introspect_write_default_interfaces(struct introspect *i, bool object_manager) { /* Writes the Peer, Introspectable and Properties interface blocks, plus the ObjectManager block when object_manager is true; always returns 0. */ + assert(i); + assert(i->m.f); + + fputs(BUS_INTROSPECT_INTERFACE_PEER + BUS_INTROSPECT_INTERFACE_INTROSPECTABLE + BUS_INTROSPECT_INTERFACE_PROPERTIES, i->m.f); + + if (object_manager) + fputs(BUS_INTROSPECT_INTERFACE_OBJECT_MANAGER, i->m.f); + + return 0; +} + +static int set_interface_name(struct introspect *i, const char *interface_name) { /* Switches the current interface: does nothing if the name is unchanged, otherwise writes the closing text for the previous interface (if any) and the opening text for the new one (if non-NULL), then stores a copy of the name. Returns the result of free_and_strdup(). */ + assert(i); + assert(i->m.f); + + if (streq_ptr(i->interface_name, interface_name)) + return 0; + + if (i->interface_name) + fputs("
\n", i->m.f); + + if (interface_name) + fprintf(i->m.f, " \n", interface_name); + + return free_and_strdup(&i->interface_name, interface_name); +} + +int introspect_write_child_nodes(struct introspect *i, OrderedSet *s, const char *prefix) { /* Drains s: each stolen path for which object_path_startswith() yields a non-empty remainder after prefix is written as one entry, and every path is freed. The current interface is ended first. Always returns 0. */ + char *node; + + assert(i); + assert(i->m.f); + assert(prefix); + + assert_se(set_interface_name(i, NULL) >= 0); + + while ((node = ordered_set_steal_first(s))) { + const char *e; + + e = object_path_startswith(node, prefix); + if (e && e[0]) + fprintf(i->m.f, " \n", e); + + free(node); + } + + return 0; +} + +static void introspect_write_flags(struct introspect *i, int type, uint64_t flags) { /* Writes one annotation line per relevant vtable flag for a member of the given type. The final annotation is only written when the peer is not trusted and a method or writable property lacks SD_BUS_VTABLE_UNPRIVILEGED. */ + assert(i); + assert(i->m.f); + + if (flags & SD_BUS_VTABLE_DEPRECATED) + fputs(" \n", i->m.f); + + if (type == _SD_BUS_VTABLE_METHOD && (flags & SD_BUS_VTABLE_METHOD_NO_REPLY)) + fputs(" \n", i->m.f); + + if (IN_SET(type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY)) { + if (flags & SD_BUS_VTABLE_PROPERTY_EXPLICIT) + fputs(" \n", i->m.f); + + if (flags & SD_BUS_VTABLE_PROPERTY_CONST) + fputs(" \n", i->m.f); + else if (flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION) + fputs(" \n", i->m.f); + else if (!(flags & SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE)) + fputs(" \n", i->m.f); + } + + if (!i->trusted && + IN_SET(type, _SD_BUS_VTABLE_METHOD, _SD_BUS_VTABLE_WRITABLE_PROPERTY) && + !(flags & SD_BUS_VTABLE_UNPRIVILEGED)) + fputs(" \n", i->m.f); +} + +/* Note that "names" is both an input and an output parameter. It initially points to the first argument name in a + NUL-separated list of strings, and is then advanced past each consumed name (by its length plus the terminating NUL), and the resulting pointer is returned. 
*/ +static int introspect_write_arguments(struct introspect *i, const char *signature, const char **names, const char *direction) { + int r; + + assert(i); + assert(i->m.f); + + for (;;) { + size_t l; + + if (!*signature) + return 0; + + r = signature_element_length(signature, &l); + if (r < 0) + return r; + + fprintf(i->m.f, " m.f, " name=\"%s\"", *names); + *names += strlen(*names) + 1; + } + + if (direction) + fprintf(i->m.f, " direction=\"%s\"/>\n", direction); + else + fputs("/>\n", i->m.f); + + signature += l; + } +} + +int introspect_write_interface( + struct introspect *i, + const char *interface_name, + const sd_bus_vtable *v) { /* Makes interface_name the current interface and writes one entry per non-hidden method, property and signal found in vtable v. Returns a negative errno if the interface name cannot be set, 0 otherwise. */ + + const sd_bus_vtable *vtable = ASSERT_PTR(v); + const char *names = ""; + int r; + + assert(i); + assert(i->m.f); + assert(interface_name); + + r = set_interface_name(i, interface_name); + if (r < 0) + return r; + + for (; v->type != _SD_BUS_VTABLE_END; v = bus_vtable_next(vtable, v)) { + + /* Ignore methods, signals and properties that are + * marked "hidden", but do show the interface + * itself */ + + if (v->type != _SD_BUS_VTABLE_START && (v->flags & SD_BUS_VTABLE_HIDDEN)) + continue; + + switch (v->type) { + + case _SD_BUS_VTABLE_START: + if (v->flags & SD_BUS_VTABLE_DEPRECATED) + fputs(" \n", i->m.f); + break; + + case _SD_BUS_VTABLE_METHOD: + fprintf(i->m.f, " \n", v->x.method.member); + if (bus_vtable_has_names(vtable)) + names = strempty(v->x.method.names); + introspect_write_arguments(i, strempty(v->x.method.signature), &names, "in"); /* NOTE(review): the return value of introspect_write_arguments() is not checked here */ + introspect_write_arguments(i, strempty(v->x.method.result), &names, "out"); + introspect_write_flags(i, v->type, v->flags); + fputs(" \n", i->m.f); + break; + + case _SD_BUS_VTABLE_PROPERTY: + case _SD_BUS_VTABLE_WRITABLE_PROPERTY: + fprintf(i->m.f, " \n", + v->x.property.member, + v->x.property.signature, + v->type == _SD_BUS_VTABLE_WRITABLE_PROPERTY ? 
"readwrite" : "read"); + introspect_write_flags(i, v->type, v->flags); + fputs(" \n", i->m.f); + break; + + case _SD_BUS_VTABLE_SIGNAL: + fprintf(i->m.f, " \n", v->x.signal.member); + if (bus_vtable_has_names(vtable)) + names = strempty(v->x.signal.names); + introspect_write_arguments(i, strempty(v->x.signal.signature), &names, NULL); /* signals pass no direction, so none is written */ + introspect_write_flags(i, v->type, v->flags); + fputs(" \n", i->m.f); + break; + } + + } + + return 0; +} + +int introspect_finish(struct introspect *i, char **ret) { /* Ends the current interface (if any), writes the closing literal below and returns the result of memstream_finalize(), which is given ret to fill in. */ + assert(i); + assert(i->m.f); + + assert_se(set_interface_name(i, NULL) >= 0); + + fputs("\n", i->m.f); + + return memstream_finalize(&i->m, ret, NULL); +} + +void introspect_done(struct introspect *i) { /* Releases the memory stream and the stored interface name. */ + assert(i); + + /* Normally introspect_finish() does all the work, this is just a backup for error paths */ + + memstream_done(&i->m); + free(i->interface_name); +} diff --git a/src/libsystemd/sd-bus/bus-introspect.h b/src/libsystemd/sd-bus/bus-introspect.h new file mode 100644 index 0000000..83bcfb2 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-introspect.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +#include "memstream-util.h" +#include "ordered-set.h" + +struct introspect { + MemStream m; /* memory stream all output is written to */ + char *interface_name; /* name of the interface currently being written, NULL if none */ + bool trusted; /* when false, privileged methods and writable properties get an extra annotation */ +}; + +int introspect_begin(struct introspect *i, bool trusted); +int introspect_write_default_interfaces(struct introspect *i, bool object_manager); +int introspect_write_child_nodes(struct introspect *i, OrderedSet *s, const char *prefix); +int introspect_write_interface( + struct introspect *i, + const char *interface_name, + const sd_bus_vtable *v); +int introspect_finish(struct introspect *i, char **ret); +void introspect_done(struct introspect *i); diff --git a/src/libsystemd/sd-bus/bus-kernel.c b/src/libsystemd/sd-bus/bus-kernel.c new file mode 100644 index 0000000..d7ff834 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-kernel.c @@ -0,0 +1,44 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_VALGRIND_MEMCHECK_H +#include +#endif + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-kernel.h" +#include "bus-label.h" +#include "bus-message.h" +#include "capability-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "memfd-util.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" +#include "memory-util.h" + +void close_and_munmap(int fd, void *address, size_t size) { /* Unmaps the mapping at address, with size rounded up by PAGE_ALIGN(), unless size is 0; then closes fd via safe_close(). A failing munmap() trips assert_se(). */ + if (size > 0) { + size = PAGE_ALIGN(size); + assert(size < SIZE_MAX); + assert_se(munmap(address, size) >= 0); + } + + safe_close(fd); +} + +void bus_flush_memfd(sd_bus *b) { /* Closes and unmaps each of the first n_memfd_cache entries of b->memfd_cache. The counter itself is not modified here. */ + assert(b); + + for (unsigned i = 0; i < b->n_memfd_cache; i++) + close_and_munmap(b->memfd_cache[i].fd, b->memfd_cache[i].address, b->memfd_cache[i].mapped); +} diff --git a/src/libsystemd/sd-bus/bus-kernel.h b/src/libsystemd/sd-bus/bus-kernel.h new file mode 100644 index 0000000..be8e0ce --- /dev/null +++ b/src/libsystemd/sd-bus/bus-kernel.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#define MEMFD_CACHE_MAX 32 + +/* When we cache a memfd block for reuse, we will truncate blocks + * longer than this in order not to keep too much data around. 
*/ +#define MEMFD_CACHE_ITEM_SIZE_MAX (128*1024) + +/* This determines at which minimum size we prefer sending memfds over + * sending vectors */ +#define MEMFD_MIN_SIZE (512*1024) + +struct memfd_cache { + int fd; /* descriptor handed to close_and_munmap() on flush */ + void *address; /* start of the mapping */ + size_t mapped; /* size passed to close_and_munmap() for this mapping */ + size_t allocated; +}; + +void close_and_munmap(int fd, void *address, size_t size); +void bus_flush_memfd(sd_bus *bus); diff --git a/src/libsystemd/sd-bus/bus-match.c b/src/libsystemd/sd-bus/bus-match.c new file mode 100644 index 0000000..606304d --- /dev/null +++ b/src/libsystemd/sd-bus/bus-match.c @@ -0,0 +1,1058 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-match.h" +#include "bus-message.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "memstream-util.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" + +/* Example: + * + * A: type=signal,sender=foo,interface=bar + * B: type=signal,sender=quux,interface=fips + * C: type=signal,sender=quux,interface=waldo + * D: type=signal,member=test + * E: sender=miau + * F: type=signal + * G: type=signal + * + * results in this tree: + * + * BUS_MATCH_ROOT + * + BUS_MATCH_MESSAGE_TYPE + * | ` BUS_MATCH_VALUE: value == signal + * | + BUS_MATCH_SENDER + * | | + BUS_MATCH_VALUE: value == foo + * | | | ` BUS_MATCH_INTERFACE + * | | | ` BUS_MATCH_VALUE: value == bar + * | | | ` BUS_MATCH_LEAF: A + * | | ` BUS_MATCH_VALUE: value == quux + * | | ` BUS_MATCH_INTERFACE + * | | | BUS_MATCH_VALUE: value == fips + * | | | ` BUS_MATCH_LEAF: B + * | | ` BUS_MATCH_VALUE: value == waldo + * | | ` BUS_MATCH_LEAF: C + * | + BUS_MATCH_MEMBER + * | | ` BUS_MATCH_VALUE: value == test + * | | ` BUS_MATCH_LEAF: D + * | + BUS_MATCH_LEAF: F + * | ` BUS_MATCH_LEAF: G + * ` BUS_MATCH_SENDER + * ` BUS_MATCH_VALUE: value == miau + * ` BUS_MATCH_LEAF: E + */ + +static bool BUS_MATCH_IS_COMPARE(enum bus_match_node_type t) { /* True for every node type from BUS_MATCH_SENDER up to BUS_MATCH_ARG_HAS_LAST, i.e. everything except root, value and leaf nodes. */ + return t >= BUS_MATCH_SENDER && t <= 
BUS_MATCH_ARG_HAS_LAST; +} + +static bool BUS_MATCH_CAN_HASH(enum bus_match_node_type t) { /* True for compare types whose value children are kept in a hash table: message type through path, argN and argNhas. Sender, path_namespace, argNpath and argNnamespace are excluded and are tested one by one instead. */ + return (t >= BUS_MATCH_MESSAGE_TYPE && t <= BUS_MATCH_PATH) || + (t >= BUS_MATCH_ARG && t <= BUS_MATCH_ARG_LAST) || + (t >= BUS_MATCH_ARG_HAS && t <= BUS_MATCH_ARG_HAS_LAST); +} + +static void bus_match_node_free(struct bus_match_node *node) { /* Unlinks a childless non-root node from its parent (child list and, for value nodes, the parent hash table) and frees it. */ + assert(node); + assert(node->parent); + assert(!node->child); + assert(node->type != BUS_MATCH_ROOT); + assert(node->type < _BUS_MATCH_NODE_TYPE_MAX); + + if (node->parent->child) { + /* We are apparently linked into the parent's child + * list. Let's remove us from there. */ + if (node->prev) { + assert(node->prev->next == node); + node->prev->next = node->next; + } else { + assert(node->parent->child == node); + node->parent->child = node->next; + } + + if (node->next) + node->next->prev = node->prev; + } + + if (node->type == BUS_MATCH_VALUE) { + /* We might be in the parent's hash table, so clean + * this up */ + + if (node->parent->type == BUS_MATCH_MESSAGE_TYPE) + hashmap_remove(node->parent->compare.children, UINT_TO_PTR(node->value.u8)); + else if (BUS_MATCH_CAN_HASH(node->parent->type) && node->value.str) + hashmap_remove(node->parent->compare.children, node->value.str); + + free(node->value.str); + } + + if (BUS_MATCH_IS_COMPARE(node->type)) { + assert(hashmap_isempty(node->compare.children)); + hashmap_free(node->compare.children); + } + + free(node); +} + +static bool bus_match_node_maybe_free(struct bus_match_node *node) { /* Frees node if nothing hangs below it. Returns false for the root or a node that still has list children; returns true otherwise, including the case where a compare node is kept because its hash table is not empty. */ + assert(node); + + if (node->type == BUS_MATCH_ROOT) + return false; + + if (node->child) + return false; + + if (BUS_MATCH_IS_COMPARE(node->type) && !hashmap_isempty(node->compare.children)) + return true; + + bus_match_node_free(node); + return true; +} + +static bool value_node_test( + struct bus_match_node *node, + enum bus_match_node_type parent_type, + uint8_t value_u8, + const char *value_str, + char **value_strv, + sd_bus_message *m) { + + assert(node); + assert(node->type == 
BUS_MATCH_VALUE); + + /* Tests parameters against this value node, doing prefix + * magic and stuff. */ + + switch (parent_type) { + + case BUS_MATCH_MESSAGE_TYPE: + return node->value.u8 == value_u8; + + case BUS_MATCH_SENDER: + if (streq_ptr(node->value.str, value_str)) + return true; + + if (m->creds.mask & SD_BUS_CREDS_WELL_KNOWN_NAMES) { + /* on kdbus we have the well known names list + * in the credentials, let's make use of that + * for an accurate match */ + + STRV_FOREACH(i, m->creds.well_known_names) + if (streq_ptr(node->value.str, *i)) + return true; + + } else { + + /* If we don't have kdbus, we don't know the + * well-known names of the senders. In that case, + * let's just hope that dbus-daemon doesn't + * send us stuff we didn't want. */ + + if (node->value.str[0] != ':' && value_str && value_str[0] == ':') /* rule names a well-known name, message carries a unique name: accept */ + return true; + } + + return false; + + case BUS_MATCH_DESTINATION: + case BUS_MATCH_INTERFACE: + case BUS_MATCH_MEMBER: + case BUS_MATCH_PATH: + case BUS_MATCH_ARG ... BUS_MATCH_ARG_LAST: + + if (value_str) + return streq_ptr(node->value.str, value_str); + + return false; + + case BUS_MATCH_ARG_HAS ... BUS_MATCH_ARG_HAS_LAST: { + STRV_FOREACH(i, value_strv) + if (streq_ptr(node->value.str, *i)) + return true; + + return false; + } + + case BUS_MATCH_ARG_NAMESPACE ... BUS_MATCH_ARG_NAMESPACE_LAST: + if (value_str) + return namespace_simple_pattern(node->value.str, value_str); + + return false; + + case BUS_MATCH_PATH_NAMESPACE: + return path_simple_pattern(node->value.str, value_str); + + case BUS_MATCH_ARG_PATH ... BUS_MATCH_ARG_PATH_LAST: + if (value_str) + return path_complex_pattern(node->value.str, value_str); + + return false; + + default: + assert_not_reached(); + } +} + +static bool value_node_same( + struct bus_match_node *node, + enum bus_match_node_type parent_type, + uint8_t value_u8, + const char *value_str) { + + /* Tests parameters against this value node, not doing prefix + * magic and stuff, i.e. 
this one actually compares the match + * itself. */ + + assert(node); + assert(node->type == BUS_MATCH_VALUE); + + switch (parent_type) { + + case BUS_MATCH_MESSAGE_TYPE: + return node->value.u8 == value_u8; + + case BUS_MATCH_SENDER: + case BUS_MATCH_DESTINATION: + case BUS_MATCH_INTERFACE: + case BUS_MATCH_MEMBER: + case BUS_MATCH_PATH: + case BUS_MATCH_ARG ... BUS_MATCH_ARG_LAST: + case BUS_MATCH_ARG_HAS ... BUS_MATCH_ARG_HAS_LAST: + case BUS_MATCH_ARG_NAMESPACE ... BUS_MATCH_ARG_NAMESPACE_LAST: + case BUS_MATCH_PATH_NAMESPACE: + case BUS_MATCH_ARG_PATH ... BUS_MATCH_ARG_PATH_LAST: + return streq(node->value.str, value_str); + + default: + assert_not_reached(); + } +} + +int bus_match_run( + sd_bus *bus, + struct bus_match_node *node, + sd_bus_message *m) { + + _cleanup_strv_free_ char **test_strv = NULL; + const char *test_str = NULL; + uint8_t test_u8 = 0; + int r; + + assert(m); + + if (!node) + return 0; + + if (bus && bus->match_callbacks_modified) + return 0; + + /* Note these special semantics: when traversing the tree we + * usually let bus_match_run() when called for a node + * recursively invoke bus_match_run(). There are two + * exceptions here though, which are BUS_MATCH_ROOT (which + * cannot have a sibling), and BUS_MATCH_VALUE (whose siblings + * are invoked anyway by its parent). */ + + switch (node->type) { + + case BUS_MATCH_ROOT: + + /* Run all children. Since we cannot have any siblings + * we won't call any. The children of the root node + * are compares or leaves, they will automatically + * call their siblings. */ + return bus_match_run(bus, node->child, m); + + case BUS_MATCH_VALUE: + + /* Run all children. We don't execute any siblings, we + * assume our caller does that. 
The children of value + * nodes are compares or leaves, they will + * automatically call their siblings */ + + assert(node->child); + return bus_match_run(bus, node->child, m); + + case BUS_MATCH_LEAF: + + if (bus) { + /* Don't run this match as long as the AddMatch() call is not complete yet. + * + * Don't run this match unless the 'after' counter has been reached. + * + * Don't run this match more than once per iteration */ + + if (node->leaf.callback->install_slot || + m->read_counter <= node->leaf.callback->after || + node->leaf.callback->last_iteration == bus->iteration_counter) + return bus_match_run(bus, node->next, m); + + node->leaf.callback->last_iteration = bus->iteration_counter; + } + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + /* Run the callback. And then invoke siblings. */ + if (node->leaf.callback->callback) { + _cleanup_(sd_bus_error_free) sd_bus_error error_buffer = SD_BUS_ERROR_NULL; + sd_bus_slot *slot; + + slot = container_of(node->leaf.callback, sd_bus_slot, match_callback); + if (bus) { + bus->current_slot = sd_bus_slot_ref(slot); /* reference held for the duration of the callback, dropped again below */ + bus->current_handler = node->leaf.callback->callback; + bus->current_userdata = slot->userdata; + } + r = node->leaf.callback->callback(m, slot->userdata, &error_buffer); + if (bus) { + bus->current_userdata = NULL; + bus->current_handler = NULL; + bus->current_slot = sd_bus_slot_unref(slot); + } + + r = bus_maybe_reply_error(m, r, &error_buffer); + if (r != 0) /* any non-zero result ends the traversal and is handed back to the caller */ + return r; + + if (bus && bus->match_callbacks_modified) + return 0; + } + + return bus_match_run(bus, node->next, m); + + case BUS_MATCH_MESSAGE_TYPE: + test_u8 = m->header->type; + break; + + case BUS_MATCH_SENDER: + test_str = m->sender; + /* FIXME: resolve test_str from a well-known to a unique name first */ + break; + + case BUS_MATCH_DESTINATION: + test_str = m->destination; + break; + + case BUS_MATCH_INTERFACE: + test_str = m->interface; + break; + + case BUS_MATCH_MEMBER: + test_str = m->member; + break; + + case 
BUS_MATCH_PATH: + case BUS_MATCH_PATH_NAMESPACE: + test_str = m->path; + break; + + case BUS_MATCH_ARG ... BUS_MATCH_ARG_LAST: + (void) bus_message_get_arg(m, node->type - BUS_MATCH_ARG, &test_str); /* the offset from the first enum value of the range is the argument index */ + break; + + case BUS_MATCH_ARG_PATH ... BUS_MATCH_ARG_PATH_LAST: + (void) bus_message_get_arg(m, node->type - BUS_MATCH_ARG_PATH, &test_str); + break; + + case BUS_MATCH_ARG_NAMESPACE ... BUS_MATCH_ARG_NAMESPACE_LAST: + (void) bus_message_get_arg(m, node->type - BUS_MATCH_ARG_NAMESPACE, &test_str); + break; + + case BUS_MATCH_ARG_HAS ... BUS_MATCH_ARG_HAS_LAST: + (void) bus_message_get_arg_strv(m, node->type - BUS_MATCH_ARG_HAS, &test_strv); + break; + + default: + assert_not_reached(); + } + + if (BUS_MATCH_CAN_HASH(node->type)) { + struct bus_match_node *found; + + /* Lookup via hash table, nice! So let's jump directly. */ + + if (test_str) + found = hashmap_get(node->compare.children, test_str); + else if (test_strv) { + STRV_FOREACH(i, test_strv) { /* each array element may select its own value node, so all of them are looked up */ + found = hashmap_get(node->compare.children, *i); + if (found) { + r = bus_match_run(bus, found, m); + if (r != 0) + return r; + } + } + + found = NULL; /* already dispatched above, skip the common dispatch below */ + } else if (node->type == BUS_MATCH_MESSAGE_TYPE) + found = hashmap_get(node->compare.children, UINT_TO_PTR(test_u8)); + else + found = NULL; + + if (found) { + r = bus_match_run(bus, found, m); + if (r != 0) + return r; + } + } else + /* No hash table, so let's iterate manually... 
*/ + for (struct bus_match_node *c = node->child; c; c = c->next) { + if (!value_node_test(c, node->type, test_u8, test_str, test_strv, m)) + continue; + + r = bus_match_run(bus, c, m); + if (r != 0) + return r; + + if (bus && bus->match_callbacks_modified) + return 0; + } + + if (bus && bus->match_callbacks_modified) + return 0; + + /* And now, let's invoke our siblings */ + return bus_match_run(bus, node->next, m); +} + +static int bus_match_add_compare_value( + struct bus_match_node *where, + enum bus_match_node_type t, + uint8_t value_u8, + const char *value_str, + struct bus_match_node **ret) { /* Finds or creates, below where, the compare node of type t and its value node for the given value. Stores the value node in *ret and returns 0 if it already existed, 1 if it was created, or a negative errno on failure. */ + + struct bus_match_node *c, *n = NULL; + int r; + + assert(where); + assert(IN_SET(where->type, BUS_MATCH_ROOT, BUS_MATCH_VALUE)); + assert(BUS_MATCH_IS_COMPARE(t)); + assert(ret); + + for (c = where->child; c && c->type != t; c = c->next) + ; + + if (c) { + /* Comparison node already exists? Then let's see if the value node exists too. */ + + if (t == BUS_MATCH_MESSAGE_TYPE) + n = hashmap_get(c->compare.children, UINT_TO_PTR(value_u8)); + else if (BUS_MATCH_CAN_HASH(t)) + n = hashmap_get(c->compare.children, value_str); + else + for (n = c->child; n && !value_node_same(n, t, value_u8, value_str); n = n->next) + ; + + if (n) { + *ret = n; + return 0; + } + } else { + /* Comparison node, doesn't exist yet? Then let's create it. 
*/ + + c = new0(struct bus_match_node, 1); + if (!c) { + r = -ENOMEM; + goto fail; + } + + c->type = t; + c->parent = where; + c->next = where->child; + if (c->next) + c->next->prev = c; + where->child = c; + + if (t == BUS_MATCH_MESSAGE_TYPE) { + c->compare.children = hashmap_new(NULL); /* keyed by the numeric message type stored as a pointer value */ + if (!c->compare.children) { + r = -ENOMEM; + goto fail; + } + } else if (BUS_MATCH_CAN_HASH(t)) { + c->compare.children = hashmap_new(&string_hash_ops); + if (!c->compare.children) { + r = -ENOMEM; + goto fail; + } + } + } + + n = new0(struct bus_match_node, 1); + if (!n) { + r = -ENOMEM; + goto fail; + } + + n->type = BUS_MATCH_VALUE; + n->value.u8 = value_u8; + if (value_str) { + n->value.str = strdup(value_str); + if (!n->value.str) { + r = -ENOMEM; + goto fail; + } + } + + n->parent = c; + if (c->compare.children) { + + if (t == BUS_MATCH_MESSAGE_TYPE) + r = hashmap_put(c->compare.children, UINT_TO_PTR(value_u8), n); + else + r = hashmap_put(c->compare.children, n->value.str, n); /* the key is the string owned by the node itself */ + + if (r < 0) + goto fail; + } else { + n->next = c->child; + if (n->next) + n->next->prev = n; + c->child = n; + } + + *ret = n; + return 1; + +fail: /* n has not been linked anywhere when we get here; a compare node that is still empty is released, one that has children is kept by bus_match_node_maybe_free() */ + if (c) + bus_match_node_maybe_free(c); + + if (n) { + free(n->value.str); + free(n); + } + + return r; +} + +static int bus_match_add_leaf( + struct bus_match_node *where, + struct match_callback *callback) { /* Prepends a new leaf node for callback to the child list of where and records it in callback->match_node. Returns 1 on success, -ENOMEM on allocation failure. */ + + struct bus_match_node *n; + + assert(where); + assert(IN_SET(where->type, BUS_MATCH_ROOT, BUS_MATCH_VALUE)); + assert(callback); + + n = new0(struct bus_match_node, 1); + if (!n) + return -ENOMEM; + + n->type = BUS_MATCH_LEAF; + n->parent = where; + n->next = where->child; + if (n->next) + n->next->prev = n; + + n->leaf.callback = callback; + callback->match_node = n; + + where->child = n; + + return 1; +} + +enum bus_match_node_type bus_match_node_type_from_string(const char *k, size_t n) { /* Maps the first n bytes of k to a node type; returns -EINVAL if the key is not recognized. */ + assert(k); + + if (n == 4 && startswith(k, "type")) + return BUS_MATCH_MESSAGE_TYPE; + if (n == 6 && startswith(k, "sender")) + return 
BUS_MATCH_SENDER; + if (n == 11 && startswith(k, "destination")) + return BUS_MATCH_DESTINATION; + if (n == 9 && startswith(k, "interface")) + return BUS_MATCH_INTERFACE; + if (n == 6 && startswith(k, "member")) + return BUS_MATCH_MEMBER; + if (n == 4 && startswith(k, "path")) + return BUS_MATCH_PATH; + if (n == 14 && startswith(k, "path_namespace")) + return BUS_MATCH_PATH_NAMESPACE; + + if (n == 4 && startswith(k, "arg")) { /* argN with a single digit */ + int j; + + j = undecchar(k[3]); + if (j < 0) + return -EINVAL; + + return BUS_MATCH_ARG + j; + } + + if (n == 5 && startswith(k, "arg")) { /* argNN with two digits; a leading zero is rejected by the a <= 0 test and the result is capped at BUS_MATCH_ARG_LAST */ + int a, b; + enum bus_match_node_type t; + + a = undecchar(k[3]); + b = undecchar(k[4]); + if (a <= 0 || b < 0) + return -EINVAL; + + t = BUS_MATCH_ARG + a * 10 + b; + if (t > BUS_MATCH_ARG_LAST) + return -EINVAL; + + return t; + } + + if (n == 8 && startswith(k, "arg") && startswith(k + 4, "path")) { /* argNpath, single digit */ + int j; + + j = undecchar(k[3]); + if (j < 0) + return -EINVAL; + + return BUS_MATCH_ARG_PATH + j; + } + + if (n == 9 && startswith(k, "arg") && startswith(k + 5, "path")) { /* argNNpath, two digits */ + enum bus_match_node_type t; + int a, b; + + a = undecchar(k[3]); + b = undecchar(k[4]); + if (a <= 0 || b < 0) + return -EINVAL; + + t = BUS_MATCH_ARG_PATH + a * 10 + b; + if (t > BUS_MATCH_ARG_PATH_LAST) + return -EINVAL; + + return t; + } + + if (n == 13 && startswith(k, "arg") && startswith(k + 4, "namespace")) { /* argNnamespace, single digit */ + int j; + + j = undecchar(k[3]); + if (j < 0) + return -EINVAL; + + return BUS_MATCH_ARG_NAMESPACE + j; + } + + if (n == 14 && startswith(k, "arg") && startswith(k + 5, "namespace")) { /* argNNnamespace, two digits */ + enum bus_match_node_type t; + int a, b; + + a = undecchar(k[3]); + b = undecchar(k[4]); + if (a <= 0 || b < 0) + return -EINVAL; + + t = BUS_MATCH_ARG_NAMESPACE + a * 10 + b; + if (t > BUS_MATCH_ARG_NAMESPACE_LAST) + return -EINVAL; + + return t; + } + + if (n == 7 && startswith(k, "arg") && startswith(k + 4, "has")) { /* argNhas, single digit */ + int j; + + j = undecchar(k[3]); + if (j < 0) + return -EINVAL; + + return BUS_MATCH_ARG_HAS + j; + } + + if (n == 8 && 
startswith(k, "arg") && startswith(k + 5, "has")) { /* argNNhas, two digits */ + enum bus_match_node_type t; + int a, b; + + a = undecchar(k[3]); + b = undecchar(k[4]); + if (a <= 0 || b < 0) + return -EINVAL; + + t = BUS_MATCH_ARG_HAS + a * 10 + b; + if (t > BUS_MATCH_ARG_HAS_LAST) + return -EINVAL; + + return t; + } + + return -EINVAL; +} + +static int match_component_compare(const struct bus_match_component *a, const struct bus_match_component *b) { /* Orders components by node type only. */ + return CMP(a->type, b->type); +} + +void bus_match_parse_free(struct bus_match_component *components, size_t n_components) { /* Frees the value string of each component and then the array itself. */ + for (size_t i = 0; i < n_components; i++) + free(components[i].value_str); + + free(components); +} + +int bus_match_parse( + const char *match, + struct bus_match_component **ret_components, + size_t *ret_n_components) { /* Parses a comma-separated list of key=value terms (values optionally in single quotes, backslash escapes the next character) into an array sorted by type. Returns 0 on success, -EINVAL on malformed input or duplicate keys, -ENOMEM on allocation failure. */ + + struct bus_match_component *components = NULL; + size_t n_components = 0; + int r; + + assert(match); + assert(ret_components); + assert(ret_n_components); + + CLEANUP_ARRAY(components, n_components, bus_match_parse_free); + + while (*match != '\0') { + const char *eq, *q; + enum bus_match_node_type t; + size_t j = 0; + _cleanup_free_ char *value = NULL; + bool escaped = false, quoted; + uint8_t u; + + /* Avahi's match rules appear to include whitespace, skip over it */ + match += strspn(match, " "); + + eq = strchr(match, '='); + if (!eq) + return -EINVAL; + + t = bus_match_node_type_from_string(match, eq - match); + if (t < 0) + return -EINVAL; + + quoted = eq[1] == '\''; + + for (q = eq + 1 + quoted;; q++) { + + if (*q == '\0') { + + if (quoted) /* end of input before the closing quote */ + return -EINVAL; + + if (value) + value[j] = '\0'; + break; + } + + if (!escaped) { + if (*q == '\\') { + escaped = true; + continue; + } + + if (quoted) { + if (*q == '\'') { + if (value) + value[j] = '\0'; + break; + } + } else { + if (*q == ',') { + if (value) + value[j] = '\0'; + break; + } + } + } + + if (!GREEDY_REALLOC(value, j + 2)) /* room for this character plus the terminating NUL */ + return -ENOMEM; + + value[j++] = *q; + escaped = false; + } + + if (!value) { + value = strdup(""); + if 
(!value) + return -ENOMEM; + } + + if (t == BUS_MATCH_MESSAGE_TYPE) { + r = bus_message_type_from_string(value, &u); + if (r < 0) + return r; + + value = mfree(value); /* for the message type only the numeric value is kept */ + } else + u = 0; + + if (!GREEDY_REALLOC(components, n_components + 1)) + return -ENOMEM; + + components[n_components++] = (struct bus_match_component) { + .type = t, + .value_str = TAKE_PTR(value), + .value_u8 = u, + }; + + if (q[quoted] == 0) /* q sits on the terminator (closing quote, comma or NUL); when quoted, look one character further */ + break; + + if (q[quoted] != ',') + return -EINVAL; + + match = q + 1 + quoted; + } + + /* Order the whole thing, so that we always generate the same tree */ + typesafe_qsort(components, n_components, match_component_compare); + + /* Check for duplicates */ + for (size_t i = 0; i+1 < n_components; i++) + if (components[i].type == components[i+1].type) + return -EINVAL; + + *ret_components = TAKE_PTR(components); + *ret_n_components = n_components; + + return 0; +} + +char *bus_match_to_string(struct bus_match_component *components, size_t n_components) { /* Formats the components as a comma-separated list of key='value' terms. Returns a newly allocated string, or NULL on failure. NOTE(review): values are written between single quotes without any escaping; confirm that this is acceptable for values containing a quote or backslash. */ + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + int r; + + if (n_components <= 0) + return strdup(""); + + assert(components); + + f = memstream_init(&m); + if (!f) + return NULL; + + for (size_t i = 0; i < n_components; i++) { + char buf[32]; + + if (i != 0) + fputc(',', f); + + fputs(bus_match_node_type_to_string(components[i].type, buf, sizeof(buf)), f); + fputc('=', f); + fputc('\'', f); + + if (components[i].type == BUS_MATCH_MESSAGE_TYPE) + fputs(bus_message_type_to_string(components[i].value_u8), f); + else + fputs(components[i].value_str, f); + + fputc('\'', f); + } + + char *buffer; + r = memstream_finalize(&m, &buffer, NULL); + if (r < 0) + return NULL; + + return buffer; +} + +int bus_match_add( + struct bus_match_node *root, + struct bus_match_component *components, + size_t n_components, + struct match_callback *callback) { /* Walks or creates one compare/value level per component, descending from root, and attaches a leaf for callback at the end. Returns the result of bus_match_add_leaf(), or a negative errno from an earlier step. */ + + int r; + + assert(root); + assert(callback); + + for (size_t i = 0; i < n_components; i++) { + r = bus_match_add_compare_value(root, + components[i].type, + 
components[i].value_u8, + components[i].value_str, + &root); /* root is advanced to the value node, so the next component nests below it */ + if (r < 0) + return r; + } + + return bus_match_add_leaf(root, callback); +} + +int bus_match_remove( + struct bus_match_node *root, + struct match_callback *callback) { /* Removes the leaf registered for callback and prunes ancestors that became empty. Returns 0 if the callback has no leaf, 1 otherwise. */ + + struct bus_match_node *node, *pp; + + assert(root); + assert(callback); + + node = callback->match_node; + if (!node) + return 0; + + assert(node->type == BUS_MATCH_LEAF); + + callback->match_node = NULL; + + /* Free the leaf */ + pp = node->parent; + bus_match_node_free(node); + + /* Prune the tree above */ + while (pp) { + node = pp; + pp = node->parent; /* read before node may be freed */ + + if (!bus_match_node_maybe_free(node)) + break; + } + + return 1; +} + +void bus_match_free(struct bus_match_node *node) { /* Recursively frees everything below node, and node itself unless it is the root. */ + struct bus_match_node *c; + + if (!node) + return; + + if (BUS_MATCH_CAN_HASH(node->type)) { + + HASHMAP_FOREACH(c, node->compare.children) + bus_match_free(c); + + assert(hashmap_isempty(node->compare.children)); + } + + while ((c = node->child)) + bus_match_free(c); + + if (node->type != BUS_MATCH_ROOT) + bus_match_node_free(node); +} + +const char* bus_match_node_type_to_string(enum bus_match_node_type t, char buf[], size_t l) { /* Returns the key name for t. Fixed names are returned as literals; the argN family is formatted into buf of size l. Returns NULL for unknown types. */ + switch (t) { + + case BUS_MATCH_ROOT: + return "root"; + + case BUS_MATCH_VALUE: + return "value"; + + case BUS_MATCH_LEAF: + return "leaf"; + + case BUS_MATCH_MESSAGE_TYPE: + return "type"; + + case BUS_MATCH_SENDER: + return "sender"; + + case BUS_MATCH_DESTINATION: + return "destination"; + + case BUS_MATCH_INTERFACE: + return "interface"; + + case BUS_MATCH_MEMBER: + return "member"; + + case BUS_MATCH_PATH: + return "path"; + + case BUS_MATCH_PATH_NAMESPACE: + return "path_namespace"; + + case BUS_MATCH_ARG ... BUS_MATCH_ARG_LAST: + return snprintf_ok(buf, l, "arg%i", t - BUS_MATCH_ARG); + + case BUS_MATCH_ARG_PATH ... BUS_MATCH_ARG_PATH_LAST: + return snprintf_ok(buf, l, "arg%ipath", t - BUS_MATCH_ARG_PATH); + + case BUS_MATCH_ARG_NAMESPACE ... 
BUS_MATCH_ARG_NAMESPACE_LAST: + return snprintf_ok(buf, l, "arg%inamespace", t - BUS_MATCH_ARG_NAMESPACE); + + case BUS_MATCH_ARG_HAS ... BUS_MATCH_ARG_HAS_LAST: + return snprintf_ok(buf, l, "arg%ihas", t - BUS_MATCH_ARG_HAS); + + default: + return NULL; + } +} + +void bus_match_dump(FILE *out, struct bus_match_node *node, unsigned level) { /* Prints node and everything below it to out, indenting two columns per level. */ + char buf[32]; + + if (!node) + return; + + fprintf(out, "%*s[%s]", 2 * (int) level, "", bus_match_node_type_to_string(node->type, buf, sizeof(buf))); + + if (node->type == BUS_MATCH_VALUE) { + if (node->parent->type == BUS_MATCH_MESSAGE_TYPE) + fprintf(out, " <%u>\n", node->value.u8); + else + fprintf(out, " <%s>\n", node->value.str); + } else if (node->type == BUS_MATCH_ROOT) + fputs(" root\n", out); + else if (node->type == BUS_MATCH_LEAF) + fprintf(out, " %p/%p\n", node->leaf.callback->callback, + container_of(node->leaf.callback, sd_bus_slot, match_callback)->userdata); + else + putc('\n', out); + + if (BUS_MATCH_CAN_HASH(node->type)) { /* hashed compare nodes keep their value children in the hash table */ + struct bus_match_node *c; + HASHMAP_FOREACH(c, node->compare.children) + bus_match_dump(out, c, level + 1); + } + + for (struct bus_match_node *c = node->child; c; c = c->next) + bus_match_dump(out, c, level + 1); +} + +enum bus_match_scope bus_match_get_scope(const struct bus_match_component *components, size_t n_components) { /* Classifies a parsed match as BUS_MATCH_LOCAL, BUS_MATCH_DRIVER or BUS_MATCH_GENERIC; an empty match is generic. */ + bool found_driver = false; + + if (n_components <= 0) + return BUS_MATCH_GENERIC; + + assert(components); + + /* Checks whether the specified match can only match the + * pseudo-service for local messages, which we detect by + * sender, interface or path. If a match is not restricted to + * local messages, then we check if it only matches on the + * driver. 
*/ + + for (size_t i = 0; i < n_components; i++) { + const struct bus_match_component *c = components + i; + + if (c->type == BUS_MATCH_SENDER) { + if (streq_ptr(c->value_str, "org.freedesktop.DBus.Local")) + return BUS_MATCH_LOCAL; + + if (streq_ptr(c->value_str, "org.freedesktop.DBus")) + found_driver = true; /* remembered only; a later local term still wins */ + } + + if (c->type == BUS_MATCH_INTERFACE && streq_ptr(c->value_str, "org.freedesktop.DBus.Local")) + return BUS_MATCH_LOCAL; + + if (c->type == BUS_MATCH_PATH && streq_ptr(c->value_str, "/org/freedesktop/DBus/Local")) + return BUS_MATCH_LOCAL; + } + + return found_driver ? BUS_MATCH_DRIVER : BUS_MATCH_GENERIC; +} diff --git a/src/libsystemd/sd-bus/bus-match.h b/src/libsystemd/sd-bus/bus-match.h new file mode 100644 index 0000000..ccb6aae --- /dev/null +++ b/src/libsystemd/sd-bus/bus-match.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +#include "hashmap.h" + +enum bus_match_node_type { + BUS_MATCH_ROOT, + BUS_MATCH_VALUE, + BUS_MATCH_LEAF, + + /* The following are all different kinds of compare nodes */ + BUS_MATCH_SENDER, + BUS_MATCH_MESSAGE_TYPE, + BUS_MATCH_DESTINATION, + BUS_MATCH_INTERFACE, + BUS_MATCH_MEMBER, + BUS_MATCH_PATH, + BUS_MATCH_PATH_NAMESPACE, + BUS_MATCH_ARG, /* each arg family reserves 64 consecutive values, one per argument index 0..63 */ + BUS_MATCH_ARG_LAST = BUS_MATCH_ARG + 63, + BUS_MATCH_ARG_PATH, + BUS_MATCH_ARG_PATH_LAST = BUS_MATCH_ARG_PATH + 63, + BUS_MATCH_ARG_NAMESPACE, + BUS_MATCH_ARG_NAMESPACE_LAST = BUS_MATCH_ARG_NAMESPACE + 63, + BUS_MATCH_ARG_HAS, + BUS_MATCH_ARG_HAS_LAST = BUS_MATCH_ARG_HAS + 63, + _BUS_MATCH_NODE_TYPE_MAX, + _BUS_MATCH_NODE_TYPE_INVALID = -EINVAL, +}; + +struct bus_match_node { + enum bus_match_node_type type; + struct bus_match_node *parent, *next, *prev, *child; /* next/prev link siblings in the child list of parent */ + + union { /* which member is valid depends on type */ + struct { + char *str; + uint8_t u8; + } value; + struct { + struct match_callback *callback; + } leaf; + struct { + /* If this is set, then the child is NULL */ + Hashmap *children; + } compare; + }; +}; + +struct bus_match_component { + 
enum bus_match_node_type type; + uint8_t value_u8; + char *value_str; +}; + +enum bus_match_scope { + BUS_MATCH_GENERIC, + BUS_MATCH_LOCAL, + BUS_MATCH_DRIVER, +}; + +int bus_match_run(sd_bus *bus, struct bus_match_node *root, sd_bus_message *m); + +int bus_match_add(struct bus_match_node *root, struct bus_match_component *components, size_t n_components, struct match_callback *callback); +int bus_match_remove(struct bus_match_node *root, struct match_callback *callback); + +void bus_match_free(struct bus_match_node *node); + +void bus_match_dump(FILE *out, struct bus_match_node *node, unsigned level); + +const char* bus_match_node_type_to_string(enum bus_match_node_type t, char buf[], size_t l); +enum bus_match_node_type bus_match_node_type_from_string(const char *k, size_t n); + +int bus_match_parse(const char *match, struct bus_match_component **ret_components, size_t *ret_n_components); +void bus_match_parse_free(struct bus_match_component *components, size_t n_components); +char *bus_match_to_string(struct bus_match_component *components, size_t n_components); + +enum bus_match_scope bus_match_get_scope(const struct bus_match_component *components, size_t n_components); diff --git a/src/libsystemd/sd-bus/bus-message.c b/src/libsystemd/sd-bus/bus-message.c new file mode 100644 index 0000000..ab8b068 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-message.c @@ -0,0 +1,4712 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-message.h" +#include "bus-signature.h" +#include "bus-type.h" +#include "fd-util.h" +#include "iovec-util.h" +#include "memfd-util.h" +#include "memory-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "utf8.h" + +static int message_append_basic(sd_bus_message *m, char type, const void *p, const void **stored); +static int message_parse_fields(sd_bus_message *m); + +static void 
*adjust_pointer(const void *p, void *old_base, size_t sz, void *new_base) { + + if (!p) + return NULL; + + if (old_base == new_base) + return (void*) p; + + if ((uint8_t*) p < (uint8_t*) old_base) + return (void*) p; + + if ((uint8_t*) p >= (uint8_t*) old_base + sz) + return (void*) p; + + return (uint8_t*) new_base + ((uint8_t*) p - (uint8_t*) old_base); +} + +static void message_free_part(sd_bus_message *m, struct bus_body_part *part) { + assert(m); + assert(part); + + if (part->memfd >= 0) { + /* erase if requested, but only if the memfd is not sealed yet, i.e. is writable */ + if (m->sensitive && !m->sealed) + explicit_bzero_safe(part->data, part->size); + + close_and_munmap(part->memfd, part->mmap_begin, part->mapped); + } else if (part->munmap_this) + /* We don't erase sensitive data here, since the data is memory mapped from someone else, and + * we just don't know if it's OK to write to it */ + munmap(part->mmap_begin, part->mapped); + else { + /* Erase this if that is requested. Since this is regular memory we know we can write it. 
*/ + if (m->sensitive) + explicit_bzero_safe(part->data, part->size); + + if (part->free_this) + free(part->data); + } + + if (part != &m->body) + free(part); +} + +static void message_reset_parts(sd_bus_message *m) { + struct bus_body_part *part; + + assert(m); + + part = &m->body; + while (m->n_body_parts > 0) { + struct bus_body_part *next = part->next; + message_free_part(m, part); + part = next; + m->n_body_parts--; + } + + m->body_end = NULL; + + m->cached_rindex_part = NULL; + m->cached_rindex_part_begin = 0; +} + +static struct bus_container *message_get_last_container(sd_bus_message *m) { + assert(m); + + if (m->n_containers == 0) + return &m->root_container; + + assert(m->containers); + return m->containers + m->n_containers - 1; +} + +static void message_free_last_container(sd_bus_message *m) { + struct bus_container *c; + + c = message_get_last_container(m); + + free(c->signature); + free(c->peeked_signature); + + /* Move to previous container, but not if we are on root container */ + if (m->n_containers > 0) + m->n_containers--; +} + +static void message_reset_containers(sd_bus_message *m) { + assert(m); + + while (m->n_containers > 0) + message_free_last_container(m); + + m->containers = mfree(m->containers); + m->root_container.index = 0; +} + +static sd_bus_message* message_free(sd_bus_message *m) { + assert(m); + + message_reset_parts(m); + + if (m->free_header) + free(m->header); + + /* Note that we don't unref m->bus here. That's already done by sd_bus_message_unref() as each user + * reference to the bus message also is considered a reference to the bus connection itself. 
*/ + + if (m->free_fds) { + close_many(m->fds, m->n_fds); + free(m->fds); + } + + if (m->iovec != m->iovec_fixed) + free(m->iovec); + + message_reset_containers(m); + assert(m->n_containers == 0); + message_free_last_container(m); + + bus_creds_done(&m->creds); + return mfree(m); +} + +static void *message_extend_fields(sd_bus_message *m, size_t sz, bool add_offset) { + void *op, *np; + size_t old_size, new_size, start; + + assert(m); + + if (m->poisoned) + return NULL; + + old_size = sizeof(struct bus_header) + m->fields_size; + start = ALIGN8(old_size); + new_size = start + sz; + + if (new_size < start || new_size > UINT32_MAX) + goto poison; + + if (old_size == new_size) + return (uint8_t*) m->header + old_size; + + if (m->free_header) { + np = realloc(m->header, ALIGN8(new_size)); + if (!np) + goto poison; + } else { + /* Initially, the header is allocated as part of + * the sd_bus_message itself, let's replace it by + * dynamic data */ + + np = malloc(ALIGN8(new_size)); + if (!np) + goto poison; + + memcpy(np, m->header, sizeof(struct bus_header)); + } + + /* Zero out padding */ + if (start > old_size) + memzero((uint8_t*) np + old_size, start - old_size); + + op = m->header; + m->header = np; + m->fields_size = new_size - sizeof(struct bus_header); + + /* Adjust quick access pointers */ + m->path = adjust_pointer(m->path, op, old_size, m->header); + m->interface = adjust_pointer(m->interface, op, old_size, m->header); + m->member = adjust_pointer(m->member, op, old_size, m->header); + m->destination = adjust_pointer(m->destination, op, old_size, m->header); + m->sender = adjust_pointer(m->sender, op, old_size, m->header); + m->error.name = adjust_pointer(m->error.name, op, old_size, m->header); + + m->free_header = true; + + if (add_offset) { + if (m->n_header_offsets >= ELEMENTSOF(m->header_offsets)) + goto poison; + + m->header_offsets[m->n_header_offsets++] = new_size - sizeof(struct bus_header); + } + + return (uint8_t*) np + start; + +poison: + 
m->poisoned = true; + return NULL; +} + +static int message_append_field_string( + sd_bus_message *m, + uint64_t h, + char type, + const char *s, + const char **ret) { + + size_t l; + uint8_t *p; + + assert(m); + + /* dbus only allows 8-bit header field ids */ + if (h > 0xFF) + return -EINVAL; + + /* dbus doesn't allow strings over 32-bit */ + l = strlen(s); + if (l > UINT32_MAX) + return -EINVAL; + + /* Signature "(yv)" where the variant contains "s" */ + + /* (field id byte + (signature length + signature 's' + NUL) + (string length + string + NUL)) */ + p = message_extend_fields(m, 4 + 4 + l + 1, false); + if (!p) + return -ENOMEM; + + p[0] = (uint8_t) h; + p[1] = 1; + p[2] = type; + p[3] = 0; + + ((uint32_t*) p)[1] = l; + memcpy(p + 8, s, l + 1); + + if (ret) + *ret = (char*) p + 8; + + return 0; +} + +static int message_append_field_signature( + sd_bus_message *m, + uint64_t h, + const char *s, + const char **ret) { + + size_t l; + uint8_t *p; + + assert(m); + + /* dbus only allows 8-bit header field ids */ + if (h > 0xFF) + return -EINVAL; + + /* dbus doesn't allow signatures over 8-bit */ + l = strlen(s); + if (l > SD_BUS_MAXIMUM_SIGNATURE_LENGTH) + return -EINVAL; + + /* Signature "(yv)" where the variant contains "g" */ + + /* (field id byte + (signature length + signature 'g' + NUL) + (string length + string + NUL)) */ + p = message_extend_fields(m, 4 + 1 + l + 1, false); + if (!p) + return -ENOMEM; + + p[0] = (uint8_t) h; + p[1] = 1; + p[2] = SD_BUS_TYPE_SIGNATURE; + p[3] = 0; + p[4] = l; + memcpy(p + 5, s, l + 1); + + if (ret) + *ret = (const char*) p + 5; + + return 0; +} + +static int message_append_field_uint32(sd_bus_message *m, uint64_t h, uint32_t x) { + uint8_t *p; + + assert(m); + + /* dbus only allows 8-bit header field ids */ + if (h > 0xFF) + return -EINVAL; + + /* (field id byte + (signature length + signature 'u' + NUL) + value) */ + p = message_extend_fields(m, 4 + 4, false); + if (!p) + return -ENOMEM; + + p[0] = (uint8_t) h; + p[1] = 1; 
+ p[2] = 'u'; + p[3] = 0; + + ((uint32_t*) p)[1] = x; + + return 0; +} + +static int message_append_reply_cookie(sd_bus_message *m, uint64_t cookie) { + assert(m); + + /* 64-bit cookies are not supported */ + if (cookie > UINT32_MAX) + return -EOPNOTSUPP; + + return message_append_field_uint32(m, BUS_MESSAGE_HEADER_REPLY_SERIAL, (uint32_t) cookie); +} + +static int message_from_header( + sd_bus *bus, + void *buffer, + size_t message_size, + int *fds, + size_t n_fds, + const char *label, + sd_bus_message **ret) { + + _cleanup_free_ sd_bus_message *m = NULL; + struct bus_header *h; + size_t a, label_sz = 0; /* avoid false maybe-uninitialized warning */ + + assert(bus); + assert(buffer || message_size <= 0); + assert(fds || n_fds <= 0); + assert(ret); + + if (message_size < sizeof(struct bus_header)) + return -EBADMSG; + + h = buffer; + if (!IN_SET(h->version, 1, 2)) + return -EBADMSG; + + if (h->type == _SD_BUS_MESSAGE_TYPE_INVALID) + return -EBADMSG; + + if (!IN_SET(h->endian, BUS_LITTLE_ENDIAN, BUS_BIG_ENDIAN)) + return -EBADMSG; + + /* Note that we are happy with unknown flags in the flags header! 
*/ + + a = ALIGN(sizeof(sd_bus_message)); + + if (label) { + label_sz = strlen(label); + a += label_sz + 1; + } + + m = malloc0(a); + if (!m) + return -ENOMEM; + + m->sealed = true; + m->header = buffer; + + if (h->serial == 0) + return -EBADMSG; + + m->fields_size = BUS_MESSAGE_BSWAP32(m, h->fields_size); + m->body_size = BUS_MESSAGE_BSWAP32(m, h->body_size); + + assert(message_size >= sizeof(struct bus_header)); + if (ALIGN8(m->fields_size) > message_size - sizeof(struct bus_header) || + m->body_size != message_size - sizeof(struct bus_header) - ALIGN8(m->fields_size)) + return -EBADMSG; + + m->fds = fds; + m->n_fds = n_fds; + + if (label) { + m->creds.label = (char*) m + ALIGN(sizeof(sd_bus_message)); + memcpy(m->creds.label, label, label_sz + 1); + + m->creds.mask |= SD_BUS_CREDS_SELINUX_CONTEXT; + } + + m->n_ref = 1; + m->bus = sd_bus_ref(bus); + + *ret = TAKE_PTR(m); + + return 0; +} + +int bus_message_from_malloc( + sd_bus *bus, + void *buffer, + size_t length, + int *fds, + size_t n_fds, + const char *label, + sd_bus_message **ret) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + size_t sz; + int r; + + r = message_from_header( + bus, + buffer, length, + fds, n_fds, + label, + &m); + if (r < 0) + return r; + + sz = length - sizeof(struct bus_header) - ALIGN8(m->fields_size); + if (sz > 0) { + m->n_body_parts = 1; + m->body.data = (uint8_t*) buffer + sizeof(struct bus_header) + ALIGN8(m->fields_size); + m->body.size = sz; + m->body.sealed = true; + m->body.memfd = -EBADF; + } + + m->n_iovec = 1; + m->iovec = m->iovec_fixed; + m->iovec[0] = IOVEC_MAKE(buffer, length); + + r = message_parse_fields(m); + if (r < 0) + return r; + + /* We take possession of the memory and fds now */ + m->free_header = true; + m->free_fds = true; + + *ret = TAKE_PTR(m); + return 0; +} + +_public_ int sd_bus_message_new( + sd_bus *bus, + sd_bus_message **m, + uint8_t type) { + + assert_return(bus, -ENOTCONN); + assert_return(bus = bus_resolve(bus), -ENOPKG); + 
assert_return(bus->state != BUS_UNSET, -ENOTCONN); + assert_return(m, -EINVAL); + /* Creation of messages with _SD_BUS_MESSAGE_TYPE_INVALID is allowed. */ + assert_return(type < _SD_BUS_MESSAGE_TYPE_MAX, -EINVAL); + + sd_bus_message *t = malloc0(ALIGN(sizeof(sd_bus_message)) + sizeof(struct bus_header)); + if (!t) + return -ENOMEM; + + t->n_ref = 1; + t->bus = sd_bus_ref(bus); + t->header = (struct bus_header*) ((uint8_t*) t + ALIGN(sizeof(struct sd_bus_message))); + t->header->endian = BUS_NATIVE_ENDIAN; + t->header->type = type; + t->header->version = bus->message_version; + t->allow_fds = bus->can_fds || !IN_SET(bus->state, BUS_HELLO, BUS_RUNNING); + + if (bus->allow_interactive_authorization) + t->header->flags |= BUS_MESSAGE_ALLOW_INTERACTIVE_AUTHORIZATION; + + *m = t; + return 0; +} + +_public_ int sd_bus_message_new_signal_to( + sd_bus *bus, + sd_bus_message **m, + const char *destination, + const char *path, + const char *interface, + const char *member) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *t = NULL; + int r; + + assert_return(bus, -ENOTCONN); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state != BUS_UNSET, -ENOTCONN); + assert_return(!destination || service_name_is_valid(destination), -EINVAL); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(interface_name_is_valid(interface), -EINVAL); + assert_return(member_name_is_valid(member), -EINVAL); + assert_return(m, -EINVAL); + + r = sd_bus_message_new(bus, &t, SD_BUS_MESSAGE_SIGNAL); + if (r < 0) + return -ENOMEM; + + assert(t); + + t->header->flags |= BUS_MESSAGE_NO_REPLY_EXPECTED; + + r = message_append_field_string(t, BUS_MESSAGE_HEADER_PATH, SD_BUS_TYPE_OBJECT_PATH, path, &t->path); + if (r < 0) + return r; + r = message_append_field_string(t, BUS_MESSAGE_HEADER_INTERFACE, SD_BUS_TYPE_STRING, interface, &t->interface); + if (r < 0) + return r; + r = message_append_field_string(t, BUS_MESSAGE_HEADER_MEMBER, SD_BUS_TYPE_STRING, member, 
&t->member); + if (r < 0) + return r; + + if (destination) { + r = message_append_field_string(t, BUS_MESSAGE_HEADER_DESTINATION, SD_BUS_TYPE_STRING, destination, &t->destination); + if (r < 0) + return r; + } + + *m = TAKE_PTR(t); + return 0; +} + +_public_ int sd_bus_message_new_signal( + sd_bus *bus, + sd_bus_message **m, + const char *path, + const char *interface, + const char *member) { + + return sd_bus_message_new_signal_to(bus, m, NULL, path, interface, member); +} + +_public_ int sd_bus_message_new_method_call( + sd_bus *bus, + sd_bus_message **m, + const char *destination, + const char *path, + const char *interface, + const char *member) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *t = NULL; + int r; + + assert_return(bus, -ENOTCONN); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state != BUS_UNSET, -ENOTCONN); + assert_return(!destination || service_name_is_valid(destination), -EINVAL); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!interface || interface_name_is_valid(interface), -EINVAL); + assert_return(member_name_is_valid(member), -EINVAL); + assert_return(m, -EINVAL); + + r = sd_bus_message_new(bus, &t, SD_BUS_MESSAGE_METHOD_CALL); + if (r < 0) + return -ENOMEM; + + assert(t); + + r = message_append_field_string(t, BUS_MESSAGE_HEADER_PATH, SD_BUS_TYPE_OBJECT_PATH, path, &t->path); + if (r < 0) + return r; + r = message_append_field_string(t, BUS_MESSAGE_HEADER_MEMBER, SD_BUS_TYPE_STRING, member, &t->member); + if (r < 0) + return r; + + if (interface) { + r = message_append_field_string(t, BUS_MESSAGE_HEADER_INTERFACE, SD_BUS_TYPE_STRING, interface, &t->interface); + if (r < 0) + return r; + } + + if (destination) { + r = message_append_field_string(t, BUS_MESSAGE_HEADER_DESTINATION, SD_BUS_TYPE_STRING, destination, &t->destination); + if (r < 0) + return r; + } + + *m = TAKE_PTR(t); + return 0; +} + +static int message_new_reply( + sd_bus_message *call, + uint8_t type, + sd_bus_message 
**m) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *t = NULL; + uint64_t cookie; + int r; + + assert_return(call, -EINVAL); + assert_return(call->sealed, -EPERM); + assert_return(call->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(call->bus->state != BUS_UNSET, -ENOTCONN); + assert_return(m, -EINVAL); + + cookie = BUS_MESSAGE_COOKIE(call); + if (cookie == 0) + return -EOPNOTSUPP; + + r = sd_bus_message_new(call->bus, &t, type); + if (r < 0) + return -ENOMEM; + + assert(t); + + t->header->flags |= BUS_MESSAGE_NO_REPLY_EXPECTED; + t->reply_cookie = cookie; + r = message_append_reply_cookie(t, t->reply_cookie); + if (r < 0) + return r; + + if (call->sender) { + r = message_append_field_string(t, BUS_MESSAGE_HEADER_DESTINATION, SD_BUS_TYPE_STRING, call->sender, &t->destination); + if (r < 0) + return r; + } + + t->dont_send = !!(call->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED); + t->enforced_reply_signature = call->enforced_reply_signature; + + /* let's copy the sensitive flag over. Let's do that as a safety precaution to keep a transaction + * wholly sensitive if already the incoming message was sensitive. This is particularly useful when a + * vtable record sets the SD_BUS_VTABLE_SENSITIVE flag on a method call, since this means it applies + * to both the message call and the reply. 
*/ + t->sensitive = call->sensitive; + + *m = TAKE_PTR(t); + return 0; +} + +_public_ int sd_bus_message_new_method_return( + sd_bus_message *call, + sd_bus_message **m) { + + return message_new_reply(call, SD_BUS_MESSAGE_METHOD_RETURN, m); +} + +_public_ int sd_bus_message_new_method_error( + sd_bus_message *call, + sd_bus_message **m, + const sd_bus_error *e) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *t = NULL; + int r; + + assert_return(sd_bus_error_is_set(e), -EINVAL); + assert_return(m, -EINVAL); + + r = message_new_reply(call, SD_BUS_MESSAGE_METHOD_ERROR, &t); + if (r < 0) + return r; + + r = message_append_field_string(t, BUS_MESSAGE_HEADER_ERROR_NAME, SD_BUS_TYPE_STRING, e->name, &t->error.name); + if (r < 0) + return r; + + if (e->message) { + r = message_append_basic(t, SD_BUS_TYPE_STRING, e->message, (const void**) &t->error.message); + if (r < 0) + return r; + } + + t->error._need_free = -1; + + *m = TAKE_PTR(t); + return 0; +} + +_public_ int sd_bus_message_new_method_errorf( + sd_bus_message *call, + sd_bus_message **m, + const char *name, + const char *format, + ...) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + va_list ap; + + assert_return(name, -EINVAL); + assert_return(m, -EINVAL); + + va_start(ap, format); + sd_bus_error_setfv(&error, name, format, ap); + va_end(ap); + + return sd_bus_message_new_method_error(call, m, &error); +} + +_public_ int sd_bus_message_new_method_errno( + sd_bus_message *call, + sd_bus_message **m, + int error, + const sd_bus_error *p) { + + _cleanup_(sd_bus_error_free) sd_bus_error berror = SD_BUS_ERROR_NULL; + + if (sd_bus_error_is_set(p)) + return sd_bus_message_new_method_error(call, m, p); + + sd_bus_error_set_errno(&berror, error); + + return sd_bus_message_new_method_error(call, m, &berror); +} + +_public_ int sd_bus_message_new_method_errnof( + sd_bus_message *call, + sd_bus_message **m, + int error, + const char *format, + ...) 
{ + + _cleanup_(sd_bus_error_free) sd_bus_error berror = SD_BUS_ERROR_NULL; + va_list ap; + + va_start(ap, format); + sd_bus_error_set_errnofv(&berror, error, format, ap); + va_end(ap); + + return sd_bus_message_new_method_error(call, m, &berror); +} + +void bus_message_set_sender_local(sd_bus *bus, sd_bus_message *m) { + assert(bus); + assert(m); + + m->sender = m->creds.unique_name = (char*) "org.freedesktop.DBus.Local"; + m->creds.well_known_names_local = true; + m->creds.mask |= (SD_BUS_CREDS_UNIQUE_NAME|SD_BUS_CREDS_WELL_KNOWN_NAMES) & bus->creds_mask; +} + +void bus_message_set_sender_driver(sd_bus *bus, sd_bus_message *m) { + assert(bus); + assert(m); + + m->sender = m->creds.unique_name = (char*) "org.freedesktop.DBus"; + m->creds.well_known_names_driver = true; + m->creds.mask |= (SD_BUS_CREDS_UNIQUE_NAME|SD_BUS_CREDS_WELL_KNOWN_NAMES) & bus->creds_mask; +} + +int bus_message_new_synthetic_error( + sd_bus *bus, + uint64_t cookie, + const sd_bus_error *e, + sd_bus_message **m) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *t = NULL; + int r; + + assert(bus); + assert(sd_bus_error_is_set(e)); + assert(m); + + r = sd_bus_message_new(bus, &t, SD_BUS_MESSAGE_METHOD_ERROR); + if (r < 0) + return -ENOMEM; + + assert(t); + + t->header->flags |= BUS_MESSAGE_NO_REPLY_EXPECTED; + t->reply_cookie = cookie; + + r = message_append_reply_cookie(t, t->reply_cookie); + if (r < 0) + return r; + + if (bus && bus->unique_name) { + r = message_append_field_string(t, BUS_MESSAGE_HEADER_DESTINATION, SD_BUS_TYPE_STRING, bus->unique_name, &t->destination); + if (r < 0) + return r; + } + + r = message_append_field_string(t, BUS_MESSAGE_HEADER_ERROR_NAME, SD_BUS_TYPE_STRING, e->name, &t->error.name); + if (r < 0) + return r; + + if (e->message) { + r = message_append_basic(t, SD_BUS_TYPE_STRING, e->message, (const void**) &t->error.message); + if (r < 0) + return r; + } + + t->error._need_free = -1; + + bus_message_set_sender_driver(bus, t); + + *m = TAKE_PTR(t); + return 0; 
+} + +_public_ sd_bus_message* sd_bus_message_ref(sd_bus_message *m) { + if (!m) + return NULL; + + /* We are fine if this message so far was either explicitly reffed or not reffed but queued into at + * least one bus connection object. */ + assert(m->n_ref > 0 || m->n_queued > 0); + + m->n_ref++; + + /* Each user reference to a bus message shall also be considered a ref on the bus */ + sd_bus_ref(m->bus); + return m; +} + +_public_ sd_bus_message* sd_bus_message_unref(sd_bus_message *m) { + if (!m) + return NULL; + + assert(m->n_ref > 0); + + sd_bus_unref(m->bus); /* Each regular ref is also a ref on the bus connection. Let's hence drop it + * here. Note we have to do this before decrementing our own n_ref here, since + * otherwise, if this message is currently queued sd_bus_unref() might call + * bus_message_unref_queued() for this which might then destroy the message + * while we are still processing it. */ + m->n_ref--; + + if (m->n_ref > 0 || m->n_queued > 0) + return NULL; + + /* Unset the bus field if neither the user has a reference nor this message is queued. We are careful + * to reset the field only after the last reference to the bus is dropped, after all we might keep + * multiple references to the bus, once for each reference kept on ourselves. */ + m->bus = NULL; + + return message_free(m); +} + +sd_bus_message* bus_message_ref_queued(sd_bus_message *m, sd_bus *bus) { + if (!m) + return NULL; + + /* If this is a different bus than the message is associated with, then implicitly turn this into a + * regular reference. This means that you can create a memory leak by enqueuing a message generated + * on one bus onto another at the same time as enqueueing a message from the second one on the first, + * as we'll not detect the cyclic references there. 
*/ + if (bus != m->bus) + return sd_bus_message_ref(m); + + assert(m->n_ref > 0 || m->n_queued > 0); + m->n_queued++; + + return m; +} + +sd_bus_message* bus_message_unref_queued(sd_bus_message *m, sd_bus *bus) { + if (!m) + return NULL; + + if (bus != m->bus) + return sd_bus_message_unref(m); + + assert(m->n_queued > 0); + m->n_queued--; + + if (m->n_ref > 0 || m->n_queued > 0) + return NULL; + + m->bus = NULL; + + return message_free(m); +} + +_public_ int sd_bus_message_get_type(sd_bus_message *m, uint8_t *type) { + assert_return(m, -EINVAL); + assert_return(type, -EINVAL); + + *type = m->header->type; + return 0; +} + +/* Stores the cookie of the message in *cookie. Returns -ENODATA if the cookie is zero, and -EINVAL if + * either argument is NULL. */ +_public_ int sd_bus_message_get_cookie(sd_bus_message *m, uint64_t *cookie) { + uint64_t c; + + assert_return(m, -EINVAL); + assert_return(cookie, -EINVAL); + + c = BUS_MESSAGE_COOKIE(m); + if (c == 0) + return -ENODATA; + + /* Hand out the value we just validated, rather than evaluating the macro a second time */ + *cookie = c; + return 0; +} + +_public_ int sd_bus_message_get_reply_cookie(sd_bus_message *m, uint64_t *cookie) { + assert_return(m, -EINVAL); + assert_return(cookie, -EINVAL); + + if (m->reply_cookie == 0) + return -ENODATA; + + *cookie = m->reply_cookie; + return 0; +} + +_public_ int sd_bus_message_get_expect_reply(sd_bus_message *m) { + assert_return(m, -EINVAL); + + return m->header->type == SD_BUS_MESSAGE_METHOD_CALL && + !(m->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED); +} + +_public_ int sd_bus_message_get_auto_start(sd_bus_message *m) { + assert_return(m, -EINVAL); + + return !(m->header->flags & BUS_MESSAGE_NO_AUTO_START); +} + +_public_ int sd_bus_message_get_allow_interactive_authorization(sd_bus_message *m) { + assert_return(m, -EINVAL); + + return m->header->type == SD_BUS_MESSAGE_METHOD_CALL && + (m->header->flags & BUS_MESSAGE_ALLOW_INTERACTIVE_AUTHORIZATION); +} + +_public_ const char *sd_bus_message_get_path(sd_bus_message *m) { + assert_return(m, NULL); + + return m->path; +} + +_public_ const char *sd_bus_message_get_interface(sd_bus_message *m) { + assert_return(m, NULL); + + 
return m->interface; +} + +_public_ const char *sd_bus_message_get_member(sd_bus_message *m) { + assert_return(m, NULL); + + return m->member; +} + +_public_ const char *sd_bus_message_get_destination(sd_bus_message *m) { + assert_return(m, NULL); + + return m->destination; +} + +_public_ const char *sd_bus_message_get_sender(sd_bus_message *m) { + assert_return(m, NULL); + + return m->sender; +} + +_public_ const sd_bus_error *sd_bus_message_get_error(sd_bus_message *m) { + assert_return(m, NULL); + + if (!sd_bus_error_is_set(&m->error)) + return NULL; + + return &m->error; +} + +_public_ int sd_bus_message_get_monotonic_usec(sd_bus_message *m, uint64_t *usec) { + assert_return(m, -EINVAL); + assert_return(usec, -EINVAL); + + if (m->monotonic <= 0) + return -ENODATA; + + *usec = m->monotonic; + return 0; +} + +_public_ int sd_bus_message_get_realtime_usec(sd_bus_message *m, uint64_t *usec) { + assert_return(m, -EINVAL); + assert_return(usec, -EINVAL); + + if (m->realtime <= 0) + return -ENODATA; + + *usec = m->realtime; + return 0; +} + +_public_ int sd_bus_message_get_seqnum(sd_bus_message *m, uint64_t *seqnum) { + assert_return(m, -EINVAL); + assert_return(seqnum, -EINVAL); + + if (m->seqnum <= 0) + return -ENODATA; + + *seqnum = m->seqnum; + return 0; +} + +_public_ sd_bus_creds *sd_bus_message_get_creds(sd_bus_message *m) { + assert_return(m, NULL); + + if (m->creds.mask == 0) + return NULL; + + return &m->creds; +} + +_public_ int sd_bus_message_is_signal( + sd_bus_message *m, + const char *interface, + const char *member) { + + assert_return(m, -EINVAL); + + if (m->header->type != SD_BUS_MESSAGE_SIGNAL) + return 0; + + if (interface && !streq_ptr(m->interface, interface)) + return 0; + + if (member && !streq_ptr(m->member, member)) + return 0; + + return 1; +} + +_public_ int sd_bus_message_is_method_call( + sd_bus_message *m, + const char *interface, + const char *member) { + + assert_return(m, -EINVAL); + + if (m->header->type != 
SD_BUS_MESSAGE_METHOD_CALL) + return 0; + + if (interface && !streq_ptr(m->interface, interface)) + return 0; + + if (member && !streq_ptr(m->member, member)) + return 0; + + return 1; +} + +_public_ int sd_bus_message_is_method_error(sd_bus_message *m, const char *name) { + assert_return(m, -EINVAL); + + if (m->header->type != SD_BUS_MESSAGE_METHOD_ERROR) + return 0; + + if (name && !streq_ptr(m->error.name, name)) + return 0; + + return 1; +} + +_public_ int sd_bus_message_set_expect_reply(sd_bus_message *m, int b) { + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(m->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EPERM); + + SET_FLAG(m->header->flags, BUS_MESSAGE_NO_REPLY_EXPECTED, !b); + + return 0; +} + +_public_ int sd_bus_message_set_auto_start(sd_bus_message *m, int b) { + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + + SET_FLAG(m->header->flags, BUS_MESSAGE_NO_AUTO_START, !b); + + return 0; +} + +_public_ int sd_bus_message_set_allow_interactive_authorization(sd_bus_message *m, int b) { + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + + SET_FLAG(m->header->flags, BUS_MESSAGE_ALLOW_INTERACTIVE_AUTHORIZATION, b); + + return 0; +} + +static struct bus_body_part *message_append_part(sd_bus_message *m) { + struct bus_body_part *part; + + assert(m); + + if (m->poisoned) + return NULL; + + if (m->n_body_parts <= 0) { + part = &m->body; + zero(*part); + } else { + assert(m->body_end); + + part = new0(struct bus_body_part, 1); + if (!part) { + m->poisoned = true; + return NULL; + } + + m->body_end->next = part; + } + + part->memfd = -EBADF; + m->body_end = part; + m->n_body_parts++; + + return part; +} + +static void part_zero(struct bus_body_part *part, size_t sz) { + assert(part); + assert(sz > 0); + assert(sz < 8); + + /* All other fields can be left in their defaults */ + assert(!part->data); + assert(part->memfd < 0); + + part->size = sz; + part->is_zero = true; + part->sealed = true; +} + 
+/* Makes sure 'part' can hold 'sz' bytes in total, growing its heap buffer if necessary. On success + * part->size is set to 'sz' and, if 'q' is non-NULL, *q is set to the position in the buffer that + * corresponds to the previous part->size (or NULL if the part has no buffer). On failure the message is + * marked poisoned and -ENOMEM is returned. */ +static int part_make_space( + struct sd_bus_message *m, + struct bus_body_part *part, + size_t sz, + void **q) { + + void *n; + + assert(m); + assert(part); + assert(!part->sealed); + + if (m->poisoned) + return -ENOMEM; + + if (part->allocated == 0 || sz > part->allocated) { + size_t new_allocated; + + /* We allocate twice the requested size below. Refuse sizes where the doubling would wrap + * around size_t, as realloc() would then return a buffer smaller than 'sz'. */ + if (sz > SIZE_MAX / 2) { + m->poisoned = true; + return -ENOMEM; + } + + new_allocated = sz > 0 ? 2 * sz : 64; + n = realloc(part->data, new_allocated); + if (!n) { + m->poisoned = true; + return -ENOMEM; + } + + part->data = n; + part->allocated = new_allocated; + part->free_this = true; + } + + if (q) + *q = part->data ? (uint8_t*) part->data + part->size : NULL; + + part->size = sz; + return 0; +} + +static void message_extend_containers(sd_bus_message *m, size_t expand) { + assert(m); + + if (expand <= 0) + return; + + if (m->n_containers <= 0) + return; + + /* Update counters */ + for (struct bus_container *c = m->containers; c < m->containers + m->n_containers; c++) + if (c->array_size) + *c->array_size += expand; +} + +static void *message_extend_body( + sd_bus_message *m, + size_t align, + size_t sz) { + + size_t start_body, end_body, padding, added; + void *p; + int r; + + assert(m); + assert(align > 0); + assert(!m->sealed); + + if (m->poisoned) + return NULL; + + start_body = ALIGN_TO(m->body_size, align); + end_body = start_body + sz; + + padding = start_body - m->body_size; + added = padding + sz; + + /* Check for 32-bit overflows */ + if (end_body < start_body || end_body > UINT32_MAX) { + m->poisoned = true; + return NULL; + } + + if (added > 0) { + struct bus_body_part *part = NULL; + bool add_new_part; + + add_new_part = + m->n_body_parts <= 0 || + m->body_end->sealed || + (padding != ALIGN_TO(m->body_end->size, align) - m->body_end->size); + /* If this must be an inlined extension, let's create a new part if + * the previous part is large enough to be inlined. 
*/ + + if (add_new_part) { + if (padding > 0) { + part = message_append_part(m); + if (!part) + return NULL; + + part_zero(part, padding); + } + + part = message_append_part(m); + if (!part) + return NULL; + + r = part_make_space(m, part, sz, &p); + if (r < 0) + return NULL; + } else { + void *op; + size_t os, start_part, end_part; + + part = m->body_end; + op = part->data; + os = part->size; + + start_part = ALIGN_TO(part->size, align); + end_part = start_part + sz; + + r = part_make_space(m, part, end_part, &p); + if (r < 0) + return NULL; + + if (padding > 0) { + memzero(p, padding); + p = (uint8_t*) p + padding; + } + + /* Readjust pointers */ + if (m->n_containers > 0) + for (struct bus_container *c = m->containers; c < m->containers + m->n_containers; c++) + c->array_size = adjust_pointer(c->array_size, op, os, part->data); + + m->error.message = (const char*) adjust_pointer(m->error.message, op, os, part->data); + } + } else + /* Return something that is not NULL and is aligned */ + p = (uint8_t*) align; + + m->body_size = end_body; + message_extend_containers(m, added); + + return p; +} + +static int message_push_fd(sd_bus_message *m, int fd) { + int *f, copy; + + assert(m); + + if (fd < 0) + return -EINVAL; + + if (!m->allow_fds) + return -EOPNOTSUPP; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + f = reallocarray(m->fds, m->n_fds + 1, sizeof(int)); + if (!f) { + m->poisoned = true; + safe_close(copy); + return -ENOMEM; + } + + m->fds = f; + m->fds[m->n_fds] = copy; + m->free_fds = true; + + return copy; +} + +int message_append_basic(sd_bus_message *m, char type, const void *p, const void **stored) { + _cleanup_close_ int fd = -EBADF; + struct bus_container *c; + ssize_t align, sz; + uint32_t u32; + void *a; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(bus_type_is_basic(type), -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + c = message_get_last_container(m); + + if (c->signature 
&& c->signature[c->index]) { + /* Container signature is already set */ + + if (c->signature[c->index] != type) + return -ENXIO; + } else { + char *e; + + /* Maybe we can append to the signature? But only if this is the top-level container */ + if (c->enclosing != 0) + return -ENXIO; + + e = strextend(&c->signature, CHAR_TO_STR(type)); + if (!e) { + m->poisoned = true; + return -ENOMEM; + } + } + + switch (type) { + + case SD_BUS_TYPE_STRING: + /* To make things easy we'll serialize a NULL string + * into the empty string */ + p = strempty(p); + + if (!utf8_is_valid(p)) + return -EINVAL; + + align = 4; + sz = 4 + strlen(p) + 1; + break; + + case SD_BUS_TYPE_OBJECT_PATH: + + if (!p) + return -EINVAL; + + if (!object_path_is_valid(p)) + return -EINVAL; + + align = 4; + sz = 4 + strlen(p) + 1; + break; + + case SD_BUS_TYPE_SIGNATURE: + + p = strempty(p); + + if (!signature_is_valid(p, /* allow_dict_entry = */ true)) + return -EINVAL; + + align = 1; + sz = 1 + strlen(p) + 1; + break; + + case SD_BUS_TYPE_BOOLEAN: + + u32 = p && *(int*) p; + p = &u32; + + align = sz = 4; + break; + + case SD_BUS_TYPE_UNIX_FD: + + if (!p) + return -EINVAL; + + fd = message_push_fd(m, *(int*) p); + if (fd < 0) + return fd; + + u32 = m->n_fds; + p = &u32; + + align = sz = 4; + break; + + default: + align = bus_type_get_alignment(type); + sz = bus_type_get_size(type); + break; + } + + assert(align > 0); + assert(sz > 0); + + a = message_extend_body(m, align, sz); + if (!a) + return -ENOMEM; + + if (IN_SET(type, SD_BUS_TYPE_STRING, SD_BUS_TYPE_OBJECT_PATH)) { + *(uint32_t*) a = sz - 5; + memcpy((uint8_t*) a + 4, p, sz - 4); + + if (stored) + *stored = (const uint8_t*) a + 4; + + } else if (type == SD_BUS_TYPE_SIGNATURE) { + *(uint8_t*) a = sz - 2; + memcpy((uint8_t*) a + 1, p, sz - 1); + + if (stored) + *stored = (const uint8_t*) a + 1; + } else { + memcpy(a, p, sz); + + if (stored) + *stored = a; + } + + if (type == SD_BUS_TYPE_UNIX_FD) + m->n_fds++; + + if (c->enclosing != 
SD_BUS_TYPE_ARRAY) + c->index++; + + fd = -EBADF; + return 0; +} + +_public_ int sd_bus_message_append_basic(sd_bus_message *m, char type, const void *p) { + return message_append_basic(m, type, p, NULL); +} + +_public_ int sd_bus_message_append_string_space( + sd_bus_message *m, + size_t size, + char **s) { + + struct bus_container *c; + void *a; + + assert_return(m, -EINVAL); + assert_return(s, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->poisoned, -ESTALE); + + c = message_get_last_container(m); + + if (c->signature && c->signature[c->index]) { + /* Container signature is already set */ + + if (c->signature[c->index] != SD_BUS_TYPE_STRING) + return -ENXIO; + } else { + char *e; + + /* Maybe we can append to the signature? But only if this is the top-level container */ + if (c->enclosing != 0) + return -ENXIO; + + e = strextend(&c->signature, CHAR_TO_STR(SD_BUS_TYPE_STRING)); + if (!e) { + m->poisoned = true; + return -ENOMEM; + } + } + + a = message_extend_body(m, 4, 4 + size + 1); + if (!a) + return -ENOMEM; + + *(uint32_t*) a = size; + *s = (char*) a + 4; + + (*s)[size] = 0; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index++; + + return 0; +} + +_public_ int sd_bus_message_append_string_iovec( + sd_bus_message *m, + const struct iovec *iov, + unsigned n /* should be size_t, but is API now… 😞 */) { + + size_t size; + unsigned i; + char *p; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(iov || n == 0, -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + size = iovec_total_size(iov, n); + + r = sd_bus_message_append_string_space(m, size, &p); + if (r < 0) + return r; + + for (i = 0; i < n; i++) { + + if (iov[i].iov_base) + memcpy(p, iov[i].iov_base, iov[i].iov_len); + else + memset(p, ' ', iov[i].iov_len); + + p += iov[i].iov_len; + } + + return 0; +} + +static int bus_message_open_array( + sd_bus_message *m, + struct bus_container *c, + const char *contents, + uint32_t **array_size, + 
size_t *begin) { + + unsigned nindex; + int alignment; + void *a, *op; + size_t os; + struct bus_body_part *o; + + assert(m); + assert(c); + assert(contents); + assert(array_size); + assert(begin); + + if (!signature_is_single(contents, true)) + return -EINVAL; + + if (c->signature && c->signature[c->index]) { + + /* Verify the existing signature */ + + if (c->signature[c->index] != SD_BUS_TYPE_ARRAY) + return -ENXIO; + + if (!startswith(c->signature + c->index + 1, contents)) + return -ENXIO; + + nindex = c->index + 1 + strlen(contents); + } else { + char *e; + + if (c->enclosing != 0) + return -ENXIO; + + /* Extend the existing signature */ + + e = strextend(&c->signature, CHAR_TO_STR(SD_BUS_TYPE_ARRAY), contents); + if (!e) { + m->poisoned = true; + return -ENOMEM; + } + + nindex = e - c->signature; + } + + alignment = bus_type_get_alignment(contents[0]); + if (alignment < 0) + return alignment; + + a = message_extend_body(m, 4, 4); + if (!a) + return -ENOMEM; + + o = m->body_end; + op = m->body_end->data; + os = m->body_end->size; + + /* Add alignment between size and first element */ + if (!message_extend_body(m, alignment, 0)) + return -ENOMEM; + + /* location of array size might have changed so let's readjust a */ + if (o == m->body_end) + a = adjust_pointer(a, op, os, m->body_end->data); + + *(uint32_t*) a = 0; + *array_size = a; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index = nindex; + + return 0; +} + +static int bus_message_open_variant( + sd_bus_message *m, + struct bus_container *c, + const char *contents) { + + size_t l; + void *a; + + assert(m); + assert(c); + assert(contents); + + if (!signature_is_single(contents, false)) + return -EINVAL; + + if (*contents == SD_BUS_TYPE_DICT_ENTRY_BEGIN) + return -EINVAL; + + if (c->signature && c->signature[c->index]) { + + if (c->signature[c->index] != SD_BUS_TYPE_VARIANT) + return -ENXIO; + + } else { + char *e; + + if (c->enclosing != 0) + return -ENXIO; + + e = strextend(&c->signature, 
CHAR_TO_STR(SD_BUS_TYPE_VARIANT)); + if (!e) { + m->poisoned = true; + return -ENOMEM; + } + } + + l = strlen(contents); + a = message_extend_body(m, 1, 1 + l + 1); + if (!a) + return -ENOMEM; + + *(uint8_t*) a = l; + memcpy((uint8_t*) a + 1, contents, l + 1); + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index++; + + return 0; +} + +static int bus_message_open_struct( + sd_bus_message *m, + struct bus_container *c, + const char *contents, + size_t *begin) { + + size_t nindex; + + assert(m); + assert(c); + assert(contents); + assert(begin); + + if (!signature_is_valid(contents, false)) + return -EINVAL; + + if (c->signature && c->signature[c->index]) { + size_t l; + + l = strlen(contents); + + if (c->signature[c->index] != SD_BUS_TYPE_STRUCT_BEGIN || + !startswith(c->signature + c->index + 1, contents) || + c->signature[c->index + 1 + l] != SD_BUS_TYPE_STRUCT_END) + return -ENXIO; + + nindex = c->index + 1 + l + 1; + } else { + char *e; + + if (c->enclosing != 0) + return -ENXIO; + + e = strextend(&c->signature, CHAR_TO_STR(SD_BUS_TYPE_STRUCT_BEGIN), contents, CHAR_TO_STR(SD_BUS_TYPE_STRUCT_END)); + if (!e) { + m->poisoned = true; + return -ENOMEM; + } + + nindex = e - c->signature; + } + + /* Align contents to 8 byte boundary */ + if (!message_extend_body(m, 8, 0)) + return -ENOMEM; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index = nindex; + + return 0; +} + +static int bus_message_open_dict_entry( + sd_bus_message *m, + struct bus_container *c, + const char *contents, + size_t *begin) { + + assert(m); + assert(c); + assert(contents); + assert(begin); + + if (!signature_is_pair(contents)) + return -EINVAL; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + return -ENXIO; + + if (c->signature && c->signature[c->index]) { + size_t l; + + l = strlen(contents); + + if (c->signature[c->index] != SD_BUS_TYPE_DICT_ENTRY_BEGIN || + !startswith(c->signature + c->index + 1, contents) || + c->signature[c->index + 1 + l] != SD_BUS_TYPE_DICT_ENTRY_END) + return -ENXIO; + } 
else + return -ENXIO; + + /* Align contents to 8 byte boundary */ + if (!message_extend_body(m, 8, 0)) + return -ENOMEM; + + return 0; +} + +_public_ int sd_bus_message_open_container( + sd_bus_message *m, + char type, + const char *contents) { + + struct bus_container *c; + uint32_t *array_size = NULL; + _cleanup_free_ char *signature = NULL; + size_t before, begin = 0; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(contents, -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + /* Make sure we have space for one more container */ + if (!GREEDY_REALLOC(m->containers, m->n_containers + 1)) { + m->poisoned = true; + return -ENOMEM; + } + + c = message_get_last_container(m); + + signature = strdup(contents); + if (!signature) { + m->poisoned = true; + return -ENOMEM; + } + + /* Save old index in the parent container, in case we have to + * abort this container */ + c->saved_index = c->index; + before = m->body_size; + + if (type == SD_BUS_TYPE_ARRAY) + r = bus_message_open_array(m, c, contents, &array_size, &begin); + else if (type == SD_BUS_TYPE_VARIANT) + r = bus_message_open_variant(m, c, contents); + else if (type == SD_BUS_TYPE_STRUCT) + r = bus_message_open_struct(m, c, contents, &begin); + else if (type == SD_BUS_TYPE_DICT_ENTRY) + r = bus_message_open_dict_entry(m, c, contents, &begin); + else + r = -EINVAL; + if (r < 0) + return r; + + /* OK, let's fill it in */ + m->containers[m->n_containers++] = (struct bus_container) { + .enclosing = type, + .signature = TAKE_PTR(signature), + .array_size = array_size, + .before = before, + .begin = begin, + }; + + return 0; +} + +_public_ int sd_bus_message_close_container(sd_bus_message *m) { + struct bus_container *c; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(m->n_containers > 0, -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + c = message_get_last_container(m); + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + if (c->signature && 
c->signature[c->index] != 0) + return -EINVAL; + + m->n_containers--; + + free(c->signature); + + return 0; +} + +typedef struct { + const char *types; + unsigned n_struct; + unsigned n_array; +} TypeStack; + +static int type_stack_push(TypeStack *stack, unsigned max, unsigned *i, const char *types, unsigned n_struct, unsigned n_array) { + assert(stack); + assert(max > 0); + + if (*i >= max) + return -EINVAL; + + stack[*i].types = types; + stack[*i].n_struct = n_struct; + stack[*i].n_array = n_array; + (*i)++; + + return 0; +} + +static int type_stack_pop(TypeStack *stack, unsigned max, unsigned *i, const char **types, unsigned *n_struct, unsigned *n_array) { + assert(stack); + assert(max > 0); + assert(types); + assert(n_struct); + assert(n_array); + + if (*i <= 0) + return 0; + + (*i)--; + *types = stack[*i].types; + *n_struct = stack[*i].n_struct; + *n_array = stack[*i].n_array; + + return 1; +} + +_public_ int sd_bus_message_appendv( + sd_bus_message *m, + const char *types, + va_list ap) { + + unsigned n_array, n_struct; + TypeStack stack[BUS_CONTAINER_DEPTH]; + unsigned stack_ptr = 0; + int r; + + assert_return(m, -EINVAL); + assert_return(types, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->poisoned, -ESTALE); + + n_array = UINT_MAX; + n_struct = strlen(types); + + for (;;) { + const char *t; + + if (n_array == 0 || (n_array == UINT_MAX && n_struct == 0)) { + r = type_stack_pop(stack, ELEMENTSOF(stack), &stack_ptr, &types, &n_struct, &n_array); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + continue; + } + + t = types; + if (n_array != UINT_MAX) + n_array--; + else { + types++; + n_struct--; + } + + switch (*t) { + + case SD_BUS_TYPE_BYTE: { + uint8_t x; + + x = (uint8_t) va_arg(ap, int); + r = sd_bus_message_append_basic(m, *t, &x); + break; + } + + case SD_BUS_TYPE_BOOLEAN: + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: + case SD_BUS_TYPE_UNIX_FD: { + 
uint32_t x; + + /* We assume a boolean is the same as int32_t */ + assert_cc(sizeof(int32_t) == sizeof(int)); + + x = va_arg(ap, uint32_t); + r = sd_bus_message_append_basic(m, *t, &x); + break; + } + + case SD_BUS_TYPE_INT16: + case SD_BUS_TYPE_UINT16: { + uint16_t x; + + x = (uint16_t) va_arg(ap, int); + r = sd_bus_message_append_basic(m, *t, &x); + break; + } + + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: { + uint64_t x; + + x = va_arg(ap, uint64_t); + r = sd_bus_message_append_basic(m, *t, &x); + break; + } + + case SD_BUS_TYPE_DOUBLE: { + double x; + + x = va_arg(ap, double); + r = sd_bus_message_append_basic(m, *t, &x); + break; + } + + case SD_BUS_TYPE_STRING: + case SD_BUS_TYPE_OBJECT_PATH: + case SD_BUS_TYPE_SIGNATURE: { + const char *x; + + x = va_arg(ap, const char*); + r = sd_bus_message_append_basic(m, *t, x); + break; + } + + case SD_BUS_TYPE_ARRAY: { + size_t k; + + r = signature_element_length(t + 1, &k); + if (r < 0) + return r; + + { + char s[k + 1]; + memcpy(s, t + 1, k); + s[k] = 0; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_ARRAY, s); + if (r < 0) + return r; + } + + if (n_array == UINT_MAX) { + types += k; + n_struct -= k; + } + + r = type_stack_push(stack, ELEMENTSOF(stack), &stack_ptr, types, n_struct, n_array); + if (r < 0) + return r; + + types = t + 1; + n_struct = k; + n_array = va_arg(ap, unsigned); + + break; + } + + case SD_BUS_TYPE_VARIANT: { + const char *s; + + s = va_arg(ap, const char*); + if (!s) + return -EINVAL; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_VARIANT, s); + if (r < 0) + return r; + + r = type_stack_push(stack, ELEMENTSOF(stack), &stack_ptr, types, n_struct, n_array); + if (r < 0) + return r; + + types = s; + n_struct = strlen(s); + n_array = UINT_MAX; + + break; + } + + case SD_BUS_TYPE_STRUCT_BEGIN: + case SD_BUS_TYPE_DICT_ENTRY_BEGIN: { + size_t k; + + r = signature_element_length(t, &k); + if (r < 0) + return r; + if (k < 2) + return -ERANGE; + + { + char s[k - 1]; + + memcpy(s, t + 
1, k - 2); + s[k - 2] = 0; + + r = sd_bus_message_open_container(m, *t == SD_BUS_TYPE_STRUCT_BEGIN ? SD_BUS_TYPE_STRUCT : SD_BUS_TYPE_DICT_ENTRY, s); + if (r < 0) + return r; + } + + if (n_array == UINT_MAX) { + types += k - 1; + n_struct -= k - 1; + } + + r = type_stack_push(stack, ELEMENTSOF(stack), &stack_ptr, types, n_struct, n_array); + if (r < 0) + return r; + + types = t + 1; + n_struct = k - 2; + n_array = UINT_MAX; + + break; + } + + default: + r = -EINVAL; + } + + if (r < 0) + return r; + } + + return 1; +} + +_public_ int sd_bus_message_append(sd_bus_message *m, const char *types, ...) { + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_message_appendv(m, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_message_append_array_space( + sd_bus_message *m, + char type, + size_t size, + void **ptr) { + + ssize_t align, sz; + void *a; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(bus_type_is_trivial(type) && type != SD_BUS_TYPE_BOOLEAN, -EINVAL); + assert_return(ptr || size == 0, -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + align = bus_type_get_alignment(type); + sz = bus_type_get_size(type); + + assert_se(align > 0); + assert_se(sz > 0); + + if (size % sz != 0) + return -EINVAL; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_ARRAY, CHAR_TO_STR(type)); + if (r < 0) + return r; + + a = message_extend_body(m, align, size); + if (!a) + return -ENOMEM; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + *ptr = a; + return 0; +} + +_public_ int sd_bus_message_append_array( + sd_bus_message *m, + char type, + const void *ptr, + size_t size) { + int r; + void *p; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(bus_type_is_trivial(type), -EINVAL); + assert_return(ptr || size == 0, -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + r = sd_bus_message_append_array_space(m, type, size, &p); + if (r < 0) + return r; + + 
memcpy_safe(p, ptr, size); + + return 0; +} + +_public_ int sd_bus_message_append_array_iovec( + sd_bus_message *m, + char type, + const struct iovec *iov, + unsigned n /* should be size_t, but is API now… 😞 */) { + + size_t size; + unsigned i; + void *p; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(bus_type_is_trivial(type), -EINVAL); + assert_return(iov || n == 0, -EINVAL); + assert_return(!m->poisoned, -ESTALE); + + size = iovec_total_size(iov, n); + + r = sd_bus_message_append_array_space(m, type, size, &p); + if (r < 0) + return r; + + for (i = 0; i < n; i++) { + + if (iov[i].iov_base) + memcpy(p, iov[i].iov_base, iov[i].iov_len); + else + memzero(p, iov[i].iov_len); + + p = (uint8_t*) p + iov[i].iov_len; + } + + return 0; +} + +_public_ int sd_bus_message_append_array_memfd( + sd_bus_message *m, + char type, + int memfd, + uint64_t offset, + uint64_t size) { + + _cleanup_close_ int copy_fd = -EBADF; + struct bus_body_part *part; + ssize_t align, sz; + uint64_t real_size; + void *a; + int r; + + assert_return(m, -EINVAL); + assert_return(memfd >= 0, -EBADF); + assert_return(bus_type_is_trivial(type), -EINVAL); + assert_return(size > 0, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->poisoned, -ESTALE); + + r = memfd_set_sealed(memfd); + if (r < 0) + return r; + + copy_fd = fcntl(memfd, F_DUPFD_CLOEXEC, 3); + if (copy_fd < 0) + return copy_fd; + + r = memfd_get_size(memfd, &real_size); + if (r < 0) + return r; + + if (offset == 0 && size == UINT64_MAX) + size = real_size; + else if (offset + size > real_size) + return -EMSGSIZE; + + align = bus_type_get_alignment(type); + sz = bus_type_get_size(type); + + assert_se(align > 0); + assert_se(sz > 0); + + if (offset % align != 0) + return -EINVAL; + + if (size % sz != 0) + return -EINVAL; + + if (size > (uint64_t) UINT32_MAX) + return -EINVAL; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_ARRAY, CHAR_TO_STR(type)); + if (r < 0) + return r; + 
+ a = message_extend_body(m, align, 0); + if (!a) + return -ENOMEM; + + part = message_append_part(m); + if (!part) + return -ENOMEM; + + part->memfd = copy_fd; + part->memfd_offset = offset; + part->sealed = true; + part->size = size; + copy_fd = -EBADF; + + m->body_size += size; + message_extend_containers(m, size); + + return sd_bus_message_close_container(m); +} + +_public_ int sd_bus_message_append_string_memfd( + sd_bus_message *m, + int memfd, + uint64_t offset, + uint64_t size) { + + _cleanup_close_ int copy_fd = -EBADF; + struct bus_body_part *part; + struct bus_container *c; + uint64_t real_size; + void *a; + int r; + + assert_return(m, -EINVAL); + assert_return(memfd >= 0, -EBADF); + assert_return(size > 0, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->poisoned, -ESTALE); + + r = memfd_set_sealed(memfd); + if (r < 0) + return r; + + copy_fd = fcntl(memfd, FD_CLOEXEC, 3); + if (copy_fd < 0) + return copy_fd; + + r = memfd_get_size(memfd, &real_size); + if (r < 0) + return r; + + if (offset == 0 && size == UINT64_MAX) + size = real_size; + else if (offset + size > real_size) + return -EMSGSIZE; + + /* We require this to be NUL terminated */ + if (size == 0) + return -EINVAL; + + if (size > (uint64_t) UINT32_MAX) + return -EINVAL; + + c = message_get_last_container(m); + if (c->signature && c->signature[c->index]) { + /* Container signature is already set */ + + if (c->signature[c->index] != SD_BUS_TYPE_STRING) + return -ENXIO; + } else { + char *e; + + /* Maybe we can append to the signature? 
But only if this is the top-level container */ + if (c->enclosing != 0) + return -ENXIO; + + e = strextend(&c->signature, CHAR_TO_STR(SD_BUS_TYPE_STRING)); + if (!e) { + m->poisoned = true; + return -ENOMEM; + } + } + + a = message_extend_body(m, 4, 4); + if (!a) + return -ENOMEM; + + *(uint32_t*) a = size - 1; + + part = message_append_part(m); + if (!part) + return -ENOMEM; + + part->memfd = copy_fd; + part->memfd_offset = offset; + part->sealed = true; + part->size = size; + copy_fd = -EBADF; + + m->body_size += size; + message_extend_containers(m, size); + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index++; + + return 0; +} + +_public_ int sd_bus_message_append_strv(sd_bus_message *m, char **l) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->poisoned, -ESTALE); + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return r; + + STRV_FOREACH(i, l) { + r = sd_bus_message_append_basic(m, 's', *i); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(m); +} + +static int bus_message_close_header(sd_bus_message *m) { + assert(m); + + /* The actual user data is finished now, we just complete the variant and struct now. Remember + * this position, so that during parsing we know where to put the outer container end. 
*/ + m->user_body_size = m->body_size; + + m->header->fields_size = m->fields_size; + m->header->body_size = m->body_size; + + return 0; +} + +_public_ int sd_bus_message_seal(sd_bus_message *m, uint64_t cookie, uint64_t timeout_usec) { + struct bus_body_part *part; + size_t a; + unsigned i; + int r; + + assert_return(m, -EINVAL); + + if (m->sealed) + return -EPERM; + + if (m->n_containers > 0) + return -EBADMSG; + + if (m->poisoned) + return -ESTALE; + + if (cookie > UINT32_MAX) + return -EOPNOTSUPP; + + /* In vtables the return signature of method calls is listed, + * let's check if they match if this is a response */ + if (m->header->type == SD_BUS_MESSAGE_METHOD_RETURN && + m->enforced_reply_signature && + !streq(strempty(m->root_container.signature), m->enforced_reply_signature)) + return -ENOMSG; + + /* If there's a non-trivial signature set, then add it in here */ + if (!isempty(m->root_container.signature)) { + r = message_append_field_signature(m, BUS_MESSAGE_HEADER_SIGNATURE, m->root_container.signature, NULL); + if (r < 0) + return r; + } + + if (m->n_fds > 0) { + r = message_append_field_uint32(m, BUS_MESSAGE_HEADER_UNIX_FDS, m->n_fds); + if (r < 0) + return r; + } + + r = bus_message_close_header(m); + if (r < 0) + return r; + + m->header->serial = (uint32_t) cookie; + + m->timeout = m->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED ? 0 : timeout_usec; + + /* Add padding at the end of the fields part, since we know + * the body needs to start at an 8 byte alignment. We made + * sure we allocated enough space for this, so all we need to + * do here is to zero it out. */ + a = ALIGN8(m->fields_size) - m->fields_size; + if (a > 0) + memzero((uint8_t*) BUS_MESSAGE_FIELDS(m) + m->fields_size, a); + + /* If this is something we can send as memfd, then let's seal + the memfd now. Note that we can send memfds as payload only + for directed messages, and not for broadcasts. 
*/ + if (m->destination && m->bus->use_memfd) { + MESSAGE_FOREACH_PART(part, i, m) + if (part->memfd >= 0 && + !part->sealed && + (part->size > MEMFD_MIN_SIZE || m->bus->use_memfd < 0) && + part != m->body_end) { /* The last part may never be sent as memfd */ + uint64_t sz; + + /* Try to seal it if that makes + * sense. First, unmap our own map to + * make sure we don't keep it busy. */ + bus_body_part_unmap(part); + + /* Then, sync up real memfd size */ + sz = part->size; + r = memfd_set_size(part->memfd, sz); + if (r < 0) + return r; + + /* Finally, try to seal */ + if (memfd_set_sealed(part->memfd) >= 0) + part->sealed = true; + } + } + + m->root_container.end = m->user_body_size; + m->root_container.index = 0; + + m->sealed = true; + + return 0; +} + +int bus_body_part_map(struct bus_body_part *part) { + void *p; + size_t psz, shift; + + assert_se(part); + + if (part->data) + return 0; + + if (part->size <= 0) + return 0; + + /* For smaller zero parts (as used for padding) we don't need to map anything... 
*/ + if (part->memfd < 0 && part->is_zero && part->size < 8) { + static const uint8_t zeroes[7] = { }; + part->data = (void*) zeroes; + return 0; + } + + shift = PAGE_OFFSET(part->memfd_offset); + psz = PAGE_ALIGN(part->size + shift); + if (psz >= SIZE_MAX) + return -EFBIG; + + if (part->memfd >= 0) + p = mmap(NULL, psz, PROT_READ, MAP_PRIVATE, part->memfd, part->memfd_offset - shift); + else if (part->is_zero) + p = mmap(NULL, psz, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + else + return -EINVAL; + + if (p == MAP_FAILED) + return -errno; + + part->mapped = psz; + part->mmap_begin = p; + part->data = (uint8_t*) p + shift; + part->munmap_this = true; + + return 0; +} + +void bus_body_part_unmap(struct bus_body_part *part) { + + assert_se(part); + + if (part->memfd < 0) + return; + + if (!part->mmap_begin) + return; + + if (!part->munmap_this) + return; + + assert_se(munmap(part->mmap_begin, part->mapped) == 0); + + part->mmap_begin = NULL; + part->data = NULL; + part->mapped = 0; + part->munmap_this = false; + + return; +} + +static bool message_end_of_signature(sd_bus_message *m) { + struct bus_container *c; + + assert(m); + + c = message_get_last_container(m); + return !c->signature || c->signature[c->index] == 0; +} + +static bool message_end_of_array(sd_bus_message *m, size_t index) { + struct bus_container *c; + + assert(m); + + c = message_get_last_container(m); + if (c->enclosing != SD_BUS_TYPE_ARRAY) + return false; + + assert(c->array_size); + return index >= c->begin + BUS_MESSAGE_BSWAP32(m, *c->array_size); +} + +_public_ int sd_bus_message_at_end(sd_bus_message *m, int complete) { + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + + if (complete && m->n_containers > 0) + return false; + + if (message_end_of_signature(m)) + return true; + + if (message_end_of_array(m, m->rindex)) + return true; + + return false; +} + +static struct bus_body_part* find_part(sd_bus_message *m, size_t index, size_t sz, void **p) { + struct bus_body_part 
*part; + size_t begin; + int r; + + assert(m); + + if (m->cached_rindex_part && index >= m->cached_rindex_part_begin) { + part = m->cached_rindex_part; + begin = m->cached_rindex_part_begin; + } else { + part = &m->body; + begin = 0; + } + + while (part) { + if (index < begin) + return NULL; + + if (index + sz <= begin + part->size) { + + r = bus_body_part_map(part); + if (r < 0) + return NULL; + + if (p) + *p = part->data ? (uint8_t*) part->data + index - begin + : NULL; /* Avoid dereferencing a NULL pointer. */ + + m->cached_rindex_part = part; + m->cached_rindex_part_begin = begin; + + return part; + } + + begin += part->size; + part = part->next; + } + + return NULL; +} + +static int message_peek_body( + sd_bus_message *m, + size_t *rindex, + size_t align, + size_t nbytes, + void **ret) { + + size_t k, start, end, padding; + struct bus_body_part *part; + uint8_t *q; + + assert(m); + assert(rindex); + assert(align > 0); + + start = ALIGN_TO(*rindex, align); + if (start > m->user_body_size) + return -EBADMSG; + + padding = start - *rindex; + + /* Avoid overflow below */ + if (nbytes > SIZE_MAX - start) + return -EBADMSG; + + end = start + nbytes; + if (end > m->user_body_size) + return -EBADMSG; + + part = find_part(m, *rindex, padding, (void**) &q); + if (!part) + return -EBADMSG; + + if (q) { + /* Verify padding */ + for (k = 0; k < padding; k++) + if (q[k] != 0) + return -EBADMSG; + } + + part = find_part(m, start, nbytes, (void**) &q); + if (!part || (nbytes > 0 && !q)) + return -EBADMSG; + + *rindex = end; + + if (ret) + *ret = q; + + return 0; +} + +static bool validate_nul(const char *s, size_t l) { + + /* Check for NUL chars in the string */ + if (memchr(s, 0, l)) + return false; + + /* Check for NUL termination */ + if (s[l] != 0) + return false; + + return true; +} + +static bool validate_string(const char *s, size_t l) { + + if (!validate_nul(s, l)) + return false; + + /* Check if valid UTF8 */ + if (!utf8_is_valid(s)) + return false; + + return true; 
+} + +static bool validate_signature(const char *s, size_t l) { + + if (!validate_nul(s, l)) + return false; + + /* Check if valid signature */ + if (!signature_is_valid(s, true)) + return false; + + return true; +} + +static bool validate_object_path(const char *s, size_t l) { + + if (!validate_nul(s, l)) + return false; + + if (!object_path_is_valid(s)) + return false; + + return true; +} + +_public_ int sd_bus_message_read_basic(sd_bus_message *m, char type, void *p) { + struct bus_container *c; + size_t rindex; + void *q; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(bus_type_is_basic(type), -EINVAL); + + if (message_end_of_signature(m)) + return -ENXIO; + + if (message_end_of_array(m, m->rindex)) + return 0; + + c = message_get_last_container(m); + if (c->signature[c->index] != type) + return -ENXIO; + + rindex = m->rindex; + + if (IN_SET(type, SD_BUS_TYPE_STRING, SD_BUS_TYPE_OBJECT_PATH)) { + uint32_t l; + bool ok; + + r = message_peek_body(m, &rindex, 4, 4, &q); + if (r < 0) + return r; + + l = BUS_MESSAGE_BSWAP32(m, *(uint32_t*) q); + if (l == UINT32_MAX) + /* avoid overflow right below */ + return -EBADMSG; + + r = message_peek_body(m, &rindex, 1, l+1, &q); + if (r < 0) + return r; + + if (type == SD_BUS_TYPE_OBJECT_PATH) + ok = validate_object_path(q, l); + else + ok = validate_string(q, l); + if (!ok) + return -EBADMSG; + + if (p) + *(const char**) p = q; + + } else if (type == SD_BUS_TYPE_SIGNATURE) { + uint8_t l; + + r = message_peek_body(m, &rindex, 1, 1, &q); + if (r < 0) + return r; + + l = *(uint8_t*) q; + if (l == UINT8_MAX) + /* avoid overflow right below */ + return -EBADMSG; + + r = message_peek_body(m, &rindex, 1, l+1, &q); + if (r < 0) + return r; + + if (!validate_signature(q, l)) + return -EBADMSG; + + if (p) + *(const char**) p = q; + + } else { + ssize_t sz, align; + + align = bus_type_get_alignment(type); + assert(align > 0); + + sz = bus_type_get_size(type); + assert(sz > 0); + + r = 
message_peek_body(m, &rindex, align, sz, &q); + if (r < 0) + return r; + + switch (type) { + + case SD_BUS_TYPE_BYTE: + if (p) + *(uint8_t*) p = *(uint8_t*) q; + break; + + case SD_BUS_TYPE_BOOLEAN: + if (p) + *(int*) p = !!*(uint32_t*) q; + break; + + case SD_BUS_TYPE_INT16: + case SD_BUS_TYPE_UINT16: + if (p) + *(uint16_t*) p = BUS_MESSAGE_BSWAP16(m, *(uint16_t*) q); + break; + + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: + if (p) + *(uint32_t*) p = BUS_MESSAGE_BSWAP32(m, *(uint32_t*) q); + break; + + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: + case SD_BUS_TYPE_DOUBLE: + if (p) + *(uint64_t*) p = BUS_MESSAGE_BSWAP64(m, *(uint64_t*) q); + break; + + case SD_BUS_TYPE_UNIX_FD: { + uint32_t j; + + j = BUS_MESSAGE_BSWAP32(m, *(uint32_t*) q); + if (j >= m->n_fds) + return -EBADMSG; + + if (p) + *(int*) p = m->fds[j]; + break; + } + + default: + assert_not_reached(); + } + } + + m->rindex = rindex; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index++; + + return 1; +} + +static int bus_message_enter_array( + sd_bus_message *m, + struct bus_container *c, + const char *contents, + uint32_t **array_size) { + + size_t rindex; + void *q; + int alignment, r; + + assert(m); + assert(c); + assert(contents); + assert(array_size); + + if (!signature_is_single(contents, true)) + return -EINVAL; + + if (!c->signature || c->signature[c->index] == 0) + return -ENXIO; + + if (c->signature[c->index] != SD_BUS_TYPE_ARRAY) + return -ENXIO; + + if (!startswith(c->signature + c->index + 1, contents)) + return -ENXIO; + + rindex = m->rindex; + + r = message_peek_body(m, &rindex, 4, 4, &q); + if (r < 0) + return r; + + if (BUS_MESSAGE_BSWAP32(m, *(uint32_t*) q) > BUS_ARRAY_MAX_SIZE) + return -EBADMSG; + + alignment = bus_type_get_alignment(contents[0]); + if (alignment < 0) + return alignment; + + r = message_peek_body(m, &rindex, alignment, 0, NULL); + if (r < 0) + return r; + + *array_size = (uint32_t*) q; + + m->rindex = rindex; + + if (c->enclosing != 
SD_BUS_TYPE_ARRAY) + c->index += 1 + strlen(contents); + + return 1; +} + +static int bus_message_enter_variant( + sd_bus_message *m, + struct bus_container *c, + const char *contents) { + + size_t rindex; + uint8_t l; + void *q; + int r; + + assert(m); + assert(c); + assert(contents); + + if (!signature_is_single(contents, false)) + return -EINVAL; + + if (*contents == SD_BUS_TYPE_DICT_ENTRY_BEGIN) + return -EINVAL; + + if (!c->signature || c->signature[c->index] == 0) + return -ENXIO; + + if (c->signature[c->index] != SD_BUS_TYPE_VARIANT) + return -ENXIO; + + rindex = m->rindex; + + r = message_peek_body(m, &rindex, 1, 1, &q); + if (r < 0) + return r; + + l = *(uint8_t*) q; + if (l == UINT8_MAX) + /* avoid overflow right below */ + return -EBADMSG; + + r = message_peek_body(m, &rindex, 1, l+1, &q); + if (r < 0) + return r; + + if (!validate_signature(q, l)) + return -EBADMSG; + + if (!streq(q, contents)) + return -ENXIO; + + m->rindex = rindex; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index++; + + return 1; +} + +static int bus_message_enter_struct( + sd_bus_message *m, + struct bus_container *c, + const char *contents) { + + size_t l; + int r; + + assert(m); + assert(c); + assert(contents); + + if (!signature_is_valid(contents, false)) + return -EINVAL; + + if (!c->signature || c->signature[c->index] == 0) + return -ENXIO; + + l = strlen(contents); + + if (c->signature[c->index] != SD_BUS_TYPE_STRUCT_BEGIN || + !startswith(c->signature + c->index + 1, contents) || + c->signature[c->index + 1 + l] != SD_BUS_TYPE_STRUCT_END) + return -ENXIO; + + r = message_peek_body(m, &m->rindex, 8, 0, NULL); + if (r < 0) + return r; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index += 1 + l + 1; + + return 1; +} + +static int bus_message_enter_dict_entry( + sd_bus_message *m, + struct bus_container *c, + const char *contents) { + + size_t l; + int r; + + assert(m); + assert(c); + assert(contents); + + if (!signature_is_pair(contents)) + return -EINVAL; + + if 
(c->enclosing != SD_BUS_TYPE_ARRAY) + return -ENXIO; + + if (!c->signature || c->signature[c->index] == 0) + return 0; + + l = strlen(contents); + + if (c->signature[c->index] != SD_BUS_TYPE_DICT_ENTRY_BEGIN || + !startswith(c->signature + c->index + 1, contents) || + c->signature[c->index + 1 + l] != SD_BUS_TYPE_DICT_ENTRY_END) + return -ENXIO; + + r = message_peek_body(m, &m->rindex, 8, 0, NULL); + if (r < 0) + return r; + + if (c->enclosing != SD_BUS_TYPE_ARRAY) + c->index += 1 + l + 1; + + return 1; +} + +_public_ int sd_bus_message_enter_container(sd_bus_message *m, + char type, + const char *contents) { + struct bus_container *c; + uint32_t *array_size = NULL; + _cleanup_free_ char *signature = NULL; + size_t before; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(type != 0 || !contents, -EINVAL); + + if (type == 0 || !contents) { + const char *cc; + char tt; + + /* Allow entering into anonymous containers */ + r = sd_bus_message_peek_type(m, &tt, &cc); + if (r < 0) + return r; + + if (type != 0 && type != tt) + return -ENXIO; + + if (contents && !streq(contents, cc)) + return -ENXIO; + + type = tt; + contents = cc; + } + + /* + * We enforce a global limit on container depth, that is much + * higher than the 32 structs and 32 arrays the specification + * mandates. This is simpler to implement for us, and we need + * this only to ensure our container array doesn't grow + * without bounds. We are happy to return any data from a + * message as long as the data itself is valid, even if the + * overall message might be not. + * + * Note that the message signature is validated when + * parsing the headers, and that validation does check the + * 32/32 limit. + * + * Note that the specification defines no limits on the depth + * of stacked variants, but we do. 
+ */ + if (m->n_containers >= BUS_CONTAINER_DEPTH) + return -EBADMSG; + + if (!GREEDY_REALLOC(m->containers, m->n_containers + 1)) + return -ENOMEM; + + if (message_end_of_signature(m)) + return -ENXIO; + + if (message_end_of_array(m, m->rindex)) + return 0; + + c = message_get_last_container(m); + + signature = strdup(contents); + if (!signature) + return -ENOMEM; + + c->saved_index = c->index; + before = m->rindex; + + if (type == SD_BUS_TYPE_ARRAY) + r = bus_message_enter_array(m, c, contents, &array_size); + else if (type == SD_BUS_TYPE_VARIANT) + r = bus_message_enter_variant(m, c, contents); + else if (type == SD_BUS_TYPE_STRUCT) + r = bus_message_enter_struct(m, c, contents); + else if (type == SD_BUS_TYPE_DICT_ENTRY) + r = bus_message_enter_dict_entry(m, c, contents); + else + r = -EINVAL; + if (r <= 0) + return r; + + /* OK, let's fill it in */ + m->containers[m->n_containers++] = (struct bus_container) { + .enclosing = type, + .signature = TAKE_PTR(signature), + + .before = before, + .begin = m->rindex, + /* Unary type has fixed size of 1, but virtual size of 0 */ + .end = m->rindex, + .array_size = array_size, + }; + + return 1; +} + +_public_ int sd_bus_message_exit_container(sd_bus_message *m) { + struct bus_container *c; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(m->n_containers > 0, -ENXIO); + + c = message_get_last_container(m); + + if (c->enclosing != SD_BUS_TYPE_ARRAY) { + if (c->signature && c->signature[c->index] != 0) + return -EBUSY; + } + + if (c->enclosing == SD_BUS_TYPE_ARRAY) { + uint32_t l; + + l = BUS_MESSAGE_BSWAP32(m, *c->array_size); + if (c->begin + l != m->rindex) + return -EBUSY; + } + + message_free_last_container(m); + + return 1; +} + +static void message_quit_container(sd_bus_message *m) { + struct bus_container *c; + + assert(m); + assert(m->sealed); + assert(m->n_containers > 0); + + /* Undo seeks */ + c = message_get_last_container(m); + assert(m->rindex >= c->before); + m->rindex = 
c->before; + + /* Free container */ + message_free_last_container(m); + + /* Correct index of new top-level container */ + c = message_get_last_container(m); + c->index = c->saved_index; +} + +_public_ int sd_bus_message_peek_type(sd_bus_message *m, char *type, const char **contents) { + struct bus_container *c; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + + if (message_end_of_signature(m)) + goto eof; + + if (message_end_of_array(m, m->rindex)) + goto eof; + + c = message_get_last_container(m); + + if (bus_type_is_basic(c->signature[c->index])) { + if (contents) + *contents = NULL; + if (type) + *type = c->signature[c->index]; + return 1; + } + + if (c->signature[c->index] == SD_BUS_TYPE_ARRAY) { + + if (contents) { + size_t l; + + r = signature_element_length(c->signature+c->index+1, &l); + if (r < 0) + return r; + + /* signature_element_length does verification internally */ + + /* The array element must not be empty */ + assert(l >= 1); + if (free_and_strndup(&c->peeked_signature, + c->signature + c->index + 1, l) < 0) + return -ENOMEM; + + *contents = c->peeked_signature; + } + + if (type) + *type = SD_BUS_TYPE_ARRAY; + + return 1; + } + + if (IN_SET(c->signature[c->index], SD_BUS_TYPE_STRUCT_BEGIN, SD_BUS_TYPE_DICT_ENTRY_BEGIN)) { + + if (contents) { + size_t l; + + r = signature_element_length(c->signature+c->index, &l); + if (r < 0) + return r; + + assert(l >= 3); + if (free_and_strndup(&c->peeked_signature, + c->signature + c->index + 1, l - 2) < 0) + return -ENOMEM; + + *contents = c->peeked_signature; + } + + if (type) + *type = c->signature[c->index] == SD_BUS_TYPE_STRUCT_BEGIN ? 
SD_BUS_TYPE_STRUCT : SD_BUS_TYPE_DICT_ENTRY; + + return 1; + } + + if (c->signature[c->index] == SD_BUS_TYPE_VARIANT) { + if (contents) { + size_t rindex, l; + void *q; + + rindex = m->rindex; + r = message_peek_body(m, &rindex, 1, 1, &q); + if (r < 0) + return r; + + l = *(uint8_t*) q; + if (l == UINT8_MAX) + /* avoid overflow right below */ + return -EBADMSG; + + r = message_peek_body(m, &rindex, 1, l+1, &q); + if (r < 0) + return r; + + if (!validate_signature(q, l)) + return -EBADMSG; + + *contents = q; + } + + if (type) + *type = SD_BUS_TYPE_VARIANT; + + return 1; + } + + return -EINVAL; + +eof: + if (type) + *type = 0; + if (contents) + *contents = NULL; + return 0; +} + +_public_ int sd_bus_message_rewind(sd_bus_message *m, int complete) { + struct bus_container *c; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + + if (complete) { + message_reset_containers(m); + m->rindex = 0; + + c = message_get_last_container(m); + } else { + c = message_get_last_container(m); + + c->index = 0; + m->rindex = c->begin; + } + + return !isempty(c->signature); +} + +_public_ int sd_bus_message_readv( + sd_bus_message *m, + const char *types, + va_list ap) { + + unsigned n_array, n_struct; + TypeStack stack[BUS_CONTAINER_DEPTH]; + unsigned stack_ptr = 0; + unsigned n_loop = 0; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(types, -EINVAL); + + if (isempty(types)) + return 0; + + /* Ideally, we'd just call ourselves recursively on every + * complex type. However, the state of a va_list that is + * passed to a function is undefined after that function + * returns. This means we need to decode the va_list linearly + * in a single stackframe. We hence implement our own + * home-grown stack in an array. 
*/ + + n_array = UINT_MAX; /* length of current array entries */ + n_struct = strlen(types); /* length of current struct contents signature */ + + for (;;) { + const char *t; + + n_loop++; + + if (n_array == 0 || (n_array == UINT_MAX && n_struct == 0)) { + r = type_stack_pop(stack, ELEMENTSOF(stack), &stack_ptr, &types, &n_struct, &n_array); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + continue; + } + + t = types; + if (n_array != UINT_MAX) + n_array--; + else { + types++; + n_struct--; + } + + switch (*t) { + + case SD_BUS_TYPE_BYTE: + case SD_BUS_TYPE_BOOLEAN: + case SD_BUS_TYPE_INT16: + case SD_BUS_TYPE_UINT16: + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: + case SD_BUS_TYPE_DOUBLE: + case SD_BUS_TYPE_STRING: + case SD_BUS_TYPE_OBJECT_PATH: + case SD_BUS_TYPE_SIGNATURE: + case SD_BUS_TYPE_UNIX_FD: { + void *p; + + p = va_arg(ap, void*); + r = sd_bus_message_read_basic(m, *t, p); + if (r < 0) + return r; + if (r == 0) { + if (n_loop <= 1) + return 0; + + return -ENXIO; + } + + break; + } + + case SD_BUS_TYPE_ARRAY: { + size_t k; + + r = signature_element_length(t + 1, &k); + if (r < 0) + return r; + + { + char s[k + 1]; + memcpy(s, t + 1, k); + s[k] = 0; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, s); + if (r < 0) + return r; + if (r == 0) { + if (n_loop <= 1) + return 0; + + return -ENXIO; + } + } + + if (n_array == UINT_MAX) { + types += k; + n_struct -= k; + } + + r = type_stack_push(stack, ELEMENTSOF(stack), &stack_ptr, types, n_struct, n_array); + if (r < 0) + return r; + + types = t + 1; + n_struct = k; + n_array = va_arg(ap, unsigned); + + break; + } + + case SD_BUS_TYPE_VARIANT: { + const char *s; + + s = va_arg(ap, const char *); + if (!s) + return -EINVAL; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_VARIANT, s); + if (r < 0) + return r; + if (r == 0) { + if (n_loop <= 1) + return 0; + + return 
-ENXIO; + } + + r = type_stack_push(stack, ELEMENTSOF(stack), &stack_ptr, types, n_struct, n_array); + if (r < 0) + return r; + + types = s; + n_struct = strlen(s); + n_array = UINT_MAX; + + break; + } + + case SD_BUS_TYPE_STRUCT_BEGIN: + case SD_BUS_TYPE_DICT_ENTRY_BEGIN: { + size_t k; + + r = signature_element_length(t, &k); + if (r < 0) + return r; + if (k < 2) + return -ERANGE; + + { + char s[k - 1]; + memcpy(s, t + 1, k - 2); + s[k - 2] = 0; + + r = sd_bus_message_enter_container(m, *t == SD_BUS_TYPE_STRUCT_BEGIN ? SD_BUS_TYPE_STRUCT : SD_BUS_TYPE_DICT_ENTRY, s); + if (r < 0) + return r; + if (r == 0) { + if (n_loop <= 1) + return 0; + return -ENXIO; + } + } + + if (n_array == UINT_MAX) { + types += k - 1; + n_struct -= k - 1; + } + + r = type_stack_push(stack, ELEMENTSOF(stack), &stack_ptr, types, n_struct, n_array); + if (r < 0) + return r; + + types = t + 1; + n_struct = k - 2; + n_array = UINT_MAX; + + break; + } + + default: + return -EINVAL; + } + } + + return 1; +} + +_public_ int sd_bus_message_read(sd_bus_message *m, const char *types, ...) 
{ + va_list ap; + int r; + + va_start(ap, types); + r = sd_bus_message_readv(m, types, ap); + va_end(ap); + + return r; +} + +_public_ int sd_bus_message_skip(sd_bus_message *m, const char *types) { + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + + /* If types is NULL, read exactly one element */ + if (!types) { + struct bus_container *c; + size_t l; + + if (message_end_of_signature(m)) + return -ENXIO; + + if (message_end_of_array(m, m->rindex)) + return 0; + + c = message_get_last_container(m); + + r = signature_element_length(c->signature + c->index, &l); + if (r < 0) + return r; + + types = strndupa_safe(c->signature + c->index, l); + } + + switch (*types) { + + case 0: /* Nothing to drop */ + return 0; + + case SD_BUS_TYPE_BYTE: + case SD_BUS_TYPE_BOOLEAN: + case SD_BUS_TYPE_INT16: + case SD_BUS_TYPE_UINT16: + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: + case SD_BUS_TYPE_DOUBLE: + case SD_BUS_TYPE_STRING: + case SD_BUS_TYPE_OBJECT_PATH: + case SD_BUS_TYPE_SIGNATURE: + case SD_BUS_TYPE_UNIX_FD: + + r = sd_bus_message_read_basic(m, *types, NULL); + if (r <= 0) + return r; + + r = sd_bus_message_skip(m, types + 1); + if (r < 0) + return r; + + return 1; + + case SD_BUS_TYPE_ARRAY: { + size_t k; + + r = signature_element_length(types + 1, &k); + if (r < 0) + return r; + + { + char s[k+1]; + memcpy(s, types+1, k); + s[k] = 0; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, s); + if (r <= 0) + return r; + + for (;;) { + r = sd_bus_message_skip(m, s); + if (r < 0) + return r; + if (r == 0) + break; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + } + + r = sd_bus_message_skip(m, types + 1 + k); + if (r < 0) + return r; + + return 1; + } + + case SD_BUS_TYPE_VARIANT: { + const char *contents; + char x; + + r = sd_bus_message_peek_type(m, &x, &contents); + if (r <= 0) + return r; + + if (x != SD_BUS_TYPE_VARIANT) + return -ENXIO; + + r = 
sd_bus_message_enter_container(m, SD_BUS_TYPE_VARIANT, contents); + if (r <= 0) + return r; + + r = sd_bus_message_skip(m, contents); + if (r < 0) + return r; + assert(r != 0); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + r = sd_bus_message_skip(m, types + 1); + if (r < 0) + return r; + + return 1; + } + + case SD_BUS_TYPE_STRUCT_BEGIN: + case SD_BUS_TYPE_DICT_ENTRY_BEGIN: { + size_t k; + + r = signature_element_length(types, &k); + if (r < 0) + return r; + if (k < 2) + return -ERANGE; + + { + char s[k-1]; + memcpy(s, types+1, k-2); + s[k-2] = 0; + + r = sd_bus_message_enter_container(m, *types == SD_BUS_TYPE_STRUCT_BEGIN ? SD_BUS_TYPE_STRUCT : SD_BUS_TYPE_DICT_ENTRY, s); + if (r <= 0) + return r; + + r = sd_bus_message_skip(m, s); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + } + + r = sd_bus_message_skip(m, types + k); + if (r < 0) + return r; + + return 1; + } + + default: + return -EINVAL; + } +} + +_public_ int sd_bus_message_read_array( + sd_bus_message *m, + char type, + const void **ptr, + size_t *size) { + + struct bus_container *c; + void *p; + size_t sz; + ssize_t align; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(bus_type_is_trivial(type), -EINVAL); + assert_return(ptr, -EINVAL); + assert_return(size, -EINVAL); + assert_return(!BUS_MESSAGE_NEED_BSWAP(m), -EOPNOTSUPP); + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, CHAR_TO_STR(type)); + if (r < 0) + return r; + if (r == 0) { + *ptr = NULL; + *size = 0; + return 0; + } + + c = message_get_last_container(m); + + align = bus_type_get_alignment(type); + if (align < 0) + return align; + + sz = BUS_MESSAGE_BSWAP32(m, *c->array_size); + + if (sz == 0) + /* Zero length array, let's return some aligned + * pointer that is not NULL */ + p = (uint8_t*) align; + else { + r = message_peek_body(m, &m->rindex, align, sz, &p); + if (r < 0) + goto fail; + } + + r = 
sd_bus_message_exit_container(m); + if (r < 0) + goto fail; + + *ptr = (const void*) p; + *size = sz; + + return 1; + +fail: + message_quit_container(m); + return r; +} + +static int message_peek_fields( + sd_bus_message *m, + size_t *rindex, + size_t align, + size_t nbytes, + void **ret) { + + size_t start, end; + + assert(m); + assert(rindex); + assert(align > 0); + + start = ALIGN_TO(*rindex, align); + if (start > m->fields_size) + return -EBADMSG; + + /* Avoid overflow below */ + if (nbytes > SIZE_MAX - start) + return -EBADMSG; + + end = start + nbytes; + if (end > m->fields_size) + return -EBADMSG; + + /* Verify that padding is 0 */ + uint8_t *p = BUS_MESSAGE_FIELDS(m); + for (size_t k = *rindex; k < start; k++) + if (p[k] != 0) + return -EBADMSG; + + if (ret) + *ret = p + start; + + *rindex = end; + return 1; +} + +static int message_peek_field_uint32( + sd_bus_message *m, + size_t *ri, + size_t item_size, + uint32_t *ret) { + + int r; + void *q; + + assert(m); + assert(ri); + + r = message_peek_fields(m, ri, 4, 4, &q); + if (r < 0) + return r; + + if (ret) + *ret = BUS_MESSAGE_BSWAP32(m, *(uint32_t*) q); + + return 0; +} + +static int message_peek_field_string( + sd_bus_message *m, + bool (*validate)(const char *p), + size_t *ri, + size_t item_size, + const char **ret) { + + uint32_t l; + int r; + void *q; + + assert(m); + assert(ri); + + r = message_peek_field_uint32(m, ri, 4, &l); + if (r < 0) + return r; + + if (l == UINT32_MAX) + /* avoid overflow right below */ + return -EBADMSG; + + r = message_peek_fields(m, ri, 1, l+1, &q); + if (r < 0) + return r; + + if (validate) { + if (!validate_nul(q, l)) + return -EBADMSG; + + if (!validate(q)) + return -EBADMSG; + } else { + if (!validate_string(q, l)) + return -EBADMSG; + } + + if (ret) + *ret = q; + + return 0; +} + +static int message_peek_field_signature( + sd_bus_message *m, + size_t *ri, + size_t item_size, + const char **ret) { + + size_t l; + int r; + void *q; + + assert(m); + assert(ri); + + r = 
message_peek_fields(m, ri, 1, 1, &q); + if (r < 0) + return r; + + l = *(uint8_t*) q; + if (l == UINT8_MAX) + /* avoid overflow right below */ + return -EBADMSG; + + r = message_peek_fields(m, ri, 1, l+1, &q); + if (r < 0) + return r; + + if (!validate_signature(q, l)) + return -EBADMSG; + + if (ret) + *ret = q; + + return 0; +} + +static int message_skip_fields( + sd_bus_message *m, + size_t *ri, + uint32_t array_size, + const char **signature) { + + size_t original_index; + int r; + + assert(m); + assert(ri); + assert(signature); + + original_index = *ri; + + for (;;) { + char t; + size_t l; + + if (array_size != UINT32_MAX && + array_size <= *ri - original_index) + return 0; + + t = **signature; + if (!t) + return 0; + + if (t == SD_BUS_TYPE_STRING) { + + r = message_peek_field_string(m, NULL, ri, 0, NULL); + if (r < 0) + return r; + + (*signature)++; + + } else if (t == SD_BUS_TYPE_OBJECT_PATH) { + + r = message_peek_field_string(m, object_path_is_valid, ri, 0, NULL); + if (r < 0) + return r; + + (*signature)++; + + } else if (t == SD_BUS_TYPE_SIGNATURE) { + + r = message_peek_field_signature(m, ri, 0, NULL); + if (r < 0) + return r; + + (*signature)++; + + } else if (bus_type_is_basic(t)) { + ssize_t align, k; + + align = bus_type_get_alignment(t); + k = bus_type_get_size(t); + assert(align > 0 && k > 0); + + r = message_peek_fields(m, ri, align, k, NULL); + if (r < 0) + return r; + + (*signature)++; + + } else if (t == SD_BUS_TYPE_ARRAY) { + + r = signature_element_length(*signature + 1, &l); + if (r < 0) + return r; + + assert(l >= 1); + { + char sig[l + 1], *s = sig; + uint32_t nas; + int alignment; + + strncpy(sig, *signature + 1, l); + sig[l] = '\0'; + + alignment = bus_type_get_alignment(sig[0]); + if (alignment < 0) + return alignment; + + r = message_peek_field_uint32(m, ri, 0, &nas); + if (r < 0) + return r; + if (nas > BUS_ARRAY_MAX_SIZE) + return -EBADMSG; + + r = message_peek_fields(m, ri, alignment, 0, NULL); + if (r < 0) + return r; + + r = 
message_skip_fields(m, ri, nas, (const char**) &s); + if (r < 0) + return r; + } + + (*signature) += 1 + l; + + } else if (t == SD_BUS_TYPE_VARIANT) { + const char *s; + + r = message_peek_field_signature(m, ri, 0, &s); + if (r < 0) + return r; + + r = message_skip_fields(m, ri, UINT32_MAX, (const char**) &s); + if (r < 0) + return r; + + (*signature)++; + + } else if (IN_SET(t, SD_BUS_TYPE_STRUCT, SD_BUS_TYPE_DICT_ENTRY)) { + + r = signature_element_length(*signature, &l); + if (r < 0) + return r; + + assert(l >= 2); + { + char sig[l + 1], *s = sig; + strncpy(sig, *signature + 1, l); + sig[l] = '\0'; + + r = message_skip_fields(m, ri, UINT32_MAX, (const char**) &s); + if (r < 0) + return r; + } + + *signature += l; + } else + return -EBADMSG; + } +} + +static int message_parse_fields(sd_bus_message *m) { + uint32_t unix_fds = 0; + bool unix_fds_set = false; + int r; + + assert(m); + + m->user_body_size = m->body_size; + + for (size_t ri = 0; ri < m->fields_size; ) { + const char *signature; + uint64_t field_type; + size_t item_size = SIZE_MAX; + uint8_t *u8; + + r = message_peek_fields(m, &ri, 8, 1, (void**) &u8); + if (r < 0) + return r; + + field_type = *u8; + + r = message_peek_field_signature(m, &ri, 0, &signature); + if (r < 0) + return r; + + switch (field_type) { + + case _BUS_MESSAGE_HEADER_INVALID: + return -EBADMSG; + + case BUS_MESSAGE_HEADER_PATH: + + if (m->path) + return -EBADMSG; + + if (!streq(signature, "o")) + return -EBADMSG; + + r = message_peek_field_string(m, object_path_is_valid, &ri, item_size, &m->path); + break; + + case BUS_MESSAGE_HEADER_INTERFACE: + + if (m->interface) + return -EBADMSG; + + if (!streq(signature, "s")) + return -EBADMSG; + + r = message_peek_field_string(m, interface_name_is_valid, &ri, item_size, &m->interface); + break; + + case BUS_MESSAGE_HEADER_MEMBER: + + if (m->member) + return -EBADMSG; + + if (!streq(signature, "s")) + return -EBADMSG; + + r = message_peek_field_string(m, member_name_is_valid, &ri, item_size, 
&m->member); + break; + + case BUS_MESSAGE_HEADER_ERROR_NAME: + + if (m->error.name) + return -EBADMSG; + + if (!streq(signature, "s")) + return -EBADMSG; + + r = message_peek_field_string(m, error_name_is_valid, &ri, item_size, &m->error.name); + if (r >= 0) + m->error._need_free = -1; + + break; + + case BUS_MESSAGE_HEADER_DESTINATION: + + if (m->destination) + return -EBADMSG; + + if (!streq(signature, "s")) + return -EBADMSG; + + r = message_peek_field_string(m, service_name_is_valid, &ri, item_size, &m->destination); + break; + + case BUS_MESSAGE_HEADER_SENDER: + + if (m->sender) + return -EBADMSG; + + if (!streq(signature, "s")) + return -EBADMSG; + + r = message_peek_field_string(m, service_name_is_valid, &ri, item_size, &m->sender); + + if (r >= 0 && m->sender[0] == ':' && m->bus->bus_client) { + m->creds.unique_name = (char*) m->sender; + m->creds.mask |= SD_BUS_CREDS_UNIQUE_NAME & m->bus->creds_mask; + } + + break; + + case BUS_MESSAGE_HEADER_SIGNATURE: { + const char *s; + char *c; + + if (m->root_container.signature) + return -EBADMSG; + + if (!streq(signature, "g")) + return -EBADMSG; + + r = message_peek_field_signature(m, &ri, item_size, &s); + if (r < 0) + return r; + + c = strdup(s); + if (!c) + return -ENOMEM; + + free_and_replace(m->root_container.signature, c); + break; + } + + case BUS_MESSAGE_HEADER_REPLY_SERIAL: { + uint32_t serial; + + if (m->reply_cookie != 0) + return -EBADMSG; + + + if (!streq(signature, "u")) + return -EBADMSG; + + r = message_peek_field_uint32(m, &ri, item_size, &serial); + if (r < 0) + return r; + + m->reply_cookie = serial; + + if (m->reply_cookie == 0) + return -EBADMSG; + + break; + } + case BUS_MESSAGE_HEADER_UNIX_FDS: + if (unix_fds_set) + return -EBADMSG; + + if (!streq(signature, "u")) + return -EBADMSG; + + r = message_peek_field_uint32(m, &ri, item_size, &unix_fds); + if (r < 0) + return -EBADMSG; + + unix_fds_set = true; + break; + + default: + r = message_skip_fields(m, &ri, UINT32_MAX, (const char **) 
&signature); + } + if (r < 0) + return r; + } + + if (m->n_fds != unix_fds) + return -EBADMSG; + + switch (m->header->type) { + + case SD_BUS_MESSAGE_SIGNAL: + if (!m->path || !m->interface || !m->member) + return -EBADMSG; + + if (m->reply_cookie != 0) + return -EBADMSG; + + break; + + case SD_BUS_MESSAGE_METHOD_CALL: + + if (!m->path || !m->member) + return -EBADMSG; + + if (m->reply_cookie != 0) + return -EBADMSG; + + break; + + case SD_BUS_MESSAGE_METHOD_RETURN: + + if (m->reply_cookie == 0) + return -EBADMSG; + break; + + case SD_BUS_MESSAGE_METHOD_ERROR: + + if (m->reply_cookie == 0 || !m->error.name) + return -EBADMSG; + break; + } + + /* Refuse non-local messages that claim they are local */ + if (streq_ptr(m->path, "/org/freedesktop/DBus/Local")) + return -EBADMSG; + if (streq_ptr(m->interface, "org.freedesktop.DBus.Local")) + return -EBADMSG; + if (streq_ptr(m->sender, "org.freedesktop.DBus.Local")) + return -EBADMSG; + + m->root_container.end = m->user_body_size; + + /* Try to read the error message, but if we can't it's a non-issue */ + if (m->header->type == SD_BUS_MESSAGE_METHOD_ERROR) + (void) sd_bus_message_read(m, "s", &m->error.message); + + return 0; +} + +_public_ int sd_bus_message_set_destination(sd_bus_message *m, const char *destination) { + assert_return(m, -EINVAL); + assert_return(destination, -EINVAL); + assert_return(service_name_is_valid(destination), -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->destination, -EEXIST); + + return message_append_field_string(m, BUS_MESSAGE_HEADER_DESTINATION, SD_BUS_TYPE_STRING, destination, &m->destination); +} + +_public_ int sd_bus_message_set_sender(sd_bus_message *m, const char *sender) { + assert_return(m, -EINVAL); + assert_return(sender, -EINVAL); + assert_return(service_name_is_valid(sender), -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(!m->sender, -EEXIST); + + return message_append_field_string(m, BUS_MESSAGE_HEADER_SENDER, SD_BUS_TYPE_STRING, sender, 
&m->sender); +} + +int bus_message_get_blob(sd_bus_message *m, void **buffer, size_t *sz) { + size_t total; + void *p, *e; + size_t i; + struct bus_body_part *part; + + assert(m); + assert(buffer); + assert(sz); + + total = BUS_MESSAGE_SIZE(m); + + p = malloc(total); + if (!p) + return -ENOMEM; + + e = mempcpy(p, m->header, BUS_MESSAGE_BODY_BEGIN(m)); + MESSAGE_FOREACH_PART(part, i, m) + e = mempcpy(e, part->data, part->size); + + assert(total == (size_t) ((uint8_t*) e - (uint8_t*) p)); + + *buffer = p; + *sz = total; + + return 0; +} + +_public_ int sd_bus_message_read_strv_extend(sd_bus_message *m, char ***l) { + char type; + const char *contents, *s; + int r; + + assert(m); + assert(l); + + r = sd_bus_message_peek_type(m, &type, &contents); + if (r < 0) + return r; + + if (type != SD_BUS_TYPE_ARRAY || !STR_IN_SET(contents, "s", "o", "g")) + return -ENXIO; + + r = sd_bus_message_enter_container(m, 'a', NULL); + if (r <= 0) + return r; + + /* sd_bus_message_read_basic() does content validation for us. 
*/ + while ((r = sd_bus_message_read_basic(m, *contents, &s)) > 0) { + r = strv_extend(l, s); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 1; +} + +_public_ int sd_bus_message_read_strv(sd_bus_message *m, char ***l) { + _cleanup_strv_free_ char **strv = NULL; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(l, -EINVAL); + + r = sd_bus_message_read_strv_extend(m, &strv); + if (r <= 0) + return r; + + *l = TAKE_PTR(strv); + return 1; +} + +static int bus_message_get_arg_skip( + sd_bus_message *m, + unsigned i, + char *_type, + const char **_contents) { + + unsigned j; + int r; + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + for (j = 0;; j++) { + const char *contents; + char type; + + r = sd_bus_message_peek_type(m, &type, &contents); + if (r < 0) + return r; + if (r == 0) + return -ENXIO; + + /* Don't match against arguments after the first one we don't understand */ + if (!IN_SET(type, SD_BUS_TYPE_STRING, SD_BUS_TYPE_OBJECT_PATH, SD_BUS_TYPE_SIGNATURE) && + !(type == SD_BUS_TYPE_ARRAY && STR_IN_SET(contents, "s", "o", "g"))) + return -ENXIO; + + if (j >= i) { + if (_contents) + *_contents = contents; + if (_type) + *_type = type; + return 0; + } + + r = sd_bus_message_skip(m, NULL); + if (r < 0) + return r; + } + +} + +int bus_message_get_arg(sd_bus_message *m, unsigned i, const char **str) { + char type; + int r; + + assert(m); + assert(str); + + r = bus_message_get_arg_skip(m, i, &type, NULL); + if (r < 0) + return r; + + if (!IN_SET(type, SD_BUS_TYPE_STRING, SD_BUS_TYPE_OBJECT_PATH, SD_BUS_TYPE_SIGNATURE)) + return -ENXIO; + + return sd_bus_message_read_basic(m, type, str); +} + +int bus_message_get_arg_strv(sd_bus_message *m, unsigned i, char ***strv) { + const char *contents; + char type; + int r; + + assert(m); + assert(strv); + + r = bus_message_get_arg_skip(m, i, &type, &contents); + if (r < 0) + return r; + + 
if (type != SD_BUS_TYPE_ARRAY) + return -ENXIO; + if (!STR_IN_SET(contents, "s", "o", "g")) + return -ENXIO; + + return sd_bus_message_read_strv(m, strv); +} + /* Returns 0 for anything that is not a method error message; for method errors returns whatever sd_bus_error_get_errno() yields for the error stored in the message. Note that the NULL-argument fallback is a positive EINVAL, unlike the negative codes used by the neighbouring functions. */ +_public_ int sd_bus_message_get_errno(sd_bus_message *m) { + assert_return(m, EINVAL); + + if (m->header->type != SD_BUS_MESSAGE_METHOD_ERROR) + return 0; + + return sd_bus_error_get_errno(&m->error); +} + /* Returns the signature of the root container when complete is non-zero, otherwise the signature of the container returned by message_get_last_container(); the value is passed through strempty() before being returned. Returns NULL if m is NULL. */ +_public_ const char* sd_bus_message_get_signature(sd_bus_message *m, int complete) { + struct bus_container *c; + + assert_return(m, NULL); + + c = complete ? &m->root_container : message_get_last_container(m); + return strempty(c->signature); +} + /* Reports whether the root container signature is empty according to isempty(). */ +_public_ int sd_bus_message_is_empty(sd_bus_message *m) { + assert_return(m, -EINVAL); + + return isempty(m->root_container.signature); +} + /* Compares the root container signature with the given one; both sides go through strempty() before the comparison. */ +_public_ int sd_bus_message_has_signature(sd_bus_message *m, const char *signature) { + assert_return(m, -EINVAL); + + return streq(strempty(m->root_container.signature), strempty(signature)); +} + /* Copies the next element of the sealed message source into the unsealed message m, and keeps going until sd_bus_message_peek_type() reports no further element when all is non-zero. Containers are entered on the source side, opened on the destination side and copied by a recursive call. Returns done_something, i.e. whether at least one element was seen, or a negative error code. */ +_public_ int sd_bus_message_copy(sd_bus_message *m, sd_bus_message *source, int all) { + bool done_something = false; + int r; + + assert_return(m, -EINVAL); + assert_return(source, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(source->sealed, -EPERM); + + do { + const char *contents; + char type; + /* Scratch storage large enough for any basic type read below. */ union { + uint8_t u8; + uint16_t u16; + int16_t s16; + uint32_t u32; + int32_t s32; + uint64_t u64; + int64_t s64; + double d64; + const char *string; + int i; + } basic; + + r = sd_bus_message_peek_type(source, &type, &contents); + if (r < 0) + return r; + if (r == 0) + break; + + done_something = true; + + if (bus_type_is_container(type) > 0) { + + r = sd_bus_message_enter_container(source, type, contents); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, type, contents); + if (r < 0) + return r; + + /* Always copy the full container contents, regardless of the all flag of this call. */ r = sd_bus_message_copy(m, source, true); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(source); + if (r < 0) + 
return r; + + continue; + } + + r = sd_bus_message_read_basic(source, type, &basic); + if (r < 0) + return r; + + assert(r > 0); + + if (IN_SET(type, SD_BUS_TYPE_OBJECT_PATH, SD_BUS_TYPE_SIGNATURE, SD_BUS_TYPE_STRING)) + r = sd_bus_message_append_basic(m, type, basic.string); + else + r = sd_bus_message_append_basic(m, type, &basic); + + if (r < 0) + return r; + + } while (all); + + return done_something; +} + +_public_ int sd_bus_message_verify_type(sd_bus_message *m, char type, const char *contents) { + const char *c; + char t; + int r; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + assert_return(!type || bus_type_is_valid(type), -EINVAL); + assert_return(!contents || signature_is_valid(contents, true), -EINVAL); + assert_return(type || contents, -EINVAL); + assert_return(!contents || !type || bus_type_is_container(type), -EINVAL); + + r = sd_bus_message_peek_type(m, &t, &c); + if (r <= 0) + return r; + + if (type != 0 && type != t) + return 0; + + if (contents && !streq_ptr(contents, c)) + return 0; + + return 1; +} + +_public_ sd_bus *sd_bus_message_get_bus(sd_bus_message *m) { + assert_return(m, NULL); + + return m->bus; +} + +int bus_message_remarshal(sd_bus *bus, sd_bus_message **m) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *n = NULL; + usec_t timeout; + int r; + + assert(bus); + assert(m); + assert(*m); + + switch ((*m)->header->type) { + + case SD_BUS_MESSAGE_SIGNAL: + r = sd_bus_message_new_signal(bus, &n, (*m)->path, (*m)->interface, (*m)->member); + if (r < 0) + return r; + + break; + + case SD_BUS_MESSAGE_METHOD_CALL: + r = sd_bus_message_new_method_call(bus, &n, (*m)->destination, (*m)->path, (*m)->interface, (*m)->member); + if (r < 0) + return r; + + break; + + case SD_BUS_MESSAGE_METHOD_RETURN: + case SD_BUS_MESSAGE_METHOD_ERROR: + + r = sd_bus_message_new(bus, &n, (*m)->header->type); + if (r < 0) + return -ENOMEM; + + assert(n); + + n->reply_cookie = (*m)->reply_cookie; + + r = message_append_reply_cookie(n, 
n->reply_cookie); + if (r < 0) + return r; + + if ((*m)->header->type == SD_BUS_MESSAGE_METHOD_ERROR && (*m)->error.name) { + r = message_append_field_string(n, BUS_MESSAGE_HEADER_ERROR_NAME, SD_BUS_TYPE_STRING, (*m)->error.name, &n->error.message); + if (r < 0) + return r; + + n->error._need_free = -1; + } + + break; + + default: + return -EINVAL; + } + + if ((*m)->destination && !n->destination) { + r = message_append_field_string(n, BUS_MESSAGE_HEADER_DESTINATION, SD_BUS_TYPE_STRING, (*m)->destination, &n->destination); + if (r < 0) + return r; + } + + if ((*m)->sender && !n->sender) { + r = message_append_field_string(n, BUS_MESSAGE_HEADER_SENDER, SD_BUS_TYPE_STRING, (*m)->sender, &n->sender); + if (r < 0) + return r; + } + + n->header->flags |= (*m)->header->flags & (BUS_MESSAGE_NO_REPLY_EXPECTED|BUS_MESSAGE_NO_AUTO_START); + + r = sd_bus_message_copy(n, *m, true); + if (r < 0) + return r; + + timeout = (*m)->timeout; + if (timeout == 0 && !((*m)->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED)) { + r = sd_bus_get_method_call_timeout(bus, &timeout); + if (r < 0) + return r; + } + + r = sd_bus_message_seal(n, BUS_MESSAGE_COOKIE(*m), timeout); + if (r < 0) + return r; + + sd_bus_message_unref(*m); + *m = TAKE_PTR(n); + + return 0; +} + +_public_ int sd_bus_message_get_priority(sd_bus_message *m, int64_t *priority) { + static bool warned = false; + + assert_return(m, -EINVAL); + assert_return(priority, -EINVAL); + + if (!warned) { + log_debug("sd_bus_message_get_priority() is deprecated and always returns 0."); + warned = true; + } + + *priority = 0; + return 0; +} + +_public_ int sd_bus_message_set_priority(sd_bus_message *m, int64_t priority) { + static bool warned = false; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + + if (!warned) { + log_debug("sd_bus_message_set_priority() is deprecated and does nothing."); + warned = true; + } + + return 0; +} + +_public_ int sd_bus_message_sensitive(sd_bus_message *m) { + assert_return(m, -EINVAL); 
+ + m->sensitive = true; + return 0; +} + +char** bus_message_make_log_fields(sd_bus_message *m) { + _cleanup_strv_free_ char **strv = NULL; + + assert(m); + + (void) strv_extend_assignment(&strv, "DBUS_MESSAGE_TYPE", bus_message_type_to_string(m->header->type)); + (void) strv_extend_assignment(&strv, "DBUS_SENDER", sd_bus_message_get_sender(m)); + (void) strv_extend_assignment(&strv, "DBUS_DESTINATION", sd_bus_message_get_destination(m)); + (void) strv_extend_assignment(&strv, "DBUS_PATH", sd_bus_message_get_path(m)); + (void) strv_extend_assignment(&strv, "DBUS_INTERFACE", sd_bus_message_get_interface(m)); + (void) strv_extend_assignment(&strv, "DBUS_MEMBER", sd_bus_message_get_member(m)); + + (void) strv_extendf(&strv, "DBUS_MESSAGE_COOKIE=%" PRIu64, BUS_MESSAGE_COOKIE(m)); + if (m->reply_cookie != 0) + (void) strv_extendf(&strv, "DBUS_MESSAGE_REPLY_COOKIE=%" PRIu64, m->reply_cookie); + + (void) strv_extend_assignment(&strv, "DBUS_SIGNATURE", m->root_container.signature); + (void) strv_extend_assignment(&strv, "DBUS_ERROR_NAME", m->error.name); + (void) strv_extend_assignment(&strv, "DBUS_ERROR_MESSAGE", m->error.message); + + return TAKE_PTR(strv); +} diff --git a/src/libsystemd/sd-bus/bus-message.h b/src/libsystemd/sd-bus/bus-message.h new file mode 100644 index 0000000..76f0d85 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-message.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-bus.h" + +#include "bus-creds.h" +#include "bus-protocol.h" +#include "macro.h" +#include "time-util.h" + +struct bus_container { + char enclosing; + + /* Indexes into the signature string */ + unsigned index, saved_index; + char *signature; + + size_t before, begin, end; + + /* pointer to the array size value, if this is a value */ + uint32_t *array_size; + + char *peeked_signature; +}; + +struct bus_body_part { + struct bus_body_part *next; + void *data; + void *mmap_begin; + size_t size; + size_t 
mapped; + size_t allocated; + uint64_t memfd_offset; + int memfd; + bool free_this:1; + bool munmap_this:1; + bool sealed:1; + bool is_zero:1; +}; + +struct sd_bus_message { + /* Caveat: a message can be referenced in two different ways: the main (user-facing) way will also + * pin the bus connection object the message is associated with. The secondary way ("queued") is used + * when a message is in the read or write queues of the bus connection object, which will not pin the + * bus connection object. This is necessary so that we don't have to have a pair of cyclic references + * between a message that is queued and its connection: as soon as a message is only referenced by + * the connection (by means of being queued) and the connection itself has no other references it + * will be freed. */ + + unsigned n_ref; /* Counter of references that pin the connection */ + unsigned n_queued; /* Counter of references that do not pin the connection */ + + sd_bus *bus; + + uint64_t reply_cookie; + + const char *path; + const char *interface; + const char *member; + const char *destination; + const char *sender; + + sd_bus_error error; + + sd_bus_creds creds; + + usec_t monotonic; + usec_t realtime; + uint64_t seqnum; + uint64_t verify_destination_id; + + bool sealed:1; + bool dont_send:1; + bool allow_fds:1; + bool free_header:1; + bool free_fds:1; + bool poisoned:1; + bool sensitive:1; + + /* The first bytes of the message */ + struct bus_header *header; + + size_t fields_size; + size_t body_size; + size_t user_body_size; + + struct bus_body_part body; + struct bus_body_part *body_end; + unsigned n_body_parts; + + size_t rindex; + struct bus_body_part *cached_rindex_part; + size_t cached_rindex_part_begin; + + uint32_t n_fds; + int *fds; + + struct bus_container root_container, *containers; + size_t n_containers; + + struct iovec *iovec; + struct iovec iovec_fixed[2]; + unsigned n_iovec; + + char *peeked_signature; + + /* If set replies to this message must carry the 
signature + * specified here to successfully seal. This is initialized + * from the vtable data */ + const char *enforced_reply_signature; + + usec_t timeout; + + size_t header_offsets[_BUS_MESSAGE_HEADER_MAX]; + unsigned n_header_offsets; + + uint64_t read_counter; +}; + +static inline bool BUS_MESSAGE_NEED_BSWAP(sd_bus_message *m) { + return m->header->endian != BUS_NATIVE_ENDIAN; +} + +static inline uint16_t BUS_MESSAGE_BSWAP16(sd_bus_message *m, uint16_t u) { + return BUS_MESSAGE_NEED_BSWAP(m) ? bswap_16(u) : u; +} + +static inline uint32_t BUS_MESSAGE_BSWAP32(sd_bus_message *m, uint32_t u) { + return BUS_MESSAGE_NEED_BSWAP(m) ? bswap_32(u) : u; +} + +static inline uint64_t BUS_MESSAGE_BSWAP64(sd_bus_message *m, uint64_t u) { + return BUS_MESSAGE_NEED_BSWAP(m) ? bswap_64(u) : u; +} + +static inline uint64_t BUS_MESSAGE_COOKIE(sd_bus_message *m) { + return BUS_MESSAGE_BSWAP32(m, m->header->serial); +} + +static inline size_t BUS_MESSAGE_SIZE(sd_bus_message *m) { + return + sizeof(struct bus_header) + + ALIGN8(m->fields_size) + + m->body_size; +} + +static inline size_t BUS_MESSAGE_BODY_BEGIN(sd_bus_message *m) { + return + sizeof(struct bus_header) + + ALIGN8(m->fields_size); +} + +static inline void* BUS_MESSAGE_FIELDS(sd_bus_message *m) { + return (uint8_t*) m->header + sizeof(struct bus_header); +} + +int bus_message_get_blob(sd_bus_message *m, void **buffer, size_t *sz); + +int bus_message_from_malloc( + sd_bus *bus, + void *buffer, + size_t length, + int *fds, + size_t n_fds, + const char *label, + sd_bus_message **ret); + +int bus_message_get_arg(sd_bus_message *m, unsigned i, const char **str); +int bus_message_get_arg_strv(sd_bus_message *m, unsigned i, char ***strv); + +#define MESSAGE_FOREACH_PART(part, i, m) \ + for ((i) = 0, (part) = &(m)->body; (i) < (m)->n_body_parts; (i)++, (part) = (part)->next) + +int bus_body_part_map(struct bus_body_part *part); +void bus_body_part_unmap(struct bus_body_part *part); + +int 
bus_message_new_synthetic_error(sd_bus *bus, uint64_t serial, const sd_bus_error *e, sd_bus_message **m); + +int bus_message_remarshal(sd_bus *bus, sd_bus_message **m); + +void bus_message_set_sender_driver(sd_bus *bus, sd_bus_message *m); +void bus_message_set_sender_local(sd_bus *bus, sd_bus_message *m); + +sd_bus_message* bus_message_ref_queued(sd_bus_message *m, sd_bus *bus); +sd_bus_message* bus_message_unref_queued(sd_bus_message *m, sd_bus *bus); + +char** bus_message_make_log_fields(sd_bus_message *m); diff --git a/src/libsystemd/sd-bus/bus-objects.c b/src/libsystemd/sd-bus/bus-objects.c new file mode 100644 index 0000000..c25c40f --- /dev/null +++ b/src/libsystemd/sd-bus/bus-objects.c @@ -0,0 +1,3033 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-introspect.h" +#include "bus-message.h" +#include "bus-objects.h" +#include "bus-signature.h" +#include "bus-slot.h" +#include "bus-type.h" +#include "missing_capability.h" +#include "string-util.h" +#include "strv.h" + +static int node_vtable_get_userdata( + sd_bus *bus, + const char *path, + struct node_vtable *c, + void **userdata, + sd_bus_error *error) { + + sd_bus_slot *s; + void *u, *found_u = NULL; + int r; + + assert(bus); + assert(path); + assert(c); + + s = container_of(c, sd_bus_slot, node_vtable); + u = s->userdata; + if (c->find) { + bus->current_slot = sd_bus_slot_ref(s); + bus->current_userdata = u; + r = c->find(bus, path, c->interface, u, &found_u, error); + bus->current_userdata = NULL; + bus->current_slot = sd_bus_slot_unref(s); + + if (r < 0) + return r; + if (sd_bus_error_is_set(error)) + return -sd_bus_error_get_errno(error); + if (r == 0) + return r; + } else + found_u = u; + + if (userdata) + *userdata = found_u; + + return 1; +} + +static void *vtable_method_convert_userdata(const sd_bus_vtable *p, void *u) { + assert(p); + + if (!u || FLAGS_SET(p->flags, SD_BUS_VTABLE_ABSOLUTE_OFFSET)) + return 
/* Runs every enumerator callback in the list starting at 'first' for 'prefix' and adds the child
 * object paths they return to the set 's'.
 *
 * Returns 0 on success and also when bus->nodes_modified is found set before an enumerator is
 * called. Returns a negative errno-style value if a callback fails, if a callback leaves 'error'
 * set, if a returned path is not a valid object path (-EINVAL), or if insertion into the set
 * fails for a reason other than -EEXIST. */
static int add_enumerated_to_set(
                sd_bus *bus,
                const char *prefix,
                struct node_enumerator *first,
                OrderedSet *s,
                sd_bus_error *error) {

        int r;

        assert(bus);
        assert(prefix);
        assert(s);

        LIST_FOREACH(enumerators, c, first) {
                char **children = NULL;
                sd_bus_slot *slot;

                if (bus->nodes_modified)
                        return 0;

                slot = container_of(c, sd_bus_slot, node_enumerator);

                /* Expose slot and userdata on the bus object while the user callback runs */
                bus->current_slot = sd_bus_slot_ref(slot);
                bus->current_userdata = slot->userdata;
                r = c->callback(bus, prefix, slot->userdata, &children, error);
                bus->current_userdata = NULL;
                bus->current_slot = sd_bus_slot_unref(slot);

                if (r < 0)
                        return r;
                if (sd_bus_error_is_set(error))
                        return -sd_bus_error_get_errno(error);

                /* From here on 'r' doubles as a sticky error flag: once it turns negative the
                 * remaining strings are only freed, so that nothing leaks before we return. */
                STRV_FOREACH(k, children) {
                        if (r < 0) {
                                free(*k);
                                continue;
                        }

                        if (!object_path_is_valid(*k)) {
                                free(*k);
                                r = -EINVAL;
                                continue;
                        }

                        /* Paths outside of the requested prefix are silently dropped */
                        if (!object_path_startswith(*k, prefix)) {
                                free(*k);
                                continue;
                        }

                        /* NOTE(review): presumably ordered_set_consume() takes ownership of *k even
                         * when it fails, since the string is never freed here afterwards — confirm. */
                        r = ordered_set_consume(s, *k);
                        if (r == -EEXIST)
                                r = 0;
                }

                /* The individual strings were consumed or freed above, only the array is left */
                free(children);
                if (r < 0)
                        return r;
        }

        return 0;
}
recursively */ + CHILDREN_RECURSIVE = 1 << 0, + /* if set, add_subtree() scans object-manager hierarchies recursively */ + CHILDREN_SUBHIERARCHIES = 1 << 1, +}; + +static int add_subtree_to_set( + sd_bus *bus, + const char *prefix, + struct node *n, + unsigned flags, + OrderedSet *s, + sd_bus_error *error) { + + int r; + + assert(bus); + assert(prefix); + assert(n); + assert(s); + + r = add_enumerated_to_set(bus, prefix, n->enumerators, s, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + + LIST_FOREACH(siblings, i, n->child) { + char *t; + + if (!object_path_startswith(i->path, prefix)) + continue; + + t = strdup(i->path); + if (!t) + return -ENOMEM; + + r = ordered_set_consume(s, t); + if (r < 0 && r != -EEXIST) + return r; + + if ((flags & CHILDREN_RECURSIVE) && + ((flags & CHILDREN_SUBHIERARCHIES) || !i->object_managers)) { + r = add_subtree_to_set(bus, prefix, i, flags, s, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + } + + return 0; +} + +static int get_child_nodes( + sd_bus *bus, + const char *prefix, + struct node *n, + unsigned flags, + OrderedSet **ret, + sd_bus_error *error) { + + _cleanup_ordered_set_free_free_ OrderedSet *s = NULL; + int r; + + assert(bus); + assert(prefix); + assert(n); + assert(ret); + + s = ordered_set_new(&string_hash_ops); + if (!s) + return -ENOMEM; + + r = add_subtree_to_set(bus, prefix, n, flags, s, error); + if (r < 0) + return r; + + *ret = TAKE_PTR(s); + return 0; +} + +static int node_callbacks_run( + sd_bus *bus, + sd_bus_message *m, + struct node_callback *first, + bool require_fallback, + bool *found_object) { + + int r; + + assert(bus); + assert(m); + assert(found_object); + + LIST_FOREACH(callbacks, c, first) { + _cleanup_(sd_bus_error_free) sd_bus_error error_buffer = SD_BUS_ERROR_NULL; + sd_bus_slot *slot; + + if (bus->nodes_modified) + return 0; + + if (require_fallback && !c->is_fallback) + continue; + + *found_object = true; + + if (c->last_iteration == 
bus->iteration_counter) + continue; + + c->last_iteration = bus->iteration_counter; + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + slot = container_of(c, sd_bus_slot, node_callback); + + bus->current_slot = sd_bus_slot_ref(slot); + bus->current_handler = c->callback; + bus->current_userdata = slot->userdata; + r = c->callback(m, slot->userdata, &error_buffer); + bus->current_userdata = NULL; + bus->current_handler = NULL; + bus->current_slot = sd_bus_slot_unref(slot); + + r = bus_maybe_reply_error(m, r, &error_buffer); + if (r != 0) + return r; + } + + return 0; +} + +#define CAPABILITY_SHIFT(x) (((x) >> __builtin_ctzll(_SD_BUS_VTABLE_CAPABILITY_MASK)) & 0xFFFF) + +static int check_access(sd_bus *bus, sd_bus_message *m, struct vtable_member *c, sd_bus_error *error) { + uint64_t cap; + int r; + + assert(bus); + assert(m); + assert(c); + + /* If the entire bus is trusted let's grant access */ + if (bus->trusted) + return 0; + + /* If the member is marked UNPRIVILEGED let's grant access */ + if (c->vtable->flags & SD_BUS_VTABLE_UNPRIVILEGED) + return 0; + + /* Check that the caller has the requested capability set. Note that the flags value contains the + * capability number plus one, which we need to subtract here. We do this so that we have 0 as + * special value for the default. 
*/ + cap = CAPABILITY_SHIFT(c->vtable->flags); + if (cap == 0) + cap = CAPABILITY_SHIFT(c->parent->vtable[0].flags); + if (cap == 0) + cap = CAP_SYS_ADMIN; + else + cap--; + + r = sd_bus_query_sender_privilege(m, cap); + if (r < 0) + return r; + if (r > 0) + return 0; + + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Access to %s.%s() not permitted.", c->interface, c->member); +} + +static int method_callbacks_run( + sd_bus *bus, + sd_bus_message *m, + struct vtable_member *c, + bool require_fallback, + bool *found_object) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *signature; + void *u; + int r; + + assert(bus); + assert(m); + assert(c); + assert(found_object); + + if (require_fallback && !c->parent->is_fallback) + return 0; + + if (FLAGS_SET(c->vtable->flags, SD_BUS_VTABLE_SENSITIVE)) { + r = sd_bus_message_sensitive(m); + if (r < 0) + return r; + } + + r = check_access(bus, m, c, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + + r = node_vtable_get_userdata(bus, m->path, c->parent, &u, &error); + if (r <= 0) + return bus_maybe_reply_error(m, r, &error); + if (bus->nodes_modified) + return 0; + + u = vtable_method_convert_userdata(c->vtable, u); + + *found_object = true; + + if (c->last_iteration == bus->iteration_counter) + return 0; + + c->last_iteration = bus->iteration_counter; + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + signature = sd_bus_message_get_signature(m, true); + if (!signature) + return -EINVAL; + + if (!streq(strempty(c->vtable->x.method.signature), signature)) + return sd_bus_reply_method_errorf( + m, + SD_BUS_ERROR_INVALID_ARGS, + "Invalid arguments '%s' to call %s.%s(), expecting '%s'.", + signature, c->interface, c->member, strempty(c->vtable->x.method.signature)); + + /* Keep track what the signature of the reply to this message + * should be, so that this can be enforced when sealing the + * reply. 
/* Appends the value of the property described by vtable entry 'v' to 'reply'.
 *
 * If the entry has a get() callback it is invoked with bus->current_slot/current_userdata set for
 * its duration; a negative return value, or an 'error' left set by the callback, is turned into a
 * negative return value here. Without a callback the value is read directly from the memory
 * 'userdata' points to, interpreted according to the property signature. */
static int invoke_property_get(
                sd_bus *bus,
                sd_bus_slot *slot,
                const sd_bus_vtable *v,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        const void *p;
        int r;

        assert(bus);
        assert(slot);
        assert(v);
        assert(path);
        assert(interface);
        assert(property);
        assert(reply);

        if (v->x.property.get) {

                /* Pin the slot while the user callback runs */
                bus->current_slot = sd_bus_slot_ref(slot);
                bus->current_userdata = userdata;
                r = v->x.property.get(bus, path, interface, property, reply, userdata, error);
                bus->current_userdata = NULL;
                bus->current_slot = sd_bus_slot_unref(slot);

                if (r < 0)
                        return r;
                if (sd_bus_error_is_set(error))
                        return -sd_bus_error_get_errno(error);
                return r;
        }

        /* Automatic handling if no callback is defined. */

        /* String arrays: userdata points to a char** */
        if (streq(v->x.property.signature, "as"))
                return sd_bus_message_append_strv(reply, *(char***) userdata);

        /* Everything else handled automatically must be a single basic type */
        assert(signature_is_single(v->x.property.signature, false));
        assert(bus_type_is_basic(v->x.property.signature[0]));

        switch (v->x.property.signature[0]) {

        case SD_BUS_TYPE_STRING:
        case SD_BUS_TYPE_SIGNATURE:
                /* userdata points to a char*; a NULL string is sent as the empty string */
                p = strempty(*(char**) userdata);
                break;

        case SD_BUS_TYPE_OBJECT_PATH:
                /* userdata points to a char* which must not be NULL here */
                p = *(char**) userdata;
                assert(p);
                break;

        default:
                /* All other basic types: userdata points to the value itself */
                p = userdata;
                break;
        }

        return sd_bus_message_append_basic(reply, v->x.property.signature[0], p);
}
*/ + + assert(signature_is_single(v->x.property.signature, false)); + assert(bus_type_is_basic(v->x.property.signature[0])); + + switch (v->x.property.signature[0]) { + + case SD_BUS_TYPE_STRING: + case SD_BUS_TYPE_OBJECT_PATH: + case SD_BUS_TYPE_SIGNATURE: { + const char *p; + char *n; + + r = sd_bus_message_read_basic(value, v->x.property.signature[0], &p); + if (r < 0) + return r; + + n = strdup(p); + if (!n) + return -ENOMEM; + + free(*(char**) userdata); + *(char**) userdata = n; + + break; + } + + default: + r = sd_bus_message_read_basic(value, v->x.property.signature[0], userdata); + if (r < 0) + return r; + + break; + } + + return 1; +} + +static int property_get_set_callbacks_run( + sd_bus *bus, + sd_bus_message *m, + struct vtable_member *c, + bool require_fallback, + bool is_get, + bool *found_object) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + sd_bus_slot *slot; + void *u = NULL; + int r; + + assert(bus); + assert(m); + assert(c); + assert(found_object); + + if (require_fallback && !c->parent->is_fallback) + return 0; + + if (FLAGS_SET(c->vtable->flags, SD_BUS_VTABLE_SENSITIVE)) { + r = sd_bus_message_sensitive(m); + if (r < 0) + return r; + } + + r = vtable_property_get_userdata(bus, m->path, c, &u, &error); + if (r <= 0) + return bus_maybe_reply_error(m, r, &error); + if (bus->nodes_modified) + return 0; + + slot = container_of(c->parent, sd_bus_slot, node_vtable); + + *found_object = true; + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + if (FLAGS_SET(c->vtable->flags, SD_BUS_VTABLE_SENSITIVE)) { + r = sd_bus_message_sensitive(reply); + if (r < 0) + return r; + } + + if (is_get) { + /* Note that we do not protect against reexecution + * here (using the last_iteration check, see below), + * should the node tree have changed and we got called + * again. 
We assume that property Get() calls are + * ultimately without side-effects or if they aren't + * then at least idempotent. */ + + r = sd_bus_message_open_container(reply, 'v', c->vtable->x.property.signature); + if (r < 0) + return r; + + /* Note that we do not do an access check here. Read + * access to properties is always unrestricted, since + * PropertiesChanged signals broadcast contents + * anyway. */ + + r = invoke_property_get(bus, slot, c->vtable, m->path, c->interface, c->member, reply, u, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + + if (bus->nodes_modified) + return 0; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + } else { + const char *signature = NULL; + char type = 0; + + if (c->vtable->type != _SD_BUS_VTABLE_WRITABLE_PROPERTY) + return sd_bus_reply_method_errorf(m, SD_BUS_ERROR_PROPERTY_READ_ONLY, "Property '%s' is not writable.", c->member); + + /* Avoid that we call the set routine more than once + * if the processing of this message got restarted + * because the node tree changed. 
/* Appends one "{sv}" dictionary entry to 'reply': the member name of property 'v' as key and its
 * current value, obtained through invoke_property_get(), wrapped in a variant.
 *
 * Returns 0 on success, and also if bus->nodes_modified is set after the property was read, in
 * which case the opened containers are left unclosed. Returns a negative value on error. */
static int vtable_append_one_property(
                sd_bus *bus,
                sd_bus_message *reply,
                const char *path,
                struct node_vtable *c,
                const sd_bus_vtable *v,
                void *userdata,
                sd_bus_error *error) {

        sd_bus_slot *slot;
        int r;

        assert(bus);
        assert(reply);
        assert(path);
        assert(c);
        assert(v);

        /* NOTE(review): this tests the flags reached through c->vtable rather than the flags of
         * the property entry 'v' — confirm that this is the intended entry to look at. */
        if (FLAGS_SET(c->vtable->flags, SD_BUS_VTABLE_SENSITIVE)) {
                r = sd_bus_message_sensitive(reply);
                if (r < 0)
                        return r;
        }

        /* Dictionary entry: property name, then the value as a variant */
        r = sd_bus_message_open_container(reply, 'e', "sv");
        if (r < 0)
                return r;

        r = sd_bus_message_append(reply, "s", v->x.property.member);
        if (r < 0)
                return r;

        r = sd_bus_message_open_container(reply, 'v', v->x.property.signature);
        if (r < 0)
                return r;

        slot = container_of(c, sd_bus_slot, node_vtable);

        r = invoke_property_get(bus, slot, v, path, c->interface, v->x.property.member, reply, vtable_property_convert_userdata(v, userdata), error);
        if (r < 0)
                return r;
        if (bus->nodes_modified)
                return 0;

        /* Close the variant ... */
        r = sd_bus_message_close_container(reply);
        if (r < 0)
                return r;

        /* ... and the dictionary entry */
        r = sd_bus_message_close_container(reply);
        if (r < 0)
                return r;

        return 0;
}
*/ + if (reply->header->type != SD_BUS_MESSAGE_METHOD_RETURN && + FLAGS_SET(v->flags, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION)) + continue; + + r = vtable_append_one_property(bus, reply, path, c, v, userdata, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + + return 1; +} + +static int property_get_all_callbacks_run( + sd_bus *bus, + sd_bus_message *m, + struct node_vtable *first, + bool require_fallback, + const char *iface, + bool *found_object) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + bool found_interface; + int r; + + assert(bus); + assert(m); + assert(found_object); + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "{sv}"); + if (r < 0) + return r; + + found_interface = !iface || STR_IN_SET(iface, + "org.freedesktop.DBus.Properties", + "org.freedesktop.DBus.Peer", + "org.freedesktop.DBus.Introspectable"); + + LIST_FOREACH(vtables, c, first) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + void *u; + + if (require_fallback && !c->is_fallback) + continue; + + r = node_vtable_get_userdata(bus, m->path, c, &u, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + if (bus->nodes_modified) + return 0; + if (r == 0) + continue; + + *found_object = true; + + if (iface && !streq(c->interface, iface)) + continue; + found_interface = true; + + r = vtable_append_all_properties(bus, reply, m->path, c, u, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + if (bus->nodes_modified) + return 0; + } + + if (!*found_object) + return 0; + + if (!found_interface) { + r = sd_bus_reply_method_errorf( + m, + SD_BUS_ERROR_UNKNOWN_INTERFACE, + "Unknown interface '%s'.", iface); + if (r < 0) + return r; + + return 1; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_send(bus, reply, NULL); + if (r < 0) + return r; + + return 1; +} + +static int 
bus_node_exists( + sd_bus *bus, + struct node *n, + const char *path, + bool require_fallback) { + + int r; + + assert(bus); + assert(n); + assert(path); + + /* Tests if there's anything attached directly to this node + * for the specified path */ + + if (!require_fallback && (n->enumerators || n->object_managers)) + return true; + + LIST_FOREACH(callbacks, k, n->callbacks) { + if (require_fallback && !k->is_fallback) + continue; + + return 1; + } + + LIST_FOREACH(vtables, c, n->vtables) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (require_fallback && !c->is_fallback) + continue; + + r = node_vtable_get_userdata(bus, path, c, NULL, &error); + if (r != 0) + return r; + if (bus->nodes_modified) + return 0; + } + + return 0; +} + +int introspect_path( + sd_bus *bus, + const char *path, + struct node *n, + bool require_fallback, + bool ignore_nodes_modified, + bool *found_object, + char **ret, + sd_bus_error *error) { + + _cleanup_ordered_set_free_free_ OrderedSet *s = NULL; + _cleanup_(introspect_done) struct introspect intro = {}; + bool empty; + int r; + + if (!n) { + n = hashmap_get(bus->nodes, path); + if (!n) + return -ENOENT; + } + + r = get_child_nodes(bus, path, n, 0, &s, error); + if (r < 0) + return r; + if (bus->nodes_modified && !ignore_nodes_modified) + return 0; + + r = introspect_begin(&intro, bus->trusted); + if (r < 0) + return r; + + r = introspect_write_default_interfaces(&intro, !require_fallback && n->object_managers); + if (r < 0) + return r; + + empty = ordered_set_isempty(s); + + LIST_FOREACH(vtables, c, n->vtables) { + if (require_fallback && !c->is_fallback) + continue; + + r = node_vtable_get_userdata(bus, path, c, NULL, error); + if (r < 0) + return r; + if (bus->nodes_modified && !ignore_nodes_modified) + return 0; + if (r == 0) + continue; + + empty = false; + + if (c->vtable[0].flags & SD_BUS_VTABLE_HIDDEN) + continue; + + r = introspect_write_interface(&intro, c->interface, c->vtable); + if (r < 0) + 
return r; + } + + if (empty) { + /* Nothing?, let's see if we exist at all, and if not + * refuse to do anything */ + r = bus_node_exists(bus, n, path, require_fallback); + if (r <= 0) + return r; + if (bus->nodes_modified && !ignore_nodes_modified) + return 0; + } + + if (found_object) + *found_object = true; + + r = introspect_write_child_nodes(&intro, s, path); + if (r < 0) + return r; + + r = introspect_finish(&intro, ret); + if (r < 0) + return r; + + return 1; +} + +static int process_introspect( + sd_bus *bus, + sd_bus_message *m, + struct node *n, + bool require_fallback, + bool *found_object) { + + _cleanup_free_ char *s = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(bus); + assert(m); + assert(n); + assert(found_object); + + r = introspect_path(bus, m->path, n, require_fallback, false, found_object, &s, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + if (r == 0) + /* nodes_modified == true */ + return 0; + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", s); + if (r < 0) + return r; + + r = sd_bus_send(bus, reply, NULL); + if (r < 0) + return r; + + return 1; +} + +static int object_manager_serialize_path( + sd_bus *bus, + sd_bus_message *reply, + const char *prefix, + const char *path, + bool require_fallback, + bool *found_object_manager, + sd_bus_error *error) { + + const char *previous_interface = NULL; + bool found_something = false; + struct node *n; + int r; + + assert(bus); + assert(reply); + assert(prefix); + assert(path); + assert(found_object_manager); + assert(error); + + n = hashmap_get(bus->nodes, prefix); + if (!n) + return 0; + + if (!require_fallback && n->object_managers) + *found_object_manager = true; + + LIST_FOREACH(vtables, i, n->vtables) { + void *u; + + if (require_fallback && !i->is_fallback) + continue; + + r = 
node_vtable_get_userdata(bus, path, i, &u, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + if (r == 0) + continue; + + if (!found_something) { + + /* Open the object part */ + + r = sd_bus_message_open_container(reply, 'e', "oa{sa{sv}}"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "o", path); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "{sa{sv}}"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "{sa{sv}}", "org.freedesktop.DBus.Peer", 0); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "{sa{sv}}", "org.freedesktop.DBus.Introspectable", 0); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "{sa{sv}}", "org.freedesktop.DBus.Properties", 0); + if (r < 0) + return r; + + if (*found_object_manager) { + r = sd_bus_message_append( + reply, "{sa{sv}}", "org.freedesktop.DBus.ObjectManager", 0); + if (r < 0) + return r; + } + + found_something = true; + } + + if (!streq_ptr(previous_interface, i->interface)) { + + /* Maybe close the previous interface part */ + + if (previous_interface) { + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + /* Open the new interface part */ + + r = sd_bus_message_open_container(reply, 'e', "sa{sv}"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", i->interface); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "{sv}"); + if (r < 0) + return r; + } + + r = vtable_append_all_properties(bus, reply, path, i, u, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + + previous_interface = i->interface; + } + + if (previous_interface) { + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + if (found_something) { + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; 
+ + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return 1; +} + +static int object_manager_serialize_path_and_fallbacks( + sd_bus *bus, + sd_bus_message *reply, + const char *path, + sd_bus_error *error) { + + _cleanup_free_ char *prefix = NULL; + size_t pl; + int r; + bool found_object_manager = false; + + assert(bus); + assert(reply); + assert(path); + assert(error); + + /* First, add all vtables registered for this path */ + r = object_manager_serialize_path(bus, reply, path, path, false, &found_object_manager, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + + /* Second, add fallback vtables registered for any of the prefixes */ + pl = strlen(path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + OBJECT_PATH_FOREACH_PREFIX(prefix, path) { + r = object_manager_serialize_path(bus, reply, prefix, path, true, &found_object_manager, error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + + return 0; +} + +static int process_get_managed_objects( + sd_bus *bus, + sd_bus_message *m, + struct node *n, + bool require_fallback, + bool *found_object) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_ordered_set_free_free_ OrderedSet *s = NULL; + char *path; + int r; + + assert(bus); + assert(m); + assert(n); + assert(found_object); + + /* Spec says, GetManagedObjects() is only implemented on the root of a + * sub-tree. Therefore, we require a registered object-manager on + * exactly the queried path, otherwise, we refuse to respond. 
*/ + + if (require_fallback || !n->object_managers) + return 0; + + r = get_child_nodes(bus, m->path, n, CHILDREN_RECURSIVE, &s, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + if (bus->nodes_modified) + return 0; + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "{oa{sa{sv}}}"); + if (r < 0) + return r; + + ORDERED_SET_FOREACH(path, s) { + r = object_manager_serialize_path_and_fallbacks(bus, reply, path, &error); + if (r < 0) + return bus_maybe_reply_error(m, r, &error); + + if (bus->nodes_modified) + return 0; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_send(bus, reply, NULL); + if (r < 0) + return r; + + return 1; +} + +static int object_find_and_run( + sd_bus *bus, + sd_bus_message *m, + const char *p, + bool require_fallback, + bool *found_object) { + + struct node *n; + struct vtable_member vtable_key, *v; + int r; + + assert(bus); + assert(m); + assert(p); + assert(found_object); + + n = hashmap_get(bus->nodes, p); + if (!n) + return 0; + + /* First, try object callbacks */ + r = node_callbacks_run(bus, m, n->callbacks, require_fallback, found_object); + if (r != 0) + return r; + if (bus->nodes_modified) + return 0; + + if (!m->interface || !m->member) + return 0; + + /* Then, look for a known method */ + vtable_key.path = (char*) p; + vtable_key.interface = m->interface; + vtable_key.member = m->member; + + v = hashmap_get(bus->vtable_methods, &vtable_key); + if (v) { + r = method_callbacks_run(bus, m, v, require_fallback, found_object); + if (r != 0) + return r; + if (bus->nodes_modified) + return 0; + } + + /* Then, look for a known property */ + if (streq(m->interface, "org.freedesktop.DBus.Properties")) { + bool get = false; + + get = streq(m->member, "Get"); + + if (get || streq(m->member, "Set")) { + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + vtable_key.path = (char*) p; + + r = 
sd_bus_message_read(m, "ss", &vtable_key.interface, &vtable_key.member); + if (r < 0) + return sd_bus_reply_method_errorf(m, SD_BUS_ERROR_INVALID_ARGS, "Expected interface and member parameters"); + + v = hashmap_get(bus->vtable_properties, &vtable_key); + if (v) { + r = property_get_set_callbacks_run(bus, m, v, require_fallback, get, found_object); + if (r != 0) + return r; + } + + } else if (streq(m->member, "GetAll")) { + const char *iface; + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "s", &iface); + if (r < 0) + return sd_bus_reply_method_errorf(m, SD_BUS_ERROR_INVALID_ARGS, "Expected interface parameter"); + + if (iface[0] == 0) + iface = NULL; + + r = property_get_all_callbacks_run(bus, m, n->vtables, require_fallback, iface, found_object); + if (r != 0) + return r; + } + + } else if (sd_bus_message_is_method_call(m, "org.freedesktop.DBus.Introspectable", "Introspect")) { + + if (!isempty(sd_bus_message_get_signature(m, true))) + return sd_bus_reply_method_errorf(m, SD_BUS_ERROR_INVALID_ARGS, "Expected no parameters"); + + r = process_introspect(bus, m, n, require_fallback, found_object); + if (r != 0) + return r; + + } else if (sd_bus_message_is_method_call(m, "org.freedesktop.DBus.ObjectManager", "GetManagedObjects")) { + + if (!isempty(sd_bus_message_get_signature(m, true))) + return sd_bus_reply_method_errorf(m, SD_BUS_ERROR_INVALID_ARGS, "Expected no parameters"); + + r = process_get_managed_objects(bus, m, n, require_fallback, found_object); + if (r != 0) + return r; + } + + if (bus->nodes_modified) + return 0; + + if (!*found_object) { + r = bus_node_exists(bus, n, m->path, require_fallback); + if (r < 0) + return bus_maybe_reply_error(m, r, NULL); + if (bus->nodes_modified) + return 0; + if (r > 0) + *found_object = true; + } + + return 0; +} + +int bus_process_object(sd_bus *bus, sd_bus_message *m) { + _cleanup_free_ char *prefix = NULL; + int r; + size_t pl; + bool found_object = false; + + 
assert(bus); + assert(m); + + if (bus->is_monitor) + return 0; + + if (m->header->type != SD_BUS_MESSAGE_METHOD_CALL) + return 0; + + if (hashmap_isempty(bus->nodes)) + return 0; + + /* Never respond to broadcast messages */ + if (bus->bus_client && !m->destination) + return 0; + + assert(m->path); + assert(m->member); + + pl = strlen(m->path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + do { + bus->nodes_modified = false; + + r = object_find_and_run(bus, m, m->path, false, &found_object); + if (r != 0) + return r; + + /* Look for fallback prefixes */ + OBJECT_PATH_FOREACH_PREFIX(prefix, m->path) { + + if (bus->nodes_modified) + break; + + r = object_find_and_run(bus, m, prefix, true, &found_object); + if (r != 0) + return r; + } + + } while (bus->nodes_modified); + + if (!found_object) + return 0; + + if (sd_bus_message_is_method_call(m, "org.freedesktop.DBus.Properties", "Get") || + sd_bus_message_is_method_call(m, "org.freedesktop.DBus.Properties", "Set")) { + const char *interface = NULL, *property = NULL; + + (void) sd_bus_message_rewind(m, true); + (void) sd_bus_message_read_basic(m, 's', &interface); + (void) sd_bus_message_read_basic(m, 's', &property); + + r = sd_bus_reply_method_errorf( + m, + SD_BUS_ERROR_UNKNOWN_PROPERTY, + "Unknown interface %s or property %s.", strnull(interface), strnull(property)); + } else + r = sd_bus_reply_method_errorf( + m, + SD_BUS_ERROR_UNKNOWN_METHOD, + "Unknown method %s or interface %s.", m->member, m->interface); + + if (r < 0) + return r; + + return 1; +} + +static struct node* bus_node_allocate(sd_bus *bus, const char *path) { + struct node *n, *parent; + const char *e; + _cleanup_free_ char *s = NULL; + char *p; + int r; + + assert(bus); + assert(path); + assert(path[0] == '/'); + + n = hashmap_get(bus->nodes, path); + if (n) + return n; + + r = hashmap_ensure_allocated(&bus->nodes, &string_hash_ops); + if (r < 0) + return NULL; + + s = strdup(path); + if (!s) + 
return NULL; + + if (streq(path, "/")) + parent = NULL; + else { + assert_se(e = strrchr(path, '/')); + + p = strndupa_safe(path, MAX(1, e - path)); + + parent = bus_node_allocate(bus, p); + if (!parent) + return NULL; + } + + n = new0(struct node, 1); + if (!n) + return NULL; + + n->parent = parent; + n->path = TAKE_PTR(s); + + r = hashmap_put(bus->nodes, n->path, n); + if (r < 0) { + free(n->path); + return mfree(n); + } + + if (parent) + LIST_PREPEND(siblings, parent->child, n); + + return n; +} + +void bus_node_gc(sd_bus *b, struct node *n) { + assert(b); + + if (!n) + return; + + if (n->child || + n->callbacks || + n->vtables || + n->enumerators || + n->object_managers) + return; + + assert_se(hashmap_remove(b->nodes, n->path) == n); + + if (n->parent) + LIST_REMOVE(siblings, n->parent->child, n); + + free(n->path); + bus_node_gc(b, n->parent); + free(n); +} + +static int bus_find_parent_object_manager(sd_bus *bus, struct node **out, const char *path, bool* path_has_object_manager) { + struct node *n; + + assert(bus); + assert(path); + assert(path_has_object_manager); + + n = hashmap_get(bus->nodes, path); + + if (n) + *path_has_object_manager = n->object_managers; + + if (!n) { + _cleanup_free_ char *prefix = NULL; + size_t pl; + + pl = strlen(path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + OBJECT_PATH_FOREACH_PREFIX(prefix, path) { + n = hashmap_get(bus->nodes, prefix); + if (n) + break; + } + } + + while (n && !n->object_managers) + n = n->parent; + + if (out) + *out = n; + return !!n; +} + +static int bus_add_object( + sd_bus *bus, + sd_bus_slot **slot, + bool fallback, + const char *path, + sd_bus_message_handler_t callback, + void *userdata) { + + sd_bus_slot *s; + struct node *n; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(callback, -EINVAL); + 
assert_return(!bus_origin_changed(bus), -ECHILD); + + n = bus_node_allocate(bus, path); + if (!n) + return -ENOMEM; + + s = bus_slot_allocate(bus, !slot, BUS_NODE_CALLBACK, sizeof(struct node_callback), userdata); + if (!s) { + r = -ENOMEM; + goto fail; + } + + s->node_callback.callback = callback; + s->node_callback.is_fallback = fallback; + + s->node_callback.node = n; + LIST_PREPEND(callbacks, n->callbacks, &s->node_callback); + bus->nodes_modified = true; + + if (slot) + *slot = s; + + return 0; + +fail: + sd_bus_slot_unref(s); + bus_node_gc(bus, n); + + return r; +} + +_public_ int sd_bus_add_object( + sd_bus *bus, + sd_bus_slot **slot, + const char *path, + sd_bus_message_handler_t callback, + void *userdata) { + + return bus_add_object(bus, slot, false, path, callback, userdata); +} + +_public_ int sd_bus_add_fallback( + sd_bus *bus, + sd_bus_slot **slot, + const char *prefix, + sd_bus_message_handler_t callback, + void *userdata) { + + return bus_add_object(bus, slot, true, prefix, callback, userdata); +} + +static void vtable_member_hash_func(const struct vtable_member *m, struct siphash *state) { + assert(m); + + string_hash_func(m->path, state); + string_hash_func(m->interface, state); + string_hash_func(m->member, state); +} + +static int vtable_member_compare_func(const struct vtable_member *x, const struct vtable_member *y) { + int r; + + assert(x); + assert(y); + + r = strcmp(x->path, y->path); + if (r != 0) + return r; + + r = strcmp(x->interface, y->interface); + if (r != 0) + return r; + + return strcmp(x->member, y->member); +} + +DEFINE_PRIVATE_HASH_OPS(vtable_member_hash_ops, struct vtable_member, vtable_member_hash_func, vtable_member_compare_func); + +typedef enum { + NAMES_FIRST_PART = 1 << 0, /* first part of argument name list (input names). It is reset by names_are_valid() */ + NAMES_PRESENT = 1 << 1, /* at least one argument name is present, so the names will checked. 
+ This flag is set and used internally by names_are_valid(), but needs to be stored across calls for 2-parts list */ + NAMES_SINGLE_PART = 1 << 2, /* argument name list consisting of a single part */ +} names_flags; + +static bool names_are_valid(const char *signature, const char **names, names_flags *flags) { + int r; + + if ((*flags & NAMES_FIRST_PART || *flags & NAMES_SINGLE_PART) && **names != '\0') + *flags |= NAMES_PRESENT; + + for (;*flags & NAMES_PRESENT;) { + size_t l; + + if (!*signature) + break; + + r = signature_element_length(signature, &l); + if (r < 0) + return false; + + if (**names != '\0') { + if (!member_name_is_valid(*names)) + return false; + *names += strlen(*names) + 1; + } else if (*flags & NAMES_PRESENT) + return false; + + signature += l; + } + /* let's check if there are more argument names specified than the signature allows */ + if (*flags & NAMES_PRESENT && **names != '\0' && !(*flags & NAMES_FIRST_PART)) + return false; + *flags &= ~NAMES_FIRST_PART; + return true; +} + +/* the current version of this struct is defined in sd-bus-vtable.h, but we need to list here the historical versions + to make sure the calling code is compatible with one of these */ +struct sd_bus_vtable_221 { + uint8_t type:8; + uint64_t flags:56; + union { + struct { + size_t element_size; + } start; + struct { + const char *member; + const char *signature; + const char *result; + sd_bus_message_handler_t handler; + size_t offset; + } method; + struct { + const char *member; + const char *signature; + } signal; + struct { + const char *member; + const char *signature; + sd_bus_property_get_t get; + sd_bus_property_set_t set; + size_t offset; + } property; + } x; +}; +/* Structure size up to v241 */ +#define VTABLE_ELEMENT_SIZE_221 sizeof(struct sd_bus_vtable_221) + +/* Size of the structure when "features" field was added. 
If the structure definition is augmented, a copy of + * the structure definition will need to be made (similarly to the sd_bus_vtable_221 above), and this + * definition updated to refer to it. */ +#define VTABLE_ELEMENT_SIZE_242 sizeof(struct sd_bus_vtable) + +static int vtable_features(const sd_bus_vtable *vtable) { + if (vtable[0].x.start.element_size < VTABLE_ELEMENT_SIZE_242 || + !vtable[0].x.start.vtable_format_reference) + return 0; + return vtable[0].x.start.features; +} + +bool bus_vtable_has_names(const sd_bus_vtable *vtable) { + return vtable_features(vtable) & _SD_BUS_VTABLE_PARAM_NAMES; +} + +const sd_bus_vtable* bus_vtable_next(const sd_bus_vtable *vtable, const sd_bus_vtable *v) { + return (const sd_bus_vtable*) ((char*) v + vtable[0].x.start.element_size); +} + +static int add_object_vtable_internal( + sd_bus *bus, + sd_bus_slot **slot, + const char *path, + const char *interface, + const sd_bus_vtable *vtable, + bool fallback, + sd_bus_object_find_t find, + void *userdata) { + + sd_bus_slot *s = NULL; + struct node_vtable *existing = NULL; + const sd_bus_vtable *v; + struct node *n; + int r; + const char *names = ""; + names_flags nf; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(interface_name_is_valid(interface), -EINVAL); + assert_return(vtable, -EINVAL); + assert_return(vtable[0].type == _SD_BUS_VTABLE_START, -EINVAL); + assert_return(vtable[0].x.start.element_size == VTABLE_ELEMENT_SIZE_221 || + vtable[0].x.start.element_size >= VTABLE_ELEMENT_SIZE_242, + -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + assert_return(!streq(interface, "org.freedesktop.DBus.Properties") && + !streq(interface, "org.freedesktop.DBus.Introspectable") && + !streq(interface, "org.freedesktop.DBus.Peer") && + !streq(interface, "org.freedesktop.DBus.ObjectManager"), -EINVAL); + + r = hashmap_ensure_allocated(&bus->vtable_methods, 
&vtable_member_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&bus->vtable_properties, &vtable_member_hash_ops); + if (r < 0) + return r; + + n = bus_node_allocate(bus, path); + if (!n) + return -ENOMEM; + + LIST_FOREACH(vtables, i, n->vtables) { + if (i->is_fallback != fallback) { + r = -EPROTOTYPE; + goto fail; + } + + if (streq(i->interface, interface)) { + + if (i->vtable == vtable) { + r = -EEXIST; + goto fail; + } + + existing = i; + } + } + + s = bus_slot_allocate(bus, !slot, BUS_NODE_VTABLE, sizeof(struct node_vtable), userdata); + if (!s) { + r = -ENOMEM; + goto fail; + } + + s->node_vtable.is_fallback = fallback; + s->node_vtable.vtable = vtable; + s->node_vtable.find = find; + + s->node_vtable.interface = strdup(interface); + if (!s->node_vtable.interface) { + r = -ENOMEM; + goto fail; + } + + v = s->node_vtable.vtable; + for (v = bus_vtable_next(vtable, v); v->type != _SD_BUS_VTABLE_END; v = bus_vtable_next(vtable, v)) { + + switch (v->type) { + + case _SD_BUS_VTABLE_METHOD: { + struct vtable_member *m; + nf = NAMES_FIRST_PART; + + if (bus_vtable_has_names(vtable)) + names = strempty(v->x.method.names); + + if (!member_name_is_valid(v->x.method.member) || + !signature_is_valid(strempty(v->x.method.signature), false) || + !signature_is_valid(strempty(v->x.method.result), false) || + !names_are_valid(strempty(v->x.method.signature), &names, &nf) || + !names_are_valid(strempty(v->x.method.result), &names, &nf) || + !(v->x.method.handler || (isempty(v->x.method.signature) && isempty(v->x.method.result))) || + v->flags & (SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE|SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION)) { + r = -EINVAL; + goto fail; + } + + m = new0(struct vtable_member, 1); + if (!m) { + r = -ENOMEM; + goto fail; + } + + m->parent = &s->node_vtable; + m->path = n->path; + m->interface = s->node_vtable.interface; + m->member = v->x.method.member; + m->vtable = v; + + r = hashmap_put(bus->vtable_methods, m, m); 
+ if (r < 0) { + free(m); + goto fail; + } + + break; + } + + case _SD_BUS_VTABLE_WRITABLE_PROPERTY: + + if (!(v->x.property.set || bus_type_is_basic(v->x.property.signature[0]))) { + r = -EINVAL; + goto fail; + } + + if (v->flags & SD_BUS_VTABLE_PROPERTY_CONST) { + r = -EINVAL; + goto fail; + } + + _fallthrough_; + case _SD_BUS_VTABLE_PROPERTY: { + struct vtable_member *m; + + if (!member_name_is_valid(v->x.property.member) || + !signature_is_single(v->x.property.signature, false) || + !(v->x.property.get || bus_type_is_basic(v->x.property.signature[0]) || streq(v->x.property.signature, "as")) || + (v->flags & SD_BUS_VTABLE_METHOD_NO_REPLY) || + (!!(v->flags & SD_BUS_VTABLE_PROPERTY_CONST) + !!(v->flags & SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE) + !!(v->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION)) > 1 || + ((v->flags & SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE) && (v->flags & SD_BUS_VTABLE_PROPERTY_EXPLICIT)) || + (v->flags & SD_BUS_VTABLE_UNPRIVILEGED && v->type == _SD_BUS_VTABLE_PROPERTY)) { + r = -EINVAL; + goto fail; + } + + m = new0(struct vtable_member, 1); + if (!m) { + r = -ENOMEM; + goto fail; + } + + m->parent = &s->node_vtable; + m->path = n->path; + m->interface = s->node_vtable.interface; + m->member = v->x.property.member; + m->vtable = v; + + r = hashmap_put(bus->vtable_properties, m, m); + if (r < 0) { + free(m); + goto fail; + } + + break; + } + + case _SD_BUS_VTABLE_SIGNAL: + nf = NAMES_SINGLE_PART; + + if (bus_vtable_has_names(vtable)) + names = strempty(v->x.signal.names); + + if (!member_name_is_valid(v->x.signal.member) || + !signature_is_valid(strempty(v->x.signal.signature), false) || + !names_are_valid(strempty(v->x.signal.signature), &names, &nf) || + v->flags & SD_BUS_VTABLE_UNPRIVILEGED) { + r = -EINVAL; + goto fail; + } + + break; + + default: + r = -EINVAL; + goto fail; + } + } + + s->node_vtable.node = n; + LIST_INSERT_AFTER(vtables, n->vtables, existing, &s->node_vtable); + bus->nodes_modified = true; + + if (slot) + *slot = s; + + 
return 0; + +fail: + sd_bus_slot_unref(s); + bus_node_gc(bus, n); + + return r; +} + +/* This symbol exists solely to tell the linker that the "new" vtable format is used. */ +_public_ const unsigned sd_bus_object_vtable_format = 242; + +_public_ int sd_bus_add_object_vtable( + sd_bus *bus, + sd_bus_slot **slot, + const char *path, + const char *interface, + const sd_bus_vtable *vtable, + void *userdata) { + + return add_object_vtable_internal(bus, slot, path, interface, vtable, false, NULL, userdata); +} + +_public_ int sd_bus_add_fallback_vtable( + sd_bus *bus, + sd_bus_slot **slot, + const char *prefix, + const char *interface, + const sd_bus_vtable *vtable, + sd_bus_object_find_t find, + void *userdata) { + + return add_object_vtable_internal(bus, slot, prefix, interface, vtable, true, find, userdata); +} + +_public_ int sd_bus_add_node_enumerator( + sd_bus *bus, + sd_bus_slot **slot, + const char *path, + sd_bus_node_enumerator_t callback, + void *userdata) { + + sd_bus_slot *s; + struct node *n; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(callback, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + n = bus_node_allocate(bus, path); + if (!n) + return -ENOMEM; + + s = bus_slot_allocate(bus, !slot, BUS_NODE_ENUMERATOR, sizeof(struct node_enumerator), userdata); + if (!s) { + r = -ENOMEM; + goto fail; + } + + s->node_enumerator.callback = callback; + + s->node_enumerator.node = n; + LIST_PREPEND(enumerators, n->enumerators, &s->node_enumerator); + bus->nodes_modified = true; + + if (slot) + *slot = s; + + return 0; + +fail: + sd_bus_slot_unref(s); + bus_node_gc(bus, n); + + return r; +} + +static int emit_properties_changed_on_interface( + sd_bus *bus, + const char *prefix, + const char *path, + const char *interface, + bool require_fallback, + bool *found_interface, + char **names) { + + _cleanup_(sd_bus_error_free) sd_bus_error 
error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + bool has_invalidating = false, has_changing = false; + struct vtable_member key = {}; + struct node *n; + void *u = NULL; + int r; + + assert(bus); + assert(prefix); + assert(path); + assert(interface); + assert(found_interface); + + n = hashmap_get(bus->nodes, prefix); + if (!n) + return 0; + + r = sd_bus_message_new_signal(bus, &m, path, "org.freedesktop.DBus.Properties", "PropertiesChanged"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "s", interface); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "{sv}"); + if (r < 0) + return r; + + key.path = prefix; + key.interface = interface; + + LIST_FOREACH(vtables, c, n->vtables) { + if (require_fallback && !c->is_fallback) + continue; + + if (!streq(c->interface, interface)) + continue; + + r = node_vtable_get_userdata(bus, path, c, &u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + if (r == 0) + continue; + + *found_interface = true; + + if (names) { + /* If the caller specified a list of + * properties we include exactly those in the + * PropertiesChanged message */ + + STRV_FOREACH(property, names) { + struct vtable_member *v; + + assert_return(member_name_is_valid(*property), -EINVAL); + + key.member = *property; + v = hashmap_get(bus->vtable_properties, &key); + if (!v) + return -ENOENT; + + /* If there are two vtables for the same + * interface, let's handle this property when + * we come to that vtable. 
*/ + if (c != v->parent) + continue; + + assert_return(v->vtable->flags & SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE || + v->vtable->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION, -EDOM); + + assert_return(!(v->vtable->flags & SD_BUS_VTABLE_HIDDEN), -EDOM); + + if (v->vtable->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION) { + has_invalidating = true; + continue; + } + + has_changing = true; + + r = vtable_append_one_property(bus, m, m->path, c, v->vtable, u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + } else { + const sd_bus_vtable *v; + + /* If the caller specified no properties list + * we include all properties that are marked + * as changing in the message. */ + + v = c->vtable; + for (v = bus_vtable_next(c->vtable, v); v->type != _SD_BUS_VTABLE_END; v = bus_vtable_next(c->vtable, v)) { + if (!IN_SET(v->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY)) + continue; + + if (v->flags & SD_BUS_VTABLE_HIDDEN) + continue; + + if (v->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION) { + has_invalidating = true; + continue; + } + + if (!(v->flags & SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE)) + continue; + + has_changing = true; + + r = vtable_append_one_property(bus, m, m->path, c, v, u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + } + } + + if (!has_invalidating && !has_changing) + return 0; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return r; + + if (has_invalidating) { + LIST_FOREACH(vtables, c, n->vtables) { + if (require_fallback && !c->is_fallback) + continue; + + if (!streq(c->interface, interface)) + continue; + + r = node_vtable_get_userdata(bus, path, c, &u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + if (r == 0) + continue; + + if (names) { + STRV_FOREACH(property, names) { + struct vtable_member *v; + + key.member = *property; + assert_se(v = 
hashmap_get(bus->vtable_properties, &key)); + assert(c == v->parent); + + if (!(v->vtable->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION)) + continue; + + r = sd_bus_message_append(m, "s", *property); + if (r < 0) + return r; + } + } else { + const sd_bus_vtable *v; + + v = c->vtable; + for (v = bus_vtable_next(c->vtable, v); v->type != _SD_BUS_VTABLE_END; v = bus_vtable_next(c->vtable, v)) { + if (!IN_SET(v->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY)) + continue; + + if (v->flags & SD_BUS_VTABLE_HIDDEN) + continue; + + if (!(v->flags & SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION)) + continue; + + r = sd_bus_message_append(m, "s", v->x.property.member); + if (r < 0) + return r; + } + } + } + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + r = sd_bus_send(bus, m, NULL); + if (r < 0) + return r; + + return 1; +} + +_public_ int sd_bus_emit_properties_changed_strv( + sd_bus *bus, + const char *path, + const char *interface, + char **names) { + + _cleanup_free_ char *prefix = NULL; + bool found_interface = false; + size_t pl; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(interface_name_is_valid(interface), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + /* A non-NULL but empty names list means nothing needs to be + generated. A NULL list OTOH indicates that all properties + that are set to EMITS_CHANGE or EMITS_INVALIDATION shall be + included in the PropertiesChanged message. 
*/ + if (names && names[0] == NULL) + return 0; + + BUS_DONT_DESTROY(bus); + + pl = strlen(path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + do { + bus->nodes_modified = false; + + r = emit_properties_changed_on_interface(bus, path, path, interface, false, &found_interface, names); + if (r != 0) + return r; + if (bus->nodes_modified) + continue; + + OBJECT_PATH_FOREACH_PREFIX(prefix, path) { + r = emit_properties_changed_on_interface(bus, prefix, path, interface, true, &found_interface, names); + if (r != 0) + return r; + if (bus->nodes_modified) + break; + } + + } while (bus->nodes_modified); + + return found_interface ? 0 : -ENOENT; +} + +_public_ int sd_bus_emit_properties_changed( + sd_bus *bus, + const char *path, + const char *interface, + const char *name, ...) { + + char **names; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(interface_name_is_valid(interface), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (!name) + return 0; + + names = strv_from_stdarg_alloca(name); + + return sd_bus_emit_properties_changed_strv(bus, path, interface, names); +} + +static int object_added_append_all_prefix( + sd_bus *bus, + sd_bus_message *m, + OrderedSet *s, + const char *prefix, + const char *path, + bool require_fallback) { + + const char *previous_interface = NULL; + struct node *n; + int r; + + assert(bus); + assert(m); + assert(s); + assert(prefix); + assert(path); + + n = hashmap_get(bus->nodes, prefix); + if (!n) + return 0; + + LIST_FOREACH(vtables, c, n->vtables) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + void *u = NULL; + + if (require_fallback && !c->is_fallback) + continue; + + r = node_vtable_get_userdata(bus, path, c, &u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + 
return 0; + if (r == 0) + continue; + + if (!streq_ptr(c->interface, previous_interface)) { + /* If a child-node already handled this interface, we + * skip it on any of its parents. The child vtables + * always fully override any conflicting vtables of + * any parent node. */ + if (ordered_set_get(s, c->interface)) + continue; + + r = ordered_set_put(s, c->interface); + if (r < 0) + return r; + + if (previous_interface) { + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + } + + r = sd_bus_message_open_container(m, 'e', "sa{sv}"); + if (r < 0) + return r; + r = sd_bus_message_append(m, "s", c->interface); + if (r < 0) + return r; + r = sd_bus_message_open_container(m, 'a', "{sv}"); + if (r < 0) + return r; + + previous_interface = c->interface; + } + + r = vtable_append_all_properties(bus, m, path, c, u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + + if (previous_interface) { + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + } + + return 0; +} + +static int object_added_append_all(sd_bus *bus, sd_bus_message *m, const char *path, bool path_has_object_manager) { + _cleanup_ordered_set_free_ OrderedSet *s = NULL; + _cleanup_free_ char *prefix = NULL; + size_t pl; + int r; + + assert(bus); + assert(m); + assert(path); + + /* + * This appends all interfaces registered on path @path. We first add + * the builtin interfaces, which are always available and handled by + * sd-bus. Then, we add all interfaces registered on the exact node, + * followed by all fallback interfaces registered on any parent prefix. + * + * If an interface is registered multiple times on the same node with + * different vtables, we merge all the properties across all vtables. 
+ * However, if a child node has the same interface registered as one of + * its parent nodes has as fallback, we make the child overwrite the + * parent instead of extending it. Therefore, we keep a "Set" of all + * handled interfaces during parent traversal, so we skip interfaces on + * a parent that were overwritten by a child. + */ + + s = ordered_set_new(&string_hash_ops); + if (!s) + return -ENOMEM; + + r = sd_bus_message_append(m, "{sa{sv}}", "org.freedesktop.DBus.Peer", 0); + if (r < 0) + return r; + r = sd_bus_message_append(m, "{sa{sv}}", "org.freedesktop.DBus.Introspectable", 0); + if (r < 0) + return r; + r = sd_bus_message_append(m, "{sa{sv}}", "org.freedesktop.DBus.Properties", 0); + if (r < 0) + return r; + if (path_has_object_manager){ + r = sd_bus_message_append(m, "{sa{sv}}", "org.freedesktop.DBus.ObjectManager", 0); + if (r < 0) + return r; + } + + r = object_added_append_all_prefix(bus, m, s, path, path, false); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + + pl = strlen(path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + OBJECT_PATH_FOREACH_PREFIX(prefix, path) { + r = object_added_append_all_prefix(bus, m, s, prefix, path, true); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + + return 0; +} + +_public_ int sd_bus_emit_object_added(sd_bus *bus, const char *path) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + struct node *object_manager; + int r; + + /* + * This emits an InterfacesAdded signal on the given path, by iterating + * all registered vtables and fallback vtables on the path. All + * properties are queried and included in the signal. + * This call is equivalent to sd_bus_emit_interfaces_added() with an + * explicit list of registered interfaces. However, unlike + * interfaces_added(), this call can figure out the list of supported + * interfaces itself. 
Furthermore, it properly adds the builtin + * org.freedesktop.DBus.* interfaces. + */ + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + bool path_has_object_manager = false; + r = bus_find_parent_object_manager(bus, &object_manager, path, &path_has_object_manager); + if (r < 0) + return r; + if (r == 0) + return -ESRCH; + + BUS_DONT_DESTROY(bus); + + do { + bus->nodes_modified = false; + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_signal(bus, &m, object_manager->path, "org.freedesktop.DBus.ObjectManager", "InterfacesAdded"); + if (r < 0) + return r; + + r = sd_bus_message_append_basic(m, 'o', path); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "{sa{sv}}"); + if (r < 0) + return r; + + r = object_added_append_all(bus, m, path, path_has_object_manager); + if (r < 0) + return r; + + if (bus->nodes_modified) + continue; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + } while (bus->nodes_modified); + + return sd_bus_send(bus, m, NULL); +} + +static int object_removed_append_all_prefix( + sd_bus *bus, + sd_bus_message *m, + OrderedSet *s, + const char *prefix, + const char *path, + bool require_fallback) { + + const char *previous_interface = NULL; + struct node *n; + int r; + + assert(bus); + assert(m); + assert(s); + assert(prefix); + assert(path); + + n = hashmap_get(bus->nodes, prefix); + if (!n) + return 0; + + LIST_FOREACH(vtables, c, n->vtables) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + void *u = NULL; + + if (require_fallback && !c->is_fallback) + continue; + if (streq_ptr(c->interface, previous_interface)) + continue; + + /* If a child-node already handled this interface, we + * skip it on any of its parents. 
The child vtables + * always fully override any conflicting vtables of + * any parent node. */ + if (ordered_set_get(s, c->interface)) + continue; + + r = node_vtable_get_userdata(bus, path, c, &u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + if (r == 0) + continue; + + r = ordered_set_put(s, c->interface); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "s", c->interface); + if (r < 0) + return r; + + previous_interface = c->interface; + } + + return 0; +} + +static int object_removed_append_all(sd_bus *bus, sd_bus_message *m, const char *path, bool path_has_object_manager) { + _cleanup_ordered_set_free_ OrderedSet *s = NULL; + _cleanup_free_ char *prefix = NULL; + size_t pl; + int r; + + assert(bus); + assert(m); + assert(path); + + /* Appends the builtin interface names first, then the interfaces found on the node itself, then the fallback interfaces of every parent prefix; see sd_bus_emit_object_added() for details */ + + s = ordered_set_new(&string_hash_ops); + if (!s) + return -ENOMEM; + + r = sd_bus_message_append(m, "s", "org.freedesktop.DBus.Peer"); + if (r < 0) + return r; + r = sd_bus_message_append(m, "s", "org.freedesktop.DBus.Introspectable"); + if (r < 0) + return r; + r = sd_bus_message_append(m, "s", "org.freedesktop.DBus.Properties"); + if (r < 0) + return r; + + if (path_has_object_manager){ + r = sd_bus_message_append(m, "s", "org.freedesktop.DBus.ObjectManager"); + if (r < 0) + return r; + } + + r = object_removed_append_all_prefix(bus, m, s, path, path, false); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + + pl = strlen(path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + OBJECT_PATH_FOREACH_PREFIX(prefix, path) { + r = object_removed_append_all_prefix(bus, m, s, prefix, path, true); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + + return 0; +} + +_public_ int sd_bus_emit_object_removed(sd_bus *bus, const char *path) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + struct node *object_manager; + int r; + + /* + * This is like
sd_bus_emit_object_added(), but emits an + * InterfacesRemoved signal on the given path. This only includes any + * registered interfaces but skips the properties. Note that this will + * call into the find() callbacks of any registered vtable. Therefore, + * you must call this function before destroying/unlinking your object. + * Otherwise, the list of interfaces will be incomplete. However, note + * that this will *NOT* call into any property callback. Therefore, the + * object might be in a "destructed" state, as long as we can find it. + */ + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + bool path_has_object_manager = false; + r = bus_find_parent_object_manager(bus, &object_manager, path, &path_has_object_manager); + if (r < 0) + return r; + if (r == 0) + return -ESRCH; + + BUS_DONT_DESTROY(bus); + + do { + bus->nodes_modified = false; + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_signal(bus, &m, object_manager->path, "org.freedesktop.DBus.ObjectManager", "InterfacesRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append_basic(m, 'o', path); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return r; + + r = object_removed_append_all(bus, m, path, path_has_object_manager); + if (r < 0) + return r; + + if (bus->nodes_modified) + continue; /* jumps to the loop condition, which is still true, so the signal is rebuilt from scratch */ + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + } while (bus->nodes_modified); + + return sd_bus_send(bus, m, NULL); +} + +static int interfaces_added_append_one_prefix( + sd_bus *bus, + sd_bus_message *m, + const char *prefix, + const char *path, + const char *interface, + bool require_fallback) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool found_interface = false; + struct node *n; + void *u = NULL; + int r;
+ + assert(bus); + assert(m); + assert(prefix); + assert(path); + assert(interface); + + n = hashmap_get(bus->nodes, prefix); + if (!n) + return 0; + + LIST_FOREACH(vtables, c, n->vtables) { + if (require_fallback && !c->is_fallback) + continue; + + if (!streq(c->interface, interface)) + continue; + + r = node_vtable_get_userdata(bus, path, c, &u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + if (r == 0) + continue; + + if (!found_interface) { + r = sd_bus_message_append_basic(m, 's', interface); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "{sv}"); + if (r < 0) + return r; + + found_interface = true; + } + + r = vtable_append_all_properties(bus, m, path, c, u, &error); + if (r < 0) + return r; + if (bus->nodes_modified) + return 0; + } + + if (found_interface) { + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + } + + return found_interface; +} + +static int interfaces_added_append_one( + sd_bus *bus, + sd_bus_message *m, + const char *path, + const char *interface) { + + _cleanup_free_ char *prefix = NULL; + size_t pl; + int r; + + assert(bus); + assert(m); + assert(path); + assert(interface); + + r = interfaces_added_append_one_prefix(bus, m, path, path, interface, false); + if (r != 0) + return r; /* either an error, or the interface was found on the node itself and appended */ + if (bus->nodes_modified) + return 0; + + pl = strlen(path); + assert(pl <= BUS_PATH_SIZE_MAX); + prefix = new(char, pl + 1); + if (!prefix) + return -ENOMEM; + + OBJECT_PATH_FOREACH_PREFIX(prefix, path) { + r = interfaces_added_append_one_prefix(bus, m, prefix, path, interface, true); + if (r != 0) + return r; + if (bus->nodes_modified) + return 0; + } + + return -ENOENT; /* no usable vtable for this interface on the node or as a fallback on any parent prefix */ +} + +_public_ int sd_bus_emit_interfaces_added_strv(sd_bus *bus, const char *path, char **interfaces) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + struct node *object_manager; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); +
assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (strv_isempty(interfaces)) + return 0; + + bool path_has_object_manager = false; + r = bus_find_parent_object_manager(bus, &object_manager, path, &path_has_object_manager); + if (r < 0) + return r; + if (r == 0) + return -ESRCH; + + BUS_DONT_DESTROY(bus); + + do { + bus->nodes_modified = false; + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_signal(bus, &m, object_manager->path, "org.freedesktop.DBus.ObjectManager", "InterfacesAdded"); + if (r < 0) + return r; + + r = sd_bus_message_append_basic(m, 'o', path); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "{sa{sv}}"); + if (r < 0) + return r; + + STRV_FOREACH(i, interfaces) { + assert_return(interface_name_is_valid(*i), -EINVAL); + + r = sd_bus_message_open_container(m, 'e', "sa{sv}"); + if (r < 0) + return r; + + r = interfaces_added_append_one(bus, m, path, *i); + if (r < 0) + return r; + + if (bus->nodes_modified) + break; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + } + + if (bus->nodes_modified) + continue; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + } while (bus->nodes_modified); + + return sd_bus_send(bus, m, NULL); +} + +_public_ int sd_bus_emit_interfaces_added(sd_bus *bus, const char *path, const char *interface, ...) 
{ + char **interfaces; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + interfaces = strv_from_stdarg_alloca(interface); + + return sd_bus_emit_interfaces_added_strv(bus, path, interfaces); +} + +_public_ int sd_bus_emit_interfaces_removed_strv(sd_bus *bus, const char *path, char **interfaces) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + struct node *object_manager; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (strv_isempty(interfaces)) + return 0; /* nothing to announce, no signal is sent */ + + bool path_has_object_manager = false; + r = bus_find_parent_object_manager(bus, &object_manager, path, &path_has_object_manager); + if (r < 0) + return r; + if (r == 0) + return -ESRCH; + + r = sd_bus_message_new_signal(bus, &m, object_manager->path, "org.freedesktop.DBus.ObjectManager", "InterfacesRemoved"); + if (r < 0) + return r; + + r = sd_bus_message_append_basic(m, 'o', path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(m, interfaces); /* names are passed through as given; no vtable lookup happens here */ + if (r < 0) + return r; + + return sd_bus_send(bus, m, NULL); +} + +_public_ int sd_bus_emit_interfaces_removed(sd_bus *bus, const char *path, const char *interface, ...)
{ + char **interfaces; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + interfaces = strv_from_stdarg_alloca(interface); + + return sd_bus_emit_interfaces_removed_strv(bus, path, interfaces); +} + +_public_ int sd_bus_add_object_manager(sd_bus *bus, sd_bus_slot **slot, const char *path) { + sd_bus_slot *s; + struct node *n; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + n = bus_node_allocate(bus, path); + if (!n) + return -ENOMEM; + + s = bus_slot_allocate(bus, !slot, BUS_NODE_OBJECT_MANAGER, sizeof(struct node_object_manager), NULL); /* the slot is floating exactly when the caller does not ask for it back */ + if (!s) { + r = -ENOMEM; + goto fail; + } + + s->node_object_manager.node = n; + LIST_PREPEND(object_managers, n->object_managers, &s->node_object_manager); + bus->nodes_modified = true; /* makes the emit loops that test this flag rebuild their message */ + + if (slot) + *slot = s; + + return 0; + +fail: + sd_bus_slot_unref(s); /* s is always NULL here, since the only jump to fail is the failed allocation */ + bus_node_gc(bus, n); + + return r; +} diff --git a/src/libsystemd/sd-bus/bus-objects.h b/src/libsystemd/sd-bus/bus-objects.h new file mode 100644 index 0000000..20fccfa --- /dev/null +++ b/src/libsystemd/sd-bus/bus-objects.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-internal.h" +#include "bus-introspect.h" + +const sd_bus_vtable* bus_vtable_next(const sd_bus_vtable *vtable, const sd_bus_vtable *v); +bool bus_vtable_has_names(const sd_bus_vtable *vtable); +int bus_process_object(sd_bus *bus, sd_bus_message *m); +void bus_node_gc(sd_bus *b, struct node *n); + +int introspect_path( + sd_bus *bus, + const char *path, + struct node *n, + bool require_fallback, + bool ignore_nodes_modified, + bool *found_object, + char **ret, + sd_bus_error *error); diff --git
a/src/libsystemd/sd-bus/bus-protocol.h b/src/libsystemd/sd-bus/bus-protocol.h new file mode 100644 index 0000000..be46b5f --- /dev/null +++ b/src/libsystemd/sd-bus/bus-protocol.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <endian.h> /* for the __BYTE_ORDER and __BIG_ENDIAN test below */ + +#include "macro.h" + +/* Packet header */ + +struct _packed_ bus_header { + uint8_t endian; + uint8_t type; + uint8_t flags; + uint8_t version; + uint32_t body_size; + /* Note that what the bus spec calls "serial" we'll call "cookie" instead, because we don't + * want to imply that the cookie was in any way monotonically increasing. */ + uint32_t serial; + uint32_t fields_size; +}; + +/* Endianness */ + +enum { + _BUS_INVALID_ENDIAN = 0, + BUS_LITTLE_ENDIAN = 'l', + BUS_BIG_ENDIAN = 'B', +#if __BYTE_ORDER == __BIG_ENDIAN + BUS_NATIVE_ENDIAN = BUS_BIG_ENDIAN, + BUS_REVERSE_ENDIAN = BUS_LITTLE_ENDIAN +#else + BUS_NATIVE_ENDIAN = BUS_LITTLE_ENDIAN, + BUS_REVERSE_ENDIAN = BUS_BIG_ENDIAN +#endif +}; + +/* Flags */ + +enum { + BUS_MESSAGE_NO_REPLY_EXPECTED = 1 << 0, + BUS_MESSAGE_NO_AUTO_START = 1 << 1, + BUS_MESSAGE_ALLOW_INTERACTIVE_AUTHORIZATION = 1 << 2, +}; + +/* Header fields */ + +enum { + _BUS_MESSAGE_HEADER_INVALID = 0, + BUS_MESSAGE_HEADER_PATH, + BUS_MESSAGE_HEADER_INTERFACE, + BUS_MESSAGE_HEADER_MEMBER, + BUS_MESSAGE_HEADER_ERROR_NAME, + BUS_MESSAGE_HEADER_REPLY_SERIAL, + BUS_MESSAGE_HEADER_DESTINATION, + BUS_MESSAGE_HEADER_SENDER, + BUS_MESSAGE_HEADER_SIGNATURE, + BUS_MESSAGE_HEADER_UNIX_FDS, + _BUS_MESSAGE_HEADER_MAX +}; + +/* RequestName parameters */ + +enum { + BUS_NAME_ALLOW_REPLACEMENT = 1 << 0, + BUS_NAME_REPLACE_EXISTING = 1 << 1, + BUS_NAME_DO_NOT_QUEUE = 1 << 2, +}; + +/* RequestName returns */ +enum { + BUS_NAME_PRIMARY_OWNER = 1, + BUS_NAME_IN_QUEUE = 2, + BUS_NAME_EXISTS = 3, + BUS_NAME_ALREADY_OWNER = 4 +}; + +/* ReleaseName returns */ +enum { + BUS_NAME_RELEASED = 1, + BUS_NAME_NON_EXISTENT = 2, + BUS_NAME_NOT_OWNER = 3, +}; + +/* StartServiceByName returns */ +enum { +
BUS_START_REPLY_SUCCESS = 1, + BUS_START_REPLY_ALREADY_RUNNING = 2, +}; diff --git a/src/libsystemd/sd-bus/bus-signature.c b/src/libsystemd/sd-bus/bus-signature.c new file mode 100644 index 0000000..78c7436 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-signature.c @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "bus-signature.h" +#include "bus-type.h" + +static int signature_element_length_internal( + const char *s, + bool allow_dict_entry, + unsigned array_depth, + unsigned struct_depth, + size_t *l) { + + int r; + + if (!s) + return -EINVAL; + + assert(l); + + if (bus_type_is_basic(*s) || *s == SD_BUS_TYPE_VARIANT) { + *l = 1; + return 0; + } + + if (*s == SD_BUS_TYPE_ARRAY) { + size_t t; + + if (array_depth >= 32) + return -EINVAL; + + r = signature_element_length_internal(s + 1, true, array_depth+1, struct_depth, &t); + if (r < 0) + return r; + + *l = t + 1; + return 0; + } + + if (*s == SD_BUS_TYPE_STRUCT_BEGIN) { + const char *p = s + 1; + + if (struct_depth >= 32) + return -EINVAL; + + while (*p != SD_BUS_TYPE_STRUCT_END) { + size_t t; + + r = signature_element_length_internal(p, false, array_depth, struct_depth+1, &t); + if (r < 0) + return r; + + p += t; + } + + if (p - s < 2) + /* D-Bus spec: Empty structures are not allowed; there + * must be at least one type code between the parentheses. 
+ */ + return -EINVAL; + + *l = p - s + 1; + return 0; + } + + if (*s == SD_BUS_TYPE_DICT_ENTRY_BEGIN && allow_dict_entry) { + const char *p = s + 1; + unsigned n = 0; + + if (struct_depth >= 32) + return -EINVAL; + + while (*p != SD_BUS_TYPE_DICT_ENTRY_END) { + size_t t; + + if (n == 0 && !bus_type_is_basic(*p)) + return -EINVAL; + + r = signature_element_length_internal(p, false, array_depth, struct_depth+1, &t); + if (r < 0) + return r; + + p += t; + n++; + } + + if (n != 2) + return -EINVAL; + + *l = p - s + 1; + return 0; + } + + return -EINVAL; +} + +int signature_element_length(const char *s, size_t *l) { + return signature_element_length_internal(s, true, 0, 0, l); /* top level: dict entries allowed, both nesting depths start at zero */ +} + +bool signature_is_single(const char *s, bool allow_dict_entry) { + int r; + size_t t; + + if (!s) + return false; + + r = signature_element_length_internal(s, allow_dict_entry, 0, 0, &t); + if (r < 0) + return false; + + return s[t] == 0; /* true only if nothing follows the first complete type */ +} + +bool signature_is_pair(const char *s) { + + if (!s) + return false; + + if (!bus_type_is_basic(*s)) + return false; + + return signature_is_single(s + 1, false); /* a basic first type followed by exactly one complete type */ +} + +bool signature_is_valid(const char *s, bool allow_dict_entry) { + const char *p; + int r; + + if (!s) + return false; + + p = s; + while (*p) { + size_t t; + + r = signature_element_length_internal(p, allow_dict_entry, 0, 0, &t); + if (r < 0) + return false; + + p += t; + } + + return p - s <= SD_BUS_MAXIMUM_SIGNATURE_LENGTH; /* every element parsed; the total length is bounded as well */ +} diff --git a/src/libsystemd/sd-bus/bus-signature.h b/src/libsystemd/sd-bus/bus-signature.h new file mode 100644 index 0000000..314fcc2 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-signature.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +bool signature_is_single(const char *s, bool allow_dict_entry); +bool signature_is_pair(const char *s); +bool signature_is_valid(const char *s, bool allow_dict_entry); + +int signature_element_length(const char *s, size_t *l); diff --git a/src/libsystemd/sd-bus/bus-slot.c
b/src/libsystemd/sd-bus/bus-slot.c new file mode 100644 index 0000000..9f28957 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-slot.c @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-control.h" +#include "bus-objects.h" +#include "bus-slot.h" +#include "string-util.h" + +sd_bus_slot *bus_slot_allocate( + sd_bus *bus, + bool floating, + BusSlotType type, + size_t extra, + void *userdata) { + + sd_bus_slot *slot; + + assert(bus); + + slot = malloc0(offsetof(sd_bus_slot, reply_callback) + extra); /* the fields preceding reply_callback, then "extra" bytes for the type-specific part */ + if (!slot) + return NULL; + + slot->n_ref = 1; + slot->type = type; + slot->bus = bus; + slot->floating = floating; + slot->userdata = userdata; + + if (!floating) + sd_bus_ref(bus); /* a non-floating slot holds a reference on its bus */ + + LIST_PREPEND(slots, bus->slots, slot); + + return slot; +} + +void bus_slot_disconnect(sd_bus_slot *slot, bool unref) { + sd_bus *bus; + + assert(slot); + + if (!slot->bus) + return; /* already disconnected; slot->bus is cleared at the end of this function */ + + switch (slot->type) { + + case BUS_REPLY_CALLBACK: + + if (slot->reply_callback.cookie != 0) + ordered_hashmap_remove(slot->bus->reply_callbacks, &slot->reply_callback.cookie); + + if (slot->reply_callback.timeout_usec != 0) + prioq_remove(slot->bus->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx); + + break; + + case BUS_FILTER_CALLBACK: + slot->bus->filter_callbacks_modified = true; + LIST_REMOVE(callbacks, slot->bus->filter_callbacks, &slot->filter_callback); + break; + + case BUS_MATCH_CALLBACK: + + if (slot->match_added) + (void) bus_remove_match_internal(slot->bus, slot->match_callback.match_string); + + if (slot->match_callback.install_slot) { + bus_slot_disconnect(slot->match_callback.install_slot, true); + slot->match_callback.install_slot = sd_bus_slot_unref(slot->match_callback.install_slot); + } + + slot->bus->match_callbacks_modified = true; + bus_match_remove(&slot->bus->match_callbacks, &slot->match_callback); + + slot->match_callback.match_string =
mfree(slot->match_callback.match_string); + + break; + + case BUS_NODE_CALLBACK: + + if (slot->node_callback.node) { + LIST_REMOVE(callbacks, slot->node_callback.node->callbacks, &slot->node_callback); + slot->bus->nodes_modified = true; + + bus_node_gc(slot->bus, slot->node_callback.node); + } + + break; + + case BUS_NODE_ENUMERATOR: + + if (slot->node_enumerator.node) { + LIST_REMOVE(enumerators, slot->node_enumerator.node->enumerators, &slot->node_enumerator); + slot->bus->nodes_modified = true; + + bus_node_gc(slot->bus, slot->node_enumerator.node); + } + + break; + + case BUS_NODE_OBJECT_MANAGER: + + if (slot->node_object_manager.node) { + LIST_REMOVE(object_managers, slot->node_object_manager.node->object_managers, &slot->node_object_manager); + slot->bus->nodes_modified = true; + + bus_node_gc(slot->bus, slot->node_object_manager.node); + } + + break; + + case BUS_NODE_VTABLE: + + if (slot->node_vtable.node && slot->node_vtable.interface && slot->node_vtable.vtable) { + const sd_bus_vtable *v; + + for (v = slot->node_vtable.vtable; v->type != _SD_BUS_VTABLE_END; v = bus_vtable_next(slot->node_vtable.vtable, v)) { + struct vtable_member *x = NULL; + + switch (v->type) { + + case _SD_BUS_VTABLE_METHOD: { + struct vtable_member key; + + key.path = slot->node_vtable.node->path; + key.interface = slot->node_vtable.interface; + key.member = v->x.method.member; + + x = hashmap_remove(slot->bus->vtable_methods, &key); + break; + } + + case _SD_BUS_VTABLE_PROPERTY: + case _SD_BUS_VTABLE_WRITABLE_PROPERTY: { + struct vtable_member key; + + key.path = slot->node_vtable.node->path; + key.interface = slot->node_vtable.interface; + key.member = v->x.property.member; /* property entries are described by x.property, not x.method */ + + x = hashmap_remove(slot->bus->vtable_properties, &key); + break; + }} + + free(x); + } + } + + slot->node_vtable.interface = mfree(slot->node_vtable.interface); + + if (slot->node_vtable.node) { + LIST_REMOVE(vtables, slot->node_vtable.node->vtables, &slot->node_vtable); + slot->bus->nodes_modified = true; +
+ bus_node_gc(slot->bus, slot->node_vtable.node); + } + + break; + + default: + assert_not_reached(); + } + + bus = slot->bus; + + slot->type = _BUS_SLOT_INVALID; + slot->bus = NULL; + LIST_REMOVE(slots, bus->slots, slot); + + if (!slot->floating) + sd_bus_unref(bus); /* drops the reference taken in bus_slot_allocate() */ + else if (unref) + sd_bus_slot_unref(slot); +} + +static sd_bus_slot* bus_slot_free(sd_bus_slot *slot) { + assert(slot); + + bus_slot_disconnect(slot, false); + + if (slot->destroy_callback) + slot->destroy_callback(slot->userdata); + + free(slot->description); + return mfree(slot); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_bus_slot, sd_bus_slot, bus_slot_free); + +_public_ sd_bus* sd_bus_slot_get_bus(sd_bus_slot *slot) { + assert_return(slot, NULL); + + return slot->bus; +} + +_public_ void *sd_bus_slot_get_userdata(sd_bus_slot *slot) { + assert_return(slot, NULL); + + return slot->userdata; +} + +_public_ void *sd_bus_slot_set_userdata(sd_bus_slot *slot, void *userdata) { + void *ret; + + assert_return(slot, NULL); + + ret = slot->userdata; + slot->userdata = userdata; + + return ret; +} + +_public_ int sd_bus_slot_set_destroy_callback(sd_bus_slot *slot, sd_bus_destroy_t callback) { + assert_return(slot, -EINVAL); + + slot->destroy_callback = callback; + return 0; +} + +_public_ int sd_bus_slot_get_destroy_callback(sd_bus_slot *slot, sd_bus_destroy_t *callback) { + assert_return(slot, -EINVAL); + + if (callback) + *callback = slot->destroy_callback; + + return !!slot->destroy_callback; +} + +_public_ sd_bus_message *sd_bus_slot_get_current_message(sd_bus_slot *slot) { + assert_return(slot, NULL); + assert_return(slot->type >= 0, NULL); + + if (slot->bus->current_slot != slot) + return NULL; + + return slot->bus->current_message; +} + +_public_ sd_bus_message_handler_t sd_bus_slot_get_current_handler(sd_bus_slot *slot) { + assert_return(slot, NULL); + assert_return(slot->type >= 0, NULL); + + if (slot->bus->current_slot != slot) + return NULL; + + return slot->bus->current_handler; +} +
+_public_ void* sd_bus_slot_get_current_userdata(sd_bus_slot *slot) { + assert_return(slot, NULL); + assert_return(slot->type >= 0, NULL); + + if (slot->bus->current_slot != slot) + return NULL; /* this slot is not the current_slot of its bus */ + + return slot->bus->current_userdata; +} + +_public_ int sd_bus_slot_get_floating(sd_bus_slot *slot) { + assert_return(slot, -EINVAL); + + return slot->floating; +} + +_public_ int sd_bus_slot_set_floating(sd_bus_slot *slot, int b) { + assert_return(slot, -EINVAL); + + if (slot->floating == !!b) + return 0; /* already in the requested state */ + + if (!slot->bus) /* already disconnected slots can't be reconnected */ + return -ESTALE; + + slot->floating = b; + + /* When a slot is "floating" then the bus references the slot. Otherwise the slot references the bus. Hence, + * when we move from one to the other, let's increase one reference and decrease the other. */ + + if (b) { + sd_bus_slot_ref(slot); + sd_bus_unref(slot->bus); + } else { + sd_bus_ref(slot->bus); + sd_bus_slot_unref(slot); + } + + return 1; /* the floating state was changed */ +} + +_public_ int sd_bus_slot_set_description(sd_bus_slot *slot, const char *description) { + assert_return(slot, -EINVAL); + + return free_and_strdup(&slot->description, description); +} + +_public_ int sd_bus_slot_get_description(sd_bus_slot *slot, const char **description) { + assert_return(slot, -EINVAL); + assert_return(description, -EINVAL); + + if (slot->description) + *description = slot->description; + else if (slot->type == BUS_MATCH_CALLBACK) + *description = slot->match_callback.match_string; /* match slots without a description report their match string instead */ + else + return -ENXIO; + + return 0; +} diff --git a/src/libsystemd/sd-bus/bus-slot.h b/src/libsystemd/sd-bus/bus-slot.h new file mode 100644 index 0000000..8116195 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-slot.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-internal.h" + +sd_bus_slot *bus_slot_allocate(sd_bus *bus, bool floating, BusSlotType type, size_t extra, void *userdata); + +void bus_slot_disconnect(sd_bus_slot *slot,
bool unref); diff --git a/src/libsystemd/sd-bus/bus-socket.c b/src/libsystemd/sd-bus/bus-socket.c new file mode 100644 index 0000000..5ade8e9 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-socket.c @@ -0,0 +1,1428 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-message.h" +#include "bus-socket.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "iovec-util.h" +#include "macro.h" +#include "memory-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "signal-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "user-util.h" +#include "utf8.h" +/* Buffer size requested for both socket directions in bus_socket_setup() (8 MiB). */ +#define SNDBUF_SIZE (8*1024*1024) +/* Consumes size bytes from the iovec array starting at entry *idx: fully consumed entries are reset to an empty iovec and *idx moves on, a partially consumed entry is trimmed in place. */ +static void iovec_advance(struct iovec iov[], unsigned *idx, size_t size) { + + while (size > 0) { + struct iovec *i = iov + *idx; + + if (i->iov_len > size) { + i->iov_base = (uint8_t*) i->iov_base + size; + i->iov_len -= size; + return; + } + + size -= i->iov_len; + + *i = IOVEC_MAKE(NULL, 0); + + (*idx)++; + } +} +/* Stores the buffer p/sz as the next entry of m->iovec. Does no bounds checking of its own and always returns 0. */ +static int append_iovec(sd_bus_message *m, const void *p, size_t sz) { + assert(m); + assert(p); + assert(sz > 0); + + m->iovec[m->n_iovec++] = IOVEC_MAKE((void*) p, sz); + + return 0; +} +/* Builds m->iovec once for a sealed message: one entry for the header, then one per mapped body part. Uses the inline iovec_fixed array when it is large enough and a heap array otherwise. */ +static int bus_message_setup_iovec(sd_bus_message *m) { + struct bus_body_part *part; + unsigned n, i; + int r; + + assert(m); + assert(m->sealed); + + if (m->n_iovec > 0) + return 0; + + assert(!m->iovec); + + n = 1 + m->n_body_parts; + if (n < ELEMENTSOF(m->iovec_fixed)) + m->iovec = m->iovec_fixed; + else { + m->iovec = new(struct iovec, n); + if (!m->iovec) { + r = -ENOMEM; + goto fail; + } + } + + r = append_iovec(m, m->header, BUS_MESSAGE_BODY_BEGIN(m)); + if (r < 0) + goto fail; + + MESSAGE_FOREACH_PART(part, i, m) { + r = bus_body_part_map(part); + if (r < 0) + goto 
fail; + + r = append_iovec(m, part->data, part->size); + if (r < 0) + goto fail; + } + + assert(n == m->n_iovec); + + return 0; + +fail: + m->poisoned = true; + return r; +} +/* Returns true if any auth iovec at or after auth_index still has bytes left to be written. */ +bool bus_socket_auth_needs_write(sd_bus *b) { + + unsigned i; + + if (b->auth_index >= ELEMENTSOF(b->auth_iovec)) + return false; + + for (i = b->auth_index; i < ELEMENTSOF(b->auth_iovec); i++) { + struct iovec *j = b->auth_iovec + i; + + if (j->iov_len > 0) + return true; + } + + return false; +} +/* Client side of the auth exchange: parses the server's reply lines from the read buffer. Returns 0 while lines are still missing, 1 once the replies were accepted and bus_start_running() succeeded, a negative errno otherwise. */ +static int bus_socket_auth_verify_client(sd_bus *b) { + char *l, *lines[4] = {}; + sd_id128_t peer; + size_t i, n; + int r; + + assert(b); + + /* + * We expect up to three response lines: + * "DATA\r\n" (optional) + * "OK [server-id]\r\n" (server-id: 32 hex digits) + * "AGREE_UNIX_FD\r\n" (optional) + */ + + n = 0; + lines[n] = b->rbuffer; + for (i = 0; i < 3; ++i) { + l = memmem_safe(lines[n], b->rbuffer_size - (lines[n] - (char*) b->rbuffer), "\r\n", 2); + if (l) + lines[++n] = l + 2; + else + break; + } + + /* + * If we sent a non-empty initial response, then we just expect an OK + * reply. We currently do this if, and only if, we picked ANONYMOUS. + * If we did not send an initial response, then we expect a DATA + * challenge, reply with our own DATA, and expect an OK reply. We do + * this for EXTERNAL. + * If FD negotiation was requested, we additionally expect + * an AGREE_UNIX_FD response in all cases. + */ + if (n < (b->anonymous_auth ? 1U : 2U) + !!b->accept_fd) + return 0; /* wait for more data */ + + i = 0; + + /* In case of EXTERNAL, verify the first response was DATA. */ + if (!b->anonymous_auth) { + l = lines[i++]; + if (lines[i] - l == 4 + 2) { + if (memcmp(l, "DATA", 4)) + return -EPERM; + } else if (lines[i] - l == 3 + 32 + 2) { + /* + * Old versions of the server-side implementation of + * `sd-bus` replied with "OK [id]" to "AUTH" requests + * from a client, even if the "AUTH" line did not + * contain inlined arguments. Therefore, we also accept + * "OK [id]" here, even though it is technically the + * wrong reply. 
We ignore the "[id]" parameter, though, + * since it has no real value. + */ + if (memcmp(l, "OK ", 3)) + return -EPERM; + } else + return -EPERM; + } + + /* Now check the OK line. */ + l = lines[i++]; + + if (lines[i] - l != 3 + 32 + 2) + return -EPERM; + if (memcmp(l, "OK ", 3)) + return -EPERM; + + b->auth = b->anonymous_auth ? BUS_AUTH_ANONYMOUS : BUS_AUTH_EXTERNAL; + + for (unsigned j = 0; j < 32; j += 2) { + int x, y; + + x = unhexchar(l[3 + j]); + y = unhexchar(l[3 + j + 1]); + + if (x < 0 || y < 0) + return -EINVAL; + + peer.bytes[j/2] = ((uint8_t) x << 4 | (uint8_t) y); + } + + if (!sd_id128_is_null(b->server_id) && + !sd_id128_equal(b->server_id, peer)) + return -EPERM; + + b->server_id = peer; + + /* And possibly check the third line, too */ + if (b->accept_fd) { + l = lines[i++]; + b->can_fds = !!memory_startswith(l, lines[i] - l, "AGREE_UNIX_FD"); + } + + assert(i == n); + + b->rbuffer_size -= (lines[i] - (char*) b->rbuffer); + memmove(b->rbuffer, lines[i], b->rbuffer_size); + + r = bus_start_running(b); + if (r < 0) + return r; + + return 1; +} +/* True if the m bytes at s are exactly the NUL-terminated string line. */ +static bool line_equals(const char *s, size_t m, const char *line) { + size_t l; + + l = strlen(line); + if (l != m) + return false; + + return memcmp(s, line, l) == 0; +} +/* True if the m bytes at s start with word and the word is followed by either the end of the line or a space. */ +static bool line_begins(const char *s, size_t m, const char *word) { + const char *p; + + p = memory_startswith(s, m, word); + return p && (p == (s + m) || *p == ' '); +} +/* Checks the optional hex-encoded argument of an ANONYMOUS auth line. Returns 0 (reject) if anonymous auth is off or the token is malformed, 1 if it is absent or decodes to valid UTF-8 without embedded NUL bytes. */ +static int verify_anonymous_token(sd_bus *b, const char *p, size_t l) { + _cleanup_free_ char *token = NULL; + size_t len; + int r; + + if (!b->anonymous_auth) + return 0; + + if (l <= 0) + return 1; + + assert(p[0] == ' '); + p++; l--; + + if (l % 2 != 0) + return 0; + + r = unhexmem(p, l, (void **) &token, &len); + if (r < 0) + return 0; + + if (memchr(token, 0, len)) + return 0; + + return !!utf8_is_valid(token); +} +/* Checks the optional hex-encoded UID argument of an EXTERNAL auth line against the peer credentials; returns 1 to accept, 0 to reject. */ +static int verify_external_token(sd_bus *b, const char *p, size_t l) { + _cleanup_free_ char *token = NULL; + size_t len; + uid_t u; + int r; + + /* 
We don't do any real authentication here. Instead, if + * the owner of this bus wanted authentication they should have + * checked SO_PEERCRED before even creating the bus object. */ + + if (!b->anonymous_auth && !b->ucred_valid) + return 0; + + if (l <= 0) + return 1; + + assert(p[0] == ' '); + p++; l--; + + if (l % 2 != 0) + return 0; + + r = unhexmem(p, l, (void**) &token, &len); + if (r < 0) + return 0; + + if (memchr(token, 0, len)) + return 0; + + r = parse_uid(token, &u); + if (r < 0) + return 0; + + /* We ignore the passed value if anonymous authentication is + * on anyway. */ + if (!b->anonymous_auth && u != b->ucred.uid) + return 0; + + return 1; +} +/* Appends the string t to the pending auth output: builds a new buffer holding the unsent bytes of auth_iovec[0] plus t, makes it the new b->auth_buffer and rewinds auth_index to 0. */ +static int bus_socket_auth_write(sd_bus *b, const char *t) { + char *p; + size_t l; + + assert(b); + assert(t); + + /* We only make use of the first iovec */ + assert(IN_SET(b->auth_index, 0, 1)); + + l = strlen(t); + p = malloc(b->auth_iovec[0].iov_len + l); + if (!p) + return -ENOMEM; + + memcpy_safe(p, b->auth_iovec[0].iov_base, b->auth_iovec[0].iov_len); + memcpy(p + b->auth_iovec[0].iov_len, t, l); + + b->auth_iovec[0].iov_base = p; + b->auth_iovec[0].iov_len += l; + + free_and_replace(b->auth_buffer, p); + b->auth_index = 0; + return 0; +} +/* Queues the "OK" reply line carrying b->server_id; the buffer is sized for "OK ", 32 hex digits, CRLF and the terminating NUL. */ +static int bus_socket_auth_write_ok(sd_bus *b) { + char t[3 + 32 + 2 + 1]; + + assert(b); + + xsprintf(t, "OK " SD_ID128_FORMAT_STR "\r\n", SD_ID128_FORMAT_VAL(b->server_id)); + + return bus_socket_auth_write(b, t); +} +/* Server side of the auth exchange: handles every complete CRLF-terminated line in the read buffer and queues the replies. Returns whether at least one line was processed, the result of bus_start_running() on an accepted BEGIN, or a negative errno. */ +static int bus_socket_auth_verify_server(sd_bus *b) { + char *e; + const char *line; + size_t l; + bool processed = false; + int r; + + assert(b); + + if (b->rbuffer_size < 1) + return 0; + + /* First char must be a NUL byte */ + if (*(char*) b->rbuffer != 0) + return -EIO; + + if (b->rbuffer_size < 3) + return 0; + + /* Begin with the first line */ + if (b->auth_rbegin <= 0) + b->auth_rbegin = 1; + + for (;;) { + /* Check if line is complete */ + line = (char*) b->rbuffer + b->auth_rbegin; + e = memmem_safe(line, b->rbuffer_size - 
b->auth_rbegin, "\r\n", 2); + if (!e) + return processed; + + l = e - line; + + if (line_begins(line, l, "AUTH ANONYMOUS")) { + + r = verify_anonymous_token(b, + line + strlen("AUTH ANONYMOUS"), + l - strlen("AUTH ANONYMOUS")); + if (r < 0) + return r; + if (r == 0) + r = bus_socket_auth_write(b, "REJECTED\r\n"); + else { + b->auth = BUS_AUTH_ANONYMOUS; + if (l <= strlen("AUTH ANONYMOUS")) + r = bus_socket_auth_write(b, "DATA\r\n"); + else + r = bus_socket_auth_write_ok(b); + } + + } else if (line_begins(line, l, "AUTH EXTERNAL")) { + + r = verify_external_token(b, + line + strlen("AUTH EXTERNAL"), + l - strlen("AUTH EXTERNAL")); + if (r < 0) + return r; + if (r == 0) + r = bus_socket_auth_write(b, "REJECTED\r\n"); + else { + b->auth = BUS_AUTH_EXTERNAL; + if (l <= strlen("AUTH EXTERNAL")) + r = bus_socket_auth_write(b, "DATA\r\n"); + else + r = bus_socket_auth_write_ok(b); + } + + } else if (line_begins(line, l, "AUTH")) + r = bus_socket_auth_write(b, "REJECTED EXTERNAL ANONYMOUS\r\n"); + else if (line_equals(line, l, "CANCEL") || + line_begins(line, l, "ERROR")) { + + b->auth = _BUS_AUTH_INVALID; + r = bus_socket_auth_write(b, "REJECTED\r\n"); + + } else if (line_equals(line, l, "BEGIN")) { + + if (b->auth == _BUS_AUTH_INVALID) + r = bus_socket_auth_write(b, "ERROR\r\n"); + else { + /* We can't leave the auth phase + * before we have written + * everything queued, so let's check + * that */ + + if (bus_socket_auth_needs_write(b)) + return 1; + + b->rbuffer_size -= (e + 2 - (char*) b->rbuffer); + memmove(b->rbuffer, e + 2, b->rbuffer_size); + return bus_start_running(b); + } + + } else if (line_begins(line, l, "DATA")) { + + if (b->auth == _BUS_AUTH_INVALID) + r = bus_socket_auth_write(b, "ERROR\r\n"); + else { + if (b->auth == BUS_AUTH_ANONYMOUS) + r = verify_anonymous_token(b, line + 4, l - 4); + else + r = verify_external_token(b, line + 4, l - 4); + + if (r < 0) + return r; + if (r == 0) { + b->auth = _BUS_AUTH_INVALID; + r = bus_socket_auth_write(b, 
"REJECTED\r\n"); + } else + r = bus_socket_auth_write_ok(b); + } + } else if (line_equals(line, l, "NEGOTIATE_UNIX_FD")) { + if (b->auth == _BUS_AUTH_INVALID || !b->accept_fd) + r = bus_socket_auth_write(b, "ERROR\r\n"); + else { + b->can_fds = true; + r = bus_socket_auth_write(b, "AGREE_UNIX_FD\r\n"); + } + } else + r = bus_socket_auth_write(b, "ERROR\r\n"); + + if (r < 0) + return r; + + b->auth_rbegin = e + 2 - (char*) b->rbuffer; + + processed = true; + } +} +/* Runs the server- or client-side verifier, depending on b->is_server. */ +static int bus_socket_auth_verify(sd_bus *b) { + assert(b); + + if (b->is_server) + return bus_socket_auth_verify_server(b); + else + return bus_socket_auth_verify_client(b); +} +/* Flushes pending auth output with sendmsg(), switching to writev() once the fd turns out not to be a socket. Returns 0 if nothing is pending or the write failed transiently; after a successful write the auth verifier is run again and its result returned. */ +static int bus_socket_write_auth(sd_bus *b) { + ssize_t k; + + assert(b); + assert(b->state == BUS_AUTHENTICATING); + + if (!bus_socket_auth_needs_write(b)) + return 0; + + if (b->prefer_writev) + k = writev(b->output_fd, b->auth_iovec + b->auth_index, ELEMENTSOF(b->auth_iovec) - b->auth_index); + else { + struct msghdr mh = { + .msg_iov = b->auth_iovec + b->auth_index, + .msg_iovlen = ELEMENTSOF(b->auth_iovec) - b->auth_index, + }; + + k = sendmsg(b->output_fd, &mh, MSG_DONTWAIT|MSG_NOSIGNAL); + if (k < 0 && errno == ENOTSOCK) { + b->prefer_writev = true; + k = writev(b->output_fd, b->auth_iovec + b->auth_index, ELEMENTSOF(b->auth_iovec) - b->auth_index); + } + } + + if (k < 0) + return ERRNO_IS_TRANSIENT(errno) ? 0 : -errno; + + iovec_advance(b->auth_iovec, &b->auth_index, (size_t) k); + + /* Now crank the state machine since we might be able to make progress after writing. For example, + * the server only processes "BEGIN" when the write buffer is empty. 
+ */ + return bus_socket_auth_verify(b); +} +/* Reads more authentication data into b->rbuffer, which grows by doubling up to BUS_AUTH_SIZE_MAX, and re-runs the auth verifier. Returns 0 on a transient read error, 1 if data was read but the verifier still reports 0, otherwise the verifier's non-zero result or a negative errno. */ +static int bus_socket_read_auth(sd_bus *b) { + struct msghdr mh; + struct iovec iov = {}; + size_t n; + ssize_t k; + int r; + void *p; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int) * BUS_FDS_MAX)) control; + bool handle_cmsg = false; + + assert(b); + assert(b->state == BUS_AUTHENTICATING); + + r = bus_socket_auth_verify(b); + if (r != 0) + return r; + + n = MAX(256u, b->rbuffer_size * 2); + + if (n > BUS_AUTH_SIZE_MAX) + n = BUS_AUTH_SIZE_MAX; + + if (b->rbuffer_size >= n) + return -ENOBUFS; + + p = realloc(b->rbuffer, n); + if (!p) + return -ENOMEM; + + b->rbuffer = p; + + iov = IOVEC_MAKE((uint8_t *)b->rbuffer + b->rbuffer_size, n - b->rbuffer_size); + + if (b->prefer_readv) { + k = readv(b->input_fd, &iov, 1); + if (k < 0) + k = -errno; + } else { + mh = (struct msghdr) { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + k = recvmsg_safe(b->input_fd, &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (k == -ENOTSOCK) { + b->prefer_readv = true; + k = readv(b->input_fd, &iov, 1); + if (k < 0) + k = -errno; + } else + handle_cmsg = true; + } + if (ERRNO_IS_NEG_TRANSIENT(k)) + return 0; + if (k < 0) + return (int) k; + if (k == 0) { + if (handle_cmsg) + cmsg_close_all(&mh); /* paranoia, we shouldn't have gotten any fds on EOF */ + return -ECONNRESET; + } + + b->rbuffer_size += k; + + if (handle_cmsg) { + struct cmsghdr *cmsg; + + CMSG_FOREACH(cmsg, &mh) + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + int j; + + /* Whut? We received fds during the auth + * protocol? Somebody is playing games with + * us. 
Close them all, and fail */ + j = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + close_many(CMSG_TYPED_DATA(cmsg, int), j); + return -EIO; + } else + log_debug("Got unexpected auxiliary data with level=%d and type=%d", + cmsg->cmsg_level, cmsg->cmsg_type); + } + + r = bus_socket_auth_verify(b); + if (r != 0) + return r; + + return 1; +} +/* Best-effort enlargement of the input/output socket buffers to SNDBUF_SIZE, plus reset of the message version and endianness fields. */ +void bus_socket_setup(sd_bus *b) { + assert(b); + + /* Increase the buffers to 8 MB */ + (void) fd_increase_rxbuf(b->input_fd, SNDBUF_SIZE); + (void) fd_inc_sndbuf(b->output_fd, SNDBUF_SIZE); + + b->message_version = 1; + b->message_endian = 0; +} +/* Gathers what can be learned about the peer of input_fd: credentials, security label, group list and socket address. None of the lookups is fatal; failures leave the respective field unset. */ +static void bus_get_peercred(sd_bus *b) { + int r; + + assert(b); + assert(!b->ucred_valid); + assert(!b->label); + assert(b->n_groups == SIZE_MAX); + + /* Get the peer for socketpair() sockets */ + b->ucred_valid = getpeercred(b->input_fd, &b->ucred) >= 0; + + /* Get the SELinux context of the peer */ + r = getpeersec(b->input_fd, &b->label); + if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -ENOPROTOOPT)) + log_debug_errno(r, "Failed to determine peer security context: %m"); + + /* Get the list of auxiliary groups of the peer */ + r = getpeergroups(b->input_fd, &b->groups); + if (r >= 0) + b->n_groups = (size_t) r; + else if (!IN_SET(r, -EOPNOTSUPP, -ENOPROTOOPT)) + log_debug_errno(r, "Failed to determine peer's group list: %m"); + + /* Let's query the peer's socket address, it might carry information such as the peer's comm or + * description string */ + zero(b->sockaddr_peer); + b->sockaddr_size_peer = 0; + + socklen_t l = sizeof(b->sockaddr_peer) - 1; /* Leave space for a NUL */ + if (getpeername(b->input_fd, &b->sockaddr_peer.sa, &l) < 0) + log_debug_errno(errno, "Failed to get peer's socket address, ignoring: %m"); + else + b->sockaddr_size_peer = l; +} +/* Queues the client's opening lines in auth_iovec (the AUTH line, NEGOTIATE_UNIX_FD if fd passing is wanted, then BEGIN) and starts writing them. */ +static int bus_socket_start_auth_client(sd_bus *b) { + static const char sasl_auth_anonymous[] = { + /* + * We use an arbitrary trace-string for the ANONYMOUS authentication. 
It can be used by the + * message broker to aid debugging of clients. We fully anonymize the connection and use a + * static default. + */ + /* HEX a n o n y m o u s */ + "\0AUTH ANONYMOUS 616e6f6e796d6f7573\r\n" + }; + static const char sasl_auth_external[] = { + "\0AUTH EXTERNAL\r\n" + "DATA\r\n" + }; + static const char sasl_negotiate_unix_fd[] = { + "NEGOTIATE_UNIX_FD\r\n" + }; + static const char sasl_begin[] = { + "BEGIN\r\n" + }; + size_t i = 0; + + assert(b); + + if (b->anonymous_auth) + b->auth_iovec[i++] = IOVEC_MAKE((char*) sasl_auth_anonymous, sizeof(sasl_auth_anonymous) - 1); + else + b->auth_iovec[i++] = IOVEC_MAKE((char*) sasl_auth_external, sizeof(sasl_auth_external) - 1); + + if (b->accept_fd) + b->auth_iovec[i++] = IOVEC_MAKE_STRING(sasl_negotiate_unix_fd); + + b->auth_iovec[i++] = IOVEC_MAKE_STRING(sasl_begin); + + return bus_socket_write_auth(b); +} +/* Collects the peer credentials, enters BUS_AUTHENTICATING with a deadline, turns off fd passing unless both fds are AF_UNIX sockets, and then starts the server- or client-side exchange. */ +int bus_socket_start_auth(sd_bus *b) { + assert(b); + + bus_get_peercred(b); + + bus_set_state(b, BUS_AUTHENTICATING); + b->auth_timeout = now(CLOCK_MONOTONIC) + BUS_AUTH_TIMEOUT; + + if (sd_is_socket(b->input_fd, AF_UNIX, 0, 0) <= 0) + b->accept_fd = false; + + if (b->output_fd != b->input_fd) + if (sd_is_socket(b->output_fd, AF_UNIX, 0, 0) <= 0) + b->accept_fd = false; + + if (b->is_server) + return bus_socket_read_auth(b); + else + return bus_socket_start_auth_client(b); +} +/* Installs inotify watches on every existing component of the socket path (following at most 32 symlinks) and drops the watches of the previous round that are no longer needed. Returns 0 on success; every failure after the inotify fd exists goes through the fail label, which closes it again. */ +static int bus_socket_inotify_setup(sd_bus *b) { + _cleanup_free_ int *new_watches = NULL; + _cleanup_free_ char *absolute = NULL; + size_t n = 0, done = 0, i; + unsigned max_follow = 32; + const char *p; + int wd, r; + + assert(b); + assert(b->watch_bind); + assert(b->sockaddr.sa.sa_family == AF_UNIX); + assert(b->sockaddr.un.sun_path[0] != 0); + + /* Sets up an inotify fd in case watch_bind is enabled: wait until the configured AF_UNIX file system + * socket appears before connecting to it. 
The implementation is pretty simplistic: we just subscribe to + * relevant changes to all components of the path, and every time we get an event for that we try to + * reconnect again, without actually caring what precisely the event we got told us. If we still + * can't connect we re-subscribe to all relevant changes of anything in the path, so that our watches + * include any possibly newly created path components. */ + + if (b->inotify_fd < 0) { + b->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (b->inotify_fd < 0) + return -errno; + + b->inotify_fd = fd_move_above_stdio(b->inotify_fd); + } + + /* Make sure the path is NUL terminated */ + p = strndupa_safe(b->sockaddr.un.sun_path, + sizeof(b->sockaddr.un.sun_path)); + + /* Make sure the path is absolute */ + r = path_make_absolute_cwd(p, &absolute); + if (r < 0) + goto fail; + + /* Watch all components of the path, and don't mind any prefix that doesn't exist yet. For the + * innermost directory that exists we want to know when files are created or moved into it. For all + * parents of it we just care if they are removed or renamed. */ + + if (!GREEDY_REALLOC(new_watches, n + 1)) { + r = -ENOMEM; + goto fail; + } + + /* Start with the top-level directory, which is a bit simpler than the rest, since it can't be a + * symlink, and always exists */ + wd = inotify_add_watch(b->inotify_fd, "/", IN_CREATE|IN_MOVED_TO); + if (wd < 0) { + r = log_debug_errno(errno, "Failed to add inotify watch on /: %m"); + goto fail; + } else + new_watches[n++] = wd; + + for (;;) { + _cleanup_free_ char *component = NULL, *prefix = NULL, *destination = NULL; + size_t n_slashes, n_component; + char *c = NULL; + + n_slashes = strspn(absolute + done, "/"); + n_component = n_slashes + strcspn(absolute + done + n_slashes, "/"); + + if (n_component == 0) /* The end */ + break; + + component = strndup(absolute + done, n_component); + if (!component) { + r = -ENOMEM; + goto fail; + } + + /* A trailing slash? 
That's a directory, and not a socket then */ + if (path_equal(component, "/")) { + r = -EISDIR; + goto fail; + } + + /* A single dot? Let's eat this up */ + if (path_equal(component, "/.")) { + done += n_component; + continue; + } + + prefix = strndup(absolute, done + n_component); + if (!prefix) { + r = -ENOMEM; + goto fail; + } + + if (!GREEDY_REALLOC(new_watches, n + 1)) { + r = -ENOMEM; + goto fail; + } + + wd = inotify_add_watch(b->inotify_fd, prefix, IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO|IN_DONT_FOLLOW); + log_debug("Added inotify watch for %s on bus %s: %i", prefix, strna(b->description), wd); + + if (wd < 0) { + if (IN_SET(errno, ENOENT, ELOOP)) + break; /* This component doesn't exist yet, or the path contains a cyclic symlink right now */ + + r = log_debug_errno(errno, "Failed to add inotify watch on %s: %m", empty_to_root(prefix)); + goto fail; + } else + new_watches[n++] = wd; + + /* Check if this is possibly a symlink. If so, let's follow it and watch it too. */ + r = readlink_malloc(prefix, &destination); + if (r == -EINVAL) { /* not a symlink */ + done += n_component; + continue; + } + if (r < 0) + goto fail; + + if (isempty(destination)) { /* Empty symlink target? Yuck! 
*/ + r = -EINVAL; + goto fail; + } + + if (max_follow <= 0) { /* Let's make sure we don't follow symlinks forever */ + r = -ELOOP; + goto fail; + } + + if (path_is_absolute(destination)) { + /* For absolute symlinks we build the new path and start anew */ + c = strjoin(destination, absolute + done + n_component); + done = 0; + } else { + _cleanup_free_ char *t = NULL; + + /* For relative symlinks we replace the last component, and try again */ + t = strndup(absolute, done); + if (!t) { + /* Take the common error path like every other failure in this loop, so that the inotify fd gets closed as well */ + r = -ENOMEM; + goto fail; + } + + c = strjoin(t, "/", destination, absolute + done + n_component); + } + if (!c) { + r = -ENOMEM; + goto fail; + } + + free_and_replace(absolute, c); + + max_follow--; + } + + /* And now, let's remove all watches from the previous iteration we don't need anymore */ + for (i = 0; i < b->n_inotify_watches; i++) { + bool found = false; + size_t j; + + for (j = 0; j < n; j++) + if (new_watches[j] == b->inotify_watches[i]) { + found = true; + break; + } + + if (found) + continue; + + (void) inotify_rm_watch(b->inotify_fd, b->inotify_watches[i]); + } + + free_and_replace(b->inotify_watches, new_watches); + b->n_inotify_watches = n; + + return 0; + +fail: + bus_close_inotify_fd(b); + return r; +} +/* For AF_UNIX sockets, binds fd to an abstract-namespace address built from a random value, the process comm and the bus description. Does nothing for other families or when neither comm nor description is available. */ +static int bind_description(sd_bus *b, int fd, int family) { + _cleanup_free_ char *bind_name = NULL, *comm = NULL; + union sockaddr_union bsa; + const char *d = NULL; + int r; + + assert(b); + assert(fd >= 0); + + /* If this is an AF_UNIX socket, let's set our client's socket address to carry the description + * string for this bus connection. This is useful for debugging things, as the connection name is + * visible in various socket-related tools, and can even be queried by the server side. */ + + if (family != AF_UNIX) + return 0; + + (void) sd_bus_get_description(b, &d); + + /* Generate a recognizable source address in the abstract namespace. 
We'll include: + * - a random 64-bit value (to avoid collisions) + * - our "comm" process name (suppressed if contains "/" to avoid parsing issues) + * - the description string of the bus connection. */ + (void) pid_get_comm(0, &comm); + if (comm && strchr(comm, '/')) + comm = mfree(comm); + + if (!d && !comm) /* skip if we don't have either field, rely on kernel autobind instead */ + return 0; + + if (asprintf(&bind_name, "@%" PRIx64 "/bus/%s/%s", random_u64(), strempty(comm), strempty(d)) < 0) + return -ENOMEM; + + strshorten(bind_name, sizeof_field(struct sockaddr_un, sun_path)); + + r = sockaddr_un_set_path(&bsa.un, bind_name); + if (r < 0) + return r; + + if (bind(fd, &bsa.sa, r) < 0) + return -errno; + + return 0; +} +/* Connects to b->sockaddr over a fresh non-blocking stream socket. Returns 1 while the connect is still in progress or while we wait for inotify events (watch_bind case), the result of bus_socket_start_auth() once connected, or a negative errno. */ +int bus_socket_connect(sd_bus *b) { + bool inotify_done = false; + int r; + + assert(b); + + for (;;) { + assert(b->input_fd < 0); + assert(b->output_fd < 0); + assert(b->sockaddr.sa.sa_family != AF_UNSPEC); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *pretty = NULL; + (void) sockaddr_pretty(&b->sockaddr.sa, b->sockaddr_size, false, true, &pretty); + log_debug("sd-bus: starting bus%s%s by connecting to %s...", + b->description ? " " : "", strempty(b->description), strnull(pretty)); + } + + b->input_fd = socket(b->sockaddr.sa.sa_family, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (b->input_fd < 0) + return -errno; + + r = bind_description(b, b->input_fd, b->sockaddr.sa.sa_family); + if (r < 0) + return r; + + b->input_fd = fd_move_above_stdio(b->input_fd); + + b->output_fd = b->input_fd; + bus_socket_setup(b); + + if (connect(b->input_fd, &b->sockaddr.sa, b->sockaddr_size) < 0) { + if (errno == EINPROGRESS) { + + /* If we have any inotify watches open, close them now, we don't need them anymore, as + * we have successfully initiated a connection */ + bus_close_inotify_fd(b); + + /* Note that very likely we are already in BUS_OPENING state here, as we enter it when + * we start parsing the address string. 
The only reason we set the state explicitly + * here, is to undo BUS_WATCH_BIND, in case we did the inotify magic. */ + bus_set_state(b, BUS_OPENING); + return 1; + } + + if (IN_SET(errno, ENOENT, ECONNREFUSED) && /* ENOENT → unix socket doesn't exist at all; ECONNREFUSED → unix socket stale */ + b->watch_bind && + b->sockaddr.sa.sa_family == AF_UNIX && + b->sockaddr.un.sun_path[0] != 0) { + + /* This connection attempt failed, let's release the socket for now, and start with a + * fresh one when reconnecting. */ + bus_close_io_fds(b); + + if (inotify_done) { + /* inotify set up already, don't do it again, just return now, and remember + * that we are waiting for inotify events now. */ + bus_set_state(b, BUS_WATCH_BIND); + return 1; + } + + /* This is a file system socket, and the inotify logic is enabled. Let's create the necessary inotify fd. */ + r = bus_socket_inotify_setup(b); + if (r < 0) + return r; + + /* Let's now try to connect a second time, because in theory there's otherwise a race + * here: the socket might have been created in the time between our first connect() and + * the time we set up the inotify logic. But let's remember that we set up inotify now, + * so that we don't do the connect() more than twice. */ + inotify_done = true; + + } else + return -errno; + } else + break; + } + + /* Yay, established, we don't need no inotify anymore! */ + bus_close_inotify_fd(b); + + return bus_socket_start_auth(b); +} +/* Forks off b->exec_path (with b->exec_argv if set) handing it one end of a socketpair, keeps the other end as the bus transport and starts authentication on it. */ +int bus_socket_exec(sd_bus *b) { + int s[2], r; + + assert(b); + assert(b->input_fd < 0); + assert(b->output_fd < 0); + assert(b->exec_path); + assert(b->busexec_pid == 0); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *line = NULL; + + if (b->exec_argv) + line = quote_command_line(b->exec_argv, SHELL_ESCAPE_EMPTY); + + log_debug("sd-bus: starting bus%s%s with %s%s", + b->description ? " " : "", strempty(b->description), + line ?: b->exec_path, + b->exec_argv && !line ? 
"…" : ""); + } + + r = socketpair(AF_UNIX, SOCK_STREAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, s); + if (r < 0) + return -errno; + + r = safe_fork_full("(sd-busexec)", + (int[]) { s[1], s[1], STDERR_FILENO }, + NULL, 0, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO|FORK_RLIMIT_NOFILE_SAFE, &b->busexec_pid); + if (r < 0) { + safe_close_pair(s); + return r; + } + if (r == 0) { + /* Child */ + + if (b->exec_argv) + execvp(b->exec_path, b->exec_argv); + else + execvp(b->exec_path, STRV_MAKE(b->exec_path)); + + _exit(EXIT_FAILURE); + } + + safe_close(s[1]); + b->output_fd = b->input_fd = fd_move_above_stdio(s[0]); + + bus_socket_setup(b); + + return bus_socket_start_auth(b); +} +/* Sets up the fds already stored in b and starts authentication on them; opens nothing itself. */ +int bus_socket_take_fd(sd_bus *b) { + assert(b); + + bus_socket_setup(b); + + return bus_socket_start_auth(b); +} +/* Writes the part of message m that starts at byte offset *idx. The message's fds are attached via SCM_RIGHTS only when sending from offset 0 through sendmsg(). Returns 0 if nothing is left to write or the write failed transiently, 1 after advancing *idx, a negative errno otherwise. */ +int bus_socket_write_message(sd_bus *bus, sd_bus_message *m, size_t *idx) { + struct iovec *iov; + ssize_t k; + size_t n; + unsigned j; + int r; + + assert(bus); + assert(m); + assert(idx); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + if (*idx >= BUS_MESSAGE_SIZE(m)) + return 0; + + r = bus_message_setup_iovec(m); + if (r < 0) + return r; + + /* Work on a stack copy, since iovec_advance() modifies the array. newa() takes an element count (like new() in bus_message_setup_iovec()), n is the byte size used for the copy only. */ + n = m->n_iovec * sizeof(struct iovec); + iov = newa(struct iovec, m->n_iovec); + memcpy_safe(iov, m->iovec, n); + + j = 0; + iovec_advance(iov, &j, *idx); + + if (bus->prefer_writev) + k = writev(bus->output_fd, iov, m->n_iovec); + else { + struct msghdr mh = { + .msg_iov = iov, + .msg_iovlen = m->n_iovec, + }; + + if (m->n_fds > 0 && *idx == 0) { + struct cmsghdr *control; + + mh.msg_controllen = CMSG_SPACE(sizeof(int) * m->n_fds); + mh.msg_control = alloca0(mh.msg_controllen); + control = CMSG_FIRSTHDR(&mh); + control->cmsg_len = CMSG_LEN(sizeof(int) * m->n_fds); + control->cmsg_level = SOL_SOCKET; + control->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(control), m->fds, sizeof(int) * m->n_fds); + } + + k = sendmsg(bus->output_fd, &mh, MSG_DONTWAIT|MSG_NOSIGNAL); + if (k < 0 && errno == ENOTSOCK) { + bus->prefer_writev = true; + k 
= writev(bus->output_fd, iov, m->n_iovec); + } + } + + if (k < 0) + return ERRNO_IS_TRANSIENT(errno) ? 0 : -errno; + + *idx += (size_t) k; + return 1; +} +/* Stores in *need how many bytes the message at the start of rbuffer requires: header size plus 8 while the fixed header is still incomplete, else header size + 8-aligned word 3 + word 1 of the header, decoded in the byte order given by its first byte (NOTE(review): presumably the header-field array length and the body length; confirm against struct bus_header). Fails with -EBADMSG on an unknown byte-order marker and -ENOBUFS once the sum reaches BUS_MESSAGE_SIZE_MAX. */ +static int bus_socket_read_message_need(sd_bus *bus, size_t *need) { + uint32_t a, b; + uint8_t e; + uint64_t sum; + + assert(bus); + assert(need); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + if (bus->rbuffer_size < sizeof(struct bus_header)) { + *need = sizeof(struct bus_header) + 8; + + /* Minimum message size: + * + * Header + + * + * Method Call: +2 string headers + * Signal: +3 string headers + * Method Error: +1 string headers + * +1 uint32 headers + * Method Reply: +1 uint32 headers + * + * A string header is at least 9 bytes + * A uint32 header is at least 8 bytes + * + * Hence the minimum message size of a valid message + * is header + 8 bytes */ + + return 0; + } + + a = ((const uint32_t*) bus->rbuffer)[1]; + b = ((const uint32_t*) bus->rbuffer)[3]; + + e = ((const uint8_t*) bus->rbuffer)[0]; + if (e == BUS_LITTLE_ENDIAN) { + a = le32toh(a); + b = le32toh(b); + } else if (e == BUS_BIG_ENDIAN) { + a = be32toh(a); + b = be32toh(b); + } else + return -EBADMSG; + + sum = (uint64_t) sizeof(struct bus_header) + (uint64_t) ALIGN8(b) + (uint64_t) a; + if (sum >= BUS_MESSAGE_SIZE_MAX) + return -ENOBUFS; + + *need = (size_t) sum; + return 0; +} +/* Turns the first size bytes of rbuffer into a message on the receive queue and keeps any bytes beyond that as the new rbuffer. A message rejected with -EBADMSG is logged and dropped rather than treated as an error. */ +static int bus_socket_make_message(sd_bus *bus, size_t size) { + sd_bus_message *t = NULL; + void *b; + int r; + + assert(bus); + assert(bus->rbuffer_size >= size); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + r = bus_rqueue_make_room(bus); + if (r < 0) + return r; + + if (bus->rbuffer_size > size) { + b = memdup((const uint8_t*) bus->rbuffer + size, + bus->rbuffer_size - size); + if (!b) + return -ENOMEM; + } else + b = NULL; + + r = bus_message_from_malloc(bus, + bus->rbuffer, size, + bus->fds, bus->n_fds, + NULL, + &t); + if (r == -EBADMSG) { + log_debug_errno(r, "Received invalid message from connection %s, dropping.", 
strna(bus->description)); + free(bus->rbuffer); /* We want to drop current rbuffer and proceed with whatever remains in b */ + } else if (r < 0) { + free(b); + return r; + } + + /* rbuffer ownership was either transferred to t, or we got EBADMSG and dropped it. */ + bus->rbuffer = b; + bus->rbuffer_size -= size; + + bus->fds = NULL; + bus->n_fds = 0; + + if (t) { + t->read_counter = ++bus->read_counter; + bus->rqueue[bus->rqueue_size++] = bus_message_ref_queued(t, bus); + sd_bus_message_unref(t); + } + + return 1; +} +/* Reads the bytes still missing for the next message. Returns 0 on a transient read error, 1 if data was read but the message is still incomplete, the result of bus_socket_make_message() once enough bytes are buffered, or a negative errno. */ +int bus_socket_read_message(sd_bus *bus) { + struct msghdr mh; + struct iovec iov = {}; + ssize_t k; + size_t need; + int r; + void *b; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int) * BUS_FDS_MAX)) control; + bool handle_cmsg = false; + + assert(bus); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + r = bus_socket_read_message_need(bus, &need); + if (r < 0) + return r; + + if (bus->rbuffer_size >= need) + return bus_socket_make_message(bus, need); + + b = realloc(bus->rbuffer, need); + if (!b) + return -ENOMEM; + + bus->rbuffer = b; + + iov = IOVEC_MAKE((uint8_t *)bus->rbuffer + bus->rbuffer_size, need - bus->rbuffer_size); + + if (bus->prefer_readv) { + k = readv(bus->input_fd, &iov, 1); + if (k < 0) + k = -errno; + } else { + mh = (struct msghdr) { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + k = recvmsg_safe(bus->input_fd, &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (k == -ENOTSOCK) { + bus->prefer_readv = true; + k = readv(bus->input_fd, &iov, 1); + if (k < 0) + k = -errno; + } else + handle_cmsg = true; + } + if (ERRNO_IS_NEG_TRANSIENT(k)) + return 0; + if (k < 0) + return (int) k; + if (k == 0) { + if (handle_cmsg) + cmsg_close_all(&mh); /* On EOF we shouldn't have gotten an fd, but let's make sure */ + return -ECONNRESET; + } + + bus->rbuffer_size += k; + + if (handle_cmsg) { + struct cmsghdr *cmsg; + + CMSG_FOREACH(cmsg, &mh) + if (cmsg->cmsg_level == SOL_SOCKET && + 
cmsg->cmsg_type == SCM_RIGHTS) { + int n, *f, i; + + n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); /* number of fds carried by this control message */ + + if (!bus->can_fds) { + /* Huh? We received fds although fd passing + * isn't actually enabled? Close them, + * and fail */ + + close_many(CMSG_TYPED_DATA(cmsg, int), n); + return -EIO; + } + + f = reallocarray(bus->fds, bus->n_fds + n, sizeof(int)); + if (!f) { + close_many(CMSG_TYPED_DATA(cmsg, int), n); + return -ENOMEM; + } + + for (i = 0; i < n; i++) + f[bus->n_fds++] = fd_move_above_stdio(CMSG_TYPED_DATA(cmsg, int)[i]); + bus->fds = f; + } else + log_debug("Got unexpected auxiliary data with level=%d and type=%d", + cmsg->cmsg_level, cmsg->cmsg_type); + } + + r = bus_socket_read_message_need(bus, &need); /* recompute: with more data buffered the required size may be known now */ + if (r < 0) + return r; + + if (bus->rbuffer_size >= need) + return bus_socket_make_message(bus, need); + + return 1; +} + +int bus_socket_process_opening(sd_bus *b) { /* Polls the output fd without blocking: returns 0 if nothing happened yet, starts authentication if SO_ERROR reports no error, otherwise records the error and continues via bus_next_address(). */ + int error = 0, events, r; + socklen_t slen = sizeof(error); + + assert(b->state == BUS_OPENING); + + events = fd_wait_for_event(b->output_fd, POLLOUT, 0); + if (ERRNO_IS_NEG_TRANSIENT(events)) + return 0; + if (events < 0) + return events; + if (!(events & (POLLOUT|POLLERR|POLLHUP))) + return 0; + + r = getsockopt(b->output_fd, SOL_SOCKET, SO_ERROR, &error, &slen); + if (r < 0) + b->last_connect_error = errno; + else if (error != 0) + b->last_connect_error = error; + else if (events & (POLLERR|POLLHUP)) /* no socket error reported, but the poll result still signals failure */ + b->last_connect_error = ECONNREFUSED; + else + return bus_socket_start_auth(b); + + return bus_next_address(b); +} + +int bus_socket_process_authenticating(sd_bus *b) { + int r; + + assert(b); + assert(b->state == BUS_AUTHENTICATING); + + if (now(CLOCK_MONOTONIC) >= b->auth_timeout) + return -ETIMEDOUT; + + r = bus_socket_write_auth(b); + if (r != 0) /* error or non-zero result from the write side: return it as is, reading is only attempted on 0 */ + return r; + + return bus_socket_read_auth(b); +} + +int bus_socket_process_watch_bind(sd_bus *b) { + int r, q; + + assert(b); + assert(b->state == BUS_WATCH_BIND); + assert(b->inotify_fd >= 0); + + r = flush_fd(b->inotify_fd); + if (r <= 0) + 
return r; + + log_debug("Got inotify event on bus %s.", strna(b->description)); + + /* We flushed events out of the inotify fd. In that case, maybe the socket is valid now? Let's try to connect + * to it again */ + + r = bus_socket_connect(b); + if (r < 0) + return r; + + q = bus_attach_io_events(b); + if (q < 0) + return q; + + q = bus_attach_inotify_event(b); + if (q < 0) + return q; + + return r; /* pass on the non-negative result of bus_socket_connect() */ +} diff --git a/src/libsystemd/sd-bus/bus-socket.h b/src/libsystemd/sd-bus/bus-socket.h new file mode 100644 index 0000000..52bc404 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-socket.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +void bus_socket_setup(sd_bus *b); + +int bus_socket_connect(sd_bus *b); +int bus_socket_exec(sd_bus *b); +int bus_socket_take_fd(sd_bus *b); +int bus_socket_start_auth(sd_bus *b); + +int bus_socket_write_message(sd_bus *bus, sd_bus_message *m, size_t *idx); +int bus_socket_read_message(sd_bus *bus); + +int bus_socket_process_opening(sd_bus *b); +int bus_socket_process_authenticating(sd_bus *b); +int bus_socket_process_watch_bind(sd_bus *b); + +bool bus_socket_auth_needs_write(sd_bus *b); diff --git a/src/libsystemd/sd-bus/bus-track.c b/src/libsystemd/sd-bus/bus-track.c new file mode 100644 index 0000000..f9c59a1 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-track.c @@ -0,0 +1,495 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-track.h" +#include "string-util.h" + +struct track_item { + unsigned n_ref; /* starts at 1; only incremented for repeated additions when the track is recursive */ + char *name; /* the tracked name, also used as the key in sd_bus_track.names */ + sd_bus_slot *slot; /* slot of the NameOwnerChanged match installed for this name */ +}; + +struct sd_bus_track { + unsigned n_ref; + unsigned n_adding; /* are we in the process of adding a new name? */ + sd_bus *bus; + sd_bus_track_handler_t handler; /* invoked by bus_track_dispatch() */ + void *userdata; + Hashmap *names; /* name string → struct track_item */ + LIST_FIELDS(sd_bus_track, queue); + Iterator iterator; /* position used by sd_bus_track_first()/sd_bus_track_next() */ + bool in_list:1; /* In bus->tracks? */ + bool in_queue:1; /* In bus->track_queue? 
*/ + bool modified:1; /* set when a name is added or removed, cleared by sd_bus_track_first(); makes sd_bus_track_next() return NULL */ + bool recursive:1; /* if set, repeated additions of the same name are counted in track_item.n_ref */ + sd_bus_destroy_t destroy_callback; /* called with userdata when the track object is freed */ + + LIST_FIELDS(sd_bus_track, tracks); +}; + +#define MATCH_FOR_NAME(name) \ + strjoina("type='signal'," \ + "sender='org.freedesktop.DBus'," \ + "path='/org/freedesktop/DBus'," \ + "interface='org.freedesktop.DBus'," \ + "member='NameOwnerChanged'," \ + "arg0='", name, "'") + +static struct track_item* track_item_free(struct track_item *i) { + if (!i) + return NULL; + + sd_bus_slot_unref(i->slot); /* releases the match slot that was set up for this name */ + free(i->name); + return mfree(i); +} + +DEFINE_PRIVATE_TRIVIAL_UNREF_FUNC(struct track_item, track_item, track_item_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct track_item*, track_item_unref); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(track_item_hash_ops, char, string_hash_func, string_compare_func, + struct track_item, track_item_free); + +static void bus_track_add_to_queue(sd_bus_track *track) { + assert(track); + + /* Adds the bus track object to the queue of objects we should dispatch next, subject to a number of + * conditions. */ + + /* Already in the queue? */ + if (track->in_queue) + return; + + /* If we are currently in the process of adding a new name, then let's not enqueue this just yet, let's wait + * until the addition is complete. */ + if (track->n_adding > 0) + return; + + /* Still tracking at least one name? Then there is nothing to dispatch. */ + if (hashmap_size(track->names) > 0) + return; + + /* Nothing to call? */ + if (!track->handler) + return; + + /* Already closed? 
*/ + if (!track->in_list) + return; + + LIST_PREPEND(queue, track->bus->track_queue, track); + track->in_queue = true; +} + +static void bus_track_remove_from_queue(sd_bus_track *track) { + assert(track); + + if (!track->in_queue) + return; + + LIST_REMOVE(queue, track->bus->track_queue, track); + track->in_queue = false; +} + +static int bus_track_remove_name_fully(sd_bus_track *track, const char *name) { /* Drops a name regardless of its reference count. Returns 1 if it was tracked, 0 if not. */ + struct track_item *i; + + assert(track); + assert(name); + + i = hashmap_remove(track->names, name); + if (!i) + return 0; + + track_item_free(i); + + bus_track_add_to_queue(track); /* only enqueues if this was the last name, see the checks in bus_track_add_to_queue() */ + + track->modified = true; + return 1; +} + +_public_ int sd_bus_track_new( + sd_bus *bus, + sd_bus_track **track, + sd_bus_track_handler_t handler, + void *userdata) { + + sd_bus_track *t; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(track, -EINVAL); + + if (!bus->bus_client) + return -EINVAL; + + t = new0(sd_bus_track, 1); + if (!t) + return -ENOMEM; + + t->n_ref = 1; + t->handler = handler; + t->userdata = userdata; + t->bus = sd_bus_ref(bus); + + LIST_PREPEND(tracks, bus->tracks, t); + t->in_list = true; + + bus_track_add_to_queue(t); /* the new object has no names yet, so it is queued for dispatch right away if a handler was passed */ + + *track = t; + return 0; +} + +static sd_bus_track *track_free(sd_bus_track *track) { + assert(track); + + if (track->in_list) + LIST_REMOVE(tracks, track->bus->tracks, track); + + bus_track_remove_from_queue(track); + track->names = hashmap_free(track->names); + track->bus = sd_bus_unref(track->bus); + + if (track->destroy_callback) + track->destroy_callback(track->userdata); + + return mfree(track); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_bus_track, sd_bus_track, track_free); + +static int on_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* Match callback installed by sd_bus_track_add_name(): drops the name given in the first signal argument from the track object. Always returns 0. */ + sd_bus_track *track = ASSERT_PTR(userdata); + const char *name; + int r; + + assert(message); + + r = sd_bus_message_read(message, "sss", &name, NULL, NULL); + if (r < 0) + return 0; + + 
bus_track_remove_name_fully(track, name); + return 0; +} + +_public_ int sd_bus_track_add_name(sd_bus_track *track, const char *name) { /* Returns 1 if the name was newly added, 0 if it was tracked already, negative errno on failure. */ + _cleanup_(track_item_unrefp) struct track_item *n = NULL; + struct track_item *i; + const char *match; + int r; + + assert_return(track, -EINVAL); + assert_return(service_name_is_valid(name), -EINVAL); + + i = hashmap_get(track->names, name); + if (i) { + if (track->recursive) { + assert(i->n_ref > 0); + + /* Manual overflow check (instead of a DEFINE_TRIVIAL_REF_FUNC() helper or so), so + * that we can return a proper error, given this is almost always called in a + * directly client controllable way, and thus better should never hit an assertion + * here. */ + if (i->n_ref >= UINT_MAX) + return -EOVERFLOW; + + i->n_ref++; + } + + bus_track_remove_from_queue(track); + return 0; + } + + r = hashmap_ensure_allocated(&track->names, &track_item_hash_ops); + if (r < 0) + return r; + + n = new(struct track_item, 1); + if (!n) + return -ENOMEM; + + *n = (struct track_item) { + .n_ref = 1, + }; + + n->name = strdup(name); + if (!n->name) + return -ENOMEM; + + /* First, subscribe to this name */ + match = MATCH_FOR_NAME(name); + + bus_track_remove_from_queue(track); /* don't dispatch this while we work on it */ + + r = sd_bus_add_match_async(track->bus, &n->slot, match, on_name_owner_changed, NULL, track); + if (r < 0) { + bus_track_add_to_queue(track); /* undo the dequeuing above; n is released by its cleanup handler */ + return r; + } + + r = hashmap_put(track->names, n->name, n); + if (r < 0) { + bus_track_add_to_queue(track); + return r; + } + + /* Second, check if it is currently existing, or maybe doesn't, or maybe disappeared already. 
*/ + track->n_adding++; /* again, make sure this isn't dispatched while we are working on it */ + r = sd_bus_get_name_creds(track->bus, name, 0, NULL); + track->n_adding--; + if (r < 0) { + hashmap_remove(track->names, name); /* take the item out of the hashmap again; n itself is released by its cleanup handler */ + bus_track_add_to_queue(track); + return r; + } + + TAKE_PTR(n); /* the hashmap keeps the item now, so disarm the cleanup handler */ + + bus_track_remove_from_queue(track); + track->modified = true; + + return 1; +} + +_public_ int sd_bus_track_remove_name(sd_bus_track *track, const char *name) { /* Drops one reference to the name, removing it entirely with the last one. Returns 0 if the name is not tracked. */ + struct track_item *i; + + assert_return(name, -EINVAL); + + if (!track) /* Treat a NULL track object as an empty track object */ + return 0; + + i = hashmap_get(track->names, name); + if (!i) + return 0; + + assert(i->n_ref >= 1); + if (i->n_ref <= 1) + return bus_track_remove_name_fully(track, name); + + track_item_unref(i); + + return 1; +} + +_public_ unsigned sd_bus_track_count(sd_bus_track *track) { + + if (!track) /* Let's consider a NULL object equivalent to an empty object */ + return 0; + + /* This signature really should have returned an int, so that we can propagate errors. But well, ... Also, note + * that this returns the number of names being watched, and multiple references to the same name are not + * counted. */ + + return hashmap_size(track->names); +} + +_public_ const char* sd_bus_track_contains(sd_bus_track *track, const char *name) { + assert_return(name, NULL); + + if (!track) /* Let's consider a NULL object equivalent to an empty object */ + return NULL; + + return hashmap_contains(track->names, name) ? 
name : NULL; +} + +_public_ const char* sd_bus_track_first(sd_bus_track *track) { /* Restarts the iteration and returns the first tracked name, or NULL if there is none. */ + const char *n = NULL; + + if (!track) + return NULL; + + track->modified = false; + track->iterator = ITERATOR_FIRST; + + (void) hashmap_iterate(track->names, &track->iterator, NULL, (const void**) &n); + return n; +} + +_public_ const char* sd_bus_track_next(sd_bus_track *track) { + const char *n = NULL; + + if (!track) + return NULL; + + if (track->modified) /* names were added or removed since sd_bus_track_first(): stop iterating */ + return NULL; + + (void) hashmap_iterate(track->names, &track->iterator, NULL, (const void**) &n); + return n; +} + +_public_ int sd_bus_track_add_sender(sd_bus_track *track, sd_bus_message *m) { + const char *sender; + + assert_return(track, -EINVAL); + assert_return(m, -EINVAL); + + if (sd_bus_message_get_bus(m) != track->bus) /* the message must belong to the bus this track object is attached to */ + return -EINVAL; + + sender = sd_bus_message_get_sender(m); + if (!sender) + return -EINVAL; + + return sd_bus_track_add_name(track, sender); +} + +_public_ int sd_bus_track_remove_sender(sd_bus_track *track, sd_bus_message *m) { + const char *sender; + + assert_return(m, -EINVAL); + + if (!track) /* Treat a NULL track object as an empty track object */ + return 0; + + if (sd_bus_message_get_bus(m) != track->bus) + return -EINVAL; + + sender = sd_bus_message_get_sender(m); + if (!sender) + return -EINVAL; + + return sd_bus_track_remove_name(track, sender); +} + +_public_ sd_bus* sd_bus_track_get_bus(sd_bus_track *track) { + assert_return(track, NULL); + + return track->bus; +} + +void bus_track_dispatch(sd_bus_track *track) { + int r; + + assert(track); + assert(track->handler); + + bus_track_remove_from_queue(track); + + sd_bus_track_ref(track); /* hold a reference while the handler runs, dropped again below */ + + r = track->handler(track, track->userdata); + if (r < 0) + log_debug_errno(r, "Failed to process track handler: %m"); + else if (r == 0) /* handler returned 0: try to queue the object again, subject to the checks in bus_track_add_to_queue() */ + bus_track_add_to_queue(track); + + sd_bus_track_unref(track); +} + +void bus_track_close(sd_bus_track *track) { + assert(track); + + /* Called whenever our bus connection is closed. 
If so, and our track object is non-empty, dispatch it + * immediately, as we are closing now, but first flush out all names. */ + + if (!track->in_list) + return; /* We already closed this one, don't close it again. */ + + /* Remember that this one is closed now */ + LIST_REMOVE(tracks, track->bus->tracks, track); + track->in_list = false; + + /* If there's no name in this one anyway, we don't have to dispatch */ + if (hashmap_isempty(track->names)) + return; + + /* Let's flush out all names */ + hashmap_clear(track->names); + + /* Invoke handler */ + if (track->handler) + bus_track_dispatch(track); +} + +_public_ void *sd_bus_track_get_userdata(sd_bus_track *track) { + assert_return(track, NULL); + + return track->userdata; +} + +_public_ void *sd_bus_track_set_userdata(sd_bus_track *track, void *userdata) { + void *ret; + + assert_return(track, NULL); + + ret = track->userdata; + track->userdata = userdata; + + return ret; /* the previous userdata pointer */ +} + +_public_ int sd_bus_track_set_destroy_callback(sd_bus_track *track, sd_bus_destroy_t callback) { + assert_return(track, -EINVAL); + + track->destroy_callback = callback; + return 0; +} + +_public_ int sd_bus_track_get_destroy_callback(sd_bus_track *track, sd_bus_destroy_t *ret) { + assert_return(track, -EINVAL); + + if (ret) + *ret = track->destroy_callback; + + return !!track->destroy_callback; /* 1 if a callback is set, 0 if not */ +} + +_public_ int sd_bus_track_set_recursive(sd_bus_track *track, int b) { + assert_return(track, -EINVAL); + + if (track->recursive == !!b) + return 0; + + if (!hashmap_isempty(track->names)) /* the mode can only be switched while no names are tracked */ + return -EBUSY; + + track->recursive = b; + return 0; +} + +_public_ int sd_bus_track_get_recursive(sd_bus_track *track) { + assert_return(track, -EINVAL); + + return track->recursive; +} + +_public_ int sd_bus_track_count_sender(sd_bus_track *track, sd_bus_message *m) { + const char *sender; + + assert_return(m, -EINVAL); + + if (!track) /* Let's consider a NULL object equivalent to an empty object */ + return 0; + + if (sd_bus_message_get_bus(m) != 
track->bus) + return -EINVAL; + + sender = sd_bus_message_get_sender(m); + if (!sender) + return -EINVAL; + + return sd_bus_track_count_name(track, sender); +} + +_public_ int sd_bus_track_count_name(sd_bus_track *track, const char *name) { + struct track_item *i; + + assert_return(service_name_is_valid(name), -EINVAL); + + if (!track) /* Let's consider a NULL object equivalent to an empty object */ + return 0; + + i = hashmap_get(track->names, name); + if (!i) + return 0; + + return i->n_ref; /* number of references currently held on this name */ +} diff --git a/src/libsystemd/sd-bus/bus-track.h b/src/libsystemd/sd-bus/bus-track.h new file mode 100644 index 0000000..8dae1f3 --- /dev/null +++ b/src/libsystemd/sd-bus/bus-track.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +void bus_track_dispatch(sd_bus_track *track); +void bus_track_close(sd_bus_track *track); diff --git a/src/libsystemd/sd-bus/bus-type.c b/src/libsystemd/sd-bus/bus-type.c new file mode 100644 index 0000000..6a0f53d --- /dev/null +++ b/src/libsystemd/sd-bus/bus-type.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-bus.h" + +#include "bus-internal.h" +#include "bus-type.h" + +bool bus_type_is_valid(char c) { /* true if c is one of the type codes listed below */ + static const char valid[] = { + SD_BUS_TYPE_BYTE, + SD_BUS_TYPE_BOOLEAN, + SD_BUS_TYPE_INT16, + SD_BUS_TYPE_UINT16, + SD_BUS_TYPE_INT32, + SD_BUS_TYPE_UINT32, + SD_BUS_TYPE_INT64, + SD_BUS_TYPE_UINT64, + SD_BUS_TYPE_DOUBLE, + SD_BUS_TYPE_STRING, + SD_BUS_TYPE_OBJECT_PATH, + SD_BUS_TYPE_SIGNATURE, + SD_BUS_TYPE_ARRAY, + SD_BUS_TYPE_VARIANT, + SD_BUS_TYPE_STRUCT, + SD_BUS_TYPE_DICT_ENTRY, + SD_BUS_TYPE_UNIX_FD + }; + + return !!memchr(valid, c, sizeof(valid)); +} + +bool bus_type_is_basic(char c) { /* like bus_type_is_valid(), but without the four container type codes */ + static const char valid[] = { + SD_BUS_TYPE_BYTE, + SD_BUS_TYPE_BOOLEAN, + SD_BUS_TYPE_INT16, + SD_BUS_TYPE_UINT16, + SD_BUS_TYPE_INT32, + SD_BUS_TYPE_UINT32, + SD_BUS_TYPE_INT64, + SD_BUS_TYPE_UINT64, + SD_BUS_TYPE_DOUBLE, + SD_BUS_TYPE_STRING, + 
SD_BUS_TYPE_OBJECT_PATH, + SD_BUS_TYPE_SIGNATURE, + SD_BUS_TYPE_UNIX_FD + }; + + return !!memchr(valid, c, sizeof(valid)); +} + +bool bus_type_is_trivial(char c) { /* the numeric types and boolean only: no strings, object paths, signatures or fds */ + static const char valid[] = { + SD_BUS_TYPE_BYTE, + SD_BUS_TYPE_BOOLEAN, + SD_BUS_TYPE_INT16, + SD_BUS_TYPE_UINT16, + SD_BUS_TYPE_INT32, + SD_BUS_TYPE_UINT32, + SD_BUS_TYPE_INT64, + SD_BUS_TYPE_UINT64, + SD_BUS_TYPE_DOUBLE + }; + + return !!memchr(valid, c, sizeof(valid)); +} + +bool bus_type_is_container(char c) { + static const char valid[] = { + SD_BUS_TYPE_ARRAY, + SD_BUS_TYPE_VARIANT, + SD_BUS_TYPE_STRUCT, + SD_BUS_TYPE_DICT_ENTRY + }; + + return !!memchr(valid, c, sizeof(valid)); +} + +int bus_type_get_alignment(char c) { /* Returns the alignment (1, 2, 4 or 8) for type code c, or -EINVAL for a code not handled below. */ + + switch (c) { + case SD_BUS_TYPE_BYTE: + case SD_BUS_TYPE_SIGNATURE: + case SD_BUS_TYPE_VARIANT: + return 1; + + case SD_BUS_TYPE_INT16: + case SD_BUS_TYPE_UINT16: + return 2; + + case SD_BUS_TYPE_BOOLEAN: + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: + case SD_BUS_TYPE_STRING: + case SD_BUS_TYPE_OBJECT_PATH: + case SD_BUS_TYPE_ARRAY: + case SD_BUS_TYPE_UNIX_FD: + return 4; + + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: + case SD_BUS_TYPE_DOUBLE: + case SD_BUS_TYPE_STRUCT: + case SD_BUS_TYPE_STRUCT_BEGIN: + case SD_BUS_TYPE_DICT_ENTRY: + case SD_BUS_TYPE_DICT_ENTRY_BEGIN: + return 8; + } + + return -EINVAL; +} + +int bus_type_get_size(char c) { /* Returns the size (1, 2, 4 or 8) of the fixed-size types handled below, or -EINVAL for any other type code. */ + + switch (c) { + case SD_BUS_TYPE_BYTE: + return 1; + + case SD_BUS_TYPE_INT16: + case SD_BUS_TYPE_UINT16: + return 2; + + case SD_BUS_TYPE_BOOLEAN: + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: + case SD_BUS_TYPE_UNIX_FD: + return 4; + + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: + case SD_BUS_TYPE_DOUBLE: + return 8; + } + + return -EINVAL; +} + +_public_ int sd_bus_interface_name_is_valid(const char *p) { /* public wrapper: -EINVAL for NULL, otherwise the result of interface_name_is_valid() */ + assert_return(p, -EINVAL); + + return interface_name_is_valid(p); +} + +_public_ int sd_bus_service_name_is_valid(const char *p) { + assert_return(p, -EINVAL); + + return service_name_is_valid(p); 
+} + +_public_ int sd_bus_member_name_is_valid(const char *p) { /* public wrapper: -EINVAL for NULL, otherwise the result of member_name_is_valid() */ + assert_return(p, -EINVAL); + + return member_name_is_valid(p); +} + +_public_ int sd_bus_object_path_is_valid(const char *p) { /* public wrapper: -EINVAL for NULL, otherwise the result of object_path_is_valid() */ + assert_return(p, -EINVAL); + + return object_path_is_valid(p); +} diff --git a/src/libsystemd/sd-bus/bus-type.h b/src/libsystemd/sd-bus/bus-type.h new file mode 100644 index 0000000..490108a --- /dev/null +++ b/src/libsystemd/sd-bus/bus-type.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +bool bus_type_is_valid(char c) _const_; +bool bus_type_is_basic(char c) _const_; +/* "trivial" is systemd's term for what the D-Bus Specification calls + * a "fixed type": that is, a basic type of fixed length */ +bool bus_type_is_trivial(char c) _const_; +bool bus_type_is_container(char c) _const_; + +int bus_type_get_alignment(char c) _const_; +int bus_type_get_size(char c) _const_; diff --git a/src/libsystemd/sd-bus/fuzz-bus-match.c b/src/libsystemd/sd-bus/fuzz-bus-match.c new file mode 100644 index 0000000..16da534 --- /dev/null +++ b/src/libsystemd/sd-bus/fuzz-bus-match.c @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-match.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fuzz.h" +#include "memstream-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Parses each line of the input as a match expression, adds it to a match tree and dumps the tree at the end. */ + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + FILE *g = NULL; + int r; + + if (outside_size_range(size, 0, 65536)) /* same limit as max_len in fuzz-bus-match.options */ + return 0; + + fuzz_setup_logging(); + + r = sd_bus_new(&bus); + assert_se(r >= 0); + + _cleanup_(bus_match_free) struct bus_match_node root = { + .type = BUS_MATCH_ROOT, + }; + + /* Note that we use the pointer to match_callback substructure, but the code + * uses container_of() to access outside of the passed-in type. 
*/ + sd_bus_slot slot = { + .type = BUS_MATCH_CALLBACK, + .match_callback = {}, + }; + + if (getenv_bool("SYSTEMD_FUZZ_OUTPUT") <= 0) /* unless output was requested, write to an in-memory stream; stdout is used below when g stays NULL */ + assert_se(g = memstream_init(&m)); + + for (size_t offset = 0; offset < size; ) { + _cleanup_free_ char *line = NULL; + char *end; + + end = memchr((char*) data + offset, '\n', size - offset); + + line = memdup_suffix0((char*) data + offset, + end ? end - (char*) data - offset : size - offset); /* the line without its newline, or the remainder of the input if there is none */ + if (!line) + return log_oom_debug(); + + offset = end ? (size_t) (end - (char*) data + 1) : size; /* continue after the newline, or finish the loop */ + + struct bus_match_component *components; + size_t n_components; + r = bus_match_parse(line, &components, &n_components); + if (IN_SET(r, -EINVAL, -ENOMEM)) { + log_debug_errno(r, "Failed to parse line: %m"); + continue; + } + assert_se(r >= 0); /* We only expect EINVAL and ENOMEM errors, or success. */ + + CLEANUP_ARRAY(components, n_components, bus_match_parse_free); + + log_debug("Parsed %zu components.", n_components); + + _cleanup_free_ char *again = bus_match_to_string(components, n_components); + if (!again) { + log_oom(); + break; + } + + if (g) + fprintf(g, "%s\n", again); + + r = bus_match_add(&root, components, n_components, &slot.match_callback); + if (r < 0) { + log_error_errno(r, "Failed to add match: %m"); + break; + } + } + + bus_match_dump(g ?: stdout, &root, 0); /* We do this even on failure, to check consistency after error. 
*/ + bus_match_free(&root); /* NOTE(review): root is also declared with _cleanup_(bus_match_free), so this runs once more at scope exit; presumably harmless on an already emptied tree, confirm */ + + return 0; +} diff --git a/src/libsystemd/sd-bus/fuzz-bus-match.options b/src/libsystemd/sd-bus/fuzz-bus-match.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/libsystemd/sd-bus/fuzz-bus-match.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/libsystemd/sd-bus/fuzz-bus-message.c b/src/libsystemd/sd-bus/fuzz-bus-message.c new file mode 100644 index 0000000..ca7091e --- /dev/null +++ b/src/libsystemd/sd-bus/fuzz-bus-message.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-dump.h" +#include "bus-message.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fuzz.h" +#include "memstream-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Parses the input as one serialized bus message, dumps it and rewinds it. */ + _cleanup_(memstream_done) MemStream ms = {}; + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ void *buffer = NULL; + FILE *g = NULL; + int r; + + fuzz_setup_logging(); + + r = sd_bus_new(&bus); + assert_se(r >= 0); + + assert_se(buffer = memdup(data, size)); + + r = bus_message_from_malloc(bus, buffer, size, NULL, 0, NULL, &m); + if (r == -EBADMSG) /* invalid input is an expected outcome; buffer is still ours and gets freed by _cleanup_free_ */ + return 0; + assert_se(r >= 0); + TAKE_PTR(buffer); /* on success we must not free the buffer ourselves anymore (presumably owned by m now) */ + + if (getenv_bool("SYSTEMD_FUZZ_OUTPUT") <= 0) + assert_se(g = memstream_init(&ms)); + + sd_bus_message_dump(m, g ?: stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + r = sd_bus_message_rewind(m, true); + assert_se(r >= 0); + + return 0; +} diff --git a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c new file mode 100644 index 0000000..8befc97 --- /dev/null +++ b/src/libsystemd/sd-bus/sd-bus.c @@ -0,0 +1,4441 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "bus-container.h" +#include 
"bus-control.h" +#include "bus-internal.h" +#include "bus-kernel.h" +#include "bus-label.h" +#include "bus-message.h" +#include "bus-objects.h" +#include "bus-protocol.h" +#include "bus-slot.h" +#include "bus-socket.h" +#include "bus-track.h" +#include "bus-type.h" +#include "cgroup-util.h" +#include "constants.h" +#include "errno-util.h" +#include "fd-util.h" +#include "glyph-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "io-util.h" +#include "macro.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "origin-id.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +#define log_debug_bus_message(m) \ + do { \ + sd_bus_message *_mm = (m); \ + log_debug("Got message type=%s sender=%s destination=%s path=%s interface=%s member=%s " \ + " cookie=%" PRIu64 " reply_cookie=%" PRIu64 \ + " signature=%s error-name=%s error-message=%s", \ + strna(bus_message_type_to_string(_mm->header->type)), \ + strna(sd_bus_message_get_sender(_mm)), \ + strna(sd_bus_message_get_destination(_mm)), \ + strna(sd_bus_message_get_path(_mm)), \ + strna(sd_bus_message_get_interface(_mm)), \ + strna(sd_bus_message_get_member(_mm)), \ + BUS_MESSAGE_COOKIE(_mm), \ + _mm->reply_cookie, \ + strna(_mm->root_container.signature), \ + strna(_mm->error.name), \ + strna(_mm->error.message)); \ + } while (false) + +static int bus_poll(sd_bus *bus, bool need_more, uint64_t timeout_usec); +static void bus_detach_io_events(sd_bus *b); + +static thread_local sd_bus *default_system_bus = NULL; /* the three per-thread cached default connections, handed out by bus_choose_default() and bus_resolve() */ +static thread_local sd_bus *default_user_bus = NULL; +static thread_local sd_bus *default_starter_bus = NULL; + +static sd_bus **bus_choose_default(int (**bus_open)(sd_bus **)) { + const char *e; + + /* Let's try our best to reuse another cached connection. 
If + * the starter bus type is set, connect via our normal + * connection logic, ignoring $DBUS_STARTER_ADDRESS, so that + * we can share the connection with the user/system default + * bus. */ + + e = secure_getenv("DBUS_STARTER_BUS_TYPE"); + if (e) { + if (streq(e, "system")) { + if (bus_open) + *bus_open = sd_bus_open_system; + return &default_system_bus; + } else if (STR_IN_SET(e, "user", "session")) { + if (bus_open) + *bus_open = sd_bus_open_user; + return &default_user_bus; + } + } + + /* No type is specified, so we have no other option than to + * use the starter address if it is set. */ + e = secure_getenv("DBUS_STARTER_ADDRESS"); + if (e) { + if (bus_open) + *bus_open = sd_bus_open; + return &default_starter_bus; + } + + /* Finally, if nothing is set use the cached connection for + * the right scope */ + + if (cg_pid_get_owner_uid(0, NULL) >= 0) { + if (bus_open) + *bus_open = sd_bus_open_user; + return &default_user_bus; + } else { + if (bus_open) + *bus_open = sd_bus_open_system; + return &default_system_bus; + } +} + +sd_bus *bus_resolve(sd_bus *bus) { /* Maps the SD_BUS_DEFAULT* placeholder values to the matching cached per-thread connection (which may be NULL); any other pointer is returned unchanged. */ + switch ((uintptr_t) bus) { + case (uintptr_t) SD_BUS_DEFAULT: + return *(bus_choose_default(NULL)); + case (uintptr_t) SD_BUS_DEFAULT_USER: + return default_user_bus; + case (uintptr_t) SD_BUS_DEFAULT_SYSTEM: + return default_system_bus; + default: + return bus; + } +} + +void bus_close_io_fds(sd_bus *b) { + assert(b); + + bus_detach_io_events(b); + + if (b->input_fd != b->output_fd) /* close the output fd separately only if it is a different descriptor */ + safe_close(b->output_fd); + b->output_fd = b->input_fd = safe_close(b->input_fd); +} + +void bus_close_inotify_fd(sd_bus *b) { + assert(b); + + b->inotify_event_source = sd_event_source_disable_unref(b->inotify_event_source); + + b->inotify_fd = safe_close(b->inotify_fd); + b->inotify_watches = mfree(b->inotify_watches); + b->n_inotify_watches = 0; +} + +static void bus_reset_queues(sd_bus *b) { /* Drops all messages still in the read and write queues and frees both queue arrays. */ + assert(b); + + while (b->rqueue_size > 0) + bus_message_unref_queued(b->rqueue[--b->rqueue_size], b); + + b->rqueue = 
mfree(b->rqueue); + + while (b->wqueue_size > 0) + bus_message_unref_queued(b->wqueue[--b->wqueue_size], b); + + b->wqueue = mfree(b->wqueue); +} + +static sd_bus* bus_free(sd_bus *b) { /* Releases everything the bus object holds and frees it. The asserts require that no track objects are attached anymore. */ + sd_bus_slot *s; + + assert(b); + assert(!b->track_queue); + assert(!b->tracks); + + b->state = BUS_CLOSED; + + sd_bus_detach_event(b); + + while ((s = b->slots)) { + /* At this point only floating slots can still be + * around, because the non-floating ones keep a + * reference to the bus, and we thus couldn't be + * destructing right now... We forcibly disconnect the + * slots here, so that they still can be referenced by + * apps, but are dead. */ + + assert(s->floating); + bus_slot_disconnect(s, true); + } + + if (b->default_bus_ptr) /* presumably points at one of the thread-local default_*_bus variables; reset it so that it does not dangle */ + *b->default_bus_ptr = NULL; + + bus_close_io_fds(b); + bus_close_inotify_fd(b); + + free(b->label); + free(b->groups); + free(b->rbuffer); + free(b->unique_name); + free(b->auth_buffer); + free(b->address); + free(b->machine); + free(b->description); + free(b->patch_sender); + + free(b->exec_path); + strv_free(b->exec_argv); + + close_many(b->fds, b->n_fds); /* fds that were received but not handed over to a message yet */ + free(b->fds); + + bus_reset_queues(b); + + ordered_hashmap_free_free(b->reply_callbacks); + prioq_free(b->reply_callbacks_prioq); + + assert(b->match_callbacks.type == BUS_MATCH_ROOT); + bus_match_free(&b->match_callbacks); + + hashmap_free_free(b->vtable_methods); + hashmap_free_free(b->vtable_properties); + + assert(hashmap_isempty(b->nodes)); + hashmap_free(b->nodes); + + bus_flush_memfd(b); + + assert_se(pthread_mutex_destroy(&b->memfd_cache_mutex) == 0); + + return mfree(b); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(sd_bus*, bus_free); + +DEFINE_ORIGIN_ID_HELPERS(sd_bus, bus); + +_public_ int sd_bus_new(sd_bus **ret) { /* Allocates a bus object with default settings. Returns 0 on success, -EINVAL for a NULL ret, -ENOMEM on allocation failure. */ + _cleanup_free_ sd_bus *b = NULL; + + assert_return(ret, -EINVAL); + + b = new(sd_bus, 1); + if (!b) + return -ENOMEM; + + *b = (sd_bus) { + .n_ref = 1, + .input_fd = -EBADF, + .output_fd = -EBADF, + .inotify_fd = -EBADF, + .message_version = 1, + .creds_mask = 
SD_BUS_CREDS_WELL_KNOWN_NAMES|SD_BUS_CREDS_UNIQUE_NAME, + .accept_fd = true, + .origin_id = origin_id_query(), + .n_groups = SIZE_MAX, + .close_on_exit = true, + .ucred = UCRED_INVALID, + .runtime_scope = _RUNTIME_SCOPE_INVALID, + }; + + /* We guarantee that wqueue always has space for at least one entry */ + if (!GREEDY_REALLOC(b->wqueue, 1)) + return -ENOMEM; + + assert_se(pthread_mutex_init(&b->memfd_cache_mutex, NULL) == 0); + + *ret = TAKE_PTR(b); /* success: hand the object to the caller and disarm _cleanup_free_ */ + return 0; +} + +_public_ int sd_bus_set_address(sd_bus *bus, const char *address) { /* Stores a copy of the address. Only permitted while the bus is still in BUS_UNSET state. */ + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(address, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return free_and_strdup(&bus->address, address); +} + +_public_ int sd_bus_set_fd(sd_bus *bus, int input_fd, int output_fd) { /* Records the two fds to use; both may be the same descriptor, see bus_close_io_fds(). */ + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(input_fd >= 0, -EBADF); + assert_return(output_fd >= 0, -EBADF); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->input_fd = input_fd; + bus->output_fd = output_fd; + return 0; +} + +_public_ int sd_bus_set_exec(sd_bus *bus, const char *path, char *const *argv) { + _cleanup_strv_free_ char **a = NULL; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(path, -EINVAL); + assert_return(!strv_isempty(argv), -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + a = strv_copy(argv); /* private copy; it replaces bus->exec_argv only in the final step below */ + if (!a) + return -ENOMEM; + + r = free_and_strdup(&bus->exec_path, path); + if (r < 0) + return r; + + return strv_free_and_replace(bus->exec_argv, a); +} + +_public_ int sd_bus_set_bus_client(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + 
assert_return(!bus->patch_sender, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->bus_client = !!b; + return 0; +} + +_public_ int sd_bus_set_monitor(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->is_monitor = !!b; + return 0; +} + +_public_ int sd_bus_negotiate_fds(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->accept_fd = !!b; + return 0; +} + +_public_ int sd_bus_negotiate_timestamp(sd_bus *bus, int b) { /* Unlike the setters above this is permitted in any state except BUS_CLOSING and BUS_CLOSED. */ + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!IN_SET(bus->state, BUS_CLOSING, BUS_CLOSED), -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + /* This is not actually supported by any of our transports these days, but we do honour it for synthetic + * replies, and maybe one day classic D-Bus learns this too */ + bus->attach_timestamp = !!b; + + return 0; +} + +_public_ int sd_bus_negotiate_creds(sd_bus *bus, int b, uint64_t mask) { /* Sets (b != 0) or clears (b == 0) the bits of mask in bus->creds_mask. */ + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(mask <= _SD_BUS_CREDS_ALL, -EINVAL); + assert_return(!IN_SET(bus->state, BUS_CLOSING, BUS_CLOSED), -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + SET_FLAG(bus->creds_mask, mask, b); + + /* The well-known names and the unique name we need unconditionally, so that matches can work */ + bus->creds_mask |= SD_BUS_CREDS_WELL_KNOWN_NAMES|SD_BUS_CREDS_UNIQUE_NAME; + + return 0; +} + +_public_ int sd_bus_set_server(sd_bus *bus, int b, sd_id128_t server_id) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(b || sd_id128_equal(server_id, SD_ID128_NULL), -EINVAL); /* a non-null server ID may only be passed when enabling server mode */ + assert_return(bus->state == 
BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->is_server = !!b; + bus->server_id = server_id; + return 0; +} + +_public_ int sd_bus_set_anonymous(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->anonymous_auth = !!b; + return 0; +} + +_public_ int sd_bus_set_trusted(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->trusted = !!b; + return 0; +} + +_public_ int sd_bus_set_description(sd_bus *bus, const char *description) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return free_and_strdup(&bus->description, description); +} + +_public_ int sd_bus_set_allow_interactive_authorization(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->allow_interactive_authorization = !!b; + return 0; +} + +_public_ int sd_bus_get_allow_interactive_authorization(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->allow_interactive_authorization; +} + +_public_ int sd_bus_set_watch_bind(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->watch_bind = !!b; + return 0; +} + +_public_ int sd_bus_get_watch_bind(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + 
assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->watch_bind; +} + +_public_ int sd_bus_set_connected_signal(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus->connected_signal = !!b; + return 0; +} + +_public_ int sd_bus_get_connected_signal(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->connected_signal; +} + +static int synthesize_connected_signal(sd_bus *bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + + /* If enabled, synthesizes a local "Connected" signal mirroring the local "Disconnected" signal. This is called + * whenever we fully established a connection, i.e. after the authorization phase, and after receiving the + * Hello() reply. Or in other words, whenever we enter BUS_RUNNING state. + * + * This is useful so that clients can start doing stuff whenever the connection is fully established in a way + * that works independently from whether we connected to a full bus or just a direct connection. 
*/ + + if (!bus->connected_signal) + return 0; + + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/DBus/Local", + "org.freedesktop.DBus.Local", + "Connected"); + if (r < 0) + return r; + + bus_message_set_sender_local(bus, m); + m->read_counter = ++bus->read_counter; + + r = bus_seal_synthetic_message(bus, m); + if (r < 0) + return r; + + r = bus_rqueue_make_room(bus); + if (r < 0) + return r; + + /* Insert at the very front */ + memmove(bus->rqueue + 1, bus->rqueue, sizeof(sd_bus_message*) * bus->rqueue_size); + bus->rqueue[0] = bus_message_ref_queued(m, bus); + bus->rqueue_size++; + + return 0; +} + +void bus_set_state(sd_bus *bus, enum bus_state state) { + static const char* const table[_BUS_STATE_MAX] = { + [BUS_UNSET] = "UNSET", + [BUS_WATCH_BIND] = "WATCH_BIND", + [BUS_OPENING] = "OPENING", + [BUS_AUTHENTICATING] = "AUTHENTICATING", + [BUS_HELLO] = "HELLO", + [BUS_RUNNING] = "RUNNING", + [BUS_CLOSING] = "CLOSING", + [BUS_CLOSED] = "CLOSED", + }; + + assert(bus); + assert(state < _BUS_STATE_MAX); + + if (state == bus->state) + return; + + log_debug("Bus %s: changing state %s %s %s", strna(bus->description), + table[bus->state], special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), table[state]); + bus->state = state; +} + +static int hello_callback(sd_bus_message *reply, void *userdata, sd_bus_error *error) { + const char *s; + sd_bus *bus; + int r; + + assert(reply); + bus = reply->bus; + assert(bus); + assert(IN_SET(bus->state, BUS_HELLO, BUS_CLOSING)); + + r = sd_bus_message_get_errno(reply); + if (r > 0) { + r = -r; + goto fail; + } + + r = sd_bus_message_read(reply, "s", &s); + if (r < 0) + goto fail; + + if (!service_name_is_valid(s) || s[0] != ':') { + r = -EBADMSG; + goto fail; + } + + r = free_and_strdup(&bus->unique_name, s); + if (r < 0) + goto fail; + + if (bus->state == BUS_HELLO) { + bus_set_state(bus, BUS_RUNNING); + + r = synthesize_connected_signal(bus); + if (r < 0) + goto fail; + } + + return 1; + +fail: + /* When Hello() failed, let's 
propagate this in two ways: first we return the error immediately here, + * which is the propagated up towards the event loop. Let's also invalidate the connection, so that + * if the user then calls back into us again we won't wait any longer. */ + + bus_set_state(bus, BUS_CLOSING); + return r; +} + +static int bus_send_hello(sd_bus *bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + + if (!bus->bus_client) + return 0; + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "Hello"); + if (r < 0) + return r; + + return sd_bus_call_async(bus, NULL, m, hello_callback, NULL, 0); +} + +int bus_start_running(sd_bus *bus) { + struct reply_callback *c; + usec_t n; + int r; + + assert(bus); + assert(bus->state < BUS_HELLO); + + /* We start all method call timeouts when we enter BUS_HELLO or BUS_RUNNING mode. At this point let's convert + * all relative to absolute timestamps. Note that we do not reshuffle the reply callback priority queue since + * adding a fixed value to all entries should not alter the internal order. 
*/ + + n = now(CLOCK_MONOTONIC); + ORDERED_HASHMAP_FOREACH(c, bus->reply_callbacks) { + if (c->timeout_usec == 0) + continue; + + c->timeout_usec = usec_add(n, c->timeout_usec); + } + + if (bus->bus_client) { + bus_set_state(bus, BUS_HELLO); + return 1; + } + + bus_set_state(bus, BUS_RUNNING); + + r = synthesize_connected_signal(bus); + if (r < 0) + return r; + + return 1; +} + +static int parse_address_key(const char **p, const char *key, char **value) { + _cleanup_free_ char *r = NULL; + size_t l, n = 0; + const char *a; + + assert(p); + assert(*p); + assert(value); + + if (key) { + l = strlen(key); + if (strncmp(*p, key, l) != 0) + return 0; + + if ((*p)[l] != '=') + return 0; + + if (*value) + return -EINVAL; + + a = *p + l + 1; + } else + a = *p; + + while (!IN_SET(*a, ';', ',', 0)) { + char c; + + if (*a == '%') { + int x, y; + + x = unhexchar(a[1]); + if (x < 0) + return x; + + y = unhexchar(a[2]); + if (y < 0) + return y; + + c = (char) ((x << 4) | y); + a += 3; + } else { + c = *a; + a++; + } + + if (!GREEDY_REALLOC(r, n + 2)) + return -ENOMEM; + + r[n++] = c; + } + + if (!r) { + r = strdup(""); + if (!r) + return -ENOMEM; + } else + r[n] = 0; + + if (*a == ',') + a++; + + *p = a; + + free_and_replace(*value, r); + + return 1; +} + +static void skip_address_key(const char **p) { + assert(p); + assert(*p); + + *p += strcspn(*p, ","); + + if (**p == ',') + (*p)++; +} + +static int parse_unix_address(sd_bus *b, const char **p, char **guid) { + _cleanup_free_ char *path = NULL, *abstract = NULL; + size_t l; + int r; + + assert(b); + assert(p); + assert(*p); + assert(guid); + + while (!IN_SET(**p, 0, ';')) { + r = parse_address_key(p, "guid", guid); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "path", &path); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "abstract", &abstract); + if (r < 0) + return r; + else if (r > 0) + continue; + + skip_address_key(p); + } + + if (!path && 
!abstract) + return -EINVAL; + + if (path && abstract) + return -EINVAL; + + if (path) { + l = strlen(path); + if (l >= sizeof(b->sockaddr.un.sun_path)) /* We insist on NUL termination */ + return -E2BIG; + + b->sockaddr.un = (struct sockaddr_un) { + .sun_family = AF_UNIX, + }; + + memcpy(b->sockaddr.un.sun_path, path, l); + b->sockaddr_size = offsetof(struct sockaddr_un, sun_path) + l + 1; + + } else { + assert(abstract); + + l = strlen(abstract); + if (l >= sizeof(b->sockaddr.un.sun_path) - 1) /* We insist on NUL termination */ + return -E2BIG; + + b->sockaddr.un = (struct sockaddr_un) { + .sun_family = AF_UNIX, + }; + + memcpy(b->sockaddr.un.sun_path+1, abstract, l); + b->sockaddr_size = offsetof(struct sockaddr_un, sun_path) + 1 + l; + } + + b->is_local = true; + + return 0; +} + +static int parse_tcp_address(sd_bus *b, const char **p, char **guid) { + _cleanup_free_ char *host = NULL, *port = NULL, *family = NULL; + int r; + struct addrinfo *result, hints = { + .ai_socktype = SOCK_STREAM, + }; + + assert(b); + assert(p); + assert(*p); + assert(guid); + + while (!IN_SET(**p, 0, ';')) { + r = parse_address_key(p, "guid", guid); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "host", &host); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "port", &port); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "family", &family); + if (r < 0) + return r; + else if (r > 0) + continue; + + skip_address_key(p); + } + + if (!host || !port) + return -EINVAL; + + if (family) { + hints.ai_family = af_from_ipv4_ipv6(family); + if (hints.ai_family == AF_UNSPEC) + return -EINVAL; + } + + r = getaddrinfo(host, port, &hints, &result); + if (r == EAI_SYSTEM) + return -errno; + else if (r != 0) + return -EADDRNOTAVAIL; + + memcpy(&b->sockaddr, result->ai_addr, result->ai_addrlen); + b->sockaddr_size = result->ai_addrlen; + + freeaddrinfo(result); + + b->is_local = false; + + 
return 0; +} + +static int parse_exec_address(sd_bus *b, const char **p, char **guid) { + char *path = NULL; + unsigned n_argv = 0, j; + char **argv = NULL; + int r; + + assert(b); + assert(p); + assert(*p); + assert(guid); + + while (!IN_SET(**p, 0, ';')) { + r = parse_address_key(p, "guid", guid); + if (r < 0) + goto fail; + else if (r > 0) + continue; + + r = parse_address_key(p, "path", &path); + if (r < 0) + goto fail; + else if (r > 0) + continue; + + if (startswith(*p, "argv")) { + unsigned ul; + + errno = 0; + ul = strtoul(*p + 4, (char**) p, 10); + if (errno > 0 || **p != '=' || ul > 256) { + r = -EINVAL; + goto fail; + } + + (*p)++; + + if (ul >= n_argv) { + if (!GREEDY_REALLOC0(argv, ul + 2)) { + r = -ENOMEM; + goto fail; + } + + n_argv = ul + 1; + } + + r = parse_address_key(p, NULL, argv + ul); + if (r < 0) + goto fail; + + continue; + } + + skip_address_key(p); + } + + if (!path) { + r = -EINVAL; + goto fail; + } + + /* Make sure there are no holes in the array, with the + * exception of argv[0] */ + for (j = 1; j < n_argv; j++) + if (!argv[j]) { + r = -EINVAL; + goto fail; + } + + if (argv && argv[0] == NULL) { + argv[0] = strdup(path); + if (!argv[0]) { + r = -ENOMEM; + goto fail; + } + } + + b->exec_path = path; + b->exec_argv = argv; + + b->is_local = false; + + return 0; + +fail: + for (j = 0; j < n_argv; j++) + free(argv[j]); + + free(argv); + free(path); + return r; +} + +static int parse_container_unix_address(sd_bus *b, const char **p, char **guid) { + _cleanup_free_ char *machine = NULL, *pid = NULL; + int r; + + assert(b); + assert(p); + assert(*p); + assert(guid); + + while (!IN_SET(**p, 0, ';')) { + r = parse_address_key(p, "guid", guid); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "machine", &machine); + if (r < 0) + return r; + else if (r > 0) + continue; + + r = parse_address_key(p, "pid", &pid); + if (r < 0) + return r; + else if (r > 0) + continue; + + skip_address_key(p); + } + + if (!machine 
== !pid) + return -EINVAL; + + if (machine) { + if (!hostname_is_valid(machine, VALID_HOSTNAME_DOT_HOST)) + return -EINVAL; + + free_and_replace(b->machine, machine); + } else + b->machine = mfree(b->machine); + + if (pid) { + r = parse_pid(pid, &b->nspid); + if (r < 0) + return r; + } else + b->nspid = 0; + + b->sockaddr.un = (struct sockaddr_un) { + .sun_family = AF_UNIX, + /* Note that we use the old /var/run prefix here, to increase compatibility with really old containers */ + .sun_path = "/var/run/dbus/system_bus_socket", + }; + b->sockaddr_size = SOCKADDR_UN_LEN(b->sockaddr.un); + b->is_local = false; + + return 0; +} + +static void bus_reset_parsed_address(sd_bus *b) { + assert(b); + + zero(b->sockaddr); + b->sockaddr_size = 0; + b->exec_argv = strv_free(b->exec_argv); + b->exec_path = mfree(b->exec_path); + b->server_id = SD_ID128_NULL; + b->machine = mfree(b->machine); + b->nspid = 0; +} + +static int bus_parse_next_address(sd_bus *b) { + _cleanup_free_ char *guid = NULL; + const char *a; + int r; + + assert(b); + + if (!b->address) + return 0; + if (b->address[b->address_index] == 0) + return 0; + + bus_reset_parsed_address(b); + + a = b->address + b->address_index; + + while (*a != 0) { + + if (*a == ';') { + a++; + continue; + } + + if (startswith(a, "unix:")) { + a += 5; + + r = parse_unix_address(b, &a, &guid); + if (r < 0) + return r; + break; + + } else if (startswith(a, "tcp:")) { + + a += 4; + r = parse_tcp_address(b, &a, &guid); + if (r < 0) + return r; + + break; + + } else if (startswith(a, "unixexec:")) { + + a += 9; + r = parse_exec_address(b, &a, &guid); + if (r < 0) + return r; + + break; + + } else if (startswith(a, "x-machine-unix:")) { + + a += 15; + r = parse_container_unix_address(b, &a, &guid); + if (r < 0) + return r; + + break; + } + + a = strchr(a, ';'); + if (!a) + return 0; + } + + if (guid) { + r = sd_id128_from_string(guid, &b->server_id); + if (r < 0) + return r; + } + + b->address_index = a - b->address; + return 1; +} + 
+static void bus_kill_exec(sd_bus *bus) { + if (!pid_is_valid(bus->busexec_pid)) + return; + + sigterm_wait(TAKE_PID(bus->busexec_pid)); +} + +static int bus_start_address(sd_bus *b) { + int r; + + assert(b); + + for (;;) { + bus_close_io_fds(b); + bus_close_inotify_fd(b); + + bus_kill_exec(b); + + /* If you provide multiple different bus-addresses, we + * try all of them in order and use the first one that + * succeeds. */ + + if (b->exec_path) + r = bus_socket_exec(b); + else if ((b->nspid > 0 || b->machine) && b->sockaddr.sa.sa_family != AF_UNSPEC) + r = bus_container_connect_socket(b); + else if (b->sockaddr.sa.sa_family != AF_UNSPEC) + r = bus_socket_connect(b); + else + goto next; + + if (r >= 0) { + int q; + + q = bus_attach_io_events(b); + if (q < 0) + return q; + + q = bus_attach_inotify_event(b); + if (q < 0) + return q; + + return r; + } + + b->last_connect_error = -r; + + next: + r = bus_parse_next_address(b); + if (r < 0) + return r; + if (r == 0) + return b->last_connect_error > 0 ? -b->last_connect_error : -ECONNREFUSED; + } +} + +int bus_next_address(sd_bus *b) { + assert(b); + + bus_reset_parsed_address(b); + return bus_start_address(b); +} + +static int bus_start_fd(sd_bus *b) { + struct stat st; + int r; + + assert(b); + assert(b->input_fd >= 0); + assert(b->output_fd >= 0); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *pi = NULL, *po = NULL; + (void) fd_get_path(b->input_fd, &pi); + (void) fd_get_path(b->output_fd, &po); + log_debug("sd-bus: starting bus%s%s on fds %d/%d (%s, %s)...", + b->description ? 
" " : "", strempty(b->description), + b->input_fd, b->output_fd, + pi ?: "???", po ?: "???"); + } + + r = fd_nonblock(b->input_fd, true); + if (r < 0) + return r; + + r = fd_cloexec(b->input_fd, true); + if (r < 0) + return r; + + if (b->input_fd != b->output_fd) { + r = fd_nonblock(b->output_fd, true); + if (r < 0) + return r; + + r = fd_cloexec(b->output_fd, true); + if (r < 0) + return r; + } + + if (fstat(b->input_fd, &st) < 0) + return -errno; + + return bus_socket_take_fd(b); +} + +_public_ int sd_bus_start(sd_bus *bus) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state == BUS_UNSET, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + bus_set_state(bus, BUS_OPENING); + + if (bus->is_server && bus->bus_client) + return -EINVAL; + + if (bus->input_fd >= 0) + r = bus_start_fd(bus); + else if (bus->address || bus->sockaddr.sa.sa_family != AF_UNSPEC || bus->exec_path || bus->machine) + r = bus_start_address(bus); + else + return -EINVAL; + + if (r < 0) { + sd_bus_close(bus); + return r; + } + + return bus_send_hello(bus); +} + +_public_ int sd_bus_open_with_description(sd_bus **ret, const char *description) { + const char *e; + _cleanup_(bus_freep) sd_bus *b = NULL; + int r; + + assert_return(ret, -EINVAL); + + /* Let's connect to the starter bus if it is set, and + * otherwise to the bus that is appropriate for the scope + * we are running in */ + + e = secure_getenv("DBUS_STARTER_BUS_TYPE"); + if (e) { + if (streq(e, "system")) + return sd_bus_open_system_with_description(ret, description); + else if (STR_IN_SET(e, "session", "user")) + return sd_bus_open_user_with_description(ret, description); + } + + e = secure_getenv("DBUS_STARTER_ADDRESS"); + if (!e) { + if (cg_pid_get_owner_uid(0, NULL) >= 0) + return sd_bus_open_user_with_description(ret, description); + else + return sd_bus_open_system_with_description(ret, description); + } + + r = sd_bus_new(&b); + if (r < 0) + return r; 
+ + r = sd_bus_set_address(b, e); + if (r < 0) + return r; + + b->bus_client = true; + + /* We don't know whether the bus is trusted or not, so better + * be safe, and authenticate everything */ + b->trusted = false; + b->is_local = false; + b->creds_mask |= SD_BUS_CREDS_UID | SD_BUS_CREDS_EUID | SD_BUS_CREDS_EFFECTIVE_CAPS; + + r = sd_bus_start(b); + if (r < 0) + return r; + + *ret = TAKE_PTR(b); + return 0; +} + +_public_ int sd_bus_open(sd_bus **ret) { + return sd_bus_open_with_description(ret, NULL); +} + +int bus_set_address_system(sd_bus *b) { + const char *e; + int r; + + assert(b); + + e = secure_getenv("DBUS_SYSTEM_BUS_ADDRESS"); + + r = sd_bus_set_address(b, e ?: DEFAULT_SYSTEM_BUS_ADDRESS); + if (r < 0) + return r; + + b->runtime_scope = RUNTIME_SCOPE_SYSTEM; + return r; +} + +_public_ int sd_bus_open_system_with_description(sd_bus **ret, const char *description) { + _cleanup_(bus_freep) sd_bus *b = NULL; + int r; + + assert_return(ret, -EINVAL); + + r = sd_bus_new(&b); + if (r < 0) + return r; + + if (description) { + r = sd_bus_set_description(b, description); + if (r < 0) + return r; + } + + r = bus_set_address_system(b); + if (r < 0) + return r; + + b->bus_client = true; + + /* Let's do per-method access control on the system bus. We + * need the caller's UID and capability set for that. 
*/ + b->trusted = false; + b->creds_mask |= SD_BUS_CREDS_UID | SD_BUS_CREDS_EUID | SD_BUS_CREDS_EFFECTIVE_CAPS; + b->is_local = true; + + r = sd_bus_start(b); + if (r < 0) + return r; + + *ret = TAKE_PTR(b); + return 0; +} + +_public_ int sd_bus_open_system(sd_bus **ret) { + return sd_bus_open_system_with_description(ret, NULL); +} + +int bus_set_address_user(sd_bus *b) { + const char *a; + _cleanup_free_ char *_a = NULL; + int r; + + assert(b); + + a = secure_getenv("DBUS_SESSION_BUS_ADDRESS"); + if (!a) { + const char *e; + _cleanup_free_ char *ee = NULL; + + e = secure_getenv("XDG_RUNTIME_DIR"); + if (!e) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "sd-bus: $XDG_RUNTIME_DIR not set, cannot connect to user bus."); + + ee = bus_address_escape(e); + if (!ee) + return -ENOMEM; + + if (asprintf(&_a, DEFAULT_USER_BUS_ADDRESS_FMT, ee) < 0) + return -ENOMEM; + a = _a; + } + + r = sd_bus_set_address(b, a); + if (r < 0) + return r; + + b->runtime_scope = RUNTIME_SCOPE_USER; + return r; +} + +_public_ int sd_bus_open_user_with_description(sd_bus **ret, const char *description) { + _cleanup_(bus_freep) sd_bus *b = NULL; + int r; + + assert_return(ret, -EINVAL); + + r = sd_bus_new(&b); + if (r < 0) + return r; + + if (description) { + r = sd_bus_set_description(b, description); + if (r < 0) + return r; + } + + r = bus_set_address_user(b); + if (r < 0) + return r; + + b->bus_client = true; + + /* We don't do any per-method access control on the user bus. 
*/ + b->trusted = true; + b->is_local = true; + + r = sd_bus_start(b); + if (r < 0) + return r; + + *ret = TAKE_PTR(b); + return 0; +} + +_public_ int sd_bus_open_user(sd_bus **ret) { + return sd_bus_open_user_with_description(ret, NULL); +} + +int bus_set_address_system_remote(sd_bus *b, const char *host) { + _cleanup_free_ char *e = NULL; + char *m = NULL, *c = NULL, *a, *rbracket = NULL, *p = NULL; + + assert(b); + assert(host); + + /* Skip ":"s in ipv6 addresses */ + if (*host == '[') { + char *t; + + rbracket = strchr(host, ']'); + if (!rbracket) + return -EINVAL; + t = strndupa_safe(host + 1, rbracket - host - 1); + e = bus_address_escape(t); + if (!e) + return -ENOMEM; + } else if ((a = strchr(host, '@'))) { + if (*(a + 1) == '[') { + _cleanup_free_ char *t = NULL; + + rbracket = strchr(a + 1, ']'); + if (!rbracket) + return -EINVAL; + t = new0(char, strlen(host)); + if (!t) + return -ENOMEM; + strncat(t, host, a - host + 1); + strncat(t, a + 2, rbracket - a - 2); + e = bus_address_escape(t); + if (!e) + return -ENOMEM; + } else if (*(a + 1) == '\0' || strchr(a + 1, '@')) + return -EINVAL; + } + + /* Let's see if a port was given */ + m = strchr(rbracket ? rbracket + 1 : host, ':'); + if (m) { + char *t; + bool got_forward_slash = false; + + p = m + 1; + + t = strchr(p, '/'); + if (t) { + p = strndupa_safe(p, t - p); + got_forward_slash = true; + } + + if (!in_charset(p, "0123456789") || *p == '\0') { + if (!hostname_is_valid(p, 0) || got_forward_slash) + return -EINVAL; + + m = TAKE_PTR(p); + goto interpret_port_as_machine_old_syntax; + } + } + + /* Let's see if a machine was given */ + m = strchr(rbracket ? rbracket + 1 : host, '/'); + if (m) { + m++; +interpret_port_as_machine_old_syntax: + /* Let's make sure this is not a port of some kind, + * and is a valid machine name. */ + if (!in_charset(m, "0123456789") && hostname_is_valid(m, 0)) + c = strjoina(",argv", p ? 
"7" : "5", "=--machine=", m); + } + + if (!e) { + char *t; + + t = strndupa_safe(host, strcspn(host, ":/")); + + e = bus_address_escape(t); + if (!e) + return -ENOMEM; + } + + a = strjoin("unixexec:path=ssh,argv1=-xT", p ? ",argv2=-p,argv3=" : "", strempty(p), + ",argv", p ? "4" : "2", "=--,argv", p ? "5" : "3", "=", e, + ",argv", p ? "6" : "4", "=systemd-stdio-bridge", c); + if (!a) + return -ENOMEM; + + return free_and_replace(b->address, a); +} + +_public_ int sd_bus_open_system_remote(sd_bus **ret, const char *host) { + _cleanup_(bus_freep) sd_bus *b = NULL; + int r; + + assert_return(host, -EINVAL); + assert_return(ret, -EINVAL); + + r = sd_bus_new(&b); + if (r < 0) + return r; + + r = bus_set_address_system_remote(b, host); + if (r < 0) + return r; + + b->bus_client = true; + b->trusted = false; + b->runtime_scope = RUNTIME_SCOPE_SYSTEM; + b->is_local = false; + + r = sd_bus_start(b); + if (r < 0) + return r; + + *ret = TAKE_PTR(b); + return 0; +} + +int bus_set_address_machine(sd_bus *b, RuntimeScope runtime_scope, const char *machine) { + _cleanup_free_ char *a = NULL; + const char *rhs; + + assert(b); + assert(machine); + + rhs = strchr(machine, '@'); + if (rhs || runtime_scope == RUNTIME_SCOPE_USER) { + _cleanup_free_ char *u = NULL, *eu = NULL, *erhs = NULL; + + /* If there's an "@" in the container specification, we'll connect as a user specified at its + * left hand side, which is useful in combination with user=true. This isn't as trivial as it + * might sound: it's not sufficient to enter the container and connect to some socket there, + * since the --user socket path depends on $XDG_RUNTIME_DIR which is set via PAM. Thus, to be + * able to connect, we need to have a PAM session. Our way out? We use systemd-run to get + * into the container and acquire a PAM session there, and then invoke systemd-stdio-bridge + * in it, which propagates the bus transport to us. 
*/ + + if (rhs) { + if (rhs > machine) + u = strndup(machine, rhs - machine); + else + u = getusername_malloc(); /* Empty user name, let's use the local one */ + if (!u) + return -ENOMEM; + + eu = bus_address_escape(u); + if (!eu) + return -ENOMEM; + + rhs++; + } else { + /* No "@" specified but we shall connect to the user instance? Then assume root (and + * not a user named identically to the calling one). This means: + * + * --machine=foobar --user → connect to user bus of root user in container "foobar" + * --machine=@foobar --user → connect to user bus of user named like the calling user in container "foobar" + * + * Why? so that behaviour for "--machine=foobar --system" is roughly similar to + * "--machine=foobar --user": both times we unconditionally connect as root user + * regardless what the calling user is. */ + + rhs = machine; + } + + if (!isempty(rhs)) { + erhs = bus_address_escape(rhs); + if (!erhs) + return -ENOMEM; + } + + /* systemd-run -M… -PGq --wait -pUser=… -pPAMName=login systemd-stdio-bridge */ + + a = strjoin("unixexec:path=systemd-run," + "argv1=-M", erhs ?: ".host", "," + "argv2=-PGq," + "argv3=--wait," + "argv4=-pUser%3d", eu ?: "root", ",", + "argv5=-pPAMName%3dlogin," + "argv6=systemd-stdio-bridge"); + if (!a) + return -ENOMEM; + + if (runtime_scope == RUNTIME_SCOPE_USER) { + /* Ideally we'd use the "--user" switch to systemd-stdio-bridge here, but it's only + * available in recent systemd versions. Using the "-p" switch with the explicit path + * is a working alternative, and is compatible with older versions, hence that's what + * we use here. */ + if (!strextend(&a, ",argv7=-punix:path%3d%24%7bXDG_RUNTIME_DIR%7d/bus")) + return -ENOMEM; + } + } else { + _cleanup_free_ char *e = NULL; + + /* Just a container name, we can go the simple way, and just join the container, and connect + * to the well-known path of the system bus there. 
*/ + + e = bus_address_escape(machine); + if (!e) + return -ENOMEM; + + a = strjoin("x-machine-unix:machine=", e); + if (!a) + return -ENOMEM; + } + + return free_and_replace(b->address, a); +} + +static int user_and_machine_valid(const char *user_and_machine) { + const char *h; + + /* Checks if a container specification in the form "user@container" or just "container" is valid. + * + * If the "@" syntax is used we'll allow either the "user" or the "container" part to be omitted, but + * not both. */ + + h = strchr(user_and_machine, '@'); + if (!h) + h = user_and_machine; + else { + _cleanup_free_ char *user = NULL; + + user = strndup(user_and_machine, h - user_and_machine); + if (!user) + return -ENOMEM; + + if (!isempty(user) && !valid_user_group_name(user, VALID_USER_RELAX | VALID_USER_ALLOW_NUMERIC)) + return false; + + h++; + + if (isempty(h)) + return !isempty(user); + } + + return hostname_is_valid(h, VALID_HOSTNAME_DOT_HOST); +} + +static int user_and_machine_equivalent(const char *user_and_machine) { + _cleanup_free_ char *un = NULL; + const char *f; + + /* Returns true if the specified user+machine name are actually equivalent to our own identity and + * our own host. If so we can shortcut things. Why bother? Because that way we don't have to fork + * off short-lived worker processes that are then unavailable for authentication and logging in the + * peer. Moreover joining a namespace requires privileges. If we are in the right namespace anyway, + * we can avoid permission problems thus. */ + + assert(user_and_machine); + + /* Omitting the user name means that we shall use the same user name as we run as locally, which + * means we'll end up on the same host, let's shortcut */ + if (streq(user_and_machine, "@.host")) + return true; + + /* Otherwise, if we are root, then we can also allow the ".host" syntax, as that's the user this + * would connect to. 
*/ + uid_t uid = geteuid(); + + if (uid == 0 && STR_IN_SET(user_and_machine, ".host", "root@.host", "0@.host")) + return true; + + /* Otherwise, we have to figure out our user id and name, and compare things with that. */ + char buf[DECIMAL_STR_MAX(uid_t)]; + xsprintf(buf, UID_FMT, uid); + + f = startswith(user_and_machine, buf); + if (!f) { + un = getusername_malloc(); + if (!un) + return -ENOMEM; + + f = startswith(user_and_machine, un); + if (!f) + return false; + } + + return STR_IN_SET(f, "@", "@.host"); +} + +_public_ int sd_bus_open_system_machine(sd_bus **ret, const char *user_and_machine) { + _cleanup_(bus_freep) sd_bus *b = NULL; + int r; + + assert_return(user_and_machine, -EINVAL); + assert_return(ret, -EINVAL); + + if (user_and_machine_equivalent(user_and_machine)) + return sd_bus_open_system(ret); + + r = user_and_machine_valid(user_and_machine); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + r = sd_bus_new(&b); + if (r < 0) + return r; + + r = bus_set_address_machine(b, RUNTIME_SCOPE_SYSTEM, user_and_machine); + if (r < 0) + return r; + + b->bus_client = true; + b->runtime_scope = RUNTIME_SCOPE_SYSTEM; + + r = sd_bus_start(b); + if (r < 0) + return r; + + *ret = TAKE_PTR(b); + return 0; +} + +_public_ int sd_bus_open_user_machine(sd_bus **ret, const char *user_and_machine) { + _cleanup_(bus_freep) sd_bus *b = NULL; + int r; + + assert_return(user_and_machine, -EINVAL); + assert_return(ret, -EINVAL); + + /* Shortcut things if we'd end up on this host and as the same user. 
*/ + if (user_and_machine_equivalent(user_and_machine)) + return sd_bus_open_user(ret); + + r = user_and_machine_valid(user_and_machine); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + r = sd_bus_new(&b); + if (r < 0) + return r; + + r = bus_set_address_machine(b, RUNTIME_SCOPE_USER, user_and_machine); + if (r < 0) + return r; + + b->bus_client = true; + b->trusted = true; + + r = sd_bus_start(b); + if (r < 0) + return r; + + *ret = TAKE_PTR(b); + return 0; +} + +_public_ void sd_bus_close(sd_bus *bus) { + if (!bus) + return; + if (bus->state == BUS_CLOSED) + return; + if (bus_origin_changed(bus)) + return; + + /* Don't leave ssh hanging around */ + bus_kill_exec(bus); + + bus_set_state(bus, BUS_CLOSED); + + sd_bus_detach_event(bus); + + /* Drop all queued messages so that they drop references to + * the bus object and the bus may be freed */ + bus_reset_queues(bus); + + bus_close_io_fds(bus); + bus_close_inotify_fd(bus); +} + +_public_ sd_bus *sd_bus_close_unref(sd_bus *bus) { + if (!bus) + return NULL; + if (bus_origin_changed(bus)) + return NULL; + + sd_bus_close(bus); + + return sd_bus_unref(bus); +} + +_public_ sd_bus* sd_bus_flush_close_unref(sd_bus *bus) { + if (!bus) + return NULL; + if (bus_origin_changed(bus)) + return NULL; + + /* Have to do this before flush() to prevent hang */ + bus_kill_exec(bus); + sd_bus_flush(bus); + + return sd_bus_close_unref(bus); +} + +void bus_enter_closing(sd_bus *bus) { + assert(bus); + + if (!IN_SET(bus->state, BUS_WATCH_BIND, BUS_OPENING, BUS_AUTHENTICATING, BUS_HELLO, BUS_RUNNING)) + return; + + bus_set_state(bus, BUS_CLOSING); +} + +/* Define manually so we can add the PID check */ +_public_ sd_bus *sd_bus_ref(sd_bus *bus) { + if (!bus) + return NULL; + if (bus_origin_changed(bus)) + return NULL; + + bus->n_ref++; + + return bus; +} + +_public_ sd_bus* sd_bus_unref(sd_bus *bus) { + if (!bus) + return NULL; + if (bus_origin_changed(bus)) + return NULL; + + assert(bus->n_ref > 0); + if (--bus->n_ref > 0) + 
return NULL; + + return bus_free(bus); +} + +_public_ int sd_bus_is_open(sd_bus *bus) { + if (!bus) + return 0; + + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return BUS_IS_OPEN(bus->state); +} + +_public_ int sd_bus_is_ready(sd_bus *bus) { + if (!bus) + return 0; + + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->state == BUS_RUNNING; +} + +_public_ int sd_bus_can_send(sd_bus *bus, char type) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->state != BUS_UNSET, -ENOTCONN); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (bus->is_monitor) + return 0; + + if (type == SD_BUS_TYPE_UNIX_FD) { + if (!bus->accept_fd) + return 0; + + r = bus_ensure_running(bus); + if (r < 0) + return r; + + return bus->can_fds; + } + + return bus_type_is_valid(type); +} + +_public_ int sd_bus_get_bus_id(sd_bus *bus, sd_id128_t *id) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(id, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + r = bus_ensure_running(bus); + if (r < 0) + return r; + + *id = bus->server_id; + return 0; +} + +#define COOKIE_CYCLED (UINT32_C(1) << 31) + +static uint64_t cookie_inc(uint64_t cookie) { + + /* Stay within the 32-bit range, since classic D-Bus can't deal with more */ + if (cookie >= UINT32_MAX) + return COOKIE_CYCLED; /* Don't go back to zero, but use the highest bit for checking + * whether we are looping. */ + + return cookie + 1; +} + +static int next_cookie(sd_bus *b) { + uint64_t new_cookie; + + assert(b); + + new_cookie = cookie_inc(b->cookie); + + /* Small optimization: don't bother with checking for cookie reuse until we overran cookiespace at + * least once, but then do it thorougly. 
*/ + if (FLAGS_SET(new_cookie, COOKIE_CYCLED)) { + uint32_t i; + + /* Check if the cookie is currently in use. If so, pick the next one */ + for (i = 0; i < COOKIE_CYCLED; i++) { + if (!ordered_hashmap_contains(b->reply_callbacks, &new_cookie)) + goto good; + + new_cookie = cookie_inc(new_cookie); + } + + /* Can't fulfill request */ + return -EBUSY; + } + +good: + b->cookie = new_cookie; + return 0; +} + +static int bus_seal_message(sd_bus *b, sd_bus_message *m, usec_t timeout) { + int r; + + assert(b); + assert(m); + + if (m->sealed) { + /* If we copy the same message to multiple + * destinations, avoid using the same cookie + * numbers. */ + b->cookie = MAX(b->cookie, BUS_MESSAGE_COOKIE(m)); + return 0; + } + + if (timeout == 0) { + r = sd_bus_get_method_call_timeout(b, &timeout); + if (r < 0) + return r; + } + + if (!m->sender && b->patch_sender) { + r = sd_bus_message_set_sender(m, b->patch_sender); + if (r < 0) + return r; + } + + r = next_cookie(b); + if (r < 0) + return r; + + return sd_bus_message_seal(m, b->cookie, timeout); +} + +static int bus_remarshal_message(sd_bus *b, sd_bus_message **m) { + bool remarshal = false; + + assert(b); + + /* wrong packet version */ + if (b->message_version != 0 && b->message_version != (*m)->header->version) + remarshal = true; + + /* wrong packet endianness */ + if (b->message_endian != 0 && b->message_endian != (*m)->header->endian) + remarshal = true; + + return remarshal ? bus_message_remarshal(b, m) : 0; +} + +int bus_seal_synthetic_message(sd_bus *b, sd_bus_message *m) { + assert(b); + assert(m); + + /* Fake some timestamps, if they were requested, and not + * already initialized */ + if (b->attach_timestamp) { + if (m->realtime <= 0) + m->realtime = now(CLOCK_REALTIME); + + if (m->monotonic <= 0) + m->monotonic = now(CLOCK_MONOTONIC); + } + + /* The bus specification says the serial number cannot be 0, + * hence let's fill something in for synthetic messages. 
Since + * synthetic messages might have a fake sender and we don't + * want to interfere with the real sender's serial numbers we + * pick a fixed, artificial one. */ + return sd_bus_message_seal(m, UINT32_MAX, 0); +} + +static int bus_write_message(sd_bus *bus, sd_bus_message *m, size_t *idx) { + int r; + + assert(bus); + assert(m); + + r = bus_socket_write_message(bus, m, idx); + if (r <= 0) + return r; + + if (*idx >= BUS_MESSAGE_SIZE(m)) + log_debug("Sent message type=%s sender=%s destination=%s path=%s interface=%s member=%s" + " cookie=%" PRIu64 " reply_cookie=%" PRIu64 + " signature=%s error-name=%s error-message=%s", + bus_message_type_to_string(m->header->type), + strna(sd_bus_message_get_sender(m)), + strna(sd_bus_message_get_destination(m)), + strna(sd_bus_message_get_path(m)), + strna(sd_bus_message_get_interface(m)), + strna(sd_bus_message_get_member(m)), + BUS_MESSAGE_COOKIE(m), + m->reply_cookie, + strna(m->root_container.signature), + strna(m->error.name), + strna(m->error.message)); + + return r; +} + +static int dispatch_wqueue(sd_bus *bus) { + int r, ret = 0; + + assert(bus); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + while (bus->wqueue_size > 0) { + + r = bus_write_message(bus, bus->wqueue[0], &bus->windex); + if (r < 0) + return r; + else if (r == 0) + /* Didn't do anything this time */ + return ret; + else if (bus->windex >= BUS_MESSAGE_SIZE(bus->wqueue[0])) { + /* Fully written. Let's drop the entry from + * the queue. + * + * This isn't particularly optimized, but + * well, this is supposed to be our worst-case + * buffer only, and the socket buffer is + * supposed to be our primary buffer, and if + * it got full, then all bets are off + * anyway. 
*/ + + bus->wqueue_size--; + bus_message_unref_queued(bus->wqueue[0], bus); + memmove(bus->wqueue, bus->wqueue + 1, sizeof(sd_bus_message*) * bus->wqueue_size); + bus->windex = 0; + + ret = 1; + } + } + + return ret; +} + +static int bus_read_message(sd_bus *bus) { + assert(bus); + + return bus_socket_read_message(bus); +} + +int bus_rqueue_make_room(sd_bus *bus) { + assert(bus); + + if (bus->rqueue_size >= BUS_RQUEUE_MAX) + return -ENOBUFS; + + if (!GREEDY_REALLOC(bus->rqueue, bus->rqueue_size + 1)) + return -ENOMEM; + + return 0; +} + +static void rqueue_drop_one(sd_bus *bus, size_t i) { + assert(bus); + assert(i < bus->rqueue_size); + + bus_message_unref_queued(bus->rqueue[i], bus); + memmove(bus->rqueue + i, bus->rqueue + i + 1, sizeof(sd_bus_message*) * (bus->rqueue_size - i - 1)); + bus->rqueue_size--; +} + +static int dispatch_rqueue(sd_bus *bus, sd_bus_message **m) { + int r, ret = 0; + + assert(bus); + assert(m); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + for (;;) { + if (bus->rqueue_size > 0) { + /* Dispatch a queued message */ + *m = sd_bus_message_ref(bus->rqueue[0]); + rqueue_drop_one(bus, 0); + return 1; + } + + /* Try to read a new message */ + r = bus_read_message(bus); + if (r < 0) + return r; + if (r == 0) { + *m = NULL; + return ret; + } + + ret = 1; + } +} + +_public_ int sd_bus_send(sd_bus *bus, sd_bus_message *_m, uint64_t *cookie) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = sd_bus_message_ref(_m); + int r; + + assert_return(m, -EINVAL); + + if (bus) + assert_return(bus = bus_resolve(bus), -ENOPKG); + else + assert_return(bus = m->bus, -ENOTCONN); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (m->n_fds > 0) { + r = sd_bus_can_send(bus, SD_BUS_TYPE_UNIX_FD); + if (r < 0) + return r; + if (r == 0) + return -EOPNOTSUPP; + } + + /* If the cookie number isn't kept, then we know that no reply + * is expected */ + if (!cookie && !m->sealed) + 
m->header->flags |= BUS_MESSAGE_NO_REPLY_EXPECTED; + + r = bus_seal_message(bus, m, 0); + if (r < 0) + return r; + + /* Remarshall if we have to. This will possibly unref the + * message and place a replacement in m */ + r = bus_remarshal_message(bus, &m); + if (r < 0) + return r; + + /* If this is a reply and no reply was requested, then let's + * suppress this, if we can */ + if (m->dont_send) + goto finish; + + if (IN_SET(bus->state, BUS_RUNNING, BUS_HELLO) && bus->wqueue_size <= 0) { + size_t idx = 0; + + r = bus_write_message(bus, m, &idx); + if (ERRNO_IS_NEG_DISCONNECT(r)) { + bus_enter_closing(bus); + return -ECONNRESET; + } else if (r < 0) + return r; + + if (idx < BUS_MESSAGE_SIZE(m)) { + /* Wasn't fully written. So let's remember how + * much was written. Note that the first entry + * of the wqueue array is always allocated so + * that we always can remember how much was + * written. */ + bus->wqueue[0] = bus_message_ref_queued(m, bus); + bus->wqueue_size = 1; + bus->windex = idx; + } + + } else { + /* Just append it to the queue. 
*/ + + if (bus->wqueue_size >= BUS_WQUEUE_MAX) + return -ENOBUFS; + + if (!GREEDY_REALLOC(bus->wqueue, bus->wqueue_size + 1)) + return -ENOMEM; + + bus->wqueue[bus->wqueue_size++] = bus_message_ref_queued(m, bus); + } + +finish: + if (cookie) + *cookie = BUS_MESSAGE_COOKIE(m); + + return 1; +} + +_public_ int sd_bus_send_to(sd_bus *bus, sd_bus_message *m, const char *destination, uint64_t *cookie) { + int r; + + assert_return(m, -EINVAL); + + if (bus) + assert_return(bus = bus_resolve(bus), -ENOPKG); + else + assert_return(bus = m->bus, -ENOTCONN); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (!streq_ptr(m->destination, destination)) { + + if (!destination) + return -EEXIST; + + r = sd_bus_message_set_destination(m, destination); + if (r < 0) + return r; + } + + return sd_bus_send(bus, m, cookie); +} + +static usec_t calc_elapse(sd_bus *bus, uint64_t usec) { + assert(bus); + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + if (usec == USEC_INFINITY) + return 0; + + /* We start all timeouts the instant we enter BUS_HELLO/BUS_RUNNING state, so that the don't run in parallel + * with any connection setup states. Hence, if a method callback is started earlier than that we just store the + * relative timestamp, and afterwards the absolute one. 
*/ + + if (IN_SET(bus->state, BUS_WATCH_BIND, BUS_OPENING, BUS_AUTHENTICATING)) + return usec; + else + return usec_add(now(CLOCK_MONOTONIC), usec); +} + +static int timeout_compare(const void *a, const void *b) { + const struct reply_callback *x = a, *y = b; + + if (x->timeout_usec != 0 && y->timeout_usec == 0) + return -1; + + if (x->timeout_usec == 0 && y->timeout_usec != 0) + return 1; + + return CMP(x->timeout_usec, y->timeout_usec); +} + +_public_ int sd_bus_call_async( + sd_bus *bus, + sd_bus_slot **slot, + sd_bus_message *_m, + sd_bus_message_handler_t callback, + void *userdata, + uint64_t usec) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = sd_bus_message_ref(_m); + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot *s = NULL; + int r; + + assert_return(m, -EINVAL); + assert_return(m->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL); + assert_return(!m->sealed || (!!callback == !(m->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED)), -EINVAL); + + if (bus) + assert_return(bus = bus_resolve(bus), -ENOPKG); + else + assert_return(bus = m->bus, -ENOTCONN); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + /* If no callback is specified and there's no interest in a slot, then there's no reason to ask for a reply */ + if (!callback && !slot && !m->sealed) + m->header->flags |= BUS_MESSAGE_NO_REPLY_EXPECTED; + + r = ordered_hashmap_ensure_allocated(&bus->reply_callbacks, &uint64_hash_ops); + if (r < 0) + return r; + + r = prioq_ensure_allocated(&bus->reply_callbacks_prioq, timeout_compare); + if (r < 0) + return r; + + r = bus_seal_message(bus, m, usec); + if (r < 0) + return r; + + r = bus_remarshal_message(bus, &m); + if (r < 0) + return r; + + if (slot || callback) { + s = bus_slot_allocate(bus, !slot, BUS_REPLY_CALLBACK, sizeof(struct reply_callback), userdata); + if (!s) + return -ENOMEM; + + s->reply_callback.callback = callback; + + s->reply_callback.cookie = BUS_MESSAGE_COOKIE(m); + r = 
ordered_hashmap_put(bus->reply_callbacks, &s->reply_callback.cookie, &s->reply_callback); + if (r < 0) { + s->reply_callback.cookie = 0; + return r; + } + + s->reply_callback.timeout_usec = calc_elapse(bus, m->timeout); + if (s->reply_callback.timeout_usec != 0) { + r = prioq_put(bus->reply_callbacks_prioq, &s->reply_callback, &s->reply_callback.prioq_idx); + if (r < 0) { + s->reply_callback.timeout_usec = 0; + return r; + } + } + } + + r = sd_bus_send(bus, m, s ? &s->reply_callback.cookie : NULL); + if (r < 0) + return r; + + if (slot) + *slot = s; + s = NULL; + + return r; +} + +int bus_ensure_running(sd_bus *bus) { + int r; + + assert(bus); + + if (bus->state == BUS_RUNNING) + return 1; + + for (;;) { + if (IN_SET(bus->state, BUS_UNSET, BUS_CLOSED, BUS_CLOSING)) + return -ENOTCONN; + + r = sd_bus_process(bus, NULL); + if (r < 0) + return r; + if (bus->state == BUS_RUNNING) + return 1; + if (r > 0) + continue; + + r = sd_bus_wait(bus, UINT64_MAX); + if (r < 0) + return r; + } +} + +_public_ int sd_bus_call( + sd_bus *bus, + sd_bus_message *_m, + uint64_t usec, + sd_bus_error *error, + sd_bus_message **reply) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = sd_bus_message_ref(_m); + usec_t timeout; + uint64_t cookie; + size_t i; + int r; + + bus_assert_return(m, -EINVAL, error); + bus_assert_return(m->header->type == SD_BUS_MESSAGE_METHOD_CALL, -EINVAL, error); + bus_assert_return(!(m->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED), -EINVAL, error); + bus_assert_return(!bus_error_is_dirty(error), -EINVAL, error); + + if (bus) + assert_return(bus = bus_resolve(bus), -ENOPKG); + else + assert_return(bus = m->bus, -ENOTCONN); + bus_assert_return(!bus_origin_changed(bus), -ECHILD, error); + + if (!BUS_IS_OPEN(bus->state)) { + r = -ENOTCONN; + goto fail; + } + + r = bus_ensure_running(bus); + if (r < 0) + goto fail; + + i = bus->rqueue_size; + + r = bus_seal_message(bus, m, usec); + if (r < 0) + goto fail; + + r = bus_remarshal_message(bus, &m); + if (r < 0) 
+ goto fail; + + r = sd_bus_send(bus, m, &cookie); + if (r < 0) + goto fail; + + timeout = calc_elapse(bus, m->timeout); + + for (;;) { + usec_t left; + + while (i < bus->rqueue_size) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *incoming = NULL; + + incoming = sd_bus_message_ref(bus->rqueue[i]); + + if (incoming->reply_cookie == cookie) { + /* Found a match! */ + + rqueue_drop_one(bus, i); + log_debug_bus_message(incoming); + + if (incoming->header->type == SD_BUS_MESSAGE_METHOD_RETURN) { + + if (incoming->n_fds <= 0 || bus->accept_fd) { + if (reply) + *reply = TAKE_PTR(incoming); + + return 1; + } + + return sd_bus_error_set(error, SD_BUS_ERROR_INCONSISTENT_MESSAGE, + "Reply message contained file descriptors which I couldn't accept. Sorry."); + + } else if (incoming->header->type == SD_BUS_MESSAGE_METHOD_ERROR) + return sd_bus_error_copy(error, &incoming->error); + else { + r = -EIO; + goto fail; + } + + } else if (BUS_MESSAGE_COOKIE(incoming) == cookie && + bus->unique_name && + incoming->sender && + streq(bus->unique_name, incoming->sender)) { + + rqueue_drop_one(bus, i); + + /* Our own message? Somebody is trying to send its own client a message, + * let's not dead-lock, let's fail immediately. 
*/ + + r = -ELOOP; + goto fail; + } + + /* Try to read more, right-away */ + i++; + } + + r = bus_read_message(bus); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) { + bus_enter_closing(bus); + r = -ECONNRESET; + } + + goto fail; + } + if (r > 0) + continue; + + if (timeout > 0) { + usec_t n; + + n = now(CLOCK_MONOTONIC); + if (n >= timeout) { + r = -ETIMEDOUT; + goto fail; + } + + left = timeout - n; + } else + left = UINT64_MAX; + + r = bus_poll(bus, true, left); + if (ERRNO_IS_NEG_TRANSIENT(r)) + continue; + if (r < 0) + goto fail; + if (r == 0) { + r = -ETIMEDOUT; + goto fail; + } + + r = dispatch_wqueue(bus); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) { + bus_enter_closing(bus); + r = -ECONNRESET; + } + + goto fail; + } + } + +fail: + return sd_bus_error_set_errno(error, r); +} + +_public_ int sd_bus_get_fd(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(bus->input_fd == bus->output_fd, -EPERM); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (bus->state == BUS_CLOSED) + return -ENOTCONN; + + if (bus->inotify_fd >= 0) + return bus->inotify_fd; + + if (bus->input_fd >= 0) + return bus->input_fd; + + return -ENOTCONN; +} + +_public_ int sd_bus_get_events(sd_bus *bus) { + int flags = 0; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + switch (bus->state) { + + case BUS_UNSET: + case BUS_CLOSED: + return -ENOTCONN; + + case BUS_WATCH_BIND: + flags |= POLLIN; + break; + + case BUS_OPENING: + flags |= POLLOUT; + break; + + case BUS_AUTHENTICATING: + if (bus_socket_auth_needs_write(bus)) + flags |= POLLOUT; + + flags |= POLLIN; + break; + + case BUS_RUNNING: + case BUS_HELLO: + if (bus->rqueue_size <= 0) + flags |= POLLIN; + if (bus->wqueue_size > 0) + flags |= POLLOUT; + break; + + case BUS_CLOSING: + break; + + default: + assert_not_reached(); + } + + return flags; +} + +_public_ int 
sd_bus_get_timeout(sd_bus *bus, uint64_t *timeout_usec) { + struct reply_callback *c; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(timeout_usec, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state) && bus->state != BUS_CLOSING) + return -ENOTCONN; + + if (bus->track_queue) { + *timeout_usec = 0; + return 1; + } + + switch (bus->state) { + + case BUS_AUTHENTICATING: + *timeout_usec = bus->auth_timeout; + return 1; + + case BUS_RUNNING: + case BUS_HELLO: + if (bus->rqueue_size > 0) { + *timeout_usec = 0; + return 1; + } + + c = prioq_peek(bus->reply_callbacks_prioq); + if (!c) { + *timeout_usec = UINT64_MAX; + return 0; + } + + if (c->timeout_usec == 0) { + *timeout_usec = UINT64_MAX; + return 0; + } + + *timeout_usec = c->timeout_usec; + return 1; + + case BUS_CLOSING: + *timeout_usec = 0; + return 1; + + case BUS_WATCH_BIND: + case BUS_OPENING: + *timeout_usec = UINT64_MAX; + return 0; + + default: + assert_not_reached(); + } +} + +static int process_timeout(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error_buffer = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* m = NULL; + struct reply_callback *c; + sd_bus_slot *slot; + bool is_hello; + usec_t n; + int r; + + assert(bus); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + c = prioq_peek(bus->reply_callbacks_prioq); + if (!c) + return 0; + + n = now(CLOCK_MONOTONIC); + if (c->timeout_usec > n) + return 0; + + r = bus_message_new_synthetic_error( + bus, + c->cookie, + &SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_NO_REPLY, "Method call timed out"), + &m); + if (r < 0) + return r; + + m->read_counter = ++bus->read_counter; + + r = bus_seal_synthetic_message(bus, m); + if (r < 0) + return r; + + assert_se(prioq_pop(bus->reply_callbacks_prioq) == c); + c->timeout_usec = 0; + + ordered_hashmap_remove(bus->reply_callbacks, &c->cookie); + c->cookie = 0; + + slot = container_of(c, 
sd_bus_slot, reply_callback); + + bus->iteration_counter++; + + is_hello = bus->state == BUS_HELLO && c->callback == hello_callback; + + bus->current_message = m; + bus->current_slot = sd_bus_slot_ref(slot); + bus->current_handler = c->callback; + bus->current_userdata = slot->userdata; + r = c->callback(m, slot->userdata, &error_buffer); + bus->current_userdata = NULL; + bus->current_handler = NULL; + bus->current_slot = NULL; + bus->current_message = NULL; + + if (slot->floating) + bus_slot_disconnect(slot, true); + + sd_bus_slot_unref(slot); + + /* When this is the hello message and it timed out, then make sure to propagate the error up, don't just log + * and ignore the callback handler's return value. */ + if (is_hello) + return r; + + return bus_maybe_reply_error(m, r, &error_buffer); +} + +static int process_hello(sd_bus *bus, sd_bus_message *m) { + assert(bus); + assert(m); + + if (bus->state != BUS_HELLO) + return 0; + + /* Let's make sure the first message on the bus is the HELLO + * reply. But note that we don't actually parse the message + * here (we leave that to the usual handling), we just verify + * we don't let any earlier msg through. 
*/ + + if (!IN_SET(m->header->type, SD_BUS_MESSAGE_METHOD_RETURN, SD_BUS_MESSAGE_METHOD_ERROR)) + return -EIO; + + if (m->reply_cookie != 1) + return -EIO; + + return 0; +} + +static int process_reply(sd_bus *bus, sd_bus_message *m) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *synthetic_reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error_buffer = SD_BUS_ERROR_NULL; + struct reply_callback *c; + sd_bus_slot *slot; + bool is_hello; + int r; + + assert(bus); + assert(m); + + if (!IN_SET(m->header->type, SD_BUS_MESSAGE_METHOD_RETURN, SD_BUS_MESSAGE_METHOD_ERROR)) + return 0; + + if (m->destination && bus->unique_name && !streq_ptr(m->destination, bus->unique_name)) + return 0; + + c = ordered_hashmap_remove(bus->reply_callbacks, &m->reply_cookie); + if (!c) + return 0; + + c->cookie = 0; + + slot = container_of(c, sd_bus_slot, reply_callback); + + if (m->n_fds > 0 && !bus->accept_fd) { + + /* If the reply contained a file descriptor which we + * didn't want we pass an error instead. 
*/ + + r = bus_message_new_synthetic_error( + bus, + m->reply_cookie, + &SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_INCONSISTENT_MESSAGE, "Reply message contained file descriptor"), + &synthetic_reply); + if (r < 0) + return r; + + /* Copy over original timestamp */ + synthetic_reply->realtime = m->realtime; + synthetic_reply->monotonic = m->monotonic; + synthetic_reply->seqnum = m->seqnum; + synthetic_reply->read_counter = m->read_counter; + + r = bus_seal_synthetic_message(bus, synthetic_reply); + if (r < 0) + return r; + + m = synthetic_reply; + } else { + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + } + + if (c->timeout_usec != 0) { + prioq_remove(bus->reply_callbacks_prioq, c, &c->prioq_idx); + c->timeout_usec = 0; + } + + is_hello = bus->state == BUS_HELLO && c->callback == hello_callback; + + bus->current_slot = sd_bus_slot_ref(slot); + bus->current_handler = c->callback; + bus->current_userdata = slot->userdata; + r = c->callback(m, slot->userdata, &error_buffer); + bus->current_userdata = NULL; + bus->current_handler = NULL; + bus->current_slot = NULL; + + if (slot->floating) + bus_slot_disconnect(slot, true); + + sd_bus_slot_unref(slot); + + /* When this is the hello message and it failed, then make sure to propagate the error up, don't just log and + * ignore the callback handler's return value. 
*/ + if (is_hello) + return r; + + return bus_maybe_reply_error(m, r, &error_buffer); +} + +static int process_filter(sd_bus *bus, sd_bus_message *m) { + _cleanup_(sd_bus_error_free) sd_bus_error error_buffer = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(m); + + do { + bus->filter_callbacks_modified = false; + + LIST_FOREACH(callbacks, l, bus->filter_callbacks) { + sd_bus_slot *slot; + + if (bus->filter_callbacks_modified) + break; + + /* Don't run this more than once per iteration */ + if (l->last_iteration == bus->iteration_counter) + continue; + + l->last_iteration = bus->iteration_counter; + + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + slot = container_of(l, sd_bus_slot, filter_callback); + + bus->current_slot = sd_bus_slot_ref(slot); + bus->current_handler = l->callback; + bus->current_userdata = slot->userdata; + r = l->callback(m, slot->userdata, &error_buffer); + bus->current_userdata = NULL; + bus->current_handler = NULL; + bus->current_slot = sd_bus_slot_unref(slot); + + r = bus_maybe_reply_error(m, r, &error_buffer); + if (r != 0) + return r; + + } + + } while (bus->filter_callbacks_modified); + + return 0; +} + +static int process_match(sd_bus *bus, sd_bus_message *m) { + int r; + + assert(bus); + assert(m); + + do { + bus->match_callbacks_modified = false; + + r = bus_match_run(bus, &bus->match_callbacks, m); + if (r != 0) + return r; + + } while (bus->match_callbacks_modified); + + return 0; +} + +static int process_builtin(sd_bus *bus, sd_bus_message *m) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(bus); + assert(m); + + if (bus->is_monitor) + return 0; + + if (bus->manual_peer_interface) + return 0; + + if (m->header->type != SD_BUS_MESSAGE_METHOD_CALL) + return 0; + + if (!streq_ptr(m->interface, "org.freedesktop.DBus.Peer")) + return 0; + + if (m->header->flags & BUS_MESSAGE_NO_REPLY_EXPECTED) + return 1; + + if (streq_ptr(m->member, "Ping")) + r = 
sd_bus_message_new_method_return(m, &reply); + else if (streq_ptr(m->member, "GetMachineId")) { + sd_id128_t id; + + r = sd_id128_get_machine(&id); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", SD_ID128_TO_STRING(id)); + } else { + r = sd_bus_message_new_method_errorf( + m, &reply, + SD_BUS_ERROR_UNKNOWN_METHOD, + "Unknown method '%s' on interface '%s'.", m->member, m->interface); + } + if (r < 0) + return r; + + r = sd_bus_send(bus, reply, NULL); + if (r < 0) + return r; + + return 1; +} + +static int process_fd_check(sd_bus *bus, sd_bus_message *m) { + assert(bus); + assert(m); + + /* If we got a message with a file descriptor which we didn't + * want to accept, then let's drop it. How can this even + * happen? For example, when the kernel queues a message into + * an activatable names's queue which allows fds, and then is + * delivered to us later even though we ourselves did not + * negotiate it. */ + + if (bus->is_monitor) + return 0; + + if (m->n_fds <= 0) + return 0; + + if (bus->accept_fd) + return 0; + + if (m->header->type != SD_BUS_MESSAGE_METHOD_CALL) + return 1; /* just eat it up */ + + return sd_bus_reply_method_errorf(m, SD_BUS_ERROR_INCONSISTENT_MESSAGE, + "Message contains file descriptors, which I cannot accept. 
Sorry."); +} + +static int process_message(sd_bus *bus, sd_bus_message *m) { + _unused_ _cleanup_(log_context_unrefp) LogContext *c = NULL; + int r; + + assert(bus); + assert(m); + + bus->current_message = m; + bus->iteration_counter++; + + if (log_context_enabled()) + c = log_context_new_strv_consume(bus_message_make_log_fields(m)); + + log_debug_bus_message(m); + + r = process_hello(bus, m); + if (r != 0) + goto finish; + + r = process_reply(bus, m); + if (r != 0) + goto finish; + + r = process_fd_check(bus, m); + if (r != 0) + goto finish; + + r = process_filter(bus, m); + if (r != 0) + goto finish; + + r = process_match(bus, m); + if (r != 0) + goto finish; + + r = process_builtin(bus, m); + if (r != 0) + goto finish; + + r = bus_process_object(bus, m); + +finish: + bus->current_message = NULL; + return r; +} + +static int dispatch_track(sd_bus *bus) { + assert(bus); + + if (!bus->track_queue) + return 0; + + bus_track_dispatch(bus->track_queue); + return 1; +} + +static int process_running(sd_bus *bus, sd_bus_message **ret) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + assert(IN_SET(bus->state, BUS_RUNNING, BUS_HELLO)); + + r = process_timeout(bus); + if (r != 0) + goto null_message; + + r = dispatch_wqueue(bus); + if (r != 0) + goto null_message; + + r = dispatch_track(bus); + if (r != 0) + goto null_message; + + r = dispatch_rqueue(bus, &m); + if (r < 0) + return r; + if (!m) + goto null_message; + + r = process_message(bus, m); + if (r != 0) + goto null_message; + + if (ret) { + r = sd_bus_message_rewind(m, true); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 1; + } + + if (m->header->type == SD_BUS_MESSAGE_METHOD_CALL) { + + log_debug("Unprocessed message call sender=%s object=%s interface=%s member=%s", + strna(sd_bus_message_get_sender(m)), + strna(sd_bus_message_get_path(m)), + strna(sd_bus_message_get_interface(m)), + strna(sd_bus_message_get_member(m))); + + r = sd_bus_reply_method_errorf( + m, 
+ SD_BUS_ERROR_UNKNOWN_OBJECT, + "Unknown object '%s'.", m->path); + if (r < 0) + return r; + } + + return 1; + +null_message: + if (r >= 0 && ret) + *ret = NULL; + + return r; +} + +static int bus_exit_now(sd_bus *bus, sd_event *event) { + assert(bus); + + /* Exit due to close, if this is requested. If this is bus object is attached to an event source, invokes + * sd_event_exit(), otherwise invokes libc exit(). */ + + if (bus->exited) /* did we already exit? */ + return 0; + if (!bus->exit_triggered) /* was the exit condition triggered? */ + return 0; + if (!bus->exit_on_disconnect) /* Shall we actually exit on disconnection? */ + return 0; + + bus->exited = true; /* never exit more than once */ + + log_debug("Bus connection disconnected, exiting."); + + if (!event) + event = bus->event; + + if (event) + return sd_event_exit(event, EXIT_FAILURE); + else + exit(EXIT_FAILURE); + + assert_not_reached(); +} + +static int process_closing_reply_callback(sd_bus *bus, struct reply_callback *c) { + _cleanup_(sd_bus_error_free) sd_bus_error error_buffer = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + sd_bus_slot *slot; + int r; + + assert(bus); + assert(c); + + r = bus_message_new_synthetic_error( + bus, + c->cookie, + &SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_NO_REPLY, "Connection terminated"), + &m); + if (r < 0) + return r; + + m->read_counter = ++bus->read_counter; + + r = bus_seal_synthetic_message(bus, m); + if (r < 0) + return r; + + if (c->timeout_usec != 0) { + prioq_remove(bus->reply_callbacks_prioq, c, &c->prioq_idx); + c->timeout_usec = 0; + } + + ordered_hashmap_remove(bus->reply_callbacks, &c->cookie); + c->cookie = 0; + + slot = container_of(c, sd_bus_slot, reply_callback); + + bus->iteration_counter++; + + bus->current_message = m; + bus->current_slot = sd_bus_slot_ref(slot); + bus->current_handler = c->callback; + bus->current_userdata = slot->userdata; + r = c->callback(m, slot->userdata, &error_buffer); + 
bus->current_userdata = NULL; + bus->current_handler = NULL; + bus->current_slot = NULL; + bus->current_message = NULL; + + if (slot->floating) + bus_slot_disconnect(slot, true); + + sd_bus_slot_unref(slot); + + return bus_maybe_reply_error(m, r, &error_buffer); +} + +static int process_closing(sd_bus *bus, sd_bus_message **ret) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + struct reply_callback *c; + int r; + + assert(bus); + assert(bus->state == BUS_CLOSING); + + /* First, fail all outstanding method calls */ + c = ordered_hashmap_first(bus->reply_callbacks); + if (c) + return process_closing_reply_callback(bus, c); + + /* Then, fake-drop all remaining bus tracking references */ + if (bus->tracks) { + bus_track_close(bus->tracks); + return 1; + } + + /* Then, synthesize a Disconnected message */ + r = sd_bus_message_new_signal( + bus, + &m, + "/org/freedesktop/DBus/Local", + "org.freedesktop.DBus.Local", + "Disconnected"); + if (r < 0) + return r; + + bus_message_set_sender_local(bus, m); + m->read_counter = ++bus->read_counter; + + r = bus_seal_synthetic_message(bus, m); + if (r < 0) + return r; + + /* sd_bus_close() will deref the event and set bus->event to NULL. But in bus_exit_now() we use + * bus->event to decide whether to return from the event loop or exit(), but given it's always NULL + * at that point, it always exit(). Ref it here and pass it through further down to avoid that. 
*/ + event = sd_event_ref(bus->event); + sd_bus_close(bus); + + bus->current_message = m; + bus->iteration_counter++; + + r = process_filter(bus, m); + if (r != 0) + goto finish; + + r = process_match(bus, m); + if (r != 0) + goto finish; + + /* Nothing else to do, exit now, if the condition holds */ + bus->exit_triggered = true; + (void) bus_exit_now(bus, event); + + if (ret) + *ret = TAKE_PTR(m); + + r = 1; + +finish: + bus->current_message = NULL; + + return r; +} + +static int bus_process_internal(sd_bus *bus, sd_bus_message **ret) { + int r; + + /* Returns 0 when we didn't do anything. This should cause the + * caller to invoke sd_bus_wait() before returning the next + * time. Returns > 0 when we did something, which possibly + * means *ret is filled in with an unprocessed message. */ + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + /* We don't allow recursively invoking sd_bus_process(). */ + assert_return(!bus->current_message, -EBUSY); + assert(!bus->current_slot); /* This should be NULL whenever bus->current_message is */ + + BUS_DONT_DESTROY(bus); + + switch (bus->state) { + + case BUS_UNSET: + return -ENOTCONN; + + case BUS_CLOSED: + return -ECONNRESET; + + case BUS_WATCH_BIND: + r = bus_socket_process_watch_bind(bus); + break; + + case BUS_OPENING: + r = bus_socket_process_opening(bus); + break; + + case BUS_AUTHENTICATING: + r = bus_socket_process_authenticating(bus); + break; + + case BUS_RUNNING: + case BUS_HELLO: + r = process_running(bus, ret); + if (r >= 0) + return r; + + /* This branch initializes *ret, hence we don't use the generic error checking below */ + break; + + case BUS_CLOSING: + return process_closing(bus, ret); + + default: + assert_not_reached(); + } + + if (ERRNO_IS_NEG_DISCONNECT(r)) { + bus_enter_closing(bus); + r = 1; + } else if (r < 0) + return r; + + if (ret) + *ret = NULL; + + return r; +} + +_public_ int sd_bus_process(sd_bus *bus, 
sd_bus_message **ret) { + return bus_process_internal(bus, ret); +} + +_public_ int sd_bus_process_priority(sd_bus *bus, int64_t priority, sd_bus_message **ret) { + return bus_process_internal(bus, ret); +} + +static int bus_poll(sd_bus *bus, bool need_more, uint64_t timeout_usec) { + struct pollfd p[2] = {}; + usec_t m = USEC_INFINITY; + int r, n; + + assert(bus); + + if (bus->state == BUS_CLOSING) + return 1; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (bus->state == BUS_WATCH_BIND) { + assert(bus->inotify_fd >= 0); + + p[0].events = POLLIN; + p[0].fd = bus->inotify_fd; + n = 1; + } else { + int e; + + e = sd_bus_get_events(bus); + if (e < 0) + return e; + + if (need_more) + /* The caller really needs some more data, they don't + * care about what's already read, or any timeouts + * except its own. */ + e |= POLLIN; + else { + usec_t until; + /* The caller wants to process if there's something to + * process, but doesn't care otherwise */ + + r = sd_bus_get_timeout(bus, &until); + if (r < 0) + return r; + if (r > 0) + m = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + } + + p[0].fd = bus->input_fd; + if (bus->output_fd == bus->input_fd) { + p[0].events = e; + n = 1; + } else { + p[0].events = e & POLLIN; + p[1].fd = bus->output_fd; + p[1].events = e & POLLOUT; + n = 2; + } + } + + if (timeout_usec != UINT64_MAX && (m == USEC_INFINITY || timeout_usec < m)) + m = timeout_usec; + + r = ppoll_usec(p, n, m); + if (r <= 0) + return r; + + return 1; +} + +_public_ int sd_bus_wait(sd_bus *bus, uint64_t timeout_usec) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (bus->state == BUS_CLOSING) + return 0; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + if (bus->rqueue_size > 0) + return 0; + + r = bus_poll(bus, false, timeout_usec); + if (ERRNO_IS_NEG_TRANSIENT(r)) + return 1; /* treat EINTR as success, but let's exit, so that the caller 
will call back into us soon. */ + + return r; +} + +_public_ int sd_bus_flush(sd_bus *bus) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (bus->state == BUS_CLOSING) + return 0; + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + /* We never were connected? Don't hang in inotify for good, as there's no timeout set for it */ + if (bus->state == BUS_WATCH_BIND) + return -EUNATCH; + + r = bus_ensure_running(bus); + if (r < 0) + return r; + + if (bus->wqueue_size <= 0) + return 0; + + for (;;) { + r = dispatch_wqueue(bus); + if (ERRNO_IS_NEG_DISCONNECT(r)) { + bus_enter_closing(bus); + return -ECONNRESET; + } else if (r < 0) + return r; + + if (bus->wqueue_size <= 0) + return 0; + + r = bus_poll(bus, false, UINT64_MAX); + if (ERRNO_IS_NEG_TRANSIENT(r)) + continue; + if (r < 0) + return r; + } +} + +_public_ int sd_bus_add_filter( + sd_bus *bus, + sd_bus_slot **slot, + sd_bus_message_handler_t callback, + void *userdata) { + + sd_bus_slot *s; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(callback, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + s = bus_slot_allocate(bus, !slot, BUS_FILTER_CALLBACK, sizeof(struct filter_callback), userdata); + if (!s) + return -ENOMEM; + + s->filter_callback.callback = callback; + + bus->filter_callbacks_modified = true; + LIST_PREPEND(callbacks, bus->filter_callbacks, &s->filter_callback); + + if (slot) + *slot = s; + + return 0; +} + +static int add_match_callback( + sd_bus_message *m, + void *userdata, + sd_bus_error *ret_error) { + + sd_bus_slot *match_slot = ASSERT_PTR(userdata); + bool failed = false; + int r; + + assert(m); + + sd_bus_slot_ref(match_slot); + + if (sd_bus_message_is_method_error(m, NULL)) { + log_debug_errno(sd_bus_message_get_errno(m), + "Unable to add match %s, failing connection: %s", + match_slot->match_callback.match_string, + 
sd_bus_message_get_error(m)->message); + + failed = true; + } else + log_debug("Match %s successfully installed.", match_slot->match_callback.match_string); + + if (match_slot->match_callback.install_callback) { + sd_bus *bus; + + bus = sd_bus_message_get_bus(m); + + /* This function has been called as slot handler, and we want to call another slot handler. Let's + * update the slot callback metadata temporarily with our own data, and then revert back to the old + * values. */ + + assert(bus->current_slot == match_slot->match_callback.install_slot); + assert(bus->current_handler == add_match_callback); + assert(bus->current_userdata == userdata); + + bus->current_slot = match_slot; + bus->current_handler = match_slot->match_callback.install_callback; + bus->current_userdata = match_slot->userdata; + + r = match_slot->match_callback.install_callback(m, match_slot->userdata, ret_error); + + bus->current_slot = match_slot->match_callback.install_slot; + bus->current_handler = add_match_callback; + bus->current_userdata = userdata; + } else { + if (failed) /* Generic failure handling: destroy the connection */ + bus_enter_closing(sd_bus_message_get_bus(m)); + + r = 1; + } + + /* We don't need the install method reply slot anymore, let's free it */ + match_slot->match_callback.install_slot = sd_bus_slot_unref(match_slot->match_callback.install_slot); + + if (failed && match_slot->floating) + bus_slot_disconnect(match_slot, true); + + sd_bus_slot_unref(match_slot); + + return r; +} + +int bus_add_match_full( + sd_bus *bus, + sd_bus_slot **slot, + bool asynchronous, + const char *match, + sd_bus_message_handler_t callback, + sd_bus_message_handler_t install_callback, + void *userdata, + uint64_t timeout_usec) { + + struct bus_match_component *components = NULL; + size_t n_components = 0; + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot *s = NULL; + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(match, -EINVAL); + 
assert_return(!bus_origin_changed(bus), -ECHILD); + + CLEANUP_ARRAY(components, n_components, bus_match_parse_free); + + r = bus_match_parse(match, &components, &n_components); + if (r < 0) + return r; + + s = bus_slot_allocate(bus, !slot, BUS_MATCH_CALLBACK, sizeof(struct match_callback), userdata); + if (!s) + return -ENOMEM; + + s->match_callback.callback = callback; + s->match_callback.install_callback = install_callback; + + if (bus->bus_client) { + enum bus_match_scope scope; + + scope = bus_match_get_scope(components, n_components); + + /* Do not install server-side matches for matches against the local service, interface or bus path. */ + if (scope != BUS_MATCH_LOCAL) { + + /* We store the original match string, so that we can use it to remove the match again. */ + + s->match_callback.match_string = strdup(match); + if (!s->match_callback.match_string) + return -ENOMEM; + + if (asynchronous) { + r = bus_add_match_internal_async(bus, + &s->match_callback.install_slot, + s->match_callback.match_string, + add_match_callback, + s, + timeout_usec); + + if (r < 0) + return r; + + /* Make the slot of the match call floating now. We need the reference, but we don't + * want that this match pins the bus object, hence we first create it non-floating, but + * then make it floating. 
*/ + r = sd_bus_slot_set_floating(s->match_callback.install_slot, true); + } else + r = bus_add_match_internal(bus, + s->match_callback.match_string, + timeout_usec, + &s->match_callback.after); + if (r < 0) + return r; + + s->match_added = true; + } + } + + bus->match_callbacks_modified = true; + r = bus_match_add(&bus->match_callbacks, components, n_components, &s->match_callback); + if (r < 0) + return r; + + if (slot) + *slot = s; + s = NULL; + + return 0; +} + +_public_ int sd_bus_add_match( + sd_bus *bus, + sd_bus_slot **slot, + const char *match, + sd_bus_message_handler_t callback, + void *userdata) { + + return bus_add_match_full(bus, slot, false, match, callback, NULL, userdata, 0); +} + +_public_ int sd_bus_add_match_async( + sd_bus *bus, + sd_bus_slot **slot, + const char *match, + sd_bus_message_handler_t callback, + sd_bus_message_handler_t install_callback, + void *userdata) { + + return bus_add_match_full(bus, slot, true, match, callback, install_callback, userdata, 0); +} + +static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + /* Note that this is called both on input_fd, output_fd as well as inotify_fd events */ + + r = sd_bus_process(bus, NULL); + if (r < 0) { + log_debug_errno(r, "Processing of bus failed, closing down: %m"); + bus_enter_closing(bus); + } + + return 1; +} + +static int time_callback(sd_event_source *s, uint64_t usec, void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + r = sd_bus_process(bus, NULL); + if (r < 0) { + log_debug_errno(r, "Processing of bus failed, closing down: %m"); + bus_enter_closing(bus); + } + + return 1; +} + +static int prepare_callback(sd_event_source *s, void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + int r, e; + usec_t until; + + assert(s); + + e = sd_bus_get_events(bus); + if (e < 0) { + r = e; + goto fail; + } + + if (bus->output_fd != bus->input_fd) { + + r = 
sd_event_source_set_io_events(bus->input_io_event_source, e & POLLIN); + if (r < 0) + goto fail; + + r = sd_event_source_set_io_events(bus->output_io_event_source, e & POLLOUT); + } else + r = sd_event_source_set_io_events(bus->input_io_event_source, e); + if (r < 0) + goto fail; + + r = sd_bus_get_timeout(bus, &until); + if (r < 0) + goto fail; + if (r > 0) { + int j; + + j = sd_event_source_set_time(bus->time_event_source, until); + if (j < 0) { + r = j; + goto fail; + } + } + + r = sd_event_source_set_enabled(bus->time_event_source, r > 0 ? SD_EVENT_ONESHOT : SD_EVENT_OFF); + if (r < 0) + goto fail; + + return 1; + +fail: + log_debug_errno(r, "Preparing of bus events failed, closing down: %m"); + bus_enter_closing(bus); + + return 1; +} + +static int quit_callback(sd_event_source *event, void *userdata) { + sd_bus *bus = userdata; + + assert(event); + + if (bus->close_on_exit) { + sd_bus_flush(bus); + sd_bus_close(bus); + } + + return 1; +} + +int bus_attach_io_events(sd_bus *bus) { + int r; + + assert(bus); + + if (bus->input_fd < 0) + return 0; + + if (!bus->event) + return 0; + + if (!bus->input_io_event_source) { + r = sd_event_add_io(bus->event, &bus->input_io_event_source, bus->input_fd, 0, io_callback, bus); + if (r < 0) + return r; + + r = sd_event_source_set_prepare(bus->input_io_event_source, prepare_callback); + if (r < 0) + return r; + + r = sd_event_source_set_priority(bus->input_io_event_source, bus->event_priority); + if (r < 0) + return r; + + r = sd_event_source_set_description(bus->input_io_event_source, "bus-input"); + } else + r = sd_event_source_set_io_fd(bus->input_io_event_source, bus->input_fd); + + if (r < 0) + return r; + + if (bus->output_fd != bus->input_fd) { + assert(bus->output_fd >= 0); + + if (!bus->output_io_event_source) { + r = sd_event_add_io(bus->event, &bus->output_io_event_source, bus->output_fd, 0, io_callback, bus); + if (r < 0) + return r; + + r = sd_event_source_set_priority(bus->output_io_event_source, 
bus->event_priority); + if (r < 0) + return r; + + r = sd_event_source_set_description(bus->output_io_event_source, "bus-output"); + } else + r = sd_event_source_set_io_fd(bus->output_io_event_source, bus->output_fd); + + if (r < 0) + return r; + } + + return 0; +} + +static void bus_detach_io_events(sd_bus *bus) { + assert(bus); + + bus->input_io_event_source = sd_event_source_disable_unref(bus->input_io_event_source); + bus->output_io_event_source = sd_event_source_disable_unref(bus->output_io_event_source); +} + +int bus_attach_inotify_event(sd_bus *bus) { + int r; + + assert(bus); + + if (bus->inotify_fd < 0) + return 0; + + if (!bus->event) + return 0; + + if (!bus->inotify_event_source) { + r = sd_event_add_io(bus->event, &bus->inotify_event_source, bus->inotify_fd, EPOLLIN, io_callback, bus); + if (r < 0) + return r; + + r = sd_event_source_set_priority(bus->inotify_event_source, bus->event_priority); + if (r < 0) + return r; + + r = sd_event_source_set_description(bus->inotify_event_source, "bus-inotify"); + } else + r = sd_event_source_set_io_fd(bus->inotify_event_source, bus->inotify_fd); + if (r < 0) + return r; + + return 0; +} + +_public_ int sd_bus_attach_event(sd_bus *bus, sd_event *event, int priority) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus->event, -EBUSY); + + assert(!bus->input_io_event_source); + assert(!bus->output_io_event_source); + assert(!bus->time_event_source); + + if (event) + bus->event = sd_event_ref(event); + else { + r = sd_event_default(&bus->event); + if (r < 0) + return r; + } + + bus->event_priority = priority; + + r = sd_event_add_time(bus->event, &bus->time_event_source, CLOCK_MONOTONIC, 0, 0, time_callback, bus); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(bus->time_event_source, priority); + if (r < 0) + goto fail; + + r = sd_event_source_set_description(bus->time_event_source, "bus-time"); + if (r < 0) + goto fail; + + r = 
sd_event_add_exit(bus->event, &bus->quit_event_source, quit_callback, bus); + if (r < 0) + goto fail; + + r = sd_event_source_set_description(bus->quit_event_source, "bus-exit"); + if (r < 0) + goto fail; + + r = bus_attach_io_events(bus); + if (r < 0) + goto fail; + + r = bus_attach_inotify_event(bus); + if (r < 0) + goto fail; + + return 0; + +fail: + sd_bus_detach_event(bus); + return r; +} + +_public_ int sd_bus_detach_event(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + + if (!bus->event) + return 0; + + bus_detach_io_events(bus); + bus->inotify_event_source = sd_event_source_disable_unref(bus->inotify_event_source); + bus->time_event_source = sd_event_source_disable_unref(bus->time_event_source); + bus->quit_event_source = sd_event_source_disable_unref(bus->quit_event_source); + + bus->event = sd_event_unref(bus->event); + return 1; +} + +_public_ sd_event* sd_bus_get_event(sd_bus *bus) { + assert_return(bus = bus_resolve(bus), NULL); + + return bus->event; +} + +_public_ sd_bus_message* sd_bus_get_current_message(sd_bus *bus) { + assert_return(bus = bus_resolve(bus), NULL); + + return bus->current_message; +} + +_public_ sd_bus_slot* sd_bus_get_current_slot(sd_bus *bus) { + assert_return(bus = bus_resolve(bus), NULL); + + return bus->current_slot; +} + +_public_ sd_bus_message_handler_t sd_bus_get_current_handler(sd_bus *bus) { + assert_return(bus = bus_resolve(bus), NULL); + + return bus->current_handler; +} + +_public_ void* sd_bus_get_current_userdata(sd_bus *bus) { + assert_return(bus = bus_resolve(bus), NULL); + + return bus->current_userdata; +} + +static int bus_default(int (*bus_open)(sd_bus **), sd_bus **default_bus, sd_bus **ret) { + sd_bus *b = NULL; + int r; + + assert(bus_open); + assert(default_bus); + + if (!ret) + return !!*default_bus; + + if (*default_bus) { + *ret = sd_bus_ref(*default_bus); + return 0; + } + + r = bus_open(&b); + if (r < 0) + return r; + + b->default_bus_ptr = 
default_bus; + b->tid = gettid(); + *default_bus = b; + + *ret = b; + return 1; +} + +_public_ int sd_bus_default_system(sd_bus **ret) { + return bus_default(sd_bus_open_system, &default_system_bus, ret); +} + +_public_ int sd_bus_default_user(sd_bus **ret) { + return bus_default(sd_bus_open_user, &default_user_bus, ret); +} + +_public_ int sd_bus_default(sd_bus **ret) { + int (*bus_open)(sd_bus **) = NULL; + sd_bus **busp; + + busp = bus_choose_default(&bus_open); + return bus_default(bus_open, busp, ret); +} + +_public_ int sd_bus_get_tid(sd_bus *b, pid_t *tid) { + assert_return(b, -EINVAL); + assert_return(tid, -EINVAL); + assert_return(!bus_origin_changed(b), -ECHILD); + + if (b->tid != 0) { + *tid = b->tid; + return 0; + } + + if (b->event) + return sd_event_get_tid(b->event, tid); + + return -ENXIO; +} + +_public_ int sd_bus_path_encode(const char *prefix, const char *external_id, char **ret_path) { + _cleanup_free_ char *e = NULL; + char *ret; + + assert_return(object_path_is_valid(prefix), -EINVAL); + assert_return(external_id, -EINVAL); + assert_return(ret_path, -EINVAL); + + e = bus_label_escape(external_id); + if (!e) + return -ENOMEM; + + ret = path_join(prefix, e); + if (!ret) + return -ENOMEM; + + *ret_path = ret; + return 0; +} + +_public_ int sd_bus_path_decode(const char *path, const char *prefix, char **external_id) { + const char *e; + char *ret; + + assert_return(object_path_is_valid(path), -EINVAL); + assert_return(object_path_is_valid(prefix), -EINVAL); + assert_return(external_id, -EINVAL); + + e = object_path_startswith(path, prefix); + if (!e) { + *external_id = NULL; + return 0; + } + + /* Note that 'e' might be an empty string here. That's expected. E.g. a case where the subtree + * corresponds to a subtree on a disk, and we want to return something that represents the root + * of the filesystem. 
*/ + + ret = bus_label_unescape(e); + if (!ret) + return -ENOMEM; + + *external_id = ret; + return 1; +} + +_public_ int sd_bus_path_encode_many(char **out, const char *path_template, ...) { + _cleanup_strv_free_ char **labels = NULL; + char *path, *path_pos, **label_pos; + const char *sep, *template_pos; + size_t path_length; + va_list list; + int r; + + assert_return(out, -EINVAL); + assert_return(path_template, -EINVAL); + + path_length = strlen(path_template); + + va_start(list, path_template); + for (sep = strchr(path_template, '%'); sep; sep = strchr(sep + 1, '%')) { + const char *arg; + char *label; + + arg = va_arg(list, const char *); + if (!arg) { + va_end(list); + return -EINVAL; + } + + label = bus_label_escape(arg); + if (!label) { + va_end(list); + return -ENOMEM; + } + + r = strv_consume(&labels, label); + if (r < 0) { + va_end(list); + return r; + } + + /* add label length, but account for the format character */ + path_length += strlen(label) - 1; + } + va_end(list); + + path = malloc(path_length + 1); + if (!path) + return -ENOMEM; + + path_pos = path; + label_pos = labels; + + for (template_pos = path_template; *template_pos; ) { + sep = strchrnul(template_pos, '%'); + path_pos = mempcpy(path_pos, template_pos, sep - template_pos); + if (!*sep) + break; + + path_pos = stpcpy(path_pos, *label_pos++); + template_pos = sep + 1; + } + + *path_pos = 0; + *out = path; + return 0; +} + +_public_ int sd_bus_path_decode_many(const char *path, const char *path_template, ...) { + _cleanup_strv_free_ char **labels = NULL; + const char *template_pos, *path_pos; + char **label_pos; + va_list list; + int r; + + /* + * This decodes an object-path based on a template argument. The + * template consists of a verbatim path, optionally including special + * directives: + * + * - Each occurrence of '%' in the template matches an arbitrary + * substring of a label in the given path. At most one such + * directive is allowed per label. 
For each such directive, the + * caller must provide an output parameter (char **) via va_arg. If + * NULL is passed, the given label is verified, but not returned. + * For each matched label, the *decoded* label is stored in the + * passed output argument, and the caller is responsible to free + * it. Note that the output arguments are only modified if the + * actually path matched the template. Otherwise, they're left + * untouched. + * + * This function returns <0 on error, 0 if the path does not match the + * template, 1 if it matched. + */ + + assert_return(path, -EINVAL); + assert_return(path_template, -EINVAL); + + path_pos = path; + + for (template_pos = path_template; *template_pos; ) { + const char *sep; + size_t length; + char *label; + + /* verify everything until the next '%' matches verbatim */ + sep = strchrnul(template_pos, '%'); + length = sep - template_pos; + if (strncmp(path_pos, template_pos, length)) + return 0; + + path_pos += length; + template_pos += length; + + if (!*template_pos) + break; + + /* We found the next '%' character. Everything up until here + * matched. We now skip ahead to the end of this label and make + * sure it matches the tail of the label in the path. Then we + * decode the string in-between and save it for later use. 
*/ + + ++template_pos; /* skip over '%' */ + + sep = strchrnul(template_pos, '/'); + length = sep - template_pos; /* length of suffix to match verbatim */ + + /* verify the suffixes match */ + sep = strchrnul(path_pos, '/'); + if (sep - path_pos < (ssize_t)length || + strncmp(sep - length, template_pos, length)) + return 0; + + template_pos += length; /* skip over matched label */ + length = sep - path_pos - length; /* length of sub-label to decode */ + + /* store unescaped label for later use */ + label = bus_label_unescape_n(path_pos, length); + if (!label) + return -ENOMEM; + + r = strv_consume(&labels, label); + if (r < 0) + return r; + + path_pos = sep; /* skip decoded label and suffix */ + } + + /* end of template must match end of path */ + if (*path_pos) + return 0; + + /* copy the labels over to the caller */ + va_start(list, path_template); + for (label_pos = labels; label_pos && *label_pos; ++label_pos) { + char **arg; + + arg = va_arg(list, char **); + if (arg) + *arg = *label_pos; + else + free(*label_pos); + } + va_end(list); + + labels = mfree(labels); + return 1; +} + +_public_ int sd_bus_try_close(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return -EOPNOTSUPP; +} + +_public_ int sd_bus_get_description(sd_bus *bus, const char **description) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(description, -EINVAL); + + const char *d = bus->description; + if (!d) + d = runtime_scope_to_string(bus->runtime_scope); + if (!d) + return -ENXIO; + + *description = d; + return 0; +} + +_public_ int sd_bus_get_scope(sd_bus *bus, const char **scope) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(scope, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (bus->runtime_scope < 0) + return -ENODATA; + + *scope = 
runtime_scope_to_string(bus->runtime_scope); + return 0; +} + +_public_ int sd_bus_get_address(sd_bus *bus, const char **address) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(address, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (bus->address) { + *address = bus->address; + return 0; + } + + return -ENODATA; +} + +_public_ int sd_bus_get_creds_mask(sd_bus *bus, uint64_t *mask) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(mask, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + *mask = bus->creds_mask; + return 0; +} + +_public_ int sd_bus_is_bus_client(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->bus_client; +} + +_public_ int sd_bus_is_server(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->is_server; +} + +_public_ int sd_bus_is_anonymous(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->anonymous_auth; +} + +_public_ int sd_bus_is_trusted(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->trusted; +} + +_public_ int sd_bus_is_monitor(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + + return bus->is_monitor; +} + +static void flush_close(sd_bus *bus) { + if (!bus) + return; + + /* Flushes and closes the specified bus. We take a ref before, + * to ensure the flushing does not cause the bus to be + * unreferenced. 
*/ + + sd_bus_flush_close_unref(sd_bus_ref(bus)); +} + +_public_ void sd_bus_default_flush_close(void) { + flush_close(default_starter_bus); + flush_close(default_user_bus); + flush_close(default_system_bus); +} + +_public_ int sd_bus_set_exit_on_disconnect(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + + /* Turns on exit-on-disconnect, and triggers it immediately if the bus connection was already + * disconnected. Note that this is triggered exclusively on disconnections triggered by the server side, never + * from the client side. */ + bus->exit_on_disconnect = b; + + /* If the exit condition was triggered already, exit immediately. */ + return bus_exit_now(bus, /* event= */ NULL); +} + +_public_ int sd_bus_get_exit_on_disconnect(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + + return bus->exit_on_disconnect; +} + +_public_ int sd_bus_set_sender(sd_bus *bus, const char *sender) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus->bus_client, -EPERM); + assert_return(!sender || service_name_is_valid(sender), -EINVAL); + + return free_and_strdup(&bus->patch_sender, sender); +} + +_public_ int sd_bus_get_sender(sd_bus *bus, const char **ret) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(ret, -EINVAL); + + if (!bus->patch_sender) + return -ENODATA; + + *ret = bus->patch_sender; + return 0; +} + +_public_ int sd_bus_get_n_queued_read(sd_bus *bus, uint64_t *ret) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), -ECHILD); + assert_return(ret, -EINVAL); + + *ret = bus->rqueue_size; + return 0; +} + +_public_ int sd_bus_get_n_queued_write(sd_bus *bus, uint64_t *ret) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(!bus_origin_changed(bus), 
-ECHILD); + assert_return(ret, -EINVAL); + + *ret = bus->wqueue_size; + return 0; +} + +_public_ int sd_bus_set_method_call_timeout(sd_bus *bus, uint64_t usec) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + + bus->method_call_timeout = usec; + return 0; +} + +_public_ int sd_bus_get_method_call_timeout(sd_bus *bus, uint64_t *ret) { + const char *e; + usec_t usec; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(ret, -EINVAL); + + if (bus->method_call_timeout != 0) { + *ret = bus->method_call_timeout; + return 0; + } + + e = secure_getenv("SYSTEMD_BUS_TIMEOUT"); + if (e && parse_sec(e, &usec) >= 0 && usec != 0) { + /* Save the parsed value to avoid multiple parsing. To change the timeout value, + * use sd_bus_set_method_call_timeout() instead of setenv(). */ + *ret = bus->method_call_timeout = usec; + return 0; + } + + *ret = bus->method_call_timeout = BUS_DEFAULT_TIMEOUT; + return 0; +} + +_public_ int sd_bus_set_close_on_exit(sd_bus *bus, int b) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + + bus->close_on_exit = b; + return 0; +} + +_public_ int sd_bus_get_close_on_exit(sd_bus *bus) { + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + + return bus->close_on_exit; +} + +_public_ int sd_bus_enqueue_for_read(sd_bus *bus, sd_bus_message *m) { + int r; + + assert_return(bus, -EINVAL); + assert_return(bus = bus_resolve(bus), -ENOPKG); + assert_return(m, -EINVAL); + assert_return(m->sealed, -EINVAL); + assert_return(!bus_origin_changed(bus), -ECHILD); + + if (!BUS_IS_OPEN(bus->state)) + return -ENOTCONN; + + /* Re-enqueue a message for reading. This is primarily useful for PolicyKit-style authentication, + * where we accept a message, then determine we need to interactively authenticate the user, and then + * we want to process the message again. 
*/ + + r = bus_rqueue_make_room(bus); + if (r < 0) + return r; + + bus->rqueue[bus->rqueue_size++] = bus_message_ref_queued(m, bus); + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-address.c b/src/libsystemd/sd-bus/test-bus-address.c new file mode 100644 index 0000000..347ba1a --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-address.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "bus-internal.h" +#include "log.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +static void test_one_address(sd_bus *b, + const char *host, + int result, const char *expected) { + int r; + + r = bus_set_address_system_remote(b, host); + log_info("\"%s\" → %d, \"%s\"", host, r, strna(r >= 0 ? b->address : NULL)); + assert_se(r == result); + if (r >= 0) + assert_se(streq_ptr(b->address, expected)); +} + +TEST(bus_set_address_system_remote) { + _cleanup_(sd_bus_unrefp) sd_bus *b = NULL; + + assert_se(sd_bus_new(&b) >= 0); + if (!strv_isempty(saved_argv + 1)) { + STRV_FOREACH(a, saved_argv + 1) + test_one_address(b, *a, 0, NULL); + return; + }; + + test_one_address(b, "host", + 0, "unixexec:path=ssh,argv1=-xT,argv2=--,argv3=host,argv4=systemd-stdio-bridge"); + test_one_address(b, "host:123", + 0, "unixexec:path=ssh,argv1=-xT,argv2=-p,argv3=123,argv4=--,argv5=host,argv6=systemd-stdio-bridge"); + test_one_address(b, "host:123:123", + -EINVAL, NULL); + test_one_address(b, "host:", + -EINVAL, NULL); + test_one_address(b, "user@host", + 0, "unixexec:path=ssh,argv1=-xT,argv2=--,argv3=user%40host,argv4=systemd-stdio-bridge"); + test_one_address(b, "user@host@host", + -EINVAL, NULL); + test_one_address(b, "[::1]", + 0, "unixexec:path=ssh,argv1=-xT,argv2=--,argv3=%3a%3a1,argv4=systemd-stdio-bridge"); + test_one_address(b, "user@[::1]", + 0, "unixexec:path=ssh,argv1=-xT,argv2=--,argv3=user%40%3a%3a1,argv4=systemd-stdio-bridge"); + test_one_address(b, "user@[::1]:99", + 0, 
"unixexec:path=ssh,argv1=-xT,argv2=-p,argv3=99,argv4=--,argv5=user%40%3a%3a1,argv6=systemd-stdio-bridge"); + test_one_address(b, "user@[::1]:", + -EINVAL, NULL); + test_one_address(b, "user@[::1:", + -EINVAL, NULL); + test_one_address(b, "user@", + -EINVAL, NULL); + test_one_address(b, "user@@", + -EINVAL, NULL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/sd-bus/test-bus-benchmark.c b/src/libsystemd/sd-bus/test-bus-benchmark.c new file mode 100644 index 0000000..d988588 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-benchmark.c @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-internal.h" +#include "bus-kernel.h" +#include "constants.h" +#include "fd-util.h" +#include "missing_resource.h" +#include "string-util.h" +#include "tests.h" +#include "time-util.h" + +#define MAX_SIZE (2*1024*1024) + +static usec_t arg_loop_usec = 100 * USEC_PER_MSEC; + +typedef enum Type { + TYPE_LEGACY, + TYPE_DIRECT, +} Type; + +static void server(sd_bus *b, size_t *result) { + int r; + + for (;;) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + r = sd_bus_process(b, &m); + assert_se(r >= 0); + + if (r == 0) + assert_se(sd_bus_wait(b, USEC_INFINITY) >= 0); + if (!m) + continue; + + if (sd_bus_message_is_method_call(m, "benchmark.server", "Ping")) + assert_se(sd_bus_reply_method_return(m, NULL) >= 0); + else if (sd_bus_message_is_method_call(m, "benchmark.server", "Work")) { + const void *p; + size_t sz; + + /* Make sure the mmap is mapped */ + assert_se(sd_bus_message_read_array(m, 'y', &p, &sz) > 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + } else if (sd_bus_message_is_method_call(m, "benchmark.server", "Exit")) { + uint64_t res; + assert_se(sd_bus_message_read(m, "t", &res) > 0); + + *result = res; + return; + + } else if (!sd_bus_message_is_signal(m, NULL, NULL)) + assert_not_reached(); + } +} + +static void 
transaction(sd_bus *b, size_t sz, const char *server_name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + uint8_t *p; + + assert_se(sd_bus_message_new_method_call(b, &m, server_name, "/", "benchmark.server", "Work") >= 0); + assert_se(sd_bus_message_append_array_space(m, 'y', sz, (void**) &p) >= 0); + + memset(p, 0x80, sz); + + assert_se(sd_bus_call(b, m, 0, NULL, &reply) >= 0); +} + +static void client_bisect(const char *address, const char *server_name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *x = NULL; + size_t lsize, rsize, csize; + sd_bus *b; + int r; + + r = sd_bus_new(&b); + assert_se(r >= 0); + + r = sd_bus_set_address(b, address); + assert_se(r >= 0); + + r = sd_bus_start(b); + assert_se(r >= 0); + + r = sd_bus_call_method(b, server_name, "/", "benchmark.server", "Ping", NULL, NULL, NULL); + assert_se(r >= 0); + + lsize = 1; + rsize = MAX_SIZE; + + printf("SIZE\tCOPY\tMEMFD\n"); + + for (;;) { + usec_t t; + unsigned n_copying, n_memfd; + + csize = (lsize + rsize) / 2; + + if (csize <= lsize) + break; + + if (csize <= 0) + break; + + printf("%zu\t", csize); + + b->use_memfd = 0; + + t = now(CLOCK_MONOTONIC); + for (n_copying = 0;; n_copying++) { + transaction(b, csize, server_name); + if (now(CLOCK_MONOTONIC) >= t + arg_loop_usec) + break; + } + printf("%u\t", (unsigned) ((n_copying * USEC_PER_SEC) / arg_loop_usec)); + + b->use_memfd = -1; + + t = now(CLOCK_MONOTONIC); + for (n_memfd = 0;; n_memfd++) { + transaction(b, csize, server_name); + if (now(CLOCK_MONOTONIC) >= t + arg_loop_usec) + break; + } + printf("%u\n", (unsigned) ((n_memfd * USEC_PER_SEC) / arg_loop_usec)); + + if (n_copying == n_memfd) + break; + + if (n_copying > n_memfd) + lsize = csize; + else + rsize = csize; + } + + b->use_memfd = 1; + assert_se(sd_bus_message_new_method_call(b, &x, server_name, "/", "benchmark.server", "Exit") >= 0); + assert_se(sd_bus_message_append(x, "t", csize) >= 0); + assert_se(sd_bus_send(b, x, NULL) >= 0); + + 
sd_bus_unref(b); +} + +static void client_chart(Type type, const char *address, const char *server_name, int fd) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *x = NULL; + size_t csize; + sd_bus *b; + int r; + + r = sd_bus_new(&b); + assert_se(r >= 0); + + if (type == TYPE_DIRECT) { + r = sd_bus_set_fd(b, fd, fd); + assert_se(r >= 0); + } else { + r = sd_bus_set_address(b, address); + assert_se(r >= 0); + + r = sd_bus_set_bus_client(b, true); + assert_se(r >= 0); + } + + r = sd_bus_start(b); + assert_se(r >= 0); + + r = sd_bus_call_method(b, server_name, "/", "benchmark.server", "Ping", NULL, NULL, NULL); + assert_se(r >= 0); + + switch (type) { + case TYPE_LEGACY: + printf("SIZE\tLEGACY\n"); + break; + case TYPE_DIRECT: + printf("SIZE\tDIRECT\n"); + break; + } + + for (csize = 1; csize <= MAX_SIZE; csize *= 2) { + usec_t t; + unsigned n_memfd; + + printf("%zu\t", csize); + + t = now(CLOCK_MONOTONIC); + for (n_memfd = 0;; n_memfd++) { + transaction(b, csize, server_name); + if (now(CLOCK_MONOTONIC) >= t + arg_loop_usec) + break; + } + + printf("%u\n", (unsigned) ((n_memfd * USEC_PER_SEC) / arg_loop_usec)); + } + + b->use_memfd = 1; + assert_se(sd_bus_message_new_method_call(b, &x, server_name, "/", "benchmark.server", "Exit") >= 0); + assert_se(sd_bus_message_append(x, "t", csize) >= 0); + assert_se(sd_bus_send(b, x, NULL) >= 0); + + sd_bus_unref(b); +} + +int main(int argc, char *argv[]) { + enum { + MODE_BISECT, + MODE_CHART, + } mode = MODE_BISECT; + Type type = TYPE_LEGACY; + int i, pair[2] = EBADF_PAIR; + _cleanup_free_ char *address = NULL, *server_name = NULL; + _cleanup_close_ int bus_ref = -EBADF; + const char *unique; + cpu_set_t cpuset; + size_t result; + sd_bus *b; + pid_t pid; + int r; + + test_setup_logging(LOG_DEBUG); + + for (i = 1; i < argc; i++) { + if (streq(argv[i], "chart")) { + mode = MODE_CHART; + continue; + } else if (streq(argv[i], "legacy")) { + type = TYPE_LEGACY; + continue; + } else if (streq(argv[i], "direct")) { + type = 
TYPE_DIRECT; + continue; + } + + assert_se(parse_sec(argv[i], &arg_loop_usec) >= 0); + } + + assert_se(arg_loop_usec > 0); + + if (type == TYPE_LEGACY) { + const char *e; + + e = secure_getenv("DBUS_SESSION_BUS_ADDRESS"); + assert_se(e); + + address = strdup(e); + assert_se(address); + } + + r = sd_bus_new(&b); + assert_se(r >= 0); + + if (type == TYPE_DIRECT) { + assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, pair) >= 0); + + r = sd_bus_set_fd(b, pair[0], pair[0]); + assert_se(r >= 0); + + r = sd_bus_set_server(b, true, SD_ID128_NULL); + assert_se(r >= 0); + } else { + r = sd_bus_set_address(b, address); + assert_se(r >= 0); + + r = sd_bus_set_bus_client(b, true); + assert_se(r >= 0); + } + + r = sd_bus_start(b); + assert_se(r >= 0); + + if (type != TYPE_DIRECT) { + r = sd_bus_get_unique_name(b, &unique); + assert_se(r >= 0); + + server_name = strdup(unique); + assert_se(server_name); + } + + sync(); + setpriority(PRIO_PROCESS, 0, -19); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + + safe_close(bus_ref); + sd_bus_unref(b); + + switch (mode) { + case MODE_BISECT: + client_bisect(address, server_name); + break; + + case MODE_CHART: + client_chart(type, address, server_name, pair[1]); + break; + } + + _exit(EXIT_SUCCESS); + } + + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + + server(b, &result); + + if (mode == MODE_BISECT) + printf("Copying/memfd are equally fast at %zu bytes\n", result); + + assert_se(waitpid(pid, NULL, 0) == pid); + + safe_close(pair[1]); + sd_bus_unref(b); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-chat.c b/src/libsystemd/sd-bus/test-bus-chat.c new file mode 100644 index 0000000..da1340f --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-chat.c @@ -0,0 +1,539 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include 
+#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-match.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "tests.h" + +static int match_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + log_info("Match triggered! destination=%s interface=%s member=%s", + strna(sd_bus_message_get_destination(m)), + strna(sd_bus_message_get_interface(m)), + strna(sd_bus_message_get_member(m))); + return 0; +} + +static int object_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + int r; + + if (sd_bus_message_is_method_error(m, NULL)) + return 0; + + if (sd_bus_message_is_method_call(m, "org.object.test", "Foobar")) { + log_info("Invoked Foobar() on %s", sd_bus_message_get_path(m)); + + r = sd_bus_reply_method_return(m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + + return 1; + } + + return 0; +} + +static int server_init(sd_bus **ret_bus) { + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + const char *unique, *desc; + sd_id128_t id; + int r; + + assert_se(ret_bus); + + r = sd_bus_open_user_with_description(&bus, "my bus!"); + if (r < 0) + return log_error_errno(r, "Failed to connect to user bus: %m"); + + r = sd_bus_get_bus_id(bus, &id); + if (r < 0) + return log_error_errno(r, "Failed to get server ID: %m"); + + r = sd_bus_get_unique_name(bus, &unique); + if (r < 0) + return log_error_errno(r, "Failed to get unique name: %m"); + + assert_se(sd_bus_get_description(bus, &desc) >= 0); + assert_se(streq(desc, "my bus!")); + + log_info("Peer ID is " SD_ID128_FORMAT_STR ".", SD_ID128_FORMAT_VAL(id)); + log_info("Unique ID: %s", unique); + log_info("Can send file handles: %i", sd_bus_can_send(bus, 'h')); + + r = sd_bus_request_name(bus, "org.freedesktop.systemd.test", 0); + if (r < 0) + return log_error_errno(r, "Failed to 
acquire name: %m"); + + r = sd_bus_add_fallback(bus, NULL, "/foo/bar", object_callback, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add object: %m"); + + r = sd_bus_match_signal(bus, NULL, NULL, NULL, "foo.bar", "Notify", match_callback, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request match: %m"); + + r = sd_bus_match_signal(bus, NULL, NULL, NULL, "foo.bar", "NotifyTo", match_callback, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request match: %m"); + + r = sd_bus_add_match(bus, NULL, "type='signal',interface='org.freedesktop.DBus',member='NameOwnerChanged'", match_callback, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add match: %m"); + + bus_match_dump(stdout, &bus->match_callbacks, 0); + + *ret_bus = TAKE_PTR(bus); + return 0; +} + +static int server(sd_bus *_bus) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = ASSERT_PTR(_bus); + bool client1_gone = false, client2_gone = false; + int r; + + while (!client1_gone || !client2_gone) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid = 0; + const char *label = NULL; + + r = sd_bus_process(bus, &m); + if (r < 0) + return log_error_errno(r, "Failed to process requests: %m"); + if (r == 0) { + r = sd_bus_wait(bus, UINT64_MAX); + if (r < 0) + return log_error_errno(r, "Failed to wait: %m"); + + continue; + } + if (!m) + continue; + + r = sd_bus_query_sender_creds(m, SD_BUS_CREDS_AUGMENT | SD_BUS_CREDS_PID | SD_BUS_CREDS_SELINUX_CONTEXT, &creds); + if (r < 0) + log_debug_errno(r, "Failed to query sender credentials, ignoring: %m"); + else { + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return log_error_errno(r, "Failed to get sender pid: %m"); + + (void) sd_bus_creds_get_selinux_context(creds, &label); + } + + log_info("Got message! 
member=%s pid="PID_FMT" label=%s", + strna(sd_bus_message_get_member(m)), + pid, + strna(label)); + + /* sd_bus_message_dump(m); */ + /* sd_bus_message_rewind(m, true); */ + + if (sd_bus_message_is_method_call(m, "org.freedesktop.systemd.test", "LowerCase")) { + const char *hello; + _cleanup_free_ char *lowercase = NULL; + + r = sd_bus_message_read(m, "s", &hello); + if (r < 0) + return log_error_errno(r, "Failed to get parameter: %m"); + + lowercase = strdup(hello); + if (!lowercase) + return log_oom(); + + ascii_strlower(lowercase); + + r = sd_bus_reply_method_return(m, "s", lowercase); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + + } else if (sd_bus_message_is_method_call(m, "org.freedesktop.systemd.test", "ExitClient1")) { + + r = sd_bus_reply_method_return(m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + + client1_gone = true; + } else if (sd_bus_message_is_method_call(m, "org.freedesktop.systemd.test", "ExitClient2")) { + + r = sd_bus_reply_method_return(m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + + client2_gone = true; + } else if (sd_bus_message_is_method_call(m, "org.freedesktop.systemd.test", "Slow")) { + + sleep(1); + + r = sd_bus_reply_method_return(m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + + } else if (sd_bus_message_is_method_call(m, "org.freedesktop.systemd.test", "FileDescriptor")) { + int fd; + static const char x = 'X'; + + r = sd_bus_message_read(m, "h", &fd); + if (r < 0) + return log_error_errno(r, "Failed to get parameter: %m"); + + log_info("Received fd=%d", fd); + + if (write(fd, &x, 1) < 0) { + r = log_error_errno(errno, "Failed to write to fd: %m"); + safe_close(fd); + return r; + } + + r = sd_bus_reply_method_return(m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + + } else if (sd_bus_message_is_method_call(m, NULL, NULL)) { + + r = sd_bus_reply_method_error( + m, + 
&SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_UNKNOWN_METHOD, "Unknown method.")); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + } + } + + return 0; +} + +/* Client thread #1: calls LowerCase and FileDescriptor on the test service, then announces its exit via ExitClient1. Returns the status as INT_TO_PTR(r); negative means failure. */ +static void* client1(void *p) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *hello; + int r; + _cleanup_close_pair_ int pp[2] = EBADF_PAIR; + char x; + + r = sd_bus_open_user(&bus); + if (r < 0) { + log_error_errno(r, "Failed to connect to user bus: %m"); + goto finish; + } + + r = sd_bus_call_method( + bus, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "LowerCase", + &error, + &reply, + "s", + "HELLO"); + if (r < 0) { + log_error_errno(r, "Failed to issue method call: %m"); + goto finish; + } + + r = sd_bus_message_read(reply, "s", &hello); + if (r < 0) { + log_error_errno(r, "Failed to get string: %m"); + goto finish; + } + + assert_se(streq(hello, "hello")); + + if (pipe2(pp, O_CLOEXEC|O_NONBLOCK) < 0) { + r = log_error_errno(errno, "Failed to allocate pipe: %m"); + goto finish; + } + + log_info("Sending fd=%d", pp[1]); + + r = sd_bus_call_method( + bus, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "FileDescriptor", + &error, + NULL, + "h", + pp[1]); + if (r < 0) { + log_error_errno(r, "Failed to issue method call: %m"); + goto finish; + } + + errno = 0; + if (read(pp[0], &x, 1) <= 0) { + /* Record the failure before logging; a zero-byte read (EOF) leaves errno at 0, so fall back to -EIO. */ + r = errno > 0 ? -errno : -EIO; + log_error("Failed to read from pipe: %s", STRERROR_OR_EOF(errno)); + goto finish; + } + + r = 0; + +finish: + if (bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *q = NULL; + int k; + + /* Keep the notification's status in k so that it cannot overwrite an earlier failure in r. */ + k = sd_bus_message_new_method_call( + bus, + &q, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "ExitClient1"); + if (k < 0) { + log_error_errno(k, "Failed to allocate method call: %m"); + if (r >= 0) + r = k; + } else + (void) sd_bus_send(bus, q, NULL); + + } + + return INT_TO_PTR(r); +} + +static int
quit_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + bool *x = userdata; + + log_error_errno(sd_bus_message_get_errno(m), "Quit callback: %m"); + + *x = 1; + return 1; +} + +static void* client2(void *p) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool quit = false; + const char *mid; + int r; + + r = sd_bus_open_user(&bus); + if (r < 0) { + log_error_errno(r, "Failed to connect to user bus: %m"); + goto finish; + } + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.systemd.test", + "/foo/bar/waldo/piep", + "org.object.test", + "Foobar"); + if (r < 0) { + log_error_errno(r, "Failed to allocate method call: %m"); + goto finish; + } + + r = sd_bus_send(bus, m, NULL); + if (r < 0) { + log_error("Failed to issue method call: %s", bus_error_message(&error, r)); + goto finish; + } + + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_signal( + bus, + &m, + "/foobar", + "foo.bar", + "Notify"); + if (r < 0) { + log_error_errno(r, "Failed to allocate signal: %m"); + goto finish; + } + + r = sd_bus_send(bus, m, NULL); + if (r < 0) { + log_error("Failed to issue signal: %s", bus_error_message(&error, r)); + goto finish; + } + + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_signal_to( + bus, + &m, + "org.freedesktop.systemd.test", + "/foobar", + "foo.bar", + "NotifyTo"); + if (r < 0) { + log_error_errno(r, "Failed to allocate signal to: %m"); + goto finish; + } + + r = sd_bus_send(bus, m, NULL); + if (r < 0) { + log_error("Failed to issue signal to: %s", bus_error_message(&error, r)); + goto finish; + } + + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.DBus.Peer", + "GetMachineId"); + if (r < 0) { + log_error_errno(r, "Failed to allocate method call: %m"); + 
goto finish; + } + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) { + log_error("Failed to issue method call: %s", bus_error_message(&error, r)); + goto finish; + } + + r = sd_bus_message_read(reply, "s", &mid); + if (r < 0) { + log_error_errno(r, "Failed to parse machine ID: %m"); + goto finish; + } + + log_info("Machine ID is %s.", mid); + + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "Slow"); + if (r < 0) { + log_error_errno(r, "Failed to allocate method call: %m"); + goto finish; + } + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call(bus, m, 200 * USEC_PER_MSEC, &error, &reply); + if (r < 0) + log_debug("Failed to issue method call: %s", bus_error_message(&error, r)); + else { + log_error("Slow call unexpectedly succeed."); + r = -ENOANO; + goto finish; + } + + m = sd_bus_message_unref(m); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "Slow"); + if (r < 0) { + log_error_errno(r, "Failed to allocate method call: %m"); + goto finish; + } + + r = sd_bus_call_async(bus, NULL, m, quit_callback, &quit, 200 * USEC_PER_MSEC); + if (r < 0) { + log_info("Failed to issue method call: %s", bus_error_message(&error, r)); + goto finish; + } + + while (!quit) { + r = sd_bus_process(bus, NULL); + if (r < 0) { + log_error_errno(r, "Failed to process requests: %m"); + goto finish; + } + if (r == 0) { + r = sd_bus_wait(bus, UINT64_MAX); + if (r < 0) { + log_error_errno(r, "Failed to wait: %m"); + goto finish; + } + } + } + + r = 0; + +finish: + if (bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *q = NULL; + int k; + + /* Keep the notification's status in k so that it cannot overwrite an earlier failure in r. */ + k = sd_bus_message_new_method_call( + bus, + &q, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "ExitClient2"); + if (k < 0) { + log_error_errno(k, "Failed to allocate method call: %m"); + /* Must not jump back to 'finish' from here: the same allocation would be retried forever. */ + return INT_TO_PTR(r < 0 ? r : k); + } + + (void) 
sd_bus_send(bus, q, NULL); + } + + return INT_TO_PTR(r); +} + +int main(int argc, char *argv[]) { + pthread_t c1, c2; + sd_bus *bus; + void *p; + int q, r; + + test_setup_logging(LOG_INFO); + + r = server_init(&bus); + if (r < 0) + return log_tests_skipped("Failed to connect to bus"); + + log_info("Initialized..."); + + r = pthread_create(&c1, NULL, client1, bus); + if (r != 0) + return EXIT_FAILURE; + + r = pthread_create(&c2, NULL, client2, bus); + if (r != 0) + return EXIT_FAILURE; + + r = server(bus); + + q = pthread_join(c1, &p); + if (q != 0) + return EXIT_FAILURE; + if (PTR_TO_INT(p) < 0) + return EXIT_FAILURE; + + q = pthread_join(c2, &p); + if (q != 0) + return EXIT_FAILURE; + if (PTR_TO_INT(p) < 0) + return EXIT_FAILURE; + + if (r < 0) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/src/libsystemd/sd-bus/test-bus-cleanup.c b/src/libsystemd/sd-bus/test-bus-cleanup.c new file mode 100644 index 0000000..3e14627 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-cleanup.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "bus-internal.h" +#include "bus-message.h" +#include "process-util.h" +#include "tests.h" + +static bool use_system_bus = false; + +static void test_bus_new(void) { + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + + assert_se(sd_bus_new(&bus) == 0); + assert_se(bus->n_ref == 1); +} + +static void test_bus_fork(void) { + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + int r; + + assert_se(sd_bus_new(&bus) == 0); + assert_se(bus->n_ref == 1); + + /* Check that after a fork the cleanup functions return NULL */ + r = safe_fork("(bus-fork-test)", FORK_WAIT|FORK_LOG, NULL); + if (r == 0) { + assert_se(bus); + assert_se(sd_bus_is_ready(bus) == -ECHILD); + assert_se(sd_bus_flush_close_unref(bus) == NULL); + assert_se(sd_bus_close_unref(bus) == NULL); + assert_se(sd_bus_unref(bus) == NULL); + sd_bus_close(bus); + assert_se(bus->n_ref == 1); + _exit(EXIT_SUCCESS); 
+ } + + assert_se(r >= 0); + assert_se(bus->n_ref == 1); +} + +static int test_bus_open(void) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + r = sd_bus_open_user(&bus); + if (IN_SET(r, -ECONNREFUSED, -ENOENT, -ENOMEDIUM)) { + r = sd_bus_open_system(&bus); + if (IN_SET(r, -ECONNREFUSED, -ENOENT)) + return r; + use_system_bus = true; + } + + assert_se(r >= 0); + assert_se(bus->n_ref >= 1); /* we send a hello message when opening, so the count is above 1 */ + + return 0; +} + +static void test_bus_new_method_call(void) { + sd_bus *bus = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + assert_se(use_system_bus ? sd_bus_open_system(&bus) >= 0 : sd_bus_open_user(&bus) >= 0); + + assert_se(sd_bus_message_new_method_call(bus, &m, "a.service.name", "/an/object/path", "an.interface.name", "AMethodName") >= 0); + + assert_se(m->n_ref == 1); /* We hold the only reference to the message */ + assert_se(bus->n_ref >= 2); + sd_bus_flush_close_unref(bus); + assert_se(m->n_ref == 1); +} + +static void test_bus_new_signal(void) { + sd_bus *bus = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + assert_se(use_system_bus ? 
sd_bus_open_system(&bus) >= 0 : sd_bus_open_user(&bus) >= 0); + + assert_se(sd_bus_message_new_signal(bus, &m, "/an/object/path", "an.interface.name", "Name") >= 0); + + assert_se(m->n_ref == 1); /* We hold the only reference to the message */ + assert_se(bus->n_ref >= 2); + sd_bus_flush_close_unref(bus); + assert_se(m->n_ref == 1); +} + +int main(int argc, char **argv) { + test_setup_logging(LOG_INFO); + + test_bus_new(); + test_bus_fork(); + + if (test_bus_open() < 0) + return log_tests_skipped("Failed to connect to bus"); + + test_bus_new_method_call(); + test_bus_new_signal(); + + return EXIT_SUCCESS; +} diff --git a/src/libsystemd/sd-bus/test-bus-creds.c b/src/libsystemd/sd-bus/test-bus-creds.c new file mode 100644 index 0000000..13801be --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-creds.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "bus-dump.h" +#include "cgroup-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + int r; + + test_setup_logging(LOG_DEBUG); + + if (cg_unified() == -ENOMEDIUM) + return log_tests_skipped("/sys/fs/cgroup/ not available"); + + r = sd_bus_creds_new_from_pid(&creds, 0, _SD_BUS_CREDS_ALL); + log_full_errno(r < 0 ? 
LOG_ERR : LOG_DEBUG, r, "sd_bus_creds_new_from_pid: %m"); + assert_se(r >= 0); + + bus_creds_dump(creds, NULL, true); + + creds = sd_bus_creds_unref(creds); + + r = sd_bus_creds_new_from_pid(&creds, 1, _SD_BUS_CREDS_ALL); + if (r != -EACCES) { + assert_se(r >= 0); + putchar('\n'); + bus_creds_dump(creds, NULL, true); + } + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-error.c b/src/libsystemd/sd-bus/test-bus-error.c new file mode 100644 index 0000000..a55f3f9 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-error.c @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "errno-list.h" +#include "errno-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(error) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL, second = SD_BUS_ERROR_NULL; + const sd_bus_error const_error = SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_FILE_EXISTS, "const error"); + const sd_bus_error temporarily_const_error = { + .name = SD_BUS_ERROR_ACCESS_DENIED, + .message = "oh! 
no", + ._need_free = -1, + }; + + assert_se(!sd_bus_error_is_set(&error)); + assert_se(sd_bus_error_set(&error, SD_BUS_ERROR_NOT_SUPPORTED, "xxx") == -EOPNOTSUPP); + assert_se(streq(error.name, SD_BUS_ERROR_NOT_SUPPORTED)); + assert_se(streq(error.message, "xxx")); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_NOT_SUPPORTED)); + assert_se(sd_bus_error_has_names_sentinel(&error, SD_BUS_ERROR_NOT_SUPPORTED, NULL)); + assert_se(sd_bus_error_has_names(&error, SD_BUS_ERROR_NOT_SUPPORTED)); + assert_se(sd_bus_error_has_names(&error, SD_BUS_ERROR_NOT_SUPPORTED, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_has_names(&error, SD_BUS_ERROR_FILE_NOT_FOUND, SD_BUS_ERROR_NOT_SUPPORTED, NULL)); + assert_se(!sd_bus_error_has_names(&error, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_get_errno(&error) == EOPNOTSUPP); + assert_se(sd_bus_error_is_set(&error)); + sd_bus_error_free(&error); + + /* Check with no error */ + assert_se(!sd_bus_error_is_set(&error)); + assert_se(sd_bus_error_setf(&error, NULL, "yyy %i", -1) == 0); + assert_se(error.name == NULL); + assert_se(error.message == NULL); + assert_se(!sd_bus_error_has_name(&error, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(!sd_bus_error_has_names(&error, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_get_errno(&error) == 0); + assert_se(!sd_bus_error_is_set(&error)); + + assert_se(sd_bus_error_setf(&error, SD_BUS_ERROR_FILE_NOT_FOUND, "yyy %i", -1) == -ENOENT); + assert_se(streq(error.name, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(streq(error.message, "yyy -1")); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_has_names(&error, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_get_errno(&error) == ENOENT); + assert_se(sd_bus_error_is_set(&error)); + + assert_se(!sd_bus_error_is_set(&second)); + assert_se(second._need_free == 0); + assert_se(error._need_free > 0); + assert_se(sd_bus_error_copy(&second, &error) == -ENOENT); + 
assert_se(second._need_free > 0); + assert_se(streq(error.name, second.name)); + assert_se(streq(error.message, second.message)); + assert_se(sd_bus_error_get_errno(&second) == ENOENT); + assert_se(sd_bus_error_has_name(&second, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_has_names(&second, SD_BUS_ERROR_FILE_NOT_FOUND)); + assert_se(sd_bus_error_is_set(&second)); + + sd_bus_error_free(&error); + sd_bus_error_free(&second); + + assert_se(!sd_bus_error_is_set(&second)); + assert_se(const_error._need_free == 0); + assert_se(sd_bus_error_copy(&second, &const_error) == -EEXIST); + assert_se(second._need_free == 0); + assert_se(streq(const_error.name, second.name)); + assert_se(streq(const_error.message, second.message)); + assert_se(sd_bus_error_get_errno(&second) == EEXIST); + assert_se(sd_bus_error_has_name(&second, SD_BUS_ERROR_FILE_EXISTS)); + assert_se(sd_bus_error_is_set(&second)); + sd_bus_error_free(&second); + + assert_se(!sd_bus_error_is_set(&second)); + assert_se(temporarily_const_error._need_free < 0); + assert_se(sd_bus_error_copy(&second, &temporarily_const_error) == -EACCES); + assert_se(second._need_free > 0); + assert_se(streq(temporarily_const_error.name, second.name)); + assert_se(streq(temporarily_const_error.message, second.message)); + assert_se(sd_bus_error_get_errno(&second) == EACCES); + assert_se(sd_bus_error_has_name(&second, SD_BUS_ERROR_ACCESS_DENIED)); + assert_se(sd_bus_error_is_set(&second)); + + assert_se(!sd_bus_error_is_set(&error)); + assert_se(sd_bus_error_set_const(&error, "System.Error.EUCLEAN", "Hallo") == -EUCLEAN); + assert_se(streq(error.name, "System.Error.EUCLEAN")); + assert_se(streq(error.message, "Hallo")); + assert_se(sd_bus_error_has_name(&error, "System.Error.EUCLEAN")); + assert_se(sd_bus_error_get_errno(&error) == EUCLEAN); + assert_se(sd_bus_error_is_set(&error)); + sd_bus_error_free(&error); + + assert_se(!sd_bus_error_is_set(&error)); + assert_se(sd_bus_error_set_errno(&error, EBUSY) == -EBUSY); + 
assert_se(streq(error.name, "System.Error.EBUSY")); + assert_se(streq(error.message, STRERROR(EBUSY))); + assert_se(sd_bus_error_has_name(&error, "System.Error.EBUSY")); + assert_se(sd_bus_error_get_errno(&error) == EBUSY); + assert_se(sd_bus_error_is_set(&error)); + sd_bus_error_free(&error); + + assert_se(!sd_bus_error_is_set(&error)); + assert_se(sd_bus_error_set_errnof(&error, EIO, "Waldi %c", 'X') == -EIO); + assert_se(streq(error.name, SD_BUS_ERROR_IO_ERROR)); + assert_se(streq(error.message, "Waldi X")); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_IO_ERROR)); + assert_se(sd_bus_error_get_errno(&error) == EIO); + assert_se(sd_bus_error_is_set(&error)); + sd_bus_error_free(&error); + + /* Check with no error */ + assert_se(!sd_bus_error_is_set(&error)); + assert_se(sd_bus_error_set_errnof(&error, 0, "Waldi %c", 'X') == 0); + assert_se(error.name == NULL); + assert_se(error.message == NULL); + assert_se(!sd_bus_error_has_name(&error, SD_BUS_ERROR_IO_ERROR)); + assert_se(sd_bus_error_get_errno(&error) == 0); + assert_se(!sd_bus_error_is_set(&error)); +} + +extern const sd_bus_error_map __start_SYSTEMD_BUS_ERROR_MAP[]; +extern const sd_bus_error_map __stop_SYSTEMD_BUS_ERROR_MAP[]; + +static int dump_mapping_table(void) { + const sd_bus_error_map *m; + + printf("----- errno mappings ------\n"); + m = ALIGN_PTR(__start_SYSTEMD_BUS_ERROR_MAP); + while (m < __stop_SYSTEMD_BUS_ERROR_MAP) { + + if (m->code == BUS_ERROR_MAP_END_MARKER) { + m = ALIGN_PTR(m + 1); + continue; + } + + printf("%s -> %i/%s\n", strna(m->name), m->code, strna(errno_to_name(m->code))); + m++; + } + printf("---------------------------\n"); + + return EXIT_SUCCESS; +} + +TEST(errno_mapping_standard) { + assert_se(sd_bus_error_set(NULL, "System.Error.EUCLEAN", NULL) == -EUCLEAN); + assert_se(sd_bus_error_set(NULL, "System.Error.EBUSY", NULL) == -EBUSY); + assert_se(sd_bus_error_set(NULL, "System.Error.EINVAL", NULL) == -EINVAL); + assert_se(sd_bus_error_set(NULL, "System.Error.WHATSIT", 
NULL) == -EIO); +} + +BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map test_errors[] = { + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error", 5), + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-2", 52), + SD_BUS_ERROR_MAP_END +}; + +BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map test_errors2[] = { + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-3", 33), + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-4", 44), + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-33", 333), + SD_BUS_ERROR_MAP_END +}; + +static const sd_bus_error_map test_errors3[] = { + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-88", 888), + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-99", 999), + SD_BUS_ERROR_MAP_END +}; + +static const sd_bus_error_map test_errors4[] = { + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-77", 777), + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-78", 778), + SD_BUS_ERROR_MAP_END +}; + +static const sd_bus_error_map test_errors_bad1[] = { + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-1", 0), + SD_BUS_ERROR_MAP_END +}; + +static const sd_bus_error_map test_errors_bad2[] = { + SD_BUS_ERROR_MAP("org.freedesktop.custom-dbus-error-1", -1), + SD_BUS_ERROR_MAP_END +}; + +TEST(errno_mapping_custom) { + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error", NULL) == -5); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-2", NULL) == -52); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-x", NULL) == -EIO); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-33", NULL) == -333); + + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-88", NULL) == -EIO); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-99", NULL) == -EIO); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-77", NULL) == -EIO); + + assert_se(sd_bus_error_add_map(test_errors3) > 0); + assert_se(sd_bus_error_set(NULL, 
"org.freedesktop.custom-dbus-error-88", NULL) == -888); + assert_se(sd_bus_error_add_map(test_errors4) > 0); + assert_se(sd_bus_error_add_map(test_errors4) == 0); + assert_se(sd_bus_error_add_map(test_errors3) == 0); + + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-99", NULL) == -999); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-77", NULL) == -777); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-78", NULL) == -778); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-2", NULL) == -52); + assert_se(sd_bus_error_set(NULL, "org.freedesktop.custom-dbus-error-y", NULL) == -EIO); + + assert_se(sd_bus_error_set(NULL, BUS_ERROR_NO_SUCH_UNIT, NULL) == -ENOENT); + + assert_se(sd_bus_error_add_map(test_errors_bad1) == -EINVAL); + assert_se(sd_bus_error_add_map(test_errors_bad2) == -EINVAL); +} + +TEST(sd_bus_error_set_errnof) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *str = NULL; + + assert_se(sd_bus_error_set_errnof(NULL, 0, NULL) == 0); + assert_se(sd_bus_error_set_errnof(NULL, ENOANO, NULL) == -ENOANO); + + assert_se(sd_bus_error_set_errnof(&error, 0, NULL) == 0); + assert_se(!bus_error_is_dirty(&error)); + + assert_se(sd_bus_error_set_errnof(&error, EACCES, NULL) == -EACCES); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_ACCESS_DENIED)); + errno = EACCES; + assert_se(asprintf(&str, "%m") >= 0); + assert_se(streq(error.message, str)); + assert_se(error._need_free == 0); + + str = mfree(str); + sd_bus_error_free(&error); + + assert_se(sd_bus_error_set_errnof(&error, ENOANO, NULL) == -ENOANO); + assert_se(sd_bus_error_has_name(&error, "System.Error.ENOANO")); + errno = ENOANO; + assert_se(asprintf(&str, "%m") >= 0); + assert_se(streq(error.message, str)); + assert_se(error._need_free == 1); + + str = mfree(str); + sd_bus_error_free(&error); + + assert_se(sd_bus_error_set_errnof(&error, 100000, NULL) == -100000); + 
assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_FAILED)); + errno = 100000; + assert_se(asprintf(&str, "%m") >= 0); + assert_se(streq(error.message, str)); + assert_se(error._need_free == 1); + + str = mfree(str); + sd_bus_error_free(&error); + + assert_se(sd_bus_error_set_errnof(NULL, 0, "hoge %s: %m", "foo") == 0); + assert_se(sd_bus_error_set_errnof(NULL, ENOANO, "hoge %s: %m", "foo") == -ENOANO); + + assert_se(sd_bus_error_set_errnof(&error, 0, "hoge %s: %m", "foo") == 0); + assert_se(!bus_error_is_dirty(&error)); + + assert_se(sd_bus_error_set_errnof(&error, EACCES, "hoge %s: %m", "foo") == -EACCES); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_ACCESS_DENIED)); + errno = EACCES; + assert_se(asprintf(&str, "hoge %s: %m", "foo") >= 0); + assert_se(streq(error.message, str)); + assert_se(error._need_free == 1); + + str = mfree(str); + sd_bus_error_free(&error); + + assert_se(sd_bus_error_set_errnof(&error, ENOANO, "hoge %s: %m", "foo") == -ENOANO); + assert_se(sd_bus_error_has_name(&error, "System.Error.ENOANO")); + errno = ENOANO; + assert_se(asprintf(&str, "hoge %s: %m", "foo") >= 0); + assert_se(streq(error.message, str)); + assert_se(error._need_free == 1); + + str = mfree(str); + sd_bus_error_free(&error); + + assert_se(sd_bus_error_set_errnof(&error, 100000, "hoge %s: %m", "foo") == -100000); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_FAILED)); + errno = 100000; + assert_se(asprintf(&str, "hoge %s: %m", "foo") >= 0); + assert_se(streq(error.message, str)); + assert_se(error._need_free == 1); +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, dump_mapping_table); diff --git a/src/libsystemd/sd-bus/test-bus-introspect.c b/src/libsystemd/sd-bus/test-bus-introspect.c new file mode 100644 index 0000000..3c026ae --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-introspect.c @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-introspect.h" +#include "log.h" +#include "tests.h" + +#include "test-vtable-data.h" + 
+static void test_manual_introspection_one(const sd_bus_vtable vtable[]) { + struct introspect intro = {}; + _cleanup_free_ char *s = NULL; + + log_info("/* %s */", __func__); + + assert_se(introspect_begin(&intro, false) >= 0); + + assert_se(introspect_write_interface(&intro, "org.foo", vtable) >= 0); + /* write again to check if output looks OK for a different interface */ + assert_se(introspect_write_interface(&intro, "org.foo.bar", vtable) >= 0); + assert_se(introspect_finish(&intro, &s) == 0); + + fputs(s, stdout); + fputs("\n", stdout); +} + +TEST(manual_introspection) { + test_manual_introspection_one(test_vtable_1); + test_manual_introspection_one(test_vtable_2); + test_manual_introspection_one(test_vtable_deprecated); + test_manual_introspection_one((const sd_bus_vtable *) vtable_format_221); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/libsystemd/sd-bus/test-bus-marshal.c b/src/libsystemd/sd-bus/test-bus-marshal.c new file mode 100644 index 0000000..0044d33 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-marshal.c @@ -0,0 +1,418 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#if HAVE_GLIB +#include +#endif + +#if HAVE_DBUS +#include +#endif + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-dump.h" +#include "bus-label.h" +#include "bus-message.h" +#include "bus-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "memstream-util.h" +#include "tests.h" + +static void test_bus_path_encode_unique(void) { + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL, *d = NULL, *e = NULL; + + assert_se(bus_path_encode_unique(NULL, "/foo/bar", "some.sender", "a.suffix", &a) >= 0 && streq_ptr(a, "/foo/bar/some_2esender/a_2esuffix")); + assert_se(bus_path_decode_unique(a, "/foo/bar", &b, &c) > 0 && streq_ptr(b, "some.sender") && streq_ptr(c, "a.suffix")); + assert_se(bus_path_decode_unique(a, "/bar/foo", &d, &d) == 0 && !d); + 
assert_se(bus_path_decode_unique("/foo/bar/onlyOneSuffix", "/foo/bar", &d, &d) == 0 && !d); + assert_se(bus_path_decode_unique("/foo/bar/_/_", "/foo/bar", &d, &e) > 0 && streq_ptr(d, "") && streq_ptr(e, "")); +} + +static void test_bus_path_encode(void) { + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL, *d = NULL, *e = NULL, *f = NULL, *g = NULL; + + assert_se(sd_bus_path_encode("/foo/bar", "waldo", &a) >= 0 && streq(a, "/foo/bar/waldo")); + assert_se(sd_bus_path_decode(a, "/waldo", &b) == 0 && b == NULL); + assert_se(sd_bus_path_decode(a, "/foo/bar", &b) > 0 && streq(b, "waldo")); + + assert_se(sd_bus_path_encode("xxxx", "waldo", &c) < 0); + assert_se(sd_bus_path_encode("/foo/", "waldo", &c) < 0); + + assert_se(sd_bus_path_encode("/foo/bar", "", &c) >= 0 && streq(c, "/foo/bar/_")); + assert_se(sd_bus_path_decode(c, "/foo/bar", &d) > 0 && streq(d, "")); + + assert_se(sd_bus_path_encode("/foo/bar", "foo.bar", &e) >= 0 && streq(e, "/foo/bar/foo_2ebar")); + assert_se(sd_bus_path_decode(e, "/foo/bar", &f) > 0 && streq(f, "foo.bar")); + + assert_se(sd_bus_path_decode("/waldo", "/waldo", &g) > 0 && streq(g, "")); +} + +static void test_bus_path_encode_many(void) { + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL, *d = NULL, *e = NULL, *f = NULL; + + assert_se(sd_bus_path_decode_many("/foo/bar", "/prefix/%", NULL) == 0); + assert_se(sd_bus_path_decode_many("/prefix/bar", "/prefix/%bar", NULL) == 1); + assert_se(sd_bus_path_decode_many("/foo/bar", "/prefix/%/suffix", NULL) == 0); + assert_se(sd_bus_path_decode_many("/prefix/foobar/suffix", "/prefix/%/suffix", &a) == 1 && streq_ptr(a, "foobar")); + assert_se(sd_bus_path_decode_many("/prefix/one_foo_two/mid/three_bar_four/suffix", "/prefix/one_%_two/mid/three_%_four/suffix", &b, &c) == 1 && streq_ptr(b, "foo") && streq_ptr(c, "bar")); + assert_se(sd_bus_path_decode_many("/prefix/one_foo_two/mid/three_bar_four/suffix", "/prefix/one_%_two/mid/three_%_four/suffix", NULL, &d) == 1 && streq_ptr(d, "bar")); + + 
assert_se(sd_bus_path_decode_many("/foo/bar", "/foo/bar/%", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/bar%", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/%/bar", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/%bar", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/bar/suffix") == 1); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/%%/suffix", NULL, NULL) == 0); /* multiple '%' are treated verbatim */ + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/%/suffi", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/%/suffix", &e) == 1 && streq_ptr(e, "bar")); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/foo/%/%", NULL, NULL) == 1); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/%/%/%", NULL, NULL, NULL) == 1); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "%/%/%", NULL, NULL, NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/%/%", NULL, NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/%/%/", NULL, NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/%/", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "/%", NULL) == 0); + assert_se(sd_bus_path_decode_many("/foo/bar/suffix", "%", NULL) == 0); + + assert_se(sd_bus_path_encode_many(&f, "/prefix/one_%_two/mid/three_%_four/suffix", "foo", "bar") >= 0 && streq_ptr(f, "/prefix/one_foo_two/mid/three_bar_four/suffix")); +} + +static void test_bus_label_escape_one(const char *a, const char *b) { + _cleanup_free_ char *t = NULL, *x = NULL, *y = NULL; + + assert_se(t = bus_label_escape(a)); + assert_se(streq(t, b)); + + assert_se(x = bus_label_unescape(t)); + assert_se(streq(a, x)); + + assert_se(y = bus_label_unescape(b)); + assert_se(streq(a, y)); +} + +static void test_bus_label_escape(void) { + test_bus_label_escape_one("foo123bar", "foo123bar"); + 
test_bus_label_escape_one("foo.bar", "foo_2ebar"); + test_bus_label_escape_one("foo_2ebar", "foo_5f2ebar"); + test_bus_label_escape_one("", "_"); + test_bus_label_escape_one("_", "_5f"); + test_bus_label_escape_one("1", "_31"); + test_bus_label_escape_one(":1", "_3a1"); +} + +int main(int argc, char *argv[]) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *copy = NULL; + _cleanup_free_ char *h = NULL, *first = NULL, *second = NULL, *third = NULL; + const int32_t integer_array[] = { -1, -2, 0, 1, 2 }, *return_array; + const char *x, *x2, *y, *z, *a, *b, *c, *d, *a_signature; + size_t sz, first_size, second_size = 0, third_size = 0; + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + _cleanup_(memstream_done) MemStream ms = {}; + void *buffer = NULL; + int r, boolean; + uint64_t u64; + uint8_t u, v; + double dbl; + FILE *mf; + char *s; + + test_setup_logging(LOG_INFO); + + r = sd_bus_default_user(&bus); + if (r < 0) + r = sd_bus_default_system(&bus); + if (r < 0) + return log_tests_skipped("Failed to connect to bus"); + + r = sd_bus_message_new_method_call(bus, &m, "foobar.waldo", "/", "foobar.waldo", "Piep"); + assert_se(r >= 0); + + r = sd_bus_message_append(m, ""); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "s", "a string"); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "s", NULL); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "asg", 2, "string #1", "string #2", "sba(tt)ss"); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "sass", "foobar", 5, "foo", "bar", "waldo", "piep", "pap", "after"); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "a{yv}", 2, 3, "s", "foo", 5, "s", "waldo"); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "y(ty)y(yt)y", 8, 777ULL, 7, 9, 77, 7777ULL, 10); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "()"); + assert_se(r == -EINVAL); + + r = sd_bus_message_append(m, "ba(ss)", 255, 3, "aaa", "1", "bbb", "2", "ccc", "3"); + assert_se(r >= 0); + + r = 
sd_bus_message_open_container(m, 'a', "s"); + assert_se(r >= 0); + + r = sd_bus_message_append_basic(m, 's', "foobar"); + assert_se(r >= 0); + + r = sd_bus_message_append_basic(m, 's', "waldo"); + assert_se(r >= 0); + + r = sd_bus_message_close_container(m); + assert_se(r >= 0); + + r = sd_bus_message_append_string_space(m, 5, &s); + assert_se(r >= 0); + strcpy(s, "hallo"); + + r = sd_bus_message_append_array(m, 'i', integer_array, sizeof(integer_array)); + assert_se(r >= 0); + + r = sd_bus_message_append_array(m, 'u', NULL, 0); + assert_se(r >= 0); + + r = sd_bus_message_append(m, "a(stdo)", 1, "foo", 815ULL, 47.0, "/"); + assert_se(r >= 0); + + r = sd_bus_message_seal(m, 4711, 0); + assert_se(r >= 0); + + sd_bus_message_dump(m, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + assert_se(mf = memstream_init(&ms)); + sd_bus_message_dump(m, mf, 0); + assert_se(memstream_finalize(&ms, &first, &first_size) >= 0); + + r = bus_message_get_blob(m, &buffer, &sz); + assert_se(r >= 0); + + h = cescape_length(buffer, sz); + assert_se(h); + log_info("message size = %zu, contents =\n%s", sz, h); + +#if HAVE_GLIB + /* Work-around for asan bug. See c8d980a3e962aba2ea3a4cedf75fa94890a6d746. 
*/ +#if !HAS_FEATURE_ADDRESS_SANITIZER + { + GDBusMessage *g; + char *p; + +#if !defined(GLIB_VERSION_2_36) + g_type_init(); +#endif + + g = g_dbus_message_new_from_blob(buffer, sz, 0, NULL); + p = g_dbus_message_print(g, 0); + log_info("%s", p); + g_free(p); + g_object_unref(g); + } +#endif +#endif + +#if HAVE_DBUS + { + DBusMessage *w; + DBusError error; + + dbus_error_init(&error); + + w = dbus_message_demarshal(buffer, sz, &error); + if (!w) + log_error("%s", error.message); + else + dbus_message_unref(w); + + dbus_error_free(&error); + } +#endif + + m = sd_bus_message_unref(m); + + r = bus_message_from_malloc(bus, buffer, sz, NULL, 0, NULL, &m); + assert_se(r >= 0); + + sd_bus_message_dump(m, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + assert_se(mf = memstream_init(&ms)); + sd_bus_message_dump(m, mf, 0); + assert_se(memstream_finalize(&ms, &second, &second_size) >= 0); + assert_se(first_size == second_size); + assert_se(memcmp(first, second, first_size) == 0); + + assert_se(sd_bus_message_rewind(m, true) >= 0); + + r = sd_bus_message_read(m, "ssasg", &x, &x2, 2, &y, &z, &a_signature); + assert_se(r > 0); + assert_se(streq(x, "a string")); + assert_se(streq(x2, "")); + assert_se(streq(y, "string #1")); + assert_se(streq(z, "string #2")); + assert_se(streq(a_signature, "sba(tt)ss")); + + r = sd_bus_message_read(m, "sass", &x, 5, &y, &z, &a, &b, &c, &d); + assert_se(r > 0); + assert_se(streq(x, "foobar")); + assert_se(streq(y, "foo")); + assert_se(streq(z, "bar")); + assert_se(streq(a, "waldo")); + assert_se(streq(b, "piep")); + assert_se(streq(c, "pap")); + assert_se(streq(d, "after")); + + r = sd_bus_message_read(m, "a{yv}", 2, &u, "s", &x, &v, "s", &y); + assert_se(r > 0); + assert_se(u == 3); + assert_se(streq(x, "foo")); + assert_se(v == 5); + assert_se(streq(y, "waldo")); + + r = sd_bus_message_read(m, "y(ty)", &v, &u64, &u); + assert_se(r > 0); + assert_se(v == 8); + assert_se(u64 == 777); + assert_se(u == 7); + + r = sd_bus_message_read(m, "y(yt)", &v, 
&u, &u64); + assert_se(r > 0); + assert_se(v == 9); + assert_se(u == 77); + assert_se(u64 == 7777); + + r = sd_bus_message_read(m, "y", &v); + assert_se(r > 0); + assert_se(v == 10); + + r = sd_bus_message_read(m, "()"); + assert_se(r < 0); + + r = sd_bus_message_read(m, "ba(ss)", &boolean, 3, &x, &y, &a, &b, &c, &d); + assert_se(r > 0); + assert_se(boolean); + assert_se(streq(x, "aaa")); + assert_se(streq(y, "1")); + assert_se(streq(a, "bbb")); + assert_se(streq(b, "2")); + assert_se(streq(c, "ccc")); + assert_se(streq(d, "3")); + + assert_se(sd_bus_message_verify_type(m, 'a', "s") > 0); + + r = sd_bus_message_read(m, "as", 2, &x, &y); + assert_se(r > 0); + assert_se(streq(x, "foobar")); + assert_se(streq(y, "waldo")); + + r = sd_bus_message_read_basic(m, 's', &s); + assert_se(r > 0); + assert_se(streq(s, "hallo")); + + r = sd_bus_message_read_array(m, 'i', (const void**) &return_array, &sz); + assert_se(r > 0); + assert_se(sz == sizeof(integer_array)); + assert_se(memcmp(integer_array, return_array, sz) == 0); + + r = sd_bus_message_read_array(m, 'u', (const void**) &return_array, &sz); + assert_se(r > 0); + assert_se(sz == 0); + + r = sd_bus_message_read(m, "a(stdo)", 1, &x, &u64, &dbl, &y); + assert_se(r > 0); + assert_se(streq(x, "foo")); + assert_se(u64 == 815ULL); + assert_se(fabs(dbl - 47.0) < 0.1); + assert_se(streq(y, "/")); + + r = sd_bus_message_peek_type(m, NULL, NULL); + assert_se(r == 0); + + r = sd_bus_message_new_method_call(bus, &copy, "foobar.waldo", "/", "foobar.waldo", "Piep"); + assert_se(r >= 0); + + r = sd_bus_message_rewind(m, true); + assert_se(r >= 0); + + r = sd_bus_message_copy(copy, m, true); + assert_se(r >= 0); + + r = sd_bus_message_seal(copy, 4712, 0); + assert_se(r >= 0); + + assert_se(mf = memstream_init(&ms)); + sd_bus_message_dump(copy, mf, 0); + assert_se(memstream_finalize(&ms, &third, &third_size) >= 0); + + printf("<%.*s>\n", (int) first_size, first); + printf("<%.*s>\n", (int) third_size, third); + + assert_se(first_size == 
third_size); + assert_se(memcmp(first, third, third_size) == 0); + + r = sd_bus_message_rewind(m, true); + assert_se(r >= 0); + + assert_se(sd_bus_message_verify_type(m, 's', NULL) > 0); + + r = sd_bus_message_skip(m, "ssasg"); + assert_se(r > 0); + + assert_se(sd_bus_message_verify_type(m, 's', NULL) > 0); + + r = sd_bus_message_skip(m, "sass"); + assert_se(r >= 0); + + assert_se(sd_bus_message_verify_type(m, 'a', "{yv}") > 0); + + r = sd_bus_message_skip(m, "a{yv}y(ty)y(yt)y"); + assert_se(r >= 0); + + assert_se(sd_bus_message_verify_type(m, 'b', NULL) > 0); + + r = sd_bus_message_read(m, "b", &boolean); + assert_se(r > 0); + assert_se(boolean); + + r = sd_bus_message_enter_container(m, 0, NULL); + assert_se(r > 0); + + r = sd_bus_message_read(m, "(ss)", &x, &y); + assert_se(r > 0); + + r = sd_bus_message_read(m, "(ss)", &a, &b); + assert_se(r > 0); + + r = sd_bus_message_read(m, "(ss)", &c, &d); + assert_se(r > 0); + + r = sd_bus_message_read(m, "(ss)", &x, &y); + assert_se(r == 0); + + r = sd_bus_message_exit_container(m); + assert_se(r >= 0); + + assert_se(streq(x, "aaa")); + assert_se(streq(y, "1")); + assert_se(streq(a, "bbb")); + assert_se(streq(b, "2")); + assert_se(streq(c, "ccc")); + assert_se(streq(d, "3")); + + test_bus_label_escape(); + test_bus_path_encode(); + test_bus_path_encode_unique(); + test_bus_path_encode_many(); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-match.c b/src/libsystemd/sd-bus/test-bus-match.c new file mode 100644 index 0000000..2d77557 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-match.c @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-match.h" +#include "bus-message.h" +#include "bus-slot.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "tests.h" + +static bool mask[32]; + +static int filter(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + log_info("Ran %u", PTR_TO_UINT(userdata)); + assert_se(PTR_TO_UINT(userdata) < 
ELEMENTSOF(mask)); + mask[PTR_TO_UINT(userdata)] = true; + return 0; +} + +static bool mask_contains(unsigned a[], unsigned n) { + unsigned i, j; + + for (i = 0; i < ELEMENTSOF(mask); i++) { + bool found = false; + + for (j = 0; j < n; j++) + if (a[j] == i) { + found = true; + break; + } + + if (found != mask[i]) + return false; + } + + return true; +} + +static int match_add(sd_bus_slot *slots, struct bus_match_node *root, const char *match, int value) { + struct bus_match_component *components; + size_t n_components; + sd_bus_slot *s; + int r; + + s = slots + value; + + r = bus_match_parse(match, &components, &n_components); + if (r < 0) + return r; + + CLEANUP_ARRAY(components, n_components, bus_match_parse_free); + + s->userdata = INT_TO_PTR(value); + s->match_callback.callback = filter; + + return bus_match_add(root, components, n_components, &s->match_callback); +} + +static void test_match_scope(const char *match, enum bus_match_scope scope) { + struct bus_match_component *components = NULL; + size_t n_components = 0; + + CLEANUP_ARRAY(components, n_components, bus_match_parse_free); + + assert_se(bus_match_parse(match, &components, &n_components) >= 0); + assert_se(bus_match_get_scope(components, n_components) == scope); +} + +int main(int argc, char *argv[]) { + struct bus_match_node root = { + .type = BUS_MATCH_ROOT, + }; + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + sd_bus_slot slots[19] = {}; + int r; + + test_setup_logging(LOG_INFO); + + r = sd_bus_open_user(&bus); + if (r < 0) + r = sd_bus_open_system(&bus); + if (r < 0) + return log_tests_skipped("Failed to connect to bus"); + + assert_se(match_add(slots, &root, "arg2='wal\\'do',sender='foo',type='signal',interface='bar.x',", 1) >= 0); + assert_se(match_add(slots, &root, "arg2='wal\\'do2',sender='foo',type='signal',interface='bar.x',", 2) >= 0); + assert_se(match_add(slots, &root, 
"arg3='test',sender='foo',type='signal',interface='bar.x',", 3) >= 0); + assert_se(match_add(slots, &root, "arg3='test',sender='foo',type='method_call',interface='bar.x',", 4) >= 0); + assert_se(match_add(slots, &root, "", 5) >= 0); + assert_se(match_add(slots, &root, "interface='quux.x'", 6) >= 0); + assert_se(match_add(slots, &root, "interface='bar.x'", 7) >= 0); + assert_se(match_add(slots, &root, "member='waldo',path='/foo/bar'", 8) >= 0); + assert_se(match_add(slots, &root, "path='/foo/bar'", 9) >= 0); + assert_se(match_add(slots, &root, "path_namespace='/foo'", 10) >= 0); + assert_se(match_add(slots, &root, "path_namespace='/foo/quux'", 11) >= 0); + assert_se(match_add(slots, &root, "arg1='two'", 12) >= 0); + assert_se(match_add(slots, &root, "member='waldo',arg2path='/prefix/'", 13) >= 0); + assert_se(match_add(slots, &root, "member=waldo,path='/foo/bar',arg3namespace='prefix'", 14) >= 0); + assert_se(match_add(slots, &root, "arg4has='pi'", 15) >= 0); + assert_se(match_add(slots, &root, "arg4has='pa'", 16) >= 0); + assert_se(match_add(slots, &root, "arg4has='po'", 17) >= 0); + assert_se(match_add(slots, &root, "arg4='pi'", 18) >= 0); + + bus_match_dump(stdout, &root, 0); + + assert_se(sd_bus_message_new_signal(bus, &m, "/foo/bar", "bar.x", "waldo") >= 0); + assert_se(sd_bus_message_append(m, "ssssas", "one", "two", "/prefix/three", "prefix.four", 3, "pi", "pa", "po") >= 0); + assert_se(sd_bus_message_seal(m, 1, 0) >= 0); + + zero(mask); + assert_se(bus_match_run(NULL, &root, m) == 0); + assert_se(mask_contains((unsigned[]) { 9, 8, 7, 5, 10, 12, 13, 14, 15, 16, 17 }, 11)); + + assert_se(bus_match_remove(&root, &slots[8].match_callback) >= 0); + assert_se(bus_match_remove(&root, &slots[13].match_callback) >= 0); + + bus_match_dump(stdout, &root, 0); + + zero(mask); + assert_se(bus_match_run(NULL, &root, m) == 0); + assert_se(mask_contains((unsigned[]) { 9, 5, 10, 12, 14, 7, 15, 16, 17 }, 9)); + + for (enum bus_match_node_type i = 0; i < 
_BUS_MATCH_NODE_TYPE_MAX; i++) { + char buf[32]; + const char *x; + + assert_se(x = bus_match_node_type_to_string(i, buf, sizeof(buf))); + + if (i >= BUS_MATCH_MESSAGE_TYPE) + assert_se(bus_match_node_type_from_string(x, strlen(x)) == i); + } + + bus_match_free(&root); + + test_match_scope("interface='foobar'", BUS_MATCH_GENERIC); + test_match_scope("", BUS_MATCH_GENERIC); + test_match_scope("interface='org.freedesktop.DBus.Local'", BUS_MATCH_LOCAL); + test_match_scope("sender='org.freedesktop.DBus.Local'", BUS_MATCH_LOCAL); + test_match_scope("member='gurke',path='/org/freedesktop/DBus/Local'", BUS_MATCH_LOCAL); + test_match_scope("arg2='piep',sender='org.freedesktop.DBus',member='waldo'", BUS_MATCH_DRIVER); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-objects.c b/src/libsystemd/sd-bus/test-bus-objects.c new file mode 100644 index 0000000..ccdd0d5 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-objects.c @@ -0,0 +1,677 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-dump.h" +#include "bus-internal.h" +#include "bus-message.h" +#include "log.h" +#include "macro.h" +#include "strv.h" +#include "tests.h" + +struct context { + int fds[2]; + bool quit; + char *something; + char *automatic_string_property; + uint32_t automatic_integer_property; +}; + +static int something_handler(sd_bus_message *m, void *userdata, sd_bus_error *error) { + struct context *c = userdata; + const char *s; + char *n = NULL; + int r; + + r = sd_bus_message_read(m, "s", &s); + assert_se(r > 0); + + n = strjoin("<<<", s, ">>>"); + assert_se(n); + + free(c->something); + c->something = n; + + log_info("AlterSomething() called, got %s, returning %s", s, n); + + /* This should fail, since the return type doesn't match */ + assert_se(sd_bus_reply_method_return(m, "u", 4711) == -ENOMSG); + + r = sd_bus_reply_method_return(m, "s", n); + assert_se(r >= 0); + + return 1; +} + +static int 
exit_handler(sd_bus_message *m, void *userdata, sd_bus_error *error) { + struct context *c = userdata; + int r; + + c->quit = true; + + log_info("Exit called"); + + r = sd_bus_reply_method_return(m, ""); + assert_se(r >= 0); + + return 1; +} + +static int get_handler(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error) { + struct context *c = userdata; + int r; + + log_info("property get for %s called, returning \"%s\".", property, c->something); + + r = sd_bus_message_append(reply, "s", c->something); + assert_se(r >= 0); + + return 1; +} + +static int set_handler(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error) { + struct context *c = userdata; + const char *s; + char *n; + int r; + + log_info("property set for %s called", property); + + r = sd_bus_message_read(value, "s", &s); + assert_se(r >= 0); + + n = strdup(s); + assert_se(n); + + free(c->something); + c->something = n; + + return 1; +} + +static int value_handler(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *s = NULL; + const char *x; + int r; + + assert_se(asprintf(&s, "object %p, path %s", userdata, path) >= 0); + r = sd_bus_message_append(reply, "s", s); + assert_se(r >= 0); + + assert_se(x = startswith(path, "/value/")); + + assert_se(PTR_TO_UINT(userdata) == 30); + + return 1; +} + +static int notify_test(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int r; + + assert_se(sd_bus_emit_properties_changed(sd_bus_message_get_bus(m), m->path, "org.freedesktop.systemd.ValueTest", "Value", NULL) >= 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + + return 1; +} + +static int notify_test2(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int r; + + 
assert_se(sd_bus_emit_properties_changed_strv(sd_bus_message_get_bus(m), m->path, "org.freedesktop.systemd.ValueTest", NULL) >= 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + + return 1; +} + +static int emit_interfaces_added(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int r; + + assert_se(sd_bus_emit_interfaces_added(sd_bus_message_get_bus(m), "/value/a/x", "org.freedesktop.systemd.ValueTest", NULL) >= 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + + return 1; +} + +static int emit_interfaces_removed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int r; + + assert_se(sd_bus_emit_interfaces_removed(sd_bus_message_get_bus(m), "/value/a/x", "org.freedesktop.systemd.ValueTest", NULL) >= 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + + return 1; +} + +static int emit_object_added(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int r; + + assert_se(sd_bus_emit_object_added(sd_bus_message_get_bus(m), "/value/a/x") >= 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + + return 1; +} + +static int emit_object_with_manager_added(sd_bus_message *m, void *userdata, sd_bus_error *error) { + assert_se(sd_bus_emit_object_added(sd_bus_message_get_bus(m), "/value/a") >= 0); + + return ASSERT_SE_NONNEG(sd_bus_reply_method_return(m, NULL)); +} + +static int emit_object_removed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int r; + + assert_se(sd_bus_emit_object_removed(sd_bus_message_get_bus(m), "/value/a/x") >= 0); + + r = sd_bus_reply_method_return(m, NULL); + assert_se(r >= 0); + + return 1; +} + +static const sd_bus_vtable vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("AlterSomething", "s", "s", something_handler, 0), + SD_BUS_METHOD("Exit", "", "", exit_handler, 0), + SD_BUS_WRITABLE_PROPERTY("Something", "s", get_handler, set_handler, 0, 0), + SD_BUS_WRITABLE_PROPERTY("AutomaticStringProperty", "s", NULL, NULL, 
offsetof(struct context, automatic_string_property), 0), + SD_BUS_WRITABLE_PROPERTY("AutomaticIntegerProperty", "u", NULL, NULL, offsetof(struct context, automatic_integer_property), 0), + SD_BUS_METHOD("NoOperation", NULL, NULL, NULL, 0), + SD_BUS_METHOD("EmitInterfacesAdded", NULL, NULL, emit_interfaces_added, 0), + SD_BUS_METHOD("EmitInterfacesRemoved", NULL, NULL, emit_interfaces_removed, 0), + SD_BUS_METHOD("EmitObjectAdded", NULL, NULL, emit_object_added, 0), + SD_BUS_METHOD("EmitObjectWithManagerAdded", NULL, NULL, emit_object_with_manager_added, 0), + SD_BUS_METHOD("EmitObjectRemoved", NULL, NULL, emit_object_removed, 0), + SD_BUS_VTABLE_END +}; + +static const sd_bus_vtable vtable2[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("NotifyTest", "", "", notify_test, 0), + SD_BUS_METHOD("NotifyTest2", "", "", notify_test2, 0), + SD_BUS_PROPERTY("Value", "s", value_handler, 10, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Value2", "s", value_handler, 10, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("Value3", "s", value_handler, 10, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Value4", "s", value_handler, 10, 0), + SD_BUS_PROPERTY("AnExplicitProperty", "s", NULL, offsetof(struct context, something), SD_BUS_VTABLE_PROPERTY_EXPLICIT|SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_VTABLE_END +}; + +static int enumerator_callback(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + + if (object_path_startswith("/value", path)) + assert_se(*nodes = strv_new("/value/a", "/value/b", "/value/c")); + + return 1; +} + +static int enumerator2_callback(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + + if (object_path_startswith("/value/a", path)) + assert_se(*nodes = strv_new("/value/a/x", "/value/a/y", "/value/a/z")); + + return 1; +} + +static int enumerator3_callback(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + 
_cleanup_strv_free_ char **v = NULL; + + if (!object_path_startswith("/value/b", path)) + return 1; + + for (unsigned i = 0; i < 30; i++) + assert_se(strv_extendf(&v, "/value/b/%u", i) >= 0); + + *nodes = TAKE_PTR(v); + return 1; +} + +static void *server(void *p) { + struct context *c = p; + sd_bus *bus = NULL; + sd_id128_t id; + int r; + + c->quit = false; + + assert_se(sd_id128_randomize(&id) >= 0); + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_fd(bus, c->fds[0], c->fds[0]) >= 0); + assert_se(sd_bus_set_server(bus, 1, id) >= 0); + + assert_se(sd_bus_add_object_vtable(bus, NULL, "/foo", "org.freedesktop.systemd.test", vtable, c) >= 0); + assert_se(sd_bus_add_object_vtable(bus, NULL, "/foo", "org.freedesktop.systemd.test2", vtable, c) >= 0); + assert_se(sd_bus_add_fallback_vtable(bus, NULL, "/value", "org.freedesktop.systemd.ValueTest", vtable2, NULL, UINT_TO_PTR(20)) >= 0); + assert_se(sd_bus_add_node_enumerator(bus, NULL, "/value", enumerator_callback, NULL) >= 0); + assert_se(sd_bus_add_node_enumerator(bus, NULL, "/value/a", enumerator2_callback, NULL) >= 0); + assert_se(sd_bus_add_node_enumerator(bus, NULL, "/value/b", enumerator3_callback, NULL) >= 0); + assert_se(sd_bus_add_object_manager(bus, NULL, "/value") >= 0); + assert_se(sd_bus_add_object_manager(bus, NULL, "/value/a") >= 0); + + assert_se(sd_bus_start(bus) >= 0); + + log_error("Entering event loop on server"); + + while (!c->quit) { + log_error("Loop!"); + + r = sd_bus_process(bus, NULL); + if (r < 0) { + log_error_errno(r, "Failed to process requests: %m"); + goto fail; + } + + if (r == 0) { + r = sd_bus_wait(bus, UINT64_MAX); + if (r < 0) { + log_error_errno(r, "Failed to wait: %m"); + goto fail; + } + + continue; + } + } + + r = 0; + +fail: + if (bus) { + sd_bus_flush(bus); + sd_bus_unref(bus); + } + + return INT_TO_PTR(r); +} + +static int client(struct context *c) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + 
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_strv_free_ char **lines = NULL; + const char *s; + int r; + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_fd(bus, c->fds[1], c->fds[1]) >= 0); + assert_se(sd_bus_start(bus) >= 0); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "NoOperation", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "AlterSomething", &error, &reply, "s", "hallo"); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + assert_se(streq(s, "<<<hallo>>>")); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "Doesntexist", &error, &reply, ""); + assert_se(r < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)); + + sd_bus_error_free(&error); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "Doesntexist", &error, &reply, NULL); /* NULL and "" are equivalent */ + assert_se(r < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)); + + sd_bus_error_free(&error); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "AlterSomething", &error, &reply, "as", 1, "hallo"); + assert_se(r < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)); + + sd_bus_error_free(&error); + + r = sd_bus_get_property(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "Something", &error, &reply, "s"); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + assert_se(streq(s, "<<<hallo>>>")); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_set_property(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", 
"Something", &error, "s", "test"); + assert_se(r >= 0); + + r = sd_bus_get_property(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "Something", &error, &reply, "s"); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + assert_se(streq(s, "test")); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_set_property(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "AutomaticIntegerProperty", &error, "u", 815); + assert_se(r >= 0); + + assert_se(c->automatic_integer_property == 815); + + r = sd_bus_set_property(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "AutomaticStringProperty", &error, "s", "Du Dödel, Du!"); + assert_se(r >= 0); + + assert_se(streq(c->automatic_string_property, "Du Dödel, Du!")); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.DBus.Introspectable", "Introspect", &error, &reply, ""); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + fputs(s, stdout); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.DBus.Introspectable", "Introspect", &error, &reply, NULL); /* NULL and "" are equivalent */ + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + fputs(s, stdout); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_get_property(bus, "org.freedesktop.systemd.test", "/value/xuzz", "org.freedesktop.systemd.ValueTest", "Value", &error, &reply, "s"); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + log_info("read %s", s); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/", "org.freedesktop.DBus.Introspectable", "Introspect", &error, &reply, NULL); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + 
fputs(s, stdout); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value", "org.freedesktop.DBus.Introspectable", "Introspect", &error, &reply, NULL); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + fputs(s, stdout); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value/a", "org.freedesktop.DBus.Introspectable", "Introspect", &error, &reply, NULL); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + fputs(s, stdout); + + assert_se(lines = strv_split_newlines(s)); + assert_se(strv_contains(lines, " ")); + assert_se(strv_contains(lines, " ")); + assert_se(strv_contains(lines, " ")); + lines = strv_free(lines); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value/b", "org.freedesktop.DBus.Introspectable", "Introspect", &error, &reply, NULL); + assert_se(r >= 0); + + r = sd_bus_message_read(reply, "s", &s); + assert_se(r >= 0); + fputs(s, stdout); + + assert_se(lines = strv_split_newlines(s)); + for (unsigned i = 0; i < 30; i++) { + _cleanup_free_ char *n = NULL; + + assert_se(asprintf(&n, " ", i) >= 0); + assert_se(strv_contains(lines, n)); + } + lines = strv_free(lines); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.DBus.Properties", "GetAll", &error, &reply, "s", NULL); + assert_se(r >= 0); + + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value/a", "org.freedesktop.DBus.Properties", "GetAll", &error, &reply, "s", "org.freedesktop.systemd.ValueTest2"); + assert_se(r < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_INTERFACE)); + sd_bus_error_free(&error); + + r = sd_bus_call_method(bus, 
"org.freedesktop.systemd.test", "/foo", "org.freedesktop.DBus.ObjectManager", "GetManagedObjects", &error, &reply, NULL); + assert_se(r < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)); + sd_bus_error_free(&error); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value", "org.freedesktop.DBus.ObjectManager", "GetManagedObjects", &error, &reply, NULL); + assert_se(r >= 0); + + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + /* Check that /value/b does not have ObjectManager interface but /value/a does */ + assert_se(sd_bus_message_rewind(reply, 1) > 0); + assert_se(sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "{oa{sa{sv}}}") > 0); + while (ASSERT_SE_NONNEG(sd_bus_message_enter_container(reply, SD_BUS_TYPE_DICT_ENTRY, "oa{sa{sv}}")) > 0) { + const char *path = NULL; + assert_se(sd_bus_message_read_basic(reply, 'o', &path) > 0); + if (STR_IN_SET(path, "/value/b", "/value/a")) { + /* Check that there is no object manager interface here */ + bool found_object_manager_interface = false; + assert_se(sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "{sa{sv}}") > 0); + while (ASSERT_SE_NONNEG(sd_bus_message_enter_container(reply, SD_BUS_TYPE_DICT_ENTRY, "sa{sv}")) > 0) { + const char *interface_name = NULL; + assert_se(sd_bus_message_read_basic(reply, 's', &interface_name) > 0); + + if (streq(interface_name, "org.freedesktop.DBus.ObjectManager")) { + assert_se(!streq(path, "/value/b")); + found_object_manager_interface = true; + } + + assert_se(sd_bus_message_skip(reply, "a{sv}") >= 0); + assert_se(sd_bus_message_exit_container(reply) >= 0); + } + assert_se(sd_bus_message_exit_container(reply) >= 0); + + if (streq(path, "/value/a")) { + /* ObjectManager must be here */ + assert_se(found_object_manager_interface); + } + + } else + assert_se(sd_bus_message_skip(reply, "a{sa{sv}}") >= 0); + + assert_se(sd_bus_message_exit_container(reply) >= 0); + } + + reply = 
sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value/a", "org.freedesktop.systemd.ValueTest", "NotifyTest", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_process(bus, &reply); + assert_se(r > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.Properties", "PropertiesChanged")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/value/a", "org.freedesktop.systemd.ValueTest", "NotifyTest2", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_process(bus, &reply); + assert_se(r > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.Properties", "PropertiesChanged")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "EmitInterfacesAdded", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_process(bus, &reply); + assert_se(r > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.ObjectManager", "InterfacesAdded")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "EmitInterfacesRemoved", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_process(bus, &reply); + assert_se(r > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.ObjectManager", "InterfacesRemoved")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "EmitObjectAdded", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_process(bus, 
&reply); + assert_se(r > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.ObjectManager", "InterfacesAdded")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + /* Check if /value/a/x does not have org.freedesktop.DBus.ObjectManager */ + assert_se(sd_bus_message_rewind(reply, 1) >= 0); + const char* should_be_value_a_x = NULL; + assert_se(sd_bus_message_read_basic(reply, 'o', &should_be_value_a_x) > 0); + assert_se(streq(should_be_value_a_x, "/value/a/x")); + assert_se(sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "{sa{sv}}") > 0); + while (ASSERT_SE_NONNEG(sd_bus_message_enter_container(reply, SD_BUS_TYPE_DICT_ENTRY, "sa{sv}")) > 0) { + const char* interface_name = NULL; + assert_se(sd_bus_message_read_basic(reply, 's', &interface_name) > 0); + + assert(!streq(interface_name, "org.freedesktop.DBus.ObjectManager")); + + assert_se(sd_bus_message_skip(reply, "a{sv}") >= 0); + + assert_se(sd_bus_message_exit_container(reply) >= 0); + } + + reply = sd_bus_message_unref(reply); + + assert_se(sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "EmitObjectWithManagerAdded", &error, NULL, NULL) >= 0); + + assert_se(sd_bus_process(bus, &reply) > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.ObjectManager", "InterfacesAdded")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + /* Check if /value/a has org.freedesktop.DBus.ObjectManager */ + assert_se(sd_bus_message_rewind(reply, 1) >= 0); + const char* should_be_value_a = NULL; + bool found_object_manager = false; + assert_se(sd_bus_message_read_basic(reply, 'o', &should_be_value_a) > 0); + assert_se(streq(should_be_value_a, "/value/a")); + assert_se(sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "{sa{sv}}") > 0); + while (ASSERT_SE_NONNEG(sd_bus_message_enter_container(reply, SD_BUS_TYPE_DICT_ENTRY, "sa{sv}")) > 0) { + const char* interface_name = NULL; + 
assert_se(sd_bus_message_read_basic(reply, 's', &interface_name)); + + if (streq(interface_name, "org.freedesktop.DBus.ObjectManager")) { + found_object_manager = true; + break; + } + + assert_se(sd_bus_message_skip(reply, "a{sv}") >= 0); + + assert_se(sd_bus_message_exit_container(reply) >= 0); + } + assert_se(found_object_manager); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "EmitObjectRemoved", &error, NULL, NULL); + assert_se(r >= 0); + + r = sd_bus_process(bus, &reply); + assert_se(r > 0); + + assert_se(sd_bus_message_is_signal(reply, "org.freedesktop.DBus.ObjectManager", "InterfacesRemoved")); + sd_bus_message_dump(reply, stdout, SD_BUS_MESSAGE_DUMP_WITH_HEADER); + + /* Check if /value/a/x does not have org.freedesktop.DBus.ObjectManager */ + assert_se(sd_bus_message_rewind(reply, 1) >= 0); + should_be_value_a_x = NULL; + assert_se(sd_bus_message_read_basic(reply, 'o', &should_be_value_a_x) > 0); + assert_se(streq(should_be_value_a_x, "/value/a/x")); + assert_se(sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "s") > 0); + const char* deleted_interface_name = NULL; + while (ASSERT_SE_NONNEG(sd_bus_message_read_basic(reply, 's', &deleted_interface_name)) > 0) { + assert(!streq(deleted_interface_name, "org.freedesktop.DBus.ObjectManager")); + } + assert_se(sd_bus_message_exit_container(reply) >= 0); + + reply = sd_bus_message_unref(reply); + + r = sd_bus_call_method(bus, "org.freedesktop.systemd.test", "/foo", "org.freedesktop.systemd.test", "Exit", &error, NULL, NULL); + assert_se(r >= 0); + + sd_bus_flush(bus); + + return 0; +} + +int main(int argc, char *argv[]) { + struct context c = {}; + pthread_t s; + void *p; + int r, q; + + test_setup_logging(LOG_DEBUG); + + c.automatic_integer_property = 4711; + assert_se(c.automatic_string_property = strdup("dudeldu")); + + assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, c.fds) >= 0); + + r = pthread_create(&s, 
NULL, server, &c); + if (r != 0) + return -r; + + r = client(&c); + + q = pthread_join(s, &p); + if (q != 0) + return -q; + + if (r < 0) + return r; + + if (PTR_TO_INT(p) < 0) + return PTR_TO_INT(p); + + free(c.something); + free(c.automatic_string_property); + + return EXIT_SUCCESS; +} diff --git a/src/libsystemd/sd-bus/test-bus-peersockaddr.c b/src/libsystemd/sd-bus/test-bus-peersockaddr.c new file mode 100644 index 0000000..79556e8 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-peersockaddr.c @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "fd-util.h" +#include "process-util.h" +#include "socket-util.h" +#include "tests.h" + +static void *server(void *p) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_ int listen_fd = PTR_TO_INT(p), fd = -EBADF; + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL; + _cleanup_free_ char *our_comm = NULL; + sd_id128_t id; + int r; + + assert_se(sd_id128_randomize(&id) >= 0); + + fd = accept4(listen_fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK); + assert_se(fd >= 0); + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_fd(bus, fd, fd) >= 0); + TAKE_FD(fd); + assert_se(sd_bus_set_server(bus, true, id) >= 0); + assert_se(sd_bus_negotiate_creds(bus, 1, SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID|SD_BUS_CREDS_PID|SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION) >= 0); + + assert_se(sd_bus_start(bus) >= 0); + + assert_se(sd_bus_get_owner_creds(bus, SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID|SD_BUS_CREDS_PID|SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION, &c) >= 0); + + uid_t u; + assert_se(sd_bus_creds_get_euid(c, &u) >= 0); + assert_se(u == geteuid()); /* the creds carry the peer's *effective* UID; the peer is this very process */ + + gid_t g; + assert_se(sd_bus_creds_get_egid(c, &g) >= 0); + assert_se(g == getegid()); /* likewise the *effective* GID */ + + pid_t pid; + assert_se(sd_bus_creds_get_pid(c, &pid) >= 0); + assert_se(pid == getpid_cached()); + + const char *comm; + assert_se(sd_bus_creds_get_comm(c, &comm) >= 0); + assert_se(pid_get_comm(0, 
&our_comm) >= 0); + assert_se(streq_ptr(comm, our_comm)); + + const char *description; + assert_se(sd_bus_creds_get_description(c, &description) >= 0); + assert_se(streq_ptr(description, "wuffwuff")); + + for (;;) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + r = sd_bus_process(bus, &m); + assert_se(r >= 0); + + if (r == 0) { + assert_se(sd_bus_wait(bus, UINT64_MAX) >= 0); + continue; + } + + if (m && sd_bus_message_is_method_call(m, "foo.foo", "Foo") > 0) { + assert_se(sd_bus_reply_method_return(m, "s", "bar") >= 0); + break; + } + } + + return NULL; +} + +static void* client(void *p) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *z; + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_description(bus, "wuffwuff") >= 0); + assert_se(sd_bus_set_address(bus, p) >= 0); + assert_se(sd_bus_start(bus) >= 0); + + assert_se(sd_bus_call_method(bus, "foo.foo", "/foo", "foo.foo", "Foo", NULL, &reply, "s", "foo") >= 0); + + assert_se(sd_bus_message_read(reply, "s", &z) >= 0); + assert_se(streq_ptr(z, "bar")); + + return NULL; +} + +TEST(description) { + _cleanup_free_ char *a = NULL; + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + }; + socklen_t salen; + pthread_t s, c; + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(fd >= 0); + + assert_se(bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path)) >= 0); /* force auto-bind */ + + assert_se(listen(fd, 1) >= 0); + + salen = sizeof(sa); + assert_se(getsockname(fd, &sa.sa, &salen) >= 0); + assert_se(salen >= offsetof(struct sockaddr_un, sun_path)); + assert_se(sa.un.sun_path[0] == 0); + + assert_se(asprintf(&a, "unix:abstract=%s", sa.un.sun_path + 1) >= 0); + + assert_se(pthread_create(&s, NULL, server, INT_TO_PTR(fd)) == 0); + TAKE_FD(fd); + + assert_se(pthread_create(&c, NULL, client, a) == 0); + + assert_se(pthread_join(s, NULL) == 0); + 
assert_se(pthread_join(c, NULL) == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/sd-bus/test-bus-queue-ref-cycle.c b/src/libsystemd/sd-bus/test-bus-queue-ref-cycle.c new file mode 100644 index 0000000..7c2fa72 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-queue-ref-cycle.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "sd-bus.h" + +#include "main-func.h" +#include "tests.h" + +static int test_ref_unref(void) { + sd_bus_message *m = NULL; + sd_bus *bus = NULL; + int r; + + /* This test will result in a memory leak in <= v240, but not on v241. Hence to be really useful it + * should be run through a leak tracker such as valgrind. */ + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_tests_skipped("Failed to connect to bus"); + + /* Create a message and enqueue it (this shouldn't send it though as the connection setup is not complete yet) */ + assert_se(sd_bus_message_new_method_call(bus, &m, "foo.bar", "/foo", "quux.quux", "waldo") >= 0); + assert_se(sd_bus_send(bus, m, NULL) >= 0); + + /* Let's now unref the message first and the bus second. */ + m = sd_bus_message_unref(m); + bus = sd_bus_unref(bus); + + /* We should have a memory leak now on <= v240. Let's do this again, but destroy in the opposite + * order. On v240 that too should be a leak. 
*/ + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_tests_skipped("Failed to connect to bus"); + + assert_se(sd_bus_message_new_method_call(bus, &m, "foo.bar", "/foo", "quux.quux", "waldo") >= 0); + assert_se(sd_bus_send(bus, m, NULL) >= 0); + + /* Let's now unref things in the opposite order */ + bus = sd_bus_unref(bus); + m = sd_bus_message_unref(m); + + return 0; +} + +static int run(int argc, char *argv[]) { + int r; + + test_setup_logging(LOG_INFO); + + r = test_ref_unref(); + if (r < 0) + return r; + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/libsystemd/sd-bus/test-bus-server.c b/src/libsystemd/sd-bus/test-bus-server.c new file mode 100644 index 0000000..8049e33 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-server.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "bus-internal.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "string-util.h" +#include "tests.h" + +struct context { + int fds[2]; + + bool client_negotiate_unix_fds; + bool server_negotiate_unix_fds; + + bool client_anonymous_auth; + bool server_anonymous_auth; +}; + +static int _server(struct context *c) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + sd_id128_t id; + bool quit = false; + int r; + + assert_se(sd_id128_randomize(&id) >= 0); + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_fd(bus, c->fds[0], c->fds[0]) >= 0); + assert_se(sd_bus_set_server(bus, 1, id) >= 0); + assert_se(sd_bus_set_anonymous(bus, c->server_anonymous_auth) >= 0); + assert_se(sd_bus_negotiate_fds(bus, c->server_negotiate_unix_fds) >= 0); + assert_se(sd_bus_start(bus) >= 0); + + while (!quit) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + + r = sd_bus_process(bus, &m); + if (r < 0) + return log_error_errno(r, "Failed to process requests: %m"); + + if (r == 0) { + r = sd_bus_wait(bus, UINT64_MAX); + if (r < 0) + return 
log_error_errno(r, "Failed to wait: %m"); + continue; + } + + if (!m) + continue; + + log_info("Got message! member=%s", strna(sd_bus_message_get_member(m))); + + if (sd_bus_message_is_method_call(m, "org.freedesktop.systemd.test", "Exit")) { + + assert_se((sd_bus_can_send(bus, 'h') >= 1) == + (c->server_negotiate_unix_fds && c->client_negotiate_unix_fds)); + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return log_error_errno(r, "Failed to allocate return: %m"); + + quit = true; + + } else if (sd_bus_message_is_method_call(m, NULL, NULL)) { + r = sd_bus_message_new_method_error( + m, + &reply, + &SD_BUS_ERROR_MAKE_CONST(SD_BUS_ERROR_UNKNOWN_METHOD, "Unknown method.")); + if (r < 0) + return log_error_errno(r, "Failed to allocate return: %m"); + } + + if (reply) { + r = sd_bus_send(bus, reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send reply: %m"); + } + } + + return 0; +} + +static void* server(void *p) { + return INT_TO_PTR(_server(p)); +} + +static int client(struct context *c) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_fd(bus, c->fds[1], c->fds[1]) >= 0); + assert_se(sd_bus_negotiate_fds(bus, c->client_negotiate_unix_fds) >= 0); + assert_se(sd_bus_set_anonymous(bus, c->client_anonymous_auth) >= 0); + assert_se(sd_bus_start(bus) >= 0); + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.systemd.test", + "/", + "org.freedesktop.systemd.test", + "Exit"); + if (r < 0) + return log_error_errno(r, "Failed to allocate method call: %m"); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to issue method call: %s", bus_error_message(&error, r)); + + return 0; +} + +static int test_one(bool client_negotiate_unix_fds, bool 
server_negotiate_unix_fds, + bool client_anonymous_auth, bool server_anonymous_auth) { + + struct context c; + pthread_t s; + void *p; + int r, q; + + zero(c); + + assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, c.fds) >= 0); + + c.client_negotiate_unix_fds = client_negotiate_unix_fds; + c.server_negotiate_unix_fds = server_negotiate_unix_fds; + c.client_anonymous_auth = client_anonymous_auth; + c.server_anonymous_auth = server_anonymous_auth; + + r = pthread_create(&s, NULL, server, &c); + if (r != 0) + return -r; + + r = client(&c); + + q = pthread_join(s, &p); + if (q != 0) + return -q; + + if (r < 0) + return r; + + if (PTR_TO_INT(p) < 0) + return PTR_TO_INT(p); + + return 0; +} + +int main(int argc, char *argv[]) { + int r; + + test_setup_logging(LOG_DEBUG); + + r = test_one(true, true, false, false); + assert_se(r >= 0); + + r = test_one(true, false, false, false); + assert_se(r >= 0); + + r = test_one(false, true, false, false); + assert_se(r >= 0); + + r = test_one(false, false, false, false); + assert_se(r >= 0); + + r = test_one(true, true, true, true); + assert_se(r >= 0); + + r = test_one(true, true, false, true); + assert_se(r >= 0); + + r = test_one(true, true, true, false); + assert_se(r == -EPERM); + + return EXIT_SUCCESS; +} diff --git a/src/libsystemd/sd-bus/test-bus-signature.c b/src/libsystemd/sd-bus/test-bus-signature.c new file mode 100644 index 0000000..5a4c811 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-signature.c @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-internal.h" +#include "bus-signature.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + char prefix[256]; + int r; + + test_setup_logging(LOG_DEBUG); + + assert_se(signature_is_single("y", false)); + assert_se(signature_is_single("u", false)); + assert_se(signature_is_single("v", false)); + assert_se(signature_is_single("as", false)); + assert_se(signature_is_single("(ss)", false)); + 
assert_se(!signature_is_single("()", false)); + assert_se(!signature_is_single("(()()()()())", false)); + assert_se(!signature_is_single("(((())))", false)); + assert_se(signature_is_single("((((s))))", false)); + assert_se(signature_is_single("{ss}", true)); + assert_se(signature_is_single("a{ss}", false)); + assert_se(!signature_is_single("uu", false)); + assert_se(!signature_is_single("", false)); + assert_se(!signature_is_single("(", false)); + assert_se(!signature_is_single(")", false)); + assert_se(!signature_is_single("())", false)); + assert_se(!signature_is_single("((())", false)); + assert_se(!signature_is_single("{)", false)); + assert_se(!signature_is_single("{}", true)); + assert_se(!signature_is_single("{sss}", true)); + assert_se(!signature_is_single("{s}", true)); + assert_se(!signature_is_single("{ss}", false)); + assert_se(!signature_is_single("{ass}", true)); + assert_se(!signature_is_single("a}", true)); + + assert_se(signature_is_pair("yy")); + assert_se(signature_is_pair("ss")); + assert_se(signature_is_pair("sas")); + assert_se(signature_is_pair("sv")); + assert_se(signature_is_pair("sa(vs)")); + assert_se(!signature_is_pair("")); + assert_se(!signature_is_pair("va")); + assert_se(!signature_is_pair("sss")); + assert_se(!signature_is_pair("{s}ss")); + + assert_se(signature_is_valid("ssa{ss}sssub", true)); + assert_se(signature_is_valid("ssa{ss}sssub", false)); + assert_se(signature_is_valid("{ss}", true)); + assert_se(!signature_is_valid("{ss}", false)); + assert_se(signature_is_valid("", true)); + assert_se(signature_is_valid("", false)); + + assert_se(signature_is_valid("sssusa(uuubbba(uu)uuuu)a{u(uuuvas)}", false)); + + assert_se(!signature_is_valid("a", false)); + assert_se(signature_is_valid("as", false)); + assert_se(signature_is_valid("aas", false)); + assert_se(signature_is_valid("aaas", false)); + assert_se(signature_is_valid("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaad", false)); + 
assert_se(signature_is_valid("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaas", false)); + assert_se(!signature_is_valid("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaau", false)); + + assert_se(signature_is_valid("((((((((((((((((((((((((((((((((s))))))))))))))))))))))))))))))))", false)); + assert_se(!signature_is_valid("((((((((((((((((((((((((((((((((()))))))))))))))))))))))))))))))))", false)); + + assert_se(namespace_complex_pattern("", "")); + assert_se(namespace_complex_pattern("foobar", "foobar")); + assert_se(namespace_complex_pattern("foobar.waldo", "foobar.waldo")); + assert_se(namespace_complex_pattern("foobar.", "foobar.waldo")); + assert_se(namespace_complex_pattern("foobar.waldo", "foobar.")); + assert_se(!namespace_complex_pattern("foobar.waldo", "foobar")); + assert_se(!namespace_complex_pattern("foobar", "foobar.waldo")); + assert_se(!namespace_complex_pattern("", "foo")); + assert_se(!namespace_complex_pattern("foo", "")); + assert_se(!namespace_complex_pattern("foo.", "")); + + assert_se(path_complex_pattern("", "")); + assert_se(!path_complex_pattern("", "/")); + assert_se(!path_complex_pattern("/", "")); + assert_se(path_complex_pattern("/", "/")); + assert_se(path_complex_pattern("/foobar/", "/")); + assert_se(!path_complex_pattern("/foobar/", "/foobar")); + assert_se(path_complex_pattern("/foobar", "/foobar")); + assert_se(!path_complex_pattern("/foobar", "/foobar/")); + assert_se(!path_complex_pattern("/foobar", "/foobar/waldo")); + assert_se(path_complex_pattern("/foobar/", "/foobar/waldo")); + assert_se(path_complex_pattern("/foobar/waldo", "/foobar/")); + + assert_se(path_simple_pattern("/foo/", "/foo/bar/waldo")); + + assert_se(namespace_simple_pattern("", "")); + assert_se(namespace_simple_pattern("", ".foobar")); + assert_se(namespace_simple_pattern("foobar", "foobar")); + assert_se(namespace_simple_pattern("foobar.waldo", "foobar.waldo")); + assert_se(namespace_simple_pattern("foobar", "foobar.waldo")); + assert_se(!namespace_simple_pattern("foobar.waldo", 
"foobar")); + assert_se(!namespace_simple_pattern("", "foo")); + assert_se(!namespace_simple_pattern("foo", "")); + assert_se(namespace_simple_pattern("foo.", "foo.bar.waldo")); + + assert_se(streq(object_path_startswith("/foo/bar", "/foo"), "bar")); + assert_se(streq(object_path_startswith("/foo", "/foo"), "")); + assert_se(streq(object_path_startswith("/foo", "/"), "foo")); + assert_se(streq(object_path_startswith("/", "/"), "")); + assert_se(!object_path_startswith("/foo", "/bar")); + assert_se(!object_path_startswith("/", "/bar")); + assert_se(!object_path_startswith("/foo", "")); + + assert_se(object_path_is_valid("/foo/bar")); + assert_se(object_path_is_valid("/foo")); + assert_se(object_path_is_valid("/")); + assert_se(object_path_is_valid("/foo5")); + assert_se(object_path_is_valid("/foo_5")); + assert_se(!object_path_is_valid("")); + assert_se(!object_path_is_valid("/foo/")); + assert_se(!object_path_is_valid("//")); + assert_se(!object_path_is_valid("//foo")); + assert_se(!object_path_is_valid("/foo//bar")); + assert_se(!object_path_is_valid("/foo/aaaäöä")); + + OBJECT_PATH_FOREACH_PREFIX(prefix, "/") { + log_info("<%s>", prefix); + assert_not_reached(); + } + + r = 0; + OBJECT_PATH_FOREACH_PREFIX(prefix, "/xxx") { + log_info("<%s>", prefix); + assert_se(streq(prefix, "/")); + assert_se(r == 0); + r++; + } + assert_se(r == 1); + + r = 0; + OBJECT_PATH_FOREACH_PREFIX(prefix, "/xxx/yyy/zzz") { + log_info("<%s>", prefix); + assert_se(r != 0 || streq(prefix, "/xxx/yyy")); + assert_se(r != 1 || streq(prefix, "/xxx")); + assert_se(r != 2 || streq(prefix, "/")); + r++; + } + assert_se(r == 3); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-track.c b/src/libsystemd/sd-bus/test-bus-track.c new file mode 100644 index 0000000..5604e84 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-track.c @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "macro.h" +#include "tests.h" + 
+static bool track_cb_called_x = false; +static bool track_cb_called_y = false; +static bool track_destroy_called_z = false; + +static int track_cb_x(sd_bus_track *t, void *userdata) { + + log_error("TRACK CB X"); + + assert_se(!track_cb_called_x); + track_cb_called_x = true; + + /* This means b's name disappeared. Let's now disconnect, to make sure the track handling on disconnect works + * as it should. */ + + assert_se(shutdown(sd_bus_get_fd(sd_bus_track_get_bus(t)), SHUT_RDWR) >= 0); + return 1; +} + +static int track_cb_y(sd_bus_track *t, void *userdata) { + + log_error("TRACK CB Y"); + + assert_se(!track_cb_called_y); + track_cb_called_y = true; + + /* We got disconnected, let's close everything */ + + assert_se(sd_event_exit(sd_bus_get_event(sd_bus_track_get_bus(t)), EXIT_SUCCESS) >= 0); + + return 0; +} + +static int track_cb_z(sd_bus_track *t, void *userdata) { + assert_not_reached(); +} + +static void track_destroy_z(void *userdata) { + track_destroy_called_z = true; +} + +int main(int argc, char *argv[]) { + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(sd_bus_track_unrefp) sd_bus_track *x = NULL, *y = NULL, *z = NULL; + _cleanup_(sd_bus_unrefp) sd_bus *a = NULL, *b = NULL; + bool use_system_bus = false; + const char *unique; + int r; + + test_setup_logging(LOG_INFO); + + assert_se(sd_event_default(&event) >= 0); + + r = sd_bus_open_user(&a); + if (IN_SET(r, -ECONNREFUSED, -ENOENT, -ENOMEDIUM)) { + r = sd_bus_open_system(&a); + if (IN_SET(r, -ECONNREFUSED, -ENOENT)) + return log_tests_skipped("Failed to connect to bus"); + use_system_bus = true; + } + assert_se(r >= 0); + + assert_se(sd_bus_attach_event(a, event, SD_EVENT_PRIORITY_NORMAL) >= 0); + + if (use_system_bus) + assert_se(sd_bus_open_system(&b) >= 0); + else + assert_se(sd_bus_open_user(&b) >= 0); + + assert_se(sd_bus_attach_event(b, event, SD_EVENT_PRIORITY_NORMAL) >= 0); + + /* Watch b's name from a */ + assert_se(sd_bus_track_new(a, &x, track_cb_x, NULL) >= 0); + + 
assert_se(sd_bus_get_unique_name(b, &unique) >= 0); + + assert_se(sd_bus_track_add_name(x, unique) >= 0); + + /* Watch's a's own name from a */ + assert_se(sd_bus_track_new(a, &y, track_cb_y, NULL) >= 0); + + assert_se(sd_bus_get_unique_name(a, &unique) >= 0); + + assert_se(sd_bus_track_add_name(y, unique) >= 0); + + /* Basic tests. */ + assert_se(sd_bus_track_new(a, &z, track_cb_z, NULL) >= 0); + + /* non-recursive case */ + assert_se(sd_bus_track_set_recursive(z, false) >= 0); + assert_se(sd_bus_track_get_recursive(z) == 0); + assert_se(!sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 0); + assert_se(sd_bus_track_remove_name(z, unique) == 0); + assert_se(sd_bus_track_add_name(z, unique) >= 0); + assert_se(sd_bus_track_add_name(z, unique) >= 0); + assert_se(sd_bus_track_add_name(z, unique) >= 0); + assert_se(sd_bus_track_set_recursive(z, true) == -EBUSY); + assert_se(sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 1); + assert_se(sd_bus_track_remove_name(z, unique) == 1); + assert_se(!sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 0); + assert_se(sd_bus_track_remove_name(z, unique) == 0); + + /* recursive case */ + assert_se(sd_bus_track_set_recursive(z, true) >= 0); + assert_se(sd_bus_track_get_recursive(z) == 1); + assert_se(!sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 0); + assert_se(sd_bus_track_remove_name(z, unique) == 0); + assert_se(sd_bus_track_add_name(z, unique) >= 0); + assert_se(sd_bus_track_add_name(z, unique) >= 0); + assert_se(sd_bus_track_add_name(z, unique) >= 0); + assert_se(sd_bus_track_set_recursive(z, false) == -EBUSY); + assert_se(sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 3); + assert_se(sd_bus_track_remove_name(z, unique) == 1); + assert_se(sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 2); + 
assert_se(sd_bus_track_remove_name(z, unique) == 1); + assert_se(sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 1); + assert_se(sd_bus_track_remove_name(z, unique) == 1); + assert_se(!sd_bus_track_contains(z, unique)); + assert_se(sd_bus_track_count_name(z, unique) == 0); + assert_se(sd_bus_track_remove_name(z, unique) == 0); + + assert_se(sd_bus_track_set_destroy_callback(z, track_destroy_z) >= 0); + z = sd_bus_track_unref(z); + assert_se(track_destroy_called_z); + + /* Now make b's name disappear */ + sd_bus_close(b); + + assert_se(sd_event_loop(event) >= 0); + + assert_se(track_cb_called_x); + assert_se(track_cb_called_y); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-vtable-cc.cc b/src/libsystemd/sd-bus/test-bus-vtable-cc.cc new file mode 120000 index 0000000..abee398 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-vtable-cc.cc @@ -0,0 +1 @@ +test-bus-vtable.c \ No newline at end of file diff --git a/src/libsystemd/sd-bus/test-bus-vtable.c b/src/libsystemd/sd-bus/test-bus-vtable.c new file mode 100644 index 0000000..fe12238 --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-vtable.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +/* We use system assert.h here, because we don't want to keep macro.h and log.h C++ compatible */ +#undef NDEBUG +#include +#include +#include + +#include "sd-bus-vtable.h" + +#ifndef __cplusplus +# include "bus-objects.h" +#endif + +#include "test-vtable-data.h" + +#define DEFAULT_BUS_PATH "unix:path=/run/dbus/system_bus_socket" + +static struct context c = {}; +static int happy_finder_object = 0; + +static int happy_finder(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + assert(userdata); + assert(userdata == &c); + +#ifndef __cplusplus + log_info("%s called", __func__); +#endif + + happy_finder_object++; + *found = &happy_finder_object; + return 1; /* found */ +} + +static 
void test_vtable(void) { + sd_bus *bus = NULL; + int r; + + assert(sd_bus_new(&bus) >= 0); + + assert(sd_bus_add_object_vtable(bus, NULL, "/foo", "org.freedesktop.systemd.testVtable", test_vtable_2, &c) >= 0); + assert(sd_bus_add_object_vtable(bus, NULL, "/foo", "org.freedesktop.systemd.testVtable2", test_vtable_2, &c) >= 0); + /* the cast on the line below is needed to test with the old version of the table */ + assert(sd_bus_add_object_vtable(bus, NULL, "/foo", "org.freedesktop.systemd.testVtable221", + (const sd_bus_vtable *)vtable_format_221, &c) >= 0); + + assert(sd_bus_add_fallback_vtable(bus, NULL, "/fallback", "org.freedesktop.systemd.testVtable2", test_vtable_2, happy_finder, &c) >= 0); + + assert(sd_bus_set_address(bus, DEFAULT_BUS_PATH) >= 0); + r = sd_bus_start(bus); + assert(r == 0 || /* success */ + r == -ENOENT /* dbus is inactive */ ); + +#ifndef __cplusplus + _cleanup_free_ char *s, *s2; + + assert_se(introspect_path(bus, "/foo", NULL, false, true, NULL, &s, NULL) == 1); + fputs(s, stdout); + + assert_se(introspect_path(bus, "/fallback", NULL, false, true, NULL, &s2, NULL) == 1); + fputs(s2, stdout); + + assert_se(happy_finder_object == 1); +#endif + + sd_bus_unref(bus); +} + +int main(int argc, char **argv) { + test_vtable(); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-bus-watch-bind.c b/src/libsystemd/sd-bus/test-bus-watch-bind.c new file mode 100644 index 0000000..7f73c6e --- /dev/null +++ b/src/libsystemd/sd-bus/test-bus-watch-bind.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-bus.h" +#include "sd-event.h" +#include "sd-id128.h" + +#include "alloc-util.h" +#include "bus-internal.h" +#include "fd-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "path-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "socket-util.h" +#include "string-util.h" +#include "tmpfile-util.h" +#include "tests.h" + +static int method_foobar(sd_bus_message *m, void *userdata, 
sd_bus_error *ret_error) { + log_info("Got Foobar() call."); + + assert_se(sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0) >= 0); + return sd_bus_reply_method_return(m, NULL); +} + +static int method_exit(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + log_info("Got Exit() call"); + + assert_se(sd_bus_reply_method_return(m, NULL) >= 0); + /* Simulate D-Bus going away to test the bus_exit_now() path with exit_on_disconnect set */ + bus_enter_closing(sd_bus_message_get_bus(m)); + return 0; +} + +static const sd_bus_vtable vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("Foobar", NULL, NULL, method_foobar, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Exit", NULL, NULL, method_exit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END, +}; + +static void* thread_server(void *p) { + _cleanup_free_ char *suffixed = NULL, *suffixed2 = NULL, *d = NULL; + _cleanup_close_ int fd = -EBADF; + union sockaddr_union u; + const char *path = p; + int r; + + log_debug("Initializing server"); + + /* Let's play some games, by slowly creating the socket directory, and renaming it in the middle */ + usleep_safe(100 * USEC_PER_MSEC); + + assert_se(mkdir_parents(path, 0755) >= 0); + usleep_safe(100 * USEC_PER_MSEC); + + assert_se(path_extract_directory(path, &d) >= 0); + assert_se(asprintf(&suffixed, "%s.%" PRIx64, d, random_u64()) >= 0); + assert_se(rename(d, suffixed) >= 0); + usleep_safe(100 * USEC_PER_MSEC); + + assert_se(asprintf(&suffixed2, "%s.%" PRIx64, d, random_u64()) >= 0); + assert_se(symlink(suffixed2, d) >= 0); + usleep_safe(100 * USEC_PER_MSEC); + + assert_se(symlink(basename(suffixed), suffixed2) >= 0); + usleep_safe(100 * USEC_PER_MSEC); + + socklen_t sa_len; + r = sockaddr_un_set_path(&u.un, path); + assert_se(r >= 0); + sa_len = r; + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(fd >= 0); + + assert_se(bind(fd, &u.sa, sa_len) >= 0); + usleep_safe(100 * USEC_PER_MSEC); + + assert_se(listen(fd, SOMAXCONN_DELUXE) >= 0); 
+ usleep_safe(100 * USEC_PER_MSEC); + + assert_se(touch(path) >= 0); + usleep_safe(100 * USEC_PER_MSEC); + + log_debug("Initialized server"); + + for (;;) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + sd_id128_t id; + int bus_fd, code; + + assert_se(sd_id128_randomize(&id) >= 0); + + assert_se(sd_event_new(&event) >= 0); + + bus_fd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + assert_se(bus_fd >= 0); + + log_debug("Accepted server connection"); + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_exit_on_disconnect(bus, true) >= 0); + assert_se(sd_bus_set_description(bus, "server") >= 0); + assert_se(sd_bus_set_fd(bus, bus_fd, bus_fd) >= 0); + assert_se(sd_bus_set_server(bus, true, id) >= 0); + /* assert_se(sd_bus_set_anonymous(bus, true) >= 0); */ + + assert_se(sd_bus_attach_event(bus, event, 0) >= 0); + + assert_se(sd_bus_add_object_vtable(bus, NULL, "/foo", "foo.TestInterface", vtable, NULL) >= 0); + + assert_se(sd_bus_start(bus) >= 0); + + assert_se(sd_event_loop(event) >= 0); + + assert_se(sd_event_get_exit_code(event, &code) >= 0); + + if (code > 0) + break; + } + + log_debug("Server done"); + + return NULL; +} + +static void* thread_client1(void *p) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + const char *path = p, *t; + int r; + + log_debug("Initializing client1"); + + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_description(bus, "client1") >= 0); + + t = strjoina("unix:path=", path); + assert_se(sd_bus_set_address(bus, t) >= 0); + assert_se(sd_bus_set_watch_bind(bus, true) >= 0); + assert_se(sd_bus_start(bus) >= 0); + + r = sd_bus_call_method(bus, "foo.bar", "/foo", "foo.TestInterface", "Foobar", &error, NULL, NULL); + assert_se(r >= 0); + + log_debug("Client1 done"); + + return NULL; +} + +static int client2_callback(sd_bus_message *m, void *userdata, sd_bus_error 
*ret_error) { + assert_se(sd_bus_message_is_method_error(m, NULL) == 0); + assert_se(sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0) >= 0); + return 0; +} + +static void* thread_client2(void *p) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + const char *path = p, *t; + + log_debug("Initializing client2"); + + assert_se(sd_event_new(&event) >= 0); + assert_se(sd_bus_new(&bus) >= 0); + assert_se(sd_bus_set_description(bus, "client2") >= 0); + + t = strjoina("unix:path=", path); + assert_se(sd_bus_set_address(bus, t) >= 0); + assert_se(sd_bus_set_watch_bind(bus, true) >= 0); + assert_se(sd_bus_attach_event(bus, event, 0) >= 0); + assert_se(sd_bus_start(bus) >= 0); + + assert_se(sd_bus_call_method_async(bus, NULL, "foo.bar", "/foo", "foo.TestInterface", "Foobar", client2_callback, NULL, NULL) >= 0); + + assert_se(sd_event_loop(event) >= 0); + + log_debug("Client2 done"); + + return NULL; +} + +static void request_exit(const char *path) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + const char *t; + + assert_se(sd_bus_new(&bus) >= 0); + + t = strjoina("unix:path=", path); + assert_se(sd_bus_set_address(bus, t) >= 0); + assert_se(sd_bus_set_watch_bind(bus, true) >= 0); + assert_se(sd_bus_set_description(bus, "request-exit") >= 0); + assert_se(sd_bus_start(bus) >= 0); + + assert_se(sd_bus_call_method(bus, "foo.bar", "/foo", "foo.TestInterface", "Exit", NULL, NULL, NULL) >= 0); +} + +int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + pthread_t server, client1, client2; + char *path; + + test_setup_logging(LOG_DEBUG); + + /* We use /dev/shm here rather than /tmp, since some weird distros might set up /tmp as some weird fs that + * doesn't support inotify properly. 
*/ + assert_se(mkdtemp_malloc("/dev/shm/systemd-watch-bind-XXXXXX", &d) >= 0); + + path = strjoina(d, "/this/is/a/socket"); + + assert_se(pthread_create(&server, NULL, thread_server, path) == 0); + assert_se(pthread_create(&client1, NULL, thread_client1, path) == 0); + assert_se(pthread_create(&client2, NULL, thread_client2, path) == 0); + + assert_se(pthread_join(client1, NULL) == 0); + assert_se(pthread_join(client2, NULL) == 0); + + request_exit(path); + + assert_se(pthread_join(server, NULL) == 0); + + return 0; +} diff --git a/src/libsystemd/sd-bus/test-vtable-data.h b/src/libsystemd/sd-bus/test-vtable-data.h new file mode 100644 index 0000000..7269a49 --- /dev/null +++ b/src/libsystemd/sd-bus/test-vtable-data.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* This is meant to be included in other files, hence no headers */ + +struct context { + bool quit; + char *something; + char *automatic_string_property; + uint32_t automatic_integer_property; +}; + +static int handler(sd_bus_message *m, void *userdata, sd_bus_error *error) { + return 1; +} + +static int value_handler(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error) { + return 1; +} + +static int get_handler(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error) { + return 1; +} + +static int set_handler(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error) { + return 1; +} + +static const sd_bus_vtable test_vtable_1[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("Hello", "ssas", "a(uu)", NULL, 0), + SD_BUS_METHOD("DeprecatedHello", "", "", NULL, SD_BUS_VTABLE_DEPRECATED), + SD_BUS_METHOD("DeprecatedHelloNoReply", "", "", NULL, SD_BUS_VTABLE_DEPRECATED|SD_BUS_VTABLE_METHOD_NO_REPLY), + SD_BUS_SIGNAL("Wowza", "sss", 0), + 
SD_BUS_SIGNAL("DeprecatedWowza", "ut", SD_BUS_VTABLE_DEPRECATED), + SD_BUS_WRITABLE_PROPERTY("AProperty", "s", get_handler, set_handler, 0, 0), + SD_BUS_PROPERTY("AReadOnlyDeprecatedProperty", "(ut)", get_handler, 0, SD_BUS_VTABLE_DEPRECATED), + SD_BUS_PROPERTY("ChangingProperty", "t", get_handler, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Invalidating", "t", get_handler, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("Constant", "t", get_handler, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_PROPERTY_EXPLICIT), + SD_BUS_VTABLE_END +}; + +static const sd_bus_vtable test_vtable_2[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("AlterSomething", "s", "s", handler, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Exit", "", "", handler, 0), + SD_BUS_METHOD_WITH_OFFSET("AlterSomething2", "s", "s", handler, 200, 0), + SD_BUS_METHOD_WITH_OFFSET("Exit2", "", "", handler, 200, 0), + SD_BUS_METHOD_WITH_NAMES_OFFSET("AlterSomething3", "so", SD_BUS_PARAM(string) SD_BUS_PARAM(path), + "s", SD_BUS_PARAM(returnstring), handler, 200, 0), + SD_BUS_METHOD_WITH_NAMES("Exit3", "bx", SD_BUS_PARAM(with_confirmation) SD_BUS_PARAM(after_msec), + "bb", SD_BUS_PARAM(accepted) SD_BUS_PARAM(scheduled), handler, 0), + SD_BUS_PROPERTY("Value", "s", value_handler, 10, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Value2", "s", value_handler, 10, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_PROPERTY("Value3", "s", value_handler, 10, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Value4", "s", value_handler, 10, 0), + SD_BUS_PROPERTY("AnExplicitProperty", "s", NULL, offsetof(struct context, something), + SD_BUS_VTABLE_PROPERTY_EXPLICIT|SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), + SD_BUS_WRITABLE_PROPERTY("Something", "s", get_handler, set_handler, 0, 0), + SD_BUS_WRITABLE_PROPERTY("AutomaticStringProperty", "s", NULL, NULL, + offsetof(struct context, automatic_string_property), 0), + SD_BUS_WRITABLE_PROPERTY("AutomaticIntegerProperty", "u", 
NULL, NULL, + offsetof(struct context, automatic_integer_property), 0), + SD_BUS_METHOD("NoOperation", NULL, NULL, NULL, 0), + SD_BUS_SIGNAL("DummySignal", "b", 0), + SD_BUS_SIGNAL("DummySignal2", "so", 0), + SD_BUS_SIGNAL_WITH_NAMES("DummySignal3", "so", SD_BUS_PARAM(string) SD_BUS_PARAM(path), 0), + SD_BUS_VTABLE_END +}; + +static const sd_bus_vtable test_vtable_deprecated[] = { + SD_BUS_VTABLE_START(SD_BUS_VTABLE_DEPRECATED), + SD_BUS_VTABLE_END +}; + +struct sd_bus_vtable_221 { + uint8_t type:8; + uint64_t flags:56; + union { + struct { + size_t element_size; + } start; + struct { + const char *member; + const char *signature; + const char *result; + sd_bus_message_handler_t handler; + size_t offset; + } method; + struct { + const char *member; + const char *signature; + } signal; + struct { + const char *member; + const char *signature; + sd_bus_property_get_t get; + sd_bus_property_set_t set; + size_t offset; + } property; + } x; +}; + +static const struct sd_bus_vtable_221 vtable_format_221[] = { + { + .type = _SD_BUS_VTABLE_START, + .flags = 0, + .x = { + .start = { + .element_size = sizeof(struct sd_bus_vtable_221) + }, + }, + }, + { + .type = _SD_BUS_VTABLE_METHOD, + .flags = 0, + .x = { + .method = { + .member = "Exit", + .signature = "", + .result = "", + .handler = handler, + .offset = 0, + }, + }, + }, + { + .type = _SD_BUS_VTABLE_END, + .flags = 0, + .x = { { 0 } }, + } +}; diff --git a/src/libsystemd/sd-daemon/sd-daemon.c b/src/libsystemd/sd-daemon/sd-daemon.c new file mode 100644 index 0000000..6a60cde --- /dev/null +++ b/src/libsystemd/sd-daemon/sd-daemon.c @@ -0,0 +1,775 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "io-util.h" +#include "iovec-util.h" +#include "parse-util.h" +#include "path-util.h" +#include 
"process-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "strv.h" +#include "time-util.h" + +#define SNDBUF_SIZE (8*1024*1024) + +static void unsetenv_all(bool unset_environment) { + if (!unset_environment) + return; + + assert_se(unsetenv("LISTEN_PID") == 0); + assert_se(unsetenv("LISTEN_FDS") == 0); + assert_se(unsetenv("LISTEN_FDNAMES") == 0); +} + +_public_ int sd_listen_fds(int unset_environment) { + const char *e; + int n, r; + pid_t pid; + + e = getenv("LISTEN_PID"); + if (!e) { + r = 0; + goto finish; + } + + r = parse_pid(e, &pid); + if (r < 0) + goto finish; + + /* Is this for us? */ + if (getpid_cached() != pid) { + r = 0; + goto finish; + } + + e = getenv("LISTEN_FDS"); + if (!e) { + r = 0; + goto finish; + } + + r = safe_atoi(e, &n); + if (r < 0) + goto finish; + + assert_cc(SD_LISTEN_FDS_START < INT_MAX); + if (n <= 0 || n > INT_MAX - SD_LISTEN_FDS_START) { + r = -EINVAL; + goto finish; + } + + for (int fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd ++) { + r = fd_cloexec(fd, true); + if (r < 0) + goto finish; + } + + r = n; + +finish: + unsetenv_all(unset_environment); + return r; +} + +_public_ int sd_listen_fds_with_names(int unset_environment, char ***names) { + _cleanup_strv_free_ char **l = NULL; + bool have_names; + int n_names = 0, n_fds; + const char *e; + int r; + + if (!names) + return sd_listen_fds(unset_environment); + + e = getenv("LISTEN_FDNAMES"); + if (e) { + n_names = strv_split_full(&l, e, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (n_names < 0) { + unsetenv_all(unset_environment); + return n_names; + } + + have_names = true; + } else + have_names = false; + + n_fds = sd_listen_fds(unset_environment); + if (n_fds <= 0) + return n_fds; + + if (have_names) { + if (n_names != n_fds) + return -EINVAL; + } else { + r = strv_extend_n(&l, "unknown", n_fds); + if (r < 0) + return r; + } + + *names = TAKE_PTR(l); + + return n_fds; +} + +_public_ int sd_is_fifo(int fd, const char *path) { + struct stat st_fd; 
+ + assert_return(fd >= 0, -EBADF); + + if (fstat(fd, &st_fd) < 0) + return -errno; + + if (!S_ISFIFO(st_fd.st_mode)) + return 0; + + if (path) { + struct stat st_path; + + if (stat(path, &st_path) < 0) { + + if (IN_SET(errno, ENOENT, ENOTDIR)) + return 0; + + return -errno; + } + + return stat_inode_same(&st_path, &st_fd); + } + + return 1; +} + +_public_ int sd_is_special(int fd, const char *path) { + struct stat st_fd; + + assert_return(fd >= 0, -EBADF); + + if (fstat(fd, &st_fd) < 0) + return -errno; + + if (!S_ISREG(st_fd.st_mode) && !S_ISCHR(st_fd.st_mode)) + return 0; + + if (path) { + struct stat st_path; + + if (stat(path, &st_path) < 0) { + + if (IN_SET(errno, ENOENT, ENOTDIR)) + return 0; + + return -errno; + } + + if (S_ISREG(st_fd.st_mode) && S_ISREG(st_path.st_mode)) + return stat_inode_same(&st_path, &st_fd); + else if (S_ISCHR(st_fd.st_mode) && S_ISCHR(st_path.st_mode)) + return st_path.st_rdev == st_fd.st_rdev; + else + return 0; + } + + return 1; +} + +static int is_socket_internal(int fd, int type, int listening) { + struct stat st_fd; + + assert_return(fd >= 0, -EBADF); + assert_return(type >= 0, -EINVAL); + + if (fstat(fd, &st_fd) < 0) + return -errno; + + if (!S_ISSOCK(st_fd.st_mode)) + return 0; + + if (type != 0) { + int other_type = 0; + socklen_t l = sizeof(other_type); + + if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &other_type, &l) < 0) + return -errno; + + if (l != sizeof(other_type)) + return -EINVAL; + + if (other_type != type) + return 0; + } + + if (listening >= 0) { + int accepting = 0; + socklen_t l = sizeof(accepting); + + if (getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &accepting, &l) < 0) + return -errno; + + if (l != sizeof(accepting)) + return -EINVAL; + + if (!accepting != !listening) + return 0; + } + + return 1; +} + +_public_ int sd_is_socket(int fd, int family, int type, int listening) { + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(family >= 0, -EINVAL); + + r = is_socket_internal(fd, type, listening); + if (r 
<= 0) + return r; + + if (family > 0) { + union sockaddr_union sockaddr = {}; + socklen_t l = sizeof(sockaddr); + + if (getsockname(fd, &sockaddr.sa, &l) < 0) + return -errno; + + if (l < sizeof(sa_family_t)) + return -EINVAL; + + return sockaddr.sa.sa_family == family; + } + + return 1; +} + +_public_ int sd_is_socket_inet(int fd, int family, int type, int listening, uint16_t port) { + union sockaddr_union sockaddr = {}; + socklen_t l = sizeof(sockaddr); + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(IN_SET(family, 0, AF_INET, AF_INET6), -EINVAL); + + r = is_socket_internal(fd, type, listening); + if (r <= 0) + return r; + + if (getsockname(fd, &sockaddr.sa, &l) < 0) + return -errno; + + if (l < sizeof(sa_family_t)) + return -EINVAL; + + if (!IN_SET(sockaddr.sa.sa_family, AF_INET, AF_INET6)) + return 0; + + if (family != 0) + if (sockaddr.sa.sa_family != family) + return 0; + + if (port > 0) { + unsigned sa_port; + + r = sockaddr_port(&sockaddr.sa, &sa_port); + if (r < 0) + return r; + + return port == sa_port; + } + + return 1; +} + +_public_ int sd_is_socket_sockaddr(int fd, int type, const struct sockaddr* addr, unsigned addr_len, int listening) { + union sockaddr_union sockaddr = {}; + socklen_t l = sizeof(sockaddr); + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(addr, -EINVAL); + assert_return(addr_len >= sizeof(sa_family_t), -ENOBUFS); + assert_return(IN_SET(addr->sa_family, AF_INET, AF_INET6), -EPFNOSUPPORT); + + r = is_socket_internal(fd, type, listening); + if (r <= 0) + return r; + + if (getsockname(fd, &sockaddr.sa, &l) < 0) + return -errno; + + if (l < sizeof(sa_family_t)) + return -EINVAL; + + if (sockaddr.sa.sa_family != addr->sa_family) + return 0; + + if (sockaddr.sa.sa_family == AF_INET) { + const struct sockaddr_in *in = (const struct sockaddr_in *) addr; + + if (l < sizeof(struct sockaddr_in) || addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (in->sin_port != 0 && + sockaddr.in.sin_port != 
in->sin_port) + return false; + + return sockaddr.in.sin_addr.s_addr == in->sin_addr.s_addr; + + } else { + const struct sockaddr_in6 *in = (const struct sockaddr_in6 *) addr; + + if (l < sizeof(struct sockaddr_in6) || addr_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + + if (in->sin6_port != 0 && + sockaddr.in6.sin6_port != in->sin6_port) + return false; + + if (in->sin6_flowinfo != 0 && + sockaddr.in6.sin6_flowinfo != in->sin6_flowinfo) + return false; + + if (in->sin6_scope_id != 0 && + sockaddr.in6.sin6_scope_id != in->sin6_scope_id) + return false; + + return memcmp(sockaddr.in6.sin6_addr.s6_addr, in->sin6_addr.s6_addr, + sizeof(in->sin6_addr.s6_addr)) == 0; + } +} + +_public_ int sd_is_socket_unix(int fd, int type, int listening, const char *path, size_t length) { + union sockaddr_union sockaddr = {}; + socklen_t l = sizeof(sockaddr); + int r; + + assert_return(fd >= 0, -EBADF); + + r = is_socket_internal(fd, type, listening); + if (r <= 0) + return r; + + if (getsockname(fd, &sockaddr.sa, &l) < 0) + return -errno; + + if (l < sizeof(sa_family_t)) + return -EINVAL; + + if (sockaddr.sa.sa_family != AF_UNIX) + return 0; + + if (path) { + if (length == 0) + length = strlen(path); + + if (length == 0) + /* Unnamed socket */ + return l == offsetof(struct sockaddr_un, sun_path); + + if (path[0]) + /* Normal path socket */ + return + (l >= offsetof(struct sockaddr_un, sun_path) + length + 1) && + memcmp(path, sockaddr.un.sun_path, length+1) == 0; + else + /* Abstract namespace socket */ + return + (l == offsetof(struct sockaddr_un, sun_path) + length) && + memcmp(path, sockaddr.un.sun_path, length) == 0; + } + + return 1; +} + +_public_ int sd_is_mq(int fd, const char *path) { + struct mq_attr attr; + + /* Check that the fd is valid */ + assert_return(fcntl(fd, F_GETFD) >= 0, -errno); + + if (mq_getattr(fd, &attr) < 0) { + if (errno == EBADF) + /* A non-mq fd (or an invalid one, but we ruled that out above) */ + return 0; + return -errno; + } + + if (path) { 
+ _cleanup_free_ char *fpath = NULL; + struct stat a, b; + + assert_return(path_is_absolute(path), -EINVAL); + + if (fstat(fd, &a) < 0) + return -errno; + + fpath = path_join("/dev/mqueue", path); + if (!fpath) + return -ENOMEM; + + if (stat(fpath, &b) < 0) + return -errno; + + if (!stat_inode_same(&a, &b)) + return 0; + } + + return 1; +} + +static int vsock_bind_privileged_port(int fd) { + union sockaddr_union sa = { + .vm.svm_family = AF_VSOCK, + .vm.svm_cid = VMADDR_CID_ANY, + .vm.svm_port = 1023, + }; + int r; + + assert(fd >= 0); + + do + r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm))); + while (r == -EADDRINUSE && --sa.vm.svm_port > 0); + + return r; +} + +static int pid_notify_with_fds_internal( + pid_t pid, + const char *state, + const int *fds, + unsigned n_fds) { + SocketAddress address; + struct iovec iovec; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_name = &address.sockaddr, + }; + _cleanup_close_ int fd = -EBADF; + struct cmsghdr *cmsg = NULL; + const char *e; + bool send_ucred; + ssize_t n; + int type, r; + + if (!state) + return -EINVAL; + + if (n_fds > 0 && !fds) + return -EINVAL; + + e = getenv("NOTIFY_SOCKET"); + if (!e) + return 0; + + /* Allow AF_UNIX and AF_VSOCK, reject the rest. */ + r = socket_address_parse_unix(&address, e); + if (r == -EPROTO) + r = socket_address_parse_vsock(&address, e); + if (r < 0) + return r; + msghdr.msg_namelen = address.size; + + /* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out, + * we always require a specific CID. */ + if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) + return -EINVAL; + + type = address.type == 0 ? SOCK_DGRAM : address.type; + + /* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns + * ENODEV. Fallback to SOCK_SEQPACKET in that case. 
*/ + fd = socket(address.sockaddr.sa.sa_family, type|SOCK_CLOEXEC, 0); + if (fd < 0) { + if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK || address.type > 0) + return log_debug_errno(errno, "Failed to open %s notify socket to '%s': %m", socket_address_type_to_string(type), e); + + type = SOCK_SEQPACKET; + fd = socket(address.sockaddr.sa.sa_family, type|SOCK_CLOEXEC, 0); + if (fd < 0 && ERRNO_IS_NOT_SUPPORTED(errno)) { + type = SOCK_STREAM; + fd = socket(address.sockaddr.sa.sa_family, type|SOCK_CLOEXEC, 0); + } + if (fd < 0) + return log_debug_errno(errno, "Failed to open %s socket to '%s': %m", socket_address_type_to_string(type), e); + } + + if (address.sockaddr.sa.sa_family == AF_VSOCK) { + r = vsock_bind_privileged_port(fd); + if (r < 0 && !ERRNO_IS_PRIVILEGE(r)) + return log_debug_errno(r, "Failed to bind socket to privileged port: %m"); + } + + if (IN_SET(type, SOCK_STREAM, SOCK_SEQPACKET)) { + if (connect(fd, &address.sockaddr.sa, address.size) < 0) + return log_debug_errno(errno, "Failed to connect socket to '%s': %m", e); + + msghdr.msg_name = NULL; + msghdr.msg_namelen = 0; + } + + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + iovec = IOVEC_MAKE_STRING(state); + + send_ucred = + (pid != 0 && pid != getpid_cached()) || + getuid() != geteuid() || + getgid() != getegid(); + + if (n_fds > 0 || send_ucred) { + /* CMSG_SPACE(0) may return value different than zero, which results in miscalculated controllen. */ + msghdr.msg_controllen = + (n_fds > 0 ? CMSG_SPACE(sizeof(int) * n_fds) : 0) + + (send_ucred ? 
CMSG_SPACE(sizeof(struct ucred)) : 0); + + msghdr.msg_control = alloca0(msghdr.msg_controllen); + + cmsg = CMSG_FIRSTHDR(&msghdr); + if (n_fds > 0) { + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * n_fds); + + memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * n_fds); + + if (send_ucred) + assert_se(cmsg = CMSG_NXTHDR(&msghdr, cmsg)); + } + + if (send_ucred) { + struct ucred *ucred; + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + + ucred = CMSG_TYPED_DATA(cmsg, struct ucred); + ucred->pid = pid != 0 ? pid : getpid_cached(); + ucred->uid = getuid(); + ucred->gid = getgid(); + } + } + + do { + /* First try with fake ucred data, as requested */ + n = sendmsg(fd, &msghdr, MSG_NOSIGNAL); + if (n < 0) { + if (!send_ucred) + return log_debug_errno(errno, "Failed to send notify message to '%s': %m", e); + + /* If that failed, try with our own ucred instead */ + msghdr.msg_controllen -= CMSG_SPACE(sizeof(struct ucred)); + if (msghdr.msg_controllen == 0) + msghdr.msg_control = NULL; + + n = 0; + send_ucred = false; + } else { + /* Unless we're using SOCK_STREAM, we expect to write all the contents immediately. */ + if (type != SOCK_STREAM && (size_t) n < iovec_total_size(msghdr.msg_iov, msghdr.msg_iovlen)) + return -EIO; + + /* Make sure we only send fds and ucred once, even if we're using SOCK_STREAM. 
*/ + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + } + } while (!iovec_increment(msghdr.msg_iov, msghdr.msg_iovlen, n)); + + return 1; +} + +_public_ int sd_pid_notify_with_fds( + pid_t pid, + int unset_environment, + const char *state, + const int *fds, + unsigned n_fds) { + + int r; + + r = pid_notify_with_fds_internal(pid, state, fds, n_fds); + + if (unset_environment) + assert_se(unsetenv("NOTIFY_SOCKET") == 0); + + return r; +} + +_public_ int sd_pid_notify_barrier(pid_t pid, int unset_environment, uint64_t timeout) { + _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR; + int r; + + if (pipe2(pipe_fd, O_CLOEXEC) < 0) + return -errno; + + r = sd_pid_notify_with_fds(pid, unset_environment, "BARRIER=1", &pipe_fd[1], 1); + if (r <= 0) + return r; + + pipe_fd[1] = safe_close(pipe_fd[1]); + + r = fd_wait_for_event(pipe_fd[0], 0 /* POLLHUP is implicit */, timeout); + if (r < 0) + return r; + if (r == 0) + return -ETIMEDOUT; + + return 1; +} + +_public_ int sd_notify_barrier(int unset_environment, uint64_t timeout) { + return sd_pid_notify_barrier(0, unset_environment, timeout); +} + +_public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state) { + return sd_pid_notify_with_fds(pid, unset_environment, state, NULL, 0); +} + +_public_ int sd_notify(int unset_environment, const char *state) { + return sd_pid_notify_with_fds(0, unset_environment, state, NULL, 0); +} + +_public_ int sd_pid_notifyf(pid_t pid, int unset_environment, const char *format, ...) { + _cleanup_free_ char *p = NULL; + int r; + + if (format) { + va_list ap; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0 || !p) + return -ENOMEM; + } + + return sd_pid_notify(pid, unset_environment, p); +} + +_public_ int sd_notifyf(int unset_environment, const char *format, ...) 
{ + _cleanup_free_ char *p = NULL; + int r; + + if (format) { + va_list ap; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0 || !p) + return -ENOMEM; + } + + return sd_pid_notify(0, unset_environment, p); +} + +_public_ int sd_pid_notifyf_with_fds( + pid_t pid, + int unset_environment, + const int *fds, size_t n_fds, + const char *format, ...) { + + _cleanup_free_ char *p = NULL; + int r; + + /* Paranoia check: we traditionally used 'unsigned' as array size, but we nowadays more correctly use + * 'size_t'. sd_pid_notifyf_with_fds() and sd_pid_notify_with_fds() are from different eras, hence + * differ in this. Let's catch resulting incompatibilities early, even though they are pretty much + * theoretical only */ + if (n_fds > UINT_MAX) + return -E2BIG; + + if (format) { + va_list ap; + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0 || !p) + return -ENOMEM; + } + + return sd_pid_notify_with_fds(pid, unset_environment, p, fds, n_fds); +} + +_public_ int sd_booted(void) { + /* We test whether the runtime unit file directory has been + * created. This takes place in mount-setup.c, so is + * guaranteed to happen very early during boot. */ + + if (laccess("/run/systemd/system/", F_OK) >= 0) + return true; + + if (errno == ENOENT) + return false; + + return -errno; +} + +_public_ int sd_watchdog_enabled(int unset_environment, uint64_t *usec) { + const char *s, *p = ""; /* p is set to dummy value to do unsetting */ + uint64_t u; + int r = 0; + + s = getenv("WATCHDOG_USEC"); + if (!s) + goto finish; + + r = safe_atou64(s, &u); + if (r < 0) + goto finish; + if (!timestamp_is_set(u)) { + r = -EINVAL; + goto finish; + } + + p = getenv("WATCHDOG_PID"); + if (p) { + pid_t pid; + + r = parse_pid(p, &pid); + if (r < 0) + goto finish; + + /* Is this for us? 
*/ + if (getpid_cached() != pid) { + r = 0; + goto finish; + } + } + + if (usec) + *usec = u; + + r = 1; + +finish: + if (unset_environment && s) + assert_se(unsetenv("WATCHDOG_USEC") == 0); + if (unset_environment && p) + assert_se(unsetenv("WATCHDOG_PID") == 0); + + return r; +} diff --git a/src/libsystemd/sd-device/device-enumerator-private.h b/src/libsystemd/sd-device/device-enumerator-private.h new file mode 100644 index 0000000..cf62fab --- /dev/null +++ b/src/libsystemd/sd-device/device-enumerator-private.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-device.h" + +typedef enum MatchInitializedType { + MATCH_INITIALIZED_NO, /* only devices without a db entry */ + MATCH_INITIALIZED_YES, /* only devices with a db entry */ + MATCH_INITIALIZED_ALL, /* all devices */ + MATCH_INITIALIZED_COMPAT, /* only devices that have no devnode/ifindex or have a db entry */ + _MATCH_INITIALIZED_MAX, + _MATCH_INITIALIZED_INVALID = -EINVAL, +} MatchInitializedType; + +int device_enumerator_scan_devices(sd_device_enumerator *enumerator); +int device_enumerator_scan_subsystems(sd_device_enumerator *enumerator); +int device_enumerator_scan_devices_and_subsystems(sd_device_enumerator *enumerator); +int device_enumerator_add_device(sd_device_enumerator *enumerator, sd_device *device); +int device_enumerator_add_parent_devices(sd_device_enumerator *enumerator, sd_device *device); +int device_enumerator_add_match_is_initialized(sd_device_enumerator *enumerator, MatchInitializedType type); +int device_enumerator_add_match_parent_incremental(sd_device_enumerator *enumerator, sd_device *parent); +int device_enumerator_add_prioritized_subsystem(sd_device_enumerator *enumerator, const char *subsystem); +sd_device *device_enumerator_get_first(sd_device_enumerator *enumerator); +sd_device *device_enumerator_get_next(sd_device_enumerator *enumerator); +sd_device **device_enumerator_get_devices(sd_device_enumerator *enumerator, 
size_t *ret_n_devices); + /* Iterates over the enumerator's sorted device array using device_enumerator_get_first()/_get_next(); neither triggers a scan, so a scan must have been done beforehand. */ +#define FOREACH_DEVICE_AND_SUBSYSTEM(enumerator, device) \ + for (device = device_enumerator_get_first(enumerator); \ + device; \ + device = device_enumerator_get_next(enumerator)) diff --git a/src/libsystemd/sd-device/device-enumerator.c b/src/libsystemd/sd-device/device-enumerator.c new file mode 100644 index 0000000..15c5c42 --- /dev/null +++ b/src/libsystemd/sd-device/device-enumerator.c @@ -0,0 +1,1194 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-enumerator-private.h" +#include "device-filter.h" +#include "device-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "set.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" + /* Records which kind of scan last filled the enumerator: devices, subsystems, or both. */ +typedef enum DeviceEnumerationType { + DEVICE_ENUMERATION_TYPE_DEVICES, + DEVICE_ENUMERATION_TYPE_SUBSYSTEMS, + DEVICE_ENUMERATION_TYPE_ALL, + _DEVICE_ENUMERATION_TYPE_MAX, + _DEVICE_ENUMERATION_TYPE_INVALID = -EINVAL, +} DeviceEnumerationType; + /* Enumerator state: the scan results (hashmap keyed by syspath plus a sorted array with a cursor) followed by the configured match filters. */ +struct sd_device_enumerator { + unsigned n_ref; + + DeviceEnumerationType type; + Hashmap *devices_by_syspath; + sd_device **devices; + size_t n_devices, current_device_index; + bool scan_uptodate; + bool sorted; + + char **prioritized_subsystems; + Set *match_subsystem; + Set *nomatch_subsystem; + Hashmap *match_sysattr; + Hashmap *nomatch_sysattr; + Hashmap *match_property; + Hashmap *match_property_required; + Set *match_sysname; + Set *nomatch_sysname; + Set *match_tag; + Set *match_parent; + MatchInitializedType match_initialized; +}; + +_public_ int sd_device_enumerator_new(sd_device_enumerator **ret) { /* Allocates an enumerator holding one reference, with no scan type yet and the COMPAT initialization filter. Returns 0 on success, -ENOMEM on allocation failure. */ + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *enumerator = NULL; + + assert(ret); + + enumerator = new(sd_device_enumerator, 1); + if (!enumerator) + return -ENOMEM; + + *enumerator = (sd_device_enumerator) { + .n_ref = 1, + .type = _DEVICE_ENUMERATION_TYPE_INVALID, + .match_initialized = MATCH_INITIALIZED_COMPAT, + 
}; + + *ret = TAKE_PTR(enumerator); + + return 0; +} + /* Drops one reference on each of the first n entries of devices. */ +static void device_unref_many(sd_device **devices, size_t n) { + assert(devices || n == 0); + + for (size_t i = 0; i < n; i++) + sd_device_unref(devices[i]); +} + /* Releases every device the enumerator holds: the entries of the syspath hashmap and the sorted array, which is freed and reset to empty. */ +static void device_enumerator_unref_devices(sd_device_enumerator *enumerator) { + assert(enumerator); + + hashmap_clear_with_destructor(enumerator->devices_by_syspath, sd_device_unref); + device_unref_many(enumerator->devices, enumerator->n_devices); + enumerator->devices = mfree(enumerator->devices); + enumerator->n_devices = 0; +} + /* Destructor handed to the ref/unref macro below: releases the devices, every filter container and finally the enumerator itself. */ +static sd_device_enumerator *device_enumerator_free(sd_device_enumerator *enumerator) { + assert(enumerator); + + device_enumerator_unref_devices(enumerator); + + hashmap_free(enumerator->devices_by_syspath); + strv_free(enumerator->prioritized_subsystems); + set_free(enumerator->match_subsystem); + set_free(enumerator->nomatch_subsystem); + hashmap_free(enumerator->match_sysattr); + hashmap_free(enumerator->nomatch_sysattr); + hashmap_free(enumerator->match_property); + hashmap_free(enumerator->match_property_required); + set_free(enumerator->match_sysname); + set_free(enumerator->nomatch_sysname); + set_free(enumerator->match_tag); + set_free(enumerator->match_parent); + + return mfree(enumerator); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_device_enumerator, sd_device_enumerator, device_enumerator_free); + /* Appends subsystem to the prioritized list. Returns 0 if it is already listed, 1 if it was added (which invalidates the current scan), negative on failure. */ +int device_enumerator_add_prioritized_subsystem(sd_device_enumerator *enumerator, const char *subsystem) { + int r; + + assert(enumerator); + assert(subsystem); + + if (strv_contains(enumerator->prioritized_subsystems, subsystem)) + return 0; + + r = strv_extend(&enumerator->prioritized_subsystems, subsystem); + if (r < 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Adds subsystem to the match set (match != 0) or to the nomatch set (match == 0). Returns 1 and invalidates the scan if set_put_strdup() reports a positive result; otherwise passes its result (0 or negative) through. */ +_public_ int sd_device_enumerator_add_match_subsystem(sd_device_enumerator *enumerator, const char *subsystem, int match) { + Set **set; + int r; + + assert_return(enumerator, -EINVAL); + assert_return(subsystem, 
-EINVAL); + + if (match) + set = &enumerator->match_subsystem; + else + set = &enumerator->nomatch_subsystem; + + r = set_put_strdup(set, subsystem); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Registers a sysattr pattern in the match (match != 0) or nomatch map. update_match_strv() is called with clear_on_null = true, so a NULL value is handled by its clear-on-null path. */ +_public_ int sd_device_enumerator_add_match_sysattr(sd_device_enumerator *enumerator, const char *sysattr, const char *value, int match) { + Hashmap **hashmap; + int r; + + assert_return(enumerator, -EINVAL); + assert_return(sysattr, -EINVAL); + + if (match) + hashmap = &enumerator->match_sysattr; + else + hashmap = &enumerator->nomatch_sysattr; + + r = update_match_strv(hashmap, sysattr, value, /* clear_on_null = */ true); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Registers a property pattern in match_property, the map that test_matches() evaluates with match_all = false (one matching entry is enough). */ +_public_ int sd_device_enumerator_add_match_property(sd_device_enumerator *enumerator, const char *property, const char *value) { + int r; + + assert_return(enumerator, -EINVAL); + assert_return(property, -EINVAL); + + r = update_match_strv(&enumerator->match_property, property, value, /* clear_on_null = */ false); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Registers a property pattern in match_property_required, the map that test_matches() evaluates with match_all = true (every entry must match). */ +_public_ int sd_device_enumerator_add_match_property_required(sd_device_enumerator *enumerator, const char *property, const char *value) { + int r; + + assert_return(enumerator, -EINVAL); + assert_return(property, -EINVAL); + + r = update_match_strv(&enumerator->match_property_required, property, value, /* clear_on_null = */ false); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Shared implementation of the public sysname match/nomatch setters. */ +static int device_enumerator_add_match_sysname(sd_device_enumerator *enumerator, const char *sysname, bool match) { + int r; + + assert_return(enumerator, -EINVAL); + assert_return(sysname, -EINVAL); + + r = set_put_strdup(match ? 
&enumerator->match_sysname : &enumerator->nomatch_sysname, sysname); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + +_public_ int sd_device_enumerator_add_match_sysname(sd_device_enumerator *enumerator, const char *sysname) { + return device_enumerator_add_match_sysname(enumerator, sysname, true); +} + +_public_ int sd_device_enumerator_add_nomatch_sysname(sd_device_enumerator *enumerator, const char *sysname) { + return device_enumerator_add_match_sysname(enumerator, sysname, false); +} + /* Adds tag to match_tag. Note that match_tag() requires a device to carry every registered tag. */ +_public_ int sd_device_enumerator_add_match_tag(sd_device_enumerator *enumerator, const char *tag) { + int r; + + assert_return(enumerator, -EINVAL); + assert_return(tag, -EINVAL); + + r = set_put_strdup(&enumerator->match_tag, tag); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Adds the syspath of parent to the parent filter while keeping the parents added earlier. */ +int device_enumerator_add_match_parent_incremental(sd_device_enumerator *enumerator, sd_device *parent) { + const char *path; + int r; + + assert(enumerator); + assert(parent); + + r = sd_device_get_syspath(parent, &path); + if (r < 0) + return r; + + r = set_put_strdup(&enumerator->match_parent, path); + if (r <= 0) + return r; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Replaces the parent filter: clears the existing entries first, then adds parent. */ +_public_ int sd_device_enumerator_add_match_parent(sd_device_enumerator *enumerator, sd_device *parent) { + assert_return(enumerator, -EINVAL); + assert_return(parent, -EINVAL); + + set_clear(enumerator->match_parent); + + return device_enumerator_add_match_parent_incremental(enumerator, parent); +} + /* Switches the initialization filter to MATCH_INITIALIZED_ALL, for which match_initialized() accepts every device. */ +_public_ int sd_device_enumerator_allow_uninitialized(sd_device_enumerator *enumerator) { + assert_return(enumerator, -EINVAL); + + enumerator->match_initialized = MATCH_INITIALIZED_ALL; + + enumerator->scan_uptodate = false; + + return 1; +} + /* Sets the initialization filter to type after range-checking it; always invalidates the scan. */ +int device_enumerator_add_match_is_initialized(sd_device_enumerator *enumerator, MatchInitializedType type) { + assert_return(enumerator, -EINVAL); + assert_return(type >= 0 && type < _MATCH_INITIALIZED_MAX, 
-EINVAL); + + enumerator->match_initialized = type; + + enumerator->scan_uptodate = false; + + return 1; +} + +/* Orders two devpaths that belong to the same sound card so that the card's control device + * ("controlC*") sorts last. Returns 0 if devpath_a is not below "/sound/card<N>/" or if the two + * paths do not share that card prefix, i.e. whenever this rule has no opinion on the order. */ +static int sound_device_compare(const char *devpath_a, const char *devpath_b) { + const char *sound_a, *sound_b; + size_t prefix_len; + + assert(devpath_a); + assert(devpath_b); + + /* For sound cards the control device must be enumerated last to make sure it's the final + * device node that gets ACLs applied. Applications rely on this fact and use ACL changes on + * the control node as an indicator that the ACL change of the entire sound card completed. The + * kernel makes this guarantee when creating those devices, and hence we should too when + * enumerating them. */ + + sound_a = strstrafter(devpath_a, "/sound/card"); + if (!sound_a) + return 0; + + /* Skip the card number: keep searching from just past "/sound/card". Searching from the start + * of devpath_a instead would find the very first '/', which for a devpath that begins with '/' + * makes the prefix empty, so the "/controlC" checks below could never match. */ + sound_a = strchr(sound_a, '/'); + if (!sound_a) + return 0; + + /* Length of ".../sound/card<N>", the part both paths must share. */ + prefix_len = sound_a - devpath_a; + + if (!strneq(devpath_a, devpath_b, prefix_len)) + return 0; + + sound_b = devpath_b + prefix_len; + + return CMP(!!startswith(sound_a, "/controlC"), + !!startswith(sound_b, "/controlC")); +} + +/* True for devpaths containing "/block/md" or "/block/dm-"; device_compare() sorts these last. */ +static bool devpath_is_late_block(const char *devpath) { + assert(devpath); + + return strstr(devpath, "/block/md") || strstr(devpath, "/block/dm-"); +} + +/* Sort comparator: the sound card rule first, then md/dm block devices after everything else, + * then plain path order. */ +static int device_compare(sd_device * const *a, sd_device * const *b) { + const char *devpath_a, *devpath_b; + int r; + + assert(a); + assert(b); + assert(*a); + assert(*b); + + assert_se(sd_device_get_devpath(*(sd_device**) a, &devpath_a) >= 0); + assert_se(sd_device_get_devpath(*(sd_device**) b, &devpath_b) >= 0); + + r = sound_device_compare(devpath_a, devpath_b); + if (r != 0) + return r; + + /* md and dm devices are enumerated after all other devices */ + r = CMP(devpath_is_late_block(devpath_a), devpath_is_late_block(devpath_b)); + if (r != 0) + return r; + + return path_compare(devpath_a, devpath_b); +} + +/* Rebuilds the sorted device array from the syspath hashmap: devices of prioritized subsystems + * first (each batch sorted with device_compare()), then all remaining devices. Does nothing if the + * array is already sorted. Returns 0 on success, negative on failure. */ +static int enumerator_sort_devices(sd_device_enumerator *enumerator) { + size_t n_sorted = 0, n = 0; + sd_device **devices; + sd_device *device; + int 
r; + + assert(enumerator); + + if (enumerator->sorted) + return 0; + + devices = new(sd_device*, hashmap_size(enumerator->devices_by_syspath)); + if (!devices) + return -ENOMEM; + + STRV_FOREACH(prioritized_subsystem, enumerator->prioritized_subsystems) { + /* Each round moves one device of this subsystem, plus those of its ancestors that are enumerated, from the hashmap into the array; rounds repeat until none is left, then the batch is sorted. */ + for (;;) { + const char *syspath; + size_t m = n; + + HASHMAP_FOREACH_KEY(device, syspath, enumerator->devices_by_syspath) { + _cleanup_free_ char *p = NULL; + const char *subsys; + + if (sd_device_get_subsystem(device, &subsys) < 0) + continue; + + if (!streq(subsys, *prioritized_subsystem)) + continue; + + devices[n++] = sd_device_ref(device); + /* Walk up the syspath one directory at a time and also pick every ancestor found in the hashmap. */ + for (;;) { + _cleanup_free_ char *q = NULL; + + r = path_extract_directory(p ?: syspath, &q); + if (r == -EADDRNOTAVAIL) + break; + if (r < 0) + goto failed; + + device = hashmap_get(enumerator->devices_by_syspath, q); + if (device) + devices[n++] = sd_device_ref(device); + + free_and_replace(p, q); + } + + break; + } + + /* We cannot remove multiple entries in the loop HASHMAP_FOREACH_KEY() above. */ + for (size_t i = m; i < n; i++) { + r = sd_device_get_syspath(devices[i], &syspath); + if (r < 0) + goto failed; + + assert_se(hashmap_remove(enumerator->devices_by_syspath, syspath) == devices[i]); + sd_device_unref(devices[i]); + } + /* Nothing was picked in this round, so this subsystem is exhausted. */ + if (m == n) + break; + } + + typesafe_qsort(devices + n_sorted, n - n_sorted, device_compare); + n_sorted = n; + } + /* Whatever is still in the hashmap goes after the prioritized devices. */ + HASHMAP_FOREACH(device, enumerator->devices_by_syspath) + devices[n++] = sd_device_ref(device); + + /* Move all devices back to the hashmap. Otherwise, devices added by + * udev_enumerate_add_syspath() -> device_enumerator_add_device() may not be listed. 
*/ + for (size_t i = 0; i < n_sorted; i++) { + const char *syspath; + + r = sd_device_get_syspath(devices[i], &syspath); + if (r < 0) + goto failed; + + r = hashmap_put(enumerator->devices_by_syspath, syspath, devices[i]); + if (r < 0) + goto failed; + assert(r > 0); + + sd_device_ref(devices[i]); + } + /* Sort the non-prioritized remainder. */ + typesafe_qsort(devices + n_sorted, n - n_sorted, device_compare); + /* Drop the references held by the previous array before swapping in the new one. */ + device_unref_many(enumerator->devices, enumerator->n_devices); + + enumerator->n_devices = n; + free_and_replace(enumerator->devices, devices); + + enumerator->sorted = true; + return 0; + +failed: + device_unref_many(devices, n); + free(devices); + return r; +} + /* Stores device in the hashmap under its syspath and takes a reference. Returns 1 if it was added (the array then counts as unsorted), 0 if an entry for that syspath already exists, negative on error. */ +int device_enumerator_add_device(sd_device_enumerator *enumerator, sd_device *device) { + const char *syspath; + int r; + + assert_return(enumerator, -EINVAL); + assert_return(device, -EINVAL); + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + r = hashmap_ensure_put(&enumerator->devices_by_syspath, &string_hash_ops, syspath, device); + if (IN_SET(r, -EEXIST, 0)) + return 0; + if (r < 0) + return r; + + sd_device_ref(device); + + enumerator->sorted = false; + return 1; +} + /* Property filter. An empty map always matches. With match_all = false one matching map entry is enough; with match_all = true every map entry must be matched by some property of the device. */ +static bool match_property(Hashmap *properties, sd_device *device, bool match_all) { + const char *property_pattern; + char * const *value_patterns; + + assert(device); + + /* Unlike device_match_sysattr(), this accepts device that has at least one matching property. 
*/ + + if (hashmap_isempty(properties)) + return true; + + HASHMAP_FOREACH_KEY(value_patterns, property_pattern, properties) { + bool match = false; + + FOREACH_DEVICE_PROPERTY(device, property, value) { + if (fnmatch(property_pattern, property, 0) != 0) + continue; + + match = strv_fnmatch(value_patterns, value); + if (match) { + if (!match_all) + return true; + + break; + } + } + + if (!match && match_all) + return false; + } + + return match_all; +} + /* True only if the device carries every tag in match_tag; trivially true when the loop finds no tag to check. */ +static bool match_tag(sd_device_enumerator *enumerator, sd_device *device) { + const char *tag; + + assert(enumerator); + assert(device); + + SET_FOREACH(tag, enumerator->match_tag) + if (!sd_device_has_tag(device, tag)) + return false; + + return true; +} + /* Checks sysname against the sysname match and nomatch sets via set_fnmatch(). */ +static bool match_sysname(sd_device_enumerator *enumerator, const char *sysname) { + assert(enumerator); + assert(sysname); + + return set_fnmatch(enumerator->match_sysname, enumerator->nomatch_sysname, sysname); +} + /* Applies the MatchInitializedType filter. Returns > 0 if the device passes, 0 if it does not (or has vanished), negative on other errors. */ +static int match_initialized(sd_device_enumerator *enumerator, sd_device *device) { + int r; + + assert(enumerator); + assert(device); + + if (enumerator->match_initialized == MATCH_INITIALIZED_ALL) + return true; + + r = sd_device_get_is_initialized(device); + if (r == -ENOENT) /* this is necessarily racy, so ignore missing devices */ + return false; + if (r < 0) + return r; + + if (enumerator->match_initialized == MATCH_INITIALIZED_COMPAT) { + /* only devices that have no devnode/ifindex or have a db entry are accepted. 
*/ + if (r > 0) + return true; + + if (sd_device_get_devnum(device, NULL) >= 0) + return false; + + if (sd_device_get_ifindex(device, NULL) >= 0) + return false; + + return true; + } + /* Remaining modes: NO passes only when r == 0, YES only when r > 0. */ + return (enumerator->match_initialized == MATCH_INITIALIZED_NO) == (r == 0); +} + /* A NULL subsystem never matches; otherwise the decision is left to set_fnmatch() on the match/nomatch sets. */ +static bool match_subsystem(sd_device_enumerator *enumerator, const char *subsystem) { + assert(enumerator); + + if (!subsystem) + return false; + + return set_fnmatch(enumerator->match_subsystem, enumerator->nomatch_subsystem, subsystem); +} + /* Selects the optional checks test_matches() performs; callers mask out the ones they have already done. */ +typedef enum MatchFlag { + MATCH_SYSNAME = 1u << 0, + MATCH_SUBSYSTEM = 1u << 1, + MATCH_PARENT = 1u << 2, + MATCH_TAG = 1u << 3, + + MATCH_ALL = (1u << 4) - 1, +} MatchFlag; + +static int test_matches( + sd_device_enumerator *enumerator, + sd_device *device, + MatchFlag flags) { + /* Returns > 0 if device passes the checks selected by flags plus the always-applied initialized, property and sysattr filters; 0 if it is filtered out; negative on error. */ + int r; + + assert(enumerator); + assert(device); + + if (FLAGS_SET(flags, MATCH_SYSNAME)) { + const char *sysname; + + r = sd_device_get_sysname(device, &sysname); + if (r < 0) + return r; + + if (!match_sysname(enumerator, sysname)) + return false; + } + + if (FLAGS_SET(flags, MATCH_SUBSYSTEM)) { + const char *subsystem; + + r = sd_device_get_subsystem(device, &subsystem); + if (r == -ENOENT) + return false; + if (r < 0) + return r; + + if (!match_subsystem(enumerator, subsystem)) + return false; + } + + if (FLAGS_SET(flags, MATCH_PARENT) && + !device_match_parent(device, enumerator->match_parent, NULL)) + return false; + + if (FLAGS_SET(flags, MATCH_TAG) && + !match_tag(enumerator, device)) + return false; + + r = match_initialized(enumerator, device); + if (r <= 0) + return r; + + if (!match_property(enumerator->match_property, device, /* match_all = */ false)) + return false; + + if (!match_property(enumerator->match_property_required, device, /* match_all = */ true)) + return false; + + if (!device_match_sysattr(device, enumerator->match_sysattr, enumerator->nomatch_sysattr)) + return false; + + return true; +} + +static int enumerator_add_parent_devices( + 
sd_device_enumerator *enumerator, + sd_device *device, + MatchFlag flags) { + /* Walks up from device and adds every ancestor that passes test_matches(). Stops at the top of the hierarchy or at the first ancestor that is already enumerated. */ + int r; + + assert(enumerator); + assert(device); + + for (;;) { + r = sd_device_get_parent(device, &device); + if (r == -ENOENT) /* Reached the top? */ + return 0; + if (r < 0) + return r; + + r = test_matches(enumerator, device, flags); + if (r < 0) + return r; + if (r == 0) + continue; + + r = device_enumerator_add_device(enumerator, device); + if (r < 0) + return r; + if (r == 0) /* Exists already? Then no need to go further up. */ + return 0; + } +} + /* Non-static variant: applies every check except the parent filter. */ +int device_enumerator_add_parent_devices(sd_device_enumerator *enumerator, sd_device *device) { + return enumerator_add_parent_devices(enumerator, device, MATCH_ALL & (~MATCH_PARENT)); +} + +static bool relevant_sysfs_subdir(const struct dirent *de) { + assert(de); + + if (de->d_name[0] == '.') + return false; + + /* Also filter out regular files and such, i.e. stuff that definitely isn't a kobject path. (Note + * that we rely on the fact that sysfs fills in d_type here, i.e. doesn't do DT_UNKNOWN) */ + return IN_SET(de->d_type, DT_DIR, DT_LNK); +} + +static int enumerator_scan_dir_and_add_devices( + sd_device_enumerator *enumerator, + const char *basedir, + const char *subdir1, + const char *subdir2) { + /* Scans /sys/<basedir>/[<subdir1>/][<subdir2>/] and adds every matching entry, plus its matching ancestors, as a device. Per-entry errors do not stop the scan; the last one is returned. */ + _cleanup_closedir_ DIR *dir = NULL; + char *path; + int k, r = 0; + + assert(enumerator); + assert(basedir); + + path = strjoina("/sys/", basedir, "/"); + + if (subdir1) + path = strjoina(path, subdir1, "/"); + + if (subdir2) + path = strjoina(path, subdir2, "/"); + + dir = opendir(path); + if (!dir) { + bool ignore = errno == ENOENT; + + /* this is necessarily racy, so ignore missing directories */ + log_debug_errno(errno, + "sd-device-enumerator: Failed to open directory %s%s: %m", + path, ignore ? ", ignoring" : ""); + return ignore ? 
0 : -errno; + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + char syspath[strlen(path) + 1 + strlen(de->d_name) + 1]; + + if (!relevant_sysfs_subdir(de)) + continue; + + if (!match_sysname(enumerator, de->d_name)) + continue; + + (void) sprintf(syspath, "%s%s", path, de->d_name); + + k = sd_device_new_from_syspath(&device, syspath); + if (k < 0) { + if (k != -ENODEV) + /* this is necessarily racey, so ignore missing devices */ + r = k; + + continue; + } + + k = test_matches(enumerator, device, MATCH_ALL & (~MATCH_SYSNAME)); /* sysname is already tested. */ + if (k <= 0) { + if (k < 0) + r = k; + continue; + } + + k = device_enumerator_add_device(enumerator, device); + if (k < 0) + r = k; + + /* Also include all potentially matching parent devices in the enumeration. These are things + * like root busses — e.g. /sys/devices/pci0000:00/ or /sys/devices/pnp0/, which are not + * linked from /sys/class/ or /sys/bus/, hence pick them up explicitly here. */ + k = enumerator_add_parent_devices(enumerator, device, MATCH_ALL); + if (k < 0) + r = k; + } + + return r; +} + +static int enumerator_scan_dir( + sd_device_enumerator *enumerator, + const char *basedir, + const char *subdir, + const char *subsystem) { + /* For every entry of /sys/<basedir> whose name (or the explicit subsystem, if one is given) passes match_subsystem(), scans /sys/<basedir>/<entry>/[<subdir>/] for devices. */ + _cleanup_closedir_ DIR *dir = NULL; + char *path; + int r = 0; + + path = strjoina("/sys/", basedir); + + dir = opendir(path); + if (!dir) { + bool ignore = errno == ENOENT; + + log_debug_errno(errno, + "sd-device-enumerator: Failed to open directory %s%s: %m", + path, ignore ? ", ignoring" : ""); + return ignore ? 
0 : -errno; + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + int k; + + if (!relevant_sysfs_subdir(de)) + continue; + + if (!match_subsystem(enumerator, subsystem ?: de->d_name)) + continue; + + k = enumerator_scan_dir_and_add_devices(enumerator, basedir, de->d_name, subdir); + if (k < 0) + r = k; + } + + return r; +} + /* Adds the devices listed under /run/udev/tags/<tag>; each directory entry name is used as a device ID. A missing directory is not an error. */ +static int enumerator_scan_devices_tag(sd_device_enumerator *enumerator, const char *tag) { + _cleanup_closedir_ DIR *dir = NULL; + char *path; + int r = 0; + + assert(enumerator); + assert(tag); + + path = strjoina("/run/udev/tags/", tag); + + dir = opendir(path); + if (!dir) { + bool ignore = errno == ENOENT; + + log_debug_errno(errno, + "sd-device-enumerator: Failed to open directory %s%s: %m", + path, ignore ? ", ignoring" : ""); + return ignore ? 0 : -errno; + } + + /* TODO: filter away subsystems? */ + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int k; + + if (de->d_name[0] == '.') + continue; + + k = sd_device_new_from_device_id(&device, de->d_name); + if (k < 0) { + if (k != -ENODEV) + /* this is necessarily racy, so ignore missing devices */ + r = k; + + continue; + } + + /* Generated from tag, hence not necessary to check tag again. 
*/ + k = test_matches(enumerator, device, MATCH_ALL & (~MATCH_TAG)); + if (k < 0) + r = k; + if (k <= 0) + continue; + + k = device_enumerator_add_device(enumerator, device); + if (k < 0) { + r = k; + continue; + } + } + + return r; +} + /* Runs the tag scan for every tag in match_tag and returns the last error, if any. */ +static int enumerator_scan_devices_tags(sd_device_enumerator *enumerator) { + const char *tag; + int r = 0; + + assert(enumerator); + + SET_FOREACH(tag, enumerator->match_tag) { + int k; + + k = enumerator_scan_devices_tag(enumerator, tag); + if (k < 0) + r = k; + } + + return r; +} + /* Creates a device object for path and adds it if it passes test_matches() with the given flags. A device that has vanished (-ENODEV) is not treated as an error. */ +static int parent_add_child(sd_device_enumerator *enumerator, const char *path, MatchFlag flags) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int r; + + r = sd_device_new_from_syspath(&device, path); + if (r == -ENODEV) + /* this is necessarily racy, so ignore missing devices */ + return 0; + else if (r < 0) + return r; + + r = test_matches(enumerator, device, flags); + if (r <= 0) + return r; + + return device_enumerator_add_device(enumerator, device); +} + /* Adds the matching subdirectories of path as devices and queues every subdirectory on *stack so the caller can crawl it later. */ +static int parent_crawl_children(sd_device_enumerator *enumerator, const char *path, Set **stack) { + _cleanup_closedir_ DIR *dir = NULL; + int r = 0; + + assert(enumerator); + assert(path); + assert(stack); + + dir = opendir(path); + if (!dir) { + bool ignore = errno == ENOENT; + + log_debug_errno(errno, + "sd-device-enumerator: Failed to open directory %s%s: %m", + path, ignore ? ", ignoring" : ""); + return ignore ? 0 : -errno; + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + _cleanup_free_ char *child = NULL; + int k; + + if (de->d_name[0] == '.') + continue; + + if (de->d_type != DT_DIR) + continue; + + child = path_join(path, de->d_name); + if (!child) + return -ENOMEM; + + /* Let's check sysname filter earlier. The other tests require the sd-device object created + * from the path, and are thus much more costly. 
*/ + if (match_sysname(enumerator, de->d_name)) { + k = parent_add_child(enumerator, child, MATCH_ALL & (~(MATCH_SYSNAME|MATCH_PARENT))); + if (k < 0) + r = k; + } + + k = set_ensure_consume(stack, &path_hash_ops_free, TAKE_PTR(child)); + if (k < 0) + r = k; + } + + return r; +} + /* Adds every path in match_parent and everything below it, using a work set of pending directories instead of recursion. */ +static int enumerator_scan_devices_children(sd_device_enumerator *enumerator) { + _cleanup_set_free_ Set *stack = NULL; + const char *path; + int r = 0, k; + + assert(enumerator); + + SET_FOREACH(path, enumerator->match_parent) { + k = parent_add_child(enumerator, path, MATCH_ALL & (~MATCH_PARENT)); + if (k < 0) + r = k; + + k = parent_crawl_children(enumerator, path, &stack); + if (k < 0) + r = k; + } + /* Drain the work set; each crawl may queue further subdirectories. */ + for (;;) { + _cleanup_free_ char *p = NULL; + + p = set_steal_first(stack); + if (!p) + return r; + + k = parent_crawl_children(enumerator, p, &stack); + if (k < 0) + r = k; + } +} + /* Full scan: the "devices" directory of every entry under /sys/bus, then every entry under /sys/class. */ +static int enumerator_scan_devices_all(sd_device_enumerator *enumerator) { + int k, r = 0; + + k = enumerator_scan_dir(enumerator, "bus", "devices", NULL); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan /sys/bus: %m"); + + k = enumerator_scan_dir(enumerator, "class", NULL, NULL); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan /sys/class: %m"); + + return r; +} + /* (Re)scans devices unless an up-to-date device scan already exists. Tag matches take precedence over parent matches, which take precedence over a full scan. */ +int device_enumerator_scan_devices(sd_device_enumerator *enumerator) { + int r = 0, k; + + assert(enumerator); + + if (enumerator->scan_uptodate && + enumerator->type == DEVICE_ENUMERATION_TYPE_DEVICES) + return 0; + + device_enumerator_unref_devices(enumerator); + + if (!set_isempty(enumerator->match_tag)) { + k = enumerator_scan_devices_tags(enumerator); + if (k < 0) + r = k; + } else if (enumerator->match_parent) { + k = enumerator_scan_devices_children(enumerator); + if (k < 0) + r = k; + } else { + k = enumerator_scan_devices_all(enumerator); + if (k < 0) + r = k; + } + + enumerator->scan_uptodate = true; + enumerator->type = DEVICE_ENUMERATION_TYPE_DEVICES; + + return r; +} + 
+_public_ sd_device *sd_device_enumerator_get_device_first(sd_device_enumerator *enumerator) { /* Scans and sorts as needed, resets the cursor and returns the first device, or NULL if there is none or a step failed. The pointer comes straight from the enumerator's array; no extra reference is taken. */ + assert_return(enumerator, NULL); + + if (device_enumerator_scan_devices(enumerator) < 0) + return NULL; + + if (enumerator_sort_devices(enumerator) < 0) + return NULL; + + enumerator->current_device_index = 0; + + if (enumerator->n_devices == 0) + return NULL; + + return enumerator->devices[0]; +} + /* Advances the cursor and returns the next device; NULL at the end, or when the scan is stale, the array unsorted, or the last scan was not a device scan. */ +_public_ sd_device *sd_device_enumerator_get_device_next(sd_device_enumerator *enumerator) { + assert_return(enumerator, NULL); + + if (!enumerator->scan_uptodate || + !enumerator->sorted || + enumerator->type != DEVICE_ENUMERATION_TYPE_DEVICES || + enumerator->current_device_index + 1 >= enumerator->n_devices) + return NULL; + + return enumerator->devices[++enumerator->current_device_index]; +} + /* (Re)scans /sys/module, /sys/bus and the "drivers" directory of each bus, each only if the corresponding name ("module", "subsystem", "drivers") passes match_subsystem(). */ +int device_enumerator_scan_subsystems(sd_device_enumerator *enumerator) { + int r = 0, k; + + assert(enumerator); + + if (enumerator->scan_uptodate && + enumerator->type == DEVICE_ENUMERATION_TYPE_SUBSYSTEMS) + return 0; + + device_enumerator_unref_devices(enumerator); + + /* modules */ + if (match_subsystem(enumerator, "module")) { + k = enumerator_scan_dir_and_add_devices(enumerator, "module", NULL, NULL); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan modules: %m"); + } + + /* subsystems (only buses support coldplug) */ + if (match_subsystem(enumerator, "subsystem")) { + k = enumerator_scan_dir_and_add_devices(enumerator, "bus", NULL, NULL); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan subsystems: %m"); + } + + /* subsystem drivers */ + if (match_subsystem(enumerator, "drivers")) { + k = enumerator_scan_dir(enumerator, "bus", "drivers", "drivers"); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan drivers: %m"); + } + + enumerator->scan_uptodate = true; + enumerator->type = DEVICE_ENUMERATION_TYPE_SUBSYSTEMS; + + return r; +} + +_public_ sd_device 
*sd_device_enumerator_get_subsystem_first(sd_device_enumerator *enumerator) { /* Subsystem counterpart of sd_device_enumerator_get_device_first(): scans subsystems, sorts, resets the cursor and returns the first entry or NULL. */ + assert_return(enumerator, NULL); + + if (device_enumerator_scan_subsystems(enumerator) < 0) + return NULL; + + if (enumerator_sort_devices(enumerator) < 0) + return NULL; + + enumerator->current_device_index = 0; + + if (enumerator->n_devices == 0) + return NULL; + + return enumerator->devices[0]; +} + /* Advances the cursor; NULL at the end, or when the scan is stale, the array unsorted, or the last scan was not a subsystem scan. */ +_public_ sd_device *sd_device_enumerator_get_subsystem_next(sd_device_enumerator *enumerator) { + assert_return(enumerator, NULL); + + if (!enumerator->scan_uptodate || + !enumerator->sorted || + enumerator->type != DEVICE_ENUMERATION_TYPE_SUBSYSTEMS || + enumerator->current_device_index + 1 >= enumerator->n_devices) + return NULL; + + return enumerator->devices[++enumerator->current_device_index]; +} + /* Combined scan. With tag or parent matches configured only that scan runs; otherwise all devices are scanned, followed by modules, buses and drivers (each subject to match_subsystem()). */ +int device_enumerator_scan_devices_and_subsystems(sd_device_enumerator *enumerator) { + int r; + + assert(enumerator); + + if (enumerator->scan_uptodate && + enumerator->type == DEVICE_ENUMERATION_TYPE_ALL) + return 0; + + device_enumerator_unref_devices(enumerator); + + if (!set_isempty(enumerator->match_tag)) + r = enumerator_scan_devices_tags(enumerator); + else if (enumerator->match_parent) + r = enumerator_scan_devices_children(enumerator); + else { + int k; + + r = enumerator_scan_devices_all(enumerator); + + if (match_subsystem(enumerator, "module")) { + k = enumerator_scan_dir_and_add_devices(enumerator, "module", NULL, NULL); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan modules: %m"); + } + if (match_subsystem(enumerator, "subsystem")) { + k = enumerator_scan_dir_and_add_devices(enumerator, "bus", NULL, NULL); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan subsystems: %m"); + } + + if (match_subsystem(enumerator, "drivers")) { + k = enumerator_scan_dir(enumerator, "bus", "drivers", "drivers"); + if (k < 0) + r = log_debug_errno(k, "sd-device-enumerator: Failed to scan drivers: %m"); + } + } + + enumerator->scan_uptodate 
= true; + enumerator->type = DEVICE_ENUMERATION_TYPE_ALL; + + return r; +} + /* Like the public *_first() getters, but never triggers a scan: returns NULL unless a scan has already been done. */ +sd_device *device_enumerator_get_first(sd_device_enumerator *enumerator) { + assert_return(enumerator, NULL); + + if (!enumerator->scan_uptodate) + return NULL; + + if (enumerator_sort_devices(enumerator) < 0) + return NULL; + + enumerator->current_device_index = 0; + + if (enumerator->n_devices == 0) + return NULL; + + return enumerator->devices[0]; +} + /* Advances the cursor regardless of the scan type; NULL at the end or when the scan is stale or the array unsorted. */ +sd_device *device_enumerator_get_next(sd_device_enumerator *enumerator) { + assert_return(enumerator, NULL); + + if (!enumerator->scan_uptodate || + !enumerator->sorted || + enumerator->current_device_index + 1 >= enumerator->n_devices) + return NULL; + + return enumerator->devices[++enumerator->current_device_index]; +} + /* Returns the enumerator's own sorted array and stores its length in *ret_n_devices. Returns NULL, leaving *ret_n_devices untouched, if no scan was done or sorting failed. */ +sd_device **device_enumerator_get_devices(sd_device_enumerator *enumerator, size_t *ret_n_devices) { + assert(enumerator); + assert(ret_n_devices); + + if (!enumerator->scan_uptodate) + return NULL; + + if (enumerator_sort_devices(enumerator) < 0) + return NULL; + + *ret_n_devices = enumerator->n_devices; + return enumerator->devices; +} diff --git a/src/libsystemd/sd-device/device-filter.c b/src/libsystemd/sd-device/device-filter.c new file mode 100644 index 0000000..4101e7d --- /dev/null +++ b/src/libsystemd/sd-device/device-filter.c @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "device-filter.h" +#include "path-util.h" + /* Adds value to the pattern list stored under key in *match_strv, creating the entry if needed. With an existing entry and a NULL value the list is emptied if clear_on_null is set. Returns 1 if the map was changed, 0 if nothing had to be done, negative on error. */ +int update_match_strv(Hashmap **match_strv, const char *key, const char *value, bool clear_on_null) { + char **strv; + int r; + + assert(match_strv); + assert(key); + + strv = hashmap_get(*match_strv, key); + if (strv) { + if (!value) { + char **v; + + if (strv_isempty(strv) || !clear_on_null) + return 0; + + /* Accept all values. Clear previous assignment. 
*/ + + v = new0(char*, 1); + if (!v) + return -ENOMEM; + + strv_free_and_replace(strv, v); + } else { + if (strv_contains(strv, value)) + return 0; + + r = strv_extend(&strv, value); + if (r < 0) + return r; + } + /* strv was replaced above, or handed to strv_extend() by address and so possibly moved; store the current pointer back under key. */ + r = hashmap_update(*match_strv, key, strv); + if (r < 0) + return r; + + } else { + _cleanup_strv_free_ char **strv_alloc = NULL; + _cleanup_free_ char *key_alloc = NULL; + + key_alloc = strdup(key); + if (!key_alloc) + return -ENOMEM; + + strv_alloc = strv_new(value); + if (!strv_alloc) + return -ENOMEM; + + r = hashmap_ensure_put(match_strv, &string_hash_ops_free_strv_free, key_alloc, strv_alloc); + if (r < 0) + return r; + + TAKE_PTR(key_alloc); + TAKE_PTR(strv_alloc); + } + + return 1; +} + /* False if the attribute cannot be read; otherwise the result of strv_fnmatch_or_empty() on the attribute value. */ +static bool device_match_sysattr_value(sd_device *device, const char *sysattr, char * const *patterns) { + const char *value; + + assert(device); + assert(sysattr); + + if (sd_device_get_sysattr_value(device, sysattr, &value) < 0) + return false; + + return strv_fnmatch_or_empty(patterns, value, 0); +} + /* True if every match_sysattr entry matches the device and no nomatch_sysattr entry does. */ +bool device_match_sysattr(sd_device *device, Hashmap *match_sysattr, Hashmap *nomatch_sysattr) { + char * const *patterns; + const char *sysattr; + + assert(device); + + HASHMAP_FOREACH_KEY(patterns, sysattr, match_sysattr) + if (!device_match_sysattr_value(device, sysattr, patterns)) + return false; + + HASHMAP_FOREACH_KEY(patterns, sysattr, nomatch_sysattr) + if (device_match_sysattr_value(device, sysattr, patterns)) + return false; + + return true; +} + /* False if the device's syspath lies under any nomatch_parent path. Otherwise true if match_parent is empty or the syspath lies under one of its paths. */ +bool device_match_parent(sd_device *device, Set *match_parent, Set *nomatch_parent) { + const char *syspath_parent, *syspath; + + assert(device); + + if (sd_device_get_syspath(device, &syspath) < 0) + return false; + + SET_FOREACH(syspath_parent, nomatch_parent) + if (path_startswith(syspath, syspath_parent)) + return false; + + if (set_isempty(match_parent)) + return true; + + SET_FOREACH(syspath_parent, match_parent) + if (path_startswith(syspath, syspath_parent)) + return true; + + return false; +} diff 
--git a/src/libsystemd/sd-device/device-filter.h b/src/libsystemd/sd-device/device-filter.h new file mode 100644 index 0000000..0c5f34e --- /dev/null +++ b/src/libsystemd/sd-device/device-filter.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-device.h" + +#include "hashmap.h" +#include "set.h" + +int update_match_strv(Hashmap **match_strv, const char *key, const char *value, bool clear_on_null); +bool device_match_sysattr(sd_device *device, Hashmap *match_sysattr, Hashmap *nomatch_sysattr); +bool device_match_parent(sd_device *device, Set *match_parent, Set *nomatch_parent); diff --git a/src/libsystemd/sd-device/device-internal.h b/src/libsystemd/sd-device/device-internal.h new file mode 100644 index 0000000..a465eb2 --- /dev/null +++ b/src/libsystemd/sd-device/device-internal.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-device.h" + +#include "device-private.h" +#include "hashmap.h" +#include "set.h" +#include "time-util.h" + +#define LATEST_UDEV_DATABASE_VERSION 1 + +struct sd_device { + unsigned n_ref; + + /* The database version indicates the supported features by the udev database. + * This is saved and parsed in V field. + * + * 0: None of the following features are supported (systemd version <= 246). + * 1: The current tags (Q) and the database version (V) features are implemented (>= 247). 
+ */ + unsigned database_version; + + sd_device *parent; + + OrderedHashmap *properties; + Iterator properties_iterator; + uint64_t properties_generation; /* changes whenever the properties are changed */ + uint64_t properties_iterator_generation; /* generation when iteration was started */ + + /* the subset of the properties that should be written to the db */ + OrderedHashmap *properties_db; + + Hashmap *sysattr_values; /* cached sysattr values */ + + Set *sysattrs; /* names of sysattrs */ + Iterator sysattrs_iterator; + + Set *all_tags, *current_tags; + Iterator all_tags_iterator, current_tags_iterator; + uint64_t all_tags_iterator_generation, current_tags_iterator_generation; /* generation when iteration was started */ + uint64_t tags_generation; /* changes whenever the tags are changed */ + + Set *devlinks; + Iterator devlinks_iterator; + uint64_t devlinks_generation; /* changes whenever the devlinks are changed */ + uint64_t devlinks_iterator_generation; /* generation when iteration was started */ + int devlink_priority; + + Hashmap *children; + Iterator children_iterator; + bool children_enumerated; + + int ifindex; + char *devtype; + char *devname; + dev_t devnum; + + char **properties_strv; /* the properties hashmap as a strv */ + char *properties_nulstr; /* the same as a nulstr */ + size_t properties_nulstr_len; + + char *syspath; + const char *devpath; + const char *sysnum; + char *sysname; + + char *subsystem; + char *driver_subsystem; /* only set for the 'drivers' subsystem */ + char *driver; + + char *device_id; + + usec_t usec_initialized; + + mode_t devmode; + uid_t devuid; + gid_t devgid; + + uint64_t diskseq; /* Block device sequence number, monothonically incremented by the kernel on create/attach */ + + /* only set when device is passed through netlink */ + sd_device_action_t action; + uint64_t seqnum; + + bool parent_set:1; /* no need to try to reload parent */ + bool sysattrs_read:1; /* don't try to re-read sysattrs once read */ + bool 
property_tags_outdated:1; /* need to update TAGS= or CURRENT_TAGS= property */ + bool property_devlinks_outdated:1; /* need to update DEVLINKS= property */ + bool properties_buf_outdated:1; /* need to reread hashmap */ + bool subsystem_set:1; /* don't reread subsystem */ + bool driver_set:1; /* don't reread driver */ + bool uevent_loaded:1; /* don't reread uevent */ + bool db_loaded; /* don't reread db */ + + bool is_initialized:1; + bool sealed:1; /* don't read more information from uevent/db */ + bool db_persist:1; /* don't clean up the db when switching from initrd to real root */ +}; + +int device_new_aux(sd_device **ret); +int device_add_property_aux(sd_device *device, const char *key, const char *value, bool db); +static inline int device_add_property_internal(sd_device *device, const char *key, const char *value) { + return device_add_property_aux(device, key, value, false); +} + +int device_set_syspath(sd_device *device, const char *_syspath, bool verify); +int device_set_ifindex(sd_device *device, const char *ifindex); +int device_set_devmode(sd_device *device, const char *devmode); +int device_set_devname(sd_device *device, const char *devname); +int device_set_devtype(sd_device *device, const char *devtype); +int device_set_devnum(sd_device *device, const char *major, const char *minor); +int device_set_subsystem(sd_device *device, const char *subsystem); +int device_set_diskseq(sd_device *device, const char *str); +int device_set_drivers_subsystem(sd_device *device); +int device_set_driver(sd_device *device, const char *driver); +int device_set_usec_initialized(sd_device *device, usec_t when); diff --git a/src/libsystemd/sd-device/device-monitor-private.h b/src/libsystemd/sd-device/device-monitor-private.h new file mode 100644 index 0000000..33e2714 --- /dev/null +++ b/src/libsystemd/sd-device/device-monitor-private.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-device.h" + +typedef enum 
MonitorNetlinkGroup { + MONITOR_GROUP_NONE, + MONITOR_GROUP_KERNEL, + MONITOR_GROUP_UDEV, + _MONITOR_NETLINK_GROUP_MAX, + _MONITOR_NETLINK_GROUP_INVALID = -EINVAL, +} MonitorNetlinkGroup; + +int device_monitor_new_full(sd_device_monitor **ret, MonitorNetlinkGroup group, int fd); +int device_monitor_disconnect(sd_device_monitor *m); +int device_monitor_allow_unicast_sender(sd_device_monitor *m, sd_device_monitor *sender); +int device_monitor_enable_receiving(sd_device_monitor *m); +int device_monitor_get_fd(sd_device_monitor *m); +int device_monitor_send_device(sd_device_monitor *m, sd_device_monitor *destination, sd_device *device); +int device_monitor_receive_device(sd_device_monitor *m, sd_device **ret); diff --git a/src/libsystemd/sd-device/device-monitor.c b/src/libsystemd/sd-device/device-monitor.c new file mode 100644 index 0000000..bb4f9bd --- /dev/null +++ b/src/libsystemd/sd-device/device-monitor.c @@ -0,0 +1,929 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "sd-device.h" +#include "sd-event.h" + +#include "MurmurHash2.h" +#include "alloc-util.h" +#include "device-filter.h" +#include "device-monitor-private.h" +#include "device-private.h" +#include "device-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "hashmap.h" +#include "iovec-util.h" +#include "missing_socket.h" +#include "mountpoint-util.h" +#include "set.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "uid-range.h" + +#define log_monitor(m, format, ...) \ + log_debug("sd-device-monitor(%s): " format, strna(m ? m->description : NULL), ##__VA_ARGS__) +#define log_monitor_errno(m, r, format, ...) \ + log_debug_errno(r, "sd-device-monitor(%s): " format, strna(m ? m->description : NULL), ##__VA_ARGS__) +#define log_device_monitor(d, m, format, ...) \ + log_device_debug(d, "sd-device-monitor(%s): " format, strna(m ? 
m->description : NULL), ##__VA_ARGS__) +#define log_device_monitor_errno(d, m, r, format, ...) \ + log_device_debug_errno(d, r, "sd-device-monitor(%s): " format, strna(m ? m->description : NULL), ##__VA_ARGS__) + +struct sd_device_monitor { + unsigned n_ref; + + int sock; + union sockaddr_union snl; + union sockaddr_union snl_trusted_sender; + bool bound; + + UidRange *mapped_userns_uid_range; + + Hashmap *subsystem_filter; + Set *tag_filter; + Hashmap *match_sysattr_filter; + Hashmap *nomatch_sysattr_filter; + Set *match_parent_filter; + Set *nomatch_parent_filter; + bool filter_uptodate; + + sd_event *event; + sd_event_source *event_source; + char *description; + sd_device_monitor_handler_t callback; + void *userdata; +}; + +#define UDEV_MONITOR_MAGIC 0xfeedcafe + +typedef struct monitor_netlink_header { + /* "libudev" prefix to distinguish libudev and kernel messages */ + char prefix[8]; + /* Magic to protect against daemon <-> Library message format mismatch + * Used in the kernel from socket filter rules; needs to be stored in network order */ + unsigned magic; + /* Total length of header structure known to the sender */ + unsigned header_size; + /* Properties string buffer */ + unsigned properties_off; + unsigned properties_len; + /* Hashes of primary device properties strings, to let libudev subscribers + * use in-kernel socket filters; values need to be stored in network order */ + unsigned filter_subsystem_hash; + unsigned filter_devtype_hash; + unsigned filter_tag_bloom_hi; + unsigned filter_tag_bloom_lo; +} monitor_netlink_header; + +static int monitor_set_nl_address(sd_device_monitor *m) { + union sockaddr_union snl; + socklen_t addrlen; + + assert(m); + + /* Get the address the kernel has assigned us. + * It is usually, but not necessarily the pid. 
*/ + addrlen = sizeof(struct sockaddr_nl); + if (getsockname(m->sock, &snl.sa, &addrlen) < 0) + return -errno; + + m->snl.nl.nl_pid = snl.nl.nl_pid; + return 0; +} + +int device_monitor_allow_unicast_sender(sd_device_monitor *m, sd_device_monitor *sender) { + assert(m); + assert(sender); + + m->snl_trusted_sender.nl.nl_pid = sender->snl.nl.nl_pid; + return 0; +} + +_public_ int sd_device_monitor_set_receive_buffer_size(sd_device_monitor *m, size_t size) { + assert_return(m, -EINVAL); + + return fd_set_rcvbuf(m->sock, size, false); +} + +int device_monitor_disconnect(sd_device_monitor *m) { + assert(m); + + m->sock = safe_close(m->sock); + return 0; +} + +int device_monitor_get_fd(sd_device_monitor *m) { + assert(m); + + return m->sock; +} + +int device_monitor_new_full(sd_device_monitor **ret, MonitorNetlinkGroup group, int fd) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *m = NULL; + _cleanup_close_ int sock = -EBADF; + int r; + + assert(group >= 0 && group < _MONITOR_NETLINK_GROUP_MAX); + assert_return(ret, -EINVAL); + + if (group == MONITOR_GROUP_UDEV && + access("/run/udev/control", F_OK) < 0 && + dev_is_devtmpfs() <= 0) { + + /* + * We do not support subscribing to uevents if no instance of + * udev is running. Uevents would otherwise broadcast the + * processing data of the host into containers, which is not + * desired. + * + * Containers will currently not get any udev uevents, until + * a supporting infrastructure is available. + * + * We do not set a netlink multicast group here, so the socket + * will not receive any messages. 
+ */ + + log_monitor(m, "The udev service seems not to be active, disabling the monitor."); + group = MONITOR_GROUP_NONE; + } + + if (fd < 0) { + sock = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_KOBJECT_UEVENT); + if (sock < 0) + return log_monitor_errno(m, errno, "Failed to create socket: %m"); + } + + m = new(sd_device_monitor, 1); + if (!m) + return -ENOMEM; + + *m = (sd_device_monitor) { + .n_ref = 1, + .sock = fd >= 0 ? fd : TAKE_FD(sock), + .bound = fd >= 0, + .snl.nl.nl_family = AF_NETLINK, + .snl.nl.nl_groups = group, + }; + + if (fd >= 0) { + r = monitor_set_nl_address(m); + if (r < 0) { + log_monitor_errno(m, r, "Failed to set netlink address: %m"); + goto fail; + } + } + + if (DEBUG_LOGGING) { + _cleanup_close_ int netns = -EBADF; + + /* So here's the thing: only AF_NETLINK sockets from the main network namespace will get + * hardware events. Let's check if ours is from there, and if not generate a debug message, + * since we cannot possibly work correctly otherwise. This is just a safety check to make + * things easier to debug. */ + + netns = ioctl(m->sock, SIOCGSKNS); + if (netns < 0) + log_monitor_errno(m, errno, "Unable to get network namespace of udev netlink socket, unable to determine if we are in host netns, ignoring: %m"); + else { + struct stat a, b; + + if (fstat(netns, &a) < 0) { + r = log_monitor_errno(m, errno, "Failed to stat netns of udev netlink socket: %m"); + goto fail; + } + + if (stat("/proc/1/ns/net", &b) < 0) { + if (ERRNO_IS_PRIVILEGE(errno)) + /* If we can't access PID1's netns info due to permissions, it's fine, this is a + * safety check only after all. 
*/ + log_monitor_errno(m, errno, "No permission to stat PID1's netns, unable to determine if we are in host netns, ignoring: %m"); + else + log_monitor_errno(m, errno, "Failed to stat PID1's netns, ignoring: %m"); + + } else if (!stat_inode_same(&a, &b)) + log_monitor(m, "Netlink socket we listen on is not from host netns, we won't see device events."); + } + } + + /* Let's bump the receive buffer size, but only if we are not called via socket activation, as in + * that case the service manager sets the receive buffer size for us, and the value in the .socket + * unit should take full effect. */ + if (fd < 0) { + r = sd_device_monitor_set_receive_buffer_size(m, 128*1024*1024); + if (r < 0) + log_monitor_errno(m, r, "Failed to increase receive buffer size, ignoring: %m"); + } + + *ret = TAKE_PTR(m); + return 0; + +fail: + /* Let's unset the socket fd in the monitor object before we destroy it so that the fd passed in is + * not closed on failure. */ + if (fd >= 0) + m->sock = -1; + + return r; +} + +_public_ int sd_device_monitor_new(sd_device_monitor **ret) { + return device_monitor_new_full(ret, MONITOR_GROUP_UDEV, -1); +} + +_public_ int sd_device_monitor_stop(sd_device_monitor *m) { + assert_return(m, -EINVAL); + + m->event_source = sd_event_source_unref(m->event_source); + (void) device_monitor_disconnect(m); + + return 0; +} + +static int device_monitor_event_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _unused_ _cleanup_(log_context_unrefp) LogContext *c = NULL; + sd_device_monitor *m = ASSERT_PTR(userdata); + + if (device_monitor_receive_device(m, &device) <= 0) + return 0; + + if (log_context_enabled()) + c = log_context_new_strv_consume(device_make_log_fields(device)); + + if (m->callback) + return m->callback(m, device, m->userdata); + + return 0; +} + +_public_ int sd_device_monitor_start(sd_device_monitor *m, sd_device_monitor_handler_t callback, void *userdata) { + int 
r; + + assert_return(m, -EINVAL); + + if (!m->event) { + r = sd_device_monitor_attach_event(m, NULL); + if (r < 0) + return r; + } + + r = device_monitor_enable_receiving(m); + if (r < 0) + return r; + + m->callback = callback; + m->userdata = userdata; + + r = sd_event_add_io(m->event, &m->event_source, m->sock, EPOLLIN, device_monitor_event_handler, m); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->event_source, m->description ?: "sd-device-monitor"); + + return 0; +} + +_public_ int sd_device_monitor_detach_event(sd_device_monitor *m) { + assert_return(m, -EINVAL); + + (void) sd_device_monitor_stop(m); + m->event = sd_event_unref(m->event); + + return 0; +} + +_public_ int sd_device_monitor_attach_event(sd_device_monitor *m, sd_event *event) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->event, -EBUSY); + + if (event) + m->event = sd_event_ref(event); + else { + r = sd_event_default(&m->event); + if (r < 0) + return r; + } + + return 0; +} + +_public_ sd_event *sd_device_monitor_get_event(sd_device_monitor *m) { + assert_return(m, NULL); + + return m->event; +} + +_public_ sd_event_source *sd_device_monitor_get_event_source(sd_device_monitor *m) { + assert_return(m, NULL); + + return m->event_source; +} + +_public_ int sd_device_monitor_set_description(sd_device_monitor *m, const char *description) { + int r; + + assert_return(m, -EINVAL); + + r = free_and_strdup(&m->description, description); + if (r <= 0) + return r; + + if (m->event_source) + (void) sd_event_source_set_description(m->event_source, description); + + return r; +} + +_public_ int sd_device_monitor_get_description(sd_device_monitor *m, const char **ret) { + assert_return(m, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = m->description; + return 0; +} + +int device_monitor_enable_receiving(sd_device_monitor *m) { + int r; + + assert(m); + + r = sd_device_monitor_filter_update(m); + if (r < 0) + return log_monitor_errno(m, r, "Failed to update filter: 
%m"); + + if (!m->bound) { + /* enable receiving of sender credentials */ + r = setsockopt_int(m->sock, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_monitor_errno(m, r, "Failed to set socket option SO_PASSCRED: %m"); + + if (bind(m->sock, &m->snl.sa, sizeof(struct sockaddr_nl)) < 0) + return log_monitor_errno(m, errno, "Failed to bind monitoring socket: %m"); + + m->bound = true; + + r = monitor_set_nl_address(m); + if (r < 0) + return log_monitor_errno(m, r, "Failed to set address: %m"); + } + + return 0; +} + +static sd_device_monitor *device_monitor_free(sd_device_monitor *m) { + assert(m); + + (void) sd_device_monitor_detach_event(m); + + uid_range_free(m->mapped_userns_uid_range); + free(m->description); + hashmap_free(m->subsystem_filter); + set_free(m->tag_filter); + hashmap_free(m->match_sysattr_filter); + hashmap_free(m->nomatch_sysattr_filter); + set_free(m->match_parent_filter); + set_free(m->nomatch_parent_filter); + + return mfree(m); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_device_monitor, sd_device_monitor, device_monitor_free); + +static int check_subsystem_filter(sd_device_monitor *m, sd_device *device) { + const char *s, *subsystem, *d, *devtype = NULL; + int r; + + assert(m); + assert(device); + + if (hashmap_isempty(m->subsystem_filter)) + return true; + + r = sd_device_get_subsystem(device, &subsystem); + if (r < 0) + return r; + + r = sd_device_get_devtype(device, &devtype); + if (r < 0 && r != -ENOENT) + return r; + + HASHMAP_FOREACH_KEY(d, s, m->subsystem_filter) { + if (!streq(s, subsystem)) + continue; + + if (!d || streq_ptr(d, devtype)) + return true; + } + + return false; +} + +static bool check_tag_filter(sd_device_monitor *m, sd_device *device) { + const char *tag; + + assert(m); + assert(device); + + if (set_isempty(m->tag_filter)) + return true; + + SET_FOREACH(tag, m->tag_filter) + if (sd_device_has_tag(device, tag) > 0) + return true; + + return false; +} + +static int passes_filter(sd_device_monitor *m, 
sd_device *device) { + int r; + + assert(m); + assert(device); + + r = check_subsystem_filter(m, device); + if (r <= 0) + return r; + + if (!check_tag_filter(m, device)) + return false; + + if (!device_match_sysattr(device, m->match_sysattr_filter, m->nomatch_sysattr_filter)) + return false; + + return device_match_parent(device, m->match_parent_filter, m->nomatch_parent_filter); +} + +static bool check_sender_uid(sd_device_monitor *m, uid_t uid) { + int r; + + assert(m); + + /* Always trust messages from uid 0. */ + if (uid == 0) + return true; + + /* Trust messages sent by the same UID we are running. Currently, such situation happens only for + * unicast messages. */ + if (uid == getuid() || uid == geteuid()) + return true; + + if (!m->mapped_userns_uid_range) { + r = uid_range_load_userns(&m->mapped_userns_uid_range, NULL); + if (r < 0) + log_monitor_errno(m, r, "Failed to load UID ranges mapped to the current user namespace, ignoring: %m"); + } + + /* Trust messages come from outside of the current user namespace. */ + if (!uid_range_contains(m->mapped_userns_uid_range, uid)) + return true; + + /* Otherwise, refuse messages. 
*/ + return false; +} + +int device_monitor_receive_device(sd_device_monitor *m, sd_device **ret) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _cleanup_free_ uint8_t *buf_alloc = NULL; + union { + monitor_netlink_header *nlh; + char *nulstr; + uint8_t *buf; + } message; + struct iovec iov; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + union sockaddr_union snl; + struct msghdr smsg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_name = &snl, + .msg_namelen = sizeof(snl), + }; + struct ucred *cred; + size_t offset; + ssize_t n; + bool is_initialized = false; + int r; + + assert(m); + assert(ret); + + n = next_datagram_size_fd(m->sock); + if (n < 0) { + if (!ERRNO_IS_TRANSIENT(n)) + log_monitor_errno(m, n, "Failed to get the received message size: %m"); + return n; + } + + if ((size_t) n < ALLOCA_MAX / sizeof(uint8_t) / 2) + message.buf = newa(uint8_t, n); + else { + buf_alloc = new(uint8_t, n); + if (!buf_alloc) + return log_oom_debug(); + + message.buf = buf_alloc; + } + + iov = IOVEC_MAKE(message.buf, n); + + n = recvmsg(m->sock, &smsg, 0); + if (n < 0) { + if (!ERRNO_IS_TRANSIENT(errno)) + log_monitor_errno(m, errno, "Failed to receive message: %m"); + return -errno; + } + + if (smsg.msg_flags & MSG_TRUNC) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EINVAL), "Received truncated message, ignoring message."); + + if (n < 32) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EINVAL), "Invalid message length (%zi), ignoring message.", n); + + if (snl.nl.nl_groups == MONITOR_GROUP_NONE) { + /* unicast message, check if we trust the sender */ + if (m->snl_trusted_sender.nl.nl_pid == 0 || + snl.nl.nl_pid != m->snl_trusted_sender.nl.nl_pid) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), + "Unicast netlink message ignored."); + + } else if (snl.nl.nl_groups == MONITOR_GROUP_KERNEL) { + if (snl.nl.nl_pid > 0) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), + 
"Multicast kernel netlink message from PID %"PRIu32" ignored.", + snl.nl.nl_pid); + } + + cred = CMSG_FIND_DATA(&smsg, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!cred) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), + "No sender credentials received, ignoring message."); + + if (!check_sender_uid(m, cred->uid)) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), + "Sender uid="UID_FMT", message ignored.", cred->uid); + + if (!memchr(message.buf, 0, n)) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), "Received message without NUL, ignoring message."); + + if (streq(message.nulstr, "libudev")) { + /* udev message needs proper version magic */ + if (message.nlh->magic != htobe32(UDEV_MONITOR_MAGIC)) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), + "Invalid message signature (%x != %x).", + message.nlh->magic, htobe32(UDEV_MONITOR_MAGIC)); + + if (message.nlh->properties_off + 32 > (size_t) n) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), + "Invalid offset for properties (%u > %zi).", + message.nlh->properties_off + 32, n); + + offset = message.nlh->properties_off; + + /* devices received from udev are always initialized */ + is_initialized = true; + + } else { + /* check kernel message header */ + if (!strstr(message.nulstr, "@/")) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), "Invalid message header."); + + offset = strlen(message.nulstr) + 1; + if (offset >= (size_t) n) + return log_monitor_errno(m, SYNTHETIC_ERRNO(EAGAIN), "Invalid message length."); + } + + r = device_new_from_nulstr(&device, message.nulstr + offset, n - offset); + if (r < 0) + return log_monitor_errno(m, r, "Failed to create device from received message: %m"); + + if (is_initialized) + device_set_is_initialized(device); + + /* Skip device, if it does not pass the current filter */ + r = passes_filter(m, device); + if (r < 0) + return log_device_monitor_errno(device, m, r, "Failed to check received device passing filter: %m"); + if (r == 0) + 
log_device_monitor(device, m, "Received device does not pass filter, ignoring."); + else + *ret = TAKE_PTR(device); + + return r; +} + +static uint32_t string_hash32(const char *str) { + return MurmurHash2(str, strlen(str), 0); +} + +/* Get a bunch of bit numbers out of the hash, and set the bits in our bit field */ +static uint64_t string_bloom64(const char *str) { + uint64_t bits = 0; + uint32_t hash = string_hash32(str); + + bits |= UINT64_C(1) << (hash & 63); + bits |= UINT64_C(1) << ((hash >> 6) & 63); + bits |= UINT64_C(1) << ((hash >> 12) & 63); + bits |= UINT64_C(1) << ((hash >> 18) & 63); + return bits; +} + +int device_monitor_send_device( + sd_device_monitor *m, + sd_device_monitor *destination, + sd_device *device) { + + monitor_netlink_header nlh = { + .prefix = "libudev", + .magic = htobe32(UDEV_MONITOR_MAGIC), + .header_size = sizeof nlh, + }; + struct iovec iov[2] = { + { .iov_base = &nlh, .iov_len = sizeof nlh }, + }; + struct msghdr smsg = { + .msg_iov = iov, + .msg_iovlen = 2, + }; + /* default destination for sending */ + union sockaddr_union default_destination = { + .nl.nl_family = AF_NETLINK, + .nl.nl_groups = MONITOR_GROUP_UDEV, + }; + uint64_t tag_bloom_bits; + const char *buf, *val; + ssize_t count; + size_t blen; + int r; + + assert(m); + assert(device); + + r = device_get_properties_nulstr(device, &buf, &blen); + if (r < 0) + return log_device_monitor_errno(device, m, r, "Failed to get device properties: %m"); + if (blen < 32) + return log_device_monitor_errno(device, m, SYNTHETIC_ERRNO(EINVAL), + "Length of device property nulstr is too small to contain valid device information."); + + /* fill in versioned header */ + r = sd_device_get_subsystem(device, &val); + if (r < 0) + return log_device_monitor_errno(device, m, r, "Failed to get device subsystem: %m"); + nlh.filter_subsystem_hash = htobe32(string_hash32(val)); + + if (sd_device_get_devtype(device, &val) >= 0) + nlh.filter_devtype_hash = htobe32(string_hash32(val)); + + /* add tag 
bloom filter */ + tag_bloom_bits = 0; + FOREACH_DEVICE_TAG(device, tag) + tag_bloom_bits |= string_bloom64(tag); + + if (tag_bloom_bits > 0) { + nlh.filter_tag_bloom_hi = htobe32(tag_bloom_bits >> 32); + nlh.filter_tag_bloom_lo = htobe32(tag_bloom_bits & 0xffffffff); + } + + /* add properties list */ + nlh.properties_off = iov[0].iov_len; + nlh.properties_len = blen; + iov[1] = IOVEC_MAKE((char*) buf, blen); + + /* + * Use custom address for target, or the default one. + * + * If we send to a multicast group, we will get + * ECONNREFUSED, which is expected. + */ + smsg.msg_name = destination ? &destination->snl : &default_destination; + smsg.msg_namelen = sizeof(struct sockaddr_nl); + count = sendmsg(m->sock, &smsg, 0); + if (count < 0) { + if (!destination && errno == ECONNREFUSED) { + log_device_monitor(device, m, "Passed to netlink monitor."); + return 0; + } else + return log_device_monitor_errno(device, m, errno, "Failed to send device to netlink monitor: %m"); + } + + log_device_monitor(device, m, "Passed %zi byte to netlink monitor.", count); + return count; +} + +static void bpf_stmt(struct sock_filter *ins, unsigned *i, + unsigned short code, unsigned data) { + ins[(*i)++] = (struct sock_filter) { + .code = code, + .k = data, + }; +} + +static void bpf_jmp(struct sock_filter *ins, unsigned *i, + unsigned short code, unsigned data, + unsigned short jt, unsigned short jf) { + ins[(*i)++] = (struct sock_filter) { + .code = code, + .jt = jt, + .jf = jf, + .k = data, + }; +} + +_public_ int sd_device_monitor_filter_update(sd_device_monitor *m) { + struct sock_filter ins[512] = {}; + struct sock_fprog filter; + const char *subsystem, *devtype, *tag; + unsigned i = 0; + + assert_return(m, -EINVAL); + + if (m->filter_uptodate) + return 0; + + if (m->snl.nl.nl_groups == MONITOR_GROUP_KERNEL || + (hashmap_isempty(m->subsystem_filter) && + set_isempty(m->tag_filter))) { + m->filter_uptodate = true; + return 0; + } + + /* load magic in A */ + bpf_stmt(ins, &i, 
BPF_LD|BPF_W|BPF_ABS, offsetof(monitor_netlink_header, magic)); + /* jump if magic matches */ + bpf_jmp(ins, &i, BPF_JMP|BPF_JEQ|BPF_K, UDEV_MONITOR_MAGIC, 1, 0); + /* wrong magic, pass packet */ + bpf_stmt(ins, &i, BPF_RET|BPF_K, 0xffffffff); + + if (!set_isempty(m->tag_filter)) { + int tag_matches = set_size(m->tag_filter); + + /* add all tags matches */ + SET_FOREACH(tag, m->tag_filter) { + uint64_t tag_bloom_bits = string_bloom64(tag); + uint32_t tag_bloom_hi = tag_bloom_bits >> 32; + uint32_t tag_bloom_lo = tag_bloom_bits & 0xffffffff; + + /* load device bloom bits in A */ + bpf_stmt(ins, &i, BPF_LD|BPF_W|BPF_ABS, offsetof(monitor_netlink_header, filter_tag_bloom_hi)); + /* clear bits (tag bits & bloom bits) */ + bpf_stmt(ins, &i, BPF_ALU|BPF_AND|BPF_K, tag_bloom_hi); + /* jump to next tag if it does not match */ + bpf_jmp(ins, &i, BPF_JMP|BPF_JEQ|BPF_K, tag_bloom_hi, 0, 3); + + /* load device bloom bits in A */ + bpf_stmt(ins, &i, BPF_LD|BPF_W|BPF_ABS, offsetof(monitor_netlink_header, filter_tag_bloom_lo)); + /* clear bits (tag bits & bloom bits) */ + bpf_stmt(ins, &i, BPF_ALU|BPF_AND|BPF_K, tag_bloom_lo); + /* jump behind end of tag match block if tag matches */ + tag_matches--; + bpf_jmp(ins, &i, BPF_JMP|BPF_JEQ|BPF_K, tag_bloom_lo, 1 + (tag_matches * 6), 0); + } + + /* nothing matched, drop packet */ + bpf_stmt(ins, &i, BPF_RET|BPF_K, 0); + } + + /* add all subsystem matches */ + if (!hashmap_isempty(m->subsystem_filter)) { + HASHMAP_FOREACH_KEY(devtype, subsystem, m->subsystem_filter) { + uint32_t hash = string_hash32(subsystem); + + /* load device subsystem value in A */ + bpf_stmt(ins, &i, BPF_LD|BPF_W|BPF_ABS, offsetof(monitor_netlink_header, filter_subsystem_hash)); + if (!devtype) { + /* jump if subsystem does not match */ + bpf_jmp(ins, &i, BPF_JMP|BPF_JEQ|BPF_K, hash, 0, 1); + } else { + /* jump if subsystem does not match */ + bpf_jmp(ins, &i, BPF_JMP|BPF_JEQ|BPF_K, hash, 0, 3); + /* load device devtype value in A */ + bpf_stmt(ins, &i, 
BPF_LD|BPF_W|BPF_ABS, offsetof(monitor_netlink_header, filter_devtype_hash)); + /* jump if value does not match */ + hash = string_hash32(devtype); + bpf_jmp(ins, &i, BPF_JMP|BPF_JEQ|BPF_K, hash, 0, 1); + } + + /* matched, pass packet */ + bpf_stmt(ins, &i, BPF_RET|BPF_K, 0xffffffff); + + if (i+1 >= ELEMENTSOF(ins)) + return -E2BIG; + } + + /* nothing matched, drop packet */ + bpf_stmt(ins, &i, BPF_RET|BPF_K, 0); + } + + /* matched, pass packet */ + bpf_stmt(ins, &i, BPF_RET|BPF_K, 0xffffffff); + + /* install filter */ + filter = (struct sock_fprog) { + .len = i, + .filter = ins, + }; + if (setsockopt(m->sock, SOL_SOCKET, SO_ATTACH_FILTER, &filter, sizeof(filter)) < 0) + return -errno; + + m->filter_uptodate = true; + return 0; +} + +_public_ int sd_device_monitor_filter_add_match_subsystem_devtype(sd_device_monitor *m, const char *subsystem, const char *devtype) { + int r; + + assert_return(m, -EINVAL); + assert_return(subsystem, -EINVAL); + + /* Do not use string_has_ops_free_free or hashmap_put_strdup() here, as this may be called + * multiple times with the same subsystem but different devtypes. */ + r = hashmap_put_strdup_full(&m->subsystem_filter, &trivial_hash_ops_free_free, subsystem, devtype); + if (r <= 0) + return r; + + m->filter_uptodate = false; + return r; +} + +_public_ int sd_device_monitor_filter_add_match_tag(sd_device_monitor *m, const char *tag) { + int r; + + assert_return(m, -EINVAL); + assert_return(tag, -EINVAL); + + r = set_put_strdup(&m->tag_filter, tag); + if (r <= 0) + return r; + + m->filter_uptodate = false; + return r; +} + +_public_ int sd_device_monitor_filter_add_match_sysattr(sd_device_monitor *m, const char *sysattr, const char *value, int match) { + Hashmap **hashmap; + + assert_return(m, -EINVAL); + assert_return(sysattr, -EINVAL); + + if (match) + hashmap = &m->match_sysattr_filter; + else + hashmap = &m->nomatch_sysattr_filter; + + /* TODO: unset m->filter_uptodate on success when we support this filter on BPF. 
*/ + return update_match_strv(hashmap, sysattr, value, /* clear_on_null = */ true); +} + +_public_ int sd_device_monitor_filter_add_match_parent(sd_device_monitor *m, sd_device *device, int match) { + const char *syspath; + Set **set; + int r; + + assert_return(m, -EINVAL); + assert_return(device, -EINVAL); + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + if (match) + set = &m->match_parent_filter; + else + set = &m->nomatch_parent_filter; + + /* TODO: unset m->filter_uptodate on success when we support this filter on BPF. */ + return set_put_strdup(set, syspath); +} + +_public_ int sd_device_monitor_filter_remove(sd_device_monitor *m) { + static const struct sock_fprog filter = { 0, NULL }; + + assert_return(m, -EINVAL); + + m->subsystem_filter = hashmap_free(m->subsystem_filter); + m->tag_filter = set_free(m->tag_filter); + m->match_sysattr_filter = hashmap_free(m->match_sysattr_filter); + m->nomatch_sysattr_filter = hashmap_free(m->nomatch_sysattr_filter); + m->match_parent_filter = set_free(m->match_parent_filter); + m->nomatch_parent_filter = set_free(m->nomatch_parent_filter); + + if (setsockopt(m->sock, SOL_SOCKET, SO_DETACH_FILTER, &filter, sizeof(filter)) < 0) + return -errno; + + m->filter_uptodate = true; + return 0; +} diff --git a/src/libsystemd/sd-device/device-private.c b/src/libsystemd/sd-device/device-private.c new file mode 100644 index 0000000..0edabfb --- /dev/null +++ b/src/libsystemd/sd-device/device-private.c @@ -0,0 +1,903 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-internal.h" +#include "device-private.h" +#include "device-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hashmap.h" +#include "macro.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "set.h" +#include "stdio-util.h" +#include "string-table.h" 
+#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "tmpfile-util.h" +#include "user-util.h" + +int device_add_property(sd_device *device, const char *key, const char *value) { + int r; + + assert(device); + assert(key); + + r = device_add_property_aux(device, key, value, false); + if (r < 0) + return r; + + if (key[0] != '.') { + r = device_add_property_aux(device, key, value, true); + if (r < 0) + return r; + } + + return 0; +} + +int device_add_propertyf(sd_device *device, const char *key, const char *format, ...) { + _cleanup_free_ char *value = NULL; + va_list ap; + int r; + + assert(device); + assert(key); + + if (!format) + return device_add_property(device, key, NULL); + + va_start(ap, format); + r = vasprintf(&value, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return device_add_property(device, key, value); +} + +void device_set_devlink_priority(sd_device *device, int priority) { + assert(device); + + device->devlink_priority = priority; +} + +void device_set_is_initialized(sd_device *device) { + assert(device); + + device->is_initialized = true; +} + +int device_ensure_usec_initialized(sd_device *device, sd_device *device_old) { + usec_t when; + + assert(device); + + if (device_old && device_old->usec_initialized > 0) + when = device_old->usec_initialized; + else + when = now(CLOCK_MONOTONIC); + + return device_set_usec_initialized(device, when); +} + +uint64_t device_get_properties_generation(sd_device *device) { + assert(device); + + return device->properties_generation; +} + +uint64_t device_get_tags_generation(sd_device *device) { + assert(device); + + return device->tags_generation; +} + +uint64_t device_get_devlinks_generation(sd_device *device) { + assert(device); + + return device->devlinks_generation; +} + +int device_get_devnode_mode(sd_device *device, mode_t *ret) { + int r; + + assert(device); + + r = device_read_db(device); + if (r < 0) + return r; + + if (device->devmode == MODE_INVALID) + return 
-ENOENT; + + if (ret) + *ret = device->devmode; + + return 0; +} + +int device_get_devnode_uid(sd_device *device, uid_t *ret) { + int r; + + assert(device); + + r = device_read_db(device); + if (r < 0) + return r; + + if (device->devuid == UID_INVALID) + return -ENOENT; + + if (ret) + *ret = device->devuid; + + return 0; +} + +static int device_set_devuid(sd_device *device, const char *uid) { + uid_t u; + int r; + + assert(device); + assert(uid); + + r = parse_uid(uid, &u); + if (r < 0) + return r; + + r = device_add_property_internal(device, "DEVUID", uid); + if (r < 0) + return r; + + device->devuid = u; + + return 0; +} + +int device_get_devnode_gid(sd_device *device, gid_t *ret) { + int r; + + assert(device); + + r = device_read_db(device); + if (r < 0) + return r; + + if (device->devgid == GID_INVALID) + return -ENOENT; + + if (ret) + *ret = device->devgid; + + return 0; +} + +static int device_set_devgid(sd_device *device, const char *gid) { + gid_t g; + int r; + + assert(device); + assert(gid); + + r = parse_gid(gid, &g); + if (r < 0) + return r; + + r = device_add_property_internal(device, "DEVGID", gid); + if (r < 0) + return r; + + device->devgid = g; + + return 0; +} + +int device_set_action(sd_device *device, sd_device_action_t a) { + int r; + + assert(device); + assert(a >= 0 && a < _SD_DEVICE_ACTION_MAX); + + r = device_add_property_internal(device, "ACTION", device_action_to_string(a)); + if (r < 0) + return r; + + device->action = a; + + return 0; +} + +static int device_set_action_from_string(sd_device *device, const char *action) { + sd_device_action_t a; + + assert(device); + assert(action); + + a = device_action_from_string(action); + if (a < 0) + return a; + + return device_set_action(device, a); +} + +static int device_set_seqnum(sd_device *device, const char *str) { + uint64_t seqnum; + int r; + + assert(device); + assert(str); + + r = safe_atou64(str, &seqnum); + if (r < 0) + return r; + if (seqnum == 0) + return -EINVAL; + + r = 
/* Applies one key/value pair to 'device'.
 *
 * Well-known keys are routed to their dedicated setter; every other key is stored as a plain property
 * via device_add_property_internal(). All three arguments must be non-NULL.
 *
 * Returns 0 on success or a negative errno-style value. Most failures are logged at debug level before
 * being returned; the exception is a failure of extract_first_word() in the DEVLINKS/TAGS branches,
 * which is returned without logging. */
static int device_amend(sd_device *device, const char *key, const char *value) {
        int r;

        assert(device);
        assert(key);
        assert(value);

        if (streq(key, "DEVPATH")) {
                char *path;

                /* DEVPATH is given without the "/sys" prefix, so prepend it to form the syspath. */
                path = strjoina("/sys", value);

                /* the caller must verify or trust this data (e.g., if it comes from the kernel) */
                r = device_set_syspath(device, path, false);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set syspath to '%s': %m", path);
        } else if (streq(key, "SUBSYSTEM")) {
                r = device_set_subsystem(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set subsystem to '%s': %m", value);
        } else if (streq(key, "DEVTYPE")) {
                r = device_set_devtype(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set devtype to '%s': %m", value);
        } else if (streq(key, "DEVNAME")) {
                r = device_set_devname(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set devname to '%s': %m", value);
        } else if (streq(key, "USEC_INITIALIZED")) {
                usec_t t;

                /* The timestamp arrives as a decimal string; parse it before storing. */
                r = safe_atou64(value, &t);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to parse timestamp '%s': %m", value);

                r = device_set_usec_initialized(device, t);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set usec-initialized to '%s': %m", value);
        } else if (streq(key, "DRIVER")) {
                r = device_set_driver(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set driver to '%s': %m", value);
        } else if (streq(key, "IFINDEX")) {
                r = device_set_ifindex(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set ifindex to '%s': %m", value);
        } else if (streq(key, "DEVMODE")) {
                r = device_set_devmode(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set devmode to '%s': %m", value);
        } else if (streq(key, "DEVUID")) {
                r = device_set_devuid(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set devuid to '%s': %m", value);
        } else if (streq(key, "DEVGID")) {
                r = device_set_devgid(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set devgid to '%s': %m", value);
        } else if (streq(key, "ACTION")) {
                r = device_set_action_from_string(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set action to '%s': %m", value);
        } else if (streq(key, "SEQNUM")) {
                r = device_set_seqnum(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set SEQNUM to '%s': %m", value);
        } else if (streq(key, "DISKSEQ")) {
                r = device_set_diskseq(device, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to set DISKSEQ to '%s': %m", value);
        } else if (streq(key, "DEVLINKS")) {
                /* The value is a list of devlinks; add them one word at a time until the input is
                 * exhausted (extract_first_word() returns 0). */
                for (const char *p = value;;) {
                        _cleanup_free_ char *word = NULL;

                        /* udev rules may set escaped strings, and sd-device does not modify the input
                         * strings. So, it is also necessary to keep the strings received through
                         * sd-device-monitor. */
                        r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                break;

                        r = device_add_devlink(device, word);
                        if (r < 0)
                                return log_device_debug_errno(device, r, "sd-device: Failed to add devlink '%s': %m", word);
                }
        } else if (STR_IN_SET(key, "TAGS", "CURRENT_TAGS")) {
                /* Tags are ':'-separated. Separators are not coalesced, so empty words can appear and
                 * are skipped explicitly below. */
                for (const char *p = value;;) {
                        _cleanup_free_ char *word = NULL;

                        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                break;
                        if (isempty(word))
                                continue;

                        /* The last argument ('both' in the prototype) is true only for CURRENT_TAGS. */
                        r = device_add_tag(device, word, streq(key, "CURRENT_TAGS"));
                        if (r < 0)
                                return log_device_debug_errno(device, r, "sd-device: Failed to add tag '%s': %m", word);
                }
        } else if (streq(key, "UDEV_DATABASE_VERSION")) {
                /* Parsed straight into the device object; not stored as a property. */
                r = safe_atou(value, &device->database_version);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to parse udev database version '%s': %m", value);
        } else {
                /* Anything else is kept verbatim as a generic property. */
                r = device_add_property_internal(device, key, value);
                if (r < 0)
                        return log_device_debug_errno(device, r, "sd-device: Failed to add property '%s=%s': %m", key, value);
        }

        return 0;
}
{ + int r; + + assert(device); + + if (!device->devpath || !device->subsystem || device->action < 0 || device->seqnum == 0) + return log_device_debug_errno(device, SYNTHETIC_ERRNO(EINVAL), + "sd-device: Device created from strv or nulstr lacks devpath, subsystem, action or seqnum."); + + if (streq(device->subsystem, "drivers")) { + r = device_set_drivers_subsystem(device); + if (r < 0) + return r; + } + + device->sealed = true; + + return 0; +} + +int device_new_from_strv(sd_device **ret, char **strv) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + const char *major = NULL, *minor = NULL; + int r; + + assert(ret); + assert(strv); + + r = device_new_aux(&device); + if (r < 0) + return r; + + STRV_FOREACH(key, strv) { + r = device_append(device, *key, &major, &minor); + if (r < 0) + return r; + } + + if (major) { + r = device_set_devnum(device, major, minor); + if (r < 0) + return log_device_debug_errno(device, r, "sd-device: Failed to set devnum %s:%s: %m", major, minor); + } + + r = device_verify(device); + if (r < 0) + return r; + + *ret = TAKE_PTR(device); + + return 0; +} + +int device_new_from_nulstr(sd_device **ret, char *nulstr, size_t len) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + const char *major = NULL, *minor = NULL; + int r; + + assert(ret); + assert(nulstr); + assert(len); + + r = device_new_aux(&device); + if (r < 0) + return r; + + for (size_t i = 0; i < len; ) { + char *key; + const char *end; + + key = nulstr + i; + end = memchr(key, '\0', len - i); + if (!end) + return log_device_debug_errno(device, SYNTHETIC_ERRNO(EINVAL), + "sd-device: Failed to parse nulstr"); + + i += end - key + 1; + + /* netlink messages for some devices contain an unwanted newline at the end of value. + * Let's drop the newline and remaining characters after the newline. 
*/ + truncate_nl(key); + + r = device_append(device, key, &major, &minor); + if (r < 0) + return r; + } + + if (major) { + r = device_set_devnum(device, major, minor); + if (r < 0) + return log_device_debug_errno(device, r, "sd-device: Failed to set devnum %s:%s: %m", major, minor); + } + + r = device_verify(device); + if (r < 0) + return r; + + *ret = TAKE_PTR(device); + + return 0; +} + +static int device_update_properties_bufs(sd_device *device) { + _cleanup_free_ char **buf_strv = NULL, *buf_nulstr = NULL; + size_t nulstr_len = 0, num = 0; + + assert(device); + + if (!device->properties_buf_outdated) + return 0; + + /* append udev database version */ + buf_nulstr = newdup(char, "UDEV_DATABASE_VERSION=" STRINGIFY(LATEST_UDEV_DATABASE_VERSION) "\0", + STRLEN("UDEV_DATABASE_VERSION=" STRINGIFY(LATEST_UDEV_DATABASE_VERSION)) + 2); + if (!buf_nulstr) + return -ENOMEM; + + nulstr_len += STRLEN("UDEV_DATABASE_VERSION=" STRINGIFY(LATEST_UDEV_DATABASE_VERSION)) + 1; + num++; + + FOREACH_DEVICE_PROPERTY(device, prop, val) { + size_t len = 0; + + len = strlen(prop) + 1 + strlen(val); + + buf_nulstr = GREEDY_REALLOC0(buf_nulstr, nulstr_len + len + 2); + if (!buf_nulstr) + return -ENOMEM; + + strscpyl(buf_nulstr + nulstr_len, len + 1, prop, "=", val, NULL); + nulstr_len += len + 1; + num++; + } + + /* build buf_strv from buf_nulstr */ + buf_strv = new0(char*, num + 1); + if (!buf_strv) + return -ENOMEM; + + size_t i = 0; + NULSTR_FOREACH(p, buf_nulstr) + buf_strv[i++] = p; + assert(i == num); + + free_and_replace(device->properties_nulstr, buf_nulstr); + device->properties_nulstr_len = nulstr_len; + free_and_replace(device->properties_strv, buf_strv); + + device->properties_buf_outdated = false; + return 0; +} + +int device_get_properties_nulstr(sd_device *device, const char **ret_nulstr, size_t *ret_len) { + int r; + + assert(device); + + r = device_update_properties_bufs(device); + if (r < 0) + return r; + + if (ret_nulstr) + *ret_nulstr = device->properties_nulstr; + if 
(ret_len) + *ret_len = device->properties_nulstr_len; + + return 0; +} + +int device_get_properties_strv(sd_device *device, char ***ret) { + int r; + + assert(device); + + r = device_update_properties_bufs(device); + if (r < 0) + return r; + + if (ret) + *ret = device->properties_strv; + + return 0; +} + +int device_get_devlink_priority(sd_device *device, int *ret) { + int r; + + assert(device); + + r = device_read_db(device); + if (r < 0) + return r; + + if (ret) + *ret = device->devlink_priority; + + return 0; +} + +int device_clone_with_db(sd_device *device, sd_device **ret) { + _cleanup_(sd_device_unrefp) sd_device *dest = NULL; + const char *key, *val; + int r; + + assert(device); + assert(ret); + + /* The device may be already removed. Let's copy minimal set of information that was obtained through + * netlink socket. */ + + r = device_new_aux(&dest); + if (r < 0) + return r; + + /* Seal device to prevent reading the uevent file, as the device may have been already removed. */ + dest->sealed = true; + + /* Copy syspath, then also devname, sysname or sysnum can be obtained. */ + r = device_set_syspath(dest, device->syspath, false); + if (r < 0) + return r; + + /* Copy other information stored in database. Here, do not use FOREACH_DEVICE_PROPERTY() and + * sd_device_get_property_value(), as they calls device_properties_prepare() -> + * device_read_uevent_file(), but as commented in the above, the device may be already removed and + * reading uevent file may fail. 
*/ + ORDERED_HASHMAP_FOREACH_KEY(val, key, device->properties) { + if (streq(key, "MINOR")) + continue; + + if (streq(key, "MAJOR")) { + const char *minor = NULL; + + minor = ordered_hashmap_get(device->properties, "MINOR"); + r = device_set_devnum(dest, val, minor); + } else + r = device_amend(dest, key, val); + if (r < 0) + return r; + + if (streq(key, "SUBSYSTEM") && streq(val, "drivers")) { + r = free_and_strdup(&dest->driver_subsystem, device->driver_subsystem); + if (r < 0) + return r; + } + } + + /* Finally, read the udev database. */ + r = device_read_db_internal(dest, /* force = */ true); + if (r < 0) + return r; + + *ret = TAKE_PTR(dest); + return 0; +} + +void device_cleanup_tags(sd_device *device) { + assert(device); + + device->all_tags = set_free_free(device->all_tags); + device->current_tags = set_free_free(device->current_tags); + device->property_tags_outdated = true; + device->tags_generation++; +} + +void device_cleanup_devlinks(sd_device *device) { + assert(device); + + set_free_free(device->devlinks); + device->devlinks = NULL; + device->property_devlinks_outdated = true; + device->devlinks_generation++; +} + +void device_remove_tag(sd_device *device, const char *tag) { + assert(device); + assert(tag); + + free(set_remove(device->current_tags, tag)); + device->property_tags_outdated = true; + device->tags_generation++; +} + +static int device_tag(sd_device *device, const char *tag, bool add) { + const char *id; + char *path; + int r; + + assert(device); + assert(tag); + + r = device_get_device_id(device, &id); + if (r < 0) + return r; + + path = strjoina("/run/udev/tags/", tag, "/", id); + + if (add) + return touch_file(path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, 0444); + + if (unlink(path) < 0 && errno != ENOENT) + return -errno; + + return 0; +} + +int device_tag_index(sd_device *device, sd_device *device_old, bool add) { + int r = 0, k; + + if (add && device_old) + /* delete possible left-over tags */ + 
FOREACH_DEVICE_TAG(device_old, tag) + if (!sd_device_has_tag(device, tag)) { + k = device_tag(device_old, tag, false); + if (r >= 0 && k < 0) + r = k; + } + + FOREACH_DEVICE_TAG(device, tag) { + k = device_tag(device, tag, add); + if (r >= 0 && k < 0) + r = k; + } + + return r; +} + +static bool device_has_info(sd_device *device) { + assert(device); + + if (!set_isempty(device->devlinks)) + return true; + + if (device->devlink_priority != 0) + return true; + + if (!ordered_hashmap_isempty(device->properties_db)) + return true; + + if (!set_isempty(device->all_tags)) + return true; + + if (!set_isempty(device->current_tags)) + return true; + + return false; +} + +void device_set_db_persist(sd_device *device) { + assert(device); + + device->db_persist = true; +} + +int device_update_db(sd_device *device) { + const char *id; + char *path; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(unlink_and_freep) char *path_tmp = NULL; + bool has_info; + int r; + + assert(device); + + has_info = device_has_info(device); + + r = device_get_device_id(device, &id); + if (r < 0) + return r; + + path = strjoina("/run/udev/data/", id); + + /* do not store anything for otherwise empty devices */ + if (!has_info && major(device->devnum) == 0 && device->ifindex == 0) { + if (unlink(path) < 0 && errno != ENOENT) + return -errno; + + return 0; + } + + /* write a database file */ + r = mkdir_parents(path, 0755); + if (r < 0) + return r; + + r = fopen_temporary(path, &f, &path_tmp); + if (r < 0) + return r; + + /* set 'sticky' bit to indicate that we should not clean the database when we transition from initrd + * to the real root */ + if (fchmod(fileno(f), device->db_persist ? 
01644 : 0644) < 0) { + r = -errno; + goto fail; + } + + if (has_info) { + const char *property, *value, *ct; + + if (major(device->devnum) > 0) { + FOREACH_DEVICE_DEVLINK(device, devlink) + fprintf(f, "S:%s\n", devlink + STRLEN("/dev/")); + + if (device->devlink_priority != 0) + fprintf(f, "L:%i\n", device->devlink_priority); + } + + if (device->usec_initialized > 0) + fprintf(f, "I:"USEC_FMT"\n", device->usec_initialized); + + ORDERED_HASHMAP_FOREACH_KEY(value, property, device->properties_db) + fprintf(f, "E:%s=%s\n", property, value); + + FOREACH_DEVICE_TAG(device, tag) + fprintf(f, "G:%s\n", tag); /* Any tag */ + + SET_FOREACH(ct, device->current_tags) + fprintf(f, "Q:%s\n", ct); /* Current tag */ + + /* Always write the latest database version here, instead of the value stored in + * device->database_version, as which may be 0. */ + fputs("V:" STRINGIFY(LATEST_UDEV_DATABASE_VERSION) "\n", f); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(path_tmp, path) < 0) { + r = -errno; + goto fail; + } + + path_tmp = mfree(path_tmp); + + log_device_debug(device, "sd-device: Created %s file '%s' for '%s'", has_info ? "db" : "empty", + path, device->devpath); + + return 0; + +fail: + (void) unlink(path); + + return log_device_debug_errno(device, r, "sd-device: Failed to create %s file '%s' for '%s'", has_info ? 
"db" : "empty", path, device->devpath); +} + +int device_delete_db(sd_device *device) { + const char *id; + char *path; + int r; + + assert(device); + + r = device_get_device_id(device, &id); + if (r < 0) + return r; + + path = strjoina("/run/udev/data/", id); + + if (unlink(path) < 0 && errno != ENOENT) + return -errno; + + return 0; +} + +static const char* const device_action_table[_SD_DEVICE_ACTION_MAX] = { + [SD_DEVICE_ADD] = "add", + [SD_DEVICE_REMOVE] = "remove", + [SD_DEVICE_CHANGE] = "change", + [SD_DEVICE_MOVE] = "move", + [SD_DEVICE_ONLINE] = "online", + [SD_DEVICE_OFFLINE] = "offline", + [SD_DEVICE_BIND] = "bind", + [SD_DEVICE_UNBIND] = "unbind", +}; + +DEFINE_STRING_TABLE_LOOKUP(device_action, sd_device_action_t); + +void dump_device_action_table(void) { + DUMP_STRING_TABLE(device_action, sd_device_action_t, _SD_DEVICE_ACTION_MAX); +} diff --git a/src/libsystemd/sd-device/device-private.h b/src/libsystemd/sd-device/device-private.h new file mode 100644 index 0000000..b903d1a --- /dev/null +++ b/src/libsystemd/sd-device/device-private.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "sd-device.h" + +#include "macro.h" + +int device_new_from_mode_and_devnum(sd_device **ret, mode_t mode, dev_t devnum); +int device_new_from_nulstr(sd_device **ret, char *nulstr, size_t len); +int device_new_from_strv(sd_device **ret, char **strv); + +int device_opendir(sd_device *device, const char *subdir, DIR **ret); + +int device_get_property_bool(sd_device *device, const char *key); +int device_get_property_int(sd_device *device, const char *key, int *ret); +int device_get_sysattr_int(sd_device *device, const char *sysattr, int *ret_value); +int device_get_sysattr_unsigned(sd_device *device, const char *sysattr, unsigned *ret_value); +int device_get_sysattr_bool(sd_device *device, const char *sysattr); +int device_get_device_id(sd_device *device, const char **ret); +int 
device_get_devlink_priority(sd_device *device, int *ret); +int device_get_devnode_mode(sd_device *device, mode_t *ret); +int device_get_devnode_uid(sd_device *device, uid_t *ret); +int device_get_devnode_gid(sd_device *device, gid_t *ret); + +void device_clear_sysattr_cache(sd_device *device); +int device_cache_sysattr_value(sd_device *device, const char *key, char *value); +int device_get_cached_sysattr_value(sd_device *device, const char *key, const char **ret_value); + +void device_seal(sd_device *device); +void device_set_is_initialized(sd_device *device); +void device_set_db_persist(sd_device *device); +void device_set_devlink_priority(sd_device *device, int priority); +int device_ensure_usec_initialized(sd_device *device, sd_device *device_old); +int device_add_devlink(sd_device *device, const char *devlink); +int device_remove_devlink(sd_device *device, const char *devlink); +bool device_has_devlink(sd_device *device, const char *devlink); +int device_add_property(sd_device *device, const char *property, const char *value); +int device_add_propertyf(sd_device *device, const char *key, const char *format, ...) 
_printf_(3, 4); +int device_add_tag(sd_device *device, const char *tag, bool both); +void device_remove_tag(sd_device *device, const char *tag); +void device_cleanup_tags(sd_device *device); +void device_cleanup_devlinks(sd_device *device); + +uint64_t device_get_properties_generation(sd_device *device); +uint64_t device_get_tags_generation(sd_device *device); +uint64_t device_get_devlinks_generation(sd_device *device); + +int device_properties_prepare(sd_device *device); +int device_get_properties_nulstr(sd_device *device, const char **ret_nulstr, size_t *ret_len); +int device_get_properties_strv(sd_device *device, char ***ret); + +int device_clone_with_db(sd_device *device, sd_device **ret); + +int device_tag_index(sd_device *dev, sd_device *dev_old, bool add); +int device_update_db(sd_device *device); +int device_delete_db(sd_device *device); +int device_read_db_internal_filename(sd_device *device, const char *filename); /* For fuzzer */ +int device_read_db_internal(sd_device *device, bool force); +static inline int device_read_db(sd_device *device) { + return device_read_db_internal(device, false); +} + +int device_read_uevent_file(sd_device *device); + +int device_set_action(sd_device *device, sd_device_action_t a); +sd_device_action_t device_action_from_string(const char *s) _pure_; +const char *device_action_to_string(sd_device_action_t a) _const_; +void dump_device_action_table(void); diff --git a/src/libsystemd/sd-device/device-util.c b/src/libsystemd/sd-device/device-util.c new file mode 100644 index 0000000..529eff2 --- /dev/null +++ b/src/libsystemd/sd-device/device-util.c @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "device-private.h" +#include "device-util.h" +#include "devnum-util.h" +#include "fd-util.h" +#include "string-util.h" +#include "strv.h" + +int devname_from_devnum(mode_t mode, dev_t devnum, char **ret) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_free_ char *s = NULL; + const char 
*devname; + int r; + + assert(ret); + + if (devnum_is_zero(devnum)) + return device_path_make_inaccessible(mode, ret); + + r = device_new_from_mode_and_devnum(&dev, mode, devnum); + if (r < 0) + return r; + + r = sd_device_get_devname(dev, &devname); + if (r < 0) + return r; + + s = strdup(devname); + if (!s) + return -ENOMEM; + + *ret = TAKE_PTR(s); + return 0; +} + +int device_open_from_devnum(mode_t mode, dev_t devnum, int flags, char **ret) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + r = device_new_from_mode_and_devnum(&dev, mode, devnum); + if (r < 0) + return r; + + fd = sd_device_open(dev, flags); + if (fd < 0) + return fd; + + if (ret) { + const char *devname; + char *s; + + r = sd_device_get_devname(dev, &devname); + if (r < 0) + return r; + + s = strdup(devname); + if (!s) + return -ENOMEM; + + *ret = s; + } + + return TAKE_FD(fd); +} + +static int add_string_field( + sd_device *device, + const char *field, + int (*func)(sd_device *dev, const char **s), + char ***strv) { + + const char *s; + int r; + + assert(device); + assert(field); + assert(func); + assert(strv); + + r = func(device, &s); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get device \"%s\" property, ignoring: %m", field); + if (r >= 0) + (void) strv_extend_assignment(strv, field, s); + + return 0; +} + +char** device_make_log_fields(sd_device *device) { + _cleanup_strv_free_ char **strv = NULL; + dev_t devnum; + int ifindex; + sd_device_action_t action; + uint64_t seqnum, diskseq; + int r; + + assert(device); + + (void) add_string_field(device, "SYSPATH", sd_device_get_syspath, &strv); + (void) add_string_field(device, "SUBSYSTEM", sd_device_get_subsystem, &strv); + (void) add_string_field(device, "DEVTYPE", sd_device_get_devtype, &strv); + (void) add_string_field(device, "DRIVER", sd_device_get_driver, &strv); + (void) add_string_field(device, "DEVPATH", sd_device_get_devpath, &strv); + (void) 
add_string_field(device, "DEVNAME", sd_device_get_devname, &strv); + (void) add_string_field(device, "SYSNAME", sd_device_get_sysname, &strv); + (void) add_string_field(device, "SYSNUM", sd_device_get_sysnum, &strv); + + r = sd_device_get_devnum(device, &devnum); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get device \"DEVNUM\" property, ignoring: %m"); + if (r >= 0) + (void) strv_extendf(&strv, "DEVNUM="DEVNUM_FORMAT_STR, DEVNUM_FORMAT_VAL(devnum)); + + r = sd_device_get_ifindex(device, &ifindex); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get device \"IFINDEX\" property, ignoring: %m"); + if (r >= 0) + (void) strv_extendf(&strv, "IFINDEX=%i", ifindex); + + r = sd_device_get_action(device, &action); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get device \"ACTION\" property, ignoring: %m"); + if (r >= 0) + (void) strv_extendf(&strv, "ACTION=%s", device_action_to_string(action)); + + r = sd_device_get_seqnum(device, &seqnum); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get device \"SEQNUM\" property, ignoring: %m"); + if (r >= 0) + (void) strv_extendf(&strv, "SEQNUM=%"PRIu64, seqnum); + + r = sd_device_get_diskseq(device, &diskseq); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get device \"DISKSEQ\" property, ignoring: %m"); + if (r >= 0) + (void) strv_extendf(&strv, "DISKSEQ=%"PRIu64, diskseq); + + return TAKE_PTR(strv); +} diff --git a/src/libsystemd/sd-device/device-util.h b/src/libsystemd/sd-device/device-util.h new file mode 100644 index 0000000..bf86ddc --- /dev/null +++ b/src/libsystemd/sd-device/device-util.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "log.h" +#include "macro.h" + +#define device_unref_and_replace(a, b) \ + unref_and_replace_full(a, b, sd_device_ref, 
/* Iterates over the properties of 'device'.
 *
 * Declares two loop-scoped 'const char *' variables named by the 'key' and 'value' arguments. The loop
 * starts with sd_device_get_property_first(), advances with sd_device_get_property_next(), and stops
 * as soon as the returned key is NULL. Note that 'device' is expanded in both the initializer and the
 * step expression, so it should be a side-effect-free expression. */
#define FOREACH_DEVICE_PROPERTY(device, key, value)                     \
        for (const char *value, *key = sd_device_get_property_first(device, &value); \
             key;                                                       \
             key = sd_device_get_property_next(device, &value))
/* Like log_device_full_errno_zerook(), but for call sites where 'error' must not be zero.
 *
 * 'error' is evaluated exactly once into a local, checked with ASSERT_NON_ZERO(), and then forwarded
 * together with the remaining arguments. The statement expression evaluates to the result of
 * log_device_full_errno_zerook(). */
#define log_device_full_errno(device, level, error, ...)                \
        ({                                                              \
                int _error = (error);                                   \
                ASSERT_NON_ZERO(_error);                                \
                log_device_full_errno_zerook(device, level, _error, __VA_ARGS__); \
        })
log_device_full_errno(device, LOG_ERR, error, __VA_ARGS__) +/* devname_from_stat_rdev() below is a thin wrapper that forwards st->st_mode and st->st_rdev to devname_from_devnum(). */ +int devname_from_devnum(mode_t mode, dev_t devnum, char **ret); +static inline int devname_from_stat_rdev(const struct stat *st, char **ret) { + assert(st); + return devname_from_devnum(st->st_mode, st->st_rdev, ret); +} +int device_open_from_devnum(mode_t mode, dev_t devnum, int flags, char **ret); + +char** device_make_log_fields(sd_device *device); diff --git a/src/libsystemd/sd-device/sd-device.c b/src/libsystemd/sd-device/sd-device.c new file mode 100644 index 0000000..2fbc619 --- /dev/null +++ b/src/libsystemd/sd-device/sd-device.c @@ -0,0 +1,2715 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "chase.h" +#include "device-internal.h" +#include "device-private.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "id128-util.h" +#include "macro.h" +#include "missing_magic.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "set.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "user-util.h" +/* Allocates a new sd_device with n_ref = 1 and devmode/devuid/devgid/action set to their INVALID markers; all other fields are zero-initialized by the compound literal. Returns -ENOMEM if the allocation fails. */ +int device_new_aux(sd_device **ret) { + sd_device *device; + + assert(ret); + + device = new(sd_device, 1); + if (!device) + return -ENOMEM; + + *device = (sd_device) { + .n_ref = 1, + .devmode = MODE_INVALID, + .devuid = UID_INVALID, + .devgid = GID_INVALID, + .action = _SD_DEVICE_ACTION_INVALID, + }; + + *ret = device; + return 0; +} +/* Destructor: drops the reference on the parent, releases the owned strings and containers, then the object itself. */ +static sd_device *device_free(sd_device *device) { + assert(device); + + sd_device_unref(device->parent); + free(device->syspath); + free(device->sysname); + free(device->devtype); + free(device->devname); + free(device->subsystem); + free(device->driver_subsystem); + 
free(device->driver); + free(device->device_id); + free(device->properties_strv); + free(device->properties_nulstr); + + ordered_hashmap_free(device->properties); + ordered_hashmap_free(device->properties_db); + hashmap_free(device->sysattr_values); + set_free(device->sysattrs); + set_free(device->all_tags); + set_free(device->current_tags); + set_free(device->devlinks); + hashmap_free(device->children); + + return mfree(device); +} +/* Presumably expands to the public sd_device ref/unref pair, with device_free() as the destructor (macro defined elsewhere; confirm there). */ +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_device, sd_device, device_free); +/* Sets (value != NULL) or removes (value == NULL) the property 'key', in device->properties_db when db is true and in device->properties otherwise. Key and value are duplicated before being stored; the previous key/value strings are picked up in cleanup-annotated locals. */ +int device_add_property_aux(sd_device *device, const char *key, const char *value, bool db) { + OrderedHashmap **properties; + + assert(device); + assert(key); + + if (db) + properties = &device->properties_db; + else + properties = &device->properties; + + if (value) { + _unused_ _cleanup_free_ char *old_value = NULL; + _cleanup_free_ char *new_key = NULL, *new_value = NULL, *old_key = NULL; + int r; + + r = ordered_hashmap_ensure_allocated(properties, &string_hash_ops_free_free); + if (r < 0) + return r; + + new_key = strdup(key); + if (!new_key) + return -ENOMEM; + + new_value = strdup(value); + if (!new_value) + return -ENOMEM; + + old_value = ordered_hashmap_get2(*properties, key, (void**) &old_key); + + /* ordered_hashmap_replace() does not fail when the hashmap already has the entry. 
*/ + r = ordered_hashmap_replace(*properties, new_key, new_value); + if (r < 0) + return r; + + TAKE_PTR(new_key); + TAKE_PTR(new_value); + } else { + _unused_ _cleanup_free_ char *old_value = NULL; + _cleanup_free_ char *old_key = NULL; + + old_value = ordered_hashmap_remove2(*properties, key, (void**) &old_key); + } +/* Only changes to the live (non-db) property set bump the generation counter and mark the property buffer as outdated. */ + if (!db) { + device->properties_generation++; + device->properties_buf_outdated = true; + } + + return 0; +} +/* Sets device->syspath and the DEVPATH property from _syspath. With verify set, the path is resolved with chase() and checked on disk; without it, the path only has to start with "/sys/" and is simplified. Cached sysname/sysnum are reset at the end. */ +int device_set_syspath(sd_device *device, const char *_syspath, bool verify) { + _cleanup_free_ char *syspath = NULL; + const char *devpath; + int r; + + assert(device); + assert(_syspath); + + if (verify) { + _cleanup_close_ int fd = -EBADF; + + /* The input path may be a symlink located outside of /sys. Let's try to chase the symlink at first. + * The primary use case is that e.g. /proc/device-tree is a symlink to /sys/firmware/devicetree/base. + * By chasing symlinks in the path at first, we can call sd_device_new_from_path() with such path. */ + r = chase(_syspath, NULL, 0, &syspath, &fd); + if (r == -ENOENT) + /* the device does not exist (any more?) */ + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), + "sd-device: Failed to chase symlinks in \"%s\".", _syspath); + if (r < 0) + return log_debug_errno(r, "sd-device: Failed to get target of '%s': %m", _syspath); + + if (!path_startswith(syspath, "/sys")) { + _cleanup_free_ char *real_sys = NULL, *new_syspath = NULL; + char *p; + + /* /sys is a symlink to somewhere sysfs is mounted on? In that case, we convert the path to real sysfs to "/sys". 
*/ + r = chase("/sys", NULL, 0, &real_sys, NULL); + if (r < 0) + return log_debug_errno(r, "sd-device: Failed to chase symlink /sys: %m"); +/* p is the part of syspath below the resolved sysfs mount point, or NULL if syspath is not below it. */ + p = path_startswith(syspath, real_sys); + if (!p) + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), + "sd-device: Canonicalized path '%s' does not starts with sysfs mount point '%s'", + syspath, real_sys); +/* Re-anchor that remainder under the literal "/sys" prefix. */ + new_syspath = path_join("/sys", p); + if (!new_syspath) + return log_oom_debug(); + + free_and_replace(syspath, new_syspath); + path_simplify(syspath); + } + + if (path_startswith(syspath, "/sys/devices/")) { + /* For proper devices, stricter rules apply: they must have a 'uevent' file, + * otherwise we won't allow them */ + + if (faccessat(fd, "uevent", F_OK, 0) < 0) { + if (errno == ENOENT) + /* This is not a valid device. Note, this condition is quite often + * satisfied when enumerating devices or finding a parent device. + * Hence, use log_trace_errno() here. */ + return log_trace_errno(SYNTHETIC_ERRNO(ENODEV), + "sd-device: the uevent file \"%s/uevent\" does not exist.", syspath); + if (errno == ENOTDIR) + /* Not actually a directory. */ + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), + "sd-device: the syspath \"%s\" is not a directory.", syspath); + + return log_debug_errno(errno, "sd-device: cannot find uevent file for %s: %m", syspath); + } + } else { + struct stat st; + + /* For everything else lax rules apply: they just need to be a directory */ + + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "sd-device: failed to check if syspath \"%s\" is a directory: %m", syspath); + if (!S_ISDIR(st.st_mode)) + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), + "sd-device: the syspath \"%s\" is not a directory.", syspath); + } + + /* Only operate on sysfs, i.e. refuse going down into /sys/fs/cgroup/ or similar places where + * things are not arranged as kobjects in kernel, and hence don't necessarily have + * kobject/attribute structure. 
*/ + r = getenv_bool_secure("SYSTEMD_DEVICE_VERIFY_SYSFS"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_DEVICE_VERIFY_SYSFS value: %m"); + if (r != 0) { + r = fd_is_fs_type(fd, SYSFS_MAGIC); + if (r < 0) + return log_debug_errno(r, "sd-device: failed to check if syspath \"%s\" is backed by sysfs: %m", syspath); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), + "sd-device: the syspath \"%s\" is outside of sysfs, refusing.", syspath); + } + } else { + /* must be a subdirectory of /sys */ + if (!path_startswith(_syspath, "/sys/")) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "sd-device: Syspath '%s' is not a subdirectory of /sys", + _syspath); + + r = path_simplify_alloc(_syspath, &syspath); + if (r < 0) + return r; + } +/* devpath is whatever startswith() yields for the "/sys" prefix of syspath; it has to begin with '/', i.e. "/sys" on its own is refused. */ + assert_se(devpath = startswith(syspath, "/sys")); + if (devpath[0] != '/') + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), "sd-device: \"/sys\" alone is not a valid device path."); + + r = device_add_property_internal(device, "DEVPATH", devpath); + if (r < 0) + return log_debug_errno(r, "sd-device: Failed to add \"DEVPATH\" property for device \"%s\": %m", syspath); +/* Hand the new string over to the device; devpath is stored right after the handover. */ + free_and_replace(device->syspath, syspath); + device->devpath = devpath; + + /* Unset sysname and sysnum, they will be assigned when requested. 
*/ + device->sysnum = NULL; + device->sysname = mfree(device->sysname); + return 0; +} +/* Creates a device object for syspath, always with on-disk verification. When strict is set, a path that does not start with "/sys/" is rejected with -EINVAL before anything is allocated. */ +static int device_new_from_syspath(sd_device **ret, const char *syspath, bool strict) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(syspath, -EINVAL); + + if (strict && !path_startswith(syspath, "/sys/")) + return -EINVAL; + + r = device_new_aux(&device); + if (r < 0) + return r; + + r = device_set_syspath(device, syspath, /* verify= */ true); + if (r < 0) + return r; + + *ret = TAKE_PTR(device); + return 0; +} +/* Public entry point: the strict variant of device_new_from_syspath(). */ +_public_ int sd_device_new_from_syspath(sd_device **ret, const char *syspath) { + return device_new_from_syspath(ret, syspath, /* strict = */ true); +} +/* Looks the device up below /sys/dev/char/ or /sys/dev/block/ and cross-checks the result: the device found must report the same devnum, and its subsystem must be "block" exactly when a block device was requested. Mismatches yield -ENXIO; a mode that is neither character nor block yields -ENOTTY; major 0 yields -ENODEV. */ +int device_new_from_mode_and_devnum(sd_device **ret, mode_t mode, dev_t devnum) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_free_ char *syspath = NULL; + const char *t, *subsystem = NULL; + dev_t n; + int r; + + assert(ret); + + if (S_ISCHR(mode)) + t = "char"; + else if (S_ISBLK(mode)) + t = "block"; + else + return -ENOTTY; + + if (major(devnum) == 0) + return -ENODEV; + + if (asprintf(&syspath, "/sys/dev/%s/" DEVNUM_FORMAT_STR, t, DEVNUM_FORMAT_VAL(devnum)) < 0) + return -ENOMEM; + + r = sd_device_new_from_syspath(&dev, syspath); + if (r < 0) + return r; + + r = sd_device_get_devnum(dev, &n); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (n != devnum) + return -ENXIO; + + r = sd_device_get_subsystem(dev, &subsystem); + if (r < 0 && r != -ENOENT) + return r; + if (streq_ptr(subsystem, "block") != !!S_ISBLK(mode)) + return -ENXIO; + + *ret = TAKE_PTR(dev); + return 0; +} +/* type must be 'b' (mapped to S_IFBLK) or 'c' (mapped to S_IFCHR). */ +_public_ int sd_device_new_from_devnum(sd_device **ret, char type, dev_t devnum) { + assert_return(ret, -EINVAL); + assert_return(IN_SET(type, 'b', 'c'), -EINVAL); + + return device_new_from_mode_and_devnum(ret, type == 'b' ? 
S_IFBLK : S_IFCHR, devnum); +} +/* Builds "/sys/class/net/" + ifname on the stack and creates the device from that syspath. */ +static int device_new_from_main_ifname(sd_device **ret, const char *ifname) { + const char *syspath; + + assert(ret); + assert(ifname); + + syspath = strjoina("/sys/class/net/", ifname); + return sd_device_new_from_syspath(ret, syspath); +} +/* Accepts, in this order: an interface index given as a string, a valid main interface name, and finally a name resolved via rtnl_resolve_link_alternative_name(). */ +_public_ int sd_device_new_from_ifname(sd_device **ret, const char *ifname) { + _cleanup_free_ char *main_name = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(ifname, -EINVAL); + + r = parse_ifindex(ifname); + if (r > 0) + return sd_device_new_from_ifindex(ret, r); + + if (ifname_valid(ifname)) { + r = device_new_from_main_ifname(ret, ifname); + if (r >= 0) + return r; + } + + r = rtnl_resolve_link_alternative_name(NULL, ifname, &main_name); + if (r < 0) + return r; + + return device_new_from_main_ifname(ret, main_name); +} +/* Turns the index into a name with format_ifname() (-ENODEV on failure) and verifies that the device found reports the same ifindex; otherwise -ENXIO. */ +_public_ int sd_device_new_from_ifindex(sd_device **ret, int ifindex) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + char ifname[IF_NAMESIZE]; + int r, i; + + assert_return(ret, -EINVAL); + assert_return(ifindex > 0, -EINVAL); + + if (format_ifname(ifindex, ifname) < 0) + return -ENODEV; + + r = device_new_from_main_ifname(&dev, ifname); + if (r < 0) + return r; + + r = sd_device_get_ifindex(dev, &i); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (i != ifindex) + return -ENXIO; + + *ret = TAKE_PTR(dev); + return 0; +} +/* Joins the four pieces into one path and creates a device from it. Returns 1 if a device was created, 0 if the path does not exist or is too long, and a negative errno otherwise. */ +static int device_strjoin_new( + const char *a, + const char *b, + const char *c, + const char *d, + sd_device **ret) { + + const char *p; + int r; + + p = strjoina(a, b, c, d); + if (access(p, F_OK) < 0) + return IN_SET(errno, ENOENT, ENAMETOOLONG) ? 
0 : -errno; /* If this sysfs is too long then it doesn't exist either */ + + r = sd_device_new_from_syspath(ret, p); + if (r < 0) + return r; + + return 1; +} +/* Maps a (subsystem, sysname) pair to a sysfs path. The special subsystems "subsystem", "module" and "drivers" are tried in their own layouts first; afterwards the generic /sys/bus, /sys/class and /sys/firmware locations are probed. Returns -ENODEV when none of the candidates exists. */ +_public_ int sd_device_new_from_subsystem_sysname( + sd_device **ret, + const char *subsystem, + const char *sysname) { + + char *name; + int r; + + assert_return(ret, -EINVAL); + assert_return(subsystem, -EINVAL); + assert_return(sysname, -EINVAL); + + if (!path_is_normalized(subsystem)) + return -EINVAL; + if (!path_is_normalized(sysname)) + return -EINVAL; + + /* translate sysname back to sysfs filename */ + name = strdupa_safe(sysname); + string_replace_char(name, '/', '!'); + + if (streq(subsystem, "subsystem")) { + FOREACH_STRING(s, "/sys/bus/", "/sys/class/") { + r = device_strjoin_new(s, name, NULL, NULL, ret); + if (r < 0) + return r; + if (r > 0) + return 0; + } + + } else if (streq(subsystem, "module")) { + r = device_strjoin_new("/sys/module/", name, NULL, NULL, ret); + if (r < 0) + return r; + if (r > 0) + return 0; + + } else if (streq(subsystem, "drivers")) { + const char *sep; + + sep = strchr(name, ':'); + if (sep && sep[1] != '\0') { /* Require ":" and something non-empty after that. */ + + const char *subsys = memdupa_suffix0(name, sep - name); + sep++; + + if (streq(sep, "drivers")) /* If the sysname is "drivers", then it's the drivers directory itself that is meant. 
*/ + r = device_strjoin_new("/sys/bus/", subsys, "/drivers", NULL, ret); + else + r = device_strjoin_new("/sys/bus/", subsys, "/drivers/", sep, ret); + if (r < 0) + return r; + if (r > 0) + return 0; + } + } +/* Generic fallbacks, probed in order: bus devices, class devices, firmware nodes. */ + r = device_strjoin_new("/sys/bus/", subsystem, "/devices/", name, ret); + if (r < 0) + return r; + if (r > 0) + return 0; + + r = device_strjoin_new("/sys/class/", subsystem, "/", name, ret); + if (r < 0) + return r; + if (r > 0) + return 0; + + r = device_strjoin_new("/sys/firmware/", subsystem, "/", name, ret); + if (r < 0) + return r; + if (r > 0) + return 0; + + return -ENODEV; +} +/* Creates a device from the st_mode/st_rdev pair of a stat result. */ +_public_ int sd_device_new_from_stat_rdev(sd_device **ret, const struct stat *st) { + assert_return(ret, -EINVAL); + assert_return(st, -EINVAL); + + return device_new_from_mode_and_devnum(ret, st->st_mode, st->st_rdev); +} +/* Creates a device from a path below "/dev". "/dev" itself and paths outside of it are rejected with -EINVAL. */ +_public_ int sd_device_new_from_devname(sd_device **ret, const char *devname) { + struct stat st; + dev_t devnum; + mode_t mode; + + assert_return(ret, -EINVAL); + assert_return(devname, -EINVAL); + + /* This function actually accepts both devlinks and devnames, i.e. both symlinks and device + * nodes below /dev/. */ + + /* Also ignore when the specified path is "/dev". */ + if (isempty(path_startswith(devname, "/dev"))) + return -EINVAL; + + if (device_path_parse_major_minor(devname, &mode, &devnum) >= 0) + /* Let's shortcut when "/dev/block/maj:min" or "/dev/char/maj:min" is specified. + * In that case, we can directly convert the path to syspath, hence it is not necessary + * that the specified path exists. So, this works fine without udevd being running. */ + return device_new_from_mode_and_devnum(ret, mode, devnum); + + if (stat(devname, &st) < 0) + return ERRNO_IS_DEVICE_ABSENT(errno) ? 
-ENODEV : -errno; + + return sd_device_new_from_stat_rdev(ret, &st); +} +/* Dispatches on the prefix: paths starting with "/dev" go through sd_device_new_from_devname(), everything else is treated as a syspath in non-strict mode. */ +_public_ int sd_device_new_from_path(sd_device **ret, const char *path) { + assert_return(ret, -EINVAL); + assert_return(path, -EINVAL); + + if (path_startswith(path, "/dev")) + return sd_device_new_from_devname(ret, path); + + return device_new_from_syspath(ret, path, /* strict = */ false); +} +/* Stores a copy of devtype in the device and mirrors it into the DEVTYPE property. */ +int device_set_devtype(sd_device *device, const char *devtype) { + _cleanup_free_ char *t = NULL; + int r; + + assert(device); + assert(devtype); + + t = strdup(devtype); + if (!t) + return -ENOMEM; + + r = device_add_property_internal(device, "DEVTYPE", t); + if (r < 0) + return r; + + return free_and_replace(device->devtype, t); +} +/* Parses name with parse_ifindex(); on success the original string becomes the IFINDEX property and the parsed number is cached in device->ifindex. */ +int device_set_ifindex(sd_device *device, const char *name) { + int r, ifindex; + + assert(device); + assert(name); + + ifindex = parse_ifindex(name); + if (ifindex < 0) + return ifindex; + + r = device_add_property_internal(device, "IFINDEX", name); + if (r < 0) + return r; + + device->ifindex = ifindex; + + return 0; +} +/* Normalizes a device node name into a newly allocated, simplified path under "/dev/": relative names get the "/dev/" prefix, absolute ones must already live below it. The caller owns *ret. */ +static int mangle_devname(const char *p, char **ret) { + char *q; + + assert(p); + assert(ret); + + if (!path_is_safe(p)) + return -EINVAL; + + /* When the path is absolute, it must start with "/dev/", but ignore "/dev/" itself. 
*/ + if (path_is_absolute(p)) { + if (isempty(path_startswith(p, "/dev/"))) + return -EINVAL; + + q = strdup(p); + } else + q = path_join("/dev/", p); + if (!q) + return -ENOMEM; + + path_simplify(q); + + *ret = q; + return 0; +} +/* Stores the mangled form of devname (see mangle_devname()) and mirrors it into the DEVNAME property. */ +int device_set_devname(sd_device *device, const char *devname) { + _cleanup_free_ char *t = NULL; + int r; + + assert(device); + assert(devname); + + r = mangle_devname(devname, &t); + if (r < 0) + return r; + + r = device_add_property_internal(device, "DEVNAME", t); + if (r < 0) + return r; + + return free_and_replace(device->devname, t); +} +/* Parses _devmode with safe_atou(); values above 07777 are rejected with -EINVAL. The original string becomes the DEVMODE property. */ +int device_set_devmode(sd_device *device, const char *_devmode) { + unsigned devmode; + int r; + + assert(device); + assert(_devmode); + + r = safe_atou(_devmode, &devmode); + if (r < 0) + return r; + + if (devmode > 07777) + return -EINVAL; + + r = device_add_property_internal(device, "DEVMODE", _devmode); + if (r < 0) + return r; + + device->devmode = devmode; + + return 0; +} +/* Parses major and the optional minor (minor defaults to 0). A major of 0 is silently ignored: the function returns 0 without touching the device. Otherwise MAJOR (and MINOR, if given) become properties and devnum is set. */ +int device_set_devnum(sd_device *device, const char *major, const char *minor) { + unsigned maj, min = 0; + int r; + + assert(device); + assert(major); + + r = safe_atou(major, &maj); + if (r < 0) + return r; + if (maj == 0) + return 0; + if (!DEVICE_MAJOR_VALID(maj)) + return -EINVAL; + + if (minor) { + r = safe_atou(minor, &min); + if (r < 0) + return r; + if (!DEVICE_MINOR_VALID(min)) + return -EINVAL; + } + + r = device_add_property_internal(device, "MAJOR", major); + if (r < 0) + return r; + + if (minor) { + r = device_add_property_internal(device, "MINOR", minor); + if (r < 0) + return r; + } + + device->devnum = makedev(maj, min); + + return 0; +} +/* Parses str as an unsigned 64-bit number; 0 is rejected with -EINVAL. The original string becomes the DISKSEQ property. */ +int device_set_diskseq(sd_device *device, const char *str) { + uint64_t diskseq; + int r; + + assert(device); + assert(str); + + r = safe_atou64(str, &diskseq); + if (r < 0) + return r; + if (diskseq == 0) + return -EINVAL; + + r = device_add_property_internal(device, "DISKSEQ", str); + if (r < 0) + return r; + + device->diskseq = diskseq; + + return 0; +} + 
+static int handle_uevent_line( + sd_device *device, + const char *key, + const char *value, + const char **major, + const char **minor) { +/* Dispatches one key/value pair of the uevent file. MAJOR and MINOR are only remembered through the out-pointers (the caller applies them together once the whole file is parsed); keys without a dedicated setter become plain properties. */ + assert(device); + assert(key); + assert(value); + assert(major); + assert(minor); + + if (streq(key, "DEVTYPE")) + return device_set_devtype(device, value); + if (streq(key, "IFINDEX")) + return device_set_ifindex(device, value); + if (streq(key, "DEVNAME")) + return device_set_devname(device, value); + if (streq(key, "DEVMODE")) + return device_set_devmode(device, value); + if (streq(key, "DISKSEQ")) + return device_set_diskseq(device, value); + if (streq(key, "MAJOR")) + *major = value; + else if (streq(key, "MINOR")) + *minor = value; + else + return device_add_property_internal(device, key, value); + + return 0; +} +/* Reads and parses the "uevent" file below the device's syspath. Does nothing if the file was already loaded or the device is sealed. NOTE(review): the INVALID_LINE state is declared but never assigned in the state machine that follows. */ +int device_read_uevent_file(sd_device *device) { + _cleanup_free_ char *uevent = NULL; + const char *syspath, *key = NULL, *value = NULL, *major = NULL, *minor = NULL; + char *path; + size_t uevent_len; + int r; + + enum { + PRE_KEY, + KEY, + PRE_VALUE, + VALUE, + INVALID_LINE, + } state = PRE_KEY; + + assert(device); + + if (device->uevent_loaded || device->sealed) + return 0; + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; +/* Marked as loaded before the read is attempted, so a failed read is not retried by later calls. */ + device->uevent_loaded = true; + + path = strjoina(syspath, "/uevent"); + + r = read_full_virtual_file(path, &uevent, &uevent_len); + if (r == -EACCES || ERRNO_IS_NEG_DEVICE_ABSENT(r)) + /* The uevent files may be write-only, the device may be already removed, or the device + * may not have the uevent file. 
*/ + return 0; + if (r < 0) + return log_device_debug_errno(device, r, "sd-device: Failed to read uevent file '%s': %m", path); +/* Single pass over the buffer; separators are overwritten with NUL in place so that key and value can be used as C strings pointing into the buffer. */ + for (size_t i = 0; i < uevent_len; i++) + switch (state) { + case PRE_KEY: + if (!strchr(NEWLINE, uevent[i])) { + key = &uevent[i]; + + state = KEY; + } + + break; + case KEY: + if (uevent[i] == '=') { + uevent[i] = '\0'; + + state = PRE_VALUE; + } else if (strchr(NEWLINE, uevent[i])) { + uevent[i] = '\0'; + log_device_debug(device, "sd-device: Invalid uevent line '%s', ignoring", key); + + state = PRE_KEY; + } + + break; + case PRE_VALUE: + value = &uevent[i]; + state = VALUE; + + _fallthrough_; /* to handle empty property */ + case VALUE: + if (strchr(NEWLINE, uevent[i])) { + uevent[i] = '\0'; + + r = handle_uevent_line(device, key, value, &major, &minor); + if (r < 0) + log_device_debug_errno(device, r, "sd-device: Failed to handle uevent entry '%s=%s', ignoring: %m", key, value); + + state = PRE_KEY; + } + + break; + default: + assert_not_reached(); + } +/* MAJOR and MINOR were only collected above; apply them together now. A failure is logged and otherwise ignored. */ + if (major) { + r = device_set_devnum(device, major, minor); + if (r < 0) + log_device_debug_errno(device, r, "sd-device: Failed to set 'MAJOR=%s' or 'MINOR=%s' from '%s', ignoring: %m", major, strna(minor), path); + } + + return 0; +} +/* Returns the interface index read from the uevent file, or -ENOENT if the device has none (ifindex <= 0). The out-parameter may be NULL. */ +_public_ int sd_device_get_ifindex(sd_device *device, int *ifindex) { + int r; + + assert_return(device, -EINVAL); + + r = device_read_uevent_file(device); + if (r < 0) + return r; + + if (device->ifindex <= 0) + return -ENOENT; + + if (ifindex) + *ifindex = device->ifindex; + + return 0; +} +/* Creates a device from a device ID string. The first character selects the form: 'b'/'c' followed by a device number, 'n' followed by an interface index, or '+' followed by subsystem:sysname. Anything else yields -EINVAL. */ +_public_ int sd_device_new_from_device_id(sd_device **ret, const char *id) { + int r; + + assert_return(ret, -EINVAL); + assert_return(id, -EINVAL); + + switch (id[0]) { + case 'b': + case 'c': { + dev_t devt; + /* id[0] is known to be 'b' or 'c' here, so the emptiness check has to look at what follows the type character. */ + if (isempty(id + 1)) + return -EINVAL; + + r = parse_devnum(id + 1, &devt); + if (r < 0) + return r; + + return sd_device_new_from_devnum(ret, id[0], devt); + } + + case 'n': { + int ifindex; + + ifindex = parse_ifindex(id + 1); + if (ifindex < 0) + 
return ifindex; + + return sd_device_new_from_ifindex(ret, ifindex); + } +/* '+' form: subsystem and sysname separated by ':'. A missing ':' or a subsystem part longer than NAME_MAX is rejected with -EINVAL. */ + case '+': { + const char *subsys, *sep; + + sep = strchr(id + 1, ':'); + if (!sep || sep - id - 1 > NAME_MAX) + return -EINVAL; + + subsys = memdupa_suffix0(id + 1, sep - id - 1); + + return sd_device_new_from_subsystem_sysname(ret, subsys, sep + 1); + } + + default: + return -EINVAL; + } +} +/* Hands out the stored syspath as a borrowed pointer (ret may be NULL). Asserts that the stored path starts with "/sys/". */ +_public_ int sd_device_get_syspath(sd_device *device, const char **ret) { + assert_return(device, -EINVAL); + + assert(path_startswith(device->syspath, "/sys/")); + + if (ret) + *ret = device->syspath; + + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_FULL( + device_by_path_hash_ops, + char, path_hash_func, path_compare, free, + sd_device, sd_device_unref); +/* Scans one directory below the device (subdir == NULL is passed through to device_opendir()). Only directory and symlink entries are considered. Entries that can be opened as a child device are stored in *children keyed by their path relative to the device; directories that are not devices are queued in *stack for a later scan. */ +static int device_enumerate_children_internal(sd_device *device, const char *subdir, Set **stack, Hashmap **children) { + _cleanup_closedir_ DIR *dir = NULL; + int r; + + assert(device); + assert(stack); + assert(children); + + r = device_opendir(device, subdir, &dir); + if (r < 0) + return r; + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + _cleanup_(sd_device_unrefp) sd_device *child = NULL; + _cleanup_free_ char *p = NULL; + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (!IN_SET(de->d_type, DT_LNK, DT_DIR)) + continue; + + if (subdir) + p = path_join(subdir, de->d_name); + else + p = strdup(de->d_name); + if (!p) + return -ENOMEM; + + /* Try to create child device. */ + r = sd_device_new_child(&child, device, p); + if (r >= 0) { + /* OK, this is a child device, saving it. */ + r = hashmap_ensure_put(children, &device_by_path_hash_ops, p, child); + if (r < 0) + return r; + + TAKE_PTR(p); + TAKE_PTR(child); + } else if (r == -ENODEV) { + /* This is not a child device. Push the sub-directory into stack, and read it later. 
*/ + + if (de->d_type == DT_LNK) + /* Do not follow symlinks, otherwise, we will enter an infinite loop, e.g., + * /sys/class/block/nvme0n1/subsystem/nvme0n1/subsystem/nvme0n1/subsystem/… */ + continue; + + r = set_ensure_consume(stack, &path_hash_ops_free, TAKE_PTR(p)); + if (r < 0) + return r; + } else + return r; + } + + return 0; +} +/* Enumerates the child devices of 'device' once and caches them in device->children. Returns 0 if that was already done, 1 if the enumeration ran now, and a negative errno on failure (in which case nothing is cached). */ +static int device_enumerate_children(sd_device *device) { + _cleanup_hashmap_free_ Hashmap *children = NULL; + _cleanup_set_free_ Set *stack = NULL; + int r; + + assert(device); + + if (device->children_enumerated) + return 0; /* Already enumerated. */ + + r = device_enumerate_children_internal(device, NULL, &stack, &children); + if (r < 0) + return r; +/* Drain the work list of sub-directories; each scan may queue further ones. */ + for (;;) { + _cleanup_free_ char *subdir = NULL; + + subdir = set_steal_first(stack); + if (!subdir) + break; + + r = device_enumerate_children_internal(device, subdir, &stack, &children); + if (r < 0) + return r; + } + + device->children_enumerated = true; + device->children = TAKE_PTR(children); + return 1; /* Enumerated. 
*/ +} +/* (Re)starts the iteration over the child devices, enumerating them on first use. If enumeration fails, the error is logged at debug level, *ret_suffix (if given) is set to NULL and NULL is returned. */ +_public_ sd_device *sd_device_get_child_first(sd_device *device, const char **ret_suffix) { + int r; + + assert(device); + + r = device_enumerate_children(device); + if (r < 0) { + log_device_debug_errno(device, r, "sd-device: failed to enumerate child devices: %m"); + if (ret_suffix) + *ret_suffix = NULL; + return NULL; + } + + device->children_iterator = ITERATOR_FIRST; + + return sd_device_get_child_next(device, ret_suffix); +} +/* Advances device->children_iterator over device->children. The child is returned without taking a reference; ret_suffix is passed straight to hashmap_iterate() as the key output. */ +_public_ sd_device *sd_device_get_child_next(sd_device *device, const char **ret_suffix) { + sd_device *child; + + assert(device); + + (void) hashmap_iterate(device->children, &device->children_iterator, (void**) &child, (const void**) ret_suffix); + return child; +} +/* Returns the child device located at the device's syspath joined with suffix. A child already cached in device->children is returned with a new reference; otherwise a new object is created. suffix must pass path_is_safe(). */ +_public_ int sd_device_new_child(sd_device **ret, sd_device *device, const char *suffix) { + _cleanup_free_ char *path = NULL; + sd_device *child; + const char *s; + int r; + + assert_return(ret, -EINVAL); + assert_return(device, -EINVAL); + assert_return(suffix, -EINVAL); + + if (!path_is_safe(suffix)) + return -EINVAL; + + /* If we have already enumerated children, try to find the child from the cache. 
*/ + child = hashmap_get(device->children, suffix); + if (child) { + *ret = sd_device_ref(child); + return 0; + } + + r = sd_device_get_syspath(device, &s); + if (r < 0) + return r; + + path = path_join(s, suffix); + if (!path) + return -ENOMEM; + + return sd_device_new_from_syspath(ret, path); +} +/* Walks up the directory hierarchy from the child's syspath, one component at a time, until a directory that can be opened as a device is found. Returns -ENODEV once "/sys" is reached; any error other than -ENODEV from the open attempt is returned as is. */ +static int device_new_from_child(sd_device **ret, sd_device *child) { + _cleanup_free_ char *path = NULL; + const char *syspath; + int r; + + assert(ret); + assert(child); + + r = sd_device_get_syspath(child, &syspath); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *p = NULL; + + r = path_extract_directory(path ?: syspath, &p); + if (r < 0) + return r; + + if (path_equal(p, "/sys")) + return -ENODEV; + + r = sd_device_new_from_syspath(ret, p); + if (r != -ENODEV) + return r; + + free_and_replace(path, p); + } +} +/* Looks up the parent lazily and caches the outcome, including "no parent" (-ENODEV from the lookup), via parent_set. The pointer stored in *ret is the cached one; no reference is taken for the caller. Returns -ENOENT if there is no parent. */ +_public_ int sd_device_get_parent(sd_device *child, sd_device **ret) { + int r; + + assert_return(child, -EINVAL); + + if (!child->parent_set) { + r = device_new_from_child(&child->parent, child); + if (r < 0 && r != -ENODEV) + return r; + + child->parent_set = true; + } + + if (!child->parent) + return -ENOENT; + + if (ret) + *ret = child->parent; + return 0; +} +/* Stores a copy of subsystem (NULL is allowed and passed on as a NULL property value), mirrors it into the SUBSYSTEM property and marks the subsystem as resolved. */ +int device_set_subsystem(sd_device *device, const char *subsystem) { + _cleanup_free_ char *s = NULL; + int r; + + assert(device); + + if (subsystem) { + s = strdup(subsystem); + if (!s) + return -ENOMEM; + } + + r = device_add_property_internal(device, "SUBSYSTEM", s); + if (r < 0) + return r; + + device->subsystem_set = true; + return free_and_replace(device->subsystem, s); +} +/* For driver objects: sets the subsystem to "drivers" and records the path component in front of "/drivers" in device->driver_subsystem. Returns -EINVAL if devpath contains no such component. */ +int device_set_drivers_subsystem(sd_device *device) { + _cleanup_free_ char *subsystem = NULL; + const char *devpath, *drivers, *p; + int r; + + assert(device); + + r = sd_device_get_devpath(device, &devpath); + if (r < 0) + return r; + + drivers = strstr(devpath, "/drivers/"); + if (!drivers) + drivers = endswith(devpath, "/drivers"); + if (!drivers) + return -EINVAL; + + /* Find the path component 
immediately before the "/drivers/" string */ + r = path_find_last_component(devpath, /* accept_dot_dot= */ false, &drivers, &p); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + subsystem = strndup(p, r); + if (!subsystem) + return -ENOMEM; + + r = device_set_subsystem(device, "drivers"); + if (r < 0) + return r; + + return free_and_replace(device->driver_subsystem, subsystem); +} +/* Resolves the subsystem lazily: from the 'subsystem' symlink if it can be read, otherwise from the implicit rules below (module, drivers, class/bus). If none applies the device is marked as having no subsystem. Returns -ENOENT when there is none; ret may be NULL. */ +_public_ int sd_device_get_subsystem(sd_device *device, const char **ret) { + int r; + + assert_return(device, -EINVAL); + + if (!device->subsystem_set) { + _cleanup_free_ char *subsystem = NULL; + const char *syspath; + char *path; + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + /* read 'subsystem' link */ + path = strjoina(syspath, "/subsystem"); + r = readlink_value(path, &subsystem); + if (r < 0 && r != -ENOENT) + return log_device_debug_errno(device, r, + "sd-device: Failed to read subsystem for %s: %m", + device->devpath); + + if (subsystem) + r = device_set_subsystem(device, subsystem); + /* use implicit names */ + else if (!isempty(path_startswith(device->devpath, "/module/"))) + r = device_set_subsystem(device, "module"); + else if (strstr(syspath, "/drivers/") || endswith(syspath, "/drivers")) + r = device_set_drivers_subsystem(device); + else if (!isempty(PATH_STARTSWITH_SET(device->devpath, "/class/", "/bus/"))) + r = device_set_subsystem(device, "subsystem"); + else { + device->subsystem_set = true; + r = 0; + } + if (r < 0) + return log_device_debug_errno(device, r, + "sd-device: Failed to set subsystem for %s: %m", + device->devpath); + } + + if (!device->subsystem) + return -ENOENT; + + if (ret) + *ret = device->subsystem; + return 0; +} +/* Returns the DEVTYPE read from the uevent file, or -ENOENT if there is none. NOTE(review): the final return value is the negated-twice devtype pointer, which is known to be non-NULL at that point, so success is reported as 1 here while the sibling getters return 0. */ +_public_ int sd_device_get_devtype(sd_device *device, const char **devtype) { + int r; + + assert_return(device, -EINVAL); + + r = device_read_uevent_file(device); + if (r < 0) + return r; + + if (!device->devtype) + return -ENOENT; + + if (devtype) + *devtype = device->devtype; + + return 
!!device->devtype; +} +/* Walks up the parent chain until a device with the given subsystem (and devtype, if one is given) is found. Lookup failures for subsystem/devtype on a parent are ignored and simply count as "no match". When the chain ends, the error of sd_device_get_parent() is returned. The result is a borrowed pointer. */ +_public_ int sd_device_get_parent_with_subsystem_devtype(sd_device *child, const char *subsystem, const char *devtype, sd_device **ret) { + sd_device *parent = NULL; + int r; + + assert_return(child, -EINVAL); + assert_return(subsystem, -EINVAL); + + r = sd_device_get_parent(child, &parent); + while (r >= 0) { + const char *parent_subsystem = NULL; + + (void) sd_device_get_subsystem(parent, &parent_subsystem); + if (streq_ptr(parent_subsystem, subsystem)) { + const char *parent_devtype = NULL; + + if (!devtype) + break; + + (void) sd_device_get_devtype(parent, &parent_devtype); + if (streq_ptr(parent_devtype, devtype)) + break; + } + r = sd_device_get_parent(parent, &parent); + } + + if (r < 0) + return r; + + if (ret) + *ret = parent; + return 0; +} +/* Returns the device number read from the uevent file; -ENOENT if the major number is 0. The out-parameter may be NULL. */ +_public_ int sd_device_get_devnum(sd_device *device, dev_t *devnum) { + int r; + + assert_return(device, -EINVAL); + + r = device_read_uevent_file(device); + if (r < 0) + return r; + + if (major(device->devnum) <= 0) + return -ENOENT; + + if (devnum) + *devnum = device->devnum; + + return 0; +} +/* Stores a copy of driver (NULL is allowed and passed on as a NULL property value), mirrors it into the DRIVER property and marks the driver as resolved. */ +int device_set_driver(sd_device *device, const char *driver) { + _cleanup_free_ char *d = NULL; + int r; + + assert(device); + + if (driver) { + d = strdup(driver); + if (!d) + return -ENOMEM; + } + + r = device_add_property_internal(device, "DRIVER", d); + if (r < 0) + return r; + + device->driver_set = true; + return free_and_replace(device->driver, d); +} +/* Resolves the driver lazily from the 'driver' symlink. A missing link (-ENOENT) is not an error: the driver is then recorded as unset and the function returns -ENOENT. */ +_public_ int sd_device_get_driver(sd_device *device, const char **ret) { + assert_return(device, -EINVAL); + + if (!device->driver_set) { + _cleanup_free_ char *driver = NULL; + const char *syspath; + char *path; + int r; + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + path = strjoina(syspath, "/driver"); + r = readlink_value(path, &driver); + if (r < 0 && r != -ENOENT) + return log_device_debug_errno(device, r, + "sd-device: readlink(\"%s\") failed: %m", path); + + r = device_set_driver(device, driver); + if (r 
< 0) + return log_device_debug_errno(device, r, + "sd-device: Failed to set driver \"%s\": %m", strna(driver)); + } + + if (!device->driver) + return -ENOENT; + + if (ret) + *ret = device->driver; + return 0; +} +/* Hands out the stored devpath as a borrowed pointer (ret may be NULL). Asserts that it is set and starts with '/'. */ +_public_ int sd_device_get_devpath(sd_device *device, const char **ret) { + assert_return(device, -EINVAL); + + assert(device->devpath); + assert(device->devpath[0] == '/'); + + if (ret) + *ret = device->devpath; + + return 0; +} +/* Returns the device node path read from the uevent file, or -ENOENT if the device has none. Asserts that a stored name lies below "/dev/". */ +_public_ int sd_device_get_devname(sd_device *device, const char **devname) { + int r; + + assert_return(device, -EINVAL); + + r = device_read_uevent_file(device); + if (r < 0) + return r; + + if (!device->devname) + return -ENOENT; + + assert(!isempty(path_startswith(device->devname, "/dev/"))); + + if (devname) + *devname = device->devname; + return 0; +} +/* Derives sysname from the last component of devpath ('!' replaced by '/') and sysnum from its trailing digits; a name consisting only of digits gets no sysnum. sysnum points into the sysname string. */ +static int device_set_sysname_and_sysnum(sd_device *device) { + _cleanup_free_ char *sysname = NULL; + size_t len, n; + int r; + + assert(device); + + r = path_extract_filename(device->devpath, &sysname); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return -EINVAL; + + /* some devices have '!' in their name, change that to '/' */ + string_replace_char(sysname, '!', '/'); + + n = strspn_from_end(sysname, DIGITS); + len = strlen(sysname); + assert(n <= len); + if (n == len) + n = 0; /* Do not set sysnum for number only sysname. */ + + device->sysnum = n > 0 ? 
sysname + len - n : NULL; + return free_and_replace(device->sysname, sysname); +} + +_public_ int sd_device_get_sysname(sd_device *device, const char **ret) { + int r; + + assert_return(device, -EINVAL); + + if (!device->sysname) { + r = device_set_sysname_and_sysnum(device); + if (r < 0) + return r; + } + + if (ret) + *ret = device->sysname; + return 0; +} + +_public_ int sd_device_get_sysnum(sd_device *device, const char **ret) { + int r; + + assert_return(device, -EINVAL); + + if (!device->sysname) { + r = device_set_sysname_and_sysnum(device); + if (r < 0) + return r; + } + + if (!device->sysnum) + return -ENOENT; + + if (ret) + *ret = device->sysnum; + return 0; +} + +_public_ int sd_device_get_action(sd_device *device, sd_device_action_t *ret) { + assert_return(device, -EINVAL); + + if (device->action < 0) + return -ENOENT; + + if (ret) + *ret = device->action; + + return 0; +} + +_public_ int sd_device_get_seqnum(sd_device *device, uint64_t *ret) { + assert_return(device, -EINVAL); + + if (device->seqnum == 0) + return -ENOENT; + + if (ret) + *ret = device->seqnum; + + return 0; +} + +_public_ int sd_device_get_diskseq(sd_device *device, uint64_t *ret) { + int r; + + assert_return(device, -EINVAL); + + r = device_read_uevent_file(device); + if (r < 0) + return r; + + if (device->diskseq == 0) + return -ENOENT; + + if (ret) + *ret = device->diskseq; + + return 0; +} + +static bool is_valid_tag(const char *tag) { + assert(tag); + + return in_charset(tag, ALPHANUMERICAL "-_") && filename_is_valid(tag); +} + +int device_add_tag(sd_device *device, const char *tag, bool both) { + int r, added; + + assert(device); + assert(tag); + + if (!is_valid_tag(tag)) + return -EINVAL; + + /* Definitely add to the "all" list of tags (i.e. 
the sticky list) */ + added = set_put_strdup(&device->all_tags, tag); + if (added < 0) + return added; + + /* And optionally, also add it to the current list of tags */ + if (both) { + r = set_put_strdup(&device->current_tags, tag); + if (r < 0) { + /* Roll back the insertion into all_tags done above, but only if this call really added + * the entry. set_remove() merely unlinks the entry and hands the stored string back + * (device_remove_devlink() frees it the same way), so release it here instead of + * discarding the pointer. */ + if (added > 0) + free(set_remove(device->all_tags, tag)); + + return r; + } + } + + /* Invalidate running tag iterators and the cached TAGS/CURRENT_TAGS properties. */ + device->tags_generation++; + device->property_tags_outdated = true; + + return 0; +} + +int device_add_devlink(sd_device *device, const char *devlink) { + char *p; + int r; + + assert(device); + assert(devlink); + + r = mangle_devname(devlink, &p); + if (r < 0) + return r; + + r = set_ensure_consume(&device->devlinks, &path_hash_ops_free, p); + if (r < 0) + return r; + + device->devlinks_generation++; + device->property_devlinks_outdated = true; + + return r; /* return 1 when newly added, 0 when already exists */ +} + +int device_remove_devlink(sd_device *device, const char *devlink) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + assert(device); + assert(devlink); + + r = mangle_devname(devlink, &p); + if (r < 0) + return r; + + s = set_remove(device->devlinks, p); + if (!s) + return 0; /* does not exist */ + + device->devlinks_generation++; + device->property_devlinks_outdated = true; + return 1; /* removed */ +} + +bool device_has_devlink(sd_device *device, const char *devlink) { + assert(device); + assert(devlink); + + return set_contains(device->devlinks, devlink); +} + +static int device_add_property_internal_from_string(sd_device *device, const char *str) { + _cleanup_free_ char *key = NULL; + char *value; + int r; + + assert(device); + assert(str); + + key = strdup(str); + if (!key) + return -ENOMEM; + + value = strchr(key, '='); + if (!value) + return -EINVAL; + + *value = '\0'; + + if (isempty(++value)) + value = NULL; + + /* Add the property to both sd_device::properties and sd_device::properties_db, + * as this is called by only handle_db_line(). 
*/ + r = device_add_property_aux(device, key, value, false); + if (r < 0) + return r; + + return device_add_property_aux(device, key, value, true); +} + +int device_set_usec_initialized(sd_device *device, usec_t when) { + char s[DECIMAL_STR_MAX(usec_t)]; + int r; + + assert(device); + + xsprintf(s, USEC_FMT, when); + + r = device_add_property_internal(device, "USEC_INITIALIZED", s); + if (r < 0) + return r; + + device->usec_initialized = when; + return 0; +} + +static int handle_db_line(sd_device *device, char key, const char *value) { + int r; + + assert(device); + assert(value); + + switch (key) { + case 'G': /* Any tag */ + case 'Q': /* Current tag */ + return device_add_tag(device, value, key == 'Q'); + + case 'S': { + const char *path; + + path = strjoina("/dev/", value); + return device_add_devlink(device, path); + } + case 'E': + return device_add_property_internal_from_string(device, value); + + case 'I': { + usec_t t; + + r = safe_atou64(value, &t); + if (r < 0) + return r; + + return device_set_usec_initialized(device, t); + } + case 'L': + return safe_atoi(value, &device->devlink_priority); + + case 'W': + /* Deprecated. Previously, watch handle is both saved in database and /run/udev/watch. + * However, the handle saved in database may not be updated when the handle is updated + * or removed. Moreover, it is not necessary to store the handle within the database, + * as its value becomes meaningless when udevd is restarted. 
*/ + return 0; + + case 'V': + return safe_atou(value, &device->database_version); + + default: + log_device_debug(device, "sd-device: Unknown key '%c' in device db, ignoring", key); + return 0; + } +} + +int device_get_device_id(sd_device *device, const char **ret) { + assert(device); + assert(ret); + + if (!device->device_id) { + _cleanup_free_ char *id = NULL; + const char *subsystem; + dev_t devnum; + int ifindex, r; + + r = sd_device_get_subsystem(device, &subsystem); + if (r < 0) + return r; + + if (sd_device_get_devnum(device, &devnum) >= 0) { + /* use dev_t — b259:131072, c254:0 */ + if (asprintf(&id, "%c" DEVNUM_FORMAT_STR, + streq(subsystem, "block") ? 'b' : 'c', + DEVNUM_FORMAT_VAL(devnum)) < 0) + return -ENOMEM; + } else if (sd_device_get_ifindex(device, &ifindex) >= 0) { + /* use netdev ifindex — n3 */ + if (asprintf(&id, "n%u", (unsigned) ifindex) < 0) + return -ENOMEM; + } else { + _cleanup_free_ char *sysname = NULL; + + /* use $subsys:$sysname — pci:0000:00:1f.2 + * sd_device_get_sysname() has '!' 
translated, get it from devpath */ + r = path_extract_filename(device->devpath, &sysname); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return -EINVAL; + + if (streq(subsystem, "drivers")) { + /* the 'drivers' pseudo-subsystem is special, and needs the real + * subsystem encoded as well */ + assert(device->driver_subsystem); + id = strjoin("+drivers:", device->driver_subsystem, ":", sysname); + } else + id = strjoin("+", subsystem, ":", sysname); + if (!id) + return -ENOMEM; + } + + if (!filename_is_valid(id)) + return -EINVAL; + + device->device_id = TAKE_PTR(id); + } + + *ret = device->device_id; + return 0; +} + +int device_read_db_internal_filename(sd_device *device, const char *filename) { + _cleanup_free_ char *db = NULL; + const char *value; + size_t db_len; + char key = '\0'; /* Unnecessary initialization to appease gcc-12.0.0-0.4.fc36 */ + int r; + + enum { + PRE_KEY, + KEY, + PRE_VALUE, + VALUE, + INVALID_LINE, + } state = PRE_KEY; + + assert(device); + assert(filename); + + r = read_full_file(filename, &db, &db_len); + if (r < 0) { + if (r == -ENOENT) + return 0; + + return log_device_debug_errno(device, r, "sd-device: Failed to read db '%s': %m", filename); + } + + /* devices with a database entry are initialized */ + device->is_initialized = true; + + device->db_loaded = true; + + for (size_t i = 0; i < db_len; i++) + switch (state) { + case PRE_KEY: + if (!strchr(NEWLINE, db[i])) { + key = db[i]; + + state = KEY; + } + + break; + case KEY: + if (db[i] != ':') { + log_device_debug(device, "sd-device: Invalid db entry with key '%c', ignoring", key); + + state = INVALID_LINE; + } else { + db[i] = '\0'; + + state = PRE_VALUE; + } + + break; + case PRE_VALUE: + value = &db[i]; + + state = VALUE; + + break; + case INVALID_LINE: + if (strchr(NEWLINE, db[i])) + state = PRE_KEY; + + break; + case VALUE: + if (strchr(NEWLINE, db[i])) { + db[i] = '\0'; + r = handle_db_line(device, key, value); + if (r < 0) + log_device_debug_errno(device, r, "sd-device: 
Failed to handle db entry '%c:%s', ignoring: %m", + key, value); + + state = PRE_KEY; + } + + break; + default: + return log_device_debug_errno(device, SYNTHETIC_ERRNO(EINVAL), "sd-device: invalid db syntax."); + } + + return 0; /* NOTE: handle_db_line() is only invoked when a newline character is seen, so a final entry that is not newline-terminated is not processed */ +} + +int device_read_db_internal(sd_device *device, bool force) { + const char *id, *path; + int r; + + assert(device); + + if (device->db_loaded || (!force && device->sealed)) + return 0; /* already loaded, or the device is sealed and the caller did not force a read */ + + r = device_get_device_id(device, &id); + if (r < 0) + return r; + + path = strjoina("/run/udev/data/", id); + + return device_read_db_internal_filename(device, path); +} + +_public_ int sd_device_get_is_initialized(sd_device *device) { + int r; + + assert_return(device, -EINVAL); + + r = device_read_db(device); + if (r == -ENOENT) + /* The device may be already removed or renamed. */ + return false; + if (r < 0) + return r; + + return device->is_initialized; /* set to true by device_read_db_internal_filename() once a db file was read */ +} + +_public_ int sd_device_get_usec_initialized(sd_device *device, uint64_t *ret) { + int r; + + assert_return(device, -EINVAL); + + r = sd_device_get_is_initialized(device); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; /* the device is not (yet) marked as initialized */ + + if (device->usec_initialized == 0) + return -ENODATA; /* initialized, but no timestamp has been recorded */ + + if (ret) + *ret = device->usec_initialized; + + return 0; +} + +_public_ int sd_device_get_usec_since_initialized(sd_device *device, uint64_t *ret) { + usec_t now_ts, ts; + int r; + + assert_return(device, -EINVAL); + + r = sd_device_get_usec_initialized(device, &ts); + if (r < 0) + return r; + + now_ts = now(CLOCK_MONOTONIC); + + if (now_ts < ts) + return -EIO; /* the recorded timestamp is later than the current monotonic time */ + + if (ret) + *ret = usec_sub_unsigned(now_ts, ts); + + return 0; +} + +_public_ const char *sd_device_get_tag_first(sd_device *device) { + void *v; + + assert_return(device, NULL); + + (void) device_read_db(device); + + /* Restart the iteration and remember the generation it belongs to; sd_device_get_tag_next() + * stops as soon as the generation no longer matches. */ + device->all_tags_iterator_generation = device->tags_generation; + device->all_tags_iterator = ITERATOR_FIRST; + + (void) set_iterate(device->all_tags, &device->all_tags_iterator, &v); + return v; +} + +_public_ const char 
*sd_device_get_tag_next(sd_device *device) { + void *v; + + assert_return(device, NULL); + + (void) device_read_db(device); + + if (device->all_tags_iterator_generation != device->tags_generation) + return NULL; + + (void) set_iterate(device->all_tags, &device->all_tags_iterator, &v); + return v; +} + +static bool device_database_supports_current_tags(sd_device *device) { + assert(device); + + (void) device_read_db(device); + + /* The current tags (saved in Q field) feature is implemented in database version 1. + * If the database version is 0, then the tags (NOT current tags, saved in G field) are not + * sticky. Thus, we can safely bypass the operations for the current tags (Q) to tags (G). */ + + return device->database_version >= 1; +} + +_public_ const char *sd_device_get_current_tag_first(sd_device *device) { + void *v; + + assert_return(device, NULL); + + if (!device_database_supports_current_tags(device)) + return sd_device_get_tag_first(device); + + (void) device_read_db(device); + + device->current_tags_iterator_generation = device->tags_generation; + device->current_tags_iterator = ITERATOR_FIRST; + + (void) set_iterate(device->current_tags, &device->current_tags_iterator, &v); + return v; +} + +_public_ const char *sd_device_get_current_tag_next(sd_device *device) { + void *v; + + assert_return(device, NULL); + + if (!device_database_supports_current_tags(device)) + return sd_device_get_tag_next(device); + + (void) device_read_db(device); + + if (device->current_tags_iterator_generation != device->tags_generation) + return NULL; + + (void) set_iterate(device->current_tags, &device->current_tags_iterator, &v); + return v; +} + +_public_ const char *sd_device_get_devlink_first(sd_device *device) { + void *v; + + assert_return(device, NULL); + + (void) device_read_db(device); + + device->devlinks_iterator_generation = device->devlinks_generation; + device->devlinks_iterator = ITERATOR_FIRST; + + (void) set_iterate(device->devlinks, 
&device->devlinks_iterator, &v); + return v; +} + +_public_ const char *sd_device_get_devlink_next(sd_device *device) { + void *v; + + assert_return(device, NULL); + + (void) device_read_db(device); + + if (device->devlinks_iterator_generation != device->devlinks_generation) + return NULL; + + (void) set_iterate(device->devlinks, &device->devlinks_iterator, &v); + return v; +} + +int device_properties_prepare(sd_device *device) { + int r; + + assert(device); + + r = device_read_uevent_file(device); + if (r < 0) + return r; + + r = device_read_db(device); + if (r < 0) + return r; + + if (device->property_devlinks_outdated) { + _cleanup_free_ char *devlinks = NULL; + + r = set_strjoin(device->devlinks, " ", false, &devlinks); + if (r < 0) + return r; + + if (!isempty(devlinks)) { + r = device_add_property_internal(device, "DEVLINKS", devlinks); + if (r < 0) + return r; + } + + device->property_devlinks_outdated = false; + } + + if (device->property_tags_outdated) { + _cleanup_free_ char *tags = NULL; + + r = set_strjoin(device->all_tags, ":", true, &tags); + if (r < 0) + return r; + + if (!isempty(tags)) { + r = device_add_property_internal(device, "TAGS", tags); + if (r < 0) + return r; + } + + tags = mfree(tags); + r = set_strjoin(device->current_tags, ":", true, &tags); + if (r < 0) + return r; + + if (!isempty(tags)) { + r = device_add_property_internal(device, "CURRENT_TAGS", tags); + if (r < 0) + return r; + } + + device->property_tags_outdated = false; + } + + return 0; +} + +_public_ const char *sd_device_get_property_first(sd_device *device, const char **_value) { + const char *key; + int r; + + assert_return(device, NULL); + + r = device_properties_prepare(device); + if (r < 0) + return NULL; + + device->properties_iterator_generation = device->properties_generation; + device->properties_iterator = ITERATOR_FIRST; + + (void) ordered_hashmap_iterate(device->properties, &device->properties_iterator, (void**)_value, (const void**)&key); + return key; +} + 
+_public_ const char *sd_device_get_property_next(sd_device *device, const char **_value) { + const char *key; + int r; + + assert_return(device, NULL); + + r = device_properties_prepare(device); + if (r < 0) + return NULL; + + if (device->properties_iterator_generation != device->properties_generation) + return NULL; + + (void) ordered_hashmap_iterate(device->properties, &device->properties_iterator, (void**)_value, (const void**)&key); + return key; +} + +static int device_sysattrs_read_all_internal(sd_device *device, const char *subdir, Set **stack) { + _cleanup_closedir_ DIR *dir = NULL; + int r; + + assert(device); + assert(stack); + + r = device_opendir(device, subdir, &dir); + if (r == -ENOENT && subdir) + return 0; /* Maybe, this is a child device, and is already removed. */ + if (r < 0) + return r; + + if (subdir) { + if (faccessat(dirfd(dir), "uevent", F_OK, 0) >= 0) + return 0; /* this is a child device, skipping */ + if (errno != ENOENT) { + log_device_debug_errno(device, errno, + "sd-device: Failed to access %s/uevent, ignoring sub-directory %s: %m", + subdir, subdir); + return 0; + } + } + + FOREACH_DIRENT_ALL(de, dir, return -errno) { + _cleanup_free_ char *p = NULL; + struct stat statbuf; + + if (dot_or_dot_dot(de->d_name)) + continue; + + /* only handle symlinks, regular files, and directories */ + if (!IN_SET(de->d_type, DT_LNK, DT_REG, DT_DIR)) + continue; + + if (subdir) { + p = path_join(subdir, de->d_name); + if (!p) + return -ENOMEM; + } + + if (de->d_type == DT_DIR) { + /* push the sub-directory into the stack, and read it later. 
*/ + if (p) + r = set_ensure_consume(stack, &path_hash_ops_free, TAKE_PTR(p)); + else + r = set_put_strdup_full(stack, &path_hash_ops_free, de->d_name); + if (r < 0) + return r; + + continue; + } + + if (fstatat(dirfd(dir), de->d_name, &statbuf, AT_SYMLINK_NOFOLLOW) < 0) + continue; + + if ((statbuf.st_mode & (S_IRUSR | S_IWUSR)) == 0) + continue; + + if (p) + r = set_ensure_consume(&device->sysattrs, &path_hash_ops_free, TAKE_PTR(p)); + else + r = set_put_strdup_full(&device->sysattrs, &path_hash_ops_free, de->d_name); + if (r < 0) + return r; + } + + return 0; +} + +static int device_sysattrs_read_all(sd_device *device) { + _cleanup_set_free_ Set *stack = NULL; + int r; + + assert(device); + + if (device->sysattrs_read) + return 0; + + r = device_sysattrs_read_all_internal(device, NULL, &stack); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *subdir = NULL; + + subdir = set_steal_first(stack); + if (!subdir) + break; + + r = device_sysattrs_read_all_internal(device, subdir, &stack); + if (r < 0) + return r; + } + + device->sysattrs_read = true; + + return 0; +} + +_public_ const char *sd_device_get_sysattr_first(sd_device *device) { + void *v; + int r; + + assert_return(device, NULL); + + if (!device->sysattrs_read) { + r = device_sysattrs_read_all(device); + if (r < 0) { + errno = -r; + return NULL; + } + } + + device->sysattrs_iterator = ITERATOR_FIRST; + + (void) set_iterate(device->sysattrs, &device->sysattrs_iterator, &v); + return v; +} + +_public_ const char *sd_device_get_sysattr_next(sd_device *device) { + void *v; + + assert_return(device, NULL); + + if (!device->sysattrs_read) + return NULL; + + (void) set_iterate(device->sysattrs, &device->sysattrs_iterator, &v); + return v; +} + +_public_ int sd_device_has_tag(sd_device *device, const char *tag) { + assert_return(device, -EINVAL); + assert_return(tag, -EINVAL); + + (void) device_read_db(device); + + return set_contains(device->all_tags, tag); +} + +_public_ int 
sd_device_has_current_tag(sd_device *device, const char *tag) { + assert_return(device, -EINVAL); + assert_return(tag, -EINVAL); + + if (!device_database_supports_current_tags(device)) + return sd_device_has_tag(device, tag); + + (void) device_read_db(device); + + return set_contains(device->current_tags, tag); +} + +_public_ int sd_device_get_property_value(sd_device *device, const char *key, const char **ret_value) { + const char *value; + int r; + + assert_return(device, -EINVAL); + assert_return(key, -EINVAL); + + r = device_properties_prepare(device); + if (r < 0) + return r; + + value = ordered_hashmap_get(device->properties, key); + if (!value) + return -ENOENT; + + if (ret_value) + *ret_value = value; + return 0; +} + +int device_get_property_bool(sd_device *device, const char *key) { + const char *value; + int r; + + assert(device); + assert(key); + + r = sd_device_get_property_value(device, key, &value); + if (r < 0) + return r; + + return parse_boolean(value); +} + +int device_get_property_int(sd_device *device, const char *key, int *ret) { + const char *value; + int r, v; + + assert(device); + assert(key); + + r = sd_device_get_property_value(device, key, &value); + if (r < 0) + return r; + + r = safe_atoi(value, &v); + if (r < 0) + return r; + + if (ret) + *ret = v; + return 0; +} + +_public_ int sd_device_get_trigger_uuid(sd_device *device, sd_id128_t *ret) { + const char *s; + sd_id128_t id; + int r; + + assert_return(device, -EINVAL); + + /* Retrieves the UUID attached to a uevent when triggering it from userspace via + * sd_device_trigger_with_uuid() or an equivalent interface. 
Returns -ENOENT if the record is not + * caused by a synthetic event and -ENODATA if it was but no UUID was specified */ + + r = sd_device_get_property_value(device, "SYNTH_UUID", &s); + if (r < 0) + return r; + + if (streq(s, "0")) /* SYNTH_UUID=0 is set whenever a device is triggered by userspace without specifying a UUID */ + return -ENODATA; + + r = sd_id128_from_string(s, &id); + if (r < 0) + return r; + + if (ret) + *ret = id; + + return 0; +} + +void device_clear_sysattr_cache(sd_device *device) { + device->sysattr_values = hashmap_free(device->sysattr_values); +} + +int device_cache_sysattr_value(sd_device *device, const char *key, char *value) { + _unused_ _cleanup_free_ char *old_value = NULL; + _cleanup_free_ char *new_key = NULL; + int r; + + assert(device); + assert(key); + + /* This takes the reference of the input value. The input value may be NULL. + * This replaces the value if it already exists. */ + + /* First, remove the old cache entry. So, we do not need to clear cache on error. */ + old_value = hashmap_remove2(device->sysattr_values, key, (void **) &new_key); + if (!new_key) { + new_key = strdup(key); + if (!new_key) + return -ENOMEM; + } + + r = hashmap_ensure_put(&device->sysattr_values, &path_hash_ops_free_free, new_key, value); + if (r < 0) + return r; + + TAKE_PTR(new_key); + + return 0; +} + +int device_get_cached_sysattr_value(sd_device *device, const char *key, const char **ret_value) { + const char *k = NULL, *value; + + assert(device); + assert(key); + + value = hashmap_get2(device->sysattr_values, key, (void **) &k); + if (!k) + return -ESTALE; /* We have not read the attribute. */ + if (!value) + return -ENOENT; /* We have looked up the attribute before and it did not exist. */ + if (ret_value) + *ret_value = value; + return 0; +} + +/* We cache all sysattr lookups. 
If an attribute does not exist, it is stored + * with a NULL value in the cache, otherwise the returned string is stored */ +_public_ int sd_device_get_sysattr_value(sd_device *device, const char *sysattr, const char **ret_value) { + _cleanup_free_ char *value = NULL, *path = NULL; + const char *syspath; + struct stat statbuf; + int r; + + assert_return(device, -EINVAL); + assert_return(sysattr, -EINVAL); + + /* Look for a possibly already cached result. -ESTALE means the attribute has not been looked up + * yet; every other result (a cached value, or a cached negative lookup) is returned as is. */ + r = device_get_cached_sysattr_value(device, sysattr, ret_value); + if (r != -ESTALE) + return r; + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + path = path_join(syspath, sysattr); + if (!path) + return -ENOMEM; + + if (lstat(path, &statbuf) < 0) { + int k; + + r = -errno; + + /* Remember that we could not access the sysattr by caching a NULL value. A failure to + * cache is only logged; the lstat() error is what gets returned. */ + k = device_cache_sysattr_value(device, sysattr, NULL); + if (k < 0) + log_device_debug_errno(device, k, + "sd-device: failed to cache attribute '%s' with NULL, ignoring: %m", + sysattr); + + return r; + } else if (S_ISLNK(statbuf.st_mode)) { + /* Some core links return only the last element of the target path, + * these are just values, the paths should not be exposed. Any other symlink is + * rejected with -EINVAL. */ + if (STR_IN_SET(sysattr, "driver", "subsystem", "module")) { + r = readlink_value(path, &value); + if (r < 0) + return r; + } else + return -EINVAL; + } else if (S_ISDIR(statbuf.st_mode)) + /* skip directories */ + return -EISDIR; + else if (!(statbuf.st_mode & S_IRUSR)) + /* skip files the owner cannot read (S_IRUSR not set) */ + return -EPERM; + else { + size_t size; + + /* Read the attribute value. Some attributes contain embedded '\0', so it is necessary to + * also get the size of the result. See issue #20025. */ + r = read_full_virtual_file(path, &value, &size); + if (r < 0) + return r; + + /* Drop trailing newlines, walking backwards from the end of the buffer. + * NOTE(review): if NEWLINE is a plain string constant, strchr() also matches '\0', so + * trailing NUL bytes would be dropped as well -- confirm. */ + while (size > 0 && strchr(NEWLINE, value[--size])) + value[size] = '\0'; + } + + /* Unfortunately, we need to return 'const char*' instead of 'char*'. 
Hence, failure in caching + * sysattr value is critical unlike the other places. */ + r = device_cache_sysattr_value(device, sysattr, value); + if (r < 0) { + log_device_debug_errno(device, r, + "sd-device: failed to cache attribute '%s' with '%s'%s: %m", + sysattr, value, ret_value ? "" : ", ignoring"); + if (ret_value) + return r; + + return 0; + } + + if (ret_value) + *ret_value = value; + + TAKE_PTR(value); + return 0; +} + +int device_get_sysattr_int(sd_device *device, const char *sysattr, int *ret_value) { + const char *value; + int r; + + r = sd_device_get_sysattr_value(device, sysattr, &value); + if (r < 0) + return r; + + int v; + r = safe_atoi(value, &v); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to parse '%s' attribute: %m", sysattr); + + if (ret_value) + *ret_value = v; + /* We return "true" if the value is positive. */ + return v > 0; +} + +int device_get_sysattr_unsigned(sd_device *device, const char *sysattr, unsigned *ret_value) { + const char *value; + int r; + + r = sd_device_get_sysattr_value(device, sysattr, &value); + if (r < 0) + return r; + + unsigned v; + r = safe_atou(value, &v); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to parse '%s' attribute: %m", sysattr); + + if (ret_value) + *ret_value = v; + /* We return "true" if the value is positive. 
*/ + return v > 0; +} + +int device_get_sysattr_bool(sd_device *device, const char *sysattr) { + const char *value; + int r; + + assert(device); + assert(sysattr); + + r = sd_device_get_sysattr_value(device, sysattr, &value); + if (r < 0) + return r; + + return parse_boolean(value); +} + +static void device_remove_cached_sysattr_value(sd_device *device, const char *_key) { + _cleanup_free_ char *key = NULL; + + assert(device); + assert(_key); + + free(hashmap_remove2(device->sysattr_values, _key, (void **) &key)); +} + +_public_ int sd_device_set_sysattr_value(sd_device *device, const char *sysattr, const char *_value) { + _cleanup_free_ char *value = NULL, *path = NULL; + const char *syspath; + size_t len; + int r; + + assert_return(device, -EINVAL); + assert_return(sysattr, -EINVAL); + + /* Set the attribute and save it in the cache. */ + + if (!_value) { + /* If input value is NULL, then clear cache and not write anything. */ + device_remove_cached_sysattr_value(device, sysattr); + return 0; + } + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + path = path_join(syspath, sysattr); + if (!path) + return -ENOMEM; + + len = strlen(_value); + + /* drop trailing newlines */ + while (len > 0 && strchr(NEWLINE, _value[len - 1])) + len --; + + /* value length is limited to 4k */ + if (len > 4096) + return -EINVAL; + + value = strndup(_value, len); + if (!value) + return -ENOMEM; + + r = write_string_file(path, value, WRITE_STRING_FILE_DISABLE_BUFFER | WRITE_STRING_FILE_NOFOLLOW); + if (r < 0) { + /* On failure, clear cache entry, as we do not know how it fails. */ + device_remove_cached_sysattr_value(device, sysattr); + return r; + } + + /* Do not cache action string written into uevent file. 
*/ + if (streq(sysattr, "uevent")) + return 0; + + r = device_cache_sysattr_value(device, sysattr, value); + if (r < 0) + log_device_debug_errno(device, r, + "sd-device: failed to cache attribute '%s' with '%s', ignoring: %m", + sysattr, value); + else + TAKE_PTR(value); + + return 0; +} + +_public_ int sd_device_set_sysattr_valuef(sd_device *device, const char *sysattr, const char *format, ...) { + _cleanup_free_ char *value = NULL; + va_list ap; + int r; + + assert_return(device, -EINVAL); + assert_return(sysattr, -EINVAL); + + if (!format) { + device_remove_cached_sysattr_value(device, sysattr); + return 0; + } + + va_start(ap, format); + r = vasprintf(&value, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return sd_device_set_sysattr_value(device, sysattr, value); +} + +_public_ int sd_device_trigger(sd_device *device, sd_device_action_t action) { + const char *s; + + assert_return(device, -EINVAL); + + s = device_action_to_string(action); + if (!s) + return -EINVAL; + + /* This uses the simple no-UUID interface of kernel < 4.13 */ + return sd_device_set_sysattr_value(device, "uevent", s); +} + +_public_ int sd_device_trigger_with_uuid( + sd_device *device, + sd_device_action_t action, + sd_id128_t *ret_uuid) { + + const char *s, *j; + sd_id128_t u; + int r; + + assert_return(device, -EINVAL); + + /* If no one wants to know the UUID, use the simple interface from pre-4.13 times */ + if (!ret_uuid) + return sd_device_trigger(device, action); + + s = device_action_to_string(action); + if (!s) + return -EINVAL; + + r = sd_id128_randomize(&u); + if (r < 0) + return r; + + j = strjoina(s, " ", SD_ID128_TO_UUID_STRING(u)); + + r = sd_device_set_sysattr_value(device, "uevent", j); + if (r < 0) + return r; + + *ret_uuid = u; + return 0; +} + +_public_ int sd_device_open(sd_device *device, int flags) { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *devname, *subsystem = NULL; + uint64_t q, diskseq = 0; + struct stat st; + dev_t devnum; + 
int r; + + assert_return(device, -EINVAL); + assert_return(FLAGS_SET(flags, O_PATH) || !FLAGS_SET(flags, O_NOFOLLOW), -EINVAL); + + r = sd_device_get_devname(device, &devname); + if (r == -ENOENT) + return -ENOEXEC; + if (r < 0) + return r; + + r = sd_device_get_devnum(device, &devnum); + if (r == -ENOENT) + return -ENOEXEC; + if (r < 0) + return r; + + r = sd_device_get_subsystem(device, &subsystem); + if (r < 0 && r != -ENOENT) + return r; + + fd = open(devname, FLAGS_SET(flags, O_PATH) ? flags : O_CLOEXEC|O_NOFOLLOW|O_PATH); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if (st.st_rdev != devnum) + return -ENXIO; + + if (streq_ptr(subsystem, "block") ? !S_ISBLK(st.st_mode) : !S_ISCHR(st.st_mode)) + return -ENXIO; + + /* If flags has O_PATH, then we cannot check diskseq. Let's return earlier. */ + if (FLAGS_SET(flags, O_PATH)) + return TAKE_FD(fd); + + /* If the device is not initialized, then we cannot determine if we should check diskseq through + * ID_IGNORE_DISKSEQ property. Let's skip to check diskseq in that case. 
*/ + r = sd_device_get_is_initialized(device); + if (r < 0) + return r; + if (r > 0) { + r = device_get_property_bool(device, "ID_IGNORE_DISKSEQ"); + if (r < 0 && r != -ENOENT) + return r; + if (r <= 0) { + r = sd_device_get_diskseq(device, &diskseq); + if (r < 0 && r != -ENOENT) + return r; + } + } + + fd2 = fd_reopen(fd, flags); + if (fd2 < 0) + return fd2; + + if (diskseq == 0) + return TAKE_FD(fd2); + + r = fd_get_diskseq(fd2, &q); + if (r < 0) + return r; + + if (q != diskseq) + return -ENXIO; + + return TAKE_FD(fd2); +} + +int device_opendir(sd_device *device, const char *subdir, DIR **ret) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_free_ char *path = NULL; + const char *syspath; + int r; + + assert(device); + assert(ret); + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + if (subdir) { + if (!path_is_safe(subdir)) + return -EINVAL; + + path = path_join(syspath, subdir); + if (!path) + return -ENOMEM; + } + + d = opendir(path ?: syspath); + if (!d) + return -errno; + + *ret = TAKE_PTR(d); + return 0; +} diff --git a/src/libsystemd/sd-device/test-device-util.c b/src/libsystemd/sd-device/test-device-util.c new file mode 100644 index 0000000..bc8ab66 --- /dev/null +++ b/src/libsystemd/sd-device/test-device-util.c @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "device-util.h" +#include "tests.h" + +TEST(log_device_full) { + int r; + + for (int level = LOG_ERR; level <= LOG_DEBUG; level++) { + log_device_full(NULL, level, "test level=%d: %m", level); + + r = log_device_full_errno(NULL, level, EUCLEAN, "test level=%d errno=EUCLEAN: %m", level); + assert_se(r == -EUCLEAN); + + r = log_device_full_errno(NULL, level, 0, "test level=%d errno=0: %m", level); + assert_se(r == 0); + + r = log_device_full_errno(NULL, level, SYNTHETIC_ERRNO(ENODATA), "test level=%d errno=S(ENODATA): %m", level); + assert_se(r == -ENODATA); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git 
a/src/libsystemd/sd-device/test-sd-device-monitor.c b/src/libsystemd/sd-device/test-sd-device-monitor.c new file mode 100644 index 0000000..e124e00 --- /dev/null +++ b/src/libsystemd/sd-device/test-sd-device-monitor.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-device.h" +#include "sd-event.h" + +#include "device-monitor-private.h" +#include "device-private.h" +#include "device-util.h" +#include "macro.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "tests.h" +#include "virt.h" + +static int monitor_handler(sd_device_monitor *m, sd_device *d, void *userdata) { + const char *s, *syspath = userdata; + + assert_se(sd_device_get_syspath(d, &s) >= 0); + assert_se(streq(s, syspath)); + + return sd_event_exit(sd_device_monitor_get_event(m), 100); +} + +static void test_receive_device_fail(void) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + _cleanup_(sd_device_unrefp) sd_device *loopback = NULL; + const char *syspath; + + log_info("/* %s */", __func__); + + /* Try to send device with invalid action and without seqnum. 
*/ + assert_se(sd_device_new_from_syspath(&loopback, "/sys/class/net/lo") >= 0); + assert_se(device_add_property(loopback, "ACTION", "hoge") >= 0); + + assert_se(sd_device_get_syspath(loopback, &syspath) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + assert_se(device_monitor_send_device(monitor_server, monitor_client, loopback) >= 0); + assert_se(sd_event_run(sd_device_monitor_get_event(monitor_client), 0) >= 0); +} + +static void test_send_receive_one(sd_device *device, bool subsystem_filter, bool tag_filter, bool use_bpf) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + const char *syspath, *subsystem, *devtype = NULL; + + log_device_info(device, "/* %s(subsystem_filter=%s, tag_filter=%s, use_bpf=%s) */", __func__, + true_false(subsystem_filter), true_false(tag_filter), true_false(use_bpf)); + + assert_se(sd_device_get_syspath(device, &syspath) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + 
assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + if (subsystem_filter) { + assert_se(sd_device_get_subsystem(device, &subsystem) >= 0); + (void) sd_device_get_devtype(device, &devtype); + assert_se(sd_device_monitor_filter_add_match_subsystem_devtype(monitor_client, subsystem, devtype) >= 0); + } + + if (tag_filter) + FOREACH_DEVICE_TAG(device, tag) + assert_se(sd_device_monitor_filter_add_match_tag(monitor_client, tag) >= 0); + + if ((subsystem_filter || tag_filter) && use_bpf) + assert_se(sd_device_monitor_filter_update(monitor_client) >= 0); + + assert_se(device_monitor_send_device(monitor_server, monitor_client, device) >= 0); + assert_se(sd_event_loop(sd_device_monitor_get_event(monitor_client)) == 100); +} + +static void test_subsystem_filter(sd_device *device) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *syspath, *subsystem; + + log_device_info(device, "/* %s */", __func__); + + assert_se(sd_device_get_syspath(device, &syspath) >= 0); + assert_se(sd_device_get_subsystem(device, &subsystem) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + assert_se(sd_device_monitor_filter_add_match_subsystem_devtype(monitor_client, subsystem, NULL) >= 0); + assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + assert_se(sd_device_enumerator_new(&e) >= 0); + 
assert_se(sd_device_enumerator_add_match_subsystem(e, subsystem, false) >= 0); + FOREACH_DEVICE(e, d) { + const char *p, *s; + + assert_se(sd_device_get_syspath(d, &p) >= 0); + assert_se(sd_device_get_subsystem(d, &s) >= 0); + + assert_se(device_add_property(d, "ACTION", "add") >= 0); + assert_se(device_add_property(d, "SEQNUM", "10") >= 0); + + log_device_debug(d, "Sending device subsystem:%s syspath:%s", s, p); + assert_se(device_monitor_send_device(monitor_server, monitor_client, d) >= 0); + } + + log_device_info(device, "Sending device subsystem:%s syspath:%s", subsystem, syspath); + assert_se(device_monitor_send_device(monitor_server, monitor_client, device) >= 0); + assert_se(sd_event_loop(sd_device_monitor_get_event(monitor_client)) == 100); +} + +static void test_tag_filter(sd_device *device) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *syspath; + + log_device_info(device, "/* %s */", __func__); + + assert_se(sd_device_get_syspath(device, &syspath) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + assert_se(sd_device_monitor_filter_add_match_tag(monitor_client, "TEST_SD_DEVICE_MONITOR") >= 0); + assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + assert_se(sd_device_enumerator_new(&e) >= 0); + FOREACH_DEVICE(e, d) { + const char *p; + + assert_se(sd_device_get_syspath(d, &p) >= 0); + + assert_se(device_add_property(d, "ACTION", 
"add") >= 0); + assert_se(device_add_property(d, "SEQNUM", "10") >= 0); + + log_device_debug(d, "Sending device syspath:%s", p); + assert_se(device_monitor_send_device(monitor_server, monitor_client, d) >= 0); + } + + log_device_info(device, "Sending device syspath:%s", syspath); + assert_se(device_monitor_send_device(monitor_server, monitor_client, device) >= 0); + assert_se(sd_event_loop(sd_device_monitor_get_event(monitor_client)) == 100); + +} + +static void test_sysattr_filter(sd_device *device, const char *sysattr) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *syspath, *sysattr_value; + + log_device_info(device, "/* %s(%s) */", __func__, sysattr); + + assert_se(sd_device_get_syspath(device, &syspath) >= 0); + assert_se(sd_device_get_sysattr_value(device, sysattr, &sysattr_value) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + assert_se(sd_device_monitor_filter_add_match_sysattr(monitor_client, sysattr, sysattr_value, true) >= 0); + assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, sysattr, sysattr_value, false) >= 0); + FOREACH_DEVICE(e, d) { + const char *p; + + assert_se(sd_device_get_syspath(d, &p) >= 0); + + assert_se(device_add_property(d, "ACTION", "add") >= 0); + assert_se(device_add_property(d, "SEQNUM", "10") >= 0); + + 
log_device_debug(d, "Sending device syspath:%s", p); + assert_se(device_monitor_send_device(monitor_server, monitor_client, d) >= 0); + + /* The sysattr filter is not implemented in BPF yet. So, sending multiple devices may fills up + * buffer and device_monitor_send_device() may return EAGAIN. Let's send one device here, + * which should be filtered out by the receiver. */ + break; + } + + log_device_info(device, "Sending device syspath:%s", syspath); + assert_se(device_monitor_send_device(monitor_server, monitor_client, device) >= 0); + assert_se(sd_event_loop(sd_device_monitor_get_event(monitor_client)) == 100); + +} + +static void test_parent_filter(sd_device *device) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *syspath, *parent_syspath; + sd_device *parent; + int r; + + log_device_info(device, "/* %s */", __func__); + + assert_se(sd_device_get_syspath(device, &syspath) >= 0); + r = sd_device_get_parent(device, &parent); + if (r < 0) + return (void) log_device_info(device, "Device does not have parent, skipping."); + assert_se(sd_device_get_syspath(parent, &parent_syspath) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + assert_se(sd_device_monitor_filter_add_match_parent(monitor_client, parent, true) >= 0); + assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + assert_se(sd_device_enumerator_new(&e) >= 0); + FOREACH_DEVICE(e, d) { 
+ const char *p; + + assert_se(sd_device_get_syspath(d, &p) >= 0); + if (path_startswith(p, parent_syspath)) + continue; + + assert_se(device_add_property(d, "ACTION", "add") >= 0); + assert_se(device_add_property(d, "SEQNUM", "10") >= 0); + + log_device_debug(d, "Sending device syspath:%s", p); + assert_se(device_monitor_send_device(monitor_server, monitor_client, d) >= 0); + + /* The parent filter is not implemented in BPF yet. So, sending multiple devices may fills up + * buffer and device_monitor_send_device() may return EAGAIN. Let's send one device here, + * which should be filtered out by the receiver. */ + break; + } + + log_device_info(device, "Sending device syspath:%s", syspath); + assert_se(device_monitor_send_device(monitor_server, monitor_client, device) >= 0); + assert_se(sd_event_loop(sd_device_monitor_get_event(monitor_client)) == 100); + +} + +static void test_sd_device_monitor_filter_remove(sd_device *device) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor_server = NULL, *monitor_client = NULL; + const char *syspath; + + log_device_info(device, "/* %s */", __func__); + + assert_se(sd_device_get_syspath(device, &syspath) >= 0); + + assert_se(device_monitor_new_full(&monitor_server, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_server, "sender") >= 0); + assert_se(sd_device_monitor_start(monitor_server, NULL, NULL) >= 0); + + assert_se(device_monitor_new_full(&monitor_client, MONITOR_GROUP_NONE, -1) >= 0); + assert_se(sd_device_monitor_set_description(monitor_client, "receiver") >= 0); + assert_se(device_monitor_allow_unicast_sender(monitor_client, monitor_server) >= 0); + assert_se(sd_device_monitor_start(monitor_client, monitor_handler, (void *) syspath) >= 0); + + assert_se(sd_device_monitor_filter_add_match_subsystem_devtype(monitor_client, "hoge", NULL) >= 0); + assert_se(sd_device_monitor_filter_update(monitor_client) >= 0); + + assert_se(device_monitor_send_device(monitor_server, 
monitor_client, device) >= 0); + assert_se(sd_event_run(sd_device_monitor_get_event(monitor_client), 0) >= 0); + + assert_se(sd_device_monitor_filter_remove(monitor_client) >= 0); + + assert_se(device_monitor_send_device(monitor_server, monitor_client, device) >= 0); + assert_se(sd_event_loop(sd_device_monitor_get_event(monitor_client)) == 100); +} + +int main(int argc, char *argv[]) { + _cleanup_(sd_device_unrefp) sd_device *loopback = NULL, *sda = NULL; + int r; + + test_setup_logging(LOG_INFO); + + if (getuid() != 0) + return log_tests_skipped("not root"); + + if (path_is_read_only_fs("/sys") > 0) + return log_tests_skipped("Running in container"); + + test_receive_device_fail(); + + assert_se(sd_device_new_from_syspath(&loopback, "/sys/class/net/lo") >= 0); + assert_se(device_add_property(loopback, "ACTION", "add") >= 0); + assert_se(device_add_property(loopback, "SEQNUM", "10") >= 0); + assert_se(device_add_tag(loopback, "TEST_SD_DEVICE_MONITOR", true) >= 0); + + test_send_receive_one(loopback, false, false, false); + test_send_receive_one(loopback, true, false, false); + test_send_receive_one(loopback, false, true, false); + test_send_receive_one(loopback, true, true, false); + test_send_receive_one(loopback, true, false, true); + test_send_receive_one(loopback, false, true, true); + test_send_receive_one(loopback, true, true, true); + + test_subsystem_filter(loopback); + test_tag_filter(loopback); + test_sysattr_filter(loopback, "ifindex"); + test_sd_device_monitor_filter_remove(loopback); + + r = sd_device_new_from_subsystem_sysname(&sda, "block", "sda"); + if (r < 0) { + log_info_errno(r, "Failed to create sd_device for sda, skipping remaining tests: %m"); + return 0; + } + + assert_se(device_add_property(sda, "ACTION", "change") >= 0); + assert_se(device_add_property(sda, "SEQNUM", "11") >= 0); + + test_send_receive_one(sda, false, false, false); + test_send_receive_one(sda, true, false, false); + test_send_receive_one(sda, false, true, false); + 
test_send_receive_one(sda, true, true, false); + test_send_receive_one(sda, true, false, true); + test_send_receive_one(sda, false, true, true); + test_send_receive_one(sda, true, true, true); + + test_parent_filter(sda); + + return 0; +} diff --git a/src/libsystemd/sd-device/test-sd-device-thread.c b/src/libsystemd/sd-device/test-sd-device-thread.c new file mode 100644 index 0000000..c99d179 --- /dev/null +++ b/src/libsystemd/sd-device/test-sd-device-thread.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "device-util.h" + +#define handle_error_errno(error, msg) \ + ({ \ + errno = abs(error); \ + perror(msg); \ + EXIT_FAILURE; \ + }) + +static void* thread(void *p) { + sd_device **d = p; + + *d = sd_device_unref(*d); + + return NULL; +} + +int main(int argc, char *argv[]) { + sd_device *loopback; + pthread_t t; + int r; + + r = sd_device_new_from_syspath(&loopback, "/sys/class/net/lo"); + if (r < 0) + return handle_error_errno(r, "Failed to create loopback device object"); + + FOREACH_DEVICE_PROPERTY(loopback, key, value) + printf("%s=%s\n", key, value); + + r = pthread_create(&t, NULL, thread, &loopback); + if (r != 0) + return handle_error_errno(r, "Failed to create thread"); + + r = pthread_join(t, NULL); + if (r != 0) + return handle_error_errno(r, "Failed to wait thread finished"); + + if (loopback) + return handle_error_errno(r, "loopback device is not unref()ed"); + + return 0; +} diff --git a/src/libsystemd/sd-device/test-sd-device.c b/src/libsystemd/sd-device/test-sd-device.c new file mode 100644 index 0000000..bce99b5 --- /dev/null +++ b/src/libsystemd/sd-device/test-sd-device.c @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "device-enumerator-private.h" +#include "device-internal.h" +#include "device-private.h" +#include "device-util.h" +#include "errno-util.h" +#include "fd-util.h" 
+#include "hashmap.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "string-util.h" +#include "tests.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "udev-util.h" + +static void test_sd_device_one(sd_device *d) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + const char *syspath, *sysname, *subsystem = NULL, *devname, *val; + bool is_block = false; + dev_t devnum; + usec_t usec; + int ifindex, r; + + assert_se(sd_device_get_syspath(d, &syspath) >= 0); + assert_se(path_startswith(syspath, "/sys")); + assert_se(sd_device_get_sysname(d, &sysname) >= 0); + + log_info("%s(%s)", __func__, syspath); + + assert_se(sd_device_new_from_syspath(&dev, syspath) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + + assert_se(sd_device_new_from_path(&dev, syspath) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + + r = sd_device_get_ifindex(d, &ifindex); + if (r >= 0) { + assert_se(ifindex > 0); + + r = sd_device_new_from_ifindex(&dev, ifindex); + if (r == -ENODEV) + log_device_warning_errno(d, r, + "Failed to create sd-device object from ifindex %i. " + "Maybe running on a non-host network namespace.", ifindex); + else { + assert_se(r >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + } + + /* This does not require the interface really exists on the network namespace. + * Hence, this should always succeed. 
*/ + assert_se(sd_device_new_from_ifname(&dev, sysname) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + } else + assert_se(r == -ENOENT); + + r = sd_device_get_subsystem(d, &subsystem); + if (r < 0) + assert_se(r == -ENOENT); + else if (!streq(subsystem, "gpio")) { /* Unfortunately, there exist /sys/class/gpio and /sys/bus/gpio. + * Hence, sd_device_new_from_subsystem_sysname() and + * sd_device_new_from_device_id() may not work as expected. */ + const char *name, *id; + + if (streq(subsystem, "drivers")) + name = strjoina(d->driver_subsystem, ":", sysname); + else + name = sysname; + assert_se(sd_device_new_from_subsystem_sysname(&dev, subsystem, name) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + + /* The device ID depends on subsystem. */ + assert_se(device_get_device_id(d, &id) >= 0); + r = sd_device_new_from_device_id(&dev, id); + if (r == -ENODEV && ifindex > 0) + log_device_warning_errno(d, r, + "Failed to create sd-device object from device ID \"%s\". " + "Maybe running on a non-host network namespace.", id); + else { + assert_se(r >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + } + + /* These require udev database, and reading database requires device ID. 
*/ + r = sd_device_get_is_initialized(d); + if (r > 0) { + r = sd_device_get_usec_since_initialized(d, &usec); + assert_se((r >= 0 && usec > 0) || r == -ENODATA); + } else + assert(r == 0); + + r = sd_device_get_property_value(d, "ID_NET_DRIVER", &val); + assert_se(r >= 0 || r == -ENOENT); + } + + is_block = streq_ptr(subsystem, "block"); + + r = sd_device_get_devname(d, &devname); + if (r >= 0) { + r = sd_device_new_from_devname(&dev, devname); + if (r >= 0) { + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + } else + assert_se(r == -ENODEV || ERRNO_IS_PRIVILEGE(r)); + + r = sd_device_new_from_path(&dev, devname); + if (r >= 0) { + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + + _cleanup_close_ int fd = -EBADF; + fd = sd_device_open(d, O_CLOEXEC| O_NONBLOCK | (is_block ? O_RDONLY : O_NOCTTY | O_PATH)); + assert_se(fd >= 0 || ERRNO_IS_PRIVILEGE(fd)); + } else + assert_se(r == -ENODEV || ERRNO_IS_PRIVILEGE(r)); + } else + assert_se(r == -ENOENT); + + r = sd_device_get_devnum(d, &devnum); + if (r >= 0) { + _cleanup_free_ char *p = NULL; + + assert_se(major(devnum) > 0); + + assert_se(sd_device_new_from_devnum(&dev, is_block ? 'b' : 'c', devnum) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + + assert_se(asprintf(&p, "/dev/%s/%u:%u", is_block ? 
"block" : "char", major(devnum), minor(devnum)) >= 0); + assert_se(sd_device_new_from_devname(&dev, p) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + + assert_se(sd_device_new_from_path(&dev, p) >= 0); + assert_se(sd_device_get_syspath(dev, &val) >= 0); + assert_se(streq(syspath, val)); + dev = sd_device_unref(dev); + } else + assert_se(r == -ENOENT); + + assert_se(sd_device_get_devpath(d, &val) >= 0); + + r = sd_device_get_devtype(d, &val); + assert_se(r >= 0 || r == -ENOENT); + + r = sd_device_get_driver(d, &val); + assert_se(r >= 0 || r == -ENOENT); + + r = sd_device_get_sysnum(d, &val); + if (r >= 0) { + assert_se(val > sysname); + assert_se(val < sysname + strlen(sysname)); + assert_se(in_charset(val, DIGITS)); + assert_se(!ascii_isdigit(val[-1])); + } else + assert_se(r == -ENOENT); + + r = sd_device_get_sysattr_value(d, "nsid", NULL); + if (r >= 0) { + unsigned x; + + assert_se(device_get_sysattr_unsigned(d, "nsid", NULL) >= 0); + r = device_get_sysattr_unsigned(d, "nsid", &x); + assert_se(r >= 0); + assert_se((x > 0) == (r > 0)); + } else + assert_se(ERRNO_IS_PRIVILEGE(r) || IN_SET(r, -ENOENT, -EINVAL)); +} + +TEST(sd_device_enumerator_devices) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + /* On some CI environments, it seems some loop block devices and corresponding bdi devices sometimes + * disappear during running this test. Let's exclude them here for stability. */ + assert_se(sd_device_enumerator_add_match_subsystem(e, "bdi", false) >= 0); + assert_se(sd_device_enumerator_add_nomatch_sysname(e, "loop*") >= 0); + /* On CentOS CI, systemd-networkd-tests.py may be running when this test is invoked. The networkd + * test creates and removes many network interfaces, and may interfere with this test. 
*/ + assert_se(sd_device_enumerator_add_match_subsystem(e, "net", false) >= 0); + FOREACH_DEVICE(e, d) + test_sd_device_one(d); +} + +TEST(sd_device_enumerator_subsystems) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + FOREACH_SUBSYSTEM(e, d) + test_sd_device_one(d); +} + +static void test_sd_device_enumerator_filter_subsystem_one( + const char *subsystem, + Hashmap *h, + unsigned *ret_n_new_dev, + unsigned *ret_n_removed_dev) { + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + unsigned n_new_dev = 0, n_removed_dev = 0; + sd_device *dev; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, subsystem, true) >= 0); + assert_se(sd_device_enumerator_add_nomatch_sysname(e, "loop*") >= 0); + + FOREACH_DEVICE(e, d) { + const char *syspath; + sd_device *t; + + assert_se(sd_device_get_syspath(d, &syspath) >= 0); + t = hashmap_remove(h, syspath); + + if (!t) { + log_warning("New device found: subsystem:%s syspath:%s", subsystem, syspath); + n_new_dev++; + } + + assert_se(!sd_device_unref(t)); + } + + HASHMAP_FOREACH(dev, h) { + const char *syspath; + + assert_se(sd_device_get_syspath(dev, &syspath) >= 0); + log_warning("Device removed: subsystem:%s syspath:%s", subsystem, syspath); + n_removed_dev++; + + assert_se(!sd_device_unref(dev)); + } + + hashmap_free(h); + + *ret_n_new_dev = n_new_dev; + *ret_n_removed_dev = n_removed_dev; +} + +static bool test_sd_device_enumerator_filter_subsystem_trial(void) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_hashmap_free_ Hashmap *subsystems = NULL; + unsigned n_new_dev = 0, n_removed_dev = 0; + Hashmap *h; + char *s; + + assert_se(subsystems = hashmap_new(&string_hash_ops)); + assert_se(sd_device_enumerator_new(&e) >= 0); + /* See comments in 
TEST(sd_device_enumerator_devices). */ + assert_se(sd_device_enumerator_add_match_subsystem(e, "bdi", false) >= 0); + assert_se(sd_device_enumerator_add_nomatch_sysname(e, "loop*") >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, "net", false) >= 0); + + FOREACH_DEVICE(e, d) { + const char *syspath, *subsystem; + int r; + + assert_se(sd_device_get_syspath(d, &syspath) >= 0); + + r = sd_device_get_subsystem(d, &subsystem); + assert_se(r >= 0 || r == -ENOENT); + if (r < 0) + continue; + + h = hashmap_get(subsystems, subsystem); + if (!h) { + char *str; + assert_se(str = strdup(subsystem)); + assert_se(h = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(subsystems, str, h) >= 0); + } + + assert_se(hashmap_put(h, syspath, d) >= 0); + assert_se(sd_device_ref(d)); + + log_debug("Added subsystem:%s syspath:%s", subsystem, syspath); + } + + while ((h = hashmap_steal_first_key_and_value(subsystems, (void**) &s))) { + unsigned n, m; + + test_sd_device_enumerator_filter_subsystem_one(s, TAKE_PTR(h), &n, &m); + free(s); + + n_new_dev += n; + n_removed_dev += m; + } + + if (n_new_dev > 0) + log_warning("%u new devices are found in re-scan", n_new_dev); + if (n_removed_dev > 0) + log_warning("%u devices removed in re-scan", n_removed_dev); + + return n_new_dev + n_removed_dev == 0; +} + +static bool test_sd_device_enumerator_filter_subsystem_trial_many(void) { + for (unsigned i = 0; i < 20; i++) { + log_debug("%s(): trial %u", __func__, i); + if (test_sd_device_enumerator_filter_subsystem_trial()) + return true; + } + + return false; +} + +static int on_inotify(sd_event_source *s, const struct inotify_event *event, void *userdata) { + if (test_sd_device_enumerator_filter_subsystem_trial_many()) + return sd_event_exit(sd_event_source_get_event(s), 0); + + return sd_event_exit(sd_event_source_get_event(s), -EBUSY); +} + +TEST(sd_device_enumerator_filter_subsystem) { + /* The test test_sd_device_enumerator_filter_subsystem_trial() is quite racy. 
Let's run the function + * several times after the udev queue becomes empty. */ + + if (!udev_available() || (access("/run/udev", F_OK) < 0 && errno == ENOENT)) { + assert_se(test_sd_device_enumerator_filter_subsystem_trial_many()); + return; + } + + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + assert_se(sd_event_default(&event) >= 0); + assert_se(sd_event_add_inotify(event, NULL, "/run/udev" , IN_DELETE, on_inotify, NULL) >= 0); + + if (udev_queue_is_empty() == 0) { + log_debug("udev queue is not empty, waiting for all queued events to be processed."); + assert_se(sd_event_loop(event) >= 0); + } else + assert_se(test_sd_device_enumerator_filter_subsystem_trial_many()); +} + +TEST(sd_device_enumerator_add_match_sysattr) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *dev; + int ifindex; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, "net", true) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "1", true) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "hoge", true) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "foo", true) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "bar", false) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "baz", false) >= 0); + + dev = sd_device_enumerator_get_device_first(e); + assert_se(dev); + assert_se(sd_device_get_ifindex(dev, &ifindex) >= 0); + assert_se(ifindex == 1); + + assert_se(!sd_device_enumerator_get_device_next(e)); +} + +TEST(sd_device_enumerator_add_match_property) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *dev; + int ifindex; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, 
"net", true) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "1", true) >= 0); + assert_se(sd_device_enumerator_add_match_property(e, "IFINDE*", "1*") >= 0); + assert_se(sd_device_enumerator_add_match_property(e, "IFINDE*", "hoge") >= 0); + assert_se(sd_device_enumerator_add_match_property(e, "IFINDE*", NULL) >= 0); + assert_se(sd_device_enumerator_add_match_property(e, "AAAAA", "BBBB") >= 0); + assert_se(sd_device_enumerator_add_match_property(e, "FOOOO", NULL) >= 0); + + dev = sd_device_enumerator_get_device_first(e); + assert_se(dev); + assert_se(sd_device_get_ifindex(dev, &ifindex) >= 0); + assert_se(ifindex == 1); +} + +TEST(sd_device_enumerator_add_match_property_required) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *dev; + int ifindex; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, "net", true) >= 0); + assert_se(sd_device_enumerator_add_match_sysattr(e, "ifindex", "1", true) >= 0); + assert_se(sd_device_enumerator_add_match_property_required(e, "IFINDE*", "1*") >= 0); + + /* Only one required match which should be satisfied. */ + dev = sd_device_enumerator_get_device_first(e); + assert_se(dev); + assert_se(sd_device_get_ifindex(dev, &ifindex) >= 0); + assert_se(ifindex == 1); + + /* Now let's add a bunch of garbage properties which should not be satisfied. 
*/ + assert_se(sd_device_enumerator_add_match_property_required(e, "IFINDE*", "hoge") >= 0); + assert_se(sd_device_enumerator_add_match_property_required(e, "IFINDE*", NULL) >= 0); + assert_se(sd_device_enumerator_add_match_property_required(e, "AAAAA", "BBBB") >= 0); + assert_se(sd_device_enumerator_add_match_property_required(e, "FOOOO", NULL) >= 0); + + assert_se(!sd_device_enumerator_get_device_first(e)); +} + +static void check_parent_match(sd_device_enumerator *e, sd_device *dev) { + const char *syspath; + bool found = false; + + assert_se(sd_device_get_syspath(dev, &syspath) >= 0); + + FOREACH_DEVICE(e, d) { + const char *s; + + assert_se(sd_device_get_syspath(d, &s) >= 0); + if (streq(s, syspath)) { + found = true; + break; + } + } + + if (!found) { + log_device_debug(dev, "not enumerated, already removed??"); + /* If the original device not found, then the device should be already removed. */ + assert_se(access(syspath, F_OK) < 0); + assert_se(errno == ENOENT); + } +} + +TEST(sd_device_enumerator_add_match_parent) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + /* See comments in TEST(sd_device_enumerator_devices). 
*/ + assert_se(sd_device_enumerator_add_match_subsystem(e, "bdi", false) >= 0); + assert_se(sd_device_enumerator_add_nomatch_sysname(e, "loop*") >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, "net", false) >= 0); + + if (!slow_tests_enabled()) + assert_se(sd_device_enumerator_add_match_subsystem(e, "block", true) >= 0); + + FOREACH_DEVICE(e, dev) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *p = NULL; + const char *syspath; + sd_device *parent; + + assert_se(sd_device_get_syspath(dev, &syspath) >= 0); + + r = sd_device_get_parent(dev, &parent); + if (r < 0) { + assert_se(ERRNO_IS_DEVICE_ABSENT(r)); + continue; + } + + log_debug("> %s", syspath); + + assert_se(sd_device_enumerator_new(&p) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(p) >= 0); + assert_se(sd_device_enumerator_add_match_parent(p, parent) >= 0); + + check_parent_match(p, dev); + + /* If the device does not have subsystem, then it is not enumerated. */ + r = sd_device_get_subsystem(parent, NULL); + if (r < 0) { + assert_se(r == -ENOENT); + continue; + } + check_parent_match(p, parent); + } +} + +TEST(sd_device_get_child) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + /* See comments in TEST(sd_device_enumerator_devices). 
*/ + assert_se(sd_device_enumerator_add_match_subsystem(e, "bdi", false) >= 0); + assert_se(sd_device_enumerator_add_nomatch_sysname(e, "loop*") >= 0); + assert_se(sd_device_enumerator_add_match_subsystem(e, "net", false) >= 0); + + if (!slow_tests_enabled()) + assert_se(sd_device_enumerator_add_match_subsystem(e, "block", true) >= 0); + + FOREACH_DEVICE(e, dev) { + const char *syspath, *parent_syspath, *expected_suffix, *suffix; + sd_device *parent; + bool found = false; + + assert_se(sd_device_get_syspath(dev, &syspath) >= 0); + + r = sd_device_get_parent(dev, &parent); + if (r < 0) { + assert_se(ERRNO_IS_DEVICE_ABSENT(r)); + continue; + } + + assert_se(sd_device_get_syspath(parent, &parent_syspath) >= 0); + assert_se(expected_suffix = path_startswith(syspath, parent_syspath)); + + log_debug("> %s", syspath); + + FOREACH_DEVICE_CHILD_WITH_SUFFIX(parent, child, suffix) { + const char *s; + + assert_se(child); + assert_se(suffix); + + if (!streq(suffix, expected_suffix)) + continue; + + assert_se(sd_device_get_syspath(child, &s) >= 0); + assert_se(streq(s, syspath)); + found = true; + break; + } + assert_se(found); + } +} + +TEST(sd_device_new_from_nulstr) { + const char *devlinks = + "/dev/disk/by-partuuid/1290d63a-42cc-4c71-b87c-xxxxxxxxxxxx\0" + "/dev/disk/by-path/pci-0000:00:0f.0-scsi-0:0:0:0-part3\0" + "/dev/disk/by-label/Arch\\x20Linux\0" + "/dev/disk/by-uuid/a07b87e5-4af5-4a59-bde9-yyyyyyyyyyyy\0" + "/dev/disk/by-partlabel/Arch\\x20Linux\0" + "\0"; + + _cleanup_(sd_device_unrefp) sd_device *device = NULL, *from_nulstr = NULL; + _cleanup_free_ char *nulstr_copy = NULL; + const char *nulstr; + size_t len; + + assert_se(sd_device_new_from_syspath(&device, "/sys/class/net/lo") >= 0); + + /* Yeah, of course, setting devlink to the loopback interface is nonsense. But this is just a + * test for generating and parsing nulstr. For issue #17772. 
*/ + NULSTR_FOREACH(devlink, devlinks) { + log_device_info(device, "setting devlink: %s", devlink); + assert_se(device_add_devlink(device, devlink) >= 0); + assert_se(set_contains(device->devlinks, devlink)); + } + + /* For issue #23799 */ + assert_se(device_add_tag(device, "tag1", false) >= 0); + assert_se(device_add_tag(device, "tag2", false) >= 0); + assert_se(device_add_tag(device, "current-tag1", true) >= 0); + assert_se(device_add_tag(device, "current-tag2", true) >= 0); + + /* These properties are necessary for device_new_from_nulstr(). See device_verify(). */ + assert_se(device_add_property_internal(device, "SEQNUM", "1") >= 0); + assert_se(device_add_property_internal(device, "ACTION", "change") >= 0); + + assert_se(device_get_properties_nulstr(device, &nulstr, &len) >= 0); + assert_se(nulstr_copy = newdup(char, nulstr, len)); + assert_se(device_new_from_nulstr(&from_nulstr, nulstr_copy, len) >= 0); + + assert_se(sd_device_has_tag(from_nulstr, "tag1") == 1); + assert_se(sd_device_has_tag(from_nulstr, "tag2") == 1); + assert_se(sd_device_has_tag(from_nulstr, "current-tag1") == 1); + assert_se(sd_device_has_tag(from_nulstr, "current-tag2") == 1); + assert_se(sd_device_has_current_tag(from_nulstr, "tag1") == 0); + assert_se(sd_device_has_current_tag(from_nulstr, "tag2") == 0); + assert_se(sd_device_has_current_tag(from_nulstr, "current-tag1") == 1); + assert_se(sd_device_has_current_tag(from_nulstr, "current-tag2") == 1); + + NULSTR_FOREACH(devlink, devlinks) { + log_device_info(from_nulstr, "checking devlink: %s", devlink); + assert_se(set_contains(from_nulstr->devlinks, devlink)); + } +} + +TEST(sd_device_new_from_path) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL; + int r; + + assert_se(mkdtemp_malloc("/tmp/test-sd-device.XXXXXXX", &tmpdir) >= 0); + + assert_se(sd_device_enumerator_new(&e) >= 0); + assert_se(sd_device_enumerator_allow_uninitialized(e) >= 0); + 
assert_se(sd_device_enumerator_add_match_subsystem(e, "block", true) >= 0); + assert_se(sd_device_enumerator_add_nomatch_sysname(e, "loop*") >= 0); + assert_se(sd_device_enumerator_add_match_property(e, "DEVNAME", "*") >= 0); + + FOREACH_DEVICE(e, dev) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + const char *syspath, *devpath, *sysname, *s; + _cleanup_free_ char *path = NULL; + + assert_se(sd_device_get_sysname(dev, &sysname) >= 0); + + log_debug("%s(%s)", __func__, sysname); + + assert_se(sd_device_get_syspath(dev, &syspath) >= 0); + assert_se(sd_device_new_from_path(&d, syspath) >= 0); + assert_se(sd_device_get_syspath(d, &s) >= 0); + assert_se(streq(s, syspath)); + d = sd_device_unref(d); + + assert_se(sd_device_get_devname(dev, &devpath) >= 0); + r = sd_device_new_from_path(&d, devpath); + if (r >= 0) { + assert_se(sd_device_get_syspath(d, &s) >= 0); + assert_se(streq(s, syspath)); + d = sd_device_unref(d); + } else + assert_se(r == -ENODEV || ERRNO_IS_PRIVILEGE(r)); + + assert_se(path = path_join(tmpdir, sysname)); + assert_se(symlink(syspath, path) >= 0); + assert_se(sd_device_new_from_path(&d, path) >= 0); + assert_se(sd_device_get_syspath(d, &s) >= 0); + assert_se(streq(s, syspath)); + } +} + +static void test_devname_from_devnum_one(const char *path) { + _cleanup_free_ char *resolved = NULL; + struct stat st; + + log_debug("> %s", path); + + if (stat(path, &st) < 0) { + assert_se(errno == ENOENT); + log_notice("Path %s not found, skipping test", path); + return; + } + + assert_se(devname_from_devnum(st.st_mode, st.st_rdev, &resolved) >= 0); + assert_se(path_equal(path, resolved)); + resolved = mfree(resolved); + assert_se(devname_from_stat_rdev(&st, &resolved) >= 0); + assert_se(path_equal(path, resolved)); +} + +TEST(devname_from_devnum) { + test_devname_from_devnum_one("/dev/null"); + test_devname_from_devnum_one("/dev/zero"); + test_devname_from_devnum_one("/dev/full"); + test_devname_from_devnum_one("/dev/random"); + 
test_devname_from_devnum_one("/dev/urandom"); + test_devname_from_devnum_one("/dev/tty"); + + if (is_device_node("/run/systemd/inaccessible/blk") > 0) { + test_devname_from_devnum_one("/run/systemd/inaccessible/chr"); + test_devname_from_devnum_one("/run/systemd/inaccessible/blk"); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h new file mode 100644 index 0000000..f4e38d7 --- /dev/null +++ b/src/libsystemd/sd-event/event-source.h @@ -0,0 +1,239 @@ +#pragma once +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-event.h" + +#include "hashmap.h" +#include "inotify-util.h" +#include "list.h" +#include "prioq.h" +#include "ratelimit.h" + +typedef enum EventSourceType { + SOURCE_IO, + SOURCE_TIME_REALTIME, + SOURCE_TIME_BOOTTIME, + SOURCE_TIME_MONOTONIC, + SOURCE_TIME_REALTIME_ALARM, + SOURCE_TIME_BOOTTIME_ALARM, + SOURCE_SIGNAL, + SOURCE_CHILD, + SOURCE_DEFER, + SOURCE_POST, + SOURCE_EXIT, + SOURCE_WATCHDOG, + SOURCE_INOTIFY, + SOURCE_MEMORY_PRESSURE, + _SOURCE_EVENT_SOURCE_TYPE_MAX, + _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL, +} EventSourceType; + +/* All objects we use in epoll events start with this value, so that + * we know how to dispatch it */ +typedef enum WakeupType { + WAKEUP_NONE, + WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */ + WAKEUP_CLOCK_DATA, + WAKEUP_SIGNAL_DATA, + WAKEUP_INOTIFY_DATA, + _WAKEUP_TYPE_MAX, + _WAKEUP_TYPE_INVALID = -EINVAL, +} WakeupType; + +struct inode_data; + +struct sd_event_source { + WakeupType wakeup; + + unsigned n_ref; + + sd_event *event; + void *userdata; + sd_event_handler_t prepare; + + char *description; + + EventSourceType type; + signed int enabled:3; + bool pending:1; + bool dispatching:1; + bool floating:1; + bool exit_on_failure:1; + bool ratelimited:1; + + int64_t priority; + unsigned pending_index; + unsigned prepare_index; + uint64_t pending_iteration; + uint64_t 
prepare_iteration; + + sd_event_destroy_t destroy_callback; + sd_event_handler_t ratelimit_expire_callback; + + LIST_FIELDS(sd_event_source, sources); + + RateLimit rate_limit; + + /* These are primarily fields relevant for time event sources, but since any event source can + * effectively become one when rate-limited, this is part of the common fields. */ + unsigned earliest_index; + unsigned latest_index; + + union { + struct { + sd_event_io_handler_t callback; + int fd; + uint32_t events; + uint32_t revents; + bool registered:1; + bool owned:1; + } io; + struct { + sd_event_time_handler_t callback; + usec_t next, accuracy; + } time; + struct { + sd_event_signal_handler_t callback; + struct signalfd_siginfo siginfo; + int sig; + bool unblock; + } signal; + struct { + sd_event_child_handler_t callback; + siginfo_t siginfo; + pid_t pid; + int options; + int pidfd; + bool registered:1; /* whether the pidfd is registered in the epoll */ + bool pidfd_owned:1; /* close pidfd when event source is freed */ + bool process_owned:1; /* kill+reap process when event source is freed */ + bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */ + bool waited:1; /* true if process was waited for (i.e. 
if there's value in waitid(P_PID)'ing it if we want to get rid of it) */ + } child; + struct { + sd_event_handler_t callback; + } defer; + struct { + sd_event_handler_t callback; + } post; + struct { + sd_event_handler_t callback; + unsigned prioq_index; + } exit; + struct { + sd_event_inotify_handler_t callback; + uint32_t mask; + struct inode_data *inode_data; + LIST_FIELDS(sd_event_source, by_inode_data); + } inotify; + struct { + int fd; + sd_event_handler_t callback; + void *write_buffer; + size_t write_buffer_size; + uint32_t events, revents; + LIST_FIELDS(sd_event_source, write_list); + bool registered:1; + bool locked:1; + bool in_write_list:1; + } memory_pressure; + }; +}; + +struct clock_data { + WakeupType wakeup; + int fd; + + /* For all clocks we maintain two priority queues each, one + * ordered for the earliest times the events may be + * dispatched, and one ordered by the latest times they must + * have been dispatched. The range between the top entries in + * the two prioqs is the time window we can freely schedule + * wakeups in */ + + Prioq *earliest; + Prioq *latest; + usec_t next; + + bool needs_rearm:1; +}; + +struct signal_data { + WakeupType wakeup; + + /* For each priority we maintain one signal fd, so that we + * only have to dequeue a single event per priority at a + * time. */ + + int fd; + int64_t priority; + sigset_t sigset; + sd_event_source *current; +}; + +/* A structure listing all event sources currently watching a specific inode */ +struct inode_data { + /* The identifier for the inode, the combination of the .st_dev + .st_ino fields of the file */ + ino_t ino; + dev_t dev; + + /* An fd of the inode to watch. The fd is kept open until the next iteration of the loop, so that we can + * rearrange the priority still until then, as we need the original inode to change the priority as we need to + * add a watch descriptor to the right inotify for the priority which we can only do if we have a handle to the + * original inode. 
We keep a list of all inode_data objects with an open fd in the to_close list (see below) of + * the sd-event object, so that it is efficient to close everything, before entering the next event loop + * iteration. */ + int fd; + + /* The inotify "watch descriptor" */ + int wd; + + /* The combination of the mask of all inotify watches on this inode we manage. This is also the mask that has + * most recently been set on the watch descriptor. */ + uint32_t combined_mask; + + /* All event sources subscribed to this inode */ + LIST_HEAD(sd_event_source, event_sources); + + /* The inotify object we watch this inode with */ + struct inotify_data *inotify_data; + + /* A linked list of all inode data objects with fds to close (see above) */ + LIST_FIELDS(struct inode_data, to_close); +}; + +/* A structure encapsulating an inotify fd */ +struct inotify_data { + WakeupType wakeup; + + /* For each priority we maintain one inotify fd, so that we only have to dequeue a single event per priority at + * a time */ + + int fd; + int64_t priority; + + Hashmap *inodes; /* The inode_data structures keyed by dev+ino */ + Hashmap *wd; /* The inode_data structures keyed by the watch descriptor for each */ + + /* The buffer we read inotify events into */ + union inotify_event_buffer buffer; + size_t buffer_filled; /* fill level of the buffer */ + + /* How many event sources are currently marked pending for this inotify. We won't read new events off the + * inotify fd as long as there are still pending events on the inotify (because we have no strategy of queuing + * the events locally if they can't be coalesced). */ + unsigned n_pending; + + /* If this counter is non-zero, don't GC the inotify data object even if not used to watch any inode + * anymore. This is useful to pin the object for a bit longer, after the last event source needing it + * is gone. */ + unsigned n_busy; + + /* A linked list of all inotify objects with data already read, that still need processing. 
We keep this list + * to make it efficient to figure out what inotify objects to process data on next. */ + LIST_FIELDS(struct inotify_data, buffered); +}; diff --git a/src/libsystemd/sd-event/event-util.c b/src/libsystemd/sd-event/event-util.c new file mode 100644 index 0000000..a310122 --- /dev/null +++ b/src/libsystemd/sd-event/event-util.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "event-source.h" +#include "event-util.h" +#include "fd-util.h" +#include "log.h" +#include "string-util.h" + +int event_reset_time( + sd_event *e, + sd_event_source **s, + clockid_t clock, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + int64_t priority, + const char *description, + bool force_reset) { + + bool created = false; + int enabled, r; + clockid_t c; + + assert(e); + assert(s); + + if (*s) { + if (!force_reset) { + r = sd_event_source_get_enabled(*s, &enabled); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to query whether event source \"%s\" is enabled or not: %m", + strna((*s)->description ?: description)); + + if (enabled != SD_EVENT_OFF) + return 0; + } + + r = sd_event_source_get_time_clock(*s, &c); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to get clock id of event source \"%s\": %m", strna((*s)->description ?: description)); + + if (c != clock) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "sd-event: Current clock id %i of event source \"%s\" is different from specified one %i.", + (int)c, + strna((*s)->description ?: description), + (int)clock); + + r = sd_event_source_set_time(*s, usec); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to set time for event source \"%s\": %m", strna((*s)->description ?: description)); + + r = sd_event_source_set_time_accuracy(*s, accuracy); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to set accuracy for event source \"%s\": %m", strna((*s)->description ?: description)); + + /* 
callback function is not updated, as we do not have sd_event_source_set_time_callback(). */ + + (void) sd_event_source_set_userdata(*s, userdata); + + r = sd_event_source_set_enabled(*s, SD_EVENT_ONESHOT); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to enable event source \"%s\": %m", strna((*s)->description ?: description)); + } else { + r = sd_event_add_time(e, s, clock, usec, accuracy, callback, userdata); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to create timer event \"%s\": %m", strna(description)); + + created = true; + } + + r = sd_event_source_set_priority(*s, priority); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to set priority for event source \"%s\": %m", strna((*s)->description ?: description)); + + if (description) { + r = sd_event_source_set_description(*s, description); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to set description for event source \"%s\": %m", description); + } + + return created; +} + +int event_reset_time_relative( + sd_event *e, + sd_event_source **s, + clockid_t clock, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + int64_t priority, + const char *description, + bool force_reset) { + + int r; + + assert(e); + + if (usec > 0) { + usec_t usec_now; + + r = sd_event_now(e, clock, &usec_now); + if (r < 0) + return log_debug_errno(r, "sd-event: Failed to get the current time: %m"); + + usec = usec_add(usec_now, usec); + } + + return event_reset_time(e, s, clock, usec, accuracy, callback, userdata, priority, description, force_reset); +} + +int event_add_time_change(sd_event *e, sd_event_source **ret, sd_event_io_handler_t callback, void *userdata) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(e); + + /* Allocates an IO event source that gets woken up whenever the clock changes. 
Needs to be recreated on each event */ + + fd = time_change_fd(); + if (fd < 0) + return fd; + + r = sd_event_add_io(e, &s, fd, EPOLLIN, callback, userdata); + if (r < 0) + return r; + + r = sd_event_source_set_io_fd_own(s, true); + if (r < 0) + return r; + + TAKE_FD(fd); + + r = sd_event_source_set_description(s, "time-change"); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(s); + else { + r = sd_event_source_set_floating(s, true); + if (r < 0) + return r; + } + + return 0; +} diff --git a/src/libsystemd/sd-event/event-util.h b/src/libsystemd/sd-event/event-util.h new file mode 100644 index 0000000..c185584 --- /dev/null +++ b/src/libsystemd/sd-event/event-util.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-event.h" + +int event_reset_time( + sd_event *e, + sd_event_source **s, + clockid_t clock, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + int64_t priority, + const char *description, + bool force_reset); +int event_reset_time_relative( + sd_event *e, + sd_event_source **s, + clockid_t clock, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + int64_t priority, + const char *description, + bool force_reset); +static inline int event_source_disable(sd_event_source *s) { + return sd_event_source_set_enabled(s, SD_EVENT_OFF); +} + +int event_add_time_change(sd_event *e, sd_event_source **ret, sd_event_io_handler_t callback, void *userdata); diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c new file mode 100644 index 0000000..288798a --- /dev/null +++ b/src/libsystemd/sd-event/sd-event.c @@ -0,0 +1,5357 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-event.h" +#include "sd-id128.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "env-util.h" +#include "event-source.h" +#include 
"fd-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "hexdecoct.h" +#include "list.h" +#include "logarithm.h" +#include "macro.h" +#include "mallinfo-util.h" +#include "memory-util.h" +#include "missing_magic.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "origin-id.h" +#include "path-util.h" +#include "prioq.h" +#include "process-util.h" +#include "psi-util.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strxcpyx.h" +#include "time-util.h" + +#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC) + +static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) { + /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */ + return s && + s->type == SOURCE_CHILD && + s->child.pidfd >= 0 && + s->child.options == WEXITED; +} + +static bool event_source_is_online(sd_event_source *s) { + assert(s); + return s->enabled != SD_EVENT_OFF && !s->ratelimited; +} + +static bool event_source_is_offline(sd_event_source *s) { + assert(s); + return s->enabled == SD_EVENT_OFF || s->ratelimited; +} + +static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = { + [SOURCE_IO] = "io", + [SOURCE_TIME_REALTIME] = "realtime", + [SOURCE_TIME_BOOTTIME] = "boottime", + [SOURCE_TIME_MONOTONIC] = "monotonic", + [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm", + [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm", + [SOURCE_SIGNAL] = "signal", + [SOURCE_CHILD] = "child", + [SOURCE_DEFER] = "defer", + [SOURCE_POST] = "post", + [SOURCE_EXIT] = "exit", + [SOURCE_WATCHDOG] = "watchdog", + [SOURCE_INOTIFY] = "inotify", + [SOURCE_MEMORY_PRESSURE] = "memory-pressure", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); + +#define EVENT_SOURCE_IS_TIME(t) \ + IN_SET((t), \ + SOURCE_TIME_REALTIME, \ + SOURCE_TIME_BOOTTIME, \ + SOURCE_TIME_MONOTONIC, \ + 
SOURCE_TIME_REALTIME_ALARM, \ + SOURCE_TIME_BOOTTIME_ALARM) + +#define EVENT_SOURCE_CAN_RATE_LIMIT(t) \ + IN_SET((t), \ + SOURCE_IO, \ + SOURCE_TIME_REALTIME, \ + SOURCE_TIME_BOOTTIME, \ + SOURCE_TIME_MONOTONIC, \ + SOURCE_TIME_REALTIME_ALARM, \ + SOURCE_TIME_BOOTTIME_ALARM, \ + SOURCE_SIGNAL, \ + SOURCE_DEFER, \ + SOURCE_INOTIFY, \ + SOURCE_MEMORY_PRESSURE) + +/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put(). + * Time sources and ratelimited sources can be passed, so effectively this is the same as the + * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */ +#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t) + +struct sd_event { + unsigned n_ref; + + int epoll_fd; + int watchdog_fd; + + Prioq *pending; + Prioq *prepare; + + /* timerfd_create() only supports these five clocks so far. We + * can add support for more clocks when the kernel learns to + * deal with them, too. */ + struct clock_data realtime; + struct clock_data boottime; + struct clock_data monotonic; + struct clock_data realtime_alarm; + struct clock_data boottime_alarm; + + usec_t perturb; + + sd_event_source **signal_sources; /* indexed by signal number */ + Hashmap *signal_data; /* indexed by priority */ + + Hashmap *child_sources; + unsigned n_online_child_sources; + + Set *post_sources; + + Prioq *exit; + + Hashmap *inotify_data; /* indexed by priority */ + + /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */ + LIST_HEAD(struct inode_data, inode_data_to_close_list); + + /* A list of inotify objects that already have events buffered which aren't processed yet */ + LIST_HEAD(struct inotify_data, buffered_inotify_data_list); + + /* A list of memory pressure event sources that still need their subscription string written */ + LIST_HEAD(sd_event_source, memory_pressure_write_list); + + uint64_t origin_id; + + uint64_t iteration; + triple_timestamp timestamp; + int state; + + 
bool exit_requested:1; + bool need_process_child:1; + bool watchdog:1; + bool profile_delays:1; + + int exit_code; + + pid_t tid; + sd_event **default_event_ptr; + + usec_t watchdog_last, watchdog_period; + + unsigned n_sources; + + struct epoll_event *event_queue; + + LIST_HEAD(sd_event_source, sources); + + sd_event_source *sigint_event_source, *sigterm_event_source; + + usec_t last_run_usec, last_log_usec; + unsigned delays[sizeof(usec_t) * 8]; +}; + +DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event); + +static thread_local sd_event *default_event = NULL; + +static void source_disconnect(sd_event_source *s); +static void event_gc_inode_data(sd_event *e, struct inode_data *d); + +static sd_event *event_resolve(sd_event *e) { + return e == SD_EVENT_DEFAULT ? default_event : e; +} + +static int pending_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + int r; + + assert(x->pending); + assert(y->pending); + + /* Enabled ones first */ + r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF); + if (r != 0) + return r; + + /* Non rate-limited ones first. */ + r = CMP(!!x->ratelimited, !!y->ratelimited); + if (r != 0) + return r; + + /* Lower priority values first */ + r = CMP(x->priority, y->priority); + if (r != 0) + return r; + + /* Older entries first */ + return CMP(x->pending_iteration, y->pending_iteration); +} + +static int prepare_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + int r; + + assert(x->prepare); + assert(y->prepare); + + /* Enabled ones first */ + r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF); + if (r != 0) + return r; + + /* Non rate-limited ones first. 
*/ + r = CMP(!!x->ratelimited, !!y->ratelimited); + if (r != 0) + return r; + + /* Move most recently prepared ones last, so that we can stop + * preparing as soon as we hit one that has already been + * prepared in the current iteration */ + r = CMP(x->prepare_iteration, y->prepare_iteration); + if (r != 0) + return r; + + /* Lower priority values first */ + return CMP(x->priority, y->priority); +} + +static usec_t time_event_source_next(const sd_event_source *s) { + assert(s); + + /* We have two kinds of event sources that have elapsation times associated with them: the actual + * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified + * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are + * looking at here. */ + + if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */ + assert(s->rate_limit.begin != 0); + assert(s->rate_limit.interval != 0); + return usec_add(s->rate_limit.begin, s->rate_limit.interval); + } + + /* Otherwise this must be a time event source, if not ratelimited */ + if (EVENT_SOURCE_IS_TIME(s->type)) + return s->time.next; + + return USEC_INFINITY; +} + +static usec_t time_event_source_latest(const sd_event_source *s) { + assert(s); + + if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the + * same, as we should avoid adding additional inaccuracy on an inaccuracy time + * window */ + assert(s->rate_limit.begin != 0); + assert(s->rate_limit.interval != 0); + return usec_add(s->rate_limit.begin, s->rate_limit.interval); + } + + /* Must be a time event source, if not ratelimited */ + if (EVENT_SOURCE_IS_TIME(s->type)) + return usec_add(s->time.next, s->time.accuracy); + + return USEC_INFINITY; +} + +static bool event_source_timer_candidate(const sd_event_source *s) { + assert(s); + + /* Returns true for event sources that either are not pending yet (i.e. 
where it's worth to mark them pending) + * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */ + return !s->pending || s->ratelimited; +} + +static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) { + const sd_event_source *x = a, *y = b; + int r; + + /* Enabled ones first */ + r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF); + if (r != 0) + return r; + + /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */ + r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y)); + if (r != 0) + return r; + + /* Order by time */ + return CMP(time_func(x), time_func(y)); +} + +static int earliest_time_prioq_compare(const void *a, const void *b) { + return time_prioq_compare(a, b, time_event_source_next); +} + +static int latest_time_prioq_compare(const void *a, const void *b) { + return time_prioq_compare(a, b, time_event_source_latest); +} + +static int exit_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + int r; + + assert(x->type == SOURCE_EXIT); + assert(y->type == SOURCE_EXIT); + + /* Enabled ones first */ + r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF); + if (r != 0) + return r; + + /* Lower priority values first */ + return CMP(x->priority, y->priority); +} + +static void free_clock_data(struct clock_data *d) { + assert(d); + assert(d->wakeup == WAKEUP_CLOCK_DATA); + + safe_close(d->fd); + prioq_free(d->earliest); + prioq_free(d->latest); +} + +static sd_event *event_free(sd_event *e) { + sd_event_source *s; + + assert(e); + + e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source); + e->sigint_event_source = sd_event_source_unref(e->sigint_event_source); + + while ((s = e->sources)) { + assert(s->floating); + source_disconnect(s); + sd_event_source_unref(s); + } + + assert(e->n_sources == 0); + + if (e->default_event_ptr) + *(e->default_event_ptr) = 
NULL; + + safe_close(e->epoll_fd); + safe_close(e->watchdog_fd); + + free_clock_data(&e->realtime); + free_clock_data(&e->boottime); + free_clock_data(&e->monotonic); + free_clock_data(&e->realtime_alarm); + free_clock_data(&e->boottime_alarm); + + prioq_free(e->pending); + prioq_free(e->prepare); + prioq_free(e->exit); + + free(e->signal_sources); + hashmap_free(e->signal_data); + + hashmap_free(e->inotify_data); + + hashmap_free(e->child_sources); + set_free(e->post_sources); + + free(e->event_queue); + + return mfree(e); +} + +_public_ int sd_event_new(sd_event** ret) { + sd_event *e; + int r; + + assert_return(ret, -EINVAL); + + e = new(sd_event, 1); + if (!e) + return -ENOMEM; + + *e = (sd_event) { + .n_ref = 1, + .epoll_fd = -EBADF, + .watchdog_fd = -EBADF, + .realtime.wakeup = WAKEUP_CLOCK_DATA, + .realtime.fd = -EBADF, + .realtime.next = USEC_INFINITY, + .boottime.wakeup = WAKEUP_CLOCK_DATA, + .boottime.fd = -EBADF, + .boottime.next = USEC_INFINITY, + .monotonic.wakeup = WAKEUP_CLOCK_DATA, + .monotonic.fd = -EBADF, + .monotonic.next = USEC_INFINITY, + .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA, + .realtime_alarm.fd = -EBADF, + .realtime_alarm.next = USEC_INFINITY, + .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA, + .boottime_alarm.fd = -EBADF, + .boottime_alarm.next = USEC_INFINITY, + .perturb = USEC_INFINITY, + .origin_id = origin_id_query(), + }; + + r = prioq_ensure_allocated(&e->pending, pending_prioq_compare); + if (r < 0) + goto fail; + + e->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (e->epoll_fd < 0) { + r = -errno; + goto fail; + } + + e->epoll_fd = fd_move_above_stdio(e->epoll_fd); + + if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) { + log_debug("Event loop profiling enabled. 
Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.", + special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + e->profile_delays = true; + } + + *ret = e; + return 0; + +fail: + event_free(e); + return r; +} + +/* Define manually so we can add the origin check */ +_public_ sd_event *sd_event_ref(sd_event *e) { + if (!e) + return NULL; + if (event_origin_changed(e)) + return NULL; + + e->n_ref++; + + return e; +} + +_public_ sd_event* sd_event_unref(sd_event *e) { + if (!e) + return NULL; + if (event_origin_changed(e)) + return NULL; + + assert(e->n_ref > 0); + if (--e->n_ref > 0) + return NULL; + + return event_free(e); +} + +#define PROTECT_EVENT(e) \ + _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e); + +_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) { + if (s) + (void) sd_event_source_set_enabled(s, SD_EVENT_OFF); + return sd_event_source_unref(s); +} + +static void source_io_unregister(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_IO); + + if (event_origin_changed(s->event)) + return; + + if (!s->io.registered) + return; + + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0) + log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m", + strna(s->description), event_source_type_to_string(s->type)); + + s->io.registered = false; +} + +static int source_io_register( + sd_event_source *s, + int enabled, + uint32_t events) { + + assert(s); + assert(s->type == SOURCE_IO); + assert(enabled != SD_EVENT_OFF); + + struct epoll_event ev = { + .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0), + .data.ptr = s, + }; + + if (epoll_ctl(s->event->epoll_fd, + s->io.registered ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD, + s->io.fd, &ev) < 0) + return -errno; + + s->io.registered = true; + + return 0; +} + +static void source_child_pidfd_unregister(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_CHILD); + + if (event_origin_changed(s->event)) + return; + + if (!s->child.registered) + return; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0) + log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m", + strna(s->description), event_source_type_to_string(s->type)); + + s->child.registered = false; +} + +static int source_child_pidfd_register(sd_event_source *s, int enabled) { + assert(s); + assert(s->type == SOURCE_CHILD); + assert(enabled != SD_EVENT_OFF); + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + struct epoll_event ev = { + .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0), + .data.ptr = s, + }; + + if (epoll_ctl(s->event->epoll_fd, + s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, + s->child.pidfd, &ev) < 0) + return -errno; + } + + s->child.registered = true; + return 0; +} + +static void source_memory_pressure_unregister(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (event_origin_changed(s->event)) + return; + + if (!s->memory_pressure.registered) + return; + + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0) + log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m", + strna(s->description), event_source_type_to_string(s->type)); + + s->memory_pressure.registered = false; +} + +static int source_memory_pressure_register(sd_event_source *s, int enabled) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(enabled != SD_EVENT_OFF); + + struct epoll_event ev = { + .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT : + (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? 
EPOLLONESHOT : 0)), + .data.ptr = s, + }; + + if (epoll_ctl(s->event->epoll_fd, + s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, + s->memory_pressure.fd, &ev) < 0) + return -errno; + + s->memory_pressure.registered = true; + return 0; +} + +static void source_memory_pressure_add_to_write_list(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (s->memory_pressure.in_write_list) + return; + + LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s); + s->memory_pressure.in_write_list = true; +} + +static void source_memory_pressure_remove_from_write_list(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (!s->memory_pressure.in_write_list) + return; + + LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s); + s->memory_pressure.in_write_list = false; +} + +static clockid_t event_source_type_to_clock(EventSourceType t) { + + switch (t) { + + case SOURCE_TIME_REALTIME: + return CLOCK_REALTIME; + + case SOURCE_TIME_BOOTTIME: + return CLOCK_BOOTTIME; + + case SOURCE_TIME_MONOTONIC: + return CLOCK_MONOTONIC; + + case SOURCE_TIME_REALTIME_ALARM: + return CLOCK_REALTIME_ALARM; + + case SOURCE_TIME_BOOTTIME_ALARM: + return CLOCK_BOOTTIME_ALARM; + + default: + return (clockid_t) -1; + } +} + +static EventSourceType clock_to_event_source_type(clockid_t clock) { + + switch (clock) { + + case CLOCK_REALTIME: + return SOURCE_TIME_REALTIME; + + case CLOCK_BOOTTIME: + return SOURCE_TIME_BOOTTIME; + + case CLOCK_MONOTONIC: + return SOURCE_TIME_MONOTONIC; + + case CLOCK_REALTIME_ALARM: + return SOURCE_TIME_REALTIME_ALARM; + + case CLOCK_BOOTTIME_ALARM: + return SOURCE_TIME_BOOTTIME_ALARM; + + default: + return _SOURCE_EVENT_SOURCE_TYPE_INVALID; + } +} + +static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) { + assert(e); + + switch (t) { + + case SOURCE_TIME_REALTIME: + return &e->realtime; + + case SOURCE_TIME_BOOTTIME: 
+ return &e->boottime; + + case SOURCE_TIME_MONOTONIC: + return &e->monotonic; + + case SOURCE_TIME_REALTIME_ALARM: + return &e->realtime_alarm; + + case SOURCE_TIME_BOOTTIME_ALARM: + return &e->boottime_alarm; + + default: + return NULL; + } +} + +static void event_free_signal_data(sd_event *e, struct signal_data *d) { + assert(e); + + if (!d) + return; + + hashmap_remove(e->signal_data, &d->priority); + safe_close(d->fd); + free(d); +} + +static int event_make_signal_data( + sd_event *e, + int sig, + struct signal_data **ret) { + + struct signal_data *d; + bool added = false; + sigset_t ss_copy; + int64_t priority; + int r; + + assert(e); + + if (event_origin_changed(e)) + return -ECHILD; + + if (e->signal_sources && e->signal_sources[sig]) + priority = e->signal_sources[sig]->priority; + else + priority = SD_EVENT_PRIORITY_NORMAL; + + d = hashmap_get(e->signal_data, &priority); + if (d) { + if (sigismember(&d->sigset, sig) > 0) { + if (ret) + *ret = d; + return 0; + } + } else { + d = new(struct signal_data, 1); + if (!d) + return -ENOMEM; + + *d = (struct signal_data) { + .wakeup = WAKEUP_SIGNAL_DATA, + .fd = -EBADF, + .priority = priority, + }; + + r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d); + if (r < 0) { + free(d); + return r; + } + + added = true; + } + + ss_copy = d->sigset; + assert_se(sigaddset(&ss_copy, sig) >= 0); + + r = signalfd(d->fd >= 0 ? 
d->fd : -1, /* the first arg must be -1 or a valid signalfd */ + &ss_copy, + SFD_NONBLOCK|SFD_CLOEXEC); + if (r < 0) { + r = -errno; + goto fail; + } + + d->sigset = ss_copy; + + if (d->fd >= 0) { + if (ret) + *ret = d; + return 0; + } + + d->fd = fd_move_above_stdio(r); + + struct epoll_event ev = { + .events = EPOLLIN, + .data.ptr = d, + }; + + if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) { + r = -errno; + goto fail; + } + + if (ret) + *ret = d; + + return 0; + +fail: + if (added) + event_free_signal_data(e, d); + + return r; +} + +static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) { + assert(e); + assert(d); + + /* Turns off the specified signal in the signal data + * object. If the signal mask of the object becomes empty that + * way removes it. */ + + if (sigismember(&d->sigset, sig) == 0) + return; + + assert_se(sigdelset(&d->sigset, sig) >= 0); + + if (sigisemptyset(&d->sigset)) { + /* If all the mask is all-zero we can get rid of the structure */ + event_free_signal_data(e, d); + return; + } + + if (event_origin_changed(e)) + return; + + assert(d->fd >= 0); + + if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0) + log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m"); +} + +static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) { + struct signal_data *d; + static const int64_t zero_priority = 0; + + assert(e); + + /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it, + * and possibly drop the signalfd for it. 
*/ + + if (sig == SIGCHLD && + e->n_online_child_sources > 0) + return; + + if (e->signal_sources && + e->signal_sources[sig] && + event_source_is_online(e->signal_sources[sig])) + return; + + /* + * The specified signal might be enabled in three different queues: + * + * 1) the one that belongs to the priority passed (if it is non-NULL) + * 2) the one that belongs to the priority of the event source of the signal (if there is one) + * 3) the 0 priority (to cover the SIGCHLD case) + * + * Hence, let's remove it from all three here. + */ + + if (priority) { + d = hashmap_get(e->signal_data, priority); + if (d) + event_unmask_signal_data(e, d, sig); + } + + if (e->signal_sources && e->signal_sources[sig]) { + d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority); + if (d) + event_unmask_signal_data(e, d, sig); + } + + d = hashmap_get(e->signal_data, &zero_priority); + if (d) + event_unmask_signal_data(e, d, sig); +} + +static void event_source_pp_prioq_reshuffle(sd_event_source *s) { + assert(s); + + /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when + * they are enabled/disabled or marked pending and such. */ + + if (s->pending) + prioq_reshuffle(s->event->pending, s, &s->pending_index); + + if (s->prepare) + prioq_reshuffle(s->event->prepare, s, &s->prepare_index); +} + +static void event_source_time_prioq_reshuffle(sd_event_source *s) { + struct clock_data *d; + + assert(s); + + /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy, + * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered + * properly again. */ + + if (s->ratelimited) + d = &s->event->monotonic; + else if (EVENT_SOURCE_IS_TIME(s->type)) + assert_se(d = event_get_clock_data(s->event, s->type)); + else + return; /* no-op for an event source which is neither a timer nor ratelimited. 
*/ + + prioq_reshuffle(d->earliest, s, &s->earliest_index); + prioq_reshuffle(d->latest, s, &s->latest_index); + d->needs_rearm = true; +} + +static void event_source_time_prioq_remove( + sd_event_source *s, + struct clock_data *d) { + + assert(s); + assert(d); + + prioq_remove(d->earliest, s, &s->earliest_index); + prioq_remove(d->latest, s, &s->latest_index); + s->earliest_index = s->latest_index = PRIOQ_IDX_NULL; + d->needs_rearm = true; +} + +static void source_disconnect(sd_event_source *s) { + sd_event *event; + int r; + + assert(s); + + if (!s->event) + return; + + assert(s->event->n_sources > 0); + + switch (s->type) { + + case SOURCE_IO: + if (s->io.fd >= 0) + source_io_unregister(s); + + break; + + case SOURCE_TIME_REALTIME: + case SOURCE_TIME_BOOTTIME: + case SOURCE_TIME_MONOTONIC: + case SOURCE_TIME_REALTIME_ALARM: + case SOURCE_TIME_BOOTTIME_ALARM: + /* Only remove this event source from the time event source here if it is not ratelimited. If + * it is ratelimited, we'll remove it below, separately. Why? 
Because the clock used might + * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */ + + if (!s->ratelimited) { + struct clock_data *d; + assert_se(d = event_get_clock_data(s->event, s->type)); + event_source_time_prioq_remove(s, d); + } + + break; + + case SOURCE_SIGNAL: + if (s->signal.sig > 0) { + + if (s->event->signal_sources) + s->event->signal_sources[s->signal.sig] = NULL; + + event_gc_signal_data(s->event, &s->priority, s->signal.sig); + + if (s->signal.unblock) { + sigset_t new_ss; + + if (sigemptyset(&new_ss) < 0) + log_debug_errno(errno, "Failed to reset signal set, ignoring: %m"); + else if (sigaddset(&new_ss, s->signal.sig) < 0) + log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig); + else { + r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL); + if (r != 0) + log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig); + } + } + } + + break; + + case SOURCE_CHILD: + if (event_origin_changed(s->event)) + s->child.process_owned = false; + + if (s->child.pid > 0) { + if (event_source_is_online(s)) { + assert(s->event->n_online_child_sources > 0); + s->event->n_online_child_sources--; + } + + (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid)); + } + + if (EVENT_SOURCE_WATCH_PIDFD(s)) + source_child_pidfd_unregister(s); + else + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + + break; + + case SOURCE_DEFER: + /* nothing */ + break; + + case SOURCE_POST: + set_remove(s->event->post_sources, s); + break; + + case SOURCE_EXIT: + prioq_remove(s->event->exit, s, &s->exit.prioq_index); + break; + + case SOURCE_INOTIFY: { + struct inode_data *inode_data; + + inode_data = s->inotify.inode_data; + if (inode_data) { + struct inotify_data *inotify_data; + assert_se(inotify_data = inode_data->inotify_data); + + /* Detach this event source from the inode object */ + LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s); + 
s->inotify.inode_data = NULL; + + if (s->pending) { + assert(inotify_data->n_pending > 0); + inotify_data->n_pending--; + } + + /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is + * continued to being watched. That's because inotify doesn't really have an API for that: we + * can only change watch masks with access to the original inode either by fd or by path. But + * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd + * continuously and keeping the mount busy which we can't really do. We could reconstruct the + * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed + * there), but given the need for open_by_handle_at() which is privileged and not universally + * available this would be quite an incomplete solution. Hence we go the other way, leave the + * mask set, even if it is not minimized now, and ignore all events we aren't interested in + * anymore after reception. Yes, this sucks, but … Linux … */ + + /* Maybe release the inode data (and its inotify) */ + event_gc_inode_data(s->event, inode_data); + } + + break; + } + + case SOURCE_MEMORY_PRESSURE: + source_memory_pressure_remove_from_write_list(s); + source_memory_pressure_unregister(s); + break; + + default: + assert_not_reached(); + } + + if (s->pending) + prioq_remove(s->event->pending, s, &s->pending_index); + + if (s->prepare) + prioq_remove(s->event->prepare, s, &s->prepare_index); + + if (s->ratelimited) + event_source_time_prioq_remove(s, &s->event->monotonic); + + event = TAKE_PTR(s->event); + LIST_REMOVE(sources, event->sources, s); + event->n_sources--; + + /* Note that we don't invalidate the type here, since we still need it in order to close the fd or + * pidfd associated with this event source, which we'll do only on source_free(). 
*/ + + if (!s->floating) + sd_event_unref(event); +} + +static sd_event_source* source_free(sd_event_source *s) { + assert(s); + + source_disconnect(s); + + if (s->type == SOURCE_IO && s->io.owned) + s->io.fd = safe_close(s->io.fd); + + if (s->type == SOURCE_CHILD) { + /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */ + + if (s->child.process_owned) { + + if (!s->child.exited) { + bool sent = false; + + if (s->child.pidfd >= 0) { + if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) { + if (errno == ESRCH) /* Already dead */ + sent = true; + else if (!ERRNO_IS_NOT_SUPPORTED(errno)) + log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m", + s->child.pid); + } else + sent = true; + } + + if (!sent) + if (kill(s->child.pid, SIGKILL) < 0) + if (errno != ESRCH) /* Already dead */ + log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m", + s->child.pid); + } + + if (!s->child.waited) { + siginfo_t si = {}; + + /* Reap the child if we can */ + (void) waitid(P_PID, s->child.pid, &si, WEXITED); + } + } + + if (s->child.pidfd_owned) + s->child.pidfd = safe_close(s->child.pidfd); + } + + if (s->type == SOURCE_MEMORY_PRESSURE) { + s->memory_pressure.fd = safe_close(s->memory_pressure.fd); + s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer); + } + + if (s->destroy_callback) + s->destroy_callback(s->userdata); + + free(s->description); + return mfree(s); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free); + +static int source_set_pending(sd_event_source *s, bool b) { + int r; + + assert(s); + assert(s->type != SOURCE_EXIT); + + if (s->pending == b) + return 0; + + s->pending = b; + + if (b) { + s->pending_iteration = s->event->iteration; + + r = prioq_put(s->event->pending, s, &s->pending_index); + if (r < 0) { + s->pending = false; + return r; + } + } else + 
assert_se(prioq_remove(s->event->pending, s, &s->pending_index)); + + if (EVENT_SOURCE_IS_TIME(s->type)) + event_source_time_prioq_reshuffle(s); + + if (s->type == SOURCE_SIGNAL && !b) { + struct signal_data *d; + + d = hashmap_get(s->event->signal_data, &s->priority); + if (d && d->current == s) + d->current = NULL; + } + + if (s->type == SOURCE_INOTIFY) { + + assert(s->inotify.inode_data); + assert(s->inotify.inode_data->inotify_data); + + if (b) + s->inotify.inode_data->inotify_data->n_pending ++; + else { + assert(s->inotify.inode_data->inotify_data->n_pending > 0); + s->inotify.inode_data->inotify_data->n_pending --; + } + } + + return 1; +} + +static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) { + + /* Let's allocate exactly what we need. Note that the difference of the smallest event source + * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache + * lines. */ + static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = { + [SOURCE_IO] = endoffsetof_field(sd_event_source, io), + [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time), + [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time), + [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time), + [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time), + [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time), + [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal), + [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child), + [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer), + [SOURCE_POST] = endoffsetof_field(sd_event_source, post), + [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit), + [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify), + [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure), + }; + + sd_event_source *s; + + assert(e); + assert(type >= 0); + assert(type < 
_SOURCE_EVENT_SOURCE_TYPE_MAX); + assert(size_table[type] > 0); + + s = malloc0(size_table[type]); + if (!s) + return NULL; + /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full + * size, even if we only allocate the initial part we need. */ + s = expand_to_usable(s, sizeof(sd_event_source)); + + /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger + * than what we allocated here. */ + s->n_ref = 1; + s->event = e; + s->floating = floating; + s->type = type; + s->pending_index = PRIOQ_IDX_NULL; + s->prepare_index = PRIOQ_IDX_NULL; + + if (!floating) + sd_event_ref(e); + + LIST_PREPEND(sources, e->sources, s); + e->n_sources++; + + return s; +} + +static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + assert(s); + + return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata)); +} + +_public_ int sd_event_add_io( + sd_event *e, + sd_event_source **ret, + int fd, + uint32_t events, + sd_event_io_handler_t callback, + void *userdata) { + + _cleanup_(source_freep) sd_event_source *s = NULL; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(fd >= 0, -EBADF); + assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + callback = io_exit_callback; + + s = source_new(e, !ret, SOURCE_IO); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->io.fd = fd; + s->io.events = events; + s->io.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + + r = source_io_register(s, s->enabled, events); + if (r < 0) + return r; + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} + +static void initialize_perturb(sd_event *e) { + sd_id128_t id = {}; + + /* When we sleep for longer, 
we try to realign the wakeup to the same time within each + * minute/second/250ms, so that events all across the system can be coalesced into a single CPU + * wakeup. However, let's take some system-specific randomness for this value, so that in a network + * of systems with synced clocks timer events are distributed a bit. Here, we calculate a + * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */ + + if (_likely_(e->perturb != USEC_INFINITY)) + return; + + if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0) + e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE; + else + e->perturb = 0; /* This is a super early process without /proc and /etc ?? */ +} + +static int event_setup_timer_fd( + sd_event *e, + struct clock_data *d, + clockid_t clock) { + + assert(e); + assert(d); + + if (_likely_(d->fd >= 0)) + return 0; + + _cleanup_close_ int fd = -EBADF; + + fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC); + if (fd < 0) + return -errno; + + fd = fd_move_above_stdio(fd); + + struct epoll_event ev = { + .events = EPOLLIN, + .data.ptr = d, + }; + + if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) + return -errno; + + d->fd = TAKE_FD(fd); + return 0; +} + +static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) { + assert(s); + + return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata)); +} + +static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) { + int r; + + assert(d); + + if (d->fd < 0) { + r = event_setup_timer_fd(e, d, clock); + if (r < 0) + return r; + } + + r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare); + if (r < 0) + return r; + + r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare); + if (r < 0) + return r; + + return 0; +} + +static int event_source_time_prioq_put( + sd_event_source *s, + struct clock_data *d) { + + int r; + + assert(s); + assert(d); + 
assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type)); + + r = prioq_put(d->earliest, s, &s->earliest_index); + if (r < 0) + return r; + + r = prioq_put(d->latest, s, &s->latest_index); + if (r < 0) { + assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0); + s->earliest_index = PRIOQ_IDX_NULL; + return r; + } + + d->needs_rearm = true; + return 0; +} + +_public_ int sd_event_add_time( + sd_event *e, + sd_event_source **ret, + clockid_t clock, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata) { + + EventSourceType type; + _cleanup_(source_freep) sd_event_source *s = NULL; + struct clock_data *d; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(accuracy != UINT64_MAX, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */ + return -EOPNOTSUPP; + + type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */ + if (type < 0) + return -EOPNOTSUPP; + + if (!callback) + callback = time_exit_callback; + + assert_se(d = event_get_clock_data(e, type)); + + r = setup_clock_data(e, d, clock); + if (r < 0) + return r; + + s = source_new(e, !ret, type); + if (!s) + return -ENOMEM; + + s->time.next = usec; + s->time.accuracy = accuracy == 0 ? 
DEFAULT_ACCURACY_USEC : accuracy; + s->time.callback = callback; + s->earliest_index = s->latest_index = PRIOQ_IDX_NULL; + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + r = event_source_time_prioq_put(s, d); + if (r < 0) + return r; + /* Success: return the source if the caller asked for it, and clear the local pointer via TAKE_PTR() */ + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Like sd_event_add_time(), but usec is an offset from the time sd_event_now() reports for the clock. Returns -EOVERFLOW if the resulting absolute time would reach USEC_INFINITY. */ +_public_ int sd_event_add_time_relative( + sd_event *e, + sd_event_source **ret, + clockid_t clock, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata) { + + usec_t t; + int r; + + /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and + * checks for overflow. */ + + r = sd_event_now(e, clock, &t); + if (r < 0) + return r; + + if (usec >= USEC_INFINITY - t) + return -EOVERFLOW; + + return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata); +} +/* Fallback signal handler, installed by sd_event_add_signal() when no callback is given: requests event loop exit, with userdata decoded through PTR_TO_INT() as the exit code. */ +static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + assert(s); + + return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata)); +} +/* Adds an event source for signal sig. If SD_EVENT_SIGNAL_PROCMASK is ORed into sig, the signal is blocked by this call; otherwise it must already be blocked, or -EBUSY is returned. Only one source per signal may exist (-EBUSY otherwise). A NULL callback installs signal_exit_callback. Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_signal( + sd_event *e, + sd_event_source **ret, + int sig, + sd_event_signal_handler_t callback, + void *userdata) { + + _cleanup_(source_freep) sd_event_source *s = NULL; + struct signal_data *d; + sigset_t new_ss; + bool block_it; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + /* Let's make sure our special flag stays outside of the valid signal range */ + assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK); + + if (sig & SD_EVENT_SIGNAL_PROCMASK) { + sig &= ~SD_EVENT_SIGNAL_PROCMASK; + assert_return(SIGNAL_VALID(sig), -EINVAL); + + block_it = true; + } else { + assert_return(SIGNAL_VALID(sig), -EINVAL); + + r = signal_is_blocked(sig); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; + + block_it = false; + } + + if (!callback) + callback = 
signal_exit_callback; + /* The per-signal lookup table is allocated on first use; an existing entry means the signal already has a source */ + if (!e->signal_sources) { + e->signal_sources = new0(sd_event_source*, _NSIG); + if (!e->signal_sources) + return -ENOMEM; + } else if (e->signal_sources[sig]) + return -EBUSY; + + s = source_new(e, !ret, SOURCE_SIGNAL); + if (!s) + return -ENOMEM; + + s->signal.sig = sig; + s->signal.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + + e->signal_sources[sig] = s; + + if (block_it) { + sigset_t old_ss; + + if (sigemptyset(&new_ss) < 0) + return -errno; + + if (sigaddset(&new_ss, sig) < 0) + return -errno; + + r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss); + if (r != 0) + return -r; + /* s->signal.unblock is set only if the signal was not part of the previous mask, i.e. if this call is what blocked it */ + r = sigismember(&old_ss, sig); + if (r < 0) + return -errno; + + s->signal.unblock = !r; + } else + s->signal.unblock = false; + + r = event_make_signal_data(e, sig, &d); + if (r < 0) { + if (s->signal.unblock) + (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL); + + return r; + } + + /* Use the signal name as description for the event source by default */ + (void) sd_event_source_set_description(s, signal_to_string(sig)); + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Fallback child handler, installed when no callback is given: requests event loop exit, with userdata decoded through PTR_TO_INT() as the exit code. */ +static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) { + assert(s); + + return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata)); +} +/* Tells whether pidfds shall be used: true unless getenv_bool_secure("SYSTEMD_PIDFD") returns exactly 0. */ +static bool shall_use_pidfd(void) { + /* Mostly relevant for debugging, i.e. 
this is used in test-event.c to test the event loop once with and once without pidfd */ + return getenv_bool_secure("SYSTEMD_PIDFD") != 0; +} +/* Adds an event source watching child process pid for the state changes selected by options (any non-empty combination of WEXITED, WSTOPPED, WCONTINUED). pid must be greater than 1. SIGCHLD must be blocked by the caller (-EBUSY otherwise), and only one source per PID may exist (-EBUSY). A NULL callback installs child_exit_callback. Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_child( + sd_event *e, + sd_event_source **ret, + pid_t pid, + int options, + sd_event_child_handler_t callback, + void *userdata) { + + _cleanup_(source_freep) sd_event_source *s = NULL; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(pid > 1, -EINVAL); + assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL); + assert_return(options != 0, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + callback = child_exit_callback; + + if (e->n_online_child_sources == 0) { + /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available, + * for compatibility with pre-pidfd and because we don't want to reap the child processes + * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to + * take effect. + * + * (As an optimization we only do this check on the first child event source created.) */ + r = signal_is_blocked(SIGCHLD); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; + } + + r = hashmap_ensure_allocated(&e->child_sources, NULL); + if (r < 0) + return r; + /* child_sources is keyed by PID, so a second source for the same PID is refused */ + if (hashmap_contains(e->child_sources, PID_TO_PTR(pid))) + return -EBUSY; + + s = source_new(e, !ret, SOURCE_CHILD); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->child.options = options; + s->child.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we + * pin the PID, and make regular waitid() handling race-free. 
*/ + + if (shall_use_pidfd()) { + s->child.pidfd = pidfd_open(pid, 0); + if (s->child.pidfd < 0) { + /* Propagate errors unless the syscall is not supported or blocked */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else + s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */ + } else + s->child.pidfd = -EBADF; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* We have a pidfd and we only want to watch for exit */ + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) + return r; + + } else { + /* We have no pidfd or we shall wait for some other event than WEXITED */ + r = event_make_signal_data(e, SIGCHLD, NULL); + if (r < 0) + return r; + + e->need_process_child = true; + } + + r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s); + if (r < 0) + return r; + + /* These must be done after everything succeeds. */ + s->child.pid = pid; + e->n_online_child_sources++; + + if (ret) + *ret = s; + TAKE_PTR(s); + return 0; +} +/* Like sd_event_add_child(), but the child is identified by a caller-supplied pidfd instead of a PID. The PID is looked up from the pidfd via pidfd_get_pid(). SIGCHLD must be blocked by the caller (-EBUSY otherwise), and only one source per PID may exist (-EBUSY). A NULL callback installs child_exit_callback. Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_child_pidfd( + sd_event *e, + sd_event_source **ret, + int pidfd, + int options, + sd_event_child_handler_t callback, + void *userdata) { + + + _cleanup_(source_freep) sd_event_source *s = NULL; + pid_t pid; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(pidfd >= 0, -EBADF); + assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL); + assert_return(options != 0, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + callback = child_exit_callback; + /* Same SIGCHLD precondition as in sd_event_add_child(), checked only while no child source is online yet */ + if (e->n_online_child_sources == 0) { + r = signal_is_blocked(SIGCHLD); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; + } + + r = hashmap_ensure_allocated(&e->child_sources, NULL); + if (r < 0) + return r; + /* Resolve the PID behind the pidfd, since child_sources is keyed by PID */ + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + if (hashmap_contains(e->child_sources, PID_TO_PTR(pid))) + return -EBUSY; 
+ + s = source_new(e, !ret, SOURCE_CHILD); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->child.pidfd = pidfd; + s->child.options = options; + s->child.callback = callback; + s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */ + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* We only want to watch for WEXITED */ + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) + return r; + } else { + /* We shall wait for some other event than WEXITED */ + r = event_make_signal_data(e, SIGCHLD, NULL); + if (r < 0) + return r; + + e->need_process_child = true; + } + + r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s); + if (r < 0) + return r; + + /* These must be done after everything succeeds, in the same order as sd_event_add_child(). Presumably the + * cleanup run through source_freep keys off child.pid and the online counter, so recording them before a + * step that can still fail would undo bookkeeping that never happened -- TODO confirm against + * source_disconnect(). */ + s->child.pid = pid; + e->n_online_child_sources++; + + if (ret) + *ret = s; + TAKE_PTR(s); + return 0; +} +/* Fallback handler for defer and post sources, installed when no callback is given: requests event loop exit, with userdata decoded through PTR_TO_INT() as the exit code. */ +static int generic_exit_callback(sd_event_source *s, void *userdata) { + assert(s); + + return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata)); +} +/* Adds a defer event source. It is created as SD_EVENT_ONESHOT and marked pending right away. A NULL callback installs generic_exit_callback. Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_defer( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + _cleanup_(source_freep) sd_event_source *s = NULL; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + callback = generic_exit_callback; + + s = source_new(e, !ret, SOURCE_DEFER); + if (!s) + return -ENOMEM; + + s->defer.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + r = source_set_pending(s, true); + if (r < 0) + return r; + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Adds a post event source. It is created as SD_EVENT_ON and tracked in e->post_sources. A NULL callback installs generic_exit_callback. Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_post( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + _cleanup_(source_freep) sd_event_source *s = NULL; + int r; + + 
assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + callback = generic_exit_callback; + + s = source_new(e, !ret, SOURCE_POST); + if (!s) + return -ENOMEM; + + s->post.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + + r = set_ensure_put(&e->post_sources, NULL, s); + if (r < 0) + return r; + assert(r > 0); /* presumably 0 would mean s was already in the set, which cannot be for a fresh source -- confirm against set_ensure_put() */ + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Adds an exit event source, created as SD_EVENT_ONESHOT and queued in the e->exit priority queue. Unlike the other sd_event_add_*() calls here, callback is mandatory (-EINVAL if NULL). Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_exit( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + _cleanup_(source_freep) sd_event_source *s = NULL; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(callback, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + r = prioq_ensure_allocated(&e->exit, exit_prioq_compare); + if (r < 0) + return r; + + s = source_new(e, !ret, SOURCE_EXIT); + if (!s) + return -ENOMEM; + + s->exit.callback = callback; + s->userdata = userdata; + s->exit.prioq_index = PRIOQ_IDX_NULL; + s->enabled = SD_EVENT_ONESHOT; + + r = prioq_put(s->event->exit, s, &s->exit.prioq_index); + if (r < 0) + return r; + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Trims cached memory, logs how long that took and always returns 0. */ +_public_ int sd_event_trim_memory(void) { + int r; + + /* A default implementation of a memory pressure callback. Simply releases our own allocation caches + * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a + * NULL callback parameter. 
*/ + + log_debug("Memory pressure event, trimming malloc() memory."); + +#if HAVE_GENERIC_MALLINFO + generic_mallinfo before_mallinfo = generic_mallinfo_get(); +#endif + + usec_t before_timestamp = now(CLOCK_MONOTONIC); + hashmap_trim_pools(); + r = malloc_trim(0); + usec_t after_timestamp = now(CLOCK_MONOTONIC); + + if (r > 0) + log_debug("Successfully trimmed some memory."); + else + log_debug("Couldn't trim any memory."); + + usec_t period = after_timestamp - before_timestamp; + +#if HAVE_GENERIC_MALLINFO + generic_mallinfo after_mallinfo = generic_mallinfo_get(); + size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) + + LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena); + log_struct(LOG_DEBUG, + LOG_MESSAGE("Memory trimming took %s, returned %s to OS.", + FORMAT_TIMESPAN(period, 0), + FORMAT_BYTES(l)), + "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR, + "TRIMMED_BYTES=%zu", l, + "TRIMMED_USEC=" USEC_FMT, period); +#else + log_struct(LOG_DEBUG, + LOG_MESSAGE("Memory trimming took %s.", + FORMAT_TIMESPAN(period, 0)), + "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR, + "TRIMMED_USEC=" USEC_FMT, period); +#endif + + return 0; +} +/* Default handler for memory pressure sources, installed when no callback is given: calls sd_event_trim_memory(), ignores its result and returns 0. */ +static int memory_pressure_callback(sd_event_source *s, void *userdata) { + assert(s); + + sd_event_trim_memory(); + return 0; +} +/* Adds a memory pressure event source, created as SD_EVENT_ON. The object to watch is taken from $MEMORY_PRESSURE_WATCH if set, and is otherwise picked by this function. A NULL callback installs memory_pressure_callback. Returns 0 on success, a negative errno otherwise. */ +_public_ int sd_event_add_memory_pressure( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + _cleanup_free_ char *w = NULL; + _cleanup_(source_freep) sd_event_source *s = NULL; + _cleanup_close_ int path_fd = -EBADF, fd = -EBADF; + _cleanup_free_ void *write_buffer = NULL; + const char *watch, *watch_fallback = NULL, *env; + size_t write_buffer_size = 0; + struct stat st; + uint32_t events; + bool locked; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + 
callback = memory_pressure_callback; + + s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->memory_pressure.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + s->memory_pressure.fd = -EBADF; + /* An explicit $MEMORY_PRESSURE_WATCH takes precedence over picking a path ourselves. An empty value or /dev/null turns the logic off (-EHOSTDOWN), a relative or non-normalized path is refused (-EBADMSG). */ + env = secure_getenv("MEMORY_PRESSURE_WATCH"); + if (env) { + if (isempty(env) || path_equal(env, "/dev/null")) + return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH."); + + if (!path_is_absolute(env) || !path_is_normalized(env)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env); + + watch = env; + /* $MEMORY_PRESSURE_WRITE is optional and base64 encoded; if unset write_buffer_size stays 0 */ + env = secure_getenv("MEMORY_PRESSURE_WRITE"); + if (env) { + r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size); + if (r < 0) + return r; + } + + locked = true; + } else { + + r = is_pressure_supported(); + if (r < 0) + return r; + if (r == 0) + return -EOPNOTSUPP; + + /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on + * the system wide pressure if for some reason we cannot (which could be: memory controller + * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll + * only use the system-wide logic. 
*/ + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) + watch = "/proc/pressure/memory"; + else { + _cleanup_free_ char *cg = NULL; + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg); + if (r < 0) + return r; + + w = path_join("/sys/fs/cgroup", cg, "memory.pressure"); + if (!w) + return -ENOMEM; + + watch = w; + watch_fallback = "/proc/pressure/memory"; + } + + /* Android uses three levels in its userspace low memory killer logic: + * some 70000 1000000 + * some 100000 1000000 + * full 70000 1000000 + * + * GNOME's low memory monitor uses: + * some 70000 1000000 + * some 100000 1000000 + * full 100000 1000000 + * + * We'll default to the middle level that both agree on. Except we do it on a 2s window + * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the + * kernel will allow us to do unprivileged, also in the future. */ + if (asprintf((char**) &write_buffer, + "%s " USEC_FMT " " USEC_FMT, + MEMORY_PRESSURE_DEFAULT_TYPE, + MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC, + MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0) + return -ENOMEM; + /* The size counts the terminating NUL byte as well */ + write_buffer_size = strlen(write_buffer) + 1; + locked = false; + } + /* Open with O_PATH first, so that the object can be inspected with fstat() before it is opened for real */ + path_fd = open(watch, O_PATH|O_CLOEXEC); + if (path_fd < 0) { + if (errno != ENOENT) + return -errno; + + /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as + * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and + * the PSI service apparently is not supported) */ + if (!watch_fallback) + return locked ? -ENOENT : -EOPNOTSUPP; + + path_fd = open(watch_fallback, O_PATH|O_CLOEXEC); + if (path_fd < 0) { + if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? 
*/ + return -EOPNOTSUPP; + return -errno; + } + } + /* What to open and which epoll events to wait for depends on the file type: sockets are connected to and watched for EPOLLIN, procfs/cgroupfs regular files for EPOLLPRI, fifos and character devices for EPOLLIN; everything else is refused */ + if (fstat(path_fd, &st) < 0) + return -errno; + + if (S_ISSOCK(st.st_mode)) { + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + r = connect_unix_path(fd, path_fd, NULL); + if (r < 0) + return r; + + events = EPOLLIN; + + } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) { + fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return fd; + + if (S_ISREG(st.st_mode)) { + struct statfs sfs; + + /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */ + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) && + !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC)) + return -ENOTTY; + + events = EPOLLPRI; + } else + /* For fifos and char devices just watch for EPOLLIN */ + events = EPOLLIN; + + } else if (S_ISDIR(st.st_mode)) + return -EISDIR; + else + return -EBADF; + /* From here on the fd and the write buffer belong to the event source */ + s->memory_pressure.fd = TAKE_FD(fd); + s->memory_pressure.write_buffer = TAKE_PTR(write_buffer); + s->memory_pressure.write_buffer_size = write_buffer_size; + s->memory_pressure.events = events; + s->memory_pressure.locked = locked; + + /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the + * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the + * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure + * event sources on which writes must be executed before the first event loop iteration is + * executed. (We could also write the data here, right away, but we want to give the caller the + * freedom to call sd_event_source_set_memory_pressure_type() and + * sd_event_source_set_memory_pressure_rate() before we write it.) 
*/ + + if (s->memory_pressure.write_buffer_size > 0) + source_memory_pressure_add_to_write_list(s); + else { + r = source_memory_pressure_register(s, s->enabled); + if (r < 0) + return r; + } + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Frees inotify data object d (a NULL d is a no-op): drops it from the buffered list if it holds buffered data, removes it from e->inotify_data, takes its fd out of the epoll and closes it. Both of its hashmaps must be empty already. */ +static void event_free_inotify_data(sd_event *e, struct inotify_data *d) { + assert(e); + + if (!d) + return; + + assert(hashmap_isempty(d->inodes)); + assert(hashmap_isempty(d->wd)); + + if (d->buffer_filled > 0) + LIST_REMOVE(buffered, e->buffered_inotify_data_list, d); + + hashmap_free(d->inodes); + hashmap_free(d->wd); + + assert_se(hashmap_remove(e->inotify_data, &d->priority) == d); + + if (d->fd >= 0) { + if (!event_origin_changed(e) && + epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0) + log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m"); + + safe_close(d->fd); + } + free(d); +} +/* Looks up the inotify data object for the given priority, creating it (inotify fd, hashmap entry, epoll registration) if there is none yet. Returns 0 if an existing object was found, 1 if a new one was created, a negative errno on failure. */ +static int event_make_inotify_data( + sd_event *e, + int64_t priority, + struct inotify_data **ret) { + + _cleanup_close_ int fd = -EBADF; + struct inotify_data *d; + int r; + + assert(e); + + d = hashmap_get(e->inotify_data, &priority); + if (d) { + if (ret) + *ret = d; + return 0; + } + /* NOTE(review): O_CLOEXEC is passed where inotify_init1() documents IN_CLOEXEC -- presumably the same value, confirm */ + fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC); + if (fd < 0) + return -errno; + + fd = fd_move_above_stdio(fd); + + d = new(struct inotify_data, 1); + if (!d) + return -ENOMEM; + + *d = (struct inotify_data) { + .wakeup = WAKEUP_INOTIFY_DATA, + .fd = TAKE_FD(fd), + .priority = priority, + }; + /* The hashmap key is the priority field embedded in d itself */ + r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d); + if (r < 0) { + d->fd = safe_close(d->fd); + free(d); + return r; + } + + struct epoll_event ev = { + .events = EPOLLIN, + .data.ptr = d, + }; + + if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) { + r = -errno; + d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise + * remove the fd from the epoll first, which we don't want as we couldn't + * add it in the first place. 
*/ + event_free_inotify_data(e, d); + return r; + } + + if (ret) + *ret = d; + + return 1; +} +/* Orders inode data objects by device number first, inode number second. */ +static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) { + int r; + + assert(x); + assert(y); + + r = CMP(x->dev, y->dev); + if (r != 0) + return r; + + return CMP(x->ino, y->ino); +} +/* Hashes the same two fields inode_data_compare() looks at: device and inode number. */ +static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) { + assert(d); + + siphash24_compress(&d->dev, sizeof(d->dev), state); + siphash24_compress(&d->ino, sizeof(d->ino), state); +} + +DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare); +/* Frees inode data object d (a NULL d is a no-op): closes its fd, removes its inotify watch and unlinks it from the hashmaps of its inotify data object. No event source may reference d anymore. */ +static void event_free_inode_data( + sd_event *e, + struct inode_data *d) { + + assert(e); + + if (!d) + return; + + assert(!d->event_sources); + /* An open fd implies d is linked into the to_close list, see the LIST_PREPEND() calls where the fd is set */ + if (d->fd >= 0) { + LIST_REMOVE(to_close, e->inode_data_to_close_list, d); + safe_close(d->fd); + } + + if (d->inotify_data) { + + if (d->wd >= 0) { + if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) { + /* So here's a problem. At the time this runs the watch descriptor might already be + * invalidated, because an IN_IGNORED event might be queued right the moment we enter + * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very + * likely case to happen. */ + + if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL) + log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd); + } + + assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d); + } + + assert_se(hashmap_remove(d->inotify_data->inodes, d) == d); + } + + free(d); +} +/* Frees inotify data object d unless it still tracks inodes or is marked busy (a NULL d is a no-op). */ +static void event_gc_inotify_data( + sd_event *e, + struct inotify_data *d) { + + assert(e); + + /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch + * any inode with it anymore, which in turn happens if no event source of this priority is interested + * in any inode any longer. 
That said, we maintain an extra busy counter: if non-zero we'll delay GC + * (under the expectation that the GC is called again once the counter is decremented). */ + + if (!d) + return; + + if (!hashmap_isempty(d->inodes)) + return; + + if (d->n_busy > 0) + return; + + event_free_inotify_data(e, d); +} +/* Frees inode data object d if no event source references it anymore, then gives the inotify data object it belonged to a chance to be collected too (a NULL d is a no-op). */ +static void event_gc_inode_data( + sd_event *e, + struct inode_data *d) { + + struct inotify_data *inotify_data; + + assert(e); + + if (!d) + return; + + if (d->event_sources) + return; + /* Save the pointer first, d is gone after event_free_inode_data() */ + inotify_data = d->inotify_data; + event_free_inode_data(e, d); + + event_gc_inotify_data(e, inotify_data); +} +/* Looks up the inode data object for (dev, ino) within inotify_data, creating it if there is none yet. A new object starts without watch descriptor (wd = -1) and without fd. Returns 0 if an existing object was found, 1 if a new one was created, a negative errno on failure. */ +static int event_make_inode_data( + sd_event *e, + struct inotify_data *inotify_data, + dev_t dev, + ino_t ino, + struct inode_data **ret) { + + struct inode_data *d, key; + int r; + + assert(e); + assert(inotify_data); + /* Stack-allocated lookup key carrying just the two fields that hash and compare functions use */ + key = (struct inode_data) { + .ino = ino, + .dev = dev, + }; + + d = hashmap_get(inotify_data->inodes, &key); + if (d) { + if (ret) + *ret = d; + + return 0; + } + + r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops); + if (r < 0) + return r; + + d = new(struct inode_data, 1); + if (!d) + return -ENOMEM; + + *d = (struct inode_data) { + .dev = dev, + .ino = ino, + .wd = -1, + .fd = -EBADF, + .inotify_data = inotify_data, + }; + /* The object serves as its own hashmap key */ + r = hashmap_put(inotify_data->inodes, d, d); + if (r < 0) { + free(d); + return r; + } + + if (ret) + *ret = d; + + return 1; +} +/* Computes the inotify mask to request from the kernel for inode d, from the masks of all event sources attached to it. */ +static uint32_t inode_data_determine_mask(struct inode_data *d) { + bool excl_unlink = true; + uint32_t combined = 0; + + assert(d); + + /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but + * the IN_EXCL_UNLINK flag is ANDed instead. + * + * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's + * because we cannot change the mask anymore after the event source was created once, since the kernel has no + * API for that. 
Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress + * events we don't care for client-side. */ + + LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) { + + if ((s->inotify.mask & IN_EXCL_UNLINK) == 0) + excl_unlink = false; + + combined |= s->inotify.mask; + } + + return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0); +} + +static int inode_data_realize_watch(sd_event *e, struct inode_data *d) { + uint32_t combined_mask; + int wd, r; + + assert(d); + assert(d->fd >= 0); + + combined_mask = inode_data_determine_mask(d); + + if (d->wd >= 0 && combined_mask == d->combined_mask) + return 0; + + r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL); + if (r < 0) + return r; + + wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask); + if (wd < 0) + return -errno; + + if (d->wd < 0) { + r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d); + if (r < 0) { + (void) inotify_rm_watch(d->inotify_data->fd, wd); + return r; + } + + d->wd = wd; + + } else if (d->wd != wd) { + + log_debug("Weird, the watch descriptor we already knew for this inode changed?"); + (void) inotify_rm_watch(d->fd, wd); + return -EINVAL; + } + + d->combined_mask = combined_mask; + return 1; +} + +static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) { + assert(s); + + return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata)); +} + +static int event_add_inotify_fd_internal( + sd_event *e, + sd_event_source **ret, + int fd, + bool donate, + uint32_t mask, + sd_event_inotify_handler_t callback, + void *userdata) { + + _cleanup_close_ int donated_fd = donate ? 
fd : -EBADF; + _cleanup_(source_freep) sd_event_source *s = NULL; + struct inotify_data *inotify_data = NULL; + struct inode_data *inode_data = NULL; + struct stat st; + int r; + /* donated_fd is declared ahead of these checks, so a donated fd is closed even if one of them bails out */ + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(fd >= 0, -EBADF); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!callback) + callback = inotify_exit_callback; + + /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge + * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence + * the user can't use them for us. */ + if (mask & IN_MASK_ADD) + return -EINVAL; + /* The device and inode numbers identify the inode data object to attach to */ + if (fstat(fd, &st) < 0) + return -errno; + + s = source_new(e, !ret, SOURCE_INOTIFY); + if (!s) + return -ENOMEM; + + s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON; + s->inotify.mask = mask; + s->inotify.callback = callback; + s->userdata = userdata; + + /* Allocate an inotify object for this priority, and an inode object within it */ + r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data); + if (r < 0) + return r; + + r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data); + if (r < 0) { + event_gc_inotify_data(e, inotify_data); + return r; + } + + /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of + * the event source, until then, for which we need the original inode. 
*/ + if (inode_data->fd < 0) { + if (donated_fd >= 0) + inode_data->fd = TAKE_FD(donated_fd); + else { + inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (inode_data->fd < 0) { + r = -errno; + event_gc_inode_data(e, inode_data); + return r; + } + } + + LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data); + } + + /* Link our event source to the inode data object */ + LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s); + s->inotify.inode_data = inode_data; + + /* Actually realize the watch now */ + r = inode_data_realize_watch(e, inode_data); + if (r < 0) + return r; + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} +/* Adds an inotify event source for the inode that fd refers to. The fd remains owned by the caller (donate is false). */ +_public_ int sd_event_add_inotify_fd( + sd_event *e, + sd_event_source **ret, + int fd, + uint32_t mask, + sd_event_inotify_handler_t callback, + void *userdata) { + + return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata); +} +/* Adds an inotify event source for path. The path is opened with O_PATH (plus O_DIRECTORY for IN_ONLYDIR and O_NOFOLLOW for IN_DONT_FOLLOW), the resulting fd is donated to the internal helper, and path becomes the source's description. */ +_public_ int sd_event_add_inotify( + sd_event *e, + sd_event_source **ret, + const char *path, + uint32_t mask, + sd_event_inotify_handler_t callback, + void *userdata) { + + sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */ + int fd, r; + + assert_return(path, -EINVAL); + + fd = open(path, O_PATH | O_CLOEXEC | + (mask & IN_ONLYDIR ? O_DIRECTORY : 0) | + (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0)); + if (fd < 0) + return -errno; + /* NOTE(review): &s is always passed down, so the helper's source_new() sees !ret == false even when our own ret is NULL, and in that case the source is handed to nobody -- confirm ownership for ret == NULL */ + r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, path); + + if (ret) + *ret = s; + + return r; +} +/* Destructor used by the generated unref function below: frees s, or merely disconnects it while it is being dispatched. Always returns NULL. */ +static sd_event_source* event_source_free(sd_event_source *s) { + if (!s) + return NULL; + + /* Here's a special hack: when we are called from a + * dispatch handler we won't free the event source + * immediately, but we will detach the fd from the + * epoll. 
This way it is safe for the caller to unref + * the event source and immediately close the fd, but + * we still retain a valid event source object after + * the callback. */ + + if (s->dispatching) + source_disconnect(s); + else + source_free(s); + + return NULL; +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free); +/* Replaces the description of s with a copy of description; returns whatever free_and_strdup() returns. */ +_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return free_and_strdup(&s->description, description); +} +/* Stores a pointer to the description of s in *description; the string stays owned by s. Returns -ENXIO if no description is set. */ +_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) { + assert_return(s, -EINVAL); + assert_return(description, -EINVAL); + + if (!s->description) + return -ENXIO; + + *description = s->description; + return 0; +} +/* Returns the event loop s is attached to, or NULL on invalid use. */ +_public_ sd_event *sd_event_source_get_event(sd_event_source *s) { + assert_return(s, NULL); + assert_return(!event_origin_changed(s->event), NULL); + + return s->event; +} +/* Returns the pending flag of s. Not available for exit sources (-EDOM). */ +_public_ int sd_event_source_get_pending(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type != SOURCE_EXIT, -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->pending; +} +/* Returns the fd of IO source s, or -EDOM if s is not an IO source. */ +_public_ int sd_event_source_get_io_fd(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->io.fd; +} +/* Switches IO source s over to a different fd. If the source is not offline the new fd is registered before the old one is removed from the epoll, and the old state is restored if registering fails. The old fd is not closed here. */ +_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) { + int r; + + assert_return(s, -EINVAL); + assert_return(fd >= 0, -EBADF); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->io.fd == fd) + return 0; + + if (event_source_is_offline(s)) { + s->io.fd = fd; + s->io.registered = false; + } else { + int saved_fd; + + saved_fd = s->io.fd; + 
assert(s->io.registered); + + s->io.fd = fd; + s->io.registered = false; + + r = source_io_register(s, s->enabled, s->io.events); + if (r < 0) { + s->io.fd = saved_fd; + s->io.registered = true; + return r; + } + /* The new fd is in place; dropping the old one from the epoll is best effort */ + (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL); + } + + return 0; +} +/* Returns the io.owned flag of IO source s. */ +_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->io.owned; +} +/* Sets the io.owned flag of IO source s. */ +_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + s->io.owned = own; + return 0; +} +/* Stores the epoll event mask IO source s is configured for in *events. */ +_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) { + assert_return(s, -EINVAL); + assert_return(events, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *events = s->io.events; + return 0; +} +/* Changes the epoll event mask of IO source s. Only the flags listed in the check below are accepted (-EINVAL otherwise). The pending state is cleared, and the fd is re-registered if the source is online. */ +_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) { + int r; + + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(s->event), -ECHILD); + + /* edge-triggered updates are never skipped, so we can reset edges */ + if (s->io.events == events && !(events & EPOLLET)) + return 0; + + r = source_set_pending(s, false); + if (r < 0) + return r; + + if (event_source_is_online(s)) { + r = source_io_register(s, s->enabled, events); + if (r < 0) + return r; + } + /* The new mask is stored only after a registration that was needed has succeeded */ + s->io.events = events; + + return 0; +} +/* Stores the events last seen on IO source s in *revents. Only available while the source is pending (-ENODATA otherwise). */ +_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) { + assert_return(s, -EINVAL); + assert_return(revents, -EINVAL); + 
assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(s->pending, -ENODATA); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *revents = s->io.revents; + return 0; +} + +_public_ int sd_event_source_get_signal(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_SIGNAL, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->signal.sig; +} + +_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *priority = s->priority; + return 0; +} + +_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) { + bool rm_inotify = false, rm_inode = false; + struct inotify_data *new_inotify_data = NULL; + struct inode_data *new_inode_data = NULL; + int r; + + assert_return(s, -EINVAL); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->priority == priority) + return 0; + + if (s->type == SOURCE_INOTIFY) { + struct inode_data *old_inode_data; + + assert(s->inotify.inode_data); + old_inode_data = s->inotify.inode_data; + + /* We need the original fd to change the priority. If we don't have it we can't change the priority, + * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify + * events we allow priority changes only until the first following iteration. 
*/ + if (old_inode_data->fd < 0) + return -EOPNOTSUPP; + + r = event_make_inotify_data(s->event, priority, &new_inotify_data); + if (r < 0) + return r; + rm_inotify = r > 0; + + r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data); + if (r < 0) + goto fail; + rm_inode = r > 0; + + if (new_inode_data->fd < 0) { + /* Duplicate the fd for the new inode object if we don't have any yet */ + new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3); + if (new_inode_data->fd < 0) { + r = -errno; + goto fail; + } + + LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data); + } + + /* Move the event source to the new inode data structure */ + LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s); + LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s); + s->inotify.inode_data = new_inode_data; + + /* Now create the new watch */ + r = inode_data_realize_watch(s->event, new_inode_data); + if (r < 0) { + /* Move it back */ + LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s); + LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s); + s->inotify.inode_data = old_inode_data; + goto fail; + } + + s->priority = priority; + + event_gc_inode_data(s->event, old_inode_data); + + } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) { + struct signal_data *old, *d; + + /* Move us from the signalfd belonging to the old + * priority to the signalfd of the new priority */ + + assert_se(old = hashmap_get(s->event->signal_data, &s->priority)); + + s->priority = priority; + + r = event_make_signal_data(s->event, s->signal.sig, &d); + if (r < 0) { + s->priority = old->priority; + return r; + } + + event_unmask_signal_data(s->event, old, s->signal.sig); + } else + s->priority = priority; + + event_source_pp_prioq_reshuffle(s); + + if (s->type == SOURCE_EXIT) + prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); + + return 0; 
+ +fail: + if (rm_inode) + event_free_inode_data(s->event, new_inode_data); + + if (rm_inotify) + event_free_inotify_data(s->event, new_inotify_data); + + return r; +} + +_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) { + /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */ + if (!s && !ret) + return false; + + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (ret) + *ret = s->enabled; + + return s->enabled != SD_EVENT_OFF; +} + +static int event_source_offline( + sd_event_source *s, + int enabled, + bool ratelimited) { + + bool was_offline; + int r; + + assert(s); + assert(enabled == SD_EVENT_OFF || ratelimited); + + /* Unset the pending flag when this event source is disabled */ + if (s->enabled != SD_EVENT_OFF && + enabled == SD_EVENT_OFF && + !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) { + r = source_set_pending(s, false); + if (r < 0) + return r; + } + + was_offline = event_source_is_offline(s); + s->enabled = enabled; + s->ratelimited = ratelimited; + + switch (s->type) { + + case SOURCE_IO: + source_io_unregister(s); + break; + + case SOURCE_SIGNAL: + event_gc_signal_data(s->event, &s->priority, s->signal.sig); + break; + + case SOURCE_CHILD: + if (!was_offline) { + assert(s->event->n_online_child_sources > 0); + s->event->n_online_child_sources--; + } + + if (EVENT_SOURCE_WATCH_PIDFD(s)) + source_child_pidfd_unregister(s); + else + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + break; + + case SOURCE_EXIT: + prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); + break; + + case SOURCE_MEMORY_PRESSURE: + source_memory_pressure_unregister(s); + break; + + case SOURCE_TIME_REALTIME: + case SOURCE_TIME_BOOTTIME: + case SOURCE_TIME_MONOTONIC: + case SOURCE_TIME_REALTIME_ALARM: + case SOURCE_TIME_BOOTTIME_ALARM: + case SOURCE_DEFER: + case SOURCE_POST: + case SOURCE_INOTIFY: + break; + + default: + assert_not_reached(); + } + + /* 
Always reshuffle time prioq, as the ratelimited flag may be changed. */ + event_source_time_prioq_reshuffle(s); + + return 1; +} + +static int event_source_online( + sd_event_source *s, + int enabled, + bool ratelimited) { + + bool was_online; + int r; + + assert(s); + assert(enabled != SD_EVENT_OFF || !ratelimited); + + /* Unset the pending flag when this event source is enabled */ + if (s->enabled == SD_EVENT_OFF && + enabled != SD_EVENT_OFF && + !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) { + r = source_set_pending(s, false); + if (r < 0) + return r; + } + + /* Are we really ready for onlining? */ + if (enabled == SD_EVENT_OFF || ratelimited) { + /* Nope, we are not ready for onlining, then just update the precise state and exit */ + s->enabled = enabled; + s->ratelimited = ratelimited; + return 0; + } + + was_online = event_source_is_online(s); + + switch (s->type) { + case SOURCE_IO: + r = source_io_register(s, enabled, s->io.events); + if (r < 0) + return r; + break; + + case SOURCE_SIGNAL: + r = event_make_signal_data(s->event, s->signal.sig, NULL); + if (r < 0) { + event_gc_signal_data(s->event, &s->priority, s->signal.sig); + return r; + } + + break; + + case SOURCE_CHILD: + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* yes, we have pidfd */ + + r = source_child_pidfd_register(s, enabled); + if (r < 0) + return r; + } else { + /* no pidfd, or something other to watch for than WEXITED */ + + r = event_make_signal_data(s->event, SIGCHLD, NULL); + if (r < 0) { + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + return r; + } + } + + if (!was_online) + s->event->n_online_child_sources++; + break; + + case SOURCE_MEMORY_PRESSURE: + r = source_memory_pressure_register(s, enabled); + if (r < 0) + return r; + + break; + + case SOURCE_TIME_REALTIME: + case SOURCE_TIME_BOOTTIME: + case SOURCE_TIME_MONOTONIC: + case SOURCE_TIME_REALTIME_ALARM: + case SOURCE_TIME_BOOTTIME_ALARM: + case SOURCE_EXIT: + case SOURCE_DEFER: + case SOURCE_POST: + case SOURCE_INOTIFY: + 
break; + + default: + assert_not_reached(); + } + + s->enabled = enabled; + s->ratelimited = ratelimited; + + /* Non-failing operations below */ + if (s->type == SOURCE_EXIT) + prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); + + /* Always reshuffle time prioq, as the ratelimited flag may be changed. */ + event_source_time_prioq_reshuffle(s); + + return 1; +} + +_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { + int r; + + assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL); + + /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */ + if (m == SD_EVENT_OFF && !s) + return 0; + + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */ + if (s->event->state == SD_EVENT_FINISHED) + return m == SD_EVENT_OFF ? 0 : -ESTALE; + + if (s->enabled == m) /* No change? */ + return 0; + + if (m == SD_EVENT_OFF) + r = event_source_offline(s, m, s->ratelimited); + else { + if (s->enabled != SD_EVENT_OFF) { + /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the + * event source is already enabled after all. 
*/ + s->enabled = m; + return 0; + } + + r = event_source_online(s, m, s->ratelimited); + } + if (r < 0) + return r; + + event_source_pp_prioq_reshuffle(s); + return 0; +} + +_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) { + assert_return(s, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *usec = s->time.next; + return 0; +} + +_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) { + int r; + + assert_return(s, -EINVAL); + assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(s->event), -ECHILD); + + r = source_set_pending(s, false); + if (r < 0) + return r; + + s->time.next = usec; + + event_source_time_prioq_reshuffle(s); + return 0; +} + +_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) { + usec_t t; + int r; + + assert_return(s, -EINVAL); + assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (usec == USEC_INFINITY) + return sd_event_source_set_time(s, USEC_INFINITY); + + r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t); + if (r < 0) + return r; + + usec = usec_add(t, usec); + if (usec == USEC_INFINITY) + return -EOVERFLOW; + + return sd_event_source_set_time(s, usec); +} + +_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) { + assert_return(s, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *usec = s->time.accuracy; + return 0; +} + +_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) { + int r; + + assert_return(s, -EINVAL); + assert_return(usec != UINT64_MAX, -EINVAL); + assert_return(EVENT_SOURCE_IS_TIME(s->type), 
-EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(s->event), -ECHILD); + + r = source_set_pending(s, false); + if (r < 0) + return r; + + if (usec == 0) + usec = DEFAULT_ACCURACY_USEC; + + s->time.accuracy = usec; + + event_source_time_prioq_reshuffle(s); + return 0; +} + +_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) { + assert_return(s, -EINVAL); + assert_return(clock, -EINVAL); + assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *clock = event_source_type_to_clock(s->type); + return 0; +} + +_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) { + assert_return(s, -EINVAL); + assert_return(pid, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *pid = s->child.pid; + return 0; +} + +_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + return s->child.pidfd; +} + +_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + assert_return(SIGNAL_VALID(sig), -EINVAL); + + /* If we already have seen indication the process exited refuse sending a signal early. This way we + * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not + * available. */ + if (s->child.exited) + return -ESRCH; + + if (s->child.pidfd >= 0) { + siginfo_t copy; + + /* pidfd_send_signal() changes the siginfo_t argument. 
This is weird, let's hence copy the + * structure here */ + if (si) + copy = *si; + + if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) { + /* Let's propagate the error only if the system call is not implemented or prohibited */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else + return 0; + } + + /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse + * this here. */ + if (flags != 0) + return -EOPNOTSUPP; + + if (si) { + /* We use rt_sigqueueinfo() only if siginfo_t is specified. */ + siginfo_t copy = *si; + + if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0) + return -errno; + } else if (kill(s->child.pid, sig) < 0) + return -errno; + + return 0; +} + +_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + return s->child.pidfd_owned; +} + +_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + s->child.pidfd_owned = own; + return 0; +} + +_public_ int sd_event_source_get_child_process_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->child.process_owned; +} + +_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + s->child.process_owned = own; + return 0; +} + +_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) { +
assert_return(s, -EINVAL); + assert_return(mask, -EINVAL); + assert_return(s->type == SOURCE_INOTIFY, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + *mask = s->inotify.mask; + return 0; +} + +_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) { + int r; + + assert_return(s, -EINVAL); + assert_return(s->type != SOURCE_EXIT, -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->prepare == callback) + return 0; + + if (callback && s->prepare) { + s->prepare = callback; + return 0; + } + + r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare); + if (r < 0) + return r; + + s->prepare = callback; + + if (callback) { + r = prioq_put(s->event->prepare, s, &s->prepare_index); + if (r < 0) + return r; + } else + prioq_remove(s->event->prepare, s, &s->prepare_index); + + return 0; +} + +_public_ void* sd_event_source_get_userdata(sd_event_source *s) { + assert_return(s, NULL); + assert_return(!event_origin_changed(s->event), NULL); + + return s->userdata; +} + +_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) { + void *ret; + + assert_return(s, NULL); + assert_return(!event_origin_changed(s->event), NULL); + + ret = s->userdata; + s->userdata = userdata; + + return ret; +} + +static int event_source_enter_ratelimited(sd_event_source *s) { + int r; + + assert(s); + + /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with + * the end of the rate limit time window, much as if it was a timer event source. */ + + if (s->ratelimited) + return 0; /* Already ratelimited, this is a NOP hence */ + + /* Make sure we can install a CLOCK_MONOTONIC event further down. 
*/ + r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC); + if (r < 0) + return r; + + /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's + * first remove them from the prioq appropriate for their own clock, so that we can use the prioq + * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */ + if (EVENT_SOURCE_IS_TIME(s->type)) + event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type)); + + /* Now, let's add the event source to the monotonic clock instead */ + r = event_source_time_prioq_put(s, &s->event->monotonic); + if (r < 0) + goto fail; + + /* And let's take the event source officially offline */ + r = event_source_offline(s, s->enabled, /* ratelimited= */ true); + if (r < 0) { + event_source_time_prioq_remove(s, &s->event->monotonic); + goto fail; + } + + event_source_pp_prioq_reshuffle(s); + + log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description)); + return 0; + +fail: + /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue + * space for it should already be allocated. */ + if (EVENT_SOURCE_IS_TIME(s->type)) + assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0); + + return r; +} + +static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) { + int r; + + assert(s); + + if (!s->ratelimited) + return 0; + + /* Let's take the event source out of the monotonic prioq first. */ + event_source_time_prioq_remove(s, &s->event->monotonic); + + /* Let's then add the event source to its native clock prioq again — if this is a timer event source */ + if (EVENT_SOURCE_IS_TIME(s->type)) { + r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)); + if (r < 0) + goto fail; + } + + /* Let's try to take it online again. 
*/ + r = event_source_online(s, s->enabled, /* ratelimited= */ false); + if (r < 0) { + /* Do something roughly sensible when this failed: undo the two prioq ops above */ + if (EVENT_SOURCE_IS_TIME(s->type)) + event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type)); + + goto fail; + } + + event_source_pp_prioq_reshuffle(s); + ratelimit_reset(&s->rate_limit); + + log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description)); + + if (run_callback && s->ratelimit_expire_callback) { + s->dispatching = true; + r = s->ratelimit_expire_callback(s, s->userdata); + s->dispatching = false; + + if (r < 0) { + log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m", + strna(s->description), + event_source_type_to_string(s->type), + s->exit_on_failure ? "exiting" : "disabling"); + + if (s->exit_on_failure) + (void) sd_event_exit(s->event, r); + } + + if (s->n_ref == 0) + source_free(s); + else if (r < 0) + assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); + + return 1; + } + + return 0; + +fail: + /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode: + * simply put it back in it, maybe we can then process it more successfully next iteration. */ + assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0); + + return r; +} + +static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) { + usec_t c; + assert(e); + assert(a <= b); + + if (a <= 0) + return 0; + if (a >= USEC_INFINITY) + return USEC_INFINITY; + + if (b <= a + 1) + return a; + + initialize_perturb(e); + + /* + Find a good time to wake up again between times a and b. We + have two goals here: + + a) We want to wake up as seldom as possible, hence prefer + later times over earlier times. + + b) But if we have to wake up, then let's make sure to + dispatch as much as possible on the entire system. 
+ + We implement this by waking up everywhere at the same time + within any given minute if we can, synchronised via the + perturbation value determined from the boot ID. If we can't, + then we try to find the same spot in every 10s, then 1s and + then 250ms step. Otherwise, we pick the last possible time + to wake up. + */ + + c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb; + if (c >= b) { + if (_unlikely_(c < USEC_PER_MINUTE)) + return b; + + c -= USEC_PER_MINUTE; + } + + if (c >= a) + return c; + + c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10)); + if (c >= b) { + if (_unlikely_(c < USEC_PER_SEC*10)) + return b; + + c -= USEC_PER_SEC*10; + } + + if (c >= a) + return c; + + c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC); + if (c >= b) { + if (_unlikely_(c < USEC_PER_SEC)) + return b; + + c -= USEC_PER_SEC; + } + + if (c >= a) + return c; + + c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250)); + if (c >= b) { + if (_unlikely_(c < USEC_PER_MSEC*250)) + return b; + + c -= USEC_PER_MSEC*250; + } + + if (c >= a) + return c; + + return b; +} + +static int event_arm_timer( + sd_event *e, + struct clock_data *d) { + + struct itimerspec its = {}; + sd_event_source *a, *b; + usec_t t; + + assert(e); + assert(d); + + if (!d->needs_rearm) + return 0; + + d->needs_rearm = false; + + a = prioq_peek(d->earliest); + assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type)); + if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) { + + if (d->fd < 0) + return 0; + + if (d->next == USEC_INFINITY) + return 0; + + /* disarm */ + if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0) + return -errno; + + d->next = USEC_INFINITY; + return 0; + } + + b = prioq_peek(d->latest); + assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type)); + assert(b && b->enabled != SD_EVENT_OFF); + + t = sleep_between(e, time_event_source_next(a), 
time_event_source_latest(b)); + if (d->next == t) + return 0; + + assert_se(d->fd >= 0); + + if (t == 0) { + /* We don't want to disarm here, just mean some time looooong ago. */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 1; + } else + timespec_store(&its.it_value, t); + + if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0) + return -errno; + + d->next = t; + return 0; +} + +static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) { + assert(e); + assert(s); + assert(s->type == SOURCE_IO); + + /* If the event source was already pending, we just OR in the + * new revents, otherwise we reset the value. The ORing is + * necessary to handle EPOLLONESHOT events properly where + * readability might happen independently of writability, and + * we need to keep track of both */ + + if (s->pending) + s->io.revents |= revents; + else + s->io.revents = revents; + + return source_set_pending(s, true); +} + +static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) { + uint64_t x; + ssize_t ss; + + assert(e); + assert(fd >= 0); + + assert_return(events == EPOLLIN, -EIO); + + ss = read(fd, &x, sizeof(x)); + if (ss < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return -errno; + } + + if (_unlikely_(ss != sizeof(x))) + return -EIO; + + if (next) + *next = USEC_INFINITY; + + return 0; +} + +static int process_timer( + sd_event *e, + usec_t n, + struct clock_data *d) { + + sd_event_source *s; + bool callback_invoked = false; + int r; + + assert(e); + assert(d); + + for (;;) { + s = prioq_peek(d->earliest); + assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type)); + + if (!s || time_event_source_next(s) > n) + break; + + if (s->ratelimited) { + /* This is an event sources whose ratelimit window has ended. Let's turn it on + * again. 
*/ + assert(s->ratelimited); + + r = event_source_leave_ratelimit(s, /* run_callback */ true); + if (r < 0) + return r; + else if (r == 1) + callback_invoked = true; + + continue; + } + + if (s->enabled == SD_EVENT_OFF || s->pending) + break; + + r = source_set_pending(s, true); + if (r < 0) + return r; + + event_source_time_prioq_reshuffle(s); + } + + return callback_invoked; +} + +static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) { + int64_t min_priority = threshold; + bool something_new = false; + sd_event_source *s; + int r; + + assert(e); + assert(ret_min_priority); + + if (!e->need_process_child) { + *ret_min_priority = min_priority; + return 0; + } + + e->need_process_child = false; + + /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait + * for, instead of using P_ALL. This is because we only want to get child information of very + * specific child processes, and not all of them. We might not have processed the SIGCHLD event + * of a previous invocation and we don't want to maintain a unbounded *per-child* event queue, + * hence we really don't want anything flushed out of the kernel's queue that we don't care + * about. Since this is O(n) this means that if you have a lot of processes you probably want + * to handle SIGCHLD yourself. + * + * We do not reap the children here (by using WNOWAIT), this is only done after the event + * source is dispatched so that the callback still sees the process as a zombie. */ + + HASHMAP_FOREACH(s, e->child_sources) { + assert(s->type == SOURCE_CHILD); + + if (s->priority > threshold) + continue; + + if (s->pending) + continue; + + if (event_source_is_offline(s)) + continue; + + if (s->child.exited) + continue; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) + /* There's a usable pidfd known for this event source? 
Then don't waitid() for + * it here */ + continue; + + zero(s->child.siginfo); + if (waitid(P_PID, s->child.pid, &s->child.siginfo, + WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0) + return negative_errno(); + + if (s->child.siginfo.si_pid != 0) { + bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED); + + if (zombie) + s->child.exited = true; + + if (!zombie && (s->child.options & WEXITED)) { + /* If the child isn't dead then let's immediately remove the state + * change from the queue, since there's no benefit in leaving it + * queued. */ + + assert(s->child.options & (WSTOPPED|WCONTINUED)); + (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED))); + } + + r = source_set_pending(s, true); + if (r < 0) + return r; + if (r > 0) { + something_new = true; + min_priority = MIN(min_priority, s->priority); + } + } + } + + *ret_min_priority = min_priority; + return something_new; +} + +static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) { + assert(e); + assert(s); + assert(s->type == SOURCE_CHILD); + + if (s->pending) + return 0; + + if (event_source_is_offline(s)) + return 0; + + if (!EVENT_SOURCE_WATCH_PIDFD(s)) + return 0; + + zero(s->child.siginfo); + if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0) + return -errno; + + if (s->child.siginfo.si_pid == 0) + return 0; + + if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) + s->child.exited = true; + + return source_set_pending(s, true); +} + +static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) { + int r; + + assert(e); + assert(d); + assert_return(events == EPOLLIN, -EIO); + assert(min_priority); + + /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make + * sure to recheck the children we watch. 
This is because we only ever dequeue the first signal + * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know, + * but we might have higher priority children we care about hence we need to check that + * explicitly. */ + + if (sigismember(&d->sigset, SIGCHLD)) + e->need_process_child = true; + + /* If there's already an event source pending for this priority we don't read another */ + if (d->current) + return 0; + + for (;;) { + struct signalfd_siginfo si; + ssize_t n; + sd_event_source *s = NULL; + + n = read(d->fd, &si, sizeof(si)); + if (n < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return -errno; + } + + if (_unlikely_(n != sizeof(si))) + return -EIO; + + assert(SIGNAL_VALID(si.ssi_signo)); + + if (e->signal_sources) + s = e->signal_sources[si.ssi_signo]; + if (!s) + continue; + if (s->pending) + continue; + + s->signal.siginfo = si; + d->current = s; + + r = source_set_pending(s, true); + if (r < 0) + return r; + if (r > 0 && *min_priority >= s->priority) { + *min_priority = s->priority; + return 1; /* an event source with smaller priority is queued. */ + } + + return 0; + } +} + +static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) { + ssize_t n; + + assert(e); + assert(d); + + assert_return(revents == EPOLLIN, -EIO); + + /* If there's already an event source pending for this priority, don't read another */ + if (d->n_pending > 0) + return 0; + + /* Is the read buffer non-empty? 
If so, let's not read more */ + if (d->buffer_filled > 0) + return 0; + + if (d->priority > threshold) + return 0; + + n = read(d->fd, &d->buffer, sizeof(d->buffer)); + if (n < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return -errno; + } + + assert(n > 0); + d->buffer_filled = (size_t) n; + LIST_PREPEND(buffered, e->buffered_inotify_data_list, d); + + return 1; +} + +static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) { + assert(e); + assert(d); + assert(sz <= d->buffer_filled); + + if (sz == 0) + return; + + /* Move the rest to the buffer to the front, in order to get things properly aligned again */ + memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz); + d->buffer_filled -= sz; + + if (d->buffer_filled == 0) + LIST_REMOVE(buffered, e->buffered_inotify_data_list, d); +} + +static int event_inotify_data_process(sd_event *e, struct inotify_data *d) { + int r; + + assert(e); + assert(d); + + /* If there's already an event source pending for this priority, don't read another */ + if (d->n_pending > 0) + return 0; + + while (d->buffer_filled > 0) { + size_t sz; + + /* Let's validate that the event structures are complete */ + if (d->buffer_filled < offsetof(struct inotify_event, name)) + return -EIO; + + sz = offsetof(struct inotify_event, name) + d->buffer.ev.len; + if (d->buffer_filled < sz) + return -EIO; + + if (d->buffer.ev.mask & IN_Q_OVERFLOW) { + struct inode_data *inode_data; + + /* The queue overran, let's pass this event to all event sources connected to this inotify + * object */ + + HASHMAP_FOREACH(inode_data, d->inodes) + LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) { + + if (event_source_is_offline(s)) + continue; + + r = source_set_pending(s, true); + if (r < 0) + return r; + } + } else { + struct inode_data *inode_data; + + /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from + * our watch descriptor table. 
*/ + if (d->buffer.ev.mask & IN_IGNORED) { + + inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd)); + if (!inode_data) { + event_inotify_data_drop(e, d, sz); + continue; + } + + /* The watch descriptor was removed by the kernel, let's drop it here too */ + inode_data->wd = -1; + } else { + inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd)); + if (!inode_data) { + event_inotify_data_drop(e, d, sz); + continue; + } + } + + /* Trigger all event sources that are interested in these events. Also trigger all event + * sources if IN_IGNORED or IN_UNMOUNT is set. */ + LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) { + + if (event_source_is_offline(s)) + continue; + + if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 && + (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0) + continue; + + r = source_set_pending(s, true); + if (r < 0) + return r; + } + } + + /* Something pending now? If so, let's finish, otherwise let's read more. */ + if (d->n_pending > 0) + return 1; + } + + return 0; +} + +static int process_inotify(sd_event *e) { + int r, done = 0; + + assert(e); + + LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) { + r = event_inotify_data_process(e, d); + if (r < 0) + return r; + if (r > 0) + done ++; + } + + return done; +} + +static int process_memory_pressure(sd_event_source *s, uint32_t revents) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (s->pending) + s->memory_pressure.revents |= revents; + else + s->memory_pressure.revents = revents; + + return source_set_pending(s, true); +} + +static int source_memory_pressure_write(sd_event_source *s) { + ssize_t n; + int r; + + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + /* once we start writing, the buffer is locked, we allow no further changes. 
*/ + s->memory_pressure.locked = true; + + if (s->memory_pressure.write_buffer_size > 0) { + n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size); + if (n < 0) { + if (!ERRNO_IS_TRANSIENT(errno)) { + /* If kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI + * files, but then generates EOPNOTSUPP on read() and write() (instead of on + * open()!). This sucks hard, since we can only detect this kind of failure + * so late. Let's make the best of it, and turn off the event source like we + * do for failed event source handlers. Note that this path is taken for any + * non-transient write error, not just for that one. */ + + log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m"); + assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); + return 0; + } + + n = 0; /* transient error: treat it as if nothing was written */ + } + } else + n = 0; + + assert(n >= 0); + + if ((size_t) n == s->memory_pressure.write_buffer_size) { + s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer); + + if (n > 0) { + s->memory_pressure.write_buffer_size = 0; + + /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */ + r = source_memory_pressure_register(s, s->enabled); + if (r < 0) + return r; + } + } else if (n > 0) { /* short write: keep only the not yet written remainder buffered */ + _cleanup_free_ void *c = NULL; + + assert((size_t) n < s->memory_pressure.write_buffer_size); + + c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n); + if (!c) + return -ENOMEM; + + free_and_replace(s->memory_pressure.write_buffer, c); + s->memory_pressure.write_buffer_size -= n; + return 1; /* tell the caller that part of the buffer is still queued */ + } + + return 0; +} + +static int source_memory_pressure_initiate_dispatch(sd_event_source *s) { + int r; + + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + r = source_memory_pressure_write(s); + if (r < 0) + return r; + if (r > 0) + return 1; /* if we wrote something, then don't continue with dispatching user dispatch + * function.
Instead, shortcut it so that we wait for next EPOLLOUT immediately. */ + + /* No pending incoming IO? Then let's not continue further */ + if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) { + + /* Treat IO errors on the notifier the same way as errors returned from a callback */ + if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0) + return -EIO; + + return 1; /* leave dispatch, we already processed everything */ + } + + if (s->memory_pressure.revents & EPOLLIN) { + uint8_t pipe_buf[PIPE_BUF]; + ssize_t n; + + /* If the fd is readable, then flush out anything that might be queued. The data itself is + * discarded, only non-transient read errors are propagated. */ + + n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf)); + if (n < 0 && !ERRNO_IS_TRANSIENT(errno)) + return -errno; + } + + return 0; /* go on, dispatch to user callback */ +} + +static int source_dispatch(sd_event_source *s) { + EventSourceType saved_type; + sd_event *saved_event; + int r = 0; + + assert(s); + assert(s->pending || s->type == SOURCE_EXIT); + + /* Save the event source type, here, so that we still know it after the event callback which might + * invalidate the event. */ + saved_type = s->type; + + /* Similarly, store a reference to the event loop object, so that we can still access it after the + * callback might have invalidated/disconnected the event source. */ + saved_event = s->event; + PROTECT_EVENT(saved_event); + + /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */ + assert(!s->ratelimited); + if (!ratelimit_below(&s->rate_limit)) { + r = event_source_enter_ratelimited(s); + if (r < 0) + return r; + + return 1; + } + + if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) { + r = source_set_pending(s, false); + if (r < 0) + return r; + } + + if (s->type != SOURCE_POST) { + sd_event_source *z; + + /* If we execute a non-post source, let's mark all post sources as pending.
*/ + + SET_FOREACH(z, s->event->post_sources) { + if (event_source_is_offline(z)) + continue; + + r = source_set_pending(z, true); + if (r < 0) + return r; + } + } + + if (s->type == SOURCE_MEMORY_PRESSURE) { + r = source_memory_pressure_initiate_dispatch(s); + if (r == -EIO) /* handle EIO errors similar to callback errors */ + goto finish; + if (r < 0) + return r; + if (r > 0) /* already handled */ + return 1; + } + + if (s->enabled == SD_EVENT_ONESHOT) { + r = sd_event_source_set_enabled(s, SD_EVENT_OFF); + if (r < 0) + return r; + } + + s->dispatching = true; + + switch (s->type) { + + case SOURCE_IO: + r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata); + break; + + case SOURCE_TIME_REALTIME: + case SOURCE_TIME_BOOTTIME: + case SOURCE_TIME_MONOTONIC: + case SOURCE_TIME_REALTIME_ALARM: + case SOURCE_TIME_BOOTTIME_ALARM: + r = s->time.callback(s, s->time.next, s->userdata); + break; + + case SOURCE_SIGNAL: + r = s->signal.callback(s, &s->signal.siginfo, s->userdata); + break; + + case SOURCE_CHILD: { + bool zombie; + + zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED); + + r = s->child.callback(s, &s->child.siginfo, s->userdata); + + /* Now, reap the PID for good. 
*/ + if (zombie) { + (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED); + s->child.waited = true; + } + + break; + } + + case SOURCE_DEFER: + r = s->defer.callback(s, s->userdata); + break; + + case SOURCE_POST: + r = s->post.callback(s, s->userdata); + break; + + case SOURCE_EXIT: + r = s->exit.callback(s, s->userdata); + break; + + case SOURCE_INOTIFY: { + struct sd_event *e = s->event; + struct inotify_data *d; + size_t sz; + + assert(s->inotify.inode_data); + assert_se(d = s->inotify.inode_data->inotify_data); + + assert(d->buffer_filled >= offsetof(struct inotify_event, name)); + sz = offsetof(struct inotify_event, name) + d->buffer.ev.len; + assert(d->buffer_filled >= sz); + + /* If the inotify callback destroys the event source then this likely means we don't need to + * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd + * free it immediately, then we couldn't drop the event from the inotify event queue without + * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it + * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then + * explicitly GC it after we are done dropping the inotify event from the buffer. */ + d->n_busy++; + r = s->inotify.callback(s, &d->buffer.ev, s->userdata); + d->n_busy--; + + /* When no event is pending anymore on this inotify object, then let's drop the event from + * the inotify event queue buffer. */ + if (d->n_pending == 0) + event_inotify_data_drop(e, d, sz); + + /* Now we don't want to access 'd' anymore, it's OK to GC now. 
*/ + event_gc_inotify_data(e, d); + break; + } + + case SOURCE_MEMORY_PRESSURE: + r = s->memory_pressure.callback(s, s->userdata); + break; + + case SOURCE_WATCHDOG: + case _SOURCE_EVENT_SOURCE_TYPE_MAX: + case _SOURCE_EVENT_SOURCE_TYPE_INVALID: + assert_not_reached(); + } + + s->dispatching = false; + +finish: + if (r < 0) { + log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m", + strna(s->description), + event_source_type_to_string(saved_type), + s->exit_on_failure ? "exiting" : "disabling"); + + if (s->exit_on_failure) + (void) sd_event_exit(saved_event, r); + } + + if (s->n_ref == 0) + source_free(s); + else if (r < 0) + assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); + + return 1; +} + +static int event_prepare(sd_event *e) { + int r; + + assert(e); + + for (;;) { + sd_event_source *s; + + s = prioq_peek(e->prepare); + if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s)) + break; + + s->prepare_iteration = e->iteration; + prioq_reshuffle(e->prepare, s, &s->prepare_index); + + assert(s->prepare); + s->dispatching = true; + r = s->prepare(s, s->userdata); + s->dispatching = false; + + if (r < 0) { + log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m", + strna(s->description), + event_source_type_to_string(s->type), + s->exit_on_failure ? 
"exiting" : "disabling"); + + if (s->exit_on_failure) + (void) sd_event_exit(e, r); + } + + if (s->n_ref == 0) + source_free(s); + else if (r < 0) + assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); + } + + return 0; +} + +static int dispatch_exit(sd_event *e) { + sd_event_source *p; + int r; + + assert(e); + + p = prioq_peek(e->exit); + assert(!p || p->type == SOURCE_EXIT); + + if (!p || event_source_is_offline(p)) { + e->state = SD_EVENT_FINISHED; + return 0; + } + + PROTECT_EVENT(e); + e->iteration++; + e->state = SD_EVENT_EXITING; + r = source_dispatch(p); + e->state = SD_EVENT_INITIAL; + return r; +} + +static sd_event_source* event_next_pending(sd_event *e) { + sd_event_source *p; + + assert(e); + + p = prioq_peek(e->pending); + if (!p) + return NULL; + + if (event_source_is_offline(p)) + return NULL; + + return p; +} + +static int arm_watchdog(sd_event *e) { + struct itimerspec its = {}; + usec_t t; + + assert(e); + assert(e->watchdog_fd >= 0); + + t = sleep_between(e, + usec_add(e->watchdog_last, (e->watchdog_period / 2)), + usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4))); + + timespec_store(&its.it_value, t); + + /* Make sure we never set the watchdog to 0, which tells the + * kernel to disable it. */ + if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0) + its.it_value.tv_nsec = 1; + + return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL)); +} + +static int process_watchdog(sd_event *e) { + assert(e); + + if (!e->watchdog) + return 0; + + /* Don't notify watchdog too often */ + if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic) + return 0; + + sd_notify(false, "WATCHDOG=1"); + e->watchdog_last = e->timestamp.monotonic; + + return arm_watchdog(e); +} + +static void event_close_inode_data_fds(sd_event *e) { + struct inode_data *d; + + assert(e); + + /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin + * filesystems. 
But we can't close them right-away as we need them as long as the user still wants to make + * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch + * for the inode). Hence, let's close them when entering the first iteration after they were added, as a + * compromise. */ + + while ((d = e->inode_data_to_close_list)) { + assert(d->fd >= 0); + d->fd = safe_close(d->fd); + + LIST_REMOVE(to_close, e->inode_data_to_close_list, d); + } +} + +static int event_memory_pressure_write_list(sd_event *e) { + int r; + + assert(e); + + for (;;) { + sd_event_source *s; + + s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list); + if (!s) + break; + + assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(s->memory_pressure.write_buffer_size > 0); + s->memory_pressure.in_write_list = false; + + r = source_memory_pressure_write(s); + if (r < 0) + return r; + } + + return 0; +} + +_public_ int sd_event_prepare(sd_event *e) { + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(e->state == SD_EVENT_INITIAL, -EBUSY); + + /* Let's check that if we are a default event loop we are executed in the correct thread. 
We only do + * this check here once, since gettid() is typically not cached, and thus want to minimize + * syscalls */ + assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO); + + /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */ + PROTECT_EVENT(e); + + if (e->exit_requested) + goto pending; + + e->iteration++; + + e->state = SD_EVENT_PREPARING; + r = event_prepare(e); + e->state = SD_EVENT_INITIAL; + if (r < 0) + return r; + + r = event_memory_pressure_write_list(e); + if (r < 0) + return r; + + r = event_arm_timer(e, &e->realtime); + if (r < 0) + return r; + + r = event_arm_timer(e, &e->boottime); + if (r < 0) + return r; + + r = event_arm_timer(e, &e->monotonic); + if (r < 0) + return r; + + r = event_arm_timer(e, &e->realtime_alarm); + if (r < 0) + return r; + + r = event_arm_timer(e, &e->boottime_alarm); + if (r < 0) + return r; + + event_close_inode_data_fds(e); + + if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list) + goto pending; + + e->state = SD_EVENT_ARMED; + + return 0; + +pending: + e->state = SD_EVENT_ARMED; + r = sd_event_wait(e, 0); + if (r == 0) + e->state = SD_EVENT_ARMED; + + return r; +} + +static int epoll_wait_usec( + int fd, + struct epoll_event *events, + int maxevents, + usec_t timeout) { + + int msec; + /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. The + * timeout is specified in usec, USEC_INFINITY means to wait without time limit. Returns what + * the underlying call returned on success, or a negative errno-style error on failure. */ + +#if HAVE_EPOLL_PWAIT2 + static bool epoll_pwait2_absent = false; + int r; + + /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast + * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this + * is not that obvious to implement given the libc and kernel definitions differ in the last + * argument.
Moreover, the only reason to use it is the more accurate time-outs (which is not a + * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's + * missing. */ + + if (!epoll_pwait2_absent && timeout != USEC_INFINITY) { + r = epoll_pwait2(fd, + events, + maxevents, + TIMESPEC_STORE(timeout), + NULL); + if (r >= 0) + return r; + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not + * supported. */ + + epoll_pwait2_absent = true; + } +#endif + + if (timeout == USEC_INFINITY) + msec = -1; + else { + usec_t k; + + k = DIV_ROUND_UP(timeout, USEC_PER_MSEC); + if (k >= INT_MAX) + msec = INT_MAX; /* Saturate */ + else + msec = (int) k; + } + + return RET_NERRNO(epoll_wait(fd, events, maxevents, msec)); +} + +static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) { + size_t n_event_queue, m, n_event_max; + int64_t min_priority = threshold; + bool something_new = false; + int r; + + assert(e); + assert(ret_min_priority); + + n_event_queue = MAX(e->n_sources, 1u); + if (!GREEDY_REALLOC(e->event_queue, n_event_queue)) + return -ENOMEM; + + n_event_max = MALLOC_ELEMENTSOF(e->event_queue); + + /* If we still have inotify data buffered, then query the other fds, but don't wait on it */ + if (e->buffered_inotify_data_list) + timeout = 0; + + for (;;) { + r = epoll_wait_usec( + e->epoll_fd, + e->event_queue, + n_event_max, + timeout); + if (r < 0) + return r; + + m = (size_t) r; + + if (m < n_event_max) /* the queue was not filled up entirely, no need to enlarge it */ + break; + + if (n_event_max >= n_event_queue * 10) /* don't grow the queue beyond ten times its initial size */ + break; + + if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue)) + return -ENOMEM; + + n_event_max = MALLOC_ELEMENTSOF(e->event_queue); + timeout = 0; /* we already waited once, retry with the larger queue using a zero timeout */ + } + + /* Set timestamp only when this is called first time.
*/ + if (threshold == INT64_MAX) + triple_timestamp_now(&e->timestamp); + + for (size_t i = 0; i < m; i++) { + + if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG)) + r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL); + else { + WakeupType *t = e->event_queue[i].data.ptr; + + switch (*t) { + + case WAKEUP_EVENT_SOURCE: { + sd_event_source *s = e->event_queue[i].data.ptr; + + assert(s); + + if (s->priority > threshold) + continue; + + min_priority = MIN(min_priority, s->priority); + + switch (s->type) { + + case SOURCE_IO: + r = process_io(e, s, e->event_queue[i].events); + break; + + case SOURCE_CHILD: + r = process_pidfd(e, s, e->event_queue[i].events); + break; + + case SOURCE_MEMORY_PRESSURE: + r = process_memory_pressure(s, e->event_queue[i].events); + break; + + default: + assert_not_reached(); + } + + break; + } + + case WAKEUP_CLOCK_DATA: { + struct clock_data *d = e->event_queue[i].data.ptr; + + assert(d); + + r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next); + break; + } + + case WAKEUP_SIGNAL_DATA: + r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority); + break; + + case WAKEUP_INOTIFY_DATA: + r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold); + break; + + default: + assert_not_reached(); + } + } + if (r < 0) + return r; + if (r > 0) + something_new = true; + } + + *ret_min_priority = min_priority; + return something_new; +} + +_public_ int sd_event_wait(sd_event *e, uint64_t timeout) { + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(e->state == SD_EVENT_ARMED, -EBUSY); + + if (e->exit_requested) { + e->state = SD_EVENT_PENDING; + return 1; + } + + for (int64_t threshold = INT64_MAX; ; threshold--) { + int64_t epoll_min_priority, child_min_priority; + + /* There may be a 
possibility that new epoll (especially IO) and child events are + * triggered just after process_epoll() call but before process_child(), and the new IO + * events may have higher priority than the child events. To salvage these events, + * let's call epoll_wait() again, but accepts only events with higher priority than the + * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments + * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085 + * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */ + + r = process_epoll(e, timeout, threshold, &epoll_min_priority); + if (r == -EINTR) { + e->state = SD_EVENT_PENDING; + return 1; + } + if (r < 0) + goto finish; + if (r == 0 && threshold < INT64_MAX) + /* No new epoll event. */ + break; + + r = process_child(e, threshold, &child_min_priority); + if (r < 0) + goto finish; + if (r == 0) + /* No new child event. */ + break; + + threshold = MIN(epoll_min_priority, child_min_priority); + if (threshold == INT64_MIN) + break; + + timeout = 0; + } + + r = process_watchdog(e); + if (r < 0) + goto finish; + + r = process_inotify(e); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.realtime, &e->realtime); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.boottime, &e->boottime); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.monotonic, &e->monotonic); + if (r < 0) + goto finish; + else if (r == 1) { + /* Ratelimit expiry callback was called. Let's postpone processing pending sources and + * put loop in the initial state in order to evaluate (in the next iteration) also sources + * there were potentially re-enabled by the callback. + * + * Wondering why we treat only this invocation of process_timer() differently? 
Once event + * source is ratelimited we essentially transform it into CLOCK_MONOTONIC timer hence + * ratelimit expiry callback is never called for any other timer type. */ + r = 0; + goto finish; + } + + if (event_next_pending(e)) { + e->state = SD_EVENT_PENDING; + return 1; + } + + r = 0; + +finish: + e->state = SD_EVENT_INITIAL; + + return r; +} + +_public_ int sd_event_dispatch(sd_event *e) { + sd_event_source *p; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(e->state == SD_EVENT_PENDING, -EBUSY); + + if (e->exit_requested) + return dispatch_exit(e); + + p = event_next_pending(e); + if (p) { + PROTECT_EVENT(e); + + e->state = SD_EVENT_RUNNING; + r = source_dispatch(p); + e->state = SD_EVENT_INITIAL; + return r; + } + + e->state = SD_EVENT_INITIAL; + + return 1; +} + +static void event_log_delays(sd_event *e) { + char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p; + size_t l, i; + + p = b; + l = sizeof(b); + for (i = 0; i < ELEMENTSOF(e->delays); i++) { + l = strpcpyf(&p, l, "%u ", e->delays[i]); + e->delays[i] = 0; + } + log_debug("Event loop iterations: %s", b); +} + +_public_ int sd_event_run(sd_event *e, uint64_t timeout) { + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(e->state == SD_EVENT_INITIAL, -EBUSY); + + if (e->profile_delays && e->last_run_usec != 0) { + usec_t this_run; + unsigned l; + + this_run = now(CLOCK_MONOTONIC); + + l = log2u64(this_run - e->last_run_usec); + assert(l < ELEMENTSOF(e->delays)); + e->delays[l]++; + + if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) { + event_log_delays(e); + e->last_log_usec = this_run; + } + } + + /* Make sure that none of the preparation callbacks ends up freeing 
the event source under our feet */ + PROTECT_EVENT(e); + + r = sd_event_prepare(e); + if (r == 0) + /* There was nothing? Then wait... */ + r = sd_event_wait(e, timeout); + + if (e->profile_delays) + e->last_run_usec = now(CLOCK_MONOTONIC); + + if (r > 0) { + /* There's something now, then let's dispatch it */ + r = sd_event_dispatch(e); + if (r < 0) + return r; + + return 1; + } + + return r; +} + +_public_ int sd_event_loop(sd_event *e) { + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + assert_return(e->state == SD_EVENT_INITIAL, -EBUSY); + + + PROTECT_EVENT(e); + + while (e->state != SD_EVENT_FINISHED) { + r = sd_event_run(e, UINT64_MAX); + if (r < 0) + return r; + } + + return e->exit_code; +} + +_public_ int sd_event_get_fd(sd_event *e) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + + return e->epoll_fd; +} + +_public_ int sd_event_get_state(sd_event *e) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + + return e->state; +} + +_public_ int sd_event_get_exit_code(sd_event *e, int *code) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(code, -EINVAL); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!e->exit_requested) + return -ENODATA; + + *code = e->exit_code; + return 0; +} + +_public_ int sd_event_exit(sd_event *e, int code) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_origin_changed(e), -ECHILD); + + e->exit_requested = true; + e->exit_code = code; + + return 0; +} + +_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + 
assert_return(usec, -EINVAL); + assert_return(!event_origin_changed(e), -ECHILD); + + if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock)) + return -EOPNOTSUPP; + + if (!triple_timestamp_is_set(&e->timestamp)) { + /* Implicitly fall back to now() if we never ran before and thus have no cached time. */ + *usec = now(clock); + return 1; + } + + *usec = triple_timestamp_by_clock(&e->timestamp, clock); + return 0; +} + +_public_ int sd_event_default(sd_event **ret) { + sd_event *e = NULL; + int r; + + if (!ret) + return !!default_event; + + if (default_event) { + *ret = sd_event_ref(default_event); + return 0; + } + + r = sd_event_new(&e); + if (r < 0) + return r; + + e->default_event_ptr = &default_event; + e->tid = gettid(); + default_event = e; + + *ret = e; + return 1; +} + +_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(tid, -EINVAL); + assert_return(!event_origin_changed(e), -ECHILD); + + if (e->tid != 0) { + *tid = e->tid; + return 0; + } + + return -ENXIO; +} + +_public_ int sd_event_set_watchdog(sd_event *e, int b) { + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + + if (e->watchdog == !!b) + return e->watchdog; + + if (b) { + r = sd_watchdog_enabled(false, &e->watchdog_period); + if (r <= 0) + return r; + + /* Issue first ping immediately */ + sd_notify(false, "WATCHDOG=1"); + e->watchdog_last = now(CLOCK_MONOTONIC); + + e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC); + if (e->watchdog_fd < 0) + return -errno; + + r = arm_watchdog(e); + if (r < 0) + goto fail; + + struct epoll_event ev = { + .events = EPOLLIN, + .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG), + }; + + if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) { + r = -errno; + goto fail; + } + + } else { + if (e->watchdog_fd >= 0) { + (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, 
e->watchdog_fd, NULL); + e->watchdog_fd = safe_close(e->watchdog_fd); + } + } + + e->watchdog = !!b; + return e->watchdog; + +fail: + e->watchdog_fd = safe_close(e->watchdog_fd); + return r; +} + +_public_ int sd_event_get_watchdog(sd_event *e) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_origin_changed(e), -ECHILD); + + return e->watchdog; +} + +_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) { + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + /* Refuse a NULL output pointer instead of crashing on it, the same way sd_event_get_exit_code(), + * sd_event_now() and sd_event_get_tid() validate theirs. */ + assert_return(ret, -EINVAL); + assert_return(!event_origin_changed(e), -ECHILD); + + /* Hand out the loop's current iteration counter */ + *ret = e->iteration; + return 0; +} + +_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) { + assert_return(s, -EINVAL); + assert_return(s->event, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + s->destroy_callback = callback; + return 0; +} + +_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (ret) + *ret = s->destroy_callback; + + return !!s->destroy_callback; +} + +_public_ int sd_event_source_get_floating(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->floating; +} + +_public_ int sd_event_source_set_floating(sd_event_source *s, int b) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->floating == !!b) + return 0; + + if (!s->event) /* Already disconnected */ + return -ESTALE; + + s->floating = b; + + if (b) { + sd_event_source_ref(s); + sd_event_unref(s->event); + } else { + sd_event_ref(s->event); + sd_event_source_unref(s); + } + + return 1; +} + +_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type != SOURCE_EXIT, -EDOM); +
assert_return(!event_origin_changed(s->event), -ECHILD); + + return s->exit_on_failure; +} + +_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) { + assert_return(s, -EINVAL); + assert_return(s->type != SOURCE_EXIT, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (s->exit_on_failure == !!b) + return 0; + + s->exit_on_failure = b; + return 1; +} + +_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) { + int r; + + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing + * so is a programming error. */ + assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM); + + /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh, + * non-ratelimited. */ + r = event_source_leave_ratelimit(s, /* run_callback */ false); + if (r < 0) + return r; + + s->rate_limit = (RateLimit) { interval, burst }; + return 0; +} + +_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + s->ratelimit_expire_callback = callback; + return 0; +} + +_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence + * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. 
*/ + if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type)) + return -EDOM; + + if (!ratelimit_configured(&s->rate_limit)) + return -ENOEXEC; + + if (ret_interval) + *ret_interval = s->rate_limit.interval; + if (ret_burst) + *ret_burst = s->rate_limit.burst; + + return 0; +} + +_public_ int sd_event_source_is_ratelimited(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type)) + return false; + + if (!ratelimit_configured(&s->rate_limit)) + return false; + + return s->ratelimited; +} + +_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) { + int r; + + assert_return(s, -EINVAL); + + if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type)) + return 0; + + if (!ratelimit_configured(&s->rate_limit)) + return 0; + + if (!s->ratelimited) + return 0; + + r = event_source_leave_ratelimit(s, /* run_callback */ false); + if (r < 0) + return r; + + return 1; /* tell caller that we indeed just left the ratelimit state */ +} + +_public_ int sd_event_set_signal_exit(sd_event *e, int b) { + bool change = false; + int r; + + assert_return(e, -EINVAL); + + if (b) { + /* We want to maintain pointers to these event sources, so that we can destroy them when told + * so. But we also don't want them to pin the event loop itself. Hence we mark them as + * floating after creation (and undo this before deleting them again). 
*/ + + if (!e->sigint_event_source) { + r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL); + if (r < 0) + return r; + + /* Toggling the floating state has side effects, hence check it with assert_se() (as is done + * for other must-succeed calls in this file) rather than with plain assert(). */ + assert_se(sd_event_source_set_floating(e->sigint_event_source, true) >= 0); + change = true; + } + + if (!e->sigterm_event_source) { + r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL); + if (r < 0) { + /* Roll back the SIGINT source, but only if it was us who just created it */ + if (change) { + assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0); + e->sigint_event_source = sd_event_source_unref(e->sigint_event_source); + } + + return r; + } + + assert_se(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0); + change = true; + } + + } else { + if (e->sigint_event_source) { + assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0); + e->sigint_event_source = sd_event_source_unref(e->sigint_event_source); + change = true; + } + + if (e->sigterm_event_source) { + assert_se(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0); + e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source); + change = true; + } + } + + return change; +} + +_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) { + _cleanup_free_ char *b = NULL; + _cleanup_free_ void *w = NULL; + + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + assert_return(ty, -EINVAL); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (!STR_IN_SET(ty, "some", "full")) + return -EINVAL; + + if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ + return -EBUSY; + + char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size); + if (!space) + return -EINVAL; + + size_t l = (char*) space - (char*) s->memory_pressure.write_buffer; + b = memdup_suffix0(s->memory_pressure.write_buffer, l); + if (!b) + return -ENOMEM; + if (!STR_IN_SET(b,
"some", "full")) + return -EINVAL; + + if (streq(b, ty)) + return 0; + + size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l); + w = new(char, nl); + if (!w) + return -ENOMEM; + + memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l)); + + free_and_replace(s->memory_pressure.write_buffer, w); + s->memory_pressure.write_buffer_size = nl; + s->memory_pressure.locked = false; + + return 1; +} + +_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { + _cleanup_free_ char *b = NULL; + _cleanup_free_ void *w = NULL; + + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + assert_return(!event_origin_changed(s->event), -ECHILD); + + if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX) + return -ERANGE; + if (window_usec <= 0 || window_usec >= UINT64_MAX) + return -ERANGE; + if (threshold_usec > window_usec) + return -EINVAL; + + if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ + return -EBUSY; + + char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size); + if (!space) + return -EINVAL; + + size_t l = (char*) space - (char*) s->memory_pressure.write_buffer; + b = memdup_suffix0(s->memory_pressure.write_buffer, l); + if (!b) + return -ENOMEM; + if (!STR_IN_SET(b, "some", "full")) + return -EINVAL; + + /* The arguments were validated above, hence report a failing asprintf() as allocation failure + * rather than as invalid argument. */ + if (asprintf((char**) &w, + "%s " USEC_FMT " " USEC_FMT "", + b, + threshold_usec, + window_usec) < 0) + return -ENOMEM; + + l = strlen(w) + 1; + if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0) + return 0; + + free_and_replace(s->memory_pressure.write_buffer, w); + s->memory_pressure.write_buffer_size = l; + s->memory_pressure.locked = false; + + return 1; +} diff --git a/src/libsystemd/sd-event/test-event.c b/src/libsystemd/sd-event/test-event.c new file mode 100644 index 0000000..63d3ee7
--- /dev/null +++ b/src/libsystemd/sd-event/test-event.c @@ -0,0 +1,902 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-event.h" + +#include "alloc-util.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "missing_syscall.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "signal-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static int prepare_handler(sd_event_source *s, void *userdata) { + log_info("preparing %c", PTR_TO_INT(userdata)); + return 1; +} + +static bool got_a, got_b, got_c, got_unref; +static unsigned got_d; + +static int unref_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + sd_event_source_unref(s); + got_unref = true; + return 0; +} + +static int io_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + + log_info("got IO on %c", PTR_TO_INT(userdata)); + + if (userdata == INT_TO_PTR('a')) { + assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); + assert_se(!got_a); + got_a = true; + } else if (userdata == INT_TO_PTR('b')) { + assert_se(!got_b); + got_b = true; + } else if (userdata == INT_TO_PTR('d')) { + got_d++; + if (got_d < 2) + assert_se(sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) >= 0); + else + assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0); + } else + assert_not_reached(); + + return 1; +} + +static int child_handler(sd_event_source *s, const siginfo_t *si, void *userdata) { + + assert_se(s); + assert_se(si); + + assert_se(si->si_uid == getuid()); + assert_se(si->si_signo == SIGCHLD); + assert_se(si->si_code == CLD_EXITED); + assert_se(si->si_status == 78); + + log_info("got child on %c", PTR_TO_INT(userdata)); + + assert_se(userdata == INT_TO_PTR('f')); + + assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 
0); + sd_event_source_unref(s); + + return 1; +} + +static int signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + sd_event_source *p = NULL; + pid_t pid; + siginfo_t plain_si; + + assert_se(s); + assert_se(si); + + log_info("got signal on %c", PTR_TO_INT(userdata)); + + assert_se(userdata == INT_TO_PTR('e')); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGUSR2, -1) >= 0); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + sigset_t ss; + + assert_se(sigemptyset(&ss) >= 0); + assert_se(sigaddset(&ss, SIGUSR2) >= 0); + + zero(plain_si); + assert_se(sigwaitinfo(&ss, &plain_si) >= 0); + + assert_se(plain_si.si_signo == SIGUSR2); + assert_se(plain_si.si_value.sival_int == 4711); + + _exit(78); + } + + assert_se(sd_event_add_child(sd_event_source_get_event(s), &p, pid, WEXITED, child_handler, INT_TO_PTR('f')) >= 0); + assert_se(sd_event_source_set_enabled(p, SD_EVENT_ONESHOT) >= 0); + assert_se(sd_event_source_set_child_process_own(p, true) >= 0); + + /* We can't use structured initialization here, since the structure contains various unions and these + * fields lie in overlapping (carefully aligned) unions that LLVM is allergic to allow assignments + * to */ + zero(plain_si); + plain_si.si_signo = SIGUSR2; + plain_si.si_code = SI_QUEUE; + plain_si.si_pid = getpid_cached(); + plain_si.si_uid = getuid(); + plain_si.si_value.sival_int = 4711; + + assert_se(sd_event_source_send_child_signal(p, SIGUSR2, &plain_si, 0) >= 0); + + sd_event_source_unref(s); + + return 1; +} + +static int defer_handler(sd_event_source *s, void *userdata) { + sd_event_source *p = NULL; + + assert_se(s); + + log_info("got defer on %c", PTR_TO_INT(userdata)); + + assert_se(userdata == INT_TO_PTR('d')); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGUSR1, -1) >= 0); + + assert_se(sd_event_add_signal(sd_event_source_get_event(s), &p, SIGUSR1, signal_handler, INT_TO_PTR('e')) >= 0); + assert_se(sd_event_source_set_enabled(p, 
SD_EVENT_ONESHOT) >= 0); + raise(SIGUSR1); + + sd_event_source_unref(s); + + return 1; +} + +static bool do_quit; + +static int time_handler(sd_event_source *s, uint64_t usec, void *userdata) { + log_info("got timer on %c", PTR_TO_INT(userdata)); + + if (userdata == INT_TO_PTR('c')) { + + if (do_quit) { + sd_event_source *p; + + assert_se(sd_event_add_defer(sd_event_source_get_event(s), &p, defer_handler, INT_TO_PTR('d')) >= 0); + assert_se(sd_event_source_set_enabled(p, SD_EVENT_ONESHOT) >= 0); + } else { + assert_se(!got_c); + got_c = true; + } + } else + assert_not_reached(); + + return 2; +} + +static bool got_exit = false; + +static int exit_handler(sd_event_source *s, void *userdata) { + log_info("got quit handler on %c", PTR_TO_INT(userdata)); + + got_exit = true; + + return 3; +} + +static bool got_post = false; + +static int post_handler(sd_event_source *s, void *userdata) { + log_info("got post handler"); + + got_post = true; + + return 2; +} + +static void test_basic_one(bool with_pidfd) { + sd_event *e = NULL; + sd_event_source *w = NULL, *x = NULL, *y = NULL, *z = NULL, *q = NULL, *t = NULL; + static const char ch = 'x'; + int a[2] = EBADF_PAIR, b[2] = EBADF_PAIR, + d[2] = EBADF_PAIR, k[2] = EBADF_PAIR; + uint64_t event_now; + int64_t priority; + + log_info("/* %s(pidfd=%s) */", __func__, yes_no(with_pidfd)); + + assert_se(setenv("SYSTEMD_PIDFD", yes_no(with_pidfd), 1) >= 0); + + assert_se(pipe(a) >= 0); + assert_se(pipe(b) >= 0); + assert_se(pipe(d) >= 0); + assert_se(pipe(k) >= 0); + + assert_se(sd_event_default(&e) >= 0); + assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) > 0); + + assert_se(sd_event_set_watchdog(e, true) >= 0); + + /* Test whether we cleanly can destroy an io event source from its own handler */ + got_unref = false; + assert_se(sd_event_add_io(e, &t, k[0], EPOLLIN, unref_handler, NULL) >= 0); + assert_se(write(k[1], &ch, 1) == 1); + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(got_unref); + + got_a = false, got_b 
= false, got_c = false, got_d = 0; + + /* Add a oneshot handler, trigger it, reenable it, and trigger it again. */ + assert_se(sd_event_add_io(e, &w, d[0], EPOLLIN, io_handler, INT_TO_PTR('d')) >= 0); + assert_se(sd_event_source_set_enabled(w, SD_EVENT_ONESHOT) >= 0); + assert_se(write(d[1], &ch, 1) >= 0); + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(got_d == 1); + assert_se(write(d[1], &ch, 1) >= 0); + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(got_d == 2); + + assert_se(sd_event_add_io(e, &x, a[0], EPOLLIN, io_handler, INT_TO_PTR('a')) >= 0); + assert_se(sd_event_add_io(e, &y, b[0], EPOLLIN, io_handler, INT_TO_PTR('b')) >= 0); + + do_quit = false; + assert_se(sd_event_add_time(e, &z, CLOCK_MONOTONIC, 0, 0, time_handler, INT_TO_PTR('c')) >= 0); + assert_se(sd_event_add_exit(e, &q, exit_handler, INT_TO_PTR('g')) >= 0); + + assert_se(sd_event_source_set_priority(x, 99) >= 0); + assert_se(sd_event_source_get_priority(x, &priority) >= 0); + assert_se(priority == 99); + assert_se(sd_event_source_set_enabled(y, SD_EVENT_ONESHOT) >= 0); + assert_se(sd_event_source_set_prepare(x, prepare_handler) >= 0); + assert_se(sd_event_source_set_priority(z, 50) >= 0); + assert_se(sd_event_source_set_enabled(z, SD_EVENT_ONESHOT) >= 0); + assert_se(sd_event_source_set_prepare(z, prepare_handler) >= 0); + + /* Test for floating event sources */ + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGRTMIN+1, -1) >= 0); + assert_se(sd_event_add_signal(e, NULL, SIGRTMIN+1, NULL, NULL) >= 0); + + assert_se(write(a[1], &ch, 1) >= 0); + assert_se(write(b[1], &ch, 1) >= 0); + + assert_se(!got_a && !got_b && !got_c); + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + + assert_se(!got_a && got_b && !got_c); + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + + assert_se(!got_a && got_b && got_c); + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + + assert_se(got_a && got_b && got_c); + + sd_event_source_unref(x); + sd_event_source_unref(y); + + do_quit = true; + 
assert_se(sd_event_add_post(e, NULL, post_handler, NULL) >= 0); + assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) == 0); + assert_se(sd_event_source_set_time(z, event_now + 200 * USEC_PER_MSEC) >= 0); + assert_se(sd_event_source_set_enabled(z, SD_EVENT_ONESHOT) >= 0); + + assert_se(sd_event_loop(e) >= 0); + assert_se(got_post); + assert_se(got_exit); + + sd_event_source_unref(z); + sd_event_source_unref(q); + + sd_event_source_unref(w); + + sd_event_unref(e); + + safe_close_pair(a); + safe_close_pair(b); + safe_close_pair(d); + safe_close_pair(k); + + assert_se(unsetenv("SYSTEMD_PIDFD") >= 0); +} + +TEST(basic) { + test_basic_one(true); /* test with pidfd */ + test_basic_one(false); /* test without pidfd */ +} + +TEST(sd_event_now) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + uint64_t event_now; + + assert_se(sd_event_new(&e) >= 0); + assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) > 0); + assert_se(sd_event_now(e, CLOCK_REALTIME, &event_now) > 0); + assert_se(sd_event_now(e, CLOCK_REALTIME_ALARM, &event_now) > 0); + assert_se(sd_event_now(e, CLOCK_BOOTTIME, &event_now) > 0); + assert_se(sd_event_now(e, CLOCK_BOOTTIME_ALARM, &event_now) > 0); + assert_se(sd_event_now(e, -1, &event_now) == -EOPNOTSUPP); + assert_se(sd_event_now(e, 900 /* arbitrary big number */, &event_now) == -EOPNOTSUPP); + + assert_se(sd_event_run(e, 0) == 0); + + assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) == 0); + assert_se(sd_event_now(e, CLOCK_REALTIME, &event_now) == 0); + assert_se(sd_event_now(e, CLOCK_REALTIME_ALARM, &event_now) == 0); + assert_se(sd_event_now(e, CLOCK_BOOTTIME, &event_now) == 0); + assert_se(sd_event_now(e, CLOCK_BOOTTIME_ALARM, &event_now) == 0); + assert_se(sd_event_now(e, -1, &event_now) == -EOPNOTSUPP); + assert_se(sd_event_now(e, 900 /* arbitrary big number */, &event_now) == -EOPNOTSUPP); +} + +static int last_rtqueue_sigval = 0; +static int n_rtqueue = 0; + +static int rtqueue_handler(sd_event_source *s, const struct 
signalfd_siginfo *si, void *userdata) { + last_rtqueue_sigval = si->ssi_int; + n_rtqueue++; + return 0; +} + +TEST(rtqueue) { + sd_event_source *u = NULL, *v = NULL, *s = NULL; + sd_event *e = NULL; + + assert_se(sd_event_default(&e) >= 0); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGRTMIN+2, SIGRTMIN+3, SIGUSR2, -1) >= 0); + assert_se(sd_event_add_signal(e, &u, SIGRTMIN+2, rtqueue_handler, NULL) >= 0); + assert_se(sd_event_add_signal(e, &v, SIGRTMIN+3, rtqueue_handler, NULL) >= 0); + assert_se(sd_event_add_signal(e, &s, SIGUSR2, rtqueue_handler, NULL) >= 0); + + assert_se(sd_event_source_set_priority(v, -10) >= 0); + + assert_se(sigqueue(getpid_cached(), SIGRTMIN+2, (union sigval) { .sival_int = 1 }) >= 0); + assert_se(sigqueue(getpid_cached(), SIGRTMIN+3, (union sigval) { .sival_int = 2 }) >= 0); + assert_se(sigqueue(getpid_cached(), SIGUSR2, (union sigval) { .sival_int = 3 }) >= 0); + assert_se(sigqueue(getpid_cached(), SIGRTMIN+3, (union sigval) { .sival_int = 4 }) >= 0); + assert_se(sigqueue(getpid_cached(), SIGUSR2, (union sigval) { .sival_int = 5 }) >= 0); + + assert_se(n_rtqueue == 0); + assert_se(last_rtqueue_sigval == 0); + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(n_rtqueue == 1); + assert_se(last_rtqueue_sigval == 2); /* first SIGRTMIN+3 */ + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(n_rtqueue == 2); + assert_se(last_rtqueue_sigval == 4); /* second SIGRTMIN+3 */ + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(n_rtqueue == 3); + assert_se(last_rtqueue_sigval == 3); /* first SIGUSR2 */ + + assert_se(sd_event_run(e, UINT64_MAX) >= 1); + assert_se(n_rtqueue == 4); + assert_se(last_rtqueue_sigval == 1); /* SIGRTMIN+2 */ + + assert_se(sd_event_run(e, 0) == 0); /* the other SIGUSR2 is dropped, because the first one was still queued */ + assert_se(n_rtqueue == 4); + assert_se(last_rtqueue_sigval == 1); + + sd_event_source_unref(u); + sd_event_source_unref(v); + sd_event_source_unref(s); + + 
sd_event_unref(e); +} + +#define CREATE_EVENTS_MAX (70000U) + +struct inotify_context { + bool delete_self_handler_called; + unsigned create_called[CREATE_EVENTS_MAX]; + unsigned create_overflow; + unsigned n_create_events; +}; + +static void maybe_exit(sd_event_source *s, struct inotify_context *c) { + unsigned n; + + assert_se(s); + assert_se(c); + + if (!c->delete_self_handler_called) + return; + + for (n = 0; n < 3; n++) { + unsigned i; + + if (c->create_overflow & (1U << n)) + continue; + + for (i = 0; i < c->n_create_events; i++) + if (!(c->create_called[i] & (1U << n))) + return; + } + + sd_event_exit(sd_event_source_get_event(s), 0); +} + +static int inotify_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) { + struct inotify_context *c = userdata; + const char *description; + unsigned bit, n; + + assert_se(sd_event_source_get_description(s, &description) >= 0); + assert_se(safe_atou(description, &n) >= 0); + + assert_se(n <= 3); + bit = 1U << n; + + if (ev->mask & IN_Q_OVERFLOW) { + log_info("inotify-handler <%s>: overflow", description); + c->create_overflow |= bit; + } else if (ev->mask & IN_CREATE) { + if (streq(ev->name, "sub")) + log_debug("inotify-handler <%s>: create on %s", description, ev->name); + else { + unsigned i; + + assert_se(safe_atou(ev->name, &i) >= 0); + assert_se(i < c->n_create_events); + c->create_called[i] |= bit; + } + } else if (ev->mask & IN_DELETE) { + log_info("inotify-handler <%s>: delete of %s", description, ev->name); + assert_se(streq(ev->name, "sub")); + } else + assert_not_reached(); + + maybe_exit(s, c); + return 1; +} + +static int delete_self_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) { + struct inotify_context *c = userdata; + + if (ev->mask & IN_Q_OVERFLOW) { + log_info("delete-self-handler: overflow"); + c->delete_self_handler_called = true; + } else if (ev->mask & IN_DELETE_SELF) { + log_info("delete-self-handler: delete-self"); + 
c->delete_self_handler_called = true; + } else if (ev->mask & IN_IGNORED) { + log_info("delete-self-handler: ignore"); + } else + assert_not_reached(); + + maybe_exit(s, c); + return 1; +} + +static void test_inotify_one(unsigned n_create_events) { + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + sd_event_source *a = NULL, *b = NULL, *c = NULL, *d = NULL; + struct inotify_context context = { + .n_create_events = n_create_events, + }; + sd_event *e = NULL; + const char *q; + unsigned i; + + log_info("/* %s(%u) */", __func__, n_create_events); + + assert_se(sd_event_default(&e) >= 0); + + assert_se(mkdtemp_malloc("/tmp/test-inotify-XXXXXX", &p) >= 0); + + assert_se(sd_event_add_inotify(e, &a, p, IN_CREATE|IN_ONLYDIR, inotify_handler, &context) >= 0); + assert_se(sd_event_add_inotify(e, &b, p, IN_CREATE|IN_DELETE|IN_DONT_FOLLOW, inotify_handler, &context) >= 0); + assert_se(sd_event_source_set_priority(b, SD_EVENT_PRIORITY_IDLE) >= 0); + assert_se(sd_event_source_set_priority(b, SD_EVENT_PRIORITY_NORMAL) >= 0); + assert_se(sd_event_add_inotify(e, &c, p, IN_CREATE|IN_DELETE|IN_EXCL_UNLINK, inotify_handler, &context) >= 0); + assert_se(sd_event_source_set_priority(c, SD_EVENT_PRIORITY_IDLE) >= 0); + + assert_se(sd_event_source_set_description(a, "0") >= 0); + assert_se(sd_event_source_set_description(b, "1") >= 0); + assert_se(sd_event_source_set_description(c, "2") >= 0); + + q = strjoina(p, "/sub"); + assert_se(touch(q) >= 0); + assert_se(sd_event_add_inotify(e, &d, q, IN_DELETE_SELF, delete_self_handler, &context) >= 0); + + for (i = 0; i < n_create_events; i++) { + char buf[DECIMAL_STR_MAX(unsigned)+1]; + _cleanup_free_ char *z = NULL; + + xsprintf(buf, "%u", i); + assert_se(z = path_join(p, buf)); + + assert_se(touch(z) >= 0); + } + + assert_se(unlink(q) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + sd_event_source_unref(a); + sd_event_source_unref(b); + sd_event_source_unref(c); + sd_event_source_unref(d); + + sd_event_unref(e); +} + +TEST(inotify) { + 
test_inotify_one(100); /* should work without overflow */ + test_inotify_one(33000); /* should trigger a q overflow */ +} + +static int pidfd_handler(sd_event_source *s, const siginfo_t *si, void *userdata) { + assert_se(s); + assert_se(si); + + assert_se(si->si_uid == getuid()); + assert_se(si->si_signo == SIGCHLD); + assert_se(si->si_code == CLD_EXITED); + assert_se(si->si_status == 66); + + log_info("got pidfd on %c", PTR_TO_INT(userdata)); + + assert_se(userdata == INT_TO_PTR('p')); + + assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0); + sd_event_source_unref(s); + + return 0; +} + +TEST(pidfd) { + sd_event_source *s = NULL, *t = NULL; + sd_event *e = NULL; + int pidfd; + pid_t pid, pid2; + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + + pid = fork(); + if (pid == 0) + /* child */ + _exit(66); + + assert_se(pid > 1); + + pidfd = pidfd_open(pid, 0); + if (pidfd < 0) { + /* No pidfd_open() supported or blocked? */ + assert_se(ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)); + (void) wait_for_terminate(pid, NULL); + return; + } + + pid2 = fork(); + if (pid2 == 0) + freeze(); + + assert_se(pid > 2); + + assert_se(sd_event_default(&e) >= 0); + assert_se(sd_event_add_child_pidfd(e, &s, pidfd, WEXITED, pidfd_handler, INT_TO_PTR('p')) >= 0); + assert_se(sd_event_source_set_child_pidfd_own(s, true) >= 0); + + /* This one should never trigger, since our second child lives forever */ + assert_se(sd_event_add_child(e, &t, pid2, WEXITED, pidfd_handler, INT_TO_PTR('q')) >= 0); + assert_se(sd_event_source_set_child_process_own(t, true) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + /* Child should still be alive */ + assert_se(kill(pid2, 0) >= 0); + + t = sd_event_source_unref(t); + + /* Child should now be dead, since we dropped the ref */ + assert_se(kill(pid2, 0) < 0 && errno == ESRCH); + + sd_event_unref(e); +} + +static int ratelimit_io_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + unsigned *c 
= (unsigned*) userdata; + *c += 1; + return 0; +} + +static int ratelimit_time_handler(sd_event_source *s, uint64_t usec, void *userdata) { + int r; + + r = sd_event_source_set_enabled(s, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to turn on notify event source: %m"); + + r = sd_event_source_set_time(s, usec + 1000); + if (r < 0) + log_error_errno(r, "Failed to restart watchdog event source: %m"); + + unsigned *c = (unsigned*) userdata; + *c += 1; + + return 0; +} + +static int expired = -1; +static int ratelimit_expired(sd_event_source *s, void *userdata) { + return ++expired; +} + +TEST(ratelimit) { + _cleanup_close_pair_ int p[2] = EBADF_PAIR; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + uint64_t interval; + unsigned count, burst; + + assert_se(sd_event_default(&e) >= 0); + assert_se(pipe2(p, O_CLOEXEC|O_NONBLOCK) >= 0); + + assert_se(sd_event_add_io(e, &s, p[0], EPOLLIN, ratelimit_io_handler, &count) >= 0); + assert_se(sd_event_source_set_description(s, "test-ratelimit-io") >= 0); + assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 5) >= 0); + assert_se(sd_event_source_get_ratelimit(s, &interval, &burst) >= 0); + assert_se(interval == 1 * USEC_PER_SEC && burst == 5); + + assert_se(write(p[1], "1", 1) == 1); + + count = 0; + for (unsigned i = 0; i < 10; i++) { + log_debug("slow loop iteration %u", i); + assert_se(sd_event_run(e, UINT64_MAX) >= 0); + assert_se(usleep_safe(250 * USEC_PER_MSEC) >= 0); + } + + assert_se(sd_event_source_is_ratelimited(s) == 0); + assert_se(count == 10); + log_info("ratelimit_io_handler: called %u times, event source not ratelimited", count); + + assert_se(sd_event_source_set_ratelimit(s, 0, 0) >= 0); + assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 5) >= 0); + + count = 0; + for (unsigned i = 0; i < 10; i++) { + log_debug("fast event loop iteration %u", i); + assert_se(sd_event_run(e, UINT64_MAX) >= 0); + 
assert_se(usleep_safe(10) >= 0); + } + log_info("ratelimit_io_handler: called %u times, event source got ratelimited", count); + assert_se(count < 10); + + s = sd_event_source_unref(s); + safe_close_pair(p); + + count = 0; + assert_se(sd_event_add_time_relative(e, &s, CLOCK_MONOTONIC, 1000, 1, ratelimit_time_handler, &count) >= 0); + assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10) == 0); + + do { + assert_se(sd_event_run(e, UINT64_MAX) >= 0); + } while (!sd_event_source_is_ratelimited(s)); + + log_info("ratelimit_time_handler: called %u times, event source got ratelimited", count); + assert_se(count == 10); + + /* In order to get rid of active rate limit client needs to disable it explicitly */ + assert_se(sd_event_source_set_ratelimit(s, 0, 0) >= 0); + assert_se(!sd_event_source_is_ratelimited(s)); + + assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10) >= 0); + + /* Set callback that will be invoked when we leave rate limited state. */ + assert_se(sd_event_source_set_ratelimit_expire_callback(s, ratelimit_expired) >= 0); + + do { + assert_se(sd_event_run(e, UINT64_MAX) >= 0); + } while (!sd_event_source_is_ratelimited(s)); + + log_info("ratelimit_time_handler: called 10 more times, event source got ratelimited"); + assert_se(count == 20); + + /* Dispatch the event loop once more and check that ratelimit expiration callback got called */ + assert_se(sd_event_run(e, UINT64_MAX) >= 0); + assert_se(expired == 0); +} + +TEST(simple_timeout) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + usec_t f, t, some_time; + + some_time = random_u64_range(2 * USEC_PER_SEC); + + assert_se(sd_event_default(&e) >= 0); + + assert_se(sd_event_prepare(e) == 0); + + f = now(CLOCK_MONOTONIC); + assert_se(sd_event_wait(e, some_time) >= 0); + t = now(CLOCK_MONOTONIC); + + /* The event loop may sleep longer than the specified time (timer accuracy, scheduling latencies, …), + * but never shorter. Let's check that. 
*/ + assert_se(t >= usec_add(f, some_time)); +} + +static int inotify_self_destroy_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) { + sd_event_source **p = userdata; + + assert_se(ev); + assert_se(p); + assert_se(*p == s); + + assert_se(FLAGS_SET(ev->mask, IN_ATTRIB)); + + assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0); + + *p = sd_event_source_unref(*p); /* here's what we actually intend to test: we destroy the event + * source from inside the event source handler */ + return 1; +} + +TEST(inotify_self_destroy) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + char path[] = "/tmp/inotifyXXXXXX"; + _cleanup_close_ int fd = -EBADF; + + /* Tests that destroying an inotify event source from its own handler is safe */ + + assert_se(sd_event_default(&e) >= 0); + + fd = mkostemp_safe(path); + assert_se(fd >= 0); + assert_se(sd_event_add_inotify_fd(e, &s, fd, IN_ATTRIB, inotify_self_destroy_handler, &s) >= 0); + fd = safe_close(fd); + assert_se(unlink(path) >= 0); /* This will trigger IN_ATTRIB because link count goes to zero */ + assert_se(sd_event_loop(e) >= 0); +} + +struct inotify_process_buffered_data_context { + const char *path[2]; + unsigned i; +}; + +static int inotify_process_buffered_data_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) { + struct inotify_process_buffered_data_context *c = ASSERT_PTR(userdata); + const char *description; + + assert_se(sd_event_source_get_description(s, &description) >= 0); + + assert_se(c->i < 2); + assert_se(streq(c->path[c->i], description)); + c->i++; + + return 1; +} + +TEST(inotify_process_buffered_data) { + _cleanup_(rm_rf_physical_and_freep) char *p = NULL, *q = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *a = NULL, *b = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *z = NULL; + + /* For issue #23826 */ + + assert_se(sd_event_default(&e) >= 
0); + + assert_se(mkdtemp_malloc("/tmp/test-inotify-XXXXXX", &p) >= 0); + assert_se(mkdtemp_malloc("/tmp/test-inotify-XXXXXX", &q) >= 0); + + struct inotify_process_buffered_data_context context = { + .path = { p, q }, + }; + + assert_se(sd_event_add_inotify(e, &a, p, IN_CREATE, inotify_process_buffered_data_handler, &context) >= 0); + assert_se(sd_event_add_inotify(e, &b, q, IN_CREATE, inotify_process_buffered_data_handler, &context) >= 0); + + assert_se(z = path_join(p, "aaa")); + assert_se(touch(z) >= 0); + z = mfree(z); + assert_se(z = path_join(q, "bbb")); + assert_se(touch(z) >= 0); + z = mfree(z); + + assert_se(sd_event_run(e, 10 * USEC_PER_SEC) > 0); + assert_se(sd_event_prepare(e) > 0); /* issue #23826: this was 0. */ + assert_se(sd_event_dispatch(e) > 0); + assert_se(sd_event_prepare(e) == 0); + assert_se(sd_event_wait(e, 0) == 0); +} + +TEST(fork) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + int r; + + assert_se(sd_event_default(&e) >= 0); + assert_se(sd_event_prepare(e) == 0); + + /* Check that after a fork the cleanup functions return NULL */ + r = safe_fork("(bus-fork-test)", FORK_WAIT|FORK_LOG, NULL); + if (r == 0) { + assert_se(e); + assert_se(sd_event_ref(e) == NULL); + assert_se(sd_event_unref(e) == NULL); + _exit(EXIT_SUCCESS); + } + + assert_se(r >= 0); +} + +static int hup_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + unsigned *c = userdata; + + assert_se(revents == EPOLLHUP); + + (*c)++; + return 0; +} + +TEST(leave_ratelimit) { + bool expect_ratelimit = false, manually_left_ratelimit = false; + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_close_pair_ int pfd[2] = EBADF_PAIR; + unsigned c = 0; + int r; + + assert_se(sd_event_default(&e) >= 0); + + /* Create an event source that will continuously fire by creating a pipe whose write side is closed, + * and which hence will only see EOF and constant EPOLLHUP */ + assert_se(pipe2(pfd, 
O_CLOEXEC) >= 0); + assert_se(sd_event_add_io(e, &s, pfd[0], EPOLLIN, hup_callback, &c) >= 0); + assert_se(sd_event_source_set_io_fd_own(s, true) >= 0); + assert_se(sd_event_source_set_ratelimit(s, 5*USEC_PER_MINUTE, 5) >= 0); + + pfd[0] = -EBADF; + pfd[1] = safe_close(pfd[1]); /* Trigger continuous EOF */ + + for (;;) { + r = sd_event_prepare(e); + assert_se(r >= 0); + + if (r == 0) { + r = sd_event_wait(e, UINT64_MAX); + assert_se(r > 0); + } + + r = sd_event_dispatch(e); + assert_se(r > 0); + + r = sd_event_source_is_ratelimited(s); + assert_se(r >= 0); + + if (c < 5) + /* First four dispatches should just work */ + assert_se(!r); + else if (c == 5) { + /* The fifth dispatch should still work, but we now expect the ratelimit to be hit subsequently */ + if (!expect_ratelimit) { + assert_se(!r); + assert_se(sd_event_source_leave_ratelimit(s) == 0); /* this should be a NOP, and return 0 hence */ + expect_ratelimit = true; + } else { + /* We expected the ratelimit, let's leave it manually, and verify it */ + assert_se(r); + assert_se(sd_event_source_leave_ratelimit(s) > 0); /* we are ratelimited, hence should return > 0 */ + assert_se(sd_event_source_is_ratelimited(s) == 0); + + manually_left_ratelimit = true; + } + + } else if (c == 6) + /* On the sixth iteration let's just exit */ + break; + } + + /* Verify we definitely hit the ratelimit and left it manually again */ + assert_se(manually_left_ratelimit); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/libsystemd/sd-hwdb/hwdb-internal.h b/src/libsystemd/sd-hwdb/hwdb-internal.h new file mode 100644 index 0000000..9db3b31 --- /dev/null +++ b/src/libsystemd/sd-hwdb/hwdb-internal.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "constants.h" +#include "hashmap.h" +#include "sparse-endian.h" + +#define HWDB_SIG { 'K', 'S', 'L', 'P', 'H', 'H', 'R', 'H' } + +struct sd_hwdb { + unsigned n_ref; + + FILE *f; + struct stat st; + union { + struct 
trie_header_f *head; + const char *map; + }; + + OrderedHashmap *properties; + Iterator properties_iterator; + bool properties_modified; +}; + +/* on-disk trie objects */ +struct trie_header_f { + uint8_t signature[8]; + + /* version of tool which created the file */ + le64_t tool_version; + le64_t file_size; + + /* size of structures to allow them to grow */ + le64_t header_size; + le64_t node_size; + le64_t child_entry_size; + le64_t value_entry_size; + + /* offset of the root trie node */ + le64_t nodes_root_off; + + /* size of the nodes and string section */ + le64_t nodes_len; + le64_t strings_len; +} _packed_; + +struct trie_node_f { + /* prefix of lookup string, shared by all children */ + le64_t prefix_off; + /* size of children entry array appended to the node */ + uint8_t children_count; + uint8_t padding[7]; + /* size of value entry array appended to the node */ + le64_t values_count; +} _packed_; + +/* array of child entries, follows directly the node record */ +struct trie_child_entry_f { + /* index of the child node */ + uint8_t c; + uint8_t padding[7]; + /* offset of the child node */ + le64_t child_off; +} _packed_; + +/* array of value entries, follows directly the node record/child array */ +struct trie_value_entry_f { + le64_t key_off; + le64_t value_off; +} _packed_; + +/* v2 extends v1 with filename and line-number */ +struct trie_value_entry2_f { + le64_t key_off; + le64_t value_off; + le64_t filename_off; + le32_t line_number; + le16_t file_priority; + le16_t padding; +} _packed_; + +#define hwdb_bin_paths \ + "/etc/systemd/hwdb/hwdb.bin\0" \ + "/etc/udev/hwdb.bin\0" \ + "/usr/lib/systemd/hwdb/hwdb.bin\0" \ + UDEVLIBEXECDIR "/hwdb.bin\0" diff --git a/src/libsystemd/sd-hwdb/sd-hwdb.c b/src/libsystemd/sd-hwdb/sd-hwdb.c new file mode 100644 index 0000000..f163314 --- /dev/null +++ b/src/libsystemd/sd-hwdb/sd-hwdb.c @@ -0,0 +1,436 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2008 Alan Jenkins +***/ + +#include 
+#include +#include +#include +#include +#include +#include + +#include "sd-hwdb.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "hwdb-internal.h" +#include "nulstr-util.h" +#include "string-util.h" +#include "time-util.h" + +struct linebuf { + char bytes[LINE_MAX]; + size_t size; + size_t len; +}; + +static void linebuf_init(struct linebuf *buf) { + buf->size = 0; + buf->len = 0; +} + +static const char *linebuf_get(struct linebuf *buf) { + if (buf->len + 1 >= sizeof(buf->bytes)) + return NULL; + buf->bytes[buf->len] = '\0'; + return buf->bytes; +} + +static bool linebuf_add(struct linebuf *buf, const char *s, size_t len) { + if (buf->len + len >= sizeof(buf->bytes)) + return false; + memcpy(buf->bytes + buf->len, s, len); + buf->len += len; + return true; +} + +static bool linebuf_add_char(struct linebuf *buf, char c) { + if (buf->len + 1 >= sizeof(buf->bytes)) + return false; + buf->bytes[buf->len++] = c; + return true; +} + +static void linebuf_rem(struct linebuf *buf, size_t count) { + assert(buf->len >= count); + buf->len -= count; +} + +static void linebuf_rem_char(struct linebuf *buf) { + linebuf_rem(buf, 1); +} + +static const struct trie_child_entry_f *trie_node_child(sd_hwdb *hwdb, const struct trie_node_f *node, size_t idx) { + const char *base = (const char *)node; + + base += le64toh(hwdb->head->node_size); + base += idx * le64toh(hwdb->head->child_entry_size); + return (const struct trie_child_entry_f *)base; +} + +static const struct trie_value_entry_f *trie_node_value(sd_hwdb *hwdb, const struct trie_node_f *node, size_t idx) { + const char *base = (const char *)node; + + base += le64toh(hwdb->head->node_size); + base += node->children_count * le64toh(hwdb->head->child_entry_size); + base += idx * le64toh(hwdb->head->value_entry_size); + return (const struct trie_value_entry_f *)base; +} + +static const struct trie_node_f *trie_node_from_off(sd_hwdb *hwdb, le64_t off) { + return (const struct 
trie_node_f *)(hwdb->map + le64toh(off)); +} + +static const char *trie_string(sd_hwdb *hwdb, le64_t off) { + return hwdb->map + le64toh(off); +} + +static int trie_children_cmp_f(const void *v1, const void *v2) { + const struct trie_child_entry_f *n1 = v1; + const struct trie_child_entry_f *n2 = v2; + + return n1->c - n2->c; +} + +static const struct trie_node_f *node_lookup_f(sd_hwdb *hwdb, const struct trie_node_f *node, uint8_t c) { + struct trie_child_entry_f *child; + struct trie_child_entry_f search; + + search.c = c; + child = bsearch(&search, (const char *)node + le64toh(hwdb->head->node_size), node->children_count, + le64toh(hwdb->head->child_entry_size), trie_children_cmp_f); + if (child) + return trie_node_from_off(hwdb, child->child_off); + return NULL; +} + +static int hwdb_add_property(sd_hwdb *hwdb, const struct trie_value_entry_f *entry) { + const char *key; + int r; + + assert(hwdb); + + key = trie_string(hwdb, entry->key_off); + + /* + * Silently ignore all properties which do not start with a + * space; future extensions might use additional prefixes. + */ + if (key[0] != ' ') + return 0; + + key++; + + if (le64toh(hwdb->head->value_entry_size) >= sizeof(struct trie_value_entry2_f)) { + const struct trie_value_entry2_f *old, *entry2; + + entry2 = (const struct trie_value_entry2_f *)entry; + old = ordered_hashmap_get(hwdb->properties, key); + if (old) { + /* On duplicates, we order by filename priority and line-number. + * + * v2 of the format had 64 bits for the line number. + * v3 reuses top 32 bits of line_number to store the priority. + * We check the top bits — if they are zero we have v2 format. + * This means that v2 clients will print wrong line numbers with + * v3 data. + * + * For v3 data: we compare the priority (of the source file) + * and the line number. 
+ * + * For v2 data: we rely on the fact that the filenames in the hwdb + * are added in the order of priority (higher later), because they + * are *processed* in the order of priority. So we compare the + * indices to determine which file had higher priority. Comparing + * the strings alphabetically would be useless, because those are + * full paths, and e.g. /usr/lib would sort after /etc, even + * though it has lower priority. This is not reliable because of + * suffix compression, but should work for the most common case of + * /usr/lib/udev/hwbd.d and /etc/udev/hwdb.d, and is better than + * not doing the comparison at all. + */ + bool lower; + + if (entry2->file_priority == 0) + lower = entry2->filename_off < old->filename_off || + (entry2->filename_off == old->filename_off && entry2->line_number < old->line_number); + else + lower = entry2->file_priority < old->file_priority || + (entry2->file_priority == old->file_priority && entry2->line_number < old->line_number); + if (lower) + return 0; + } + } + + r = ordered_hashmap_ensure_allocated(&hwdb->properties, &string_hash_ops); + if (r < 0) + return r; + + r = ordered_hashmap_replace(hwdb->properties, key, (void *)entry); + if (r < 0) + return r; + + hwdb->properties_modified = true; + + return 0; +} + +static int trie_fnmatch_f(sd_hwdb *hwdb, const struct trie_node_f *node, size_t p, + struct linebuf *buf, const char *search) { + size_t len; + size_t i; + const char *prefix; + int err; + + prefix = trie_string(hwdb, node->prefix_off); + len = strlen(prefix + p); + linebuf_add(buf, prefix + p, len); + + for (i = 0; i < node->children_count; i++) { + const struct trie_child_entry_f *child = trie_node_child(hwdb, node, i); + + linebuf_add_char(buf, child->c); + err = trie_fnmatch_f(hwdb, trie_node_from_off(hwdb, child->child_off), 0, buf, search); + if (err < 0) + return err; + linebuf_rem_char(buf); + } + + if (le64toh(node->values_count) && fnmatch(linebuf_get(buf), search, 0) == 0) + for (i = 0; i < 
le64toh(node->values_count); i++) { + err = hwdb_add_property(hwdb, trie_node_value(hwdb, node, i)); + if (err < 0) + return err; + } + + linebuf_rem(buf, len); + return 0; +} + +static int trie_search_f(sd_hwdb *hwdb, const char *search) { + struct linebuf buf; + const struct trie_node_f *node; + size_t i = 0; + int err; + + linebuf_init(&buf); + + node = trie_node_from_off(hwdb, hwdb->head->nodes_root_off); + while (node) { + const struct trie_node_f *child; + size_t p = 0; + + if (node->prefix_off) { + char c; + + for (; (c = trie_string(hwdb, node->prefix_off)[p]); p++) { + if (IN_SET(c, '*', '?', '[')) + return trie_fnmatch_f(hwdb, node, p, &buf, search + i + p); + if (c != search[i + p]) + return 0; + } + i += p; + } + + child = node_lookup_f(hwdb, node, '*'); + if (child) { + linebuf_add_char(&buf, '*'); + err = trie_fnmatch_f(hwdb, child, 0, &buf, search + i); + if (err < 0) + return err; + linebuf_rem_char(&buf); + } + + child = node_lookup_f(hwdb, node, '?'); + if (child) { + linebuf_add_char(&buf, '?'); + err = trie_fnmatch_f(hwdb, child, 0, &buf, search + i); + if (err < 0) + return err; + linebuf_rem_char(&buf); + } + + child = node_lookup_f(hwdb, node, '['); + if (child) { + linebuf_add_char(&buf, '['); + err = trie_fnmatch_f(hwdb, child, 0, &buf, search + i); + if (err < 0) + return err; + linebuf_rem_char(&buf); + } + + if (search[i] == '\0') { + size_t n; + + for (n = 0; n < le64toh(node->values_count); n++) { + err = hwdb_add_property(hwdb, trie_node_value(hwdb, node, n)); + if (err < 0) + return err; + } + return 0; + } + + child = node_lookup_f(hwdb, node, search[i]); + node = child; + i++; + } + return 0; +} + +static int hwdb_new(const char *path, sd_hwdb **ret) { + _cleanup_(sd_hwdb_unrefp) sd_hwdb *hwdb = NULL; + const char sig[] = HWDB_SIG; + + assert_return(ret, -EINVAL); + + hwdb = new0(sd_hwdb, 1); + if (!hwdb) + return -ENOMEM; + + hwdb->n_ref = 1; + + /* Find hwdb.bin in the explicit path if provided, or iterate over hwdb_bin_paths 
otherwise */ + if (!isempty(path)) { + log_debug("Trying to open \"%s\"...", path); + hwdb->f = fopen(path, "re"); + if (!hwdb->f) + return log_debug_errno(errno, "Failed to open %s: %m", path); + } else { + NULSTR_FOREACH(p, hwdb_bin_paths) { + log_debug("Trying to open \"%s\"...", p); + hwdb->f = fopen(p, "re"); + if (hwdb->f) { + path = p; + break; + } + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to open %s: %m", p); + } + + if (!hwdb->f) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "hwdb.bin does not exist, please run 'systemd-hwdb update'"); + } + + if (fstat(fileno(hwdb->f), &hwdb->st) < 0) + return log_debug_errno(errno, "Failed to stat %s: %m", path); + if (hwdb->st.st_size < (off_t) offsetof(struct trie_header_f, strings_len) + 8) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "File %s is too short: %m", path); + if (file_offset_beyond_memory_size(hwdb->st.st_size)) + return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "File %s is too long: %m", path); + + hwdb->map = mmap(0, hwdb->st.st_size, PROT_READ, MAP_SHARED, fileno(hwdb->f), 0); + if (hwdb->map == MAP_FAILED) + return log_debug_errno(errno, "Failed to map %s: %m", path); + + if (memcmp(hwdb->map, sig, sizeof(hwdb->head->signature)) != 0 || + (size_t) hwdb->st.st_size != le64toh(hwdb->head->file_size)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to recognize the format of %s", path); + + log_debug("=== trie on-disk ==="); + log_debug("tool version: %"PRIu64, le64toh(hwdb->head->tool_version)); + log_debug("file size: %8"PRIi64" bytes", hwdb->st.st_size); + log_debug("header size %8"PRIu64" bytes", le64toh(hwdb->head->header_size)); + log_debug("strings %8"PRIu64" bytes", le64toh(hwdb->head->strings_len)); + log_debug("nodes %8"PRIu64" bytes", le64toh(hwdb->head->nodes_len)); + + *ret = TAKE_PTR(hwdb); + + return 0; +} + +_public_ int sd_hwdb_new_from_path(const char *path, sd_hwdb **ret) { + assert_return(!isempty(path), -EINVAL); + + return hwdb_new(path, ret); 
+} + +_public_ int sd_hwdb_new(sd_hwdb **ret) { + return hwdb_new(NULL, ret); +} + +static sd_hwdb *hwdb_free(sd_hwdb *hwdb) { + assert(hwdb); + + if (hwdb->map) + munmap((void *)hwdb->map, hwdb->st.st_size); + safe_fclose(hwdb->f); + ordered_hashmap_free(hwdb->properties); + return mfree(hwdb); +} + +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_hwdb, sd_hwdb, hwdb_free) + +static int properties_prepare(sd_hwdb *hwdb, const char *modalias) { + assert(hwdb); + assert(modalias); + + ordered_hashmap_clear(hwdb->properties); + hwdb->properties_modified = true; + + return trie_search_f(hwdb, modalias); +} + +_public_ int sd_hwdb_get(sd_hwdb *hwdb, const char *modalias, const char *key, const char **_value) { + const struct trie_value_entry_f *entry; + int r; + + assert_return(hwdb, -EINVAL); + assert_return(hwdb->f, -EINVAL); + assert_return(modalias, -EINVAL); + assert_return(_value, -EINVAL); + + r = properties_prepare(hwdb, modalias); + if (r < 0) + return r; + + entry = ordered_hashmap_get(hwdb->properties, key); + if (!entry) + return -ENOENT; + + *_value = trie_string(hwdb, entry->value_off); + + return 0; +} + +_public_ int sd_hwdb_seek(sd_hwdb *hwdb, const char *modalias) { + int r; + + assert_return(hwdb, -EINVAL); + assert_return(hwdb->f, -EINVAL); + assert_return(modalias, -EINVAL); + + r = properties_prepare(hwdb, modalias); + if (r < 0) + return r; + + hwdb->properties_modified = false; + hwdb->properties_iterator = ITERATOR_FIRST; + + return 0; +} + +_public_ int sd_hwdb_enumerate(sd_hwdb *hwdb, const char **key, const char **value) { + const struct trie_value_entry_f *entry; + const void *k; + + assert_return(hwdb, -EINVAL); + assert_return(key, -EINVAL); + assert_return(value, -EINVAL); + + if (hwdb->properties_modified) + return -EAGAIN; + + if (!ordered_hashmap_iterate(hwdb->properties, &hwdb->properties_iterator, (void **)&entry, &k)) + return 0; + + *key = k; + *value = trie_string(hwdb, entry->value_off); + + return 1; +} diff --git 
a/src/libsystemd/sd-id128/id128-util.c b/src/libsystemd/sd-id128/id128-util.c new file mode 100644 index 0000000..94bfd70 --- /dev/null +++ b/src/libsystemd/sd-id128/id128-util.c @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "fd-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "id128-util.h" +#include "io-util.h" +#include "sha256.h" +#include "stdio-util.h" +#include "string-util.h" +#include "sync-util.h" +#include "virt.h" + +int id128_from_string_nonzero(const char *s, sd_id128_t *ret) { + sd_id128_t t; + int r; + + assert(ret); + + r = sd_id128_from_string(ASSERT_PTR(s), &t); + if (r < 0) + return r; + + if (sd_id128_is_null(t)) + return -ENXIO; + + *ret = t; + return 0; +} + +bool id128_is_valid(const char *s) { + size_t l; + + assert(s); + + l = strlen(s); + + if (l == SD_ID128_STRING_MAX - 1) + /* Plain formatted 128-bit hex string */ + return in_charset(s, HEXDIGITS); + + if (l == SD_ID128_UUID_STRING_MAX - 1) { + /* Formatted UUID */ + for (size_t i = 0; i < l; i++) { + char c = s[i]; + + if (IN_SET(i, 8, 13, 18, 23)) { + if (c != '-') + return false; + } else if (!ascii_ishex(c)) + return false; + } + return true; + } + + return false; +} + +int id128_read_fd(int fd, Id128Flag f, sd_id128_t *ret) { + char buffer[SD_ID128_UUID_STRING_MAX + 1]; /* +1 is for trailing newline */ + sd_id128_t id; + ssize_t l; + int r; + + assert(fd >= 0); + + /* Reads an 128-bit ID from a file, which may either be in plain format (32 hex digits), or in UUID format, both + * optionally followed by a newline and nothing else. ID files should really be newline terminated, but if they + * aren't that's OK too, following the rule of "Be conservative in what you send, be liberal in what you + * accept". + * + * This returns the following: + * -ENOMEDIUM: an empty string, + * -ENOPKG: "uninitialized" or "uninitialized\n", + * -EUCLEAN: other invalid strings. 
*/ + + l = loop_read(fd, buffer, sizeof(buffer), false); /* we expect a short read of either 32/33 or 36/37 chars */ + if (l < 0) + return (int) l; + if (l == 0) /* empty? */ + return -ENOMEDIUM; + + switch (l) { + + case STRLEN("uninitialized"): + case STRLEN("uninitialized\n"): + return strneq(buffer, "uninitialized\n", l) ? -ENOPKG : -EINVAL; + + case SD_ID128_STRING_MAX: /* plain UUID with trailing newline */ + if (buffer[SD_ID128_STRING_MAX-1] != '\n') + return -EUCLEAN; + + _fallthrough_; + case SD_ID128_STRING_MAX-1: /* plain UUID without trailing newline */ + if (!FLAGS_SET(f, ID128_FORMAT_PLAIN)) + return -EUCLEAN; + + buffer[SD_ID128_STRING_MAX-1] = 0; + break; + + case SD_ID128_UUID_STRING_MAX: /* RFC UUID with trailing newline */ + if (buffer[SD_ID128_UUID_STRING_MAX-1] != '\n') + return -EUCLEAN; + + _fallthrough_; + case SD_ID128_UUID_STRING_MAX-1: /* RFC UUID without trailing newline */ + if (!FLAGS_SET(f, ID128_FORMAT_UUID)) + return -EUCLEAN; + + buffer[SD_ID128_UUID_STRING_MAX-1] = 0; + break; + + default: + return -EUCLEAN; + } + + r = sd_id128_from_string(buffer, &id); + if (r == -EINVAL) + return -EUCLEAN; + if (r < 0) + return r; + + if (FLAGS_SET(f, ID128_REFUSE_NULL) && sd_id128_is_null(id)) + return -ENOMEDIUM; + + if (ret) + *ret = id; + return 0; +} + +int id128_read_at(int dir_fd, const char *path, Id128Flag f, sd_id128_t *ret) { + _cleanup_close_ int fd = -EBADF; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0) + return fd; + + return id128_read_fd(fd, f, ret); +} + +int id128_write_fd(int fd, Id128Flag f, sd_id128_t id) { + char buffer[SD_ID128_UUID_STRING_MAX + 1]; /* +1 is for trailing newline */ + size_t sz; + int r; + + assert(fd >= 0); + assert(IN_SET((f & ID128_FORMAT_ANY), ID128_FORMAT_PLAIN, ID128_FORMAT_UUID)); + + if (FLAGS_SET(f, ID128_REFUSE_NULL) && sd_id128_is_null(id)) + return -ENOMEDIUM; + + 
if (FLAGS_SET(f, ID128_FORMAT_PLAIN)) { + assert_se(sd_id128_to_string(id, buffer)); + sz = SD_ID128_STRING_MAX; + } else { + assert_se(sd_id128_to_uuid_string(id, buffer)); + sz = SD_ID128_UUID_STRING_MAX; + } + + buffer[sz - 1] = '\n'; + r = loop_write(fd, buffer, sz); + if (r < 0) + return r; + + if (FLAGS_SET(f, ID128_SYNC_ON_WRITE)) { + r = fsync_full(fd); + if (r < 0) + return r; + } + + return 0; +} + +int id128_write_at(int dir_fd, const char *path, Id128Flag f, sd_id128_t id) { + _cleanup_close_ int fd = -EBADF; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + fd = xopenat(dir_fd, path, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY|O_TRUNC, /* xopen_flags = */ 0, 0444); + if (fd < 0) + return fd; + + return id128_write_fd(fd, f, id); +} + +void id128_hash_func(const sd_id128_t *p, struct siphash *state) { + siphash24_compress(p, sizeof(sd_id128_t), state); +} + +int id128_compare_func(const sd_id128_t *a, const sd_id128_t *b) { + return memcmp(a, b, 16); +} + +sd_id128_t id128_make_v4_uuid(sd_id128_t id) { + /* Stolen from generate_random_uuid() of drivers/char/random.c + * in the kernel sources */ + + /* Set UUID version to 4 --- truly random generation */ + id.bytes[6] = (id.bytes[6] & 0x0F) | 0x40; + + /* Set the UUID variant to DCE */ + id.bytes[8] = (id.bytes[8] & 0x3F) | 0x80; + + return id; +} + +DEFINE_HASH_OPS(id128_hash_ops, sd_id128_t, id128_hash_func, id128_compare_func); +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(id128_hash_ops_free, sd_id128_t, id128_hash_func, id128_compare_func, free); + +int id128_get_product(sd_id128_t *ret) { + sd_id128_t uuid; + int r; + + assert(ret); + + /* Reads the systems product UUID from DMI or devicetree (where it is located on POWER). This is + * particularly relevant in VM environments, where VM managers typically place a VM uuid there. 
*/ + + r = detect_container(); + if (r < 0) + return r; + if (r > 0) /* Refuse returning this in containers, as this is not a property of our system then, but + * of the host */ + return -ENOENT; + + r = id128_read("/sys/class/dmi/id/product_uuid", ID128_FORMAT_UUID, &uuid); + if (r == -ENOENT) + r = id128_read("/proc/device-tree/vm,uuid", ID128_FORMAT_UUID, &uuid); + if (r == -ENOENT) + r = id128_read("/sys/hypervisor/uuid", ID128_FORMAT_UUID, &uuid); + if (r < 0) + return r; + + if (sd_id128_is_null(uuid) || sd_id128_is_allf(uuid)) + return -EADDRNOTAVAIL; /* Recognizable error */ + + *ret = uuid; + return 0; +} + +sd_id128_t id128_digest(const void *data, size_t size) { + assert(data || size == 0); + + /* Hashes a UUID from some arbitrary data */ + + if (size == SIZE_MAX) + size = strlen(data); + + uint8_t h[SHA256_DIGEST_SIZE]; + sd_id128_t id; + + /* Take the first half of the SHA256 result */ + assert_cc(sizeof(h) >= sizeof(id.bytes)); + memcpy(id.bytes, sha256_direct(data, size, h), sizeof(id.bytes)); + + return id128_make_v4_uuid(id); +} diff --git a/src/libsystemd/sd-id128/id128-util.h b/src/libsystemd/sd-id128/id128-util.h new file mode 100644 index 0000000..53ba50a --- /dev/null +++ b/src/libsystemd/sd-id128/id128-util.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-id128.h" + +#include "errno-util.h" +#include "hash-funcs.h" +#include "macro.h" + +bool id128_is_valid(const char *s) _pure_; + +typedef enum Id128Flag { + ID128_FORMAT_PLAIN = 1 << 0, /* formatted as 32 hex chars as-is */ + ID128_FORMAT_UUID = 1 << 1, /* formatted as 36 character uuid string */ + ID128_FORMAT_ANY = ID128_FORMAT_PLAIN | ID128_FORMAT_UUID, + + ID128_SYNC_ON_WRITE = 1 << 2, /* Sync the file after write. Used only when writing an ID. */ + ID128_REFUSE_NULL = 1 << 3, /* Refuse all zero ID with -ENOMEDIUM. 
*/ +} Id128Flag; + +int id128_from_string_nonzero(const char *s, sd_id128_t *ret); + +int id128_read_fd(int fd, Id128Flag f, sd_id128_t *ret); +int id128_read_at(int dir_fd, const char *path, Id128Flag f, sd_id128_t *ret); +static inline int id128_read(const char *path, Id128Flag f, sd_id128_t *ret) { + return id128_read_at(AT_FDCWD, path, f, ret); +} + +int id128_write_fd(int fd, Id128Flag f, sd_id128_t id); +int id128_write_at(int dir_fd, const char *path, Id128Flag f, sd_id128_t id); +static inline int id128_write(const char *path, Id128Flag f, sd_id128_t id) { + return id128_write_at(AT_FDCWD, path, f, id); +} + +int id128_get_machine(const char *root, sd_id128_t *ret); +int id128_get_machine_at(int rfd, sd_id128_t *ret); + +void id128_hash_func(const sd_id128_t *p, struct siphash *state); +int id128_compare_func(const sd_id128_t *a, const sd_id128_t *b) _pure_; +extern const struct hash_ops id128_hash_ops; +extern const struct hash_ops id128_hash_ops_free; + +sd_id128_t id128_make_v4_uuid(sd_id128_t id); + +int id128_get_product(sd_id128_t *ret); + +sd_id128_t id128_digest(const void *data, size_t size); + +/* A helper to check for the three relevant cases of "machine ID not initialized" */ +#define ERRNO_IS_NEG_MACHINE_ID_UNSET(r) \ + IN_SET(r, \ + -ENOENT, \ + -ENOMEDIUM, \ + -ENOPKG) +_DEFINE_ABS_WRAPPER(MACHINE_ID_UNSET); diff --git a/src/libsystemd/sd-id128/sd-id128.c b/src/libsystemd/sd-id128/sd-id128.c new file mode 100644 index 0000000..9fda79a --- /dev/null +++ b/src/libsystemd/sd-id128/sd-id128.c @@ -0,0 +1,382 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "chase.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "hmac.h" +#include "id128-util.h" +#include "io-util.h" +#include "macro.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "path-util.h" +#include "random-util.h" +#include "stat-util.h" +#include "user-util.h" 
+ +_public_ char *sd_id128_to_string(sd_id128_t id, char s[_SD_ARRAY_STATIC SD_ID128_STRING_MAX]) { + size_t k = 0; + + assert_return(s, NULL); + + for (size_t n = 0; n < sizeof(sd_id128_t); n++) { + s[k++] = hexchar(id.bytes[n] >> 4); + s[k++] = hexchar(id.bytes[n] & 0xF); + } + + assert(k == SD_ID128_STRING_MAX - 1); + s[k] = 0; + + return s; +} + +_public_ char *sd_id128_to_uuid_string(sd_id128_t id, char s[_SD_ARRAY_STATIC SD_ID128_UUID_STRING_MAX]) { + size_t k = 0; + + assert_return(s, NULL); + + /* Similar to sd_id128_to_string() but formats the result as UUID instead of plain hex chars */ + + for (size_t n = 0; n < sizeof(sd_id128_t); n++) { + + if (IN_SET(n, 4, 6, 8, 10)) + s[k++] = '-'; + + s[k++] = hexchar(id.bytes[n] >> 4); + s[k++] = hexchar(id.bytes[n] & 0xF); + } + + assert(k == SD_ID128_UUID_STRING_MAX - 1); + s[k] = 0; + + return s; +} + +_public_ int sd_id128_from_string(const char *s, sd_id128_t *ret) { + size_t n, i; + sd_id128_t t; + bool is_guid = false; + + assert_return(s, -EINVAL); + + for (n = 0, i = 0; n < sizeof(sd_id128_t);) { + int a, b; + + if (s[i] == '-') { + /* Is this a GUID? Then be nice, and skip over + * the dashes */ + + if (i == 8) + is_guid = true; + else if (IN_SET(i, 13, 18, 23)) { + if (!is_guid) + return -EINVAL; + } else + return -EINVAL; + + i++; + continue; + } + + a = unhexchar(s[i++]); + if (a < 0) + return -EINVAL; + + b = unhexchar(s[i++]); + if (b < 0) + return -EINVAL; + + t.bytes[n++] = (a << 4) | b; + } + + if (i != (is_guid ? 
SD_ID128_UUID_STRING_MAX : SD_ID128_STRING_MAX) - 1) + return -EINVAL; + + if (s[i] != 0) + return -EINVAL; + + if (ret) + *ret = t; + return 0; +} + +_public_ int sd_id128_string_equal(const char *s, sd_id128_t id) { + sd_id128_t parsed; + int r; + + if (!s) + return false; + + /* Checks if the specified string matches a valid string representation of the specified 128 bit ID/uuid */ + + r = sd_id128_from_string(s, &parsed); + if (r < 0) + return r; + + return sd_id128_equal(parsed, id); +} + +_public_ int sd_id128_get_machine(sd_id128_t *ret) { + static thread_local sd_id128_t saved_machine_id = {}; + int r; + + if (sd_id128_is_null(saved_machine_id)) { + r = id128_read("/etc/machine-id", ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, &saved_machine_id); + if (r < 0) + return r; + } + + if (ret) + *ret = saved_machine_id; + return 0; +} + +int id128_get_machine_at(int rfd, sd_id128_t *ret) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + + r = dir_fd_is_root_or_cwd(rfd); + if (r < 0) + return r; + if (r > 0) + return sd_id128_get_machine(ret); + + fd = chase_and_openat(rfd, "/etc/machine-id", CHASE_AT_RESOLVE_IN_ROOT, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL); + if (fd < 0) + return fd; + + return id128_read_fd(fd, ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, ret); +} + +int id128_get_machine(const char *root, sd_id128_t *ret) { + _cleanup_close_ int fd = -EBADF; + + if (empty_or_root(root)) + return sd_id128_get_machine(ret); + + fd = chase_and_open("/etc/machine-id", root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL); + if (fd < 0) + return fd; + + return id128_read_fd(fd, ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, ret); +} + +_public_ int sd_id128_get_boot(sd_id128_t *ret) { + static thread_local sd_id128_t saved_boot_id = {}; + int r; + + if (sd_id128_is_null(saved_boot_id)) { + r = id128_read("/proc/sys/kernel/random/boot_id", ID128_FORMAT_UUID | ID128_REFUSE_NULL, &saved_boot_id); + if (r == -ENOENT && proc_mounted() == 0) + 
return -ENOSYS; + if (r < 0) + return r; + } + + if (ret) + *ret = saved_boot_id; + return 0; +} + +static int get_invocation_from_keyring(sd_id128_t *ret) { + _cleanup_free_ char *description = NULL; + char *d, *p, *g, *u, *e; + unsigned long perms; + key_serial_t key; + size_t sz = 256; + uid_t uid; + gid_t gid; + int r, c; + +#define MAX_PERMS ((unsigned long) (KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH| \ + KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH)) + + assert(ret); + + key = request_key("user", "invocation_id", NULL, 0); + if (key == -1) { + /* Keyring support not available? No invocation key stored? */ + if (IN_SET(errno, ENOSYS, ENOKEY)) + return -ENXIO; + + return -errno; + } + + for (;;) { + description = new(char, sz); + if (!description) + return -ENOMEM; + + c = keyctl(KEYCTL_DESCRIBE, key, (unsigned long) description, sz, 0); + if (c < 0) + return -errno; + + if ((size_t) c <= sz) + break; + + sz = c; + free(description); + } + + /* The kernel returns a final NUL in the string, verify that. 
*/ + assert(description[c-1] == 0); + + /* Chop off the final description string */ + d = strrchr(description, ';'); + if (!d) + return -EUCLEAN; + *d = 0; + + /* Look for the permissions */ + p = strrchr(description, ';'); + if (!p) + return -EUCLEAN; + + errno = 0; + perms = strtoul(p + 1, &e, 16); + if (errno > 0) + return -errno; + if (e == p + 1) /* Read at least one character */ + return -EUCLEAN; + if (e != d) /* Must reached the end */ + return -EUCLEAN; + + if ((perms & ~MAX_PERMS) != 0) + return -EPERM; + + *p = 0; + + /* Look for the group ID */ + g = strrchr(description, ';'); + if (!g) + return -EUCLEAN; + r = parse_gid(g + 1, &gid); + if (r < 0) + return r; + if (gid != 0) + return -EPERM; + *g = 0; + + /* Look for the user ID */ + u = strrchr(description, ';'); + if (!u) + return -EUCLEAN; + r = parse_uid(u + 1, &uid); + if (r < 0) + return r; + if (uid != 0) + return -EPERM; + + c = keyctl(KEYCTL_READ, key, (unsigned long) ret, sizeof(sd_id128_t), 0); + if (c < 0) + return -errno; + if (c != sizeof(sd_id128_t)) + return -EUCLEAN; + + return 0; +} + +static int get_invocation_from_environment(sd_id128_t *ret) { + const char *e; + int r; + + assert(ret); + + e = secure_getenv("INVOCATION_ID"); + if (!e) + return -ENXIO; + + r = sd_id128_from_string(e, ret); + return r == -EINVAL ? -EUCLEAN : r; +} + +_public_ int sd_id128_get_invocation(sd_id128_t *ret) { + static thread_local sd_id128_t saved_invocation_id = {}; + int r; + + if (sd_id128_is_null(saved_invocation_id)) { + /* We first check the environment. The environment variable is primarily relevant for user + * services, and sufficiently safe as long as no privilege boundary is involved. */ + r = get_invocation_from_environment(&saved_invocation_id); + if (r == -ENXIO) + /* The kernel keyring is relevant for system services (as for user services we don't + * store the invocation ID in the keyring, as there'd be no trust benefit in that). 
*/ + r = get_invocation_from_keyring(&saved_invocation_id); + if (r < 0) + return r; + + if (sd_id128_is_null(saved_invocation_id)) + return -ENOMEDIUM; + } + + if (ret) + *ret = saved_invocation_id; + return 0; +} + +_public_ int sd_id128_randomize(sd_id128_t *ret) { + sd_id128_t t; + + assert_return(ret, -EINVAL); + + random_bytes(&t, sizeof(t)); + + /* Turn this into a valid v4 UUID, to be nice. Note that we + * only guarantee this for newly generated UUIDs, not for + * pre-existing ones. */ + + *ret = id128_make_v4_uuid(t); + return 0; +} + +_public_ int sd_id128_get_app_specific(sd_id128_t base, sd_id128_t app_id, sd_id128_t *ret) { + assert_cc(sizeof(sd_id128_t) < SHA256_DIGEST_SIZE); /* Check that we don't need to pad with zeros. */ + union { + uint8_t hmac[SHA256_DIGEST_SIZE]; + sd_id128_t result; + } buf; + + assert_return(ret, -EINVAL); + assert_return(!sd_id128_is_null(app_id), -ENXIO); + + hmac_sha256(&base, sizeof(base), &app_id, sizeof(app_id), buf.hmac); + + /* Take only the first half. 
*/ + *ret = id128_make_v4_uuid(buf.result); + return 0; +} + +_public_ int sd_id128_get_machine_app_specific(sd_id128_t app_id, sd_id128_t *ret) { + sd_id128_t id; + int r; + + assert_return(ret, -EINVAL); + + r = sd_id128_get_machine(&id); + if (r < 0) + return r; + + return sd_id128_get_app_specific(id, app_id, ret); +} + +_public_ int sd_id128_get_boot_app_specific(sd_id128_t app_id, sd_id128_t *ret) { + sd_id128_t id; + int r; + + assert_return(ret, -EINVAL); + + r = sd_id128_get_boot(&id); + if (r < 0) + return r; + + return sd_id128_get_app_specific(id, app_id, ret); +} diff --git a/src/libsystemd/sd-journal/audit-type.c b/src/libsystemd/sd-journal/audit-type.c new file mode 100644 index 0000000..122cdf5 --- /dev/null +++ b/src/libsystemd/sd-journal/audit-type.c @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "audit-type.h" +#include "missing_audit.h" + +#include "audit_type-to-name.h" diff --git a/src/libsystemd/sd-journal/audit-type.h b/src/libsystemd/sd-journal/audit-type.h new file mode 100644 index 0000000..f2c4898 --- /dev/null +++ b/src/libsystemd/sd-journal/audit-type.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "alloc-util.h" +#include "macro.h" + +const char *audit_type_to_string(int type); +int audit_type_from_string(const char *s); + +/* This is inspired by DNS TYPEnnn formatting */ +#define audit_type_name_alloca(type) \ + ({ \ + const char *_s_; \ + _s_ = audit_type_to_string(type); \ + if (!_s_) { \ + _s_ = newa(char, STRLEN("AUDIT") + DECIMAL_STR_MAX(int)); \ + sprintf((char*) _s_, "AUDIT%04i", type); \ + } \ + _s_; \ + }) diff --git a/src/libsystemd/sd-journal/audit_type-to-name.awk b/src/libsystemd/sd-journal/audit_type-to-name.awk new file mode 100644 index 0000000..a859c44 --- /dev/null +++ b/src/libsystemd/sd-journal/audit_type-to-name.awk @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "const char 
*audit_type_to_string(int type) {" + print " switch (type) {" +} +{ + printf " case AUDIT_%s: return \"%s\";\n", $1, $1 +} +END{ + print " default: return NULL;" + print " }" + print "}" +} diff --git a/src/libsystemd/sd-journal/catalog.c b/src/libsystemd/sd-journal/catalog.c new file mode 100644 index 0000000..ae91534 --- /dev/null +++ b/src/libsystemd/sd-journal/catalog.c @@ -0,0 +1,743 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "catalog.h" +#include "conf-files.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "log.h" +#include "memory-util.h" +#include "mkdir.h" +#include "path-util.h" +#include "siphash24.h" +#include "sort-util.h" +#include "sparse-endian.h" +#include "strbuf.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" + +const char * const catalog_file_dirs[] = { + "/usr/local/lib/systemd/catalog/", + "/usr/lib/systemd/catalog/", + NULL +}; + +#define CATALOG_SIGNATURE { 'R', 'H', 'H', 'H', 'K', 'S', 'L', 'P' } + +typedef struct CatalogHeader { + uint8_t signature[8]; /* "RHHHKSLP" */ + le32_t compatible_flags; + le32_t incompatible_flags; + le64_t header_size; + le64_t n_items; + le64_t catalog_item_size; +} CatalogHeader; + +typedef struct CatalogItem { + sd_id128_t id; + char language[32]; /* One byte is used for termination, so the maximum allowed + * length of the string is actually 31 bytes. 
*/ + le64_t offset; +} CatalogItem; + +static void catalog_hash_func(const CatalogItem *i, struct siphash *state) { + siphash24_compress(&i->id, sizeof(i->id), state); + siphash24_compress_string(i->language, state); +} + +static int catalog_compare_func(const CatalogItem *a, const CatalogItem *b) { + unsigned k; + int r; + + for (k = 0; k < ELEMENTSOF(b->id.bytes); k++) { + r = CMP(a->id.bytes[k], b->id.bytes[k]); + if (r != 0) + return r; + } + + return strcmp(a->language, b->language); +} + +DEFINE_HASH_OPS(catalog_hash_ops, CatalogItem, catalog_hash_func, catalog_compare_func); + +static bool next_header(const char **s) { + const char *e; + + e = strchr(*s, '\n'); + + /* Unexpected end */ + if (!e) + return false; + + /* End of headers */ + if (e == *s) + return false; + + *s = e + 1; + return true; +} + +static const char *skip_header(const char *s) { + while (next_header(&s)) + ; + return s; +} + +static char *combine_entries(const char *one, const char *two) { + const char *b1, *b2; + size_t l1, l2, n; + char *dest, *p; + + /* Find split point of headers to body */ + b1 = skip_header(one); + b2 = skip_header(two); + + l1 = strlen(one); + l2 = strlen(two); + dest = new(char, l1 + l2 + 1); + if (!dest) { + log_oom(); + return NULL; + } + + p = dest; + + /* Headers from @one */ + n = b1 - one; + p = mempcpy(p, one, n); + + /* Headers from @two, these will only be found if not present above */ + n = b2 - two; + p = mempcpy(p, two, n); + + /* Body from @one */ + n = l1 - (b1 - one); + if (n > 0) + p = mempcpy(p, b1, n); + /* Body from @two */ + else { + n = l2 - (b2 - two); + p = mempcpy(p, b2, n); + } + + assert(p - dest <= (ptrdiff_t)(l1 + l2)); + p[0] = '\0'; + return dest; +} + +static int finish_item( + OrderedHashmap *h, + sd_id128_t id, + const char *language, + char *payload, size_t payload_size) { + + _cleanup_free_ CatalogItem *i = NULL; + _cleanup_free_ char *combined = NULL; + char *prev; + int r; + + assert(h); + assert(payload); + 
assert(payload_size > 0); + + i = new0(CatalogItem, 1); + if (!i) + return log_oom(); + + i->id = id; + if (language) { + assert(strlen(language) > 1 && strlen(language) < 32); + strcpy(i->language, language); /* the assert above bounds the copy to the 32-byte field */ + } + + prev = ordered_hashmap_get(h, i); + if (prev) { + /* Already have such an item, combine them */ + combined = combine_entries(payload, prev); + if (!combined) + return log_oom(); + + r = ordered_hashmap_update(h, i, combined); /* i is only a lookup key in this branch and is freed on return */ + if (r < 0) + return log_error_errno(r, "Failed to update catalog item: %m"); + + TAKE_PTR(combined); + free(prev); + } else { + /* A new item */ + combined = memdup(payload, payload_size + 1); /* the extra byte is the terminating NUL */ + if (!combined) + return log_oom(); + + r = ordered_hashmap_put(h, i, combined); + if (r < 0) + return log_error_errno(r, "Failed to insert catalog item: %m"); + + TAKE_PTR(i); /* key and value are no longer freed by this function */ + TAKE_PTR(combined); + } + + return 0; +} + +int catalog_file_lang(const char* filename, char **lang) { /* extracts LANG from a name ending in ".LANG.catalog": returns 1 and sets *lang, 0 if there is no language part, -ENOMEM on allocation failure */ + char *beg, *end, *_lang; + + end = endswith(filename, ".catalog"); + if (!end || end == filename) /* nothing precedes the suffix, so stepping back from end would leave the buffer */ + return 0; + + beg = end - 1; + while (beg > filename && !IN_SET(*beg, '.', '/') && end - beg < 32) + beg--; + + if (*beg != '.'
|| end <= beg + 1) + return 0; + + _lang = strndup(beg + 1, end - beg - 1); /* the text between the dot and the suffix */ + if (!_lang) + return -ENOMEM; + + *lang = _lang; + return 1; +} + +static int catalog_entry_lang( + const char* filename, + unsigned line, + const char* t, + const char* deflang, + char **ret) { /* validates t (2 to 31 characters) and stores a copy in *ret; *ret is left untouched when t equals deflang */ + + size_t c; + char *z; + + c = strlen(t); + if (c < 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] Language too short.", filename, line); + if (c > 31) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] language too long.", filename, line); + + if (deflang) { + if (streq(t, deflang)) { + log_warning("[%s:%u] language specified unnecessarily", filename, line); + return 0; + } + + log_warning("[%s:%u] language differs from default for file", filename, line); + } + + z = strdup(t); + if (!z) + return -ENOMEM; + + *ret = z; + return 0; +} + +int catalog_import_file(OrderedHashmap *h, const char *path) { /* parses the catalog file at path and adds each entry to h through finish_item() */ + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *payload = NULL; + size_t payload_size = 0; + unsigned n = 0; /* current line number, used in messages */ + sd_id128_t id; + _cleanup_free_ char *deflang = NULL, *lang = NULL; + bool got_id = false, empty_line = true; + int r; + + assert(h); + assert(path); + + f = fopen(path, "re"); + if (!f) + return log_error_errno(errno, "Failed to open file %s: %m", path); + + r = catalog_file_lang(path, &deflang); + if (r < 0) + log_error_errno(r, "Failed to determine language for file %s: %m", path); /* logged only, the import goes on without a default language */ + if (r == 1) + log_debug("File %s has language %s.", path, deflang); + + for (;;) { + _cleanup_free_ char *line = NULL; + size_t line_len; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read file %s: %m", path); + if (r == 0) + break; + + n++; + + if (isempty(line)) { + empty_line = true; + continue; + } + + if (strchr(COMMENTS, line[0])) + continue; + + if (empty_line && /* an entry header is only looked for right after a blank line or at the start of the file */ + strlen(line) >= 2+1+32 && /* "-- " followed by the 32 characters handed to sd_id128_from_string() */ + line[0] == '-' && + line[1] == '-' && + line[2] == ' ' && + IN_SET(line[2+1+32], ' ', '\0')) { + + bool with_language; +
sd_id128_t jd; + + /* New entry */ + + with_language = line[2+1+32] != '\0'; + line[2+1+32] = '\0'; /* cut after the 32 id characters so they can be parsed on their own */ + + if (sd_id128_from_string(line + 2 + 1, &jd) >= 0) { /* NOTE(review): when parsing fails the line stays cut here and is then appended as payload; confirm that is intended */ + + if (got_id) { /* flush the previous entry before starting the next one */ + if (payload_size == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] No payload text.", + path, + n); + + r = finish_item(h, id, lang ?: deflang, payload, payload_size); + if (r < 0) + return r; + + lang = mfree(lang); + payload_size = 0; + } + + if (with_language) { + char *t; + + t = strstrip(line + 2 + 1 + 32 + 1); + r = catalog_entry_lang(path, n, t, deflang, &lang); + if (r < 0) + return r; + } + + got_id = true; + empty_line = false; + id = jd; + + continue; + } + } + + /* Payload */ + if (!got_id) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] Got payload before ID.", + path, n); + + line_len = strlen(line); + if (!GREEDY_REALLOC(payload, payload_size + (empty_line ? 1 : 0) + line_len + 1 + 1)) /* room for the optional blank-line '\n', the line, its '\n' and the NUL */ + return log_oom(); + + if (empty_line) /* a run of blank lines inside an entry becomes a single '\n' */ + payload[payload_size++] = '\n'; + memcpy(payload + payload_size, line, line_len); + payload_size += line_len; + payload[payload_size++] = '\n'; + payload[payload_size] = '\0'; + + empty_line = false; + } + + if (got_id) { /* flush the last entry of the file */ + if (payload_size == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "[%s:%u] No payload text.", + path, n); + + r = finish_item(h, id, lang ?: deflang, payload, payload_size); + if (r < 0) + return r; + } + + return 0; +} + +static int64_t write_catalog( + const char *database, + struct strbuf *sb, + CatalogItem *items, + size_t n) { /* writes header, item table and strings to a temporary file and renames it to database; returns the ftello() position or a negative errno */ + + _cleanup_fclose_ FILE *w = NULL; + _cleanup_free_ char *p = NULL; + CatalogHeader header; + size_t k; + int r; + + r = mkdir_parents(database, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create parent directories of %s: %m", database); + + r = fopen_temporary(database, &w, &p); + if (r < 0) + return log_error_errno(r, "Failed to open database for writing: %s: %m", + database); + + header = (CatalogHeader) { + .signature = CATALOG_SIGNATURE, +
.header_size = htole64(CONST_ALIGN_TO(sizeof(CatalogHeader), 8)), + .catalog_item_size = htole64(sizeof(CatalogItem)), + .n_items = htole64(n), + }; + + r = -EIO; /* result for the short-write paths below, which do not set r themselves */ + + k = fwrite(&header, 1, sizeof(header), w); + if (k != sizeof(header)) { + log_error("%s: failed to write header.", p); + goto error; + } + + k = fwrite(items, 1, n * sizeof(CatalogItem), w); + if (k != n * sizeof(CatalogItem)) { + log_error("%s: failed to write database.", p); + goto error; + } + + k = fwrite(sb->buf, 1, sb->len, w); + if (k != sb->len) { + log_error("%s: failed to write strings.", p); + goto error; + } + + r = fflush_and_check(w); + if (r < 0) { + log_error_errno(r, "%s: failed to write database: %m", p); + goto error; + } + + (void) fchmod(fileno(w), 0644); /* result deliberately ignored */ + + if (rename(p, database) < 0) { + r = log_error_errno(errno, "rename (%s -> %s) failed: %m", p, database); + goto error; + } + + return ftello(w); /* stream position after the three writes; catalog_update() logs it as the total size */ + +error: + (void) unlink(p); /* drop the temporary file */ + return r; +} + +int catalog_update(const char* database, const char* root, const char* const* dirs) { /* imports every ".catalog" file listed for dirs, sorts the entries and writes them to database */ + _cleanup_strv_free_ char **files = NULL; + _cleanup_(strbuf_freep) struct strbuf *sb = NULL; + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + _cleanup_free_ CatalogItem *items = NULL; + ssize_t offset; + char *payload; + CatalogItem *i; + unsigned n; + int r; + int64_t sz; + + h = ordered_hashmap_new(&catalog_hash_ops); + sb = strbuf_new(); + if (!h || !sb) + return log_oom(); + + r = conf_files_list_strv(&files, ".catalog", root, 0, dirs); + if (r < 0) + return log_error_errno(r, "Failed to get catalog files: %m"); + + STRV_FOREACH(f, files) { + log_debug("Reading file '%s'", *f); + r = catalog_import_file(h, *f); + if (r < 0) + return log_error_errno(r, "Failed to import file '%s': %m", *f); + } + + if (ordered_hashmap_size(h) <= 0) { /* nothing imported: return without writing a database */ + log_info("No items in catalog."); + return 0; + } else + log_debug("Found %u items in catalog.", ordered_hashmap_size(h)); + + items = new(CatalogItem, ordered_hashmap_size(h)); + if (!items)
+ return log_oom(); + + n = 0; + ORDERED_HASHMAP_FOREACH_KEY(payload, i, h) { + log_trace("Found " SD_ID128_FORMAT_STR ", language %s", + SD_ID128_FORMAT_VAL(i->id), + isempty(i->language) ? "C" : i->language); + + offset = strbuf_add_string(sb, payload, strlen(payload)); + if (offset < 0) + return log_oom(); + + i->offset = htole64((uint64_t) offset); + items[n++] = *i; /* flat copy of the key for the on-disk table */ + } + + assert(n == ordered_hashmap_size(h)); + typesafe_qsort(items, n, catalog_compare_func); /* find_id() looks records up with bsearch() and the same comparator */ + + strbuf_complete(sb); + + sz = write_catalog(database, sb, items, n); + if (sz < 0) + return log_error_errno(sz, "Failed to write %s: %m", database); + + log_debug("%s: wrote %u items, with %zu bytes of strings, %"PRIi64" total size.", + database, n, sb->len, sz); + return 0; +} + +static int open_mmap(const char *database, int *_fd, struct stat *_st, void **_p) { /* maps database read-only and validates its header; on success the caller receives the fd, the stat data and the mapping and has to munmap() it */ + _cleanup_close_ int fd = -EBADF; + const CatalogHeader *h; + struct stat st; + void *p; + + assert(_fd); + assert(_st); + assert(_p); + + fd = open(database, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if (st.st_size < (off_t) sizeof(CatalogHeader) || file_offset_beyond_memory_size(st.st_size)) + return -EINVAL; + + p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + return -errno; + + h = p; /* every header field below comes from the file and is checked before use */ + if (memcmp(h->signature, (const uint8_t[]) CATALOG_SIGNATURE, sizeof(h->signature)) != 0 || + le64toh(h->header_size) < sizeof(CatalogHeader) || + le64toh(h->catalog_item_size) < sizeof(CatalogItem) || /* also guarantees a non-zero divisor below */ + h->incompatible_flags != 0 || + le64toh(h->n_items) <= 0 || /* the table must fit in the file; the check subtracts and divides so that huge field values cannot wrap around */ + le64toh(h->header_size) > (uint64_t) st.st_size || ((uint64_t) st.st_size - le64toh(h->header_size)) / le64toh(h->catalog_item_size) < le64toh(h->n_items)) { + munmap(p, st.st_size); + return -EBADMSG; + } + + *_fd = TAKE_FD(fd); + *_st = st; + *_p = p; + + return 0; +} + +static const char *find_id(void *p, sd_id128_t id) { /* looks id up in the mapped database p, preferring the LC_MESSAGES language; returns the entry text or NULL */ + CatalogItem *f = NULL, key = { .id = id }; + const CatalogHeader *h = p; + const char *loc; + + loc = setlocale(LC_MESSAGES,
NULL); + if (!isempty(loc) && !STR_IN_SET(loc, "C", "POSIX")) { + size_t len; + + len = strcspn(loc, ".@"); /* keep only the part before the first '.' or '@' */ + if (len > sizeof(key.language) - 1) + log_debug("LC_MESSAGES value too long, ignoring: \"%.*s\"", (int) len, loc); + else { + strncpy(key.language, loc, len); + key.language[len] = '\0'; + + f = bsearch(&key, /* first try: the locale name as cut above */ + (const uint8_t*) p + le64toh(h->header_size), + le64toh(h->n_items), + le64toh(h->catalog_item_size), + (comparison_fn_t) catalog_compare_func); + if (!f) { + char *e; + + e = strchr(key.language, '_'); + if (e) { + *e = 0; /* second try: only the part before '_' */ + f = bsearch(&key, + (const uint8_t*) p + le64toh(h->header_size), + le64toh(h->n_items), + le64toh(h->catalog_item_size), + (comparison_fn_t) catalog_compare_func); + } + } + } + } + + if (!f) { + zero(key.language); /* last try: the entry stored with an empty language */ + f = bsearch(&key, + (const uint8_t*) p + le64toh(h->header_size), + le64toh(h->n_items), + le64toh(h->catalog_item_size), + (comparison_fn_t) catalog_compare_func); + } + + if (!f) + return NULL; + + return (const char*) p + + le64toh(h->header_size) + + le64toh(h->n_items) * le64toh(h->catalog_item_size) + + le64toh(f->offset); /* the strings start right behind the item table */ +} + +int catalog_get(const char* database, sd_id128_t id, char **_text) { /* stores a strdup()ed copy of the entry text in *_text; -ENOENT if the id is not found, -ENOMEM on allocation failure */ + _cleanup_close_ int fd = -EBADF; + void *p = NULL; + struct stat st = {}; + char *text = NULL; + int r; + const char *s; + + assert(_text); + + r = open_mmap(database, &fd, &st, &p); + if (r < 0) + return r; + + s = find_id(p, id); + if (!s) { + r = -ENOENT; + goto finish; + } + + text = strdup(s); /* s points into the mapping, which is released below */ + if (!text) { + r = -ENOMEM; + goto finish; + } + + *_text = text; + r = 0; + +finish: + if (p) + munmap(p, st.st_size); + + return r; +} + +static char *find_header(const char *s, const char *header) { /* returns a newly allocated copy of the value of the first header line starting with header, or NULL */ + + for (;;) { + const char *v; + + v = startswith(s, header); + if (v) { + v += strspn(v, WHITESPACE); + return strndup(v, strcspn(v, NEWLINE)); + } + + if (!next_header(&s)) /* no further header lines to inspect */ + return NULL; + } +} + +static void dump_catalog_entry(FILE *f, sd_id128_t id, const char *s, bool oneline) { /* prints one entry to f, either as a one-line summary or in full */ + if (oneline) { + _cleanup_free_
char *subject = NULL, *defined_by = NULL; + + subject = find_header(s, "Subject:"); + defined_by = find_header(s, "Defined-By:"); + + fprintf(f, SD_ID128_FORMAT_STR " %s: %s\n", + SD_ID128_FORMAT_VAL(id), + strna(defined_by), strna(subject)); + } else + fprintf(f, "-- " SD_ID128_FORMAT_STR "\n%s\n", + SD_ID128_FORMAT_VAL(id), s); +} + +int catalog_list(FILE *f, const char *database, bool oneline) { + _cleanup_close_ int fd = -EBADF; + void *p = NULL; + struct stat st; + const CatalogHeader *h; + const CatalogItem *items; + int r; + unsigned n; + sd_id128_t last_id; + bool last_id_set = false; + + r = open_mmap(database, &fd, &st, &p); + if (r < 0) + return r; + + h = p; + items = (const CatalogItem*) ((const uint8_t*) p + le64toh(h->header_size)); + + for (n = 0; n < le64toh(h->n_items); n++) { + const char *s; + + if (last_id_set && sd_id128_equal(last_id, items[n].id)) + continue; + + assert_se(s = find_id(p, items[n].id)); + + dump_catalog_entry(f, items[n].id, s, oneline); + + last_id_set = true; + last_id = items[n].id; + } + + munmap(p, st.st_size); + + return 0; +} + +int catalog_list_items(FILE *f, const char *database, bool oneline, char **items) { + int r = 0; + + STRV_FOREACH(item, items) { + sd_id128_t id; + int k; + _cleanup_free_ char *msg = NULL; + + k = sd_id128_from_string(*item, &id); + if (k < 0) { + log_error_errno(k, "Failed to parse id128 '%s': %m", *item); + if (r == 0) + r = k; + continue; + } + + k = catalog_get(database, id, &msg); + if (k < 0) { + log_full_errno(k == -ENOENT ? 
LOG_NOTICE : LOG_ERR, k, + "Failed to retrieve catalog entry for '%s': %m", *item); + if (r == 0) + r = k; + continue; + } + + dump_catalog_entry(f, id, msg, oneline); + } + + return r; +} diff --git a/src/libsystemd/sd-journal/catalog.h b/src/libsystemd/sd-journal/catalog.h new file mode 100644 index 0000000..df27869 --- /dev/null +++ b/src/libsystemd/sd-journal/catalog.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-id128.h" + +#include "hashmap.h" +#include "strbuf.h" + +int catalog_import_file(OrderedHashmap *h, const char *path); +int catalog_update(const char* database, const char* root, const char* const* dirs); +int catalog_get(const char* database, sd_id128_t id, char **data); +int catalog_list(FILE *f, const char* database, bool oneline); +int catalog_list_items(FILE *f, const char* database, bool oneline, char **items); +int catalog_file_lang(const char *filename, char **lang); +extern const char * const catalog_file_dirs[]; +extern const struct hash_ops catalog_hash_ops; diff --git a/src/libsystemd/sd-journal/fsprg.c b/src/libsystemd/sd-journal/fsprg.c new file mode 100644 index 0000000..e86be6a --- /dev/null +++ b/src/libsystemd/sd-journal/fsprg.c @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * + * fsprg v0.1 - (seekable) forward-secure pseudorandom generator + * Copyright © 2012 B. Poettering + * Contact: fsprg@point-at-infinity.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +/* + * See "Practical Secure Logging: Seekable Sequential Key Generators" + * by G. A. Marson, B. Poettering for details: + * + * http://eprint.iacr.org/2013/397 + */ + +#include + +#include "fsprg.h" +#include "gcrypt-util.h" +#include "memory-util.h" + +#define ISVALID_SECPAR(secpar) (((secpar) % 16 == 0) && ((secpar) >= 16) && ((secpar) <= 16384)) +#define VALIDATE_SECPAR(secpar) assert(ISVALID_SECPAR(secpar)); + +#define RND_HASH GCRY_MD_SHA256 +#define RND_GEN_P 0x01 +#define RND_GEN_Q 0x02 +#define RND_GEN_X 0x03 + +#pragma GCC diagnostic ignored "-Wpointer-arith" +/* TODO: remove void* arithmetic and this work-around */ + +/******************************************************************************/ + +static void mpi_export(void *buf, size_t buflen, const gcry_mpi_t x) { + unsigned len; + size_t nwritten; + + assert(gcry_mpi_cmp_ui(x, 0) >= 0); + len = (gcry_mpi_get_nbits(x) + 7) / 8; + assert(len <= buflen); + memzero(buf, buflen); + gcry_mpi_print(GCRYMPI_FMT_USG, buf + (buflen - len), len, &nwritten, x); + assert(nwritten == len); +} + +static gcry_mpi_t mpi_import(const void *buf, size_t buflen) { + gcry_mpi_t h; + _unused_ unsigned len; + + assert_se(gcry_mpi_scan(&h, GCRYMPI_FMT_USG, buf, buflen, NULL) == 0); + len = (gcry_mpi_get_nbits(h) + 7) / 8; + assert(len <= buflen); + assert(gcry_mpi_cmp_ui(h, 0) >= 0); + + return h; +} + +static void uint64_export(void *buf, size_t buflen, uint64_t x) { + assert(buflen == 8); + ((uint8_t*) buf)[0] = (x >> 56) & 0xff; + ((uint8_t*) buf)[1] = (x >> 48) & 0xff; + ((uint8_t*) buf)[2] = (x >> 40) & 0xff; + ((uint8_t*) buf)[3] = (x >> 32) & 0xff; + ((uint8_t*) buf)[4] = (x >> 24) & 0xff; + ((uint8_t*) buf)[5] = (x >> 16) & 0xff; + ((uint8_t*) buf)[6] = (x >> 8) & 0xff; + 
/* Decode the 8 bytes at buf as an unsigned 64-bit integer, most significant byte first. This is the
 * inverse of uint64_export(). buflen must be 8; it only exists so that call sites spell out the width. */
static uint64_t uint64_import(const void *buf, size_t buflen) {
        const uint8_t *bytes = buf;
        uint64_t value = 0;

        assert(buflen == 8);

        /* Shift in one byte at a time, starting with the most significant one. */
        for (size_t i = 0; i < 8; i++)
                value = (value << 8) | bytes[i];

        return value;
}
buflen : olen; + memcpy(buf, gcry_md_read(hd2, RND_HASH), cpylen); + gcry_md_close(hd2); + buf += cpylen; + buflen -= cpylen; + } + gcry_md_close(hd); +} + +/* deterministically generate from seed/idx a prime of length `bits' that is 3 (mod 4) */ +static gcry_mpi_t genprime3mod4(int bits, const void *seed, size_t seedlen, uint32_t idx) { + size_t buflen = bits / 8; + uint8_t buf[buflen]; + gcry_mpi_t p; + + assert(bits % 8 == 0); + assert(buflen > 0); + + det_randomize(buf, buflen, seed, seedlen, idx); + buf[0] |= 0xc0; /* set upper two bits, so that n=pq has maximum size */ + buf[buflen - 1] |= 0x03; /* set lower two bits, to have result 3 (mod 4) */ + + p = mpi_import(buf, buflen); + while (gcry_prime_check(p, 0)) + gcry_mpi_add_ui(p, p, 4); + + return p; +} + +/* deterministically generate from seed/idx a quadratic residue (mod n) */ +static gcry_mpi_t gensquare(const gcry_mpi_t n, const void *seed, size_t seedlen, uint32_t idx, unsigned secpar) { + size_t buflen = secpar / 8; + uint8_t buf[buflen]; + gcry_mpi_t x; + + det_randomize(buf, buflen, seed, seedlen, idx); + buf[0] &= 0x7f; /* clear upper bit, so that we have x < n */ + x = mpi_import(buf, buflen); + assert(gcry_mpi_cmp(x, n) < 0); + gcry_mpi_mulm(x, x, x, n); + return x; +} + +/* compute 2^m (mod phi(p)), for a prime p */ +static gcry_mpi_t twopowmodphi(uint64_t m, const gcry_mpi_t p) { + gcry_mpi_t phi, r; + int n; + + phi = gcry_mpi_new(0); + gcry_mpi_sub_ui(phi, p, 1); + + /* count number of used bits in m */ + for (n = 0; (1ULL << n) <= m; n++) + ; + + r = gcry_mpi_new(0); + gcry_mpi_set_ui(r, 1); + while (n) { /* square and multiply algorithm for fast exponentiation */ + n--; + gcry_mpi_mulm(r, r, r, phi); + if (m & ((uint64_t)1 << n)) { + gcry_mpi_add(r, r, r); + if (gcry_mpi_cmp(r, phi) >= 0) + gcry_mpi_sub(r, r, phi); + } + } + + gcry_mpi_release(phi); + return r; +} + +/* Decompose $x \in Z_n$ into $(xp,xq) \in Z_p \times Z_q$ using Chinese Remainder Theorem */ +static void 
/* Decode the two-byte security parameter header found at buf. The header stores secpar / 16 - 1 with
 * the high byte first (see store_secpar()), so the original value is 16 * (stored + 1), truncated to 16
 * bits by the return type. */
static uint16_t read_secpar(const void *buf) {
        const uint8_t *hdr = buf;
        uint16_t stored;

        stored = (uint16_t) ((hdr[0] << 8) | hdr[1]);

        return (uint16_t) (16 * (stored + 1));
}
GCRY_STRONG_RANDOM); + seed = iseed; + seedlen = FSPRG_RECOMMENDED_SEEDLEN; + } + + p = genprime3mod4(secpar / 2, seed, seedlen, RND_GEN_P); + q = genprime3mod4(secpar / 2, seed, seedlen, RND_GEN_Q); + + if (msk) { + store_secpar(msk + 0, secpar); + mpi_export(msk + 2 + 0 * (secpar / 2) / 8, (secpar / 2) / 8, p); + mpi_export(msk + 2 + 1 * (secpar / 2) / 8, (secpar / 2) / 8, q); + } + + if (mpk) { + n = gcry_mpi_new(0); + gcry_mpi_mul(n, p, q); + assert(gcry_mpi_get_nbits(n) == secpar); + + store_secpar(mpk + 0, secpar); + mpi_export(mpk + 2, secpar / 8, n); + + gcry_mpi_release(n); + } + + gcry_mpi_release(p); + gcry_mpi_release(q); +} + +void FSPRG_GenState0(void *state, const void *mpk, const void *seed, size_t seedlen) { + gcry_mpi_t n, x; + uint16_t secpar; + + initialize_libgcrypt(false); + + secpar = read_secpar(mpk + 0); + n = mpi_import(mpk + 2, secpar / 8); + x = gensquare(n, seed, seedlen, RND_GEN_X, secpar); + + memcpy(state, mpk, 2 + secpar / 8); + mpi_export(state + 2 + 1 * secpar / 8, secpar / 8, x); + memzero(state + 2 + 2 * secpar / 8, 8); + + gcry_mpi_release(n); + gcry_mpi_release(x); +} + +void FSPRG_Evolve(void *state) { + gcry_mpi_t n, x; + uint16_t secpar; + uint64_t epoch; + + initialize_libgcrypt(false); + + secpar = read_secpar(state + 0); + n = mpi_import(state + 2 + 0 * secpar / 8, secpar / 8); + x = mpi_import(state + 2 + 1 * secpar / 8, secpar / 8); + epoch = uint64_import(state + 2 + 2 * secpar / 8, 8); + + gcry_mpi_mulm(x, x, x, n); + epoch++; + + mpi_export(state + 2 + 1 * secpar / 8, secpar / 8, x); + uint64_export(state + 2 + 2 * secpar / 8, 8, epoch); + + gcry_mpi_release(n); + gcry_mpi_release(x); +} + +uint64_t FSPRG_GetEpoch(const void *state) { + uint16_t secpar; + secpar = read_secpar(state + 0); + return uint64_import(state + 2 + 2 * secpar / 8, 8); +} + +void FSPRG_Seek(void *state, uint64_t epoch, const void *msk, const void *seed, size_t seedlen) { + gcry_mpi_t p, q, n, x, xp, xq, kp, kq, xm; + uint16_t secpar; + + 
initialize_libgcrypt(false); + + secpar = read_secpar(msk + 0); + p = mpi_import(msk + 2 + 0 * (secpar / 2) / 8, (secpar / 2) / 8); + q = mpi_import(msk + 2 + 1 * (secpar / 2) / 8, (secpar / 2) / 8); + + n = gcry_mpi_new(0); + gcry_mpi_mul(n, p, q); + + x = gensquare(n, seed, seedlen, RND_GEN_X, secpar); + CRT_decompose(&xp, &xq, x, p, q); /* split (mod n) into (mod p) and (mod q) using CRT */ + + kp = twopowmodphi(epoch, p); /* compute 2^epoch (mod phi(p)) */ + kq = twopowmodphi(epoch, q); /* compute 2^epoch (mod phi(q)) */ + + gcry_mpi_powm(xp, xp, kp, p); /* compute x^(2^epoch) (mod p) */ + gcry_mpi_powm(xq, xq, kq, q); /* compute x^(2^epoch) (mod q) */ + + CRT_compose(&xm, xp, xq, p, q); /* combine (mod p) and (mod q) to (mod n) using CRT */ + + store_secpar(state + 0, secpar); + mpi_export(state + 2 + 0 * secpar / 8, secpar / 8, n); + mpi_export(state + 2 + 1 * secpar / 8, secpar / 8, xm); + uint64_export(state + 2 + 2 * secpar / 8, 8, epoch); + + gcry_mpi_release(p); + gcry_mpi_release(q); + gcry_mpi_release(n); + gcry_mpi_release(x); + gcry_mpi_release(xp); + gcry_mpi_release(xq); + gcry_mpi_release(kp); + gcry_mpi_release(kq); + gcry_mpi_release(xm); +} + +void FSPRG_GetKey(const void *state, void *key, size_t keylen, uint32_t idx) { + uint16_t secpar; + + initialize_libgcrypt(false); + + secpar = read_secpar(state + 0); + det_randomize(key, keylen, state + 2, 2 * secpar / 8 + 8, idx); +} diff --git a/src/libsystemd/sd-journal/fsprg.h b/src/libsystemd/sd-journal/fsprg.h new file mode 100644 index 0000000..d3d88aa --- /dev/null +++ b/src/libsystemd/sd-journal/fsprg.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* + * fsprg v0.1 - (seekable) forward-secure pseudorandom generator + * Copyright © 2012 B. 
Poettering + * Contact: fsprg@point-at-infinity.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include + +#include "macro.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FSPRG_RECOMMENDED_SECPAR 1536 +#define FSPRG_RECOMMENDED_SEEDLEN (96/8) + +size_t FSPRG_mskinbytes(unsigned secpar) _const_; +size_t FSPRG_mpkinbytes(unsigned secpar) _const_; +size_t FSPRG_stateinbytes(unsigned secpar) _const_; + +/* Setup msk and mpk. Providing seed != NULL makes this algorithm deterministic. */ +void FSPRG_GenMK(void *msk, void *mpk, const void *seed, size_t seedlen, unsigned secpar); + +/* Initialize state deterministically in dependence on seed. */ +/* Note: in case one wants to run only one GenState0 per GenMK it is safe to use + the same seed for both GenMK and GenState0. +*/ +void FSPRG_GenState0(void *state, const void *mpk, const void *seed, size_t seedlen); + +void FSPRG_Evolve(void *state); + +uint64_t FSPRG_GetEpoch(const void *state) _pure_; + +/* Seek to any arbitrary state (by providing msk together with seed from GenState0). 
*/ +void FSPRG_Seek(void *state, uint64_t epoch, const void *msk, const void *seed, size_t seedlen); + +void FSPRG_GetKey(const void *state, void *key, size_t keylen, uint32_t idx); + +#ifdef __cplusplus +} +#endif diff --git a/src/libsystemd/sd-journal/generate-audit_type-list.sh b/src/libsystemd/sd-journal/generate-audit_type-list.sh new file mode 100755 index 0000000..3851ea1 --- /dev/null +++ b/src/libsystemd/sd-journal/generate-audit_type-list.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu +set -o pipefail + +cpp="${1:?}" +shift + +includes=() +for i in "$@"; do + includes+=(-include "$i") +done + +$cpp -dM "${includes[@]}" - +#include + +#include "fd-util.h" +#include "fsprg.h" +#include "gcrypt-util.h" +#include "hexdecoct.h" +#include "journal-authenticate.h" +#include "journal-def.h" +#include "journal-file.h" +#include "memory-util.h" +#include "time-util.h" + +static void* fssheader_free(FSSHeader *p) { + /* mmap() returns MAP_FAILED on error and sets the errno */ + if (!p || p == MAP_FAILED) + return NULL; + + assert_se(munmap(p, PAGE_ALIGN(sizeof(FSSHeader))) >= 0); + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(FSSHeader*, fssheader_free); + +static uint64_t journal_file_tag_seqnum(JournalFile *f) { + uint64_t r; + + assert(f); + + r = le64toh(f->header->n_tags) + 1; + f->header->n_tags = htole64(r); + + return r; +} + +int journal_file_append_tag(JournalFile *f) { + Object *o; + uint64_t p; + int r; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + if (!f->hmac_running) { + r = journal_file_hmac_start(f); + if (r < 0) + return r; + } + + assert(f->hmac); + + r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p); + if (r < 0) + return r; + + o->tag.seqnum = htole64(journal_file_tag_seqnum(f)); + o->tag.epoch = htole64(FSPRG_GetEpoch(f->fsprg_state)); + + log_debug("Writing tag %"PRIu64" for epoch %"PRIu64"", + le64toh(o->tag.seqnum), + 
FSPRG_GetEpoch(f->fsprg_state)); + + /* Add the tag object itself, so that we can protect its + * header. This will exclude the actual hash value in it */ + r = journal_file_hmac_put_object(f, OBJECT_TAG, o, p); + if (r < 0) + return r; + + /* Get the HMAC tag and store it in the object */ + memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH); + f->hmac_running = false; + + return 0; +} + +int journal_file_hmac_start(JournalFile *f) { + uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */ + gcry_error_t err; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + if (f->hmac_running) + return 0; + + /* Prepare HMAC for next cycle */ + gcry_md_reset(f->hmac); + FSPRG_GetKey(f->fsprg_state, key, sizeof(key), 0); + err = gcry_md_setkey(f->hmac, key, sizeof(key)); + if (gcry_err_code(err) != GPG_ERR_NO_ERROR) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "gcry_md_setkey() failed with error code: %s", + gcry_strerror(err)); + + f->hmac_running = true; + + return 0; +} + +static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) { + uint64_t t; + + assert(f); + assert(epoch); + assert(JOURNAL_HEADER_SEALED(f->header)); + + if (f->fss_start_usec == 0 || f->fss_interval_usec == 0) + return -EOPNOTSUPP; + + if (realtime < f->fss_start_usec) + return -ESTALE; + + t = realtime - f->fss_start_usec; + t = t / f->fss_interval_usec; + + *epoch = t; + + return 0; +} + +static int journal_file_fsprg_need_evolve(JournalFile *f, uint64_t realtime) { + uint64_t goal, epoch; + int r; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + r = journal_file_get_epoch(f, realtime, &goal); + if (r < 0) + return r; + + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (epoch > goal) + return -ESTALE; + + return epoch != goal; +} + +int journal_file_fsprg_evolve(JournalFile *f, uint64_t realtime) { + uint64_t goal, epoch; + int r; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + r = 
journal_file_get_epoch(f, realtime, &goal); + if (r < 0) + return r; + + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (epoch < goal) + log_debug("Evolving FSPRG key from epoch %"PRIu64" to %"PRIu64".", epoch, goal); + + for (;;) { + if (epoch > goal) + return -ESTALE; + if (epoch == goal) + return 0; + + FSPRG_Evolve(f->fsprg_state); + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (epoch < goal) { + r = journal_file_append_tag(f); + if (r < 0) + return r; + } + } +} + +int journal_file_fsprg_seek(JournalFile *f, uint64_t goal) { + void *msk; + uint64_t epoch; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + assert(f->fsprg_seed); + + if (f->fsprg_state) { + /* Cheaper... */ + + epoch = FSPRG_GetEpoch(f->fsprg_state); + if (goal == epoch) + return 0; + + if (goal == epoch + 1) { + FSPRG_Evolve(f->fsprg_state); + return 0; + } + } else { + f->fsprg_state_size = FSPRG_stateinbytes(FSPRG_RECOMMENDED_SECPAR); + f->fsprg_state = malloc(f->fsprg_state_size); + if (!f->fsprg_state) + return -ENOMEM; + } + + log_debug("Seeking FSPRG key to %"PRIu64".", goal); + + msk = alloca_safe(FSPRG_mskinbytes(FSPRG_RECOMMENDED_SECPAR)); + FSPRG_GenMK(msk, NULL, f->fsprg_seed, f->fsprg_seed_size, FSPRG_RECOMMENDED_SECPAR); + FSPRG_Seek(f->fsprg_state, goal, msk, f->fsprg_seed, f->fsprg_seed_size); + + return 0; +} + +int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) { + int r; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + if (realtime <= 0) + realtime = now(CLOCK_REALTIME); + + r = journal_file_fsprg_need_evolve(f, realtime); + if (r <= 0) + return 0; + + r = journal_file_append_tag(f); + if (r < 0) + return r; + + r = journal_file_fsprg_evolve(f, realtime); + if (r < 0) + return r; + + return 0; +} + +int journal_file_hmac_put_object(JournalFile *f, ObjectType type, Object *o, uint64_t p) { + int r; + + assert(f); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + r = journal_file_hmac_start(f); + if (r < 0) 
/* Feed the immutable parts of the journal file header into the running HMAC (f->hmac), starting a new
 * HMAC cycle first if none is running.
 *
 * Four byte ranges of the header are hashed, in this order, each running from the first named field up
 * to (but not including) the second:
 *
 *   signature              .. state
 *   file_id                .. tail_entry_boot_id
 *   seqnum_id              .. arena_size
 *   data_hash_table_offset .. tail_object_offset
 *
 * Returns 0 if the file is not sealed or on success, and the negative error from
 * journal_file_hmac_start() otherwise. */
int journal_file_hmac_put_header(JournalFile *f) {
        int r;

        assert(f);

        if (!JOURNAL_HEADER_SEALED(f->header))
                return 0;

        r = journal_file_hmac_start(f);
        if (r < 0)
                return r;

        /* All but state+reserved, boot_id, arena_size,
         * tail_object_offset, n_objects, n_entries,
         * tail_entry_seqnum, head_entry_seqnum, entry_array_offset,
         * head_entry_realtime, tail_entry_realtime,
         * tail_entry_monotonic, n_data, n_fields, n_tags,
         * n_entry_arrays. */

        /* The ranges are expressed via offsetof() differences, so they follow the Header layout. */
        gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature));
        gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, tail_entry_boot_id) - offsetof(Header, file_id));
        gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id));
        gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset));

        return 0;
}
-ENODATA; + + if (!sd_id128_equal(machine, header->machine_id)) + return -EHOSTDOWN; + + if (le64toh(header->start_usec) <= 0 || le64toh(header->interval_usec) <= 0) + return -EBADMSG; + + size_t sz = PAGE_ALIGN(f->fss_file_size); + assert(sz < SIZE_MAX); + f->fss_file = mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (f->fss_file == MAP_FAILED) { + f->fss_file = NULL; + return -errno; + } + + f->fss_start_usec = le64toh(f->fss_file->start_usec); + f->fss_interval_usec = le64toh(f->fss_file->interval_usec); + + f->fsprg_state = (uint8_t*) f->fss_file + le64toh(f->fss_file->header_size); + f->fsprg_state_size = le64toh(f->fss_file->fsprg_state_size); + + return 0; +} + +int journal_file_hmac_setup(JournalFile *f) { + gcry_error_t e; + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + initialize_libgcrypt(true); + + e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC); + if (e != 0) + return -EOPNOTSUPP; + + return 0; +} + +int journal_file_append_first_tag(JournalFile *f) { + uint64_t p; + int r; + + if (!JOURNAL_HEADER_SEALED(f->header)) + return 0; + + log_debug("Calculating first tag..."); + + r = journal_file_hmac_put_header(f); + if (r < 0) + return r; + + p = le64toh(f->header->field_hash_table_offset); + if (p < offsetof(Object, hash_table.items)) + return -EINVAL; + p -= offsetof(Object, hash_table.items); + + r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, NULL, p); + if (r < 0) + return r; + + p = le64toh(f->header->data_hash_table_offset); + if (p < offsetof(Object, hash_table.items)) + return -EINVAL; + p -= offsetof(Object, hash_table.items); + + r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, NULL, p); + if (r < 0) + return r; + + r = journal_file_append_tag(f); + if (r < 0) + return r; + + return 0; +} + +int journal_file_parse_verification_key(JournalFile *f, const char *key) { + _cleanup_free_ uint8_t *seed = NULL; + size_t seed_size; + const char *k; + unsigned long long start, interval; + int 
r; + + assert(f); + assert(key); + + seed_size = FSPRG_RECOMMENDED_SEEDLEN; + seed = malloc(seed_size); + if (!seed) + return -ENOMEM; + + k = key; + for (size_t c = 0; c < seed_size; c++) { + int x, y; + + k = skip_leading_chars(k, "-"); + + x = unhexchar(*k); + if (x < 0) + return -EINVAL; + k++; + + y = unhexchar(*k); + if (y < 0) + return -EINVAL; + k++; + + seed[c] = (uint8_t) (x * 16 + y); + } + + if (*k != '/') + return -EINVAL; + k++; + + r = sscanf(k, "%llx-%llx", &start, &interval); + if (r != 2) + return -EINVAL; + + f->fsprg_seed = TAKE_PTR(seed); + f->fsprg_seed_size = seed_size; + + f->fss_start_usec = start * interval; + f->fss_interval_usec = interval; + + return 0; +} + +bool journal_file_next_evolve_usec(JournalFile *f, usec_t *u) { + uint64_t epoch; + + assert(f); + assert(u); + + if (!JOURNAL_HEADER_SEALED(f->header)) + return false; + + epoch = FSPRG_GetEpoch(f->fsprg_state); + + *u = (usec_t) (f->fss_start_usec + f->fss_interval_usec * epoch + f->fss_interval_usec); + + return true; +} diff --git a/src/libsystemd/sd-journal/journal-authenticate.h b/src/libsystemd/sd-journal/journal-authenticate.h new file mode 100644 index 0000000..e895722 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-authenticate.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "journal-file.h" + +int journal_file_append_tag(JournalFile *f); +int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime); +int journal_file_append_first_tag(JournalFile *f); + +int journal_file_hmac_setup(JournalFile *f); +int journal_file_hmac_start(JournalFile *f); +int journal_file_hmac_put_header(JournalFile *f); +int journal_file_hmac_put_object(JournalFile *f, ObjectType type, Object *o, uint64_t p); + +int journal_file_fss_load(JournalFile *f); +int journal_file_parse_verification_key(JournalFile *f, const char *key); + +int journal_file_fsprg_evolve(JournalFile *f, uint64_t realtime); +int 
/* Common header of every journal object: it is the first member of each of the object structs below
 * and of union Object. The struct is packed and its multi-byte field is stored little-endian. */
struct ObjectHeader {
        uint8_t type;           /* object type, compared against ObjectType values by readers */
        uint8_t flags;          /* presumably a mask of the OBJECT_COMPRESSED_* bits above — TODO confirm */
        uint8_t reserved[6];
        le64_t size;            /* NOTE(review): looks like the size of the whole object including this
                                 * header (payload lengths are derived by subtracting a payload offset
                                 * from it) — confirm against the file format documentation */
        uint8_t payload[];      /* where the type-specific part begins; offsetof(ObjectHeader, payload)
                                 * is thus the length of the common header */
} _packed_;
first array entry we store inline */ \ + le64_t entry_array_offset; \ + le64_t n_entries; \ + union { \ + struct { \ + uint8_t payload[0]; \ + } regular; \ + struct { \ + le32_t tail_entry_array_offset; \ + le32_t tail_entry_array_n_entries; \ + uint8_t payload[0]; \ + } compact; \ + }; \ +} + +struct DataObject DataObject__contents; +struct DataObject__packed DataObject__contents _packed_; +assert_cc(sizeof(struct DataObject) == sizeof(struct DataObject__packed)); + +#define FieldObject__contents { \ + ObjectHeader object; \ + le64_t hash; \ + le64_t next_hash_offset; \ + le64_t head_data_offset; \ + uint8_t payload[]; \ +} + +struct FieldObject FieldObject__contents; +struct FieldObject__packed FieldObject__contents _packed_; +assert_cc(sizeof(struct FieldObject) == sizeof(struct FieldObject__packed)); + +#define EntryObject__contents { \ + ObjectHeader object; \ + le64_t seqnum; \ + le64_t realtime; \ + le64_t monotonic; \ + sd_id128_t boot_id; \ + le64_t xor_hash; \ + union { \ + struct { \ + dummy_t __empty__regular; \ + struct { \ + le64_t object_offset; \ + le64_t hash; \ + } regular[]; \ + }; \ + struct { \ + dummy_t __empty_compact; \ + struct { \ + le32_t object_offset; \ + } compact[]; \ + }; \ + } items; \ +} + +struct EntryObject EntryObject__contents; +struct EntryObject__packed EntryObject__contents _packed_; +assert_cc(sizeof(struct EntryObject) == sizeof(struct EntryObject__packed)); + +struct HashItem { + le64_t head_hash_offset; + le64_t tail_hash_offset; +} _packed_; + +struct HashTableObject { + ObjectHeader object; + HashItem items[]; +} _packed_; + +struct EntryArrayObject { + ObjectHeader object; + le64_t next_entry_array_offset; + union { + DECLARE_FLEX_ARRAY(le64_t, regular); + DECLARE_FLEX_ARRAY(le32_t, compact); + } items; +} _packed_; + +#define TAG_LENGTH (256/8) + +struct TagObject { + ObjectHeader object; + le64_t seqnum; + le64_t epoch; + uint8_t tag[TAG_LENGTH]; /* SHA-256 HMAC */ +} _packed_; + +union Object { + ObjectHeader 
object; + DataObject data; + FieldObject field; + EntryObject entry; + HashTableObject hash_table; + EntryArrayObject entry_array; + TagObject tag; +}; + +enum { + STATE_OFFLINE = 0, + STATE_ONLINE = 1, + STATE_ARCHIVED = 2, + _STATE_MAX +}; + +/* Header flags */ +enum { + HEADER_INCOMPATIBLE_COMPRESSED_XZ = 1 << 0, + HEADER_INCOMPATIBLE_COMPRESSED_LZ4 = 1 << 1, + HEADER_INCOMPATIBLE_KEYED_HASH = 1 << 2, + HEADER_INCOMPATIBLE_COMPRESSED_ZSTD = 1 << 3, + HEADER_INCOMPATIBLE_COMPACT = 1 << 4, + + HEADER_INCOMPATIBLE_ANY = HEADER_INCOMPATIBLE_COMPRESSED_XZ | + HEADER_INCOMPATIBLE_COMPRESSED_LZ4 | + HEADER_INCOMPATIBLE_KEYED_HASH | + HEADER_INCOMPATIBLE_COMPRESSED_ZSTD | + HEADER_INCOMPATIBLE_COMPACT, + + HEADER_INCOMPATIBLE_SUPPORTED = (HAVE_XZ ? HEADER_INCOMPATIBLE_COMPRESSED_XZ : 0) | + (HAVE_LZ4 ? HEADER_INCOMPATIBLE_COMPRESSED_LZ4 : 0) | + (HAVE_ZSTD ? HEADER_INCOMPATIBLE_COMPRESSED_ZSTD : 0) | + HEADER_INCOMPATIBLE_KEYED_HASH | + HEADER_INCOMPATIBLE_COMPACT, +}; + + +enum { + HEADER_COMPATIBLE_SEALED = 1 << 0, + HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID = 1 << 1, /* if set, the tail_entry_boot_id field in the header is exclusively refreshed when an entry is appended */ + HEADER_COMPATIBLE_SEALED_CONTINUOUS = 1 << 2, + HEADER_COMPATIBLE_ANY = HEADER_COMPATIBLE_SEALED | + HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID | + HEADER_COMPATIBLE_SEALED_CONTINUOUS, + + HEADER_COMPATIBLE_SUPPORTED = (HAVE_GCRYPT ? 
HEADER_COMPATIBLE_SEALED | HEADER_COMPATIBLE_SEALED_CONTINUOUS : 0) | + HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID, +}; + + +#define HEADER_SIGNATURE \ + ((const uint8_t[]) { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' }) + +#define struct_Header__contents { \ + uint8_t signature[8]; /* "LPKSHHRH" */ \ + le32_t compatible_flags; \ + le32_t incompatible_flags; \ + uint8_t state; \ + uint8_t reserved[7]; \ + sd_id128_t file_id; \ + sd_id128_t machine_id; \ + sd_id128_t tail_entry_boot_id; \ + sd_id128_t seqnum_id; \ + le64_t header_size; \ + le64_t arena_size; \ + le64_t data_hash_table_offset; \ + le64_t data_hash_table_size; \ + le64_t field_hash_table_offset; \ + le64_t field_hash_table_size; \ + le64_t tail_object_offset; \ + le64_t n_objects; \ + le64_t n_entries; \ + le64_t tail_entry_seqnum; \ + le64_t head_entry_seqnum; \ + le64_t entry_array_offset; \ + le64_t head_entry_realtime; \ + le64_t tail_entry_realtime; \ + le64_t tail_entry_monotonic; \ + /* Added in 187 */ \ + le64_t n_data; \ + le64_t n_fields; \ + /* Added in 189 */ \ + le64_t n_tags; \ + le64_t n_entry_arrays; \ + /* Added in 246 */ \ + le64_t data_hash_chain_depth; \ + le64_t field_hash_chain_depth; \ + /* Added in 252 */ \ + le32_t tail_entry_array_offset; \ + le32_t tail_entry_array_n_entries; \ + /* Added in 254 */ \ + le64_t tail_entry_offset; \ + } + +struct Header struct_Header__contents; +struct Header__packed struct_Header__contents _packed_; +assert_cc(sizeof(struct Header) == sizeof(struct Header__packed)); +assert_cc(sizeof(struct Header) == 272); + +#define FSS_HEADER_SIGNATURE \ + ((const char[]) { 'K', 'S', 'H', 'H', 'R', 'H', 'L', 'P' }) + +struct FSSHeader { + uint8_t signature[8]; /* "KSHHRHLP" */ + le32_t compatible_flags; + le32_t incompatible_flags; + sd_id128_t machine_id; + sd_id128_t boot_id; /* last writer */ + le64_t header_size; + le64_t start_usec; + le64_t interval_usec; + le16_t fsprg_secpar; + le16_t reserved[3]; + le64_t fsprg_state_size; +} _packed_; diff --git 
a/src/libsystemd/sd-journal/journal-file.c b/src/libsystemd/sd-journal/journal-file.c new file mode 100644 index 0000000..d2493a0 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-file.c @@ -0,0 +1,4696 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-event.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "compress.h" +#include "env-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "id128-util.h" +#include "journal-authenticate.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "lookup3.h" +#include "memory-util.h" +#include "missing_threads.h" +#include "path-util.h" +#include "prioq.h" +#include "random-util.h" +#include "set.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "sync-util.h" +#include "user-util.h" +#include "xattr-util.h" + +#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) +#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) + +#define DEFAULT_COMPRESS_THRESHOLD (512ULL) +#define MIN_COMPRESS_THRESHOLD (8ULL) + +#define U64_KB UINT64_C(1024) +#define U64_MB (UINT64_C(1024) * U64_KB) +#define U64_GB (UINT64_C(1024) * U64_MB) + +/* This is the minimum journal file size */ +#define JOURNAL_FILE_SIZE_MIN (512 * U64_KB) /* 512 KiB */ +#define JOURNAL_COMPACT_SIZE_MAX ((uint64_t) UINT32_MAX) /* 4 GiB */ + +/* These are the lower and upper bounds if we deduce the max_use value from the file system size */ +#define MAX_USE_LOWER (1 * U64_MB) /* 1 MiB */ +#define MAX_USE_UPPER (4 * U64_GB) /* 4 GiB */ + +/* Those are the lower and upper bounds for the minimal use limit, + * i.e. how much we'll use even if keep_free suggests otherwise. 
*/ +#define MIN_USE_LOW (1 * U64_MB) /* 1 MiB */ +#define MIN_USE_HIGH (16 * U64_MB) /* 16 MiB */ + +/* This is the upper bound if we deduce max_size from max_use */ +#define MAX_SIZE_UPPER (128 * U64_MB) /* 128 MiB */ + +/* This is the upper bound if we deduce the keep_free value from the file system size */ +#define KEEP_FREE_UPPER (4 * U64_GB) /* 4 GiB */ + +/* This is the keep_free value when we can't determine the system size */ +#define DEFAULT_KEEP_FREE (1 * U64_MB) /* 1 MB */ + +/* This is the default maximum number of journal files to keep around. */ +#define DEFAULT_N_MAX_FILES 100 + +/* n_data was the first entry we added after the initial file format design */ +#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data)) + +/* How many entries to keep in the entry array chain cache at max */ +#define CHAIN_CACHE_MAX 20 + +/* How much to increase the journal file size at once each time we allocate something new. */ +#define FILE_SIZE_INCREASE (8 * U64_MB) /* 8MB */ + +/* Reread fstat() of the file for detecting deletions at least this often */ +#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC) + +/* Longest hash chain to rotate after */ +#define HASH_CHAIN_DEPTH_MAX 100 + +#ifdef __clang__ +# pragma GCC diagnostic ignored "-Waddress-of-packed-member" +#endif + +static int mmap_prot_from_open_flags(int flags) { + switch (flags & O_ACCMODE) { + case O_RDONLY: + return PROT_READ; + case O_WRONLY: + return PROT_WRITE; + case O_RDWR: + return PROT_READ|PROT_WRITE; + default: + assert_not_reached(); + } +} + +int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset) { + uint64_t p; + int r; + + assert(f); + assert(f->header); + assert(ret_offset); + + /* Same as journal_file_tail_end_by_mmap() below, but operates with pread() to avoid the mmap cache + * (and thus is thread safe) */ + + p = le64toh(f->header->tail_object_offset); + if (p == 0) + p = le64toh(f->header->header_size); + else { + Object tail; + uint64_t sz; + + r = 
journal_file_read_object_header(f, OBJECT_UNUSED, p, &tail); + if (r < 0) + return r; + + sz = le64toh(tail.object.size); + if (sz > UINT64_MAX - sizeof(uint64_t) + 1) + return -EBADMSG; + + sz = ALIGN64(sz); + if (p > UINT64_MAX - sz) + return -EBADMSG; + + p += sz; + } + + *ret_offset = p; + + return 0; +} + +int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset) { + uint64_t p; + int r; + + assert(f); + assert(f->header); + assert(ret_offset); + + /* Same as journal_file_tail_end_by_pread() above, but operates with the usual mmap logic */ + + p = le64toh(f->header->tail_object_offset); + if (p == 0) + p = le64toh(f->header->header_size); + else { + Object *tail; + uint64_t sz; + + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail); + if (r < 0) + return r; + + sz = le64toh(READ_NOW(tail->object.size)); + if (sz > UINT64_MAX - sizeof(uint64_t) + 1) + return -EBADMSG; + + sz = ALIGN64(sz); + if (p > UINT64_MAX - sz) + return -EBADMSG; + + p += sz; + } + + *ret_offset = p; + + return 0; +} + +int journal_file_set_offline_thread_join(JournalFile *f) { + int r; + + assert(f); + + if (f->offline_state == OFFLINE_JOINED) + return 0; + + r = pthread_join(f->offline_thread, NULL); + if (r) + return -r; + + f->offline_state = OFFLINE_JOINED; + + if (mmap_cache_fd_got_sigbus(f->cache_fd)) + return -EIO; + + return 0; +} + +static int journal_file_set_online(JournalFile *f) { + bool wait = true; + + assert(f); + + if (!journal_file_writable(f)) + return -EPERM; + + if (f->fd < 0 || !f->header) + return -EINVAL; + + while (wait) { + switch (f->offline_state) { + case OFFLINE_JOINED: + /* No offline thread, no need to wait. */ + wait = false; + break; + + case OFFLINE_SYNCING: { + OfflineState tmp_state = OFFLINE_SYNCING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_CANCEL, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + /* Canceled syncing prior to offlining, no need to wait. 
*/ + wait = false; + break; + + case OFFLINE_AGAIN_FROM_SYNCING: { + OfflineState tmp_state = OFFLINE_AGAIN_FROM_SYNCING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_CANCEL, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + /* Canceled restart from syncing, no need to wait. */ + wait = false; + break; + + case OFFLINE_AGAIN_FROM_OFFLINING: { + OfflineState tmp_state = OFFLINE_AGAIN_FROM_OFFLINING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_CANCEL, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + /* Canceled restart from offlining, must wait for offlining to complete however. */ + _fallthrough_; + default: { + int r; + + r = journal_file_set_offline_thread_join(f); + if (r < 0) + return r; + + wait = false; + break; + } + } + } + + if (mmap_cache_fd_got_sigbus(f->cache_fd)) + return -EIO; + + switch (f->header->state) { + case STATE_ONLINE: + return 0; + + case STATE_OFFLINE: + f->header->state = STATE_ONLINE; + (void) fsync(f->fd); + return 0; + + default: + return -EINVAL; + } +} + +JournalFile* journal_file_close(JournalFile *f) { + if (!f) + return NULL; + + assert(f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL); + + if (f->cache_fd) + mmap_cache_fd_free(f->cache_fd); + + if (f->close_fd) + safe_close(f->fd); + free(f->path); + + ordered_hashmap_free_free(f->chain_cache); + +#if HAVE_COMPRESSION + free(f->compress_buffer); +#endif + +#if HAVE_GCRYPT + if (f->fss_file) { + size_t sz = PAGE_ALIGN(f->fss_file_size); + assert(sz < SIZE_MAX); + munmap(f->fss_file, sz); + } else + free(f->fsprg_state); + + free(f->fsprg_seed); + + if (f->hmac) + gcry_md_close(f->hmac); +#endif + + return mfree(f); +} + +static bool keyed_hash_requested(void) { + static thread_local int cached = -1; + int r; + + if (cached < 0) { + r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH"); + if (r < 0) { + if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, 
ignoring: %m"); + cached = true; + } else + cached = r; + } + + return cached; +} + +static bool compact_mode_requested(void) { + static thread_local int cached = -1; + int r; + + if (cached < 0) { + r = getenv_bool("SYSTEMD_JOURNAL_COMPACT"); + if (r < 0) { + if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_COMPACT environment variable, ignoring: %m"); + cached = true; + } else + cached = r; + } + + return cached; +} + +#if HAVE_COMPRESSION +static Compression getenv_compression(void) { + Compression c; + const char *e; + int r; + + e = getenv("SYSTEMD_JOURNAL_COMPRESS"); + if (!e) + return DEFAULT_COMPRESSION; + + r = parse_boolean(e); + if (r >= 0) + return r ? DEFAULT_COMPRESSION : COMPRESSION_NONE; + + c = compression_from_string(e); + if (c < 0) { + log_debug_errno(c, "Failed to parse SYSTEMD_JOURNAL_COMPRESS value, ignoring: %s", e); + return DEFAULT_COMPRESSION; + } + + if (!compression_supported(c)) { + log_debug("Unsupported compression algorithm specified, ignoring: %s", e); + return DEFAULT_COMPRESSION; + } + + return c; +} +#endif + +static Compression compression_requested(void) { +#if HAVE_COMPRESSION + static thread_local Compression cached = _COMPRESSION_INVALID; + + if (cached < 0) + cached = getenv_compression(); + + return cached; +#else + return COMPRESSION_NONE; +#endif +} + +static int journal_file_init_header( + JournalFile *f, + JournalFileFlags file_flags, + JournalFile *template) { + + bool seal = false; + ssize_t k; + int r; + + assert(f); + +#if HAVE_GCRYPT + /* Try to load the FSPRG state, and if we can't, then just don't do sealing */ + seal = FLAGS_SET(file_flags, JOURNAL_SEAL) && journal_file_fss_load(f) >= 0; +#endif + + Header h = { + .header_size = htole64(ALIGN64(sizeof(h))), + .incompatible_flags = htole32( + FLAGS_SET(file_flags, JOURNAL_COMPRESS) * COMPRESSION_TO_HEADER_INCOMPATIBLE_FLAG(compression_requested()) | + keyed_hash_requested() * HEADER_INCOMPATIBLE_KEYED_HASH | + compact_mode_requested() * 
HEADER_INCOMPATIBLE_COMPACT), + .compatible_flags = htole32( + (seal * (HEADER_COMPATIBLE_SEALED | HEADER_COMPATIBLE_SEALED_CONTINUOUS) ) | + HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID), + }; + + assert_cc(sizeof(h.signature) == sizeof(HEADER_SIGNATURE)); + memcpy(h.signature, HEADER_SIGNATURE, sizeof(HEADER_SIGNATURE)); + + r = sd_id128_randomize(&h.file_id); + if (r < 0) + return r; + + r = sd_id128_get_machine(&h.machine_id); + if (r < 0 && !ERRNO_IS_MACHINE_ID_UNSET(r)) + return r; /* If we have no valid machine ID (test environment?), let's simply leave the + * machine ID field all zeroes. */ + + if (template) { + h.seqnum_id = template->header->seqnum_id; + h.tail_entry_seqnum = template->header->tail_entry_seqnum; + } else + h.seqnum_id = h.file_id; + + k = pwrite(f->fd, &h, sizeof(h), 0); + if (k < 0) + return -errno; + if (k != sizeof(h)) + return -EIO; + + return 0; +} + +static int journal_file_refresh_header(JournalFile *f) { + int r; + + assert(f); + assert(f->header); + + /* We used to update the header's boot ID field here, but we don't do that anymore, as per + * HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID */ + + r = journal_file_set_online(f); + + /* Sync the online state to disk; likely just created a new file, also sync the directory this file + * is located in. */ + (void) fsync_full(f->fd); + + return r; +} + +static bool warn_wrong_flags(const JournalFile *f, bool compatible) { + const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY, + supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED; + const char *type = compatible ? "compatible" : "incompatible"; + uint32_t flags; + + assert(f); + assert(f->header); + + flags = le32toh(compatible ? 
f->header->compatible_flags : f->header->incompatible_flags); + + if (flags & ~supported) { + if (flags & ~any) + log_debug("Journal file %s has unknown %s flags 0x%"PRIx32, + f->path, type, flags & ~any); + flags = (flags & any) & ~supported; + if (flags) { + const char* strv[6]; + size_t n = 0; + _cleanup_free_ char *t = NULL; + + if (compatible) { + if (flags & HEADER_COMPATIBLE_SEALED) + strv[n++] = "sealed"; + if (flags & HEADER_COMPATIBLE_SEALED_CONTINUOUS) + strv[n++] = "sealed-continuous"; + } else { + if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ) + strv[n++] = "xz-compressed"; + if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4) + strv[n++] = "lz4-compressed"; + if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD) + strv[n++] = "zstd-compressed"; + if (flags & HEADER_INCOMPATIBLE_KEYED_HASH) + strv[n++] = "keyed-hash"; + if (flags & HEADER_INCOMPATIBLE_COMPACT) + strv[n++] = "compact"; + } + strv[n] = NULL; + assert(n < ELEMENTSOF(strv)); + + t = strv_join((char**) strv, ", "); + log_debug("Journal file %s uses %s %s %s disabled at compilation time.", + f->path, type, n > 1 ? 
"flags" : "flag", strnull(t)); + } + return true; + } + + return false; +} + +static bool offset_is_valid(uint64_t offset, uint64_t header_size, uint64_t tail_object_offset) { + if (offset == 0) + return true; + if (!VALID64(offset)) + return false; + if (offset < header_size) + return false; + if (offset > tail_object_offset) + return false; + return true; +} + +static bool hash_table_is_valid(uint64_t offset, uint64_t size, uint64_t header_size, uint64_t arena_size, uint64_t tail_object_offset) { + if ((offset == 0) != (size == 0)) + return false; + if (offset == 0) + return true; + if (offset <= offsetof(Object, hash_table.items)) + return false; + offset -= offsetof(Object, hash_table.items); + if (!offset_is_valid(offset, header_size, tail_object_offset)) + return false; + assert(offset <= header_size + arena_size); + if (size > header_size + arena_size - offset) + return false; + return true; +} + +static int journal_file_verify_header(JournalFile *f) { + uint64_t arena_size, header_size; + + assert(f); + assert(f->header); + + if (memcmp(f->header->signature, HEADER_SIGNATURE, 8)) + return -EBADMSG; + + /* In both read and write mode we refuse to open files with incompatible + * flags we don't know. */ + if (warn_wrong_flags(f, false)) + return -EPROTONOSUPPORT; + + /* When open for writing we refuse to open files with compatible flags, too. */ + if (journal_file_writable(f) && warn_wrong_flags(f, true)) + return -EPROTONOSUPPORT; + + if (f->header->state >= _STATE_MAX) + return -EBADMSG; + + header_size = le64toh(READ_NOW(f->header->header_size)); + + /* The first addition was n_data, so check that we are at least this large */ + if (header_size < HEADER_SIZE_MIN) + return -EBADMSG; + + /* When open for writing we refuse to open files with a mismatch of the header size, i.e. writing to + * files implementing older or new header structures. 
*/ + if (journal_file_writable(f) && header_size != sizeof(Header)) + return -EPROTONOSUPPORT; + + /* Don't write to journal files without the new boot ID update behavior guarantee. */ + if (journal_file_writable(f) && !JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header)) + return -EPROTONOSUPPORT; + + if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) + return -EBADMSG; + + arena_size = le64toh(READ_NOW(f->header->arena_size)); + + if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size) + return -ENODATA; + + uint64_t tail_object_offset = le64toh(f->header->tail_object_offset); + if (!offset_is_valid(tail_object_offset, header_size, UINT64_MAX)) + return -ENODATA; + if (header_size + arena_size < tail_object_offset) + return -ENODATA; + if (header_size + arena_size - tail_object_offset < sizeof(ObjectHeader)) + return -ENODATA; + + if (!hash_table_is_valid(le64toh(f->header->data_hash_table_offset), + le64toh(f->header->data_hash_table_size), + header_size, arena_size, tail_object_offset)) + return -ENODATA; + + if (!hash_table_is_valid(le64toh(f->header->field_hash_table_offset), + le64toh(f->header->field_hash_table_size), + header_size, arena_size, tail_object_offset)) + return -ENODATA; + + uint64_t entry_array_offset = le64toh(f->header->entry_array_offset); + if (!offset_is_valid(entry_array_offset, header_size, tail_object_offset)) + return -ENODATA; + + if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_offset)) { + uint32_t offset = le32toh(f->header->tail_entry_array_offset); + uint32_t n = le32toh(f->header->tail_entry_array_n_entries); + + if (!offset_is_valid(offset, header_size, tail_object_offset)) + return -ENODATA; + if (entry_array_offset > offset) + return -ENODATA; + if (entry_array_offset == 0 && offset != 0) + return -ENODATA; + if ((offset == 0) != (n == 0)) + return -ENODATA; + assert(offset <= header_size + arena_size); + if ((uint64_t) n * 
journal_file_entry_array_item_size(f) > header_size + arena_size - offset) + return -ENODATA; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_offset)) { + uint64_t offset = le64toh(f->header->tail_entry_offset); + + if (!offset_is_valid(offset, header_size, tail_object_offset)) + return -ENODATA; + + if (offset > 0) { + /* When there is an entry object, then the tail boot ID, the head/tail realtime timestamps and the tail monotonic timestamp must all be filled. */ + if (sd_id128_is_null(f->header->tail_entry_boot_id)) + return -ENODATA; + if (!VALID_REALTIME(le64toh(f->header->head_entry_realtime))) + return -ENODATA; + if (!VALID_REALTIME(le64toh(f->header->tail_entry_realtime))) + return -ENODATA; + if (!VALID_MONOTONIC(le64toh(f->header->tail_entry_monotonic))) + return -ENODATA; + } else { + /* Otherwise, the same fields must be zero; the boot ID is only checked when JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID() holds. */ + if (JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) && + !sd_id128_is_null(f->header->tail_entry_boot_id)) + return -ENODATA; + if (f->header->head_entry_realtime != 0) + return -ENODATA; + if (f->header->tail_entry_realtime != 0) + return -ENODATA; + if (f->header->tail_entry_monotonic != 0) + return -ENODATA; + } + } + + /* Verify number of objects */ + uint64_t n_objects = le64toh(f->header->n_objects); + if (n_objects > arena_size / sizeof(ObjectHeader)) + return -ENODATA; + + uint64_t n_entries = le64toh(f->header->n_entries); + if (n_entries > n_objects) + return -ENODATA; + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && + le64toh(f->header->n_data) > n_objects) + return -ENODATA; + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields) && + le64toh(f->header->n_fields) > n_objects) + return -ENODATA; + + if (JOURNAL_HEADER_CONTAINS(f->header, n_tags) && + le64toh(f->header->n_tags) > n_objects) + return -ENODATA; + + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays) && + le64toh(f->header->n_entry_arrays) > n_objects) + return -ENODATA; + + if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_n_entries) && + le32toh(f->header->tail_entry_array_n_entries) > n_entries) + return 
-ENODATA; + + if (journal_file_writable(f)) { + sd_id128_t machine_id; + uint8_t state; + int r; + + r = sd_id128_get_machine(&machine_id); + if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) /* Gracefully handle the machine ID not being initialized yet */ + machine_id = SD_ID128_NULL; + else if (r < 0) + return r; + + if (!sd_id128_equal(machine_id, f->header->machine_id)) + return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "Trying to open journal file from different host for writing, refusing."); + + state = f->header->state; + + if (state == STATE_ARCHIVED) + return -ESHUTDOWN; /* Already archived */ + if (state == STATE_ONLINE) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), + "Journal file %s is already online. Assuming unclean closing.", + f->path); + if (state != STATE_OFFLINE) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), + "Journal file %s has unknown state %i.", + f->path, state); + + if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0) + return -EBADMSG; + } + + return 0; +} + +int journal_file_fstat(JournalFile *f) { + int r; + + assert(f); + assert(f->fd >= 0); + + if (fstat(f->fd, &f->last_stat) < 0) + return -errno; + + f->last_stat_usec = now(CLOCK_MONOTONIC); + + /* Refuse dealing with files that aren't regular */ + r = stat_verify_regular(&f->last_stat); + if (r < 0) + return r; + + /* Refuse appending to files that are already deleted */ + if (f->last_stat.st_nlink <= 0) + return -EIDRM; + + return 0; +} + +static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) { + uint64_t old_size, new_size, old_header_size, old_arena_size; + int r; + + assert(f); + assert(f->header); + + /* We assume that this file is not sparse, and we know that for sure, since we always call + * posix_fallocate() ourselves */ + + if (size > PAGE_ALIGN_DOWN_U64(UINT64_MAX) - offset) + return -EINVAL; + + if (mmap_cache_fd_got_sigbus(f->cache_fd)) + return -EIO; + + old_header_size = le64toh(READ_NOW(f->header->header_size)); 
+ old_arena_size = le64toh(READ_NOW(f->header->arena_size)); + if (old_arena_size > PAGE_ALIGN_DOWN_U64(UINT64_MAX) - old_header_size) + return -EBADMSG; + + old_size = old_header_size + old_arena_size; + + new_size = MAX(PAGE_ALIGN_U64(offset + size), old_header_size); + + if (new_size <= old_size) { + + /* We already pre-allocated enough space, but before + * we write to it, let's check with fstat() if the + * file got deleted, in order to make sure we don't throw + * away the data immediately. Don't check fstat() for + * all writes though, but only once every 5s (LAST_STAT_REFRESH_USEC). */ + + if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC)) + return 0; + + return journal_file_fstat(f); + } + + /* Allocate more space. */ + + if (f->metrics.max_size > 0 && new_size > f->metrics.max_size) + return -E2BIG; + + /* Refuse to go over 4G in compact mode so offsets can be stored in 32-bit. */ + if (JOURNAL_HEADER_COMPACT(f->header) && new_size > UINT32_MAX) + return -E2BIG; + + if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) { + struct statvfs svfs; + + if (fstatvfs(f->fd, &svfs) >= 0) { + uint64_t available; + + available = LESS_BY(u64_multiply_safe(svfs.f_bfree, svfs.f_bsize), f->metrics.keep_free); + + if (new_size - old_size > available) + return -E2BIG; + } + } + + /* Increase by larger blocks at once */ + new_size = ROUND_UP(new_size, FILE_SIZE_INCREASE); + if (f->metrics.max_size > 0 && new_size > f->metrics.max_size) + new_size = f->metrics.max_size; + + /* Note that the glibc fallocate() fallback is very + inefficient, hence we try to minimize the allocation area + as we can. 
*/ + r = posix_fallocate_loop(f->fd, old_size, new_size - old_size); + if (r < 0) + return r; + + f->header->arena_size = htole64(new_size - old_header_size); + + return journal_file_fstat(f); +} + +static int journal_file_move_to( + JournalFile *f, + ObjectType type, + bool keep_always, + uint64_t offset, + uint64_t size, + void **ret) { + + int r; + + assert(f); + assert(ret); + + /* This function may clear, overwrite, or alter previously cached entries with the same type. After + * this function has been called, all previously read objects with the same type may be invalidated, + * hence must be re-read before use. */ + + if (size <= 0) + return -EINVAL; + + if (size > UINT64_MAX - offset) + return -EBADMSG; + + /* Avoid SIGBUS on invalid accesses */ + if (offset + size > (uint64_t) f->last_stat.st_size) { + /* Hmm, out of range? Let's refresh the fstat() data + * first, before we trust that check. */ + + r = journal_file_fstat(f); + if (r < 0) + return r; + + if (offset + size > (uint64_t) f->last_stat.st_size) + return -EADDRNOTAVAIL; + } + + return mmap_cache_fd_get(f->cache_fd, type_to_category(type), keep_always, offset, size, &f->last_stat, ret); +} + +static uint64_t minimum_header_size(JournalFile *f, Object *o) { + + static const uint64_t table[] = { + [OBJECT_DATA] = sizeof(DataObject), + [OBJECT_FIELD] = sizeof(FieldObject), + [OBJECT_ENTRY] = sizeof(EntryObject), + [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject), + [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject), + [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject), + [OBJECT_TAG] = sizeof(TagObject), + }; + + assert(f); + assert(o); + + if (o->object.type == OBJECT_DATA) + return journal_file_data_payload_offset(f); + + if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0) + return sizeof(ObjectHeader); + + return table[o->object.type]; +} + +static int check_object_header(JournalFile *f, Object *o, ObjectType type, uint64_t offset) { + uint64_t s; + + assert(f); + assert(o); + 
+ s = le64toh(READ_NOW(o->object.size)); + if (s == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to uninitialized object: %" PRIu64, + offset); + + if (s < sizeof(ObjectHeader)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to overly short object with size %"PRIu64": %" PRIu64, + s, offset); + + if (o->object.type <= OBJECT_UNUSED || o->object.type >= _OBJECT_TYPE_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to object with invalid type (%u): %" PRIu64, + o->object.type, offset); + + if (type > OBJECT_UNUSED && o->object.type != type) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Found %s object while expecting %s object: %" PRIu64, + journal_object_type_to_string(o->object.type), + journal_object_type_to_string(type), + offset); + + if (s < minimum_header_size(f, o)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Size of %s object (%"PRIu64") is smaller than the minimum object size (%"PRIu64"): %" PRIu64, + journal_object_type_to_string(o->object.type), + s, + minimum_header_size(f, o), + offset); + + return 0; +} + +/* Lightweight object checks. We want this to be fast, so that we won't + * slowdown every journal_file_move_to_object() call too much. 
*/ +static int check_object(JournalFile *f, Object *o, uint64_t offset) { + assert(f); + assert(o); + + switch (o->object.type) { + + case OBJECT_DATA: + if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad data n_entries: %" PRIu64 ": %" PRIu64, + le64toh(o->data.n_entries), + offset); + + if (le64toh(o->object.size) <= journal_file_data_payload_offset(f)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad data size (<= %zu): %" PRIu64 ": %" PRIu64, + journal_file_data_payload_offset(f), + le64toh(o->object.size), + offset); + + if (!VALID64(le64toh(o->data.next_hash_offset)) || + !VALID64(le64toh(o->data.next_field_offset)) || + !VALID64(le64toh(o->data.entry_offset)) || + !VALID64(le64toh(o->data.entry_array_offset))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64, + le64toh(o->data.next_hash_offset), + le64toh(o->data.next_field_offset), + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset), + offset); + + break; + + case OBJECT_FIELD: + if (le64toh(o->object.size) <= offsetof(Object, field.payload)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64, + offsetof(Object, field.payload), + le64toh(o->object.size), + offset); + + if (!VALID64(le64toh(o->field.next_hash_offset)) || + !VALID64(le64toh(o->field.head_data_offset))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64, + le64toh(o->field.next_hash_offset), + le64toh(o->field.head_data_offset), + offset); + break; + + case OBJECT_ENTRY: { + uint64_t sz; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, entry.items) || + (sz - offsetof(Object, entry.items)) % journal_file_entry_item_size(f) != 0) + 
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64, + offsetof(Object, entry.items), + sz, + offset); + + if ((sz - offsetof(Object, entry.items)) / journal_file_entry_item_size(f) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid number items in entry: %" PRIu64 ": %" PRIu64, + (sz - offsetof(Object, entry.items)) / journal_file_entry_item_size(f), + offset); + + if (le64toh(o->entry.seqnum) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid entry seqnum: %" PRIx64 ": %" PRIu64, + le64toh(o->entry.seqnum), + offset); + + if (!VALID_REALTIME(le64toh(o->entry.realtime))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64, + le64toh(o->entry.realtime), + offset); + + if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64, + le64toh(o->entry.monotonic), + offset); + + if (sd_id128_is_null(o->entry.boot_id)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object entry with an empty boot ID: %" PRIu64, + offset); + + break; + } + + case OBJECT_DATA_HASH_TABLE: + case OBJECT_FIELD_HASH_TABLE: { + uint64_t sz; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, hash_table.items) || + (sz - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 || + (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid %s hash table size: %" PRIu64 ": %" PRIu64, + journal_object_type_to_string(o->object.type), + sz, + offset); + + break; + } + + case OBJECT_ENTRY_ARRAY: { + uint64_t sz, next; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, entry_array.items) || + (sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 || + (sz - offsetof(Object, entry_array.items)) / 
journal_file_entry_array_item_size(f) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object entry array size: %" PRIu64 ": %" PRIu64, + sz, + offset); + /* Here, we request that the offset of each entry array object is in strictly increasing order. */ + next = le64toh(o->entry_array.next_entry_array_offset); + if (!VALID64(next) || (next > 0 && next <= offset)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object entry array next_entry_array_offset: %" PRIu64 ": %" PRIu64, + next, + offset); + + break; + } + + case OBJECT_TAG: + if (le64toh(o->object.size) != sizeof(TagObject)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object tag size: %" PRIu64 ": %" PRIu64, + le64toh(o->object.size), + offset); + + if (!VALID_EPOCH(le64toh(o->tag.epoch))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid object tag epoch: %" PRIu64 ": %" PRIu64, + le64toh(o->tag.epoch), offset); + + break; + } + + return 0; +} + +int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) { + int r; + Object *o; + + assert(f); + + /* Even if this function fails, it may clear, overwrite, or alter previously cached entries with the + * same type. After this function has been called, all previously read objects with the same type may + * be invalidated, hence must be re-read before use. 
*/ + + /* Objects may only be located at multiple of 64 bit */ + if (!VALID64(offset)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to %s object at non-64-bit boundary: %" PRIu64, + journal_object_type_to_string(type), + offset); + + /* Object may not be located in the file header */ + if (offset < le64toh(f->header->header_size)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to move to %s object located in file header: %" PRIu64, + journal_object_type_to_string(type), + offset); + + r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), (void**) &o); + if (r < 0) + return r; + + r = check_object_header(f, o, type, offset); + if (r < 0) + return r; + + r = journal_file_move_to(f, type, false, offset, le64toh(READ_NOW(o->object.size)), (void**) &o); + if (r < 0) + return r; + + r = check_object_header(f, o, type, offset); + if (r < 0) + return r; + + r = check_object(f, o, offset); + if (r < 0) + return r; + + if (ret) + *ret = o; + + return 0; +} + +int journal_file_pin_object(JournalFile *f, Object *o) { + assert(f); + assert(o); + + /* This attaches the mmap window that provides the object to the 'pinning' category. So, reading + * another object with the same type will not invalidate the object, until this function is called + * for another object. 
*/ + return mmap_cache_fd_pin(f->cache_fd, type_to_category(o->object.type), o, le64toh(o->object.size)); +} + +int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) { + ssize_t n; + Object o; + int r; + + assert(f); + + /* Objects may only be located at multiple of 64 bit */ + if (!VALID64(offset)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to read %s object at non-64-bit boundary: %" PRIu64, + journal_object_type_to_string(type), offset); + + /* Object may not be located in the file header */ + if (offset < le64toh(f->header->header_size)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Attempt to read %s object located in file header: %" PRIu64, + journal_object_type_to_string(type), offset); + + /* This will likely read too much data but it avoids having to call pread() twice. */ + n = pread(f->fd, &o, sizeof(o), offset); + if (n < 0) + return log_debug_errno(errno, "Failed to read journal %s object at offset: %" PRIu64, + journal_object_type_to_string(type), offset); + + if ((size_t) n < sizeof(o.object)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Failed to read short %s object at offset: %" PRIu64, + journal_object_type_to_string(type), offset); + + r = check_object_header(f, &o, type, offset); + if (r < 0) + return r; + + if ((size_t) n < minimum_header_size(f, &o)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Short read while reading %s object: %" PRIu64, + journal_object_type_to_string(type), offset); + + r = check_object(f, &o, offset); + if (r < 0) + return r; + + if (ret) + *ret = o; + + return 0; +} + +static uint64_t inc_seqnum(uint64_t seqnum) { + if (seqnum < UINT64_MAX-1) + return seqnum + 1; + + return 1; /* skip over UINT64_MAX and 0 when we run out of seqnums and start again */ +} + +static uint64_t journal_file_entry_seqnum( + JournalFile *f, + uint64_t *seqnum) { + + uint64_t next_seqnum; + + assert(f); + assert(f->header); + + /* Picks a new sequence number for 
the entry we are about to add and returns it. */ + + next_seqnum = inc_seqnum(le64toh(f->header->tail_entry_seqnum)); + + /* If an external seqnum counter was passed, we update both the local and the external one, and set + * it to the maximum of both */ + if (seqnum) + *seqnum = next_seqnum = MAX(inc_seqnum(*seqnum), next_seqnum); + + f->header->tail_entry_seqnum = htole64(next_seqnum); + + if (f->header->head_entry_seqnum == 0) + f->header->head_entry_seqnum = htole64(next_seqnum); + + return next_seqnum; +} + +int journal_file_append_object( + JournalFile *f, + ObjectType type, + uint64_t size, + Object **ret_object, + uint64_t *ret_offset) { + + int r; + uint64_t p; + Object *o; + + assert(f); + assert(f->header); + assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX); + assert(size >= sizeof(ObjectHeader)); + + r = journal_file_set_online(f); + if (r < 0) + return r; + + r = journal_file_tail_end_by_mmap(f, &p); + if (r < 0) + return r; + + r = journal_file_allocate(f, p, size); + if (r < 0) + return r; + + r = journal_file_move_to(f, type, false, p, size, (void**) &o); + if (r < 0) + return r; + + o->object = (ObjectHeader) { + .type = type, + .size = htole64(size), + }; + + f->header->tail_object_offset = htole64(p); + f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1); + + if (ret_object) + *ret_object = o; + + if (ret_offset) + *ret_offset = p; + + return 0; +} + +static int journal_file_setup_data_hash_table(JournalFile *f) { + uint64_t s, p; + Object *o; + int r; + + assert(f); + assert(f->header); + + /* We estimate that we need 1 hash table entry per 768 bytes + of journal file and we want to make sure we never get + beyond 75% fill level. Calculate the hash table size for + the maximum file size based on these metrics. 
*/ + + s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem); + if (s < DEFAULT_DATA_HASH_TABLE_SIZE) + s = DEFAULT_DATA_HASH_TABLE_SIZE; + + log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem)); + + r = journal_file_append_object(f, + OBJECT_DATA_HASH_TABLE, + offsetof(Object, hash_table.items) + s, + &o, &p); + if (r < 0) + return r; + + memzero(o->hash_table.items, s); + + f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items)); + f->header->data_hash_table_size = htole64(s); + + return 0; +} + +static int journal_file_setup_field_hash_table(JournalFile *f) { + uint64_t s, p; + Object *o; + int r; + + assert(f); + assert(f->header); + + /* We use a fixed size hash table for the fields as this + * number should grow very slowly only */ + + s = DEFAULT_FIELD_HASH_TABLE_SIZE; + log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem)); + + r = journal_file_append_object(f, + OBJECT_FIELD_HASH_TABLE, + offsetof(Object, hash_table.items) + s, + &o, &p); + if (r < 0) + return r; + + memzero(o->hash_table.items, s); + + f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items)); + f->header->field_hash_table_size = htole64(s); + + return 0; +} + +int journal_file_map_data_hash_table(JournalFile *f) { + uint64_t s, p; + void *t; + int r; + + assert(f); + assert(f->header); + + if (f->data_hash_table) + return 0; + + p = le64toh(f->header->data_hash_table_offset); + s = le64toh(f->header->data_hash_table_size); + + r = journal_file_move_to(f, + OBJECT_DATA_HASH_TABLE, + true, + p, s, + &t); + if (r < 0) + return r; + + f->data_hash_table = t; + return 0; +} + +int journal_file_map_field_hash_table(JournalFile *f) { + uint64_t s, p; + void *t; + int r; + + assert(f); + assert(f->header); + + if (f->field_hash_table) + return 0; + + p = le64toh(f->header->field_hash_table_offset); + s = le64toh(f->header->field_hash_table_size); + + r = journal_file_move_to(f, 
+ OBJECT_FIELD_HASH_TABLE, + true, + p, s, + &t); + if (r < 0) + return r; + + f->field_hash_table = t; + return 0; +} + +static int journal_file_link_field( + JournalFile *f, + Object *o, + uint64_t offset, + uint64_t hash) { + + uint64_t p, h, m; + int r; + + assert(f); + assert(f->header); + assert(f->field_hash_table); + assert(o); + assert(offset > 0); + + if (o->object.type != OBJECT_FIELD) + return -EINVAL; + + m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + /* This might alter the window we are looking at */ + o->field.next_hash_offset = o->field.head_data_offset = 0; + + h = hash % m; + p = le64toh(f->field_hash_table[h].tail_hash_offset); + if (p == 0) + f->field_hash_table[h].head_hash_offset = htole64(offset); + else { + r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); + if (r < 0) + return r; + + o->field.next_hash_offset = htole64(offset); + } + + f->field_hash_table[h].tail_hash_offset = htole64(offset); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) + f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1); + + return 0; +} + +static int journal_file_link_data( + JournalFile *f, + Object *o, + uint64_t offset, + uint64_t hash) { + + uint64_t p, h, m; + int r; + + assert(f); + assert(f->header); + assert(f->data_hash_table); + assert(o); + assert(offset > 0); + + if (o->object.type != OBJECT_DATA) + return -EINVAL; + + m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + /* This might alter the window we are looking at */ + o->data.next_hash_offset = o->data.next_field_offset = 0; + o->data.entry_offset = o->data.entry_array_offset = 0; + o->data.n_entries = 0; + + h = hash % m; + p = le64toh(f->data_hash_table[h].tail_hash_offset); + if (p == 0) + /* Only entry in the hash table is easy */ + f->data_hash_table[h].head_hash_offset = htole64(offset); + else { + /* Move back to the previous data object, to 
patch in + * pointer */ + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + o->data.next_hash_offset = htole64(offset); + } + + f->data_hash_table[h].tail_hash_offset = htole64(offset); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) + f->header->n_data = htole64(le64toh(f->header->n_data) + 1); + + return 0; +} + +static int get_next_hash_offset( + JournalFile *f, + uint64_t *p, + le64_t *next_hash_offset, + uint64_t *depth, + le64_t *header_max_depth) { + + uint64_t nextp; + + assert(f); + assert(p); + assert(next_hash_offset); + assert(depth); + + nextp = le64toh(READ_NOW(*next_hash_offset)); + if (nextp > 0) { + if (nextp <= *p) /* Refuse going in loops */ + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Detected hash item loop in %s, refusing.", f->path); + + (*depth)++; + + /* If the depth of this hash chain is larger than all others we have seen so far, record it */ + if (header_max_depth && journal_file_writable(f)) + *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth))); + } + + *p = nextp; + return 0; +} + +int journal_file_find_field_object_with_hash( + JournalFile *f, + const void *field, + uint64_t size, + uint64_t hash, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t p, osize, h, m, depth = 0; + int r; + + assert(f); + assert(f->header); + assert(field); + assert(size > 0); + + /* If the field hash table is empty, we can't find anything */ + if (le64toh(f->header->field_hash_table_size) <= 0) + return 0; + + /* Map the field hash table, if it isn't mapped yet. 
*/ + r = journal_file_map_field_hash_table(f); + if (r < 0) + return r; + + osize = offsetof(Object, field.payload) + size; + + m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + h = hash % m; + p = le64toh(f->field_hash_table[h].head_hash_offset); + while (p > 0) { + Object *o; + + r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); + if (r < 0) + return r; + + if (le64toh(o->field.hash) == hash && + le64toh(o->object.size) == osize && + memcmp(o->field.payload, field, size) == 0) { + + if (ret_object) + *ret_object = o; + if (ret_offset) + *ret_offset = p; + + return 1; + } + + r = get_next_hash_offset( + f, + &p, + &o->field.next_hash_offset, + &depth, + JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL); + if (r < 0) + return r; + } + + return 0; +} + +uint64_t journal_file_hash_data( + JournalFile *f, + const void *data, + size_t sz) { + + assert(f); + assert(f->header); + assert(data || sz == 0); + + /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash + * function use siphash. Old journal files use the Jenkins hash. 
*/ + + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + return siphash24(data, sz, f->header->file_id.bytes); + + return jenkins_hash64(data, sz); +} + +int journal_file_find_field_object( + JournalFile *f, + const void *field, + uint64_t size, + Object **ret_object, + uint64_t *ret_offset) { + + assert(f); + assert(field); + assert(size > 0); + + return journal_file_find_field_object_with_hash( + f, + field, size, + journal_file_hash_data(f, field, size), + ret_object, ret_offset); +} + +int journal_file_find_data_object_with_hash( + JournalFile *f, + const void *data, + uint64_t size, + uint64_t hash, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t p, h, m, depth = 0; + int r; + + assert(f); + assert(f->header); + assert(data || size == 0); + + /* If there's no data hash table, then there's no entry. */ + if (le64toh(f->header->data_hash_table_size) <= 0) + return 0; + + /* Map the data hash table, if it isn't mapped yet. */ + r = journal_file_map_data_hash_table(f); + if (r < 0) + return r; + + m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem); + if (m <= 0) + return -EBADMSG; + + h = hash % m; + p = le64toh(f->data_hash_table[h].head_hash_offset); + + while (p > 0) { + Object *o; + void *d; + size_t rsize; + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + if (le64toh(o->data.hash) != hash) + goto next; + + r = journal_file_data_payload(f, o, p, NULL, 0, 0, &d, &rsize); + if (r < 0) + return r; + assert(r > 0); /* journal_file_data_payload() always returns > 0 if no field is provided. */ + + if (memcmp_nn(data, size, d, rsize) == 0) { + if (ret_object) + *ret_object = o; + + if (ret_offset) + *ret_offset = p; + + return 1; + } + + next: + r = get_next_hash_offset( + f, + &p, + &o->data.next_hash_offset, + &depth, + JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? 
&f->header->data_hash_chain_depth : NULL); + if (r < 0) + return r; + } + + return 0; +} + +int journal_file_find_data_object( + JournalFile *f, + const void *data, + uint64_t size, + Object **ret_object, + uint64_t *ret_offset) { + + assert(f); + assert(data || size == 0); + + return journal_file_find_data_object_with_hash( + f, + data, size, + journal_file_hash_data(f, data, size), + ret_object, ret_offset); +} + +bool journal_field_valid(const char *p, size_t l, bool allow_protected) { + /* We kinda enforce POSIX syntax recommendations for + environment variables here, but make a couple of additional + requirements. + + http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */ + + assert(p); + + if (l == SIZE_MAX) + l = strlen(p); + + /* No empty field names */ + if (l <= 0) + return false; + + /* Don't allow names longer than 64 chars */ + if (l > 64) + return false; + + /* Variables starting with an underscore are protected */ + if (!allow_protected && p[0] == '_') + return false; + + /* Don't allow digits as first character */ + if (ascii_isdigit(p[0])) + return false; + + /* Only allow A-Z0-9 and '_' */ + for (const char *a = p; a < p + l; a++) + if ((*a < 'A' || *a > 'Z') && + !ascii_isdigit(*a) && + *a != '_') + return false; + + return true; +} + +static int journal_file_append_field( + JournalFile *f, + const void *field, + uint64_t size, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t hash, p; + uint64_t osize; + Object *o; + int r; + + assert(f); + assert(field); + assert(size > 0); + + if (!journal_field_valid(field, size, true)) + return -EBADMSG; + + hash = journal_file_hash_data(f, field, size); + + r = journal_file_find_field_object_with_hash(f, field, size, hash, ret_object, ret_offset); + if (r < 0) + return r; + if (r > 0) + return 0; + + osize = offsetof(Object, field.payload) + size; + r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p); + if (r < 0) + return r; + + o->field.hash = htole64(hash); + 
memcpy(o->field.payload, field, size); + + r = journal_file_link_field(f, o, p, hash); + if (r < 0) + return r; + + /* The linking might have altered the window, so let's only pass the offset to hmac which will + * move to the object again if needed. */ + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_FIELD, NULL, p); + if (r < 0) + return r; +#endif + + if (ret_object) { + r = journal_file_move_to_object(f, OBJECT_FIELD, p, ret_object); + if (r < 0) + return r; + } + + if (ret_offset) + *ret_offset = p; + + return 0; +} + +static int maybe_compress_payload(JournalFile *f, uint8_t *dst, const uint8_t *src, uint64_t size, size_t *rsize) { + assert(f); + assert(f->header); + +#if HAVE_COMPRESSION + Compression c; + int r; + + c = JOURNAL_FILE_COMPRESSION(f); + if (c == COMPRESSION_NONE || size < f->compress_threshold_bytes) + return 0; + + r = compress_blob(c, src, size, dst, size - 1, rsize); + if (r < 0) + return log_debug_errno(r, "Failed to compress data object using %s, ignoring: %m", compression_to_string(c)); + + log_debug("Compressed data object %"PRIu64" -> %zu using %s", size, *rsize, compression_to_string(c)); + + return 1; /* compressed */ +#else + return 0; +#endif +} + +static int journal_file_append_data( + JournalFile *f, + const void *data, + uint64_t size, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t hash, p, osize; + Object *o, *fo; + size_t rsize = 0; + const void *eq; + int r; + + assert(f); + + if (!data || size == 0) + return -EINVAL; + + hash = journal_file_hash_data(f, data, size); + + r = journal_file_find_data_object_with_hash(f, data, size, hash, ret_object, ret_offset); + if (r < 0) + return r; + if (r > 0) + return 0; + + eq = memchr(data, '=', size); + if (!eq) + return -EINVAL; + + osize = journal_file_data_payload_offset(f) + size; + r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p); + if (r < 0) + return r; + + o->data.hash = htole64(hash); + + r = maybe_compress_payload(f, 
journal_file_data_payload_field(f, o), data, size, &rsize); + if (r <= 0) + /* We don't really care failures, let's continue without compression */ + memcpy_safe(journal_file_data_payload_field(f, o), data, size); + else { + Compression c = JOURNAL_FILE_COMPRESSION(f); + + assert(c >= 0 && c < _COMPRESSION_MAX && c != COMPRESSION_NONE); + + o->object.size = htole64(journal_file_data_payload_offset(f) + rsize); + o->object.flags |= COMPRESSION_TO_OBJECT_FLAG(c); + } + + r = journal_file_link_data(f, o, p, hash); + if (r < 0) + return r; + + /* The linking might have altered the window, so let's refresh our pointer. */ + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); + if (r < 0) + return r; +#endif + + /* Create field object ... */ + r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, NULL); + if (r < 0) + return r; + + /* ... and link it in. */ + o->data.next_field_offset = fo->field.head_data_offset; + fo->field.head_data_offset = le64toh(p); + + if (ret_object) + *ret_object = o; + + if (ret_offset) + *ret_offset = p; + + return 0; +} + +static int maybe_decompress_payload( + JournalFile *f, + uint8_t *payload, + uint64_t size, + Compression compression, + const char *field, + size_t field_length, + size_t data_threshold, + void **ret_data, + size_t *ret_size) { + + assert(f); + + /* We can't read objects larger than 4G on a 32-bit machine */ + if ((uint64_t) (size_t) size != size) + return -E2BIG; + + if (compression != COMPRESSION_NONE) { +#if HAVE_COMPRESSION + size_t rsize; + int r; + + if (field) { + r = decompress_startswith(compression, payload, size, &f->compress_buffer, field, + field_length, '='); + if (r < 0) + return log_debug_errno(r, + "Cannot decompress %s object of length %" PRIu64 ": %m", + compression_to_string(compression), + size); + if (r == 0) { + if (ret_data) + *ret_data = NULL; + if (ret_size) + *ret_size 
= 0; + return 0; + } + } + + r = decompress_blob(compression, payload, size, &f->compress_buffer, &rsize, 0); + if (r < 0) + return r; + + if (ret_data) + *ret_data = f->compress_buffer; + if (ret_size) + *ret_size = rsize; +#else + return -EPROTONOSUPPORT; +#endif + } else { + if (field && (size < field_length + 1 || memcmp(payload, field, field_length) != 0 || payload[field_length] != '=')) { + if (ret_data) + *ret_data = NULL; + if (ret_size) + *ret_size = 0; + return 0; + } + + if (ret_data) + *ret_data = payload; + if (ret_size) + *ret_size = (size_t) size; + } + + return 1; +} + +int journal_file_data_payload( + JournalFile *f, + Object *o, + uint64_t offset, + const char *field, + size_t field_length, + size_t data_threshold, + void **ret_data, + size_t *ret_size) { + + uint64_t size; + Compression c; + int r; + + assert(f); + assert(!field == (field_length == 0)); /* These must be specified together. */ + + if (!o) { + r = journal_file_move_to_object(f, OBJECT_DATA, offset, &o); + if (r < 0) + return r; + } + + size = le64toh(READ_NOW(o->object.size)); + if (size < journal_file_data_payload_offset(f)) + return -EBADMSG; + + size -= journal_file_data_payload_offset(f); + + c = COMPRESSION_FROM_OBJECT(o); + if (c < 0) + return -EPROTONOSUPPORT; + + return maybe_decompress_payload(f, journal_file_data_payload_field(f, o), size, c, field, + field_length, data_threshold, ret_data, ret_size); +} + +uint64_t journal_file_entry_n_items(JournalFile *f, Object *o) { + uint64_t sz; + + assert(f); + assert(o); + + if (o->object.type != OBJECT_ENTRY) + return 0; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, entry.items)) + return 0; + + return (sz - offsetof(Object, entry.items)) / journal_file_entry_item_size(f); +} + +uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) { + uint64_t sz; + + assert(f); + assert(o); + + if (o->object.type != OBJECT_ENTRY_ARRAY) + return 0; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < 
offsetof(Object, entry_array.items)) + return 0; + + return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f); +} + +uint64_t journal_file_hash_table_n_items(Object *o) { + uint64_t sz; + + assert(o); + + if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE)) + return 0; + + sz = le64toh(READ_NOW(o->object.size)); + if (sz < offsetof(Object, hash_table.items)) + return 0; + + return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem); +} + +static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) { + assert(f); + assert(o); + + if (JOURNAL_HEADER_COMPACT(f->header)) { + assert(p <= UINT32_MAX); + o->entry_array.items.compact[i] = htole32(p); + } else + o->entry_array.items.regular[i] = htole64(p); +} + +static int link_entry_into_array( + JournalFile *f, + le64_t *first, + le64_t *idx, + le32_t *tail, + le32_t *tidx, + uint64_t p) { + + uint64_t n = 0, ap = 0, q, i, a, hidx; + Object *o; + int r; + + assert(f); + assert(f->header); + assert(first); + assert(idx); + assert(p > 0); + + a = tail ? le32toh(*tail) : le64toh(*first); + hidx = le64toh(READ_NOW(*idx)); + i = tidx ? 
le32toh(READ_NOW(*tidx)) : hidx; + + while (a > 0) { + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + n = journal_file_entry_array_n_items(f, o); + if (i < n) { + write_entry_array_item(f, o, i, p); + *idx = htole64(hidx + 1); + if (tidx) + *tidx = htole32(le32toh(*tidx) + 1); + return 0; + } + + i -= n; + ap = a; + a = le64toh(o->entry_array.next_entry_array_offset); + } + + if (hidx > n) + n = (hidx+1) * 2; + else + n = n * 2; + + if (n < 4) + n = 4; + + r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY, + offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f), + &o, &q); + if (r < 0) + return r; + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q); + if (r < 0) + return r; +#endif + + write_entry_array_item(f, o, i, p); + + if (ap == 0) + *first = htole64(q); + else { + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o); + if (r < 0) + return r; + + o->entry_array.next_entry_array_offset = htole64(q); + } + + if (tail) + *tail = htole32(q); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) + f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1); + + *idx = htole64(hidx + 1); + if (tidx) + *tidx = htole32(1); + + return 0; +} + +static int link_entry_into_array_plus_one( + JournalFile *f, + le64_t *extra, + le64_t *first, + le64_t *idx, + le32_t *tail, + le32_t *tidx, + uint64_t p) { + + uint64_t hidx; + int r; + + assert(f); + assert(extra); + assert(first); + assert(idx); + assert(p > 0); + + hidx = le64toh(READ_NOW(*idx)); + if (hidx == UINT64_MAX) + return -EBADMSG; + if (hidx == 0) + *extra = htole64(p); + else { + le64_t i; + + i = htole64(hidx - 1); + r = link_entry_into_array(f, first, &i, tail, tidx, p); + if (r < 0) + return r; + } + + *idx = htole64(hidx + 1); + return 0; +} + +static int journal_file_link_entry_item(JournalFile *f, uint64_t offset, uint64_t p) { + Object *o; + int r; + + assert(f); + 
assert(offset > 0); + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + return link_entry_into_array_plus_one(f, + &o->data.entry_offset, + &o->data.entry_array_offset, + &o->data.n_entries, + JOURNAL_HEADER_COMPACT(f->header) ? &o->data.compact.tail_entry_array_offset : NULL, + JOURNAL_HEADER_COMPACT(f->header) ? &o->data.compact.tail_entry_array_n_entries : NULL, + offset); +} + +static int journal_file_link_entry( + JournalFile *f, + Object *o, + uint64_t offset, + const EntryItem items[], + size_t n_items) { + + int r; + + assert(f); + assert(f->header); + assert(o); + assert(offset > 0); + + if (o->object.type != OBJECT_ENTRY) + return -EINVAL; + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + /* Link up the entry itself */ + r = link_entry_into_array(f, + &f->header->entry_array_offset, + &f->header->n_entries, + JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_offset) ? &f->header->tail_entry_array_offset : NULL, + JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_n_entries) ? 
&f->header->tail_entry_array_n_entries : NULL, + offset); + if (r < 0) + return r; + + /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */ + + if (f->header->head_entry_realtime == 0) + f->header->head_entry_realtime = o->entry.realtime; + + f->header->tail_entry_realtime = o->entry.realtime; + f->header->tail_entry_monotonic = o->entry.monotonic; + if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_offset)) + f->header->tail_entry_offset = htole64(offset); + f->newest_mtime = 0; /* we have a new tail entry now, explicitly invalidate newest boot id/timestamp info */ + + /* Link up the items */ + for (uint64_t i = 0; i < n_items; i++) { + int k; + + /* If we fail to link an entry item because we can't allocate a new entry array, don't fail + * immediately but try to link the other entry items since it might still be possible to link + * those if they don't require a new entry array to be allocated. */ + + k = journal_file_link_entry_item(f, offset, items[i].object_offset); + if (k == -E2BIG) + r = k; + else if (k < 0) + return k; + } + + return r; +} + +static void write_entry_item(JournalFile *f, Object *o, uint64_t i, const EntryItem *item) { + assert(f); + assert(o); + assert(item); + + if (JOURNAL_HEADER_COMPACT(f->header)) { + assert(item->object_offset <= UINT32_MAX); + o->entry.items.compact[i].object_offset = htole32(item->object_offset); + } else { + o->entry.items.regular[i].object_offset = htole64(item->object_offset); + o->entry.items.regular[i].hash = htole64(item->hash); + } +} + +static int journal_file_append_entry_internal( + JournalFile *f, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + const sd_id128_t *machine_id, + uint64_t xor_hash, + const EntryItem items[], + size_t n_items, + uint64_t *seqnum, + sd_id128_t *seqnum_id, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t np; + uint64_t osize; + Object *o; + int r; + + assert(f); + assert(f->header); + assert(ts); + 
assert(boot_id); + assert(!sd_id128_is_null(*boot_id)); + assert(items || n_items == 0); + + if (f->strict_order) { + /* If requested be stricter with ordering in this journal file, to make searching via + * bisection fully deterministic. This is an optional feature, so that if desired journal + * files can be written where the ordering is not strictly enforced (in which case bisection + * will yield *a* result, but not the *only* result, when searching for points in + * time). Strict ordering mode is enabled when journald originally writes the files, but + * might not necessarily be if other tools (the remoting tools for example) write journal + * files from combined sources. + * + * Typically, if any of the errors generated here are seen journald will just rotate the + * journal files and start anew. */ + + if (ts->realtime < le64toh(f->header->tail_entry_realtime)) + return log_debug_errno(SYNTHETIC_ERRNO(EREMCHG), + "Realtime timestamp %" PRIu64 " smaller than previous realtime " + "timestamp %" PRIu64 ", refusing entry.", + ts->realtime, le64toh(f->header->tail_entry_realtime)); + + if (sd_id128_equal(*boot_id, f->header->tail_entry_boot_id) && + ts->monotonic < le64toh(f->header->tail_entry_monotonic)) + return log_debug_errno( + SYNTHETIC_ERRNO(ENOTNAM), + "Monotonic timestamp %" PRIu64 + " smaller than previous monotonic timestamp %" PRIu64 + " while having the same boot ID, refusing entry.", + ts->monotonic, + le64toh(f->header->tail_entry_monotonic)); + } + + if (seqnum_id) { + /* Settle the passed in sequence number ID */ + + if (sd_id128_is_null(*seqnum_id)) + *seqnum_id = f->header->seqnum_id; /* Caller has none assigned, then copy the one from the file */ + else if (!sd_id128_equal(*seqnum_id, f->header->seqnum_id)) { + /* Different seqnum IDs? 
We can't allow entries from multiple IDs to end up in the same journal. */ + if (le64toh(f->header->n_entries) == 0) + f->header->seqnum_id = *seqnum_id; /* Caller has one, and file so far has no entries, then copy the one from the caller */ + else + return log_debug_errno(SYNTHETIC_ERRNO(EILSEQ), + "Sequence number IDs don't match, refusing entry."); + } + } + + if (machine_id && sd_id128_is_null(f->header->machine_id)) + /* Initialize machine ID when not set yet */ + f->header->machine_id = *machine_id; + /* The object covers everything up to entry.items plus one item slot per entry item. */ + osize = offsetof(Object, entry.items) + (n_items * journal_file_entry_item_size(f)); + + r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np); + if (r < 0) + return r; + /* Fill in the entry; the boot ID is also stored in the header as the tail entry's boot ID. */ + o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum)); + o->entry.realtime = htole64(ts->realtime); + o->entry.monotonic = htole64(ts->monotonic); + o->entry.xor_hash = htole64(xor_hash); + o->entry.boot_id = f->header->tail_entry_boot_id = *boot_id; + + for (size_t i = 0; i < n_items; i++) + write_entry_item(f, o, i, &items[i]); + +#if HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np); + if (r < 0) + return r; +#endif + + r = journal_file_link_entry(f, o, np, items, n_items); + if (r < 0) + return r; + + if (ret_object) + *ret_object = o; + + if (ret_offset) + *ret_offset = np; + + return r; +} + /* Makes changes that were written through the memory mapping visible to inotify watchers. */ +void journal_file_post_change(JournalFile *f) { + assert(f); + + if (f->fd < 0) + return; + + /* inotify() does not receive IN_MODIFY events from file + * accesses done via mmap(). After each access we hence + * trigger IN_MODIFY by truncating the journal file to its + * current size which triggers IN_MODIFY. 
*/ + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + if (ftruncate(f->fd, f->last_stat.st_size) < 0) + log_debug_errno(errno, "Failed to truncate file to its own size: %m"); +} + +static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) { + assert(userdata); + + journal_file_post_change(userdata); + + return 1; +} + +static void schedule_post_change(JournalFile *f) { + sd_event *e; + int r; + + assert(f); + assert(f->post_change_timer); + + assert_se(e = sd_event_source_get_event(f->post_change_timer)); + + /* If we are already going down, post the change immediately. */ + if (IN_SET(sd_event_get_state(e), SD_EVENT_EXITING, SD_EVENT_FINISHED)) + goto fail; + + r = sd_event_source_get_enabled(f->post_change_timer, NULL); + if (r < 0) { + log_debug_errno(r, "Failed to get ftruncate timer state: %m"); + goto fail; + } + if (r > 0) + return; + + r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period); + if (r < 0) { + log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m"); + goto fail; + } + + r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT); + if (r < 0) { + log_debug_errno(r, "Failed to enable scheduled ftruncate: %m"); + goto fail; + } + + return; + +fail: + /* On failure, let's simply post the change immediately. */ + journal_file_post_change(f); +} + +/* Enable coalesced change posting in a timer on the provided sd_event instance */ +int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) { + _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL; + int r; + + assert(f); + assert_return(!f->post_change_timer, -EINVAL); + assert(e); + assert(t); + + /* If we are already going down, we cannot install the timer. + * In such case, the caller needs to call journal_file_post_change() explicitly. 
*/ + if (IN_SET(sd_event_get_state(e), SD_EVENT_EXITING, SD_EVENT_FINISHED)) + return 0; + + r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(timer, SD_EVENT_OFF); + if (r < 0) + return r; + + f->post_change_timer = TAKE_PTR(timer); + f->post_change_timer_period = t; + + return 1; +} + /* Orders entry items by the offset of the object they reference. */ +static int entry_item_cmp(const EntryItem *a, const EntryItem *b) { + return CMP(ASSERT_PTR(a)->object_offset, ASSERT_PTR(b)->object_offset); +} + /* Compacts the array in place, dropping every item whose object_offset equals that of the + * previously kept item, and returns the new item count. Only adjacent duplicates are detected, + * hence the array has to be sorted by object_offset beforehand. */ +static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) { + size_t j = 1; + + assert(items || n == 0); + + if (n <= 1) + return n; + + for (size_t i = 1; i < n; i++) + if (items[i].object_offset != items[j - 1].object_offset) + items[j++] = items[i]; + + return j; +} + /* Appends a new entry made up of the fields in iovec[]. If ts is NULL the current time is used, + * and if boot_id is NULL the current boot ID is looked up. Explicitly passed timestamps and boot + * IDs are validated first and refused with -EBADMSG when invalid. */ +int journal_file_append_entry( + JournalFile *f, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + const struct iovec iovec[], + size_t n_iovec, + uint64_t *seqnum, + sd_id128_t *seqnum_id, + Object **ret_object, + uint64_t *ret_offset) { + + _cleanup_free_ EntryItem *items_alloc = NULL; + EntryItem *items; + uint64_t xor_hash = 0; + struct dual_timestamp _ts; + sd_id128_t _boot_id, _machine_id, *machine_id; + int r; + + assert(f); + assert(f->header); + assert(iovec); + assert(n_iovec > 0); + + if (ts) { + if (!VALID_REALTIME(ts->realtime)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid realtime timestamp %" PRIu64 ", refusing entry.", + ts->realtime); + if (!VALID_MONOTONIC(ts->monotonic)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid monotonic timestamp %" PRIu64 ", refusing entry.", + ts->monotonic); + } else { + dual_timestamp_now(&_ts); + ts = &_ts; + } + + if (boot_id) { + if (sd_id128_is_null(*boot_id)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Empty boot ID, refusing entry."); + } else { + r = sd_id128_get_boot(&_boot_id); + if (r < 0) + return r; + + boot_id = &_boot_id; + } + + r = 
sd_id128_get_machine(&_machine_id); + if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) + /* Gracefully handle the machine ID not being initialized yet */ + machine_id = NULL; + else if (r < 0) + return r; + else + machine_id = &_machine_id; + +#if HAVE_GCRYPT + r = journal_file_maybe_append_tag(f, ts->realtime); + if (r < 0) + return r; +#endif + + if (n_iovec < ALLOCA_MAX / sizeof(EntryItem) / 2) + items = newa(EntryItem, n_iovec); + else { + items_alloc = new(EntryItem, n_iovec); + if (!items_alloc) + return -ENOMEM; + + items = items_alloc; + } + + for (size_t i = 0; i < n_iovec; i++) { + uint64_t p; + Object *o; + + r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p); + if (r < 0) + return r; + + /* When calculating the XOR hash field, we need to take special care if the "keyed-hash" + * journal file flag is on. We use the XOR hash field to quickly determine the identity of a + * specific record, and give records with otherwise identical position (i.e. match in seqno, + * timestamp, …) a stable ordering. But for that we can't have it that the hash of the + * objects in each file is different since they are keyed. Hence let's calculate the Jenkins + * hash here for that. This also has the benefit that cursors for old and new journal files + * are completely identical (they include the XOR hash after all). For classic Jenkins-hash + * files things are easier, we can just take the value from the stored record directly. */ + + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len); + else + xor_hash ^= le64toh(o->data.hash); + + items[i] = (EntryItem) { + .object_offset = p, + .hash = le64toh(o->data.hash), + }; + } + + /* Order by the position on disk, in order to improve seek + * times for rotating media. 
*/ + typesafe_qsort(items, n_iovec, entry_item_cmp); + n_iovec = remove_duplicate_entry_items(items, n_iovec); + + r = journal_file_append_entry_internal( + f, + ts, + boot_id, + machine_id, + xor_hash, + items, + n_iovec, + seqnum, + seqnum_id, + ret_object, + ret_offset); + + /* If the memory mapping triggered a SIGBUS then we return an + * IO error and ignore the error code passed down to us, since + * it is very likely just an effect of a nullified replacement + * mapping page */ + + if (mmap_cache_fd_got_sigbus(f->cache_fd)) + r = -EIO; + + if (f->post_change_timer) + schedule_post_change(f); + else + journal_file_post_change(f); + + return r; +} + +typedef struct ChainCacheItem { + uint64_t first; /* The offset of the entry array object at the beginning of the chain, + * i.e., le64toh(f->header->entry_array_offset), or le64toh(o->data.entry_offset). */ + uint64_t array; /* The offset of the cached entry array object. */ + uint64_t begin; /* The offset of the first item in the cached array. */ + uint64_t total; /* The total number of items in all arrays before the cached one in the chain. */ + uint64_t last_index; /* The last index we looked at in the cached array, to optimize locality when bisecting. 
*/ +} ChainCacheItem; + /* Stores the position (array, begin, total, last_index) for the chain that starts at 'first'. + * If 'ci' is NULL a cache item is set up first: when the hashmap already holds CHAIN_CACHE_MAX + * items its first item is recycled, otherwise a new one is allocated, and it is inserted keyed + * by its 'first' field. Allocation or insertion failures are not reported, in that case simply + * nothing is cached. */ +static void chain_cache_put( + OrderedHashmap *h, + ChainCacheItem *ci, + uint64_t first, + uint64_t array, + uint64_t begin, + uint64_t total, + uint64_t last_index) { + + assert(h); + + if (!ci) { + /* If the chain item to cache for this chain is the + * first one it's not worth caching anything */ + if (array == first) + return; + + if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) { + ci = ordered_hashmap_steal_first(h); + assert(ci); + } else { + ci = new(ChainCacheItem, 1); + if (!ci) + return; + } + + ci->first = first; + + if (ordered_hashmap_put(h, &ci->first, ci) < 0) { + free(ci); + return; + } + } else + assert(ci->first == first); + + ci->array = array; + ci->begin = begin; + ci->total = total; + ci->last_index = last_index; +} + /* Returns 1 if *i was moved by one step, and 0 if it already sits at the boundary (n - 1 when + * going down, 0 when going up) and was left untouched. Note that 'n - 1' wraps around for + * n == 0, hence n is expected to be non-zero. */ +static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) { + assert(i); + + /* Increase or decrease the specified index, in the right direction. */ + + if (direction == DIRECTION_DOWN) { + if (*i >= n - 1) + return 0; + + (*i)++; + } else { + if (*i <= 0) + return 0; + + (*i)--; + } + + return 1; +} + /* Stores in *ret the offset of the entry array that follows (DIRECTION_DOWN) or precedes + * (otherwise) the entry array at 'offset'. 'o' is only dereferenced when going down. */ +static int bump_entry_array( + JournalFile *f, + Object *o, /* the current entry array object. */ + uint64_t offset, /* the offset of the entry array object. */ + uint64_t first, /* The offset of the first entry array object in the chain. */ + direction_t direction, + uint64_t *ret) { + + int r; + + assert(f); + assert(ret); + + if (direction == DIRECTION_DOWN) { + assert(o); + assert(o->object.type == OBJECT_ENTRY_ARRAY); + + *ret = le64toh(o->entry_array.next_entry_array_offset); + } else { + + /* Entry array chains are a singly linked list, so to find the previous array in the chain, we have + * to start iterating from the top. 
*/ + + assert(offset > 0); + + uint64_t p = first, q = 0; + while (p > 0 && p != offset) { + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, p, &o); + if (r < 0) + return r; + + q = p; + p = le64toh(o->entry_array.next_entry_array_offset); + } + + /* If we can't find the previous entry array in the entry array chain, we're likely dealing with a + * corrupted journal file. */ + if (p == 0) + return -EBADMSG; + + *ret = q; + } + + return *ret > 0; +} + +static int generic_array_get( + JournalFile *f, + uint64_t first, /* The offset of the first entry array object in the chain. */ + uint64_t i, /* The index of the target object counted from the beginning of the entry array chain. */ + direction_t direction, + Object **ret_object, /* The found object. */ + uint64_t *ret_offset) { /* The offset of the found object. */ + + uint64_t a, t = 0, k; + ChainCacheItem *ci; + Object *o = NULL; + int r; + + assert(f); + + /* FIXME: fix return value assignment on success. */ + + a = first; + + /* Try the chain cache first */ + ci = ordered_hashmap_get(f->chain_cache, &first); + if (ci && i > ci->total) { + a = ci->array; + i -= ci->total; + t = ci->total; + } + + while (a > 0) { + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + /* If there's corruption and we're going downwards, let's pretend we reached the + * final entry in the entry array chain. */ + + if (direction == DIRECTION_DOWN) + return 0; + + /* If there's corruption and we're going upwards, move back to the previous entry + * array and start iterating entries from there. */ + + i = UINT64_MAX; + break; + } + if (r < 0) + return r; + + k = journal_file_entry_array_n_items(f, o); + if (k == 0) + return 0; + + if (i < k) + break; + + /* The index is larger than the number of elements in the array. Let's move to the next array. 
*/ + i -= k; + t += k; + a = le64toh(o->entry_array.next_entry_array_offset); + } + + /* If we've found the right location, now look for the first non-corrupt entry object (in the right + * direction). */ + + while (a > 0) { + if (i == UINT64_MAX) { + r = bump_entry_array(f, o, a, first, direction, &a); + if (r <= 0) + return r; + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + k = journal_file_entry_array_n_items(f, o); + if (k == 0) + break; + + if (direction == DIRECTION_DOWN) + i = 0; + else { + /* We moved to the previous array. The total must be decreased. */ + if (t < k) + return -EBADMSG; /* chain cache is broken ? */ + + i = k - 1; + t -= k; + } + } + + do { + uint64_t p; + + p = journal_file_entry_array_item(f, o, i); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret_object); + if (r >= 0) { + /* Let's cache this item for the next invocation */ + chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i); + + if (ret_offset) + *ret_offset = p; + + return 1; + } + if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) + return r; + + /* OK, so this entry is borked. Most likely some entry didn't get synced to + * disk properly, let's see if the next one might work for us instead. */ + log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i); + + } while (bump_array_index(&i, direction, k) > 0); + + /* All entries tried in the above do-while loop are broken. Let's move to the next (or previous) array. */ + + if (direction == DIRECTION_DOWN) + /* We are going to the next array, the total must be incremented. */ + t += k; + + i = UINT64_MAX; + } + + return 0; +} + +enum { + TEST_FOUND, /* The current object passes the test. */ + TEST_LEFT, /* The current object is in an earlier position, and the object we are looking + * for should exist in a later position. 
*/ + TEST_RIGHT, /* The current object is in a later position, and the object we are looking for + * should exist in an earlier position. */ + TEST_GOTO_NEXT, /* No matching object exists in this array and earlier arrays, go to the next array. */ + TEST_GOTO_PREVIOUS, /* No matching object exists in this array and later arrays, go to the previous array. */ +}; + +static int generic_array_bisect_step( + JournalFile *f, + Object *array, /* entry array object */ + uint64_t i, /* index of the entry item in the array we will test. */ + uint64_t needle, + int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), + direction_t direction, + uint64_t *m, /* The maximum number of the entries we will check in the array. */ + uint64_t *left, /* The index of the left boundary in the array. */ + uint64_t *right) { /* The index of the right boundary in the array. */ + + uint64_t p; + int r; + + assert(f); + assert(array); + assert(test_object); + assert(m); + assert(left); + assert(right); + assert(*left <= i); + assert(i <= *right); + assert(*right < *m); + + p = journal_file_entry_array_item(f, array, i); + if (p <= 0) + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short."); + + if (i == *left) { + /* This happens on two situations: + * + * a) i == 0 (hence, *left == 0): + * The first entry in the array is corrupted, let's go back to the previous array. + * + * b) *right == *left or *left + 1, and we are going to downwards: + * In that case, the (i-1)-th object has been already tested in the previous call, + * which returned TEST_LEFT. See below. So, there is no matching entry in this + * array nor in the whole entry array chain. */ + assert(i == 0 || (*right - *left <= 1 && direction == DIRECTION_DOWN)); + return TEST_GOTO_PREVIOUS; + } + + /* Otherwise, cutting the array short. 
So, here we limit the number of elements we will see + * in this array, and set the right boundary to the last possibly non-corrupted object. */ + *m = i; + *right = i - 1; + return TEST_RIGHT; + } + if (r < 0) + return r; + + if (r == TEST_FOUND) + /* There may be multiple entries that match with the needle. When the direction is down, we + * need to find the first matching entry, hence the right boundary can be moved, but the left + * one cannot. Similarly, when the direction is up, we need to find the last matching entry, + * hence the left boundary can be moved, but the right one cannot. */ + r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; + + if (r == TEST_RIGHT) { + /* Currently, left --- needle --- i --- right, hence we can move the right boundary to i. */ + if (direction == DIRECTION_DOWN) + *right = i; + else { + if (i == 0) + return TEST_GOTO_PREVIOUS; + *right = i - 1; + } + } else { + /* Currently, left --- i --- needle --- right, hence we can move the left boundary to i. */ + if (direction == DIRECTION_DOWN) { + /* Note, here *m is always positive, as by the assertions at the beginning, we have + * 0 <= *left <= i <= *right < m */ + if (i == *m - 1) + return TEST_GOTO_NEXT; + + *left = i + 1; + } else + *left = i; + } + + return r; +} + +static int generic_array_bisect( + JournalFile *f, + uint64_t first, /* The offset of the first entry array object in the chain. */ + uint64_t n, /* The total number of elements in the chain of the entry array. */ + uint64_t needle, /* The target value (e.g. seqnum, monotonic, realtime, ...). */ + int (*test_object)(JournalFile *f, + uint64_t p, /* the offset of the (data or entry) object that will be tested. */ + uint64_t needle), + direction_t direction, + Object **ret_object, /* The found object. */ + uint64_t *ret_offset, /* The offset of the found object. */ + uint64_t *ret_idx) { /* The index of the found object counted from the beginning of the entry array chain. 
*/ + + /* Given an entry array chain, this function finds the object "closest" to the given needle in the + * chain, taking into account the provided direction. A function can be provided to determine how + * an object is matched against the given needle. + * + * Given a journal file, the offset of an object and the needle, the test_object() function should + * return TEST_RIGHT if the needle is located earlier in the entry array chain, TEST_LEFT if the + * needle is located later in the entry array chain, and TEST_FOUND if the object matches the needle. + * If test_object() returns TEST_FOUND for a specific object, that object's information will be used + * to populate the return values of this function. If test_object() never returns TEST_FOUND, the + * return values are populated with the details of one of the objects closest to the needle. If the + * direction is DIRECTION_UP, the earlier object is used. Otherwise, the later object is used. + * If there are multiple objects that test_object() return TEST_FOUND for, then the first matching + * object returned when direction is DIRECTION_DOWN. Otherwise the last object is returned. */ + + uint64_t a, p, t = 0, i, last_index = UINT64_MAX; + ChainCacheItem *ci; + Object *array; + int r; + + assert(f); + assert(test_object); + + if (n <= 0) + return 0; + + /* Start with the first array in the chain */ + a = first; + + ci = ordered_hashmap_get(f->chain_cache, &first); + if (ci && n > ci->total && ci->begin != 0) { + /* Ah, we have iterated this bisection array chain previously! Let's see if we can skip ahead + * in the chain, as far as the last time. But we can't jump backwards in the chain, so let's + * check that first. 
*/ + + r = test_object(f, ci->begin, needle); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) + log_debug_errno(r, "Cached entry is corrupted, ignoring: %m"); + else if (r < 0) + return r; + else if (r == TEST_LEFT) { + /* OK, what we are looking for is right of the begin of this EntryArray, so let's + * jump straight to previously cached array in the chain */ + + a = ci->array; + n -= ci->total; + t = ci->total; + last_index = ci->last_index; + } + } + + while (a > 0) { + uint64_t left, right, k, m, m_original; + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array); + if (r < 0) + return r; + + k = journal_file_entry_array_n_items(f, array); + m = m_original = MIN(k, n); + if (m <= 0) + return 0; + + left = 0; + right = m - 1; + + if (direction == DIRECTION_UP) { + /* If we're going upwards, the last entry of the previous array may pass the test, + * and the first entry of the current array may not pass. In that case, the last + * entry of the previous array must be returned. Hence, we need to test the first + * entry of the current array. */ + r = generic_array_bisect_step(f, array, 0, needle, test_object, direction, &m, &left, &right); + if (r < 0) + return r; + if (r == TEST_GOTO_PREVIOUS) + goto previous; + } + + /* Test the last entry of this array, to determine if we should go to the next array. */ + r = generic_array_bisect_step(f, array, right, needle, test_object, direction, &m, &left, &right); + if (r < 0) + return r; + if (r == TEST_GOTO_PREVIOUS) + goto previous; + + /* The expected entry should be in this array, (or the last entry of the previous array). */ + if (r == TEST_RIGHT) { + + /* If we cached the last index we looked at, let's try to not to jump too wildly + * around and see if we can limit the range to look at early to the immediate + * neighbors of the last index we looked at. 
*/ + + if (last_index > 0 && left < last_index - 1 && last_index - 1 < right) { + r = generic_array_bisect_step(f, array, last_index - 1, needle, test_object, direction, &m, &left, &right); + if (r < 0) + return r; + if (r == TEST_GOTO_PREVIOUS) + goto previous; + } + + if (last_index < UINT64_MAX && left < last_index + 1 && last_index + 1 < right) { + r = generic_array_bisect_step(f, array, last_index + 1, needle, test_object, direction, &m, &left, &right); + if (r < 0) + return r; + if (r == TEST_GOTO_PREVIOUS) + goto previous; + } + + for (;;) { + if (left == right) { + /* We found one or more corrupted entries in generic_array_bisect_step(). + * In that case, the entry pointed by 'right' may not be tested. + * + * When we are going downwards, the entry object pointed by 'left' + * has not been tested yet. Hence, even if left == right, we still + * have to check the final entry to see if it actually matches. + * + * On the other hand, when we are going upwards, the entry pointed + * by 'left' is always tested. So, it is not necessary to test the + * final entry again. */ + if (m != m_original && direction == DIRECTION_DOWN) { + r = generic_array_bisect_step(f, array, left, needle, test_object, direction, &m, &left, &right); + if (r < 0) + return r; + if (IN_SET(r, TEST_GOTO_PREVIOUS, TEST_GOTO_NEXT)) + return 0; /* The entry does not pass the test, or is corrupted */ + /* Errors and the GOTO results were handled above, so the step must have reported + * TEST_RIGHT (TEST_FOUND is mapped to TEST_RIGHT when going downwards), which + * leaves both boundaries where they were. Check the result itself, not the + * enum constant, which is non-zero and would make the assertion a no-op. */ + assert(r == TEST_RIGHT); + assert(left == right); + } + + i = left; + goto found; + } + + assert(left < right); + i = (left + right + (direction == DIRECTION_UP)) / 2; + + r = generic_array_bisect_step(f, array, i, needle, test_object, direction, &m, &left, &right); + if (r < 0) + return r; + if (r == TEST_GOTO_PREVIOUS) + goto previous; + if (r == TEST_GOTO_NEXT) + return 0; /* Found a corrupt entry, and the array was cut short. */ + } + } + + /* Not found in this array (or the last entry of this array should be returned), go to the next array. */ + assert(r == (direction == DIRECTION_DOWN ? 
TEST_GOTO_NEXT : TEST_LEFT)); + + if (k >= n) { + if (direction == DIRECTION_UP) { + assert(n > 0); + i = n - 1; + goto found; + } + + return 0; + } + + n -= k; + t += k; + last_index = UINT64_MAX; + a = le64toh(array->entry_array.next_entry_array_offset); + } + + return 0; + +previous: + /* Not found in the current array, return the last entry of the previous array. */ + assert(r == TEST_GOTO_PREVIOUS); + + /* The current array is the first in the chain. no previous array. */ + if (t == 0) + return 0; + + /* When we are going downwards, there is no matching entries in the previous array. */ + if (direction == DIRECTION_DOWN) + return 0; + + /* Indicate to go to the previous array later. Note, do not move to the previous array here, + * as that may invalidate the current array object in the mmap cache and + * journal_file_entry_array_item() below may read invalid address. */ + i = UINT64_MAX; + +found: + p = journal_file_entry_array_item(f, array, 0); + if (p <= 0) + return -EBADMSG; + + /* Let's cache this item for the next invocation */ + chain_cache_put(f->chain_cache, ci, first, a, p, t, i); + + if (i == UINT64_MAX) { + uint64_t m; + + /* Get the last entry of the previous array. 
*/ + + r = bump_entry_array(f, NULL, a, first, DIRECTION_UP, &a); + if (r <= 0) + return r; + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array); + if (r < 0) + return r; + + m = journal_file_entry_array_n_items(f, array); + if (m == 0 || t < m) + return -EBADMSG; + + t -= m; + i = m - 1; + } + + p = journal_file_entry_array_item(f, array, i); + if (p == 0) + return -EBADMSG; + + if (ret_object) { + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret_object); + if (r < 0) + return r; + } + + if (ret_offset) + *ret_offset = p; + + if (ret_idx) + *ret_idx = t + i; + + return 1; +} + +static int generic_array_bisect_for_data( + JournalFile *f, + Object *d, + uint64_t needle, + int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t extra, first, n; + int r; + + assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); + assert(test_object); + + n = le64toh(d->data.n_entries); + if (n <= 0) + return 0; + n--; /* n_entries is the number of entries linked to the data object, including the 'extra' entry. */ + + extra = le64toh(d->data.entry_offset); + first = le64toh(d->data.entry_array_offset); + + /* This bisects the array in object 'first', but first checks an extra. */ + r = test_object(f, extra, needle); + if (r < 0) + return r; + + if (direction == DIRECTION_DOWN) { + /* If we are going downwards, then we need to return the first object that passes the test. + * When there is no object that passes the test, we need to return the first object that + * test_object() returns TEST_RIGHT for. */ + if (IN_SET(r, + TEST_FOUND, /* The 'extra' object passes the test. Hence, this is the first + * object that passes the test. */ + TEST_RIGHT)) /* The 'extra' object is the first object that test_object() returns + * TEST_RIGHT for, and no object exists even in the chained arrays + * that passes the test. 
*/ + goto use_extra; /* The 'extra' object is exactly the one we are looking for. It is + * not necessary to bisect the chained arrays. */ + + /* Otherwise, the 'extra' object is not the one we are looking for. Search in the arrays. */ + + } else { + /* If we are going upwards, then we need to return the last object that passes the test. + * When there is no object that passes the test, we need to return the last object that + * test_object() returns TEST_LEFT for. */ + if (r == TEST_RIGHT) + return 0; /* Not only the 'extra' object, but also all objects in the chained arrays + * will never get TEST_FOUND or TEST_LEFT. The object we are looking for + * does not exist. */ + + /* Even if the 'extra' object passes the test, there may be multiple objects in the arrays + * that also pass the test. Hence, we need to bisect the arrays for finding the last matching + * object. */ + } + + r = generic_array_bisect(f, first, n, needle, test_object, direction, ret_object, ret_offset, NULL); + if (r != 0) + return r; /* When > 0, the found object is the first (or last, when DIRECTION_UP) object. + * Hence, return the found object now. When < 0, propagate the error. */ + + /* No matching object found in the chained arrays. + * DIRECTION_DOWN : the 'extra' object does not match the condition either. There is no matching object. + * DIRECTION_UP : the 'extra' object matches the condition. So, return it. 
*/ + if (direction == DIRECTION_DOWN) + return 0; + +use_extra: + if (ret_object) { + r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret_object); + if (r < 0) + return r; + } + + if (ret_offset) + *ret_offset = extra; + + return 1; +} + +static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) { + assert(f); + assert(p > 0); + + if (p == needle) + return TEST_FOUND; + else if (p < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +int journal_file_move_to_entry_by_offset( + JournalFile *f, + uint64_t p, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + assert(f); + assert(f->header); + + return generic_array_bisect( + f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + p, + test_object_offset, + direction, + ret_object, ret_offset, NULL); +} + +static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) { + uint64_t sq; + Object *o; + int r; + + assert(f); + assert(p > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + sq = le64toh(READ_NOW(o->entry.seqnum)); + if (sq == needle) + return TEST_FOUND; + else if (sq < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +int journal_file_move_to_entry_by_seqnum( + JournalFile *f, + uint64_t seqnum, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + assert(f); + assert(f->header); + + return generic_array_bisect( + f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + seqnum, + test_object_seqnum, + direction, + ret_object, ret_offset, NULL); +} + +static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) { + Object *o; + uint64_t rt; + int r; + + assert(f); + assert(p > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + rt = le64toh(READ_NOW(o->entry.realtime)); + if (rt == needle) + return TEST_FOUND; + else if (rt < needle) + return TEST_LEFT; + 
else + return TEST_RIGHT; +} + +int journal_file_move_to_entry_by_realtime( + JournalFile *f, + uint64_t realtime, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + assert(f); + assert(f->header); + + return generic_array_bisect( + f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + realtime, + test_object_realtime, + direction, + ret_object, ret_offset, NULL); +} + +static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) { + Object *o; + uint64_t m; + int r; + + assert(f); + assert(p > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + m = le64toh(READ_NOW(o->entry.monotonic)); + if (m == needle) + return TEST_FOUND; + else if (m < needle) + return TEST_LEFT; + else + return TEST_RIGHT; +} + +static int find_data_object_by_boot_id( + JournalFile *f, + sd_id128_t boot_id, + Object **ret_object, + uint64_t *ret_offset) { + + char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID="; + + assert(f); + + sd_id128_to_string(boot_id, t + 9); + return journal_file_find_data_object(f, t, sizeof(t) - 1, ret_object, ret_offset); +} + +int journal_file_move_to_entry_by_monotonic( + JournalFile *f, + sd_id128_t boot_id, + uint64_t monotonic, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + Object *o; + int r; + + assert(f); + + r = find_data_object_by_boot_id(f, boot_id, &o, NULL); + if (r <= 0) + return r; + + return generic_array_bisect_for_data( + f, + o, + monotonic, + test_object_monotonic, + direction, + ret_object, ret_offset); +} + +void journal_file_reset_location(JournalFile *f) { + assert(f); + + f->location_type = LOCATION_HEAD; + f->current_offset = 0; + f->current_seqnum = 0; + f->current_realtime = 0; + f->current_monotonic = 0; + zero(f->current_boot_id); + f->current_xor_hash = 0; + + /* Also reset the previous reading direction. Otherwise, next_beyond_location() may wrongly handle we + * already hit EOF. 
See issue #29216. */ + f->last_direction = _DIRECTION_INVALID; +} + /* Remembers the given entry (offset, seqnum, timestamps, boot ID and XOR hash) as the current + * location of the file and switches the location type to LOCATION_SEEK. */ +void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) { + assert(f); + assert(o); + + f->location_type = LOCATION_SEEK; + f->current_offset = offset; + f->current_seqnum = le64toh(o->entry.seqnum); + f->current_realtime = le64toh(o->entry.realtime); + f->current_monotonic = le64toh(o->entry.monotonic); + f->current_boot_id = o->entry.boot_id; + f->current_xor_hash = le64toh(o->entry.xor_hash); +} + /* Returns true if moving from old_offset to new_offset goes strictly in the given direction. */ +static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) { + + /* Consider it an error if any of the two offsets is uninitialized */ + if (old_offset == 0 || new_offset == 0) + return false; + + /* If we go down, the new offset must be larger than the old one. */ + return direction == DIRECTION_DOWN ? + new_offset > old_offset : + new_offset < old_offset; +} + /* Looks up the entry next to offset 'p' in the given direction, using the file's main entry + * array chain. Returns 0 right away if the header reports no entries. */ +int journal_file_next_entry( + JournalFile *f, + uint64_t p, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t i, n, q; + Object *o; + int r; + + assert(f); + assert(f->header); + + /* FIXME: fix return value assignment. */ + + n = le64toh(READ_NOW(f->header->n_entries)); + if (n <= 0) + return 0; + + /* When the input offset 'p' is zero, return the first (or last on DIRECTION_UP) entry. */ + if (p == 0) + return generic_array_get(f, + le64toh(f->header->entry_array_offset), + direction == DIRECTION_DOWN ? 0 : n - 1, + direction, + ret_object, ret_offset); + + /* Otherwise, first find the nearest entry object. */ + r = generic_array_bisect(f, + le64toh(f->header->entry_array_offset), + le64toh(f->header->n_entries), + p, + test_object_offset, + direction, + ret_object ? &o : NULL, &q, &i); + if (r <= 0) + return r; + + assert(direction == DIRECTION_DOWN ? p <= q : q <= p); + + /* If the input offset 'p' points to an entry object, generic_array_bisect() should provide + * the same offset, and the index needs to be shifted. 
Otherwise, use the found object as is, + * as it is the nearest entry object from the input offset 'p'. */ + + if (p != q) + goto found; + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + + /* And jump to it */ + r = generic_array_get(f, le64toh(f->header->entry_array_offset), i, direction, ret_object ? &o : NULL, &q); + if (r <= 0) + return r; + + /* Ensure our array is properly ordered. */ + if (!check_properly_ordered(q, p, direction)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s: entry array not properly ordered at entry index %" PRIu64, + f->path, i); +found: + if (ret_object) + *ret_object = o; + if (ret_offset) + *ret_offset = q; + + return 1; +} + +int journal_file_move_to_entry_for_data( + JournalFile *f, + Object *d, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + uint64_t extra, first, n; + int r = 0; + + assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); + assert(IN_SET(direction, DIRECTION_DOWN, DIRECTION_UP)); + + /* FIXME: fix return value assignment. */ + + /* This returns the first (when the direction is down, otherwise the last) entry linked to the + * specified data object. */ + + n = le64toh(d->data.n_entries); + if (n <= 0) + return 0; + n--; /* n_entries is the number of entries linked to the data object, including the 'extra' entry. */ + + extra = le64toh(d->data.entry_offset); + first = le64toh(d->data.entry_array_offset); + + if (direction == DIRECTION_DOWN && extra > 0) { + /* When we are going downwards, first try to read the extra entry. */ + r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret_object); + if (r >= 0) + goto use_extra; + if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) + return r; + } + + if (n > 0) { + /* DIRECTION_DOWN : The extra entry is broken, falling back to the entries in the array. + * DIRECTION_UP : Try to find a valid entry in the array from the tail. */ + r = generic_array_get(f, + first, + direction == DIRECTION_DOWN ? 
0 : n - 1, + direction, + ret_object, ret_offset); + if (!IN_SET(r, 0, -EADDRNOTAVAIL, -EBADMSG)) + return r; /* found or critical error. */ + } + + if (direction == DIRECTION_UP && extra > 0) { + /* No valid entry exists in the chained array, falling back to the extra entry. */ + r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret_object); + if (r >= 0) + goto use_extra; + } + + return r; + +use_extra: + if (ret_offset) + *ret_offset = extra; + + return 1; +} + +int journal_file_move_to_entry_by_offset_for_data( + JournalFile *f, + Object *d, + uint64_t p, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); + + return generic_array_bisect_for_data( + f, + d, + p, + test_object_offset, + direction, + ret, ret_offset); +} + +int journal_file_move_to_entry_by_monotonic_for_data( + JournalFile *f, + Object *d, + sd_id128_t boot_id, + uint64_t monotonic, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + Object *o, *entry; + uint64_t z; + int r; + + assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); + + /* First, pin the given data object, before reading the _BOOT_ID= data object below. */ + r = journal_file_pin_object(f, d); + if (r < 0) + return r; + + /* Then, read a data object for _BOOT_ID= and seek by time. */ + r = find_data_object_by_boot_id(f, boot_id, &o, NULL); + if (r <= 0) + return r; + + r = generic_array_bisect_for_data(f, + o, + monotonic, + test_object_monotonic, + direction, + NULL, &z); + if (r <= 0) + return r; + + /* And now, continue seeking until we find an entry that exists in both bisection arrays. */ + for (;;) { + uint64_t p; + + /* The journal entry found by the above bisect_plus_one() may not have the specified data, + * that is, it may not be linked in the data object. So, we need to check that. */ + + r = journal_file_move_to_entry_by_offset_for_data( + f, d, z, direction, ret_object ? 
&entry : NULL, &p); + if (r <= 0) + return r; + if (p == z) + break; /* The journal entry has the specified data. Yay! */ + + /* If the entry does not have the data, then move to the next (or previous, depends on the + * 'direction') entry linked to the data object. But, the next entry may be in another boot. + * So, we need to check that the entry has the matching boot ID. */ + + r = journal_file_move_to_entry_by_offset_for_data( + f, o, p, direction, ret_object ? &entry : NULL, &z); + if (r <= 0) + return r; + if (p == z) + break; /* The journal entry has the specified boot ID. Yay! */ + + /* If not, let's try to the next entry... */ + } + + if (ret_object) + *ret_object = entry; + if (ret_offset) + *ret_offset = z; + return 1; +} + +int journal_file_move_to_entry_by_seqnum_for_data( + JournalFile *f, + Object *d, + uint64_t seqnum, + direction_t direction, + Object **ret_object, + uint64_t *ret_offset) { + + assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); + + return generic_array_bisect_for_data( + f, + d, + seqnum, + test_object_seqnum, + direction, + ret_object, ret_offset); +} + +int journal_file_move_to_entry_by_realtime_for_data( + JournalFile *f, + Object *d, + uint64_t realtime, + direction_t direction, + Object **ret, uint64_t *ret_offset) { + + assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); + + return generic_array_bisect_for_data( + f, + d, + realtime, + test_object_realtime, + direction, + ret, ret_offset); +} + +void journal_file_dump(JournalFile *f) { + Object *o; + uint64_t p; + int r; + + assert(f); + assert(f->header); + + journal_file_print_header(f); + + p = le64toh(READ_NOW(f->header->header_size)); + while (p != 0) { + const char *s; + Compression c; + + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + if (r < 0) + goto fail; + + s = journal_object_type_to_string(o->object.type); + + switch (o->object.type) { + + case OBJECT_ENTRY: + assert(s); + + printf("Type: %s seqnum=%"PRIu64" 
monotonic=%"PRIu64" realtime=%"PRIu64"\n", + s, + le64toh(o->entry.seqnum), + le64toh(o->entry.monotonic), + le64toh(o->entry.realtime)); + break; + + case OBJECT_TAG: + assert(s); + + printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n", + s, + le64toh(o->tag.seqnum), + le64toh(o->tag.epoch)); + break; + + default: + if (s) + printf("Type: %s \n", s); + else + printf("Type: unknown (%i)", o->object.type); + + break; + } + + c = COMPRESSION_FROM_OBJECT(o); + if (c > COMPRESSION_NONE) + printf("Flags: %s\n", + compression_to_string(c)); + + if (p == le64toh(f->header->tail_object_offset)) + p = 0; + else + p += ALIGN64(le64toh(o->object.size)); + } + + return; +fail: + log_error("File corrupt"); +} + +/* Note: the lifetime of the compound literal is the immediately surrounding block. */ +#define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ") + +void journal_file_print_header(JournalFile *f) { + struct stat st; + + assert(f); + assert(f->header); + + printf("File path: %s\n" + "File ID: %s\n" + "Machine ID: %s\n" + "Boot ID: %s\n" + "Sequential number ID: %s\n" + "State: %s\n" + "Compatible flags:%s%s%s%s\n" + "Incompatible flags:%s%s%s%s%s%s\n" + "Header size: %"PRIu64"\n" + "Arena size: %"PRIu64"\n" + "Data hash table size: %"PRIu64"\n" + "Field hash table size: %"PRIu64"\n" + "Rotate suggested: %s\n" + "Head sequential number: %"PRIu64" (%"PRIx64")\n" + "Tail sequential number: %"PRIu64" (%"PRIx64")\n" + "Head realtime timestamp: %s (%"PRIx64")\n" + "Tail realtime timestamp: %s (%"PRIx64")\n" + "Tail monotonic timestamp: %s (%"PRIx64")\n" + "Objects: %"PRIu64"\n" + "Entry objects: %"PRIu64"\n", + f->path, + SD_ID128_TO_STRING(f->header->file_id), + SD_ID128_TO_STRING(f->header->machine_id), + SD_ID128_TO_STRING(f->header->tail_entry_boot_id), + SD_ID128_TO_STRING(f->header->seqnum_id), + f->header->state == STATE_OFFLINE ? "OFFLINE" : + f->header->state == STATE_ONLINE ? "ONLINE" : + f->header->state == STATE_ARCHIVED ? 
"ARCHIVED" : "UNKNOWN", + JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "", + JOURNAL_HEADER_SEALED_CONTINUOUS(f->header) ? " SEALED_CONTINUOUS" : "", + JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) ? " TAIL_ENTRY_BOOT_ID" : "", + (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "", + JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "", + JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "", + JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "", + JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "", + JOURNAL_HEADER_COMPACT(f->header) ? " COMPACT" : "", + (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "", + le64toh(f->header->header_size), + le64toh(f->header->arena_size), + le64toh(f->header->data_hash_table_size) / sizeof(HashItem), + le64toh(f->header->field_hash_table_size) / sizeof(HashItem), + yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)), + le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum), + le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum), + FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime), + FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime), + FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic), + le64toh(f->header->n_objects), + le64toh(f->header->n_entries)); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) + printf("Data objects: %"PRIu64"\n" + "Data hash table fill: %.1f%%\n", + le64toh(f->header->n_data), + 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)))); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) + printf("Field objects: %"PRIu64"\n" + "Field hash table fill: %.1f%%\n", + le64toh(f->header->n_fields), + 100.0 * (double) 
le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)))); + + if (JOURNAL_HEADER_CONTAINS(f->header, n_tags)) + printf("Tag objects: %"PRIu64"\n", + le64toh(f->header->n_tags)); + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) + printf("Entry array objects: %"PRIu64"\n", + le64toh(f->header->n_entry_arrays)); + + if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth)) + printf("Deepest field hash chain: %" PRIu64"\n", + f->header->field_hash_chain_depth); + + if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth)) + printf("Deepest data hash chain: %" PRIu64"\n", + f->header->data_hash_chain_depth); + + if (fstat(f->fd, &st) >= 0) + printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL)); +} + +static int journal_file_warn_btrfs(JournalFile *f) { + unsigned attrs; + int r; + + assert(f); + + /* Before we write anything, check if the COW logic is turned + * off on btrfs. Given our write pattern that is quite + * unfriendly to COW file systems this should greatly improve + * performance on COW file systems, such as btrfs, at the + * expense of data integrity features (which shouldn't be too + * bad, given that we do our own checksumming). */ + + r = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to determine if journal is on btrfs: %m"); + if (r == 0) + return 0; + + r = read_attr_fd(f->fd, &attrs); + if (r < 0) + return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to read file attributes: %m"); + + if (attrs & FS_NOCOW_FL) { + log_debug("Detected btrfs file system with copy-on-write disabled, all is good."); + return 0; + } + + log_ratelimit_notice(JOURNAL_LOG_RATELIMIT, + "Creating journal file %s on a btrfs file system, and copy-on-write is enabled. 
" + "This is likely to slow down journal access substantially, please consider turning " + "off the copy-on-write file attribute on the journal directory, using chattr +C.", + f->path); + + return 1; +} + +static void journal_default_metrics(JournalMetrics *m, int fd, bool compact) { + struct statvfs ss; + uint64_t fs_size = 0; + + assert(m); + assert(fd >= 0); + + if (fstatvfs(fd, &ss) >= 0) + fs_size = u64_multiply_safe(ss.f_frsize, ss.f_blocks); + else + log_debug_errno(errno, "Failed to determine disk size: %m"); + + if (m->max_use == UINT64_MAX) { + + if (fs_size > 0) + m->max_use = CLAMP(PAGE_ALIGN_U64(fs_size / 10), /* 10% of file system size */ + MAX_USE_LOWER, MAX_USE_UPPER); + else + m->max_use = MAX_USE_LOWER; + } else { + m->max_use = PAGE_ALIGN_U64(m->max_use); + + if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2) + m->max_use = JOURNAL_FILE_SIZE_MIN*2; + } + + if (m->min_use == UINT64_MAX) { + if (fs_size > 0) + m->min_use = CLAMP(PAGE_ALIGN_U64(fs_size / 50), /* 2% of file system size */ + MIN_USE_LOW, MIN_USE_HIGH); + else + m->min_use = MIN_USE_LOW; + } + + if (m->min_use > m->max_use) + m->min_use = m->max_use; + + if (m->max_size == UINT64_MAX) + m->max_size = MIN(PAGE_ALIGN_U64(m->max_use / 8), /* 8 chunks */ + MAX_SIZE_UPPER); + else + m->max_size = PAGE_ALIGN_U64(m->max_size); + + if (compact && m->max_size > JOURNAL_COMPACT_SIZE_MAX) + m->max_size = JOURNAL_COMPACT_SIZE_MAX; + + if (m->max_size != 0) { + if (m->max_size < JOURNAL_FILE_SIZE_MIN) + m->max_size = JOURNAL_FILE_SIZE_MIN; + + if (m->max_use != 0 && m->max_size*2 > m->max_use) + m->max_use = m->max_size*2; + } + + if (m->min_size == UINT64_MAX) + m->min_size = JOURNAL_FILE_SIZE_MIN; + else + m->min_size = CLAMP(PAGE_ALIGN_U64(m->min_size), + JOURNAL_FILE_SIZE_MIN, + m->max_size ?: UINT64_MAX); + + if (m->keep_free == UINT64_MAX) { + if (fs_size > 0) + m->keep_free = MIN(PAGE_ALIGN_U64(fs_size / 20), /* 5% of file system size */ + KEEP_FREE_UPPER); + else + m->keep_free = 
DEFAULT_KEEP_FREE; + } + + if (m->n_max_files == UINT64_MAX) + m->n_max_files = DEFAULT_N_MAX_FILES; + + log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64, + FORMAT_BYTES(m->min_use), + FORMAT_BYTES(m->max_use), + FORMAT_BYTES(m->max_size), + FORMAT_BYTES(m->min_size), + FORMAT_BYTES(m->keep_free), + m->n_max_files); +} + +int journal_file_open( + int fd, + const char *fname, + int open_flags, + JournalFileFlags file_flags, + mode_t mode, + uint64_t compress_threshold_bytes, + JournalMetrics *metrics, + MMapCache *mmap_cache, + JournalFile *template, + JournalFile **ret) { + + bool newly_created = false; + JournalFile *f; + void *h; + int r; + + assert(fd >= 0 || fname); + assert(file_flags >= 0); + assert(file_flags <= _JOURNAL_FILE_FLAGS_MAX); + assert(mmap_cache); + assert(ret); + + if (!IN_SET((open_flags & O_ACCMODE), O_RDONLY, O_RDWR)) + return -EINVAL; + + if ((open_flags & O_ACCMODE) == O_RDONLY && FLAGS_SET(open_flags, O_CREAT)) + return -EINVAL; + + if (fname && (open_flags & O_CREAT) && !endswith(fname, ".journal")) + return -EINVAL; + + f = new(JournalFile, 1); + if (!f) + return -ENOMEM; + + *f = (JournalFile) { + .fd = fd, + .mode = mode, + .open_flags = open_flags, + .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ? 
+ DEFAULT_COMPRESS_THRESHOLD : + MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes), + .strict_order = FLAGS_SET(file_flags, JOURNAL_STRICT_ORDER), + .newest_boot_id_prioq_idx = PRIOQ_IDX_NULL, + .last_direction = _DIRECTION_INVALID, + }; + + if (fname) { + f->path = strdup(fname); + if (!f->path) { + r = -ENOMEM; + goto fail; + } + } else { + assert(fd >= 0); + + /* If we don't know the path, fill in something explanatory and vaguely useful */ + if (asprintf(&f->path, "/proc/self/%i", fd) < 0) { + r = -ENOMEM; + goto fail; + } + } + + f->chain_cache = ordered_hashmap_new(&uint64_hash_ops); + if (!f->chain_cache) { + r = -ENOMEM; + goto fail; + } + + if (f->fd < 0) { + /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO + * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence + * it doesn't hurt in that case. */ + + f->fd = openat_report_new(AT_FDCWD, f->path, f->open_flags|O_CLOEXEC|O_NONBLOCK, f->mode, &newly_created); + if (f->fd < 0) { + r = f->fd; + goto fail; + } + + /* fds we opened here by us should also be closed by us. */ + f->close_fd = true; + + r = fd_nonblock(f->fd, false); + if (r < 0) + goto fail; + + if (!newly_created) { + r = journal_file_fstat(f); + if (r < 0) + goto fail; + } + } else { + r = journal_file_fstat(f); + if (r < 0) + goto fail; + + /* If we just got the fd passed in, we don't really know if we created the file anew */ + newly_created = f->last_stat.st_size == 0 && journal_file_writable(f); + } + + r = mmap_cache_add_fd(mmap_cache, f->fd, mmap_prot_from_open_flags(open_flags), &f->cache_fd); + if (r < 0) + goto fail; + + if (newly_created) { + (void) journal_file_warn_btrfs(f); + + /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this + * file even if the file might end up corrupted one day... 
Ideally we'd just use the creation time many + * file systems maintain for each file, but the API to query this is very new, hence let's emulate this + * via extended attributes. If extended attributes are not supported we'll just skip this, and rely + * solely on mtime/atime/ctime of the file. */ + (void) fd_setcrtime(f->fd, 0); + + r = journal_file_init_header(f, file_flags, template); + if (r < 0) + goto fail; + + r = journal_file_fstat(f); + if (r < 0) + goto fail; + } + + if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) { + r = -ENODATA; + goto fail; + } + + r = mmap_cache_fd_get(f->cache_fd, MMAP_CACHE_CATEGORY_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h); + if (r == -EINVAL) { + /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only + * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error + * code. */ + r = -EAFNOSUPPORT; + goto fail; + } + if (r < 0) + goto fail; + + f->header = h; + + if (!newly_created) { + r = journal_file_verify_header(f); + if (r < 0) + goto fail; + } + +#if HAVE_GCRYPT + if (!newly_created && journal_file_writable(f) && JOURNAL_HEADER_SEALED(f->header)) { + r = journal_file_fss_load(f); + if (r < 0) + goto fail; + } +#endif + + if (journal_file_writable(f)) { + if (metrics) { + journal_default_metrics(metrics, f->fd, JOURNAL_HEADER_COMPACT(f->header)); + f->metrics = *metrics; + } else if (template) + f->metrics = template->metrics; + + r = journal_file_refresh_header(f); + if (r < 0) + goto fail; + } + +#if HAVE_GCRYPT + r = journal_file_hmac_setup(f); + if (r < 0) + goto fail; +#endif + + if (newly_created) { + r = journal_file_setup_field_hash_table(f); + if (r < 0) + goto fail; + + r = journal_file_setup_data_hash_table(f); + if (r < 0) + goto fail; + +#if HAVE_GCRYPT + r = journal_file_append_first_tag(f); + if (r < 0) + goto fail; +#endif + } + + if (mmap_cache_fd_got_sigbus(f->cache_fd)) { + r = -EIO; + goto fail; + } + + if (template && 
template->post_change_timer) { + r = journal_file_enable_post_change_timer( + f, + sd_event_source_get_event(template->post_change_timer), + template->post_change_timer_period); + + if (r < 0) + goto fail; + } + + /* The file is opened now successfully, thus we take possession of any passed in fd. */ + f->close_fd = true; + + if (DEBUG_LOGGING) { + static int last_seal = -1, last_keyed_hash = -1; + static Compression last_compression = _COMPRESSION_INVALID; + static uint64_t last_bytes = UINT64_MAX; + + if (last_seal != JOURNAL_HEADER_SEALED(f->header) || + last_keyed_hash != JOURNAL_HEADER_KEYED_HASH(f->header) || + last_compression != JOURNAL_FILE_COMPRESSION(f) || + last_bytes != f->compress_threshold_bytes) { + + log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s", + yes_no(JOURNAL_HEADER_SEALED(f->header)), yes_no(JOURNAL_HEADER_KEYED_HASH(f->header)), + compression_to_string(JOURNAL_FILE_COMPRESSION(f)), FORMAT_BYTES(f->compress_threshold_bytes)); + last_seal = JOURNAL_HEADER_SEALED(f->header); + last_keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header); + last_compression = JOURNAL_FILE_COMPRESSION(f); + last_bytes = f->compress_threshold_bytes; + } + } + + *ret = f; + return 0; + +fail: + if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd)) + r = -EIO; + + (void) journal_file_close(f); + + if (newly_created && fd < 0) + (void) unlink(fname); + + return r; +} + /* Parses the user ID out of a journal file name of the form "user-<UID>.journal" and stores it in *ret_uid. Returns -EISDIR if path_extract_filename() reports O_DIRECTORY, -EREMOTE if the name lacks the "user-" prefix or the ".journal" suffix or contains an '@' after the prefix, -ENOMEM if the UID substring cannot be duplicated, and otherwise whatever parse_uid() returns. */ +int journal_file_parse_uid_from_filename(const char *path, uid_t *ret_uid) { + _cleanup_free_ char *buf = NULL, *p = NULL; + const char *a, *b, *at; + int r; + + /* This helper returns -EREMOTE when the filename doesn't match user online/offline journal + * pattern. Hence it currently doesn't parse archived or disposed user journals.
*/ + + assert(path); + assert(ret_uid); + + r = path_extract_filename(path, &p); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return -EISDIR; + + a = startswith(p, "user-"); + if (!a) + return -EREMOTE; + b = endswith(p, ".journal"); + if (!b) + return -EREMOTE; + + at = strchr(a, '@'); + if (at) + return -EREMOTE; + + buf = strndup(a, b-a); + if (!buf) + return -ENOMEM; + + return parse_uid(buf, ret_uid); +} + /* Renames a writable journal file from "<name>.journal" to "<name>@<seqnum_id>-<head_entry_seqnum>-<head_entry_realtime>.journal" and sets f->archive. On success f->path is replaced by the new name; the old path string is handed to the caller through *ret_previous_path if that is non-NULL, otherwise it is freed. Returns 0 on success, -EINVAL for non-writable files, paths below /proc/self/fd or paths without the ".journal" suffix, -ENOMEM if the new name cannot be allocated, or -errno if rename() fails with anything other than ENOENT. */ +int journal_file_archive(JournalFile *f, char **ret_previous_path) { + _cleanup_free_ char *p = NULL; + + assert(f); + + if (!journal_file_writable(f)) + return -EINVAL; + + /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse + * rotation, since we don't know the actual path, and couldn't rename the file hence. */ + if (path_startswith(f->path, "/proc/self/fd")) + return -EINVAL; + + if (!endswith(f->path, ".journal")) + return -EINVAL; + + if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal", + (int) strlen(f->path) - 8, f->path, /* 8 == strlen(".journal"): print the path without its suffix */ + SD_ID128_FORMAT_VAL(f->header->seqnum_id), + le64toh(f->header->head_entry_seqnum), + le64toh(f->header->head_entry_realtime)) < 0) + return -ENOMEM; + + /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's + * ignore that case. */ + if (rename(f->path, p) < 0 && errno != ENOENT) + return -errno; + + /* Sync the rename to disk */ + (void) fsync_directory_of_file(f->fd); + + if (ret_previous_path) + *ret_previous_path = f->path; + else + free(f->path); + + f->path = TAKE_PTR(p); + + /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state + * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE, + * which would result in the rotated journal never getting fsync() called before closing.
Now we simply queue + * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining + * occurs. */ + f->archive = true; + + return 0; +} + /* Renames fname (resolved relative to dir_fd) to "<name>@<realtime>-<random>.journal~". Returns 0 on success, -EINVAL if fname does not end in ".journal", -ENOMEM if the new name cannot be allocated, or -errno from renameat(). */ +int journal_file_dispose(int dir_fd, const char *fname) { + _cleanup_free_ char *p = NULL; + + assert(fname); + + /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that + * this is done without looking into the file or changing any of its contents. The idea is that this is called + * whenever something is suspicious and we want to move the file away and make clear that it is not accessed + * for writing anymore. */ + + if (!endswith(fname, ".journal")) + return -EINVAL; + + if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~", + (int) strlen(fname) - 8, fname, + now(CLOCK_REALTIME), + random_u64()) < 0) + return -ENOMEM; + + if (renameat(dir_fd, fname, dir_fd, p) < 0) + return -errno; + + return 0; +} + /* Copies entry object o of file "from" into the writable file "to": the payload of every item of the entry is read from "from" and appended to "to", then a new entry referencing the appended data is written with the timestamps and boot ID of the original. Items whose payload fails to load with -EADDRNOTAVAIL or -EBADMSG are skipped. Returns -EPERM if "to" is not writable, 0 if the entry has no items or none could be copied, -EIO if a SIGBUS was recorded for the destination, otherwise the result of appending the entry. */ +int journal_file_copy_entry( + JournalFile *from, + JournalFile *to, + Object *o, + uint64_t p, + uint64_t *seqnum, + sd_id128_t *seqnum_id) { + + _cleanup_free_ EntryItem *items_alloc = NULL; + EntryItem *items; + uint64_t n, m = 0, xor_hash = 0; + sd_id128_t boot_id; + dual_timestamp ts; + int r; + + assert(from); + assert(to); + assert(o); + assert(p > 0); + + if (!journal_file_writable(to)) + return -EPERM; + + ts = (dual_timestamp) { + .monotonic = le64toh(o->entry.monotonic), + .realtime = le64toh(o->entry.realtime), + }; + boot_id = o->entry.boot_id; + + n = journal_file_entry_n_items(from, o); + if (n == 0) + return 0; + /* Item arrays below half of ALLOCA_MAX are taken from newa(), larger ones from the heap; items_alloc is released automatically via _cleanup_free_. */ + if (n < ALLOCA_MAX / sizeof(EntryItem) / 2) + items = newa(EntryItem, n); + else { + items_alloc = new(EntryItem, n); + if (!items_alloc) + return -ENOMEM; + + items = items_alloc; + } + + for (uint64_t i = 0; i < n; i++) { + uint64_t h, q; + void *data; + size_t l; + Object *u; + + q = journal_file_entry_item_object_offset(from, o, i); + r = journal_file_data_payload(from, NULL, q, NULL, 0, 0,
&data, &l); + if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) { + log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", i); + continue; + } + if (r < 0) + return r; + assert(r > 0); + + if (l == 0) + return -EBADMSG; + + r = journal_file_append_data(to, data, l, &u, &h); + if (r < 0) + return r; + /* For keyed-hash destination files the entry xor hash is accumulated from the Jenkins hash of the payload rather than from the hash stored in the appended data object. */ + if (JOURNAL_HEADER_KEYED_HASH(to->header)) + xor_hash ^= jenkins_hash64(data, l); + else + xor_hash ^= le64toh(u->data.hash); + + items[m++] = (EntryItem) { + .object_offset = h, + .hash = le64toh(u->data.hash), + }; + } + + if (m == 0) + return 0; + + r = journal_file_append_entry_internal( + to, + &ts, + &boot_id, + &from->header->machine_id, + xor_hash, + items, + m, + seqnum, + seqnum_id, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); + + if (mmap_cache_fd_got_sigbus(to->cache_fd)) + return -EIO; + + return r; +} + +void journal_reset_metrics(JournalMetrics *m) { + assert(m); + + /* Set everything to "pick automatic values". */ + + *m = (JournalMetrics) { + .min_use = UINT64_MAX, + .max_use = UINT64_MAX, + .min_size = UINT64_MAX, + .max_size = UINT64_MAX, + .keep_free = UINT64_MAX, + .n_max_files = UINT64_MAX, + }; +} + /* Reports the head (*ret_from) and tail (*ret_to) entry realtime timestamps recorded in the file header; either output may be NULL, but not both. Returns 1 on success and -ENOENT if a requested timestamp is 0 in the header. Note that *ret_from may already have been written when -ENOENT is returned because of the tail timestamp. */ +int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *ret_from, usec_t *ret_to) { + assert(f); + assert(f->header); + assert(ret_from || ret_to); + + if (ret_from) { + if (f->header->head_entry_realtime == 0) + return -ENOENT; + + *ret_from = le64toh(f->header->head_entry_realtime); + } + + if (ret_to) { + if (f->header->tail_entry_realtime == 0) + return -ENOENT; + + *ret_to = le64toh(f->header->tail_entry_realtime); + } + + return 1; +} + /* Reports monotonic timestamps for the given boot ID: *ret_from is read from the entry at the entry_offset of the data object found for boot_id, *ret_to from the entry reached via journal_file_move_to_entry_for_data() with DIRECTION_UP. Returns 1 on success, 0 if no matching data object or entry exists, negative errno on failure. */ +int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *ret_from, usec_t *ret_to) { + Object *o; + uint64_t p; + int r; + + assert(f); + assert(ret_from || ret_to); + + /* FIXME: fix return value assignment on success with 0.
*/ + + r = find_data_object_by_boot_id(f, boot_id, &o, &p); + if (r <= 0) + return r; + + if (le64toh(o->data.n_entries) <= 0) + return 0; + + if (ret_from) { + r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o); + if (r < 0) + return r; + + *ret_from = le64toh(o->entry.monotonic); + } + + if (ret_to) { /* o may have been repointed to an entry object above, hence re-resolve the data object at offset p first */ + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + r = journal_file_move_to_entry_for_data(f, o, DIRECTION_UP, &o, NULL); + if (r <= 0) + return r; + + *ret_to = le64toh(o->entry.monotonic); + } + + return 1; +} + /* Returns true, after logging the reason at log_level through log_ratelimit_full(), if rotating the file is advisable: header smaller than the current Header, data or field hash table filled beyond 75%, a hash chain deeper than HASH_CHAIN_DEPTH_MAX, data objects present without any field objects, or (only when max_file_usec > 0) a head entry older than max_file_usec. Returns false otherwise. */ +bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) { + assert(f); + assert(f->header); + + /* If we gained new header fields we gained new features, + * hence suggest a rotation */ + if (le64toh(f->header->header_size) < sizeof(Header)) { + log_ratelimit_full(log_level, JOURNAL_LOG_RATELIMIT, + "%s uses an outdated header, suggesting rotation.", f->path); + return true; + } + + /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from + * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we + * need the n_data field, which only exists in newer versions.
*/ + /* n_data * 4 > slots * 3 is the integer form of a fill level above 75%, where slots is the table size in bytes divided by sizeof(HashItem). */ + if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) + if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) { + log_ratelimit_full( + log_level, JOURNAL_LOG_RATELIMIT, + "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %"PRIu64" file size, %"PRIu64" bytes per hash table item), suggesting rotation.", + f->path, + 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))), + le64toh(f->header->n_data), + le64toh(f->header->data_hash_table_size) / sizeof(HashItem), + (uint64_t) f->last_stat.st_size, + f->last_stat.st_size / le64toh(f->header->n_data)); + return true; + } + /* Same fill level check for the field hash table, only done if the header contains n_fields. */ + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) + if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) { + log_ratelimit_full( + log_level, JOURNAL_LOG_RATELIMIT, + "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.", + f->path, + 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))), + le64toh(f->header->n_fields), + le64toh(f->header->field_hash_table_size) / sizeof(HashItem)); + return true; + } + + /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our + * longest chain is longer than some threshold, let's suggest rotation.
*/ + if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) && + le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) { + log_ratelimit_full( + log_level, JOURNAL_LOG_RATELIMIT, + "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.", + f->path, le64toh(f->header->data_hash_chain_depth)); + return true; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) && + le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) { + log_ratelimit_full( + log_level, JOURNAL_LOG_RATELIMIT, + "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.", + f->path, le64toh(f->header->field_hash_chain_depth)); + return true; + } + + /* Are the data objects properly indexed by field objects? */ + if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && + JOURNAL_HEADER_CONTAINS(f->header, n_fields) && + le64toh(f->header->n_data) > 0 && + le64toh(f->header->n_fields) == 0) { + log_ratelimit_full( + log_level, JOURNAL_LOG_RATELIMIT, + "Data objects of %s are not indexed by field objects, suggesting rotation.", + f->path); + return true; + } + /* Age check, only active when max_file_usec > 0; a head_entry_realtime of 0 in the header never triggers it. */ + if (max_file_usec > 0) { + usec_t t, h; + + h = le64toh(f->header->head_entry_realtime); + t = now(CLOCK_REALTIME); + + if (h > 0 && t > h + max_file_usec) { + log_ratelimit_full( + log_level, JOURNAL_LOG_RATELIMIT, + "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.", + f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC)); + return true; + } + } + + return false; +} + /* Human-readable names indexed by ObjectType; the macro invocation after the table generates the to-string lookup for journal_object_type from it. */ +static const char * const journal_object_type_table[] = { + [OBJECT_UNUSED] = "unused", + [OBJECT_DATA] = "data", + [OBJECT_FIELD] = "field", + [OBJECT_ENTRY] = "entry", + [OBJECT_DATA_HASH_TABLE] = "data hash table", + [OBJECT_FIELD_HASH_TABLE] = "field hash table", + [OBJECT_ENTRY_ARRAY] = "entry array", + [OBJECT_TAG] = "tag", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType); diff --git
a/src/libsystemd/sd-journal/journal-file.h b/src/libsystemd/sd-journal/journal-file.h new file mode 100644 index 0000000..81fafb9 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-file.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#if HAVE_GCRYPT +# include +#endif + +#include "sd-event.h" +#include "sd-id128.h" + +#include "compress.h" +#include "hashmap.h" +#include "journal-def.h" +#include "mmap-cache.h" +#include "sparse-endian.h" +#include "time-util.h" + +typedef struct JournalMetrics { + /* For all these: UINT64_MAX means "pick automatically", and 0 means "no limit enforced" */ + uint64_t max_size; /* how large journal files grow at max */ + uint64_t min_size; /* how large journal files grow at least */ + uint64_t max_use; /* how much disk space to use in total at max, keep_free permitting */ + uint64_t min_use; /* how much disk space to use in total at least, even if keep_free says not to */ + uint64_t keep_free; /* how much to keep free on disk */ + uint64_t n_max_files; /* how many files to keep around at max */ +} JournalMetrics; + /* Traversal direction argument taken by journal_file_next_entry() and the journal_file_move_to_entry_*() family declared in this header. */ +typedef enum direction { + DIRECTION_UP, + DIRECTION_DOWN, + _DIRECTION_INVALID = -EINVAL, +} direction_t; + +typedef enum LocationType { + /* The first and last entries, resp. */ + LOCATION_HEAD, + LOCATION_TAIL, + + /* We already read the entry we currently point to, and the + * next one to read should probably not be this one again. */ + LOCATION_DISCRETE, + + /* We should seek to the precise location specified, and + * return it, as we haven't read it yet.
*/ + LOCATION_SEEK +} LocationType; + +typedef enum OfflineState { + OFFLINE_JOINED, + OFFLINE_SYNCING, + OFFLINE_OFFLINING, + OFFLINE_CANCEL, + OFFLINE_AGAIN_FROM_SYNCING, + OFFLINE_AGAIN_FROM_OFFLINING, + OFFLINE_DONE +} OfflineState; + +typedef struct JournalFile { + int fd; + MMapFileDescriptor *cache_fd; + + mode_t mode; + + int open_flags; + bool close_fd:1; /* set by journal_file_open() once the open succeeded and possession of the fd was taken */ + bool archive:1; /* set by journal_file_archive(); per the comment there, queues STATE_ARCHIVED for when the file is offlined */ + bool strict_order:1; + + direction_t last_direction; + LocationType location_type; + uint64_t last_n_entries; + + char *path; + struct stat last_stat; + usec_t last_stat_usec; + + Header *header; + HashItem *data_hash_table; + HashItem *field_hash_table; + + uint64_t current_offset; + uint64_t current_seqnum; + uint64_t current_realtime; + uint64_t current_monotonic; + sd_id128_t current_boot_id; + uint64_t current_xor_hash; + + JournalMetrics metrics; + + sd_event_source *post_change_timer; + usec_t post_change_timer_period; + + OrderedHashmap *chain_cache; + + pthread_t offline_thread; + volatile OfflineState offline_state; + + unsigned last_seen_generation; + + uint64_t compress_threshold_bytes; +#if HAVE_COMPRESSION + void *compress_buffer; +#endif + +#if HAVE_GCRYPT + gcry_md_hd_t hmac; + bool hmac_running; + + FSSHeader *fss_file; + size_t fss_file_size; + + uint64_t fss_start_usec; + uint64_t fss_interval_usec; + + void *fsprg_state; + size_t fsprg_state_size; + + void *fsprg_seed; + size_t fsprg_seed_size; +#endif + + /* When we insert this file into the per-boot priority queue 'newest_by_boot_id' in sd_journal, then by these keys */ + sd_id128_t newest_boot_id; + sd_id128_t newest_machine_id; + uint64_t newest_monotonic_usec; + uint64_t newest_realtime_usec; + unsigned newest_boot_id_prioq_idx; + usec_t newest_mtime; +} JournalFile; + /* Bit flags passed as the file_flags argument of journal_file_open(); _JOURNAL_FILE_FLAGS_MAX is the OR of all defined flags. */ +typedef enum JournalFileFlags { + JOURNAL_COMPRESS = 1 << 0, + JOURNAL_SEAL = 1 << 1, + JOURNAL_STRICT_ORDER = 1 << 2, + _JOURNAL_FILE_FLAGS_MAX = JOURNAL_COMPRESS|JOURNAL_SEAL|JOURNAL_STRICT_ORDER, +} JournalFileFlags; + +typedef struct { +
uint64_t object_offset; + uint64_t hash; +} EntryItem; + +int journal_file_open( + int fd, + const char *fname, + int open_flags, + JournalFileFlags file_flags, + mode_t mode, + uint64_t compress_threshold_bytes, + JournalMetrics *metrics, + MMapCache *mmap_cache, + JournalFile *template, + JournalFile **ret); + +int journal_file_set_offline_thread_join(JournalFile *f); +JournalFile* journal_file_close(JournalFile *j); +int journal_file_fstat(JournalFile *f); +DEFINE_TRIVIAL_CLEANUP_FUNC(JournalFile*, journal_file_close); + /* ALIGN64() rounds x up to the next multiple of 8; VALID64() is true if x is a multiple of 8. */ +#define ALIGN64(x) (((x) + 7ULL) & ~7ULL) +#define VALID64(x) (((x) & 7ULL) == 0ULL) + +/* Use six characters to cover the offsets common in smallish journal + * files without adding too many zeros. */ +#define OFSfmt "%06"PRIx64 + /* The three VALID_*() checks below all bound their argument by 2^55; only VALID_REALTIME() additionally rejects 0. */ +static inline bool VALID_REALTIME(uint64_t u) { + /* This considers timestamps until the year 3112 valid. That should be plenty room... */ + return u > 0 && u < (1ULL << 55); +} + +static inline bool VALID_MONOTONIC(uint64_t u) { + /* This considers timestamps until 1142 years of runtime valid. */ + return u < (1ULL << 55); +} + +static inline bool VALID_EPOCH(uint64_t u) { + /* This allows changing the key for 1142 years, every usec.
*/ + return u < (1ULL << 55); +} + /* True if the header_size recorded in header h is large enough to include the given Header member completely. */ +#define JOURNAL_HEADER_CONTAINS(h, field) \ + (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field)) + /* The predicates below each test one bit of the compatible_flags or incompatible_flags field of header h. */ +#define JOURNAL_HEADER_SEALED(h) \ + FLAGS_SET(le32toh((h)->compatible_flags), HEADER_COMPATIBLE_SEALED) + +#define JOURNAL_HEADER_SEALED_CONTINUOUS(h) \ + FLAGS_SET(le32toh((h)->compatible_flags), HEADER_COMPATIBLE_SEALED_CONTINUOUS) + +#define JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(h) \ + FLAGS_SET(le32toh((h)->compatible_flags), HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID) + +#define JOURNAL_HEADER_COMPRESSED_XZ(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_XZ) + +#define JOURNAL_HEADER_COMPRESSED_LZ4(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_LZ4) + +#define JOURNAL_HEADER_COMPRESSED_ZSTD(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPRESSED_ZSTD) + +#define JOURNAL_HEADER_KEYED_HASH(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_KEYED_HASH) + +#define JOURNAL_HEADER_COMPACT(h) \ + FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPACT) + +int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret); +int journal_file_pin_object(JournalFile *f, Object *o); +int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret); + +int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset); +int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset); + /* Returns the object offset stored in item i of entry object o, reading the 32-bit compact or the 64-bit regular item layout depending on the COMPACT flag of the file header. No bounds check on i is done here. */ +static inline uint64_t journal_file_entry_item_object_offset(JournalFile *f, Object *o, size_t i) { + assert(f); + assert(o); + return JOURNAL_HEADER_COMPACT(f->header) ? le32toh(o->entry.items.compact[i].object_offset) : + le64toh(o->entry.items.regular[i].object_offset); +} + /* Size in bytes of one entry item in the layout (compact or regular) used by this file. */ +static inline size_t journal_file_entry_item_size(JournalFile *f) { + assert(f); + return JOURNAL_HEADER_COMPACT(f->header) ?
sizeof_field(Object, entry.items.compact[0]) : + sizeof_field(Object, entry.items.regular[0]); +} + +uint64_t journal_file_entry_n_items(JournalFile *f, Object *o) _pure_; + +int journal_file_data_payload( + JournalFile *f, + Object *o, + uint64_t offset, + const char *field, + size_t field_length, + size_t data_threshold, + void **ret_data, + size_t *ret_size); + /* Byte offset of the payload member within a data object, which differs between the compact and the regular layout. Unlike its neighbours this helper does not assert that f is non-NULL. */ +static inline size_t journal_file_data_payload_offset(JournalFile *f) { + return JOURNAL_HEADER_COMPACT(f->header) + ? offsetof(Object, data.compact.payload) + : offsetof(Object, data.regular.payload); +} + /* Pointer to the payload member of data object o in the layout used by this file. */ +static inline uint8_t* journal_file_data_payload_field(JournalFile *f, Object *o) { + return JOURNAL_HEADER_COMPACT(f->header) ? o->data.compact.payload : o->data.regular.payload; +} + +uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_; + /* Returns item i of entry array object o converted from little endian: 32-bit items in compact files, 64-bit items otherwise. No bounds check on i is done here. */ +static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) { + assert(f); + assert(o); + return JOURNAL_HEADER_COMPACT(f->header) ? le32toh(o->entry_array.items.compact[i]) : + le64toh(o->entry_array.items.regular[i]); +} + /* Size in bytes of one entry array item: sizeof(le32_t) in compact files, sizeof(le64_t) otherwise. */ +static inline size_t journal_file_entry_array_item_size(JournalFile *f) { + assert(f); + return JOURNAL_HEADER_COMPACT(f->header) ?
sizeof(le32_t) : sizeof(le64_t); +} + +uint64_t journal_file_hash_table_n_items(Object *o) _pure_; + +int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret_object, uint64_t *ret_offset); +int journal_file_append_entry( + JournalFile *f, + const dual_timestamp *ts, + const sd_id128_t *boot_id, + const struct iovec iovec[], + size_t n_iovec, + uint64_t *seqnum, + sd_id128_t *seqnum_id, + Object **ret_object, + uint64_t *ret_offset); + /* Lookup of data and field objects by content; the *_with_hash variants take the hash value as an additional argument. */ +int journal_file_find_data_object(JournalFile *f, const void *data, uint64_t size, Object **ret_object, uint64_t *ret_offset); +int journal_file_find_data_object_with_hash(JournalFile *f, const void *data, uint64_t size, uint64_t hash, Object **ret_object, uint64_t *ret_offset); + +int journal_file_find_field_object(JournalFile *f, const void *field, uint64_t size, Object **ret_object, uint64_t *ret_offset); +int journal_file_find_field_object_with_hash(JournalFile *f, const void *field, uint64_t size, uint64_t hash, Object **ret_object, uint64_t *ret_offset); + +void journal_file_reset_location(JournalFile *f); +void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset); +int journal_file_next_entry(JournalFile *f, uint64_t p, direction_t direction, Object **ret_object, uint64_t *ret_offset); + /* Entry lookups keyed by offset, sequence number, realtime timestamp, or boot ID plus monotonic timestamp (judging by the parameter names; semantics are defined in journal-file.c). */ +int journal_file_move_to_entry_by_offset(JournalFile *f, uint64_t p, direction_t direction, Object **ret_object, uint64_t *ret_offset); +int journal_file_move_to_entry_by_seqnum(JournalFile *f, uint64_t seqnum, direction_t direction, Object **ret_object, uint64_t *ret_offset); +int journal_file_move_to_entry_by_realtime(JournalFile *f, uint64_t realtime, direction_t direction, Object **ret_object, uint64_t *ret_offset); +int journal_file_move_to_entry_by_monotonic(JournalFile *f, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret_object, uint64_t *ret_offset); + +int journal_file_move_to_entry_for_data(JournalFile *f, Object *d, direction_t direction, Object
**ret_object, uint64_t *ret_offset); + +int journal_file_move_to_entry_by_offset_for_data(JournalFile *f, Object *d, uint64_t p, direction_t direction, Object **ret_object, uint64_t *ret_offset); +int journal_file_move_to_entry_by_seqnum_for_data(JournalFile *f, Object *d, uint64_t seqnum, direction_t direction, Object **ret_object, uint64_t *ret_offset); +int journal_file_move_to_entry_by_realtime_for_data(JournalFile *f, Object *d, uint64_t realtime, direction_t direction, Object **ret_object, uint64_t *ret_offset); +int journal_file_move_to_entry_by_monotonic_for_data(JournalFile *f, Object *d, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret_object, uint64_t *ret_offset); + +int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, sd_id128_t *seqnum_id); + +void journal_file_dump(JournalFile *f); +void journal_file_print_header(JournalFile *f); + +int journal_file_archive(JournalFile *f, char **ret_previous_path); +int journal_file_parse_uid_from_filename(const char *path, uid_t *uid); +JournalFile* journal_initiate_close(JournalFile *f, Set *deferred_closes); + +int journal_file_dispose(int dir_fd, const char *fname); + +void journal_file_post_change(JournalFile *f); +int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t); + +void journal_reset_metrics(JournalMetrics *m); + +int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *ret_from, usec_t *ret_to); +int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot, usec_t *ret_from, usec_t *ret_to); + +bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level); + +int journal_file_map_data_hash_table(JournalFile *f); +int journal_file_map_field_hash_table(JournalFile *f); + /* Maps the compression bits of the header incompatible flags to a Compression value. The flags are tested in the order XZ, LZ4, ZSTD and the first one set wins; COMPRESSION_NONE is returned if none is set. */ +static inline Compression JOURNAL_FILE_COMPRESSION(JournalFile *f) { + assert(f); + + if (JOURNAL_HEADER_COMPRESSED_XZ(f->header)) + return COMPRESSION_XZ; + if
(JOURNAL_HEADER_COMPRESSED_LZ4(f->header)) + return COMPRESSION_LZ4; + if (JOURNAL_HEADER_COMPRESSED_ZSTD(f->header)) + return COMPRESSION_ZSTD; + return COMPRESSION_NONE; +} + +uint64_t journal_file_hash_data(JournalFile *f, const void *data, size_t sz); + +bool journal_field_valid(const char *p, size_t l, bool allow_protected); + +const char* journal_object_type_to_string(ObjectType type) _const_; + /* Decodes the compression bits (masked with _OBJECT_COMPRESSED_MASK) of the flags of object o. Returns _COMPRESSION_INVALID for any masked value that is neither 0 nor exactly one of the three known flags. */ +static inline Compression COMPRESSION_FROM_OBJECT(const Object *o) { + assert(o); + + switch (o->object.flags & _OBJECT_COMPRESSED_MASK) { + case 0: + return COMPRESSION_NONE; + case OBJECT_COMPRESSED_XZ: + return COMPRESSION_XZ; + case OBJECT_COMPRESSED_LZ4: + return COMPRESSION_LZ4; + case OBJECT_COMPRESSED_ZSTD: + return COMPRESSION_ZSTD; + default: + return _COMPRESSION_INVALID; + } +} + /* Reverse mapping: the object flag for compression type c, or 0 for anything other than XZ, LZ4 or ZSTD. */ +static inline uint8_t COMPRESSION_TO_OBJECT_FLAG(Compression c) { + switch (c) { + case COMPRESSION_XZ: + return OBJECT_COMPRESSED_XZ; + case COMPRESSION_LZ4: + return OBJECT_COMPRESSED_LZ4; + case COMPRESSION_ZSTD: + return OBJECT_COMPRESSED_ZSTD; + default: + return 0; + } +} + /* The header incompatible flag for compression type c, or 0 for anything other than XZ, LZ4 or ZSTD. */ +static inline uint32_t COMPRESSION_TO_HEADER_INCOMPATIBLE_FLAG(Compression c) { + switch (c) { + case COMPRESSION_XZ: + return HEADER_INCOMPATIBLE_COMPRESSED_XZ; + case COMPRESSION_LZ4: + return HEADER_INCOMPATIBLE_COMPRESSED_LZ4; + case COMPRESSION_ZSTD: + return HEADER_INCOMPATIBLE_COMPRESSED_ZSTD; + default: + return 0; + } +} + /* A file counts as writable unless the access mode in its open_flags is O_RDONLY. */ +static inline bool journal_file_writable(JournalFile *f) { + assert(f); + return (f->open_flags & O_ACCMODE) != O_RDONLY; +} diff --git a/src/libsystemd/sd-journal/journal-internal.h b/src/libsystemd/sd-journal/journal-internal.h new file mode 100644 index 0000000..259aac8 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-internal.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-id128.h" +#include "sd-journal.h" + +#include "hashmap.h" +#include "journal-def.h" +#include "journal-file.h"
+#include "list.h" +#include "set.h" + +#define JOURNAL_FILES_MAX 7168u + /* Rate limit passed to log_ratelimit_full() by the journal code: interval of 60 s, burst of 3. */ +#define JOURNAL_LOG_RATELIMIT ((const RateLimit) { .interval = 60 * USEC_PER_SEC, .burst = 3 }) + +typedef struct Match Match; +typedef struct Location Location; +typedef struct Directory Directory; + +typedef enum MatchType { + MATCH_DISCRETE, + MATCH_OR_TERM, + MATCH_AND_TERM +} MatchType; + /* Node of a match tree: depending on type either a concrete match (data, size, hash) or a term that carries a list of child matches. */ +struct Match { + MatchType type; + Match *parent; + LIST_FIELDS(Match, matches); + + /* For concrete matches */ + char *data; + size_t size; + uint64_t hash; /* old-style jenkins hash. New-style siphash is different per file, hence won't be cached here */ + + /* For terms */ + LIST_HEAD(Match, matches); +}; + /* NOTE(review): the *_set bits presumably flag which of the value fields further down are valid; confirm against the users of this struct. */ +struct Location { + LocationType type; + + bool seqnum_set:1; + bool realtime_set:1; + bool monotonic_set:1; + bool xor_hash_set:1; + + uint64_t seqnum; + sd_id128_t seqnum_id; + + uint64_t realtime; + + uint64_t monotonic; + sd_id128_t boot_id; + + uint64_t xor_hash; +}; + +struct Directory { + char *path; + int wd; + bool is_root; + unsigned last_seen_generation; +}; + +struct sd_journal { + int toplevel_fd; + + char *path; + char *prefix; + char *namespace; + + OrderedHashmap *files; + IteratedCache *files_cache; + MMapCache *mmap; + Hashmap *newest_by_boot_id; /* key: boot_id, value: prioq, ordered by monotonic timestamp of last update */ + + Location current_location; + + JournalFile *current_file; + uint64_t current_field; + + Match *level0, *level1, *level2; + + uint64_t origin_id; + + int inotify_fd; + unsigned current_invalidate_counter, last_invalidate_counter; + usec_t last_process_usec; + unsigned generation; + + /* Iterating through unique fields and their data values */ + char *unique_field; + JournalFile *unique_file; + uint64_t unique_offset; + + /* Iterating through known fields */ + JournalFile *fields_file; + uint64_t fields_offset; + uint64_t fields_hash_table_index; + char *fields_buffer; + + int flags; + + bool on_network:1; + bool no_new_files:1; + bool no_inotify:1; +
bool unique_file_lost:1; /* File we were iterating over got + removed, and there were no more + files, so sd_j_enumerate_unique + will return a value equal to 0. */ + bool fields_file_lost:1; + bool has_runtime_files:1; + bool has_persistent_files:1; + + size_t data_threshold; + + Hashmap *directories_by_path; + Hashmap *directories_by_wd; + + Hashmap *errors; +}; + +char *journal_make_match_string(sd_journal *j); +void journal_print_header(sd_journal *j); + +#define JOURNAL_FOREACH_DATA_RETVAL(j, data, l, retval) \ + for (sd_journal_restart_data(j); ((retval) = sd_journal_enumerate_data((j), &(data), &(l))) > 0; ) + +/* All errors that we might encounter while extracting a field that are not real errors, + * but only mean that the field is too large or we don't support the compression. */ +static inline bool JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(int r) { + return IN_SET(abs(r), + ENOBUFS, /* Field or decompressed field too large */ + E2BIG, /* Field too large for pointer width */ + EPROTONOSUPPORT); /* Unsupported compression */ +} diff --git a/src/libsystemd/sd-journal/journal-send.c b/src/libsystemd/sd-journal/journal-send.c new file mode 100644 index 0000000..be23b2f --- /dev/null +++ b/src/libsystemd/sd-journal/journal-send.c @@ -0,0 +1,576 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +# include +#endif + +#define SD_JOURNAL_SUPPRESS_LOCATION + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "iovec-util.h" +#include "journal-send.h" +#include "memfd-util.h" +#include "missing_syscall.h" +#include "process-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tmpfile-util.h" + +#define SNDBUF_SIZE (8*1024*1024) + +#define ALLOCA_CODE_FUNC(f, func) \ + do { \ + size_t _fl; \ + const char *_func = (func); \ + char **_f = 
&(f); \ + _fl = strlen(_func) + 1; \ + *_f = newa(char, _fl + 10); \ + memcpy(*_f, "CODE_FUNC=", 10); \ + memcpy(*_f + 10, _func, _fl); \ + } while (false) + +/* We open a single fd, and we'll share it with the current process, + * all its threads, and all its subprocesses. This means we need to + * initialize it atomically, and need to operate on it atomically + * never assuming we are the only user */ +static int fd_plus_one = 0; + +static int journal_fd(void) { + int fd; + +retry: + if (fd_plus_one > 0) + return fd_plus_one - 1; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + fd_inc_sndbuf(fd, SNDBUF_SIZE); + + if (!__atomic_compare_exchange_n(&fd_plus_one, &(int){0}, fd+1, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { + safe_close(fd); + goto retry; + } + + return fd; +} + +int journal_fd_nonblock(bool nonblock) { + int r; + + r = journal_fd(); + if (r < 0) + return r; + + return fd_nonblock(r, nonblock); +} + +void close_journal_fd(void) { +#if HAVE_VALGRIND_VALGRIND_H + /* Be nice to valgrind. This is not atomic, so it is useful mainly for debugging. */ + + if (!RUNNING_ON_VALGRIND) + return; + + if (getpid_cached() != gettid()) + return; + + if (fd_plus_one <= 0) + return; + + safe_close(fd_plus_one - 1); + fd_plus_one = 0; +#endif +} + +_public_ int sd_journal_print(int priority, const char *format, ...) 
{ + int r; + va_list ap; + + va_start(ap, format); + r = sd_journal_printv(priority, format, ap); + va_end(ap); + + return r; +} + +_public_ int sd_journal_printv(int priority, const char *format, va_list ap) { + char p[STRLEN("PRIORITY=") + DECIMAL_STR_MAX(int) + 1]; + char sbuf[LINE_MAX + 8] = "MESSAGE="; + struct iovec iov[2]; + int len; + va_list aq; + char *buffer = sbuf; + + assert_return(priority >= 0, -EINVAL); + assert_return(priority <= 7, -EINVAL); + assert_return(format, -EINVAL); + + xsprintf(p, "PRIORITY=%i", priority & LOG_PRIMASK); + + va_copy(aq, ap); + len = vsnprintf(buffer + 8, LINE_MAX, format, aq); + va_end(aq); + + if (len >= (int)LONG_LINE_MAX - 8) + return -ENOBUFS; + + /* Allocate large buffer to accommodate big message */ + if (len >= LINE_MAX) { + buffer = alloca_safe(len + 9); + memcpy(buffer, "MESSAGE=", 8); + assert_se(vsnprintf(buffer + 8, len + 1, format, ap) == len); + } + + /* Strip trailing whitespace, keep prefix whitespace. */ + (void) strstrip(buffer); + + /* Suppress empty lines */ + if (isempty(buffer + 8)) + return 0; + + iov[0] = IOVEC_MAKE_STRING(buffer); + iov[1] = IOVEC_MAKE_STRING(p); + + return sd_journal_sendv(iov, 2); +} + +_printf_(1, 0) static int fill_iovec_sprintf( + const char *format, + va_list ap, + size_t extra, + struct iovec **ret_iov, + size_t *ret_n_iov) { + + PROTECT_ERRNO; + struct iovec *iov = NULL; + size_t n = 0; + + assert(ret_iov); + assert(ret_n_iov); + + if (extra > 0) { + if (!GREEDY_REALLOC0(iov, extra)) + return -ENOMEM; + + n = extra; + } + + CLEANUP_ARRAY(iov, n, iovec_array_free); + + while (format) { + _cleanup_free_ char *buffer = NULL; + va_list aq; + + va_copy(aq, ap); + if (vasprintf(&buffer, format, aq) < 0) { + va_end(aq); + return -ENOMEM; + } + va_end(aq); + + VA_FORMAT_ADVANCE(format, ap); + format = va_arg(ap, char *); + + if (!GREEDY_REALLOC(iov, n + 1)) + return -ENOMEM; + + /* strip trailing whitespace, keep prefixing whitespace */ + iov[n++] = 
IOVEC_MAKE_STRING(delete_trailing_chars(TAKE_PTR(buffer), NULL)); + } + + *ret_iov = TAKE_PTR(iov); + *ret_n_iov = n; + return 0; +} + +_public_ int sd_journal_send(const char *format, ...) { + struct iovec *iov = NULL; + size_t n_iov = 0; + va_list ap; + int r; + + CLEANUP_ARRAY(iov, n_iov, iovec_array_free); + + va_start(ap, format); + r = fill_iovec_sprintf(format, ap, 0, &iov, &n_iov); + va_end(ap); + if (r < 0) + return r; + + return sd_journal_sendv(iov, n_iov); +} + +_public_ int sd_journal_sendv(const struct iovec *iov, int n) { + PROTECT_ERRNO; + int fd, r; + _cleanup_close_ int buffer_fd = -EBADF; + struct iovec *w; + uint64_t *l; + int i, j = 0; + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/journal/socket", + }; + struct msghdr mh = { + .msg_name = (struct sockaddr*) &sa.sa, + .msg_namelen = SOCKADDR_UN_LEN(sa.un), + }; + ssize_t k; + bool have_syslog_identifier = false; + bool seal = true; + + assert_return(iov, -EINVAL); + assert_return(n > 0, -EINVAL); + + w = newa(struct iovec, n * 5 + 3); + l = newa(uint64_t, n); + + for (i = 0; i < n; i++) { + char *c, *nl; + + if (_unlikely_(!iov[i].iov_base || iov[i].iov_len <= 1)) + return -EINVAL; + + c = memchr(iov[i].iov_base, '=', iov[i].iov_len); + if (_unlikely_(!c || c == iov[i].iov_base)) + return -EINVAL; + + have_syslog_identifier = have_syslog_identifier || + (c == (char *) iov[i].iov_base + 17 && + startswith(iov[i].iov_base, "SYSLOG_IDENTIFIER")); + + nl = memchr(iov[i].iov_base, '\n', iov[i].iov_len); + if (nl) { + if (_unlikely_(nl < c)) + return -EINVAL; + + /* Already includes a newline? 
Bummer, then + * let's write the variable name, then a + * newline, then the size (64-bit LE), followed + * by the data and a final newline */ + + w[j++] = IOVEC_MAKE(iov[i].iov_base, c - (char*) iov[i].iov_base); + w[j++] = IOVEC_MAKE_STRING("\n"); + + l[i] = htole64(iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1); + w[j++] = IOVEC_MAKE(&l[i], sizeof(uint64_t)); + + w[j++] = IOVEC_MAKE(c + 1, iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1); + } else + /* Nothing special? Then just add the line and + * append a newline */ + w[j++] = iov[i]; + + w[j++] = IOVEC_MAKE_STRING("\n"); + } + + if (!have_syslog_identifier && + string_is_safe(program_invocation_short_name)) { + + /* Implicitly add program_invocation_short_name, if it + * is not set explicitly. We only do this for + * program_invocation_short_name, and nothing else + * since everything else is much nicer to retrieve + * from the outside. */ + + w[j++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER="); + w[j++] = IOVEC_MAKE_STRING(program_invocation_short_name); + w[j++] = IOVEC_MAKE_STRING("\n"); + } + + fd = journal_fd(); + if (_unlikely_(fd < 0)) + return fd; + + mh.msg_iov = w; + mh.msg_iovlen = j; + + k = sendmsg(fd, &mh, MSG_NOSIGNAL); + if (k >= 0) + return 0; + + /* Fail silently if the journal is not available */ + if (errno == ENOENT) + return 0; + + if (!IN_SET(errno, EMSGSIZE, ENOBUFS, EAGAIN)) + return -errno; + + /* Message doesn't fit... Let's dump the data in a memfd or + * temporary file and just pass a file descriptor of it to the + * other side. + * + * For the temporary files we use /dev/shm instead of /tmp + * here, since we want this to be a tmpfs, and one that is + * available from early boot on and where unprivileged users + * can create files. 
*/ + buffer_fd = memfd_new(NULL); + if (buffer_fd < 0) { + if (buffer_fd == -ENOSYS) { + buffer_fd = open_tmpfile_unlinkable("/dev/shm", O_RDWR | O_CLOEXEC); + if (buffer_fd < 0) + return buffer_fd; + + seal = false; + } else + return buffer_fd; + } + + n = writev(buffer_fd, w, j); + if (n < 0) + return -errno; + + if (seal) { + r = memfd_set_sealed(buffer_fd); + if (r < 0) + return r; + } + + r = send_one_fd_sa(fd, buffer_fd, mh.msg_name, mh.msg_namelen, 0); + if (r == -ENOENT) + /* Fail silently if the journal is not available */ + return 0; + return r; +} + +static int fill_iovec_perror_and_send(const char *message, int skip, struct iovec iov[]) { + PROTECT_ERRNO; + size_t n, k; + + k = isempty(message) ? 0 : strlen(message) + 2; + n = 8 + k + 256 + 1; + + for (;;) { + char buffer[n]; + char* j; + + errno = 0; + j = strerror_r(_saved_errno_, buffer + 8 + k, n - 8 - k); + if (errno == 0) { + char error[STRLEN("ERRNO=") + DECIMAL_STR_MAX(int) + 1]; + + if (j != buffer + 8 + k) + memmove(buffer + 8 + k, j, strlen(j)+1); + + memcpy(buffer, "MESSAGE=", 8); + + if (k > 0) { + memcpy(buffer + 8, message, k - 2); + memcpy(buffer + 8 + k - 2, ": ", 2); + } + + xsprintf(error, "ERRNO=%i", _saved_errno_); + + assert_cc(3 == LOG_ERR); + iov[skip+0] = IOVEC_MAKE_STRING("PRIORITY=3"); + iov[skip+1] = IOVEC_MAKE_STRING(buffer); + iov[skip+2] = IOVEC_MAKE_STRING(error); + + return sd_journal_sendv(iov, skip + 3); + } + + if (errno != ERANGE) + return -errno; + + n *= 2; + } +} + +_public_ int sd_journal_perror(const char *message) { + struct iovec iovec[3]; + + return fill_iovec_perror_and_send(message, 0, iovec); +} + +_public_ int sd_journal_stream_fd(const char *identifier, int priority, int level_prefix) { + _cleanup_close_ int fd = -EBADF; + char *header; + size_t l; + int r; + + assert_return(priority >= 0, -EINVAL); + assert_return(priority <= 7, -EINVAL); + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + r = connect_unix_path(fd, 
AT_FDCWD, "/run/systemd/journal/stdout"); + if (r < 0) + return r; + + if (shutdown(fd, SHUT_RD) < 0) + return -errno; + + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); + + identifier = strempty(identifier); + + l = strlen(identifier); + header = newa(char, l + 1 + 1 + 2 + 2 + 2 + 2 + 2); + + memcpy(header, identifier, l); + header[l++] = '\n'; + header[l++] = '\n'; /* unit id */ + header[l++] = '0' + priority; + header[l++] = '\n'; + header[l++] = '0' + !!level_prefix; + header[l++] = '\n'; + header[l++] = '0'; + header[l++] = '\n'; + header[l++] = '0'; + header[l++] = '\n'; + header[l++] = '0'; + header[l++] = '\n'; + + r = loop_write(fd, header, l); + if (r < 0) + return r; + + return TAKE_FD(fd); +} + +_public_ int sd_journal_print_with_location(int priority, const char *file, const char *line, const char *func, const char *format, ...) { + int r; + va_list ap; + + va_start(ap, format); + r = sd_journal_printv_with_location(priority, file, line, func, format, ap); + va_end(ap); + + return r; +} + +_public_ int sd_journal_printv_with_location(int priority, const char *file, const char *line, const char *func, const char *format, va_list ap) { + char p[STRLEN("PRIORITY=") + DECIMAL_STR_MAX(int) + 1]; + char sbuf[LINE_MAX + 8] = "MESSAGE="; + struct iovec iov[5]; + char *f; + int len; + char *buffer = sbuf; + va_list aq; + + assert_return(priority >= 0, -EINVAL); + assert_return(priority <= 7, -EINVAL); + assert_return(format, -EINVAL); + + xsprintf(p, "PRIORITY=%i", priority & LOG_PRIMASK); + + va_copy(aq, ap); + len = vsnprintf(buffer + 8, LINE_MAX, format, aq); + va_end(aq); + + if (len >= (int)LONG_LINE_MAX - 8) + return -ENOBUFS; + + /* Allocate large buffer to accommodate big message */ + if (len >= LINE_MAX) { + buffer = alloca_safe(len + 9); + memcpy(buffer, "MESSAGE=", 8); + assert_se(vsnprintf(buffer + 8, len + 1, format, ap) == len); + } + + /* Strip trailing whitespace, keep prefixing whitespace */ + (void) strstrip(buffer); + + /* Suppress empty lines */ + 
if (isempty(buffer + 8)) + return 0; + + /* func is initialized from __func__ which is not a macro, but + * a static const char[], hence cannot easily be prefixed with + * CODE_FUNC=, hence let's do it manually here. */ + ALLOCA_CODE_FUNC(f, func); + + iov[0] = IOVEC_MAKE_STRING(buffer); + iov[1] = IOVEC_MAKE_STRING(p); + iov[2] = IOVEC_MAKE_STRING(file); + iov[3] = IOVEC_MAKE_STRING(line); + iov[4] = IOVEC_MAKE_STRING(f); + + return sd_journal_sendv(iov, ELEMENTSOF(iov)); +} + +_public_ int sd_journal_send_with_location(const char *file, const char *line, const char *func, const char *format, ...) { + struct iovec *iov = NULL; + size_t n_iov = 0; + va_list ap; + char *f; + int r; + + CLEANUP_ARRAY(iov, n_iov, iovec_array_free); + + va_start(ap, format); + r = fill_iovec_sprintf(format, ap, 3, &iov, &n_iov); + va_end(ap); + if (r < 0) + return r; + + ALLOCA_CODE_FUNC(f, func); + + iov[0] = IOVEC_MAKE_STRING(file); + iov[1] = IOVEC_MAKE_STRING(line); + iov[2] = IOVEC_MAKE_STRING(f); + + r = sd_journal_sendv(iov, n_iov); + + iov[0] = iov[1] = iov[2] = (struct iovec) {}; + + return r; +} + +_public_ int sd_journal_sendv_with_location( + const char *file, const char *line, + const char *func, + const struct iovec *iov, int n) { + + struct iovec *niov; + char *f; + + assert_return(iov, -EINVAL); + assert_return(n > 0, -EINVAL); + + niov = newa(struct iovec, n + 3); + memcpy(niov, iov, sizeof(struct iovec) * n); + + ALLOCA_CODE_FUNC(f, func); + + niov[n++] = IOVEC_MAKE_STRING(file); + niov[n++] = IOVEC_MAKE_STRING(line); + niov[n++] = IOVEC_MAKE_STRING(f); + + return sd_journal_sendv(niov, n); +} + +_public_ int sd_journal_perror_with_location( + const char *file, const char *line, + const char *func, + const char *message) { + + struct iovec iov[6]; + char *f; + + ALLOCA_CODE_FUNC(f, func); + + iov[0] = IOVEC_MAKE_STRING(file); + iov[1] = IOVEC_MAKE_STRING(line); + iov[2] = IOVEC_MAKE_STRING(f); + + return fill_iovec_perror_and_send(message, 3, iov); +} diff --git 
a/src/libsystemd/sd-journal/journal-send.h b/src/libsystemd/sd-journal/journal-send.h new file mode 100644 index 0000000..24315e2 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-send.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int journal_fd_nonblock(bool nonblock); +void close_journal_fd(void); diff --git a/src/libsystemd/sd-journal/journal-vacuum.c b/src/libsystemd/sd-journal/journal-vacuum.c new file mode 100644 index 0000000..829edb3 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-vacuum.c @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "journal-vacuum.h" +#include "sort-util.h" +#include "string-util.h" +#include "time-util.h" +#include "xattr-util.h" + +typedef struct vacuum_info { + uint64_t usage; + char *filename; + + uint64_t realtime; + + sd_id128_t seqnum_id; + uint64_t seqnum; + bool have_seqnum; +} vacuum_info; + +static int vacuum_info_compare(const vacuum_info *a, const vacuum_info *b) { + int r; + + if (a->have_seqnum && b->have_seqnum && + sd_id128_equal(a->seqnum_id, b->seqnum_id)) + return CMP(a->seqnum, b->seqnum); + + r = CMP(a->realtime, b->realtime); + if (r != 0) + return r; + + if (a->have_seqnum && b->have_seqnum) + return memcmp(&a->seqnum_id, &b->seqnum_id, 16); + + return strcmp(a->filename, b->filename); +} + +static void vacuum_info_array_free(vacuum_info *list, size_t n) { + if (!list) + return; + + FOREACH_ARRAY(i, list, n) + free(i->filename); + + free(list); +} + +static void patch_realtime( + int fd, + const char *fn, + const struct stat *st, + unsigned long long *realtime) { + + usec_t x; + + /* The timestamp was determined by the file name, but let's see if the file might 
actually be older + * than the file name suggested... */ + + assert(fd >= 0); + assert(fn); + assert(st); + assert(realtime); + + x = timespec_load(&st->st_ctim); + if (timestamp_is_set(x) && x < *realtime) + *realtime = x; + + x = timespec_load(&st->st_atim); + if (timestamp_is_set(x) && x < *realtime) + *realtime = x; + + x = timespec_load(&st->st_mtim); + if (timestamp_is_set(x) && x < *realtime) + *realtime = x; + + /* Let's read the original creation time, if possible. Ideally we'd just query the creation time the + * FS might provide, but unfortunately there's currently no sane API to query it. Hence let's + * implement this manually... */ + + if (fd_getcrtime_at(fd, fn, AT_SYMLINK_FOLLOW, &x) >= 0 && x < *realtime) + *realtime = x; +} + +static int journal_file_empty(int dir_fd, const char *name) { + _cleanup_close_ int fd = -EBADF; + struct stat st; + le64_t n_entries; + ssize_t n; + + fd = openat(dir_fd, name, O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NONBLOCK|O_NOATIME); + if (fd < 0) { + /* Maybe failed due to O_NOATIME and lack of privileges? 
*/ + fd = openat(dir_fd, name, O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NONBLOCK); + if (fd < 0) + return -errno; + } + + if (fstat(fd, &st) < 0) + return -errno; + + /* If an offline file doesn't even have a header we consider it empty */ + if (st.st_size < (off_t) sizeof(Header)) + return 1; + + /* If the number of entries is empty, we consider it empty, too */ + n = pread(fd, &n_entries, sizeof(n_entries), offsetof(Header, n_entries)); + if (n < 0) + return -errno; + if (n != sizeof(n_entries)) + return -EIO; + + return le64toh(n_entries) <= 0; +} + +int journal_directory_vacuum( + const char *directory, + uint64_t max_use, + uint64_t n_max_files, + usec_t max_retention_usec, + usec_t *oldest_usec, + bool verbose) { + + uint64_t sum = 0, freed = 0, n_active_files = 0; + size_t n_list = 0, i; + _cleanup_closedir_ DIR *d = NULL; + vacuum_info *list = NULL; + usec_t retention_limit = 0; + int r; + + CLEANUP_ARRAY(list, n_list, vacuum_info_array_free); + + assert(directory); + + if (max_use <= 0 && max_retention_usec <= 0 && n_max_files <= 0) + return 0; + + if (max_retention_usec > 0) + retention_limit = usec_sub_unsigned(now(CLOCK_REALTIME), max_retention_usec); + + d = opendir(directory); + if (!d) + return -errno; + + FOREACH_DIRENT_ALL(de, d, return -errno) { + unsigned long long seqnum = 0, realtime; + _cleanup_free_ char *p = NULL; + sd_id128_t seqnum_id; + bool have_seqnum; + uint64_t size; + struct stat st; + size_t q; + + if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) { + log_debug_errno(errno, "Failed to stat file %s while vacuuming, ignoring: %m", de->d_name); + continue; + } + + if (!S_ISREG(st.st_mode)) + continue; + + size = 512UL * (uint64_t) st.st_blocks; + + q = strlen(de->d_name); + + if (endswith(de->d_name, ".journal")) { + + /* Vacuum archived files. 
Active files are + * left around */ + + if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8) { + n_active_files++; + sum += size; + continue; + } + + if (de->d_name[q-8-16-1] != '-' || + de->d_name[q-8-16-1-16-1] != '-' || + de->d_name[q-8-16-1-16-1-32-1] != '@') { + n_active_files++; + sum += size; + continue; + } + + p = strdup(de->d_name); + if (!p) + return -ENOMEM; + + de->d_name[q-8-16-1-16-1] = 0; + if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) { + n_active_files++; + sum += size; + continue; + } + + if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) { + n_active_files++; + sum += size; + continue; + } + + have_seqnum = true; + + } else if (endswith(de->d_name, ".journal~")) { + unsigned long long tmp; + + /* seqnum_id won't be initialised before use below, so set to 0 */ + seqnum_id = SD_ID128_NULL; + + /* Vacuum corrupted files */ + + if (q < 1 + 16 + 1 + 16 + 8 + 1) { + n_active_files++; + sum += size; + continue; + } + + if (de->d_name[q-1-8-16-1] != '-' || + de->d_name[q-1-8-16-1-16-1] != '@') { + n_active_files++; + sum += size; + continue; + } + + p = strdup(de->d_name); + if (!p) + return -ENOMEM; + + if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) { + n_active_files++; + sum += size; + continue; + } + + have_seqnum = false; + } else { + /* We do not vacuum unknown files! */ + log_debug("Not vacuuming unknown file %s.", de->d_name); + continue; + } + + r = journal_file_empty(dirfd(d), p); + if (r < 0) { + log_debug_errno(r, "Failed check if %s is empty, ignoring: %m", p); + continue; + } + if (r > 0) { + /* Always vacuum empty non-online files. */ + + r = unlinkat_deallocate(dirfd(d), p, 0); + if (r >= 0) { + + log_full(verbose ? 
LOG_INFO : LOG_DEBUG, + "Deleted empty archived journal %s/%s (%s).", directory, p, FORMAT_BYTES(size)); + + freed += size; + } else if (r != -ENOENT) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to delete empty archived journal %s/%s: %m", + directory, p); + + continue; + } + + patch_realtime(dirfd(d), p, &st, &realtime); + + if (!GREEDY_REALLOC(list, n_list + 1)) + return -ENOMEM; + + list[n_list++] = (vacuum_info) { + .filename = TAKE_PTR(p), + .usage = size, + .seqnum = seqnum, + .realtime = realtime, + .seqnum_id = seqnum_id, + .have_seqnum = have_seqnum, + }; + + sum += size; + } + + typesafe_qsort(list, n_list, vacuum_info_compare); + + for (i = 0; i < n_list; i++) { + uint64_t left; + + left = n_active_files + n_list - i; + + if ((max_retention_usec <= 0 || list[i].realtime >= retention_limit) && + (max_use <= 0 || sum <= max_use) && + (n_max_files <= 0 || left <= n_max_files)) + break; + + r = unlinkat_deallocate(dirfd(d), list[i].filename, 0); + if (r >= 0) { + log_full(verbose ? LOG_INFO : LOG_DEBUG, "Deleted archived journal %s/%s (%s).", + directory, list[i].filename, FORMAT_BYTES(list[i].usage)); + freed += list[i].usage; + + if (list[i].usage < sum) + sum -= list[i].usage; + else + sum = 0; + + } else if (r != -ENOENT) + log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, + "Failed to delete archived journal %s/%s: %m", + directory, list[i].filename); + } + + if (oldest_usec && i < n_list && (*oldest_usec == 0 || list[i].realtime < *oldest_usec)) + *oldest_usec = list[i].realtime; + + log_full(verbose ? 
LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals from %s.", + FORMAT_BYTES(freed), directory); + + return 0; +} diff --git a/src/libsystemd/sd-journal/journal-vacuum.h b/src/libsystemd/sd-journal/journal-vacuum.h new file mode 100644 index 0000000..d87c847 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-vacuum.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "time-util.h" + +int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t n_max_files, usec_t max_retention_usec, usec_t *oldest_usec, bool verbose); diff --git a/src/libsystemd/sd-journal/journal-verify.c b/src/libsystemd/sd-journal/journal-verify.c new file mode 100644 index 0000000..bdaa01d --- /dev/null +++ b/src/libsystemd/sd-journal/journal-verify.c @@ -0,0 +1,1436 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "compress.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "journal-authenticate.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-verify.h" +#include "lookup3.h" +#include "macro.h" +#include "terminal-util.h" +#include "tmpfile-util.h" + +static void draw_progress(uint64_t p, usec_t *last_usec) { + unsigned n, i, j, k; + usec_t z, x; + + if (!on_tty()) + return; + + z = now(CLOCK_MONOTONIC); + x = *last_usec; + + if (x != 0 && x + 40 * USEC_PER_MSEC > z) + return; + + *last_usec = z; + + n = (3 * columns()) / 4; + j = (n * (unsigned) p) / 65535ULL; + k = n - j; + + fputs("\r", stdout); + if (colors_enabled()) + fputs("\x1B[?25l", stdout); + + fputs(ansi_highlight_green(), stdout); + + for (i = 0; i < j; i++) + fputs("\xe2\x96\x88", stdout); + + fputs(ansi_normal(), stdout); + + for (i = 0; i < k; i++) + fputs("\xe2\x96\x91", stdout); + + printf(" %3"PRIu64"%%", 100U * p / 65535U); + + fputs("\r", stdout); + if (colors_enabled()) + 
fputs("\x1B[?25h", stdout); + + fflush(stdout); +} + +static uint64_t scale_progress(uint64_t scale, uint64_t p, uint64_t m) { + /* Calculates scale * p / m, but handles m == 0 safely, and saturates. + * Currently all callers use m >= 1, but we keep the check to be defensive. + */ + + if (p >= m || m == 0) + return scale; + + return scale * p / m; +} + +static void flush_progress(void) { + unsigned n, i; + + if (!on_tty()) + return; + + n = (3 * columns()) / 4; + + putchar('\r'); + + for (i = 0; i < n + 5; i++) + putchar(' '); + + putchar('\r'); + fflush(stdout); +} + +#define debug(_offset, _fmt, ...) do { \ + flush_progress(); \ + log_debug(OFSfmt": " _fmt, _offset, ##__VA_ARGS__); \ + } while (0) + +#define warning(_offset, _fmt, ...) do { \ + flush_progress(); \ + log_warning(OFSfmt": " _fmt, _offset, ##__VA_ARGS__); \ + } while (0) + +#define error(_offset, _fmt, ...) do { \ + flush_progress(); \ + log_error(OFSfmt": " _fmt, (uint64_t)_offset, ##__VA_ARGS__); \ + } while (0) + +#define error_errno(_offset, error, _fmt, ...) 
do { \ + flush_progress(); \ + log_error_errno(error, OFSfmt": " _fmt, (uint64_t)_offset, ##__VA_ARGS__); \ + } while (0) + +static int hash_payload(JournalFile *f, Object *o, uint64_t offset, const uint8_t *src, uint64_t size, uint64_t *res_hash) { + Compression c; + int r; + + assert(o); + assert(src); + assert(res_hash); + + c = COMPRESSION_FROM_OBJECT(o); + if (c < 0) + return -EBADMSG; + if (c != COMPRESSION_NONE) { + _cleanup_free_ void *b = NULL; + size_t b_size; + + r = decompress_blob(c, src, size, &b, &b_size, 0); + if (r < 0) { + error_errno(offset, r, "%s decompression failed: %m", + compression_to_string(c)); + return r; + } + + *res_hash = journal_file_hash_data(f, b, b_size); + } else + *res_hash = journal_file_hash_data(f, src, size); + + return 0; +} + +static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o) { + assert(f); + assert(offset); + assert(o); + + /* This does various superficial tests about the length an + * possible field values. It does not follow any references to + * other objects. 
*/ + + if ((o->object.flags & _OBJECT_COMPRESSED_MASK) != 0 && + o->object.type != OBJECT_DATA) { + error(offset, + "Found compressed object of type %s that isn't of type data, which is not allowed.", + journal_object_type_to_string(o->object.type)); + return -EBADMSG; + } + + switch (o->object.type) { + + case OBJECT_DATA: { + uint64_t h1, h2; + int r; + + if (le64toh(o->data.entry_offset) == 0) + warning(offset, "Unused data (entry_offset==0)"); + + if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) { + error(offset, "Bad n_entries: %"PRIu64, le64toh(o->data.n_entries)); + return -EBADMSG; + } + + if (le64toh(o->object.size) - journal_file_data_payload_offset(f) <= 0) { + error(offset, "Bad object size (<= %zu): %"PRIu64, + journal_file_data_payload_offset(f), + le64toh(o->object.size)); + return -EBADMSG; + } + + h1 = le64toh(o->data.hash); + r = hash_payload(f, o, offset, journal_file_data_payload_field(f, o), + le64toh(o->object.size) - journal_file_data_payload_offset(f), + &h2); + if (r < 0) + return r; + + if (h1 != h2) { + error(offset, "Invalid hash (%08" PRIx64 " vs. 
%08" PRIx64 ")", h1, h2); + return -EBADMSG; + } + + if (!VALID64(le64toh(o->data.next_hash_offset)) || + !VALID64(le64toh(o->data.next_field_offset)) || + !VALID64(le64toh(o->data.entry_offset)) || + !VALID64(le64toh(o->data.entry_array_offset))) { + error(offset, "Invalid offset (next_hash_offset="OFSfmt", next_field_offset="OFSfmt", entry_offset="OFSfmt", entry_array_offset="OFSfmt, + le64toh(o->data.next_hash_offset), + le64toh(o->data.next_field_offset), + le64toh(o->data.entry_offset), + le64toh(o->data.entry_array_offset)); + return -EBADMSG; + } + + break; + } + + case OBJECT_FIELD: { + uint64_t h1, h2; + int r; + + if (le64toh(o->object.size) - offsetof(Object, field.payload) <= 0) { + error(offset, + "Bad field size (<= %zu): %"PRIu64, + offsetof(Object, field.payload), + le64toh(o->object.size)); + return -EBADMSG; + } + + h1 = le64toh(o->field.hash); + r = hash_payload(f, o, offset, o->field.payload, + le64toh(o->object.size) - offsetof(Object, field.payload), + &h2); + if (r < 0) + return r; + + if (h1 != h2) { + error(offset, "Invalid hash (%08" PRIx64 " vs. 
%08" PRIx64 ")", h1, h2); + return -EBADMSG; + } + + if (!VALID64(le64toh(o->field.next_hash_offset)) || + !VALID64(le64toh(o->field.head_data_offset))) { + error(offset, + "Invalid offset (next_hash_offset="OFSfmt", head_data_offset="OFSfmt, + le64toh(o->field.next_hash_offset), + le64toh(o->field.head_data_offset)); + return -EBADMSG; + } + break; + } + + case OBJECT_ENTRY: + if ((le64toh(o->object.size) - offsetof(Object, entry.items)) % journal_file_entry_item_size(f) != 0) { + error(offset, + "Bad entry size (<= %zu): %"PRIu64, + offsetof(Object, entry.items), + le64toh(o->object.size)); + return -EBADMSG; + } + + if ((le64toh(o->object.size) - offsetof(Object, entry.items)) / journal_file_entry_item_size(f) <= 0) { + error(offset, + "Invalid number items in entry: %"PRIu64, + (le64toh(o->object.size) - offsetof(Object, entry.items)) / journal_file_entry_item_size(f)); + return -EBADMSG; + } + + if (le64toh(o->entry.seqnum) <= 0) { + error(offset, + "Invalid entry seqnum: %"PRIx64, + le64toh(o->entry.seqnum)); + return -EBADMSG; + } + + if (!VALID_REALTIME(le64toh(o->entry.realtime))) { + error(offset, + "Invalid entry realtime timestamp: %"PRIu64, + le64toh(o->entry.realtime)); + return -EBADMSG; + } + + if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) { + error(offset, + "Invalid entry monotonic timestamp: %"PRIu64, + le64toh(o->entry.monotonic)); + return -EBADMSG; + } + + for (uint64_t i = 0; i < journal_file_entry_n_items(f, o); i++) { + if (journal_file_entry_item_object_offset(f, o, i) == 0 || + !VALID64(journal_file_entry_item_object_offset(f, o, i))) { + error(offset, + "Invalid entry item (%"PRIu64"/%"PRIu64") offset: "OFSfmt, + i, journal_file_entry_n_items(f, o), + journal_file_entry_item_object_offset(f, o, i)); + return -EBADMSG; + } + } + + break; + + case OBJECT_DATA_HASH_TABLE: + case OBJECT_FIELD_HASH_TABLE: + if ((le64toh(o->object.size) - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 || + (le64toh(o->object.size) - 
offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0) { + error(offset, + "Invalid %s size: %"PRIu64, + journal_object_type_to_string(o->object.type), + le64toh(o->object.size)); + return -EBADMSG; + } + + for (uint64_t i = 0; i < journal_file_hash_table_n_items(o); i++) { + if (o->hash_table.items[i].head_hash_offset != 0 && + !VALID64(le64toh(o->hash_table.items[i].head_hash_offset))) { + error(offset, + "Invalid %s hash table item (%"PRIu64"/%"PRIu64") head_hash_offset: "OFSfmt, + journal_object_type_to_string(o->object.type), + i, journal_file_hash_table_n_items(o), + le64toh(o->hash_table.items[i].head_hash_offset)); + return -EBADMSG; + } + if (o->hash_table.items[i].tail_hash_offset != 0 && + !VALID64(le64toh(o->hash_table.items[i].tail_hash_offset))) { + error(offset, + "Invalid %s hash table item (%"PRIu64"/%"PRIu64") tail_hash_offset: "OFSfmt, + journal_object_type_to_string(o->object.type), + i, journal_file_hash_table_n_items(o), + le64toh(o->hash_table.items[i].tail_hash_offset)); + return -EBADMSG; + } + + if ((o->hash_table.items[i].head_hash_offset != 0) != + (o->hash_table.items[i].tail_hash_offset != 0)) { + error(offset, + "Invalid %s hash table item (%"PRIu64"/%"PRIu64"): head_hash_offset="OFSfmt" tail_hash_offset="OFSfmt, + journal_object_type_to_string(o->object.type), + i, journal_file_hash_table_n_items(o), + le64toh(o->hash_table.items[i].head_hash_offset), + le64toh(o->hash_table.items[i].tail_hash_offset)); + return -EBADMSG; + } + } + + break; + + case OBJECT_ENTRY_ARRAY: + if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 || + (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) { + error(offset, + "Invalid object entry array size: %"PRIu64, + le64toh(o->object.size)); + return -EBADMSG; + } + + if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) { + error(offset, + "Invalid object entry array 
next_entry_array_offset: "OFSfmt, + le64toh(o->entry_array.next_entry_array_offset)); + return -EBADMSG; + } + + for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) { + uint64_t q = journal_file_entry_array_item(f, o, i); + if (q != 0 && !VALID64(q)) { + error(offset, + "Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt, + i, journal_file_entry_array_n_items(f, o), q); + return -EBADMSG; + } + } + + break; + + case OBJECT_TAG: + if (le64toh(o->object.size) != sizeof(TagObject)) { + error(offset, + "Invalid object tag size: %"PRIu64, + le64toh(o->object.size)); + return -EBADMSG; + } + + if (!VALID_EPOCH(le64toh(o->tag.epoch))) { + error(offset, + "Invalid object tag epoch: %"PRIu64, + le64toh(o->tag.epoch)); + return -EBADMSG; + } + + break; + } + + return 0; +} + +static int write_uint64(FILE *fp, uint64_t p) { + if (fwrite(&p, sizeof(p), 1, fp) != 1) + return -EIO; + + return 0; +} + +static int contains_uint64(MMapFileDescriptor *f, uint64_t n, uint64_t p) { + uint64_t a, b; + int r; + + assert(f); + + /* Bisection ... 
*/ + + a = 0; b = n; + while (a < b) { + uint64_t c, *z; + + c = (a + b) / 2; + + r = mmap_cache_fd_get(f, 0, false, c * sizeof(uint64_t), sizeof(uint64_t), NULL, (void **) &z); + if (r < 0) + return r; + + if (*z == p) + return 1; + + if (a + 1 >= b) + return 0; + + if (p < *z) + b = c; + else + a = c; + } + + return 0; +} + +static int verify_data( + JournalFile *f, + Object *o, uint64_t p, + MMapFileDescriptor *cache_entry_fd, uint64_t n_entries, + MMapFileDescriptor *cache_entry_array_fd, uint64_t n_entry_arrays) { + + uint64_t i, n, a, last, q; + int r; + + assert(f); + assert(o); + assert(cache_entry_fd); + assert(cache_entry_array_fd); + + n = le64toh(o->data.n_entries); + a = le64toh(o->data.entry_array_offset); + + /* Entry array means at least two objects */ + if (a && n < 2) { + error(p, "Entry array present (entry_array_offset="OFSfmt", but n_entries=%"PRIu64")", a, n); + return -EBADMSG; + } + + if (n == 0) + return 0; + + /* We already checked that earlier */ + assert(o->data.entry_offset); + + last = q = le64toh(o->data.entry_offset); + if (!contains_uint64(cache_entry_fd, n_entries, q)) { + error(p, "Data object references invalid entry at "OFSfmt, q); + return -EBADMSG; + } + + r = journal_file_move_to_entry_by_offset(f, q, DIRECTION_DOWN, NULL, NULL); + if (r < 0) + return r; + if (r == 0) { + error(q, "Entry object doesn't exist in the main entry array"); + return -EBADMSG; + } + + i = 1; + while (i < n) { + uint64_t next, m, j; + + if (a == 0) { + error(p, "Array chain too short"); + return -EBADMSG; + } + + if (!contains_uint64(cache_entry_array_fd, n_entry_arrays, a)) { + error(p, "Invalid array offset "OFSfmt, a); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + next = le64toh(o->entry_array.next_entry_array_offset); + if (next != 0 && next <= a) { + error(p, "Array chain has cycle (jumps back from "OFSfmt" to "OFSfmt")", a, next); + return -EBADMSG; + } + + m = 
journal_file_entry_array_n_items(f, o); + for (j = 0; i < n && j < m; i++, j++) { + + q = journal_file_entry_array_item(f, o, j); + if (q <= last) { + error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last); + return -EBADMSG; + } + last = q; + + if (!contains_uint64(cache_entry_fd, n_entries, q)) { + error(p, "Data object references invalid entry at "OFSfmt, q); + return -EBADMSG; + } + + r = journal_file_move_to_entry_by_offset(f, q, DIRECTION_DOWN, NULL, NULL); + if (r < 0) + return r; + if (r == 0) { + error(q, "Entry object doesn't exist in the main entry array"); + return -EBADMSG; + } + + /* Pointer might have moved, reposition */ + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + } + + a = next; + } + + return 0; +} + +static int verify_data_hash_table( + JournalFile *f, + MMapFileDescriptor *cache_data_fd, uint64_t n_data, + MMapFileDescriptor *cache_entry_fd, uint64_t n_entries, + MMapFileDescriptor *cache_entry_array_fd, uint64_t n_entry_arrays, + usec_t *last_usec, + bool show_progress) { + + uint64_t i, n; + int r; + + assert(f); + assert(cache_data_fd); + assert(cache_entry_fd); + assert(cache_entry_array_fd); + assert(last_usec); + + n = le64toh(f->header->data_hash_table_size) / sizeof(HashItem); + if (n <= 0) + return 0; + + r = journal_file_map_data_hash_table(f); + if (r < 0) + return log_error_errno(r, "Failed to map data hash table: %m"); + + for (i = 0; i < n; i++) { + uint64_t last = 0, p; + + if (show_progress) + draw_progress(0xC000 + scale_progress(0x3FFF, i, n), last_usec); + + p = le64toh(f->data_hash_table[i].head_hash_offset); + while (p != 0) { + Object *o; + uint64_t next; + + if (!contains_uint64(cache_data_fd, n_data, p)) { + error(p, "Invalid data object at hash entry %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + + next = le64toh(o->data.next_hash_offset); + if (next != 0 
&& next <= p) { + error(p, "Hash chain has a cycle in hash entry %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + if (le64toh(o->data.hash) % n != i) { + error(p, "Hash value mismatch in hash entry %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = verify_data(f, o, p, cache_entry_fd, n_entries, cache_entry_array_fd, n_entry_arrays); + if (r < 0) + return r; + + last = p; + p = next; + } + + if (last != le64toh(f->data_hash_table[i].tail_hash_offset)) { + error(p, + "Tail hash pointer mismatch in hash table (%"PRIu64" != %"PRIu64")", + last, + le64toh(f->data_hash_table[i].tail_hash_offset)); + return -EBADMSG; + } + } + + return 0; +} + +static int data_object_in_hash_table(JournalFile *f, uint64_t hash, uint64_t p) { + uint64_t n, h, q; + int r; + assert(f); + + n = le64toh(f->header->data_hash_table_size) / sizeof(HashItem); + if (n <= 0) + return 0; + + r = journal_file_map_data_hash_table(f); + if (r < 0) + return log_error_errno(r, "Failed to map data hash table: %m"); + + h = hash % n; + + q = le64toh(f->data_hash_table[h].head_hash_offset); + while (q != 0) { + Object *o; + + if (p == q) + return 1; + + r = journal_file_move_to_object(f, OBJECT_DATA, q, &o); + if (r < 0) + return r; + + q = le64toh(o->data.next_hash_offset); + } + + return 0; +} + +static int verify_entry( + JournalFile *f, + Object *o, uint64_t p, + MMapFileDescriptor *cache_data_fd, uint64_t n_data, + bool last) { + + uint64_t i, n; + int r; + + assert(f); + assert(o); + assert(cache_data_fd); + + n = journal_file_entry_n_items(f, o); + for (i = 0; i < n; i++) { + uint64_t q; + Object *u; + + q = journal_file_entry_item_object_offset(f, o, i); + + if (!contains_uint64(cache_data_fd, n_data, q)) { + error(p, "Invalid data object of entry"); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_DATA, q, &u); + if (r < 0) + return r; + + r = data_object_in_hash_table(f, le64toh(u->data.hash), q); + if (r < 0) + return r; + if (r == 0) { + error(p, "Data 
object missing from hash table"); + return -EBADMSG; + } + + /* Pointer might have moved, reposition */ + r = journal_file_move_to_object(f, OBJECT_DATA, q, &u); + if (r < 0) + return r; + + r = journal_file_move_to_entry_by_offset_for_data(f, u, p, DIRECTION_DOWN, NULL, NULL); + if (r < 0) + return r; + + /* The last entry object has a very high chance of not being referenced as journal files + * almost always run out of space during linking of entry items when trying to add a new + * entry array so let's not error in that scenario. */ + if (r == 0 && !last) { + error(p, "Entry object not referenced by linked data object at "OFSfmt, q); + return -EBADMSG; + } + } + + return 0; +} + +static int verify_entry_array( + JournalFile *f, + MMapFileDescriptor *cache_data_fd, uint64_t n_data, + MMapFileDescriptor *cache_entry_fd, uint64_t n_entries, + MMapFileDescriptor *cache_entry_array_fd, uint64_t n_entry_arrays, + usec_t *last_usec, + bool show_progress) { + + uint64_t i = 0, a, n, last = 0; + int r; + + assert(f); + assert(cache_data_fd); + assert(cache_entry_fd); + assert(cache_entry_array_fd); + assert(last_usec); + + n = le64toh(f->header->n_entries); + a = le64toh(f->header->entry_array_offset); + while (i < n) { + uint64_t next, m, j; + Object *o; + + if (show_progress) + draw_progress(0x8000 + scale_progress(0x3FFF, i, n), last_usec); + + if (a == 0) { + error(a, "Array chain too short at %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + if (!contains_uint64(cache_entry_array_fd, n_entry_arrays, a)) { + error(a, "Invalid array %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + + next = le64toh(o->entry_array.next_entry_array_offset); + if (next != 0 && next <= a) { + error(a, "Array chain has cycle at %"PRIu64" of %"PRIu64" (jumps back from to "OFSfmt")", i, n, next); + return -EBADMSG; + } + + m = journal_file_entry_array_n_items(f, o); + for (j = 0; i < n && 
j < m; i++, j++) { + uint64_t p; + + p = journal_file_entry_array_item(f, o, j); + if (p <= last) { + error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + last = p; + + if (!contains_uint64(cache_entry_fd, n_entries, p)) { + error(a, "Invalid array entry at %"PRIu64" of %"PRIu64, i, n); + return -EBADMSG; + } + + r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + + r = verify_entry(f, o, p, cache_data_fd, n_data, /*last=*/ i + 1 == n); + if (r < 0) + return r; + + /* Pointer might have moved, reposition */ + r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); + if (r < 0) + return r; + } + + a = next; + } + + return 0; +} + +static int verify_hash_table( + Object *o, uint64_t p, uint64_t *n_hash_tables, uint64_t header_offset, uint64_t header_size) { + + assert(o); + assert(n_hash_tables); + + if (*n_hash_tables > 1) { + error(p, + "More than one %s: %" PRIu64, + journal_object_type_to_string(o->object.type), + *n_hash_tables); + return -EBADMSG; + } + + if (header_offset != p + offsetof(Object, hash_table.items)) { + error(p, + "Header offset for %s invalid (%" PRIu64 " != %" PRIu64 ")", + journal_object_type_to_string(o->object.type), + header_offset, + p + offsetof(Object, hash_table.items)); + return -EBADMSG; + } + + if (header_size != le64toh(o->object.size) - offsetof(Object, hash_table.items)) { + error(p, + "Header size for %s invalid (%" PRIu64 " != %" PRIu64 ")", + journal_object_type_to_string(o->object.type), + header_size, + le64toh(o->object.size) - offsetof(Object, hash_table.items)); + return -EBADMSG; + } + + (*n_hash_tables)++; + + return 0; +} + +int journal_file_verify( + JournalFile *f, + const char *key, + usec_t *first_contained, usec_t *last_validated, usec_t *last_contained, + bool show_progress) { + int r; + Object *o; + uint64_t p = 0, last_epoch = 0, last_tag_realtime = 0; + + uint64_t entry_seqnum = 0, entry_monotonic = 0, entry_realtime = 0; + usec_t 
min_entry_realtime = USEC_INFINITY, max_entry_realtime = 0; + sd_id128_t entry_boot_id = {}; /* Unnecessary initialization to appease gcc */ + bool entry_seqnum_set = false, entry_monotonic_set = false, entry_realtime_set = false, found_main_entry_array = false; + uint64_t n_objects = 0, n_entries = 0, n_data = 0, n_fields = 0, n_data_hash_tables = 0, n_field_hash_tables = 0, n_entry_arrays = 0, n_tags = 0; + usec_t last_usec = 0; + _cleanup_close_ int data_fd = -EBADF, entry_fd = -EBADF, entry_array_fd = -EBADF; + _cleanup_fclose_ FILE *data_fp = NULL, *entry_fp = NULL, *entry_array_fp = NULL; + MMapFileDescriptor *cache_data_fd = NULL, *cache_entry_fd = NULL, *cache_entry_array_fd = NULL; + unsigned i; + bool found_last = false; + const char *tmp_dir = NULL; + MMapCache *m; + +#if HAVE_GCRYPT + uint64_t last_tag = 0; +#endif + assert(f); + + if (key) { +#if HAVE_GCRYPT + r = journal_file_parse_verification_key(f, key); + if (r < 0) { + log_error("Failed to parse seed."); + return r; + } +#else + return -EOPNOTSUPP; +#endif + } else if (JOURNAL_HEADER_SEALED(f->header)) + return -ENOKEY; + + r = var_tmp_dir(&tmp_dir); + if (r < 0) { + log_error_errno(r, "Failed to determine temporary directory: %m"); + goto fail; + } + + data_fd = open_tmpfile_unlinkable(tmp_dir, O_RDWR | O_CLOEXEC); + if (data_fd < 0) { + r = log_error_errno(data_fd, "Failed to create data file: %m"); + goto fail; + } + + entry_fd = open_tmpfile_unlinkable(tmp_dir, O_RDWR | O_CLOEXEC); + if (entry_fd < 0) { + r = log_error_errno(entry_fd, "Failed to create entry file: %m"); + goto fail; + } + + entry_array_fd = open_tmpfile_unlinkable(tmp_dir, O_RDWR | O_CLOEXEC); + if (entry_array_fd < 0) { + r = log_error_errno(entry_array_fd, + "Failed to create entry array file: %m"); + goto fail; + } + + m = mmap_cache_fd_cache(f->cache_fd); + r = mmap_cache_add_fd(m, data_fd, PROT_READ|PROT_WRITE, &cache_data_fd); + if (r < 0) { + log_error_errno(r, "Failed to cache data file: %m"); + goto fail; + } + + r = 
mmap_cache_add_fd(m, entry_fd, PROT_READ|PROT_WRITE, &cache_entry_fd); + if (r < 0) { + log_error_errno(r, "Failed to cache entry file: %m"); + goto fail; + } + + r = mmap_cache_add_fd(m, entry_array_fd, PROT_READ|PROT_WRITE, &cache_entry_array_fd); + if (r < 0) { + log_error_errno(r, "Failed to cache entry array file: %m"); + goto fail; + } + + r = take_fdopen_unlocked(&data_fd, "w+", &data_fp); + if (r < 0) { + log_error_errno(r, "Failed to open data file stream: %m"); + goto fail; + } + + r = take_fdopen_unlocked(&entry_fd, "w+", &entry_fp); + if (r < 0) { + log_error_errno(r, "Failed to open entry file stream: %m"); + goto fail; + } + + r = take_fdopen_unlocked(&entry_array_fd, "w+", &entry_array_fp); + if (r < 0) { + log_error_errno(r, "Failed to open entry array file stream: %m"); + goto fail; + } + + if (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SUPPORTED) { + log_error("Cannot verify file with unknown extensions."); + r = -EOPNOTSUPP; + goto fail; + } + + for (i = 0; i < sizeof(f->header->reserved); i++) + if (f->header->reserved[i] != 0) { + error(offsetof(Header, reserved[i]), "Reserved field is non-zero"); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_SEALED_CONTINUOUS(f->header)) + warning(p, + "This log file was sealed with an old journald version where the sequence of seals might not be continuous. We cannot guarantee completeness."); + + /* First iteration: we go through all objects, verify the + * superficial structure, headers, hashes. 
*/ + + p = le64toh(f->header->header_size); + for (;;) { + /* Early exit if there are no objects in the file, at all */ + if (le64toh(f->header->tail_object_offset) == 0) + break; + + if (show_progress) + draw_progress(scale_progress(0x7FFF, p, le64toh(f->header->tail_object_offset)), &last_usec); + + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + if (r < 0) { + error_errno(p, r, "Invalid object: %m"); + goto fail; + } + + if (p > le64toh(f->header->tail_object_offset)) { + error(offsetof(Header, tail_object_offset), + "Invalid tail object pointer (%"PRIu64" > %"PRIu64")", + p, + le64toh(f->header->tail_object_offset)); + r = -EBADMSG; + goto fail; + } + + n_objects++; + + r = journal_file_object_verify(f, p, o); + if (r < 0) { + error_errno(p, r, "Invalid object contents: %m"); + goto fail; + } + + if (!!(o->object.flags & OBJECT_COMPRESSED_XZ) + + !!(o->object.flags & OBJECT_COMPRESSED_LZ4) + + !!(o->object.flags & OBJECT_COMPRESSED_ZSTD) > 1) { + error(p, "Object has multiple compression flags set (flags: 0x%x)", o->object.flags); + r = -EINVAL; + goto fail; + } + + if ((o->object.flags & OBJECT_COMPRESSED_XZ) && !JOURNAL_HEADER_COMPRESSED_XZ(f->header)) { + error(p, "XZ compressed object in file without XZ compression"); + r = -EBADMSG; + goto fail; + } + + if ((o->object.flags & OBJECT_COMPRESSED_LZ4) && !JOURNAL_HEADER_COMPRESSED_LZ4(f->header)) { + error(p, "LZ4 compressed object in file without LZ4 compression"); + r = -EBADMSG; + goto fail; + } + + if ((o->object.flags & OBJECT_COMPRESSED_ZSTD) && !JOURNAL_HEADER_COMPRESSED_ZSTD(f->header)) { + error(p, "ZSTD compressed object in file without ZSTD compression"); + r = -EBADMSG; + goto fail; + } + + switch (o->object.type) { + + case OBJECT_DATA: + r = write_uint64(data_fp, p); + if (r < 0) + goto fail; + + n_data++; + break; + + case OBJECT_FIELD: + n_fields++; + break; + + case OBJECT_ENTRY: + if (JOURNAL_HEADER_SEALED(f->header) && n_tags <= 0) { + error(p, "First entry before first tag"); + 
r = -EBADMSG; + goto fail; + } + + r = write_uint64(entry_fp, p); + if (r < 0) + goto fail; + + if (le64toh(o->entry.realtime) < last_tag_realtime) { + error(p, + "Older entry after newer tag (%"PRIu64" < %"PRIu64")", + le64toh(o->entry.realtime), + last_tag_realtime); + r = -EBADMSG; + goto fail; + } + + if (!entry_seqnum_set && + le64toh(o->entry.seqnum) != le64toh(f->header->head_entry_seqnum)) { + error(p, + "Head entry sequence number incorrect (%"PRIu64" != %"PRIu64")", + le64toh(o->entry.seqnum), + le64toh(f->header->head_entry_seqnum)); + r = -EBADMSG; + goto fail; + } + + if (entry_seqnum_set && + entry_seqnum >= le64toh(o->entry.seqnum)) { + error(p, + "Entry sequence number out of synchronization (%"PRIu64" >= %"PRIu64")", + entry_seqnum, + le64toh(o->entry.seqnum)); + r = -EBADMSG; + goto fail; + } + + entry_seqnum = le64toh(o->entry.seqnum); + entry_seqnum_set = true; + + if (entry_monotonic_set && + sd_id128_equal(entry_boot_id, o->entry.boot_id) && + entry_monotonic > le64toh(o->entry.monotonic)) { + error(p, + "Entry timestamp out of synchronization (%"PRIu64" > %"PRIu64")", + entry_monotonic, + le64toh(o->entry.monotonic)); + r = -EBADMSG; + goto fail; + } + + entry_monotonic = le64toh(o->entry.monotonic); + entry_boot_id = o->entry.boot_id; + entry_monotonic_set = true; + + if (!entry_realtime_set && + le64toh(o->entry.realtime) != le64toh(f->header->head_entry_realtime)) { + error(p, + "Head entry realtime timestamp incorrect (%"PRIu64" != %"PRIu64")", + le64toh(o->entry.realtime), + le64toh(f->header->head_entry_realtime)); + r = -EBADMSG; + goto fail; + } + + entry_realtime = le64toh(o->entry.realtime); + entry_realtime_set = true; + + max_entry_realtime = MAX(max_entry_realtime, le64toh(o->entry.realtime)); + min_entry_realtime = MIN(min_entry_realtime, le64toh(o->entry.realtime)); + + n_entries++; + break; + + case OBJECT_DATA_HASH_TABLE: + r = verify_hash_table(o, p, &n_data_hash_tables, + le64toh(f->header->data_hash_table_offset), + 
le64toh(f->header->data_hash_table_size)); + if (r < 0) + goto fail; + break; + + case OBJECT_FIELD_HASH_TABLE: + r = verify_hash_table(o, p, &n_field_hash_tables, + le64toh(f->header->field_hash_table_offset), + le64toh(f->header->field_hash_table_size)); + if (r < 0) + goto fail; + + break; + + case OBJECT_ENTRY_ARRAY: + r = write_uint64(entry_array_fp, p); + if (r < 0) + goto fail; + + if (p == le64toh(f->header->entry_array_offset)) { + if (found_main_entry_array) { + error(p, "More than one main entry array"); + r = -EBADMSG; + goto fail; + } + + found_main_entry_array = true; + } + + n_entry_arrays++; + break; + + case OBJECT_TAG: + if (!JOURNAL_HEADER_SEALED(f->header)) { + error(p, "Tag object in file without sealing"); + r = -EBADMSG; + goto fail; + } + + if (le64toh(o->tag.seqnum) != n_tags + 1) { + error(p, + "Tag sequence number out of synchronization (%"PRIu64" != %"PRIu64")", + le64toh(o->tag.seqnum), + n_tags + 1); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_SEALED_CONTINUOUS(f->header)) { + if (!(n_tags == 0 || (n_tags == 1 && le64toh(o->tag.epoch) == last_epoch) + || le64toh(o->tag.epoch) == last_epoch + 1)) { + error(p, + "Epoch sequence not continuous (%"PRIu64" vs %"PRIu64")", + le64toh(o->tag.epoch), + last_epoch); + r = -EBADMSG; + goto fail; + } + } else { + if (le64toh(o->tag.epoch) < last_epoch) { + error(p, + "Epoch sequence out of synchronization (%"PRIu64" < %"PRIu64")", + le64toh(o->tag.epoch), + last_epoch); + r = -EBADMSG; + goto fail; + } + } + +#if HAVE_GCRYPT + if (JOURNAL_HEADER_SEALED(f->header)) { + uint64_t q, rt, rt_end; + + debug(p, "Checking tag %"PRIu64"...", le64toh(o->tag.seqnum)); + + rt = f->fss_start_usec + le64toh(o->tag.epoch) * f->fss_interval_usec; + rt_end = usec_add(rt, f->fss_interval_usec); + if (entry_realtime_set && entry_realtime >= rt_end) { + error(p, + "tag/entry realtime timestamp out of synchronization (%"PRIu64" >= %"PRIu64")", + entry_realtime, + rt + f->fss_interval_usec); + r = -EBADMSG; 
+ goto fail; + } + if (max_entry_realtime >= rt_end) { + error(p, + "Entry realtime (%"PRIu64", %s) is too late with respect to tag (%"PRIu64", %s)", + max_entry_realtime, FORMAT_TIMESTAMP(max_entry_realtime), + rt_end, FORMAT_TIMESTAMP(rt_end)); + r = -EBADMSG; + goto fail; + } + if (min_entry_realtime < rt) { + error(p, + "Entry realtime (%"PRIu64", %s) is too early with respect to tag (%"PRIu64", %s)", + min_entry_realtime, FORMAT_TIMESTAMP(min_entry_realtime), + rt, FORMAT_TIMESTAMP(rt)); + r = -EBADMSG; + goto fail; + } + min_entry_realtime = USEC_INFINITY; + + /* OK, now we know the epoch. So let's now set + * it, and calculate the HMAC for everything + * since the last tag. */ + r = journal_file_fsprg_seek(f, le64toh(o->tag.epoch)); + if (r < 0) + goto fail; + + r = journal_file_hmac_start(f); + if (r < 0) + goto fail; + + if (last_tag == 0) { + r = journal_file_hmac_put_header(f); + if (r < 0) + goto fail; + + q = le64toh(f->header->header_size); + } else + q = last_tag; + + while (q <= p) { + r = journal_file_move_to_object(f, OBJECT_UNUSED, q, &o); + if (r < 0) + goto fail; + + r = journal_file_hmac_put_object(f, OBJECT_UNUSED, o, q); + if (r < 0) + goto fail; + + q = q + ALIGN64(le64toh(o->object.size)); + } + + /* Position might have changed, let's reposition things */ + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + if (r < 0) + goto fail; + + if (memcmp(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH) != 0) { + error(p, "Tag failed verification"); + r = -EBADMSG; + goto fail; + } + + f->hmac_running = false; + last_tag_realtime = rt; + } + + last_tag = p + ALIGN64(le64toh(o->object.size)); +#endif + + last_epoch = le64toh(o->tag.epoch); + + n_tags++; + break; + } + + if (p == le64toh(f->header->tail_object_offset)) { + found_last = true; + break; + } + + p = p + ALIGN64(le64toh(o->object.size)); + }; + + if (!found_last && le64toh(f->header->tail_object_offset) != 0) { + error(le64toh(f->header->tail_object_offset), + "Tail object 
pointer dead (%"PRIu64" != 0)", + le64toh(f->header->tail_object_offset)); + r = -EBADMSG; + goto fail; + } + + if (n_objects != le64toh(f->header->n_objects)) { + error(offsetof(Header, n_objects), + "Object number mismatch (%"PRIu64" != %"PRIu64")", + n_objects, + le64toh(f->header->n_objects)); + r = -EBADMSG; + goto fail; + } + + if (n_entries != le64toh(f->header->n_entries)) { + error(offsetof(Header, n_entries), + "Entry number mismatch (%"PRIu64" != %"PRIu64")", + n_entries, + le64toh(f->header->n_entries)); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && + n_data != le64toh(f->header->n_data)) { + error(offsetof(Header, n_data), + "Data number mismatch (%"PRIu64" != %"PRIu64")", + n_data, + le64toh(f->header->n_data)); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_fields) && + n_fields != le64toh(f->header->n_fields)) { + error(offsetof(Header, n_fields), + "Field number mismatch (%"PRIu64" != %"PRIu64")", + n_fields, + le64toh(f->header->n_fields)); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_tags) && + n_tags != le64toh(f->header->n_tags)) { + error(offsetof(Header, n_tags), + "Tag number mismatch (%"PRIu64" != %"PRIu64")", + n_tags, + le64toh(f->header->n_tags)); + r = -EBADMSG; + goto fail; + } + + if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays) && + n_entry_arrays != le64toh(f->header->n_entry_arrays)) { + error(offsetof(Header, n_entry_arrays), + "Entry array number mismatch (%"PRIu64" != %"PRIu64")", + n_entry_arrays, + le64toh(f->header->n_entry_arrays)); + r = -EBADMSG; + goto fail; + } + + if (!found_main_entry_array && le64toh(f->header->entry_array_offset) != 0) { + error(0, "Missing main entry array"); + r = -EBADMSG; + goto fail; + } + + if (entry_seqnum_set && + entry_seqnum != le64toh(f->header->tail_entry_seqnum)) { + error(offsetof(Header, tail_entry_seqnum), + "Tail entry sequence number incorrect (%"PRIu64" != %"PRIu64")", + 
entry_seqnum, + le64toh(f->header->tail_entry_seqnum)); + r = -EBADMSG; + goto fail; + } + + if (entry_monotonic_set && + (sd_id128_equal(entry_boot_id, f->header->tail_entry_boot_id) && + JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) && + entry_monotonic != le64toh(f->header->tail_entry_monotonic))) { + error(0, + "Invalid tail monotonic timestamp (%"PRIu64" != %"PRIu64")", + entry_monotonic, + le64toh(f->header->tail_entry_monotonic)); + r = -EBADMSG; + goto fail; + } + + if (entry_realtime_set && entry_realtime != le64toh(f->header->tail_entry_realtime)) { + error(0, + "Invalid tail realtime timestamp (%"PRIu64" != %"PRIu64")", + entry_realtime, + le64toh(f->header->tail_entry_realtime)); + r = -EBADMSG; + goto fail; + } + + if (fflush(data_fp) != 0) { + r = log_error_errno(errno, "Failed to flush data file stream: %m"); + goto fail; + } + + if (fflush(entry_fp) != 0) { + r = log_error_errno(errno, "Failed to flush entry file stream: %m"); + goto fail; + } + + if (fflush(entry_array_fp) != 0) { + r = log_error_errno(errno, "Failed to flush entry array file stream: %m"); + goto fail; + } + + /* Second iteration: we follow all objects referenced from the + * two entry points: the object hash table and the entry + * array. We also check that everything referenced (directly + * or indirectly) in the data hash table also exists in the + * entry array, and vice versa. Note that we do not care for + * unreferenced objects. We only care that everything that is + * referenced is consistent. 
*/ + + r = verify_entry_array(f, + cache_data_fd, n_data, + cache_entry_fd, n_entries, + cache_entry_array_fd, n_entry_arrays, + &last_usec, + show_progress); + if (r < 0) + goto fail; + + r = verify_data_hash_table(f, + cache_data_fd, n_data, + cache_entry_fd, n_entries, + cache_entry_array_fd, n_entry_arrays, + &last_usec, + show_progress); + if (r < 0) + goto fail; + + if (show_progress) + flush_progress(); + + mmap_cache_fd_free(cache_data_fd); + mmap_cache_fd_free(cache_entry_fd); + mmap_cache_fd_free(cache_entry_array_fd); + + if (first_contained) + *first_contained = le64toh(f->header->head_entry_realtime); +#if HAVE_GCRYPT + if (last_validated) + *last_validated = last_tag_realtime + f->fss_interval_usec; +#endif + if (last_contained) + *last_contained = le64toh(f->header->tail_entry_realtime); + + return 0; + +fail: + if (show_progress) + flush_progress(); + + log_error("File corruption detected at %s:%"PRIu64" (of %"PRIu64" bytes, %"PRIu64"%%).", + f->path, + p, + (uint64_t) f->last_stat.st_size, + 100U * p / (uint64_t) f->last_stat.st_size); + + if (cache_data_fd) + mmap_cache_fd_free(cache_data_fd); + + if (cache_entry_fd) + mmap_cache_fd_free(cache_entry_fd); + + if (cache_entry_array_fd) + mmap_cache_fd_free(cache_entry_array_fd); + + return r; +} diff --git a/src/libsystemd/sd-journal/journal-verify.h b/src/libsystemd/sd-journal/journal-verify.h new file mode 100644 index 0000000..5790330 --- /dev/null +++ b/src/libsystemd/sd-journal/journal-verify.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journal-file.h" + +int journal_file_verify(JournalFile *f, const char *key, usec_t *first_contained, usec_t *last_validated, usec_t *last_contained, bool show_progress); diff --git a/src/libsystemd/sd-journal/lookup3.c b/src/libsystemd/sd-journal/lookup3.c new file mode 100644 index 0000000..c2a6406 --- /dev/null +++ b/src/libsystemd/sd-journal/lookup3.c @@ -0,0 +1,1002 @@ +/* SPDX-License-Identifier: 
LicenseRef-lookup3-public-domain */ +/* Slightly modified by Lennart Poettering, to avoid name clashes, and + * unexport a few functions. */ + +#include "lookup3.h" + +#if HAVE_VALGRIND_VALGRIND_H +# include <valgrind/valgrind.h> +#else +# define RUNNING_ON_VALGRIND 0 +#endif + +/* +------------------------------------------------------------------------------- +lookup3.c, by Bob Jenkins, May 2006, Public Domain. + +These are functions for producing 32-bit hashes for hash table lookup. +hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() +are externally useful functions. Routines to test the hash are included +if SELF_TEST is defined. You can use this free for any purpose. It's in +the public domain. It has no warranty. + +You probably want to use hashlittle(). hashlittle() and hashbig() +hash byte arrays. hashlittle() is faster than hashbig() on +little-endian machines. Intel and AMD are little-endian machines. +On second thought, you probably want hashlittle2(), which is identical to +hashlittle() except it returns two 32-bit hashes for the price of one. +You could implement hashbig2() if you wanted but I haven't bothered here. + +If you want to find a hash of, say, exactly 7 integers, do + a = i1; b = i2; c = i3; + mix(a,b,c); + a += i4; b += i5; c += i6; + mix(a,b,c); + a += i7; + final(a,b,c); +then use c as the hash value. If you have a variable length array of +4-byte integers to hash, use hashword(). If you have a byte array (like +a character string), use hashlittle(). If you have several byte arrays, or +a mix of things, see the comments above hashlittle(). + +Why is this so big? I read 12 bytes at a time into 3 4-byte integers, +then mix those integers. This is fast (you can do a lot more thorough +mixing with 12*3 instructions on 3 integers than you can with 3 instructions +on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+------------------------------------------------------------------------------- +*/ +/* #define SELF_TEST 1 */ + +#include <stdint.h> /* defines uint32_t etc */ +#include <stdio.h> /* defines printf for tests */ +#include <sys/param.h> /* attempt to define endianness */ +#include <time.h> /* defines time_t for timings in the test */ +#ifdef linux +# include <endian.h> /* attempt to define endianness */ +#endif + +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif + +/* + * My best guess at if you are big-endian or little-endian. This may + * need adjustment. + */ +#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \ + __BYTE_ORDER == __LITTLE_ENDIAN) || \ + (defined(i386) || defined(__i386__) || defined(__i486__) || \ + defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL)) +# define HASH_LITTLE_ENDIAN 1 +# define HASH_BIG_ENDIAN 0 +#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \ + __BYTE_ORDER == __BIG_ENDIAN) || \ + (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel)) +# define HASH_LITTLE_ENDIAN 0 +# define HASH_BIG_ENDIAN 1 +#else +# define HASH_LITTLE_ENDIAN 0 +# define HASH_BIG_ENDIAN 0 +#endif + +#define hashsize(n) ((uint32_t)1<<(n)) +#define hashmask(n) (hashsize(n)-1) +#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^.
For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used http://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. 
This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +These constants passed: + 14 11 25 16 4 14 24 + 12 14 25 16 4 14 24 +and these came close: + 4 8 15 26 3 22 24 + 10 8 15 26 3 22 24 + 11 8 15 26 3 22 24 +------------------------------------------------------------------------------- +*/ +#define final(a,b,c) \ +{ \ + c ^= b; c -= rot(b,14); \ + a ^= c; a -= rot(c,11); \ + b ^= a; b -= rot(a,25); \ + c ^= b; c -= rot(b,16); \ + a ^= c; a -= rot(c,4); \ + b ^= a; b -= rot(a,14); \ + c ^= b; c -= rot(b,24); \ +} + +/* +-------------------------------------------------------------------- + This works on all machines. To be useful, it requires + -- that the key be an array of uint32_t's, and + -- that the length be the number of uint32_t's in the key + + The function hashword() is identical to hashlittle() on little-endian + machines, and identical to hashbig() on big-endian machines, + except that the length has to be measured in uint32_ts rather than in + bytes. hashlittle() is more complicated than hashword() only because + hashlittle() has to dance around fitting the key bytes into registers. 
+-------------------------------------------------------------------- +*/ +uint32_t jenkins_hashword( +const uint32_t *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32_t initval) /* the previous hash, or an arbitrary value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + return c; +} + +/* +-------------------------------------------------------------------- +hashword2() -- same as hashword(), but take two seeds and return two +32-bit values. pc and pb must both be nonnull, and *pc and *pb must +both be initialized with seeds. If you pass in (*pb)==0, the output +(*pc) will be the same as the return value from hashword(). 
+-------------------------------------------------------------------- +*/ +void jenkins_hashword2 ( +const uint32_t *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32_t *pc, /* IN: seed OUT: primary hash value */ +uint32_t *pb) /* IN: more seed OUT: secondary hash value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc; + c += *pb; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + *pc=c; *pb=b; +} + +/* +------------------------------------------------------------------------------- +hashlittle() -- hash a variable-length key into a 32-bit value + k : the key (the unaligned variable-length array of bytes) + length : the length of the key, counting by bytes + initval : can be any 4-byte value +Returns a 32-bit value. Every bit of the key affects every bit of +the return value. Two keys differing by one or two bits will have +totally different hash values. + +The best hash table sizes are powers of 2. There is no need to do +mod a prime (mod is sooo slow!). If you need less than 32 bits, +use a bitmask. For example, if you need only 10 bits, do + h = (h & hashmask(10)); +In which case, the hash table should have hashsize(10) elements. 
+ +If you are hashing n strings (uint8_t **)k, do it like this: + for (i=0, h=0; i 12) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]&0xffffff" actually reads beyond the end of the string, but + * then masks off the part it's not allowed to read. Because the + * string is aligned, the masked-off tail is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But valgrind will + * still catch it and complain. The masking trick does make the hash + * noticeably faster for short strings (like English words). + */ +#define VALGRIND_LIKE (_unlikely_(HAS_FEATURE_ADDRESS_SANITIZER || \ + HAS_FEATURE_MEMORY_SANITIZER || \ + RUNNING_ON_VALGRIND)) + + if (!VALGRIND_LIKE) { + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff; a+=k[0]; break; + case 6 : b+=k[1]&0xffff; a+=k[0]; break; + case 5 : b+=k[1]&0xff; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff; break; + case 2 : a+=k[0]&0xffff; break; + case 1 : a+=k[0]&0xff; break; + case 0 : return c; /* zero length strings require no mixing */ + } + } else { + const uint8_t *k8 = (const uint8_t *) k; + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : 
a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ + case 1 : a+=k8[0]; break; + case 0 : return c; + } + } + + } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { + const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ + const uint8_t *k8; + + /*--------------- all but last block: aligned reads and different mixing */ + while (length > 12) + { + a += k[0] + (((uint32_t)k[1])<<16); + b += k[2] + (((uint32_t)k[3])<<16); + c += k[4] + (((uint32_t)k[5])<<16); + mix(a,b,c); + length -= 12; + k += 6; + } + + /*----------------------------- handle the last (probably partial) block */ + k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[4]+(((uint32_t)k[5])<<16); + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=k[4]; + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=k[2]; + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=k[0]; + break; + case 1 : a+=k8[0]; + break; + case 0 : return c; /* zero length requires no mixing */ + } + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + a += ((uint32_t)k[1])<<8; + a += ((uint32_t)k[2])<<16; + a += ((uint32_t)k[3])<<24; + b += k[4]; + b += ((uint32_t)k[5])<<8; + b += ((uint32_t)k[6])<<16; + b += ((uint32_t)k[7])<<24; + c += k[8]; + c += ((uint32_t)k[9])<<8; + c += ((uint32_t)k[10])<<16; + c += ((uint32_t)k[11])<<24; + 
mix(a,b,c); + length -= 12; + k += 12; + } + + /*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=((uint32_t)k[11])<<24; + case 11: c+=((uint32_t)k[10])<<16; + case 10: c+=((uint32_t)k[9])<<8; + case 9 : c+=k[8]; + case 8 : b+=((uint32_t)k[7])<<24; + case 7 : b+=((uint32_t)k[6])<<16; + case 6 : b+=((uint32_t)k[5])<<8; + case 5 : b+=k[4]; + case 4 : a+=((uint32_t)k[3])<<24; + case 3 : a+=((uint32_t)k[2])<<16; + case 2 : a+=((uint32_t)k[1])<<8; + case 1 : a+=k[0]; + break; + case 0 : return c; + } + } + + final(a,b,c); + return c; +} + +/* + * hashlittle2: return 2 32-bit hash values + * + * This is identical to hashlittle(), except it returns two 32-bit hash + * values instead of just one. This is good enough for hash table + * lookup with 2^^64 buckets, or if you want a second hash if you're not + * happy with the first, or if you want a probably-unique 64-bit ID for + * the key. *pc is better mixed than *pb, so use *pc first. If you want + * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)". 
/*
 * jenkins_hashlittle2(): hash `length` bytes at `key` using two seeds.
 *
 *   key    - start of the bytes to hash; only read, and not read at all
 *            when length is 0
 *   length - number of bytes
 *   pc     - in: first seed (added to a, b and c); out: final c
 *   pb     - in: second seed (added to c only);   out: final b
 *
 * Both pointers are dereferenced unconditionally, so neither may be NULL.
 * For length 0 no mixing is done: *pc becomes 0xdeadbeef + seed_c + seed_b
 * and *pb becomes 0xdeadbeef + seed_c.
 */
void jenkins_hashlittle2(
  const void *key,       /* the key to hash */
  size_t      length,    /* length of the key */
  uint32_t   *pc,        /* IN: primary initval, OUT: primary hash */
  uint32_t   *pb)        /* IN: secondary initval, OUT: secondary hash */
{
  uint32_t a,b,c;                                          /* internal state */
  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */

  /* Set up the internal state */
  a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc;
  c += *pb;

  /* u.i exposes the address bits of key: low two bits clear means the key
   * is 4-byte aligned, low bit clear means it is 2-byte aligned */
  u.ptr = key;
  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */

    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
    /* "> 12", not ">= 12": a final block of 1..12 bytes is always left for
     * the switch below (0 bytes only when the key itself was empty) */
    while (length > 12)
    {
      a += k[0];
      b += k[1];
      c += k[2];
      mix(a,b,c);
      length -= 12;
      k += 3;
    }

    /*----------------------------- handle the last (probably partial) block */
    /*
     * "k[2]&0xffffff" actually reads beyond the end of the string, but
     * then masks off the part it's not allowed to read.  Because the
     * string is aligned, the masked-off tail is in the same word as the
     * rest of the string.  Every machine with memory protection I've seen
     * does it on word boundaries, so is OK with this.  But valgrind will
     * still catch it and complain.  The masking trick does make the hash
     * noticeably faster for short strings (like English words).
     */
    /* VALGRIND_LIKE is not defined in this function; it has to be in
     * scope from earlier in the file. */
    if (!VALGRIND_LIKE) {
      /* whole-word reads, surplus high bytes masked off */
      switch(length)
      {
      case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
      case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
      case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
      case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
      case 8 : b+=k[1]; a+=k[0]; break;
      case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
      case 6 : b+=k[1]&0xffff; a+=k[0]; break;
      case 5 : b+=k[1]&0xff; a+=k[0]; break;
      case 4 : a+=k[0]; break;
      case 3 : a+=k[0]&0xffffff; break;
      case 2 : a+=k[0]&0xffff; break;
      case 1 : a+=k[0]&0xff; break;
      case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
      }
    } else {
      /* same sums as above, but the trailing bytes are read one at a time
       * so that nothing past the end of the key is touched */
      const uint8_t *k8 = (const uint8_t *)k;

      switch(length)
      {
      case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
      case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
      case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
      case 9 : c+=k8[8];                   /* fall through */
      case 8 : b+=k[1]; a+=k[0]; break;
      case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
      case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
      case 5 : b+=k8[4];                   /* fall through */
      case 4 : a+=k[0]; break;
      case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
      case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
      case 1 : a+=k8[0]; break;
      case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
      }
    }

  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
    const uint8_t  *k8;

    /*--------------- all but last block: aligned reads and different mixing */
    /* two 16-bit halves are combined into each 32-bit word, low half first */
    while (length > 12)
    {
      a += k[0] + (((uint32_t)k[1])<<16);
      b += k[2] + (((uint32_t)k[3])<<16);
      c += k[4] + (((uint32_t)k[5])<<16);
      mix(a,b,c);
      length -= 12;
      k += 6;
    }

    /*----------------------------- handle the last (probably partial) block */
    /* k8 aliases the same tail so an odd trailing byte can be read alone */
    k8 = (const uint8_t *)k;
    switch(length)
    {
    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
             b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
    case 10: c+=k[4];
             b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 9 : c+=k8[8];                      /* fall through */
    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
    case 6 : b+=k[2];
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 5 : b+=k8[4];                      /* fall through */
    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
    case 2 : a+=k[0];
             break;
    case 1 : a+=k8[0];
             break;
    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
    }

  } else {                        /* need to read the key one byte at a time */
    const uint8_t *k = (const uint8_t *)key;

    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
    /* bytes are assembled least-significant first into each 32-bit word */
    while (length > 12)
    {
      a += k[0];
      a += ((uint32_t)k[1])<<8;
      a += ((uint32_t)k[2])<<16;
      a += ((uint32_t)k[3])<<24;
      b += k[4];
      b += ((uint32_t)k[5])<<8;
      b += ((uint32_t)k[6])<<16;
      b += ((uint32_t)k[7])<<24;
      c += k[8];
      c += ((uint32_t)k[9])<<8;
      c += ((uint32_t)k[10])<<16;
      c += ((uint32_t)k[11])<<24;
      mix(a,b,c);
      length -= 12;
      k += 12;
    }

    /*-------------------------------- last block: affect all 32 bits of (c) */
    switch(length)                   /* all the case statements fall through */
    {
    case 12: c+=((uint32_t)k[11])<<24;
    case 11: c+=((uint32_t)k[10])<<16;
    case 10: c+=((uint32_t)k[9])<<8;
    case 9 : c+=k[8];
    case 8 : b+=((uint32_t)k[7])<<24;
    case 7 : b+=((uint32_t)k[6])<<16;
    case 6 : b+=((uint32_t)k[5])<<8;
    case 5 : b+=k[4];
    case 4 : a+=((uint32_t)k[3])<<24;
    case 3 : a+=((uint32_t)k[2])<<16;
    case 2 : a+=((uint32_t)k[1])<<8;
    case 1 : a+=k[0];
             break;
    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
    }
  }

  /* every non-empty key ends up here: one final scramble, then publish
   * c as the primary and b as the secondary result */
  final(a,b,c);
  *pc=c; *pb=b;
}

/*
 * hashbig():
 * This is the same as hashword() on big-endian machines.
It is different + * from hashlittle() on all machines. hashbig() takes advantage of + * big-endian byte ordering. + */ +uint32_t jenkins_hashbig( const void *key, size_t length, uint32_t initval) +{ + uint32_t a,b,c; + union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */ + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; + + u.ptr = key; + if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) { + const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ + + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]<<8" actually reads beyond the end of the string, but + * then shifts out the part it's not allowed to read. Because the + * string is aligned, the illegal read is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But valgrind will + * still catch it and complain. The masking trick does make the hash + * noticeably faster for short strings (like English words). 
+ */ + + if (!VALGRIND_LIKE) { + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff00; a+=k[0]; break; + case 6 : b+=k[1]&0xffff0000; a+=k[0]; break; + case 5 : b+=k[1]&0xff000000; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff00; break; + case 2 : a+=k[0]&0xffff0000; break; + case 1 : a+=k[0]&0xff000000; break; + case 0 : return c; /* zero length strings require no mixing */ + } + } else { + const uint8_t *k8 = (const uint8_t *)k; + + switch(length) /* all the case statements fall through */ + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<8; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<16; /* fall through */ + case 9 : c+=((uint32_t)k8[8])<<24; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<8; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<16; /* fall through */ + case 5 : b+=((uint32_t)k8[4])<<24; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t)k8[2])<<8; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<16; /* fall through */ + case 1 : a+=((uint32_t)k8[0])<<24; break; + case 0 : return c; + } + } + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += ((uint32_t)k[0])<<24; + a += ((uint32_t)k[1])<<16; + a += ((uint32_t)k[2])<<8; + a += ((uint32_t)k[3]); + b += ((uint32_t)k[4])<<24; + b += ((uint32_t)k[5])<<16; + b += ((uint32_t)k[6])<<8; + b += ((uint32_t)k[7]); + c += ((uint32_t)k[8])<<24; + c += ((uint32_t)k[9])<<16; + c += ((uint32_t)k[10])<<8; + c += ((uint32_t)k[11]); + mix(a,b,c); + length -= 12; + k += 12; + } + + 
/*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=k[11]; + case 11: c+=((uint32_t)k[10])<<8; + case 10: c+=((uint32_t)k[9])<<16; + case 9 : c+=((uint32_t)k[8])<<24; + case 8 : b+=k[7]; + case 7 : b+=((uint32_t)k[6])<<8; + case 6 : b+=((uint32_t)k[5])<<16; + case 5 : b+=((uint32_t)k[4])<<24; + case 4 : a+=k[3]; + case 3 : a+=((uint32_t)k[2])<<8; + case 2 : a+=((uint32_t)k[1])<<16; + case 1 : a+=((uint32_t)k[0])<<24; + break; + case 0 : return c; + } + } + + final(a,b,c); + return c; +} + +#ifdef SELF_TEST + +/* used for timings */ +void driver1() +{ + uint8_t buf[256]; + uint32_t i; + uint32_t h=0; + time_t a,z; + + time(&a); + for (i=0; i<256; ++i) buf[i] = 'x'; + for (i=0; i<1; ++i) + { + h = hashlittle(&buf[0],1,h); + } + time(&z); + if (z-a > 0) printf("time %d %.8x\n", z-a, h); +} + +/* check that every input bit changes every output bit half the time */ +#define HASHSTATE 1 +#define HASHLEN 1 +#define MAXPAIR 60 +#define MAXLEN 70 +void driver2() +{ + uint8_t qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1]; + uint32_t c[HASHSTATE], d[HASHSTATE], i=0, j=0, k, l, m=0, z; + uint32_t e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE]; + uint32_t x[HASHSTATE],y[HASHSTATE]; + uint32_t hlen; + + printf("No more than %d trials should ever be needed \n",MAXPAIR/2); + for (hlen=0; hlen < MAXLEN; ++hlen) + { + z=0; + for (i=0; i>(8-j)); + c[0] = hashlittle(a, hlen, m); + b[i] ^= ((k+1)<>(8-j)); + d[0] = hashlittle(b, hlen, m); + /* check every bit is 1, 0, set, and not set at least once */ + for (l=0; lz) z=k; + if (k==MAXPAIR) + { + printf("Some bit didn't change: "); + printf("%.8x %.8x %.8x %.8x %.8x %.8x ", + e[0],f[0],g[0],h[0],x[0],y[0]); + printf("i %d j %d m %d len %d\n", i, j, m, hlen); + } + if (z==MAXPAIR) goto done; + } + } + } + done: + if (z < MAXPAIR) + { + printf("Mix success %2d bytes %2d initvals ",i,m); + printf("required %d trials\n", z/2); + 
} + } + printf("\n"); +} + +/* Check for reading beyond the end of the buffer and alignment problems */ +void driver3() +{ + uint8_t buf[MAXLEN+20], *b; + uint32_t len; + uint8_t q[] = "This is the time for all good men to come to the aid of their country..."; + uint32_t h; + uint8_t qq[] = "xThis is the time for all good men to come to the aid of their country..."; + uint32_t i; + uint8_t qqq[] = "xxThis is the time for all good men to come to the aid of their country..."; + uint32_t j; + uint8_t qqqq[] = "xxxThis is the time for all good men to come to the aid of their country..."; + uint32_t ref,x,y; + uint8_t *p; + + printf("Endianness. These lines should all be the same (for values filled in):\n"); + printf("%.8x %.8x %.8x\n", + hashword((const uint32_t *)q, (sizeof(q)-1)/4, 13), + hashword((const uint32_t *)q, (sizeof(q)-5)/4, 13), + hashword((const uint32_t *)q, (sizeof(q)-9)/4, 13)); + p = q; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + p = &qq[1]; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + p = &qqq[2]; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), 
hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + p = &qqqq[3]; + printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n", + hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13), + hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13), + hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13), + hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13), + hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13), + hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13)); + printf("\n"); + + /* check that hashlittle2 and hashlittle produce the same results */ + i=47; j=0; + hashlittle2(q, sizeof(q), &i, &j); + if (hashlittle(q, sizeof(q), 47) != i) + printf("hashlittle2 and hashlittle mismatch\n"); + + /* check that hashword2 and hashword produce the same results */ + len = 0xdeadbeef; + i=47, j=0; + hashword2(&len, 1, &i, &j); + if (hashword(&len, 1, 47) != i) + printf("hashword2 and hashword mismatch %x %x\n", + i, hashword(&len, 1, 47)); + + /* check hashlittle doesn't read before or after the ends of the string */ + for (h=0, b=buf+1; h<8; ++h, ++b) + { + for (i=0; i +#include + +#include "macro.h" + +uint32_t jenkins_hashword(const uint32_t *k, size_t length, uint32_t initval) _pure_; +void jenkins_hashword2(const uint32_t *k, size_t length, uint32_t *pc, uint32_t *pb); + +uint32_t jenkins_hashlittle(const void *key, size_t length, uint32_t initval) _pure_; +void jenkins_hashlittle2(const void *key, size_t length, uint32_t *pc, uint32_t *pb); + +uint32_t jenkins_hashbig(const void *key, size_t length, uint32_t initval) _pure_; + +static inline uint64_t jenkins_hash64(const void *data, 
size_t length) { + uint32_t a = 0, b = 0; + + jenkins_hashlittle2(data, length, &a, &b); + + return ((uint64_t) a << 32ULL) | (uint64_t) b; +} diff --git a/src/libsystemd/sd-journal/mmap-cache.c b/src/libsystemd/sd-journal/mmap-cache.c new file mode 100644 index 0000000..973ade6 --- /dev/null +++ b/src/libsystemd/sd-journal/mmap-cache.c @@ -0,0 +1,562 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "hashmap.h" +#include "list.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "mmap-cache.h" +#include "sigbus.h" + +typedef struct Window Window; + +typedef enum WindowFlags { + WINDOW_KEEP_ALWAYS = 1u << (_MMAP_CACHE_CATEGORY_MAX + 0), + WINDOW_IN_UNUSED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 1), + WINDOW_INVALIDATED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 2), + + _WINDOW_USED_MASK = WINDOW_IN_UNUSED - 1, /* The mask contains all bits that indicate the windows + * is currently in use. Covers the all the object types + * and the additional WINDOW_KEEP_ALWAYS flag. */ +} WindowFlags; + +#define WINDOW_IS_UNUSED(w) (((w)->flags & _WINDOW_USED_MASK) == 0) + +struct Window { + MMapFileDescriptor *fd; + + WindowFlags flags; + + void *ptr; + uint64_t offset; + size_t size; + + LIST_FIELDS(Window, windows); + LIST_FIELDS(Window, unused); +}; + +struct MMapFileDescriptor { + MMapCache *cache; + + int fd; + int prot; + bool sigbus; + + LIST_HEAD(Window, windows); +}; + +struct MMapCache { + unsigned n_ref; + unsigned n_windows; + + unsigned n_category_cache_hit; + unsigned n_window_list_hit; + unsigned n_missed; + + Hashmap *fds; + + LIST_HEAD(Window, unused); + Window *last_unused; + + Window *windows_by_category[_MMAP_CACHE_CATEGORY_MAX]; +}; + +#define WINDOWS_MIN 64 + +#if ENABLE_DEBUG_MMAP_CACHE +/* Tiny windows increase mmap activity and the chance of exposing unsafe use. 
*/ +# define WINDOW_SIZE (page_size()) +#else +# define WINDOW_SIZE ((size_t) (UINT64_C(8) * UINT64_C(1024) * UINT64_C(1024))) +#endif + +MMapCache* mmap_cache_new(void) { + MMapCache *m; + + m = new(MMapCache, 1); + if (!m) + return NULL; + + *m = (MMapCache) { + .n_ref = 1, + }; + + return m; +} + +static Window* window_unlink(Window *w) { + assert(w); + + MMapCache *m = mmap_cache_fd_cache(w->fd); + + if (w->ptr) + munmap(w->ptr, w->size); + + if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) { + if (m->last_unused == w) + m->last_unused = w->unused_prev; + LIST_REMOVE(unused, m->unused, w); + } + + for (unsigned i = 0; i < _MMAP_CACHE_CATEGORY_MAX; i++) + if (FLAGS_SET(w->flags, 1u << i)) + assert_se(TAKE_PTR(m->windows_by_category[i]) == w); + + return LIST_REMOVE(windows, w->fd->windows, w); +} + +static void window_invalidate(Window *w) { + assert(w); + assert(w->fd); + + if (FLAGS_SET(w->flags, WINDOW_INVALIDATED)) + return; + + /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure + * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. 
*/ + + assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr); + w->flags |= WINDOW_INVALIDATED; +} + +static Window* window_free(Window *w) { + if (!w) + return NULL; + + window_unlink(w); + w->fd->cache->n_windows--; + + return mfree(w); +} + +static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, size_t size) { + assert(size > 0); + + return + w && + f == w->fd && + offset >= w->offset && + offset + size <= w->offset + w->size; +} + +static bool window_matches_by_addr(Window *w, MMapFileDescriptor *f, void *addr, size_t size) { + assert(size > 0); + + return + w && + f == w->fd && + (uint8_t*) addr >= (uint8_t*) w->ptr && + (uint8_t*) addr + size <= (uint8_t*) w->ptr + w->size; +} + +static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) { + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + + if (!m->last_unused || m->n_windows <= WINDOWS_MIN) { + /* Allocate a new window */ + w = new(Window, 1); + if (!w) + return NULL; + m->n_windows++; + } else + /* Reuse an existing one */ + w = window_unlink(m->last_unused); + + *w = (Window) { + .fd = f, + .offset = offset, + .size = size, + .ptr = ptr, + }; + + return LIST_PREPEND(windows, f->windows, w); +} + +static void category_detach_window(MMapCache *m, MMapCacheCategory c) { + Window *w; + + assert(m); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + + w = TAKE_PTR(m->windows_by_category[c]); + if (!w) + return; /* Nothing attached. */ + + assert(FLAGS_SET(w->flags, 1u << c)); + w->flags &= ~(1u << c); + + if (WINDOW_IS_UNUSED(w)) { + /* Not used anymore? */ +#if ENABLE_DEBUG_MMAP_CACHE + /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. 
*/ + window_free(w); +#else + LIST_PREPEND(unused, m->unused, w); + if (!m->last_unused) + m->last_unused = w; + w->flags |= WINDOW_IN_UNUSED; +#endif + } +} + +static void category_attach_window(MMapCache *m, MMapCacheCategory c, Window *w) { + assert(m); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(w); + + if (m->windows_by_category[c] == w) + return; /* Already attached. */ + + category_detach_window(m, c); + + if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) { + /* Used again? */ + if (m->last_unused == w) + m->last_unused = w->unused_prev; + LIST_REMOVE(unused, m->unused, w); + w->flags &= ~WINDOW_IN_UNUSED; + } + + m->windows_by_category[c] = w; + w->flags |= (1u << c); +} + +static MMapCache* mmap_cache_free(MMapCache *m) { + if (!m) + return NULL; + + /* All windows are owned by fds, and each fd takes a reference of MMapCache. So, when this is called, + * all fds are already freed, and hence there is no window. */ + + assert(hashmap_isempty(m->fds)); + hashmap_free(m->fds); + + assert(!m->unused); + assert(m->n_windows == 0); + + return mfree(m); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free); + +static int mmap_try_harder(MMapFileDescriptor *f, void *addr, int flags, uint64_t offset, size_t size, void **ret) { + MMapCache *m = mmap_cache_fd_cache(f); + + assert(ret); + + for (;;) { + void *ptr; + + ptr = mmap(addr, size, f->prot, flags, f->fd, offset); + if (ptr != MAP_FAILED) { + *ret = ptr; + return 0; + } + if (errno != ENOMEM) + return negative_errno(); + + /* When failed with ENOMEM, try again after making a room by freeing an unused window. */ + + if (!m->last_unused) + return -ENOMEM; /* no free window, propagate the original error. 
*/ + + window_free(m->last_unused); + } +} + +static int add_mmap( + MMapFileDescriptor *f, + uint64_t offset, + size_t size, + struct stat *st, + Window **ret) { + + Window *w; + void *d; + int r; + + assert(f); + assert(size > 0); + assert(ret); + + /* overflow check */ + if (size > SIZE_MAX - PAGE_OFFSET_U64(offset)) + return -EADDRNOTAVAIL; + + size = PAGE_ALIGN(size + PAGE_OFFSET_U64(offset)); + offset = PAGE_ALIGN_DOWN_U64(offset); + + if (size < WINDOW_SIZE) { + uint64_t delta; + + delta = PAGE_ALIGN((WINDOW_SIZE - size) / 2); + offset = LESS_BY(offset, delta); + size = WINDOW_SIZE; + } + + if (st) { + /* Memory maps that are larger then the files underneath have undefined behavior. Hence, + * clamp things to the file size if we know it */ + + if (offset >= (uint64_t) st->st_size) + return -EADDRNOTAVAIL; + + if (size > (uint64_t) st->st_size - offset) + size = PAGE_ALIGN((uint64_t) st->st_size - offset); + } + + if (size >= SIZE_MAX) + return -EADDRNOTAVAIL; + + r = mmap_try_harder(f, NULL, MAP_SHARED, offset, size, &d); + if (r < 0) + return r; + + w = window_add(f, offset, size, d); + if (!w) { + (void) munmap(d, size); + return -ENOMEM; + } + + *ret = w; + return 0; +} + +int mmap_cache_fd_get( + MMapFileDescriptor *f, + MMapCacheCategory c, + bool keep_always, + uint64_t offset, + size_t size, + struct stat *st, + void **ret) { + + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + int r; + + assert(size > 0); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(ret); + + if (f->sigbus) + return -EIO; + + /* Check whether the current category is the right one already */ + if (window_matches(m->windows_by_category[c], f, offset, size)) { + m->n_category_cache_hit++; + w = m->windows_by_category[c]; + goto found; + } + + /* Drop the reference to the window, since it's unnecessary now */ + category_detach_window(m, c); + + /* Search for a matching mmap */ + LIST_FOREACH(windows, i, f->windows) + if (window_matches(i, f, offset, size)) { + 
m->n_window_list_hit++; + w = i; + goto found; + } + + m->n_missed++; + + /* Create a new mmap */ + r = add_mmap(f, offset, size, st, &w); + if (r < 0) + return r; + +found: + if (keep_always) + w->flags |= WINDOW_KEEP_ALWAYS; + + category_attach_window(m, c, w); + *ret = (uint8_t*) w->ptr + (offset - w->offset); + return 0; +} + +int mmap_cache_fd_pin( + MMapFileDescriptor *f, + MMapCacheCategory c, + void *addr, + size_t size) { + + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + + assert(addr); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(size > 0); + + if (f->sigbus) + return -EIO; + + /* Check if the current category is the right one. */ + if (window_matches_by_addr(m->windows_by_category[c], f, addr, size)) { + m->n_category_cache_hit++; + w = m->windows_by_category[c]; + goto found; + } + + /* Search for a matching mmap. */ + LIST_FOREACH(windows, i, f->windows) + if (window_matches_by_addr(i, f, addr, size)) { + m->n_window_list_hit++; + w = i; + goto found; + } + + m->n_missed++; + return -EADDRNOTAVAIL; /* Not found. */ + +found: + if (FLAGS_SET(w->flags, WINDOW_KEEP_ALWAYS)) + return 0; /* The window will never be unmapped. */ + + /* Attach the window to the 'pinning' category. */ + category_attach_window(m, MMAP_CACHE_CATEGORY_PIN, w); + return 1; +} + +void mmap_cache_stats_log_debug(MMapCache *m) { + assert(m); + + log_debug("mmap cache statistics: %u category cache hit, %u window list hit, %u miss", + m->n_category_cache_hit, m->n_window_list_hit, m->n_missed); +} + +static void mmap_cache_process_sigbus(MMapCache *m) { + bool found = false; + MMapFileDescriptor *f; + int r; + + assert(m); + + /* Iterate through all triggered pages and mark their files as invalidated. 
*/ + for (;;) { + bool ours; + void *addr; + + r = sigbus_pop(&addr); + if (_likely_(r == 0)) + break; + if (r < 0) { + log_error_errno(r, "SIGBUS handling failed: %m"); + abort(); + } + + ours = false; + HASHMAP_FOREACH(f, m->fds) { + LIST_FOREACH(windows, w, f->windows) + if (window_matches_by_addr(w, f, addr, 1)) { + found = ours = f->sigbus = true; + break; + } + + if (ours) + break; + } + + /* Didn't find a matching window, give up. */ + if (!ours) { + log_error("Unknown SIGBUS page, aborting."); + abort(); + } + } + + /* The list of triggered pages is now empty. Now, let's remap all windows of the triggered file to + * anonymous maps, so that no page of the file in question is triggered again, so that we can be sure + * not to hit the queue size limit. */ + if (_likely_(!found)) + return; + + HASHMAP_FOREACH(f, m->fds) { + if (!f->sigbus) + continue; + + LIST_FOREACH(windows, w, f->windows) + window_invalidate(w); + } +} + +bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f) { + assert(f); + + mmap_cache_process_sigbus(f->cache); + + return f->sigbus; +} + +int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret) { + _cleanup_free_ MMapFileDescriptor *f = NULL; + MMapFileDescriptor *existing; + int r; + + assert(m); + assert(fd >= 0); + + existing = hashmap_get(m->fds, FD_TO_PTR(fd)); + if (existing) { + if (existing->prot != prot) + return -EEXIST; + if (ret) + *ret = existing; + return 0; + } + + f = new(MMapFileDescriptor, 1); + if (!f) + return -ENOMEM; + + *f = (MMapFileDescriptor) { + .fd = fd, + .prot = prot, + }; + + r = hashmap_ensure_put(&m->fds, NULL, FD_TO_PTR(fd), f); + if (r < 0) + return r; + assert(r > 0); + + f->cache = mmap_cache_ref(m); + + if (ret) + *ret = f; + + TAKE_PTR(f); + return 1; +} + +MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f) { + if (!f) + return NULL; + + /* Make sure that any queued SIGBUS are first dispatched, so that we don't end up with a SIGBUS entry + * we cannot relate to any 
existing memory map. */ + + mmap_cache_process_sigbus(f->cache); + + while (f->windows) + window_free(f->windows); + + assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)) == f); + + /* Unref the cache at the end. Otherwise, the assertions in mmap_cache_free() may be triggered. */ + f->cache = mmap_cache_unref(f->cache); + + return mfree(f); +} + +MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f) { + assert(f); + return ASSERT_PTR(f->cache); +} diff --git a/src/libsystemd/sd-journal/mmap-cache.h b/src/libsystemd/sd-journal/mmap-cache.h new file mode 100644 index 0000000..1fbc236 --- /dev/null +++ b/src/libsystemd/sd-journal/mmap-cache.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "journal-def.h" + +typedef struct MMapCache MMapCache; +typedef struct MMapFileDescriptor MMapFileDescriptor; + +typedef enum MMapCacheCategory { + MMAP_CACHE_CATEGORY_ANY = OBJECT_UNUSED, + MMAP_CACHE_CATEGORY_DATA = OBJECT_DATA, + MMAP_CACHE_CATEGORY_FIELD = OBJECT_FIELD, + MMAP_CACHE_CATEGORY_ENTRY = OBJECT_ENTRY, + MMAP_CACHE_CATEGORY_DATA_HASH_TABLE = OBJECT_DATA_HASH_TABLE, + MMAP_CACHE_CATEGORY_FIELD_HASH_TABLE = OBJECT_FIELD_HASH_TABLE, + MMAP_CACHE_CATEGORY_ENTRY_ARRAY = OBJECT_ENTRY_ARRAY, + MMAP_CACHE_CATEGORY_TAG = OBJECT_TAG, + MMAP_CACHE_CATEGORY_HEADER, /* for reading file header */ + MMAP_CACHE_CATEGORY_PIN, /* for temporarily pinning an object */ + _MMAP_CACHE_CATEGORY_MAX, + _MMAP_CACHE_CATEGORY_INVALID = -EINVAL, +} MMapCacheCategory; + +assert_cc((int) _OBJECT_TYPE_MAX < (int) _MMAP_CACHE_CATEGORY_MAX); + +static inline MMapCacheCategory type_to_category(ObjectType type) { + return type >= 0 && type < _OBJECT_TYPE_MAX ? 
(MMapCacheCategory) type : MMAP_CACHE_CATEGORY_ANY; +} + +MMapCache* mmap_cache_new(void); +MMapCache* mmap_cache_ref(MMapCache *m); +MMapCache* mmap_cache_unref(MMapCache *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(MMapCache*, mmap_cache_unref); + +int mmap_cache_fd_get( + MMapFileDescriptor *f, + MMapCacheCategory c, + bool keep_always, + uint64_t offset, + size_t size, + struct stat *st, + void **ret); + +int mmap_cache_fd_pin( + MMapFileDescriptor *f, + MMapCacheCategory c, + void *addr, + size_t size); + +int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret); +MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f); +MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f); + +void mmap_cache_stats_log_debug(MMapCache *m); + +bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f); diff --git a/src/libsystemd/sd-journal/sd-journal.c b/src/libsystemd/sd-journal/sd-journal.c new file mode 100644 index 0000000..6b9ff0a --- /dev/null +++ b/src/libsystemd/sd-journal/sd-journal.c @@ -0,0 +1,3528 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "catalog.h" +#include "compress.h" +#include "dirent-util.h" +#include "env-file.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "inotify-util.h" +#include "io-util.h" +#include "journal-def.h" +#include "journal-file.h" +#include "journal-internal.h" +#include "list.h" +#include "lookup3.h" +#include "nulstr-util.h" +#include "origin-id.h" +#include "path-util.h" +#include "prioq.h" +#include "process-util.h" +#include "replace-var.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "uid-alloc-range.h" + +#define 
JOURNAL_FILES_RECHECK_USEC (2 * USEC_PER_SEC) + +/* The maximum size of variable values we'll expand in catalog entries. We bind this to PATH_MAX for now, as + * we want to be able to show all officially valid paths at least */ +#define REPLACE_VAR_MAX PATH_MAX + +#define DEFAULT_DATA_THRESHOLD (64*1024) + +DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_journal, journal); + +static void remove_file_real(sd_journal *j, JournalFile *f); +static int journal_file_read_tail_timestamp(sd_journal *j, JournalFile *f); +static void journal_file_unlink_newest_by_boot_id(sd_journal *j, JournalFile *f); + +static int journal_put_error(sd_journal *j, int r, const char *path) { + _cleanup_free_ char *copy = NULL; + int k; + + /* Memorize an error we encountered, and store which + * file/directory it was generated from. Note that we store + * only *one* path per error code, as the error code is the + * key into the hashmap, and the path is the value. This means + * we keep track only of all error kinds, but not of all error + * locations. This has the benefit that the hashmap cannot + * grow beyond bounds. + * + * We return an error here only if we didn't manage to + * memorize the real error. 
*/ + + if (r >= 0) + return r; + + if (path) { + copy = strdup(path); + if (!copy) + return -ENOMEM; + } + + k = hashmap_ensure_put(&j->errors, NULL, INT_TO_PTR(r), copy); + if (k < 0) { + if (k == -EEXIST) + return 0; + + return k; + } + + TAKE_PTR(copy); + return 0; +} + +static void detach_location(sd_journal *j) { + JournalFile *f; + + assert(j); + + j->current_file = NULL; + j->current_field = 0; + + ORDERED_HASHMAP_FOREACH(f, j->files) + journal_file_reset_location(f); +} + +static void init_location(Location *l, LocationType type, JournalFile *f, Object *o) { + assert(l); + assert(IN_SET(type, LOCATION_DISCRETE, LOCATION_SEEK)); + assert(f); + + *l = (Location) { + .type = type, + .seqnum = le64toh(o->entry.seqnum), + .seqnum_id = f->header->seqnum_id, + .realtime = le64toh(o->entry.realtime), + .monotonic = le64toh(o->entry.monotonic), + .boot_id = o->entry.boot_id, + .xor_hash = le64toh(o->entry.xor_hash), + .seqnum_set = true, + .realtime_set = true, + .monotonic_set = true, + .xor_hash_set = true, + }; +} + +static void set_location(sd_journal *j, JournalFile *f, Object *o) { + assert(j); + assert(f); + assert(o); + + init_location(&j->current_location, LOCATION_DISCRETE, f, o); + + j->current_file = f; + j->current_field = 0; + + /* Let f know its candidate entry was picked. 
*/ + assert(f->location_type == LOCATION_SEEK); + f->location_type = LOCATION_DISCRETE; +} + +static int match_is_valid(const void *data, size_t size) { + const char *b = ASSERT_PTR(data); + + if (size < 2) + return false; + + if (((char*) data)[0] == '_' && ((char*) data)[1] == '_') + return false; + + for (const char *p = b; p < b + size; p++) { + + if (*p == '=') + return p > b; + + if (*p == '_') + continue; + + if (*p >= 'A' && *p <= 'Z') + continue; + + if (ascii_isdigit(*p)) + continue; + + return false; + } + + return false; +} + +static bool same_field(const void *_a, size_t s, const void *_b, size_t t) { + const uint8_t *a = _a, *b = _b; + + for (size_t j = 0; j < s && j < t; j++) { + + if (a[j] != b[j]) + return false; + + if (a[j] == '=') + return true; + } + + assert_not_reached(); +} + +static Match *match_new(Match *p, MatchType t) { + Match *m; + + m = new(Match, 1); + if (!m) + return NULL; + + *m = (Match) { + .type = t, + .parent = p, + }; + + if (p) + LIST_PREPEND(matches, p->matches, m); + + return m; +} + +static Match *match_free(Match *m) { + if (!m) + return NULL; /* NULL is a no-op: the OOM path of sd_journal_add_match() frees a match that may never have been allocated */ + while (m->matches) + match_free(m->matches); + + if (m->parent) + LIST_REMOVE(matches, m->parent->matches, m); + + free(m->data); + return mfree(m); +} + +static Match *match_free_if_empty(Match *m) { + if (!m || m->matches) + return m; + + return match_free(m); +} + +_public_ int sd_journal_add_match(sd_journal *j, const void *data, size_t size) { + Match *add_here = NULL, *m = NULL; + uint64_t hash; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(data, -EINVAL); + + if (size == 0) + size = strlen(data); + + if (!match_is_valid(data, size)) + return -EINVAL; + + /* level 0: AND term + * level 1: OR terms + * level 2: AND terms + * level 3: OR terms + * level 4: concrete matches */ + + if (!j->level0) { + j->level0 = match_new(NULL, MATCH_AND_TERM); + if (!j->level0) + return -ENOMEM; + } + + if (!j->level1) { + j->level1 = 
match_new(j->level0, MATCH_OR_TERM); + if (!j->level1) + return -ENOMEM; + } + + if (!j->level2) { + j->level2 = match_new(j->level1, MATCH_AND_TERM); + if (!j->level2) + return -ENOMEM; + } + + assert(j->level0->type == MATCH_AND_TERM); + assert(j->level1->type == MATCH_OR_TERM); + assert(j->level2->type == MATCH_AND_TERM); + + /* Old-style Jenkins (unkeyed) hashing only here. We do not cover new-style siphash (keyed) hashing + * here, since it's different for each file, and thus can't be pre-calculated in the Match object. */ + hash = jenkins_hash64(data, size); + + LIST_FOREACH(matches, l3, j->level2->matches) { + assert(l3->type == MATCH_OR_TERM); + + LIST_FOREACH(matches, l4, l3->matches) { + assert(l4->type == MATCH_DISCRETE); + + /* Exactly the same match already? Then ignore + * this addition */ + if (l4->hash == hash && + l4->size == size && + memcmp(l4->data, data, size) == 0) + return 0; + + /* Same field? Then let's add this to this OR term */ + if (same_field(data, size, l4->data, l4->size)) { + add_here = l3; + break; + } + } + + if (add_here) + break; + } + + if (!add_here) { + add_here = match_new(j->level2, MATCH_OR_TERM); + if (!add_here) + goto fail; + } + + m = match_new(add_here, MATCH_DISCRETE); + if (!m) + goto fail; + + m->hash = hash; + m->size = size; + m->data = memdup(data, size); + if (!m->data) + goto fail; + + detach_location(j); + + return 0; + +fail: + match_free(m); + match_free_if_empty(add_here); + j->level2 = match_free_if_empty(j->level2); + j->level1 = match_free_if_empty(j->level1); + j->level0 = match_free_if_empty(j->level0); + + return -ENOMEM; +} + +_public_ int sd_journal_add_conjunction(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + if (!j->level0) + return 0; + + if (!j->level1) + return 0; + + if (!j->level1->matches) + return 0; + + j->level1 = NULL; + j->level2 = NULL; + + return 0; +} + +_public_ int sd_journal_add_disjunction(sd_journal *j) { + 
assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + if (!j->level0) + return 0; + + if (!j->level1) + return 0; + + if (!j->level2) + return 0; + + if (!j->level2->matches) + return 0; + + j->level2 = NULL; + return 0; +} + +static char *match_make_string(Match *m) { + _cleanup_free_ char *p = NULL; + bool enclose = false; + + if (!m) + return strdup("none"); + + if (m->type == MATCH_DISCRETE) + return cescape_length(m->data, m->size); + + LIST_FOREACH(matches, i, m->matches) { + _cleanup_free_ char *t = NULL; + + t = match_make_string(i); + if (!t) + return NULL; + + if (p) { + if (!strextend(&p, m->type == MATCH_OR_TERM ? " OR " : " AND ", t)) + return NULL; + + enclose = true; + } else + p = TAKE_PTR(t); + } + + if (enclose) + return strjoin("(", p, ")"); + + return TAKE_PTR(p); +} + +char *journal_make_match_string(sd_journal *j) { + assert(j); + + return match_make_string(j->level0); +} + +_public_ void sd_journal_flush_matches(sd_journal *j) { + if (!j || journal_origin_changed(j)) + return; + + if (j->level0) + match_free(j->level0); + + j->level0 = j->level1 = j->level2 = NULL; + + detach_location(j); +} + +static int journal_file_find_newest_for_boot_id( + sd_journal *j, + sd_id128_t id, + JournalFile **ret) { + + JournalFile *prev = NULL; + int r; + + assert(j); + assert(ret); + + /* Before we use it, let's refresh the timestamp from the header, and reshuffle our prioq + * accordingly. We do this only a bunch of times, to not be caught in some update loop. 
*/ + for (unsigned n_tries = 0;; n_tries++) { + JournalFile *f; + Prioq *q; + + q = hashmap_get(j->newest_by_boot_id, &id); + if (!q) + return log_debug_errno(SYNTHETIC_ERRNO(ENODATA), + "Requested delta for boot ID %s, but we have no information about that boot ID.", SD_ID128_TO_STRING(id)); + + assert_se(f = prioq_peek(q)); /* we delete hashmap entries once the prioq is empty, so this must hold */ + + if (f == prev || n_tries >= 5) { + /* This was already the best answer in the previous run, or we tried too often, use it */ + *ret = f; + return 0; + } + + prev = f; + + /* Let's read the journal file's current timestamp once, before we return it, maybe it has changed. */ + r = journal_file_read_tail_timestamp(j, f); + if (r < 0) + return log_debug_errno(r, "Failed to read tail timestamp while trying to find newest journal file for boot ID %s.", SD_ID128_TO_STRING(id)); + + /* Refreshing the timestamp we read might have reshuffled the prioq, hence let's check the + * prioq again and only use the information once we reached an equilibrium or hit a limit */ + } +} + +static int compare_boot_ids(sd_journal *j, sd_id128_t a, sd_id128_t b) { + JournalFile *x, *y; + + assert(j); + + /* Try to find the newest open journal file for the two boot ids */ + if (journal_file_find_newest_for_boot_id(j, a, &x) < 0 || + journal_file_find_newest_for_boot_id(j, b, &y) < 0) + return 0; + + /* Only compare the boot id timestamps if they originate from the same machine. If they are from + * different machines, then the timestamps of the boot ids might be as off as the timestamps on the + * entries and hence not useful for comparing. 
*/ + if (!sd_id128_equal(x->newest_machine_id, y->newest_machine_id)) + return 0; + + return CMP(x->newest_realtime_usec, y->newest_realtime_usec); +} + +static int compare_with_location( + sd_journal *j, + const JournalFile *f, + const Location *l, + const JournalFile *current_file) { + int r; + + assert(j); + assert(f); + assert(l); + assert(f->location_type == LOCATION_SEEK); + assert(IN_SET(l->type, LOCATION_DISCRETE, LOCATION_SEEK)); + + if (l->monotonic_set && + sd_id128_equal(f->current_boot_id, l->boot_id) && + l->realtime_set && + f->current_realtime == l->realtime && + l->xor_hash_set && + f->current_xor_hash == l->xor_hash && + l->seqnum_set && + sd_id128_equal(f->header->seqnum_id, l->seqnum_id) && + f->current_seqnum == l->seqnum && + f != current_file) + return 0; + + if (l->seqnum_set && + sd_id128_equal(f->header->seqnum_id, l->seqnum_id)) { + r = CMP(f->current_seqnum, l->seqnum); + if (r != 0) + return r; + } + + if (l->monotonic_set) { + /* If both arguments have the same boot ID, then we can compare the monotonic timestamps. If + * they are distinct, then we might be able to look up the timestamps of those boot IDs (if they + * are from the same machine) and order by that. 
*/ + if (sd_id128_equal(f->current_boot_id, l->boot_id)) + r = CMP(f->current_monotonic, l->monotonic); + else + r = compare_boot_ids(j, f->current_boot_id, l->boot_id); + if (r != 0) + return r; + } + + if (l->realtime_set) { + r = CMP(f->current_realtime, l->realtime); + if (r != 0) + return r; + } + + if (l->xor_hash_set) { + r = CMP(f->current_xor_hash, l->xor_hash); + if (r != 0) + return r; + } + + return 0; +} + +static int next_for_match( + sd_journal *j, + Match *m, + JournalFile *f, + uint64_t after_offset, + direction_t direction, + Object **ret, + uint64_t *offset) { + + int r; + uint64_t np = 0; + + assert(j); + assert(m); + assert(f); + + if (m->type == MATCH_DISCRETE) { + Object *d; + uint64_t hash; + + /* If the keyed hash logic is used, we need to calculate the hash fresh per file. Otherwise + * we can use what we pre-calculated. */ + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + hash = journal_file_hash_data(f, m->data, m->size); + else + hash = m->hash; + + r = journal_file_find_data_object_with_hash(f, m->data, m->size, hash, &d, NULL); + if (r <= 0) + return r; + + return journal_file_move_to_entry_by_offset_for_data(f, d, after_offset, direction, ret, offset); + + } else if (m->type == MATCH_OR_TERM) { + + /* Find the earliest match beyond after_offset */ + + LIST_FOREACH(matches, i, m->matches) { + uint64_t cp; + + r = next_for_match(j, i, f, after_offset, direction, NULL, &cp); + if (r < 0) + return r; + else if (r > 0) { + if (np == 0 || (direction == DIRECTION_DOWN ? cp < np : cp > np)) + np = cp; + } + } + + if (np == 0) + return 0; + + } else if (m->type == MATCH_AND_TERM) { + Match *last_moved; + + /* Always jump to the next matching entry and repeat + * this until we find an offset that matches for all + * matches. */ + + if (!m->matches) + return 0; + + r = next_for_match(j, m->matches, f, after_offset, direction, NULL, &np); + if (r <= 0) + return r; + + assert(direction == DIRECTION_DOWN ? 
np >= after_offset : np <= after_offset); + last_moved = m->matches; + + LIST_LOOP_BUT_ONE(matches, i, m->matches, last_moved) { + uint64_t cp; + + r = next_for_match(j, i, f, np, direction, NULL, &cp); + if (r <= 0) + return r; + + assert(direction == DIRECTION_DOWN ? cp >= np : cp <= np); + if (direction == DIRECTION_DOWN ? cp > np : cp < np) { + np = cp; + last_moved = i; + } + } + } + + assert(np > 0); + + if (ret) { + r = journal_file_move_to_object(f, OBJECT_ENTRY, np, ret); + if (r < 0) + return r; + } + + if (offset) + *offset = np; + + return 1; +} + +static int find_location_for_match( + sd_journal *j, + Match *m, + JournalFile *f, + direction_t direction, + Object **ret, + uint64_t *offset) { + + int r; + + assert(j); + assert(m); + assert(f); + + if (m->type == MATCH_DISCRETE) { + Object *d; + uint64_t dp, hash; + + if (JOURNAL_HEADER_KEYED_HASH(f->header)) + hash = journal_file_hash_data(f, m->data, m->size); + else + hash = m->hash; + + r = journal_file_find_data_object_with_hash(f, m->data, m->size, hash, &d, &dp); + if (r <= 0) + return r; + + /* FIXME: missing: find by monotonic */ + + if (j->current_location.type == LOCATION_HEAD) + return direction == DIRECTION_DOWN ? journal_file_move_to_entry_for_data(f, d, DIRECTION_DOWN, ret, offset) : 0; + if (j->current_location.type == LOCATION_TAIL) + return direction == DIRECTION_UP ? journal_file_move_to_entry_for_data(f, d, DIRECTION_UP, ret, offset) : 0; + if (j->current_location.seqnum_set && sd_id128_equal(j->current_location.seqnum_id, f->header->seqnum_id)) + return journal_file_move_to_entry_by_seqnum_for_data(f, d, j->current_location.seqnum, direction, ret, offset); + if (j->current_location.monotonic_set) { + r = journal_file_move_to_entry_by_monotonic_for_data(f, d, j->current_location.boot_id, j->current_location.monotonic, direction, ret, offset); + if (r != 0) + return r; + + /* The data object might have been invalidated. 
*/ + r = journal_file_move_to_object(f, OBJECT_DATA, dp, &d); + if (r < 0) + return r; + } + if (j->current_location.realtime_set) + return journal_file_move_to_entry_by_realtime_for_data(f, d, j->current_location.realtime, direction, ret, offset); + + return journal_file_move_to_entry_for_data(f, d, direction, ret, offset); + + } else if (m->type == MATCH_OR_TERM) { + uint64_t np = 0; + + /* Find the earliest match */ + + LIST_FOREACH(matches, i, m->matches) { + uint64_t cp; + + r = find_location_for_match(j, i, f, direction, NULL, &cp); + if (r < 0) + return r; + else if (r > 0) { + if (np == 0 || (direction == DIRECTION_DOWN ? np > cp : np < cp)) + np = cp; + } + } + + if (np == 0) + return 0; + + if (ret) { + r = journal_file_move_to_object(f, OBJECT_ENTRY, np, ret); + if (r < 0) + return r; + } + + if (offset) + *offset = np; + + return 1; + + } else { + uint64_t np = 0; + + assert(m->type == MATCH_AND_TERM); + + /* First jump to the last match, and then find the + * next one where all matches match */ + + if (!m->matches) + return 0; + + LIST_FOREACH(matches, i, m->matches) { + uint64_t cp; + + r = find_location_for_match(j, i, f, direction, NULL, &cp); + if (r <= 0) + return r; + + if (np == 0 || (direction == DIRECTION_DOWN ? cp > np : cp < np)) + np = cp; + } + + return next_for_match(j, m, f, np, direction, ret, offset); + } +} + +static int find_location_with_matches( + sd_journal *j, + JournalFile *f, + direction_t direction, + Object **ret, + uint64_t *offset) { + + int r; + + assert(j); + assert(f); + assert(ret); + assert(offset); + + if (!j->level0) { + /* No matches is simple */ + + if (j->current_location.type == LOCATION_HEAD) + return direction == DIRECTION_DOWN ? journal_file_next_entry(f, 0, DIRECTION_DOWN, ret, offset) : 0; + if (j->current_location.type == LOCATION_TAIL) + return direction == DIRECTION_UP ? 
journal_file_next_entry(f, 0, DIRECTION_UP, ret, offset) : 0; + if (j->current_location.seqnum_set && sd_id128_equal(j->current_location.seqnum_id, f->header->seqnum_id)) + return journal_file_move_to_entry_by_seqnum(f, j->current_location.seqnum, direction, ret, offset); + if (j->current_location.monotonic_set) { + r = journal_file_move_to_entry_by_monotonic(f, j->current_location.boot_id, j->current_location.monotonic, direction, ret, offset); + if (r != 0) + return r; + } + if (j->current_location.realtime_set) + return journal_file_move_to_entry_by_realtime(f, j->current_location.realtime, direction, ret, offset); + + return journal_file_next_entry(f, 0, direction, ret, offset); + } else + return find_location_for_match(j, j->level0, f, direction, ret, offset); +} + +static int next_with_matches( + sd_journal *j, + JournalFile *f, + direction_t direction, + Object **ret, + uint64_t *offset) { + + assert(j); + assert(f); + assert(ret); + assert(offset); + + /* No matches is easy. We simply advance the file + * pointer by one. */ + if (!j->level0) + return journal_file_next_entry(f, f->current_offset, direction, ret, offset); + + /* If we have a match then we look for the next matching entry + * with an offset at least one step further in the requested direction */ + return next_for_match(j, j->level0, f, + direction == DIRECTION_DOWN ? f->current_offset + 1 + : f->current_offset - 1, + direction, ret, offset); +} + +static int next_beyond_location(sd_journal *j, JournalFile *f, direction_t direction) { + Object *c; + uint64_t cp, n_entries; + int r; + + assert(j); + assert(f); + + (void) journal_file_read_tail_timestamp(j, f); + + n_entries = le64toh(f->header->n_entries); + + /* If we hit EOF before, we don't need to look into this file again + * unless direction changed or new entries appeared. */ + if (f->last_direction == direction && + f->location_type == (direction == DIRECTION_DOWN ? 
LOCATION_TAIL : LOCATION_HEAD) && + n_entries == f->last_n_entries) + return 0; + + f->last_n_entries = n_entries; + + if (f->last_direction == direction && f->current_offset > 0) { + /* LOCATION_SEEK here means we did the work in a previous + * iteration and the current location already points to a + * candidate entry. */ + if (f->location_type != LOCATION_SEEK) { + r = next_with_matches(j, f, direction, &c, &cp); + if (r <= 0) + return r; + + journal_file_save_location(f, c, cp); + } + } else { + f->last_direction = direction; + + r = find_location_with_matches(j, f, direction, &c, &cp); + if (r <= 0) + return r; + + journal_file_save_location(f, c, cp); + } + + /* OK, we found the spot, now let's advance until an entry + * that is actually different from what we were previously + * looking at. This is necessary to handle entries which exist + * in two (or more) journal files, and which shall all be + * suppressed but one. */ + + for (;;) { + bool found; + + if (j->current_location.type == LOCATION_DISCRETE) { + int k; + + k = compare_with_location(j, f, &j->current_location, j->current_file); + + found = direction == DIRECTION_DOWN ? k > 0 : k < 0; + } else + found = true; + + if (found) + return 1; + + r = next_with_matches(j, f, direction, &c, &cp); + if (r <= 0) + return r; + + journal_file_save_location(f, c, cp); + } +} + +static int compare_locations(sd_journal *j, JournalFile *af, JournalFile *bf) { + int r; + + assert(j); + assert(af); + assert(af->header); + assert(bf); + assert(bf->header); + assert(af->location_type == LOCATION_SEEK); + assert(bf->location_type == LOCATION_SEEK); + + /* If contents, timestamps and seqnum match, these entries are identical. 
*/ + if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) && + af->current_monotonic == bf->current_monotonic && + af->current_realtime == bf->current_realtime && + af->current_xor_hash == bf->current_xor_hash && + sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id) && + af->current_seqnum == bf->current_seqnum) + return 0; + + if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) { + /* If this is from the same seqnum source, compare seqnums */ + r = CMP(af->current_seqnum, bf->current_seqnum); + if (r != 0) + return r; + + /* Wow! This is weird, different data but the same seqnums? Something is borked, but let's + * make the best of it and compare by time. */ + } + + if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) + /* If the boot id matches, compare monotonic time */ + r = CMP(af->current_monotonic, bf->current_monotonic); + else + /* If they don't match try to compare boot IDs */ + r = compare_boot_ids(j, af->current_boot_id, bf->current_boot_id); + if (r != 0) + return r; + + /* Otherwise, compare UTC time */ + r = CMP(af->current_realtime, bf->current_realtime); + if (r != 0) + return r; + + /* Finally, compare by contents */ + return CMP(af->current_xor_hash, bf->current_xor_hash); +} + +static int real_journal_next(sd_journal *j, direction_t direction) { + JournalFile *new_file = NULL; + unsigned n_files; + const void **files; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + r = iterated_cache_get(j->files_cache, NULL, &files, &n_files); + if (r < 0) + return r; + + for (unsigned i = 0; i < n_files; i++) { + JournalFile *f = (JournalFile *)files[i]; + bool found; + + r = next_beyond_location(j, f, direction); + if (r < 0) { + log_debug_errno(r, "Can't iterate through %s, ignoring: %m", f->path); + remove_file_real(j, f); + continue; + } else if (r == 0) { + f->location_type = direction == DIRECTION_DOWN ? 
LOCATION_TAIL : LOCATION_HEAD; + continue; + } + + if (!new_file) + found = true; + else { + int k; + + k = compare_locations(j, f, new_file); + + found = direction == DIRECTION_DOWN ? k < 0 : k > 0; + } + + if (found) + new_file = f; + } + + if (!new_file) + return 0; + + r = journal_file_move_to_object(new_file, OBJECT_ENTRY, new_file->current_offset, &o); + if (r < 0) + return r; + + set_location(j, new_file, o); + + return 1; +} + +_public_ int sd_journal_next(sd_journal *j) { + return real_journal_next(j, DIRECTION_DOWN); +} + +_public_ int sd_journal_previous(sd_journal *j) { + return real_journal_next(j, DIRECTION_UP); +} + +_public_ int sd_journal_step_one(sd_journal *j, int advanced) { + assert_return(j, -EINVAL); + + if (j->current_location.type == LOCATION_HEAD) + return sd_journal_next(j); + if (j->current_location.type == LOCATION_TAIL) + return sd_journal_previous(j); + return real_journal_next(j, advanced ? DIRECTION_DOWN : DIRECTION_UP); +} + +static int real_journal_next_skip(sd_journal *j, direction_t direction, uint64_t skip) { + int c = 0, r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(skip <= INT_MAX, -ERANGE); + + if (skip == 0) { + /* If this is not a discrete skip, then at least + * resolve the current location */ + if (j->current_location.type != LOCATION_DISCRETE) { + r = real_journal_next(j, direction); + if (r < 0) + return r; + } + + return 0; + } + + do { + r = real_journal_next(j, direction); + if (r < 0) + return r; + + if (r == 0) + return c; + + skip--; + c++; + } while (skip > 0); + + return c; +} + +_public_ int sd_journal_next_skip(sd_journal *j, uint64_t skip) { + return real_journal_next_skip(j, DIRECTION_DOWN, skip); +} + +_public_ int sd_journal_previous_skip(sd_journal *j, uint64_t skip) { + return real_journal_next_skip(j, DIRECTION_UP, skip); +} + +_public_ int sd_journal_get_cursor(sd_journal *j, char **cursor) { + Object *o; + int r; + + assert_return(j, -EINVAL); 
+ assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(cursor, -EINVAL); + + if (!j->current_file || j->current_file->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(j->current_file, OBJECT_ENTRY, j->current_file->current_offset, &o); + if (r < 0) + return r; + + if (asprintf(cursor, + "s=%s;i=%"PRIx64";b=%s;m=%"PRIx64";t=%"PRIx64";x=%"PRIx64, + SD_ID128_TO_STRING(j->current_file->header->seqnum_id), le64toh(o->entry.seqnum), + SD_ID128_TO_STRING(o->entry.boot_id), le64toh(o->entry.monotonic), + le64toh(o->entry.realtime), + le64toh(o->entry.xor_hash)) < 0) + return -ENOMEM; + + return 0; +} + +_public_ int sd_journal_seek_cursor(sd_journal *j, const char *cursor) { + unsigned long long seqnum, monotonic, realtime, xor_hash; + bool seqnum_id_set = false, + seqnum_set = false, + boot_id_set = false, + monotonic_set = false, + realtime_set = false, + xor_hash_set = false; + sd_id128_t seqnum_id, boot_id; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(!isempty(cursor), -EINVAL); + + for (const char *p = cursor;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ";", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + if (word[0] == '\0' || word[1] != '=') + return -EINVAL; + + switch (word[0]) { + case 's': + seqnum_id_set = true; + r = sd_id128_from_string(word + 2, &seqnum_id); + if (r < 0) + return r; + break; + + case 'i': + seqnum_set = true; + if (sscanf(word + 2, "%llx", &seqnum) != 1) + return -EINVAL; + break; + + case 'b': + boot_id_set = true; + r = sd_id128_from_string(word + 2, &boot_id); + if (r < 0) + return r; + break; + + case 'm': + monotonic_set = true; + if (sscanf(word + 2, "%llx", &monotonic) != 1) + return -EINVAL; + break; + + case 't': + realtime_set = true; + if (sscanf(word + 2, "%llx", &realtime) != 1) + return -EINVAL; + break; + + case 'x': + xor_hash_set = true; + if 
(sscanf(word + 2, "%llx", &xor_hash) != 1) + return -EINVAL; + break; + } + } + + if ((!seqnum_set || !seqnum_id_set) && + (!monotonic_set || !boot_id_set) && + !realtime_set) + return -EINVAL; + + detach_location(j); + j->current_location = (Location) { + .type = LOCATION_SEEK, + }; + + if (realtime_set) { + j->current_location.realtime = (uint64_t) realtime; + j->current_location.realtime_set = true; + } + + if (seqnum_set && seqnum_id_set) { + j->current_location.seqnum = (uint64_t) seqnum; + j->current_location.seqnum_id = seqnum_id; + j->current_location.seqnum_set = true; + } + + if (monotonic_set && boot_id_set) { + j->current_location.monotonic = (uint64_t) monotonic; + j->current_location.boot_id = boot_id; + j->current_location.monotonic_set = true; + } + + if (xor_hash_set) { + j->current_location.xor_hash = (uint64_t) xor_hash; + j->current_location.xor_hash_set = true; + } + + return 0; +} + +_public_ int sd_journal_test_cursor(sd_journal *j, const char *cursor) { + int r; + Object *o; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(!isempty(cursor), -EINVAL); + + if (!j->current_file || j->current_file->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(j->current_file, OBJECT_ENTRY, j->current_file->current_offset, &o); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *item = NULL; + unsigned long long ll; + sd_id128_t id; + int k = 0; + + r = extract_first_word(&cursor, &item, ";", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + + if (r == 0) + break; + + if (strlen(item) < 2 || item[1] != '=') + return -EINVAL; + + switch (item[0]) { + + case 's': + k = sd_id128_from_string(item+2, &id); + if (k < 0) + return k; + if (!sd_id128_equal(id, j->current_file->header->seqnum_id)) + return 0; + break; + + case 'i': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.seqnum)) + return 0; + break; + + case 'b': + 
k = sd_id128_from_string(item+2, &id); + if (k < 0) + return k; + if (!sd_id128_equal(id, o->entry.boot_id)) + return 0; + break; + + case 'm': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.monotonic)) + return 0; + break; + + case 't': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.realtime)) + return 0; + break; + + case 'x': + if (sscanf(item+2, "%llx", &ll) != 1) + return -EINVAL; + if (ll != le64toh(o->entry.xor_hash)) + return 0; + break; + } + } + + return 1; +} + +_public_ int sd_journal_seek_monotonic_usec(sd_journal *j, sd_id128_t boot_id, uint64_t usec) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_SEEK, + .boot_id = boot_id, + .monotonic = usec, + .monotonic_set = true, + }; + + return 0; +} + +_public_ int sd_journal_seek_realtime_usec(sd_journal *j, uint64_t usec) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_SEEK, + .realtime = usec, + .realtime_set = true, + }; + + return 0; +} + +_public_ int sd_journal_seek_head(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_HEAD, + }; + + return 0; +} + +_public_ int sd_journal_seek_tail(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + detach_location(j); + + j->current_location = (Location) { + .type = LOCATION_TAIL, + }; + + return 0; +} + +static void check_network(sd_journal *j, int fd) { + assert(j); + + if (j->on_network) + return; + + j->on_network = fd_is_network_fs(fd); +} + +static bool file_has_type_prefix(const char *prefix, const char *filename) { + const char *full, *tilded, *atted; + + full = 
strjoina(prefix, ".journal"); + tilded = strjoina(full, "~"); + atted = strjoina(prefix, "@"); + + return STR_IN_SET(filename, full, tilded) || + startswith(filename, atted); +} + +static bool file_type_wanted(int flags, const char *filename) { + assert(filename); + + if (!ENDSWITH_SET(filename, ".journal", ".journal~")) + return false; + + /* no flags set → every type is OK */ + if (!(flags & (SD_JOURNAL_SYSTEM | SD_JOURNAL_CURRENT_USER))) + return true; + + if (FLAGS_SET(flags, SD_JOURNAL_CURRENT_USER)) { + char prefix[5 + DECIMAL_STR_MAX(uid_t) + 1]; + + xsprintf(prefix, "user-" UID_FMT, getuid()); + + if (file_has_type_prefix(prefix, filename)) + return true; + + /* If SD_JOURNAL_CURRENT_USER is specified and we are invoked under a system UID, then + * automatically enable SD_JOURNAL_SYSTEM too, because journald will actually put system user + * data into the system journal. */ + + if (uid_for_system_journal(getuid())) + flags |= SD_JOURNAL_SYSTEM; + } + + if (FLAGS_SET(flags, SD_JOURNAL_SYSTEM) && file_has_type_prefix("system", filename)) + return true; + + return false; +} + +static bool path_has_prefix(sd_journal *j, const char *path, const char *prefix) { + assert(j); + assert(path); + assert(prefix); + + if (j->toplevel_fd >= 0) + return false; + + return path_startswith(path, prefix); +} + +static void track_file_disposition(sd_journal *j, JournalFile *f) { + assert(j); + assert(f); + + if (!j->has_runtime_files && path_has_prefix(j, f->path, "/run")) + j->has_runtime_files = true; + else if (!j->has_persistent_files && path_has_prefix(j, f->path, "/var")) + j->has_persistent_files = true; +} + +static const char *skip_slash(const char *p) { + + if (!p) + return NULL; + + while (*p == '/') + p++; + + return p; +} + +static int add_any_file( + sd_journal *j, + int fd, + const char *path) { + + _cleanup_close_ int our_fd = -EBADF; + JournalFile *f; + struct stat st; + int r; + + assert(j); + assert(fd >= 0 || path); + + if (fd < 0) { + assert(path); /* For 
gcc. */ + if (j->toplevel_fd >= 0) + /* If there's a top-level fd defined make the path relative, explicitly, since otherwise + * openat() ignores the first argument. */ + + fd = our_fd = openat(j->toplevel_fd, skip_slash(path), O_RDONLY|O_CLOEXEC|O_NONBLOCK); + else + fd = our_fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) { + r = log_debug_errno(errno, "Failed to open journal file %s: %m", path); + goto error; + } + + r = fd_nonblock(fd, false); + if (r < 0) { + r = log_debug_errno(errno, "Failed to turn off O_NONBLOCK for %s: %m", path); + goto error; + } + } + + if (fstat(fd, &st) < 0) { + r = log_debug_errno(errno, "Failed to fstat %s: %m", path ?: "fd"); + goto error; + } + + r = stat_verify_regular(&st); + if (r < 0) { + log_debug_errno(r, "Refusing to open %s: %m", path ?: "fd"); + goto error; + } + + if (path) { + f = ordered_hashmap_get(j->files, path); + if (f) { + if (stat_inode_same(&f->last_stat, &st)) { + /* We already track this file, under the same path and with the same + * device/inode numbers, it's hence really the same. Mark this file as seen + * in this generation. This is used to GC old files in process_q_overflow() + * to detect journal files that are still there and discern them from those + * which are gone. */ + + f->last_seen_generation = j->generation; + (void) journal_file_read_tail_timestamp(j, f); + return 0; + } + + /* So we tracked a file under this name, but it has a different inode/device. In that + * case, it got replaced (probably due to rotation?), let's drop it hence from our + * list. 
*/ + remove_file_real(j, f); + f = NULL; + } + } + + if (ordered_hashmap_size(j->files) >= JOURNAL_FILES_MAX) { + r = log_debug_errno(SYNTHETIC_ERRNO(ETOOMANYREFS), + "Too many open journal files, not adding %s.", path ?: "fd"); + goto error; + } + + r = journal_file_open(fd, path, O_RDONLY, 0, 0, 0, NULL, j->mmap, NULL, &f); + if (r < 0) { + log_debug_errno(r, "Failed to open journal file %s: %m", path ?: "from fd"); + goto error; + } + + /* journal_file_dump(f); */ + + /* journal_file_open() generates an replacement fname if necessary, so we can use f->path. */ + r = ordered_hashmap_put(j->files, f->path, f); + if (r < 0) { + f->close_fd = false; /* Make sure journal_file_close() doesn't close the caller's fd + * (or our own). The caller or we will do that ourselves. */ + (void) journal_file_close(f); + goto error; + } + + TAKE_FD(our_fd); /* the fd is now owned by the JournalFile object */ + + f->last_seen_generation = j->generation; + + track_file_disposition(j, f); + check_network(j, f->fd); + (void) journal_file_read_tail_timestamp(j, f); + + j->current_invalidate_counter++; + + log_debug("File %s added.", f->path); + + return 0; + +error: + (void) journal_put_error(j, r, path); /* path==NULL is OK. 
*/ + return r; +} + +static int add_file_by_name( + sd_journal *j, + const char *prefix, + const char *filename) { + + _cleanup_free_ char *path = NULL; + + assert(j); + assert(prefix); + assert(filename); + + if (j->no_new_files) + return 0; + + if (!file_type_wanted(j->flags, filename)) + return 0; + + path = path_join(prefix, filename); + if (!path) + return -ENOMEM; + + return add_any_file(j, -1, path); +} + +static int remove_file_by_name( + sd_journal *j, + const char *prefix, + const char *filename) { + + _cleanup_free_ char *path = NULL; + JournalFile *f; + + assert(j); + assert(prefix); + assert(filename); + + path = path_join(prefix, filename); + if (!path) + return -ENOMEM; + + f = ordered_hashmap_get(j->files, path); + if (!f) + return 0; + + remove_file_real(j, f); + return 1; +} + +static void remove_file_real(sd_journal *j, JournalFile *f) { + assert(j); + assert(f); + + (void) ordered_hashmap_remove(j->files, f->path); + + log_debug("File %s removed.", f->path); + + if (j->current_file == f) { + j->current_file = NULL; + j->current_field = 0; + } + + if (j->unique_file == f) { + /* Jump to the next unique_file or NULL if that one was last */ + j->unique_file = ordered_hashmap_next(j->files, j->unique_file->path); + j->unique_offset = 0; + if (!j->unique_file) + j->unique_file_lost = true; + } + + if (j->fields_file == f) { + j->fields_file = ordered_hashmap_next(j->files, j->fields_file->path); + j->fields_offset = 0; + if (!j->fields_file) + j->fields_file_lost = true; + } + + journal_file_unlink_newest_by_boot_id(j, f); + (void) journal_file_close(f); + + j->current_invalidate_counter++; +} + +static int dirname_is_machine_id(const char *fn) { + sd_id128_t id, machine; + const char *e; + int r; + + /* Returns true if the specified directory name matches the local machine ID */ + + r = sd_id128_get_machine(&machine); + if (r < 0) + return r; + + e = strchr(fn, '.'); + if (e) { + const char *k; + + /* Looks like it has a namespace suffix. 
Verify that. */ + if (!log_namespace_name_valid(e + 1)) + return false; + + k = strndupa_safe(fn, e - fn); + r = sd_id128_from_string(k, &id); + } else + r = sd_id128_from_string(fn, &id); + if (r < 0) + return r; + + return sd_id128_equal(id, machine); +} + +static int dirname_has_namespace(const char *fn, const char *namespace) { + const char *e; + + /* Returns true if the specified directory name matches the specified namespace */ + + e = strchr(fn, '.'); + if (e) { + const char *k; + + if (!namespace) + return false; + + if (!streq(e + 1, namespace)) + return false; + + k = strndupa_safe(fn, e - fn); + return id128_is_valid(k); + } + + if (namespace) + return false; + + return id128_is_valid(fn); +} + +static bool dirent_is_journal_file(const struct dirent *de) { + assert(de); + + /* Returns true if the specified directory entry looks like a journal file we might be interested in */ + + if (!IN_SET(de->d_type, DT_REG, DT_LNK, DT_UNKNOWN)) + return false; + + return endswith(de->d_name, ".journal") || + endswith(de->d_name, ".journal~"); +} + +static bool dirent_is_journal_subdir(const struct dirent *de) { + const char *e, *n; + assert(de); + + /* returns true if the specified directory entry looks like a directory that might contain journal + * files we might be interested in, i.e. is either a 128-bit ID or a 128-bit ID suffixed by a + * namespace. */ + + if (!IN_SET(de->d_type, DT_DIR, DT_LNK, DT_UNKNOWN)) + return false; + + e = strchr(de->d_name, '.'); + if (!e) + return id128_is_valid(de->d_name); /* No namespace */ + + n = strndupa_safe(de->d_name, e - de->d_name); + if (!id128_is_valid(n)) + return false; + + return log_namespace_name_valid(e + 1); +} + +static int directory_open(sd_journal *j, const char *path, DIR **ret) { + DIR *d; + + assert(j); + assert(path); + assert(ret); + + if (j->toplevel_fd < 0) + d = opendir(path); + else + /* Open the specified directory relative to the toplevel fd. 
Enforce that the path specified is + * relative, by dropping the initial slash */ + d = xopendirat(j->toplevel_fd, skip_slash(path), 0); + if (!d) + return -errno; + + *ret = d; + return 0; +} + +static int add_directory(sd_journal *j, const char *prefix, const char *dirname); + +static void directory_enumerate(sd_journal *j, Directory *m, DIR *d) { + assert(j); + assert(m); + assert(d); + + FOREACH_DIRENT_ALL(de, d, goto fail) { + if (dirent_is_journal_file(de)) + (void) add_file_by_name(j, m->path, de->d_name); + + if (m->is_root && dirent_is_journal_subdir(de)) + (void) add_directory(j, m->path, de->d_name); + } + + return; +fail: + log_debug_errno(errno, "Failed to enumerate directory %s, ignoring: %m", m->path); +} + +static void directory_watch(sd_journal *j, Directory *m, int fd, uint32_t mask) { + int r; + + assert(j); + assert(m); + assert(fd >= 0); + + /* Watch this directory if that's enabled and if it not being watched yet. */ + + if (m->wd > 0) /* Already have a watch? */ + return; + if (j->inotify_fd < 0) /* Not watching at all? */ + return; + + m->wd = inotify_add_watch_fd(j->inotify_fd, fd, mask); + if (m->wd < 0) { + log_debug_errno(errno, "Failed to watch journal directory '%s', ignoring: %m", m->path); + return; + } + + r = hashmap_put(j->directories_by_wd, INT_TO_PTR(m->wd), m); + if (r == -EEXIST) + log_debug_errno(r, "Directory '%s' already being watched under a different path, ignoring: %m", m->path); + if (r < 0) { + log_debug_errno(r, "Failed to add watch for journal directory '%s' to hashmap, ignoring: %m", m->path); + (void) inotify_rm_watch(j->inotify_fd, m->wd); + m->wd = -1; + } +} + +static int add_directory( + sd_journal *j, + const char *prefix, + const char *dirname) { + + _cleanup_free_ char *path = NULL; + _cleanup_closedir_ DIR *d = NULL; + Directory *m; + int r, k; + + assert(j); + assert(prefix); + + /* Adds a journal file directory to watch. 
If the directory is already tracked this updates the inotify watch + * and reenumerates directory contents */ + + path = path_join(prefix, dirname); + if (!path) { + r = -ENOMEM; + goto fail; + } + + log_debug("Considering directory '%s'.", path); + + /* We consider everything local that is in a directory for the local machine ID, or that is stored in /run */ + if ((j->flags & SD_JOURNAL_LOCAL_ONLY) && + !((dirname && dirname_is_machine_id(dirname) > 0) || path_has_prefix(j, path, "/run"))) + return 0; + + if (dirname && + (!(FLAGS_SET(j->flags, SD_JOURNAL_ALL_NAMESPACES) || + dirname_has_namespace(dirname, j->namespace) > 0 || + (FLAGS_SET(j->flags, SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE) && dirname_has_namespace(dirname, NULL) > 0)))) + return 0; + + r = directory_open(j, path, &d); + if (r < 0) { + log_debug_errno(r, "Failed to open directory '%s': %m", path); + goto fail; + } + + m = hashmap_get(j->directories_by_path, path); + if (!m) { + m = new(Directory, 1); + if (!m) { + r = -ENOMEM; + goto fail; + } + + *m = (Directory) { + .is_root = false, + .path = path, + }; + + if (hashmap_put(j->directories_by_path, m->path, m) < 0) { + free(m); + r = -ENOMEM; + goto fail; + } + + path = NULL; /* avoid freeing in cleanup */ + j->current_invalidate_counter++; + + log_debug("Directory %s added.", m->path); + + } else if (m->is_root) + return 0; /* Don't 'downgrade' from root directory */ + + m->last_seen_generation = j->generation; + + directory_watch(j, m, dirfd(d), + IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB|IN_DELETE| + IN_DELETE_SELF|IN_MOVE_SELF|IN_UNMOUNT|IN_MOVED_FROM| + IN_ONLYDIR); + + if (!j->no_new_files) + directory_enumerate(j, m, d); + + check_network(j, dirfd(d)); + + return 0; + +fail: + k = journal_put_error(j, r, path ?: prefix); + if (k < 0) + return k; + + return r; +} + +static int add_root_directory(sd_journal *j, const char *p, bool missing_ok) { + + _cleanup_closedir_ DIR *d = NULL; + Directory *m; + int r, k; + + assert(j); + + /* Adds a root 
directory to our set of directories to use. If the root directory is already in the set, we + * update the inotify logic, and renumerate the directory entries. This call may hence be called to initially + * populate the set, as well as to update it later. */ + + if (p) { + /* If there's a path specified, use it. */ + + log_debug("Considering root directory '%s'.", p); + + if ((j->flags & SD_JOURNAL_RUNTIME_ONLY) && + !path_has_prefix(j, p, "/run")) + return -EINVAL; + + if (j->prefix) + p = strjoina(j->prefix, p); + + r = directory_open(j, p, &d); + if (r == -ENOENT && missing_ok) + return 0; + if (r < 0) { + log_debug_errno(r, "Failed to open root directory %s: %m", p); + goto fail; + } + } else { + _cleanup_close_ int dfd = -EBADF; + + /* If there's no path specified, then we use the top-level fd itself. We duplicate the fd here, since + * opendir() will take possession of the fd, and close it, which we don't want. */ + + p = "."; /* store this as "." in the directories hashmap */ + + dfd = fcntl(j->toplevel_fd, F_DUPFD_CLOEXEC, 3); + if (dfd < 0) { + r = -errno; + goto fail; + } + + d = take_fdopendir(&dfd); + if (!d) { + r = -errno; + goto fail; + } + + rewinddir(d); + } + + m = hashmap_get(j->directories_by_path, p); + if (!m) { + m = new0(Directory, 1); + if (!m) { + r = -ENOMEM; + goto fail; + } + + m->is_root = true; + + m->path = strdup(p); + if (!m->path) { + free(m); + r = -ENOMEM; + goto fail; + } + + if (hashmap_put(j->directories_by_path, m->path, m) < 0) { + free(m->path); + free(m); + r = -ENOMEM; + goto fail; + } + + j->current_invalidate_counter++; + + log_debug("Root directory %s added.", m->path); + + } else if (!m->is_root) + return 0; + + directory_watch(j, m, dirfd(d), + IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB|IN_DELETE| + IN_ONLYDIR); + + if (!j->no_new_files) + directory_enumerate(j, m, d); + + check_network(j, dirfd(d)); + + return 0; + +fail: + k = journal_put_error(j, r, p); + if (k < 0) + return k; + + return r; +} + +static void 
remove_directory(sd_journal *j, Directory *d) { + assert(j); + + if (d->wd > 0) { + hashmap_remove(j->directories_by_wd, INT_TO_PTR(d->wd)); + + if (j->inotify_fd >= 0) + (void) inotify_rm_watch(j->inotify_fd, d->wd); + } + + hashmap_remove(j->directories_by_path, d->path); + + if (d->is_root) + log_debug("Root directory %s removed.", d->path); + else + log_debug("Directory %s removed.", d->path); + + free(d->path); + free(d); +} + +static int add_search_paths(sd_journal *j) { + + static const char search_paths[] = + "/run/log/journal\0" + "/var/log/journal\0"; + + assert(j); + + /* We ignore most errors here, since the idea is to only open + * what's actually accessible, and ignore the rest. */ + + NULSTR_FOREACH(p, search_paths) + (void) add_root_directory(j, p, true); + + if (!(j->flags & SD_JOURNAL_LOCAL_ONLY)) + (void) add_root_directory(j, "/var/log/journal/remote", true); + + return 0; +} + +static int add_current_paths(sd_journal *j) { + JournalFile *f; + + assert(j); + assert(j->no_new_files); + + /* Simply adds all directories for files we have open as directories. We don't expect errors here, so we + * treat them as fatal. 
*/ + + ORDERED_HASHMAP_FOREACH(f, j->files) { + _cleanup_free_ char *dir = NULL; + int r; + + r = path_extract_directory(f->path, &dir); + if (r < 0) + return r; + + r = add_directory(j, dir, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int allocate_inotify(sd_journal *j) { + assert(j); + + if (j->inotify_fd < 0) { + j->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (j->inotify_fd < 0) + return -errno; + } + + return hashmap_ensure_allocated(&j->directories_by_wd, NULL); +} + +static sd_journal *journal_new(int flags, const char *path, const char *namespace) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + + j = new(sd_journal, 1); + if (!j) + return NULL; + + *j = (sd_journal) { + .origin_id = origin_id_query(), + .toplevel_fd = -EBADF, + .inotify_fd = -EBADF, + .flags = flags, + .data_threshold = DEFAULT_DATA_THRESHOLD, + }; + + if (path) { + char *t; + + t = strdup(path); + if (!t) + return NULL; + + if (flags & SD_JOURNAL_OS_ROOT) + j->prefix = t; + else + j->path = t; + } + + if (namespace) { + j->namespace = strdup(namespace); + if (!j->namespace) + return NULL; + } + + j->files = ordered_hashmap_new(&path_hash_ops); + if (!j->files) + return NULL; + + j->files_cache = ordered_hashmap_iterated_cache_new(j->files); + j->directories_by_path = hashmap_new(&path_hash_ops); + j->mmap = mmap_cache_new(); + if (!j->files_cache || !j->directories_by_path || !j->mmap) + return NULL; + + return TAKE_PTR(j); +} + +#define OPEN_ALLOWED_FLAGS \ + (SD_JOURNAL_LOCAL_ONLY | \ + SD_JOURNAL_RUNTIME_ONLY | \ + SD_JOURNAL_SYSTEM | \ + SD_JOURNAL_CURRENT_USER | \ + SD_JOURNAL_ALL_NAMESPACES | \ + SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE) + +_public_ int sd_journal_open_namespace(sd_journal **ret, const char *namespace, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return((flags & ~OPEN_ALLOWED_FLAGS) == 0, -EINVAL); + + j = journal_new(flags, NULL, namespace); + if (!j) + return 
-ENOMEM; + + r = add_search_paths(j); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +_public_ int sd_journal_open(sd_journal **ret, int flags) { + return sd_journal_open_namespace(ret, NULL, flags); +} + +#define OPEN_CONTAINER_ALLOWED_FLAGS \ + (SD_JOURNAL_LOCAL_ONLY | SD_JOURNAL_SYSTEM) + +_public_ int sd_journal_open_container(sd_journal **ret, const char *machine, int flags) { + _cleanup_free_ char *root = NULL, *class = NULL; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + char *p; + int r; + + /* This is deprecated, people should use machined's OpenMachineRootDirectory() call instead in + * combination with sd_journal_open_directory_fd(). */ + + assert_return(machine, -EINVAL); + assert_return(ret, -EINVAL); + assert_return((flags & ~OPEN_CONTAINER_ALLOWED_FLAGS) == 0, -EINVAL); + assert_return(hostname_is_valid(machine, 0), -EINVAL); + + p = strjoina("/run/systemd/machines/", machine); + r = parse_env_file(NULL, p, + "ROOT", &root, + "CLASS", &class); + if (r == -ENOENT) + return -EHOSTDOWN; + if (r < 0) + return r; + if (!root) + return -ENODATA; + + if (!streq_ptr(class, "container")) + return -EIO; + + j = journal_new(flags, root, NULL); + if (!j) + return -ENOMEM; + + r = add_search_paths(j); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +#define OPEN_DIRECTORY_ALLOWED_FLAGS \ + (SD_JOURNAL_OS_ROOT | \ + SD_JOURNAL_SYSTEM | SD_JOURNAL_CURRENT_USER ) + +_public_ int sd_journal_open_directory(sd_journal **ret, const char *path, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(path, -EINVAL); + assert_return((flags & ~OPEN_DIRECTORY_ALLOWED_FLAGS) == 0, -EINVAL); + + j = journal_new(flags, path, NULL); + if (!j) + return -ENOMEM; + + if (flags & SD_JOURNAL_OS_ROOT) + r = add_search_paths(j); + else + r = add_root_directory(j, path, false); + if (r < 0) + return r; + + *ret = TAKE_PTR(j); + return 0; +} + +_public_ int 
sd_journal_open_files(sd_journal **ret, const char **paths, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(flags == 0, -EINVAL); + + j = journal_new(flags, NULL, NULL); + if (!j) + return -ENOMEM; + + STRV_FOREACH(path, paths) { + r = add_any_file(j, -1, *path); + if (r < 0) + return r; + } + + j->no_new_files = true; + + *ret = TAKE_PTR(j); + return 0; +} + +#define OPEN_DIRECTORY_FD_ALLOWED_FLAGS \ + (SD_JOURNAL_OS_ROOT | \ + SD_JOURNAL_SYSTEM | \ + SD_JOURNAL_CURRENT_USER | \ + SD_JOURNAL_TAKE_DIRECTORY_FD) + +_public_ int sd_journal_open_directory_fd(sd_journal **ret, int fd, int flags) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + struct stat st; + bool take_fd; + int r; + + assert_return(ret, -EINVAL); + assert_return(fd >= 0, -EBADF); + assert_return((flags & ~OPEN_DIRECTORY_FD_ALLOWED_FLAGS) == 0, -EINVAL); + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISDIR(st.st_mode)) + return -EBADFD; + + take_fd = FLAGS_SET(flags, SD_JOURNAL_TAKE_DIRECTORY_FD); + j = journal_new(flags & ~SD_JOURNAL_TAKE_DIRECTORY_FD, NULL, NULL); + if (!j) + return -ENOMEM; + + j->toplevel_fd = fd; + + if (flags & SD_JOURNAL_OS_ROOT) + r = add_search_paths(j); + else + r = add_root_directory(j, NULL, false); + if (r < 0) + return r; + + SET_FLAG(j->flags, SD_JOURNAL_TAKE_DIRECTORY_FD, take_fd); + + *ret = TAKE_PTR(j); + return 0; +} + +_public_ int sd_journal_open_files_fd(sd_journal **ret, int fds[], unsigned n_fds, int flags) { + JournalFile *f; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(n_fds > 0, -EBADF); + assert_return(flags == 0, -EINVAL); + + j = journal_new(flags, NULL, NULL); + if (!j) + return -ENOMEM; + + for (unsigned i = 0; i < n_fds; i++) { + struct stat st; + + if (fds[i] < 0) { + r = -EBADF; + goto fail; + } + + if (fstat(fds[i], &st) < 0) { + r = -errno; + goto fail; + } + + r = stat_verify_regular(&st); 
+ if (r < 0) + goto fail; + + r = add_any_file(j, fds[i], NULL); + if (r < 0) + goto fail; + } + + j->no_new_files = true; + j->no_inotify = true; + + *ret = TAKE_PTR(j); + return 0; + +fail: + /* If we fail, make sure we don't take possession of the files we managed to make use of successfully, and they + * remain open */ + ORDERED_HASHMAP_FOREACH(f, j->files) + f->close_fd = false; + + return r; +} + +_public_ void sd_journal_close(sd_journal *j) { + Directory *d; + Prioq *p; + + if (!j || journal_origin_changed(j)) + return; + + while ((p = hashmap_first(j->newest_by_boot_id))) + journal_file_unlink_newest_by_boot_id(j, prioq_peek(p)); + hashmap_free(j->newest_by_boot_id); + + sd_journal_flush_matches(j); + + ordered_hashmap_free_with_destructor(j->files, journal_file_close); + iterated_cache_free(j->files_cache); + + while ((d = hashmap_first(j->directories_by_path))) + remove_directory(j, d); + + while ((d = hashmap_first(j->directories_by_wd))) + remove_directory(j, d); + + hashmap_free(j->directories_by_path); + hashmap_free(j->directories_by_wd); + + if (FLAGS_SET(j->flags, SD_JOURNAL_TAKE_DIRECTORY_FD)) + safe_close(j->toplevel_fd); + + safe_close(j->inotify_fd); + + if (j->mmap) { + mmap_cache_stats_log_debug(j->mmap); + mmap_cache_unref(j->mmap); + } + + hashmap_free_free(j->errors); + + free(j->path); + free(j->prefix); + free(j->namespace); + free(j->unique_field); + free(j->fields_buffer); + free(j); +} + +static void journal_file_unlink_newest_by_boot_id(sd_journal *j, JournalFile *f) { + JournalFile *nf; + Prioq *p; + + assert(j); + assert(f); + + if (f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL) /* not linked currently, hence this is a NOP */ + return; + + assert_se(p = hashmap_get(j->newest_by_boot_id, &f->newest_boot_id)); + assert_se(prioq_remove(p, f, &f->newest_boot_id_prioq_idx) > 0); + + nf = prioq_peek(p); + if (nf) + /* There's still a member in the prioq? 
Then make sure the hashmap key now points to its + * .newest_boot_id field (and not ours!). Not we only replace the memory of the key here, the + * value of the key (and the data associated with it) remain the same. */ + assert_se(hashmap_replace(j->newest_by_boot_id, &nf->newest_boot_id, p) >= 0); + else { + assert_se(hashmap_remove(j->newest_by_boot_id, &f->newest_boot_id) == p); + prioq_free(p); + } + + f->newest_boot_id_prioq_idx = PRIOQ_IDX_NULL; +} + +static int journal_file_newest_monotonic_compare(const void *a, const void *b) { + const JournalFile *x = a, *y = b; + + return -CMP(x->newest_monotonic_usec, y->newest_monotonic_usec); /* Invert order, we want newest first! */ +} + +static int journal_file_reshuffle_newest_by_boot_id(sd_journal *j, JournalFile *f) { + Prioq *p; + int r; + + assert(j); + assert(f); + + p = hashmap_get(j->newest_by_boot_id, &f->newest_boot_id); + if (p) { + /* There's already a priority queue for this boot ID */ + + if (f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL) { + r = prioq_put(p, f, &f->newest_boot_id_prioq_idx); /* Insert if we aren't in there yet */ + if (r < 0) + return r; + } else + prioq_reshuffle(p, f, &f->newest_boot_id_prioq_idx); /* Reshuffle otherwise */ + + } else { + _cleanup_(prioq_freep) Prioq *q = NULL; + + /* No priority queue yet, then allocate one */ + + assert(f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL); /* we can't be a member either */ + + q = prioq_new(journal_file_newest_monotonic_compare); + if (!q) + return -ENOMEM; + + r = prioq_put(q, f, &f->newest_boot_id_prioq_idx); + if (r < 0) + return r; + + r = hashmap_ensure_put(&j->newest_by_boot_id, &id128_hash_ops, &f->newest_boot_id, q); + if (r < 0) { + f->newest_boot_id_prioq_idx = PRIOQ_IDX_NULL; + return r; + } + + TAKE_PTR(q); + } + + return 0; +} + +static int journal_file_read_tail_timestamp(sd_journal *j, JournalFile *f) { + uint64_t offset, mo, rt; + sd_id128_t id; + ObjectType type; + Object *o; + int r; + + assert(j); + assert(f); + 
assert(f->header); + + /* Tries to read the timestamp of the most recently written entry. */ + + r = journal_file_fstat(f); + if (r < 0) + return r; + if (f->newest_mtime == timespec_load(&f->last_stat.st_mtim)) + return 0; /* mtime didn't change since last time, don't bother */ + + if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_offset)) { + offset = le64toh(READ_NOW(f->header->tail_entry_offset)); + type = OBJECT_ENTRY; + } else { + offset = le64toh(READ_NOW(f->header->tail_object_offset)); + type = OBJECT_UNUSED; + } + if (offset == 0) + return -ENODATA; /* not a single object/entry, hence no tail timestamp */ + + /* Move to the last object in the journal file, in the hope it is an entry (which it usually will + * be). If we lack the "tail_entry_offset" field in the header, we specify the type as OBJECT_UNUSED + * here, since we cannot be sure what the last object will be, and want no noisy logging if it isn't + * an entry. We instead check after figuring out the pointer. */ + r = journal_file_move_to_object(f, type, offset, &o); + if (r < 0) { + log_debug_errno(r, "Failed to move to last object in journal file, ignoring: %m"); + o = NULL; + } + if (o && o->object.type == OBJECT_ENTRY) { + /* Yay, last object is an entry, let's use the data. */ + id = o->entry.boot_id; + mo = le64toh(o->entry.monotonic); + rt = le64toh(o->entry.realtime); + } else { + /* So the object is not an entry or we couldn't access it? In that case, let's read the most + * recent entry timestamps from the header. It's equally good. Unfortunately though, in old + * versions of the journal the boot ID in the header doesn't have to match the monotonic + * timestamp of the header. Let's check the header flag that indicates whether this strictly + * matches first hence, before using the data. 
*/ + + if (JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) && f->header->state == STATE_ARCHIVED) { + mo = le64toh(f->header->tail_entry_monotonic); + rt = le64toh(f->header->tail_entry_realtime); + id = f->header->tail_entry_boot_id; + } else { + /* Otherwise let's find the last entry manually (this possibly means traversing the + * chain of entry arrays, till the end */ + r = journal_file_next_entry(f, 0, DIRECTION_UP, &o, NULL); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + + id = o->entry.boot_id; + mo = le64toh(o->entry.monotonic); + rt = le64toh(o->entry.realtime); + } + } + + if (mo > rt) /* monotonic clock is further ahead than realtime? that's weird, refuse to use the data */ + return -ENODATA; + + if (!sd_id128_equal(f->newest_boot_id, id)) + journal_file_unlink_newest_by_boot_id(j, f); + + f->newest_boot_id = id; + f->newest_monotonic_usec = mo; + f->newest_realtime_usec = rt; + f->newest_machine_id = f->header->machine_id; + f->newest_mtime = timespec_load(&f->last_stat.st_mtim); + + r = journal_file_reshuffle_newest_by_boot_id(j, f); + if (r < 0) + return r; + + return 0; +} + +_public_ int sd_journal_get_realtime_usec(sd_journal *j, uint64_t *ret) { + JournalFile *f; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + uint64_t t = le64toh(o->entry.realtime); + if (!VALID_REALTIME(t)) + return -EBADMSG; + + if (ret) + *ret = t; + + return 0; +} + +_public_ int sd_journal_get_monotonic_usec(sd_journal *j, uint64_t *ret, sd_id128_t *ret_boot_id) { + JournalFile *f; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + if (f->current_offset <= 0) + return 
-EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + if (ret_boot_id) + *ret_boot_id = o->entry.boot_id; + else { + sd_id128_t id; + + r = sd_id128_get_boot(&id); + if (r < 0) + return r; + + if (!sd_id128_equal(id, o->entry.boot_id)) + return -ESTALE; + } + + uint64_t t = le64toh(o->entry.monotonic); + if (!VALID_MONOTONIC(t)) + return -EBADMSG; + + if (ret) + *ret = t; + + return 0; +} + +_public_ int sd_journal_get_seqnum( + sd_journal *j, + uint64_t *ret_seqnum, + sd_id128_t *ret_seqnum_id) { + + JournalFile *f; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + if (ret_seqnum_id) + *ret_seqnum_id = f->header->seqnum_id; + if (ret_seqnum) + *ret_seqnum = le64toh(o->entry.seqnum); + + return 0; +} + +static bool field_is_valid(const char *field) { + assert(field); + + if (isempty(field)) + return false; + + if (startswith(field, "__")) + return false; + + for (const char *p = field; *p; p++) { + + if (*p == '_') + continue; + + if (*p >= 'A' && *p <= 'Z') + continue; + + if (ascii_isdigit(*p)) + continue; + + return false; + } + + return true; +} + +_public_ int sd_journal_get_data(sd_journal *j, const char *field, const void **data, size_t *size) { + JournalFile *f; + size_t field_length; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(field, -EINVAL); + assert_return(data, -EINVAL); + assert_return(size, -EINVAL); + assert_return(field_is_valid(field), -EINVAL); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, 
f->current_offset, &o); + if (r < 0) + return r; + + field_length = strlen(field); + + uint64_t n = journal_file_entry_n_items(f, o); + for (uint64_t i = 0; i < n; i++) { + uint64_t p; + void *d; + size_t l; + + p = journal_file_entry_item_object_offset(f, o, i); + r = journal_file_data_payload(f, NULL, p, field, field_length, j->data_threshold, &d, &l); + if (r == 0) + continue; + if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) { + log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", i); + continue; + } + if (r < 0) + return r; + + *data = d; + *size = l; + + return 0; + } + + return -ENOENT; +} + +_public_ int sd_journal_enumerate_data(sd_journal *j, const void **data, size_t *size) { + JournalFile *f; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(data, -EINVAL); + assert_return(size, -EINVAL); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + for (uint64_t n = journal_file_entry_n_items(f, o); j->current_field < n; j->current_field++) { + uint64_t p; + void *d; + size_t l; + + p = journal_file_entry_item_object_offset(f, o, j->current_field); + r = journal_file_data_payload(f, NULL, p, NULL, 0, j->data_threshold, &d, &l); + if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) { + log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", j->current_field); + continue; + } + if (r < 0) + return r; + assert(r > 0); + + *data = d; + *size = l; + + j->current_field++; + + return 1; + } + + return 0; +} + +_public_ int sd_journal_enumerate_available_data(sd_journal *j, const void **data, size_t *size) { + for (;;) { + int r; + + r = sd_journal_enumerate_data(j, data, size); + if (r >= 0) + return r; + if (!JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(r)) + return r; + j->current_field++; /* Try 
with the next field */ + } +} + +_public_ void sd_journal_restart_data(sd_journal *j) { + if (!j || journal_origin_changed(j)) + return; + + j->current_field = 0; +} + +static int reiterate_all_paths(sd_journal *j) { + assert(j); + + if (j->no_new_files) + return add_current_paths(j); + + if (j->flags & SD_JOURNAL_OS_ROOT) + return add_search_paths(j); + + if (j->toplevel_fd >= 0) + return add_root_directory(j, NULL, false); + + if (j->path) + return add_root_directory(j, j->path, true); + + return add_search_paths(j); +} + +_public_ int sd_journal_get_fd(sd_journal *j) { + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + if (j->no_inotify) + return -EMEDIUMTYPE; + + if (j->inotify_fd >= 0) + return j->inotify_fd; + + r = allocate_inotify(j); + if (r < 0) + return r; + + log_debug("Reiterating files to get inotify watches established."); + + /* Iterate through all dirs again, to add them to the inotify */ + r = reiterate_all_paths(j); + if (r < 0) + return r; + + return j->inotify_fd; +} + +_public_ int sd_journal_get_events(sd_journal *j) { + int fd; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + fd = sd_journal_get_fd(j); + if (fd < 0) + return fd; + + return POLLIN; +} + +_public_ int sd_journal_get_timeout(sd_journal *j, uint64_t *timeout_usec) { + int fd; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(timeout_usec, -EINVAL); + + fd = sd_journal_get_fd(j); + if (fd < 0) + return fd; + + if (!j->on_network) { + *timeout_usec = UINT64_MAX; + return 0; + } + + /* If we are on the network we need to regularly check for + * changes manually */ + + *timeout_usec = j->last_process_usec + JOURNAL_FILES_RECHECK_USEC; + return 1; +} + +static void process_q_overflow(sd_journal *j) { + JournalFile *f; + Directory *m; + + assert(j); + + /* When the inotify queue overruns we need to enumerate and re-validate all journal files to 
bring our list + * back in sync with what's on disk. For this we pick a new generation counter value. It'll be assigned to all + * journal files we encounter. All journal files and all directories that don't carry it after reenumeration + * are subject for unloading. */ + + log_debug("Inotify queue overrun, reiterating everything."); + + j->generation++; + (void) reiterate_all_paths(j); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + + if (f->last_seen_generation == j->generation) + continue; + + log_debug("File '%s' hasn't been seen in this enumeration, removing.", f->path); + remove_file_real(j, f); + } + + HASHMAP_FOREACH(m, j->directories_by_path) { + + if (m->last_seen_generation == j->generation) + continue; + + if (m->is_root) /* Never GC root directories */ + continue; + + log_debug("Directory '%s' hasn't been seen in this enumeration, removing.", m->path); + remove_directory(j, m); + } + + log_debug("Reiteration complete."); +} + +static void process_inotify_event(sd_journal *j, const struct inotify_event *e) { + Directory *d; + + assert(j); + assert(e); + + if (e->mask & IN_Q_OVERFLOW) { + process_q_overflow(j); + return; + } + + /* Is this a subdirectory we watch?
*/ + d = hashmap_get(j->directories_by_wd, INT_TO_PTR(e->wd)); + if (d) { + if (!(e->mask & IN_ISDIR) && e->len > 0 && + (endswith(e->name, ".journal") || + endswith(e->name, ".journal~"))) { + + /* Event for a journal file */ + + if (e->mask & (IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB)) + (void) add_file_by_name(j, d->path, e->name); + else if (e->mask & (IN_DELETE|IN_MOVED_FROM|IN_UNMOUNT)) + (void) remove_file_by_name(j, d->path, e->name); + + } else if (!d->is_root && e->len == 0) { + + /* Event for a subdirectory */ + + if (e->mask & (IN_DELETE_SELF|IN_MOVE_SELF|IN_UNMOUNT)) + remove_directory(j, d); + + } else if (d->is_root && (e->mask & IN_ISDIR) && e->len > 0 && id128_is_valid(e->name)) { + + /* Event for root directory */ + + if (e->mask & (IN_CREATE|IN_MOVED_TO|IN_MODIFY|IN_ATTRIB)) + (void) add_directory(j, d->path, e->name); + } + + return; + } + + if (e->mask & IN_IGNORED) + return; + + log_debug("Unexpected inotify event."); +} + +static int determine_change(sd_journal *j) { + bool b; + + assert(j); + + b = j->current_invalidate_counter != j->last_invalidate_counter; + j->last_invalidate_counter = j->current_invalidate_counter; + + return b ? SD_JOURNAL_INVALIDATE : SD_JOURNAL_APPEND; +} + +_public_ int sd_journal_process(sd_journal *j) { + bool got_something = false; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + if (j->inotify_fd < 0) /* We have no inotify fd yet? Then there's nothing to process. */ + return 0; + + j->last_process_usec = now(CLOCK_MONOTONIC); + j->last_invalidate_counter = j->current_invalidate_counter; + + for (;;) { + union inotify_event_buffer buffer; + ssize_t l; + + l = read(j->inotify_fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return got_something ?
determine_change(j) : SD_JOURNAL_NOP; + + return -errno; + } + + got_something = true; + + FOREACH_INOTIFY_EVENT(e, buffer, l) + process_inotify_event(j, e); + } +} + +_public_ int sd_journal_wait(sd_journal *j, uint64_t timeout_usec) { + int r; + uint64_t t; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + if (j->inotify_fd < 0) { + JournalFile *f; + + /* This is the first invocation, hence create the inotify watch */ + r = sd_journal_get_fd(j); + if (r < 0) + return r; + + /* Server might have done some vacuuming while we weren't watching. Get rid of the deleted + * files now so they don't stay around indefinitely. */ + ORDERED_HASHMAP_FOREACH(f, j->files) { + r = journal_file_fstat(f); + if (r == -EIDRM) + remove_file_real(j, f); + else if (r < 0) + log_debug_errno(r, "Failed to fstat() journal file '%s', ignoring: %m", f->path); + } + + /* The journal might have changed since the context object was created and we weren't + * watching before, hence don't wait for anything, and return immediately. 
*/ + return determine_change(j); + } + + r = sd_journal_get_timeout(j, &t); + if (r < 0) + return r; + + if (t != UINT64_MAX) { + t = usec_sub_unsigned(t, now(CLOCK_MONOTONIC)); + + if (timeout_usec == UINT64_MAX || timeout_usec > t) + timeout_usec = t; + } + + do { + r = fd_wait_for_event(j->inotify_fd, POLLIN, timeout_usec); + } while (r == -EINTR); + + if (r < 0) + return r; + + return sd_journal_process(j); +} + +_public_ int sd_journal_get_cutoff_realtime_usec(sd_journal *j, uint64_t *from, uint64_t *to) { + JournalFile *f; + bool first = true; + uint64_t fmin = 0, tmax = 0; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(from || to, -EINVAL); + assert_return(from != to, -EINVAL); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + usec_t fr, t; + + r = journal_file_get_cutoff_realtime_usec(f, &fr, &t); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + if (r == 0) + continue; + + if (first) { + fmin = fr; + tmax = t; + first = false; + } else { + fmin = MIN(fr, fmin); + tmax = MAX(t, tmax); + } + } + + if (from) + *from = fmin; + if (to) + *to = tmax; + + return first ? 
0 : 1; +} + +_public_ int sd_journal_get_cutoff_monotonic_usec( + sd_journal *j, + sd_id128_t boot_id, + uint64_t *ret_from, + uint64_t *ret_to) { + + uint64_t from = UINT64_MAX, to = UINT64_MAX; + bool found = false; + JournalFile *f; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(ret_from != ret_to, -EINVAL); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + usec_t ff, tt; + + r = journal_file_get_cutoff_monotonic_usec(f, boot_id, &ff, &tt); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + if (r == 0) + continue; + + if (found) { + from = MIN(ff, from); + to = MAX(tt, to); + } else { + from = ff; + to = tt; + found = true; + } + } + + if (ret_from) + *ret_from = from; + if (ret_to) + *ret_to = to; + + return found; +} + +void journal_print_header(sd_journal *j) { + JournalFile *f; + bool newline = false; + + assert(j); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + if (newline) + putchar('\n'); + else + newline = true; + + journal_file_print_header(f); + } +} + +_public_ int sd_journal_get_usage(sd_journal *j, uint64_t *ret) { + JournalFile *f; + uint64_t sum = 0; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(ret, -EINVAL); + + ORDERED_HASHMAP_FOREACH(f, j->files) { + struct stat st; + uint64_t b; + + if (fstat(f->fd, &st) < 0) + return -errno; + + b = (uint64_t) st.st_blocks; + if (b > UINT64_MAX / 512) + return -EOVERFLOW; + b *= 512; + + if (sum > UINT64_MAX - b) + return -EOVERFLOW; + sum += b; + } + + *ret = sum; + return 0; +} + +_public_ int sd_journal_query_unique(sd_journal *j, const char *field) { + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + if (!field_is_valid(field)) + return -EINVAL; + + r = free_and_strdup(&j->unique_field, field); + if (r < 0) + return r; + + j->unique_file = NULL; + j->unique_offset = 0; + j->unique_file_lost = false; + + return 0; +} + +_public_ int 
sd_journal_enumerate_unique( + sd_journal *j, + const void **ret_data, + size_t *ret_size) { + + size_t k; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(j->unique_field, -EINVAL); + + k = strlen(j->unique_field); + + if (!j->unique_file) { + if (j->unique_file_lost) + return 0; + + j->unique_file = ordered_hashmap_first(j->files); + if (!j->unique_file) + return 0; + + j->unique_offset = 0; + } + + for (;;) { + JournalFile *of; + Object *o; + void *odata; + size_t ol; + bool found; + int r; + + /* Proceed to next data object in the field's linked list */ + if (j->unique_offset == 0) { + r = journal_file_find_field_object(j->unique_file, j->unique_field, k, &o, NULL); + if (r < 0) + return r; + + j->unique_offset = r > 0 ? le64toh(o->field.head_data_offset) : 0; + } else { + r = journal_file_move_to_object(j->unique_file, OBJECT_DATA, j->unique_offset, &o); + if (r < 0) + return r; + + j->unique_offset = le64toh(o->data.next_field_offset); + } + + /* We reached the end of the list? Then start again, with the next file */ + if (j->unique_offset == 0) { + j->unique_file = ordered_hashmap_next(j->files, j->unique_file->path); + if (!j->unique_file) + return 0; + + continue; + } + + r = journal_file_move_to_object(j->unique_file, OBJECT_DATA, j->unique_offset, &o); + if (r < 0) + return r; + + /* Let's pin the data object, so we can look at it at the same time as one on another file. */ + r = journal_file_pin_object(j->unique_file, o); + if (r < 0) + return r; + + r = journal_file_data_payload(j->unique_file, o, j->unique_offset, NULL, 0, + j->data_threshold, &odata, &ol); + if (r < 0) + return r; + + /* Check if we have at least the field name and "=". 
*/ + if (ol <= k) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object has size %zu, expected at least %zu", + j->unique_file->path, + j->unique_offset, ol, k + 1); + + if (memcmp(odata, j->unique_field, k) != 0 || ((const char*) odata)[k] != '=') + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object does not start with \"%s=\"", + j->unique_file->path, + j->unique_offset, + j->unique_field); + + /* OK, now let's see if we already returned this data object by checking if it exists in the + * earlier traversed files. */ + found = false; + ORDERED_HASHMAP_FOREACH(of, j->files) { + if (of == j->unique_file) + break; + + /* Skip this file if it didn't have any fields indexed */ + if (JOURNAL_HEADER_CONTAINS(of->header, n_fields) && le64toh(of->header->n_fields) <= 0) + continue; + + /* We can reuse the hash from our current file only on old-style journal files + * without keyed hashes. On new-style files we have to calculate the hash anew, to + * take the per-file hash seed into consideration. */ + if (!JOURNAL_HEADER_KEYED_HASH(j->unique_file->header) && !JOURNAL_HEADER_KEYED_HASH(of->header)) + r = journal_file_find_data_object_with_hash(of, odata, ol, le64toh(o->data.hash), NULL, NULL); + else + r = journal_file_find_data_object(of, odata, ol, NULL, NULL); + if (r < 0) + return r; + if (r > 0) { + found = true; + break; + } + } + + if (found) + continue; + + *ret_data = odata; + *ret_size = ol; + + return 1; + } +} + +_public_ int sd_journal_enumerate_available_unique(sd_journal *j, const void **data, size_t *size) { + for (;;) { + int r; + + r = sd_journal_enumerate_unique(j, data, size); + if (r >= 0) + return r; + if (!JOURNAL_ERRNO_IS_UNAVAILABLE_FIELD(r)) + return r; + /* Try with the next field. sd_journal_enumerate_unique() modifies state, so on the next try + * we will access the next field.
*/ + } +} + +_public_ void sd_journal_restart_unique(sd_journal *j) { + if (!j || journal_origin_changed(j)) + return; + + j->unique_file = NULL; + j->unique_offset = 0; + j->unique_file_lost = false; +} + +_public_ int sd_journal_enumerate_fields(sd_journal *j, const char **field) { + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(field, -EINVAL); + + if (!j->fields_file) { + if (j->fields_file_lost) + return 0; + + j->fields_file = ordered_hashmap_first(j->files); + if (!j->fields_file) + return 0; + + j->fields_hash_table_index = 0; + j->fields_offset = 0; + } + + for (;;) { + JournalFile *f, *of; + uint64_t m; + Object *o; + size_t sz; + bool found; + + f = j->fields_file; + + if (j->fields_offset == 0) { + bool eof = false; + + /* We are not yet positioned at any field. Let's pick the first one */ + r = journal_file_map_field_hash_table(f); + if (r < 0) + return r; + + m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem); + for (;;) { + if (j->fields_hash_table_index >= m) { + /* Reached the end of the hash table, go to the next file. */ + eof = true; + break; + } + + j->fields_offset = le64toh(f->field_hash_table[j->fields_hash_table_index].head_hash_offset); + + if (j->fields_offset != 0) + break; + + /* Empty hash table bucket, go to next one */ + j->fields_hash_table_index++; + } + + if (eof) { + /* Proceed with next file */ + j->fields_file = ordered_hashmap_next(j->files, f->path); + if (!j->fields_file) { + *field = NULL; + return 0; + } + + j->fields_offset = 0; + j->fields_hash_table_index = 0; + continue; + } + + } else { + /* We are already positioned at a field. 
If so, let's figure out the next field from it */ + + r = journal_file_move_to_object(f, OBJECT_FIELD, j->fields_offset, &o); + if (r < 0) + return r; + + j->fields_offset = le64toh(o->field.next_hash_offset); + if (j->fields_offset == 0) { + /* Reached the end of the hash table chain */ + j->fields_hash_table_index++; + continue; + } + } + + /* We use OBJECT_UNUSED here, so that the iterator below doesn't remove our mmap window */ + r = journal_file_move_to_object(f, OBJECT_UNUSED, j->fields_offset, &o); + if (r < 0) + return r; + + /* Because we used OBJECT_UNUSED above, we need to do our type check manually */ + if (o->object.type != OBJECT_FIELD) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "%s:offset " OFSfmt ": object has type %i, expected %i", + f->path, j->fields_offset, + o->object.type, OBJECT_FIELD); + + sz = le64toh(o->object.size) - offsetof(Object, field.payload); + + /* Let's see if we already returned this field name before. */ + found = false; + ORDERED_HASHMAP_FOREACH(of, j->files) { + if (of == f) + break; + + /* Skip this file if it didn't have any fields indexed */ + if (JOURNAL_HEADER_CONTAINS(of->header, n_fields) && le64toh(of->header->n_fields) <= 0) + continue; + + if (!JOURNAL_HEADER_KEYED_HASH(f->header) && !JOURNAL_HEADER_KEYED_HASH(of->header)) + r = journal_file_find_field_object_with_hash(of, o->field.payload, sz, + le64toh(o->field.hash), NULL, NULL); + else + r = journal_file_find_field_object(of, o->field.payload, sz, NULL, NULL); + if (r < 0) + return r; + if (r > 0) { + found = true; + break; + } + } + + if (found) + continue; + + /* Check if this is really a valid string containing no NUL byte */ + if (memchr(o->field.payload, 0, sz)) + return -EBADMSG; + + if (j->data_threshold > 0 && sz > j->data_threshold) + sz = j->data_threshold; + + if (!GREEDY_REALLOC(j->fields_buffer, sz + 1)) + return -ENOMEM; + + memcpy(j->fields_buffer, o->field.payload, sz); + j->fields_buffer[sz] = 0; + + if (!field_is_valid(j->fields_buffer))
+ return -EBADMSG; + + *field = j->fields_buffer; + return 1; + } +} + +_public_ void sd_journal_restart_fields(sd_journal *j) { + if (!j || journal_origin_changed(j)) + return; + + j->fields_file = NULL; + j->fields_hash_table_index = 0; + j->fields_offset = 0; + j->fields_file_lost = false; +} + +_public_ int sd_journal_reliable_fd(sd_journal *j) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + return !j->on_network; +} + +static char *lookup_field(const char *field, void *userdata) { + sd_journal *j = ASSERT_PTR(userdata); + const void *data; + size_t size, d; + int r; + + assert(field); + + r = sd_journal_get_data(j, field, &data, &size); + if (r < 0 || + size > REPLACE_VAR_MAX) + return strdup(field); + + d = strlen(field) + 1; + + return strndup((const char*) data + d, size - d); +} + +_public_ int sd_journal_get_catalog(sd_journal *j, char **ret) { + const void *data; + size_t size; + sd_id128_t id; + _cleanup_free_ char *text = NULL, *cid = NULL; + char *t; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(ret, -EINVAL); + + r = sd_journal_get_data(j, "MESSAGE_ID", &data, &size); + if (r < 0) + return r; + + cid = strndup((const char*) data + 11, size - 11); + if (!cid) + return -ENOMEM; + + r = sd_id128_from_string(cid, &id); + if (r < 0) + return r; + + r = catalog_get(secure_getenv("SYSTEMD_CATALOG") ?: CATALOG_DATABASE, id, &text); + if (r < 0) + return r; + + t = replace_var(text, lookup_field, j); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +_public_ int sd_journal_get_catalog_for_message_id(sd_id128_t id, char **ret) { + assert_return(ret, -EINVAL); + + return catalog_get(CATALOG_DATABASE, id, ret); +} + +_public_ int sd_journal_set_data_threshold(sd_journal *j, size_t sz) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + + j->data_threshold = sz; + return 0; +} + +_public_ int 
sd_journal_get_data_threshold(sd_journal *j, size_t *sz) { + assert_return(j, -EINVAL); + assert_return(!journal_origin_changed(j), -ECHILD); + assert_return(sz, -EINVAL); + + *sz = j->data_threshold; + return 0; +} + +_public_ int sd_journal_has_runtime_files(sd_journal *j) { + assert_return(j, -EINVAL); + + return j->has_runtime_files; +} + +_public_ int sd_journal_has_persistent_files(sd_journal *j) { + assert_return(j, -EINVAL); + + return j->has_persistent_files; +} diff --git a/src/libsystemd/sd-journal/test-audit-type.c b/src/libsystemd/sd-journal/test-audit-type.c new file mode 100644 index 0000000..1d5003b --- /dev/null +++ b/src/libsystemd/sd-journal/test-audit-type.c @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "audit-type.h" +#include "tests.h" + +static void print_audit_label(int i) { + const char *name; + + name = audit_type_name_alloca(i); + /* This is a separate function only because of alloca */ + printf("%i → %s → %s\n", i, audit_type_to_string(i), name); +} + +TEST(audit_type) { + int i; + + for (i = 0; i <= AUDIT_KERNEL; i++) + print_audit_label(i); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/sd-journal/test-catalog.c b/src/libsystemd/sd-journal/test-catalog.c new file mode 100644 index 0000000..603952e --- /dev/null +++ b/src/libsystemd/sd-journal/test-catalog.c @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "catalog.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static char** catalog_dirs = NULL; +static const char *no_catalog_dirs[] = { + "/bin/hopefully/with/no/catalog", + NULL +}; + +static OrderedHashmap* test_import(const char* contents, ssize_t size, int code) { + 
_cleanup_(unlink_tempfilep) char name[] = "/tmp/test-catalog.XXXXXX"; + _cleanup_close_ int fd = -EBADF; + OrderedHashmap *h; + + if (size < 0) + size = strlen(contents); + + assert_se(h = ordered_hashmap_new(&catalog_hash_ops)); + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se(write(fd, contents, size) == size); + + assert_se(catalog_import_file(h, name) == code); + + return h; +} + +static void test_catalog_import_invalid(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + + h = test_import("xxx", -1, -EINVAL); + assert_se(ordered_hashmap_isempty(h)); +} + +static void test_catalog_import_badid(void) { + _unused_ _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededede\n" \ +"Subject: message\n" \ +"\n" \ +"payload\n"; + h = test_import(input, -1, -EINVAL); +} + +static void test_catalog_import_one(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + char *payload; + + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: message\n" \ +"\n" \ +"payload\n"; + const char *expect = +"Subject: message\n" \ +"\n" \ +"payload\n"; + + h = test_import(input, -1, 0); + assert_se(ordered_hashmap_size(h) == 1); + + ORDERED_HASHMAP_FOREACH(payload, h) { + printf("expect: %s\n", expect); + printf("actual: %s\n", payload); + assert_se(streq(expect, payload)); + } +} + +static void test_catalog_import_merge(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + char *payload; + + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"payload\n" \ +"\n" \ +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"\n" \ +"override payload\n"; + + const char *combined = 
+"Subject: override subject\n" \ +"X-Header: hello\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"override payload\n"; + + h = test_import(input, -1, 0); + assert_se(ordered_hashmap_size(h) == 1); + + ORDERED_HASHMAP_FOREACH(payload, h) + assert_se(streq(combined, payload)); +} + +static void test_catalog_import_merge_no_body(void) { + _cleanup_ordered_hashmap_free_free_free_ OrderedHashmap *h = NULL; + char *payload; + + const char *input = +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"payload\n" \ +"\n" \ +"-- 0027229ca0644181a76c4e92458afaff dededededededededededededededed\n" \ +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"\n"; + + const char *combined = +"Subject: override subject\n" \ +"X-Header: hello\n" \ +"Subject: message\n" \ +"Defined-By: me\n" \ +"\n" \ +"payload\n"; + + h = test_import(input, -1, 0); + assert_se(ordered_hashmap_size(h) == 1); + + ORDERED_HASHMAP_FOREACH(payload, h) + assert_se(streq(combined, payload)); +} + +static void test_catalog_update(const char *database) { + int r; + + /* Test what happens if there are no files. */ + r = catalog_update(database, NULL, NULL); + assert_se(r == 0); + + /* Test what happens if there are no files in the directory. */ + r = catalog_update(database, NULL, no_catalog_dirs); + assert_se(r == 0); + + /* Make sure that we at least have some files loaded or the + * catalog_list below will fail. 
*/ + r = catalog_update(database, NULL, (const char * const *) catalog_dirs); + assert_se(r == 0); +} + +static void test_catalog_file_lang(void) { + _cleanup_free_ char *lang = NULL, *lang2 = NULL, *lang3 = NULL, *lang4 = NULL; + + assert_se(catalog_file_lang("systemd.de_DE.catalog", &lang) == 1); + assert_se(streq(lang, "de_DE")); + + assert_se(catalog_file_lang("systemd..catalog", &lang2) == 0); + assert_se(lang2 == NULL); + + assert_se(catalog_file_lang("systemd.fr.catalog", &lang2) == 1); + assert_se(streq(lang2, "fr")); + + assert_se(catalog_file_lang("systemd.fr.catalog.gz", &lang3) == 0); + assert_se(lang3 == NULL); + + assert_se(catalog_file_lang("systemd.01234567890123456789012345678901.catalog", &lang3) == 0); + assert_se(lang3 == NULL); + + assert_se(catalog_file_lang("systemd.0123456789012345678901234567890.catalog", &lang3) == 1); + assert_se(streq(lang3, "0123456789012345678901234567890")); + + assert_se(catalog_file_lang("/x/y/systemd.catalog", &lang4) == 0); + assert_se(lang4 == NULL); + + assert_se(catalog_file_lang("/x/y/systemd.ru_RU.catalog", &lang4) == 1); + assert_se(streq(lang4, "ru_RU")); +} + +int main(int argc, char *argv[]) { + _cleanup_(unlink_tempfilep) char database[] = "/tmp/test-catalog.XXXXXX"; + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *text = NULL; + int r; + + setlocale(LC_ALL, "de_DE.UTF-8"); + + test_setup_logging(LOG_DEBUG); + + /* If test-catalog is located at the build directory, then use catalogs in that. + * If it is not, e.g. installed by systemd-tests package, then use installed catalogs. 
*/ + catalog_dirs = STRV_MAKE(get_catalog_dir()); + + assert_se(access(catalog_dirs[0], F_OK) >= 0); + log_notice("Using catalog directory '%s'", catalog_dirs[0]); + + test_catalog_file_lang(); + + test_catalog_import_invalid(); + test_catalog_import_badid(); + test_catalog_import_one(); + test_catalog_import_merge(); + test_catalog_import_merge_no_body(); + + assert_se((fd = mkostemp_safe(database)) >= 0); + + test_catalog_update(database); + + r = catalog_list(stdout, database, true); + assert_se(r >= 0); + + r = catalog_list(stdout, database, false); + assert_se(r >= 0); + + assert_se(catalog_get(database, SD_MESSAGE_COREDUMP, &text) >= 0); + printf(">>>%s<<<\n", text); + + return 0; +} diff --git a/src/libsystemd/sd-journal/test-journal-append.c b/src/libsystemd/sd-journal/test-journal-append.c new file mode 100644 index 0000000..24b98c8 --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-append.c @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "chattr-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "journal-file-util.h" +#include "log.h" +#include "mmap-cache.h" +#include "parse-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "strv.h" +#include "terminal-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static int journal_append_message(JournalFile *mj, const char *message) { + struct iovec iovec; + struct dual_timestamp ts; + + assert(mj); + assert(message); + + dual_timestamp_now(&ts); + iovec = IOVEC_MAKE_STRING(message); + return journal_file_append_entry( + mj, + &ts, + /* boot_id= */ NULL, + &iovec, + /* n_iovec= */ 1, + /* seqnum= */ NULL, + /* seqnum_id= */ NULL, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); +} + +static int journal_corrupt_and_append(uint64_t start_offset, uint64_t step) { + _cleanup_(mmap_cache_unrefp) MMapCache *mmap_cache = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tempdir = 
NULL; + _cleanup_(journal_file_offline_closep) JournalFile *mj = NULL; + uint64_t start, end; + int r; + + mmap_cache = mmap_cache_new(); + assert_se(mmap_cache); + + /* journal_file_open() requires a valid machine id */ + if (sd_id128_get_machine(NULL) < 0) + return log_tests_skipped("No valid machine ID found"); + + assert_se(mkdtemp_malloc("/tmp/journal-append-XXXXXX", &tempdir) >= 0); + assert_se(chdir(tempdir) >= 0); + (void) chattr_path(tempdir, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + log_debug("Opening journal %s/system.journal", tempdir); + + r = journal_file_open( + /* fd= */ -1, + "system.journal", + O_RDWR|O_CREAT, + JOURNAL_COMPRESS, + 0644, + /* compress_threshold_bytes= */ UINT64_MAX, + /* metrics= */ NULL, + mmap_cache, + /* template= */ NULL, + &mj); + if (r < 0) + return log_error_errno(r, "Failed to open the journal: %m"); + + assert_se(mj); + + /* Add a couple of initial messages */ + for (int i = 0; i < 10; i++) { + _cleanup_free_ char *message = NULL; + + assert_se(asprintf(&message, "MESSAGE=Initial message %d", i) >= 0); + r = journal_append_message(mj, message); + if (r < 0) + return log_error_errno(r, "Failed to write to the journal: %m"); + } + + start = start_offset == UINT64_MAX ? 
random_u64() % mj->last_stat.st_size : start_offset; + end = (uint64_t) mj->last_stat.st_size; + + /* Print the initial offset at which we start flipping bits, which can be + * later used to reproduce a potential fail */ + log_info("Start offset: %" PRIu64 ", corrupt-step: %" PRIu64, start, step); + fflush(stdout); + + if (start >= end) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Start offset >= journal size, can't continue"); + + for (uint64_t offset = start; offset < end; offset += step) { + _cleanup_free_ char *message = NULL; + uint8_t b; + + /* Corrupt the journal file: force the lowest bit of the byte at this offset to 1 (a no-op for bytes where that bit is already set) */ + r = pread(mj->fd, &b, 1, offset); + assert_se(r == 1); + b |= 0x1; + r = pwrite(mj->fd, &b, 1, offset); + assert_se(r == 1); + + /* Close and reopen the journal to flush all caches and remap + * the corrupted journal */ + mj = journal_file_offline_close(mj); + r = journal_file_open( + /* fd= */ -1, + "system.journal", + O_RDWR|O_CREAT, + JOURNAL_COMPRESS, + 0644, + /* compress_threshold_bytes= */ UINT64_MAX, + /* metrics= */ NULL, + mmap_cache, + /* template= */ NULL, + &mj); + if (r < 0) { + /* The corrupted journal might get rejected during reopening + * if it's corrupted enough (especially its header), so + * treat this as a success if it doesn't crash */ + log_info_errno(r, "Failed to reopen the journal: %m"); + break; + } + + /* Try to write something to the (possibly corrupted) journal */ + assert_se(asprintf(&message, "MESSAGE=Hello world %" PRIu64, offset) >= 0); + r = journal_append_message(mj, message); + if (r < 0) { + /* We care only about crashes or sanitizer errors, + * failed write without any crash is a success */ + log_info_errno(r, "Failed to write to the journal: %m"); + break; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + uint64_t start_offset = UINT64_MAX; + uint64_t iterations = 100; + uint64_t iteration_step = 1; + uint64_t corrupt_step = 31; + bool sequential = false, run_one = false; + int c, r; + + test_setup_logging(LOG_DEBUG); + 
+ enum { + ARG_START_OFFSET = 0x1000, + ARG_ITERATIONS, + ARG_ITERATION_STEP, + ARG_CORRUPT_STEP, + ARG_SEQUENTIAL, + ARG_RUN_ONE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "start-offset", required_argument, NULL, ARG_START_OFFSET }, + { "iterations", required_argument, NULL, ARG_ITERATIONS }, + { "iteration-step", required_argument, NULL, ARG_ITERATION_STEP }, + { "corrupt-step", required_argument, NULL, ARG_CORRUPT_STEP }, + { "sequential", no_argument, NULL, ARG_SEQUENTIAL }, + { "run-one", required_argument, NULL, ARG_RUN_ONE }, + {} + }; + + assert_se(argc >= 0); + assert_se(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + printf("Syntax:\n" + " %s [OPTION...]\n" + "Options:\n" + " --start-offset=OFFSET Offset at which to start corrupting the journal\n" + " (default: random offset is picked, unless\n" + " --sequential is used - in that case we use 0 + iteration)\n" + " --iterations=ITER Number of iterations to perform before exiting\n" + " (default: 100)\n" + " --iteration-step=STEP Iteration step (default: 1)\n" + " --corrupt-step=STEP Corrupt every n-th byte starting from OFFSET (default: 31)\n" + " --sequential Go through offsets sequentially instead of picking\n" + " a random one on each iteration. 
If set, we go through\n" + " offsets <0; ITER), or 0) + /* Reached the end of the journal file */ + break; + } + + return EXIT_SUCCESS; +} diff --git a/src/libsystemd/sd-journal/test-journal-enum.c b/src/libsystemd/sd-journal/test-journal-enum.c new file mode 100644 index 0000000..03fe8e2 --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-enum.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-journal.h" + +#include "journal-internal.h" +#include "log.h" +#include "macro.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + unsigned n = 0; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY) >= 0); + + assert_se(sd_journal_add_match(j, "_TRANSPORT=syslog", 0) >= 0); + assert_se(sd_journal_add_match(j, "_UID=0", 0) >= 0); + + SD_JOURNAL_FOREACH_BACKWARDS(j) { + const void *d; + size_t l; + + assert_se(sd_journal_get_data(j, "MESSAGE", &d, &l) >= 0); + + printf("%.*s\n", (int) l, (char*) d); + + n++; + if (n >= 10) + break; + } + + return 0; +} diff --git a/src/libsystemd/sd-journal/test-journal-file.c b/src/libsystemd/sd-journal/test-journal-file.c new file mode 100644 index 0000000..729de1f --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-file.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "journal-file.h" +#include "tests.h" +#include "user-util.h" + +static void test_journal_file_parse_uid_from_filename_simple( + const char *path, + uid_t expected_uid, + int expected_error) { + + uid_t uid = UID_INVALID; + int r; + + log_info("testing %s", path); + + r = journal_file_parse_uid_from_filename(path, &uid); + assert_se(r == expected_error); + if (r < 0) + assert_se(uid == UID_INVALID); + else + assert_se(uid == expected_uid); +} + +TEST(journal_file_parse_uid_from_filename) { + + 
test_journal_file_parse_uid_from_filename_simple("/var/log/journal/", 0, -EISDIR); + + /* The helper should return -EREMOTE for any filenames that don't look like an online or offline user + * journals. This includes archived and disposed journal files. */ + test_journal_file_parse_uid_from_filename_simple("/etc/password", 0, -EREMOTE); + test_journal_file_parse_uid_from_filename_simple("system.journal", 0, -EREMOTE); + test_journal_file_parse_uid_from_filename_simple("user-1000@0005d26980bdce6e-2f2a4939583822ef.journal~", 0, -EREMOTE); + test_journal_file_parse_uid_from_filename_simple("user-1000@xxx-yyy-zzz.journal", 0, -EREMOTE); + + test_journal_file_parse_uid_from_filename_simple("user-1000.journal", 1000, 0); + test_journal_file_parse_uid_from_filename_simple("user-foo.journal", 0, -EINVAL); + test_journal_file_parse_uid_from_filename_simple("user-65535.journal", 0, -ENXIO); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/sd-journal/test-journal-flush.c b/src/libsystemd/sd-journal/test-journal-flush.c new file mode 100644 index 0000000..3f07835 --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-flush.c @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "journal-file-util.h" +#include "journal-internal.h" +#include "logs-show.h" +#include "macro.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static void test_journal_flush_one(int argc, char *argv[]) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + _cleanup_free_ char *fn = NULL; + _cleanup_(rm_rf_physical_and_freep) char *dn = NULL; + _cleanup_(journal_file_offline_closep) JournalFile *new_journal = NULL; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + unsigned n, limit; + int r; + + assert_se(m = mmap_cache_new()); + 
assert_se(mkdtemp_malloc("/var/tmp/test-journal-flush.XXXXXX", &dn) >= 0); + (void) chattr_path(dn, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + assert_se(fn = path_join(dn, "test.journal")); + + r = journal_file_open(-1, fn, O_CREAT|O_RDWR, 0, 0644, 0, NULL, m, NULL, &new_journal); + assert_se(r >= 0); + + if (argc > 1) + r = sd_journal_open_files(&j, (const char **) strv_skip(argv, 1), 0); + else + r = sd_journal_open(&j, 0); + assert_se(r == 0); + + sd_journal_set_data_threshold(j, 0); + + n = 0; + limit = slow_tests_enabled() ? 10000 : 1000; + SD_JOURNAL_FOREACH(j) { + Object *o; + JournalFile *f; + + f = j->current_file; + assert_se(f && f->current_offset > 0); + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + log_error_errno(r, "journal_file_move_to_object failed: %m"); + assert_se(r >= 0); + + r = journal_file_copy_entry(f, new_journal, o, f->current_offset, NULL, NULL); + if (r < 0) + log_warning_errno(r, "journal_file_copy_entry failed: %m"); + assert_se(r >= 0 || + IN_SET(r, -EBADMSG, /* corrupted file */ + -EPROTONOSUPPORT, /* unsupported compression */ + -EIO, /* file rotated */ + -EREMCHG)); /* clock rollback */ + + if (++n >= limit) + break; + } + + if (n == 0) + return (void) log_tests_skipped("No journal entry found"); + + /* Open the new journal before archiving and offlining the file. */ + sd_journal_close(j); + assert_se(sd_journal_open_directory(&j, dn, 0) >= 0); + + /* Read the online journal. 
*/ + assert_se(sd_journal_seek_tail(j) >= 0); + assert_se(sd_journal_step_one(j, 0) > 0); + printf("current_journal: %s (%i)\n", j->current_file->path, j->current_file->fd); + assert_se(show_journal_entry(stdout, j, OUTPUT_EXPORT, 0, 0, NULL, NULL, NULL, &(dual_timestamp) {}, &(sd_id128_t) {}) >= 0); + + uint64_t p; + assert_se(journal_file_tail_end_by_mmap(j->current_file, &p) >= 0); + for (uint64_t q = ALIGN64(p + 1); q < (uint64_t) j->current_file->last_stat.st_size; q = ALIGN64(q + 1)) { + Object *o; + + r = journal_file_move_to_object(j->current_file, OBJECT_UNUSED, q, &o); + assert_se(IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)); + } + + /* Archive and offline file. */ + assert_se(journal_file_archive(new_journal, NULL) >= 0); + assert_se(journal_file_set_offline(new_journal, /* wait = */ true) >= 0); + + /* Read the archived and offline journal. */ + for (uint64_t q = ALIGN64(p + 1); q < (uint64_t) j->current_file->last_stat.st_size; q = ALIGN64(q + 1)) { + Object *o; + + r = journal_file_move_to_object(j->current_file, OBJECT_UNUSED, q, &o); + assert_se(IN_SET(r, -EBADMSG, -EADDRNOTAVAIL, -EIDRM)); + } +} + +TEST(journal_flush) { + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + test_journal_flush_one(saved_argc, saved_argv); +} + +TEST(journal_flush_compact) { + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + test_journal_flush_one(saved_argc, saved_argv); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/libsystemd/sd-journal/test-journal-init.c b/src/libsystemd/sd-journal/test-journal-init.c new file mode 100644 index 0000000..c8a1977 --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-init.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-journal.h" + +#include "chattr-util.h" +#include "journal-internal.h" +#include "log.h" +#include "parse-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + sd_journal *j; + int r, i, 
I = 100; + char t[] = "/var/tmp/journal-stream-XXXXXX"; + + test_setup_logging(LOG_DEBUG); + + if (argc >= 2) { + r = safe_atoi(argv[1], &I); + if (r < 0) + log_info("Could not parse loop count argument. Using default."); + } + + log_info("Running %d loops", I); + + assert_se(mkdtemp(t)); + (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + for (i = 0; i < I; i++) { + r = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); + assert_se(r == 0); + + sd_journal_close(j); + + r = sd_journal_open_directory(&j, t, 0); + assert_se(r == 0); + + assert_se(sd_journal_seek_head(j) == 0); + assert_se(j->current_location.type == LOCATION_HEAD); + + r = safe_fork("(journal-fork-test)", FORK_WAIT|FORK_LOG, NULL); + if (r == 0) { + assert_se(j); + assert_se(sd_journal_get_realtime_usec(j, NULL) == -ECHILD); + assert_se(sd_journal_seek_tail(j) == -ECHILD); + assert_se(j->current_location.type == LOCATION_HEAD); + sd_journal_close(j); + _exit(EXIT_SUCCESS); + } + + assert_se(r >= 0); + + sd_journal_close(j); + + j = NULL; + r = sd_journal_open_directory(&j, t, SD_JOURNAL_LOCAL_ONLY); + assert_se(r == -EINVAL); + assert_se(j == NULL); + } + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + return 0; +} diff --git a/src/libsystemd/sd-journal/test-journal-interleaving.c b/src/libsystemd/sd-journal/test-journal-interleaving.c new file mode 100644 index 0000000..8aeef8f --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-interleaving.c @@ -0,0 +1,737 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-id128.h" +#include "sd-journal.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "iovec-util.h" +#include "journal-file-util.h" +#include "journal-vacuum.h" +#include "log.h" +#include "logs-show.h" +#include "parse-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "tests.h" + +/* This program tests skipping around in a multi-file journal. 
*/ + +static bool arg_keep = false; +static dual_timestamp previous_ts = {}; + +_noreturn_ static void log_assert_errno(const char *text, int error, const char *file, unsigned line, const char *func) { + log_internal(LOG_CRIT, error, file, line, func, + "'%s' failed at %s:%u (%s): %m", text, file, line, func); + abort(); +} + +#define assert_ret(expr) \ + do { \ + int _r_ = (expr); \ + if (_unlikely_(_r_ < 0)) \ + log_assert_errno(#expr, -_r_, PROJECT_FILE, __LINE__, __func__); \ + } while (false) + +static JournalFile *test_open_internal(const char *name, JournalFileFlags flags) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + JournalFile *f; + + m = mmap_cache_new(); + assert_se(m != NULL); + + assert_ret(journal_file_open(-1, name, O_RDWR|O_CREAT, flags, 0644, UINT64_MAX, NULL, m, NULL, &f)); + return f; +} + +static JournalFile *test_open(const char *name) { + return test_open_internal(name, JOURNAL_COMPRESS); +} + +static JournalFile *test_open_strict(const char *name) { + return test_open_internal(name, JOURNAL_COMPRESS | JOURNAL_STRICT_ORDER); +} + +static void test_close(JournalFile *f) { + (void) journal_file_offline_close(f); +} + +static void test_done(const char *t) { + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + log_info("------------------------------------------------------------"); +} + +static void append_number(JournalFile *f, int n, const sd_id128_t *boot_id, uint64_t *seqnum, uint64_t *ret_offset) { + _cleanup_free_ char *p = NULL, *q = NULL; + dual_timestamp ts; + struct iovec iovec[2]; + size_t n_iov = 0; + + dual_timestamp_now(&ts); + + if (ts.monotonic <= previous_ts.monotonic) + ts.monotonic = previous_ts.monotonic + 1; + + if (ts.realtime <= previous_ts.realtime) + ts.realtime = previous_ts.realtime + 1; + + previous_ts = ts; + + assert_se(asprintf(&p, "NUMBER=%d", n) >= 
0); + iovec[n_iov++] = IOVEC_MAKE_STRING(p); + + if (boot_id) { + assert_se(q = strjoin("_BOOT_ID=", SD_ID128_TO_STRING(*boot_id))); + iovec[n_iov++] = IOVEC_MAKE_STRING(q); + } + + assert_ret(journal_file_append_entry(f, &ts, boot_id, iovec, n_iov, seqnum, NULL, NULL, ret_offset)); +} + +static void append_unreferenced_data(JournalFile *f, const sd_id128_t *boot_id) { + _cleanup_free_ char *q = NULL; + dual_timestamp ts; + struct iovec iovec; + + assert(boot_id); + + ts.monotonic = usec_sub_unsigned(previous_ts.monotonic, 10); + ts.realtime = usec_sub_unsigned(previous_ts.realtime, 10); + + assert_se(q = strjoin("_BOOT_ID=", SD_ID128_TO_STRING(*boot_id))); + iovec = IOVEC_MAKE_STRING(q); + + assert_se(journal_file_append_entry(f, &ts, boot_id, &iovec, 1, NULL, NULL, NULL, NULL) == -EREMCHG); +} + +static void test_check_number(sd_journal *j, int n) { + sd_id128_t boot_id; + const void *d; + _cleanup_free_ char *k = NULL; + size_t l; + int x; + + assert_se(sd_journal_get_monotonic_usec(j, NULL, &boot_id) >= 0); + assert_ret(sd_journal_get_data(j, "NUMBER", &d, &l)); + assert_se(k = strndup(d, l)); + printf("%s %s (expected=%i)\n", SD_ID128_TO_STRING(boot_id), k, n); + + assert_se(safe_atoi(k + STRLEN("NUMBER="), &x) >= 0); + assert_se(n == x); +} + +static void test_check_numbers_down(sd_journal *j, int count) { + int i; + + for (i = 1; i <= count; i++) { + int r; + test_check_number(j, i); + assert_ret(r = sd_journal_next(j)); + if (i == count) + assert_se(r == 0); + else + assert_se(r == 1); + } + +} + +static void test_check_numbers_up(sd_journal *j, int count) { + for (int i = count; i >= 1; i--) { + int r; + test_check_number(j, i); + assert_ret(r = sd_journal_previous(j)); + if (i == 1) + assert_se(r == 0); + else + assert_se(r == 1); + } + +} + +static void setup_sequential(void) { + JournalFile *f1, *f2, *f3; + sd_id128_t id; + + f1 = test_open("one.journal"); + f2 = test_open("two.journal"); + f3 = test_open("three.journal"); + 
assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_number(f1, 1, &id, NULL, NULL); + append_number(f1, 2, &id, NULL, NULL); + append_number(f1, 3, &id, NULL, NULL); + append_number(f2, 4, &id, NULL, NULL); + assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_number(f2, 5, &id, NULL, NULL); + append_number(f2, 6, &id, NULL, NULL); + append_number(f3, 7, &id, NULL, NULL); + append_number(f3, 8, &id, NULL, NULL); + assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_number(f3, 9, &id, NULL, NULL); + test_close(f1); + test_close(f2); + test_close(f3); +} + +static void setup_interleaved(void) { + JournalFile *f1, *f2, *f3; + sd_id128_t id; + + f1 = test_open("one.journal"); + f2 = test_open("two.journal"); + f3 = test_open("three.journal"); + assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_number(f1, 1, &id, NULL, NULL); + append_number(f2, 2, &id, NULL, NULL); + append_number(f3, 3, &id, NULL, NULL); + append_number(f1, 4, &id, NULL, NULL); + append_number(f2, 5, &id, NULL, NULL); + append_number(f3, 6, &id, NULL, NULL); + append_number(f1, 7, &id, NULL, NULL); + append_number(f2, 8, &id, NULL, NULL); + append_number(f3, 9, &id, NULL, NULL); + test_close(f1); + test_close(f2); + test_close(f3); +} + +static void setup_unreferenced_data(void) { + JournalFile *f1, *f2, *f3; + sd_id128_t id; + + /* For issue #29275. 
*/ + + f1 = test_open_strict("one.journal"); + f2 = test_open_strict("two.journal"); + f3 = test_open_strict("three.journal"); + assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_number(f1, 1, &id, NULL, NULL); + append_number(f1, 2, &id, NULL, NULL); + append_number(f1, 3, &id, NULL, NULL); + assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_unreferenced_data(f1, &id); + append_number(f2, 4, &id, NULL, NULL); + append_number(f2, 5, &id, NULL, NULL); + append_number(f2, 6, &id, NULL, NULL); + assert_se(sd_id128_randomize(&id) >= 0); + log_info("boot_id: %s", SD_ID128_TO_STRING(id)); + append_unreferenced_data(f2, &id); + append_number(f3, 7, &id, NULL, NULL); + append_number(f3, 8, &id, NULL, NULL); + append_number(f3, 9, &id, NULL, NULL); + test_close(f1); + test_close(f2); + test_close(f3); +} + +static void mkdtemp_chdir_chattr(char *path) { + assert_se(mkdtemp(path)); + assert_se(chdir(path) >= 0); + + /* Speed up things a bit on btrfs, ensuring that CoW is turned off for all files created in our + * directory during the test run */ + (void) chattr_path(path, FS_NOCOW_FL, FS_NOCOW_FL, NULL); +} + +static void test_skip_one(void (*setup)(void)) { + char t[] = "/var/tmp/journal-skip-XXXXXX"; + sd_journal *j; + int r; + + mkdtemp_chdir_chattr(t); + + setup(); + + /* Seek to head, iterate down. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_next(j) == 1); /* pointing to the first entry */ + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to head, iterate down. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_next(j) == 1); /* pointing to the first entry */ + assert_se(sd_journal_previous(j) == 0); /* no-op */ + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to head twice, iterate down. 
*/ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_next(j) == 1); /* pointing to the first entry */ + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_next(j) == 1); /* pointing to the first entry */ + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to head, move to previous, then iterate down. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_previous(j) == 0); /* no-op */ + assert_se(sd_journal_next(j) == 1); /* pointing to the first entry */ + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to head, walk several steps, then iterate down. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_previous(j) == 0); /* no-op */ + assert_se(sd_journal_previous(j) == 0); /* no-op */ + assert_se(sd_journal_previous(j) == 0); /* no-op */ + assert_se(sd_journal_next(j) == 1); /* pointing to the first entry */ + assert_se(sd_journal_previous(j) == 0); /* no-op */ + assert_se(sd_journal_previous(j) == 0); /* no-op */ + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to tail, iterate up. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_previous(j) == 1); /* pointing to the last entry */ + test_check_numbers_up(j, 9); + sd_journal_close(j); + + /* Seek to tail twice, iterate up. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_previous(j) == 1); /* pointing to the last entry */ + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_previous(j) == 1); /* pointing to the last entry */ + test_check_numbers_up(j, 9); + sd_journal_close(j); + + /* Seek to tail, move to next, then iterate up. 
*/ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_next(j) == 0); /* no-op */ + assert_se(sd_journal_previous(j) == 1); /* pointing to the last entry */ + test_check_numbers_up(j, 9); + sd_journal_close(j); + + /* Seek to tail, walk several steps, then iterate up. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_next(j) == 0); /* no-op */ + assert_se(sd_journal_next(j) == 0); /* no-op */ + assert_se(sd_journal_next(j) == 0); /* no-op */ + assert_se(sd_journal_previous(j) == 1); /* pointing to the last entry. */ + assert_se(sd_journal_next(j) == 0); /* no-op */ + assert_se(sd_journal_next(j) == 0); /* no-op */ + test_check_numbers_up(j, 9); + sd_journal_close(j); + + /* Seek to tail, skip to head, iterate down. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_previous_skip(j, 9) == 9); /* pointing to the first entry. */ + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to tail, skip to head in a more complex way, then iterate down. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_tail(j)); + assert_se(sd_journal_next(j) == 0); + assert_se(sd_journal_previous_skip(j, 4) == 4); + assert_se(sd_journal_previous_skip(j, 5) == 5); + assert_se(sd_journal_previous(j) == 0); + assert_se(sd_journal_previous_skip(j, 5) == 0); + assert_se(sd_journal_next(j) == 1); + assert_se(sd_journal_previous_skip(j, 5) == 1); + assert_se(sd_journal_next(j) == 1); + assert_se(sd_journal_next(j) == 1); + assert_se(sd_journal_previous(j) == 1); + assert_se(sd_journal_next(j) == 1); + assert_se(sd_journal_next(j) == 1); + assert_se(sd_journal_previous_skip(j, 5) == 3); + test_check_numbers_down(j, 9); + sd_journal_close(j); + + /* Seek to head, skip to tail, iterate up. 
*/ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_next_skip(j, 9) == 9); + test_check_numbers_up(j, 9); + sd_journal_close(j); + + /* Seek to head, skip to tail in a more complex way, then iterate up. */ + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_ret(sd_journal_seek_head(j)); + assert_se(sd_journal_previous(j) == 0); + assert_se(sd_journal_next_skip(j, 4) == 4); + assert_se(sd_journal_next_skip(j, 5) == 5); + assert_se(sd_journal_next(j) == 0); + assert_se(sd_journal_next_skip(j, 5) == 0); + assert_se(sd_journal_previous(j) == 1); + assert_se(sd_journal_next_skip(j, 5) == 1); + assert_se(sd_journal_previous(j) == 1); + assert_se(sd_journal_previous(j) == 1); + assert_se(sd_journal_next(j) == 1); + assert_se(sd_journal_previous(j) == 1); + assert_se(sd_journal_previous(j) == 1); + assert_se((r = sd_journal_next_skip(j, 5)) == 3); /* parenthesized so that r holds the skip count rather than the result of the comparison */ + test_check_numbers_up(j, 9); + sd_journal_close(j); + + test_done(t); +} + +TEST(skip) { + test_skip_one(setup_sequential); + test_skip_one(setup_interleaved); +} + +static void test_boot_id_one(void (*setup)(void), size_t n_boots_expected) { + char t[] = "/var/tmp/journal-boot-id-XXXXXX"; + sd_journal *j; + _cleanup_free_ BootId *boots = NULL; + size_t n_boots; + + mkdtemp_chdir_chattr(t); + + setup(); + + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_se(journal_get_boots(j, &boots, &n_boots) >= 0); + assert_se(boots); + assert_se(n_boots == n_boots_expected); + sd_journal_close(j); + + FOREACH_ARRAY(b, boots, n_boots) { + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_se(journal_find_boot_by_id(j, b->id) == 1); + sd_journal_close(j); + } + + for (int i = - (int) n_boots + 1; i <= (int) n_boots; i++) { + sd_id128_t id; + + assert_ret(sd_journal_open_directory(&j, t, 0)); + assert_se(journal_find_boot_by_offset(j, i, &id) == 1); + if (i <= 0) + assert_se(sd_id128_equal(id, boots[n_boots + i - 1].id)); + else + 
assert_se(sd_id128_equal(id, boots[i - 1].id)); + sd_journal_close(j); + } + + test_done(t); +} + +TEST(boot_id) { + test_boot_id_one(setup_sequential, 3); + test_boot_id_one(setup_unreferenced_data, 3); +} + +static void test_sequence_numbers_one(void) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + char t[] = "/var/tmp/journal-seq-XXXXXX"; + JournalFile *one, *two; + uint64_t seqnum = 0; + sd_id128_t seqnum_id; + + m = mmap_cache_new(); + assert_se(m != NULL); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "one.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0644, + UINT64_MAX, NULL, m, NULL, &one) == 0); + + append_number(one, 1, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 1); + append_number(one, 2, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 2); + + assert_se(one->header->state == STATE_ONLINE); + assert_se(!sd_id128_equal(one->header->file_id, one->header->machine_id)); + assert_se(!sd_id128_equal(one->header->file_id, one->header->tail_entry_boot_id)); + assert_se(sd_id128_equal(one->header->file_id, one->header->seqnum_id)); + + memcpy(&seqnum_id, &one->header->seqnum_id, sizeof(sd_id128_t)); + + assert_se(journal_file_open(-1, "two.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0644, + UINT64_MAX, NULL, m, one, &two) == 0); + + assert_se(two->header->state == STATE_ONLINE); + assert_se(!sd_id128_equal(two->header->file_id, one->header->file_id)); + assert_se(sd_id128_equal(two->header->machine_id, one->header->machine_id)); + assert_se(sd_id128_is_null(two->header->tail_entry_boot_id)); /* Not written yet. */ + assert_se(sd_id128_equal(two->header->seqnum_id, one->header->seqnum_id)); + + append_number(two, 3, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 3); + append_number(two, 4, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 4); + + /* Verify tail_entry_boot_id. 
*/ + assert_se(sd_id128_equal(two->header->tail_entry_boot_id, one->header->tail_entry_boot_id)); + + test_close(two); + + append_number(one, 5, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 5); + + append_number(one, 6, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 6); + + test_close(one); + + /* If the machine-id is not initialized, the header file verification + * (which happens when re-opening a journal file) will fail. */ + if (sd_id128_get_machine(NULL) >= 0) { + /* restart server */ + seqnum = 0; + + assert_se(journal_file_open(-1, "two.journal", O_RDWR, JOURNAL_COMPRESS, 0, + UINT64_MAX, NULL, m, NULL, &two) == 0); + + assert_se(sd_id128_equal(two->header->seqnum_id, seqnum_id)); + + append_number(two, 7, NULL, &seqnum, NULL); + printf("seqnum=%"PRIu64"\n", seqnum); + assert_se(seqnum == 5); + + /* So..., here we have the same seqnum in two files with the + * same seqnum_id. */ + + test_close(two); + } + + test_done(t); +} + +TEST(sequence_numbers) { + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + test_sequence_numbers_one(); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + test_sequence_numbers_one(); +} + +static int expected_result(uint64_t needle, const uint64_t *candidates, const uint64_t *offset, size_t n, direction_t direction, uint64_t *ret) { + switch (direction) { + case DIRECTION_DOWN: + for (size_t i = 0; i < n; i++) { + if (candidates[i] == 0) { + *ret = 0; + return 0; + } + if (needle <= candidates[i]) { + *ret = offset[i]; + return 1; + } + } + *ret = 0; + return 0; + + case DIRECTION_UP: + for (size_t i = 0; i < n; i++) + if (needle < candidates[i] || candidates[i] == 0) { + if (i == 0) { + *ret = 0; + return 0; + } + *ret = offset[i - 1]; + return 1; + } + *ret = offset[n - 1]; + return 1; + + default: + assert_not_reached(); + } +} + +static void verify(JournalFile *f, const uint64_t *seqnum, const uint64_t *offset, size_t n) { + 
uint64_t p, q; + int r, e; + + /* by seqnum (sequential) */ + for (uint64_t i = 0; i < n + 2; i++) { + p = 0; + r = journal_file_move_to_entry_by_seqnum(f, i, DIRECTION_DOWN, NULL, &p); + e = expected_result(i, seqnum, offset, n, DIRECTION_DOWN, &q); + assert_se(r == e); + assert_se(p == q); + + p = 0; + r = journal_file_move_to_entry_by_seqnum(f, i, DIRECTION_UP, NULL, &p); + e = expected_result(i, seqnum, offset, n, DIRECTION_UP, &q); + assert_se(r == e); + assert_se(p == q); + } + + /* by seqnum (random) */ + for (size_t trial = 0; trial < 3 * n; trial++) { + uint64_t i = random_u64_range(n + 2); + + p = 0; + r = journal_file_move_to_entry_by_seqnum(f, i, DIRECTION_DOWN, NULL, &p); + e = expected_result(i, seqnum, offset, n, DIRECTION_DOWN, &q); + assert_se(r == e); + assert_se(p == q); + } + for (size_t trial = 0; trial < 3 * n; trial++) { + uint64_t i = random_u64_range(n + 2); + + p = 0; + r = journal_file_move_to_entry_by_seqnum(f, i, DIRECTION_UP, NULL, &p); + e = expected_result(i, seqnum, offset, n, DIRECTION_UP, &q); + assert_se(r == e); + assert_se(p == q); + } + + /* by offset (sequential) */ + for (size_t i = 0; i < n; i++) { + p = 0; + r = journal_file_move_to_entry_by_offset(f, offset[i] - 1, DIRECTION_DOWN, NULL, &p); + e = expected_result(offset[i] - 1, offset, offset, n, DIRECTION_DOWN, &q); + assert_se(r == e); + assert_se(p == q); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, offset[i], DIRECTION_DOWN, NULL, &p); + e = expected_result(offset[i], offset, offset, n, DIRECTION_DOWN, &q); + assert_se(r == e); + assert_se(p == q); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, offset[i] + 1, DIRECTION_DOWN, NULL, &p); + e = expected_result(offset[i] + 1, offset, offset, n, DIRECTION_DOWN, &q); + assert_se(r == e); + assert_se(p == q); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, offset[i] - 1, DIRECTION_UP, NULL, &p); + e = expected_result(offset[i] - 1, offset, offset, n, DIRECTION_UP, &q); + assert_se(r == e); + 
assert_se(p == q); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, offset[i], DIRECTION_UP, NULL, &p); + e = expected_result(offset[i], offset, offset, n, DIRECTION_UP, &q); + assert_se(r == e); + assert_se(p == q); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, offset[i] + 1, DIRECTION_UP, NULL, &p); + e = expected_result(offset[i] + 1, offset, offset, n, DIRECTION_UP, &q); + assert_se(r == e); + assert_se(p == q); + } + + /* by offset (random) */ + for (size_t trial = 0; trial < 3 * n; trial++) { + uint64_t i = offset[0] - 1 + random_u64_range(offset[n-1] - offset[0] + 2); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, i, DIRECTION_DOWN, NULL, &p); + e = expected_result(i, offset, offset, n, DIRECTION_DOWN, &q); + assert_se(r == e); + assert_se(p == q); + } + for (size_t trial = 0; trial < 3 * n; trial++) { + uint64_t i = offset[0] - 1 + random_u64_range(offset[n-1] - offset[0] + 2); + + p = 0; + r = journal_file_move_to_entry_by_offset(f, i, DIRECTION_UP, NULL, &p); + e = expected_result(i, offset, offset, n, DIRECTION_UP, &q); + assert_se(r == e); + assert_se(p == q); + } +} + +static void test_generic_array_bisect_one(size_t n, size_t num_corrupted) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + char t[] = "/var/tmp/journal-seq-XXXXXX"; + _cleanup_free_ uint64_t *seqnum = NULL, *offset = NULL; + JournalFile *f; + + log_info("/* %s(%zu, %zu) */", __func__, n, num_corrupted); + + assert_se(m = mmap_cache_new()); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0644, + UINT64_MAX, NULL, m, NULL, &f) == 0); + + assert_se(seqnum = new0(uint64_t, n)); + assert_se(offset = new0(uint64_t, n)); + + for (size_t i = 0; i < n; i++) { + append_number(f, i, NULL, seqnum + i, offset + i); + if (i == 0) { + assert_se(seqnum[i] > 0); + assert_se(offset[i] > 0); + } else { + assert_se(seqnum[i] > seqnum[i-1]); + assert_se(offset[i] > offset[i-1]); + } + } + + verify(f, 
seqnum, offset, n); + + /* Reset chain cache. */ + assert_se(journal_file_move_to_entry_by_offset(f, offset[0], DIRECTION_DOWN, NULL, NULL) > 0); + + /* make journal corrupted by clearing seqnum. */ + for (size_t i = n - num_corrupted; i < n; i++) { + Object *o; + + assert_se(journal_file_move_to_object(f, OBJECT_ENTRY, offset[i], &o) >= 0); + assert_se(o); + o->entry.seqnum = 0; + seqnum[i] = 0; + } + + verify(f, seqnum, offset, n); + + test_close(f); + test_done(t); +} + +TEST(generic_array_bisect) { + for (size_t n = 1; n < 10; n++) + for (size_t m = 1; m <= n; m++) + test_generic_array_bisect_one(n, m); + + test_generic_array_bisect_one(100, 40); +} + +static int intro(void) { + /* journal_file_open() requires a valid machine id */ + if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + arg_keep = saved_argc > 1; + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/libsystemd/sd-journal/test-journal-match.c b/src/libsystemd/sd-journal/test-journal-match.c new file mode 100644 index 0000000..571a88c --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-match.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "journal-internal.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_free_ char *t; + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_journal_open(&j, 0) >= 0); + + assert_se(sd_journal_add_match(j, "foobar", 0) < 0); + assert_se(sd_journal_add_match(j, "foobar=waldo", 0) < 0); + assert_se(sd_journal_add_match(j, "", 0) < 0); + assert_se(sd_journal_add_match(j, "=", 0) < 0); + assert_se(sd_journal_add_match(j, "=xxxxx", 0) < 0); + assert_se(sd_journal_add_match(j, (uint8_t[4]){'A', '=', '\1', '\2'}, 4) >= 0); + assert_se(sd_journal_add_match(j, 
(uint8_t[5]){'B', '=', 'C', '\0', 'D'}, 5) >= 0); + assert_se(sd_journal_add_match(j, "HALLO=WALDO", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=mmmm", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=xxxxx", 0) >= 0); + assert_se(sd_journal_add_match(j, "HALLO=", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=xxxxx", 0) >= 0); + assert_se(sd_journal_add_match(j, "QUUX=yyyyy", 0) >= 0); + assert_se(sd_journal_add_match(j, "PIFF=paff", 0) >= 0); + + assert_se(sd_journal_add_disjunction(j) >= 0); + + assert_se(sd_journal_add_match(j, "ONE=one", 0) >= 0); + assert_se(sd_journal_add_match(j, "ONE=two", 0) >= 0); + assert_se(sd_journal_add_match(j, "TWO=two", 0) >= 0); + + assert_se(sd_journal_add_conjunction(j) >= 0); + + assert_se(sd_journal_add_match(j, "L4_1=yes", 0) >= 0); + assert_se(sd_journal_add_match(j, "L4_1=ok", 0) >= 0); + assert_se(sd_journal_add_match(j, "L4_2=yes", 0) >= 0); + assert_se(sd_journal_add_match(j, "L4_2=ok", 0) >= 0); + + assert_se(sd_journal_add_disjunction(j) >= 0); + + assert_se(sd_journal_add_match(j, "L3=yes", 0) >= 0); + assert_se(sd_journal_add_match(j, "L3=ok", 0) >= 0); + + assert_se(t = journal_make_match_string(j)); + + printf("resulting match expression is: %s\n", t); + + assert_se(streq(t, "(((L3=ok OR L3=yes) OR ((L4_2=ok OR L4_2=yes) AND (L4_1=ok OR L4_1=yes))) AND ((TWO=two AND (ONE=two OR ONE=one)) OR (PIFF=paff AND (QUUX=yyyyy OR QUUX=xxxxx OR QUUX=mmmm) AND (HALLO= OR HALLO=WALDO) AND B=C\\000D AND A=\\001\\002)))")); + + return 0; +} diff --git a/src/libsystemd/sd-journal/test-journal-send.c b/src/libsystemd/sd-journal/test-journal-send.c new file mode 100644 index 0000000..ca1fe7c --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-send.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-journal.h" + +#include "fileio.h" +#include "journal-send.h" +#include "macro.h" +#include "memory-util.h" +#include "tests.h" + 
+TEST(journal_print) { + assert_se(sd_journal_print(LOG_INFO, "XXX") == 0); + assert_se(sd_journal_print(LOG_INFO, "%s", "YYY") == 0); + assert_se(sd_journal_print(LOG_INFO, "X%4094sY", "ZZZ") == 0); + assert_se(sd_journal_print(LOG_INFO, "X%*sY", (int) LONG_LINE_MAX - 8 - 3, "ZZZ") == 0); + assert_se(sd_journal_print(LOG_INFO, "X%*sY", (int) LONG_LINE_MAX - 8 - 2, "ZZZ") == -ENOBUFS); +} + +TEST(journal_send) { + _cleanup_free_ char *huge = NULL; + +#define HUGE_SIZE (4096*1024) + assert_se(huge = malloc(HUGE_SIZE)); + + /* utf-8 and non-utf-8, message-less and message-ful iovecs */ + struct iovec graph1[] = { + {(char*) "GRAPH=graph", STRLEN("GRAPH=graph")} + }; + struct iovec graph2[] = { + {(char*) "GRAPH=graph\n", STRLEN("GRAPH=graph\n")} + }; + struct iovec message1[] = { + {(char*) "MESSAGE=graph", STRLEN("MESSAGE=graph")} + }; + struct iovec message2[] = { + {(char*) "MESSAGE=graph\n", STRLEN("MESSAGE=graph\n")} + }; + + assert_se(sd_journal_print(LOG_INFO, "piepapo") == 0); + + assert_se(sd_journal_send("MESSAGE=foobar", + "VALUE=%i", 7, + NULL) == 0); + + errno = ENOENT; + assert_se(sd_journal_perror("Foobar") == 0); + + assert_se(sd_journal_perror("") == 0); + + memcpy(huge, "HUGE=", STRLEN("HUGE=")); + memset(&huge[STRLEN("HUGE=")], 'x', HUGE_SIZE - STRLEN("HUGE=") - 1); + huge[HUGE_SIZE - 1] = '\0'; + + assert_se(sd_journal_send("MESSAGE=Huge field attached", + huge, + NULL) == 0); + + assert_se(sd_journal_send("MESSAGE=uiui", + "VALUE=A", + "VALUE=B", + "VALUE=C", + "SINGLETON=1", + "OTHERVALUE=X", + "OTHERVALUE=Y", + "WITH_BINARY=this is a binary value \a", + NULL) == 0); + + syslog(LOG_NOTICE, "Hello World!"); + + assert_se(sd_journal_print(LOG_NOTICE, "Hello World") == 0); + + assert_se(sd_journal_send("MESSAGE=Hello World!", + "MESSAGE_ID=52fb62f99e2c49d89cfbf9d6de5e3555", + "PRIORITY=5", + "HOME=%s", getenv("HOME"), + "TERM=%s", getenv("TERM"), + "PAGE_SIZE=%li", sysconf(_SC_PAGESIZE), + "N_CPUS=%li", sysconf(_SC_NPROCESSORS_ONLN), + NULL) == 0); 
+ + assert_se(sd_journal_sendv(graph1, 1) == 0); + assert_se(sd_journal_sendv(graph2, 1) == 0); + assert_se(sd_journal_sendv(message1, 1) == 0); + assert_se(sd_journal_sendv(message2, 1) == 0); + + /* test without location fields */ +#undef sd_journal_sendv + assert_se(sd_journal_sendv(graph1, 1) == 0); + assert_se(sd_journal_sendv(graph2, 1) == 0); + assert_se(sd_journal_sendv(message1, 1) == 0); + assert_se(sd_journal_sendv(message2, 1) == 0); + + /* The above syslog() opens a fd which is stored in libc, and the valgrind reports the fd is + * leaked when we do not call closelog(). */ + closelog(); +} + +static int outro(void) { + /* Sleep a bit to make it easy for journald to collect metadata. */ + sleep(1); + + close_journal_fd(); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_FULL(LOG_INFO, NULL, outro); diff --git a/src/libsystemd/sd-journal/test-journal-stream.c b/src/libsystemd/sd-journal/test-journal-stream.c new file mode 100644 index 0000000..3a370ef --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-stream.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-journal.h" + +#include "alloc-util.h" +#include "chattr-util.h" +#include "iovec-util.h" +#include "journal-file-util.h" +#include "journal-internal.h" +#include "log.h" +#include "macro.h" +#include "parse-util.h" +#include "rm-rf.h" +#include "tests.h" + +#define N_ENTRIES 200 + +static void verify_contents(sd_journal *j, unsigned skip) { + unsigned i; + + assert_se(j); + + i = 0; + SD_JOURNAL_FOREACH(j) { + const void *d; + char *k, *c; + size_t l; + unsigned u = 0; + + assert_se(sd_journal_get_cursor(j, &k) >= 0); + printf("cursor: %s\n", k); + free(k); + + assert_se(sd_journal_get_data(j, "MAGIC", &d, &l) >= 0); + printf("\t%.*s\n", (int) l, (const char*) d); + + assert_se(sd_journal_get_data(j, "NUMBER", &d, &l) >= 0); + assert_se(k = strndup(d, l)); + printf("\t%s\n", k); + + if (skip > 0) { + assert_se(safe_atou(k + 7, &u) >= 0); 
+ assert_se(i == u); + i += skip; + } + + free(k); + + assert_se(sd_journal_get_cursor(j, &c) >= 0); + assert_se(sd_journal_test_cursor(j, c) > 0); + free(c); + } + + if (skip > 0) + assert_se(i == N_ENTRIES); +} + +static void run_test(void) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + JournalFile *one, *two, *three; + char t[] = "/var/tmp/journal-stream-XXXXXX"; + unsigned i; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + char *z; + const void *data; + size_t l; + dual_timestamp previous_ts = DUAL_TIMESTAMP_NULL; + + m = mmap_cache_new(); + assert_se(m != NULL); + + assert_se(mkdtemp(t)); + assert_se(chdir(t) >= 0); + (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + assert_se(journal_file_open(-1, "one.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0666, UINT64_MAX, NULL, m, NULL, &one) == 0); + assert_se(journal_file_open(-1, "two.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0666, UINT64_MAX, NULL, m, NULL, &two) == 0); + assert_se(journal_file_open(-1, "three.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0666, UINT64_MAX, NULL, m, NULL, &three) == 0); + + for (i = 0; i < N_ENTRIES; i++) { + char *p, *q; + dual_timestamp ts; + struct iovec iovec[2]; + + dual_timestamp_now(&ts); + + if (ts.monotonic <= previous_ts.monotonic) + ts.monotonic = previous_ts.monotonic + 1; + + if (ts.realtime <= previous_ts.realtime) + ts.realtime = previous_ts.realtime + 1; + + previous_ts = ts; + + assert_se(asprintf(&p, "NUMBER=%u", i) >= 0); + iovec[0] = IOVEC_MAKE(p, strlen(p)); + + assert_se(asprintf(&q, "MAGIC=%s", i % 5 == 0 ? 
"quux" : "waldo") >= 0); + + iovec[1] = IOVEC_MAKE(q, strlen(q)); + + if (i % 10 == 0) + assert_se(journal_file_append_entry(three, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0); + else { + if (i % 3 == 0) + assert_se(journal_file_append_entry(two, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0); + + assert_se(journal_file_append_entry(one, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0); + } + + free(p); + free(q); + } + + (void) journal_file_offline_close(one); + (void) journal_file_offline_close(two); + (void) journal_file_offline_close(three); + + assert_se(sd_journal_open_directory(&j, t, 0) >= 0); + + assert_se(sd_journal_add_match(j, "MAGIC=quux", 0) >= 0); + SD_JOURNAL_FOREACH_BACKWARDS(j) { + _cleanup_free_ char *c; + + assert_se(sd_journal_get_data(j, "NUMBER", &data, &l) >= 0); + printf("\t%.*s\n", (int) l, (const char*) data); + + assert_se(sd_journal_get_cursor(j, &c) >= 0); + assert_se(sd_journal_test_cursor(j, c) > 0); + } + + SD_JOURNAL_FOREACH(j) { + _cleanup_free_ char *c; + + assert_se(sd_journal_get_data(j, "NUMBER", &data, &l) >= 0); + printf("\t%.*s\n", (int) l, (const char*) data); + + assert_se(sd_journal_get_cursor(j, &c) >= 0); + assert_se(sd_journal_test_cursor(j, c) > 0); + } + + sd_journal_flush_matches(j); + + verify_contents(j, 1); + + printf("NEXT TEST\n"); + assert_se(sd_journal_add_match(j, "MAGIC=quux", 0) >= 0); + + assert_se(z = journal_make_match_string(j)); + printf("resulting match expression is: %s\n", z); + free(z); + + verify_contents(j, 5); + + printf("NEXT TEST\n"); + sd_journal_flush_matches(j); + assert_se(sd_journal_add_match(j, "MAGIC=waldo", 0) >= 0); + assert_se(sd_journal_add_match(j, "NUMBER=10", 0) >= 0); + assert_se(sd_journal_add_match(j, "NUMBER=11", 0) >= 0); + assert_se(sd_journal_add_match(j, "NUMBER=12", 0) >= 0); + + assert_se(z = journal_make_match_string(j)); + printf("resulting match expression is: %s\n", z); + free(z); + + verify_contents(j, 0); + + assert_se(sd_journal_query_unique(j, 
"NUMBER") >= 0); + SD_JOURNAL_FOREACH_UNIQUE(j, data, l) + printf("%.*s\n", (int) l, (const char*) data); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); +} + +int main(int argc, char *argv[]) { + + /* journal_file_open() requires a valid machine id */ + if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + test_setup_logging(LOG_DEBUG); + + /* Run this test multiple times with different configurations of features. */ + + assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "0", 1) >= 0); + run_test(); + + assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "1", 1) >= 0); + run_test(); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + run_test(); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + run_test(); + + return 0; +} diff --git a/src/libsystemd/sd-journal/test-journal-verify.c b/src/libsystemd/sd-journal/test-journal-verify.c new file mode 100644 index 0000000..edce440 --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal-verify.c @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "chattr-util.h" +#include "fd-util.h" +#include "iovec-util.h" +#include "journal-file-util.h" +#include "journal-verify.h" +#include "log.h" +#include "mmap-cache.h" +#include "rm-rf.h" +#include "strv.h" +#include "terminal-util.h" +#include "tests.h" + +#define N_ENTRIES 6000 +#define RANDOM_RANGE 77 + +static void bit_toggle(const char *fn, uint64_t p) { + uint8_t b; + ssize_t r; + int fd; + + fd = open(fn, O_RDWR|O_CLOEXEC); + assert_se(fd >= 0); + + r = pread(fd, &b, 1, p/8); + assert_se(r == 1); + + b ^= 1 << (p % 8); + + r = pwrite(fd, &b, 1, p/8); + assert_se(r == 1); + + safe_close(fd); +} + +static int raw_verify(const char *fn, const char *verification_key) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + JournalFile *f; + int r; + + m = mmap_cache_new(); + assert_se(m != NULL); + + r = journal_file_open( + /* fd= */ -1, 
+ fn, + O_RDONLY, + JOURNAL_COMPRESS|(verification_key ? JOURNAL_SEAL : 0), + 0666, + /* compress_threshold_bytes= */ UINT64_MAX, + /* metrics= */ NULL, + m, + /* template= */ NULL, + &f); + if (r < 0) + return r; + + r = journal_file_verify(f, verification_key, NULL, NULL, NULL, false); + (void) journal_file_close(f); + + return r; +} + +static int run_test(const char *verification_key, ssize_t max_iterations) { /* builds a journal of N_ENTRIES entries, verifies it, then toggles bits one at a time and re-verifies; max_iterations < 0 means "toggle every bit up to the end of the file" */ + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + char t[] = "/var/tmp/journal-XXXXXX"; + struct stat st; + JournalFile *f; + JournalFile *df; + usec_t from = 0, to = 0, total = 0; + uint64_t start, end; + int r; + + m = mmap_cache_new(); + assert_se(m != NULL); + + /* journal_file_open() requires a valid machine id */ + if (sd_id128_get_machine(NULL) < 0) + return log_tests_skipped("No valid machine ID found"); + + test_setup_logging(LOG_DEBUG); + + assert_se(mkdtemp(t)); + assert_se(chdir(t) >= 0); + (void) chattr_path(t, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + + log_info("Generating a test journal"); + + assert_se(journal_file_open( + /* fd= */ -1, + "test.journal", + O_RDWR|O_CREAT, + JOURNAL_COMPRESS|(verification_key ? JOURNAL_SEAL : 0), + 0666, + /* compress_threshold_bytes= */ UINT64_MAX, + /* metrics= */ NULL, + m, + /* template= */ NULL, + &df) == 0); + + for (size_t n = 0; n < N_ENTRIES; n++) { + _cleanup_free_ char *test = NULL; + struct iovec iovec; + struct dual_timestamp ts; + + dual_timestamp_now(&ts); + assert_se(asprintf(&test, "RANDOM=%li", random() % RANDOM_RANGE) >= 0); /* asprintf() returns -1 on failure, which is truthy, so the result must be compared explicitly */ + iovec = IOVEC_MAKE_STRING(test); + assert_se(journal_file_append_entry( + df, + &ts, + /* boot_id= */ NULL, + &iovec, + /* n_iovec= */ 1, + /* seqnum= */ NULL, + /* seqnum_id= */ NULL, + /* ret_object= */ NULL, + /* ret_offset= */ NULL) == 0); + } + + (void) journal_file_offline_close(df); + + log_info("Verifying with key: %s", strna(verification_key)); + + assert_se(journal_file_open( + /* fd= */ -1, + "test.journal", + O_RDONLY, + JOURNAL_COMPRESS|(verification_key ? 
JOURNAL_SEAL : 0), + 0666, + /* compress_threshold_bytes= */ UINT64_MAX, + /* metrics= */ NULL, + m, + /* template= */ NULL, + &f) == 0); + journal_file_print_header(f); + journal_file_dump(f); + + assert_se(journal_file_verify(f, verification_key, &from, &to, &total, true) >= 0); + + if (verification_key && JOURNAL_HEADER_SEALED(f->header)) + log_info("=> Validated from %s to %s, %s missing", + FORMAT_TIMESTAMP(from), + FORMAT_TIMESTAMP(to), + FORMAT_TIMESPAN(total > to ? total - to : 0, 0)); + + (void) journal_file_close(f); + assert_se(stat("test.journal", &st) >= 0); + + start = 38448 * 8 + 0; + end = max_iterations < 0 ? (uint64_t)st.st_size * 8 : start + max_iterations; + log_info("Toggling bits %"PRIu64 " to %"PRIu64, start, end); + + for (uint64_t p = start; p < end; p++) { + bit_toggle("test.journal", p); + + if (max_iterations < 0) + log_info("[ %"PRIu64"+%"PRIu64"]", p / 8, p % 8); + + r = raw_verify("test.journal", verification_key); + /* Suppress the notice when running in the limited (CI) mode */ + if (verification_key && max_iterations < 0 && r >= 0) + log_notice(ANSI_HIGHLIGHT_RED ">>>> %"PRIu64" (bit %"PRIu64") can be toggled without detection." 
ANSI_NORMAL, p / 8, p % 8); + + bit_toggle("test.journal", p); + } + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + return 0; +} + +int main(int argc, char *argv[]) { + const char *verification_key = NULL; + int max_iterations = 512; + + if (argc > 1) { + /* Don't limit the number of iterations when the verification key + * is provided on the command line, we want to do that only in CIs */ + verification_key = argv[1]; + max_iterations = -1; + } + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + run_test(verification_key, max_iterations); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + run_test(verification_key, max_iterations); + +#if HAVE_GCRYPT + /* If we're running without any arguments and we're compiled with gcrypt + * check the journal verification stuff with a valid key as well */ + if (argc <= 1) { + verification_key = "c262bd-85187f-0b1b04-877cc5/1c7af8-35a4e900"; + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + run_test(verification_key, max_iterations); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + run_test(verification_key, max_iterations); + } +#endif + + return 0; +} diff --git a/src/libsystemd/sd-journal/test-journal.c b/src/libsystemd/sd-journal/test-journal.c new file mode 100644 index 0000000..96f2b67 --- /dev/null +++ b/src/libsystemd/sd-journal/test-journal.c @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "chattr-util.h" +#include "iovec-util.h" +#include "journal-authenticate.h" +#include "journal-file-util.h" +#include "journal-vacuum.h" +#include "log.h" +#include "rm-rf.h" +#include "tests.h" + +static bool arg_keep = false; + +static void mkdtemp_chdir_chattr(char *path) { + assert_se(mkdtemp(path)); + assert_se(chdir(path) >= 0); + + /* Speed up things a bit on btrfs, ensuring that CoW is turned off for all files created in our + * directory during the test run */ + (void) chattr_path(path, FS_NOCOW_FL, 
FS_NOCOW_FL, NULL); +} + +static void test_non_empty_one(void) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + dual_timestamp ts; + JournalFile *f; + struct iovec iovec; + static const char test[] = "TEST1=1", test2[] = "TEST2=2"; + Object *o, *d; + uint64_t p; + sd_id128_t fake_boot_id; + char t[] = "/var/tmp/journal-XXXXXX"; + + m = mmap_cache_new(); + assert_se(m != NULL); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS|JOURNAL_SEAL, 0666, UINT64_MAX, NULL, m, NULL, &f) == 0); + + assert_se(dual_timestamp_now(&ts)); + assert_se(sd_id128_randomize(&fake_boot_id) == 0); + + iovec = IOVEC_MAKE_STRING(test); + assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); + + iovec = IOVEC_MAKE_STRING(test2); + assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); + + iovec = IOVEC_MAKE_STRING(test); + assert_se(journal_file_append_entry(f, &ts, &fake_boot_id, &iovec, 1, NULL, NULL, NULL, NULL) == 0); + +#if HAVE_GCRYPT + journal_file_append_tag(f); +#endif + journal_file_dump(f); + + assert_se(journal_file_next_entry(f, 0, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 3); + assert_se(sd_id128_equal(o->entry.boot_id, fake_boot_id)); + + assert_se(journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p) == 0); + + assert_se(journal_file_next_entry(f, 0, DIRECTION_DOWN, &o, &p) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_find_data_object(f, test, strlen(test), &d, NULL) == 1); + assert_se(journal_file_move_to_entry_for_data(f, d, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + 
assert_se(journal_file_move_to_entry_for_data(f, d, DIRECTION_UP, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 3); + + assert_se(journal_file_find_data_object(f, test2, strlen(test2), &d, NULL) == 1); + assert_se(journal_file_move_to_entry_for_data(f, d, DIRECTION_UP, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_move_to_entry_for_data(f, d, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_find_data_object(f, "quux", 4, &d, NULL) == 0); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 1, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 1); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 3, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 3); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 2, DIRECTION_DOWN, &o, NULL) == 1); + assert_se(le64toh(o->entry.seqnum) == 2); + + assert_se(journal_file_move_to_entry_by_seqnum(f, 10, DIRECTION_DOWN, &o, NULL) == 0); + + journal_file_rotate(&f, m, JOURNAL_SEAL|JOURNAL_COMPRESS, UINT64_MAX, NULL); + journal_file_rotate(&f, m, JOURNAL_SEAL|JOURNAL_COMPRESS, UINT64_MAX, NULL); + + (void) journal_file_offline_close(f); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + puts("------------------------------------------------------------"); +} + +TEST(non_empty) { + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + test_non_empty_one(); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + test_non_empty_one(); +} + +static void test_empty_one(void) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + JournalFile *f1, *f2, *f3, *f4; + char t[] = "/var/tmp/journal-XXXXXX"; + + m = mmap_cache_new(); + assert_se(m != NULL); + + mkdtemp_chdir_chattr(t); + + 
assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, 0, 0666, UINT64_MAX, NULL, m, NULL, &f1) == 0); + assert_se(journal_file_open(-1, "test-compress.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS, 0666, UINT64_MAX, NULL, m, NULL, &f2) == 0); + assert_se(journal_file_open(-1, "test-seal.journal", O_RDWR|O_CREAT, JOURNAL_SEAL, 0666, UINT64_MAX, NULL, m, NULL, &f3) == 0); + assert_se(journal_file_open(-1, "test-seal-compress.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS|JOURNAL_SEAL, 0666, UINT64_MAX, NULL, m, NULL, &f4) == 0); + + journal_file_print_header(f1); + puts(""); + journal_file_print_header(f2); + puts(""); + journal_file_print_header(f3); + puts(""); + journal_file_print_header(f4); + puts(""); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + (void) journal_file_offline_close(f1); + (void) journal_file_offline_close(f2); + (void) journal_file_offline_close(f3); + (void) journal_file_offline_close(f4); +} + +TEST(empty) { + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + test_empty_one(); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + test_empty_one(); +} + +#if HAVE_COMPRESSION +static bool check_compressed(uint64_t compress_threshold, uint64_t data_size) { + _cleanup_(mmap_cache_unrefp) MMapCache *m = NULL; + dual_timestamp ts; + JournalFile *f; + struct iovec iovec; + Object *o; + uint64_t p; + char t[] = "/var/tmp/journal-XXXXXX"; + char data[2048] = "FIELD="; + bool is_compressed; + int r; + + assert_se(data_size <= sizeof(data)); + + m = mmap_cache_new(); + assert_se(m != NULL); + + mkdtemp_chdir_chattr(t); + + assert_se(journal_file_open(-1, "test.journal", O_RDWR|O_CREAT, JOURNAL_COMPRESS|JOURNAL_SEAL, 0666, compress_threshold, NULL, m, NULL, &f) == 0); + + dual_timestamp_now(&ts); + + iovec = IOVEC_MAKE(data, data_size); + 
assert_se(journal_file_append_entry(f, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); + +#if HAVE_GCRYPT + journal_file_append_tag(f); +#endif + journal_file_dump(f); + + /* We have to partially reimplement some of the dump logic, because the normal next_entry does the + * decompression for us. */ + p = le64toh(f->header->header_size); + for (;;) { + r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); + assert_se(r == 0); + if (o->object.type == OBJECT_DATA) + break; + + assert_se(p < le64toh(f->header->tail_object_offset)); + p = p + ALIGN64(le64toh(o->object.size)); + } + + is_compressed = COMPRESSION_FROM_OBJECT(o) != COMPRESSION_NONE; + + (void) journal_file_offline_close(f); + + log_info("Done..."); + + if (arg_keep) + log_info("Not removing %s", t); + else { + journal_directory_vacuum(".", 3000000, 0, 0, NULL, true); + + assert_se(rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + } + + puts("------------------------------------------------------------"); + + return is_compressed; +} + +static void test_min_compress_size_one(void) { + /* Note that XZ will actually fail to compress anything under 80 bytes, so you have to choose the limits + * carefully */ + + /* DEFAULT_MIN_COMPRESS_SIZE is 512 */ + assert_se(!check_compressed(UINT64_MAX, 255)); + assert_se(check_compressed(UINT64_MAX, 513)); + + /* compress everything */ + assert_se(check_compressed(0, 96)); + assert_se(check_compressed(8, 96)); + + /* Ensure we don't try to compress less than 8 bytes */ + assert_se(!check_compressed(0, 7)); + + /* check boundary conditions */ + assert_se(check_compressed(256, 256)); + assert_se(!check_compressed(256, 255)); +} + +TEST(min_compress_size) { + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0); + test_min_compress_size_one(); + + assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0); + test_min_compress_size_one(); +} +#endif + +static int intro(void) { + arg_keep = saved_argc > 1; + + /* journal_file_open() requires a valid machine id */ + 
if (access("/etc/machine-id", F_OK) != 0) + return log_tests_skipped("/etc/machine-id not found"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/libsystemd/sd-journal/test-mmap-cache.c b/src/libsystemd/sd-journal/test-mmap-cache.c new file mode 100644 index 0000000..ce5ea12 --- /dev/null +++ b/src/libsystemd/sd-journal/test-mmap-cache.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "fd-util.h" +#include "macro.h" +#include "mmap-cache.h" +#include "tests.h" +#include "tmpfile-util.h" + +int main(int argc, char *argv[]) { + MMapFileDescriptor *fx; + int x, y, z, r; + char px[] = "/tmp/testmmapXXXXXXX", py[] = "/tmp/testmmapYXXXXXX", pz[] = "/tmp/testmmapZXXXXXX"; + MMapCache *m; + void *p, *q; + + test_setup_logging(LOG_DEBUG); + + assert_se(m = mmap_cache_new()); + + x = mkostemp_safe(px); + assert_se(x >= 0); + (void) unlink(px); + + assert_se(mmap_cache_add_fd(m, x, PROT_READ, &fx) > 0); + + y = mkostemp_safe(py); + assert_se(y >= 0); + (void) unlink(py); + + z = mkostemp_safe(pz); + assert_se(z >= 0); + (void) unlink(pz); + + r = mmap_cache_fd_get(fx, 0, false, 1, 2, NULL, &p); + assert_se(r >= 0); + + r = mmap_cache_fd_get(fx, 0, false, 2, 2, NULL, &q); + assert_se(r >= 0); + + assert_se((uint8_t*) p + 1 == (uint8_t*) q); + + r = mmap_cache_fd_get(fx, 1, false, 3, 2, NULL, &q); + assert_se(r >= 0); + + assert_se((uint8_t*) p + 2 == (uint8_t*) q); + + r = mmap_cache_fd_get(fx, 0, false, 16ULL*1024ULL*1024ULL, 2, NULL, &p); + assert_se(r >= 0); + + r = mmap_cache_fd_get(fx, 1, false, 16ULL*1024ULL*1024ULL+1, 2, NULL, &q); + assert_se(r >= 0); + + assert_se((uint8_t*) p + 1 == (uint8_t*) q); + + mmap_cache_fd_free(fx); + mmap_cache_unref(m); + + safe_close(x); + safe_close(y); + safe_close(z); + + return 0; +} diff --git a/src/libsystemd/sd-login/sd-login.c b/src/libsystemd/sd-login/sd-login.c new file mode 100644 index 0000000..f9e86c6 --- 
/dev/null +++ b/src/libsystemd/sd-login/sd-login.c @@ -0,0 +1,1323 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-login.h" + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "dirent-util.h" +#include "env-file.h" +#include "escape.h" +#include "extract-word.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "io-util.h" +#include "login-util.h" +#include "macro.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +/* Error codes: + * + * invalid input parameters → -EINVAL + * invalid fd → -EBADF + * process does not exist → -ESRCH + * cgroup does not exist → -ENOENT + * machine, session does not exist → -ENXIO + * requested metadata on object is missing → -ENODATA + */ + +_public_ int sd_pid_get_session(pid_t pid, char **session) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(session, -EINVAL); + + r = cg_pid_get_session(pid, session); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? -ENODATA : r; +} + +_public_ int sd_pid_get_unit(pid_t pid, char **unit) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(unit, -EINVAL); + + r = cg_pid_get_unit(pid, unit); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? -ENODATA : r; +} + +_public_ int sd_pid_get_user_unit(pid_t pid, char **unit) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(unit, -EINVAL); + + r = cg_pid_get_user_unit(pid, unit); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? -ENODATA : r; +} + +_public_ int sd_pid_get_machine_name(pid_t pid, char **name) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(name, -EINVAL); + + r = cg_pid_get_machine_name(pid, name); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? 
-ENODATA : r; +} + +_public_ int sd_pid_get_slice(pid_t pid, char **slice) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(slice, -EINVAL); + + r = cg_pid_get_slice(pid, slice); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? -ENODATA : r; +} + +_public_ int sd_pid_get_user_slice(pid_t pid, char **slice) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(slice, -EINVAL); + + r = cg_pid_get_user_slice(pid, slice); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? -ENODATA : r; +} + +_public_ int sd_pid_get_owner_uid(pid_t pid, uid_t *uid) { + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(uid, -EINVAL); + + r = cg_pid_get_owner_uid(pid, uid); + return IN_SET(r, -ENXIO, -ENOMEDIUM) ? -ENODATA : r; +} + +_public_ int sd_pid_get_cgroup(pid_t pid, char **cgroup) { + char *c; + int r; + + assert_return(pid >= 0, -EINVAL); + assert_return(cgroup, -EINVAL); + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &c); + if (r < 0) + return r; + + /* The internal APIs return the empty string for the root + * cgroup, let's return the "/" in the public APIs instead, as + * that's easier and less ambiguous for people to grok. 
*/ + if (isempty(c)) { + r = free_and_strdup(&c, "/"); + if (r < 0) + return r; + + } + + *cgroup = c; + return 0; +} + +_public_ int sd_pidfd_get_session(int pidfd, char **ret_session) { + _cleanup_free_ char *session = NULL; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_session, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_session(pid, &session); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_session = TAKE_PTR(session); + + return 0; +} + +_public_ int sd_pidfd_get_unit(int pidfd, char **ret_unit) { + _cleanup_free_ char *unit = NULL; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_unit, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_unit(pid, &unit); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_unit = TAKE_PTR(unit); + + return 0; +} + +_public_ int sd_pidfd_get_user_unit(int pidfd, char **ret_unit) { + _cleanup_free_ char *unit = NULL; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_unit, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_user_unit(pid, &unit); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_unit = TAKE_PTR(unit); + + return 0; +} + +_public_ int sd_pidfd_get_machine_name(int pidfd, char **ret_name) { + _cleanup_free_ char *name = NULL; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_name, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_machine_name(pid, &name); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_name = TAKE_PTR(name); + + return 0; +} + +_public_ int sd_pidfd_get_slice(int pidfd, char **ret_slice) { + _cleanup_free_ char *slice = NULL; 
+ pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_slice, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_slice(pid, &slice); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_slice = TAKE_PTR(slice); + + return 0; +} + +_public_ int sd_pidfd_get_user_slice(int pidfd, char **ret_slice) { + _cleanup_free_ char *slice = NULL; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_slice, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_user_slice(pid, &slice); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_slice = TAKE_PTR(slice); + + return 0; +} + +_public_ int sd_pidfd_get_owner_uid(int pidfd, uid_t *ret_uid) { /* pidfd flavour of sd_pid_get_owner_uid(): resolves pidfd to a PID, queries the owner UID, and stores it in *ret_uid only if every step succeeded */ + uid_t uid; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); /* invalid fd -> -EBADF, same as every other sd_pidfd_get_*() call in this file */ + assert_return(ret_uid, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_owner_uid(pid, &uid); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); /* NOTE(review): presumably re-checks that pidfd still refers to pid after the lookup - confirm against pidfd_verify_pid() */ + if (r < 0) + return r; + + *ret_uid = uid; + + return 0; +} + +_public_ int sd_pidfd_get_cgroup(int pidfd, char **ret_cgroup) { + _cleanup_free_ char *cgroup = NULL; + pid_t pid; + int r; + + assert_return(pidfd >= 0, -EBADF); + assert_return(ret_cgroup, -EINVAL); + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + r = sd_pid_get_cgroup(pid, &cgroup); + if (r < 0) + return r; + + r = pidfd_verify_pid(pidfd, pid); + if (r < 0) + return r; + + *ret_cgroup = TAKE_PTR(cgroup); + + return 0; +} + +_public_ int sd_peer_get_session(int fd, char **session) { + struct ucred ucred = UCRED_INVALID; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(session, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_session(ucred.pid, session); +} + +_public_ int sd_peer_get_owner_uid(int fd, uid_t *uid) { + 
struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(uid, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_owner_uid(ucred.pid, uid); +} + +_public_ int sd_peer_get_unit(int fd, char **unit) { + struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(unit, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_unit(ucred.pid, unit); +} + +_public_ int sd_peer_get_user_unit(int fd, char **unit) { + struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(unit, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_user_unit(ucred.pid, unit); +} + +_public_ int sd_peer_get_machine_name(int fd, char **machine) { + struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(machine, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_machine_name(ucred.pid, machine); +} + +_public_ int sd_peer_get_slice(int fd, char **slice) { + struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(slice, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_slice(ucred.pid, slice); +} + +_public_ int sd_peer_get_user_slice(int fd, char **slice) { + struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(slice, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return cg_pid_get_user_slice(ucred.pid, slice); +} + +_public_ int sd_peer_get_cgroup(int fd, char **cgroup) { + struct ucred ucred; + int r; + + assert_return(fd >= 0, -EBADF); + assert_return(cgroup, -EINVAL); + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + return sd_pid_get_cgroup(ucred.pid, cgroup); +} + +static int file_of_uid(uid_t uid, char **p) { + + assert_return(uid_is_valid(uid), -EINVAL); + assert(p); + + if (asprintf(p, "/run/systemd/users/" UID_FMT, 
uid) < 0) + return -ENOMEM; + + return 0; +} + +_public_ int sd_uid_get_state(uid_t uid, char**state) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + assert_return(state, -EINVAL); + + r = file_of_uid(uid, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "STATE", &s); + if (r == -ENOENT) + r = free_and_strdup(&s, "offline"); + if (r < 0) + return r; + if (isempty(s)) + return -EIO; + + *state = TAKE_PTR(s); + return 0; +} + +_public_ int sd_uid_get_display(uid_t uid, char **session) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + assert_return(session, -EINVAL); + + r = file_of_uid(uid, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "DISPLAY", &s); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + *session = TAKE_PTR(s); + + return 0; +} + +_public_ int sd_uid_get_login_time(uid_t uid, uint64_t *usec) { + _cleanup_free_ char *p = NULL, *s = NULL, *rt = NULL; + usec_t t; + int r; + + assert_return(usec, -EINVAL); + + r = file_of_uid(uid, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "STATE", &s, "REALTIME", &rt); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s) || isempty(rt)) + return -EIO; + + if (!STR_IN_SET(s, "active", "online")) + return -ENXIO; + + r = safe_atou64(rt, &t); + if (r < 0) + return r; + + *usec = t; + return 0; +} + +static int file_of_seat(const char *seat, char **_p) { + char *p; + int r; + + assert(_p); + + if (seat) { + if (!filename_is_valid(seat)) + return -EINVAL; + + p = path_join("/run/systemd/seats", seat); + } else { + _cleanup_free_ char *buf = NULL; + + r = sd_session_get_seat(NULL, &buf); + if (r < 0) + return r; + + p = path_join("/run/systemd/seats", buf); + } + if (!p) + return -ENOMEM; + + *_p = TAKE_PTR(p); + return 0; +} + +_public_ int sd_uid_is_on_seat(uid_t uid, int require_active, const char *seat) { + _cleanup_free_ char *filename = NULL, *content = NULL; + int r; + + 
assert_return(uid_is_valid(uid), -EINVAL); + + r = file_of_seat(seat, &filename); + if (r < 0) + return r; + + r = parse_env_file(NULL, filename, + require_active ? "ACTIVE_UID" : "UIDS", + &content); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + if (isempty(content)) + return 0; + + char t[DECIMAL_STR_MAX(uid_t)]; + xsprintf(t, UID_FMT, uid); + + return string_contains_word(content, NULL, t); +} + +static int uid_get_array(uid_t uid, const char *variable, char ***array) { + _cleanup_free_ char *p = NULL, *s = NULL; + char **a; + int r; + + assert(variable); + + r = file_of_uid(uid, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, variable, &s); + if (r == -ENOENT || (r >= 0 && isempty(s))) { + if (array) + *array = NULL; + return 0; + } + if (r < 0) + return r; + + a = strv_split(s, NULL); + if (!a) + return -ENOMEM; + + strv_uniq(a); + r = (int) strv_length(a); + + if (array) + *array = a; + else + strv_free(a); + + return r; +} + +_public_ int sd_uid_get_sessions(uid_t uid, int require_active, char ***sessions) { + return uid_get_array( + uid, + require_active == 0 ? "ONLINE_SESSIONS" : + require_active > 0 ? "ACTIVE_SESSIONS" : + "SESSIONS", + sessions); +} + +_public_ int sd_uid_get_seats(uid_t uid, int require_active, char ***seats) { + return uid_get_array( + uid, + require_active == 0 ? "ONLINE_SEATS" : + require_active > 0 ? 
"ACTIVE_SEATS" : + "SEATS", + seats); +} + +static int file_of_session(const char *session, char **_p) { + char *p; + int r; + + assert(_p); + + if (session) { + if (!session_id_valid(session)) + return -EINVAL; + + p = path_join("/run/systemd/sessions", session); + } else { + _cleanup_free_ char *buf = NULL; + + r = sd_pid_get_session(0, &buf); + if (r < 0) + return r; + + p = path_join("/run/systemd/sessions", buf); + } + + if (!p) + return -ENOMEM; + + *_p = p; + return 0; +} + +_public_ int sd_session_is_active(const char *session) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + r = file_of_session(session, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "ACTIVE", &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -EIO; + + return parse_boolean(s); +} + +_public_ int sd_session_is_remote(const char *session) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + r = file_of_session(session, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "REMOTE", &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + return parse_boolean(s); +} + +_public_ int sd_session_get_state(const char *session, char **state) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + assert_return(state, -EINVAL); + + r = file_of_session(session, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "STATE", &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -EIO; + + *state = TAKE_PTR(s); + + return 0; +} + +_public_ int sd_session_get_uid(const char *session, uid_t *uid) { + int r; + _cleanup_free_ char *p = NULL, *s = NULL; + + assert_return(uid, -EINVAL); + + r = file_of_session(session, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "UID", &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -EIO; + + return parse_uid(s, uid); +} + +static 
int session_get_string(const char *session, const char *field, char **value) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + assert_return(value, -EINVAL); + assert(field); + + r = file_of_session(session, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, field, &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + *value = TAKE_PTR(s); + return 0; +} + +_public_ int sd_session_get_username(const char *session, char **username) { + return session_get_string(session, "USER", username); +} + +_public_ int sd_session_get_seat(const char *session, char **seat) { + return session_get_string(session, "SEAT", seat); +} + +_public_ int sd_session_get_start_time(const char *session, uint64_t *usec) { + _cleanup_free_ char *p = NULL, *s = NULL; + usec_t t; + int r; + + assert_return(usec, -EINVAL); + + r = file_of_session(session, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, "REALTIME", &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -EIO; + + r = safe_atou64(s, &t); + if (r < 0) + return r; + + *usec = t; + return 0; +} + +_public_ int sd_session_get_tty(const char *session, char **tty) { + return session_get_string(session, "TTY", tty); +} + +_public_ int sd_session_get_vt(const char *session, unsigned *vtnr) { + _cleanup_free_ char *vtnr_string = NULL; + unsigned u; + int r; + + assert_return(vtnr, -EINVAL); + + r = session_get_string(session, "VTNR", &vtnr_string); + if (r < 0) + return r; + + r = safe_atou(vtnr_string, &u); + if (r < 0) + return r; + + *vtnr = u; + return 0; +} + +_public_ int sd_session_get_service(const char *session, char **service) { + return session_get_string(session, "SERVICE", service); +} + +_public_ int sd_session_get_type(const char *session, char **type) { + return session_get_string(session, "TYPE", type); +} + +_public_ int sd_session_get_class(const char *session, char **class) { + return 
session_get_string(session, "CLASS", class); +} + +_public_ int sd_session_get_desktop(const char *session, char **desktop) { + _cleanup_free_ char *escaped = NULL; + int r; + ssize_t l; + + assert_return(desktop, -EINVAL); + + r = session_get_string(session, "DESKTOP", &escaped); + if (r < 0) + return r; + + l = cunescape(escaped, 0, desktop); + if (l < 0) + return l; + return 0; +} + +_public_ int sd_session_get_display(const char *session, char **display) { + return session_get_string(session, "DISPLAY", display); +} + +_public_ int sd_session_get_remote_user(const char *session, char **remote_user) { + return session_get_string(session, "REMOTE_USER", remote_user); +} + +_public_ int sd_session_get_remote_host(const char *session, char **remote_host) { + return session_get_string(session, "REMOTE_HOST", remote_host); +} + +_public_ int sd_session_get_leader(const char *session, pid_t *leader) { + _cleanup_free_ char *leader_string = NULL; + pid_t pid; + int r; + + assert_return(leader, -EINVAL); + + r = session_get_string(session, "LEADER", &leader_string); + if (r < 0) + return r; + + r = parse_pid(leader_string, &pid); + if (r < 0) + return r; + + *leader = pid; + return 0; +} + +_public_ int sd_seat_get_active(const char *seat, char **session, uid_t *uid) { + _cleanup_free_ char *p = NULL, *s = NULL, *t = NULL; + int r; + + assert_return(session || uid, -EINVAL); + + r = file_of_seat(seat, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, + "ACTIVE", &s, + "ACTIVE_UID", &t); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + + if (session && !s) + return -ENODATA; + + if (uid && !t) + return -ENODATA; + + if (uid && t) { + r = parse_uid(t, uid); + if (r < 0) + return r; + } + + if (session && s) + *session = TAKE_PTR(s); + + return 0; +} + +_public_ int sd_seat_get_sessions( + const char *seat, + char ***ret_sessions, + uid_t **ret_uids, + unsigned *ret_n_uids) { + + _cleanup_free_ char *fname = NULL, *session_line = NULL, *uid_line 
= NULL; + _cleanup_strv_free_ char **sessions = NULL; + _cleanup_free_ uid_t *uids = NULL; + unsigned n_sessions = 0; + int r; + + r = file_of_seat(seat, &fname); + if (r < 0) + return r; + + r = parse_env_file(NULL, fname, + "SESSIONS", &session_line, + "UIDS", &uid_line); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + + if (session_line) { + sessions = strv_split(session_line, NULL); + if (!sessions) + return -ENOMEM; + + n_sessions = strv_length(sessions); + }; + + if (ret_uids && uid_line) { + uids = new(uid_t, n_sessions); + if (!uids) + return -ENOMEM; + + size_t n = 0; + for (const char *p = uid_line;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + r = parse_uid(word, &uids[n++]); + if (r < 0) + return r; + } + + if (n != n_sessions) + return -EUCLEAN; + } + + if (ret_sessions) + *ret_sessions = TAKE_PTR(sessions); + if (ret_uids) + *ret_uids = TAKE_PTR(uids); + if (ret_n_uids) + *ret_n_uids = n_sessions; + + return n_sessions; +} + +static int seat_get_can(const char *seat, const char *variable) { + _cleanup_free_ char *p = NULL, *s = NULL; + int r; + + assert(variable); + + r = file_of_seat(seat, &p); + if (r < 0) + return r; + + r = parse_env_file(NULL, p, + variable, &s); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + return parse_boolean(s); +} + +_public_ int sd_seat_can_multi_session(const char *seat) { + return true; +} + +_public_ int sd_seat_can_tty(const char *seat) { + return seat_get_can(seat, "CAN_TTY"); +} + +_public_ int sd_seat_can_graphical(const char *seat) { + return seat_get_can(seat, "CAN_GRAPHICAL"); +} + +_public_ int sd_get_seats(char ***seats) { + int r; + + r = get_files_in_directory("/run/systemd/seats/", seats); + if (r == -ENOENT) { + if (seats) + *seats = NULL; + return 0; + } + return r; +} + +_public_ int sd_get_sessions(char ***sessions) { + int r; + + r = 
get_files_in_directory("/run/systemd/sessions/", sessions); + if (r == -ENOENT) { + if (sessions) + *sessions = NULL; + return 0; + } + return r; +} + +_public_ int sd_get_uids(uid_t **users) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + unsigned n = 0; + _cleanup_free_ uid_t *l = NULL; + + d = opendir("/run/systemd/users/"); + if (!d) { + if (errno == ENOENT) { + if (users) + *users = NULL; + return 0; + } + return -errno; + } + + FOREACH_DIRENT_ALL(de, d, return -errno) { + int k; + uid_t uid; + + if (!dirent_is_file(de)) + continue; + + k = parse_uid(de->d_name, &uid); + if (k < 0) + continue; + + if (users) { + if ((unsigned) r >= n) { + uid_t *t; + + n = MAX(16, 2*r); + t = reallocarray(l, n, sizeof(uid_t)); + if (!t) + return -ENOMEM; + + l = t; + } + + assert((unsigned) r < n); + l[r++] = uid; + } else + r++; + } + + if (users) + *users = TAKE_PTR(l); + + return r; +} + +_public_ int sd_get_machine_names(char ***machines) { + _cleanup_strv_free_ char **l = NULL; + char **a, **b; + int r; + + r = get_files_in_directory("/run/systemd/machines/", &l); + if (r == -ENOENT) { + if (machines) + *machines = NULL; + return 0; + } + if (r < 0) + return r; + + if (l) { + r = 0; + + /* Filter out the unit: symlinks */ + for (a = b = l; *a; a++) { + if (startswith(*a, "unit:") || !hostname_is_valid(*a, 0)) + free(*a); + else { + *b = *a; + b++; + r++; + } + } + + *b = NULL; + } + + if (machines) + *machines = TAKE_PTR(l); + + return r; +} + +_public_ int sd_machine_get_class(const char *machine, char **class) { + _cleanup_free_ char *c = NULL; + const char *p; + int r; + + assert_return(class, -EINVAL); + + if (streq(machine, ".host")) { + c = strdup("host"); + if (!c) + return -ENOMEM; + } else { + if (!hostname_is_valid(machine, 0)) + return -EINVAL; + + p = strjoina("/run/systemd/machines/", machine); + r = parse_env_file(NULL, p, "CLASS", &c); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (!c) + return -EIO; + } + + *class = TAKE_PTR(c); + 
return 0; +} + +_public_ int sd_machine_get_ifindices(const char *machine, int **ret_ifindices) { + _cleanup_free_ char *netif_line = NULL; + const char *p; + int r; + + assert_return(hostname_is_valid(machine, 0), -EINVAL); + + p = strjoina("/run/systemd/machines/", machine); + r = parse_env_file(NULL, p, "NETIF", &netif_line); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (!netif_line) { + *ret_ifindices = NULL; + return 0; + } + + _cleanup_strv_free_ char **tt = strv_split(netif_line, NULL); + if (!tt) + return -ENOMEM; + + _cleanup_free_ int *ifindices = NULL; + if (ret_ifindices) { + ifindices = new(int, strv_length(tt)); + if (!ifindices) + return -ENOMEM; + } + + size_t n = 0; + for (size_t i = 0; tt[i]; i++) { + int ind; + + ind = parse_ifindex(tt[i]); + if (ind < 0) + /* Return -EUCLEAN to distinguish from -EINVAL for invalid args */ + return ind == -EINVAL ? -EUCLEAN : ind; + + if (ret_ifindices) + ifindices[n] = ind; + n++; + } + + if (ret_ifindices) + *ret_ifindices = TAKE_PTR(ifindices); + + return n; +} + +static int MONITOR_TO_FD(sd_login_monitor *m) { + return (int) (unsigned long) m - 1; +} + +static sd_login_monitor* FD_TO_MONITOR(int fd) { + return (sd_login_monitor*) (unsigned long) (fd + 1); +} + +_public_ int sd_login_monitor_new(const char *category, sd_login_monitor **m) { + _cleanup_close_ int fd = -EBADF; + bool good = false; + int k; + + assert_return(m, -EINVAL); + + fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (fd < 0) + return -errno; + + if (!category || streq(category, "seat")) { + k = inotify_add_watch(fd, "/run/systemd/seats/", IN_MOVED_TO|IN_DELETE); + if (k < 0) + return -errno; + + good = true; + } + + if (!category || streq(category, "session")) { + k = inotify_add_watch(fd, "/run/systemd/sessions/", IN_MOVED_TO|IN_DELETE); + if (k < 0) + return -errno; + + good = true; + } + + if (!category || streq(category, "uid")) { + k = inotify_add_watch(fd, "/run/systemd/users/", IN_MOVED_TO|IN_DELETE); + if (k < 
0) + return -errno; + + good = true; + } + + if (!category || streq(category, "machine")) { + k = inotify_add_watch(fd, "/run/systemd/machines/", IN_MOVED_TO|IN_DELETE); + if (k < 0) + return -errno; + + good = true; + } + + if (!good) + return -EINVAL; + + *m = FD_TO_MONITOR(TAKE_FD(fd)); + return 0; +} + +_public_ sd_login_monitor* sd_login_monitor_unref(sd_login_monitor *m) { + if (m) + (void) close_nointr(MONITOR_TO_FD(m)); + + return NULL; +} + +_public_ int sd_login_monitor_flush(sd_login_monitor *m) { + int r; + + assert_return(m, -EINVAL); + + r = flush_fd(MONITOR_TO_FD(m)); + if (r < 0) + return r; + + return 0; +} + +_public_ int sd_login_monitor_get_fd(sd_login_monitor *m) { + + assert_return(m, -EINVAL); + + return MONITOR_TO_FD(m); +} + +_public_ int sd_login_monitor_get_events(sd_login_monitor *m) { + + assert_return(m, -EINVAL); + + /* For now we will only return POLLIN here, since we don't + * need anything else ever for inotify. However, let's have + * this API to keep our options open should we later on need + * it. */ + return POLLIN; +} + +_public_ int sd_login_monitor_get_timeout(sd_login_monitor *m, uint64_t *timeout_usec) { + + assert_return(m, -EINVAL); + assert_return(timeout_usec, -EINVAL); + + /* For now we will only return UINT64_MAX, since we don't + * need any timeout. However, let's have this API to keep our + * options open should we later on need it. 
*/ + *timeout_usec = UINT64_MAX; + return 0; +} diff --git a/src/libsystemd/sd-login/test-login.c b/src/libsystemd/sd-login/test-login.c new file mode 100644 index 0000000..819f86f --- /dev/null +++ b/src/libsystemd/sd-login/test-login.c @@ -0,0 +1,334 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-login.h" + +#include "alloc-util.h" +#include "errno-list.h" +#include "fd-util.h" +#include "format-util.h" +#include "log.h" +#include "missing_syscall.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "time-util.h" +#include "user-util.h" + +static char* format_uids(char **buf, uid_t* uids, int count) { + int pos = 0, inc; + size_t size = (DECIMAL_STR_MAX(uid_t) + 1) * count + 1; + + assert_se(*buf = malloc(size)); + + for (int k = 0; k < count; k++) { + sprintf(*buf + pos, "%s"UID_FMT"%n", k > 0 ? " " : "", uids[k], &inc); + pos += inc; + } + + assert_se(pos < (ssize_t)size); + (*buf)[pos] = '\0'; + + return *buf; +} + +static const char *e(int r) { + return r == 0 ? 
"OK" : errno_to_name(r); +} + +TEST(login) { + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + _cleanup_free_ char *pp = NULL, *qq = NULL, + *display_session = NULL, *cgroup = NULL, + *display = NULL, *remote_user = NULL, *remote_host = NULL, + *type = NULL, *class = NULL, *state = NULL, *state2 = NULL, + *seat = NULL, *session = NULL, + *unit = NULL, *user_unit = NULL, *slice = NULL; + _cleanup_close_ int pidfd = -EBADF; + int r; + uid_t u, u2 = UID_INVALID; + char *t, **seats = NULL, **sessions = NULL; + + r = sd_pid_get_unit(0, &unit); + log_info("sd_pid_get_unit(0, …) → %s / \"%s\"", e(r), strnull(unit)); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pid_get_user_unit(0, &user_unit); + log_info("sd_pid_get_user_unit(0, …) → %s / \"%s\"", e(r), strnull(user_unit)); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pid_get_slice(0, &slice); + log_info("sd_pid_get_slice(0, …) → %s / \"%s\"", e(r), strnull(slice)); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pid_get_owner_uid(0, &u2); + log_info("sd_pid_get_owner_uid(0, …) → %s / "UID_FMT, e(r), u2); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pid_get_session(0, &session); + log_info("sd_pid_get_session(0, …) → %s / \"%s\"", e(r), strnull(session)); + + r = sd_pid_get_cgroup(0, &cgroup); + log_info("sd_pid_get_cgroup(0, …) → %s / \"%s\"", e(r), strnull(cgroup)); + assert_se(IN_SET(r, 0, -ENOMEDIUM)); + + pidfd = pidfd_open(getpid_cached(), 0); + if (pidfd >= 0) { + _cleanup_free_ char *cgroup2 = NULL, *session2 = NULL, + *unit2 = NULL, *user_unit2 = NULL, *slice2 = NULL; + + r = sd_pidfd_get_unit(pidfd, &unit2); + log_info("sd_pidfd_get_unit(pidfd, …) → %s / \"%s\"", e(r), strnull(unit2)); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pidfd_get_user_unit(pidfd, &user_unit2); + log_info("sd_pidfd_get_user_unit(pidfd, …) → %s / \"%s\"", e(r), strnull(user_unit2)); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pidfd_get_slice(pidfd, &slice2); + log_info("sd_pidfd_get_slice(pidfd, …) → %s / \"%s\"", e(r), 
strnull(slice2)); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pidfd_get_owner_uid(pidfd, &u2); + log_info("sd_pidfd_get_owner_uid(pidfd, …) → %s / "UID_FMT, e(r), u2); + assert_se(IN_SET(r, 0, -ENODATA)); + + r = sd_pidfd_get_session(pidfd, &session2); + log_info("sd_pidfd_get_session(pidfd, …) → %s / \"%s\"", e(r), strnull(session2)); + + r = sd_pidfd_get_cgroup(pidfd, &cgroup2); + log_info("sd_pidfd_get_cgroup(pidfd, …) → %s / \"%s\"", e(r), strnull(cgroup2)); + assert_se(IN_SET(r, 0, -ENOMEDIUM)); + } + + r = sd_uid_get_display(u2, &display_session); + log_info("sd_uid_get_display("UID_FMT", …) → %s / \"%s\"", u2, e(r), strnull(display_session)); + if (u2 == UID_INVALID) + assert_se(r == -EINVAL); + else + assert_se(IN_SET(r, 0, -ENODATA)); + + assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, pair) == 0); + sd_peer_get_session(pair[0], &pp); + sd_peer_get_session(pair[1], &qq); + assert_se(streq_ptr(pp, qq)); + + r = sd_uid_get_sessions(u2, false, &sessions); + assert_se(t = strv_join(sessions, " ")); + log_info("sd_uid_get_sessions("UID_FMT", …) → %s \"%s\"", u2, e(r), t); + if (u2 == UID_INVALID) + assert_se(r == -EINVAL); + else { + assert_se(r >= 0); + assert_se(r == (int) strv_length(sessions)); + } + sessions = strv_free(sessions); + free(t); + + assert_se(r == sd_uid_get_sessions(u2, false, NULL)); + + r = sd_uid_get_seats(u2, false, &seats); + assert_se(t = strv_join(seats, " ")); + log_info("sd_uid_get_seats("UID_FMT", …) → %s \"%s\"", u2, e(r), t); + if (u2 == UID_INVALID) + assert_se(r == -EINVAL); + else { + assert_se(r >= 0); + assert_se(r == (int) strv_length(seats)); + } + seats = strv_free(seats); + free(t); + + assert_se(r == sd_uid_get_seats(u2, false, NULL)); + + if (session) { + r = sd_session_is_active(session); + if (r == -ENXIO) + log_notice("sd_session_is_active() failed with ENXIO, it seems logind is not running."); + else { + /* All those tests will fail with ENXIO, so let's skip them. 
*/ + + assert_se(r >= 0); + log_info("sd_session_is_active(\"%s\") → %s", session, yes_no(r)); + + r = sd_session_is_remote(session); + assert_se(r >= 0); + log_info("sd_session_is_remote(\"%s\") → %s", session, yes_no(r)); + + r = sd_session_get_state(session, &state); + assert_se(r == 0); + log_info("sd_session_get_state(\"%s\") → \"%s\"", session, state); + + assert_se(sd_session_get_uid(session, &u) >= 0); + log_info("sd_session_get_uid(\"%s\") → "UID_FMT, session, u); + assert_se(u == u2); + + assert_se(sd_session_get_type(session, &type) >= 0); + log_info("sd_session_get_type(\"%s\") → \"%s\"", session, type); + + assert_se(sd_session_get_class(session, &class) >= 0); + log_info("sd_session_get_class(\"%s\") → \"%s\"", session, class); + + r = sd_session_get_display(session, &display); + assert_se(IN_SET(r, 0, -ENODATA)); + log_info("sd_session_get_display(\"%s\") → \"%s\"", session, strna(display)); + + r = sd_session_get_remote_user(session, &remote_user); + assert_se(IN_SET(r, 0, -ENODATA)); + log_info("sd_session_get_remote_user(\"%s\") → \"%s\"", + session, strna(remote_user)); + + r = sd_session_get_remote_host(session, &remote_host); + assert_se(IN_SET(r, 0, -ENODATA)); + log_info("sd_session_get_remote_host(\"%s\") → \"%s\"", + session, strna(remote_host)); + + r = sd_session_get_seat(session, &seat); + if (r >= 0) { + assert_se(seat); + + log_info("sd_session_get_seat(\"%s\") → \"%s\"", session, seat); + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + r = sd_seat_can_multi_session(seat); +#pragma GCC diagnostic pop + assert_se(r == 1); + log_info("sd_session_can_multi_seat(\"%s\") → %s", seat, yes_no(r)); + + r = sd_seat_can_tty(seat); + assert_se(r >= 0); + log_info("sd_session_can_tty(\"%s\") → %s", seat, yes_no(r)); + + r = sd_seat_can_graphical(seat); + assert_se(r >= 0); + log_info("sd_session_can_graphical(\"%s\") → %s", seat, yes_no(r)); + } else { + log_info_errno(r, "sd_session_get_seat(\"%s\"): 
%m", session); + assert_se(r == -ENODATA); + } + + assert_se(sd_uid_get_state(u, &state2) == 0); + log_info("sd_uid_get_state("UID_FMT", …) → %s", u, state2); + } + } + + if (seat) { + _cleanup_free_ char *session2 = NULL, *buf = NULL; + _cleanup_free_ uid_t *uids = NULL; + unsigned n; + + assert_se(sd_uid_is_on_seat(u, 0, seat) > 0); + + r = sd_seat_get_active(seat, &session2, &u2); + assert_se(r == 0); + log_info("sd_seat_get_active(\"%s\", …) → \"%s\", "UID_FMT, seat, session2, u2); + + r = sd_uid_is_on_seat(u, 1, seat); + assert_se(IN_SET(r, 0, 1)); + assert_se(!!r == streq(session, session2)); + + r = sd_seat_get_sessions(seat, &sessions, &uids, &n); + assert_se(r >= 0); + assert_se(r == (int) strv_length(sessions)); + assert_se(t = strv_join(sessions, " ")); + strv_free(sessions); + log_info("sd_seat_get_sessions(\"%s\", …) → %s, \"%s\", [%u] {%s}", + seat, e(r), t, n, format_uids(&buf, uids, n)); + free(t); + + assert_se(sd_seat_get_sessions(seat, NULL, NULL, NULL) == r); + } + + r = sd_get_seats(&seats); + assert_se(r >= 0); + assert_se(r == (int) strv_length(seats)); + assert_se(t = strv_join(seats, ", ")); + strv_free(seats); + log_info("sd_get_seats(…) → [%i] \"%s\"", r, t); + t = mfree(t); + + assert_se(sd_get_seats(NULL) == r); + + r = sd_seat_get_active(NULL, &t, NULL); + assert_se(IN_SET(r, 0, -ENODATA, -ENXIO)); + log_info("sd_seat_get_active(NULL, …) (active session on current seat) → %s / \"%s\"", e(r), strnull(t)); + free(t); + + r = sd_get_sessions(&sessions); + assert_se(r >= 0); + assert_se(r == (int) strv_length(sessions)); + assert_se(t = strv_join(sessions, ", ")); + strv_free(sessions); + log_info("sd_get_sessions(…) → [%i] \"%s\"", r, t); + free(t); + + assert_se(sd_get_sessions(NULL) == r); + + { + _cleanup_free_ uid_t *uids = NULL; + _cleanup_free_ char *buf = NULL; + + r = sd_get_uids(&uids); + assert_se(r >= 0); + log_info("sd_get_uids(…) → [%i] {%s}", r, format_uids(&buf, uids, r)); + + assert_se(sd_get_uids(NULL) == r); + } + + { + 
_cleanup_strv_free_ char **machines = NULL; + _cleanup_free_ char *buf = NULL; + + r = sd_get_machine_names(&machines); + assert_se(r >= 0); + assert_se(r == (int) strv_length(machines)); + assert_se(buf = strv_join(machines, " ")); + log_info("sd_get_machines(…) → [%i] \"%s\"", r, buf); + + assert_se(sd_get_machine_names(NULL) == r); + } +} + +TEST(monitor) { + sd_login_monitor *m = NULL; + int r; + + if (!streq_ptr(saved_argv[1], "-m")) + return; + + assert_se(sd_login_monitor_new("session", &m) == 0); + + for (unsigned n = 0; n < 5; n++) { + struct pollfd pollfd = {}; + usec_t timeout, nw; + + assert_se((pollfd.fd = sd_login_monitor_get_fd(m)) >= 0); + assert_se((pollfd.events = sd_login_monitor_get_events(m)) >= 0); + + assert_se(sd_login_monitor_get_timeout(m, &timeout) >= 0); + + nw = now(CLOCK_MONOTONIC); + + r = poll(&pollfd, 1, + timeout == UINT64_MAX ? -1 : + timeout > nw ? (int) ((timeout - nw) / 1000) : + 0); + + assert_se(r >= 0); + + sd_login_monitor_flush(m); + printf("Wake!\n"); + } + + sd_login_monitor_unref(m); +} + +static int intro(void) { + log_info("/* Information printed is from the live system */"); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/libsystemd/sd-netlink/netlink-genl.c b/src/libsystemd/sd-netlink/netlink-genl.c new file mode 100644 index 0000000..1dc62e8 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-genl.c @@ -0,0 +1,488 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "netlink-genl.h" +#include "netlink-internal.h" +#include "netlink-types.h" + +typedef struct GenericNetlinkFamily { + sd_netlink *genl; + + const NLAPolicySet *policy_set; + + uint16_t id; /* a.k.a nlmsg_type */ + char *name; + uint32_t version; + uint32_t additional_header_size; + Hashmap *multicast_group_by_name; +} GenericNetlinkFamily; + +static const GenericNetlinkFamily nlctrl_static = { + .id = GENL_ID_CTRL, + .name = (char*) 
CTRL_GENL_NAME, + .version = 0x01, +}; + +static GenericNetlinkFamily *genl_family_free(GenericNetlinkFamily *f) { + if (!f) + return NULL; + + if (f->genl) { + if (f->id > 0) + hashmap_remove(f->genl->genl_family_by_id, UINT_TO_PTR(f->id)); + if (f->name) + hashmap_remove(f->genl->genl_family_by_name, f->name); + } + + free(f->name); + hashmap_free(f->multicast_group_by_name); + + return mfree(f); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(GenericNetlinkFamily*, genl_family_free); + +void genl_clear_family(sd_netlink *nl) { + assert(nl); + + nl->genl_family_by_name = hashmap_free_with_destructor(nl->genl_family_by_name, genl_family_free); + nl->genl_family_by_id = hashmap_free_with_destructor(nl->genl_family_by_id, genl_family_free); +} + +static int genl_family_new_unsupported( + sd_netlink *nl, + const char *family_name, + const NLAPolicySet *policy_set) { + + _cleanup_(genl_family_freep) GenericNetlinkFamily *f = NULL; + int r; + + assert(nl); + assert(family_name); + assert(policy_set); + + /* Kernel does not support the genl family? To prevent from resolving the family name again, + * let's store the family with zero id to indicate that. 
*/ + + f = new(GenericNetlinkFamily, 1); + if (!f) + return -ENOMEM; + + *f = (GenericNetlinkFamily) { + .policy_set = policy_set, + }; + + f->name = strdup(family_name); + if (!f->name) + return -ENOMEM; + + r = hashmap_ensure_put(&nl->genl_family_by_name, &string_hash_ops, f->name, f); + if (r < 0) + return r; + + f->genl = nl; + TAKE_PTR(f); + return 0; +} + +static int genl_family_new( + sd_netlink *nl, + const char *expected_family_name, + const NLAPolicySet *policy_set, + sd_netlink_message *message, + const GenericNetlinkFamily **ret) { + + _cleanup_(genl_family_freep) GenericNetlinkFamily *f = NULL; + const char *family_name; + uint8_t cmd; + int r; + + assert(nl); + assert(expected_family_name); + assert(policy_set); + assert(message); + assert(ret); + + f = new(GenericNetlinkFamily, 1); + if (!f) + return -ENOMEM; + + *f = (GenericNetlinkFamily) { + .policy_set = policy_set, + }; + + r = sd_genl_message_get_family_name(nl, message, &family_name); + if (r < 0) + return r; + + if (!streq(family_name, CTRL_GENL_NAME)) + return -EINVAL; + + r = sd_genl_message_get_command(nl, message, &cmd); + if (r < 0) + return r; + + if (cmd != CTRL_CMD_NEWFAMILY) + return -EINVAL; + + r = sd_netlink_message_read_u16(message, CTRL_ATTR_FAMILY_ID, &f->id); + if (r < 0) + return r; + + r = sd_netlink_message_read_string_strdup(message, CTRL_ATTR_FAMILY_NAME, &f->name); + if (r < 0) + return r; + + if (!streq(f->name, expected_family_name)) + return -EINVAL; + + r = sd_netlink_message_read_u32(message, CTRL_ATTR_VERSION, &f->version); + if (r < 0) + return r; + + r = sd_netlink_message_read_u32(message, CTRL_ATTR_HDRSIZE, &f->additional_header_size); + if (r < 0) + return r; + + r = sd_netlink_message_enter_container(message, CTRL_ATTR_MCAST_GROUPS); + if (r >= 0) { + for (uint16_t i = 0; i < UINT16_MAX; i++) { + _cleanup_free_ char *group_name = NULL; + uint32_t group_id; + + r = sd_netlink_message_enter_array(message, i + 1); + if (r == -ENODATA) + break; + if (r < 0) + 
return r; + + r = sd_netlink_message_read_u32(message, CTRL_ATTR_MCAST_GRP_ID, &group_id); + if (r < 0) + return r; + + r = sd_netlink_message_read_string_strdup(message, CTRL_ATTR_MCAST_GRP_NAME, &group_name); + if (r < 0) + return r; + + r = sd_netlink_message_exit_container(message); + if (r < 0) + return r; + /* Ids are stored as pointer values in the map, so a zero id could not be told apart from a missing entry on lookup. */ + if (group_id == 0) { + log_debug("sd-netlink: received multicast group '%s' for generic netlink family '%s' with id == 0, ignoring", + group_name, f->name); + continue; + } + + r = hashmap_ensure_put(&f->multicast_group_by_name, &string_hash_ops_free, group_name, UINT32_TO_PTR(group_id)); + if (r < 0) + return r; + /* The string is now referenced as a key of the map; keep the cleanup handler from freeing it. */ + TAKE_PTR(group_name); + } + + r = sd_netlink_message_exit_container(message); + if (r < 0) + return r; + } + + r = hashmap_ensure_put(&nl->genl_family_by_id, NULL, UINT_TO_PTR(f->id), f); + if (r < 0) + return r; + /* If the second insertion fails, the first one is undone by hand: f->genl is not set yet, so the cleanup handler would not remove the entry from the by-id map. */ + r = hashmap_ensure_put(&nl->genl_family_by_name, &string_hash_ops, f->name, f); + if (r < 0) { + hashmap_remove(nl->genl_family_by_id, UINT_TO_PTR(f->id)); + return r; + } + + f->genl = nl; + *ret = TAKE_PTR(f); + return 0; +} + /* Returns the policy set stored in the family or, if none is stored, the result of looking one up by the family name. */ +static const NLAPolicySet *genl_family_get_policy_set(const GenericNetlinkFamily *family) { + assert(family); + + if (family->policy_set) + return family->policy_set; + + return genl_get_policy_set_by_name(family->name); +} + /* Allocates a message for the given family: the nlmsg type is family->id and the header size is sizeof(struct genlmsghdr) plus the family's additional header size. The genlmsghdr is then filled in with cmd and the family version. Returns -EOPNOTSUPP if no policy set is available for the family. */ +static int genl_message_new( + sd_netlink *nl, + const GenericNetlinkFamily *family, + uint8_t cmd, + sd_netlink_message **ret) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + const NLAPolicySet *policy_set; + int r; + + assert(nl); + assert(nl->protocol == NETLINK_GENERIC); + assert(family); + assert(ret); + + policy_set = genl_family_get_policy_set(family); + if (!policy_set) + return -EOPNOTSUPP; + + r = message_new_full(nl, family->id, policy_set, + sizeof(struct genlmsghdr) + family->additional_header_size, &m); + if (r < 0) + return r; + + *(struct genlmsghdr *) NLMSG_DATA(m->hdr) = (struct genlmsghdr) { + .cmd = cmd, + .version = family->version, 
+ }; + + *ret = TAKE_PTR(m); + return 0; +} + /* Resolves the family called name by sending CTRL_CMD_GETFAMILY through the controller family ctrl and parsing the reply with genl_family_new(). Returns -EOPNOTSUPP when no policy set is known for the name, or when the call fails; in the latter case the name is additionally recorded as unsupported. */ +static int genl_family_get_by_name_internal( + sd_netlink *nl, + const GenericNetlinkFamily *ctrl, + const char *name, + const GenericNetlinkFamily **ret) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + const NLAPolicySet *policy_set; + int r; + + assert(nl); + assert(nl->protocol == NETLINK_GENERIC); + assert(ctrl); + assert(name); + assert(ret); + + policy_set = genl_get_policy_set_by_name(name); + if (!policy_set) + return -EOPNOTSUPP; + + r = genl_message_new(nl, ctrl, CTRL_CMD_GETFAMILY, &req); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(req, CTRL_ATTR_FAMILY_NAME, name); + if (r < 0) + return r; + /* NOTE(review): every failure of the call is reported as -EOPNOTSUPP and cached as "unsupported", whatever the actual error was - confirm this is intended. The result of the caching attempt is deliberately ignored. */ + if (sd_netlink_call(nl, req, 0, &reply) < 0) { + (void) genl_family_new_unsupported(nl, name, policy_set); + return -EOPNOTSUPP; + } + + return genl_family_new(nl, name, policy_set, reply, ret); +} + /* Looks up a family by name. The cache is consulted first; a cached entry with id 0 marks a family previously found to be unsupported. Otherwise the kernel is queried. For names other than CTRL_GENL_NAME the controller family itself is resolved first if it is not cached yet, bootstrapping from the static nlctrl_static entry. */ +static int genl_family_get_by_name(sd_netlink *nl, const char *name, const GenericNetlinkFamily **ret) { + const GenericNetlinkFamily *f, *ctrl; + int r; + + assert(nl); + assert(nl->protocol == NETLINK_GENERIC); + assert(name); + assert(ret); + + f = hashmap_get(nl->genl_family_by_name, name); + if (f) { + if (f->id == 0) /* kernel does not support the family. 
*/ + return -EOPNOTSUPP; + + *ret = f; + return 0; + } + + if (streq(name, CTRL_GENL_NAME)) + return genl_family_get_by_name_internal(nl, &nlctrl_static, CTRL_GENL_NAME, ret); + + ctrl = hashmap_get(nl->genl_family_by_name, CTRL_GENL_NAME); + if (!ctrl) { + r = genl_family_get_by_name_internal(nl, &nlctrl_static, CTRL_GENL_NAME, &ctrl); + if (r < 0) + return r; + } + + return genl_family_get_by_name_internal(nl, ctrl, name, ret); +} + /* Cache-only lookup by family id; nothing is sent to the kernel. Falls back to the static controller entry for GENL_ID_CTRL and returns -ENOENT for any other id that is not cached. */ +static int genl_family_get_by_id(sd_netlink *nl, uint16_t id, const GenericNetlinkFamily **ret) { + const GenericNetlinkFamily *f; + + assert(nl); + assert(nl->protocol == NETLINK_GENERIC); + assert(ret); + + f = hashmap_get(nl->genl_family_by_id, UINT_TO_PTR(id)); + if (f) { + *ret = f; + return 0; + } + + if (id == GENL_ID_CTRL) { + *ret = &nlctrl_static; + return 0; + } + + return -ENOENT; +} + /* For the family with the given id (as found by genl_family_get_by_id()), optionally returns its policy set (-EOPNOTSUPP if none is available) and its total header size, i.e. sizeof(struct genlmsghdr) plus the family's additional header size. Either output pointer may be NULL. */ +int genl_get_policy_set_and_header_size( + sd_netlink *nl, + uint16_t id, + const NLAPolicySet **ret_policy_set, + size_t *ret_header_size) { + + const GenericNetlinkFamily *f; + int r; + + assert(nl); + assert(nl->protocol == NETLINK_GENERIC); + + r = genl_family_get_by_id(nl, id, &f); + if (r < 0) + return r; + + if (ret_policy_set) { + const NLAPolicySet *p; + + p = genl_family_get_policy_set(f); + if (!p) + return -EOPNOTSUPP; + + *ret_policy_set = p; + } + if (ret_header_size) + *ret_header_size = sizeof(struct genlmsghdr) + f->additional_header_size; + return 0; +} + /* Creates a new message for the named family and the given command, resolving the family by name first. Returns -EINVAL on invalid arguments or if nl is not a NETLINK_GENERIC socket. */ +int sd_genl_message_new(sd_netlink *nl, const char *family_name, uint8_t cmd, sd_netlink_message **ret) { + const GenericNetlinkFamily *family; + int r; + + assert_return(nl, -EINVAL); + assert_return(nl->protocol == NETLINK_GENERIC, -EINVAL); + assert_return(family_name, -EINVAL); + assert_return(ret, -EINVAL); + + r = genl_family_get_by_name(nl, family_name, &family); + if (r < 0) + return r; + + return genl_message_new(nl, family, cmd, ret); +} + /* Maps the nlmsg type of m to a known family and returns that family's name in *ret; the string is not copied. */ +int sd_genl_message_get_family_name(sd_netlink *nl, sd_netlink_message *m, const char **ret) { + const 
GenericNetlinkFamily *family; + uint16_t nlmsg_type; + int r; + + assert_return(nl, -EINVAL); + assert_return(nl->protocol == NETLINK_GENERIC, -EINVAL); + assert_return(m, -EINVAL); + assert_return(ret, -EINVAL); + + r = sd_netlink_message_get_type(m, &nlmsg_type); + if (r < 0) + return r; + + r = genl_family_get_by_id(nl, nlmsg_type, &family); + if (r < 0) + return r; + + *ret = family->name; + return 0; +} + /* Returns the generic netlink command of m in *ret. The message must be at least as long as the full header of its family, otherwise -EBADMSG is returned. */ +int sd_genl_message_get_command(sd_netlink *nl, sd_netlink_message *m, uint8_t *ret) { + struct genlmsghdr *h; + uint16_t nlmsg_type; + size_t size; + int r; + + assert_return(nl, -EINVAL); + assert_return(nl->protocol == NETLINK_GENERIC, -EINVAL); + assert_return(m, -EINVAL); + assert_return(m->protocol == NETLINK_GENERIC, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(ret, -EINVAL); + + r = sd_netlink_message_get_type(m, &nlmsg_type); + if (r < 0) + return r; + + r = genl_get_policy_set_and_header_size(nl, nlmsg_type, NULL, &size); + if (r < 0) + return r; + /* Make sure the header is fully contained in the message before reading from it. */ + if (m->hdr->nlmsg_len < NLMSG_LENGTH(size)) + return -EBADMSG; + + h = NLMSG_DATA(m->hdr); + + *ret = h->cmd; + return 0; +} + /* Looks up the id of a multicast group in the family's name-to-id map; returns -ENOENT if the name is absent. ret may be NULL to merely test for existence. */ +static int genl_family_get_multicast_group_id_by_name(const GenericNetlinkFamily *f, const char *name, uint32_t *ret) { + void *p; + + assert(f); + assert(name); + + p = hashmap_get(f->multicast_group_by_name, name); + if (!p) + return -ENOENT; + + if (ret) + *ret = PTR_TO_UINT32(p); + return 0; +} + /* Registers callback for messages of one multicast group of a family. The family is resolved by name and the group by its name within that family; the match is then installed with the family id as message type. */ +int sd_genl_add_match( + sd_netlink *nl, + sd_netlink_slot **ret_slot, + const char *family_name, + const char *multicast_group_name, + uint8_t command, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, + const char *description) { + + const GenericNetlinkFamily *f; + uint32_t multicast_group_id; + int r; + + assert_return(nl, -EINVAL); + assert_return(nl->protocol == NETLINK_GENERIC, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(family_name, -EINVAL); + assert_return(multicast_group_name, -EINVAL); 
+ + /* If command == 0, then all commands belonging to the multicast group trigger the callback. */ + + r = genl_family_get_by_name(nl, family_name, &f); + if (r < 0) + return r; + + r = genl_family_get_multicast_group_id_by_name(f, multicast_group_name, &multicast_group_id); + if (r < 0) + return r; + + return netlink_add_match_internal(nl, ret_slot, &multicast_group_id, 1, f->id, command, + callback, destroy_callback, userdata, description); +} + +int sd_genl_socket_open(sd_netlink **ret) { + return netlink_open_family(ret, NETLINK_GENERIC); +} diff --git a/src/libsystemd/sd-netlink/netlink-genl.h b/src/libsystemd/sd-netlink/netlink-genl.h new file mode 100644 index 0000000..b06be05 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-genl.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-netlink.h" + +#define CTRL_GENL_NAME "nlctrl" + +void genl_clear_family(sd_netlink *nl); diff --git a/src/libsystemd/sd-netlink/netlink-internal.h b/src/libsystemd/sd-netlink/netlink-internal.h new file mode 100644 index 0000000..891d3e8 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-internal.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-netlink.h" + +#include "list.h" +#include "netlink-types.h" +#include "ordered-set.h" +#include "prioq.h" +#include "time-util.h" + +#define NETLINK_DEFAULT_TIMEOUT_USEC ((usec_t) (25 * USEC_PER_SEC)) + +#define NETLINK_RQUEUE_MAX 64*1024 + +#define NETLINK_CONTAINER_DEPTH 32 + +struct reply_callback { + sd_netlink_message_handler_t callback; + usec_t timeout; + uint32_t serial; + unsigned prioq_idx; +}; + +struct match_callback { + sd_netlink_message_handler_t callback; + uint32_t *groups; + size_t n_groups; + uint16_t type; + uint8_t cmd; /* used by genl */ + + LIST_FIELDS(struct match_callback, match_callbacks); +}; + +typedef enum NetlinkSlotType { + NETLINK_REPLY_CALLBACK, + NETLINK_MATCH_CALLBACK, + 
_NETLINK_SLOT_INVALID = -EINVAL, +} NetlinkSlotType; + /* A registered callback slot. NOTE(review): type presumably selects which member of the anonymous union below is in use - confirm against the slot implementation. */ +struct sd_netlink_slot { + unsigned n_ref; + NetlinkSlotType type:8; + bool floating; + sd_netlink *netlink; + void *userdata; + sd_netlink_destroy_t destroy_callback; + + char *description; + + LIST_FIELDS(sd_netlink_slot, slots); + + union { + struct reply_callback reply_callback; + struct match_callback match_callback; + }; +}; + /* A netlink connection object: socket, receive queues, registered callbacks, event loop integration and the generic netlink family cache. */ +struct sd_netlink { + unsigned n_ref; + + int fd; + + union { + struct sockaddr sa; + struct sockaddr_nl nl; + } sockaddr; + + int protocol; + + Hashmap *broadcast_group_refs; + bool broadcast_group_dont_leave:1; /* until we can rely on 4.2 */ + + OrderedSet *rqueue; + Hashmap *rqueue_by_serial; + Hashmap *rqueue_partial_by_serial; + + struct nlmsghdr *rbuffer; + + bool processing:1; + + uint32_t serial; + + struct Prioq *reply_callbacks_prioq; + Hashmap *reply_callbacks; + + LIST_HEAD(struct match_callback, match_callbacks); + + LIST_HEAD(sd_netlink_slot, slots); + + pid_t original_pid; + + sd_event_source *io_event_source; + sd_event_source *time_event_source; + sd_event_source *exit_event_source; + sd_event *event; + /* Generic netlink family cache, filled by netlink-genl.c and released by genl_clear_family(). */ + Hashmap *genl_family_by_name; + Hashmap *genl_family_by_id; +}; + +struct netlink_attribute { + size_t offset; /* offset from hdr to attribute */ + bool nested:1; + bool net_byteorder:1; +}; + +struct netlink_container { + const struct NLAPolicySet *policy_set; /* the policy set of the container */ + size_t offset; /* offset from hdr to the start of the container */ + struct netlink_attribute *attributes; + uint16_t max_attribute; /* the maximum attribute in container */ +}; + /* A netlink message: the raw header plus the bookkeeping for up to NETLINK_CONTAINER_DEPTH containers. */ +struct sd_netlink_message { + unsigned n_ref; + + int protocol; + + struct nlmsghdr *hdr; + struct netlink_container containers[NETLINK_CONTAINER_DEPTH]; + unsigned n_containers; /* number of containers */ + uint32_t multicast_group; + bool sealed:1; + + sd_netlink_message *next; /* next in a chain of multi-part messages */ +}; + +int message_new_empty(sd_netlink *nl, sd_netlink_message 
**ret); +int message_new_full( + sd_netlink *nl, + uint16_t nlmsg_type, + const NLAPolicySet *policy_set, + size_t header_size, + sd_netlink_message **ret); +int message_new(sd_netlink *nl, sd_netlink_message **ret, uint16_t type); +int message_new_synthetic_error(sd_netlink *nl, int error, uint32_t serial, sd_netlink_message **ret); + +static inline uint32_t message_get_serial(sd_netlink_message *m) { + assert(m); + return ASSERT_PTR(m->hdr)->nlmsg_seq; +} + +void message_seal(sd_netlink_message *m); + +int netlink_open_family(sd_netlink **ret, int family); +bool netlink_pid_changed(sd_netlink *nl); + +int socket_bind(sd_netlink *nl); +int socket_broadcast_group_ref(sd_netlink *nl, unsigned group); +int socket_broadcast_group_unref(sd_netlink *nl, unsigned group); +int socket_write_message(sd_netlink *nl, sd_netlink_message *m); +int socket_read_message(sd_netlink *nl); + +int netlink_add_match_internal( + sd_netlink *nl, + sd_netlink_slot **ret_slot, + const uint32_t *groups, + size_t n_groups, + uint16_t type, + uint8_t cmd, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, + const char *description); + +/* Make sure callbacks don't destroy the netlink connection */ +#define NETLINK_DONT_DESTROY(nl) \ + _cleanup_(sd_netlink_unrefp) _unused_ sd_netlink *_dont_destroy_##nl = sd_netlink_ref(nl) + +bool nfproto_is_valid(int nfproto); + +/* nfnl */ +/* TODO: to be exported later */ +int sd_nfnl_socket_open(sd_netlink **ret); +int sd_nfnl_send_batch( + sd_netlink *nfnl, + sd_netlink_message **messages, + size_t msgcount, + uint32_t **ret_serials); +int sd_nfnl_call_batch( + sd_netlink *nfnl, + sd_netlink_message **messages, + size_t n_messages, + uint64_t usec, + sd_netlink_message ***ret_messages); +int sd_nfnl_message_new( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + uint16_t subsys, + uint16_t msg_type, + uint16_t flags); +int sd_nfnl_nft_message_new_table(sd_netlink *nfnl, sd_netlink_message 
**ret, + int nfproto, const char *table); +int sd_nfnl_nft_message_new_basechain(sd_netlink *nfnl, sd_netlink_message **ret, + int nfproto, const char *table, const char *chain, + const char *type, uint8_t hook, int prio); +int sd_nfnl_nft_message_new_rule(sd_netlink *nfnl, sd_netlink_message **ret, + int nfproto, const char *table, const char *chain); /* The id parameter is named set_id, matching the definition in netlink-message-nfnl.c. */ +int sd_nfnl_nft_message_new_set(sd_netlink *nfnl, sd_netlink_message **ret, + int nfproto, const char *table, const char *set_name, + uint32_t set_id, uint32_t klen); +int sd_nfnl_nft_message_new_setelems(sd_netlink *nfnl, sd_netlink_message **ret, + int add, int nfproto, const char *table, const char *set_name); +int sd_nfnl_nft_message_append_setelem(sd_netlink_message *m, + uint32_t index, + const void *key, size_t key_len, + const void *data, size_t data_len, + uint32_t flags); diff --git a/src/libsystemd/sd-netlink/netlink-message-nfnl.c b/src/libsystemd/sd-netlink/netlink-message-nfnl.c new file mode 100644 index 0000000..fd3055d --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-message-nfnl.c @@ -0,0 +1,417 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-netlink.h" + +#include "iovec-util.h" +#include "netlink-internal.h" +#include "netlink-types.h" +#include "netlink-util.h" + /* Returns true if nfproto is one of the NFPROTO_* values listed below. */ +bool nfproto_is_valid(int nfproto) { + return IN_SET(nfproto, + NFPROTO_UNSPEC, + NFPROTO_INET, + NFPROTO_IPV4, + NFPROTO_ARP, + NFPROTO_NETDEV, + NFPROTO_BRIDGE, + NFPROTO_IPV6); +} + /* Allocates a netfilter netlink message. The nlmsg type is built as subsys << 8 | msg_type (msg_type must be unchanged by NFNL_MSG_TYPE()), flags are ORed into the header flags, and the nfgenmsg is initialized with the protocol family and NFNETLINK_V0. Returns -EINVAL on invalid arguments. */ +int sd_nfnl_message_new(sd_netlink *nfnl, sd_netlink_message **ret, int nfproto, uint16_t subsys, uint16_t msg_type, uint16_t flags) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert_return(nfnl, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(nfproto_is_valid(nfproto), -EINVAL); + assert_return(NFNL_MSG_TYPE(msg_type) == msg_type, -EINVAL); + + r = message_new(nfnl, &m, subsys << 8 | msg_type); + if (r < 0) + return r; + + 
m->hdr->nlmsg_flags |= flags; + + *(struct nfgenmsg*) NLMSG_DATA(m->hdr) = (struct nfgenmsg) { + .nfgen_family = nfproto, + .version = NFNETLINK_V0, + }; + + *ret = TAKE_PTR(m); + return 0; +} + /* Stores res_id, converted to big endian, in the nfgenmsg header of m. Always returns 0. */ +static int nfnl_message_set_res_id(sd_netlink_message *m, uint16_t res_id) { + struct nfgenmsg *nfgen; + + assert(m); + assert(m->hdr); + + nfgen = NLMSG_DATA(m->hdr); + nfgen->res_id = htobe16(res_id); + + return 0; +} + /* Extracts the subsystem id from the nlmsg type of m with NFNL_SUBSYS_ID(). */ +static int nfnl_message_get_subsys(sd_netlink_message *m, uint16_t *ret) { + uint16_t t; + int r; + + assert(m); + assert(ret); + + r = sd_netlink_message_get_type(m, &t); + if (r < 0) + return r; + + *ret = NFNL_SUBSYS_ID(t); + return 0; +} + /* Creates a batch control message of the given msg_type (NFPROTO_UNSPEC, NFNL_SUBSYS_NONE, no flags) whose res_id is set to subsys. */ +static int nfnl_message_new_batch(sd_netlink *nfnl, sd_netlink_message **ret, uint16_t subsys, uint16_t msg_type) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert_return(nfnl, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(NFNL_MSG_TYPE(msg_type) == msg_type, -EINVAL); + + r = sd_nfnl_message_new(nfnl, &m, NFPROTO_UNSPEC, NFNL_SUBSYS_NONE, msg_type, 0); + if (r < 0) + return r; + + r = nfnl_message_set_res_id(m, subsys); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + /* Sends all messages with a single writev(), framed by an NFNL_MSG_BATCH_BEGIN and an NFNL_MSG_BATCH_END message. Every message must belong to the same subsystem as messages[0] (-EINVAL otherwise). Each message is sealed and its res_id is overwritten. If ret_serials is non-NULL, it receives a newly allocated array holding the serial of each message. */ +int sd_nfnl_send_batch( + sd_netlink *nfnl, + sd_netlink_message **messages, + size_t n_messages, + uint32_t **ret_serials) { + + /* iovs refs batch_begin and batch_end, hence, free iovs first, then free batch_begin and batch_end. 
*/ + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *batch_begin = NULL, *batch_end = NULL; + _cleanup_free_ struct iovec *iovs = NULL; + _cleanup_free_ uint32_t *serials = NULL; + uint16_t subsys; + ssize_t k; + size_t c = 0; + int r; + + assert_return(nfnl, -EINVAL); + assert_return(!netlink_pid_changed(nfnl), -ECHILD); + assert_return(messages, -EINVAL); + assert_return(n_messages > 0, -EINVAL); + /* Two extra entries: one for the batch begin and one for the batch end message. */ + iovs = new(struct iovec, n_messages + 2); + if (!iovs) + return -ENOMEM; + /* The serials array is only allocated when the caller asked for it. */ + if (ret_serials) { + serials = new(uint32_t, n_messages); + if (!serials) + return -ENOMEM; + } + /* The subsystem of the first message defines the subsystem of the whole batch. */ + r = nfnl_message_get_subsys(messages[0], &subsys); + if (r < 0) + return r; + + r = nfnl_message_new_batch(nfnl, &batch_begin, subsys, NFNL_MSG_BATCH_BEGIN); + if (r < 0) + return r; + + netlink_seal_message(nfnl, batch_begin); + iovs[c++] = IOVEC_MAKE(batch_begin->hdr, batch_begin->hdr->nlmsg_len); + + for (size_t i = 0; i < n_messages; i++) { + uint16_t s; + + r = nfnl_message_get_subsys(messages[i], &s); + if (r < 0) + return r; + + if (s != subsys) + return -EINVAL; + + netlink_seal_message(nfnl, messages[i]); + if (serials) + serials[i] = message_get_serial(messages[i]); + + /* It seems that the kernel accepts an arbitrary number. Let's set the lower 16 bits of the + * serial of the first message. 
*/ + nfnl_message_set_res_id(messages[i], (uint16_t) (message_get_serial(batch_begin) & UINT16_MAX)); + + iovs[c++] = IOVEC_MAKE(messages[i]->hdr, messages[i]->hdr->nlmsg_len); + } + + r = nfnl_message_new_batch(nfnl, &batch_end, subsys, NFNL_MSG_BATCH_END); + if (r < 0) + return r; + + netlink_seal_message(nfnl, batch_end); + iovs[c++] = IOVEC_MAKE(batch_end->hdr, batch_end->hdr->nlmsg_len); + /* NOTE(review): only a negative writev() result is checked; a short write would go unnoticed here. */ + assert(c == n_messages + 2); + k = writev(nfnl->fd, iovs, n_messages + 2); + if (k < 0) + return -errno; + + if (ret_serials) + *ret_serials = TAKE_PTR(serials); + + return 0; +} + /* Sends the messages as one batch and then reads the reply for the serial of each message, passing usec to every read. All reads are attempted even if an earlier one failed; the errors are accumulated with RET_GATHER() and returned. If ret_messages is non-NULL, it receives on success a newly allocated array with one entry per message. */ +int sd_nfnl_call_batch( + sd_netlink *nfnl, + sd_netlink_message **messages, + size_t n_messages, + uint64_t usec, + sd_netlink_message ***ret_messages) { + + _cleanup_free_ sd_netlink_message **replies = NULL; + _cleanup_free_ uint32_t *serials = NULL; + int r; + + assert_return(nfnl, -EINVAL); + assert_return(!netlink_pid_changed(nfnl), -ECHILD); + assert_return(messages, -EINVAL); + assert_return(n_messages > 0, -EINVAL); + /* Zero-initialized, so entries that are never filled in stay NULL. */ + if (ret_messages) { + replies = new0(sd_netlink_message*, n_messages); + if (!replies) + return -ENOMEM; + } + + r = sd_nfnl_send_batch(nfnl, messages, n_messages, &serials); + if (r < 0) + return r; + + for (size_t i = 0; i < n_messages; i++) + RET_GATHER(r, + sd_netlink_read(nfnl, serials[i], usec, ret_messages ? 
replies + i : NULL)); + if (r < 0) { + /* _cleanup_free_ releases only the array itself. Drop the replies that were read successfully so that they are not leaked; entries never filled in are still NULL from new0(). */ + if (replies) + for (size_t i = 0; i < n_messages; i++) + sd_netlink_message_unref(replies[i]); + return r; + } + + if (ret_messages) + *ret_messages = TAKE_PTR(replies); + + return 0; +} + /* Builds an NFT_MSG_NEWCHAIN message (NLM_F_CREATE) for a base chain: table, chain name and chain type are appended as strings, followed by an NFTA_CHAIN_HOOK container holding the hook number and the priority, both converted to big endian. */ +int sd_nfnl_nft_message_new_basechain( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + const char *table, + const char *chain, + const char *type, + uint8_t hook, + int prio) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + r = sd_nfnl_message_new(nfnl, &m, nfproto, NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWCHAIN, NLM_F_CREATE); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_CHAIN_TABLE, table); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_CHAIN_NAME, chain); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_CHAIN_TYPE, type); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_CHAIN_HOOK); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_HOOK_HOOKNUM, htobe32(hook)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_HOOK_PRIORITY, htobe32(prio)); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + /* Builds an NFT_MSG_NEWTABLE message (NLM_F_CREATE | NLM_F_EXCL) carrying the table name. */ +int sd_nfnl_nft_message_new_table( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + const char *table) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + r = sd_nfnl_message_new(nfnl, &m, nfproto, NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWTABLE, NLM_F_CREATE | NLM_F_EXCL); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_TABLE_NAME, table); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return r; +} + /* Builds an NFT_MSG_NEWRULE message (NLM_F_CREATE) addressed to the given table and chain. */ +int sd_nfnl_nft_message_new_rule( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + const char *table, + const char *chain) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + r = sd_nfnl_message_new(nfnl, &m, nfproto, NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWRULE, 
NLM_F_CREATE); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_RULE_TABLE, table); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_RULE_CHAIN, chain); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return r; +} + /* Builds an NFT_MSG_NEWSET message (NLM_F_CREATE) carrying the table name, the set name, a set id and the key length (big endian). */ +int sd_nfnl_nft_message_new_set( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + const char *table, + const char *set_name, + uint32_t set_id, + uint32_t klen) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + r = sd_nfnl_message_new(nfnl, &m, nfproto, NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSET, NLM_F_CREATE); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_SET_TABLE, table); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_SET_NAME, set_name); + if (r < 0) + return r; + /* NOTE(review): the value appended is set_id + 1 and, unlike the key length below, it is not converted to big endian - confirm that callers rely on this. */ + r = sd_netlink_message_append_u32(m, NFTA_SET_ID, ++set_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_LEN, htobe32(klen)); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return r; +} + /* Builds a set element list message for the given table and set: NFT_MSG_NEWSETELEM with NLM_F_CREATE when add is non-zero, NFT_MSG_DELSETELEM without flags otherwise. */ +int sd_nfnl_nft_message_new_setelems( + sd_netlink *nfnl, + sd_netlink_message **ret, + int add, /* boolean */ + int nfproto, + const char *table, + const char *set_name) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + if (add) + r = sd_nfnl_message_new(nfnl, &m, nfproto, NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM, NLM_F_CREATE); + else + r = sd_nfnl_message_new(nfnl, &m, nfproto, NFNL_SUBSYS_NFTABLES, NFT_MSG_DELSETELEM, 0); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_SET_ELEM_LIST_TABLE, table); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_SET_ELEM_LIST_SET, set_name); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return r; +} + /* Appends one set element as array entry index: the key, the data if data is non-NULL, and the flags (big endian) if they are non-zero. If appending fails after the array entry was opened, the entry is cancelled and the error returned. */ +int sd_nfnl_nft_message_append_setelem( + sd_netlink_message *m, + uint32_t index, + const void *key, + size_t key_len, + const void *data, + size_t data_len, + uint32_t flags) { + + 
int r; + + r = sd_netlink_message_open_array(m, index); + if (r < 0) + return r; + + r = sd_netlink_message_append_container_data(m, NFTA_SET_ELEM_KEY, NFTA_DATA_VALUE, key, key_len); + if (r < 0) + goto cancel; + + if (data) { + r = sd_netlink_message_append_container_data(m, NFTA_SET_ELEM_DATA, NFTA_DATA_VALUE, data, data_len); + if (r < 0) + goto cancel; + } + + if (flags != 0) { + r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(flags)); + if (r < 0) + goto cancel; + } + + return sd_netlink_message_close_container(m); /* array */ + +cancel: + (void) sd_netlink_message_cancel_array(m); + return r; +} + +int sd_nfnl_socket_open(sd_netlink **ret) { + return netlink_open_family(ret, NETLINK_NETFILTER); +} diff --git a/src/libsystemd/sd-netlink/netlink-message-rtnl.c b/src/libsystemd/sd-netlink/netlink-message-rtnl.c new file mode 100644 index 0000000..008e802 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-message-rtnl.c @@ -0,0 +1,1204 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-netlink.h" + +#include "format-util.h" +#include "netlink-internal.h" +#include "netlink-types.h" +#include "netlink-util.h" +#include "socket-util.h" + +static bool rtnl_message_type_is_neigh(uint16_t type) { + return IN_SET(type, RTM_NEWNEIGH, RTM_GETNEIGH, RTM_DELNEIGH); +} + +static bool rtnl_message_type_is_route(uint16_t type) { + return IN_SET(type, RTM_NEWROUTE, RTM_GETROUTE, RTM_DELROUTE); +} + +static bool rtnl_message_type_is_nexthop(uint16_t type) { + return IN_SET(type, RTM_NEWNEXTHOP, RTM_GETNEXTHOP, RTM_DELNEXTHOP); +} + +static bool rtnl_message_type_is_link(uint16_t type) { + return IN_SET(type, + RTM_NEWLINK, RTM_SETLINK, RTM_GETLINK, RTM_DELLINK, + RTM_NEWLINKPROP, RTM_DELLINKPROP, RTM_GETLINKPROP); +} + +static bool rtnl_message_type_is_addr(uint16_t type) { + return IN_SET(type, RTM_NEWADDR, RTM_GETADDR, RTM_DELADDR); +} + +static bool 
rtnl_message_type_is_addrlabel(uint16_t type) { + return IN_SET(type, RTM_NEWADDRLABEL, RTM_DELADDRLABEL, RTM_GETADDRLABEL); +} + +static bool rtnl_message_type_is_routing_policy_rule(uint16_t type) { + return IN_SET(type, RTM_NEWRULE, RTM_DELRULE, RTM_GETRULE); +} + +static bool rtnl_message_type_is_traffic_control(uint16_t type) { + return IN_SET(type, + RTM_NEWQDISC, RTM_DELQDISC, RTM_GETQDISC, + RTM_NEWTCLASS, RTM_DELTCLASS, RTM_GETTCLASS); +} + +static bool rtnl_message_type_is_mdb(uint16_t type) { + return IN_SET(type, RTM_NEWMDB, RTM_DELMDB, RTM_GETMDB); +} + +int sd_rtnl_message_route_set_dst_prefixlen(sd_netlink_message *m, unsigned char prefixlen) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + if ((rtm->rtm_family == AF_INET && prefixlen > 32) || + (rtm->rtm_family == AF_INET6 && prefixlen > 128)) + return -ERANGE; + + rtm->rtm_dst_len = prefixlen; + + return 0; +} + +int sd_rtnl_message_route_set_src_prefixlen(sd_netlink_message *m, unsigned char prefixlen) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + if ((rtm->rtm_family == AF_INET && prefixlen > 32) || + (rtm->rtm_family == AF_INET6 && prefixlen > 128)) + return -ERANGE; + + rtm->rtm_src_len = prefixlen; + + return 0; +} + +int sd_rtnl_message_route_set_scope(sd_netlink_message *m, unsigned char scope) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + rtm->rtm_scope = scope; + + return 0; +} + +int sd_rtnl_message_route_set_flags(sd_netlink_message *m, unsigned flags) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + 
assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + rtm->rtm_flags = flags; + + return 0; +} + +int sd_rtnl_message_route_get_flags(sd_netlink_message *m, unsigned *flags) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(flags, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *flags = rtm->rtm_flags; + + return 0; +} + +int sd_rtnl_message_route_set_table(sd_netlink_message *m, unsigned char table) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + rtm->rtm_table = table; + + return 0; +} + +int sd_rtnl_message_route_get_family(sd_netlink_message *m, int *family) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(family, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *family = rtm->rtm_family; + + return 0; +} + +int sd_rtnl_message_route_get_type(sd_netlink_message *m, unsigned char *type) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(type, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *type = rtm->rtm_type; + + return 0; +} + +int sd_rtnl_message_route_set_type(sd_netlink_message *m, unsigned char type) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + rtm->rtm_type = type; + + return 0; +} + +int sd_rtnl_message_route_get_protocol(sd_netlink_message *m, unsigned char *protocol) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + 
assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(protocol, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *protocol = rtm->rtm_protocol; + + return 0; +} + +int sd_rtnl_message_route_get_scope(sd_netlink_message *m, unsigned char *scope) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(scope, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *scope = rtm->rtm_scope; + + return 0; +} + +int sd_rtnl_message_route_get_tos(sd_netlink_message *m, uint8_t *tos) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(tos, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *tos = rtm->rtm_tos; + + return 0; +} + +int sd_rtnl_message_route_get_table(sd_netlink_message *m, unsigned char *table) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(table, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *table = rtm->rtm_table; + + return 0; +} + +int sd_rtnl_message_route_get_dst_prefixlen(sd_netlink_message *m, unsigned char *dst_len) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(dst_len, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *dst_len = rtm->rtm_dst_len; + + return 0; +} + +int sd_rtnl_message_route_get_src_prefixlen(sd_netlink_message *m, unsigned char *src_len) { + struct rtmsg *rtm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_route(m->hdr->nlmsg_type), -EINVAL); + assert_return(src_len, -EINVAL); + + rtm = NLMSG_DATA(m->hdr); + + *src_len = rtm->rtm_src_len; + 
+ return 0; +} + +int sd_rtnl_message_new_route(sd_netlink *rtnl, sd_netlink_message **ret, + uint16_t nlmsg_type, int rtm_family, + unsigned char rtm_protocol) { + struct rtmsg *rtm; + int r; + + assert_return(rtnl_message_type_is_route(nlmsg_type), -EINVAL); + assert_return((nlmsg_type == RTM_GETROUTE && rtm_family == AF_UNSPEC) || + IN_SET(rtm_family, AF_INET, AF_INET6), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWROUTE) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_APPEND; + + rtm = NLMSG_DATA((*ret)->hdr); + + rtm->rtm_family = rtm_family; + rtm->rtm_protocol = rtm_protocol; + + return 0; +} + +int sd_rtnl_message_new_nexthop(sd_netlink *rtnl, sd_netlink_message **ret, + uint16_t nlmsg_type, int nh_family, + unsigned char nh_protocol) { + struct nhmsg *nhm; + int r; + + assert_return(rtnl_message_type_is_nexthop(nlmsg_type), -EINVAL); + switch (nlmsg_type) { + case RTM_DELNEXTHOP: + assert_return(nh_family == AF_UNSPEC, -EINVAL); + _fallthrough_; + case RTM_GETNEXTHOP: + assert_return(nh_protocol == RTPROT_UNSPEC, -EINVAL); + break; + case RTM_NEWNEXTHOP: + assert_return(IN_SET(nh_family, AF_UNSPEC, AF_INET, AF_INET6), -EINVAL); + break; + default: + assert_not_reached(); + } + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWNEXTHOP) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_APPEND; + + nhm = NLMSG_DATA((*ret)->hdr); + + nhm->nh_family = nh_family; + nhm->nh_scope = RT_SCOPE_UNIVERSE; + nhm->nh_protocol = nh_protocol; + + return 0; +} + +int sd_rtnl_message_nexthop_set_flags(sd_netlink_message *m, uint8_t flags) { + struct nhmsg *nhm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(m->hdr->nlmsg_type == RTM_NEWNEXTHOP, -EINVAL); + + nhm = NLMSG_DATA(m->hdr); + nhm->nh_flags = flags; + + return 0; +} + +int 
sd_rtnl_message_nexthop_get_flags(sd_netlink_message *m, uint8_t *ret) { + struct nhmsg *nhm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_nexthop(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + nhm = NLMSG_DATA(m->hdr); + *ret = nhm->nh_flags; + + return 0; +} + +int sd_rtnl_message_nexthop_get_family(sd_netlink_message *m, uint8_t *family) { + struct nhmsg *nhm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_nexthop(m->hdr->nlmsg_type), -EINVAL); + assert_return(family, -EINVAL); + + nhm = NLMSG_DATA(m->hdr); + *family = nhm->nh_family; + + return 0; +} + +int sd_rtnl_message_nexthop_get_protocol(sd_netlink_message *m, uint8_t *protocol) { + struct nhmsg *nhm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_nexthop(m->hdr->nlmsg_type), -EINVAL); + assert_return(protocol, -EINVAL); + + nhm = NLMSG_DATA(m->hdr); + *protocol = nhm->nh_protocol; + + return 0; +} + +int sd_rtnl_message_neigh_set_flags(sd_netlink_message *m, uint8_t flags) { + struct ndmsg *ndm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_neigh(m->hdr->nlmsg_type), -EINVAL); + + ndm = NLMSG_DATA(m->hdr); + ndm->ndm_flags = flags; + + return 0; +} + +int sd_rtnl_message_neigh_set_state(sd_netlink_message *m, uint16_t state) { + struct ndmsg *ndm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_neigh(m->hdr->nlmsg_type), -EINVAL); + + ndm = NLMSG_DATA(m->hdr); + ndm->ndm_state = state; + + return 0; +} + +/* Copies ndm_flags from the ndmsg header of a neighbor message into *flags. Fails with -EINVAL if m, m->hdr or flags is NULL, or if the message type is not one of the neighbor types. */ +int sd_rtnl_message_neigh_get_flags(sd_netlink_message *m, uint8_t *flags) { + struct ndmsg *ndm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_neigh(m->hdr->nlmsg_type), -EINVAL); + assert_return(flags, -EINVAL); + + ndm = NLMSG_DATA(m->hdr); + *flags = ndm->ndm_flags; + + return 0; +} +
+/* Copies ndm_state from the ndmsg header of a neighbor message into *state. Fails with -EINVAL if m, m->hdr or state is NULL, or if the message type is not one of the neighbor types. */ +int sd_rtnl_message_neigh_get_state(sd_netlink_message *m, uint16_t *state) { + struct ndmsg *ndm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_neigh(m->hdr->nlmsg_type), -EINVAL); + assert_return(state, -EINVAL); + + ndm = NLMSG_DATA(m->hdr); + *state = ndm->ndm_state; + + return 0; +} + +int sd_rtnl_message_neigh_get_family(sd_netlink_message *m, int *family) { + struct ndmsg *ndm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_neigh(m->hdr->nlmsg_type), -EINVAL); + assert_return(family, -EINVAL); + + ndm = NLMSG_DATA(m->hdr); + + *family = ndm->ndm_family; + + return 0; +} + +int sd_rtnl_message_neigh_get_ifindex(sd_netlink_message *m, int *index) { + struct ndmsg *ndm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_neigh(m->hdr->nlmsg_type), -EINVAL); + assert_return(index, -EINVAL); + + ndm = NLMSG_DATA(m->hdr); + + *index = ndm->ndm_ifindex; + + return 0; +} + +int sd_rtnl_message_new_neigh( + sd_netlink *rtnl, + sd_netlink_message **ret, + uint16_t nlmsg_type, + int index, + int ndm_family) { + + struct ndmsg *ndm; + int r; + + assert_return(rtnl_message_type_is_neigh(nlmsg_type), -EINVAL); + assert_return(IN_SET(ndm_family, AF_UNSPEC, AF_INET, AF_INET6, AF_BRIDGE), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWNEIGH) { + if (ndm_family == AF_BRIDGE) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_APPEND; + else + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_REPLACE; + } + + ndm = NLMSG_DATA((*ret)->hdr); + + ndm->ndm_family = ndm_family; + ndm->ndm_ifindex = index; + + return 0; +} + +int sd_rtnl_message_link_set_flags(sd_netlink_message *m, unsigned flags, unsigned change) { + struct ifinfomsg *ifi; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_link(m->hdr->nlmsg_type),
-EINVAL); + assert_return(change != 0, -EINVAL); + + ifi = NLMSG_DATA(m->hdr); + + ifi->ifi_flags = flags; + ifi->ifi_change = change; + + return 0; +} + +int sd_rtnl_message_link_set_type(sd_netlink_message *m, unsigned type) { + struct ifinfomsg *ifi; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_link(m->hdr->nlmsg_type), -EINVAL); + + ifi = NLMSG_DATA(m->hdr); + + ifi->ifi_type = type; + + return 0; +} + +int sd_rtnl_message_link_set_family(sd_netlink_message *m, unsigned family) { + struct ifinfomsg *ifi; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_link(m->hdr->nlmsg_type), -EINVAL); + + ifi = NLMSG_DATA(m->hdr); + + ifi->ifi_family = family; + + return 0; +} + +int sd_rtnl_message_new_link(sd_netlink *rtnl, sd_netlink_message **ret, + uint16_t nlmsg_type, int index) { + struct ifinfomsg *ifi; + int r; + + assert_return(rtnl_message_type_is_link(nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWLINK) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + else if (nlmsg_type == RTM_NEWLINKPROP) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL | NLM_F_APPEND; + + ifi = NLMSG_DATA((*ret)->hdr); + + ifi->ifi_family = AF_UNSPEC; + ifi->ifi_index = index; + + return 0; +} + +int sd_rtnl_message_addr_set_prefixlen(sd_netlink_message *m, unsigned char prefixlen) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + if ((ifa->ifa_family == AF_INET && prefixlen > 32) || + (ifa->ifa_family == AF_INET6 && prefixlen > 128)) + return -ERANGE; + + ifa->ifa_prefixlen = prefixlen; + + return 0; +} + +int sd_rtnl_message_addr_set_flags(sd_netlink_message *m, unsigned char flags) { + struct ifaddrmsg *ifa; + + 
assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + ifa->ifa_flags = flags; + + return 0; +} + +int sd_rtnl_message_addr_set_scope(sd_netlink_message *m, unsigned char scope) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + ifa->ifa_scope = scope; + + return 0; +} + +int sd_rtnl_message_addr_get_family(sd_netlink_message *m, int *ret_family) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret_family, -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + *ret_family = ifa->ifa_family; + + return 0; +} + +int sd_rtnl_message_addr_get_prefixlen(sd_netlink_message *m, unsigned char *ret_prefixlen) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret_prefixlen, -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + *ret_prefixlen = ifa->ifa_prefixlen; + + return 0; +} + +int sd_rtnl_message_addr_get_scope(sd_netlink_message *m, unsigned char *ret_scope) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret_scope, -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + *ret_scope = ifa->ifa_scope; + + return 0; +} + +int sd_rtnl_message_addr_get_flags(sd_netlink_message *m, unsigned char *ret_flags) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret_flags, -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + *ret_flags = 
ifa->ifa_flags; + + return 0; +} + +int sd_rtnl_message_addr_get_ifindex(sd_netlink_message *m, int *ret_ifindex) { + struct ifaddrmsg *ifa; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addr(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret_ifindex, -EINVAL); + + ifa = NLMSG_DATA(m->hdr); + + *ret_ifindex = ifa->ifa_index; + + return 0; +} + +int sd_rtnl_message_new_addr( + sd_netlink *rtnl, + sd_netlink_message **ret, + uint16_t nlmsg_type, + int index, + int family) { + + struct ifaddrmsg *ifa; + int r; + + assert_return(rtnl_message_type_is_addr(nlmsg_type), -EINVAL); + assert_return((nlmsg_type == RTM_GETADDR && index == 0) || + index > 0, -EINVAL); + assert_return((nlmsg_type == RTM_GETADDR && family == AF_UNSPEC) || + IN_SET(family, AF_INET, AF_INET6), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + ifa = NLMSG_DATA((*ret)->hdr); + + ifa->ifa_index = index; + ifa->ifa_family = family; + + return 0; +} + +int sd_rtnl_message_new_addr_update( + sd_netlink *rtnl, + sd_netlink_message **ret, + int index, + int family) { + int r; + + r = sd_rtnl_message_new_addr(rtnl, ret, RTM_NEWADDR, index, family); + if (r < 0) + return r; + + (*ret)->hdr->nlmsg_flags |= NLM_F_REPLACE; + + return 0; +} + +int sd_rtnl_message_link_get_ifindex(sd_netlink_message *m, int *ifindex) { + struct ifinfomsg *ifi; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_link(m->hdr->nlmsg_type), -EINVAL); + assert_return(ifindex, -EINVAL); + + ifi = NLMSG_DATA(m->hdr); + + *ifindex = ifi->ifi_index; + + return 0; +} + +int sd_rtnl_message_link_get_flags(sd_netlink_message *m, unsigned *flags) { + struct ifinfomsg *ifi; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_link(m->hdr->nlmsg_type), -EINVAL); + assert_return(flags, -EINVAL); + + ifi = NLMSG_DATA(m->hdr); + + 
*flags = ifi->ifi_flags; + + return 0; +} + +int sd_rtnl_message_link_get_type(sd_netlink_message *m, unsigned short *type) { + struct ifinfomsg *ifi; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_link(m->hdr->nlmsg_type), -EINVAL); + assert_return(type, -EINVAL); + + ifi = NLMSG_DATA(m->hdr); + + *type = ifi->ifi_type; + + return 0; +} + +int sd_rtnl_message_get_family(sd_netlink_message *m, int *family) { + assert_return(m, -EINVAL); + assert_return(family, -EINVAL); + + assert(m->hdr); + + if (rtnl_message_type_is_link(m->hdr->nlmsg_type)) { + struct ifinfomsg *ifi; + + ifi = NLMSG_DATA(m->hdr); + + *family = ifi->ifi_family; + + return 0; + } else if (rtnl_message_type_is_route(m->hdr->nlmsg_type)) { + struct rtmsg *rtm; + + rtm = NLMSG_DATA(m->hdr); + + *family = rtm->rtm_family; + + return 0; + } else if (rtnl_message_type_is_neigh(m->hdr->nlmsg_type)) { + struct ndmsg *ndm; + + ndm = NLMSG_DATA(m->hdr); + + *family = ndm->ndm_family; + + return 0; + } else if (rtnl_message_type_is_addr(m->hdr->nlmsg_type)) { + struct ifaddrmsg *ifa; + + ifa = NLMSG_DATA(m->hdr); + + *family = ifa->ifa_family; + + return 0; + } else if (rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type)) { + struct rtmsg *rtm; + + rtm = NLMSG_DATA(m->hdr); + + *family = rtm->rtm_family; + + return 0; + } else if (rtnl_message_type_is_nexthop(m->hdr->nlmsg_type)) { + struct nhmsg *nhm; + + nhm = NLMSG_DATA(m->hdr); + + *family = nhm->nh_family; + + return 0; + } + + return -EOPNOTSUPP; +} + +int sd_rtnl_message_new_addrlabel( + sd_netlink *rtnl, + sd_netlink_message **ret, + uint16_t nlmsg_type, + int ifindex, + int ifal_family) { + + struct ifaddrlblmsg *addrlabel; + int r; + + assert_return(rtnl_message_type_is_addrlabel(nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWADDRLABEL) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | 
NLM_F_EXCL; + + addrlabel = NLMSG_DATA((*ret)->hdr); + + addrlabel->ifal_family = ifal_family; + addrlabel->ifal_index = ifindex; + + return 0; +} + +int sd_rtnl_message_addrlabel_set_prefixlen(sd_netlink_message *m, unsigned char prefixlen) { + struct ifaddrlblmsg *addrlabel; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addrlabel(m->hdr->nlmsg_type), -EINVAL); + + addrlabel = NLMSG_DATA(m->hdr); + + if (prefixlen > 128) + return -ERANGE; + + addrlabel->ifal_prefixlen = prefixlen; + + return 0; +} + +/* Copies ifal_prefixlen from the ifaddrlblmsg header of an address label message into *prefixlen. Fails with -EINVAL if m, m->hdr or prefixlen is NULL, or if the message type is not one of the address label types. */ +int sd_rtnl_message_addrlabel_get_prefixlen(sd_netlink_message *m, unsigned char *prefixlen) { + struct ifaddrlblmsg *addrlabel; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_addrlabel(m->hdr->nlmsg_type), -EINVAL); + assert_return(prefixlen, -EINVAL); + + addrlabel = NLMSG_DATA(m->hdr); + + *prefixlen = addrlabel->ifal_prefixlen; + + return 0; +} + +int sd_rtnl_message_new_routing_policy_rule( + sd_netlink *rtnl, + sd_netlink_message **ret, + uint16_t nlmsg_type, + int ifal_family) { + + struct fib_rule_hdr *frh; + int r; + + assert_return(rtnl_message_type_is_routing_policy_rule(nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWRULE) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + + frh = NLMSG_DATA((*ret)->hdr); + frh->family = ifal_family; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_set_tos(sd_netlink_message *m, uint8_t tos) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + frh->tos = tos; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_get_tos(sd_netlink_message *m, uint8_t *tos) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); +
assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + /* 'tos' is written through below; reject a NULL output pointer first. */ + assert_return(tos, -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + *tos = frh->tos; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_set_table(sd_netlink_message *m, uint8_t table) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + frh->table = table; + + return 0; +} + +/* Copies the 'table' field of the fib_rule_hdr header of a routing policy rule message into *table. Fails with -EINVAL if m, m->hdr or table is NULL, or if the message type is not one of the routing policy rule types. */ +int sd_rtnl_message_routing_policy_rule_get_table(sd_netlink_message *m, uint8_t *table) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + assert_return(table, -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + *table = frh->table; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_set_flags(sd_netlink_message *m, uint32_t flags) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + + frh = NLMSG_DATA(m->hdr); + frh->flags = flags; + + return 0; +} + +/* Copies the 'flags' field of the fib_rule_hdr header of a routing policy rule message into *flags. Fails with -EINVAL if m, m->hdr or flags is NULL, or if the message type is not one of the routing policy rule types. */ +int sd_rtnl_message_routing_policy_rule_get_flags(sd_netlink_message *m, uint32_t *flags) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + assert_return(flags, -EINVAL); + + frh = NLMSG_DATA(m->hdr); + *flags = frh->flags; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_set_fib_type(sd_netlink_message *m, uint8_t type) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + frh->action = type; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_get_fib_type(sd_netlink_message *m, uint8_t *type) { + struct
fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + /* 'type' is written through below; reject a NULL output pointer first. */ + assert_return(type, -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + *type = frh->action; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_set_fib_dst_prefixlen(sd_netlink_message *m, uint8_t len) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + frh->dst_len = len; + + return 0; +} + +/* Copies the 'dst_len' field of the fib_rule_hdr header of a routing policy rule message into *len. Fails with -EINVAL if m, m->hdr or len is NULL, or if the message type is not one of the routing policy rule types. */ +int sd_rtnl_message_routing_policy_rule_get_fib_dst_prefixlen(sd_netlink_message *m, uint8_t *len) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + assert_return(len, -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + *len = frh->dst_len; + + return 0; +} + +int sd_rtnl_message_routing_policy_rule_set_fib_src_prefixlen(sd_netlink_message *m, uint8_t len) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + frh->src_len = len; + + return 0; +} + +/* Copies the 'src_len' field of the fib_rule_hdr header of a routing policy rule message into *len. Fails with -EINVAL if m, m->hdr or len is NULL, or if the message type is not one of the routing policy rule types. */ +int sd_rtnl_message_routing_policy_rule_get_fib_src_prefixlen(sd_netlink_message *m, uint8_t *len) { + struct fib_rule_hdr *frh; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_routing_policy_rule(m->hdr->nlmsg_type), -EINVAL); + assert_return(len, -EINVAL); + + frh = NLMSG_DATA(m->hdr); + + *len = frh->src_len; + + return 0; +} + +int sd_rtnl_message_new_traffic_control( + sd_netlink *rtnl, + sd_netlink_message **ret, + uint16_t nlmsg_type, + int ifindex, + uint32_t handle, + uint32_t parent) { + + struct tcmsg *tcm; + int r; + + assert_return(rtnl_message_type_is_traffic_control(nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + r =
message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (IN_SET(nlmsg_type, RTM_NEWQDISC, RTM_NEWTCLASS)) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + + tcm = NLMSG_DATA((*ret)->hdr); + tcm->tcm_ifindex = ifindex; + tcm->tcm_handle = handle; + tcm->tcm_parent = parent; + + return 0; +} + +int sd_rtnl_message_traffic_control_get_ifindex(sd_netlink_message *m, int *ret) { + struct tcmsg *tcm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_traffic_control(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + tcm = NLMSG_DATA(m->hdr); + *ret = tcm->tcm_ifindex; + + return 0; +} + +int sd_rtnl_message_traffic_control_get_handle(sd_netlink_message *m, uint32_t *ret) { + struct tcmsg *tcm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_traffic_control(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + tcm = NLMSG_DATA(m->hdr); + *ret = tcm->tcm_handle; + + return 0; +} + +int sd_rtnl_message_traffic_control_get_parent(sd_netlink_message *m, uint32_t *ret) { + struct tcmsg *tcm; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(rtnl_message_type_is_traffic_control(m->hdr->nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + tcm = NLMSG_DATA(m->hdr); + *ret = tcm->tcm_parent; + + return 0; +} + +int sd_rtnl_message_new_mdb( + sd_netlink *rtnl, + sd_netlink_message **ret, + uint16_t nlmsg_type, + int mdb_ifindex) { + + struct br_port_msg *bpm; + int r; + + assert_return(rtnl_message_type_is_mdb(nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWMDB) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + + bpm = NLMSG_DATA((*ret)->hdr); + bpm->family = AF_BRIDGE; + bpm->ifindex = mdb_ifindex; + + return 0; +} diff --git a/src/libsystemd/sd-netlink/netlink-message.c 
b/src/libsystemd/sd-netlink/netlink-message.c new file mode 100644 index 0000000..000a50e --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-message.c @@ -0,0 +1,1421 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "format-util.h" +#include "memory-util.h" +#include "netlink-internal.h" +#include "netlink-types.h" +#include "netlink-util.h" +#include "socket-util.h" +#include "strv.h" + +#define GET_CONTAINER(m, i) ((struct rtattr*)((uint8_t*)(m)->hdr + (m)->containers[i].offset)) + +#define RTA_TYPE(rta) ((rta)->rta_type & NLA_TYPE_MASK) +#define RTA_FLAGS(rta) ((rta)->rta_type & ~NLA_TYPE_MASK) + +int message_new_empty(sd_netlink *nl, sd_netlink_message **ret) { + sd_netlink_message *m; + + assert(nl); + assert(ret); + + /* Note that 'nl' is currently unused, if we start using it internally we must take care to + * avoid problems due to mutual references between buses and their queued messages. See sd-bus. 
*/ + + m = new(sd_netlink_message, 1); + if (!m) + return -ENOMEM; + + *m = (sd_netlink_message) { + .n_ref = 1, + .protocol = nl->protocol, + .sealed = false, + }; + + *ret = m; + return 0; +} + +int message_new_full( + sd_netlink *nl, + uint16_t nlmsg_type, + const NLAPolicySet *policy_set, + size_t header_size, + sd_netlink_message **ret) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + size_t size; + int r; + + assert(nl); + assert(policy_set); + assert(ret); + + size = NLMSG_SPACE(header_size); + assert(size >= sizeof(struct nlmsghdr)); + + r = message_new_empty(nl, &m); + if (r < 0) + return r; + + m->containers[0].policy_set = policy_set; + + m->hdr = malloc0(size); + if (!m->hdr) + return -ENOMEM; + + m->hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + m->hdr->nlmsg_len = size; + m->hdr->nlmsg_type = nlmsg_type; + + *ret = TAKE_PTR(m); + return 0; +} + +int message_new(sd_netlink *nl, sd_netlink_message **ret, uint16_t nlmsg_type) { + const NLAPolicySet *policy_set; + size_t size; + int r; + + assert_return(nl, -EINVAL); + assert_return(ret, -EINVAL); + + r = netlink_get_policy_set_and_header_size(nl, nlmsg_type, &policy_set, &size); + if (r < 0) + return r; + + return message_new_full(nl, nlmsg_type, policy_set, size, ret); +} + +int message_new_synthetic_error(sd_netlink *nl, int error, uint32_t serial, sd_netlink_message **ret) { + struct nlmsgerr *err; + int r; + + assert(error <= 0); + + r = message_new(nl, ret, NLMSG_ERROR); + if (r < 0) + return r; + + message_seal(*ret); + (*ret)->hdr->nlmsg_seq = serial; + + err = NLMSG_DATA((*ret)->hdr); + err->error = error; + + return 0; +} + +int sd_netlink_message_set_request_dump(sd_netlink_message *m, int dump) { + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + assert_return(m->protocol != NETLINK_ROUTE || + IN_SET(m->hdr->nlmsg_type, + RTM_GETLINK, RTM_GETLINKPROP, RTM_GETADDR, RTM_GETROUTE, RTM_GETNEIGH, + RTM_GETRULE, RTM_GETADDRLABEL, RTM_GETNEXTHOP, RTM_GETQDISC, 
RTM_GETTCLASS),
+                      -EINVAL);
+
+        SET_FLAG(m->hdr->nlmsg_flags, NLM_F_DUMP, dump);
+
+        return 0;
+}
+
+DEFINE_TRIVIAL_REF_FUNC(sd_netlink_message, sd_netlink_message);
+
+/* Drops one reference. Once the counter hits zero the header and the attribute table of every
+ * container level are freed, and the reference held on the following message in the chain (m->next)
+ * is dropped in turn. Always returns NULL. */
+sd_netlink_message* sd_netlink_message_unref(sd_netlink_message *m) {
+        while (m && --m->n_ref == 0) {
+                sd_netlink_message *next = m->next;
+
+                free(m->hdr);
+
+                /* containers[0] is the top level, hence the inclusive upper bound */
+                for (unsigned i = 0; i <= m->n_containers; i++)
+                        free(m->containers[i].attributes);
+
+                free(m);
+                m = next;
+        }
+
+        return NULL;
+}
+
+/* Stores the netlink message type (nlmsg_type) in *ret. Fails with -EINVAL when the message has no
+ * header attached: message_new_empty() creates such messages, so m->hdr may legitimately be NULL. */
+int sd_netlink_message_get_type(sd_netlink_message *m, uint16_t *ret) {
+        assert_return(m, -EINVAL);
+        assert_return(m->hdr, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        *ret = m->hdr->nlmsg_type;
+
+        return 0;
+}
+
+/* Replaces the whole nlmsg_flags field with flags, which must be non-zero. Fails with -EINVAL when
+ * the message has no header attached. */
+int sd_netlink_message_set_flags(sd_netlink_message *m, uint16_t flags) {
+        assert_return(m, -EINVAL);
+        assert_return(m->hdr, -EINVAL);
+        assert_return(flags != 0, -EINVAL);
+
+        m->hdr->nlmsg_flags = flags;
+
+        return 0;
+}
+
+/* Returns > 0 if the message is tagged with a multicast group, 0 if not, -EINVAL on NULL. */
+int sd_netlink_message_is_broadcast(sd_netlink_message *m) {
+        assert_return(m, -EINVAL);
+
+        return m->multicast_group != 0;
+}
+
+/* If successful the updated message will be correctly aligned, if unsuccessful the old message is untouched.
*/ +static int add_rtattr(sd_netlink_message *m, uint16_t attr_type, const void *data, size_t data_length) { + size_t message_length; + struct nlmsghdr *new_hdr; + struct rtattr *rta; + int offset; + + assert(m); + assert(m->hdr); + assert(!m->sealed); + assert(NLMSG_ALIGN(m->hdr->nlmsg_len) == m->hdr->nlmsg_len); + assert(!data || data_length > 0); + + /* get the new message size (with padding at the end) */ + message_length = m->hdr->nlmsg_len + RTA_SPACE(data_length); + + /* buffer should be smaller than both one page or 8K to be accepted by the kernel */ + if (message_length > MIN(page_size(), 8192UL)) + return -ENOBUFS; + + /* realloc to fit the new attribute */ + new_hdr = realloc(m->hdr, message_length); + if (!new_hdr) + return -ENOMEM; + m->hdr = new_hdr; + + /* get pointer to the attribute we are about to add */ + rta = (struct rtattr *) ((uint8_t *) m->hdr + m->hdr->nlmsg_len); + + rtattr_append_attribute_internal(rta, attr_type, data, data_length); + + /* if we are inside containers, extend them */ + for (unsigned i = 0; i < m->n_containers; i++) + GET_CONTAINER(m, i)->rta_len += RTA_SPACE(data_length); + + /* update message size */ + offset = m->hdr->nlmsg_len; + m->hdr->nlmsg_len = message_length; + + /* return old message size */ + return offset; +} + +static int message_attribute_has_type(sd_netlink_message *m, size_t *ret_size, uint16_t attr_type, NLAType type) { + const NLAPolicy *policy; + + assert(m); + + policy = policy_set_get_policy(m->containers[m->n_containers].policy_set, attr_type); + if (!policy) + return -EOPNOTSUPP; + + if (policy_get_type(policy) != type) + return -EINVAL; + + if (ret_size) + *ret_size = policy_get_size(policy); + return 0; +} + +int sd_netlink_message_append_string(sd_netlink_message *m, uint16_t attr_type, const char *data) { + size_t length, size; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(data, -EINVAL); + + r = message_attribute_has_type(m, &size, attr_type, 
NETLINK_TYPE_STRING);
+        if (r < 0)
+                return r;
+
+        if (size) {
+                length = strnlen(data, size+1);
+                if (length > size)
+                        return -EINVAL;
+        } else
+                length = strlen(data);
+
+        r = add_rtattr(m, attr_type, data, length + 1);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Appends every string of the NULL-terminated array data as a separate attribute of type attr_type
+ * (including the trailing NUL byte). If the policy declares a maximum size, strings longer than that
+ * are refused with -EINVAL. */
+int sd_netlink_message_append_strv(sd_netlink_message *m, uint16_t attr_type, const char* const *data) {
+        size_t size;
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+        assert_return(data, -EINVAL);
+
+        r = message_attribute_has_type(m, &size, attr_type, NETLINK_TYPE_STRING);
+        if (r < 0)
+                return r;
+
+        /* Validate all entries up front, so that an over-long string is rejected before anything
+         * has been appended and the message is not left half-modified. */
+        if (size) {
+                STRV_FOREACH(p, data)
+                        if (strnlen(*p, size+1) > size)
+                                return -EINVAL;
+        }
+
+        /* NOTE: add_rtattr() itself may still fail midway (e.g. -ENOBUFS), in which case the
+         * strings appended so far remain in the message. */
+        STRV_FOREACH(p, data) {
+                r = add_rtattr(m, attr_type, *p, strlen(*p) + 1);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Appends a payload-less attribute of type attr_type; its mere presence is the flag. */
+int sd_netlink_message_append_flag(sd_netlink_message *m, uint16_t attr_type) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        /* only the type matters here, a flag carries no data and hence no size */
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_FLAG);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, NULL, 0);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Appends data as an attribute that the current policy declares as NETLINK_TYPE_U8. */
+int sd_netlink_message_append_u8(sd_netlink_message *m, uint16_t attr_type, uint8_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U8);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(uint8_t));
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Appends data as an attribute that the current policy declares as NETLINK_TYPE_U16. */
+int sd_netlink_message_append_u16(sd_netlink_message *m, uint16_t attr_type, uint16_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U16);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(uint16_t));
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+int
sd_netlink_message_append_u32(sd_netlink_message *m, uint16_t attr_type, uint32_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        /* attr_type must be declared as u32 by the policy */
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U32);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(data));
+        return r < 0 ? r : 0;
+}
+
+/* Appends an unsigned 64-bit attribute; attr_type must be declared as u64 by the policy. */
+int sd_netlink_message_append_u64(sd_netlink_message *m, uint16_t attr_type, uint64_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U64);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(data));
+        return r < 0 ? r : 0;
+}
+
+/* Appends a signed 8-bit attribute; attr_type must be declared as s8 by the policy. */
+int sd_netlink_message_append_s8(sd_netlink_message *m, uint16_t attr_type, int8_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_S8);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(data));
+        return r < 0 ? r : 0;
+}
+
+/* Appends a signed 16-bit attribute; attr_type must be declared as s16 by the policy. */
+int sd_netlink_message_append_s16(sd_netlink_message *m, uint16_t attr_type, int16_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_S16);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(data));
+        return r < 0 ? r : 0;
+}
+
+/* Appends a signed 32-bit attribute; attr_type must be declared as s32 by the policy. */
+int sd_netlink_message_append_s32(sd_netlink_message *m, uint16_t attr_type, int32_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_S32);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(data));
+        return r < 0 ? r : 0;
+}
+
+/* Appends a signed 64-bit attribute; attr_type must be declared as s64 by the policy. */
+int sd_netlink_message_append_s64(sd_netlink_message *m, uint16_t attr_type, int64_t data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_S64);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, &data, sizeof(data));
+        return r < 0 ? r : 0;
+}
+
+/* Appends len raw bytes as attribute attr_type. No policy type check is done here. */
+int sd_netlink_message_append_data(sd_netlink_message *m, uint16_t attr_type, const void *data, size_t len) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = add_rtattr(m, attr_type, data, len);
+        return r < 0 ? r : 0;
+}
+
+/* Convenience wrapper: opens container_type, appends one raw attribute to it and closes it again. */
+int sd_netlink_message_append_container_data(
+                sd_netlink_message *m,
+                uint16_t container_type,
+                uint16_t attr_type,
+                const void *data,
+                size_t len) {
+
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+
+        r = sd_netlink_message_open_container(m, container_type);
+        if (r >= 0)
+                r = sd_netlink_message_append_data(m, attr_type, data, len);
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m);
+}
+
+/* Appends an IPv4 or IPv6 address; the number of bytes copied out of *data is
+ * FAMILY_ADDRESS_SIZE(family). */
+int netlink_message_append_in_addr_union(sd_netlink_message *m, uint16_t attr_type, int family, const union in_addr_union *data) {
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(!m->sealed, -EPERM);
+        assert_return(data, -EINVAL);
+        assert_return(IN_SET(family, AF_INET, AF_INET6), -EINVAL);
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_IN_ADDR);
+        if (r < 0)
+                return r;
+
+        r = add_rtattr(m, attr_type, data, FAMILY_ADDRESS_SIZE(family));
+        return r < 0 ? r : 0;
+}
+
+/* IPv4 front-end of netlink_message_append_in_addr_union() */
+int sd_netlink_message_append_in_addr(sd_netlink_message *m, uint16_t attr_type, const struct in_addr *data) {
+        return netlink_message_append_in_addr_union(m, attr_type, AF_INET, (const union in_addr_union *) data);
+}
+
+/* IPv6 front-end of netlink_message_append_in_addr_union() */
+int sd_netlink_message_append_in6_addr(sd_netlink_message *m, uint16_t attr_type, const struct in6_addr *data) {
+        return netlink_message_append_in_addr_union(m, attr_type, AF_INET6, (const union in_addr_union *) data);
+}
+
+int
netlink_message_append_sockaddr_union(sd_netlink_message *m, uint16_t attr_type, const union sockaddr_union *data) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(data, -EINVAL); + assert_return(IN_SET(data->sa.sa_family, AF_INET, AF_INET6), -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_SOCKADDR); + if (r < 0) + return r; + + r = add_rtattr(m, attr_type, data, data->sa.sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); + if (r < 0) + return r; + + return 0; +} + +int sd_netlink_message_append_sockaddr_in(sd_netlink_message *m, uint16_t attr_type, const struct sockaddr_in *data) { + return netlink_message_append_sockaddr_union(m, attr_type, (const union sockaddr_union *) data); +} + +int sd_netlink_message_append_sockaddr_in6(sd_netlink_message *m, uint16_t attr_type, const struct sockaddr_in6 *data) { + return netlink_message_append_sockaddr_union(m, attr_type, (const union sockaddr_union *) data); +} + +int sd_netlink_message_append_ether_addr(sd_netlink_message *m, uint16_t attr_type, const struct ether_addr *data) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(data, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_ETHER_ADDR); + if (r < 0) + return r; + + r = add_rtattr(m, attr_type, data, ETH_ALEN); + if (r < 0) + return r; + + return 0; +} + +int netlink_message_append_hw_addr(sd_netlink_message *m, uint16_t attr_type, const struct hw_addr_data *data) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(data, -EINVAL); + assert_return(data->length > 0, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_ETHER_ADDR); + if (r < 0) + return r; + + r = add_rtattr(m, attr_type, data->bytes, data->length); + if (r < 0) + return r; + + return 0; +} + +int sd_netlink_message_append_cache_info(sd_netlink_message *m, 
uint16_t attr_type, const struct ifa_cacheinfo *info) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(info, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_CACHE_INFO); + if (r < 0) + return r; + + r = add_rtattr(m, attr_type, info, sizeof(struct ifa_cacheinfo)); + if (r < 0) + return r; + + return 0; +} + +int sd_netlink_message_open_container(sd_netlink_message *m, uint16_t attr_type) { + size_t size; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + /* m->containers[m->n_containers + 1] is accessed both in read and write. Prevent access out of bound */ + assert_return(m->n_containers < (NETLINK_CONTAINER_DEPTH - 1), -ERANGE); + + r = message_attribute_has_type(m, &size, attr_type, NETLINK_TYPE_NESTED); + if (r < 0) { + const NLAPolicySetUnion *policy_set_union; + int family; + + r = message_attribute_has_type(m, &size, attr_type, NETLINK_TYPE_NESTED_UNION_BY_FAMILY); + if (r < 0) + return r; + + r = sd_rtnl_message_get_family(m, &family); + if (r < 0) + return r; + + policy_set_union = policy_set_get_policy_set_union( + m->containers[m->n_containers].policy_set, + attr_type); + if (!policy_set_union) + return -EOPNOTSUPP; + + m->containers[m->n_containers + 1].policy_set = + policy_set_union_get_policy_set_by_family( + policy_set_union, + family); + } else + m->containers[m->n_containers + 1].policy_set = + policy_set_get_policy_set( + m->containers[m->n_containers].policy_set, + attr_type); + if (!m->containers[m->n_containers + 1].policy_set) + return -EOPNOTSUPP; + + r = add_rtattr(m, attr_type | NLA_F_NESTED, NULL, size); + if (r < 0) + return r; + + m->containers[m->n_containers++].offset = r; + + return 0; +} + +int sd_netlink_message_open_container_union(sd_netlink_message *m, uint16_t attr_type, const char *key) { + const NLAPolicySetUnion *policy_set_union; + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + 
assert_return(m->n_containers < (NETLINK_CONTAINER_DEPTH - 1), -ERANGE); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_NESTED_UNION_BY_STRING); + if (r < 0) + return r; + + policy_set_union = policy_set_get_policy_set_union( + m->containers[m->n_containers].policy_set, + attr_type); + if (!policy_set_union) + return -EOPNOTSUPP; + + m->containers[m->n_containers + 1].policy_set = + policy_set_union_get_policy_set_by_string( + policy_set_union, + key); + if (!m->containers[m->n_containers + 1].policy_set) + return -EOPNOTSUPP; + + r = sd_netlink_message_append_string(m, policy_set_union_get_match_attribute(policy_set_union), key); + if (r < 0) + return r; + + /* do we ever need non-null size */ + r = add_rtattr(m, attr_type | NLA_F_NESTED, NULL, 0); + if (r < 0) + return r; + + m->containers[m->n_containers++].offset = r; + + return 0; +} + +int sd_netlink_message_close_container(sd_netlink_message *m) { + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(m->n_containers > 0, -EINVAL); + + m->containers[m->n_containers].policy_set = NULL; + m->containers[m->n_containers].offset = 0; + m->n_containers--; + + return 0; +} + +int sd_netlink_message_open_array(sd_netlink_message *m, uint16_t attr_type) { + int r; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(m->n_containers < (NETLINK_CONTAINER_DEPTH - 1), -ERANGE); + + r = add_rtattr(m, attr_type | NLA_F_NESTED, NULL, 0); + if (r < 0) + return r; + + m->containers[m->n_containers].offset = r; + m->n_containers++; + m->containers[m->n_containers].policy_set = m->containers[m->n_containers - 1].policy_set; + + return 0; +} + +int sd_netlink_message_cancel_array(sd_netlink_message *m) { + uint32_t rta_len; + + assert_return(m, -EINVAL); + assert_return(!m->sealed, -EPERM); + assert_return(m->n_containers > 1, -EINVAL); + + rta_len = GET_CONTAINER(m, (m->n_containers - 1))->rta_len; + + for (unsigned i = 0; i < m->n_containers; i++) + 
GET_CONTAINER(m, i)->rta_len -= rta_len; + + m->hdr->nlmsg_len -= rta_len; + + m->n_containers--; + m->containers[m->n_containers].policy_set = NULL; + + return 0; +} + +static int netlink_message_read_internal( + sd_netlink_message *m, + uint16_t attr_type, + void **ret_data, + bool *ret_net_byteorder) { + + struct netlink_attribute *attribute; + struct rtattr *rta; + + assert_return(m, -EINVAL); + assert_return(m->sealed, -EPERM); + + assert(m->n_containers < NETLINK_CONTAINER_DEPTH); + + if (!m->containers[m->n_containers].attributes) + return -ENODATA; + + if (attr_type > m->containers[m->n_containers].max_attribute) + return -ENODATA; + + attribute = &m->containers[m->n_containers].attributes[attr_type]; + + if (attribute->offset == 0) + return -ENODATA; + + rta = (struct rtattr*)((uint8_t *) m->hdr + attribute->offset); + + if (ret_data) + *ret_data = RTA_DATA(rta); + + if (ret_net_byteorder) + *ret_net_byteorder = attribute->net_byteorder; + + return RTA_PAYLOAD(rta); +} + +int sd_netlink_message_read(sd_netlink_message *m, uint16_t attr_type, size_t size, void *data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if ((size_t) r > size) + return -ENOBUFS; + + if (data) + memcpy(data, attr_data, r); + + return r; +} + +int sd_netlink_message_read_data(sd_netlink_message *m, uint16_t attr_type, size_t *ret_size, void **ret_data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if (ret_data) { + void *data; + + data = memdup(attr_data, r); + if (!data) + return -ENOMEM; + + *ret_data = data; + } + + if (ret_size) + *ret_size = r; + + return r; +} + +int sd_netlink_message_read_data_suffix0(sd_netlink_message *m, uint16_t attr_type, size_t *ret_size, void **ret_data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = 
netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if (ret_data) { + void *data; + + data = memdup_suffix0(attr_data, r); + if (!data) + return -ENOMEM; + + *ret_data = data; + } + + if (ret_size) + *ret_size = r; + + return r; +} + +int sd_netlink_message_read_string_strdup(sd_netlink_message *m, uint16_t attr_type, char **data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_STRING); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if (data) { + char *str; + + str = strndup(attr_data, r); + if (!str) + return -ENOMEM; + + *data = str; + } + + return 0; +} + +int sd_netlink_message_read_string(sd_netlink_message *m, uint16_t attr_type, const char **data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_STRING); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if (strnlen(attr_data, r) >= (size_t) r) + return -EIO; + + if (data) + *data = (const char *) attr_data; + + return 0; +} + +int sd_netlink_message_read_u8(sd_netlink_message *m, uint16_t attr_type, uint8_t *data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U8); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if ((size_t) r < sizeof(uint8_t)) + return -EIO; + + if (data) + *data = *(uint8_t *) attr_data; + + return 0; +} + +int sd_netlink_message_read_u16(sd_netlink_message *m, uint16_t attr_type, uint16_t *data) { + void *attr_data; + bool net_byteorder; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U16); + if (r < 0) + return r; + + r = 
netlink_message_read_internal(m, attr_type, &attr_data, &net_byteorder); + if (r < 0) + return r; + + if ((size_t) r < sizeof(uint16_t)) + return -EIO; + + if (data) { + if (net_byteorder) + *data = be16toh(*(uint16_t *) attr_data); + else + *data = *(uint16_t *) attr_data; + } + + return 0; +} + +int sd_netlink_message_read_u32(sd_netlink_message *m, uint16_t attr_type, uint32_t *data) { + void *attr_data; + bool net_byteorder; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_U32); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, &net_byteorder); + if (r < 0) + return r; + + if ((size_t) r < sizeof(uint32_t)) + return -EIO; + + if (data) { + if (net_byteorder) + *data = be32toh(*(uint32_t *) attr_data); + else + *data = *(uint32_t *) attr_data; + } + + return 0; +} + +int sd_netlink_message_read_ether_addr(sd_netlink_message *m, uint16_t attr_type, struct ether_addr *data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_ETHER_ADDR); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if ((size_t) r < sizeof(struct ether_addr)) + return -EIO; + + if (data) + memcpy(data, attr_data, sizeof(struct ether_addr)); + + return 0; +} + +int netlink_message_read_hw_addr(sd_netlink_message *m, uint16_t attr_type, struct hw_addr_data *data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_ETHER_ADDR); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if (r > HW_ADDR_MAX_SIZE) + return -EIO; + + if (data) { + memcpy(data->bytes, attr_data, r); + data->length = r; + } + + return 0; +} + +int sd_netlink_message_read_cache_info(sd_netlink_message *m, uint16_t attr_type, struct 
ifa_cacheinfo *info) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_CACHE_INFO); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if ((size_t) r < sizeof(struct ifa_cacheinfo)) + return -EIO; + + if (info) + memcpy(info, attr_data, sizeof(struct ifa_cacheinfo)); + + return 0; +} + +int netlink_message_read_in_addr_union(sd_netlink_message *m, uint16_t attr_type, int family, union in_addr_union *data) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + assert_return(IN_SET(family, AF_INET, AF_INET6), -EINVAL); + + r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_IN_ADDR); + if (r < 0) + return r; + + r = netlink_message_read_internal(m, attr_type, &attr_data, NULL); + if (r < 0) + return r; + + if ((size_t) r < FAMILY_ADDRESS_SIZE(family)) + return -EIO; + + if (data) + memcpy(data, attr_data, FAMILY_ADDRESS_SIZE(family)); + + return 0; +} + +int sd_netlink_message_read_in_addr(sd_netlink_message *m, uint16_t attr_type, struct in_addr *data) { + union in_addr_union u; + int r; + + r = netlink_message_read_in_addr_union(m, attr_type, AF_INET, &u); + if (r >= 0 && data) + *data = u.in; + + return r; +} + +int sd_netlink_message_read_in6_addr(sd_netlink_message *m, uint16_t attr_type, struct in6_addr *data) { + union in_addr_union u; + int r; + + r = netlink_message_read_in_addr_union(m, attr_type, AF_INET6, &u); + if (r >= 0 && data) + *data = u.in6; + + return r; +} + +int sd_netlink_message_has_flag(sd_netlink_message *m, uint16_t attr_type) { + void *attr_data; + int r; + + assert_return(m, -EINVAL); + + /* This returns 1 when the flag is set, 0 when not set, negative errno on error. 
*/
+
+        r = message_attribute_has_type(m, NULL, attr_type, NETLINK_TYPE_FLAG);
+        if (r < 0)
+                return r;
+
+        r = netlink_message_read_internal(m, attr_type, &attr_data, NULL);
+        if (r == -ENODATA)
+                return 0;
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Collects all string attributes of type attr_type found directly inside the nested attribute
+ * container_type of the current container level and returns them as a newly allocated strv in *ret
+ * (ret may be NULL to only validate). The message's container stack is not changed.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the policy does not know one of the two types, -EINVAL if
+ * they are not NESTED resp. STRING, or the error of netlink_message_read_internal() (e.g. -ENODATA
+ * if the container is absent). */
+int sd_netlink_message_read_strv(sd_netlink_message *m, uint16_t container_type, uint16_t attr_type, char ***ret) {
+        _cleanup_strv_free_ char **s = NULL;
+        const NLAPolicySet *policy_set;
+        const NLAPolicy *policy;
+        struct rtattr *rta;
+        void *container;
+        size_t rt_len;
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(m->n_containers < NETLINK_CONTAINER_DEPTH, -EINVAL);
+
+        policy = policy_set_get_policy(
+                        m->containers[m->n_containers].policy_set,
+                        container_type);
+        if (!policy)
+                return -EOPNOTSUPP;
+
+        if (policy_get_type(policy) != NETLINK_TYPE_NESTED)
+                return -EINVAL;
+
+        policy_set = policy_set_get_policy_set(
+                        m->containers[m->n_containers].policy_set,
+                        container_type);
+        if (!policy_set)
+                return -EOPNOTSUPP;
+
+        policy = policy_set_get_policy(policy_set, attr_type);
+        if (!policy)
+                return -EOPNOTSUPP;
+
+        if (policy_get_type(policy) != NETLINK_TYPE_STRING)
+                return -EINVAL;
+
+        r = netlink_message_read_internal(m, container_type, &container, NULL);
+        if (r < 0)
+                return r;
+
+        rt_len = (size_t) r;
+        rta = container;
+
+        /* RTA_OK() macro compares with rta->rt_len, which is unsigned short, and
+         * LGTM.com analysis does not like the type difference. Hence, here we
+         * introduce an unsigned short variable as a workaround. */
+        unsigned short len = rt_len;
+        for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
+                char *str;
+
+                if (RTA_TYPE(rta) != attr_type)
+                        continue;
+
+                /* The payload is not guaranteed to be NUL-terminated, hence copy at most
+                 * RTA_PAYLOAD() bytes instead of trusting it to be a C string. */
+                str = strndup(RTA_DATA(rta), (size_t) RTA_PAYLOAD(rta));
+                if (!str)
+                        return -ENOMEM;
+
+                /* strv_consume() takes ownership of str, also on failure */
+                r = strv_consume(&s, str);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(s);
+        return 0;
+}
+
+/* Walks the rt_len bytes of attributes starting at rta and builds a lookup table, indexed by
+ * attribute type, that records for each attribute its byte offset from m->hdr plus its NLA_F_NESTED
+ * and NLA_F_NET_BYTEORDER flags. The table and the highest type seen are stored in *container.
+ * If a type occurs more than once, the last occurrence wins. */
+static int netlink_container_parse(
+                sd_netlink_message *m,
+                struct netlink_container *container,
+                struct rtattr *rta,
+                size_t rt_len) {
+
+        _cleanup_free_ struct netlink_attribute *attributes = NULL;
+        uint16_t max_attr = 0;
+
+        /* RTA_OK() macro compares with rta->rt_len, which is unsigned short, and
+         * LGTM.com analysis does not like the type difference. Hence, here we
+         * introduce an unsigned short variable as a workaround. */
+        /* NOTE(review): this conversion truncates rt_len values that do not fit into an unsigned
+         * short; confirm that callers never pass larger payloads. */
+        unsigned short len = rt_len;
+        for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
+                uint16_t attr;
+
+                attr = RTA_TYPE(rta);
+                max_attr = MAX(max_attr, attr);
+
+                /* grow the (zero-initialized) table so that index attr is valid */
+                if (!GREEDY_REALLOC0(attributes, (size_t) max_attr + 1))
+                        return -ENOMEM;
+
+                /* offset 0 doubles as the "attribute not present" marker */
+                if (attributes[attr].offset != 0)
+                        log_debug("sd-netlink: message parse - overwriting repeated attribute");
+
+                attributes[attr].offset = (uint8_t *) rta - (uint8_t *) m->hdr;
+                attributes[attr].nested = RTA_FLAGS(rta) & NLA_F_NESTED;
+                attributes[attr].net_byteorder = RTA_FLAGS(rta) & NLA_F_NET_BYTEORDER;
+        }
+
+        container->attributes = TAKE_PTR(attributes);
+        container->max_attribute = max_attr;
+
+        return 0;
+}
+
+int sd_netlink_message_enter_container(sd_netlink_message *m, uint16_t attr_type) {
+        const NLAPolicy *policy;
+        const NLAPolicySet *policy_set;
+        void *container;
+        size_t size;
+        int r;
+
+        assert_return(m, -EINVAL);
+        assert_return(m->n_containers < (NETLINK_CONTAINER_DEPTH - 1), -EINVAL);
+
+        policy = policy_set_get_policy(
+                        m->containers[m->n_containers].policy_set,
+                        attr_type);
+        if (!policy)
+                return -EOPNOTSUPP;
+
+        switch (policy_get_type(policy)) {
+        case NETLINK_TYPE_NESTED:
+                policy_set = policy_set_get_policy_set(
+                                m->containers[m->n_containers].policy_set,
+                                attr_type);
+                break;
+
+
case NETLINK_TYPE_NESTED_UNION_BY_STRING: { + const NLAPolicySetUnion *policy_set_union; + const char *key; + + policy_set_union = policy_get_policy_set_union(policy); + if (!policy_set_union) + return -EOPNOTSUPP; + + r = sd_netlink_message_read_string( + m, + policy_set_union_get_match_attribute(policy_set_union), + &key); + if (r < 0) + return r; + + policy_set = policy_set_union_get_policy_set_by_string( + policy_set_union, + key); + break; + } + case NETLINK_TYPE_NESTED_UNION_BY_FAMILY: { + const NLAPolicySetUnion *policy_set_union; + int family; + + policy_set_union = policy_get_policy_set_union(policy); + if (!policy_set_union) + return -EOPNOTSUPP; + + r = sd_rtnl_message_get_family(m, &family); + if (r < 0) + return r; + + policy_set = policy_set_union_get_policy_set_by_family( + policy_set_union, + family); + break; + } + default: + assert_not_reached(); + } + if (!policy_set) + return -EOPNOTSUPP; + + r = netlink_message_read_internal(m, attr_type, &container, NULL); + if (r < 0) + return r; + + size = (size_t) r; + m->n_containers++; + + r = netlink_container_parse(m, + &m->containers[m->n_containers], + container, + size); + if (r < 0) { + m->n_containers--; + return r; + } + + m->containers[m->n_containers].policy_set = policy_set; + + return 0; +} + +int sd_netlink_message_enter_array(sd_netlink_message *m, uint16_t attr_type) { + void *container; + size_t size; + int r; + + assert_return(m, -EINVAL); + assert_return(m->n_containers < (NETLINK_CONTAINER_DEPTH - 1), -EINVAL); + + r = netlink_message_read_internal(m, attr_type, &container, NULL); + if (r < 0) + return r; + + size = (size_t) r; + m->n_containers++; + + r = netlink_container_parse(m, + &m->containers[m->n_containers], + container, + size); + if (r < 0) { + m->n_containers--; + return r; + } + + m->containers[m->n_containers].policy_set = m->containers[m->n_containers - 1].policy_set; + + return 0; +} + +int sd_netlink_message_exit_container(sd_netlink_message *m) { + assert_return(m, 
-EINVAL); + assert_return(m->sealed, -EINVAL); + assert_return(m->n_containers > 0, -EINVAL); + + m->containers[m->n_containers].attributes = mfree(m->containers[m->n_containers].attributes); + m->containers[m->n_containers].max_attribute = 0; + m->containers[m->n_containers].policy_set = NULL; + + m->n_containers--; + + return 0; +} + +int sd_netlink_message_get_max_attribute(sd_netlink_message *m, uint16_t *ret) { + assert_return(m, -EINVAL); + assert_return(m->sealed, -EINVAL); + assert_return(ret, -EINVAL); + + *ret = m->containers[m->n_containers].max_attribute; + return 0; +} + +int sd_netlink_message_is_error(sd_netlink_message *m) { + assert_return(m, 0); + assert_return(m->hdr, 0); + + return m->hdr->nlmsg_type == NLMSG_ERROR; +} + +int sd_netlink_message_get_errno(sd_netlink_message *m) { + struct nlmsgerr *err; + + assert_return(m, -EINVAL); + assert_return(m->hdr, -EINVAL); + + if (!sd_netlink_message_is_error(m)) + return 0; + + err = NLMSG_DATA(m->hdr); + + return err->error; +} + +static int netlink_message_parse_error(sd_netlink_message *m) { + struct nlmsgerr *err = NLMSG_DATA(m->hdr); + size_t hlen = sizeof(struct nlmsgerr); + + /* no TLVs, nothing to do here */ + if (!(m->hdr->nlmsg_flags & NLM_F_ACK_TLVS)) + return 0; + + /* if NLM_F_CAPPED is set then the inner err msg was capped */ + if (!(m->hdr->nlmsg_flags & NLM_F_CAPPED)) + hlen += err->msg.nlmsg_len - sizeof(struct nlmsghdr); + + if (m->hdr->nlmsg_len <= NLMSG_SPACE(hlen)) + return 0; + + return netlink_container_parse(m, + &m->containers[m->n_containers], + (struct rtattr*)((uint8_t*) NLMSG_DATA(m->hdr) + hlen), + NLMSG_PAYLOAD(m->hdr, hlen)); +} + +int sd_netlink_message_rewind(sd_netlink_message *m, sd_netlink *nl) { + size_t size; + int r; + + assert_return(m, -EINVAL); + assert_return(nl, -EINVAL); + + /* don't allow appending to message once parsed */ + message_seal(m); + + for (unsigned i = 1; i <= m->n_containers; i++) + m->containers[i].attributes = 
mfree(m->containers[i].attributes); + + m->n_containers = 0; + + if (m->containers[0].attributes) + /* top-level attributes have already been parsed */ + return 0; + + assert(m->hdr); + + r = netlink_get_policy_set_and_header_size(nl, m->hdr->nlmsg_type, + &m->containers[0].policy_set, &size); + if (r < 0) + return r; + + if (sd_netlink_message_is_error(m)) + return netlink_message_parse_error(m); + + return netlink_container_parse(m, + &m->containers[0], + (struct rtattr*)((uint8_t*) NLMSG_DATA(m->hdr) + NLMSG_ALIGN(size)), + NLMSG_PAYLOAD(m->hdr, size)); +} + +void message_seal(sd_netlink_message *m) { + assert(m); + + m->sealed = true; +} + +sd_netlink_message *sd_netlink_message_next(sd_netlink_message *m) { + assert_return(m, NULL); + + return m->next; +} diff --git a/src/libsystemd/sd-netlink/netlink-slot.c b/src/libsystemd/sd-netlink/netlink-slot.c new file mode 100644 index 0000000..d85d2cd --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-slot.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "netlink-internal.h" +#include "netlink-slot.h" +#include "string-util.h" + +int netlink_slot_allocate( + sd_netlink *nl, + bool floating, + NetlinkSlotType type, + size_t extra, + void *userdata, + const char *description, + sd_netlink_slot **ret) { + + _cleanup_free_ sd_netlink_slot *slot = NULL; + + assert(nl); + assert(ret); + + slot = malloc0(offsetof(sd_netlink_slot, reply_callback) + extra); + if (!slot) + return -ENOMEM; + + slot->n_ref = 1; + slot->netlink = nl; + slot->userdata = userdata; + slot->type = type; + slot->floating = floating; + + if (description) { + slot->description = strdup(description); + if (!slot->description) + return -ENOMEM; + } + + if (!floating) + sd_netlink_ref(nl); + + LIST_PREPEND(slots, nl->slots, slot); + + *ret = TAKE_PTR(slot); + + return 0; +} + +void netlink_slot_disconnect(sd_netlink_slot *slot, bool unref) { + sd_netlink *nl; 
+ + assert(slot); + + nl = slot->netlink; + if (!nl) + return; + + switch (slot->type) { + + case NETLINK_REPLY_CALLBACK: + (void) hashmap_remove(nl->reply_callbacks, &slot->reply_callback.serial); + + if (slot->reply_callback.timeout != USEC_INFINITY) + prioq_remove(nl->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx); + + break; + case NETLINK_MATCH_CALLBACK: + LIST_REMOVE(match_callbacks, nl->match_callbacks, &slot->match_callback); + + for (size_t i = 0; i < slot->match_callback.n_groups; i++) + (void) socket_broadcast_group_unref(nl, slot->match_callback.groups[i]); + + slot->match_callback.n_groups = 0; + slot->match_callback.groups = mfree(slot->match_callback.groups); + + break; + default: + assert_not_reached(); + } + + slot->type = _NETLINK_SLOT_INVALID; + slot->netlink = NULL; + LIST_REMOVE(slots, nl->slots, slot); + + if (!slot->floating) + sd_netlink_unref(nl); + else if (unref) + sd_netlink_slot_unref(slot); +} + +static sd_netlink_slot* netlink_slot_free(sd_netlink_slot *slot) { + assert(slot); + + netlink_slot_disconnect(slot, false); + + if (slot->destroy_callback) + slot->destroy_callback(slot->userdata); + + free(slot->description); + return mfree(slot); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink_slot, sd_netlink_slot, netlink_slot_free); + +sd_netlink *sd_netlink_slot_get_netlink(sd_netlink_slot *slot) { + assert_return(slot, NULL); + + return slot->netlink; +} + +void *sd_netlink_slot_get_userdata(sd_netlink_slot *slot) { + assert_return(slot, NULL); + + return slot->userdata; +} + +void *sd_netlink_slot_set_userdata(sd_netlink_slot *slot, void *userdata) { + void *ret; + + assert_return(slot, NULL); + + ret = slot->userdata; + slot->userdata = userdata; + + return ret; +} + +int sd_netlink_slot_get_destroy_callback(sd_netlink_slot *slot, sd_netlink_destroy_t *callback) { + assert_return(slot, -EINVAL); + + if (callback) + *callback = slot->destroy_callback; + + return !!slot->destroy_callback; +} + +int 
/* Switches a slot between floating (b != 0) and non-floating (b == 0) mode.
 *
 * Returns 0 if the slot already is in the requested mode, 1 if the mode was changed, and -ESTALE if
 * the slot is not attached to a connection anymore. A NULL slot is rejected through assert_return()
 * with -EINVAL.
 *
 * Changing the mode swaps which reference is held: when turning floating, a reference on the slot is
 * taken and one on the connection is dropped; when turning non-floating, it is the other way
 * around. */
int sd_netlink_slot_set_floating(sd_netlink_slot *slot, int b) {
        assert_return(slot, -EINVAL);

        if (slot->floating == !!b)
                return 0;

        /* Already disconnected */
        if (!slot->netlink)
                return -ESTALE;

        slot->floating = b;

        if (!b) {
                sd_netlink_ref(slot->netlink);
                sd_netlink_slot_unref(slot);
                return 1;
        }

        sd_netlink_slot_ref(slot);
        sd_netlink_unref(slot->netlink);
        return 1;
}
"sd-netlink.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "iovec-util.h" +#include "netlink-internal.h" +#include "netlink-types.h" +#include "socket-util.h" + +static int broadcast_groups_get(sd_netlink *nl) { + _cleanup_free_ uint32_t *groups = NULL; + socklen_t len = 0, old_len; + int r; + + assert(nl); + assert(nl->fd >= 0); + + if (getsockopt(nl->fd, SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS, NULL, &len) < 0) { + if (errno != ENOPROTOOPT) + return -errno; + + nl->broadcast_group_dont_leave = true; + return 0; + } + + if (len == 0) + return 0; + + groups = new0(uint32_t, len); + if (!groups) + return -ENOMEM; + + old_len = len; + + if (getsockopt(nl->fd, SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS, groups, &len) < 0) + return -errno; + + if (old_len != len) + return -EIO; + + for (unsigned i = 0; i < len; i++) + for (unsigned j = 0; j < sizeof(uint32_t) * 8; j++) + if (groups[i] & (1U << j)) { + unsigned group = i * sizeof(uint32_t) * 8 + j + 1; + + r = hashmap_ensure_put(&nl->broadcast_group_refs, NULL, UINT_TO_PTR(group), UINT_TO_PTR(1)); + if (r < 0) + return r; + } + + return 0; +} + +int socket_bind(sd_netlink *nl) { + socklen_t addrlen; + int r; + + r = setsockopt_int(nl->fd, SOL_NETLINK, NETLINK_PKTINFO, true); + if (r < 0) + return r; + + addrlen = sizeof(nl->sockaddr); + + /* ignore EINVAL to allow binding an already bound socket */ + if (bind(nl->fd, &nl->sockaddr.sa, addrlen) < 0 && errno != EINVAL) + return -errno; + + if (getsockname(nl->fd, &nl->sockaddr.sa, &addrlen) < 0) + return -errno; + + return broadcast_groups_get(nl); +} + +static unsigned broadcast_group_get_ref(sd_netlink *nl, unsigned group) { + assert(nl); + + return PTR_TO_UINT(hashmap_get(nl->broadcast_group_refs, UINT_TO_PTR(group))); +} + +static int broadcast_group_set_ref(sd_netlink *nl, unsigned group, unsigned n_ref) { + int r; + + assert(nl); + + r = hashmap_ensure_allocated(&nl->broadcast_group_refs, NULL); + if (r < 0) + return r; + + return 
hashmap_replace(nl->broadcast_group_refs, UINT_TO_PTR(group), UINT_TO_PTR(n_ref)); +} + +static int broadcast_group_join(sd_netlink *nl, unsigned group) { + assert(nl); + assert(nl->fd >= 0); + assert(group > 0); + + /* group is "unsigned", but netlink(7) says the argument for NETLINK_ADD_MEMBERSHIP is "int" */ + return setsockopt_int(nl->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, group); +} + +int socket_broadcast_group_ref(sd_netlink *nl, unsigned group) { + unsigned n_ref; + int r; + + assert(nl); + + n_ref = broadcast_group_get_ref(nl, group); + + n_ref++; + + r = broadcast_group_set_ref(nl, group, n_ref); + if (r < 0) + return r; + + if (n_ref > 1) + /* already in the group */ + return 0; + + return broadcast_group_join(nl, group); +} + +static int broadcast_group_leave(sd_netlink *nl, unsigned group) { + assert(nl); + assert(nl->fd >= 0); + assert(group > 0); + + if (nl->broadcast_group_dont_leave) + return 0; + + /* group is "unsigned", but netlink(7) says the argument for NETLINK_DROP_MEMBERSHIP is "int" */ + return setsockopt_int(nl->fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP, group); +} + +int socket_broadcast_group_unref(sd_netlink *nl, unsigned group) { + unsigned n_ref; + int r; + + assert(nl); + + n_ref = broadcast_group_get_ref(nl, group); + if (n_ref == 0) + return 0; + + n_ref--; + + r = broadcast_group_set_ref(nl, group, n_ref); + if (r < 0) + return r; + + if (n_ref > 0) + /* still refs left */ + return 0; + + return broadcast_group_leave(nl, group); +} + +/* returns the number of bytes sent, or a negative error code */ +int socket_write_message(sd_netlink *nl, sd_netlink_message *m) { + union sockaddr_union addr = { + .nl.nl_family = AF_NETLINK, + }; + ssize_t k; + + assert(nl); + assert(m); + assert(m->hdr); + + k = sendto(nl->fd, m->hdr, m->hdr->nlmsg_len, 0, &addr.sa, sizeof(addr)); + if (k < 0) + return -errno; + + return k; +} + +static int socket_recv_message(int fd, void *buf, size_t buf_size, uint32_t *ret_mcast_group, bool peek) { + struct 
iovec iov = IOVEC_MAKE(buf, buf_size); + union sockaddr_union sender; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct nl_pktinfo))) control; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_name = &sender, + .msg_namelen = sizeof(sender), + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + ssize_t n; + + assert(fd >= 0); + assert(peek || (buf && buf_size > 0)); + + n = recvmsg_safe(fd, &msg, MSG_TRUNC | (peek ? MSG_PEEK : 0)); + if (n == -ENOBUFS) + return log_debug_errno(n, "sd-netlink: kernel receive buffer overrun"); + else if (ERRNO_IS_NEG_TRANSIENT(n)) { + if (ret_mcast_group) + *ret_mcast_group = 0; + return 0; + } else if (n < 0) + return (int) n; + + if (sender.nl.nl_pid != 0) { + /* not from the kernel, ignore */ + log_debug("sd-netlink: ignoring message from PID %"PRIu32, sender.nl.nl_pid); + + if (peek) { + /* drop the message */ + n = recvmsg_safe(fd, &msg, 0); + if (n < 0) + return (int) n; + } + + if (ret_mcast_group) + *ret_mcast_group = 0; + return 0; + } + + if (!peek && (size_t) n > buf_size) /* message did not fit in read buffer */ + return -EIO; + + if (ret_mcast_group) { + struct nl_pktinfo *pi; + + pi = CMSG_FIND_DATA(&msg, SOL_NETLINK, NETLINK_PKTINFO, struct nl_pktinfo); + if (pi) + *ret_mcast_group = pi->group; + else + *ret_mcast_group = 0; + } + + return (int) n; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + netlink_message_hash_ops, + void, trivial_hash_func, trivial_compare_func, + sd_netlink_message, sd_netlink_message_unref); + +static int netlink_queue_received_message(sd_netlink *nl, sd_netlink_message *m) { + uint32_t serial; + int r; + + assert(nl); + assert(m); + + if (ordered_set_size(nl->rqueue) >= NETLINK_RQUEUE_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS), + "sd-netlink: exhausted the read queue size (%d)", NETLINK_RQUEUE_MAX); + + r = ordered_set_ensure_put(&nl->rqueue, &netlink_message_hash_ops, m); + if (r < 0) + return r; + + sd_netlink_message_ref(m); + + if 
(sd_netlink_message_is_broadcast(m)) + return 0; + + serial = message_get_serial(m); + if (serial == 0) + return 0; + + if (sd_netlink_message_get_errno(m) < 0) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *old = NULL; + + old = hashmap_remove(nl->rqueue_by_serial, UINT32_TO_PTR(serial)); + if (old) + log_debug("sd-netlink: received error message with serial %"PRIu32", but another message with " + "the same serial is already stored in the read queue, replacing.", serial); + } + + r = hashmap_ensure_put(&nl->rqueue_by_serial, &netlink_message_hash_ops, UINT32_TO_PTR(serial), m); + if (r == -EEXIST) { + if (!sd_netlink_message_is_error(m)) + log_debug("sd-netlink: received message with serial %"PRIu32", but another message with " + "the same serial is already stored in the read queue, ignoring.", serial); + return 0; + } + if (r < 0) { + sd_netlink_message_unref(ordered_set_remove(nl->rqueue, m)); + return r; + } + + sd_netlink_message_ref(m); + return 0; +} + +static int netlink_queue_partially_received_message(sd_netlink *nl, sd_netlink_message *m) { + uint32_t serial; + int r; + + assert(nl); + assert(m); + assert(m->hdr->nlmsg_flags & NLM_F_MULTI); + + if (hashmap_size(nl->rqueue_partial_by_serial) >= NETLINK_RQUEUE_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS), + "sd-netlink: exhausted the partial read queue size (%d)", NETLINK_RQUEUE_MAX); + + serial = message_get_serial(m); + r = hashmap_ensure_put(&nl->rqueue_partial_by_serial, &netlink_message_hash_ops, UINT32_TO_PTR(serial), m); + if (r < 0) + return r; + + sd_netlink_message_ref(m); + return 0; +} + +static int parse_message_one(sd_netlink *nl, uint32_t group, const struct nlmsghdr *hdr, sd_netlink_message **ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + size_t size; + int r; + + assert(nl); + assert(hdr); + assert(ret); + + /* not broadcast and not for us */ + if (group == 0 && hdr->nlmsg_pid != nl->sockaddr.nl.nl_pid) + goto finalize; + + /* silently 
drop noop messages */ + if (hdr->nlmsg_type == NLMSG_NOOP) + goto finalize; + + /* check that we support this message type */ + r = netlink_get_policy_set_and_header_size(nl, hdr->nlmsg_type, NULL, &size); + if (r == -EOPNOTSUPP) { + log_debug("sd-netlink: ignored message with unknown type: %i", hdr->nlmsg_type); + goto finalize; + } + if (r < 0) + return r; + + /* check that the size matches the message type */ + if (hdr->nlmsg_len < NLMSG_LENGTH(size)) { + log_debug("sd-netlink: message is shorter than expected, dropping."); + goto finalize; + } + + r = message_new_empty(nl, &m); + if (r < 0) + return r; + + m->multicast_group = group; + m->hdr = memdup(hdr, hdr->nlmsg_len); + if (!m->hdr) + return -ENOMEM; + + /* seal and parse the top-level message */ + r = sd_netlink_message_rewind(m, nl); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 1; + +finalize: + *ret = NULL; + return 0; +} + +/* On success, the number of bytes received is returned and *ret points to the received message + * which has a valid header and the correct size. + * If nothing useful was received 0 is returned. + * On failure, a negative error code is returned. 
+ */ +int socket_read_message(sd_netlink *nl) { + bool done = false; + uint32_t group; + size_t len; + int r; + + assert(nl); + + /* read nothing, just get the pending message size */ + r = socket_recv_message(nl->fd, NULL, 0, NULL, true); + if (r <= 0) + return r; + len = (size_t) r; + + /* make room for the pending message */ + if (!greedy_realloc((void**) &nl->rbuffer, len, sizeof(uint8_t))) + return -ENOMEM; + + /* read the pending message */ + r = socket_recv_message(nl->fd, nl->rbuffer, MALLOC_SIZEOF_SAFE(nl->rbuffer), &group, false); + if (r <= 0) + return r; + len = (size_t) r; + + if (!NLMSG_OK(nl->rbuffer, len)) { + log_debug("sd-netlink: received invalid message, discarding %zu bytes of incoming message", len); + return 0; + } + + for (struct nlmsghdr *hdr = nl->rbuffer; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + + r = parse_message_one(nl, group, hdr, &m); + if (r < 0) + return r; + if (r == 0) + continue; + + if (hdr->nlmsg_flags & NLM_F_MULTI) { + if (hdr->nlmsg_type == NLMSG_DONE) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *existing = NULL; + + /* finished reading multi-part message */ + existing = hashmap_remove(nl->rqueue_partial_by_serial, UINT32_TO_PTR(hdr->nlmsg_seq)); + + /* if we receive only NLMSG_DONE, put it into the receive queue. */ + r = netlink_queue_received_message(nl, existing ?: m); + if (r < 0) + return r; + + done = true; + } else { + sd_netlink_message *existing; + + existing = hashmap_get(nl->rqueue_partial_by_serial, UINT32_TO_PTR(hdr->nlmsg_seq)); + if (existing) { + /* This is the continuation of the previously read messages. + * Let's append this message at the end. */ + while (existing->next) + existing = existing->next; + existing->next = TAKE_PTR(m); + } else { + /* This is the first message. Put it into the queue for partially + * received messages. 
*/ + r = netlink_queue_partially_received_message(nl, m); + if (r < 0) + return r; + } + } + + } else { + r = netlink_queue_received_message(nl, m); + if (r < 0) + return r; + + done = true; + } + } + + if (len > 0) + log_debug("sd-netlink: discarding trailing %zu bytes of incoming message", len); + + return done; +} diff --git a/src/libsystemd/sd-netlink/netlink-types-genl.c b/src/libsystemd/sd-netlink/netlink-types-genl.c new file mode 100644 index 0000000..6fe9adc --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-types-genl.c @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "missing_network.h" +#include "netlink-genl.h" +#include "netlink-types-internal.h" + +/***************** genl ctrl type systems *****************/ +static const NLAPolicy genl_ctrl_mcast_group_policies[] = { + [CTRL_ATTR_MCAST_GRP_NAME] = BUILD_POLICY(STRING), + [CTRL_ATTR_MCAST_GRP_ID] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(genl_ctrl_mcast_group); + +static const NLAPolicy genl_ctrl_ops_policies[] = { + [CTRL_ATTR_OP_ID] = BUILD_POLICY(U32), + [CTRL_ATTR_OP_FLAGS] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(genl_ctrl_ops); + +static const NLAPolicy genl_ctrl_policies[] = { + [CTRL_ATTR_FAMILY_ID] = BUILD_POLICY(U16), + [CTRL_ATTR_FAMILY_NAME] = BUILD_POLICY(STRING), + [CTRL_ATTR_VERSION] = BUILD_POLICY(U32), + [CTRL_ATTR_HDRSIZE] = BUILD_POLICY(U32), + [CTRL_ATTR_MAXATTR] = BUILD_POLICY(U32), + [CTRL_ATTR_OPS] = BUILD_POLICY_NESTED(genl_ctrl_ops), + [CTRL_ATTR_MCAST_GROUPS] = BUILD_POLICY_NESTED(genl_ctrl_mcast_group), + /* + [CTRL_ATTR_POLICY] = { .type = NETLINK_TYPE_NESTED, }, + [CTRL_ATTR_OP_POLICY] = { .type = NETLINK_TYPE_NESTED, } + */ + [CTRL_ATTR_OP] = BUILD_POLICY(U32), +}; + +/***************** genl batadv type systems *****************/ +static const NLAPolicy genl_batadv_policies[] = { + [BATADV_ATTR_VERSION] = BUILD_POLICY(STRING), + 
[BATADV_ATTR_ALGO_NAME] = BUILD_POLICY(STRING), + [BATADV_ATTR_MESH_IFINDEX] = BUILD_POLICY(U32), + [BATADV_ATTR_MESH_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ), + [BATADV_ATTR_MESH_ADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_HARD_IFINDEX] = BUILD_POLICY(U32), + [BATADV_ATTR_HARD_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ), + [BATADV_ATTR_HARD_ADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_ORIG_ADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_TPMETER_RESULT] = BUILD_POLICY(U8), + [BATADV_ATTR_TPMETER_TEST_TIME] = BUILD_POLICY(U32), + [BATADV_ATTR_TPMETER_BYTES] = BUILD_POLICY(U64), + [BATADV_ATTR_TPMETER_COOKIE] = BUILD_POLICY(U32), + [BATADV_ATTR_PAD] = BUILD_POLICY(UNSPEC), + [BATADV_ATTR_ACTIVE] = BUILD_POLICY(FLAG), + [BATADV_ATTR_TT_ADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_TT_TTVN] = BUILD_POLICY(U8), + [BATADV_ATTR_TT_LAST_TTVN] = BUILD_POLICY(U8), + [BATADV_ATTR_TT_CRC32] = BUILD_POLICY(U32), + [BATADV_ATTR_TT_VID] = BUILD_POLICY(U16), + [BATADV_ATTR_TT_FLAGS] = BUILD_POLICY(U32), + [BATADV_ATTR_FLAG_BEST] = BUILD_POLICY(FLAG), + [BATADV_ATTR_LAST_SEEN_MSECS] = BUILD_POLICY(U32), + [BATADV_ATTR_NEIGH_ADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_TQ] = BUILD_POLICY(U8), + [BATADV_ATTR_THROUGHPUT] = BUILD_POLICY(U32), + [BATADV_ATTR_BANDWIDTH_UP] = BUILD_POLICY(U32), + [BATADV_ATTR_BANDWIDTH_DOWN] = BUILD_POLICY(U32), + [BATADV_ATTR_ROUTER] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_BLA_OWN] = BUILD_POLICY(FLAG), + [BATADV_ATTR_BLA_ADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_BLA_VID] = BUILD_POLICY(U16), + [BATADV_ATTR_BLA_BACKBONE] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [BATADV_ATTR_BLA_CRC] = BUILD_POLICY(U16), + [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = BUILD_POLICY(U32), + [BATADV_ATTR_DAT_CACHE_HWADDRESS] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + 
[BATADV_ATTR_DAT_CACHE_VID] = BUILD_POLICY(U16), + [BATADV_ATTR_MCAST_FLAGS] = BUILD_POLICY(U32), + [BATADV_ATTR_MCAST_FLAGS_PRIV] = BUILD_POLICY(U32), + [BATADV_ATTR_VLANID] = BUILD_POLICY(U16), + [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_AP_ISOLATION_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_ISOLATION_MARK] = BUILD_POLICY(U32), + [BATADV_ATTR_ISOLATION_MASK] = BUILD_POLICY(U32), + [BATADV_ATTR_BONDING_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_FRAGMENTATION_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_GW_BANDWIDTH_DOWN] = BUILD_POLICY(U32), + [BATADV_ATTR_GW_BANDWIDTH_UP] = BUILD_POLICY(U32), + [BATADV_ATTR_GW_MODE] = BUILD_POLICY(U8), + [BATADV_ATTR_GW_SEL_CLASS] = BUILD_POLICY(U32), + [BATADV_ATTR_HOP_PENALTY] = BUILD_POLICY(U8), + [BATADV_ATTR_LOG_LEVEL] = BUILD_POLICY(U32), + [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_MULTICAST_FANOUT] = BUILD_POLICY(U32), + [BATADV_ATTR_NETWORK_CODING_ENABLED] = BUILD_POLICY(U8), + [BATADV_ATTR_ORIG_INTERVAL] = BUILD_POLICY(U32), + [BATADV_ATTR_ELP_INTERVAL] = BUILD_POLICY(U32), + [BATADV_ATTR_THROUGHPUT_OVERRIDE] = BUILD_POLICY(U32), +}; + +/***************** genl fou type systems *****************/ +static const NLAPolicy genl_fou_policies[] = { + [FOU_ATTR_PORT] = BUILD_POLICY(U16), + [FOU_ATTR_AF] = BUILD_POLICY(U8), + [FOU_ATTR_IPPROTO] = BUILD_POLICY(U8), + [FOU_ATTR_TYPE] = BUILD_POLICY(U8), + [FOU_ATTR_REMCSUM_NOPARTIAL] = BUILD_POLICY(FLAG), + [FOU_ATTR_LOCAL_V4] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [FOU_ATTR_PEER_V4] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [FOU_ATTR_LOCAL_V6] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [FOU_ATTR_PEER_V6] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [FOU_ATTR_PEER_PORT] = BUILD_POLICY(U16), + 
[FOU_ATTR_IFINDEX] = BUILD_POLICY(U32), +}; + +/***************** genl l2tp type systems *****************/ +static const NLAPolicy genl_l2tp_policies[] = { + [L2TP_ATTR_PW_TYPE] = BUILD_POLICY(U16), + [L2TP_ATTR_ENCAP_TYPE] = BUILD_POLICY(U16), + [L2TP_ATTR_OFFSET] = BUILD_POLICY(U16), + [L2TP_ATTR_DATA_SEQ] = BUILD_POLICY(U16), + [L2TP_ATTR_L2SPEC_TYPE] = BUILD_POLICY(U8), + [L2TP_ATTR_L2SPEC_LEN] = BUILD_POLICY(U8), + [L2TP_ATTR_PROTO_VERSION] = BUILD_POLICY(U8), + [L2TP_ATTR_IFNAME] = BUILD_POLICY(STRING), + [L2TP_ATTR_CONN_ID] = BUILD_POLICY(U32), + [L2TP_ATTR_PEER_CONN_ID] = BUILD_POLICY(U32), + [L2TP_ATTR_SESSION_ID] = BUILD_POLICY(U32), + [L2TP_ATTR_PEER_SESSION_ID] = BUILD_POLICY(U32), + [L2TP_ATTR_UDP_CSUM] = BUILD_POLICY(U8), + [L2TP_ATTR_VLAN_ID] = BUILD_POLICY(U16), + [L2TP_ATTR_RECV_SEQ] = BUILD_POLICY(U8), + [L2TP_ATTR_SEND_SEQ] = BUILD_POLICY(U8), + [L2TP_ATTR_LNS_MODE] = BUILD_POLICY(U8), + [L2TP_ATTR_USING_IPSEC] = BUILD_POLICY(U8), + [L2TP_ATTR_FD] = BUILD_POLICY(U32), + [L2TP_ATTR_IP_SADDR] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [L2TP_ATTR_IP_DADDR] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [L2TP_ATTR_UDP_SPORT] = BUILD_POLICY(U16), + [L2TP_ATTR_UDP_DPORT] = BUILD_POLICY(U16), + [L2TP_ATTR_IP6_SADDR] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [L2TP_ATTR_IP6_DADDR] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [L2TP_ATTR_UDP_ZERO_CSUM6_TX] = BUILD_POLICY(FLAG), + [L2TP_ATTR_UDP_ZERO_CSUM6_RX] = BUILD_POLICY(FLAG), +}; + +/***************** genl macsec type systems *****************/ +static const NLAPolicy genl_macsec_rxsc_policies[] = { + [MACSEC_RXSC_ATTR_SCI] = BUILD_POLICY(U64), +}; + +DEFINE_POLICY_SET(genl_macsec_rxsc); + +static const NLAPolicy genl_macsec_sa_policies[] = { + [MACSEC_SA_ATTR_AN] = BUILD_POLICY(U8), + [MACSEC_SA_ATTR_ACTIVE] = BUILD_POLICY(U8), + [MACSEC_SA_ATTR_PN] = BUILD_POLICY(U32), + [MACSEC_SA_ATTR_KEYID] = BUILD_POLICY_WITH_SIZE(BINARY, 
MACSEC_KEYID_LEN), + [MACSEC_SA_ATTR_KEY] = BUILD_POLICY_WITH_SIZE(BINARY, MACSEC_MAX_KEY_LEN), +}; + +DEFINE_POLICY_SET(genl_macsec_sa); + +static const NLAPolicy genl_macsec_policies[] = { + [MACSEC_ATTR_IFINDEX] = BUILD_POLICY(U32), + [MACSEC_ATTR_RXSC_CONFIG] = BUILD_POLICY_NESTED(genl_macsec_rxsc), + [MACSEC_ATTR_SA_CONFIG] = BUILD_POLICY_NESTED(genl_macsec_sa), +}; + +/***************** genl NetLabel type systems *****************/ +static const NLAPolicy genl_netlabel_policies[] = { + [NLBL_UNLABEL_A_IPV4ADDR] = BUILD_POLICY(IN_ADDR), + [NLBL_UNLABEL_A_IPV4MASK] = BUILD_POLICY(IN_ADDR), + [NLBL_UNLABEL_A_IPV6ADDR] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [NLBL_UNLABEL_A_IPV6MASK] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [NLBL_UNLABEL_A_IFACE] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ-1), + [NLBL_UNLABEL_A_SECCTX] = BUILD_POLICY(STRING), +}; + +/***************** genl nl80211 type systems *****************/ +static const NLAPolicy genl_nl80211_policies[] = { + [NL80211_ATTR_WIPHY] = BUILD_POLICY(U32), + [NL80211_ATTR_WIPHY_NAME] = BUILD_POLICY(STRING), + [NL80211_ATTR_IFINDEX] = BUILD_POLICY(U32), + [NL80211_ATTR_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ-1), + [NL80211_ATTR_IFTYPE] = BUILD_POLICY(U32), + [NL80211_ATTR_MAC] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [NL80211_ATTR_SSID] = BUILD_POLICY_WITH_SIZE(BINARY, IEEE80211_MAX_SSID_LEN), + [NL80211_ATTR_STATUS_CODE] = BUILD_POLICY(U16), + [NL80211_ATTR_4ADDR] = BUILD_POLICY(U8), +}; + +/***************** genl wireguard type systems *****************/ +static const NLAPolicy genl_wireguard_allowedip_policies[] = { + [WGALLOWEDIP_A_FAMILY] = BUILD_POLICY(U16), + [WGALLOWEDIP_A_IPADDR] = BUILD_POLICY(IN_ADDR), + [WGALLOWEDIP_A_CIDR_MASK] = BUILD_POLICY(U8), +}; + +DEFINE_POLICY_SET(genl_wireguard_allowedip); + +static const NLAPolicy genl_wireguard_peer_policies[] = { + [WGPEER_A_PUBLIC_KEY] = BUILD_POLICY_WITH_SIZE(BINARY, WG_KEY_LEN), + 
[WGPEER_A_FLAGS] = BUILD_POLICY(U32), + [WGPEER_A_PRESHARED_KEY] = BUILD_POLICY_WITH_SIZE(BINARY, WG_KEY_LEN), + [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL] = BUILD_POLICY(U16), + [WGPEER_A_ENDPOINT] = BUILD_POLICY(SOCKADDR), + [WGPEER_A_ALLOWEDIPS] = BUILD_POLICY_NESTED(genl_wireguard_allowedip), +}; + +DEFINE_POLICY_SET(genl_wireguard_peer); + +static const NLAPolicy genl_wireguard_policies[] = { + [WGDEVICE_A_IFINDEX] = BUILD_POLICY(U32), + [WGDEVICE_A_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ-1), + [WGDEVICE_A_FLAGS] = BUILD_POLICY(U32), + [WGDEVICE_A_PRIVATE_KEY] = BUILD_POLICY_WITH_SIZE(BINARY, WG_KEY_LEN), + [WGDEVICE_A_LISTEN_PORT] = BUILD_POLICY(U16), + [WGDEVICE_A_FWMARK] = BUILD_POLICY(U32), + [WGDEVICE_A_PEERS] = BUILD_POLICY_NESTED(genl_wireguard_peer), +}; + +/***************** genl families *****************/ +static const NLAPolicySetUnionElement genl_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_STRING(CTRL_GENL_NAME, genl_ctrl), + BUILD_UNION_ELEMENT_BY_STRING(BATADV_NL_NAME, genl_batadv), + BUILD_UNION_ELEMENT_BY_STRING(FOU_GENL_NAME, genl_fou), + BUILD_UNION_ELEMENT_BY_STRING(L2TP_GENL_NAME, genl_l2tp), + BUILD_UNION_ELEMENT_BY_STRING(MACSEC_GENL_NAME, genl_macsec), + BUILD_UNION_ELEMENT_BY_STRING(NETLBL_NLTYPE_UNLABELED_NAME, genl_netlabel), + BUILD_UNION_ELEMENT_BY_STRING(NL80211_GENL_NAME, genl_nl80211), + BUILD_UNION_ELEMENT_BY_STRING(WG_GENL_NAME, genl_wireguard), +}; + +/* This is the root type system union, so match_attribute is not necessary. 
/* Initializer for a plain attribute policy: 't' is the NETLINK_TYPE_ suffix, 'n' the size field. */
#define BUILD_POLICY_WITH_SIZE(t, n)                                    \
        {                                                               \
                .type = NETLINK_TYPE_##t,                               \
                .size = (n),                                            \
        }

/* Same as BUILD_POLICY_WITH_SIZE(), with the size field set to 0. */
#define BUILD_POLICY(t)                                                 \
        BUILD_POLICY_WITH_SIZE(t, 0)

/* Initializer for a nested attribute whose contents are described by <name>_policy_set. */
#define BUILD_POLICY_NESTED_WITH_SIZE(name, n)                          \
        {                                                               \
                .type = NETLINK_TYPE_NESTED,                            \
                .size = (n),                                            \
                .policy_set = &name##_policy_set,                       \
        }

#define BUILD_POLICY_NESTED(name)                                       \
        BUILD_POLICY_NESTED_WITH_SIZE(name, 0)

/* Initializer for a nested attribute whose policy set is selected from <name>_policy_set_union;
 * 'by' is either STRING or FAMILY. */
#define _BUILD_POLICY_NESTED_UNION(name, by)                            \
        {                                                               \
                .type = NETLINK_TYPE_NESTED_UNION_BY_##by,              \
                .policy_set_union = &name##_policy_set_union,           \
        }

#define BUILD_POLICY_NESTED_UNION_BY_STRING(name)                       \
        _BUILD_POLICY_NESTED_UNION(name, STRING)

#define BUILD_POLICY_NESTED_UNION_BY_FAMILY(name)                       \
        _BUILD_POLICY_NESTED_UNION(name, FAMILY)

/* Initializer for an NLAPolicySet that wraps the array <name>_policies. */
#define _BUILD_POLICY_SET(name)                                         \
        {                                                               \
                .count = ELEMENTSOF(name##_policies),                   \
                .policies = name##_policies,                            \
        }

/* Defines the file-local policy set <name>_policy_set for the array <name>_policies. */
#define DEFINE_POLICY_SET(name)                                         \
        static const NLAPolicySet name##_policy_set = _BUILD_POLICY_SET(name)

/* Initializers for one element of a policy set union, keyed by string or by family. */
#define BUILD_UNION_ELEMENT_BY_STRING(s, name)                          \
        {                                                               \
                .string = (s),                                          \
                .policy_set = _BUILD_POLICY_SET(name),                  \
        }

#define BUILD_UNION_ELEMENT_BY_FAMILY(f, name)                          \
        {                                                               \
                .family = (f),                                          \
                .policy_set = _BUILD_POLICY_SET(name),                  \
        }

/* Defines the file-local union <name>_policy_set_union over the array
 * <name>_policy_set_union_elements; 'attr' is stored as the match attribute. */
#define DEFINE_POLICY_SET_UNION(name, attr)                                     \
        static const NLAPolicySetUnion name##_policy_set_union = {              \
                .count = ELEMENTSOF(name##_policy_set_union_elements),          \
                .elements = name##_policy_set_union_elements,                   \
                .match_attribute = (attr),                                      \
        }
BUILD_POLICY(U32), + [NFTA_META_KEY] = BUILD_POLICY(U32), + [NFTA_META_SREG] = BUILD_POLICY(U32), +}; + +static const NLAPolicy nfnl_nft_expr_payload_policies[] = { + [NFTA_PAYLOAD_DREG] = BUILD_POLICY(U32), + [NFTA_PAYLOAD_BASE] = BUILD_POLICY(U32), + [NFTA_PAYLOAD_OFFSET] = BUILD_POLICY(U32), + [NFTA_PAYLOAD_LEN] = BUILD_POLICY(U32), +}; + +static const NLAPolicy nfnl_nft_expr_nat_policies[] = { + [NFTA_NAT_TYPE] = BUILD_POLICY(U32), + [NFTA_NAT_FAMILY] = BUILD_POLICY(U32), + [NFTA_NAT_REG_ADDR_MIN] = BUILD_POLICY(U32), + [NFTA_NAT_REG_ADDR_MAX] = BUILD_POLICY(U32), + [NFTA_NAT_REG_PROTO_MIN] = BUILD_POLICY(U32), + [NFTA_NAT_REG_PROTO_MAX] = BUILD_POLICY(U32), + [NFTA_NAT_FLAGS] = BUILD_POLICY(U32), +}; + +static const NLAPolicy nfnl_nft_data_policies[] = { + [NFTA_DATA_VALUE] = { .type = NETLINK_TYPE_BINARY }, +}; + +DEFINE_POLICY_SET(nfnl_nft_data); + +static const NLAPolicy nfnl_nft_expr_bitwise_policies[] = { + [NFTA_BITWISE_SREG] = BUILD_POLICY(U32), + [NFTA_BITWISE_DREG] = BUILD_POLICY(U32), + [NFTA_BITWISE_LEN] = BUILD_POLICY(U32), + [NFTA_BITWISE_MASK] = BUILD_POLICY_NESTED(nfnl_nft_data), + [NFTA_BITWISE_XOR] = BUILD_POLICY_NESTED(nfnl_nft_data), +}; + +static const NLAPolicy nfnl_nft_expr_cmp_policies[] = { + [NFTA_CMP_SREG] = BUILD_POLICY(U32), + [NFTA_CMP_OP] = BUILD_POLICY(U32), + [NFTA_CMP_DATA] = BUILD_POLICY_NESTED(nfnl_nft_data), +}; + +static const NLAPolicy nfnl_nft_expr_fib_policies[] = { + [NFTA_FIB_DREG] = BUILD_POLICY(U32), + [NFTA_FIB_RESULT] = BUILD_POLICY(U32), + [NFTA_FIB_FLAGS] = BUILD_POLICY(U32), +}; + +static const NLAPolicy nfnl_nft_expr_lookup_policies[] = { + [NFTA_LOOKUP_SET] = { .type = NETLINK_TYPE_STRING }, + [NFTA_LOOKUP_SREG] = BUILD_POLICY(U32), + [NFTA_LOOKUP_DREG] = BUILD_POLICY(U32), + [NFTA_LOOKUP_FLAGS] = BUILD_POLICY(U32), +}; + +static const NLAPolicy nfnl_nft_expr_masq_policies[] = { + [NFTA_MASQ_FLAGS] = BUILD_POLICY(U32), + [NFTA_MASQ_REG_PROTO_MIN] = BUILD_POLICY(U32), + [NFTA_MASQ_REG_PROTO_MAX] = 
BUILD_POLICY(U32), +}; + +static const NLAPolicySetUnionElement nfnl_expr_data_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_STRING("bitwise", nfnl_nft_expr_bitwise), + BUILD_UNION_ELEMENT_BY_STRING("cmp", nfnl_nft_expr_cmp), + BUILD_UNION_ELEMENT_BY_STRING("fib", nfnl_nft_expr_fib), + BUILD_UNION_ELEMENT_BY_STRING("lookup", nfnl_nft_expr_lookup), + BUILD_UNION_ELEMENT_BY_STRING("masq", nfnl_nft_expr_masq), + BUILD_UNION_ELEMENT_BY_STRING("meta", nfnl_nft_expr_meta), + BUILD_UNION_ELEMENT_BY_STRING("nat", nfnl_nft_expr_nat), + BUILD_UNION_ELEMENT_BY_STRING("payload", nfnl_nft_expr_payload), +}; + +DEFINE_POLICY_SET_UNION(nfnl_expr_data, NFTA_EXPR_NAME); + +static const NLAPolicy nfnl_nft_rule_expr_policies[] = { + [NFTA_EXPR_NAME] = BUILD_POLICY_WITH_SIZE(STRING, 16), + [NFTA_EXPR_DATA] = BUILD_POLICY_NESTED_UNION_BY_STRING(nfnl_expr_data), +}; + +DEFINE_POLICY_SET(nfnl_nft_rule_expr); + +static const NLAPolicy nfnl_nft_rule_policies[] = { + [NFTA_RULE_TABLE] = BUILD_POLICY_WITH_SIZE(STRING, NFT_TABLE_MAXNAMELEN - 1), + [NFTA_RULE_CHAIN] = BUILD_POLICY_WITH_SIZE(STRING, NFT_TABLE_MAXNAMELEN - 1), + [NFTA_RULE_EXPRESSIONS] = BUILD_POLICY_NESTED(nfnl_nft_rule_expr), +}; + +DEFINE_POLICY_SET(nfnl_nft_rule); + +static const NLAPolicy nfnl_nft_set_policies[] = { + [NFTA_SET_TABLE] = BUILD_POLICY_WITH_SIZE(STRING, NFT_TABLE_MAXNAMELEN - 1), + [NFTA_SET_NAME] = BUILD_POLICY_WITH_SIZE(STRING, NFT_TABLE_MAXNAMELEN - 1), + [NFTA_SET_FLAGS] = BUILD_POLICY(U32), + [NFTA_SET_KEY_TYPE] = BUILD_POLICY(U32), + [NFTA_SET_KEY_LEN] = BUILD_POLICY(U32), + [NFTA_SET_DATA_TYPE] = BUILD_POLICY(U32), + [NFTA_SET_DATA_LEN] = BUILD_POLICY(U32), + [NFTA_SET_POLICY] = BUILD_POLICY(U32), + [NFTA_SET_ID] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(nfnl_nft_set); + +static const NLAPolicy nfnl_nft_setelem_policies[] = { + [NFTA_SET_ELEM_KEY] = BUILD_POLICY_NESTED(nfnl_nft_data), + [NFTA_SET_ELEM_DATA] = BUILD_POLICY_NESTED(nfnl_nft_data), + [NFTA_SET_ELEM_FLAGS] = BUILD_POLICY(U32), 
+}; + +DEFINE_POLICY_SET(nfnl_nft_setelem); + +static const NLAPolicy nfnl_nft_setelem_list_policies[] = { + [NFTA_SET_ELEM_LIST_TABLE] = BUILD_POLICY_WITH_SIZE(STRING, NFT_TABLE_MAXNAMELEN - 1), + [NFTA_SET_ELEM_LIST_SET] = BUILD_POLICY_WITH_SIZE(STRING, NFT_TABLE_MAXNAMELEN - 1), + [NFTA_SET_ELEM_LIST_ELEMENTS] = BUILD_POLICY_NESTED(nfnl_nft_setelem), +}; + +DEFINE_POLICY_SET(nfnl_nft_setelem_list); + +static const NLAPolicy nfnl_subsys_nft_policies[] = { + [NFT_MSG_DELTABLE] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_table, sizeof(struct nfgenmsg)), + [NFT_MSG_NEWTABLE] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_table, sizeof(struct nfgenmsg)), + [NFT_MSG_NEWCHAIN] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_chain, sizeof(struct nfgenmsg)), + [NFT_MSG_NEWRULE] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_rule, sizeof(struct nfgenmsg)), + [NFT_MSG_NEWSET] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_set, sizeof(struct nfgenmsg)), + [NFT_MSG_NEWSETELEM] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_setelem_list, sizeof(struct nfgenmsg)), + [NFT_MSG_DELSETELEM] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_nft_setelem_list, sizeof(struct nfgenmsg)), +}; + +DEFINE_POLICY_SET(nfnl_subsys_nft); + +static const NLAPolicy nfnl_msg_batch_policies[] = { + [NFNL_BATCH_GENID] = BUILD_POLICY(U32) +}; + +DEFINE_POLICY_SET(nfnl_msg_batch); + +static const NLAPolicy nfnl_subsys_none_policies[] = { + [NFNL_MSG_BATCH_BEGIN] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_msg_batch, sizeof(struct nfgenmsg)), + [NFNL_MSG_BATCH_END] = BUILD_POLICY_NESTED_WITH_SIZE(nfnl_msg_batch, sizeof(struct nfgenmsg)), +}; + +DEFINE_POLICY_SET(nfnl_subsys_none); + +static const NLAPolicy nfnl_policies[] = { + [NFNL_SUBSYS_NONE] = BUILD_POLICY_NESTED(nfnl_subsys_none), + [NFNL_SUBSYS_NFTABLES] = BUILD_POLICY_NESTED(nfnl_subsys_nft), +}; + +DEFINE_POLICY_SET(nfnl); + +const NLAPolicy *nfnl_get_policy(uint16_t nlmsg_type) { + const NLAPolicySet *subsys; + + subsys = policy_set_get_policy_set(&nfnl_policy_set, NFNL_SUBSYS_ID(nlmsg_type)); 
+ if (!subsys) + return NULL; + + return policy_set_get_policy(subsys, NFNL_MSG_TYPE(nlmsg_type)); +} diff --git a/src/libsystemd/sd-netlink/netlink-types-rtnl.c b/src/libsystemd/sd-netlink/netlink-types-rtnl.c new file mode 100644 index 0000000..0153456 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-types-rtnl.c @@ -0,0 +1,1229 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "missing_network.h" +#include "netlink-types-internal.h" + +enum { + BOND_ARP_TARGETS_0, + BOND_ARP_TARGETS_1, + BOND_ARP_TARGETS_2, + BOND_ARP_TARGETS_3, + BOND_ARP_TARGETS_4, + BOND_ARP_TARGETS_5, + BOND_ARP_TARGETS_6, + BOND_ARP_TARGETS_7, + BOND_ARP_TARGETS_8, + BOND_ARP_TARGETS_9, + BOND_ARP_TARGETS_10, + BOND_ARP_TARGETS_11, + BOND_ARP_TARGETS_12, + BOND_ARP_TARGETS_13, + BOND_ARP_TARGETS_14, + BOND_ARP_TARGETS_15, + _BOND_ARP_TARGETS_MAX, +}; + +assert_cc(_BOND_ARP_TARGETS_MAX == BOND_MAX_ARP_TARGETS); + +static const NLAPolicySet rtnl_link_policy_set; + +static const NLAPolicy rtnl_link_info_data_bareudp_policies[] = { + [IFLA_BAREUDP_PORT] = BUILD_POLICY(U16), + [IFLA_BAREUDP_ETHERTYPE] = BUILD_POLICY(U16), + [IFLA_BAREUDP_SRCPORT_MIN] = BUILD_POLICY(U16), + [IFLA_BAREUDP_MULTIPROTO_MODE] = BUILD_POLICY(FLAG), +}; + +static const NLAPolicy rtnl_link_info_data_batadv_policies[] = { + [IFLA_BATADV_ALGO_NAME] = BUILD_POLICY_WITH_SIZE(STRING, 20), +}; + +static const NLAPolicy rtnl_bond_arp_ip_target_policies[] = { + [BOND_ARP_TARGETS_0] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_1] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_2] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_3] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_4] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_5] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_6] = BUILD_POLICY(U32), + 
[BOND_ARP_TARGETS_7] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_8] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_9] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_10] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_11] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_12] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_13] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_14] = BUILD_POLICY(U32), + [BOND_ARP_TARGETS_15] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bond_arp_ip_target); + +static const NLAPolicy rtnl_bond_ad_info_policies[] = { + [IFLA_BOND_AD_INFO_AGGREGATOR] = BUILD_POLICY(U16), + [IFLA_BOND_AD_INFO_NUM_PORTS] = BUILD_POLICY(U16), + [IFLA_BOND_AD_INFO_ACTOR_KEY] = BUILD_POLICY(U16), + [IFLA_BOND_AD_INFO_PARTNER_KEY] = BUILD_POLICY(U16), + [IFLA_BOND_AD_INFO_PARTNER_MAC] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), +}; + +DEFINE_POLICY_SET(rtnl_bond_ad_info); + +static const NLAPolicy rtnl_link_info_data_bond_policies[] = { + [IFLA_BOND_MODE] = BUILD_POLICY(U8), + [IFLA_BOND_ACTIVE_SLAVE] = BUILD_POLICY(U32), + [IFLA_BOND_MIIMON] = BUILD_POLICY(U32), + [IFLA_BOND_UPDELAY] = BUILD_POLICY(U32), + [IFLA_BOND_DOWNDELAY] = BUILD_POLICY(U32), + [IFLA_BOND_USE_CARRIER] = BUILD_POLICY(U8), + [IFLA_BOND_ARP_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BOND_ARP_IP_TARGET] = BUILD_POLICY_NESTED(rtnl_bond_arp_ip_target), + [IFLA_BOND_ARP_VALIDATE] = BUILD_POLICY(U32), + [IFLA_BOND_ARP_ALL_TARGETS] = BUILD_POLICY(U32), + [IFLA_BOND_PRIMARY] = BUILD_POLICY(U32), + [IFLA_BOND_PRIMARY_RESELECT] = BUILD_POLICY(U8), + [IFLA_BOND_FAIL_OVER_MAC] = BUILD_POLICY(U8), + [IFLA_BOND_XMIT_HASH_POLICY] = BUILD_POLICY(U8), + [IFLA_BOND_RESEND_IGMP] = BUILD_POLICY(U32), + [IFLA_BOND_NUM_PEER_NOTIF] = BUILD_POLICY(U8), + [IFLA_BOND_ALL_SLAVES_ACTIVE] = BUILD_POLICY(U8), + [IFLA_BOND_MIN_LINKS] = BUILD_POLICY(U32), + [IFLA_BOND_LP_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BOND_PACKETS_PER_SLAVE] = BUILD_POLICY(U32), + [IFLA_BOND_AD_LACP_RATE] = BUILD_POLICY(U8), + [IFLA_BOND_AD_SELECT] = BUILD_POLICY(U8), + 
[IFLA_BOND_AD_INFO] = BUILD_POLICY_NESTED(rtnl_bond_ad_info), + [IFLA_BOND_AD_ACTOR_SYS_PRIO] = BUILD_POLICY(U16), + [IFLA_BOND_AD_USER_PORT_KEY] = BUILD_POLICY(U16), + [IFLA_BOND_AD_ACTOR_SYSTEM] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [IFLA_BOND_TLB_DYNAMIC_LB] = BUILD_POLICY(U8), + [IFLA_BOND_PEER_NOTIF_DELAY] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_link_info_data_bridge_policies[] = { + [IFLA_BR_FORWARD_DELAY] = BUILD_POLICY(U32), + [IFLA_BR_HELLO_TIME] = BUILD_POLICY(U32), + [IFLA_BR_MAX_AGE] = BUILD_POLICY(U32), + [IFLA_BR_AGEING_TIME] = BUILD_POLICY(U32), + [IFLA_BR_STP_STATE] = BUILD_POLICY(U32), + [IFLA_BR_PRIORITY] = BUILD_POLICY(U16), + [IFLA_BR_VLAN_FILTERING] = BUILD_POLICY(U8), + [IFLA_BR_VLAN_PROTOCOL] = BUILD_POLICY(U16), + [IFLA_BR_GROUP_FWD_MASK] = BUILD_POLICY(U16), + [IFLA_BR_ROOT_ID] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_bridge_id)), + [IFLA_BR_BRIDGE_ID] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_bridge_id)), + [IFLA_BR_ROOT_PORT] = BUILD_POLICY(U16), + [IFLA_BR_ROOT_PATH_COST] = BUILD_POLICY(U32), + [IFLA_BR_TOPOLOGY_CHANGE] = BUILD_POLICY(U8), + [IFLA_BR_TOPOLOGY_CHANGE_DETECTED] = BUILD_POLICY(U8), + [IFLA_BR_HELLO_TIMER] = BUILD_POLICY(U64), + [IFLA_BR_TCN_TIMER] = BUILD_POLICY(U64), + [IFLA_BR_TOPOLOGY_CHANGE_TIMER] = BUILD_POLICY(U64), + [IFLA_BR_GC_TIMER] = BUILD_POLICY(U64), + [IFLA_BR_GROUP_ADDR] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [IFLA_BR_FDB_FLUSH] = BUILD_POLICY(FLAG), + [IFLA_BR_MCAST_ROUTER] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_SNOOPING] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_QUERY_USE_IFADDR] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_QUERIER] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_HASH_ELASTICITY] = BUILD_POLICY(U32), + [IFLA_BR_MCAST_HASH_MAX] = BUILD_POLICY(U32), + [IFLA_BR_MCAST_LAST_MEMBER_CNT] = BUILD_POLICY(U32), + [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = BUILD_POLICY(U32), + [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = BUILD_POLICY(U64), + [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = 
BUILD_POLICY(U64), + [IFLA_BR_MCAST_QUERIER_INTVL] = BUILD_POLICY(U64), + [IFLA_BR_MCAST_QUERY_INTVL] = BUILD_POLICY(U64), + [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = BUILD_POLICY(U64), + [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = BUILD_POLICY(U64), + [IFLA_BR_NF_CALL_IPTABLES] = BUILD_POLICY(U8), + [IFLA_BR_NF_CALL_IP6TABLES] = BUILD_POLICY(U8), + [IFLA_BR_NF_CALL_ARPTABLES] = BUILD_POLICY(U8), + [IFLA_BR_VLAN_DEFAULT_PVID] = BUILD_POLICY(U16), + [IFLA_BR_VLAN_STATS_ENABLED] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_STATS_ENABLED] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_IGMP_VERSION] = BUILD_POLICY(U8), + [IFLA_BR_MCAST_MLD_VERSION] = BUILD_POLICY(U8), + [IFLA_BR_VLAN_STATS_PER_PORT] = BUILD_POLICY(U8), + [IFLA_BR_MULTI_BOOLOPT] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct br_boolopt_multi)), +}; + +static const NLAPolicy rtnl_link_info_data_can_policies[] = { + [IFLA_CAN_BITTIMING] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_bittiming)), + [IFLA_CAN_BITTIMING_CONST] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_bittiming_const)), + [IFLA_CAN_CLOCK] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_clock)), + [IFLA_CAN_STATE] = BUILD_POLICY(U32), + [IFLA_CAN_CTRLMODE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_ctrlmode)), + [IFLA_CAN_RESTART_MS] = BUILD_POLICY(U32), + [IFLA_CAN_RESTART] = BUILD_POLICY(U32), + [IFLA_CAN_BERR_COUNTER] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_berr_counter)), + [IFLA_CAN_DATA_BITTIMING] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_bittiming)), + [IFLA_CAN_DATA_BITTIMING_CONST] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_bittiming_const)), + [IFLA_CAN_TERMINATION] = BUILD_POLICY(U16), + [IFLA_CAN_TERMINATION_CONST] = BUILD_POLICY(BINARY), /* size = termination_const_cnt * sizeof(u16) */ + [IFLA_CAN_BITRATE_CONST] = BUILD_POLICY(BINARY), /* size = bitrate_const_cnt * sizeof(u32) */ + [IFLA_CAN_DATA_BITRATE_CONST] = BUILD_POLICY(BINARY), /* size = data_bitrate_const_cnt * sizeof(u32) */ + 
[IFLA_CAN_BITRATE_MAX] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_link_info_data_geneve_policies[] = { + [IFLA_GENEVE_ID] = BUILD_POLICY(U32), + [IFLA_GENEVE_REMOTE] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [IFLA_GENEVE_TTL] = BUILD_POLICY(U8), + [IFLA_GENEVE_TOS] = BUILD_POLICY(U8), + [IFLA_GENEVE_PORT] = BUILD_POLICY(U16), + [IFLA_GENEVE_COLLECT_METADATA] = BUILD_POLICY(FLAG), + [IFLA_GENEVE_REMOTE6] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFLA_GENEVE_UDP_CSUM] = BUILD_POLICY(U8), + [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = BUILD_POLICY(U8), + [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = BUILD_POLICY(U8), + [IFLA_GENEVE_LABEL] = BUILD_POLICY(U32), + [IFLA_GENEVE_TTL_INHERIT] = BUILD_POLICY(U8), + [IFLA_GENEVE_DF] = BUILD_POLICY(U8), + [IFLA_GENEVE_INNER_PROTO_INHERIT] = BUILD_POLICY(FLAG), +}; + +static const NLAPolicy rtnl_link_info_data_gre_policies[] = { + [IFLA_GRE_LINK] = BUILD_POLICY(U32), + [IFLA_GRE_IFLAGS] = BUILD_POLICY(U16), + [IFLA_GRE_OFLAGS] = BUILD_POLICY(U16), + [IFLA_GRE_IKEY] = BUILD_POLICY(U32), + [IFLA_GRE_OKEY] = BUILD_POLICY(U32), + [IFLA_GRE_LOCAL] = BUILD_POLICY(IN_ADDR), + [IFLA_GRE_REMOTE] = BUILD_POLICY(IN_ADDR), + [IFLA_GRE_TTL] = BUILD_POLICY(U8), + [IFLA_GRE_TOS] = BUILD_POLICY(U8), + [IFLA_GRE_PMTUDISC] = BUILD_POLICY(U8), + [IFLA_GRE_ENCAP_LIMIT] = BUILD_POLICY(U8), + [IFLA_GRE_FLOWINFO] = BUILD_POLICY(U32), + [IFLA_GRE_FLAGS] = BUILD_POLICY(U32), + [IFLA_GRE_ENCAP_TYPE] = BUILD_POLICY(U16), + [IFLA_GRE_ENCAP_FLAGS] = BUILD_POLICY(U16), + [IFLA_GRE_ENCAP_SPORT] = BUILD_POLICY(U16), + [IFLA_GRE_ENCAP_DPORT] = BUILD_POLICY(U16), + [IFLA_GRE_COLLECT_METADATA] = BUILD_POLICY(FLAG), + [IFLA_GRE_IGNORE_DF] = BUILD_POLICY(U8), + [IFLA_GRE_FWMARK] = BUILD_POLICY(U32), + [IFLA_GRE_ERSPAN_INDEX] = BUILD_POLICY(U32), + [IFLA_GRE_ERSPAN_VER] = BUILD_POLICY(U8), + [IFLA_GRE_ERSPAN_DIR] = BUILD_POLICY(U8), + [IFLA_GRE_ERSPAN_HWID] = BUILD_POLICY(U16), +}; + +static const NLAPolicy 
rtnl_link_info_data_ipoib_policies[] = { + [IFLA_IPOIB_PKEY] = BUILD_POLICY(U16), + [IFLA_IPOIB_MODE] = BUILD_POLICY(U16), + [IFLA_IPOIB_UMCAST] = BUILD_POLICY(U16), +}; + +/* IFLA_IPTUN_ attributes are used in ipv4/ipip.c, ipv6/ip6_tunnel.c, and ipv6/sit.c. And unfortunately, + * IFLA_IPTUN_FLAGS is used with different types, ugh... */ +#define DEFINE_IPTUN_TYPES(name, flags_type) \ + static const NLAPolicy rtnl_link_info_data_##name##_policies[] = { \ + [IFLA_IPTUN_LINK] = BUILD_POLICY(U32), \ + [IFLA_IPTUN_LOCAL] = BUILD_POLICY(IN_ADDR), \ + [IFLA_IPTUN_REMOTE] = BUILD_POLICY(IN_ADDR), \ + [IFLA_IPTUN_TTL] = BUILD_POLICY(U8), \ + [IFLA_IPTUN_TOS] = BUILD_POLICY(U8), \ + [IFLA_IPTUN_ENCAP_LIMIT] = BUILD_POLICY(U8), \ + [IFLA_IPTUN_FLOWINFO] = BUILD_POLICY(U32), \ + [IFLA_IPTUN_FLAGS] = BUILD_POLICY(flags_type), \ + [IFLA_IPTUN_PROTO] = BUILD_POLICY(U8), \ + [IFLA_IPTUN_PMTUDISC] = BUILD_POLICY(U8), \ + [IFLA_IPTUN_6RD_PREFIX] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), \ + [IFLA_IPTUN_6RD_RELAY_PREFIX] = BUILD_POLICY(U32), \ + [IFLA_IPTUN_6RD_PREFIXLEN] = BUILD_POLICY(U16), \ + [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = BUILD_POLICY(U16), \ + [IFLA_IPTUN_ENCAP_TYPE] = BUILD_POLICY(U16), \ + [IFLA_IPTUN_ENCAP_FLAGS] = BUILD_POLICY(U16), \ + [IFLA_IPTUN_ENCAP_SPORT] = BUILD_POLICY(U16), \ + [IFLA_IPTUN_ENCAP_DPORT] = BUILD_POLICY(U16), \ + [IFLA_IPTUN_COLLECT_METADATA] = BUILD_POLICY(FLAG), \ + [IFLA_IPTUN_FWMARK] = BUILD_POLICY(U32), \ + } + +DEFINE_IPTUN_TYPES(iptun, U32); /* for ipip and ip6tnl */ +DEFINE_IPTUN_TYPES(sit, U16); /* for sit */ + +static const NLAPolicy rtnl_link_info_data_ipvlan_policies[] = { + [IFLA_IPVLAN_MODE] = BUILD_POLICY(U16), + [IFLA_IPVLAN_FLAGS] = BUILD_POLICY(U16), +}; + +static const NLAPolicy rtnl_link_info_data_macsec_policies[] = { + [IFLA_MACSEC_SCI] = BUILD_POLICY(U64), + [IFLA_MACSEC_PORT] = BUILD_POLICY(U16), + [IFLA_MACSEC_ICV_LEN] = BUILD_POLICY(U8), + [IFLA_MACSEC_CIPHER_SUITE] = BUILD_POLICY(U64), + 
[IFLA_MACSEC_WINDOW] = BUILD_POLICY(U32), + [IFLA_MACSEC_ENCODING_SA] = BUILD_POLICY(U8), + [IFLA_MACSEC_ENCRYPT] = BUILD_POLICY(U8), + [IFLA_MACSEC_PROTECT] = BUILD_POLICY(U8), + [IFLA_MACSEC_INC_SCI] = BUILD_POLICY(U8), + [IFLA_MACSEC_ES] = BUILD_POLICY(U8), + [IFLA_MACSEC_SCB] = BUILD_POLICY(U8), + [IFLA_MACSEC_REPLAY_PROTECT] = BUILD_POLICY(U8), + [IFLA_MACSEC_VALIDATION] = BUILD_POLICY(U8), + [IFLA_MACSEC_OFFLOAD] = BUILD_POLICY(U8), +}; + +static const NLAPolicy rtnl_macvlan_macaddr_policies[] = { + [IFLA_MACVLAN_MACADDR] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), +}; + +DEFINE_POLICY_SET(rtnl_macvlan_macaddr); + +static const NLAPolicy rtnl_link_info_data_macvlan_policies[] = { + [IFLA_MACVLAN_MODE] = BUILD_POLICY(U32), + [IFLA_MACVLAN_FLAGS] = BUILD_POLICY(U16), + [IFLA_MACVLAN_MACADDR_MODE] = BUILD_POLICY(U32), + [IFLA_MACVLAN_MACADDR_DATA] = BUILD_POLICY_NESTED(rtnl_macvlan_macaddr), + [IFLA_MACVLAN_MACADDR_COUNT] = BUILD_POLICY(U32), + [IFLA_MACVLAN_BC_QUEUE_LEN] = BUILD_POLICY(U32), + [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_link_info_data_tun_policies[] = { + [IFLA_TUN_OWNER] = BUILD_POLICY(U32), + [IFLA_TUN_GROUP] = BUILD_POLICY(U32), + [IFLA_TUN_TYPE] = BUILD_POLICY(U8), + [IFLA_TUN_PI] = BUILD_POLICY(U8), + [IFLA_TUN_VNET_HDR] = BUILD_POLICY(U8), + [IFLA_TUN_PERSIST] = BUILD_POLICY(U8), + [IFLA_TUN_MULTI_QUEUE] = BUILD_POLICY(U8), + [IFLA_TUN_NUM_QUEUES] = BUILD_POLICY(U32), + [IFLA_TUN_NUM_DISABLED_QUEUES] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_link_info_data_veth_policies[] = { + [VETH_INFO_PEER] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), +}; + +static const NLAPolicy rtnl_vlan_qos_map_policies[] = { + [IFLA_VLAN_QOS_MAPPING] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vlan_qos_mapping)), +}; + +DEFINE_POLICY_SET(rtnl_vlan_qos_map); + +static const NLAPolicy rtnl_link_info_data_vlan_policies[] = { + [IFLA_VLAN_ID] = BUILD_POLICY(U16), + 
[IFLA_VLAN_FLAGS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vlan_flags)), + [IFLA_VLAN_EGRESS_QOS] = BUILD_POLICY_NESTED(rtnl_vlan_qos_map), + [IFLA_VLAN_INGRESS_QOS] = BUILD_POLICY_NESTED(rtnl_vlan_qos_map), + [IFLA_VLAN_PROTOCOL] = BUILD_POLICY(U16), +}; + +static const NLAPolicy rtnl_link_info_data_vrf_policies[] = { + [IFLA_VRF_TABLE] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_link_info_data_vti_policies[] = { + [IFLA_VTI_LINK] = BUILD_POLICY(U32), + [IFLA_VTI_IKEY] = BUILD_POLICY(U32), + [IFLA_VTI_OKEY] = BUILD_POLICY(U32), + [IFLA_VTI_LOCAL] = BUILD_POLICY(IN_ADDR), + [IFLA_VTI_REMOTE] = BUILD_POLICY(IN_ADDR), + [IFLA_VTI_FWMARK] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_link_info_data_vxcan_policies[] = { + [VXCAN_INFO_PEER] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), +}; + +static const NLAPolicy rtnl_link_info_data_vxlan_policies[] = { + [IFLA_VXLAN_ID] = BUILD_POLICY(U32), + [IFLA_VXLAN_GROUP] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [IFLA_VXLAN_LINK] = BUILD_POLICY(U32), + [IFLA_VXLAN_LOCAL] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in_addr)), + [IFLA_VXLAN_TTL] = BUILD_POLICY(U8), + [IFLA_VXLAN_TOS] = BUILD_POLICY(U8), + [IFLA_VXLAN_LEARNING] = BUILD_POLICY(U8), + [IFLA_VXLAN_AGEING] = BUILD_POLICY(U32), + [IFLA_VXLAN_LIMIT] = BUILD_POLICY(U32), + [IFLA_VXLAN_PORT_RANGE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vxlan_port_range)), + [IFLA_VXLAN_PROXY] = BUILD_POLICY(U8), + [IFLA_VXLAN_RSC] = BUILD_POLICY(U8), + [IFLA_VXLAN_L2MISS] = BUILD_POLICY(U8), + [IFLA_VXLAN_L3MISS] = BUILD_POLICY(U8), + [IFLA_VXLAN_PORT] = BUILD_POLICY(U16), + [IFLA_VXLAN_GROUP6] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFLA_VXLAN_LOCAL6] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFLA_VXLAN_UDP_CSUM] = BUILD_POLICY(U8), + [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = BUILD_POLICY(U8), + [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = BUILD_POLICY(U8), + 
[IFLA_VXLAN_REMCSUM_TX] = BUILD_POLICY(U8), + [IFLA_VXLAN_REMCSUM_RX] = BUILD_POLICY(U8), + [IFLA_VXLAN_GBP] = BUILD_POLICY(FLAG), + [IFLA_VXLAN_REMCSUM_NOPARTIAL] = BUILD_POLICY(FLAG), + [IFLA_VXLAN_COLLECT_METADATA] = BUILD_POLICY(U8), + [IFLA_VXLAN_LABEL] = BUILD_POLICY(U32), + [IFLA_VXLAN_GPE] = BUILD_POLICY(FLAG), + [IFLA_VXLAN_TTL_INHERIT] = BUILD_POLICY(FLAG), + [IFLA_VXLAN_DF] = BUILD_POLICY(U8), +}; + +static const NLAPolicy rtnl_link_info_data_xfrm_policies[] = { + [IFLA_XFRM_LINK] = BUILD_POLICY(U32), + [IFLA_XFRM_IF_ID] = BUILD_POLICY(U32) +}; + +static const NLAPolicySetUnionElement rtnl_link_info_data_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_STRING("bareudp", rtnl_link_info_data_bareudp), + BUILD_UNION_ELEMENT_BY_STRING("batadv", rtnl_link_info_data_batadv), + BUILD_UNION_ELEMENT_BY_STRING("bond", rtnl_link_info_data_bond), + BUILD_UNION_ELEMENT_BY_STRING("bridge", rtnl_link_info_data_bridge), +/* + BUILD_UNION_ELEMENT_BY_STRING("caif", rtnl_link_info_data_caif), +*/ + BUILD_UNION_ELEMENT_BY_STRING("can", rtnl_link_info_data_can), + BUILD_UNION_ELEMENT_BY_STRING("erspan", rtnl_link_info_data_gre), + BUILD_UNION_ELEMENT_BY_STRING("geneve", rtnl_link_info_data_geneve), + BUILD_UNION_ELEMENT_BY_STRING("gre", rtnl_link_info_data_gre), + BUILD_UNION_ELEMENT_BY_STRING("gretap", rtnl_link_info_data_gre), +/* + BUILD_UNION_ELEMENT_BY_STRING("gtp", rtnl_link_info_data_gtp), + BUILD_UNION_ELEMENT_BY_STRING("hsr", rtnl_link_info_data_hsr), +*/ + BUILD_UNION_ELEMENT_BY_STRING("ip6erspan", rtnl_link_info_data_gre), + BUILD_UNION_ELEMENT_BY_STRING("ip6gre", rtnl_link_info_data_gre), + BUILD_UNION_ELEMENT_BY_STRING("ip6gretap", rtnl_link_info_data_gre), + BUILD_UNION_ELEMENT_BY_STRING("ip6tnl", rtnl_link_info_data_iptun), + BUILD_UNION_ELEMENT_BY_STRING("ipoib", rtnl_link_info_data_ipoib), + BUILD_UNION_ELEMENT_BY_STRING("ipip", rtnl_link_info_data_iptun), + BUILD_UNION_ELEMENT_BY_STRING("ipvlan", rtnl_link_info_data_ipvlan), + 
BUILD_UNION_ELEMENT_BY_STRING("ipvtap", rtnl_link_info_data_ipvlan), + BUILD_UNION_ELEMENT_BY_STRING("macsec", rtnl_link_info_data_macsec), + BUILD_UNION_ELEMENT_BY_STRING("macvlan", rtnl_link_info_data_macvlan), + BUILD_UNION_ELEMENT_BY_STRING("macvtap", rtnl_link_info_data_macvlan), +/* + BUILD_UNION_ELEMENT_BY_STRING("ppp", rtnl_link_info_data_ppp), + BUILD_UNION_ELEMENT_BY_STRING("rmnet", rtnl_link_info_data_rmnet), +*/ + BUILD_UNION_ELEMENT_BY_STRING("sit", rtnl_link_info_data_sit), + BUILD_UNION_ELEMENT_BY_STRING("tun", rtnl_link_info_data_tun), + BUILD_UNION_ELEMENT_BY_STRING("veth", rtnl_link_info_data_veth), + BUILD_UNION_ELEMENT_BY_STRING("vlan", rtnl_link_info_data_vlan), + BUILD_UNION_ELEMENT_BY_STRING("vrf", rtnl_link_info_data_vrf), + BUILD_UNION_ELEMENT_BY_STRING("vti", rtnl_link_info_data_vti), + BUILD_UNION_ELEMENT_BY_STRING("vti6", rtnl_link_info_data_vti), + BUILD_UNION_ELEMENT_BY_STRING("vxcan", rtnl_link_info_data_vxcan), + BUILD_UNION_ELEMENT_BY_STRING("vxlan", rtnl_link_info_data_vxlan), +/* + BUILD_UNION_ELEMENT_BY_STRING("wwan", rtnl_link_info_data_wwan), +*/ + BUILD_UNION_ELEMENT_BY_STRING("xfrm", rtnl_link_info_data_xfrm), +}; + +DEFINE_POLICY_SET_UNION(rtnl_link_info_data, IFLA_INFO_KIND); + +static const struct NLAPolicy rtnl_bridge_port_policies[] = { + [IFLA_BRPORT_STATE] = BUILD_POLICY(U8), + [IFLA_BRPORT_COST] = BUILD_POLICY(U32), + [IFLA_BRPORT_PRIORITY] = BUILD_POLICY(U16), + [IFLA_BRPORT_MODE] = BUILD_POLICY(U8), + [IFLA_BRPORT_GUARD] = BUILD_POLICY(U8), + [IFLA_BRPORT_PROTECT] = BUILD_POLICY(U8), + [IFLA_BRPORT_FAST_LEAVE] = BUILD_POLICY(U8), + [IFLA_BRPORT_LEARNING] = BUILD_POLICY(U8), + [IFLA_BRPORT_UNICAST_FLOOD] = BUILD_POLICY(U8), + [IFLA_BRPORT_PROXYARP] = BUILD_POLICY(U8), + [IFLA_BRPORT_LEARNING_SYNC] = BUILD_POLICY(U8), + [IFLA_BRPORT_PROXYARP_WIFI] = BUILD_POLICY(U8), + [IFLA_BRPORT_ROOT_ID] = BUILD_POLICY(U8), + [IFLA_BRPORT_BRIDGE_ID] = BUILD_POLICY(U8), + [IFLA_BRPORT_DESIGNATED_PORT] = BUILD_POLICY(U16), + 
[IFLA_BRPORT_DESIGNATED_COST] = BUILD_POLICY(U16), + [IFLA_BRPORT_ID] = BUILD_POLICY(U16), + [IFLA_BRPORT_NO] = BUILD_POLICY(U16), + [IFLA_BRPORT_TOPOLOGY_CHANGE_ACK] = BUILD_POLICY(U8), + [IFLA_BRPORT_CONFIG_PENDING] = BUILD_POLICY(U8), + [IFLA_BRPORT_MESSAGE_AGE_TIMER] = BUILD_POLICY(U64), + [IFLA_BRPORT_FORWARD_DELAY_TIMER] = BUILD_POLICY(U64), + [IFLA_BRPORT_HOLD_TIMER] = BUILD_POLICY(U64), + [IFLA_BRPORT_FLUSH] = BUILD_POLICY(U8), + [IFLA_BRPORT_MULTICAST_ROUTER] = BUILD_POLICY(U8), + [IFLA_BRPORT_PAD] = BUILD_POLICY(U8), + [IFLA_BRPORT_MCAST_FLOOD] = BUILD_POLICY(U8), + [IFLA_BRPORT_MCAST_TO_UCAST] = BUILD_POLICY(U8), + [IFLA_BRPORT_VLAN_TUNNEL] = BUILD_POLICY(U8), + [IFLA_BRPORT_BCAST_FLOOD] = BUILD_POLICY(U8), + [IFLA_BRPORT_GROUP_FWD_MASK] = BUILD_POLICY(U16), + [IFLA_BRPORT_NEIGH_SUPPRESS] = BUILD_POLICY(U8), + [IFLA_BRPORT_ISOLATED] = BUILD_POLICY(U8), + [IFLA_BRPORT_BACKUP_PORT] = BUILD_POLICY(U32), + [IFLA_BRPORT_MRP_RING_OPEN] = BUILD_POLICY(U8), + [IFLA_BRPORT_MRP_IN_OPEN] = BUILD_POLICY(U8), + [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = BUILD_POLICY(U32), + [IFLA_BRPORT_MCAST_EHT_HOSTS_CNT] = BUILD_POLICY(U32), +}; + +static const NLAPolicySetUnionElement rtnl_link_info_slave_data_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_STRING("bridge", rtnl_bridge_port), +}; + +DEFINE_POLICY_SET_UNION(rtnl_link_info_slave_data, IFLA_INFO_SLAVE_KIND); + +static const NLAPolicy rtnl_link_info_policies[] = { + [IFLA_INFO_KIND] = BUILD_POLICY(STRING), + [IFLA_INFO_DATA] = BUILD_POLICY_NESTED_UNION_BY_STRING(rtnl_link_info_data), + /* TODO: Currently IFLA_INFO_XSTATS is used only when IFLA_INFO_KIND is "can". In the future, + * when multiple kinds of netdevs use this attribute, convert its type to NETLINK_TYPE_UNION. 
*/ + [IFLA_INFO_XSTATS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct can_device_stats)), + [IFLA_INFO_SLAVE_KIND] = BUILD_POLICY(STRING), + [IFLA_INFO_SLAVE_DATA] = BUILD_POLICY_NESTED_UNION_BY_STRING(rtnl_link_info_slave_data), +}; + +DEFINE_POLICY_SET(rtnl_link_info); + +static const struct NLAPolicy rtnl_inet_policies[] = { + [IFLA_INET_CONF] = BUILD_POLICY(BINARY), /* size = IPV4_DEVCONF_MAX * 4 */ +}; + +DEFINE_POLICY_SET(rtnl_inet); + +static const struct NLAPolicy rtnl_inet6_policies[] = { + [IFLA_INET6_FLAGS] = BUILD_POLICY(U32), + [IFLA_INET6_CONF] = BUILD_POLICY(BINARY), /* size = DEVCONF_MAX * sizeof(s32) */ + [IFLA_INET6_STATS] = BUILD_POLICY(BINARY), /* size = IPSTATS_MIB_MAX * sizeof(u64) */ + [IFLA_INET6_MCAST] = {}, /* unused. */ + [IFLA_INET6_CACHEINFO] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_cacheinfo)), + [IFLA_INET6_ICMP6STATS] = BUILD_POLICY(BINARY), /* size = ICMP6_MIB_MAX * sizeof(u64) */ + [IFLA_INET6_TOKEN] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFLA_INET6_ADDR_GEN_MODE] = BUILD_POLICY(U8), +}; + +DEFINE_POLICY_SET(rtnl_inet6); + +static const NLAPolicySetUnionElement rtnl_prot_info_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_FAMILY(AF_BRIDGE, rtnl_bridge_port), + BUILD_UNION_ELEMENT_BY_FAMILY(AF_INET6, rtnl_inet6), +}; + +DEFINE_POLICY_SET_UNION(rtnl_prot_info, 0); + +static const NLAPolicy rtnl_af_spec_unspec_policies[] = { + [AF_INET] = BUILD_POLICY_NESTED(rtnl_inet), + [AF_INET6] = BUILD_POLICY_NESTED(rtnl_inet6), +}; + +static const NLAPolicy rtnl_bridge_vlan_tunnel_info_policies[] = { + [IFLA_BRIDGE_VLAN_TUNNEL_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_VLAN_TUNNEL_VID] = BUILD_POLICY(U16), + [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = BUILD_POLICY(U16), +}; + +DEFINE_POLICY_SET(rtnl_bridge_vlan_tunnel_info); + +static const NLAPolicy rtnl_bridge_mrp_instance_policies[] = { + [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = BUILD_POLICY(U32), + 
[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = BUILD_POLICY(U16), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_instance); + +static const NLAPolicy rtnl_bridge_mrp_port_state_policies[] = { + [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_port_state); + +static const NLAPolicy rtnl_bridge_mrp_port_role_policies[] = { + [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_port_role); + +static const NLAPolicy rtnl_bridge_mrp_ring_state_policies[] = { + [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_RING_STATE_STATE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_ring_state); + +static const NLAPolicy rtnl_bridge_mrp_ring_role_policies[] = { + [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_ring_role); + +static const NLAPolicy rtnl_bridge_mrp_start_test_policies[] = { + [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_start_test); + +static const NLAPolicy rtnl_bridge_mrp_info_policies[] = { + [IFLA_BRIDGE_MRP_INFO_RING_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_P_IFINDEX] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_S_IFINDEX] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_PRIO] = BUILD_POLICY(U16), + [IFLA_BRIDGE_MRP_INFO_RING_STATE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_RING_ROLE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_TEST_MONITOR] = BUILD_POLICY(U32), + 
[IFLA_BRIDGE_MRP_INFO_I_IFINDEX] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_IN_STATE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_IN_ROLE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_info); + +static const NLAPolicy rtnl_bridge_mrp_in_role_policies[] = { + [IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] = BUILD_POLICY(U16), + [IFLA_BRIDGE_MRP_IN_ROLE_ROLE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_in_role); + +static const NLAPolicy rtnl_bridge_mrp_in_state_policies[] = { + [IFLA_BRIDGE_MRP_IN_STATE_IN_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_IN_STATE_STATE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_in_state); + +static const NLAPolicy rtnl_bridge_mrp_start_in_test_policies[] = { + [IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] = BUILD_POLICY(U32), + [IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp_start_in_test); + +static const NLAPolicy rtnl_bridge_mrp_policies[] = { + [IFLA_BRIDGE_MRP_INSTANCE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_instance), + [IFLA_BRIDGE_MRP_PORT_STATE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_port_state), + [IFLA_BRIDGE_MRP_PORT_ROLE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_port_role), + [IFLA_BRIDGE_MRP_RING_STATE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_ring_state), + [IFLA_BRIDGE_MRP_RING_ROLE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_ring_role), + [IFLA_BRIDGE_MRP_START_TEST] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_start_test), + [IFLA_BRIDGE_MRP_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_info), + [IFLA_BRIDGE_MRP_IN_ROLE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_in_role), + 
[IFLA_BRIDGE_MRP_IN_STATE] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_in_state), + [IFLA_BRIDGE_MRP_START_IN_TEST] = BUILD_POLICY_NESTED(rtnl_bridge_mrp_start_in_test), +}; + +DEFINE_POLICY_SET(rtnl_bridge_mrp); + +static const NLAPolicy rtnl_bridge_cfm_mep_create_policies[] = { + [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_mep_create); + +static const NLAPolicy rtnl_bridge_cfm_mep_delete_policies[] = { + [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_mep_delete); + +static const NLAPolicy rtnl_bridge_cfm_mep_config_policies[] = { + [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_mep_config); + +static const NLAPolicy rtnl_bridge_cfm_cc_config_policies[] = { + [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = BUILD_POLICY_WITH_SIZE(BINARY, CFM_MAID_LENGTH), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_cc_config); + +static const NLAPolicy rtnl_bridge_cfm_cc_peer_mep_policies[] = { + [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_cc_peer_mep); + +static const NLAPolicy rtnl_bridge_cfm_cc_rdi_policies[] = { + [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_RDI_RDI] = BUILD_POLICY(U32), +}; + 
+DEFINE_POLICY_SET(rtnl_bridge_cfm_cc_rdi); + +static const NLAPolicy rtnl_bridge_cfm_cc_ccm_tx_policies[] = { + [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = BUILD_POLICY_WITH_SIZE(ETHER_ADDR, ETH_ALEN), + [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = BUILD_POLICY(U8), + [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = BUILD_POLICY(U8), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_cc_ccm_tx); + +static const NLAPolicy rtnl_bridge_cfm_mep_status_policies[] = { + [IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_mep_status); + +static const NLAPolicy rtnl_bridge_cfm_cc_peer_status_policies[] = { + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE] = BUILD_POLICY(U8), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE] = BUILD_POLICY(U8), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN] = BUILD_POLICY(U32), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm_cc_peer_status); + +static const NLAPolicy rtnl_bridge_cfm_policies[] = { + [IFLA_BRIDGE_CFM_MEP_CREATE] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_mep_create), + [IFLA_BRIDGE_CFM_MEP_DELETE] = 
BUILD_POLICY_NESTED(rtnl_bridge_cfm_mep_delete), + [IFLA_BRIDGE_CFM_MEP_CONFIG] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_mep_config), + [IFLA_BRIDGE_CFM_CC_CONFIG] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_config), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_peer_mep), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_peer_mep), + [IFLA_BRIDGE_CFM_CC_RDI] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_rdi), + [IFLA_BRIDGE_CFM_CC_CCM_TX] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_ccm_tx), + [IFLA_BRIDGE_CFM_MEP_CREATE_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_mep_create), + [IFLA_BRIDGE_CFM_MEP_CONFIG_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_mep_config), + [IFLA_BRIDGE_CFM_CC_CONFIG_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_config), + [IFLA_BRIDGE_CFM_CC_RDI_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_rdi), + [IFLA_BRIDGE_CFM_CC_CCM_TX_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_ccm_tx), + [IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_peer_mep), + [IFLA_BRIDGE_CFM_MEP_STATUS_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_mep_status), + [IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_cfm_cc_peer_status), +}; + +DEFINE_POLICY_SET(rtnl_bridge_cfm); + +static const NLAPolicy rtnl_af_spec_bridge_policies[] = { + [IFLA_BRIDGE_FLAGS] = BUILD_POLICY(U16), + [IFLA_BRIDGE_MODE] = BUILD_POLICY(U16), + [IFLA_BRIDGE_VLAN_INFO] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct bridge_vlan_info)), + [IFLA_BRIDGE_VLAN_TUNNEL_INFO] = BUILD_POLICY_NESTED(rtnl_bridge_vlan_tunnel_info), + [IFLA_BRIDGE_MRP] = BUILD_POLICY_NESTED(rtnl_bridge_mrp), + [IFLA_BRIDGE_CFM] = BUILD_POLICY_NESTED(rtnl_bridge_cfm), +}; + +static const NLAPolicySetUnionElement rtnl_af_spec_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_FAMILY(AF_UNSPEC, rtnl_af_spec_unspec), + BUILD_UNION_ELEMENT_BY_FAMILY(AF_BRIDGE, rtnl_af_spec_bridge), +}; + +DEFINE_POLICY_SET_UNION(rtnl_af_spec, 0); + 
+static const NLAPolicy rtnl_prop_list_policies[] = { + [IFLA_ALT_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, ALTIFNAMSIZ - 1), +}; + +DEFINE_POLICY_SET(rtnl_prop_list); + +static const NLAPolicy rtnl_vf_vlan_list_policies[] = { + [IFLA_VF_VLAN_INFO] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_vlan_info)), +}; + +DEFINE_POLICY_SET(rtnl_vf_vlan_list); + +static const NLAPolicy rtnl_vf_info_policies[] = { + [IFLA_VF_MAC] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_mac)), + [IFLA_VF_VLAN] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_vlan)), + [IFLA_VF_VLAN_LIST] = BUILD_POLICY_NESTED(rtnl_vf_vlan_list), + [IFLA_VF_TX_RATE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_tx_rate)), + [IFLA_VF_SPOOFCHK] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_spoofchk)), + [IFLA_VF_RATE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_rate)), + [IFLA_VF_LINK_STATE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_link_state)), + [IFLA_VF_RSS_QUERY_EN] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_rss_query_en)), + [IFLA_VF_TRUST] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_trust)), + [IFLA_VF_IB_NODE_GUID] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_guid)), + [IFLA_VF_IB_PORT_GUID] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_vf_guid)), +}; + +DEFINE_POLICY_SET(rtnl_vf_info); + +static const NLAPolicy rtnl_vfinfo_list_policies[] = { + [IFLA_VF_INFO] = BUILD_POLICY_NESTED(rtnl_vf_info), +}; + +DEFINE_POLICY_SET(rtnl_vfinfo_list); + +static const NLAPolicy rtnl_vf_port_policies[] = { + [IFLA_PORT_VF] = BUILD_POLICY(U32), + [IFLA_PORT_PROFILE] = BUILD_POLICY(STRING), + [IFLA_PORT_VSI_TYPE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct ifla_port_vsi)), + [IFLA_PORT_INSTANCE_UUID] = BUILD_POLICY_WITH_SIZE(BINARY, PORT_UUID_MAX), + [IFLA_PORT_HOST_UUID] = BUILD_POLICY_WITH_SIZE(BINARY, PORT_UUID_MAX), + [IFLA_PORT_REQUEST] = BUILD_POLICY(U8), + [IFLA_PORT_RESPONSE] = BUILD_POLICY(U16), +}; + 
+DEFINE_POLICY_SET(rtnl_vf_port); + +static const NLAPolicy rtnl_vf_ports_policies[] = { + [IFLA_VF_PORT] = BUILD_POLICY_NESTED(rtnl_vf_port), +}; + +DEFINE_POLICY_SET(rtnl_vf_ports); + +static const NLAPolicy rtnl_xdp_policies[] = { + [IFLA_XDP_FD] = BUILD_POLICY(S32), + [IFLA_XDP_ATTACHED] = BUILD_POLICY(U8), + [IFLA_XDP_FLAGS] = BUILD_POLICY(U32), + [IFLA_XDP_PROG_ID] = BUILD_POLICY(U32), + [IFLA_XDP_DRV_PROG_ID] = BUILD_POLICY(U32), + [IFLA_XDP_SKB_PROG_ID] = BUILD_POLICY(U32), + [IFLA_XDP_HW_PROG_ID] = BUILD_POLICY(U32), + [IFLA_XDP_EXPECTED_FD] = BUILD_POLICY(S32), +}; + +DEFINE_POLICY_SET(rtnl_xdp); + +static const NLAPolicy rtnl_proto_down_reason_policies[] = { + [IFLA_PROTO_DOWN_REASON_MASK] = BUILD_POLICY(U32), + [IFLA_PROTO_DOWN_REASON_VALUE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_proto_down_reason); + +static const NLAPolicy rtnl_link_policies[] = { + [IFLA_ADDRESS] = BUILD_POLICY(ETHER_ADDR), + [IFLA_BROADCAST] = BUILD_POLICY(ETHER_ADDR), + [IFLA_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ - 1), + [IFLA_MTU] = BUILD_POLICY(U32), + [IFLA_LINK] = BUILD_POLICY(U32), + [IFLA_QDISC] = BUILD_POLICY(STRING), + [IFLA_STATS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct rtnl_link_stats)), + [IFLA_COST] = { /* Not used. */ }, + [IFLA_PRIORITY] = { /* Not used. */ }, + [IFLA_MASTER] = BUILD_POLICY(U32), + [IFLA_WIRELESS] = { /* Used only by wext. 
*/ }, + [IFLA_PROTINFO] = BUILD_POLICY_NESTED_UNION_BY_FAMILY(rtnl_prot_info), + [IFLA_TXQLEN] = BUILD_POLICY(U32), + [IFLA_MAP] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct rtnl_link_ifmap)), + [IFLA_WEIGHT] = BUILD_POLICY(U32), + [IFLA_OPERSTATE] = BUILD_POLICY(U8), + [IFLA_LINKMODE] = BUILD_POLICY(U8), + [IFLA_LINKINFO] = BUILD_POLICY_NESTED(rtnl_link_info), + [IFLA_NET_NS_PID] = BUILD_POLICY(U32), + [IFLA_IFALIAS] = BUILD_POLICY_WITH_SIZE(STRING, IFALIASZ - 1), + [IFLA_NUM_VF] = BUILD_POLICY(U32), + [IFLA_VFINFO_LIST] = BUILD_POLICY_NESTED(rtnl_vfinfo_list), + [IFLA_STATS64] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct rtnl_link_stats64)), + [IFLA_VF_PORTS] = BUILD_POLICY_NESTED(rtnl_vf_ports), + [IFLA_PORT_SELF] = BUILD_POLICY_NESTED(rtnl_vf_port), + [IFLA_AF_SPEC] = BUILD_POLICY_NESTED_UNION_BY_FAMILY(rtnl_af_spec), + [IFLA_GROUP] = BUILD_POLICY(U32), + [IFLA_NET_NS_FD] = BUILD_POLICY(U32), + [IFLA_EXT_MASK] = BUILD_POLICY(U32), + [IFLA_PROMISCUITY] = BUILD_POLICY(U32), + [IFLA_NUM_TX_QUEUES] = BUILD_POLICY(U32), + [IFLA_NUM_RX_QUEUES] = BUILD_POLICY(U32), + [IFLA_CARRIER] = BUILD_POLICY(U8), + [IFLA_PHYS_PORT_ID] = BUILD_POLICY_WITH_SIZE(BINARY, MAX_PHYS_ITEM_ID_LEN), + [IFLA_CARRIER_CHANGES] = BUILD_POLICY(U32), + [IFLA_PHYS_SWITCH_ID] = BUILD_POLICY_WITH_SIZE(BINARY, MAX_PHYS_ITEM_ID_LEN), + [IFLA_LINK_NETNSID] = BUILD_POLICY(S32), + [IFLA_PHYS_PORT_NAME] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ - 1), + [IFLA_PROTO_DOWN] = BUILD_POLICY(U8), + [IFLA_GSO_MAX_SEGS] = BUILD_POLICY(U32), + [IFLA_GSO_MAX_SIZE] = BUILD_POLICY(U32), + [IFLA_XDP] = BUILD_POLICY_NESTED(rtnl_xdp), + [IFLA_EVENT] = BUILD_POLICY(U32), + [IFLA_NEW_NETNSID] = BUILD_POLICY(S32), + [IFLA_TARGET_NETNSID] = BUILD_POLICY(S32), + [IFLA_CARRIER_UP_COUNT] = BUILD_POLICY(U32), + [IFLA_CARRIER_DOWN_COUNT] = BUILD_POLICY(U32), + [IFLA_NEW_IFINDEX] = BUILD_POLICY(S32), + [IFLA_MIN_MTU] = BUILD_POLICY(U32), + [IFLA_MAX_MTU] = BUILD_POLICY(U32), + [IFLA_PROP_LIST] = 
BUILD_POLICY_NESTED(rtnl_prop_list), + [IFLA_ALT_IFNAME] = BUILD_POLICY_WITH_SIZE(STRING, ALTIFNAMSIZ - 1), + [IFLA_PERM_ADDRESS] = BUILD_POLICY(ETHER_ADDR), + [IFLA_PROTO_DOWN_REASON] = BUILD_POLICY_NESTED(rtnl_proto_down_reason), + [IFLA_PARENT_DEV_NAME] = BUILD_POLICY(STRING), + [IFLA_PARENT_DEV_BUS_NAME] = BUILD_POLICY(STRING), +}; + +DEFINE_POLICY_SET(rtnl_link); + +/* IFA_FLAGS was defined in kernel 3.14, but we still support older + * kernels where IFA_MAX is lower. */ +static const NLAPolicy rtnl_address_policies[] = { + [IFA_ADDRESS] = BUILD_POLICY(IN_ADDR), + [IFA_LOCAL] = BUILD_POLICY(IN_ADDR), + [IFA_LABEL] = BUILD_POLICY_WITH_SIZE(STRING, IFNAMSIZ - 1), + [IFA_BROADCAST] = BUILD_POLICY(IN_ADDR), + [IFA_ANYCAST] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFA_CACHEINFO] = BUILD_POLICY_WITH_SIZE(CACHE_INFO, sizeof(struct ifa_cacheinfo)), + [IFA_MULTICAST] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFA_FLAGS] = BUILD_POLICY(U32), + [IFA_RT_PRIORITY] = BUILD_POLICY(U32), + [IFA_TARGET_NETNSID] = BUILD_POLICY(S32), +}; + +DEFINE_POLICY_SET(rtnl_address); + +/* RTA_METRICS --- array of struct rtattr with types of RTAX_* (nested under [RTA_METRICS] in rtnl_route_policies) */ + +static const NLAPolicy rtnl_route_metrics_policies[] = { + [RTAX_MTU] = BUILD_POLICY(U32), + [RTAX_WINDOW] = BUILD_POLICY(U32), + [RTAX_RTT] = BUILD_POLICY(U32), + [RTAX_RTTVAR] = BUILD_POLICY(U32), + [RTAX_SSTHRESH] = BUILD_POLICY(U32), + [RTAX_CWND] = BUILD_POLICY(U32), + [RTAX_ADVMSS] = BUILD_POLICY(U32), + [RTAX_REORDERING] = BUILD_POLICY(U32), + [RTAX_HOPLIMIT] = BUILD_POLICY(U32), + [RTAX_INITCWND] = BUILD_POLICY(U32), + [RTAX_FEATURES] = BUILD_POLICY(U32), + [RTAX_RTO_MIN] = BUILD_POLICY(U32), + [RTAX_INITRWND] = BUILD_POLICY(U32), + [RTAX_QUICKACK] = BUILD_POLICY(U32), + [RTAX_CC_ALGO] = BUILD_POLICY(STRING), + [RTAX_FASTOPEN_NO_COOKIE] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_route_metrics); + +static const NLAPolicy rtnl_route_policies[] = { + [RTA_DST] = BUILD_POLICY(IN_ADDR),
+ [RTA_SRC] = BUILD_POLICY(IN_ADDR), + [RTA_IIF] = BUILD_POLICY(U32), + [RTA_OIF] = BUILD_POLICY(U32), + [RTA_GATEWAY] = BUILD_POLICY(IN_ADDR), + [RTA_PRIORITY] = BUILD_POLICY(U32), + [RTA_PREFSRC] = BUILD_POLICY(IN_ADDR), + [RTA_METRICS] = BUILD_POLICY_NESTED(rtnl_route_metrics), + [RTA_MULTIPATH] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct rtnexthop)), + [RTA_FLOW] = BUILD_POLICY(U32), + [RTA_CACHEINFO] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct rta_cacheinfo)), + [RTA_TABLE] = BUILD_POLICY(U32), + [RTA_MARK] = BUILD_POLICY(U32), + [RTA_MFC_STATS] = BUILD_POLICY(U64), + [RTA_VIA] = BUILD_POLICY(BINARY), /* See struct rtvia */ + [RTA_NEWDST] = BUILD_POLICY(U32), + [RTA_PREF] = BUILD_POLICY(U8), + [RTA_ENCAP_TYPE] = BUILD_POLICY(U16), + [RTA_ENCAP] = { .type = NETLINK_TYPE_NESTED }, /* Multiple type systems i.e. LWTUNNEL_ENCAP_MPLS/LWTUNNEL_ENCAP_IP/LWTUNNEL_ENCAP_ILA etc... */ + [RTA_EXPIRES] = BUILD_POLICY(U32), + [RTA_UID] = BUILD_POLICY(U32), + [RTA_TTL_PROPAGATE] = BUILD_POLICY(U8), + [RTA_IP_PROTO] = BUILD_POLICY(U8), + [RTA_SPORT] = BUILD_POLICY(U16), + [RTA_DPORT] = BUILD_POLICY(U16), + [RTA_NH_ID] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_route); + +static const NLAPolicy rtnl_neigh_policies[] = { + [NDA_DST] = BUILD_POLICY(IN_ADDR), + [NDA_LLADDR] = BUILD_POLICY(ETHER_ADDR), + [NDA_CACHEINFO] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct nda_cacheinfo)), + [NDA_PROBES] = BUILD_POLICY(U32), + [NDA_VLAN] = BUILD_POLICY(U16), + [NDA_PORT] = BUILD_POLICY(U16), + [NDA_VNI] = BUILD_POLICY(U32), + [NDA_IFINDEX] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_neigh); + +static const NLAPolicy rtnl_addrlabel_policies[] = { + [IFAL_ADDRESS] = BUILD_POLICY_WITH_SIZE(IN_ADDR, sizeof(struct in6_addr)), + [IFAL_LABEL] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_addrlabel); + +static const NLAPolicy rtnl_routing_policy_rule_policies[] = { + [FRA_DST] = BUILD_POLICY(IN_ADDR), + [FRA_SRC] = BUILD_POLICY(IN_ADDR), + [FRA_IIFNAME] = 
BUILD_POLICY(STRING), + [FRA_GOTO] = BUILD_POLICY(U32), + [FRA_PRIORITY] = BUILD_POLICY(U32), + [FRA_FWMARK] = BUILD_POLICY(U32), + [FRA_FLOW] = BUILD_POLICY(U32), + [FRA_TUN_ID] = BUILD_POLICY(U64), + [FRA_SUPPRESS_IFGROUP] = BUILD_POLICY(U32), + [FRA_SUPPRESS_PREFIXLEN] = BUILD_POLICY(U32), + [FRA_TABLE] = BUILD_POLICY(U32), + [FRA_FWMASK] = BUILD_POLICY(U32), + [FRA_OIFNAME] = BUILD_POLICY(STRING), + [FRA_PAD] = BUILD_POLICY(U32), + [FRA_L3MDEV] = BUILD_POLICY(U8), + [FRA_UID_RANGE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct fib_rule_uid_range)), + [FRA_PROTOCOL] = BUILD_POLICY(U8), + [FRA_IP_PROTO] = BUILD_POLICY(U8), + [FRA_SPORT_RANGE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct fib_rule_port_range)), + [FRA_DPORT_RANGE] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct fib_rule_port_range)), +}; + +DEFINE_POLICY_SET(rtnl_routing_policy_rule); + +static const NLAPolicy rtnl_nexthop_policies[] = { + [NHA_ID] = BUILD_POLICY(U32), + [NHA_GROUP] = { /* array of struct nexthop_grp */ }, + [NHA_GROUP_TYPE] = BUILD_POLICY(U16), + [NHA_BLACKHOLE] = BUILD_POLICY(FLAG), + [NHA_OIF] = BUILD_POLICY(U32), + [NHA_GATEWAY] = BUILD_POLICY(IN_ADDR), + [NHA_ENCAP_TYPE] = BUILD_POLICY(U16), + [NHA_ENCAP] = { .type = NETLINK_TYPE_NESTED }, + [NHA_GROUPS] = BUILD_POLICY(FLAG), + [NHA_MASTER] = BUILD_POLICY(U32), + [NHA_FDB] = BUILD_POLICY(FLAG), +}; + +DEFINE_POLICY_SET(rtnl_nexthop); + +static const NLAPolicy rtnl_tca_option_data_cake_policies[] = { + [TCA_CAKE_BASE_RATE64] = BUILD_POLICY(U64), + [TCA_CAKE_DIFFSERV_MODE] = BUILD_POLICY(U32), + [TCA_CAKE_ATM] = BUILD_POLICY(U32), + [TCA_CAKE_FLOW_MODE] = BUILD_POLICY(U32), + [TCA_CAKE_OVERHEAD] = BUILD_POLICY(S32), + [TCA_CAKE_RTT] = BUILD_POLICY(U32), + [TCA_CAKE_TARGET] = BUILD_POLICY(U32), + [TCA_CAKE_AUTORATE] = BUILD_POLICY(U32), + [TCA_CAKE_MEMORY] = BUILD_POLICY(U32), + [TCA_CAKE_NAT] = BUILD_POLICY(U32), + [TCA_CAKE_RAW] = BUILD_POLICY(U32), + [TCA_CAKE_WASH] = BUILD_POLICY(U32), + [TCA_CAKE_MPU] = BUILD_POLICY(U32), 
+ [TCA_CAKE_INGRESS] = BUILD_POLICY(U32), + [TCA_CAKE_ACK_FILTER] = BUILD_POLICY(U32), + [TCA_CAKE_SPLIT_GSO] = BUILD_POLICY(U32), + [TCA_CAKE_FWMARK] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_codel_policies[] = { + [TCA_CODEL_TARGET] = BUILD_POLICY(U32), + [TCA_CODEL_LIMIT] = BUILD_POLICY(U32), + [TCA_CODEL_INTERVAL] = BUILD_POLICY(U32), + [TCA_CODEL_ECN] = BUILD_POLICY(U32), + [TCA_CODEL_CE_THRESHOLD] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_drr_policies[] = { + [TCA_DRR_QUANTUM] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_ets_quanta_policies[] = { + [TCA_ETS_QUANTA_BAND] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_tca_option_data_ets_quanta); + +static const NLAPolicy rtnl_tca_option_data_ets_prio_policies[] = { + [TCA_ETS_PRIOMAP_BAND] = BUILD_POLICY(U8), +}; + +DEFINE_POLICY_SET(rtnl_tca_option_data_ets_prio); + +static const NLAPolicy rtnl_tca_option_data_ets_policies[] = { + [TCA_ETS_NBANDS] = BUILD_POLICY(U8), + [TCA_ETS_NSTRICT] = BUILD_POLICY(U8), + [TCA_ETS_QUANTA] = BUILD_POLICY_NESTED(rtnl_tca_option_data_ets_quanta), + [TCA_ETS_PRIOMAP] = BUILD_POLICY_NESTED(rtnl_tca_option_data_ets_prio), + [TCA_ETS_QUANTA_BAND] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_fq_policies[] = { + [TCA_FQ_PLIMIT] = BUILD_POLICY(U32), + [TCA_FQ_FLOW_PLIMIT] = BUILD_POLICY(U32), + [TCA_FQ_QUANTUM] = BUILD_POLICY(U32), + [TCA_FQ_INITIAL_QUANTUM] = BUILD_POLICY(U32), + [TCA_FQ_RATE_ENABLE] = BUILD_POLICY(U32), + [TCA_FQ_FLOW_DEFAULT_RATE] = BUILD_POLICY(U32), + [TCA_FQ_FLOW_MAX_RATE] = BUILD_POLICY(U32), + [TCA_FQ_BUCKETS_LOG] = BUILD_POLICY(U32), + [TCA_FQ_FLOW_REFILL_DELAY] = BUILD_POLICY(U32), + [TCA_FQ_LOW_RATE_THRESHOLD] = BUILD_POLICY(U32), + [TCA_FQ_CE_THRESHOLD] = BUILD_POLICY(U32), + [TCA_FQ_ORPHAN_MASK] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_fq_codel_policies[] = { + [TCA_FQ_CODEL_TARGET] = BUILD_POLICY(U32), + 
[TCA_FQ_CODEL_LIMIT] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_INTERVAL] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_ECN] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_FLOWS] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_QUANTUM] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_CE_THRESHOLD] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_DROP_BATCH_SIZE] = BUILD_POLICY(U32), + [TCA_FQ_CODEL_MEMORY_LIMIT] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_fq_pie_policies[] = { + [TCA_FQ_PIE_LIMIT] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_gred_policies[] = { + [TCA_GRED_DPS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct tc_gred_sopt)), +}; + +static const NLAPolicy rtnl_tca_option_data_hhf_policies[] = { + [TCA_HHF_BACKLOG_LIMIT] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_htb_policies[] = { + [TCA_HTB_PARMS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct tc_htb_opt)), + [TCA_HTB_INIT] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct tc_htb_glob)), + [TCA_HTB_CTAB] = BUILD_POLICY_WITH_SIZE(BINARY, TC_RTAB_SIZE), + [TCA_HTB_RTAB] = BUILD_POLICY_WITH_SIZE(BINARY, TC_RTAB_SIZE), + [TCA_HTB_RATE64] = BUILD_POLICY(U64), + [TCA_HTB_CEIL64] = BUILD_POLICY(U64), +}; + +static const NLAPolicy rtnl_tca_option_data_pie_policies[] = { + [TCA_PIE_LIMIT] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_qfq_policies[] = { + [TCA_QFQ_WEIGHT] = BUILD_POLICY(U32), + [TCA_QFQ_LMAX] = BUILD_POLICY(U32), +}; + +static const NLAPolicy rtnl_tca_option_data_sfb_policies[] = { + [TCA_SFB_PARMS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct tc_sfb_qopt)), +}; + +static const NLAPolicy rtnl_tca_option_data_tbf_policies[] = { + [TCA_TBF_PARMS] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct tc_tbf_qopt)), + [TCA_TBF_RTAB] = BUILD_POLICY_WITH_SIZE(BINARY, TC_RTAB_SIZE), + [TCA_TBF_PTAB] = BUILD_POLICY_WITH_SIZE(BINARY, TC_RTAB_SIZE), + [TCA_TBF_RATE64] = BUILD_POLICY(U64), + [TCA_TBF_PRATE64] = BUILD_POLICY(U64), + [TCA_TBF_BURST] = 
BUILD_POLICY(U32), + [TCA_TBF_PBURST] = BUILD_POLICY(U32), +}; + +static const NLAPolicySetUnionElement rtnl_tca_option_data_policy_set_union_elements[] = { + BUILD_UNION_ELEMENT_BY_STRING("cake", rtnl_tca_option_data_cake), + BUILD_UNION_ELEMENT_BY_STRING("codel", rtnl_tca_option_data_codel), + BUILD_UNION_ELEMENT_BY_STRING("drr", rtnl_tca_option_data_drr), + BUILD_UNION_ELEMENT_BY_STRING("ets", rtnl_tca_option_data_ets), + BUILD_UNION_ELEMENT_BY_STRING("fq", rtnl_tca_option_data_fq), + BUILD_UNION_ELEMENT_BY_STRING("fq_codel", rtnl_tca_option_data_fq_codel), + BUILD_UNION_ELEMENT_BY_STRING("fq_pie", rtnl_tca_option_data_fq_pie), + BUILD_UNION_ELEMENT_BY_STRING("gred", rtnl_tca_option_data_gred), + BUILD_UNION_ELEMENT_BY_STRING("hhf", rtnl_tca_option_data_hhf), + BUILD_UNION_ELEMENT_BY_STRING("htb", rtnl_tca_option_data_htb), + BUILD_UNION_ELEMENT_BY_STRING("pie", rtnl_tca_option_data_pie), + BUILD_UNION_ELEMENT_BY_STRING("qfq", rtnl_tca_option_data_qfq), + BUILD_UNION_ELEMENT_BY_STRING("sfb", rtnl_tca_option_data_sfb), + BUILD_UNION_ELEMENT_BY_STRING("tbf", rtnl_tca_option_data_tbf), +}; + +DEFINE_POLICY_SET_UNION(rtnl_tca_option_data, TCA_KIND); + +static const NLAPolicy rtnl_tca_policies[] = { + [TCA_KIND] = BUILD_POLICY(STRING), + [TCA_OPTIONS] = BUILD_POLICY_NESTED_UNION_BY_STRING(rtnl_tca_option_data), + [TCA_INGRESS_BLOCK] = BUILD_POLICY(U32), + [TCA_EGRESS_BLOCK] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(rtnl_tca); + +static const NLAPolicy rtnl_mdb_policies[] = { + [MDBA_SET_ENTRY] = BUILD_POLICY_WITH_SIZE(BINARY, sizeof(struct br_port_msg)), +}; + +DEFINE_POLICY_SET(rtnl_mdb); + +static const NLAPolicy rtnl_policies[] = { + [RTM_NEWLINK] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), + [RTM_DELLINK] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), + [RTM_GETLINK] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), + [RTM_SETLINK] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct 
ifinfomsg)), + [RTM_NEWLINKPROP] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), + [RTM_DELLINKPROP] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), + [RTM_GETLINKPROP] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_link, sizeof(struct ifinfomsg)), + [RTM_NEWADDR] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_address, sizeof(struct ifaddrmsg)), + [RTM_DELADDR] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_address, sizeof(struct ifaddrmsg)), + [RTM_GETADDR] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_address, sizeof(struct ifaddrmsg)), + [RTM_NEWROUTE] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_route, sizeof(struct rtmsg)), + [RTM_DELROUTE] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_route, sizeof(struct rtmsg)), + [RTM_GETROUTE] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_route, sizeof(struct rtmsg)), + [RTM_NEWNEIGH] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_neigh, sizeof(struct ndmsg)), + [RTM_DELNEIGH] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_neigh, sizeof(struct ndmsg)), + [RTM_GETNEIGH] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_neigh, sizeof(struct ndmsg)), + [RTM_NEWADDRLABEL] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_addrlabel, sizeof(struct ifaddrlblmsg)), + [RTM_DELADDRLABEL] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_addrlabel, sizeof(struct ifaddrlblmsg)), + [RTM_GETADDRLABEL] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_addrlabel, sizeof(struct ifaddrlblmsg)), + [RTM_NEWRULE] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_routing_policy_rule, sizeof(struct fib_rule_hdr)), + [RTM_DELRULE] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_routing_policy_rule, sizeof(struct fib_rule_hdr)), + [RTM_GETRULE] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_routing_policy_rule, sizeof(struct fib_rule_hdr)), + [RTM_NEWNEXTHOP] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_nexthop, sizeof(struct nhmsg)), + [RTM_DELNEXTHOP] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_nexthop, sizeof(struct nhmsg)), + [RTM_GETNEXTHOP] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_nexthop, sizeof(struct nhmsg)), + [RTM_NEWQDISC] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_tca, sizeof(struct 
tcmsg)), + [RTM_DELQDISC] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_tca, sizeof(struct tcmsg)), + [RTM_GETQDISC] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_tca, sizeof(struct tcmsg)), + [RTM_NEWTCLASS] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_tca, sizeof(struct tcmsg)), + [RTM_DELTCLASS] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_tca, sizeof(struct tcmsg)), + [RTM_GETTCLASS] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_tca, sizeof(struct tcmsg)), + [RTM_NEWMDB] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_mdb, sizeof(struct br_port_msg)), + [RTM_DELMDB] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_mdb, sizeof(struct br_port_msg)), + [RTM_GETMDB] = BUILD_POLICY_NESTED_WITH_SIZE(rtnl_mdb, sizeof(struct br_port_msg)), +}; + +DEFINE_POLICY_SET(rtnl); + +const NLAPolicy *rtnl_get_policy(uint16_t nlmsg_type) { + return policy_set_get_policy(&rtnl_policy_set, nlmsg_type); /* NULL when nlmsg_type is out of range or its slot is NETLINK_TYPE_UNSPEC */ +} diff --git a/src/libsystemd/sd-netlink/netlink-types.c b/src/libsystemd/sd-netlink/netlink-types.c new file mode 100644 index 0000000..21ef80c --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-types.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "netlink-genl.h" +#include "netlink-internal.h" +#include "netlink-types-internal.h" + +static const NLAPolicy empty_policies[1] = { + /* fake one-element array so that the policy set's .policies pointer is never NULL (policy_set_get_policy() asserts on that) */ +}; + +DEFINE_POLICY_SET(empty); + +static const NLAPolicy error_policies[] = { + [NLMSGERR_ATTR_MSG] = BUILD_POLICY(STRING), + [NLMSGERR_ATTR_OFFS] = BUILD_POLICY(U32), +}; + +DEFINE_POLICY_SET(error); + +static const NLAPolicy basic_policies[] = { /* consulted for NLMSG_DONE and NLMSG_ERROR by netlink_get_policy_set_and_header_size() */ + [NLMSG_DONE] = BUILD_POLICY_NESTED(empty), + [NLMSG_ERROR] = BUILD_POLICY_NESTED_WITH_SIZE(error, sizeof(struct nlmsgerr)), +}; + +DEFINE_POLICY_SET(basic); + +NLAType policy_get_type(const NLAPolicy *policy) { + return ASSERT_PTR(policy)->type; +} + +size_t policy_get_size(const NLAPolicy *policy) { + return ASSERT_PTR(policy)->size; +} + +const NLAPolicySet *policy_get_policy_set(const NLAPolicy
*policy) { + assert(policy); + assert(policy->type == NETLINK_TYPE_NESTED); + + return ASSERT_PTR(policy->policy_set); +} + +const NLAPolicySetUnion *policy_get_policy_set_union(const NLAPolicy *policy) { + assert(policy); + assert(IN_SET(policy->type, NETLINK_TYPE_NESTED_UNION_BY_STRING, NETLINK_TYPE_NESTED_UNION_BY_FAMILY)); + + return ASSERT_PTR(policy->policy_set_union); +} + +int netlink_get_policy_set_and_header_size( + sd_netlink *nl, + uint16_t type, + const NLAPolicySet **ret_policy_set, + size_t *ret_header_size) { + /* Resolve the policy set and header size for a message type. Except for NETLINK_GENERIC, which is delegated, every failure is -EOPNOTSUPP. */ + const NLAPolicy *policy; + + assert(nl); + + if (IN_SET(type, NLMSG_DONE, NLMSG_ERROR)) /* looked up in basic_policy_set whatever the protocol is */ + policy = policy_set_get_policy(&basic_policy_set, type); + else + switch (nl->protocol) { + case NETLINK_ROUTE: + policy = rtnl_get_policy(type); + break; + case NETLINK_NETFILTER: + policy = nfnl_get_policy(type); + break; + case NETLINK_GENERIC: + return genl_get_policy_set_and_header_size(nl, type, ret_policy_set, ret_header_size); /* lookup and both outputs are delegated */ + default: + return -EOPNOTSUPP; + } + if (!policy) + return -EOPNOTSUPP; + + if (policy_get_type(policy) != NETLINK_TYPE_NESTED) /* policy_get_policy_set() below asserts on this type */ + return -EOPNOTSUPP; + + if (ret_policy_set) /* both outputs are optional */ + *ret_policy_set = policy_get_policy_set(policy); + if (ret_header_size) + *ret_header_size = policy_get_size(policy); + return 0; +} + +const NLAPolicy *policy_set_get_policy(const NLAPolicySet *policy_set, uint16_t attr_type) { + const NLAPolicy *policy; + + assert(policy_set); + assert(policy_set->policies); + + if (attr_type >= policy_set->count) /* index is past the end of the table */ + return NULL; + + policy = &policy_set->policies[attr_type]; + + if (policy->type == NETLINK_TYPE_UNSPEC) /* slot carries no type, treat it as absent */ + return NULL; + + return policy; +} + +const NLAPolicySet *policy_set_get_policy_set(const NLAPolicySet *policy_set, uint16_t attr_type) { + const NLAPolicy *policy; + + policy = policy_set_get_policy(policy_set, attr_type); + if (!policy) + return NULL; + + return policy_get_policy_set(policy); /* asserts that the entry is NETLINK_TYPE_NESTED */ +} + +const NLAPolicySetUnion *policy_set_get_policy_set_union(const NLAPolicySet *policy_set,
uint16_t attr_type) { + const NLAPolicy *policy; + + policy = policy_set_get_policy(policy_set, attr_type); + if (!policy) + return NULL; + + return policy_get_policy_set_union(policy); +} + +uint16_t policy_set_union_get_match_attribute(const NLAPolicySetUnion *policy_set_union) { + assert(policy_set_union->match_attribute != 0); + + return policy_set_union->match_attribute; +} + +const NLAPolicySet *policy_set_union_get_policy_set_by_string(const NLAPolicySetUnion *policy_set_union, const char *string) { + assert(policy_set_union); + assert(policy_set_union->elements); + assert(string); + + for (size_t i = 0; i < policy_set_union->count; i++) + if (streq(policy_set_union->elements[i].string, string)) + return &policy_set_union->elements[i].policy_set; + + return NULL; +} + +const NLAPolicySet *policy_set_union_get_policy_set_by_family(const NLAPolicySetUnion *policy_set_union, int family) { + assert(policy_set_union); + assert(policy_set_union->elements); + + for (size_t i = 0; i < policy_set_union->count; i++) + if (policy_set_union->elements[i].family == family) + return &policy_set_union->elements[i].policy_set; + + return NULL; +} diff --git a/src/libsystemd/sd-netlink/netlink-types.h b/src/libsystemd/sd-netlink/netlink-types.h new file mode 100644 index 0000000..e034a98 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-types.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-netlink.h" + +typedef enum NLAType { + NETLINK_TYPE_UNSPEC, /* NLA_UNSPEC */ + NETLINK_TYPE_BINARY, /* NLA_BINARY */ + NETLINK_TYPE_FLAG, /* NLA_FLAG */ + NETLINK_TYPE_U8, /* NLA_U8 */ + NETLINK_TYPE_U16, /* NLA_U16 */ + NETLINK_TYPE_U32, /* NLA_U32 */ + NETLINK_TYPE_U64, /* NLA_U64 */ + NETLINK_TYPE_S8, /* NLA_S8 */ + NETLINK_TYPE_S16, /* NLA_S16 */ + NETLINK_TYPE_S32, /* NLA_S32 */ + NETLINK_TYPE_S64, /* NLA_S64 */ + NETLINK_TYPE_STRING, /* NLA_STRING */ + NETLINK_TYPE_BITFIELD32, /* NLA_BITFIELD32 */ + 
NETLINK_TYPE_REJECT, /* NLA_REJECT */ + NETLINK_TYPE_IN_ADDR, + NETLINK_TYPE_ETHER_ADDR, + NETLINK_TYPE_CACHE_INFO, + NETLINK_TYPE_SOCKADDR, + NETLINK_TYPE_NESTED, /* NLA_NESTED */ + NETLINK_TYPE_NESTED_UNION_BY_STRING, + NETLINK_TYPE_NESTED_UNION_BY_FAMILY, + _NETLINK_TYPE_MAX, + _NETLINK_TYPE_INVALID = -EINVAL, +} NLAType; + +typedef struct NLAPolicy NLAPolicy; +typedef struct NLAPolicySet NLAPolicySet; +typedef struct NLAPolicySetUnion NLAPolicySetUnion; + +const NLAPolicy *rtnl_get_policy(uint16_t nlmsg_type); +const NLAPolicy *nfnl_get_policy(uint16_t nlmsg_type); +const NLAPolicySet *genl_get_policy_set_by_name(const char *name); +int genl_get_policy_set_and_header_size( + sd_netlink *nl, + uint16_t id, + const NLAPolicySet **ret_policy_set, + size_t *ret_header_size); + +NLAType policy_get_type(const NLAPolicy *policy); +size_t policy_get_size(const NLAPolicy *policy); +const NLAPolicySet *policy_get_policy_set(const NLAPolicy *policy); +const NLAPolicySetUnion *policy_get_policy_set_union(const NLAPolicy *policy); + +int netlink_get_policy_set_and_header_size( + sd_netlink *nl, + uint16_t type, + const NLAPolicySet **ret_policy_set, + size_t *ret_header_size); + +const NLAPolicy *policy_set_get_policy(const NLAPolicySet *policy_set, uint16_t attr_type); /* NULL if attr_type is out of range or the slot is NETLINK_TYPE_UNSPEC */ +const NLAPolicySet *policy_set_get_policy_set(const NLAPolicySet *policy_set, uint16_t attr_type); /* NULL if there is no such entry */ +const NLAPolicySetUnion *policy_set_get_policy_set_union(const NLAPolicySet *policy_set, uint16_t attr_type); /* NULL if there is no such entry */ +uint16_t policy_set_union_get_match_attribute(const NLAPolicySetUnion *policy_set_union); +const NLAPolicySet *policy_set_union_get_policy_set_by_string(const NLAPolicySetUnion *policy_set_union, const char *string); /* NULL if no element matches */ +const NLAPolicySet *policy_set_union_get_policy_set_by_family(const NLAPolicySetUnion *policy_set_union, int family); /* NULL if no element matches */ diff --git a/src/libsystemd/sd-netlink/netlink-util.c b/src/libsystemd/sd-netlink/netlink-util.c new file mode 100644 index 0000000..832159a --- /dev/null +++
b/src/libsystemd/sd-netlink/netlink-util.c @@ -0,0 +1,818 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-netlink.h" + +#include "fd-util.h" +#include "iovec-util.h" +#include "memory-util.h" +#include "netlink-internal.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "strv.h" +/* Sends one RTM_SETLINK request carrying 'name' as IFLA_IFNAME. Unlike the public helpers below, *rtnl is used as-is and is not opened here. */ +static int set_link_name(sd_netlink **rtnl, int ifindex, const char *name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + int r; + + assert(rtnl); + assert(ifindex > 0); + assert(name); + + /* Assign the requested name. */ + r = sd_rtnl_message_new_link(*rtnl, &message, RTM_SETLINK, ifindex); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(message, IFLA_IFNAME, name); + if (r < 0) + return r; + + return sd_netlink_call(*rtnl, message, 0, NULL); +} +/* Renames the interface to 'name' (if non-NULL) and then assigns those requested alternative names that are valid and not assigned yet. Returns 0 on success or if there is nothing to do. A failing rename (or allocation) is returned as negative errno; failures while assigning alternative names are only logged. */ +int rtnl_set_link_name(sd_netlink **rtnl, int ifindex, const char *name, char* const *alternative_names) { + _cleanup_strv_free_ char **original_altnames = NULL, **new_altnames = NULL; + bool altname_deleted = false; + int r; + + assert(rtnl); + assert(ifindex > 0); + + if (isempty(name) && strv_isempty(alternative_names)) + return 0; + + if (name && !ifname_valid(name)) + return -EINVAL; + + /* If the requested name is already assigned as an alternative name, then first drop it. */ + r = rtnl_get_link_alternative_names(rtnl, ifindex, &original_altnames); + if (r < 0) + log_debug_errno(r, "Failed to get alternative names on network interface %i, ignoring: %m", + ifindex); + + if (name) { + if (strv_contains(original_altnames, name)) { + r = rtnl_delete_link_alternative_names(rtnl, ifindex, STRV_MAKE(name)); + if (r < 0) + return log_debug_errno(r, "Failed to remove '%s' from alternative names on network interface %i: %m", + name, ifindex); + + altname_deleted = true; + } + + r = set_link_name(rtnl, ifindex, name); + if (r < 0) + goto fail; + } + + /* Filter out already assigned names from requested alternative names.
Also, dedup the request. */ + STRV_FOREACH(a, alternative_names) { + if (streq_ptr(name, *a)) + continue; + + if (strv_contains(original_altnames, *a)) + continue; + + if (strv_contains(new_altnames, *a)) + continue; + + if (!ifname_valid_full(*a, IFNAME_VALID_ALTERNATIVE)) + continue; + + r = strv_extend(&new_altnames, *a); + if (r < 0) + return r; + } + + strv_sort(new_altnames); + + /* Finally, assign alternative names. */ + r = rtnl_set_link_alternative_names(rtnl, ifindex, new_altnames); + if (r == -EEXIST) /* Already assigned to another interface? */ + STRV_FOREACH(a, new_altnames) { + r = rtnl_set_link_alternative_names(rtnl, ifindex, STRV_MAKE(*a)); + if (r < 0) + log_debug_errno(r, "Failed to assign '%s' as an alternative name on network interface %i, ignoring: %m", + *a, ifindex); + } + else if (r < 0) + log_debug_errno(r, "Failed to assign alternative names on network interface %i, ignoring: %m", ifindex); + + return 0; + +fail: + if (altname_deleted) { + int q = rtnl_set_link_alternative_names(rtnl, ifindex, STRV_MAKE(name)); + if (q < 0) + log_debug_errno(q, "Failed to restore '%s' as an alternative name on network interface %i, ignoring: %m", + name, ifindex); + } + + return r; +} +/* Applies the given properties with a single RTM_SETLINK request, opening *rtnl if it is NULL. A property is skipped when it has its "unset" value: NULL alias, NULL or zero-length hw_addr, UINT32_MAX for txqueuelen and 0 for everything else. Returns 0 without sending anything if all properties are unset. */ +int rtnl_set_link_properties( + sd_netlink **rtnl, + int ifindex, + const char *alias, + const struct hw_addr_data *hw_addr, + uint32_t txqueues, + uint32_t rxqueues, + uint32_t txqueuelen, + uint32_t mtu, + uint32_t gso_max_size, + size_t gso_max_segments) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + int r; + + assert(rtnl); + assert(ifindex > 0); + + if (!alias && + (!hw_addr || hw_addr->length == 0) && + txqueues == 0 && + rxqueues == 0 && + txqueuelen == UINT32_MAX && + mtu == 0 && + gso_max_size == 0 && + gso_max_segments == 0) + return 0; + + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &message, RTM_SETLINK, ifindex); + if (r < 0) + return r; + + if (alias) { + r =
sd_netlink_message_append_string(message, IFLA_IFALIAS, alias); + if (r < 0) + return r; + } + + if (hw_addr && hw_addr->length > 0) { + r = netlink_message_append_hw_addr(message, IFLA_ADDRESS, hw_addr); + if (r < 0) + return r; + } + + if (txqueues > 0) { + r = sd_netlink_message_append_u32(message, IFLA_NUM_TX_QUEUES, txqueues); + if (r < 0) + return r; + } + + if (rxqueues > 0) { + r = sd_netlink_message_append_u32(message, IFLA_NUM_RX_QUEUES, rxqueues); + if (r < 0) + return r; + } + + if (txqueuelen < UINT32_MAX) { + r = sd_netlink_message_append_u32(message, IFLA_TXQLEN, txqueuelen); + if (r < 0) + return r; + } + + if (mtu != 0) { + r = sd_netlink_message_append_u32(message, IFLA_MTU, mtu); + if (r < 0) + return r; + } + + if (gso_max_size > 0) { + r = sd_netlink_message_append_u32(message, IFLA_GSO_MAX_SIZE, gso_max_size); + if (r < 0) + return r; + } + + if (gso_max_segments > 0) { + r = sd_netlink_message_append_u32(message, IFLA_GSO_MAX_SEGS, gso_max_segments); + if (r < 0) + return r; + } + + r = sd_netlink_call(*rtnl, message, 0, NULL); + if (r < 0) + return r; + + return 0; +} +/* Queries the link with RTM_GETLINK and hands the IFLA_ALT_IFNAME entries of IFLA_PROP_LIST to the caller as a strv in *ret (caller frees). -ENODATA from the read is not treated as an error. Opens *rtnl if it is NULL. */ +int rtnl_get_link_alternative_names(sd_netlink **rtnl, int ifindex, char ***ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL, *reply = NULL; + _cleanup_strv_free_ char **names = NULL; + int r; + + assert(rtnl); + assert(ifindex > 0); + assert(ret); + + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &message, RTM_GETLINK, ifindex); + if (r < 0) + return r; + + r = sd_netlink_call(*rtnl, message, 0, &reply); + if (r < 0) + return r; + + r = sd_netlink_message_read_strv(reply, IFLA_PROP_LIST, IFLA_ALT_IFNAME, &names); + if (r < 0 && r != -ENODATA) + return r; + + *ret = TAKE_PTR(names); + + return 0; +} +/* Common implementation for adding (RTM_NEWLINKPROP) and removing (RTM_DELLINKPROP) alternative names; a no-op for an empty list. Opens *rtnl if it is NULL. */ +static int rtnl_update_link_alternative_names( + sd_netlink **rtnl, + uint16_t nlmsg_type, + int ifindex, + char* const *alternative_names) { + +
_cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + int r; + + assert(rtnl); + assert(ifindex > 0); + assert(IN_SET(nlmsg_type, RTM_NEWLINKPROP, RTM_DELLINKPROP)); + + if (strv_isempty(alternative_names)) + return 0; + + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &message, nlmsg_type, ifindex); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(message, IFLA_PROP_LIST); + if (r < 0) + return r; + + r = sd_netlink_message_append_strv(message, IFLA_ALT_IFNAME, (const char**) alternative_names); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(message); + if (r < 0) + return r; + + r = sd_netlink_call(*rtnl, message, 0, NULL); + if (r < 0) + return r; + + return 0; +} +/* Adds the given alternative names to the link identified by ifindex. */ +int rtnl_set_link_alternative_names(sd_netlink **rtnl, int ifindex, char* const *alternative_names) { + return rtnl_update_link_alternative_names(rtnl, RTM_NEWLINKPROP, ifindex, alternative_names); +} +/* Removes the given alternative names from the link identified by ifindex. */ +int rtnl_delete_link_alternative_names(sd_netlink **rtnl, int ifindex, char* const *alternative_names) { + return rtnl_update_link_alternative_names(rtnl, RTM_DELLINKPROP, ifindex, alternative_names); +} +/* Like rtnl_set_link_alternative_names(), but the link is identified by its name (IFLA_IFNAME) and the request is sent with ifindex 0. */ +int rtnl_set_link_alternative_names_by_ifname( + sd_netlink **rtnl, + const char *ifname, + char* const *alternative_names) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + int r; + + assert(rtnl); + assert(ifname); + + if (strv_isempty(alternative_names)) + return 0; + + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &message, RTM_NEWLINKPROP, 0); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(message, IFLA_IFNAME, ifname); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(message, IFLA_PROP_LIST); + if (r < 0) + return r; + + r = sd_netlink_message_append_strv(message, IFLA_ALT_IFNAME, (const char**) alternative_names); + if (r < 0) + return
r; + + r = sd_netlink_message_close_container(message); + if (r < 0) + return r; + + r = sd_netlink_call(*rtnl, message, 0, NULL); + if (r < 0) + return r; + + return 0; +} +/* Looks up a link by alternative name. 'rtnl' itself may be NULL, in which case a temporary connection is used. -EINVAL from the kernel is reported as -ENODEV. 'ret' is optional. */ +int rtnl_resolve_link_alternative_name(sd_netlink **rtnl, const char *name, char **ret) { + _cleanup_(sd_netlink_unrefp) sd_netlink *our_rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL, *reply = NULL; + int r, ifindex; + + assert(name); + + /* This returns ifindex and the main interface name. */ + + if (!ifname_valid_full(name, IFNAME_VALID_ALTERNATIVE)) + return -EINVAL; + + if (!rtnl) + rtnl = &our_rtnl; + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &message, RTM_GETLINK, 0); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(message, IFLA_ALT_IFNAME, name); + if (r < 0) + return r; + + r = sd_netlink_call(*rtnl, message, 0, &reply); + if (r == -EINVAL) + return -ENODEV; /* The device doesn't exist */ + if (r < 0) + return r; + + r = sd_rtnl_message_link_get_ifindex(reply, &ifindex); + if (r < 0) + return r; + assert(ifindex > 0); + + if (ret) { + r = sd_netlink_message_read_string_strdup(reply, IFLA_IFNAME, ret); + if (r < 0) + return r; + } + + return ifindex; +} +/* Returns the (positive) ifindex for 'name', or a negative errno. */ +int rtnl_resolve_ifname(sd_netlink **rtnl, const char *name) { + int r; + + /* Like if_nametoindex, but resolves "alternative names" too. */ + + assert(name); + + r = if_nametoindex(name); + if (r > 0) + return r; + + return rtnl_resolve_link_alternative_name(rtnl, name, NULL); +} +/* Returns the (positive) ifindex for 'name', which may be a number, a name or an alternative name. */ +int rtnl_resolve_interface(sd_netlink **rtnl, const char *name) { + int r; + + /* Like rtnl_resolve_ifname, but resolves interface numbers too.
*/ + + assert(name); + + r = parse_ifindex(name); + if (r > 0) + return r; + assert(r < 0); + + return rtnl_resolve_ifname(rtnl, name); +} +/* Same as rtnl_resolve_interface(), but logs at error level on failure. */ +int rtnl_resolve_interface_or_warn(sd_netlink **rtnl, const char *name) { + int r; + + r = rtnl_resolve_interface(rtnl, name); + if (r < 0) + return log_error_errno(r, "Failed to resolve interface \"%s\": %m", name); + return r; +} +/* Fetches selected properties of a link with one RTM_GETLINK request, opening *rtnl if it is NULL. Every ret_* pointer is optional: only the requested fields are read, and the outputs are written only after all reads succeeded. -EINVAL from the kernel is reported as -ENODEV. */ +int rtnl_get_link_info( + sd_netlink **rtnl, + int ifindex, + unsigned short *ret_iftype, + unsigned *ret_flags, + char **ret_kind, + struct hw_addr_data *ret_hw_addr, + struct hw_addr_data *ret_permanent_hw_addr) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL, *reply = NULL; + struct hw_addr_data addr = HW_ADDR_NULL, perm_addr = HW_ADDR_NULL; + _cleanup_free_ char *kind = NULL; + unsigned short iftype; + unsigned flags; + int r; + + assert(rtnl); + assert(ifindex > 0); + + if (!ret_iftype && !ret_flags && !ret_kind && !ret_hw_addr && !ret_permanent_hw_addr) /* Skip the request only if nothing at all was asked for; otherwise requested outputs would be left unset while returning success. */ + return 0; + + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &message, RTM_GETLINK, ifindex); + if (r < 0) + return r; + + r = sd_netlink_call(*rtnl, message, 0, &reply); + if (r == -EINVAL) + return -ENODEV; /* The device does not exist */ + if (r < 0) + return r; + + if (ret_iftype) { + r = sd_rtnl_message_link_get_type(reply, &iftype); + if (r < 0) + return r; + } + + if (ret_flags) { + r = sd_rtnl_message_link_get_flags(reply, &flags); + if (r < 0) + return r; + } + + if (ret_kind) { + r = sd_netlink_message_enter_container(reply, IFLA_LINKINFO); + if (r >= 0) { + r = sd_netlink_message_read_string_strdup(reply, IFLA_INFO_KIND, &kind); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_netlink_message_exit_container(reply); + if (r < 0) + return r; + } + } + + if (ret_hw_addr) { + r = netlink_message_read_hw_addr(reply, IFLA_ADDRESS, &addr); + if (r < 0 && r != -ENODATA) + return r; + } + + if (ret_permanent_hw_addr) { + r = netlink_message_read_hw_addr(reply, IFLA_PERM_ADDRESS, &perm_addr); +
if (r < 0 && r != -ENODATA) + return r; + } + + if (ret_iftype) + *ret_iftype = iftype; + if (ret_flags) + *ret_flags = flags; + if (ret_kind) + *ret_kind = TAKE_PTR(kind); + if (ret_hw_addr) + *ret_hw_addr = addr; + if (ret_permanent_hw_addr) + *ret_permanent_hw_addr = perm_addr; + return 0; +} +/* Logs 'r' as a netlink parse failure at error level and passes on the result of log_error_errno(). */ +int rtnl_log_parse_error(int r) { + return log_error_errno(r, "Failed to parse netlink message: %m"); +} +/* Logs 'r' as a netlink message creation failure at error level and passes on the result of log_error_errno(). */ +int rtnl_log_create_error(int r) { + return log_error_errno(r, "Failed to create netlink message: %m"); +} +/* Writes one attribute header (and the payload, if 'data' is given) at 'rta' and zeroes everything from the end of the copied payload up to RTA_SPACE(data_length). The caller must provide at least RTA_SPACE(data_length) bytes at 'rta'. */ +void rtattr_append_attribute_internal(struct rtattr *rta, unsigned short type, const void *data, size_t data_length) { + size_t padding_length; + uint8_t *padding; + + assert(rta); + assert(!data || data_length > 0); + + /* fill in the attribute */ + rta->rta_type = type; + rta->rta_len = RTA_LENGTH(data_length); + if (data) + /* we don't deal with the case where the user lies about the type + * and gives us too little data (so don't do that) + */ + padding = mempcpy(RTA_DATA(rta), data, data_length); + + else + /* if no data was passed, make sure we still initialize the padding + note that we can have data_length > 0 (used by some containers) */ + padding = RTA_DATA(rta); + + /* make sure also the padding at the end of the message is initialized */ + padding_length = (uint8_t *) rta + RTA_SPACE(data_length) - padding; + memzero(padding, padding_length); +} +/* Grows the heap buffer *rta and appends one attribute after its existing (aligned) contents, then sets (*rta)->rta_len to the new total. Returns -ENOBUFS if the result would exceed MIN(page_size(), 8192) bytes and -ENOMEM if realloc() fails, in both cases leaving *rta untouched. NOTE(review): with *rta == NULL the new attribute lands at offset 0 and its rta_len is then overwritten with the padded total - confirm that this is what callers expect. */ +int rtattr_append_attribute(struct rtattr **rta, unsigned short type, const void *data, size_t data_length) { + struct rtattr *new_rta, *sub_rta; + size_t message_length; + + assert(rta); + assert(!data || data_length > 0); + + /* get the new message size (with padding at the end) */ + message_length = RTA_ALIGN(*rta ?
(*rta)->rta_len : 0) + RTA_SPACE(data_length); + + /* buffer should be smaller than both one page or 8K to be accepted by the kernel */ + if (message_length > MIN(page_size(), 8192UL)) + return -ENOBUFS; + + /* realloc to fit the new attribute */ + new_rta = realloc(*rta, message_length); + if (!new_rta) + return -ENOMEM; + *rta = new_rta; + + /* get pointer to the attribute we are about to add; the offset is derived from message_length instead of being re-read from the buffer, which is uninitialized if *rta was NULL before the realloc() */ + sub_rta = (struct rtattr *) ((uint8_t *) *rta + (message_length - RTA_SPACE(data_length))); + + rtattr_append_attribute_internal(sub_rta, type, data, data_length); + + /* update rta_len */ + (*rta)->rta_len = message_length; + + return 0; +} +/* Releases 'm' together with the ifname string it owns; NULL is accepted. */ +MultipathRoute *multipath_route_free(MultipathRoute *m) { + if (!m) + return NULL; + + free(m->ifname); + + return mfree(m); +} +/* Deep-copies 'm' (including ifname) into a newly allocated object returned in *ret. Returns -ENOMEM on allocation failure. */ +int multipath_route_dup(const MultipathRoute *m, MultipathRoute **ret) { + _cleanup_(multipath_route_freep) MultipathRoute *n = NULL; + _cleanup_free_ char *ifname = NULL; + + assert(m); + assert(ret); + + if (m->ifname) { + ifname = strdup(m->ifname); + if (!ifname) + return -ENOMEM; + } + + n = new(MultipathRoute, 1); + if (!n) + return -ENOMEM; + + *n = (MultipathRoute) { + .gateway = m->gateway, + .weight = m->weight, + .ifindex = m->ifindex, + .ifname = TAKE_PTR(ifname), + }; + + *ret = TAKE_PTR(n); + + return 0; +} +/* Parses 'size' bytes of consecutive struct rtnexthop entries into an OrderedSet of MultipathRoute objects. For each entry the first RTA_GATEWAY or RTA_VIA attribute provides the gateway; RTA_VIA is accepted only for AF_INET routes with an AF_INET6 gateway. Returns -EBADMSG on malformed input. 'ret' may be NULL, in which case the parsed set is discarded. */ +int rtattr_read_nexthop(const struct rtnexthop *rtnh, size_t size, int family, OrderedSet **ret) { + _cleanup_ordered_set_free_free_ OrderedSet *set = NULL; + int r; + + assert(rtnh); + assert(IN_SET(family, AF_INET, AF_INET6)); + + if (size < sizeof(struct rtnexthop)) + return -EBADMSG; + + for (; size >= sizeof(struct rtnexthop); ) { + _cleanup_(multipath_route_freep) MultipathRoute *m = NULL; + + if (NLMSG_ALIGN(rtnh->rtnh_len) > size) + return -EBADMSG; + + if (rtnh->rtnh_len < sizeof(struct rtnexthop)) + return -EBADMSG; + + m = new(MultipathRoute, 1); + if (!m) + return -ENOMEM; + + *m = (MultipathRoute) { + .ifindex = rtnh->rtnh_ifindex, + .weight = rtnh->rtnh_hops, + }; + + if
(rtnh->rtnh_len > sizeof(struct rtnexthop)) { + size_t len = rtnh->rtnh_len - sizeof(struct rtnexthop); + + for (struct rtattr *attr = RTNH_DATA(rtnh); RTA_OK(attr, len); attr = RTA_NEXT(attr, len)) { + if (attr->rta_type == RTA_GATEWAY) { + if (attr->rta_len != RTA_LENGTH(FAMILY_ADDRESS_SIZE(family))) + return -EBADMSG; + + m->gateway.family = family; + memcpy(&m->gateway.address, RTA_DATA(attr), FAMILY_ADDRESS_SIZE(family)); + break; + } else if (attr->rta_type == RTA_VIA) { + uint16_t gw_family; + + if (family != AF_INET) + return -EINVAL; + + if (attr->rta_len < RTA_LENGTH(sizeof(uint16_t))) + return -EBADMSG; + + gw_family = *(uint16_t *) RTA_DATA(attr); + + if (gw_family != AF_INET6) + return -EBADMSG; + + if (attr->rta_len != RTA_LENGTH(FAMILY_ADDRESS_SIZE(gw_family) + sizeof(gw_family))) + return -EBADMSG; + + memcpy(&m->gateway, RTA_DATA(attr), FAMILY_ADDRESS_SIZE(gw_family) + sizeof(gw_family)); + break; + } + } + } + + r = ordered_set_ensure_put(&set, NULL, m); + if (r < 0) + return r; + + TAKE_PTR(m); + + size -= NLMSG_ALIGN(rtnh->rtnh_len); + rtnh = RTNH_NEXT(rtnh); + } + + if (ret) + *ret = TAKE_PTR(set); + return 0; +} +/* Returns true if the calling process is not the one recorded in nl->original_pid. */ +bool netlink_pid_changed(sd_netlink *nl) { + /* We don't support people creating an nl connection and + * keeping it around over a fork(). Let's complain.
*/ + return ASSERT_PTR(nl)->original_pid != getpid_cached(); +} +/* Creates a non-blocking, close-on-exec raw AF_NETLINK socket for the given netlink protocol and passes it through fd_move_above_stdio(). Returns the fd or a negative errno. */ +static int socket_open(int family) { + int fd; + + fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, family); + if (fd < 0) + return -errno; + + return fd_move_above_stdio(fd); +} +/* Opens a netlink socket for the given protocol and wraps it in a new sd_netlink object; the fd is closed again if wrapping fails. */ +int netlink_open_family(sd_netlink **ret, int family) { + _cleanup_close_ int fd = -EBADF; + int r; + + fd = socket_open(family); + if (fd < 0) + return fd; + + r = sd_netlink_open_fd(ret, fd); + if (r < 0) + return r; + TAKE_FD(fd); + + return 0; +} +/* Returns true if 'serial' belongs to a pending reply callback or to a queued (complete or partial) reply. */ +static bool serial_used(sd_netlink *nl, uint32_t serial) { + assert(nl); + + return + hashmap_contains(nl->reply_callbacks, UINT32_TO_PTR(serial)) || + hashmap_contains(nl->rqueue_by_serial, UINT32_TO_PTR(serial)) || + hashmap_contains(nl->rqueue_partial_by_serial, UINT32_TO_PTR(serial)); +} +/* Assigns the next unused sequence number of 'nl' to 'm' and seals the message. */ +void netlink_seal_message(sd_netlink *nl, sd_netlink_message *m) { + uint32_t picked; + + assert(nl); + assert(!netlink_pid_changed(nl)); + assert(m); + assert(m->hdr); + + /* Avoid collisions with outstanding requests */ + do { + picked = nl->serial; + + /* Don't use seq == 0, as that is used for broadcasts, so we would get confused by replies to + such messages */ + nl->serial = nl->serial == UINT32_MAX ?
1 : nl->serial + 1; + + } while (serial_used(nl, picked)); + + m->hdr->nlmsg_seq = picked; + message_seal(m); +} +/* Writes all messages with a single writev() call. Returns the number of bytes written or a negative errno. */ +static int socket_writev_message(sd_netlink *nl, sd_netlink_message **m, size_t msgcount) { + _cleanup_free_ struct iovec *iovs = NULL; + ssize_t k; + + assert(nl); + assert(m); + assert(msgcount > 0); + + iovs = new(struct iovec, msgcount); + if (!iovs) + return -ENOMEM; + + for (size_t i = 0; i < msgcount; i++) { + assert(m[i]->hdr); + assert(m[i]->hdr->nlmsg_len > 0); + + iovs[i] = IOVEC_MAKE(m[i]->hdr, m[i]->hdr->nlmsg_len); + } + + k = writev(nl->fd, iovs, msgcount); + if (k < 0) + return -errno; + + return k; +} +/* Seals and sends several messages at once. If ret_serial is non-NULL it receives a newly allocated array holding the serial of each message (caller frees). Returns the result of socket_writev_message(), i.e. the byte count or a negative errno. */ +int sd_netlink_sendv( + sd_netlink *nl, + sd_netlink_message **messages, + size_t msgcount, + uint32_t **ret_serial) { + + _cleanup_free_ uint32_t *serials = NULL; + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + assert_return(messages, -EINVAL); + assert_return(msgcount > 0, -EINVAL); + + if (ret_serial) { + serials = new(uint32_t, msgcount); + if (!serials) + return -ENOMEM; + } + + for (size_t i = 0; i < msgcount; i++) { + assert_return(!messages[i]->sealed, -EPERM); + + netlink_seal_message(nl, messages[i]); + if (serials) + serials[i] = message_get_serial(messages[i]); + } + + r = socket_writev_message(nl, messages, msgcount); + if (r < 0) + return r; + + if (ret_serial) + *ret_serial = TAKE_PTR(serials); + + return r; +} diff --git a/src/libsystemd/sd-netlink/netlink-util.h b/src/libsystemd/sd-netlink/netlink-util.h new file mode 100644 index 0000000..369f5d5 --- /dev/null +++ b/src/libsystemd/sd-netlink/netlink-util.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <linux/rtnetlink.h> + +#include "sd-netlink.h" + +#include "ether-addr-util.h" +#include "in-addr-util.h" +#include "ordered-set.h" +#include "socket-util.h" + +/* See struct rtvia in rtnetlink.h */ +typedef struct RouteVia { + uint16_t family; + union in_addr_union address; +} _packed_ RouteVia;
+ +typedef struct MultipathRoute { + RouteVia gateway; + uint32_t weight; + int ifindex; + char *ifname; +} MultipathRoute; + +MultipathRoute *multipath_route_free(MultipathRoute *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(MultipathRoute*, multipath_route_free); + +int multipath_route_dup(const MultipathRoute *m, MultipathRoute **ret); + +int rtnl_set_link_name(sd_netlink **rtnl, int ifindex, const char *name, char* const* alternative_names); +static inline int rtnl_append_link_alternative_names(sd_netlink **rtnl, int ifindex, char* const *alternative_names) { + return rtnl_set_link_name(rtnl, ifindex, NULL, alternative_names); +} +int rtnl_set_link_properties( + sd_netlink **rtnl, + int ifindex, + const char *alias, + const struct hw_addr_data *hw_addr, + uint32_t txqueues, + uint32_t rxqueues, + uint32_t txqueuelen, + uint32_t mtu, + uint32_t gso_max_size, + size_t gso_max_segments); +int rtnl_get_link_alternative_names(sd_netlink **rtnl, int ifindex, char ***ret); +int rtnl_set_link_alternative_names(sd_netlink **rtnl, int ifindex, char* const *alternative_names); +int rtnl_set_link_alternative_names_by_ifname(sd_netlink **rtnl, const char *ifname, char* const *alternative_names); +int rtnl_delete_link_alternative_names(sd_netlink **rtnl, int ifindex, char* const *alternative_names); +int rtnl_resolve_link_alternative_name(sd_netlink **rtnl, const char *name, char **ret); +int rtnl_resolve_ifname(sd_netlink **rtnl, const char *name); +int rtnl_resolve_interface(sd_netlink **rtnl, const char *name); +int rtnl_resolve_interface_or_warn(sd_netlink **rtnl, const char *name); +int rtnl_get_link_info( + sd_netlink **rtnl, + int ifindex, + unsigned short *ret_iftype, + unsigned *ret_flags, + char **ret_kind, + struct hw_addr_data *ret_hw_addr, + struct hw_addr_data *ret_permanent_hw_addr); + +int rtnl_log_parse_error(int r); +int rtnl_log_create_error(int r); + +#define netlink_call_async(nl, ret_slot, message, callback, destroy_callback, userdata) \ + ({ \ + int 
(*_callback_)(sd_netlink *, sd_netlink_message *, typeof(userdata)) = callback; \ + void (*_destroy_)(typeof(userdata)) = destroy_callback; \ + sd_netlink_call_async(nl, ret_slot, message, \ + (sd_netlink_message_handler_t) _callback_, \ + (sd_netlink_destroy_t) _destroy_, \ + userdata, 0, __func__); \ + }) + +#define netlink_add_match(nl, ret_slot, match, callback, destroy_callback, userdata, description) \ + ({ \ + int (*_callback_)(sd_netlink *, sd_netlink_message *, typeof(userdata)) = callback; \ + void (*_destroy_)(typeof(userdata)) = destroy_callback; \ + sd_netlink_add_match(nl, ret_slot, match, \ + (sd_netlink_message_handler_t) _callback_, \ + (sd_netlink_destroy_t) _destroy_, \ + userdata, description); \ + }) + +#define genl_add_match(nl, ret_slot, family, group, cmd, callback, destroy_callback, userdata, description) \ + ({ \ + int (*_callback_)(sd_netlink *, sd_netlink_message *, typeof(userdata)) = callback; \ + void (*_destroy_)(typeof(userdata)) = destroy_callback; \ + sd_genl_add_match(nl, ret_slot, family, group, cmd, \ + (sd_netlink_message_handler_t) _callback_, \ + (sd_netlink_destroy_t) _destroy_, \ + userdata, description); \ + }) + +int netlink_message_append_hw_addr(sd_netlink_message *m, unsigned short type, const struct hw_addr_data *data); +int netlink_message_append_in_addr_union(sd_netlink_message *m, unsigned short type, int family, const union in_addr_union *data); +int netlink_message_append_sockaddr_union(sd_netlink_message *m, unsigned short type, const union sockaddr_union *data); + +int netlink_message_read_hw_addr(sd_netlink_message *m, unsigned short type, struct hw_addr_data *data); +int netlink_message_read_in_addr_union(sd_netlink_message *m, unsigned short type, int family, union in_addr_union *data); + +void rtattr_append_attribute_internal(struct rtattr *rta, unsigned short type, const void *data, size_t data_length); +int rtattr_append_attribute(struct rtattr **rta, unsigned short type, const void *data, size_t 
data_length); + +int rtattr_read_nexthop(const struct rtnexthop *rtnh, size_t size, int family, OrderedSet **ret); + +void netlink_seal_message(sd_netlink *nl, sd_netlink_message *m); + +size_t netlink_get_reply_callback_count(sd_netlink *nl); + +/* TODO: to be exported later */ +int sd_netlink_sendv(sd_netlink *nl, sd_netlink_message **messages, size_t msgcnt, uint32_t **ret_serial); diff --git a/src/libsystemd/sd-netlink/sd-netlink.c b/src/libsystemd/sd-netlink/sd-netlink.c new file mode 100644 index 0000000..b6730b7 --- /dev/null +++ b/src/libsystemd/sd-netlink/sd-netlink.c @@ -0,0 +1,909 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "hashmap.h" +#include "io-util.h" +#include "macro.h" +#include "netlink-genl.h" +#include "netlink-internal.h" +#include "netlink-slot.h" +#include "netlink-util.h" +#include "process-util.h" +#include "socket-util.h" +#include "string-util.h" + +/* Some really high limit, to catch programming errors */ +#define REPLY_CALLBACKS_MAX UINT16_MAX + +static int netlink_new(sd_netlink **ret) { + _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL; + + assert_return(ret, -EINVAL); + + nl = new(sd_netlink, 1); + if (!nl) + return -ENOMEM; + + *nl = (sd_netlink) { + .n_ref = 1, + .fd = -EBADF, + .sockaddr.nl.nl_family = AF_NETLINK, + .original_pid = getpid_cached(), + .protocol = -1, + + /* Kernel change notification messages have sequence number 0. We want to avoid that with our + * own serials, in order not to get confused when matching up kernel replies to our earlier + * requests. + * + * Moreover, when using netlink socket activation (i.e. 
where PID 1 binds an AF_NETLINK + * socket for us and passes it to us across execve()) and we get restarted multiple times + * while the socket sticks around we might get confused by replies from earlier runs coming + * in late — which is pretty likely if we'd start our sequence numbers always from 1. Hence, + * let's start with a value based on the system clock. This should make collisions much less + * likely (though still theoretically possible). We use a 32 bit μs counter starting at boot + * for this (and explicitly exclude the zero, see above). This counter will wrap around after + * a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to + * reply to our requests. + * + * We only pick the initial start value this way. For each message we simply increase the + * sequence number by 1. This means we could enqueue 1 netlink message per μs without risking + * collisions, which should be OK. + * + * Note this means the serials will be in the range 1…UINT32_MAX here. 
+ * + * (In an ideal world we'd attach the current serial counter to the netlink socket itself + * somehow, to avoid all this, but I couldn't come up with a nice way to do this) */ + .serial = (uint32_t) (now(CLOCK_MONOTONIC) % UINT32_MAX) + 1, + }; + + *ret = TAKE_PTR(nl); + return 0; +} + +int sd_netlink_open_fd(sd_netlink **ret, int fd) { + _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL; + int r, protocol; + + assert_return(ret, -EINVAL); + assert_return(fd >= 0, -EBADF); + + r = netlink_new(&nl); + if (r < 0) + return r; + + r = getsockopt_int(fd, SOL_SOCKET, SO_PROTOCOL, &protocol); + if (r < 0) + return r; + + nl->fd = fd; + nl->protocol = protocol; + + r = setsockopt_int(fd, SOL_NETLINK, NETLINK_EXT_ACK, true); + if (r < 0) + log_debug_errno(r, "sd-netlink: Failed to enable NETLINK_EXT_ACK option, ignoring: %m"); + + r = setsockopt_int(fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, true); + if (r < 0) + log_debug_errno(r, "sd-netlink: Failed to enable NETLINK_GET_STRICT_CHK option, ignoring: %m"); + + r = socket_bind(nl); + if (r < 0) { + nl->fd = -EBADF; /* on failure, the caller remains owner of the fd, hence don't close it here */ + nl->protocol = -1; + return r; + } + + *ret = TAKE_PTR(nl); + + return 0; +} + +int sd_netlink_open(sd_netlink **ret) { + return netlink_open_family(ret, NETLINK_ROUTE); +} + +int sd_netlink_increase_rxbuf(sd_netlink *nl, size_t size) { + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + + return fd_increase_rxbuf(nl->fd, size); +} + +static sd_netlink *netlink_free(sd_netlink *nl) { + sd_netlink_slot *s; + + assert(nl); + + ordered_set_free(nl->rqueue); + hashmap_free(nl->rqueue_by_serial); + hashmap_free(nl->rqueue_partial_by_serial); + free(nl->rbuffer); + + while ((s = nl->slots)) { + assert(s->floating); + netlink_slot_disconnect(s, true); + } + hashmap_free(nl->reply_callbacks); + prioq_free(nl->reply_callbacks_prioq); + + sd_event_source_unref(nl->io_event_source); + 
sd_event_source_unref(nl->time_event_source); + sd_event_unref(nl->event); + + hashmap_free(nl->broadcast_group_refs); + + genl_clear_family(nl); + + safe_close(nl->fd); + return mfree(nl); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free); + +int sd_netlink_send( + sd_netlink *nl, + sd_netlink_message *message, + uint32_t *serial) { + + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + assert_return(message, -EINVAL); + assert_return(!message->sealed, -EPERM); + + netlink_seal_message(nl, message); + + r = socket_write_message(nl, message); + if (r < 0) + return r; + + if (serial) + *serial = message_get_serial(message); + + return 1; +} + +static int dispatch_rqueue(sd_netlink *nl, sd_netlink_message **ret) { + sd_netlink_message *m; + int r; + + assert(nl); + assert(ret); + + if (ordered_set_size(nl->rqueue) <= 0) { + /* Try to read a new message */ + r = socket_read_message(nl); + if (r == -ENOBUFS) /* FIXME: ignore buffer overruns for now */ + log_debug_errno(r, "sd-netlink: Got ENOBUFS from netlink socket, ignoring."); + else if (r < 0) + return r; + } + + /* Dispatch a queued message */ + m = ordered_set_steal_first(nl->rqueue); + if (m) + sd_netlink_message_unref(hashmap_remove_value(nl->rqueue_by_serial, UINT32_TO_PTR(message_get_serial(m)), m)); + *ret = m; + return !!m; +} + +static int process_timeout(sd_netlink *nl) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + struct reply_callback *c; + sd_netlink_slot *slot; + usec_t n; + int r; + + assert(nl); + + c = prioq_peek(nl->reply_callbacks_prioq); + if (!c) + return 0; + + n = now(CLOCK_MONOTONIC); + if (c->timeout > n) + return 0; + + r = message_new_synthetic_error(nl, -ETIMEDOUT, c->serial, &m); + if (r < 0) + return r; + + assert_se(prioq_pop(nl->reply_callbacks_prioq) == c); + hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(c->serial)); + + slot = container_of(c, sd_netlink_slot, reply_callback); + + r = 
c->callback(nl, m, slot->userdata); + if (r < 0) + log_debug_errno(r, "sd-netlink: timedout callback %s%s%sfailed: %m", + slot->description ? "'" : "", + strempty(slot->description), + slot->description ? "' " : ""); + + if (slot->floating) + netlink_slot_disconnect(slot, true); + + return 1; +} + +static int process_reply(sd_netlink *nl, sd_netlink_message *m) { + struct reply_callback *c; + sd_netlink_slot *slot; + uint32_t serial; + uint16_t type; + int r; + + assert(nl); + assert(m); + + serial = message_get_serial(m); + c = hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(serial)); + if (!c) + return 0; + + if (c->timeout != USEC_INFINITY) + prioq_remove(nl->reply_callbacks_prioq, c, &c->prioq_idx); + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) + return r; + + if (type == NLMSG_DONE) + m = NULL; + + slot = container_of(c, sd_netlink_slot, reply_callback); + + r = c->callback(nl, m, slot->userdata); + if (r < 0) + log_debug_errno(r, "sd-netlink: reply callback %s%s%sfailed: %m", + slot->description ? "'" : "", + strempty(slot->description), + slot->description ? 
"' " : ""); + + if (slot->floating) + netlink_slot_disconnect(slot, true); + + return 1; +} + +static int process_match(sd_netlink *nl, sd_netlink_message *m) { + uint16_t type; + uint8_t cmd; + int r; + + assert(nl); + assert(m); + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) + return r; + + if (m->protocol == NETLINK_GENERIC) { + r = sd_genl_message_get_command(nl, m, &cmd); + if (r < 0) + return r; + } else + cmd = 0; + + LIST_FOREACH(match_callbacks, c, nl->match_callbacks) { + sd_netlink_slot *slot; + bool found = false; + + if (c->type != type) + continue; + if (c->cmd != 0 && c->cmd != cmd) + continue; + + for (size_t i = 0; i < c->n_groups; i++) + if (c->groups[i] == m->multicast_group) { + found = true; + break; + } + + if (!found) + continue; + + slot = container_of(c, sd_netlink_slot, match_callback); + + r = c->callback(nl, m, slot->userdata); + if (r < 0) + log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m", + slot->description ? "'" : "", + strempty(slot->description), + slot->description ? 
"' " : ""); + if (r != 0) + break; + } + + return 1; +} + +static int process_running(sd_netlink *nl, sd_netlink_message **ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(nl); + + r = process_timeout(nl); + if (r != 0) + goto null_message; + + r = dispatch_rqueue(nl, &m); + if (r < 0) + return r; + if (!m) + goto null_message; + + if (sd_netlink_message_is_broadcast(m)) + r = process_match(nl, m); + else + r = process_reply(nl, m); + if (r != 0) + goto null_message; + + if (ret) { + *ret = TAKE_PTR(m); + + return 1; + } + + return 1; + +null_message: + if (r >= 0 && ret) + *ret = NULL; + + return r; +} + +int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret) { + NETLINK_DONT_DESTROY(nl); + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + assert_return(!nl->processing, -EBUSY); + + nl->processing = true; + r = process_running(nl, ret); + nl->processing = false; + + return r; +} + +static usec_t timespan_to_timestamp(usec_t usec) { + static bool default_timeout_set = false; + static usec_t default_timeout; + int r; + + if (usec == 0) { + if (!default_timeout_set) { + const char *e; + + default_timeout_set = true; + default_timeout = NETLINK_DEFAULT_TIMEOUT_USEC; + + e = secure_getenv("SYSTEMD_NETLINK_DEFAULT_TIMEOUT"); + if (e) { + r = parse_sec(e, &default_timeout); + if (r < 0) + log_debug_errno(r, "sd-netlink: Failed to parse $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable, ignoring: %m"); + } + } + + usec = default_timeout; + } + + return usec_add(now(CLOCK_MONOTONIC), usec); +} + +static int netlink_poll(sd_netlink *nl, bool need_more, usec_t timeout_usec) { + usec_t m = USEC_INFINITY; + int r, e; + + assert(nl); + + e = sd_netlink_get_events(nl); + if (e < 0) + return e; + + if (need_more) + /* Caller wants more data, and doesn't care about + * what's been read or any other timeouts. 
*/ + e |= POLLIN; + else { + usec_t until; + + /* Caller wants to process if there is something to + * process, but doesn't care otherwise */ + + r = sd_netlink_get_timeout(nl, &until); + if (r < 0) + return r; + /* time left between now and that timestamp */ + m = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + } + /* m is still USEC_INFINITY when need_more is set, so only timeout_usec limits the wait in that case */ + r = fd_wait_for_event(nl->fd, e, MIN(m, timeout_usec)); + if (r <= 0) + return r; + + return 1; +} + /* Waits until there is something to process or timeout_usec has passed. Returns 0 immediately if messages are already queued in nl->rqueue; otherwise returns the result of netlink_poll(), except that transient errors are reported as 1. */ +int sd_netlink_wait(sd_netlink *nl, uint64_t timeout_usec) { + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + + if (ordered_set_size(nl->rqueue) > 0) + return 0; + + r = netlink_poll(nl, false, timeout_usec); + if (ERRNO_IS_NEG_TRANSIENT(r)) /* Convert EINTR to "something happened" and give user a chance to run some code before calling back into us */ + return 1; + return r; +} + /* Comparison function for nl->reply_callbacks_prioq: orders reply callbacks by their timeout field. */ +static int timeout_compare(const void *a, const void *b) { + const struct reply_callback *x = a, *y = b; + + return CMP(x->timeout, y->timeout); +} + /* Returns the number of entries currently in nl->reply_callbacks. */ +size_t netlink_get_reply_callback_count(sd_netlink *nl) { + assert(nl); + + return hashmap_size(nl->reply_callbacks); +} + /* Sends m and registers callback under the serial filled in by sd_netlink_send(). usec is converted with timespan_to_timestamp(), so 0 selects the default timeout; the callback is also put into the timeout queue unless the resulting timestamp is USEC_INFINITY. Fails with -EXFULL once REPLY_CALLBACKS_MAX callbacks are registered. On success the slot is stored in *ret_slot when ret_slot is non-NULL and the return value of sd_netlink_send() is returned. */ +int sd_netlink_call_async( + sd_netlink *nl, + sd_netlink_slot **ret_slot, + sd_netlink_message *m, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, + uint64_t usec, + const char *description) { + + _cleanup_free_ sd_netlink_slot *slot = NULL; + int r, k; + + assert_return(nl, -EINVAL); + assert_return(m, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + + if (hashmap_size(nl->reply_callbacks) >= REPLY_CALLBACKS_MAX) + return -EXFULL; + + r = hashmap_ensure_allocated(&nl->reply_callbacks, &trivial_hash_ops); + if (r < 0) + return r; + /* the timeout queue is only needed when the caller did not pass UINT64_MAX */ + if (usec != UINT64_MAX) { + r = prioq_ensure_allocated(&nl->reply_callbacks_prioq, timeout_compare); + if (r < 0) + return r; + } + /* !ret_slot is presumably the "floating" flag of the new slot (a slot nobody holds a pointer to); confirm against netlink_slot_allocate() */ + r = netlink_slot_allocate(nl, !ret_slot, NETLINK_REPLY_CALLBACK, sizeof(struct reply_callback), userdata, description, &slot); + if (r < 0)
+ return r; + /* NOTE(review): from here on every error return releases the slot only through _cleanup_free_; whether that fully undoes what netlink_slot_allocate() set up cannot be seen from this function, confirm */ + slot->reply_callback.callback = callback; + slot->reply_callback.timeout = timespan_to_timestamp(usec); + /* the serial written to reply_callback.serial here is the key used for the hashmap entry below */ + k = sd_netlink_send(nl, m, &slot->reply_callback.serial); + if (k < 0) + return k; + + r = hashmap_put(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial), &slot->reply_callback); + if (r < 0) + return r; + + if (slot->reply_callback.timeout != USEC_INFINITY) { + r = prioq_put(nl->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx); + if (r < 0) { + /* undo the hashmap registration made just above */ (void) hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial)); + return r; + } + } + + /* Set this at last. Otherwise, some failures in above would call destroy_callback but some would not. */ + slot->destroy_callback = destroy_callback; + + if (ret_slot) + *ret_slot = slot; + /* success: take the slot out of the _cleanup_free_ variable so it is not freed on return */ + TAKE_PTR(slot); + + return k; +} + /* Waits for the reply with the given serial. usec is converted with timespan_to_timestamp(), so 0 selects the default timeout. Returns 1 and hands the message out through ret; 0 with *ret set to NULL when the reply is NLMSG_DONE; the negative result of sd_netlink_message_get_errno() or sd_netlink_message_get_type() if one of them fails; -ETIMEDOUT when the deadline passes first; or a negative error from socket_read_message() or netlink_poll(). */ +int sd_netlink_read( + sd_netlink *nl, + uint32_t serial, + uint64_t usec, + sd_netlink_message **ret) { + + usec_t timeout; + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + + timeout = timespan_to_timestamp(usec); + + for (;;) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + usec_t left; + + m = hashmap_remove(nl->rqueue_by_serial, UINT32_TO_PTR(serial)); + if (m) { + uint16_t type; + + /* found a match, remove from rqueue and return it */ + sd_netlink_message_unref(ordered_set_remove(nl->rqueue, m)); + + r = sd_netlink_message_get_errno(m); + if (r < 0) + return r; + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) + return r; + + if (type == NLMSG_DONE) { + if (ret) + *ret = NULL; + return 0; + } + + if (ret) + *ret = TAKE_PTR(m); + return 1; + } + /* nothing queued under this serial yet: try to read from the socket */ + r = socket_read_message(nl); + if (r < 0) + return r; + if (r > 0) + /* received message, so try to process straight away */ + continue; + /* nothing was read: work out how long we may still wait */ + if (timeout != USEC_INFINITY) { + usec_t n; + + n = now(CLOCK_MONOTONIC); + if (n >= timeout) + return -ETIMEDOUT; + + left = usec_sub_unsigned(timeout, n); +
} else + left = USEC_INFINITY; + /* need_more = true: wait for incoming data regardless of other pending timeouts */ + r = netlink_poll(nl, true, left); + if (r < 0) + return r; + if (r == 0) + return -ETIMEDOUT; + } +} + /* Synchronous request: sends message with sd_netlink_send() and then waits for the reply with the same serial through sd_netlink_read(), whose result is returned. */ +int sd_netlink_call( + sd_netlink *nl, + sd_netlink_message *message, + uint64_t usec, + sd_netlink_message **ret) { + + uint32_t serial; + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + assert_return(message, -EINVAL); + + r = sd_netlink_send(nl, message, &serial); + if (r < 0) + return r; + + return sd_netlink_read(nl, serial, usec, ret); +} + /* Poll events to wait for: POLLIN while nl->rqueue is empty, 0 once messages are queued. */ +int sd_netlink_get_events(sd_netlink *nl) { + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + + return ordered_set_size(nl->rqueue) == 0 ? POLLIN : 0; +} + /* Reports when processing is next due. Returns 1 with *timeout_usec = 0 if messages are queued; 1 with the timeout of the entry returned by prioq_peek() on nl->reply_callbacks_prioq; or 0 with *timeout_usec = UINT64_MAX when that queue yields nothing. */ +int sd_netlink_get_timeout(sd_netlink *nl, uint64_t *timeout_usec) { + struct reply_callback *c; + + assert_return(nl, -EINVAL); + assert_return(timeout_usec, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + + if (ordered_set_size(nl->rqueue) > 0) { + *timeout_usec = 0; + return 1; + } + + c = prioq_peek(nl->reply_callbacks_prioq); + if (!c) { + *timeout_usec = UINT64_MAX; + return 0; + } + + *timeout_usec = c->timeout; + return 1; +} + /* I/O event source handler: runs one sd_netlink_process() step, discarding any message, and returns 1 unless that step fails. */ +static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + sd_netlink *nl = ASSERT_PTR(userdata); + int r; + + r = sd_netlink_process(nl, NULL); + if (r < 0) + return r; + + return 1; +} + /* Timer event source handler: same as io_callback(), one sd_netlink_process() step. */ +static int time_callback(sd_event_source *s, uint64_t usec, void *userdata) { + sd_netlink *nl = ASSERT_PTR(userdata); + int r; + + r = sd_netlink_process(nl, NULL); + if (r < 0) + return r; + + return 1; +} + /* Prepare handler of the I/O event source: updates the I/O source's event mask from sd_netlink_get_events() and re-arms the timer source from sd_netlink_get_timeout(), as a one-shot when a timeout is reported and switched off otherwise. Returns 1 on success. */ +static int prepare_callback(sd_event_source *s, void *userdata) { + sd_netlink *nl = ASSERT_PTR(userdata); + int r, enabled; + usec_t until; + + assert(s); + + r = sd_netlink_get_events(nl); + if (r < 0) + return r; + + r = sd_event_source_set_io_events(nl->io_event_source, r); + if (r < 0) + return r; + /* enabled > 0 means sd_netlink_get_timeout() stored a timestamp in until */ + enabled = sd_netlink_get_timeout(nl, &until); + if (enabled < 0) + return
enabled; + if (enabled > 0) { + r = sd_event_source_set_time(nl->time_event_source, until); + if (r < 0) + return r; + } + + r = sd_event_source_set_enabled(nl->time_event_source, + enabled > 0 ? SD_EVENT_ONESHOT : SD_EVENT_OFF); + if (r < 0) + return r; + + return 1; +} + /* Attaches nl to an event loop: event if given, otherwise the one from sd_event_default(). Adds an I/O source on nl->fd (with prepare_callback installed) and a CLOCK_MONOTONIC timer source, both set to the given priority. Returns -EBUSY if an event loop is already attached. If any step after obtaining the event loop fails, sd_netlink_detach_event() is called and the error is returned. */ +int sd_netlink_attach_event(sd_netlink *nl, sd_event *event, int64_t priority) { + int r; + + assert_return(nl, -EINVAL); + assert_return(!nl->event, -EBUSY); + + assert(!nl->io_event_source); + assert(!nl->time_event_source); + + if (event) + nl->event = sd_event_ref(event); + else { + r = sd_event_default(&nl->event); + if (r < 0) + return r; + } + /* the event mask starts as 0; prepare_callback() sets it through sd_event_source_set_io_events() */ + r = sd_event_add_io(nl->event, &nl->io_event_source, nl->fd, 0, io_callback, nl); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(nl->io_event_source, priority); + if (r < 0) + goto fail; + + r = sd_event_source_set_description(nl->io_event_source, "netlink-receive-message"); + if (r < 0) + goto fail; + + r = sd_event_source_set_prepare(nl->io_event_source, prepare_callback); + if (r < 0) + goto fail; + /* the time passed here is 0; prepare_callback() sets the real time and enables the source */ + r = sd_event_add_time(nl->event, &nl->time_event_source, CLOCK_MONOTONIC, 0, 0, time_callback, nl); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(nl->time_event_source, priority); + if (r < 0) + goto fail; + + r = sd_event_source_set_description(nl->time_event_source, "netlink-timer"); + if (r < 0) + goto fail; + + return 0; + +fail: + sd_netlink_detach_event(nl); + return r; +} + /* Detaches nl from its event loop: unrefs both event sources and the event loop and stores the unref results back into the fields. Returns -ENXIO if no event loop is attached. */ +int sd_netlink_detach_event(sd_netlink *nl) { + assert_return(nl, -EINVAL); + assert_return(nl->event, -ENXIO); + + nl->io_event_source = sd_event_source_unref(nl->io_event_source); + + nl->time_event_source = sd_event_source_unref(nl->time_event_source); + + nl->event = sd_event_unref(nl->event); + + return 0; +} + /* Returns nl->event, the attached event loop pointer. */ +sd_event* sd_netlink_get_event(sd_netlink *nl) { + assert_return(nl, NULL); + + return nl->event; +} + /* Registers callback for messages of the given type and cmd on the given multicast groups: takes a reference on every group with socket_broadcast_group_ref(), allocates a match slot holding a copy of the groups array, and prepends it to nl->match_callbacks. groups must be non-NULL and n_groups greater than 0. Returns 0 on success; the slot is stored in *ret_slot when ret_slot is non-NULL. */ +int netlink_add_match_internal( + sd_netlink *nl, + sd_netlink_slot **ret_slot, + const uint32_t *groups, + size_t
n_groups, + uint16_t type, + uint8_t cmd, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, + const char *description) { + + _cleanup_free_ sd_netlink_slot *slot = NULL; + int r; + + assert(groups); + assert(n_groups > 0); + /* NOTE(review): the error returns below do not release group references already taken by this loop; confirm whether that is intended or handled elsewhere */ + for (size_t i = 0; i < n_groups; i++) { + r = socket_broadcast_group_ref(nl, groups[i]); + if (r < 0) + return r; + } + /* !ret_slot is presumably the "floating" flag of the new slot; confirm against netlink_slot_allocate() */ + r = netlink_slot_allocate(nl, !ret_slot, NETLINK_MATCH_CALLBACK, sizeof(struct match_callback), + userdata, description, &slot); + if (r < 0) + return r; + /* the slot gets its own array rather than the caller's pointer (newdup presumably duplicates n_groups elements; confirm) */ + slot->match_callback.groups = newdup(uint32_t, groups, n_groups); + if (!slot->match_callback.groups) + return -ENOMEM; + + slot->match_callback.n_groups = n_groups; + slot->match_callback.callback = callback; + slot->match_callback.type = type; + slot->match_callback.cmd = cmd; + + LIST_PREPEND(match_callbacks, nl->match_callbacks, &slot->match_callback); + + /* Set this at last. Otherwise, some failures in above call the destroy callback but some do not.
*/ + slot->destroy_callback = destroy_callback; + + if (ret_slot) + *ret_slot = slot; + /* success: take the slot out of the _cleanup_free_ variable so it is not freed on return */ + TAKE_PTR(slot); + return 0; +} + /* Registers callback for rtnetlink messages of the given RTM_* type. The type selects which multicast group set is used (link, address, neighbor, route, rule, nexthop or tc); any other type returns -EOPNOTSUPP. The registration itself is done by netlink_add_match_internal() with cmd = 0. */ +int sd_netlink_add_match( + sd_netlink *rtnl, + sd_netlink_slot **ret_slot, + uint16_t type, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, + const char *description) { + + static const uint32_t + address_groups[] = { RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR, }, + link_groups[] = { RTNLGRP_LINK, }, + neighbor_groups[] = { RTNLGRP_NEIGH, }, + nexthop_groups[] = { RTNLGRP_NEXTHOP, }, + route_groups[] = { RTNLGRP_IPV4_ROUTE, RTNLGRP_IPV6_ROUTE, }, + rule_groups[] = { RTNLGRP_IPV4_RULE, RTNLGRP_IPV6_RULE, }, + tc_groups[] = { RTNLGRP_TC }; + const uint32_t *groups; + size_t n_groups; + + assert_return(rtnl, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(!netlink_pid_changed(rtnl), -ECHILD); + /* each NEW/DEL pair shares one group set */ + switch (type) { + case RTM_NEWLINK: + case RTM_DELLINK: + groups = link_groups; + n_groups = ELEMENTSOF(link_groups); + break; + case RTM_NEWADDR: + case RTM_DELADDR: + groups = address_groups; + n_groups = ELEMENTSOF(address_groups); + break; + case RTM_NEWNEIGH: + case RTM_DELNEIGH: + groups = neighbor_groups; + n_groups = ELEMENTSOF(neighbor_groups); + break; + case RTM_NEWROUTE: + case RTM_DELROUTE: + groups = route_groups; + n_groups = ELEMENTSOF(route_groups); + break; + case RTM_NEWRULE: + case RTM_DELRULE: + groups = rule_groups; + n_groups = ELEMENTSOF(rule_groups); + break; + case RTM_NEWNEXTHOP: + case RTM_DELNEXTHOP: + groups = nexthop_groups; + n_groups = ELEMENTSOF(nexthop_groups); + break; + case RTM_NEWQDISC: + case RTM_DELQDISC: + case RTM_NEWTCLASS: + case RTM_DELTCLASS: + groups = tc_groups; + n_groups = ELEMENTSOF(tc_groups); + break; + default: + return -EOPNOTSUPP; + } + + return netlink_add_match_internal(rtnl, ret_slot, groups, n_groups, type, 0, callback, + destroy_callback, userdata, description); +} + /* Attaches a socket filter of len entries to nl->fd via setsockopt(SO_ATTACH_FILTER); len == 0 detaches the current filter with SO_DETACH_FILTER instead. filter may only be NULL when len is 0. Returns 0 on success or -errno. NOTE(review): len is a size_t but is stored in sock_fprog.len, which is presumably a narrower integer in the kernel header; an oversized len would then be truncated silently, confirm that callers bound it. */ +int sd_netlink_attach_filter(sd_netlink *nl, size_t len, const
struct sock_filter *filter) { /* Installs 'filter' (len entries) on the netlink socket via setsockopt(), or removes the current filter when len is 0. Returns 0 on success, -EINVAL for a NULL nl or a NULL filter with non-zero len, -errno if setsockopt() fails. */ + assert_return(nl, -EINVAL); + assert_return(len == 0 || filter, -EINVAL); + + if (setsockopt(nl->fd, SOL_SOCKET, + len == 0 ? SO_DETACH_FILTER : SO_ATTACH_FILTER, + &(struct sock_fprog) { + .len = len, /* NOTE(review): len is size_t here; presumably sock_fprog.len is narrower, so very large values would be truncated — confirm callers bound it */ + .filter = (struct sock_filter*) filter, + }, sizeof(struct sock_fprog)) < 0) + return -errno; + + return 0; +} diff --git a/src/libsystemd/sd-netlink/test-netlink.c b/src/libsystemd/sd-netlink/test-netlink.c new file mode 100644 index 0000000..13aedc4 --- /dev/null +++ b/src/libsystemd/sd-netlink/test-netlink.c @@ -0,0 +1,686 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <net/if.h> +#include <netinet/ether.h> +#include <netinet/in.h> +#include <linux/fou.h> +#include <linux/genetlink.h> +#include <linux/if_macsec.h> +#include <linux/l2tp.h> +#include <linux/nl80211.h> +#include <unistd.h> + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "macro.h" +#include "netlink-genl.h" +#include "netlink-internal.h" +#include "netlink-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +TEST(message_newlink_bridge) { /* Appends IFLA_BRPORT_COST inside an IFLA_PROTINFO container of an AF_BRIDGE RTM_NEWLINK message and checks that the value reads back after a rewind. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + uint32_t cost; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &message, RTM_NEWLINK, 1) >= 0); + assert_se(sd_rtnl_message_link_set_family(message, AF_BRIDGE) >= 0); + assert_se(sd_netlink_message_open_container(message, IFLA_PROTINFO) >= 0); + assert_se(sd_netlink_message_append_u32(message, IFLA_BRPORT_COST, 10) >= 0); + assert_se(sd_netlink_message_close_container(message) >= 0); + + assert_se(sd_netlink_message_rewind(message, rtnl) >= 0); /* the message is never sent; it is only re-read locally */ + + assert_se(sd_netlink_message_enter_container(message, IFLA_PROTINFO) >= 0); + assert_se(sd_netlink_message_read_u32(message, IFLA_BRPORT_COST, &cost) >= 0); + assert_se(cost == 10); + assert_se(sd_netlink_message_exit_container(message) >= 0); +} + +TEST(message_getlink) { /* Queries the "lo" link with RTM_GETLINK and reads one attribute of each basic type from the reply. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = 
NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL, *reply = NULL; + int ifindex; + uint8_t u8_data; + uint16_t u16_data; + uint32_t u32_data; + const char *str_data; + struct ether_addr eth_data; + + assert_se(sd_netlink_open(&rtnl) >= 0); + ifindex = (int) if_nametoindex("lo"); + + /* we'd really like to test NEWLINK, but let's not mess with the running kernel */ + assert_se(sd_rtnl_message_new_link(rtnl, &message, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_netlink_call(rtnl, message, 0, &reply) == 1); + + /* u8 attributes */ + assert_se(sd_netlink_message_read_u8(reply, IFLA_CARRIER, &u8_data) >= 0); + assert_se(sd_netlink_message_read_u8(reply, IFLA_OPERSTATE, &u8_data) >= 0); + assert_se(sd_netlink_message_read_u8(reply, IFLA_LINKMODE, &u8_data) >= 0); + + /* u16: the message type itself */ + assert_se(sd_netlink_message_get_type(reply, &u16_data) >= 0); + assert_se(u16_data == RTM_NEWLINK); /* the reply to the RTM_GETLINK request is expected to be typed RTM_NEWLINK */ + + /* u32 attributes */ + assert_se(sd_netlink_message_read_u32(reply, IFLA_MTU, &u32_data) >= 0); + assert_se(sd_netlink_message_read_u32(reply, IFLA_GROUP, &u32_data) >= 0); + assert_se(sd_netlink_message_read_u32(reply, IFLA_TXQLEN, &u32_data) >= 0); + assert_se(sd_netlink_message_read_u32(reply, IFLA_NUM_TX_QUEUES, &u32_data) >= 0); + + /* string attribute */ + assert_se(sd_netlink_message_read_string(reply, IFLA_IFNAME, &str_data) >= 0); + + /* ether_addr attribute */ + assert_se(sd_netlink_message_read_ether_addr(reply, IFLA_ADDRESS, &eth_data) >= 0); +} + +TEST(message_address) { /* Requests a dump of the AF_INET addresses of "lo" and reads the local address, address, label and cache-info attributes from the reply. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL, *reply = NULL; + int ifindex; + struct in_addr in_data; + struct ifa_cacheinfo cache; + const char *label; + + assert_se(sd_netlink_open(&rtnl) >= 0); + ifindex = (int) if_nametoindex("lo"); + + assert_se(sd_rtnl_message_new_addr(rtnl, &message, RTM_GETADDR, ifindex, AF_INET) >= 0); + assert_se(sd_netlink_message_set_request_dump(message, true) >= 0); + assert_se(sd_netlink_call(rtnl, message, 0, 
&reply) == 1); + + assert_se(sd_netlink_message_read_in_addr(reply, IFA_LOCAL, &in_data) >= 0); + assert_se(sd_netlink_message_read_in_addr(reply, IFA_ADDRESS, &in_data) >= 0); + assert_se(sd_netlink_message_read_string(reply, IFA_LABEL, &label) >= 0); + assert_se(sd_netlink_message_read_cache_info(reply, IFA_CACHEINFO, &cache) == 0); +} + +TEST(message_route) { /* Builds an RTM_NEWROUTE message locally (it is never sent), rewinds it, and checks that the gateway and output-interface attributes read back unchanged. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + struct in_addr addr, addr_data; + uint32_t index = 2, u32_data; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_route(rtnl, &req, RTM_NEWROUTE, AF_INET, RTPROT_STATIC) >= 0); + + addr.s_addr = htobe32(INADDR_LOOPBACK); + + assert_se(sd_netlink_message_append_in_addr(req, RTA_GATEWAY, &addr) >= 0); + assert_se(sd_netlink_message_append_u32(req, RTA_OIF, index) >= 0); + + assert_se(sd_netlink_message_rewind(req, rtnl) >= 0); + + assert_se(sd_netlink_message_read_in_addr(req, RTA_GATEWAY, &addr_data) >= 0); + assert_se(addr_data.s_addr == addr.s_addr); + + assert_se(sd_netlink_message_read_u32(req, RTA_OIF, &u32_data) >= 0); + assert_se(u32_data == index); + + assert_se((req = sd_netlink_message_unref(req)) == NULL); +} + +static int link_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { /* Reply callback shared by several tests: requires userdata to be the string "foo" and the message to describe the "lo" interface; always returns 1. */ + const char *data; + + assert_se(rtnl); + assert_se(m); + + assert_se(streq_ptr(userdata, "foo")); + + assert_se(sd_netlink_message_read_string(m, IFLA_IFNAME, &data) >= 0); + assert_se(streq(data, "lo")); + + log_info("%s: got link info about %s", __func__, data); + return 1; +} + +TEST(netlink_event_loop) { /* Attaches the netlink connection to the default sd-event loop, queues an asynchronous RTM_GETLINK for "lo" and runs one loop iteration. */ + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + _cleanup_free_ char *userdata = NULL; + int ifindex; + + assert_se(sd_netlink_open(&rtnl) >= 0); + ifindex = (int) if_nametoindex("lo"); + + assert_se(userdata = 
strdup("foo")); + + assert_se(sd_event_default(&event) >= 0); + assert_se(sd_netlink_attach_event(rtnl, event, 0) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_netlink_call_async(rtnl, NULL, m, link_handler, NULL, userdata, 0, NULL) >= 0); + + assert_se(sd_event_run(event, 0) >= 0); + + assert_se(sd_netlink_detach_event(rtnl) >= 0); + assert_se((rtnl = sd_netlink_unref(rtnl)) == NULL); +} + +static void test_async_destroy(void *userdata) { /* No-op destroy callback; TEST(netlink_call_async) only checks that the slot reports this function pointer back. */ +} + +TEST(netlink_call_async) { /* Issues an asynchronous RTM_GETLINK with a slot and exercises the slot accessors: netlink object, userdata, destroy callback, floating flag and description. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL; + _cleanup_(sd_netlink_slot_unrefp) sd_netlink_slot *slot = NULL; + _cleanup_free_ char *userdata = NULL; + sd_netlink_destroy_t destroy_callback; + const char *description; + int ifindex; + + assert_se(sd_netlink_open(&rtnl) >= 0); + ifindex = (int) if_nametoindex("lo"); + + assert_se(userdata = strdup("foo")); /* link_handler() asserts that userdata equals "foo" */ + + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_netlink_call_async(rtnl, &slot, m, link_handler, test_async_destroy, userdata, 0, "hogehoge") >= 0); + + assert_se(sd_netlink_slot_get_netlink(slot) == rtnl); + + assert_se(sd_netlink_slot_get_userdata(slot) == userdata); + assert_se(sd_netlink_slot_set_userdata(slot, NULL) == userdata); /* the setter is expected to return the previous value */ + assert_se(sd_netlink_slot_get_userdata(slot) == NULL); + assert_se(sd_netlink_slot_set_userdata(slot, userdata) == NULL); + assert_se(sd_netlink_slot_get_userdata(slot) == userdata); + + assert_se(sd_netlink_slot_get_destroy_callback(slot, &destroy_callback) == 1); /* expected: 1 while a callback is set, 0 once it has been cleared */ + assert_se(destroy_callback == test_async_destroy); + assert_se(sd_netlink_slot_set_destroy_callback(slot, NULL) >= 0); + assert_se(sd_netlink_slot_get_destroy_callback(slot, &destroy_callback) == 0); + assert_se(destroy_callback == NULL); + assert_se(sd_netlink_slot_set_destroy_callback(slot, test_async_destroy) >= 0); + 
assert_se(sd_netlink_slot_get_destroy_callback(slot, &destroy_callback) == 1); + assert_se(destroy_callback == test_async_destroy); + + assert_se(sd_netlink_slot_get_floating(slot) == 0); + assert_se(sd_netlink_slot_set_floating(slot, 1) == 1); + assert_se(sd_netlink_slot_get_floating(slot) == 1); + + assert_se(sd_netlink_slot_get_description(slot, &description) == 1); /* expected: 1 while a description is set, 0 once it has been cleared */ + assert_se(streq(description, "hogehoge")); + assert_se(sd_netlink_slot_set_description(slot, NULL) >= 0); + assert_se(sd_netlink_slot_get_description(slot, &description) == 0); + assert_se(description == NULL); + + assert_se(sd_netlink_wait(rtnl, 0) >= 0); + assert_se(sd_netlink_process(rtnl, &reply) >= 0); + + assert_se((rtnl = sd_netlink_unref(rtnl)) == NULL); +} + +struct test_async_object { /* Reference-counted userdata used by TEST(async_destroy_callback). */ + unsigned n_ref; /* reference count, asserted directly by the test */ + char *ifname; /* heap-allocated interface name, released in test_async_object_free() */ +}; + +static struct test_async_object *test_async_object_free(struct test_async_object *t) { /* Frees the object together with its ifname string; returns the result of mfree(). */ + assert_se(t); + + free(t->ifname); + return mfree(t); +} + +DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(struct test_async_object, test_async_object, test_async_object_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct test_async_object *, test_async_object_unref); + +static int link_handler2(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { /* Reply callback taking a struct test_async_object as userdata: logs its ifname and requires the message to describe "lo"; always returns 1. */ + struct test_async_object *t = userdata; + const char *data; + + assert_se(rtnl); + assert_se(m); + assert_se(userdata); + + log_info("%s: got link info about %s", __func__, t->ifname); + + assert_se(sd_netlink_message_read_string(m, IFLA_IFNAME, &data) >= 0); + assert_se(streq(data, "lo")); + + return 1; +} + +static void test_async_object_destroy(void *userdata) { /* Destroy callback: logs the current reference count and drops one reference on the object. */ + struct test_async_object *t = userdata; + + assert_se(userdata); + + log_info("%s: n_ref=%u", __func__, t->n_ref); + test_async_object_unref(t); +} + +TEST(async_destroy_callback) { /* Checks that the destroy callback runs in three situations: after the reply is processed, when the slot is released, and when the netlink object is released. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL; + _cleanup_(test_async_object_unrefp) struct test_async_object *t 
= NULL; + _cleanup_(sd_netlink_slot_unrefp) sd_netlink_slot *slot = NULL; + int ifindex; + + assert_se(sd_netlink_open(&rtnl) >= 0); + ifindex = (int) if_nametoindex("lo"); + + assert_se(t = new(struct test_async_object, 1)); + *t = (struct test_async_object) { + .n_ref = 1, + }; + assert_se(t->ifname = strdup("lo")); + + /* Case 1: the destroy callback is called after the reply message has been processed. */ + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_netlink_call_async(rtnl, NULL, m, link_handler2, test_async_object_destroy, t, 0, NULL) >= 0); + + assert_se(t->n_ref == 1); + assert_se(test_async_object_ref(t)); /* extra reference that test_async_object_destroy() is expected to drop */ + assert_se(t->n_ref == 2); + + assert_se(sd_netlink_wait(rtnl, 0) >= 0); + assert_se(sd_netlink_process(rtnl, &reply) == 1); + assert_se(t->n_ref == 1); /* back to 1: the destroy callback has run */ + + assert_se(!sd_netlink_message_unref(m)); /* m is overwritten by the next sd_rtnl_message_new_link() call */ + + /* Case 2: the destroy callback is called when the asynchronous call is cancelled, that is, when the slot is freed. */ + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_netlink_call_async(rtnl, &slot, m, link_handler2, test_async_object_destroy, t, 0, NULL) >= 0); + + assert_se(t->n_ref == 1); + assert_se(test_async_object_ref(t)); + assert_se(t->n_ref == 2); + + assert_se(!(slot = sd_netlink_slot_unref(slot))); + assert_se(t->n_ref == 1); + + assert_se(!sd_netlink_message_unref(m)); + + /* Case 3: the destroy callback is also called by sd_netlink_unref(). */ + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_netlink_call_async(rtnl, NULL, m, link_handler2, test_async_object_destroy, t, 0, NULL) >= 0); + + assert_se(t->n_ref == 1); + assert_se(test_async_object_ref(t)); + assert_se(t->n_ref == 2); + + assert_se((rtnl = sd_netlink_unref(rtnl)) == NULL); + assert_se(t->n_ref == 1); +} + +static int pipe_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { /* Reply callback for TEST(pipe): decrements the int counter passed as userdata and requires the reply to carry no error; always returns 1. */ + int *counter = userdata; + int r; + + (*counter)--; + + r = sd_netlink_message_get_errno(m); + + log_info_errno(r, "%d left in pipe. 
got reply: %m", *counter); + + assert_se(r >= 0); + + return 1; +} + +TEST(pipe) { /* Queues two asynchronous RTM_GETLINK requests back to back and processes replies until pipe_handler() has counted both of them. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m1 = NULL, *m2 = NULL; + int ifindex, counter = 0; + + assert_se(sd_netlink_open(&rtnl) >= 0); + ifindex = (int) if_nametoindex("lo"); + + assert_se(sd_rtnl_message_new_link(rtnl, &m1, RTM_GETLINK, ifindex) >= 0); + assert_se(sd_rtnl_message_new_link(rtnl, &m2, RTM_GETLINK, ifindex) >= 0); + + counter++; /* one outstanding reply per queued request */ + assert_se(sd_netlink_call_async(rtnl, NULL, m1, pipe_handler, NULL, &counter, 0, NULL) >= 0); + + counter++; + assert_se(sd_netlink_call_async(rtnl, NULL, m2, pipe_handler, NULL, &counter, 0, NULL) >= 0); + + while (counter > 0) { /* pipe_handler() decrements the counter once per reply */ + assert_se(sd_netlink_wait(rtnl, 0) >= 0); + assert_se(sd_netlink_process(rtnl, NULL) >= 0); + } + + assert_se((rtnl = sd_netlink_unref(rtnl)) == NULL); +} + +TEST(message_container) { /* Builds nested IFLA_LINKINFO / IFLA_INFO_DATA ("vlan") containers in an RTM_NEWLINK message and checks that entering and leaving them on read-back works. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + uint16_t u16_data; + uint32_t u32_data; + const char *string_data; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0) >= 0); + + assert_se(sd_netlink_message_open_container(m, IFLA_LINKINFO) >= 0); + assert_se(sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "vlan") >= 0); + assert_se(sd_netlink_message_append_u16(m, IFLA_VLAN_ID, 100) >= 0); + assert_se(sd_netlink_message_close_container(m) >= 0); + assert_se(sd_netlink_message_close_container(m) >= 0); + + assert_se(sd_netlink_message_rewind(m, rtnl) >= 0); + + assert_se(sd_netlink_message_enter_container(m, IFLA_LINKINFO) >= 0); + assert_se(sd_netlink_message_read_string(m, IFLA_INFO_KIND, &string_data) >= 0); /* the kind string given to open_container_union() is readable as IFLA_INFO_KIND */ + assert_se(streq("vlan", string_data)); + + assert_se(sd_netlink_message_enter_container(m, IFLA_INFO_DATA) >= 0); + assert_se(sd_netlink_message_read_u16(m, IFLA_VLAN_ID, &u16_data) >= 0); + 
assert_se(sd_netlink_message_exit_container(m) >= 0); + + assert_se(sd_netlink_message_read_string(m, IFLA_INFO_KIND, &string_data) >= 0); /* the outer container must still be readable after leaving the inner one */ + assert_se(streq("vlan", string_data)); + assert_se(sd_netlink_message_exit_container(m) >= 0); + + assert_se(sd_netlink_message_read_u32(m, IFLA_LINKINFO, &u32_data) < 0); /* reading the container attribute as a u32 must fail */ +} + +TEST(sd_netlink_add_match) { /* Registers three RTM_NEWLINK matches (two with slots, one without a slot) and releases them again. */ + _cleanup_(sd_netlink_slot_unrefp) sd_netlink_slot *s1 = NULL, *s2 = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_netlink_add_match(rtnl, &s1, RTM_NEWLINK, link_handler, NULL, NULL, NULL) >= 0); + assert_se(sd_netlink_add_match(rtnl, &s2, RTM_NEWLINK, link_handler, NULL, NULL, NULL) >= 0); + assert_se(sd_netlink_add_match(rtnl, NULL, RTM_NEWLINK, link_handler, NULL, NULL, NULL) >= 0); + + assert_se(!(s1 = sd_netlink_slot_unref(s1))); + assert_se(!(s2 = sd_netlink_slot_unref(s2))); + + assert_se((rtnl = sd_netlink_unref(rtnl)) == NULL); +} + +TEST(dump_addresses) { /* Dumps all addresses (AF_UNSPEC, ifindex 0) and validates type, ifindex and family of every message in the reply chain. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_addr(rtnl, &req, RTM_GETADDR, 0, AF_UNSPEC) >= 0); + assert_se(sd_netlink_message_set_request_dump(req, true) >= 0); + assert_se(sd_netlink_call(rtnl, req, 0, &reply) >= 0); + + for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) { /* walk the chained reply messages */ + uint16_t type; + unsigned char scope, flags; + int family, ifindex; + + assert_se(sd_netlink_message_get_type(m, &type) >= 0); + assert_se(type == RTM_NEWADDR); + + assert_se(sd_rtnl_message_addr_get_ifindex(m, &ifindex) >= 0); + assert_se(sd_rtnl_message_addr_get_family(m, &family) >= 0); + assert_se(sd_rtnl_message_addr_get_scope(m, &scope) >= 0); + assert_se(sd_rtnl_message_addr_get_flags(m, &flags) >= 0); + + assert_se(ifindex > 0); + assert_se(IN_SET(family, AF_INET, AF_INET6)); + + log_info("got IPv%i address on 
ifindex %i", family == AF_INET ? 4 : 6, ifindex); + } +} + +TEST(sd_netlink_message_get_errno) { /* Creates a synthetic error message carrying -ETIMEDOUT and checks that sd_netlink_message_get_errno() returns the same code. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(message_new_synthetic_error(rtnl, -ETIMEDOUT, 1, &m) >= 0); + assert_se(sd_netlink_message_get_errno(m) == -ETIMEDOUT); +} + +TEST(message_array) { /* Writes ten array elements (id 1000..1009, name "hoge<id>") into CTRL_ATTR_MCAST_GROUPS of a generic netlink message and reads them back in order. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *genl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + + assert_se(sd_genl_socket_open(&genl) >= 0); + assert_se(sd_genl_message_new(genl, CTRL_GENL_NAME, CTRL_CMD_GETFAMILY, &m) >= 0); + + assert_se(sd_netlink_message_open_container(m, CTRL_ATTR_MCAST_GROUPS) >= 0); + for (unsigned i = 0; i < 10; i++) { + char name[STRLEN("hoge") + DECIMAL_STR_MAX(uint32_t)]; + uint32_t id = i + 1000; + + xsprintf(name, "hoge%" PRIu32, id); + assert_se(sd_netlink_message_open_array(m, i + 1) >= 0); /* array elements are numbered from 1 */ + assert_se(sd_netlink_message_append_u32(m, CTRL_ATTR_MCAST_GRP_ID, id) >= 0); + assert_se(sd_netlink_message_append_string(m, CTRL_ATTR_MCAST_GRP_NAME, name) >= 0); + assert_se(sd_netlink_message_close_container(m) >= 0); + } + assert_se(sd_netlink_message_close_container(m) >= 0); + + message_seal(m); + assert_se(sd_netlink_message_rewind(m, genl) >= 0); + + assert_se(sd_netlink_message_enter_container(m, CTRL_ATTR_MCAST_GROUPS) >= 0); + for (unsigned i = 0; i < 10; i++) { + char expected[STRLEN("hoge") + DECIMAL_STR_MAX(uint32_t)]; + const char *name; + uint32_t id; + + assert_se(sd_netlink_message_enter_array(m, i + 1) >= 0); + assert_se(sd_netlink_message_read_u32(m, CTRL_ATTR_MCAST_GRP_ID, &id) >= 0); + assert_se(sd_netlink_message_read_string(m, CTRL_ATTR_MCAST_GRP_NAME, &name) >= 0); + assert_se(sd_netlink_message_exit_container(m) >= 0); + + assert_se(id == i + 1000); + xsprintf(expected, "hoge%" PRIu32, id); + assert_se(streq(name, expected)); + } + 
assert_se(sd_netlink_message_exit_container(m) >= 0); +} + +TEST(message_strv) { /* Appends ten alternative interface names as a string vector inside IFLA_PROP_LIST and checks that they read back both as a vector and as a single string. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + _cleanup_strv_free_ char **names_in = NULL, **names_out = NULL; /* cleanup-attributed pointers are always initialized so the cleanup handler never sees garbage */ + const char *p; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINKPROP, 1) >= 0); + + for (unsigned i = 0; i < 10; i++) { /* builds "hoge1000" .. "hoge1009" */ + char name[STRLEN("hoge") + DECIMAL_STR_MAX(uint32_t)]; + + xsprintf(name, "hoge%" PRIu32, i + 1000); + assert_se(strv_extend(&names_in, name) >= 0); + } + + assert_se(sd_netlink_message_open_container(m, IFLA_PROP_LIST) >= 0); + assert_se(sd_netlink_message_append_strv(m, IFLA_ALT_IFNAME, (const char**) names_in) >= 0); + assert_se(sd_netlink_message_close_container(m) >= 0); + + message_seal(m); + assert_se(sd_netlink_message_rewind(m, rtnl) >= 0); + + assert_se(sd_netlink_message_read_strv(m, IFLA_PROP_LIST, IFLA_ALT_IFNAME, &names_out) >= 0); + assert_se(strv_equal(names_in, names_out)); + + assert_se(sd_netlink_message_enter_container(m, IFLA_PROP_LIST) >= 0); + assert_se(sd_netlink_message_read_string(m, IFLA_ALT_IFNAME, &p) >= 0); + assert_se(streq(p, "hoge1009")); /* a plain string read of the repeated attribute is expected to yield the last entry appended */ + assert_se(sd_netlink_message_exit_container(m) >= 0); +} + +static int genl_ctrl_match_callback(sd_netlink *genl, sd_netlink_message *m, void *userdata) { /* Match callback for nlctrl notifications in TEST(genl): verifies the family name, then logs family or multicast-group additions and removals; always returns 0. */ + const char *name; + uint16_t id; + uint8_t cmd; + + assert_se(genl); + assert_se(m); + + assert_se(sd_genl_message_get_family_name(genl, m, &name) >= 0); + assert_se(streq(name, CTRL_GENL_NAME)); + + assert_se(sd_genl_message_get_command(genl, m, &cmd) >= 0); + + switch (cmd) { + case CTRL_CMD_NEWFAMILY: + case CTRL_CMD_DELFAMILY: + assert_se(sd_netlink_message_read_string(m, CTRL_ATTR_FAMILY_NAME, &name) >= 0); + assert_se(sd_netlink_message_read_u16(m, CTRL_ATTR_FAMILY_ID, &id) >= 0); + log_debug("%s: %s (id=%"PRIu16") family is %s.", + __func__, name, id, cmd == CTRL_CMD_NEWFAMILY ? 
"added" : "removed"); + break; + case CTRL_CMD_NEWMCAST_GRP: + case CTRL_CMD_DELMCAST_GRP: + assert_se(sd_netlink_message_read_string(m, CTRL_ATTR_FAMILY_NAME, &name) >= 0); + assert_se(sd_netlink_message_read_u16(m, CTRL_ATTR_FAMILY_ID, &id) >= 0); + log_debug("%s: multicast group for %s (id=%"PRIu16") family is %s.", + __func__, name, id, cmd == CTRL_CMD_NEWMCAST_GRP ? "added" : "removed"); + break; + default: /* any other command is only logged */ + log_debug("%s: received nlctrl message with unknown command '%"PRIu8"'.", __func__, cmd); + } + + return 0; +} + +TEST(genl) { /* Opens a generic netlink socket on the default event loop, checks family-name and command lookup, registers a "notify" match on nlctrl, probes several optional families and then runs the loop until a run returns 0. */ + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *genl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + const char *name; + uint8_t cmd; + int r; + + assert_se(sd_genl_socket_open(&genl) >= 0); + assert_se(sd_event_default(&event) >= 0); + assert_se(sd_netlink_attach_event(genl, event, 0) >= 0); + + assert_se(sd_genl_message_new(genl, CTRL_GENL_NAME, CTRL_CMD_GETFAMILY, &m) >= 0); + assert_se(sd_genl_message_get_family_name(genl, m, &name) >= 0); + assert_se(streq(name, CTRL_GENL_NAME)); + assert_se(sd_genl_message_get_command(genl, m, &cmd) >= 0); + assert_se(cmd == CTRL_CMD_GETFAMILY); + + assert_se(sd_genl_add_match(genl, NULL, CTRL_GENL_NAME, "notify", 0, genl_ctrl_match_callback, NULL, NULL, "genl-ctrl-notify") >= 0); + + m = sd_netlink_message_unref(m); + assert_se(sd_genl_message_new(genl, "should-not-exist", CTRL_CMD_GETFAMILY, &m) < 0); + assert_se(sd_genl_message_new(genl, "should-not-exist", CTRL_CMD_GETFAMILY, &m) == -EOPNOTSUPP); /* an unknown family name must be rejected with exactly -EOPNOTSUPP */ + + /* These families may not be supported by kernel. Hence, ignore results. 
*/ + (void) sd_genl_message_new(genl, FOU_GENL_NAME, 0, &m); + m = sd_netlink_message_unref(m); + (void) sd_genl_message_new(genl, L2TP_GENL_NAME, 0, &m); + m = sd_netlink_message_unref(m); + (void) sd_genl_message_new(genl, MACSEC_GENL_NAME, 0, &m); + m = sd_netlink_message_unref(m); + (void) sd_genl_message_new(genl, NL80211_GENL_NAME, 0, &m); + m = sd_netlink_message_unref(m); + (void) sd_genl_message_new(genl, NETLBL_NLTYPE_UNLABELED_NAME, 0, &m); /* the last message is released by the _cleanup_ handler on m */ + + for (;;) { + r = sd_event_run(event, 500 * USEC_PER_MSEC); /* each run waits at most 500 ms */ + assert_se(r >= 0); + if (r == 0) /* stop on the first run that returns 0 */ + return; + } +} + +static void remove_dummy_interfacep(int *ifindex) { /* Cleanup handler for an ifindex variable: does nothing for a NULL pointer or a non-positive index, otherwise opens its own netlink connection and deletes that link with RTM_DELLINK. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + + if (!ifindex || *ifindex <= 0) + return; + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &message, RTM_DELLINK, *ifindex) >= 0); + assert_se(sd_netlink_call(rtnl, message, 0, NULL) == 1); +} + +TEST(rtnl_set_link_name) { /* Creates a dummy interface named "test-netlink" and exercises setting, listing, deleting and resolving alternative names as well as renaming. Skipped when not root, when capabilities are missing, or when dummy links or alternative names are unsupported. */ + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL, *reply = NULL; + _cleanup_(remove_dummy_interfacep) int ifindex = 0; /* stays 0 (so the cleanup is a no-op) until the link has been looked up */ + _cleanup_strv_free_ char **alternative_names = NULL; + int r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + assert_se(sd_netlink_open(&rtnl) >= 0); + + assert_se(sd_rtnl_message_new_link(rtnl, &message, RTM_NEWLINK, 0) >= 0); + assert_se(sd_netlink_message_append_string(message, IFLA_IFNAME, "test-netlink") >= 0); + assert_se(sd_netlink_message_open_container(message, IFLA_LINKINFO) >= 0); + assert_se(sd_netlink_message_append_string(message, IFLA_INFO_KIND, "dummy") >= 0); + r = sd_netlink_call(rtnl, message, 0, &reply); + if (r == -EPERM) + return (void) log_tests_skipped("missing required capabilities"); + if (r == -EOPNOTSUPP) + return (void) log_tests_skipped("dummy network interface is not supported"); + assert_se(r >= 0); + + 
message = sd_netlink_message_unref(message); + reply = sd_netlink_message_unref(reply); + + assert_se(sd_rtnl_message_new_link(rtnl, &message, RTM_GETLINK, 0) >= 0); /* look the new link up by name to learn its ifindex */ + assert_se(sd_netlink_message_append_string(message, IFLA_IFNAME, "test-netlink") >= 0); + assert_se(sd_netlink_call(rtnl, message, 0, &reply) == 1); + + assert_se(sd_rtnl_message_link_get_ifindex(reply, &ifindex) >= 0); + assert_se(ifindex > 0); + + /* Test that the new name (which is currently an alternative name) is + * restored as an alternative name on error. Create an error by using + * an invalid device name, namely one that exceeds IFNAMSIZ + * (alternative names can exceed IFNAMSIZ, but not regular names). */ + r = rtnl_set_link_alternative_names(&rtnl, ifindex, STRV_MAKE("testlongalternativename", "test-shortname")); + if (r == -EPERM) + return (void) log_tests_skipped("missing required capabilities"); + if (r == -EOPNOTSUPP) + return (void) log_tests_skipped("alternative name is not supported"); + assert_se(r >= 0); + + assert_se(rtnl_get_link_alternative_names(&rtnl, ifindex, &alternative_names) >= 0); + assert_se(strv_contains(alternative_names, "testlongalternativename")); + assert_se(strv_contains(alternative_names, "test-shortname")); + + assert_se(rtnl_set_link_name(&rtnl, ifindex, "testlongalternativename", NULL) == -EINVAL); /* renaming to the over-long name must fail */ + assert_se(rtnl_set_link_name(&rtnl, ifindex, "test-shortname", STRV_MAKE("testlongalternativename", "test-shortname", "test-additional-name")) >= 0); + + alternative_names = strv_free(alternative_names); + assert_se(rtnl_get_link_alternative_names(&rtnl, ifindex, &alternative_names) >= 0); + assert_se(strv_contains(alternative_names, "testlongalternativename")); + assert_se(strv_contains(alternative_names, "test-additional-name")); + assert_se(!strv_contains(alternative_names, "test-shortname")); /* now the primary name, so no longer listed as an alternative */ + + assert_se(rtnl_delete_link_alternative_names(&rtnl, ifindex, STRV_MAKE("testlongalternativename")) >= 0); + + alternative_names = strv_free(alternative_names); + 
assert_se(rtnl_get_link_alternative_names(&rtnl, ifindex, &alternative_names) >= 0); + assert_se(!strv_contains(alternative_names, "testlongalternativename")); + assert_se(strv_contains(alternative_names, "test-additional-name")); + assert_se(!strv_contains(alternative_names, "test-shortname")); + + _cleanup_free_ char *resolved = NULL; + assert_se(rtnl_resolve_link_alternative_name(&rtnl, "test-additional-name", &resolved) == ifindex); /* resolving an alternative name yields the ifindex and the primary name */ + assert_se(streq_ptr(resolved, "test-shortname")); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/libsystemd/sd-network/network-util.c b/src/libsystemd/sd-network/network-util.c new file mode 100644 index 0000000..2059567 --- /dev/null +++ b/src/libsystemd/sd-network/network-util.c @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-network.h" + +#include "alloc-util.h" +#include "network-util.h" +#include "string-table.h" +#include "strv.h" + +bool network_is_online(void) { /* Returns true if the recorded online state is at least "partial". If the online state is unavailable or unparsable, falls back to carrier and address state and errs on the side of reporting online. */ + _cleanup_free_ char *online_state = NULL; + LinkOnlineState state; + int r; + + r = sd_network_get_online_state(&online_state); + if (r < 0) + state = _LINK_ONLINE_STATE_INVALID; + else + state = link_online_state_from_string(online_state); + + if (state >= LINK_ONLINE_STATE_PARTIAL) + return true; + else if (state < 0) { /* unknown online state: use the fallback heuristics below */ + _cleanup_free_ char *carrier_state = NULL, *addr_state = NULL; + + r = sd_network_get_carrier_state(&carrier_state); + if (r < 0) /* if we don't know anything, we consider the system online */ + return true; + + r = sd_network_get_address_state(&addr_state); + if (r < 0) /* if we don't know anything, we consider the system online */ + return true; + + /* we don't know the online state for certain, so make an educated guess */ + if (STR_IN_SET(carrier_state, "degraded-carrier", "carrier") && + STR_IN_SET(addr_state, "routable", "degraded")) + return true; + } + + return false; +} + +static const char* const link_operstate_table[_LINK_OPERSTATE_MAX] = { /* LinkOperationalState value -> string, consumed by DEFINE_STRING_TABLE_LOOKUP(link_operstate, ...) */ + [LINK_OPERSTATE_MISSING] = "missing", + 
[LINK_OPERSTATE_OFF] = "off", + [LINK_OPERSTATE_NO_CARRIER] = "no-carrier", + [LINK_OPERSTATE_DORMANT] = "dormant", + [LINK_OPERSTATE_DEGRADED_CARRIER] = "degraded-carrier", + [LINK_OPERSTATE_CARRIER] = "carrier", + [LINK_OPERSTATE_DEGRADED] = "degraded", + [LINK_OPERSTATE_ENSLAVED] = "enslaved", + [LINK_OPERSTATE_ROUTABLE] = "routable", +}; + +DEFINE_STRING_TABLE_LOOKUP(link_operstate, LinkOperationalState); + +static const char* const link_carrier_state_table[_LINK_CARRIER_STATE_MAX] = { /* LinkCarrierState value -> string; uses the same strings as the operational-state table for the values it covers */ + [LINK_CARRIER_STATE_OFF] = "off", + [LINK_CARRIER_STATE_NO_CARRIER] = "no-carrier", + [LINK_CARRIER_STATE_DORMANT] = "dormant", + [LINK_CARRIER_STATE_DEGRADED_CARRIER] = "degraded-carrier", + [LINK_CARRIER_STATE_CARRIER] = "carrier", + [LINK_CARRIER_STATE_ENSLAVED] = "enslaved", +}; + +DEFINE_STRING_TABLE_LOOKUP(link_carrier_state, LinkCarrierState); + +static const char* const link_required_address_family_table[_ADDRESS_FAMILY_MAX] = { /* AddressFamily value -> string for the "required address family" setting; note NO maps to "any" and YES to "both" */ + [ADDRESS_FAMILY_NO] = "any", + [ADDRESS_FAMILY_IPV4] = "ipv4", + [ADDRESS_FAMILY_IPV6] = "ipv6", + [ADDRESS_FAMILY_YES] = "both", +}; + +DEFINE_STRING_TABLE_LOOKUP(link_required_address_family, AddressFamily); + +static const char* const link_address_state_table[_LINK_ADDRESS_STATE_MAX] = { /* LinkAddressState value -> string */ + [LINK_ADDRESS_STATE_OFF] = "off", + [LINK_ADDRESS_STATE_DEGRADED] = "degraded", + [LINK_ADDRESS_STATE_ROUTABLE] = "routable", +}; + +DEFINE_STRING_TABLE_LOOKUP(link_address_state, LinkAddressState); + +static const char *const link_online_state_table[_LINK_ONLINE_STATE_MAX] = { /* LinkOnlineState value -> string */ + [LINK_ONLINE_STATE_OFFLINE] = "offline", + [LINK_ONLINE_STATE_PARTIAL] = "partial", + [LINK_ONLINE_STATE_ONLINE] = "online", +}; + +DEFINE_STRING_TABLE_LOOKUP(link_online_state, LinkOnlineState); + +int parse_operational_state_range(const char *str, LinkOperationalStateRange *out) { /* Parses "MIN", "MIN:MAX", "MIN:" or ":MAX" into *out. A missing MIN defaults to "missing" and a missing MAX to "routable". Returns 0 on success, -EINVAL for unknown state names, for input with neither bound, or for MIN > MAX, and -ENOMEM on allocation failure. *out is written only on success. */ + LinkOperationalStateRange range = { _LINK_OPERSTATE_INVALID, _LINK_OPERSTATE_INVALID }; + _cleanup_free_ const char *min = NULL; + const char *p; + + assert(str); + assert(out); + + p = strchr(str, 
':'); + if (p) { /* "MIN:MAX" form: everything before the colon is the lower bound */ + min = strndup(str, p - str); + + if (!isempty(p + 1)) { + range.max = link_operstate_from_string(p + 1); + if (range.max < 0) + return -EINVAL; + } + } else /* no colon: the whole string is the lower bound */ + min = strdup(str); + + if (!min) + return -ENOMEM; + + if (!isempty(min)) { + range.min = link_operstate_from_string(min); + if (range.min < 0) + return -EINVAL; + } + + /* Fail on empty strings. */ + if (range.min == _LINK_OPERSTATE_INVALID && range.max == _LINK_OPERSTATE_INVALID) + return -EINVAL; + + if (range.min == _LINK_OPERSTATE_INVALID) /* open lower bound */ + range.min = LINK_OPERSTATE_MISSING; + if (range.max == _LINK_OPERSTATE_INVALID) /* open upper bound */ + range.max = LINK_OPERSTATE_ROUTABLE; + + if (range.min > range.max) + return -EINVAL; + + *out = range; + + return 0; +} + +int network_link_get_operational_state(int ifindex, LinkOperationalState *ret) { /* Fetches the operational state string of link 'ifindex' via sd_network_link_get_operational_state() and converts it to the enum. Returns 0 and sets *ret on success, otherwise the negative value from the lookup or from the string conversion. */ + _cleanup_free_ char *str = NULL; + LinkOperationalState s; + int r; + + assert(ifindex > 0); + assert(ret); + + r = sd_network_link_get_operational_state(ifindex, &str); + if (r < 0) + return r; + + s = link_operstate_from_string(str); + if (s < 0) + return s; + + *ret = s; + return 0; +} diff --git a/src/libsystemd/sd-network/network-util.h b/src/libsystemd/sd-network/network-util.h new file mode 100644 index 0000000..c47e271 --- /dev/null +++ b/src/libsystemd/sd-network/network-util.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <errno.h> +#include <stdbool.h> + +#include "macro.h" + +bool network_is_online(void); + +typedef enum AddressFamily { + /* This is a bitmask, though it usually doesn't feel that way! 
*/ + ADDRESS_FAMILY_NO = 0, + ADDRESS_FAMILY_IPV4 = 1 << 0, + ADDRESS_FAMILY_IPV6 = 1 << 1, + ADDRESS_FAMILY_YES = ADDRESS_FAMILY_IPV4 | ADDRESS_FAMILY_IPV6, + _ADDRESS_FAMILY_MAX, + _ADDRESS_FAMILY_INVALID = -EINVAL, +} AddressFamily; + +typedef enum LinkOperationalState { /* parse_operational_state_range() compares these numerically, so the declaration order defines the ordering of states */ + LINK_OPERSTATE_MISSING, + LINK_OPERSTATE_OFF, + LINK_OPERSTATE_NO_CARRIER, + LINK_OPERSTATE_DORMANT, + LINK_OPERSTATE_DEGRADED_CARRIER, + LINK_OPERSTATE_CARRIER, + LINK_OPERSTATE_DEGRADED, + LINK_OPERSTATE_ENSLAVED, + LINK_OPERSTATE_ROUTABLE, + _LINK_OPERSTATE_MAX, + _LINK_OPERSTATE_INVALID = -EINVAL, +} LinkOperationalState; + +typedef enum LinkCarrierState { /* each value reuses the numeric value of the corresponding LinkOperationalState */ + LINK_CARRIER_STATE_OFF = LINK_OPERSTATE_OFF, + LINK_CARRIER_STATE_NO_CARRIER = LINK_OPERSTATE_NO_CARRIER, + LINK_CARRIER_STATE_DORMANT = LINK_OPERSTATE_DORMANT, + LINK_CARRIER_STATE_DEGRADED_CARRIER = LINK_OPERSTATE_DEGRADED_CARRIER, + LINK_CARRIER_STATE_CARRIER = LINK_OPERSTATE_CARRIER, + LINK_CARRIER_STATE_ENSLAVED = LINK_OPERSTATE_ENSLAVED, + _LINK_CARRIER_STATE_MAX, + _LINK_CARRIER_STATE_INVALID = -EINVAL, +} LinkCarrierState; + +typedef enum LinkAddressState { + LINK_ADDRESS_STATE_OFF, + LINK_ADDRESS_STATE_DEGRADED, + LINK_ADDRESS_STATE_ROUTABLE, + _LINK_ADDRESS_STATE_MAX, + _LINK_ADDRESS_STATE_INVALID = -EINVAL, +} LinkAddressState; + +typedef enum LinkOnlineState { /* network_is_online() treats every value >= LINK_ONLINE_STATE_PARTIAL as online */ + LINK_ONLINE_STATE_OFFLINE, + LINK_ONLINE_STATE_PARTIAL, + LINK_ONLINE_STATE_ONLINE, + _LINK_ONLINE_STATE_MAX, + _LINK_ONLINE_STATE_INVALID = -EINVAL, +} LinkOnlineState; + +const char* link_operstate_to_string(LinkOperationalState s) _const_; +LinkOperationalState link_operstate_from_string(const char *s) _pure_; + +const char* link_carrier_state_to_string(LinkCarrierState s) _const_; +LinkCarrierState link_carrier_state_from_string(const char *s) _pure_; + +const char* link_required_address_family_to_string(AddressFamily s) _const_; +AddressFamily link_required_address_family_from_string(const char *s) _pure_; + +const char* 
link_address_state_to_string(LinkAddressState s) _const_; +LinkAddressState link_address_state_from_string(const char *s) _pure_; + +const char* link_online_state_to_string(LinkOnlineState s) _const_; +LinkOnlineState link_online_state_from_string(const char *s) _pure_; + +typedef struct LinkOperationalStateRange { + LinkOperationalState min; + LinkOperationalState max; +} LinkOperationalStateRange; + +#define LINK_OPERSTATE_RANGE_DEFAULT (LinkOperationalStateRange) { LINK_OPERSTATE_DEGRADED, \ + LINK_OPERSTATE_ROUTABLE } + +int parse_operational_state_range(const char *str, LinkOperationalStateRange *out); +int network_link_get_operational_state(int ifindex, LinkOperationalState *ret); diff --git a/src/libsystemd/sd-network/sd-network.c b/src/libsystemd/sd-network/sd-network.c new file mode 100644 index 0000000..cf3c400 --- /dev/null +++ b/src/libsystemd/sd-network/sd-network.c @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-network.h" + +#include "alloc-util.h" +#include "env-file.h" +#include "fd-util.h" +#include "fs-util.h" +#include "inotify-util.h" +#include "macro.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" + +static int network_get_string(const char *field, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + assert_return(ret, -EINVAL); + + r = parse_env_file(NULL, "/run/systemd/netif/state", field, &s); + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + *ret = TAKE_PTR(s); + return 0; +} + +int sd_network_get_operational_state(char **ret) { + return network_get_string("OPER_STATE", ret); +} + +int sd_network_get_carrier_state(char **ret) { + return network_get_string("CARRIER_STATE", ret); +} + +int sd_network_get_address_state(char **ret) { + return network_get_string("ADDRESS_STATE", ret); +} + +int sd_network_get_ipv4_address_state(char **ret) { + return network_get_string("IPV4_ADDRESS_STATE", ret); +} + +int 
sd_network_get_ipv6_address_state(char **ret) { + return network_get_string("IPV6_ADDRESS_STATE", ret); +} + +int sd_network_get_online_state(char **ret) { + return network_get_string("ONLINE_STATE", ret); +} + +static int network_get_strv(const char *key, char ***ret) { + _cleanup_strv_free_ char **a = NULL; + _cleanup_free_ char *s = NULL; + int r; + + assert_return(ret, -EINVAL); + + r = parse_env_file(NULL, "/run/systemd/netif/state", key, &s); + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + a = strv_split(s, NULL); + if (!a) + return -ENOMEM; + + strv_uniq(a); + r = (int) strv_length(a); + + *ret = TAKE_PTR(a); + return r; +} + +int sd_network_get_dns(char ***ret) { + return network_get_strv("DNS", ret); +} + +int sd_network_get_ntp(char ***ret) { + return network_get_strv("NTP", ret); +} + +int sd_network_get_search_domains(char ***ret) { + return network_get_strv("DOMAINS", ret); +} + +int sd_network_get_route_domains(char ***ret) { + return network_get_strv("ROUTE_DOMAINS", ret); +} + +static int network_link_get_string(int ifindex, const char *field, char **ret) { + char path[STRLEN("/run/systemd/netif/links/") + DECIMAL_STR_MAX(ifindex)]; + _cleanup_free_ char *s = NULL; + int r; + + assert_return(ifindex > 0, -EINVAL); + assert_return(ret, -EINVAL); + + xsprintf(path, "/run/systemd/netif/links/%i", ifindex); + + r = parse_env_file(NULL, path, field, &s); + if (r < 0) + return r; + if (isempty(s)) + return -ENODATA; + + *ret = TAKE_PTR(s); + return 0; +} + +static int network_link_get_boolean(int ifindex, const char *key) { + _cleanup_free_ char *s = NULL; + int r; + + r = network_link_get_string(ifindex, key, &s); + if (r < 0) + return r; + + return parse_boolean(s); +} + +static int network_link_get_strv(int ifindex, const char *key, char ***ret) { + _cleanup_strv_free_ char **a = NULL; + _cleanup_free_ char *s = NULL; + int r; + + assert_return(ifindex > 0, -EINVAL); + assert_return(ret, -EINVAL); + + r = 
network_link_get_string(ifindex, key, &s); + if (r < 0) + return r; + + a = strv_split(s, NULL); + if (!a) + return -ENOMEM; + + strv_uniq(a); + r = (int) strv_length(a); + + *ret = TAKE_PTR(a); + return r; +} + +int sd_network_link_get_setup_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "ADMIN_STATE", ret); +} + +int sd_network_link_get_network_file(int ifindex, char **ret) { + return network_link_get_string(ifindex, "NETWORK_FILE", ret); +} + +int sd_network_link_get_network_file_dropins(int ifindex, char ***ret) { + _cleanup_free_ char **sv = NULL, *joined = NULL; + int r; + + assert_return(ifindex > 0, -EINVAL); + assert_return(ret, -EINVAL); + + r = network_link_get_string(ifindex, "NETWORK_FILE_DROPINS", &joined); + if (r < 0) + return r; + + r = strv_split_full(&sv, joined, ":", EXTRACT_CUNESCAPE); + if (r < 0) + return r; + + *ret = TAKE_PTR(sv); + return 0; +} + +int sd_network_link_get_operational_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "OPER_STATE", ret); +} + +int sd_network_link_get_required_family_for_online(int ifindex, char **ret) { + return network_link_get_string(ifindex, "REQUIRED_FAMILY_FOR_ONLINE", ret); +} + +int sd_network_link_get_carrier_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "CARRIER_STATE", ret); +} + +int sd_network_link_get_address_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "ADDRESS_STATE", ret); +} + +int sd_network_link_get_ipv4_address_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "IPV4_ADDRESS_STATE", ret); +} + +int sd_network_link_get_ipv6_address_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "IPV6_ADDRESS_STATE", ret); +} + +int sd_network_link_get_online_state(int ifindex, char **ret) { + return network_link_get_string(ifindex, "ONLINE_STATE", ret); +} + +int sd_network_link_get_dhcp6_client_iaid_string(int ifindex, char **ret) { + return 
network_link_get_string(ifindex, "DHCP6_CLIENT_IAID", ret); +} + +int sd_network_link_get_dhcp6_client_duid_string(int ifindex, char **ret) { + return network_link_get_string(ifindex, "DHCP6_CLIENT_DUID", ret); +} + +int sd_network_link_get_required_for_online(int ifindex) { + return network_link_get_boolean(ifindex, "REQUIRED_FOR_ONLINE"); +} + +int sd_network_link_get_required_operstate_for_online(int ifindex, char **ret) { + return network_link_get_string(ifindex, "REQUIRED_OPER_STATE_FOR_ONLINE", ret); +} + +int sd_network_link_get_activation_policy(int ifindex, char **ret) { + return network_link_get_string(ifindex, "ACTIVATION_POLICY", ret); +} + +int sd_network_link_get_llmnr(int ifindex, char **ret) { + return network_link_get_string(ifindex, "LLMNR", ret); +} + +int sd_network_link_get_mdns(int ifindex, char **ret) { + return network_link_get_string(ifindex, "MDNS", ret); +} + +int sd_network_link_get_dns_over_tls(int ifindex, char **ret) { + return network_link_get_string(ifindex, "DNS_OVER_TLS", ret); +} + +int sd_network_link_get_dnssec(int ifindex, char **ret) { + return network_link_get_string(ifindex, "DNSSEC", ret); +} + +int sd_network_link_get_dnssec_negative_trust_anchors(int ifindex, char ***ret) { + return network_link_get_strv(ifindex, "DNSSEC_NTA", ret); +} + +int sd_network_link_get_dns(int ifindex, char ***ret) { + return network_link_get_strv(ifindex, "DNS", ret); +} + +int sd_network_link_get_ntp(int ifindex, char ***ret) { + return network_link_get_strv(ifindex, "NTP", ret); +} + +int sd_network_link_get_sip(int ifindex, char ***ret) { + return network_link_get_strv(ifindex, "SIP", ret); +} + +int sd_network_link_get_captive_portal(int ifindex, char **ret) { + return network_link_get_string(ifindex, "CAPTIVE_PORTAL", ret); +} + +int sd_network_link_get_search_domains(int ifindex, char ***ret) { + return network_link_get_strv(ifindex, "DOMAINS", ret); +} + +int sd_network_link_get_route_domains(int ifindex, char ***ret) { + return 
network_link_get_strv(ifindex, "ROUTE_DOMAINS", ret); +} + +int sd_network_link_get_dns_default_route(int ifindex) { + return network_link_get_boolean(ifindex, "DNS_DEFAULT_ROUTE"); +} + +static int network_link_get_ifindexes(int ifindex, const char *key, int **ret) { + _cleanup_free_ int *ifis = NULL; + _cleanup_free_ char *s = NULL; + size_t c = 0; + int r; + + assert_return(ifindex > 0, -EINVAL); + assert_return(ret, -EINVAL); + + r = network_link_get_string(ifindex, key, &s); + if (r < 0) + return r; + + for (const char *x = s;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&x, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + if (!GREEDY_REALLOC(ifis, c + 2)) + return -ENOMEM; + + r = ifis[c++] = parse_ifindex(word); + if (r < 0) + return r; + } + + if (ifis) + ifis[c] = 0; /* Let's add a 0 ifindex to the end, to be nice */ + + *ret = TAKE_PTR(ifis); + return c; +} + +int sd_network_link_get_carrier_bound_to(int ifindex, int **ret) { + return network_link_get_ifindexes(ifindex, "CARRIER_BOUND_TO", ret); +} + +int sd_network_link_get_carrier_bound_by(int ifindex, int **ret) { + return network_link_get_ifindexes(ifindex, "CARRIER_BOUND_BY", ret); +} + +int sd_network_link_get_stat(int ifindex, struct stat *ret) { + char path[STRLEN("/run/systemd/netif/links/") + DECIMAL_STR_MAX(ifindex)]; + struct stat st; + + assert_return(ifindex > 0, -EINVAL); + + xsprintf(path, "/run/systemd/netif/links/%i", ifindex); + + if (stat(path, &st) < 0) + return -errno; + + if (ret) + *ret = st; + + return 0; +} + +static int MONITOR_TO_FD(sd_network_monitor *m) { + return (int) (unsigned long) m - 1; +} + +static sd_network_monitor* FD_TO_MONITOR(int fd) { + return (sd_network_monitor*) (unsigned long) (fd + 1); +} + +static int monitor_add_inotify_watch(int fd) { + int wd; + + wd = inotify_add_watch(fd, "/run/systemd/netif/links/", IN_MOVED_TO|IN_DELETE); + if (wd >= 0) + return wd; + else if (errno != ENOENT) + return -errno; + + wd = 
inotify_add_watch(fd, "/run/systemd/netif/", IN_CREATE|IN_ISDIR); + if (wd >= 0) + return wd; + else if (errno != ENOENT) + return -errno; + + wd = inotify_add_watch(fd, "/run/systemd/", IN_CREATE|IN_ISDIR); + if (wd < 0) + return -errno; + + return wd; +} + +int sd_network_monitor_new(sd_network_monitor **m, const char *category) { + _cleanup_close_ int fd = -EBADF; + int k; + bool good = false; + + assert_return(m, -EINVAL); + + fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (fd < 0) + return -errno; + + if (!category || streq(category, "links")) { + k = monitor_add_inotify_watch(fd); + if (k < 0) + return k; + + good = true; + } + + if (!good) + return -EINVAL; + + *m = FD_TO_MONITOR(TAKE_FD(fd)); + return 0; +} + +sd_network_monitor* sd_network_monitor_unref(sd_network_monitor *m) { + if (m) + (void) close_nointr(MONITOR_TO_FD(m)); + + return NULL; +} + +int sd_network_monitor_flush(sd_network_monitor *m) { + union inotify_event_buffer buffer; + ssize_t l; + int fd; + + assert_return(m, -EINVAL); + + fd = MONITOR_TO_FD(m); + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return -errno; + } + + FOREACH_INOTIFY_EVENT(e, buffer, l) { + if (e->mask & IN_ISDIR) { + int wd; + + wd = monitor_add_inotify_watch(fd); + if (wd < 0) + return wd; + + if (wd != e->wd) { + if (inotify_rm_watch(fd, e->wd) < 0) + return -errno; + } + } + } + + return 0; +} + +int sd_network_monitor_get_fd(sd_network_monitor *m) { + assert_return(m, -EINVAL); + + return MONITOR_TO_FD(m); +} + +int sd_network_monitor_get_events(sd_network_monitor *m) { + assert_return(m, -EINVAL); + + /* For now we will only return POLLIN here, since we don't + * need anything else ever for inotify. However, let's have + * this API to keep our options open should we later on need + * it. 
*/ + return POLLIN; +} + +int sd_network_monitor_get_timeout(sd_network_monitor *m, uint64_t *ret_usec) { + assert_return(m, -EINVAL); + assert_return(ret_usec, -EINVAL); + + /* For now we will only return UINT64_MAX, since we don't + * need any timeout. However, let's have this API to keep our + * options open should we later on need it. */ + *ret_usec = UINT64_MAX; + return 0; +} diff --git a/src/libsystemd/sd-path/sd-path.c b/src/libsystemd/sd-path/sd-path.c new file mode 100644 index 0000000..7290d1c --- /dev/null +++ b/src/libsystemd/sd-path/sd-path.c @@ -0,0 +1,693 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-path.h" + +#include "alloc-util.h" +#include "architecture.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "nulstr-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static int from_environment(const char *envname, const char *fallback, const char **ret) { + assert(ret); + + if (envname) { + const char *e; + + e = secure_getenv(envname); + if (e && path_is_absolute(e)) { + *ret = e; + return 0; + } + } + + if (fallback) { + *ret = fallback; + return 0; + } + + return -ENXIO; +} + +static int from_home_dir(const char *envname, const char *suffix, char **buffer, const char **ret) { + _cleanup_free_ char *h = NULL; + int r; + + assert(suffix); + assert(buffer); + assert(ret); + + if (envname) { + const char *e = NULL; + + e = secure_getenv(envname); + if (e && path_is_absolute(e)) { + *ret = e; + return 0; + } + } + + r = get_home_dir(&h); + if (r < 0) + return r; + + if (!path_extend(&h, suffix)) + return -ENOMEM; + + *buffer = h; + *ret = TAKE_PTR(h); + return 0; +} + +static int from_user_dir(const char *field, char **buffer, const char **ret) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *b = NULL; + _cleanup_free_ const char *fn = NULL; + const char *c = NULL; + int r; + + assert(field); + assert(buffer); + 
assert(ret); + + r = from_home_dir("XDG_CONFIG_HOME", ".config", &b, &c); + if (r < 0) + return r; + + fn = path_join(c, "user-dirs.dirs"); + if (!fn) + return -ENOMEM; + + f = fopen(fn, "re"); + if (!f) { + if (errno == ENOENT) + goto fallback; + + return -errno; + } + + /* This is an awful parse, but it follows closely what xdg-user-dirs does upstream */ + for (;;) { + _cleanup_free_ char *line = NULL; + char *p, *e; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + p = startswith(line, field); + if (!p) + continue; + + p += strspn(p, WHITESPACE); + + if (*p != '=') + continue; + p++; + + p += strspn(p, WHITESPACE); + + if (*p != '"') + continue; + p++; + + e = strrchr(p, '"'); + if (!e) + continue; + *e = 0; + + /* Three syntaxes permitted: relative to $HOME, $HOME itself, and absolute path */ + if (startswith(p, "$HOME/")) { + _cleanup_free_ char *h = NULL; + + r = get_home_dir(&h); + if (r < 0) + return r; + + if (!path_extend(&h, p+5)) + return -ENOMEM; + + *buffer = h; + *ret = TAKE_PTR(h); + return 0; + } else if (streq(p, "$HOME")) { + + r = get_home_dir(buffer); + if (r < 0) + return r; + + *ret = *buffer; + return 0; + } else if (path_is_absolute(p)) { + char *copy; + + copy = strdup(p); + if (!copy) + return -ENOMEM; + + *buffer = copy; + *ret = copy; + return 0; + } + } + +fallback: + /* The desktop directory defaults to $HOME/Desktop, the others to $HOME */ + if (streq(field, "XDG_DESKTOP_DIR")) { + _cleanup_free_ char *h = NULL; + + r = get_home_dir(&h); + if (r < 0) + return r; + + if (!path_extend(&h, "Desktop")) + return -ENOMEM; + + *buffer = h; + *ret = TAKE_PTR(h); + } else { + r = get_home_dir(buffer); + if (r < 0) + return r; + + *ret = *buffer; + } + + return 0; +} + +static int get_path(uint64_t type, char **buffer, const char **ret) { + int r; + + assert(buffer); + assert(ret); + + switch (type) { + + case SD_PATH_TEMPORARY: + return tmp_dir(ret); + + case SD_PATH_TEMPORARY_LARGE: + 
return var_tmp_dir(ret); + + case SD_PATH_SYSTEM_BINARIES: + *ret = "/usr/bin"; + return 0; + + case SD_PATH_SYSTEM_INCLUDE: + *ret = "/usr/include"; + return 0; + + case SD_PATH_SYSTEM_LIBRARY_PRIVATE: + *ret = "/usr/lib"; + return 0; + + case SD_PATH_SYSTEM_LIBRARY_ARCH: + *ret = LIBDIR; + return 0; + + case SD_PATH_SYSTEM_SHARED: + *ret = "/usr/share"; + return 0; + + case SD_PATH_SYSTEM_CONFIGURATION_FACTORY: + *ret = "/usr/share/factory/etc"; + return 0; + + case SD_PATH_SYSTEM_STATE_FACTORY: + *ret = "/usr/share/factory/var"; + return 0; + + case SD_PATH_SYSTEM_CONFIGURATION: + *ret = "/etc"; + return 0; + + case SD_PATH_SYSTEM_RUNTIME: + *ret = "/run"; + return 0; + + case SD_PATH_SYSTEM_RUNTIME_LOGS: + *ret = "/run/log"; + return 0; + + case SD_PATH_SYSTEM_STATE_PRIVATE: + *ret = "/var/lib"; + return 0; + + case SD_PATH_SYSTEM_STATE_LOGS: + *ret = "/var/log"; + return 0; + + case SD_PATH_SYSTEM_STATE_CACHE: + *ret = "/var/cache"; + return 0; + + case SD_PATH_SYSTEM_STATE_SPOOL: + *ret = "/var/spool"; + return 0; + + case SD_PATH_USER_BINARIES: + return from_home_dir(NULL, ".local/bin", buffer, ret); + + case SD_PATH_USER_LIBRARY_PRIVATE: + return from_home_dir(NULL, ".local/lib", buffer, ret); + + case SD_PATH_USER_LIBRARY_ARCH: + return from_home_dir(NULL, ".local/lib/" LIB_ARCH_TUPLE, buffer, ret); + + case SD_PATH_USER_SHARED: + return from_home_dir("XDG_DATA_HOME", ".local/share", buffer, ret); + + case SD_PATH_USER_CONFIGURATION: + return from_home_dir("XDG_CONFIG_HOME", ".config", buffer, ret); + + case SD_PATH_USER_RUNTIME: + return from_environment("XDG_RUNTIME_DIR", NULL, ret); + + case SD_PATH_USER_STATE_CACHE: + return from_home_dir("XDG_CACHE_HOME", ".cache", buffer, ret); + + case SD_PATH_USER_STATE_PRIVATE: + return from_home_dir("XDG_STATE_HOME", ".local/state", buffer, ret); + + case SD_PATH_USER: + r = get_home_dir(buffer); + if (r < 0) + return r; + + *ret = *buffer; + return 0; + + case SD_PATH_USER_DOCUMENTS: + return 
from_user_dir("XDG_DOCUMENTS_DIR", buffer, ret); + + case SD_PATH_USER_MUSIC: + return from_user_dir("XDG_MUSIC_DIR", buffer, ret); + + case SD_PATH_USER_PICTURES: + return from_user_dir("XDG_PICTURES_DIR", buffer, ret); + + case SD_PATH_USER_VIDEOS: + return from_user_dir("XDG_VIDEOS_DIR", buffer, ret); + + case SD_PATH_USER_DOWNLOAD: + return from_user_dir("XDG_DOWNLOAD_DIR", buffer, ret); + + case SD_PATH_USER_PUBLIC: + return from_user_dir("XDG_PUBLICSHARE_DIR", buffer, ret); + + case SD_PATH_USER_TEMPLATES: + return from_user_dir("XDG_TEMPLATES_DIR", buffer, ret); + + case SD_PATH_USER_DESKTOP: + return from_user_dir("XDG_DESKTOP_DIR", buffer, ret); + + case SD_PATH_SYSTEMD_UTIL: + *ret = PREFIX_NOSLASH "/lib/systemd"; + return 0; + + case SD_PATH_SYSTEMD_SYSTEM_UNIT: + *ret = SYSTEM_DATA_UNIT_DIR; + return 0; + + case SD_PATH_SYSTEMD_SYSTEM_PRESET: + *ret = PREFIX_NOSLASH "/lib/systemd/system-preset"; + return 0; + + case SD_PATH_SYSTEMD_USER_UNIT: + *ret = USER_DATA_UNIT_DIR; + return 0; + + case SD_PATH_SYSTEMD_USER_PRESET: + *ret = PREFIX_NOSLASH "/lib/systemd/user-preset"; + return 0; + + case SD_PATH_SYSTEMD_SYSTEM_CONF: + *ret = SYSTEM_CONFIG_UNIT_DIR; + return 0; + + case SD_PATH_SYSTEMD_USER_CONF: + *ret = USER_CONFIG_UNIT_DIR; + return 0; + + case SD_PATH_SYSTEMD_SYSTEM_GENERATOR: + *ret = SYSTEM_GENERATOR_DIR; + return 0; + + case SD_PATH_SYSTEMD_USER_GENERATOR: + *ret = USER_GENERATOR_DIR; + return 0; + + case SD_PATH_SYSTEMD_SLEEP: + *ret = PREFIX_NOSLASH "/lib/systemd/system-sleep"; + return 0; + + case SD_PATH_SYSTEMD_SHUTDOWN: + *ret = PREFIX_NOSLASH "/lib/systemd/system-shutdown"; + return 0; + + case SD_PATH_TMPFILES: + *ret = "/usr/lib/tmpfiles.d"; + return 0; + + case SD_PATH_SYSUSERS: + *ret = PREFIX_NOSLASH "/lib/sysusers.d"; + return 0; + + case SD_PATH_SYSCTL: + *ret = PREFIX_NOSLASH "/lib/sysctl.d"; + return 0; + + case SD_PATH_BINFMT: + *ret = PREFIX_NOSLASH "/lib/binfmt.d"; + return 0; + + case SD_PATH_MODULES_LOAD: + *ret = 
PREFIX_NOSLASH "/lib/modules-load.d"; + return 0; + + case SD_PATH_CATALOG: + *ret = "/usr/lib/systemd/catalog"; + return 0; + + case SD_PATH_SYSTEMD_SYSTEM_ENVIRONMENT_GENERATOR: + *ret = SYSTEM_ENV_GENERATOR_DIR; + return 0; + + case SD_PATH_SYSTEMD_USER_ENVIRONMENT_GENERATOR: + *ret = USER_ENV_GENERATOR_DIR; + return 0; + } + + return -EOPNOTSUPP; +} + +static int get_path_alloc(uint64_t type, const char *suffix, char **path) { + _cleanup_free_ char *buffer = NULL; + char *buffer2 = NULL; + const char *ret; + int r; + + assert(path); + + r = get_path(type, &buffer, &ret); + if (r < 0) + return r; + + if (suffix) { + suffix += strspn(suffix, "/"); + buffer2 = path_join(ret, suffix); + if (!buffer2) + return -ENOMEM; + } else if (!buffer) { + buffer = strdup(ret); + if (!buffer) + return -ENOMEM; + } + + *path = buffer2 ?: TAKE_PTR(buffer); + return 0; +} + +_public_ int sd_path_lookup(uint64_t type, const char *suffix, char **path) { + int r; + + assert_return(path, -EINVAL); + + r = get_path_alloc(type, suffix, path); + if (r != -EOPNOTSUPP) + return r; + + /* Fall back to sd_path_lookup_strv */ + _cleanup_strv_free_ char **l = NULL; + char *buffer; + + r = sd_path_lookup_strv(type, suffix, &l); + if (r < 0) + return r; + + buffer = strv_join(l, ":"); + if (!buffer) + return -ENOMEM; + + *path = buffer; + return 0; +} + +static int search_from_environment( + char ***list, + const char *env_home, + const char *home_suffix, + const char *env_search, + bool env_search_sufficient, + const char *first, ...) 
{ + + _cleanup_strv_free_ char **l = NULL; + const char *e; + char *h = NULL; + int r; + + assert(list); + + if (env_search) { + e = secure_getenv(env_search); + if (e) { + l = strv_split(e, ":"); + if (!l) + return -ENOMEM; + + if (env_search_sufficient) { + *list = TAKE_PTR(l); + return 0; + } + } + } + + if (!l && first) { + va_list ap; + + va_start(ap, first); + l = strv_new_ap(first, ap); + va_end(ap); + + if (!l) + return -ENOMEM; + } + + if (env_home) { + e = secure_getenv(env_home); + if (e && path_is_absolute(e)) { + h = strdup(e); + if (!h) + return -ENOMEM; + } + } + + if (!h && home_suffix) { + e = secure_getenv("HOME"); + if (e && path_is_absolute(e)) { + h = path_join(e, home_suffix); + if (!h) + return -ENOMEM; + } + } + + if (h) { + r = strv_consume_prepend(&l, h); + if (r < 0) + return -ENOMEM; + } + + *list = TAKE_PTR(l); + return 0; +} + +#if HAVE_SPLIT_BIN +# define ARRAY_SBIN_BIN(x) x "sbin", x "bin" +#else +# define ARRAY_SBIN_BIN(x) x "bin" +#endif + +static int get_search(uint64_t type, char ***list) { + int r; + + assert(list); + + switch (type) { + + case SD_PATH_SEARCH_BINARIES: + return search_from_environment(list, + NULL, + ".local/bin", + "PATH", + true, + ARRAY_SBIN_BIN("/usr/local/"), + ARRAY_SBIN_BIN("/usr/"), + NULL); + + case SD_PATH_SEARCH_LIBRARY_PRIVATE: + return search_from_environment(list, + NULL, + ".local/lib", + NULL, + false, + "/usr/local/lib", + "/usr/lib", + NULL); + + case SD_PATH_SEARCH_LIBRARY_ARCH: + return search_from_environment(list, + NULL, + ".local/lib/" LIB_ARCH_TUPLE, + "LD_LIBRARY_PATH", + true, + LIBDIR, + NULL); + + case SD_PATH_SEARCH_SHARED: + return search_from_environment(list, + "XDG_DATA_HOME", + ".local/share", + "XDG_DATA_DIRS", + false, + "/usr/local/share", + "/usr/share", + NULL); + + case SD_PATH_SEARCH_CONFIGURATION_FACTORY: + return search_from_environment(list, + NULL, + NULL, + NULL, + false, + "/usr/local/share/factory/etc", + "/usr/share/factory/etc", + NULL); + + case 
SD_PATH_SEARCH_STATE_FACTORY: + return search_from_environment(list, + NULL, + NULL, + NULL, + false, + "/usr/local/share/factory/var", + "/usr/share/factory/var", + NULL); + + case SD_PATH_SEARCH_CONFIGURATION: + return search_from_environment(list, + "XDG_CONFIG_HOME", + ".config", + "XDG_CONFIG_DIRS", + false, + "/etc", + NULL); + + case SD_PATH_SEARCH_BINARIES_DEFAULT: + return strv_from_nulstr(list, DEFAULT_PATH_NULSTR); + + case SD_PATH_SYSTEMD_SEARCH_SYSTEM_UNIT: + case SD_PATH_SYSTEMD_SEARCH_USER_UNIT: { + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + RuntimeScope scope = type == SD_PATH_SYSTEMD_SEARCH_SYSTEM_UNIT ? + RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER; + + r = lookup_paths_init(&lp, scope, 0, NULL); + if (r < 0) + return r; + + *list = TAKE_PTR(lp.search_path); + return 0; + } + + case SD_PATH_SYSTEMD_SEARCH_SYSTEM_GENERATOR: + case SD_PATH_SYSTEMD_SEARCH_USER_GENERATOR: { + RuntimeScope scope = type == SD_PATH_SYSTEMD_SEARCH_SYSTEM_GENERATOR ? + RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER; + char **t; + + t = generator_binary_paths(scope); + if (!t) + return -ENOMEM; + + *list = t; + return 0; + } + + case SD_PATH_SYSTEMD_SEARCH_SYSTEM_ENVIRONMENT_GENERATOR: + case SD_PATH_SYSTEMD_SEARCH_USER_ENVIRONMENT_GENERATOR: { + char **t; + + t = env_generator_binary_paths(type == SD_PATH_SYSTEMD_SEARCH_SYSTEM_ENVIRONMENT_GENERATOR ? 
+ RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER); + if (!t) + return -ENOMEM; + + *list = t; + return 0; + } + + case SD_PATH_SYSTEMD_SEARCH_NETWORK: + return strv_from_nulstr(list, NETWORK_DIRS_NULSTR); + + } + + return -EOPNOTSUPP; +} + +_public_ int sd_path_lookup_strv(uint64_t type, const char *suffix, char ***paths) { + _cleanup_strv_free_ char **l = NULL, **n = NULL; + int r; + + assert_return(paths, -EINVAL); + + r = get_search(type, &l); + if (r == -EOPNOTSUPP) { + _cleanup_free_ char *t = NULL; + + r = get_path_alloc(type, suffix, &t); + if (r < 0) + return r; + + l = new(char*, 2); + if (!l) + return -ENOMEM; + l[0] = TAKE_PTR(t); + l[1] = NULL; + + *paths = TAKE_PTR(l); + return 0; + + } else if (r < 0) + return r; + + if (!suffix) { + *paths = TAKE_PTR(l); + return 0; + } + + n = new(char*, strv_length(l)+1); + if (!n) + return -ENOMEM; + + char **j = n; + STRV_FOREACH(i, l) { + *j = path_join(*i, suffix); + if (!*j) + return -ENOMEM; + + j++; + } + *j = NULL; + + *paths = TAKE_PTR(n); + return 0; +} diff --git a/src/libsystemd/sd-resolve/resolve-private.h b/src/libsystemd/sd-resolve/resolve-private.h new file mode 100644 index 0000000..7a339f7 --- /dev/null +++ b/src/libsystemd/sd-resolve/resolve-private.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-resolve.h" + +int resolve_getaddrinfo_with_destroy_callback( + sd_resolve *resolve, sd_resolve_query **q, + const char *node, const char *service, const struct addrinfo *hints, + sd_resolve_getaddrinfo_handler_t callback, + sd_resolve_destroy_t destroy_callback, void *userdata); +int resolve_getnameinfo_with_destroy_callback( + sd_resolve *resolve, sd_resolve_query **q, + const struct sockaddr *sa, socklen_t salen, int flags, uint64_t get, + sd_resolve_getnameinfo_handler_t callback, + sd_resolve_destroy_t destroy_callback, void *userdata); + +#define resolve_getaddrinfo(resolve, ret_query, node, service, hints, callback, destroy_callback, userdata) \ + ({ \ 
+ int (*_callback_)(sd_resolve_query*, int, const struct addrinfo*, typeof(userdata)) = callback; \ + void (*_destroy_)(typeof(userdata)) = destroy_callback; \ + resolve_getaddrinfo_with_destroy_callback( \ + resolve, ret_query, \ + node, service, hints, \ + (sd_resolve_getaddrinfo_handler_t) _callback_, \ + (sd_resolve_destroy_t) _destroy_, \ + userdata); \ + }) + +#define resolve_getnameinfo(resolve, ret_query, sa, salen, flags, get, callback, destroy_callback, userdata) \ + ({ /* Type-checks both handlers against typeof(userdata), then forwards to the getnameinfo variant. */ \ + int (*_callback_)(sd_resolve_query*, int, const char*, const char*, typeof(userdata)) = callback; \ + void (*_destroy_)(typeof(userdata)) = destroy_callback; \ + resolve_getnameinfo_with_destroy_callback( \ + resolve, ret_query, \ + sa, salen, flags, get, \ + (sd_resolve_getnameinfo_handler_t) _callback_, \ + (sd_resolve_destroy_t) _destroy_, \ + userdata); \ + }) diff --git a/src/libsystemd/sd-resolve/sd-resolve.c b/src/libsystemd/sd-resolve/sd-resolve.c new file mode 100644 index 0000000..2000f86 --- /dev/null +++ b/src/libsystemd/sd-resolve/sd-resolve.c @@ -0,0 +1,1296 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-resolve.h" + +#include "alloc-util.h" +#include "dns-def.h" +#include "errno-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "iovec-util.h" +#include "list.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "missing_threads.h" +#include "process-util.h" +#include "resolve-private.h" +#include "socket-util.h" + +#define WORKERS_MIN 1U +#define WORKERS_MAX 16U +#define QUERIES_MAX 256U +#define BUFSIZE 10240U + +typedef enum { + REQUEST_ADDRINFO, + RESPONSE_ADDRINFO, + REQUEST_NAMEINFO, + RESPONSE_NAMEINFO, + REQUEST_TERMINATE, + RESPONSE_DIED +} QueryType; + +enum { + REQUEST_RECV_FD, + REQUEST_SEND_FD, + RESPONSE_RECV_FD, + RESPONSE_SEND_FD, + _FD_MAX +}; + +struct sd_resolve { + unsigned n_ref; + + bool
dead:1; + pid_t original_pid; + + int fds[_FD_MAX]; + + pthread_t workers[WORKERS_MAX]; + unsigned n_valid_workers; + + unsigned current_id; + sd_resolve_query* query_array[QUERIES_MAX]; + unsigned n_queries, n_done, n_outstanding; + + sd_event_source *event_source; + sd_event *event; + + sd_resolve_query *current; + + sd_resolve **default_resolve_ptr; + pid_t tid; + + LIST_HEAD(sd_resolve_query, queries); +}; + +struct sd_resolve_query { + unsigned n_ref; + + sd_resolve *resolve; + + QueryType type:4; + bool done:1; + bool floating:1; + unsigned id; + + int ret; + int _errno; + int _h_errno; + struct addrinfo *addrinfo; + char *serv, *host; + + union { + sd_resolve_getaddrinfo_handler_t getaddrinfo_handler; + sd_resolve_getnameinfo_handler_t getnameinfo_handler; + }; + + void *userdata; + sd_resolve_destroy_t destroy_callback; + + LIST_FIELDS(sd_resolve_query, queries); +}; + +typedef struct RHeader { + QueryType type; + unsigned id; + size_t length; +} RHeader; + +typedef struct AddrInfoRequest { + struct RHeader header; + bool hints_valid; + int ai_flags; + int ai_family; + int ai_socktype; + int ai_protocol; + size_t node_len, service_len; +} AddrInfoRequest; + +typedef struct AddrInfoResponse { + struct RHeader header; + int ret; + int _errno; + int _h_errno; + /* followed by addrinfo_serialization[] */ +} AddrInfoResponse; + +typedef struct AddrInfoSerialization { + int ai_flags; + int ai_family; + int ai_socktype; + int ai_protocol; + size_t ai_addrlen; + size_t canonname_len; + /* Followed by ai_addr and ai_canonname with variable lengths */ +} AddrInfoSerialization; + +typedef struct NameInfoRequest { + struct RHeader header; + int flags; + socklen_t sockaddr_len; + bool gethost:1, getserv:1; +} NameInfoRequest; + +typedef struct NameInfoResponse { + struct RHeader header; + size_t hostlen, servlen; + int ret; + int _errno; + int _h_errno; +} NameInfoResponse; + +typedef union Packet { + RHeader rheader; + AddrInfoRequest addrinfo_request; +
AddrInfoResponse addrinfo_response; + NameInfoRequest nameinfo_request; + NameInfoResponse nameinfo_response; +} Packet; + +static int getaddrinfo_done(sd_resolve_query* q); +static int getnameinfo_done(sd_resolve_query *q); + +static void resolve_query_disconnect(sd_resolve_query *q); + +#define RESOLVE_DONT_DESTROY(resolve) \ + _cleanup_(sd_resolve_unrefp) _unused_ sd_resolve *_dont_destroy_##resolve = sd_resolve_ref(resolve) + +static void query_assign_errno(sd_resolve_query *q, int ret, int error, int h_error) { + assert(q); + + q->ret = ret; + q->_errno = abs(error); + q->_h_errno = h_error; +} + +static int send_died(int out_fd) { + RHeader rh = { + .type = RESPONSE_DIED, + .length = sizeof(RHeader), + }; + + assert(out_fd >= 0); + + if (send(out_fd, &rh, rh.length, MSG_NOSIGNAL) < 0) + return -errno; + + return 0; +} + +static void *serialize_addrinfo(void *p, const struct addrinfo *ai, size_t *length, size_t maxlength) { + AddrInfoSerialization s; + size_t cnl, l; + + assert(p); + assert(ai); + assert(length); + assert(*length <= maxlength); + + cnl = ai->ai_canonname ? 
strlen(ai->ai_canonname)+1 : 0; + l = sizeof(AddrInfoSerialization) + ai->ai_addrlen + cnl; + + if (*length + l > maxlength) + return NULL; + + s = (AddrInfoSerialization) { + .ai_flags = ai->ai_flags, + .ai_family = ai->ai_family, + .ai_socktype = ai->ai_socktype, + .ai_protocol = ai->ai_protocol, + .ai_addrlen = ai->ai_addrlen, + .canonname_len = cnl, + }; + + memcpy((uint8_t*) p, &s, sizeof(AddrInfoSerialization)); + memcpy((uint8_t*) p + sizeof(AddrInfoSerialization), ai->ai_addr, ai->ai_addrlen); + memcpy_safe((char*) p + sizeof(AddrInfoSerialization) + ai->ai_addrlen, + ai->ai_canonname, cnl); + + *length += l; + return (uint8_t*) p + l; +} + +static int send_addrinfo_reply( + int out_fd, + unsigned id, + int ret, + struct addrinfo *ai, + int _errno, + int _h_errno) { + + AddrInfoResponse resp = {}; + union { + AddrInfoSerialization ais; + uint8_t space[BUFSIZE]; + } buffer; + struct iovec iov[2]; + struct msghdr mh; + + assert(out_fd >= 0); + + resp = (AddrInfoResponse) { + .header.type = RESPONSE_ADDRINFO, + .header.id = id, + .header.length = sizeof(AddrInfoResponse), + .ret = ret, + ._errno = _errno, + ._h_errno = _h_errno, + }; + + msan_unpoison(&resp, sizeof(resp)); + + if (ret == 0 && ai) { + void *p = &buffer; + struct addrinfo *k; + + for (k = ai; k; k = k->ai_next) { + p = serialize_addrinfo(p, k, &resp.header.length, (uint8_t*) &buffer + BUFSIZE - (uint8_t*) p); + if (!p) { + freeaddrinfo(ai); + return -ENOBUFS; + } + } + } + + if (ai) + freeaddrinfo(ai); + + iov[0] = IOVEC_MAKE(&resp, sizeof(AddrInfoResponse)); + iov[1] = IOVEC_MAKE(&buffer, resp.header.length - sizeof(AddrInfoResponse)); + + mh = (struct msghdr) { + .msg_iov = iov, + .msg_iovlen = ELEMENTSOF(iov) + }; + + if (sendmsg(out_fd, &mh, MSG_NOSIGNAL) < 0) + return -errno; + + return 0; +} + +static int send_nameinfo_reply( + int out_fd, + unsigned id, + int ret, + const char *host, + const char *serv, + int _errno, + int _h_errno) { + + NameInfoResponse resp = {}; + struct iovec iov[3]; 
+ struct msghdr mh; + size_t hl, sl; + + assert(out_fd >= 0); + + sl = serv ? strlen(serv)+1 : 0; + hl = host ? strlen(host)+1 : 0; + + resp = (NameInfoResponse) { + .header.type = RESPONSE_NAMEINFO, + .header.id = id, + .header.length = sizeof(NameInfoResponse) + hl + sl, + .hostlen = hl, + .servlen = sl, + .ret = ret, + ._errno = _errno, + ._h_errno = _h_errno, + }; + + msan_unpoison(&resp, sizeof(resp)); + + iov[0] = IOVEC_MAKE(&resp, sizeof(NameInfoResponse)); + iov[1] = IOVEC_MAKE((void*) host, hl); + iov[2] = IOVEC_MAKE((void*) serv, sl); + + mh = (struct msghdr) { + .msg_iov = iov, + .msg_iovlen = ELEMENTSOF(iov) + }; + + if (sendmsg(out_fd, &mh, MSG_NOSIGNAL) < 0) + return -errno; + + return 0; +} + +static int handle_request(int out_fd, const Packet *packet, size_t length) { + const RHeader *req; + + assert(out_fd >= 0); + assert(packet); + + req = &packet->rheader; + + assert_return(length >= sizeof(RHeader), -EIO); + assert_return(length == req->length, -EIO); + + switch (req->type) { + + case REQUEST_ADDRINFO: { + const AddrInfoRequest *ai_req = &packet->addrinfo_request; + struct addrinfo hints, *result = NULL; + const char *node, *service; + int ret; + + assert_return(length >= sizeof(AddrInfoRequest), -EBADMSG); + assert_return(length == sizeof(AddrInfoRequest) + ai_req->node_len + ai_req->service_len, -EBADMSG); + + hints = (struct addrinfo) { + .ai_flags = ai_req->ai_flags, + .ai_family = ai_req->ai_family, + .ai_socktype = ai_req->ai_socktype, + .ai_protocol = ai_req->ai_protocol, + }; + + msan_unpoison(&hints, sizeof(hints)); + + node = ai_req->node_len ? (const char*) ai_req + sizeof(AddrInfoRequest) : NULL; + service = ai_req->service_len ? (const char*) ai_req + sizeof(AddrInfoRequest) + ai_req->node_len : NULL; + + ret = getaddrinfo(node, service, + ai_req->hints_valid ? 
&hints : NULL, + &result); + + /* send_addrinfo_reply() frees result */ + return send_addrinfo_reply(out_fd, req->id, ret, result, errno, h_errno); + } + + case REQUEST_NAMEINFO: { + const NameInfoRequest *ni_req = &packet->nameinfo_request; + char hostbuf[NI_MAXHOST], servbuf[NI_MAXSERV]; + union sockaddr_union sa; + int ret; + + assert_return(length >= sizeof(NameInfoRequest), -EBADMSG); + assert_return(length == sizeof(NameInfoRequest) + ni_req->sockaddr_len, -EBADMSG); + assert_return(ni_req->sockaddr_len <= sizeof(sa), -EBADMSG); + + memcpy(&sa, (const uint8_t *) ni_req + sizeof(NameInfoRequest), ni_req->sockaddr_len); + + ret = getnameinfo(&sa.sa, ni_req->sockaddr_len, + ni_req->gethost ? hostbuf : NULL, ni_req->gethost ? sizeof(hostbuf) : 0, + ni_req->getserv ? servbuf : NULL, ni_req->getserv ? sizeof(servbuf) : 0, + ni_req->flags); + + return send_nameinfo_reply(out_fd, req->id, ret, + ret == 0 && ni_req->gethost ? hostbuf : NULL, + ret == 0 && ni_req->getserv ? servbuf : NULL, + errno, h_errno); + } + + case REQUEST_TERMINATE: + /* Quit */ + return -ECONNRESET; + + default: + assert_not_reached(); + } + + return 0; +} + +static void* thread_worker(void *p) { + sd_resolve *resolve = p; + + /* Assign a pretty name to this thread */ + (void) pthread_setname_np(pthread_self(), "sd-resolve"); + + while (!resolve->dead) { + union { + Packet packet; + uint8_t space[BUFSIZE]; + } buf; + ssize_t length; + + length = recv(resolve->fds[REQUEST_RECV_FD], &buf, sizeof buf, 0); + if (length < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + continue; + + break; + } + if (length == 0) + break; + + if (handle_request(resolve->fds[RESPONSE_SEND_FD], &buf.packet, (size_t) length) < 0) + break; + } + + send_died(resolve->fds[RESPONSE_SEND_FD]); + + return NULL; +} + +static int start_threads(sd_resolve *resolve, unsigned extra) { + sigset_t ss, saved_ss; + unsigned n; + int r, k; + + assert_se(sigfillset(&ss) >= 0); + + /* No signals in forked off threads please. 
We set the mask before forking, so that the threads never exist + * with a different mask than a fully blocked one */ + r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss); + if (r > 0) + return -r; + + n = resolve->n_outstanding + extra; + n = CLAMP(n, WORKERS_MIN, WORKERS_MAX); + + while (resolve->n_valid_workers < n) { + r = pthread_create(&resolve->workers[resolve->n_valid_workers], NULL, thread_worker, resolve); + if (r > 0) { + r = -r; + goto finish; + } + + resolve->n_valid_workers++; + } + + r = 0; + +finish: + k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL); + if (k > 0 && r >= 0) + r = -k; + + return r; +} + +static bool resolve_pid_changed(sd_resolve *r) { + assert(r); + + /* We don't support people creating a resolver and keeping it + * around after fork(). Let's complain. */ + + return r->original_pid != getpid_cached(); +} + +int sd_resolve_new(sd_resolve **ret) { + _cleanup_(sd_resolve_unrefp) sd_resolve *resolve = NULL; + int i; + + assert_return(ret, -EINVAL); + + resolve = new0(sd_resolve, 1); + if (!resolve) + return -ENOMEM; + + resolve->n_ref = 1; + resolve->original_pid = getpid_cached(); + + for (i = 0; i < _FD_MAX; i++) + resolve->fds[i] = -EBADF; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, resolve->fds + REQUEST_RECV_FD) < 0) + return -errno; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, resolve->fds + RESPONSE_RECV_FD) < 0) + return -errno; + + for (i = 0; i < _FD_MAX; i++) + resolve->fds[i] = fd_move_above_stdio(resolve->fds[i]); + + (void) fd_inc_sndbuf(resolve->fds[REQUEST_SEND_FD], QUERIES_MAX * BUFSIZE); + (void) fd_increase_rxbuf(resolve->fds[REQUEST_RECV_FD], QUERIES_MAX * BUFSIZE); + (void) fd_inc_sndbuf(resolve->fds[RESPONSE_SEND_FD], QUERIES_MAX * BUFSIZE); + (void) fd_increase_rxbuf(resolve->fds[RESPONSE_RECV_FD], QUERIES_MAX * BUFSIZE); + + (void) fd_nonblock(resolve->fds[RESPONSE_RECV_FD], true); + + *ret = TAKE_PTR(resolve); + return 0; +} + +int sd_resolve_default(sd_resolve **ret) { + static thread_local 
sd_resolve *default_resolve = NULL; + sd_resolve *e = NULL; + int r; + + if (!ret) + return !!default_resolve; + + if (default_resolve) { + *ret = sd_resolve_ref(default_resolve); + return 0; + } + + r = sd_resolve_new(&e); + if (r < 0) + return r; + + e->default_resolve_ptr = &default_resolve; + e->tid = gettid(); + default_resolve = e; + + *ret = e; + return 1; +} + +int sd_resolve_get_tid(sd_resolve *resolve, pid_t *tid) { + assert_return(resolve, -EINVAL); + assert_return(tid, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + if (resolve->tid != 0) { + *tid = resolve->tid; + return 0; + } + + if (resolve->event) + return sd_event_get_tid(resolve->event, tid); + + return -ENXIO; +} + +static sd_resolve *resolve_free(sd_resolve *resolve) { + PROTECT_ERRNO; + sd_resolve_query *q; + unsigned i; + + assert(resolve); + + while ((q = resolve->queries)) { + assert(q->floating); + resolve_query_disconnect(q); + sd_resolve_query_unref(q); + } + + if (resolve->default_resolve_ptr) + *(resolve->default_resolve_ptr) = NULL; + + resolve->dead = true; + + sd_resolve_detach_event(resolve); + + if (resolve->fds[REQUEST_SEND_FD] >= 0) { + + RHeader req = { + .type = REQUEST_TERMINATE, + .length = sizeof req, + }; + + /* Send one termination packet for each worker */ + for (i = 0; i < resolve->n_valid_workers; i++) + (void) send(resolve->fds[REQUEST_SEND_FD], &req, req.length, MSG_NOSIGNAL); + } + + /* Now terminate them and wait until they are gone. + If we get an error then most likely the thread already exited. 
*/ + for (i = 0; i < resolve->n_valid_workers; i++) + (void) pthread_join(resolve->workers[i], NULL); + + /* Close all communication channels */ + close_many(resolve->fds, _FD_MAX); + + return mfree(resolve); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_resolve, sd_resolve, resolve_free); + +int sd_resolve_get_fd(sd_resolve *resolve) { + assert_return(resolve, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + return resolve->fds[RESPONSE_RECV_FD]; +} + +int sd_resolve_get_events(sd_resolve *resolve) { + assert_return(resolve, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + return resolve->n_queries > resolve->n_done ? POLLIN : 0; +} + +int sd_resolve_get_timeout(sd_resolve *resolve, uint64_t *usec) { + assert_return(resolve, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + *usec = UINT64_MAX; + return 0; +} + +static sd_resolve_query *lookup_query(sd_resolve *resolve, unsigned id) { + sd_resolve_query *q; + + assert(resolve); + + q = resolve->query_array[id % QUERIES_MAX]; + if (q) + if (q->id == id) + return q; + + return NULL; +} + +static int complete_query(sd_resolve *resolve, sd_resolve_query *q) { + int r; + + assert(q); + assert(!q->done); + assert(q->resolve == resolve); + + q->done = true; + resolve->n_done++; + + resolve->current = sd_resolve_query_ref(q); + + switch (q->type) { + + case REQUEST_ADDRINFO: + r = getaddrinfo_done(q); + break; + + case REQUEST_NAMEINFO: + r = getnameinfo_done(q); + break; + + default: + assert_not_reached(); + } + + resolve->current = NULL; + + if (q->floating) { + resolve_query_disconnect(q); + sd_resolve_query_unref(q); + } + + sd_resolve_query_unref(q); + + return r; +} + +static int unserialize_addrinfo(const void **p, size_t *length, struct addrinfo **ret_ai) { + AddrInfoSerialization s; + struct addrinfo *ai; + size_t l; + + assert(p); + assert(*p); + assert(ret_ai); + assert(length); + + if (*length < 
sizeof(AddrInfoSerialization)) + return -EBADMSG; + + memcpy(&s, *p, sizeof(s)); + + l = sizeof(AddrInfoSerialization) + s.ai_addrlen + s.canonname_len; + if (*length < l) + return -EBADMSG; + + ai = new(struct addrinfo, 1); + if (!ai) + return -ENOMEM; + + *ai = (struct addrinfo) { + .ai_flags = s.ai_flags, + .ai_family = s.ai_family, + .ai_socktype = s.ai_socktype, + .ai_protocol = s.ai_protocol, + .ai_addrlen = s.ai_addrlen, + }; + + if (s.ai_addrlen > 0) { + ai->ai_addr = memdup((const uint8_t*) *p + sizeof(AddrInfoSerialization), s.ai_addrlen); + if (!ai->ai_addr) { + free(ai); + return -ENOMEM; + } + } + + if (s.canonname_len > 0) { + ai->ai_canonname = memdup((const uint8_t*) *p + sizeof(AddrInfoSerialization) + s.ai_addrlen, s.canonname_len); + if (!ai->ai_canonname) { + free(ai->ai_addr); + free(ai); + return -ENOMEM; + } + } + + *length -= l; + *ret_ai = ai; + *p = ((const uint8_t*) *p) + l; + + return 0; +} + +static int handle_response(sd_resolve *resolve, const Packet *packet, size_t length) { + const RHeader *resp; + sd_resolve_query *q; + int r; + + assert(resolve); + assert(packet); + + resp = &packet->rheader; + assert_return(length >= sizeof(RHeader), -EIO); + assert_return(length == resp->length, -EIO); + + if (resp->type == RESPONSE_DIED) { + resolve->dead = true; + return 0; + } + + assert(resolve->n_outstanding > 0); + resolve->n_outstanding--; + + q = lookup_query(resolve, resp->id); + if (!q) + return 0; + + switch (resp->type) { + + case RESPONSE_ADDRINFO: { + const AddrInfoResponse *ai_resp = &packet->addrinfo_response; + const void *p; + size_t l; + struct addrinfo *prev = NULL; + + assert_return(length >= sizeof(AddrInfoResponse), -EBADMSG); + assert_return(q->type == REQUEST_ADDRINFO, -EBADMSG); + + query_assign_errno(q, ai_resp->ret, ai_resp->_errno, ai_resp->_h_errno); + + l = length - sizeof(AddrInfoResponse); + p = (const uint8_t*) resp + sizeof(AddrInfoResponse); + + while (l > 0 && p) { + struct addrinfo *ai = NULL; + + r = 
unserialize_addrinfo(&p, &l, &ai); + if (r < 0) { + query_assign_errno(q, EAI_SYSTEM, r, 0); + freeaddrinfo(q->addrinfo); + q->addrinfo = NULL; + break; + } + + if (prev) + prev->ai_next = ai; + else + q->addrinfo = ai; + + prev = ai; + } + + return complete_query(resolve, q); + } + + case RESPONSE_NAMEINFO: { + const NameInfoResponse *ni_resp = &packet->nameinfo_response; + + assert_return(length >= sizeof(NameInfoResponse), -EBADMSG); + assert_return(q->type == REQUEST_NAMEINFO, -EBADMSG); + + if (ni_resp->hostlen > DNS_HOSTNAME_MAX || + ni_resp->servlen > DNS_HOSTNAME_MAX || + sizeof(NameInfoResponse) + ni_resp->hostlen + ni_resp->servlen > length) + query_assign_errno(q, EAI_SYSTEM, EIO, 0); + else { + query_assign_errno(q, ni_resp->ret, ni_resp->_errno, ni_resp->_h_errno); + + if (ni_resp->hostlen > 0) { + q->host = strndup((const char*) ni_resp + sizeof(NameInfoResponse), + ni_resp->hostlen-1); + if (!q->host) + query_assign_errno(q, EAI_MEMORY, ENOMEM, 0); + } + + if (ni_resp->servlen > 0) { + q->serv = strndup((const char*) ni_resp + sizeof(NameInfoResponse) + ni_resp->hostlen, + ni_resp->servlen-1); + if (!q->serv) + query_assign_errno(q, EAI_MEMORY, ENOMEM, 0); + } + } + + return complete_query(resolve, q); + } + + default: + return 0; + } +} + +int sd_resolve_process(sd_resolve *resolve) { + RESOLVE_DONT_DESTROY(resolve); + + union { + Packet packet; + uint8_t space[BUFSIZE]; + } buf; + ssize_t l; + int r; + + assert_return(resolve, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + /* We don't allow recursively invoking sd_resolve_process(). 
*/ + assert_return(!resolve->current, -EBUSY); + + l = recv(resolve->fds[RESPONSE_RECV_FD], &buf, sizeof buf, 0); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return -errno; + } + if (l == 0) + return -ECONNREFUSED; + + r = handle_response(resolve, &buf.packet, (size_t) l); + if (r < 0) + return r; + + return 1; +} + +int sd_resolve_wait(sd_resolve *resolve, uint64_t timeout_usec) { + int r; + + assert_return(resolve, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + if (resolve->n_done >= resolve->n_queries) + return 0; + + do { + r = fd_wait_for_event(resolve->fds[RESPONSE_RECV_FD], POLLIN, timeout_usec); + } while (r == -EINTR); + + if (r < 0) + return r; + if (r == 0) + return -ETIMEDOUT; + + return sd_resolve_process(resolve); +} + +static int alloc_query(sd_resolve *resolve, bool floating, sd_resolve_query **_q) { + sd_resolve_query *q; + int r; + + assert(resolve); + assert(_q); + + if (resolve->n_queries >= QUERIES_MAX) + return -ENOBUFS; + + r = start_threads(resolve, 1); + if (r < 0) + return r; + + while (resolve->query_array[resolve->current_id % QUERIES_MAX]) + resolve->current_id++; + + q = resolve->query_array[resolve->current_id % QUERIES_MAX] = new0(sd_resolve_query, 1); + if (!q) + return -ENOMEM; + + q->n_ref = 1; + q->resolve = resolve; + q->floating = floating; + q->id = resolve->current_id++; + + if (!floating) + sd_resolve_ref(resolve); + + LIST_PREPEND(queries, resolve->queries, q); + resolve->n_queries++; + + *_q = q; + return 0; +} + +int resolve_getaddrinfo_with_destroy_callback( + sd_resolve *resolve, + sd_resolve_query **ret_query, + const char *node, const char *service, + const struct addrinfo *hints, + sd_resolve_getaddrinfo_handler_t callback, + sd_resolve_destroy_t destroy_callback, + void *userdata) { + + _cleanup_(sd_resolve_query_unrefp) sd_resolve_query *q = NULL; + size_t node_len, service_len; + AddrInfoRequest req = {}; + struct iovec iov[3]; + struct msghdr mh = {}; + int r; + + 
assert_return(resolve, -EINVAL); + assert_return(node || service, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + r = alloc_query(resolve, !ret_query, &q); + if (r < 0) + return r; + + q->type = REQUEST_ADDRINFO; + q->getaddrinfo_handler = callback; + q->userdata = userdata; + + node_len = node ? strlen(node) + 1 : 0; + service_len = service ? strlen(service) + 1 : 0; + + req = (AddrInfoRequest) { + .node_len = node_len, + .service_len = service_len, + + .header.id = q->id, + .header.type = REQUEST_ADDRINFO, + .header.length = sizeof(AddrInfoRequest) + node_len + service_len, + + .hints_valid = hints, + .ai_flags = hints ? hints->ai_flags : 0, + .ai_family = hints ? hints->ai_family : 0, + .ai_socktype = hints ? hints->ai_socktype : 0, + .ai_protocol = hints ? hints->ai_protocol : 0, + }; + + msan_unpoison(&req, sizeof(req)); + + iov[mh.msg_iovlen++] = IOVEC_MAKE(&req, sizeof(AddrInfoRequest)); + if (node) + iov[mh.msg_iovlen++] = IOVEC_MAKE((void*) node, req.node_len); + if (service) + iov[mh.msg_iovlen++] = IOVEC_MAKE((void*) service, req.service_len); + mh.msg_iov = iov; + + if (sendmsg(resolve->fds[REQUEST_SEND_FD], &mh, MSG_NOSIGNAL) < 0) + return -errno; + + resolve->n_outstanding++; + q->destroy_callback = destroy_callback; + + if (ret_query) + *ret_query = q; + + TAKE_PTR(q); + + return 0; +} + +int sd_resolve_getaddrinfo( + sd_resolve *resolve, + sd_resolve_query **ret_query, + const char *node, const char *service, + const struct addrinfo *hints, + sd_resolve_getaddrinfo_handler_t callback, + void *userdata) { + + return resolve_getaddrinfo_with_destroy_callback(resolve, ret_query, node, service, hints, callback, NULL, userdata); +} + +static int getaddrinfo_done(sd_resolve_query* q) { + assert(q); + assert(q->done); + assert(q->getaddrinfo_handler); + + errno = q->_errno; + h_errno = q->_h_errno; + + return q->getaddrinfo_handler(q, q->ret, q->addrinfo, q->userdata); +} + +int 
resolve_getnameinfo_with_destroy_callback( + sd_resolve *resolve, + sd_resolve_query **ret_query, + const struct sockaddr *sa, socklen_t salen, + int flags, + uint64_t get, + sd_resolve_getnameinfo_handler_t callback, + sd_resolve_destroy_t destroy_callback, + void *userdata) { + + _cleanup_(sd_resolve_query_unrefp) sd_resolve_query *q = NULL; + NameInfoRequest req = {}; + struct iovec iov[2]; + struct msghdr mh; + int r; + + assert_return(resolve, -EINVAL); + assert_return(sa, -EINVAL); + assert_return(salen >= sizeof(struct sockaddr), -EINVAL); + assert_return(salen <= sizeof(union sockaddr_union), -EINVAL); + assert_return((get & ~SD_RESOLVE_GET_BOTH) == 0, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(!resolve_pid_changed(resolve), -ECHILD); + + r = alloc_query(resolve, !ret_query, &q); + if (r < 0) + return r; + + q->type = REQUEST_NAMEINFO; + q->getnameinfo_handler = callback; + q->userdata = userdata; + + req = (NameInfoRequest) { + .header.id = q->id, + .header.type = REQUEST_NAMEINFO, + .header.length = sizeof(NameInfoRequest) + salen, + + .flags = flags, + .sockaddr_len = salen, + .gethost = !!(get & SD_RESOLVE_GET_HOST), + .getserv = !!(get & SD_RESOLVE_GET_SERVICE), + }; + + msan_unpoison(&req, sizeof(req)); + + iov[0] = IOVEC_MAKE(&req, sizeof(NameInfoRequest)); + iov[1] = IOVEC_MAKE((void*) sa, salen); + + mh = (struct msghdr) { + .msg_iov = iov, + .msg_iovlen = ELEMENTSOF(iov) + }; + + if (sendmsg(resolve->fds[REQUEST_SEND_FD], &mh, MSG_NOSIGNAL) < 0) + return -errno; + + resolve->n_outstanding++; + q->destroy_callback = destroy_callback; + + if (ret_query) + *ret_query = q; + + TAKE_PTR(q); + + return 0; +} + +int sd_resolve_getnameinfo( + sd_resolve *resolve, + sd_resolve_query **ret_query, + const struct sockaddr *sa, socklen_t salen, + int flags, + uint64_t get, + sd_resolve_getnameinfo_handler_t callback, + void *userdata) { + + return resolve_getnameinfo_with_destroy_callback(resolve, ret_query, sa, salen, flags, get, callback, 
NULL, userdata); +} + +static int getnameinfo_done(sd_resolve_query *q) { + + assert(q); + assert(q->done); + assert(q->getnameinfo_handler); + + errno = q->_errno; + h_errno = q->_h_errno; + + return q->getnameinfo_handler(q, q->ret, q->host, q->serv, q->userdata); +} + +static void resolve_freeaddrinfo(struct addrinfo *ai) { + while (ai) { + struct addrinfo *next = ai->ai_next; + + free(ai->ai_addr); + free(ai->ai_canonname); + free_and_replace(ai, next); + } +} + +static void resolve_query_disconnect(sd_resolve_query *q) { + sd_resolve *resolve; + unsigned i; + + assert(q); + + if (!q->resolve) + return; + + resolve = q->resolve; + assert(resolve->n_queries > 0); + + if (q->done) { + assert(resolve->n_done > 0); + resolve->n_done--; + } + + i = q->id % QUERIES_MAX; + assert(resolve->query_array[i] == q); + resolve->query_array[i] = NULL; + LIST_REMOVE(queries, resolve->queries, q); + resolve->n_queries--; + + q->resolve = NULL; + if (!q->floating) + sd_resolve_unref(resolve); +} + +static sd_resolve_query *resolve_query_free(sd_resolve_query *q) { + assert(q); + + resolve_query_disconnect(q); + + if (q->destroy_callback) + q->destroy_callback(q->userdata); + + resolve_freeaddrinfo(q->addrinfo); + free(q->host); + free(q->serv); + + return mfree(q); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_resolve_query, sd_resolve_query, resolve_query_free); + +int sd_resolve_query_is_done(sd_resolve_query *q) { + assert_return(q, -EINVAL); + assert_return(!resolve_pid_changed(q->resolve), -ECHILD); + + return q->done; +} + +void* sd_resolve_query_set_userdata(sd_resolve_query *q, void *userdata) { + void *ret; + + assert_return(q, NULL); + assert_return(!resolve_pid_changed(q->resolve), NULL); + + ret = q->userdata; + q->userdata = userdata; + + return ret; +} + +void* sd_resolve_query_get_userdata(sd_resolve_query *q) { + assert_return(q, NULL); + assert_return(!resolve_pid_changed(q->resolve), NULL); + + return q->userdata; +} + +sd_resolve 
*sd_resolve_query_get_resolve(sd_resolve_query *q) { + assert_return(q, NULL); + assert_return(!resolve_pid_changed(q->resolve), NULL); + + return q->resolve; +} + +int sd_resolve_query_get_destroy_callback(sd_resolve_query *q, sd_resolve_destroy_t *destroy_callback) { + assert_return(q, -EINVAL); + + if (destroy_callback) + *destroy_callback = q->destroy_callback; + + return !!q->destroy_callback; +} + +int sd_resolve_query_set_destroy_callback(sd_resolve_query *q, sd_resolve_destroy_t destroy_callback) { + assert_return(q, -EINVAL); + + q->destroy_callback = destroy_callback; + return 0; +} + +int sd_resolve_query_get_floating(sd_resolve_query *q) { + assert_return(q, -EINVAL); + + return q->floating; +} + +int sd_resolve_query_set_floating(sd_resolve_query *q, int b) { + assert_return(q, -EINVAL); + + if (q->floating == !!b) + return 0; + + if (!q->resolve) /* Already disconnected */ + return -ESTALE; + + q->floating = b; + + if (b) { + sd_resolve_query_ref(q); + sd_resolve_unref(q->resolve); + } else { + sd_resolve_ref(q->resolve); + sd_resolve_query_unref(q); + } + + return 1; +} + +static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + sd_resolve *resolve = ASSERT_PTR(userdata); + int r; + + r = sd_resolve_process(resolve); + if (r < 0) + return r; + + return 1; +} + +int sd_resolve_attach_event(sd_resolve *resolve, sd_event *event, int64_t priority) { + int r; + + assert_return(resolve, -EINVAL); + assert_return(!resolve->event, -EBUSY); + + assert(!resolve->event_source); + + if (event) + resolve->event = sd_event_ref(event); + else { + r = sd_event_default(&resolve->event); + if (r < 0) + return r; + } + + r = sd_event_add_io(resolve->event, &resolve->event_source, resolve->fds[RESPONSE_RECV_FD], POLLIN, io_callback, resolve); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(resolve->event_source, priority); + if (r < 0) + goto fail; + + return 0; + +fail: + sd_resolve_detach_event(resolve); + return r; +} + 
+ int sd_resolve_detach_event(sd_resolve *resolve) { + assert_return(resolve, -EINVAL); + + if (!resolve->event) + return 0; + + resolve->event_source = sd_event_source_disable_unref(resolve->event_source); + resolve->event = sd_event_unref(resolve->event); + return 1; +} + +sd_event *sd_resolve_get_event(sd_resolve *resolve) { + assert_return(resolve, NULL); + + return resolve->event; +} diff --git a/src/libsystemd/sd-resolve/test-resolve.c b/src/libsystemd/sd-resolve/test-resolve.c new file mode 100644 index 0000000..829e13e --- /dev/null +++ b/src/libsystemd/sd-resolve/test-resolve.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "sd-resolve.h" + +#include "alloc-util.h" +#include "macro.h" +#include "socket-util.h" +#include "string-util.h" +#include "tests.h" +#include "time-util.h" + +#define TEST_TIMEOUT_USEC (20*USEC_PER_SEC) + +static int getaddrinfo_handler(sd_resolve_query *q, int ret, const struct addrinfo *ai, void *userdata) { + const struct addrinfo *i; + + assert_se(q); + + if (ret != 0) { + log_error("getaddrinfo error: %s %i", gai_strerror(ret), ret); + return 0; + } + + for (i = ai; i; i = i->ai_next) { + _cleanup_free_ char *addr = NULL; + + assert_se(sockaddr_pretty(i->ai_addr, i->ai_addrlen, false, true, &addr) == 0); + puts(addr); + } + + printf("canonical name: %s\n", strna(ai->ai_canonname)); + + return 0; +} + +static int getnameinfo_handler(sd_resolve_query *q, int ret, const char *host, const char *serv, void *userdata) { + assert_se(q); + + if (ret != 0) { + log_error("getnameinfo error: %s %i", gai_strerror(ret), ret); + return 0; + } + + printf("Host: %s — Serv: %s\n", strna(host), strna(serv)); + return 0; +} + +int main(int argc, char *argv[]) { + _cleanup_(sd_resolve_query_unrefp) sd_resolve_query *q1 = NULL, *q2 = NULL; + _cleanup_(sd_resolve_unrefp) sd_resolve *resolve = NULL; + int r; + + struct addrinfo hints = { + .ai_family = 
AF_UNSPEC, + .ai_socktype = SOCK_STREAM, + .ai_flags = AI_CANONNAME, + }; + + union sockaddr_union sa = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(80), + }; + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_resolve_default(&resolve) >= 0); + + /* Test a floating resolver query */ + r = sd_resolve_getaddrinfo(resolve, NULL, "redhat.com", "http", NULL, getaddrinfo_handler, NULL); + if (r < 0) + log_error_errno(r, "sd_resolve_getaddrinfo(): %m"); + + /* Make a name -> address query */ + r = sd_resolve_getaddrinfo(resolve, &q1, argc >= 2 ? argv[1] : "www.heise.de", NULL, &hints, getaddrinfo_handler, NULL); + if (r < 0) + log_error_errno(r, "sd_resolve_getaddrinfo(): %m"); + + /* Make an address -> name query */ + sa.in.sin_addr.s_addr = inet_addr(argc >= 3 ? argv[2] : "193.99.144.71"); + r = sd_resolve_getnameinfo(resolve, &q2, &sa.sa, SOCKADDR_LEN(sa), 0, SD_RESOLVE_GET_BOTH, getnameinfo_handler, NULL); + if (r < 0) + log_error_errno(r, "sd_resolve_getnameinfo(): %m"); + + /* Wait until all queries are completed */ + for (;;) { + r = sd_resolve_wait(resolve, TEST_TIMEOUT_USEC); + if (r == 0) + break; + if (r == -ETIMEDOUT) { + /* Let's catch timeouts here, so that we can run safely in a CI that has no reliable DNS. Note + * that we invoke exit() directly here, as the stuck NSS call will not allow us to exit + * cleanly. 
*/ + + log_notice_errno(r, "sd_resolve_wait() timed out, but that's OK"); + exit(EXIT_SUCCESS); + } + if (r < 0) { + log_error_errno(r, "sd_resolve_wait(): %m"); + assert_not_reached(); + } + } + + return 0; +} diff --git a/src/libudev/libudev-device-internal.h b/src/libudev/libudev-device-internal.h new file mode 100644 index 0000000..437d431 --- /dev/null +++ b/src/libudev/libudev-device-internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "libudev.h" +#include "sd-device.h" + +struct udev_device; + +struct udev_device *udev_device_new(struct udev *udev, sd_device *device); +sd_device *udev_device_get_sd_device(struct udev_device *udev_device); diff --git a/src/libudev/libudev-device.c b/src/libudev/libudev-device.c new file mode 100644 index 0000000..7b9f54c --- /dev/null +++ b/src/libudev/libudev-device.c @@ -0,0 +1,895 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libudev.h" +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-private.h" +#include "device-util.h" +#include "libudev-device-internal.h" +#include "libudev-list-internal.h" +#include "parse-util.h" +#include "time-util.h" + +/** + * SECTION:libudev-device + * @short_description: kernel sys devices + * + * Representation of kernel sys devices. Devices are uniquely identified + * by their syspath, every device has exactly one path in the kernel sys + * filesystem. Devices usually belong to a kernel subsystem, and have + * a unique name inside that subsystem. + */ + +/** + * udev_device: + * + * Opaque object representing one kernel sys device. 
+ */ +struct udev_device { + struct udev *udev; + + /* real device object */ + sd_device *device; + + /* legacy */ + unsigned n_ref; + + struct udev_device *parent; + bool parent_set; + + struct udev_list *properties; + uint64_t properties_generation; + struct udev_list *all_tags, *current_tags; + uint64_t all_tags_generation, current_tags_generation; + struct udev_list *devlinks; + uint64_t devlinks_generation; + bool properties_read:1; + bool all_tags_read:1; + bool current_tags_read:1; + bool devlinks_read:1; + struct udev_list *sysattrs; + bool sysattrs_read; +}; + +/** + * udev_device_get_seqnum: + * @udev_device: udev device + * + * This is only valid if the device was received through a monitor. Devices read from + * sys do not have a sequence number. + * + * Returns: the kernel event sequence number, or 0 if there is no sequence number available. + **/ +_public_ unsigned long long udev_device_get_seqnum(struct udev_device *udev_device) { + uint64_t seqnum; + + assert_return_errno(udev_device, 0, EINVAL); + + if (sd_device_get_seqnum(udev_device->device, &seqnum) < 0) + return 0; + + return seqnum; +} + +/** + * udev_device_get_devnum: + * @udev_device: udev device + * + * Get the device major/minor number. + * + * Returns: the dev_t number. + **/ +_public_ dev_t udev_device_get_devnum(struct udev_device *udev_device) { + dev_t devnum; + int r; + + assert_return_errno(udev_device, makedev(0, 0), EINVAL); + + r = sd_device_get_devnum(udev_device->device, &devnum); + if (r == -ENOENT) + return makedev(0, 0); + if (r < 0) + return_with_errno(makedev(0, 0), r); + + return devnum; +} + +/** + * udev_device_get_driver: + * @udev_device: udev device + * + * Get the kernel driver name. + * + * Returns: the driver name string, or #NULL if there is no driver attached. 
+ **/ +_public_ const char *udev_device_get_driver(struct udev_device *udev_device) { + const char *driver; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_driver(udev_device->device, &driver); + if (r < 0) + return_with_errno(NULL, r); + + return driver; +} + +/** + * udev_device_get_devtype: + * @udev_device: udev device + * + * Retrieve the devtype string of the udev device. + * + * Returns: the devtype name of the udev device, or #NULL if it cannot be determined + **/ +_public_ const char *udev_device_get_devtype(struct udev_device *udev_device) { + const char *devtype; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_devtype(udev_device->device, &devtype); + if (r == -ENOENT) + return NULL; + if (r < 0) + return_with_errno(NULL, r); + + return devtype; +} + +/** + * udev_device_get_subsystem: + * @udev_device: udev device + * + * Retrieve the subsystem string of the udev device. The string does not + * contain any "/". + * + * Returns: the subsystem name of the udev device, or #NULL if it cannot be determined + **/ +_public_ const char *udev_device_get_subsystem(struct udev_device *udev_device) { + const char *subsystem; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_subsystem(udev_device->device, &subsystem); + if (r < 0) + return_with_errno(NULL, r); + + return subsystem; +} + +/** + * udev_device_get_property_value: + * @udev_device: udev device + * @key: property name + * + * Get the value of a given property. + * + * Returns: the property string, or #NULL if there is no such property. 
+ **/ +_public_ const char *udev_device_get_property_value(struct udev_device *udev_device, const char *key) { + const char *value; + int r; + + assert_return_errno(udev_device && key, NULL, EINVAL); + + r = sd_device_get_property_value(udev_device->device, key, &value); + if (r < 0) + return_with_errno(NULL, r); + + return value; +} + +struct udev_device *udev_device_new(struct udev *udev, sd_device *device) { + _cleanup_(udev_list_freep) struct udev_list *properties = NULL, *all_tags = NULL, *current_tags = NULL, *sysattrs = NULL, *devlinks = NULL; + struct udev_device *udev_device; + + assert(device); + + properties = udev_list_new(true); + if (!properties) + return_with_errno(NULL, ENOMEM); + all_tags = udev_list_new(true); + if (!all_tags) + return_with_errno(NULL, ENOMEM); + current_tags = udev_list_new(true); + if (!current_tags) + return_with_errno(NULL, ENOMEM); + sysattrs = udev_list_new(true); + if (!sysattrs) + return_with_errno(NULL, ENOMEM); + devlinks = udev_list_new(true); + if (!devlinks) + return_with_errno(NULL, ENOMEM); + + udev_device = new(struct udev_device, 1); + if (!udev_device) + return_with_errno(NULL, ENOMEM); + + *udev_device = (struct udev_device) { + .n_ref = 1, + .udev = udev, + .device = sd_device_ref(device), + .properties = TAKE_PTR(properties), + .all_tags = TAKE_PTR(all_tags), + .current_tags = TAKE_PTR(current_tags), + .sysattrs = TAKE_PTR(sysattrs), + .devlinks = TAKE_PTR(devlinks), + }; + + return udev_device; +} + +/** + * udev_device_new_from_syspath: + * @udev: udev library context + * @syspath: sys device path including sys directory + * + * Create new udev device, and fill in information from the sys + * device and the udev database entry. The syspath is the absolute + * path to the device, including the sys mount point. + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev device. 
+ * + * Returns: a new udev device, or #NULL, if it does not exist + **/ +_public_ struct udev_device *udev_device_new_from_syspath(struct udev *udev, const char *syspath) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; /* local reference, handled by the cleanup attribute; udev_device_new() takes its own via sd_device_ref() */ + int r; + + r = sd_device_new_from_syspath(&device, syspath); /* syspath is passed through unchecked; validation is left to sd-device */ + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(udev, device); +} + +/** + * udev_device_new_from_devnum: + * @udev: udev library context + * @type: char or block device + * @devnum: device major/minor number + * + * Create new udev device, and fill in information from the sys + * device and the udev database entry. The device is looked-up + * by its major/minor number and type. Character and block device + * numbers are not unique across the two types. + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev device. + * + * Returns: a new udev device, or #NULL, if it does not exist + **/ +_public_ struct udev_device *udev_device_new_from_devnum(struct udev *udev, char type, dev_t devnum) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; /* same ownership scheme as udev_device_new_from_syspath() */ + int r; + + r = sd_device_new_from_devnum(&device, type, devnum); + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(udev, device); +} + +/** + * udev_device_new_from_device_id: + * @udev: udev library context + * @id: text string identifying a kernel device + * + * Create new udev device, and fill in information from the sys + * device and the udev database entry. The device is looked-up + * by a special string: + * b8:2 - block device major:minor + * c128:1 - char device major:minor + * n3 - network device ifindex + * +sound:card29 - kernel driver core subsystem:device name + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev device.
+ * + * Returns: a new udev device, or #NULL, if it does not exist + **/ +_public_ struct udev_device *udev_device_new_from_device_id(struct udev *udev, const char *id) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; /* local reference, handled by the cleanup attribute; udev_device_new() takes its own */ + int r; + + r = sd_device_new_from_device_id(&device, id); /* parsing of the id string happens inside sd-device, not here */ + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(udev, device); +} + +/** + * udev_device_new_from_subsystem_sysname: + * @udev: udev library context + * @subsystem: the subsystem of the device + * @sysname: the name of the device + * + * Create new udev device, and fill in information from the sys device + * and the udev database entry. The device is looked up by the subsystem + * and name string of the device, like "mem" / "zero", or "block" / "sda". + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev device. + * + * Returns: a new udev device, or #NULL, if it does not exist + **/ +_public_ struct udev_device *udev_device_new_from_subsystem_sysname(struct udev *udev, const char *subsystem, const char *sysname) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int r; + + r = sd_device_new_from_subsystem_sysname(&device, subsystem, sysname); + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(udev, device); +} + +/** + * udev_device_new_from_environment: + * @udev: udev library context + * + * Create new udev device, and fill in information from the + * current process environment. This only works reliably if + * the process is called from a udev rule. It is usually used + * for tools executed from IMPORT= rules. + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev device.
+ * + * Returns: a new udev device, or #NULL, if it does not exist + **/ +_public_ struct udev_device *udev_device_new_from_environment(struct udev *udev) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int r; + + r = device_new_from_strv(&device, environ); /* the device is built from the process-wide environ array */ + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(udev, device); +} + +static struct udev_device *device_new_from_parent(struct udev_device *child) { /* helper for udev_device_get_parent(): wraps the child's sd_device parent in a new udev_device */ + sd_device *parent; /* no cleanup attribute: nothing is unref'ed here, udev_device_new() takes its own reference */ + int r; + + assert_return_errno(child, NULL, EINVAL); + + r = sd_device_get_parent(child->device, &parent); + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(child->udev, parent); /* the parent wrapper shares the child's udev context pointer */ +} + +/** + * udev_device_get_parent: + * @udev_device: the device to start searching from + * + * Find the next parent device, and fill in information from the sys + * device and the udev database entry. + * + * Returned device is not referenced. It is attached to the child + * device, and will be cleaned up when the child device is cleaned up. + * + * It is not necessarily just the upper level directory, empty or not + * recognized sys directories are ignored. + * + * It can be called as many times as needed, without caring about + * references. + * + * Returns: a new udev device, or #NULL, if no parent exists.
+ **/ +_public_ struct udev_device *udev_device_get_parent(struct udev_device *udev_device) { + assert_return_errno(udev_device, NULL, EINVAL); + + if (!udev_device->parent_set) { /* the lookup runs at most once per device */ + udev_device->parent_set = true; /* set before the lookup, so a failed lookup (NULL parent) is cached as well */ + udev_device->parent = device_new_from_parent(udev_device); + } + + /* TODO: errno will differ here in case parent == NULL */ + return udev_device->parent; /* pointer owned by udev_device; released in udev_device_free() */ +} + +/** + * udev_device_get_parent_with_subsystem_devtype: + * @udev_device: udev device to start searching from + * @subsystem: the subsystem of the device + * @devtype: the type (DEVTYPE) of the device + * + * Find the next parent device, with a matching subsystem and devtype + * value, and fill in information from the sys device and the udev + * database entry. + * + * If devtype is #NULL, only subsystem is checked, and any devtype will + * match. + * + * Returned device is not referenced. It is attached to the child + * device, and will be cleaned up when the child device is cleaned up. + * + * It can be called as many times as needed, without caring about + * references. + * + * Returns: a new udev device, or #NULL if no matching parent exists.
+ **/ +_public_ struct udev_device *udev_device_get_parent_with_subsystem_devtype(struct udev_device *udev_device, const char *subsystem, const char *devtype) { + sd_device *parent; /* used only for pointer comparison below; no reference is taken or dropped here */ + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + /* this relies on the fact that finding the subdevice of a parent or the + parent of a subdevice commute */ + + /* first find the correct sd_device */ + r = sd_device_get_parent_with_subsystem_devtype(udev_device->device, subsystem, devtype, &parent); + if (r < 0) + return_with_errno(NULL, r); + + /* then walk the chain of udev_device parents until the corresponding + one is found */ + while ((udev_device = udev_device_get_parent(udev_device))) /* each step creates and caches the parent wrapper on demand */ + if (udev_device->device == parent) /* match by sd_device pointer identity */ + return udev_device; + + return_with_errno(NULL, ENOENT); /* chain exhausted without meeting the sd_device found above */ +} + +/** + * udev_device_get_udev: + * @udev_device: udev device + * + * Retrieve the udev library context the device was created with. + * + * Returns: the udev library context + **/ +_public_ struct udev *udev_device_get_udev(struct udev_device *udev_device) { + assert_return_errno(udev_device, NULL, EINVAL); + + return udev_device->udev; +} + +static struct udev_device *udev_device_free(struct udev_device *udev_device) { /* destructor handed to DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC */ + assert(udev_device); + + sd_device_unref(udev_device->device); /* pairs with sd_device_ref() in udev_device_new() */ + udev_device_unref(udev_device->parent); /* drops the reference on the parent wrapper cached by udev_device_get_parent(); parent may be NULL here */ + + udev_list_free(udev_device->properties); + udev_list_free(udev_device->sysattrs); + udev_list_free(udev_device->all_tags); + udev_list_free(udev_device->current_tags); + udev_list_free(udev_device->devlinks); + + return mfree(udev_device); +} + +/** + * udev_device_ref: + * @udev_device: udev device + * + * Take a reference of a udev device. + * + * Returns: the passed udev device + **/ + +/** + * udev_device_unref: + * @udev_device: udev device + * + * Drop a reference of a udev device. If the refcount reaches zero, + * the resources of the device will be released.
+ * + * Returns: #NULL + **/ +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(struct udev_device, udev_device, udev_device_free); /* macro defined elsewhere; presumably expands to the udev_device_ref()/udev_device_unref() documented above, with udev_device_free() as destructor — confirm */ + +/** + * udev_device_get_devpath: + * @udev_device: udev device + * + * Retrieve the kernel devpath value of the udev device. The path + * does not contain the sys mount point, and starts with a '/'. + * + * Returns: the devpath of the udev device, or #NULL on failure + **/ +_public_ const char *udev_device_get_devpath(struct udev_device *udev_device) { + const char *devpath; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_devpath(udev_device->device, &devpath); + if (r < 0) + return_with_errno(NULL, r); + + return devpath; /* the string comes from the wrapped sd_device; nothing is copied here */ +} + +/** + * udev_device_get_syspath: + * @udev_device: udev device + * + * Retrieve the sys path of the udev device. The path is an + * absolute path and starts with the sys mount point. + * + * Returns: the sys path of the udev device, or #NULL on failure + **/ +_public_ const char *udev_device_get_syspath(struct udev_device *udev_device) { + const char *syspath; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_syspath(udev_device->device, &syspath); + if (r < 0) + return_with_errno(NULL, r); + + return syspath; +} + +/** + * udev_device_get_sysname: + * @udev_device: udev device + * + * Get the kernel device name in /sys. + * + * Returns: the name string of the device, or #NULL on failure + **/ +_public_ const char *udev_device_get_sysname(struct udev_device *udev_device) { + const char *sysname; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_sysname(udev_device->device, &sysname); + if (r < 0) + return_with_errno(NULL, r); + + return sysname; +} + +/** + * udev_device_get_sysnum: + * @udev_device: udev device + * + * Get the instance number of the device.
+ * + * Returns: the trailing number string of the device name + **/ +_public_ const char *udev_device_get_sysnum(struct udev_device *udev_device) { + const char *sysnum; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_sysnum(udev_device->device, &sysnum); + if (r == -ENOENT) /* returned as plain NULL, bypassing return_with_errno() */ + return NULL; + if (r < 0) + return_with_errno(NULL, r); + + return sysnum; +} + +/** + * udev_device_get_devnode: + * @udev_device: udev device + * + * Retrieve the device node file name belonging to the udev device. + * The path is an absolute path, and starts with the device directory. + * + * Returns: the device node file name of the udev device, or #NULL if no device node exists + **/ +_public_ const char *udev_device_get_devnode(struct udev_device *udev_device) { + const char *devnode; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_devname(udev_device->device, &devnode); /* the libudev "devnode" is sd-device's "devname" */ + if (r < 0) /* NOTE(review): unlike udev_device_get_sysnum(), -ENOENT is not special-cased and goes through return_with_errno() */ + return_with_errno(NULL, r); + + return devnode; +} + +/** + * udev_device_get_devlinks_list_entry: + * @udev_device: udev device + * + * Retrieve the list of device links pointing to the device file of + * the udev device. The next list entry can be retrieved with + * udev_list_entry_get_next(), which returns #NULL if no more entries exist. + * The devlink path can be retrieved from the list entry by + * udev_list_entry_get_name(). The path is an absolute path, and starts with + * the device directory.
+ * + * Returns: the first entry of the device node link list + **/ +_public_ struct udev_list_entry *udev_device_get_devlinks_list_entry(struct udev_device *udev_device) { + assert_return_errno(udev_device, NULL, EINVAL); + + if (device_get_devlinks_generation(udev_device->device) != udev_device->devlinks_generation || + !udev_device->devlinks_read) { /* rebuild the cached list when the generation counter moved or the list was never read */ + udev_list_cleanup(udev_device->devlinks); + + FOREACH_DEVICE_DEVLINK(udev_device->device, devlink) + if (!udev_list_entry_add(udev_device->devlinks, devlink, NULL)) /* devlinks are stored as names with no value */ + return_with_errno(NULL, ENOMEM); /* devlinks_read is not set on this path, so the next call starts over */ + + udev_device->devlinks_read = true; + udev_device->devlinks_generation = device_get_devlinks_generation(udev_device->device); + } + + return udev_list_get_entry(udev_device->devlinks); +} + +/** + * udev_device_get_properties_list_entry: + * @udev_device: udev device + * + * Retrieve the list of key/value device properties of the udev + * device. The next list entry can be retrieved with udev_list_entry_get_next(), + * which returns #NULL if no more entries exist. The property name + * can be retrieved from the list entry by udev_list_entry_get_name(), + * the property value by udev_list_entry_get_value().
+ * + * Returns: the first entry of the property list + **/ +_public_ struct udev_list_entry *udev_device_get_properties_list_entry(struct udev_device *udev_device) { + assert_return_errno(udev_device, NULL, EINVAL); + + if (device_get_properties_generation(udev_device->device) != udev_device->properties_generation || + !udev_device->properties_read) { /* rebuild the cached list when the generation counter moved or the list was never read */ + udev_list_cleanup(udev_device->properties); + + FOREACH_DEVICE_PROPERTY(udev_device->device, key, value) + if (!udev_list_entry_add(udev_device->properties, key, value)) + return_with_errno(NULL, ENOMEM); /* properties_read is not set on this path, so the next call starts over */ + + udev_device->properties_read = true; + udev_device->properties_generation = device_get_properties_generation(udev_device->device); + } + + return udev_list_get_entry(udev_device->properties); +} + +/** + * udev_device_get_action: + * @udev_device: udev device + * + * This is only valid if the device was received through a monitor. Devices read from + * sys do not have an action string. Usual actions are: add, remove, change, move, + * online, offline. + * + * Returns: the kernel action value, or #NULL if there is no action value available. + **/ +_public_ const char *udev_device_get_action(struct udev_device *udev_device) { + sd_device_action_t action; + + assert_return_errno(udev_device, NULL, EINVAL); + + if (sd_device_get_action(udev_device->device, &action) < 0) /* any failure is treated as "no action" and returns plain NULL, bypassing return_with_errno() */ + return NULL; + + return device_action_to_string(action); +} + +/** + * udev_device_get_usec_since_initialized: + * @udev_device: udev device + * + * Return the number of microseconds passed since udev set up the + * device for the first time. + * + * This is only implemented for devices which need to store properties + * in the udev database. All other devices return 0 here. + * + * Returns: the number of microseconds since the device was first seen.
+ **/ +_public_ unsigned long long int udev_device_get_usec_since_initialized(struct udev_device *udev_device) { + usec_t ts; + int r; + + assert_return(udev_device, -EINVAL); + + r = sd_device_get_usec_since_initialized(udev_device->device, &ts); + if (r < 0) + return_with_errno(0, r); + + return ts; +} + +/** + * udev_device_get_sysattr_value: + * @udev_device: udev device + * @sysattr: attribute name + * + * The retrieved value is cached in the device. Repeated calls will return the same + * value and not open the attribute again. + * + * Returns: the content of a sys attribute file, or #NULL if there is no sys attribute value. + **/ +_public_ const char *udev_device_get_sysattr_value(struct udev_device *udev_device, const char *sysattr) { + const char *value; + int r; + + assert_return_errno(udev_device, NULL, EINVAL); + + r = sd_device_get_sysattr_value(udev_device->device, sysattr, &value); + if (r < 0) + return_with_errno(NULL, r); + + return value; +} + +/** + * udev_device_set_sysattr_value: + * @udev_device: udev device + * @sysattr: attribute name + * @value: new value to be set + * + * Update the contents of the sys attribute and the cached value of the device. + * + * Returns: Negative error code on failure or 0 on success. + **/ +_public_ int udev_device_set_sysattr_value(struct udev_device *udev_device, const char *sysattr, const char *value) { + int r; + + assert_return(udev_device, -EINVAL); + + r = sd_device_set_sysattr_value(udev_device->device, sysattr, value); + if (r < 0) + return r; + + return 0; +} + +/** + * udev_device_get_sysattr_list_entry: + * @udev_device: udev device + * + * Retrieve the list of available sysattrs, with value being empty; + * This just return all available sysfs attributes for a particular + * device without reading their values. 
+ * + * Returns: the first entry of the sysattr list + **/ +_public_ struct udev_list_entry *udev_device_get_sysattr_list_entry(struct udev_device *udev_device) { + assert_return_errno(udev_device, NULL, EINVAL); + + if (!udev_device->sysattrs_read) { /* no generation counter here: the list is built once and then reused */ + udev_list_cleanup(udev_device->sysattrs); + + FOREACH_DEVICE_SYSATTR(udev_device->device, sysattr) + if (!udev_list_entry_add(udev_device->sysattrs, sysattr, NULL)) /* names only; the value slot stays NULL */ + return_with_errno(NULL, ENOMEM); /* sysattrs_read is not set on this path, so the next call starts over */ + + udev_device->sysattrs_read = true; + } + + return udev_list_get_entry(udev_device->sysattrs); +} + +/** + * udev_device_get_is_initialized: + * @udev_device: udev device + * + * Check if udev has already handled the device and has set up + * device node permissions and context, or has renamed a network + * device. + * + * This is only implemented for devices with a device node + * or network interfaces. All other devices return 1 here. + * + * Returns: 1 if the device is set up. 0 otherwise. NOTE(review): a #NULL @udev_device is answered with -EINVAL through assert_return(), not with 0. + **/ +_public_ int udev_device_get_is_initialized(struct udev_device *udev_device) { + int r; + + assert_return(udev_device, -EINVAL); + + r = sd_device_get_is_initialized(udev_device->device); + if (r < 0) + return_with_errno(0, r); /* lookup failures are reported as 0 ("not set up") through return_with_errno() */ + + return r; /* non-negative sd-device result is passed through unchanged */ +} + +/** + * udev_device_get_tags_list_entry: + * @udev_device: udev device + * + * Retrieve the list of tags attached to the udev device. The next + * list entry can be retrieved with udev_list_entry_get_next(), + * which returns #NULL if no more entries exist. The tag string + * can be retrieved from the list entry by udev_list_entry_get_name().
+ * + * Returns: the first entry of the tag list + **/ +_public_ struct udev_list_entry *udev_device_get_tags_list_entry(struct udev_device *udev_device) { + assert_return_errno(udev_device, NULL, EINVAL); + + if (device_get_tags_generation(udev_device->device) != udev_device->all_tags_generation || + !udev_device->all_tags_read) { /* rebuild the cached list when the tags generation moved or the list was never read */ + udev_list_cleanup(udev_device->all_tags); + + FOREACH_DEVICE_TAG(udev_device->device, tag) + if (!udev_list_entry_add(udev_device->all_tags, tag, NULL)) + return_with_errno(NULL, ENOMEM); + + udev_device->all_tags_read = true; + udev_device->all_tags_generation = device_get_tags_generation(udev_device->device); + } + + return udev_list_get_entry(udev_device->all_tags); +} + +_public_ struct udev_list_entry *udev_device_get_current_tags_list_entry(struct udev_device *udev_device) { /* same scheme as udev_device_get_tags_list_entry(), but iterating FOREACH_DEVICE_CURRENT_TAG into the separate current_tags cache */ + assert_return_errno(udev_device, NULL, EINVAL); + + if (device_get_tags_generation(udev_device->device) != udev_device->current_tags_generation || + !udev_device->current_tags_read) { /* uses the same device_get_tags_generation() counter, tracked in its own current_tags_generation field */ + udev_list_cleanup(udev_device->current_tags); + + FOREACH_DEVICE_CURRENT_TAG(udev_device->device, tag) + if (!udev_list_entry_add(udev_device->current_tags, tag, NULL)) + return_with_errno(NULL, ENOMEM); + + udev_device->current_tags_read = true; + udev_device->current_tags_generation = device_get_tags_generation(udev_device->device); + } + + return udev_list_get_entry(udev_device->current_tags); +} + +/** + * udev_device_has_tag: + * @udev_device: udev device + * @tag: tag name + * + * Check if a given device has a certain tag associated. + * + * Returns: 1 if the tag is found. 0 otherwise.
+ **/ +_public_ int udev_device_has_tag(struct udev_device *udev_device, const char *tag) { + assert_return(udev_device, 0); /* a NULL device is answered with 0, i.e. "tag not found" */ + + return sd_device_has_tag(udev_device->device, tag) > 0; /* "> 0" folds negative error codes into 0 as well */ +} + +_public_ int udev_device_has_current_tag(struct udev_device *udev_device, const char *tag) { /* like udev_device_has_tag(), but asks sd_device_has_current_tag() */ + assert_return(udev_device, 0); + + return sd_device_has_current_tag(udev_device->device, tag) > 0; +} + +sd_device *udev_device_get_sd_device(struct udev_device *udev_device) { /* not _public_: accessor for the wrapped sd_device; no reference is taken */ + assert(udev_device); + + return udev_device->device; +} diff --git a/src/libudev/libudev-enumerate.c b/src/libudev/libudev-enumerate.c new file mode 100644 index 0000000..d71a31c --- /dev/null +++ b/src/libudev/libudev-enumerate.c @@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libudev.h" +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-enumerator-private.h" +#include "device-util.h" +#include "libudev-device-internal.h" +#include "libudev-list-internal.h" + +/** + * SECTION:libudev-enumerate + * @short_description: lookup and sort sys devices + * + * Lookup devices in the sys filesystem, filter devices by properties, + * and return a sorted list of devices. + */ + +/** + * udev_enumerate: + * + * Opaque object representing one device lookup/sort context. + */ +struct udev_enumerate { + struct udev *udev; /* context pointer as passed to udev_enumerate_new() */ + unsigned n_ref; /* reference count, starts at 1 */ + struct udev_list *devices_list; /* cached syspath list handed out by udev_enumerate_get_list_entry() */ + bool devices_uptodate:1; /* cleared by every add_match/add_syspath call; set once devices_list has been rebuilt */ + + sd_device_enumerator *enumerator; /* the sd-device enumerator doing the actual work */ +}; + +/** + * udev_enumerate_new: + * @udev: udev library context + * + * Create an enumeration context to scan /sys. + * + * Returns: an enumeration context.
+ **/ +_public_ struct udev_enumerate *udev_enumerate_new(struct udev *udev) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; /* cleanup handlers cover the early-return paths below */ + _cleanup_(udev_list_freep) struct udev_list *list = NULL; + struct udev_enumerate *udev_enumerate; + int r; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return_with_errno(NULL, r); + + r = sd_device_enumerator_allow_uninitialized(e); /* always enabled here; udev_enumerate_add_match_is_initialized() is the opt-in filter */ + if (r < 0) + return_with_errno(NULL, r); + + list = udev_list_new(false); /* note: false here, whereas the udev_device lists are created with true */ + if (!list) + return_with_errno(NULL, ENOMEM); + + udev_enumerate = new(struct udev_enumerate, 1); + if (!udev_enumerate) + return_with_errno(NULL, ENOMEM); + + *udev_enumerate = (struct udev_enumerate) { /* devices_uptodate is not named, so it starts out false */ + .udev = udev, /* stored as passed in; no reference on the context is taken here */ + .n_ref = 1, + .enumerator = TAKE_PTR(e), + .devices_list = TAKE_PTR(list), + }; + + return udev_enumerate; +} + +static struct udev_enumerate *udev_enumerate_free(struct udev_enumerate *udev_enumerate) { /* destructor handed to DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC */ + assert(udev_enumerate); + + udev_list_free(udev_enumerate->devices_list); + sd_device_enumerator_unref(udev_enumerate->enumerator); + return mfree(udev_enumerate); +} + +/** + * udev_enumerate_ref: + * @udev_enumerate: context + * + * Take a reference of an enumeration context. + * + * Returns: the passed enumeration context + **/ + +/** + * udev_enumerate_unref: + * @udev_enumerate: context + * + * Drop a reference of an enumeration context. If the refcount reaches zero, + * all resources of the enumeration context will be released. + * + * Returns: #NULL + **/ +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(struct udev_enumerate, udev_enumerate, udev_enumerate_free); /* macro defined elsewhere; presumably expands to the ref/unref pair documented above — confirm */ + +/** + * udev_enumerate_get_udev: + * @udev_enumerate: context + * + * Get the udev library context. + * + * Returns: a pointer to the context.
+ */ +_public_ struct udev *udev_enumerate_get_udev(struct udev_enumerate *udev_enumerate) { + assert_return_errno(udev_enumerate, NULL, EINVAL); + + return udev_enumerate->udev; +} + +/** + * udev_enumerate_get_list_entry: + * @udev_enumerate: context + * + * Get the first entry of the sorted list of device paths. + * + * Returns: a udev_list_entry, or #NULL if the list is empty (reported with ENODATA) or could not be built. + */ +_public_ struct udev_list_entry *udev_enumerate_get_list_entry(struct udev_enumerate *udev_enumerate) { + struct udev_list_entry *e; + + assert_return_errno(udev_enumerate, NULL, EINVAL); + + if (!udev_enumerate->devices_uptodate) { /* rebuild the syspath list only after a filter or device was added */ + sd_device *device; + + udev_list_cleanup(udev_enumerate->devices_list); + + FOREACH_DEVICE_AND_SUBSYSTEM(udev_enumerate->enumerator, device) { + const char *syspath; + int r; + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return_with_errno(NULL, r); /* devices_uptodate stays false, so the next call starts over */ + + if (!udev_list_entry_add(udev_enumerate->devices_list, syspath, NULL)) /* only the syspath is stored; the value slot stays NULL */ + return_with_errno(NULL, ENOMEM); + } + + udev_enumerate->devices_uptodate = true; + } + + e = udev_list_get_entry(udev_enumerate->devices_list); + if (!e) /* an empty result is reported as an error code rather than as a silent NULL */ + return_with_errno(NULL, ENODATA); + + return e; +} + +/** + * udev_enumerate_add_match_subsystem: + * @udev_enumerate: context + * @subsystem: filter for a subsystem of the device to include in the list + * + * Match only devices belonging to a certain kernel subsystem. + * + * Returns: 0 on success, otherwise a negative error value.
+ */ +_public_ int udev_enumerate_add_match_subsystem(struct udev_enumerate *udev_enumerate, const char *subsystem) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!subsystem) /* a NULL filter is accepted and ignored */ + return 0; + + r = sd_device_enumerator_add_match_subsystem(udev_enumerate->enumerator, subsystem, true); /* same sd-device call as the nomatch variant; only the final flag differs */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; /* makes udev_enumerate_get_list_entry() rebuild its cached list */ + return 0; +} + +/** + * udev_enumerate_add_nomatch_subsystem: + * @udev_enumerate: context + * @subsystem: filter for a subsystem of the device to exclude from the list + * + * Match only devices not belonging to a certain kernel subsystem. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_enumerate_add_nomatch_subsystem(struct udev_enumerate *udev_enumerate, const char *subsystem) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!subsystem) /* a NULL filter is accepted and ignored */ + return 0; + + r = sd_device_enumerator_add_match_subsystem(udev_enumerate->enumerator, subsystem, false); /* false selects the excluding form of the match */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; + return 0; +} + +/** + * udev_enumerate_add_match_sysattr: + * @udev_enumerate: context + * @sysattr: filter for a sys attribute at the device to include in the list + * @value: optional value of the sys attribute + * + * Match only devices with a certain /sys device attribute. + * + * Returns: 0 on success, otherwise a negative error value.
+ */ +_public_ int udev_enumerate_add_match_sysattr(struct udev_enumerate *udev_enumerate, const char *sysattr, const char *value) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!sysattr) /* a NULL attribute name is accepted and ignored; value is not checked here */ + return 0; + + r = sd_device_enumerator_add_match_sysattr(udev_enumerate->enumerator, sysattr, value, true); /* same sd-device call as the nomatch variant; only the final flag differs */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; /* makes udev_enumerate_get_list_entry() rebuild its cached list */ + return 0; +} + +/** + * udev_enumerate_add_nomatch_sysattr: + * @udev_enumerate: context + * @sysattr: filter for a sys attribute at the device to exclude from the list + * @value: optional value of the sys attribute + * + * Match only devices not having a certain /sys device attribute. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_enumerate_add_nomatch_sysattr(struct udev_enumerate *udev_enumerate, const char *sysattr, const char *value) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!sysattr) /* a NULL attribute name is accepted and ignored */ + return 0; + + r = sd_device_enumerator_add_match_sysattr(udev_enumerate->enumerator, sysattr, value, false); /* false selects the excluding form of the match */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; + return 0; +} + +/** + * udev_enumerate_add_match_property: + * @udev_enumerate: context + * @property: filter for a property of the device to include in the list + * @value: value of the property + * + * Match only devices with a certain property. + * + * Returns: 0 on success, otherwise a negative error value.
+ */ +_public_ int udev_enumerate_add_match_property(struct udev_enumerate *udev_enumerate, const char *property, const char *value) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!property) /* a NULL property name is accepted and ignored; value is not checked here */ + return 0; + + r = sd_device_enumerator_add_match_property(udev_enumerate->enumerator, property, value); + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; /* makes udev_enumerate_get_list_entry() rebuild its cached list */ + return 0; +} + +/** + * udev_enumerate_add_match_tag: + * @udev_enumerate: context + * @tag: filter for a tag of the device to include in the list + * + * Match only devices with a certain tag. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_enumerate_add_match_tag(struct udev_enumerate *udev_enumerate, const char *tag) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!tag) /* a NULL tag is accepted and ignored */ + return 0; + + r = sd_device_enumerator_add_match_tag(udev_enumerate->enumerator, tag); + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; + return 0; +} + +/** + * udev_enumerate_add_match_parent: + * @udev_enumerate: context + * @parent: parent device where to start searching + * + * Return the devices on the subtree of one given device. The parent + * itself is included in the list. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_enumerate_add_match_parent(struct udev_enumerate *udev_enumerate, struct udev_device *parent) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!parent) /* a NULL parent is accepted and ignored */ + return 0; + + r = sd_device_enumerator_add_match_parent(udev_enumerate->enumerator, udev_device_get_sd_device(parent)); /* the match is registered on the sd_device wrapped by parent */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; + return 0; +} + +/** + * udev_enumerate_add_match_is_initialized: + * @udev_enumerate: context + * + * Match only devices which udev has set up already. This makes + * sure, that the device node permissions and context are properly set + * and that network devices are fully renamed.
+ * + * Usually, devices which are found in the kernel but not already + * handled by udev, have still pending events. Services should subscribe + * to monitor events and wait for these devices to become ready, instead + * of using uninitialized devices. + * + * For now, this will not affect devices which do not have a device node + * and are not network interfaces. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_enumerate_add_match_is_initialized(struct udev_enumerate *udev_enumerate) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + r = device_enumerator_add_match_is_initialized(udev_enumerate->enumerator, MATCH_INITIALIZED_COMPAT); /* internal (non sd_) enumerator call; MATCH_INITIALIZED_COMPAT is defined elsewhere */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; /* makes udev_enumerate_get_list_entry() rebuild its cached list */ + return 0; +} + +/** + * udev_enumerate_add_match_sysname: + * @udev_enumerate: context + * @sysname: filter for the name of the device to include in the list + * + * Match only devices with a given /sys device name. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_enumerate_add_match_sysname(struct udev_enumerate *udev_enumerate, const char *sysname) { + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!sysname) /* a NULL name is accepted and ignored */ + return 0; + + r = sd_device_enumerator_add_match_sysname(udev_enumerate->enumerator, sysname); + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; + return 0; +} + +/** + * udev_enumerate_add_syspath: + * @udev_enumerate: context + * @syspath: path of a device + * + * Add a device to the list of devices, to retrieve it back sorted in dependency order. + * + * Returns: 0 on success, otherwise a negative error value.
+ */ +_public_ int udev_enumerate_add_syspath(struct udev_enumerate *udev_enumerate, const char *syspath) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; /* local reference, handled by the cleanup attribute */ + int r; + + assert_return(udev_enumerate, -EINVAL); + + if (!syspath) /* a NULL path is accepted and ignored */ + return 0; + + r = sd_device_new_from_syspath(&device, syspath); + if (r < 0) + return r; + + r = device_enumerator_add_device(udev_enumerate->enumerator, device); /* presumably the enumerator keeps its own reference — confirm in device-enumerator-private.h */ + if (r < 0) + return r; + + udev_enumerate->devices_uptodate = false; /* makes udev_enumerate_get_list_entry() rebuild its cached list */ + return 0; +} + +/** + * udev_enumerate_scan_devices: + * @udev_enumerate: udev enumeration context + * + * Scan /sys for all devices which match the given filters. No matches + * will return all currently available devices. + * + * Returns: 0 on success, otherwise a negative error value. + **/ +_public_ int udev_enumerate_scan_devices(struct udev_enumerate *udev_enumerate) { + assert_return(udev_enumerate, -EINVAL); + + return device_enumerator_scan_devices(udev_enumerate->enumerator); /* NOTE(review): devices_uptodate is not cleared here, so a list already handed out is not rebuilt after a re-scan — confirm this is intended */ +} + +/** + * udev_enumerate_scan_subsystems: + * @udev_enumerate: udev enumeration context + * + * Scan /sys for all kernel subsystems, including buses, classes, drivers. + * + * Returns: 0 on success, otherwise a negative error value. + **/ +_public_ int udev_enumerate_scan_subsystems(struct udev_enumerate *udev_enumerate) { + assert_return(udev_enumerate, -EINVAL); + + return device_enumerator_scan_subsystems(udev_enumerate->enumerator); /* result of the internal enumerator call is returned unchanged */ +} diff --git a/src/libudev/libudev-hwdb.c b/src/libudev/libudev-hwdb.c new file mode 100644 index 0000000..8e9ea97 --- /dev/null +++ b/src/libudev/libudev-hwdb.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-hwdb.h" + +#include "alloc-util.h" +#include "hwdb-util.h" +#include "libudev-list-internal.h" + +/** + * SECTION:libudev-hwdb + * @short_description: retrieve properties from the hardware database + * + * Libudev hardware database interface. + */ + +/** + * udev_hwdb: + * + * Opaque object representing the hardware database.
+ */ +struct udev_hwdb { + unsigned n_ref; + sd_hwdb *hwdb; + struct udev_list *properties_list; +}; + +/** + * udev_hwdb_new: + * @udev: udev library context (unused) + * + * Create a hardware database context to query properties for devices. + * + * Returns: a hwdb context, or #NULL on error. + **/ +_public_ struct udev_hwdb *udev_hwdb_new(struct udev *udev) { + _cleanup_(udev_list_freep) struct udev_list *list = NULL; + _cleanup_(sd_hwdb_unrefp) sd_hwdb *hwdb_internal = NULL; + struct udev_hwdb *hwdb; + int r; + + r = sd_hwdb_new(&hwdb_internal); + if (r < 0) + return_with_errno(NULL, r); + + list = udev_list_new(true); + if (!list) + return_with_errno(NULL, ENOMEM); + + hwdb = new(struct udev_hwdb, 1); + if (!hwdb) + return_with_errno(NULL, ENOMEM); + + *hwdb = (struct udev_hwdb) { + .n_ref = 1, + .hwdb = TAKE_PTR(hwdb_internal), + .properties_list = TAKE_PTR(list), + }; + + return hwdb; +} + +static struct udev_hwdb *udev_hwdb_free(struct udev_hwdb *hwdb) { + assert(hwdb); + + sd_hwdb_unref(hwdb->hwdb); + udev_list_free(hwdb->properties_list); + return mfree(hwdb); +} + +/** + * udev_hwdb_ref: + * @hwdb: context + * + * Take a reference of a hwdb context. + * + * Returns: the passed hwdb context + **/ + +/** + * udev_hwdb_unref: + * @hwdb: context + * + * Drop a reference of a hwdb context. If the refcount reaches zero, + * all resources of the hwdb context will be released. + * + * Returns: #NULL + **/ +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(struct udev_hwdb, udev_hwdb, udev_hwdb_free); + +/** + * udev_hwdb_get_properties_list_entry: + * @hwdb: context + * @modalias: modalias string + * @flags: (unused) + * + * Lookup a matching device in the hardware database. The lookup key is a + * modalias string, whose formats are defined for the Linux kernel modules. + * Examples are: pci:v00008086d00001C2D*, usb:v04F2pB221*. The first entry + * of a list of retrieved properties is returned. + * + * Returns: a udev_list_entry, or #NULL on error or if no properties were found.
+ */ +_public_ struct udev_list_entry *udev_hwdb_get_properties_list_entry(struct udev_hwdb *hwdb, const char *modalias, unsigned flags) { + const char *key, *value; + struct udev_list_entry *e; + + assert_return_errno(hwdb, NULL, EINVAL); + assert_return_errno(modalias, NULL, EINVAL); + + udev_list_cleanup(hwdb->properties_list); + + SD_HWDB_FOREACH_PROPERTY(hwdb->hwdb, modalias, key, value) + if (!udev_list_entry_add(hwdb->properties_list, key, value)) + return_with_errno(NULL, ENOMEM); + + e = udev_list_get_entry(hwdb->properties_list); + if (!e) + return_with_errno(NULL, ENODATA); + + return e; +} diff --git a/src/libudev/libudev-list-internal.h b/src/libudev/libudev-list-internal.h new file mode 100644 index 0000000..c23735e --- /dev/null +++ b/src/libudev/libudev-list-internal.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "libudev.h" + +#include "macro.h" + +struct udev_list; + +struct udev_list *udev_list_new(bool unique); +void udev_list_cleanup(struct udev_list *list); +struct udev_list *udev_list_free(struct udev_list *list); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_list *, udev_list_free); + +struct udev_list_entry *udev_list_get_entry(struct udev_list *list); +struct udev_list_entry *udev_list_entry_add(struct udev_list *list, const char *name, const char *value); diff --git a/src/libudev/libudev-list.c b/src/libudev/libudev-list.c new file mode 100644 index 0000000..0adc1d5 --- /dev/null +++ b/src/libudev/libudev-list.c @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "hashmap.h" +#include "libudev-list-internal.h" +#include "list.h" +#include "sort-util.h" + +/** + * SECTION:libudev-list + * @short_description: list operation + * + * Libudev list operations. + */ + +/** + * udev_list_entry: + * + * Opaque object representing one entry in a list. An entry contains + * a name, and optionally a value.
+ */ +struct udev_list_entry { + struct udev_list *list; + char *name; + char *value; + + LIST_FIELDS(struct udev_list_entry, entries); +}; + +struct udev_list { + Hashmap *unique_entries; + LIST_HEAD(struct udev_list_entry, entries); + bool unique:1; + bool uptodate:1; +}; + +static struct udev_list_entry *udev_list_entry_free(struct udev_list_entry *entry) { + if (!entry) + return NULL; + + if (entry->list) { + if (entry->list->unique && entry->name) + hashmap_remove(entry->list->unique_entries, entry->name); + + if (!entry->list->unique || entry->list->uptodate) + LIST_REMOVE(entries, entry->list->entries, entry); + } + + free(entry->name); + free(entry->value); + + return mfree(entry); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_list_entry *, udev_list_entry_free); + +struct udev_list *udev_list_new(bool unique) { + struct udev_list *list; + + list = new(struct udev_list, 1); + if (!list) + return NULL; + + *list = (struct udev_list) { + .unique = unique, + }; + + return list; +} + +struct udev_list_entry *udev_list_entry_add(struct udev_list *list, const char *_name, const char *_value) { + _cleanup_(udev_list_entry_freep) struct udev_list_entry *entry = NULL; + _cleanup_free_ char *name = NULL, *value = NULL; + + assert(list); + assert(_name); + + name = strdup(_name); + if (!name) + return NULL; + + if (_value) { + value = strdup(_value); + if (!value) + return NULL; + } + + entry = new(struct udev_list_entry, 1); + if (!entry) + return NULL; + + *entry = (struct udev_list_entry) { + .name = TAKE_PTR(name), + .value = TAKE_PTR(value), + }; + + if (list->unique) { + udev_list_entry_free(hashmap_get(list->unique_entries, entry->name)); + + if (hashmap_ensure_put(&list->unique_entries, &string_hash_ops, entry->name, entry) < 0) + return NULL; + + list->uptodate = false; + } else + LIST_APPEND(entries, list->entries, entry); + + entry->list = list; + + return TAKE_PTR(entry); +} + +void udev_list_cleanup(struct udev_list *list) { + if (!list) + return; + + if 
(list->unique) { + list->uptodate = false; + hashmap_clear_with_destructor(list->unique_entries, udev_list_entry_free); + } else + LIST_FOREACH(entries, i, list->entries) + udev_list_entry_free(i); +} + +struct udev_list *udev_list_free(struct udev_list *list) { + if (!list) + return NULL; + + udev_list_cleanup(list); + hashmap_free(list->unique_entries); + + return mfree(list); +} + +static int udev_list_entry_compare_func(struct udev_list_entry * const *a, struct udev_list_entry * const *b) { + return strcmp((*a)->name, (*b)->name); +} + +struct udev_list_entry *udev_list_get_entry(struct udev_list *list) { + if (!list) + return NULL; + + if (list->unique && !list->uptodate) { + size_t n; + + LIST_HEAD_INIT(list->entries); + + n = hashmap_size(list->unique_entries); + if (n == 0) + ; + else if (n == 1) + LIST_PREPEND(entries, list->entries, hashmap_first(list->unique_entries)); + else { + _cleanup_free_ struct udev_list_entry **buf = NULL; + struct udev_list_entry *entry, **p; + + buf = new(struct udev_list_entry *, n); + if (!buf) + return NULL; + + p = buf; + HASHMAP_FOREACH(entry, list->unique_entries) + *p++ = entry; + + typesafe_qsort(buf, n, udev_list_entry_compare_func); + + for (size_t j = n; j > 0; j--) + LIST_PREPEND(entries, list->entries, buf[j-1]); + } + + list->uptodate = true; + } + + return list->entries; +} + +/** + * udev_list_entry_get_next: + * @list_entry: current entry + * + * Get the next entry from the list. + * + * Returns: udev_list_entry, #NULL if no more entries are available. + */ +_public_ struct udev_list_entry *udev_list_entry_get_next(struct udev_list_entry *list_entry) { + if (!list_entry) + return NULL; + if (list_entry->list->unique && !list_entry->list->uptodate) + return NULL; + return list_entry->entries_next; +} + +/** + * udev_list_entry_get_by_name: + * @list_entry: current entry + * @name: name string to match + * + * Lookup an entry in the list with a certain name. 
+ * + * Returns: udev_list_entry, #NULL if no matching entry is found. + */ +_public_ struct udev_list_entry *udev_list_entry_get_by_name(struct udev_list_entry *list_entry, const char *name) { + if (!list_entry) + return NULL; + if (!list_entry->list->unique || !list_entry->list->uptodate) + return NULL; + return hashmap_get(list_entry->list->unique_entries, name); +} + +/** + * udev_list_entry_get_name: + * @list_entry: current entry + * + * Get the name of a list entry. + * + * Returns: the name string of this entry. + */ +_public_ const char *udev_list_entry_get_name(struct udev_list_entry *list_entry) { + if (!list_entry) + return NULL; + return list_entry->name; +} + +/** + * udev_list_entry_get_value: + * @list_entry: current entry + * + * Get the value of list entry. + * + * Returns: the value string of this entry. + */ +_public_ const char *udev_list_entry_get_value(struct udev_list_entry *list_entry) { + if (!list_entry) + return NULL; + return list_entry->value; +} diff --git a/src/libudev/libudev-monitor.c b/src/libudev/libudev-monitor.c new file mode 100644 index 0000000..d7c931d --- /dev/null +++ b/src/libudev/libudev-monitor.c @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "libudev.h" + +#include "alloc-util.h" +#include "device-monitor-private.h" +#include "device-private.h" +#include "device-util.h" +#include "io-util.h" +#include "libudev-device-internal.h" +#include "string-util.h" + +/** + * SECTION:libudev-monitor + * @short_description: device event source + * + * Connects to a device event source. + */ + +/** + * udev_monitor: + * + * Opaque object handling an event source. 
+ */ +struct udev_monitor { + struct udev *udev; + unsigned n_ref; + sd_device_monitor *monitor; +}; + +static MonitorNetlinkGroup monitor_netlink_group_from_string(const char *name) { + if (!name) + return MONITOR_GROUP_NONE; + if (streq(name, "udev")) + return MONITOR_GROUP_UDEV; + if (streq(name, "kernel")) + return MONITOR_GROUP_KERNEL; + return _MONITOR_NETLINK_GROUP_INVALID; +} + +/** + * udev_monitor_new_from_netlink: + * @udev: udev library context + * @name: name of event source + * + * Create new udev monitor and connect to a specified event + * source. Valid source identifiers are "udev" and "kernel". + * + * Applications should usually not connect directly to the + * "kernel" events, because the devices might not be usable + * at that time, before udev has configured them, and created + * device nodes. Accessing devices at the same time as udev, + * might result in unpredictable behavior. The "udev" events + * are sent out after udev has finished its event processing, + * all rules have been processed, and needed device nodes are + * created. + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev monitor.
+ * + * Returns: a new udev monitor, or #NULL, in case of an error + **/ +_public_ struct udev_monitor *udev_monitor_new_from_netlink(struct udev *udev, const char *name) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *m = NULL; + struct udev_monitor *udev_monitor; + MonitorNetlinkGroup g; + int r; + + g = monitor_netlink_group_from_string(name); + if (g < 0) + return_with_errno(NULL, EINVAL); + + r = device_monitor_new_full(&m, g, -1); + if (r < 0) + return_with_errno(NULL, r); + + udev_monitor = new(struct udev_monitor, 1); + if (!udev_monitor) + return_with_errno(NULL, ENOMEM); + + *udev_monitor = (struct udev_monitor) { + .udev = udev, + .n_ref = 1, + .monitor = TAKE_PTR(m), + }; + + return udev_monitor; +} + +/** + * udev_monitor_filter_update: + * @udev_monitor: monitor + * + * Update the installed socket filter. This is only needed, + * if the filter was removed or changed. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_monitor_filter_update(struct udev_monitor *udev_monitor) { + assert_return(udev_monitor, -EINVAL); + + return sd_device_monitor_filter_update(udev_monitor->monitor); +} + +/** + * udev_monitor_enable_receiving: + * @udev_monitor: the monitor which should receive events + * + * Binds the @udev_monitor socket to the event source. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_monitor_enable_receiving(struct udev_monitor *udev_monitor) { + assert_return(udev_monitor, -EINVAL); + + return device_monitor_enable_receiving(udev_monitor->monitor); +} + +/** + * udev_monitor_set_receive_buffer_size: + * @udev_monitor: the monitor which should receive events + * @size: the size in bytes + * + * Set the size of the kernel socket buffer. This call needs the + * appropriate privileges to succeed. + * + * Returns: 0 on success, otherwise a negative error value.
+ */ +_public_ int udev_monitor_set_receive_buffer_size(struct udev_monitor *udev_monitor, int size) { + assert_return(udev_monitor, -EINVAL); + + return sd_device_monitor_set_receive_buffer_size(udev_monitor->monitor, (size_t) size); +} + +static struct udev_monitor *udev_monitor_free(struct udev_monitor *udev_monitor) { + assert(udev_monitor); + + sd_device_monitor_unref(udev_monitor->monitor); + return mfree(udev_monitor); +} + +/** + * udev_monitor_ref: + * @udev_monitor: udev monitor + * + * Take a reference of a udev monitor. + * + * Returns: the passed udev monitor + **/ + +/** + * udev_monitor_unref: + * @udev_monitor: udev monitor + * + * Drop a reference of a udev monitor. If the refcount reaches zero, + * the bound socket will be closed, and the resources of the monitor + * will be released. + * + * Returns: #NULL + **/ +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(struct udev_monitor, udev_monitor, udev_monitor_free); + +/** + * udev_monitor_get_udev: + * @udev_monitor: udev monitor + * + * Retrieve the udev library context the monitor was created with. + * + * Returns: the udev library context + **/ +_public_ struct udev *udev_monitor_get_udev(struct udev_monitor *udev_monitor) { + assert_return(udev_monitor, NULL); + + return udev_monitor->udev; +} + +/** + * udev_monitor_get_fd: + * @udev_monitor: udev monitor + * + * Retrieve the socket file descriptor associated with the monitor. + * + * Returns: the socket file descriptor + **/ +_public_ int udev_monitor_get_fd(struct udev_monitor *udev_monitor) { + assert_return(udev_monitor, -EINVAL); + + return device_monitor_get_fd(udev_monitor->monitor); +} + +static int udev_monitor_receive_sd_device(struct udev_monitor *udev_monitor, sd_device **ret) { + int r; + + assert(udev_monitor); + assert(ret); + + for (;;) { + /* r == 0 means a device is received but it does not pass the current filter. 
*/ + r = device_monitor_receive_device(udev_monitor->monitor, ret); + if (r != 0) + return r; + + for (;;) { + /* Wait for next message */ + r = fd_wait_for_event(device_monitor_get_fd(udev_monitor->monitor), POLLIN, 0); + if (r == -EINTR) + continue; + if (r < 0) + return r; + if (r == 0) + return -EAGAIN; + + /* Receive next message */ + break; + } + } +} + +/** + * udev_monitor_receive_device: + * @udev_monitor: udev monitor + * + * Receive data from the udev monitor socket, allocate a new udev + * device, fill in the received data, and return the device. + * + * Only socket connections with uid=0 are accepted. + * + * The monitor socket is by default set to NONBLOCK. A variant of poll() on + * the file descriptor returned by udev_monitor_get_fd() should be used to + * wake up when new devices arrive, or alternatively the file descriptor + * can be switched into blocking mode. + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev device. + * + * Returns: a new udev device, or #NULL, in case of an error + **/ +_public_ struct udev_device *udev_monitor_receive_device(struct udev_monitor *udev_monitor) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int r; + + assert_return(udev_monitor, NULL); + + r = udev_monitor_receive_sd_device(udev_monitor, &device); + if (r < 0) + return_with_errno(NULL, r); + + return udev_device_new(udev_monitor->udev, device); +} + +/** + * udev_monitor_filter_add_match_subsystem_devtype: + * @udev_monitor: the monitor + * @subsystem: the subsystem value to match the incoming devices against + * @devtype: the devtype value to match the incoming devices against + * + * This filter is efficiently executed inside the kernel, and libudev subscribers + * will usually not be woken up for devices which do not match. + * + * The filter must be installed before the monitor is switched to listening mode. + * + * Returns: 0 on success, otherwise a negative error value.
+ */ +_public_ int udev_monitor_filter_add_match_subsystem_devtype(struct udev_monitor *udev_monitor, const char *subsystem, const char *devtype) { + int r; + + assert_return(udev_monitor, -EINVAL); + + r = sd_device_monitor_filter_add_match_subsystem_devtype(udev_monitor->monitor, subsystem, devtype); + return r < 0 ? r : 0; +} + +/** + * udev_monitor_filter_add_match_tag: + * @udev_monitor: the monitor + * @tag: the name of a tag + * + * This filter is efficiently executed inside the kernel, and libudev subscribers + * will usually not be woken up for devices which do not match. + * + * The filter must be installed before the monitor is switched to listening mode. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_monitor_filter_add_match_tag(struct udev_monitor *udev_monitor, const char *tag) { + int r; + + assert_return(udev_monitor, -EINVAL); + + r = sd_device_monitor_filter_add_match_tag(udev_monitor->monitor, tag); + return r < 0 ? r : 0; +} + +/** + * udev_monitor_filter_remove: + * @udev_monitor: monitor + * + * Remove all filters from monitor. + * + * Returns: 0 on success, otherwise a negative error value. + */ +_public_ int udev_monitor_filter_remove(struct udev_monitor *udev_monitor) { + assert_return(udev_monitor, -EINVAL); + + return sd_device_monitor_filter_remove(udev_monitor->monitor); +} diff --git a/src/libudev/libudev-queue.c b/src/libudev/libudev-queue.c new file mode 100644 index 0000000..0af99e5 --- /dev/null +++ b/src/libudev/libudev-queue.c @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2009 Alan Jenkins +***/ + +#include +#include + +#include "libudev.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "udev-util.h" + +/** + * SECTION:libudev-queue + * @short_description: access to currently active events + * + * This exports the current state of the udev processing queue. 
+ */ + +/** + * udev_queue: + * + * Opaque object representing the current event queue in the udev daemon. + */ +struct udev_queue { + struct udev *udev; + unsigned n_ref; + int fd; +}; + +/** + * udev_queue_new: + * @udev: udev library context + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev queue context. + * + * Returns: the udev queue context, or #NULL on error. + **/ +_public_ struct udev_queue *udev_queue_new(struct udev *udev) { + struct udev_queue *udev_queue; + + udev_queue = new(struct udev_queue, 1); + if (!udev_queue) + return_with_errno(NULL, ENOMEM); + + *udev_queue = (struct udev_queue) { + .udev = udev, + .n_ref = 1, + .fd = -EBADF, + }; + + return udev_queue; +} + +static struct udev_queue *udev_queue_free(struct udev_queue *udev_queue) { + assert(udev_queue); + + safe_close(udev_queue->fd); + return mfree(udev_queue); +} + +/** + * udev_queue_ref: + * @udev_queue: udev queue context + * + * Take a reference of a udev queue context. + * + * Returns: the same udev queue context. + **/ + +/** + * udev_queue_unref: + * @udev_queue: udev queue context + * + * Drop a reference of a udev queue context. If the refcount reaches zero, + * the resources of the queue context will be released. + * + * Returns: #NULL + **/ +DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(struct udev_queue, udev_queue, udev_queue_free); + +/** + * udev_queue_get_udev: + * @udev_queue: udev queue context + * + * Retrieve the udev library context the queue context was created with. + * + * Returns: the udev library context. + **/ +_public_ struct udev *udev_queue_get_udev(struct udev_queue *udev_queue) { + assert_return_errno(udev_queue, NULL, EINVAL); + + return udev_queue->udev; +} + +/** + * udev_queue_get_kernel_seqnum: + * @udev_queue: udev queue context + * + * This function is deprecated. + * + * Returns: 0. 
+ **/ +_public_ unsigned long long int udev_queue_get_kernel_seqnum(struct udev_queue *udev_queue) { + return 0; +} + +/** + * udev_queue_get_udev_seqnum: + * @udev_queue: udev queue context + * + * This function is deprecated. + * + * Returns: 0. + **/ +_public_ unsigned long long int udev_queue_get_udev_seqnum(struct udev_queue *udev_queue) { + return 0; +} + +/** + * udev_queue_get_udev_is_active: + * @udev_queue: udev queue context (unused) + * + * Check if udev is active on the system, by testing for /run/udev/control. + * + * Returns: a flag indicating if udev is active. + **/ +_public_ int udev_queue_get_udev_is_active(struct udev_queue *udev_queue) { + return access("/run/udev/control", F_OK) >= 0; +} + +/** + * udev_queue_get_queue_is_empty: + * @udev_queue: udev queue context (unused) + * + * Check if the udev event queue is empty. + * + * Returns: 1 if udev_queue_is_empty() returns a positive value (i.e. the queue is presumably empty), 0 otherwise, including on error. + **/ +_public_ int udev_queue_get_queue_is_empty(struct udev_queue *udev_queue) { + return udev_queue_is_empty() > 0; +} + +/** + * udev_queue_get_seqnum_sequence_is_finished: + * @udev_queue: udev queue context (unused) + * @start: first event sequence number (unused) + * @end: last event sequence number (unused) + * + * This function is deprecated, and equivalent to udev_queue_get_queue_is_empty(). + * + * Returns: 1 if udev_queue_is_empty() returns a positive value (i.e. the queue is presumably empty), 0 otherwise, including on error. + **/ +_public_ int udev_queue_get_seqnum_sequence_is_finished(struct udev_queue *udev_queue, + unsigned long long int start, unsigned long long int end) { + return udev_queue_is_empty() > 0; +} + +/** + * udev_queue_get_seqnum_is_finished: + * @udev_queue: udev queue context + * @seqnum: sequence number + * + * This function is deprecated, and equivalent to udev_queue_get_queue_is_empty(). + * + * Returns: 1 if udev_queue_is_empty() returns a positive value (i.e. the queue is presumably empty), 0 otherwise, including on error.
+ **/ +_public_ int udev_queue_get_seqnum_is_finished(struct udev_queue *udev_queue, unsigned long long int seqnum) { + return udev_queue_is_empty() > 0; +} + +/** + * udev_queue_get_queued_list_entry: + * @udev_queue: udev queue context + * + * This function is deprecated. + * + * Returns: NULL. + **/ +_public_ struct udev_list_entry *udev_queue_get_queued_list_entry(struct udev_queue *udev_queue) { + return_with_errno(NULL, ENODATA); +} + +/** + * udev_queue_get_fd: + * @udev_queue: udev queue context + * + * Returns: a file descriptor to watch for a queue to become empty, or a negative error value on failure. The descriptor is created on first use and cached in the queue context. + */ +_public_ int udev_queue_get_fd(struct udev_queue *udev_queue) { + _cleanup_close_ int fd = -EBADF; + + assert_return(udev_queue, -EINVAL); + + if (udev_queue->fd >= 0) + return udev_queue->fd; + + fd = inotify_init1(IN_CLOEXEC); + if (fd < 0) + return -errno; + + if (inotify_add_watch(fd, "/run/udev" , IN_DELETE) < 0) + return -errno; + + return udev_queue->fd = TAKE_FD(fd); +} + +/** + * udev_queue_flush: + * @udev_queue: udev queue context + * + * Returns: 0 on success, otherwise a negative error value (-EINVAL if no descriptor has been set up by udev_queue_get_fd() yet). + */ +_public_ int udev_queue_flush(struct udev_queue *udev_queue) { + int r; + + assert_return(udev_queue, -EINVAL); + + if (udev_queue->fd < 0) + return -EINVAL; + + r = flush_fd(udev_queue->fd); + if (r < 0) + return r; + + return 0; +} diff --git a/src/libudev/libudev-util.c b/src/libudev/libudev-util.c new file mode 100644 index 0000000..8c51877 --- /dev/null +++ b/src/libudev/libudev-util.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "device-nodes.h" +#include "libudev-util.h" + +/** + * SECTION:libudev-util + * @short_description: utils + * + * Utilities useful when dealing with devices and device node names.
+ */ + +/** + * udev_util_encode_string: + * @str: input string to be encoded + * @str_enc: output string to store the encoded input string + * @len: maximum size of the output string, which may be + * four times as long as the input string + * + * Encode all potentially unsafe characters of a string to the + * corresponding 2 char hex value prefixed by '\x'. + * + * Returns: 0 if the entire string was copied, non-zero otherwise. + **/ +_public_ int udev_util_encode_string(const char *str, char *str_enc, size_t len) { + return encode_devnode_name(str, str_enc, len); +} diff --git a/src/libudev/libudev-util.h b/src/libudev/libudev-util.h new file mode 100644 index 0000000..0dc18d4 --- /dev/null +++ b/src/libudev/libudev-util.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "libudev.h" + +#include "macro.h" + +/* Cleanup functions */ +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev*, udev_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_device*, udev_device_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_enumerate*, udev_enumerate_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_monitor*, udev_monitor_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_hwdb*, udev_hwdb_unref); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct udev_queue*, udev_queue_unref); diff --git a/src/libudev/libudev.c b/src/libudev/libudev.c new file mode 100644 index 0000000..7357487 --- /dev/null +++ b/src/libudev/libudev.c @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "libudev.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "string-util.h" + +/** + * SECTION:libudev + * @short_description: libudev context + */ + +/** + * udev: + * + * Opaque object representing the library context. + */ +struct udev { + unsigned n_ref; + void *userdata; +}; + +/** + * udev_get_userdata: + * @udev: udev library context + * + * Retrieve stored data pointer from library context. 
This might be useful + * to access from callbacks. + * + * Returns: stored userdata + **/ +_public_ void *udev_get_userdata(struct udev *udev) { + assert_return(udev, NULL); + + return udev->userdata; +} + +/** + * udev_set_userdata: + * @udev: udev library context + * @userdata: data pointer + * + * Store custom @userdata in the library context. + **/ +_public_ void udev_set_userdata(struct udev *udev, void *userdata) { + if (!udev) + return; + + udev->userdata = userdata; +} + +/** + * udev_new: + * + * Create udev library context. This only allocates the basic data structure. + * + * The initial refcount is 1, and needs to be decremented to + * release the resources of the udev library context. + * + * Returns: a new udev library context + **/ +_public_ struct udev *udev_new(void) { + struct udev *udev; + + udev = new(struct udev, 1); + if (!udev) + return_with_errno(NULL, ENOMEM); + + *udev = (struct udev) { + .n_ref = 1, + }; + + return udev; +} + +/** + * udev_ref: + * @udev: udev library context + * + * Take a reference of the udev library context. + * + * Returns: the passed udev library context + **/ +DEFINE_PUBLIC_TRIVIAL_REF_FUNC(struct udev, udev); + +/** + * udev_unref: + * @udev: udev library context + * + * Drop a reference of the udev library context. If the refcount + * reaches zero, the resources of the context will be released. + * + * Returns: the passed udev library context if it has still an active reference, or #NULL otherwise. + **/ +_public_ struct udev *udev_unref(struct udev *udev) { + if (!udev) + return NULL; + + assert(udev->n_ref > 0); + udev->n_ref--; + if (udev->n_ref > 0) + /* This is different from our convention, but let's keep backward + * compatibility. So, do not use DEFINE_PUBLIC_TRIVIAL_UNREF_FUNC() + * macro to define this function. */ + return udev; + + return mfree(udev); +} + +/** + * udev_set_log_fn: + * @udev: udev library context + * @log_fn: function to be called for log messages + * + * This function is deprecated. 
+ * + **/ +_public_ void udev_set_log_fn( + struct udev *udev, + void (*log_fn)(struct udev *udev, + int priority, const char *file, int line, const char *fn, + const char *format, va_list args)) { + return; +} + +/** + * udev_get_log_priority: + * @udev: udev library context + * + * This function is deprecated. + * + **/ +_public_ int udev_get_log_priority(struct udev *udev) { + return log_get_max_level(); +} + +/** + * udev_set_log_priority: + * @udev: udev library context + * @priority: the new log priority + * + * This function is deprecated. + * + **/ +_public_ void udev_set_log_priority(struct udev *udev, int priority) { + log_set_max_level(priority); +} diff --git a/src/libudev/libudev.h b/src/libudev/libudev.h new file mode 100644 index 0000000..aef4a55 --- /dev/null +++ b/src/libudev/libudev.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#ifndef _LIBUDEV_H_ +#define _LIBUDEV_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * udev - library context + * + * reads the udev config and system environment + * allows custom logging + */ +struct udev; +struct udev *udev_ref(struct udev *udev); +struct udev *udev_unref(struct udev *udev); +struct udev *udev_new(void); +void udev_set_log_fn(struct udev *udev, + void (*log_fn)(struct udev *udev, + int priority, const char *file, int line, const char *fn, + const char *format, va_list args)) __attribute__((__deprecated__)); +int udev_get_log_priority(struct udev *udev) __attribute__((__deprecated__)); +void udev_set_log_priority(struct udev *udev, int priority) __attribute__((__deprecated__)); +void *udev_get_userdata(struct udev *udev); +void udev_set_userdata(struct udev *udev, void *userdata); + +/* + * udev_list + * + * access to libudev generated lists + */ +struct udev_list_entry; +struct udev_list_entry *udev_list_entry_get_next(struct udev_list_entry *list_entry); +struct udev_list_entry *udev_list_entry_get_by_name(struct udev_list_entry 
*list_entry, const char *name); +const char *udev_list_entry_get_name(struct udev_list_entry *list_entry); +const char *udev_list_entry_get_value(struct udev_list_entry *list_entry); +/** + * udev_list_entry_foreach: + * @list_entry: entry to store the current position + * @first_entry: first entry to start with + * + * Helper to iterate over all entries of a list. + */ +#define udev_list_entry_foreach(list_entry, first_entry) \ + for (list_entry = first_entry; \ + list_entry; \ + list_entry = udev_list_entry_get_next(list_entry)) + +/* + * udev_device + * + * access to sysfs/kernel devices + */ +struct udev_device; +struct udev_device *udev_device_ref(struct udev_device *udev_device); +struct udev_device *udev_device_unref(struct udev_device *udev_device); +struct udev *udev_device_get_udev(struct udev_device *udev_device); +struct udev_device *udev_device_new_from_syspath(struct udev *udev, const char *syspath); +struct udev_device *udev_device_new_from_devnum(struct udev *udev, char type, dev_t devnum); +struct udev_device *udev_device_new_from_subsystem_sysname(struct udev *udev, const char *subsystem, const char *sysname); +struct udev_device *udev_device_new_from_device_id(struct udev *udev, const char *id); +struct udev_device *udev_device_new_from_environment(struct udev *udev); +/* udev_device_get_parent_*() does not take a reference on the returned device, it is automatically unref'd with the parent */ +struct udev_device *udev_device_get_parent(struct udev_device *udev_device); +struct udev_device *udev_device_get_parent_with_subsystem_devtype(struct udev_device *udev_device, + const char *subsystem, const char *devtype); +/* retrieve device properties */ +const char *udev_device_get_devpath(struct udev_device *udev_device); +const char *udev_device_get_subsystem(struct udev_device *udev_device); +const char *udev_device_get_devtype(struct udev_device *udev_device); +const char *udev_device_get_syspath(struct udev_device *udev_device); +const char 
*udev_device_get_sysname(struct udev_device *udev_device); +const char *udev_device_get_sysnum(struct udev_device *udev_device); +const char *udev_device_get_devnode(struct udev_device *udev_device); +int udev_device_get_is_initialized(struct udev_device *udev_device); +struct udev_list_entry *udev_device_get_devlinks_list_entry(struct udev_device *udev_device); +struct udev_list_entry *udev_device_get_properties_list_entry(struct udev_device *udev_device); +struct udev_list_entry *udev_device_get_tags_list_entry(struct udev_device *udev_device); +struct udev_list_entry *udev_device_get_current_tags_list_entry(struct udev_device *udev_device); +struct udev_list_entry *udev_device_get_sysattr_list_entry(struct udev_device *udev_device); +const char *udev_device_get_property_value(struct udev_device *udev_device, const char *key); +const char *udev_device_get_driver(struct udev_device *udev_device); +dev_t udev_device_get_devnum(struct udev_device *udev_device); +const char *udev_device_get_action(struct udev_device *udev_device); +unsigned long long int udev_device_get_seqnum(struct udev_device *udev_device); +unsigned long long int udev_device_get_usec_since_initialized(struct udev_device *udev_device); +const char *udev_device_get_sysattr_value(struct udev_device *udev_device, const char *sysattr); +int udev_device_set_sysattr_value(struct udev_device *udev_device, const char *sysattr, const char *value); +int udev_device_has_tag(struct udev_device *udev_device, const char *tag); +int udev_device_has_current_tag(struct udev_device *udev_device, const char *tag); + +/* + * udev_monitor + * + * access to kernel uevents and udev events + */ +struct udev_monitor; +struct udev_monitor *udev_monitor_ref(struct udev_monitor *udev_monitor); +struct udev_monitor *udev_monitor_unref(struct udev_monitor *udev_monitor); +struct udev *udev_monitor_get_udev(struct udev_monitor *udev_monitor); +/* kernel and udev generated events over netlink */ +struct udev_monitor 
*udev_monitor_new_from_netlink(struct udev *udev, const char *name); +/* bind socket */ +int udev_monitor_enable_receiving(struct udev_monitor *udev_monitor); +int udev_monitor_set_receive_buffer_size(struct udev_monitor *udev_monitor, int size); +int udev_monitor_get_fd(struct udev_monitor *udev_monitor); +struct udev_device *udev_monitor_receive_device(struct udev_monitor *udev_monitor); +/* in-kernel socket filters to select messages that get delivered to a listener */ +int udev_monitor_filter_add_match_subsystem_devtype(struct udev_monitor *udev_monitor, + const char *subsystem, const char *devtype); +int udev_monitor_filter_add_match_tag(struct udev_monitor *udev_monitor, const char *tag); +int udev_monitor_filter_update(struct udev_monitor *udev_monitor); +int udev_monitor_filter_remove(struct udev_monitor *udev_monitor); + +/* + * udev_enumerate + * + * search sysfs for specific devices and provide a sorted list + */ +struct udev_enumerate; +struct udev_enumerate *udev_enumerate_ref(struct udev_enumerate *udev_enumerate); +struct udev_enumerate *udev_enumerate_unref(struct udev_enumerate *udev_enumerate); +struct udev *udev_enumerate_get_udev(struct udev_enumerate *udev_enumerate); +struct udev_enumerate *udev_enumerate_new(struct udev *udev); +/* device properties filter */ +int udev_enumerate_add_match_subsystem(struct udev_enumerate *udev_enumerate, const char *subsystem); +int udev_enumerate_add_nomatch_subsystem(struct udev_enumerate *udev_enumerate, const char *subsystem); +int udev_enumerate_add_match_sysattr(struct udev_enumerate *udev_enumerate, const char *sysattr, const char *value); +int udev_enumerate_add_nomatch_sysattr(struct udev_enumerate *udev_enumerate, const char *sysattr, const char *value); +int udev_enumerate_add_match_property(struct udev_enumerate *udev_enumerate, const char *property, const char *value); +int udev_enumerate_add_match_sysname(struct udev_enumerate *udev_enumerate, const char *sysname); +int 
udev_enumerate_add_match_tag(struct udev_enumerate *udev_enumerate, const char *tag); +int udev_enumerate_add_match_parent(struct udev_enumerate *udev_enumerate, struct udev_device *parent); +int udev_enumerate_add_match_is_initialized(struct udev_enumerate *udev_enumerate); +int udev_enumerate_add_syspath(struct udev_enumerate *udev_enumerate, const char *syspath); +/* run enumeration with active filters */ +int udev_enumerate_scan_devices(struct udev_enumerate *udev_enumerate); +int udev_enumerate_scan_subsystems(struct udev_enumerate *udev_enumerate); +/* return device list */ +struct udev_list_entry *udev_enumerate_get_list_entry(struct udev_enumerate *udev_enumerate); + +/* + * udev_queue + * + * access to the currently running udev events + */ +struct udev_queue; +struct udev_queue *udev_queue_ref(struct udev_queue *udev_queue); +struct udev_queue *udev_queue_unref(struct udev_queue *udev_queue); +struct udev *udev_queue_get_udev(struct udev_queue *udev_queue); +struct udev_queue *udev_queue_new(struct udev *udev); +unsigned long long int udev_queue_get_kernel_seqnum(struct udev_queue *udev_queue) __attribute__((__deprecated__)); +unsigned long long int udev_queue_get_udev_seqnum(struct udev_queue *udev_queue) __attribute__((__deprecated__)); +int udev_queue_get_udev_is_active(struct udev_queue *udev_queue); +int udev_queue_get_queue_is_empty(struct udev_queue *udev_queue); +int udev_queue_get_seqnum_is_finished(struct udev_queue *udev_queue, unsigned long long int seqnum) __attribute__((__deprecated__)); +int udev_queue_get_seqnum_sequence_is_finished(struct udev_queue *udev_queue, + unsigned long long int start, unsigned long long int end) __attribute__((__deprecated__)); +int udev_queue_get_fd(struct udev_queue *udev_queue); +int udev_queue_flush(struct udev_queue *udev_queue); +struct udev_list_entry *udev_queue_get_queued_list_entry(struct udev_queue *udev_queue) __attribute__((__deprecated__)); + +/* + * udev_hwdb + * + * access to the static hardware 
properties database + */ +struct udev_hwdb; +struct udev_hwdb *udev_hwdb_new(struct udev *udev); +struct udev_hwdb *udev_hwdb_ref(struct udev_hwdb *hwdb); +struct udev_hwdb *udev_hwdb_unref(struct udev_hwdb *hwdb); +struct udev_list_entry *udev_hwdb_get_properties_list_entry(struct udev_hwdb *hwdb, const char *modalias, unsigned flags); + +/* + * udev_util + * + * udev specific utilities + */ +int udev_util_encode_string(const char *str, char *str_enc, size_t len); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/src/libudev/libudev.pc.in b/src/libudev/libudev.pc.in new file mode 100644 index 0000000..6541bcb --- /dev/null +++ b/src/libudev/libudev.pc.in @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +prefix={{PREFIX}} +exec_prefix={{PREFIX}} +libdir={{LIBDIR}} +includedir={{INCLUDE_DIR}} + +Name: libudev +Description: Library to access udev device information +Version: {{PROJECT_VERSION}} +Libs: -L${libdir} -ludev +Libs.private: -lrt -pthread +Cflags: -I${includedir} diff --git a/src/libudev/libudev.sym b/src/libudev/libudev.sym new file mode 100644 index 0000000..6aa6768 --- /dev/null +++ b/src/libudev/libudev.sym @@ -0,0 +1,126 @@ +/*** + SPDX-License-Identifier: LGPL-2.1-or-later + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+***/ + +LIBUDEV_183 { +global: + udev_device_get_action; + udev_device_get_devlinks_list_entry; + udev_device_get_devnode; + udev_device_get_devnum; + udev_device_get_devpath; + udev_device_get_devtype; + udev_device_get_driver; + udev_device_get_is_initialized; + udev_device_get_parent; + udev_device_get_parent_with_subsystem_devtype; + udev_device_get_properties_list_entry; + udev_device_get_property_value; + udev_device_get_seqnum; + udev_device_get_subsystem; + udev_device_get_sysattr_list_entry; + udev_device_get_sysattr_value; + udev_device_get_sysname; + udev_device_get_sysnum; + udev_device_get_syspath; + udev_device_get_tags_list_entry; + udev_device_get_udev; + udev_device_get_usec_since_initialized; + udev_device_has_tag; + udev_device_new_from_devnum; + udev_device_new_from_environment; + udev_device_new_from_subsystem_sysname; + udev_device_new_from_syspath; + udev_device_ref; + udev_device_unref; + udev_enumerate_add_match_is_initialized; + udev_enumerate_add_match_parent; + udev_enumerate_add_match_property; + udev_enumerate_add_match_subsystem; + udev_enumerate_add_match_sysattr; + udev_enumerate_add_match_sysname; + udev_enumerate_add_match_tag; + udev_enumerate_add_nomatch_subsystem; + udev_enumerate_add_nomatch_sysattr; + udev_enumerate_add_syspath; + udev_enumerate_get_list_entry; + udev_enumerate_get_udev; + udev_enumerate_new; + udev_enumerate_ref; + udev_enumerate_scan_devices; + udev_enumerate_scan_subsystems; + udev_enumerate_unref; + udev_get_log_priority; + udev_get_userdata; + udev_list_entry_get_by_name; + udev_list_entry_get_name; + udev_list_entry_get_next; + udev_list_entry_get_value; + udev_monitor_enable_receiving; + udev_monitor_filter_add_match_subsystem_devtype; + udev_monitor_filter_add_match_tag; + udev_monitor_filter_remove; + udev_monitor_filter_update; + udev_monitor_get_fd; + udev_monitor_get_udev; + udev_monitor_new_from_netlink; + udev_monitor_receive_device; + udev_monitor_ref; + udev_monitor_set_receive_buffer_size; + 
udev_monitor_unref; + udev_new; + udev_queue_get_kernel_seqnum; + udev_queue_get_queue_is_empty; + udev_queue_get_queued_list_entry; + udev_queue_get_seqnum_is_finished; + udev_queue_get_seqnum_sequence_is_finished; + udev_queue_get_udev; + udev_queue_get_udev_is_active; + udev_queue_get_udev_seqnum; + udev_queue_new; + udev_queue_ref; + udev_queue_unref; + udev_ref; + udev_set_log_fn; + udev_set_log_priority; + udev_set_userdata; + udev_unref; + udev_util_encode_string; +local: + *; +}; + +LIBUDEV_189 { +global: + udev_device_new_from_device_id; +} LIBUDEV_183; + +LIBUDEV_196 { +global: + udev_hwdb_new; + udev_hwdb_ref; + udev_hwdb_unref; + udev_hwdb_get_properties_list_entry; +} LIBUDEV_189; + +LIBUDEV_199 { +global: + udev_device_set_sysattr_value; +} LIBUDEV_196; + +LIBUDEV_215 { +global: + udev_queue_flush; + udev_queue_get_fd; +} LIBUDEV_199; + +LIBUDEV_247 { +global: + udev_device_has_current_tag; + udev_device_get_current_tags_list_entry; +} LIBUDEV_215; diff --git a/src/libudev/meson.build b/src/libudev/meson.build new file mode 100644 index 0000000..7f99919 --- /dev/null +++ b/src/libudev/meson.build @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +libudev_sources = files( + 'libudev-device.c', + 'libudev-enumerate.c', + 'libudev-hwdb.c', + 'libudev-list.c', + 'libudev-monitor.c', + 'libudev-queue.c', + 'libudev-util.c', + 'libudev.c', +) + +############################################################ + +libudev_includes = [includes, include_directories('.')] + +libudev_dir_path = meson.current_source_dir() + +libudev_sym = files('libudev.sym') +libudev_sym_path = libudev_dir_path / 'libudev.sym' + +install_headers('libudev.h') +libudev_h_path = libudev_dir_path / 'libudev.h' + +libudev_basic = static_library( + 'udev-basic', + libudev_sources, + include_directories : includes, + dependencies : userspace, + c_args : ['-fvisibility=default'], + build_by_default : false) + +static_libudev = get_option('static-libudev') +static_libudev_pic = 
static_libudev == 'true' or static_libudev == 'pic'
+
+libudev_pc = custom_target(
+        'libudev.pc',
+        input : 'libudev.pc.in',
+        output : 'libudev.pc',
+        command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+        install : pkgconfiglibdir != 'no',
+        install_tag : 'devel',
+        install_dir : pkgconfiglibdir)
diff --git a/src/libudev/test-libudev.c b/src/libudev/test-libudev.c
new file mode 100644
index 0000000..e05a062
--- /dev/null
+++ b/src/libudev/test-libudev.c
@@ -0,0 +1,518 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <getopt.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "libudev-list-internal.h"
+#include "libudev-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "tests.h"
+#include "version.h"
+
+static bool arg_monitor = false;
+
+/* Logs what libudev reports about the device: identity strings, devnum, devlinks and properties. */
+static void print_device(struct udev_device *device) {
+        const char *str;
+        dev_t devnum;
+        int count;
+        struct udev_list_entry *list_entry;
+
+        log_info("*** device: %p ***", device);
+        str = udev_device_get_action(device);
+        if (str)
+                log_info("action: '%s'", str);
+
+        str = udev_device_get_syspath(device);
+        log_info("syspath: '%s'", str);
+
+        str = udev_device_get_sysname(device);
+        log_info("sysname: '%s'", str);
+
+        str = udev_device_get_sysnum(device);
+        if (str)
+                log_info("sysnum: '%s'", str);
+
+        str = udev_device_get_devpath(device);
+        log_info("devpath: '%s'", str);
+
+        str = udev_device_get_subsystem(device);
+        if (str)
+                log_info("subsystem: '%s'", str);
+
+        str = udev_device_get_devtype(device);
+        if (str)
+                log_info("devtype: '%s'", str);
+
+        str = udev_device_get_driver(device);
+        if (str)
+                log_info("driver: '%s'", str);
+
+        str = udev_device_get_devnode(device);
+        if (str)
+                log_info("devname: '%s'", str);
+
+        devnum = udev_device_get_devnum(device);
+        if (major(devnum) > 0)
+                log_info("devnum: %u:%u", major(devnum), minor(devnum));
+
+        count = 0;
+        udev_list_entry_foreach(list_entry, udev_device_get_devlinks_list_entry(device)) {
+                log_info("link: '%s'", udev_list_entry_get_name(list_entry));
+                count++;
+        }
+        if (count > 0)
+                log_info("found %i links", count);
+
+        count = 0;
+        udev_list_entry_foreach(list_entry, udev_device_get_properties_list_entry(device)) {
+                log_info("property: '%s=%s'",
+                         udev_list_entry_get_name(list_entry),
+                         udev_list_entry_get_value(list_entry));
+                count++;
+        }
+        if (count > 0)
+                log_info("found %i properties", count);
+
+        str = udev_device_get_property_value(device, "MAJOR");
+        if (str)
+                log_info("MAJOR: '%s'", str);
+
+        str = udev_device_get_sysattr_value(device, "dev");
+        if (str)
+                log_info("attr{dev}: '%s'", str);
+}
+
+/* Creates a device object from the given syspath and prints it; a lookup failure is only warned about. */
+static void test_device(struct udev *udev, const char *syspath) {
+        _cleanup_(udev_device_unrefp) struct udev_device *device = NULL;
+
+        log_info("/* %s, device %s */", __func__, syspath);
+        device = udev_device_new_from_syspath(udev, syspath);
+        if (device)
+                print_device(device);
+        else
+                log_warning_errno(errno, "udev_device_new_from_syspath: %m");
+}
+
+/* Prints the device and each of its ancestors, twice; parents are not unref'd here (see libudev.h). */
+static void test_device_parents(struct udev *udev, const char *syspath) {
+        _cleanup_(udev_device_unrefp) struct udev_device *device = NULL;
+        struct udev_device *device_parent;
+
+        log_info("/* %s, device %s */", __func__, syspath);
+        device = udev_device_new_from_syspath(udev, syspath);
+        if (!device)
+                return;
+
+        log_info("looking at parents");
+        device_parent = device;
+        do {
+                print_device(device_parent);
+                device_parent = udev_device_get_parent(device_parent);
+        } while (device_parent != NULL);
+
+        log_info("looking at parents again");
+        device_parent = device;
+        do {
+                print_device(device_parent);
+                device_parent = udev_device_get_parent(device_parent);
+        } while (device_parent != NULL);
+}
+
+/* Looks up the character device with devnum 1:3 (conventionally /dev/null) and prints it. */
+static void test_device_devnum(struct udev *udev) {
+        dev_t devnum = makedev(1, 3);
+        _cleanup_(udev_device_unrefp) struct udev_device *device = NULL;
+
+        log_info("/* %s, device " DEVNUM_FORMAT_STR " */", __func__, DEVNUM_FORMAT_VAL(devnum));
+
+        device = udev_device_new_from_devnum(udev, 'c', devnum);
+        if (device)
+                print_device(device);
+        else
+                log_warning_errno(errno, "udev_device_new_from_devnum: %m");
+}
+
+/* Looks up a device by subsystem and sysname and prints it; a lookup failure is only warned about. */
+static void test_device_subsys_name(struct udev *udev, const char *subsys, const char *dev) {
+        _cleanup_(udev_device_unrefp) struct udev_device *device = NULL;
+
+        log_info("looking up device: '%s:%s'", subsys, dev);
+        device = udev_device_new_from_subsystem_sysname(udev, subsys, dev);
+        if (!device)
+                log_warning_errno(errno, "udev_device_new_from_subsystem_sysname: %m");
+        else
+                print_device(device);
+}
+
+/* Opens each enumerated syspath as a device and logs it; returns how many could be opened. */
+static int enumerate_print_list(struct udev_enumerate *enumerate) {
+        struct udev_list_entry *list_entry;
+        int count = 0;
+
+        udev_list_entry_foreach(list_entry, udev_enumerate_get_list_entry(enumerate)) {
+                struct udev_device *device;
+
+                device = udev_device_new_from_syspath(udev_enumerate_get_udev(enumerate),
+                                                      udev_list_entry_get_name(list_entry));
+                if (device) {
+                        log_info("device: '%s' (%s)",
+                                 udev_device_get_syspath(device),
+                                 udev_device_get_subsystem(device));
+                        udev_device_unref(device);
+                        count++;
+                }
+        }
+        log_info("found %i devices", count);
+        return count;
+}
+
+/* Interactive test (only run with --monitor): prints "udev" netlink events for block, tty and
+ * usb/usb_device until input becomes readable on stdin. */
+static void test_monitor(struct udev *udev) {
+        _cleanup_(udev_monitor_unrefp) struct udev_monitor *udev_monitor = NULL;
+        _cleanup_close_ int fd_ep = -EBADF;
+        int fd_udev;
+        struct epoll_event ep_udev = {
+                .events = EPOLLIN,
+        }, ep_stdin = {
+                .events = EPOLLIN,
+                .data.fd = STDIN_FILENO,
+        };
+
+        log_info("/* %s */", __func__);
+
+        fd_ep = epoll_create1(EPOLL_CLOEXEC);
+        assert_se(fd_ep >= 0);
+
+        udev_monitor = udev_monitor_new_from_netlink(udev, "udev");
+        assert_se(udev_monitor != NULL);
+
+        fd_udev = udev_monitor_get_fd(udev_monitor);
+        ep_udev.data.fd = fd_udev;
+
+        assert_se(udev_monitor_filter_add_match_subsystem_devtype(udev_monitor, "block", NULL) >= 0);
+        assert_se(udev_monitor_filter_add_match_subsystem_devtype(udev_monitor, "tty", NULL) >= 0);
+        assert_se(udev_monitor_filter_add_match_subsystem_devtype(udev_monitor, "usb", "usb_device") >= 0);
+
+        assert_se(udev_monitor_enable_receiving(udev_monitor) >= 0);
+
+        assert_se(epoll_ctl(fd_ep, EPOLL_CTL_ADD, fd_udev, &ep_udev) >= 0);
+        assert_se(epoll_ctl(fd_ep, EPOLL_CTL_ADD, STDIN_FILENO, &ep_stdin) >= 0);
+
+        for (;;) {
+                int fdcount;
+                struct epoll_event ev[4];
+                struct udev_device *device;
+                int i;
+
+                printf("waiting for events from udev, press ENTER to exit\n");
+                fdcount = epoll_wait(fd_ep, ev, ELEMENTSOF(ev), -1);
+                if (fdcount < 0) {
+                        if (errno == EINTR)
+                                continue;
+                        log_warning_errno(errno, "epoll_wait() failed: %m");
+                        return;
+                }
+                printf("epoll fd count: %i\n", fdcount);
+
+                for (i = 0; i < fdcount; i++) {
+                        if (ev[i].data.fd == fd_udev && ev[i].events & EPOLLIN) {
+                                device = udev_monitor_receive_device(udev_monitor);
+                                if (!device) {
+                                        printf("no device from socket\n");
+                                        continue;
+                                }
+                                print_device(device);
+                                udev_device_unref(device);
+                        } else if (ev[i].data.fd == STDIN_FILENO && ev[i].events & EPOLLIN) {
+                                printf("exiting loop\n");
+                                return;
+                        }
+                }
+        }
+}
+
+/* Creates a udev_queue object and logs whether the event queue is currently empty. */
+static void test_queue(struct udev *udev) {
+        struct udev_queue *udev_queue;
+        bool empty;
+
+        log_info("/* %s */", __func__);
+
+        assert_se(udev_queue = udev_queue_new(udev));
+
+        empty = udev_queue_get_queue_is_empty(udev_queue);
+        log_info("queue is %s", empty ? "empty" : "not empty");
+        udev_queue_unref(udev_queue);
+}
+
+/* Runs several enumerations with different filters and prints each result; returns 0 or a negative value. */
+static int test_enumerate(struct udev *udev, const char *subsystem) {
+        struct udev_enumerate *udev_enumerate;
+        int r;
+
+        log_info("/* %s */", __func__);
+
+        log_info("enumerate '%s'", subsystem == NULL ? "<all>" : subsystem);
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_add_match_subsystem(udev_enumerate, subsystem);
+        udev_enumerate_scan_devices(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+
+        log_info("enumerate 'net' + duplicated scan + null + zero");
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_add_match_subsystem(udev_enumerate, "net");
+        udev_enumerate_scan_devices(udev_enumerate);
+        udev_enumerate_scan_devices(udev_enumerate);
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/zero");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/null");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/zero");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/null");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/zero");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/null");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/null");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/zero");
+        udev_enumerate_add_syspath(udev_enumerate, "/sys/class/mem/zero");
+        udev_enumerate_scan_devices(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+
+        log_info("enumerate 'block'");
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_add_match_subsystem(udev_enumerate, "block");
+        r = udev_enumerate_add_match_is_initialized(udev_enumerate);
+        if (r < 0) {
+                udev_enumerate_unref(udev_enumerate);
+                return r;
+        }
+        udev_enumerate_scan_devices(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+
+        log_info("enumerate 'not block'");
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_add_nomatch_subsystem(udev_enumerate, "block");
+        udev_enumerate_scan_devices(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+
+        log_info("enumerate 'pci, mem, vc'");
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_add_match_subsystem(udev_enumerate, "pci");
+        udev_enumerate_add_match_subsystem(udev_enumerate, "mem");
+        udev_enumerate_add_match_subsystem(udev_enumerate, "vc");
+        udev_enumerate_scan_devices(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+
+        log_info("enumerate 'subsystem'");
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_scan_subsystems(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+
+        log_info("enumerate 'property ID_FS*=filesystem'");
+        udev_enumerate = udev_enumerate_new(udev);
+        if (!udev_enumerate)
+                return -1;
+        udev_enumerate_add_match_property(udev_enumerate, "ID_FS*", "filesystem");
+        udev_enumerate_scan_devices(udev_enumerate);
+        enumerate_print_list(udev_enumerate);
+        udev_enumerate_unref(udev_enumerate);
+        return 0;
+}
+
+/* Logs the hwdb properties matching the given modalias; if the hwdb cannot be opened, warns and returns. */
+static void test_hwdb(struct udev *udev, const char *modalias) {
+        struct udev_hwdb *hwdb;
+        struct udev_list_entry *entry;
+
+        log_info("/* %s */", __func__);
+
+        hwdb = udev_hwdb_new(udev);
+        if (!hwdb) {
+                log_warning_errno(errno, "Failed to open hwdb: %m");
+                return;
+        }
+
+        udev_list_entry_foreach(entry, udev_hwdb_get_properties_list_entry(hwdb, modalias, 0))
+                log_info("'%s'='%s'", udev_list_entry_get_name(entry), udev_list_entry_get_value(entry));
+
+        hwdb = udev_hwdb_unref(hwdb);
+        assert_se(hwdb == NULL);
+}
+
+/* Checks udev_list: non-unique lists keep duplicate names and fail by-name lookup, unique ones overwrite. */
+static void test_list(void) {
+        _cleanup_(udev_list_freep) struct udev_list *list = NULL;
+        struct udev_list_entry *e;
+
+        /* empty list */
+        assert_se(list = udev_list_new(false));
+        assert_se(!udev_list_get_entry(list));
+        list = udev_list_free(list);
+
+        /* unique == false */
+        assert_se(list = udev_list_new(false));
+        assert_se(udev_list_entry_add(list, "aaa", "hoge"));
+        assert_se(udev_list_entry_add(list, "aaa", "hogehoge"));
+        assert_se(udev_list_entry_add(list, "bbb", "foo"));
+        e = udev_list_get_entry(list);
+        assert_se(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "aaa"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "hoge"));
+        e = udev_list_entry_get_next(e);
+        assert_se(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "aaa"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "hogehoge"));
+        e = udev_list_entry_get_next(e);
+        assert_se(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "bbb"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "foo"));
+        assert_se(!udev_list_entry_get_next(e));
+
+        assert_se(!udev_list_entry_get_by_name(e, "aaa"));
+        assert_se(!udev_list_entry_get_by_name(e, "bbb"));
+        assert_se(!udev_list_entry_get_by_name(e, "ccc"));
+        list = udev_list_free(list);
+
+        /* unique == true */
+        assert_se(list = udev_list_new(true));
+        assert_se(udev_list_entry_add(list, "aaa", "hoge"));
+        assert_se(udev_list_entry_add(list, "aaa", "hogehoge"));
+        assert_se(udev_list_entry_add(list, "bbb", "foo"));
+        e = udev_list_get_entry(list);
+        assert_se(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "aaa"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "hogehoge"));
+        e = udev_list_entry_get_next(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "bbb"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "foo"));
+        assert_se(!udev_list_entry_get_next(e));
+
+        e = udev_list_entry_get_by_name(e, "bbb");
+        assert_se(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "bbb"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "foo"));
+        e = udev_list_entry_get_by_name(e, "aaa");
+        assert_se(e);
+        assert_se(streq_ptr(udev_list_entry_get_name(e), "aaa"));
+        assert_se(streq_ptr(udev_list_entry_get_value(e), "hogehoge"));
+        assert_se(!udev_list_entry_get_by_name(e, "ccc"));
+}
+
+/* Parses the command line. Returns 1 to continue, 0 after --help/--version, -EINVAL on an unknown option. */
+static int parse_args(int argc, char *argv[], const char **syspath, const char **subsystem) {
+        static const struct option options[] = {
+                { "syspath",   required_argument, NULL, 'p' },
+                { "subsystem", required_argument, NULL, 's' },
+                { "debug",     no_argument,       NULL, 'd' },
+                { "help",      no_argument,       NULL, 'h' },
+                { "version",   no_argument,       NULL, 'V' },
+                { "monitor",   no_argument,       NULL, 'm' },
+                {}
+        };
+        int c;
+
+        while ((c = getopt_long(argc, argv, "p:s:dhVm", options, NULL)) >= 0)
+                switch (c) {
+                case 'p':
+                        *syspath = optarg;
+                        break;
+
+                case 's':
+                        *subsystem = optarg;
+                        break;
+
+                case 'd':
+                        log_set_max_level(LOG_DEBUG);
+                        break;
+
+                case 'h':
+                        printf("--debug --syspath= --subsystem= --monitor --version --help\n");
+                        return 0;
+
+                case 'V':
+                        printf("%s\n", GIT_VERSION);
+                        return 0;
+
+                case 'm':
+                        arg_monitor = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        return 1;
+}
+
+/* Test entry point: runs every test against the chosen syspath/subsystem; the monitor test is opt-in. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(udev_unrefp) struct udev *udev = NULL;
+
+        const char *syspath = "/devices/virtual/mem/null";
+        const char *subsystem = NULL;
+        int r;
+
+        test_setup_logging(LOG_INFO);
+
+        r = parse_args(argc, argv, &syspath, &subsystem);
+        if (r <= 0)
+                return r;
+
+        assert_se(udev = udev_new());
+
+        /* add sys path if needed */
+        if (!startswith(syspath, "/sys"))
+                syspath = strjoina("/sys/", syspath);
+
+        test_device(udev, syspath);
+        test_device_devnum(udev);
+        test_device_subsys_name(udev, "block", "sda");
+        test_device_subsys_name(udev, "subsystem", "pci");
+        test_device_subsys_name(udev, "drivers", "scsi:sd");
+        test_device_subsys_name(udev, "module", "printk");
+        test_device_parents(udev, syspath);
+
+        test_enumerate(udev, subsystem);
+
+        test_queue(udev);
+
+        test_hwdb(udev, "usb:v0D50p0011*");
+
+        if (arg_monitor)
+                test_monitor(udev);
+
+        test_list();
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/libudev/test-udev-device-thread.c b/src/libudev/test-udev-device-thread.c
new file mode 100644
index 0000000..c082fdc
--- /dev/null
+++
b/src/libudev/test-udev-device-thread.c
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libudev.h"
+
+/* Sets errno to the absolute value of 'error', reports 'msg' via perror(), and evaluates to EXIT_FAILURE. */
+#define handle_error_errno(error, msg)                          \
+        ({                                                      \
+                errno = abs(error);                             \
+                perror(msg);                                    \
+                EXIT_FAILURE;                                   \
+        })
+
+/* Thread body: unrefs the device that 'p' points to and stores the result back through 'p'. */
+static void* thread(void *p) {
+        struct udev_device **d = p;
+
+        *d = udev_device_unref(*d);
+
+        return NULL;
+}
+
+/* Creates the loopback device object, prints its properties, then unrefs it from a second thread. */
+int main(int argc, char *argv[]) {
+        struct udev_device *loopback;
+        struct udev_list_entry *entry, *e;
+        pthread_t t;
+        int r;
+
+        loopback = udev_device_new_from_syspath(NULL, "/sys/class/net/lo");
+        if (!loopback)
+                return handle_error_errno(errno, "Failed to create loopback device object");
+
+        entry = udev_device_get_properties_list_entry(loopback);
+        udev_list_entry_foreach(e, entry)
+                printf("%s=%s\n", udev_list_entry_get_name(e), udev_list_entry_get_value(e));
+
+        r = pthread_create(&t, NULL, thread, &loopback);
+        if (r != 0)
+                return handle_error_errno(r, "Failed to create thread");
+
+        r = pthread_join(t, NULL);
+        if (r != 0)
+                return handle_error_errno(r, "Failed to wait thread finished");
+
+        if (loopback) {
+                /* r is 0 at this point, so reporting through perror() would append "Success". */
+                fprintf(stderr, "loopback device is not unref()ed\n");
+                return EXIT_FAILURE;
+        }
+
+        return 0;
+}
diff --git a/src/locale/kbd-model-map b/src/locale/kbd-model-map
new file mode 100644
index 0000000..279d1a3
--- /dev/null
+++ b/src/locale/kbd-model-map
@@ -0,0 +1,72 @@
+# Originally generated from system-config-keyboard's model list.
+# consolelayout xlayout xmodel xvariant xoptions +sg ch pc105 de_nodeadkeys terminate:ctrl_alt_bksp +nl nl pc105 - terminate:ctrl_alt_bksp +mk-utf mk,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +trq tr pc105 - terminate:ctrl_alt_bksp +uk gb pc105 - terminate:ctrl_alt_bksp +is-latin1 is pc105 - terminate:ctrl_alt_bksp +de de pc105 - terminate:ctrl_alt_bksp +la-latin1 latam pc105 - terminate:ctrl_alt_bksp +us us pc105+inet - terminate:ctrl_alt_bksp +ko kr pc105 - terminate:ctrl_alt_bksp +ro-std ro pc105 std terminate:ctrl_alt_bksp +de-latin1 de pc105 - terminate:ctrl_alt_bksp +slovene si pc105 - terminate:ctrl_alt_bksp +hu hu pc105 - terminate:ctrl_alt_bksp +jp106 jp jp106 - terminate:ctrl_alt_bksp +croat hr pc105 - terminate:ctrl_alt_bksp +it2 it pc105 - terminate:ctrl_alt_bksp +hu101 hu pc105 qwerty terminate:ctrl_alt_bksp +sr-latin rs pc105 latin terminate:ctrl_alt_bksp +fi fi pc105 - terminate:ctrl_alt_bksp +fr_CH ch pc105 fr terminate:ctrl_alt_bksp +dk-latin1 dk pc105 - terminate:ctrl_alt_bksp +fr fr pc105 - terminate:ctrl_alt_bksp +it it pc105 - terminate:ctrl_alt_bksp +ua-utf ua,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +fr-latin1 fr pc105 - terminate:ctrl_alt_bksp +sg-latin1 ch pc105 de_nodeadkeys terminate:ctrl_alt_bksp +be-latin1 be pc105 - terminate:ctrl_alt_bksp +dk dk pc105 - terminate:ctrl_alt_bksp +fr-pc fr pc105 - terminate:ctrl_alt_bksp +bg_pho-utf8 bg,us pc105 ,phonetic terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +it-ibm it pc105 - terminate:ctrl_alt_bksp +cz-us-qwertz cz,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +cz-qwerty cz,us pc105 qwerty, terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +br-abnt2 br abnt2 - terminate:ctrl_alt_bksp +ro ro pc105 - terminate:ctrl_alt_bksp +us-acentos us pc105 intl terminate:ctrl_alt_bksp +pt-latin1 pt pc105 - terminate:ctrl_alt_bksp +ro-std-cedilla ro pc105 std_cedilla terminate:ctrl_alt_bksp +tj_alt-UTF8 tj pc105 - 
terminate:ctrl_alt_bksp +de-latin1-nodeadkeys de pc105 nodeadkeys terminate:ctrl_alt_bksp +no no pc105 - terminate:ctrl_alt_bksp +bg_bds-utf8 bg,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +dvorak us pc105 dvorak terminate:ctrl_alt_bksp +dvorak us pc105 dvorak-alt-intl terminate:ctrl_alt_bksp +ru ru,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +cz-lat2 cz pc105 qwerty terminate:ctrl_alt_bksp +pl2 pl pc105 - terminate:ctrl_alt_bksp +es es pc105 - terminate:ctrl_alt_bksp +ro-cedilla ro pc105 cedilla terminate:ctrl_alt_bksp +ie ie pc105 - terminate:ctrl_alt_bksp +et ee pc105 - terminate:ctrl_alt_bksp +sk-qwerty sk pc105 qwerty terminate:ctrl_alt_bksp +sk-qwertz sk pc105 - terminate:ctrl_alt_bksp +fr-latin9 fr pc105 latin9 terminate:ctrl_alt_bksp +fr_CH-latin1 ch pc105 fr terminate:ctrl_alt_bksp +cf ca pc105 - terminate:ctrl_alt_bksp +sv-latin1 se pc105 - terminate:ctrl_alt_bksp +sr-cy rs pc105 - terminate:ctrl_alt_bksp +gr gr,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +by by,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +il il pc105 - terminate:ctrl_alt_bksp +kazakh kz,us pc105 - terminate:ctrl_alt_bksp,grp:shifts_toggle,grp_led:scroll +lt.baltic lt pc105 - terminate:ctrl_alt_bksp +lt.l4 lt pc105 - terminate:ctrl_alt_bksp +lt lt pc105 - terminate:ctrl_alt_bksp +khmer kh,us pc105 - terminate:ctrl_alt_bksp +es-dvorak es microsoftpro dvorak terminate:ctrl_alt_bksp +lv lv pc105 apostrophe terminate:ctrl_alt_bksp +lv-tilde lv pc105 tilde terminate:ctrl_alt_bksp diff --git a/src/locale/language-fallback-map b/src/locale/language-fallback-map new file mode 100644 index 0000000..d0b02a6 --- /dev/null +++ b/src/locale/language-fallback-map @@ -0,0 +1,13 @@ +csb_PL csb:pl +en_AU en_AU:en_GB +en_IE en_IE:en_GB +en_NZ en_NZ:en_GB +en_ZA en_ZA:en_GB +fr_BE fr_BE:fr_FR +fr_CA fr_CA:fr_FR +fr_CH fr_CH:fr_FR +fr_LU fr_LU:fr_FR +it_CH it_CH:it_IT +mai_IN mai:hi +nds_DE nds:de +szl_PL szl:pl 
diff --git a/src/locale/localectl.c b/src/locale/localectl.c
new file mode 100644
index 0000000..3235402
--- /dev/null
+++ b/src/locale/localectl.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <stdbool.h>
+
+#include "sd-bus.h"
+
+#include "build.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-map-properties.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "kbd-util.h"
+#include "locale-setup.h"
+#include "main-func.h"
+#include "memory-util.h"
+#include "pager.h"
+#include "pretty-print.h"
+#include "proc-cmdline.h"
+#include "set.h"
+#include "spawn-polkit-agent.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "verbs.h"
+#include "virt.h"
+
+/* Enough time for locale-gen to finish server-side (in case it is in use) */
+#define LOCALE_SLOW_BUS_CALL_TIMEOUT_USEC (2*USEC_PER_MINUTE)
+
+static PagerFlags arg_pager_flags = 0;
+static bool arg_ask_password = true;
+static BusTransport arg_transport = BUS_TRANSPORT_LOCAL;
+static const char *arg_host = NULL;
+static bool arg_convert = true;
+
+/* Locale and keymap settings printed by print_status_info(); only 'locale' is freed on clear. */
+typedef struct StatusInfo {
+        char **locale;
+        const char *vconsole_keymap;
+        const char *vconsole_keymap_toggle;
+        const char *x11_layout;
+        const char *x11_model;
+        const char *x11_variant;
+        const char *x11_options;
+} StatusInfo;
+
+/* Frees the 'locale' string vector and zeroes the whole structure; a NULL 'info' is a no-op. */
+static void status_info_clear(StatusInfo *info) {
+        if (info) {
+                strv_free(info->locale);
+                zero(*info);
+        }
+}
+
+/* Prints the settings in 'i' as a vertical table. On the local transport, locale settings found on the
+ * kernel command line are listed first, with a warning. Returns 0, or the error from a failing step. */
+static int print_status_info(StatusInfo *i) {
+        _cleanup_strv_free_ char **kernel_locale = NULL;
+        _cleanup_(table_unrefp) Table *table = NULL;
+        TableCell *cell;
+        int r;
+
+        assert(i);
+
+        if (arg_transport == BUS_TRANSPORT_LOCAL) {
+                _cleanup_(locale_context_clear) LocaleContext c = {};
+
+                r = locale_context_load(&c, LOCALE_LOAD_PROC_CMDLINE);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read /proc/cmdline: %m");
+
+                r = locale_context_build_env(&c, &kernel_locale, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to build locale settings from kernel command line: %m");
+        }
+
+        table = table_new_vertical();
+        if (!table)
+                return log_oom();
+
+        assert_se(cell = table_get_cell(table, 0, 0));
+        (void) table_set_ellipsize_percent(table, cell, 100);
+
+        table_set_ersatz_string(table, TABLE_ERSATZ_UNSET);
+
+        if (!strv_isempty(kernel_locale)) {
+                log_warning("Warning: Settings on kernel command line override system locale settings in /etc/locale.conf.");
+                r = table_add_many(table,
+                                   TABLE_FIELD, "Command Line",
+                                   TABLE_SET_COLOR, ansi_highlight_yellow(),
+                                   TABLE_STRV, kernel_locale,
+                                   TABLE_SET_COLOR, ansi_highlight_yellow());
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        r = table_add_many(table,
+                           TABLE_FIELD, "System Locale",
+                           TABLE_STRV, i->locale,
+                           TABLE_FIELD, "VC Keymap",
+                           TABLE_STRING, i->vconsole_keymap);
+        if (r < 0)
+                return table_log_add_error(r);
+
+        if (!isempty(i->vconsole_keymap_toggle)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "VC Toggle Keymap",
+                                   TABLE_STRING, i->vconsole_keymap_toggle);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        r = table_add_many(table,
+                           TABLE_FIELD, "X11 Layout",
+                           TABLE_STRING, i->x11_layout);
+        if (r < 0)
+                return table_log_add_error(r);
+
+        if (!isempty(i->x11_model)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "X11 Model",
+                                   TABLE_STRING, i->x11_model);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->x11_variant)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "X11 Variant",
+                                   TABLE_STRING, i->x11_variant);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        if (!isempty(i->x11_options)) {
+                r = table_add_many(table,
+                                   TABLE_FIELD, "X11 Options",
+                                   TABLE_STRING, i->x11_options);
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        r = table_print(table, NULL);
+        if (r < 0)
+                return table_log_print_error(r);
+
+        return 0;
+}
+
+static int show_status(int argc, char **argv, void *userdata) {
+        _cleanup_(status_info_clear) StatusInfo info = {};
+        static const struct bus_properties_map map[] =
{ + { "VConsoleKeymap", "s", NULL, offsetof(StatusInfo, vconsole_keymap) }, + { "VConsoleKeymapToggle", "s", NULL, offsetof(StatusInfo, vconsole_keymap_toggle) }, + { "X11Layout", "s", NULL, offsetof(StatusInfo, x11_layout) }, + { "X11Model", "s", NULL, offsetof(StatusInfo, x11_model) }, + { "X11Variant", "s", NULL, offsetof(StatusInfo, x11_variant) }, + { "X11Options", "s", NULL, offsetof(StatusInfo, x11_options) }, + { "Locale", "as", NULL, offsetof(StatusInfo, locale) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + r = bus_map_all_properties(bus, + "org.freedesktop.locale1", + "/org/freedesktop/locale1", + map, + 0, + &error, + &m, + &info); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + return print_status_info(&info); +} + +static int set_locale(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_message_new_method_call(bus, &m, bus_locale, "SetLocale"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, argv + 1); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "b", arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* We use a longer timeout for the method call in case localed is running locale-gen */ + r = sd_bus_call(bus, m, LOCALE_SLOW_BUS_CALL_TIMEOUT_USEC, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to issue method call: %s", bus_error_message(&error, r)); + + return 0; +} + +static int list_locales(int argc, char **argv, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + int r; + + r = 
get_locales(&l); + if (r < 0) + return log_error_errno(r, "Failed to read list of locales: %m"); + + pager_open(arg_pager_flags); + strv_print(l); + + return 0; +} + +static int set_vconsole_keymap(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *map, *toggle_map; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + map = argv[1]; + toggle_map = argc > 2 ? argv[2] : ""; + + r = bus_call_method( + bus, + bus_locale, + "SetVConsoleKeyboard", + &error, + NULL, + "ssbb", map, toggle_map, arg_convert, arg_ask_password); + if (r < 0) + return log_error_errno(r, "Failed to set keymap: %s", bus_error_message(&error, r)); + + return 0; +} + +static int list_vconsole_keymaps(int argc, char **argv, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + int r; + + r = get_keymaps(&l); + if (r < 0) + return log_error_errno(r, "Failed to read list of keymaps: %m"); + + pager_open(arg_pager_flags); + + strv_print(l); + + return 0; +} + +static int set_x11_keymap(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *layout, *model, *variant, *options; + sd_bus *bus = userdata; + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + layout = argv[1]; + model = argc > 2 ? argv[2] : ""; + variant = argc > 3 ? argv[3] : ""; + options = argc > 4 ? 
argv[4] : ""; + + r = bus_call_method( + bus, + bus_locale, + "SetX11Keyboard", + &error, + NULL, + "ssssbb", layout, model, variant, options, + arg_convert, arg_ask_password); + if (r < 0) + return log_error_errno(r, "Failed to set keymap: %s", bus_error_message(&error, r)); + + return 0; +} + +static int list_x11_keymaps(int argc, char **argv, void *userdata) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **list = NULL; + enum { + NONE, + MODELS, + LAYOUTS, + VARIANTS, + OPTIONS + } state = NONE, look_for; + int r; + + f = fopen("/usr/share/X11/xkb/rules/base.lst", "re"); + if (!f) + return log_error_errno(errno, "Failed to open keyboard mapping list. %m"); + + if (streq(argv[0], "list-x11-keymap-models")) + look_for = MODELS; + else if (streq(argv[0], "list-x11-keymap-layouts")) + look_for = LAYOUTS; + else if (streq(argv[0], "list-x11-keymap-variants")) + look_for = VARIANTS; + else if (streq(argv[0], "list-x11-keymap-options")) + look_for = OPTIONS; + else + assert_not_reached(); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *w; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read keyboard mapping list: %m"); + if (r == 0) + break; + + if (isempty(line)) + continue; + + if (line[0] == '!') { + if (startswith(line, "! model")) + state = MODELS; + else if (startswith(line, "! layout")) + state = LAYOUTS; + else if (startswith(line, "! variant")) + state = VARIANTS; + else if (startswith(line, "! 
option")) + state = OPTIONS; + else + state = NONE; + + continue; + } + + if (state != look_for) + continue; + + w = line + strcspn(line, WHITESPACE); + + if (argc > 1) { + char *e; + + if (*w == 0) + continue; + + *w = 0; + w++; + w += strspn(w, WHITESPACE); + + e = strchr(w, ':'); + if (!e) + continue; + + *e = 0; + + if (!streq(w, argv[1])) + continue; + } else + *w = 0; + + if (strv_consume(&list, TAKE_PTR(line)) < 0) + return log_oom(); + } + + if (strv_isempty(list)) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Couldn't find any entries."); + + strv_sort(list); + strv_uniq(list); + + pager_open(arg_pager_flags); + + strv_print(list); + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("localectl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND ...\n\n" + "%sQuery or change system locale and keyboard settings.%s\n" + "\nCommands:\n" + " status Show current locale settings\n" + " set-locale LOCALE... 
Set system locale\n" + " list-locales Show known locales\n" + " set-keymap MAP [MAP] Set console and X11 keyboard mappings\n" + " list-keymaps Show known virtual console keyboard mappings\n" + " set-x11-keymap LAYOUT [MODEL [VARIANT [OPTIONS]]]\n" + " Set X11 and console keyboard mappings\n" + " list-x11-keymap-models Show known X11 keyboard mapping models\n" + " list-x11-keymap-layouts Show known X11 keyboard mapping layouts\n" + " list-x11-keymap-variants [LAYOUT]\n" + " Show known X11 keyboard mapping variants\n" + " list-x11-keymap-options Show known X11 keyboard mapping options\n" + "\nOptions:\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-ask-password Do not prompt for password\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " --no-convert Don't convert keyboard mappings\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int verb_help(int argc, char **argv, void *userdata) { + return help(); +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_CONVERT, + ARG_NO_ASK_PASSWORD + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "no-convert", no_argument, NULL, ARG_NO_CONVERT }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hH:M:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_CONVERT: + arg_convert = false; + break; + + case ARG_NO_PAGER: + 
arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int localectl_main(sd_bus *bus, int argc, char *argv[]) { + + static const Verb verbs[] = { + { "status", VERB_ANY, 1, VERB_DEFAULT, show_status }, + { "set-locale", 2, VERB_ANY, 0, set_locale }, + { "list-locales", VERB_ANY, 1, 0, list_locales }, + { "set-keymap", 2, 3, 0, set_vconsole_keymap }, + { "list-keymaps", VERB_ANY, 1, 0, list_vconsole_keymaps }, + { "set-x11-keymap", 2, 5, 0, set_x11_keymap }, + { "list-x11-keymap-models", VERB_ANY, 1, 0, list_x11_keymaps }, + { "list-x11-keymap-layouts", VERB_ANY, 1, 0, list_x11_keymaps }, + { "list-x11-keymap-variants", VERB_ANY, 2, 0, list_x11_keymaps }, + { "list-x11-keymap-options", VERB_ANY, 1, 0, list_x11_keymaps }, + { "help", VERB_ANY, VERB_ANY, 0, verb_help }, /* Not documented, but supported since it is created. 
*/ + {} + }; + + return dispatch_verb(argc, argv, verbs, bus); +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + setlocale(LC_ALL, ""); + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, &bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + return localectl_main(bus, argc, argv); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/locale/localed-util.c b/src/locale/localed-util.c new file mode 100644 index 0000000..e4e57a0 --- /dev/null +++ b/src/locale/localed-util.c @@ -0,0 +1,1161 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "bus-polkit.h" +#include "copy.h" +#include "env-file-label.h" +#include "env-file.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "fileio.h" +#include "fs-util.h" +#include "kbd-util.h" +#include "localed-util.h" +#include "macro.h" +#include "mkdir-label.h" +#include "nulstr-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "xkbcommon-util.h" + +static bool startswith_comma(const char *s, const char *prefix) { + assert(s); + assert(prefix); + + s = startswith(s, prefix); + if (!s) + return false; + + return IN_SET(*s, ',', '\0'); +} + +static const char* systemd_kbd_model_map(void) { + const char* s; + + s = getenv("SYSTEMD_KBD_MODEL_MAP"); + if (s) + return s; + + return SYSTEMD_KBD_MODEL_MAP; +} + +static const char* systemd_language_fallback_map(void) { + const char* s; + + s = getenv("SYSTEMD_LANGUAGE_FALLBACK_MAP"); + if (s) + return s; + + return SYSTEMD_LANGUAGE_FALLBACK_MAP; +} + +void x11_context_clear(X11Context *xc) { + assert(xc); + + xc->layout = mfree(xc->layout); + xc->options = mfree(xc->options); + xc->model = mfree(xc->model); + xc->variant = 
mfree(xc->variant); +} + +void x11_context_replace(X11Context *dest, X11Context *src) { + assert(dest); + assert(src); + + x11_context_clear(dest); + *dest = TAKE_STRUCT(*src); +} + +bool x11_context_isempty(const X11Context *xc) { + assert(xc); + + return + isempty(xc->layout) && + isempty(xc->model) && + isempty(xc->variant) && + isempty(xc->options); +} + +void x11_context_empty_to_null(X11Context *xc) { + assert(xc); + + /* Do not call x11_context_clear() for the passed object. */ + + xc->layout = empty_to_null(xc->layout); + xc->model = empty_to_null(xc->model); + xc->variant = empty_to_null(xc->variant); + xc->options = empty_to_null(xc->options); +} + +bool x11_context_is_safe(const X11Context *xc) { + assert(xc); + + return + (!xc->layout || string_is_safe(xc->layout)) && + (!xc->model || string_is_safe(xc->model)) && + (!xc->variant || string_is_safe(xc->variant)) && + (!xc->options || string_is_safe(xc->options)); +} + +bool x11_context_equal(const X11Context *a, const X11Context *b) { + assert(a); + assert(b); + + return + streq_ptr(a->layout, b->layout) && + streq_ptr(a->model, b->model) && + streq_ptr(a->variant, b->variant) && + streq_ptr(a->options, b->options); +} + +int x11_context_copy(X11Context *dest, const X11Context *src) { + bool modified; + int r; + + assert(dest); + + if (dest == src) + return 0; + + if (!src) { + modified = !x11_context_isempty(dest); + x11_context_clear(dest); + return modified; + } + + r = free_and_strdup(&dest->layout, src->layout); + if (r < 0) + return r; + modified = r > 0; + + r = free_and_strdup(&dest->model, src->model); + if (r < 0) + return r; + modified = modified || r > 0; + + r = free_and_strdup(&dest->variant, src->variant); + if (r < 0) + return r; + modified = modified || r > 0; + + r = free_and_strdup(&dest->options, src->options); + if (r < 0) + return r; + modified = modified || r > 0; + + return modified; +} + +int x11_context_verify_and_warn(const X11Context *xc, int log_level, sd_bus_error *error) { 
+ int r; + + assert(xc); + + if (!x11_context_is_safe(xc)) { + if (error) + sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid X11 keyboard layout."); + return log_full_errno(log_level, SYNTHETIC_ERRNO(EINVAL), "Invalid X11 keyboard layout."); + } + + r = verify_xkb_rmlvo(xc->model, xc->layout, xc->variant, xc->options); + if (r == -EOPNOTSUPP) { + log_full_errno(MAX(log_level, LOG_NOTICE), r, + "Cannot verify if new keymap is correct, libxkbcommon.so unavailable."); + return 0; + } + if (r < 0) { + if (error) + sd_bus_error_set_errnof(error, r, "Specified keymap cannot be compiled, refusing as invalid."); + return log_full_errno(log_level, r, + "Cannot compile XKB keymap for x11 keyboard layout " + "(model='%s' / layout='%s' / variant='%s' / options='%s'): %m", + strempty(xc->model), strempty(xc->layout), strempty(xc->variant), strempty(xc->options)); + } + + return 0; +} + +void vc_context_clear(VCContext *vc) { + assert(vc); + + vc->keymap = mfree(vc->keymap); + vc->toggle = mfree(vc->toggle); +} + +void vc_context_replace(VCContext *dest, VCContext *src) { + assert(dest); + assert(src); + + vc_context_clear(dest); + *dest = TAKE_STRUCT(*src); +} + +bool vc_context_isempty(const VCContext *vc) { + assert(vc); + + return + isempty(vc->keymap) && + isempty(vc->toggle); +} + +void vc_context_empty_to_null(VCContext *vc) { + assert(vc); + + /* Do not call vc_context_clear() for the passed object. 
*/ + + vc->keymap = empty_to_null(vc->keymap); + vc->toggle = empty_to_null(vc->toggle); +} + +bool vc_context_equal(const VCContext *a, const VCContext *b) { + assert(a); + assert(b); + + return + streq_ptr(a->keymap, b->keymap) && + streq_ptr(a->toggle, b->toggle); +} + +int vc_context_copy(VCContext *dest, const VCContext *src) { + bool modified; + int r; + + assert(dest); + + if (dest == src) + return 0; + + if (!src) { + modified = !vc_context_isempty(dest); + vc_context_clear(dest); + return modified; + } + + r = free_and_strdup(&dest->keymap, src->keymap); + if (r < 0) + return r; + modified = r > 0; + + r = free_and_strdup(&dest->toggle, src->toggle); + if (r < 0) + return r; + modified = modified || r > 0; + + return modified; +} + +static int verify_keymap(const char *keymap, int log_level, sd_bus_error *error) { + int r; + + assert(keymap); + + r = keymap_exists(keymap); /* This also verifies that the keymap name is kosher. */ + if (r < 0) { + if (error) + sd_bus_error_set_errnof(error, r, "Failed to check keymap %s: %m", keymap); + return log_full_errno(log_level, r, "Failed to check keymap %s: %m", keymap); + } + if (r == 0) { + if (error) + sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Keymap %s is not installed.", keymap); + return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOENT), "Keymap %s is not installed.", keymap); + } + + return 0; +} + +int vc_context_verify_and_warn(const VCContext *vc, int log_level, sd_bus_error *error) { + int r; + + assert(vc); + + if (vc->keymap) { + r = verify_keymap(vc->keymap, log_level, error); + if (r < 0) + return r; + } + + if (vc->toggle) { + r = verify_keymap(vc->toggle, log_level, error); + if (r < 0) + return r; + } + + return 0; +} + +void context_clear(Context *c) { + assert(c); + + locale_context_clear(&c->locale_context); + x11_context_clear(&c->x11_from_xorg); + x11_context_clear(&c->x11_from_vc); + vc_context_clear(&c->vc); + + c->locale_cache = sd_bus_message_unref(c->locale_cache); + c->x11_cache = 
sd_bus_message_unref(c->x11_cache); + c->vc_cache = sd_bus_message_unref(c->vc_cache); + + c->polkit_registry = bus_verify_polkit_async_registry_free(c->polkit_registry); +}; + +X11Context *context_get_x11_context(Context *c) { + assert(c); + + if (!x11_context_isempty(&c->x11_from_vc)) + return &c->x11_from_vc; + + if (!x11_context_isempty(&c->x11_from_xorg)) + return &c->x11_from_xorg; + + return &c->x11_from_vc; +} + +int locale_read_data(Context *c, sd_bus_message *m) { + assert(c); + + /* Do not try to re-read the file within single bus operation. */ + if (m) { + if (m == c->locale_cache) + return 0; + + sd_bus_message_unref(c->locale_cache); + c->locale_cache = sd_bus_message_ref(m); + } + + return locale_context_load(&c->locale_context, LOCALE_LOAD_LOCALE_CONF | LOCALE_LOAD_ENVIRONMENT | LOCALE_LOAD_SIMPLIFY); +} + +int vconsole_read_data(Context *c, sd_bus_message *m) { + _cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + assert(c); + + /* Do not try to re-read the file within single bus operation. 
*/ + if (m) { + if (m == c->vc_cache) + return 0; + + sd_bus_message_unref(c->vc_cache); + c->vc_cache = sd_bus_message_ref(m); + } + + fd = RET_NERRNO(open("/etc/vconsole.conf", O_CLOEXEC | O_PATH)); + if (fd == -ENOENT) { + c->vc_stat = (struct stat) {}; + vc_context_clear(&c->vc); + x11_context_clear(&c->x11_from_vc); + return 0; + } + if (fd < 0) + return fd; + + if (fstat(fd, &st) < 0) + return -errno; + + /* If the file is not changed, then we do not need to re-read */ + if (stat_inode_unmodified(&c->vc_stat, &st)) + return 0; + + c->vc_stat = st; + vc_context_clear(&c->vc); + x11_context_clear(&c->x11_from_vc); + + r = parse_env_file_fd( + fd, "/etc/vconsole.conf", + "KEYMAP", &c->vc.keymap, + "KEYMAP_TOGGLE", &c->vc.toggle, + "XKBLAYOUT", &c->x11_from_vc.layout, + "XKBMODEL", &c->x11_from_vc.model, + "XKBVARIANT", &c->x11_from_vc.variant, + "XKBOPTIONS", &c->x11_from_vc.options); + if (r < 0) + return r; + + if (vc_context_verify(&c->vc) < 0) + vc_context_clear(&c->vc); + + if (x11_context_verify(&c->x11_from_vc) < 0) + x11_context_clear(&c->x11_from_vc); + + return 0; +} + +int x11_read_data(Context *c, sd_bus_message *m) { + _cleanup_close_ int fd = -EBADF; + _cleanup_fclose_ FILE *f = NULL; + bool in_section = false; + struct stat st; + int r; + + assert(c); + + /* Do not try to re-read the file within single bus operation. 
*/ + if (m) { + if (m == c->x11_cache) + return 0; + + sd_bus_message_unref(c->x11_cache); + c->x11_cache = sd_bus_message_ref(m); + } + + fd = RET_NERRNO(open("/etc/X11/xorg.conf.d/00-keyboard.conf", O_CLOEXEC | O_PATH)); + if (fd == -ENOENT) { + c->x11_stat = (struct stat) {}; + x11_context_clear(&c->x11_from_xorg); + return 0; + } + if (fd < 0) + return fd; + + if (fstat(fd, &st) < 0) + return -errno; + + /* If the file is not changed, then we do not need to re-read */ + if (stat_inode_unmodified(&c->x11_stat, &st)) + return 0; + + c->x11_stat = st; + x11_context_clear(&c->x11_from_xorg); + + r = fdopen_independent(fd, "re", &f); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + if (IN_SET(line[0], 0, '#')) + continue; + + if (in_section && first_word(line, "Option")) { + _cleanup_strv_free_ char **a = NULL; + + r = strv_split_full(&a, line, WHITESPACE, EXTRACT_UNQUOTE); + if (r < 0) + return r; + + if (strv_length(a) == 3) { + char **p = NULL; + + if (streq(a[1], "XkbLayout")) + p = &c->x11_from_xorg.layout; + else if (streq(a[1], "XkbModel")) + p = &c->x11_from_xorg.model; + else if (streq(a[1], "XkbVariant")) + p = &c->x11_from_xorg.variant; + else if (streq(a[1], "XkbOptions")) + p = &c->x11_from_xorg.options; + + if (p) + free_and_replace(*p, a[2]); + } + + } else if (!in_section && first_word(line, "Section")) { + _cleanup_strv_free_ char **a = NULL; + + r = strv_split_full(&a, line, WHITESPACE, EXTRACT_UNQUOTE); + if (r < 0) + return r; /* Propagate the actual error, as the "Option" branch above does; a split failure is not necessarily OOM (e.g. -EINVAL on unbalanced quotes). */ + + if (strv_length(a) == 2 && streq(a[1], "InputClass")) + in_section = true; + + } else if (in_section && first_word(line, "EndSection")) + in_section = false; + } + + if (x11_context_verify(&c->x11_from_xorg) < 0) + x11_context_clear(&c->x11_from_xorg); + + return 0; +} + +int vconsole_write_data(Context *c) { + _cleanup_strv_free_ char **l = NULL; + const X11Context *xc; + int r; + +
assert(c); + + xc = context_get_x11_context(c); + + r = load_env_file(NULL, "/etc/vconsole.conf", &l); + if (r < 0 && r != -ENOENT) + return r; + + r = strv_env_assign(&l, "KEYMAP", empty_to_null(c->vc.keymap)); + if (r < 0) + return r; + + r = strv_env_assign(&l, "KEYMAP_TOGGLE", empty_to_null(c->vc.toggle)); + if (r < 0) + return r; + + r = strv_env_assign(&l, "XKBLAYOUT", empty_to_null(xc->layout)); + if (r < 0) + return r; + + r = strv_env_assign(&l, "XKBMODEL", empty_to_null(xc->model)); + if (r < 0) + return r; + + r = strv_env_assign(&l, "XKBVARIANT", empty_to_null(xc->variant)); + if (r < 0) + return r; + + r = strv_env_assign(&l, "XKBOPTIONS", empty_to_null(xc->options)); + if (r < 0) + return r; + + if (strv_isempty(l)) { + if (unlink("/etc/vconsole.conf") < 0) + return errno == ENOENT ? 0 : -errno; + + c->vc_stat = (struct stat) {}; + return 0; + } + + r = write_vconsole_conf_label(l); + if (r < 0) + return r; + + if (stat("/etc/vconsole.conf", &c->vc_stat) < 0) + return -errno; + + return 0; +} + +int x11_write_data(Context *c) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(unlink_and_freep) char *temp_path = NULL; + const X11Context *xc; + int r; + + assert(c); + + xc = context_get_x11_context(c); + if (x11_context_isempty(xc)) { + if (unlink("/etc/X11/xorg.conf.d/00-keyboard.conf") < 0) + return errno == ENOENT ? 0 : -errno; + + c->x11_stat = (struct stat) {}; + return 0; + } + + (void) mkdir_p_label("/etc/X11/xorg.conf.d", 0755); + r = fopen_temporary("/etc/X11/xorg.conf.d/00-keyboard.conf", &f, &temp_path); + if (r < 0) + return r; + + (void) fchmod(fileno(f), 0644); + + fputs("# Written by systemd-localed(8), read by systemd-localed and Xorg. It's\n" + "# probably wise not to edit this file manually. 
Use localectl(1) to\n" + "# update this file.\n" + "Section \"InputClass\"\n" + " Identifier \"system-keyboard\"\n" + " MatchIsKeyboard \"on\"\n", f); + + if (!isempty(xc->layout)) + fprintf(f, " Option \"XkbLayout\" \"%s\"\n", xc->layout); + + if (!isempty(xc->model)) + fprintf(f, " Option \"XkbModel\" \"%s\"\n", xc->model); + + if (!isempty(xc->variant)) + fprintf(f, " Option \"XkbVariant\" \"%s\"\n", xc->variant); + + if (!isempty(xc->options)) + fprintf(f, " Option \"XkbOptions\" \"%s\"\n", xc->options); + + fputs("EndSection\n", f); + + r = fflush_sync_and_check(f); + if (r < 0) + return r; + + if (rename(temp_path, "/etc/X11/xorg.conf.d/00-keyboard.conf") < 0) + return -errno; + + if (stat("/etc/X11/xorg.conf.d/00-keyboard.conf", &c->x11_stat) < 0) + return -errno; + + return 0; +} + +static int read_next_mapping( + const char *filename, + unsigned min_fields, + unsigned max_fields, + FILE *f, + unsigned *n, + char ***ret) { + + assert(f); + assert(n); + assert(ret); + + for (;;) { + _cleanup_strv_free_ char **b = NULL; + _cleanup_free_ char *line = NULL; + size_t length; + int r; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + (*n)++; + + if (IN_SET(line[0], 0, '#')) + continue; + + r = strv_split_full(&b, line, WHITESPACE, EXTRACT_UNQUOTE); + if (r < 0) + return r; + + length = strv_length(b); + if (length < min_fields || length > max_fields) { + log_debug("Invalid line %s:%u, ignoring.", strna(filename), *n); + continue; + + } + + *ret = TAKE_PTR(b); + return 1; + } + + *ret = NULL; + return 0; +} + +int vconsole_convert_to_x11(const VCContext *vc, X11Context *ret) { + _cleanup_fclose_ FILE *f = NULL; + const char *map; + X11Context xc; + int r; + + assert(vc); + assert(ret); + + if (isempty(vc->keymap)) { + *ret = (X11Context) {}; + return 0; + } + + map = systemd_kbd_model_map(); + f = fopen(map, "re"); + if (!f) + return -errno; + + for (unsigned n = 0;;) { + _cleanup_strv_free_ char **a = NULL; + 
+ r = read_next_mapping(map, 5, UINT_MAX, f, &n, &a); + if (r < 0) + return r; + if (r == 0) + break; + + if (!streq(vc->keymap, a[0])) + continue; + + xc = (X11Context) { + .layout = empty_or_dash_to_null(a[1]), + .model = empty_or_dash_to_null(a[2]), + .variant = empty_or_dash_to_null(a[3]), + .options = empty_or_dash_to_null(a[4]), + }; + + if (x11_context_verify(&xc) < 0) + continue; + + return x11_context_copy(ret, &xc); + } + + /* No custom mapping has been found, see if the keymap is a converted one. In such case deducing the + * corresponding x11 layout is easy. */ + _cleanup_free_ char *xlayout = NULL, *converted = NULL; + char *xvariant; + + xlayout = strdup(vc->keymap); + if (!xlayout) + return -ENOMEM; + xvariant = strchr(xlayout, '-'); + if (xvariant) { + xvariant[0] = '\0'; + xvariant++; + } + + /* Note: by default we use keyboard model "microsoftpro" which should be equivalent to "pc105" but + * with the internet/media key mapping added. */ + xc = (X11Context) { + .layout = xlayout, + .model = (char*) "microsoftpro", + .variant = xvariant, + .options = (char*) "terminate:ctrl_alt_bksp", + }; + + /* This sanity check seems redundant with the verification of the X11 layout done on the next + * step. However xkbcommon is an optional dependency hence the verification might be a NOP. */ + r = find_converted_keymap(&xc, &converted); + if (r == 0 && xc.variant) { + /* If we still haven't found a match, try with no variant, it's still better than nothing.
*/ + xc.variant = NULL; + r = find_converted_keymap(&xc, &converted); + } + if (r < 0) + return r; + + if (r == 0 || x11_context_verify(&xc) < 0) { + *ret = (X11Context) {}; + return 0; + } + + return x11_context_copy(ret, &xc); +} + +int find_converted_keymap(const X11Context *xc, char **ret) { + _cleanup_free_ char *n = NULL; + + assert(xc); + assert(!isempty(xc->layout)); + assert(ret); + + if (xc->variant) + n = strjoin(xc->layout, "-", xc->variant); + else + n = strdup(xc->layout); + if (!n) + return -ENOMEM; + + NULSTR_FOREACH(dir, KBD_KEYMAP_DIRS) { + _cleanup_free_ char *p = NULL, *pz = NULL; + bool uncompressed; + + p = strjoin(dir, "xkb/", n, ".map"); + pz = strjoin(dir, "xkb/", n, ".map.gz"); + if (!p || !pz) + return -ENOMEM; + + uncompressed = access(p, F_OK) == 0; + if (uncompressed || access(pz, F_OK) == 0) { + log_debug("Found converted keymap %s at %s", n, uncompressed ? p : pz); + *ret = TAKE_PTR(n); + return 1; + } + } + + *ret = NULL; + return 0; +} + +int find_legacy_keymap(const X11Context *xc, char **ret) { + const char *map; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *new_keymap = NULL; + unsigned best_matching = 0; + int r; + + assert(xc); + assert(!isempty(xc->layout)); + + map = systemd_kbd_model_map(); + f = fopen(map, "re"); + if (!f) + return -errno; + + for (unsigned n = 0;;) { + _cleanup_strv_free_ char **a = NULL; + unsigned matching = 0; + + r = read_next_mapping(map, 5, UINT_MAX, f, &n, &a); + if (r < 0) + return r; + if (r == 0) + break; + + /* Determine how well matching this entry is */ + if (streq(xc->layout, a[1])) + /* If we got an exact match, this is the best */ + matching = 10; + else { + /* see if we get an exact match with the order reversed */ + _cleanup_strv_free_ char **b = NULL; + _cleanup_free_ char *c = NULL; + r = strv_split_full(&b, a[1], ",", 0); + if (r < 0) + return r; + strv_reverse(b); + c = strv_join(b, ","); + if (!c) + return log_oom(); + if (streq(xc->layout, c)) + matching = 9; + else { + 
/* We have multiple X layouts, look for an + * entry that matches our key with everything + * but the first layout stripped off. */ + if (startswith_comma(xc->layout, a[1])) + matching = 5; + else { + _cleanup_free_ char *x = NULL; + + /* If that didn't work, strip off the + * other layouts from the entry, too */ + x = strdupcspn(a[1], ","); + if (!x) + return -ENOMEM; + if (startswith_comma(xc->layout, x)) + matching = 1; + } + } + } + + if (matching > 0) { + if (isempty(xc->model) || streq_ptr(xc->model, a[2])) { + matching++; + + if (streq_ptr(xc->variant, a[3]) || ((isempty(xc->variant) || streq_skip_trailing_chars(xc->variant, "", ",")) && streq(a[3], "-"))) { + matching++; + + if (streq_ptr(xc->options, a[4])) + matching++; + } + } + } + + /* The best matching entry so far, then let's save that */ + if (matching >= MAX(best_matching, 1u)) { + log_debug("Found legacy keymap %s with score %u", a[0], matching); + + if (matching > best_matching) { + best_matching = matching; + + r = free_and_strdup(&new_keymap, a[0]); + if (r < 0) + return r; + } + } + } + + if (best_matching < 9 && !isempty(xc->layout)) { + _cleanup_free_ char *l = NULL, *v = NULL, *converted = NULL; + + /* The best match is only the first part of the X11 + * keymap. Check if we have a converted map which + * matches just the first layout. 
+ */ + + l = strdupcspn(xc->layout, ","); + if (!l) + return -ENOMEM; + + if (!isempty(xc->variant)) { + v = strdupcspn(xc->variant, ","); + if (!v) + return -ENOMEM; + } + + r = find_converted_keymap( + &(X11Context) { + .layout = l, + .variant = v, + }, + &converted); + if (r < 0) + return r; + if (r > 0) + free_and_replace(new_keymap, converted); + } + + *ret = TAKE_PTR(new_keymap); + return !!*ret; +} + +int x11_convert_to_vconsole(const X11Context *xc, VCContext *ret) { + _cleanup_free_ char *keymap = NULL; + int r; + + assert(xc); + assert(ret); + + if (isempty(xc->layout)) { + *ret = (VCContext) {}; + return 0; + } + + r = find_converted_keymap(xc, &keymap); + if (r == 0) { + r = find_legacy_keymap(xc, &keymap); + if (r == 0 && xc->variant) + /* If we still haven't found a match, try with no variant, it's still better than + * nothing. */ + r = find_converted_keymap( + &(X11Context) { + .layout = xc->layout, + }, + &keymap); + } + if (r < 0) + return r; + + *ret = (VCContext) { + .keymap = TAKE_PTR(keymap), + }; + return 0; +} + +int find_language_fallback(const char *lang, char **ret) { + const char *map; + _cleanup_fclose_ FILE *f = NULL; + unsigned n = 0; + int r; + + assert(lang); + assert(ret); + + map = systemd_language_fallback_map(); + f = fopen(map, "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_strv_free_ char **a = NULL; + + r = read_next_mapping(map, 2, 2, f, &n, &a); + if (r <= 0) + return r; + + if (streq(lang, a[0])) { + assert(strv_length(a) == 2); + *ret = TAKE_PTR(a[1]); + return 1; + } + } +} + +bool locale_gen_check_available(void) { +#if HAVE_LOCALEGEN + if (access(LOCALEGEN_PATH, X_OK) < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Unable to determine whether " LOCALEGEN_PATH " exists and is executable, assuming it is not: %m"); + return false; + } + if (access("/etc/locale.gen", F_OK) < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Unable to determine whether /etc/locale.gen exists, assuming it does
not: %m"); + return false; + } + return true; +#else + return false; +#endif +} + +#if HAVE_LOCALEGEN +static bool locale_encoding_is_utf8_or_unspecified(const char *locale) { + const char *c = strchr(locale, '.'); + return !c || strcaseeq(c, ".UTF-8") || strcasestr(locale, ".UTF-8@"); +} + +static int locale_gen_locale_supported(const char *locale_entry) { + /* Returns an error valus <= 0 if the locale-gen entry is invalid or unsupported, + * 1 in case the locale entry is valid, and -EOPNOTSUPP specifically in case + * the distributor has not provided us with a SUPPORTED file to check + * locale for validity. */ + + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(locale_entry); + + /* Locale templates without country code are never supported */ + if (!strstr(locale_entry, "_")) + return -EINVAL; + + f = fopen("/usr/share/i18n/SUPPORTED", "re"); + if (!f) { + if (errno == ENOENT) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Unable to check validity of locale entry %s: /usr/share/i18n/SUPPORTED does not exist", + locale_entry); + return -errno; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_debug_errno(r, "Failed to read /usr/share/i18n/SUPPORTED: %m"); + if (r == 0) + return 0; + + if (strcaseeq_ptr(line, locale_entry)) + return 1; + } +} +#endif + +int locale_gen_enable_locale(const char *locale) { +#if HAVE_LOCALEGEN + _cleanup_fclose_ FILE *fr = NULL, *fw = NULL; + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_free_ char *locale_entry = NULL; + bool locale_enabled = false, first_line = false; + bool write_new = false; + int r; + + if (isempty(locale)) + return 0; + + if (locale_encoding_is_utf8_or_unspecified(locale)) { + locale_entry = strjoin(locale, " UTF-8"); + if (!locale_entry) + return -ENOMEM; + } else + return -ENOEXEC; /* We do not process non-UTF-8 locale */ + + r = locale_gen_locale_supported(locale_entry); + if (r == 0) + return 
-EINVAL; + if (r < 0 && r != -EOPNOTSUPP) + return r; + + fr = fopen("/etc/locale.gen", "re"); + if (!fr) { + if (errno != ENOENT) + return -errno; + write_new = true; + } + + r = fopen_temporary("/etc/locale.gen", &fw, &temp_path); + if (r < 0) + return r; + + if (write_new) + (void) fchmod(fileno(fw), 0644); + else { + /* apply mode & xattrs of the original file to new file */ + r = copy_access(fileno(fr), fileno(fw)); + if (r < 0) + return r; + r = copy_xattr(fileno(fr), NULL, fileno(fw), NULL, COPY_ALL_XATTRS); + if (r < 0) + log_debug_errno(r, "Failed to copy all xattrs from old to new /etc/locale.gen file, ignoring: %m"); + } + + if (!write_new) { + /* The config file ends with a line break, which we do not want to include before potentially appending a new locale + * instead of uncommenting an existing line. By prepending linebreaks, we can avoid buffering this file but can still write + * a nice config file without empty lines */ + first_line = true; + for (;;) { + _cleanup_free_ char *line = NULL; + char *line_locale; + + r = read_line(fr, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + if (locale_enabled) { + /* Just complete writing the file if the new locale was already enabled */ + if (!first_line) + fputc('\n', fw); + fputs(line, fw); + first_line = false; + continue; + } + + line_locale = strstrip(line); + if (isempty(line_locale)) { + /* An empty line contributes no text, only the separator that precedes every line but the first. + * Writing the separator unconditionally would add one more blank line each time a file that starts + * with an empty line is rewritten. */ + if (!first_line) + fputc('\n', fw); + first_line = false; + continue; + } + + if (line_locale[0] == '#') + line_locale = strstrip(line_locale + 1); + else if (strcaseeq_ptr(line_locale, locale_entry)) + return 0; /* the file already had our locale activated, so skip updating it */ + + if (strcaseeq_ptr(line_locale, locale_entry)) { + /* Uncomment existing line for new locale */ + if (!first_line) + fputc('\n', fw); + fputs(locale_entry, fw); + locale_enabled = true; + first_line = false; + continue; + } + + /* The line was not for the locale we want to enable, just copy it */ + if (!first_line) + fputc('\n',
fw); + fputs(line, fw); + first_line = false; + } + } + + /* Add locale to enable to the end of the file if it was not found as commented line */ + if (!locale_enabled) { + /* first_line is still set only if the existing file had no lines at all; like a newly created file it + * needs no separator before the entry. */ + if (!write_new && !first_line) + fputc('\n', fw); + fputs(locale_entry, fw); + } + fputc('\n', fw); + + r = fflush_sync_and_check(fw); + if (r < 0) + return r; + + if (rename(temp_path, "/etc/locale.gen") < 0) + return -errno; + temp_path = mfree(temp_path); + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +/* Forks a child via safe_fork() that executes LOCALEGEN_PATH and exits with EXIT_FAILURE if the exec fails. + * Returns a negative errno if safe_fork() fails, 0 otherwise, or -EOPNOTSUPP when built without HAVE_LOCALEGEN. */ +int locale_gen_run(void) { +#if HAVE_LOCALEGEN + pid_t pid; + int r; + + r = safe_fork("(sd-localegen)", FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, &pid); + if (r < 0) + return r; + if (r == 0) { + execl(LOCALEGEN_PATH, LOCALEGEN_PATH, NULL); + _exit(EXIT_FAILURE); + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} diff --git a/src/locale/localed-util.h b/src/locale/localed-util.h new file mode 100644 index 0000000..0c68f29 --- /dev/null +++ b/src/locale/localed-util.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +#include "hashmap.h" +#include "locale-setup.h" + +typedef struct X11Context { + char *layout; + char *model; + char *variant; + char *options; +} X11Context; + +typedef struct VCContext { + char *keymap; + char *toggle; +} VCContext; + +typedef struct Context { + sd_bus_message *locale_cache; + LocaleContext locale_context; + + sd_bus_message *x11_cache; + struct stat x11_stat; + X11Context x11_from_xorg; + X11Context x11_from_vc; + + sd_bus_message *vc_cache; + struct stat vc_stat; + VCContext vc; + + Hashmap *polkit_registry; +} Context; + +void x11_context_clear(X11Context *xc); +void x11_context_replace(X11Context *dest, X11Context *src); +bool x11_context_isempty(const X11Context *xc); +void x11_context_empty_to_null(X11Context *xc); +bool x11_context_is_safe(const X11Context *xc); +bool x11_context_equal(const X11Context *a, const X11Context *b); +int
x11_context_copy(X11Context *dest, const X11Context *src); +int x11_context_verify_and_warn(const X11Context *xc, int log_level, sd_bus_error *error); +static inline int x11_context_verify(const X11Context *xc) { + return x11_context_verify_and_warn(xc, LOG_DEBUG, NULL); +} + +X11Context *context_get_x11_context(Context *c); + +void vc_context_clear(VCContext *vc); +void vc_context_replace(VCContext *dest, VCContext *src); +bool vc_context_isempty(const VCContext *vc); +void vc_context_empty_to_null(VCContext *vc); +bool vc_context_equal(const VCContext *a, const VCContext *b); +int vc_context_copy(VCContext *dest, const VCContext *src); +int vc_context_verify_and_warn(const VCContext *vc, int log_level, sd_bus_error *error); +static inline int vc_context_verify(const VCContext *vc) { + return vc_context_verify_and_warn(vc, LOG_DEBUG, NULL); +} + +int find_converted_keymap(const X11Context *xc, char **ret); +int find_legacy_keymap(const X11Context *xc, char **ret); +int find_language_fallback(const char *lang, char **ret); + +int locale_read_data(Context *c, sd_bus_message *m); +int vconsole_read_data(Context *c, sd_bus_message *m); +int x11_read_data(Context *c, sd_bus_message *m); + +void context_clear(Context *c); +int vconsole_convert_to_x11(const VCContext *vc, X11Context *ret); +int vconsole_write_data(Context *c); +int x11_convert_to_vconsole(const X11Context *xc, VCContext *ret); +int x11_write_data(Context *c); + +bool locale_gen_check_available(void); +int locale_gen_enable_locale(const char *locale); +int locale_gen_run(void); diff --git a/src/locale/localed.c b/src/locale/localed.c new file mode 100644 index 0000000..5d96237 --- /dev/null +++ b/src/locale/localed.c @@ -0,0 +1,680 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-log-control-api.h" +#include "bus-message.h" +#include 
"bus-polkit.h" +#include "bus-unit-util.h" +#include "constants.h" +#include "kbd-util.h" +#include "localed-util.h" +#include "macro.h" +#include "main-func.h" +#include "missing_capability.h" +#include "path-util.h" +#include "selinux-util.h" +#include "service-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static int vconsole_reload(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + + r = bus_call_method(bus, bus_systemd_mgr, "RestartUnit", &error, NULL, "ss", "systemd-vconsole-setup.service", "replace"); + if (r < 0) + return log_error_errno(r, "Failed to issue method call: %s", bus_error_message(&error, r)); + return 0; +} + +static int property_get_locale( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Context *c = ASSERT_PTR(userdata); + _cleanup_strv_free_ char **l = NULL; + int r; + + r = locale_read_data(c, reply); + if (r < 0) + return r; + + r = locale_context_build_env(&c->locale_context, &l, NULL); + if (r < 0) + return r; + + return sd_bus_message_append_strv(reply, l); +} + +static int property_get_vconsole( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Context *c = ASSERT_PTR(userdata); + int r; + + assert(property); + + r = vconsole_read_data(c, reply); + if (r < 0) + return r; + + if (streq(property, "VConsoleKeymap")) + return sd_bus_message_append_basic(reply, 's', c->vc.keymap); + if (streq(property, "VConsoleKeymapToggle")) + return sd_bus_message_append_basic(reply, 's', c->vc.toggle); + + return -EINVAL; +} + +static int property_get_xkb( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Context *c = 
ASSERT_PTR(userdata); + const X11Context *xc; + int r; + + assert(property); + + r = vconsole_read_data(c, reply); + if (r < 0) + return r; + + r = x11_read_data(c, reply); + if (r < 0) + return r; + + xc = context_get_x11_context(c); + + if (streq(property, "X11Layout")) + return sd_bus_message_append_basic(reply, 's', xc->layout); + if (streq(property, "X11Model")) + return sd_bus_message_append_basic(reply, 's', xc->model); + if (streq(property, "X11Variant")) + return sd_bus_message_append_basic(reply, 's', xc->variant); + if (streq(property, "X11Options")) + return sd_bus_message_append_basic(reply, 's', xc->options); + + return -EINVAL; +} + +static int process_locale_list_item( + const char *assignment, + char *new_locale[static _VARIABLE_LC_MAX], + bool use_localegen, + sd_bus_error *error) { + + assert(assignment); + assert(new_locale); + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) { + const char *name, *e; + + assert_se(name = locale_variable_to_string(p)); + + e = startswith(assignment, name); + if (!e) + continue; + + if (*e != '=') + continue; + + e++; + + if (!locale_is_valid(e)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Locale %s is not valid, refusing.", e); + if (!use_localegen && locale_is_installed(e) <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Locale %s not installed, refusing.", e); + if (new_locale[p]) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Locale variable %s set twice, refusing.", name); + + new_locale[p] = strdup(e); + if (!new_locale[p]) + return -ENOMEM; + + return 0; + } + + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Locale assignment %s not valid, refusing.", assignment); +} + +static int locale_gen_process_locale(char *new_locale[static _VARIABLE_LC_MAX], sd_bus_error *error) { + int r; + + assert(new_locale); + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) { + if (p == VARIABLE_LANGUAGE) + continue; + if (isempty(new_locale[p])) + 
continue; + if (locale_is_installed(new_locale[p])) + continue; + + r = locale_gen_enable_locale(new_locale[p]); + if (r == -ENOEXEC) { + log_error_errno(r, "Refused to enable locale for generation: %m"); + return sd_bus_error_setf(error, + SD_BUS_ERROR_INVALID_ARGS, + "Specified locale is not installed and non-UTF-8 locale will not be auto-generated: %s", + new_locale[p]); + } + if (r == -EINVAL) { + log_error_errno(r, "Failed to enable invalid locale %s for generation.", new_locale[p]); + return sd_bus_error_setf(error, + SD_BUS_ERROR_INVALID_ARGS, + "Cannot enable locale generation for invalid locale: %s", + new_locale[p]); + } + if (r < 0) { + log_error_errno(r, "Failed to enable locale for generation: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to enable locale generation: %m"); + } + + r = locale_gen_run(); + if (r < 0) { + log_error_errno(r, "Failed to generate locale: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to generate locale: %m"); + } + } + + return 0; +} + +static int method_set_locale(sd_bus_message *m, void *userdata, sd_bus_error *error) { + _cleanup_(locale_variables_freep) char *new_locale[_VARIABLE_LC_MAX] = {}; + _cleanup_strv_free_ char **l = NULL, **l_set = NULL, **l_unset = NULL; + Context *c = ASSERT_PTR(userdata); + int interactive, r; + bool use_localegen; + + assert(m); + + r = sd_bus_message_read_strv(m, &l); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_basic(m, 'b', &interactive); + if (r < 0) + return bus_log_parse_error(r); + + use_localegen = locale_gen_check_available(); + + /* If single locale without variable name is provided, then we assume it is LANG=. 
*/ + if (strv_length(l) == 1 && !strchr(l[0], '=')) { + if (!locale_is_valid(l[0])) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid locale specification: %s", l[0]); + if (!use_localegen && locale_is_installed(l[0]) <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Specified locale is not installed: %s", l[0]); + + new_locale[VARIABLE_LANG] = strdup(l[0]); + if (!new_locale[VARIABLE_LANG]) + return log_oom(); + + l = strv_free(l); + } + + /* Check whether a variable is valid */ + STRV_FOREACH(i, l) { + r = process_locale_list_item(*i, new_locale, use_localegen, error); + if (r < 0) + return r; + } + + /* If LANG was specified, but not LANGUAGE, check if we should + * set it based on the language fallback table. */ + if (!isempty(new_locale[VARIABLE_LANG]) && + isempty(new_locale[VARIABLE_LANGUAGE])) { + _cleanup_free_ char *language = NULL; + + (void) find_language_fallback(new_locale[VARIABLE_LANG], &language); + if (language) { + log_debug("Converted LANG=%s to LANGUAGE=%s", new_locale[VARIABLE_LANG], language); + free_and_replace(new_locale[VARIABLE_LANGUAGE], language); + } + } + + r = locale_read_data(c, m); + if (r < 0) { + log_error_errno(r, "Failed to read locale data: %m"); + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Failed to read locale data"); + } + + /* Merge with the current settings */ + r = locale_context_merge(&c->locale_context, new_locale); + if (r < 0) + return log_oom(); + + locale_variables_simplify(new_locale); + + if (locale_context_equal(&c->locale_context, new_locale)) { + log_debug("Locale settings were not modified."); + return sd_bus_reply_method_return(m, NULL); + } + + r = bus_verify_polkit_async( + m, + CAP_SYS_ADMIN, + "org.freedesktop.locale1.set-locale", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + /* Generate 
locale in case it is missing and the system is using locale-gen */ + if (use_localegen) { + r = locale_gen_process_locale(new_locale, error); + if (r < 0) + return r; + } + + locale_context_take(&c->locale_context, new_locale); + + /* Write locale configuration */ + r = locale_context_save(&c->locale_context, &l_set, &l_unset); + if (r < 0) { + log_error_errno(r, "Failed to set locale: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to set locale: %m"); + } + + /* Since we just updated the locale configuration file, ask the system manager to read it again to + * update its default locale settings. It's important to not use UnsetAndSetEnvironment or a similar + * method because in this case unsetting variables means restoring them to PID1 default values, which + * may be outdated, since locale.conf has just changed and PID1 hasn't read it */ + (void) bus_service_manager_reload(sd_bus_message_get_bus(m)); + + if (!strv_isempty(l_set)) { + _cleanup_free_ char *line = NULL; + + line = strv_join(l_set, ", "); + log_info("Changed locale to %s.", strnull(line)); + } else + log_info("Changed locale to unset."); + + (void) sd_bus_emit_properties_changed( + sd_bus_message_get_bus(m), + "/org/freedesktop/locale1", + "org.freedesktop.locale1", + "Locale", NULL); + + return sd_bus_reply_method_return(m, NULL); +} + +static int method_set_vc_keyboard(sd_bus_message *m, void *userdata, sd_bus_error *error) { + _cleanup_(x11_context_clear) X11Context converted = {}; + Context *c = ASSERT_PTR(userdata); + int convert, interactive, r; + bool x_needs_update; + VCContext in; + + assert(m); + + r = sd_bus_message_read(m, "ssbb", &in.keymap, &in.toggle, &convert, &interactive); + if (r < 0) + return bus_log_parse_error(r); + + vc_context_empty_to_null(&in); + + r = vc_context_verify_and_warn(&in, LOG_ERR, error); + if (r < 0) + return r; + + r = vconsole_read_data(c, m); + if (r < 0) { + log_error_errno(r, "Failed to read virtual console keymap data: %m"); + return 
sd_bus_error_set_errnof(error, r, "Failed to read virtual console keymap data: %m"); + } + + r = x11_read_data(c, m); + if (r < 0) { + log_error_errno(r, "Failed to read X11 keyboard layout data: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to read X11 keyboard layout data: %m"); + } + + if (convert) { + r = vconsole_convert_to_x11(&in, &converted); + if (r < 0) { + log_error_errno(r, "Failed to convert keymap data: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to convert keymap data: %m"); + } + + if (x11_context_isempty(&converted)) + log_notice("No conversion found for virtual console keymap \"%s\".", strempty(in.keymap)); + else + log_info("The virtual console keymap '%s' is converted to X11 keyboard layout '%s' model '%s' variant '%s' options '%s'", + in.keymap, strempty(converted.layout), strempty(converted.model), strempty(converted.variant), strempty(converted.options)); + + /* save the result of conversion to emit changed properties later. */ + x_needs_update = !x11_context_equal(&c->x11_from_vc, &converted) || !x11_context_equal(&c->x11_from_xorg, &converted); + } else + x_needs_update = !x11_context_equal(&c->x11_from_vc, &c->x11_from_xorg); + + if (vc_context_equal(&c->vc, &in) && !x_needs_update) + return sd_bus_reply_method_return(m, NULL); + + r = bus_verify_polkit_async( + m, + CAP_SYS_ADMIN, + "org.freedesktop.locale1.set-keyboard", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = vc_context_copy(&c->vc, &in); + if (r < 0) + return log_oom(); + + if (x_needs_update) { + if (convert) { + r = x11_context_copy(&c->x11_from_vc, &converted); + if (r < 0) + return log_oom(); + x11_context_replace(&c->x11_from_xorg, &converted); + } else { + const X11Context *xc = context_get_x11_context(c); + + /* Even if the conversion is not requested, sync the two X11 
contexts. */ + r = x11_context_copy(&c->x11_from_vc, xc); + if (r < 0) + return log_oom(); + + r = x11_context_copy(&c->x11_from_xorg, xc); + if (r < 0) + return log_oom(); + } + } + + r = vconsole_write_data(c); + if (r < 0) + log_warning_errno(r, "Failed to write virtual console keymap, ignoring: %m"); + + if (x_needs_update) { + r = x11_write_data(c); + if (r < 0) + log_warning_errno(r, "Failed to write X11 keyboard layout, ignoring: %m"); + } + + log_info("Changed virtual console keymap to '%s' toggle '%s'", + strempty(c->vc.keymap), strempty(c->vc.toggle)); + + (void) vconsole_reload(sd_bus_message_get_bus(m)); + + (void) sd_bus_emit_properties_changed( + sd_bus_message_get_bus(m), + "/org/freedesktop/locale1", + "org.freedesktop.locale1", + "VConsoleKeymap", "VConsoleKeymapToggle", + x_needs_update ? "X11Layout" : NULL, + x_needs_update ? "X11Model" : NULL, + x_needs_update ? "X11Variant" : NULL, + x_needs_update ? "X11Options" : NULL, + NULL); + + return sd_bus_reply_method_return(m, NULL); +} + +static int method_set_x11_keyboard(sd_bus_message *m, void *userdata, sd_bus_error *error) { + _cleanup_(vc_context_clear) VCContext converted = {}; + Context *c = ASSERT_PTR(userdata); + int convert, interactive, r; + X11Context in; + + assert(m); + + r = sd_bus_message_read(m, "ssssbb", &in.layout, &in.model, &in.variant, &in.options, &convert, &interactive); + if (r < 0) + return bus_log_parse_error(r); + + x11_context_empty_to_null(&in); + + r = x11_context_verify_and_warn(&in, LOG_ERR, error); + if (r < 0) + return r; + + r = vconsole_read_data(c, m); + if (r < 0) { + log_error_errno(r, "Failed to read virtual console keymap data: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to read virtual console keymap data: %m"); + } + + r = x11_read_data(c, m); + if (r < 0) { + log_error_errno(r, "Failed to read x11 keyboard layout data: %m"); + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Failed to read x11 keyboard layout data"); + } + + if (convert) 
{ + r = x11_convert_to_vconsole(&in, &converted); + if (r < 0) { + log_error_errno(r, "Failed to convert keymap data: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to convert keymap data: %m"); + } + + if (vc_context_isempty(&converted)) + /* We search for layout-variant match first, but then we also look + * for anything which matches just the layout. So it's accurate to say + * that we couldn't find anything which matches the layout. */ + log_notice("No conversion to virtual console map found for \"%s\".", strempty(in.layout)); + else + log_info("The X11 keyboard layout '%s' is converted to virtual console keymap '%s'", + in.layout, converted.keymap); + + /* save the result of conversion to emit changed properties later. */ + convert = !vc_context_equal(&c->vc, &converted); + } + + if (x11_context_equal(&c->x11_from_vc, &in) && x11_context_equal(&c->x11_from_xorg, &in) && !convert) + return sd_bus_reply_method_return(m, NULL); + + r = bus_verify_polkit_async( + m, + CAP_SYS_ADMIN, + "org.freedesktop.locale1.set-keyboard", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = x11_context_copy(&c->x11_from_vc, &in); + if (r < 0) + return log_oom(); + + r = x11_context_copy(&c->x11_from_xorg, &in); + if (r < 0) + return log_oom(); + + if (convert) + vc_context_replace(&c->vc, &converted); + + r = vconsole_write_data(c); + if (r < 0) + log_warning_errno(r, "Failed to update vconsole.conf, ignoring: %m"); + + r = x11_write_data(c); + if (r < 0) + log_warning_errno(r, "Failed to write X11 keyboard layout, ignoring: %m"); + + log_info("Changed X11 keyboard layout to '%s' model '%s' variant '%s' options '%s'", + strempty(in.layout), + strempty(in.model), + strempty(in.variant), + strempty(in.options)); + + (void) sd_bus_emit_properties_changed( + sd_bus_message_get_bus(m), + 
"/org/freedesktop/locale1", + "org.freedesktop.locale1", + "X11Layout", "X11Model", "X11Variant", "X11Options", + convert ? "VConsoleKeymap" : NULL, + convert ? "VConsoleKeymapToggle" : NULL, + NULL); + + if (convert) + (void) vconsole_reload(sd_bus_message_get_bus(m)); + + return sd_bus_reply_method_return(m, NULL); +} + +static const sd_bus_vtable locale_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Locale", "as", property_get_locale, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("X11Layout", "s", property_get_xkb, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("X11Model", "s", property_get_xkb, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("X11Variant", "s", property_get_xkb, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("X11Options", "s", property_get_xkb, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("VConsoleKeymap", "s", property_get_vconsole, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("VConsoleKeymapToggle", "s", property_get_vconsole, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_METHOD_WITH_ARGS("SetLocale", + SD_BUS_ARGS("as", locale, "b", interactive), + SD_BUS_NO_RESULT, + method_set_locale, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetVConsoleKeyboard", + SD_BUS_ARGS("s", keymap, "s", keymap_toggle, "b", convert, "b", interactive), + SD_BUS_NO_RESULT, + method_set_vc_keyboard, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetX11Keyboard", + SD_BUS_ARGS("s", layout, "s", model, "s", variant, "s", options, "b", convert, "b", interactive), + SD_BUS_NO_RESULT, + method_set_x11_keyboard, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +static const BusObjectImplementation manager_object = { + "/org/freedesktop/locale1", + "org.freedesktop.locale1", + .vtables = BUS_VTABLES(locale_vtable), +}; + +static int connect_bus(Context *c, sd_event *event, sd_bus **_bus) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; 
+ int r; + + assert(c); + assert(event); + assert(_bus); + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to get system bus connection: %m"); + + r = bus_add_implementation(bus, &manager_object, c); + if (r < 0) + return r; + + r = bus_log_control_api_register(bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.locale1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + *_bus = TAKE_PTR(bus); + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(context_clear) Context context = {}; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + log_setup(); + + r = service_parse_argv("systemd-localed.service", + "Manage system locale settings and key mappings.", + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object), + argc, argv); + if (r <= 0) + return r; + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + (void) sd_event_set_watchdog(event, true); + + r = sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to install SIGINT handler: %m"); + + r = sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to install SIGTERM handler: %m"); + + r = connect_bus(&context, event, &bus); + if (r < 0) + return r; + + r = bus_event_loop_with_idle(event, bus, "org.freedesktop.locale1", DEFAULT_EXIT_USEC, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git 
a/src/locale/meson.build b/src/locale/meson.build new file mode 100644 index 0000000..3d3aa58 --- /dev/null +++ b/src/locale/meson.build @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_localed_sources = files( + 'localed-util.c', + 'localed.c', + 'xkbcommon-util.c', +) + +localectl_sources = files('localectl.c') + +# localed will load libxkbcommon.so dynamically on its own, but we still need to +# specify where the headers are. +if conf.get('HAVE_XKBCOMMON') == 1 + libxkbcommon_deps = [ + libdl, + libxkbcommon.partial_dependency(compile_args: true), + ] +else + libxkbcommon_deps = [] +endif + +executables += [ + libexec_template + { + 'name' : 'systemd-localed', + 'dbus' : true, + 'conditions' : ['ENABLE_LOCALED'], + 'sources' : systemd_localed_sources, + 'dependencies' : libxkbcommon_deps, + }, + executable_template + { + 'name' : 'localectl', + 'public' : true, + 'conditions' : ['ENABLE_LOCALED'], + 'sources' : files('localectl.c'), + }, + test_template + { + 'sources' : files( + 'test-localed-util.c', + 'localed-util.c', + 'xkbcommon-util.c', + ), + 'dependencies' : libxkbcommon_deps, + }, +] + +# If you know a way that allows the same variables to be used +# in sources list and concatenated to a string for test_env, +# let me know.
+kbd_model_map = meson.current_source_dir() / 'kbd-model-map' +language_fallback_map = meson.current_source_dir() / 'language-fallback-map' + +if conf.get('ENABLE_LOCALED') == 1 + install_data('org.freedesktop.locale1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.locale1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.locale1.policy', + install_dir : polkitpolicydir) + install_data('kbd-model-map', + 'language-fallback-map', + install_dir : pkgdatadir) +endif diff --git a/src/locale/org.freedesktop.locale1.conf b/src/locale/org.freedesktop.locale1.conf new file mode 100644 index 0000000..04813b2 --- /dev/null +++ b/src/locale/org.freedesktop.locale1.conf @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/locale/org.freedesktop.locale1.policy b/src/locale/org.freedesktop.locale1.policy new file mode 100644 index 0000000..553e31d --- /dev/null +++ b/src/locale/org.freedesktop.locale1.policy @@ -0,0 +1,42 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Set system locale + Authentication is required to set the system locale. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.locale1.set-keyboard + + + + Set system keyboard settings + Authentication is required to set the system keyboard settings. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + diff --git a/src/locale/org.freedesktop.locale1.service b/src/locale/org.freedesktop.locale1.service new file mode 100644 index 0000000..2d812cb --- /dev/null +++ b/src/locale/org.freedesktop.locale1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[D-BUS Service] +Name=org.freedesktop.locale1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.locale1.service diff --git a/src/locale/test-localed-util.c b/src/locale/test-localed-util.c new file mode 100644 index 0000000..e92c178 --- /dev/null +++ b/src/locale/test-localed-util.c @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "localed-util.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" + +TEST(find_language_fallback) { + _cleanup_free_ char *ans = NULL, *ans2 = NULL; + + assert_se(find_language_fallback("foobar", &ans) == 0); + assert_se(ans == NULL); + + assert_se(find_language_fallback("csb", &ans) == 0); + assert_se(ans == NULL); + + assert_se(find_language_fallback("csb_PL", &ans) == 1); + assert_se(streq(ans, "csb:pl")); + + assert_se(find_language_fallback("szl_PL", &ans2) == 1); + assert_se(streq(ans2, "szl:pl")); +} + +TEST(find_converted_keymap) { + _cleanup_free_ char *ans = NULL, *ans2 = NULL; + int r; + + assert_se(find_converted_keymap( + &(X11Context) { + .layout = (char*) "pl", + .variant = (char*) "foobar", + }, &ans) == 0); + assert_se(ans == NULL); + + r = find_converted_keymap( + &(X11Context) { + .layout = (char*) "pl", + }, &ans); + if (r == 0) { + log_info("Skipping rest of %s: keymaps are not installed", __func__); + return; + } + + assert_se(r == 1); + assert_se(streq(ans, "pl")); + ans = mfree(ans); + + assert_se(find_converted_keymap( + &(X11Context) { + .layout = (char*) "pl", + .variant = (char*) "dvorak", + }, &ans2) == 1); + assert_se(streq(ans2, "pl-dvorak")); +} + +TEST(find_legacy_keymap) { + X11Context xc = {}; + _cleanup_free_ char *ans = NULL, *ans2 = NULL; + + xc.layout = (char*) "foobar"; + assert_se(find_legacy_keymap(&xc, &ans) == 0); + assert_se(ans == NULL); + + xc.layout = (char*) "pl"; + assert_se(find_legacy_keymap(&xc, &ans) == 1); + assert_se(streq(ans, "pl2")); + + xc.layout = (char*) "pl,ru"; + 
assert_se(find_legacy_keymap(&xc, &ans2) == 1); + assert_se(!isempty(ans2)); /* check the result of this "pl,ru" lookup rather than re-checking the earlier "pl" one; NOTE(review): the exact keymap presumably depends on whether converted keymaps are installed, so it is not pinned here - confirm */ +} +/* Converts a series of console keymaps with vconsole_convert_to_x11() and checks the resulting X11 layout and variant. */ +TEST(vconsole_convert_to_x11) { + _cleanup_(x11_context_clear) X11Context xc = {}; + _cleanup_(vc_context_clear) VCContext vc = {}; + int r; + + log_info("/* test empty keymap */"); + assert_se(vconsole_convert_to_x11(&vc, &xc) >= 0); + assert_se(x11_context_isempty(&xc)); + + log_info("/* test without variant, new mapping (es:) */"); + assert_se(free_and_strdup(&vc.keymap, "es") >= 0); + assert_se(vconsole_convert_to_x11(&vc, &xc) >= 0); + assert_se(streq(xc.layout, "es")); + assert_se(xc.variant == NULL); + x11_context_clear(&xc); + + log_info("/* test with known variant, new mapping (es:dvorak) */"); + assert_se(free_and_strdup(&vc.keymap, "es-dvorak") >= 0); + assert_se(vconsole_convert_to_x11(&vc, &xc) >= 0); + assert_se(streq(xc.layout, "es")); + assert_se(streq(xc.variant, "dvorak")); + x11_context_clear(&xc); + + log_info("/* test with old mapping (fr:latin9) */"); + assert_se(free_and_strdup(&vc.keymap, "fr-latin9") >= 0); + assert_se(vconsole_convert_to_x11(&vc, &xc) >= 0); + assert_se(streq(xc.layout, "fr")); + assert_se(streq(xc.variant, "latin9")); + x11_context_clear(&xc); + + log_info("/* test with a compound mapping (ru,us) */"); + assert_se(free_and_strdup(&vc.keymap, "ru") >= 0); + assert_se(vconsole_convert_to_x11(&vc, &xc) >= 0); + assert_se(streq(xc.layout, "ru,us")); + assert_se(xc.variant == NULL); + x11_context_clear(&xc); + + log_info("/* test with a simple mapping (us) */"); + assert_se(free_and_strdup(&vc.keymap, "us") >= 0); + assert_se(vconsole_convert_to_x11(&vc, &xc) >= 0); + assert_se(streq(xc.layout, "us")); + assert_se(xc.variant == NULL); + x11_context_clear(&xc); + + /* "gh" has no mapping in kbd-model-map and kbd provides a converted keymap for this layout. 
*/ + log_info("/* test with a converted keymap (gh:) */"); + assert_se(free_and_strdup(&vc.keymap, "gh") >= 0); + r = vconsole_convert_to_x11(&vc, &xc); + if (r == 0) { + log_info("Skipping rest of %s: keymaps are not installed", __func__); + return; + } + assert_se(r > 0); + assert_se(streq(xc.layout, "gh")); + assert_se(xc.variant == NULL); + x11_context_clear(&xc); + + log_info("/* test with converted keymap and with a known variant (gh:ewe) */"); + assert_se(free_and_strdup(&vc.keymap, "gh-ewe") >= 0); + assert_se(vconsole_convert_to_x11(&vc, &xc) > 0); + assert_se(streq(xc.layout, "gh")); + assert_se(streq(xc.variant, "ewe")); + x11_context_clear(&xc); + + log_info("/* test with converted keymap and with an unknown variant (gh:foobar) */"); + assert_se(free_and_strdup(&vc.keymap, "gh-foobar") > 0); /* the unknown variant is expected to be dropped, leaving only the layout */ + assert_se(vconsole_convert_to_x11(&vc, &xc) > 0); + assert_se(streq(xc.layout, "gh")); + assert_se(xc.variant == NULL); + x11_context_clear(&xc); +} +/* Converts a series of X11 layout/variant settings with x11_convert_to_vconsole() and checks the resulting console keymap. */ +TEST(x11_convert_to_vconsole) { + _cleanup_(x11_context_clear) X11Context xc = {}; + _cleanup_(vc_context_clear) VCContext vc = {}; + + log_info("/* test empty layout (:) */"); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(vc_context_isempty(&vc)); + + log_info("/* test without variant, new mapping (es:) */"); + assert_se(free_and_strdup(&xc.layout, "es") >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "es")); + vc_context_clear(&vc); + + log_info("/* test with unknown variant, new mapping (es:foobar) */"); + assert_se(free_and_strdup(&xc.variant, "foobar") >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "es")); + vc_context_clear(&vc); + + log_info("/* test with known variant, new mapping (es:dvorak) */"); + assert_se(free_and_strdup(&xc.variant, "dvorak") >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + if (vc_context_isempty(&vc)) { + log_info("Skipping rest of %s: keymaps are not installed", __func__); 
+ return; + } + assert_se(streq(vc.keymap, "es-dvorak")); + vc_context_clear(&vc); + + /* es no-variant test is not very good as the desired match + comes first in the list so will win if both candidates score + the same. in this case the desired match comes second so will + not win unless we correctly give the no-variant match a bonus + */ + log_info("/* test without variant, desired match second (bg,us:) */"); + assert_se(free_and_strdup(&xc.layout, "bg,us") >= 0); + assert_se(free_and_strdup(&xc.variant, NULL) >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "bg_bds-utf8")); + vc_context_clear(&vc); + + /* same, but with variant specified as "," */ + log_info("/* test with variant as ',', desired match second (bg,us:) */"); + assert_se(free_and_strdup(&xc.variant, ",") >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "bg_bds-utf8")); + vc_context_clear(&vc); + + log_info("/* test with old mapping (fr:latin9) */"); + assert_se(free_and_strdup(&xc.layout, "fr") >= 0); + assert_se(free_and_strdup(&xc.variant, "latin9") >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "fr-latin9")); + vc_context_clear(&vc); + + /* https://bugzilla.redhat.com/show_bug.cgi?id=1039185 */ + /* us,ru is the x config users want, but they still want ru + as the console layout in this case */ + log_info("/* test with a compound mapping (us,ru:) */"); + assert_se(free_and_strdup(&xc.layout, "us,ru") >= 0); + assert_se(free_and_strdup(&xc.variant, NULL) >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "ru")); + vc_context_clear(&vc); + + log_info("/* test with a compound mapping (ru,us:) */"); + assert_se(free_and_strdup(&xc.layout, "ru,us") >= 0); + assert_se(free_and_strdup(&xc.variant, NULL) >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "ru")); + vc_context_clear(&vc); + + /* 
https://bugzilla.redhat.com/show_bug.cgi?id=1333998 */ + log_info("/* test with a simple new mapping (ru:) */"); + assert_se(free_and_strdup(&xc.layout, "ru") >= 0); + assert_se(free_and_strdup(&xc.variant, NULL) >= 0); + assert_se(x11_convert_to_vconsole(&xc, &vc) >= 0); + assert_se(streq(vc.keymap, "ru")); +} + +static int intro(void) { + _cleanup_free_ char *map = NULL; + + assert_se(get_testdata_dir("test-keymap-util/kbd-model-map", &map) >= 0); + assert_se(setenv("SYSTEMD_KBD_MODEL_MAP", map, 1) == 0); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/locale/xkbcommon-util.c b/src/locale/xkbcommon-util.c new file mode 100644 index 0000000..295ac8a --- /dev/null +++ b/src/locale/xkbcommon-util.c @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dlfcn-util.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "xkbcommon-util.h" + +#if HAVE_XKBCOMMON +static void *xkbcommon_dl = NULL; + +struct xkb_context* (*sym_xkb_context_new)(enum xkb_context_flags flags); +void (*sym_xkb_context_unref)(struct xkb_context *context); +void (*sym_xkb_context_set_log_fn)( + struct xkb_context *context, + void (*log_fn)( + struct xkb_context *context, + enum xkb_log_level level, + const char *format, + va_list args)); +struct xkb_keymap* (*sym_xkb_keymap_new_from_names)( + struct xkb_context *context, + const struct xkb_rule_names *names, + enum xkb_keymap_compile_flags flags); +void (*sym_xkb_keymap_unref)(struct xkb_keymap *keymap); + +static int dlopen_xkbcommon(void) { + return dlopen_many_sym_or_warn( + &xkbcommon_dl, "libxkbcommon.so.0", LOG_DEBUG, + DLSYM_ARG(xkb_context_new), + DLSYM_ARG(xkb_context_unref), + DLSYM_ARG(xkb_context_set_log_fn), + DLSYM_ARG(xkb_keymap_new_from_names), + DLSYM_ARG(xkb_keymap_unref)); +} + +_printf_(3, 0) +static void log_xkb(struct xkb_context *ctx, enum xkb_log_level lvl, const char *format, va_list args) { + const char *fmt; + + fmt = 
strjoina("libxkbcommon: ", format); + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(LOG_DEBUG, 0, PROJECT_FILE, __LINE__, __func__, fmt, args); + REENABLE_WARNING; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct xkb_context *, sym_xkb_context_unref, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct xkb_keymap *, sym_xkb_keymap_unref, NULL); +/* Checks an X11 keyboard configuration (model, layout, variant, options) by compiling a keymap from it with libxkbcommon. Returns 0 if the keymap compiles, the dlopen_xkbcommon() error if the library cannot be loaded, -ENOMEM if no context could be created and -EINVAL if compilation fails. */ +int verify_xkb_rmlvo(const char *model, const char *layout, const char *variant, const char *options) { + _cleanup_(sym_xkb_context_unrefp) struct xkb_context *ctx = NULL; + _cleanup_(sym_xkb_keymap_unrefp) struct xkb_keymap *km = NULL; + const struct xkb_rule_names rmlvo = { + .model = model, + .layout = layout, + .variant = variant, + .options = options, + }; + int r; + + /* Compile keymap from RMLVO information to check out its validity */ + + r = dlopen_xkbcommon(); + if (r < 0) + return r; + + ctx = sym_xkb_context_new(XKB_CONTEXT_NO_ENVIRONMENT_NAMES); + if (!ctx) + return -ENOMEM; + + sym_xkb_context_set_log_fn(ctx, log_xkb); /* route libxkbcommon's own messages through log_xkb() */ + + km = sym_xkb_keymap_new_from_names(ctx, &rmlvo, XKB_KEYMAP_COMPILE_NO_FLAGS); + if (!km) + return -EINVAL; + + return 0; +} + +#endif diff --git a/src/locale/xkbcommon-util.h b/src/locale/xkbcommon-util.h new file mode 100644 index 0000000..e99c2d7 --- /dev/null +++ b/src/locale/xkbcommon-util.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_XKBCOMMON +#include +/* Function pointers into libxkbcommon; they are defined in xkbcommon-util.c, where dlopen_xkbcommon() hands them to dlopen_many_sym_or_warn() via DLSYM_ARG(). */ +extern struct xkb_context* (*sym_xkb_context_new)(enum xkb_context_flags flags); +extern void (*sym_xkb_context_unref)(struct xkb_context *context); +extern void (*sym_xkb_context_set_log_fn)( + struct xkb_context *context, + void (*log_fn)( + struct xkb_context *context, + enum xkb_log_level level, + const char *format, + va_list args)); +extern struct xkb_keymap* (*sym_xkb_keymap_new_from_names)( + struct xkb_context *context, + const struct xkb_rule_names *names, + enum xkb_keymap_compile_flags flags); +extern void (*sym_xkb_keymap_unref)(struct xkb_keymap *keymap); + +int 
verify_xkb_rmlvo(const char *model, const char *layout, const char *variant, const char *options); + +#else +/* Built without HAVE_XKBCOMMON: nothing can be verified, so every configuration is reported as valid (0). */ +static inline int verify_xkb_rmlvo(const char *model, const char *layout, const char *variant, const char *options) { + return 0; +} + +#endif diff --git a/src/login/inhibit.c b/src/login/inhibit.c new file mode 100644 index 0000000..ad73c4b --- /dev/null +++ b/src/login/inhibit.c @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "fd-util.h" +#include "format-table.h" +#include "format-util.h" +#include "main-func.h" +#include "pager.h" +#include "pretty-print.h" +#include "process-util.h" +#include "signal-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "user-util.h" +/* Command line settings; arg_who and arg_mode stay NULL until run() fills in defaults for the inhibit action. */ +static const char* arg_what = "idle:sleep:shutdown"; +static const char* arg_who = NULL; +static const char* arg_why = "Unknown reason"; +static const char* arg_mode = NULL; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; + +static enum { + ACTION_INHIBIT, + ACTION_LIST +} arg_action = ACTION_INHIBIT; +/* Calls the login manager's Inhibit method with the configured what/who/why/mode strings and returns a duplicate of the file descriptor read from the reply (close-on-exec, numbered 3 or higher), or a negative error from the bus calls; a failing fcntl() is mapped through RET_NERRNO(). */ +static int inhibit(sd_bus *bus, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + int fd; + + r = bus_call_method(bus, bus_login_mgr, "Inhibit", error, &reply, "ssss", arg_what, arg_who, arg_why, arg_mode); + if (r < 0) + return r; + + r = sd_bus_message_read_basic(reply, SD_BUS_TYPE_UNIX_FD, &fd); + if (r < 0) + return r; + + return RET_NERRNO(fcntl(fd, F_DUPFD_CLOEXEC, 3)); /* NOTE(review): the duplicate is presumably needed because fd belongs to reply, which is unreferenced on return - confirm */ +} +/* Queries ListInhibitors from the login manager and prints the entries as a table; entries whose mode differs from arg_mode (when set) are skipped. */ +static int print_inhibitors(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, 
bus_login_mgr, "ListInhibitors", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Could not get active inhibitors: %s", bus_error_message(&error, r)); + + table = table_new("who", "uid", "user", "pid", "comm", "what", "why", "mode"); + if (!table) + return log_oom(); + + /* If there's not enough space, shorten the "WHY" column, as it's little more than an explaining comment. */ + (void) table_set_weight(table, TABLE_HEADER_CELL(6), 20); + (void) table_set_maximum_width(table, TABLE_HEADER_CELL(0), columns()/2); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssuu)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + _cleanup_free_ char *comm = NULL, *u = NULL; + const char *what, *who, *why, *mode; + uint32_t uid, pid; + + r = sd_bus_message_read(reply, "(ssssuu)", &what, &who, &why, &mode, &uid, &pid); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (arg_mode && !streq(mode, arg_mode)) + continue; + + (void) pid_get_comm(pid, &comm); + u = uid_to_name(uid); + + r = table_add_many(table, + TABLE_STRING, who, + TABLE_UID, (uid_t) uid, + TABLE_STRING, strna(u), + TABLE_PID, (pid_t) pid, + TABLE_STRING, strna(comm), + TABLE_STRING, what, + TABLE_STRING, why, + TABLE_STRING, mode); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (table_get_rows(table) > 1) { + r = table_set_sort(table, (size_t) 1, (size_t) 0, (size_t) 5, (size_t) 6); + if (r < 0) + return table_log_sort_error(r); + + table_set_header(table, arg_legend); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend) { + if (table_get_rows(table) > 1) + printf("\n%zu inhibitors listed.\n", table_get_rows(table) - 1); + else + printf("No inhibitors.\n"); + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = 
terminal_urlify_man("systemd-inhibit", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND ...\n" + "\n%sExecute a process while inhibiting shutdown/sleep/idle.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --what=WHAT Operations to inhibit, colon separated list of:\n" + " shutdown, sleep, idle, handle-power-key,\n" + " handle-suspend-key, handle-hibernate-key,\n" + " handle-lid-switch\n" + " --who=STRING A descriptive string who is inhibiting\n" + " --why=STRING A descriptive string why is being inhibited\n" + " --mode=MODE One of block or delay\n" + " --list List active inhibitors\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_WHAT, + ARG_WHO, + ARG_WHY, + ARG_MODE, + ARG_LIST, + ARG_NO_PAGER, + ARG_NO_LEGEND, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "what", required_argument, NULL, ARG_WHAT }, + { "who", required_argument, NULL, ARG_WHO }, + { "why", required_argument, NULL, ARG_WHY }, + { "mode", required_argument, NULL, ARG_MODE }, + { "list", no_argument, NULL, ARG_LIST }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). 
*/ + optind = 0; + while ((c = getopt_long(argc, argv, "+h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_WHAT: + arg_what = optarg; + break; + + case ARG_WHO: + arg_who = optarg; + break; + + case ARG_WHY: + arg_why = optarg; + break; + + case ARG_MODE: + arg_mode = optarg; + break; + + case ARG_LIST: + arg_action = ACTION_LIST; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_action == ACTION_INHIBIT && optind == argc) + arg_action = ACTION_LIST; + + else if (arg_action == ACTION_INHIBIT && optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Missing command line to execute."); + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = sd_bus_default_system(&bus); + if (r < 0) + return bus_log_connect_error(r, BUS_TRANSPORT_LOCAL); + + if (arg_action == ACTION_LIST) + return print_inhibitors(bus); + else { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_strv_free_ char **arguments = NULL; + _cleanup_free_ char *w = NULL; + _cleanup_close_ int fd = -EBADF; + pid_t pid; + + /* Ignore SIGINT and allow the forked process to receive it */ + (void) ignore_signals(SIGINT); + + if (!arg_who) { + w = strv_join(argv + optind, " "); + if (!w) + return log_oom(); + + arg_who = w; + } + + if (!arg_mode) + arg_mode = "block"; + + fd = inhibit(bus, &error); + if (fd < 0) + return log_error_errno(fd, "Failed to inhibit: %s", bus_error_message(&error, fd)); + + arguments = strv_copy(argv + optind); + if (!arguments) + return log_oom(); + + r = safe_fork("(inhibit)", 
FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + execvp(arguments[0], arguments); + log_open(); + log_error_errno(errno, "Failed to execute %s: %m", argv[optind]); + _exit(EXIT_FAILURE); + } + + return wait_for_terminate_and_check(argv[optind], pid, WAIT_LOG); + } +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/login/loginctl.c b/src/login/loginctl.c new file mode 100644 index 0000000..7fc6efc --- /dev/null +++ b/src/login/loginctl.c @@ -0,0 +1,1653 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-print-properties.h" +#include "bus-unit-procs.h" +#include "cgroup-show.h" +#include "cgroup-util.h" +#include "format-table.h" +#include "log.h" +#include "logs-show.h" +#include "macro.h" +#include "main-func.h" +#include "memory-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include "spawn-polkit-agent.h" +#include "string-table.h" +#include "strv.h" +#include "sysfs-show.h" +#include "terminal-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "verbs.h" + +static char **arg_property = NULL; +static BusPrintPropertyFlags arg_print_flags = 0; +static bool arg_full = false; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static const char *arg_kill_whom = NULL; +static int arg_signal = SIGTERM; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static char *arg_host = NULL; +static bool arg_ask_password = true; +static unsigned arg_lines = 10; +static OutputMode arg_output = OUTPUT_SHORT; + 
+STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); + +typedef struct SessionStatusInfo { + const char *id; + uid_t uid; + const char *name; + dual_timestamp timestamp; + unsigned vtnr; + const char *seat; + const char *tty; + const char *display; + bool remote; + const char *remote_host; + const char *remote_user; + const char *service; + pid_t leader; + const char *type; + const char *class; + const char *state; + const char *scope; + const char *desktop; + bool idle_hint; + dual_timestamp idle_hint_timestamp; +} SessionStatusInfo; + +typedef struct UserStatusInfo { + uid_t uid; + bool linger; + const char *name; + dual_timestamp timestamp; + const char *state; + char **sessions; + const char *display; + const char *slice; +} UserStatusInfo; + +typedef struct SeatStatusInfo { + const char *id; + const char *active_session; + char **sessions; +} SeatStatusInfo; + +static void user_status_info_done(UserStatusInfo *info) { + assert(info); + + strv_free(info->sessions); +} + +static void seat_status_info_done(SeatStatusInfo *info) { + assert(info); + + strv_free(info->sessions); +} + +static OutputFlags get_output_flags(void) { + return + FLAGS_SET(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) * OUTPUT_SHOW_ALL | + (arg_full || !on_tty() || pager_have()) * OUTPUT_FULL_WIDTH | + colors_enabled() * OUTPUT_COLOR; +} + +static int show_table(Table *table, const char *word) { + int r; + + assert(table); + assert(word); + + if (table_get_rows(table) > 1 || OUTPUT_MODE_IS_JSON(arg_output)) { + r = table_set_sort(table, (size_t) 0); + if (r < 0) + return table_log_sort_error(r); + + table_set_header(table, arg_legend); + + if (OUTPUT_MODE_IS_JSON(arg_output)) + r = table_print_json(table, NULL, output_mode_to_json_format_flags(arg_output) | JSON_FORMAT_COLOR_AUTO); + else + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend) { + if (table_get_rows(table) > 1) + printf("\n%zu %s listed.\n", table_get_rows(table) - 1, 
word); + else + printf("No %s.\n", word); + } + + return 0; +} + +static int list_sessions(int argc, char *argv[], void *userdata) { + + static const struct bus_properties_map map[] = { + { "IdleHint", "b", NULL, offsetof(SessionStatusInfo, idle_hint) }, + { "IdleSinceHintMonotonic", "t", NULL, offsetof(SessionStatusInfo, idle_hint_timestamp.monotonic) }, + { "State", "s", NULL, offsetof(SessionStatusInfo, state) }, + { "TTY", "s", NULL, offsetof(SessionStatusInfo, tty) }, + {}, + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, bus_login_mgr, "ListSessions", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list sessions: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "(susso)"); + if (r < 0) + return bus_log_parse_error(r); + + table = table_new("session", "uid", "user", "seat", "tty", "state", "idle", "since"); + if (!table) + return log_oom(); + + /* Right-align the first two fields (since they are numeric) */ + (void) table_set_align_percent(table, TABLE_HEADER_CELL(0), 100); + (void) table_set_align_percent(table, TABLE_HEADER_CELL(1), 100); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (;;) { + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + const char *id, *user, *seat, *object; + uint32_t uid; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + SessionStatusInfo i = {}; + + r = sd_bus_message_read(reply, "(susso)", &id, &uid, &user, &seat, &object); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = bus_map_all_properties(bus, "org.freedesktop.login1", object, map, BUS_MAP_BOOLEAN_AS_BOOL, &e, &m, &i); + if (r < 0) { + 
log_full_errno(sd_bus_error_has_name(&e, SD_BUS_ERROR_UNKNOWN_OBJECT) ? LOG_DEBUG : LOG_WARNING, + r, + "Failed to get properties of session %s, ignoring: %s", + id, bus_error_message(&e, r)); + continue; + } + + r = table_add_many(table, + TABLE_STRING, id, + TABLE_UID, (uid_t) uid, + TABLE_STRING, user, + TABLE_STRING, empty_to_null(seat), + TABLE_STRING, empty_to_null(i.tty), + TABLE_STRING, i.state, + TABLE_BOOLEAN, i.idle_hint); + if (r < 0) + return table_log_add_error(r); + + if (i.idle_hint) + r = table_add_cell(table, NULL, TABLE_TIMESTAMP_RELATIVE_MONOTONIC, &i.idle_hint_timestamp.monotonic); + else + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return show_table(table, "sessions"); +} + +static int list_users(int argc, char *argv[], void *userdata) { + + static const struct bus_properties_map property_map[] = { + { "Linger", "b", NULL, offsetof(UserStatusInfo, linger) }, + { "State", "s", NULL, offsetof(UserStatusInfo, state) }, + {}, + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, bus_login_mgr, "ListUsers", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list users: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "(uso)"); + if (r < 0) + return bus_log_parse_error(r); + + table = table_new("uid", "user", "linger", "state"); + if (!table) + return log_oom(); + + (void) table_set_align_percent(table, TABLE_HEADER_CELL(0), 100); + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (;;) { + _cleanup_(sd_bus_error_free) sd_bus_error error_property = 
SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply_property = NULL; + _cleanup_(user_status_info_done) UserStatusInfo info = {}; + const char *user, *object; + uint32_t uid; + + r = sd_bus_message_read(reply, "(uso)", &uid, &user, &object); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = bus_map_all_properties(bus, + "org.freedesktop.login1", + object, + property_map, + BUS_MAP_BOOLEAN_AS_BOOL, + &error_property, + &reply_property, + &info); + if (r < 0) { + log_full_errno(sd_bus_error_has_name(&error_property, SD_BUS_ERROR_UNKNOWN_OBJECT) ? LOG_DEBUG : LOG_WARNING, + r, + "Failed to get properties of user %s, ignoring: %s", + user, bus_error_message(&error_property, r)); + continue; + } + + r = table_add_many(table, + TABLE_UID, (uid_t) uid, + TABLE_STRING, user, + TABLE_BOOLEAN, info.linger, + TABLE_STRING, info.state); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return show_table(table, "users"); +} + +static int list_seats(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, bus_login_mgr, "ListSeats", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list seats: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "(so)"); + if (r < 0) + return bus_log_parse_error(r); + + table = table_new("seat"); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (;;) { + const char *seat; + + r = sd_bus_message_read(reply, "(so)", &seat, NULL); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r 
= table_add_cell(table, NULL, TABLE_STRING, seat); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return show_table(table, "seats"); +} +/* Prints the processes of the given unit's cgroup, each line prefixed with prefix. If unit_show_processes() fails with -EBADR and the transport is not remote, falls back to show_cgroup_and_extra(). Returns 0 (also when the unit has no cgroup path) or a negative error. */ +static int show_unit_cgroup( + sd_bus *bus, + const char *unit, + pid_t leader, + const char *prefix) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *cgroup = NULL; + unsigned c; + int r; + + assert(bus); + assert(unit); + assert(prefix); + + r = show_cgroup_get_unit_path_and_warn(bus, unit, &cgroup); + if (r < 0) + return r; + + if (isempty(cgroup)) + return 0; + + c = columns(); + if (c > 18) + c -= 18; /* NOTE(review): the reason for reserving exactly 18 columns is not visible here - confirm */ + + r = unit_show_processes(bus, unit, cgroup, prefix, c, get_output_flags(), &error); + if (r == -EBADR) { + if (arg_transport == BUS_TRANSPORT_REMOTE) + return 0; + + /* Fallback for older systemd versions where the GetUnitProcesses() call is not yet available */ + + if (cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, cgroup) != 0 && leader <= 0) + return 0; + + show_cgroup_and_extra(SYSTEMD_CGROUP_CONTROLLER, cgroup, prefix, c, &leader, leader > 0, get_output_flags()); + } else if (r < 0) + return log_error_errno(r, "Failed to dump process list: %s", bus_error_message(&error, r)); + + return 0; +} +/* Property-map callback: enters the struct at the current message position, reads its first member (a basic type) into userdata and skips the remaining members. */ +static int prop_map_first_of_struct(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + const char *contents; + int r; + + assert(bus); + assert(m); + + r = sd_bus_message_peek_type(m, NULL, &contents); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_STRUCT, contents); + if (r < 0) + return r; + + r = sd_bus_message_read_basic(m, contents[0], userdata); + if (r < 0) + return r; + + r = sd_bus_message_skip(m, contents+1); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} +/* Property-map callback: appends the name (first member) of every (so) array entry to the string vector at userdata. */ +static int prop_map_sessions_strv(sd_bus *bus, const char *member, sd_bus_message 
*m, sd_bus_error *error, void *userdata) { + const char *name; + int r; + + assert(bus); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', "(so)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(m, "(so)", &name, NULL)) > 0) { + r = strv_extend(userdata, name); + if (r < 0) + return r; + } + if (r < 0) + return r; + + return sd_bus_message_exit_container(m); +} + +static int mark_session(char **sessions, const char *target_session) { + assert(sessions); + assert(target_session); + + STRV_FOREACH(i, sessions) + if (streq(*i, target_session)) { + _cleanup_free_ char *marked = NULL; + + marked = strjoin("*", target_session); + if (!marked) + return log_oom(); + + return free_and_replace(*i, marked); + } + + return 0; +} + +static int print_session_status_info(sd_bus *bus, const char *path) { + + static const struct bus_properties_map map[] = { + { "Id", "s", NULL, offsetof(SessionStatusInfo, id) }, + { "Name", "s", NULL, offsetof(SessionStatusInfo, name) }, + { "TTY", "s", NULL, offsetof(SessionStatusInfo, tty) }, + { "Display", "s", NULL, offsetof(SessionStatusInfo, display) }, + { "RemoteHost", "s", NULL, offsetof(SessionStatusInfo, remote_host) }, + { "RemoteUser", "s", NULL, offsetof(SessionStatusInfo, remote_user) }, + { "Service", "s", NULL, offsetof(SessionStatusInfo, service) }, + { "Desktop", "s", NULL, offsetof(SessionStatusInfo, desktop) }, + { "Type", "s", NULL, offsetof(SessionStatusInfo, type) }, + { "Class", "s", NULL, offsetof(SessionStatusInfo, class) }, + { "Scope", "s", NULL, offsetof(SessionStatusInfo, scope) }, + { "State", "s", NULL, offsetof(SessionStatusInfo, state) }, + { "VTNr", "u", NULL, offsetof(SessionStatusInfo, vtnr) }, + { "Leader", "u", NULL, offsetof(SessionStatusInfo, leader) }, + { "Remote", "b", NULL, offsetof(SessionStatusInfo, remote) }, + { "Timestamp", "t", NULL, offsetof(SessionStatusInfo, timestamp.realtime) }, + { "TimestampMonotonic", "t", NULL, offsetof(SessionStatusInfo, timestamp.monotonic) }, + 
{ "IdleHint", "b", NULL, offsetof(SessionStatusInfo, idle_hint) }, + { "IdleSinceHint", "t", NULL, offsetof(SessionStatusInfo, idle_hint_timestamp.realtime) }, + { "IdleSinceHintMonotonic", "t", NULL, offsetof(SessionStatusInfo, idle_hint_timestamp.monotonic) }, + { "User", "(uo)", prop_map_first_of_struct, offsetof(SessionStatusInfo, uid) }, + { "Seat", "(so)", prop_map_first_of_struct, offsetof(SessionStatusInfo, seat) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + SessionStatusInfo i = {}; + int r; + + r = bus_map_all_properties(bus, "org.freedesktop.login1", path, map, BUS_MAP_BOOLEAN_AS_BOOL, &error, &m, &i); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_NA); + + if (dual_timestamp_is_set(&i.timestamp)) { + r = table_add_cell(table, NULL, TABLE_FIELD, "Since"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s; %s", + FORMAT_TIMESTAMP(i.timestamp.realtime), + FORMAT_TIMESTAMP_RELATIVE_MONOTONIC(i.timestamp.monotonic)); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "State", + TABLE_STRING, i.state); + if (r < 0) + return table_log_add_error(r); + + if (i.leader > 0) { + _cleanup_free_ char *name = NULL; + + (void) pid_get_comm(i.leader, &name); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Leader"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, PID_FMT "%s%s%s", + i.leader, + !isempty(name) ? " (" : "", + strempty(name), + !isempty(name) ? 
")" : ""); + if (r < 0) + return table_log_add_error(r); + } + + + if (!isempty(i.seat)) { + r = table_add_cell(table, NULL, TABLE_FIELD, "Seat"); + if (r < 0) + return table_log_add_error(r); + + if (i.vtnr > 0) + r = table_add_cell_stringf(table, NULL, "%s; vc%u", i.seat, i.vtnr); + else + r = table_add_cell(table, NULL, TABLE_STRING, i.seat); + if (r < 0) + return table_log_add_error(r); + } + + if (!isempty(i.tty)) + r = table_add_many(table, + TABLE_FIELD, "TTY", + TABLE_STRING, i.tty); + else if (!isempty(i.display)) + r = table_add_many(table, + TABLE_FIELD, "Display", + TABLE_STRING, i.display); + else + r = 0; + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Remote"); + if (r < 0) + return table_log_add_error(r); + + if (i.remote_host && i.remote_user) + r = table_add_cell_stringf(table, NULL, "%s@%s", i.remote_user, i.remote_host); + else if (i.remote_host) + r = table_add_cell(table, NULL, TABLE_STRING, i.remote_host); + else if (i.remote_user) + r = table_add_cell_stringf(table, NULL, "user %s", i.remote_user); + else + r = table_add_cell(table, NULL, TABLE_BOOLEAN, &i.remote); + if (r < 0) + return table_log_add_error(r); + + if (i.service) { + r = table_add_many(table, + TABLE_FIELD, "Service", + TABLE_STRING, i.service); + if (r < 0) + return table_log_add_error(r); + } + + if (i.type) { + r = table_add_many(table, + TABLE_FIELD, "Type", + TABLE_STRING, i.type); + if (r < 0) + return table_log_add_error(r); + } + + if (i.class) { + r = table_add_many(table, + TABLE_FIELD, "Class", + TABLE_STRING, i.class); + if (r < 0) + return table_log_add_error(r); + } + + if (!isempty(i.desktop)) { + r = table_add_many(table, + TABLE_FIELD, "Desktop", + TABLE_STRING, i.desktop); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_cell(table, NULL, TABLE_FIELD, "Idle"); + if (r < 0) + return table_log_add_error(r); + + if (i.idle_hint && dual_timestamp_is_set(&i.idle_hint_timestamp)) + r = 
table_add_cell_stringf(table, NULL, "%s since %s (%s)", + yes_no(i.idle_hint), + FORMAT_TIMESTAMP(i.idle_hint_timestamp.realtime), + FORMAT_TIMESTAMP_RELATIVE_MONOTONIC(i.idle_hint_timestamp.monotonic)); + else + r = table_add_cell(table, NULL, TABLE_BOOLEAN, &i.idle_hint); + if (r < 0) + return table_log_add_error(r); + + if (i.scope) { + r = table_add_many(table, + TABLE_FIELD, "Unit", + TABLE_SET_MINIMUM_WIDTH, STRLEN("Display"), /* For alignment with show_unit_cgroup */ + TABLE_STRING, i.scope); + if (r < 0) + return table_log_add_error(r); + } + + /* We don't use the table to show the header, in order to make the width of the column stable. */ + printf("%s%s - %s (" UID_FMT ")%s\n", ansi_highlight(), i.id, i.name, i.uid, ansi_normal()); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (i.scope) { + show_unit_cgroup(bus, i.scope, i.leader, /* prefix = */ strrepa(" ", STRLEN("Display: "))); + + if (arg_transport == BUS_TRANSPORT_LOCAL) + show_journal_by_unit( + stdout, + i.scope, + NULL, + arg_output, + 0, + i.timestamp.monotonic, + arg_lines, + 0, + get_output_flags() | OUTPUT_BEGIN_NEWLINE, + SD_JOURNAL_LOCAL_ONLY, + true, + NULL); + } + + return 0; +} + +static int print_user_status_info(sd_bus *bus, const char *path) { + + static const struct bus_properties_map map[] = { + { "Name", "s", NULL, offsetof(UserStatusInfo, name) }, + { "Linger", "b", NULL, offsetof(UserStatusInfo, linger) }, + { "Slice", "s", NULL, offsetof(UserStatusInfo, slice) }, + { "State", "s", NULL, offsetof(UserStatusInfo, state) }, + { "UID", "u", NULL, offsetof(UserStatusInfo, uid) }, + { "Timestamp", "t", NULL, offsetof(UserStatusInfo, timestamp.realtime) }, + { "TimestampMonotonic", "t", NULL, offsetof(UserStatusInfo, timestamp.monotonic) }, + { "Display", "(so)", prop_map_first_of_struct, offsetof(UserStatusInfo, display) }, + { "Sessions", "a(so)", prop_map_sessions_strv, offsetof(UserStatusInfo, sessions) }, + {} + }; + + 
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(user_status_info_done) UserStatusInfo i = {}; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + r = bus_map_all_properties(bus, "org.freedesktop.login1", path, map, BUS_MAP_BOOLEAN_AS_BOOL, &error, &m, &i); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_NA); + + if (dual_timestamp_is_set(&i.timestamp)) { + r = table_add_cell(table, NULL, TABLE_FIELD, "Since"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s; %s", + FORMAT_TIMESTAMP(i.timestamp.realtime), + FORMAT_TIMESTAMP_RELATIVE_MONOTONIC(i.timestamp.monotonic)); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "State", + TABLE_STRING, i.state); + if (r < 0) + return table_log_add_error(r); + + if (!strv_isempty(i.sessions)) { + _cleanup_strv_free_ char **sessions = TAKE_PTR(i.sessions); + + r = mark_session(sessions, i.display); + if (r < 0) + return r; + + r = table_add_many(table, + TABLE_FIELD, "Sessions", + TABLE_STRV_WRAPPED, sessions); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "Linger", + TABLE_BOOLEAN, i.linger); + if (r < 0) + return table_log_add_error(r); + + if (i.slice) { + r = table_add_many(table, + TABLE_FIELD, "Unit", + TABLE_SET_MINIMUM_WIDTH, STRLEN("Sessions"), /* For alignment with show_unit_cgroup */ + TABLE_STRING, i.slice); + if (r < 0) + return table_log_add_error(r); + } + + printf("%s%s (" UID_FMT ")%s\n", ansi_highlight(), i.name, i.uid, ansi_normal()); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (i.slice) { + show_unit_cgroup(bus, i.slice, /* leader = */ 0, /* prefix = */ 
strrepa(" ", STRLEN("Sessions: "))); + + if (arg_transport == BUS_TRANSPORT_LOCAL) + show_journal_by_unit( + stdout, + i.slice, + NULL, + arg_output, + 0, + i.timestamp.monotonic, + arg_lines, + 0, + get_output_flags() | OUTPUT_BEGIN_NEWLINE, + SD_JOURNAL_LOCAL_ONLY, + true, + NULL); + } + + return 0; +} + +static int print_seat_status_info(sd_bus *bus, const char *path) { + + static const struct bus_properties_map map[] = { + { "Id", "s", NULL, offsetof(SeatStatusInfo, id) }, + { "ActiveSession", "(so)", prop_map_first_of_struct, offsetof(SeatStatusInfo, active_session) }, + { "Sessions", "a(so)", prop_map_sessions_strv, offsetof(SeatStatusInfo, sessions) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(seat_status_info_done) SeatStatusInfo i = {}; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + r = bus_map_all_properties(bus, "org.freedesktop.login1", path, map, 0, &error, &m, &i); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_NA); + + if (!strv_isempty(i.sessions)) { + _cleanup_strv_free_ char **sessions = TAKE_PTR(i.sessions); + + r = mark_session(sessions, i.active_session); + if (r < 0) + return r; + + r = table_add_many(table, + TABLE_FIELD, "Sessions", + TABLE_STRV_WRAPPED, sessions); + if (r < 0) + return table_log_add_error(r); + } + + if (arg_transport == BUS_TRANSPORT_LOCAL) { + r = table_add_many(table, + TABLE_FIELD, "Devices", + TABLE_SET_MINIMUM_WIDTH, STRLEN("Sessions"), /* For alignment with show_sysfs */ + TABLE_EMPTY); + if (r < 0) + return table_log_add_error(r); + } + + printf("%s%s%s\n", ansi_highlight(), i.id, ansi_normal()); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (arg_transport == 
BUS_TRANSPORT_LOCAL) { + unsigned c = columns(); + if (c > 21) + c -= 21; + + show_sysfs(i.id, strrepa(" ", STRLEN("Sessions:")), c, get_output_flags()); + } + + return 0; +} +/* Property printer handed to bus_print_all_properties() by show_properties(). It handles the struct-typed Display, Seat, ActiveSession and User properties (printing only the first struct member) and the Sessions array (printing the first member of each element, space separated). Returns 1 when the property was consumed and printed here, 0 when it was not recognized and is left unread, and a negative value on read or validation errors. */ +static int print_property(const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags) { + char type; + const char *contents; + int r; + + assert(name); + assert(m); + + r = sd_bus_message_peek_type(m, &type, &contents); + if (r < 0) + return r; + + switch (type) { + + case SD_BUS_TYPE_STRUCT: + + if (contents[0] == SD_BUS_TYPE_STRING && STR_IN_SET(name, "Display", "Seat", "ActiveSession")) { + const char *s; + + r = sd_bus_message_read(m, "(so)", &s, NULL); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, s); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_UINT32 && streq(name, "User")) { + uint32_t uid; + + r = sd_bus_message_read(m, "(uo)", &uid, NULL); + if (r < 0) + return bus_log_parse_error(r); + + if (!uid_is_valid(uid)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid user ID: " UID_FMT, + uid); + + bus_print_property_valuef(name, expected_value, flags, UID_FMT, uid); + return 1; + } + break; + + case SD_BUS_TYPE_ARRAY: + + if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "Sessions")) { + const char *s; + bool space = false; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(so)"); + if (r < 0) + return bus_log_parse_error(r); + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + printf("%s=", name); + + while ((r = sd_bus_message_read(m, "(so)", &s, NULL)) > 0) { + printf("%s%s", space ? 
" " : "", s); + space = true; + } +/* The line is terminated before r is checked, so partial output still ends with a newline; in value-only mode nothing is printed for an empty list. */ + if (space || !FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + printf("\n"); + + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + } + break; + } + + return 0; +} +/* Prints the properties of the org.freedesktop.login1 object at path through bus_print_all_properties(), using print_property as the custom printer together with the arg_property filter and arg_print_flags. Returns 0 on success; a failure is logged via bus_log_parse_error() and its result returned. */ +static int show_properties(sd_bus *bus, const char *path) { + int r; + + assert(bus); + assert(path); + + r = bus_print_all_properties( + bus, + "org.freedesktop.login1", + path, + print_property, + arg_property, + arg_print_flags, + NULL); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} +/* Calls the given login manager method with id as its single string argument, reads one object path from the reply and stores a strdup() copy in *ret, which the caller must free. type has to be session or seat and is used only in the error message. Returns 0 on success; failures are logged and the logging helper result is returned. */ +static int get_bus_path_by_id( + sd_bus *bus, + const char *type, + const char *method, + const char *id, + char **ret) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *p = NULL; + const char *path; + int r; + + assert(bus); + assert(type); + assert(STR_IN_SET(type, "session", "seat")); + assert(method); + assert(id); + assert(ret); + + r = bus_call_method(bus, bus_login_mgr, method, &error, &reply, "s", id); + if (r < 0) + return log_error_errno(r, "Failed to get path for %s '%s': %s", type, id, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + p = strdup(path); + if (!p) + return log_oom(); + + *ret = TAKE_PTR(p); + return 0; +} + +static int show_session(int argc, char *argv[], void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + bool properties; + int r; + + assert(argv); + + properties = !strstr(argv[0], "status"); + + pager_open(arg_pager_flags); + + if (argc <= 1) { + _cleanup_free_ char *path = NULL; + + /* If no argument is specified inspect the manager itself */ + if (properties) + return show_properties(bus, "/org/freedesktop/login1"); + + r = get_bus_path_by_id(bus, "session", "GetSession", "auto", &path); + if (r < 0) + return r; + + return 
print_session_status_info(bus, path); + } + + for (int i = 1, first = true; i < argc; i++, first = false) { + _cleanup_free_ char *path = NULL; + + r = get_bus_path_by_id(bus, "session", "GetSession", argv[i], &path); + if (r < 0) + return r; + + if (!first) + putchar('\n'); + + if (properties) + r = show_properties(bus, path); + else + r = print_session_status_info(bus, path); + if (r < 0) + return r; + } + + return 0; +} +/* Verb handler registered for both user-status and show-user. Property mode is chosen when argv[0] does not contain the substring status. Without arguments it shows the manager object properties (property mode) or the status of the user/self object (status mode). Otherwise each argument is resolved to a UID with get_user_creds(), the object path is obtained through GetUser, and the outputs are separated by an empty line. Stops at the first failure and returns its error. */ +static int show_user(int argc, char *argv[], void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + bool properties; + int r; + + assert(argv); + + properties = !strstr(argv[0], "status"); + + pager_open(arg_pager_flags); + + if (argc <= 1) { + /* If no argument is specified inspect the manager itself */ + if (properties) + return show_properties(bus, "/org/freedesktop/login1"); + + return print_user_status_info(bus, "/org/freedesktop/login1/user/self"); + } + + for (int i = 1, first = true; i < argc; i++, first = false) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *path; + uid_t uid; + + r = get_user_creds((const char**) (argv+i), &uid, NULL, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to look up user %s: %m", argv[i]); + + r = bus_call_method(bus, bus_login_mgr, "GetUser", &error, &reply, "u", (uint32_t) uid); + if (r < 0) + return log_error_errno(r, "Failed to get user: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + if (!first) + putchar('\n'); + + if (properties) + r = show_properties(bus, path); + else + r = print_user_status_info(bus, path); + if (r < 0) + return r; + } + + return 0; +} +/* Verb handler registered for both seat-status and show-seat. It mirrors the session handler: property mode when argv[0] lacks the substring status, seat object paths resolved through GetSeat, and the seat id auto used when status mode is invoked without arguments. */ +static int show_seat(int argc, char *argv[], void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + bool properties; + int r; + + assert(argv); + + properties = !strstr(argv[0], "status"); + + pager_open(arg_pager_flags); + + if (argc 
<= 1) { + _cleanup_free_ char *path = NULL; + + /* If no argument is specified inspect the manager itself */ + if (properties) + return show_properties(bus, "/org/freedesktop/login1"); + + r = get_bus_path_by_id(bus, "seat", "GetSeat", "auto", &path); + if (r < 0) + return r; + + return print_seat_status_info(bus, path); + } + + for (int i = 1, first = true; i < argc; i++, first = false) { + _cleanup_free_ char *path = NULL; + + r = get_bus_path_by_id(bus, "seat", "GetSeat", argv[i], &path); + if (r < 0) + return r; + + if (!first) + putchar('\n'); + + if (properties) + r = show_properties(bus, path); + else + r = print_seat_status_info(bus, path); + if (r < 0) + return r; + } + + return 0; +} + +static int activate(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + if (argc < 2) { + r = sd_bus_call_method( + bus, + "org.freedesktop.login1", + "/org/freedesktop/login1/session/auto", + "org.freedesktop.login1.Session", + streq(argv[0], "lock-session") ? "Lock" : + streq(argv[0], "unlock-session") ? "Unlock" : + streq(argv[0], "terminate-session") ? "Terminate" : + "Activate", + &error, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to issue method call: %s", bus_error_message(&error, r)); + + return 0; + } + + for (int i = 1; i < argc; i++) { + r = bus_call_method( + bus, + bus_login_mgr, + streq(argv[0], "lock-session") ? "LockSession" : + streq(argv[0], "unlock-session") ? "UnlockSession" : + streq(argv[0], "terminate-session") ? 
"TerminateSession" : + "ActivateSession", + &error, NULL, + "s", argv[i]); + if (r < 0) + return log_error_errno(r, "Failed to issue method call: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int kill_session(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + if (!arg_kill_whom) + arg_kill_whom = "all"; + + for (int i = 1; i < argc; i++) { + r = bus_call_method( + bus, + bus_login_mgr, + "KillSession", + &error, NULL, + "ssi", argv[i], arg_kill_whom, arg_signal); + if (r < 0) + return log_error_errno(r, "Could not kill session: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int enable_linger(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + char* short_argv[3]; + bool b; + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + b = streq(argv[0], "enable-linger"); + + if (argc < 2) { + /* No argument? Let's use an empty user name, + * then logind will use our user. 
*/ + + short_argv[0] = argv[0]; + short_argv[1] = (char*) ""; + short_argv[2] = NULL; + argv = short_argv; + argc = 2; + } + + for (int i = 1; i < argc; i++) { + uid_t uid; + + if (isempty(argv[i])) + uid = UID_INVALID; + else { + r = get_user_creds((const char**) (argv+i), &uid, NULL, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to look up user %s: %m", argv[i]); + } + + r = bus_call_method( + bus, + bus_login_mgr, + "SetUserLinger", + &error, NULL, + "ubb", (uint32_t) uid, b, true); + if (r < 0) + return log_error_errno(r, "Could not enable linger: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int terminate_user(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (int i = 1; i < argc; i++) { + uid_t uid; + + if (isempty(argv[i])) + uid = getuid(); + else { + const char *u = argv[i]; + + r = get_user_creds(&u, &uid, NULL, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to look up user %s: %m", argv[i]); + } + + r = bus_call_method(bus, bus_login_mgr, "TerminateUser", &error, NULL, "u", (uint32_t) uid); + if (r < 0) + return log_error_errno(r, "Could not terminate user: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int kill_user(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + if (!arg_kill_whom) + arg_kill_whom = "all"; + + for (int i = 1; i < argc; i++) { + uid_t uid; + + if (isempty(argv[i])) + uid = getuid(); + else { + const char *u = argv[i]; + + r = get_user_creds(&u, &uid, NULL, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to look up user %s: %m", argv[i]); + } + + r 
= bus_call_method( + bus, + bus_login_mgr, + "KillUser", + &error, NULL, + "ui", (uint32_t) uid, arg_signal); + if (r < 0) + return log_error_errno(r, "Could not kill user: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int attach(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (int i = 2; i < argc; i++) { + + r = bus_call_method( + bus, + bus_login_mgr, + "AttachDevice", + &error, NULL, + "ssb", argv[1], argv[i], true); + if (r < 0) + return log_error_errno(r, "Could not attach device: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int flush_devices(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method(bus, bus_login_mgr, "FlushDevices", &error, NULL, "b", true); + if (r < 0) + return log_error_errno(r, "Could not flush devices: %s", bus_error_message(&error, r)); + + return 0; +} + +static int lock_sessions(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method( + bus, + bus_login_mgr, + streq(argv[0], "lock-sessions") ? 
"LockSessions" : "UnlockSessions", + &error, NULL, + NULL); + if (r < 0) + return log_error_errno(r, "Could not lock sessions: %s", bus_error_message(&error, r)); + + return 0; +} + +static int terminate_seat(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + assert(argv); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (int i = 1; i < argc; i++) { + + r = bus_call_method(bus, bus_login_mgr, "TerminateSeat", &error, NULL, "s", argv[i]); + if (r < 0) + return log_error_errno(r, "Could not terminate seat: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("loginctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND ...\n\n" + "%5$sSend control commands to or query the login manager.%6$s\n" + "\n%3$sSession Commands:%4$s\n" + " list-sessions List sessions\n" + " session-status [ID...] Show session status\n" + " show-session [ID...] Show properties of sessions or the manager\n" + " activate [ID] Activate a session\n" + " lock-session [ID...] Screen lock one or more sessions\n" + " unlock-session [ID...] Screen unlock one or more sessions\n" + " lock-sessions Screen lock all current sessions\n" + " unlock-sessions Screen unlock all current sessions\n" + " terminate-session ID... Terminate one or more sessions\n" + " kill-session ID... Send signal to processes of a session\n" + "\n%3$sUser Commands:%4$s\n" + " list-users List users\n" + " user-status [USER...] Show user status\n" + " show-user [USER...] Show properties of users or the manager\n" + " enable-linger [USER...] Enable linger state of one or more users\n" + " disable-linger [USER...] Disable linger state of one or more users\n" + " terminate-user USER... 
Terminate all sessions of one or more users\n" + " kill-user USER... Send signal to processes of a user\n" + "\n%3$sSeat Commands:%4$s\n" + " list-seats List seats\n" + " seat-status [NAME...] Show seat status\n" + " show-seat [NAME...] Show properties of seats or the manager\n" + " attach NAME DEVICE... Attach one or more devices to a seat\n" + " flush-devices Flush all device associations\n" + " terminate-seat NAME... Terminate all sessions on one or more seats\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --no-ask-password Don't prompt for password\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " -p --property=NAME Show only properties by this name\n" + " -P NAME Equivalent to --value --property=NAME\n" + " -a --all Show all properties, including empty ones\n" + " --value When showing properties, only print the value\n" + " -l --full Do not ellipsize output\n" + " --kill-whom=WHOM Whom to send signal to\n" + " -s --signal=SIGNAL Which signal to send\n" + " -n --lines=INTEGER Number of journal entries to show\n" + " -o --output=STRING Change journal output mode (short, short-precise,\n" + " short-iso, short-iso-precise, short-full,\n" + " short-monotonic, short-unix, short-delta,\n" + " json, json-pretty, json-sse, json-seq, cat,\n" + " verbose, export, with-unit)\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_VALUE, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_KILL_WHOM, + ARG_NO_ASK_PASSWORD, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { 
"property", required_argument, NULL, 'p' }, + { "all", no_argument, NULL, 'a' }, + { "value", no_argument, NULL, ARG_VALUE }, + { "full", no_argument, NULL, 'l' }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "kill-whom", required_argument, NULL, ARG_KILL_WHOM }, + { "signal", required_argument, NULL, 's' }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "lines", required_argument, NULL, 'n' }, + { "output", required_argument, NULL, 'o' }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hp:P:als:H:M:n:o:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case 'P': + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_ONLY_VALUE, true); + _fallthrough_; + + case 'p': { + r = strv_extend(&arg_property, optarg); + if (r < 0) + return log_oom(); + + /* If the user asked for a particular + * property, show it to them, even if it is + * empty. 
*/ + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + break; + } + + case 'a': + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + break; + + case ARG_VALUE: + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_ONLY_VALUE, true); + break; + + case 'l': + arg_full = true; + break; + + case 'n': + if (safe_atou(optarg, &arg_lines) < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse lines '%s'", optarg); + break; + + case 'o': + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(output_mode, OutputMode, _OUTPUT_MODE_MAX); + return 0; + } + + arg_output = output_mode_from_string(optarg); + if (arg_output < 0) + return log_error_errno(arg_output, "Unknown output '%s'.", optarg); + + if (OUTPUT_MODE_IS_JSON(arg_output)) + arg_legend = false; + + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case ARG_KILL_WHOM: + arg_kill_whom = optarg; + break; + + case 's': + r = parse_signal_argument(optarg, &arg_signal); + if (r <= 0) + return r; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int loginctl_main(int argc, char *argv[], sd_bus *bus) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "list-sessions", VERB_ANY, 1, VERB_DEFAULT, list_sessions }, + { "session-status", VERB_ANY, VERB_ANY, 0, show_session }, + { "show-session", VERB_ANY, VERB_ANY, 0, show_session }, + { "activate", VERB_ANY, 2, 0, activate }, + { "lock-session", VERB_ANY, VERB_ANY, 0, activate }, + { "unlock-session", VERB_ANY, VERB_ANY, 0, activate }, + { "lock-sessions", VERB_ANY, 1, 0, lock_sessions }, + { "unlock-sessions", VERB_ANY, 1, 0, 
lock_sessions }, + { "terminate-session", 2, VERB_ANY, 0, activate }, + { "kill-session", 2, VERB_ANY, 0, kill_session }, + { "list-users", VERB_ANY, 1, 0, list_users }, + { "user-status", VERB_ANY, VERB_ANY, 0, show_user }, + { "show-user", VERB_ANY, VERB_ANY, 0, show_user }, + { "enable-linger", VERB_ANY, VERB_ANY, 0, enable_linger }, + { "disable-linger", VERB_ANY, VERB_ANY, 0, enable_linger }, + { "terminate-user", 2, VERB_ANY, 0, terminate_user }, + { "kill-user", 2, VERB_ANY, 0, kill_user }, + { "list-seats", VERB_ANY, 1, 0, list_seats }, + { "seat-status", VERB_ANY, VERB_ANY, 0, show_seat }, + { "show-seat", VERB_ANY, VERB_ANY, 0, show_seat }, + { "attach", 3, VERB_ANY, 0, attach }, + { "flush-devices", VERB_ANY, 1, 0, flush_devices }, + { "terminate-seat", 2, VERB_ANY, 0, terminate_seat }, + {} + }; + + return dispatch_verb(argc, argv, verbs, bus); +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + setlocale(LC_ALL, ""); + log_setup(); + + /* The journal merging logic potentially needs a lot of fds. 
*/ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + sigbus_install(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, &bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + (void) sd_bus_set_allow_interactive_authorization(bus, arg_ask_password); + + return loginctl_main(argc, argv, bus); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/login/logind-action.c b/src/login/logind-action.c new file mode 100644 index 0000000..8269f52 --- /dev/null +++ b/src/login/logind-action.c @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "conf-parser.h" +#include "format-util.h" +#include "logind-action.h" +#include "logind-dbus.h" +#include "logind-session-dbus.h" +#include "process-util.h" +#include "special.h" +#include "string-table.h" +#include "terminal-util.h" +#include "user-util.h" + +static const HandleActionData handle_action_data_table[_HANDLE_ACTION_MAX] = { + [HANDLE_POWEROFF] = { + .handle = HANDLE_POWEROFF, + .target = SPECIAL_POWEROFF_TARGET, + .inhibit_what = INHIBIT_SHUTDOWN, + .polkit_action = "org.freedesktop.login1.power-off", + .polkit_action_multiple_sessions = "org.freedesktop.login1.power-off-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.power-off-ignore-inhibit", + .sleep_operation = _SLEEP_OPERATION_INVALID, + .message_id = SD_MESSAGE_SHUTDOWN_STR, + .message = "System is powering down", + .log_verb = "power-off", + }, + [HANDLE_REBOOT] = { + .handle = HANDLE_REBOOT, + .target = SPECIAL_REBOOT_TARGET, + .inhibit_what = INHIBIT_SHUTDOWN, + .polkit_action = "org.freedesktop.login1.reboot", + .polkit_action_multiple_sessions = "org.freedesktop.login1.reboot-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.reboot-ignore-inhibit", + .sleep_operation 
= _SLEEP_OPERATION_INVALID, + .message_id = SD_MESSAGE_SHUTDOWN_STR, + .message = "System is rebooting", + .log_verb = "reboot", + }, + [HANDLE_HALT] = { + .handle = HANDLE_HALT, + .target = SPECIAL_HALT_TARGET, + .inhibit_what = INHIBIT_SHUTDOWN, + .polkit_action = "org.freedesktop.login1.halt", + .polkit_action_multiple_sessions = "org.freedesktop.login1.halt-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.halt-ignore-inhibit", + .sleep_operation = _SLEEP_OPERATION_INVALID, + .message_id = SD_MESSAGE_SHUTDOWN_STR, + .message = "System is halting", + .log_verb = "halt", + }, + [HANDLE_KEXEC] = { + .handle = HANDLE_KEXEC, + .target = SPECIAL_KEXEC_TARGET, + .inhibit_what = INHIBIT_SHUTDOWN, + .polkit_action = "org.freedesktop.login1.reboot", + .polkit_action_multiple_sessions = "org.freedesktop.login1.reboot-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.reboot-ignore-inhibit", + .sleep_operation = _SLEEP_OPERATION_INVALID, + .message_id = SD_MESSAGE_SHUTDOWN_STR, + .message = "System is rebooting with kexec", + .log_verb = "kexec", + }, + [HANDLE_SOFT_REBOOT] = { + .handle = HANDLE_SOFT_REBOOT, + .target = SPECIAL_SOFT_REBOOT_TARGET, + .inhibit_what = INHIBIT_SHUTDOWN, + .polkit_action = "org.freedesktop.login1.reboot", + .polkit_action_multiple_sessions = "org.freedesktop.login1.reboot-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.reboot-ignore-inhibit", + .sleep_operation = _SLEEP_OPERATION_INVALID, + .message_id = SD_MESSAGE_SHUTDOWN_STR, + .message = "System userspace is rebooting", + .log_verb = "soft-reboot", + }, + [HANDLE_SUSPEND] = { + .handle = HANDLE_SUSPEND, + .target = SPECIAL_SUSPEND_TARGET, + .inhibit_what = INHIBIT_SLEEP, + .polkit_action = "org.freedesktop.login1.suspend", + .polkit_action_multiple_sessions = "org.freedesktop.login1.suspend-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.suspend-ignore-inhibit", + 
.sleep_operation = SLEEP_SUSPEND, + }, + [HANDLE_HIBERNATE] = { + .handle = HANDLE_HIBERNATE, + .target = SPECIAL_HIBERNATE_TARGET, + .inhibit_what = INHIBIT_SLEEP, + .polkit_action = "org.freedesktop.login1.hibernate", + .polkit_action_multiple_sessions = "org.freedesktop.login1.hibernate-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.hibernate-ignore-inhibit", + .sleep_operation = SLEEP_HIBERNATE, + }, + [HANDLE_HYBRID_SLEEP] = { + .handle = HANDLE_HYBRID_SLEEP, + .target = SPECIAL_HYBRID_SLEEP_TARGET, + .inhibit_what = INHIBIT_SLEEP, + .polkit_action = "org.freedesktop.login1.hibernate", + .polkit_action_multiple_sessions = "org.freedesktop.login1.hibernate-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.hibernate-ignore-inhibit", + .sleep_operation = SLEEP_HYBRID_SLEEP, + }, + [HANDLE_SUSPEND_THEN_HIBERNATE] = { + .handle = HANDLE_SUSPEND_THEN_HIBERNATE, + .target = SPECIAL_SUSPEND_THEN_HIBERNATE_TARGET, + .inhibit_what = INHIBIT_SLEEP, + .polkit_action = "org.freedesktop.login1.hibernate", + .polkit_action_multiple_sessions = "org.freedesktop.login1.hibernate-multiple-sessions", + .polkit_action_ignore_inhibit = "org.freedesktop.login1.hibernate-ignore-inhibit", + .sleep_operation = SLEEP_SUSPEND_THEN_HIBERNATE, + }, + [HANDLE_FACTORY_RESET] = { + .handle = HANDLE_FACTORY_RESET, + .target = SPECIAL_FACTORY_RESET_TARGET, + .inhibit_what = _INHIBIT_WHAT_INVALID, + .sleep_operation = _SLEEP_OPERATION_INVALID, + .message_id = SD_MESSAGE_FACTORY_RESET_STR, + .message = "System is performing factory reset", + }, +}; + +const HandleActionData* handle_action_lookup(HandleAction action) { + + if (action < 0 || (size_t) action >= ELEMENTSOF(handle_action_data_table)) + return NULL; + + return &handle_action_data_table[action]; +} + +static int handle_action_execute( + Manager *m, + HandleAction handle, + bool ignore_inhibited, + bool is_edge) { + + static const char * const message_table[_HANDLE_ACTION_MAX] = 
{ + [HANDLE_POWEROFF] = "Powering off...", + [HANDLE_REBOOT] = "Rebooting...", + [HANDLE_HALT] = "Halting...", + [HANDLE_KEXEC] = "Rebooting via kexec...", + [HANDLE_SOFT_REBOOT] = "Rebooting userspace...", + [HANDLE_SUSPEND] = "Suspending...", + [HANDLE_HIBERNATE] = "Hibernating...", + [HANDLE_HYBRID_SLEEP] = "Hibernating and suspending...", + [HANDLE_SUSPEND_THEN_HIBERNATE] = "Suspending, then hibernating...", + [HANDLE_FACTORY_RESET] = "Performing factory reset...", + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + InhibitWhat inhibit_operation; + Inhibitor *offending = NULL; + int r; + + assert(m); + + if (handle == HANDLE_KEXEC && access(KEXEC, X_OK) < 0) + return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Requested %s operation not supported, ignoring.", handle_action_to_string(handle)); + + if (m->delayed_action) + return log_debug_errno(SYNTHETIC_ERRNO(EALREADY), + "Action %s already in progress, ignoring requested %s operation.", + handle_action_to_string(m->delayed_action->handle), + handle_action_to_string(handle)); + + inhibit_operation = handle_action_lookup(handle)->inhibit_what; + + /* If the actual operation is inhibited, warn and fail */ + if (inhibit_what_is_valid(inhibit_operation) && + !ignore_inhibited && + manager_is_inhibited(m, inhibit_operation, INHIBIT_BLOCK, NULL, false, false, 0, &offending)) { + _cleanup_free_ char *comm = NULL, *u = NULL; + + (void) pidref_get_comm(&offending->pid, &comm); + u = uid_to_name(offending->uid); + + /* If this is just a recheck of the lid switch then don't warn about anything */ + log_full(is_edge ? LOG_ERR : LOG_DEBUG, + "Refusing %s operation, %s is inhibited by UID "UID_FMT"/%s, PID "PID_FMT"/%s.", + handle_action_to_string(handle), + inhibit_what_to_string(inhibit_operation), + offending->uid, strna(u), + offending->pid.pid, strna(comm)); + + return is_edge ? 
-EPERM : 0; + } + + log_info("%s", message_table[handle]); + + r = bus_manager_shutdown_or_sleep_now_or_later(m, handle_action_lookup(handle), &error); + if (r < 0) + return log_error_errno(r, "Failed to execute %s operation: %s", + handle_action_to_string(handle), + bus_error_message(&error, r)); + + return 1; +} + +static int handle_action_sleep_execute( + Manager *m, + HandleAction handle, + bool ignore_inhibited, + bool is_edge) { + + bool supported; + + assert(m); + assert(HANDLE_ACTION_IS_SLEEP(handle)); + + if (handle == HANDLE_SUSPEND) + supported = sleep_supported(SLEEP_SUSPEND) > 0; + else if (handle == HANDLE_HIBERNATE) + supported = sleep_supported(SLEEP_HIBERNATE) > 0; + else if (handle == HANDLE_HYBRID_SLEEP) + supported = sleep_supported(SLEEP_HYBRID_SLEEP) > 0; + else if (handle == HANDLE_SUSPEND_THEN_HIBERNATE) + supported = sleep_supported(SLEEP_SUSPEND_THEN_HIBERNATE) > 0; + else + assert_not_reached(); + + if (!supported && handle != HANDLE_SUSPEND) { + supported = sleep_supported(SLEEP_SUSPEND) > 0; + if (supported) { + log_notice("Requested %s operation is not supported, using regular suspend instead.", + handle_action_to_string(handle)); + handle = HANDLE_SUSPEND; + } + } + + if (!supported) + return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Requested %s operation not supported, ignoring.", handle_action_to_string(handle)); + + return handle_action_execute(m, handle, ignore_inhibited, is_edge); +} + +int manager_handle_action( + Manager *m, + InhibitWhat inhibit_key, + HandleAction handle, + bool ignore_inhibited, + bool is_edge) { + + assert(m); + assert(handle_action_valid(handle)); + + /* If the key handling is turned off, don't do anything */ + if (handle == HANDLE_IGNORE) { + log_debug("Handling of %s (%s) is disabled, taking no action.", + inhibit_key == 0 ? "idle timeout" : inhibit_what_to_string(inhibit_key), + is_edge ? 
"edge" : "level"); + return 0; + } + + if (inhibit_key == INHIBIT_HANDLE_LID_SWITCH) { + /* If the last system suspend or startup is too close, let's not suspend for now, to give + * USB docking stations some time to settle so that we can properly watch its displays. */ + if (m->lid_switch_ignore_event_source) { + log_debug("Ignoring lid switch request, system startup or resume too close."); + return 0; + } + } + + /* If the key handling is inhibited, don't do anything */ + if (inhibit_key > 0) { + if (manager_is_inhibited(m, inhibit_key, INHIBIT_BLOCK, NULL, true, false, 0, NULL)) { + log_debug("Refusing %s operation, %s is inhibited.", + handle_action_to_string(handle), + inhibit_what_to_string(inhibit_key)); + return 0; + } + } + + /* Locking is handled differently from the rest. */ + if (handle == HANDLE_LOCK) { + if (!is_edge) + return 0; + + log_info("Locking sessions..."); + session_send_lock_all(m, true); + return 1; + } + + if (HANDLE_ACTION_IS_SLEEP(handle)) + return handle_action_sleep_execute(m, handle, ignore_inhibited, is_edge); + + return handle_action_execute(m, handle, ignore_inhibited, is_edge); +} + +static const char* const handle_action_verb_table[_HANDLE_ACTION_MAX] = { + [HANDLE_IGNORE] = "do nothing", + [HANDLE_POWEROFF] = "power off", + [HANDLE_REBOOT] = "reboot", + [HANDLE_HALT] = "halt", + [HANDLE_KEXEC] = "kexec", + [HANDLE_SOFT_REBOOT] = "soft-reboot", + [HANDLE_SUSPEND] = "suspend", + [HANDLE_HIBERNATE] = "hibernate", + [HANDLE_HYBRID_SLEEP] = "enter hybrid sleep", + [HANDLE_SUSPEND_THEN_HIBERNATE] = "suspend and later hibernate", + [HANDLE_FACTORY_RESET] = "perform a factory reset", + [HANDLE_LOCK] = "be locked", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(handle_action_verb, HandleAction); + +/* These strings are sent out by PrepareForShutdownWithMetadata signals as metadata, so the values cannot + * change as they are public APIs. 
*/ +static const char* const handle_action_table[_HANDLE_ACTION_MAX] = { + [HANDLE_IGNORE] = "ignore", + [HANDLE_POWEROFF] = "poweroff", + [HANDLE_REBOOT] = "reboot", + [HANDLE_HALT] = "halt", + [HANDLE_KEXEC] = "kexec", + [HANDLE_SOFT_REBOOT] = "soft-reboot", + [HANDLE_SUSPEND] = "suspend", + [HANDLE_HIBERNATE] = "hibernate", + [HANDLE_HYBRID_SLEEP] = "hybrid-sleep", + [HANDLE_SUSPEND_THEN_HIBERNATE] = "suspend-then-hibernate", + [HANDLE_FACTORY_RESET] = "factory-reset", + [HANDLE_LOCK] = "lock", +}; + +DEFINE_STRING_TABLE_LOOKUP(handle_action, HandleAction); +DEFINE_CONFIG_PARSE_ENUM(config_parse_handle_action, handle_action, HandleAction, "Failed to parse handle action setting"); diff --git a/src/login/logind-action.h b/src/login/logind-action.h new file mode 100644 index 0000000..dbca963 --- /dev/null +++ b/src/login/logind-action.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +typedef enum HandleAction { + HANDLE_IGNORE, + HANDLE_POWEROFF, + _HANDLE_ACTION_SHUTDOWN_FIRST = HANDLE_POWEROFF, + HANDLE_REBOOT, + HANDLE_HALT, + HANDLE_KEXEC, + HANDLE_SOFT_REBOOT, + _HANDLE_ACTION_SHUTDOWN_LAST = HANDLE_SOFT_REBOOT, + HANDLE_SUSPEND, + _HANDLE_ACTION_SLEEP_FIRST = HANDLE_SUSPEND, + HANDLE_HIBERNATE, + HANDLE_HYBRID_SLEEP, + HANDLE_SUSPEND_THEN_HIBERNATE, + _HANDLE_ACTION_SLEEP_LAST = HANDLE_SUSPEND_THEN_HIBERNATE, + HANDLE_LOCK, + HANDLE_FACTORY_RESET, + _HANDLE_ACTION_MAX, + _HANDLE_ACTION_INVALID = -EINVAL, +} HandleAction; + +typedef struct HandleActionData HandleActionData; + +#include "logind-inhibit.h" +#include "logind.h" +#include "sleep-config.h" + +static inline bool handle_action_valid(HandleAction a) { + return a >= 0 && a < _HANDLE_ACTION_MAX; +} + +static inline bool HANDLE_ACTION_IS_SHUTDOWN(HandleAction a) { + return a >= _HANDLE_ACTION_SHUTDOWN_FIRST && a <= _HANDLE_ACTION_SHUTDOWN_LAST; +} + +static inline bool HANDLE_ACTION_IS_SLEEP(HandleAction a) { + return a >= 
/* Static, per-action description, looked up via handle_action_lookup(). The entries live in
 * handle_action_data_table in logind-action.c; fields an entry does not set are zero/NULL. */
struct HandleActionData {
        /* The action this entry describes; in the table it always equals the entry's own index. */
        HandleAction handle;
        /* One of the SPECIAL_*_TARGET constants. Presumably the unit that is started to carry out the action —
         * TODO confirm against the D-Bus side. */
        const char *target;
        /* Inhibitor class checked before the action is executed (INHIBIT_SHUTDOWN or INHIBIT_SLEEP in the
         * table); _INHIBIT_WHAT_INVALID for factory reset, in which case no inhibitor check is done. */
        InhibitWhat inhibit_what;
        /* "org.freedesktop.login1.*" polkit action names. Judging by the names these cover the plain case, the
         * case where other sessions exist, and overriding inhibitors — NOTE(review): confirm with the callers.
         * Not set for factory reset. */
        const char *polkit_action;
        const char *polkit_action_multiple_sessions;
        const char *polkit_action_ignore_inhibit;
        /* Matching SleepOperation for the sleep actions, _SLEEP_OPERATION_INVALID otherwise. */
        SleepOperation sleep_operation;
        /* SD_MESSAGE_*_STR identifier and human-readable text for the log message announcing the action; only
         * set for the shutdown-type actions and factory reset in the table. */
        const char* message_id;
        const char* message;
        /* Short verb naming the action (e.g. "power-off"); only set for the shutdown-type actions. */
        const char* log_verb;
};
Let's coalesce writes to make this + * efficient: whenever we get requests to change brightness while we are still writing to the brightness + * attribute, let's remember the request and restart a new one when the initial operation finished. When we + * get another request while one is ongoing and one is pending we'll replace the pending one with the new + * one. + * + * The bus messages are answered when the first write operation finishes that started either due to the + * request or due to a later request that overrode the requested one. + * + * Yes, this is complex, but I don't see an easier way if we want to be both efficient and still support + * completion notification. */ + +typedef struct BrightnessWriter { + Manager *manager; + + sd_device *device; + char *path; + + pid_t child; + + uint32_t brightness; + bool again; + + Set *current_messages; + Set *pending_messages; + + sd_event_source* child_event_source; +} BrightnessWriter; + +static BrightnessWriter* brightness_writer_free(BrightnessWriter *w) { + if (!w) + return NULL; + + if (w->manager && w->path) + (void) hashmap_remove_value(w->manager->brightness_writers, w->path, w); + + sd_device_unref(w->device); + free(w->path); + + set_free(w->current_messages); + set_free(w->pending_messages); + + w->child_event_source = sd_event_source_unref(w->child_event_source); + + return mfree(w); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(BrightnessWriter*, brightness_writer_free); + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + brightness_writer_hash_ops, + char, + string_hash_func, + string_compare_func, + BrightnessWriter, + brightness_writer_free); + +static void brightness_writer_reply(BrightnessWriter *w, int error) { + int r; + + assert(w); + + for (;;) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + m = set_steal_first(w->current_messages); + if (!m) + break; + + if (error == 0) + r = sd_bus_reply_method_return(m, NULL); + else + r = sd_bus_reply_method_errnof(m, error, "Failed to write to 
brightness device: %m"); + if (r < 0) + log_warning_errno(r, "Failed to send method reply, ignoring: %m"); + } +} + +static int brightness_writer_fork(BrightnessWriter *w); + +static int on_brightness_writer_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { + BrightnessWriter *w = ASSERT_PTR(userdata); + int r; + + assert(s); + assert(si); + + assert(si->si_pid == w->child); + w->child = 0; + w->child_event_source = sd_event_source_unref(w->child_event_source); + + brightness_writer_reply(w, + si->si_code == CLD_EXITED && + si->si_status == EXIT_SUCCESS ? 0 : -EPROTO); + + if (w->again) { + /* Another request to change the brightness has been queued. Act on it, but make the pending + * messages the current ones. */ + w->again = false; + set_free(w->current_messages); + w->current_messages = TAKE_PTR(w->pending_messages); + + r = brightness_writer_fork(w); + if (r >= 0) + return 0; + + brightness_writer_reply(w, r); + } + + brightness_writer_free(w); + return 0; +} + +static int brightness_writer_fork(BrightnessWriter *w) { + int r; + + assert(w); + assert(w->manager); + assert(w->child == 0); + assert(!w->child_event_source); + + r = safe_fork("(sd-bright)", FORK_DEATHSIG_SIGKILL|FORK_REARRANGE_STDIO|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_REOPEN_LOG, &w->child); + if (r < 0) + return r; + if (r == 0) { + char brs[DECIMAL_STR_MAX(uint32_t)+1]; + + /* Child */ + xsprintf(brs, "%" PRIu32, w->brightness); + + r = sd_device_set_sysattr_value(w->device, "brightness", brs); + if (r < 0) { + log_device_error_errno(w->device, r, "Failed to write brightness to device: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + r = sd_event_add_child(w->manager->event, &w->child_event_source, w->child, WEXITED, on_brightness_writer_exit, w); + if (r < 0) + return log_error_errno(r, "Failed to watch brightness writer child " PID_FMT ": %m", w->child); + + return 0; +} + +static int set_add_message(Set **set, sd_bus_message *message) { + int r; + + assert(set); + 
/* Requests that 'brightness' is written to the "brightness" attribute of 'device' by a forked-off child.
 *
 * If a writer for the device's sysfs path already exists, the new value replaces whatever it was asked to
 * write next, 'message' is queued among its pending messages, and the writer is flagged to run once more
 * when the current write completes. Otherwise a new writer is allocated, registered in
 * m->brightness_writers and its child is forked right away.
 *
 * 'message' may be NULL; set_add_message() then does nothing. Returns 0 on success and the (negative) error
 * of the failing step otherwise. */
int manager_write_brightness(
                Manager *m,
                sd_device *device,
                uint32_t brightness,
                sd_bus_message *message) {

        /* Freed automatically on every error return below; ownership is only released at the very end. */
        _cleanup_(brightness_writer_freep) BrightnessWriter *w = NULL;
        BrightnessWriter *existing;
        const char *path;
        int r;

        assert(m);
        assert(device);

        r = sd_device_get_syspath(device, &path);
        if (r < 0)
                return log_device_error_errno(device, r, "Failed to get sysfs path for brightness device: %m");

        existing = hashmap_get(m->brightness_writers, path);
        if (existing) {
                /* There's already a writer for this device. Let's update it with the new brightness, and add
                 * our message to the set of message to reply when done. */

                r = set_add_message(&existing->pending_messages, message);
                if (r < 0)
                        return log_error_errno(r, "Failed to add message to set: %m");

                /* We override any previously requested brightness here: we coalesce writes, and the newest
                 * requested brightness is the one we'll put into effect. */
                existing->brightness = brightness;
                existing->again = true; /* request another iteration of the writer when the current one is
                                         * complete */
                return 0;
        }

        w = new(BrightnessWriter, 1);
        if (!w)
                return log_oom();

        /* Fields not listed here (manager, child, again, message sets, event source) start out zeroed. */
        *w = (BrightnessWriter) {
                .device = sd_device_ref(device),
                .path = strdup(path),
                .brightness = brightness,
        };

        if (!w->path)
                return log_oom();

        r = hashmap_ensure_put(&m->brightness_writers, &brightness_writer_hash_ops, w->path, w);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                return log_error_errno(r, "Failed to add brightness writer to hashmap: %m");

        /* Only set the back-pointer once the writer really is in the hashmap: brightness_writer_free()
         * removes the entry from the hashmap only if w->manager is set. */
        w->manager = m;

        r = set_add_message(&w->current_messages, message);
        if (r < 0)
                return log_error_errno(r, "Failed to add message to set: %m");

        r = brightness_writer_fork(w);
        if (r < 0)
                return r;

        /* From here on the writer is referenced by the hashmap and the child event source; disarm the
         * automatic cleanup. */
        TAKE_PTR(w);
        return 0;
}
*bits, unsigned i) { + return (bits[i / ULONG_BITS] >> (i % ULONG_BITS)) & 1UL; +} + +static void bitset_put(unsigned long *bits, unsigned i) { + bits[i / ULONG_BITS] |= (unsigned long) 1 << (i % ULONG_BITS); +} + +Button* button_new(Manager *m, const char *name) { + Button *b; + + assert(m); + assert(name); + + b = new0(Button, 1); + if (!b) + return NULL; + + b->name = strdup(name); + if (!b->name) + return mfree(b); + + if (hashmap_put(m->buttons, b->name, b) < 0) { + free(b->name); + return mfree(b); + } + + b->manager = m; + b->fd = -EBADF; + + return b; +} + +Button *button_free(Button *b) { + if (!b) + return NULL; + + hashmap_remove(b->manager->buttons, b->name); + + sd_event_source_unref(b->io_event_source); + sd_event_source_unref(b->check_event_source); + + asynchronous_close(b->fd); + + free(b->name); + free(b->seat); + + return mfree(b); +} + +int button_set_seat(Button *b, const char *sn) { + assert(b); + + return free_and_strdup(&b->seat, sn); +} + +static void button_lid_switch_handle_action(Manager *manager, bool is_edge) { + HandleAction handle_action; + + assert(manager); + + /* If we are docked or on external power, handle the lid switch + * differently */ + if (manager_is_docked_or_external_displays(manager)) + handle_action = manager->handle_lid_switch_docked; + else if (handle_action_valid(manager->handle_lid_switch_ep) && manager_is_on_external_power()) + handle_action = manager->handle_lid_switch_ep; + else + handle_action = manager->handle_lid_switch; + + manager_handle_action(manager, INHIBIT_HANDLE_LID_SWITCH, handle_action, manager->lid_switch_ignore_inhibited, is_edge); +} + +static int button_recheck(sd_event_source *e, void *userdata) { + Button *b = ASSERT_PTR(userdata); + + assert(b->lid_closed); + + button_lid_switch_handle_action(b->manager, false); + return 1; +} + +static int button_install_check_event_source(Button *b) { + int r; + assert(b); + + /* Install a post handler, so that we keep rechecking as long as the lid is 
closed. */ + + if (b->check_event_source) + return 0; + + r = sd_event_add_post(b->manager->event, &b->check_event_source, button_recheck, b); + if (r < 0) + return r; + + return sd_event_source_set_priority(b->check_event_source, SD_EVENT_PRIORITY_IDLE+1); +} + +static int long_press_of_power_key_handler(sd_event_source *e, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(e); + + m->power_key_long_press_event_source = sd_event_source_unref(m->power_key_long_press_event_source); + + log_struct(LOG_INFO, + LOG_MESSAGE("Power key pressed long."), + "MESSAGE_ID=" SD_MESSAGE_POWER_KEY_LONG_PRESS_STR); + + manager_handle_action(m, INHIBIT_HANDLE_POWER_KEY, m->handle_power_key_long_press, m->power_key_ignore_inhibited, true); + return 0; +} + +static int long_press_of_reboot_key_handler(sd_event_source *e, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(e); + + m->reboot_key_long_press_event_source = sd_event_source_unref(m->reboot_key_long_press_event_source); + + log_struct(LOG_INFO, + LOG_MESSAGE("Reboot key pressed long."), + "MESSAGE_ID=" SD_MESSAGE_REBOOT_KEY_LONG_PRESS_STR); + + manager_handle_action(m, INHIBIT_HANDLE_REBOOT_KEY, m->handle_reboot_key_long_press, m->reboot_key_ignore_inhibited, true); + return 0; +} + +static int long_press_of_suspend_key_handler(sd_event_source *e, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(e); + + m->suspend_key_long_press_event_source = sd_event_source_unref(m->suspend_key_long_press_event_source); + + log_struct(LOG_INFO, + LOG_MESSAGE("Suspend key pressed long."), + "MESSAGE_ID=" SD_MESSAGE_SUSPEND_KEY_LONG_PRESS_STR); + + manager_handle_action(m, INHIBIT_HANDLE_SUSPEND_KEY, m->handle_suspend_key_long_press, m->suspend_key_ignore_inhibited, true); + return 0; +} + +static int long_press_of_hibernate_key_handler(sd_event_source *e, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(e); + + 
m->hibernate_key_long_press_event_source = sd_event_source_unref(m->hibernate_key_long_press_event_source); + + log_struct(LOG_INFO, + LOG_MESSAGE("Hibernate key pressed long."), + "MESSAGE_ID=" SD_MESSAGE_HIBERNATE_KEY_LONG_PRESS_STR); + + manager_handle_action(m, INHIBIT_HANDLE_HIBERNATE_KEY, m->handle_hibernate_key_long_press, m->hibernate_key_ignore_inhibited, true); + return 0; +} + +static void start_long_press(Manager *m, sd_event_source **e, sd_event_time_handler_t callback) { + int r; + + assert(m); + assert(e); + + if (*e) + return; + + r = sd_event_add_time_relative( + m->event, + e, + CLOCK_MONOTONIC, + LONG_PRESS_DURATION, 0, + callback, m); + if (r < 0) + log_warning_errno(r, "Failed to add long press timer event, ignoring: %m"); +} + +static int button_dispatch(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Button *b = ASSERT_PTR(userdata); + struct input_event ev; + ssize_t l; + + assert(s); + assert(fd == b->fd); + + l = read(b->fd, &ev, sizeof(ev)); + if (l < 0) + return errno != EAGAIN ? -errno : 0; + if ((size_t) l < sizeof(ev)) + return -EIO; + + if (ev.type == EV_KEY && ev.value > 0) { + + switch (ev.code) { + + case KEY_POWER: + case KEY_POWER2: + if (b->manager->handle_power_key_long_press != HANDLE_IGNORE && b->manager->handle_power_key_long_press != b->manager->handle_power_key) { + log_debug("Power key pressed. Further action depends on the key press duration."); + start_long_press(b->manager, &b->manager->power_key_long_press_event_source, long_press_of_power_key_handler); + } else { + log_struct(LOG_INFO, + LOG_MESSAGE("Power key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_POWER_KEY_STR); + manager_handle_action(b->manager, INHIBIT_HANDLE_POWER_KEY, b->manager->handle_power_key, b->manager->power_key_ignore_inhibited, true); + } + break; + + /* The kernel naming is a bit confusing here: + KEY_RESTART was probably introduced for media playback purposes, but + is now being predominantly used to indicate device reboot. 
+ */ + + case KEY_RESTART: + if (b->manager->handle_reboot_key_long_press != HANDLE_IGNORE && b->manager->handle_reboot_key_long_press != b->manager->handle_reboot_key) { + log_debug("Reboot key pressed. Further action depends on the key press duration."); + start_long_press(b->manager, &b->manager->reboot_key_long_press_event_source, long_press_of_reboot_key_handler); + } else { + log_struct(LOG_INFO, + LOG_MESSAGE("Reboot key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_REBOOT_KEY_STR); + manager_handle_action(b->manager, INHIBIT_HANDLE_REBOOT_KEY, b->manager->handle_reboot_key, b->manager->reboot_key_ignore_inhibited, true); + } + break; + + /* The kernel naming is a bit confusing here: + + KEY_SLEEP = suspend-to-ram, which everybody else calls "suspend" + KEY_SUSPEND = suspend-to-disk, which everybody else calls "hibernate" + */ + + case KEY_SLEEP: + if (b->manager->handle_suspend_key_long_press != HANDLE_IGNORE && b->manager->handle_suspend_key_long_press != b->manager->handle_suspend_key) { + log_debug("Suspend key pressed. Further action depends on the key press duration."); + start_long_press(b->manager, &b->manager->suspend_key_long_press_event_source, long_press_of_suspend_key_handler); + } else { + log_struct(LOG_INFO, + LOG_MESSAGE("Suspend key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_SUSPEND_KEY_STR); + manager_handle_action(b->manager, INHIBIT_HANDLE_SUSPEND_KEY, b->manager->handle_suspend_key, b->manager->suspend_key_ignore_inhibited, true); + } + break; + + case KEY_SUSPEND: + if (b->manager->handle_hibernate_key_long_press != HANDLE_IGNORE && b->manager->handle_hibernate_key_long_press != b->manager->handle_hibernate_key) { + log_debug("Hibernate key pressed. 
Further action depends on the key press duration."); + start_long_press(b->manager, &b->manager->hibernate_key_long_press_event_source, long_press_of_hibernate_key_handler); + } else { + log_struct(LOG_INFO, + LOG_MESSAGE("Hibernate key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_HIBERNATE_KEY_STR); + manager_handle_action(b->manager, INHIBIT_HANDLE_HIBERNATE_KEY, b->manager->handle_hibernate_key, b->manager->hibernate_key_ignore_inhibited, true); + } + break; + } + + } else if (ev.type == EV_KEY && ev.value == 0) { + + switch (ev.code) { + + case KEY_POWER: + case KEY_POWER2: + if (b->manager->power_key_long_press_event_source) { + /* Long press event timer is still pending and key release + event happened. This means that key press duration was + insufficient to trigger a long press event + */ + log_struct(LOG_INFO, + LOG_MESSAGE("Power key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_POWER_KEY_STR); + + b->manager->power_key_long_press_event_source = sd_event_source_unref(b->manager->power_key_long_press_event_source); + + manager_handle_action(b->manager, INHIBIT_HANDLE_POWER_KEY, b->manager->handle_power_key, b->manager->power_key_ignore_inhibited, true); + } + break; + + case KEY_RESTART: + if (b->manager->reboot_key_long_press_event_source) { + log_struct(LOG_INFO, + LOG_MESSAGE("Reboot key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_REBOOT_KEY_STR); + + b->manager->reboot_key_long_press_event_source = sd_event_source_unref(b->manager->reboot_key_long_press_event_source); + + manager_handle_action(b->manager, INHIBIT_HANDLE_REBOOT_KEY, b->manager->handle_reboot_key, b->manager->reboot_key_ignore_inhibited, true); + } + break; + + case KEY_SLEEP: + if (b->manager->suspend_key_long_press_event_source) { + log_struct(LOG_INFO, + LOG_MESSAGE("Suspend key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_SUSPEND_KEY_STR); + + b->manager->suspend_key_long_press_event_source = sd_event_source_unref(b->manager->suspend_key_long_press_event_source); + + 
manager_handle_action(b->manager, INHIBIT_HANDLE_SUSPEND_KEY, b->manager->handle_suspend_key, b->manager->suspend_key_ignore_inhibited, true); + } + break; + case KEY_SUSPEND: + if (b->manager->hibernate_key_long_press_event_source) { + log_struct(LOG_INFO, + LOG_MESSAGE("Hibernate key pressed short."), + "MESSAGE_ID=" SD_MESSAGE_HIBERNATE_KEY_STR); + + b->manager->hibernate_key_long_press_event_source = sd_event_source_unref(b->manager->hibernate_key_long_press_event_source); + + manager_handle_action(b->manager, INHIBIT_HANDLE_HIBERNATE_KEY, b->manager->handle_hibernate_key, b->manager->hibernate_key_ignore_inhibited, true); + } + break; + } + + } else if (ev.type == EV_SW && ev.value > 0) { + + if (ev.code == SW_LID) { + log_struct(LOG_INFO, + LOG_MESSAGE("Lid closed."), + "MESSAGE_ID=" SD_MESSAGE_LID_CLOSED_STR); + + b->lid_closed = true; + button_lid_switch_handle_action(b->manager, true); + button_install_check_event_source(b); + manager_send_changed(b->manager, "LidClosed", NULL); + + } else if (ev.code == SW_DOCK) { + log_struct(LOG_INFO, + LOG_MESSAGE("System docked."), + "MESSAGE_ID=" SD_MESSAGE_SYSTEM_DOCKED_STR); + + b->docked = true; + } + + } else if (ev.type == EV_SW && ev.value == 0) { + + if (ev.code == SW_LID) { + log_struct(LOG_INFO, + LOG_MESSAGE("Lid opened."), + "MESSAGE_ID=" SD_MESSAGE_LID_OPENED_STR); + + b->lid_closed = false; + b->check_event_source = sd_event_source_unref(b->check_event_source); + manager_send_changed(b->manager, "LidClosed", NULL); + + } else if (ev.code == SW_DOCK) { + log_struct(LOG_INFO, + LOG_MESSAGE("System undocked."), + "MESSAGE_ID=" SD_MESSAGE_SYSTEM_UNDOCKED_STR); + + b->docked = false; + } + } + + return 0; +} + +static int button_suitable(int fd) { + unsigned long types[CONST_MAX(EV_KEY, EV_SW)/ULONG_BITS+1]; + + assert(fd >= 0); + + if (ioctl(fd, EVIOCGBIT(EV_SYN, sizeof types), types) < 0) + return -errno; + + if (bitset_get(types, EV_KEY)) { + unsigned long keys[CONST_MAX5(KEY_POWER, KEY_POWER2, KEY_SLEEP, 
KEY_SUSPEND, KEY_RESTART)/ULONG_BITS+1]; + + if (ioctl(fd, EVIOCGBIT(EV_KEY, sizeof keys), keys) < 0) + return -errno; + + if (bitset_get(keys, KEY_POWER) || + bitset_get(keys, KEY_POWER2) || + bitset_get(keys, KEY_SLEEP) || + bitset_get(keys, KEY_SUSPEND) || + bitset_get(keys, KEY_RESTART)) + return true; + } + + if (bitset_get(types, EV_SW)) { + unsigned long switches[CONST_MAX(SW_LID, SW_DOCK)/ULONG_BITS+1]; + + if (ioctl(fd, EVIOCGBIT(EV_SW, sizeof switches), switches) < 0) + return -errno; + + if (bitset_get(switches, SW_LID) || + bitset_get(switches, SW_DOCK)) + return true; + } + + return false; +} + +static int button_set_mask(const char *name, int fd) { + unsigned long + types[CONST_MAX(EV_KEY, EV_SW)/ULONG_BITS+1] = {}, + keys[CONST_MAX5(KEY_POWER, KEY_POWER2, KEY_SLEEP, KEY_SUSPEND, KEY_RESTART)/ULONG_BITS+1] = {}, + switches[CONST_MAX(SW_LID, SW_DOCK)/ULONG_BITS+1] = {}; + struct input_mask mask; + + assert(name); + assert(fd >= 0); + + bitset_put(types, EV_KEY); + bitset_put(types, EV_SW); + + mask = (struct input_mask) { + .type = EV_SYN, + .codes_size = sizeof(types), + .codes_ptr = PTR_TO_UINT64(types), + }; + + if (ioctl(fd, EVIOCSMASK, &mask) < 0) + /* Log only at debug level if the kernel doesn't do EVIOCSMASK yet */ + return log_full_errno(IN_SET(errno, ENOTTY, EOPNOTSUPP, EINVAL) ? 
LOG_DEBUG : LOG_WARNING, + errno, "Failed to set EV_SYN event mask on /dev/input/%s: %m", name); + + bitset_put(keys, KEY_POWER); + bitset_put(keys, KEY_POWER2); + bitset_put(keys, KEY_SLEEP); + bitset_put(keys, KEY_SUSPEND); + bitset_put(keys, KEY_RESTART); + + mask = (struct input_mask) { + .type = EV_KEY, + .codes_size = sizeof(keys), + .codes_ptr = PTR_TO_UINT64(keys), + }; + + if (ioctl(fd, EVIOCSMASK, &mask) < 0) + return log_warning_errno(errno, "Failed to set EV_KEY event mask on /dev/input/%s: %m", name); + + bitset_put(switches, SW_LID); + bitset_put(switches, SW_DOCK); + + mask = (struct input_mask) { + .type = EV_SW, + .codes_size = sizeof(switches), + .codes_ptr = PTR_TO_UINT64(switches), + }; + + if (ioctl(fd, EVIOCSMASK, &mask) < 0) + return log_warning_errno(errno, "Failed to set EV_SW event mask on /dev/input/%s: %m", name); + + return 0; +} + +int button_open(Button *b) { + _cleanup_(asynchronous_closep) int fd = -EBADF; + const char *p; + char name[256]; + int r; + + assert(b); + + b->fd = asynchronous_close(b->fd); + + p = strjoina("/dev/input/", b->name); + + fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (fd < 0) + return log_warning_errno(errno, "Failed to open %s: %m", p); + + r = button_suitable(fd); + if (r < 0) + return log_warning_errno(r, "Failed to determine whether input device %s is relevant to us: %m", p); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "Device %s does not expose keys or switches relevant to us, ignoring.", p); + + if (ioctl(fd, EVIOCGNAME(sizeof name), name) < 0) + return log_error_errno(errno, "Failed to get input name for %s: %m", p); + + (void) button_set_mask(b->name, fd); + + b->io_event_source = sd_event_source_unref(b->io_event_source); + r = sd_event_add_io(b->manager->event, &b->io_event_source, fd, EPOLLIN, button_dispatch, b); + if (r < 0) + return log_error_errno(r, "Failed to add button event for %s: %m", p); + + b->fd = TAKE_FD(fd); + log_info("Watching system 
buttons on %s (%s)", p, name); + return 0; +} + +int button_check_switches(Button *b) { + unsigned long switches[CONST_MAX(SW_LID, SW_DOCK)/ULONG_BITS+1] = {}; + assert(b); + + if (b->fd < 0) + return -EINVAL; + + if (ioctl(b->fd, EVIOCGSW(sizeof(switches)), switches) < 0) + return -errno; + + b->lid_closed = bitset_get(switches, SW_LID); + b->docked = bitset_get(switches, SW_DOCK); + manager_send_changed(b->manager, "LidClosed", NULL); + + if (b->lid_closed) + button_install_check_event_source(b); + + return 0; +} diff --git a/src/login/logind-button.h b/src/login/logind-button.h new file mode 100644 index 0000000..6c39471 --- /dev/null +++ b/src/login/logind-button.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Button Button; + +#include "logind.h" + +struct Button { + Manager *manager; + + sd_event_source *io_event_source; + sd_event_source *check_event_source; + + char *name; + char *seat; + int fd; + + bool lid_closed; + bool docked; +}; + +Button* button_new(Manager *m, const char *name); +Button *button_free(Button *b); +int button_open(Button *b); +int button_set_seat(Button *b, const char *sn); +int button_check_switches(Button *b); diff --git a/src/login/logind-core.c b/src/login/logind-core.c new file mode 100644 index 0000000..f15008e --- /dev/null +++ b/src/login/logind-core.c @@ -0,0 +1,850 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "battery-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "conf-parser.h" +#include "device-util.h" +#include "efi-loader.h" +#include "errno-util.h" +#include "fd-util.h" +#include "limits-util.h" +#include "logind.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "udev-util.h" 
+#include "user-util.h" +#include "userdb.h" +#include "utmp-wtmp.h" + +void manager_reset_config(Manager *m) { + assert(m); + + m->n_autovts = 6; + m->reserve_vt = 6; + m->remove_ipc = true; + m->inhibit_delay_max = 5 * USEC_PER_SEC; + m->user_stop_delay = 10 * USEC_PER_SEC; + + m->handle_power_key = HANDLE_POWEROFF; + m->handle_power_key_long_press = HANDLE_IGNORE; + m->handle_reboot_key = HANDLE_REBOOT; + m->handle_reboot_key_long_press = HANDLE_POWEROFF; + m->handle_suspend_key = HANDLE_SUSPEND; + m->handle_suspend_key_long_press = HANDLE_HIBERNATE; + m->handle_hibernate_key = HANDLE_HIBERNATE; + m->handle_hibernate_key_long_press = HANDLE_IGNORE; + + m->handle_lid_switch = HANDLE_SUSPEND; + m->handle_lid_switch_ep = _HANDLE_ACTION_INVALID; + m->handle_lid_switch_docked = HANDLE_IGNORE; + + m->power_key_ignore_inhibited = false; + m->suspend_key_ignore_inhibited = false; + m->hibernate_key_ignore_inhibited = false; + m->lid_switch_ignore_inhibited = true; + m->reboot_key_ignore_inhibited = false; + + m->holdoff_timeout_usec = 30 * USEC_PER_SEC; + + m->idle_action_usec = 30 * USEC_PER_MINUTE; + m->idle_action = HANDLE_IGNORE; + + m->runtime_dir_size = physical_memory_scale(10U, 100U); /* 10% */ + m->runtime_dir_inodes = DIV_ROUND_UP(m->runtime_dir_size, 4096); /* 4k per inode */ + m->sessions_max = 8192; + m->inhibitors_max = 8192; + + m->kill_user_processes = KILL_USER_PROCESSES; + + m->kill_only_users = strv_free(m->kill_only_users); + m->kill_exclude_users = strv_free(m->kill_exclude_users); + + m->stop_idle_session_usec = USEC_INFINITY; +} + +int manager_parse_config_file(Manager *m) { + assert(m); + + return config_parse_config_file("logind.conf", "Login\0", + config_item_perf_lookup, logind_gperf_lookup, + CONFIG_PARSE_WARN, m); +} + +int manager_add_device(Manager *m, const char *sysfs, bool master, Device **ret_device) { + Device *d; + + assert(m); + assert(sysfs); + + d = hashmap_get(m->devices, sysfs); + if (d) + /* we support adding master-flags, but 
not removing them */ + d->master = d->master || master; + else { + d = device_new(m, sysfs, master); + if (!d) + return -ENOMEM; + } + + if (ret_device) + *ret_device = d; + + return 0; +} + +int manager_add_seat(Manager *m, const char *id, Seat **ret_seat) { + Seat *s; + int r; + + assert(m); + assert(id); + + s = hashmap_get(m->seats, id); + if (!s) { + r = seat_new(&s, m, id); + if (r < 0) + return r; + } + + if (ret_seat) + *ret_seat = s; + + return 0; +} + +int manager_add_session(Manager *m, const char *id, Session **ret_session) { + Session *s; + int r; + + assert(m); + assert(id); + + s = hashmap_get(m->sessions, id); + if (!s) { + r = session_new(&s, m, id); + if (r < 0) + return r; + } + + if (ret_session) + *ret_session = s; + + return 0; +} + +int manager_add_user( + Manager *m, + UserRecord *ur, + User **ret_user) { + + User *u; + int r; + + assert(m); + assert(ur); + + u = hashmap_get(m->users, UID_TO_PTR(ur->uid)); + if (!u) { + r = user_new(&u, m, ur); + if (r < 0) + return r; + } + + if (ret_user) + *ret_user = u; + + return 0; +} + +int manager_add_user_by_name( + Manager *m, + const char *name, + User **ret_user) { + + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + int r; + + assert(m); + assert(name); + + r = userdb_by_name(name, USERDB_SUPPRESS_SHADOW, &ur); + if (r < 0) + return r; + + return manager_add_user(m, ur, ret_user); +} + +int manager_add_user_by_uid( + Manager *m, + uid_t uid, + User **ret_user) { + + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + int r; + + assert(m); + assert(uid_is_valid(uid)); + + r = userdb_by_uid(uid, USERDB_SUPPRESS_SHADOW, &ur); + if (r < 0) + return r; + + return manager_add_user(m, ur, ret_user); +} + +int manager_add_inhibitor(Manager *m, const char* id, Inhibitor **ret) { + Inhibitor *i; + int r; + + assert(m); + assert(id); + + i = hashmap_get(m->inhibitors, id); + if (!i) { + r = inhibitor_new(&i, m, id); + if (r < 0) + return r; + } + + if (ret) + *ret = i; + + return 0; +} + +int 
manager_add_button(Manager *m, const char *name, Button **ret_button) { + Button *b; + + assert(m); + assert(name); + + b = hashmap_get(m->buttons, name); + if (!b) { + b = button_new(m, name); + if (!b) + return -ENOMEM; + } + + if (ret_button) + *ret_button = b; + + return 0; +} + +int manager_process_seat_device(Manager *m, sd_device *d) { + Device *device; + int r; + + assert(m); + + if (device_for_action(d, SD_DEVICE_REMOVE) || + sd_device_has_current_tag(d, "seat") <= 0) { + const char *syspath; + + r = sd_device_get_syspath(d, &syspath); + if (r < 0) + return 0; + + device = hashmap_get(m->devices, syspath); + if (!device) + return 0; + + seat_add_to_gc_queue(device->seat); + device_free(device); + + } else { + const char *sn, *syspath; + bool master; + Seat *seat; + + if (sd_device_get_property_value(d, "ID_SEAT", &sn) < 0 || isempty(sn)) + sn = "seat0"; + + if (!seat_name_is_valid(sn)) { + log_device_warning(d, "Device with invalid seat name %s found, ignoring.", sn); + return 0; + } + + seat = hashmap_get(m->seats, sn); + master = sd_device_has_current_tag(d, "master-of-seat") > 0; + + /* Ignore non-master devices for unknown seats */ + if (!master && !seat) + return 0; + + r = sd_device_get_syspath(d, &syspath); + if (r < 0) + return r; + + r = manager_add_device(m, syspath, master, &device); + if (r < 0) + return r; + + if (!seat) { + r = manager_add_seat(m, sn, &seat); + if (r < 0) { + if (!device->seat) + device_free(device); + + return r; + } + } + + device_attach(device, seat); + seat_start(seat); + } + + return 0; +} + +int manager_process_button_device(Manager *m, sd_device *d) { + const char *sysname; + Button *b; + int r; + + assert(m); + + r = sd_device_get_sysname(d, &sysname); + if (r < 0) + return r; + + if (device_for_action(d, SD_DEVICE_REMOVE) || + sd_device_has_current_tag(d, "power-switch") <= 0) + + button_free(hashmap_get(m->buttons, sysname)); + + else { + const char *sn; + + r = manager_add_button(m, sysname, &b); + if (r < 0) + 
return r; + + if (sd_device_get_property_value(d, "ID_SEAT", &sn) < 0 || isempty(sn)) + sn = "seat0"; + + button_set_seat(b, sn); + + r = button_open(b); + if (r < 0) /* event device doesn't have any keys or switches relevant to us? (or any other error + * opening the device?) let's close the button again. */ + button_free(b); + } + + return 0; +} + +int manager_get_session_by_pidref(Manager *m, const PidRef *pid, Session **ret) { + _cleanup_free_ char *unit = NULL; + Session *s; + int r; + + assert(m); + + if (!pidref_is_set(pid)) + return -EINVAL; + + s = hashmap_get(m->sessions_by_leader, pid); + if (s) { + r = pidref_verify(pid); + if (r < 0) + return r; + } else { + r = cg_pidref_get_unit(pid, &unit); + if (r < 0) + return r; + + s = hashmap_get(m->session_units, unit); + } + + if (ret) + *ret = s; + + return !!s; +} + +int manager_get_user_by_pid(Manager *m, pid_t pid, User **ret) { + _cleanup_free_ char *unit = NULL; + User *u = NULL; + int r; + + assert(m); + + if (!pid_is_valid(pid)) + return -EINVAL; + + r = cg_pid_get_slice(pid, &unit); + if (r >= 0) + u = hashmap_get(m->user_units, unit); + + if (ret) + *ret = u; + + return !!u; +} + +int manager_get_idle_hint(Manager *m, dual_timestamp *t) { + Session *s; + bool idle_hint; + dual_timestamp ts = DUAL_TIMESTAMP_NULL; + + assert(m); + + idle_hint = !manager_is_inhibited(m, INHIBIT_IDLE, INHIBIT_BLOCK, t, false, false, 0, NULL); + + HASHMAP_FOREACH(s, m->sessions) { + dual_timestamp k; + int ih; + + ih = session_get_idle_hint(s, &k); + if (ih < 0) + return ih; + + if (!ih) { + if (!idle_hint) { + if (k.monotonic < ts.monotonic) + ts = k; + } else { + idle_hint = false; + ts = k; + } + } else if (idle_hint) { + + if (k.monotonic > ts.monotonic) + ts = k; + } + } + + if (t) + *t = ts; + + return idle_hint; +} + +bool manager_shall_kill(Manager *m, const char *user) { + assert(m); + assert(user); + + if (!m->kill_exclude_users && streq(user, "root")) + return false; + + if 
(strv_contains(m->kill_exclude_users, user)) + return false; + + if (!strv_isempty(m->kill_only_users)) + return strv_contains(m->kill_only_users, user); + + return m->kill_user_processes; +} + +int config_parse_n_autovts( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + unsigned *n = ASSERT_PTR(data); + unsigned o; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = safe_atou(rvalue, &o); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse number of autovts, ignoring: %s", rvalue); + return 0; + } + + if (o > 15) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "A maximum of 15 autovts are supported, ignoring: %s", rvalue); + return 0; + } + + *n = o; + return 0; +} + +static int vt_is_busy(unsigned vtnr) { + struct vt_stat vt_stat; + int r; + _cleanup_close_ int fd = -EBADF; + + assert(vtnr >= 1); + + /* VT_GETSTATE "cannot return state for more than 16 VTs, since v_state is short" */ + assert(vtnr <= 15); + + /* We explicitly open /dev/tty1 here instead of /dev/tty0. If + * we'd open the latter we'd open the foreground tty which + * hence would be unconditionally busy. By opening /dev/tty1 + * we avoid this. Since tty1 is special and needs to be an + * explicitly loaded getty or DM this is safe. 
*/ + + fd = open_terminal("/dev/tty1", O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return -errno; + + if (ioctl(fd, VT_GETSTATE, &vt_stat) < 0) + r = -errno; + else + r = !!(vt_stat.v_state & (1 << vtnr)); + + return r; +} + +int manager_spawn_autovt(Manager *m, unsigned vtnr) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + char name[sizeof("autovt@tty.service") + DECIMAL_STR_MAX(unsigned)]; + int r; + + assert(m); + assert(vtnr >= 1); + + if (vtnr > m->n_autovts && + vtnr != m->reserve_vt) + return 0; + + if (vtnr != m->reserve_vt) { + /* If this is the reserved TTY, we'll start the getty + * on it in any case, but otherwise only if it is not + * busy. */ + + r = vt_is_busy(vtnr); + if (r < 0) + return r; + else if (r > 0) + return -EBUSY; + } + + xsprintf(name, "autovt@tty%u.service", vtnr); + r = bus_call_method(m->bus, bus_systemd_mgr, "StartUnit", &error, NULL, "ss", name, "fail"); + if (r < 0) + return log_error_errno(r, "Failed to start %s: %s", name, bus_error_message(&error, r)); + + return 0; +} + +bool manager_is_lid_closed(Manager *m) { + Button *b; + + HASHMAP_FOREACH(b, m->buttons) + if (b->lid_closed) + return true; + + return false; +} + +static bool manager_is_docked(Manager *m) { + Button *b; + + HASHMAP_FOREACH(b, m->buttons) + if (b->docked) + return true; + + return false; +} + +static int manager_count_external_displays(Manager *m) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r, n = 0; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "drm", true); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + const char *status, *enabled, *dash, *nn, *subsys; + sd_device *p; + + if (sd_device_get_parent(d, &p) < 0) + continue; + + /* If the parent shares the same subsystem as the + * device we are looking at then it is a connector, + * which is what 
we are interested in. */ + if (sd_device_get_subsystem(p, &subsys) < 0 || !streq(subsys, "drm")) + continue; + + if (sd_device_get_sysname(d, &nn) < 0) + continue; + + /* Ignore internal displays: the type is encoded in the sysfs name, as the second dash + * separated item (the first is the card name, the last the connector number). We implement a + * deny list of external displays here, rather than an allow list of internal ones, to ensure + * we don't block suspends too eagerly. */ + dash = strchr(nn, '-'); + if (!dash) + continue; + + dash++; + if (!STARTSWITH_SET(dash, + "VGA-", "DVI-I-", "DVI-D-", "DVI-A-" + "Composite-", "SVIDEO-", "Component-", + "DIN-", "DP-", "HDMI-A-", "HDMI-B-", "TV-")) + continue; + + /* Ignore ports that are not enabled */ + if (sd_device_get_sysattr_value(d, "enabled", &enabled) < 0 || !streq(enabled, "enabled")) + continue; + + /* We count any connector which is not explicitly + * "disconnected" as connected. */ + if (sd_device_get_sysattr_value(d, "status", &status) < 0 || !streq(status, "disconnected")) + n++; + } + + return n; +} + +bool manager_is_docked_or_external_displays(Manager *m) { + int n; + + /* If we are docked don't react to lid closing */ + if (manager_is_docked(m)) { + log_debug("System is docked."); + return true; + } + + /* If we have more than one display connected, + * assume that we are docked. 
*/ + n = manager_count_external_displays(m); + if (n < 0) + log_warning_errno(n, "Display counting failed: %m"); + else if (n >= 1) { + log_debug("External (%i) displays connected.", n); + return true; + } + + return false; +} + +bool manager_is_on_external_power(void) { + int r; + + /* For now we only check for AC power, but 'external power' can apply to anything that isn't an internal + * battery */ + r = on_ac_power(); + if (r < 0) + log_warning_errno(r, "Failed to read AC power status: %m"); + + return r != 0; /* Treat failure as 'on AC' */ +} + +bool manager_all_buttons_ignored(Manager *m) { + assert(m); + + if (m->handle_power_key != HANDLE_IGNORE) + return false; + if (m->handle_power_key_long_press != HANDLE_IGNORE) + return false; + if (m->handle_suspend_key != HANDLE_IGNORE) + return false; + if (m->handle_suspend_key_long_press != HANDLE_IGNORE) + return false; + if (m->handle_hibernate_key != HANDLE_IGNORE) + return false; + if (m->handle_hibernate_key_long_press != HANDLE_IGNORE) + return false; + if (m->handle_reboot_key != HANDLE_IGNORE) + return false; + if (m->handle_reboot_key_long_press != HANDLE_IGNORE) + return false; + if (m->handle_lid_switch != HANDLE_IGNORE) + return false; + if (!IN_SET(m->handle_lid_switch_ep, _HANDLE_ACTION_INVALID, HANDLE_IGNORE)) + return false; + if (m->handle_lid_switch_docked != HANDLE_IGNORE) + return false; + + return true; +} + +int manager_read_utmp(Manager *m) { +#if ENABLE_UTMP + int r; + _unused_ _cleanup_(utxent_cleanup) bool utmpx = false; + + assert(m); + + if (utmpxname(_PATH_UTMPX) < 0) + return log_error_errno(errno, "Failed to set utmp path to " _PATH_UTMPX ": %m"); + + utmpx = utxent_start(); + + for (;;) { + _cleanup_free_ char *t = NULL; + struct utmpx *u; + const char *c; + Session *s; + + errno = 0; + u = getutxent(); + if (!u) { + if (errno == ENOENT) + log_debug_errno(errno, _PATH_UTMPX " does not exist, ignoring."); + else if (errno != 0) + log_warning_errno(errno, "Failed to read " _PATH_UTMPX 
", ignoring: %m"); + return 0; + } + + if (u->ut_type != USER_PROCESS) + continue; + + if (!pid_is_valid(u->ut_pid)) + continue; + + t = strndup(u->ut_line, sizeof(u->ut_line)); + if (!t) + return log_oom(); + + c = path_startswith(t, "/dev/"); + if (c) { + r = free_and_strdup(&t, c); + if (r < 0) + return log_oom(); + } + + if (isempty(t)) + continue; + + if (manager_get_session_by_pidref(m, &PIDREF_MAKE_FROM_PID(u->ut_pid), &s) <= 0) + continue; + + if (s->tty_validity == TTY_FROM_UTMP && !streq_ptr(s->tty, t)) { + /* This may happen on multiplexed SSH connection (i.e. 'SSH connection sharing'). In + * this case PAM and utmp sessions don't match. In such a case let's invalidate the TTY + * information and never acquire it again. */ + + s->tty = mfree(s->tty); + s->tty_validity = TTY_UTMP_INCONSISTENT; + log_debug("Session '%s' has inconsistent TTY information, dropping TTY information.", s->id); + continue; + } + + /* Never override what we figured out once */ + if (s->tty || s->tty_validity >= 0) + continue; + + s->tty = TAKE_PTR(t); + s->tty_validity = TTY_FROM_UTMP; + log_debug("Acquired TTY information '%s' from utmp for session '%s'.", s->tty, s->id); + } + +#else + return 0; +#endif +} + +#if ENABLE_UTMP +static int manager_dispatch_utmp(sd_event_source *s, const struct inotify_event *event, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + /* If there's indication the file itself might have been removed or became otherwise unavailable, then let's + * reestablish the watch on whatever there's now. */ + if ((event->mask & (IN_ATTRIB|IN_DELETE_SELF|IN_MOVE_SELF|IN_Q_OVERFLOW|IN_UNMOUNT)) != 0) + manager_connect_utmp(m); + + (void) manager_read_utmp(m); + return 0; +} +#endif + +void manager_connect_utmp(Manager *m) { +#if ENABLE_UTMP + sd_event_source *s = NULL; + int r; + + assert(m); + + /* Watch utmp for changes via inotify. 
We do this to deal with tools such as ssh, which will register the PAM + * session early, and acquire a TTY only much later for the connection. Thus during PAM the TTY won't be known + * yet. ssh will register itself with utmp when it finally acquired the TTY. Hence, let's make use of this, and + * watch utmp for the TTY asynchronously. We use the PAM session's leader PID as key, to find the right entry. + * + * Yes, relying on utmp is pretty ugly, but it's good enough for informational purposes, as well as idle + * detection (which, for tty sessions, relies on the TTY used) */ + + r = sd_event_add_inotify(m->event, &s, _PATH_UTMPX, IN_MODIFY|IN_MOVE_SELF|IN_DELETE_SELF|IN_ATTRIB, manager_dispatch_utmp, m); + if (r < 0) + log_full_errno(r == -ENOENT ? LOG_DEBUG: LOG_WARNING, r, "Failed to create inotify watch on " _PATH_UTMPX ", ignoring: %m"); + else { + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + log_warning_errno(r, "Failed to adjust utmp event source priority, ignoring: %m"); + + (void) sd_event_source_set_description(s, "utmp"); + } + + sd_event_source_unref(m->utmp_event_source); + m->utmp_event_source = s; +#endif +} + +void manager_reconnect_utmp(Manager *m) { +#if ENABLE_UTMP + assert(m); + + if (m->utmp_event_source) + return; + + manager_connect_utmp(m); +#endif +} + +int manager_read_efi_boot_loader_entries(Manager *m) { +#if ENABLE_EFI + int r; + + assert(m); + if (m->efi_boot_loader_entries_set) + return 0; + + r = efi_loader_get_entries(&m->efi_boot_loader_entries); + if (r < 0) { + if (r == -ENOENT || ERRNO_IS_NOT_SUPPORTED(r)) { + log_debug_errno(r, "Boot loader reported no entries."); + m->efi_boot_loader_entries_set = true; + return 0; + } + return log_error_errno(r, "Failed to determine entries reported by boot loader: %m"); + } + + m->efi_boot_loader_entries_set = true; + return 1; +#else + return 0; +#endif +} diff --git a/src/login/logind-dbus.c b/src/login/logind-dbus.c new file mode 100644 index 
0000000..ec1f2f3 --- /dev/null +++ b/src/login/logind-dbus.c @@ -0,0 +1,4406 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-device.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "audit-util.h" +#include "bootspec.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-get-properties.h" +#include "bus-locator.h" +#include "bus-polkit.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "device-util.h" +#include "dirent-util.h" +#include "efi-api.h" +#include "efi-loader.h" +#include "efivars.h" +#include "env-file.h" +#include "env-util.h" +#include "escape.h" +#include "event-util.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "logind-action.h" +#include "logind-dbus.h" +#include "logind-polkit.h" +#include "logind-seat-dbus.h" +#include "logind-session-dbus.h" +#include "logind-user-dbus.h" +#include "logind.h" +#include "missing_capability.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "reboot-util.h" +#include "selinux-util.h" +#include "sleep-config.h" +#include "special.h" +#include "serialize.h" +#include "stdio-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "user-util.h" +#include "utmp-wtmp.h" +#include "virt.h" +#include "wall.h" + +/* As a random fun fact sysvinit had a 252 (256-(strlen(" \r\n")+1)) + * character limit for the wall message. + * https://git.savannah.nongnu.org/cgit/sysvinit.git/tree/src/shutdown.c#n72 + * There is no real technical need for that but doesn't make sense + * to store arbitrary amounts either. As we are not stingy here, we + * allow 4k. 
+ */ +#define WALL_MESSAGE_MAX 4096U + +#define SHUTDOWN_SCHEDULE_FILE "/run/systemd/shutdown/scheduled" + +static int update_schedule_file(Manager *m); +static void reset_scheduled_shutdown(Manager *m); +static int manager_setup_shutdown_timers(Manager* m); + +static int get_sender_session( + Manager *m, + sd_bus_message *message, + bool consult_display, + sd_bus_error *error, + Session **ret) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Session *session = NULL; + const char *name; + int r; + + /* Acquire the sender's session. This first checks if the sending process is inside a session itself, + * and returns that. If not and 'consult_display' is true, this returns the display session of the + * owning user of the caller. */ + + r = sd_bus_query_sender_creds(message, + SD_BUS_CREDS_SESSION|SD_BUS_CREDS_AUGMENT| + (consult_display ? SD_BUS_CREDS_OWNER_UID : 0), &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_session(creds, &name); + if (r < 0) { + if (r != -ENXIO) + return r; + + if (consult_display) { + uid_t uid; + + r = sd_bus_creds_get_owner_uid(creds, &uid); + if (r < 0) { + if (r != -ENXIO) + return r; + } else { + User *user; + + user = hashmap_get(m->users, UID_TO_PTR(uid)); + if (user) + session = user->display; + } + } + } else + session = hashmap_get(m->sessions, name); + + if (!session) + return sd_bus_error_setf(error, BUS_ERROR_NO_SESSION_FOR_PID, + consult_display ? + "Caller does not belong to any known session and doesn't own any suitable session." 
: + "Caller does not belong to any known session."); + + *ret = session; + return 0; +} + +int manager_get_session_from_creds( + Manager *m, + sd_bus_message *message, + const char *name, + sd_bus_error *error, + Session **ret) { + + Session *session; + + assert(m); + assert(ret); + + if (SEAT_IS_SELF(name)) /* the caller's own session */ + return get_sender_session(m, message, false, error, ret); + if (SEAT_IS_AUTO(name)) /* The caller's own session if they have one, otherwise their user's display session */ + return get_sender_session(m, message, true, error, ret); + + session = hashmap_get(m->sessions, name); + if (!session) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_SESSION, "No session '%s' known", name); + + *ret = session; + return 0; +} + +static int get_sender_user(Manager *m, sd_bus_message *message, sd_bus_error *error, User **ret) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + uid_t uid; + User *user; + int r; + + /* Note that we get the owner UID of the session, not the actual client UID here! 
*/ + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_OWNER_UID|SD_BUS_CREDS_AUGMENT, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_owner_uid(creds, &uid); + if (r < 0) { + if (r != -ENXIO) + return r; + + user = NULL; + } else + user = hashmap_get(m->users, UID_TO_PTR(uid)); + + if (!user) + return sd_bus_error_setf(error, BUS_ERROR_NO_USER_FOR_PID, + "Caller does not belong to any logged in or lingering user"); + + *ret = user; + return 0; +} + +int manager_get_user_from_creds(Manager *m, sd_bus_message *message, uid_t uid, sd_bus_error *error, User **ret) { + User *user; + + assert(m); + assert(ret); + + if (!uid_is_valid(uid)) + return get_sender_user(m, message, error, ret); + + user = hashmap_get(m->users, UID_TO_PTR(uid)); + if (!user) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_USER, + "User ID "UID_FMT" is not logged in or lingering", uid); + + *ret = user; + return 0; +} + +int manager_get_seat_from_creds( + Manager *m, + sd_bus_message *message, + const char *name, + sd_bus_error *error, + Seat **ret) { + + Seat *seat; + int r; + + assert(m); + assert(ret); + + if (SEAT_IS_SELF(name) || SEAT_IS_AUTO(name)) { + Session *session; + + /* Use these special seat names as session names */ + r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + seat = session->seat; + if (!seat) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_SEAT, "Session '%s' has no seat.", session->id); + } else { + seat = hashmap_get(m->seats, name); + if (!seat) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_SEAT, "No seat '%s' known", name); + } + + *ret = seat; + return 0; +} + +static int return_test_polkit( + sd_bus_message *message, + int capability, + const char *action, + const char **details, + uid_t good_user, + sd_bus_error *e) { + + const char *result; + bool challenge; + int r; + + r = bus_test_polkit(message, capability, action, details, good_user, &challenge, e); + if (r < 0) + return r; + + if (r 
> 0) + result = "yes"; + else if (challenge) + result = "challenge"; + else + result = "no"; + + return sd_bus_reply_method_return(message, "s", result); +} + +static int property_get_idle_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", manager_get_idle_hint(m, NULL) > 0); +} + +static int property_get_idle_since_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + dual_timestamp t = DUAL_TIMESTAMP_NULL; + + assert(bus); + assert(reply); + + manager_get_idle_hint(m, &t); + + return sd_bus_message_append(reply, "t", streq(property, "IdleSinceHint") ? t.realtime : t.monotonic); +} + +static int property_get_inhibited( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + InhibitWhat w; + + assert(bus); + assert(reply); + + w = manager_inhibit_what(m, streq(property, "BlockInhibited") ? 
INHIBIT_BLOCK : INHIBIT_DELAY); + + return sd_bus_message_append(reply, "s", inhibit_what_to_string(w)); +} + +static int property_get_preparing( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + bool b = false; + + assert(bus); + assert(reply); + + if (m->delayed_action) { + if (streq(property, "PreparingForShutdown")) + b = m->delayed_action->inhibit_what & INHIBIT_SHUTDOWN; + else + b = m->delayed_action->inhibit_what & INHIBIT_SLEEP; + } + + return sd_bus_message_append(reply, "b", b); +} + +static int property_get_scheduled_shutdown( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "st"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "st", + m->scheduled_shutdown_action ? 
handle_action_to_string(m->scheduled_shutdown_action->handle) : NULL, + m->scheduled_shutdown_timeout); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_handle_action, handle_action, HandleAction); +static BUS_DEFINE_PROPERTY_GET(property_get_docked, "b", Manager, manager_is_docked_or_external_displays); +static BUS_DEFINE_PROPERTY_GET(property_get_lid_closed, "b", Manager, manager_is_lid_closed); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_on_external_power, "b", manager_is_on_external_power()); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_compat_user_tasks_max, "t", CGROUP_LIMIT_MAX); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "t", Hashmap *, (uint64_t) hashmap_size); + +static int method_get_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *name; + Session *session; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + p = session_bus_path(session); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +/* Get login session of a process. This is not what you are looking for these days, + * as apps may instead belong to a user service unit. This includes terminal + * emulators and hence command-line apps. 
*/ +static int method_get_session_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + _cleanup_free_ char *p = NULL; + Session *session = NULL; + pid_t pid; + int r; + + assert(message); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + + r = sd_bus_message_read(message, "u", &pid); + if (r < 0) + return r; + if (pid < 0) + return -EINVAL; + + if (pid == 0) { + r = manager_get_session_from_creds(m, message, NULL, error, &session); + if (r < 0) + return r; + } else { + r = manager_get_session_by_pidref(m, &PIDREF_MAKE_FROM_PID(pid), &session); + if (r < 0) + return r; + + if (!session) + return sd_bus_error_setf(error, BUS_ERROR_NO_SESSION_FOR_PID, + "PID "PID_FMT" does not belong to any known session", pid); + } + + p = session_bus_path(session); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_get_user(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + uint32_t uid; + User *user; + int r; + + assert(message); + + r = sd_bus_message_read(message, "u", &uid); + if (r < 0) + return r; + + r = manager_get_user_from_creds(m, message, uid, error, &user); + if (r < 0) + return r; + + p = user_bus_path(user); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_get_user_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + User *user = NULL; + pid_t pid; + int r; + + assert(message); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + + r = sd_bus_message_read(message, "u", &pid); + if (r < 0) + return r; + if (pid < 0) + return -EINVAL; + + if (pid == 0) { + r = manager_get_user_from_creds(m, message, UID_INVALID, error, &user); + if (r < 0) + return r; + } else { + r = manager_get_user_by_pid(m, pid, &user); + if (r < 0) + 
return r; + if (!user) + return sd_bus_error_setf(error, BUS_ERROR_NO_USER_FOR_PID, + "PID "PID_FMT" does not belong to any logged in user or lingering user", + pid); + } + + p = user_bus_path(user); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_get_seat(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *name; + Seat *seat; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_seat_from_creds(m, message, name, error, &seat); + if (r < 0) + return r; + + p = seat_bus_path(seat); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_list_sessions(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + Session *session; + int r; + + assert(message); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(susso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(session, m->sessions) { + _cleanup_free_ char *p = NULL; + + p = session_bus_path(session); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(susso)", + session->id, + (uint32_t) session->user->user_record->uid, + session->user->user_record->user_name, + session->seat ? 
session->seat->id : "", + p); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_users(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + User *user; + int r; + + assert(message); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(uso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(user, m->users) { + _cleanup_free_ char *p = NULL; + + p = user_bus_path(user); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(uso)", + (uint32_t) user->user_record->uid, + user->user_record->user_name, + p); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_seats(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + Seat *seat; + int r; + + assert(message); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(so)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(seat, m->seats) { + _cleanup_free_ char *p = NULL; + + p = seat_bus_path(seat); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(so)", seat->id, p); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_list_inhibitors(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + Inhibitor *inhibitor; + int r; + + assert(message); + + r = 
sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssssuu)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(inhibitor, m->inhibitors) { + + r = sd_bus_message_append(reply, "(ssssuu)", + strempty(inhibit_what_to_string(inhibitor->what)), + strempty(inhibitor->who), + strempty(inhibitor->why), + strempty(inhibit_mode_to_string(inhibitor->mode)), + (uint32_t) inhibitor->uid, + (uint32_t) inhibitor->pid.pid); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int create_session( + sd_bus_message *message, + void *userdata, + sd_bus_error *error, + uid_t uid, + pid_t pid, + int pidfd, + const char *service, + const char *type, + const char *class, + const char *desktop, + const char *cseat, + uint32_t vtnr, + const char *tty, + const char *display, + int remote, + const char *remote_user, + const char *remote_host, + uint64_t flags) { + + _cleanup_(pidref_done) PidRef leader = PIDREF_NULL; + Manager *m = ASSERT_PTR(userdata); + _cleanup_free_ char *id = NULL; + Session *session = NULL; + uint32_t audit_id = 0; + User *user = NULL; + Seat *seat = NULL; + SessionType t; + SessionClass c; + int r; + + assert(message); + + if (!uid_is_valid(uid)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UID"); + + if (flags != 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero."); + + if (pidfd >= 0) { + r = pidref_set_pidfd(&leader, pidfd); + if (r < 0) + return r; + } else if (pid == 0) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t p; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &p); + if (r < 0) + return r; + + r = pidref_set_pid(&leader, p); + if (r < 0) + return r; + } else { + assert(pid > 0); + + r = pidref_set_pid(&leader, pid); + if 
(r < 0) + return r; + } + + if (leader.pid == 1 || leader.pid == getpid_cached()) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid leader PID"); + + if (isempty(type)) + t = _SESSION_TYPE_INVALID; + else { + t = session_type_from_string(type); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid session type %s", type); + } + + if (isempty(class)) + c = _SESSION_CLASS_INVALID; + else { + c = session_class_from_string(class); + if (c < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid session class %s", class); + } + + if (isempty(desktop)) + desktop = NULL; + else { + if (!string_is_safe(desktop)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid desktop string %s", desktop); + } + + if (isempty(cseat)) + seat = NULL; + else { + seat = hashmap_get(m->seats, cseat); + if (!seat) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_SEAT, + "No seat '%s' known", cseat); + } + + if (tty_is_vc(tty)) { + int v; + + if (!seat) + seat = m->seat0; + else if (seat != m->seat0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "TTY %s is virtual console but seat %s is not seat0", tty, seat->id); + + v = vtnr_from_tty(tty); + if (v <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Cannot determine VT number from virtual console TTY %s", tty); + + if (vtnr == 0) + vtnr = (uint32_t) v; + else if (vtnr != (uint32_t) v) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Specified TTY and VT number do not match"); + + } else if (tty_is_console(tty)) { + + if (!seat) + seat = m->seat0; + else if (seat != m->seat0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Console TTY specified but seat is not seat0"); + + if (vtnr != 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Console TTY specified but VT number is not 0"); + } + + if (seat) { + if (seat_has_vts(seat)) { + if (vtnr <= 0 || vtnr > 63) + return 
sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "VT number out of range"); + } else { + if (vtnr != 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Seat has no VTs but VT number not 0"); + } + } + + if (t == _SESSION_TYPE_INVALID) { + if (!isempty(display)) + t = SESSION_X11; + else if (!isempty(tty)) + t = SESSION_TTY; + else + t = SESSION_UNSPECIFIED; + } + + if (c == _SESSION_CLASS_INVALID) { + if (t == SESSION_UNSPECIFIED) + c = SESSION_BACKGROUND; + else + c = SESSION_USER; + } + + /* Check if we are already in a logind session. Or if we are in user@.service + * which is a special PAM session that avoids creating a logind session. */ + r = manager_get_user_by_pid(m, leader.pid, NULL); + if (r < 0) + return r; + if (r > 0) + return sd_bus_error_setf(error, BUS_ERROR_SESSION_BUSY, + "Already running in a session or user slice"); + + /* + * Old gdm and lightdm start the user-session on the same VT as + * the greeter session. But they destroy the greeter session + * after the user-session and want the user-session to take + * over the VT. We need to support this for + * backwards-compatibility, so make sure we allow new sessions + * on a VT that a greeter is running on. Furthermore, to allow + * re-logins, we have to allow a greeter to take over a used VT for + * the exact same reasons. 
+ */ + if (c != SESSION_GREETER && + vtnr > 0 && + vtnr < MALLOC_ELEMENTSOF(m->seat0->positions) && + m->seat0->positions[vtnr] && + m->seat0->positions[vtnr]->class != SESSION_GREETER) + return sd_bus_error_set(error, BUS_ERROR_SESSION_BUSY, "Already occupied by a session"); + + if (hashmap_size(m->sessions) >= m->sessions_max) + return sd_bus_error_setf(error, SD_BUS_ERROR_LIMITS_EXCEEDED, + "Maximum number of sessions (%" PRIu64 ") reached, refusing further sessions.", + m->sessions_max); + + (void) audit_session_from_pid(leader.pid, &audit_id); + if (audit_session_is_valid(audit_id)) { + /* Keep our session IDs and the audit session IDs in sync */ + + if (asprintf(&id, "%"PRIu32, audit_id) < 0) + return -ENOMEM; + + /* Wut? There's already a session by this name and we didn't find it above? Weird, then let's + * not trust the audit data and let's better register a new ID */ + if (hashmap_contains(m->sessions, id)) { + log_warning("Existing logind session ID %s used by new audit session, ignoring.", id); + audit_id = AUDIT_SESSION_INVALID; + id = mfree(id); + } + } + + if (!id) { + do { + id = mfree(id); + + if (asprintf(&id, "c%" PRIu64, ++m->session_counter) < 0) + return -ENOMEM; + + } while (hashmap_contains(m->sessions, id)); + } + + /* The generated names should not clash with 'auto' or 'self' */ + assert(!SESSION_IS_SELF(id)); + assert(!SESSION_IS_AUTO(id)); + + /* If we are not watching utmp already, try again */ + manager_reconnect_utmp(m); + + r = manager_add_user_by_uid(m, uid, &user); + if (r < 0) + goto fail; + + r = manager_add_session(m, id, &session); + if (r < 0) + goto fail; + + session_set_user(session, user); + r = session_set_leader_consume(session, TAKE_PIDREF(leader)); + if (r < 0) + goto fail; + + session->original_type = session->type = t; + session->class = c; + session->remote = remote; + session->vtnr = vtnr; + + if (!isempty(tty)) { + session->tty = strdup(tty); + if (!session->tty) { + r = -ENOMEM; + goto fail; + } + + 
session->tty_validity = TTY_FROM_PAM; + } + + if (!isempty(display)) { + session->display = strdup(display); + if (!session->display) { + r = -ENOMEM; + goto fail; + } + } + + if (!isempty(remote_user)) { + session->remote_user = strdup(remote_user); + if (!session->remote_user) { + r = -ENOMEM; + goto fail; + } + } + + if (!isempty(remote_host)) { + session->remote_host = strdup(remote_host); + if (!session->remote_host) { + r = -ENOMEM; + goto fail; + } + } + + if (!isempty(service)) { + session->service = strdup(service); + if (!session->service) { + r = -ENOMEM; + goto fail; + } + } + + if (!isempty(desktop)) { + session->desktop = strdup(desktop); + if (!session->desktop) { + r = -ENOMEM; + goto fail; + } + } + + if (seat) { + r = seat_attach_session(seat, session); + if (r < 0) + goto fail; + } + + r = sd_bus_message_enter_container(message, 'a', "(sv)"); + if (r < 0) + goto fail; + + r = session_start(session, message, error); + if (r < 0) + goto fail; + + r = sd_bus_message_exit_container(message); + if (r < 0) + goto fail; + + session->create_message = sd_bus_message_ref(message); + + /* Now, let's wait until the slice unit and stuff got created. We send the reply back from + * session_send_create_reply(). 
*/ + + return 1; + +fail: + if (session) + session_add_to_gc_queue(session); + + if (user) + user_add_to_gc_queue(user); + + return r; +} + +static int method_create_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *service, *type, *class, *cseat, *tty, *display, *remote_user, *remote_host, *desktop; + pid_t leader; + uid_t uid; + int remote; + uint32_t vtnr = 0; + int r; + + assert(message); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + + r = sd_bus_message_read(message, + "uusssssussbss", + &uid, + &leader, + &service, + &type, + &class, + &desktop, + &cseat, + &vtnr, + &tty, + &display, + &remote, + &remote_user, + &remote_host); + if (r < 0) + return r; + + return create_session( + message, + userdata, + error, + uid, + leader, + /* pidfd = */ -EBADF, + service, + type, + class, + desktop, + cseat, + vtnr, + tty, + display, + remote, + remote_user, + remote_host, + /* flags = */ 0); +} + +static int method_create_session_pidfd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *service, *type, *class, *cseat, *tty, *display, *remote_user, *remote_host, *desktop; + int leaderfd = -EBADF; + uid_t uid; + int remote; + uint32_t vtnr = 0; + uint64_t flags; + int r; + + r = sd_bus_message_read(message, + "uhsssssussbsst", + &uid, + &leaderfd, + &service, + &type, + &class, + &desktop, + &cseat, + &vtnr, + &tty, + &display, + &remote, + &remote_user, + &remote_host, + &flags); + if (r < 0) + return r; + + return create_session( + message, + userdata, + error, + uid, + /* pid = */ 0, + leaderfd, + service, + type, + class, + desktop, + cseat, + vtnr, + tty, + display, + remote, + remote_user, + remote_host, + flags); +} + +static int method_release_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Session *session; + const char *name; + int r; + + assert(message); + + r = 
sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + r = session_release(session); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_activate_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Session *session; + const char *name; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + /* PolicyKit is done by bus_session_method_activate() */ + + return bus_session_method_activate(message, session, error); +} + +static int method_activate_session_on_seat(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *session_name, *seat_name; + Manager *m = ASSERT_PTR(userdata); + Session *session; + Seat *seat; + int r; + + assert(message); + + /* Same as ActivateSession() but refuses to work if the seat doesn't match */ + + r = sd_bus_message_read(message, "ss", &session_name, &seat_name); + if (r < 0) + return r; + + r = manager_get_session_from_creds(m, message, session_name, error, &session); + if (r < 0) + return r; + + r = manager_get_seat_from_creds(m, message, seat_name, error, &seat); + if (r < 0) + return r; + + if (session->seat != seat) + return sd_bus_error_setf(error, BUS_ERROR_SESSION_NOT_ON_SEAT, + "Session %s not on seat %s", session_name, seat_name); + + r = check_polkit_chvt(message, m, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_activate(session); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_lock_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Session *session; + const char *name; + 
int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + return bus_session_method_lock(message, session, error); +} + +static int method_lock_sessions(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.lock-sessions", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_send_lock_all(m, streq(sd_bus_message_get_member(message), "LockSessions")); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_kill_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *name; + Manager *m = ASSERT_PTR(userdata); + Session *session; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + return bus_session_method_kill(message, session, error); +} + +static int method_kill_user(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + uint32_t uid; + User *user; + int r; + + assert(message); + + r = sd_bus_message_read(message, "u", &uid); + if (r < 0) + return r; + + r = manager_get_user_from_creds(m, message, uid, error, &user); + if (r < 0) + return r; + + return bus_user_method_kill(message, user, error); +} + +static int method_terminate_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + Session *session; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + 
+ r = manager_get_session_from_creds(m, message, name, error, &session); + if (r < 0) + return r; + + return bus_session_method_terminate(message, session, error); +} + +static int method_terminate_user(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + uint32_t uid; + User *user; + int r; + + assert(message); + + r = sd_bus_message_read(message, "u", &uid); + if (r < 0) + return r; + + r = manager_get_user_from_creds(m, message, uid, error, &user); + if (r < 0) + return r; + + return bus_user_method_terminate(message, user, error); +} + +static int method_terminate_seat(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + Seat *seat; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = manager_get_seat_from_creds(m, message, name, error, &seat); + if (r < 0) + return r; + + return bus_seat_method_terminate(message, seat, error); +} + +static int method_set_user_linger(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_free_ char *cc = NULL; + Manager *m = ASSERT_PTR(userdata); + int r, b, interactive; + struct passwd *pw; + const char *path; + uint32_t uid, auth_uid; + + assert(message); + + r = sd_bus_message_read(message, "ubb", &uid, &b, &interactive); + if (r < 0) + return r; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID | + SD_BUS_CREDS_OWNER_UID|SD_BUS_CREDS_AUGMENT, &creds); + if (r < 0) + return r; + + if (!uid_is_valid(uid)) { + /* Note that we get the owner UID of the session or user unit, + * not the actual client UID here! 
*/ + r = sd_bus_creds_get_owner_uid(creds, &uid); + if (r < 0) + return r; + } + + /* owner_uid is racy, so for authorization we must use euid */ + r = sd_bus_creds_get_euid(creds, &auth_uid); + if (r < 0) + return r; + + errno = 0; + pw = getpwuid(uid); + if (!pw) + return errno_or_else(ENOENT); + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + uid == auth_uid ? "org.freedesktop.login1.set-self-linger" : + "org.freedesktop.login1.set-user-linger", + NULL, + interactive, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + (void) mkdir_p_label("/var/lib/systemd", 0755); + r = mkdir_safe_label("/var/lib/systemd/linger", 0755, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + return r; + + cc = cescape(pw->pw_name); + if (!cc) + return -ENOMEM; + + path = strjoina("/var/lib/systemd/linger/", cc); + if (b) { + User *u; + + r = touch(path); + if (r < 0) + return r; + + if (manager_add_user_by_uid(m, uid, &u) >= 0) + user_start(u); + + } else { + User *u; + + r = unlink(path); + if (r < 0 && errno != ENOENT) + return -errno; + + u = hashmap_get(m->users, UID_TO_PTR(uid)); + if (u) + user_add_to_gc_queue(u); + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int trigger_device(Manager *m, sd_device *parent) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(m); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + if (parent) { + r = sd_device_enumerator_add_match_parent(e, parent); + if (r < 0) + return r; + } + + FOREACH_DEVICE(e, d) { + r = sd_device_trigger(d, SD_DEVICE_CHANGE); + if (r < 0) + log_device_debug_errno(d, r, "Failed to trigger device, ignoring: %m"); + } + + return 0; +} + +static int attach_device(Manager *m, const char *seat, const char *sysfs, sd_bus_error 
*error) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + _cleanup_free_ char *rule = NULL, *file = NULL; + const char *id_for_seat; + int r; + + assert(m); + assert(seat); + assert(sysfs); + + r = sd_device_new_from_syspath(&d, sysfs); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to open device '%s': %m", sysfs); + + if (sd_device_has_current_tag(d, "seat") <= 0) + return sd_bus_error_set_errnof(error, ENODEV, "Device '%s' lacks 'seat' udev tag.", sysfs); + + if (sd_device_get_property_value(d, "ID_FOR_SEAT", &id_for_seat) < 0) + return sd_bus_error_set_errnof(error, ENODEV, "Device '%s' lacks 'ID_FOR_SEAT' udev property.", sysfs); + + if (asprintf(&file, "/etc/udev/rules.d/72-seat-%s.rules", id_for_seat) < 0) + return -ENOMEM; + + if (asprintf(&rule, "TAG==\"seat\", ENV{ID_FOR_SEAT}==\"%s\", ENV{ID_SEAT}=\"%s\"", id_for_seat, seat) < 0) + return -ENOMEM; + + (void) mkdir_p_label("/etc/udev/rules.d", 0755); + r = write_string_file_atomic_label(file, rule); + if (r < 0) + return r; + + return trigger_device(m, d); +} + +static int flush_devices(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + + assert(m); + + d = opendir("/etc/udev/rules.d"); + if (!d) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to open /etc/udev/rules.d: %m"); + } else + FOREACH_DIRENT_ALL(de, d, break) { + if (!dirent_is_file(de)) + continue; + + if (!startswith(de->d_name, "72-seat-")) + continue; + + if (!endswith(de->d_name, ".rules")) + continue; + + if (unlinkat(dirfd(d), de->d_name, 0) < 0) + log_warning_errno(errno, "Failed to unlink %s: %m", de->d_name); + } + + return trigger_device(m, NULL); +} + +static int method_attach_device(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *sysfs, *seat; + Manager *m = ASSERT_PTR(userdata); + int interactive, r; + + assert(message); + + r = sd_bus_message_read(message, "ssb", &seat, &sysfs, &interactive); + if (r < 0) + return r; + + if (!path_is_normalized(sysfs)) + return 
sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", sysfs); + if (!path_startswith(sysfs, "/sys")) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not in /sys", sysfs); + + if (SEAT_IS_SELF(seat) || SEAT_IS_AUTO(seat)) { + Seat *found; + + r = manager_get_seat_from_creds(m, message, seat, error, &found); + if (r < 0) + return r; + + seat = found->id; + + } else if (!seat_name_is_valid(seat)) /* Note that a seat does not have to exist yet for this operation to succeed */ + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Seat name %s is not valid", seat); + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.attach-device", + NULL, + interactive, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = attach_device(m, seat, sysfs, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_flush_devices(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + int interactive, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &interactive); + if (r < 0) + return r; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.flush-devices", + NULL, + interactive, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = flush_devices(m); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int have_multiple_sessions( + Manager *m, + uid_t uid) { + + Session *session; + + assert(m); + + /* Check for other users' sessions. Greeter sessions do not + * count, and non-login sessions do not count either. 
*/ + HASHMAP_FOREACH(session, m->sessions) + if (session->class == SESSION_USER && + session->user->user_record->uid != uid) + return true; + + return false; +} + +static int bus_manager_log_shutdown( + Manager *m, + const HandleActionData *a) { + assert(m); + assert(a); + + const char *message = a->message ?: "System is shutting down"; + const char *log_verb = a->log_verb ? strjoina("SHUTDOWN=", a->log_verb) : NULL; + + return log_struct(LOG_NOTICE, + "MESSAGE_ID=%s", a->message_id ?: SD_MESSAGE_SHUTDOWN_STR, + LOG_MESSAGE("%s%s%s%s.", + message, + m->wall_message ? " (" : "", + strempty(m->wall_message), + m->wall_message ? ")" : ""), + log_verb); +} + +static int lid_switch_ignore_handler(sd_event_source *e, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(e); + + m->lid_switch_ignore_event_source = sd_event_source_unref(m->lid_switch_ignore_event_source); + return 0; +} + +int manager_set_lid_switch_ignore(Manager *m, usec_t until) { + int r; + + assert(m); + + if (until <= now(CLOCK_MONOTONIC)) + return 0; + + /* We want to ignore the lid switch for a while after each + * suspend, and after boot-up. Hence let's install a timer for + * this. As long as the event source exists we ignore the lid + * switch. */ + + if (m->lid_switch_ignore_event_source) { + usec_t u; + + r = sd_event_source_get_time(m->lid_switch_ignore_event_source, &u); + if (r < 0) + return r; + + if (until <= u) + return 0; + + r = sd_event_source_set_time(m->lid_switch_ignore_event_source, until); + } else + r = sd_event_add_time( + m->event, + &m->lid_switch_ignore_event_source, + CLOCK_MONOTONIC, + until, 0, + lid_switch_ignore_handler, m); + + return r; +} + +static int send_prepare_for(Manager *m, const HandleActionData *a, bool _active) { + int k = 0, r, active = _active; + + assert(m); + assert(a); + assert(IN_SET(a->inhibit_what, INHIBIT_SHUTDOWN, INHIBIT_SLEEP)); + + /* We need to send both old and new signal for backward compatibility. 
The newer one allows clients + * to know which type of reboot is going to happen, as they might be doing different actions (e.g.: + * on soft-reboot), and it is sent first, so that clients know that if they receive the old one + * first then they don't have to wait for the new one, as it means it's not supported. So, do not + * change the order here, as it is an API. */ + if (a->inhibit_what == INHIBIT_SHUTDOWN) { + k = sd_bus_emit_signal(m->bus, + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + "PrepareForShutdownWithMetadata", + "ba{sv}", + active, + 1, + "type", + "s", + handle_action_to_string(a->handle)); + if (k < 0) + log_debug_errno(k, "Failed to emit PrepareForShutdownWithMetadata(): %m"); + } + + r = sd_bus_emit_signal(m->bus, + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + a->inhibit_what == INHIBIT_SHUTDOWN ? "PrepareForShutdown" : "PrepareForSleep", + "b", + active); + if (r < 0) + log_debug_errno(r, "Failed to emit PrepareForShutdown(): %m"); + + return RET_GATHER(k, r); +} + +static int execute_shutdown_or_sleep( + Manager *m, + const HandleActionData *a, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *p; + int r; + + assert(m); + assert(a); + + if (a->inhibit_what == INHIBIT_SHUTDOWN) + bus_manager_log_shutdown(m, a); + + r = bus_call_method( + m->bus, + bus_systemd_mgr, + "StartUnit", + error, + &reply, + "ss", a->target, "replace-irreversibly"); + if (r < 0) + goto error; + + r = sd_bus_message_read(reply, "o", &p); + if (r < 0) + goto error; + + r = free_and_strdup(&m->action_job, p); + if (r < 0) + goto error; + + m->delayed_action = a; + + /* Make sure the lid switch is ignored for a while */ + manager_set_lid_switch_ignore(m, usec_add(now(CLOCK_MONOTONIC), m->holdoff_timeout_usec)); + + return 0; + +error: + /* Tell people that they now may take a lock again */ + (void) send_prepare_for(m, a, false); + + return r; +} + +int 
manager_dispatch_delayed(Manager *manager, bool timeout) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Inhibitor *offending = NULL; + int r; + + assert(manager); + + if (!manager->delayed_action || manager->action_job) + return 0; + + if (manager_is_inhibited(manager, manager->delayed_action->inhibit_what, INHIBIT_DELAY, NULL, false, false, 0, &offending)) { + _cleanup_free_ char *comm = NULL, *u = NULL; + + if (!timeout) + return 0; + + (void) pidref_get_comm(&offending->pid, &comm); + u = uid_to_name(offending->uid); + + log_notice("Delay lock is active (UID "UID_FMT"/%s, PID "PID_FMT"/%s) but inhibitor timeout is reached.", + offending->uid, strna(u), + offending->pid.pid, strna(comm)); + } + + /* Actually do the operation */ + r = execute_shutdown_or_sleep(manager, manager->delayed_action, &error); + if (r < 0) { + log_warning("Error during inhibitor-delayed operation (already returned success to client): %s", + bus_error_message(&error, r)); + + manager->delayed_action = NULL; + } + + return 1; /* We did some work. 
*/ +} + +static int manager_inhibit_timeout_handler( + sd_event_source *s, + uint64_t usec, + void *userdata) { + + Manager *manager = ASSERT_PTR(userdata); + + assert(manager->inhibit_timeout_source == s); + + return manager_dispatch_delayed(manager, true); +} + +static int delay_shutdown_or_sleep( + Manager *m, + const HandleActionData *a) { + + int r; + + assert(m); + assert(a); + + if (m->inhibit_timeout_source) { + r = sd_event_source_set_time_relative(m->inhibit_timeout_source, m->inhibit_delay_max); + if (r < 0) + return log_error_errno(r, "sd_event_source_set_time_relative() failed: %m"); + + r = sd_event_source_set_enabled(m->inhibit_timeout_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "sd_event_source_set_enabled() failed: %m"); + } else { + r = sd_event_add_time_relative( + m->event, + &m->inhibit_timeout_source, + CLOCK_MONOTONIC, m->inhibit_delay_max, 0, + manager_inhibit_timeout_handler, m); + if (r < 0) + return r; + } + + m->delayed_action = a; + + return 0; +} + +int bus_manager_shutdown_or_sleep_now_or_later( + Manager *m, + const HandleActionData *a, + sd_bus_error *error) { + + _cleanup_free_ char *load_state = NULL; + bool delayed; + int r; + + assert(m); + assert(a); + assert(!m->action_job); + + r = unit_load_state(m->bus, a->target, &load_state); + if (r < 0) + return r; + + if (!streq(load_state, "loaded")) + return log_notice_errno(SYNTHETIC_ERRNO(EACCES), + "Unit %s is %s, refusing operation.", + a->target, load_state); + + /* Tell everybody to prepare for shutdown/sleep */ + (void) send_prepare_for(m, a, true); + + delayed = + m->inhibit_delay_max > 0 && + manager_is_inhibited(m, a->inhibit_what, INHIBIT_DELAY, NULL, false, false, 0, NULL); + + if (delayed) + /* Shutdown is delayed, keep in mind what we + * want to do, and start a timeout */ + r = delay_shutdown_or_sleep(m, a); + else + /* Shutdown is not delayed, execute it + * immediately */ + r = execute_shutdown_or_sleep(m, a, error); + + return r; +} + +static 
int verify_shutdown_creds( + Manager *m, + sd_bus_message *message, + const HandleActionData *a, + uint64_t flags, + sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + bool multiple_sessions, blocked, interactive; + uid_t uid; + int r; + + assert(m); + assert(a); + assert(message); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + r = have_multiple_sessions(m, uid); + if (r < 0) + return r; + + multiple_sessions = r > 0; + blocked = manager_is_inhibited(m, a->inhibit_what, INHIBIT_BLOCK, NULL, false, true, uid, NULL); + interactive = flags & SD_LOGIND_INTERACTIVE; + + if (multiple_sessions) { + r = bus_verify_polkit_async( + message, + CAP_SYS_BOOT, + a->polkit_action_multiple_sessions, + NULL, + interactive, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + } + + if (blocked) { + /* We don't check polkit for root here, because you can't be more privileged than root */ + if (uid == 0 && (flags & SD_LOGIND_ROOT_CHECK_INHIBITORS)) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, + "Access denied to root due to active block inhibitor"); + + r = bus_verify_polkit_async(message, + CAP_SYS_BOOT, + a->polkit_action_ignore_inhibit, + NULL, + interactive, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + } + + if (!multiple_sessions && !blocked) { + r = bus_verify_polkit_async(message, + CAP_SYS_BOOT, + a->polkit_action, + NULL, + interactive, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has 
it */ + } + + return 0; +} + +static int setup_wall_message_timer(Manager *m, sd_bus_message* message) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + int r; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_AUGMENT|SD_BUS_CREDS_TTY|SD_BUS_CREDS_UID, &creds); + if (r >= 0) { + const char *tty = NULL; + + (void) sd_bus_creds_get_uid(creds, &m->scheduled_shutdown_uid); + (void) sd_bus_creds_get_tty(creds, &tty); + + r = free_and_strdup(&m->scheduled_shutdown_tty, tty); + if (r < 0) + return log_oom(); + } + + r = manager_setup_wall_message_timer(m); + if (r < 0) + return r; + + return 0; +} + +static int method_do_shutdown_or_sleep( + Manager *m, + sd_bus_message *message, + const HandleActionData *a, + bool with_flags, + sd_bus_error *error) { + + uint64_t flags; + int r; + + assert(m); + assert(message); + assert(a); + + if (with_flags) { + /* New style method: with flags parameter (and interactive bool in the bus message header) */ + r = sd_bus_message_read(message, "t", &flags); + if (r < 0) + return r; + if ((flags & ~SD_LOGIND_SHUTDOWN_AND_SLEEP_FLAGS_PUBLIC) != 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid flags parameter"); + + if (FLAGS_SET(flags, (SD_LOGIND_REBOOT_VIA_KEXEC|SD_LOGIND_SOFT_REBOOT))) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "Both reboot via kexec and soft reboot selected, which is not supported"); + + if (a->handle != HANDLE_REBOOT) { + if (flags & SD_LOGIND_REBOOT_VIA_KEXEC) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "Reboot via kexec option is only applicable with reboot operations"); + if ((flags & SD_LOGIND_SOFT_REBOOT) || (flags & SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "Soft reboot option is only applicable with reboot operations"); + } + } else { + /* Old style method: no flags parameter, but interactive bool passed as boolean in + * payload. 
Let's convert this argument to the new-style flags parameter for our internal + * use. */ + int interactive; + + r = sd_bus_message_read(message, "b", &interactive); + if (r < 0) + return r; + + flags = interactive ? SD_LOGIND_INTERACTIVE : 0; + } + + if ((flags & SD_LOGIND_SOFT_REBOOT) || + ((flags & SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP) && path_is_os_tree("/run/nextroot") > 0)) + a = handle_action_lookup(HANDLE_SOFT_REBOOT); + else if ((flags & SD_LOGIND_REBOOT_VIA_KEXEC) && kexec_loaded()) + a = handle_action_lookup(HANDLE_KEXEC); + + /* Don't allow multiple jobs being executed at the same time */ + if (m->delayed_action) + return sd_bus_error_setf(error, BUS_ERROR_OPERATION_IN_PROGRESS, + "There's already a shutdown or sleep operation in progress"); + + if (a->sleep_operation >= 0) { + SleepSupport support; + + r = sleep_supported_full(a->sleep_operation, &support); + if (r < 0) + return r; + if (r == 0) + switch (support) { + + case SLEEP_DISABLED: + return sd_bus_error_setf(error, BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, + "Sleep verb '%s' is disabled by config", + sleep_operation_to_string(a->sleep_operation)); + + case SLEEP_NOT_CONFIGURED: + case SLEEP_STATE_OR_MODE_NOT_SUPPORTED: + case SLEEP_ALARM_NOT_SUPPORTED: + return sd_bus_error_setf(error, BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, + "Sleep verb '%s' is not configured or configuration is not supported by kernel", + sleep_operation_to_string(a->sleep_operation)); + + case SLEEP_RESUME_NOT_SUPPORTED: + return sd_bus_error_set(error, BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, + "Not running on EFI and resume= is not set. 
No available method to resume from hibernation"); + + case SLEEP_NOT_ENOUGH_SWAP_SPACE: + return sd_bus_error_set(error, BUS_ERROR_SLEEP_VERB_NOT_SUPPORTED, + "Not enough suitable swap space for hibernation available on compatible block devices and file systems"); + + default: + assert_not_reached(); + + } + } + + r = verify_shutdown_creds(m, message, a, flags, error); + if (r != 0) + return r; + + /* reset case we're shorting a scheduled shutdown */ + m->unlink_nologin = false; + reset_scheduled_shutdown(m); + + m->scheduled_shutdown_timeout = 0; + m->scheduled_shutdown_action = a; + + (void) setup_wall_message_timer(m, message); + + r = bus_manager_shutdown_or_sleep_now_or_later(m, a, error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_POWEROFF), + sd_bus_message_is_method_call(message, NULL, "PowerOffWithFlags"), + error); +} + +static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_REBOOT), + sd_bus_message_is_method_call(message, NULL, "RebootWithFlags"), + error); +} + +static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_HALT), + sd_bus_message_is_method_call(message, NULL, "HaltWithFlags"), + error); +} + +static int method_suspend(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_SUSPEND), + sd_bus_message_is_method_call(message, NULL, "SuspendWithFlags"), + error); +} + +static int method_hibernate(sd_bus_message 
*message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_HIBERNATE), + sd_bus_message_is_method_call(message, NULL, "HibernateWithFlags"), + error); +} + +static int method_hybrid_sleep(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_HYBRID_SLEEP), + sd_bus_message_is_method_call(message, NULL, "HybridSleepWithFlags"), + error); +} + +static int method_suspend_then_hibernate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_do_shutdown_or_sleep( + m, message, + handle_action_lookup(HANDLE_SUSPEND_THEN_HIBERNATE), + sd_bus_message_is_method_call(message, NULL, "SuspendThenHibernateWithFlags"), + error); +} + +static int nologin_timeout_handler( + sd_event_source *s, + uint64_t usec, + void *userdata) { + + Manager *m = userdata; + + log_info("Creating /run/nologin, blocking further logins..."); + + m->unlink_nologin = + create_shutdown_run_nologin_or_warn() >= 0; + + return 0; +} + +static usec_t nologin_timeout_usec(usec_t elapse) { + /* Issue /run/nologin five minutes before shutdown */ + return LESS_BY(elapse, 5 * USEC_PER_MINUTE); +} + +void manager_load_scheduled_shutdown(Manager *m) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *usec = NULL, + *warn_wall = NULL, + *mode = NULL, + *wall_message = NULL, + *uid = NULL, + *tty = NULL; + int r; + + assert(m); + + r = parse_env_file(f, SHUTDOWN_SCHEDULE_FILE, + "USEC", &usec, + "WARN_WALL", &warn_wall, + "MODE", &mode, + "WALL_MESSAGE", &wall_message, + "UID", &uid, + "TTY", &tty); + + /* reset will delete the file */ + reset_scheduled_shutdown(m); + + if (r == -ENOENT) + return; + if (r < 0) + return (void) log_debug_errno(r, "Failed to parse " SHUTDOWN_SCHEDULE_FILE ": %m"); + + HandleAction handle = 
handle_action_from_string(mode); + if (handle < 0) + return (void) log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse scheduled shutdown type: %s", mode); + + if (!usec) + return (void) log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "USEC is required"); + if (deserialize_usec(usec, &m->scheduled_shutdown_timeout) < 0) + return; + + /* assign parsed type only after we know usec is also valid */ + m->scheduled_shutdown_action = handle_action_lookup(handle); + + if (warn_wall) { + r = parse_boolean(warn_wall); + if (r < 0) + log_debug_errno(r, "Failed to parse enabling wall messages"); + else + m->enable_wall_messages = r; + } + + if (wall_message) { + _cleanup_free_ char *unescaped = NULL; + r = cunescape(wall_message, 0, &unescaped); + if (r < 0) + log_debug_errno(r, "Failed to parse wall message: %s", wall_message); + else + free_and_replace(m->wall_message, unescaped); + } + + if (uid) { + r = parse_uid(uid, &m->scheduled_shutdown_uid); + if (r < 0) + log_debug_errno(r, "Failed to parse wall uid: %s", uid); + } + + free_and_replace(m->scheduled_shutdown_tty, tty); + + r = manager_setup_shutdown_timers(m); + if (r < 0) + return reset_scheduled_shutdown(m); + + (void) manager_setup_wall_message_timer(m); + (void) update_schedule_file(m); + + return; +} + +static int update_schedule_file(Manager *m) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(m); + assert(m->scheduled_shutdown_action); + + r = mkdir_parents_label(SHUTDOWN_SCHEDULE_FILE, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create shutdown subdirectory: %m"); + + r = fopen_temporary(SHUTDOWN_SCHEDULE_FILE, &f, &temp_path); + if (r < 0) + return log_error_errno(r, "Failed to save information about scheduled shutdowns: %m"); + + (void) fchmod(fileno(f), 0644); + + serialize_usec(f, "USEC", m->scheduled_shutdown_timeout); + serialize_item_format(f, "WARN_WALL", "%s", one_zero(m->enable_wall_messages)); + serialize_item_format(f, 
"MODE", "%s", handle_action_to_string(m->scheduled_shutdown_action->handle)); + serialize_item_format(f, "UID", UID_FMT, m->scheduled_shutdown_uid); + + if (m->scheduled_shutdown_tty) + serialize_item_format(f, "TTY", "%s", m->scheduled_shutdown_tty); + + if (!isempty(m->wall_message)) { + r = serialize_item_escaped(f, "WALL_MESSAGE", m->wall_message); + if (r < 0) + goto fail; + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, SHUTDOWN_SCHEDULE_FILE) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + return 0; + +fail: + (void) unlink(SHUTDOWN_SCHEDULE_FILE); + + return log_error_errno(r, "Failed to write information about scheduled shutdowns: %m"); +} + +static void reset_scheduled_shutdown(Manager *m) { + assert(m); + + m->scheduled_shutdown_timeout_source = sd_event_source_unref(m->scheduled_shutdown_timeout_source); + m->wall_message_timeout_source = sd_event_source_unref(m->wall_message_timeout_source); + m->nologin_timeout_source = sd_event_source_unref(m->nologin_timeout_source); + + m->scheduled_shutdown_action = NULL; + m->scheduled_shutdown_timeout = USEC_INFINITY; + m->scheduled_shutdown_uid = UID_INVALID; + m->scheduled_shutdown_tty = mfree(m->scheduled_shutdown_tty); + m->shutdown_dry_run = false; + + if (m->unlink_nologin) { + (void) unlink_or_warn("/run/nologin"); + m->unlink_nologin = false; + } + + (void) unlink(SHUTDOWN_SCHEDULE_FILE); +} + +static int manager_scheduled_shutdown_handler( + sd_event_source *s, + uint64_t usec, + void *userdata) { + + const HandleActionData *a = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + a = m->scheduled_shutdown_action; + assert(a); + + /* Don't allow multiple jobs being executed at the same time */ + if (m->delayed_action) { + r = -EALREADY; + log_error("Scheduled shutdown to %s failed: shutdown or sleep operation already in progress", a->target); + goto error; + } + + if 
(m->shutdown_dry_run) { + /* We do not process delay inhibitors here. Otherwise, we + * would have to be considered "in progress" (like the check + * above) for some seconds after our admin has seen the final + * wall message. */ + + bus_manager_log_shutdown(m, a); + log_info("Running in dry run, suppressing action."); + reset_scheduled_shutdown(m); + + return 0; + } + + r = bus_manager_shutdown_or_sleep_now_or_later(m, m->scheduled_shutdown_action, &error); + if (r < 0) { + log_error_errno(r, "Scheduled shutdown to %s failed: %m", a->target); + goto error; + } + + return 0; + +error: + reset_scheduled_shutdown(m); + return r; +} + +static int method_schedule_shutdown(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + HandleAction handle; + const HandleActionData *a; + uint64_t elapse; + char *type; + int r; + bool dry_run = false; + + assert(message); + + r = sd_bus_message_read(message, "st", &type, &elapse); + if (r < 0) + return r; + + if (startswith(type, "dry-")) { + type += 4; + dry_run = true; + } + + handle = handle_action_from_string(type); + if (!HANDLE_ACTION_IS_SHUTDOWN(handle)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unsupported shutdown type: %s", type); + + a = handle_action_lookup(handle); + assert(a); + assert(a->polkit_action); + + r = verify_shutdown_creds(m, message, a, 0, error); + if (r != 0) + return r; + + m->scheduled_shutdown_action = a; + m->shutdown_dry_run = dry_run; + m->scheduled_shutdown_timeout = elapse; + + r = manager_setup_shutdown_timers(m); + if (r < 0) + return r; + + r = setup_wall_message_timer(m, message); + if (r >= 0) + r = update_schedule_file(m); + + if (r < 0) { + reset_scheduled_shutdown(m); + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int manager_setup_shutdown_timers(Manager* m) { + int r; + + r = event_reset_time(m->event, &m->scheduled_shutdown_timeout_source, + CLOCK_REALTIME, + 
m->scheduled_shutdown_timeout, 0, + manager_scheduled_shutdown_handler, m, + 0, "scheduled-shutdown-timeout", true); + if (r < 0) + goto fail; + + r = event_reset_time(m->event, &m->nologin_timeout_source, + CLOCK_REALTIME, + nologin_timeout_usec(m->scheduled_shutdown_timeout), 0, + nologin_timeout_handler, m, + 0, "nologin-timeout", true); + if (r < 0) + goto fail; + + return 0; + +fail: + m->scheduled_shutdown_timeout_source = sd_event_source_unref(m->scheduled_shutdown_timeout_source); + m->nologin_timeout_source = sd_event_source_unref(m->nologin_timeout_source); + + return r; +} + +static int method_cancel_scheduled_shutdown(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + const HandleActionData *a; + bool cancelled; + int r; + + assert(message); + + cancelled = m->scheduled_shutdown_action + && !IN_SET(m->scheduled_shutdown_action->handle, HANDLE_IGNORE, _HANDLE_ACTION_INVALID); + if (!cancelled) + return sd_bus_reply_method_return(message, "b", false); + + a = m->scheduled_shutdown_action; + if (!a->polkit_action) + return sd_bus_error_set(error, SD_BUS_ERROR_AUTH_FAILED, "Unsupported shutdown type"); + + r = bus_verify_polkit_async( + message, + CAP_SYS_BOOT, + a->polkit_action, + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + if (m->enable_wall_messages) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *tty = NULL; + uid_t uid = 0; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_AUGMENT|SD_BUS_CREDS_TTY|SD_BUS_CREDS_UID, &creds); + if (r >= 0) { + (void) sd_bus_creds_get_uid(creds, &uid); + (void) sd_bus_creds_get_tty(creds, &tty); + } + + _cleanup_free_ char *username = uid_to_name(uid); + + log_struct(LOG_INFO, + LOG_MESSAGE("System shutdown has been cancelled"), + "ACTION=%s", 
handle_action_to_string(a->handle), + "MESSAGE_ID=" SD_MESSAGE_SHUTDOWN_CANCELED_STR, + username ? "OPERATOR=%s" : NULL, username); + + (void) wall("System shutdown has been cancelled", + username, tty, logind_wall_tty_filter, m); + } + + reset_scheduled_shutdown(m); + + return sd_bus_reply_method_return(message, "b", true); +} + +static int method_can_shutdown_or_sleep( + Manager *m, + sd_bus_message *message, + const HandleActionData *a, + sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + bool multiple_sessions, challenge, blocked; + const char *result = NULL; + uid_t uid; + int r; + + assert(m); + assert(message); + assert(a); + + if (a->sleep_operation >= 0) { + SleepSupport support; + + r = sleep_supported_full(a->sleep_operation, &support); + if (r < 0) + return r; + if (r == 0) + return sd_bus_reply_method_return(message, "s", support == SLEEP_DISABLED ? "no" : "na"); + } + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + r = have_multiple_sessions(m, uid); + if (r < 0) + return r; + + multiple_sessions = r > 0; + blocked = manager_is_inhibited(m, a->inhibit_what, INHIBIT_BLOCK, NULL, false, true, uid, NULL); + + HandleAction handle = handle_action_from_string(sleep_operation_to_string(a->sleep_operation)); + if (handle >= 0) { + const char *target; + + target = handle_action_lookup(handle)->target; + if (target) { + _cleanup_free_ char *load_state = NULL; + + r = unit_load_state(m->bus, target, &load_state); + if (r < 0) + return r; + + if (!streq(load_state, "loaded")) { + result = "no"; + goto finish; + } + } + } + + if (multiple_sessions) { + r = bus_test_polkit(message, CAP_SYS_BOOT, a->polkit_action_multiple_sessions, NULL, UID_INVALID, &challenge, error); + if (r < 0) + return r; + + if (r > 0) + result = "yes"; + else if (challenge) + result = "challenge"; + else + result = "no"; + } + + if 
(blocked) { + r = bus_test_polkit(message, CAP_SYS_BOOT, a->polkit_action_ignore_inhibit, NULL, UID_INVALID, &challenge, error); + if (r < 0) + return r; + + if (r > 0) { + if (!result) + result = "yes"; + } else if (challenge) { + if (!result || streq(result, "yes")) + result = "challenge"; + } else + result = "no"; + } + + if (!multiple_sessions && !blocked) { + /* If neither inhibit nor multiple sessions + * apply then just check the normal policy */ + + r = bus_test_polkit(message, CAP_SYS_BOOT, a->polkit_action, NULL, UID_INVALID, &challenge, error); + if (r < 0) + return r; + + if (r > 0) + result = "yes"; + else if (challenge) + result = "challenge"; + else + result = "no"; + } + + finish: + return sd_bus_reply_method_return(message, "s", result); +} + +static int method_can_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_POWEROFF), + error); +} + +static int method_can_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_REBOOT), + error); +} + +static int method_can_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_HALT), + error); +} + +static int method_can_suspend(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_SUSPEND), + error); +} + +static int method_can_hibernate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_HIBERNATE), + error); +} + +static int method_can_hybrid_sleep(sd_bus_message *message, void *userdata, sd_bus_error 
*error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_HYBRID_SLEEP), + error); +} + +static int method_can_suspend_then_hibernate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + + return method_can_shutdown_or_sleep( + m, message, handle_action_lookup(HANDLE_SUSPEND_THEN_HIBERNATE), + error); +} + +static int property_get_reboot_parameter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + _cleanup_free_ char *parameter = NULL; + int r; + + assert(bus); + assert(reply); + assert(userdata); + + r = read_reboot_parameter(¶meter); + if (r < 0) + return r; + + return sd_bus_message_append(reply, "s", parameter); +} + +static int method_set_reboot_parameter( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + const char *arg; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &arg); + if (r < 0) + return r; + + r = detect_container(); + if (r < 0) + return r; + if (r > 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, + "Reboot parameter not supported in containers, refusing."); + + r = bus_verify_polkit_async(message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-parameter", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = update_reboot_parameter_and_warn(arg, false); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_can_reboot_parameter( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _unused_ Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = detect_container(); + if (r < 0) + return r; + if (r > 
0) /* Inside containers, specifying a reboot parameter, doesn't make much sense */ + return sd_bus_reply_method_return(message, "s", "na"); + + return return_test_polkit( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-parameter", + NULL, + UID_INVALID, + error); +} + +static int property_get_reboot_to_firmware_setup( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + int r; + + assert(bus); + assert(reply); + assert(userdata); + + r = getenv_bool("SYSTEMD_REBOOT_TO_FIRMWARE_SETUP"); + if (r == -ENXIO) { + /* EFI case: let's see what is currently configured in the EFI variables */ + r = efi_get_reboot_to_firmware(); + if (r < 0 && r != -EOPNOTSUPP) + log_warning_errno(r, "Failed to determine reboot-to-firmware-setup state: %m"); + } else if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_FIRMWARE_SETUP: %m"); + else if (r > 0) { + /* Non-EFI case: let's see whether /run/systemd/reboot-to-firmware-setup exists. 
*/ + if (access("/run/systemd/reboot-to-firmware-setup", F_OK) < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to check whether /run/systemd/reboot-to-firmware-setup exists: %m"); + + r = false; + } else + r = true; + } + + return sd_bus_message_append(reply, "b", r > 0); +} + +static int method_set_reboot_to_firmware_setup( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + bool use_efi; + int b, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = getenv_bool("SYSTEMD_REBOOT_TO_FIRMWARE_SETUP"); + if (r == -ENXIO) { + /* EFI case: let's see what the firmware supports */ + + r = efi_reboot_to_firmware_supported(); + if (r == -EOPNOTSUPP) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Firmware does not support boot into firmware."); + if (r < 0) + return r; + + use_efi = true; + + } else if (r <= 0) { + /* non-EFI case: $SYSTEMD_REBOOT_TO_FIRMWARE_SETUP is set to off */ + + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_FIRMWARE_SETUP: %m"); + + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Firmware does not support boot into firmware."); + } else + /* non-EFI case: $SYSTEMD_REBOOT_TO_FIRMWARE_SETUP is set to on */ + use_efi = false; + + r = bus_verify_polkit_async(message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-to-firmware-setup", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + if (use_efi) { + r = efi_set_reboot_to_firmware(b); + if (r < 0) + return r; + } else { + if (b) { + r = touch("/run/systemd/reboot-to-firmware-setup"); + if (r < 0) + return r; + } else { + if (unlink("/run/systemd/reboot-to-firmware-setup") < 0 && errno != ENOENT) + return -errno; + } + } + + return sd_bus_reply_method_return(message, 
NULL); +} + +static int method_can_reboot_to_firmware_setup( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _unused_ Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = getenv_bool("SYSTEMD_REBOOT_TO_FIRMWARE_SETUP"); + if (r == -ENXIO) { + /* EFI case: let's see what the firmware supports */ + + r = efi_reboot_to_firmware_supported(); + if (r < 0) { + if (r != -EOPNOTSUPP) + log_warning_errno(r, "Failed to determine whether reboot to firmware is supported: %m"); + + return sd_bus_reply_method_return(message, "s", "na"); + } + + } else if (r <= 0) { + /* Non-EFI case: let's trust $SYSTEMD_REBOOT_TO_FIRMWARE_SETUP */ + + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_FIRMWARE_SETUP: %m"); + + return sd_bus_reply_method_return(message, "s", "na"); + } + + return return_test_polkit( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-to-firmware-setup", + NULL, + UID_INVALID, + error); +} + +static int property_get_reboot_to_boot_loader_menu( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t x = UINT64_MAX; + int r; + + assert(bus); + assert(reply); + assert(userdata); + + r = getenv_bool("SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU"); + if (r == -ENXIO) { + /* EFI case: returns the current value of LoaderConfigTimeoutOneShot. Three cases are distinguished: + * + * 1. Variable not set, boot into boot loader menu is not enabled (we return UINT64_MAX to the user) + * 2. Variable set to "0", boot into boot loader menu is enabled with no timeout (we return 0 to the user) + * 3. 
Variable set to numeric value formatted in ASCII, boot into boot loader menu with the specified timeout in seconds + */ + + r = efi_loader_get_config_timeout_one_shot(&x); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Failed to read LoaderConfigTimeoutOneShot variable, ignoring: %m"); + } + + } else if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU: %m"); + else if (r > 0) { + _cleanup_free_ char *v = NULL; + + /* Non-EFI case, let's process /run/systemd/reboot-to-boot-loader-menu. */ + + r = read_one_line_file("/run/systemd/reboot-to-boot-loader-menu", &v); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Failed to read /run/systemd/reboot-to-boot-loader-menu: %m"); + } else { + r = safe_atou64(v, &x); + if (r < 0) + log_warning_errno(r, "Failed to parse /run/systemd/reboot-to-boot-loader-menu: %m"); + } + } + + return sd_bus_message_append(reply, "t", x); +} + +static int method_set_reboot_to_boot_loader_menu( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + bool use_efi; + uint64_t x; + int r; + + assert(message); + + r = sd_bus_message_read(message, "t", &x); + if (r < 0) + return r; + + r = getenv_bool("SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU"); + if (r == -ENXIO) { + uint64_t features; + + /* EFI case: let's see if booting into boot loader menu is supported. 
*/ + + r = efi_loader_get_features(&features); + if (r < 0) + log_warning_errno(r, "Failed to determine whether reboot to boot loader menu is supported: %m"); + if (r < 0 || !FLAGS_SET(features, EFI_LOADER_FEATURE_CONFIG_TIMEOUT_ONE_SHOT)) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Boot loader does not support boot into boot loader menu."); + + use_efi = true; + + } else if (r <= 0) { + /* non-EFI case: $SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU is set to off */ + + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU: %m"); + + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Boot loader does not support boot into boot loader menu."); + } else + /* non-EFI case: $SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU is set to on */ + use_efi = false; + + r = bus_verify_polkit_async(message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-to-boot-loader-menu", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + if (use_efi) { + if (x == UINT64_MAX) + r = efi_set_variable(EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot), NULL, 0); + else { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + xsprintf(buf, "%" PRIu64, DIV_ROUND_UP(x, USEC_PER_SEC)); /* second granularity */ + + r = efi_set_variable_string(EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot), buf); + } + if (r < 0) + return r; + } else { + if (x == UINT64_MAX) { + if (unlink("/run/systemd/reboot-to-boot-loader-menu") < 0 && errno != ENOENT) + return -errno; + } else { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + + xsprintf(buf, "%" PRIu64, x); /* μs granularity */ + + r = write_string_file_atomic_label("/run/systemd/reboot-to-boot-loader-menu", buf); + if (r < 0) + return r; + } + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_can_reboot_to_boot_loader_menu( + sd_bus_message 
*message, + void *userdata, + sd_bus_error *error) { + + _unused_ Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = getenv_bool("SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU"); + if (r == -ENXIO) { + uint64_t features = 0; + + /* EFI case, let's see if booting into boot loader menu is supported. */ + + r = efi_loader_get_features(&features); + if (r < 0) + log_warning_errno(r, "Failed to determine whether reboot to boot loader menu is supported: %m"); + if (r < 0 || !FLAGS_SET(features, EFI_LOADER_FEATURE_CONFIG_TIMEOUT_ONE_SHOT)) + return sd_bus_reply_method_return(message, "s", "na"); + + } else if (r <= 0) { + /* Non-EFI case: let's trust $SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU */ + + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_BOOT_LOADER_MENU: %m"); + + return sd_bus_reply_method_return(message, "s", "na"); + } + + return return_test_polkit( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-to-boot-loader-menu", + NULL, + UID_INVALID, + error); +} + +static int property_get_reboot_to_boot_loader_entry( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *v = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *x = NULL; + int r; + + assert(bus); + assert(reply); + + r = getenv_bool("SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY"); + if (r == -ENXIO) { + /* EFI case: let's read the LoaderEntryOneShot variable */ + + r = efi_loader_update_entry_one_shot_cache(&m->efi_loader_entry_one_shot, &m->efi_loader_entry_one_shot_stat); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Failed to read LoaderEntryOneShot variable, ignoring: %m"); + } else + x = m->efi_loader_entry_one_shot; + + } else if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY: %m"); + else if (r > 0) { + + /* Non-EFI case, let's process /run/systemd/reboot-to-boot-loader-entry. 
*/ + + r = read_one_line_file("/run/systemd/reboot-to-boot-loader-entry", &v); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Failed to read /run/systemd/reboot-to-boot-loader-entry, ignoring: %m"); + } else if (!efi_loader_entry_name_valid(v)) + log_warning("/run/systemd/reboot-to-boot-loader-entry is not valid, ignoring."); + else + x = v; + } + + return sd_bus_message_append(reply, "s", x); +} + +static int boot_loader_entry_exists(Manager *m, const char *id) { + _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL; + int r; + + assert(m); + assert(id); + + r = boot_config_load_auto(&config, NULL, NULL); + if (r < 0 && r != -ENOKEY) /* don't complain if no GPT is found, hence skip ENOKEY */ + return r; + + r = manager_read_efi_boot_loader_entries(m); + if (r >= 0) + (void) boot_config_augment_from_loader(&config, m->efi_boot_loader_entries, /* auto_only= */ true); + + return !!boot_config_find_entry(&config, id); +} + +static int method_set_reboot_to_boot_loader_entry( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + bool use_efi; + const char *v; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (isempty(v)) + v = NULL; + else if (efi_loader_entry_name_valid(v)) { + r = boot_loader_entry_exists(m, v); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Boot loader entry '%s' is not known.", v); + } else + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Boot loader entry name '%s' is not valid, refusing.", v); + + r = getenv_bool("SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY"); + if (r == -ENXIO) { + uint64_t features; + + /* EFI case: let's see if booting into boot loader entry is supported. 
*/ + + r = efi_loader_get_features(&features); + if (r < 0) + log_warning_errno(r, "Failed to determine whether reboot into boot loader entry is supported: %m"); + if (r < 0 || !FLAGS_SET(features, EFI_LOADER_FEATURE_ENTRY_ONESHOT)) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Loader does not support boot into boot loader entry."); + + use_efi = true; + + } else if (r <= 0) { + /* non-EFI case: $SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY is set to off */ + + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY: %m"); + + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Loader does not support boot into boot loader entry."); + } else + /* non-EFI case: $SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY is set to on */ + use_efi = false; + + r = bus_verify_polkit_async(message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-to-boot-loader-entry", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + if (use_efi) { + if (isempty(v)) + /* Delete item */ + r = efi_set_variable(EFI_LOADER_VARIABLE(LoaderEntryOneShot), NULL, 0); + else + r = efi_set_variable_string(EFI_LOADER_VARIABLE(LoaderEntryOneShot), v); + if (r < 0) + return r; + } else { + if (isempty(v)) { + if (unlink("/run/systemd/reboot-to-boot-loader-entry") < 0 && errno != ENOENT) + return -errno; + } else { + r = write_string_file_atomic_label("/run/systemd/reboot-boot-to-loader-entry", v); + if (r < 0) + return r; + } + } + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_can_reboot_to_boot_loader_entry( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _unused_ Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = getenv_bool("SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY"); + if (r == -ENXIO) { + uint64_t features = 0; + + /* EFI case, let's 
see if booting into boot loader entry is supported. */ + + r = efi_loader_get_features(&features); + if (r < 0) + log_warning_errno(r, "Failed to determine whether reboot to boot loader entry is supported: %m"); + if (r < 0 || !FLAGS_SET(features, EFI_LOADER_FEATURE_ENTRY_ONESHOT)) + return sd_bus_reply_method_return(message, "s", "na"); + + } else if (r <= 0) { + /* Non-EFI case: let's trust $SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY */ + + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_REBOOT_TO_BOOT_LOADER_ENTRY: %m"); + + return sd_bus_reply_method_return(message, "s", "na"); + } + + return return_test_polkit( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-reboot-to-boot-loader-entry", + NULL, + UID_INVALID, + error); +} + +static int property_get_boot_loader_entries( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL; + Manager *m = ASSERT_PTR(userdata); + size_t i; + int r; + + assert(bus); + assert(reply); + + r = boot_config_load_auto(&config, NULL, NULL); + if (r < 0 && r != -ENOKEY) /* don't complain if there's no GPT found */ + return r; + + r = manager_read_efi_boot_loader_entries(m); + if (r >= 0) + (void) boot_config_augment_from_loader(&config, m->efi_boot_loader_entries, /* auto_only= */ true); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + for (i = 0; i < config.n_entries; i++) { + BootEntry *e = config.entries + i; + + r = sd_bus_message_append(reply, "s", e->id); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int method_set_wall_message( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + int r; + Manager *m = ASSERT_PTR(userdata); + char *wall_message; + int enable_wall_messages; + + assert(message); + + r = sd_bus_message_read(message, "sb", &wall_message, 
&enable_wall_messages); + if (r < 0) + return r; + + if (strlen(wall_message) > WALL_MESSAGE_MAX) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Wall message too long, maximum permitted length is %u characters.", + WALL_MESSAGE_MAX); + + /* Short-circuit the operation if the desired state is already in place, to + * avoid an unnecessary polkit permission check. */ + if (streq_ptr(m->wall_message, empty_to_null(wall_message)) && + m->enable_wall_messages == enable_wall_messages) + goto done; + + r = bus_verify_polkit_async(message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.set-wall-message", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = free_and_strdup(&m->wall_message, empty_to_null(wall_message)); + if (r < 0) + return log_oom(); + + m->enable_wall_messages = enable_wall_messages; + + done: + return sd_bus_reply_method_return(message, NULL); +} + +static int method_inhibit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + const char *who, *why, *what, *mode; + _cleanup_free_ char *id = NULL; + _cleanup_close_ int fifo_fd = -EBADF; + Manager *m = ASSERT_PTR(userdata); + InhibitMode mm; + InhibitWhat w; + pid_t pid; + uid_t uid; + int r; + + assert(message); + + r = sd_bus_message_read(message, "ssss", &what, &who, &why, &mode); + if (r < 0) + return r; + + w = inhibit_what_from_string(what); + if (w <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid what specification %s", what); + + mm = inhibit_mode_from_string(mode); + if (mm < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid mode specification %s", mode); + + /* Delay is only supported for shutdown/sleep */ + if (mm == INHIBIT_DELAY && (w & ~(INHIBIT_SHUTDOWN|INHIBIT_SLEEP))) + return sd_bus_error_setf(error, 
SD_BUS_ERROR_INVALID_ARGS, + "Delay inhibitors only supported for shutdown and sleep"); + + /* Don't allow taking delay locks while we are already + * executing the operation. We shouldn't create the impression + * that the lock was successful if the machine is about to go + * down/suspend any moment. */ + if (m->delayed_action && m->delayed_action->inhibit_what & w) + return sd_bus_error_setf(error, BUS_ERROR_OPERATION_IN_PROGRESS, + "The operation inhibition has been requested for is already running"); + + r = bus_verify_polkit_async( + message, + CAP_SYS_BOOT, + w == INHIBIT_SHUTDOWN ? (mm == INHIBIT_BLOCK ? "org.freedesktop.login1.inhibit-block-shutdown" : "org.freedesktop.login1.inhibit-delay-shutdown") : + w == INHIBIT_SLEEP ? (mm == INHIBIT_BLOCK ? "org.freedesktop.login1.inhibit-block-sleep" : "org.freedesktop.login1.inhibit-delay-sleep") : + w == INHIBIT_IDLE ? "org.freedesktop.login1.inhibit-block-idle" : + w == INHIBIT_HANDLE_POWER_KEY ? "org.freedesktop.login1.inhibit-handle-power-key" : + w == INHIBIT_HANDLE_SUSPEND_KEY ? "org.freedesktop.login1.inhibit-handle-suspend-key" : + w == INHIBIT_HANDLE_REBOOT_KEY ? "org.freedesktop.login1.inhibit-handle-reboot-key" : + w == INHIBIT_HANDLE_HIBERNATE_KEY ? 
"org.freedesktop.login1.inhibit-handle-hibernate-key" : + "org.freedesktop.login1.inhibit-handle-lid-switch", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed pin source process "PID_FMT": %m", pid); + + if (hashmap_size(m->inhibitors) >= m->inhibitors_max) + return sd_bus_error_setf(error, SD_BUS_ERROR_LIMITS_EXCEEDED, + "Maximum number of inhibitors (%" PRIu64 ") reached, refusing further inhibitors.", + m->inhibitors_max); + + do { + id = mfree(id); + + if (asprintf(&id, "%" PRIu64, ++m->inhibit_counter) < 0) + return -ENOMEM; + + } while (hashmap_get(m->inhibitors, id)); + + _cleanup_(inhibitor_freep) Inhibitor *i = NULL; + r = manager_add_inhibitor(m, id, &i); + if (r < 0) + return r; + + i->what = w; + i->mode = mm; + i->pid = TAKE_PIDREF(pidref); + i->uid = uid; + i->why = strdup(why); + i->who = strdup(who); + + if (!i->why || !i->who) + return -ENOMEM; + + fifo_fd = inhibitor_create_fifo(i); + if (fifo_fd < 0) + return fifo_fd; + + r = inhibitor_start(i); + if (r < 0) + return r; + TAKE_PTR(i); + + return sd_bus_reply_method_return(message, "h", fifo_fd); +} + +static const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_WRITABLE_PROPERTY("EnableWallMessages", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, enable_wall_messages), 0), + SD_BUS_WRITABLE_PROPERTY("WallMessage", "s", NULL, NULL, offsetof(Manager, wall_message), 0), + + SD_BUS_PROPERTY("NAutoVTs", "u", NULL, offsetof(Manager, n_autovts), 
SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KillOnlyUsers", "as", NULL, offsetof(Manager, kill_only_users), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KillExcludeUsers", "as", NULL, offsetof(Manager, kill_exclude_users), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("KillUserProcesses", "b", bus_property_get_bool, offsetof(Manager, kill_user_processes), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RebootParameter", "s", property_get_reboot_parameter, 0, 0), + SD_BUS_PROPERTY("RebootToFirmwareSetup", "b", property_get_reboot_to_firmware_setup, 0, 0), + SD_BUS_PROPERTY("RebootToBootLoaderMenu", "t", property_get_reboot_to_boot_loader_menu, 0, 0), + SD_BUS_PROPERTY("RebootToBootLoaderEntry", "s", property_get_reboot_to_boot_loader_entry, 0, 0), + SD_BUS_PROPERTY("BootLoaderEntries", "as", property_get_boot_loader_entries, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IdleHint", "b", property_get_idle_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHint", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHintMonotonic", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("BlockInhibited", "s", property_get_inhibited, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("DelayInhibited", "s", property_get_inhibited, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("InhibitDelayMaxUSec", "t", NULL, offsetof(Manager, inhibit_delay_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UserStopDelayUSec", "t", NULL, offsetof(Manager, user_stop_delay), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandlePowerKey", "s", property_get_handle_action, offsetof(Manager, handle_power_key), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandlePowerKeyLongPress", "s", property_get_handle_action, offsetof(Manager, handle_power_key_long_press), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleRebootKey", 
"s", property_get_handle_action, offsetof(Manager, handle_reboot_key), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleRebootKeyLongPress", "s", property_get_handle_action, offsetof(Manager, handle_reboot_key_long_press), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleSuspendKey", "s", property_get_handle_action, offsetof(Manager, handle_suspend_key), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleSuspendKeyLongPress", "s", property_get_handle_action, offsetof(Manager, handle_suspend_key_long_press), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleHibernateKey", "s", property_get_handle_action, offsetof(Manager, handle_hibernate_key), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleHibernateKeyLongPress", "s", property_get_handle_action, offsetof(Manager, handle_hibernate_key_long_press), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleLidSwitch", "s", property_get_handle_action, offsetof(Manager, handle_lid_switch), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleLidSwitchExternalPower", "s", property_get_handle_action, offsetof(Manager, handle_lid_switch_ep), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HandleLidSwitchDocked", "s", property_get_handle_action, offsetof(Manager, handle_lid_switch_docked), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("HoldoffTimeoutUSec", "t", NULL, offsetof(Manager, holdoff_timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IdleAction", "s", property_get_handle_action, offsetof(Manager, idle_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("IdleActionUSec", "t", NULL, offsetof(Manager, idle_action_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PreparingForShutdown", "b", property_get_preparing, 0, 0), + SD_BUS_PROPERTY("PreparingForSleep", "b", property_get_preparing, 0, 0), + SD_BUS_PROPERTY("ScheduledShutdown", "(st)", property_get_scheduled_shutdown, 0, 0), + SD_BUS_PROPERTY("Docked", "b", property_get_docked, 0, 0), + 
SD_BUS_PROPERTY("LidClosed", "b", property_get_lid_closed, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("OnExternalPower", "b", property_get_on_external_power, 0, 0), + SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(Manager, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectorySize", "t", NULL, offsetof(Manager, runtime_dir_size), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimeDirectoryInodesMax", "t", NULL, offsetof(Manager, runtime_dir_inodes), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InhibitorsMax", "t", NULL, offsetof(Manager, inhibitors_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NCurrentInhibitors", "t", property_get_hashmap_size, offsetof(Manager, inhibitors), 0), + SD_BUS_PROPERTY("SessionsMax", "t", NULL, offsetof(Manager, sessions_max), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NCurrentSessions", "t", property_get_hashmap_size, offsetof(Manager, sessions), 0), + SD_BUS_PROPERTY("UserTasksMax", "t", property_get_compat_user_tasks_max, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("StopIdleSessionUSec", "t", NULL, offsetof(Manager, stop_idle_session_usec), SD_BUS_VTABLE_PROPERTY_CONST), + + SD_BUS_METHOD_WITH_ARGS("GetSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_RESULT("o", object_path), + method_get_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetSessionByPID", + SD_BUS_ARGS("u", pid), + SD_BUS_RESULT("o", object_path), + method_get_session_by_pid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUser", + SD_BUS_ARGS("u", uid), + SD_BUS_RESULT("o", object_path), + method_get_user, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUserByPID", + SD_BUS_ARGS("u", pid), + SD_BUS_RESULT("o", object_path), + method_get_user_by_pid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetSeat", + SD_BUS_ARGS("s", seat_id), + SD_BUS_RESULT("o", object_path), + method_get_seat, + 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListSessions", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(susso)", sessions), + method_list_sessions, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListUsers", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(uso)", users), + method_list_users, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListSeats", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(so)", seats), + method_list_seats, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListInhibitors", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(ssssuu)", inhibitors), + method_list_inhibitors, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CreateSession", + SD_BUS_ARGS("u", uid, + "u", pid, + "s", service, + "s", type, + "s", class, + "s", desktop, + "s", seat_id, + "u", vtnr, + "s", tty, + "s", display, + "b", remote, + "s", remote_user, + "s", remote_host, + "a(sv)", properties), + SD_BUS_RESULT("s", session_id, + "o", object_path, + "s", runtime_path, + "h", fifo_fd, + "u", uid, + "s", seat_id, + "u", vtnr, + "b", existing), + method_create_session, + 0), + SD_BUS_METHOD_WITH_ARGS("CreateSessionWithPIDFD", + SD_BUS_ARGS("u", uid, + "h", pidfd, + "s", service, + "s", type, + "s", class, + "s", desktop, + "s", seat_id, + "u", vtnr, + "s", tty, + "s", display, + "b", remote, + "s", remote_user, + "s", remote_host, + "t", flags, + "a(sv)", properties), + SD_BUS_RESULT("s", session_id, + "o", object_path, + "s", runtime_path, + "h", fifo_fd, + "u", uid, + "s", seat_id, + "u", vtnr, + "b", existing), + method_create_session_pidfd, + 0), + SD_BUS_METHOD_WITH_ARGS("ReleaseSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_NO_RESULT, + method_release_session, + 0), + SD_BUS_METHOD_WITH_ARGS("ActivateSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_NO_RESULT, + method_activate_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ActivateSessionOnSeat", + SD_BUS_ARGS("s", session_id, "s", seat_id), + SD_BUS_NO_RESULT, + 
method_activate_session_on_seat, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("LockSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_NO_RESULT, + method_lock_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("UnlockSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_NO_RESULT, + method_lock_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("LockSessions", + NULL, + NULL, + method_lock_sessions, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("UnlockSessions", + NULL, + NULL, + method_lock_sessions, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("KillSession", + SD_BUS_ARGS("s", session_id, "s", who, "i", signal_number), + SD_BUS_NO_RESULT, + method_kill_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("KillUser", + SD_BUS_ARGS("u", uid, "i", signal_number), + SD_BUS_NO_RESULT, + method_kill_user, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TerminateSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_NO_RESULT, + method_terminate_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TerminateUser", + SD_BUS_ARGS("u", uid), + SD_BUS_NO_RESULT, + method_terminate_user, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TerminateSeat", + SD_BUS_ARGS("s", seat_id), + SD_BUS_NO_RESULT, + method_terminate_seat, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetUserLinger", + SD_BUS_ARGS("u", uid, "b", enable, "b", interactive), + SD_BUS_NO_RESULT, + method_set_user_linger, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AttachDevice", + SD_BUS_ARGS("s", seat_id, "s", sysfs_path, "b", interactive), + SD_BUS_NO_RESULT, + method_attach_device, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("FlushDevices", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_flush_devices, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("PowerOff", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_poweroff, + SD_BUS_VTABLE_UNPRIVILEGED), + 
SD_BUS_METHOD_WITH_ARGS("PowerOffWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_poweroff, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Reboot", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_reboot, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RebootWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_reboot, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Halt", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_halt, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("HaltWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_halt, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Suspend", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_suspend, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SuspendWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_suspend, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Hibernate", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_hibernate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("HibernateWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_hibernate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("HybridSleep", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_hybrid_sleep, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("HybridSleepWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_hybrid_sleep, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SuspendThenHibernate", + SD_BUS_ARGS("b", interactive), + SD_BUS_NO_RESULT, + method_suspend_then_hibernate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SuspendThenHibernateWithFlags", + SD_BUS_ARGS("t", flags), + SD_BUS_NO_RESULT, + method_suspend_then_hibernate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanPowerOff", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + 
method_can_poweroff, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanReboot", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_reboot, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanHalt", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_halt, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanSuspend", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_suspend, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanHibernate", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_hibernate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanHybridSleep", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_hybrid_sleep, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanSuspendThenHibernate", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_suspend_then_hibernate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ScheduleShutdown", + SD_BUS_ARGS("s", type, "t", usec), + SD_BUS_NO_RESULT, + method_schedule_shutdown, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CancelScheduledShutdown", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("b", cancelled), + method_cancel_scheduled_shutdown, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Inhibit", + SD_BUS_ARGS("s", what, "s", who, "s", why, "s", mode), + SD_BUS_RESULT("h", pipe_fd), + method_inhibit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanRebootParameter", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_reboot_parameter, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetRebootParameter", + SD_BUS_ARGS("s", parameter), + SD_BUS_NO_RESULT, + method_set_reboot_parameter, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanRebootToFirmwareSetup", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_reboot_to_firmware_setup, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetRebootToFirmwareSetup", + 
SD_BUS_ARGS("b", enable), + SD_BUS_NO_RESULT, + method_set_reboot_to_firmware_setup, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanRebootToBootLoaderMenu", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_reboot_to_boot_loader_menu, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetRebootToBootLoaderMenu", + SD_BUS_ARGS("t", timeout), + SD_BUS_NO_RESULT, + method_set_reboot_to_boot_loader_menu, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CanRebootToBootLoaderEntry", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", result), + method_can_reboot_to_boot_loader_entry, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetRebootToBootLoaderEntry", + SD_BUS_ARGS("s", boot_loader_entry), + SD_BUS_NO_RESULT, + method_set_reboot_to_boot_loader_entry, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetWallMessage", + SD_BUS_ARGS("s", wall_message, "b", enable), + SD_BUS_NO_RESULT, + method_set_wall_message, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_SIGNAL_WITH_ARGS("SessionNew", + SD_BUS_ARGS("s", session_id, "o", object_path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("SessionRemoved", + SD_BUS_ARGS("s", session_id, "o", object_path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("UserNew", + SD_BUS_ARGS("u", uid, "o", object_path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("UserRemoved", + SD_BUS_ARGS("u", uid, "o", object_path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("SeatNew", + SD_BUS_ARGS("s", seat_id, "o", object_path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("SeatRemoved", + SD_BUS_ARGS("s", seat_id, "o", object_path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("PrepareForShutdown", + SD_BUS_ARGS("b", start), + 0), + SD_BUS_SIGNAL_WITH_ARGS("PrepareForShutdownWithMetadata", + SD_BUS_ARGS("b", start, "a{sv}", metadata), + 0), + SD_BUS_SIGNAL_WITH_ARGS("PrepareForSleep", + SD_BUS_ARGS("b", start), + 0), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + .vtables = 
BUS_VTABLES(manager_vtable), + .children = BUS_IMPLEMENTATIONS(&seat_object, + &session_object, + &user_object), +}; + +static int session_jobs_reply(Session *s, uint32_t jid, const char *unit, const char *result) { + assert(s); + assert(unit); + + if (!s->started) + return 0; + + if (result && !streq(result, "done")) { + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + + sd_bus_error_setf(&e, BUS_ERROR_JOB_FAILED, + "Job %u for unit '%s' failed with '%s'", jid, unit, result); + return session_send_create_reply(s, &e); + } + + return session_send_create_reply(s, NULL); +} + +int match_job_removed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *path, *result, *unit; + Manager *m = ASSERT_PTR(userdata); + Session *session; + uint32_t id; + User *user; + int r; + + assert(message); + + r = sd_bus_message_read(message, "uoss", &id, &path, &unit, &result); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (m->action_job && streq(m->action_job, path)) { + assert(m->delayed_action); + log_info("Operation '%s' finished.", handle_action_to_string(m->delayed_action->handle)); + + /* Tell people that they now may take a lock again */ + (void) send_prepare_for(m, m->delayed_action, false); + + m->action_job = mfree(m->action_job); + m->delayed_action = NULL; + return 0; + } + + session = hashmap_get(m->session_units, unit); + if (session) { + if (streq_ptr(path, session->scope_job)) { + session->scope_job = mfree(session->scope_job); + (void) session_jobs_reply(session, id, unit, result); + + session_save(session); + user_save(session->user); + } + + session_add_to_gc_queue(session); + } + + user = hashmap_get(m->user_units, unit); + if (user) { + if (streq_ptr(path, user->service_job)) { + user->service_job = mfree(user->service_job); + + LIST_FOREACH(sessions_by_user, s, user->sessions) + (void) session_jobs_reply(s, id, unit, NULL /* don't propagate user service failures to the client */); + + user_save(user); + 
} + + user_add_to_gc_queue(user); + } + + return 0; +} + +int match_unit_removed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *path, *unit; + Manager *m = ASSERT_PTR(userdata); + Session *session; + User *user; + int r; + + assert(message); + + r = sd_bus_message_read(message, "so", &unit, &path); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + session = hashmap_get(m->session_units, unit); + if (session) + session_add_to_gc_queue(session); + + user = hashmap_get(m->user_units, unit); + if (user) + user_add_to_gc_queue(user); + + return 0; +} + +int match_properties_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *unit = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *path; + Session *session; + User *user; + int r; + + assert(message); + + path = sd_bus_message_get_path(message); + if (!path) + return 0; + + r = unit_name_from_dbus_path(path, &unit); + if (r == -EINVAL) /* not a unit */ + return 0; + if (r < 0) { + log_oom(); + return 0; + } + + session = hashmap_get(m->session_units, unit); + if (session) + session_add_to_gc_queue(session); + + user = hashmap_get(m->user_units, unit); + if (user) + user_add_to_gc_queue(user); + + return 0; +} + +int match_reloading(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Session *session; + int b, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (b) + return 0; + + /* systemd finished reloading, let's recheck all our sessions */ + log_debug("System manager has been reloaded, rechecking sessions..."); + + HASHMAP_FOREACH(session, m->sessions) + session_add_to_gc_queue(session); + + return 0; +} + +int manager_send_changed(Manager *manager, const char *property, ...) 
{ + char **l; + + assert(manager); + + l = strv_from_stdarg_alloca(property); + + return sd_bus_emit_properties_changed_strv( + manager->bus, + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + l); +} + +static int strdup_job(sd_bus_message *reply, char **job) { + const char *j; + char *copy; + int r; + + r = sd_bus_message_read(reply, "o", &j); + if (r < 0) + return r; + + copy = strdup(j); + if (!copy) + return -ENOMEM; + + *job = copy; + return 1; +} + +int manager_start_scope( + Manager *manager, + const char *scope, + const PidRef *pidref, + const char *slice, + const char *description, + char **wants, + char **after, + const char *requires_mounts_for, + sd_bus_message *more_properties, + sd_bus_error *error, + char **job) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + int r; + + assert(manager); + assert(scope); + assert(pidref_is_set(pidref)); + assert(job); + + r = bus_message_new_method_call(manager->bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "ss", strempty(scope), "fail"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return r; + + if (!isempty(slice)) { + r = sd_bus_message_append(m, "(sv)", "Slice", "s", slice); + if (r < 0) + return r; + } + + if (!isempty(description)) { + r = sd_bus_message_append(m, "(sv)", "Description", "s", description); + if (r < 0) + return r; + } + + STRV_FOREACH(i, wants) { + r = sd_bus_message_append(m, "(sv)", "Wants", "as", 1, *i); + if (r < 0) + return r; + } + + STRV_FOREACH(i, after) { + r = sd_bus_message_append(m, "(sv)", "After", "as", 1, *i); + if (r < 0) + return r; + } + + if (!empty_or_root(requires_mounts_for)) { + r = sd_bus_message_append(m, "(sv)", "RequiresMountsFor", "as", 1, requires_mounts_for); + if (r < 0) + return r; + } + + /* Make sure that the session shells are terminated with SIGHUP since bash and friends tend to ignore + * SIGTERM */ + 
r = sd_bus_message_append(m, "(sv)", "SendSIGHUP", "b", true); + if (r < 0) + return r; + + r = bus_append_scope_pidref(m, pidref); + if (r < 0) + return r; + + /* For login session scopes, if a process is OOM killed by the kernel, *don't* terminate the rest of + the scope */ + r = sd_bus_message_append(m, "(sv)", "OOMPolicy", "s", "continue"); + if (r < 0) + return r; + + /* disable TasksMax= for the session scope, rely on the slice setting for it */ + r = sd_bus_message_append(m, "(sv)", "TasksMax", "t", UINT64_MAX); + if (r < 0) + return bus_log_create_error(r); + + if (more_properties) { + /* If TasksMax also appears here, it will overwrite the default value set above */ + r = sd_bus_message_copy(m, more_properties, true); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "a(sa(sv))", 0); + if (r < 0) + return r; + + r = sd_bus_call(manager->bus, m, 0, error, &reply); + if (r < 0) + return r; + + return strdup_job(reply, job); +} + +int manager_start_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(manager); + assert(unit); + assert(job); + + r = bus_call_method( + manager->bus, + bus_systemd_mgr, + "StartUnit", + error, + &reply, + "ss", unit, "replace"); + if (r < 0) + return r; + + return strdup_job(reply, job); +} + +int manager_stop_unit(Manager *manager, const char *unit, const char *job_mode, sd_bus_error *error, char **ret_job) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(manager); + assert(unit); + assert(ret_job); + + r = bus_call_method( + manager->bus, + bus_systemd_mgr, + "StopUnit", + error, + &reply, + "ss", unit, job_mode ?: "fail"); + if (r < 0) { + if (sd_bus_error_has_names(error, BUS_ERROR_NO_SUCH_UNIT, + BUS_ERROR_LOAD_FAILED)) { + + *ret_job = NULL; + sd_bus_error_free(error); + return 0; + } + + return r; + } 
+
+        return strdup_job(reply, ret_job);
+}
+/* Calls Abandon() on the org.freedesktop.systemd1.Scope interface of the D-Bus object belonging to
+ * the unit 'scope'.
+ *
+ * Returns 1 if the call succeeded, 0 if it failed with BUS_ERROR_NO_SUCH_UNIT, BUS_ERROR_LOAD_FAILED or
+ * BUS_ERROR_SCOPE_NOT_RUNNING (in which case ret_error is left untouched), -ENOMEM if the object path
+ * could not be allocated, and otherwise the negative error of the bus call, with the error details
+ * moved into ret_error. */
+int manager_abandon_scope(Manager *manager, const char *scope, sd_bus_error *ret_error) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        assert(manager);
+        assert(scope);
+
+        path = unit_dbus_path_from_name(scope);
+        if (!path)
+                return -ENOMEM;
+
+        r = sd_bus_call_method(
+                        manager->bus,
+                        "org.freedesktop.systemd1",
+                        path,
+                        "org.freedesktop.systemd1.Scope",
+                        "Abandon",
+                        &error,
+                        NULL,
+                        NULL);
+        if (r < 0) {
+                if (sd_bus_error_has_names(&error, BUS_ERROR_NO_SUCH_UNIT,
+                                                   BUS_ERROR_LOAD_FAILED,
+                                                   BUS_ERROR_SCOPE_NOT_RUNNING))
+                        return 0;
+
+                sd_bus_error_move(ret_error, &error);
+                return r;
+        }
+
+        return 1;
+}
+/* Issues a KillUnit method call (via bus_systemd_mgr) for 'unit' with signal 'signo'. The "whom"
+ * argument of the call is "main" when who == KILL_LEADER and "all" otherwise. No reply is requested;
+ * the result of bus_call_method() is returned as-is and failures are reported through 'error'. */
+int manager_kill_unit(Manager *manager, const char *unit, KillWho who, int signo, sd_bus_error *error) {
+        assert(manager);
+        assert(unit);
+
+        return bus_call_method(
+                        manager->bus,
+                        bus_systemd_mgr,
+                        "KillUnit",
+                        error,
+                        NULL,
+                        "ssi", unit, who == KILL_LEADER ? 
"main" : "all", signo); +} + +int manager_unit_is_active(Manager *manager, const char *unit, sd_bus_error *ret_error) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *path = NULL; + const char *state; + int r; + + assert(manager); + assert(unit); + + path = unit_dbus_path_from_name(unit); + if (!path) + return -ENOMEM; + + r = sd_bus_get_property( + manager->bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "ActiveState", + &error, + &reply, + "s"); + if (r < 0) { + /* systemd might have dropped off momentarily, let's + * not make this an error */ + if (sd_bus_error_has_names(&error, SD_BUS_ERROR_NO_REPLY, + SD_BUS_ERROR_DISCONNECTED)) + return true; + + /* If the unit is already unloaded then it's not + * active */ + if (sd_bus_error_has_names(&error, BUS_ERROR_NO_SUCH_UNIT, + BUS_ERROR_LOAD_FAILED)) + return false; + + sd_bus_error_move(ret_error, &error); + return r; + } + + r = sd_bus_message_read(reply, "s", &state); + if (r < 0) + return r; + + return !STR_IN_SET(state, "inactive", "failed"); +} + +int manager_job_is_active(Manager *manager, const char *path, sd_bus_error *ret_error) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(manager); + assert(path); + + r = sd_bus_get_property( + manager->bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Job", + "State", + &error, + &reply, + "s"); + if (r < 0) { + if (sd_bus_error_has_names(&error, SD_BUS_ERROR_NO_REPLY, + SD_BUS_ERROR_DISCONNECTED)) + return true; + + if (sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_OBJECT)) + return false; + + sd_bus_error_move(ret_error, &error); + return r; + } + + /* We don't actually care about the state really. 
The fact + * that we could read the job state is enough for us */ + + return true; +} diff --git a/src/login/logind-dbus.h b/src/login/logind-dbus.h new file mode 100644 index 0000000..c9d5923 --- /dev/null +++ b/src/login/logind-dbus.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" +#include "logind-action.h" +#include "logind-session.h" +#include "logind-user.h" +#include "logind.h" + +int manager_get_session_from_creds(Manager *m, sd_bus_message *message, const char *name, sd_bus_error *error, Session **ret); +int manager_get_user_from_creds(Manager *m, sd_bus_message *message, uid_t uid, sd_bus_error *error, User **ret); +int manager_get_seat_from_creds(Manager *m, sd_bus_message *message, const char *name, sd_bus_error *error, Seat **ret); + +int manager_dispatch_delayed(Manager *manager, bool timeout); + +int bus_manager_shutdown_or_sleep_now_or_later(Manager *m, const HandleActionData *a, sd_bus_error *error); + +int match_job_removed(sd_bus_message *message, void *userdata, sd_bus_error *error); +int match_unit_removed(sd_bus_message *message, void *userdata, sd_bus_error *error); +int match_properties_changed(sd_bus_message *message, void *userdata, sd_bus_error *error); +int match_reloading(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int manager_send_changed(Manager *manager, const char *property, ...) 
_sentinel_; + +int manager_start_scope(Manager *manager, const char *scope, const PidRef *pidref, const char *slice, const char *description, char **wants, char **after, const char *requires_mounts_for, sd_bus_message *more_properties, sd_bus_error *error, char **job); +int manager_start_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job); +int manager_stop_unit(Manager *manager, const char *unit, const char *job_mode, sd_bus_error *error, char **job); +int manager_abandon_scope(Manager *manager, const char *scope, sd_bus_error *error); +int manager_kill_unit(Manager *manager, const char *unit, KillWho who, int signo, sd_bus_error *error); +int manager_unit_is_active(Manager *manager, const char *unit, sd_bus_error *error); +int manager_job_is_active(Manager *manager, const char *path, sd_bus_error *error); + +void manager_load_scheduled_shutdown(Manager *m); + +extern const BusObjectImplementation manager_object; diff --git a/src/login/logind-device.c b/src/login/logind-device.c new file mode 100644 index 0000000..376cd0d --- /dev/null +++ b/src/login/logind-device.c @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "logind-device.h" +#include "logind-seat-dbus.h" + +Device* device_new(Manager *m, const char *sysfs, bool master) { + Device *d; + + assert(m); + assert(sysfs); + + d = new0(Device, 1); + if (!d) + return NULL; + + d->sysfs = strdup(sysfs); + if (!d->sysfs) + return mfree(d); + + if (hashmap_put(m->devices, d->sysfs, d) < 0) { + free(d->sysfs); + return mfree(d); + } + + d->manager = m; + d->master = master; + dual_timestamp_now(&d->timestamp); + + return d; +} + +static void device_detach(Device *d) { + Seat *s; + SessionDevice *sd; + + assert(d); + + if (!d->seat) + return; + + while ((sd = d->session_devices)) + session_device_free(sd); + + s = d->seat; + LIST_REMOVE(devices, d->seat->devices, d); + d->seat = NULL; + + if (!seat_has_master_device(s)) { + 
seat_add_to_gc_queue(s); + seat_send_changed(s, "CanGraphical", NULL); + } +} + +void device_free(Device *d) { + assert(d); + + device_detach(d); + + hashmap_remove(d->manager->devices, d->sysfs); + + free(d->sysfs); + free(d); +} + +void device_attach(Device *d, Seat *s) { + bool had_master; + + assert(d); + assert(s); + + if (d->seat == s) + return; + + if (d->seat) + device_detach(d); + + d->seat = s; + had_master = seat_has_master_device(s); + + /* We keep the device list sorted by the "master" flag. That is, master + * devices are at the front, other devices at the tail. As there is no + * way to easily add devices at the list-tail, we need to iterate the + * list to find the first non-master device when adding non-master + * devices. We assume there is only a few (normally 1) master devices + * per seat, so we iterate only a few times. */ + + if (d->master || !s->devices) + LIST_PREPEND(devices, s->devices, d); + else + LIST_FOREACH(devices, i, s->devices) { + if (!i->devices_next || !i->master) { + LIST_INSERT_AFTER(devices, s->devices, i, d); + break; + } + } + + if (!had_master && d->master && s->started) { + seat_save(s); + seat_send_changed(s, "CanGraphical", NULL); + } +} diff --git a/src/login/logind-device.h b/src/login/logind-device.h new file mode 100644 index 0000000..0d89613 --- /dev/null +++ b/src/login/logind-device.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Device Device; + +#include "list.h" +#include "logind-seat.h" +#include "logind-session-device.h" + +struct Device { + Manager *manager; + + char *sysfs; + Seat *seat; + bool master; + + dual_timestamp timestamp; + + LIST_FIELDS(struct Device, devices); + LIST_HEAD(SessionDevice, session_devices); +}; + +Device* device_new(Manager *m, const char *sysfs, bool master); +void device_free(Device *d); +void device_attach(Device *d, Seat *s); diff --git a/src/login/logind-gperf.gperf b/src/login/logind-gperf.gperf new file mode 100644 index 
0000000..c95a3b2 --- /dev/null +++ b/src/login/logind-gperf.gperf @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "logind.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name logind_gperf_hash +%define lookup-function-name logind_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Login.NAutoVTs, config_parse_n_autovts, 0, offsetof(Manager, n_autovts) +Login.ReserveVT, config_parse_unsigned, 0, offsetof(Manager, reserve_vt) +Login.KillUserProcesses, config_parse_bool, 0, offsetof(Manager, kill_user_processes) +Login.KillOnlyUsers, config_parse_strv, 0, offsetof(Manager, kill_only_users) +Login.KillExcludeUsers, config_parse_strv, 0, offsetof(Manager, kill_exclude_users) +Login.InhibitDelayMaxSec, config_parse_sec, 0, offsetof(Manager, inhibit_delay_max) +Login.UserStopDelaySec, config_parse_sec, 0, offsetof(Manager, user_stop_delay) +Login.HandlePowerKey, config_parse_handle_action, 0, offsetof(Manager, handle_power_key) +Login.HandlePowerKeyLongPress, config_parse_handle_action, 0, offsetof(Manager, handle_power_key_long_press) +Login.HandleRebootKey, config_parse_handle_action, 0, offsetof(Manager, handle_reboot_key) +Login.HandleRebootKeyLongPress, config_parse_handle_action, 0, offsetof(Manager, handle_reboot_key_long_press) +Login.HandleSuspendKey, config_parse_handle_action, 0, offsetof(Manager, handle_suspend_key) +Login.HandleSuspendKeyLongPress, config_parse_handle_action, 0, offsetof(Manager, handle_suspend_key_long_press) +Login.HandleHibernateKey, config_parse_handle_action, 0, offsetof(Manager, handle_hibernate_key) +Login.HandleHibernateKeyLongPress, config_parse_handle_action, 0, offsetof(Manager, handle_hibernate_key_long_press) +Login.HandleLidSwitch, config_parse_handle_action, 0, 
offsetof(Manager, handle_lid_switch) +Login.HandleLidSwitchExternalPower, config_parse_handle_action, 0, offsetof(Manager, handle_lid_switch_ep) +Login.HandleLidSwitchDocked, config_parse_handle_action, 0, offsetof(Manager, handle_lid_switch_docked) +Login.PowerKeyIgnoreInhibited, config_parse_bool, 0, offsetof(Manager, power_key_ignore_inhibited) +Login.SuspendKeyIgnoreInhibited, config_parse_bool, 0, offsetof(Manager, suspend_key_ignore_inhibited) +Login.HibernateKeyIgnoreInhibited, config_parse_bool, 0, offsetof(Manager, hibernate_key_ignore_inhibited) +Login.LidSwitchIgnoreInhibited, config_parse_bool, 0, offsetof(Manager, lid_switch_ignore_inhibited) +Login.RebootKeyIgnoreInhibited, config_parse_bool, 0, offsetof(Manager, reboot_key_ignore_inhibited) +Login.HoldoffTimeoutSec, config_parse_sec, 0, offsetof(Manager, holdoff_timeout_usec) +Login.IdleAction, config_parse_handle_action, 0, offsetof(Manager, idle_action) +Login.IdleActionSec, config_parse_sec, 0, offsetof(Manager, idle_action_usec) +Login.RuntimeDirectorySize, config_parse_tmpfs_size, 0, offsetof(Manager, runtime_dir_size) +Login.RuntimeDirectoryInodesMax, config_parse_iec_uint64, 0, offsetof(Manager, runtime_dir_inodes) +Login.RemoveIPC, config_parse_bool, 0, offsetof(Manager, remove_ipc) +Login.InhibitorsMax, config_parse_uint64, 0, offsetof(Manager, inhibitors_max) +Login.SessionsMax, config_parse_uint64, 0, offsetof(Manager, sessions_max) +Login.UserTasksMax, config_parse_compat_user_tasks_max, 0, 0 +Login.StopIdleSessionSec, config_parse_sec_fix_0, 0, offsetof(Manager, stop_idle_session_usec) diff --git a/src/login/logind-inhibit.c b/src/login/logind-inhibit.c new file mode 100644 index 0000000..1566dab --- /dev/null +++ b/src/login/logind-inhibit.c @@ -0,0 +1,532 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "env-file.h" +#include "errno-list.h" +#include "errno-util.h" +#include "escape.h" 
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "io-util.h"
+#include "logind-dbus.h"
+#include "logind-inhibit.h"
+#include "missing_threads.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+
+static void inhibitor_remove_fifo(Inhibitor *i);
+
+/* Allocates a new inhibitor called 'id' and registers it in m->inhibitors.
+ *
+ * The state file path is "/run/systemd/inhibit/<id>". i->id is the basename of that string, i.e. it
+ * points into i->state_file and is not a separate allocation.
+ *
+ * Returns 0 and stores the object in *ret on success. Returns -ENOMEM on allocation failure or the
+ * negative result of hashmap_put() otherwise; in both cases the half-initialized object is released
+ * through inhibitor_free() by the _cleanup_ handler and *ret is not written. */
+int inhibitor_new(Inhibitor **ret, Manager *m, const char* id) {
+        _cleanup_(inhibitor_freep) Inhibitor *i = NULL;
+        int r;
+
+        assert(ret);
+        assert(m);
+        assert(id);
+
+        i = new(Inhibitor, 1);
+        if (!i)
+                return -ENOMEM;
+
+        *i = (Inhibitor) {
+                .manager = m,
+                .what = _INHIBIT_WHAT_INVALID,
+                .mode = _INHIBIT_MODE_INVALID,
+                .uid = UID_INVALID,
+                .fifo_fd = -EBADF,
+                .pid = PIDREF_NULL,
+        };
+
+        i->state_file = path_join("/run/systemd/inhibit", id);
+        if (!i->state_file)
+                return -ENOMEM;
+
+        i->id = basename(i->state_file);
+
+        r = hashmap_put(m->inhibitors, i->id, i);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(i);
+        return 0;
+}
+
+/* Releases an inhibitor and everything it owns. Accepts NULL and always returns NULL, so that callers
+ * can reset their pointer with the return value.
+ *
+ * This is also what the _cleanup_ handler of inhibitor_new() runs, hence it must cope with objects
+ * that never made it into the manager's hashmap. */
+Inhibitor* inhibitor_free(Inhibitor *i) {
+
+        if (!i)
+                return NULL;
+
+        free(i->who);
+        free(i->why);
+
+        sd_event_source_unref(i->event_source);
+        safe_close(i->fifo_fd);
+
+        /* i->id is only set once the state file path has been allocated, and if hashmap_put() failed
+         * in inhibitor_new() the entry stored under our key (if any) belongs to a different object.
+         * Hence only drop the entry if it really maps our key to us. */
+        if (i->id)
+                hashmap_remove_value(i->manager->inhibitors, i->id, i);
+
+        /* Note that we remove neither the state file nor the fifo path here, since we want both to
+         * survive daemon restarts */
+        free(i->state_file);
+        free(i->fifo_path);
+
+        pidref_done(&i->pid);
+
+        return mfree(i);
+}
+
+/* Writes the inhibitor's fields to i->state_file: the data goes to a temporary file first, which is
+ * then renamed over the state file. On any failure the state file is unlinked, the error is logged and
+ * the negative error is returned. */
+static int inhibitor_save(Inhibitor *i) {
+        _cleanup_(unlink_and_freep) char *temp_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(i);
+
+        r = mkdir_safe_label("/run/systemd/inhibit", 0755, 0, 0, MKDIR_WARN_MODE);
+        if (r < 0)
+                goto fail;
+
+        r = fopen_temporary(i->state_file, &f, &temp_path);
+        if (r < 0)
+                goto fail;
+
+        (void) fchmod(fileno(f), 0644);
+
+        fprintf(f,
+                "# This is private data. 
Do not parse.\n" + "WHAT=%s\n" + "MODE=%s\n" + "UID="UID_FMT"\n" + "PID="PID_FMT"\n", + inhibit_what_to_string(i->what), + inhibit_mode_to_string(i->mode), + i->uid, + i->pid.pid); + + if (i->who) { + _cleanup_free_ char *cc = NULL; + + cc = cescape(i->who); + if (!cc) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "WHO=%s\n", cc); + } + + if (i->why) { + _cleanup_free_ char *cc = NULL; + + cc = cescape(i->why); + if (!cc) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "WHY=%s\n", cc); + } + + if (i->fifo_path) + fprintf(f, "FIFO=%s\n", i->fifo_path); + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, i->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + return 0; + +fail: + (void) unlink(i->state_file); + + return log_error_errno(r, "Failed to save inhibit data %s: %m", i->state_file); +} + +static int bus_manager_send_inhibited_change(Inhibitor *i) { + const char *property; + + assert(i); + + property = i->mode == INHIBIT_BLOCK ? 
"BlockInhibited" : "DelayInhibited"; + + return manager_send_changed(i->manager, property, NULL); +} + +int inhibitor_start(Inhibitor *i) { + assert(i); + + if (i->started) + return 0; + + dual_timestamp_now(&i->since); + + log_debug("Inhibitor %s (%s) pid="PID_FMT" uid="UID_FMT" mode=%s started.", + strna(i->who), strna(i->why), + i->pid.pid, i->uid, + inhibit_mode_to_string(i->mode)); + + i->started = true; + + inhibitor_save(i); + + bus_manager_send_inhibited_change(i); + + return 0; +} + +void inhibitor_stop(Inhibitor *i) { + assert(i); + + if (i->started) + log_debug("Inhibitor %s (%s) pid="PID_FMT" uid="UID_FMT" mode=%s stopped.", + strna(i->who), strna(i->why), + i->pid.pid, i->uid, + inhibit_mode_to_string(i->mode)); + + inhibitor_remove_fifo(i); + + if (i->state_file) + (void) unlink(i->state_file); + + i->started = false; + + bus_manager_send_inhibited_change(i); +} + +int inhibitor_load(Inhibitor *i) { + _cleanup_free_ char *what = NULL, *uid = NULL, *pid = NULL, *who = NULL, *why = NULL, *mode = NULL; + InhibitWhat w; + InhibitMode mm; + char *cc; + ssize_t l; + int r; + + r = parse_env_file(NULL, i->state_file, + "WHAT", &what, + "UID", &uid, + "PID", &pid, + "WHO", &who, + "WHY", &why, + "MODE", &mode, + "FIFO", &i->fifo_path); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", i->state_file); + + w = what ? inhibit_what_from_string(what) : 0; + if (w >= 0) + i->what = w; + + mm = mode ? 
inhibit_mode_from_string(mode) : INHIBIT_BLOCK;
+        if (mm >= 0)
+                i->mode = mm;
+        /* UID and PID values that fail to parse are only logged at debug level; loading continues */
+        if (uid) {
+                r = parse_uid(uid, &i->uid);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse UID of inhibitor: %s", uid);
+        }
+
+        if (pid) {
+                pidref_done(&i->pid);
+                r = pidref_set_pidstr(&i->pid, pid);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse PID of inhibitor: %s", pid);
+        }
+        /* In contrast, a WHO or WHY value that cannot be unescaped aborts loading with an error */
+        if (who) {
+                l = cunescape(who, 0, &cc);
+                if (l < 0)
+                        return log_debug_errno(l, "Failed to unescape \"who\" of inhibitor: %m");
+
+                free_and_replace(i->who, cc);
+        }
+
+        if (why) {
+                l = cunescape(why, 0, &cc);
+                if (l < 0)
+                        return log_debug_errno(l, "Failed to unescape \"why\" of inhibitor: %m");
+
+                free_and_replace(i->why, cc);
+        }
+
+        if (i->fifo_path) {
+                _cleanup_close_ int fd = -EBADF;
+
+                /* Let's re-open the FIFO on both sides, and close the writing side right away */
+                fd = inhibitor_create_fifo(i);
+                if (fd < 0)
+                        return log_error_errno(fd, "Failed to reopen FIFO: %m");
+        }
+
+        return 0;
+}
+/* I/O event callback installed by inhibitor_create_fifo() on the reading side of the FIFO: stops the
+ * inhibitor and frees it. The source is registered with an event mask of 0 - presumably so that it
+ * only fires on hang-up/error once the last writer is gone; confirm against sd_event_add_io(). Note
+ * that 'userdata' is invalid after this returns. */
+static int inhibitor_dispatch_fifo(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Inhibitor *i = ASSERT_PTR(userdata);
+
+        assert(s);
+        assert(fd == i->fifo_fd);
+
+        inhibitor_stop(i);
+        inhibitor_free(i);
+
+        return 0;
+}
+/* Makes sure the inhibitor has a FIFO and hands out a file descriptor for its writing side.
+ *
+ * Each step is skipped if already done: the FIFO "/run/systemd/inhibit/<id>.ref" is created unless
+ * i->fifo_path is set (an already existing node is accepted), the reading side is opened unless
+ * i->fifo_fd is valid, and an I/O event source dispatching to inhibitor_dispatch_fifo() is installed
+ * unless i->event_source is set.
+ *
+ * Returns a newly opened, non-blocking O_WRONLY file descriptor on success (owned by the caller), or a
+ * negative errno-style value on failure. State set up before a failure is kept in the object. */
+int inhibitor_create_fifo(Inhibitor *i) {
+        int r;
+
+        assert(i);
+
+        /* Create FIFO */
+        if (!i->fifo_path) {
+                r = mkdir_safe_label("/run/systemd/inhibit", 0755, 0, 0, MKDIR_WARN_MODE);
+                if (r < 0)
+                        return r;
+
+                i->fifo_path = strjoin("/run/systemd/inhibit/", i->id, ".ref");
+                if (!i->fifo_path)
+                        return -ENOMEM;
+
+                if (mkfifo(i->fifo_path, 0600) < 0 && errno != EEXIST)
+                        return -errno;
+        }
+
+        /* Open reading side */
+        if (i->fifo_fd < 0) {
+                i->fifo_fd = open(i->fifo_path, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
+                if (i->fifo_fd < 0)
+                        return -errno;
+        }
+
+        if (!i->event_source) {
+                r = sd_event_add_io(i->manager->event, &i->event_source, i->fifo_fd, 0, inhibitor_dispatch_fifo, i);
+                if (r < 0)
+                        return r;
+
+                r = 
sd_event_source_set_priority(i->event_source, SD_EVENT_PRIORITY_IDLE-10);
+                if (r < 0)
+                        return r;
+
+                (void) sd_event_source_set_description(i->event_source, "inhibitor-ref");
+        }
+
+        /* Open writing side */
+        return RET_NERRNO(open(i->fifo_path, O_WRONLY|O_CLOEXEC|O_NONBLOCK));
+}
+/* Undoes inhibitor_create_fifo(): drops the event source, closes the reading side and, if a FIFO path
+ * is set, unlinks the FIFO and frees the path. All three fields are reset, so this may be called
+ * repeatedly. */
+static void inhibitor_remove_fifo(Inhibitor *i) {
+        assert(i);
+
+        i->event_source = sd_event_source_unref(i->event_source);
+        i->fifo_fd = safe_close(i->fifo_fd);
+
+        if (i->fifo_path) {
+                (void) unlink(i->fifo_path);
+                i->fifo_path = mfree(i->fifo_path);
+        }
+}
+/* Returns true if nothing keeps this inhibitor alive anymore: it was never started, it has no FIFO
+ * path or no open reading side, or pipe_eof() on the reading side returns anything but 0. Note that
+ * the last check is "!= 0", i.e. a failing pipe_eof() counts as orphaned, too. */
+bool inhibitor_is_orphan(Inhibitor *i) {
+        assert(i);
+
+        if (!i->started)
+                return true;
+
+        if (!i->fifo_path)
+                return true;
+
+        if (i->fifo_fd < 0)
+                return true;
+
+        if (pipe_eof(i->fifo_fd) != 0)
+                return true;
+
+        return false;
+}
+/* Returns the union (bitwise OR) of the 'what' masks of all started inhibitors whose mode is 'mm';
+ * 0 if there are none. */
+InhibitWhat manager_inhibit_what(Manager *m, InhibitMode mm) {
+        Inhibitor *i;
+        InhibitWhat what = 0;
+
+        assert(m);
+
+        HASHMAP_FOREACH(i, m->inhibitors)
+                if (i->mode == mm && i->started)
+                        what |= i->what;
+
+        return what;
+}
+/* Looks up the session of process 'pid'. Returns a negative value if the lookup fails, 1 if the
+ * process has no session, and the result of session_is_active() for its session otherwise. */
+static int pidref_is_active_session(Manager *m, const PidRef *pid) {
+        Session *s;
+        int r;
+
+        assert(m);
+        assert(pid);
+
+        /* Get client session. This is not what you are looking for these days. 
+         * FIXME #6852 */
+        r = manager_get_session_by_pidref(m, pid, &s);
+        if (r < 0)
+                return r;
+
+        /* If there's no session assigned to it, then it's globally active on all ttys */
+        if (r == 0)
+                return 1;
+
+        return session_is_active(s);
+}
+
+/* Checks whether any started inhibitor of mode 'mm' covers at least one of the bits in 'w'.
+ *
+ * If 'ignore_inactive' is set, inhibitors whose process is not known to be in an active session
+ * (pidref_is_active_session() <= 0, which includes lookup failures) are skipped. If 'ignore_uid' is
+ * set, inhibitors owned by 'uid' are skipped.
+ *
+ * If 'since' is non-NULL it receives the timestamp of the matching inhibitor with the smallest
+ * monotonic start time, or DUAL_TIMESTAMP_NULL if nothing matched. If 'offending' is non-NULL it
+ * receives one matching inhibitor (the last one seen in hashmap iteration order) and is left untouched
+ * if nothing matched. */
+bool manager_is_inhibited(
+                Manager *m,
+                InhibitWhat w,
+                InhibitMode mm,
+                dual_timestamp *since,
+                bool ignore_inactive,
+                bool ignore_uid,
+                uid_t uid,
+                Inhibitor **offending) {
+
+        Inhibitor *i;
+        struct dual_timestamp ts = DUAL_TIMESTAMP_NULL;
+        bool inhibited = false;
+
+        assert(m);
+        assert(w > 0);
+        assert(w < _INHIBIT_WHAT_MAX);
+
+        HASHMAP_FOREACH(i, m->inhibitors) {
+                if (!i->started)
+                        continue;
+
+                if (!(i->what & w))
+                        continue;
+
+                if (i->mode != mm)
+                        continue;
+
+                if (ignore_inactive && pidref_is_active_session(m, &i->pid) <= 0)
+                        continue;
+
+                if (ignore_uid && i->uid == uid)
+                        continue;
+
+                if (!inhibited ||
+                    i->since.monotonic < ts.monotonic)
+                        ts = i->since;
+
+                inhibited = true;
+
+                if (offending)
+                        *offending = i;
+        }
+
+        if (since)
+                *since = ts;
+
+        return inhibited;
+}
+
+/* Formats the bit mask 'w' as a colon-separated list of names, e.g. "shutdown:sleep".
+ *
+ * Returns NULL if 'w' is not a valid mask. Otherwise returns a pointer to a static, thread-local
+ * buffer, which is overwritten by the next call in the same thread. */
+const char *inhibit_what_to_string(InhibitWhat w) {
+        /* Every name is first copied including its trailing ":", and only afterwards the last ":" is
+         * replaced by the terminating NUL. In the worst case (all flags set) stpcpy() hence writes all
+         * names including the final ":" plus a NUL byte, so the final ":" must be part of the size. */
+        static thread_local char buffer[STRLEN(
+            "shutdown:"
+            "sleep:"
+            "idle:"
+            "handle-power-key:"
+            "handle-suspend-key:"
+            "handle-hibernate-key:"
+            "handle-lid-switch:"
+            "handle-reboot-key:")+1];
+        char *p;
+
+        if (!inhibit_what_is_valid(w))
+                return NULL;
+
+        p = buffer;
+        if (w & INHIBIT_SHUTDOWN)
+                p = stpcpy(p, "shutdown:");
+        if (w & INHIBIT_SLEEP)
+                p = stpcpy(p, "sleep:");
+        if (w & INHIBIT_IDLE)
+                p = stpcpy(p, "idle:");
+        if (w & INHIBIT_HANDLE_POWER_KEY)
+                p = stpcpy(p, "handle-power-key:");
+        if (w & INHIBIT_HANDLE_SUSPEND_KEY)
+                p = stpcpy(p, "handle-suspend-key:");
+        if (w & INHIBIT_HANDLE_HIBERNATE_KEY)
+                p = stpcpy(p, "handle-hibernate-key:");
+        if (w & INHIBIT_HANDLE_LID_SWITCH)
+                p = stpcpy(p, "handle-lid-switch:");
+        if (w & INHIBIT_HANDLE_REBOOT_KEY)
+                p = stpcpy(p, "handle-reboot-key:");
+
+        /* Chop off the trailing ":" if we wrote anything */
+        if (p > buffer)
+                *(p-1) = 0;
+        else
+                *p = 0;
+
+        return 
buffer; +} + +int inhibit_what_from_string(const char *s) { + InhibitWhat what = 0; + + for (const char *p = s;;) { + _cleanup_free_ char *word = NULL; + int r; + + /* A sanity check that our return values fit in an int */ + assert_cc((int) _INHIBIT_WHAT_MAX == _INHIBIT_WHAT_MAX); + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return what; + + if (streq(word, "shutdown")) + what |= INHIBIT_SHUTDOWN; + else if (streq(word, "sleep")) + what |= INHIBIT_SLEEP; + else if (streq(word, "idle")) + what |= INHIBIT_IDLE; + else if (streq(word, "handle-power-key")) + what |= INHIBIT_HANDLE_POWER_KEY; + else if (streq(word, "handle-suspend-key")) + what |= INHIBIT_HANDLE_SUSPEND_KEY; + else if (streq(word, "handle-hibernate-key")) + what |= INHIBIT_HANDLE_HIBERNATE_KEY; + else if (streq(word, "handle-lid-switch")) + what |= INHIBIT_HANDLE_LID_SWITCH; + else if (streq(word, "handle-reboot-key")) + what |= INHIBIT_HANDLE_REBOOT_KEY; + else + return _INHIBIT_WHAT_INVALID; + } +} + +static const char* const inhibit_mode_table[_INHIBIT_MODE_MAX] = { + [INHIBIT_BLOCK] = "block", + [INHIBIT_DELAY] = "delay" +}; + +DEFINE_STRING_TABLE_LOOKUP(inhibit_mode, InhibitMode); diff --git a/src/login/logind-inhibit.h b/src/login/logind-inhibit.h new file mode 100644 index 0000000..c34c225 --- /dev/null +++ b/src/login/logind-inhibit.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "pidref.h" + +typedef struct Inhibitor Inhibitor; + +typedef enum InhibitWhat { + INHIBIT_SHUTDOWN = 1 << 0, + INHIBIT_SLEEP = 1 << 1, + INHIBIT_IDLE = 1 << 2, + INHIBIT_HANDLE_POWER_KEY = 1 << 3, + INHIBIT_HANDLE_SUSPEND_KEY = 1 << 4, + INHIBIT_HANDLE_HIBERNATE_KEY = 1 << 5, + INHIBIT_HANDLE_LID_SWITCH = 1 << 6, + INHIBIT_HANDLE_REBOOT_KEY = 1 << 7, + _INHIBIT_WHAT_MAX = 1 << 8, + _INHIBIT_WHAT_INVALID = -EINVAL, +} InhibitWhat; + +typedef enum InhibitMode { + INHIBIT_BLOCK, + INHIBIT_DELAY, + 
_INHIBIT_MODE_MAX, + _INHIBIT_MODE_INVALID = -EINVAL, +} InhibitMode; + +#include "logind.h" + +struct Inhibitor { + Manager *manager; + + sd_event_source *event_source; + + const char *id; + char *state_file; + + bool started; + + InhibitWhat what; + char *who; + char *why; + InhibitMode mode; + + PidRef pid; + uid_t uid; + + dual_timestamp since; + + char *fifo_path; + int fifo_fd; +}; + +int inhibitor_new(Inhibitor **ret, Manager *m, const char* id); +Inhibitor* inhibitor_free(Inhibitor *i); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Inhibitor*, inhibitor_free); + +int inhibitor_load(Inhibitor *i); + +int inhibitor_start(Inhibitor *i); +void inhibitor_stop(Inhibitor *i); + +int inhibitor_create_fifo(Inhibitor *i); + +bool inhibitor_is_orphan(Inhibitor *i); + +InhibitWhat manager_inhibit_what(Manager *m, InhibitMode mm); +bool manager_is_inhibited(Manager *m, InhibitWhat w, InhibitMode mm, dual_timestamp *since, bool ignore_inactive, bool ignore_uid, uid_t uid, Inhibitor **offending); + +static inline bool inhibit_what_is_valid(InhibitWhat w) { + return w > 0 && w < _INHIBIT_WHAT_MAX; +} + +const char *inhibit_what_to_string(InhibitWhat k); +int inhibit_what_from_string(const char *s); + +const char *inhibit_mode_to_string(InhibitMode k); +InhibitMode inhibit_mode_from_string(const char *s); diff --git a/src/login/logind-polkit.c b/src/login/logind-polkit.c new file mode 100644 index 0000000..e4efd64 --- /dev/null +++ b/src/login/logind-polkit.c @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-polkit.h" +#include "logind-polkit.h" +#include "missing_capability.h" +#include "user-util.h" + +int check_polkit_chvt(sd_bus_message *message, Manager *manager, sd_bus_error *error) { +#if ENABLE_POLKIT + return bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.chvt", + NULL, + false, + UID_INVALID, + &manager->polkit_registry, + error); +#else + /* Allow chvt when polkit is not present. 
This allows a service to start a graphical session as a + * non-root user when polkit is not compiled in, more closely matching the default polkit policy */ + return 1; +#endif +} diff --git a/src/login/logind-polkit.h b/src/login/logind-polkit.h new file mode 100644 index 0000000..9ec01a3 --- /dev/null +++ b/src/login/logind-polkit.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" +#include "logind.h" + +int check_polkit_chvt(sd_bus_message *message, Manager *manager, sd_bus_error *error); diff --git a/src/login/logind-seat-dbus.c b/src/login/logind-seat-dbus.c new file mode 100644 index 0000000..877b9c1 --- /dev/null +++ b/src/login/logind-seat-dbus.c @@ -0,0 +1,442 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-label.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "logind-dbus.h" +#include "logind-polkit.h" +#include "logind-seat-dbus.h" +#include "logind-seat.h" +#include "logind-session-dbus.h" +#include "logind.h" +#include "missing_capability.h" +#include "strv.h" +#include "user-util.h" + +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_const_true, "b", true); +static BUS_DEFINE_PROPERTY_GET(property_get_can_tty, "b", Seat, seat_can_tty); +static BUS_DEFINE_PROPERTY_GET(property_get_can_graphical, "b", Seat, seat_can_graphical); + +static int property_get_active_session( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Seat *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + p = s->active ? session_bus_path(s->active) : strdup("/"); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(so)", s->active ? 
s->active->id : "", p); +} + +static int property_get_sessions( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Seat *s = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(so)"); + if (r < 0) + return r; + + LIST_FOREACH(sessions_by_seat, session, s->sessions) { + _cleanup_free_ char *p = NULL; + + p = session_bus_path(session); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(so)", session->id, p); + if (r < 0) + return r; + + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return 1; +} + +static int property_get_idle_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Seat *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", seat_get_idle_hint(s, NULL) > 0); +} + +static int property_get_idle_since_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Seat *s = ASSERT_PTR(userdata); + dual_timestamp t; + uint64_t u; + int r; + + assert(bus); + assert(reply); + + r = seat_get_idle_hint(s, &t); + if (r < 0) + return r; + + u = streq(property, "IdleSinceHint") ? 
t.realtime : t.monotonic; + + return sd_bus_message_append(reply, "t", u); +} + +int bus_seat_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Seat *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.login1.manage", + NULL, + false, + UID_INVALID, + &s->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = seat_stop_sessions(s, /* force = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_activate_session(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Seat *s = ASSERT_PTR(userdata); + const char *name; + Session *session; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + session = hashmap_get(s->manager->sessions, name); + if (!session) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_SESSION, "No session '%s' known", name); + + if (session->seat != s) + return sd_bus_error_setf(error, BUS_ERROR_SESSION_NOT_ON_SEAT, "Session %s not on seat %s", name, s->id); + + r = check_polkit_chvt(message, s->manager, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_activate(session); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_switch_to(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Seat *s = ASSERT_PTR(userdata); + unsigned to; + int r; + + assert(message); + + r = sd_bus_message_read(message, "u", &to); + if (r < 0) + return r; + + if (to <= 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid virtual terminal"); + + r = check_polkit_chvt(message, s->manager, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = seat_switch_to(s, to); + if (r < 0) + return r; + + return 
sd_bus_reply_method_return(message, NULL); +} + +static int method_switch_to_next(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Seat *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = check_polkit_chvt(message, s->manager, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = seat_switch_to_next(s); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_switch_to_previous(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Seat *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = check_polkit_chvt(message, s->manager, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = seat_switch_to_previous(s); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int seat_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + _cleanup_free_ char *e = NULL; + sd_bus_message *message; + Manager *m = ASSERT_PTR(userdata); + const char *p; + Seat *seat; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + p = startswith(path, "/org/freedesktop/login1/seat/"); + if (!p) + return 0; + + e = bus_label_unescape(p); + if (!e) + return -ENOMEM; + + message = sd_bus_get_current_message(bus); + + r = manager_get_seat_from_creds(m, message, e, error, &seat); + if (r == -ENXIO) { + sd_bus_error_free(error); + return 0; + } + if (r < 0) + return r; + + *found = seat; + return 1; +} + +char *seat_bus_path(Seat *s) { + _cleanup_free_ char *t = NULL; + + assert(s); + + t = bus_label_escape(s->id); + if (!t) + return NULL; + + return strjoin("/org/freedesktop/login1/seat/", t); +} + +static int seat_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + sd_bus_message *message; + Manager *m = 
userdata; + Seat *seat; + int r; + + assert(bus); + assert(path); + assert(nodes); + + HASHMAP_FOREACH(seat, m->seats) { + char *p; + + p = seat_bus_path(seat); + if (!p) + return -ENOMEM; + + r = strv_consume(&l, p); + if (r < 0) + return r; + } + + message = sd_bus_get_current_message(bus); + if (message) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_SESSION|SD_BUS_CREDS_OWNER_UID|SD_BUS_CREDS_AUGMENT, &creds); + if (r >= 0) { + bool may_auto = false; + const char *name; + + r = sd_bus_creds_get_session(creds, &name); + if (r >= 0) { + Session *session; + + session = hashmap_get(m->sessions, name); + if (session && session->seat) { + r = strv_extend(&l, "/org/freedesktop/login1/seat/self"); + if (r < 0) + return r; + + may_auto = true; + } + } + + if (!may_auto) { + uid_t uid; + + r = sd_bus_creds_get_owner_uid(creds, &uid); + if (r >= 0) { + User *user; + + user = hashmap_get(m->users, UID_TO_PTR(uid)); + may_auto = user && user->display && user->display->seat; + } + } + + if (may_auto) { + r = strv_extend(&l, "/org/freedesktop/login1/seat/auto"); + if (r < 0) + return r; + } + } + } + + *nodes = TAKE_PTR(l); + return 1; +} + +int seat_send_signal(Seat *s, bool new_seat) { + _cleanup_free_ char *p = NULL; + + assert(s); + + p = seat_bus_path(s); + if (!p) + return -ENOMEM; + + return sd_bus_emit_signal( + s->manager->bus, + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + new_seat ? "SeatNew" : "SeatRemoved", + "so", s->id, p); +} + +int seat_send_changed(Seat *s, const char *properties, ...) 
{ + _cleanup_free_ char *p = NULL; + char **l; + + assert(s); + + if (!s->started) + return 0; + + p = seat_bus_path(s); + if (!p) + return -ENOMEM; + + l = strv_from_stdarg_alloca(properties); + + return sd_bus_emit_properties_changed_strv(s->manager->bus, p, "org.freedesktop.login1.Seat", l); +} + +static const sd_bus_vtable seat_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Seat, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ActiveSession", "(so)", property_get_active_session, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CanMultiSession", "b", property_get_const_true, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("CanTTY", "b", property_get_can_tty, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CanGraphical", "b", property_get_can_graphical, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Sessions", "a(so)", property_get_sessions, 0, 0), + SD_BUS_PROPERTY("IdleHint", "b", property_get_idle_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHint", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHintMonotonic", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_METHOD("Terminate", NULL, NULL, bus_seat_method_terminate, SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_METHOD_WITH_ARGS("ActivateSession", + SD_BUS_ARGS("s", session_id), + SD_BUS_NO_RESULT, + method_activate_session, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SwitchTo", + SD_BUS_ARGS("u", vtnr), + SD_BUS_NO_RESULT, + method_switch_to, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_METHOD("SwitchToNext", NULL, NULL, method_switch_to_next, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("SwitchToPrevious", NULL, NULL, method_switch_to_previous, SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation seat_object = { + "/org/freedesktop/login1/seat", + 
"org.freedesktop.login1.Seat", + .fallback_vtables = BUS_FALLBACK_VTABLES({seat_vtable, seat_object_find}), + .node_enumerator = seat_node_enumerator, +}; diff --git a/src/login/logind-seat-dbus.h b/src/login/logind-seat-dbus.h new file mode 100644 index 0000000..258db91 --- /dev/null +++ b/src/login/logind-seat-dbus.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" +#include "logind-seat.h" + +extern const BusObjectImplementation seat_object; + +char *seat_bus_path(Seat *s); + +int seat_send_signal(Seat *s, bool new_seat); +int seat_send_changed(Seat *s, const char *properties, ...) _sentinel_; + +int bus_seat_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/login/logind-seat.c b/src/login/logind-seat.c new file mode 100644 index 0000000..8d875d2 --- /dev/null +++ b/src/login/logind-seat.c @@ -0,0 +1,682 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "devnode-acl.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "logind-seat-dbus.h" +#include "logind-seat.h" +#include "logind-session-dbus.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" + +int seat_new(Seat** ret, Manager *m, const char *id) { + _cleanup_(seat_freep) Seat *s = NULL; + int r; + + assert(ret); + assert(m); + assert(id); + + if (!seat_name_is_valid(id)) + return -EINVAL; + + s = new(Seat, 1); + if (!s) + return -ENOMEM; + + *s = (Seat) { + .manager = m, + }; + + s->state_file = path_join("/run/systemd/seats", id); + if (!s->state_file) + return -ENOMEM; + + s->id = basename(s->state_file); + + r = hashmap_put(m->seats, s->id, s); + if (r < 0) + return 
r; + + *ret = TAKE_PTR(s); + return 0; +} + +Seat* seat_free(Seat *s) { + if (!s) + return NULL; + + if (s->in_gc_queue) + LIST_REMOVE(gc_queue, s->manager->seat_gc_queue, s); + + while (s->sessions) + session_free(s->sessions); + + assert(!s->active); + + while (s->devices) + device_free(s->devices); + + hashmap_remove(s->manager->seats, s->id); + + free(s->positions); + free(s->state_file); + + return mfree(s); +} + +int seat_save(Seat *s) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(s); + + if (!s->started) + return 0; + + r = mkdir_safe_label("/run/systemd/seats", 0755, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + goto fail; + + r = fopen_temporary(s->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + (void) fchmod(fileno(f), 0644); + + fprintf(f, + "# This is private data. Do not parse.\n" + "IS_SEAT0=%i\n" + "CAN_MULTI_SESSION=1\n" + "CAN_TTY=%i\n" + "CAN_GRAPHICAL=%i\n", + seat_is_seat0(s), + seat_can_tty(s), + seat_can_graphical(s)); + + if (s->active) { + assert(s->active->user); + + fprintf(f, + "ACTIVE=%s\n" + "ACTIVE_UID="UID_FMT"\n", + s->active->id, + s->active->user->user_record->uid); + } + + if (s->sessions) { + fputs("SESSIONS=", f); + LIST_FOREACH(sessions_by_seat, i, s->sessions) { + fprintf(f, + "%s%c", + i->id, + i->sessions_by_seat_next ? ' ' : '\n'); + } + + fputs("UIDS=", f); + LIST_FOREACH(sessions_by_seat, i, s->sessions) + fprintf(f, + UID_FMT"%c", + i->user->user_record->uid, + i->sessions_by_seat_next ? ' ' : '\n'); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, s->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + return 0; + +fail: + (void) unlink(s->state_file); + return log_error_errno(r, "Failed to save seat data %s: %m", s->state_file); +} + +int seat_load(Seat *s) { + assert(s); + + /* There isn't actually anything to read here ... 
*/ + + return 0; +} + +static int vt_allocate(unsigned vtnr) { + char p[sizeof("/dev/tty") + DECIMAL_STR_MAX(unsigned)]; + _cleanup_close_ int fd = -EBADF; + + assert(vtnr >= 1); + + xsprintf(p, "/dev/tty%u", vtnr); + fd = open_terminal(p, O_RDWR|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return fd; + + return 0; +} + +int seat_preallocate_vts(Seat *s) { + int r = 0; + unsigned i; + + assert(s); + assert(s->manager); + + if (s->manager->n_autovts <= 0) + return 0; + + if (!seat_has_vts(s)) + return 0; + + log_debug("Preallocating VTs..."); + + for (i = 1; i <= s->manager->n_autovts; i++) { + int q; + + q = vt_allocate(i); + if (q < 0) + r = log_error_errno(q, "Failed to preallocate VT %u: %m", i); + } + + return r; +} + +int seat_apply_acls(Seat *s, Session *old_active) { + int r; + + assert(s); + + r = devnode_acl_all(s->id, + false, + !!old_active, old_active ? old_active->user->user_record->uid : 0, + !!s->active, s->active ? s->active->user->user_record->uid : 0); + + if (r < 0) + return log_error_errno(r, "Failed to apply ACLs: %m"); + + return 0; +} + +int seat_set_active(Seat *s, Session *session) { + Session *old_active; + + assert(s); + assert(!session || session->seat == s); + + /* When logind receives the SIGRTMIN signal from the kernel, it will + * execute session_leave_vt and stop all devices of the session; at + * this time, if the session is active and there is no change in the + * session, then the session does not have the permissions of the device, + * and the machine will have a black screen and suspended animation. + * Therefore, if the active session has executed session_leave_vt , + * A resume is required here. 
*/ + if (session == s->active) { + if (session) { + log_debug("Active session remains unchanged, resuming session devices."); + session_device_resume_all(session); + } + return 0; + } + + old_active = s->active; + s->active = session; + + if (old_active) { + session_device_pause_all(old_active); + session_send_changed(old_active, "Active", NULL); + } + + (void) seat_apply_acls(s, old_active); + + if (session && session->started) { + session_send_changed(session, "Active", NULL); + session_device_resume_all(session); + } + + if (!session || session->started) + seat_send_changed(s, "ActiveSession", NULL); + + seat_save(s); + + if (session) { + session_save(session); + user_save(session->user); + } + + if (old_active) { + session_save(old_active); + if (!session || session->user != old_active->user) + user_save(old_active->user); + } + + return 0; +} + +static Session* seat_get_position(Seat *s, unsigned pos) { + assert(s); + + if (pos >= MALLOC_ELEMENTSOF(s->positions)) + return NULL; + + return s->positions[pos]; +} + +int seat_switch_to(Seat *s, unsigned num) { + Session *session; + + /* Public session positions skip 0 (there is only F1-F12). Maybe it + * will get reassigned in the future, so return error for now. 
*/ + if (num == 0) + return -EINVAL; + + session = seat_get_position(s, num); + if (!session) { + /* allow switching to unused VTs to trigger auto-activate */ + if (seat_has_vts(s) && num < 64) + return chvt(num); + + return -EINVAL; + } + + return session_activate(session); +} + +int seat_switch_to_next(Seat *s) { + unsigned start, i; + Session *session; + + if (MALLOC_ELEMENTSOF(s->positions) == 0) + return -EINVAL; + + start = 1; + if (s->active && s->active->position > 0) + start = s->active->position; + + for (i = start + 1; i < MALLOC_ELEMENTSOF(s->positions); ++i) { + session = seat_get_position(s, i); + if (session) + return session_activate(session); + } + + for (i = 1; i < start; ++i) { + session = seat_get_position(s, i); + if (session) + return session_activate(session); + } + + return -EINVAL; +} + +int seat_switch_to_previous(Seat *s) { + if (MALLOC_ELEMENTSOF(s->positions) == 0) + return -EINVAL; + + size_t start = s->active && s->active->position > 0 ? s->active->position : 1; + + for (size_t i = start - 1; i > 0; i--) { + Session *session = seat_get_position(s, i); + if (session) + return session_activate(session); + } + + for (size_t i = MALLOC_ELEMENTSOF(s->positions) - 1; i > start; i--) { + Session *session = seat_get_position(s, i); + if (session) + return session_activate(session); + } + + return -EINVAL; +} + +int seat_active_vt_changed(Seat *s, unsigned vtnr) { + Session *new_active = NULL; + int r; + + assert(s); + assert(vtnr >= 1); + + if (!seat_has_vts(s)) + return -EINVAL; + + log_debug("VT changed to %u", vtnr); + + /* we might have earlier closing sessions on the same VT, so try to + * find a running one first */ + LIST_FOREACH(sessions_by_seat, i, s->sessions) + if (i->vtnr == vtnr && !i->stopping) { + new_active = i; + break; + } + + if (!new_active) + /* no running one? 
then we can't decide which one is the + * active one, let the first one win */ + LIST_FOREACH(sessions_by_seat, i, s->sessions) + if (i->vtnr == vtnr) { + new_active = i; + break; + } + + r = seat_set_active(s, new_active); + manager_spawn_autovt(s->manager, vtnr); + + return r; +} + +int seat_read_active_vt(Seat *s) { + char t[64]; + ssize_t k; + int vtnr; + + assert(s); + + if (!seat_has_vts(s)) + return 0; + + if (lseek(s->manager->console_active_fd, SEEK_SET, 0) < 0) + return log_error_errno(errno, "lseek on console_active_fd failed: %m"); + + errno = 0; + k = read(s->manager->console_active_fd, t, sizeof(t)-1); + if (k <= 0) + return log_error_errno(errno ?: EIO, + "Failed to read current console: %s", STRERROR_OR_EOF(errno)); + + t[k] = 0; + truncate_nl(t); + + vtnr = vtnr_from_tty(t); + if (vtnr < 0) { + log_error_errno(vtnr, "Hm, /sys/class/tty/tty0/active is badly formatted: %m"); + return -EIO; + } + + return seat_active_vt_changed(s, vtnr); +} + +int seat_start(Seat *s) { + assert(s); + + if (s->started) + return 0; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_SEAT_START_STR, + "SEAT_ID=%s", s->id, + LOG_MESSAGE("New seat %s.", s->id)); + + /* Initialize VT magic stuff */ + seat_preallocate_vts(s); + + /* Read current VT */ + seat_read_active_vt(s); + + s->started = true; + + /* Save seat data */ + seat_save(s); + + seat_send_signal(s, true); + + return 0; +} + +int seat_stop(Seat *s, bool force) { + int r; + + assert(s); + + if (s->started) + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_SEAT_STOP_STR, + "SEAT_ID=%s", s->id, + LOG_MESSAGE("Removed seat %s.", s->id)); + + r = seat_stop_sessions(s, force); + + (void) unlink(s->state_file); + seat_add_to_gc_queue(s); + + if (s->started) + seat_send_signal(s, false); + + s->started = false; + + return r; +} + +int seat_stop_sessions(Seat *s, bool force) { + int r = 0, k; + + assert(s); + + LIST_FOREACH(sessions_by_seat, session, s->sessions) { + k = session_stop(session, force); + if (k < 0) + r = 
k; + } + + return r; +} + +void seat_evict_position(Seat *s, Session *session) { + unsigned pos = session->position; + + session->position = 0; + + if (pos == 0) + return; + + if (pos < MALLOC_ELEMENTSOF(s->positions) && s->positions[pos] == session) { + s->positions[pos] = NULL; + + /* There might be another session claiming the same + * position (eg., during gdm->session transition), so let's look + * for it and set it on the free slot. */ + LIST_FOREACH(sessions_by_seat, iter, s->sessions) + if (iter->position == pos && session_get_state(iter) != SESSION_CLOSING) { + s->positions[pos] = iter; + break; + } + } +} + +void seat_claim_position(Seat *s, Session *session, unsigned pos) { + /* with VTs, the position is always the same as the VTnr */ + if (seat_has_vts(s)) + pos = session->vtnr; + + if (!GREEDY_REALLOC0(s->positions, pos + 1)) + return; + + seat_evict_position(s, session); + + session->position = pos; + if (pos > 0) + s->positions[pos] = session; +} + +static void seat_assign_position(Seat *s, Session *session) { + unsigned pos; + + if (session->position > 0) + return; + + for (pos = 1; pos < MALLOC_ELEMENTSOF(s->positions); ++pos) + if (!s->positions[pos]) + break; + + seat_claim_position(s, session, pos); +} + +int seat_attach_session(Seat *s, Session *session) { + assert(s); + assert(session); + assert(!session->seat); + + if (!seat_has_vts(s) != !session->vtnr) + return -EINVAL; + + session->seat = s; + LIST_PREPEND(sessions_by_seat, s->sessions, session); + seat_assign_position(s, session); + + /* On seats with VTs, the VT logic defines which session is active. On + * seats without VTs, we automatically activate new sessions. 
*/ + if (!seat_has_vts(s)) + seat_set_active(s, session); + + return 0; +} + +void seat_complete_switch(Seat *s) { + Session *session; + + assert(s); + + /* if no session-switch is pending or if it got canceled, do nothing */ + if (!s->pending_switch) + return; + + session = TAKE_PTR(s->pending_switch); + + seat_set_active(s, session); +} + +bool seat_has_vts(Seat *s) { + assert(s); + + return seat_is_seat0(s) && s->manager->console_active_fd >= 0; +} + +bool seat_is_seat0(Seat *s) { + assert(s); + + return s->manager->seat0 == s; +} + +bool seat_can_tty(Seat *s) { + assert(s); + + return seat_has_vts(s); +} + +bool seat_has_master_device(Seat *s) { + assert(s); + + /* device list is ordered by "master" flag */ + return !!s->devices && s->devices->master; +} + +bool seat_can_graphical(Seat *s) { + assert(s); + + return seat_has_master_device(s); +} + +int seat_get_idle_hint(Seat *s, dual_timestamp *t) { + bool idle_hint = true; + dual_timestamp ts = DUAL_TIMESTAMP_NULL; + + assert(s); + + LIST_FOREACH(sessions_by_seat, session, s->sessions) { + dual_timestamp k; + int ih; + + ih = session_get_idle_hint(session, &k); + if (ih < 0) + return ih; + + if (!ih) { + if (!idle_hint) { + if (k.monotonic > ts.monotonic) + ts = k; + } else { + idle_hint = false; + ts = k; + } + } else if (idle_hint) { + + if (k.monotonic > ts.monotonic) + ts = k; + } + } + + if (t) + *t = ts; + + return idle_hint; +} + +bool seat_may_gc(Seat *s, bool drop_not_started) { + assert(s); + + if (drop_not_started && !s->started) + return true; + + if (seat_is_seat0(s)) + return false; + + return !seat_has_master_device(s); +} + +void seat_add_to_gc_queue(Seat *s) { + assert(s); + + if (s->in_gc_queue) + return; + + LIST_PREPEND(gc_queue, s->manager->seat_gc_queue, s); + s->in_gc_queue = true; +} + +static bool seat_name_valid_char(char c) { + return + ascii_isalpha(c) || + ascii_isdigit(c) || + IN_SET(c, '-', '_'); +} + +bool seat_name_is_valid(const char *name) { + const char *p; + + assert(name); 
+ + if (!startswith(name, "seat")) + return false; + + if (!name[4]) + return false; + + for (p = name; *p; p++) + if (!seat_name_valid_char(*p)) + return false; + + if (strlen(name) > 255) + return false; + + return true; +} diff --git a/src/login/logind-seat.h b/src/login/logind-seat.h new file mode 100644 index 0000000..2d18e75 --- /dev/null +++ b/src/login/logind-seat.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Seat Seat; + +#include "list.h" +#include "logind-session.h" + +struct Seat { + Manager *manager; + char *id; + + char *state_file; + + LIST_HEAD(Device, devices); + + Session *active; + Session *pending_switch; + LIST_HEAD(Session, sessions); + + Session **positions; + + bool in_gc_queue:1; + bool started:1; + + LIST_FIELDS(Seat, gc_queue); +}; + +int seat_new(Seat **ret, Manager *m, const char *id); +Seat* seat_free(Seat *s); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Seat *, seat_free); + +int seat_save(Seat *s); +int seat_load(Seat *s); + +int seat_apply_acls(Seat *s, Session *old_active); +int seat_set_active(Seat *s, Session *session); +int seat_switch_to(Seat *s, unsigned num); +int seat_switch_to_next(Seat *s); +int seat_switch_to_previous(Seat *s); +int seat_active_vt_changed(Seat *s, unsigned vtnr); +int seat_read_active_vt(Seat *s); +int seat_preallocate_vts(Seat *s); + +int seat_attach_session(Seat *s, Session *session); +void seat_complete_switch(Seat *s); +void seat_evict_position(Seat *s, Session *session); +void seat_claim_position(Seat *s, Session *session, unsigned pos); + +bool seat_has_vts(Seat *s); +bool seat_is_seat0(Seat *s); +bool seat_can_tty(Seat *s); +bool seat_has_master_device(Seat *s); +bool seat_can_graphical(Seat *s); + +int seat_get_idle_hint(Seat *s, dual_timestamp *t); + +int seat_start(Seat *s); +int seat_stop(Seat *s, bool force); +int seat_stop_sessions(Seat *s, bool force); + +bool seat_may_gc(Seat *s, bool drop_not_started); +void seat_add_to_gc_queue(Seat *s); + +bool 
seat_name_is_valid(const char *name); + +static inline bool SEAT_IS_SELF(const char *name) { + return isempty(name) || streq(name, "self"); +} + +static inline bool SEAT_IS_AUTO(const char *name) { + return streq_ptr(name, "auto"); +} diff --git a/src/login/logind-session-dbus.c b/src/login/logind-session-dbus.c new file mode 100644 index 0000000..a136ae4 --- /dev/null +++ b/src/login/logind-session-dbus.c @@ -0,0 +1,994 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-label.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "devnum-util.h" +#include "fd-util.h" +#include "logind-brightness.h" +#include "logind-dbus.h" +#include "logind-polkit.h" +#include "logind-seat-dbus.h" +#include "logind-session-dbus.h" +#include "logind-session-device.h" +#include "logind-session.h" +#include "logind-user-dbus.h" +#include "logind.h" +#include "missing_capability.h" +#include "path-util.h" +#include "signal-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "user-util.h" + +static int property_get_user( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Session *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + p = user_bus_path(s->user); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(uo)", (uint32_t) s->user->user_record->uid, p); +} + +static int property_get_name( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Session *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "s", s->user->user_record->user_name); +} + +static int property_get_seat( + sd_bus *bus, + const char *path, + const char 
*interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + Session *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + p = s->seat ? seat_bus_path(s->seat) : strdup("/"); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(so)", s->seat ? s->seat->id : "", p); +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, session_type, SessionType); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_class, session_class, SessionClass); +static BUS_DEFINE_PROPERTY_GET(property_get_active, "b", Session, session_is_active); +static BUS_DEFINE_PROPERTY_GET2(property_get_state, "s", Session, session_get_state, session_state_to_string); + +static int property_get_idle_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Session *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", session_get_idle_hint(s, NULL) > 0); +} + +static int property_get_idle_since_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Session *s = ASSERT_PTR(userdata); + dual_timestamp t = DUAL_TIMESTAMP_NULL; + uint64_t u; + int r; + + assert(bus); + assert(reply); + + r = session_get_idle_hint(s, &t); + if (r < 0) + return r; + + u = streq(property, "IdleSinceHint") ? 
t.realtime : t.monotonic; + + return sd_bus_message_append(reply, "t", u); +} + +static int property_get_locked_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Session *s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", session_get_locked_hint(s) > 0); +} + +int bus_session_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.login1.manage", + NULL, + false, + s->user->user_record->uid, + &s->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_stop(s, /* force = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_session_method_activate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = check_polkit_chvt(message, s->manager, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_activate(s); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_session_method_lock(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.login1.lock-sessions", + NULL, + false, + s->user->user_record->uid, + &s->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_send_lock(s, strstr(sd_bus_message_get_member(message), "Lock")); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int 
method_set_idle_hint(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Session *s = ASSERT_PTR(userdata); + uid_t uid; + int r, b; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + if (uid != 0 && uid != s->user->user_record->uid) + return sd_bus_error_set(error, SD_BUS_ERROR_ACCESS_DENIED, "Only owner of session may set idle hint"); + + r = session_set_idle_hint(s, b); + if (r == -ENOTTY) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Idle hint control is not supported on non-graphical sessions."); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_locked_hint(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Session *s = ASSERT_PTR(userdata); + uid_t uid; + int r, b; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + if (uid != 0 && uid != s->user->user_record->uid) + return sd_bus_error_set(error, SD_BUS_ERROR_ACCESS_DENIED, "Only owner of session may set locked hint"); + + session_set_locked_hint(s, b); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_session_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + const char *swho; + int32_t signo; + KillWho who; + int r; + + assert(message); + + r = sd_bus_message_read(message, "si", &swho, &signo); + if (r < 0) + return r; + + if (isempty(swho)) + who = KILL_ALL; + else { + who = 
kill_who_from_string(swho); + if (who < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid kill parameter '%s'", swho); + } + + if (!SIGNAL_VALID(signo)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal %i", signo); + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.login1.manage", + NULL, + false, + s->user->user_record->uid, + &s->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = session_kill(s, who, signo); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_take_control(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + Session *s = ASSERT_PTR(userdata); + int r, force; + uid_t uid; + + assert(message); + + r = sd_bus_message_read(message, "b", &force); + if (r < 0) + return r; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + if (uid != 0 && (force || uid != s->user->user_record->uid)) + return sd_bus_error_set(error, SD_BUS_ERROR_ACCESS_DENIED, "Only owner of session may take control"); + + r = session_set_controller(s, sd_bus_message_get_sender(message), force, true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_release_control(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + + assert(message); + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You are not in control of this session"); + + session_drop_controller(s); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_type(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = 
ASSERT_PTR(userdata); + const char *t; + SessionType type; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + + type = session_type_from_string(t); + if (type < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid session type '%s'", t); + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You must be in control of this session to set type"); + + session_set_type(s, type); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_display(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + const char *display; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &display); + if (r < 0) + return r; + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You must be in control of this session to set display"); + + if (!SESSION_TYPE_IS_GRAPHICAL(s->type)) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Setting display is only supported for graphical sessions"); + + r = session_set_display(s, display); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_tty(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + int fd, r, flags; + _cleanup_free_ char *q = NULL; + + assert(message); + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return r; + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You must be in control of this session to set tty"); + + assert(fd >= 0); + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) + return -errno; + if ((flags & O_ACCMODE) != O_RDWR) + return -EACCES; + if (FLAGS_SET(flags, O_PATH)) + return -ENOTTY; + + r = 
getttyname_malloc(fd, &q); + if (r < 0) + return r; + + r = session_set_tty(s, q); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_take_device(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + uint32_t major, minor; + _cleanup_(session_device_freep) SessionDevice *sd = NULL; + dev_t dev; + int r; + + assert(message); + + r = sd_bus_message_read(message, "uu", &major, &minor); + if (r < 0) + return r; + + if (!DEVICE_MAJOR_VALID(major) || !DEVICE_MINOR_VALID(minor)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Device major/minor is not valid."); + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You are not in control of this session"); + + dev = makedev(major, minor); + sd = hashmap_get(s->devices, &dev); + if (sd) + /* We don't allow retrieving a device multiple times. + * The related ReleaseDevice call is not ref-counted. + * The caller should use dup() if it requires more + * than one fd (it would be functionally + * equivalent). 
*/ + return sd_bus_error_set(error, BUS_ERROR_DEVICE_IS_TAKEN, "Device already taken"); + + r = session_device_new(s, dev, true, &sd); + if (r < 0) + return r; + + r = session_device_save(sd); + if (r < 0) + return r; + + r = sd_bus_reply_method_return(message, "hb", sd->fd, !sd->active); + if (r < 0) + return r; + + session_save(s); + TAKE_PTR(sd); + + return 1; +} + +static int method_release_device(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + uint32_t major, minor; + SessionDevice *sd; + dev_t dev; + int r; + + assert(message); + + r = sd_bus_message_read(message, "uu", &major, &minor); + if (r < 0) + return r; + + if (!DEVICE_MAJOR_VALID(major) || !DEVICE_MINOR_VALID(minor)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Device major/minor is not valid."); + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You are not in control of this session"); + + dev = makedev(major, minor); + sd = hashmap_get(s->devices, &dev); + if (!sd) + return sd_bus_error_set(error, BUS_ERROR_DEVICE_NOT_TAKEN, "Device not taken"); + + session_device_free(sd); + session_save(s); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_pause_device_complete(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Session *s = ASSERT_PTR(userdata); + uint32_t major, minor; + SessionDevice *sd; + dev_t dev; + int r; + + assert(message); + + r = sd_bus_message_read(message, "uu", &major, &minor); + if (r < 0) + return r; + + if (!DEVICE_MAJOR_VALID(major) || !DEVICE_MINOR_VALID(minor)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Device major/minor is not valid."); + + if (!session_is_controller(s, sd_bus_message_get_sender(message))) + return sd_bus_error_set(error, BUS_ERROR_NOT_IN_CONTROL, "You are not in control of this session"); + + dev = makedev(major, minor); + sd = 
hashmap_get(s->devices, &dev); + if (!sd) + return sd_bus_error_set(error, BUS_ERROR_DEVICE_NOT_TAKEN, "Device not taken"); + + session_device_complete_pause(sd); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_brightness(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + const char *subsystem, *name, *seat; + Session *s = ASSERT_PTR(userdata); + uint32_t brightness; + uid_t uid; + int r; + + assert(message); + + r = sd_bus_message_read(message, "ssu", &subsystem, &name, &brightness); + if (r < 0) + return r; + + if (!STR_IN_SET(subsystem, "backlight", "leds")) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Subsystem type %s not supported, must be one of 'backlight' or 'leds'.", subsystem); + if (!filename_is_valid(name)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Not a valid device name %s, refusing.", name); + + if (!s->seat) + return sd_bus_error_set(error, BUS_ERROR_NOT_YOUR_DEVICE, "Your session has no seat, refusing."); + if (s->seat->active != s) + return sd_bus_error_set(error, BUS_ERROR_NOT_YOUR_DEVICE, "Session is not in foreground, refusing."); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &uid); + if (r < 0) + return r; + + if (uid != 0 && uid != s->user->user_record->uid) + return sd_bus_error_set(error, SD_BUS_ERROR_ACCESS_DENIED, "Only owner of session may change brightness."); + + r = sd_device_new_from_subsystem_sysname(&d, subsystem, name); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to open device %s:%s: %m", subsystem, name); + + if (sd_device_get_property_value(d, "ID_SEAT", &seat) >= 0 && !streq_ptr(seat, s->seat->id)) + return sd_bus_error_setf(error, BUS_ERROR_NOT_YOUR_DEVICE, "Device %s:%s does not belong to your seat %s, refusing.", 
subsystem, name, s->seat->id); + + r = manager_write_brightness(s->manager, d, brightness, message); + if (r < 0) + return r; + + return 1; +} + +static int session_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + _cleanup_free_ char *e = NULL; + sd_bus_message *message; + Manager *m = ASSERT_PTR(userdata); + Session *session; + const char *p; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + p = startswith(path, "/org/freedesktop/login1/session/"); + if (!p) + return 0; + + e = bus_label_unescape(p); + if (!e) + return -ENOMEM; + + message = sd_bus_get_current_message(bus); + + r = manager_get_session_from_creds(m, message, e, error, &session); + if (r == -ENXIO) { + sd_bus_error_free(error); + return 0; + } + if (r < 0) + return r; + + *found = session; + return 1; +} + +char *session_bus_path(Session *s) { + _cleanup_free_ char *t = NULL; + + assert(s); + + t = bus_label_escape(s->id); + if (!t) + return NULL; + + return strjoin("/org/freedesktop/login1/session/", t); +} + +static int session_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + sd_bus_message *message; + Manager *m = userdata; + Session *session; + int r; + + assert(bus); + assert(path); + assert(nodes); + + HASHMAP_FOREACH(session, m->sessions) { + char *p; + + p = session_bus_path(session); + if (!p) + return -ENOMEM; + + r = strv_consume(&l, p); + if (r < 0) + return r; + } + + message = sd_bus_get_current_message(bus); + if (message) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_SESSION|SD_BUS_CREDS_OWNER_UID|SD_BUS_CREDS_AUGMENT, &creds); + if (r >= 0) { + bool may_auto = false; + const char *name; + + r = sd_bus_creds_get_session(creds, &name); + if (r >= 0) { + session = hashmap_get(m->sessions, name); + if (session) { + r = 
strv_extend(&l, "/org/freedesktop/login1/session/self"); + if (r < 0) + return r; + + may_auto = true; + } + } + + if (!may_auto) { + uid_t uid; + + r = sd_bus_creds_get_owner_uid(creds, &uid); + if (r >= 0) { + User *user; + + user = hashmap_get(m->users, UID_TO_PTR(uid)); + may_auto = user && user->display; + } + } + + if (may_auto) { + r = strv_extend(&l, "/org/freedesktop/login1/session/auto"); + if (r < 0) + return r; + } + } + } + + *nodes = TAKE_PTR(l); + return 1; +} + +int session_send_signal(Session *s, bool new_session) { + _cleanup_free_ char *p = NULL; + + assert(s); + + p = session_bus_path(s); + if (!p) + return -ENOMEM; + + return sd_bus_emit_signal( + s->manager->bus, + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + new_session ? "SessionNew" : "SessionRemoved", + "so", s->id, p); +} + +int session_send_changed(Session *s, const char *properties, ...) { + _cleanup_free_ char *p = NULL; + char **l; + + assert(s); + + if (!s->started) + return 0; + + p = session_bus_path(s); + if (!p) + return -ENOMEM; + + l = strv_from_stdarg_alloca(properties); + + return sd_bus_emit_properties_changed_strv(s->manager->bus, p, "org.freedesktop.login1.Session", l); +} + +int session_send_lock(Session *s, bool lock) { + _cleanup_free_ char *p = NULL; + + assert(s); + + p = session_bus_path(s); + if (!p) + return -ENOMEM; + + return sd_bus_emit_signal( + s->manager->bus, + p, + "org.freedesktop.login1.Session", + lock ? "Lock" : "Unlock", + NULL); +} + +int session_send_lock_all(Manager *m, bool lock) { + Session *session; + int r = 0; + + assert(m); + + HASHMAP_FOREACH(session, m->sessions) { + int k; + + k = session_send_lock(session, lock); + if (k < 0) + r = k; + } + + return r; +} + +static bool session_ready(Session *s) { + assert(s); + + /* Returns true when the session is ready, i.e. 
all jobs we enqueued for it are done (regardless if successful or not) */ + + return !s->scope_job && + !s->user->service_job; +} + +int session_send_create_reply(Session *s, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *c = NULL; + _cleanup_close_ int fifo_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r; + + assert(s); + + /* This is called after the session scope and the user service were successfully created, and finishes where + * bus_manager_create_session() left off. */ + + if (!s->create_message) + return 0; + + if (!sd_bus_error_is_set(error) && !session_ready(s)) + return 0; + + c = TAKE_PTR(s->create_message); + if (error) + return sd_bus_reply_method_error(c, error); + + fifo_fd = session_create_fifo(s); + if (fifo_fd < 0) + return fifo_fd; + + r = session_watch_pidfd(s); + if (r < 0) + return r; + + /* Update the session state file before we notify the client about the result. */ + session_save(s); + + p = session_bus_path(s); + if (!p) + return -ENOMEM; + + log_debug("Sending reply about created session: " + "id=%s object_path=%s uid=%u runtime_path=%s " + "session_fd=%d seat=%s vtnr=%u", + s->id, + p, + (uint32_t) s->user->user_record->uid, + s->user->runtime_path, + fifo_fd, + s->seat ? s->seat->id : "", + (uint32_t) s->vtnr); + + return sd_bus_reply_method_return( + c, "soshusub", + s->id, + p, + s->user->runtime_path, + fifo_fd, + (uint32_t) s->user->user_record->uid, + s->seat ? 
s->seat->id : "", + (uint32_t) s->vtnr, + false); +} + +static const sd_bus_vtable session_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Session, id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("User", "(uo)", property_get_user, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Name", "s", property_get_name, 0, SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("Timestamp", offsetof(Session, timestamp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("VTNr", "u", NULL, offsetof(Session, vtnr), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Seat", "(so)", property_get_seat, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("TTY", "s", NULL, offsetof(Session, tty), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Display", "s", NULL, offsetof(Session, display), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Remote", "b", bus_property_get_bool, offsetof(Session, remote), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoteHost", "s", NULL, offsetof(Session, remote_host), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoteUser", "s", NULL, offsetof(Session, remote_user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Service", "s", NULL, offsetof(Session, service), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Desktop", "s", NULL, offsetof(Session, desktop), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Scope", "s", NULL, offsetof(Session, scope), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Leader", "u", bus_property_get_pid, offsetof(Session, leader.pid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Audit", "u", NULL, offsetof(Session, audit_id), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Session, type), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Class", "s", property_get_class, offsetof(Session, class), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Active", "b", property_get_active, 0, 
SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("State", "s", property_get_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleHint", "b", property_get_idle_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHint", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHintMonotonic", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("LockedHint", "b", property_get_locked_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_METHOD("Terminate", + NULL, + NULL, + bus_session_method_terminate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Activate", + NULL, + NULL, + bus_session_method_activate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Lock", + NULL, + NULL, + bus_session_method_lock, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Unlock", + NULL, + NULL, + bus_session_method_lock, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetIdleHint", + SD_BUS_ARGS("b", idle), + SD_BUS_NO_RESULT, + method_set_idle_hint, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLockedHint", + SD_BUS_ARGS("b", locked), + SD_BUS_NO_RESULT, + method_set_locked_hint, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Kill", + SD_BUS_ARGS("s", who, "i", signal_number), + SD_BUS_NO_RESULT, + bus_session_method_kill, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TakeControl", + SD_BUS_ARGS("b", force), + SD_BUS_NO_RESULT, + method_take_control, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("ReleaseControl", + NULL, + NULL, + method_release_control, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetType", + SD_BUS_ARGS("s", type), + SD_BUS_NO_RESULT, + method_set_type, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDisplay", + SD_BUS_ARGS("s", display), + SD_BUS_NO_RESULT, + method_set_display, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetTTY", + 
SD_BUS_ARGS("h", tty_fd), + SD_BUS_NO_RESULT, + method_set_tty, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TakeDevice", + SD_BUS_ARGS("u", major, "u", minor), + SD_BUS_RESULT("h", fd, "b", inactive), + method_take_device, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReleaseDevice", + SD_BUS_ARGS("u", major, "u", minor), + SD_BUS_NO_RESULT, + method_release_device, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("PauseDeviceComplete", + SD_BUS_ARGS("u", major, "u", minor), + SD_BUS_NO_RESULT, + method_pause_device_complete, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetBrightness", + SD_BUS_ARGS("s", subsystem, "s", name, "u", brightness), + SD_BUS_NO_RESULT, + method_set_brightness, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_SIGNAL_WITH_ARGS("PauseDevice", + SD_BUS_ARGS("u", major, "u", minor, "s", type), + 0), + SD_BUS_SIGNAL_WITH_ARGS("ResumeDevice", + SD_BUS_ARGS("u", major, "u", minor, "h", fd), + 0), + SD_BUS_SIGNAL("Lock", NULL, 0), + SD_BUS_SIGNAL("Unlock", NULL, 0), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation session_object = { + "/org/freedesktop/login1/session", + "org.freedesktop.login1.Session", + .fallback_vtables = BUS_FALLBACK_VTABLES({session_vtable, session_object_find}), + .node_enumerator = session_node_enumerator, +}; diff --git a/src/login/logind-session-dbus.h b/src/login/logind-session-dbus.h new file mode 100644 index 0000000..751ca86 --- /dev/null +++ b/src/login/logind-session-dbus.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" +#include "logind-session.h" + +extern const BusObjectImplementation session_object; + +char *session_bus_path(Session *s); + +int session_send_signal(Session *s, bool new_session); +int session_send_changed(Session *s, const char *properties, ...) 
_sentinel_; +int session_send_lock(Session *s, bool lock); +int session_send_lock_all(Manager *m, bool lock); + +int session_send_create_reply(Session *s, sd_bus_error *error); + +int bus_session_method_activate(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_session_method_lock(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_session_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_session_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/login/logind-session-device.c b/src/login/logind-session-device.c new file mode 100644 index 0000000..44d8d52 --- /dev/null +++ b/src/login/logind-session-device.c @@ -0,0 +1,507 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "bus-util.h" +#include "daemon-util.h" +#include "fd-util.h" +#include "logind-session-dbus.h" +#include "logind-session-device.h" +#include "missing_drm.h" +#include "missing_input.h" +#include "parse-util.h" + +enum SessionDeviceNotifications { + SESSION_DEVICE_RESUME, + SESSION_DEVICE_TRY_PAUSE, + SESSION_DEVICE_PAUSE, + SESSION_DEVICE_RELEASE, +}; + +static int session_device_notify(SessionDevice *sd, enum SessionDeviceNotifications type) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *path = NULL; + const char *t = NULL; + uint32_t major, minor; + int r; + + assert(sd); + + major = major(sd->dev); + minor = minor(sd->dev); + + if (!sd->session->controller) + return 0; + + path = session_bus_path(sd->session); + if (!path) + return -ENOMEM; + + r = sd_bus_message_new_signal( + sd->session->manager->bus, + &m, path, + "org.freedesktop.login1.Session", + type == SESSION_DEVICE_RESUME ? 
"ResumeDevice" : "PauseDevice"); + if (!m) + return r; + + r = sd_bus_message_set_destination(m, sd->session->controller); + if (r < 0) + return r; + + switch (type) { + + case SESSION_DEVICE_RESUME: + r = sd_bus_message_append(m, "uuh", major, minor, sd->fd); + if (r < 0) + return r; + break; + + case SESSION_DEVICE_TRY_PAUSE: + t = "pause"; + break; + + case SESSION_DEVICE_PAUSE: + t = "force"; + break; + + case SESSION_DEVICE_RELEASE: + t = "gone"; + break; + + default: + return -EINVAL; + } + + if (t) { + r = sd_bus_message_append(m, "uus", major, minor, t); + if (r < 0) + return r; + } + + return sd_bus_send(sd->session->manager->bus, m, NULL); +} + +static void sd_eviocrevoke(int fd) { + static bool warned = false; + + assert(fd >= 0); + + if (ioctl(fd, EVIOCREVOKE, NULL) < 0) { + + if (errno == EINVAL && !warned) { + log_warning_errno(errno, "Kernel does not support evdev-revocation: %m"); + warned = true; + } + } +} + +static int sd_drmsetmaster(int fd) { + assert(fd >= 0); + return RET_NERRNO(ioctl(fd, DRM_IOCTL_SET_MASTER, 0)); +} + +static int sd_drmdropmaster(int fd) { + assert(fd >= 0); + return RET_NERRNO(ioctl(fd, DRM_IOCTL_DROP_MASTER, 0)); +} + +static int session_device_open(SessionDevice *sd, bool active) { + int fd, r; + + assert(sd); + assert(sd->type != DEVICE_TYPE_UNKNOWN); + assert(sd->node); + + /* open device and try to get a udev_device from it */ + fd = open(sd->node, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (fd < 0) + return -errno; + + switch (sd->type) { + + case DEVICE_TYPE_DRM: + if (active) { + /* Weird legacy DRM semantics might return an error even though we're master. No way to detect + * that so fail at all times and let caller retry in inactive state. */ + r = sd_drmsetmaster(fd); + if (r < 0) { + (void) close_nointr(fd); + return r; + } + } else + /* DRM-Master is granted to the first user who opens a device automatically (ughh, + * racy!). Hence, we just drop DRM-Master in case we were the first. 
*/ + (void) sd_drmdropmaster(fd); + break; + + case DEVICE_TYPE_EVDEV: + if (!active) + sd_eviocrevoke(fd); + break; + + case DEVICE_TYPE_UNKNOWN: + default: + /* fallback for devices without synchronizations */ + break; + } + + return fd; +} + +static int session_device_start(SessionDevice *sd) { + int r; + + assert(sd); + assert(session_is_active(sd->session)); + + if (sd->active) + return 0; + + switch (sd->type) { + + case DEVICE_TYPE_DRM: + if (sd->fd < 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), + "Failed to re-activate DRM fd, as the fd was lost (maybe logind restart went wrong?)"); + + /* Device is kept open. Simply call drmSetMaster() and hope there is no-one else. In case it fails, we + * keep the device paused. Maybe at some point we have a drmStealMaster(). */ + r = sd_drmsetmaster(sd->fd); + if (r < 0) + return r; + break; + + case DEVICE_TYPE_EVDEV: + /* Evdev devices are revoked while inactive. Reopen it and we are fine. */ + r = session_device_open(sd, true); + if (r < 0) + return r; + + /* For evdev devices, the file descriptor might be left uninitialized. This might happen while resuming + * into a session and logind has been restarted right before. */ + close_and_replace(sd->fd, r); + break; + + case DEVICE_TYPE_UNKNOWN: + default: + /* fallback for devices without synchronizations */ + break; + } + + sd->active = true; + return 0; +} + +static void session_device_stop(SessionDevice *sd) { + assert(sd); + + if (!sd->active) + return; + + switch (sd->type) { + + case DEVICE_TYPE_DRM: + if (sd->fd < 0) { + log_error("Failed to de-activate DRM fd, as the fd was lost (maybe logind restart went wrong?)"); + return; + } + + /* On DRM devices we simply drop DRM-Master but keep it open. + * This allows the user to keep resources allocated. The + * CAP_SYS_ADMIN restriction to DRM-Master prevents users from + * circumventing this. 
*/ + sd_drmdropmaster(sd->fd); + break; + + case DEVICE_TYPE_EVDEV: + /* Revoke access on evdev file-descriptors during deactivation. + * This will basically prevent any operations on the fd and + * cannot be undone. Good side is: it needs no CAP_SYS_ADMIN + * protection this way. */ + sd_eviocrevoke(sd->fd); + break; + + case DEVICE_TYPE_UNKNOWN: + default: + /* fallback for devices without synchronization */ + break; + } + + sd->active = false; +} + +static DeviceType detect_device_type(sd_device *dev) { + const char *sysname, *subsystem; + DeviceType type = DEVICE_TYPE_UNKNOWN; + + if (sd_device_get_sysname(dev, &sysname) < 0 || + sd_device_get_subsystem(dev, &subsystem) < 0) + return type; + + if (streq(subsystem, "drm")) { + if (startswith(sysname, "card")) + type = DEVICE_TYPE_DRM; + } else if (streq(subsystem, "input")) { + if (startswith(sysname, "event")) + type = DEVICE_TYPE_EVDEV; + } + + return type; +} + +static int session_device_verify(SessionDevice *sd) { + _cleanup_(sd_device_unrefp) sd_device *p = NULL; + const char *sp, *node; + sd_device *dev; + int r; + + r = sd_device_new_from_devnum(&p, 'c', sd->dev); + if (r < 0) + return r; + + dev = p; + + if (sd_device_get_syspath(dev, &sp) < 0 || + sd_device_get_devname(dev, &node) < 0) + return -EINVAL; + + /* detect device type so we can find the correct sysfs parent */ + sd->type = detect_device_type(dev); + + /* Prevent opening unsupported devices. Especially devices of + * subsystem "input" must be opened via the evdev node as + * we require EVIOCREVOKE. 
*/ + switch (sd->type) { + case DEVICE_TYPE_EVDEV: + /* for evdev devices we need the parent node as device */ + if (sd_device_get_parent_with_subsystem_devtype(p, "input", NULL, &dev) < 0) + return -ENODEV; + if (sd_device_get_syspath(dev, &sp) < 0) + return -ENODEV; + break; + + case DEVICE_TYPE_DRM: + break; + + case DEVICE_TYPE_UNKNOWN: + default: + return -ENODEV; + } + + /* search for an existing seat device and return it if available */ + sd->device = hashmap_get(sd->session->manager->devices, sp); + if (!sd->device) { + /* The caller might have gotten the udev event before we were + * able to process it. Hence, fake the "add" event and let the + * logind-manager handle the new device. */ + r = manager_process_seat_device(sd->session->manager, dev); + if (r < 0) + return r; + + /* if it's still not available, then the device is invalid */ + sd->device = hashmap_get(sd->session->manager->devices, sp); + if (!sd->device) + return -ENODEV; + } + + if (sd->device->seat != sd->session->seat) + return -EPERM; + + sd->node = strdup(node); + if (!sd->node) + return -ENOMEM; + + return 0; +} + +int session_device_new(Session *s, dev_t dev, bool open_device, SessionDevice **out) { + SessionDevice *sd; + int r; + + assert(s); + assert(out); + + if (!s->seat) + return -EPERM; + + sd = new0(SessionDevice, 1); + if (!sd) + return -ENOMEM; + + sd->session = s; + sd->dev = dev; + sd->fd = -EBADF; + sd->type = DEVICE_TYPE_UNKNOWN; + + r = session_device_verify(sd); + if (r < 0) + goto error; + + r = hashmap_put(s->devices, &sd->dev, sd); + if (r < 0) + goto error; + + if (open_device) { + /* Open the device for the first time. We need a valid fd to pass back + * to the caller. If the session is not active, this _might_ immediately + * revoke access and thus invalidate the fd. But this is still needed + * to pass a valid fd back. 
*/ + sd->active = session_is_active(s); + r = session_device_open(sd, sd->active); + if (r < 0) { + /* EINVAL _may_ mean a master is active; retry inactive */ + if (sd->active && r == -EINVAL) { + sd->active = false; + r = session_device_open(sd, false); + } + if (r < 0) + goto error; + } + sd->fd = r; + } + + LIST_PREPEND(sd_by_device, sd->device->session_devices, sd); + + *out = sd; + return 0; + +error: + hashmap_remove(s->devices, &sd->dev); + free(sd->node); + free(sd); + return r; +} + +SessionDevice *session_device_free(SessionDevice *sd) { + if (!sd) + return NULL; + + /* Make sure to remove the pushed fd. */ + if (sd->pushed_fd) + (void) notify_remove_fd_warnf("session-%s-device-%u-%u", sd->session->id, major(sd->dev), minor(sd->dev)); + + session_device_stop(sd); + session_device_notify(sd, SESSION_DEVICE_RELEASE); + safe_close(sd->fd); + + LIST_REMOVE(sd_by_device, sd->device->session_devices, sd); + + hashmap_remove(sd->session->devices, &sd->dev); + + free(sd->node); + + return mfree(sd); +} + +void session_device_complete_pause(SessionDevice *sd) { + SessionDevice *iter; + + if (!sd->active) + return; + + session_device_stop(sd); + + /* if not all devices are paused, wait for further completion events */ + HASHMAP_FOREACH(iter, sd->session->devices) + if (iter->active) + return; + + /* complete any pending session switch */ + seat_complete_switch(sd->session->seat); +} + +void session_device_resume_all(Session *s) { + SessionDevice *sd; + + assert(s); + + HASHMAP_FOREACH(sd, s->devices) { + if (sd->active) + continue; + + if (session_device_start(sd) < 0) + continue; + if (session_device_save(sd) < 0) + continue; + + session_device_notify(sd, SESSION_DEVICE_RESUME); + } +} + +void session_device_pause_all(Session *s) { + SessionDevice *sd; + + assert(s); + + HASHMAP_FOREACH(sd, s->devices) { + if (!sd->active) + continue; + + session_device_stop(sd); + session_device_notify(sd, SESSION_DEVICE_PAUSE); + } +} + +unsigned 
session_device_try_pause_all(Session *s) { + unsigned num_pending = 0; + SessionDevice *sd; + + assert(s); + + HASHMAP_FOREACH(sd, s->devices) { + if (!sd->active) + continue; + + session_device_notify(sd, SESSION_DEVICE_TRY_PAUSE); + num_pending++; + } + + return num_pending; +} + +int session_device_save(SessionDevice *sd) { + const char *id; + int r; + + assert(sd); + + /* Store device fd in PID1. It will send it back to us on restart so revocation will continue to work. To make + * things simple, send fds for all type of devices even if they don't support the revocation mechanism so we + * don't have to handle them differently later. + * + * Note: for device supporting revocation, PID1 will drop a stored fd automatically if the corresponding device + * is revoked. */ + + if (sd->pushed_fd) + return 0; + + /* Session ID does not contain separators. */ + id = sd->session->id; + assert(*(id + strcspn(id, "-\n")) == '\0'); + + r = notify_push_fdf(sd->fd, "session-%s-device-%u-%u", id, major(sd->dev), minor(sd->dev)); + if (r < 0) + return r; + + sd->pushed_fd = true; + return 1; +} + +void session_device_attach_fd(SessionDevice *sd, int fd, bool active) { + assert(fd >= 0); + assert(sd); + assert(sd->fd < 0); + assert(!sd->active); + + sd->fd = fd; + sd->pushed_fd = true; + sd->active = active; +} diff --git a/src/login/logind-session-device.h b/src/login/logind-session-device.h new file mode 100644 index 0000000..04654d1 --- /dev/null +++ b/src/login/logind-session-device.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef enum DeviceType DeviceType; +typedef struct SessionDevice SessionDevice; + +#include "list.h" +#include "logind.h" + +enum DeviceType { + DEVICE_TYPE_UNKNOWN, + DEVICE_TYPE_DRM, + DEVICE_TYPE_EVDEV, +}; + +struct SessionDevice { + Session *session; + Device *device; + + dev_t dev; + char *node; + int fd; + DeviceType type:3; + bool active:1; + bool pushed_fd:1; + + LIST_FIELDS(struct SessionDevice, 
sd_by_device); +}; + +int session_device_new(Session *s, dev_t dev, bool open_device, SessionDevice **out); +SessionDevice *session_device_free(SessionDevice *sd); +DEFINE_TRIVIAL_CLEANUP_FUNC(SessionDevice*, session_device_free); + +void session_device_complete_pause(SessionDevice *sd); + +void session_device_resume_all(Session *s); +void session_device_pause_all(Session *s); +unsigned session_device_try_pause_all(Session *s); + +int session_device_save(SessionDevice *sd); +void session_device_attach_fd(SessionDevice *sd, int fd, bool active); diff --git a/src/login/logind-session.c b/src/login/logind-session.c new file mode 100644 index 0000000..3988e55 --- /dev/null +++ b/src/login/logind-session.c @@ -0,0 +1,1624 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "audit-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "devnum-util.h" +#include "env-file.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "io-util.h" +#include "logind-dbus.h" +#include "logind-seat-dbus.h" +#include "logind-session-dbus.h" +#include "logind-session.h" +#include "logind-user-dbus.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "string-table.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "uid-alloc-range.h" +#include "user-util.h" + +#define RELEASE_USEC (20*USEC_PER_SEC) + +static void session_remove_fifo(Session *s); +static void session_restore_vt(Session *s); + +int session_new(Session **ret, Manager *m, const char *id) { + _cleanup_(session_freep) Session *s = NULL; + int r; + + assert(ret); + assert(m); + assert(id); + + if (!session_id_valid(id)) + return -EINVAL; + + s = new(Session, 1); + if (!s) + return 
-ENOMEM; + + *s = (Session) { + .manager = m, + .fifo_fd = -EBADF, + .vtfd = -EBADF, + .audit_id = AUDIT_SESSION_INVALID, + .tty_validity = _TTY_VALIDITY_INVALID, + .leader = PIDREF_NULL, + }; + + s->state_file = path_join("/run/systemd/sessions", id); + if (!s->state_file) + return -ENOMEM; + + s->id = basename(s->state_file); + + s->devices = hashmap_new(&devt_hash_ops); + if (!s->devices) + return -ENOMEM; + + r = hashmap_put(m->sessions, s->id, s); + if (r < 0) + return r; + + *ret = TAKE_PTR(s); + return 0; +} + +static void session_reset_leader(Session *s) { /* Unregisters s from sessions_by_leader and releases its leader PidRef; no-op when no leader is set. */ + assert(s); + + if (!pidref_is_set(&s->leader)) + return; + + (void) hashmap_remove_value(s->manager->sessions_by_leader, &s->leader, s); + + pidref_done(&s->leader); /* plain call: a void function must not 'return' an expression */ +} + +Session* session_free(Session *s) { + SessionDevice *sd; + + if (!s) + return NULL; + + if (s->in_gc_queue) + LIST_REMOVE(gc_queue, s->manager->session_gc_queue, s); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + + session_drop_controller(s); + + while ((sd = hashmap_first(s->devices))) + session_device_free(sd); + + hashmap_free(s->devices); + + if (s->user) { + LIST_REMOVE(sessions_by_user, s->user->sessions, s); + + if (s->user->display == s) + s->user->display = NULL; + + user_update_last_session_timer(s->user); + } + + if (s->seat) { + if (s->seat->active == s) + s->seat->active = NULL; + if (s->seat->pending_switch == s) + s->seat->pending_switch = NULL; + + seat_evict_position(s->seat, s); + LIST_REMOVE(sessions_by_seat, s->seat->sessions, s); + } + + if (s->scope) { + hashmap_remove(s->manager->session_units, s->scope); + free(s->scope); + } + + free(s->scope_job); + + session_reset_leader(s); + + sd_bus_message_unref(s->create_message); + + free(s->tty); + free(s->display); + free(s->remote_host); + free(s->remote_user); + free(s->service); + free(s->desktop); + + hashmap_remove_value(s->manager->sessions, s->id, s); /* remove only our own entry: session_new() also frees s when hashmap_put() failed, and must not evict another session with the same id */ + + sd_event_source_unref(s->fifo_event_source); + safe_close(s->fifo_fd); + + /* 
Note that we remove neither the state file nor the fifo path here, since we want both to survive + * daemon restarts */ + free(s->state_file); + free(s->fifo_path); + + sd_event_source_unref(s->stop_on_idle_event_source); + + return mfree(s); +} + +void session_set_user(Session *s, User *u) { + assert(s); + assert(!s->user); + + s->user = u; + LIST_PREPEND(sessions_by_user, u->sessions, s); + + user_update_last_session_timer(u); +} + +int session_set_leader_consume(Session *s, PidRef _leader) { + _cleanup_(pidref_done) PidRef pidref = _leader; + int r; + + assert(s); + assert(pidref_is_set(&pidref)); + + if (pidref_equal(&s->leader, &pidref)) + return 0; + + session_reset_leader(s); + + s->leader = TAKE_PIDREF(pidref); + + r = hashmap_ensure_put(&s->manager->sessions_by_leader, &pidref_hash_ops, &s->leader, s); + if (r < 0) + return r; + assert(r > 0); + + (void) audit_session_from_pid(s->leader.pid, &s->audit_id); + + return 1; +} + +static void session_save_devices(Session *s, FILE *f) { + SessionDevice *sd; + + if (!hashmap_isempty(s->devices)) { + fprintf(f, "DEVICES="); + HASHMAP_FOREACH(sd, s->devices) + fprintf(f, DEVNUM_FORMAT_STR " ", DEVNUM_FORMAT_VAL(sd->dev)); + fprintf(f, "\n"); + } +} + +int session_save(Session *s) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(s); + + if (!s->user) + return -ESTALE; + + if (!s->started) + return 0; + + r = mkdir_safe_label("/run/systemd/sessions", 0755, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + goto fail; + + r = fopen_temporary(s->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + (void) fchmod(fileno(f), 0644); + + fprintf(f, + "# This is private data. 
Do not parse.\n" + "UID="UID_FMT"\n" + "USER=%s\n" + "ACTIVE=%i\n" + "IS_DISPLAY=%i\n" + "STATE=%s\n" + "REMOTE=%i\n", + s->user->user_record->uid, + s->user->user_record->user_name, + session_is_active(s), + s->user->display == s, + session_state_to_string(session_get_state(s)), + s->remote); + + if (s->type >= 0) + fprintf(f, "TYPE=%s\n", session_type_to_string(s->type)); + + if (s->original_type >= 0) + fprintf(f, "ORIGINAL_TYPE=%s\n", session_type_to_string(s->original_type)); + + if (s->class >= 0) + fprintf(f, "CLASS=%s\n", session_class_to_string(s->class)); + + if (s->scope) + fprintf(f, "SCOPE=%s\n", s->scope); + if (s->scope_job) + fprintf(f, "SCOPE_JOB=%s\n", s->scope_job); + + if (s->fifo_path) + fprintf(f, "FIFO=%s\n", s->fifo_path); + + if (s->seat) + fprintf(f, "SEAT=%s\n", s->seat->id); + + if (s->tty) + fprintf(f, "TTY=%s\n", s->tty); + + if (s->tty_validity >= 0) + fprintf(f, "TTY_VALIDITY=%s\n", tty_validity_to_string(s->tty_validity)); + + if (s->display) + fprintf(f, "DISPLAY=%s\n", s->display); + + if (s->remote_host) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->remote_host); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "REMOTE_HOST=%s\n", escaped); + } + + if (s->remote_user) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->remote_user); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "REMOTE_USER=%s\n", escaped); + } + + if (s->service) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->service); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "SERVICE=%s\n", escaped); + } + + if (s->desktop) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(s->desktop); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "DESKTOP=%s\n", escaped); + } + + if (s->seat && seat_has_vts(s->seat)) + fprintf(f, "VTNR=%u\n", s->vtnr); + + if (!s->vtnr) + fprintf(f, "POSITION=%u\n", s->position); + + if (pidref_is_set(&s->leader)) + 
fprintf(f, "LEADER="PID_FMT"\n", s->leader.pid); + + if (audit_session_is_valid(s->audit_id)) + fprintf(f, "AUDIT=%"PRIu32"\n", s->audit_id); + + if (dual_timestamp_is_set(&s->timestamp)) + fprintf(f, + "REALTIME="USEC_FMT"\n" + "MONOTONIC="USEC_FMT"\n", + s->timestamp.realtime, + s->timestamp.monotonic); + + if (s->controller) { + fprintf(f, "CONTROLLER=%s\n", s->controller); + session_save_devices(s, f); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, s->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + return 0; + +fail: + (void) unlink(s->state_file); + + return log_error_errno(r, "Failed to save session data %s: %m", s->state_file); +} + +static int session_load_devices(Session *s, const char *devices) { + int r = 0; + + assert(s); + + for (const char *p = devices;;) { + _cleanup_free_ char *word = NULL; + SessionDevice *sd; + dev_t dev; + int k; + + k = extract_first_word(&p, &word, NULL, 0); + if (k == 0) + break; + if (k < 0) { + r = k; + break; + } + + k = parse_devnum(word, &dev); + if (k < 0) { + r = k; + continue; + } + + /* The file descriptors for loaded devices will be reattached later. 
*/ + k = session_device_new(s, dev, false, &sd); + if (k < 0) + r = k; + } + + if (r < 0) + log_error_errno(r, "Loading session devices for session %s failed: %m", s->id); + + return r; +} + +int session_load(Session *s) { + _cleanup_free_ char *remote = NULL, + *seat = NULL, + *tty_validity = NULL, + *vtnr = NULL, + *state = NULL, + *position = NULL, + *leader = NULL, + *type = NULL, + *original_type = NULL, + *class = NULL, + *uid = NULL, + *realtime = NULL, + *monotonic = NULL, + *controller = NULL, + *active = NULL, + *devices = NULL, + *is_display = NULL; + + int k, r; + + assert(s); + + r = parse_env_file(NULL, s->state_file, + "REMOTE", &remote, + "SCOPE", &s->scope, + "SCOPE_JOB", &s->scope_job, + "FIFO", &s->fifo_path, + "SEAT", &seat, + "TTY", &s->tty, + "TTY_VALIDITY", &tty_validity, + "DISPLAY", &s->display, + "REMOTE_HOST", &s->remote_host, + "REMOTE_USER", &s->remote_user, + "SERVICE", &s->service, + "DESKTOP", &s->desktop, + "VTNR", &vtnr, + "STATE", &state, + "POSITION", &position, + "LEADER", &leader, + "TYPE", &type, + "ORIGINAL_TYPE", &original_type, + "CLASS", &class, + "UID", &uid, + "REALTIME", &realtime, + "MONOTONIC", &monotonic, + "CONTROLLER", &controller, + "ACTIVE", &active, + "DEVICES", &devices, + "IS_DISPLAY", &is_display); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", s->state_file); + + if (!s->user) { + uid_t u; + User *user; + + if (!uid) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "UID not specified for session %s", + s->id); + + r = parse_uid(uid, &u); + if (r < 0) { + log_error("Failed to parse UID value %s for session %s.", uid, s->id); + return r; + } + + user = hashmap_get(s->manager->users, UID_TO_PTR(u)); + if (!user) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "User of session %s not known.", + s->id); + + session_set_user(s, user); + } + + if (remote) { + k = parse_boolean(remote); + if (k >= 0) + s->remote = k; + } + + if (vtnr) + safe_atou(vtnr, &s->vtnr); + + if (seat && !s->seat) { 
+ Seat *o; + + o = hashmap_get(s->manager->seats, seat); + if (o) + r = seat_attach_session(o, s); + if (!o || r < 0) + log_error("Cannot attach session %s to seat %s", s->id, seat); + } + + if (!s->seat || !seat_has_vts(s->seat)) + s->vtnr = 0; + + if (position && s->seat) { + unsigned npos; + + safe_atou(position, &npos); + seat_claim_position(s->seat, s, npos); + } + + if (tty_validity) { + TTYValidity v; + + v = tty_validity_from_string(tty_validity); + if (v < 0) + log_debug("Failed to parse TTY validity: %s", tty_validity); + else + s->tty_validity = v; + } + + if (leader) { + _cleanup_(pidref_done) PidRef p = PIDREF_NULL; + + r = pidref_set_pidstr(&p, leader); + if (r < 0) + log_debug_errno(r, "Failed to parse leader PID of session: %s", leader); + else { + r = session_set_leader_consume(s, TAKE_PIDREF(p)); + if (r < 0) + log_warning_errno(r, "Failed to set session leader PID, ignoring: %m"); + } + } + + if (type) { + SessionType t; + + t = session_type_from_string(type); + if (t >= 0) + s->type = t; + } + + if (original_type) { + SessionType ot; + + ot = session_type_from_string(original_type); + if (ot >= 0) + s->original_type = ot; + } else + /* Pre-v246 compat: initialize original_type if not set in the state file */ + s->original_type = s->type; + + if (class) { + SessionClass c; + + c = session_class_from_string(class); + if (c >= 0) + s->class = c; + } + + if (streq_ptr(state, "closing")) + s->stopping = true; + + if (s->fifo_path) { + int fd; + + /* If we open an unopened pipe for reading we will not + get an EOF. to trigger an EOF we hence open it for + writing, but close it right away which then will + trigger the EOF. This will happen immediately if no + other process has the FIFO open for writing, i. e. + when the session died before logind (re)started. 
*/ + + fd = session_create_fifo(s); + safe_close(fd); + } + + if (realtime) + (void) deserialize_usec(realtime, &s->timestamp.realtime); + if (monotonic) + (void) deserialize_usec(monotonic, &s->timestamp.monotonic); + + if (active) { + k = parse_boolean(active); + if (k >= 0) + s->was_active = k; + } + + if (is_display) { + /* Note that when enumerating users are loaded before sessions, hence the display session to use is + * something we have to store along with the session and not the user, as in that case we couldn't + * apply it at the time we load the user. */ + + k = parse_boolean(is_display); + if (k < 0) + log_warning_errno(k, "Failed to parse IS_DISPLAY session property: %m"); + else if (k > 0) + s->user->display = s; + } + + if (controller) { + if (bus_name_has_owner(s->manager->bus, controller, NULL) > 0) { + session_set_controller(s, controller, false, false); + session_load_devices(s, devices); + } else + session_restore_vt(s); + } + + return r; +} + +int session_activate(Session *s) { + unsigned num_pending; + + assert(s); + assert(s->user); + + if (!s->seat) + return -EOPNOTSUPP; + + if (s->seat->active == s) + return 0; + + /* on seats with VTs, we let VTs manage session-switching */ + if (seat_has_vts(s->seat)) { + if (s->vtnr == 0) + return -EOPNOTSUPP; + + return chvt(s->vtnr); + } + + /* On seats without VTs, we implement session-switching in logind. We + * try to pause all session-devices and wait until the session + * controller acknowledged them. Once all devices are asleep, we simply + * switch the active session and be done. + * We save the session we want to switch to in seat->pending_switch and + * seat_complete_switch() will perform the final switch. 
*/ + + s->seat->pending_switch = s; + + /* if no devices are running, immediately perform the session switch */ + num_pending = session_device_try_pause_all(s); + if (!num_pending) + seat_complete_switch(s->seat); + + return 0; +} + +static int session_start_scope(Session *s, sd_bus_message *properties, sd_bus_error *error) { + int r; + + assert(s); + assert(s->user); + + if (!s->scope) { + _cleanup_strv_free_ char **after = NULL; + _cleanup_free_ char *scope = NULL; + const char *description; + + s->scope_job = mfree(s->scope_job); + + scope = strjoin("session-", s->id, ".scope"); + if (!scope) + return log_oom(); + + description = strjoina("Session ", s->id, " of User ", s->user->user_record->user_name); + + /* We usually want to order session scopes after systemd-user-sessions.service since the + * latter unit is used as login session barrier for unprivileged users. However the barrier + * doesn't apply for root as sysadmin should always be able to log in (and without waiting + * for any timeout to expire) in case something goes wrong during the boot process. Since + * ordering after systemd-user-sessions.service and the user instance is optional we make use + * of STRV_IGNORE with strv_new() to skip these order constraints when needed. */ + after = strv_new("systemd-logind.service", + s->user->runtime_dir_service, + !uid_is_system(s->user->user_record->uid) ? 
"systemd-user-sessions.service" : STRV_IGNORE, + s->user->service); + if (!after) + return log_oom(); + + r = manager_start_scope( + s->manager, + scope, + &s->leader, + s->user->slice, + description, + /* These two have StopWhenUnneeded= set, hence add a dep towards them */ + STRV_MAKE(s->user->runtime_dir_service, + s->user->service), + after, + user_record_home_directory(s->user->user_record), + properties, + error, + &s->scope_job); + if (r < 0) + return log_error_errno(r, "Failed to start session scope %s: %s", + scope, bus_error_message(error, r)); + + s->scope = TAKE_PTR(scope); + } + + (void) hashmap_put(s->manager->session_units, s->scope, s); + + return 0; +} + +static int session_dispatch_stop_on_idle(sd_event_source *source, uint64_t t, void *userdata) { + Session *s = userdata; + dual_timestamp ts; + int r, idle; + + assert(s); + + if (s->stopping) + return 0; + + idle = session_get_idle_hint(s, &ts); + if (idle) { + log_info("Session \"%s\" of user \"%s\" is idle, stopping.", s->id, s->user->user_record->user_name); + + return session_stop(s, /* force */ true); + } + + r = sd_event_source_set_time( + source, + usec_add(dual_timestamp_is_set(&ts) ? 
ts.monotonic : now(CLOCK_MONOTONIC), + s->manager->stop_idle_session_usec)); + if (r < 0) + return log_error_errno(r, "Failed to configure stop on idle session event source: %m"); + + r = sd_event_source_set_enabled(source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable stop on idle session event source: %m"); + + return 1; +} + +static int session_setup_stop_on_idle_timer(Session *s) { + int r; + + assert(s); + + if (s->manager->stop_idle_session_usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time_relative( + s->manager->event, + &s->stop_on_idle_event_source, + CLOCK_MONOTONIC, + s->manager->stop_idle_session_usec, + 0, + session_dispatch_stop_on_idle, s); + if (r < 0) + return log_error_errno(r, "Failed to add stop on idle session event source: %m"); + + return 0; +} + +int session_start(Session *s, sd_bus_message *properties, sd_bus_error *error) { + int r; + + assert(s); + + if (!s->user) + return -ESTALE; + + if (s->stopping) + return -EINVAL; + + if (s->started) + return 0; + + r = user_start(s->user); + if (r < 0) + return r; + + r = session_start_scope(s, properties, error); + if (r < 0) + return r; + + r = session_setup_stop_on_idle_timer(s); + if (r < 0) + return r; + + log_struct(s->class == SESSION_BACKGROUND ? 
LOG_DEBUG : LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_SESSION_START_STR, + "SESSION_ID=%s", s->id, + "USER_ID=%s", s->user->user_record->user_name, + "LEADER="PID_FMT, s->leader.pid, + LOG_MESSAGE("New session %s of user %s.", s->id, s->user->user_record->user_name)); + + if (!dual_timestamp_is_set(&s->timestamp)) + dual_timestamp_now(&s->timestamp); + + if (s->seat) + seat_read_active_vt(s->seat); + + s->started = true; + + user_elect_display(s->user); + + /* Save data */ + session_save(s); + user_save(s->user); + if (s->seat) + seat_save(s->seat); + + /* Send signals */ + session_send_signal(s, true); + user_send_changed(s->user, "Display", NULL); + + if (s->seat && s->seat->active == s) + seat_send_changed(s->seat, "ActiveSession", NULL); + + return 0; +} + +static int session_stop_scope(Session *s, bool force) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(s); + + if (!s->scope) + return 0; + + /* Let's always abandon the scope first. This tells systemd that we are not interested anymore, and everything + * that is left in the scope is "left-over". Informing systemd about this has the benefit that it will log + * when killing any processes left after this point. */ + r = manager_abandon_scope(s->manager, s->scope, &error); + if (r < 0) { + log_warning_errno(r, "Failed to abandon session scope, ignoring: %s", bus_error_message(&error, r)); + sd_bus_error_free(&error); + } + + s->scope_job = mfree(s->scope_job); + + /* Optionally, let's kill everything that's left now. */ + if (force || + (s->user->user_record->kill_processes != 0 && + (s->user->user_record->kill_processes > 0 || + manager_shall_kill(s->manager, s->user->user_record->user_name)))) { + + r = manager_stop_unit(s->manager, s->scope, force ? 
"replace" : "fail", &error, &s->scope_job); + if (r < 0) { + if (force) + return log_error_errno(r, "Failed to stop session scope: %s", bus_error_message(&error, r)); + + log_warning_errno(r, "Failed to stop session scope, ignoring: %s", bus_error_message(&error, r)); + } + } else { + + /* With no killing, this session is allowed to persist in "closing" state indefinitely. + * Therefore session stop and session removal may be two distinct events. + * Session stop is quite significant on its own, let's log it. */ + log_struct(s->class == SESSION_BACKGROUND ? LOG_DEBUG : LOG_INFO, + "SESSION_ID=%s", s->id, + "USER_ID=%s", s->user->user_record->user_name, + "LEADER="PID_FMT, s->leader.pid, + LOG_MESSAGE("Session %s logged out. Waiting for processes to exit.", s->id)); + } + + return 0; +} + +int session_stop(Session *s, bool force) { + int r; + + assert(s); + + /* This is called whenever we begin with tearing down a session record. It's called in four cases: explicit API + * request via the bus (either directly for the session object or for the seat or user object this session + * belongs to; 'force' is true), or due to automatic GC (i.e. scope vanished; 'force' is false), or because the + * session FIFO saw an EOF ('force' is false), or because the release timer hit ('force' is false). 
*/ + + if (!s->user) + return -ESTALE; + if (!s->started) + return 0; + if (s->stopping) + return 0; + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + s->leader_pidfd_event_source = sd_event_source_unref(s->leader_pidfd_event_source); + + if (s->seat) + seat_evict_position(s->seat, s); + + /* We are going down, don't care about FIFOs anymore */ + session_remove_fifo(s); + + /* Kill cgroup */ + r = session_stop_scope(s, force); + + s->stopping = true; + + user_elect_display(s->user); + + session_save(s); + user_save(s->user); + + return r; +} + +int session_finalize(Session *s) { + SessionDevice *sd; + + assert(s); + + if (!s->user) + return -ESTALE; + + if (s->started) + log_struct(s->class == SESSION_BACKGROUND ? LOG_DEBUG : LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_SESSION_STOP_STR, + "SESSION_ID=%s", s->id, + "USER_ID=%s", s->user->user_record->user_name, + "LEADER="PID_FMT, s->leader.pid, + LOG_MESSAGE("Removed session %s.", s->id)); + + s->timer_event_source = sd_event_source_unref(s->timer_event_source); + s->leader_pidfd_event_source = sd_event_source_unref(s->leader_pidfd_event_source); + + if (s->seat) + seat_evict_position(s->seat, s); + + /* Kill session devices */ + while ((sd = hashmap_first(s->devices))) + session_device_free(sd); + + (void) unlink(s->state_file); + session_add_to_gc_queue(s); + user_add_to_gc_queue(s->user); + + if (s->started) { + session_send_signal(s, false); + s->started = false; + } + + if (s->seat) { + if (s->seat->active == s) + seat_set_active(s->seat, NULL); + + seat_save(s->seat); + } + + session_reset_leader(s); + + user_save(s->user); + user_send_changed(s->user, "Display", NULL); + + return 0; +} + +static int release_timeout_callback(sd_event_source *es, uint64_t usec, void *userdata) { + Session *s = ASSERT_PTR(userdata); + + assert(es); + + session_stop(s, /* force = */ false); + return 0; +} + +int session_release(Session *s) { + assert(s); + + if (!s->started || s->stopping) + return 0; + + if 
(s->timer_event_source) + return 0; + + return sd_event_add_time_relative( + s->manager->event, + &s->timer_event_source, + CLOCK_MONOTONIC, + RELEASE_USEC, 0, + release_timeout_callback, s); +} + +bool session_is_active(Session *s) { + assert(s); + + if (!s->seat) + return true; + + return s->seat->active == s; +} + +static int get_tty_atime(const char *tty, usec_t *atime) { + _cleanup_free_ char *p = NULL; + struct stat st; + + assert(tty); + assert(atime); + + if (!path_is_absolute(tty)) { + p = path_join("/dev", tty); + if (!p) + return -ENOMEM; + + tty = p; + } else if (!path_startswith(tty, "/dev/")) + return -ENOENT; + + if (lstat(tty, &st) < 0) + return -errno; + + *atime = timespec_load(&st.st_atim); + return 0; +} + +static int get_process_ctty_atime(pid_t pid, usec_t *atime) { + _cleanup_free_ char *p = NULL; + int r; + + assert(pid > 0); + assert(atime); + + r = get_ctty(pid, NULL, &p); + if (r < 0) + return r; + + return get_tty_atime(p, atime); +} + +int session_get_idle_hint(Session *s, dual_timestamp *t) { + usec_t atime = 0, dtime = 0; + int r; + + assert(s); + + /* Graphical sessions have an explicit idle hint */ + if (SESSION_TYPE_IS_GRAPHICAL(s->type)) { + if (t) + *t = s->idle_hint_timestamp; + + return s->idle_hint; + } + + /* For sessions with an explicitly configured tty, let's check its atime */ + if (s->tty) { + r = get_tty_atime(s->tty, &atime); + if (r >= 0) + goto found_atime; + } + + /* For sessions with a leader but no explicitly configured tty, let's check the controlling tty of + * the leader */ + if (pidref_is_set(&s->leader)) { + r = get_process_ctty_atime(s->leader.pid, &atime); + if (r >= 0) + goto found_atime; + } + + if (t) + *t = DUAL_TIMESTAMP_NULL; + + return false; + +found_atime: + if (t) + dual_timestamp_from_realtime(t, atime); + + if (s->manager->idle_action_usec > 0 && s->manager->stop_idle_session_usec != USEC_INFINITY) + dtime = MIN(s->manager->idle_action_usec, s->manager->stop_idle_session_usec); + else if 
(s->manager->idle_action_usec > 0) + dtime = s->manager->idle_action_usec; + else if (s->manager->stop_idle_session_usec != USEC_INFINITY) + dtime = s->manager->stop_idle_session_usec; + else + return false; + + return usec_add(atime, dtime) <= now(CLOCK_REALTIME); +} + +int session_set_idle_hint(Session *s, bool b) { + assert(s); + + if (!SESSION_TYPE_IS_GRAPHICAL(s->type)) + return -ENOTTY; + + if (s->idle_hint == b) + return 0; + + s->idle_hint = b; + dual_timestamp_now(&s->idle_hint_timestamp); + + session_send_changed(s, "IdleHint", "IdleSinceHint", "IdleSinceHintMonotonic", NULL); + + if (s->seat) + seat_send_changed(s->seat, "IdleHint", "IdleSinceHint", "IdleSinceHintMonotonic", NULL); + + user_send_changed(s->user, "IdleHint", "IdleSinceHint", "IdleSinceHintMonotonic", NULL); + manager_send_changed(s->manager, "IdleHint", "IdleSinceHint", "IdleSinceHintMonotonic", NULL); + + return 1; +} + +int session_get_locked_hint(Session *s) { + assert(s); + + return s->locked_hint; +} + +void session_set_locked_hint(Session *s, bool b) { + assert(s); + + if (s->locked_hint == b) + return; + + s->locked_hint = b; + + session_send_changed(s, "LockedHint", NULL); +} + +void session_set_type(Session *s, SessionType t) { + assert(s); + + if (s->type == t) + return; + + s->type = t; + session_save(s); + + session_send_changed(s, "Type", NULL); +} + +int session_set_display(Session *s, const char *display) { + int r; + + assert(s); + assert(display); + + r = free_and_strdup(&s->display, display); + if (r <= 0) /* 0 means the strings were equal */ + return r; + + session_save(s); + + session_send_changed(s, "Display", NULL); + + return 1; +} + +int session_set_tty(Session *s, const char *tty) { + int r; + + assert(s); + assert(tty); + + r = free_and_strdup(&s->tty, tty); + if (r <= 0) /* 0 means the strings were equal */ + return r; + + session_save(s); + + session_send_changed(s, "TTY", NULL); + + return 1; +} + +static int session_dispatch_fifo(sd_event_source *es, int fd, 
uint32_t revents, void *userdata) { + Session *s = ASSERT_PTR(userdata); + + assert(s->fifo_fd == fd); + + /* EOF on the FIFO means the session died abnormally. */ + + session_remove_fifo(s); + session_stop(s, /* force = */ false); + + return 1; +} + +int session_create_fifo(Session *s) { + int r; + + assert(s); + + /* Create FIFO */ + if (!s->fifo_path) { + r = mkdir_safe_label("/run/systemd/sessions", 0755, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + return r; + + s->fifo_path = strjoin("/run/systemd/sessions/", s->id, ".ref"); + if (!s->fifo_path) + return -ENOMEM; + + if (mkfifo(s->fifo_path, 0600) < 0 && errno != EEXIST) + return -errno; + } + + /* Open reading side */ + if (s->fifo_fd < 0) { + s->fifo_fd = open(s->fifo_path, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (s->fifo_fd < 0) + return -errno; + } + + if (!s->fifo_event_source) { + r = sd_event_add_io(s->manager->event, &s->fifo_event_source, s->fifo_fd, 0, session_dispatch_fifo, s); + if (r < 0) + return r; + + /* Let's make sure we noticed dead sessions before we process new bus requests (which might + * create new sessions). 
*/ + r = sd_event_source_set_priority(s->fifo_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) + return r; + } + + /* Open writing side */ + return RET_NERRNO(open(s->fifo_path, O_WRONLY|O_CLOEXEC|O_NONBLOCK)); +} + +static void session_remove_fifo(Session *s) { + assert(s); + + s->fifo_event_source = sd_event_source_unref(s->fifo_event_source); + s->fifo_fd = safe_close(s->fifo_fd); + + if (s->fifo_path) { + (void) unlink(s->fifo_path); + s->fifo_path = mfree(s->fifo_path); + } +} + +static int session_dispatch_leader_pidfd(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Session *s = ASSERT_PTR(userdata); + + assert(s->leader.fd == fd); + session_stop(s, /* force= */ false); + + return 1; +} + +int session_watch_pidfd(Session *s) { + int r; + + assert(s); + + if (s->leader.fd < 0) + return 0; + + r = sd_event_add_io(s->manager->event, &s->leader_pidfd_event_source, s->leader.fd, EPOLLIN, session_dispatch_leader_pidfd, s); + if (r < 0) + return r; + + r = sd_event_source_set_priority(s->leader_pidfd_event_source, SD_EVENT_PRIORITY_IMPORTANT); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->leader_pidfd_event_source, "session-pidfd"); + + return 0; +} + +bool session_may_gc(Session *s, bool drop_not_started) { + int r; + + assert(s); + + if (drop_not_started && !s->started) + return true; + + if (!s->user) + return true; + + r = pidref_is_alive(&s->leader); + if (r < 0) + log_debug_errno(r, "Unable to determine if leader PID " PID_FMT " is still alive, assuming not.", s->leader.pid); + if (r > 0) + return false; + + if (s->fifo_fd >= 0) { + if (pipe_eof(s->fifo_fd) <= 0) + return false; + } + + if (s->scope_job) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + r = manager_job_is_active(s->manager, s->scope_job, &error); + if (r < 0) + log_debug_errno(r, "Failed to determine whether job '%s' is pending, ignoring: %s", s->scope_job, bus_error_message(&error, r)); + if (r != 0) + return 
false; + } + + if (s->scope) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + r = manager_unit_is_active(s->manager, s->scope, &error); + if (r < 0) + log_debug_errno(r, "Failed to determine whether unit '%s' is active, ignoring: %s", s->scope, bus_error_message(&error, r)); + if (r != 0) + return false; + } + + return true; +} + +void session_add_to_gc_queue(Session *s) { + assert(s); + + if (s->in_gc_queue) + return; + + LIST_PREPEND(gc_queue, s->manager->session_gc_queue, s); + s->in_gc_queue = true; +} + +SessionState session_get_state(Session *s) { + assert(s); + + /* always check closing first */ + if (s->stopping || s->timer_event_source) + return SESSION_CLOSING; + + if (s->scope_job || (!pidref_is_set(&s->leader) && s->fifo_fd < 0)) + return SESSION_OPENING; + + if (session_is_active(s)) + return SESSION_ACTIVE; + + return SESSION_ONLINE; +} + +int session_kill(Session *s, KillWho who, int signo) { + assert(s); + + if (!s->scope) + return -ESRCH; + + return manager_kill_unit(s->manager, s->scope, who, signo, NULL); +} + +static int session_open_vt(Session *s, bool reopen) { + _cleanup_close_ int fd = -EBADF; + char path[sizeof("/dev/tty") + DECIMAL_STR_MAX(s->vtnr)]; + + assert(s); + + if (s->vtnr < 1) + return -ENODEV; + + if (!reopen && s->vtfd >= 0) + return s->vtfd; + + sprintf(path, "/dev/tty%u", s->vtnr); + + fd = open_terminal(path, O_RDWR | O_CLOEXEC | O_NONBLOCK | O_NOCTTY); + if (fd < 0) + return log_error_errno(fd, "Cannot open VT %s of session %s: %m", path, s->id); + + close_and_replace(s->vtfd, fd); + return s->vtfd; +} + +static int session_prepare_vt(Session *s) { + int vt, r; + struct vt_mode mode = {}; + + assert(s); + + if (s->vtnr < 1) + return 0; + + vt = session_open_vt(s, /* reopen = */ false); + if (vt < 0) + return vt; + + r = fchown(vt, s->user->user_record->uid, -1); + if (r < 0) { + r = log_error_errno(errno, + "Cannot change owner of /dev/tty%u: %m", + s->vtnr); + goto error; + } + + r = ioctl(vt, 
KDSKBMODE, K_OFF); + if (r < 0) { + r = log_error_errno(errno, + "Cannot set K_OFF on /dev/tty%u: %m", + s->vtnr); + goto error; + } + + r = ioctl(vt, KDSETMODE, KD_GRAPHICS); + if (r < 0) { + r = log_error_errno(errno, + "Cannot set KD_GRAPHICS on /dev/tty%u: %m", + s->vtnr); + goto error; + } + + /* Oh, thanks to the VT layer, VT_AUTO does not work with KD_GRAPHICS. + * So we need a dummy handler here which just acknowledges *all* VT + * switch requests. */ + mode.mode = VT_PROCESS; + mode.relsig = SIGRTMIN; + mode.acqsig = SIGRTMIN + 1; + r = ioctl(vt, VT_SETMODE, &mode); + if (r < 0) { + r = log_error_errno(errno, + "Cannot set VT_PROCESS on /dev/tty%u: %m", + s->vtnr); + goto error; + } + + return 0; + +error: + session_restore_vt(s); + return r; +} + +static void session_restore_vt(Session *s) { + int r; + + assert(s); + + if (s->vtfd < 0) + return; + + r = vt_restore(s->vtfd); + if (r == -EIO) { + /* It might happen if the controlling process exited before or while we were + * restoring the VT as it would leave the old file-descriptor in a hung-up + * state. In this case let's retry with a fresh handle to the virtual terminal. */ + + /* We do a little dance to avoid having the terminal be available + * for reuse before we've cleaned it up. */ + + int fd = session_open_vt(s, /* reopen = */ true); + if (fd >= 0) + r = vt_restore(fd); + } + if (r < 0) + log_warning_errno(r, "Failed to restore VT, ignoring: %m"); + + s->vtfd = safe_close(s->vtfd); +} + +void session_leave_vt(Session *s) { + int r; + + assert(s); + + /* This is called whenever we get a VT-switch signal from the kernel. + * We acknowledge all of them unconditionally. Note that session are + * free to overwrite those handlers and we only register them for + * sessions with controllers. Legacy sessions are not affected. + * However, if we switch from a non-legacy to a legacy session, we must + * make sure to pause all device before acknowledging the switch. 
We + * process the real switch only after we are notified via sysfs, so the + * legacy session might have already started using the devices. If we + * don't pause the devices before the switch, we might confuse the + * session we switch to. */ + + if (s->vtfd < 0) + return; + + session_device_pause_all(s); + r = vt_release(s->vtfd, /* restore = */ false); + if (r == -EIO) { + /* Handle the same VT hung-up case as in session_restore_vt */ + + int fd = session_open_vt(s, /* reopen = */ true); + if (fd >= 0) + r = vt_release(fd, /* restore = */ false); + } + if (r < 0) + log_debug_errno(r, "Cannot release VT of session %s: %m", s->id); +} + +bool session_is_controller(Session *s, const char *sender) { + return streq_ptr(ASSERT_PTR(s)->controller, sender); +} + +static void session_release_controller(Session *s, bool notify) { + _unused_ _cleanup_free_ char *name = NULL; + SessionDevice *sd; + + assert(s); + + if (!s->controller) + return; + + name = s->controller; + + /* By resetting the controller before releasing the devices, we won't send notification signals. + * This avoids sending useless notifications if the controller is released on disconnects. 
*/ + if (!notify) + s->controller = NULL; + + while ((sd = hashmap_first(s->devices))) + session_device_free(sd); + + s->controller = NULL; + s->track = sd_bus_track_unref(s->track); +} + +static int on_bus_track(sd_bus_track *track, void *userdata) { + Session *s = ASSERT_PTR(userdata); + + assert(track); + + session_drop_controller(s); + + return 0; +} + +int session_set_controller(Session *s, const char *sender, bool force, bool prepare) { + _cleanup_free_ char *name = NULL; + int r; + + assert(s); + assert(sender); + + if (session_is_controller(s, sender)) + return 0; + if (s->controller && !force) + return -EBUSY; + + name = strdup(sender); + if (!name) + return -ENOMEM; + + s->track = sd_bus_track_unref(s->track); + r = sd_bus_track_new(s->manager->bus, &s->track, on_bus_track, s); + if (r < 0) + return r; + + r = sd_bus_track_add_name(s->track, name); + if (r < 0) + return r; + + /* When setting a session controller, we forcibly mute the VT and set + * it into graphics-mode. Applications can override that by changing + * VT state after calling TakeControl(). However, this serves as a good + * default and well-behaving controllers can now ignore VTs entirely. + * Note that we reset the VT on ReleaseControl() and if the controller + * exits. + * If logind crashes/restarts, we restore the controller during restart + * (without preparing the VT since the controller has probably overridden + * VT state by now) or reset the VT in case it crashed/exited, too. 
*/ + if (prepare) { + r = session_prepare_vt(s); + if (r < 0) { + s->track = sd_bus_track_unref(s->track); /* undo the tracker that was set up for the new controller */ + return r; + } + } + + session_release_controller(s, true); /* release any previous controller together with its devices */ + s->controller = TAKE_PTR(name); /* hand the duplicated name to the session (TAKE_PTR presumably clears the local so it is not freed on return) */ + session_save(s); + + return 0; +} +/* Drops the current controller, if there is one: unrefs the bus tracker, switches the session type back to original_type, releases the controller with notify=false, saves the session and restores the VT. */ +void session_drop_controller(Session *s) { + assert(s); + + if (!s->controller) + return; + + s->track = sd_bus_track_unref(s->track); + session_set_type(s, s->original_type); + session_release_controller(s, false); + session_save(s); + session_restore_vt(s); +} +/* Names of the SessionState values, indexed by enum value and sized by _SESSION_STATE_MAX; fed to DEFINE_STRING_TABLE_LOOKUP below (presumably generating session_state_to_string() and session_state_from_string()). */ +static const char* const session_state_table[_SESSION_STATE_MAX] = { + [SESSION_OPENING] = "opening", + [SESSION_ONLINE] = "online", + [SESSION_ACTIVE] = "active", + [SESSION_CLOSING] = "closing", +}; + +DEFINE_STRING_TABLE_LOOKUP(session_state, SessionState); +/* Names of the SessionType values, indexed by enum value. */ +static const char* const session_type_table[_SESSION_TYPE_MAX] = { + [SESSION_UNSPECIFIED] = "unspecified", + [SESSION_TTY] = "tty", + [SESSION_X11] = "x11", + [SESSION_WAYLAND] = "wayland", + [SESSION_MIR] = "mir", + [SESSION_WEB] = "web", +}; + +DEFINE_STRING_TABLE_LOOKUP(session_type, SessionType); +/* Names of the SessionClass values, indexed by enum value. */ +static const char* const session_class_table[_SESSION_CLASS_MAX] = { + [SESSION_USER] = "user", + [SESSION_GREETER] = "greeter", + [SESSION_LOCK_SCREEN] = "lock-screen", + [SESSION_BACKGROUND] = "background", +}; + +DEFINE_STRING_TABLE_LOOKUP(session_class, SessionClass); +/* Names of the KillWho values, indexed by enum value. */ +static const char* const kill_who_table[_KILL_WHO_MAX] = { + [KILL_LEADER] = "leader", + [KILL_ALL] = "all", +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho); +/* Names of the TTYValidity values, indexed by enum value. */ +static const char* const tty_validity_table[_TTY_VALIDITY_MAX] = { + [TTY_FROM_PAM] = "from-pam", + [TTY_FROM_UTMP] = "from-utmp", + [TTY_UTMP_INCONSISTENT] = "utmp-inconsistent", +}; + +DEFINE_STRING_TABLE_LOOKUP(tty_validity, TTYValidity); diff --git a/src/login/logind-session.h b/src/login/logind-session.h new file mode 100644 index 0000000..8b63843 --- /dev/null +++ b/src/login/logind-session.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma
once + +typedef struct Session Session; +typedef enum KillWho KillWho; + +#include "list.h" +#include "login-util.h" +#include "logind-user.h" +#include "pidref.h" +#include "string-util.h" +/* Lifecycle state of a session; the matching names live in session_state_table. */ +typedef enum SessionState { + SESSION_OPENING, /* Session scope is being created */ + SESSION_ONLINE, /* Logged in */ + SESSION_ACTIVE, /* Logged in and in the fg */ + SESSION_CLOSING, /* Logged out, but scope is still there */ + _SESSION_STATE_MAX, /* number of valid states; sizes session_state_table */ + _SESSION_STATE_INVALID = -EINVAL, +} SessionState; +/* Class of a session; the matching names (user, greeter, lock-screen, background) live in session_class_table. */ +typedef enum SessionClass { + SESSION_USER, + SESSION_GREETER, + SESSION_LOCK_SCREEN, + SESSION_BACKGROUND, + _SESSION_CLASS_MAX, /* number of valid classes; sizes session_class_table */ + _SESSION_CLASS_INVALID = -EINVAL, +} SessionClass; +/* Type of a session; the matching names live in session_type_table. */ +typedef enum SessionType { + SESSION_UNSPECIFIED, + SESSION_TTY, + SESSION_X11, + SESSION_WAYLAND, + SESSION_MIR, + SESSION_WEB, + _SESSION_TYPE_MAX, /* number of valid types; sizes session_type_table */ + _SESSION_TYPE_INVALID = -EINVAL, +} SessionType; +/* Evaluates to true when type is one of SESSION_X11, SESSION_WAYLAND or SESSION_MIR. */ +#define SESSION_TYPE_IS_GRAPHICAL(type) IN_SET(type, SESSION_X11, SESSION_WAYLAND, SESSION_MIR) +/* Target selector passed as the who argument of session_kill(); the typedef for this enum is declared at the top of this header. */ +enum KillWho { + KILL_LEADER, + KILL_ALL, + _KILL_WHO_MAX, /* number of valid values; sizes kill_who_table */ + _KILL_WHO_INVALID = -EINVAL, +}; +/* Origin or consistency of the tty recorded for a session; the matching names live in tty_validity_table. */ +typedef enum TTYValidity { + TTY_FROM_PAM, + TTY_FROM_UTMP, + TTY_UTMP_INCONSISTENT, /* may happen on ssh sessions with multiplexed TTYs */ + _TTY_VALIDITY_MAX, + _TTY_VALIDITY_INVALID = -EINVAL, +} TTYValidity; +/* Per-session state; only fields whose use is visible in the session code shown alongside this header are annotated. */ +struct Session { + Manager *manager; /* owning manager; its bus is used when creating the controller tracker */ + + const char *id; /* session identifier, used in log messages */ + unsigned position; + SessionType type; + SessionType original_type; /* type that session_drop_controller() switches back to */ + SessionClass class; + + char *state_file; + + User *user; + + dual_timestamp timestamp; + + char *display; + char *tty; + TTYValidity tty_validity; + + bool remote; + char *remote_user; + char *remote_host; + char *service; + char *desktop; + + char *scope; + char *scope_job; + + Seat *seat; + unsigned vtnr; + int vtfd; /* VT file descriptor; session_leave_vt() does nothing while this is negative */ + + PidRef leader; + uint32_t audit_id; + + int fifo_fd; + char *fifo_path; + + sd_event_source *fifo_event_source; + sd_event_source *leader_pidfd_event_source; + + bool idle_hint; + dual_timestamp idle_hint_timestamp; + + bool locked_hint; + +
bool in_gc_queue:1; + bool started:1; + bool stopping:1; + + bool was_active:1; + + sd_bus_message *create_message; + + /* Set up when a client requested to release the session via the bus */ + sd_event_source *timer_event_source; + + char *controller; + Hashmap *devices; + sd_bus_track *track; + + sd_event_source *stop_on_idle_event_source; + + LIST_FIELDS(Session, sessions_by_user); + LIST_FIELDS(Session, sessions_by_seat); + + LIST_FIELDS(Session, gc_queue); +}; + +int session_new(Session **ret, Manager *m, const char *id); +Session* session_free(Session *s); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Session *, session_free); + +void session_set_user(Session *s, User *u); +int session_set_leader_consume(Session *s, PidRef _leader); +bool session_may_gc(Session *s, bool drop_not_started); +void session_add_to_gc_queue(Session *s); +int session_activate(Session *s); +bool session_is_active(Session *s); +int session_get_idle_hint(Session *s, dual_timestamp *t); +int session_set_idle_hint(Session *s, bool b); +int session_get_locked_hint(Session *s); +void session_set_locked_hint(Session *s, bool b); +void session_set_type(Session *s, SessionType t); +int session_set_display(Session *s, const char *display); +int session_set_tty(Session *s, const char *tty); +int session_create_fifo(Session *s); +int session_watch_pidfd(Session *s); +int session_start(Session *s, sd_bus_message *properties, sd_bus_error *error); +int session_stop(Session *s, bool force); +int session_finalize(Session *s); +int session_release(Session *s); +int session_save(Session *s); +int session_load(Session *s); +int session_kill(Session *s, KillWho who, int signo); + +SessionState session_get_state(Session *u); + +const char* session_state_to_string(SessionState t) _const_; +SessionState session_state_from_string(const char *s) _pure_; + +const char* session_type_to_string(SessionType t) _const_; +SessionType session_type_from_string(const char *s) _pure_; + +const char* 
session_class_to_string(SessionClass t) _const_; +SessionClass session_class_from_string(const char *s) _pure_; + +const char *kill_who_to_string(KillWho k) _const_; +KillWho kill_who_from_string(const char *s) _pure_; + +const char* tty_validity_to_string(TTYValidity t) _const_; +TTYValidity tty_validity_from_string(const char *s) _pure_; + +void session_leave_vt(Session *s); + +bool session_is_controller(Session *s, const char *sender); +int session_set_controller(Session *s, const char *sender, bool force, bool prepare); +void session_drop_controller(Session *s); + +static inline bool SESSION_IS_SELF(const char *name) { + return isempty(name) || streq(name, "self"); +} + +static inline bool SESSION_IS_AUTO(const char *name) { + return streq_ptr(name, "auto"); +} diff --git a/src/login/logind-user-dbus.c b/src/login/logind-user-dbus.c new file mode 100644 index 0000000..88649b2 --- /dev/null +++ b/src/login/logind-user-dbus.c @@ -0,0 +1,421 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "format-util.h" +#include "logind-dbus.h" +#include "logind-session-dbus.h" +#include "logind-user-dbus.h" +#include "logind-user.h" +#include "logind.h" +#include "missing_capability.h" +#include "signal-util.h" +#include "strv.h" +#include "user-util.h" + +static int property_get_uid( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "u", (uint32_t) u->user_record->uid); +} + +static int property_get_gid( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return 
sd_bus_message_append(reply, "u", (uint32_t) u->user_record->gid); +} + +static int property_get_name( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "s", u->user_record->user_name); +} + +static BUS_DEFINE_PROPERTY_GET2(property_get_state, "s", User, user_get_state, user_state_to_string); + +static int property_get_display( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *p = NULL; + User *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + p = u->display ? session_bus_path(u->display) : strdup("/"); + if (!p) + return -ENOMEM; + + return sd_bus_message_append(reply, "(so)", u->display ? u->display->id : "", p); +} + +static int property_get_sessions( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(so)"); + if (r < 0) + return r; + + LIST_FOREACH(sessions_by_user, session, u->sessions) { + _cleanup_free_ char *p = NULL; + + p = session_bus_path(session); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(so)", session->id, p); + if (r < 0) + return r; + + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_idle_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "b", user_get_idle_hint(u, NULL) > 0); +} + +static int 
property_get_idle_since_hint( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + dual_timestamp t = DUAL_TIMESTAMP_NULL; + uint64_t k; + + assert(bus); + assert(reply); + + (void) user_get_idle_hint(u, &t); + k = streq(property, "IdleSinceHint") ? t.realtime : t.monotonic; + + return sd_bus_message_append(reply, "t", k); +} + +static int property_get_linger( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + User *u = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = user_check_linger_file(u); + + return sd_bus_message_append(reply, "b", r > 0); +} + +int bus_user_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + User *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.login1.manage", + NULL, + false, + u->user_record->uid, + &u->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = user_stop(u, /* force = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_user_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) { + User *u = ASSERT_PTR(userdata); + int32_t signo; + int r; + + assert(message); + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.login1.manage", + NULL, + false, + u->user_record->uid, + &u->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = sd_bus_message_read(message, "i", &signo); + if (r < 0) + return r; + + if (!SIGNAL_VALID(signo)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal %i", signo); + + r = user_kill(u, 
signo); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int user_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + uid_t uid; + User *user; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + if (streq(path, "/org/freedesktop/login1/user/self")) { + sd_bus_message *message; + + message = sd_bus_get_current_message(bus); + + r = manager_get_user_from_creds(m, message, UID_INVALID, error, &user); + if (r == -ENXIO) { + sd_bus_error_free(error); + return 0; + } + if (r < 0) + return r; + } else { + const char *p; + + p = startswith(path, "/org/freedesktop/login1/user/_"); + if (!p) + return 0; + + r = parse_uid(p, &uid); + if (r < 0) + return 0; + + user = hashmap_get(m->users, UID_TO_PTR(uid)); + if (!user) + return 0; + } + + *found = user; + return 1; +} + +char *user_bus_path(User *u) { + char *s; + + assert(u); + + if (asprintf(&s, "/org/freedesktop/login1/user/_"UID_FMT, u->user_record->uid) < 0) + return NULL; + + return s; +} + +static int user_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + sd_bus_message *message; + Manager *m = userdata; + User *user; + int r; + + assert(bus); + assert(path); + assert(nodes); + + HASHMAP_FOREACH(user, m->users) { + char *p; + + p = user_bus_path(user); + if (!p) + return -ENOMEM; + + r = strv_consume(&l, p); + if (r < 0) + return r; + } + + message = sd_bus_get_current_message(bus); + if (message) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_OWNER_UID|SD_BUS_CREDS_AUGMENT, &creds); + if (r >= 0) { + uid_t uid; + + r = sd_bus_creds_get_owner_uid(creds, &uid); + if (r >= 0) { + user = hashmap_get(m->users, UID_TO_PTR(uid)); + if (user) { + r = strv_extend(&l, 
"/org/freedesktop/login1/user/self"); + if (r < 0) + return r; + } + } + } + } + + *nodes = TAKE_PTR(l); + + return 1; +} + +static const sd_bus_vtable user_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("UID", "u", property_get_uid, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("GID", "u", property_get_gid, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Name", "s", property_get_name, 0, SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("Timestamp", offsetof(User, timestamp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RuntimePath", "s", NULL, offsetof(User, runtime_path), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Service", "s", NULL, offsetof(User, service), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Slice", "s", NULL, offsetof(User, slice), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Display", "(so)", property_get_display, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("State", "s", property_get_state, 0, 0), + SD_BUS_PROPERTY("Sessions", "a(so)", property_get_sessions, 0, 0), + SD_BUS_PROPERTY("IdleHint", "b", property_get_idle_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHint", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IdleSinceHintMonotonic", "t", property_get_idle_since_hint, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Linger", "b", property_get_linger, 0, 0), + + SD_BUS_METHOD("Terminate", NULL, NULL, bus_user_method_terminate, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Kill", + SD_BUS_ARGS("i", signal_number), + SD_BUS_NO_RESULT, + bus_user_method_kill, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation user_object = { + "/org/freedesktop/login1/user", + "org.freedesktop.login1.User", + .fallback_vtables = BUS_FALLBACK_VTABLES({user_vtable, user_object_find}), + .node_enumerator = user_node_enumerator, +}; + +int user_send_signal(User 
*u, bool new_user) { + _cleanup_free_ char *p = NULL; + + assert(u); + + p = user_bus_path(u); + if (!p) + return -ENOMEM; + + return sd_bus_emit_signal( + u->manager->bus, + "/org/freedesktop/login1", + "org.freedesktop.login1.Manager", + new_user ? "UserNew" : "UserRemoved", + "uo", (uint32_t) u->user_record->uid, p); +} + +int user_send_changed(User *u, const char *properties, ...) { + _cleanup_free_ char *p = NULL; + char **l; + + assert(u); + + if (!u->started) + return 0; + + p = user_bus_path(u); + if (!p) + return -ENOMEM; + + l = strv_from_stdarg_alloca(properties); + + return sd_bus_emit_properties_changed_strv(u->manager->bus, p, "org.freedesktop.login1.User", l); +} diff --git a/src/login/logind-user-dbus.h b/src/login/logind-user-dbus.h new file mode 100644 index 0000000..d2f24ce --- /dev/null +++ b/src/login/logind-user-dbus.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "logind-user.h" + +extern const BusObjectImplementation user_object; + +char *user_bus_path(User *s); + +int user_send_signal(User *u, bool new_user); +int user_send_changed(User *u, const char *properties, ...) 
_sentinel_; + +int bus_user_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_user_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/login/logind-user.c b/src/login/logind-user.c new file mode 100644 index 0000000..c613307 --- /dev/null +++ b/src/login/logind-user.c @@ -0,0 +1,940 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "cgroup-util.h" +#include "clean-ipc.h" +#include "env-file.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "label-util.h" +#include "limits-util.h" +#include "logind-dbus.h" +#include "logind-user-dbus.h" +#include "logind-user.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "rm-rf.h" +#include "serialize.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "uid-alloc-range.h" +#include "unit-name.h" +#include "user-util.h" + +int user_new(User **ret, + Manager *m, + UserRecord *ur) { + + _cleanup_(user_freep) User *u = NULL; + char lu[DECIMAL_STR_MAX(uid_t) + 1]; + int r; + + assert(ret); + assert(m); + assert(ur); + + if (!ur->user_name) + return -EINVAL; + + if (!uid_is_valid(ur->uid)) + return -EINVAL; + + u = new(User, 1); + if (!u) + return -ENOMEM; + + *u = (User) { + .manager = m, + .user_record = user_record_ref(ur), + .last_session_timestamp = USEC_INFINITY, + }; + + if (asprintf(&u->state_file, "/run/systemd/users/" UID_FMT, ur->uid) < 0) + return -ENOMEM; + + if (asprintf(&u->runtime_path, "/run/user/" UID_FMT, ur->uid) < 0) + return -ENOMEM; + + xsprintf(lu, UID_FMT, ur->uid); + r = slice_build_subslice(SPECIAL_USER_SLICE, lu, &u->slice); + 
if (r < 0) + return r; + + r = unit_name_build("user", lu, ".service", &u->service); + if (r < 0) + return r; + + r = unit_name_build("user-runtime-dir", lu, ".service", &u->runtime_dir_service); + if (r < 0) + return r; + + r = hashmap_put(m->users, UID_TO_PTR(ur->uid), u); + if (r < 0) + return r; + + r = hashmap_put(m->user_units, u->slice, u); + if (r < 0) + return r; + + r = hashmap_put(m->user_units, u->service, u); + if (r < 0) + return r; + + r = hashmap_put(m->user_units, u->runtime_dir_service, u); + if (r < 0) + return r; + + *ret = TAKE_PTR(u); + return 0; +} + +User *user_free(User *u) { + if (!u) + return NULL; + + if (u->in_gc_queue) + LIST_REMOVE(gc_queue, u->manager->user_gc_queue, u); + + while (u->sessions) + session_free(u->sessions); + + if (u->service) + hashmap_remove_value(u->manager->user_units, u->service, u); + + if (u->runtime_dir_service) + hashmap_remove_value(u->manager->user_units, u->runtime_dir_service, u); + + if (u->slice) + hashmap_remove_value(u->manager->user_units, u->slice, u); + + hashmap_remove_value(u->manager->users, UID_TO_PTR(u->user_record->uid), u); + + sd_event_source_unref(u->timer_event_source); + + u->service_job = mfree(u->service_job); + + u->service = mfree(u->service); + u->runtime_dir_service = mfree(u->runtime_dir_service); + u->slice = mfree(u->slice); + u->runtime_path = mfree(u->runtime_path); + u->state_file = mfree(u->state_file); + + user_record_unref(u->user_record); + + return mfree(u); +} + +static int user_save_internal(User *u) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(u); + assert(u->state_file); + + r = mkdir_safe_label("/run/systemd/users", 0755, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + goto fail; + + r = fopen_temporary(u->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + (void) fchmod(fileno(f), 0644); + + fprintf(f, + "# This is private data. 
Do not parse.\n" + "NAME=%s\n" + "STATE=%s\n" /* friendly user-facing state */ + "STOPPING=%s\n", /* low-level state */ + u->user_record->user_name, + user_state_to_string(user_get_state(u)), + yes_no(u->stopping)); + + /* LEGACY: no-one reads RUNTIME= anymore, drop it at some point */ + if (u->runtime_path) + fprintf(f, "RUNTIME=%s\n", u->runtime_path); + + if (u->service_job) + fprintf(f, "SERVICE_JOB=%s\n", u->service_job); + + if (u->display) + fprintf(f, "DISPLAY=%s\n", u->display->id); + + if (dual_timestamp_is_set(&u->timestamp)) + fprintf(f, + "REALTIME="USEC_FMT"\n" + "MONOTONIC="USEC_FMT"\n", + u->timestamp.realtime, + u->timestamp.monotonic); + + if (u->last_session_timestamp != USEC_INFINITY) + fprintf(f, "LAST_SESSION_TIMESTAMP=" USEC_FMT "\n", + u->last_session_timestamp); + + if (u->sessions) { + bool first; + + fputs("SESSIONS=", f); + first = true; + LIST_FOREACH(sessions_by_user, i, u->sessions) { + if (first) + first = false; + else + fputc(' ', f); + + fputs(i->id, f); + } + + fputs("\nSEATS=", f); + first = true; + LIST_FOREACH(sessions_by_user, i, u->sessions) { + if (!i->seat) + continue; + + if (first) + first = false; + else + fputc(' ', f); + + fputs(i->seat->id, f); + } + + fputs("\nACTIVE_SESSIONS=", f); + first = true; + LIST_FOREACH(sessions_by_user, i, u->sessions) { + if (!session_is_active(i)) + continue; + + if (first) + first = false; + else + fputc(' ', f); + + fputs(i->id, f); + } + + fputs("\nONLINE_SESSIONS=", f); + first = true; + LIST_FOREACH(sessions_by_user, i, u->sessions) { + if (session_get_state(i) == SESSION_CLOSING) + continue; + + if (first) + first = false; + else + fputc(' ', f); + + fputs(i->id, f); + } + + fputs("\nACTIVE_SEATS=", f); + first = true; + LIST_FOREACH(sessions_by_user, i, u->sessions) { + if (!session_is_active(i) || !i->seat) + continue; + + if (first) + first = false; + else + fputc(' ', f); + + fputs(i->seat->id, f); + } + + fputs("\nONLINE_SEATS=", f); + first = true; + 
LIST_FOREACH(sessions_by_user, i, u->sessions) { + if (session_get_state(i) == SESSION_CLOSING || !i->seat) + continue; + + if (first) + first = false; + else + fputc(' ', f); + + fputs(i->seat->id, f); + } + fputc('\n', f); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, u->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + return 0; + +fail: + (void) unlink(u->state_file); + + return log_error_errno(r, "Failed to save user data %s: %m", u->state_file); +} + +int user_save(User *u) { + assert(u); + + if (!u->started) + return 0; + + return user_save_internal(u); +} + +int user_load(User *u) { + _cleanup_free_ char *realtime = NULL, *monotonic = NULL, *stopping = NULL, *last_session_timestamp = NULL; + int r; + + assert(u); + + r = parse_env_file(NULL, u->state_file, + "SERVICE_JOB", &u->service_job, + "STOPPING", &stopping, + "REALTIME", &realtime, + "MONOTONIC", &monotonic, + "LAST_SESSION_TIMESTAMP", &last_session_timestamp); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", u->state_file); + + if (stopping) { + r = parse_boolean(stopping); + if (r < 0) + log_debug_errno(r, "Failed to parse 'STOPPING' boolean: %s", stopping); + else + u->stopping = r; + } + + if (realtime) + (void) deserialize_usec(realtime, &u->timestamp.realtime); + if (monotonic) + (void) deserialize_usec(monotonic, &u->timestamp.monotonic); + if (last_session_timestamp) + (void) deserialize_usec(last_session_timestamp, &u->last_session_timestamp); + + return 0; +} + +static void user_start_service(User *u) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(u); + + /* Start the service containing the "systemd --user" instance (user@.service). Note that we don't explicitly + * start the per-user slice or the systemd-runtime-dir@.service instance, as those are pulled in both by + * user@.service and the session scopes as dependencies. 
*/ + + u->service_job = mfree(u->service_job); + + r = manager_start_unit(u->manager, u->service, &error, &u->service_job); + if (r < 0) + log_full_errno(sd_bus_error_has_name(&error, BUS_ERROR_UNIT_MASKED) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to start user service '%s', ignoring: %s", u->service, bus_error_message(&error, r)); +} + +static int update_slice_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + _cleanup_(user_record_unrefp) UserRecord *ur = ASSERT_PTR(userdata); + const sd_bus_error *e; + int r; + + assert(m); + + e = sd_bus_message_get_error(m); + if (e) { + r = sd_bus_error_get_errno(e); + log_warning_errno(r, + "Failed to update slice of %s, ignoring: %s", + ur->user_name, + bus_error_message(e, r)); + + return 0; + } + + log_debug("Successfully set slice parameters of %s.", ur->user_name); + return 0; +} + +static int user_update_slice(User *u) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(u); + + if (u->user_record->tasks_max == UINT64_MAX && + u->user_record->memory_high == UINT64_MAX && + u->user_record->memory_max == UINT64_MAX && + u->user_record->cpu_weight == UINT64_MAX && + u->user_record->io_weight == UINT64_MAX) + return 0; + + r = bus_message_new_method_call(u->manager->bus, &m, bus_systemd_mgr, "SetUnitProperties"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "sb", u->slice, true); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + const struct { + const char *name; + uint64_t value; + } settings[] = { + { "TasksMax", u->user_record->tasks_max }, + { "MemoryMax", u->user_record->memory_max }, + { "MemoryHigh", u->user_record->memory_high }, + { "CPUWeight", u->user_record->cpu_weight }, + { "IOWeight", u->user_record->io_weight }, + }; + + for (size_t i = 0; i < ELEMENTSOF(settings); i++) + if (settings[i].value != UINT64_MAX) { + r = 
sd_bus_message_append(m, "(sv)", settings[i].name, "t", settings[i].value); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call_async(u->manager->bus, NULL, m, update_slice_callback, u->user_record, 0); + if (r < 0) + return log_error_errno(r, "Failed to change user slice properties: %m"); + + /* Ref the user record pointer, so that the slot keeps it pinned */ + user_record_ref(u->user_record); + + return 0; +} + +int user_start(User *u) { + assert(u); + + if (u->started && !u->stopping) + return 0; + + /* If u->stopping is set, the user is marked for removal and service stop-jobs are queued. We have to clear + * that flag before queueing the start-jobs again. If they succeed, the user object can be re-used just fine + * (pid1 takes care of job-ordering and proper restart), but if they fail, we want to force another user_stop() + * so possibly pending units are stopped. */ + u->stopping = false; + + if (!u->started) + log_debug("Starting services for new user %s.", u->user_record->user_name); + + /* Save the user data so far, because pam_systemd will read the XDG_RUNTIME_DIR out of it while starting up + * systemd --user. We need to do user_save_internal() because we have not "officially" started yet. */ + user_save_internal(u); + + /* Set slice parameters */ + (void) user_update_slice(u); + + /* Start user@UID.service */ + user_start_service(u); + + if (!u->started) { + if (!dual_timestamp_is_set(&u->timestamp)) + dual_timestamp_now(&u->timestamp); + user_send_signal(u, true); + u->started = true; + } + + /* Save new user data */ + user_save(u); + + return 0; +} + +static void user_stop_service(User *u, bool force) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(u); + assert(u->service); + + /* The reverse of user_start_service(). 
Note that we only stop user@UID.service here, and let StopWhenUnneeded=
+         * deal with the slice and the user-runtime-dir@.service instance. */
+
+        /* Forget any previously tracked job; manager_stop_unit() is handed &u->service_job to fill in again */
+        u->service_job = mfree(u->service_job);
+
+        r = manager_stop_unit(u->manager, u->service, force ? "replace" : "fail", &error, &u->service_job);
+        if (r < 0)
+                log_warning_errno(r, "Failed to stop user service '%s', ignoring: %s", u->service, bus_error_message(&error, r));
+}
+
+/* Initiates tear-down of a started user: stops all of its sessions and its service unit, marks it as
+ * stopping and saves the state. Returns 0 if the user was not started or was already stopping; otherwise
+ * returns the last negative result of session_stop(), or 0 if every session stop succeeded. */
+int user_stop(User *u, bool force) {
+        int r = 0;
+
+        assert(u);
+
+        /* This is called whenever we begin with tearing down a user record. It's called in two cases: explicit API
+         * request to do so via the bus (in which case 'force' is true) and automatically due to GC, if there's no
+         * session left pinning it (in which case 'force' is false). Note that this just initiates tearing down of the
+         * user, the User object will remain in memory until user_finalize() is called, see below. */
+
+        if (!u->started)
+                return 0;
+
+        if (u->stopping) { /* Stop jobs have already been queued */
+                user_save(u);
+                return 0;
+        }
+
+        /* Keep going on failure so that every session gets its stop request; only the last error is reported */
+        LIST_FOREACH(sessions_by_user, s, u->sessions) {
+                int k;
+
+                k = session_stop(s, force);
+                if (k < 0)
+                        r = k;
+        }
+
+        user_stop_service(u, force);
+
+        u->stopping = true;
+
+        user_save(u);
+
+        return r;
+}
+
+/* Final clean-up step for a user: finalizes its sessions, optionally removes its IPC objects, deletes the
+ * state file, queues the object for GC and emits UserRemoved. Returns the last negative result of
+ * session_finalize()/clean_ipc_by_uid(), or 0. */
+int user_finalize(User *u) {
+        int r = 0, k;
+
+        assert(u);
+
+        /* Called when the user is really ready to be freed, i.e. when all unit stop jobs and suchlike for it are
+         * done. This is called as a result of an earlier user_stop() when all jobs are completed. */
+
+        if (u->started)
+                log_debug("User %s logged out.", u->user_record->user_name);
+
+        LIST_FOREACH(sessions_by_user, s, u->sessions) {
+                k = session_finalize(s);
+                if (k < 0)
+                        r = k;
+        }
+
+        /* Clean SysV + POSIX IPC objects, but only if this is not a system user. Background: in many setups cronjobs
+         * are run in full PAM and thus logind sessions, even if the code run doesn't belong to actual users but to
+         *
Since enable RemoveIPC= globally for all users, we need to be a bit careful with such + * cases, as we shouldn't accidentally remove a system service's IPC objects while it is running, just because + * a cronjob running as the same user just finished. Hence: exclude system users generally from IPC clean-up, + * and do it only for normal users. */ + if (u->manager->remove_ipc && !uid_is_system(u->user_record->uid)) { + k = clean_ipc_by_uid(u->user_record->uid); + if (k < 0) + r = k; + } + + (void) unlink(u->state_file); + user_add_to_gc_queue(u); + + if (u->started) { + user_send_signal(u, false); + u->started = false; + } + + return r; +} + +int user_get_idle_hint(User *u, dual_timestamp *t) { + bool idle_hint = true; + dual_timestamp ts = DUAL_TIMESTAMP_NULL; + + assert(u); + + LIST_FOREACH(sessions_by_user, s, u->sessions) { + dual_timestamp k; + int ih; + + ih = session_get_idle_hint(s, &k); + if (ih < 0) + return ih; + + if (!ih) { + if (!idle_hint) { + if (k.monotonic < ts.monotonic) + ts = k; + } else { + idle_hint = false; + ts = k; + } + } else if (idle_hint) { + + if (k.monotonic > ts.monotonic) + ts = k; + } + } + + if (t) + *t = ts; + + return idle_hint; +} + +int user_check_linger_file(User *u) { + _cleanup_free_ char *cc = NULL; + char *p = NULL; + + cc = cescape(u->user_record->user_name); + if (!cc) + return -ENOMEM; + + p = strjoina("/var/lib/systemd/linger/", cc); + if (access(p, F_OK) < 0) { + if (errno != ENOENT) + return -errno; + + return false; + } + + return true; +} + +static bool user_unit_active(User *u) { + int r; + + assert(u->service); + assert(u->runtime_dir_service); + assert(u->slice); + + FOREACH_STRING(i, u->service, u->runtime_dir_service, u->slice) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + r = manager_unit_is_active(u->manager, i, &error); + if (r < 0) + log_debug_errno(r, "Failed to determine whether unit '%s' is active, ignoring: %s", i, bus_error_message(&error, r)); + if (r != 0) + return 
true;
+        }
+
+        return false;
+}
+
+/* Determines how long a user is kept around after its last session ended: the per-user value from the user
+ * record if one is set (i.e. not UINT64_MAX), otherwise zero for removable users, otherwise the manager-wide
+ * default. */
+static usec_t user_get_stop_delay(User *u) {
+        assert(u);
+
+        if (u->user_record->stop_delay_usec != UINT64_MAX)
+                return u->user_record->stop_delay_usec;
+
+        if (user_record_removable(u->user_record) > 0)
+                return 0; /* For removable users lower the stop delay to zero */
+
+        return u->manager->user_stop_delay;
+}
+
+/* Decides whether the user object may be garbage collected.
+ *
+ * Returns true if the user was never started and 'drop_not_started' is set. Otherwise returns false while
+ * any of the following holds: sessions still exist, the stop delay has not elapsed yet (or is infinite),
+ * lingering is enabled and one of the user's units is still active, or the tracked service job is still
+ * pending (a failed job query also counts as pending). Returns true in all other cases. */
+bool user_may_gc(User *u, bool drop_not_started) {
+        int r;
+
+        assert(u);
+
+        if (drop_not_started && !u->started)
+                return true;
+
+        if (u->sessions)
+                return false;
+
+        /* USEC_INFINITY means no "last session ended" time is recorded, so there is no delay to honour */
+        if (u->last_session_timestamp != USEC_INFINITY) {
+                usec_t user_stop_delay;
+
+                /* All sessions have been closed. Let's see if we shall leave the user record around for a bit */
+
+                user_stop_delay = user_get_stop_delay(u);
+
+                if (user_stop_delay == USEC_INFINITY)
+                        return false; /* Leave it around forever! */
+                if (user_stop_delay > 0 &&
+                    now(CLOCK_MONOTONIC) < usec_add(u->last_session_timestamp, user_stop_delay))
+                        return false; /* Leave it around for a bit longer. */
+        }
+
+        /* Is this a user that shall stay around forever ("linger")? Before we say "no" to GC'ing for lingering users, let's check
+         * if any of the three units that we maintain for this user is still around. If none of them is,
+         * there's no need to keep this user around even if lingering is enabled. */
+        if (user_check_linger_file(u) > 0 && user_unit_active(u))
+                return false;
+
+        /* Check if our job is still pending */
+        if (u->service_job) {
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+                r = manager_job_is_active(u->manager, u->service_job, &error);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to determine whether job '%s' is pending, ignoring: %s", u->service_job, bus_error_message(&error, r));
+                /* Both "pending" (> 0) and "query failed" (< 0) keep the user alive */
+                if (r != 0)
+                        return false;
+        }
+
+        /* Note that we don't care if the three units we manage for each user object are up or not, as we are managing
+         * their state rather than tracking it.
*/ + + return true; +} + +void user_add_to_gc_queue(User *u) { + assert(u); + + if (u->in_gc_queue) + return; + + LIST_PREPEND(gc_queue, u->manager->user_gc_queue, u); + u->in_gc_queue = true; +} + +UserState user_get_state(User *u) { + assert(u); + + if (u->stopping) + return USER_CLOSING; + + if (!u->started || u->service_job) + return USER_OPENING; + + if (u->sessions) { + bool all_closing = true; + + LIST_FOREACH(sessions_by_user, i, u->sessions) { + SessionState state; + + state = session_get_state(i); + if (state == SESSION_ACTIVE) + return USER_ACTIVE; + if (state != SESSION_CLOSING) + all_closing = false; + } + + return all_closing ? USER_CLOSING : USER_ONLINE; + } + + if (user_check_linger_file(u) > 0 && user_unit_active(u)) + return USER_LINGERING; + + return USER_CLOSING; +} + +int user_kill(User *u, int signo) { + assert(u); + + return manager_kill_unit(u->manager, u->slice, KILL_ALL, signo, NULL); +} + +static bool elect_display_filter(Session *s) { + /* Return true if the session is a candidate for the user’s ‘primary session’ or ‘display’. */ + assert(s); + + return IN_SET(s->class, SESSION_USER, SESSION_GREETER) && s->started && !s->stopping; +} + +static int elect_display_compare(Session *s1, Session *s2) { + /* Indexed by SessionType. Lower numbers mean more preferred. */ + static const int type_ranks[_SESSION_TYPE_MAX] = { + [SESSION_UNSPECIFIED] = 0, + [SESSION_TTY] = -2, + [SESSION_X11] = -3, + [SESSION_WAYLAND] = -3, + [SESSION_MIR] = -3, + [SESSION_WEB] = -1, + }; + + /* Calculate the partial order relationship between s1 and s2, + * returning < 0 if s1 is preferred as the user’s ‘primary session’, + * 0 if s1 and s2 are equally preferred or incomparable, or > 0 if s2 + * is preferred. + * + * s1 or s2 may be NULL. 
*/ + if (!s1 && !s2) + return 0; + + if ((s1 == NULL) != (s2 == NULL)) + return (s1 == NULL) - (s2 == NULL); + + if (s1->stopping != s2->stopping) + return s1->stopping - s2->stopping; + + if ((s1->class != SESSION_USER) != (s2->class != SESSION_USER)) + return (s1->class != SESSION_USER) - (s2->class != SESSION_USER); + + if ((s1->type == _SESSION_TYPE_INVALID) != (s2->type == _SESSION_TYPE_INVALID)) + return (s1->type == _SESSION_TYPE_INVALID) - (s2->type == _SESSION_TYPE_INVALID); + + if (s1->type != s2->type) + return type_ranks[s1->type] - type_ranks[s2->type]; + + return 0; +} + +void user_elect_display(User *u) { + assert(u); + + /* This elects a primary session for each user, which we call the "display". We try to keep the assignment + * stable, but we "upgrade" to better choices. */ + log_debug("Electing new display for user %s", u->user_record->user_name); + + LIST_FOREACH(sessions_by_user, s, u->sessions) { + if (!elect_display_filter(s)) { + log_debug("Ignoring session %s", s->id); + continue; + } + + if (elect_display_compare(s, u->display) < 0) { + log_debug("Choosing session %s in preference to %s", s->id, u->display ? 
u->display->id : "-"); + u->display = s; + } + } +} + +static int user_stop_timeout_callback(sd_event_source *es, uint64_t usec, void *userdata) { + User *u = ASSERT_PTR(userdata); + + user_add_to_gc_queue(u); + + return 0; +} + +void user_update_last_session_timer(User *u) { + usec_t user_stop_delay; + int r; + + assert(u); + + if (u->sessions) { + /* There are sessions, turn off the timer */ + u->last_session_timestamp = USEC_INFINITY; + u->timer_event_source = sd_event_source_unref(u->timer_event_source); + return; + } + + if (u->last_session_timestamp != USEC_INFINITY) + return; /* Timer already started */ + + u->last_session_timestamp = now(CLOCK_MONOTONIC); + + assert(!u->timer_event_source); + + user_stop_delay = user_get_stop_delay(u); + if (!timestamp_is_set(user_stop_delay)) + return; + + if (sd_event_get_state(u->manager->event) == SD_EVENT_FINISHED) { + log_debug("Not allocating user stop timeout, since we are already exiting."); + return; + } + + r = sd_event_add_time(u->manager->event, + &u->timer_event_source, + CLOCK_MONOTONIC, + usec_add(u->last_session_timestamp, user_stop_delay), 0, + user_stop_timeout_callback, u); + if (r < 0) + log_warning_errno(r, "Failed to enqueue user stop event source, ignoring: %m"); + + if (DEBUG_LOGGING) + log_debug("Last session of user '%s' logged out, terminating user context in %s.", + u->user_record->user_name, + FORMAT_TIMESPAN(user_stop_delay, USEC_PER_MSEC)); +} + +static const char* const user_state_table[_USER_STATE_MAX] = { + [USER_OFFLINE] = "offline", + [USER_OPENING] = "opening", + [USER_LINGERING] = "lingering", + [USER_ONLINE] = "online", + [USER_ACTIVE] = "active", + [USER_CLOSING] = "closing" +}; + +DEFINE_STRING_TABLE_LOOKUP(user_state, UserState); + +int config_parse_tmpfs_size( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *sz = 
ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* First, try to parse as percentage */ + r = parse_permyriad(rvalue); + if (r > 0) + *sz = physical_memory_scale(r, 10000U); + else { + uint64_t k; + + /* If the passed argument was not a percentage, or out of range, parse as byte size */ + + r = parse_size(rvalue, 1024, &k); + if (r >= 0 && (k <= 0 || (uint64_t) (size_t) k != k)) + r = -ERANGE; + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value '%s', ignoring: %m", rvalue); + return 0; + } + + *sz = PAGE_ALIGN((size_t) k); + } + + return 0; +} + +int config_parse_compat_user_tasks_max( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "Support for option %s= has been removed.", + lvalue); + log_info("Hint: try creating /etc/systemd/system/user-.slice.d/50-limits.conf with:\n" + " [Slice]\n" + " TasksMax=%s", + rvalue); + return 0; +} diff --git a/src/login/logind-user.h b/src/login/logind-user.h new file mode 100644 index 0000000..21b9f8f --- /dev/null +++ b/src/login/logind-user.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct User User; + +#include "conf-parser.h" +#include "list.h" +#include "logind.h" +#include "user-record.h" + +typedef enum UserState { + USER_OFFLINE, /* Not logged in at all */ + USER_OPENING, /* Is logging in */ + USER_LINGERING, /* Lingering has been enabled by the admin for this user */ + USER_ONLINE, /* User logged in */ + USER_ACTIVE, /* User logged in and has a session in the fg */ + USER_CLOSING, /* User logged out, but processes still remain and lingering is not enabled */ + _USER_STATE_MAX, + _USER_STATE_INVALID = -EINVAL, +} UserState; + 
+struct User { + Manager *manager; + + UserRecord *user_record; + + char *state_file; + char *runtime_path; + + char *slice; /* user-UID.slice */ + char *service; /* user@UID.service */ + char *runtime_dir_service; /* user-runtime-dir@UID.service */ + + char *service_job; /* While set, user_get_state() reports USER_OPENING and user_may_gc() asks the manager whether the job is still active */ + + Session *display; /* Primary session of the user, picked by user_elect_display() */ + + dual_timestamp timestamp; /* When this User object was 'started' the first time */ + usec_t last_session_timestamp; /* When the number of sessions of this user went from 1 to 0 the last time */ + + /* Set up when the last session of the user logs out */ + sd_event_source *timer_event_source; + + bool in_gc_queue:1; /* Set while the user is linked into the user_gc_queue of the manager */ + + bool started:1; /* Whether the user is being started, has been started or is being stopped again. */ + bool stopping:1; /* Whether the user is being stopped or has been stopped. 
*/ + + LIST_HEAD(Session, sessions); + LIST_FIELDS(User, gc_queue); +}; + +int user_new(User **out, Manager *m, UserRecord *ur); +User *user_free(User *u); + +DEFINE_TRIVIAL_CLEANUP_FUNC(User *, user_free); + +bool user_may_gc(User *u, bool drop_not_started); +void user_add_to_gc_queue(User *u); +int user_start(User *u); +int user_stop(User *u, bool force); +int user_finalize(User *u); +UserState user_get_state(User *u); +int user_get_idle_hint(User *u, dual_timestamp *t); +int user_save(User *u); +int user_load(User *u); +int user_kill(User *u, int signo); +int user_check_linger_file(User *u); +void user_elect_display(User *u); +void user_update_last_session_timer(User *u); + +const char* user_state_to_string(UserState s) _const_; +UserState user_state_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_compat_user_tasks_max); diff --git a/src/login/logind-wall.c b/src/login/logind-wall.c new file mode 100644 index 0000000..97b74e9 --- /dev/null +++ b/src/login/logind-wall.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "audit-util.h" +#include 
"bus-common-errors.h" +#include "bus-error.h" +#include "bus-util.h" +#include "event-util.h" +#include "format-util.h" +#include "logind.h" +#include "path-util.h" +#include "special.h" +#include "strv.h" +#include "unit-name.h" +#include "user-util.h" +#include "wall.h" +/* Returns how long to wait, counted from n, until the next wall message for a shutdown at elapse. wall_timers lists the announcement points in minutes before the shutdown; with more time left than the last entry, the wait is the time left modulo one hour. Returns 0 if elapse has passed; callers treat 0 as nothing left to schedule. */ +static usec_t when_wall(usec_t n, usec_t elapse) { + static const int wall_timers[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 25, 40, 55, 70, 100, 130, 150, 180, + }; + + /* If the time is already passed, then don't announce */ + if (n >= elapse) + return 0; + + usec_t left = elapse - n; + + for (unsigned i = 1; i < ELEMENTSOF(wall_timers); i++) + if (wall_timers[i] * USEC_PER_MINUTE >= left) + return left - wall_timers[i-1] * USEC_PER_MINUTE; + + return left % USEC_PER_HOUR; +} +/* Filter handed to wall() by warn_wall(): returns false for ttys that shall not get the message, i.e. local ttys when the scheduled action is a sleep operation, and the tty recorded in scheduled_shutdown_tty. */ +bool logind_wall_tty_filter(const char *tty, bool is_local, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(m->scheduled_shutdown_action); + + const char *p = path_startswith(tty, "/dev/"); + if (!p) + return true; + + /* Do not send information about events which do not destroy local sessions to local terminals. We + * can assume that if the system enters sleep or hibernation, this will be visible in an obvious way + * for any local user. And once the system exits sleep or hibernation, the notification would be + * just noise, in particular for auto-suspend. 
*/ + if (is_local && HANDLE_ACTION_IS_SLEEP(m->scheduled_shutdown_action->handle)) + return false; + + return !streq_ptr(p, m->scheduled_shutdown_tty); +} +/* Logs the scheduled shutdown action and, if wall messages are enabled, broadcasts it via wall(). Returns 0 if no shutdown action is scheduled, 1 otherwise (including on OOM). */ +static int warn_wall(Manager *m, usec_t n) { + assert(m); + + if (!m->scheduled_shutdown_action) + return 0; + + bool left = m->scheduled_shutdown_timeout > n; + + _cleanup_free_ char *l = NULL; + if (asprintf(&l, "%s%sThe system will %s %s%s!", + strempty(m->wall_message), + isempty(m->wall_message) ? "" : "\n", + handle_action_verb_to_string(m->scheduled_shutdown_action->handle), + left ? "at " : "now", + left ? 
FORMAT_TIMESTAMP(m->scheduled_shutdown_timeout) : "") < 0) { + + log_oom(); + return 1; /* We're out-of-memory for now, but let's try to print the message later */ + } + + _cleanup_free_ char *username = uid_to_name(m->scheduled_shutdown_uid); + + int level = left ? LOG_INFO : LOG_NOTICE; + + log_struct(level, + LOG_MESSAGE("%s", l), + "ACTION=%s", handle_action_to_string(m->scheduled_shutdown_action->handle), + "MESSAGE_ID=" SD_MESSAGE_SHUTDOWN_SCHEDULED_STR, + username ? "OPERATOR=%s" : NULL, username); + + if (m->enable_wall_messages) + (void) wall(l, username, m->scheduled_shutdown_tty, logind_wall_tty_filter, m); + + return 1; +} +/* Timer callback: emits the wall message, then re-arms the event source for the next announcement computed by when_wall(), if there is one. */ +static int wall_message_timeout_handler( + sd_event_source *s, + uint64_t usec, + void *userdata) { + + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(s == m->wall_message_timeout_source); + + usec_t n = now(CLOCK_REALTIME); + + r = warn_wall(m, n); + if (r == 0) + return 0; + + usec_t next = when_wall(n, m->scheduled_shutdown_timeout); + if (next > 0) { + r = sd_event_source_set_time(s, n + next); + if (r < 0) + return log_error_errno(r, "sd_event_source_set_time() failed. %m"); + + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "sd_event_source_set_enabled() failed. 
%m"); + } + + return 0; +} +/* Sets up the wall message timer for the scheduled shutdown; warns right away when the timeout is 0 or less than 15 minutes remain. */ +int manager_setup_wall_message_timer(Manager *m) { + int r; + + assert(m); + + usec_t n = now(CLOCK_REALTIME); + usec_t elapse = m->scheduled_shutdown_timeout; + + /* wall message handling */ + + if (!m->scheduled_shutdown_action) + return 0; + + if (elapse > 0 && elapse < n) + return 0; + + /* Warn immediately if less than 15 minutes are left */ + if (elapse == 0 || elapse - n < 15 * USEC_PER_MINUTE) { + r = warn_wall(m, n); + if (r == 0) + return 0; + } + + elapse = when_wall(n, elapse); + if (elapse == 0) + return 0; + + r = event_reset_time(m->event, &m->wall_message_timeout_source, + CLOCK_REALTIME, + n + elapse, 0, + wall_message_timeout_handler, m, + 0, "wall-message-timer", true); + + if (r < 0) { + m->wall_message_timeout_source = sd_event_source_unref(m->wall_message_timeout_source); + return log_error_errno(r, "Failed to set up wall message timer: %m"); + } + + return 0; +} diff --git a/src/login/logind.c b/src/login/logind.c new file mode 100644 index 0000000..88e05bb --- /dev/null +++ b/src/login/logind.c @@ -0,0 +1,1206 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-device.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-log-control-api.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "common-signal.h" +#include "constants.h" +#include "daemon-util.h" +#include "device-util.h" +#include "dirent-util.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "logind-dbus.h" +#include "logind-seat-dbus.h" +#include "logind-session-dbus.h" +#include "logind-user-dbus.h" +#include "logind.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "process-util.h" +#include 
"selinux-util.h" +#include "service-util.h" +#include "signal-util.h" +#include "strv.h" +#include "terminal-util.h" +#include 
"udev-util.h" +#include "user-util.h" + +static Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); +/* Hash operations for the manager hashmaps; going by the macro name, each registers the free function of its value type as value destructor. */ +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(device_hash_ops, char, string_hash_func, string_compare_func, Device, device_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(seat_hash_ops, char, string_hash_func, string_compare_func, Seat, seat_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(session_hash_ops, char, string_hash_func, string_compare_func, Session, session_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(user_hash_ops, void, trivial_hash_func, trivial_compare_func, User, user_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(inhibitor_hash_ops, char, string_hash_func, string_compare_func, Inhibitor, inhibitor_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(button_hash_ops, char, string_hash_func, string_compare_func, Button, button_free); +/* Allocates and initializes a Manager: hashmaps, default event loop, signal and memory pressure sources, watchdog, and configuration reset. On success ownership passes to *ret. 
*/ +static int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .console_active_fd = -EBADF, + .reserve_vt_fd = -EBADF, + .enable_wall_messages = true, + .idle_action_not_before_usec = now(CLOCK_MONOTONIC), + }; + + m->devices = hashmap_new(&device_hash_ops); + m->seats = hashmap_new(&seat_hash_ops); + m->sessions = hashmap_new(&session_hash_ops); + m->users = hashmap_new(&user_hash_ops); + m->inhibitors = hashmap_new(&inhibitor_hash_ops); + m->buttons = hashmap_new(&button_hash_ops); + + m->user_units = hashmap_new(&string_hash_ops); + m->session_units = hashmap_new(&string_hash_ops); + + if (!m->devices || !m->seats || !m->sessions || !m->users || !m->inhibitors || !m->buttons || !m->user_units || !m->session_units) + return -ENOMEM; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, 
SIGTERM, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + if (r < 0) + return r; + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + (void) sd_event_set_watchdog(m->event, true); + + manager_reset_config(m); + + *ret = TAKE_PTR(m); + return 0; +} +/* Releases everything the Manager owns and the Manager itself; a NULL manager is accepted. Removes /run/nologin if unlink_nologin is set. */ +static Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + hashmap_free(m->devices); + hashmap_free(m->seats); + hashmap_free(m->sessions); + + /* All records should have been removed by session_free */ + assert(hashmap_isempty(m->sessions_by_leader)); + hashmap_free(m->sessions_by_leader); + + hashmap_free(m->users); + hashmap_free(m->inhibitors); + hashmap_free(m->buttons); + hashmap_free(m->brightness_writers); + + hashmap_free(m->user_units); + hashmap_free(m->session_units); + + sd_event_source_unref(m->idle_action_event_source); + sd_event_source_unref(m->inhibit_timeout_source); + sd_event_source_unref(m->scheduled_shutdown_timeout_source); + sd_event_source_unref(m->nologin_timeout_source); + sd_event_source_unref(m->wall_message_timeout_source); + + sd_event_source_unref(m->console_active_event_source); + sd_event_source_unref(m->lid_switch_ignore_event_source); + + sd_event_source_unref(m->reboot_key_long_press_event_source); + +#if ENABLE_UTMP + sd_event_source_unref(m->utmp_event_source); +#endif + + safe_close(m->console_active_fd); + + sd_device_monitor_unref(m->device_seat_monitor); + sd_device_monitor_unref(m->device_monitor); + sd_device_monitor_unref(m->device_vcsa_monitor); + sd_device_monitor_unref(m->device_button_monitor); + + if (m->unlink_nologin) + (void) unlink_or_warn("/run/nologin"); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + + 
sd_bus_flush_close_unref(m->bus); + sd_event_unref(m->event); + + safe_close(m->reserve_vt_fd); + + strv_free(m->kill_only_users); + 
strv_free(m->kill_exclude_users); + + free(m->scheduled_shutdown_tty); + free(m->wall_message); + free(m->action_job); + + strv_free(m->efi_boot_loader_entries); + free(m->efi_loader_entry_one_shot); + + return mfree(m); +} + +static int manager_enumerate_devices(Manager *m) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(m); + + /* Loads devices from udev and creates seats for them as + * necessary */ + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_tag(e, "master-of-seat"); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + int k; + + k = manager_process_seat_device(m, d); + if (k < 0) + r = k; /* remember the failure, but keep processing the remaining devices */ + } + + return r; +} + +static int manager_enumerate_buttons(Manager *m) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(m); + + /* Loads buttons from udev */ + + if (manager_all_buttons_ignored(m)) + return 0; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "input", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_tag(e, "power-switch"); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + int k; + + k = manager_process_button_device(m, d); + if (k < 0) + r = k; + } + + return r; +} + +static int manager_enumerate_seats(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + + assert(m); + + /* This loads data about seats stored on disk, but does not + * actually create any seats. Removes data of seats that no + * longer exist. 
*/ + + d = opendir("/run/systemd/seats"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /run/systemd/seats: %m"); + } + + FOREACH_DIRENT(de, d, return -errno) { + Seat *s; + int k; + + if (!dirent_is_file(de)) + continue; + + s = hashmap_get(m->seats, de->d_name); + if (!s) { + if (unlinkat(dirfd(d), de->d_name, 0) < 0) + log_warning_errno(errno, "Failed to remove /run/systemd/seats/%s, ignoring: %m", + de->d_name); + continue; + } + + k = seat_load(s); + if (k < 0) + r = k; + } + + return r; +} +/* Adds a User object for every file in /var/lib/systemd/linger; the file names are user names that get unescaped with cunescape(). Failures for single entries are logged and skipped. */ +static int manager_enumerate_linger_users(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + + assert(m); + + d = opendir("/var/lib/systemd/linger"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /var/lib/systemd/linger/: %m"); + } + + FOREACH_DIRENT(de, d, return -errno) { + int k; + _cleanup_free_ char *n = NULL; + + if (!dirent_is_file(de)) + continue; + k = cunescape(de->d_name, 0, &n); + if (k < 0) { + r = log_warning_errno(k, "Failed to unescape username '%s', ignoring: %m", de->d_name); + continue; + } + k = manager_add_user_by_name(m, n, NULL); + if (k < 0) + r = log_warning_errno(k, "Couldn't add lingering user %s, ignoring: %m", de->d_name); + } + + return r; +} +/* Adds the lingering users, then loads the state of every user recorded under /run/systemd/users (file names are UIDs) and queues each one for GC. 
*/ +static int manager_enumerate_users(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + int r, k; + + assert(m); + + /* Add lingering users */ + r = manager_enumerate_linger_users(m); + + /* Read in user data stored on disk */ + d = opendir("/run/systemd/users"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /run/systemd/users: %m"); + } + + FOREACH_DIRENT(de, d, return -errno) { + User *u; + uid_t uid; + + if (!dirent_is_file(de)) + continue; + + k = parse_uid(de->d_name, &uid); + if (k < 0) { + r = log_warning_errno(k, "Failed to parse filename /run/systemd/users/%s as UID.", de->d_name); + continue; + } + + k = 
manager_add_user_by_uid(m, uid, &u); + if (k < 0) { + r = log_warning_errno(k, "Failed to add user by file name %s, ignoring: %m", de->d_name); + continue; + } + + user_add_to_gc_queue(u); + + k = user_load(u); + if (k < 0) + r = k; + } + + return r; +} +/* Parses an fd name of the form session-ID-device-MAJOR-MINOR. On success stores a newly allocated copy of ID in *session_id and the device number in *dev. */ +static int parse_fdname(const char *fdname, char **session_id, dev_t *dev) { + _cleanup_strv_free_ char **parts = NULL; + _cleanup_free_ char *id = NULL; + unsigned major, minor; + int r; + + parts = strv_split(fdname, "-"); + if (!parts) + return -ENOMEM; + if (strv_length(parts) != 5) + return -EINVAL; + + if (!streq(parts[0], "session")) + return -EINVAL; + + id = strdup(parts[1]); + if (!id) + return -ENOMEM; + + if (!streq(parts[2], "device")) + return -EINVAL; + + r = safe_atou(parts[3], &major); + if (r < 0) + return r; + r = safe_atou(parts[4], &minor); + if (r < 0) + return r; + + *dev = makedev(major, minor); + *session_id = TAKE_PTR(id); + + return 0; +} +/* Hands a passed-in fd to the session device it is named after. Returns a negative errno-style value if the name, session or device does not match; the fd is not closed here in that case. */ +static int deliver_fd(Manager *m, const char *fdname, int fd) { + _cleanup_free_ char *id = NULL; + SessionDevice *sd; + struct stat st; + Session *s; + dev_t dev; + int r; + + assert(m); + assert(fd >= 0); + + r = parse_fdname(fdname, &id, &dev); + if (r < 0) + return log_debug_errno(r, "Failed to parse fd name %s: %m", fdname); + + s = hashmap_get(m->sessions, id); + if (!s) + /* If the session doesn't exist anymore, the associated session device attached to this fd + * doesn't either. Let's simply close this fd. 
*/ + return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), "Failed to attach fd for unknown session: %s", id); + + if (fstat(fd, &st) < 0) + /* The device is allowed to go away at a random point, in which case fstat() failing is + * expected. 
*/ + return log_debug_errno(errno, "Failed to stat device fd for session %s: %m", id); + + if (!S_ISCHR(st.st_mode) || st.st_rdev != dev) + return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), "Device fd doesn't point to the expected character device node"); + + sd = hashmap_get(s->devices, &dev); + if (!sd) + /* Weird, we got an fd for a session device which wasn't recorded in the session state + * file... */ + return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "Got fd for missing session device [%u:%u] in session %s", + major(dev), minor(dev), s->id); + + log_debug("Attaching fd to session device [%u:%u] for session %s", + major(dev), minor(dev), s->id); + + session_device_attach_fd(sd, fd, s->was_active); + return 0; +} + +static int manager_attach_fds(Manager *m) { + _cleanup_strv_free_ char **fdnames = NULL; + int n; + + /* Upon restart, PID1 will send us back all fds of session devices that we previously opened. Each + * file descriptor is associated with a given session. The session ids are passed through FDNAMES. */ + + assert(m); + + n = sd_listen_fds_with_names(true, &fdnames); + if (n < 0) + return log_warning_errno(n, "Failed to acquire passed fd list: %m"); + if (n == 0) + return 0; + + for (int i = 0; i < n; i++) { + int fd = SD_LISTEN_FDS_START + i; + + if (deliver_fd(m, fdnames[i], fd) >= 0) + continue; + + /* Hmm, we couldn't deliver the fd to any session device object? If so, let's close the fd + * and remove it from fdstore. 
*/ + close_and_notify_warn(fd, fdnames[i]); + } + + return 0; +} + +static int manager_enumerate_sessions(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0, k; + + assert(m); + + /* Read in session data stored on disk */ + d = opendir("/run/systemd/sessions"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /run/systemd/sessions: %m"); + } + + FOREACH_DIRENT(de, d, return -errno) { + struct Session *s; + + if (!dirent_is_file(de)) + continue; + + k = manager_add_session(m, de->d_name, &s); + if (k < 0) { + r = log_warning_errno(k, "Failed to add session by file name %s, ignoring: %m", de->d_name); + continue; + } + + session_add_to_gc_queue(s); + + k = session_load(s); + if (k < 0) + r = k; + } + + /* We might be restarted and PID1 could have sent us back the session device fds we previously + * saved. */ + (void) manager_attach_fds(m); + + return r; +} +/* Loads the state of every inhibitor recorded under /run/systemd/inhibit. */ +static int manager_enumerate_inhibitors(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + + assert(m); + + d = opendir("/run/systemd/inhibit"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /run/systemd/inhibit: %m"); + } + + FOREACH_DIRENT(de, d, return -errno) { + int k; + Inhibitor *i; + + if (!dirent_is_file(de)) + continue; + + k = manager_add_inhibitor(m, de->d_name, &i); + if (k < 0) { + r = log_warning_errno(k, "Couldn't add inhibitor %s, ignoring: %m", de->d_name); + continue; + } + + k = inhibitor_load(i); + if (k < 0) + r = k; + } + + return r; +} +/* Callback of the seat device monitor: forwards the device to manager_process_seat_device(). 
*/ +static int manager_dispatch_seat_udev(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(device); + + manager_process_seat_device(m, device); + return 0; +} +/* Callback of the generic device monitor; devices are processed the same way as for the seat monitor. */ +static int manager_dispatch_device_udev(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(device); + + manager_process_seat_device(m, device); + 
return 0; +} + +static int manager_dispatch_vcsa_udev(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + const char *name; + + assert(device); + + /* Whenever a VCSA device is removed try to reallocate our + * VTs, to make sure our auto VTs never go away. */ + + if (sd_device_get_sysname(device, &name) >= 0 && + startswith(name, "vcsa") && + device_for_action(device, SD_DEVICE_REMOVE)) + seat_preallocate_vts(m->seat0); + + return 0; +} +/* Callback of the button device monitor: forwards the device to manager_process_button_device(). */ +static int manager_dispatch_button_udev(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(device); + + manager_process_button_device(m, device); + return 0; +} +/* I/O callback for console_active_fd: re-reads the active VT of seat0. */ +static int manager_dispatch_console(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(m->seat0); + assert(m->console_active_fd == fd); + + seat_read_active_vt(m->seat0); + return 0; +} +/* Opens /dev/ttyN for the configured reserve_vt and keeps the fd in reserve_vt_fd; does nothing if reserve_vt is not positive. */ +static int manager_reserve_vt(Manager *m) { + _cleanup_free_ char *p = NULL; + + assert(m); + + if (m->reserve_vt <= 0) + return 0; + + if (asprintf(&p, "/dev/tty%u", m->reserve_vt) < 0) + return log_oom(); + + m->reserve_vt_fd = open(p, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (m->reserve_vt_fd < 0) { + + /* Don't complain on VT-less systems */ + if (errno != ENOENT) + log_warning_errno(errno, "Failed to pin reserved VT: %m"); + return -errno; + } + + return 0; +} +/* Connects to the system bus, registers the manager object and the log control API, requests the signal matches and the subscription set up below, requests the name org.freedesktop.login1 and attaches the bus to the event loop. 
*/ +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = sd_bus_default_system(&m->bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = bus_match_signal_async(m->bus, NULL, bus_systemd_mgr, "JobRemoved", match_job_removed, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for JobRemoved: 
%m"); + + r = bus_match_signal_async(m->bus, NULL, bus_systemd_mgr, "UnitRemoved", match_unit_removed, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for UnitRemoved: %m"); + + r = sd_bus_match_signal_async( + m->bus, + NULL, + "org.freedesktop.systemd1", + NULL, + "org.freedesktop.DBus.Properties", + "PropertiesChanged", + match_properties_changed, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for PropertiesChanged: %m"); + + r = bus_match_signal_async(m->bus, NULL, bus_systemd_mgr, "Reloading", match_reloading, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for Reloading: %m"); + + r = bus_call_method_async(m->bus, NULL, bus_systemd_mgr, "Subscribe", NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to enable subscription: %m"); + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.login1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + return 0; +} +/* Signal handler for SIGRTMIN, installed by manager_connect_console(). */ +static int manager_vt_switch(sd_event_source *src, const struct signalfd_siginfo *si, void *data) { + Manager *m = ASSERT_PTR(data); + Session *active; + + /* + * We got a VT-switch signal and we have to acknowledge it immediately. + * Preferably, we'd just use m->seat0->active->vtfd, but unfortunately, + * old user-space might run multiple sessions on a single VT, *sigh*. + * Therefore, we have to iterate all sessions and find one with a vtfd + * on the requested VT. + * As only VTs with active controllers have VT_PROCESS set, our current + * notion of the active VT might be wrong (for instance if the switch + * happens while we set up VT_PROCESS). Therefore, read the current VT + * first and then use s->active->vtnr as reference. 
Note that this is + * not racy, as no further VT-switch can happen as long as we're in + * synchronous VT_PROCESS mode. + */ + + assert(m->seat0); + + seat_read_active_vt(m->seat0); + + active = m->seat0->active; + if (!active || active->vtnr < 1) { + _cleanup_close_ int fd = -EBADF; + int r; + + /* We are requested to acknowledge the VT-switch signal by the kernel but + * there are no registered sessions for the current VT. Normally this + * shouldn't happen but something wrong might have happened when we tried + * to release the VT. Better be safe than sorry, and try to release the VT + * one more time otherwise the user will be locked with the current VT. */ + + log_warning("Received VT_PROCESS signal without a registered session, restoring VT."); + + /* At this point we only have the kernel mapping for referring to the current VT. */ + fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) { + log_warning_errno(fd, "Failed to open current VT, ignoring: %m"); + return 0; + } + + r = vt_release(fd, /* restore = */ true); + if (r < 0) + log_warning_errno(r, "Failed to release current VT, ignoring: %m"); + + return 0; + } + + if (active->vtfd >= 0) + session_leave_vt(active); + else + LIST_FOREACH(sessions_by_seat, iter, m->seat0->sessions) + if (iter->vtnr == active->vtnr && iter->vtfd >= 0) { + session_leave_vt(iter); + break; + } + + return 0; +} +/* Starts watching /sys/class/tty/tty0/active and installs the VT switch signal handling; returns early with 0 when /dev/tty0 is not accessible. */ +static int manager_connect_console(Manager *m) { + int r; + + assert(m); + assert(m->console_active_fd < 0); + + /* On certain systems (such as S390, Xen, and containers) /dev/tty0 does not exist (as there is no VC), so + * don't fail if we can't open it. 
*/ + + if (access("/dev/tty0", F_OK) < 0) + return 0; + + m->console_active_fd = open("/sys/class/tty/tty0/active", O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (m->console_active_fd < 0) { + + /* On some systems /dev/tty0 may exist even though /sys/class/tty/tty0 does not. These are broken, but + * common. Let's complain but continue anyway. 
*/ + if (errno == ENOENT) { + log_warning_errno(errno, "System has /dev/tty0 but not /sys/class/tty/tty0/active which is broken, ignoring: %m"); + return 0; + } + + return log_error_errno(errno, "Failed to open /sys/class/tty/tty0/active: %m"); + } + + r = sd_event_add_io(m->event, &m->console_active_event_source, m->console_active_fd, 0, manager_dispatch_console, m); + if (r < 0) + return log_error_errno(r, "Failed to watch foreground console: %m"); + + /* + * SIGRTMIN is used as global VT-release signal, SIGRTMIN + 1 is used + * as VT-acquire signal. We ignore any acquire-events (yes, we still + * have to provide a valid signal-number for it!) and acknowledge all + * release events immediately. + */ + + if (SIGRTMIN + 1 > SIGRTMAX) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Not enough real-time signals available: %i-%i", + SIGRTMIN, SIGRTMAX); + + assert_se(ignore_signals(SIGRTMIN + 1) >= 0); + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGRTMIN, -1) >= 0); + + r = sd_event_add_signal(m->event, NULL, SIGRTMIN, manager_vt_switch, m); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to signal: %m"); + + return 0; +} +/* Creates and starts the udev monitors: seat devices, generic input/graphics/drm devices, and optionally buttons and VCSA devices. 
*/ +static int manager_connect_udev(Manager *m) { + int r; + + assert(m); + assert(!m->device_seat_monitor); + assert(!m->device_monitor); + assert(!m->device_vcsa_monitor); + assert(!m->device_button_monitor); + + r = sd_device_monitor_new(&m->device_seat_monitor); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_tag(m->device_seat_monitor, "master-of-seat"); + if (r < 0) + return r; + + r = sd_device_monitor_attach_event(m->device_seat_monitor, m->event); + if (r < 0) + return r; + + r = sd_device_monitor_start(m->device_seat_monitor, manager_dispatch_seat_udev, m); + if (r < 0) + return r; + + (void) sd_device_monitor_set_description(m->device_seat_monitor, "seat"); + + r = sd_device_monitor_new(&m->device_monitor); + if (r < 0) + return r; + + r = 
sd_device_monitor_filter_add_match_subsystem_devtype(m->device_monitor, "input", NULL); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_monitor, "graphics", NULL); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_monitor, "drm", NULL); + if (r < 0) + return r; + + r = sd_device_monitor_attach_event(m->device_monitor, m->event); + if (r < 0) + return r; + + r = sd_device_monitor_start(m->device_monitor, manager_dispatch_device_udev, m); + if (r < 0) + return r; + + (void) sd_device_monitor_set_description(m->device_monitor, "input,graphics,drm"); + + /* Don't watch keys if nobody cares */ + if (!manager_all_buttons_ignored(m)) { + r = sd_device_monitor_new(&m->device_button_monitor); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_tag(m->device_button_monitor, "power-switch"); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_button_monitor, "input", NULL); + if (r < 0) + return r; + + r = sd_device_monitor_attach_event(m->device_button_monitor, m->event); + if (r < 0) + return r; + + r = sd_device_monitor_start(m->device_button_monitor, manager_dispatch_button_udev, m); + if (r < 0) + return r; + + (void) sd_device_monitor_set_description(m->device_button_monitor, "button"); + } + + /* Don't bother watching VCSA devices, if nobody cares */ + if (m->n_autovts > 0 && m->console_active_fd >= 0) { + + r = sd_device_monitor_new(&m->device_vcsa_monitor); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_vcsa_monitor, "vc", NULL); + if (r < 0) + return r; + + r = sd_device_monitor_attach_event(m->device_vcsa_monitor, m->event); + if (r < 0) + return r; + + r = sd_device_monitor_start(m->device_vcsa_monitor, manager_dispatch_vcsa_udev, m); + if (r < 0) + return r; + + (void) sd_device_monitor_set_description(m->device_vcsa_monitor, "vcsa"); + } + + return 0; +} + 
+static void manager_gc(Manager *m, bool drop_not_started) { + Seat *seat; + Session *session; + User *user; + + assert(m); + + while ((seat = LIST_POP(gc_queue, m->seat_gc_queue))) { + seat->in_gc_queue = false; + + if (seat_may_gc(seat, drop_not_started)) { + seat_stop(seat, /* force = */ false); + seat_free(seat); + } + } + + while ((session = LIST_POP(gc_queue, m->session_gc_queue))) { + session->in_gc_queue = false; + + /* First, if we are not closing yet, initiate stopping. */ + if (session_may_gc(session, drop_not_started) && + session_get_state(session) != SESSION_CLOSING) + (void) session_stop(session, /* force = */ false); + + /* Normally, this should make the session referenced again, if it doesn't then let's get rid + * of it immediately. */ + if (session_may_gc(session, drop_not_started)) { + (void) session_finalize(session); + session_free(session); + } + } + + while ((user = LIST_POP(gc_queue, m->user_gc_queue))) { + user->in_gc_queue = false; + + /* First step: queue stop jobs */ + if (user_may_gc(user, drop_not_started)) + (void) user_stop(user, false); + + /* Second step: finalize user */ + if (user_may_gc(user, drop_not_started)) { + (void) user_finalize(user); + user_free(user); + } + } +} + +static int manager_dispatch_idle_action(sd_event_source *s, uint64_t t, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + struct dual_timestamp since; + usec_t n, elapse; + int r; + + if (m->idle_action == HANDLE_IGNORE || + m->idle_action_usec <= 0) + return 0; + + n = now(CLOCK_MONOTONIC); + + r = manager_get_idle_hint(m, &since); + if (r <= 0) { + /* Not idle. Let's check if after a timeout it might be idle then. */ + elapse = n + m->idle_action_usec; + m->was_idle = false; + } else { + + /* Idle! Let's see if it's time to do something, or if + * we shall sleep for longer. 
*/ + + if (n >= since.monotonic + m->idle_action_usec && + (m->idle_action_not_before_usec <= 0 || n >= m->idle_action_not_before_usec + m->idle_action_usec)) { + bool is_edge = false; + + /* We weren't idle previously or some activity happened while we were sleeping, and now we are + * idle. Let's remember that for the next time and make this an edge transition. */ + if (!m->was_idle || since.monotonic >= m->idle_action_not_before_usec) { + is_edge = true; + m->was_idle = true; + } + + if (m->idle_action == HANDLE_LOCK && !is_edge) + /* We are idle and we were before so we are actually not taking any action. */ + log_debug("System idle."); + else + log_info("System idle. Will %s now.", handle_action_verb_to_string(m->idle_action)); + + manager_handle_action(m, 0, m->idle_action, false, is_edge); + m->idle_action_not_before_usec = n; + } + + elapse = MAX(since.monotonic, m->idle_action_not_before_usec) + m->idle_action_usec; + } + + if (!m->idle_action_event_source) { + + r = sd_event_add_time( + m->event, + &m->idle_action_event_source, + CLOCK_MONOTONIC, + elapse, USEC_PER_SEC*30, + manager_dispatch_idle_action, m); + if (r < 0) + return log_error_errno(r, "Failed to add idle event source: %m"); + + r = sd_event_source_set_priority(m->idle_action_event_source, SD_EVENT_PRIORITY_IDLE+10); + if (r < 0) + return log_error_errno(r, "Failed to set idle event source priority: %m"); + } else { + r = sd_event_source_set_time(m->idle_action_event_source, elapse); + if (r < 0) + return log_error_errno(r, "Failed to set idle event timer: %m"); + + r = sd_event_source_set_enabled(m->idle_action_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable idle event timer: %m"); + } + + return 0; +} + +static int manager_dispatch_reload_signal(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = userdata; + int r; + + (void) sd_notifyf(/* unset= */ false, + "RELOADING=1\n" + "STATUS=Reloading 
configuration...\n" + "MONOTONIC_USEC=" USEC_FMT, now(CLOCK_MONOTONIC)); + + manager_reset_config(m); + r = manager_parse_config_file(m); + if (r < 0) + log_warning_errno(r, "Failed to parse config file, using defaults: %m"); + else + log_info("Config file reloaded."); + + (void) sd_notify(/* unset= */ false, NOTIFY_READY); + return 0; +} + +static int manager_startup(Manager *m) { + int r; + Seat *seat; + Session *session; + User *user; + Button *button; + Inhibitor *inhibitor; + + assert(m); + + r = sd_event_add_signal(m->event, NULL, SIGHUP, manager_dispatch_reload_signal, m); + if (r < 0) + return log_error_errno(r, "Failed to register SIGHUP handler: %m"); + + /* Connect to utmp */ + manager_connect_utmp(m); + + /* Connect to console */ + r = manager_connect_console(m); + if (r < 0) + return r; + + /* Connect to udev */ + r = manager_connect_udev(m); + if (r < 0) + return log_error_errno(r, "Failed to create udev watchers: %m"); + + /* Connect to the bus */ + r = manager_connect_bus(m); + if (r < 0) + return r; + + /* Instantiate magic seat 0 */ + r = manager_add_seat(m, "seat0", &m->seat0); + if (r < 0) + return log_error_errno(r, "Failed to add seat0: %m"); + + r = manager_set_lid_switch_ignore(m, 0 + m->holdoff_timeout_usec); + if (r < 0) + log_warning_errno(r, "Failed to set up lid switch ignore event source: %m"); + + /* Deserialize state */ + r = manager_enumerate_devices(m); + if (r < 0) + log_warning_errno(r, "Device enumeration failed: %m"); + + r = manager_enumerate_seats(m); + if (r < 0) + log_warning_errno(r, "Seat enumeration failed: %m"); + + r = manager_enumerate_users(m); + if (r < 0) + log_warning_errno(r, "User enumeration failed: %m"); + + r = manager_enumerate_sessions(m); + if (r < 0) + log_warning_errno(r, "Session enumeration failed: %m"); + + r = manager_enumerate_inhibitors(m); + if (r < 0) + log_warning_errno(r, "Inhibitor enumeration failed: %m"); + + r = manager_enumerate_buttons(m); + if (r < 0) + log_warning_errno(r, "Button 
enumeration failed: %m"); + + manager_load_scheduled_shutdown(m); + + /* Remove stale objects before we start them */ + manager_gc(m, false); + + /* Reserve the special reserved VT */ + manager_reserve_vt(m); + + /* Read in utmp if it exists */ + manager_read_utmp(m); + + /* And start everything */ + HASHMAP_FOREACH(seat, m->seats) + (void) seat_start(seat); + + HASHMAP_FOREACH(user, m->users) + (void) user_start(user); + + HASHMAP_FOREACH(session, m->sessions) + (void) session_start(session, NULL, NULL); + + HASHMAP_FOREACH(inhibitor, m->inhibitors) { + (void) inhibitor_start(inhibitor); + + /* Let's see if the inhibitor is dead now, then remove it */ + if (inhibitor_is_orphan(inhibitor)) { + inhibitor_stop(inhibitor); + inhibitor_free(inhibitor); + } + } + + HASHMAP_FOREACH(button, m->buttons) + button_check_switches(button); + + manager_dispatch_idle_action(NULL, 0, m); + + return 0; +} + +static int manager_run(Manager *m) { + int r; + + assert(m); + + for (;;) { + r = sd_event_get_state(m->event); + if (r < 0) + return r; + if (r == SD_EVENT_FINISHED) + return 0; + + manager_gc(m, true); + + r = manager_dispatch_delayed(m, false); + if (r < 0) + return r; + if (r > 0) + continue; + + r = sd_event_run(m->event, UINT64_MAX); + if (r < 0) + return r; + } +} + +static int run(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *m = NULL; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = NULL; + int r; + + log_set_facility(LOG_AUTH); + log_setup(); + + r = service_parse_argv("systemd-logind.service", + "Manager for user logins and devices and privileged operations.", + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object), + argc, argv); + if (r <= 0) + return r; + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + + /* Always create the directories people can create inotify watches in. 
Note that some applications + * might check for the existence of /run/systemd/seats/ to determine whether logind is available, so + * please always make sure these directories are created early on and unconditionally. */ + (void) mkdir_label("/run/systemd/seats", 0755); + (void) mkdir_label("/run/systemd/users", 0755); + (void) mkdir_label("/run/systemd/sessions", 0755); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGHUP, SIGTERM, SIGINT, SIGCHLD, SIGRTMIN+18, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to allocate manager object: %m"); + + (void) manager_parse_config_file(m); + + r = manager_startup(m); + if (r < 0) + return log_error_errno(r, "Failed to fully start up daemon: %m"); + + notify_message = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + return manager_run(m); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/login/logind.conf.in b/src/login/logind.conf.in new file mode 100644 index 0000000..e5fe924 --- /dev/null +++ b/src/login/logind.conf.in @@ -0,0 +1,51 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/logind.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/logind.conf' to display the full config. +# +# See logind.conf(5) for details. 
+ +[Login] +#NAutoVTs=6 +#ReserveVT=6 +#KillUserProcesses={{ "yes" if KILL_USER_PROCESSES else "no" }} +#KillOnlyUsers= +#KillExcludeUsers=root +#InhibitDelayMaxSec=5 +#UserStopDelaySec=10 +#HandlePowerKey=poweroff +#HandlePowerKeyLongPress=ignore +#HandleRebootKey=reboot +#HandleRebootKeyLongPress=poweroff +#HandleSuspendKey=suspend +#HandleSuspendKeyLongPress=hibernate +#HandleHibernateKey=hibernate +#HandleHibernateKeyLongPress=ignore +#HandleLidSwitch=suspend +#HandleLidSwitchExternalPower=suspend +#HandleLidSwitchDocked=ignore +#PowerKeyIgnoreInhibited=no +#SuspendKeyIgnoreInhibited=no +#HibernateKeyIgnoreInhibited=no +#LidSwitchIgnoreInhibited=yes +#RebootKeyIgnoreInhibited=no +#HoldoffTimeoutSec=30s +#IdleAction=ignore +#IdleActionSec=30min +#RuntimeDirectorySize=10% +#RuntimeDirectoryInodesMax= +#RemoveIPC=yes +#InhibitorsMax=8192 +#SessionsMax=8192 +#StopIdleSessionSec=infinity diff --git a/src/login/logind.h b/src/login/logind.h new file mode 100644 index 0000000..7532d37 --- /dev/null +++ b/src/login/logind.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-event.h" + +#include "conf-parser.h" +#include "hashmap.h" +#include "list.h" +#include "set.h" +#include "time-util.h" +#include "user-record.h" + +typedef struct Manager Manager; + +#include "logind-action.h" +#include "logind-button.h" +#include "logind-device.h" +#include "logind-inhibit.h" + +struct Manager { + sd_event *event; + sd_bus *bus; + + Hashmap *devices; + Hashmap *seats; + Hashmap *sessions; + Hashmap *sessions_by_leader; + Hashmap *users; /* indexed by UID */ + Hashmap *inhibitors; + Hashmap *buttons; + Hashmap *brightness_writers; + + LIST_HEAD(Seat, seat_gc_queue); + LIST_HEAD(Session, session_gc_queue); + LIST_HEAD(User, user_gc_queue); + + sd_device_monitor *device_seat_monitor, *device_monitor, *device_vcsa_monitor, *device_button_monitor; + + sd_event_source 
*console_active_event_source; + +#if ENABLE_UTMP + sd_event_source *utmp_event_source; +#endif + + int console_active_fd; + + unsigned n_autovts; + + unsigned reserve_vt; + int reserve_vt_fd; + + Seat *seat0; + + char **kill_only_users, **kill_exclude_users; + bool kill_user_processes; + + uint64_t session_counter; + uint64_t inhibit_counter; + + Hashmap *session_units; + Hashmap *user_units; + + usec_t inhibit_delay_max; + usec_t user_stop_delay; + + /* If a shutdown/suspend was delayed due to an inhibitor this contains the action we are supposed to + * start after the delay is over */ + const HandleActionData *delayed_action; + + /* If a shutdown/suspend is currently executed, then this is the job of it */ + char *action_job; + sd_event_source *inhibit_timeout_source; + + const HandleActionData *scheduled_shutdown_action; + usec_t scheduled_shutdown_timeout; + sd_event_source *scheduled_shutdown_timeout_source; + uid_t scheduled_shutdown_uid; + char *scheduled_shutdown_tty; + sd_event_source *nologin_timeout_source; + bool unlink_nologin; + + char *wall_message; + bool enable_wall_messages; + sd_event_source *wall_message_timeout_source; + + bool shutdown_dry_run; + + sd_event_source *idle_action_event_source; + usec_t idle_action_usec; + usec_t idle_action_not_before_usec; + HandleAction idle_action; + bool was_idle; + + usec_t stop_idle_session_usec; + + HandleAction handle_power_key; + HandleAction handle_power_key_long_press; + HandleAction handle_reboot_key; + HandleAction handle_reboot_key_long_press; + HandleAction handle_suspend_key; + HandleAction handle_suspend_key_long_press; + HandleAction handle_hibernate_key; + HandleAction handle_hibernate_key_long_press; + + HandleAction handle_lid_switch; + HandleAction handle_lid_switch_ep; + HandleAction handle_lid_switch_docked; + + bool power_key_ignore_inhibited; + bool suspend_key_ignore_inhibited; + bool hibernate_key_ignore_inhibited; + bool lid_switch_ignore_inhibited; + bool reboot_key_ignore_inhibited; 
+ + bool remove_ipc; + + Hashmap *polkit_registry; + + usec_t holdoff_timeout_usec; + sd_event_source *lid_switch_ignore_event_source; + + sd_event_source *power_key_long_press_event_source; + sd_event_source *reboot_key_long_press_event_source; + sd_event_source *suspend_key_long_press_event_source; + sd_event_source *hibernate_key_long_press_event_source; + + uint64_t runtime_dir_size; + uint64_t runtime_dir_inodes; + uint64_t sessions_max; + uint64_t inhibitors_max; + + char **efi_boot_loader_entries; + bool efi_boot_loader_entries_set; + + char *efi_loader_entry_one_shot; + struct stat efi_loader_entry_one_shot_stat; +}; + +void manager_reset_config(Manager *m); +int manager_parse_config_file(Manager *m); + +int manager_add_device(Manager *m, const char *sysfs, bool master, Device **ret_device); +int manager_add_button(Manager *m, const char *name, Button **ret_button); +int manager_add_seat(Manager *m, const char *id, Seat **ret_seat); +int manager_add_session(Manager *m, const char *id, Session **ret_session); +int manager_add_user(Manager *m, UserRecord *ur, User **ret_user); +int manager_add_user_by_name(Manager *m, const char *name, User **ret_user); +int manager_add_user_by_uid(Manager *m, uid_t uid, User **ret_user); +int manager_add_inhibitor(Manager *m, const char* id, Inhibitor **ret_inhibitor); + +int manager_process_seat_device(Manager *m, sd_device *d); +int manager_process_button_device(Manager *m, sd_device *d); + +int manager_spawn_autovt(Manager *m, unsigned vtnr); + +bool manager_shall_kill(Manager *m, const char *user); + +int manager_get_idle_hint(Manager *m, dual_timestamp *t); + +int manager_get_user_by_pid(Manager *m, pid_t pid, User **user); +int manager_get_session_by_pidref(Manager *m, const PidRef *pid, Session **ret); + +bool manager_is_lid_closed(Manager *m); +bool manager_is_docked_or_external_displays(Manager *m); +bool manager_is_on_external_power(void); +bool manager_all_buttons_ignored(Manager *m); + +int 
manager_read_utmp(Manager *m); +void manager_connect_utmp(Manager *m); +void manager_reconnect_utmp(Manager *m); + +/* gperf lookup function */ +const struct ConfigPerfItem* logind_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +int manager_set_lid_switch_ignore(Manager *m, usec_t until); + +CONFIG_PARSER_PROTOTYPE(config_parse_n_autovts); +CONFIG_PARSER_PROTOTYPE(config_parse_tmpfs_size); + +int manager_setup_wall_message_timer(Manager *m); +bool logind_wall_tty_filter(const char *tty, bool is_local, void *userdata); + +int manager_read_efi_boot_loader_entries(Manager *m); diff --git a/src/login/meson.build b/src/login/meson.build new file mode 100644 index 0000000..b5bb150 --- /dev/null +++ b/src/login/meson.build @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_logind_sources = files( + 'logind.c', +) + +logind_gperf_c = custom_target( + 'logind_gperf.c', + input : 'logind-gperf.gperf', + output : 'logind-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +liblogind_core_sources = files( + 'logind-action.c', + 'logind-brightness.c', + 'logind-button.c', + 'logind-core.c', + 'logind-dbus.c', + 'logind-device.c', + 'logind-inhibit.c', + 'logind-polkit.c', + 'logind-seat-dbus.c', + 'logind-seat.c', + 'logind-session-dbus.c', + 'logind-session-device.c', + 'logind-session.c', + 'logind-user-dbus.c', + 'logind-user.c', + 'logind-wall.c', +) + +liblogind_core_sources += [logind_gperf_c] + +liblogind_core = static_library( + 'logind-core', + liblogind_core_sources, + include_directories : includes, + dependencies : [libacl, + userspace], + build_by_default : false) + +loginctl_sources = files( + 'loginctl.c', + 'sysfs-show.c', +) + +executables += [ + libexec_template + { + 'name' : 'systemd-logind', + 'dbus' : true, + 'conditions' : ['ENABLE_LOGIND'], + 'sources' : systemd_logind_sources, + 'link_with' : [ + liblogind_core, + libshared, + ], + 'dependencies' : [ + libacl, + threads, + ], + }, + 
executable_template + { + 'name' : 'loginctl', + 'public' : true, + 'conditions' : ['ENABLE_LOGIND'], + 'sources' : loginctl_sources, + 'dependencies' : [ + liblz4, + libxz, + libzstd, + threads, + ], + }, + executable_template + { + 'name' : 'systemd-inhibit', + 'public' : true, + 'conditions' : ['ENABLE_LOGIND'], + 'sources' : files('inhibit.c'), + }, + libexec_template + { + 'name' : 'systemd-user-runtime-dir', + 'conditions' : ['ENABLE_LOGIND'], + 'sources' : files('user-runtime-dir.c'), + }, + test_template + { + 'sources' : files('test-inhibit.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-login-tables.c'), + 'link_with' : [ + liblogind_core, + libshared, + ], + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('test-session-properties.c'), + 'type' : 'manual', + }, +] + +simple_tests += files( + 'test-login-shared.c' +) + +modules += [ + pam_template + { + 'name' : 'pam_systemd', + 'conditions' : [ + 'ENABLE_LOGIND', + 'HAVE_PAM', + ], + 'sources' : files('pam_systemd.c'), + 'version-script' : meson.current_source_dir() / 'pam_systemd.sym', + }, + pam_template + { + 'name' : 'pam_systemd_loadkey', + 'conditions' : [ + 'HAVE_PAM', + ], + 'sources' : files('pam_systemd_loadkey.c'), + 'version-script' : meson.current_source_dir() / 'pam_systemd_loadkey.sym', + }, +] + +enable_logind = conf.get('ENABLE_LOGIND') == 1 + +custom_target( + 'logind.conf', + input : 'logind.conf.in', + output : 'logind.conf', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : enable_logind and install_sysconfdir_samples and pkgsysconfdir != 'no', + install_dir : pkgconfigfiledir) + +custom_target( + 'systemd-user', + input : 'systemd-user.in', + output : 'systemd-user', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : enable_logind and pamconfdir != 'no', + install_dir : pamconfdir) + +if enable_logind + install_data('org.freedesktop.login1.conf', + install_dir : dbuspolicydir) + 
install_data('org.freedesktop.login1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.login1.policy', + install_dir : polkitpolicydir) +endif diff --git a/src/login/org.freedesktop.login1.conf b/src/login/org.freedesktop.login1.conf new file mode 100644 index 0000000..8ba094b --- /dev/null +++ b/src/login/org.freedesktop.login1.conf @@ -0,0 +1,360 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/login/org.freedesktop.login1.policy b/src/login/org.freedesktop.login1.policy new file mode 100644 index 0000000..012ee14 --- /dev/null +++ b/src/login/org.freedesktop.login1.policy @@ -0,0 +1,415 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Allow applications to inhibit system shutdown + Authentication is required for an application to inhibit system shutdown. + + no + yes + yes + + org.freedesktop.login1.inhibit-delay-shutdown org.freedesktop.login1.inhibit-block-sleep org.freedesktop.login1.inhibit-delay-sleep org.freedesktop.login1.inhibit-block-idle + + + + Allow applications to delay system shutdown + Authentication is required for an application to delay system shutdown. + + yes + yes + yes + + org.freedesktop.login1.inhibit-delay-sleep + + + + Allow applications to inhibit system sleep + Authentication is required for an application to inhibit system sleep. + + no + yes + yes + + org.freedesktop.login1.inhibit-delay-sleep org.freedesktop.login1.inhibit-block-idle + + + + Allow applications to delay system sleep + Authentication is required for an application to delay system sleep. 
+ + yes + yes + yes + + + + + Allow applications to inhibit automatic system suspend + Authentication is required for an application to inhibit automatic system suspend. + + yes + yes + yes + + + + + Allow applications to inhibit system handling of the power key + Authentication is required for an application to inhibit system handling of the power key. + + no + yes + yes + + org.freedesktop.login1.inhibit-handle-suspend-key org.freedesktop.login1.inhibit-handle-hibernate-key org.freedesktop.login1.inhibit-handle-lid-switch + + + + Allow applications to inhibit system handling of the suspend key + Authentication is required for an application to inhibit system handling of the suspend key. + + no + yes + yes + + org.freedesktop.login1.inhibit-handle-hibernate-key org.freedesktop.login1.inhibit-handle-lid-switch + + + + Allow applications to inhibit system handling of the hibernate key + Authentication is required for an application to inhibit system handling of the hibernate key. + + no + yes + yes + + + + + Allow applications to inhibit system handling of the lid switch + Authentication is required for an application to inhibit system handling of the lid switch. + + no + yes + yes + + + + + Allow applications to inhibit system handling of the reboot key + Authentication is required for an application to inhibit system handling of the reboot key. + + no + yes + yes + + org.freedesktop.login1.inhibit-handle-suspend-key org.freedesktop.login1.inhibit-handle-hibernate-key org.freedesktop.login1.inhibit-handle-lid-switch + + + + Allow non-logged-in user to run programs + Explicit request is required to run programs as a non-logged-in user. + + yes + yes + yes + + + + + Allow non-logged-in users to run programs + Authentication is required to run programs as a non-logged-in user. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Allow attaching devices to seats + Authentication is required to attach a device to a seat. 
+ + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.flush-devices + + + + Flush device to seat attachments + Authentication is required to reset how devices are attached to seats. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Power off the system + Authentication is required to power off the system. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.set-wall-message + + + + Power off the system while other users are logged in + Authentication is required to power off the system while other users are logged in. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.power-off + + + + Power off the system while an application is inhibiting this + Authentication is required to power off the system while an application is inhibiting this. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.power-off + + + + Reboot the system + Authentication is required to reboot the system. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.set-wall-message + + + + Reboot the system while other users are logged in + Authentication is required to reboot the system while other users are logged in. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.reboot + + + + Reboot the system while an application is inhibiting this + Authentication is required to reboot the system while an application is inhibiting this. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.reboot + + + + Halt the system + Authentication is required to halt the system. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.set-wall-message + + + + Halt the system while other users are logged in + Authentication is required to halt the system while other users are logged in. 
+ + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.halt + + + + Halt the system while an application is inhibiting this + Authentication is required to halt the system while an application is inhibiting this. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.halt + + + + Suspend the system + Authentication is required to suspend the system. + + auth_admin_keep + auth_admin_keep + yes + + + + + Suspend the system while other users are logged in + Authentication is required to suspend the system while other users are logged in. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.suspend + + + + Suspend the system while an application is inhibiting this + Authentication is required to suspend the system while an application is inhibiting this. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.suspend + + + + Hibernate the system + Authentication is required to hibernate the system. + + auth_admin_keep + auth_admin_keep + yes + + + + + Hibernate the system while other users are logged in + Authentication is required to hibernate the system while other users are logged in. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.hibernate + + + + Hibernate the system while an application is inhibiting this + Authentication is required to hibernate the system while an application is inhibiting this. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.login1.hibernate + + + + Manage active sessions, users and seats + Authentication is required to manage active sessions, users and seats. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Lock or unlock active sessions + Authentication is required to lock or unlock active sessions. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Set the reboot "reason" in the kernel + Authentication is required to set the reboot "reason" in the kernel. 
+ + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.reboot + + + + Indicate to the firmware to boot to setup interface + Authentication is required to indicate to the firmware to boot to setup interface. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.reboot + + + + Indicate to the boot loader to boot to the boot loader menu + Authentication is required to indicate to the boot loader to boot to the boot loader menu. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.reboot + + + + Indicate to the boot loader to boot a specific entry + Authentication is required to indicate to the boot loader to boot into a specific boot loader entry. + + auth_admin_keep + auth_admin_keep + yes + + org.freedesktop.login1.reboot + + + + Set a wall message + Authentication is required to set a wall message + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Change Session + Authentication is required to change the virtual terminal. + + auth_admin_keep + yes + yes + + + + diff --git a/src/login/org.freedesktop.login1.service b/src/login/org.freedesktop.login1.service new file mode 100644 index 0000000..6d443cf --- /dev/null +++ b/src/login/org.freedesktop.login1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[D-BUS Service] +Name=org.freedesktop.login1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.login1.service diff --git a/src/login/pam_systemd.c b/src/login/pam_systemd.c new file mode 100644 index 0000000..bf45974 --- /dev/null +++ b/src/login/pam_systemd.c @@ -0,0 +1,1266 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "audit-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-internal.h" +#include "bus-locator.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cgroup-setup.h" +#include "devnum-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "locale-util.h" +#include "login-util.h" +#include "macro.h" +#include "missing_syscall.h" +#include "pam-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "user-util.h" +#include "userdb.h" + +#define LOGIN_SLOW_BUS_CALL_TIMEOUT_USEC (2*USEC_PER_MINUTE) + +/* Parses the comma-separated capability names in value and applies them to the bit mask *caps. A leading ~ means the listed capabilities are cleared instead of set. If *caps is still UINT64_MAX, the starting mask is all_capabilities() when clearing and 0 when setting. Unknown names are logged and skipped; with caps == NULL the input is parsed but nothing is stored. Returns 0, or the negative error of extract_first_word(). */ static int parse_caps( + pam_handle_t *handle, + const char *value, + uint64_t *caps) { + + bool subtract; + int r; + + assert(handle); + assert(value); + + if (value[0] == '~') { + subtract = true; + value++; + } else + subtract = false; + + for (;;) { + _cleanup_free_ char *s = NULL; + uint64_t b, m; + int c; + + /* We can't use spaces as separators here, as PAM's simplistic argument parser doesn't allow + * spaces inside of arguments. We use commas instead (which is similar to cap_from_text(), + * which also uses commas).
*/ + r = extract_first_word(&value, &s, ",", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + c = capability_from_name(s); + if (c < 0) { + pam_syslog(handle, LOG_WARNING, "Unknown capability, ignoring: %s", s); + continue; + } + + m = UINT64_C(1) << c; + + if (!caps) + continue; + + if (*caps == UINT64_MAX) + b = subtract ? all_capabilities() : 0; + else + b = *caps; + + if (subtract) + *caps = b & ~m; + else + *caps = b | m; + } + + return 0; +} + +/* Scans the module arguments. Every output pointer may be NULL, in which case that setting is parsed but not stored. *class, *type and *desktop end up pointing into argv[]. Unparsable values and unknown arguments are only logged; the function always returns 0. */ static int parse_argv( + pam_handle_t *handle, + int argc, const char **argv, + const char **class, + const char **type, + const char **desktop, + bool *debug, + uint64_t *default_capability_bounding_set, + uint64_t *default_capability_ambient_set) { + + int r; + + assert(handle); + assert(argc >= 0); + assert(argc == 0 || argv); + + for (int i = 0; i < argc; i++) { + const char *p; + + if ((p = startswith(argv[i], "class="))) { + if (class) + *class = p; + + } else if ((p = startswith(argv[i], "type="))) { + if (type) + *type = p; + + } else if ((p = startswith(argv[i], "desktop="))) { + if (desktop) + *desktop = p; + + } else if (streq(argv[i], "debug")) { + if (debug) + *debug = true; + + } else if ((p = startswith(argv[i], "debug="))) { + r = parse_boolean(p); + if (r < 0) + pam_syslog(handle, LOG_WARNING, "Failed to parse debug= argument, ignoring: %s", p); + else if (debug) + *debug = r; + + } else if ((p = startswith(argv[i], "default-capability-bounding-set="))) { + r = parse_caps(handle, p, default_capability_bounding_set); + if (r < 0) + pam_syslog(handle, LOG_WARNING, "Failed to parse default-capability-bounding-set= argument, ignoring: %s", p); + + } else if ((p = startswith(argv[i], "default-capability-ambient-set="))) { + r = parse_caps(handle, p, default_capability_ambient_set); + if (r < 0) + pam_syslog(handle, LOG_WARNING, "Failed to parse default-capability-ambient-set= argument, ignoring: %s", p); + + } else + pam_syslog(handle, LOG_WARNING, "Unknown parameter
'%s', ignoring.", argv[i]); + } + + return 0; +} + +/* Obtains the user record of the PAM user. If JSON for it is already stored as PAM data under the key systemd-user-record-<username> that copy is loaded, otherwise the record comes from userdb_by_name() and its JSON is stored under that key. A record without a valid UID is refused. On success the reference is passed out via ret_record (if non-NULL) and PAM_SUCCESS is returned. */ static int acquire_user_record( + pam_handle_t *handle, + UserRecord **ret_record) { + + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + const char *username = NULL, *json = NULL; + _cleanup_free_ char *field = NULL; + int r; + + assert(handle); + + r = pam_get_user(handle, &username, NULL); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get user name: @PAMERR@"); + + if (isempty(username)) + return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR, "User name not valid."); + + /* If pam_systemd_homed (or some other module) already acquired the user record we can reuse it + * here. */ + field = strjoin("systemd-user-record-", username); + if (!field) + return pam_log_oom(handle); + + r = pam_get_data(handle, field, (const void**) &json); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM user record data: @PAMERR@"); + if (r == PAM_SUCCESS && json) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + /* Parse cached record */ + r = json_parse(json, JSON_PARSE_SENSITIVE, &v, NULL, NULL); + if (r < 0) + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to parse JSON user record: %m"); + + ur = user_record_new(); + if (!ur) + return pam_log_oom(handle); + + r = user_record_load(ur, v, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE); + if (r < 0) + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to load user record: %m"); + + /* Safety check if cached record actually matches what we are looking for */ + if (!streq_ptr(username, ur->user_name)) + return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR, + "Acquired user record does not match user name."); + } else { + _cleanup_free_ char *formatted = NULL; + + /* Request the record ourselves */ + r = userdb_by_name(username, 0, &ur); + if (r < 0) { + pam_syslog_errno(handle, LOG_ERR, r, "Failed to get user record: %m"); +
return PAM_USER_UNKNOWN; + } + + r = json_variant_format(ur->json, 0, &formatted); + if (r < 0) + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to format user JSON: %m"); + + /* And cache it for everyone else */ + r = pam_set_data(handle, field, formatted, pam_cleanup_free); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, + "Failed to set PAM user record data '%s': @PAMERR@", field); + TAKE_PTR(formatted); + } + + if (!uid_is_valid(ur->uid)) + return pam_syslog_pam_error(handle, LOG_ERR, PAM_SERVICE_ERR, + "Acquired user record does not have a UID."); + + if (ret_record) + *ret_record = TAKE_PTR(ur); + + return PAM_SUCCESS; +} + +/* True if the display string is a colon directly followed by a digit. */ static bool display_is_local(const char *display) { + assert(display); + + return + display[0] == ':' && + ascii_isdigit(display[1]); +} + +/* Connects to the X11 socket of a local display. The digits after the colon are appended to @/tmp/.X11-unix/X; if connecting to that path fails with ECONNREFUSED the same path without the leading @ is tried on the same socket. Returns the connected fd, or a negative errno-style value. */ static int socket_from_display(const char *display) { + _cleanup_free_ char *f = NULL; + size_t k; + char *c; + union sockaddr_union sa; + socklen_t sa_len; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(display); + + if (!display_is_local(display)) + return -EINVAL; + + k = strspn(display+1, "0123456789"); + + /* Try abstract socket first. */ + f = new(char, STRLEN("@/tmp/.X11-unix/X") + k + 1); + if (!f) + return -ENOMEM; + + c = stpcpy(f, "@/tmp/.X11-unix/X"); + memcpy(c, display+1, k); + c[k] = 0; + + r = sockaddr_un_set_path(&sa.un, f); + if (r < 0) + return r; + sa_len = r; + + fd = RET_NERRNO(socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0)); + if (fd < 0) + return fd; + + r = RET_NERRNO(connect(fd, &sa.sa, sa_len)); + if (r >= 0) + return TAKE_FD(fd); + if (r != -ECONNREFUSED) + return r; + + /* Try also non-abstract socket.
*/ + r = sockaddr_un_set_path(&sa.un, f + 1); + if (r < 0) + return r; + sa_len = r; + + r = RET_NERRNO(connect(fd, &sa.sa, sa_len)); + if (r >= 0) + return TAKE_FD(fd); + return r; +} + +/* On success returns 0, stores the VT number in *vtnr and, if seat is non-NULL, points *seat at the literal seat0. Returns -ENOENT when vtnr_from_tty() reports 0, and a negative error when any lookup step fails. */ static int get_seat_from_display(const char *display, const char **seat, uint32_t *vtnr) { + _cleanup_free_ char *sys_path = NULL, *tty = NULL; + _cleanup_close_ int fd = -EBADF; + struct ucred ucred; + int v, r; + dev_t display_ctty; + + assert(display); + assert(vtnr); + + /* We deduce the X11 socket from the display name, then use + * SO_PEERCRED to determine the X11 server process, ask for + * the controlling tty of that and if it's a VC then we know + * the seat and the virtual terminal. Sounds ugly, is only + * semi-ugly. */ + + fd = socket_from_display(display); + if (fd < 0) + return fd; + + r = getpeercred(fd, &ucred); + if (r < 0) + return r; + + r = get_ctty_devnr(ucred.pid, &display_ctty); + if (r < 0) + return r; + + if (asprintf(&sys_path, "/sys/dev/char/" DEVNUM_FORMAT_STR, DEVNUM_FORMAT_VAL(display_ctty)) < 0) + return -ENOMEM; + r = readlink_value(sys_path, &tty); + if (r < 0) + return r; + + v = vtnr_from_tty(tty); + if (v < 0) + return v; + else if (v == 0) + return -ENOENT; + + if (seat) + *seat = "seat0"; + *vtnr = (uint32_t) v; + + return 0; +} + +/* Sets DBUS_SESSION_BUS_ADDRESS in the PAM environment when access() on <runtime>/bus succeeds. If it does not, nothing is set and PAM_SUCCESS is still returned. */ static int export_legacy_dbus_address( + pam_handle_t *handle, + const char *runtime) { + + const char *s; + _cleanup_free_ char *t = NULL; + int r = PAM_BUF_ERR; + + /* We need to export $DBUS_SESSION_BUS_ADDRESS because various applications will not connect + * correctly to the bus without it. This setting matches what dbus.socket does for the user + * session using 'systemctl --user set-environment'. We want to have the same configuration + * in processes started from the PAM session.
+ * + * The setting of the address is guarded by the access() check because it is also possible to compile + * dbus without --enable-user-session, in which case this socket is not used, and + * $DBUS_SESSION_BUS_ADDRESS should not be set. An alternative approach would be to not do the access() + * check here, and let applications try on their own, by using "unix:path=%s/bus;autolaunch:". But we + * expect the socket to be present by the time we do this check, so we can just as well check once + * here. */ + + s = strjoina(runtime, "/bus"); + if (access(s, F_OK) < 0) + return PAM_SUCCESS; + + if (asprintf(&t, DEFAULT_USER_BUS_ADDRESS_FMT, runtime) < 0) + return pam_log_oom(handle); + + r = pam_misc_setenv(handle, "DBUS_SESSION_BUS_ADDRESS", t, 0); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set bus variable: @PAMERR@"); + + return PAM_SUCCESS; +} + +/* Appends the memory limit as an (sv) entry to m. Tried in this order: the word infinity yields MemoryMax with UINT64_MAX, a value accepted by parse_permyriad() yields MemoryMaxScale, a value accepted by parse_size() with base 1024 yields MemoryMax. Empty values append nothing; anything else is logged and ignored. */ static int append_session_memory_max(pam_handle_t *handle, sd_bus_message *m, const char *limit) { + uint64_t val; + int r; + + if (isempty(limit)) + return PAM_SUCCESS; + + if (streq(limit, "infinity")) { + r = sd_bus_message_append(m, "(sv)", "MemoryMax", "t", UINT64_MAX); + if (r < 0) + return pam_bus_log_create_error(handle, r); + + return PAM_SUCCESS; + } + + r = parse_permyriad(limit); + if (r >= 0) { + r = sd_bus_message_append(m, "(sv)", "MemoryMaxScale", "u", UINT32_SCALE_FROM_PERMYRIAD(r)); + if (r < 0) + return pam_bus_log_create_error(handle, r); + + return PAM_SUCCESS; + } + + r = parse_size(limit, 1024, &val); + if (r >= 0) { + r = sd_bus_message_append(m, "(sv)", "MemoryMax", "t", val); + if (r < 0) + return pam_bus_log_create_error(handle, r); + + return PAM_SUCCESS; + } + + pam_syslog(handle, LOG_WARNING, "Failed to parse systemd.memory_max, ignoring: %s", limit); + return PAM_SUCCESS; +} + +/* Appends RuntimeMaxUSec as an (sv) entry to m if limit is accepted by parse_sec(). Empty values and the word infinity append nothing; unparsable values are logged and ignored. */ static int append_session_runtime_max_sec(pam_handle_t *handle, sd_bus_message *m, const char *limit) { + usec_t val; + int r; + + /* No need to parse "infinity" here, it will be
set by default later in scope_init() */ + if (isempty(limit) || streq(limit, "infinity")) + return PAM_SUCCESS; + + r = parse_sec(limit, &val); + if (r >= 0) { + r = sd_bus_message_append(m, "(sv)", "RuntimeMaxUSec", "t", (uint64_t) val); + if (r < 0) + return pam_bus_log_create_error(handle, r); + } else + pam_syslog(handle, LOG_WARNING, "Failed to parse systemd.runtime_max_sec: %s, ignoring.", limit); + + return PAM_SUCCESS; +} + +/* Appends TasksMax as an (sv) entry to m if limit is accepted by safe_atou64(). Empty values and the word infinity append nothing; unparsable values are logged and ignored. */ static int append_session_tasks_max(pam_handle_t *handle, sd_bus_message *m, const char *limit) { + uint64_t val; + int r; + + /* No need to parse "infinity" here, it will be set unconditionally later in manager_start_scope() */ + if (isempty(limit) || streq(limit, "infinity")) + return PAM_SUCCESS; + + r = safe_atou64(limit, &val); + if (r >= 0) { + r = sd_bus_message_append(m, "(sv)", "TasksMax", "t", val); + if (r < 0) + return pam_bus_log_create_error(handle, r); + } else + pam_syslog(handle, LOG_WARNING, "Failed to parse systemd.tasks_max, ignoring: %s", limit); + + return PAM_SUCCESS; +} + +/* Appends CPUWeight as an (sv) entry to m if limit is accepted by cg_cpu_weight_parse(). Empty values append nothing; unparsable values are logged and ignored. */ static int append_session_cpu_weight(pam_handle_t *handle, sd_bus_message *m, const char *limit) { + uint64_t val; + int r; + + if (isempty(limit)) + return PAM_SUCCESS; + + r = cg_cpu_weight_parse(limit, &val); + if (r < 0) + pam_syslog(handle, LOG_WARNING, "Failed to parse systemd.cpu_weight, ignoring: %s", limit); + else { + r = sd_bus_message_append(m, "(sv)", "CPUWeight", "t", val); + if (r < 0) + return pam_bus_log_create_error(handle, r); + } + + return PAM_SUCCESS; +} + +/* Appends IOWeight as an (sv) entry to m if limit is accepted by cg_weight_parse(). Empty values append nothing; unparsable values are logged and ignored. */ static int append_session_io_weight(pam_handle_t *handle, sd_bus_message *m, const char *limit) { + uint64_t val; + int r; + + if (isempty(limit)) + return PAM_SUCCESS; + + r = cg_weight_parse(limit, &val); + if (r < 0) + pam_syslog(handle, LOG_WARNING, "Failed to parse systemd.io_weight, ignoring: %s", limit); + else { + r = sd_bus_message_append(m, "(sv)", "IOWeight", "t", val); + if (r < 0) + return pam_bus_log_create_error(handle, r); + } + + return PAM_SUCCESS; +}
+ +static const char* getenv_harder(pam_handle_t *handle, const char *key, const char *fallback) { + const char *v; + + assert(handle); + assert(key); + + /* Looks for an environment variable, preferably in the environment block associated with the + * specified PAM handle, falling back to the process' block instead. Why check both? Because we want + * to permit configuration of session properties from unit files that invoke PAM services, so that + * PAM services don't have to be reworked to set systemd-specific properties, but these properties + * can still be set from the unit file Environment= block. */ + + v = pam_getenv(handle, key); + if (!isempty(v)) + return v; + + /* We use secure_getenv() here, since we might get loaded into su/sudo, which are SUID. Ideally + * they'd clean up the environment before invoking foreign code (such as PAM modules), but alas they + * currently don't (to be precise, they clean up the environment they pass to their children, but + * not their own environ[]). */ + v = secure_getenv(key); + if (!isempty(v)) + return v; + + return fallback; +} + +static int update_environment(pam_handle_t *handle, const char *key, const char *value) { + int r; + + assert(handle); + assert(key); + + /* Updates the environment, but only if there's actually a value set. Also, log about errors */ + + if (isempty(value)) + return PAM_SUCCESS; + + r = pam_misc_setenv(handle, key, value, 0); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, + "Failed to set environment variable %s: @PAMERR@", key); + + return PAM_SUCCESS; +} + +static bool validate_runtime_directory(pam_handle_t *handle, const char *path, uid_t uid) { + struct stat st; + + assert(handle); + assert(path); + + /* Some extra paranoia: let's not set $XDG_RUNTIME_DIR if the directory we'd set it to isn't actually + * set up properly for us. 
This is supposed to provide a careful safety net for supporting su/sudo + * type transitions: in that case the UID changes, but the session and thus the user owning it + * doesn't change. Since the $XDG_RUNTIME_DIR lifecycle is bound to the session's user being logged + * in at least once we should be particularly careful when setting the environment variable, since + * otherwise we might end up setting $XDG_RUNTIME_DIR to some directory owned by the wrong user. */ + + if (!path_is_absolute(path)) { + pam_syslog(handle, LOG_ERR, "Provided runtime directory '%s' is not absolute.", path); + goto fail; + } + + if (lstat(path, &st) < 0) { + pam_syslog_errno(handle, LOG_ERR, errno, "Failed to stat() runtime directory '%s': %m", path); + goto fail; + } + + if (!S_ISDIR(st.st_mode)) { + pam_syslog(handle, LOG_ERR, "Runtime directory '%s' is not actually a directory.", path); + goto fail; + } + + if (st.st_uid != uid) { + pam_syslog(handle, LOG_ERR, "Runtime directory '%s' is not owned by UID " UID_FMT ", as it should.", path, uid); + goto fail; + } + + return true; + +fail: + pam_syslog(handle, LOG_WARNING, "Not setting $XDG_RUNTIME_DIR, as the directory is not in order."); + return false; +} + +static int pam_putenv_and_log(pam_handle_t *handle, const char *e, bool debug) { + int r; + + assert(handle); + assert(e); + + r = pam_putenv(handle, e); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, + "Failed to set PAM environment variable %s: @PAMERR@", e); + + pam_debug_syslog(handle, debug, "PAM environment variable %s set based on user record.", e); + + return PAM_SUCCESS; +} + +static int apply_user_record_settings( + pam_handle_t *handle, + UserRecord *ur, + bool debug, + uint64_t default_capability_bounding_set, + uint64_t default_capability_ambient_set) { + int r; + + assert(handle); + assert(ur); + + if (ur->umask != MODE_INVALID) { + umask(ur->umask); + pam_debug_syslog(handle, debug, "Set user umask to %04o based on user record.", 
ur->umask); + } + + STRV_FOREACH(i, ur->environment) { + _cleanup_free_ char *n = NULL; + const char *e; + + assert_se(e = strchr(*i, '=')); /* environment was already validated while parsing JSON record, this thus must hold */ + + n = strndup(*i, e - *i); + if (!n) + return pam_log_oom(handle); + + if (pam_getenv(handle, n)) { + pam_debug_syslog(handle, debug, + "PAM environment variable $%s already set, not changing based on record.", *i); + continue; + } + + r = pam_putenv_and_log(handle, *i, debug); + if (r != PAM_SUCCESS) + return r; + } + + if (ur->email_address) { + if (pam_getenv(handle, "EMAIL")) + pam_debug_syslog(handle, debug, + "PAM environment variable $EMAIL already set, not changing based on user record."); + else { + _cleanup_free_ char *joined = NULL; + + joined = strjoin("EMAIL=", ur->email_address); + if (!joined) + return pam_log_oom(handle); + + r = pam_putenv_and_log(handle, joined, debug); + if (r != PAM_SUCCESS) + return r; + } + } + + if (ur->time_zone) { + if (pam_getenv(handle, "TZ")) + pam_debug_syslog(handle, debug, + "PAM environment variable $TZ already set, not changing based on user record."); + else if (!timezone_is_valid(ur->time_zone, LOG_DEBUG)) + pam_debug_syslog(handle, debug, + "Time zone specified in user record is not valid locally, not setting $TZ."); + else { + _cleanup_free_ char *joined = NULL; + + joined = strjoin("TZ=:", ur->time_zone); + if (!joined) + return pam_log_oom(handle); + + r = pam_putenv_and_log(handle, joined, debug); + if (r != PAM_SUCCESS) + return r; + } + } + + if (ur->preferred_language) { + if (pam_getenv(handle, "LANG")) + pam_debug_syslog(handle, debug, + "PAM environment variable $LANG already set, not changing based on user record."); + else if (locale_is_installed(ur->preferred_language) <= 0) + pam_debug_syslog(handle, debug, + "Preferred language specified in user record is not valid or not installed, not setting $LANG."); + else { + _cleanup_free_ char *joined = NULL; + + joined = 
strjoin("LANG=", ur->preferred_language); + if (!joined) + return pam_log_oom(handle); + + r = pam_putenv_and_log(handle, joined, debug); + if (r != PAM_SUCCESS) + return r; + } + } + + if (nice_is_valid(ur->nice_level)) { + if (nice(ur->nice_level) < 0) + pam_syslog_errno(handle, LOG_ERR, errno, + "Failed to set nice level to %i, ignoring: %m", ur->nice_level); + else + pam_debug_syslog(handle, debug, + "Nice level set to %i, based on user record.", ur->nice_level); + } + + for (int rl = 0; rl < _RLIMIT_MAX; rl++) { + + if (!ur->rlimits[rl]) + continue; + + r = setrlimit_closest(rl, ur->rlimits[rl]); + if (r < 0) + pam_syslog_errno(handle, LOG_ERR, r, + "Failed to set resource limit %s, ignoring: %m", rlimit_to_string(rl)); + else + pam_debug_syslog(handle, debug, + "Resource limit %s set, based on user record.", rlimit_to_string(rl)); + } + + uint64_t a, b; + a = user_record_capability_ambient_set(ur); + if (a == UINT64_MAX) + a = default_capability_ambient_set; + + b = user_record_capability_bounding_set(ur); + if (b == UINT64_MAX) + b = default_capability_bounding_set; + + if (a != UINT64_MAX && a != 0) { + a &= b; + + r = capability_ambient_set_apply(a, /* also_inherit= */ true); + if (r < 0) + pam_syslog_errno(handle, LOG_ERR, r, + "Failed to set ambient capabilities, ignoring: %m"); + } + + if (b != UINT64_MAX && !cap_test_all(b)) { + r = capability_bounding_set_drop(b, /* right_now= */ false); + if (r < 0) + pam_syslog_errno(handle, LOG_ERR, r, + "Failed to set bounding capabilities, ignoring: %m"); + } + + return PAM_SUCCESS; +} + +static int configure_runtime_directory( + pam_handle_t *handle, + UserRecord *ur, + const char *rt) { + + int r; + + assert(handle); + assert(ur); + assert(rt); + + if (!validate_runtime_directory(handle, rt, ur->uid)) + return PAM_SUCCESS; + + r = pam_misc_setenv(handle, "XDG_RUNTIME_DIR", rt, 0); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set runtime dir: @PAMERR@"); + + return 
export_legacy_dbus_address(handle, rt); +} + +static uint64_t pick_default_capability_ambient_set( + UserRecord *ur, + const char *service, + const char *seat) { + + /* If not configured otherwise, let's enable CAP_WAKE_ALARM for regular users when logging in on a + * seat (i.e. when they are present physically on the device), or when invoked for the systemd --user + * instances. This allows desktops to install CAP_WAKE_ALARM to implement alarm clock apps without + * much fuss. */ + + return ur && + user_record_disposition(ur) == USER_REGULAR && + (streq_ptr(service, "systemd-user") || !isempty(seat)) ? (UINT64_C(1) << CAP_WAKE_ALARM) : UINT64_MAX; +} + +typedef struct SessionContext { + const uid_t uid; + const pid_t pid; + const char *service; + const char *type; + const char *class; + const char *desktop; + const char *seat; + const uint32_t vtnr; + const char *tty; + const char *display; + const bool remote; + const char *remote_user; + const char *remote_host; + const char *memory_max; + const char *tasks_max; + const char *cpu_weight; + const char *io_weight; + const char *runtime_max_sec; +} SessionContext; + +static int create_session_message( + sd_bus *bus, + pam_handle_t *handle, + const SessionContext *context, + bool avoid_pidfd, + sd_bus_message **ret) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_close_ int pidfd = -EBADF; + int r; + + assert(bus); + assert(handle); + assert(context); + assert(ret); + + if (!avoid_pidfd) { + pidfd = pidfd_open(getpid_cached(), 0); + if (pidfd < 0 && !ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } + + r = bus_message_new_method_call(bus, &m, bus_login_mgr, pidfd >= 0 ? "CreateSessionWithPIDFD" : "CreateSession"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, + pidfd >= 0 ? "uhsssssussbss" : "uusssssussbss", + (uint32_t) context->uid, + pidfd >= 0 ? 
pidfd : context->pid, + context->service, + context->type, + context->class, + context->desktop, + context->seat, + context->vtnr, + context->tty, + context->display, + context->remote, + context->remote_user, + context->remote_host); + if (r < 0) + return r; + + if (pidfd >= 0) { + r = sd_bus_message_append(m, "t", UINT64_C(0)); + if (r < 0) + return r; + } + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return r; + + r = append_session_memory_max(handle, m, context->memory_max); + if (r != PAM_SUCCESS) + return r; + + r = append_session_runtime_max_sec(handle, m, context->runtime_max_sec); + if (r != PAM_SUCCESS) + return r; + + r = append_session_tasks_max(handle, m, context->tasks_max); + if (r != PAM_SUCCESS) + return r; + + r = append_session_cpu_weight(handle, m, context->cpu_weight); + if (r != PAM_SUCCESS) + return r; + + r = append_session_io_weight(handle, m, context->io_weight); + if (r != PAM_SUCCESS) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +_public_ PAM_EXTERN int pam_sm_open_session( + pam_handle_t *handle, + int flags, + int argc, const char **argv) { + + /* Let's release the D-Bus connection once this function exits, after all the session might live + * quite a long time, and we are not going to process the bus connection in that time, so let's + * better close before the daemon kicks us off because we are not processing anything. 
*/ + _cleanup_(pam_bus_data_disconnectp) PamBusData *d = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + const char + *id, *object_path, *runtime_path, + *service = NULL, + *tty = NULL, *display = NULL, + *remote_user = NULL, *remote_host = NULL, + *seat = NULL, + *type = NULL, *class = NULL, + *class_pam = NULL, *type_pam = NULL, *cvtnr = NULL, *desktop = NULL, *desktop_pam = NULL, + *memory_max = NULL, *tasks_max = NULL, *cpu_weight = NULL, *io_weight = NULL, *runtime_max_sec = NULL; + uint64_t default_capability_bounding_set = UINT64_MAX, default_capability_ambient_set = UINT64_MAX; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + int session_fd = -EBADF, existing, r; + bool debug = false, remote; + uint32_t vtnr = 0; + uid_t original_uid; + + assert(handle); + + if (parse_argv(handle, + argc, argv, + &class_pam, + &type_pam, + &desktop_pam, + &debug, + &default_capability_bounding_set, + &default_capability_ambient_set) < 0) + return PAM_SESSION_ERR; + + pam_debug_syslog(handle, debug, "pam-systemd initializing"); + + r = acquire_user_record(handle, &ur); + if (r != PAM_SUCCESS) + return r; + + /* Make most of this a NOP on non-logind systems */ + if (!logind_running()) + goto success; + + /* Make sure we don't enter a loop by talking to + * systemd-logind when it is actually waiting for the + * background to finish start-up. If the service is + * "systemd-user" we simply set XDG_RUNTIME_DIR and + * leave. 
*/ + + r = pam_get_item(handle, PAM_SERVICE, (const void**) &service); + if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM service: @PAMERR@"); + if (streq_ptr(service, "systemd-user")) { + char rt[STRLEN("/run/user/") + DECIMAL_STR_MAX(uid_t)]; + + xsprintf(rt, "/run/user/"UID_FMT, ur->uid); + r = configure_runtime_directory(handle, ur, rt); + if (r != PAM_SUCCESS) + return r; + + goto success; + } + + /* Otherwise, we ask logind to create a session for us */ + + r = pam_get_item(handle, PAM_XDISPLAY, (const void**) &display); + if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM_XDISPLAY: @PAMERR@"); + r = pam_get_item(handle, PAM_TTY, (const void**) &tty); + if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM_TTY: @PAMERR@"); + r = pam_get_item(handle, PAM_RUSER, (const void**) &remote_user); + if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM_RUSER: @PAMERR@"); + r = pam_get_item(handle, PAM_RHOST, (const void**) &remote_host); + if (!IN_SET(r, PAM_BAD_ITEM, PAM_SUCCESS)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM_RHOST: @PAMERR@"); + + seat = getenv_harder(handle, "XDG_SEAT", NULL); + cvtnr = getenv_harder(handle, "XDG_VTNR", NULL); + type = getenv_harder(handle, "XDG_SESSION_TYPE", type_pam); + class = getenv_harder(handle, "XDG_SESSION_CLASS", class_pam); + desktop = getenv_harder(handle, "XDG_SESSION_DESKTOP", desktop_pam); + + tty = strempty(tty); + + if (strchr(tty, ':')) { + /* A tty with a colon is usually an X11 display, placed there to show up in utmp. We rearrange things + * and don't pretend that an X display was a tty. 
*/ + if (isempty(display)) + display = tty; + tty = NULL; + + } else if (streq(tty, "cron")) { + /* cron is setting PAM_TTY to "cron" for some reason (the commit carries no information why, but + * probably because it wants to set it to something as pam_time/pam_access/… require PAM_TTY to be set + * (as they otherwise even try to update it!) — but cron doesn't actually allocate a TTY for its forked + * off processes.) */ + type = "unspecified"; + class = "background"; + tty = NULL; + + } else if (streq(tty, "ssh")) { + /* ssh has been setting PAM_TTY to "ssh" (for the same reason as cron does this, see above. For further + * details look for "PAM_TTY_KLUDGE" in the openssh sources). */ + type ="tty"; + class = "user"; + tty = NULL; /* This one is particularly sad, as this means that ssh sessions — even though usually + * associated with a pty — won't be tracked by their tty in logind. This is because ssh + * does the PAM session registration early for new connections, and registers a pty only + * much later (this is because it doesn't know yet if it needs one at all, as whether to + * register a pty or not is negotiated much later in the protocol). */ + + } else + /* Chop off leading /dev prefix that some clients specify, but others do not. */ + tty = skip_dev_prefix(tty); + + /* If this fails vtnr will be 0, that's intended */ + if (!isempty(cvtnr)) + (void) safe_atou32(cvtnr, &vtnr); + + if (!isempty(display) && !vtnr) { + if (isempty(seat)) + (void) get_seat_from_display(display, &seat, &vtnr); + else if (streq(seat, "seat0")) + (void) get_seat_from_display(display, NULL, &vtnr); + } + + if (seat && !streq(seat, "seat0") && vtnr != 0) { + pam_debug_syslog(handle, debug, "Ignoring vtnr %"PRIu32" for %s which is not seat0", vtnr, seat); + vtnr = 0; + } + + if (isempty(type)) + type = !isempty(display) ? "x11" : + !isempty(tty) ? "tty" : "unspecified"; + + if (isempty(class)) + class = streq(type, "unspecified") ? 
"background" : "user"; + + remote = !isempty(remote_host) && !is_localhost(remote_host); + + r = pam_get_data(handle, "systemd.memory_max", (const void **)&memory_max); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM systemd.memory_max data: @PAMERR@"); + r = pam_get_data(handle, "systemd.tasks_max", (const void **)&tasks_max); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM systemd.tasks_max data: @PAMERR@"); + r = pam_get_data(handle, "systemd.cpu_weight", (const void **)&cpu_weight); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM systemd.cpu_weight data: @PAMERR@"); + r = pam_get_data(handle, "systemd.io_weight", (const void **)&io_weight); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM systemd.io_weight data: @PAMERR@"); + r = pam_get_data(handle, "systemd.runtime_max_sec", (const void **)&runtime_max_sec); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get PAM systemd.runtime_max_sec data: @PAMERR@"); + + /* Talk to logind over the message bus */ + r = pam_acquire_bus_connection(handle, "pam-systemd", &bus, &d); + if (r != PAM_SUCCESS) + return r; + + pam_debug_syslog(handle, debug, + "Asking logind to create session: " + "uid="UID_FMT" pid="PID_FMT" service=%s type=%s class=%s desktop=%s seat=%s vtnr=%"PRIu32" tty=%s display=%s remote=%s remote_user=%s remote_host=%s", + ur->uid, getpid_cached(), + strempty(service), + type, class, strempty(desktop), + strempty(seat), vtnr, strempty(tty), strempty(display), + yes_no(remote), strempty(remote_user), strempty(remote_host)); + pam_debug_syslog(handle, debug, + "Session limits: " + "memory_max=%s tasks_max=%s cpu_weight=%s io_weight=%s runtime_max_sec=%s", + 
strna(memory_max), strna(tasks_max), strna(cpu_weight), strna(io_weight), strna(runtime_max_sec)); + + const SessionContext context = { + .uid = ur->uid, + .pid = 0, + .service = service, + .type = type, + .class = class, + .desktop = desktop, + .seat = seat, + .vtnr = vtnr, + .tty = tty, + .display = display, + .remote = remote, + .remote_user = remote_user, + .remote_host = remote_host, + .memory_max = memory_max, + .tasks_max = tasks_max, + .cpu_weight = cpu_weight, + .io_weight = io_weight, + .runtime_max_sec = runtime_max_sec, + }; + + r = create_session_message(bus, + handle, + &context, + /* avoid_pidfd = */ false, + &m); + if (r < 0) + return pam_bus_log_create_error(handle, r); + + r = sd_bus_call(bus, m, LOGIN_SLOW_BUS_CALL_TIMEOUT_USEC, &error, &reply); + if (r < 0 && sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)) { + sd_bus_error_free(&error); + pam_debug_syslog(handle, debug, + "CreateSessionWithPIDFD() API is not available, retrying with CreateSession()."); + + m = sd_bus_message_unref(m); + r = create_session_message(bus, + handle, + &context, + /* avoid_pidfd = */ true, + &m); + if (r < 0) + return pam_bus_log_create_error(handle, r); + + r = sd_bus_call(bus, m, LOGIN_SLOW_BUS_CALL_TIMEOUT_USEC, &error, &reply); + } + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_SESSION_BUSY)) { + /* We are already in a session, don't do anything */ + pam_debug_syslog(handle, debug, + "Not creating session: %s", bus_error_message(&error, r)); + goto success; + } + + pam_syslog(handle, LOG_ERR, + "Failed to create session: %s", bus_error_message(&error, r)); + return PAM_SESSION_ERR; + } + + r = sd_bus_message_read(reply, + "soshusub", + &id, + &object_path, + &runtime_path, + &session_fd, + &original_uid, + &seat, + &vtnr, + &existing); + if (r < 0) + return pam_bus_log_parse_error(handle, r); + + pam_debug_syslog(handle, debug, + "Reply from logind: " + "id=%s object_path=%s runtime_path=%s session_fd=%d seat=%s vtnr=%u original_uid=%u", 
+ id, object_path, runtime_path, session_fd, seat, vtnr, original_uid); + + r = update_environment(handle, "XDG_SESSION_ID", id); + if (r != PAM_SUCCESS) + return r; + + if (original_uid == ur->uid) { + /* Don't set $XDG_RUNTIME_DIR if the user we now authenticated for does not match the + * original user of the session. We do this in order not to result in privileged apps + * clobbering the runtime directory unnecessarily. */ + + r = configure_runtime_directory(handle, ur, runtime_path); + if (r != PAM_SUCCESS) + return r; + } + + /* Most likely we got the session/type/class from environment variables, but might have gotten the data + * somewhere else (for example PAM module parameters). Let's now update the environment variables, so that this + * data is inherited into the session processes, and programs can rely on them to be initialized. */ + + r = update_environment(handle, "XDG_SESSION_TYPE", type); + if (r != PAM_SUCCESS) + return r; + + r = update_environment(handle, "XDG_SESSION_CLASS", class); + if (r != PAM_SUCCESS) + return r; + + r = update_environment(handle, "XDG_SESSION_DESKTOP", desktop); + if (r != PAM_SUCCESS) + return r; + + r = update_environment(handle, "XDG_SEAT", seat); + if (r != PAM_SUCCESS) + return r; + + if (vtnr > 0) { + char buf[DECIMAL_STR_MAX(vtnr)]; + sprintf(buf, "%u", vtnr); + + r = update_environment(handle, "XDG_VTNR", buf); + if (r != PAM_SUCCESS) + return r; + } + + r = pam_set_data(handle, "systemd.existing", INT_TO_PTR(!!existing), NULL); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to install existing flag: @PAMERR@"); + + if (session_fd >= 0) { + _cleanup_close_ int fd = fcntl(session_fd, F_DUPFD_CLOEXEC, 3); + if (fd < 0) + return pam_syslog_errno(handle, LOG_ERR, errno, "Failed to dup session fd: %m"); + + r = pam_set_data(handle, "systemd.session-fd", FD_TO_PTR(fd), NULL); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to install session fd: 
@PAMERR@"); + TAKE_FD(fd); + } + +success: + if (default_capability_ambient_set == UINT64_MAX) + default_capability_ambient_set = pick_default_capability_ambient_set(ur, service, seat); + + return apply_user_record_settings(handle, ur, debug, default_capability_bounding_set, default_capability_ambient_set); +} + +_public_ PAM_EXTERN int pam_sm_close_session( + pam_handle_t *handle, + int flags, + int argc, const char **argv) { + + const void *existing = NULL; + bool debug = false; + const char *id; + int r; + + assert(handle); + + if (parse_argv(handle, + argc, argv, + NULL, + NULL, + NULL, + &debug, + NULL, + NULL) < 0) + return PAM_SESSION_ERR; + + pam_debug_syslog(handle, debug, "pam-systemd shutting down"); + + /* Only release session if it wasn't pre-existing when we + * tried to create it */ + r = pam_get_data(handle, "systemd.existing", &existing); + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, + "Failed to get PAM systemd.existing data: @PAMERR@"); + + id = pam_getenv(handle, "XDG_SESSION_ID"); + if (id && !existing) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + + /* Before we go and close the FIFO we need to tell logind that this is a clean session + * shutdown, so that it doesn't just go and slaughter us immediately after closing the fd */ + + r = pam_acquire_bus_connection(handle, "pam-systemd", &bus, NULL); + if (r != PAM_SUCCESS) + return r; + + r = bus_call_method(bus, bus_login_mgr, "ReleaseSession", &error, NULL, "s", id); + if (r < 0) + return pam_syslog_pam_error(handle, LOG_ERR, PAM_SESSION_ERR, + "Failed to release session: %s", bus_error_message(&error, r)); + } + + /* Note that we are knowingly leaking the FIFO fd here. This way, logind can watch us die. If we + * closed it here it would not have any clue when that is completed. 
Given that one cannot really + * have multiple PAM sessions open from the same process this means we will leak one FD at max. */ + + return PAM_SUCCESS; +} diff --git a/src/login/pam_systemd.sym b/src/login/pam_systemd.sym new file mode 100644 index 0000000..130cf6a --- /dev/null +++ b/src/login/pam_systemd.sym @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +{ +global: + pam_sm_close_session; + pam_sm_open_session; +local: *; +}; diff --git a/src/login/pam_systemd_loadkey.c b/src/login/pam_systemd_loadkey.c new file mode 100644 index 0000000..3b4e911 --- /dev/null +++ b/src/login/pam_systemd_loadkey.c @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "keyring-util.h" +#include "macro.h" +#include "missing_syscall.h" +#include "nulstr-util.h" +#include "pam-util.h" +#include "strv.h" + +/* By default, this module retrieves the key stored by systemd-cryptsetup. + * This can be overridden by the keyname= parameter. */ +static const char DEFAULT_KEYNAME[] = "cryptsetup"; + +_public_ int pam_sm_authenticate( + pam_handle_t *handle, + int flags, + int argc, const char **argv) { + + assert(handle); + + /* Parse argv. */ + + assert(argc >= 0); + assert(argc == 0 || argv); + + const char *keyname = DEFAULT_KEYNAME; + bool debug = false; + + for (int i = 0; i < argc; i++) { + const char *p; + + if ((p = startswith(argv[i], "keyname="))) + keyname = p; + else if (streq(argv[i], "debug")) + debug = true; + else + pam_syslog(handle, LOG_WARNING, "Unknown parameter '%s', ignoring.", argv[i]); + } + + pam_debug_syslog(handle, debug, "pam-systemd-loadkey initializing"); + + /* Retrieve the key. 
*/ + + key_serial_t serial; + serial = request_key("user", keyname, NULL, 0); + if (serial < 0) { + if (errno == ENOKEY) { + pam_debug_syslog(handle, debug, "Key not found: %s", keyname); + return PAM_AUTHINFO_UNAVAIL; + } else if (errno == EKEYEXPIRED) { + pam_debug_syslog(handle, debug, "Key expired: %s", keyname); + return PAM_AUTHINFO_UNAVAIL; + } else + return pam_syslog_errno(handle, LOG_ERR, errno, "Failed to look up the key: %m"); + } + + _cleanup_(erase_and_freep) void *p = NULL; + size_t n; + int r; + + r = keyring_read(serial, &p, &n); + if (r < 0) + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to read the key: %m"); + + /* Split the key by NUL. Set the last item as authtok. */ + + _cleanup_(strv_free_erasep) char **passwords = strv_parse_nulstr(p, n); + if (!passwords) + return pam_log_oom(handle); + + size_t passwords_len = strv_length(passwords); + if (passwords_len == 0) { + pam_debug_syslog(handle, debug, "Key is empty"); + return PAM_AUTHINFO_UNAVAIL; + } else if (passwords_len > 1) + pam_debug_syslog(handle, debug, "Multiple passwords found in the key. 
Using the last one"); + + r = pam_set_item(handle, PAM_AUTHTOK, passwords[passwords_len - 1]); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set PAM auth token: @PAMERR@"); + + return PAM_SUCCESS; +} + +_public_ int pam_sm_setcred( + pam_handle_t *handle, + int flags, + int argc, const char **argv) { + + return PAM_SUCCESS; +} diff --git a/src/login/pam_systemd_loadkey.sym b/src/login/pam_systemd_loadkey.sym new file mode 100644 index 0000000..d611dc1 --- /dev/null +++ b/src/login/pam_systemd_loadkey.sym @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +{ +global: + pam_sm_authenticate; + pam_sm_setcred; +local: *; +}; diff --git a/src/login/sysfs-show.c b/src/login/sysfs-show.c new file mode 100644 index 0000000..0a8c02a --- /dev/null +++ b/src/login/sysfs-show.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-enumerator-private.h" +#include "glyph-util.h" +#include "path-util.h" +#include "string-util.h" +#include "sysfs-show.h" +#include "terminal-util.h" + +static int show_sysfs_one( + const char *seat, + sd_device **dev_list, + size_t *i_dev, + size_t n_dev, + const char *sub, + const char *prefix, + unsigned n_columns, + OutputFlags flags) { + + size_t max_width; + int r; + + assert(seat); + assert(dev_list); + assert(i_dev); + assert(prefix); + + if (flags & OUTPUT_FULL_WIDTH) + max_width = SIZE_MAX; + else if (n_columns < 10) + max_width = 10; + else + max_width = n_columns; + + while (*i_dev < n_dev) { + const char *sysfs, *sn, *name = NULL, *subsystem, *sysname; + _cleanup_free_ char *k = NULL, *l = NULL; + size_t lookahead; + bool is_master; + + if (sd_device_get_syspath(dev_list[*i_dev], &sysfs) < 0 || + !path_startswith(sysfs, sub)) + return 0; + + if (sd_device_get_property_value(dev_list[*i_dev], "ID_SEAT", &sn) < 0 || isempty(sn)) + sn = "seat0"; + + /* Explicitly also check for tag 'seat' 
here */ + if (!streq(seat, sn) || + sd_device_has_current_tag(dev_list[*i_dev], "seat") <= 0 || + sd_device_get_subsystem(dev_list[*i_dev], &subsystem) < 0 || + sd_device_get_sysname(dev_list[*i_dev], &sysname) < 0) { + (*i_dev)++; + continue; + } + + is_master = sd_device_has_current_tag(dev_list[*i_dev], "master-of-seat") > 0; + + if (sd_device_get_sysattr_value(dev_list[*i_dev], "name", &name) < 0) + (void) sd_device_get_sysattr_value(dev_list[*i_dev], "id", &name); + + /* Look if there's more coming after this */ + for (lookahead = *i_dev + 1; lookahead < n_dev; lookahead++) { + const char *lookahead_sysfs; + + if (sd_device_get_syspath(dev_list[lookahead], &lookahead_sysfs) < 0) + continue; + + if (path_startswith(lookahead_sysfs, sub) && + !path_startswith(lookahead_sysfs, sysfs)) { + const char *lookahead_sn; + + if (sd_device_get_property_value(dev_list[lookahead], "ID_SEAT", &lookahead_sn) < 0 || + isempty(lookahead_sn)) + lookahead_sn = "seat0"; + + if (streq(seat, lookahead_sn) && sd_device_has_current_tag(dev_list[lookahead], "seat") > 0) + break; + } + } + + k = ellipsize(sysfs, max_width, 20); + if (!k) + return -ENOMEM; + + printf("%s%s%s\n", prefix, special_glyph(lookahead < n_dev ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT), k); + + if (asprintf(&l, + "%s%s:%s%s%s%s", + is_master ? "[MASTER] " : "", + subsystem, sysname, + name ? " \"" : "", strempty(name), name ? "\"" : "") < 0) + return -ENOMEM; + + free(k); + k = ellipsize(l, max_width, 70); + if (!k) + return -ENOMEM; + + printf("%s%s%s\n", prefix, lookahead < n_dev ? special_glyph(SPECIAL_GLYPH_TREE_VERTICAL) : " ", k); + + if (++(*i_dev) < n_dev) { + _cleanup_free_ char *p = NULL; + + p = strjoin(prefix, lookahead < n_dev ? special_glyph(SPECIAL_GLYPH_TREE_VERTICAL) : " "); + if (!p) + return -ENOMEM; + + r = show_sysfs_one(seat, dev_list, i_dev, n_dev, sysfs, p, + n_columns == UINT_MAX || n_columns < 2 ? 
n_columns : n_columns - 2, + flags); + if (r < 0) + return r; + } + + } + + return 0; +} + +int show_sysfs(const char *seat, const char *prefix, unsigned n_columns, OutputFlags flags) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + size_t n_dev = 0, i = 0; + sd_device **dev_list; + int r; + + if (n_columns <= 0) + n_columns = columns(); + + prefix = strempty(prefix); + + if (isempty(seat)) + seat = "seat0"; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_tag(e, streq(seat, "seat0") ? "seat" : seat); + if (r < 0) + return r; + + r = device_enumerator_scan_devices(e); + if (r < 0) + return r; + + dev_list = device_enumerator_get_devices(e, &n_dev); + + if (dev_list && n_dev > 0) + return show_sysfs_one(seat, dev_list, &i, n_dev, "/", prefix, n_columns, flags); /* propagate failures (e.g. -ENOMEM) instead of silently reporting success */ + else + printf("%s%s%s\n", prefix, special_glyph(SPECIAL_GLYPH_TREE_RIGHT), "(none)"); + + return 0; +} diff --git a/src/login/sysfs-show.h b/src/login/sysfs-show.h new file mode 100644 index 0000000..32ccbf3 --- /dev/null +++ b/src/login/sysfs-show.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "output-mode.h" + +int show_sysfs(const char *seat, const char *prefix, unsigned columns, OutputFlags flags); diff --git a/src/login/systemd-user.in b/src/login/systemd-user.in new file mode 100644 index 0000000..8a3c9e0 --- /dev/null +++ b/src/login/systemd-user.in @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# This file is part of systemd. +# +# Used by systemd --user instances.
+ +{% if ENABLE_HOMED %} +-account sufficient pam_systemd_home.so +{% endif %} +account sufficient pam_unix.so no_pass_expiry +account required pam_permit.so + +{% if HAVE_SELINUX %} +session required pam_selinux.so close +session required pam_selinux.so nottys open +{% endif %} +session required pam_loginuid.so +session optional pam_keyinit.so force revoke +session required pam_namespace.so +{% if ENABLE_HOMED %} +-session optional pam_systemd_home.so +{% endif %} +session optional pam_umask.so silent +session optional pam_systemd.so diff --git a/src/login/test-inhibit.c b/src/login/test-inhibit.c new file mode 100644 index 0000000..abb80d9 --- /dev/null +++ b/src/login/test-inhibit.c @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "bus-locator.h" +#include "bus-util.h" +#include "fd-util.h" +#include "macro.h" +#include "tests.h" + +static int inhibit(sd_bus *bus, const char *what) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *who = "Test Tool", *reason = "Just because!", *mode = "block"; + int fd; + int r; + + r = bus_call_method(bus, bus_login_mgr, "Inhibit", &error, &reply, "ssss", what, who, reason, mode); + assert_se(r >= 0); + + r = sd_bus_message_read_basic(reply, SD_BUS_TYPE_UNIX_FD, &fd); + assert_se(r >= 0); + assert_se(fd >= 0); + + return fcntl(fd, F_DUPFD_CLOEXEC, 3); +} + +static void print_inhibitors(sd_bus *bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *what, *who, *why, *mode; + uint32_t uid, pid; + unsigned n = 0; + int r; + + r = bus_call_method(bus, bus_login_mgr, "ListInhibitors", &error, &reply, NULL); + assert_se(r >= 0); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssuu)"); + assert_se(r >= 0); + + while ((r = 
sd_bus_message_read(reply, "(ssssuu)", &what, &who, &why, &mode, &uid, &pid)) > 0) { + printf("what=<%s> who=<%s> why=<%s> mode=<%s> uid=<%"PRIu32"> pid=<%"PRIu32">\n", + what, who, why, mode, uid, pid); + + n++; + } + assert_se(r >= 0); + + printf("%u inhibitors\n", n); +} + +int main(int argc, char *argv[]) { + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + int fd1, fd2; + int r; + + test_setup_logging(LOG_DEBUG); + + r = sd_bus_open_system(&bus); + assert_se(r >= 0); + + print_inhibitors(bus); + + fd1 = inhibit(bus, "sleep"); + assert_se(fd1 >= 0); + print_inhibitors(bus); + + fd2 = inhibit(bus, "idle:shutdown"); + assert_se(fd2 >= 0); + print_inhibitors(bus); + + safe_close(fd1); + sleep(1); + print_inhibitors(bus); + + safe_close(fd2); + sleep(1); + print_inhibitors(bus); + + return 0; +} diff --git a/src/login/test-login-shared.c b/src/login/test-login-shared.c new file mode 100644 index 0000000..17cd479 --- /dev/null +++ b/src/login/test-login-shared.c @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "login-util.h" +#include "macro.h" +#include "tests.h" + +TEST(session_id_valid) { + assert_se(session_id_valid("c1")); + assert_se(session_id_valid("1234")); + + assert_se(!session_id_valid("1-2")); + assert_se(!session_id_valid("")); + assert_se(!session_id_valid("\tid")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/login/test-login-tables.c b/src/login/test-login-tables.c new file mode 100644 index 0000000..3c5ec04 --- /dev/null +++ b/src/login/test-login-tables.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "logind-action.h" +#include "logind-session.h" +#include "test-tables.h" +#include "tests.h" + +int main(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + test_table(handle_action, HANDLE_ACTION); + test_table(inhibit_mode, INHIBIT_MODE); + test_table(kill_who, KILL_WHO); + test_table(session_class, SESSION_CLASS); + test_table(session_state, SESSION_STATE); + 
test_table(session_type, SESSION_TYPE); + test_table(user_state, USER_STATE); + + return EXIT_SUCCESS; +} diff --git a/src/login/test-session-properties.c b/src/login/test-session-properties.c new file mode 100644 index 0000000..b5b5f60 --- /dev/null +++ b/src/login/test-session-properties.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* Usage: + * ./test-session-properties [] + * e.g., + * ./test-session-properties /org/freedesktop/login1/session/_32 /dev/tty2 + */ + +#include +#include +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-locator.h" +#include "path-util.h" +#include "string-util.h" +#include "terminal-util.h" +#include "tests.h" + +static const char *arg_tty = NULL; + +static BusLocator session; + +/* Tests org.freedesktop.logind.Session SetType */ +TEST(set_type) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus* bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char* types[] = {"tty", "x11", "wayland", "mir", "web"}; + _cleanup_free_ char *type = NULL, *type2 = NULL; + + assert_se(sd_bus_open_system(&bus) >= 0); + + /* Default type is set */ + assert_se(bus_get_property_string(bus, &session, "Type", NULL, &type) >= 0); + assert_se(streq(type, "tty")); + + /* Type can only be set by the session controller (which we're not ATM) */ + assert_se(bus_call_method(bus, &session, "SetType", &error, NULL, "s", "x11") < 0); + assert_se(sd_bus_error_has_name(&error, BUS_ERROR_NOT_IN_CONTROL)); + + assert_se(bus_call_method(bus, &session, "TakeControl", NULL, NULL, "b", true) >= 0); + + /* All defined session types can be set */ + for (size_t i = 0; i < ELEMENTSOF(types); i++) { + type = mfree(type); + assert_se(bus_call_method(bus, &session, "SetType", NULL, NULL, "s", types[i]) >= 0); + assert_se(bus_get_property_string(bus, &session, "Type", NULL, &type) >= 0); + assert_se(streq(type, types[i])); + } + + /* An unknown type is rejected */ + 
sd_bus_error_free(&error); + assert_se(bus_call_method(bus, &session, "SetType", &error, NULL, "s", "hello") < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)); + assert_se(bus_get_property_string(bus, &session, "Type", NULL, &type2) >= 0); + + /* Type is reset to the original value when we release control of the session */ + assert_se(!streq(type, "tty")); + assert_se(bus_call_method(bus, &session, "ReleaseControl", NULL, NULL, NULL) >= 0); + type = mfree(type); + assert_se(bus_get_property_string(bus, &session, "Type", NULL, &type) >= 0); + assert_se(streq(type, "tty")); +} + +/* Tests org.freedesktop.logind.Session SetDisplay */ +TEST(set_display) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus* bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *display = NULL; + + assert_se(sd_bus_open_system(&bus) >= 0); + + /* Display is unset by default */ + assert_se(bus_get_property_string(bus, &session, "Display", NULL, &display) >= 0); + assert_se(isempty(display)); + + /* Display can only be set by the session controller (which we're not ATM) */ + assert_se(bus_call_method(bus, &session, "SetDisplay", &error, NULL, "s", ":0") < 0); + assert_se(sd_bus_error_has_name(&error, BUS_ERROR_NOT_IN_CONTROL)); + + assert_se(bus_call_method(bus, &session, "TakeControl", NULL, NULL, "b", true) >= 0); + + /* Display can only be set on a graphical session */ + assert_se(bus_call_method(bus, &session, "SetType", NULL, NULL, "s", "tty") >= 0); + sd_bus_error_free(&error); + assert_se(bus_call_method(bus, &session, "SetDisplay", &error, NULL, "s", ":0") < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_NOT_SUPPORTED)); + + assert_se(bus_call_method(bus, &session, "SetType", NULL, NULL, "s", "x11") >= 0); + + /* Non-empty display can be set */ + assert_se(bus_call_method(bus, &session, "SetDisplay", NULL, NULL, "s", ":0") >= 0); + display = mfree(display); + assert_se(bus_get_property_string(bus, 
&session, "Display", NULL, &display) >= 0); + assert_se(streq(display, ":0")); + + /* Empty display can be set too */ + assert_se(bus_call_method(bus, &session, "SetDisplay", NULL, NULL, "s", "") >= 0); + display = mfree(display); + assert_se(bus_get_property_string(bus, &session, "Display", NULL, &display) >= 0); + assert_se(isempty(display)); +} + +/* Tests org.freedesktop.logind.Session SetTTY */ +TEST(set_tty) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus* bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *tty = NULL; + int fd; + + if (!arg_tty) + return; + + fd = open(arg_tty, O_RDWR|O_CLOEXEC|O_NOCTTY); + assert_se(fd >= 0); + + assert_se(sd_bus_open_system(&bus) >= 0); + + /* tty can only be set by the session controller (which we're not ATM) */ + assert_se(bus_call_method(bus, &session, "SetTTY", &error, NULL, "h", fd) < 0); + assert_se(sd_bus_error_has_name(&error, BUS_ERROR_NOT_IN_CONTROL)); + + assert_se(bus_call_method(bus, &session, "TakeControl", NULL, NULL, "b", true) >= 0); + + /* tty can be set */ + assert_se(bus_call_method(bus, &session, "SetTTY", NULL, NULL, "h", fd) >= 0); + tty = mfree(tty); + assert_se(bus_get_property_string(bus, &session, "TTY", NULL, &tty) >= 0); + assert_se(streq(tty, "tty2")); +} + +/* Tests org.freedesktop.logind.Session SetIdleHint */ +TEST(set_idle_hint) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus* bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int idle_hint; + time_t stamp, idle_since1, idle_since2; + + assert_se(sd_bus_open_system(&bus) >= 0); + + /* Idle hint is not set by default */ + assert_se(bus_get_property_trivial(bus, &session, "IdleHint", NULL, 'b', &idle_hint) >= 0); + assert_se(!idle_hint); + + assert_se(bus_call_method(bus, &session, "TakeControl", NULL, NULL, "b", true) >= 0); + + /* Idle hint can only be set on a graphical session */ + assert_se(bus_call_method(bus, &session, "SetType", NULL, NULL, "s", 
"tty") >= 0); + assert_se(bus_call_method(bus, &session, "SetIdleHint", &error, NULL, "b", true) < 0); + assert_se(sd_bus_error_has_name(&error, SD_BUS_ERROR_NOT_SUPPORTED)); + + assert_se(bus_call_method(bus, &session, "SetType", NULL, NULL, "s", "x11") >= 0); + + stamp = now(CLOCK_MONOTONIC); + + /* Idle hint can be set */ + assert_se(bus_call_method(bus, &session, "SetIdleHint", NULL, NULL, "b", true) >= 0); + assert_se(bus_get_property_trivial(bus, &session, "IdleHint", NULL, 'b', &idle_hint) >= 0); + assert_se(idle_hint); + assert_se(bus_get_property_trivial(bus, &session, "IdleSinceHintMonotonic", NULL, 't', &idle_since1) >= 0); + assert_se(idle_since1 >= stamp); + + /* Repeated setting doesn't change anything */ + assert_se(bus_call_method(bus, &session, "SetIdleHint", NULL, NULL, "b", true) >= 0); + assert_se(bus_get_property_trivial(bus, &session, "IdleHint", NULL, 'b', &idle_hint) >= 0); + assert_se(idle_hint); + assert_se(bus_get_property_trivial(bus, &session, "IdleSinceHintMonotonic", NULL, 't', &idle_since2) >= 0); + assert_se(idle_since2 == idle_since1); + + /* Idle hint can be unset */ + assert_se(bus_call_method(bus, &session, "SetIdleHint", NULL, NULL, "b", false) >= 0); + assert_se(bus_get_property_trivial(bus, &session, "IdleHint", NULL, 'b', &idle_hint) >= 0); + assert_se(!idle_hint); +} + +static int intro(void) { + if (saved_argc <= 1) + return EXIT_FAILURE; + + session = (BusLocator) { + .destination = "org.freedesktop.login1", + .path = saved_argv[1], + .interface = "org.freedesktop.login1.Session", + }; + + if (saved_argc > 2) + arg_tty = saved_argv[2]; + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/login/user-runtime-dir.c b/src/login/user-runtime-dir.c new file mode 100644 index 0000000..ad04b04 --- /dev/null +++ b/src/login/user-runtime-dir.c @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "bus-error.h" +#include 
"bus-locator.h" +#include "dev-setup.h" +#include "format-util.h" +#include "fs-util.h" +#include "label-util.h" +#include "limits-util.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "smack-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static int acquire_runtime_dir_properties(uint64_t *size, uint64_t *inodes) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_get_property_trivial(bus, bus_login_mgr, "RuntimeDirectorySize", &error, 't', size); + if (r < 0) { + log_warning_errno(r, "Failed to acquire runtime directory size, ignoring: %s", bus_error_message(&error, r)); + *size = physical_memory_scale(10U, 100U); /* 10% */ + } + + r = bus_get_property_trivial(bus, bus_login_mgr, "RuntimeDirectoryInodesMax", &error, 't', inodes); + if (r < 0) { + log_warning_errno(r, "Failed to acquire number of inodes for runtime directory, ignoring: %s", bus_error_message(&error, r)); + *inodes = DIV_ROUND_UP(*size, 4096); + } + + return 0; +} + +static int user_mkdir_runtime_path( + const char *runtime_path, + uid_t uid, + gid_t gid, + uint64_t runtime_dir_size, + uint64_t runtime_dir_inodes) { + + int r; + + assert(runtime_path); + assert(path_is_absolute(runtime_path)); + assert(uid_is_valid(uid)); + assert(gid_is_valid(gid)); + + r = mkdir_safe_label("/run/user", 0755, 0, 0, MKDIR_WARN_MODE); + if (r < 0) + return log_error_errno(r, "Failed to create /run/user: %m"); + + if (path_is_mount_point(runtime_path, NULL, 0) > 0) + log_debug("%s is already a mount point", runtime_path); + else { + char options[sizeof("mode=0700,uid=,gid=,size=,nr_inodes=,smackfsroot=*") + + 
DECIMAL_STR_MAX(uid_t) + + DECIMAL_STR_MAX(gid_t) + + DECIMAL_STR_MAX(uint64_t) + + DECIMAL_STR_MAX(uint64_t)]; + + xsprintf(options, + "mode=0700,uid=" UID_FMT ",gid=" GID_FMT ",size=%" PRIu64 ",nr_inodes=%" PRIu64 "%s", + uid, gid, runtime_dir_size, runtime_dir_inodes, + mac_smack_use() ? ",smackfsroot=*" : ""); + + r = mkdir_label(runtime_path, 0700); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Failed to create %s: %m", runtime_path); + + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", runtime_path, "tmpfs", MS_NODEV|MS_NOSUID, options); + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(r)) { + log_error_errno(r, "Failed to mount per-user tmpfs directory %s: %m", runtime_path); + goto fail; + } + + log_debug_errno(r, + "Failed to mount per-user tmpfs directory %s.\n" + "Assuming containerized execution, ignoring: %m", runtime_path); + + r = chmod_and_chown(runtime_path, 0700, uid, gid); + if (r < 0) { + log_error_errno(r, "Failed to change ownership and mode of \"%s\": %m", runtime_path); + goto fail; + } + } + + r = label_fix(runtime_path, 0); + if (r < 0) + log_warning_errno(r, "Failed to fix label of \"%s\", ignoring: %m", runtime_path); + } + + return 0; + +fail: + /* Try to clean up, but ignore errors */ + (void) rmdir(runtime_path); + return r; +} + +static int user_remove_runtime_path(const char *runtime_path) { + int r; + + assert(runtime_path); + assert(path_is_absolute(runtime_path)); + + r = rm_rf(runtime_path, 0); + if (r < 0) + log_debug_errno(r, "Failed to remove runtime directory %s (before unmounting), ignoring: %m", runtime_path); + + /* Ignore cases where the directory isn't mounted, as that's quite possible, if we lacked the permissions to + * mount something */ + r = umount2(runtime_path, MNT_DETACH); + if (r < 0 && !IN_SET(errno, EINVAL, ENOENT)) + log_debug_errno(errno, "Failed to unmount user runtime directory %s, ignoring: %m", runtime_path); + + r = rm_rf(runtime_path, REMOVE_ROOT); + if (r < 0 && r != -ENOENT) + return 
log_error_errno(r, "Failed to remove runtime directory %s (after unmounting): %m", runtime_path); + + return 0; +} + +static int do_mount(const char *user) { + char runtime_path[sizeof("/run/user") + DECIMAL_STR_MAX(uid_t)]; + uint64_t runtime_dir_size, runtime_dir_inodes; + uid_t uid; + gid_t gid; + int r; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, + r == -ESRCH ? "No such user \"%s\"" : + r == -ENOMSG ? "UID \"%s\" is invalid or has an invalid main group" + : "Failed to look up user \"%s\": %m", + user); + + r = acquire_runtime_dir_properties(&runtime_dir_size, &runtime_dir_inodes); + if (r < 0) + return r; + + xsprintf(runtime_path, "/run/user/" UID_FMT, uid); + + log_debug("Will mount %s owned by "UID_FMT":"GID_FMT, runtime_path, uid, gid); + return user_mkdir_runtime_path(runtime_path, uid, gid, runtime_dir_size, runtime_dir_inodes); +} + +static int do_umount(const char *user) { + char runtime_path[sizeof("/run/user") + DECIMAL_STR_MAX(uid_t)]; + uid_t uid; + int r; + + /* The user may be already removed. So, first try to parse the string by parse_uid(), + * and if it fails, fall back to get_user_creds(). */ + if (parse_uid(user, &uid) < 0) { + r = get_user_creds(&user, &uid, NULL, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, + r == -ESRCH ? "No such user \"%s\"" : + r == -ENOMSG ? 
"UID \"%s\" is invalid or has an invalid main group" + : "Failed to look up user \"%s\": %m", + user); + } + + xsprintf(runtime_path, "/run/user/" UID_FMT, uid); + + log_debug("Will remove %s", runtime_path); + return user_remove_runtime_path(runtime_path); +} + +static int run(int argc, char *argv[]) { + int r; + + log_parse_environment(); + log_open(); + + if (argc != 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program takes two arguments."); + if (!STR_IN_SET(argv[1], "start", "stop")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "First argument must be either \"start\" or \"stop\"."); + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + + if (streq(argv[1], "start")) + return do_mount(argv[2]); + if (streq(argv[1], "stop")) + return do_umount(argv[2]); + assert_not_reached(); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/machine-id-setup/machine-id-setup-main.c b/src/machine-id-setup/machine-id-setup-main.c new file mode 100644 index 0000000..59aad98 --- /dev/null +++ b/src/machine-id-setup/machine-id-setup-main.c @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "build.h" +#include "dissect-image.h" +#include "id128-util.h" +#include "log.h" +#include "machine-id-setup.h" +#include "main-func.h" +#include "mount-util.h" +#include "parse-argument.h" +#include "path-util.h" +#include "pretty-print.h" +#include "terminal-util.h" + +static char *arg_root = NULL; +static char *arg_image = NULL; +static bool arg_commit = false; +static bool arg_print = false; +static ImagePolicy *arg_image_policy = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-machine-id-setup", "1", &link); + if (r < 0) + return 
log_oom(); + + printf("%s [OPTIONS...]\n" + "\n%sInitialize /etc/machine-id from a random source.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + " --commit Commit transient ID\n" + " --print Print used machine ID\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_COMMIT, + ARG_PRINT, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "commit", no_argument, NULL, ARG_COMMIT }, + { "print", no_argument, NULL, ARG_PRINT }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_ROOT: + r = parse_path_argument(optarg, true, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case ARG_COMMIT: + arg_commit = true; + break; + + case ARG_PRINT: + arg_print = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Extraneous arguments"); + + if (arg_image && arg_root) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + int r; + + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_VALIDATE_OS | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_FSCK | + DISSECT_IMAGE_GROWFS, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } + + if (arg_commit) { + sd_id128_t id; + + r = machine_id_commit(arg_root); + if (r < 0) + return r; + + r = id128_get_machine(arg_root, &id); + if (r < 0) + return log_error_errno(r, "Failed to read machine ID back: %m"); + + if (arg_print) + puts(SD_ID128_TO_STRING(id)); + + } else if (id128_get_machine(arg_root, NULL) == -ENOPKG) { + if (arg_print) + puts("uninitialized"); + } else { + sd_id128_t id; + + r = machine_id_setup(arg_root, false, SD_ID128_NULL, &id); + if (r < 0) + return r; + + if (arg_print) + puts(SD_ID128_TO_STRING(id)); + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/machine-id-setup/meson.build b/src/machine-id-setup/meson.build new file mode 100644 index 0000000..316165a --- /dev/null +++ b/src/machine-id-setup/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-machine-id-setup', + 'sources' : files('machine-id-setup-main.c'), + }, +] diff --git a/src/machine/image-dbus.c b/src/machine/image-dbus.c new file mode 100644 index 0000000..aa4525d --- /dev/null +++ b/src/machine/image-dbus.c @@ -0,0 +1,530 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-label.h" +#include "bus-polkit.h" +#include "copy.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "image-dbus.h" +#include "io-util.h" +#include "loop-util.h" +#include "missing_capability.h" +#include "mount-util.h" +#include "os-util.h" +#include "process-util.h" +#include "raw-clone.h" +#include "strv.h" +#include "user-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, image_type, ImageType); + +int bus_image_method_remove( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + Image *image = ASSERT_PTR(userdata); + Manager *m = image->userdata; + pid_t child; + int r; + + assert(message); + + if (m->n_operations >= OPERATIONS_MAX) + return sd_bus_error_set(error, SD_BUS_ERROR_LIMITS_EXCEEDED, "Too many ongoing operations."); + + const char *details[] = { + "image", image->name, + "verb", "remove", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-images", + details, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to create pipe: %m"); + + r = safe_fork("(sd-imgrm)", FORK_RESET_SIGNALS, &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + r = image_remove(image); + if (r < 0) { + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + r = operation_new(m, NULL, child, message, 
errno_pipe_fd[0], NULL); + if (r < 0) { + (void) sigkill_wait(child); + return r; + } + + errno_pipe_fd[0] = -EBADF; + + return 1; +} + +int bus_image_method_rename( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Image *image = ASSERT_PTR(userdata); + Manager *m = image->userdata; + const char *new_name; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &new_name); + if (r < 0) + return r; + + if (!image_name_is_valid(new_name)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Image name '%s' is invalid.", new_name); + + const char *details[] = { + "image", image->name, + "verb", "rename", + "new_name", new_name, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-images", + details, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = image_rename(image, new_name); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_image_method_clone( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + Image *image = ASSERT_PTR(userdata); + Manager *m = ASSERT_PTR(image->userdata); + const char *new_name; + int r, read_only; + pid_t child; + + assert(message); + + if (m->n_operations >= OPERATIONS_MAX) + return sd_bus_error_set(error, SD_BUS_ERROR_LIMITS_EXCEEDED, "Too many ongoing operations."); + + r = sd_bus_message_read(message, "sb", &new_name, &read_only); + if (r < 0) + return r; + + if (!image_name_is_valid(new_name)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Image name '%s' is invalid.", new_name); + + const char *details[] = { + "image", image->name, + "verb", "clone", + "new_name", new_name, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-images", + details, + false, + 
UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to create pipe: %m"); + + r = safe_fork("(sd-imgclone)", FORK_RESET_SIGNALS, &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + r = image_clone(image, new_name, read_only); + if (r < 0) { + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + r = operation_new(m, NULL, child, message, errno_pipe_fd[0], NULL); + if (r < 0) { + (void) sigkill_wait(child); + return r; + } + + errno_pipe_fd[0] = -EBADF; + + return 1; +} + +int bus_image_method_mark_read_only( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Image *image = userdata; + Manager *m = image->userdata; + int read_only, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &read_only); + if (r < 0) + return r; + + const char *details[] = { + "image", image->name, + "verb", "mark_read_only", + "read_only", one_zero(read_only), + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-images", + details, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = image_read_only(image, read_only); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_image_method_set_limit( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Image *image = userdata; + Manager *m = image->userdata; + uint64_t limit; + int r; + + assert(message); + + r = sd_bus_message_read(message, "t", &limit); + if (r < 0) + return r; + if (!FILE_SIZE_VALID_OR_INFINITY(limit)) + 
return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "New limit out of range"); + + const char *details[] = { + "machine", image->name, /* NOTE(review): remove/rename/clone/mark_read_only in this file pass the name under the "image" key; confirm "machine" is intended here */ + "verb", "set_limit", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-images", + details, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = image_set_limit(image, limit); /* reached only when the polkit check returned a positive value */ + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_image_method_get_hostname( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Image *image = userdata; + int r; + + if (!image->metadata_valid) { /* metadata is read only when it is not already flagged valid */ + r = image_read_metadata(image, &image_policy_container); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to read image metadata: %m"); + } + + return sd_bus_reply_method_return(message, "s", image->hostname); +} + +int bus_image_method_get_machine_id( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Image *image = userdata; + int r; + + if (!image->metadata_valid) { /* metadata is read only when it is not already flagged valid */ + r = image_read_metadata(image, &image_policy_container); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to read image metadata: %m"); + } + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + if (sd_id128_is_null(image->machine_id)) /* Add an empty array if the ID is zero */ + r = sd_bus_message_append(reply, "ay", 0); + else + r = sd_bus_message_append_array(reply, 'y', image->machine_id.bytes, 16); /* otherwise reply with the 16 ID bytes as a byte array */ + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +int bus_image_method_get_machine_info( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Image *image = userdata; + int r; + + if (!image->metadata_valid) { + r = image_read_metadata(image, &image_policy_container); + if (r < 0) + return
sd_bus_error_set_errnof(error, r, "Failed to read image metadata: %m"); + } + + return bus_reply_pair_array(message, image->machine_info); +} + +int bus_image_method_get_os_release( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + Image *image = userdata; + int r; + + if (!image->metadata_valid) { + r = image_read_metadata(image, &image_policy_container); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to read image metadata: %m"); + } + + return bus_reply_pair_array(message, image->os_release); +} + +static int image_flush_cache(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + hashmap_clear(m->image_cache); + return 0; +} + +static int image_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + _cleanup_free_ char *e = NULL; + Manager *m = userdata; + Image *image = NULL; + const char *p; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + p = startswith(path, "/org/freedesktop/machine1/image/"); + if (!p) + return 0; + + e = bus_label_unescape(p); + if (!e) + return -ENOMEM; + + image = hashmap_get(m->image_cache, e); + if (image) { + *found = image; + return 1; + } + + if (!m->image_cache_defer_event) { + r = sd_event_add_defer(m->event, &m->image_cache_defer_event, image_flush_cache, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->image_cache_defer_event, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return r; + } + + r = sd_event_source_set_enabled(m->image_cache_defer_event, SD_EVENT_ONESHOT); + if (r < 0) + return r; + + r = image_find(IMAGE_MACHINE, e, NULL, &image); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + image->userdata = m; + + r = hashmap_ensure_put(&m->image_cache, &image_hash_ops, image->name, image); + if (r < 0) { + image_unref(image); + return r; + } + + *found = image; + return 1; +} + +char *image_bus_path(const char *name) { + 
_cleanup_free_ char *e = NULL; + + assert(name); + + e = bus_label_escape(name); + if (!e) + return NULL; + + return strjoin("/org/freedesktop/machine1/image/", e); +} + +static int image_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_hashmap_free_ Hashmap *images = NULL; + _cleanup_strv_free_ char **l = NULL; + Image *image; + int r; + + assert(bus); + assert(path); + assert(nodes); + + images = hashmap_new(&image_hash_ops); + if (!images) + return -ENOMEM; + + r = image_discover(IMAGE_MACHINE, NULL, images); + if (r < 0) + return r; + + HASHMAP_FOREACH(image, images) { + char *p; + + p = image_bus_path(image->name); + if (!p) + return -ENOMEM; + + r = strv_consume(&l, p); + if (r < 0) + return r; + } + + *nodes = TAKE_PTR(l); + + return 1; +} + +const sd_bus_vtable image_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Name", "s", NULL, offsetof(Image, name), 0), + SD_BUS_PROPERTY("Path", "s", NULL, offsetof(Image, path), 0), + SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Image, type), 0), + SD_BUS_PROPERTY("ReadOnly", "b", bus_property_get_bool, offsetof(Image, read_only), 0), + SD_BUS_PROPERTY("CreationTimestamp", "t", NULL, offsetof(Image, crtime), 0), + SD_BUS_PROPERTY("ModificationTimestamp", "t", NULL, offsetof(Image, mtime), 0), + SD_BUS_PROPERTY("Usage", "t", NULL, offsetof(Image, usage), 0), + SD_BUS_PROPERTY("Limit", "t", NULL, offsetof(Image, limit), 0), + SD_BUS_PROPERTY("UsageExclusive", "t", NULL, offsetof(Image, usage_exclusive), 0), + SD_BUS_PROPERTY("LimitExclusive", "t", NULL, offsetof(Image, limit_exclusive), 0), + SD_BUS_METHOD("Remove", NULL, NULL, bus_image_method_remove, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Rename", "s", NULL, bus_image_method_rename, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("Clone", "sb", NULL, bus_image_method_clone, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("MarkReadOnly", "b", NULL, bus_image_method_mark_read_only, 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("SetLimit", "t", NULL, bus_image_method_set_limit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetHostname", NULL, "s", bus_image_method_get_hostname, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetMachineID", NULL, "ay", bus_image_method_get_machine_id, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetMachineInfo", NULL, "a{ss}", bus_image_method_get_machine_info, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetOSRelease", NULL, "a{ss}", bus_image_method_get_os_release, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation image_object = { + "/org/freedesktop/machine1/image", + "org.freedesktop.machine1.Image", + .fallback_vtables = BUS_FALLBACK_VTABLES({image_vtable, image_object_find}), + .node_enumerator = image_node_enumerator, +}; diff --git a/src/machine/image-dbus.h b/src/machine/image-dbus.h new file mode 100644 index 0000000..4b00203 --- /dev/null +++ b/src/machine/image-dbus.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-object.h" +#include "machined.h" + +extern const BusObjectImplementation image_object; + +char *image_bus_path(const char *name); + +int bus_image_method_remove(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_rename(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_clone(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_mark_read_only(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_set_limit(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_get_hostname(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_get_machine_id(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_image_method_get_machine_info(sd_bus_message *message, void *userdata, sd_bus_error *error); +int 
bus_image_method_get_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c new file mode 100644 index 0000000..4620f32 --- /dev/null +++ b/src/machine/machine-dbus.c @@ -0,0 +1,1399 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-internal.h" +#include "bus-label.h" +#include "bus-locator.h" +#include "bus-polkit.h" +#include "copy.h" +#include "env-file.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "in-addr-util.h" +#include "iovec-util.h" +#include "local-addresses.h" +#include "machine-dbus.h" +#include "machine.h" +#include "missing_capability.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "os-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "user-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_class, machine_class, MachineClass); +static BUS_DEFINE_PROPERTY_GET2(property_get_state, "s", Machine, machine_get_state, machine_state_to_string); + +static int property_get_netif( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Machine *m = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + return sd_bus_message_append_array(reply, 'i', m->netif, m->n_netif * sizeof(int)); +} + +int bus_machine_method_unregister(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + const char *details[] = { + "machine", m->name, + "verb", 
"unregister", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = machine_finalize(m); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_machine_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + const char *details[] = { + "machine", m->name, + "verb", "terminate", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = machine_stop(m); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_machine_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Machine *m = ASSERT_PTR(userdata); + const char *swho; + int32_t signo; + KillWho who; + int r; + + assert(message); + + r = sd_bus_message_read(message, "si", &swho, &signo); + if (r < 0) + return r; + + if (isempty(swho)) + who = KILL_ALL; + else { + who = kill_who_from_string(swho); + if (who < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid kill parameter '%s'", swho); + } + + if (!SIGNAL_VALID(signo)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal %i", signo); + + const char *details[] = { + "machine", m->name, + "verb", "kill", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_KILL, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = machine_kill(m, who, 
signo); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_machine_method_get_addresses(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(iay)"); + if (r < 0) + return r; + + switch (m->class) { + + case MACHINE_HOST: { + _cleanup_free_ struct local_address *addresses = NULL; + int n; + + n = local_addresses(NULL, 0, AF_UNSPEC, &addresses); + if (n < 0) + return n; + + for (int i = 0; i < n; i++) { + r = sd_bus_message_open_container(reply, 'r', "iay"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", addresses[i].family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &addresses[i].address, FAMILY_ADDRESS_SIZE(addresses[i].family)); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + break; + } + + case MACHINE_CONTAINER: { + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + _cleanup_free_ char *us = NULL, *them = NULL; + _cleanup_close_ int netns_fd = -EBADF; + const char *p; + pid_t child; + + r = readlink_malloc("/proc/self/ns/net", &us); + if (r < 0) + return r; + + p = procfs_file_alloca(m->leader.pid, "ns/net"); + r = readlink_malloc(p, &them); + if (r < 0) + return r; + + if (streq(us, them)) + return sd_bus_error_setf(error, BUS_ERROR_NO_PRIVATE_NETWORKING, "Machine %s does not use private networking", m->name); + + r = namespace_open(m->leader.pid, NULL, NULL, &netns_fd, NULL, NULL); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, pair) < 0) + return -errno; + + r = namespace_fork("(sd-addrns)", "(sd-addr)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + -1, -1, netns_fd, -1, -1, &child); + if (r < 0) + return 
sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { /* zero return from the fork call: enumerate addresses and write one (family, address) record per entry to pair[1] */ + _cleanup_free_ struct local_address *addresses = NULL; + struct local_address *a; + int i, n; + + pair[0] = safe_close(pair[0]); + + n = local_addresses(NULL, 0, AF_UNSPEC, &addresses); + if (n < 0) + _exit(EXIT_FAILURE); + + for (a = addresses, i = 0; i < n; a++, i++) { + struct iovec iov[2] = { + { .iov_base = &a->family, .iov_len = sizeof(a->family) }, + { .iov_base = &a->address, .iov_len = FAMILY_ADDRESS_SIZE(a->family) }, + }; + + r = writev(pair[1], iov, 2); + if (r < 0) + _exit(EXIT_FAILURE); + } + + pair[1] = safe_close(pair[1]); + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + for (;;) { /* receive records from pair[0] and append each as an (iay) struct to the reply */ + int family; + ssize_t n; + union in_addr_union in_addr; + struct iovec iov[2]; + struct msghdr mh = { + .msg_iov = iov, + .msg_iovlen = 2, + }; + + iov[0] = IOVEC_MAKE(&family, sizeof(family)); + iov[1] = IOVEC_MAKE(&in_addr, sizeof(in_addr)); + + n = recvmsg(pair[0], &mh, 0); + if (n < 0) + return -errno; + if ((size_t) n < sizeof(family)) /* anything shorter than a family field, including 0, ends the loop */ + break; + + r = sd_bus_message_open_container(reply, 'r', "iay"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", family); + if (r < 0) + return r; + + switch (family) { /* NOTE(review): no default case; for any other family value r keeps the result of the append above and no address bytes are added. Confirm only AF_INET/AF_INET6 can arrive */ + + case AF_INET: + if (n != sizeof(struct in_addr) + sizeof(family)) + return -EIO; + + r = sd_bus_message_append_array(reply, 'y', &in_addr.in, sizeof(in_addr.in)); + break; + + case AF_INET6: + if (n != sizeof(struct in6_addr) + sizeof(family)) + return -EIO; + + r = sd_bus_message_append_array(reply, 'y', &in_addr.in6, sizeof(in_addr.in6)); + break; + } + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + r = wait_for_terminate_and_check("(sd-addrns)", child, 0); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to wait for child: %m"); + if (r != EXIT_SUCCESS) + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Child died abnormally."); + break; + } + + default: + return
sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Requesting IP address data is only supported on container machines."); + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +#define EXIT_NOT_FOUND 2 + +int bus_machine_method_get_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + switch (m->class) { + + case MACHINE_HOST: + r = load_os_release_pairs(NULL, &l); + if (r < 0) + return r; + + break; + + case MACHINE_CONTAINER: { + _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF; + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + _cleanup_fclose_ FILE *f = NULL; + pid_t child; + + r = namespace_open(m->leader.pid, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, pair) < 0) + return -errno; + + r = namespace_fork("(sd-osrelns)", "(sd-osrel)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + pidns_fd, mntns_fd, -1, -1, root_fd, + &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + int fd = -EBADF; + + pair[0] = safe_close(pair[0]); + + r = open_os_release(NULL, NULL, &fd); + if (r == -ENOENT) + _exit(EXIT_NOT_FOUND); + if (r < 0) + _exit(EXIT_FAILURE); + + r = copy_bytes(fd, pair[1], UINT64_MAX, 0); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + f = take_fdopen(&pair[0], "r"); + if (!f) + return -errno; + + r = load_env_file_pairs(f, "/etc/os-release", &l); + if (r < 0) + return r; + + r = wait_for_terminate_and_check("(sd-osrelns)", child, 0); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to wait for child: %m"); + if (r == EXIT_NOT_FOUND) + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Machine does not contain OS release 
information"); + if (r != EXIT_SUCCESS) + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Child died abnormally."); + + break; + } + + default: + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Requesting OS release data is only supported on container machines."); + } + + return bus_reply_pair_array(message, l); +} + +int bus_machine_method_open_pty(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *pty_name = NULL; + _cleanup_close_ int master = -EBADF; + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + const char *details[] = { + "machine", m->name, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-open-pty" : "org.freedesktop.machine1.open-pty", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + master = machine_openpt(m, O_RDWR|O_NOCTTY|O_CLOEXEC, &pty_name); + if (master < 0) + return master; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "hs", master, pty_name); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int container_bus_new(Machine *m, sd_bus_error *error, sd_bus **ret) { + int r; + + assert(m); + assert(ret); + + switch (m->class) { + + case MACHINE_HOST: + *ret = NULL; + break; + + case MACHINE_CONTAINER: { + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + char *address; + + r = sd_bus_new(&bus); + if (r < 0) + return r; + + if (asprintf(&address, "x-machine-unix:pid=%" PID_PRI, m->leader.pid) < 0) + return -ENOMEM; + + bus->address = address; + bus->bus_client = true; + bus->trusted = false; + bus->runtime_scope = RUNTIME_SCOPE_SYSTEM; + + r = sd_bus_start(bus); + if (r == -ENOENT) + return 
sd_bus_error_set_errnof(error, r, "There is no system bus in container %s.", m->name); + if (r < 0) + return r; + + *ret = TAKE_PTR(bus); + break; + } + + default: + return -EOPNOTSUPP; + } + + return 0; +} + +int bus_machine_method_open_login(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *pty_name = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *allocated_bus = NULL; + _cleanup_close_ int master = -EBADF; + sd_bus *container_bus = NULL; + Machine *m = ASSERT_PTR(userdata); + const char *p, *getty; + int r; + + assert(message); + + const char *details[] = { + "machine", m->name, + "verb", "login", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + m->class == MACHINE_HOST ? "org.freedesktop.machine1.host-login" : "org.freedesktop.machine1.login", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + master = machine_openpt(m, O_RDWR|O_NOCTTY|O_CLOEXEC, &pty_name); + if (master < 0) + return master; + + p = path_startswith(pty_name, "/dev/pts/"); + assert(p); + + r = container_bus_new(m, error, &allocated_bus); + if (r < 0) + return r; + + container_bus = allocated_bus ?: m->manager->bus; + + getty = strjoina("container-getty@", p, ".service"); + + r = bus_call_method(container_bus, bus_systemd_mgr, "StartUnit", error, NULL, "ss", getty, "replace"); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "hs", master, pty_name); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +int bus_machine_method_open_shell(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *tm = NULL; + _cleanup_free_ char *pty_name = NULL; + 
_cleanup_(sd_bus_flush_close_unrefp) sd_bus *allocated_bus = NULL; + sd_bus *container_bus = NULL; + _cleanup_close_ int master = -EBADF, slave = -EBADF; + _cleanup_strv_free_ char **env = NULL, **args_wire = NULL, **args = NULL; + _cleanup_free_ char *command_line = NULL; + Machine *m = ASSERT_PTR(userdata); + const char *p, *unit, *user, *path, *description, *utmp_id; + int r; + + assert(message); + + r = sd_bus_message_read(message, "ss", &user, &path); + if (r < 0) + return r; + user = isempty(user) ? "root" : user; + r = sd_bus_message_read_strv(message, &args_wire); + if (r < 0) + return r; + if (isempty(path)) { + path = "/bin/sh"; + + args = new0(char*, 3 + 1); + if (!args) + return -ENOMEM; + args[0] = strdup("sh"); + if (!args[0]) + return -ENOMEM; + args[1] = strdup("-c"); + if (!args[1]) + return -ENOMEM; + r = asprintf(&args[2], + "shell=$(getent passwd %s 2>/dev/null | { IFS=: read _ _ _ _ _ _ x; echo \"$x\"; })\n"\ + "exec \"${shell:-/bin/sh}\" -l", /* -l means --login */ + user); + if (r < 0) { + args[2] = NULL; + return -ENOMEM; + } + } else { + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Specified path '%s' is not absolute", path); + args = TAKE_PTR(args_wire); + if (strv_isempty(args)) { + args = strv_free(args); + + args = strv_new(path); + if (!args) + return -ENOMEM; + } + } + + r = sd_bus_message_read_strv(message, &env); + if (r < 0) + return r; + if (!strv_env_is_valid(env)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments"); + + command_line = strv_join(args, " "); + if (!command_line) + return -ENOMEM; + const char *details[] = { + "machine", m->name, + "user", user, + "program", path, + "command_line", command_line, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + m->class == MACHINE_HOST ?
"org.freedesktop.machine1.host-shell" : "org.freedesktop.machine1.shell", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + master = machine_openpt(m, O_RDWR|O_NOCTTY|O_CLOEXEC, &pty_name); + if (master < 0) + return master; + + p = path_startswith(pty_name, "/dev/pts/"); + assert(p); + + slave = machine_open_terminal(m, pty_name, O_RDWR|O_NOCTTY|O_CLOEXEC); + if (slave < 0) + return slave; + + utmp_id = path_startswith(pty_name, "/dev/"); + assert(utmp_id); + + r = container_bus_new(m, error, &allocated_bus); + if (r < 0) + return r; + + container_bus = allocated_bus ?: m->manager->bus; + + r = bus_message_new_method_call(container_bus, &tm, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return r; + + /* Name and mode */ + unit = strjoina("container-shell@", p, ".service"); + r = sd_bus_message_append(tm, "ss", unit, "fail"); + if (r < 0) + return r; + + /* Properties */ + r = sd_bus_message_open_container(tm, 'a', "(sv)"); + if (r < 0) + return r; + + description = strjoina("Shell for User ", user); + r = sd_bus_message_append(tm, + "(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)", + "Description", "s", description, + "StandardInputFileDescriptor", "h", slave, + "StandardOutputFileDescriptor", "h", slave, + "StandardErrorFileDescriptor", "h", slave, + "SendSIGHUP", "b", true, + "IgnoreSIGPIPE", "b", false, + "KillMode", "s", "mixed", + "TTYPath", "s", pty_name, + "TTYReset", "b", true, + "UtmpIdentifier", "s", utmp_id, + "UtmpMode", "s", "user", + "PAMName", "s", "login", + "WorkingDirectory", "s", "-~"); + if (r < 0) + return r; + + r = sd_bus_message_append(tm, "(sv)", "User", "s", user); + if (r < 0) + return r; + + if (!strv_isempty(env)) { + r = sd_bus_message_open_container(tm, 'r', "sv"); + if (r < 0) + return r; + + r = sd_bus_message_append(tm, "s", "Environment"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(tm, 'v', 
"as"); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(tm, env); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + } + + /* Exec container */ + r = sd_bus_message_open_container(tm, 'r', "sv"); + if (r < 0) + return r; + + r = sd_bus_message_append(tm, "s", "ExecStart"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(tm, 'v', "a(sasb)"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(tm, 'a', "(sasb)"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(tm, 'r', "sasb"); + if (r < 0) + return r; + + r = sd_bus_message_append(tm, "s", path); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(tm, args); + if (r < 0) + return r; + + r = sd_bus_message_append(tm, "b", true); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + + r = sd_bus_message_close_container(tm); + if (r < 0) + return r; + + /* Auxiliary units */ + r = sd_bus_message_append(tm, "a(sa(sv))", 0); + if (r < 0) + return r; + + r = sd_bus_call(container_bus, tm, 0, error, NULL); + if (r < 0) + return r; + + slave = safe_close(slave); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "hs", master, pty_name); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) { + int read_only, make_file_or_directory; + const char *dest, *src, *propagate_directory; + Machine *m = ASSERT_PTR(userdata); + uid_t uid; + int r; + + assert(message); + + if (m->class != MACHINE_CONTAINER) + return 
sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Bind mounting is only supported on container machines."); + + r = sd_bus_message_read(message, "ssbb", &src, &dest, &read_only, &make_file_or_directory); + if (r < 0) + return r; + + if (!path_is_absolute(src) || !path_is_normalized(src)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized."); + + if (isempty(dest)) + dest = src; + else if (!path_is_absolute(dest) || !path_is_normalized(dest)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized."); + + const char *details[] = { + "machine", m->name, + "verb", "bind", + "src", src, + "dest", dest, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = machine_get_uid_shift(m, &uid); + if (r < 0) + return r; + if (uid != 0) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Can't bind mount on container with user namespacing applied."); + + propagate_directory = strjoina("/run/systemd/nspawn/propagate/", m->name); + r = bind_mount_in_namespace( + &m->leader, + propagate_directory, + "/run/host/incoming/", + src, dest, + read_only, + make_file_or_directory); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in machine's namespace: %m", src, dest); + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_machine_method_copy(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *host_basename = NULL, *container_basename = NULL; + const char *src, *dest, *host_path, *container_path; + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + CopyFlags copy_flags = COPY_REFLINK|COPY_MERGE|COPY_HARDLINKS; + _cleanup_close_ int hostfd = -EBADF; + Machine *m = 
ASSERT_PTR(userdata); + bool copy_from; + pid_t child; + uid_t uid_shift; + int r; + + assert(message); + + if (m->manager->n_operations >= OPERATIONS_MAX) + return sd_bus_error_set(error, SD_BUS_ERROR_LIMITS_EXCEEDED, "Too many ongoing copies."); + + if (m->class != MACHINE_CONTAINER) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Copying files is only supported on container machines."); + + r = sd_bus_message_read(message, "ss", &src, &dest); + if (r < 0) + return r; + + if (endswith(sd_bus_message_get_member(message), "WithFlags")) { + uint64_t raw_flags; + + r = sd_bus_message_read(message, "t", &raw_flags); + if (r < 0) + return r; + + if ((raw_flags & ~_MACHINE_COPY_FLAGS_MASK_PUBLIC) != 0) + return -EINVAL; + + if (raw_flags & MACHINE_COPY_REPLACE) + copy_flags |= COPY_REPLACE; + } + + if (!path_is_absolute(src)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute."); + + if (isempty(dest)) + dest = src; + else if (!path_is_absolute(dest)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute."); + + const char *details[] = { + "machine", m->name, + "verb", "copy", + "src", src, + "dest", dest, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = machine_get_uid_shift(m, &uid_shift); + if (r < 0) + return r; + + copy_from = strstr(sd_bus_message_get_member(message), "CopyFrom"); + + if (copy_from) { + container_path = src; + host_path = dest; + } else { + host_path = src; + container_path = dest; + } + + r = path_extract_filename(host_path, &host_basename); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to extract file name of '%s' path: %m", host_path); + + r = path_extract_filename(container_path, &container_basename); + if 
(r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to extract file name of '%s' path: %m", container_path); + + hostfd = open_parent(host_path, O_CLOEXEC, 0); + if (hostfd < 0) + return sd_bus_error_set_errnof(error, hostfd, "Failed to open host directory %s: %m", host_path); + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to create pipe: %m"); + + r = safe_fork("(sd-copy)", FORK_RESET_SIGNALS, &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + int containerfd; + const char *q; + int mntfd; + + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + q = procfs_file_alloca(m->leader.pid, "ns/mnt"); + mntfd = open(q, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (mntfd < 0) { + r = log_error_errno(errno, "Failed to open mount namespace of leader: %m"); + goto child_fail; + } + + if (setns(mntfd, CLONE_NEWNS) < 0) { + r = log_error_errno(errno, "Failed to join namespace of leader: %m"); + goto child_fail; + } + + containerfd = open_parent(container_path, O_CLOEXEC, 0); + if (containerfd < 0) { + r = log_error_errno(containerfd, "Failed to open destination directory: %m"); + goto child_fail; + } + + /* Run the actual copy operation. Note that when a UID shift is set we'll either clamp the UID/GID to + * 0 or to the actual UID shift depending on the direction we copy. If no UID shift is set we'll copy + * the UID/GIDs as they are. */ + if (copy_from) + r = copy_tree_at(containerfd, container_basename, hostfd, host_basename, uid_shift == 0 ? UID_INVALID : 0, uid_shift == 0 ? GID_INVALID : 0, copy_flags, NULL, NULL); + else + r = copy_tree_at(hostfd, host_basename, containerfd, container_basename, uid_shift == 0 ? UID_INVALID : uid_shift, uid_shift == 0 ? 
GID_INVALID : uid_shift, copy_flags, NULL, NULL); + + hostfd = safe_close(hostfd); + containerfd = safe_close(containerfd); + + if (r < 0) { + r = log_error_errno(r, "Failed to copy tree: %m"); + goto child_fail; + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + /* Copying might take a while, hence install a watch on the child, and return */ + + r = operation_new(m->manager, m, child, message, errno_pipe_fd[0], NULL); + if (r < 0) { + (void) sigkill_wait(child); + return r; + } + errno_pipe_fd[0] = -EBADF; + + return 1; +} + +int bus_machine_method_open_root_directory(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_close_ int fd = -EBADF; + Machine *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + const char *details[] = { + "machine", m->name, + "verb", "open_root_directory", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->manager->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + switch (m->class) { + + case MACHINE_HOST: + fd = open("/", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd < 0) + return -errno; + + break; + + case MACHINE_CONTAINER: { + _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF; + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + pid_t child; + + r = namespace_open(m->leader.pid, NULL, &mntns_fd, NULL, NULL, &root_fd); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0) + return -errno; + + r = namespace_fork("(sd-openrootns)", "(sd-openroot)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + -1, mntns_fd, -1, -1, root_fd, &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + _cleanup_close_ int dfd = -EBADF; + + pair[0] = 
safe_close(pair[0]); + + dfd = open("/", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (dfd < 0) + _exit(EXIT_FAILURE); + + r = send_one_fd(pair[1], dfd, 0); + dfd = safe_close(dfd); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + r = wait_for_terminate_and_check("(sd-openrootns)", child, 0); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to wait for child: %m"); + if (r != EXIT_SUCCESS) + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Child died abnormally."); + + fd = receive_one_fd(pair[0], MSG_DONTWAIT); + if (fd < 0) + return fd; + + break; + } + + default: + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Opening the root directory is only supported on container machines."); + } + + return sd_bus_reply_method_return(message, "h", fd); +} + +int bus_machine_method_get_uid_shift(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Machine *m = ASSERT_PTR(userdata); + uid_t shift = 0; + int r; + + assert(message); + + /* You wonder why this is a method and not a property? Well, properties are not supposed to return errors, but + * we kinda have to for this. 
*/
+
+        if (m->class == MACHINE_HOST)
+                return sd_bus_reply_method_return(message, "u", UINT32_C(0));
+
+        if (m->class != MACHINE_CONTAINER)
+                return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "UID/GID shift may only be determined for container machines.");
+
+        r = machine_get_uid_shift(m, &shift);
+        if (r == -ENXIO)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Machine %s uses a complex UID/GID mapping, cannot determine shift", m->name);
+        if (r < 0)
+                return r;
+
+        return sd_bus_reply_method_return(message, "u", (uint32_t) shift);
+}
+
+/* Object lookup callback for the machine fallback vtable: maps a bus object path to a Machine.
+ *
+ * The special path ".../machine/self" is resolved through the PID of the sender of the message that is
+ * currently being dispatched. Any other path below ".../machine/" is unescaped and looked up by name in
+ * the manager's machine table.
+ *
+ * Returns 1 and stores the object in *found on a match, 0 if nothing matches (this includes a failing
+ * PID lookup), and a negative errno-style value if querying credentials or allocating memory fails. */
+static int machine_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+        Manager *manager = ASSERT_PTR(userdata);
+        Machine *hit;
+        int r;
+
+        assert(bus);
+        assert(path);
+        assert(interface);
+        assert(found);
+
+        if (!streq(path, "/org/freedesktop/machine1/machine/self")) {
+                _cleanup_free_ char *name = NULL;
+                const char *suffix;
+
+                /* Regular case: everything after the prefix is the escaped machine name */
+                suffix = startswith(path, "/org/freedesktop/machine1/machine/");
+                if (!suffix)
+                        return 0;
+
+                name = bus_label_unescape(suffix);
+                if (!name)
+                        return -ENOMEM;
+
+                hit = hashmap_get(manager->machines, name);
+                if (!hit)
+                        return 0;
+        } else {
+                _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+                sd_bus_message *current;
+                pid_t sender_pid;
+
+                /* "self": find the machine the caller itself belongs to */
+                current = sd_bus_get_current_message(bus);
+                if (!current)
+                        return 0;
+
+                r = sd_bus_query_sender_creds(current, SD_BUS_CREDS_PID, &creds);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_creds_get_pid(creds, &sender_pid);
+                if (r < 0)
+                        return r;
+
+                r = manager_get_machine_by_pid(manager, sender_pid, &hit);
+                if (r <= 0)
+                        return 0;
+        }
+
+        *found = hit;
+        return 1;
+}
+
+/* Builds the bus object path of a machine by appending its escaped name to the machine path prefix.
+ * Returns a newly allocated string owned by the caller, or NULL if escaping the name fails. */
+char *machine_bus_path(Machine *m) {
+        _cleanup_free_ char *escaped = NULL;
+
+        assert(m);
+
+        escaped = bus_label_escape(m->name);
+
+        return escaped ? strjoin("/org/freedesktop/machine1/machine/", escaped) : NULL;
+}
+
+static int machine_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+        
_cleanup_strv_free_ char **l = NULL; + Machine *machine = NULL; + Manager *m = userdata; + int r; + + assert(bus); + assert(path); + assert(nodes); + + HASHMAP_FOREACH(machine, m->machines) { + char *p; + + p = machine_bus_path(machine); + if (!p) + return -ENOMEM; + + r = strv_consume(&l, p); + if (r < 0) + return r; + } + + *nodes = TAKE_PTR(l); + + return 1; +} + +static const sd_bus_vtable machine_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Name", "s", NULL, offsetof(Machine, name), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Id", "ay", bus_property_get_id128, offsetof(Machine, id), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("Timestamp", offsetof(Machine, timestamp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Service", "s", NULL, offsetof(Machine, service), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Unit", "s", NULL, offsetof(Machine, unit), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Scope", "s", NULL, offsetof(Machine, unit), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), + SD_BUS_PROPERTY("Leader", "u", NULL, offsetof(Machine, leader.pid), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("Class", "s", property_get_class, offsetof(Machine, class), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(Machine, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NetworkInterfaces", "ai", property_get_netif, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("State", "s", property_get_state, 0, 0), + + SD_BUS_METHOD("Terminate", + NULL, + NULL, + bus_machine_method_terminate, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Kill", + SD_BUS_ARGS("s", who, "i", signal), + SD_BUS_NO_RESULT, + bus_machine_method_kill, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetAddresses", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(iay)", addresses), + bus_machine_method_get_addresses, + SD_BUS_VTABLE_UNPRIVILEGED), + 
SD_BUS_METHOD_WITH_ARGS("GetOSRelease", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a{ss}", fields), + bus_machine_method_get_os_release, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetUIDShift", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("u", shift), + bus_machine_method_get_uid_shift, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenPTY", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("h", pty, "s", pty_path), + bus_machine_method_open_pty, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenLogin", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("h", pty, "s", pty_path), + bus_machine_method_open_login, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenShell", + SD_BUS_ARGS("s", user, "s", path, "as", args, "as", environment), + SD_BUS_RESULT("h", pty, "s", pty_path), + bus_machine_method_open_shell, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("BindMount", + SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir), + SD_BUS_NO_RESULT, + bus_machine_method_bind_mount, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyFrom", + SD_BUS_ARGS("s", source, "s", destination), + SD_BUS_NO_RESULT, + bus_machine_method_copy, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyTo", + SD_BUS_ARGS("s", source, "s", destination), + SD_BUS_NO_RESULT, + bus_machine_method_copy, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyFromWithFlags", + SD_BUS_ARGS("s", source, "s", destination, "t", flags), + SD_BUS_NO_RESULT, + bus_machine_method_copy, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyToWithFlags", + SD_BUS_ARGS("s", source, "s", destination, "t", flags), + SD_BUS_NO_RESULT, + bus_machine_method_copy, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenRootDirectory", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("h", fd), + bus_machine_method_open_root_directory, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation machine_object = { + 
"/org/freedesktop/machine1/machine", + "org.freedesktop.machine1.Machine", + .fallback_vtables = BUS_FALLBACK_VTABLES({machine_vtable, machine_object_find}), + .node_enumerator = machine_node_enumerator, +}; + +int machine_send_signal(Machine *m, bool new_machine) { + _cleanup_free_ char *p = NULL; + + assert(m); + + p = machine_bus_path(m); + if (!p) + return -ENOMEM; + + return sd_bus_emit_signal( + m->manager->bus, + "/org/freedesktop/machine1", + "org.freedesktop.machine1.Manager", + new_machine ? "MachineNew" : "MachineRemoved", + "so", m->name, p); +} + +int machine_send_create_reply(Machine *m, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *c = NULL; + _cleanup_free_ char *p = NULL; + + assert(m); + + if (!m->create_message) + return 0; + + c = TAKE_PTR(m->create_message); + + if (error) + return sd_bus_reply_method_error(c, error); + + /* Update the machine state file before we notify the client + * about the result. */ + machine_save(m); + + p = machine_bus_path(m); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(c, "o", p); +} diff --git a/src/machine/machine-dbus.h b/src/machine/machine-dbus.h new file mode 100644 index 0000000..a013345 --- /dev/null +++ b/src/machine/machine-dbus.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-util.h" +#include "machine.h" + +typedef enum { + MACHINE_COPY_REPLACE = 1 << 0, /* Public API via DBUS, do not change */ + _MACHINE_COPY_FLAGS_MASK_PUBLIC = MACHINE_COPY_REPLACE, +} MachineCopyFlags; + +extern const BusObjectImplementation machine_object; + +char *machine_bus_path(Machine *s); + +int bus_machine_method_unregister(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_terminate(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error); +int 
bus_machine_method_get_addresses(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_get_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_open_pty(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_open_login(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_open_shell(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_copy(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_open_root_directory(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_machine_method_get_uid_shift(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int machine_send_signal(Machine *m, bool new_machine); +int machine_send_create_reply(Machine *m, sd_bus_error *error); diff --git a/src/machine/machine.c b/src/machine/machine.c new file mode 100644 index 0000000..44ff5c1 --- /dev/null +++ b/src/machine/machine.c @@ -0,0 +1,910 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "env-file.h" +#include "errno-util.h" +#include "escape.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "machine-dbus.h" +#include "machine.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "special.h" +#include "stdio-util.h" +#include "string-table.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "user-util.h" + 
+DEFINE_TRIVIAL_CLEANUP_FUNC(Machine*, machine_free);
+
+/* Allocates a new Machine called 'name' of the given class and registers it in manager->machines.
+ *
+ * On success 0 is returned and the new object is stored in *ret. On failure a negative errno-style
+ * value is returned and the partially set up object is released again via machine_free(). */
+int machine_new(Manager *manager, MachineClass class, const char *name, Machine **ret) {
+        _cleanup_(machine_freep) Machine *m = NULL;
+        int r;
+
+        assert(manager);
+        assert(class < _MACHINE_CLASS_MAX);
+        assert(name);
+        assert(ret);
+
+        /* Passing class == _MACHINE_CLASS_INVALID here is fine. It
+         * means as much as "we don't know yet", and that we'll figure
+         * it out later when loading the state file. */
+
+        m = new(Machine, 1);
+        if (!m)
+                return -ENOMEM;
+
+        *m = (Machine) {
+                .leader = PIDREF_NULL,
+        };
+
+        m->name = strdup(name);
+        if (!m->name)
+                return -ENOMEM;
+
+        if (class != MACHINE_HOST) {
+                m->state_file = path_join("/run/systemd/machines", m->name);
+                if (!m->state_file)
+                        return -ENOMEM;
+        }
+
+        m->class = class;
+
+        r = hashmap_put(manager->machines, m->name, m);
+        if (r < 0)
+                return r;
+
+        /* Only link the object to the manager once it really is registered: machine_free() takes a set
+         * m->manager as the sign that there is manager-side state to undo. */
+        m->manager = manager;
+
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Releases a Machine, everything it owns, and its registrations with the manager. A NULL argument is
+ * accepted; the return value is NULL or the result of mfree(), so that the call can be used to reset
+ * the caller's pointer.
+ *
+ * This is also the cleanup handler of machine_new(), which can fail before m->manager is assigned.
+ * Hence all manager-side teardown is skipped when m->manager is NULL instead of dereferencing it. */
+Machine* machine_free(Machine *m) {
+        if (!m)
+                return NULL;
+
+        while (m->operations)
+                operation_free(m->operations);
+
+        if (m->in_gc_queue) {
+                /* Machines only enter the GC queue through their manager */
+                assert(m->manager);
+                LIST_REMOVE(gc_queue, m->manager->machine_gc_queue, m);
+        }
+
+        if (m->manager) {
+                machine_release_unit(m);
+
+                (void) hashmap_remove(m->manager->machines, m->name);
+
+                if (m->manager->host_machine == m)
+                        m->manager->host_machine = NULL;
+        }
+
+        if (pidref_is_set(&m->leader)) {
+                if (m->manager)
+                        (void) hashmap_remove_value(m->manager->machine_leaders, PID_TO_PTR(m->leader.pid), m);
+                pidref_done(&m->leader);
+        }
+
+        sd_bus_message_unref(m->create_message);
+
+        free(m->scope_job);
+        free(m->unit); /* already NULL if machine_release_unit() ran above */
+        free(m->name);
+        free(m->state_file);
+        free(m->service);
+        free(m->root_directory);
+        free(m->netif);
+        return mfree(m);
+}
+
+int machine_save(Machine *m) {
+        _cleanup_(unlink_and_freep) char *temp_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(m);
+
+        if (!m->state_file)
+                return 0;
+
+        if (!m->started)
+                return 0;
+
+        r = mkdir_safe_label("/run/systemd/machines", 0755, 0, 0, MKDIR_WARN_MODE);
+        if (r < 0)
+                goto fail;
+
+        r =
fopen_temporary(m->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + (void) fchmod(fileno(f), 0644); + + fprintf(f, + "# This is private data. Do not parse.\n" + "NAME=%s\n", + m->name); + + if (m->unit) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(m->unit); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + + fprintf(f, "SCOPE=%s\n", escaped); /* We continue to call this "SCOPE=" because it is internal only, and we want to stay compatible with old files */ + } + + if (m->scope_job) + fprintf(f, "SCOPE_JOB=%s\n", m->scope_job); + + if (m->service) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(m->service); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + fprintf(f, "SERVICE=%s\n", escaped); + } + + if (m->root_directory) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(m->root_directory); + if (!escaped) { + r = -ENOMEM; + goto fail; + } + fprintf(f, "ROOT=%s\n", escaped); + } + + if (!sd_id128_is_null(m->id)) + fprintf(f, "ID=" SD_ID128_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(m->id)); + + if (pidref_is_set(&m->leader)) + fprintf(f, "LEADER="PID_FMT"\n", m->leader.pid); + + if (m->class != _MACHINE_CLASS_INVALID) + fprintf(f, "CLASS=%s\n", machine_class_to_string(m->class)); + + if (dual_timestamp_is_set(&m->timestamp)) + fprintf(f, + "REALTIME="USEC_FMT"\n" + "MONOTONIC="USEC_FMT"\n", + m->timestamp.realtime, + m->timestamp.monotonic); + + if (m->n_netif > 0) { + size_t i; + + fputs("NETIF=", f); + + for (i = 0; i < m->n_netif; i++) { + if (i != 0) + fputc(' ', f); + + fprintf(f, "%i", m->netif[i]); + } + + fputc('\n', f); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, m->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + + if (m->unit) { + char *sl; + + /* Create a symlink from the unit name to the machine + * name, so that we can quickly find the machine for + * each given unit. Ignore error. 
*/
+                sl = strjoina("/run/systemd/machines/unit:", m->unit);
+                (void) symlink(m->name, sl);
+        }
+
+        return 0;
+
+fail:
+        (void) unlink(m->state_file);
+
+        return log_error_errno(r, "Failed to save machine data %s: %m", m->state_file);
+}
+
+/* Removes the on-disk traces of a machine: the "unit:" symlink pointing to it (if a unit is known) and
+ * its state file (if it has one). Failures to unlink are ignored. */
+static void machine_unlink(Machine *m) {
+        assert(m);
+
+        if (m->unit) {
+                char *sl;
+
+                sl = strjoina("/run/systemd/machines/unit:", m->unit);
+                (void) unlink(sl);
+        }
+
+        if (m->state_file)
+                (void) unlink(m->state_file);
+}
+
+/* Restores the fields of a machine from its state file, using the same keys machine_save() writes.
+ *
+ * Returns 0 if the machine has no state file, the file does not exist, or it was read. Returns a
+ * negative errno-style value if reading the file fails or memory runs out. Fields that are present but
+ * cannot be parsed are skipped (with a log message where the code has one) and do not fail the load. */
+int machine_load(Machine *m) {
+        _cleanup_free_ char *realtime = NULL, *monotonic = NULL, *id = NULL, *leader = NULL, *class = NULL, *netif = NULL;
+        int r;
+
+        assert(m);
+
+        if (!m->state_file)
+                return 0;
+
+        r = parse_env_file(NULL, m->state_file,
+                           "SCOPE",     &m->unit,
+                           "SCOPE_JOB", &m->scope_job,
+                           "SERVICE",   &m->service,
+                           "ROOT",      &m->root_directory,
+                           "ID",        &id,
+                           "LEADER",    &leader,
+                           "CLASS",     &class,
+                           "REALTIME",  &realtime,
+                           "MONOTONIC", &monotonic,
+                           "NETIF",     &netif);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return log_error_errno(r, "Failed to read %s: %m", m->state_file);
+
+        if (id)
+                (void) sd_id128_from_string(id, &m->id);
+
+        if (leader) {
+                pidref_done(&m->leader);
+                r = pidref_set_pidstr(&m->leader, leader);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to set leader PID to '%s', ignoring: %m", leader);
+        }
+
+        if (class) {
+                MachineClass c;
+
+                c = machine_class_from_string(class);
+                if (c >= 0)
+                        m->class = c;
+        }
+
+        if (realtime)
+                (void) deserialize_usec(realtime, &m->timestamp.realtime);
+        if (monotonic)
+                (void) deserialize_usec(monotonic, &m->timestamp.monotonic);
+
+        if (netif) {
+                _cleanup_free_ int *ni = NULL;
+                size_t nr = 0;
+                const char *p;
+
+                p = netif;
+                for (;;) {
+                        _cleanup_free_ char *word = NULL;
+
+                        r = extract_first_word(&p, &word, NULL, 0);
+                        if (r == 0)
+                                break;
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse NETIF: %s", netif);
+                                break;
+                        }
+
+                        /* Entries that are not valid interface indexes are skipped */
+                        r = parse_ifindex(word);
+                        if (r < 0)
+                                continue;
+
+                        if (!GREEDY_REALLOC(ni, nr + 1))
+                                return log_oom();
+
+                        ni[nr++] = r;
+                }
+
+                free_and_replace(m->netif, ni);
+                m->n_netif = nr;
+        }
+
+        /* Do not return 'r' here: at this point it only holds the result of the last best-effort field
+         * parser, i.e. an error that was deliberately logged and ignored above (bad LEADER or NETIF). */
+        return 0;
+}
+
+static int machine_start_scope(
+                Machine *machine,
+                sd_bus_message *more_properties,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
+        _cleanup_free_ char *escaped = NULL, *unit = NULL;
+        const char *description;
+        int r;
+
+        assert(machine);
+        assert(pidref_is_set(&machine->leader));
+        assert(!machine->unit);
+
+        escaped = unit_name_escape(machine->name);
+        if (!escaped)
+                return log_oom();
+
+        unit = strjoin("machine-", escaped, ".scope");
+        if (!unit)
+                return log_oom();
+
+        r = bus_message_new_method_call(
+                        machine->manager->bus,
+                        &m,
+                        bus_systemd_mgr,
+                        "StartTransientUnit");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(m, "ss", unit, "fail");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(m, 'a', "(sv)");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(m, "(sv)", "Slice", "s", SPECIAL_MACHINE_SLICE);
+        if (r < 0)
+                return r;
+
+        description = strjoina(machine->class == MACHINE_VM ?
"Virtual Machine " : "Container ", machine->name); + r = sd_bus_message_append(m, "(sv)", "Description", "s", description); + if (r < 0) + return r; + + r = bus_append_scope_pidref(m, &machine->leader); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)", + "Delegate", "b", 1, + "CollectMode", "s", "inactive-or-failed", + "AddRef", "b", 1, + "TasksMax", "t", UINT64_C(16384)); + if (r < 0) + return r; + + if (more_properties) { + r = sd_bus_message_copy(m, more_properties, true); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "a(sa(sv))", 0); + if (r < 0) + return r; + + r = sd_bus_call(NULL, m, 0, error, &reply); + if (r < 0) + return r; + + machine->unit = TAKE_PTR(unit); + machine->referenced = true; + + const char *job; + r = sd_bus_message_read(reply, "o", &job); + if (r < 0) + return r; + + return free_and_strdup(&machine->scope_job, job); +} + +static int machine_ensure_scope(Machine *m, sd_bus_message *properties, sd_bus_error *error) { + int r; + + assert(m); + assert(m->class != MACHINE_HOST); + + if (!m->unit) { + r = machine_start_scope(m, properties, error); + if (r < 0) + return log_error_errno(r, "Failed to start machine scope: %s", bus_error_message(error, r)); + } + + assert(m->unit); + hashmap_put(m->manager->machine_units, m->unit, m); + + return 0; +} + +int machine_start(Machine *m, sd_bus_message *properties, sd_bus_error *error) { + int r; + + assert(m); + + if (!IN_SET(m->class, MACHINE_CONTAINER, MACHINE_VM)) + return -EOPNOTSUPP; + + if (m->started) + return 0; + + r = hashmap_put(m->manager->machine_leaders, PID_TO_PTR(m->leader.pid), m); + if (r < 0) + return r; + + /* Create cgroup */ + r = machine_ensure_scope(m, properties, error); + if (r < 0) + return r; + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_MACHINE_START_STR, + "NAME=%s", m->name, + "LEADER="PID_FMT, m->leader.pid, + LOG_MESSAGE("New machine %s.", m->name)); + + 
if (!dual_timestamp_is_set(&m->timestamp)) + dual_timestamp_now(&m->timestamp); + + m->started = true; + + /* Save new machine data */ + machine_save(m); + + machine_send_signal(m, true); + (void) manager_enqueue_nscd_cache_flush(m->manager); + + return 0; +} + +int machine_stop(Machine *m) { + int r; + + assert(m); + + if (!IN_SET(m->class, MACHINE_CONTAINER, MACHINE_VM)) + return -EOPNOTSUPP; + + if (m->unit) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + char *job = NULL; + + r = manager_stop_unit(m->manager, m->unit, &error, &job); + if (r < 0) + return log_error_errno(r, "Failed to stop machine scope: %s", bus_error_message(&error, r)); + + free_and_replace(m->scope_job, job); + } + + m->stopping = true; + + machine_save(m); + (void) manager_enqueue_nscd_cache_flush(m->manager); + + return 0; +} + +int machine_finalize(Machine *m) { + assert(m); + + if (m->started) { + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_MACHINE_STOP_STR, + "NAME=%s", m->name, + "LEADER="PID_FMT, m->leader.pid, + LOG_MESSAGE("Machine %s terminated.", m->name)); + + m->stopping = true; /* The machine is supposed to be going away. Don't try to kill it. 
*/
+        }
+
+        machine_unlink(m);
+        machine_add_to_gc_queue(m);
+
+        if (m->started) {
+                machine_send_signal(m, false);
+                m->started = false;
+        }
+
+        return 0;
+}
+
+/* Tells whether a machine object may be garbage collected. The host machine never may. A machine that
+ * was not started may if drop_not_started is set. Otherwise the machine may be collected only if
+ * neither its scope job nor its unit is reported as active by the manager. */
+bool machine_may_gc(Machine *m, bool drop_not_started) {
+        assert(m);
+
+        if (m->class == MACHINE_HOST)
+                return false;
+
+        if (!m->started && drop_not_started)
+                return true;
+
+        /* The job is checked before the unit, and the unit only if the job is not active */
+        return !((m->scope_job && manager_job_is_active(m->manager, m->scope_job)) ||
+                 (m->unit && manager_unit_is_active(m->manager, m->unit)));
+}
+
+/* Puts the machine at the head of the manager's GC queue, unless it is queued already. */
+void machine_add_to_gc_queue(Machine *m) {
+        assert(m);
+
+        if (!m->in_gc_queue) {
+                LIST_PREPEND(gc_queue, m->manager->machine_gc_queue, m);
+                m->in_gc_queue = true;
+        }
+}
+
+/* Derives the state of a machine from its flags: the host is always running; any other machine is
+ * closing while 'stopping' is set, opening while a scope job is recorded, and running otherwise. */
+MachineState machine_get_state(Machine *s) {
+        assert(s);
+
+        if (s->class != MACHINE_HOST) {
+                if (s->stopping)
+                        return MACHINE_CLOSING;
+
+                if (s->scope_job)
+                        return MACHINE_OPENING;
+        }
+
+        return MACHINE_RUNNING;
+}
+
+/* Sends signal 'signo' to a VM or container. Returns -EOPNOTSUPP for other machine classes and -ESRCH
+ * if the machine has no unit. */
+int machine_kill(Machine *m, KillWho who, int signo) {
+        assert(m);
+
+        if (!IN_SET(m->class, MACHINE_VM, MACHINE_CONTAINER))
+                return -EOPNOTSUPP;
+
+        if (!m->unit)
+                return -ESRCH;
+
+        /* Only the leader is wanted: signal it directly. Anything else is delegated to the manager's
+         * unit kill call, so that the whole unit is covered. */
+        return who == KILL_LEADER ?
+                pidref_kill(&m->leader, signo) :
+                manager_kill_unit(m->manager, m->unit, signo, NULL);
+}
+
+/* Allocates a pseudo TTY for the machine: on the host directly, for containers in the namespace of the
+ * leader process. Returns -EINVAL for a container without leader and -EOPNOTSUPP for any other class. */
+int machine_openpt(Machine *m, int flags, char **ret_slave) {
+        assert(m);
+
+        if (m->class == MACHINE_HOST)
+                return openpt_allocate(flags, ret_slave);
+
+        if (m->class != MACHINE_CONTAINER)
+                return -EOPNOTSUPP;
+
+        if (!pidref_is_set(&m->leader))
+                return -EINVAL;
+
+        return openpt_allocate_in_namespace(m->leader.pid, flags, ret_slave);
+}
+
+int machine_open_terminal(Machine *m, const char *path, int mode) {
+        assert(m);
+
+        switch (m->class) {
+
+        case MACHINE_HOST:
+                return open_terminal(path, mode);
+
+        case MACHINE_CONTAINER:
+                if (!pidref_is_set(&m->leader))
+                        
return -EINVAL; + + return open_terminal_in_namespace(m->leader.pid, path, mode); + + default: + return -EOPNOTSUPP; + } +} + +void machine_release_unit(Machine *m) { + assert(m); + + if (!m->unit) + return; + + if (m->referenced) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + r = manager_unref_unit(m->manager, m->unit, &error); + if (r < 0) + log_warning_errno(r, "Failed to drop reference to machine scope, ignoring: %s", + bus_error_message(&error, r)); + + m->referenced = false; + } + + (void) hashmap_remove(m->manager->machine_units, m->unit); + m->unit = mfree(m->unit); +} + +int machine_get_uid_shift(Machine *m, uid_t *ret) { + char p[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1]; + uid_t uid_base, uid_shift, uid_range; + gid_t gid_base, gid_shift, gid_range; + _cleanup_fclose_ FILE *f = NULL; + int k, r; + + assert(m); + assert(ret); + + /* Return the base UID/GID of the specified machine. Note that this only works for containers with simple + * mappings. In most cases setups should be simple like this, and administrators should only care about the + * basic offset a container has relative to the host. This is what this function exposes. + * + * If we encounter any more complex mappings we politely refuse this with ENXIO. */ + + if (m->class == MACHINE_HOST) { + *ret = 0; + return 0; + } + + if (m->class != MACHINE_CONTAINER) + return -EOPNOTSUPP; + + xsprintf(p, "/proc/" PID_FMT "/uid_map", m->leader.pid); + f = fopen(p, "re"); + if (!f) { + if (errno == ENOENT) { + /* If the file doesn't exist, user namespacing is off in the kernel, return a zero mapping hence. */ + *ret = 0; + return 0; + } + + return -errno; + } + + /* Read the first line. There's at least one. */ + errno = 0; + k = fscanf(f, UID_FMT " " UID_FMT " " UID_FMT "\n", &uid_base, &uid_shift, &uid_range); + if (k != 3) { + if (ferror(f)) + return errno_or_else(EIO); + + return -EBADMSG; + } + + /* Not a mapping starting at 0? 
Then it's a complex mapping we can't expose here. */ + if (uid_base != 0) + return -ENXIO; + /* Insist that at least the nobody user is mapped, everything else is weird, and hence complex, and we don't support it */ + if (uid_range < UID_NOBODY) + return -ENXIO; + + /* If there's more than one line, then we don't support this mapping. */ + r = safe_fgetc(f, NULL); + if (r < 0) + return r; + if (r != 0) /* Insist on EOF */ + return -ENXIO; + + fclose(f); + + xsprintf(p, "/proc/" PID_FMT "/gid_map", m->leader.pid); + f = fopen(p, "re"); + if (!f) + return -errno; + + /* Read the first line. There's at least one. */ + errno = 0; + k = fscanf(f, GID_FMT " " GID_FMT " " GID_FMT "\n", &gid_base, &gid_shift, &gid_range); + if (k != 3) { + if (ferror(f)) + return errno_or_else(EIO); + + return -EBADMSG; + } + + /* If there's more than one line, then we don't support this file. */ + r = safe_fgetc(f, NULL); + if (r < 0) + return r; + if (r != 0) /* Insist on EOF */ + return -ENXIO; + + /* If the UID and GID mapping doesn't match, we don't support this mapping. */ + if (uid_base != (uid_t) gid_base) + return -ENXIO; + if (uid_shift != (uid_t) gid_shift) + return -ENXIO; + if (uid_range != (uid_t) gid_range) + return -ENXIO; + + *ret = uid_shift; + return 0; +} + +static int machine_owns_uid_internal( + Machine *machine, + const char *map_file, /* "uid_map" or "gid_map" */ + uid_t uid, + uid_t *ret_internal_uid) { + + _cleanup_fclose_ FILE *f = NULL; + const char *p; + + /* This is a generic implementation for both uids and gids, under the assumptions they have the same types and semantics. 
*/ + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + assert(machine); + + /* Checks if the specified host UID is owned by the machine, and returns the UID it maps to + * internally in the machine */ + + if (machine->class != MACHINE_CONTAINER) + goto negative; + + p = procfs_file_alloca(machine->leader.pid, map_file); + f = fopen(p, "re"); + if (!f) { + log_debug_errno(errno, "Failed to open %s, ignoring.", p); + goto negative; + } + + for (;;) { + uid_t uid_base, uid_shift, uid_range, converted; + int k; + + errno = 0; + k = fscanf(f, UID_FMT " " UID_FMT " " UID_FMT, &uid_base, &uid_shift, &uid_range); + if (k < 0 && feof(f)) + break; + if (k != 3) { + if (ferror(f)) + return errno_or_else(EIO); + + return -EIO; + } + + /* The private user namespace is disabled, ignoring. */ + if (uid_shift == 0) + continue; + + if (uid < uid_shift || uid >= uid_shift + uid_range) + continue; + + converted = (uid - uid_shift + uid_base); + if (!uid_is_valid(converted)) + return -EINVAL; + + if (ret_internal_uid) + *ret_internal_uid = converted; + + return true; + } + +negative: + if (ret_internal_uid) + *ret_internal_uid = UID_INVALID; + + return false; +} + +int machine_owns_uid(Machine *machine, uid_t uid, uid_t *ret_internal_uid) { + return machine_owns_uid_internal(machine, "uid_map", uid, ret_internal_uid); +} + +int machine_owns_gid(Machine *machine, gid_t gid, gid_t *ret_internal_gid) { + return machine_owns_uid_internal(machine, "gid_map", (uid_t) gid, (uid_t*) ret_internal_gid); +} + +static int machine_translate_uid_internal( + Machine *machine, + const char *map_file, /* "uid_map" or "gid_map" */ + uid_t uid, + uid_t *ret_host_uid) { + + _cleanup_fclose_ FILE *f = NULL; + const char *p; + + /* This is a generic implementation for both uids and gids, under the assumptions they have the same types and semantics. 
*/ + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + assert(machine); + assert(uid_is_valid(uid)); + + if (machine->class != MACHINE_CONTAINER) + return -ESRCH; + + /* Translates a machine UID into a host UID */ + + p = procfs_file_alloca(machine->leader.pid, map_file); + f = fopen(p, "re"); + if (!f) + return -errno; + + for (;;) { + uid_t uid_base, uid_shift, uid_range, converted; + int k; + + errno = 0; + k = fscanf(f, UID_FMT " " UID_FMT " " UID_FMT, &uid_base, &uid_shift, &uid_range); + if (k < 0 && feof(f)) + break; + if (k != 3) { + if (ferror(f)) + return errno_or_else(EIO); + + return -EIO; + } + + if (uid < uid_base || uid >= uid_base + uid_range) + continue; + + converted = uid - uid_base + uid_shift; + if (!uid_is_valid(converted)) + return -EINVAL; + + if (ret_host_uid) + *ret_host_uid = converted; + return 0; + } + + return -ESRCH; +} + +int machine_translate_uid(Machine *machine, gid_t uid, gid_t *ret_host_uid) { + return machine_translate_uid_internal(machine, "uid_map", uid, ret_host_uid); +} + +int machine_translate_gid(Machine *machine, gid_t gid, gid_t *ret_host_gid) { + return machine_translate_uid_internal(machine, "gid_map", (uid_t) gid, (uid_t*) ret_host_gid); +} + +static const char* const machine_class_table[_MACHINE_CLASS_MAX] = { + [MACHINE_CONTAINER] = "container", + [MACHINE_VM] = "vm", + [MACHINE_HOST] = "host", +}; + +DEFINE_STRING_TABLE_LOOKUP(machine_class, MachineClass); + +static const char* const machine_state_table[_MACHINE_STATE_MAX] = { + [MACHINE_OPENING] = "opening", + [MACHINE_RUNNING] = "running", + [MACHINE_CLOSING] = "closing" +}; + +DEFINE_STRING_TABLE_LOOKUP(machine_state, MachineState); + +static const char* const kill_who_table[_KILL_WHO_MAX] = { + [KILL_LEADER] = "leader", + [KILL_ALL] = "all" +}; + +DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho); diff --git a/src/machine/machine.h b/src/machine/machine.h new file mode 100644 index 0000000..30ef93b --- /dev/null +++ b/src/machine/machine.h @@ -0,0 +1,103 @@ +/* 
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once

typedef struct Machine Machine;
typedef enum KillWho KillWho;

#include "list.h"
#include "machined.h"
#include "operation.h"
#include "pidref.h"
#include "time-util.h"

/* Externally visible lifecycle state, as computed by machine_get_state(). */
typedef enum MachineState {
        MACHINE_OPENING,    /* Machine is being registered */
        MACHINE_RUNNING,    /* Machine is running */
        MACHINE_CLOSING,    /* Machine is terminating */
        _MACHINE_STATE_MAX,
        _MACHINE_STATE_INVALID = -EINVAL,
} MachineState;

/* Kind of machine. Several operations in machine.c are only supported for some classes, e.g.
 * machine_kill() for VMs and containers, machine_openpt() for the host and containers. */
typedef enum MachineClass {
        MACHINE_CONTAINER,
        MACHINE_VM,
        MACHINE_HOST,
        _MACHINE_CLASS_MAX,
        _MACHINE_CLASS_INVALID = -EINVAL,
} MachineClass;

/* Selects the target of machine_kill(). */
enum KillWho {
        KILL_LEADER,        /* Signal only the leader process */
        KILL_ALL,           /* Let the manager signal the machine's whole unit */
        _KILL_WHO_MAX,
        _KILL_WHO_INVALID = -EINVAL,
};

struct Machine {
        Manager *manager;                   /* Manager this machine belongs to */

        char *name;
        sd_id128_t id;

        MachineClass class;

        char *state_file;
        char *service;
        char *root_directory;

        char *unit;                         /* Unit name; also the key in manager->machine_units */
        char *scope_job;                    /* While set, machine_get_state() reports MACHINE_OPENING */

        PidRef leader;                      /* Leader process: KILL_LEADER target, namespace/procfs reference */

        dual_timestamp timestamp;

        bool in_gc_queue:1;                 /* Linked into manager->machine_gc_queue */
        bool started:1;                     /* Not-started machines may be GC'ed early, see machine_may_gc() */
        bool stopping:1;                    /* While set, machine_get_state() reports MACHINE_CLOSING */
        bool referenced:1;                  /* We hold a unit reference that machine_release_unit() must drop */

        sd_bus_message *create_message;

        int *netif;                         /* Array of n_netif entries; presumably network interface indexes, TODO confirm */
        size_t n_netif;

        LIST_HEAD(Operation, operations);

        LIST_FIELDS(Machine, gc_queue);
};

int machine_new(Manager *manager, MachineClass class, const char *name, Machine **ret);
Machine* machine_free(Machine *m);
/* GC handling: may_gc() tells whether the object can be dropped, add_to_gc_queue() schedules the check. */
bool machine_may_gc(Machine *m, bool drop_not_started);
void machine_add_to_gc_queue(Machine *m);
int machine_start(Machine *m, sd_bus_message *properties, sd_bus_error *error);
int machine_stop(Machine *m);
int machine_finalize(Machine *m);
int machine_save(Machine *m);
int machine_load(Machine *m);
/* Sends signo to the leader or to the whole unit, depending on who. */
int machine_kill(Machine *m, KillWho who, int signo);

/* Drops the unit reference (if held) and forgets the unit name. */
void machine_release_unit(Machine *m);

MachineState machine_get_state(Machine *u);

const char* machine_class_to_string(MachineClass t) _const_;
MachineClass machine_class_from_string(const char *s) _pure_;

const char* machine_state_to_string(MachineState t) _const_;
+MachineState machine_state_from_string(const char *s) _pure_; + +const char *kill_who_to_string(KillWho k) _const_; +KillWho kill_who_from_string(const char *s) _pure_; + +int machine_openpt(Machine *m, int flags, char **ret_slave); +int machine_open_terminal(Machine *m, const char *path, int mode); + +int machine_get_uid_shift(Machine *m, uid_t *ret); + +int machine_owns_uid(Machine *m, uid_t host_uid, uid_t *ret_internal_uid); +int machine_owns_gid(Machine *m, gid_t host_gid, gid_t *ret_internal_gid); + +int machine_translate_uid(Machine *m, uid_t internal_uid, uid_t *ret_host_uid); +int machine_translate_gid(Machine *m, gid_t internal_gid, gid_t *ret_host_gid); diff --git a/src/machine/machinectl.c b/src/machine/machinectl.c new file mode 100644 index 0000000..418dd00 --- /dev/null +++ b/src/machine/machinectl.c @@ -0,0 +1,3007 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-print-properties.h" +#include "bus-unit-procs.h" +#include "bus-unit-util.h" +#include "bus-wait-for-jobs.h" +#include "cgroup-show.h" +#include "cgroup-util.h" +#include "constants.h" +#include "copy.h" +#include "edit-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "format-table.h" +#include "hostname-util.h" +#include "import-util.h" +#include "locale-util.h" +#include "log.h" +#include "logs-show.h" +#include "machine-dbus.h" +#include "macro.h" +#include "main-func.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "ptyfwd.h" +#include "rlimit-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include 
"sort-util.h" +#include "spawn-ask-password-agent.h" +#include "spawn-polkit-agent.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "terminal-util.h" +#include "unit-name.h" +#include "verbs.h" +#include "web-util.h" + +static char **arg_property = NULL; +static bool arg_all = false; +static BusPrintPropertyFlags arg_print_flags = 0; +static bool arg_full = false; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static const char *arg_kill_whom = NULL; +static int arg_signal = SIGTERM; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static const char *arg_host = NULL; +static bool arg_read_only = false; +static bool arg_mkdir = false; +static bool arg_quiet = false; +static bool arg_ask_password = true; +static unsigned arg_lines = 10; +static OutputMode arg_output = OUTPUT_SHORT; +static bool arg_now = false; +static bool arg_force = false; +static ImportVerify arg_verify = IMPORT_VERIFY_SIGNATURE; +static const char* arg_format = NULL; +static const char *arg_uid = NULL; +static char **arg_setenv = NULL; +static unsigned arg_max_addresses = 1; + +STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep); + +static OutputFlags get_output_flags(void) { + return + FLAGS_SET(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) * OUTPUT_SHOW_ALL | + (arg_full || !on_tty() || pager_have()) * OUTPUT_FULL_WIDTH | + colors_enabled() * OUTPUT_COLOR | + !arg_quiet * OUTPUT_WARN_CUTOFF; +} + +static int call_get_os_release(sd_bus *bus, const char *method, const char *name, const char *query, ...) 
{ + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *k, *v, **query_res = NULL; + size_t count = 0, awaited_args = 0; + va_list ap; + int r; + + assert(bus); + assert(name); + assert(query); + + NULSTR_FOREACH(iter, query) + awaited_args++; + query_res = newa0(const char *, awaited_args); + + r = bus_call_method(bus, bus_machine_mgr, method, &error, &reply, "s", name); + if (r < 0) + return log_debug_errno(r, "Failed to call '%s()': %s", method, bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "{ss}"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "{ss}", &k, &v)) > 0) { + count = 0; + NULSTR_FOREACH(iter, query) { + if (streq(k, iter)) { + query_res[count] = v; + break; + } + count++; + } + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + va_start(ap, query); + for (count = 0; count < awaited_args; count++) { + char *val, **out; + + out = va_arg(ap, char **); + assert(out); + if (query_res[count]) { + val = strdup(query_res[count]); + if (!val) { + va_end(ap); + return -ENOMEM; + } + *out = val; + } + } + va_end(ap); + + return 0; +} + +static int call_get_addresses( + sd_bus *bus, + const char *name, + int ifi, + const char *prefix, + const char *prefix2, + char **ret) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *addresses = NULL; + unsigned n = 0; + int r; + + assert(bus); + assert(name); + assert(prefix); + assert(prefix2); + + r = bus_call_method(bus, bus_machine_mgr, "GetMachineAddresses", NULL, &reply, "s", name); + if (r < 0) + return log_debug_errno(r, "Could not get addresses: %s", bus_error_message(&error, r)); + + addresses = strdup(prefix); + if (!addresses) + return 
log_oom(); + prefix = ""; + + r = sd_bus_message_enter_container(reply, 'a', "(iay)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_enter_container(reply, 'r', "iay")) > 0) { + int family; + const void *a; + size_t sz; + char buf_ifi[1 + DECIMAL_STR_MAX(int)] = ""; + + r = sd_bus_message_read(reply, "i", &family); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &a, &sz); + if (r < 0) + return bus_log_parse_error(r); + + if (family == AF_INET6 && ifi > 0) + xsprintf(buf_ifi, "%%%i", ifi); + + if (!strextend(&addresses, prefix, IN_ADDR_TO_STRING(family, a), buf_ifi)) + return log_oom(); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + prefix = prefix2; + + n++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + *ret = TAKE_PTR(addresses); + return (int) n; +} + +static int show_table(Table *table, const char *word) { + int r; + + assert(table); + assert(word); + + if (table_get_rows(table) > 1 || OUTPUT_MODE_IS_JSON(arg_output)) { + r = table_set_sort(table, (size_t) 0); + if (r < 0) + return table_log_sort_error(r); + + table_set_header(table, arg_legend); + + if (OUTPUT_MODE_IS_JSON(arg_output)) + r = table_print_json(table, NULL, output_mode_to_json_format_flags(arg_output) | JSON_FORMAT_COLOR_AUTO); + else + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend) { + if (table_get_rows(table) > 1) + printf("\n%zu %s listed.\n", table_get_rows(table) - 1, word); + else + printf("No %s.\n", word); + } + + return 0; +} + +static int list_machines(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + 
int r; + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, bus_machine_mgr, "ListMachines", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Could not get machines: %s", bus_error_message(&error, r)); + + table = table_new("machine", "class", "service", "os", "version", + arg_max_addresses > 0 ? "addresses" : NULL); + if (!table) + return log_oom(); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + if (!arg_full && arg_max_addresses > 0 && arg_max_addresses < UINT_MAX) + table_set_cell_height_max(table, arg_max_addresses); + + if (arg_full) + table_set_width(table, 0); + + r = sd_bus_message_enter_container(reply, 'a', "(ssso)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + _cleanup_free_ char *os = NULL, *version_id = NULL, *addresses = NULL; + const char *name, *class, *service; + + r = sd_bus_message_read(reply, "(ssso)", &name, &class, &service, NULL); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (name[0] == '.' 
&& !arg_all) + continue; + + (void) call_get_os_release( + bus, + "GetMachineOSRelease", + name, + "ID\0" + "VERSION_ID\0", + &os, + &version_id); + + r = table_add_many(table, + TABLE_STRING, empty_to_null(name), + TABLE_STRING, empty_to_null(class), + TABLE_STRING, empty_to_null(service), + TABLE_STRING, empty_to_null(os), + TABLE_STRING, empty_to_null(version_id)); + if (r < 0) + return table_log_add_error(r); + + if (arg_max_addresses > 0) { + (void) call_get_addresses(bus, name, 0, "", "\n", &addresses); + + r = table_add_many(table, + TABLE_STRING, empty_to_null(addresses)); + if (r < 0) + return table_log_add_error(r); + } + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return show_table(table, "machines"); +} + +static int list_images(int argc, char *argv[], void *userdata) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, bus_machine_mgr, "ListImages", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Could not get images: %s", bus_error_message(&error, r)); + + table = table_new("name", "type", "ro", "usage", "created", "modified"); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + (void) table_set_align_percent(table, TABLE_HEADER_CELL(3), 100); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssbttto)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + uint64_t crtime, mtime, size; + const char *name, *type; + int ro_int; + + r = sd_bus_message_read(reply, "(ssbttto)", &name, &type, &ro_int, &crtime, &mtime, &size, NULL); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (name[0] == '.' 
&& !arg_all) + continue; + + r = table_add_many(table, + TABLE_STRING, name, + TABLE_STRING, type, + TABLE_BOOLEAN, ro_int, + TABLE_SET_COLOR, ro_int ? ansi_highlight_red() : NULL, + TABLE_SIZE, size, + TABLE_TIMESTAMP, crtime, + TABLE_TIMESTAMP, mtime); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return show_table(table, "images"); +} + +static int show_unit_cgroup(sd_bus *bus, const char *unit, pid_t leader) { + _cleanup_free_ char *cgroup = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + unsigned c; + + assert(bus); + assert(unit); + + r = show_cgroup_get_unit_path_and_warn(bus, unit, &cgroup); + if (r < 0) + return r; + + if (isempty(cgroup)) + return 0; + + c = columns(); + if (c > 18) + c -= 18; + else + c = 0; + + r = unit_show_processes(bus, unit, cgroup, "\t\t ", c, get_output_flags(), &error); + if (r == -EBADR) { + + if (arg_transport == BUS_TRANSPORT_REMOTE) + return 0; + + /* Fallback for older systemd versions where the GetUnitProcesses() call is not yet available */ + + if (cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, cgroup) != 0 && leader <= 0) + return 0; + + show_cgroup_and_extra(SYSTEMD_CGROUP_CONTROLLER, cgroup, "\t\t ", c, &leader, leader > 0, get_output_flags()); + } else if (r < 0) + return log_error_errno(r, "Failed to dump process list: %s", bus_error_message(&error, r)); + + return 0; +} + +static int print_os_release(sd_bus *bus, const char *method, const char *name, const char *prefix) { + _cleanup_free_ char *pretty = NULL; + int r; + + assert(bus); + assert(name); + assert(prefix); + + r = call_get_os_release(bus, method, name, "PRETTY_NAME\0", &pretty, NULL); + if (r < 0) + return r; + + if (pretty) + printf("%s%s\n", prefix, pretty); + + return 0; +} + +static int print_uid_shift(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + 
_cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + uint32_t shift; + int r; + + assert(bus); + assert(name); + + r = bus_call_method(bus, bus_machine_mgr, "GetMachineUIDShift", &error, &reply, "s", name); + if (r < 0) + return log_debug_errno(r, "Failed to query UID/GID shift: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "u", &shift); + if (r < 0) + return r; + + if (shift == 0) /* Don't show trivial mappings */ + return 0; + + printf(" UID Shift: %" PRIu32 "\n", shift); + return 0; +} + +typedef struct MachineStatusInfo { + const char *name; + sd_id128_t id; + const char *class; + const char *service; + const char *unit; + const char *root_directory; + pid_t leader; + struct dual_timestamp timestamp; + int *netif; + size_t n_netif; +} MachineStatusInfo; + +static void machine_status_info_clear(MachineStatusInfo *info) { + if (info) { + free(info->netif); + zero(*info); + } +} + +static void print_machine_status_info(sd_bus *bus, MachineStatusInfo *i) { + _cleanup_free_ char *addresses = NULL, *s1 = NULL, *s2 = NULL; + int ifi = -1; + + assert(bus); + assert(i); + + fputs(strna(i->name), stdout); + + if (!sd_id128_is_null(i->id)) + printf("(" SD_ID128_FORMAT_STR ")\n", SD_ID128_FORMAT_VAL(i->id)); + else + putchar('\n'); + + s1 = strdup(strempty(FORMAT_TIMESTAMP_RELATIVE(i->timestamp.realtime))); + s2 = strdup(strempty(FORMAT_TIMESTAMP(i->timestamp.realtime))); + + if (!isempty(s1)) + printf("\t Since: %s; %s\n", strna(s2), s1); + else if (!isempty(s2)) + printf("\t Since: %s\n", s2); + + if (i->leader > 0) { + _cleanup_free_ char *t = NULL; + + printf("\t Leader: %u", (unsigned) i->leader); + + (void) pid_get_comm(i->leader, &t); + if (t) + printf(" (%s)", t); + + putchar('\n'); + } + + if (i->service) { + printf("\t Service: %s", i->service); + + if (i->class) + printf("; class %s", i->class); + + putchar('\n'); + } else if (i->class) + printf("\t Class: %s\n", i->class); + + if (i->root_directory) + printf("\t Root: %s\n", 
i->root_directory); + + if (i->n_netif > 0) { + fputs("\t Iface:", stdout); + + for (size_t c = 0; c < i->n_netif; c++) { + char name[IF_NAMESIZE]; + + if (format_ifname(i->netif[c], name) >= 0) { + fputc(' ', stdout); + fputs(name, stdout); + + if (ifi < 0) + ifi = i->netif[c]; + else + ifi = 0; + } else + printf(" %i", i->netif[c]); + } + + fputc('\n', stdout); + } + + if (call_get_addresses(bus, i->name, ifi, + "\t Address: ", "\n\t ", + &addresses) > 0) { + fputs(addresses, stdout); + fputc('\n', stdout); + } + + print_os_release(bus, "GetMachineOSRelease", i->name, "\t OS: "); + + print_uid_shift(bus, i->name); + + if (i->unit) { + printf("\t Unit: %s\n", i->unit); + show_unit_cgroup(bus, i->unit, i->leader); + + if (arg_transport == BUS_TRANSPORT_LOCAL) + + show_journal_by_unit( + stdout, + i->unit, + NULL, + arg_output, + 0, + i->timestamp.monotonic, + arg_lines, + 0, + get_output_flags() | OUTPUT_BEGIN_NEWLINE, + SD_JOURNAL_LOCAL_ONLY, + true, + NULL); + } +} + +static int map_netif(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + MachineStatusInfo *i = userdata; + size_t l; + const void *v; + int r; + + assert_cc(sizeof(int32_t) == sizeof(int)); + r = sd_bus_message_read_array(m, SD_BUS_TYPE_INT32, &v, &l); + if (r < 0) + return r; + if (r == 0) + return -EBADMSG; + + i->n_netif = l / sizeof(int32_t); + i->netif = memdup(v, l); + if (!i->netif) + return -ENOMEM; + + return 0; +} + +static int show_machine_info(const char *verb, sd_bus *bus, const char *path, bool *new_line) { + + static const struct bus_properties_map map[] = { + { "Name", "s", NULL, offsetof(MachineStatusInfo, name) }, + { "Class", "s", NULL, offsetof(MachineStatusInfo, class) }, + { "Service", "s", NULL, offsetof(MachineStatusInfo, service) }, + { "Unit", "s", NULL, offsetof(MachineStatusInfo, unit) }, + { "RootDirectory", "s", NULL, offsetof(MachineStatusInfo, root_directory) }, + { "Leader", "u", NULL, offsetof(MachineStatusInfo, leader) }, + 
{ "Timestamp", "t", NULL, offsetof(MachineStatusInfo, timestamp.realtime) }, + { "TimestampMonotonic", "t", NULL, offsetof(MachineStatusInfo, timestamp.monotonic) }, + { "Id", "ay", bus_map_id128, offsetof(MachineStatusInfo, id) }, + { "NetworkInterfaces", "ai", map_netif, 0 }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(machine_status_info_clear) MachineStatusInfo info = {}; + int r; + + assert(verb); + assert(bus); + assert(path); + assert(new_line); + + r = bus_map_all_properties(bus, + "org.freedesktop.machine1", + path, + map, + 0, + &error, + &m, + &info); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + if (*new_line) + printf("\n"); + *new_line = true; + + print_machine_status_info(bus, &info); + + return r; +} + +static int show_machine_properties(sd_bus *bus, const char *path, bool *new_line) { + int r; + + assert(bus); + assert(path); + assert(new_line); + + if (*new_line) + printf("\n"); + + *new_line = true; + + r = bus_print_all_properties(bus, "org.freedesktop.machine1", path, NULL, arg_property, arg_print_flags, NULL); + if (r < 0) + log_error_errno(r, "Could not get properties: %m"); + + return r; +} + +static int show_machine(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool properties, new_line = false; + sd_bus *bus = ASSERT_PTR(userdata); + int r = 0; + + properties = !strstr(argv[0], "status"); + + pager_open(arg_pager_flags); + + if (properties && argc <= 1) { + + /* If no argument is specified, inspect the manager + * itself */ + r = show_machine_properties(bus, "/org/freedesktop/machine1", &new_line); + if (r < 0) + return r; + } + + for (int i = 1; i < argc; i++) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *path = NULL; + + r = bus_call_method(bus, bus_machine_mgr, 
"GetMachine", &error, &reply, "s", argv[i]); + if (r < 0) + return log_error_errno(r, "Could not get path to machine: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + if (properties) + r = show_machine_properties(bus, path, &new_line); + else + r = show_machine_info(argv[0], bus, path, &new_line); + } + + return r; +} + +static int print_image_hostname(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *hn; + int r; + + r = bus_call_method(bus, bus_machine_mgr, "GetImageHostname", NULL, &reply, "s", name); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "s", &hn); + if (r < 0) + return r; + + if (!isempty(hn)) + printf("\tHostname: %s\n", hn); + + return 0; +} + +static int print_image_machine_id(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + sd_id128_t id = SD_ID128_NULL; + const void *p; + size_t size; + int r; + + r = bus_call_method(bus, bus_machine_mgr, "GetImageMachineID", NULL, &reply, "s", name); + if (r < 0) + return r; + + r = sd_bus_message_read_array(reply, 'y', &p, &size); + if (r < 0) + return r; + + if (size == sizeof(sd_id128_t)) + memcpy(&id, p, size); + + if (!sd_id128_is_null(id)) + printf(" Machine ID: " SD_ID128_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(id)); + + return 0; +} + +static int print_image_machine_info(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + r = bus_call_method(bus, bus_machine_mgr, "GetImageMachineInfo", NULL, &reply, "s", name); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(reply, 'a', "{ss}"); + if (r < 0) + return r; + + for (;;) { + const char *p, *q; + + r = sd_bus_message_read(reply, "{ss}", &p, &q); + if (r < 0) + return r; + if (r == 0) + break; + + if (streq(p, "DEPLOYMENT")) + printf(" Deployment: %s\n", q); + } + + r = 
sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + return 0; +} + +typedef struct ImageStatusInfo { + const char *name; + const char *path; + const char *type; + bool read_only; + usec_t crtime; + usec_t mtime; + uint64_t usage; + uint64_t limit; + uint64_t usage_exclusive; + uint64_t limit_exclusive; +} ImageStatusInfo; + +static void print_image_status_info(sd_bus *bus, ImageStatusInfo *i) { + assert(bus); + assert(i); + + if (i->name) { + fputs(i->name, stdout); + putchar('\n'); + } + + if (i->type) + printf("\t Type: %s\n", i->type); + + if (i->path) + printf("\t Path: %s\n", i->path); + + (void) print_image_hostname(bus, i->name); + (void) print_image_machine_id(bus, i->name); + (void) print_image_machine_info(bus, i->name); + + print_os_release(bus, "GetImageOSRelease", i->name, "\t OS: "); + + printf("\t RO: %s%s%s\n", + i->read_only ? ansi_highlight_red() : "", + i->read_only ? "read-only" : "writable", + i->read_only ? ansi_normal() : ""); + + if (timestamp_is_set(i->crtime)) + printf("\t Created: %s; %s\n", + FORMAT_TIMESTAMP(i->crtime), FORMAT_TIMESTAMP_RELATIVE(i->crtime)); + + if (timestamp_is_set(i->mtime)) + printf("\tModified: %s; %s\n", + FORMAT_TIMESTAMP(i->mtime), FORMAT_TIMESTAMP_RELATIVE(i->mtime)); + + if (i->usage != UINT64_MAX) { + if (i->usage_exclusive != i->usage && i->usage_exclusive != UINT64_MAX) + printf("\t Usage: %s (exclusive: %s)\n", + FORMAT_BYTES(i->usage), FORMAT_BYTES(i->usage_exclusive)); + else + printf("\t Usage: %s\n", FORMAT_BYTES(i->usage)); + } + + if (i->limit != UINT64_MAX) { + if (i->limit_exclusive != i->limit && i->limit_exclusive != UINT64_MAX) + printf("\t Limit: %s (exclusive: %s)\n", + FORMAT_BYTES(i->limit), FORMAT_BYTES(i->limit_exclusive)); + else + printf("\t Limit: %s\n", FORMAT_BYTES(i->limit)); + } +} + +static int show_image_info(sd_bus *bus, const char *path, bool *new_line) { + + static const struct bus_properties_map map[] = { + { "Name", "s", NULL, offsetof(ImageStatusInfo, name) }, 
+ { "Path", "s", NULL, offsetof(ImageStatusInfo, path) }, + { "Type", "s", NULL, offsetof(ImageStatusInfo, type) }, + { "ReadOnly", "b", NULL, offsetof(ImageStatusInfo, read_only) }, + { "CreationTimestamp", "t", NULL, offsetof(ImageStatusInfo, crtime) }, + { "ModificationTimestamp", "t", NULL, offsetof(ImageStatusInfo, mtime) }, + { "Usage", "t", NULL, offsetof(ImageStatusInfo, usage) }, + { "Limit", "t", NULL, offsetof(ImageStatusInfo, limit) }, + { "UsageExclusive", "t", NULL, offsetof(ImageStatusInfo, usage_exclusive) }, + { "LimitExclusive", "t", NULL, offsetof(ImageStatusInfo, limit_exclusive) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + ImageStatusInfo info = {}; + int r; + + assert(bus); + assert(path); + assert(new_line); + + r = bus_map_all_properties(bus, + "org.freedesktop.machine1", + path, + map, + BUS_MAP_BOOLEAN_AS_BOOL, + &error, + &m, + &info); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + if (*new_line) + printf("\n"); + *new_line = true; + + print_image_status_info(bus, &info); + + return r; +} + +typedef struct PoolStatusInfo { + const char *path; + uint64_t usage; + uint64_t limit; +} PoolStatusInfo; + +static void print_pool_status_info(sd_bus *bus, PoolStatusInfo *i) { + if (i->path) + printf("\t Path: %s\n", i->path); + + if (i->usage != UINT64_MAX) + printf("\t Usage: %s\n", FORMAT_BYTES(i->usage)); + + if (i->limit != UINT64_MAX) + printf("\t Limit: %s\n", FORMAT_BYTES(i->limit)); +} + +static int show_pool_info(sd_bus *bus) { + + static const struct bus_properties_map map[] = { + { "PoolPath", "s", NULL, offsetof(PoolStatusInfo, path) }, + { "PoolUsage", "t", NULL, offsetof(PoolStatusInfo, usage) }, + { "PoolLimit", "t", NULL, offsetof(PoolStatusInfo, limit) }, + {} + }; + + PoolStatusInfo info = { + .usage = UINT64_MAX, + .limit = UINT64_MAX, + }; + + 
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + + r = bus_map_all_properties(bus, + "org.freedesktop.machine1", + "/org/freedesktop/machine1", + map, + 0, + &error, + &m, + &info); + if (r < 0) + return log_error_errno(r, "Could not get properties: %s", bus_error_message(&error, r)); + + print_pool_status_info(bus, &info); + + return 0; +} + +static int show_image_properties(sd_bus *bus, const char *path, bool *new_line) { + int r; + + assert(bus); + assert(path); + assert(new_line); + + if (*new_line) + printf("\n"); + + *new_line = true; + + r = bus_print_all_properties(bus, "org.freedesktop.machine1", path, NULL, arg_property, arg_print_flags, NULL); + if (r < 0) + log_error_errno(r, "Could not get properties: %m"); + + return r; +} + +static int show_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool properties, new_line = false; + sd_bus *bus = ASSERT_PTR(userdata); + int r = 0; + + properties = !strstr(argv[0], "status"); + + pager_open(arg_pager_flags); + + if (argc <= 1) { + + /* If no argument is specified, inspect the manager + * itself */ + + if (properties) + r = show_image_properties(bus, "/org/freedesktop/machine1", &new_line); + else + r = show_pool_info(bus); + if (r < 0) + return r; + } + + for (int i = 1; i < argc; i++) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *path = NULL; + + r = bus_call_method(bus, bus_machine_mgr, "GetImage", &error, &reply, "s", argv[i]); + if (r < 0) + return log_error_errno(r, "Could not get path to image: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + if (properties) + r = show_image_properties(bus, path, &new_line); + else + r = show_image_info(bus, path, &new_line); + } + + return r; +} + +static int 
kill_machine(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + if (!arg_kill_whom) + arg_kill_whom = "all"; + + for (int i = 1; i < argc; i++) { + r = bus_call_method( + bus, + bus_machine_mgr, + "KillMachine", + &error, + NULL, + "ssi", argv[i], arg_kill_whom, arg_signal); + if (r < 0) + return log_error_errno(r, "Could not kill machine: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int reboot_machine(int argc, char *argv[], void *userdata) { + arg_kill_whom = "leader"; + arg_signal = SIGINT; /* sysvinit + systemd */ + + return kill_machine(argc, argv, userdata); +} + +static int poweroff_machine(int argc, char *argv[], void *userdata) { + arg_kill_whom = "leader"; + arg_signal = SIGRTMIN+4; /* only systemd */ + + return kill_machine(argc, argv, userdata); +} + +static int terminate_machine(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (int i = 1; i < argc; i++) { + r = bus_call_method(bus, bus_machine_mgr, "TerminateMachine", &error, NULL, "s", argv[i]); + if (r < 0) + return log_error_errno(r, "Could not terminate machine: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static const char *select_copy_method(bool copy_from, bool force) { + if (force) + return copy_from ? "CopyFromMachineWithFlags" : "CopyToMachineWithFlags"; + else + return copy_from ? 
"CopyFromMachine" : "CopyToMachine"; +} + +static int copy_files(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *abs_host_path = NULL; + char *dest, *host_path, *container_path; + sd_bus *bus = ASSERT_PTR(userdata); + bool copy_from; + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + copy_from = streq(argv[0], "copy-from"); + dest = argv[3] ?: argv[2]; + host_path = copy_from ? dest : argv[2]; + container_path = copy_from ? argv[2] : dest; + + if (!path_is_absolute(host_path)) { + r = path_make_absolute_cwd(host_path, &abs_host_path); + if (r < 0) + return log_error_errno(r, "Failed to make path absolute: %m"); + + host_path = abs_host_path; + } + + r = bus_message_new_method_call( + bus, + &m, + bus_machine_mgr, + select_copy_method(copy_from, arg_force)); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "sss", + argv[1], + copy_from ? container_path : host_path, + copy_from ? 
host_path : container_path); + if (r < 0) + return bus_log_create_error(r); + + if (arg_force) { + r = sd_bus_message_append(m, "t", (uint64_t) MACHINE_COPY_REPLACE); + if (r < 0) + return bus_log_create_error(r); + } + + /* This is a slow operation, hence turn off any method call timeouts */ + r = sd_bus_call(bus, m, USEC_INFINITY, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to copy: %s", bus_error_message(&error, r)); + + return 0; +} + +static int bind_mount(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method( + bus, + bus_machine_mgr, + "BindMountMachine", + &error, + NULL, + "sssbb", + argv[1], + argv[2], + argv[3], + arg_read_only, + arg_mkdir); + if (r < 0) + return log_error_errno(r, "Failed to bind mount: %s", bus_error_message(&error, r)); + + return 0; +} + +static int on_machine_removed(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + PTYForward ** forward = (PTYForward**) userdata; + int r; + + assert(m); + assert(forward); + + if (*forward) { + /* If the forwarder is already initialized, tell it to + * exit on the next vhangup(), so that we still flush + * out what might be queued and exit then. 
*/ + + r = pty_forward_set_ignore_vhangup(*forward, false); + if (r >= 0) + return 0; + + log_error_errno(r, "Failed to set ignore_vhangup flag: %m"); + } + + /* On error, or when the forwarder is not initialized yet, quit immediately */ + sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), EXIT_FAILURE); + return 0; +} + +static int process_forward(sd_event *event, PTYForward **forward, int master, PTYForwardFlags flags, const char *name) { + char last_char = 0; + bool machine_died; + int r; + + assert(event); + assert(master >= 0); + assert(name); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGWINCH, SIGTERM, SIGINT, -1) >= 0); + + if (!arg_quiet) { + if (streq(name, ".host")) + log_info("Connected to the local host. Press ^] three times within 1s to exit session."); + else + log_info("Connected to machine %s. Press ^] three times within 1s to exit session.", name); + } + + (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + + r = pty_forward_new(event, master, flags, forward); + if (r < 0) + return log_error_errno(r, "Failed to create PTY forwarder: %m"); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + pty_forward_get_last_char(*forward, &last_char); + + machine_died = + (flags & PTY_FORWARD_IGNORE_VHANGUP) && + pty_forward_get_ignore_vhangup(*forward) == 0; + + *forward = pty_forward_free(*forward); + + if (last_char != '\n') + fputc('\n', stdout); + + if (!arg_quiet) { + if (machine_died) + log_info("Machine %s terminated.", name); + else if (streq(name, ".host")) + log_info("Connection to the local host terminated."); + else + log_info("Connection to machine %s terminated.", name); + } + + return 0; +} + +static int parse_machine_uid(const char *spec, const char **machine, char **uid) { + /* + * Whatever is specified in the spec takes priority over global arguments. 
+ */ + char *_uid = NULL; + const char *_machine = NULL; + + if (spec) { + const char *at; + + at = strchr(spec, '@'); + if (at) { + if (at == spec) + /* Do the same as ssh and refuse "@host". */ + return -EINVAL; + + _machine = at + 1; + _uid = strndup(spec, at - spec); + if (!_uid) + return -ENOMEM; + } else + _machine = spec; + }; + + if (arg_uid && !_uid) { + _uid = strdup(arg_uid); + if (!_uid) + return -ENOMEM; + } + + *uid = _uid; + *machine = isempty(_machine) ? ".host" : _machine; + return 0; +} + +static int login_machine(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot *slot = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int master = -1, r; + sd_bus *bus = ASSERT_PTR(userdata); + const char *match, *machine; + + if (!strv_isempty(arg_setenv) || arg_uid) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--setenv= and --uid= are not supported for 'login'. Use 'shell' instead."); + + if (!IN_SET(arg_transport, BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_MACHINE)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Login only supported on local machines."); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get event loop: %m"); + + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + machine = argc < 2 || isempty(argv[1]) ? 
".host" : argv[1]; + + match = strjoina("type='signal'," + "sender='org.freedesktop.machine1'," + "path='/org/freedesktop/machine1',", + "interface='org.freedesktop.machine1.Manager'," + "member='MachineRemoved'," + "arg0='", machine, "'"); + + r = sd_bus_add_match_async(bus, &slot, match, on_machine_removed, NULL, &forward); + if (r < 0) + return log_error_errno(r, "Failed to request machine removal match: %m"); + + r = bus_call_method(bus, bus_machine_mgr, "OpenMachineLogin", &error, &reply, "s", machine); + if (r < 0) + return log_error_errno(r, "Failed to get login PTY: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "hs", &master, NULL); + if (r < 0) + return bus_log_parse_error(r); + + return process_forward(event, &forward, master, PTY_FORWARD_IGNORE_VHANGUP, machine); +} + +static int shell_machine(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot *slot = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int master = -1, r; + sd_bus *bus = ASSERT_PTR(userdata); + const char *match, *machine, *path; + _cleanup_free_ char *uid = NULL; + + if (!IN_SET(arg_transport, BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_MACHINE)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Shell only supported on local machines."); + + /* Pass $TERM to shell session, if not explicitly specified. 
*/ + if (!strv_find_prefix(arg_setenv, "TERM=")) { + const char *t; + + t = strv_find_prefix(environ, "TERM="); + if (t) { + if (strv_extend(&arg_setenv, t) < 0) + return log_oom(); + } + } + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get event loop: %m"); + + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + r = parse_machine_uid(argc >= 2 ? argv[1] : NULL, &machine, &uid); + if (r < 0) + return log_error_errno(r, "Failed to parse machine specification: %m"); + + match = strjoina("type='signal'," + "sender='org.freedesktop.machine1'," + "path='/org/freedesktop/machine1',", + "interface='org.freedesktop.machine1.Manager'," + "member='MachineRemoved'," + "arg0='", machine, "'"); + + r = sd_bus_add_match_async(bus, &slot, match, on_machine_removed, NULL, &forward); + if (r < 0) + return log_error_errno(r, "Failed to request machine removal match: %m"); + + r = bus_message_new_method_call(bus, &m, bus_machine_mgr, "OpenMachineShell"); + if (r < 0) + return bus_log_create_error(r); + + path = argc < 3 || isempty(argv[2]) ? NULL : argv[2]; + + r = sd_bus_message_append(m, "sss", machine, uid, path); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, strv_length(argv) <= 3 ? 
NULL : argv + 2); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, arg_setenv); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to get shell PTY: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "hs", &master, NULL); + if (r < 0) + return bus_log_parse_error(r); + + return process_forward(event, &forward, master, 0, machine); +} + +static int normalize_nspawn_filename(const char *name, char **ret_file) { + _cleanup_free_ char *file = NULL; + + assert(name); + assert(ret_file); + + if (!endswith(name, ".nspawn")) + file = strjoin(name, ".nspawn"); + else + file = strdup(name); + if (!file) + return log_oom(); + + if (!filename_is_valid(file)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid settings file name '%s'.", file); + + *ret_file = TAKE_PTR(file); + return 0; +} + +static int get_settings_path(const char *name, char **ret_path) { + assert(name); + assert(ret_path); + + FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn", "/var/lib/machines") { + _cleanup_free_ char *path = NULL; + + path = path_join(i, name); + if (!path) + return -ENOMEM; + + if (access(path, F_OK) >= 0) { + *ret_path = TAKE_PTR(path); + return 0; + } + } + + return -ENOENT; +} + +static int edit_settings(int argc, char *argv[], void *userdata) { + _cleanup_(edit_file_context_done) EditFileContext context = {}; + int r; + + if (!on_tty()) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot edit machine settings if not on a tty."); + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Edit is only supported on the host machine."); + + r = mac_init(); + if (r < 0) + return r; + + STRV_FOREACH(name, strv_skip(argv, 1)) { + _cleanup_free_ char *file = NULL, *path = NULL; + + if (path_is_absolute(*name)) { + if (!path_is_safe(*name)) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid settings file path '%s'.", + *name); + + r = edit_files_add(&context, *name, NULL, NULL); + if (r < 0) + return r; + continue; + } + + r = normalize_nspawn_filename(*name, &file); + if (r < 0) + return r; + + r = get_settings_path(file, &path); + if (r == -ENOENT) { + log_debug("No existing settings file for machine '%s' found, creating a new file.", *name); + + path = path_join("/etc/systemd/nspawn", file); + if (!path) + return log_oom(); + + r = edit_files_add(&context, path, NULL, NULL); + if (r < 0) + return r; + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to get the path of the settings file: %m"); + + if (path_startswith(path, "/var/lib/machines")) { + _cleanup_free_ char *new_path = NULL; + + new_path = path_join("/etc/systemd/nspawn", file); + if (!new_path) + return log_oom(); + + r = edit_files_add(&context, new_path, path, NULL); + } else + r = edit_files_add(&context, path, NULL, NULL); + if (r < 0) + return r; + } + + return do_edit_files_and_install(&context); +} + +static int cat_settings(int argc, char *argv[], void *userdata) { + int r = 0; + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cat is only supported on the host machine."); + + pager_open(arg_pager_flags); + + STRV_FOREACH(name, strv_skip(argv, 1)) { + _cleanup_free_ char *file = NULL, *path = NULL; + int q; + + if (path_is_absolute(*name)) { + if (!path_is_safe(*name)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid settings file path '%s'.", + *name); + + q = cat_files(*name, /* dropins = */ NULL, /* flags = */ CAT_FORMAT_HAS_SECTIONS); + if (q < 0) + return r < 0 ? r : q; + continue; + } + + q = normalize_nspawn_filename(*name, &file); + if (q < 0) + return r < 0 ? r : q; + + q = get_settings_path(file, &path); + if (q == -ENOENT) { + log_error_errno(q, "No settings file found for machine '%s'.", *name); + r = r < 0 ? 
r : q; + continue; + } + if (q < 0) { + log_error_errno(q, "Failed to get the path of the settings file: %m"); + return r < 0 ? r : q; + } + + q = cat_files(path, /* dropins = */ NULL, /* flags = */ CAT_FORMAT_HAS_SECTIONS); + if (q < 0) + return r < 0 ? r : q; + } + + return r; +} + +static int remove_image(int argc, char *argv[], void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (int i = 1; i < argc; i++) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + r = bus_message_new_method_call(bus, &m, bus_machine_mgr, "RemoveImage"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", argv[i]); + if (r < 0) + return bus_log_create_error(r); + + /* This is a slow operation, hence turn off any method call timeouts */ + r = sd_bus_call(bus, m, USEC_INFINITY, &error, NULL); + if (r < 0) + return log_error_errno(r, "Could not remove image: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int rename_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method( + bus, + bus_machine_mgr, + "RenameImage", + &error, + NULL, + "ss", argv[1], argv[2]); + if (r < 0) + return log_error_errno(r, "Could not rename image: %s", bus_error_message(&error, r)); + + return 0; +} + +static int clone_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_message_new_method_call(bus, &m, bus_machine_mgr, "CloneImage"); + if (r < 0) 
+ return bus_log_create_error(r); + + r = sd_bus_message_append(m, "ssb", argv[1], argv[2], arg_read_only); + if (r < 0) + return bus_log_create_error(r); + + /* This is a slow operation, hence turn off any method call timeouts */ + r = sd_bus_call(bus, m, USEC_INFINITY, &error, NULL); + if (r < 0) + return log_error_errno(r, "Could not clone image: %s", bus_error_message(&error, r)); + + return 0; +} + +static int read_only_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int b = true, r; + + if (argc > 2) { + b = parse_boolean(argv[2]); + if (b < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse boolean argument: %s", + argv[2]); + } + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method(bus, bus_machine_mgr, "MarkImageReadOnly", &error, NULL, "sb", argv[1], b); + if (r < 0) + return log_error_errno(r, "Could not mark image read-only: %s", bus_error_message(&error, r)); + + return 0; +} + +static int image_exists(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(name); + + r = bus_call_method(bus, bus_machine_mgr, "GetImage", &error, NULL, "s", name); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_IMAGE)) + return 0; + + return log_error_errno(r, "Failed to check whether image %s exists: %s", name, bus_error_message(&error, r)); + } + + return 1; +} + +static int make_service_name(const char *name, char **ret) { + int r; + + assert(name); + assert(ret); + + if (!hostname_is_valid(name, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid machine name %s.", name); + + r = unit_name_build("systemd-nspawn", name, ".service", ret); + if (r < 0) + return log_error_errno(r, "Failed to build unit name: %m"); + + return 0; +} + +static int start_machine(int argc, char *argv[], void 
*userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + ask_password_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + + for (int i = 1; i < argc; i++) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *unit = NULL; + const char *object; + + r = make_service_name(argv[i], &unit); + if (r < 0) + return r; + + r = image_exists(bus, argv[i]); + if (r < 0) + return r; + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), + "Machine image '%s' does not exist.", + argv[i]); + + r = bus_call_method( + bus, + bus_systemd_mgr, + "StartUnit", + &error, + &reply, + "ss", unit, "fail"); + if (r < 0) + return log_error_errno(r, "Failed to start unit: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_add(w, object); + if (r < 0) + return log_oom(); + } + + r = bus_wait_for_jobs(w, arg_quiet, NULL); + if (r < 0) + return r; + + return 0; +} + +static int enable_machine(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *method; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + bool enable; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + enable = streq(argv[0], "enable"); + method = enable ? 
"EnableUnitFiles" : "DisableUnitFiles"; + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, method); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return bus_log_create_error(r); + + if (enable) { + r = sd_bus_message_append(m, "s", "machines.target"); + if (r < 0) + return bus_log_create_error(r); + } + + for (int i = 1; i < argc; i++) { + _cleanup_free_ char *unit = NULL; + + r = make_service_name(argv[i], &unit); + if (r < 0) + return r; + + r = image_exists(bus, argv[i]); + if (r < 0) + return r; + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), + "Machine image '%s' does not exist.", + argv[i]); + + r = sd_bus_message_append(m, "s", unit); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + if (enable) + r = sd_bus_message_append(m, "bb", false, false); + else + r = sd_bus_message_append(m, "b", false); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to enable or disable unit: %s", bus_error_message(&error, r)); + + if (enable) { + r = sd_bus_message_read(reply, "b", NULL); + if (r < 0) + return bus_log_parse_error(r); + } + + r = bus_deserialize_and_dump_unit_file_changes(reply, arg_quiet); + if (r < 0) + return r; + + r = bus_service_manager_reload(bus); + if (r < 0) + return r; + + if (arg_now) { + _cleanup_strv_free_ char **new_args = NULL; + + new_args = strv_new(enable ? 
"start" : "poweroff"); + if (!new_args) + return log_oom(); + + r = strv_extend_strv(&new_args, argv + 1, /* filter_duplicates = */ false); + if (r < 0) + return log_oom(); + + if (enable) + return start_machine(strv_length(new_args), new_args, userdata); + + return poweroff_machine(strv_length(new_args), new_args, userdata); + } + + return 0; +} + +static int match_log_message(sd_bus_message *m, void *userdata, sd_bus_error *error) { + const char **our_path = userdata, *line; + unsigned priority; + int r; + + assert(m); + assert(our_path); + + r = sd_bus_message_read(m, "us", &priority, &line); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (!streq_ptr(*our_path, sd_bus_message_get_path(m))) + return 0; + + if (arg_quiet && LOG_PRI(priority) >= LOG_INFO) + return 0; + + log_full(priority, "%s", line); + return 0; +} + +static int match_transfer_removed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + const char **our_path = userdata, *path, *result; + uint32_t id; + int r; + + assert(m); + assert(our_path); + + r = sd_bus_message_read(m, "uos", &id, &path, &result); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (!streq_ptr(*our_path, path)) + return 0; + + sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), !streq_ptr(result, "done")); + return 0; +} + +static int transfer_signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + assert(s); + assert(si); + + if (!arg_quiet) + log_info("Continuing download in the background. 
Use \"machinectl cancel-transfer %" PRIu32 "\" to abort transfer.", PTR_TO_UINT32(userdata)); + + sd_event_exit(sd_event_source_get_event(s), EINTR); + return 0; +} + +static int transfer_image_common(sd_bus *bus, sd_bus_message *m) { + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot *slot_job_removed = NULL, *slot_log_message = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_event_unrefp) sd_event* event = NULL; + const char *path = NULL; + uint32_t id; + int r; + + assert(bus); + assert(m); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get event loop: %m"); + + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + r = bus_match_signal_async( + bus, + &slot_job_removed, + bus_import_mgr, + "TransferRemoved", + match_transfer_removed, NULL, &path); + if (r < 0) + return log_error_errno(r, "Failed to request match: %m"); + + r = sd_bus_match_signal_async( + bus, + &slot_log_message, + "org.freedesktop.import1", + NULL, + "org.freedesktop.import1.Transfer", + "LogMessage", + match_log_message, NULL, &path); + if (r < 0) + return log_error_errno(r, "Failed to request match: %m"); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to transfer image: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "uo", &id, &path); + if (r < 0) + return bus_log_parse_error(r); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + if (!arg_quiet) + log_info("Enqueued transfer job %u. 
Press C-c to continue download in background.", id); + + (void) sd_event_add_signal(event, NULL, SIGINT, transfer_signal_handler, UINT32_TO_PTR(id)); + (void) sd_event_add_signal(event, NULL, SIGTERM, transfer_signal_handler, UINT32_TO_PTR(id)); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + return -r; +} + +static int import_tar(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *ll = NULL, *fn = NULL; + const char *local = NULL, *path = NULL; + _cleanup_close_ int fd = -EBADF; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) + path = empty_or_dash_to_null(argv[1]); + + if (argc >= 3) + local = empty_or_dash_to_null(argv[2]); + else if (path) { + r = path_extract_filename(path, &fn); + if (r < 0) + return log_error_errno(r, "Cannot extract container name from filename: %m"); + if (r == O_DIRECTORY) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), + "Path '%s' refers to directory, but we need a regular file: %m", path); + + local = fn; + } + if (!local) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Need either path or local name."); + + r = tar_strip_suffixes(local, &ll); + if (r < 0) + return log_oom(); + + local = ll; + + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Local name %s is not a suitable machine name.", + local); + + if (path) { + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", path); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "ImportTar"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "hsbb", + fd >= 0 ? 
fd : STDIN_FILENO, + local, + arg_force, + arg_read_only); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +static int import_raw(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *ll = NULL, *fn = NULL; + const char *local = NULL, *path = NULL; + _cleanup_close_ int fd = -EBADF; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) + path = empty_or_dash_to_null(argv[1]); + + if (argc >= 3) + local = empty_or_dash_to_null(argv[2]); + else if (path) { + r = path_extract_filename(path, &fn); + if (r < 0) + return log_error_errno(r, "Cannot extract container name from filename: %m"); + if (r == O_DIRECTORY) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), + "Path '%s' refers to directory, but we need a regular file: %m", path); + + local = fn; + } + if (!local) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Need either path or local name."); + + r = raw_strip_suffixes(local, &ll); + if (r < 0) + return log_oom(); + + local = ll; + + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Local name %s is not a suitable machine name.", + local); + + if (path) { + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", path); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "ImportRaw"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "hsbb", + fd >= 0 ? 
fd : STDIN_FILENO, + local, + arg_force, + arg_read_only); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +static int import_fs(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + const char *local = NULL, *path = NULL; + _cleanup_free_ char *fn = NULL; + _cleanup_close_ int fd = -EBADF; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) + path = empty_or_dash_to_null(argv[1]); + + if (argc >= 3) + local = empty_or_dash_to_null(argv[2]); + else if (path) { + r = path_extract_filename(path, &fn); + if (r < 0) + return log_error_errno(r, "Cannot extract container name from filename: %m"); + + local = fn; + } + if (!local) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Need either path or local name."); + + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Local name %s is not a suitable machine name.", + local); + + if (path) { + fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open directory '%s': %m", path); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "ImportFileSystem"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "hsbb", + fd >= 0 ? 
fd : STDIN_FILENO, + local, + arg_force, + arg_read_only); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +static void determine_compression_from_filename(const char *p) { + if (arg_format) + return; + + if (!p) + return; + + if (endswith(p, ".xz")) + arg_format = "xz"; + else if (endswith(p, ".gz")) + arg_format = "gzip"; + else if (endswith(p, ".bz2")) + arg_format = "bzip2"; +} + +static int export_tar(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_close_ int fd = -EBADF; + const char *local = NULL, *path = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + local = argv[1]; + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Machine name %s is not valid.", local); + + if (argc >= 3) + path = argv[2]; + path = empty_or_dash_to_null(path); + + if (path) { + determine_compression_from_filename(path); + + fd = open(path, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC|O_NOCTTY, 0666); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", path); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "ExportTar"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "shs", + local, + fd >= 0 ? 
fd : STDOUT_FILENO, + arg_format); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +static int export_raw(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_close_ int fd = -EBADF; + const char *local = NULL, *path = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + local = argv[1]; + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Machine name %s is not valid.", local); + + if (argc >= 3) + path = argv[2]; + path = empty_or_dash_to_null(path); + + if (path) { + determine_compression_from_filename(path); + + fd = open(path, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC|O_NOCTTY, 0666); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", path); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "ExportRaw"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "shs", + local, + fd >= 0 ? 
fd : STDOUT_FILENO, + arg_format); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +static int pull_tar(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *l = NULL, *ll = NULL; + const char *local, *remote; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + remote = argv[1]; + if (!http_url_is_valid(remote) && !file_url_is_valid(remote)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "URL '%s' is not valid.", remote); + + if (argc >= 3) + local = argv[2]; + else { + r = import_url_last_component(remote, &l); + if (r < 0) + return log_error_errno(r, "Failed to get final component of URL: %m"); + + local = l; + } + + local = empty_or_dash_to_null(local); + + if (local) { + r = tar_strip_suffixes(local, &ll); + if (r < 0) + return log_oom(); + + local = ll; + + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Local name %s is not a suitable machine name.", + local); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "PullTar"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "sssb", + remote, + local, + import_verify_to_string(arg_verify), + arg_force); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +static int pull_raw(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *l = NULL, *ll = NULL; + const char *local, *remote; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + remote = argv[1]; + if (!http_url_is_valid(remote) && !file_url_is_valid(remote)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "URL '%s' is not valid.", remote); + + if (argc >= 3) + local = argv[2]; + else { + r = import_url_last_component(remote, &l); + if (r < 0) + return log_error_errno(r, "Failed to get final component of URL: %m"); + + local = l; + } + + 
local = empty_or_dash_to_null(local); + + if (local) { + r = raw_strip_suffixes(local, &ll); + if (r < 0) + return log_oom(); + + local = ll; + + if (!hostname_is_valid(local, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Local name %s is not a suitable machine name.", + local); + } + + r = bus_message_new_method_call(bus, &m, bus_import_mgr, "PullRaw"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "sssb", + remote, + local, + import_verify_to_string(arg_verify), + arg_force); + if (r < 0) + return bus_log_create_error(r); + + return transfer_image_common(bus, m); +} + +typedef struct TransferInfo { + uint32_t id; + const char *type; + const char *remote; + const char *local; + double progress; +} TransferInfo; + +static int compare_transfer_info(const TransferInfo *a, const TransferInfo *b) { + return strcmp(a->local, b->local); +} + +static int list_transfers(int argc, char *argv[], void *userdata) { + size_t max_type = STRLEN("TYPE"), max_local = STRLEN("LOCAL"), max_remote = STRLEN("REMOTE"); + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ TransferInfo *transfers = NULL; + const char *type, *remote, *local; + sd_bus *bus = userdata; + uint32_t id, max_id = 0; + size_t n_transfers = 0; + double progress; + int r; + + pager_open(arg_pager_flags); + + r = bus_call_method(bus, bus_import_mgr, "ListTransfers", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Could not get transfers: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "(usssdo)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(usssdo)", &id, &type, &remote, &local, &progress, NULL)) > 0) { + size_t l; + + if (!GREEDY_REALLOC(transfers, n_transfers + 1)) + return log_oom(); + + transfers[n_transfers].id = id; + transfers[n_transfers].type = type; + 
transfers[n_transfers].remote = remote; + transfers[n_transfers].local = local; + transfers[n_transfers].progress = progress; + + l = strlen(type); + if (l > max_type) + max_type = l; + + l = strlen(remote); + if (l > max_remote) + max_remote = l; + + l = strlen(local); + if (l > max_local) + max_local = l; + + if (id > max_id) + max_id = id; + + n_transfers++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + typesafe_qsort(transfers, n_transfers, compare_transfer_info); + + if (arg_legend && n_transfers > 0) + printf("%-*s %-*s %-*s %-*s %-*s\n", + (int) MAX(2U, DECIMAL_STR_WIDTH(max_id)), "ID", + (int) 7, "PERCENT", + (int) max_type, "TYPE", + (int) max_local, "LOCAL", + (int) max_remote, "REMOTE"); + + for (size_t j = 0; j < n_transfers; j++) + + if (transfers[j].progress < 0) + printf("%*" PRIu32 " %*s %-*s %-*s %-*s\n", + (int) MAX(2U, DECIMAL_STR_WIDTH(max_id)), transfers[j].id, + (int) 7, "n/a", + (int) max_type, transfers[j].type, + (int) max_local, transfers[j].local, + (int) max_remote, transfers[j].remote); + else + printf("%*" PRIu32 " %*u%% %-*s %-*s %-*s\n", + (int) MAX(2U, DECIMAL_STR_WIDTH(max_id)), transfers[j].id, + (int) 6, (unsigned) (transfers[j].progress * 100), + (int) max_type, transfers[j].type, + (int) max_local, transfers[j].local, + (int) max_remote, transfers[j].remote); + + if (arg_legend) { + if (n_transfers > 0) + printf("\n%zu transfers listed.\n", n_transfers); + else + printf("No transfers.\n"); + } + + return 0; +} + +static int cancel_transfer(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (int i = 1; i < argc; i++) { + uint32_t id; + + r = safe_atou32(argv[i], &id); + if (r < 0) + return log_error_errno(r, "Failed to parse transfer id: %s", 
argv[i]); + + r = bus_call_method(bus, bus_import_mgr, "CancelTransfer", &error, NULL, "u", id); + if (r < 0) + return log_error_errno(r, "Could not cancel transfer: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int set_limit(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + uint64_t limit; + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + if (STR_IN_SET(argv[argc-1], "-", "none", "infinity")) + limit = UINT64_MAX; + else { + r = parse_size(argv[argc-1], 1024, &limit); + if (r < 0) + return log_error_errno(r, "Failed to parse size: %s", argv[argc-1]); + } + + if (argc > 2) + /* With two arguments changes the quota limit of the + * specified image */ + r = bus_call_method(bus, bus_machine_mgr, "SetImageLimit", &error, NULL, "st", argv[1], limit); + else + /* With one argument changes the pool quota limit */ + r = bus_call_method(bus, bus_machine_mgr, "SetPoolLimit", &error, NULL, "t", limit); + + if (r < 0) + return log_error_errno(r, "Could not set limit: %s", bus_error_message(&error, r)); + + return 0; +} + +static int clean_images(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + uint64_t usage, total = 0; + sd_bus *bus = userdata; + const char *name; + unsigned c = 0; + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_message_new_method_call(bus, &m, bus_machine_mgr, "CleanPool"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", arg_all ? "all" : "hidden"); + if (r < 0) + return bus_log_create_error(r); + + /* This is a slow operation, hence permit a longer time for completion. 
*/ + r = sd_bus_call(bus, m, USEC_INFINITY, &error, &reply); + if (r < 0) + return log_error_errno(r, "Could not clean pool: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 'a', "(st)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(st)", &name, &usage)) > 0) { + if (usage == UINT64_MAX) { + log_info("Removed image '%s'", name); + total = UINT64_MAX; + } else { + log_info("Removed image '%s'. Freed exclusive disk space: %s", + name, FORMAT_BYTES(usage)); + if (total != UINT64_MAX) + total += usage; + } + c++; + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (total == UINT64_MAX) + log_info("Removed %u images in total.", c); + else + log_info("Removed %u images in total. Total freed exclusive disk space: %s.", + c, FORMAT_BYTES(total)); + + return 0; +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("machinectl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND ...\n\n" + "%5$sSend control commands to or query the virtual machine and container%6$s\n" + "%5$sregistration manager.%6$s\n" + "\n%3$sMachine Commands:%4$s\n" + " list List running VMs and containers\n" + " status NAME... Show VM/container details\n" + " show [NAME...] Show properties of one or more VMs/containers\n" + " start NAME... Start container as a service\n" + " login [NAME] Get a login prompt in a container or on the\n" + " local host\n" + " shell [[USER@]NAME [COMMAND...]]\n" + " Invoke a shell (or other command) in a container\n" + " or on the local host\n" + " enable NAME... Enable automatic container start at boot\n" + " disable NAME... Disable automatic container start at boot\n" + " poweroff NAME... Power off one or more containers\n" + " reboot NAME... Reboot one or more containers\n" + " terminate NAME... 
Terminate one or more VMs/containers\n" + " kill NAME... Send signal to processes of a VM/container\n" + " copy-to NAME PATH [PATH] Copy files from the host to a container\n" + " copy-from NAME PATH [PATH] Copy files from a container to the host\n" + " bind NAME PATH [PATH] Bind mount a path from the host into a container\n" + "\n%3$sImage Commands:%4$s\n" + " list-images Show available container and VM images\n" + " image-status [NAME...] Show image details\n" + " show-image [NAME...] Show properties of image\n" + " edit NAME|FILE... Edit settings of one or more VMs/containers\n" + " cat NAME|FILE... Show settings of one or more VMs/containers\n" + " clone NAME NAME Clone an image\n" + " rename NAME NAME Rename an image\n" + " read-only NAME [BOOL] Mark or unmark image read-only\n" + " remove NAME... Remove an image\n" + " set-limit [NAME] BYTES Set image or pool size limit (disk quota)\n" + " clean Remove hidden (or all) images\n" + "\n%3$sImage Transfer Commands:%4$s\n" + " pull-tar URL [NAME] Download a TAR container image\n" + " pull-raw URL [NAME] Download a RAW container or VM image\n" + " import-tar FILE [NAME] Import a local TAR container image\n" + " import-raw FILE [NAME] Import a local RAW container or VM image\n" + " import-fs DIRECTORY [NAME] Import a local directory container image\n" + " export-tar NAME [FILE] Export a TAR container image locally\n" + " export-raw NAME [FILE] Export a RAW container or VM image locally\n" + " list-transfers Show list of downloads in progress\n" + " cancel-transfer Cancel a download\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --no-ask-password Do not ask for system passwords\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " -p --property=NAME Show only properties by this name\n" + " -q --quiet 
Suppress output\n" + " -a --all Show all properties, including empty ones\n" + " --value When showing properties, only print the value\n" + " -l --full Do not ellipsize output\n" + " --kill-whom=WHOM Whom to send signal to\n" + " -s --signal=SIGNAL Which signal to send\n" + " --uid=USER Specify user ID to invoke shell as\n" + " -E --setenv=VAR[=VALUE] Add an environment variable for shell\n" + " --read-only Create read-only bind mount\n" + " --mkdir Create directory before bind mounting, if missing\n" + " -n --lines=INTEGER Number of journal entries to show\n" + " --max-addresses=INTEGER Number of internet addresses to show at most\n" + " -o --output=STRING Change journal output mode (short, short-precise,\n" + " short-iso, short-iso-precise, short-full,\n" + " short-monotonic, short-unix, short-delta,\n" + " json, json-pretty, json-sse, json-seq, cat,\n" + " verbose, export, with-unit)\n" + " --verify=MODE Verification mode for downloaded images (no,\n" + " checksum, signature)\n" + " --force Download image even if already exists\n" + " --now Start or power off container after enabling or\n" + " disabling it\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_VALUE, + ARG_KILL_WHOM, + ARG_READ_ONLY, + ARG_MKDIR, + ARG_NO_ASK_PASSWORD, + ARG_VERIFY, + ARG_NOW, + ARG_FORCE, + ARG_FORMAT, + ARG_UID, + ARG_MAX_ADDRESSES, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "property", required_argument, NULL, 'p' }, + { "all", no_argument, NULL, 'a' }, + { "value", no_argument, NULL, ARG_VALUE }, + { "full", no_argument, NULL, 'l' }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { 
"kill-whom", required_argument, NULL, ARG_KILL_WHOM }, + { "signal", required_argument, NULL, 's' }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "read-only", no_argument, NULL, ARG_READ_ONLY }, + { "mkdir", no_argument, NULL, ARG_MKDIR }, + { "quiet", no_argument, NULL, 'q' }, + { "lines", required_argument, NULL, 'n' }, + { "output", required_argument, NULL, 'o' }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "verify", required_argument, NULL, ARG_VERIFY }, + { "now", no_argument, NULL, ARG_NOW }, + { "force", no_argument, NULL, ARG_FORCE }, + { "format", required_argument, NULL, ARG_FORMAT }, + { "uid", required_argument, NULL, ARG_UID }, + { "setenv", required_argument, NULL, 'E' }, + { "max-addresses", required_argument, NULL, ARG_MAX_ADDRESSES }, + {} + }; + + bool reorder = false; + int c, r, shell = -1; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + + for (;;) { + static const char option_string[] = "-hp:als:H:M:qn:o:E:"; + + c = getopt_long(argc, argv, option_string + reorder, options, NULL); + if (c < 0) + break; + + switch (c) { + + case 1: /* getopt_long() returns 1 if "-" was the first character of the option string, and a + * non-option argument was discovered. */ + + assert(!reorder); + + /* We generally are fine with the fact that getopt_long() reorders the command line, and looks + * for switches after the main verb. However, for "shell" we really don't want that, since we + * want that switches specified after the machine name are passed to the program to execute, + * and not processed by us. To make this possible, we'll first invoke getopt_long() with + * reordering disabled (i.e. with the "-" prefix in the option string), looking for the first + * non-option parameter. 
If it's the verb "shell" we remember its position and continue + * processing options. In this case, as soon as we hit the next non-option argument we found + * the machine name, and stop further processing. If the first non-option argument is any other + * verb than "shell" we switch to normal reordering mode and continue processing arguments + * normally. */ + + if (shell >= 0) { + /* If we already found the "shell" verb on the command line, and now found the next + * non-option argument, then this is the machine name and we should stop processing + * further arguments. */ + optind --; /* don't process this argument, go one step back */ + goto done; + } + if (streq(optarg, "shell")) + /* Remember the position of the "shell" verb, and continue processing normally. */ + shell = optind - 1; + else { + int saved_optind; + + /* OK, this is some other verb. In this case, turn on reordering again, and continue + * processing normally. */ + reorder = true; + + /* We changed the option string. getopt_long() only looks at it again if we invoke it + * at least once with a reset option index. Hence, let's reset the option index here, + * then invoke getopt_long() again (ignoring what it has to say, after all we most + * likely already processed it), and then move the option index one step back so that we read the + * intended argument again. */ + saved_optind = optind; + optind = 0; + (void) getopt_long(argc, argv, option_string + reorder, options, NULL); + optind = saved_optind - 1; /* go one step back, process this argument again */ + } + + break; + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case 'p': + r = strv_extend(&arg_property, optarg); + if (r < 0) + return log_oom(); + + /* If the user asked for a particular + * property, show it to them, even if it is + * empty. 
*/ + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + break; + + case 'a': + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + arg_all = true; + break; + + case ARG_VALUE: + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_ONLY_VALUE, true); + break; + + case 'l': + arg_full = true; + break; + + case 'n': + if (safe_atou(optarg, &arg_lines) < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse lines '%s'", optarg); + break; + + case 'o': + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(output_mode, OutputMode, _OUTPUT_MODE_MAX); + return 0; + } + + r = output_mode_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Unknown output '%s'.", optarg); + arg_output = r; + + if (OUTPUT_MODE_IS_JSON(arg_output)) + arg_legend = false; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_KILL_WHOM: + arg_kill_whom = optarg; + break; + + case 's': + r = parse_signal_argument(optarg, &arg_signal); + if (r <= 0) + return r; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case ARG_READ_ONLY: + arg_read_only = true; + break; + + case ARG_MKDIR: + arg_mkdir = true; + break; + + case 'q': + arg_quiet = true; + break; + + case ARG_VERIFY: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(import_verify, ImportVerify, _IMPORT_VERIFY_MAX); + return 0; + } + + r = import_verify_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --verify= setting: %s", optarg); + arg_verify = r; + break; + + case ARG_NOW: + arg_now = true; + break; + + case ARG_FORCE: + arg_force = true; + break; + + case ARG_FORMAT: + if (!STR_IN_SET(optarg, "uncompressed", "xz", "gzip", "bzip2")) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown format: %s", optarg); + + arg_format = optarg; + break; + + case ARG_UID: + arg_uid = optarg; + break; + + case 'E': + r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg); + if (r < 0) + return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg); + break; + + case ARG_MAX_ADDRESSES: + if (streq(optarg, "all")) + arg_max_addresses = UINT_MAX; + else if (safe_atou(optarg, &arg_max_addresses) < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid number of addresses: %s", optarg); + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + } + +done: + if (shell >= 0) { + char *t; + + /* We found the "shell" verb while processing the argument list. Since we turned off reordering of the + * argument list initially let's readjust it now, and move the "shell" verb to the back. */ + + optind -= 1; /* place the option index where the "shell" verb will be placed */ + + t = argv[shell]; + for (int i = shell; i < optind; i++) + argv[i] = argv[i+1]; + argv[optind] = t; + } + + return 1; +} + +static int machinectl_main(int argc, char *argv[], sd_bus *bus) { + + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "list", VERB_ANY, 1, VERB_DEFAULT, list_machines }, + { "list-images", VERB_ANY, 1, 0, list_images }, + { "status", 2, VERB_ANY, 0, show_machine }, + { "image-status", VERB_ANY, VERB_ANY, 0, show_image }, + { "show", VERB_ANY, VERB_ANY, 0, show_machine }, + { "show-image", VERB_ANY, VERB_ANY, 0, show_image }, + { "terminate", 2, VERB_ANY, 0, terminate_machine }, + { "reboot", 2, VERB_ANY, 0, reboot_machine }, + { "poweroff", 2, VERB_ANY, 0, poweroff_machine }, + { "stop", 2, VERB_ANY, 0, poweroff_machine }, /* Convenience alias */ + { "kill", 2, VERB_ANY, 0, kill_machine }, + { "login", VERB_ANY, 2, 0, login_machine }, + { "shell", VERB_ANY, VERB_ANY, 0, shell_machine }, + { "bind", 3, 4, 0, bind_mount }, + { "edit", 2, VERB_ANY, 
0, edit_settings }, + { "cat", 2, VERB_ANY, 0, cat_settings }, + { "copy-to", 3, 4, 0, copy_files }, + { "copy-from", 3, 4, 0, copy_files }, + { "remove", 2, VERB_ANY, 0, remove_image }, + { "rename", 3, 3, 0, rename_image }, + { "clone", 3, 3, 0, clone_image }, + { "read-only", 2, 3, 0, read_only_image }, + { "start", 2, VERB_ANY, 0, start_machine }, + { "enable", 2, VERB_ANY, 0, enable_machine }, + { "disable", 2, VERB_ANY, 0, enable_machine }, + { "import-tar", 2, 3, 0, import_tar }, + { "import-raw", 2, 3, 0, import_raw }, + { "import-fs", 2, 3, 0, import_fs }, + { "export-tar", 2, 3, 0, export_tar }, + { "export-raw", 2, 3, 0, export_raw }, + { "pull-tar", 2, 3, 0, pull_tar }, + { "pull-raw", 2, 3, 0, pull_raw }, + { "list-transfers", VERB_ANY, 1, 0, list_transfers }, + { "cancel-transfer", 2, VERB_ANY, 0, cancel_transfer }, + { "set-limit", 2, 3, 0, set_limit }, + { "clean", VERB_ANY, 1, 0, clean_images }, + {} + }; + + return dispatch_verb(argc, argv, verbs, bus); +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + setlocale(LC_ALL, ""); + log_setup(); + + /* The journal merging logic potentially needs a lot of fds. 
*/ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + sigbus_install(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, &bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + (void) sd_bus_set_allow_interactive_authorization(bus, arg_ask_password); + + return machinectl_main(argc, argv, bus); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/machine/machined-core.c b/src/machine/machined-core.c new file mode 100644 index 0000000..ffca209 --- /dev/null +++ b/src/machine/machined-core.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "machined.h" +#include "nscd-flush.h" +#include "strv.h" +#include "user-util.h" + +#if ENABLE_NSCD +static int on_nscd_cache_flush_event(sd_event_source *s, void *userdata) { + /* Let's ask glibc's nscd daemon to flush its caches. We request this for the three database machines may show + * up in: the hosts database (for resolvable machine names) and the user and group databases (for the user ns + * ranges). 
*/ + + (void) nscd_flush_cache(STRV_MAKE("passwd", "group", "hosts")); + return 0; +} + +int manager_enqueue_nscd_cache_flush(Manager *m) { + int r; + + assert(m); + + if (!m->nscd_cache_flush_event) { + r = sd_event_add_defer(m->event, &m->nscd_cache_flush_event, on_nscd_cache_flush_event, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate NSCD cache flush event: %m"); + + sd_event_source_set_description(m->nscd_cache_flush_event, "nscd-cache-flush"); + } + + r = sd_event_source_set_enabled(m->nscd_cache_flush_event, SD_EVENT_ONESHOT); + if (r < 0) { + m->nscd_cache_flush_event = sd_event_source_unref(m->nscd_cache_flush_event); + return log_error_errno(r, "Failed to enable NSCD cache flush event: %m"); + } + + return 0; +} +#endif + +int manager_find_machine_for_uid(Manager *m, uid_t uid, Machine **ret_machine, uid_t *ret_internal_uid) { + Machine *machine; + int r; + + assert(m); + assert(uid_is_valid(uid)); + + /* Finds the machine for the specified host UID and returns it along with the UID translated into the + * internal UID inside the machine */ + + HASHMAP_FOREACH(machine, m->machines) { + uid_t converted; + + r = machine_owns_uid(machine, uid, &converted); + if (r < 0) + return r; + if (r) { + if (ret_machine) + *ret_machine = machine; + + if (ret_internal_uid) + *ret_internal_uid = converted; + + return true; + } + } + + if (ret_machine) + *ret_machine = NULL; + if (ret_internal_uid) + *ret_internal_uid = UID_INVALID; + + return false; +} + +int manager_find_machine_for_gid(Manager *m, gid_t gid, Machine **ret_machine, gid_t *ret_internal_gid) { + Machine *machine; + int r; + + assert(m); + assert(gid_is_valid(gid)); + + HASHMAP_FOREACH(machine, m->machines) { + gid_t converted; + + r = machine_owns_gid(machine, gid, &converted); + if (r < 0) + return r; + if (r) { + if (ret_machine) + *ret_machine = machine; + + if (ret_internal_gid) + *ret_internal_gid = converted; + + return true; + } + } + + if (ret_machine) + *ret_machine = NULL; + if 
(ret_internal_gid) + *ret_internal_gid = GID_INVALID; + + return false; +} diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c new file mode 100644 index 0000000..9fec047 --- /dev/null +++ b/src/machine/machined-dbus.c @@ -0,0 +1,1516 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-locator.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "discover-image.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "hostname-util.h" +#include "image-dbus.h" +#include "io-util.h" +#include "machine-dbus.h" +#include "machine-pool.h" +#include "machined.h" +#include "missing_capability.h" +#include "os-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "unit-name.h" +#include "user-util.h" + +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_pool_path, "s", "/var/lib/machines"); + +static int property_get_pool_usage( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_close_ int fd = -EBADF; + uint64_t usage = UINT64_MAX; + + assert(bus); + assert(reply); + + fd = open("/var/lib/machines", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd >= 0) { + BtrfsQuotaInfo q; + + if (btrfs_subvol_get_subtree_quota_fd(fd, 0, &q) >= 0) + usage = q.referenced; + } + + return sd_bus_message_append(reply, "t", usage); +} + +static int property_get_pool_limit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_close_ int fd = -EBADF; + uint64_t size = UINT64_MAX; + + assert(bus); + assert(reply); + + fd = 
open("/var/lib/machines", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd >= 0) { + BtrfsQuotaInfo q; + + if (btrfs_subvol_get_subtree_quota_fd(fd, 0, &q) >= 0) + size = q.referenced_max; + } + + return sd_bus_message_append(reply, "t", size); +} + +static int method_get_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + Machine *machine; + const char *name; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + machine = hashmap_get(m->machines, name); + if (!machine) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_MACHINE, "No machine '%s' known", name); + + p = machine_bus_path(machine); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_get_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + _unused_ Manager *m = ASSERT_PTR(userdata); + const char *name; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = image_find(IMAGE_MACHINE, name, NULL, NULL); + if (r == -ENOENT) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_IMAGE, "No image '%s' known", name); + if (r < 0) + return r; + + p = image_bus_path(name); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_get_machine_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + Machine *machine = NULL; + pid_t pid; + int r; + + assert(message); + + assert_cc(sizeof(pid_t) == sizeof(uint32_t)); + + r = sd_bus_message_read(message, "u", &pid); + if (r < 0) + return r; + + if (pid < 0) + return -EINVAL; + + if (pid == 0) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) 
+ return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } + + r = manager_get_machine_by_pid(m, pid, &machine); + if (r < 0) + return r; + if (!machine) + return sd_bus_error_setf(error, BUS_ERROR_NO_MACHINE_FOR_PID, "PID "PID_FMT" does not belong to any known machine", pid); + + p = machine_bus_path(machine); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_list_machines(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *m = ASSERT_PTR(userdata); + Machine *machine; + int r; + + assert(message); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return sd_bus_error_set_errno(error, r); + + r = sd_bus_message_open_container(reply, 'a', "(ssso)"); + if (r < 0) + return sd_bus_error_set_errno(error, r); + + HASHMAP_FOREACH(machine, m->machines) { + _cleanup_free_ char *p = NULL; + + p = machine_bus_path(machine); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(ssso)", + machine->name, + strempty(machine_class_to_string(machine->class)), + machine->service, + p); + if (r < 0) + return sd_bus_error_set_errno(error, r); + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return sd_bus_error_set_errno(error, r); + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_create_or_register_machine(Manager *manager, sd_bus_message *message, bool read_network, Machine **_m, sd_bus_error *error) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + const char *name, *service, *class, *root_directory; + const int32_t *netif = NULL; + MachineClass c; + uint32_t leader; + sd_id128_t id; + const void *v; + Machine *m; + size_t n, n_netif = 0; + int r; + + assert(manager); + assert(message); + assert(_m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + if (!hostname_is_valid(name, 0)) + return 
sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid machine name"); + + r = sd_bus_message_read_array(message, 'y', &v, &n); + if (r < 0) + return r; + if (n == 0) + id = SD_ID128_NULL; + else if (n == 16) + memcpy(&id, v, n); + else + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid machine ID parameter"); + + r = sd_bus_message_read(message, "ssus", &service, &class, &leader, &root_directory); + if (r < 0) + return r; + + if (read_network) { + r = sd_bus_message_read_array(message, 'i', (const void**) &netif, &n_netif); + if (r < 0) + return r; + + n_netif /= sizeof(int32_t); + + for (size_t i = 0; i < n_netif; i++) { + if (netif[i] <= 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid network interface index %i", netif[i]); + } + } + + if (isempty(class)) + c = _MACHINE_CLASS_INVALID; + else { + c = machine_class_from_string(class); + if (c < 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid machine class parameter"); + } + + if (leader == 1) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid leader PID"); + + if (!isempty(root_directory) && !path_is_absolute(root_directory)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Root directory must be empty or an absolute path"); + + if (leader == 0) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + assert_cc(sizeof(uint32_t) == sizeof(pid_t)); + + r = sd_bus_creds_get_pid(creds, (pid_t*) &leader); + if (r < 0) + return r; + } + + r = pidref_set_pid(&pidref, leader); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to pin process " PID_FMT ": %m", pidref.pid); + + if (hashmap_get(manager->machines, name)) + return sd_bus_error_setf(error, BUS_ERROR_MACHINE_EXISTS, "Machine '%s' already exists", name); + + r = manager_add_machine(manager, name, &m); + if (r < 0) + return r; + + m->leader = 
TAKE_PIDREF(pidref); + m->class = c; + m->id = id; + + if (!isempty(service)) { + m->service = strdup(service); + if (!m->service) { + r = -ENOMEM; + goto fail; + } + } + + if (!isempty(root_directory)) { + m->root_directory = strdup(root_directory); + if (!m->root_directory) { + r = -ENOMEM; + goto fail; + } + } + + if (n_netif > 0) { + assert_cc(sizeof(int32_t) == sizeof(int)); + m->netif = memdup(netif, sizeof(int32_t) * n_netif); + if (!m->netif) { + r = -ENOMEM; + goto fail; + } + + m->n_netif = n_netif; + } + + *_m = m; + + return 1; + +fail: + machine_add_to_gc_queue(m); + return r; +} + +static int method_create_machine_internal(sd_bus_message *message, bool read_network, void *userdata, sd_bus_error *error) { + Manager *manager = ASSERT_PTR(userdata); + Machine *m = NULL; + int r; + + assert(message); + + r = method_create_or_register_machine(manager, message, read_network, &m, error); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "(sv)"); + if (r < 0) + goto fail; + + r = machine_start(m, message, error); + if (r < 0) + goto fail; + + m->create_message = sd_bus_message_ref(message); + return 1; + +fail: + machine_add_to_gc_queue(m); + return r; +} + +static int method_create_machine_with_network(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_create_machine_internal(message, true, userdata, error); +} + +static int method_create_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_create_machine_internal(message, false, userdata, error); +} + +static int method_register_machine_internal(sd_bus_message *message, bool read_network, void *userdata, sd_bus_error *error) { + Manager *manager = ASSERT_PTR(userdata); + _cleanup_free_ char *p = NULL; + Machine *m = NULL; + int r; + + assert(message); + + r = method_create_or_register_machine(manager, message, read_network, &m, error); + if (r < 0) + return r; + + r = cg_pid_get_unit(m->leader.pid, &m->unit); + if (r 
< 0) { + r = sd_bus_error_set_errnof(error, r, + "Failed to determine unit of process "PID_FMT" : %m", + m->leader.pid); + goto fail; + } + + r = machine_start(m, NULL, error); + if (r < 0) + goto fail; + + p = machine_bus_path(m); + if (!p) { + r = -ENOMEM; + goto fail; + } + + return sd_bus_reply_method_return(message, "o", p); + +fail: + machine_add_to_gc_queue(m); + return r; +} + +static int method_register_machine_with_network(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_register_machine_internal(message, true, userdata, error); +} + +static int method_register_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return method_register_machine_internal(message, false, userdata, error); +} + +static int redirect_method_to_machine(sd_bus_message *message, Manager *m, sd_bus_error *error, sd_bus_message_handler_t method) { + Machine *machine; + const char *name; + int r; + + assert(message); + assert(m); + assert(method); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return sd_bus_error_set_errno(error, r); + + machine = hashmap_get(m->machines, name); + if (!machine) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_MACHINE, "No machine '%s' known", name); + + return method(message, machine, error); +} + +static int method_unregister_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_unregister); +} + +static int method_terminate_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_terminate); +} + +static int method_kill_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_kill); +} + +static int method_get_machine_addresses(sd_bus_message *message, void *userdata, sd_bus_error *error) { 
+ return redirect_method_to_machine(message, userdata, error, bus_machine_method_get_addresses); +} + +static int method_get_machine_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_get_os_release); +} + +static int method_list_images(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_hashmap_free_ Hashmap *images = NULL; + _unused_ Manager *m = ASSERT_PTR(userdata); + Image *image; + int r; + + assert(message); + + images = hashmap_new(&image_hash_ops); + if (!images) + return -ENOMEM; + + r = image_discover(IMAGE_MACHINE, NULL, images); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssbttto)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(image, images) { + _cleanup_free_ char *p = NULL; + + p = image_bus_path(image->name); + if (!p) + return -ENOMEM; + + r = sd_bus_message_append(reply, "(ssbttto)", + image->name, + image_type_to_string(image->type), + image->read_only, + image->crtime, + image->mtime, + image->usage, + p); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_open_machine_pty(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_open_pty); +} + +static int method_open_machine_login(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_open_login); +} + +static int method_open_machine_shell(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_open_shell); +} + 
+static int method_bind_mount_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_bind_mount); +} + +static int method_copy_machine(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_copy); +} + +static int method_open_machine_root_directory(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_open_root_directory); +} + +static int method_get_machine_uid_shift(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_machine(message, userdata, error, bus_machine_method_get_uid_shift); +} + +static int redirect_method_to_image(sd_bus_message *message, Manager *m, sd_bus_error *error, sd_bus_message_handler_t method) { + _cleanup_(image_unrefp) Image* i = NULL; + const char *name; + int r; + + assert(message); + assert(m); + assert(method); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + if (!image_name_is_valid(name)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Image name '%s' is invalid.", name); + + r = image_find(IMAGE_MACHINE, name, NULL, &i); + if (r == -ENOENT) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_IMAGE, "No image '%s' known", name); + if (r < 0) + return r; + + i->userdata = m; + return method(message, i, error); +} + +static int method_remove_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_remove); +} + +static int method_rename_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_rename); +} + +static int method_clone_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + 
return redirect_method_to_image(message, userdata, error, bus_image_method_clone); +} + +static int method_mark_image_read_only(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_mark_read_only); +} + +static int method_get_image_hostname(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_get_hostname); +} + +static int method_get_image_machine_id(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_get_machine_id); +} + +static int method_get_image_machine_info(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_get_machine_info); +} + +static int method_get_image_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_get_os_release); +} + +static int clean_pool_done(Operation *operation, int ret, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_fclose_ FILE *f = NULL; + bool success; + size_t n; + int r; + + assert(operation); + assert(operation->extra_fd >= 0); + + if (lseek(operation->extra_fd, 0, SEEK_SET) < 0) + return -errno; + + f = take_fdopen(&operation->extra_fd, "r"); + if (!f) + return -errno; + + /* The resulting temporary file starts with a boolean value that indicates success or not. */ + errno = 0; + n = fread(&success, 1, sizeof(success), f); + if (n != sizeof(success)) + return ret < 0 ? ret : errno_or_else(EIO); + + if (ret < 0) { + _cleanup_free_ char *name = NULL; + + /* The clean-up operation failed. In this case the resulting temporary file should contain a boolean + * set to false followed by the name of the failed image. 
Let's try to read this and use it for the + * error message. If we can't read it, don't mind, and return the naked error. */ + + if (success) /* The resulting temporary file could not be updated, ignore it. */ + return ret; + + r = read_nul_string(f, LONG_LINE_MAX, &name); + if (r <= 0) /* Same here... */ + return ret; + + return sd_bus_error_set_errnof(error, ret, "Failed to remove image %s: %m", name); + } + + assert(success); + + r = sd_bus_message_new_method_return(operation->message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(st)"); + if (r < 0) + return r; + + /* On success the resulting temporary file will contain a list of image names that were removed followed by + * their size on disk. Let's read that and turn it into a bus message. */ + for (;;) { + _cleanup_free_ char *name = NULL; + uint64_t size; + + r = read_nul_string(f, LONG_LINE_MAX, &name); + if (r < 0) + return r; + if (r == 0) /* reached the end */ + break; + + errno = 0; + n = fread(&size, 1, sizeof(size), f); + if (n != sizeof(size)) + return errno_or_else(EIO); + + r = sd_bus_message_append(reply, "(st)", name, size); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_clean_pool(sd_bus_message *message, void *userdata, sd_bus_error *error) { + enum { + REMOVE_ALL, + REMOVE_HIDDEN, + } mode; + + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + _cleanup_close_ int result_fd = -EBADF; + Manager *m = userdata; + Operation *operation; + const char *mm; + pid_t child; + int r; + + assert(message); + + if (m->n_operations >= OPERATIONS_MAX) + return sd_bus_error_set(error, SD_BUS_ERROR_LIMITS_EXCEEDED, "Too many ongoing operations."); + + r = sd_bus_message_read(message, "s", &mm); + if (r < 0) + return r; + + if (streq(mm, "all")) + mode = REMOVE_ALL; + else if (streq(mm, "hidden")) + mode = REMOVE_HIDDEN; + else + return 
sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mode '%s'.", mm); + + const char *details[] = { + "verb", "clean_pool", + "mode", mm, + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to create pipe: %m"); + + /* Create a temporary file we can dump information about deleted images into. We use a temporary file for this + * instead of a pipe or so, since this might grow quit large in theory and we don't want to process this + * continuously */ + result_fd = open_tmpfile_unlinkable(NULL, O_RDWR|O_CLOEXEC); + if (result_fd < 0) + return -errno; + + /* This might be a slow operation, run it asynchronously in a background process */ + r = safe_fork("(sd-clean)", FORK_RESET_SIGNALS, &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + _cleanup_hashmap_free_ Hashmap *images = NULL; + bool success = true; + Image *image; + ssize_t l; + + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + images = hashmap_new(&image_hash_ops); + if (!images) { + r = -ENOMEM; + goto child_fail; + } + + r = image_discover(IMAGE_MACHINE, NULL, images); + if (r < 0) + goto child_fail; + + l = write(result_fd, &success, sizeof(success)); + if (l < 0) { + r = -errno; + goto child_fail; + } + + HASHMAP_FOREACH(image, images) { + + /* We can't remove vendor images (i.e. those in /usr) */ + if (IMAGE_IS_VENDOR(image)) + continue; + + if (IMAGE_IS_HOST(image)) + continue; + + if (mode == REMOVE_HIDDEN && !IMAGE_IS_HIDDEN(image)) + continue; + + r = image_remove(image); + if (r == -EBUSY) /* keep images that are currently being used. 
*/ + continue; + if (r < 0) { + /* If the operation failed, let's override everything we wrote, and instead write there at which image we failed. */ + success = false; + (void) ftruncate(result_fd, 0); + (void) lseek(result_fd, 0, SEEK_SET); + (void) write(result_fd, &success, sizeof(success)); + (void) write(result_fd, image->name, strlen(image->name)+1); + goto child_fail; + } + + l = write(result_fd, image->name, strlen(image->name)+1); + if (l < 0) { + r = -errno; + goto child_fail; + } + + l = write(result_fd, &image->usage_exclusive, sizeof(image->usage_exclusive)); + if (l < 0) { + r = -errno; + goto child_fail; + } + } + + result_fd = safe_close(result_fd); + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + /* The clean-up might take a while, hence install a watch on the child and return */ + + r = operation_new(m, NULL, child, message, errno_pipe_fd[0], &operation); + if (r < 0) { + (void) sigkill_wait(child); + return r; + } + + operation->extra_fd = result_fd; + operation->done = clean_pool_done; + + result_fd = -EBADF; + errno_pipe_fd[0] = -EBADF; + + return 1; +} + +static int method_set_pool_limit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint64_t limit; + int r; + + assert(message); + + r = sd_bus_message_read(message, "t", &limit); + if (r < 0) + return r; + if (!FILE_SIZE_VALID_OR_INFINITY(limit)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "New limit out of range"); + + const char *details[] = { + "verb", "set_pool_limit", + NULL + }; + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.machine1.manage-machines", + details, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + /* Set up the machine directory if necessary */ + r = 
setup_machine_directory(error, /* use_btrfs_subvol= */ true, /* use_btrfs_quota= */ true); + if (r < 0) + return r; + + (void) btrfs_qgroup_set_limit("/var/lib/machines", 0, limit); + + r = btrfs_subvol_set_subtree_quota_limit("/var/lib/machines", 0, limit); + if (r == -ENOTTY) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Quota is only supported on btrfs."); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to adjust quota limit: %m"); + + return sd_bus_reply_method_return(message, NULL); +} + +static int method_set_image_limit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(message, userdata, error, bus_image_method_set_limit); +} + +static int method_map_from_machine_user(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Machine *machine; + uint32_t uid; + uid_t converted; + int r; + + r = sd_bus_message_read(message, "su", &name, &uid); + if (r < 0) + return r; + + if (!uid_is_valid(uid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid user ID " UID_FMT, uid); + + machine = hashmap_get(m->machines, name); + if (!machine) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_MACHINE, "No machine '%s' known", name); + + if (machine->class != MACHINE_CONTAINER) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Not supported for non-container machines."); + + r = machine_translate_uid(machine, uid, &converted); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_USER_MAPPING, "Machine '%s' has no matching user mappings.", name); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "u", (uint32_t) converted); +} + +static int method_map_to_machine_user(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *o = NULL; + Manager *m = userdata; + Machine *machine; + uid_t uid, converted; + int r; + + r = sd_bus_message_read(message, 
"u", &uid); + if (r < 0) + return r; + if (!uid_is_valid(uid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid user ID " UID_FMT, uid); + if (uid < 0x10000) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_USER_MAPPING, "User " UID_FMT " belongs to host UID range", uid); + + r = manager_find_machine_for_uid(m, uid, &machine, &converted); + if (r < 0) + return r; + if (!r) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_USER_MAPPING, "No matching user mapping for " UID_FMT ".", uid); + + o = machine_bus_path(machine); + if (!o) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "sou", machine->name, o, (uint32_t) converted); +} + +static int method_map_from_machine_group(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Machine *machine; + gid_t converted; + uint32_t gid; + int r; + + r = sd_bus_message_read(message, "su", &name, &gid); + if (r < 0) + return r; + + if (!gid_is_valid(gid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid group ID " GID_FMT, gid); + + machine = hashmap_get(m->machines, name); + if (!machine) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_MACHINE, "No machine '%s' known", name); + + if (machine->class != MACHINE_CONTAINER) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Not supported for non-container machines."); + + r = machine_translate_gid(machine, gid, &converted); + if (r == -ESRCH) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_USER_MAPPING, "Machine '%s' has no matching group mappings.", name); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "u", (uint32_t) converted); +} + +static int method_map_to_machine_group(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *o = NULL; + Manager *m = userdata; + Machine *machine; + gid_t gid, converted; + int r; + + r = sd_bus_message_read(message, "u", &gid); + if (r < 0) + return r; + 
if (!gid_is_valid(gid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid group ID " GID_FMT, gid); + if (gid < 0x10000) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_GROUP_MAPPING, "Group " GID_FMT " belongs to host GID range", gid); + + r = manager_find_machine_for_gid(m, gid, &machine, &converted); + if (r < 0) + return r; + if (!r) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_GROUP_MAPPING, "No matching group mapping for " GID_FMT ".", gid); + + o = machine_bus_path(machine); + if (!o) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "sou", machine->name, o, (uint32_t) converted); +} + +const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("PoolPath", "s", property_get_pool_path, 0, 0), + SD_BUS_PROPERTY("PoolUsage", "t", property_get_pool_usage, 0, 0), + SD_BUS_PROPERTY("PoolLimit", "t", property_get_pool_limit, 0, 0), + + SD_BUS_METHOD_WITH_ARGS("GetMachine", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("o", machine), + method_get_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImage", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("o", image), + method_get_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetMachineByPID", + SD_BUS_ARGS("u", pid), + SD_BUS_RESULT("o", machine), + method_get_machine_by_pid, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListMachines", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(ssso)", machines), + method_list_machines, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListImages", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(ssbttto)", images), + method_list_images, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CreateMachine", + SD_BUS_ARGS("s", name, "ay", id, "s", service, "s", class, "u", leader, "s", root_directory, "a(sv)", scope_properties), + SD_BUS_RESULT("o", path), + method_create_machine, 0), + SD_BUS_METHOD_WITH_ARGS("CreateMachineWithNetwork", + SD_BUS_ARGS("s", name, "ay", id, "s", 
service, "s", class, "u", leader, "s", root_directory, "ai", ifindices, "a(sv)", scope_properties), + SD_BUS_RESULT("o", path), + method_create_machine_with_network, 0), + SD_BUS_METHOD_WITH_ARGS("RegisterMachine", + SD_BUS_ARGS("s", name, "ay", id, "s", service, "s", class, "u", leader, "s", root_directory), + SD_BUS_RESULT("o", path), + method_register_machine, 0), + SD_BUS_METHOD_WITH_ARGS("RegisterMachineWithNetwork", + SD_BUS_ARGS("s", name, "ay", id, "s", service, "s", class, "u", leader, "s", root_directory, "ai", ifindices), + SD_BUS_RESULT("o", path), + method_register_machine_with_network, 0), + SD_BUS_METHOD_WITH_ARGS("UnregisterMachine", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_unregister_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("TerminateMachine", + SD_BUS_ARGS("s", id), + SD_BUS_NO_RESULT, + method_terminate_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("KillMachine", + SD_BUS_ARGS("s", name, "s", who, "i", signal), + SD_BUS_NO_RESULT, + method_kill_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetMachineAddresses", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("a(iay)", addresses), + method_get_machine_addresses, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetMachineOSRelease", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("a{ss}", fields), + method_get_machine_os_release, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenMachinePTY", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("h", pty, "s", pty_path), + method_open_machine_pty, + 0), + SD_BUS_METHOD_WITH_ARGS("OpenMachineLogin", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("h", pty, "s", pty_path), + method_open_machine_login, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenMachineShell", + SD_BUS_ARGS("s", name, "s", user, "s", path, "as", args, "as", environment), + SD_BUS_RESULT("h", pty, "s", pty_path), + method_open_machine_shell, + SD_BUS_VTABLE_UNPRIVILEGED), + 
SD_BUS_METHOD_WITH_ARGS("BindMountMachine", + SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir), + SD_BUS_NO_RESULT, + method_bind_mount_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyFromMachine", + SD_BUS_ARGS("s", name, "s", source, "s", destination), + SD_BUS_NO_RESULT, + method_copy_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyToMachine", + SD_BUS_ARGS("s", name, "s", source, "s", destination), + SD_BUS_NO_RESULT, + method_copy_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyFromMachineWithFlags", + SD_BUS_ARGS("s", name, "s", source, "s", destination, "t", flags), + SD_BUS_NO_RESULT, + method_copy_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CopyToMachineWithFlags", + SD_BUS_ARGS("s", name, "s", source, "s", destination, "t", flags), + SD_BUS_NO_RESULT, + method_copy_machine, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("OpenMachineRootDirectory", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("h", fd), + method_open_machine_root_directory, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetMachineUIDShift", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("u", shift), + method_get_machine_uid_shift, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RemoveImage", + SD_BUS_ARGS("s", name), + SD_BUS_NO_RESULT, + method_remove_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RenameImage", + SD_BUS_ARGS("s", name, "s", new_name), + SD_BUS_NO_RESULT, + method_rename_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CloneImage", + SD_BUS_ARGS("s", name, "s", new_name, "b", read_only), + SD_BUS_NO_RESULT, + method_clone_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MarkImageReadOnly", + SD_BUS_ARGS("s", name, "b", read_only), + SD_BUS_NO_RESULT, + method_mark_image_read_only, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageHostname", + SD_BUS_ARGS("s", 
name), + SD_BUS_RESULT("s", hostname), + method_get_image_hostname, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageMachineID", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("ay", id), + method_get_image_machine_id, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageMachineInfo", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("a{ss}", machine_info), + method_get_image_machine_info, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageOSRelease", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("a{ss}", os_release), + method_get_image_os_release, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetPoolLimit", + SD_BUS_ARGS("t", size), + SD_BUS_NO_RESULT, + method_set_pool_limit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetImageLimit", + SD_BUS_ARGS("s", name, "t", size), + SD_BUS_NO_RESULT, + method_set_image_limit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("CleanPool", + SD_BUS_ARGS("s", mode), + SD_BUS_RESULT("a(st)",images), + method_clean_pool, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MapFromMachineUser", + SD_BUS_ARGS("s", name, "u", uid_inner), + SD_BUS_RESULT("u", uid_outer), + method_map_from_machine_user, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MapToMachineUser", + SD_BUS_ARGS("u", uid_outer), + SD_BUS_RESULT("s", machine_name, "o", machine_path, "u", uid_inner), + method_map_to_machine_user, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MapFromMachineGroup", + SD_BUS_ARGS("s", name, "u", gid_inner), + SD_BUS_RESULT("u", gid_outer), + method_map_from_machine_group, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MapToMachineGroup", + SD_BUS_ARGS("u", gid_outer), + SD_BUS_RESULT("s", machine_name, "o", machine_path, "u", gid_inner), + method_map_to_machine_group, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_SIGNAL_WITH_ARGS("MachineNew", + SD_BUS_ARGS("s", machine, "o", path), + 0), + SD_BUS_SIGNAL_WITH_ARGS("MachineRemoved", + 
SD_BUS_ARGS("s", machine, "o", path), + 0), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/machine1", + "org.freedesktop.machine1.Manager", + .vtables = BUS_VTABLES(manager_vtable), + .children = BUS_IMPLEMENTATIONS( &machine_object, + &image_object ), +}; + +int match_job_removed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *path, *result, *unit; + Manager *m = ASSERT_PTR(userdata); + Machine *machine; + uint32_t id; + int r; + + assert(message); + + r = sd_bus_message_read(message, "uoss", &id, &path, &unit, &result); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + machine = hashmap_get(m->machine_units, unit); + if (!machine) + return 0; + + if (streq_ptr(path, machine->scope_job)) { + machine->scope_job = mfree(machine->scope_job); + + if (machine->started) { + if (streq(result, "done")) + machine_send_create_reply(machine, NULL); + else { + _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + + sd_bus_error_setf(&e, BUS_ERROR_JOB_FAILED, "Start job for unit %s failed with '%s'", unit, result); + + machine_send_create_reply(machine, &e); + } + } + + machine_save(machine); + } + + machine_add_to_gc_queue(machine); + return 0; +} + +int match_properties_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *unit = NULL; + const char *path; + Manager *m = ASSERT_PTR(userdata); + Machine *machine; + int r; + + assert(message); + + path = sd_bus_message_get_path(message); + if (!path) + return 0; + + r = unit_name_from_dbus_path(path, &unit); + if (r == -EINVAL) /* not for a unit */ + return 0; + if (r < 0) { + log_oom(); + return 0; + } + + machine = hashmap_get(m->machine_units, unit); + if (!machine) + return 0; + + machine_add_to_gc_queue(machine); + return 0; +} + +int match_unit_removed(sd_bus_message *message, void *userdata, sd_bus_error *error) { + const char *path, *unit; + Manager *m = ASSERT_PTR(userdata); 
+ Machine *machine; + int r; + + assert(message); + + r = sd_bus_message_read(message, "so", &unit, &path); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + machine = hashmap_get(m->machine_units, unit); + if (!machine) + return 0; + + machine_add_to_gc_queue(machine); + return 0; +} + +int match_reloading(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Machine *machine; + int b, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + if (b) + return 0; + + /* systemd finished reloading, let's recheck all our machines */ + log_debug("System manager has been reloaded, rechecking machines..."); + + HASHMAP_FOREACH(machine, m->machines) + machine_add_to_gc_queue(machine); + + return 0; +} + +int manager_unref_unit( + Manager *m, + const char *unit, + sd_bus_error *error) { + + assert(m); + assert(unit); + + return bus_call_method(m->bus, bus_systemd_mgr, "UnrefUnit", error, NULL, "s", unit); +} + +int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(manager); + assert(unit); + + r = bus_call_method(manager->bus, bus_systemd_mgr, "StopUnit", error, &reply, "ss", unit, "fail"); + if (r < 0) { + if (sd_bus_error_has_names(error, BUS_ERROR_NO_SUCH_UNIT, + BUS_ERROR_LOAD_FAILED)) { + + if (job) + *job = NULL; + + sd_bus_error_free(error); + return 0; + } + + return r; + } + + if (job) { + const char *j; + char *copy; + + r = sd_bus_message_read(reply, "o", &j); + if (r < 0) + return r; + + copy = strdup(j); + if (!copy) + return -ENOMEM; + + *job = copy; + } + + return 1; +} + +int manager_kill_unit(Manager *manager, const char *unit, int signo, sd_bus_error *error) { + assert(manager); + assert(unit); + + return bus_call_method(manager->bus, bus_systemd_mgr, "KillUnit", error, NULL, "ssi", unit, "all", signo); 
+} + +int manager_unit_is_active(Manager *manager, const char *unit) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *path = NULL; + const char *state; + int r; + + assert(manager); + assert(unit); + + path = unit_dbus_path_from_name(unit); + if (!path) + return -ENOMEM; + + r = sd_bus_get_property( + manager->bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "ActiveState", + &error, + &reply, + "s"); + if (r < 0) { + if (sd_bus_error_has_names(&error, SD_BUS_ERROR_NO_REPLY, + SD_BUS_ERROR_DISCONNECTED)) + return true; + + if (sd_bus_error_has_names(&error, BUS_ERROR_NO_SUCH_UNIT, + BUS_ERROR_LOAD_FAILED)) + return false; + + return r; + } + + r = sd_bus_message_read(reply, "s", &state); + if (r < 0) + return -EINVAL; + + return !STR_IN_SET(state, "inactive", "failed"); +} + +int manager_job_is_active(Manager *manager, const char *path) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(manager); + assert(path); + + r = sd_bus_get_property( + manager->bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Job", + "State", + &error, + &reply, + "s"); + if (r < 0) { + if (sd_bus_error_has_names(&error, SD_BUS_ERROR_NO_REPLY, + SD_BUS_ERROR_DISCONNECTED)) + return true; + + if (sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_OBJECT)) + return false; + + return r; + } + + /* We don't actually care about the state really. 
The fact + * that we could read the job state is enough for us */ + + return true; +} + +int manager_get_machine_by_pid(Manager *m, pid_t pid, Machine **machine) { + Machine *mm; + int r; + + assert(m); + assert(pid >= 1); + assert(machine); + + mm = hashmap_get(m->machine_leaders, PID_TO_PTR(pid)); + if (!mm) { + _cleanup_free_ char *unit = NULL; + + r = cg_pid_get_unit(pid, &unit); + if (r >= 0) + mm = hashmap_get(m->machine_units, unit); + } + if (!mm) + return 0; + + *machine = mm; + return 1; +} + +int manager_add_machine(Manager *m, const char *name, Machine **_machine) { + Machine *machine; + int r; + + assert(m); + assert(name); + + machine = hashmap_get(m->machines, name); + if (!machine) { + r = machine_new(m, _MACHINE_CLASS_INVALID, name, &machine); + if (r < 0) + return r; + } + + if (_machine) + *_machine = machine; + + return 0; +} diff --git a/src/machine/machined-varlink.c b/src/machine/machined-varlink.c new file mode 100644 index 0000000..6ca98e2 --- /dev/null +++ b/src/machine/machined-varlink.c @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "format-util.h" +#include "machined-varlink.h" +#include "mkdir.h" +#include "user-util.h" +#include "varlink.h" +#include "varlink-io.systemd.UserDatabase.h" + +typedef struct LookupParameters { + const char *user_name; + const char *group_name; + union { + uid_t uid; + gid_t gid; + }; + const char *service; +} LookupParameters; + +static int build_user_json(const char *user_name, uid_t uid, const char *real_name, JsonVariant **ret) { + assert(user_name); + assert(uid_is_valid(uid)); + assert(ret); + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)), + JSON_BUILD_PAIR_CONDITION(!isempty(real_name), "realName", JSON_BUILD_STRING(real_name)), + 
JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")), + JSON_BUILD_PAIR("shell", JSON_BUILD_STRING(NOLOGIN)), + JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.Machine")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("container")))))); +} + +static bool user_match_lookup_parameters(LookupParameters *p, const char *name, uid_t uid) { + assert(p); + + if (p->user_name && !streq(name, p->user_name)) + return false; + + if (uid_is_valid(p->uid) && uid != p->uid) + return false; + + return true; +} + +static int user_lookup_uid(Manager *m, uid_t uid, char **ret_name, char **ret_real_name) { + _cleanup_free_ char *n = NULL, *rn = NULL; + uid_t converted_uid; + Machine *machine; + int r; + + assert(m); + assert(uid_is_valid(uid)); + assert(ret_name); + assert(ret_real_name); + + if (uid < 0x10000) /* Host UID range */ + return -ESRCH; + + r = manager_find_machine_for_uid(m, uid, &machine, &converted_uid); + if (r < 0) + return r; + if (!r) + return -ESRCH; + + if (asprintf(&n, "vu-%s-" UID_FMT, machine->name, converted_uid) < 0) + return -ENOMEM; + + /* Don't synthesize invalid user/group names (too long...) 
*/ + if (!valid_user_group_name(n, 0)) + return -ESRCH; + + if (asprintf(&rn, "UID " UID_FMT " of Container %s", converted_uid, machine->name) < 0) + return -ENOMEM; + + /* Don't synthesize invalid real names either, but since this field doesn't matter much, simply invalidate things */ + if (!valid_gecos(rn)) + rn = mfree(rn); + + *ret_name = TAKE_PTR(n); + *ret_real_name = TAKE_PTR(rn); + return 0; +} + +static int user_lookup_name(Manager *m, const char *name, uid_t *ret_uid, char **ret_real_name) { + _cleanup_free_ char *mn = NULL, *rn = NULL; + uid_t uid, converted_uid; + Machine *machine; + const char *e, *d; + int r; + + assert(m); + assert(ret_uid); + assert(ret_real_name); + + if (!valid_user_group_name(name, 0)) + return -ESRCH; + + e = startswith(name, "vu-"); + if (!e) + return -ESRCH; + + d = strrchr(e, '-'); + if (!d) + return -ESRCH; + + if (parse_uid(d + 1, &uid) < 0) + return -ESRCH; + + mn = strndup(e, d - e); + if (!mn) + return -ENOMEM; + + machine = hashmap_get(m->machines, mn); + if (!machine) + return -ESRCH; + + if (machine->class != MACHINE_CONTAINER) + return -ESRCH; + + r = machine_translate_uid(machine, uid, &converted_uid); + if (r < 0) + return r; + + if (asprintf(&rn, "UID " UID_FMT " of Container %s", uid, machine->name) < 0) + return -ENOMEM; + if (!valid_gecos(rn)) + rn = mfree(rn); + + *ret_uid = converted_uid; + *ret_real_name = TAKE_PTR(rn); + return 0; +} + +static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 }, + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = 
{ + .uid = UID_INVALID, + }; + _cleanup_free_ char *found_name = NULL, *found_real_name = NULL; + uid_t found_uid = UID_INVALID, uid; + Manager *m = ASSERT_PTR(userdata); + const char *un; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.Machine")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (uid_is_valid(p.uid)) + r = user_lookup_uid(m, p.uid, &found_name, &found_real_name); + else if (p.user_name) + r = user_lookup_name(m, p.user_name, &found_uid, &found_real_name); + else + return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL); + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + uid = uid_is_valid(found_uid) ? found_uid : p.uid; + un = found_name ?: p.user_name; + + if (!user_match_lookup_parameters(&p, un, uid)) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_user_json(un, uid, found_real_name, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int build_group_json(const char *group_name, gid_t gid, const char *description, JsonVariant **ret) { + assert(group_name); + assert(gid_is_valid(gid)); + assert(ret); + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)), + JSON_BUILD_PAIR_CONDITION(!isempty(description), "description", JSON_BUILD_STRING(description)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.Machine")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("container")))))); + } + +static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) { + assert(p); + + if (p->group_name && !streq(name, p->group_name)) + 
return false; + + if (gid_is_valid(p->gid) && gid != p->gid) + return false; + + return true; +} + +static int group_lookup_gid(Manager *m, gid_t gid, char **ret_name, char **ret_description) { + _cleanup_free_ char *n = NULL, *d = NULL; + gid_t converted_gid; + Machine *machine; + int r; + + assert(m); + assert(gid_is_valid(gid)); + assert(ret_name); + assert(ret_description); + + if (gid < 0x10000) /* Host GID range */ + return -ESRCH; + + r = manager_find_machine_for_gid(m, gid, &machine, &converted_gid); + if (r < 0) + return r; + if (!r) + return -ESRCH; + + if (asprintf(&n, "vg-%s-" GID_FMT, machine->name, converted_gid) < 0) + return -ENOMEM; + + if (!valid_user_group_name(n, 0)) + return -ESRCH; + + if (asprintf(&d, "GID " GID_FMT " of Container %s", converted_gid, machine->name) < 0) + return -ENOMEM; + if (!valid_gecos(d)) + d = mfree(d); + + *ret_name = TAKE_PTR(n); + *ret_description = TAKE_PTR(d); + + return 0; +} + +static int group_lookup_name(Manager *m, const char *name, gid_t *ret_gid, char **ret_description) { + _cleanup_free_ char *mn = NULL, *desc = NULL; + gid_t gid, converted_gid; + Machine *machine; + const char *e, *d; + int r; + + assert(m); + assert(ret_gid); + assert(ret_description); + + if (!valid_user_group_name(name, 0)) + return -ESRCH; + + e = startswith(name, "vg-"); + if (!e) + return -ESRCH; + + d = strrchr(e, '-'); + if (!d) + return -ESRCH; + + if (parse_gid(d + 1, &gid) < 0) + return -ESRCH; + + mn = strndup(e, d - e); + if (!mn) + return -ENOMEM; + + machine = hashmap_get(m->machines, mn); + if (!machine) + return -ESRCH; + + if (machine->class != MACHINE_CONTAINER) + return -ESRCH; + + r = machine_translate_gid(machine, gid, &converted_gid); + if (r < 0) + return r; + + if (asprintf(&desc, "GID " GID_FMT " of Container %s", gid, machine->name) < 0) + return -ENOMEM; + if (!valid_gecos(desc)) + desc = mfree(desc); + + *ret_gid = converted_gid; + *ret_description = TAKE_PTR(desc); + return 0; +} + +static int 
vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + LookupParameters p = { + .gid = GID_INVALID, + }; + _cleanup_free_ char *found_name = NULL, *found_description = NULL; + uid_t found_gid = GID_INVALID, gid; + Manager *m = ASSERT_PTR(userdata); + const char *gn; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.Machine")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + if (gid_is_valid(p.gid)) + r = group_lookup_gid(m, p.gid, &found_name, &found_description); + else if (p.group_name) + r = group_lookup_name(m, p.group_name, (uid_t*) &found_gid, &found_description); + else + return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL); + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + gid = gid_is_valid(found_gid) ? 
found_gid : p.gid; + gn = found_name ?: p.group_name; + + if (!group_match_lookup_parameters(&p, gn, gid)) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_group_json(gn, gid, found_description, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + LookupParameters p = {}; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (!streq_ptr(p.service, "io.systemd.Machine")) + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + /* We don't support auxiliary groups for machines. 
*/ + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); +} + +int manager_varlink_init(Manager *m) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + + if (m->varlink_server) + return 0; + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_error_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_add_interface(s, &vl_interface_io_systemd_UserDatabase); + if (r < 0) + return log_error_errno(r, "Failed to add UserDatabase interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + s, + "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, + "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships); + if (r < 0) + return log_error_errno(r, "Failed to register varlink methods: %m"); + + (void) mkdir_p("/run/systemd/userdb", 0755); + + r = varlink_server_listen_address(s, "/run/systemd/userdb/io.systemd.Machine", 0666); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + m->varlink_server = TAKE_PTR(s); + return 0; +} + +void manager_varlink_done(Manager *m) { + assert(m); + + m->varlink_server = varlink_server_unref(m->varlink_server); +} diff --git a/src/machine/machined-varlink.h b/src/machine/machined-varlink.h new file mode 100644 index 0000000..f26bbe5 --- /dev/null +++ b/src/machine/machined-varlink.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "machined.h" + +int manager_varlink_init(Manager *m); +void manager_varlink_done(Manager *m); diff --git a/src/machine/machined.c b/src/machine/machined.c new file 
mode 100644 index 0000000..58a407d --- /dev/null +++ b/src/machine/machined.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-log-control-api.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "common-signal.h" +#include "daemon-util.h" +#include "dirent-util.h" +#include "discover-image.h" +#include "fd-util.h" +#include "format-util.h" +#include "hostname-util.h" +#include "machined-varlink.h" +#include "machined.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "process-util.h" +#include "service-util.h" +#include "signal-util.h" +#include "special.h" + +static Manager* manager_unref(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unref); + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(machine_hash_ops, char, string_hash_func, string_compare_func, Machine, machine_free); + +static int manager_new(Manager **ret) { + _cleanup_(manager_unrefp) Manager *m = NULL; + int r; + + assert(ret); + + m = new0(Manager, 1); + if (!m) + return -ENOMEM; + + m->machines = hashmap_new(&machine_hash_ops); + m->machine_units = hashmap_new(&string_hash_ops); + m->machine_leaders = hashmap_new(NULL); + + if (!m->machines || !m->machine_units || !m->machine_leaders) + return -ENOMEM; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + if (r < 0) + return r; + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == -EHOSTDOWN ? 
LOG_DEBUG : LOG_NOTICE, r, + "Unable to create memory pressure event source, ignoring: %m"); + + (void) sd_event_set_watchdog(m->event, true); + + *ret = TAKE_PTR(m); + return 0; +} + +static Manager* manager_unref(Manager *m) { + if (!m) + return NULL; + + while (m->operations) + operation_free(m->operations); + + assert(m->n_operations == 0); + + hashmap_free(m->machines); /* This will free all machines, so that the machine_units/machine_leaders is empty */ + hashmap_free(m->machine_units); + hashmap_free(m->machine_leaders); + hashmap_free(m->image_cache); + + sd_event_source_unref(m->image_cache_defer_event); +#if ENABLE_NSCD + sd_event_source_unref(m->nscd_cache_flush_event); +#endif + + bus_verify_polkit_async_registry_free(m->polkit_registry); + + manager_varlink_done(m); + + sd_bus_flush_close_unref(m->bus); + sd_event_unref(m->event); + + return mfree(m); +} + +static int manager_add_host_machine(Manager *m) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + _cleanup_free_ char *rd = NULL, *unit = NULL; + sd_id128_t mid; + Machine *t; + int r; + + if (m->host_machine) + return 0; + + r = sd_id128_get_machine(&mid); + if (r < 0) + return log_error_errno(r, "Failed to get machine ID: %m"); + + rd = strdup("/"); + if (!rd) + return log_oom(); + + unit = strdup(SPECIAL_ROOT_SLICE); + if (!unit) + return log_oom(); + + r = pidref_set_pid(&pidref, 1); + if (r < 0) + return log_error_errno(r, "Failed to open reference to PID 1: %m"); + + r = machine_new(m, MACHINE_HOST, ".host", &t); + if (r < 0) + return log_error_errno(r, "Failed to create machine: %m"); + + t->leader = TAKE_PIDREF(pidref); + t->id = mid; + + t->root_directory = TAKE_PTR(rd); + t->unit = TAKE_PTR(unit); + + dual_timestamp_from_boottime(&t->timestamp, 0); + + m->host_machine = t; + + return 0; +} + +static int manager_enumerate_machines(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(m); + + r = manager_add_host_machine(m); + if (r < 0) + return r; + + /* Read in 
machine data stored on disk */ + d = opendir("/run/systemd/machines"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /run/systemd/machines: %m"); + } + + FOREACH_DIRENT(de, d, return -errno) { + struct Machine *machine; + int k; + + if (!dirent_is_file(de)) + continue; + + /* Ignore symlinks that map the unit name to the machine */ + if (startswith(de->d_name, "unit:")) + continue; + + if (!hostname_is_valid(de->d_name, 0)) + continue; + + k = manager_add_machine(m, de->d_name, &machine); + if (k < 0) { + r = log_error_errno(k, "Failed to add machine by file name %s: %m", de->d_name); + continue; + } + + machine_add_to_gc_queue(machine); + + k = machine_load(machine); + if (k < 0) + r = k; + } + + return r; +} + +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = sd_bus_default_system(&m->bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_match_signal_async(m->bus, NULL, bus_systemd_mgr, "JobRemoved", match_job_removed, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to add match for JobRemoved: %m"); + + r = bus_match_signal_async(m->bus, NULL, bus_systemd_mgr, "UnitRemoved", match_unit_removed, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for UnitRemoved: %m"); + + r = sd_bus_match_signal_async( + m->bus, + NULL, + "org.freedesktop.systemd1", + NULL, + "org.freedesktop.DBus.Properties", + "PropertiesChanged", + match_properties_changed, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for PropertiesChanged: %m"); + + r = bus_match_signal_async(m->bus, NULL, bus_systemd_mgr, "Reloading", match_reloading, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match for Reloading: %m"); + + r = bus_call_method_async(m->bus, NULL, bus_systemd_mgr, "Subscribe", 
NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to enable subscription: %m"); + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.machine1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + return 0; +} + +static void manager_gc(Manager *m, bool drop_not_started) { + Machine *machine; + + assert(m); + + while ((machine = LIST_POP(gc_queue, m->machine_gc_queue))) { + machine->in_gc_queue = false; + + /* First, if we are not closing yet, initiate stopping */ + if (machine_may_gc(machine, drop_not_started) && + machine_get_state(machine) != MACHINE_CLOSING) + machine_stop(machine); + + /* Now, the stop probably made this referenced + * again, but if it didn't, then it's time to let it + * go entirely. */ + if (machine_may_gc(machine, drop_not_started)) { + machine_finalize(machine); + machine_free(machine); + } + } +} + +static int manager_startup(Manager *m) { + Machine *machine; + int r; + + assert(m); + + /* Connect to the bus */ + r = manager_connect_bus(m); + if (r < 0) + return r; + + /* Set up Varlink service */ + r = manager_varlink_init(m); + if (r < 0) + return r; + + /* Deserialize state */ + manager_enumerate_machines(m); + + /* Remove stale objects before we start them */ + manager_gc(m, false); + + /* And start everything */ + HASHMAP_FOREACH(machine, m->machines) + machine_start(machine, NULL, NULL); + + return 0; +} + +static bool check_idle(void *userdata) { + Manager *m = userdata; + + if (m->operations) + return false; + + if (varlink_server_current_connections(m->varlink_server) > 0) + return false; + + manager_gc(m, true); + + return hashmap_isempty(m->machines); +} + +static int manager_run(Manager *m) { + assert(m); + + return bus_event_loop_with_idle( + m->event, + 
m->bus, + "org.freedesktop.machine1", + DEFAULT_EXIT_USEC, + check_idle, m); +} + +static int run(int argc, char *argv[]) { + _cleanup_(manager_unrefp) Manager *m = NULL; + int r; + + log_set_facility(LOG_AUTH); + log_setup(); + + r = service_parse_argv("systemd-machined.service", + "Manage registrations of local VMs and containers.", + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object), + argc, argv); + if (r <= 0) + return r; + + umask(0022); + + /* Always create the directories people can create inotify watches in. Note that some applications might check + * for the existence of /run/systemd/machines/ to determine whether machined is available, so please always + * make sure this check stays in. */ + (void) mkdir_label("/run/systemd/machines", 0755); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to allocate manager object: %m"); + + r = manager_startup(m); + if (r < 0) + return log_error_errno(r, "Failed to fully start up daemon: %m"); + + log_debug("systemd-machined running as pid "PID_FMT, getpid_cached()); + r = sd_notify(false, NOTIFY_READY); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + + r = manager_run(m); + + (void) sd_notify(false, NOTIFY_STOPPING); + log_debug("systemd-machined stopped as pid "PID_FMT, getpid_cached()); + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/machine/machined.h b/src/machine/machined.h new file mode 100644 index 0000000..280c32b --- /dev/null +++ b/src/machine/machined.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" +#include "sd-event.h" + +typedef struct Manager Manager; + +#include "hashmap.h" +#include "image-dbus.h" +#include "list.h" +#include "machine-dbus.h" +#include "machine.h" +#include "operation.h" +#include "varlink.h" + +struct Manager { + sd_event 
*event; + sd_bus *bus; + + Hashmap *machines; + Hashmap *machine_units; + Hashmap *machine_leaders; + + Hashmap *polkit_registry; + + Hashmap *image_cache; + sd_event_source *image_cache_defer_event; + + LIST_HEAD(Machine, machine_gc_queue); + + Machine *host_machine; + + LIST_HEAD(Operation, operations); + unsigned n_operations; + +#if ENABLE_NSCD + sd_event_source *nscd_cache_flush_event; +#endif + + VarlinkServer *varlink_server; +}; + +int manager_add_machine(Manager *m, const char *name, Machine **_machine); +int manager_get_machine_by_pid(Manager *m, pid_t pid, Machine **machine); + +extern const BusObjectImplementation manager_object; + +int match_reloading(sd_bus_message *message, void *userdata, sd_bus_error *error); +int match_unit_removed(sd_bus_message *message, void *userdata, sd_bus_error *error); +int match_properties_changed(sd_bus_message *message, void *userdata, sd_bus_error *error); +int match_job_removed(sd_bus_message *message, void *userdata, sd_bus_error *error); + +int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job); +int manager_kill_unit(Manager *manager, const char *unit, int signo, sd_bus_error *error); +int manager_unref_unit(Manager *m, const char *unit, sd_bus_error *error); +int manager_unit_is_active(Manager *manager, const char *unit); +int manager_job_is_active(Manager *manager, const char *path); + +#if ENABLE_NSCD +int manager_enqueue_nscd_cache_flush(Manager *m); +#else +static inline void manager_enqueue_nscd_cache_flush(Manager *m) {} +#endif + +int manager_find_machine_for_uid(Manager *m, uid_t host_uid, Machine **ret_machine, uid_t *ret_internal_uid); +int manager_find_machine_for_gid(Manager *m, gid_t host_gid, Machine **ret_machine, gid_t *ret_internal_gid); diff --git a/src/machine/meson.build b/src/machine/meson.build new file mode 100644 index 0000000..b3a1ffc --- /dev/null +++ b/src/machine/meson.build @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + 
+libmachine_core_sources = files( + 'image-dbus.c', + 'machine-dbus.c', + 'machine.c', + 'machined-core.c', + 'machined-dbus.c', + 'machined-varlink.c', + 'operation.c', +) + +libmachine_core = static_library( + 'machine-core', + libmachine_core_sources, + include_directories : includes, + dependencies : [threads, + userspace], + build_by_default : false) + +executables += [ + libexec_template + { + 'name' : 'systemd-machined', + 'dbus' : true, + 'conditions' : ['ENABLE_MACHINED'], + 'sources' : files('machined.c'), + 'link_with' : [ + libmachine_core, + libshared, + ], + }, + executable_template + { + 'name' : 'machinectl', + 'public' : true, + 'conditions' : ['ENABLE_MACHINED'], + 'sources' : files('machinectl.c'), + 'dependencies' : [ + liblz4, + libxz, + libzstd, + threads, + ], + }, + test_template + { + 'sources' : files('test-machine-tables.c'), + 'link_with': [ + libmachine_core, + libshared + ], + 'dependencies': threads, + }, +] + +if conf.get('ENABLE_MACHINED') == 1 + install_data('org.freedesktop.machine1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.machine1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.machine1.policy', + install_dir : polkitpolicydir) +endif diff --git a/src/machine/operation.c b/src/machine/operation.c new file mode 100644 index 0000000..87c62a9 --- /dev/null +++ b/src/machine/operation.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "operation.h" +#include "process-util.h" + +static int operation_done(sd_event_source *s, const siginfo_t *si, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Operation *o = ASSERT_PTR(userdata); + int r; + + assert(si); + + log_debug("Operation " PID_FMT " is now complete with code=%s status=%i", + o->pid, + sigchld_code_to_string(si->si_code), si->si_status); + + o->pid = 0; + + if (si->si_code != 
CLD_EXITED) { + r = sd_bus_error_set(&error, SD_BUS_ERROR_FAILED, "Child died abnormally."); + goto fail; + } + + if (si->si_status == EXIT_SUCCESS) + r = 0; + else if (read(o->errno_fd, &r, sizeof(r)) != sizeof(r)) { /* Try to acquire error code for failed operation */ + r = sd_bus_error_set(&error, SD_BUS_ERROR_FAILED, "Child failed."); + goto fail; + } + + if (o->done) { + /* A completion routine is set for this operation, call it. */ + r = o->done(o, r, &error); + if (r < 0) { + if (!sd_bus_error_is_set(&error)) + sd_bus_error_set_errno(&error, r); + + goto fail; + } + + } else { + /* The default operation when done is to simply return an error on failure or an empty success + * message on success. */ + if (r < 0) { + sd_bus_error_set_errno(&error, r); + goto fail; + } + + r = sd_bus_reply_method_return(o->message, NULL); + if (r < 0) + log_error_errno(r, "Failed to reply to message: %m"); + } + + operation_free(o); + return 0; + +fail: + r = sd_bus_reply_method_error(o->message, &error); + if (r < 0) + log_error_errno(r, "Failed to reply to message: %m"); + + operation_free(o); + return 0; +} + +int operation_new(Manager *manager, Machine *machine, pid_t child, sd_bus_message *message, int errno_fd, Operation **ret) { + Operation *o; + int r; + + assert(manager); + assert(child > 1); + assert(message); + assert(errno_fd >= 0); + + o = new0(Operation, 1); + if (!o) + return -ENOMEM; + + o->extra_fd = -EBADF; + + r = sd_event_add_child(manager->event, &o->event_source, child, WEXITED, operation_done, o); + if (r < 0) { + free(o); + return r; + } + + o->pid = child; + o->message = sd_bus_message_ref(message); + o->errno_fd = errno_fd; + + LIST_PREPEND(operations, manager->operations, o); + manager->n_operations++; + o->manager = manager; + + if (machine) { + LIST_PREPEND(operations_by_machine, machine->operations, o); + o->machine = machine; + } + + log_debug("Started new operation " PID_FMT ".", child); + + /* At this point we took ownership of both the child 
and the errno file descriptor! */ + + if (ret) + *ret = o; + + return 0; +} + +Operation *operation_free(Operation *o) { + if (!o) + return NULL; + + sd_event_source_unref(o->event_source); + + safe_close(o->errno_fd); + safe_close(o->extra_fd); + + if (o->pid > 1) + (void) sigkill_wait(o->pid); + + sd_bus_message_unref(o->message); + + if (o->manager) { + LIST_REMOVE(operations, o->manager->operations, o); + o->manager->n_operations--; + } + + if (o->machine) + LIST_REMOVE(operations_by_machine, o->machine->operations, o); + + return mfree(o); +} diff --git a/src/machine/operation.h b/src/machine/operation.h new file mode 100644 index 0000000..fd48288 --- /dev/null +++ b/src/machine/operation.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" +#include "sd-event.h" + +#include "list.h" + +typedef struct Operation Operation; + +#include "machined.h" + +#define OPERATIONS_MAX 64 + +struct Operation { + Manager *manager; + Machine *machine; + pid_t pid; + sd_bus_message *message; + int errno_fd; + int extra_fd; + sd_event_source *event_source; + int (*done)(Operation *o, int ret, sd_bus_error *error); + LIST_FIELDS(Operation, operations); + LIST_FIELDS(Operation, operations_by_machine); +}; + +int operation_new(Manager *manager, Machine *machine, pid_t child, sd_bus_message *message, int errno_fd, Operation **ret); +Operation *operation_free(Operation *o); diff --git a/src/machine/org.freedesktop.machine1.conf b/src/machine/org.freedesktop.machine1.conf new file mode 100644 index 0000000..bafc1af --- /dev/null +++ b/src/machine/org.freedesktop.machine1.conf @@ -0,0 +1,242 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/machine/org.freedesktop.machine1.policy 
b/src/machine/org.freedesktop.machine1.policy new file mode 100644 index 0000000..f031e4e --- /dev/null +++ b/src/machine/org.freedesktop.machine1.policy @@ -0,0 +1,104 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Log into a local container + Authentication is required to log into a local container. + + auth_admin + auth_admin + auth_admin_keep + + + + + Log into the local host + Authentication is required to log into the local host. + + auth_admin + auth_admin + yes + + + + + Acquire a shell in a local container + Authentication is required to acquire a shell in a local container. + + auth_admin + auth_admin + auth_admin_keep + + org.freedesktop.login1.login + + + + Acquire a shell on the local host + Authentication is required to acquire a shell on the local host. + + auth_admin + auth_admin + auth_admin_keep + + org.freedesktop.login1.host-login + + + + Acquire a pseudo TTY in a local container + Authentication is required to acquire a pseudo TTY in a local container. + + auth_admin + auth_admin + auth_admin_keep + + + + + Acquire a pseudo TTY on the local host + Authentication is required to acquire a pseudo TTY on the local host. + + auth_admin + auth_admin + auth_admin_keep + + + + + Manage local virtual machines and containers + Authentication is required to manage local virtual machines and containers. + + auth_admin + auth_admin + auth_admin_keep + + org.freedesktop.login1.shell org.freedesktop.login1.login + + + + Manage local virtual machine and container images + Authentication is required to manage local virtual machine and container images. + + auth_admin + auth_admin + auth_admin_keep + + + + diff --git a/src/machine/org.freedesktop.machine1.service b/src/machine/org.freedesktop.machine1.service new file mode 100644 index 0000000..64b73c1 --- /dev/null +++ b/src/machine/org.freedesktop.machine1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. 
+# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.machine1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.machine1.service diff --git a/src/machine/test-machine-tables.c b/src/machine/test-machine-tables.c new file mode 100644 index 0000000..32c5e0e --- /dev/null +++ b/src/machine/test-machine-tables.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "machine.h" +#include "test-tables.h" +#include "tests.h" + +int main(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + test_table(kill_who, KILL_WHO); + test_table(machine_class, MACHINE_CLASS); + test_table(machine_state, MACHINE_STATE); + + return EXIT_SUCCESS; +} diff --git a/src/modules-load/meson.build b/src/modules-load/meson.build new file mode 100644 index 0000000..2f1decc --- /dev/null +++ b/src/modules-load/meson.build @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-modules-load', + 'conditions' : ['HAVE_KMOD'], + 'sources' : files('modules-load.c'), + 'dependencies' : libkmod, + }, +] + +if conf.get('HAVE_KMOD') == 1 + install_emptydir(modulesloaddir) + if install_sysconfdir + install_emptydir(sysconfdir / 'modules-load.d') + endif +endif diff --git a/src/modules-load/modules-load.c b/src/modules-load/modules-load.c new file mode 100644 index 0000000..efca237 --- /dev/null +++ b/src/modules-load/modules-load.c @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "build.h" +#include "conf-files.h" +#include "constants.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "main-func.h" +#include "module-util.h" +#include "pretty-print.h" 
+#include "proc-cmdline.h" +#include "string-util.h" +#include "strv.h" + +static char **arg_proc_cmdline_modules = NULL; +static const char conf_file_dirs[] = CONF_PATHS_NULSTR("modules-load.d"); + +STATIC_DESTRUCTOR_REGISTER(arg_proc_cmdline_modules, strv_freep); + +static void systemd_kmod_log(void *data, int priority, const char *file, int line, + const char *fn, const char *format, va_list args) { + + DISABLE_WARNING_FORMAT_NONLITERAL; + log_internalv(priority, 0, file, line, fn, format, args); + REENABLE_WARNING; +} + +static int add_modules(const char *p) { + _cleanup_strv_free_ char **k = NULL; + + k = strv_split(p, ","); + if (!k) + return log_oom(); + + if (strv_extend_strv(&arg_proc_cmdline_modules, k, true) < 0) + return log_oom(); + + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + int r; + + if (proc_cmdline_key_streq(key, "modules_load")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = add_modules(value); + if (r < 0) + return r; + } + + return 0; +} + +static int apply_file(struct kmod_ctx *ctx, const char *path, bool ignore_enoent) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *pp = NULL; + int r; + + assert(ctx); + assert(path); + + r = search_and_fopen_nulstr(path, "re", NULL, conf_file_dirs, &f, &pp); + if (r < 0) { + if (ignore_enoent && r == -ENOENT) + return 0; + + return log_error_errno(r, "Failed to open %s: %m", path); + } + + log_debug("apply: %s", pp); + for (;;) { + _cleanup_free_ char *line = NULL; + int k; + + k = read_stripped_line(f, LONG_LINE_MAX, &line); + if (k < 0) + return log_error_errno(k, "Failed to read file '%s': %m", pp); + if (k == 0) + break; + + if (isempty(line)) + continue; + if (strchr(COMMENTS, *line)) + continue; + + k = module_load_and_warn(ctx, line, true); + if (k == -ENOENT) + continue; + RET_GATHER(r, k); + } + + return r; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = 
terminal_urlify_man("systemd-modules-load.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [CONFIGURATION FILE...]\n\n" + "Loads statically configured kernel modules.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL; + int r, k; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + log_setup(); + + umask(0022); + + r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, PROC_CMDLINE_STRIP_RD_PREFIX); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + + ctx = kmod_new(NULL, NULL); + if (!ctx) { + log_error("Failed to allocate memory for kmod."); + return -ENOMEM; + } + + kmod_load_resources(ctx); + kmod_set_log_fn(ctx, systemd_kmod_log, NULL); + + r = 0; + + if (argc > optind) { + for (int i = optind; i < argc; i++) + RET_GATHER(r, apply_file(ctx, argv[i], false)); + + } else { + _cleanup_strv_free_ char **files = NULL; + + STRV_FOREACH(i, arg_proc_cmdline_modules) { + k = module_load_and_warn(ctx, *i, true); + if (k == -ENOENT) + continue; + RET_GATHER(r, k); + } + + k = conf_files_list_nulstr(&files, ".conf", NULL, 0, conf_file_dirs); + if (k < 0) + return log_error_errno(k, "Failed to enumerate modules-load.d files: %m"); + + STRV_FOREACH(fn, files) + 
RET_GATHER(r, apply_file(ctx, *fn, true)); + } + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/mount/meson.build b/src/mount/meson.build new file mode 100644 index 0000000..176fb53 --- /dev/null +++ b/src/mount/meson.build @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-mount', + 'public' : true, + 'sources' : files('mount-tool.c'), + 'dependencies' : libmount, + }, +] + +meson.add_install_script(sh, '-c', ln_s.format(bindir / 'systemd-mount', + bindir / 'systemd-umount')) diff --git a/src/mount/mount-tool.c b/src/mount/mount-tool.c new file mode 100644 index 0000000..f626f07 --- /dev/null +++ b/src/mount/mount-tool.c @@ -0,0 +1,1590 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-bus.h" +#include "sd-device.h" + +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-wait-for-jobs.h" +#include "chase.h" +#include "device-util.h" +#include "dirent-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "format-util.h" +#include "fs-util.h" +#include "fstab-util.h" +#include "libmount-util.h" +#include "main-func.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "sort-util.h" +#include "spawn-polkit-agent.h" +#include "stat-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "udev-util.h" +#include "umask-util.h" +#include "unit-def.h" +#include "unit-name.h" +#include "user-util.h" + +enum { + ACTION_DEFAULT, + ACTION_MOUNT, + ACTION_AUTOMOUNT, + ACTION_UMOUNT, + ACTION_LIST, +} arg_action = ACTION_DEFAULT; + +static bool arg_no_block = false; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static bool arg_full = false; 
+static bool arg_ask_password = true; +static bool arg_quiet = false; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; +static const char *arg_host = NULL; +static bool arg_discover = false; +static char *arg_mount_what = NULL; +static char *arg_mount_where = NULL; +static char *arg_mount_type = NULL; +static char *arg_mount_options = NULL; +static char *arg_description = NULL; +static char **arg_property = NULL; +static usec_t arg_timeout_idle = USEC_INFINITY; +static bool arg_timeout_idle_set = false; +static char **arg_automount_property = NULL; +static int arg_bind_device = -1; +static uid_t arg_uid = UID_INVALID; +static gid_t arg_gid = GID_INVALID; +static bool arg_fsck = true; +static bool arg_aggressive_gc = false; +static bool arg_tmpfs = false; + +STATIC_DESTRUCTOR_REGISTER(arg_mount_what, freep); +STATIC_DESTRUCTOR_REGISTER(arg_mount_where, freep); +STATIC_DESTRUCTOR_REGISTER(arg_mount_type, freep); +STATIC_DESTRUCTOR_REGISTER(arg_mount_options, freep); +STATIC_DESTRUCTOR_REGISTER(arg_description, freep); +STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_automount_property, strv_freep); + +static int parse_where(const char *input, char **ret_where) { + int r; + + assert(input); + assert(ret_where); + + if (arg_transport == BUS_TRANSPORT_LOCAL) { + r = chase(input, NULL, CHASE_NONEXISTENT, ret_where, NULL); + if (r < 0) + return log_error_errno(r, "Failed to make path %s absolute: %m", input); + } else { + if (!path_is_absolute(input)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path must be absolute when operating remotely: %s", + input); + + r = path_simplify_alloc(input, ret_where); + if (r < 0) + return log_error_errno(r, "Failed to simplify path %s: %m", input); + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-mount", "1", &link); + if (r < 0) + return 
log_oom(); + + printf("systemd-mount [OPTIONS...] WHAT [WHERE]\n" + "systemd-mount [OPTIONS...] --tmpfs [NAME] WHERE\n" + "systemd-mount [OPTIONS...] --list\n" + "%s [OPTIONS...] %sWHAT|WHERE...\n\n" + "Establish a mount or auto-mount point transiently.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-block Do not wait until operation finished\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers\n" + " -l --full Do not ellipsize output\n" + " --no-ask-password Do not prompt for password\n" + " -q --quiet Suppress information messages during runtime\n" + " --user Run as user unit\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " --discover Discover mount device metadata\n" + " -t --type=TYPE File system type\n" + " -o --options=OPTIONS Mount options\n" + " --owner=USER Add uid= and gid= options for USER\n" + " --fsck=no Don't run file system check before mount\n" + " --description=TEXT Description for unit\n" + " -p --property=NAME=VALUE Set mount unit property\n" + " -A --automount=BOOL Create an auto-mount point\n" + " --timeout-idle-sec=SEC Specify automount idle timeout\n" + " --automount-property=NAME=VALUE\n" + " Set automount unit property\n" + " --bind-device Bind automount unit to device\n" + " --list List mountable block devices\n" + " -u --umount Unmount mount points\n" + " -G --collect Unload unit after it stopped, even when failed\n" + " -T --tmpfs Create a new tmpfs on the mount point\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + streq(program_invocation_short_name, "systemd-umount") ? 
"" : "--umount ", + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_BLOCK, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_NO_ASK_PASSWORD, + ARG_USER, + ARG_SYSTEM, + ARG_DISCOVER, + ARG_MOUNT_TYPE, + ARG_MOUNT_OPTIONS, + ARG_OWNER, + ARG_FSCK, + ARG_DESCRIPTION, + ARG_TIMEOUT_IDLE, + ARG_AUTOMOUNT, + ARG_AUTOMOUNT_PROPERTY, + ARG_BIND_DEVICE, + ARG_LIST, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-block", no_argument, NULL, ARG_NO_BLOCK }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "full", no_argument, NULL, 'l' }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "quiet", no_argument, NULL, 'q' }, + { "user", no_argument, NULL, ARG_USER }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "discover", no_argument, NULL, ARG_DISCOVER }, + { "type", required_argument, NULL, 't' }, + { "options", required_argument, NULL, 'o' }, + { "owner", required_argument, NULL, ARG_OWNER }, + { "fsck", required_argument, NULL, ARG_FSCK }, + { "description", required_argument, NULL, ARG_DESCRIPTION }, + { "property", required_argument, NULL, 'p' }, + { "automount", required_argument, NULL, ARG_AUTOMOUNT }, + { "timeout-idle-sec", required_argument, NULL, ARG_TIMEOUT_IDLE }, + { "automount-property", required_argument, NULL, ARG_AUTOMOUNT_PROPERTY }, + { "bind-device", no_argument, NULL, ARG_BIND_DEVICE }, + { "list", no_argument, NULL, ARG_LIST }, + { "umount", no_argument, NULL, 'u' }, + { "unmount", no_argument, NULL, 'u' }, /* Compat spelling */ + { "collect", no_argument, NULL, 'G' }, + { "tmpfs", no_argument, NULL, 'T' }, + {}, + }; + + int r, c; + + assert(argc >= 0); + assert(argv); + + if (invoked_as(argv, "systemd-umount")) + arg_action = 
ACTION_UMOUNT; + + while ((c = getopt_long(argc, argv, "hqH:M:t:o:p:AuGlT", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_BLOCK: + arg_no_block = true; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case 'l': + arg_full = true; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case 'q': + arg_quiet = true; + break; + + case ARG_USER: + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + + case ARG_SYSTEM: + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case ARG_DISCOVER: + arg_discover = true; + break; + + case 't': + r = free_and_strdup_warn(&arg_mount_type, optarg); + if (r < 0) + return r; + break; + + case 'o': + r = free_and_strdup_warn(&arg_mount_options, optarg); + if (r < 0) + return r; + break; + + case ARG_OWNER: { + const char *user = optarg; + + r = get_user_creds(&user, &arg_uid, &arg_gid, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, + r == -EBADMSG ? "UID or GID of user %s are invalid." + : "Cannot use \"%s\" as owner: %m", + optarg); + break; + } + + case ARG_FSCK: + r = parse_boolean_argument("--fsck=", optarg, &arg_fsck); + if (r < 0) + return r; + break; + + case ARG_DESCRIPTION: + r = free_and_strdup_warn(&arg_description, optarg); + if (r < 0) + return r; + break; + + case 'p': + if (strv_extend(&arg_property, optarg) < 0) + return log_oom(); + + break; + + case 'A': + arg_action = ACTION_AUTOMOUNT; + break; + + case ARG_AUTOMOUNT: + r = parse_boolean_argument("--automount=", optarg, NULL); + if (r < 0) + return r; + + arg_action = r ? 
ACTION_AUTOMOUNT : ACTION_MOUNT; + break; + + case ARG_TIMEOUT_IDLE: + r = parse_sec(optarg, &arg_timeout_idle); + if (r < 0) + return log_error_errno(r, "Failed to parse timeout: %s", optarg); + + break; + + case ARG_AUTOMOUNT_PROPERTY: + if (strv_extend(&arg_automount_property, optarg) < 0) + return log_oom(); + + break; + + case ARG_BIND_DEVICE: + arg_bind_device = true; + break; + + case ARG_LIST: + arg_action = ACTION_LIST; + break; + + case 'u': + arg_action = ACTION_UMOUNT; + break; + + case 'G': + arg_aggressive_gc = true; + break; + + case 'T': + arg_tmpfs = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_runtime_scope == RUNTIME_SCOPE_USER) { + arg_ask_password = false; + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Execution in user context is not supported on non-local systems."); + } + + if (arg_action == ACTION_LIST) { + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many arguments."); + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Listing devices only supported locally."); + } else if (arg_action == ACTION_UMOUNT) { + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "At least one argument required."); + + if (arg_transport != BUS_TRANSPORT_LOCAL) + for (int i = optind; i < argc; i++) + if (!path_is_absolute(argv[i])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path must be absolute when operating remotely: %s", + argv[i]); + } else { + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "At least one argument required."); + + if (argc > optind+2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "More than two arguments are not allowed."); + + if (arg_tmpfs) { + if (argc <= optind+1) { + arg_mount_what = strdup("tmpfs"); + if (!arg_mount_what) + return log_oom(); + + r = parse_where(argv[optind], 
&arg_mount_where); + if (r < 0) + return r; + } else { + arg_mount_what = strdup(argv[optind]); + if (!arg_mount_what) + return log_oom(); + } + + if (!strv_contains(arg_property, "Type=tmpfs") && + strv_extend(&arg_property, "Type=tmpfs") < 0) + return log_oom(); + } else { + if (arg_mount_type && !fstype_is_blockdev_backed(arg_mount_type)) { + arg_mount_what = strdup(argv[optind]); + if (!arg_mount_what) + return log_oom(); + + } else if (arg_transport == BUS_TRANSPORT_LOCAL) { + _cleanup_free_ char *u = NULL; + + u = fstab_node_to_udev_node(argv[optind]); + if (!u) + return log_oom(); + + r = chase(u, NULL, 0, &arg_mount_what, NULL); + if (r < 0) + return log_error_errno(r, "Failed to make path %s absolute: %m", u); + + } else { + if (!path_is_absolute(argv[optind])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path must be absolute when operating remotely: %s", + argv[optind]); + + r = path_simplify_alloc(argv[optind], &arg_mount_what); + if (r < 0) + return log_error_errno(r, "Failed to simplify path: %m"); + } + } + + if (argc > optind+1) { + r = parse_where(argv[optind+1], &arg_mount_where); + if (r < 0) + return r; + } else if (!arg_tmpfs) + arg_discover = true; + + if (arg_discover && arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Automatic mount location discovery is only supported locally."); + } + + return 1; +} + +static int transient_unit_set_properties(sd_bus_message *m, UnitType t, char **properties) { + int r; + + if (!isempty(arg_description)) { + r = sd_bus_message_append(m, "(sv)", "Description", "s", arg_description); + if (r < 0) + return r; + } + + if (arg_bind_device && is_device_path(arg_mount_what)) { + _cleanup_free_ char *device_unit = NULL; + + r = unit_name_from_path(arg_mount_what, ".device", &device_unit); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "(sv)(sv)", + "After", "as", 1, device_unit, + "BindsTo", "as", 1, device_unit); + if (r < 0) + return r; + } + + if 
(arg_aggressive_gc) { + r = sd_bus_message_append(m, "(sv)", "CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return r; + } + + r = bus_append_unit_property_assignment_many(m, t, properties); + if (r < 0) + return r; + + return 0; +} + +static int transient_mount_set_properties(sd_bus_message *m) { + int r; + + assert(m); + + r = transient_unit_set_properties(m, UNIT_MOUNT, arg_property); + if (r < 0) + return r; + + if (arg_mount_what) { + r = sd_bus_message_append(m, "(sv)", "What", "s", arg_mount_what); + if (r < 0) + return r; + } + + if (arg_mount_type) { + r = sd_bus_message_append(m, "(sv)", "Type", "s", arg_mount_type); + if (r < 0) + return r; + } + + _cleanup_free_ char *options = NULL; + + /* Prepend uid=…,gid=… if arg_uid is set */ + if (arg_uid != UID_INVALID) { + r = strextendf_with_separator(&options, ",", + "uid="UID_FMT",gid="GID_FMT, arg_uid, arg_gid); + if (r < 0) + return r; + } + + /* Override the default for tmpfs mounts. The kernel sets the sticky bit on the root directory by + * default. This makes sense for the case when the user does 'mount -t tmpfs tmpfs /tmp', but less so + * for other directories. + * + * Let's also set some reasonable limits. We use the current umask, to match what a command to create + * directory would use, e.g. mkdir. 
*/ + if (arg_tmpfs) { + mode_t mask; + + r = get_process_umask(0, &mask); + if (r < 0) + return r; + + assert((mask & ~0777) == 0); + r = strextendf_with_separator(&options, ",", + "mode=0%o,nodev,nosuid%s", 0777 & ~mask, NESTED_TMPFS_LIMITS); + if (r < 0) + return r; + } + + if (arg_mount_options) + if (!strextend_with_separator(&options, ",", arg_mount_options)) + return -ENOMEM; + + if (options) { + log_debug("Using mount options: %s", options); + r = sd_bus_message_append(m, "(sv)", "Options", "s", options); + if (r < 0) + return r; + } else + log_debug("Not using any mount options"); + + if (arg_fsck) { + _cleanup_free_ char *fsck = NULL; + + r = unit_name_from_path_instance("systemd-fsck", arg_mount_what, ".service", &fsck); + if (r < 0) + return r; + + r = sd_bus_message_append(m, + "(sv)(sv)", + "Requires", "as", 1, fsck, + "After", "as", 1, fsck); + if (r < 0) + return r; + } + + return 0; +} + +static int transient_automount_set_properties(sd_bus_message *m) { + int r; + + assert(m); + + r = transient_unit_set_properties(m, UNIT_AUTOMOUNT, arg_automount_property); + if (r < 0) + return r; + + if (arg_timeout_idle != USEC_INFINITY) { + r = sd_bus_message_append(m, "(sv)", "TimeoutIdleUSec", "t", arg_timeout_idle); + if (r < 0) + return r; + } + + return 0; +} + +static int start_transient_mount( + sd_bus *bus, + char **argv) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_free_ char *mount_unit = NULL; + int r; + + if (!arg_no_block) { + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + r = unit_name_from_path(arg_mount_where, ".mount", &mount_unit); + if (r < 0) + return log_error_errno(r, "Failed to make mount unit name: %m"); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r 
< 0) + return bus_log_create_error(r); + + r = sd_bus_message_set_allow_interactive_authorization(m, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Name and mode */ + r = sd_bus_message_append(m, "ss", mount_unit, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = transient_mount_set_properties(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* Auxiliary units */ + r = sd_bus_message_append(m, "a(sa(sv))", 0); + if (r < 0) + return bus_log_create_error(r); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to start transient mount unit: %s", bus_error_message(&error, r)); + + if (w) { + const char *object; + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, arg_quiet, NULL); + if (r < 0) + return r; + } + + if (!arg_quiet) + log_info("Started unit %s%s%s for mount point: %s%s%s", + ansi_highlight(), mount_unit, ansi_normal(), + ansi_highlight(), arg_mount_where, ansi_normal()); + + return 0; +} + +static int start_transient_automount( + sd_bus *bus, + char **argv) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_free_ char *automount_unit = NULL, *mount_unit = NULL; + int r; + + if (!arg_no_block) { + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + r = unit_name_from_path(arg_mount_where, ".automount", &automount_unit); + if (r < 0) + return log_error_errno(r, "Failed to make 
automount unit name: %m"); + + r = unit_name_from_path(arg_mount_where, ".mount", &mount_unit); + if (r < 0) + return log_error_errno(r, "Failed to make mount unit name: %m"); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_set_allow_interactive_authorization(m, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Name and mode */ + r = sd_bus_message_append(m, "ss", automount_unit, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = transient_automount_set_properties(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* Auxiliary units */ + r = sd_bus_message_open_container(m, 'a', "(sa(sv))"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'r', "sa(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", mount_unit); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = transient_mount_set_properties(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to start transient automount unit: %s", bus_error_message(&error, r)); + + if (w) { + const char *object; + + r = sd_bus_message_read(reply, "o", &object); + if (r < 
0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, arg_quiet, NULL); + if (r < 0) + return r; + } + + if (!arg_quiet) + log_info("Started unit %s%s%s for mount point: %s%s%s", + ansi_highlight(), automount_unit, ansi_normal(), + ansi_highlight(), arg_mount_where, ansi_normal()); + + return 0; +} + +static int find_mount_points(const char *what, char ***list) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + _cleanup_strv_free_ char **l = NULL; + size_t n = 0; + int r; + + assert(what); + assert(list); + + /* Returns all mount points obtained from /proc/self/mountinfo in *list, + * and the number of mount points as return value. */ + + r = libmount_parse(NULL, NULL, &table, &iter); + if (r < 0) + return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + struct libmnt_fs *fs; + const char *source, *target; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + + source = mnt_fs_get_source(fs); + target = mnt_fs_get_target(fs); + if (!source || !target) + continue; + + if (!path_equal(source, what)) + continue; + + /* one extra slot is needed for the terminating NULL */ + if (!GREEDY_REALLOC0(l, n + 2)) + return log_oom(); + + l[n] = strdup(target); + if (!l[n]) + return log_oom(); + n++; + } + + if (!GREEDY_REALLOC0(l, n + 1)) + return log_oom(); + + *list = TAKE_PTR(l); + return n; +} + +static int find_loop_device(const char *backing_file, sd_device **ret) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(backing_file); + assert(ret); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return log_oom(); + + r = sd_device_enumerator_add_match_subsystem(e, "block", /* match = */ true); + if (r < 0) + return log_error_errno(r, "Failed to add subsystem match: %m"); + + r = 
sd_device_enumerator_add_match_property(e, "ID_FS_USAGE", "filesystem"); + if (r < 0) + return log_error_errno(r, "Failed to add property match: %m"); + + r = sd_device_enumerator_add_match_sysname(e, "loop*"); + if (r < 0) + return log_error_errno(r, "Failed to add sysname match: %m"); + + r = sd_device_enumerator_add_match_sysattr(e, "loop/backing_file", /* value = */ NULL, /* match = */ true); + if (r < 0) + return log_error_errno(r, "Failed to add sysattr match: %m"); + + FOREACH_DEVICE(e, dev) { + const char *s; + + r = sd_device_get_sysattr_value(dev, "loop/backing_file", &s); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to read \"loop/backing_file\" sysattr, ignoring: %m"); + continue; + } + + if (inode_same(s, backing_file, 0) <= 0) + continue; + + *ret = sd_device_ref(dev); + return 0; + } + + return -ENXIO; +} + +static int stop_mount( + sd_bus *bus, + const char *where, + const char *suffix) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_free_ char *mount_unit = NULL; + int r; + + if (!arg_no_block) { + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + r = unit_name_from_path(where, suffix, &mount_unit); + if (r < 0) + return log_error_errno(r, "Failed to make %s unit name from path %s: %m", suffix + 1, where); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StopUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_set_allow_interactive_authorization(m, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Name and mode */ + r = sd_bus_message_append(m, "ss", mount_unit, "fail"); + if (r < 0) + return bus_log_create_error(r); + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 
0) {
+                /* A missing .automount unit is not an error: there is simply nothing to stop. */
+                if (streq(suffix, ".automount") &&
+                    sd_bus_error_has_name(&error, "org.freedesktop.systemd1.NoSuchUnit"))
+                        return 0;
+                return log_error_errno(r, "Failed to stop %s unit: %s", suffix + 1, bus_error_message(&error, r));
+        }
+
+        if (w) {
+                const char *object;
+
+                r = sd_bus_message_read(reply, "o", &object);
+                if (r < 0)
+                        return bus_log_parse_error(r);
+
+                r = bus_wait_for_jobs_one(w, object, arg_quiet, NULL);
+                if (r < 0)
+                        return r;
+        }
+
+        if (!arg_quiet)
+                log_info("Stopped unit %s%s%s for mount point: %s%s%s",
+                         ansi_highlight(), mount_unit, ansi_normal(),
+                         ansi_highlight(), where, ansi_normal());
+
+        return 0;
+}
+
+/* Stops the .mount unit and then the .automount unit derived from the path 'where'.
+ *
+ * The root directory and paths that are not normalized are refused with -EINVAL. Returns 0 on
+ * success, or the negative error of the first stop request that failed. */
+static int stop_mounts(
+                sd_bus *bus,
+                const char *where) {
+
+        int r;
+
+        if (path_equal(where, "/"))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Refusing to operate on root directory: %s", where);
+
+        if (!path_is_normalized(where))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Path contains non-normalized components: %s", where);
+
+        r = stop_mount(bus, where, ".mount");
+        if (r < 0)
+                return r;
+
+        r = stop_mount(bus, where, ".automount");
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Stops every mount that belongs to the block device 'dev'.
+ *
+ * First the mount point named by the SYSTEMD_MOUNT_WHERE device property (if set) is stopped, then
+ * every mount point that find_mount_points() reports for the device node. Failures of individual
+ * mount points do not abort the loop; the last negative error seen is returned, 0 if there was none. */
+static int umount_by_device(sd_bus *bus, sd_device *dev) {
+        _cleanup_strv_free_ char **list = NULL;
+        const char *v;
+        int r, ret = 0;
+
+        assert(bus);
+        assert(dev);
+
+        if (sd_device_get_property_value(dev, "SYSTEMD_MOUNT_WHERE", &v) >= 0)
+                ret = stop_mounts(bus, v);
+
+        r = sd_device_get_devname(dev, &v);
+        if (r < 0)
+                /* Log here, otherwise the tool would fail without telling the user why. */
+                return log_device_error_errno(dev, r, "Failed to get device node: %m");
+
+        /* find_mount_points() logs its own failures. */
+        r = find_mount_points(v, &list);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(l, list) {
+                r = stop_mounts(bus, *l);
+                if (r < 0)
+                        ret = r;
+        }
+
+        return ret;
+}
+
+static int umount_by_device_node(sd_bus *bus, const char *node) {
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        const char *v;
+        int r;
+
+        assert(bus);
+        assert(node);
+
+        r = sd_device_new_from_devname(&dev, node);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get device from %s: %m", node);
+
+        r = 
sd_device_get_property_value(dev, "ID_FS_USAGE", &v);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to get \"ID_FS_USAGE\" device property: %m");
+
+        if (!streq(v, "filesystem"))
+                return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s does not contain a known file system.", node);
+
+        return umount_by_device(bus, dev);
+}
+
+/* Stops all mounts of the loop device that is backed by the regular file 'backing_file'.
+ *
+ * find_loop_device() returning -ENXIO means that no loop device uses the file, which is reported as
+ * "not mounted". */
+static int umount_loop(sd_bus *bus, const char *backing_file) {
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        int r;
+
+        assert(backing_file);
+
+        r = find_loop_device(backing_file, &dev);
+        if (r < 0)
+                return log_error_errno(r, r == -ENXIO ? "File %s is not mounted." : "Can't get loop device for %s: %m", backing_file);
+
+        return umount_by_device(bus, dev);
+}
+
+/* Implements the umount action for all positional arguments argv[optind..argc-1].
+ *
+ * Remote transport: each argument is simplified and the units derived from it are stopped directly,
+ * since the remote file system cannot be inspected from here.
+ *
+ * Local transport: each argument is resolved (fstab-style node spec, then chase()) and dispatched on
+ * its file type: block device -> umount_by_device_node(), regular file -> umount_loop(),
+ * directory -> stop_mounts(); any other type is rejected with -EINVAL.
+ *
+ * Failures of individual arguments are remembered and the remaining arguments are still processed;
+ * the last such error is returned. Allocation, path simplification and stat() failures abort
+ * immediately. Returns 0 if everything succeeded. */
+static int action_umount(
+                sd_bus *bus,
+                int argc,
+                char **argv) {
+
+        int r, r2 = 0;
+
+        if (arg_transport != BUS_TRANSPORT_LOCAL) {
+                for (int i = optind; i < argc; i++) {
+                        _cleanup_free_ char *p = NULL;
+
+                        r = path_simplify_alloc(argv[i], &p);
+                        if (r < 0)
+                                /* Same message as parse_argv() uses for this call; do not fail silently. */
+                                return log_error_errno(r, "Failed to simplify path: %m");
+
+                        r = stop_mounts(bus, p);
+                        if (r < 0)
+                                r2 = r;
+                }
+                return r2;
+        }
+
+        for (int i = optind; i < argc; i++) {
+                _cleanup_free_ char *u = NULL, *p = NULL;
+                struct stat st;
+
+                u = fstab_node_to_udev_node(argv[i]);
+                if (!u)
+                        return log_oom();
+
+                r = chase(u, NULL, 0, &p, NULL);
+                if (r < 0) {
+                        r2 = log_error_errno(r, "Failed to make path %s absolute: %m", argv[i]);
+                        continue;
+                }
+
+                if (stat(p, &st) < 0)
+                        return log_error_errno(errno, "Can't stat %s (from %s): %m", p, argv[i]);
+
+                if (S_ISBLK(st.st_mode))
+                        r = umount_by_device_node(bus, p);
+                else if (S_ISREG(st.st_mode))
+                        r = umount_loop(bus, p);
+                else if (S_ISDIR(st.st_mode))
+                        r = stop_mounts(bus, p);
+                else {
+                        log_error("Invalid file type: %s (from %s)", p, argv[i]);
+                        r = -EINVAL;
+                }
+
+                if (r < 0)
+                        r2 = r;
+        }
+
+        return r2;
+}
+
+static int acquire_mount_type(sd_device *d) {
+        const char *v;
+
+        assert(d);
+
+        if (arg_mount_type)
+                return 0;
+
+        if (sd_device_get_property_value(d, "ID_FS_TYPE", &v) < 
0)
+                return 0;
+
+        arg_mount_type = strdup(v);
+        if (!arg_mount_type)
+                return log_oom();
+
+        log_debug("Discovered type=%s", arg_mount_type);
+        return 1;
+}
+
+/* Fills in arg_mount_options from the SYSTEMD_MOUNT_OPTIONS device property, unless options were
+ * already set. Returns 1 if options were discovered, 0 if nothing was changed, negative on OOM. */
+static int acquire_mount_options(sd_device *d) {
+        const char *v;
+
+        assert(d);
+
+        if (arg_mount_options)
+                return 0;
+
+        if (sd_device_get_property_value(d, "SYSTEMD_MOUNT_OPTIONS", &v) < 0)
+                return 0;
+
+        arg_mount_options = strdup(v);
+        if (!arg_mount_options)
+                return log_oom();
+
+        log_debug("Discovered options=%s", arg_mount_options);
+        return 1;
+}
+
+/* Returns a human-readable label for the device: the ID_FS_LABEL property if set, otherwise the
+ * ID_PART_ENTRY_NAME property, otherwise NULL. */
+static const char* get_label(sd_device *d) {
+        const char *label;
+
+        assert(d);
+
+        if (sd_device_get_property_value(d, "ID_FS_LABEL", &label) >= 0)
+                return label;
+
+        if (sd_device_get_property_value(d, "ID_PART_ENTRY_NAME", &label) >= 0)
+                return label;
+
+        return NULL;
+}
+
+/* Fills in arg_mount_where for the device, unless a mount point was already set.
+ *
+ * The SYSTEMD_MOUNT_WHERE device property is used verbatim if present. Otherwise the mount point
+ * becomes /run/media/system/<name>, where <name> is the first available of: the label (see
+ * get_label()), the model string, the file name of the device node. The name is escaped with
+ * xescape() first.
+ *
+ * Returns 1 if a mount point was discovered, 0 if nothing was changed (already set, no usable name,
+ * or the escaped name is not a valid file name), negative on error. */
+static int acquire_mount_where(sd_device *d) {
+        const char *v;
+        int r;
+
+        assert(d);
+
+        if (arg_mount_where)
+                return 0;
+
+        if (sd_device_get_property_value(d, "SYSTEMD_MOUNT_WHERE", &v) < 0) {
+                _cleanup_free_ char *escaped = NULL, *devname_bn = NULL;
+                const char *name;
+
+                name = get_label(d);
+                if (!name)
+                        (void) device_get_model_string(d, &name);
+                if (!name) {
+                        const char *dn;
+
+                        if (sd_device_get_devname(d, &dn) < 0)
+                                return 0;
+
+                        r = path_extract_filename(dn, &devname_bn);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to extract file name from '%s': %m", dn);
+
+                        name = devname_bn;
+                }
+
+                escaped = xescape(name, "\\");
+                if (!escaped)
+                        return log_oom();
+                if (!filename_is_valid(escaped))
+                        return 0;
+
+                arg_mount_where = path_join("/run/media/system", escaped);
+        } else
+                arg_mount_where = strdup(v);
+
+        /* Covers both branches above: path_join() and strdup() return NULL on allocation failure. */
+        if (!arg_mount_where)
+                return log_oom();
+
+        log_debug("Discovered where=%s", arg_mount_where);
+        return 1;
+}
+
+static int acquire_mount_where_for_loop_dev(sd_device *dev) {
+        _cleanup_strv_free_ char **list = NULL;
+        const char *node;
+        int r;
+
+        assert(dev);
+
+        if (arg_mount_where)
+                return 0;
+
+        r = sd_device_get_devname(dev, &node);
+        if (r < 0)
+                return r; 
+ + r = find_mount_points(node, &list); + if (r < 0) + return r; + if (r == 0) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "Can't find mount point of %s. It is expected that %s is already mounted on a place.", + node, node); + if (r >= 2) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "%s is mounted on %d places. It is expected that %s is mounted on a place.", + node, r, node); + + arg_mount_where = strdup(list[0]); + if (!arg_mount_where) + return log_oom(); + + log_debug("Discovered where=%s", arg_mount_where); + return 1; +} + +static int acquire_description(sd_device *d) { + const char *model = NULL, *label; + + if (arg_description) + return 0; + + (void) device_get_model_string(d, &model); + + label = get_label(d); + if (!label) + (void) sd_device_get_property_value(d, "ID_PART_ENTRY_NUMBER", &label); + + if (model && label) + arg_description = strjoin(model, " ", label); + else if (label) + arg_description = strdup(label); + else if (model) + arg_description = strdup(model); + else + return 0; + + if (!arg_description) + return log_oom(); + + log_debug("Discovered description=%s", arg_description); + return 1; +} + +static int acquire_removable(sd_device *d) { + const char *v; + + /* Shortcut this if there's no reason to check it */ + if (arg_action != ACTION_DEFAULT && arg_timeout_idle_set && arg_bind_device >= 0) + return 0; + + for (;;) { + if (sd_device_get_sysattr_value(d, "removable", &v) >= 0) + break; + + if (sd_device_get_parent(d, &d) < 0) + return 0; + + if (sd_device_get_subsystem(d, &v) < 0 || !streq(v, "block")) + return 0; + } + + if (parse_boolean(v) <= 0) + return 0; + + log_debug("Discovered removable device."); + + if (arg_action == ACTION_DEFAULT) { + log_debug("Automatically turning on automount."); + arg_action = ACTION_AUTOMOUNT; + } + + if (!arg_timeout_idle_set) { + log_debug("Setting idle timeout to 1s."); + arg_timeout_idle = USEC_PER_SEC; + } + + if (arg_bind_device < 0) { + log_debug("Binding 
automount unit to device."); + arg_bind_device = true; + } + + return 1; +} + +static int discover_loop_backing_file(void) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + int r; + + r = find_loop_device(arg_mount_what, &d); + if (r < 0 && r != -ENXIO) + return log_error_errno(errno, "Can't get loop device for %s: %m", arg_mount_what); + + if (r == -ENXIO) { + _cleanup_free_ char *escaped = NULL, *bn = NULL; + + if (arg_mount_where) + return 0; + + r = path_extract_filename(arg_mount_what, &bn); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from backing file path '%s': %m", arg_mount_what); + + escaped = xescape(bn, "\\"); + if (!escaped) + return log_oom(); + if (!filename_is_valid(escaped)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Escaped name %s is not a valid filename.", + escaped); + + arg_mount_where = path_join("/run/media/system", escaped); + if (!arg_mount_where) + return log_oom(); + + log_debug("Discovered where=%s", arg_mount_where); + return 0; + } + + r = acquire_mount_type(d); + if (r < 0) + return r; + + r = acquire_mount_options(d); + if (r < 0) + return r; + + r = acquire_mount_where_for_loop_dev(d); + if (r < 0) + return r; + + r = acquire_description(d); + if (r < 0) + return r; + + return 0; +} + +static int discover_device(void) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + struct stat st; + const char *v; + int r; + + if (stat(arg_mount_what, &st) < 0) + return log_error_errno(errno, "Can't stat %s: %m", arg_mount_what); + + if (S_ISREG(st.st_mode)) + return discover_loop_backing_file(); + + if (!S_ISBLK(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid file type: %s", + arg_mount_what); + + r = sd_device_new_from_stat_rdev(&d, &st); + if (r < 0) + return log_error_errno(r, "Failed to get device from device number: %m"); + + if (sd_device_get_property_value(d, "ID_FS_USAGE", &v) < 0 || !streq(v, "filesystem")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s 
does not contain a known file system.", + arg_mount_what); + + r = acquire_mount_type(d); + if (r < 0) + return r; + + r = acquire_mount_options(d); + if (r < 0) + return r; + + r = acquire_mount_where(d); + if (r < 0) + return r; + + r = acquire_description(d); + if (r < 0) + return r; + + r = acquire_removable(d); + if (r < 0) + return r; + + return 0; +} + +static int list_devices(void) { + enum { + COLUMN_NODE, + COLUMN_PATH, + COLUMN_MODEL, + COLUMN_WWN, + COLUMN_FSTYPE, + COLUMN_LABEL, + COLUMN_UUID, + _COLUMN_MAX, + }; + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return log_oom(); + + r = sd_device_enumerator_add_match_subsystem(e, "block", true); + if (r < 0) + return log_error_errno(r, "Failed to add block match: %m"); + + r = sd_device_enumerator_add_match_property(e, "ID_FS_USAGE", "filesystem"); + if (r < 0) + return log_error_errno(r, "Failed to add property match: %m"); + + table = table_new("NODE", "PATH", "MODEL", "WWN", "FSTYPE", "LABEL", "UUID"); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + r = table_set_sort(table, (size_t) 0); + if (r < 0) + return log_error_errno(r, "Failed to set sort index: %m"); + + table_set_header(table, arg_legend); + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + FOREACH_DEVICE(e, d) { + for (unsigned c = 0; c < _COLUMN_MAX; c++) { + const char *x = NULL; + + switch (c) { + + case COLUMN_NODE: + (void) sd_device_get_devname(d, &x); + break; + + case COLUMN_PATH: + (void) sd_device_get_property_value(d, "ID_PATH", &x); + break; + + case COLUMN_MODEL: + (void) device_get_model_string(d, &x); + break; + + case COLUMN_WWN: + (void) sd_device_get_property_value(d, "ID_WWN", &x); + break; + + case COLUMN_FSTYPE: + (void) sd_device_get_property_value(d, "ID_FS_TYPE", &x); + break; + + case COLUMN_LABEL: + x = get_label(d); + break; + + case 
COLUMN_UUID: + (void) sd_device_get_property_value(d, "ID_FS_UUID", &x); + break; + } + + r = table_add_cell(table, NULL, c == COLUMN_NODE ? TABLE_PATH : TABLE_STRING, x); + if (r < 0) + return table_log_add_error(r); + } + } + + pager_open(arg_pager_flags); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +static int run(int argc, char* argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_action == ACTION_LIST) + return list_devices(); + + r = bus_connect_transport_systemd(arg_transport, arg_host, arg_runtime_scope, &bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + if (arg_action == ACTION_UMOUNT) + return action_umount(bus, argc, argv); + + if ((!arg_mount_type || fstype_is_blockdev_backed(arg_mount_type)) + && !path_is_normalized(arg_mount_what)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path contains non-normalized components: %s", + arg_mount_what); + + if (arg_discover) { + r = discover_device(); + if (r < 0) + return r; + } + + if (!arg_mount_where) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Can't figure out where to mount %s.", + arg_mount_what); + + if (path_equal(arg_mount_where, "/")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Refusing to operate on root directory."); + + if (!path_is_normalized(arg_mount_where)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path contains non-normalized components: %s", + arg_mount_where); + + if (streq_ptr(arg_mount_type, "auto")) + arg_mount_type = mfree(arg_mount_type); + if (streq_ptr(arg_mount_options, "defaults")) + arg_mount_options = mfree(arg_mount_options); + + if (!is_device_path(arg_mount_what)) + arg_fsck = false; + + if (arg_fsck && arg_mount_type && arg_transport == BUS_TRANSPORT_LOCAL) { + r = fsck_exists_for_fstype(arg_mount_type); + 
if (r < 0) + log_warning_errno(r, "Couldn't determine whether fsck for %s exists, proceeding anyway.", arg_mount_type); + else if (r == 0) { + log_debug("Disabling file system check as fsck for %s doesn't exist.", arg_mount_type); + arg_fsck = false; /* fsck doesn't exist, let's not attempt it */ + } + } + + /* The kernel (properly) refuses mounting file systems with unknown uid=,gid= options, + * but not for all filesystem types. Let's try to catch the cases where the option + * would be used if the file system does not support it. It is also possible to + * autodetect the file system, but that's only possible with disk-based file systems + * which incidentally seem to be implemented more carefully and reject unknown options, + * so it's probably OK that we do the check only when the type is specified. + */ + if (arg_mount_type && + !streq(arg_mount_type, "auto") && + arg_uid != UID_INVALID && + !fstype_can_uid_gid(arg_mount_type)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "File system type %s is not known to support uid=/gid=, refusing.", + arg_mount_type); + + switch (arg_action) { + + case ACTION_MOUNT: + case ACTION_DEFAULT: + r = start_transient_mount(bus, argv + optind); + break; + + case ACTION_AUTOMOUNT: + r = start_transient_automount(bus, argv + optind); + break; + + default: + assert_not_reached(); + } + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/network/fuzz-netdev-parser.c b/src/network/fuzz-netdev-parser.c new file mode 100644 index 0000000..f0988bd --- /dev/null +++ b/src/network/fuzz-netdev-parser.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fs-util.h" +#include "fuzz.h" +#include "networkd-manager.h" +#include "tmpfile-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(manager_freep) Manager *manager = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(unlink_tempfilep) char netdev_config[] = "/tmp/fuzz-networkd.XXXXXX"; 
+ + if (outside_size_range(size, 0, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(fmkostemp_safe(netdev_config, "r+", &f) == 0); + if (size != 0) + assert_se(fwrite(data, size, 1, f) == 1); + + fflush(f); + assert_se(manager_new(&manager, /* test_mode = */ true) >= 0); + (void) netdev_load_one(manager, netdev_config); + return 0; +} diff --git a/src/network/fuzz-netdev-parser.options b/src/network/fuzz-netdev-parser.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/network/fuzz-netdev-parser.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/network/fuzz-network-parser.c b/src/network/fuzz-network-parser.c new file mode 100644 index 0000000..eb17f09 --- /dev/null +++ b/src/network/fuzz-network-parser.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fs-util.h" +#include "fuzz.h" +#include "networkd-manager.h" +#include "tmpfile-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(manager_freep) Manager *manager = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(unlink_tempfilep) char network_config[] = "/tmp/fuzz-networkd.XXXXXX"; + + if (outside_size_range(size, 0, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(fmkostemp_safe(network_config, "r+", &f) == 0); + if (size != 0) + assert_se(fwrite(data, size, 1, f) == 1); + + fflush(f); + assert_se(manager_new(&manager, /* test_mode = */ true) >= 0); + (void) network_load_one(manager, &manager->networks, network_config); + return 0; +} diff --git a/src/network/fuzz-network-parser.options b/src/network/fuzz-network-parser.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/network/fuzz-network-parser.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/network/generator/main.c b/src/network/generator/main.c new file mode 100644 index 0000000..0439a9d --- /dev/null +++ b/src/network/generator/main.c @@ -0,0 +1,218 @@ 
+/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <getopt.h> + +#include "build.h" +#include "fd-util.h" +#include "fs-util.h" +#include "generator.h" +#include "macro.h" +#include "main-func.h" +#include "mkdir.h" +#include "network-generator.h" +#include "path-util.h" +#include "proc-cmdline.h" + +/* Directory that receives the generated files; context_save() prefixes it with arg_root. */ #define NETWORKD_UNIT_DIRECTORY "/run/systemd/network" + +/* Set by --root=; NULL when the option is not given. */ static const char *arg_root = NULL; + +/* Writes one Network to dest_dir. The content is dumped into a temporary file which is then moved to "70-IFNAME.network", or to "71-default.network" when the network has no interface name. Returns 0 on success or a negative errno-style value; on failure the temp_path cleanup handler (unlink_and_freep) is still armed. */ static int network_save(Network *network, const char *dest_dir) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(network); + + r = generator_open_unit_file_full(dest_dir, NULL, NULL, &f, &temp_path); + if (r < 0) + return r; + + network_dump(network, f); + + if (asprintf(&p, "%s/%s-%s.network", + dest_dir, + isempty(network->ifname) ? "71" : "70", + isempty(network->ifname) ? "default" : network->ifname) < 0) + return log_oom(); + + r = conservative_rename(temp_path, p); + if (r < 0) + return r; + + /* the temporary name no longer needs cleanup: disarm unlink_and_freep */ temp_path = mfree(temp_path); + return 0; +} + +/* Writes one NetDev to dest_dir as "70-IFNAME.netdev", using the same temporary-file-then-rename sequence as network_save(). Returns 0 or a negative errno-style value. */ static int netdev_save(NetDev *netdev, const char *dest_dir) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(netdev); + + r = generator_open_unit_file_full(dest_dir, NULL, NULL, &f, &temp_path); + if (r < 0) + return r; + + netdev_dump(netdev, f); + + if (asprintf(&p, "%s/70-%s.netdev", dest_dir, netdev->ifname) < 0) + return log_oom(); + + r = conservative_rename(temp_path, p); + if (r < 0) + return r; + + /* disarm unlink_and_freep, see network_save() */ temp_path = mfree(temp_path); + return 0; +} + +/* Writes one Link to dest_dir as "NN-FILENAME.link", where NN is chosen from what the link matches on (interface name, MAC address, or neither). Returns 0 or a negative errno-style value. */ static int link_save(Link *link, const char *dest_dir) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(link); + + r = generator_open_unit_file_full(dest_dir, NULL, NULL, &f, &temp_path); + if (r < 0) + return r; + + link_dump(link, f); + + if (asprintf(&p, "%s/%s-%s.link", + dest_dir, 
!isempty(link->ifname) ? "70" : !hw_addr_is_null(&link->mac) ? "71" : "72", /* 70: interface name set, 71: only a MAC address set, 72: neither */ + link->filename) < 0) + return log_oom(); + + r = conservative_rename(temp_path, p); + if (r < 0) + return r; + + temp_path = mfree(temp_path); + return 0; +} + +/* Creates NETWORKD_UNIT_DIRECTORY below arg_root and writes every network, netdev and link stored in the context into it. A failing save does not stop the loops; the results are combined through RET_GATHER() and returned. */ static int context_save(Context *context) { + Network *network; + NetDev *netdev; + Link *link; + int r; + + const char *p = prefix_roota(arg_root, NETWORKD_UNIT_DIRECTORY); + + r = mkdir_p(p, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create directory " NETWORKD_UNIT_DIRECTORY ": %m"); + + HASHMAP_FOREACH(network, context->networks_by_name) + RET_GATHER(r, network_save(network, p)); + + HASHMAP_FOREACH(netdev, context->netdevs_by_name) + RET_GATHER(r, netdev_save(netdev, p)); + + HASHMAP_FOREACH(link, context->links_by_filename) + RET_GATHER(r, link_save(link, p)); + + return r; +} + +/* Prints the usage text to stdout and returns 0. */ static int help(void) { + printf("%s [OPTIONS...] [-- KERNEL_CMDLINE]\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --root=PATH Operate on an alternate filesystem root\n", + program_invocation_short_name); + + return 0; +} + +/* Parses the command line options. Returns 1 when run() should continue, -EINVAL for an unrecognized option, and the return value of help() or version() when -h/--help or --version is given (help() returns 0). --root= stores optarg in arg_root without copying it. */ static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_ROOT, + }; + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "root", required_argument, NULL, ARG_ROOT }, + {}, + }; + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_ROOT: + arg_root = optarg; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +/* Program body passed to DEFINE_MAIN_FUNCTION(). After option parsing it reads settings either from the kernel command line (no positional arguments left) or from each remaining argument treated as a "key[=value]" item, merges them and writes the resulting files. */ static int run(int argc, char *argv[]) { + _cleanup_(context_clear) Context context = {}; + int r; + + log_setup(); + + umask(0022); + + r = parse_argv(argc, argv); + /* 0 means help/version was handled, negative is an error: stop in both cases */ if (r <= 0) + return r; + + if (optind >= argc) { + r = 
proc_cmdline_parse(parse_cmdline_item, &context, 0); + if (r < 0) + return log_warning_errno(r, "Failed to parse kernel command line: %m"); + } else { + for (int i = optind; i < argc; i++) { + _cleanup_free_ char *word = NULL; + char *value; + + /* work on a copy so that the '=' can be overwritten without touching argv[] */ word = strdup(argv[i]); + if (!word) + return log_oom(); + + /* split "key=value" in place; value stays NULL when there is no '=' */ value = strchr(word, '='); + if (value) + *(value++) = 0; + + r = parse_cmdline_item(word, value, &context); + if (r < 0) + return log_warning_errno(r, "Failed to parse command line \"%s%s%s\": %m", + word, value ? "=" : "", strempty(value)); + } + } + + r = context_merge_networks(&context); + if (r < 0) + return log_warning_errno(r, "Failed to merge multiple command line options: %m"); + + return context_save(&context); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/network/generator/network-generator.c b/src/network/generator/network-generator.c new file mode 100644 index 0000000..48527a2 --- /dev/null +++ b/src/network/generator/network-generator.c @@ -0,0 +1,1432 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fileio.h" +#include "hostname-util.h" +#include "log.h" +#include "macro.h" +#include "memstream-util.h" +#include "netif-naming-scheme.h" +#include "network-generator.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" + +/* + # .network + ip={dhcp|on|any|dhcp6|auto6|either6|link6|link-local} + ip=<interface>:{dhcp|on|any|dhcp6|auto6|link6|link-local}[:[<mtu>][:<macaddr>]] + ip=<client-IP>:[<peer>]:<gateway-IP>:<netmask>:<client_hostname>:<interface>:{none|off|dhcp|on|any|dhcp6|auto6|link6|ibft|link-local}[:[<mtu>][:<macaddr>]] + ip=<client-IP>:[<peer>]:<gateway-IP>:<netmask>:<client_hostname>:<interface>:{none|off|dhcp|on|any|dhcp6|auto6|link6|ibft|link-local}[:[<dns1>][:<dns2>]] + rd.route=<net>/<netmask>:<gateway>[:<interface>] + nameserver=<IP> [nameserver=<IP> ...] + rd.peerdns=0 + + # .link + ifname=<interface>:<MAC> + net.ifname-policy=policy1[,policy2,...][,<MAC>] # This is an original rule, not supported by other tools. 
+ + # .netdev + vlan=<vlanname>:<phydevice> + bond=<bondname>[:<bondslaves>:[:<options>[:<mtu>]]] + team=<teammaster>:<teamslaves> # not supported + bridge=<bridgename>:<ethnames> + + # ignored + bootdev=<interface> + BOOTIF=<MAC> + rd.bootif=0 + biosdevname=0 + rd.neednet=1 +*/ + +/* Spellings of the DHCP type accepted on the command line; used only for string-to-enum lookup. */ static const char * const dracut_dhcp_type_table[_DHCP_TYPE_MAX] = { + [DHCP_TYPE_NONE] = "none", + [DHCP_TYPE_OFF] = "off", + [DHCP_TYPE_ON] = "on", + [DHCP_TYPE_ANY] = "any", + [DHCP_TYPE_DHCP] = "dhcp", + [DHCP_TYPE_DHCP6] = "dhcp6", + [DHCP_TYPE_AUTO6] = "auto6", + [DHCP_TYPE_EITHER6] = "either6", + [DHCP_TYPE_IBFT] = "ibft", + [DHCP_TYPE_LINK6] = "link6", + [DHCP_TYPE_LINK_LOCAL] = "link-local", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(dracut_dhcp_type, DHCPType); + +/* Value written after "DHCP=" by network_dump() for each parsed type. */ static const char * const networkd_dhcp_type_table[_DHCP_TYPE_MAX] = { + [DHCP_TYPE_NONE] = "no", + [DHCP_TYPE_OFF] = "no", + [DHCP_TYPE_ON] = "yes", + [DHCP_TYPE_ANY] = "yes", + [DHCP_TYPE_DHCP] = "ipv4", + [DHCP_TYPE_DHCP6] = "ipv6", + [DHCP_TYPE_AUTO6] = "no", /* TODO: enable other setting? */ + [DHCP_TYPE_EITHER6] = "ipv6", /* TODO: enable other setting? 
*/ + [DHCP_TYPE_IBFT] = "no", + [DHCP_TYPE_LINK6] = "no", + [DHCP_TYPE_LINK_LOCAL] = "no", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(networkd_dhcp_type, DHCPType); + +/* Value written after "IPv6AcceptRA=" by network_dump(). */ static const char * const networkd_ipv6ra_type_table[_DHCP_TYPE_MAX] = { + [DHCP_TYPE_NONE] = "no", + [DHCP_TYPE_OFF] = "no", + [DHCP_TYPE_LINK6] = "no", + [DHCP_TYPE_LINK_LOCAL] = "no", + /* We omit the other entries, to leave the default in effect */ +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(networkd_ipv6ra_type, DHCPType); + +/* Value written after "LinkLocalAddressing=" by network_dump(). */ static const char * const networkd_link_local_type_table[_DHCP_TYPE_MAX] = { + [DHCP_TYPE_NONE] = "no", + [DHCP_TYPE_OFF] = "no", + [DHCP_TYPE_LINK6] = "ipv6", + [DHCP_TYPE_LINK_LOCAL] = "yes", + /* We omit the other entries, to leave the default in effect */ +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(networkd_link_local_type, DHCPType); + +/* Detaches the address from its network's list (when it has one) and frees it. Accepts NULL. */ static Address *address_free(Address *address) { + if (!address) + return NULL; + + if (address->network) + LIST_REMOVE(addresses, address->network->addresses, address); + + return mfree(address); +} + +/* Allocates an Address, prepends it to network->addresses and, if ret is non-NULL, hands the new object back through it. peer may be NULL, in which case IN_ADDR_NULL is stored. Returns 0 or -ENOMEM. */ static int address_new(Network *network, int family, unsigned char prefixlen, + union in_addr_union *addr, union in_addr_union *peer, Address **ret) { + Address *address; + + assert(network); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(addr); + + address = new(Address, 1); + if (!address) + return -ENOMEM; + + *address = (Address) { + .family = family, + .prefixlen = prefixlen, + .address = *addr, + .peer = peer ? 
*peer : IN_ADDR_NULL, + }; + + LIST_PREPEND(addresses, network->addresses, address); + + /* the back pointer is what address_free() uses to unlink the entry again */ address->network = network; + + if (ret) + *ret = address; + return 0; +} + +/* Detaches the route from its network's list (when it has one) and frees it. Accepts NULL. */ static Route *route_free(Route *route) { + if (!route) + return NULL; + + if (route->network) + LIST_REMOVE(routes, route->network->routes, route); + + return mfree(route); +} + +/* Allocates a Route, prepends it to network->routes and, if ret is non-NULL, hands the new object back through it. At least one of dest and gateway must be non-NULL; a missing one is stored as IN_ADDR_NULL. Returns 0 or -ENOMEM. */ static int route_new(Network *network, int family, unsigned char prefixlen, + union in_addr_union *dest, union in_addr_union *gateway, Route **ret) { + Route *route; + + assert(network); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(dest || gateway); + + route = new(Route, 1); + if (!route) + return -ENOMEM; + + *route = (Route) { + .family = family, + .prefixlen = prefixlen, + .dest = dest ? *dest : IN_ADDR_NULL, + .gateway = gateway ? *gateway : IN_ADDR_NULL, + }; + + LIST_PREPEND(routes, network->routes, route); + + route->network = network; + + if (ret) + *ret = route; + return 0; +} + +/* Frees a Network together with its strings, DNS list, addresses and routes. Accepts NULL. It does not remove the object from context->networks_by_name. */ static Network *network_free(Network *network) { + Address *address; + Route *route; + + if (!network) + return NULL; + + free(network->ifname); + free(network->hostname); + strv_free(network->dns); + free(network->vlan); + free(network->bridge); + free(network->bond); + + /* address_free()/route_free() unlink the head entry, so each loop ends once its list is empty */ while ((address = network->addresses)) + address_free(address); + + while ((route = network->routes)) + route_free(route); + + return mfree(network); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Network*, network_free); + +/* Creates a Network and registers it in context->networks_by_name under its interface name. An empty name is accepted (callers in this file use "" for settings given without an interface); a non-empty name must pass ifname_valid(), otherwise -EINVAL is returned. name is passed to strdup() unconditionally, so it must not be NULL. Returns 0 on success or a negative errno-style value. */ static int network_new(Context *context, const char *name, Network **ret) { + _cleanup_(network_freep) Network *network = NULL; + _cleanup_free_ char *ifname = NULL; + int r; + + assert(context); + + if (!isempty(name) && !ifname_valid(name)) + return -EINVAL; + + ifname = strdup(name); + if (!ifname) + return -ENOMEM; + + network = new(Network, 1); + if (!network) + return -ENOMEM; + + *network = (Network) { + .ifname = TAKE_PTR(ifname), + .dhcp_type = _DHCP_TYPE_INVALID, + .dhcp_use_dns = -1, + }; + + r = hashmap_ensure_put(&context->networks_by_name, 
&string_hash_ops, network->ifname, network); + if (r < 0) + return r; + + if (ret) + *ret = network; + + /* the hashmap now references the object: keep the cleanup handler from freeing it */ TAKE_PTR(network); + return 0; +} + +/* Looks up a Network by interface name in context->networks_by_name. */ Network *network_get(Context *context, const char *ifname) { + return hashmap_get(context->networks_by_name, ifname); +} + +/* Frees a NetDev and its strings. Accepts NULL. */ static NetDev *netdev_free(NetDev *netdev) { + if (!netdev) + return NULL; + + free(netdev->ifname); + free(netdev->kind); + return mfree(netdev); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(NetDev*, netdev_free); + +/* Creates a NetDev of the given kind and registers it in context->netdevs_by_name under its interface name. Returns -EINVAL when _ifname fails ifname_valid(), -ENOMEM on allocation failure, or the error from hashmap_ensure_put(); 0 on success. */ static int netdev_new(Context *context, const char *_kind, const char *_ifname, NetDev **ret) { + _cleanup_(netdev_freep) NetDev *netdev = NULL; + _cleanup_free_ char *kind = NULL, *ifname = NULL; + int r; + + assert(context); + assert(_kind); + + if (!ifname_valid(_ifname)) + return -EINVAL; + + kind = strdup(_kind); + if (!kind) + return -ENOMEM; + + ifname = strdup(_ifname); + if (!ifname) + return -ENOMEM; + + netdev = new(NetDev, 1); + if (!netdev) + return -ENOMEM; + + *netdev = (NetDev) { + .kind = TAKE_PTR(kind), + .ifname = TAKE_PTR(ifname), + }; + + r = hashmap_ensure_put(&context->netdevs_by_name, &string_hash_ops, netdev->ifname, netdev); + if (r < 0) + return r; + + if (ret) + *ret = netdev; + + TAKE_PTR(netdev); + return 0; +} + +/* Looks up a NetDev by interface name in context->netdevs_by_name. */ NetDev *netdev_get(Context *context, const char *ifname) { + return hashmap_get(context->netdevs_by_name, ifname); +} + +/* Frees a Link together with its strings and policy lists. Accepts NULL. */ static Link *link_free(Link *link) { + if (!link) + return NULL; + + free(link->filename); + free(link->ifname); + strv_free(link->policies); + strv_free(link->alt_policies); + return mfree(link); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_free); + +/* Creates a Link and registers it in context->links_by_filename. When name is given it must pass ifname_valid() and is used both as the interface name and as the file name; otherwise the file name is derived from the MAC address. Returns 0 or a negative errno-style value. */ static int link_new( + Context *context, + const char *name, + const struct hw_addr_data *mac, + Link **ret) { + + _cleanup_(link_freep) Link *link = NULL; + _cleanup_free_ char *ifname = NULL, *filename = NULL; + int r; + + assert(context); + assert(mac); + + if (name) { + if (!ifname_valid(name)) + return -EINVAL; + + ifname = strdup(name); + if (!ifname) + return -ENOMEM; + + filename = 
strdup(name); + if (!filename) + return -ENOMEM; + } + + /* no name given: fall back to "default" for a null MAC address, else to the MAC formatted with HW_ADDR_TO_STRING_NO_COLON */ if (!filename) { + filename = strdup(hw_addr_is_null(mac) ? "default" : + HW_ADDR_TO_STR_FULL(mac, HW_ADDR_TO_STRING_NO_COLON)); + if (!filename) + return -ENOMEM; + } + + link = new(Link, 1); + if (!link) + return -ENOMEM; + + *link = (Link) { + .filename = TAKE_PTR(filename), + .ifname = TAKE_PTR(ifname), + .mac = *mac, + }; + + r = hashmap_ensure_put(&context->links_by_filename, &string_hash_ops, link->filename, link); + if (r < 0) + return r; + + if (ret) + *ret = link; + + TAKE_PTR(link); + return 0; +} + +/* Looks up a Link by file name in context->links_by_filename. */ Link *link_get(Context *context, const char *filename) { + assert(context); + assert(filename); + return hashmap_get(context->links_by_filename, filename); +} + +/* Parses dhcp_type with the dracut spelling table and stores it on the network for ifname, creating that network when it does not exist yet. Returns the negative lookup result for an unknown type string. */ static int network_set_dhcp_type(Context *context, const char *ifname, const char *dhcp_type) { + Network *network; + DHCPType t; + int r; + + assert(context); + assert(ifname); + assert(dhcp_type); + + t = dracut_dhcp_type_from_string(dhcp_type); + if (t < 0) + return t; + + network = network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + network->dhcp_type = t; + return 0; +} + +/* Replaces the hostname of an existing network. Unlike network_set_dhcp_type() this does not create the network: -ENODEV is returned when it is unknown. */ static int network_set_hostname(Context *context, const char *ifname, const char *hostname) { + Network *network; + + assert(context); + assert(ifname); + + network = network_get(context, ifname); + if (!network) + return -ENODEV; + + return free_and_strdup(&network->hostname, hostname); +} + +/* Parses mtu into the MTU of an existing network. An empty or NULL mtu is accepted and changes nothing; an unknown network yields -ENODEV. */ static int network_set_mtu(Context *context, const char *ifname, const char *mtu) { + Network *network; + + assert(context); + assert(ifname); + + if (isempty(mtu)) + return 0; + + network = network_get(context, ifname); + if (!network) + return -ENODEV; + + return parse_mtu(AF_UNSPEC, mtu, &network->mtu); +} + +/* Parses mac as an Ethernet address into an existing network; -ENODEV when the network is unknown. */ static int network_set_mac_address(Context *context, const char *ifname, const char *mac) { + Network *network; + + assert(context); + assert(ifname); + assert(mac); + + network = 
network_get(context, ifname); + if (!network) + return -ENODEV; + + return parse_ether_addr(mac, &network->mac); +} + +/* Adds an address (with optional peer) to an existing network. An unset address is silently ignored (returns 0); an unknown network yields -ENODEV. */ static int network_set_address(Context *context, const char *ifname, int family, unsigned char prefixlen, + union in_addr_union *addr, union in_addr_union *peer) { + Network *network; + + assert(context); + assert(ifname); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(addr); + + if (!in_addr_is_set(family, addr)) + return 0; + + network = network_get(context, ifname); + if (!network) + return -ENODEV; + + return address_new(network, family, prefixlen, addr, peer, NULL); +} + +/* Adds a route to the network for ifname, creating the network when needed. dest and gateway may each be NULL; when neither carries a set address nothing is added and 0 is returned. */ static int network_set_route(Context *context, const char *ifname, int family, unsigned char prefixlen, + union in_addr_union *dest, union in_addr_union *gateway) { + Network *network; + int r; + + assert(context); + assert(ifname); + assert(IN_SET(family, AF_INET, AF_INET6)); + + if (!(dest && in_addr_is_set(family, dest)) && + !(gateway && in_addr_is_set(family, gateway))) + return 0; + + network = network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + return route_new(network, family, prefixlen, dest, gateway, NULL); +} + +/* Validates dns as an IP address of the given family (the family is detected from the string for AF_UNSPEC) and appends the original string to the DNS list of the network for ifname, creating the network when needed. The parsed address itself is only used for validation. */ static int network_set_dns(Context *context, const char *ifname, int family, const char *dns) { + union in_addr_union a; + Network *network; + int r; + + assert(context); + assert(ifname); + assert(IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6)); + assert(dns); + + if (family == AF_UNSPEC) + r = in_addr_from_string_auto(dns, &family, &a); + else + r = in_addr_from_string(family, dns, &a); + if (r < 0) + return r; + + network = network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + return strv_extend(&network->dns, dns); +} + +/* Stores the UseDNS setting on the network for ifname, creating the network when needed. */ static int network_set_dhcp_use_dns(Context *context, const char *ifname, bool value) { + Network *network; + int r; + + assert(context); + assert(ifname); + + network = 
network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + network->dhcp_use_dns = value; + + return 0; +} + +/* Records value as the VLAN netdev name of the network for ifname, creating the network when needed. */ static int network_set_vlan(Context *context, const char *ifname, const char *value) { + Network *network; + int r; + + assert(context); + assert(ifname); + + network = network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + return free_and_strdup(&network->vlan, value); +} + +/* Records value as the bridge name of the network for ifname, creating the network when needed. */ static int network_set_bridge(Context *context, const char *ifname, const char *value) { + Network *network; + int r; + + assert(context); + assert(ifname); + + network = network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + return free_and_strdup(&network->bridge, value); +} + +/* Records value as the bond name of the network for ifname, creating the network when needed. */ static int network_set_bond(Context *context, const char *ifname, const char *value) { + Network *network; + int r; + + assert(context); + assert(ifname); + + network = network_get(context, ifname); + if (!network) { + r = network_new(context, ifname, &network); + if (r < 0) + return r; + } + + return free_and_strdup(&network->bond, value); +} + +/* Parses the optional trailing part of ip=: an MTU, optionally followed by ':' and a MAC address. Either part may be empty. Both are applied to the already existing network for ifname. Returns 0 or a negative errno-style value. */ static int parse_cmdline_ip_mtu_mac(Context *context, const char *ifname, const char *value) { + const char *mtu, *p; + int r; + + assert(context); + assert(ifname); + assert(value); + + /* [<mtu>][:<macaddr>] */ + + p = strchr(value, ':'); + if (!p) + mtu = value; + else + mtu = strndupa_safe(value, p - value); + + r = network_set_mtu(context, ifname, mtu); + if (r < 0) + return r; + + if (!p || isempty(p + 1)) + return 0; + + r = network_set_mac_address(context, ifname, p + 1); + if (r < 0) + return r; + + return 0; +} + +/* Parses one ':'-terminated address field starting at *value. Returns 0 for an empty field (ret is left untouched), 1 when an address was stored in ret, or a negative errno-style value; on the two non-error results *value is advanced past the terminating ':'. For AF_INET6 the address must be enclosed in brackets. */ static int parse_ip_address_one(int family, const char **value, union in_addr_union *ret) { + const char *p, *q, *buf; + int r; + + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(value); + assert(ret); + + p = ASSERT_PTR(*value); + + if (p[0] == 
':') { + *value = p + 1; + return 0; + } + + if (family == AF_INET6) { + if (p[0] != '[') + return -EINVAL; + + q = strchr(p + 1, ']'); + if (!q) + return -EINVAL; + + if (q[1] != ':') + return -EINVAL; + + buf = strndupa_safe(p + 1, q - p - 1); + /* skip both the closing bracket and the ':' */ p = q + 2; + } else { + q = strchr(p, ':'); + if (!q) + return -EINVAL; + + buf = strndupa_safe(p, q - p); + p = q + 1; + } + + r = in_addr_from_string(family, buf, ret); + if (r < 0) + return r; + + *value = p; + return 1; +} + +/* Parses the netmask field of ip= into a prefix length. The field is first tried as an address: an IPv4 netmask is converted with in4_addr_netmask_to_prefixlen(), an IPv6 one is rejected, and an empty field yields the full length (32 or 128). When it does not parse as an address it is read as a decimal prefix length up to the next ':'. *value is advanced past the field. Returns 0 or a negative errno-style value. */ static int parse_netmask_or_prefixlen(int family, const char **value, unsigned char *ret) { + union in_addr_union netmask; + const char *p, *q; + int r; + + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(value); + assert(*value); + assert(ret); + + r = parse_ip_address_one(family, value, &netmask); + if (r > 0) { + if (family == AF_INET6) + /* TODO: Not supported yet. */ + return -EINVAL; + + *ret = in4_addr_netmask_to_prefixlen(&netmask.in); + } else if (r == 0) + *ret = family == AF_INET6 ? 128 : 32; + else { + /* parse_ip_address_one() failed and therefore left *value untouched: retry the same field as a number */ p = strchr(*value, ':'); + if (!p) + return -EINVAL; + + q = strndupa_safe(*value, p - *value); + r = safe_atou8(q, ret); + if (r < 0) + return r; + + *value = p + 1; + } + + return 0; +} + +/* Parses one optional DNS server at *value, either a bracketed IPv6 address or a plain IPv4 address, and adds it to the network for ifname. An empty string is accepted and ignored. On success *value points at the ':' or the end of string that follows the address. */ static int parse_ip_dns_address_one(Context *context, const char *ifname, const char **value) { + const char *p, *q, *buf; + int r, family; + + assert(context); + assert(ifname); + assert(value); + + p = ASSERT_PTR(*value); + + if (isempty(p)) + return 0; + + if (p[0] == '[') { + q = strchr(p + 1, ']'); + if (!q) + return -EINVAL; + if (!IN_SET(q[1], ':', '\0')) + return -EINVAL; + + buf = strndupa_safe(p + 1, q - p - 1); + p = q + 1; + family = AF_INET6; + } else { + q = strchr(p, ':'); + if (!q) + buf = *value; + else + buf = strndupa_safe(*value, q - *value); + + p += strlen(buf); + family = AF_INET; + } + + r = network_set_dns(context, ifname, family, buf); + if (r < 0) + return r; + + *value = p; + return 0; +} + +/* Parses the long form of ip= (static address configuration) for the given family: client address, peer, gateway, netmask, hostname, interface and DHCP type, optionally followed by either MTU/MAC or up to two DNS servers. Returns 0 or a negative errno-style value. */ static int parse_cmdline_ip_address(Context *context, int family, const char 
*value) { + union in_addr_union addr = {}, peer = {}, gateway = {}; + const char *hostname = NULL, *ifname, *dhcp_type, *p; + unsigned char prefixlen; + int r; + + assert(context); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(value); + + /* ip=<client-IP>:[<peer>]:<gateway-IP>:<netmask>:<client_hostname>:<interface>:{none|off|dhcp|on|any|dhcp6|auto6|ibft|link6}[:[<mtu>][:<macaddr>]] + * ip=<client-IP>:[<peer>]:<gateway-IP>:<netmask>:<client_hostname>:<interface>:{none|off|dhcp|on|any|dhcp6|auto6|ibft|link6}[:[<dns1>][:<dns2>]] */ + + /* each helper below consumes one field and advances value past its ':' */ r = parse_ip_address_one(family, &value, &addr); + if (r < 0) + return r; + r = parse_ip_address_one(family, &value, &peer); + if (r < 0) + return r; + r = parse_ip_address_one(family, &value, &gateway); + if (r < 0) + return r; + r = parse_netmask_or_prefixlen(family, &value, &prefixlen); + if (r < 0) + return r; + + /* hostname */ + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + /* an empty hostname field is allowed and leaves hostname NULL */ if (p != value) { + hostname = strndupa_safe(value, p - value); + if (!hostname_is_valid(hostname, 0)) + return -EINVAL; + } + + value = p + 1; + + /* ifname */ + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + ifname = strndupa_safe(value, p - value); + + value = p + 1; + + /* dhcp_type */ + p = strchr(value, ':'); + if (!p) + dhcp_type = value; + else + dhcp_type = strndupa_safe(value, p - value); + + /* this call creates the network if needed; the setters that return -ENODEV for unknown networks rely on it running first */ r = network_set_dhcp_type(context, ifname, dhcp_type); + if (r < 0) + return r; + + /* set values */ + r = network_set_hostname(context, ifname, hostname); + if (r < 0) + return r; + + r = network_set_address(context, ifname, family, prefixlen, &addr, &peer); + if (r < 0) + return r; + + /* the gateway is stored as a route without destination */ r = network_set_route(context, ifname, family, 0, NULL, &gateway); + if (r < 0) + return r; + + if (!p) + return 0; + + /* First, try [<mtu>][:<macaddr>] */ + r = parse_cmdline_ip_mtu_mac(context, ifname, p + 1); + if (r >= 0) + return 0; + + /* Next, try [<dns1>][:<dns2>] */ + value = p + 1; + r = parse_ip_dns_address_one(context, ifname, &value); + if (r < 0) + return r; + + /* step over the ':' between the two DNS fields, if there is one */ value += *value == ':'; + r = parse_ip_dns_address_one(context, ifname, &value); + if (r < 0) + return r; + + /* refuse unexpected trailing strings */ + if 
(!isempty(value)) + return -EINVAL; + + return 0; +} + +/* Parses the short per-interface form of ip=: interface name, DHCP type and an optional MTU/MAC suffix. Returns 0 or a negative errno-style value. */ static int parse_cmdline_ip_interface(Context *context, const char *value) { + const char *ifname, *dhcp_type, *p; + int r; + + assert(context); + assert(value); + + /* ip=<interface>:{dhcp|on|any|dhcp6|auto6|link6}[:[<mtu>][:<macaddr>]] */ + + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + ifname = strndupa_safe(value, p - value); + + value = p + 1; + p = strchr(value, ':'); + if (!p) + dhcp_type = value; + else + dhcp_type = strndupa_safe(value, p - value); + + r = network_set_dhcp_type(context, ifname, dhcp_type); + if (r < 0) + return r; + + if (!p) + return 0; + + return parse_cmdline_ip_mtu_mac(context, ifname, p + 1); +} + +/* Handles ip= by selecting one of its forms: a value without ':' is a bare DHCP type applied to the "" (no interface) network; a value starting with '[' is the IPv6 address form; anything else is tried as the IPv4 address form and, if that fails, as the interface form. */ static int parse_cmdline_ip(Context *context, const char *key, const char *value) { + const char *p; + int r; + + assert(context); + assert(key); + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + p = strchr(value, ':'); + if (!p) + /* ip={dhcp|on|any|dhcp6|auto6|either6|link6} */ + return network_set_dhcp_type(context, "", value); + + if (value[0] == '[') + return parse_cmdline_ip_address(context, AF_INET6, value); + + r = parse_cmdline_ip_address(context, AF_INET, value); + if (r < 0) + return parse_cmdline_ip_interface(context, value); + + return 0; +} + +/* Handles rd.route=: a destination prefix (bracketed for IPv6), a gateway and an optional interface name. The route is added to the named network, or to the "" network when no interface is given. */ static int parse_cmdline_rd_route(Context *context, const char *key, const char *value) { + union in_addr_union addr = {}, gateway = {}; + unsigned char prefixlen; + const char *buf, *p; + int family, r; + + assert(context); + assert(key); + + /* rd.route=<net>/<netmask>:<gateway>[:<interface>] */ + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + if (value[0] == '[') { + p = strchr(value, ']'); + if (!p) + return -EINVAL; + + if (p[1] != ':') + return -EINVAL; + + buf = strndupa_safe(value + 1, p - value - 1); + value = p + 2; + family = AF_INET6; + } else { + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + buf = strndupa_safe(value, p - value); + value = p + 1; + family = AF_INET; + } + + r = in_addr_prefix_from_string(buf, family, 
&addr, &prefixlen); + if (r < 0) + return r; + + /* parse_ip_address_one() requires a ':' terminator, so append one when the gateway is the last field */ p = strchr(value, ':'); + if (!p) + value = strjoina(value, ":"); + + r = parse_ip_address_one(family, &value, &gateway); + if (r < 0) + return r; + + /* what is left in value is the interface name, possibly empty */ return network_set_route(context, value, family, prefixlen, &addr, &gateway); +} + +/* Handles nameserver=: adds the address to the "" (no interface) network; the address family is detected from the string. */ static int parse_cmdline_nameserver(Context *context, const char *key, const char *value) { + assert(context); + assert(key); + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + return network_set_dns(context, "", AF_UNSPEC, value); +} + +/* Handles rd.peerdns=: stores the boolean on the "" network. A missing value is treated as true. */ static int parse_cmdline_rd_peerdns(Context *context, const char *key, const char *value) { + int r; + + assert(context); + assert(key); + + if (proc_cmdline_value_missing(key, value)) + return network_set_dhcp_use_dns(context, "", true); + + r = parse_boolean(value); + if (r < 0) + return r; + + return network_set_dhcp_use_dns(context, "", r); +} + +/* Handles vlan=: the part before ':' names the VLAN netdev (created when unknown), the part after it names the interface whose network gets that VLAN assigned. */ static int parse_cmdline_vlan(Context *context, const char *key, const char *value) { + const char *name, *p; + NetDev *netdev; + int r; + + assert(context); + assert(key); + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + name = strndupa_safe(value, p - value); + + netdev = netdev_get(context, name); + if (!netdev) { + r = netdev_new(context, "vlan", name, &netdev); + if (r < 0) + return r; + } + + return network_set_vlan(context, p + 1, name); +} + +/* Handles bridge=: the part before ':' names the bridge netdev (created when unknown), the part after it is a non-empty comma-separated list of interfaces whose networks are attached to the bridge. */ static int parse_cmdline_bridge(Context *context, const char *key, const char *value) { + const char *name, *p; + NetDev *netdev; + int r; + + assert(context); + assert(key); + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + name = strndupa_safe(value, p - value); + + netdev = netdev_get(context, name); + if (!netdev) { + r = netdev_new(context, "bridge", name, &netdev); + if (r < 0) + return r; + } + + p++; + if (isempty(p)) + return -EINVAL; + + for (;;) { + _cleanup_free_ char *word = 
NULL; + + r = extract_first_word(&p, &word, ",", 0); + /* 0 ends the list normally, a negative value is an error: return either as-is */ if (r <= 0) + return r; + + r = network_set_bridge(context, word, name); + if (r < 0) + return r; + } +} + +/* Handles bond=: bond name, a non-empty comma-separated list of member interfaces, then optionally an options field and an MTU. The options field is skipped; only the MTU after it is parsed. */ static int parse_cmdline_bond(Context *context, const char *key, const char *value) { + const char *name, *slaves, *p; + NetDev *netdev; + int r; + + assert(context); + assert(key); + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + name = strndupa_safe(value, p - value); + + netdev = netdev_get(context, name); + if (!netdev) { + r = netdev_new(context, "bond", name, &netdev); + if (r < 0) + return r; + } + + value = p + 1; + p = strchr(value, ':'); + if (!p) + slaves = value; + else + slaves = strndupa_safe(value, p - value); + + if (isempty(slaves)) + return -EINVAL; + + for (const char *q = slaves; ; ) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&q, &word, ",", 0); + if (r == 0) + break; + if (r < 0) + return r; + + r = network_set_bond(context, word, name); + if (r < 0) + return r; + } + + if (!p) + return 0; + + /* value now points at the options field; only the ':' that ends it is looked for */ value = p + 1; + p = strchr(value, ':'); + if (!p) + /* TODO: set bonding options */ + return 0; + + return parse_mtu(AF_UNSPEC, p + 1, &netdev->mtu); +} + +/* Handles ifname=: creates a Link that carries the given interface name and hardware address. */ static int parse_cmdline_ifname(Context *context, const char *key, const char *value) { + struct hw_addr_data mac; + const char *name, *p; + int r; + + assert(context); + assert(key); + + /* ifname=<interface>:<MAC> */ + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + p = strchr(value, ':'); + if (!p) + return -EINVAL; + + name = strndupa_safe(value, p - value); + + r = parse_hw_addr(p + 1, &mac); + if (r < 0) + return r; + + return link_new(context, name, &mac, NULL); +} + +/* Handles net.ifname-policy=: a comma-separated list of naming policies, optionally ended by a hardware address that restricts the rule to one device. Creates a Link without interface name that carries the policies. */ static int parse_cmdline_ifname_policy(Context *context, const char *key, const char *value) { + _cleanup_strv_free_ char **policies = NULL, **alt_policies = NULL; + struct hw_addr_data mac = HW_ADDR_NULL; + Link *link; + int r; + + assert(context); + assert(key); + + 
/* net.ifname-policy=policy1[,policy2,...][,<MAC>] */ + + if (proc_cmdline_value_missing(key, value)) + return -EINVAL; + + for (const char *q = value; ; ) { + _cleanup_free_ char *word = NULL; + NamePolicy p; + + r = extract_first_word(&q, &word, ",", 0); + if (r == 0) + break; + if (r < 0) + return r; + + p = name_policy_from_string(word); + /* a word that is not a policy name must be the hardware address, and it must be the last word */ if (p < 0) { + r = parse_hw_addr(word, &mac); + if (r < 0) + return r; + + if (hw_addr_is_null(&mac)) + return -EINVAL; + + if (!isempty(q)) + return -EINVAL; + + break; + } + + /* words that are also valid alternative-names policies are collected in a second list */ if (alternative_names_policy_from_string(word) >= 0) { + r = strv_extend(&alt_policies, word); + if (r < 0) + return r; + } + + r = strv_consume(&policies, TAKE_PTR(word)); + if (r < 0) + return r; + } + + if (strv_isempty(policies)) + return -EINVAL; + + r = link_new(context, NULL, &mac, &link); + if (r < 0) + return r; + + link->policies = TAKE_PTR(policies); + link->alt_policies = TAKE_PTR(alt_policies); + return 0; +} + +/* Dispatches one command line item to the parser for its key. data must point to the Context. Keys not listed here are ignored and yield 0; otherwise the parser's result is returned. */ int parse_cmdline_item(const char *key, const char *value, void *data) { + Context *context = ASSERT_PTR(data); + + assert(key); + + if (streq(key, "ip")) + return parse_cmdline_ip(context, key, value); + if (streq(key, "rd.route")) + return parse_cmdline_rd_route(context, key, value); + if (streq(key, "nameserver")) + return parse_cmdline_nameserver(context, key, value); + if (streq(key, "rd.peerdns")) + return parse_cmdline_rd_peerdns(context, key, value); + if (streq(key, "vlan")) + return parse_cmdline_vlan(context, key, value); + if (streq(key, "bridge")) + return parse_cmdline_bridge(context, key, value); + if (streq(key, "bond")) + return parse_cmdline_bond(context, key, value); + if (streq(key, "ifname")) + return parse_cmdline_ifname(context, key, value); + if (streq(key, "net.ifname-policy")) + return parse_cmdline_ifname_policy(context, key, value); + + return 0; +} + +/* Copies the settings collected under the "" (no interface) network into every named network and then removes and frees the "" entry. Nothing is done when there is no "" network or when it is the only network. Returns 0 or a negative errno-style value. */ int context_merge_networks(Context *context) { + Network *all, *network; + int r; + + assert(context); + + /* Copy settings about the following options + 
rd.route=<net>/<netmask>:<gateway>[:<interface>] + nameserver=<IP> [nameserver=<IP> ...] + rd.peerdns=0 */ + + all = network_get(context, ""); + if (!all) + return 0; + + if (hashmap_size(context->networks_by_name) <= 1) + return 0; + + HASHMAP_FOREACH(network, context->networks_by_name) { + if (network == all) + continue; + + /* note: this overwrites the named network's own value unconditionally */ network->dhcp_use_dns = all->dhcp_use_dns; + + r = strv_extend_strv(&network->dns, all->dns, false); + if (r < 0) + return r; + + LIST_FOREACH(routes, route, all->routes) { + r = route_new(network, route->family, route->prefixlen, &route->dest, &route->gateway, NULL); + if (r < 0) + return r; + } + } + + assert_se(hashmap_remove(context->networks_by_name, "") == all); + network_free(all); + return 0; +} + +/* Frees the three hashmaps of the context together with the objects stored in them. Accepts NULL. */ void context_clear(Context *context) { + if (!context) + return; + + hashmap_free_with_destructor(context->networks_by_name, network_free); + hashmap_free_with_destructor(context->netdevs_by_name, netdev_free); + hashmap_free_with_destructor(context->links_by_filename, link_free); +} + +/* Writes an [Address] section for one address to f; the Peer= line is only written when a peer is set. Always returns 0. */ static int address_dump(Address *address, FILE *f) { + assert(address); + assert(f); + + fprintf(f, + "\n[Address]\n" + "Address=%s\n", + IN_ADDR_PREFIX_TO_STRING(address->family, &address->address, address->prefixlen)); + if (in_addr_is_set(address->family, &address->peer)) + fprintf(f, "Peer=%s\n", + IN_ADDR_TO_STRING(address->family, &address->peer)); + return 0; +} + +/* Writes a [Route] section for one route to f; Destination= and Gateway= are each written only when set. Always returns 0. */ static int route_dump(Route *route, FILE *f) { + assert(route); + assert(f); + + fputs("\n[Route]\n", f); + if (in_addr_is_set(route->family, &route->dest)) + fprintf(f, "Destination=%s\n", + IN_ADDR_PREFIX_TO_STRING(route->family, &route->dest, route->prefixlen)); + if (in_addr_is_set(route->family, &route->gateway)) + fprintf(f, "Gateway=%s\n", + IN_ADDR_TO_STRING(route->family, &route->gateway)); + + return 0; +} + +/* Writes the .network file content for one Network to f: [Match], [Link], [Network] and [DHCP] sections followed by one section per address and route. Optional settings are only written when set. Output errors are not checked here. */ void network_dump(Network *network, FILE *f) { + const char *dhcp; + + assert(network); + assert(f); + + fputs("[Match]\n", f); + + if (isempty(network->ifname)) + /* If the interface name is not specified, then 
let's make the .network file match all + * physical interfaces. */ + fputs("Kind=!*\n" + "Type=!loopback\n", f); + else + fprintf(f, "Name=%s\n", network->ifname); + + fputs("\n[Link]\n", f); + + if (!ether_addr_is_null(&network->mac)) + fprintf(f, "MACAddress=%s\n", ETHER_ADDR_TO_STR(&network->mac)); + if (network->mtu > 0) + fprintf(f, "MTUBytes=%" PRIu32 "\n", network->mtu); + + fputs("\n[Network]\n", f); + + /* the three lookups below translate the same dhcp_type through different tables; a NULL result means no line is written */ dhcp = networkd_dhcp_type_to_string(network->dhcp_type); + if (dhcp) + fprintf(f, "DHCP=%s\n", dhcp); + + const char *ll; + ll = networkd_link_local_type_to_string(network->dhcp_type); + if (ll) + fprintf(f, "LinkLocalAddressing=%s\n", ll); + + const char *ra; + ra = networkd_ipv6ra_type_to_string(network->dhcp_type); + if (ra) + fprintf(f, "IPv6AcceptRA=%s\n", ra); + + if (!strv_isempty(network->dns)) + STRV_FOREACH(dns, network->dns) + fprintf(f, "DNS=%s\n", *dns); + + if (network->vlan) + fprintf(f, "VLAN=%s\n", network->vlan); + + if (network->bridge) + fprintf(f, "Bridge=%s\n", network->bridge); + + if (network->bond) + fprintf(f, "Bond=%s\n", network->bond); + + fputs("\n[DHCP]\n", f); + + if (!isempty(network->hostname)) + fprintf(f, "Hostname=%s\n", network->hostname); + + /* negative means "not configured" (network_new() initializes the field to -1) */ if (network->dhcp_use_dns >= 0) + fprintf(f, "UseDNS=%s\n", yes_no(network->dhcp_use_dns)); + + LIST_FOREACH(addresses, address, network->addresses) + (void) address_dump(address, f); + + LIST_FOREACH(routes, route, network->routes) + (void) route_dump(route, f); +} + +/* Writes the .netdev file content for one NetDev to f: kind and name, plus MTUBytes= when an MTU is set. */ void netdev_dump(NetDev *netdev, FILE *f) { + assert(netdev); + assert(f); + + fprintf(f, + "[NetDev]\n" + "Kind=%s\n" + "Name=%s\n", + netdev->kind, + netdev->ifname); + + if (netdev->mtu > 0) + fprintf(f, "MTUBytes=%" PRIu32 "\n", netdev->mtu); +} + +void link_dump(Link *link, FILE *f) { + assert(link); + assert(f); + + fputs("[Match]\n", f); + + if (!hw_addr_is_null(&link->mac)) + fprintf(f, "MACAddress=%s\n", HW_ADDR_TO_STR(&link->mac)); + else + fputs("OriginalName=*\n", f); + + fputs("\n[Link]\n", f); + + 
if (!isempty(link->ifname)) + fprintf(f, "Name=%s\n", link->ifname); + + if (!strv_isempty(link->policies)) { + fputs("NamePolicy=", f); + fputstrv(f, link->policies, " ", NULL); + fputc('\n', f); + } + + if (!strv_isempty(link->alt_policies)) { + fputs("AlternativeNamesPolicy=", f); + fputstrv(f, link->alt_policies, " ", NULL); + fputc('\n', f); + } +} + +int network_format(Network *network, char **ret) { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + assert(network); + assert(ret); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + network_dump(network, f); + + return memstream_finalize(&m, ret, NULL); +} + +int netdev_format(NetDev *netdev, char **ret) { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + assert(netdev); + assert(ret); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + netdev_dump(netdev, f); + + return memstream_finalize(&m, ret, NULL); +} + +int link_format(Link *link, char **ret) { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + assert(link); + assert(ret); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + link_dump(link, f); + + return memstream_finalize(&m, ret, NULL); +} diff --git a/src/network/generator/network-generator.h b/src/network/generator/network-generator.h new file mode 100644 index 0000000..aa5ca9d --- /dev/null +++ b/src/network/generator/network-generator.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "ether-addr-util.h" +#include "hashmap.h" +#include "in-addr-util.h" +#include "list.h" + +typedef enum DHCPType { + DHCP_TYPE_NONE, + DHCP_TYPE_OFF, /* Same as DHCP_TYPE_NONE */ + DHCP_TYPE_ON, + DHCP_TYPE_ANY, /* Same as DHCP_TYPE_ON */ + DHCP_TYPE_DHCP, /* Actually means: DHCPv4 */ + DHCP_TYPE_DHCP6, + DHCP_TYPE_AUTO6, + DHCP_TYPE_EITHER6, + DHCP_TYPE_IBFT, + DHCP_TYPE_LINK6, + DHCP_TYPE_LINK_LOCAL, + _DHCP_TYPE_MAX, + _DHCP_TYPE_INVALID = -EINVAL, +} DHCPType; + +typedef struct Address Address; 
+typedef struct Link Link; +typedef struct NetDev NetDev; +typedef struct Network Network; +typedef struct Route Route; +typedef struct Context Context; + +struct Address { + Network *network; + + union in_addr_union address, peer; + unsigned char prefixlen; + int family; + + LIST_FIELDS(Address, addresses); +}; + +struct Route { + Network *network; + + union in_addr_union dest, gateway; + unsigned char prefixlen; + int family; + + LIST_FIELDS(Route, routes); +}; + +struct Network { + /* [Match] */ + char *ifname; + + /* [Link] */ + struct ether_addr mac; + uint32_t mtu; + + /* [Network] */ + DHCPType dhcp_type; + char **dns; + char *vlan; + char *bridge; + char *bond; + + /* [DHCP] */ + char *hostname; + int dhcp_use_dns; + + LIST_HEAD(Address, addresses); + LIST_HEAD(Route, routes); +}; + +struct NetDev { + /* [NetDev] */ + char *ifname; + char *kind; + uint32_t mtu; +}; + +struct Link { + char *filename; + + /* [Match] */ + struct hw_addr_data mac; + + /* [Link] */ + char *ifname; + char **policies; + char **alt_policies; +}; + +typedef struct Context { + Hashmap *networks_by_name; + Hashmap *netdevs_by_name; + Hashmap *links_by_filename; +} Context; + +int parse_cmdline_item(const char *key, const char *value, void *data); +int context_merge_networks(Context *context); +void context_clear(Context *context); + +Network *network_get(Context *context, const char *ifname); +void network_dump(Network *network, FILE *f); +int network_format(Network *network, char **ret); + +NetDev *netdev_get(Context *context, const char *ifname); +void netdev_dump(NetDev *netdev, FILE *f); +int netdev_format(NetDev *netdev, char **ret); + +Link *link_get(Context *context, const char *filename); +void link_dump(Link *link, FILE *f); +int link_format(Link *link, char **ret); diff --git a/src/network/generator/test-network-generator.c b/src/network/generator/test-network-generator.c new file mode 100644 index 0000000..7850da9 --- /dev/null +++ 
b/src/network/generator/test-network-generator.c @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "macro.h" +#include "network-generator.h" +#include "string-util.h" +#include "tests.h" + +static void test_network_one(const char *ifname, const char *key, const char *value, const char *expected) { + _cleanup_(context_clear) Context context = {}; + _cleanup_free_ char *output = NULL; + Network *network; + + printf("# %s=%s\n", key, value); + assert_se(parse_cmdline_item(key, value, &context) >= 0); + assert_se(network = network_get(&context, ifname)); + assert_se(network_format(network, &output) >= 0); + puts(output); + assert_se(streq(output, expected)); +} + +static void test_network_two(const char *ifname, + const char *key1, const char *value1, + const char *key2, const char *value2, + const char *expected) { + _cleanup_(context_clear) Context context = {}; + _cleanup_free_ char *output = NULL; + Network *network; + + printf("# %s=%s\n", key1, value1); + printf("# %s=%s\n", key2, value2); + assert_se(parse_cmdline_item(key1, value1, &context) >= 0); + assert_se(parse_cmdline_item(key2, value2, &context) >= 0); + assert_se(context_merge_networks(&context) >= 0); + assert_se(network = network_get(&context, ifname)); + assert_se(network_format(network, &output) >= 0); + puts(output); + assert_se(streq(output, expected)); +} + +static void test_netdev_one(const char *ifname, const char *key, const char *value, const char *expected) { + _cleanup_(context_clear) Context context = {}; + _cleanup_free_ char *output = NULL; + NetDev *netdev; + + printf("# %s=%s\n", key, value); + assert_se(parse_cmdline_item(key, value, &context) >= 0); + assert_se(netdev = netdev_get(&context, ifname)); + assert_se(netdev_format(netdev, &output) >= 0); + puts(output); + assert_se(streq(output, expected)); +} + +static void test_link_one(const char *filename, const char *key, const char *value, const char *expected) { + _cleanup_(context_clear) Context 
context = {}; + _cleanup_free_ char *output = NULL; + Link *link; + + printf("# %s=%s\n", key, value); + assert_se(parse_cmdline_item(key, value, &context) >= 0); + assert_se(link = link_get(&context, filename)); + assert_se(link_format(link, &output) >= 0); + puts(output); + assert_se(streq(output, expected)); +} + +int main(int argc, char *argv[]) { + test_setup_logging(LOG_DEBUG); + + test_network_one("", "ip", "dhcp6", + "[Match]\n" + "Kind=!*\n" + "Type=!loopback\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=ipv6\n" + "\n[DHCP]\n" + ); + + test_network_one("eth0", "ip", "eth0:dhcp", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=ipv4\n" + "\n[DHCP]\n" + ); + + test_network_one("eth0", "ip", "eth0:dhcp:1530", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "MTUBytes=1530\n" + "\n[Network]\n" + "DHCP=ipv4\n" + "\n[DHCP]\n" + ); + + test_network_one("eth0", "ip", "eth0:dhcp:1530:00:11:22:33:44:55", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "MACAddress=00:11:22:33:44:55\n" + "MTUBytes=1530\n" + "\n[Network]\n" + "DHCP=ipv4\n" + "\n[DHCP]\n" + ); + + test_network_one("eth0", "ip", "192.168.0.10::192.168.0.1:255.255.255.0:hogehoge:eth0:on", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_one("eth0", "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_one("eth0", "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:1530", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "MTUBytes=1530\n" + "\n[Network]\n" + "DHCP=yes\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + 
"\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_one("eth0", "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:1530:00:11:22:33:44:55", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "MACAddress=00:11:22:33:44:55\n" + "MTUBytes=1530\n" + "\n[Network]\n" + "DHCP=yes\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_one("eth0", "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:10.10.10.10", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.10.10.10\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_one("eth0", "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:10.10.10.10:10.10.10.11", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.10.10.10\n" + "DNS=10.10.10.11\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_one("eth0", "ip", "[2001:1234:56:8f63::10]::[2001:1234:56:8f63::1]:64:hogehoge:eth0:on", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=2001:1234:56:8f63::10/64\n" + "\n[Route]\n" + "Gateway=2001:1234:56:8f63::1\n" + ); + + test_network_one("eth0", "ip", "[2001:1234:56:8f63::10]:[2001:1234:56:8f63::2]:[2001:1234:56:8f63::1]:64:hogehoge:eth0:on", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=2001:1234:56:8f63::10/64\n" + "Peer=2001:1234:56:8f63::2\n" + 
"\n[Route]\n" + "Gateway=2001:1234:56:8f63::1\n" + ); + + test_network_one("", "rd.route", "10.1.2.3/16:10.0.2.3", + "[Match]\n" + "Kind=!*\n" + "Type=!loopback\n" + "\n[Link]\n" + "\n[Network]\n" + "\n[DHCP]\n" + "\n[Route]\n" + "Destination=10.1.2.3/16\n" + "Gateway=10.0.2.3\n" + ); + + test_network_one("eth0", "rd.route", "10.1.2.3/16:10.0.2.3:eth0", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "\n[DHCP]\n" + "\n[Route]\n" + "Destination=10.1.2.3/16\n" + "Gateway=10.0.2.3\n" + ); + + test_network_one("", "nameserver", "10.1.2.3", + "[Match]\n" + "Kind=!*\n" + "Type=!loopback\n" + "\n[Link]\n" + "\n[Network]\n" + "DNS=10.1.2.3\n" + "\n[DHCP]\n" + ); + + test_network_one("", "rd.peerdns", "0", + "[Match]\n" + "Kind=!*\n" + "Type=!loopback\n" + "\n[Link]\n" + "\n[Network]\n" + "\n[DHCP]\n" + "UseDNS=no\n" + ); + + test_network_one("", "rd.peerdns", "1", + "[Match]\n" + "Kind=!*\n" + "Type=!loopback\n" + "\n[Link]\n" + "\n[Network]\n" + "\n[DHCP]\n" + "UseDNS=yes\n" + ); + + test_network_one("eth0", "vlan", "vlan99:eth0", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "VLAN=vlan99\n" + "\n[DHCP]\n" + ); + + test_network_one("eth0", "bridge", "bridge99:eth0,eth1", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "Bridge=bridge99\n" + "\n[DHCP]\n" + ); + + test_network_one("eth1", "bridge", "bridge99:eth0,eth1", + "[Match]\n" + "Name=eth1\n" + "\n[Link]\n" + "\n[Network]\n" + "Bridge=bridge99\n" + "\n[DHCP]\n" + ); + + test_network_one("eth0", "bond", "bond99:eth0,eth1", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "Bond=bond99\n" + "\n[DHCP]\n" + ); + + test_network_one("eth1", "bond", "bond99:eth0,eth1::1530", + "[Match]\n" + "Name=eth1\n" + "\n[Link]\n" + "\n[Network]\n" + "Bond=bond99\n" + "\n[DHCP]\n" + ); + + test_netdev_one("bond99", "bond", "bond99:eth0,eth1::1530", + "[NetDev]\n" + "Kind=bond\n" + "Name=bond99\n" + "MTUBytes=1530\n" + ); + + test_link_one("hogehoge", "ifname", 
"hogehoge:00:11:22:33:44:55", + "[Match]\n" + "MACAddress=00:11:22:33:44:55\n" + "\n[Link]\n" + "Name=hogehoge\n" + ); + + test_link_one("001122334455", "net.ifname-policy", "keep,kernel,database,onboard,slot,path,mac,00:11:22:33:44:55", + "[Match]\n" + "MACAddress=00:11:22:33:44:55\n" + "\n[Link]\n" + "NamePolicy=keep kernel database onboard slot path mac\n" + "AlternativeNamesPolicy=database onboard slot path mac\n" + ); + + test_link_one("default", "net.ifname-policy", "keep,kernel,database,onboard,slot,path,mac", + "[Match]\n" + "OriginalName=*\n" + "\n[Link]\n" + "NamePolicy=keep kernel database onboard slot path mac\n" + "AlternativeNamesPolicy=database onboard slot path mac\n" + ); + + test_network_two("eth0", + "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:10.10.10.10:10.10.10.11", + "rd.route", "10.1.2.3/16:10.0.2.3", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.10.10.10\n" + "DNS=10.10.10.11\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Destination=10.1.2.3/16\n" + "Gateway=10.0.2.3\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_two("eth0", + "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on", + "nameserver", "10.1.2.3", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.1.2.3\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_two("eth0", + "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:10.10.10.10:10.10.10.11", + "nameserver", "10.1.2.3", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.10.10.10\n" + "DNS=10.10.10.11\n" + "DNS=10.1.2.3\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + 
"Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_two("eth0", + "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:10.10.10.10:10.10.10.11", + "rd.peerdns", "1", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.10.10.10\n" + "DNS=10.10.10.11\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "UseDNS=yes\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + test_network_two("eth0", + "ip", "192.168.0.10:192.168.0.2:192.168.0.1:255.255.255.0:hogehoge:eth0:on:10.10.10.10:10.10.10.11", + "bridge", "bridge99:eth0,eth1", + "[Match]\n" + "Name=eth0\n" + "\n[Link]\n" + "\n[Network]\n" + "DHCP=yes\n" + "DNS=10.10.10.10\n" + "DNS=10.10.10.11\n" + "Bridge=bridge99\n" + "\n[DHCP]\n" + "Hostname=hogehoge\n" + "\n[Address]\n" + "Address=192.168.0.10/24\n" + "Peer=192.168.0.2\n" + "\n[Route]\n" + "Gateway=192.168.0.1\n" + ); + + return 0; +} diff --git a/src/network/meson.build b/src/network/meson.build new file mode 100644 index 0000000..5c05eba --- /dev/null +++ b/src/network/meson.build @@ -0,0 +1,266 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +sources = files( + 'netdev/bareudp.c', + 'netdev/batadv.c', + 'netdev/bond.c', + 'netdev/bridge.c', + 'netdev/dummy.c', + 'netdev/fou-tunnel.c', + 'netdev/geneve.c', + 'netdev/ifb.c', + 'netdev/ipoib.c', + 'netdev/ipvlan.c', + 'netdev/l2tp-tunnel.c', + 'netdev/macsec.c', + 'netdev/macvlan.c', + 'netdev/netdev-util.c', + 'netdev/netdev.c', + 'netdev/netdevsim.c', + 'netdev/nlmon.c', + 'netdev/tunnel.c', + 'netdev/tuntap.c', + 'netdev/vcan.c', + 'netdev/veth.c', + 'netdev/vlan.c', + 'netdev/vrf.c', + 'netdev/vxcan.c', + 'netdev/vxlan.c', + 'netdev/wireguard.c', + 'netdev/wlan.c', + 'netdev/xfrm.c', + 'networkd-address-generation.c', + 'networkd-address-label.c', + 'networkd-address-pool.c', + 'networkd-address.c', + 'networkd-bridge-fdb.c', + 'networkd-bridge-mdb.c', + 
'networkd-bridge-vlan.c', + 'networkd-can.c', + 'networkd-conf.c', + 'networkd-dhcp-common.c', + 'networkd-dhcp-prefix-delegation.c', + 'networkd-dhcp-server-bus.c', + 'networkd-dhcp-server-static-lease.c', + 'networkd-dhcp-server.c', + 'networkd-dhcp4-bus.c', + 'networkd-dhcp4.c', + 'networkd-dhcp6-bus.c', + 'networkd-dhcp6.c', + 'networkd-ipv4acd.c', + 'networkd-ipv4ll.c', + 'networkd-ipv6-proxy-ndp.c', + 'networkd-ipv6ll.c', + 'networkd-json.c', + 'networkd-link-bus.c', + 'networkd-link.c', + 'networkd-lldp-rx.c', + 'networkd-lldp-tx.c', + 'networkd-manager-bus.c', + 'networkd-manager.c', + 'networkd-ndisc.c', + 'networkd-neighbor.c', + 'networkd-netlabel.c', + 'networkd-network-bus.c', + 'networkd-network.c', + 'networkd-nexthop.c', + 'networkd-queue.c', + 'networkd-radv.c', + 'networkd-route-util.c', + 'networkd-route.c', + 'networkd-routing-policy-rule.c', + 'networkd-setlink.c', + 'networkd-speed-meter.c', + 'networkd-sriov.c', + 'networkd-state-file.c', + 'networkd-sysctl.c', + 'networkd-util.c', + 'networkd-wifi.c', + 'networkd-wiphy.c', + 'tc/cake.c', + 'tc/codel.c', + 'tc/drr.c', + 'tc/ets.c', + 'tc/fifo.c', + 'tc/fq-codel.c', + 'tc/fq-pie.c', + 'tc/fq.c', + 'tc/gred.c', + 'tc/hhf.c', + 'tc/htb.c', + 'tc/netem.c', + 'tc/pie.c', + 'tc/qdisc.c', + 'tc/qfq.c', + 'tc/sfb.c', + 'tc/sfq.c', + 'tc/tbf.c', + 'tc/tc-util.c', + 'tc/tc.c', + 'tc/tclass.c', + 'tc/teql.c', +) + +systemd_networkd_sources = files('networkd.c') + +systemd_networkd_wait_online_sources = files( + 'wait-online/link.c', + 'wait-online/manager.c', + 'wait-online/wait-online.c', +) + +networkctl_sources = files('networkctl.c') + +network_generator_sources = files( + 'generator/main.c', + 'generator/network-generator.c', +) + +networkd_network_gperf_gperf = files('networkd-network-gperf.gperf') +networkd_netdev_gperf_gperf = files('netdev/netdev-gperf.gperf') + +sources += custom_target( + 'networkd-gperf.c', + input : 'networkd-gperf.gperf', + output : 'networkd-gperf.c', + command : [gperf, 
'@INPUT@', '--output-file', '@OUTPUT@']) + +sources += custom_target( + 'networkd-network-gperf.c', + input : networkd_network_gperf_gperf, + output : 'networkd-network-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +sources += custom_target( + 'netdev-gperf.c', + input : networkd_netdev_gperf_gperf, + output : 'netdev-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +if get_option('link-networkd-shared') + networkd_link_with = [libshared] +else + networkd_link_with = [libsystemd_static, + libshared_static, + libbasic_gcrypt] +endif + +network_includes = [libsystemd_network_includes, include_directories(['.', 'netdev', 'tc'])] + +libnetworkd_core = static_library( + 'networkd-core', + sources, + include_directories : network_includes, + dependencies : userspace, + link_with : networkd_link_with, + build_by_default : false) + +network_test_template = test_template + { + 'link_with' : [ + libnetworkd_core, + libsystemd_network, + ], + 'include_directories' : network_includes, +} + +network_fuzz_template = fuzz_template + { + 'link_with' : [ + libnetworkd_core, + libsystemd_network, + ], + 'dependencies' : threads, + 'include_directories' : network_includes, +} + +executables += [ + libexec_template + { + 'name' : 'systemd-networkd', + 'dbus' : true, + 'conditions' : ['ENABLE_NETWORKD'], + 'sources' : systemd_networkd_sources, + 'include_directories' : network_includes, + 'link_with' : [ + libnetworkd_core, + libsystemd_network, + networkd_link_with, + ], + 'dependencies' : threads, + }, + libexec_template + { + 'name' : 'systemd-networkd-wait-online', + 'public' : true, + 'conditions' : ['ENABLE_NETWORKD'], + 'sources' : systemd_networkd_wait_online_sources, + 'link_with' : networkd_link_with, + }, + executable_template + { + 'name' : 'networkctl', + 'public' : true, + 'conditions' : ['ENABLE_NETWORKD'], + 'sources' : networkctl_sources, + 'include_directories' : libsystemd_network_includes, + 'link_with' : [ + 
libsystemd_network, + networkd_link_with, + ], + }, + libexec_template + { + 'name' : 'systemd-network-generator', + 'sources' : network_generator_sources, + 'link_with' : networkd_link_with, + }, + test_template + { + 'sources' : files( + 'generator/test-network-generator.c', + 'generator/network-generator.c', + ), + 'suite' : 'network', + }, + network_test_template + { + 'sources' : files('test-network-tables.c'), + 'dependencies' : threads, + }, + network_test_template + { + 'sources' : files('test-network.c'), + 'dependencies' : threads, + }, + network_test_template + { + 'sources' : files('test-networkd-address.c'), + 'dependencies' : libatomic, + }, + network_test_template + { + 'sources' : files('test-networkd-conf.c'), + 'dependencies' : libatomic, + }, + network_test_template + { + 'sources' : files('test-networkd-util.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-netdev-parser.c'), + }, + network_fuzz_template + { + 'sources' : files('fuzz-network-parser.c'), + }, +] + +if conf.get('ENABLE_NETWORKD') == 1 + install_data('org.freedesktop.network1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.network1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.network1.policy', + install_dir : polkitpolicydir) + if install_polkit + install_data('systemd-networkd.rules', + install_dir : polkitrulesdir) + endif + if install_polkit_pkla + install_data('systemd-networkd.pkla', + install_dir : polkitpkladir) + endif + + if install_sysconfdir_samples + install_data('networkd.conf', + install_dir : pkgconfigfiledir) + endif +endif diff --git a/src/network/netdev/bareudp.c b/src/network/netdev/bareudp.c new file mode 100644 index 0000000..1df8865 --- /dev/null +++ b/src/network/netdev/bareudp.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ + +#include +#include + +#include "bareudp.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "string-table.h" + +static const char* const bare_udp_protocol_table[_BARE_UDP_PROTOCOL_MAX] = { + [BARE_UDP_PROTOCOL_IPV4] = "ipv4", + [BARE_UDP_PROTOCOL_IPV6] = "ipv6", + [BARE_UDP_PROTOCOL_MPLS_UC] = "mpls-uc", + [BARE_UDP_PROTOCOL_MPLS_MC] = "mpls-mc", +}; + +DEFINE_STRING_TABLE_LOOKUP(bare_udp_protocol, BareUDPProtocol); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bare_udp_iftype, bare_udp_protocol, BareUDPProtocol, + "Failed to parse EtherType="); + +static int netdev_bare_udp_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(m); + + BareUDP *u = BAREUDP(netdev); + int r; + + r = sd_netlink_message_append_u16(m, IFLA_BAREUDP_ETHERTYPE, htobe16(u->iftype)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_BAREUDP_PORT, htobe16(u->dest_port)); + if (r < 0) + return r; + + return 0; +} + +static int netdev_bare_udp_verify(NetDev *netdev, const char *filename) { + assert(filename); + + BareUDP *u = BAREUDP(netdev); + + if (u->dest_port == 0) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: BareUDP DesinationPort= is not set. Ignoring.", filename); + + if (u->iftype == _BARE_UDP_PROTOCOL_INVALID) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: BareUDP EtherType= is not set. 
Ignoring.", filename); + + return 0; +} + +static void bare_udp_init(NetDev *netdev) { + BareUDP *u = BAREUDP(netdev); + + u->iftype = _BARE_UDP_PROTOCOL_INVALID; +} + +const NetDevVTable bare_udp_vtable = { + .object_size = sizeof(BareUDP), + .sections = NETDEV_COMMON_SECTIONS "BareUDP\0", + .init = bare_udp_init, + .config_verify = netdev_bare_udp_verify, + .fill_message_create = netdev_bare_udp_fill_message_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_NONE, +}; diff --git a/src/network/netdev/bareudp.h b/src/network/netdev/bareudp.h new file mode 100644 index 0000000..8d8863c --- /dev/null +++ b/src/network/netdev/bareudp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ +#pragma once + +typedef struct BareUDP BareUDP; + +#include + +#include "conf-parser.h" +#include "netdev.h" + +typedef enum BareUDPProtocol { + BARE_UDP_PROTOCOL_IPV4 = ETH_P_IP, + BARE_UDP_PROTOCOL_IPV6 = ETH_P_IPV6, + BARE_UDP_PROTOCOL_MPLS_UC = ETH_P_MPLS_UC, + BARE_UDP_PROTOCOL_MPLS_MC = ETH_P_MPLS_MC, + _BARE_UDP_PROTOCOL_MAX, + _BARE_UDP_PROTOCOL_INVALID = -EINVAL, +} BareUDPProtocol; + +struct BareUDP { + NetDev meta; + + BareUDPProtocol iftype; + uint16_t dest_port; +}; + +DEFINE_NETDEV_CAST(BAREUDP, BareUDP); +extern const NetDevVTable bare_udp_vtable; + +const char *bare_udp_protocol_to_string(BareUDPProtocol d) _const_; +BareUDPProtocol bare_udp_protocol_from_string(const char *d) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_bare_udp_iftype); diff --git a/src/network/netdev/batadv.c b/src/network/netdev/batadv.c new file mode 100644 index 0000000..26da023 --- /dev/null +++ b/src/network/netdev/batadv.c @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "batadv.h" +#include "fileio.h" +#include "netlink-util.h" +#include "network-internal.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "stdio-util.h" +#include 
"string-table.h" +#include "string-util.h" + +static void batadv_init(NetDev *n) { + BatmanAdvanced *b = BATADV(n); + + /* Set defaults */ + b->aggregation = true; + b->gateway_bandwidth_down = 10000; + b->gateway_bandwidth_up = 2000; + b->bridge_loop_avoidance = true; + b->distributed_arp_table = true; + b->fragmentation = true; + b->hop_penalty = 15; + b->originator_interval = 1000; + b->routing_algorithm = BATADV_ROUTING_ALGORITHM_BATMAN_V; +} + +static const char* const batadv_gateway_mode_table[_BATADV_GATEWAY_MODE_MAX] = { + [BATADV_GATEWAY_MODE_OFF] = "off", + [BATADV_GATEWAY_MODE_CLIENT] = "client", + [BATADV_GATEWAY_MODE_SERVER] = "server", +}; + +static const char* const batadv_routing_algorithm_table[_BATADV_ROUTING_ALGORITHM_MAX] = { + [BATADV_ROUTING_ALGORITHM_BATMAN_V] = "batman-v", + [BATADV_ROUTING_ALGORITHM_BATMAN_IV] = "batman-iv", +}; + +static const char* const batadv_routing_algorithm_kernel_table[_BATADV_ROUTING_ALGORITHM_MAX] = { + [BATADV_ROUTING_ALGORITHM_BATMAN_V] = "BATMAN_V", + [BATADV_ROUTING_ALGORITHM_BATMAN_IV] = "BATMAN_IV", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(batadv_gateway_mode, BatadvGatewayModes); +DEFINE_CONFIG_PARSE_ENUM(config_parse_batadv_gateway_mode, batadv_gateway_mode, BatadvGatewayModes, + "Failed to parse GatewayMode="); + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(batadv_routing_algorithm, BatadvRoutingAlgorithm); +DEFINE_CONFIG_PARSE_ENUM(config_parse_batadv_routing_algorithm, batadv_routing_algorithm, BatadvRoutingAlgorithm, + "Failed to parse RoutingAlgorithm="); + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(batadv_routing_algorithm_kernel, BatadvRoutingAlgorithm); + +int config_parse_badadv_bandwidth ( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t k; + uint32_t *bandwidth = data; + int r; + + assert(filename); + 
assert(lvalue); + assert(rvalue); + + r = parse_size(rvalue, 1000, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (k/1000/100 > UINT32_MAX) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "The value of '%s=', is outside of 0...429496729500000 range: %s", + lvalue, rvalue); + + *bandwidth = k/1000/100; + + return 0; +} + +/* callback for batman netdev's parameter set */ +static int netdev_batman_set_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(m); + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_netdev_warning_errno(netdev, r, "BATADV parameters could not be set: %m"); + return 1; + } + + log_netdev_debug(netdev, "BATADV parameters set success"); + + return 1; +} + +static int netdev_batadv_post_create_message(NetDev *netdev, sd_netlink_message *message) { + BatmanAdvanced *b = BATADV(netdev); + int r; + + r = sd_netlink_message_append_u32(message, BATADV_ATTR_MESH_IFINDEX, netdev->ifindex); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, BATADV_ATTR_GW_MODE, b->gateway_mode); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, BATADV_ATTR_AGGREGATED_OGMS_ENABLED, b->aggregation); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, b->bridge_loop_avoidance); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, b->distributed_arp_table); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, BATADV_ATTR_FRAGMENTATION_ENABLED, b->fragmentation); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, BATADV_ATTR_HOP_PENALTY, b->hop_penalty); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(message, BATADV_ATTR_ORIG_INTERVAL, DIV_ROUND_UP(b->originator_interval, 
USEC_PER_MSEC)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(message, BATADV_ATTR_GW_BANDWIDTH_DOWN, b->gateway_bandwidth_down); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(message, BATADV_ATTR_GW_BANDWIDTH_UP, b->gateway_bandwidth_up); + if (r < 0) + return r; + + return 0; +} + +static int netdev_batadv_post_create(NetDev *netdev, Link *link) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL; + int r; + + assert(netdev); + + r = sd_genl_message_new(netdev->manager->genl, BATADV_NL_NAME, BATADV_CMD_SET_MESH, &message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not allocate netlink message: %m"); + + r = netdev_batadv_post_create_message(netdev, message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not create netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, message, netdev_batman_set_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not send netlink message: %m"); + + netdev_ref(netdev); + + return r; +} + +static int netdev_batadv_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(m); + + BatmanAdvanced *b = BATADV(netdev); + int r; + + r = sd_netlink_message_append_string(m, IFLA_BATADV_ALGO_NAME, batadv_routing_algorithm_kernel_to_string(b->routing_algorithm)); + if (r < 0) + return r; + + return 0; +} + +const NetDevVTable batadv_vtable = { + .object_size = sizeof(BatmanAdvanced), + .init = batadv_init, + .sections = NETDEV_COMMON_SECTIONS "BatmanAdvanced\0", + .fill_message_create = netdev_batadv_fill_message_create, + .post_create = netdev_batadv_post_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/batadv.h b/src/network/netdev/batadv.h new file mode 100644 index 0000000..f1f9b46 --- /dev/null +++ b/src/network/netdev/batadv.h @@ -0,0 +1,47 @@ +/* 
/* Per-netdev configuration of a batman-adv mesh interface. Apart from routing_algorithm, which is
 * passed at link creation, all settings are sent to the kernel in the post-create
 * BATADV_CMD_SET_MESH request. */
typedef struct Batadv {
        NetDev meta;

        /* Sent as BATADV_ATTR_GW_MODE (u8). */
        BatadvGatewayModes gateway_mode;
        /* Sent as BATADV_ATTR_GW_BANDWIDTH_DOWN / _UP (u32). The parser stores the parsed size
         * divided by 1000 and then by 100. */
        uint32_t gateway_bandwidth_down;
        uint32_t gateway_bandwidth_up;
        /* Sent as BATADV_ATTR_HOP_PENALTY (u8). */
        uint8_t hop_penalty;
        /* Converted to its kernel string and sent as IFLA_BATADV_ALGO_NAME when the link is created. */
        BatadvRoutingAlgorithm routing_algorithm;
        /* Kept in microseconds; sent as BATADV_ATTR_ORIG_INTERVAL in milliseconds, rounded up. */
        usec_t originator_interval;
        /* Sent as BATADV_ATTR_AGGREGATED_OGMS_ENABLED (u8). */
        bool aggregation;
        /* Sent as BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED (u8). */
        bool bridge_loop_avoidance;
        /* Sent as BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED (u8). */
        bool distributed_arp_table;
        /* Sent as BATADV_ATTR_FRAGMENTATION_ENABLED (u8). */
        bool fragmentation;
} BatmanAdvanced;
LEARNING_PACKETS_INTERVAL_MAX_SEC (0x7fffffff * USEC_PER_SEC) + +/* Number of IGMP membership reports to be issued after + * a failover event. + */ +#define RESEND_IGMP_MIN 0 +#define RESEND_IGMP_MAX 255 +#define RESEND_IGMP_DEFAULT 1 + +/* + * Number of packets to transmit through a slave before + * moving to the next one. + */ +#define PACKETS_PER_SLAVE_MIN 0 +#define PACKETS_PER_SLAVE_MAX 65535 +#define PACKETS_PER_SLAVE_DEFAULT 1 + +/* + * Number of peer notifications (gratuitous ARPs and + * unsolicited IPv6 Neighbor Advertisements) to be issued after a + * failover event. + */ +#define GRATUITOUS_ARP_MIN 0 +#define GRATUITOUS_ARP_MAX 255 +#define GRATUITOUS_ARP_DEFAULT 1 + +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_mode, bond_mode, BondMode, "Failed to parse bond mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_xmit_hash_policy, + bond_xmit_hash_policy, + BondXmitHashPolicy, + "Failed to parse bond transmit hash policy"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_lacp_rate, bond_lacp_rate, BondLacpRate, "Failed to parse bond lacp rate"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_ad_select, bond_ad_select, BondAdSelect, "Failed to parse bond AD select"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_fail_over_mac, bond_fail_over_mac, BondFailOverMac, "Failed to parse bond fail over MAC"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_arp_validate, bond_arp_validate, BondArpValidate, "Failed to parse bond arp validate"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_arp_all_targets, bond_arp_all_targets, BondArpAllTargets, "Failed to parse bond Arp all targets"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_bond_primary_reselect, bond_primary_reselect, BondPrimaryReselect, "Failed to parse bond primary reselect"); + +static int netdev_bond_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(!link); + assert(m); + + Bond *b = BOND(netdev); + int r; + + if (b->mode != _NETDEV_BOND_MODE_INVALID) { + r = sd_netlink_message_append_u8(m, 
IFLA_BOND_MODE, b->mode); + if (r < 0) + return r; + } + + if (b->xmit_hash_policy != _NETDEV_BOND_XMIT_HASH_POLICY_INVALID) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_XMIT_HASH_POLICY, b->xmit_hash_policy); + if (r < 0) + return r; + } + + if (b->lacp_rate != _NETDEV_BOND_LACP_RATE_INVALID && + b->mode == NETDEV_BOND_MODE_802_3AD) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_AD_LACP_RATE, b->lacp_rate); + if (r < 0) + return r; + } + + if (b->miimon != 0) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_MIIMON, b->miimon / USEC_PER_MSEC); + if (r < 0) + return r; + } + + if (b->downdelay != 0) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_DOWNDELAY, b->downdelay / USEC_PER_MSEC); + if (r < 0) + return r; + } + + if (b->updelay != 0) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_UPDELAY, b->updelay / USEC_PER_MSEC); + if (r < 0) + return r; + } + + if (b->arp_interval != 0) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_ARP_INTERVAL, b->arp_interval / USEC_PER_MSEC); + if (r < 0) + return r; + + if (b->lp_interval >= LEARNING_PACKETS_INTERVAL_MIN_SEC && + b->lp_interval <= LEARNING_PACKETS_INTERVAL_MAX_SEC) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_LP_INTERVAL, b->lp_interval / USEC_PER_SEC); + if (r < 0) + return r; + } + } + + if (b->ad_select != _NETDEV_BOND_AD_SELECT_INVALID && + b->mode == NETDEV_BOND_MODE_802_3AD) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_AD_SELECT, b->ad_select); + if (r < 0) + return r; + } + + if (b->fail_over_mac != _NETDEV_BOND_FAIL_OVER_MAC_INVALID && + b->mode == NETDEV_BOND_MODE_ACTIVE_BACKUP) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_FAIL_OVER_MAC, b->fail_over_mac); + if (r < 0) + return r; + } + + if (b->arp_validate != _NETDEV_BOND_ARP_VALIDATE_INVALID) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_ARP_VALIDATE, b->arp_validate); + if (r < 0) + return r; + } + + if (b->arp_all_targets != _NETDEV_BOND_ARP_ALL_TARGETS_INVALID) { + r = sd_netlink_message_append_u32(m, 
IFLA_BOND_ARP_ALL_TARGETS, b->arp_all_targets); + if (r < 0) + return r; + } + + if (b->primary_reselect != _NETDEV_BOND_PRIMARY_RESELECT_INVALID) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_PRIMARY_RESELECT, b->primary_reselect); + if (r < 0) + return r; + } + + if (b->resend_igmp <= RESEND_IGMP_MAX) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_RESEND_IGMP, b->resend_igmp); + if (r < 0) + return r; + } + + if (b->packets_per_slave <= PACKETS_PER_SLAVE_MAX && + b->mode == NETDEV_BOND_MODE_BALANCE_RR) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_PACKETS_PER_SLAVE, b->packets_per_slave); + if (r < 0) + return r; + } + + if (b->num_grat_arp <= GRATUITOUS_ARP_MAX) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_NUM_PEER_NOTIF, b->num_grat_arp); + if (r < 0) + return r; + } + + if (b->min_links != 0) { + r = sd_netlink_message_append_u32(m, IFLA_BOND_MIN_LINKS, b->min_links); + if (r < 0) + return r; + } + + if (b->ad_actor_sys_prio != 0) { + r = sd_netlink_message_append_u16(m, IFLA_BOND_AD_ACTOR_SYS_PRIO, b->ad_actor_sys_prio); + if (r < 0) + return r; + } + + if (b->ad_user_port_key != 0) { + r = sd_netlink_message_append_u16(m, IFLA_BOND_AD_USER_PORT_KEY, b->ad_user_port_key); + if (r < 0) + return r; + } + + if (!ether_addr_is_null(&b->ad_actor_system)) { + r = sd_netlink_message_append_ether_addr(m, IFLA_BOND_AD_ACTOR_SYSTEM, &b->ad_actor_system); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u8(m, IFLA_BOND_ALL_SLAVES_ACTIVE, b->all_slaves_active); + if (r < 0) + return r; + + if (b->tlb_dynamic_lb >= 0) { + r = sd_netlink_message_append_u8(m, IFLA_BOND_TLB_DYNAMIC_LB, b->tlb_dynamic_lb); + if (r < 0) + return r; + } + + if (b->arp_interval > 0 && !ordered_set_isempty(b->arp_ip_targets)) { + void *val; + int n = 0; + + r = sd_netlink_message_open_container(m, IFLA_BOND_ARP_IP_TARGET); + if (r < 0) + return r; + + ORDERED_SET_FOREACH(val, b->arp_ip_targets) { + r = sd_netlink_message_append_u32(m, n++, PTR_TO_UINT32(val)); + if 
(r < 0) + return r; + } + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + } + + return 0; +} + +int config_parse_arp_ip_target_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Bond *b = BOND(userdata); + int r; + + if (isempty(rvalue)) { + b->arp_ip_targets = ordered_set_free(b->arp_ip_targets); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *n = NULL; + union in_addr_union ip; + + r = extract_first_word(&p, &n, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse Bond ARP IP target address, ignoring assignment: %s", + rvalue); + return 0; + } + if (r == 0) + return 0; + + r = in_addr_from_string(AF_INET, n, &ip); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Bond ARP IP target address is invalid, ignoring assignment: %s", n); + continue; + } + + if (ordered_set_size(b->arp_ip_targets) >= NETDEV_BOND_ARP_TARGETS_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Too many ARP IP targets are specified. The maximum number is %d. 
Ignoring assignment: %s", + NETDEV_BOND_ARP_TARGETS_MAX, n); + continue; + } + + r = ordered_set_ensure_put(&b->arp_ip_targets, NULL, UINT32_TO_PTR(ip.in.s_addr)); + if (r == -ENOMEM) + return log_oom(); + if (r == -EEXIST) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Bond ARP IP target address is duplicated, ignoring assignment: %s", n); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store bond ARP IP target address '%s', ignoring assignment: %m", n); + } +} + +int config_parse_ad_actor_sys_prio( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Bond *b = ASSERT_PTR(userdata); + + return config_parse_uint16_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 1, UINT16_MAX, true, + &b->ad_actor_sys_prio); +} + +int config_parse_ad_user_port_key( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Bond *b = ASSERT_PTR(userdata); + + return config_parse_uint16_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, 1023, /* ignoring= */ true, + &b->ad_user_port_key); +} + +int config_parse_ad_actor_system( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + Bond *b = userdata; + struct ether_addr n; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = parse_ether_addr(rvalue, &n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Not a 
valid MAC address %s. Ignoring assignment: %m", + rvalue); + return 0; + } + if (ether_addr_is_null(&n) || (n.ether_addr_octet[0] & 0x01)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Not an appropriate MAC address %s, cannot be null or multicast. Ignoring assignment.", + rvalue); + return 0; + } + + b->ad_actor_system = n; + + return 0; +} + +static void bond_done(NetDev *netdev) { + Bond *b = BOND(netdev); + + ordered_set_free(b->arp_ip_targets); +} + +static void bond_init(NetDev *netdev) { + Bond *b = BOND(netdev); + + b->mode = _NETDEV_BOND_MODE_INVALID; + b->xmit_hash_policy = _NETDEV_BOND_XMIT_HASH_POLICY_INVALID; + b->lacp_rate = _NETDEV_BOND_LACP_RATE_INVALID; + b->ad_select = _NETDEV_BOND_AD_SELECT_INVALID; + b->fail_over_mac = _NETDEV_BOND_FAIL_OVER_MAC_INVALID; + b->arp_validate = _NETDEV_BOND_ARP_VALIDATE_INVALID; + b->arp_all_targets = _NETDEV_BOND_ARP_ALL_TARGETS_INVALID; + b->primary_reselect = _NETDEV_BOND_PRIMARY_RESELECT_INVALID; + + b->all_slaves_active = false; + b->tlb_dynamic_lb = -1; + + b->resend_igmp = RESEND_IGMP_DEFAULT; + b->packets_per_slave = PACKETS_PER_SLAVE_DEFAULT; + b->num_grat_arp = GRATUITOUS_ARP_DEFAULT; + b->lp_interval = LEARNING_PACKETS_INTERVAL_MIN_SEC; +} + +const NetDevVTable bond_vtable = { + .object_size = sizeof(Bond), + .init = bond_init, + .done = bond_done, + .sections = NETDEV_COMMON_SECTIONS "Bond\0", + .fill_message_create = netdev_bond_fill_message_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/bond.h b/src/network/netdev/bond.h new file mode 100644 index 0000000..e4b0a0d --- /dev/null +++ b/src/network/netdev/bond.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "bond-util.h" +#include "macro.h" +#include "netdev.h" +#include "ordered-set.h" + +typedef struct Bond { + NetDev meta; + + BondMode mode; + BondXmitHashPolicy xmit_hash_policy; + 
BondLacpRate lacp_rate; + BondAdSelect ad_select; + BondFailOverMac fail_over_mac; + BondArpValidate arp_validate; + BondArpAllTargets arp_all_targets; + BondPrimaryReselect primary_reselect; + + int tlb_dynamic_lb; + + bool all_slaves_active; + + unsigned resend_igmp; + unsigned packets_per_slave; + unsigned num_grat_arp; + unsigned min_links; + + uint16_t ad_actor_sys_prio; + uint16_t ad_user_port_key; + struct ether_addr ad_actor_system; + + usec_t miimon; + usec_t updelay; + usec_t downdelay; + usec_t arp_interval; + usec_t lp_interval; + + OrderedSet *arp_ip_targets; +} Bond; + +DEFINE_NETDEV_CAST(BOND, Bond); +extern const NetDevVTable bond_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_bond_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_xmit_hash_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_lacp_rate); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_ad_select); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_fail_over_mac); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_arp_validate); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_arp_all_targets); +CONFIG_PARSER_PROTOTYPE(config_parse_bond_primary_reselect); +CONFIG_PARSER_PROTOTYPE(config_parse_arp_ip_target_address); +CONFIG_PARSER_PROTOTYPE(config_parse_ad_actor_sys_prio); +CONFIG_PARSER_PROTOTYPE(config_parse_ad_user_port_key); +CONFIG_PARSER_PROTOTYPE(config_parse_ad_actor_system); diff --git a/src/network/netdev/bridge.c b/src/network/netdev/bridge.c new file mode 100644 index 0000000..3e394ed --- /dev/null +++ b/src/network/netdev/bridge.c @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "bridge.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "string-table.h" +#include "vlan-util.h" + +assert_cc((int) MULTICAST_ROUTER_NONE == (int) MDB_RTR_TYPE_DISABLED); +assert_cc((int) MULTICAST_ROUTER_TEMPORARY_QUERY == (int) MDB_RTR_TYPE_TEMP_QUERY); +assert_cc((int) MULTICAST_ROUTER_PERMANENT == (int) MDB_RTR_TYPE_PERM); 
+assert_cc((int) MULTICAST_ROUTER_TEMPORARY == (int) MDB_RTR_TYPE_TEMP); + +static const char* const multicast_router_table[_MULTICAST_ROUTER_MAX] = { + [MULTICAST_ROUTER_NONE] = "no", + [MULTICAST_ROUTER_TEMPORARY_QUERY] = "query", + [MULTICAST_ROUTER_PERMANENT] = "permanent", + [MULTICAST_ROUTER_TEMPORARY] = "temporary", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(multicast_router, MulticastRouter, _MULTICAST_ROUTER_INVALID); +DEFINE_CONFIG_PARSE_ENUM(config_parse_multicast_router, multicast_router, MulticastRouter, + "Failed to parse bridge multicast router setting"); + +/* callback for bridge netdev's parameter set */ +static int netdev_bridge_set_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(m); + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_netdev_warning_errno(netdev, r, "Bridge parameters could not be set: %m"); + return 1; + } + + log_netdev_debug(netdev, "Bridge parameters set success"); + + return 1; +} + +static int netdev_bridge_post_create_message(NetDev *netdev, sd_netlink_message *req) { + Bridge *b = BRIDGE(netdev); + int r; + + r = sd_netlink_message_open_container(req, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_open_container_union(req, IFLA_INFO_DATA, netdev_kind_to_string(netdev->kind)); + if (r < 0) + return r; + + /* convert to jiffes */ + if (b->forward_delay != USEC_INFINITY) { + r = sd_netlink_message_append_u32(req, IFLA_BR_FORWARD_DELAY, usec_to_jiffies(b->forward_delay)); + if (r < 0) + return r; + } + + if (b->hello_time > 0) { + r = sd_netlink_message_append_u32(req, IFLA_BR_HELLO_TIME, usec_to_jiffies(b->hello_time)); + if (r < 0) + return r; + } + + if (b->max_age > 0) { + r = sd_netlink_message_append_u32(req, IFLA_BR_MAX_AGE, usec_to_jiffies(b->max_age)); + if (r < 0) + return r; + } + + if (b->ageing_time != USEC_INFINITY) { + r = sd_netlink_message_append_u32(req, IFLA_BR_AGEING_TIME, usec_to_jiffies(b->ageing_time)); + if (r < 
0) + return r; + } + + if (b->priority > 0) { + r = sd_netlink_message_append_u16(req, IFLA_BR_PRIORITY, b->priority); + if (r < 0) + return r; + } + + if (b->group_fwd_mask > 0) { + r = sd_netlink_message_append_u16(req, IFLA_BR_GROUP_FWD_MASK, b->group_fwd_mask); + if (r < 0) + return r; + } + + if (b->default_pvid != VLANID_INVALID) { + r = sd_netlink_message_append_u16(req, IFLA_BR_VLAN_DEFAULT_PVID, b->default_pvid); + if (r < 0) + return r; + } + + if (b->mcast_querier >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BR_MCAST_QUERIER, b->mcast_querier); + if (r < 0) + return r; + } + + if (b->mcast_snooping >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BR_MCAST_SNOOPING, b->mcast_snooping); + if (r < 0) + return r; + } + + if (b->vlan_filtering >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BR_VLAN_FILTERING, b->vlan_filtering); + if (r < 0) + return r; + } + + if (b->vlan_protocol >= 0) { + r = sd_netlink_message_append_u16(req, IFLA_BR_VLAN_PROTOCOL, htobe16(b->vlan_protocol)); + if (r < 0) + return r; + } + + if (b->stp >= 0) { + r = sd_netlink_message_append_u32(req, IFLA_BR_STP_STATE, b->stp); + if (r < 0) + return r; + } + + if (b->igmp_version > 0) { + r = sd_netlink_message_append_u8(req, IFLA_BR_MCAST_IGMP_VERSION, b->igmp_version); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +static int netdev_bridge_post_create(NetDev *netdev, Link *link) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(netdev); + + r = sd_rtnl_message_new_link(netdev->manager->rtnl, &req, RTM_NEWLINK, netdev->ifindex); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not allocate netlink message: %m"); + + r = sd_netlink_message_set_flags(req, NLM_F_REQUEST | NLM_F_ACK); + if (r < 0) + return log_link_error_errno(link, r, "Could not set netlink message 
flags: %m"); + + r = netdev_bridge_post_create_message(netdev, req); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not create netlink message: %m"); + + r = netlink_call_async(netdev->manager->rtnl, NULL, req, netdev_bridge_set_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not send netlink message: %m"); + + netdev_ref(netdev); + + return r; +} + +int config_parse_bridge_igmp_version( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Bridge *b = ASSERT_PTR(userdata); + + if (isempty(rvalue)) { + b->igmp_version = 0; /* 0 means unset. */ + return 0; + } + + return config_parse_uint8_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 2, 3, true, + &b->igmp_version); +} + +int config_parse_bridge_port_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + uint16_t *prio = ASSERT_PTR(data); + + return config_parse_uint16_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, LINK_BRIDGE_PORT_PRIORITY_MAX, true, + prio); +} + +static void bridge_init(NetDev *netdev) { + Bridge *b = BRIDGE(netdev); + + b->mcast_querier = -1; + b->mcast_snooping = -1; + b->vlan_filtering = -1; + b->vlan_protocol = -1; + b->stp = -1; + b->default_pvid = VLANID_INVALID; + b->forward_delay = USEC_INFINITY; + b->ageing_time = USEC_INFINITY; +} + +const NetDevVTable bridge_vtable = { + .object_size = sizeof(Bridge), + .init = bridge_init, + .sections = NETDEV_COMMON_SECTIONS "Bridge\0", + .post_create = netdev_bridge_post_create, + 
/* Per-netdev configuration of a bridge. A setting holding its "unset" value is not included in
 * the netlink request built after the bridge is created. */
typedef struct Bridge {
        NetDev meta;

        /* Tristates: initialized to -1 (unset) by bridge_init(); only sent when >= 0. */
        int mcast_querier;
        int mcast_snooping;
        int vlan_filtering;
        int vlan_protocol;      /* sent as a big-endian u16 (htobe16) */
        int stp;
        /* Only sent when > 0. */
        uint16_t priority;
        uint16_t group_fwd_mask;
        /* VLANID_INVALID means unset. */
        uint16_t default_pvid;
        /* 0 means unset; the config parser accepts 2 or 3. */
        uint8_t igmp_version;

        /* Stored in microseconds and converted with usec_to_jiffies() before sending.
         * forward_delay and ageing_time use USEC_INFINITY as "unset"; hello_time and max_age
         * are only sent when > 0. */
        usec_t forward_delay;
        usec_t hello_time;
        usec_t max_age;
        usec_t ageing_time;
} Bridge;
file mode 100644 index 0000000..eafdf4b --- /dev/null +++ b/src/network/netdev/dummy.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "netdev.h" + +typedef struct Dummy { + NetDev meta; +} Dummy; + +DEFINE_NETDEV_CAST(DUMMY, Dummy); +extern const NetDevVTable dummy_vtable; diff --git a/src/network/netdev/fou-tunnel.c b/src/network/netdev/fou-tunnel.c new file mode 100644 index 0000000..3bf41a8 --- /dev/null +++ b/src/network/netdev/fou-tunnel.c @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "conf-parser.h" +#include "fou-tunnel.h" +#include "ip-protocol-list.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "string-table.h" +#include "string-util.h" + +static const char* const fou_encap_type_table[_NETDEV_FOO_OVER_UDP_ENCAP_MAX] = { + [NETDEV_FOO_OVER_UDP_ENCAP_DIRECT] = "FooOverUDP", + [NETDEV_FOO_OVER_UDP_ENCAP_GUE] = "GenericUDPEncapsulation", +}; + +DEFINE_STRING_TABLE_LOOKUP(fou_encap_type, FooOverUDPEncapType); +DEFINE_CONFIG_PARSE_ENUM(config_parse_fou_encap_type, fou_encap_type, FooOverUDPEncapType, + "Failed to parse Encapsulation="); + +static int netdev_fill_fou_tunnel_message(NetDev *netdev, sd_netlink_message *m) { + FouTunnel *t = FOU(netdev); + uint8_t encap_type; + int r; + + r = sd_netlink_message_append_u16(m, FOU_ATTR_PORT, htobe16(t->port)); + if (r < 0) + return r; + + if (IN_SET(t->peer_family, AF_INET, AF_INET6)) { + r = sd_netlink_message_append_u16(m, FOU_ATTR_PEER_PORT, htobe16(t->peer_port)); + if (r < 0) + return r; + } + + switch (t->fou_encap_type) { + case NETDEV_FOO_OVER_UDP_ENCAP_DIRECT: + encap_type = FOU_ENCAP_DIRECT; + break; + case NETDEV_FOO_OVER_UDP_ENCAP_GUE: + encap_type = FOU_ENCAP_GUE; + break; + default: + assert_not_reached(); + } + + r = sd_netlink_message_append_u8(m, FOU_ATTR_TYPE, encap_type); + if (r < 0) + return r; + + r = 
sd_netlink_message_append_u8(m, FOU_ATTR_AF, AF_INET); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, FOU_ATTR_IPPROTO, t->fou_protocol); + if (r < 0) + return r; + + if (t->local_family == AF_INET) { + r = sd_netlink_message_append_in_addr(m, FOU_ATTR_LOCAL_V4, &t->local.in); + if (r < 0) + return r; + } else if (t->local_family == AF_INET6) { + r = sd_netlink_message_append_in6_addr(m, FOU_ATTR_LOCAL_V6, &t->local.in6); + if (r < 0) + return r; + } + + if (t->peer_family == AF_INET) { + r = sd_netlink_message_append_in_addr(m, FOU_ATTR_PEER_V4, &t->peer.in); + if (r < 0) + return r; + } else if (t->peer_family == AF_INET6){ + r = sd_netlink_message_append_in6_addr(m, FOU_ATTR_PEER_V6, &t->peer.in6); + if (r < 0) + return r; + } + + return 0; +} + +static int netdev_create_fou_tunnel_message(NetDev *netdev, sd_netlink_message **ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + + r = sd_genl_message_new(netdev->manager->genl, FOU_GENL_NAME, FOU_CMD_ADD, &m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not allocate netlink message: %m"); + + r = netdev_fill_fou_tunnel_message(netdev, m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not create netlink message: %m"); + + *ret = TAKE_PTR(m); + return 0; +} + +static int fou_tunnel_create_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + log_netdev_info(netdev, "netdev exists, using existing without changing its parameters"); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, "netdev could not be created: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "FooOverUDP tunnel is created"); + return 1; +} + +static int netdev_fou_tunnel_create(NetDev *netdev) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message 
*m = NULL; + int r; + + assert(FOU(netdev)); + + r = netdev_create_fou_tunnel_message(netdev, &m); + if (r < 0) + return r; + + r = netlink_call_async(netdev->manager->genl, NULL, m, fou_tunnel_create_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create FooOverUDP tunnel: %m"); + + netdev_ref(netdev); + return 0; +} + +int config_parse_ip_protocol( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + uint8_t *proto = ASSERT_PTR(data); + int r; + + r = parse_ip_protocol_full(rvalue, /* relaxed= */ true); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=%s', ignoring: %m", + lvalue, rvalue); + return 0; + } + + if (r > UINT8_MAX) { + /* linux/fou.h defines the netlink field as one byte, so we need to reject + * protocols numbers that don't fit in one byte. 
*/ + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid '%s=%s', allowed range is 0..255, ignoring.", + lvalue, rvalue); + return 0; + } + + *proto = r; + return 0; +} + +int config_parse_fou_tunnel_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + union in_addr_union *addr = ASSERT_PTR(data); + FouTunnel *t = userdata; + int r, *f; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (streq(lvalue, "Local")) + f = &t->local_family; + else + f = &t->peer_family; + + r = in_addr_from_string_auto(rvalue, f, addr); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "FooOverUDP tunnel '%s' address is invalid, ignoring assignment: %s", + lvalue, rvalue); + + return 0; +} + +static int netdev_fou_tunnel_verify(NetDev *netdev, const char *filename) { + assert(filename); + + FouTunnel *t = FOU(netdev); + + switch (t->fou_encap_type) { + case NETDEV_FOO_OVER_UDP_ENCAP_DIRECT: + if (t->fou_protocol <= 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "FooOverUDP protocol not configured in %s. Rejecting configuration.", + filename); + break; + case NETDEV_FOO_OVER_UDP_ENCAP_GUE: + if (t->fou_protocol > 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "FooOverUDP GUE can't be set with protocol configured in %s. Rejecting configuration.", + filename); + break; + default: + assert_not_reached(); + } + + if (t->peer_family == AF_UNSPEC && t->peer_port > 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "FooOverUDP peer port is set but peer address not configured in %s. Rejecting configuration.", + filename); + else if (t->peer_family != AF_UNSPEC && t->peer_port == 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "FooOverUDP peer port not set but peer address is configured in %s. 
Rejecting configuration.", + filename); + return 0; +} + +static void fou_tunnel_init(NetDev *netdev) { + FouTunnel *t = FOU(netdev); + + t->fou_encap_type = NETDEV_FOO_OVER_UDP_ENCAP_DIRECT; +} + +const NetDevVTable foutnl_vtable = { + .object_size = sizeof(FouTunnel), + .init = fou_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "FooOverUDP\0", + .create = netdev_fou_tunnel_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .config_verify = netdev_fou_tunnel_verify, +}; diff --git a/src/network/netdev/fou-tunnel.h b/src/network/netdev/fou-tunnel.h new file mode 100644 index 0000000..576d82e --- /dev/null +++ b/src/network/netdev/fou-tunnel.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "in-addr-util.h" +#include "netdev.h" + +typedef enum FooOverUDPEncapType { + NETDEV_FOO_OVER_UDP_ENCAP_UNSPEC = FOU_ENCAP_UNSPEC, + NETDEV_FOO_OVER_UDP_ENCAP_DIRECT = FOU_ENCAP_DIRECT, + NETDEV_FOO_OVER_UDP_ENCAP_GUE = FOU_ENCAP_GUE, + _NETDEV_FOO_OVER_UDP_ENCAP_MAX, + _NETDEV_FOO_OVER_UDP_ENCAP_INVALID = -EINVAL, +} FooOverUDPEncapType; + +typedef struct FouTunnel { + NetDev meta; + + uint8_t fou_protocol; + + uint16_t port; + uint16_t peer_port; + + int local_family; + int peer_family; + + FooOverUDPEncapType fou_encap_type; + union in_addr_union local; + union in_addr_union peer; +} FouTunnel; + +DEFINE_NETDEV_CAST(FOU, FouTunnel); +extern const NetDevVTable foutnl_vtable; + +const char *fou_encap_type_to_string(FooOverUDPEncapType d) _const_; +FooOverUDPEncapType fou_encap_type_from_string(const char *d) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_fou_encap_type); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_protocol); +CONFIG_PARSER_PROTOTYPE(config_parse_fou_tunnel_address); diff --git a/src/network/netdev/geneve.c b/src/network/netdev/geneve.c new file mode 100644 index 0000000..bc655ec --- /dev/null +++ b/src/network/netdev/geneve.c @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ 
+ +#include +#include +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "extract-word.h" +#include "geneve.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" + /* Flow labels must fit in this mask; config_parse_geneve_flow_label() rejects values with any other bit set. */ +#define GENEVE_FLOW_LABEL_MAX_MASK 0xFFFFFU +#define DEFAULT_GENEVE_DESTINATION_PORT 6081 /* IFLA_GENEVE_PORT is only appended when dest_port differs from this */ + /* Configuration-file spelling of each GeneveDF value. */ +static const char* const geneve_df_table[_NETDEV_GENEVE_DF_MAX] = { + [NETDEV_GENEVE_DF_NO] = "no", + [NETDEV_GENEVE_DF_YES] = "yes", + [NETDEV_GENEVE_DF_INHERIT] = "inherit", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(geneve_df, GeneveDF, NETDEV_GENEVE_DF_YES); +DEFINE_CONFIG_PARSE_ENUM(config_parse_geneve_df, geneve_df, GeneveDF, "Failed to parse Geneve IPDoNotFragment= setting"); + /* Appends the configured IFLA_GENEVE_* attributes to m. Returns 0 on success, or the negative error of the first append that fails. */ +static int netdev_geneve_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(m); + + Geneve *v = GENEVE(netdev); + int r; + /* geneve_init() presets id to GENEVE_VID_MAX + 1, so an unconfigured VNI is not sent. */ + if (v->id <= GENEVE_VID_MAX) { + r = sd_netlink_message_append_u32(m, IFLA_GENEVE_ID, v->id); + if (r < 0) + return r; + } + + if (in_addr_is_set(v->remote_family, &v->remote)) { + if (v->remote_family == AF_INET) + r = sd_netlink_message_append_in_addr(m, IFLA_GENEVE_REMOTE, &v->remote.in); + else + r = sd_netlink_message_append_in6_addr(m, IFLA_GENEVE_REMOTE6, &v->remote.in6); + if (r < 0) + return r; + } + /* Send either the inherit flag or the explicit TTL, never both. */ + if (v->inherit) { + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_TTL_INHERIT, 1); + if (r < 0) + return r; + } else { + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_TTL, v->ttl); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_TOS, v->tos); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_UDP_CSUM, v->udpcsum); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, v->udp6zerocsumtx); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, v->udp6zerocsumrx); + if (r < 0) + return r; + + if
(v->dest_port != DEFAULT_GENEVE_DESTINATION_PORT) { + r = sd_netlink_message_append_u16(m, IFLA_GENEVE_PORT, htobe16(v->dest_port)); + if (r < 0) + return r; + } + /* A flow label of 0 is not sent at all. */ + if (v->flow_label > 0) { + r = sd_netlink_message_append_u32(m, IFLA_GENEVE_LABEL, htobe32(v->flow_label)); + if (r < 0) + return r; + } + + if (v->inherit_inner_protocol) { + r = sd_netlink_message_append_flag(m, IFLA_GENEVE_INNER_PROTO_INHERIT); + if (r < 0) + return r; + } + + if (v->geneve_df != _NETDEV_GENEVE_DF_INVALID) { + r = sd_netlink_message_append_u8(m, IFLA_GENEVE_DF, v->geneve_df); + if (r < 0) + return r; + } + + return 0; +} + /* Config parser for the VNI: stores rvalue in v->id through config_parse_uint32_bounded() with the range 0..GENEVE_VID_MAX. */ +int config_parse_geneve_vni( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Geneve *v = ASSERT_PTR(userdata); + + return config_parse_uint32_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, GENEVE_VID_MAX, true, + &v->id); +} + /* Config parser for an address: parses rvalue with in_addr_from_string_auto(), stores the result in *data and its family in v->remote_family. Unparsable or multicast values are logged and ignored (returns 0). */ +int config_parse_geneve_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Geneve *v = ASSERT_PTR(userdata); + union in_addr_union *addr = data, buffer; + int r, f; + + r = in_addr_from_string_auto(rvalue, &f, &buffer); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "geneve '%s' address is invalid, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + r = in_addr_is_multicast(f, &buffer); + if (r > 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "geneve invalid multicast '%s' address, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + v->remote_family = f; + *addr = buffer; + + return 0; +} + /* Config parser for the flow label: only values that fit in GENEVE_FLOW_LABEL_MAX_MASK are stored; anything else is logged and ignored. */ +int
config_parse_geneve_flow_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Geneve *v = ASSERT_PTR(userdata); + uint32_t f; + int r; + + r = safe_atou32(rvalue, &f); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse Geneve flow label '%s'.", rvalue); + return 0; + } + /* Reject any value with bits set outside the flow label mask. */ + if (f & ~GENEVE_FLOW_LABEL_MAX_MASK) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Geneve flow label '%s' not valid. Flow label range should be [0-1048575].", rvalue); + return 0; + } + + v->flow_label = f; + + return 0; +} + /* Config parser for the TTL: the value inherit sets v->inherit and zeroes v->ttl; anything else is parsed into v->ttl with config_parse_uint8_bounded() and, when that returns a positive value, clears v->inherit. */ +int config_parse_geneve_ttl( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + Geneve *v = ASSERT_PTR(userdata); + int r; + + if (streq(rvalue, "inherit")) { + v->inherit = true; + v->ttl = 0; /* unset the unused ttl field for clarity */ + return 0; + } + + r = config_parse_uint8_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, UINT8_MAX, true, + &v->ttl); + if (r <= 0) + return r; + v->inherit = false; + return 0; +} + /* Fails with EINVAL when v->id is above GENEVE_VID_MAX, which is the preset from geneve_init() when no VNI was configured. */ +static int netdev_geneve_verify(NetDev *netdev, const char *filename) { + assert(filename); + + Geneve *v = GENEVE(netdev); + + if (v->id > GENEVE_VID_MAX) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: Geneve without valid VNI (or Virtual Network Identifier) configured.
Ignoring.", + filename); + return 0; +} + /* Presets the defaults; id starts at GENEVE_VID_MAX + 1, which netdev_geneve_verify() treats as no VNI configured. */ +static void geneve_init(NetDev *netdev) { + Geneve *v = GENEVE(netdev); + + v->id = GENEVE_VID_MAX + 1; + v->geneve_df = _NETDEV_GENEVE_DF_INVALID; + v->dest_port = DEFAULT_GENEVE_DESTINATION_PORT; + v->udpcsum = false; + v->udp6zerocsumtx = false; + v->udp6zerocsumrx = false; +} + /* Callback table for the GENEVE netdev kind. */ +const NetDevVTable geneve_vtable = { + .object_size = sizeof(Geneve), + .init = geneve_init, + .sections = NETDEV_COMMON_SECTIONS "GENEVE\0", + .fill_message_create = netdev_geneve_fill_message_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .config_verify = netdev_geneve_verify, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/geneve.h b/src/network/netdev/geneve.h new file mode 100644 index 0000000..3cbf694 --- /dev/null +++ b/src/network/netdev/geneve.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Geneve Geneve; + +#include "in-addr-util.h" +#include "netdev.h" +#include "networkd-network.h" + /* Largest valid VNI (24 bits). Fully parenthesized so the macro expands correctly inside any surrounding expression. */ +#define GENEVE_VID_MAX ((1u << 24) - 1) + /* Values are taken from the GENEVE_DF_* constants and are appended to IFLA_GENEVE_DF unchanged. */ +typedef enum GeneveDF { + NETDEV_GENEVE_DF_NO = GENEVE_DF_UNSET, + NETDEV_GENEVE_DF_YES = GENEVE_DF_SET, + NETDEV_GENEVE_DF_INHERIT = GENEVE_DF_INHERIT, + _NETDEV_GENEVE_DF_MAX, + _NETDEV_GENEVE_DF_INVALID = -EINVAL, +} GeneveDF; + +struct Geneve { + NetDev meta; + + uint32_t id; /* VNI; GENEVE_VID_MAX + 1 means unset */ + uint32_t flow_label; + + int remote_family; + + uint8_t tos; + uint8_t ttl; /* only sent when inherit is false */ + + uint16_t dest_port; + + bool udpcsum; + bool udp6zerocsumtx; + bool udp6zerocsumrx; + bool inherit; + + GeneveDF geneve_df; + union in_addr_union remote; + + bool inherit_inner_protocol; +}; + +DEFINE_NETDEV_CAST(GENEVE, Geneve); +extern const NetDevVTable geneve_vtable; + +const char *geneve_df_to_string(GeneveDF d) _const_; +GeneveDF geneve_df_from_string(const char *d) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_geneve_vni); +CONFIG_PARSER_PROTOTYPE(config_parse_geneve_address); +CONFIG_PARSER_PROTOTYPE(config_parse_geneve_flow_label);
+CONFIG_PARSER_PROTOTYPE(config_parse_geneve_df); +CONFIG_PARSER_PROTOTYPE(config_parse_geneve_ttl); diff --git a/src/network/netdev/ifb.c b/src/network/netdev/ifb.c new file mode 100644 index 0000000..d7ff44c --- /dev/null +++ b/src/network/netdev/ifb.c @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include + +#include "ifb.h" + /* The ifb kind has no settings of its own: no init, verify or fill callback is set here. */ +const NetDevVTable ifb_vtable = { + .object_size = sizeof(IntermediateFunctionalBlock), + .sections = NETDEV_COMMON_SECTIONS, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/ifb.h b/src/network/netdev/ifb.h new file mode 100644 index 0000000..badfb4a --- /dev/null +++ b/src/network/netdev/ifb.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#pragma once + +#include "netdev.h" + +typedef struct IntermediateFunctionalBlock { + NetDev meta; +} IntermediateFunctionalBlock; + +DEFINE_NETDEV_CAST(IFB, IntermediateFunctionalBlock); +extern const NetDevVTable ifb_vtable; diff --git a/src/network/netdev/ipoib.c b/src/network/netdev/ipoib.c new file mode 100644 index 0000000..d5fe299 --- /dev/null +++ b/src/network/netdev/ipoib.c @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "ipoib.h" +#include "networkd-network.h" +#include "parse-util.h" +#include "string-table.h" + /* The local mode values are appended to IFLA_IPOIB_MODE as-is, so they must equal the IPOIB_MODE_* constants. */ +assert_cc((int) IP_OVER_INFINIBAND_MODE_DATAGRAM == (int) IPOIB_MODE_DATAGRAM); +assert_cc((int) IP_OVER_INFINIBAND_MODE_CONNECTED == (int) IPOIB_MODE_CONNECTED); + /* Presets mode and umcast to negative values, which netdev_ipoib_fill_message_create() skips. */ +static void netdev_ipoib_init(NetDev *netdev) { + IPoIB *ipoib = IPOIB(netdev); + + ipoib->mode = _IP_OVER_INFINIBAND_MODE_INVALID; + ipoib->umcast = -1; +} + /* Appends the pkey, mode and umcast attributes that are set to m. Returns 0 on success, or the negative error of the first append that fails. */ +static int netdev_ipoib_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(link); + assert(m); + + IPoIB *ipoib = IPOIB(netdev); + int r; + /* A pkey of 0 means unset. */ + if (ipoib->pkey > 0) { + r =
sd_netlink_message_append_u16(m, IFLA_IPOIB_PKEY, ipoib->pkey); + if (r < 0) + return r; + } + + if (ipoib->mode >= 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_MODE, ipoib->mode); + if (r < 0) + return r; + } + + if (ipoib->umcast >= 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_UMCAST, ipoib->umcast); + if (r < 0) + return r; + } + + return 0; +} + /* Fills m with an IFLA_LINKINFO / IFLA_INFO_DATA container holding the IPoIB mode and umcast values from link->network; negative values are skipped. Returns 0 or a negative error. */ +int ipoib_set_netlink_message(Link *link, sd_netlink_message *m) { + int r; + + assert(link); + assert(link->network); + assert(m); + + r = sd_netlink_message_set_flags(m, NLM_F_REQUEST | NLM_F_ACK); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, link->kind); + if (r < 0) + return r; + + if (link->network->ipoib_mode >= 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_MODE, link->network->ipoib_mode); + if (r < 0) + return r; + } + + if (link->network->ipoib_umcast >= 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_UMCAST, link->network->ipoib_umcast); + if (r < 0) + return r; + } + /* Close IFLA_INFO_DATA, then IFLA_LINKINFO. */ + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + /* Configuration-file spelling of each IPoIBMode value. */ +static const char * const ipoib_mode_table[_IP_OVER_INFINIBAND_MODE_MAX] = { + [IP_OVER_INFINIBAND_MODE_DATAGRAM] = "datagram", + [IP_OVER_INFINIBAND_MODE_CONNECTED] = "connected", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(ipoib_mode, IPoIBMode); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ipoib_mode, ipoib_mode, IPoIBMode, "Failed to parse IPoIB mode"); + /* Config parser for the partition key: an empty value resets *data to 0 (unset); 0 and 0x8000 are rejected; invalid input is logged and ignored (returns 0). */ +int config_parse_ipoib_pkey( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint16_t u, *pkey = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue))
{ + *pkey = 0; /* 0 means unset. */ + return 0; + } + + r = safe_atou16(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse IPoIB pkey '%s', ignoring assignment: %m", + rvalue); + return 0; + } + if (IN_SET(u, 0, 0x8000)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "IPoIB pkey cannot be 0 nor 0x8000, ignoring assignment: %s", + rvalue); + return 0; + } + + *pkey = u; + return 0; +} + + /* Callback table for the IPoIB netdev kind. */ +const NetDevVTable ipoib_vtable = { + .object_size = sizeof(IPoIB), + .sections = NETDEV_COMMON_SECTIONS "IPoIB\0", + .init = netdev_ipoib_init, + .fill_message_create = netdev_ipoib_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .iftype = ARPHRD_INFINIBAND, + .generate_mac = true, +}; diff --git a/src/network/netdev/ipoib.h b/src/network/netdev/ipoib.h new file mode 100644 index 0000000..415d3b1 --- /dev/null +++ b/src/network/netdev/ipoib.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "netdev.h" + +typedef enum IPoIBMode { + IP_OVER_INFINIBAND_MODE_DATAGRAM, + IP_OVER_INFINIBAND_MODE_CONNECTED, + _IP_OVER_INFINIBAND_MODE_MAX, + _IP_OVER_INFINIBAND_MODE_INVALID = -EINVAL, +} IPoIBMode; + +typedef struct IPoIB { + NetDev meta; + + uint16_t pkey; /* 0 means unset */ + IPoIBMode mode; /* negative means unset */ + int umcast; /* negative means unset */ +} IPoIB; + +DEFINE_NETDEV_CAST(IPOIB, IPoIB); +extern const NetDevVTable ipoib_vtable; + +int ipoib_set_netlink_message(Link *link, sd_netlink_message *m); + +CONFIG_PARSER_PROTOTYPE(config_parse_ipoib_pkey); +CONFIG_PARSER_PROTOTYPE(config_parse_ipoib_mode); diff --git a/src/network/netdev/ipvlan.c b/src/network/netdev/ipvlan.c new file mode 100644 index 0000000..05d5d01 --- /dev/null +++ b/src/network/netdev/ipvlan.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "conf-parser.h" +#include "ipvlan.h" +#include "ipvlan-util.h" +#include "networkd-link.h" +#include "string-util.h" +
+DEFINE_CONFIG_PARSE_ENUM(config_parse_ipvlan_mode, ipvlan_mode, IPVlanMode, "Failed to parse ipvlan mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ipvlan_flags, ipvlan_flags, IPVlanFlags, "Failed to parse ipvlan flags"); + /* Used by both the ipvlan and ipvtap vtables: picks the cast matching netdev->kind, then appends mode and flags unless they still hold the INVALID preset from ipvlan_init(). */ +static int netdev_ipvlan_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *req) { + assert(netdev); + assert(link); + assert(netdev->ifname); + + IPVlan *m = netdev->kind == NETDEV_KIND_IPVLAN ? IPVLAN(netdev) : IPVTAP(netdev); + int r; + + if (m->mode != _NETDEV_IPVLAN_MODE_INVALID) { + r = sd_netlink_message_append_u16(req, IFLA_IPVLAN_MODE, m->mode); + if (r < 0) + return r; + } + + if (m->flags != _NETDEV_IPVLAN_FLAGS_INVALID) { + r = sd_netlink_message_append_u16(req, IFLA_IPVLAN_FLAGS, m->flags); + if (r < 0) + return r; + } + + return 0; +} + /* Presets mode and flags to INVALID so that unset values are not appended on creation. */ +static void ipvlan_init(NetDev *netdev) { + IPVlan *m = ASSERT_PTR(netdev)->kind == NETDEV_KIND_IPVLAN ? IPVLAN(netdev) : IPVTAP(netdev); + + m->mode = _NETDEV_IPVLAN_MODE_INVALID; + m->flags = _NETDEV_IPVLAN_FLAGS_INVALID; +} + /* Callback table for the ipvlan netdev kind. */ +const NetDevVTable ipvlan_vtable = { + .object_size = sizeof(IPVlan), + .init = ipvlan_init, + .sections = NETDEV_COMMON_SECTIONS "IPVLAN\0", + .fill_message_create = netdev_ipvlan_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; + /* Same callbacks as ipvlan_vtable; only the section name differs. */ +const NetDevVTable ipvtap_vtable = { + .object_size = sizeof(IPVlan), + .init = ipvlan_init, + .sections = NETDEV_COMMON_SECTIONS "IPVTAP\0", + .fill_message_create = netdev_ipvlan_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; + /* Returns the mode of the netdev attached to link, or _NETDEV_IPVLAN_MODE_INVALID when there is none or its kind is not NETDEV_KIND_IPVLAN. */ +IPVlanMode link_get_ipvlan_mode(Link *link) { + assert(link); + + if (!link->netdev || link->netdev->kind != NETDEV_KIND_IPVLAN) + return _NETDEV_IPVLAN_MODE_INVALID; + + return IPVLAN(link->netdev)->mode; +} diff --git a/src/network/netdev/ipvlan.h b/src/network/netdev/ipvlan.h new file mode 100644 index 0000000..633b0bd --- /dev/null +++ b/src/network/netdev/ipvlan.h
@@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "ipvlan-util.h" +#include "netdev.h" + /* One struct serves both kinds: the IPVLAN and IPVTAP casts below are both declared for IPVlan. */ +typedef struct IPVlan { + NetDev meta; + + IPVlanMode mode; + IPVlanFlags flags; +} IPVlan; + +DEFINE_NETDEV_CAST(IPVLAN, IPVlan); +DEFINE_NETDEV_CAST(IPVTAP, IPVlan); +extern const NetDevVTable ipvlan_vtable; +extern const NetDevVTable ipvtap_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_ipvlan_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_ipvlan_flags); + +IPVlanMode link_get_ipvlan_mode(Link *link); diff --git a/src/network/netdev/l2tp-tunnel.c b/src/network/netdev/l2tp-tunnel.c new file mode 100644 index 0000000..8b9406b --- /dev/null +++ b/src/network/netdev/l2tp-tunnel.c @@ -0,0 +1,825 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "conf-parser.h" +#include "hashmap.h" +#include "l2tp-tunnel.h" +#include "netlink-util.h" +#include "networkd-address.h" +#include "networkd-manager.h" +#include "networkd-route-util.h" +#include "parse-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" + /* Configuration-file spelling of each L2tpL2specType value. */ +static const char* const l2tp_l2spec_type_table[_NETDEV_L2TP_L2SPECTYPE_MAX] = { + [NETDEV_L2TP_L2SPECTYPE_NONE] = "none", + [NETDEV_L2TP_L2SPECTYPE_DEFAULT] = "default", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(l2tp_l2spec_type, L2tpL2specType); + /* Configuration-file spelling of each L2tpEncapType value. */ +static const char* const l2tp_encap_type_table[_NETDEV_L2TP_ENCAPTYPE_MAX] = { + [NETDEV_L2TP_ENCAPTYPE_UDP] = "udp", + [NETDEV_L2TP_ENCAPTYPE_IP] = "ip", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(l2tp_encap_type, L2tpEncapType); +DEFINE_CONFIG_PARSE_ENUM(config_parse_l2tp_encap_type, l2tp_encap_type, L2tpEncapType, "Failed to parse L2TP Encapsulation Type"); + /* Configuration-file spelling of each L2tpLocalAddressType value. */ +static const char* const l2tp_local_address_type_table[_NETDEV_L2TP_LOCAL_ADDRESS_MAX] = { + [NETDEV_L2TP_LOCAL_ADDRESS_AUTO] = "auto", + [NETDEV_L2TP_LOCAL_ADDRESS_STATIC] = "static", +
[NETDEV_L2TP_LOCAL_ADDRESS_DYNAMIC] = "dynamic", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(l2tp_local_address_type, L2tpLocalAddressType); + /* Releases a session: removes it from the sessions_by_section map of its tunnel, then frees the section, the name and the object itself. Returns NULL for a NULL argument, otherwise the result of mfree(). */ +static L2tpSession* l2tp_session_free(L2tpSession *s) { + if (!s) + return NULL; + + if (s->tunnel && s->section) + ordered_hashmap_remove(s->tunnel->sessions_by_section, s->section); + + config_section_free(s->section); + free(s->name); + return mfree(s); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(L2tpSession, l2tp_session_free); + /* Returns in *ret the session registered for the config section (filename, section_line), creating and registering a new one with the default L2 specific header type when none exists yet. Returns 0 or a negative error. */ +static int l2tp_session_new_static(L2tpTunnel *t, const char *filename, unsigned section_line, L2tpSession **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(l2tp_session_freep) L2tpSession *s = NULL; + int r; + + assert(t); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + /* Reuse the session already registered for this section, if any. */ + s = ordered_hashmap_get(t->sessions_by_section, n); + if (s) { + *ret = TAKE_PTR(s); + return 0; + } + + s = new(L2tpSession, 1); + if (!s) + return -ENOMEM; + + *s = (L2tpSession) { + .l2tp_l2spec_type = NETDEV_L2TP_L2SPECTYPE_DEFAULT, + .tunnel = t, + .section = TAKE_PTR(n), + }; + + r = ordered_hashmap_ensure_put(&t->sessions_by_section, &config_section_hash_ops, s->section, s); + if (r < 0) + return r; + + *ret = TAKE_PTR(s); + return 0; +} + /* Builds the L2TP_CMD_TUNNEL_CREATE generic netlink message for the tunnel, using local_address as the source address. On success the message is handed over in *ret. */ +static int netdev_l2tp_create_message_tunnel(NetDev *netdev, union in_addr_union *local_address, sd_netlink_message **ret) { + assert(local_address); + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + uint16_t encap_type; + L2tpTunnel *t = L2TP(netdev); + int r; + + r = sd_genl_message_new(netdev->manager->genl, L2TP_GENL_NAME, L2TP_CMD_TUNNEL_CREATE, &m); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, L2TP_ATTR_CONN_ID, t->tunnel_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, L2TP_ATTR_PEER_CONN_ID, t->peer_tunnel_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m,
L2TP_ATTR_PROTO_VERSION, 3); + if (r < 0) + return r; + /* Any encapsulation type other than IP is sent as UDP. */ + switch (t->l2tp_encap_type) { + case NETDEV_L2TP_ENCAPTYPE_IP: + encap_type = L2TP_ENCAPTYPE_IP; + break; + case NETDEV_L2TP_ENCAPTYPE_UDP: + default: + encap_type = L2TP_ENCAPTYPE_UDP; + break; + } + + r = sd_netlink_message_append_u16(m, L2TP_ATTR_ENCAP_TYPE, encap_type); + if (r < 0) + return r; + /* Any family other than AF_INET takes the IPv6 attributes. */ + if (t->family == AF_INET) { + r = sd_netlink_message_append_in_addr(m, L2TP_ATTR_IP_SADDR, &local_address->in); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(m, L2TP_ATTR_IP_DADDR, &t->remote.in); + if (r < 0) + return r; + } else { + r = sd_netlink_message_append_in6_addr(m, L2TP_ATTR_IP6_SADDR, &local_address->in6); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(m, L2TP_ATTR_IP6_DADDR, &t->remote.in6); + if (r < 0) + return r; + } + /* The UDP port and checksum attributes are only appended for UDP encapsulation. */ + if (encap_type == L2TP_ENCAPTYPE_UDP) { + r = sd_netlink_message_append_u16(m, L2TP_ATTR_UDP_SPORT, t->l2tp_udp_sport); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, L2TP_ATTR_UDP_DPORT, t->l2tp_udp_dport); + if (r < 0) + return r; + + if (t->udp_csum) { + r = sd_netlink_message_append_u8(m, L2TP_ATTR_UDP_CSUM, t->udp_csum); + if (r < 0) + return r; + } + + if (t->udp6_csum_tx) { + r = sd_netlink_message_append_flag(m, L2TP_ATTR_UDP_ZERO_CSUM6_TX); + if (r < 0) + return r; + } + + if (t->udp6_csum_rx) { + r = sd_netlink_message_append_flag(m, L2TP_ATTR_UDP_ZERO_CSUM6_RX); + if (r < 0) + return r; + } + } + + *ret = TAKE_PTR(m); + + return 0; +} + /* Builds the L2TP_CMD_SESSION_CREATE generic netlink message for session. On success the message is handed over in *ret. */ +static int netdev_l2tp_create_message_session(NetDev *netdev, L2tpSession *session, sd_netlink_message **ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + uint16_t l2_spec_len; + uint8_t l2_spec_type; + int r; + + assert(netdev); + assert(session); + assert(session->tunnel); + + r = sd_genl_message_new(netdev->manager->genl, L2TP_GENL_NAME, L2TP_CMD_SESSION_CREATE, &m); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m,
L2TP_ATTR_CONN_ID, session->tunnel->tunnel_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, L2TP_ATTR_PEER_CONN_ID, session->tunnel->peer_tunnel_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, L2TP_ATTR_SESSION_ID, session->session_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, L2TP_ATTR_PW_TYPE, L2TP_PWTYPE_ETH); + if (r < 0) + return r; + + switch (session->l2tp_l2spec_type) { + case NETDEV_L2TP_L2SPECTYPE_NONE: + l2_spec_type = L2TP_L2SPECTYPE_NONE; + l2_spec_len = 0; + break; + case NETDEV_L2TP_L2SPECTYPE_DEFAULT: + default: + l2_spec_type = L2TP_L2SPECTYPE_DEFAULT; + l2_spec_len = 4; + break; + } + + r = sd_netlink_message_append_u8(m, L2TP_ATTR_L2SPEC_TYPE, l2_spec_type); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, L2TP_ATTR_L2SPEC_LEN, l2_spec_len); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, L2TP_ATTR_IFNAME, session->name); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + + return 0; +} + +static int link_get_l2tp_local_address(Link *link, L2tpTunnel *t, union in_addr_union *ret) { + Address *a; + + assert(link); + assert(t); + + SET_FOREACH(a, link->addresses) { + if (!address_is_ready(a)) + continue; + + if (a->family != t->family) + continue; + + if (in_addr_is_set(a->family, &a->in_addr_peer)) + continue; + + if (t->local_address_type == NETDEV_L2TP_LOCAL_ADDRESS_STATIC && + !FLAGS_SET(a->flags, IFA_F_PERMANENT)) + continue; + + if (t->local_address_type == NETDEV_L2TP_LOCAL_ADDRESS_DYNAMIC && + FLAGS_SET(a->flags, IFA_F_PERMANENT)) + continue; + + if (ret) + *ret = a->in_addr; + } + + return -ENOENT; +} + +static int l2tp_get_local_address(NetDev *netdev, union in_addr_union *ret) { + Link *link = NULL; + L2tpTunnel *t = L2TP(netdev); + Address *a = NULL; + int r; + + assert(netdev->manager); + + if (t->local_ifname) { 
+ r = link_get_by_name(netdev->manager, t->local_ifname, &link); + if (r < 0) + return r; + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return -EBUSY; + } + + if (netdev->manager->manage_foreign_routes) { + /* First, check if the remote address is accessible. */ + if (link) + r = link_address_is_reachable(link, t->family, &t->remote, &t->local, &a); + else + r = manager_address_is_reachable(netdev->manager, t->family, &t->remote, &t->local, &a); + if (r < 0) + return r; + } + + if (in_addr_is_set(t->family, &t->local)) { + /* local address is explicitly specified. */ + + if (!a) { + if (link) + r = link_get_address(link, t->family, &t->local, 0, &a); + else + r = manager_get_address(netdev->manager, t->family, &t->local, 0, &a); + if (r < 0) + return r; + + if (!address_is_ready(a)) + return -EBUSY; + } + + if (ret) + *ret = a->in_addr; + + return 0; + } + /* No explicit address: accept the address found by the reachability check only if it matches the requested static/dynamic type. */ + if (a) { + if (t->local_address_type == NETDEV_L2TP_LOCAL_ADDRESS_STATIC && + !FLAGS_SET(a->flags, IFA_F_PERMANENT)) + return -EINVAL; + + if (t->local_address_type == NETDEV_L2TP_LOCAL_ADDRESS_DYNAMIC && + FLAGS_SET(a->flags, IFA_F_PERMANENT)) + return -EINVAL; + + if (ret) + *ret = a->in_addr; + + return 0; + } + /* Otherwise scan the addresses of the selected link, or of every link that is ready to be configured. */ + if (link) + return link_get_l2tp_local_address(link, t, ret); + + HASHMAP_FOREACH(link, netdev->manager->links_by_index) { + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + continue; + + if (link_get_l2tp_local_address(link, t, ret) >= 0) + return 0; + } + + return -ENOENT; +} + /* Destroy callback passed to netlink_call_async() by l2tp_create_session(); drops one reference on the tunnel netdev. */ +static void l2tp_session_destroy_callback(L2tpSession *session) { + if (!session) + return; + + netdev_unref(NETDEV(session->tunnel)); +} + /* Reply handler for a session create request: -EEXIST is logged and treated as success, other errors are logged as warnings. Always returns 1. */ +static int l2tp_create_session_handler(sd_netlink *rtnl, sd_netlink_message *m, L2tpSession *session) { + NetDev *netdev; + int r; + + assert(session); + assert(session->tunnel); + + netdev = NETDEV(session->tunnel); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + log_netdev_info(netdev, "L2TP session %s exists,
using existing without changing its parameters", + session->name); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, "L2TP session %s could not be created: %m", session->name); + return 1; + } + + log_netdev_debug(netdev, "L2TP session %s created", session->name); + return 1; +} + /* Builds and asynchronously sends the create request for one session, then takes a netdev reference once the call is queued. Returns 0 or a negative error. */ +static int l2tp_create_session(NetDev *netdev, L2tpSession *session) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *n = NULL; + int r; + + r = netdev_l2tp_create_message_session(netdev, session, &n); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, n, l2tp_create_session_handler, + l2tp_session_destroy_callback, session); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create L2TP session %s: %m", session->name); + + netdev_ref(netdev); + return 0; +} + /* Reply handler for the tunnel create request: on an error other than -EEXIST the netdev enters the failed state; otherwise a create request is issued for every configured session, ignoring individual failures. Always returns 1. */ +static int l2tp_create_tunnel_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + L2tpSession *session; + L2tpTunnel *t = L2TP(netdev); + int r; + + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + log_netdev_info(netdev, "netdev exists, using existing without changing its parameters"); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, "netdev could not be created: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "L2TP tunnel is created"); + + ORDERED_HASHMAP_FOREACH(session, t->sessions_by_section) + (void) l2tp_create_session(netdev, session); + + return 1; +} + /* Resolves the local address, then builds and asynchronously sends the tunnel create request. Returns 0 or a negative error. */ +static int l2tp_create_tunnel(NetDev *netdev) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + union in_addr_union local_address; + L2tpTunnel *t = L2TP(netdev); + int r; + + r = l2tp_get_local_address(netdev, &local_address); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not find local address."); + + if (t->local_address_type >= 0 && DEBUG_LOGGING) +
log_netdev_debug(netdev, "Local address %s acquired.", + IN_ADDR_TO_STRING(t->family, &local_address)); + + r = netdev_l2tp_create_message_tunnel(netdev, &local_address, &m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, m, l2tp_create_tunnel_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create L2TP tunnel: %m"); + + netdev_ref(netdev); + + return 0; +} + +static int netdev_l2tp_is_ready_to_create(NetDev *netdev, Link *link) { + return l2tp_get_local_address(netdev, NULL) >= 0; +} + +int config_parse_l2tp_tunnel_local_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *addr_or_type = NULL, *ifname = NULL; + L2tpLocalAddressType type; + L2tpTunnel *t = ASSERT_PTR(userdata); + const char *p = ASSERT_PTR(rvalue); + union in_addr_union a; + int r, f; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + t->local_ifname = mfree(t->local_ifname); + t->local_address_type = NETDEV_L2TP_LOCAL_ADDRESS_AUTO; + t->local = IN_ADDR_NULL; + + if (!in_addr_is_set(t->family, &t->remote)) + /* If Remote= is not specified yet, then also clear family. 
*/ + t->family = AF_UNSPEC; + + return 0; + } + + r = extract_first_word(&p, &addr_or_type, "@", 0); + if (r < 0) + return log_oom(); + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid L2TP Tunnel address specified in %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + if (!isempty(p)) { + if (!ifname_valid_full(p, IFNAME_VALID_ALTERNATIVE)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid interface name specified in %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + ifname = strdup(p); + if (!ifname) + return log_oom(); + } + + type = l2tp_local_address_type_from_string(addr_or_type); + if (type >= 0) { + free_and_replace(t->local_ifname, ifname); + t->local_address_type = type; + t->local = IN_ADDR_NULL; + + if (!in_addr_is_set(t->family, &t->remote)) + /* If Remote= is not specified yet, then also clear family. */ + t->family = AF_UNSPEC; + + return 0; + } + + r = in_addr_from_string_auto(addr_or_type, &f, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid L2TP Tunnel local address \"%s\" specified, ignoring assignment: %s", addr_or_type, rvalue); + return 0; + } + + if (in_addr_is_null(f, &a)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "L2TP Tunnel local address cannot be null, ignoring assignment: %s", rvalue); + return 0; + } + + if (t->family != AF_UNSPEC && t->family != f) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Address family does not match the previous assignment, ignoring assignment: %s", rvalue); + return 0; + } + + t->family = f; + t->local = a; + free_and_replace(t->local_ifname, ifname); + t->local_address_type = _NETDEV_L2TP_LOCAL_ADDRESS_INVALID; + return 0; +} + +int config_parse_l2tp_tunnel_remote_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + 
L2tpTunnel *t = ASSERT_PTR(userdata); + union in_addr_union a; + int r, f; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + t->remote = IN_ADDR_NULL; + + if (!in_addr_is_set(t->family, &t->local)) + /* If Local= is not specified yet, then also clear family. */ + t->family = AF_UNSPEC; + + return 0; + } + + r = in_addr_from_string_auto(rvalue, &f, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid L2TP Tunnel remote address specified, ignoring assignment: %s", rvalue); + return 0; + } + + if (in_addr_is_null(f, &a)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "L2TP Tunnel remote address cannot be null, ignoring assignment: %s", rvalue); + return 0; + } + + if (t->family != AF_UNSPEC && t->family != f) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Address family does not match the previous assignment, ignoring assignment: %s", rvalue); + return 0; + } + + t->family = f; + t->remote = a; + return 0; +} + +int config_parse_l2tp_tunnel_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + uint32_t *id = ASSERT_PTR(data); + + return config_parse_uint32_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 1, UINT32_MAX, true, + id); +} + +int config_parse_l2tp_session_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + L2tpTunnel *t = ASSERT_PTR(userdata); + _cleanup_(l2tp_session_free_or_set_invalidp) L2tpSession *session = NULL; + int r; + + r = l2tp_session_new_static(t, filename, section_line, &session); + 
if (r < 0) + return log_oom(); + + uint32_t *id = streq(lvalue, "SessionId") ? &session->session_id : &session->peer_session_id; + + r = config_parse_uint32_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 1, UINT32_MAX, true, + id); + if (r <= 0) + return r; + TAKE_PTR(session); + return 0; +} + +int config_parse_l2tp_session_l2spec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(l2tp_session_free_or_set_invalidp) L2tpSession *session = NULL; + L2tpTunnel *t = userdata; + L2tpL2specType spec; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = l2tp_session_new_static(t, filename, section_line, &session); + if (r < 0) + return log_oom(); + + spec = l2tp_l2spec_type_from_string(rvalue); + if (spec < 0) { + log_syntax(unit, LOG_WARNING, filename, line, spec, + "Failed to parse layer2 specific header type. Ignoring assignment: %s", rvalue); + return 0; + } + + session->l2tp_l2spec_type = spec; + + session = NULL; + return 0; +} + +int config_parse_l2tp_session_name( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(l2tp_session_free_or_set_invalidp) L2tpSession *session = NULL; + L2tpTunnel *t = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = l2tp_session_new_static(t, filename, section_line, &session); + if (r < 0) + return log_oom(); + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse L2TP tunnel session name. 
Ignoring assignment: %s", rvalue); + return 0; + } + + r = free_and_strdup(&session->name, rvalue); + if (r < 0) + return log_oom(); + + session = NULL; + return 0; +} + +static void l2tp_tunnel_init(NetDev *netdev) { + L2tpTunnel *t = L2TP(netdev); + + t->l2tp_encap_type = NETDEV_L2TP_ENCAPTYPE_UDP; + t->udp6_csum_rx = true; + t->udp6_csum_tx = true; +} + +static int l2tp_session_verify(L2tpSession *session) { + NetDev *netdev; + + assert(session); + assert(session->tunnel); + + netdev = NETDEV(session->tunnel); + + if (section_is_invalid(session->section)) + return -EINVAL; + + if (!session->name) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: L2TP session without name configured. " + "Ignoring [L2TPSession] section from line %u", + session->section->filename, session->section->line); + + if (session->session_id == 0 || session->peer_session_id == 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: L2TP session without session IDs configured. " + "Ignoring [L2TPSession] section from line %u", + session->section->filename, session->section->line); + + return 0; +} + +static int netdev_l2tp_tunnel_verify(NetDev *netdev, const char *filename) { + assert(filename); + + L2tpTunnel *t = L2TP(netdev); + L2tpSession *session; + + if (!IN_SET(t->family, AF_INET, AF_INET6)) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: L2TP tunnel with invalid address family configured. Ignoring", + filename); + + if (!in_addr_is_set(t->family, &t->remote)) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: L2TP tunnel without a remote address configured. Ignoring", + filename); + + if (t->tunnel_id == 0 || t->peer_tunnel_id == 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: L2TP tunnel without tunnel IDs configured. 
Ignoring", + filename); + + ORDERED_HASHMAP_FOREACH(session, t->sessions_by_section) + if (l2tp_session_verify(session) < 0) + l2tp_session_free(session); + + return 0; +} + +static void l2tp_tunnel_done(NetDev *netdev) { + L2tpTunnel *t = L2TP(netdev); + + ordered_hashmap_free_with_destructor(t->sessions_by_section, l2tp_session_free); + free(t->local_ifname); +} + +const NetDevVTable l2tptnl_vtable = { + .object_size = sizeof(L2tpTunnel), + .init = l2tp_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "L2TP\0L2TPSession\0", + .create = l2tp_create_tunnel, + .done = l2tp_tunnel_done, + .create_type = NETDEV_CREATE_INDEPENDENT, + .is_ready_to_create = netdev_l2tp_is_ready_to_create, + .config_verify = netdev_l2tp_tunnel_verify, +}; diff --git a/src/network/netdev/l2tp-tunnel.h b/src/network/netdev/l2tp-tunnel.h new file mode 100644 index 0000000..6028b35 --- /dev/null +++ b/src/network/netdev/l2tp-tunnel.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "in-addr-util.h" +#include "netdev.h" +#include "networkd-util.h" + +typedef enum L2tpL2specType { + NETDEV_L2TP_L2SPECTYPE_NONE = L2TP_L2SPECTYPE_NONE, + NETDEV_L2TP_L2SPECTYPE_DEFAULT = L2TP_L2SPECTYPE_DEFAULT, + _NETDEV_L2TP_L2SPECTYPE_MAX, + _NETDEV_L2TP_L2SPECTYPE_INVALID = -EINVAL, +} L2tpL2specType; + +typedef enum L2tpEncapType { + NETDEV_L2TP_ENCAPTYPE_UDP = L2TP_ENCAPTYPE_UDP, + NETDEV_L2TP_ENCAPTYPE_IP = L2TP_ENCAPTYPE_IP, + _NETDEV_L2TP_ENCAPTYPE_MAX, + _NETDEV_L2TP_ENCAPTYPE_INVALID = -EINVAL, +} L2tpEncapType; + +typedef enum L2tpLocalAddressType { + NETDEV_L2TP_LOCAL_ADDRESS_AUTO, + NETDEV_L2TP_LOCAL_ADDRESS_STATIC, + NETDEV_L2TP_LOCAL_ADDRESS_DYNAMIC, + _NETDEV_L2TP_LOCAL_ADDRESS_MAX, + _NETDEV_L2TP_LOCAL_ADDRESS_INVALID = -EINVAL, +} L2tpLocalAddressType; + +typedef struct L2tpTunnel L2tpTunnel; + +typedef struct L2tpSession { + L2tpTunnel *tunnel; + ConfigSection *section; + + char *name; + + uint32_t session_id; + uint32_t 
peer_session_id; + L2tpL2specType l2tp_l2spec_type; +} L2tpSession; + +struct L2tpTunnel { + NetDev meta; + + uint16_t l2tp_udp_sport; + uint16_t l2tp_udp_dport; + + uint32_t tunnel_id; + uint32_t peer_tunnel_id; + + int family; + + bool udp_csum; + bool udp6_csum_rx; + bool udp6_csum_tx; + + char *local_ifname; + L2tpLocalAddressType local_address_type; + union in_addr_union local; + union in_addr_union remote; + + L2tpEncapType l2tp_encap_type; + + OrderedHashmap *sessions_by_section; +}; + +DEFINE_NETDEV_CAST(L2TP, L2tpTunnel); +extern const NetDevVTable l2tptnl_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_tunnel_local_address); +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_tunnel_remote_address); +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_tunnel_id); +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_encap_type); +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_session_l2spec); +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_session_id); +CONFIG_PARSER_PROTOTYPE(config_parse_l2tp_session_name); diff --git a/src/network/netdev/macsec.c b/src/network/netdev/macsec.c new file mode 100644 index 0000000..17d6ace --- /dev/null +++ b/src/network/netdev/macsec.c @@ -0,0 +1,1204 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "conf-parser.h" +#include "fileio.h" +#include "hashmap.h" +#include "hexdecoct.h" +#include "macsec.h" +#include "memory-util.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-helpers.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" + +static void security_association_clear(SecurityAssociation *sa) { + if (!sa) + return; + + explicit_bzero_safe(sa->key, sa->key_len); + free(sa->key); + free(sa->key_file); +} + +static void security_association_init(SecurityAssociation *sa) { + assert(sa); + + sa->activate = -1; + sa->use_for_encoding = -1; +} + +static ReceiveAssociation* macsec_receive_association_free(ReceiveAssociation *c) { + if 
(!c)
+                return NULL;
+
+        if (c->macsec && c->section)
+                ordered_hashmap_remove(c->macsec->receive_associations_by_section, c->section);
+
+        config_section_free(c->section);
+        security_association_clear(&c->sa);
+
+        return mfree(c);
+}
+
+DEFINE_SECTION_CLEANUP_FUNCTIONS(ReceiveAssociation, macsec_receive_association_free);
+
+/* Looks up, or creates, the ReceiveAssociation belonging to the config section identified by
+ * 'filename' and 'section_line'.
+ *
+ * If s->receive_associations_by_section already holds an entry for that section, it is returned.
+ * Otherwise a new object is allocated, its embedded SecurityAssociation is set up with
+ * security_association_init(), and it is stored in the hashmap keyed by its section.
+ *
+ * Returns 0 with the object in 'ret', -ENOMEM if the allocation fails, or the negative value
+ * reported by config_section_new() / ordered_hashmap_ensure_put(). */
+static int macsec_receive_association_new_static(MACsec *s, const char *filename, unsigned section_line, ReceiveAssociation **ret) {
+        _cleanup_(config_section_freep) ConfigSection *n = NULL;
+        _cleanup_(macsec_receive_association_freep) ReceiveAssociation *c = NULL;
+        int r;
+
+        assert(s);
+        assert(ret);
+        assert(filename);
+        assert(section_line > 0);
+
+        r = config_section_new(filename, section_line, &n);
+        if (r < 0)
+                return r;
+
+        c = ordered_hashmap_get(s->receive_associations_by_section, n);
+        if (c) {
+                *ret = TAKE_PTR(c);
+                return 0;
+        }
+
+        c = new(ReceiveAssociation, 1);
+        if (!c)
+                return -ENOMEM;
+
+        *c = (ReceiveAssociation) {
+                .macsec = s,
+                .section = TAKE_PTR(n),
+        };
+
+        security_association_init(&c->sa);
+
+        r = ordered_hashmap_ensure_put(&s->receive_associations_by_section, &config_section_hash_ops, c->section, c);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(c);
+
+        return 0;
+}
+
+/* Releases a ReceiveChannel. NULL is accepted and yields NULL.
+ *
+ * When the channel has an owning MACsec, it is first removed from that object's
+ * receive_channels map (only if its SCI is non-zero) and from receive_channels_by_section (only
+ * if a section is attached). The section is then freed and the result of mfree(c) is returned. */
+static ReceiveChannel* macsec_receive_channel_free(ReceiveChannel *c) {
+        if (!c)
+                return NULL;
+
+        if (c->macsec) {
+                if (c->sci.as_uint64 > 0)
+                        ordered_hashmap_remove_value(c->macsec->receive_channels, &c->sci.as_uint64, c);
+
+                if (c->section)
+                        ordered_hashmap_remove(c->macsec->receive_channels_by_section, c->section);
+        }
+
+        config_section_free(c->section);
+
+        return mfree(c);
+}
+
+DEFINE_SECTION_CLEANUP_FUNCTIONS(ReceiveChannel, macsec_receive_channel_free);
+
+/* Allocates a ReceiveChannel that points back at 's' and carries the given SCI, and hands it to
+ * the caller through 'ret'. The new object is not inserted into any hashmap here.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails. */
+static int macsec_receive_channel_new(MACsec *s, uint64_t sci, ReceiveChannel **ret) {
+        ReceiveChannel *c;
+
+        assert(s);
+        /* 'ret' is written unconditionally below; check it like the *_new_static() helpers do. */
+        assert(ret);
+
+        c = new(ReceiveChannel, 1);
+        if (!c)
+                return -ENOMEM;
+
+        *c = (ReceiveChannel) {
+                .macsec = s,
+                .sci.as_uint64 = sci,
+        };
+
+        *ret = c;
+        return
0; +} + +static int macsec_receive_channel_new_static(MACsec *s, const char *filename, unsigned section_line, ReceiveChannel **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(macsec_receive_channel_freep) ReceiveChannel *c = NULL; + int r; + + assert(s); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + c = ordered_hashmap_get(s->receive_channels_by_section, n); + if (c) { + *ret = TAKE_PTR(c); + return 0; + } + + r = macsec_receive_channel_new(s, 0, &c); + if (r < 0) + return r; + + c->section = TAKE_PTR(n); + + r = ordered_hashmap_ensure_put(&s->receive_channels_by_section, &config_section_hash_ops, c->section, c); + if (r < 0) + return r; + + *ret = TAKE_PTR(c); + + return 0; +} + +static TransmitAssociation* macsec_transmit_association_free(TransmitAssociation *a) { + if (!a) + return NULL; + + if (a->macsec && a->section) + ordered_hashmap_remove(a->macsec->transmit_associations_by_section, a->section); + + config_section_free(a->section); + security_association_clear(&a->sa); + + return mfree(a); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(TransmitAssociation, macsec_transmit_association_free); + +static int macsec_transmit_association_new_static(MACsec *s, const char *filename, unsigned section_line, TransmitAssociation **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(macsec_transmit_association_freep) TransmitAssociation *a = NULL; + int r; + + assert(s); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + a = ordered_hashmap_get(s->transmit_associations_by_section, n); + if (a) { + *ret = TAKE_PTR(a); + return 0; + } + + a = new(TransmitAssociation, 1); + if (!a) + return -ENOMEM; + + *a = (TransmitAssociation) { + .macsec = s, + .section = TAKE_PTR(n), + }; + + security_association_init(&a->sa); + + r = 
ordered_hashmap_ensure_put(&s->transmit_associations_by_section, &config_section_hash_ops, a->section, a); + if (r < 0) + return r; + + *ret = TAKE_PTR(a); + + return 0; +} + +static int netdev_macsec_create_message(NetDev *netdev, int command, sd_netlink_message **ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + assert(netdev->ifindex > 0); + + r = sd_genl_message_new(netdev->manager->genl, MACSEC_GENL_NAME, command, &m); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, MACSEC_ATTR_IFINDEX, netdev->ifindex); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + + return 0; +} + +static int netdev_macsec_fill_message_sci(NetDev *netdev, MACsecSCI *sci, sd_netlink_message *m) { + int r; + + assert(netdev); + assert(m); + assert(sci); + + r = sd_netlink_message_open_container(m, MACSEC_ATTR_RXSC_CONFIG); + if (r < 0) + return r; + + r = sd_netlink_message_append_u64(m, MACSEC_RXSC_ATTR_SCI, sci->as_uint64); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +static int netdev_macsec_fill_message_sa(NetDev *netdev, SecurityAssociation *a, sd_netlink_message *m) { + int r; + + assert(netdev); + assert(a); + assert(m); + + r = sd_netlink_message_open_container(m, MACSEC_ATTR_SA_CONFIG); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, MACSEC_SA_ATTR_AN, a->association_number); + if (r < 0) + return r; + + if (a->packet_number > 0) { + r = sd_netlink_message_append_u32(m, MACSEC_SA_ATTR_PN, a->packet_number); + if (r < 0) + return r; + } + + if (a->key_len > 0) { + r = sd_netlink_message_append_data(m, MACSEC_SA_ATTR_KEYID, a->key_id, MACSEC_KEYID_LEN); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(m, MACSEC_SA_ATTR_KEY, a->key, a->key_len); + if (r < 0) + return r; + } + + if (a->activate >= 0) { + r = sd_netlink_message_append_u8(m, MACSEC_SA_ATTR_ACTIVE, a->activate); + if (r < 0) + return r; + } + + 
r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +static int macsec_receive_association_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + log_netdev_info(netdev, + "MACsec receive secure association exists, using it without changing parameters"); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, + "Failed to add receive secure association: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "Receive secure association is configured"); + + return 1; +} + +static int netdev_macsec_configure_receive_association(NetDev *netdev, ReceiveAssociation *a) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + assert(a); + + r = netdev_macsec_create_message(netdev, MACSEC_CMD_ADD_RXSA, &m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create netlink message: %m"); + + r = netdev_macsec_fill_message_sa(netdev, &a->sa, m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to fill netlink message: %m"); + + r = netdev_macsec_fill_message_sci(netdev, &a->sci, m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to fill netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, m, macsec_receive_association_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to configure receive secure association: %m"); + + netdev_ref(netdev); + + return 0; +} + +static int macsec_receive_channel_handler(sd_netlink *rtnl, sd_netlink_message *m, ReceiveChannel *c) { + assert(c); + assert(c->macsec); + + NetDev *netdev = ASSERT_PTR(NETDEV(c->macsec)); + int r; + + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + 
log_netdev_debug(netdev, + "MACsec receive channel exists, using it without changing parameters"); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, + "Failed to add receive secure channel: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "Receive channel is configured"); + + for (unsigned i = 0; i < c->n_rxsa; i++) { + r = netdev_macsec_configure_receive_association(netdev, c->rxsa[i]); + if (r < 0) { + log_netdev_warning_errno(netdev, r, + "Failed to configure receive security association: %m"); + netdev_enter_failed(netdev); + return 1; + } + } + + return 1; +} + +static void receive_channel_destroy_callback(ReceiveChannel *c) { + assert(c); + assert(c->macsec); + + netdev_unref(NETDEV(c->macsec)); +} + +static int netdev_macsec_configure_receive_channel(NetDev *netdev, ReceiveChannel *c) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + assert(c); + + r = netdev_macsec_create_message(netdev, MACSEC_CMD_ADD_RXSC, &m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create netlink message: %m"); + + r = netdev_macsec_fill_message_sci(netdev, &c->sci, m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to fill netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, m, macsec_receive_channel_handler, + receive_channel_destroy_callback, c); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to configure receive channel: %m"); + + netdev_ref(netdev); + + return 0; +} + +static int macsec_transmit_association_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + log_netdev_info(netdev, + "MACsec transmit secure association exists, using it without changing parameters"); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, + "Failed to add transmit 
secure association: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "Transmit secure association is configured"); + + return 1; +} + +static int netdev_macsec_configure_transmit_association(NetDev *netdev, TransmitAssociation *a) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + assert(a); + + r = netdev_macsec_create_message(netdev, MACSEC_CMD_ADD_TXSA, &m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to create netlink message: %m"); + + r = netdev_macsec_fill_message_sa(netdev, &a->sa, m); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to fill netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, m, macsec_transmit_association_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Failed to configure transmit secure association: %m"); + + netdev_ref(netdev); + + return 0; +} + +static int netdev_macsec_configure(NetDev *netdev, Link *link) { + MACsec *s = MACSEC(netdev); + TransmitAssociation *a; + ReceiveChannel *c; + int r; + + ORDERED_HASHMAP_FOREACH(a, s->transmit_associations_by_section) { + r = netdev_macsec_configure_transmit_association(netdev, a); + if (r < 0) + return r; + } + + ORDERED_HASHMAP_FOREACH(c, s->receive_channels) { + r = netdev_macsec_configure_receive_channel(netdev, c); + if (r < 0) + return r; + } + + return 0; +} + +static int netdev_macsec_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(m); + + MACsec *v = MACSEC(netdev); + int r; + + if (v->port > 0) { + r = sd_netlink_message_append_u16(m, IFLA_MACSEC_PORT, v->port); + if (r < 0) + return r; + } + + if (v->encrypt >= 0) { + r = sd_netlink_message_append_u8(m, IFLA_MACSEC_ENCRYPT, v->encrypt); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u8(m, IFLA_MACSEC_ENCODING_SA, v->encoding_an); + if (r < 0) + return r; + + return 0; +} + +int 
config_parse_macsec_port( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + MACsec *s = ASSERT_PTR(userdata); + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + _cleanup_(macsec_receive_channel_free_or_set_invalidp) ReceiveChannel *c = NULL; + uint16_t port; + void *dest; + int r; + + /* This parses port used to make Secure Channel Identifier (SCI) */ + + if (streq(section, "MACsec")) + dest = &s->port; + else if (streq(section, "MACsecReceiveChannel")) { + r = macsec_receive_channel_new_static(s, filename, section_line, &c); + if (r < 0) + return log_oom(); + + dest = &c->sci.port; + } else { + assert(streq(section, "MACsecReceiveAssociation")); + + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + dest = &b->sci.port; + } + + r = parse_ip_port(rvalue, &port); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse port '%s' for secure channel identifier. 
Ignoring assignment: %m", + rvalue); + return 0; + } + + unaligned_write_be16(dest, port); + + TAKE_PTR(b); + TAKE_PTR(c); + + return 0; +} + +int config_parse_macsec_hw_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + MACsec *s = ASSERT_PTR(userdata); + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + _cleanup_(macsec_receive_channel_free_or_set_invalidp) ReceiveChannel *c = NULL; + int r; + + if (streq(section, "MACsecReceiveChannel")) + r = macsec_receive_channel_new_static(s, filename, section_line, &c); + else + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + r = parse_ether_addr(rvalue, b ? &b->sci.mac : &c->sci.mac); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse MAC address for secure channel identifier. 
" + "Ignoring assignment: %s", rvalue); + return 0; + } + + TAKE_PTR(b); + TAKE_PTR(c); + + return 0; +} + +int config_parse_macsec_packet_number( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + MACsec *s = ASSERT_PTR(userdata); + _cleanup_(macsec_transmit_association_free_or_set_invalidp) TransmitAssociation *a = NULL; + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + uint32_t val, *dest; + int r; + + if (streq(section, "MACsecTransmitAssociation")) + r = macsec_transmit_association_new_static(s, filename, section_line, &a); + else + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + dest = a ? &a->sa.packet_number : &b->sa.packet_number; + + r = safe_atou32(rvalue, &val); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse packet number. Ignoring assignment: %s", rvalue); + return 0; + } + if (streq(section, "MACsecTransmitAssociation") && val == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid packet number. 
Ignoring assignment: %s", rvalue); + return 0; + } + + *dest = val; + TAKE_PTR(a); + TAKE_PTR(b); + + return 0; +} + +int config_parse_macsec_key( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(macsec_transmit_association_free_or_set_invalidp) TransmitAssociation *a = NULL; + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + _cleanup_(erase_and_freep) void *p = NULL; + MACsec *s = userdata; + SecurityAssociation *dest; + size_t l; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + (void) warn_file_is_world_accessible(filename, NULL, unit, line); + + if (streq(section, "MACsecTransmitAssociation")) + r = macsec_transmit_association_new_static(s, filename, section_line, &a); + else + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + dest = a ? &a->sa : &b->sa; + + r = unhexmem_full(rvalue, strlen(rvalue), true, &p, &l); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse key. Ignoring assignment: %m"); + return 0; + } + + if (l != 16) { + /* See DEFAULT_SAK_LEN in drivers/net/macsec.c */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid key length (%zu). 
Ignoring assignment", l); + return 0; + } + + /* Clear the previously stored key bytes before the buffer is swapped out. */ explicit_bzero_safe(dest->key, dest->key_len); + free_and_replace(dest->key, p); + dest->key_len = l; + + TAKE_PTR(a); + TAKE_PTR(b); + + return 0; +} + +/* Config parser callback for the path of a MACsec key file. An empty rvalue resets sa.key_file through mfree(). Otherwise rvalue is duplicated with strdup(), checked by path_simplify_and_warn() with PATH_CHECK_ABSOLUTE, and stored in sa.key_file of the association selected by the section name; if that check fails the function returns 0 without storing. Only the path is recorded here, the file is not opened in this function. */ int config_parse_macsec_key_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(macsec_transmit_association_free_or_set_invalidp) TransmitAssociation *a = NULL; + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + _cleanup_free_ char *path = NULL; + MACsec *s = userdata; + char **dest; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(section, "MACsecTransmitAssociation")) + r = macsec_transmit_association_new_static(s, filename, section_line, &a); + else + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + dest = a ? 
&a->sa.key_file : &b->sa.key_file; + + if (isempty(rvalue)) { + /* NOTE(review): this reset path returns without TAKE_PTR(a) and TAKE_PTR(b), whereas the empty-value path of config_parse_macsec_use_for_encoding() calls TAKE_PTR(a) first - confirm the difference is intended. */ *dest = mfree(*dest); + return 0; + } + + path = strdup(rvalue); + if (!path) + return log_oom(); + + if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0) + return 0; + + free_and_replace(*dest, path); + TAKE_PTR(a); + TAKE_PTR(b); + + return 0; +} + +/* Config parser callback for a MACsec key identifier. rvalue is converted with unhexmem(): -ENOMEM is returned through log_oom(), any other failure and any result longer than MACSEC_KEYID_LEN bytes is logged and ignored (return 0). On success the bytes are copied into sa.key_id of the association selected by the section name and the remainder up to MACSEC_KEYID_LEN is zeroed. */ int config_parse_macsec_key_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(macsec_transmit_association_free_or_set_invalidp) TransmitAssociation *a = NULL; + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + _cleanup_free_ void *p = NULL; + MACsec *s = userdata; + uint8_t *dest; + size_t l; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(section, "MACsecTransmitAssociation")) + r = macsec_transmit_association_new_static(s, filename, section_line, &a); + else + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + r = unhexmem(rvalue, strlen(rvalue), &p, &l); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse KeyId=%s, ignoring assignment: %m", rvalue); + return 0; + } + if (l > MACSEC_KEYID_LEN) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified KeyId= is larger then the allowed maximum (%zu > %i), ignoring: %s", + l, MACSEC_KEYID_LEN, rvalue); + return 0; + } + + dest = a ? 
a->sa.key_id : b->sa.key_id; + memcpy_safe(dest, p, l); + /* Pad a short KeyId with zero bytes up to MACSEC_KEYID_LEN. */ memzero(dest + l, MACSEC_KEYID_LEN - l); + + TAKE_PTR(a); + TAKE_PTR(b); + + return 0; +} + +/* Config parser callback for the activation setting of a security association. parse_tristate() is handed a pointer to sa.activate of the association selected by the section name; if it fails, a warning naming the transmit or receive side is logged and 0 is returned. */ int config_parse_macsec_sa_activate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(macsec_transmit_association_free_or_set_invalidp) TransmitAssociation *a = NULL; + _cleanup_(macsec_receive_association_free_or_set_invalidp) ReceiveAssociation *b = NULL; + MACsec *s = userdata; + int *dest, r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(section, "MACsecTransmitAssociation")) + r = macsec_transmit_association_new_static(s, filename, section_line, &a); + else + r = macsec_receive_association_new_static(s, filename, section_line, &b); + if (r < 0) + return log_oom(); + + dest = a ? &a->sa.activate : &b->sa.activate; + + r = parse_tristate(rvalue, dest); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse activation mode of %s security association. " + "Ignoring assignment: %s", + streq(section, "MACsecTransmitAssociation") ? 
"transmit" : "receive", + rvalue); + return 0; + } + + TAKE_PTR(a); + TAKE_PTR(b); + + return 0; +} + +/* Config parser callback for a setting that is applied to transmit associations only: the transmit association is always used, without looking at the section name. An empty rvalue sets sa.use_for_encoding to -1 and calls TAKE_PTR(a) before returning 0. Otherwise parse_tristate() writes sa.use_for_encoding; a parse failure is logged and returns 0. */ int config_parse_macsec_use_for_encoding( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(macsec_transmit_association_free_or_set_invalidp) TransmitAssociation *a = NULL; + MACsec *s = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = macsec_transmit_association_new_static(s, filename, section_line, &a); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + a->sa.use_for_encoding = -1; + TAKE_PTR(a); + return 0; + } + + r = parse_tristate(rvalue, &a->sa.use_for_encoding); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s= setting. Ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + /* An association that is used for encoding is also marked active. */ if (a->sa.use_for_encoding > 0) + a->sa.activate = true; + + TAKE_PTR(a); + + return 0; +} + +/* Loads the key for sa from sa->key_file. Returns 0 immediately when no key file is configured. The file is read with read_full_file_full() using the flags listed below and MACSEC_KEYID_LEN as size argument; a read error, or a result whose length is not MACSEC_KEYID_LEN, is logged and the value of log_netdev_error_errno() is returned. On success the old key bytes are cleared with explicit_bzero_safe() and sa->key and sa->key_len are replaced. NOTE(review): the length is checked against MACSEC_KEYID_LEN here while config_parse_macsec_key() checks against the literal 16 - presumably the same value, confirm. */ static int macsec_read_key_file(NetDev *netdev, SecurityAssociation *sa) { + _cleanup_(erase_and_freep) uint8_t *key = NULL; + size_t key_len; + int r; + + assert(netdev); + assert(sa); + + if (!sa->key_file) + return 0; + + r = read_full_file_full( + AT_FDCWD, sa->key_file, UINT64_MAX, MACSEC_KEYID_LEN, + READ_FULL_FILE_SECURE | + READ_FULL_FILE_UNHEX | + READ_FULL_FILE_WARN_WORLD_READABLE | + READ_FULL_FILE_CONNECT_SOCKET | + READ_FULL_FILE_FAIL_WHEN_LARGER, + NULL, (char **) &key, &key_len); + if (r < 0) + return log_netdev_error_errno(netdev, r, + "Failed to read key from '%s', ignoring: %m", + sa->key_file); + + if (key_len != MACSEC_KEYID_LEN) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "Invalid key length (%zu bytes), ignoring: %m", key_len); + + explicit_bzero_safe(sa->key, sa->key_len); + free_and_replace(sa->key, key); + sa->key_len = key_len; + + return 0; +} + +static 
int macsec_receive_channel_verify(ReceiveChannel *c) { /* checks one [MACsecReceiveChannel] section and registers it under its SCI; negative means rejected */
+        NetDev *netdev;
+        int r;
+
+        assert(c);
+        assert(c->macsec);
+
+        netdev = NETDEV(c->macsec);
+
+        if (section_is_invalid(c->section))
+                return -EINVAL;
+
+        if (ether_addr_is_null(&c->sci.mac))
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec receive channel without MAC address configured. "
+                                              "Ignoring [MACsecReceiveChannel] section from line %u",
+                                              c->section->filename, c->section->line);
+
+        if (c->sci.port == 0)
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec receive channel without port configured. "
+                                              "Ignoring [MACsecReceiveChannel] section from line %u",
+                                              c->section->filename, c->section->line);
+        /* Index the channel by its 64-bit SCI; -EEXIST is reported as a second section with the same SCI. */
+        r = ordered_hashmap_ensure_put(&c->macsec->receive_channels, &uint64_hash_ops, &c->sci.as_uint64, c);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r == -EEXIST)
+                return log_netdev_error_errno(netdev, r,
+                                              "%s: Multiple [MACsecReceiveChannel] sections have same SCI, "
+                                              "Ignoring [MACsecReceiveChannel] section from line %u",
+                                              c->section->filename, c->section->line);
+        if (r < 0)
+                return log_netdev_error_errno(netdev, r,
+                                              "%s: Failed to store [MACsecReceiveChannel] section at hashmap, "
+                                              "Ignoring [MACsecReceiveChannel] section from line %u",
+                                              c->section->filename, c->section->line);
+        return 0;
+}
+/* Checks one [MACsecTransmitAssociation] section: PacketNumber= must be non-zero and a key must be available once any key file is read. */
+static int macsec_transmit_association_verify(TransmitAssociation *t) {
+        NetDev *netdev;
+        int r;
+
+        assert(t);
+        assert(t->macsec);
+
+        netdev = NETDEV(t->macsec);
+
+        if (section_is_invalid(t->section))
+                return -EINVAL;
+
+        if (t->sa.packet_number == 0)
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec transmit secure association without PacketNumber= configured. "
+                                              "Ignoring [MACsecTransmitAssociation] section from line %u",
+                                              t->section->filename, t->section->line);
+
+        r = macsec_read_key_file(netdev, &t->sa);
+        if (r < 0)
+                return r;
+
+        if (t->sa.key_len <= 0)
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec transmit secure association without key configured. "
+                                              "Ignoring [MACsecTransmitAssociation] section from line %u",
+                                              t->section->filename, t->section->line);
+
+        return 0;
+}
+/* Checks one [MACsecReceiveAssociation] section and attaches it to the receive channel with the same SCI, creating that channel if none exists. */
+static int macsec_receive_association_verify(ReceiveAssociation *a) {
+        ReceiveChannel *c;
+        NetDev *netdev;
+        int r;
+
+        assert(a);
+        assert(a->macsec);
+
+        netdev = NETDEV(a->macsec);
+
+        if (section_is_invalid(a->section))
+                return -EINVAL;
+
+        r = macsec_read_key_file(netdev, &a->sa);
+        if (r < 0)
+                return r;
+
+        if (a->sa.key_len <= 0)
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec receive secure association without key configured. "
+                                              "Ignoring [MACsecReceiveAssociation] section from line %u",
+                                              a->section->filename, a->section->line);
+
+        if (ether_addr_is_null(&a->sci.mac))
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec receive secure association without MAC address configured. "
+                                              "Ignoring [MACsecReceiveAssociation] section from line %u",
+                                              a->section->filename, a->section->line);
+
+        if (a->sci.port == 0)
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
+                                              "%s: MACsec receive secure association without port configured. "
+                                              "Ignoring [MACsecReceiveAssociation] section from line %u",
+                                              a->section->filename, a->section->line);
+        /* Find the channel for this SCI, or create and register one on the fly. */
+        c = ordered_hashmap_get(a->macsec->receive_channels, &a->sci.as_uint64);
+        if (!c) {
+                _cleanup_(macsec_receive_channel_freep) ReceiveChannel *new_channel = NULL;
+
+                r = macsec_receive_channel_new(a->macsec, a->sci.as_uint64, &new_channel);
+                if (r < 0)
+                        return log_oom();
+
+                r = ordered_hashmap_ensure_put(&a->macsec->receive_channels, &uint64_hash_ops, &new_channel->sci.as_uint64, new_channel);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        return log_netdev_error_errno(netdev, r,
+                                                      "%s: Failed to store receive channel at hashmap, "
+                                                      "Ignoring [MACsecReceiveAssociation] section from line %u",
+                                                      a->section->filename, a->section->line);
+                c = TAKE_PTR(new_channel);
+        }
+        if (c->n_rxsa >= MACSEC_MAX_ASSOCIATION_NUMBER)
+                return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(ERANGE),
+                                              "%s: Too many [MACsecReceiveAssociation] sections for the same receive channel, "
+                                              "Ignoring [MACsecReceiveAssociation] section from line %u",
+                                              a->section->filename, a->section->line);
+        /* The association number is the slot the association takes in the channel's rxsa[] array. */
+        a->sa.association_number = c->n_rxsa;
+        c->rxsa[c->n_rxsa++] = a;
+
+        return 0;
+}
+/* config_verify hook: frees every section that fails its check, numbers the transmit associations and records the encoding AN. Always returns 0. */
+static int netdev_macsec_verify(NetDev *netdev, const char *filename) {
+        assert(filename);
+
+        MACsec *v = MACSEC(netdev);
+        TransmitAssociation *a;
+        ReceiveAssociation *n;
+        ReceiveChannel *c;
+        uint8_t an, encoding_an;
+        bool use_for_encoding;
+        int r;
+
+        ORDERED_HASHMAP_FOREACH(c, v->receive_channels_by_section) {
+                r = macsec_receive_channel_verify(c);
+                if (r < 0)
+                        macsec_receive_channel_free(c);
+        }
+
+        an = 0;
+        use_for_encoding = false;
+        encoding_an = 0; /* stays 0 when no association asks for UseForEncoding= */
+        ORDERED_HASHMAP_FOREACH(a, v->transmit_associations_by_section) {
+                r = macsec_transmit_association_verify(a);
+                if (r < 0) {
+                        macsec_transmit_association_free(a);
+                        continue;
+                }
+
+                if (an >= MACSEC_MAX_ASSOCIATION_NUMBER) {
+                        log_netdev_error(netdev,
+                                         "%s: Too many [MACsecTransmitAssociation] sections configured. "
+                                         "Ignoring [MACsecTransmitAssociation] section from line %u",
+                                         a->section->filename, a->section->line);
+                        macsec_transmit_association_free(a);
+                        continue;
+                }
+
+                a->sa.association_number = an++;
+                /* Only the first association with UseForEncoding= enabled wins; later ones get the setting switched off. */
+                if (a->sa.use_for_encoding > 0) {
+                        if (use_for_encoding) {
+                                log_netdev_warning(netdev,
+                                                   "%s: Multiple security associations are set to be used for transmit channel. "
+                                                   "Disabling UseForEncoding= in [MACsecTransmitAssociation] section from line %u",
+                                                   a->section->filename, a->section->line);
+                                a->sa.use_for_encoding = false;
+                        } else {
+                                encoding_an = a->sa.association_number;
+                                use_for_encoding = true;
+                        }
+                }
+        }
+
+        assert(encoding_an < MACSEC_MAX_ASSOCIATION_NUMBER);
+        v->encoding_an = encoding_an;
+        /* Receive associations come last: their check may look up channels registered by the first loop. */
+        ORDERED_HASHMAP_FOREACH(n, v->receive_associations_by_section) {
+                r = macsec_receive_association_verify(n);
+                if (r < 0)
+                        macsec_receive_association_free(n);
+        }
+
+        return 0;
+}
+/* init hook: Encrypt= starts out as -1. */
+static void macsec_init(NetDev *netdev) {
+        MACsec *v = MACSEC(netdev);
+
+        v->encrypt = -1;
+}
+/* done hook: frees the four maps, passing each entry to the matching *_free() destructor. */
+static void macsec_done(NetDev *netdev) {
+        MACsec *v = MACSEC(netdev);
+
+        ordered_hashmap_free_with_destructor(v->receive_channels, macsec_receive_channel_free);
+        ordered_hashmap_free_with_destructor(v->receive_channels_by_section, macsec_receive_channel_free);
+        ordered_hashmap_free_with_destructor(v->transmit_associations_by_section, macsec_transmit_association_free);
+        ordered_hashmap_free_with_destructor(v->receive_associations_by_section, macsec_receive_association_free);
+}
+/* NetDev vtable wiring the MACsec hooks and the four config sections it accepts. */
+const NetDevVTable macsec_vtable = {
+        .object_size = sizeof(MACsec),
+        .init = macsec_init,
+        .sections = NETDEV_COMMON_SECTIONS "MACsec\0MACsecReceiveChannel\0MACsecTransmitAssociation\0MACsecReceiveAssociation\0",
+        .fill_message_create = netdev_macsec_fill_message_create,
+        .post_create = netdev_macsec_configure,
+        .done = macsec_done,
+        .create_type = NETDEV_CREATE_STACKED,
+        .config_verify = netdev_macsec_verify,
+        .iftype = ARPHRD_ETHER,
+        .generate_mac = true,
+};
diff --git 
a/src/network/netdev/macsec.h b/src/network/netdev/macsec.h
new file mode 100644
index 0000000..17bb1ca
--- /dev/null
+++ b/src/network/netdev/macsec.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+/* NOTE(review): both system include targets were missing in this copy; if_macsec.h follows from MACSEC_KEYID_LEN, netinet/in.h is presumed — confirm. */
+#include <netinet/in.h>
+#include <linux/if_macsec.h>
+
+#include "ether-addr-util.h"
+#include "in-addr-util.h"
+#include "netdev.h"
+#include "networkd-util.h"
+#include "sparse-endian.h"
+
+/* See the definition of MACSEC_NUM_AN in kernel's drivers/net/macsec.c */
+#define MACSEC_MAX_ASSOCIATION_NUMBER 4
+
+typedef struct MACsec MACsec;
+/* Secure channel identifier: MAC address plus port, also readable as one 64-bit value (used as the receive-channel hashmap key). */
+typedef union MACsecSCI {
+        uint64_t as_uint64;
+
+        struct {
+                struct ether_addr mac;
+                be16_t port;
+        } _packed_;
+} MACsecSCI;
+
+assert_cc(sizeof(MACsecSCI) == sizeof(uint64_t));
+/* Settings common to transmit and receive associations. */
+typedef struct SecurityAssociation {
+        uint8_t association_number; /* assigned while the configuration is verified */
+        uint32_t packet_number;
+        uint8_t key_id[MACSEC_KEYID_LEN];
+        uint8_t *key;
+        uint32_t key_len; /* number of bytes in 'key'; 0 is treated as "no key configured" */
+        char *key_file; /* optional path the key is read from at verification time */
+        int activate; /* parsed with parse_tristate() */
+        int use_for_encoding; /* tristate; -1 after an empty assignment */
+} SecurityAssociation;
+/* One [MACsecTransmitAssociation] section. */
+typedef struct TransmitAssociation {
+        MACsec *macsec;
+        ConfigSection *section;
+
+        SecurityAssociation sa;
+} TransmitAssociation;
+/* One [MACsecReceiveAssociation] section; 'sci' selects the receive channel it belongs to. */
+typedef struct ReceiveAssociation {
+        MACsec *macsec;
+        ConfigSection *section;
+
+        MACsecSCI sci;
+        SecurityAssociation sa;
+} ReceiveAssociation;
+/* One receive channel together with the associations attached to it. */
+typedef struct ReceiveChannel {
+        MACsec *macsec;
+        ConfigSection *section;
+
+        MACsecSCI sci;
+        ReceiveAssociation *rxsa[MACSEC_MAX_ASSOCIATION_NUMBER];
+        unsigned n_rxsa; /* number of rxsa[] slots in use */
+} ReceiveChannel;
+/* Per-netdev MACsec state. */
+struct MACsec {
+        NetDev meta;
+
+        uint16_t port;
+        int encrypt; /* tristate; initialised to -1 */
+        uint8_t encoding_an; /* association number of the transmit association picked for encoding */
+
+        OrderedHashmap *receive_channels; /* keyed by the 64-bit SCI */
+        OrderedHashmap *receive_channels_by_section;
+        OrderedHashmap *transmit_associations_by_section;
+        OrderedHashmap *receive_associations_by_section;
+};
+
+DEFINE_NETDEV_CAST(MACSEC, MACsec);
+extern const NetDevVTable macsec_vtable;
+
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_port);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_hw_address);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_packet_number);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_key_id);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_key);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_key_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_sa_activate);
+CONFIG_PARSER_PROTOTYPE(config_parse_macsec_use_for_encoding);
diff --git a/src/network/netdev/macvlan.c b/src/network/netdev/macvlan.c
new file mode 100644
index 0000000..203807e
--- /dev/null
+++ b/src/network/netdev/macvlan.c
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/* NOTE(review): the three system include targets were missing in this copy; if_arp.h follows from ARPHRD_ETHER, the other two are presumed — confirm. */
+#include <net/if.h>
+#include <netinet/in.h>
+#include <linux/if_arp.h>
+
+#include "conf-parser.h"
+#include "macvlan.h"
+#include "macvlan-util.h"
+#include "networkd-network.h"
+#include "parse-util.h"
+/* Defines config_parse_macvlan_mode(), the parser behind Mode= in [MACVLAN] and [MACVTAP]. */
+DEFINE_CONFIG_PARSE_ENUM(config_parse_macvlan_mode, macvlan_mode, MacVlanMode, "Failed to parse macvlan mode");
+/* Appends the configured MACVLAN/MACVTAP attributes to the link-creation message; returns 0 or the first netlink error. */
+static int netdev_macvlan_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *req) {
+        assert(netdev);
+        assert(netdev->ifname);
+        assert(link);
+        assert(link->network);
+
+        MacVlan *m = netdev->kind == NETDEV_KIND_MACVLAN ? MACVLAN(netdev) : MACVTAP(netdev);
+        int r;
+        /* Source mode with a non-empty address set: send the source MAC addresses as a nested attribute. */
+        if (m->mode == NETDEV_MACVLAN_MODE_SOURCE && !set_isempty(m->match_source_mac)) {
+                const struct ether_addr *mac_addr;
+
+                r = sd_netlink_message_append_u32(req, IFLA_MACVLAN_MACADDR_MODE, MACVLAN_MACADDR_SET);
+                if (r < 0)
+                        return r;
+
+                r = sd_netlink_message_open_container(req, IFLA_MACVLAN_MACADDR_DATA);
+                if (r < 0)
+                        return r;
+
+                SET_FOREACH(mac_addr, m->match_source_mac) {
+                        r = sd_netlink_message_append_ether_addr(req, IFLA_MACVLAN_MACADDR, mac_addr);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = sd_netlink_message_close_container(req);
+                if (r < 0)
+                        return r;
+        }
+        /* The mode attribute is only sent when Mode= was configured. */
+        if (m->mode != _NETDEV_MACVLAN_MODE_INVALID) {
+                r = sd_netlink_message_append_u32(req, IFLA_MACVLAN_MODE, m->mode);
+                if (r < 0)
+                        return r;
+        }
+
+        /* set the nopromisc flag if Promiscuous= of the link is explicitly set to false */
+        if (m->mode == NETDEV_MACVLAN_MODE_PASSTHRU && link->network->promiscuous == 0) {
+                r = sd_netlink_message_append_u16(req, IFLA_MACVLAN_FLAGS, MACVLAN_FLAG_NOPROMISC);
+                if (r < 0)
+                        return r;
+        }
+        /* UINT32_MAX is the "not configured" marker for the queue length. */
+        if (m->bc_queue_length != UINT32_MAX) {
+                r = sd_netlink_message_append_u32(req, IFLA_MACVLAN_BC_QUEUE_LEN, m->bc_queue_length);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+/* Parser for BroadcastMulticastQueueLength=: an empty value resets to UINT32_MAX, otherwise 0..UINT32_MAX-1 is accepted. */
+int config_parse_macvlan_broadcast_queue_size(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        MacVlan *m = ASSERT_PTR(userdata);
+
+        if (isempty(rvalue)) {
+                m->bc_queue_length = UINT32_MAX;
+                return 0;
+        }
+
+        return config_parse_uint32_bounded(
+                        unit, filename, line, section, section_line, lvalue, rvalue,
+                        0, UINT32_MAX - 1, true,
+                        &m->bc_queue_length);
+}
+/* done hook shared by both kinds: frees the source MAC address set. */
+static void macvlan_done(NetDev *netdev) {
+        MacVlan *m = ASSERT_PTR(netdev)->kind == NETDEV_KIND_MACVLAN ? MACVLAN(netdev) : MACVTAP(netdev);
+
+        set_free(m->match_source_mac);
+}
+/* init hook shared by both kinds: marks mode and queue length as not configured. */
+static void macvlan_init(NetDev *netdev) {
+        MacVlan *m = ASSERT_PTR(netdev)->kind == NETDEV_KIND_MACVLAN ? MACVLAN(netdev) : MACVTAP(netdev);
+
+        m->mode = _NETDEV_MACVLAN_MODE_INVALID;
+        m->bc_queue_length = UINT32_MAX;
+}
+/* The two vtables below differ only in the section name they accept. */
+const NetDevVTable macvtap_vtable = {
+        .object_size = sizeof(MacVlan),
+        .init = macvlan_init,
+        .done = macvlan_done,
+        .sections = NETDEV_COMMON_SECTIONS "MACVTAP\0",
+        .fill_message_create = netdev_macvlan_fill_message_create,
+        .create_type = NETDEV_CREATE_STACKED,
+        .iftype = ARPHRD_ETHER,
+        .generate_mac = true,
+};
+
+const NetDevVTable macvlan_vtable = {
+        .object_size = sizeof(MacVlan),
+        .init = macvlan_init,
+        .done = macvlan_done,
+        .sections = NETDEV_COMMON_SECTIONS "MACVLAN\0",
+        .fill_message_create = netdev_macvlan_fill_message_create,
+        .create_type = NETDEV_CREATE_STACKED,
+        .iftype = ARPHRD_ETHER,
+        .generate_mac = true,
+};
diff --git a/src/network/netdev/macvlan.h b/src/network/netdev/macvlan.h
new file mode 100644
index 0000000..c45fc4f
--- /dev/null
+++ b/src/network/netdev/macvlan.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct MacVlan MacVlan;
+
+#include "macvlan-util.h"
+#include "netdev.h"
+#include "set.h"
+/* State shared by the MACVLAN and MACVTAP netdev kinds. */
+struct MacVlan {
+        NetDev meta;
+
+        MacVlanMode mode; /* _NETDEV_MACVLAN_MODE_INVALID until Mode= is parsed */
+        Set *match_source_mac; /* addresses sent to the kernel in source mode */
+
+        uint32_t bc_queue_length; /* UINT32_MAX means not configured */
+};
+
+DEFINE_NETDEV_CAST(MACVLAN, MacVlan);
+DEFINE_NETDEV_CAST(MACVTAP, MacVlan);
+extern const NetDevVTable macvlan_vtable;
+extern const NetDevVTable macvtap_vtable;
+
+CONFIG_PARSER_PROTOTYPE(config_parse_macvlan_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_macvlan_broadcast_queue_size);
diff --git a/src/network/netdev/netdev-gperf.gperf b/src/network/netdev/netdev-gperf.gperf
new file mode 100644
index 0000000..d5aa522
--- /dev/null
+++ b/src/network/netdev/netdev-gperf.gperf
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+%{
+#if __GNUC__ >= 7
+_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "bareudp.h" +#include "batadv.h" +#include "bond.h" +#include "bridge.h" +#include "conf-parser.h" +#include "fou-tunnel.h" +#include "geneve.h" +#include "ipoib.h" +#include "ipvlan.h" +#include "l2tp-tunnel.h" +#include "macsec.h" +#include "macvlan.h" +#include "net-condition.h" +#include "netdev.h" +#include "tunnel.h" +#include "tuntap.h" +#include "veth.h" +#include "vlan-util.h" +#include "vlan.h" +#include "vrf.h" +#include "vxcan.h" +#include "vxlan.h" +#include "wireguard.h" +#include "wlan.h" +#include "xfrm.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name network_netdev_gperf_hash +%define lookup-function-name network_netdev_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Match.Host, config_parse_net_condition, CONDITION_HOST, offsetof(NetDev, conditions) +Match.Virtualization, config_parse_net_condition, CONDITION_VIRTUALIZATION, offsetof(NetDev, conditions) +Match.KernelCommandLine, config_parse_net_condition, CONDITION_KERNEL_COMMAND_LINE, offsetof(NetDev, conditions) +Match.KernelVersion, config_parse_net_condition, CONDITION_KERNEL_VERSION, offsetof(NetDev, conditions) +Match.Credential, config_parse_net_condition, CONDITION_CREDENTIAL, offsetof(NetDev, conditions) +Match.Architecture, config_parse_net_condition, CONDITION_ARCHITECTURE, offsetof(NetDev, conditions) +Match.Firmware, config_parse_net_condition, CONDITION_FIRMWARE, offsetof(NetDev, conditions) +NetDev.Description, config_parse_string, 0, offsetof(NetDev, description) +NetDev.Name, config_parse_ifname, 0, offsetof(NetDev, ifname) +NetDev.Kind, config_parse_netdev_kind, 0, offsetof(NetDev, kind) +NetDev.MTUBytes, config_parse_mtu, AF_UNSPEC, offsetof(NetDev, mtu) +NetDev.MACAddress, config_parse_netdev_hw_addr, ETH_ALEN, offsetof(NetDev, hw_addr) +VLAN.Id, config_parse_vlanid, 0, 
offsetof(VLan, id) +VLAN.Protocol, config_parse_vlanprotocol, 0, offsetof(VLan, protocol) +VLAN.GVRP, config_parse_tristate, 0, offsetof(VLan, gvrp) +VLAN.MVRP, config_parse_tristate, 0, offsetof(VLan, mvrp) +VLAN.LooseBinding, config_parse_tristate, 0, offsetof(VLan, loose_binding) +VLAN.ReorderHeader, config_parse_tristate, 0, offsetof(VLan, reorder_hdr) +VLAN.EgressQOSMaps, config_parse_vlan_qos_maps, 0, offsetof(VLan, egress_qos_maps) +VLAN.IngressQOSMaps, config_parse_vlan_qos_maps, 0, offsetof(VLan, ingress_qos_maps) +MACVLAN.Mode, config_parse_macvlan_mode, 0, offsetof(MacVlan, mode) +MACVLAN.SourceMACAddress, config_parse_ether_addrs, 0, offsetof(MacVlan, match_source_mac) +MACVLAN.BroadcastMulticastQueueLength, config_parse_macvlan_broadcast_queue_size, 0, offsetof(MacVlan, bc_queue_length) +MACVTAP.Mode, config_parse_macvlan_mode, 0, offsetof(MacVlan, mode) +MACVTAP.SourceMACAddress, config_parse_ether_addrs, 0, offsetof(MacVlan, match_source_mac) +IPVLAN.Mode, config_parse_ipvlan_mode, 0, offsetof(IPVlan, mode) +IPVLAN.Flags, config_parse_ipvlan_flags, 0, offsetof(IPVlan, flags) +IPVTAP.Mode, config_parse_ipvlan_mode, 0, offsetof(IPVlan, mode) +IPVTAP.Flags, config_parse_ipvlan_flags, 0, offsetof(IPVlan, flags) +Tunnel.Local, config_parse_tunnel_local_address, 0, 0 +Tunnel.Remote, config_parse_tunnel_remote_address, 0, 0 +Tunnel.TOS, config_parse_unsigned, 0, offsetof(Tunnel, tos) +Tunnel.TTL, config_parse_unsigned, 0, offsetof(Tunnel, ttl) +Tunnel.Key, config_parse_tunnel_key, 0, offsetof(Tunnel, key) +Tunnel.InputKey, config_parse_tunnel_key, 0, offsetof(Tunnel, ikey) +Tunnel.OutputKey, config_parse_tunnel_key, 0, offsetof(Tunnel, okey) +Tunnel.DiscoverPathMTU, config_parse_tristate, 0, offsetof(Tunnel, pmtudisc) +Tunnel.IgnoreDontFragment, config_parse_bool, 0, offsetof(Tunnel, ignore_df) +Tunnel.Mode, config_parse_ip6tnl_mode, 0, offsetof(Tunnel, ip6tnl_mode) +Tunnel.IPv6FlowLabel, config_parse_ipv6_flowlabel, 0, 0 +Tunnel.CopyDSCP, 
config_parse_bool, 0, offsetof(Tunnel, copy_dscp) +Tunnel.EncapsulationLimit, config_parse_encap_limit, 0, 0 +Tunnel.Independent, config_parse_bool, 0, offsetof(Tunnel, independent) +Tunnel.AssignToLoopback, config_parse_bool, 0, offsetof(Tunnel, assign_to_loopback) +Tunnel.AllowLocalRemote, config_parse_tristate, 0, offsetof(Tunnel, allow_localremote) +Tunnel.FooOverUDP, config_parse_bool, 0, offsetof(Tunnel, fou_tunnel) +Tunnel.FOUDestinationPort, config_parse_ip_port, 0, offsetof(Tunnel, fou_destination_port) +Tunnel.FOUSourcePort, config_parse_ip_port, 0, offsetof(Tunnel, encap_src_port) +Tunnel.Encapsulation, config_parse_fou_encap_type, 0, offsetof(Tunnel, fou_encap_type) +Tunnel.IPv6RapidDeploymentPrefix, config_parse_6rd_prefix, 0, 0 +Tunnel.ERSPANVersion, config_parse_erspan_version, 0, offsetof(Tunnel, erspan_version) +Tunnel.ERSPANIndex, config_parse_erspan_index, 0, offsetof(Tunnel, erspan_index) +Tunnel.ERSPANDirection, config_parse_erspan_direction, 0, offsetof(Tunnel, erspan_direction) +Tunnel.ERSPANHardwareId, config_parse_erspan_hwid, 0, offsetof(Tunnel, erspan_hwid) +Tunnel.SerializeTunneledPackets, config_parse_tristate, 0, offsetof(Tunnel, gre_erspan_sequence) +Tunnel.ISATAP, config_parse_tristate, 0, offsetof(Tunnel, isatap) +Tunnel.External, config_parse_bool, 0, offsetof(Tunnel, external) +FooOverUDP.Protocol, config_parse_ip_protocol, 0, offsetof(FouTunnel, fou_protocol) +FooOverUDP.Encapsulation, config_parse_fou_encap_type, 0, offsetof(FouTunnel, fou_encap_type) +FooOverUDP.Port, config_parse_ip_port, 0, offsetof(FouTunnel, port) +FooOverUDP.PeerPort, config_parse_ip_port, 0, offsetof(FouTunnel, peer_port) +FooOverUDP.Local, config_parse_fou_tunnel_address, 0, offsetof(FouTunnel, local) +FooOverUDP.Peer, config_parse_fou_tunnel_address, 0, offsetof(FouTunnel, peer) +L2TP.TunnelId, config_parse_l2tp_tunnel_id, 0, offsetof(L2tpTunnel, tunnel_id) +L2TP.PeerTunnelId, config_parse_l2tp_tunnel_id, 0, offsetof(L2tpTunnel, peer_tunnel_id) 
+L2TP.UDPSourcePort, config_parse_ip_port, 0, offsetof(L2tpTunnel, l2tp_udp_sport) +L2TP.UDPDestinationPort, config_parse_ip_port, 0, offsetof(L2tpTunnel, l2tp_udp_dport) +L2TP.Local, config_parse_l2tp_tunnel_local_address, 0, 0 +L2TP.Remote, config_parse_l2tp_tunnel_remote_address, 0, 0 +L2TP.EncapsulationType, config_parse_l2tp_encap_type, 0, offsetof(L2tpTunnel, l2tp_encap_type) +L2TP.UDPCheckSum, config_parse_bool, 0, offsetof(L2tpTunnel, udp_csum) +L2TP.UDP6CheckSumRx, config_parse_bool, 0, offsetof(L2tpTunnel, udp6_csum_rx) +L2TP.UDP6CheckSumTx, config_parse_bool, 0, offsetof(L2tpTunnel, udp6_csum_tx) +L2TPSession.SessionId, config_parse_l2tp_session_id, 0, 0 +L2TPSession.PeerSessionId, config_parse_l2tp_session_id, 0, 0 +L2TPSession.Layer2SpecificHeader, config_parse_l2tp_session_l2spec, 0, 0 +L2TPSession.Name, config_parse_l2tp_session_name, 0, 0 +Peer.Name, config_parse_ifname, 0, offsetof(Veth, ifname_peer) +Peer.MACAddress, config_parse_netdev_hw_addr, ETH_ALEN, offsetof(Veth, hw_addr_peer) +VXCAN.Peer, config_parse_ifname, 0, offsetof(VxCan, ifname_peer) +VXLAN.VNI, config_parse_uint32, 0, offsetof(VxLan, vni) +VXLAN.Id, config_parse_uint32, 0, offsetof(VxLan, vni) /* deprecated */ +VXLAN.Group, config_parse_vxlan_address, 0, offsetof(VxLan, group) +VXLAN.Local, config_parse_vxlan_address, 0, offsetof(VxLan, local) +VXLAN.Remote, config_parse_vxlan_address, 0, offsetof(VxLan, remote) +VXLAN.TOS, config_parse_unsigned, 0, offsetof(VxLan, tos) +VXLAN.TTL, config_parse_vxlan_ttl, 0, offsetof(VxLan, ttl) +VXLAN.MacLearning, config_parse_bool, 0, offsetof(VxLan, learning) +VXLAN.ARPProxy, config_parse_bool, 0, offsetof(VxLan, arp_proxy) +VXLAN.ReduceARPProxy, config_parse_bool, 0, offsetof(VxLan, arp_proxy) +VXLAN.L2MissNotification, config_parse_bool, 0, offsetof(VxLan, l2miss) +VXLAN.L3MissNotification, config_parse_bool, 0, offsetof(VxLan, l3miss) +VXLAN.RouteShortCircuit, config_parse_bool, 0, offsetof(VxLan, route_short_circuit) +VXLAN.UDPCheckSum, 
config_parse_bool, 0, offsetof(VxLan, udpcsum) +VXLAN.UDPChecksum, config_parse_bool, 0, offsetof(VxLan, udpcsum) +VXLAN.UDP6ZeroCheckSumRx, config_parse_bool, 0, offsetof(VxLan, udp6zerocsumrx) +VXLAN.UDP6ZeroChecksumRx, config_parse_bool, 0, offsetof(VxLan, udp6zerocsumrx) +VXLAN.UDP6ZeroCheckSumTx, config_parse_bool, 0, offsetof(VxLan, udp6zerocsumtx) +VXLAN.UDP6ZeroChecksumTx, config_parse_bool, 0, offsetof(VxLan, udp6zerocsumtx) +VXLAN.RemoteChecksumTx, config_parse_bool, 0, offsetof(VxLan, remote_csum_tx) +VXLAN.RemoteChecksumRx, config_parse_bool, 0, offsetof(VxLan, remote_csum_rx) +VXLAN.FDBAgeingSec, config_parse_sec, 0, offsetof(VxLan, fdb_ageing) +VXLAN.GroupPolicyExtension, config_parse_bool, 0, offsetof(VxLan, group_policy) +VXLAN.GenericProtocolExtension, config_parse_bool, 0, offsetof(VxLan, generic_protocol_extension) +VXLAN.MaximumFDBEntries, config_parse_unsigned, 0, offsetof(VxLan, max_fdb) +VXLAN.PortRange, config_parse_port_range, 0, 0 +VXLAN.DestinationPort, config_parse_ip_port, 0, offsetof(VxLan, dest_port) +VXLAN.FlowLabel, config_parse_flow_label, 0, 0 +VXLAN.IPDoNotFragment, config_parse_df, 0, offsetof(VxLan, df) +VXLAN.Independent, config_parse_bool, 0, offsetof(VxLan, independent) +GENEVE.Id, config_parse_geneve_vni, 0, offsetof(Geneve, id) +GENEVE.Remote, config_parse_geneve_address, 0, offsetof(Geneve, remote) +GENEVE.TOS, config_parse_uint8, 0, offsetof(Geneve, tos) +GENEVE.TTL, config_parse_geneve_ttl, 0, offsetof(Geneve, ttl) +GENEVE.UDPChecksum, config_parse_bool, 0, offsetof(Geneve, udpcsum) +GENEVE.UDP6ZeroCheckSumRx, config_parse_bool, 0, offsetof(Geneve, udp6zerocsumrx) +GENEVE.UDP6ZeroChecksumRx, config_parse_bool, 0, offsetof(Geneve, udp6zerocsumrx) +GENEVE.UDP6ZeroCheckSumTx, config_parse_bool, 0, offsetof(Geneve, udp6zerocsumtx) +GENEVE.UDP6ZeroChecksumTx, config_parse_bool, 0, offsetof(Geneve, udp6zerocsumtx) +GENEVE.DestinationPort, config_parse_ip_port, 0, offsetof(Geneve, dest_port) +GENEVE.IPDoNotFragment, 
config_parse_geneve_df, 0, offsetof(Geneve, geneve_df) +GENEVE.FlowLabel, config_parse_geneve_flow_label, 0, 0 +GENEVE.InheritInnerProtocol, config_parse_bool, 0, offsetof(Geneve, inherit_inner_protocol) +MACsec.Port, config_parse_macsec_port, 0, 0 +MACsec.Encrypt, config_parse_tristate, 0, offsetof(MACsec, encrypt) +MACsecReceiveChannel.Port, config_parse_macsec_port, 0, 0 +MACsecReceiveChannel.MACAddress, config_parse_macsec_hw_address, 0, 0 +MACsecTransmitAssociation.PacketNumber, config_parse_macsec_packet_number, 0, 0 +MACsecTransmitAssociation.KeyId, config_parse_macsec_key_id, 0, 0 +MACsecTransmitAssociation.Key, config_parse_macsec_key, 0, 0 +MACsecTransmitAssociation.KeyFile, config_parse_macsec_key_file, 0, 0 +MACsecTransmitAssociation.Activate, config_parse_macsec_sa_activate, 0, 0 +MACsecTransmitAssociation.UseForEncoding, config_parse_macsec_use_for_encoding, 0, 0 +MACsecReceiveAssociation.Port, config_parse_macsec_port, 0, 0 +MACsecReceiveAssociation.MACAddress, config_parse_macsec_hw_address, 0, 0 +MACsecReceiveAssociation.PacketNumber, config_parse_macsec_packet_number, 0, 0 +MACsecReceiveAssociation.KeyId, config_parse_macsec_key_id, 0, 0 +MACsecReceiveAssociation.Key, config_parse_macsec_key, 0, 0 +MACsecReceiveAssociation.KeyFile, config_parse_macsec_key_file, 0, 0 +MACsecReceiveAssociation.Activate, config_parse_macsec_sa_activate, 0, 0 +Tun.OneQueue, config_parse_warn_compat, DISABLED_LEGACY, 0 +Tun.MultiQueue, config_parse_bool, 0, offsetof(TunTap, multi_queue) +Tun.PacketInfo, config_parse_bool, 0, offsetof(TunTap, packet_info) +Tun.VNetHeader, config_parse_bool, 0, offsetof(TunTap, vnet_hdr) +Tun.User, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(TunTap, user_name) +Tun.Group, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(TunTap, group_name) +Tun.KeepCarrier, config_parse_bool, 0, offsetof(TunTap, keep_fd) +Tap.OneQueue, config_parse_warn_compat, DISABLED_LEGACY, 0 +Tap.MultiQueue, config_parse_bool, 0, offsetof(TunTap, 
multi_queue) +Tap.PacketInfo, config_parse_bool, 0, offsetof(TunTap, packet_info) +Tap.VNetHeader, config_parse_bool, 0, offsetof(TunTap, vnet_hdr) +Tap.User, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(TunTap, user_name) +Tap.Group, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(TunTap, group_name) +Tap.KeepCarrier, config_parse_bool, 0, offsetof(TunTap, keep_fd) +Bond.Mode, config_parse_bond_mode, 0, offsetof(Bond, mode) +Bond.TransmitHashPolicy, config_parse_bond_xmit_hash_policy, 0, offsetof(Bond, xmit_hash_policy) +Bond.LACPTransmitRate, config_parse_bond_lacp_rate, 0, offsetof(Bond, lacp_rate) +Bond.AdSelect, config_parse_bond_ad_select, 0, offsetof(Bond, ad_select) +Bond.FailOverMACPolicy, config_parse_bond_fail_over_mac, 0, offsetof(Bond, fail_over_mac) +Bond.ARPIPTargets, config_parse_arp_ip_target_address, 0, 0 +Bond.ARPValidate, config_parse_bond_arp_validate, 0, offsetof(Bond, arp_validate) +Bond.ARPAllTargets, config_parse_bond_arp_all_targets, 0, offsetof(Bond, arp_all_targets) +Bond.PrimaryReselectPolicy, config_parse_bond_primary_reselect, 0, offsetof(Bond, primary_reselect) +Bond.ResendIGMP, config_parse_unsigned, 0, offsetof(Bond, resend_igmp) +Bond.PacketsPerSlave, config_parse_unsigned, 0, offsetof(Bond, packets_per_slave) +Bond.GratuitousARP, config_parse_unsigned, 0, offsetof(Bond, num_grat_arp) +Bond.AllSlavesActive, config_parse_bool, 0, offsetof(Bond, all_slaves_active) +Bond.DynamicTransmitLoadBalancing, config_parse_tristate, 0, offsetof(Bond, tlb_dynamic_lb) +Bond.MinLinks, config_parse_unsigned, 0, offsetof(Bond, min_links) +Bond.MIIMonitorSec, config_parse_sec, 0, offsetof(Bond, miimon) +Bond.UpDelaySec, config_parse_sec, 0, offsetof(Bond, updelay) +Bond.DownDelaySec, config_parse_sec, 0, offsetof(Bond, downdelay) +Bond.ARPIntervalSec, config_parse_sec, 0, offsetof(Bond, arp_interval) +Bond.LearnPacketIntervalSec, config_parse_sec, 0, offsetof(Bond, lp_interval) +Bond.AdActorSystemPriority, 
config_parse_ad_actor_sys_prio, 0, offsetof(Bond, ad_actor_sys_prio) +Bond.AdUserPortKey, config_parse_ad_user_port_key, 0, offsetof(Bond, ad_user_port_key) +Bond.AdActorSystem, config_parse_ad_actor_system, 0, offsetof(Bond, ad_actor_system) +Bridge.HelloTimeSec, config_parse_sec, 0, offsetof(Bridge, hello_time) +Bridge.MaxAgeSec, config_parse_sec, 0, offsetof(Bridge, max_age) +Bridge.AgeingTimeSec, config_parse_sec, 0, offsetof(Bridge, ageing_time) +Bridge.ForwardDelaySec, config_parse_sec, 0, offsetof(Bridge, forward_delay) +Bridge.Priority, config_parse_uint16, 0, offsetof(Bridge, priority) +Bridge.GroupForwardMask, config_parse_uint16, 0, offsetof(Bridge, group_fwd_mask) +Bridge.DefaultPVID, config_parse_default_port_vlanid, 0, offsetof(Bridge, default_pvid) +Bridge.MulticastQuerier, config_parse_tristate, 0, offsetof(Bridge, mcast_querier) +Bridge.MulticastSnooping, config_parse_tristate, 0, offsetof(Bridge, mcast_snooping) +Bridge.VLANFiltering, config_parse_tristate, 0, offsetof(Bridge, vlan_filtering) +Bridge.VLANProtocol, config_parse_vlanprotocol, 0, offsetof(Bridge, vlan_protocol) +Bridge.STP, config_parse_tristate, 0, offsetof(Bridge, stp) +Bridge.MulticastIGMPVersion, config_parse_uint8, 0, offsetof(Bridge, igmp_version) +VRF.TableId, config_parse_uint32, 0, offsetof(Vrf, table) /* deprecated */ +VRF.Table, config_parse_uint32, 0, offsetof(Vrf, table) +BareUDP.DestinationPort, config_parse_ip_port, 0, offsetof(BareUDP, dest_port) +BareUDP.EtherType, config_parse_bare_udp_iftype, 0, offsetof(BareUDP, iftype) +WireGuard.FirewallMark, config_parse_unsigned, 0, offsetof(Wireguard, fwmark) +WireGuard.FwMark, config_parse_unsigned, 0, offsetof(Wireguard, fwmark) /* deprecated */ +WireGuard.ListenPort, config_parse_wireguard_listen_port, 0, offsetof(Wireguard, port) +WireGuard.PrivateKey, config_parse_wireguard_private_key, 0, 0 +WireGuard.PrivateKeyFile, config_parse_wireguard_private_key_file, 0, 0 +WireGuard.RouteTable, config_parse_wireguard_route_table, 
0, offsetof(Wireguard, route_table) +WireGuard.RouteMetric, config_parse_wireguard_route_priority, 0, offsetof(Wireguard, route_priority) +WireGuardPeer.AllowedIPs, config_parse_wireguard_allowed_ips, 0, 0 +WireGuardPeer.Endpoint, config_parse_wireguard_endpoint, 0, 0 +WireGuardPeer.PublicKey, config_parse_wireguard_peer_key, 0, 0 +WireGuardPeer.PresharedKey, config_parse_wireguard_peer_key, 0, 0 +WireGuardPeer.PresharedKeyFile, config_parse_wireguard_preshared_key_file, 0, 0 +WireGuardPeer.PersistentKeepalive, config_parse_wireguard_keepalive, 0, 0 +WireGuardPeer.RouteTable, config_parse_wireguard_peer_route_table, 0, 0 +WireGuardPeer.RouteMetric, config_parse_wireguard_peer_route_priority,0, 0 +Xfrm.InterfaceId, config_parse_uint32, 0, offsetof(Xfrm, if_id) +Xfrm.Independent, config_parse_bool, 0, offsetof(Xfrm, independent) +BatmanAdvanced.Aggregation, config_parse_bool, 0, offsetof(BatmanAdvanced, aggregation) +BatmanAdvanced.BridgeLoopAvoidance, config_parse_bool, 0, offsetof(BatmanAdvanced, bridge_loop_avoidance) +BatmanAdvanced.DistributedArpTable, config_parse_bool, 0, offsetof(BatmanAdvanced, distributed_arp_table) +BatmanAdvanced.Fragmentation, config_parse_bool, 0, offsetof(BatmanAdvanced, fragmentation) +BatmanAdvanced.GatewayMode, config_parse_batadv_gateway_mode, 0, offsetof(BatmanAdvanced, gateway_mode) +BatmanAdvanced.GatewayBandwithDown, config_parse_badadv_bandwidth, 0, offsetof(BatmanAdvanced, gateway_bandwidth_down) +BatmanAdvanced.GatewayBandwithUp, config_parse_badadv_bandwidth, 0, offsetof(BatmanAdvanced, gateway_bandwidth_up) +BatmanAdvanced.GatewayBandwidthDown, config_parse_badadv_bandwidth, 0, offsetof(BatmanAdvanced, gateway_bandwidth_down) +BatmanAdvanced.GatewayBandwidthUp, config_parse_badadv_bandwidth, 0, offsetof(BatmanAdvanced, gateway_bandwidth_up) +BatmanAdvanced.HopPenalty, config_parse_uint8, 0, offsetof(BatmanAdvanced, hop_penalty) +BatmanAdvanced.OriginatorIntervalSec, config_parse_sec, 0, offsetof(BatmanAdvanced, 
originator_interval) +BatmanAdvanced.RoutingAlgorithm, config_parse_batadv_routing_algorithm, 0, offsetof(BatmanAdvanced, routing_algorithm) +IPoIB.PartitionKey, config_parse_ipoib_pkey, 0, offsetof(IPoIB, pkey) +IPoIB.Mode, config_parse_ipoib_mode, 0, offsetof(IPoIB, mode) +IPoIB.IgnoreUserspaceMulticastGroups, config_parse_tristate, 0, offsetof(IPoIB, umcast) +WLAN.PhysicalDevice, config_parse_wiphy, 0, 0 +WLAN.Type, config_parse_wlan_iftype, 0, offsetof(WLan, iftype) +WLAN.WDS, config_parse_tristate, 0, offsetof(WLan, wds) diff --git a/src/network/netdev/netdev-util.c b/src/network/netdev/netdev-util.c new file mode 100644 index 0000000..6229992 --- /dev/null +++ b/src/network/netdev/netdev-util.c @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "netdev-util.h" +#include "networkd-address.h" +#include "networkd-link.h" +#include "string-table.h" + +static const char * const netdev_local_address_type_table[_NETDEV_LOCAL_ADDRESS_TYPE_MAX] = { + [NETDEV_LOCAL_ADDRESS_IPV4LL] = "ipv4_link_local", + [NETDEV_LOCAL_ADDRESS_IPV6LL] = "ipv6_link_local", + [NETDEV_LOCAL_ADDRESS_DHCP4] = "dhcp4", + [NETDEV_LOCAL_ADDRESS_DHCP6] = "dhcp6", + [NETDEV_LOCAL_ADDRESS_SLAAC] = "slaac", +}; + +DEFINE_STRING_TABLE_LOOKUP(netdev_local_address_type, NetDevLocalAddressType); + +/* Picks the first ready address on 'link' that matches the requested local address type. The type fixes the address family; 'family' must be AF_UNSPEC or agree with it (asserted). On a match, returns 1 and stores the family and address in ret_family/ret_address (each may be NULL). Returns -EBUSY if link_is_ready_to_configure() reports that the link is not ready, and -ENXIO if no address matches. */ int link_get_local_address( + Link *link, + NetDevLocalAddressType type, + int family, + int *ret_family, + union in_addr_union *ret_address) { + + Address *a; + + assert(link); + + /* Derive the address family implied by the requested type. */ switch (type) { + case NETDEV_LOCAL_ADDRESS_IPV4LL: + assert(IN_SET(family, AF_UNSPEC, AF_INET)); + family = AF_INET; + break; + case NETDEV_LOCAL_ADDRESS_IPV6LL: + assert(IN_SET(family, AF_UNSPEC, AF_INET6)); + family = AF_INET6; + break; + case NETDEV_LOCAL_ADDRESS_DHCP4: + assert(IN_SET(family, AF_UNSPEC, AF_INET)); + family = AF_INET; + break; + case NETDEV_LOCAL_ADDRESS_DHCP6: + assert(IN_SET(family, AF_UNSPEC, AF_INET6)); + family = AF_INET6; + break; + case NETDEV_LOCAL_ADDRESS_SLAAC: + 
assert(IN_SET(family, AF_UNSPEC, AF_INET6)); + family = AF_INET6; + break; + default: + assert_not_reached(); + } + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return -EBUSY; + + SET_FOREACH(a, link->addresses) { + if (!address_is_ready(a)) + continue; + + if (a->family != family) + continue; + + /* Skip addresses that have a peer address set. */ if (in_addr_is_set(a->family, &a->in_addr_peer)) + continue; + + switch (type) { + case NETDEV_LOCAL_ADDRESS_IPV4LL: + if (a->source != NETWORK_CONFIG_SOURCE_IPV4LL) + continue; + break; + case NETDEV_LOCAL_ADDRESS_IPV6LL: + /* Matched by the address itself being link-local, not by its configuration source. */ if (!in6_addr_is_link_local(&a->in_addr.in6)) + continue; + break; + case NETDEV_LOCAL_ADDRESS_DHCP4: + if (a->source != NETWORK_CONFIG_SOURCE_DHCP4) + continue; + break; + case NETDEV_LOCAL_ADDRESS_DHCP6: + if (a->source != NETWORK_CONFIG_SOURCE_DHCP6) + continue; + break; + case NETDEV_LOCAL_ADDRESS_SLAAC: + if (a->source != NETWORK_CONFIG_SOURCE_NDISC) + continue; + break; + default: + assert_not_reached(); + } + + if (ret_family) + *ret_family = a->family; + if (ret_address) + *ret_address = a->in_addr; + return 1; + } + + return -ENXIO; +} diff --git a/src/network/netdev/netdev-util.h b/src/network/netdev/netdev-util.h new file mode 100644 index 0000000..02b07e3 --- /dev/null +++ b/src/network/netdev/netdev-util.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "in-addr-util.h" +#include "macro.h" + +typedef struct Link Link; + +typedef enum NetDevLocalAddressType { + NETDEV_LOCAL_ADDRESS_IPV4LL, + NETDEV_LOCAL_ADDRESS_IPV6LL, + NETDEV_LOCAL_ADDRESS_DHCP4, + NETDEV_LOCAL_ADDRESS_DHCP6, + NETDEV_LOCAL_ADDRESS_SLAAC, + _NETDEV_LOCAL_ADDRESS_TYPE_MAX, + _NETDEV_LOCAL_ADDRESS_TYPE_INVALID = -EINVAL, +} NetDevLocalAddressType; + +const char *netdev_local_address_type_to_string(NetDevLocalAddressType t) _const_; +NetDevLocalAddressType netdev_local_address_type_from_string(const char *s) _pure_; + +int link_get_local_address( + Link *link, + NetDevLocalAddressType type, + int 
family, + int *ret_family, + union in_addr_union *ret_address); diff --git a/src/network/netdev/netdev.c b/src/network/netdev/netdev.c new file mode 100644 index 0000000..57127a8 --- /dev/null +++ b/src/network/netdev/netdev.c @@ -0,0 +1,957 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "arphrd-util.h" +#include "bareudp.h" +#include "batadv.h" +#include "bond.h" +#include "bridge.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "dummy.h" +#include "fd-util.h" +#include "fou-tunnel.h" +#include "geneve.h" +#include "ifb.h" +#include "ipoib.h" +#include "ipvlan.h" +#include "l2tp-tunnel.h" +#include "list.h" +#include "macsec.h" +#include "macvlan.h" +#include "netdev.h" +#include "netdevsim.h" +#include "netif-util.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "networkd-setlink.h" +#include "networkd-sriov.h" +#include "nlmon.h" +#include "path-lookup.h" +#include "siphash24.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tunnel.h" +#include "tuntap.h" +#include "vcan.h" +#include "veth.h" +#include "vlan.h" +#include "vrf.h" +#include "vxcan.h" +#include "vxlan.h" +#include "wireguard.h" +#include "wlan.h" +#include "xfrm.h" + +const NetDevVTable * const netdev_vtable[_NETDEV_KIND_MAX] = { + [NETDEV_KIND_BAREUDP] = &bare_udp_vtable, + [NETDEV_KIND_BATADV] = &batadv_vtable, + [NETDEV_KIND_BOND] = &bond_vtable, + [NETDEV_KIND_BRIDGE] = &bridge_vtable, + [NETDEV_KIND_DUMMY] = &dummy_vtable, + [NETDEV_KIND_ERSPAN] = &erspan_vtable, + [NETDEV_KIND_FOU] = &foutnl_vtable, + [NETDEV_KIND_GENEVE] = &geneve_vtable, + [NETDEV_KIND_GRE] = &gre_vtable, + [NETDEV_KIND_GRETAP] = &gretap_vtable, + [NETDEV_KIND_IFB] = &ifb_vtable, + [NETDEV_KIND_IP6GRE] = &ip6gre_vtable, + [NETDEV_KIND_IP6GRETAP] = &ip6gretap_vtable, + [NETDEV_KIND_IP6TNL] = &ip6tnl_vtable, + 
[NETDEV_KIND_IPIP] = &ipip_vtable, + [NETDEV_KIND_IPOIB] = &ipoib_vtable, + [NETDEV_KIND_IPVLAN] = &ipvlan_vtable, + [NETDEV_KIND_IPVTAP] = &ipvtap_vtable, + [NETDEV_KIND_L2TP] = &l2tptnl_vtable, + [NETDEV_KIND_MACSEC] = &macsec_vtable, + [NETDEV_KIND_MACVLAN] = &macvlan_vtable, + [NETDEV_KIND_MACVTAP] = &macvtap_vtable, + [NETDEV_KIND_NETDEVSIM] = &netdevsim_vtable, + [NETDEV_KIND_NLMON] = &nlmon_vtable, + [NETDEV_KIND_SIT] = &sit_vtable, + [NETDEV_KIND_TAP] = &tap_vtable, + [NETDEV_KIND_TUN] = &tun_vtable, + [NETDEV_KIND_VCAN] = &vcan_vtable, + [NETDEV_KIND_VETH] = &veth_vtable, + [NETDEV_KIND_VLAN] = &vlan_vtable, + [NETDEV_KIND_VRF] = &vrf_vtable, + [NETDEV_KIND_VTI6] = &vti6_vtable, + [NETDEV_KIND_VTI] = &vti_vtable, + [NETDEV_KIND_VXCAN] = &vxcan_vtable, + [NETDEV_KIND_VXLAN] = &vxlan_vtable, + [NETDEV_KIND_WIREGUARD] = &wireguard_vtable, + [NETDEV_KIND_WLAN] = &wlan_vtable, + [NETDEV_KIND_XFRM] = &xfrm_vtable, +}; + +static const char* const netdev_kind_table[_NETDEV_KIND_MAX] = { + [NETDEV_KIND_BAREUDP] = "bareudp", + [NETDEV_KIND_BATADV] = "batadv", + [NETDEV_KIND_BOND] = "bond", + [NETDEV_KIND_BRIDGE] = "bridge", + [NETDEV_KIND_DUMMY] = "dummy", + [NETDEV_KIND_ERSPAN] = "erspan", + [NETDEV_KIND_FOU] = "fou", + [NETDEV_KIND_GENEVE] = "geneve", + [NETDEV_KIND_GRE] = "gre", + [NETDEV_KIND_GRETAP] = "gretap", + [NETDEV_KIND_IFB] = "ifb", + [NETDEV_KIND_IP6GRE] = "ip6gre", + [NETDEV_KIND_IP6GRETAP] = "ip6gretap", + [NETDEV_KIND_IP6TNL] = "ip6tnl", + [NETDEV_KIND_IPIP] = "ipip", + [NETDEV_KIND_IPOIB] = "ipoib", + [NETDEV_KIND_IPVLAN] = "ipvlan", + [NETDEV_KIND_IPVTAP] = "ipvtap", + [NETDEV_KIND_L2TP] = "l2tp", + [NETDEV_KIND_MACSEC] = "macsec", + [NETDEV_KIND_MACVLAN] = "macvlan", + [NETDEV_KIND_MACVTAP] = "macvtap", + [NETDEV_KIND_NETDEVSIM] = "netdevsim", + [NETDEV_KIND_NLMON] = "nlmon", + [NETDEV_KIND_SIT] = "sit", + [NETDEV_KIND_TAP] = "tap", + [NETDEV_KIND_TUN] = "tun", + [NETDEV_KIND_VCAN] = "vcan", + [NETDEV_KIND_VETH] = "veth", + [NETDEV_KIND_VLAN] = 
"vlan", + [NETDEV_KIND_VRF] = "vrf", + [NETDEV_KIND_VTI6] = "vti6", + [NETDEV_KIND_VTI] = "vti", + [NETDEV_KIND_VXCAN] = "vxcan", + [NETDEV_KIND_VXLAN] = "vxlan", + [NETDEV_KIND_WIREGUARD] = "wireguard", + [NETDEV_KIND_WLAN] = "wlan", + [NETDEV_KIND_XFRM] = "xfrm", +}; + +DEFINE_STRING_TABLE_LOOKUP(netdev_kind, NetDevKind); + +bool netdev_is_managed(NetDev *netdev) { + if (!netdev || !netdev->manager || !netdev->ifname) + return false; + + return hashmap_get(netdev->manager->netdevs, netdev->ifname) == netdev; +} + +static bool netdev_is_stacked_and_independent(NetDev *netdev) { + assert(netdev); + + if (netdev_get_create_type(netdev) != NETDEV_CREATE_STACKED) + return false; + + switch (netdev->kind) { + case NETDEV_KIND_ERSPAN: + return ERSPAN(netdev)->independent; + case NETDEV_KIND_GRE: + return GRE(netdev)->independent; + case NETDEV_KIND_GRETAP: + return GRETAP(netdev)->independent; + case NETDEV_KIND_IP6GRE: + return IP6GRE(netdev)->independent; + case NETDEV_KIND_IP6GRETAP: + return IP6GRETAP(netdev)->independent; + case NETDEV_KIND_IP6TNL: + return IP6TNL(netdev)->independent; + case NETDEV_KIND_IPIP: + return IPIP(netdev)->independent; + case NETDEV_KIND_SIT: + return SIT(netdev)->independent; + case NETDEV_KIND_VTI: + return VTI(netdev)->independent; + case NETDEV_KIND_VTI6: + return VTI6(netdev)->independent; + case NETDEV_KIND_VXLAN: + return VXLAN(netdev)->independent; + case NETDEV_KIND_XFRM: + return XFRM(netdev)->independent; + default: + return false; + } +} + +static bool netdev_is_stacked(NetDev *netdev) { + assert(netdev); + + if (netdev_get_create_type(netdev) != NETDEV_CREATE_STACKED) + return false; + + if (netdev_is_stacked_and_independent(netdev)) + return false; + + return true; +} + +static void netdev_detach_from_manager(NetDev *netdev) { + if (netdev->ifname && netdev->manager) + hashmap_remove(netdev->manager->netdevs, netdev->ifname); +} + +static NetDev *netdev_free(NetDev *netdev) { + assert(netdev); + + 
netdev_detach_from_manager(netdev); + + free(netdev->filename); + + free(netdev->description); + free(netdev->ifname); + condition_free_list(netdev->conditions); + + /* Invoke the per-kind done() destructor, but only if the state field is initialized. We conditionalize that + * because we parse .netdev files twice: once to determine the kind (with a short, minimal NetDev structure + * allocation, with no room for per-kind fields), and once to read the kind's properties (with a full, + * comprehensive NetDev structure allocation with enough space for whatever the specific kind needs). Now, in + * the first case we shouldn't try to destruct the per-kind NetDev fields on destruction, in the second case we + * should. We use the state field to discern the two cases: it's _NETDEV_STATE_INVALID on the first "raw" + * call. */ + if (netdev->state != _NETDEV_STATE_INVALID && + NETDEV_VTABLE(netdev) && + NETDEV_VTABLE(netdev)->done) + NETDEV_VTABLE(netdev)->done(netdev); + + return mfree(netdev); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(NetDev, netdev, netdev_free); + +void netdev_drop(NetDev *netdev) { + if (!netdev) + return; + + if (netdev_is_stacked(netdev)) { + /* The netdev may be removed due to the underlying device removal, and the device may + * be re-added later. 
*/ + netdev->state = NETDEV_STATE_LOADING; + netdev->ifindex = 0; + + log_netdev_debug(netdev, "netdev removed"); + return; + } + + if (NETDEV_VTABLE(netdev) && NETDEV_VTABLE(netdev)->drop) + NETDEV_VTABLE(netdev)->drop(netdev); + + netdev->state = NETDEV_STATE_LINGER; + + log_netdev_debug(netdev, "netdev removed"); + + netdev_detach_from_manager(netdev); + netdev_unref(netdev); + return; +} + +int netdev_get(Manager *manager, const char *name, NetDev **ret) { + NetDev *netdev; + + assert(manager); + assert(name); + assert(ret); + + netdev = hashmap_get(manager->netdevs, name); + if (!netdev) + return -ENOENT; + + *ret = netdev; + + return 0; +} + +void netdev_enter_failed(NetDev *netdev) { + netdev->state = NETDEV_STATE_FAILED; +} + +static int netdev_enter_ready(NetDev *netdev) { + assert(netdev); + assert(netdev->ifname); + + if (netdev->state != NETDEV_STATE_CREATING) + return 0; + + netdev->state = NETDEV_STATE_READY; + + log_netdev_info(netdev, "netdev ready"); + + if (NETDEV_VTABLE(netdev)->post_create) + NETDEV_VTABLE(netdev)->post_create(netdev, NULL); + + return 0; +} + +/* callback for netdev's created without a backing Link */ +static int netdev_create_handler(sd_netlink *rtnl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST) + log_netdev_info(netdev, "netdev exists, using existing without changing its parameters"); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, "netdev could not be created: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "Created"); + + return 1; +} + +int netdev_set_ifindex(NetDev *netdev, sd_netlink_message *message) { + uint16_t type; + const char *kind; + const char *received_kind; + const char *received_name; + int r, ifindex; + + assert(netdev); + assert(message); + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) + return 
log_netdev_error_errno(netdev, r, "Could not get rtnl message type: %m"); + + if (type != RTM_NEWLINK) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), "Cannot set ifindex from unexpected rtnl message type."); + + r = sd_rtnl_message_link_get_ifindex(message, &ifindex); + if (r < 0) { + log_netdev_error_errno(netdev, r, "Could not get ifindex: %m"); + netdev_enter_failed(netdev); + return r; + } else if (ifindex <= 0) { + log_netdev_error(netdev, "Got invalid ifindex: %d", ifindex); + netdev_enter_failed(netdev); + return -EINVAL; + } + + if (netdev->ifindex > 0) { + if (netdev->ifindex != ifindex) { + log_netdev_error(netdev, "Could not set ifindex to %d, already set to %d", + ifindex, netdev->ifindex); + netdev_enter_failed(netdev); + return -EEXIST; + } else + /* ifindex already set to the same for this netdev */ + return 0; + } + + r = sd_netlink_message_read_string(message, IFLA_IFNAME, &received_name); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not get IFNAME: %m"); + + if (!streq(netdev->ifname, received_name)) { + log_netdev_error(netdev, "Received newlink with wrong IFNAME %s", received_name); + netdev_enter_failed(netdev); + return -EINVAL; + } + + if (!NETDEV_VTABLE(netdev)->skip_netdev_kind_check) { + + r = sd_netlink_message_enter_container(message, IFLA_LINKINFO); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not get LINKINFO: %m"); + + r = sd_netlink_message_read_string(message, IFLA_INFO_KIND, &received_kind); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not get KIND: %m"); + + r = sd_netlink_message_exit_container(message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not exit container: %m"); + + if (netdev->kind == NETDEV_KIND_TAP) + /* the kernel does not distinguish between tun and tap */ + kind = "tun"; + else { + kind = netdev_kind_to_string(netdev->kind); + if (!kind) { + log_netdev_error(netdev, "Could not get kind"); + netdev_enter_failed(netdev); + 
return -EINVAL; + } + } + + if (!streq(kind, received_kind)) { + log_netdev_error(netdev, "Received newlink with wrong KIND %s, expected %s", + received_kind, kind); + netdev_enter_failed(netdev); + return -EINVAL; + } + } + + netdev->ifindex = ifindex; + + log_netdev_debug(netdev, "netdev has index %d", netdev->ifindex); + + netdev_enter_ready(netdev); + + return 0; +} + +#define HASH_KEY SD_ID128_MAKE(52,e1,45,bd,00,6f,29,96,21,c6,30,6d,83,71,04,48) + +int netdev_generate_hw_addr( + NetDev *netdev, + Link *parent, + const char *name, + const struct hw_addr_data *hw_addr, + struct hw_addr_data *ret) { + + struct hw_addr_data a = HW_ADDR_NULL; + bool is_static = false; + int r; + + assert(netdev); + assert(name); + assert(hw_addr); + assert(ret); + + if (hw_addr_equal(hw_addr, &HW_ADDR_NONE)) { + *ret = HW_ADDR_NULL; + return 0; + } + + if (hw_addr->length == 0) { + uint64_t result; + + /* HardwareAddress= is not specified. */ + + if (!NETDEV_VTABLE(netdev)->generate_mac) + goto finalize; + + if (!IN_SET(NETDEV_VTABLE(netdev)->iftype, ARPHRD_ETHER, ARPHRD_INFINIBAND)) + goto finalize; + + r = net_get_unique_predictable_data_from_name(name, &HASH_KEY, &result); + if (r < 0) { + log_netdev_warning_errno(netdev, r, + "Failed to generate persistent MAC address, ignoring: %m"); + goto finalize; + } + + a.length = arphrd_to_hw_addr_len(NETDEV_VTABLE(netdev)->iftype); + + switch (NETDEV_VTABLE(netdev)->iftype) { + case ARPHRD_ETHER: + assert(a.length <= sizeof(result)); + memcpy(a.bytes, &result, a.length); + + if (ether_addr_is_null(&a.ether) || ether_addr_is_broadcast(&a.ether)) { + log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "Failed to generate persistent MAC address, ignoring: %m"); + a = HW_ADDR_NULL; + goto finalize; + } + + break; + case ARPHRD_INFINIBAND: + if (result == 0) { + log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "Failed to generate persistent MAC address: %m"); + goto finalize; + } + + assert(a.length >= sizeof(result)); + 
memzero(a.bytes, a.length - sizeof(result)); + memcpy(a.bytes + a.length - sizeof(result), &result, sizeof(result)); + break; + default: + assert_not_reached(); + } + + } else { + a = *hw_addr; + is_static = true; + } + + r = net_verify_hardware_address(name, is_static, NETDEV_VTABLE(netdev)->iftype, + parent ? &parent->hw_addr : NULL, &a); + if (r < 0) + return r; + +finalize: + *ret = a; + return 0; +} + +static int netdev_create_message(NetDev *netdev, Link *link, sd_netlink_message *m) { + int r; + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, netdev->ifname); + if (r < 0) + return r; + + struct hw_addr_data hw_addr; + r = netdev_generate_hw_addr(netdev, link, netdev->ifname, &netdev->hw_addr, &hw_addr); + if (r < 0) + return r; + + if (hw_addr.length > 0) { + log_netdev_debug(netdev, "Using MAC address: %s", HW_ADDR_TO_STR(&hw_addr)); + r = netlink_message_append_hw_addr(m, IFLA_ADDRESS, &hw_addr); + if (r < 0) + return r; + } + + if (netdev->mtu != 0) { + r = sd_netlink_message_append_u32(m, IFLA_MTU, netdev->mtu); + if (r < 0) + return r; + } + + if (link) { + r = sd_netlink_message_append_u32(m, IFLA_LINK, link->ifindex); + if (r < 0) + return r; + } + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return r; + + if (NETDEV_VTABLE(netdev)->fill_message_create) { + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, netdev_kind_to_string(netdev->kind)); + if (r < 0) + return r; + + r = NETDEV_VTABLE(netdev)->fill_message_create(netdev, link, m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + } else { + r = sd_netlink_message_append_string(m, IFLA_INFO_KIND, netdev_kind_to_string(netdev->kind)); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +static int independent_netdev_create(NetDev *netdev) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + 
assert(netdev); + + /* create netdev */ + if (NETDEV_VTABLE(netdev)->create) { + r = NETDEV_VTABLE(netdev)->create(netdev); + if (r < 0) + return r; + + log_netdev_debug(netdev, "Created"); + return 0; + } + + r = sd_rtnl_message_new_link(netdev->manager->rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return r; + + r = netdev_create_message(netdev, NULL, m); + if (r < 0) + return r; + + r = netlink_call_async(netdev->manager->rtnl, NULL, m, netdev_create_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return r; + + netdev_ref(netdev); + + netdev->state = NETDEV_STATE_CREATING; + log_netdev_debug(netdev, "Creating"); + return 0; +} + +static int stacked_netdev_create(NetDev *netdev, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + assert(netdev->manager); + assert(link); + assert(req); + + r = sd_rtnl_message_new_link(netdev->manager->rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return r; + + r = netdev_create_message(netdev, link, m); + if (r < 0) + return r; + + r = request_call_netlink_async(netdev->manager->rtnl, m, req); + if (r < 0) + return r; + + netdev->state = NETDEV_STATE_CREATING; + log_netdev_debug(netdev, "Creating"); + return 0; +} + +static bool link_is_ready_to_create_stacked_netdev_one(Link *link, bool allow_unmanaged) { + assert(link); + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED, LINK_STATE_UNMANAGED)) + return false; + + if (!link->network) + return allow_unmanaged; + + if (link->set_link_messages > 0) + return false; + + /* If stacked netdevs are created before the underlying interface being activated, then + * the activation policy for the netdevs are ignored. See issue #22593. 
*/ + if (!link->activated) + return false; + + return true; +} + +/* Runs link_is_ready_to_create_stacked_netdev_one() through check_ready_for_all_sr_iov_ports(), with unmanaged links not allowed. */ static bool link_is_ready_to_create_stacked_netdev(Link *link) { + return check_ready_for_all_sr_iov_ports(link, /* allow_unmanaged = */ false, + link_is_ready_to_create_stacked_netdev_one); +} + +/* Returns > 0 when the netdev may be created now and 0 when it is not ready yet: the netdev must be in NETDEV_STATE_LOADING and, if an underlying link is given, that link must be ready. A kind-specific is_ready_to_create() hook, when present, has the final say and its return value is passed through unchanged; callers treat negative values as errors. */ static int netdev_is_ready_to_create(NetDev *netdev, Link *link) { + assert(netdev); + + if (netdev->state != NETDEV_STATE_LOADING) + return false; + + if (link && !link_is_ready_to_create_stacked_netdev(link)) + return false; + + if (NETDEV_VTABLE(netdev)->is_ready_to_create) + return NETDEV_VTABLE(netdev)->is_ready_to_create(netdev, link); + + return true; +} + +/* Request callback for stacked netdevs; 'userdata' is the NetDev. Passes through the <= 0 result of netdev_is_ready_to_create() while the netdev cannot be created, returns a negative errno if stacked_netdev_create() fails, and returns 1 once it succeeded. */ static int stacked_netdev_process_request(Request *req, Link *link, void *userdata) { + NetDev *netdev = ASSERT_PTR(userdata); + int r; + + assert(req); + assert(link); + + r = netdev_is_ready_to_create(netdev, link); + if (r <= 0) + return r; + + r = stacked_netdev_create(netdev, link, req); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to create netdev: %m"); + + return 1; +} + +/* Netlink reply handler for a stacked netdev creation request. Any error other than -EEXIST puts the link into the failed state. Otherwise, once create_stacked_netdev_messages is 0, the link's stacked netdevs are flagged as created and link readiness is re-checked. Always returns 0. */ static int create_stacked_netdev_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not create stacked netdev"); + link_enter_failed(link); + return 0; + } + + if (link->create_stacked_netdev_messages == 0) { + link->stacked_netdevs_created = true; + log_link_debug(link, "Stacked netdevs created."); + link_check_ready(link); + } + + return 0; +} + +/* Queues a request to create the given stacked netdev on top of 'link'. Returns -EINVAL if the netdev is not a (non-independent) stacked one, 0 if nothing new was queued, 1 if a request was queued, and a negative errno if queuing fails. */ int link_request_stacked_netdev(Link *link, NetDev *netdev) { + int r; + + assert(link); + assert(netdev); + + if (!netdev_is_stacked(netdev)) + return -EINVAL; + + if (!IN_SET(netdev->state, NETDEV_STATE_LOADING, NETDEV_STATE_FAILED) || netdev->ifindex > 0) + return 0; /* Already created. 
*/ + + link->stacked_netdevs_created = false; + r = link_queue_request_full(link, REQUEST_TYPE_NETDEV_STACKED, + netdev, (mfree_func_t) netdev_unref, + trivial_hash_func, trivial_compare_func, + stacked_netdev_process_request, + &link->create_stacked_netdev_messages, + create_stacked_netdev_handler, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request stacked netdev '%s': %m", + netdev->ifname); + if (r == 0) + return 0; + + netdev_ref(netdev); + log_link_debug(link, "Requested stacked netdev '%s'", netdev->ifname); + return 1; +} + +static int independent_netdev_process_request(Request *req, Link *link, void *userdata) { + NetDev *netdev = ASSERT_PTR(userdata); + int r; + + assert(!link); + + r = netdev_is_ready_to_create(netdev, NULL); + if (r <= 0) + return r; + + r = independent_netdev_create(netdev); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to create netdev: %m"); + + return 1; +} + +static int netdev_request_to_create(NetDev *netdev) { + int r; + + assert(netdev); + assert(netdev->manager); + + if (netdev->manager->test_mode) + return 0; + + if (netdev_is_stacked(netdev)) + return 0; + + r = netdev_is_ready_to_create(netdev, NULL); + if (r < 0) + return r; + if (r > 0) { + /* If the netdev has no dependency, then create it now. */ + r = independent_netdev_create(netdev); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to create netdev: %m"); + + } else { + /* Otherwise, wait for the dependencies being resolved. 
*/ + r = netdev_queue_request(netdev, independent_netdev_process_request, NULL); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to request to create netdev: %m"); + } + + return 0; +} + +int netdev_load_one(Manager *manager, const char *filename) { + _cleanup_(netdev_unrefp) NetDev *netdev_raw = NULL, *netdev = NULL; + const char *dropin_dirname; + int r; + + assert(manager); + assert(filename); + + r = null_or_empty_path(filename); + if (r < 0) + return log_warning_errno(r, "Failed to check if \"%s\" is empty: %m", filename); + if (r > 0) { + log_debug("Skipping empty file: %s", filename); + return 0; + } + + netdev_raw = new(NetDev, 1); + if (!netdev_raw) + return log_oom(); + + *netdev_raw = (NetDev) { + .n_ref = 1, + .kind = _NETDEV_KIND_INVALID, + .state = _NETDEV_STATE_INVALID, /* an invalid state means done() of the implementation won't be called on destruction */ + }; + + dropin_dirname = strjoina(basename(filename), ".d"); + r = config_parse_many( + STRV_MAKE_CONST(filename), NETWORK_DIRS, dropin_dirname, /* root = */ NULL, + NETDEV_COMMON_SECTIONS NETDEV_OTHER_SECTIONS, + config_item_perf_lookup, network_netdev_gperf_lookup, + CONFIG_PARSE_WARN, + netdev_raw, + NULL, + NULL); + if (r < 0) + return r; /* config_parse_many() logs internally. 
*/ + + /* skip out early if configuration does not match the environment */ + if (!condition_test_list(netdev_raw->conditions, environ, NULL, NULL, NULL)) { + log_debug("%s: Conditions in the file do not match the system environment, skipping.", filename); + return 0; + } + + if (netdev_raw->kind == _NETDEV_KIND_INVALID) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "NetDev has no Kind= configured in \"%s\", ignoring.", filename); + + if (!netdev_raw->ifname) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "NetDev without Name= configured in \"%s\", ignoring.", filename); + + netdev = malloc0(NETDEV_VTABLE(netdev_raw)->object_size); + if (!netdev) + return log_oom(); + + netdev->n_ref = 1; + netdev->manager = manager; + netdev->kind = netdev_raw->kind; + netdev->state = NETDEV_STATE_LOADING; /* we initialize the state here for the first time, + so that done() will be called on destruction */ + + if (NETDEV_VTABLE(netdev)->init) + NETDEV_VTABLE(netdev)->init(netdev); + + r = config_parse_many( + STRV_MAKE_CONST(filename), NETWORK_DIRS, dropin_dirname, /* root = */ NULL, + NETDEV_VTABLE(netdev)->sections, + config_item_perf_lookup, network_netdev_gperf_lookup, + CONFIG_PARSE_WARN, + netdev, NULL, NULL); + if (r < 0) + return r; /* config_parse_many() logs internally. */ + + /* verify configuration */ + if (NETDEV_VTABLE(netdev)->config_verify) { + r = NETDEV_VTABLE(netdev)->config_verify(netdev, filename); + if (r < 0) + return r; /* config_verify() logs internally. 
*/ + } + + netdev->filename = strdup(filename); + if (!netdev->filename) + return log_oom(); + + r = hashmap_ensure_put(&netdev->manager->netdevs, &string_hash_ops, netdev->ifname, netdev); + if (r == -ENOMEM) + return log_oom(); + if (r == -EEXIST) { + NetDev *n = hashmap_get(netdev->manager->netdevs, netdev->ifname); + + assert(n); + if (!streq(netdev->filename, n->filename)) + log_netdev_warning_errno(netdev, r, + "Device was already configured by \"%s\", ignoring %s.", + n->filename, netdev->filename); + + /* Clear ifname before netdev_free() is called. Otherwise, the NetDev object 'n' is + * removed from the hashmap 'manager->netdevs'. */ + netdev->ifname = mfree(netdev->ifname); + return -EEXIST; + } + assert(r > 0); + + log_netdev_debug(netdev, "loaded \"%s\"", netdev_kind_to_string(netdev->kind)); + + r = netdev_request_to_create(netdev); + if (r < 0) + return r; /* netdev_request_to_create() logs internally. */ + + TAKE_PTR(netdev); + return 0; +} + +int netdev_load(Manager *manager, bool reload) { + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(manager); + + if (!reload) + hashmap_clear_with_destructor(manager->netdevs, netdev_unref); + + r = conf_files_list_strv(&files, ".netdev", NULL, 0, NETWORK_DIRS); + if (r < 0) + return log_error_errno(r, "Failed to enumerate netdev files: %m"); + + STRV_FOREACH(f, files) + (void) netdev_load_one(manager, *f); + + return 0; +} + +int config_parse_netdev_kind( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + NetDevKind k, *kind = ASSERT_PTR(data); + + assert(filename); + assert(rvalue); + + k = netdev_kind_from_string(rvalue); + if (k < 0) { + log_syntax(unit, LOG_WARNING, filename, line, k, "Failed to parse netdev kind, ignoring assignment: %s", rvalue); + return 0; + } + + if (*kind != _NETDEV_KIND_INVALID && *kind != k) { + log_syntax(unit, 
LOG_WARNING, filename, line, 0, + "Specified netdev kind is different from the previous value '%s', ignoring assignment: %s", + netdev_kind_to_string(*kind), rvalue); + return 0; + } + + *kind = k; + + return 0; +} + +int config_parse_netdev_hw_addr( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + struct hw_addr_data *hw_addr = ASSERT_PTR(data); + + assert(rvalue); + + if (streq(rvalue, "none")) { + *hw_addr = HW_ADDR_NONE; + return 0; + } + + return config_parse_hw_addr(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} diff --git a/src/network/netdev/netdev.h b/src/network/netdev/netdev.h new file mode 100644 index 0000000..cb8cc8c --- /dev/null +++ b/src/network/netdev/netdev.h @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "ether-addr-util.h" +#include "list.h" +#include "log-link.h" +#include "networkd-link.h" +#include "time-util.h" + +/* Special hardware address value to suppress generating persistent hardware address for the netdev. */ +#define HW_ADDR_NONE ((struct hw_addr_data) { .length = 1, }) + +#define NETDEV_COMMON_SECTIONS "Match\0NetDev\0" +/* This is the list of known sections. We need to ignore them in the initial parsing phase. 
*/ +#define NETDEV_OTHER_SECTIONS \ + "-BareUDP\0" \ + "-BatmanAdvanced\0" \ + "-Bond\0" \ + "-Bridge\0" \ + "-FooOverUDP\0" \ + "-GENEVE\0" \ + "-IPoIB\0" \ + "-IPVLAN\0" \ + "-IPVTAP\0" \ + "-L2TP\0" \ + "-L2TPSession\0" \ + "-MACsec\0" \ + "-MACsecReceiveAssociation\0" \ + "-MACsecReceiveChannel\0" \ + "-MACsecTransmitAssociation\0" \ + "-MACVLAN\0" \ + "-MACVTAP\0" \ + "-Peer\0" \ + "-Tap\0" \ + "-Tun\0" \ + "-Tunnel\0" \ + "-VLAN\0" \ + "-VRF\0" \ + "-VXCAN\0" \ + "-VXLAN\0" \ + "-WLAN\0" \ + "-WireGuard\0" \ + "-WireGuardPeer\0" \ + "-Xfrm\0" + +typedef enum NetDevKind { + NETDEV_KIND_BAREUDP, + NETDEV_KIND_BATADV, + NETDEV_KIND_BOND, + NETDEV_KIND_BRIDGE, + NETDEV_KIND_DUMMY, + NETDEV_KIND_ERSPAN, + NETDEV_KIND_FOU, + NETDEV_KIND_GENEVE, + NETDEV_KIND_GRE, + NETDEV_KIND_GRETAP, + NETDEV_KIND_IFB, + NETDEV_KIND_IP6GRE, + NETDEV_KIND_IP6GRETAP, + NETDEV_KIND_IP6TNL, + NETDEV_KIND_IPIP, + NETDEV_KIND_IPOIB, + NETDEV_KIND_IPVLAN, + NETDEV_KIND_IPVTAP, + NETDEV_KIND_L2TP, + NETDEV_KIND_MACSEC, + NETDEV_KIND_MACVLAN, + NETDEV_KIND_MACVTAP, + NETDEV_KIND_NETDEVSIM, + NETDEV_KIND_NLMON, + NETDEV_KIND_SIT, + NETDEV_KIND_TAP, + NETDEV_KIND_TUN, + NETDEV_KIND_VCAN, + NETDEV_KIND_VETH, + NETDEV_KIND_VLAN, + NETDEV_KIND_VRF, + NETDEV_KIND_VTI, + NETDEV_KIND_VTI6, + NETDEV_KIND_VXCAN, + NETDEV_KIND_VXLAN, + NETDEV_KIND_WIREGUARD, + NETDEV_KIND_WLAN, + NETDEV_KIND_XFRM, + _NETDEV_KIND_MAX, + _NETDEV_KIND_TUNNEL, /* Used by config_parse_stacked_netdev() */ + _NETDEV_KIND_INVALID = -EINVAL, +} NetDevKind; + +typedef enum NetDevState { + NETDEV_STATE_LOADING, + NETDEV_STATE_FAILED, + NETDEV_STATE_CREATING, + NETDEV_STATE_READY, + NETDEV_STATE_LINGER, + _NETDEV_STATE_MAX, + _NETDEV_STATE_INVALID = -EINVAL, +} NetDevState; + +typedef enum NetDevCreateType { + NETDEV_CREATE_INDEPENDENT, + NETDEV_CREATE_STACKED, + _NETDEV_CREATE_MAX, + _NETDEV_CREATE_INVALID = -EINVAL, +} NetDevCreateType; + +typedef struct Manager Manager; +typedef struct Condition Condition; + +typedef struct 
/* Per-kind description of a netdev: static properties plus optional hooks. One instance exists
 * per kind and is looked up through netdev_vtable[] / NETDEV_VTABLE(). */
typedef struct NetDevVTable {
        /* How much memory an object of this netdev kind needs */
        size_t object_size;

        /* Config file sections this netdev kind understands, separated
         * by NUL chars */
        const char *sections;

        /* This should reset all type-specific variables. This should
         * not allocate memory, and is called with zero-initialized
         * data. It should hence only initialize variables that need
         * to be set != 0. */
        void (*init)(NetDev *n);

        /* This is called when the interface is removed. */
        void (*drop)(NetDev *n);

        /* This should free all kind-specific variables. It should be
         * idempotent. */
        void (*done)(NetDev *n);

        /* Fill in the message used to create the netdev */
        int (*fill_message_create)(NetDev *netdev, Link *link, sd_netlink_message *message);

        /* Specifies whether the netdev is independent or stacked, see NetDevCreateType */
        NetDevCreateType create_type;

        /* This is used for stacked netdev. Return true when the underlying link is ready. */
        int (*is_ready_to_create)(NetDev *netdev, Link *link);

        /* Create the netdev, if not done via rtnl */
        int (*create)(NetDev *netdev);

        /* Perform additional configuration after the netdev has been created */
        int (*post_create)(NetDev *netdev, Link *link);

        /* Verify that compulsory configuration options were specified */
        int (*config_verify)(NetDev *netdev, const char *filename);

        /* Expected iftype, e.g. ARPHRD_ETHER. */
        uint16_t iftype;

        /* Generate MAC address when MACAddress= is not specified. */
        bool generate_mac;

        /* When assigning ifindex to the netdev, skip the check whether the netdev kind matches. */
        bool skip_netdev_kind_check;
} NetDevVTable;
/* Like log_netdev_full_errno_zerook(), but for errors that must not be zero: the error expression
 * is evaluated exactly once into a local, checked with ASSERT_NON_ZERO(), and then passed on. The
 * statement expression yields whatever log_netdev_full_errno_zerook() yields. */
#define log_netdev_full_errno(netdev, level, error, ...)                \
        ({                                                              \
                int _error = (error);                                   \
                ASSERT_NON_ZERO(_error);                                \
                log_netdev_full_errno_zerook(netdev, level, _error, __VA_ARGS__); \
        })

/* Log without an error code: 0 is passed as error and the result is explicitly discarded. */
#define log_netdev_full(netdev, level, ...) (void) log_netdev_full_errno_zerook(netdev, level, 0, __VA_ARGS__)

/* Per-level shortcuts for log_netdev_full(). */
#define log_netdev_debug(netdev, ...)   log_netdev_full(netdev, LOG_DEBUG, __VA_ARGS__)
#define log_netdev_info(netdev, ...)    log_netdev_full(netdev, LOG_INFO, __VA_ARGS__)
#define log_netdev_notice(netdev, ...)  log_netdev_full(netdev, LOG_NOTICE, __VA_ARGS__)
#define log_netdev_warning(netdev, ...) log_netdev_full(netdev, LOG_WARNING, __VA_ARGS__)
#define log_netdev_error(netdev, ...)   log_netdev_full(netdev, LOG_ERR, __VA_ARGS__)

/* Per-level shortcuts for log_netdev_full_errno(); the error passed must be non-zero. */
#define log_netdev_debug_errno(netdev, error, ...)   log_netdev_full_errno(netdev, LOG_DEBUG, error, __VA_ARGS__)
#define log_netdev_info_errno(netdev, error, ...)    log_netdev_full_errno(netdev, LOG_INFO, error, __VA_ARGS__)
#define log_netdev_notice_errno(netdev, error, ...)  log_netdev_full_errno(netdev, LOG_NOTICE, error, __VA_ARGS__)
#define log_netdev_warning_errno(netdev, error, ...) log_netdev_full_errno(netdev, LOG_WARNING, error, __VA_ARGS__)
#define log_netdev_error_errno(netdev, error, ...)   log_netdev_full_errno(netdev, LOG_ERR, error, __VA_ARGS__)
"MESSAGE=%s: " fmt, (netdev)->ifname, ##__VA_ARGS__ +#define LOG_NETDEV_INTERFACE(netdev) "INTERFACE=%s", (netdev)->ifname diff --git a/src/network/netdev/netdevsim.c b/src/network/netdev/netdevsim.c new file mode 100644 index 0000000..15d5c13 --- /dev/null +++ b/src/network/netdev/netdevsim.c @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "netdevsim.h" + +const NetDevVTable netdevsim_vtable = { + .object_size = sizeof(NetDevSim), + .sections = NETDEV_COMMON_SECTIONS, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/netdevsim.h b/src/network/netdev/netdevsim.h new file mode 100644 index 0000000..27adc59 --- /dev/null +++ b/src/network/netdev/netdevsim.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct NetDevSim NetDevSim; + +#include "netdev.h" + +struct NetDevSim { + NetDev meta; +}; + +DEFINE_NETDEV_CAST(NETDEVSIM, NetDevSim); +extern const NetDevVTable netdevsim_vtable; diff --git a/src/network/netdev/nlmon.c b/src/network/netdev/nlmon.c new file mode 100644 index 0000000..ff37209 --- /dev/null +++ b/src/network/netdev/nlmon.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "nlmon.h" + +static int netdev_nlmon_verify(NetDev *netdev, const char *filename) { + assert(netdev); + assert(filename); + + if (netdev->hw_addr.length > 0) { + log_netdev_warning(netdev, "%s: MACAddress= is not supported. 
Ignoring", filename); + netdev->hw_addr = HW_ADDR_NULL; + } + + return 0; +} + +const NetDevVTable nlmon_vtable = { + .object_size = sizeof(NLMon), + .sections = NETDEV_COMMON_SECTIONS, + .create_type = NETDEV_CREATE_INDEPENDENT, + .config_verify = netdev_nlmon_verify, + .iftype = ARPHRD_NETLINK, +}; diff --git a/src/network/netdev/nlmon.h b/src/network/netdev/nlmon.h new file mode 100644 index 0000000..edfc504 --- /dev/null +++ b/src/network/netdev/nlmon.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct NLMon NLMon; + +#include "netdev.h" + +struct NLMon { + NetDev meta; +}; + +DEFINE_NETDEV_CAST(NLMON, NLMon); + +extern const NetDevVTable nlmon_vtable; diff --git a/src/network/netdev/tunnel.c b/src/network/netdev/tunnel.c new file mode 100644 index 0000000..db84e7c --- /dev/null +++ b/src/network/netdev/tunnel.c @@ -0,0 +1,1242 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "af-list.h" +#include "conf-parser.h" +#include "hexdecoct.h" +#include "missing_network.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "siphash24.h" +#include "string-table.h" +#include "string-util.h" +#include "tunnel.h" + +#define DEFAULT_IPV6_TTL 64 +#define IP6_FLOWINFO_FLOWLABEL htobe32(0x000FFFFF) +#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40 + +static const char* const ip6tnl_mode_table[_NETDEV_IP6_TNL_MODE_MAX] = { + [NETDEV_IP6_TNL_MODE_IP6IP6] = "ip6ip6", + [NETDEV_IP6_TNL_MODE_IPIP6] = "ipip6", + [NETDEV_IP6_TNL_MODE_ANYIP6] = "any", +}; + +DEFINE_STRING_TABLE_LOOKUP(ip6tnl_mode, Ip6TnlMode); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ip6tnl_mode, ip6tnl_mode, Ip6TnlMode, "Failed to parse ip6 tunnel Mode"); + +#define HASH_KEY SD_ID128_MAKE(74,c4,de,12,f3,d9,41,34,bb,3d,c1,a4,42,93,50,87) + +int dhcp4_pd_create_6rd_tunnel_name(Link *link, char **ret) { + _cleanup_free_ char *ifname_alloc = NULL; + uint8_t 
ipv4masklen, sixrd_prefixlen, *buf, *p; + struct in_addr ipv4address; + struct in6_addr sixrd_prefix; + char ifname[IFNAMSIZ]; + uint64_t result; + size_t sz; + int r; + + assert(link); + assert(link->dhcp_lease); + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &ipv4address); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get DHCPv4 address: %m"); + + r = sd_dhcp_lease_get_6rd(link->dhcp_lease, &ipv4masklen, &sixrd_prefixlen, &sixrd_prefix, NULL, NULL); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get 6rd option: %m"); + + sz = sizeof(uint8_t) * 2 + sizeof(struct in6_addr) + sizeof(struct in_addr); + buf = newa(uint8_t, sz); + p = buf; + p = mempcpy(p, &ipv4masklen, sizeof(uint8_t)); + p = mempcpy(p, &ipv4address, sizeof(struct in_addr)); + p = mempcpy(p, &sixrd_prefixlen, sizeof(uint8_t)); + p = mempcpy(p, &sixrd_prefix, sizeof(struct in6_addr)); + + result = siphash24(buf, sz, HASH_KEY.bytes); + memcpy(ifname, "6rd-", STRLEN("6rd-")); + ifname[STRLEN("6rd-") ] = urlsafe_base64char(result >> 54); + ifname[STRLEN("6rd-") + 1] = urlsafe_base64char(result >> 48); + ifname[STRLEN("6rd-") + 2] = urlsafe_base64char(result >> 42); + ifname[STRLEN("6rd-") + 3] = urlsafe_base64char(result >> 36); + ifname[STRLEN("6rd-") + 4] = urlsafe_base64char(result >> 30); + ifname[STRLEN("6rd-") + 5] = urlsafe_base64char(result >> 24); + ifname[STRLEN("6rd-") + 6] = urlsafe_base64char(result >> 18); + ifname[STRLEN("6rd-") + 7] = urlsafe_base64char(result >> 12); + ifname[STRLEN("6rd-") + 8] = urlsafe_base64char(result >> 6); + ifname[STRLEN("6rd-") + 9] = urlsafe_base64char(result); + ifname[STRLEN("6rd-") + 10] = '\0'; + assert_cc(STRLEN("6rd-") + 10 <= IFNAMSIZ); + + ifname_alloc = strdup(ifname); + if (!ifname_alloc) + return log_oom_debug(); + + *ret = TAKE_PTR(ifname_alloc); + return 0; +} + +static int dhcp4_pd_create_6rd_tunnel_message( + Link *link, + sd_netlink_message *m, + const struct in_addr *ipv4address, + uint8_t ipv4masklen, + 
const struct in6_addr *sixrd_prefix, + uint8_t sixrd_prefixlen) { + int r; + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, link->dhcp4_6rd_tunnel_name); + if (r < 0) + return r; + + /* Two nested containers are opened here and closed again at the end: IFLA_LINKINFO, then IFLA_INFO_DATA for kind sit. */ r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "sit"); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(m, IFLA_IPTUN_LOCAL, ipv4address); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_IPTUN_TTL, 64); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(m, IFLA_IPTUN_6RD_PREFIX, sixrd_prefix); + if (r < 0) + return r; + + /* The prefix lengths are uint8_t here but are appended as u16 attributes. */ r = sd_netlink_message_append_u16(m, IFLA_IPTUN_6RD_PREFIXLEN, sixrd_prefixlen); + if (r < 0) + return r; + + /* The relay prefix is a copy of the lease address passed through in4_addr_mask() with ipv4masklen; the result of the masking call is deliberately ignored. */ struct in_addr relay_prefix = *ipv4address; + (void) in4_addr_mask(&relay_prefix, ipv4masklen); + r = sd_netlink_message_append_u32(m, IFLA_IPTUN_6RD_RELAY_PREFIX, relay_prefix.s_addr); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, ipv4masklen); + if (r < 0) + return r; + + /* Close IFLA_INFO_DATA, then IFLA_LINKINFO. */ r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +/* Reads the DHCPv4 address and the 6rd option from the lease of link, builds an RTM_NEWLINK request for the 6rd tunnel and sends it asynchronously with callback as the reply handler. Returns 0 once the request is queued, otherwise the value returned by the debug log helper for the failing step. */ int dhcp4_pd_create_6rd_tunnel(Link *link, link_netlink_message_handler_t callback) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + uint8_t ipv4masklen, sixrd_prefixlen; + struct in_addr ipv4address; + struct in6_addr sixrd_prefix; + int r; + + assert(link); + assert(link->ifindex > 0); + assert(link->manager); + assert(link->dhcp_lease); + assert(link->dhcp4_6rd_tunnel_name); + assert(callback); + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &ipv4address); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get DHCPv4 address: %m"); + + r = sd_dhcp_lease_get_6rd(link->dhcp_lease, &ipv4masklen, &sixrd_prefixlen, &sixrd_prefix, NULL, NULL); + if (r < 0) + return 
log_link_debug_errno(link, r, "Failed to get 6rd option: %m"); + + r = sd_rtnl_message_new_link(link->manager->rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to create netlink message: %m"); + + r = dhcp4_pd_create_6rd_tunnel_message(link, m, + &ipv4address, ipv4masklen, + &sixrd_prefix, sixrd_prefixlen); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to fill netlink message: %m"); + + r = netlink_call_async(link->manager->rtnl, NULL, m, callback, + link_netlink_destroy_callback, link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not send netlink message: %m"); + + /* Take a reference only after the call was queued successfully. NOTE(review): presumably link_netlink_destroy_callback drops it again; confirm. */ link_ref(link); + + return 0; +} + +/* Resolves the local address of tunnel t. When no dynamic address type is configured (local_type < 0) the stored t->local is copied to *ret, if ret is non-NULL, and 0 is returned. Otherwise the lookup is delegated to link_get_local_address() with the configured type and family. */ static int tunnel_get_local_address(Tunnel *t, Link *link, union in_addr_union *ret) { + assert(t); + + if (t->local_type < 0) { + if (ret) + *ret = t->local; + return 0; + } + + return link_get_local_address(link, t->local_type, t->family, NULL, ret); +} + +/* fill_message_create callback shared by ipip and sit: appends the IFLA_IPTUN_* attributes of the tunnel to m. Returns 0 or a negative value from the first failing step. */ static int netdev_ipip_sit_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(m); + + union in_addr_union local; + Tunnel *t = ASSERT_PTR(netdev)->kind == NETDEV_KIND_IPIP ? IPIP(netdev) : SIT(netdev); + int r; + + if (t->external) { + r = sd_netlink_message_append_flag(m, IFLA_IPTUN_COLLECT_METADATA); + if (r < 0) + return r; + + /* If external mode is enabled, then the following settings should not be appended. */ + return 0; + } + + /* Bind to the underlying link when there is one, else to the loopback interface if that was requested. */ if (link || t->assign_to_loopback) { + r = sd_netlink_message_append_u32(m, IFLA_IPTUN_LINK, link ? 
link->ifindex : LOOPBACK_IFINDEX); + if (r < 0) + return r; + } + + /* Only the IPv4 member of the address unions is sent for ipip and sit. */ r = tunnel_get_local_address(t, link, &local); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not find local address: %m"); + + r = sd_netlink_message_append_in_addr(m, IFLA_IPTUN_LOCAL, &local.in); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(m, IFLA_IPTUN_REMOTE, &t->remote.in); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_IPTUN_TTL, t->ttl); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_IPTUN_PMTUDISC, t->pmtudisc); + if (r < 0) + return r; + + /* FooOverUDP encapsulation: encap type plus source and destination port, both ports converted with htobe16(). */ if (t->fou_tunnel) { + r = sd_netlink_message_append_u16(m, IFLA_IPTUN_ENCAP_TYPE, t->fou_encap_type); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_IPTUN_ENCAP_SPORT, htobe16(t->encap_src_port)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_IPTUN_ENCAP_DPORT, htobe16(t->fou_destination_port)); + if (r < 0) + return r; + } + + /* The 6rd prefix and the ISATAP flag are only sent for sit, never for ipip. */ if (netdev->kind == NETDEV_KIND_SIT) { + if (t->sixrd_prefixlen > 0) { + r = sd_netlink_message_append_in6_addr(m, IFLA_IPTUN_6RD_PREFIX, &t->sixrd_prefix); + if (r < 0) + return r; + + /* u16 is deliberate here, even though we're passing a netmask that can never be + * >128. The kernel is expecting to receive the prefixlen as a u16. 
+ */ + r = sd_netlink_message_append_u16(m, IFLA_IPTUN_6RD_PREFIXLEN, t->sixrd_prefixlen); + if (r < 0) + return r; + } + + /* isatap starts at -1 in netdev_tunnel_init(); a negative value means not configured and the flags attribute is left out. */ if (t->isatap >= 0) { + uint16_t flags = 0; + + SET_FLAG(flags, SIT_ISATAP, t->isatap); + + r = sd_netlink_message_append_u16(m, IFLA_IPTUN_FLAGS, flags); + if (r < 0) + return r; + } + } + + return 0; +} + +/* fill_message_create callback shared by gre, gretap and erspan: appends the IFLA_GRE_* attributes of the tunnel to m. Returns 0 or a negative value from the first failing step. */ static int netdev_gre_erspan_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + union in_addr_union local; + uint32_t ikey = 0; + uint32_t okey = 0; + uint16_t iflags = 0; + uint16_t oflags = 0; + Tunnel *t; + int r; + + assert(netdev); + assert(m); + + /* Pick the cast helper matching the kind; any other kind reaching this callback is a programming error. */ switch (netdev->kind) { + case NETDEV_KIND_GRE: + t = GRE(netdev); + break; + case NETDEV_KIND_ERSPAN: + t = ERSPAN(netdev); + break; + case NETDEV_KIND_GRETAP: + t = GRETAP(netdev); + break; + default: + assert_not_reached(); + } + + if (t->external) { + r = sd_netlink_message_append_flag(m, IFLA_GRE_COLLECT_METADATA); + if (r < 0) + return r; + + /* If external mode is enabled, then the following settings should not be appended. */ + return 0; + } + + if (link || t->assign_to_loopback) { + r = sd_netlink_message_append_u32(m, IFLA_GRE_LINK, link ? 
link->ifindex : LOOPBACK_IFINDEX); + if (r < 0) + return r; + } + + /* ERSPAN: the version is always sent; version 1 adds the index, version 2 adds direction and hardware ID, any other version adds nothing. */ if (netdev->kind == NETDEV_KIND_ERSPAN) { + r = sd_netlink_message_append_u8(m, IFLA_GRE_ERSPAN_VER, t->erspan_version); + if (r < 0) + return r; + + if (t->erspan_version == 1) { + r = sd_netlink_message_append_u32(m, IFLA_GRE_ERSPAN_INDEX, t->erspan_index); + if (r < 0) + return r; + + } else if (t->erspan_version == 2) { + r = sd_netlink_message_append_u8(m, IFLA_GRE_ERSPAN_DIR, t->erspan_direction); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_ERSPAN_HWID, t->erspan_hwid); + if (r < 0) + return r; + } + } + + r = tunnel_get_local_address(t, link, &local); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not find local address: %m"); + + r = sd_netlink_message_append_in_addr(m, IFLA_GRE_LOCAL, &local.in); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(m, IFLA_GRE_REMOTE, &t->remote.in); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GRE_TTL, t->ttl); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GRE_TOS, t->tos); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GRE_PMTUDISC, t->pmtudisc); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GRE_IGNORE_DF, t->ignore_df); + if (r < 0) + return r; + + /* A non-zero t->key applies to both directions; a non-zero t->ikey or t->okey then overrides it for its own direction. Each key that is set also raises GRE_KEY in the matching flags word. */ if (t->key != 0) { + ikey = okey = htobe32(t->key); + iflags |= GRE_KEY; + oflags |= GRE_KEY; + } + + if (t->ikey != 0) { + ikey = htobe32(t->ikey); + iflags |= GRE_KEY; + } + + if (t->okey != 0) { + okey = htobe32(t->okey); + oflags |= GRE_KEY; + } + + /* gre_erspan_sequence is tri-state: positive sets GRE_SEQ, zero clears it, negative (the init default) leaves the flags untouched. */ if (t->gre_erspan_sequence > 0) { + iflags |= GRE_SEQ; + oflags |= GRE_SEQ; + } else if (t->gre_erspan_sequence == 0) { + iflags &= ~GRE_SEQ; + oflags &= ~GRE_SEQ; + } + + r = sd_netlink_message_append_u32(m, IFLA_GRE_IKEY, ikey); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, IFLA_GRE_OKEY, okey); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_IFLAGS, 
iflags); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_OFLAGS, oflags); + if (r < 0) + return r; + + /* FooOverUDP encapsulation: encap type plus source and destination port, both ports converted with htobe16(). */ if (t->fou_tunnel) { + r = sd_netlink_message_append_u16(m, IFLA_GRE_ENCAP_TYPE, t->fou_encap_type); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_ENCAP_SPORT, htobe16(t->encap_src_port)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_ENCAP_DPORT, htobe16(t->fou_destination_port)); + if (r < 0) + return r; + } + + return 0; +} + +/* fill_message_create callback shared by ip6gre and ip6gretap: appends the IFLA_GRE_* attributes with IPv6 endpoints to m. Returns 0 or a negative value from the first failing step. */ static int netdev_ip6gre_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + union in_addr_union local; + uint32_t ikey = 0, okey = 0; + uint16_t iflags = 0, oflags = 0; + Tunnel *t; + int r; + + assert(netdev); + assert(m); + + /* Every kind other than ip6gre is treated as ip6gretap here. */ if (netdev->kind == NETDEV_KIND_IP6GRE) + t = IP6GRE(netdev); + else + t = IP6GRETAP(netdev); + + if (t->external) { + r = sd_netlink_message_append_flag(m, IFLA_GRE_COLLECT_METADATA); + if (r < 0) + return r; + + /* If external mode is enabled, then the following settings should not be appended. */ + return 0; + } + + if (link || t->assign_to_loopback) { + r = sd_netlink_message_append_u32(m, IFLA_GRE_LINK, link ? 
link->ifindex : LOOPBACK_IFINDEX); + if (r < 0) + return r; + } + + /* Only the IPv6 member of the address unions is sent for ip6gre and ip6gretap. */ r = tunnel_get_local_address(t, link, &local); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not find local address: %m"); + + r = sd_netlink_message_append_in6_addr(m, IFLA_GRE_LOCAL, &local.in6); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(m, IFLA_GRE_REMOTE, &t->remote.in6); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_GRE_TTL, t->ttl); + if (r < 0) + return r; + + /* The flow label is only sent when one was configured; netdev_tunnel_init() presets the invalid marker. */ if (t->ipv6_flowlabel != _NETDEV_IPV6_FLOWLABEL_INVALID) { + r = sd_netlink_message_append_u32(m, IFLA_GRE_FLOWINFO, t->ipv6_flowlabel); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u32(m, IFLA_GRE_FLAGS, t->flags); + if (r < 0) + return r; + + /* A non-zero t->key applies to both directions; a non-zero t->ikey or t->okey then overrides it for its own direction. */ if (t->key != 0) { + ikey = okey = htobe32(t->key); + iflags |= GRE_KEY; + oflags |= GRE_KEY; + } + + if (t->ikey != 0) { + ikey = htobe32(t->ikey); + iflags |= GRE_KEY; + } + + if (t->okey != 0) { + okey = htobe32(t->okey); + oflags |= GRE_KEY; + } + + r = sd_netlink_message_append_u32(m, IFLA_GRE_IKEY, ikey); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, IFLA_GRE_OKEY, okey); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_IFLAGS, iflags); + if (r < 0) + return r; + + r = sd_netlink_message_append_u16(m, IFLA_GRE_OFLAGS, oflags); + if (r < 0) + return r; + + return 0; +} + +/* fill_message_create callback shared by vti and vti6: appends the IFLA_VTI_* attributes to m. There is no external-mode branch here, unlike the other fill callbacks. Returns 0 or a negative value from the first failing step. */ static int netdev_vti_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(netdev); + assert(m); + + union in_addr_union local; + uint32_t ikey, okey; + Tunnel *t = netdev->kind == NETDEV_KIND_VTI ? VTI(netdev) : VTI6(netdev); + int r; + + if (link || t->assign_to_loopback) { + r = sd_netlink_message_append_u32(m, IFLA_VTI_LINK, link ? 
link->ifindex : LOOPBACK_IFINDEX); + if (r < 0) + return r; + } + + /* A non-zero t->key is used for both directions; otherwise t->ikey and t->okey are sent individually, zero included. */ if (t->key != 0) + ikey = okey = htobe32(t->key); + else { + ikey = htobe32(t->ikey); + okey = htobe32(t->okey); + } + + r = sd_netlink_message_append_u32(m, IFLA_VTI_IKEY, ikey); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, IFLA_VTI_OKEY, okey); + if (r < 0) + return r; + + r = tunnel_get_local_address(t, link, &local); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not find local address: %m"); + + /* The union helper selects the attribute encoding from t->family, which is why the verify step must not leave it as AF_UNSPEC for vti. */ r = netlink_message_append_in_addr_union(m, IFLA_VTI_LOCAL, t->family, &local); + if (r < 0) + return r; + + r = netlink_message_append_in_addr_union(m, IFLA_VTI_REMOTE, t->family, &t->remote); + if (r < 0) + return r; + + return 0; +} + +/* fill_message_create callback for ip6tnl: appends the IFLA_IPTUN_* attributes with IPv6 endpoints to m. Returns 0 or a negative value from the first failing step. */ static int netdev_ip6tnl_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(netdev); + assert(m); + + union in_addr_union local; + uint8_t proto; + Tunnel *t = IP6TNL(netdev); + int r; + + /* Translate the configured mode into the inner protocol number; the any mode and every unlisted value map to 0. */ switch (t->ip6tnl_mode) { + case NETDEV_IP6_TNL_MODE_IP6IP6: + proto = IPPROTO_IPV6; + break; + case NETDEV_IP6_TNL_MODE_IPIP6: + proto = IPPROTO_IPIP; + break; + case NETDEV_IP6_TNL_MODE_ANYIP6: + default: + proto = 0; + break; + } + + /* The protocol is sent before the external-mode early return, so it is present in both cases. */ r = sd_netlink_message_append_u8(m, IFLA_IPTUN_PROTO, proto); + if (r < 0) + return r; + + if (t->external) { + r = sd_netlink_message_append_flag(m, IFLA_IPTUN_COLLECT_METADATA); + if (r < 0) + return r; + + /* If external mode is enabled, then the following settings should not be appended. */ + return 0; + } + + if (link || t->assign_to_loopback) { + r = sd_netlink_message_append_u32(m, IFLA_IPTUN_LINK, link ? 
link->ifindex : LOOPBACK_IFINDEX); + if (r < 0) + return r; + } + + r = tunnel_get_local_address(t, link, &local); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not find local address: %m"); + + r = sd_netlink_message_append_in6_addr(m, IFLA_IPTUN_LOCAL, &local.in6); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(m, IFLA_IPTUN_REMOTE, &t->remote.in6); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, IFLA_IPTUN_TTL, t->ttl); + if (r < 0) + return r; + + if (t->ipv6_flowlabel != _NETDEV_IPV6_FLOWLABEL_INVALID) { + r = sd_netlink_message_append_u32(m, IFLA_IPTUN_FLOWINFO, t->ipv6_flowlabel); + if (r < 0) + return r; + } + + /* Fold the separately parsed settings into t->flags before sending. Note that this writes to the Tunnel object itself, not to a local copy. */ if (t->copy_dscp) + t->flags |= IP6_TNL_F_RCV_DSCP_COPY; + + if (t->allow_localremote >= 0) + SET_FLAG(t->flags, IP6_TNL_F_ALLOW_LOCAL_REMOTE, t->allow_localremote); + + r = sd_netlink_message_append_u32(m, IFLA_IPTUN_FLAGS, t->flags); + if (r < 0) + return r; + + /* An encapsulation limit of 0 is never sent; the parser stores 0 for the value none. */ if (t->encap_limit != 0) { + r = sd_netlink_message_append_u8(m, IFLA_IPTUN_ENCAP_LIMIT, t->encap_limit); + if (r < 0) + return r; + } + + return 0; +} + +/* is_ready_to_create callback for all tunnel kinds: an independent tunnel is always ready, any other tunnel is ready once tunnel_get_local_address() succeeds. */ static int netdev_tunnel_is_ready_to_create(NetDev *netdev, Link *link) { + assert(netdev); + + Tunnel *t = ASSERT_PTR(TUNNEL(netdev)); + + if (t->independent) + return true; + + return tunnel_get_local_address(t, link, NULL) >= 0; +} + +/* config_verify callback for all tunnel kinds: rejects inconsistent settings per kind and fills in derived values (independent, family, pmtudisc). Returns 0 when the configuration is accepted, otherwise the value returned by the error log helper. */ static int netdev_tunnel_verify(NetDev *netdev, const char *filename) { + assert(netdev); + assert(filename); + + Tunnel *t = ASSERT_PTR(TUNNEL(netdev)); + + if (netdev->kind == NETDEV_KIND_IP6TNL && + t->ip6tnl_mode == _NETDEV_IP6_TNL_MODE_INVALID) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "ip6tnl without mode configured in %s. Ignoring", filename); + + if (t->external) { + if (IN_SET(netdev->kind, NETDEV_KIND_VTI, NETDEV_KIND_VTI6)) + log_netdev_debug(netdev, "vti/vti6 tunnel do not support external mode, ignoring."); + else { + /* tunnel with external mode does not require underlying interface. 
*/ + t->independent = true; + + /* tunnel with external mode does not require any settings checked below. */ + return 0; + } + } + + /* The IPv4 kinds vti, ipip, sit and gre accept an unspecified family but never another one. */ if (IN_SET(netdev->kind, NETDEV_KIND_VTI, NETDEV_KIND_IPIP, NETDEV_KIND_SIT, NETDEV_KIND_GRE) && + !IN_SET(t->family, AF_UNSPEC, AF_INET)) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "vti/ipip/sit/gre tunnel without a local/remote IPv4 address configured in %s. Ignoring", filename); + + /* gretap and erspan are stricter: the family must be AF_INET and a remote address must be set. */ if (IN_SET(netdev->kind, NETDEV_KIND_GRETAP, NETDEV_KIND_ERSPAN) && + (t->family != AF_INET || !in_addr_is_set(t->family, &t->remote))) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "gretap/erspan tunnel without a remote IPv4 address configured in %s. Ignoring", filename); + + /* vti6 and ip6tnl require AF_INET6 outright; ip6gre additionally accepts an unspecified family. */ if ((IN_SET(netdev->kind, NETDEV_KIND_VTI6, NETDEV_KIND_IP6TNL) && t->family != AF_INET6) || + (netdev->kind == NETDEV_KIND_IP6GRE && !IN_SET(t->family, AF_UNSPEC, AF_INET6))) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "vti6/ip6tnl/ip6gre tunnel without a local/remote IPv6 address configured in %s. Ignoring", filename); + + if (netdev->kind == NETDEV_KIND_IP6GRETAP && + (t->family != AF_INET6 || !in_addr_is_set(t->family, &t->remote))) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "ip6gretap tunnel without a remote IPv6 address configured in %s. Ignoring", filename); + + /* fou_destination_port is an unsigned 16-bit field, so this condition is effectively a test for zero. */ if (t->fou_tunnel && t->fou_destination_port <= 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "FooOverUDP missing port configured in %s. Ignoring", filename); + + /* netlink_message_append_in_addr_union() is used for vti/vti6. So, t->family cannot be AF_UNSPEC. 
*/ + if (netdev->kind == NETDEV_KIND_VTI) + t->family = AF_INET; + + /* Assigning to loopback implies the tunnel does not depend on an underlying link. */ if (t->assign_to_loopback) + t->independent = true; + + /* A dynamic local address type (local_type >= 0) needs a link to resolve against, which an independent tunnel does not have. */ if (t->independent && t->local_type >= 0) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "The local address cannot be '%s' when Independent= or AssignToLoopback= is enabled, ignoring.", + strna(netdev_local_address_type_to_string(t->local_type))); + + if (t->pmtudisc > 0 && t->ignore_df) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "IgnoreDontFragment= cannot be enabled when DiscoverPathMTU= is enabled"); + /* pmtudisc < 0 is the init default meaning not configured: fall back to the opposite of ignore_df. */ if (t->pmtudisc < 0) + t->pmtudisc = !t->ignore_df; + return 0; +} + +/* Resets the local address and its type to unset. Always returns 0 so the parsers can return its result directly. */ static int unset_local(Tunnel *t) { + assert(t); + + /* Unset the previous assignment. */ + t->local = IN_ADDR_NULL; + t->local_type = _NETDEV_LOCAL_ADDRESS_TYPE_INVALID; + + /* If the remote address is not specified, also clear the address family. */ + if (!in_addr_is_set(t->family, &t->remote)) + t->family = AF_UNSPEC; + + return 0; +} + +/* Config parser for the local address of a tunnel (userdata is the Tunnel). Accepts an empty value or any (unset), the name of a dynamic address type, or a literal address whose family is detected automatically. A literal null address also unsets. The family must match one set by an earlier assignment. Always returns 0; invalid input is logged and ignored. */ int config_parse_tunnel_local_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + union in_addr_union buffer = IN_ADDR_NULL; + NetDevLocalAddressType type; + Tunnel *t = ASSERT_PTR(userdata); + int r, f; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "any")) + return unset_local(t); + + /* Try the dynamic address type names first; the type alone determines the family and buffer stays null. */ type = netdev_local_address_type_from_string(rvalue); + if (IN_SET(type, NETDEV_LOCAL_ADDRESS_IPV4LL, NETDEV_LOCAL_ADDRESS_DHCP4)) + f = AF_INET; + else if (IN_SET(type, NETDEV_LOCAL_ADDRESS_IPV6LL, NETDEV_LOCAL_ADDRESS_DHCP6, NETDEV_LOCAL_ADDRESS_SLAAC)) + f = AF_INET6; + else { + type = _NETDEV_LOCAL_ADDRESS_TYPE_INVALID; + r = in_addr_from_string_auto(rvalue, &f, &buffer); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Tunnel address \"%s\" invalid, ignoring assignment: %m", 
rvalue); + return 0; + } + + if (in_addr_is_null(f, &buffer)) + return unset_local(t); + } + + /* Local and remote share one family field, so a family fixed by an earlier assignment cannot be changed here. */ if (t->family != AF_UNSPEC && t->family != f) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Address family does not match the previous assignment, ignoring assignment: %s", rvalue); + return 0; + } + + t->family = f; + t->local = buffer; + t->local_type = type; + return 0; +} + +/* Resets the remote address to unset. Always returns 0 so the parser can return its result directly. */ static int unset_remote(Tunnel *t) { + assert(t); + + /* Unset the previous assignment. */ + t->remote = IN_ADDR_NULL; + + /* If the local address is not specified, also clear the address family. */ + if (t->local_type == _NETDEV_LOCAL_ADDRESS_TYPE_INVALID && + !in_addr_is_set(t->family, &t->local)) + t->family = AF_UNSPEC; + + return 0; +} + +/* Config parser for the remote address of a tunnel (userdata is the Tunnel). Accepts an empty value or any (unset) or a literal address whose family is detected automatically; a literal null address also unsets. The family must match one set by an earlier assignment. Always returns 0; invalid input is logged and ignored. */ int config_parse_tunnel_remote_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + union in_addr_union buffer; + Tunnel *t = ASSERT_PTR(userdata); + int r, f; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "any")) + return unset_remote(t); + + r = in_addr_from_string_auto(rvalue, &f, &buffer); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Tunnel address \"%s\" invalid, ignoring assignment: %m", rvalue); + return 0; + } + + if (in_addr_is_null(f, &buffer)) + return unset_remote(t); + + if (t->family != AF_UNSPEC && t->family != f) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Address family does not match the previous assignment, ignoring assignment: %s", rvalue); + return 0; + } + + t->family = f; + t->remote = buffer; + return 0; +} + +/* Config parser for a 32-bit tunnel key stored through data. The value may be written either in IPv4 address notation or as a plain unsigned number; the result is stored in host byte order. Always returns 0; invalid input is logged and ignored. */ int config_parse_tunnel_key( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *dest = ASSERT_PTR(data), k; + 
union in_addr_union buffer; + int r; + + assert(filename); + assert(rvalue); + + /* First try IPv4 address notation; only if that fails fall back to a plain unsigned 32-bit number. */ r = in_addr_from_string(AF_INET, rvalue, &buffer); + if (r < 0) { + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse tunnel key ignoring assignment: %s", rvalue); + return 0; + } + } else + k = be32toh(buffer.in.s_addr); + + *dest = k; + return 0; +} + +/* Config parser for the IPv6 flow label (userdata is the Tunnel). The value inherit stores the full flow label mask and sets IP6_TNL_F_USE_ORIG_FLOWLABEL; otherwise a number in the range 0..0xFFFFF is stored converted with htobe32() and the flag is cleared. A result <= 0 from the bounded parser is returned unchanged. */ int config_parse_ipv6_flowlabel( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Tunnel *t = ASSERT_PTR(userdata); + uint32_t k; + int r; + + assert(filename); + assert(rvalue); + + if (streq(rvalue, "inherit")) { + t->ipv6_flowlabel = IP6_FLOWINFO_FLOWLABEL; + t->flags |= IP6_TNL_F_USE_ORIG_FLOWLABEL; + return 0; + } + + r = config_parse_uint32_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, 0xFFFFF, true, + &k); + if (r <= 0) + return r; + t->ipv6_flowlabel = htobe32(k) & IP6_FLOWINFO_FLOWLABEL; + t->flags &= ~IP6_TNL_F_USE_ORIG_FLOWLABEL; + + return 0; +} + +/* Config parser for the encapsulation limit (userdata is the Tunnel). The value none stores 0 and sets IP6_TNL_F_IGN_ENCAP_LIMIT; otherwise a number in the range 0..UINT8_MAX is stored in t->encap_limit and the flag is cleared. A result <= 0 from the bounded parser is returned unchanged. */ int config_parse_encap_limit( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(rvalue); + + Tunnel *t = ASSERT_PTR(userdata); + int r; + + if (streq(rvalue, "none")) { + t->encap_limit = 0; + t->flags |= IP6_TNL_F_IGN_ENCAP_LIMIT; + return 0; + } + + r = config_parse_uint8_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, UINT8_MAX, true, + &t->encap_limit); + if (r <= 0) + return r; + t->flags &= ~IP6_TNL_F_IGN_ENCAP_LIMIT; + + return 0; +} + +/* Config parser for the 6rd prefix (userdata is the Tunnel). Requires an IPv6 prefix with a non-zero prefix length. Always returns 0; invalid input is logged and ignored. */ int config_parse_6rd_prefix( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char 
*rvalue, + void *data, + void *userdata) { + + /* userdata is dereferenced below, so check it the same way the sibling parsers do. */ Tunnel *t = ASSERT_PTR(userdata); + union in_addr_union p; + uint8_t l; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = in_addr_prefix_from_string(rvalue, AF_INET6, &p, &l); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse 6rd prefix \"%s\", ignoring: %m", rvalue); + return 0; + } + /* The fill callback uses sixrd_prefixlen > 0 as the marker for a configured prefix, so a zero length is rejected. */ if (l == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "6rd prefix length of \"%s\" must be greater than zero, ignoring", rvalue); + return 0; + } + + t->sixrd_prefix = p.in6; + t->sixrd_prefixlen = l; + + return 0; +} + +/* Config parser for the ERSPAN version stored through data. An empty value resets it to 1; otherwise a number in the range 0..2 is accepted by the bounded parser, whose result is returned. */ int config_parse_erspan_version( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + uint8_t *v = ASSERT_PTR(data); + + if (isempty(rvalue)) { + *v = 1; /* defaults to 1 */ + return 0; + } + + return config_parse_uint8_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, 2, true, + v); +} + +/* Config parser for the ERSPAN index stored through data. An empty value resets it to 0; otherwise a number in the range 0..0xFFFFF is accepted by the bounded parser, whose result is returned. */ int config_parse_erspan_index( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + uint32_t *v = ASSERT_PTR(data); + + if (isempty(rvalue)) { + *v = 0; /* defaults to 0 */ + return 0; + } + + return config_parse_uint32_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, 0x100000 - 1, true, + v); +} + +/* Config parser for the ERSPAN direction stored through data: an empty value or ingress stores 0, egress stores 1. Any other value is logged and leaves the stored value unchanged. Always returns 0. */ int config_parse_erspan_direction( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + uint8_t *v = 
ASSERT_PTR(data); + + if (isempty(rvalue) || streq(rvalue, "ingress")) + *v = 0; /* defaults to ingress */ + else if (streq(rvalue, "egress")) + *v = 1; + else + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid erspan direction \"%s\", which must be \"ingress\" or \"egress\", ignoring.", rvalue); + + return 0; +} + +/* Config parser for the ERSPAN hardware ID stored through data. An empty value resets it to 0; otherwise a number in the range 0..63 is accepted by the bounded parser, whose result is returned. */ int config_parse_erspan_hwid( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + + uint16_t *v = ASSERT_PTR(data); + + if (isempty(rvalue)) { + *v = 0; /* defaults to 0 */ + return 0; + } + + return config_parse_uint16_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, 63, true, + v); +} + +/* init callback for all tunnel kinds: presets the fields whose unset state is not zero. The tri-state ints (pmtudisc, isatap, gre_erspan_sequence, allow_localremote) start at -1, the enum-like fields at their invalid marker, and the three IPv6 kinds listed below get DEFAULT_IPV6_TTL. */ static void netdev_tunnel_init(NetDev *netdev) { + Tunnel *t = ASSERT_PTR(TUNNEL(netdev)); + + t->local_type = _NETDEV_LOCAL_ADDRESS_TYPE_INVALID; + t->pmtudisc = -1; + t->fou_encap_type = NETDEV_FOO_OVER_UDP_ENCAP_DIRECT; + t->isatap = -1; + t->gre_erspan_sequence = -1; + t->encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT; + t->ip6tnl_mode = _NETDEV_IP6_TNL_MODE_INVALID; + t->ipv6_flowlabel = _NETDEV_IPV6_FLOWLABEL_INVALID; + t->allow_localremote = -1; + t->erspan_version = 1; + + if (IN_SET(netdev->kind, NETDEV_KIND_IP6GRE, NETDEV_KIND_IP6GRETAP, NETDEV_KIND_IP6TNL)) + t->ttl = DEFAULT_IPV6_TTL; +} + +/* One vtable per tunnel kind follows. All of them use the Tunnel object, netdev_tunnel_init(), the Tunnel section, netdev_tunnel_is_ready_to_create() and netdev_tunnel_verify(); they differ in the fill callback, the iftype and, for the Ethernet-type kinds, generate_mac. ipip and sit share one fill callback. */ const NetDevVTable ipip_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_ipip_sit_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_TUNNEL, +}; + +const NetDevVTable sit_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + 
.fill_message_create = netdev_ipip_sit_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_SIT, +}; + +/* vti and vti6 share netdev_vti_fill_message_create() and differ only in iftype. */ const NetDevVTable vti_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_vti_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_TUNNEL, +}; + +const NetDevVTable vti6_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_vti_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_TUNNEL6, +}; + +/* gre, gretap and erspan share netdev_gre_erspan_fill_message_create(). */ const NetDevVTable gre_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_gre_erspan_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_IPGRE, +}; + +/* gretap is declared with iftype ARPHRD_ETHER and sets generate_mac. */ const NetDevVTable gretap_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_gre_erspan_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; + +/* ip6gre and ip6gretap share netdev_ip6gre_fill_message_create(). */ const NetDevVTable ip6gre_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_ip6gre_fill_message_create, + .create_type 
= NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_IP6GRE, +}; + +/* ip6gretap is declared with iftype ARPHRD_ETHER and sets generate_mac. */ const NetDevVTable ip6gretap_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_ip6gre_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; + +/* ip6tnl has its own fill callback and shares iftype ARPHRD_TUNNEL6 with vti6. */ const NetDevVTable ip6tnl_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_ip6tnl_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_TUNNEL6, +}; + +/* erspan reuses the gre fill callback, is declared with iftype ARPHRD_ETHER and sets generate_mac. */ const NetDevVTable erspan_vtable = { + .object_size = sizeof(Tunnel), + .init = netdev_tunnel_init, + .sections = NETDEV_COMMON_SECTIONS "Tunnel\0", + .fill_message_create = netdev_gre_erspan_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .is_ready_to_create = netdev_tunnel_is_ready_to_create, + .config_verify = netdev_tunnel_verify, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/tunnel.h b/src/network/netdev/tunnel.h new file mode 100644 index 0000000..713f2fb --- /dev/null +++ b/src/network/netdev/tunnel.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "in-addr-util.h" + +#include "conf-parser.h" +#include "fou-tunnel.h" +#include "netdev-util.h" +#include "netdev.h" +#include "networkd-link.h" + +/* Inner protocol selection for ip6tnl. The values index ip6tnl_mode_table in tunnel.c, so _NETDEV_IP6_TNL_MODE_MAX must stay directly after the last real mode. */ typedef enum Ip6TnlMode { + NETDEV_IP6_TNL_MODE_IP6IP6, + NETDEV_IP6_TNL_MODE_IPIP6, + NETDEV_IP6_TNL_MODE_ANYIP6, + _NETDEV_IP6_TNL_MODE_MAX, + _NETDEV_IP6_TNL_MODE_INVALID = -EINVAL, +} Ip6TnlMode; + +/* Marker values for Tunnel.ipv6_flowlabel. The inherit marker is one above the largest 20-bit flow label; the invalid marker means no flow label was configured. */ typedef enum IPv6FlowLabel { + 
NETDEV_IPV6_FLOWLABEL_INHERIT = 0xFFFFF + 1, + _NETDEV_IPV6_FLOWLABEL_MAX, + _NETDEV_IPV6_FLOWLABEL_INVALID = -EINVAL, +} IPv6FlowLabel; + +/* Configuration object shared by every tunnel kind implemented in tunnel.c. Which fields are used depends on the kind; see the fill_message_create callbacks. */ typedef struct Tunnel { + NetDev meta; + + uint8_t encap_limit; + + /* family is shared by the local and the remote address. allow_localremote, gre_erspan_sequence and isatap are tri-state: netdev_tunnel_init() presets -1, meaning not configured. */ int family; + int ipv6_flowlabel; + int allow_localremote; + int gre_erspan_sequence; + int isatap; + + unsigned ttl; + unsigned tos; + unsigned flags; + + /* key applies to both directions when non-zero; ikey and okey are the per-direction values. */ uint32_t key; + uint32_t ikey; + uint32_t okey; + + uint8_t erspan_version; + uint32_t erspan_index; /* version 1 */ + uint8_t erspan_direction; /* version 2 */ + uint16_t erspan_hwid; /* version 2 */ + + /* local_type < 0 means the literal address in local is used; otherwise the address is looked up on the link by type. */ NetDevLocalAddressType local_type; + union in_addr_union local; + union in_addr_union remote; + + Ip6TnlMode ip6tnl_mode; + FooOverUDPEncapType fou_encap_type; + + /* pmtudisc is preset to -1 and resolved to a boolean value in netdev_tunnel_verify(). */ int pmtudisc; + bool ignore_df; + bool copy_dscp; + bool independent; + bool fou_tunnel; + bool assign_to_loopback; + bool external; /* a.k.a collect metadata mode */ + + uint16_t encap_src_port; + uint16_t fou_destination_port; + + /* sixrd_prefixlen == 0 means no 6rd prefix is configured. */ struct in6_addr sixrd_prefix; + uint8_t sixrd_prefixlen; +} Tunnel; + +int dhcp4_pd_create_6rd_tunnel_name(Link *link, char **ret); +int dhcp4_pd_create_6rd_tunnel(Link *link, link_netlink_message_handler_t callback); + +/* One cast helper per tunnel kind; all of them map to the same Tunnel type. */ DEFINE_NETDEV_CAST(IPIP, Tunnel); +DEFINE_NETDEV_CAST(GRE, Tunnel); +DEFINE_NETDEV_CAST(GRETAP, Tunnel); +DEFINE_NETDEV_CAST(IP6GRE, Tunnel); +DEFINE_NETDEV_CAST(IP6GRETAP, Tunnel); +DEFINE_NETDEV_CAST(SIT, Tunnel); +DEFINE_NETDEV_CAST(VTI, Tunnel); +DEFINE_NETDEV_CAST(VTI6, Tunnel); +DEFINE_NETDEV_CAST(IP6TNL, Tunnel); +DEFINE_NETDEV_CAST(ERSPAN, Tunnel); + +/* Kind-independent cast: dispatches on netdev->kind to the matching cast helper and returns NULL for every kind that is not one of the ten tunnel kinds. */ static inline Tunnel* TUNNEL(NetDev *netdev) { + assert(netdev); + + switch (netdev->kind) { + case NETDEV_KIND_IPIP: + return IPIP(netdev); + case NETDEV_KIND_SIT: + return SIT(netdev); + case NETDEV_KIND_GRE: + return GRE(netdev); + case NETDEV_KIND_GRETAP: + return GRETAP(netdev); + case NETDEV_KIND_IP6GRE: + return IP6GRE(netdev); + case NETDEV_KIND_IP6GRETAP: + return IP6GRETAP(netdev); + case NETDEV_KIND_VTI: + return VTI(netdev); + case 
NETDEV_KIND_VTI6: + return VTI6(netdev); + case NETDEV_KIND_IP6TNL: + return IP6TNL(netdev); + case NETDEV_KIND_ERSPAN: + return ERSPAN(netdev); + default: + return NULL; + } +} + +extern const NetDevVTable ipip_vtable; +extern const NetDevVTable sit_vtable; +extern const NetDevVTable vti_vtable; +extern const NetDevVTable vti6_vtable; +extern const NetDevVTable gre_vtable; +extern const NetDevVTable gretap_vtable; +extern const NetDevVTable ip6gre_vtable; +extern const NetDevVTable ip6gretap_vtable; +extern const NetDevVTable ip6tnl_vtable; +extern const NetDevVTable erspan_vtable; + +const char *ip6tnl_mode_to_string(Ip6TnlMode d) _const_; +Ip6TnlMode ip6tnl_mode_from_string(const char *d) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_ip6tnl_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_tunnel_local_address); +CONFIG_PARSER_PROTOTYPE(config_parse_tunnel_remote_address); +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_flowlabel); +CONFIG_PARSER_PROTOTYPE(config_parse_encap_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_tunnel_key); +CONFIG_PARSER_PROTOTYPE(config_parse_6rd_prefix); +CONFIG_PARSER_PROTOTYPE(config_parse_erspan_version); +CONFIG_PARSER_PROTOTYPE(config_parse_erspan_index); +CONFIG_PARSER_PROTOTYPE(config_parse_erspan_direction); +CONFIG_PARSER_PROTOTYPE(config_parse_erspan_hwid); diff --git a/src/network/netdev/tuntap.c b/src/network/netdev/tuntap.c new file mode 100644 index 0000000..9e909d1 --- /dev/null +++ b/src/network/netdev/tuntap.c @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "daemon-util.h" +#include "fd-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "socket-util.h" +#include "tuntap.h" +#include "user-util.h" + +#define TUN_DEV "/dev/net/tun" + +static TunTap* TUNTAP(NetDev *netdev) { + assert(netdev); + + switch (netdev->kind) { + case NETDEV_KIND_TAP: + return TAP(netdev); 
+ case NETDEV_KIND_TUN: + return TUN(netdev); + default: + return NULL; + } +} + +static void *close_fd_ptr(void *p) { + safe_close(PTR_TO_FD(p)); + return NULL; +} + +DEFINE_PRIVATE_HASH_OPS_FULL(named_fd_hash_ops, char, string_hash_func, string_compare_func, free, void, close_fd_ptr); + +int manager_add_tuntap_fd(Manager *m, int fd, const char *name) { + _cleanup_free_ char *tuntap_name = NULL; + const char *p; + int r; + + assert(m); + assert(fd >= 0); + assert(name); + + p = startswith(name, "tuntap-"); + if (!p) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Received unknown fd (%s).", name); + + if (!ifname_valid(p)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Received tuntap fd with invalid name (%s).", p); + + tuntap_name = strdup(p); + if (!tuntap_name) + return log_oom_debug(); + + r = hashmap_ensure_put(&m->tuntap_fds_by_name, &named_fd_hash_ops, tuntap_name, FD_TO_PTR(fd)); + if (r < 0) + return log_debug_errno(r, "Failed to store tuntap fd: %m"); + + TAKE_PTR(tuntap_name); + return 0; +} + +void manager_clear_unmanaged_tuntap_fds(Manager *m) { + char *name; + void *p; + + assert(m); + + while ((p = hashmap_steal_first_key_and_value(m->tuntap_fds_by_name, (void**) &name))) { + close_and_notify_warn(PTR_TO_FD(p), name); + name = mfree(name); + } +} + +static int tuntap_take_fd(NetDev *netdev) { + _cleanup_free_ char *name = NULL; + void *p; + int r; + + assert(netdev); + assert(netdev->manager); + + r = link_get_by_name(netdev->manager, netdev->ifname, NULL); + if (r < 0) + return r; + + p = hashmap_remove2(netdev->manager->tuntap_fds_by_name, netdev->ifname, (void**) &name); + if (!p) + return -ENOENT; + + log_netdev_debug(netdev, "Found file descriptor in fd store."); + return PTR_TO_FD(p); +} + +static int netdev_create_tuntap(NetDev *netdev) { + _cleanup_close_ int fd = -EBADF; + struct ifreq ifr = {}; + TunTap *t; + int r; + + assert(netdev); + t = TUNTAP(netdev); + assert(t); + + fd = TAKE_FD(t->fd); + if (fd < 0) + fd = 
tuntap_take_fd(netdev); + if (fd < 0) + fd = open(TUN_DEV, O_RDWR|O_CLOEXEC); + if (fd < 0) + return log_netdev_error_errno(netdev, errno, "Failed to open " TUN_DEV ": %m"); + + if (netdev->kind == NETDEV_KIND_TAP) + ifr.ifr_flags |= IFF_TAP; + else + ifr.ifr_flags |= IFF_TUN; + + if (!t->packet_info) + ifr.ifr_flags |= IFF_NO_PI; + + if (t->multi_queue) + ifr.ifr_flags |= IFF_MULTI_QUEUE; + + if (t->vnet_hdr) + ifr.ifr_flags |= IFF_VNET_HDR; + + strncpy(ifr.ifr_name, netdev->ifname, IFNAMSIZ-1); + + if (ioctl(fd, TUNSETIFF, &ifr) < 0) + return log_netdev_error_errno(netdev, errno, "TUNSETIFF failed: %m"); + + if (t->multi_queue) { + /* If we don't detach the queue, the kernel will send packets to our queue and they + * will be dropped because we never read them, which is especially important in case + * of KeepCarrier option which persists open FD. So detach our queue right after + * device create/attach to make kernel not send the packets to it. The option is + * available for multi-queue devices only. + * + * See https://github.com/systemd/systemd/pull/30504 for details. 
*/ + struct ifreq detach_request = { .ifr_flags = IFF_DETACH_QUEUE }; + if (ioctl(fd, TUNSETQUEUE, &detach_request) < 0) + return log_netdev_error_errno(netdev, errno, "TUNSETQUEUE failed: %m"); + } + + if (t->user_name) { + const char *user = t->user_name; + uid_t uid; + + r = get_user_creds(&user, &uid, NULL, NULL, NULL, USER_CREDS_ALLOW_MISSING); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Cannot resolve user name %s: %m", t->user_name); + + if (ioctl(fd, TUNSETOWNER, uid) < 0) + return log_netdev_error_errno(netdev, errno, "TUNSETOWNER failed: %m"); + } + + if (t->group_name) { + const char *group = t->group_name; + gid_t gid; + + r = get_group_creds(&group, &gid, USER_CREDS_ALLOW_MISSING); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Cannot resolve group name %s: %m", t->group_name); + + if (ioctl(fd, TUNSETGROUP, gid) < 0) + return log_netdev_error_errno(netdev, errno, "TUNSETGROUP failed: %m"); + + } + + if (ioctl(fd, TUNSETPERSIST, 1) < 0) + return log_netdev_error_errno(netdev, errno, "TUNSETPERSIST failed: %m"); + + if (t->keep_fd) { + t->fd = TAKE_FD(fd); + (void) notify_push_fdf(t->fd, "tuntap-%s", netdev->ifname); + } + + return 0; +} + +static void tuntap_init(NetDev *netdev) { + TunTap *t; + + assert(netdev); + t = TUNTAP(netdev); + assert(t); + + t->fd = -EBADF; +} + +static void tuntap_drop(NetDev *netdev) { + TunTap *t; + + assert(netdev); + t = TUNTAP(netdev); + assert(t); + + t->fd = close_and_notify_warn(t->fd, netdev->ifname); +} + +static void tuntap_done(NetDev *netdev) { + TunTap *t; + + assert(netdev); + t = TUNTAP(netdev); + assert(t); + + t->fd = safe_close(t->fd); + t->user_name = mfree(t->user_name); + t->group_name = mfree(t->group_name); +} + +static int tuntap_verify(NetDev *netdev, const char *filename) { + assert(netdev); + + if (netdev->mtu != 0) + log_netdev_warning(netdev, + "MTUBytes= configured for %s device in %s will be ignored.\n" + "Please set it in the corresponding .network file.", + 
/* State shared by TUN and TAP netdevs; both kinds cast to this struct via TUN()/TAP(). */
struct TunTap {
        NetDev meta;

        int fd;            /* set to -EBADF by tuntap_init(); holds the device fd while keep_fd is in effect */
        char *user_name;   /* if set, resolved to a UID and applied with TUNSETOWNER; freed in tuntap_done() */
        char *group_name;  /* if set, resolved to a GID and applied with TUNSETGROUP; freed in tuntap_done() */
        bool multi_queue;  /* requests IFF_MULTI_QUEUE; the queue is detached right after TUNSETIFF */
        bool packet_info;  /* when false, IFF_NO_PI is requested */
        bool vnet_hdr;     /* requests IFF_VNET_HDR */
        bool keep_fd;      /* keep the fd open after creation and pass it to notify_push_fdf() */
};
sizeof(VCan), + .sections = NETDEV_COMMON_SECTIONS, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_CAN, +}; diff --git a/src/network/netdev/vcan.h b/src/network/netdev/vcan.h new file mode 100644 index 0000000..843984f --- /dev/null +++ b/src/network/netdev/vcan.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct VCan VCan; + +#include +#include + +#include "netdev.h" + +struct VCan { + NetDev meta; +}; + +DEFINE_NETDEV_CAST(VCAN, VCan); + +extern const NetDevVTable vcan_vtable; diff --git a/src/network/netdev/veth.c b/src/network/netdev/veth.c new file mode 100644 index 0000000..e0f5b4e --- /dev/null +++ b/src/network/netdev/veth.c @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "netlink-util.h" +#include "veth.h" + +static int netdev_veth_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(!link); + assert(m); + + struct hw_addr_data hw_addr; + Veth *v = VETH(netdev); + int r; + + r = sd_netlink_message_open_container(m, VETH_INFO_PEER); + if (r < 0) + return r; + + if (v->ifname_peer) { + r = sd_netlink_message_append_string(m, IFLA_IFNAME, v->ifname_peer); + if (r < 0) + return r; + } + + r = netdev_generate_hw_addr(netdev, NULL, v->ifname_peer, &v->hw_addr_peer, &hw_addr); + if (r < 0) + return r; + + if (hw_addr.length > 0) { + log_netdev_debug(netdev, "Using MAC address for peer: %s", HW_ADDR_TO_STR(&hw_addr)); + r = netlink_message_append_hw_addr(m, IFLA_ADDRESS, &hw_addr); + if (r < 0) + return r; + } + + if (netdev->mtu != 0) { + r = sd_netlink_message_append_u32(m, IFLA_MTU, netdev->mtu); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +static int netdev_veth_verify(NetDev *netdev, const char *filename) { + assert(filename); + + Veth *v = VETH(netdev); + + if (!v->ifname_peer) + return 
log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "Veth NetDev without peer name configured in %s. Ignoring", + filename); + + return 0; +} + +static void veth_done(NetDev *netdev) { + Veth *v = VETH(netdev); + + free(v->ifname_peer); +} + +const NetDevVTable veth_vtable = { + .object_size = sizeof(Veth), + .sections = NETDEV_COMMON_SECTIONS "Peer\0", + .done = veth_done, + .fill_message_create = netdev_veth_fill_message_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .config_verify = netdev_veth_verify, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/veth.h b/src/network/netdev/veth.h new file mode 100644 index 0000000..e0d6fd4 --- /dev/null +++ b/src/network/netdev/veth.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Veth Veth; + +#include "netdev.h" + +struct Veth { + NetDev meta; + + char *ifname_peer; + struct hw_addr_data hw_addr_peer; +}; + +DEFINE_NETDEV_CAST(VETH, Veth); +extern const NetDevVTable veth_vtable; diff --git a/src/network/netdev/vlan.c b/src/network/netdev/vlan.c new file mode 100644 index 0000000..2390206 --- /dev/null +++ b/src/network/netdev/vlan.c @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "parse-util.h" +#include "vlan-util.h" +#include "vlan.h" + +static int netdev_vlan_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *req) { + assert(link); + assert(req); + + struct ifla_vlan_flags flags = {}; + VLan *v = VLAN(netdev); + int r; + + r = sd_netlink_message_append_u16(req, IFLA_VLAN_ID, v->id); + if (r < 0) + return r; + + if (v->protocol >= 0) { + r = sd_netlink_message_append_u16(req, IFLA_VLAN_PROTOCOL, htobe16(v->protocol)); + if (r < 0) + return r; + } + + if (v->gvrp != -1) { + flags.mask |= VLAN_FLAG_GVRP; + SET_FLAG(flags.flags, VLAN_FLAG_GVRP, v->gvrp); + } + + if (v->mvrp != -1) { + flags.mask |= VLAN_FLAG_MVRP; + 
SET_FLAG(flags.flags, VLAN_FLAG_MVRP, v->mvrp); + } + + if (v->reorder_hdr != -1) { + flags.mask |= VLAN_FLAG_REORDER_HDR; + SET_FLAG(flags.flags, VLAN_FLAG_REORDER_HDR, v->reorder_hdr); + } + + if (v->loose_binding != -1) { + flags.mask |= VLAN_FLAG_LOOSE_BINDING; + SET_FLAG(flags.flags, VLAN_FLAG_LOOSE_BINDING, v->loose_binding); + } + + r = sd_netlink_message_append_data(req, IFLA_VLAN_FLAGS, &flags, sizeof(struct ifla_vlan_flags)); + if (r < 0) + return r; + + if (!set_isempty(v->egress_qos_maps)) { + struct ifla_vlan_qos_mapping *m; + + r = sd_netlink_message_open_container(req, IFLA_VLAN_EGRESS_QOS); + if (r < 0) + return r; + + SET_FOREACH(m, v->egress_qos_maps) { + r = sd_netlink_message_append_data(req, IFLA_VLAN_QOS_MAPPING, m, sizeof(struct ifla_vlan_qos_mapping)); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + } + + if (!set_isempty(v->ingress_qos_maps)) { + struct ifla_vlan_qos_mapping *m; + + r = sd_netlink_message_open_container(req, IFLA_VLAN_INGRESS_QOS); + if (r < 0) + return r; + + SET_FOREACH(m, v->ingress_qos_maps) { + r = sd_netlink_message_append_data(req, IFLA_VLAN_QOS_MAPPING, m, sizeof(struct ifla_vlan_qos_mapping)); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + } + + return 0; +} + +static void vlan_qos_maps_hash_func(const struct ifla_vlan_qos_mapping *x, struct siphash *state) { + siphash24_compress(&x->from, sizeof(x->from), state); + siphash24_compress(&x->to, sizeof(x->to), state); +} + +static int vlan_qos_maps_compare_func(const struct ifla_vlan_qos_mapping *a, const struct ifla_vlan_qos_mapping *b) { + int r; + + r = CMP(a->from, b->from); + if (r != 0) + return r; + + return CMP(a->to, b->to); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + vlan_qos_maps_hash_ops, + struct ifla_vlan_qos_mapping, + vlan_qos_maps_hash_func, + vlan_qos_maps_compare_func, + free); + +int config_parse_vlan_qos_maps( + const char 
*unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *s = set_free(*s); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ struct ifla_vlan_qos_mapping *m = NULL; + _cleanup_free_ char *w = NULL; + unsigned from, to; + + r = extract_first_word(&p, &w, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue); + return 0; + } + if (r == 0) + return 0; + + r = parse_range(w, &from, &to); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, w); + continue; + } + + m = new(struct ifla_vlan_qos_mapping, 1); + if (!m) + return log_oom(); + + *m = (struct ifla_vlan_qos_mapping) { + .from = from, + .to = to, + }; + + r = set_ensure_consume(s, &vlan_qos_maps_hash_ops, TAKE_PTR(m)); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to store %s, ignoring: %s", lvalue, w); + continue; + } + } +} + +static int netdev_vlan_verify(NetDev *netdev, const char *filename) { + assert(filename); + + VLan *v = VLAN(netdev); + + if (v->id == VLANID_INVALID) { + log_netdev_warning(netdev, "VLAN without valid Id (%"PRIu16") configured in %s.", v->id, filename); + return -EINVAL; + } + + return 0; +} + +static void vlan_done(NetDev *netdev) { + VLan *v = VLAN(netdev); + + set_free(v->egress_qos_maps); + set_free(v->ingress_qos_maps); +} + +static void vlan_init(NetDev *netdev) { + VLan *v = VLAN(netdev); + + v->id = VLANID_INVALID; + v->protocol = -1; + v->gvrp = -1; + v->mvrp = -1; + v->loose_binding = -1; + v->reorder_hdr = -1; +} + +const NetDevVTable vlan_vtable = { + .object_size = 
sizeof(VLan), + .init = vlan_init, + .sections = NETDEV_COMMON_SECTIONS "VLAN\0", + .fill_message_create = netdev_vlan_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .config_verify = netdev_vlan_verify, + .done = vlan_done, + .iftype = ARPHRD_ETHER, +}; diff --git a/src/network/netdev/vlan.h b/src/network/netdev/vlan.h new file mode 100644 index 0000000..1e5e590 --- /dev/null +++ b/src/network/netdev/vlan.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct VLan VLan; + +#include "netdev.h" +#include "set.h" + +struct VLan { + NetDev meta; + + uint16_t id; + int protocol; + + int gvrp; + int mvrp; + int loose_binding; + int reorder_hdr; + + Set *egress_qos_maps; + Set *ingress_qos_maps; +}; + +DEFINE_NETDEV_CAST(VLAN, VLan); +extern const NetDevVTable vlan_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_vlan_qos_maps); diff --git a/src/network/netdev/vrf.c b/src/network/netdev/vrf.c new file mode 100644 index 0000000..b75ec2b --- /dev/null +++ b/src/network/netdev/vrf.c @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "vrf.h" + +static int netdev_vrf_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(!link); + assert(m); + + Vrf *v = VRF(netdev); + int r; + + r = sd_netlink_message_append_u32(m, IFLA_VRF_TABLE, v->table); + if (r < 0) + return r; + + return 0; +} + +const NetDevVTable vrf_vtable = { + .object_size = sizeof(Vrf), + .sections = NETDEV_COMMON_SECTIONS "VRF\0", + .fill_message_create = netdev_vrf_fill_message_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .iftype = ARPHRD_ETHER, + .generate_mac = true, +}; diff --git a/src/network/netdev/vrf.h b/src/network/netdev/vrf.h new file mode 100644 index 0000000..87977e2 --- /dev/null +++ b/src/network/netdev/vrf.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Vrf Vrf; + +#include "netdev.h" + 
+struct Vrf { + NetDev meta; + + uint32_t table; +}; + +DEFINE_NETDEV_CAST(VRF, Vrf); +extern const NetDevVTable vrf_vtable; diff --git a/src/network/netdev/vxcan.c b/src/network/netdev/vxcan.c new file mode 100644 index 0000000..c0343f4 --- /dev/null +++ b/src/network/netdev/vxcan.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "vxcan.h" + +static int netdev_vxcan_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + assert(!link); + assert(m); + + VxCan *v = VXCAN(netdev); + int r; + + r = sd_netlink_message_open_container(m, VXCAN_INFO_PEER); + if (r < 0) + return r; + + if (v->ifname_peer) { + r = sd_netlink_message_append_string(m, IFLA_IFNAME, v->ifname_peer); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +static int netdev_vxcan_verify(NetDev *netdev, const char *filename) { + assert(filename); + + VxCan *v = VXCAN(netdev); + + if (!v->ifname_peer) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "VxCan NetDev without peer name configured in %s. 
Ignoring", filename); + + return 0; +} + +static void vxcan_done(NetDev *netdev) { + VxCan *v = VXCAN(netdev); + + free(v->ifname_peer); +} + +const NetDevVTable vxcan_vtable = { + .object_size = sizeof(VxCan), + .sections = NETDEV_COMMON_SECTIONS "VXCAN\0", + .done = vxcan_done, + .fill_message_create = netdev_vxcan_fill_message_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .config_verify = netdev_vxcan_verify, + .iftype = ARPHRD_CAN, +}; diff --git a/src/network/netdev/vxcan.h b/src/network/netdev/vxcan.h new file mode 100644 index 0000000..47be3f0 --- /dev/null +++ b/src/network/netdev/vxcan.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct VxCan VxCan; + +#include "netdev.h" + +struct VxCan { + NetDev meta; + + char *ifname_peer; +}; + +DEFINE_NETDEV_CAST(VXCAN, VxCan); + +extern const NetDevVTable vxcan_vtable; diff --git a/src/network/netdev/vxlan.c b/src/network/netdev/vxlan.c new file mode 100644 index 0000000..b11fdbb --- /dev/null +++ b/src/network/netdev/vxlan.c @@ -0,0 +1,435 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "conf-parser.h" +#include "alloc-util.h" +#include "extract-word.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "parse-util.h" +#include "vxlan.h" + +static const char* const df_table[_NETDEV_VXLAN_DF_MAX] = { + [NETDEV_VXLAN_DF_NO] = "no", + [NETDEV_VXLAN_DF_YES] = "yes", + [NETDEV_VXLAN_DF_INHERIT] = "inherit", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(df, VxLanDF, NETDEV_VXLAN_DF_YES); +DEFINE_CONFIG_PARSE_ENUM(config_parse_df, df, VxLanDF, "Failed to parse VXLAN IPDoNotFragment= setting"); + +static int vxlan_get_local_address(VxLan *v, Link *link, int *ret_family, union in_addr_union *ret_address) { + assert(v); + + if (v->local_type < 0) { + if (ret_family) + *ret_family = v->local_family; + if (ret_address) + *ret_address = v->local; + return 0; + } + + return 
/* Fills the netlink creation message 'm' with the IFLA_VXLAN_* attributes for this VXLAN netdev.
 *
 * 'link' is the underlying link and may be NULL, in which case IFLA_VXLAN_LINK is sent as 0.
 * Returns 0 on success or the negative error of the first step that fails. */
static int netdev_vxlan_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) {
        assert(m);

        union in_addr_union local;
        int local_family, r;
        VxLan *v = VXLAN(netdev);

        /* vxlan_init() presets vni to VXLAN_VID_MAX + 1, so an unset VNI is simply not sent. */
        if (v->vni <= VXLAN_VID_MAX) {
                r = sd_netlink_message_append_u32(m, IFLA_VXLAN_ID, v->vni);
                if (r < 0)
                        return r;
        }

        /* Group= and Remote= both travel in the GROUP/GROUP6 attribute; Group= takes precedence
         * here (netdev_vxlan_verify() refuses configurations that set both). */
        if (in_addr_is_set(v->group_family, &v->group)) {
                if (v->group_family == AF_INET)
                        r = sd_netlink_message_append_in_addr(m, IFLA_VXLAN_GROUP, &v->group.in);
                else
                        r = sd_netlink_message_append_in6_addr(m, IFLA_VXLAN_GROUP6, &v->group.in6);
                if (r < 0)
                        return r;
        } else if (in_addr_is_set(v->remote_family, &v->remote)) {
                if (v->remote_family == AF_INET)
                        r = sd_netlink_message_append_in_addr(m, IFLA_VXLAN_GROUP, &v->remote.in);
                else
                        r = sd_netlink_message_append_in6_addr(m, IFLA_VXLAN_GROUP6, &v->remote.in6);
                if (r < 0)
                        return r;
        }

        /* The local address is either the statically configured one or one looked up on 'link',
         * depending on v->local_type (see vxlan_get_local_address()). */
        r = vxlan_get_local_address(v, link, &local_family, &local);
        if (r < 0)
                return r;

        if (in_addr_is_set(local_family, &local)) {
                if (local_family == AF_INET)
                        r = sd_netlink_message_append_in_addr(m, IFLA_VXLAN_LOCAL, &local.in);
                else
                        r = sd_netlink_message_append_in6_addr(m, IFLA_VXLAN_LOCAL6, &local.in6);
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_append_u32(m, IFLA_VXLAN_LINK, link ? link->ifindex : 0);
        if (r < 0)
                return r;

        /* TTL: either the "inherit" flag or an explicit value, never both. */
        if (v->inherit) {
                r = sd_netlink_message_append_flag(m, IFLA_VXLAN_TTL_INHERIT);
                if (r < 0)
                        return r;
        } else {
                r = sd_netlink_message_append_u8(m, IFLA_VXLAN_TTL, v->ttl);
                if (r < 0)
                        return r;
        }

        if (v->tos != 0) {
                r = sd_netlink_message_append_u8(m, IFLA_VXLAN_TOS, v->tos);
                if (r < 0)
                        return r;
        }

        /* Boolean settings are always sent, as a u8 holding 0 or 1. */
        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_LEARNING, v->learning);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_RSC, v->route_short_circuit);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_PROXY, v->arp_proxy);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_L2MISS, v->l2miss);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_L3MISS, v->l3miss);
        if (r < 0)
                return r;

        /* fdb_ageing is a usec_t; the attribute is filled with whole seconds. */
        if (v->fdb_ageing != 0) {
                r = sd_netlink_message_append_u32(m, IFLA_VXLAN_AGEING, v->fdb_ageing / USEC_PER_SEC);
                if (r < 0)
                        return r;
        }

        if (v->max_fdb != 0) {
                r = sd_netlink_message_append_u32(m, IFLA_VXLAN_LIMIT, v->max_fdb);
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_UDP_CSUM, v->udpcsum);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, v->udp6zerocsumtx);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, v->udp6zerocsumrx);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_REMCSUM_TX, v->remote_csum_tx);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u8(m, IFLA_VXLAN_REMCSUM_RX, v->remote_csum_rx);
        if (r < 0)
                return r;

        /* Port numbers and the flow label are converted to big-endian before being appended. */
        r = sd_netlink_message_append_u16(m, IFLA_VXLAN_PORT, htobe16(v->dest_port));
        if (r < 0)
                return r;

        if (v->port_range.low != 0 || v->port_range.high != 0) {
                struct ifla_vxlan_port_range port_range;

                port_range.low = htobe16(v->port_range.low);
                port_range.high = htobe16(v->port_range.high);

                r = sd_netlink_message_append_data(m, IFLA_VXLAN_PORT_RANGE, &port_range, sizeof(port_range));
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_append_u32(m, IFLA_VXLAN_LABEL, htobe32(v->flow_label));
        if (r < 0)
                return r;

        /* Extensions are flag attributes: present when enabled, absent otherwise. */
        if (v->group_policy) {
                r = sd_netlink_message_append_flag(m, IFLA_VXLAN_GBP);
                if (r < 0)
                        return r;
        }

        if (v->generic_protocol_extension) {
                r = sd_netlink_message_append_flag(m, IFLA_VXLAN_GPE);
                if (r < 0)
                        return r;
        }

        /* vxlan_init() presets df to _NETDEV_VXLAN_DF_INVALID, i.e. "not configured". */
        if (v->df != _NETDEV_VXLAN_DF_INVALID) {
                r = sd_netlink_message_append_u8(m, IFLA_VXLAN_DF, v->df);
                if (r < 0)
                        return r;
        }

        return 0;
}
lvalue, rvalue); + return 0; + } + } else { + if (r > 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s= cannot be a multicast address, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + } + + if (streq(lvalue, "Local")) + v->local_type = _NETDEV_LOCAL_ADDRESS_TYPE_INVALID; + *addr = buffer; + *family = f; + + return 0; +} + +int config_parse_port_range( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + VxLan *v = ASSERT_PTR(userdata); + int r; + + r = parse_ip_port_range(rvalue, &v->port_range.low, &v->port_range.high); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse VXLAN port range '%s'. Port should be greater than 0 and less than 65535.", rvalue); + return 0; +} + +int config_parse_flow_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + VxLan *v = userdata; + unsigned f; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = safe_atou(rvalue, &f); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse VXLAN flow label '%s'.", rvalue); + return 0; + } + + if (f & ~VXLAN_FLOW_LABEL_MAX_MASK) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "VXLAN flow label '%s' not valid. 
Flow label range should be [0-1048575].", rvalue); + return 0; + } + + v->flow_label = f; + + return 0; +} + +int config_parse_vxlan_ttl( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + VxLan *v = ASSERT_PTR(userdata); + int r; + + if (streq(rvalue, "inherit")) { + v->inherit = true; + v->ttl = 0; /* unset the unused ttl field for clarity */ + return 0; + } + + r = config_parse_unsigned_bounded( + unit, filename, line, section, section_line, lvalue, rvalue, + 0, UINT8_MAX, true, + &v->ttl); + if (r <= 0) + return r; + v->inherit = false; + return 0; +} + +static int netdev_vxlan_verify(NetDev *netdev, const char *filename) { + assert(filename); + + VxLan *v = VXLAN(netdev); + + if (v->vni > VXLAN_VID_MAX) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: VXLAN without valid VNI (or VXLAN Segment ID) configured. Ignoring.", + filename); + + if (v->ttl > 255) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: VXLAN TTL must be <= 255. Ignoring.", + filename); + + if (!v->dest_port && v->generic_protocol_extension) + v->dest_port = 4790; + + if (in_addr_is_set(v->group_family, &v->group) && in_addr_is_set(v->remote_family, &v->remote)) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: VXLAN both 'Group=' and 'Remote=' cannot be specified. 
/* Largest valid VXLAN network identifier: 2^24 - 1. The expansion is fully parenthesized so the
 * macro behaves as a single value inside any larger expression (e.g. 2 * VXLAN_VID_MAX). */
#define VXLAN_VID_MAX ((1u << 24) - 1)
/* Bits a VXLAN flow label may occupy; the largest valid label is 1048575. */
#define VXLAN_FLOW_LABEL_MAX_MASK 0xFFFFFU
NetDevLocalAddressType local_type; + union in_addr_union local; + union in_addr_union remote; + union in_addr_union group; + + unsigned tos; + unsigned ttl; + unsigned max_fdb; + unsigned flow_label; + + uint16_t dest_port; + + usec_t fdb_ageing; + + bool learning; + bool arp_proxy; + bool route_short_circuit; + bool l2miss; + bool l3miss; + bool udpcsum; + bool udp6zerocsumtx; + bool udp6zerocsumrx; + bool remote_csum_tx; + bool remote_csum_rx; + bool group_policy; + bool generic_protocol_extension; + bool inherit; + bool independent; + + struct ifla_vxlan_port_range port_range; +}; + +DEFINE_NETDEV_CAST(VXLAN, VxLan); +extern const NetDevVTable vxlan_vtable; + +const char *df_to_string(VxLanDF d) _const_; +VxLanDF df_from_string(const char *d) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_vxlan_address); +CONFIG_PARSER_PROTOTYPE(config_parse_port_range); +CONFIG_PARSER_PROTOTYPE(config_parse_flow_label); +CONFIG_PARSER_PROTOTYPE(config_parse_df); +CONFIG_PARSER_PROTOTYPE(config_parse_vxlan_ttl); diff --git a/src/network/netdev/wireguard.c b/src/network/netdev/wireguard.c new file mode 100644 index 0000000..4c7d837 --- /dev/null +++ b/src/network/netdev/wireguard.c @@ -0,0 +1,1141 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
+***/ + +#include +#include +#include +#include +#include + +#include "sd-resolve.h" + +#include "alloc-util.h" +#include "dns-domain.h" +#include "event-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "memory-util.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "networkd-route-util.h" +#include "networkd-route.h" +#include "networkd-util.h" +#include "parse-helpers.h" +#include "parse-util.h" +#include "random-util.h" +#include "resolve-private.h" +#include "string-util.h" +#include "strv.h" +#include "wireguard.h" + +static void wireguard_resolve_endpoints(NetDev *netdev); +static int peer_resolve_endpoint(WireguardPeer *peer); + +static void wireguard_peer_clear_ipmasks(WireguardPeer *peer) { + assert(peer); + + LIST_CLEAR(ipmasks, peer->ipmasks, free); +} + +static WireguardPeer* wireguard_peer_free(WireguardPeer *peer) { + if (!peer) + return NULL; + + if (peer->wireguard) { + LIST_REMOVE(peers, peer->wireguard->peers, peer); + + if (peer->section) + hashmap_remove(peer->wireguard->peers_by_section, peer->section); + } + + config_section_free(peer->section); + + wireguard_peer_clear_ipmasks(peer); + + free(peer->endpoint_host); + free(peer->endpoint_port); + free(peer->preshared_key_file); + explicit_bzero_safe(peer->preshared_key, WG_KEY_LEN); + + sd_event_source_disable_unref(peer->resolve_retry_event_source); + sd_resolve_query_unref(peer->resolve_query); + + return mfree(peer); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(WireguardPeer, wireguard_peer_free); + +static int wireguard_peer_new_static(Wireguard *w, const char *filename, unsigned section_line, WireguardPeer **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(wireguard_peer_freep) WireguardPeer *peer = NULL; + int r; + + assert(w); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + peer = 
hashmap_get(w->peers_by_section, n); + if (peer) { + *ret = TAKE_PTR(peer); + return 0; + } + + peer = new(WireguardPeer, 1); + if (!peer) + return -ENOMEM; + + *peer = (WireguardPeer) { + .flags = WGPEER_F_REPLACE_ALLOWEDIPS, + .wireguard = w, + .section = TAKE_PTR(n), + }; + + LIST_PREPEND(peers, w->peers, peer); + + r = hashmap_ensure_put(&w->peers_by_section, &config_section_hash_ops, peer->section, peer); + if (r < 0) + return r; + + *ret = TAKE_PTR(peer); + return 0; +} + +static int wireguard_set_ipmask_one(NetDev *netdev, sd_netlink_message *message, const WireguardIPmask *mask, uint16_t index) { + int r; + + assert(message); + assert(mask); + assert(index > 0); + + /* This returns 1 on success, 0 on recoverable error, and negative errno on failure. */ + + r = sd_netlink_message_open_array(message, index); + if (r < 0) + return 0; + + r = sd_netlink_message_append_u16(message, WGALLOWEDIP_A_FAMILY, mask->family); + if (r < 0) + goto cancel; + + r = netlink_message_append_in_addr_union(message, WGALLOWEDIP_A_IPADDR, mask->family, &mask->ip); + if (r < 0) + goto cancel; + + r = sd_netlink_message_append_u8(message, WGALLOWEDIP_A_CIDR_MASK, mask->cidr); + if (r < 0) + goto cancel; + + r = sd_netlink_message_close_container(message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not add wireguard allowed ip: %m"); + + return 1; + +cancel: + r = sd_netlink_message_cancel_array(message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not cancel wireguard allowed ip message attribute: %m"); + + return 0; +} + +static int wireguard_set_peer_one(NetDev *netdev, sd_netlink_message *message, const WireguardPeer *peer, uint16_t index, WireguardIPmask **mask_start) { + WireguardIPmask *start, *last = NULL; + uint16_t j = 0; + int r; + + assert(message); + assert(peer); + assert(index > 0); + assert(mask_start); + + /* This returns 1 on success, 0 on recoverable error, and negative errno on failure. 
*/ + + start = *mask_start ?: peer->ipmasks; + + r = sd_netlink_message_open_array(message, index); + if (r < 0) + return 0; + + r = sd_netlink_message_append_data(message, WGPEER_A_PUBLIC_KEY, &peer->public_key, sizeof(peer->public_key)); + if (r < 0) + goto cancel; + + if (!*mask_start) { + r = sd_netlink_message_append_data(message, WGPEER_A_PRESHARED_KEY, &peer->preshared_key, WG_KEY_LEN); + if (r < 0) + goto cancel; + + r = sd_netlink_message_append_u32(message, WGPEER_A_FLAGS, peer->flags); + if (r < 0) + goto cancel; + + r = sd_netlink_message_append_u16(message, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, peer->persistent_keepalive_interval); + if (r < 0) + goto cancel; + + if (IN_SET(peer->endpoint.sa.sa_family, AF_INET, AF_INET6)) { + r = netlink_message_append_sockaddr_union(message, WGPEER_A_ENDPOINT, &peer->endpoint); + if (r < 0) + goto cancel; + } + } + + r = sd_netlink_message_open_container(message, WGPEER_A_ALLOWEDIPS); + if (r < 0) + goto cancel; + + LIST_FOREACH(ipmasks, mask, start) { + r = wireguard_set_ipmask_one(netdev, message, mask, ++j); + if (r < 0) + return r; + if (r == 0) { + last = mask; + break; + } + } + + r = sd_netlink_message_close_container(message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not add wireguard allowed ip: %m"); + + r = sd_netlink_message_close_container(message); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not add wireguard peer: %m"); + + *mask_start = last; /* Start next cycle from this mask. 
/* Pushes the device configuration and all peers to the kernel via WG_CMD_SET_DEVICE generic netlink
 * messages.
 *
 * At least one message is always sent, even when there are no peers. Whenever wireguard_set_peer_one()
 * returns 0 for a peer, the current message is closed and sent, and a new message is started from that
 * same peer; 'mask_start' carries the position within that peer's allowed-IP list across messages.
 *
 * Returns 0 on success and a negative errno on failure. */
static int wireguard_set_interface(NetDev *netdev) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *message = NULL;
        WireguardIPmask *mask_start = NULL;
        bool sent_once = false;
        uint32_t serial;
        Wireguard *w = WIREGUARD(netdev);
        int r;

        /* Loop while there are peers left to send, or until the first message went out. */
        for (WireguardPeer *peer_start = w->peers; peer_start || !sent_once; ) {
                uint16_t i = 0;

                /* Drop the message of the previous iteration before allocating a new one. */
                message = sd_netlink_message_unref(message);

                r = sd_genl_message_new(netdev->manager->genl, WG_GENL_NAME, WG_CMD_SET_DEVICE, &message);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Failed to allocate generic netlink message: %m");

                r = sd_netlink_message_append_string(message, WGDEVICE_A_IFNAME, netdev->ifname);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not append wireguard interface name: %m");

                /* Device-wide attributes are only included while we are still at the head of the peer
                 * list (which is also the case when the list is empty). */
                if (peer_start == w->peers) {
                        r = sd_netlink_message_append_data(message, WGDEVICE_A_PRIVATE_KEY, &w->private_key, WG_KEY_LEN);
                        if (r < 0)
                                return log_netdev_error_errno(netdev, r, "Could not append wireguard private key: %m");

                        r = sd_netlink_message_append_u16(message, WGDEVICE_A_LISTEN_PORT, w->port);
                        if (r < 0)
                                return log_netdev_error_errno(netdev, r, "Could not append wireguard port: %m");

                        r = sd_netlink_message_append_u32(message, WGDEVICE_A_FWMARK, w->fwmark);
                        if (r < 0)
                                return log_netdev_error_errno(netdev, r, "Could not append wireguard fwmark: %m");

                        r = sd_netlink_message_append_u32(message, WGDEVICE_A_FLAGS, w->flags);
                        if (r < 0)
                                return log_netdev_error_errno(netdev, r, "Could not append wireguard flags: %m");
                }

                r = sd_netlink_message_open_container(message, WGDEVICE_A_PEERS);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not append wireguard peer attributes: %m");

                /* Add peers until one of them reports (by returning 0) that it was not added completely;
                 * that peer is remembered so the next message resumes with it. The array index 'i'
                 * restarts at 1 for every message. */
                WireguardPeer *peer_last = NULL;
                LIST_FOREACH(peers, peer, peer_start) {
                        r = wireguard_set_peer_one(netdev, message, peer, ++i, &mask_start);
                        if (r < 0)
                                return r;
                        if (r == 0) {
                                peer_last = peer;
                                break;
                        }
                }
                peer_start = peer_last; /* Start next cycle from this peer. */

                r = sd_netlink_message_close_container(message);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not close wireguard container: %m");

                /* 'serial' only receives the serial number of the sent message; it is not used afterwards. */
                r = sd_netlink_send(netdev->manager->genl, message, &serial);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not set wireguard device: %m");

                sent_once = true;
        }

        return 0;
}
*/ + + assert(peer); + + usec = (2 << MIN(peer->n_retries, 7U)) * 100 * USEC_PER_MSEC; + + return random_u64_range(usec / 10) + usec * 9 / 10; +} + +static int wireguard_peer_resolve_handler( + sd_resolve_query *q, + int ret, + const struct addrinfo *ai, + void *userdata) { + + WireguardPeer *peer = ASSERT_PTR(userdata); + NetDev *netdev; + int r; + + assert(peer->wireguard); + + netdev = NETDEV(peer->wireguard); + + if (!netdev_is_managed(netdev)) + return 0; + + if (ret != 0) { + log_netdev_warning(netdev, "Failed to resolve host '%s:%s', ignoring: %s", + peer->endpoint_host, peer->endpoint_port, gai_strerror(ret)); + peer->n_retries++; + + } else { + bool found = false; + for (; ai; ai = ai->ai_next) { + if (!IN_SET(ai->ai_family, AF_INET, AF_INET6)) + continue; + + if (ai->ai_addrlen != (ai->ai_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6))) + continue; + + memcpy(&peer->endpoint, ai->ai_addr, ai->ai_addrlen); + (void) wireguard_set_interface(netdev); + peer->n_retries = 0; + found = true; + break; + } + + if (!found) { + log_netdev_warning(netdev, "Neither IPv4 nor IPv6 address found for peer endpoint %s:%s, ignoring the endpoint.", + peer->endpoint_host, peer->endpoint_port); + peer->n_retries++; + } + } + + if (peer->n_retries > 0) { + r = event_reset_time_relative(netdev->manager->event, + &peer->resolve_retry_event_source, + CLOCK_BOOTTIME, + peer_next_resolve_usec(peer), 0, + on_resolve_retry, peer, 0, "wireguard-resolve-retry", true); + if (r < 0) + log_netdev_warning_errno(netdev, r, "Could not arm resolve retry handler for endpoint %s:%s, ignoring: %m", + peer->endpoint_host, peer->endpoint_port); + } + + wireguard_resolve_endpoints(netdev); + return 0; +} + +static int peer_resolve_endpoint(WireguardPeer *peer) { + static const struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_DGRAM, + .ai_protocol = IPPROTO_UDP + }; + NetDev *netdev; + int r; + + assert(peer); + assert(peer->wireguard); + + 
netdev = NETDEV(peer->wireguard); + + if (!peer->endpoint_host || !peer->endpoint_port) + /* Not necessary to resolve the endpoint. */ + return 0; + + if (sd_event_source_get_enabled(peer->resolve_retry_event_source, NULL) > 0) + /* Timer event source is enabled. The endpoint will be resolved later. */ + return 0; + + if (peer->resolve_query) + /* Being resolved, or already resolved. */ + return 0; + + r = sd_resolve_getaddrinfo(netdev->manager->resolve, + &peer->resolve_query, + peer->endpoint_host, + peer->endpoint_port, + &hints, + wireguard_peer_resolve_handler, + peer); + if (r < 0) + return log_netdev_full_errno(netdev, r == -ENOBUFS ? LOG_DEBUG : LOG_WARNING, r, + "Failed to create endpoint resolver for %s:%s, ignoring: %m", + peer->endpoint_host, peer->endpoint_port); + + return 0; +} + +static void wireguard_resolve_endpoints(NetDev *netdev) { + Wireguard *w = WIREGUARD(netdev); + + LIST_FOREACH(peers, peer, w->peers) + if (peer_resolve_endpoint(peer) == -ENOBUFS) + /* Too many requests. Let's resolve remaining endpoints later. 
*/ + break; +} + +static int netdev_wireguard_post_create(NetDev *netdev, Link *link) { + assert(WIREGUARD(netdev)); + + (void) wireguard_set_interface(netdev); + wireguard_resolve_endpoints(netdev); + return 0; +} + +int config_parse_wireguard_listen_port( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint16_t *s = ASSERT_PTR(data); + int r; + + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "auto")) { + *s = 0; + return 0; + } + + r = parse_ip_port(rvalue, s); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid port specification, ignoring assignment: %s", rvalue); + return 0; + } + + return 0; +} + +static int wireguard_decode_key_and_warn( + const char *rvalue, + uint8_t ret[static WG_KEY_LEN], + const char *unit, + const char *filename, + unsigned line, + const char *lvalue) { + + _cleanup_(erase_and_freep) void *key = NULL; + size_t len; + int r; + + assert(rvalue); + assert(ret); + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + memzero(ret, WG_KEY_LEN); + return 0; + } + + if (!streq(lvalue, "PublicKey")) + (void) warn_file_is_world_accessible(filename, NULL, unit, line); + + r = unbase64mem_full(rvalue, strlen(rvalue), true, &key, &len); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to decode wireguard key provided by %s=, ignoring assignment: %m", lvalue); + return 0; + } + if (len != WG_KEY_LEN) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Wireguard key provided by %s= has invalid length (%zu bytes), ignoring assignment.", + lvalue, len); + return 0; + } + + memcpy(ret, key, WG_KEY_LEN); + return 0; +} + +int config_parse_wireguard_private_key( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char 
*lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Wireguard *w = WIREGUARD(data); + + return wireguard_decode_key_and_warn(rvalue, w->private_key, unit, filename, line, lvalue); +} + +int config_parse_wireguard_private_key_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Wireguard *w = WIREGUARD(data); + _cleanup_free_ char *path = NULL; + + if (isempty(rvalue)) { + w->private_key_file = mfree(w->private_key_file); + return 0; + } + + path = strdup(rvalue); + if (!path) + return log_oom(); + + if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0) + return 0; + + return free_and_replace(w->private_key_file, path); +} + +int config_parse_wireguard_peer_key( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Wireguard *w = WIREGUARD(data); + _cleanup_(wireguard_peer_free_or_set_invalidp) WireguardPeer *peer = NULL; + int r; + + r = wireguard_peer_new_static(w, filename, section_line, &peer); + if (r < 0) + return log_oom(); + + r = wireguard_decode_key_and_warn(rvalue, + streq(lvalue, "PublicKey") ? 
peer->public_key : peer->preshared_key, + unit, filename, line, lvalue); + if (r < 0) + return r; + + TAKE_PTR(peer); + return 0; +} + +int config_parse_wireguard_preshared_key_file( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Wireguard *w = WIREGUARD(data); + _cleanup_(wireguard_peer_free_or_set_invalidp) WireguardPeer *peer = NULL; + _cleanup_free_ char *path = NULL; + int r; + + r = wireguard_peer_new_static(w, filename, section_line, &peer); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + peer->preshared_key_file = mfree(peer->preshared_key_file); + TAKE_PTR(peer); + return 0; + } + + path = strdup(rvalue); + if (!path) + return log_oom(); + + if (path_simplify_and_warn(path, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue) < 0) + return 0; + + free_and_replace(peer->preshared_key_file, path); + TAKE_PTR(peer); + return 0; +} + +int config_parse_wireguard_allowed_ips( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + assert(rvalue); + + Wireguard *w = WIREGUARD(data); + _cleanup_(wireguard_peer_free_or_set_invalidp) WireguardPeer *peer = NULL; + union in_addr_union addr; + unsigned char prefixlen; + int r, family; + WireguardIPmask *ipmask; + + r = wireguard_peer_new_static(w, filename, section_line, &peer); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + wireguard_peer_clear_ipmasks(peer); + TAKE_PTR(peer); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + union in_addr_union masked; + + r = extract_first_word(&p, &word, "," WHITESPACE, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to 
/* Config parser for the per-peer endpoint setting.
 *
 * Two forms are handled:
 *   1. A literal IP address with port: stored directly in peer->endpoint, and any previously configured
 *      host/port strings are dropped.
 *   2. "host:port": the value is split at the last ':', the host is checked with dns_name_is_valid(),
 *      the port with parse_ip_port(), and both are stored as strings in peer->endpoint_host and
 *      peer->endpoint_port (these are what peer_resolve_endpoint() later passes to the resolver).
 *      peer->endpoint is zeroed in this case.
 *
 * Invalid values are logged and ignored (return 0). Note that on those paths, and on the OOM paths after
 * the peer was obtained, TAKE_PTR(peer) is not reached, so the wireguard_peer_free_or_set_invalidp
 * cleanup handler runs on the peer — presumably freeing it or marking its section invalid, as the name
 * suggests; its definition is not part of this function. */
int config_parse_wireguard_endpoint(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        assert(filename);
        assert(rvalue);
        assert(userdata);

        Wireguard *w = WIREGUARD(userdata);
        _cleanup_(wireguard_peer_free_or_set_invalidp) WireguardPeer *peer = NULL;
        _cleanup_free_ char *host = NULL;
        union in_addr_union addr;
        const char *p;
        uint16_t port;
        int family, r;

        r = wireguard_peer_new_static(w, filename, section_line, &peer);
        if (r < 0)
                return log_oom();

        /* First try to interpret the value as a literal address plus port. */
        r = in_addr_port_ifindex_name_from_string_auto(rvalue, &family, &addr, &port, NULL, NULL);
        if (r >= 0) {
                if (family == AF_INET)
                        peer->endpoint.in = (struct sockaddr_in) {
                                .sin_family = AF_INET,
                                .sin_addr = addr.in,
                                .sin_port = htobe16(port),
                        };
                else if (family == AF_INET6)
                        peer->endpoint.in6 = (struct sockaddr_in6) {
                                .sin6_family = AF_INET6,
                                .sin6_addr = addr.in6,
                                .sin6_port = htobe16(port),
                        };
                else
                        assert_not_reached();

                /* A literal address needs no resolving; forget any earlier host name setting. */
                peer->endpoint_host = mfree(peer->endpoint_host);
                peer->endpoint_port = mfree(peer->endpoint_port);

                TAKE_PTR(peer);
                return 0;
        }

        /* Otherwise treat it as "host:port", where the port follows the last colon. */
        p = strrchr(rvalue, ':');
        if (!p) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Unable to find port of endpoint, ignoring assignment: %s",
                           rvalue);
                return 0;
        }

        host = strndup(rvalue, p - rvalue);
        if (!host)
                return log_oom();

        if (!dns_name_is_valid(host)) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Invalid domain name of endpoint, ignoring assignment: %s",
                           rvalue);
                return 0;
        }

        /* Skip the colon; 'p' now points at the port string. The parsed number is only used for
         * validation here, the port is stored as a string below. */
        p++;
        r = parse_ip_port(p, &port);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Invalid port of endpoint, ignoring assignment: %s",
                           rvalue);
                return 0;
        }

        /* Reset any previously configured literal address. */
        peer->endpoint = (union sockaddr_union) {};

        free_and_replace(peer->endpoint_host, host);

        r = free_and_strdup(&peer->endpoint_port, p);
        if (r < 0)
                return log_oom();

        TAKE_PTR(peer); /* The peer may already have been in the hash map, that is fine too. */
        return 0;
}
*section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + NetDev *netdev = ASSERT_PTR(userdata); + uint32_t *table = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || parse_boolean(rvalue) == 0) { + *table = 0; /* Disabled. */ + return 0; + } + + r = manager_get_route_table_from_string(netdev->manager, rvalue, table); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_wireguard_peer_route_table( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Wireguard *w = WIREGUARD(userdata); + _cleanup_(wireguard_peer_free_or_set_invalidp) WireguardPeer *peer = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(NETDEV(w)->manager); + + r = wireguard_peer_new_static(w, filename, section_line, &peer); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + peer->route_table_set = false; /* Use the table specified in [WireGuard] section. */ + TAKE_PTR(peer); + return 0; + } + + if (parse_boolean(rvalue) == 0) { + peer->route_table = 0; /* Disabled. 
*/ + peer->route_table_set = true; + TAKE_PTR(peer); + return 0; + } + + r = manager_get_route_table_from_string(NETDEV(w)->manager, rvalue, &peer->route_table); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + peer->route_table_set = true; + TAKE_PTR(peer); + return 0; +} + +int config_parse_wireguard_route_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *priority = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *priority = 0; + return 0; + } + + r = safe_atou32(rvalue, priority); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse route priority \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + return 0; +} + +int config_parse_wireguard_peer_route_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(wireguard_peer_free_or_set_invalidp) WireguardPeer *peer = NULL; + Wireguard *w; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(userdata); + + w = WIREGUARD(userdata); + assert(w); + + r = wireguard_peer_new_static(w, filename, section_line, &peer); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + peer->route_priority_set = false; /* Use the priority specified in [WireGuard] section. 
*/ + TAKE_PTR(peer); + return 0; + } + + r = safe_atou32(rvalue, &peer->route_priority); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse route priority \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + peer->route_priority_set = true; + TAKE_PTR(peer); + return 0; +} + +static void wireguard_init(NetDev *netdev) { + Wireguard *w = WIREGUARD(netdev); + + w->flags = WGDEVICE_F_REPLACE_PEERS; +} + +static void wireguard_done(NetDev *netdev) { + Wireguard *w = WIREGUARD(netdev); + + explicit_bzero_safe(w->private_key, WG_KEY_LEN); + free(w->private_key_file); + + hashmap_free_with_destructor(w->peers_by_section, wireguard_peer_free); + + set_free(w->routes); +} + +static int wireguard_read_key_file(const char *filename, uint8_t dest[static WG_KEY_LEN]) { + _cleanup_(erase_and_freep) char *key = NULL; + size_t key_len; + int r; + + if (!filename) + return 0; + + assert(dest); + + r = read_full_file_full( + AT_FDCWD, filename, UINT64_MAX, WG_KEY_LEN, + READ_FULL_FILE_SECURE | + READ_FULL_FILE_UNBASE64 | + READ_FULL_FILE_WARN_WORLD_READABLE | + READ_FULL_FILE_CONNECT_SOCKET | + READ_FULL_FILE_FAIL_WHEN_LARGER, + NULL, &key, &key_len); + if (r < 0) + return r; + + if (key_len != WG_KEY_LEN) + return -EINVAL; + + memcpy(dest, key, WG_KEY_LEN); + return 0; +} + +static int wireguard_peer_verify(WireguardPeer *peer) { + NetDev *netdev = NETDEV(peer->wireguard); + int r; + + if (section_is_invalid(peer->section)) + return -EINVAL; + + if (eqzero(peer->public_key)) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: WireGuardPeer section without PublicKey= configured. " + "Ignoring [WireGuardPeer] section from line %u.", + peer->section->filename, peer->section->line); + + r = wireguard_read_key_file(peer->preshared_key_file, peer->preshared_key); + if (r < 0) + return log_netdev_error_errno(netdev, r, + "%s: Failed to read preshared key from '%s'. 
" + "Ignoring [WireGuardPeer] section from line %u.", + peer->section->filename, peer->preshared_key_file, + peer->section->line); + + return 0; +} + +static int wireguard_verify(NetDev *netdev, const char *filename) { + Wireguard *w = WIREGUARD(netdev); + int r; + + r = wireguard_read_key_file(w->private_key_file, w->private_key); + if (r < 0) + return log_netdev_error_errno(netdev, r, + "Failed to read private key from %s. Ignoring network device.", + w->private_key_file); + + if (eqzero(w->private_key)) + return log_netdev_error_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: Missing PrivateKey= or PrivateKeyFile=, " + "Ignoring network device.", filename); + + LIST_FOREACH(peers, peer, w->peers) { + if (wireguard_peer_verify(peer) < 0) { + wireguard_peer_free(peer); + continue; + } + + if ((peer->route_table_set ? peer->route_table : w->route_table) == 0) + continue; + + LIST_FOREACH(ipmasks, ipmask, peer->ipmasks) { + _cleanup_(route_freep) Route *route = NULL; + + r = route_new(&route); + if (r < 0) + return log_oom(); + + route->family = ipmask->family; + route->dst = ipmask->ip; + route->dst_prefixlen = ipmask->cidr; + route->scope = RT_SCOPE_UNIVERSE; + route->protocol = RTPROT_STATIC; + route->table = peer->route_table_set ? peer->route_table : w->route_table; + route->priority = peer->route_priority_set ? 
/* One configured WireGuard peer, i.e. one [WireGuardPeer] section of a .netdev file. */
typedef struct WireguardPeer {
        /* Owning device. The peer is linked into wireguard->peers and registered in
         * wireguard->peers_by_section under 'section'. */
        Wireguard *wireguard;
        ConfigSection *section;

        /* Key material. An all-zero public key makes the peer fail verification. The preshared key may
         * additionally be loaded from 'preshared_key_file' during verification. */
        uint8_t public_key[WG_KEY_LEN];
        uint8_t preshared_key[WG_KEY_LEN];
        char *preshared_key_file;
        /* Sent as WGPEER_A_FLAGS; new peers start with WGPEER_F_REPLACE_ALLOWEDIPS. */
        uint32_t flags;
        /* Sent as WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL; 0 when configured as "off". */
        uint16_t persistent_keepalive_interval;

        /* Endpoint as socket address; only sent to the kernel when its family is AF_INET or AF_INET6. */
        union sockaddr_union endpoint;
        /* Endpoint given as host name and port string, to be resolved asynchronously. Both are NULL
         * when the endpoint was configured as a literal address. */
        char *endpoint_host;
        char *endpoint_port;

        /* Number of failed resolve attempts; reset to 0 on success, used to compute the retry delay. */
        unsigned n_retries;
        /* Timer that triggers the next resolve attempt. */
        sd_event_source *resolve_retry_event_source;
        /* Lookup that is in flight or has completed. */
        sd_resolve_query *resolve_query;

        /* Per-peer route settings. They only apply when the matching *_set flag is true; otherwise the
         * values from the Wireguard object are used when routes are generated. */
        uint32_t route_table;
        uint32_t route_priority;
        bool route_table_set;
        bool route_priority_set;

        /* Allowed IP prefixes of this peer. */
        LIST_HEAD(WireguardIPmask, ipmasks);
        /* Linkage for Wireguard.peers. */
        LIST_FIELDS(struct WireguardPeer, peers);
} WireguardPeer;
flags; + uint8_t private_key[WG_KEY_LEN]; + char *private_key_file; + uint16_t port; + uint32_t fwmark; + + Hashmap *peers_by_section; + LIST_HEAD(WireguardPeer, peers); + + Set *routes; + uint32_t route_table; + uint32_t route_priority; +}; + +DEFINE_NETDEV_CAST(WIREGUARD, Wireguard); +extern const NetDevVTable wireguard_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_allowed_ips); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_endpoint); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_listen_port); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_peer_key); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_private_key); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_private_key_file); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_preshared_key_file); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_keepalive); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_route_table); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_peer_route_table); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_route_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_wireguard_peer_route_priority); diff --git a/src/network/netdev/wlan.c b/src/network/netdev/wlan.c new file mode 100644 index 0000000..904e40f --- /dev/null +++ b/src/network/netdev/wlan.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-netlink.h" + +#include "netlink-util.h" +#include "networkd-manager.h" +#include "networkd-wiphy.h" +#include "parse-util.h" +#include "wifi-util.h" +#include "wlan.h" + +static void wlan_done(NetDev *netdev) { + WLan *w = WLAN(netdev); + + w->wiphy_name = mfree(w->wiphy_name); +} + +static void wlan_init(NetDev *netdev) { + WLan *w = WLAN(netdev); + + w->wiphy_index = UINT32_MAX; + w->wds = -1; +} + +static int wlan_get_wiphy(NetDev *netdev, Wiphy **ret) { + WLan *w = WLAN(netdev); + + if (w->wiphy_name) + return wiphy_get_by_name(netdev->manager, w->wiphy_name, ret); + + return wiphy_get_by_index(netdev->manager, w->wiphy_index, ret); +} 
+ /* Readiness check used by the vtable: evaluates to 1 once wlan_get_wiphy() succeeds (returns >= 0) for this netdev, 0 otherwise. The link argument is not used. */+static int wlan_is_ready_to_create(NetDev *netdev, Link *link) { + return wlan_get_wiphy(netdev, NULL) >= 0; +} + /* Appends the attributes of the new-interface request to m: wiphy index, interface name and interface type always; the MAC address only when one is set and is ETH_ALEN bytes long; the 4-address flag only when wds is non-negative. Returns 0 on success or the first negative value returned by a helper. */+static int wlan_fill_message(NetDev *netdev, sd_netlink_message *m) { + WLan *w = WLAN(netdev); + Wiphy *wiphy; + int r; + + r = wlan_get_wiphy(netdev, &wiphy); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NL80211_ATTR_WIPHY, wiphy->index); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NL80211_ATTR_IFNAME, netdev->ifname); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NL80211_ATTR_IFTYPE, w->iftype); + if (r < 0) + return r; + + if (!hw_addr_is_null(&netdev->hw_addr) && netdev->hw_addr.length == ETH_ALEN) { + r = sd_netlink_message_append_ether_addr(m, NL80211_ATTR_MAC, &netdev->hw_addr.ether); + if (r < 0) + return r; + } + + if (w->wds >= 0) { + r = sd_netlink_message_append_u8(m, NL80211_ATTR_4ADDR, w->wds); + if (r < 0) + return r; + } + + return 0; +} + /* Reply handler registered by wlan_create(). -EEXIST and -ENFILE are logged at info level and do not fail the netdev; any other negative errno is logged as a warning and the netdev enters the failed state. Returns 1 on every path. */+static int wlan_create_handler(sd_netlink *genl, sd_netlink_message *m, NetDev *netdev) { + int r; + + assert(netdev); + assert(netdev->state != _NETDEV_STATE_INVALID); + + r = sd_netlink_message_get_errno(m); + if (IN_SET(r, -EEXIST, -ENFILE)) + /* Unlike the other netdevs, the kernel may return -ENFILE. See dev_alloc_name(). 
*/ + log_netdev_info(netdev, "WLAN interface exists, using existing without changing its parameters."); + else if (r < 0) { + log_netdev_warning_errno(netdev, r, "WLAN interface could not be created: %m"); + netdev_enter_failed(netdev); + + return 1; + } + + log_netdev_debug(netdev, "WLAN interface is created."); + return 1; +} + /* Allocates an NL80211_CMD_NEW_INTERFACE generic netlink message, fills it via wlan_fill_message() and sends it asynchronously with wlan_create_handler() as reply callback and netdev_destroy_callback as destroy callback. The extra netdev reference is taken only after the send succeeded. Each failing step logs a warning and returns the value of log_netdev_warning_errno(); success returns 0. */+static int wlan_create(NetDev *netdev) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netdev); + assert(netdev->manager); + assert(netdev->manager->genl); + + r = sd_genl_message_new(netdev->manager->genl, NL80211_GENL_NAME, NL80211_CMD_NEW_INTERFACE, &m); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to allocate netlink message: %m"); + + r = wlan_fill_message(netdev, m); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to fill netlink message: %m"); + + r = netlink_call_async(netdev->manager->genl, NULL, m, wlan_create_handler, + netdev_destroy_callback, netdev); + if (r < 0) + return log_netdev_warning_errno(netdev, r, "Failed to send netlink message: %m"); + + netdev_ref(netdev); + return 0; +} + /* Configuration check: logs a warning with SYNTHETIC_ERRNO(EINVAL) and returns that result when the interface type is still NL80211_IFTYPE_UNSPECIFIED, or when neither a wiphy index (still UINT32_MAX) nor a non-empty wiphy name was configured. Returns 0 otherwise. */+static int wlan_verify(NetDev *netdev, const char *filename) { + WLan *w = WLAN(netdev); + + assert(filename); + + if (w->iftype == NL80211_IFTYPE_UNSPECIFIED) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: WLAN interface type is not specified, ignoring.", + filename); + + if (w->wiphy_index == UINT32_MAX && isempty(w->wiphy_name)) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: physical WLAN device is not specified, ignoring.", + filename); + + return 0; +} + /* Config parser for the physical device setting; userdata is the WLan netdev. An empty value resets both name and index. A value accepted by safe_atou32() is stored as index and the name is cleared; any other value is duplicated into wiphy_name and the index is reset to UINT32_MAX. Only a failed string duplication returns an error. */+int config_parse_wiphy( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + WLan *w = WLAN(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + 
w->wiphy_name = mfree(w->wiphy_name); + w->wiphy_index = UINT32_MAX; + return 0; + } + + r = safe_atou32(rvalue, &w->wiphy_index); + if (r >= 0) { + w->wiphy_name = mfree(w->wiphy_name); + return 0; + } + + r = free_and_strdup_warn(&w->wiphy_name, rvalue); + if (r < 0) + return r; + + w->wiphy_index = UINT32_MAX; + return 0; +} + /* Config parser for the interface type; data must point at the enum nl80211_iftype to fill in (enforced by ASSERT_PTR). An empty value resets it to NL80211_IFTYPE_UNSPECIFIED. A value that nl80211_iftype_from_string() maps to a negative number is logged with log_syntax() and the assignment is skipped. Returns 0 on every path. */+int config_parse_wlan_iftype( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + enum nl80211_iftype t, *iftype = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *iftype = NL80211_IFTYPE_UNSPECIFIED; + return 0; + } + + t = nl80211_iftype_from_string(rvalue); + /* We reuse the kernel provided enum which does not contain negative value. So, the cast + * below is mandatory. Otherwise, the check below always passes. */ + if ((int) t < 0) { + log_syntax(unit, LOG_WARNING, filename, line, t, + "Failed to parse wlan interface type, ignoring assignment: %s", + rvalue); + return 0; + } + + *iftype = t; + return 0; +} + /* NetDev vtable for WLAN interfaces. It registers the wlan_init, wlan_done, wlan_is_ready_to_create, wlan_create and wlan_verify callbacks and lists the WLAN section in addition to NETDEV_COMMON_SECTIONS; create_type is NETDEV_CREATE_INDEPENDENT. */+const NetDevVTable wlan_vtable = { + .object_size = sizeof(WLan), + .init = wlan_init, + .done = wlan_done, + .sections = NETDEV_COMMON_SECTIONS "WLAN\0", + .is_ready_to_create = wlan_is_ready_to_create, + .create = wlan_create, + .create_type = NETDEV_CREATE_INDEPENDENT, + .config_verify = wlan_verify, + .iftype = ARPHRD_ETHER, + .generate_mac = true, + .skip_netdev_kind_check = true, +}; diff --git a/src/network/netdev/wlan.h b/src/network/netdev/wlan.h new file mode 100644 index 0000000..bcc2dbc --- /dev/null +++ b/src/network/netdev/wlan.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "netdev.h" + /* State of one WLAN netdev. wiphy_name and wiphy_index identify the physical device; config_parse_wiphy() keeps at most one of them set (index UINT32_MAX means unset). wds is initialised to -1 by wlan_init() and is only sent to the kernel when it is >= 0. */+typedef struct WLan { + NetDev meta; + + char *wiphy_name; + uint32_t wiphy_index; + enum nl80211_iftype iftype; + int wds; /* tristate */ +} 
WLan; + +DEFINE_NETDEV_CAST(WLAN, WLan); +extern const NetDevVTable wlan_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_wiphy); +CONFIG_PARSER_PROTOTYPE(config_parse_wlan_iftype); diff --git a/src/network/netdev/xfrm.c b/src/network/netdev/xfrm.c new file mode 100644 index 0000000..905bfc0 --- /dev/null +++ b/src/network/netdev/xfrm.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "missing_network.h" +#include "xfrm.h" + /* Appends the xfrm attributes to the create message: IFLA_XFRM_LINK carries the ifindex of link, or LOOPBACK_IFINDEX when link is NULL (the assert permits a NULL link only when x->independent is set), followed by IFLA_XFRM_IF_ID. Returns 0 on success or the first negative value returned by a helper. */+static int xfrm_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *message) { + assert(message); + + Xfrm *x = XFRM(netdev); + int r; + + assert(link || x->independent); + + r = sd_netlink_message_append_u32(message, IFLA_XFRM_LINK, link ? link->ifindex : LOOPBACK_IFINDEX); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(message, IFLA_XFRM_IF_ID, x->if_id); + if (r < 0) + return r; + + return 0; +} + /* Configuration check: an interface ID of 0 is rejected with a logged warning (SYNTHETIC_ERRNO(EINVAL)); returns 0 otherwise. */+static int xfrm_verify(NetDev *netdev, const char *filename) { + assert(filename); + + Xfrm *x = XFRM(netdev); + + if (x->if_id == 0) + return log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "%s: Xfrm interface ID cannot be zero.", filename); + return 0; +} + /* NetDev vtable for xfrm interfaces: registers xfrm_fill_message_create and xfrm_verify and lists the Xfrm section in addition to NETDEV_COMMON_SECTIONS; create_type is NETDEV_CREATE_STACKED. */+const NetDevVTable xfrm_vtable = { + .object_size = sizeof(Xfrm), + .sections = NETDEV_COMMON_SECTIONS "Xfrm\0", + .fill_message_create = xfrm_fill_message_create, + .config_verify = xfrm_verify, + .create_type = NETDEV_CREATE_STACKED, + .iftype = ARPHRD_NONE, +}; diff --git a/src/network/netdev/xfrm.h b/src/network/netdev/xfrm.h new file mode 100644 index 0000000..f56c4f2 --- /dev/null +++ b/src/network/netdev/xfrm.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "netdev.h" + /* State of one xfrm netdev. if_id is sent as IFLA_XFRM_IF_ID and must be non-zero (see xfrm_verify()); independent allows xfrm_fill_message_create() to run without an underlying link. */+typedef struct Xfrm { + NetDev meta; + + uint32_t if_id; + bool independent; +} Xfrm; + +DEFINE_NETDEV_CAST(XFRM, Xfrm); +extern const NetDevVTable xfrm_vtable; diff --git a/src/network/networkctl.c b/src/network/networkctl.c new file mode 100644 index 0000000..ec31e8e --- /dev/null +++ 
b/src/network/networkctl.c @@ -0,0 +1,3499 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-dhcp-client.h" +#include "sd-hwdb.h" +#include "sd-lldp-rx.h" +#include "sd-netlink.h" +#include "sd-network.h" + +#include "alloc-util.h" +#include "bond-util.h" +#include "bridge-util.h" +#include "build.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-wait-for-jobs.h" +#include "conf-files.h" +#include "device-util.h" +#include "edit-util.h" +#include "escape.h" +#include "ether-addr-util.h" +#include "ethtool-util.h" +#include "fd-util.h" +#include "format-table.h" +#include "format-util.h" +#include "fs-util.h" +#include "geneve-util.h" +#include "glob-util.h" +#include "hwdb-util.h" +#include "ipvlan-util.h" +#include "local-addresses.h" +#include "locale-util.h" +#include "logs-show.h" +#include "macro.h" +#include "macvlan-util.h" +#include "main-func.h" +#include "netif-util.h" +#include "netlink-util.h" +#include "network-internal.h" +#include "network-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "pretty-print.h" +#include "set.h" +#include "sigbus.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "sort-util.h" +#include "sparse-endian.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "terminal-util.h" +#include "udev-util.h" +#include "unit-def.h" +#include "verbs.h" +#include "virt.h" +#include "wifi-util.h" + +/* Kernel defines MODULE_NAME_LEN as 64 - sizeof(unsigned long). So, 64 is enough. 
*/ +#define NETDEV_KIND_MAX 64 + +/* use 128 kB for receive socket kernel queue, we shouldn't need more here */ +#define RCVBUF_SIZE (128*1024) + /* Command line state with its defaults; NOTE(review): presumably filled in by the option parser elsewhere in this file - confirm. */+static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static bool arg_no_reload = false; +static bool arg_all = false; +static bool arg_stats = false; +static bool arg_full = false; +static unsigned arg_lines = 10; +static char *arg_drop_in = NULL; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; + +STATIC_DESTRUCTOR_REGISTER(arg_drop_in, freep); + /* Compares the NamespaceId bus property of networkd with the inode number of /proc/self/ns/net. A failed property query or an ID of 0 skips the check and returns 0; a failing stat() or a mismatch (SYNTHETIC_ERRNO(EREMOTE)) is logged as an error and returned. */+static int check_netns_match(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + struct stat st; + uint64_t id; + int r; + + assert(bus); + + r = bus_get_property_trivial(bus, bus_network_mgr, "NamespaceId", &error, 't', &id); + if (r < 0) { + log_debug_errno(r, "Failed to query network namespace of networkd, ignoring: %s", bus_error_message(&error, r)); + return 0; + } + if (id == 0) { + log_debug("systemd-networkd.service not running in a network namespace (?), skipping netns check."); + return 0; + } + + if (stat("/proc/self/ns/net", &st) < 0) + return log_error_errno(errno, "Failed to determine our own network namespace ID: %m"); + + if (id != st.st_ino) + return log_error_errno(SYNTHETIC_ERRNO(EREMOTE), + "networkctl must be invoked in same network namespace as systemd-networkd.service."); + + return 0; +} + /* Reports whether /run/systemd/netif/state is accessible. The answer is kept in a function-local static, so access() is called only on the first invocation; an access() failure other than ENOENT is logged at debug level and treated as not running. */+static bool networkd_is_running(void) { + static int cached = -1; + int r; + + if (cached < 0) { + r = access("/run/systemd/netif/state", F_OK); + if (r < 0) { + if (errno != ENOENT) + log_debug_errno(errno, + "Failed to determine whether networkd is running, assuming it's not: %m"); + + cached = false; + } else + cached = true; + } + + return cached; +} + /* Opens the system bus and passes it to the caller through ret. When networkd_is_running() is true the network namespace check must pass first; otherwise only a warning about possibly incomplete output is logged. Returns 0 on success or a negative error. */+static int acquire_bus(sd_bus **ret) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + assert(ret); + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + 
if (networkd_is_running()) { + r = check_netns_match(bus); + if (r < 0) + return r; + } else + log_warning("systemd-networkd is not running, output might be incomplete."); + + *ret = TAKE_PTR(bus); + return 0; +} + /* Calls the Describe method on the networkd manager object, reads the single string of the reply and parses it as JSON into *ret. Each failing step is logged and its negative result returned; success returns 0. */+static int get_description(sd_bus *bus, JsonVariant **ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *text; + int r; + + assert(bus); + assert(ret); + + r = bus_call_method(bus, bus_network_mgr, "Describe", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to get description: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &text); + if (r < 0) + return bus_log_parse_error(r); + + r = json_parse(text, 0, ret, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse JSON: %m"); + + return 0; +} + /* Fetches the manager description with get_description() and dumps the whole document using arg_json_format_flags. */+static int dump_manager_description(sd_bus *bus) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert(bus); + + r = get_description(bus, &v); + if (r < 0) + return r; + + json_variant_dump(v, arg_json_format_flags, NULL, NULL); + return 0; +} + /* Dumps the JSON description of every entry of the Interfaces array whose index (as decimal string), name, or one of its alternative names matches one of patterns. matched_patterns records which pattern produced a match; afterwards an unmatched non-glob pattern is an error (SYNTHETIC_ERRNO(ENODEV)) while an unmatched glob is only logged at debug level. A warning is logged when nothing was dumped. */+static int dump_link_description(sd_bus *bus, char * const *patterns) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_free_ bool *matched_patterns = NULL; + JsonVariant *i; + size_t c = 0; + int r; + + assert(bus); + assert(patterns); + + r = get_description(bus, &v); + if (r < 0) + return r; + + matched_patterns = new0(bool, strv_length(patterns)); + if (!matched_patterns) + return log_oom(); + + JSON_VARIANT_ARRAY_FOREACH(i, json_variant_by_key(v, "Interfaces")) { + char ifindex_str[DECIMAL_STR_MAX(int64_t)]; + const char *name; + int64_t index; + size_t pos; + + name = json_variant_string(json_variant_by_key(i, "Name")); + index = json_variant_integer(json_variant_by_key(i, "Index")); + xsprintf(ifindex_str, "%" PRIi64, index); + + if (!strv_fnmatch_full(patterns, ifindex_str, 0, &pos) && + 
!strv_fnmatch_full(patterns, name, 0, &pos)) { + bool match = false; + JsonVariant *a; + + JSON_VARIANT_ARRAY_FOREACH(a, json_variant_by_key(i, "AlternativeNames")) + if (strv_fnmatch_full(patterns, json_variant_string(a), 0, &pos)) { + match = true; + break; + } + + if (!match) + continue; + } + + matched_patterns[pos] = true; + json_variant_dump(i, arg_json_format_flags, NULL, NULL); + c++; + } + + /* Look if we matched all our arguments that are not globs. It is OK for a glob to match + * nothing, but not for an exact argument. */ + for (size_t pos = 0; pos < strv_length(patterns); pos++) { + if (matched_patterns[pos]) + continue; + + if (string_is_glob(patterns[pos])) + log_debug("Pattern \"%s\" doesn't match any interface, ignoring.", + patterns[pos]); + else + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), + "Interface \"%s\" not found.", patterns[pos]); + } + + if (c == 0) + log_warning("No interfaces matched."); + + return 0; +} + /* Picks the color strings for an operational state: highlight green for routable or enslaved, and also for carrier on the interface named lo; highlight yellow for degraded; empty strings for anything else. on and off are optional outputs and are written only when non-NULL. */+static void operational_state_to_color( + const char *name, + const char *state, + const char **on, + const char **off) { + + if (STRPTR_IN_SET(state, "routable", "enslaved") || + (streq_ptr(name, "lo") && streq_ptr(state, "carrier"))) { + if (on) + *on = ansi_highlight_green(); + if (off) + *off = ansi_normal(); + } else if (streq_ptr(state, "degraded")) { + if (on) + *on = ansi_highlight_yellow(); + if (off) + *off = ansi_normal(); + } else { + if (on) + *on = ""; + if (off) + *off = ""; + } +} + /* Picks the color strings for a setup state: highlight green for configured, highlight yellow for configuring, highlight red for failed or linger, empty strings otherwise. on and off are optional outputs. */+static void setup_state_to_color(const char *state, const char **on, const char **off) { + if (streq_ptr(state, "configured")) { + if (on) + *on = ansi_highlight_green(); + if (off) + *off = ansi_normal(); + } else if (streq_ptr(state, "configuring")) { + if (on) + *on = ansi_highlight_yellow(); + if (off) + *off = ansi_normal(); + } else if (STRPTR_IN_SET(state, "failed", "linger")) { + if (on) + *on = ansi_highlight_red(); + if (off) + *off = ansi_normal(); + } else { + if (on) + *on = ""; + if (off) + *off = ""; + } +} + /* Picks the color strings for an online state: highlight green for online, highlight yellow for partial, empty strings otherwise. on and off are optional outputs. */+static 
void online_state_to_color(const char *state, const char **on, const char **off) { + if (streq_ptr(state, "online")) { + if (on) + *on = ansi_highlight_green(); + if (off) + *off = ansi_normal(); + } else if (streq_ptr(state, "partial")) { + if (on) + *on = ansi_highlight_yellow(); + if (off) + *off = ansi_normal(); + } else { + if (on) + *on = ""; + if (off) + *off = ""; + } +} + +typedef struct VxLanInfo { + uint32_t vni; + uint32_t link; + + int local_family; + int group_family; + + union in_addr_union local; + union in_addr_union group; + + uint16_t dest_port; + + uint8_t proxy; + uint8_t learning; + uint8_t rsc; + uint8_t l2miss; + uint8_t l3miss; + uint8_t tos; + uint8_t ttl; +} VxLanInfo; + +typedef struct LinkInfo { + char name[IFNAMSIZ+1]; + char *netdev_kind; + sd_device *sd_device; + int ifindex; + unsigned short iftype; + struct hw_addr_data hw_address; + struct hw_addr_data permanent_hw_address; + uint32_t master; + uint32_t mtu; + uint32_t min_mtu; + uint32_t max_mtu; + uint32_t tx_queues; + uint32_t rx_queues; + uint8_t addr_gen_mode; + char *qdisc; + char **alternative_names; + + union { + struct rtnl_link_stats64 stats64; + struct rtnl_link_stats stats; + }; + + uint64_t tx_bitrate; + uint64_t rx_bitrate; + + /* bridge info */ + uint32_t forward_delay; + uint32_t hello_time; + uint32_t max_age; + uint32_t ageing_time; + uint32_t stp_state; + uint32_t cost; + uint16_t priority; + uint8_t mcast_igmp_version; + uint8_t port_state; + + /* vxlan info */ + VxLanInfo vxlan_info; + + /* vlan info */ + uint16_t vlan_id; + + /* tunnel info */ + uint8_t ttl; + uint8_t tos; + uint8_t inherit; + uint8_t df; + uint8_t csum; + uint8_t csum6_tx; + uint8_t csum6_rx; + uint16_t tunnel_port; + uint32_t vni; + uint32_t label; + union in_addr_union local; + union in_addr_union remote; + + /* bonding info */ + uint8_t mode; + uint32_t miimon; + uint32_t updelay; + uint32_t downdelay; + + /* macvlan and macvtap info */ + uint32_t macvlan_mode; + + /* ipvlan info */ + 
uint16_t ipvlan_mode; + uint16_t ipvlan_flags; + + /* ethtool info */ + int autonegotiation; + uint64_t speed; + Duplex duplex; + NetDevPort port; + + /* wlan info */ + enum nl80211_iftype wlan_iftype; + char *ssid; + struct ether_addr bssid; + + bool has_hw_address:1; + bool has_permanent_hw_address:1; + bool has_tx_queues:1; + bool has_rx_queues:1; + bool has_stats64:1; + bool has_stats:1; + bool has_bitrates:1; + bool has_ethtool_link_info:1; + bool has_wlan_link_info:1; + bool has_tunnel_ipv4:1; + bool has_ipv6_address_generation_mode:1; + + bool needs_freeing:1; +} LinkInfo; + +static int link_info_compare(const LinkInfo *a, const LinkInfo *b) { + return CMP(a->ifindex, b->ifindex); +} + +static LinkInfo* link_info_array_free(LinkInfo *array) { + for (unsigned i = 0; array && array[i].needs_freeing; i++) { + sd_device_unref(array[i].sd_device); + free(array[i].netdev_kind); + free(array[i].ssid); + free(array[i].qdisc); + strv_free(array[i].alternative_names); + } + + return mfree(array); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(LinkInfo*, link_info_array_free); + +static int decode_netdev(sd_netlink_message *m, LinkInfo *info) { + int r; + + assert(m); + assert(info); + + r = sd_netlink_message_enter_container(m, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_read_string_strdup(m, IFLA_INFO_KIND, &info->netdev_kind); + if (r < 0) { + (void) sd_netlink_message_exit_container(m); + return r; + } + + r = sd_netlink_message_enter_container(m, IFLA_INFO_DATA); + if (r < 0) + return r; + + if (streq(info->netdev_kind, "bridge")) { + (void) sd_netlink_message_read_u32(m, IFLA_BR_FORWARD_DELAY, &info->forward_delay); + (void) sd_netlink_message_read_u32(m, IFLA_BR_HELLO_TIME, &info->hello_time); + (void) sd_netlink_message_read_u32(m, IFLA_BR_MAX_AGE, &info->max_age); + (void) sd_netlink_message_read_u32(m, IFLA_BR_AGEING_TIME, &info->ageing_time); + (void) sd_netlink_message_read_u32(m, IFLA_BR_STP_STATE, &info->stp_state); + (void) 
sd_netlink_message_read_u32(m, IFLA_BRPORT_COST, &info->cost); + (void) sd_netlink_message_read_u16(m, IFLA_BR_PRIORITY, &info->priority); + (void) sd_netlink_message_read_u8(m, IFLA_BR_MCAST_IGMP_VERSION, &info->mcast_igmp_version); + (void) sd_netlink_message_read_u8(m, IFLA_BRPORT_STATE, &info->port_state); + } if (streq(info->netdev_kind, "bond")) { + (void) sd_netlink_message_read_u8(m, IFLA_BOND_MODE, &info->mode); + (void) sd_netlink_message_read_u32(m, IFLA_BOND_MIIMON, &info->miimon); + (void) sd_netlink_message_read_u32(m, IFLA_BOND_DOWNDELAY, &info->downdelay); + (void) sd_netlink_message_read_u32(m, IFLA_BOND_UPDELAY, &info->updelay); + } else if (streq(info->netdev_kind, "vxlan")) { + (void) sd_netlink_message_read_u32(m, IFLA_VXLAN_ID, &info->vxlan_info.vni); + + r = sd_netlink_message_read_in_addr(m, IFLA_VXLAN_GROUP, &info->vxlan_info.group.in); + if (r >= 0) + info->vxlan_info.group_family = AF_INET; + else { + r = sd_netlink_message_read_in6_addr(m, IFLA_VXLAN_GROUP6, &info->vxlan_info.group.in6); + if (r >= 0) + info->vxlan_info.group_family = AF_INET6; + } + + r = sd_netlink_message_read_in_addr(m, IFLA_VXLAN_LOCAL, &info->vxlan_info.local.in); + if (r >= 0) + info->vxlan_info.local_family = AF_INET; + else { + r = sd_netlink_message_read_in6_addr(m, IFLA_VXLAN_LOCAL6, &info->vxlan_info.local.in6); + if (r >= 0) + info->vxlan_info.local_family = AF_INET6; + } + + (void) sd_netlink_message_read_u32(m, IFLA_VXLAN_LINK, &info->vxlan_info.link); + (void) sd_netlink_message_read_u16(m, IFLA_VXLAN_PORT, &info->vxlan_info.dest_port); + (void) sd_netlink_message_read_u8(m, IFLA_VXLAN_PROXY, &info->vxlan_info.proxy); + (void) sd_netlink_message_read_u8(m, IFLA_VXLAN_LEARNING, &info->vxlan_info.learning); + (void) sd_netlink_message_read_u8(m, IFLA_VXLAN_RSC, &info->vxlan_info.rsc); + (void) sd_netlink_message_read_u8(m, IFLA_VXLAN_L3MISS, &info->vxlan_info.l3miss); + (void) sd_netlink_message_read_u8(m, IFLA_VXLAN_L2MISS, &info->vxlan_info.l2miss); + 
(void) sd_netlink_message_read_u8(m, IFLA_VXLAN_TOS, &info->vxlan_info.tos); + (void) sd_netlink_message_read_u8(m, IFLA_VXLAN_TTL, &info->vxlan_info.ttl); + } else if (streq(info->netdev_kind, "vlan")) + (void) sd_netlink_message_read_u16(m, IFLA_VLAN_ID, &info->vlan_id); + else if (STR_IN_SET(info->netdev_kind, "ipip", "sit")) { + (void) sd_netlink_message_read_in_addr(m, IFLA_IPTUN_LOCAL, &info->local.in); + (void) sd_netlink_message_read_in_addr(m, IFLA_IPTUN_REMOTE, &info->remote.in); + } else if (streq(info->netdev_kind, "geneve")) { + (void) sd_netlink_message_read_u32(m, IFLA_GENEVE_ID, &info->vni); + + r = sd_netlink_message_read_in_addr(m, IFLA_GENEVE_REMOTE, &info->remote.in); + if (r >= 0) + info->has_tunnel_ipv4 = true; + else + (void) sd_netlink_message_read_in6_addr(m, IFLA_GENEVE_REMOTE6, &info->remote.in6); + + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_TTL, &info->ttl); + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_TTL_INHERIT, &info->inherit); + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_TOS, &info->tos); + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_DF, &info->df); + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_UDP_CSUM, &info->csum); + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, &info->csum6_tx); + (void) sd_netlink_message_read_u8(m, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, &info->csum6_rx); + (void) sd_netlink_message_read_u16(m, IFLA_GENEVE_PORT, &info->tunnel_port); + (void) sd_netlink_message_read_u32(m, IFLA_GENEVE_LABEL, &info->label); + } else if (STR_IN_SET(info->netdev_kind, "gre", "gretap", "erspan")) { + (void) sd_netlink_message_read_in_addr(m, IFLA_GRE_LOCAL, &info->local.in); + (void) sd_netlink_message_read_in_addr(m, IFLA_GRE_REMOTE, &info->remote.in); + } else if (STR_IN_SET(info->netdev_kind, "ip6gre", "ip6gretap", "ip6erspan")) { + (void) sd_netlink_message_read_in6_addr(m, IFLA_GRE_LOCAL, &info->local.in6); + (void) sd_netlink_message_read_in6_addr(m, IFLA_GRE_REMOTE, 
&info->remote.in6); + } else if (streq(info->netdev_kind, "vti")) { + (void) sd_netlink_message_read_in_addr(m, IFLA_VTI_LOCAL, &info->local.in); + (void) sd_netlink_message_read_in_addr(m, IFLA_VTI_REMOTE, &info->remote.in); + } else if (streq(info->netdev_kind, "vti6")) { + (void) sd_netlink_message_read_in6_addr(m, IFLA_VTI_LOCAL, &info->local.in6); + (void) sd_netlink_message_read_in6_addr(m, IFLA_VTI_REMOTE, &info->remote.in6); + } else if (STR_IN_SET(info->netdev_kind, "macvlan", "macvtap")) + (void) sd_netlink_message_read_u32(m, IFLA_MACVLAN_MODE, &info->macvlan_mode); + else if (streq(info->netdev_kind, "ipvlan")) { + (void) sd_netlink_message_read_u16(m, IFLA_IPVLAN_MODE, &info->ipvlan_mode); + (void) sd_netlink_message_read_u16(m, IFLA_IPVLAN_FLAGS, &info->ipvlan_flags); + } + + (void) sd_netlink_message_exit_container(m); + (void) sd_netlink_message_exit_container(m); + + return 0; +} + +static int decode_link( + sd_netlink_message *m, + LinkInfo *info, + char * const *patterns, + bool matched_patterns[]) { + + _cleanup_strv_free_ char **altnames = NULL; + const char *name, *qdisc; + int ifindex, r; + uint16_t type; + + assert(m); + assert(info); + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) + return r; + + if (type != RTM_NEWLINK) + return 0; + + r = sd_rtnl_message_link_get_ifindex(m, &ifindex); + if (r < 0) + return r; + + r = sd_netlink_message_read_string(m, IFLA_IFNAME, &name); + if (r < 0) + return r; + + r = sd_netlink_message_read_strv(m, IFLA_PROP_LIST, IFLA_ALT_IFNAME, &altnames); + if (r < 0 && r != -ENODATA) + return r; + + if (patterns) { + char str[DECIMAL_STR_MAX(int)]; + size_t pos; + + assert(matched_patterns); + + xsprintf(str, "%i", ifindex); + if (!strv_fnmatch_full(patterns, str, 0, &pos) && + !strv_fnmatch_full(patterns, name, 0, &pos)) { + bool match = false; + + STRV_FOREACH(p, altnames) + if (strv_fnmatch_full(patterns, *p, 0, &pos)) { + match = true; + break; + } + if (!match) + return 0; + } + + 
matched_patterns[pos] = true; + } + + r = sd_rtnl_message_link_get_type(m, &info->iftype); + if (r < 0) + return r; + + strscpy(info->name, sizeof info->name, name); + info->ifindex = ifindex; + info->alternative_names = TAKE_PTR(altnames); + + info->has_hw_address = + netlink_message_read_hw_addr(m, IFLA_ADDRESS, &info->hw_address) >= 0 && + info->hw_address.length > 0; + + info->has_permanent_hw_address = + (netlink_message_read_hw_addr(m, IFLA_PERM_ADDRESS, &info->permanent_hw_address) >= 0 || + ethtool_get_permanent_hw_addr(NULL, info->name, &info->permanent_hw_address) >= 0) && + !hw_addr_is_null(&info->permanent_hw_address) && + !hw_addr_equal(&info->permanent_hw_address, &info->hw_address); + + (void) sd_netlink_message_read_u32(m, IFLA_MTU, &info->mtu); + (void) sd_netlink_message_read_u32(m, IFLA_MIN_MTU, &info->min_mtu); + (void) sd_netlink_message_read_u32(m, IFLA_MAX_MTU, &info->max_mtu); + + info->has_rx_queues = + sd_netlink_message_read_u32(m, IFLA_NUM_RX_QUEUES, &info->rx_queues) >= 0 && + info->rx_queues > 0; + + info->has_tx_queues = + sd_netlink_message_read_u32(m, IFLA_NUM_TX_QUEUES, &info->tx_queues) >= 0 && + info->tx_queues > 0; + + if (sd_netlink_message_read(m, IFLA_STATS64, sizeof info->stats64, &info->stats64) >= 0) + info->has_stats64 = true; + else if (sd_netlink_message_read(m, IFLA_STATS, sizeof info->stats, &info->stats) >= 0) + info->has_stats = true; + + r = sd_netlink_message_read_string(m, IFLA_QDISC, &qdisc); + if (r >= 0) { + info->qdisc = strdup(qdisc); + if (!info->qdisc) + return log_oom(); + } + + (void) sd_netlink_message_read_u32(m, IFLA_MASTER, &info->master); + + r = sd_netlink_message_enter_container(m, IFLA_AF_SPEC); + if (r >= 0) { + r = sd_netlink_message_enter_container(m, AF_INET6); + if (r >= 0) { + r = sd_netlink_message_read_u8(m, IFLA_INET6_ADDR_GEN_MODE, &info->addr_gen_mode); + if (r >= 0 && IN_SET(info->addr_gen_mode, + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + 
IN6_ADDR_GEN_MODE_RANDOM)) + info->has_ipv6_address_generation_mode = true; + + (void) sd_netlink_message_exit_container(m); + } + (void) sd_netlink_message_exit_container(m); + } + + /* fill kind info */ + (void) decode_netdev(m, info); + + return 1; +} + +static int link_get_property( + sd_bus *bus, + const LinkInfo *link, + sd_bus_error *error, + sd_bus_message **reply, + const char *iface, + const char *propname, + const char *type) { + + _cleanup_free_ char *path = NULL; + char ifindex_str[DECIMAL_STR_MAX(int)]; + int r; + + assert(bus); + assert(link); + assert(link->ifindex >= 0); + assert(error); + assert(reply); + assert(iface); + assert(propname); + assert(type); + + xsprintf(ifindex_str, "%i", link->ifindex); + + r = sd_bus_path_encode("/org/freedesktop/network1/link", ifindex_str, &path); + if (r < 0) + return r; + + return sd_bus_get_property(bus, "org.freedesktop.network1", path, iface, propname, error, reply, type); +} + +static int acquire_link_bitrates(sd_bus *bus, LinkInfo *link) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(link); + + r = link_get_property(bus, link, &error, &reply, "org.freedesktop.network1.Link", "BitRates", "(tt)"); + if (r < 0) { + bool quiet = sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, + BUS_ERROR_SPEED_METER_INACTIVE); + + return log_full_errno(quiet ? 
LOG_DEBUG : LOG_WARNING, + r, "Failed to query link bit rates: %s", bus_error_message(&error, r)); + } + + r = sd_bus_message_read(reply, "(tt)", &link->tx_bitrate, &link->rx_bitrate); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + link->has_bitrates = link->tx_bitrate != UINT64_MAX && link->rx_bitrate != UINT64_MAX; + + return 0; +} + +static void acquire_ether_link_info(int *fd, LinkInfo *link) { + assert(fd); + assert(link); + + if (ethtool_get_link_info(fd, + link->name, + &link->autonegotiation, + &link->speed, + &link->duplex, + &link->port) >= 0) + link->has_ethtool_link_info = true; +} + +static void acquire_wlan_link_info(LinkInfo *link) { + _cleanup_(sd_netlink_unrefp) sd_netlink *genl = NULL; + const char *type = NULL; + int r, k = 0; + + assert(link); + + if (link->sd_device) + (void) sd_device_get_devtype(link->sd_device, &type); + if (!streq_ptr(type, "wlan")) + return; + + r = sd_genl_socket_open(&genl); + if (r < 0) { + log_debug_errno(r, "Failed to open generic netlink socket: %m"); + return; + } + + (void) sd_netlink_increase_rxbuf(genl, RCVBUF_SIZE); + + r = wifi_get_interface(genl, link->ifindex, &link->wlan_iftype, &link->ssid); + if (r < 0) + log_debug_errno(r, "%s: failed to query ssid: %m", link->name); + + if (link->wlan_iftype == NL80211_IFTYPE_STATION) { + k = wifi_get_station(genl, link->ifindex, &link->bssid); + if (k < 0) + log_debug_errno(k, "%s: failed to query bssid: %m", link->name); + } + + link->has_wlan_link_info = r > 0 || k > 0; +} + +static int acquire_link_info(sd_bus *bus, sd_netlink *rtnl, char * const *patterns, LinkInfo **ret) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + _cleanup_(link_info_array_freep) LinkInfo *links = NULL; + _cleanup_free_ bool *matched_patterns = NULL; + _cleanup_close_ int fd = -EBADF; + size_t c = 0; + int r; + + assert(rtnl); + assert(ret); + + r = 
sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, 0); + if (r < 0) + return rtnl_log_create_error(r); + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return rtnl_log_create_error(r); + + r = sd_netlink_call(rtnl, req, 0, &reply); + if (r < 0) + return log_error_errno(r, "Failed to enumerate links: %m"); + + if (patterns) { + matched_patterns = new0(bool, strv_length(patterns)); + if (!matched_patterns) + return log_oom(); + } + + for (sd_netlink_message *i = reply; i; i = sd_netlink_message_next(i)) { + if (!GREEDY_REALLOC0(links, c + 2)) /* We keep one trailing one as marker */ + return -ENOMEM; + + r = decode_link(i, links + c, patterns, matched_patterns); + if (r < 0) + return r; + if (r == 0) + continue; + + links[c].needs_freeing = true; + + (void) sd_device_new_from_ifindex(&links[c].sd_device, links[c].ifindex); + + acquire_ether_link_info(&fd, &links[c]); + acquire_wlan_link_info(&links[c]); + + c++; + } + + /* Look if we matched all our arguments that are not globs. It + * is OK for a glob to match nothing, but not for an exact argument. 
*/ + for (size_t pos = 0; pos < strv_length(patterns); pos++) { + if (matched_patterns[pos]) + continue; + + if (string_is_glob(patterns[pos])) + log_debug("Pattern \"%s\" doesn't match any interface, ignoring.", + patterns[pos]); + else + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), + "Interface \"%s\" not found.", patterns[pos]); + } + + typesafe_qsort(links, c, link_info_compare); + + if (bus) + FOREACH_ARRAY(link, links, c) + (void) acquire_link_bitrates(bus, link); + + *ret = TAKE_PTR(links); + + if (patterns && c == 0) + log_warning("No interfaces matched."); + + return (int) c; +} + +static int list_links(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(link_info_array_freep) LinkInfo *links = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + TableCell *cell; + int c, r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + if (arg_json_format_flags != JSON_FORMAT_OFF) { + if (arg_all || argc <= 1) + return dump_manager_description(bus); + else + return dump_link_description(bus, strv_skip(argv, 1)); + } + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + c = acquire_link_info(NULL, rtnl, argc > 1 ? 
argv + 1 : NULL, &links); + if (c < 0) + return c; + + pager_open(arg_pager_flags); + + table = table_new("idx", "link", "type", "operational", "setup"); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + table_set_header(table, arg_legend); + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + assert_se(cell = table_get_cell(table, 0, 0)); + (void) table_set_minimum_width(table, cell, 3); + (void) table_set_weight(table, cell, 0); + (void) table_set_ellipsize_percent(table, cell, 100); + (void) table_set_align_percent(table, cell, 100); + + assert_se(cell = table_get_cell(table, 0, 1)); + (void) table_set_ellipsize_percent(table, cell, 100); + + FOREACH_ARRAY(link, links, c) { + _cleanup_free_ char *setup_state = NULL, *operational_state = NULL; + _cleanup_free_ char *t = NULL; + const char *on_color_operational, *on_color_setup; + + (void) sd_network_link_get_operational_state(link->ifindex, &operational_state); + operational_state_to_color(link->name, operational_state, &on_color_operational, NULL); + + (void) sd_network_link_get_setup_state(link->ifindex, &setup_state); + setup_state_to_color(setup_state, &on_color_setup, NULL); + + r = net_get_type_string(link->sd_device, link->iftype, &t); + if (r == -ENOMEM) + return log_oom(); + + r = table_add_many(table, + TABLE_INT, link->ifindex, + TABLE_STRING, link->name, + TABLE_STRING, t, + TABLE_STRING, operational_state, + TABLE_SET_COLOR, on_color_operational, + TABLE_STRING, setup_state ?: "unmanaged", + TABLE_SET_COLOR, on_color_setup); + if (r < 0) + return table_log_add_error(r); + } + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (arg_legend) + printf("\n%i links listed.\n", c); + + return 0; +} + +/* IEEE Organizationally Unique Identifier vendor string */ +static int ieee_oui(sd_hwdb *hwdb, const struct ether_addr *mac, char **ret) { + _cleanup_free_ char *desc = NULL; + const char *description; + char 
modalias[STRLEN("OUI:XXYYXXYYXXYY") + 1]; + int r; + + assert(ret); + + if (!hwdb || !mac) + return -EINVAL; + + /* skip commonly misused 00:00:00 (Xerox) prefix */ + if (memcmp(mac, "\0\0\0", 3) == 0) + return -EINVAL; + + xsprintf(modalias, "OUI:" ETHER_ADDR_FORMAT_STR, ETHER_ADDR_FORMAT_VAL(*mac)); + + r = sd_hwdb_get(hwdb, modalias, "ID_OUI_FROM_DATABASE", &description); + if (r < 0) + return r; + + desc = strdup(description); + if (!desc) + return -ENOMEM; + + *ret = TAKE_PTR(desc); + + return 0; +} + +static int get_gateway_description( + sd_netlink *rtnl, + sd_hwdb *hwdb, + int ifindex, + int family, + union in_addr_union *gateway, + char **ret) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + int r; + + assert(rtnl); + assert(ifindex >= 0); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(gateway); + assert(ret); + + r = sd_rtnl_message_new_neigh(rtnl, &req, RTM_GETNEIGH, ifindex, family); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, req, 0, &reply); + if (r < 0) + return r; + + for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) { + union in_addr_union gw = IN_ADDR_NULL; + struct ether_addr mac = ETHER_ADDR_NULL; + uint16_t type; + int ifi, fam; + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_error_errno(r, "Failed to get netlink message, ignoring: %m"); + continue; + } + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) { + log_error_errno(r, "Failed to get netlink message type, ignoring: %m"); + continue; + } + + if (type != RTM_NEWNEIGH) { + log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Got unexpected netlink message type %u, ignoring", + type); + continue; + } + + r = sd_rtnl_message_neigh_get_family(m, &fam); + if (r < 0) { + log_error_errno(r, "Failed to get rtnl family, ignoring: %m"); + continue; + } + + if (fam != family) { + log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got 
invalid rtnl family %d, ignoring", fam); + continue; + } + + r = sd_rtnl_message_neigh_get_ifindex(m, &ifi); + if (r < 0) { + log_error_errno(r, "Failed to get rtnl ifindex, ignoring: %m"); + continue; + } + + if (ifindex > 0 && ifi != ifindex) + continue; + + switch (fam) { + + case AF_INET: + r = sd_netlink_message_read_in_addr(m, NDA_DST, &gw.in); + if (r < 0) + continue; + + break; + + case AF_INET6: + r = sd_netlink_message_read_in6_addr(m, NDA_DST, &gw.in6); + if (r < 0) + continue; + + break; + + default: + continue; + } + + if (!in_addr_equal(fam, &gw, gateway)) + continue; + + r = sd_netlink_message_read(m, NDA_LLADDR, sizeof(mac), &mac); + if (r < 0) + continue; + + r = ieee_oui(hwdb, &mac, ret); + if (r < 0) + continue; + + return 0; + } + + return -ENODATA; +} + +static int dump_list(Table *table, const char *key, char * const *l) { + int r; + + assert(table); + assert(key); + + if (strv_isempty(l)) + return 0; + + r = table_add_many(table, + TABLE_FIELD, key, + TABLE_STRV, l); + if (r < 0) + return table_log_add_error(r); + + return 0; +} + +static int dump_gateways(sd_netlink *rtnl, sd_hwdb *hwdb, Table *table, int ifindex) { + _cleanup_free_ struct local_address *local_addrs = NULL; + _cleanup_strv_free_ char **buf = NULL; + int r, n; + + assert(rtnl); + assert(table); + + n = local_gateways(rtnl, ifindex, AF_UNSPEC, &local_addrs); + if (n <= 0) + return n; + + FOREACH_ARRAY(local, local_addrs, n) { + _cleanup_free_ char *description = NULL; + + r = get_gateway_description(rtnl, hwdb, local->ifindex, local->family, &local->address, &description); + if (r < 0) + log_debug_errno(r, "Could not get description of gateway, ignoring: %m"); + + /* Show interface name for the entry if we show entries for all interfaces */ + r = strv_extendf(&buf, "%s%s%s%s%s%s", + IN_ADDR_TO_STRING(local->family, &local->address), + description ? " (" : "", + strempty(description), + description ? ")" : "", + ifindex <= 0 ? " on " : "", + ifindex <= 0 ? 
FORMAT_IFNAME_FULL(local->ifindex, FORMAT_IFNAME_IFINDEX_WITH_PERCENT) : ""); + if (r < 0) + return log_oom(); + } + + return dump_list(table, "Gateway", buf); +} + +static int dump_addresses( + sd_netlink *rtnl, + sd_dhcp_lease *lease, + Table *table, + int ifindex) { + + _cleanup_free_ struct local_address *local_addrs = NULL; + _cleanup_strv_free_ char **buf = NULL; + struct in_addr dhcp4_address = {}; + int r, n; + + assert(rtnl); + assert(table); + + n = local_addresses(rtnl, ifindex, AF_UNSPEC, &local_addrs); + if (n <= 0) + return n; + + if (lease) + (void) sd_dhcp_lease_get_address(lease, &dhcp4_address); + + FOREACH_ARRAY(local, local_addrs, n) { + struct in_addr server_address; + bool dhcp4 = false; + + if (local->family == AF_INET && in4_addr_equal(&local->address.in, &dhcp4_address)) + dhcp4 = sd_dhcp_lease_get_server_identifier(lease, &server_address) >= 0; + + r = strv_extendf(&buf, "%s%s%s%s%s%s", + IN_ADDR_TO_STRING(local->family, &local->address), + dhcp4 ? " (DHCP4 via " : "", + dhcp4 ? IN4_ADDR_TO_STRING(&server_address) : "", + dhcp4 ? ")" : "", + ifindex <= 0 ? " on " : "", + ifindex <= 0 ? 
FORMAT_IFNAME_FULL(local->ifindex, FORMAT_IFNAME_IFINDEX_WITH_PERCENT) : ""); + if (r < 0) + return log_oom(); + } + + return dump_list(table, "Address", buf); +} + +static int dump_address_labels(sd_netlink *rtnl) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + TableCell *cell; + int r; + + assert(rtnl); + + r = sd_rtnl_message_new_addrlabel(rtnl, &req, RTM_GETADDRLABEL, 0, AF_INET6); + if (r < 0) + return log_error_errno(r, "Could not allocate RTM_GETADDRLABEL message: %m"); + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, req, 0, &reply); + if (r < 0) + return r; + + table = table_new("label", "prefix/prefixlen"); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + r = table_set_sort(table, (size_t) 0); + if (r < 0) + return r; + + assert_se(cell = table_get_cell(table, 0, 0)); + (void) table_set_align_percent(table, cell, 100); + (void) table_set_ellipsize_percent(table, cell, 100); + + assert_se(cell = table_get_cell(table, 0, 1)); + (void) table_set_align_percent(table, cell, 100); + + for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) { + struct in6_addr prefix; + uint8_t prefixlen; + uint32_t label; + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_error_errno(r, "Failed to get netlink message, ignoring: %m"); + continue; + } + + r = sd_netlink_message_read_u32(m, IFAL_LABEL, &label); + if (r < 0 && r != -ENODATA) { + log_error_errno(r, "Could not read IFAL_LABEL, ignoring: %m"); + continue; + } + + r = sd_netlink_message_read_in6_addr(m, IFAL_ADDRESS, &prefix); + if (r < 0) + continue; + + r = sd_rtnl_message_addrlabel_get_prefixlen(m, &prefixlen); + if (r < 0) + continue; + + r = table_add_cell(table, NULL, TABLE_UINT32, &label); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s/%u", 
IN6_ADDR_TO_STRING(&prefix), prefixlen); + if (r < 0) + return table_log_add_error(r); + } + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +static int list_address_labels(int argc, char *argv[], void *userdata) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + return dump_address_labels(rtnl); +} + +static int open_lldp_neighbors(int ifindex, FILE **ret) { + _cleanup_fclose_ FILE *f = NULL; + char p[STRLEN("/run/systemd/netif/lldp/") + DECIMAL_STR_MAX(int)]; + + assert(ifindex >= 0); + assert(ret); + + xsprintf(p, "/run/systemd/netif/lldp/%i", ifindex); + + f = fopen(p, "re"); + if (!f) + return -errno; + + *ret = TAKE_PTR(f); + return 0; +} + +static int next_lldp_neighbor(FILE *f, sd_lldp_neighbor **ret) { + _cleanup_free_ void *raw = NULL; + size_t l; + le64_t u; + int r; + + assert(f); + assert(ret); + + l = fread(&u, 1, sizeof(u), f); + if (l == 0 && feof(f)) + return 0; + if (l != sizeof(u)) + return -EBADMSG; + + /* each LLDP packet is at most MTU size, but let's allow up to 4KiB just in case */ + if (le64toh(u) >= 4096) + return -EBADMSG; + + raw = new(uint8_t, le64toh(u)); + if (!raw) + return -ENOMEM; + + if (fread(raw, 1, le64toh(u), f) != le64toh(u)) + return -EBADMSG; + + r = sd_lldp_neighbor_from_raw(ret, raw, le64toh(u)); + if (r < 0) + return r; + + return 1; +} + +static int dump_lldp_neighbors(Table *table, const char *prefix, int ifindex) { + _cleanup_strv_free_ char **buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(table); + assert(prefix); + assert(ifindex > 0); + + r = open_lldp_neighbors(ifindex, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + for (;;) { + const char *system_name = NULL, *port_id = NULL, *port_description = NULL; + _cleanup_(sd_lldp_neighbor_unrefp) sd_lldp_neighbor *n = NULL; + + r = next_lldp_neighbor(f, 
&n); + if (r < 0) + return r; + if (r == 0) + break; + + (void) sd_lldp_neighbor_get_system_name(n, &system_name); + (void) sd_lldp_neighbor_get_port_id_as_string(n, &port_id); + (void) sd_lldp_neighbor_get_port_description(n, &port_description); + + r = strv_extendf(&buf, "%s on port %s%s%s%s", + strna(system_name), + strna(port_id), + isempty(port_description) ? "" : " (", + strempty(port_description), + isempty(port_description) ? "" : ")"); + if (r < 0) + return log_oom(); + } + + return dump_list(table, prefix, buf); +} + +static int dump_dhcp_leases(Table *table, const char *prefix, sd_bus *bus, const LinkInfo *link) { + _cleanup_strv_free_ char **buf = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(table); + assert(prefix); + assert(bus); + assert(link); + + r = link_get_property(bus, link, &error, &reply, "org.freedesktop.network1.DHCPServer", "Leases", "a(uayayayayt)"); + if (r < 0) { + bool quiet = sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY); + + log_full_errno(quiet ? 
LOG_DEBUG : LOG_WARNING, + r, "Failed to query link DHCP leases: %s", bus_error_message(&error, r)); + return 0; + } + + r = sd_bus_message_enter_container(reply, 'a', "(uayayayayt)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_enter_container(reply, 'r', "uayayayayt")) > 0) { + _cleanup_free_ char *id = NULL, *ip = NULL; + const void *client_id, *addr, *gtw, *hwaddr; + size_t client_id_sz, sz; + uint64_t expiration; + uint32_t family; + + r = sd_bus_message_read(reply, "u", &family); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &client_id, &client_id_sz); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &addr, &sz); + if (r < 0 || sz != 4) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', >w, &sz); + if (r < 0 || sz != 4) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &hwaddr, &sz); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_basic(reply, 't', &expiration); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_dhcp_client_id_to_string(client_id, client_id_sz, &id); + if (r < 0) + return bus_log_parse_error(r); + + r = in_addr_to_string(family, addr, &ip); + if (r < 0) + return bus_log_parse_error(r); + + r = strv_extendf(&buf, "%s (to %s)", ip, id); + if (r < 0) + return log_oom(); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (strv_isempty(buf)) { + r = strv_extendf(&buf, "none"); + if (r < 0) + return log_oom(); + } + + return dump_list(table, prefix, buf); +} + +static int dump_ifindexes(Table *table, const char *prefix, const int *ifindexes) { + int r; + + 
assert(table); + assert(prefix); + + if (!ifindexes) + return 0; + + for (unsigned c = 0; ifindexes[c] > 0; c++) { + if (c == 0) + r = table_add_cell(table, NULL, TABLE_FIELD, prefix); + else + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_IFINDEX, &ifindexes[c]); + if (r < 0) + return table_log_add_error(r); + } + + return 0; +} + +#define DUMP_STATS_ONE(name, val_name) \ + ({ \ + r = table_add_cell(table, NULL, TABLE_FIELD, name); \ + if (r < 0) \ + return table_log_add_error(r); \ + r = table_add_cell(table, NULL, \ + info->has_stats64 ? TABLE_UINT64 : TABLE_UINT32, \ + info->has_stats64 ? (void*) &info->stats64.val_name : (void*) &info->stats.val_name); \ + if (r < 0) \ + return table_log_add_error(r); \ + }) + +static int dump_statistics(Table *table, const LinkInfo *info) { + int r; + + assert(table); + assert(info); + + if (!arg_stats) + return 0; + + if (!info->has_stats64 && !info->has_stats) + return 0; + + DUMP_STATS_ONE("Rx Packets", rx_packets); + DUMP_STATS_ONE("Tx Packets", tx_packets); + DUMP_STATS_ONE("Rx Bytes", rx_bytes); + DUMP_STATS_ONE("Tx Bytes", tx_bytes); + DUMP_STATS_ONE("Rx Errors", rx_errors); + DUMP_STATS_ONE("Tx Errors", tx_errors); + DUMP_STATS_ONE("Rx Dropped", rx_dropped); + DUMP_STATS_ONE("Tx Dropped", tx_dropped); + DUMP_STATS_ONE("Multicast Packets", multicast); + DUMP_STATS_ONE("Collisions", collisions); + + return 0; +} + +static int dump_hw_address(Table *table, sd_hwdb *hwdb, const char *field, const struct hw_addr_data *addr) { + _cleanup_free_ char *description = NULL; + int r; + + assert(table); + assert(field); + assert(addr); + + if (addr->length == ETH_ALEN) + (void) ieee_oui(hwdb, &addr->ether, &description); + + r = table_add_cell(table, NULL, TABLE_FIELD, field); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s%s%s%s", + HW_ADDR_TO_STR(addr), + description ? 
" (" : "", + strempty(description), + description ? ")" : ""); + if (r < 0) + return table_log_add_error(r); + + return 0; +} + +static OutputFlags get_output_flags(void) { + return + arg_all * OUTPUT_SHOW_ALL | + (arg_full || !on_tty() || pager_have()) * OUTPUT_FULL_WIDTH | + colors_enabled() * OUTPUT_COLOR; +} + +static int show_logs(const LinkInfo *info) { + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + if (arg_lines == 0) + return 0; + + r = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); + if (r < 0) + return log_error_errno(r, "Failed to open journal: %m"); + + r = add_match_this_boot(j, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add boot matches: %m"); + + if (info) { + char m1[STRLEN("_KERNEL_DEVICE=n") + DECIMAL_STR_MAX(int)]; + const char *m2, *m3; + + /* kernel */ + xsprintf(m1, "_KERNEL_DEVICE=n%i", info->ifindex); + /* networkd */ + m2 = strjoina("INTERFACE=", info->name); + /* udevd */ + m3 = strjoina("DEVICE=", info->name); + + (void)( + (r = sd_journal_add_match(j, m1, 0)) || + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m2, 0)) || + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m3, 0)) + ); + if (r < 0) + return log_error_errno(r, "Failed to add link matches: %m"); + } else { + r = add_matches_for_unit(j, "systemd-networkd.service"); + if (r < 0) + return log_error_errno(r, "Failed to add unit matches: %m"); + + r = add_matches_for_unit(j, "systemd-networkd-wait-online.service"); + if (r < 0) + return log_error_errno(r, "Failed to add unit matches: %m"); + } + + return show_journal( + stdout, + j, + OUTPUT_SHORT, + 0, + 0, + arg_lines, + get_output_flags() | OUTPUT_BEGIN_NEWLINE, + NULL); +} + +static int table_add_string_line(Table *table, const char *key, const char *value) { + int r; + + assert(table); + assert(key); + + if (isempty(value)) + return 0; + + r = table_add_many(table, + TABLE_FIELD, key, + TABLE_STRING, value); + if (r < 0) + return 
table_log_add_error(r); + + return 0; +} + +static int format_dropins(char **dropins) { + STRV_FOREACH(d, dropins) { + _cleanup_free_ char *s = NULL; + int glyph = *(d + 1) == NULL ? SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH; + + s = strjoin(special_glyph(glyph), *d); + if (!s) + return log_oom(); + + free_and_replace(*d, s); + } + + return 0; +} + +static int link_status_one( + sd_bus *bus, + sd_netlink *rtnl, + sd_hwdb *hwdb, + const LinkInfo *info) { + + _cleanup_strv_free_ char **dns = NULL, **ntp = NULL, **sip = NULL, **search_domains = NULL, + **route_domains = NULL, **link_dropins = NULL, **network_dropins = NULL; + _cleanup_free_ char *t = NULL, *network = NULL, *iaid = NULL, *duid = NULL, *captive_portal = NULL, + *setup_state = NULL, *operational_state = NULL, *online_state = NULL, *activation_policy = NULL; + const char *driver = NULL, *path = NULL, *vendor = NULL, *model = NULL, *link = NULL, + *on_color_operational, *off_color_operational, *on_color_setup, *off_color_setup, *on_color_online; + _cleanup_free_ int *carrier_bound_to = NULL, *carrier_bound_by = NULL; + _cleanup_(sd_dhcp_lease_unrefp) sd_dhcp_lease *lease = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(bus); + assert(rtnl); + assert(info); + + (void) sd_network_link_get_operational_state(info->ifindex, &operational_state); + operational_state_to_color(info->name, operational_state, &on_color_operational, &off_color_operational); + + (void) sd_network_link_get_online_state(info->ifindex, &online_state); + online_state_to_color(online_state, &on_color_online, NULL); + + (void) sd_network_link_get_setup_state(info->ifindex, &setup_state); + setup_state_to_color(setup_state, &on_color_setup, &off_color_setup); + + (void) sd_network_link_get_dns(info->ifindex, &dns); + (void) sd_network_link_get_search_domains(info->ifindex, &search_domains); + (void) sd_network_link_get_route_domains(info->ifindex, &route_domains); + (void) 
sd_network_link_get_ntp(info->ifindex, &ntp); + (void) sd_network_link_get_sip(info->ifindex, &sip); + (void) sd_network_link_get_captive_portal(info->ifindex, &captive_portal); + (void) sd_network_link_get_network_file(info->ifindex, &network); + (void) sd_network_link_get_network_file_dropins(info->ifindex, &network_dropins); + (void) sd_network_link_get_carrier_bound_to(info->ifindex, &carrier_bound_to); + (void) sd_network_link_get_carrier_bound_by(info->ifindex, &carrier_bound_by); + (void) sd_network_link_get_activation_policy(info->ifindex, &activation_policy); + + if (info->sd_device) { + const char *joined; + + (void) sd_device_get_property_value(info->sd_device, "ID_NET_LINK_FILE", &link); + + if (sd_device_get_property_value(info->sd_device, "ID_NET_LINK_FILE_DROPINS", &joined) >= 0) { + r = strv_split_full(&link_dropins, joined, ":", EXTRACT_CUNESCAPE); + if (r < 0) + return r; + } + + (void) sd_device_get_property_value(info->sd_device, "ID_NET_DRIVER", &driver); + (void) sd_device_get_property_value(info->sd_device, "ID_PATH", &path); + (void) device_get_vendor_string(info->sd_device, &vendor); + (void) device_get_model_string(info->sd_device, &model); + } + + r = net_get_type_string(info->sd_device, info->iftype, &t); + if (r == -ENOMEM) + return log_oom(); + + char lease_file[STRLEN("/run/systemd/netif/leases/") + DECIMAL_STR_MAX(int)]; + xsprintf(lease_file, "/run/systemd/netif/leases/%i", info->ifindex); + + (void) dhcp_lease_load(&lease, lease_file); + + r = format_dropins(network_dropins); + if (r < 0) + return r; + + if (strv_prepend(&network_dropins, network) < 0) + return log_oom(); + + r = format_dropins(link_dropins); + if (r < 0) + return r; + + if (strv_prepend(&link_dropins, link) < 0) + return log_oom(); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + /* unit files and basic states. 
*/ + r = table_add_many(table, + TABLE_FIELD, "Link File", + TABLE_STRV, link_dropins ?: STRV_MAKE("n/a"), + TABLE_FIELD, "Network File", + TABLE_STRV, network_dropins ?: STRV_MAKE("n/a"), + TABLE_FIELD, "State"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s%s%s (%s%s%s)", + on_color_operational, strna(operational_state), off_color_operational, + on_color_setup, setup_state ?: "unmanaged", off_color_setup); + if (r < 0) + return table_log_add_error(r); + + r = table_add_many(table, + TABLE_FIELD, "Online state", + TABLE_STRING, online_state ?: "unknown", + TABLE_SET_COLOR, on_color_online); + if (r < 0) + return table_log_add_error(r); + + r = table_add_string_line(table, "Type", t); + if (r < 0) + return r; + + r = table_add_string_line(table, "Kind", info->netdev_kind); + if (r < 0) + return r; + + r = table_add_string_line(table, "Path", path); + if (r < 0) + return r; + + r = table_add_string_line(table, "Driver", driver); + if (r < 0) + return r; + + r = table_add_string_line(table, "Vendor", vendor); + if (r < 0) + return r; + + r = table_add_string_line(table, "Model", model); + if (r < 0) + return r; + + strv_sort(info->alternative_names); + r = dump_list(table, "Alternative Names", info->alternative_names); + if (r < 0) + return r; + + if (info->has_hw_address) { + r = dump_hw_address(table, hwdb, "Hardware Address", &info->hw_address); + if (r < 0) + return r; + } + + if (info->has_permanent_hw_address) { + r = dump_hw_address(table, hwdb, "Permanent Hardware Address", &info->permanent_hw_address); + if (r < 0) + return r; + } + + if (info->mtu > 0) { + char min_str[DECIMAL_STR_MAX(uint32_t)], max_str[DECIMAL_STR_MAX(uint32_t)]; + + xsprintf(min_str, "%" PRIu32, info->min_mtu); + xsprintf(max_str, "%" PRIu32, info->max_mtu); + + r = table_add_cell(table, NULL, TABLE_FIELD, "MTU"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%" PRIu32 "%s%s%s%s%s%s%s", + 
info->mtu, + info->min_mtu > 0 || info->max_mtu > 0 ? " (" : "", + info->min_mtu > 0 ? "min: " : "", + info->min_mtu > 0 ? min_str : "", + info->min_mtu > 0 && info->max_mtu > 0 ? ", " : "", + info->max_mtu > 0 ? "max: " : "", + info->max_mtu > 0 ? max_str : "", + info->min_mtu > 0 || info->max_mtu > 0 ? ")" : ""); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_string_line(table, "QDisc", info->qdisc); + if (r < 0) + return r; + + if (info->master > 0) { + r = table_add_many(table, + TABLE_FIELD, "Master", + TABLE_IFINDEX, info->master); + if (r < 0) + return table_log_add_error(r); + } + + if (info->has_ipv6_address_generation_mode) { + static const struct { + const char *mode; + } mode_table[] = { + { "eui64" }, + { "none" }, + { "stable-privacy" }, + { "random" }, + }; + + r = table_add_many(table, + TABLE_FIELD, "IPv6 Address Generation Mode", + TABLE_STRING, mode_table[info->addr_gen_mode]); + if (r < 0) + return table_log_add_error(r); + } + + if (streq_ptr(info->netdev_kind, "bridge")) { + r = table_add_many(table, + TABLE_FIELD, "Forward Delay", + TABLE_TIMESPAN_MSEC, jiffies_to_usec(info->forward_delay), + TABLE_FIELD, "Hello Time", + TABLE_TIMESPAN_MSEC, jiffies_to_usec(info->hello_time), + TABLE_FIELD, "Max Age", + TABLE_TIMESPAN_MSEC, jiffies_to_usec(info->max_age), + TABLE_FIELD, "Ageing Time", + TABLE_TIMESPAN_MSEC, jiffies_to_usec(info->ageing_time), + TABLE_FIELD, "Priority", + TABLE_UINT16, info->priority, + TABLE_FIELD, "STP", + TABLE_BOOLEAN, info->stp_state > 0, + TABLE_FIELD, "Multicast IGMP Version", + TABLE_UINT8, info->mcast_igmp_version, + TABLE_FIELD, "Cost", + TABLE_UINT32, info->cost); + if (r < 0) + return table_log_add_error(r); + + if (info->port_state <= BR_STATE_BLOCKING) { + r = table_add_many(table, + TABLE_FIELD, "Port State", + TABLE_STRING, bridge_state_to_string(info->port_state)); + if (r < 0) + return table_log_add_error(r); + } + + } else if (streq_ptr(info->netdev_kind, "bond")) { + r = 
table_add_many(table, + TABLE_FIELD, "Mode", + TABLE_STRING, bond_mode_to_string(info->mode), + TABLE_FIELD, "Miimon", + TABLE_TIMESPAN_MSEC, info->miimon * USEC_PER_MSEC, + TABLE_FIELD, "Updelay", + TABLE_TIMESPAN_MSEC, info->updelay * USEC_PER_MSEC, + TABLE_FIELD, "Downdelay", + TABLE_TIMESPAN_MSEC, info->downdelay * USEC_PER_MSEC); + if (r < 0) + return table_log_add_error(r); + + } else if (streq_ptr(info->netdev_kind, "vxlan")) { + char ttl[CONST_MAX(STRLEN("auto") + 1, DECIMAL_STR_MAX(uint8_t))]; + + if (info->vxlan_info.vni > 0) { + r = table_add_many(table, + TABLE_FIELD, "VNI", + TABLE_UINT32, info->vxlan_info.vni); + if (r < 0) + return table_log_add_error(r); + } + + if (IN_SET(info->vxlan_info.group_family, AF_INET, AF_INET6)) { + const char *p; + + r = in_addr_is_multicast(info->vxlan_info.group_family, &info->vxlan_info.group); + if (r <= 0) + p = "Remote"; + else + p = "Group"; + + r = table_add_many(table, + TABLE_FIELD, p, + info->vxlan_info.group_family == AF_INET ? TABLE_IN_ADDR : TABLE_IN6_ADDR, &info->vxlan_info.group); + if (r < 0) + return table_log_add_error(r); + } + + if (IN_SET(info->vxlan_info.local_family, AF_INET, AF_INET6)) { + r = table_add_many(table, + TABLE_FIELD, "Local", + info->vxlan_info.local_family == AF_INET ? 
TABLE_IN_ADDR : TABLE_IN6_ADDR, &info->vxlan_info.local); + if (r < 0) + return table_log_add_error(r); + } + + if (info->vxlan_info.dest_port > 0) { + r = table_add_many(table, + TABLE_FIELD, "Destination Port", + TABLE_UINT16, be16toh(info->vxlan_info.dest_port)); + if (r < 0) + return table_log_add_error(r); + } + + if (info->vxlan_info.link > 0) { + r = table_add_many(table, + TABLE_FIELD, "Underlying Device", + TABLE_IFINDEX, info->vxlan_info.link); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "Learning", + TABLE_BOOLEAN, info->vxlan_info.learning, + TABLE_FIELD, "RSC", + TABLE_BOOLEAN, info->vxlan_info.rsc, + TABLE_FIELD, "L3MISS", + TABLE_BOOLEAN, info->vxlan_info.l3miss, + TABLE_FIELD, "L2MISS", + TABLE_BOOLEAN, info->vxlan_info.l2miss); + if (r < 0) + return table_log_add_error(r); + + if (info->vxlan_info.tos > 1) { + r = table_add_many(table, + TABLE_FIELD, "TOS", + TABLE_UINT8, info->vxlan_info.tos); + if (r < 0) + return table_log_add_error(r); + } + + if (info->vxlan_info.ttl > 0) + xsprintf(ttl, "%" PRIu8, info->vxlan_info.ttl); + else + strcpy(ttl, "auto"); + + r = table_add_many(table, + TABLE_FIELD, "TTL", + TABLE_STRING, ttl); + if (r < 0) + return table_log_add_error(r); + + } else if (streq_ptr(info->netdev_kind, "vlan") && info->vlan_id > 0) { + r = table_add_many(table, + TABLE_FIELD, "VLan Id", + TABLE_UINT16, info->vlan_id); + if (r < 0) + return table_log_add_error(r); + + } else if (STRPTR_IN_SET(info->netdev_kind, "ipip", "sit", "gre", "gretap", "erspan", "vti")) { + if (in_addr_is_set(AF_INET, &info->local)) { + r = table_add_many(table, + TABLE_FIELD, "Local", + TABLE_IN_ADDR, &info->local); + if (r < 0) + return table_log_add_error(r); + } + + if (in_addr_is_set(AF_INET, &info->remote)) { + r = table_add_many(table, + TABLE_FIELD, "Remote", + TABLE_IN_ADDR, &info->remote); + if (r < 0) + return table_log_add_error(r); + } + + } else if (STRPTR_IN_SET(info->netdev_kind, "ip6gre", 
"ip6gretap", "ip6erspan", "vti6")) { + if (in_addr_is_set(AF_INET6, &info->local)) { + r = table_add_many(table, + TABLE_FIELD, "Local", + TABLE_IN6_ADDR, &info->local); + if (r < 0) + return table_log_add_error(r); + } + + if (in_addr_is_set(AF_INET6, &info->remote)) { + r = table_add_many(table, + TABLE_FIELD, "Remote", + TABLE_IN6_ADDR, &info->remote); + if (r < 0) + return table_log_add_error(r); + } + + } else if (streq_ptr(info->netdev_kind, "geneve")) { + r = table_add_many(table, + TABLE_FIELD, "VNI", + TABLE_UINT32, info->vni); + if (r < 0) + return table_log_add_error(r); + + if (info->has_tunnel_ipv4 && in_addr_is_set(AF_INET, &info->remote)) { + r = table_add_many(table, + TABLE_FIELD, "Remote", + TABLE_IN_ADDR, &info->remote); + if (r < 0) + return table_log_add_error(r); + } else if (in_addr_is_set(AF_INET6, &info->remote)) { + r = table_add_many(table, + TABLE_FIELD, "Remote", + TABLE_IN6_ADDR, &info->remote); + if (r < 0) + return table_log_add_error(r); + } + + if (info->ttl > 0) { + r = table_add_many(table, + TABLE_FIELD, "TTL", + TABLE_UINT8, info->ttl); + if (r < 0) + return table_log_add_error(r); + } + + if (info->tos > 0) { + r = table_add_many(table, + TABLE_FIELD, "TOS", + TABLE_UINT8, info->tos); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "Port", + TABLE_UINT16, info->tunnel_port, + TABLE_FIELD, "Inherit", + TABLE_STRING, geneve_df_to_string(info->inherit)); + if (r < 0) + return table_log_add_error(r); + + if (info->df > 0) { + r = table_add_many(table, + TABLE_FIELD, "IPDoNotFragment", + TABLE_UINT8, info->df); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "UDPChecksum", + TABLE_BOOLEAN, info->csum, + TABLE_FIELD, "UDP6ZeroChecksumTx", + TABLE_BOOLEAN, info->csum6_tx, + TABLE_FIELD, "UDP6ZeroChecksumRx", + TABLE_BOOLEAN, info->csum6_rx); + if (r < 0) + return table_log_add_error(r); + + if (info->label > 0) { + r = table_add_many(table, 
+ TABLE_FIELD, "FlowLabel", + TABLE_UINT32, info->label); + if (r < 0) + return table_log_add_error(r); + } + + } else if (STRPTR_IN_SET(info->netdev_kind, "macvlan", "macvtap")) { + r = table_add_many(table, + TABLE_FIELD, "Mode", + TABLE_STRING, macvlan_mode_to_string(info->macvlan_mode)); + if (r < 0) + return table_log_add_error(r); + + } else if (streq_ptr(info->netdev_kind, "ipvlan")) { + const char *p; + + if (info->ipvlan_flags & IPVLAN_F_PRIVATE) + p = "private"; + else if (info->ipvlan_flags & IPVLAN_F_VEPA) + p = "vepa"; + else + p = "bridge"; + + r = table_add_cell(table, NULL, TABLE_FIELD, "Mode"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s (%s)", + ipvlan_mode_to_string(info->ipvlan_mode), p); + if (r < 0) + return table_log_add_error(r); + } + + if (info->has_wlan_link_info) { + _cleanup_free_ char *esc = NULL; + + r = table_add_cell(table, NULL, TABLE_FIELD, "Wi-Fi access point"); + if (r < 0) + return table_log_add_error(r); + + if (info->ssid) + esc = cescape(info->ssid); + + r = table_add_cell_stringf(table, NULL, "%s (%s)", + strnull(esc), + ETHER_ADDR_TO_STR(&info->bssid)); + if (r < 0) + return table_log_add_error(r); + } + + if (info->has_bitrates) { + r = table_add_cell(table, NULL, TABLE_FIELD, "Bit Rate (Tx/Rx)"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%sbps/%sbps", + FORMAT_BYTES_FULL(info->tx_bitrate, 0), + FORMAT_BYTES_FULL(info->rx_bitrate, 0)); + if (r < 0) + return table_log_add_error(r); + } + + if (info->has_tx_queues || info->has_rx_queues) { + r = table_add_cell(table, NULL, TABLE_FIELD, "Number of Queues (Tx/Rx)"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%" PRIu32 "/%" PRIu32, info->tx_queues, info->rx_queues); + if (r < 0) + return table_log_add_error(r); + } + + if (info->has_ethtool_link_info) { + if (IN_SET(info->autonegotiation, AUTONEG_DISABLE, AUTONEG_ENABLE)) { + r = 
table_add_many(table, + TABLE_FIELD, "Auto negotiation", + TABLE_BOOLEAN, info->autonegotiation == AUTONEG_ENABLE); + if (r < 0) + return table_log_add_error(r); + } + + if (info->speed > 0 && info->speed != UINT64_MAX) { + r = table_add_many(table, + TABLE_FIELD, "Speed", + TABLE_BPS, info->speed); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_string_line(table, "Duplex", duplex_to_string(info->duplex)); + if (r < 0) + return r; + + r = table_add_string_line(table, "Port", port_to_string(info->port)); + if (r < 0) + return r; + } + + r = dump_addresses(rtnl, lease, table, info->ifindex); + if (r < 0) + return r; + + r = dump_gateways(rtnl, hwdb, table, info->ifindex); + if (r < 0) + return r; + + r = dump_list(table, "DNS", dns); + if (r < 0) + return r; + + r = dump_list(table, "Search Domains", search_domains); + if (r < 0) + return r; + + r = dump_list(table, "Route Domains", route_domains); + if (r < 0) + return r; + + r = dump_list(table, "NTP", ntp); + if (r < 0) + return r; + + r = dump_list(table, "SIP", sip); + if (r < 0) + return r; + + r = dump_ifindexes(table, "Carrier Bound To", carrier_bound_to); + if (r < 0) + return r; + + r = dump_ifindexes(table, "Carrier Bound By", carrier_bound_by); + if (r < 0) + return r; + + r = table_add_string_line(table, "Activation Policy", activation_policy); + if (r < 0) + return r; + + r = sd_network_link_get_required_for_online(info->ifindex); + if (r >= 0) { + r = table_add_many(table, + TABLE_FIELD, "Required For Online", + TABLE_BOOLEAN, r); + if (r < 0) + return table_log_add_error(r); + } + + if (captive_portal) { + r = table_add_many(table, + TABLE_FIELD, "Captive Portal", + TABLE_STRING, captive_portal, + TABLE_SET_URL, captive_portal); + if (r < 0) + return table_log_add_error(r); + } + + if (lease) { + const void *client_id; + size_t client_id_len; + const char *tz; + + r = sd_dhcp_lease_get_timezone(lease, &tz); + if (r >= 0) { + r = table_add_many(table, + TABLE_FIELD, "Time Zone", + 
TABLE_STRING, tz); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_dhcp_lease_get_client_id(lease, &client_id, &client_id_len); + if (r >= 0) { + _cleanup_free_ char *id = NULL; + + r = sd_dhcp_client_id_to_string(client_id, client_id_len, &id); + if (r >= 0) { + r = table_add_many(table, + TABLE_FIELD, "DHCP4 Client ID", + TABLE_STRING, id); + if (r < 0) + return table_log_add_error(r); + } + } + } + + r = sd_network_link_get_dhcp6_client_iaid_string(info->ifindex, &iaid); + if (r >= 0) { + r = table_add_many(table, + TABLE_FIELD, "DHCP6 Client IAID", + TABLE_STRING, iaid); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_network_link_get_dhcp6_client_duid_string(info->ifindex, &duid); + if (r >= 0) { + r = table_add_many(table, + TABLE_FIELD, "DHCP6 Client DUID", + TABLE_STRING, duid); + if (r < 0) + return table_log_add_error(r); + } + + r = dump_lldp_neighbors(table, "Connected To", info->ifindex); + if (r < 0) + return r; + + r = dump_dhcp_leases(table, "Offered DHCP leases", bus, info); + if (r < 0) + return r; + + r = dump_statistics(table, info); + if (r < 0) + return r; + + /* First line: circle, ifindex, ifname. 
*/ + printf("%s%s%s %d: %s\n", + on_color_operational, special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE), off_color_operational, + info->ifindex, info->name); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return show_logs(info); +} + +static int system_status(sd_netlink *rtnl, sd_hwdb *hwdb) { + _cleanup_free_ char *operational_state = NULL, *online_state = NULL, *netifs_joined = NULL; + _cleanup_strv_free_ char **netifs = NULL, **dns = NULL, **ntp = NULL, **search_domains = NULL, **route_domains = NULL; + const char *on_color_operational, *off_color_operational, *on_color_online; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(rtnl); + + (void) sd_network_get_operational_state(&operational_state); + operational_state_to_color(NULL, operational_state, &on_color_operational, &off_color_operational); + + (void) sd_network_get_online_state(&online_state); + online_state_to_color(online_state, &on_color_online, NULL); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + r = get_files_in_directory("/run/systemd/netif/links/", &netifs); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to list network interfaces: %m"); + else if (r > 0) { + netifs_joined = strv_join(netifs, ", "); + if (!netifs_joined) + return log_oom(); + } + + r = table_add_many(table, + TABLE_FIELD, "State", + TABLE_STRING, strna(operational_state), + TABLE_SET_COLOR, on_color_operational, + TABLE_FIELD, "Online state", + TABLE_STRING, online_state ?: "unknown", + TABLE_SET_COLOR, on_color_online); + if (r < 0) + return table_log_add_error(r); + + r = dump_addresses(rtnl, NULL, table, 0); + if (r < 0) + return r; + + r = dump_gateways(rtnl, hwdb, table, 0); + if (r < 0) + return r; + + (void) sd_network_get_dns(&dns); + r = dump_list(table, "DNS", dns); + if (r < 0) + return r; + + (void) sd_network_get_search_domains(&search_domains); + r = dump_list(table, "Search 
Domains", search_domains); + if (r < 0) + return r; + + (void) sd_network_get_route_domains(&route_domains); + r = dump_list(table, "Route Domains", route_domains); + if (r < 0) + return r; + + (void) sd_network_get_ntp(&ntp); + r = dump_list(table, "NTP", ntp); + if (r < 0) + return r; + + printf("%s%s%s Interfaces: %s\n", + on_color_operational, special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE), off_color_operational, + strna(netifs_joined)); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return show_logs(NULL); +} + +static int link_status(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_hwdb_unrefp) sd_hwdb *hwdb = NULL; + _cleanup_(link_info_array_freep) LinkInfo *links = NULL; + int r, c; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + if (arg_json_format_flags != JSON_FORMAT_OFF) { + if (arg_all || argc <= 1) + return dump_manager_description(bus); + else + return dump_link_description(bus, strv_skip(argv, 1)); + } + + pager_open(arg_pager_flags); + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = sd_hwdb_new(&hwdb); + if (r < 0) + log_debug_errno(r, "Failed to open hardware database: %m"); + + if (arg_all) + c = acquire_link_info(bus, rtnl, NULL, &links); + else if (argc <= 1) + return system_status(rtnl, hwdb); + else + c = acquire_link_info(bus, rtnl, argv + 1, &links); + if (c < 0) + return c; + + r = 0; + + bool first = true; + FOREACH_ARRAY(i, links, c) { + if (!first) + putchar('\n'); + + RET_GATHER(r, link_status_one(bus, rtnl, hwdb, i)); + + first = false; + } + + return r; +} + +static char *lldp_capabilities_to_string(uint16_t x) { + static const char characters[] = { + 'o', 'p', 'b', 'w', 'r', 't', 'd', 'a', 'c', 's', 'm', + }; + char *ret; + unsigned i; + + ret = new(char, ELEMENTSOF(characters) + 1); + if (!ret) + 
return NULL; + + for (i = 0; i < ELEMENTSOF(characters); i++) + ret[i] = (x & (1U << i)) ? characters[i] : '.'; + + ret[i] = 0; + return ret; +} + +static void lldp_capabilities_legend(uint16_t x) { + unsigned cols = columns(); + static const char* const table[] = { + "o - Other", + "p - Repeater", + "b - Bridge", + "w - WLAN Access Point", + "r - Router", + "t - Telephone", + "d - DOCSIS cable device", + "a - Station", + "c - Customer VLAN", + "s - Service VLAN", + "m - Two-port MAC Relay (TPMR)", + }; + + if (x == 0) + return; + + printf("\nCapability Flags:\n"); + for (unsigned w = 0, i = 0; i < ELEMENTSOF(table); i++) + if (x & (1U << i) || arg_all) { + bool newline; + + newline = w + strlen(table[i]) + (w == 0 ? 0 : 2) > cols; + if (newline) + w = 0; + w += printf("%s%s%s", newline ? "\n" : "", w == 0 ? "" : "; ", table[i]); + } + puts(""); +} + +static int link_lldp_status(int argc, char *argv[], void *userdata) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(link_info_array_freep) LinkInfo *links = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + int r, c, m = 0; + uint16_t all = 0; + TableCell *cell; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + c = acquire_link_info(NULL, rtnl, argc > 1 ? 
argv + 1 : NULL, &links); + if (c < 0) + return c; + + pager_open(arg_pager_flags); + + table = table_new("link", + "chassis-id", + "system-name", + "caps", + "port-id", + "port-description"); + if (!table) + return log_oom(); + + if (arg_full) + table_set_width(table, 0); + + table_set_header(table, arg_legend); + + assert_se(cell = table_get_cell(table, 0, 3)); + table_set_minimum_width(table, cell, 11); + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + FOREACH_ARRAY(link, links, c) { + _cleanup_fclose_ FILE *f = NULL; + + r = open_lldp_neighbors(link->ifindex, &f); + if (r == -ENOENT) + continue; + if (r < 0) { + log_warning_errno(r, "Failed to open LLDP data for %i, ignoring: %m", link->ifindex); + continue; + } + + for (;;) { + const char *chassis_id = NULL, *port_id = NULL, *system_name = NULL, *port_description = NULL; + _cleanup_(sd_lldp_neighbor_unrefp) sd_lldp_neighbor *n = NULL; + _cleanup_free_ char *capabilities = NULL; + uint16_t cc; + + r = next_lldp_neighbor(f, &n); + if (r < 0) { + log_warning_errno(r, "Failed to read neighbor data: %m"); + break; + } + if (r == 0) + break; + + (void) sd_lldp_neighbor_get_chassis_id_as_string(n, &chassis_id); + (void) sd_lldp_neighbor_get_port_id_as_string(n, &port_id); + (void) sd_lldp_neighbor_get_system_name(n, &system_name); + (void) sd_lldp_neighbor_get_port_description(n, &port_description); + + if (sd_lldp_neighbor_get_enabled_capabilities(n, &cc) >= 0) { + capabilities = lldp_capabilities_to_string(cc); + all |= cc; + } + + r = table_add_many(table, + TABLE_STRING, link->name, + TABLE_STRING, chassis_id, + TABLE_STRING, system_name, + TABLE_STRING, capabilities, + TABLE_STRING, port_id, + TABLE_STRING, port_description); + if (r < 0) + return table_log_add_error(r); + + m++; + } + } + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (arg_legend) { + lldp_capabilities_legend(all); + printf("\n%i neighbors listed.\n", m); + } + + return 0; +} + +static int 
link_delete_send_message(sd_netlink *rtnl, int index) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(index >= 0); + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_DELLINK, index); + if (r < 0) + return rtnl_log_create_error(r); + + r = sd_netlink_call(rtnl, req, 0, NULL); + if (r < 0) + return r; + + return 0; +} + +static int link_up_down_send_message(sd_netlink *rtnl, char *command, int index) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(index >= 0); + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_SETLINK, index); + if (r < 0) + return rtnl_log_create_error(r); + + if (streq(command, "up")) + r = sd_rtnl_message_link_set_flags(req, IFF_UP, IFF_UP); + else + r = sd_rtnl_message_link_set_flags(req, 0, IFF_UP); + if (r < 0) + return log_error_errno(r, "Could not set link flags: %m"); + + r = sd_netlink_call(rtnl, req, 0, NULL); + if (r < 0) + return r; + + return 0; +} + +static int link_up_down(int argc, char *argv[], void *userdata) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_set_free_ Set *indexes = NULL; + int index, r; + void *p; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + indexes = set_new(NULL); + if (!indexes) + return log_oom(); + + for (int i = 1; i < argc; i++) { + index = rtnl_resolve_interface_or_warn(&rtnl, argv[i]); + if (index < 0) + return index; + + r = set_put(indexes, INT_TO_PTR(index)); + if (r < 0) + return log_oom(); + } + + SET_FOREACH(p, indexes) { + index = PTR_TO_INT(p); + r = link_up_down_send_message(rtnl, argv[0], index); + if (r < 0) + return log_error_errno(r, "Failed to bring %s interface %s: %m", + argv[0], FORMAT_IFNAME_FULL(index, FORMAT_IFNAME_IFINDEX)); + } + + return r; +} + +static int link_delete(int argc, char *argv[], void *userdata) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + 
_cleanup_set_free_ Set *indexes = NULL; + int index, r; + void *p; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + indexes = set_new(NULL); + if (!indexes) + return log_oom(); + + for (int i = 1; i < argc; i++) { + index = rtnl_resolve_interface_or_warn(&rtnl, argv[i]); + if (index < 0) + return index; + + r = set_put(indexes, INT_TO_PTR(index)); + if (r < 0) + return log_oom(); + } + + SET_FOREACH(p, indexes) { + index = PTR_TO_INT(p); + r = link_delete_send_message(rtnl, index); + if (r < 0) + return log_error_errno(r, "Failed to delete interface %s: %m", + FORMAT_IFNAME_FULL(index, FORMAT_IFNAME_IFINDEX)); + } + + return r; +} + +static int link_renew_one(sd_bus *bus, int index, const char *name) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(index >= 0); + assert(name); + + r = bus_call_method(bus, bus_network_mgr, "RenewLink", &error, NULL, "i", index); + if (r < 0) + return log_error_errno(r, "Failed to renew dynamic configuration of interface %s: %s", + name, bus_error_message(&error, r)); + + return 0; +} + +static int link_renew(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int index, k = 0, r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + for (int i = 1; i < argc; i++) { + index = rtnl_resolve_interface_or_warn(&rtnl, argv[i]); + if (index < 0) + return index; + + r = link_renew_one(bus, index, argv[i]); + if (r < 0 && k >= 0) + k = r; + } + + return k; +} + +static int link_force_renew_one(sd_bus *bus, int index, const char *name) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(index >= 0); + assert(name); + + r = bus_call_method(bus, bus_network_mgr, "ForceRenewLink", &error, NULL, "i", index); + if (r < 0) + return log_error_errno(r, "Failed to 
force renew dynamic configuration of interface %s: %s", + name, bus_error_message(&error, r)); + + return 0; +} + +static int link_force_renew(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int k = 0, r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + for (int i = 1; i < argc; i++) { + int index = rtnl_resolve_interface_or_warn(&rtnl, argv[i]); + if (index < 0) + return index; + + r = link_force_renew_one(bus, index, argv[i]); + if (r < 0 && k >= 0) + k = r; + } + + return k; +} + +static int verb_reload(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_network_mgr, "Reload", &error, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to reload network settings: %s", bus_error_message(&error, r)); + + return 0; +} + +static int verb_reconfigure(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_set_free_ Set *indexes = NULL; + int index, r; + void *p; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + indexes = set_new(NULL); + if (!indexes) + return log_oom(); + + for (int i = 1; i < argc; i++) { + index = rtnl_resolve_interface_or_warn(&rtnl, argv[i]); + if (index < 0) + return index; + + r = set_put(indexes, INT_TO_PTR(index)); + if (r < 0) + return log_oom(); + } + + SET_FOREACH(p, indexes) { + index = PTR_TO_INT(p); + r = bus_call_method(bus, bus_network_mgr, "ReconfigureLink", &error, NULL, "i", index); + if (r < 0) + return log_error_errno(r, "Failed to reconfigure network interface %s: %s", + FORMAT_IFNAME_FULL(index, 
FORMAT_IFNAME_IFINDEX), + bus_error_message(&error, r)); + } + + return 0; +} +/* Bit flags naming the daemons that verb_edit() reloads after configuration files were edited. */ +typedef enum ReloadFlags { + RELOAD_NETWORKD = 1 << 0, + RELOAD_UDEVD = 1 << 1, +} ReloadFlags; +/* Looks for name in each NETWORK_DIRS entry and takes the first path for which access(F_OK) succeeds; access errors other than ENOENT are only logged at debug level. Returns -ENOENT when no directory holds the file. When ret_dropins is non-NULL it is filled by conf_files_list_dropins() for the directory name.d. On success the path is handed over through ret_path and 0 is returned. */ +static int get_config_files_by_name(const char *name, char **ret_path, char ***ret_dropins) { + _cleanup_free_ char *path = NULL; + int r; + + assert(name); + assert(ret_path); + + STRV_FOREACH(i, NETWORK_DIRS) { + _cleanup_free_ char *p = NULL; + + p = path_join(*i, name); + if (!p) + return -ENOMEM; + + r = RET_NERRNO(access(p, F_OK)); + if (r >= 0) { + path = TAKE_PTR(p); + break; + } + + if (r != -ENOENT) + log_debug_errno(r, "Failed to determine whether '%s' exists, ignoring: %m", p); + } + + if (!path) + return -ENOENT; + + if (ret_dropins) { + _cleanup_free_ char *dropin_dirname = NULL; + + dropin_dirname = strjoin(name, ".d"); + if (!dropin_dirname) + return -ENOMEM; + + r = conf_files_list_dropins(ret_dropins, dropin_dirname, /* root = */ NULL, NETWORK_DIRS); + if (r < 0) + return r; + } + + *ret_path = TAKE_PTR(path); + + return 0; +} +/* Scans dropins for an entry that path_equal_filename() matches against name. Returns 1 with a strdup() copy in ret when found, 0 with ret set to NULL when not found, and -ENOMEM when the copy cannot be allocated. */ +static int get_dropin_by_name( + const char *name, + char * const *dropins, + char **ret) { + + assert(name); + assert(dropins); + assert(ret); + + STRV_FOREACH(i, dropins) + if (path_equal_filename(*i, name)) { + _cleanup_free_ char *d = NULL; + + d = strdup(*i); + if (!d) + return -ENOMEM; + + *ret = TAKE_PTR(d); + return 1; + } + + *ret = NULL; + return 0; +} +/* Resolves link to an ifindex and fetches its network file and drop-ins through sd_network_link_get_network_file() and sd_network_link_get_network_file_dropins(). -ENODATA from the first call is reported as ENOENT (no associated network file); -ENODATA from the second call is tolerated. */ +static int get_network_files_by_link( + sd_netlink **rtnl, + const char *link, + char **ret_path, + char ***ret_dropins) { + + _cleanup_strv_free_ char **dropins = NULL; + _cleanup_free_ char *path = NULL; + int r, ifindex; + + assert(rtnl); + assert(link); + assert(ret_path); + assert(ret_dropins); + + ifindex = rtnl_resolve_interface_or_warn(rtnl, link); + if (ifindex < 0) + return ifindex; + + r = sd_network_link_get_network_file(ifindex, &path); + if (r == -ENODATA) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Link '%s' has no associated network file.", link); + if (r < 0)
+ return log_error_errno(r, "Failed to get network file for link '%s': %m", link); + + r = sd_network_link_get_network_file_dropins(ifindex, &dropins); + if (r < 0 && r != -ENODATA) + return log_error_errno(r, "Failed to get network drop-ins for link '%s': %m", link); + + *ret_path = TAKE_PTR(path); + *ret_dropins = TAKE_PTR(dropins); + + return 0; +} +/* Reads the ID_NET_LINK_FILE and ID_NET_LINK_FILE_DROPINS properties of the sd-device created for link. A missing ID_NET_LINK_FILE is an error, a missing drop-in property is tolerated. The drop-in property is split on colons with EXTRACT_CUNESCAPE. The returned path is a strdup() copy of the property value. */ +static int get_link_files_by_link(const char *link, char **ret_path, char ***ret_dropins) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _cleanup_strv_free_ char **dropins_split = NULL; + _cleanup_free_ char *p = NULL; + const char *path, *dropins; + int r; + + assert(link); + assert(ret_path); + assert(ret_dropins); + + r = sd_device_new_from_ifname(&device, link); + if (r < 0) + return log_error_errno(r, "Failed to create sd-device object for link '%s': %m", link); + + r = sd_device_get_property_value(device, "ID_NET_LINK_FILE", &path); + if (r == -ENOENT) + return log_error_errno(r, "Link '%s' has no associated link file.", link); + if (r < 0) + return log_error_errno(r, "Failed to get link file for link '%s': %m", link); + + r = sd_device_get_property_value(device, "ID_NET_LINK_FILE_DROPINS", &dropins); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to get link drop-ins for link '%s': %m", link); + if (r >= 0) { + r = strv_split_full(&dropins_split, dropins, ":", EXTRACT_CUNESCAPE); + if (r < 0) + return log_error_errno(r, "Failed to parse link drop-ins for link '%s': %m", link); + } + + p = strdup(path); + if (!p) + return log_oom(); + + *ret_path = TAKE_PTR(p); + *ret_dropins = TAKE_PTR(dropins_split); + + return 0; +} +/* Parses a link config of the form IFNAME or IFNAME:TYPE, where TYPE is network (the default) or link, and returns the matching config file and drop-ins. For type network systemd-networkd has to be running, otherwise ESRCH is returned. ret_reload, when non-NULL, receives RELOAD_NETWORKD or RELOAD_UDEVD according to the type. */ +static int get_config_files_by_link_config( + const char *link_config, + sd_netlink **rtnl, + char **ret_path, + char ***ret_dropins, + ReloadFlags *ret_reload) { + + _cleanup_strv_free_ char **dropins = NULL, **link_config_split = NULL; + _cleanup_free_ char *path = NULL; + const char *ifname, *type; + ReloadFlags reload; + size_t n; + int r; + +
assert(link_config); + assert(rtnl); + assert(ret_path); + assert(ret_dropins); + + link_config_split = strv_split(link_config, ":"); + if (!link_config_split) + return log_oom(); + + n = strv_length(link_config_split); + if (n == 0 || isempty(link_config_split[0])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No link name is given."); + if (n > 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid link config '%s'.", link_config); + + ifname = link_config_split[0]; + type = n == 2 ? link_config_split[1] : "network"; + + if (streq(type, "network")) { + if (!networkd_is_running()) + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "Cannot get network file for link if systemd-networkd is not running."); + + r = get_network_files_by_link(rtnl, ifname, &path, &dropins); + if (r < 0) + return r; + + reload = RELOAD_NETWORKD; + } else if (streq(type, "link")) { + r = get_link_files_by_link(ifname, &path, &dropins); + if (r < 0) + return r; + + reload = RELOAD_UDEVD; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid config type '%s' for link '%s'.", type, ifname); + + *ret_path = TAKE_PTR(path); + *ret_dropins = TAKE_PTR(dropins); + + if (ret_reload) + *ret_reload = reload; + + return 0; +} +/* Queues path, or the drop-in selected by arg_drop_in, for editing in context. A path that starts with /usr is not edited in place: the edit target becomes the same file name under NETWORK_DIRS[0]. With arg_drop_in set, an existing drop-in outside of /usr is edited directly, otherwise a new drop-in path is built below the .d directory of the target; path and all known drop-ins are passed on as comment_paths. */ +static int add_config_to_edit( + EditFileContext *context, + const char *path, + char * const *dropins) { + + _cleanup_free_ char *new_path = NULL, *dropin_path = NULL, *old_dropin = NULL; + _cleanup_strv_free_ char **comment_paths = NULL; + int r; + + assert(context); + assert(path); + assert(!arg_drop_in || dropins); + + if (path_startswith(path, "/usr")) { + _cleanup_free_ char *name = NULL; + + r = path_extract_filename(path, &name); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from '%s': %m", path); + + new_path = path_join(NETWORK_DIRS[0], name); + if (!new_path) + return log_oom(); + } + + if (!arg_drop_in) + return edit_files_add(context, new_path ?: path, path, NULL); + + r = get_dropin_by_name(arg_drop_in, dropins,
&old_dropin); + if (r < 0) + return log_error_errno(r, "Failed to acquire drop-in '%s': %m", arg_drop_in); + + if (r > 0 && !path_startswith(old_dropin, "/usr")) + /* An existing drop-in is found and not in /usr/. Let's edit it directly. */ + dropin_path = TAKE_PTR(old_dropin); + else { + /* No drop-in was found or an existing drop-in resides in /usr/. Let's create + * a new drop-in file. */ + dropin_path = strjoin(new_path ?: path, ".d/", arg_drop_in); + if (!dropin_path) + return log_oom(); + } + + comment_paths = strv_new(path); + if (!comment_paths) + return log_oom(); + + r = strv_extend_strv(&comment_paths, dropins, /* filter_duplicates = */ false); + if (r < 0) + return log_oom(); + + return edit_files_add(context, dropin_path, old_dropin, comment_paths); +} +/* Asks the systemd manager to reload systemd-udevd.service (ReloadUnit, mode replace) and waits for the resulting job. Returns 1 after a successful reload, 0 when waiting for the job yields -ENOEXEC (logged as udevd not running), and a negative value on any other failure. */ +static int udevd_reload(sd_bus *bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + const char *job_path; + int r; + + assert(bus); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + + r = bus_call_method(bus, + bus_systemd_mgr, + "ReloadUnit", + &error, + &reply, + "ss", + "systemd-udevd.service", + "replace"); + if (r < 0) + return log_error_errno(r, "Failed to reload systemd-udevd: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &job_path); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, job_path, /* quiet = */ true, NULL); + if (r == -ENOEXEC) { + log_debug("systemd-udevd is not running, skipping reload."); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to reload systemd-udevd: %m"); + + return 1; +} +/* Verb handler for edit; refuses to run when not on a tty. Each argument is either @LINK[:TYPE], resolved through get_config_files_by_link_config(), or a file name ending in .network, .netdev or .link. A file name that does not exist yet is created under NETWORK_DIRS[0], unless arg_drop_in is set, which makes it an error. After editing, systemd-udevd and systemd-networkd are reloaded as recorded in reload, unless arg_no_reload is set, the system is not booted with systemd, or it runs in a chroot. */ +static int verb_edit(int argc, char *argv[], void *userdata) { + _cleanup_(edit_file_context_done) EditFileContext context = { + .marker_start = DROPIN_MARKER_START, + .marker_end = DROPIN_MARKER_END, +
.remove_parent = !!arg_drop_in, + }; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + ReloadFlags reload = 0; + int r; + + if (!on_tty()) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot edit network config files if not on a tty."); + + r = mac_selinux_init(); + if (r < 0) + return r; + + STRV_FOREACH(name, strv_skip(argv, 1)) { + _cleanup_strv_free_ char **dropins = NULL; + _cleanup_free_ char *path = NULL; + const char *link_config; + + link_config = startswith(*name, "@"); + if (link_config) { + ReloadFlags flags; + + r = get_config_files_by_link_config(link_config, &rtnl, &path, &dropins, &flags); + if (r < 0) + return r; + + reload |= flags; + + r = add_config_to_edit(&context, path, dropins); + if (r < 0) + return r; + + continue; + } + + if (ENDSWITH_SET(*name, ".network", ".netdev")) + reload |= RELOAD_NETWORKD; + else if (endswith(*name, ".link")) + reload |= RELOAD_UDEVD; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid network config name '%s'.", *name); + + r = get_config_files_by_name(*name, &path, &dropins); + if (r == -ENOENT) { + if (arg_drop_in) + return log_error_errno(r, "Cannot find network config '%s'.", *name); + + log_debug("No existing network config '%s' found, creating a new file.", *name); + + path = path_join(NETWORK_DIRS[0], *name); + if (!path) + return log_oom(); + + r = edit_files_add(&context, path, NULL, NULL); + if (r < 0) + return r; + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to get the path of network config '%s': %m", *name); + + r = add_config_to_edit(&context, path, dropins); + if (r < 0) + return r; + } + + r = do_edit_files_and_install(&context); + if (r < 0) + return r; + + if (arg_no_reload) + return 0; + + if (!sd_booted() || running_in_chroot() > 0) { + log_debug("System is not booted with systemd or is running in chroot, skipping reload."); + return 0; + } + + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + + r = sd_bus_open_system(&bus); + if (r < 0) 
+ return log_error_errno(r, "Failed to connect to system bus: %m"); + + if (FLAGS_SET(reload, RELOAD_UDEVD)) { + r = udevd_reload(bus); + if (r < 0) + return r; + } + + if (FLAGS_SET(reload, RELOAD_NETWORKD)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (!networkd_is_running()) { + log_debug("systemd-networkd is not running, skipping reload."); + return 0; + } + + r = bus_call_method(bus, bus_network_mgr, "Reload", &error, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to reload systemd-networkd: %s", bus_error_message(&error, r)); + } + + return 0; +} +/* Verb handler for cat: prints every named config file together with its drop-ins through cat_files(). A file name that cannot be found is logged and remembered in ret while the loop continues; every other failure returns immediately, preferring an earlier error already stored in ret. */ +static int verb_cat(int argc, char *argv[], void *userdata) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r, ret = 0; + + pager_open(arg_pager_flags); + + STRV_FOREACH(name, strv_skip(argv, 1)) { + _cleanup_strv_free_ char **dropins = NULL; + _cleanup_free_ char *path = NULL; + const char *link_config; + + link_config = startswith(*name, "@"); + if (link_config) { + r = get_config_files_by_link_config(link_config, &rtnl, &path, &dropins, /* ret_reload = */ NULL); + if (r < 0) + return ret < 0 ? ret : r; + } else { + r = get_config_files_by_name(*name, &path, &dropins); + if (r == -ENOENT) { + log_error_errno(r, "Cannot find network config file '%s'.", *name); + ret = ret < 0 ? ret : r; + continue; + } + if (r < 0) { + log_error_errno(r, "Failed to get the path of network config '%s': %m", *name); + return ret < 0 ? ret : r; + } + } + + r = cat_files(path, dropins, /* flags = */ CAT_FORMAT_HAS_SECTIONS); + if (r < 0) + return ret < 0 ? ret : r; + } + + return ret; +} +/* Prints the usage text with printf(). Returns 0, or the result of log_oom() when terminal_urlify_man() fails. */ +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("networkctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND\n\n" + "%sQuery and control the networking subsystem.%s\n" + "\nCommands:\n" + " list [PATTERN...] List links\n" + " status [PATTERN...] Show link status\n" + " lldp [PATTERN...]
Show LLDP neighbors\n" + " label Show current address label entries in the kernel\n" + " delete DEVICES... Delete virtual netdevs\n" + " up DEVICES... Bring devices up\n" + " down DEVICES... Bring devices down\n" + " renew DEVICES... Renew dynamic configurations\n" + " forcerenew DEVICES... Trigger DHCP reconfiguration of all connected clients\n" + " reconfigure DEVICES... Reconfigure interfaces\n" + " reload Reload .network and .netdev files\n" + " edit FILES|DEVICES... Edit network configuration files\n" + " cat FILES|DEVICES... Show network configuration files\n" + "\nOptions:\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " -a --all Show status for all links\n" + " -s --stats Show detailed link statistics\n" + " -l --full Do not ellipsize output\n" + " -n --lines=INTEGER Number of journal entries to show\n" + " --json=pretty|short|off\n" + " Generate JSON output\n" + " --no-reload Do not reload systemd-networkd or systemd-udevd\n" + " after editing network config\n" + " --drop-in=NAME Edit specified drop-in instead of main config file\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_JSON, + ARG_NO_RELOAD, + ARG_DROP_IN, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "all", no_argument, NULL, 'a' }, + { "stats", no_argument, NULL, 's' }, + { "full", no_argument, NULL, 'l' }, + { "lines", required_argument, NULL, 'n' }, + { "json", required_argument, NULL, ARG_JSON }, + { "no-reload", no_argument, NULL, ARG_NO_RELOAD }, + { "drop-in", 
required_argument, NULL, ARG_DROP_IN }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hasln:", options, NULL)) >= 0) { + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_NO_RELOAD: + arg_no_reload = true; + break; + + case ARG_DROP_IN: + if (isempty(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Empty drop-in file name."); + + if (!endswith(optarg, ".conf")) { + char *conf; + + conf = strjoin(optarg, ".conf"); + if (!conf) + return log_oom(); + + free_and_replace(arg_drop_in, conf); + } else { + r = free_and_strdup(&arg_drop_in, optarg); + if (r < 0) + return log_oom(); + } + + if (!filename_is_valid(arg_drop_in)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid drop-in file name '%s'.", arg_drop_in); + + break; + + case 'a': + arg_all = true; + break; + + case 's': + arg_stats = true; + break; + + case 'l': + arg_full = true; + break; + + case 'n': + if (safe_atou(optarg, &arg_lines) < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse lines '%s'", optarg); + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + } + + return 1; +} +/* Defines the verb table (name, minimum and maximum argument count, flags, handler) and hands argc and argv to dispatch_verb(). */ +static int networkctl_main(int argc, char *argv[]) { + static const Verb verbs[] = { + { "list", VERB_ANY, VERB_ANY, VERB_DEFAULT|VERB_ONLINE_ONLY, list_links }, + { "status", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, link_status }, + { "lldp", VERB_ANY, VERB_ANY, 0, link_lldp_status }, + { "label", 1, 1, 0, list_address_labels }, + { "delete", 2, VERB_ANY, 0, link_delete }, + { "up", 2, VERB_ANY, 0, link_up_down }, + { "down", 2, VERB_ANY, 0, link_up_down }, + { "renew", 2, VERB_ANY, VERB_ONLINE_ONLY, link_renew }, + {
"forcerenew", 2, VERB_ANY, VERB_ONLINE_ONLY, link_force_renew }, + { "reconfigure", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_reconfigure }, + { "reload", 1, 1, VERB_ONLINE_ONLY, verb_reload }, + { "edit", 2, VERB_ANY, 0, verb_edit }, + { "cat", 2, VERB_ANY, 0, verb_cat }, + {} + }; + + return dispatch_verb(argc, argv, verbs, NULL); +} +/* Program body passed to DEFINE_MAIN_FUNCTION(): sets up logging, installs the SIGBUS handler, parses the options and, only when parse_argv() returns a positive value, dispatches the verb. */ +static int run(int argc, char* argv[]) { + int r; + + log_setup(); + + sigbus_install(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return networkctl_main(argc, argv); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/network/networkd-address-generation.c b/src/network/networkd-address-generation.c new file mode 100644 index 0000000..65f0009 --- /dev/null +++ b/src/network/networkd-address-generation.c @@ -0,0 +1,439 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <net/if_arp.h> + +#include "sd-id128.h" + +#include "arphrd-util.h" +#include "id128-util.h" +#include "memory-util.h" +#include "networkd-address-generation.h" +#include "networkd-link.h" +#include "networkd-network.h" +#include "string-util.h" + +#define DAD_CONFLICTS_IDGEN_RETRIES_RFC7217 3 + +/* https://www.iana.org/assignments/ipv6-interface-ids/ipv6-interface-ids.xml */ +#define SUBNET_ROUTER_ANYCAST_ADDRESS ((const struct in6_addr) { .s6_addr = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }) +#define SUBNET_ROUTER_ANYCAST_PREFIXLEN 64 +#define RESERVED_INTERFACE_IDENTIFIERS_ADDRESS ((const struct in6_addr) { .s6_addr = { 0x02, 0x00, 0x5E, 0xFF, 0xFE } }) +#define RESERVED_INTERFACE_IDENTIFIERS_PREFIXLEN 40 +#define RESERVED_SUBNET_ANYCAST_ADDRESSES ((const struct in6_addr) { .s6_addr = { 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x80 } }) +#define RESERVED_SUBNET_ANYCAST_PREFIXLEN 57 + +#define DHCP_PD_APP_ID SD_ID128_MAKE(fb,b9,37,ca,4a,ed,4a,4d,b0,70,7f,aa,71,c0,c9,85) +#define NDISC_APP_ID SD_ID128_MAKE(13,ac,81,a7,d5,3f,49,78,92,79,5d,0c,29,3a,bc,7e) +#define RADV_APP_ID SD_ID128_MAKE(1f,1e,90,c8,5c,78,4f,dc,8e,61,2d,59,0d,53,c1,25) +
+typedef enum AddressGenerationType { + ADDRESS_GENERATION_EUI64, + ADDRESS_GENERATION_STATIC, + ADDRESS_GENERATION_PREFIXSTABLE, + _ADDRESS_GENERATION_TYPE_MAX, + _ADDRESS_GENERATION_TYPE_INVALID = -EINVAL, +} AddressGenerationType; +/* One configured token: the generation type together with the address and secret key stored for it. */ +typedef struct IPv6Token { + AddressGenerationType type; + struct in6_addr address; + sd_id128_t secret_key; +} IPv6Token; +/* Builds ret from the first 8 bytes of prefix plus an interface identifier taken from the hardware address: the last 8 bytes for InfiniBand, the 6 Ethernet address bytes with 0xff 0xfe inserted in the middle for Ethernet. Bit 1 of byte 8 is flipped afterwards. Any other interface type is logged at debug level and rejected with EINVAL. */ +static int generate_eui64_address(const Link *link, const struct in6_addr *prefix, struct in6_addr *ret) { + assert(link); + assert(prefix); + assert(ret); + + memcpy(ret->s6_addr, prefix, 8); + + switch (link->iftype) { + case ARPHRD_INFINIBAND: + /* Use last 8 byte. See RFC4391 section 8 */ + memcpy(&ret->s6_addr[8], &link->hw_addr.infiniband[INFINIBAND_ALEN - 8], 8); + break; + case ARPHRD_ETHER: + /* see RFC4291 section 2.5.1 */ + ret->s6_addr[8] = link->hw_addr.ether.ether_addr_octet[0]; + ret->s6_addr[9] = link->hw_addr.ether.ether_addr_octet[1]; + ret->s6_addr[10] = link->hw_addr.ether.ether_addr_octet[2]; + ret->s6_addr[11] = 0xff; + ret->s6_addr[12] = 0xfe; + ret->s6_addr[13] = link->hw_addr.ether.ether_addr_octet[3]; + ret->s6_addr[14] = link->hw_addr.ether.ether_addr_octet[4]; + ret->s6_addr[15] = link->hw_addr.ether.ether_addr_octet[5]; + break; + default: + return log_link_debug_errno(link, SYNTHETIC_ERRNO(EINVAL), + "Token=eui64 is not supported for interface type %s, ignoring.", + strna(arphrd_to_name(link->iftype))); + } + + ret->s6_addr[8] ^= 1 << 1; + return 0; +} +/* Returns false when addr is covered by one of the reserved ranges (SUBNET_ROUTER_ANYCAST, RESERVED_INTERFACE_IDENTIFIERS, RESERVED_SUBNET_ANYCAST), true otherwise. */ +static bool stable_private_address_is_valid(const struct in6_addr *addr) { + assert(addr); + + /* According to rfc4291, generated address should not be in the following ranges.
*/ + + if (in6_addr_prefix_covers(&SUBNET_ROUTER_ANYCAST_ADDRESS, SUBNET_ROUTER_ANYCAST_PREFIXLEN, addr)) + return false; + + if (in6_addr_prefix_covers(&RESERVED_INTERFACE_IDENTIFIERS_ADDRESS, RESERVED_INTERFACE_IDENTIFIERS_PREFIXLEN, addr)) + return false; + + if (in6_addr_prefix_covers(&RESERVED_SUBNET_ANYCAST_ADDRESSES, RESERVED_SUBNET_ANYCAST_PREFIXLEN, addr)) + return false; + + return true; +} +/* Computes one candidate address. The upper 8 bytes are copied from prefix; the lower 8 bytes are the siphash24, keyed with secret_key, of the first 8 prefix bytes, the interface name, the hardware address (for InfiniBand only its last 8 bytes), the SSID if one is set, and dad_counter. */ +static void generate_stable_private_address_one( + Link *link, + const sd_id128_t *secret_key, + const struct in6_addr *prefix, + uint8_t dad_counter, + struct in6_addr *ret) { + + struct siphash state; + uint64_t rid; + + assert(link); + assert(secret_key); + assert(prefix); + assert(ret); + + /* According to RFC7217 section 5.1 + * RID = F(Prefix, Net_Iface, Network_ID, DAD_Counter, secret_key) */ + + siphash24_init(&state, secret_key->bytes); + + siphash24_compress(prefix, 8, &state); + siphash24_compress_string(link->ifname, &state); + if (link->iftype == ARPHRD_INFINIBAND) + /* Only last 8 bytes of IB MAC are stable */ + siphash24_compress(&link->hw_addr.infiniband[INFINIBAND_ALEN - 8], 8, &state); + else + siphash24_compress(link->hw_addr.bytes, link->hw_addr.length, &state); + + if (link->ssid) + siphash24_compress_string(link->ssid, &state); + + siphash24_compress(&dad_counter, sizeof(uint8_t), &state); + + rid = htole64(siphash24_finalize(&state)); + + memcpy(ret->s6_addr, prefix->s6_addr, 8); + memcpy(ret->s6_addr + 8, &rid, 8); +} +/* Generates a stable private address for prefix into ret. A null secret_key is replaced by the value sd_id128_get_machine_app_specific() returns for app_id. Up to DAD_CONFLICTS_IDGEN_RETRIES_RFC7217 candidates are tried, with the loop index as dad_counter, and the first valid one wins; ENOANO is returned when none is valid. */ +static int generate_stable_private_address( + Link *link, + const sd_id128_t *app_id, + const sd_id128_t *secret_key, + const struct in6_addr *prefix, + struct in6_addr *ret) { + + sd_id128_t secret_machine_key; + struct in6_addr addr; + uint8_t i; + int r; + + assert(link); + assert(app_id); + assert(secret_key); + assert(prefix); + assert(ret); + + if (sd_id128_is_null(*secret_key)) { + r = sd_id128_get_machine_app_specific(*app_id, &secret_machine_key); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to
generate secret key for IPv6 stable private address: %m"); + + secret_key = &secret_machine_key; + } + + /* While this loop uses dad_counter and a retry limit as specified in RFC 7217, the loop does + * not actually attempt Duplicate Address Detection; the counter will be incremented only when + * the address generation algorithm produces an invalid address, and the loop may exit with an + * address which ends up being unusable due to duplication on the link. */ + for (i = 0; i < DAD_CONFLICTS_IDGEN_RETRIES_RFC7217; i++) { + generate_stable_private_address_one(link, secret_key, prefix, i, &addr); + + if (stable_private_address_is_valid(&addr)) + break; + } + if (i >= DAD_CONFLICTS_IDGEN_RETRIES_RFC7217) + /* propagate recognizable errors. */ + return log_link_debug_errno(link, SYNTHETIC_ERRNO(ENOANO), + "Failed to generate stable private address."); + + *ret = addr; + return 0; +} + +static int generate_addresses( + Link *link, + Set *tokens, + const sd_id128_t *app_id, + const struct in6_addr *prefix, + uint8_t prefixlen, + Set **ret) { + + _cleanup_set_free_ Set *addresses = NULL; + struct in6_addr masked; + IPv6Token *j; + int r; + + assert(link); + assert(app_id); + assert(prefix); + assert(prefixlen > 0 && prefixlen <= 64); + assert(ret); + + masked = *prefix; + in6_addr_mask(&masked, prefixlen); + + SET_FOREACH(j, tokens) { + struct in6_addr addr, *copy; + + switch (j->type) { + case ADDRESS_GENERATION_EUI64: + if (generate_eui64_address(link, &masked, &addr) < 0) + continue; + break; + + case ADDRESS_GENERATION_STATIC: + memcpy(addr.s6_addr, masked.s6_addr, 8); + memcpy(addr.s6_addr + 8, j->address.s6_addr + 8, 8); + break; + + case ADDRESS_GENERATION_PREFIXSTABLE: + if (in6_addr_is_set(&j->address) && !in6_addr_equal(&j->address, &masked)) + continue; + + if (generate_stable_private_address(link, app_id, &j->secret_key, &masked, &addr) < 0) + continue; + + break; + + default: + assert_not_reached(); + } + + copy = newdup(struct in6_addr, &addr, 1); + if 
(!copy) + return -ENOMEM; + + r = set_ensure_consume(&addresses, &in6_addr_hash_ops_free, copy); + if (r < 0) + return r; + } + + /* fall back to EUI-64 if no token is provided */ + if (set_isempty(addresses)) { + _cleanup_free_ struct in6_addr *addr = NULL; + + addr = new(struct in6_addr, 1); + if (!addr) + return -ENOMEM; + + if (IN_SET(link->iftype, ARPHRD_ETHER, ARPHRD_INFINIBAND)) + r = generate_eui64_address(link, &masked, addr); + else + r = generate_stable_private_address(link, app_id, &SD_ID128_NULL, &masked, addr); + if (r < 0) + return r; + + r = set_ensure_consume(&addresses, &in6_addr_hash_ops_free, TAKE_PTR(addr)); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(addresses); + return 0; +} +/* Wrapper around generate_addresses() using the dhcp_pd_tokens of the network, DHCP_PD_APP_ID and a fixed prefix length of 64. */ +int dhcp_pd_generate_addresses(Link *link, const struct in6_addr *prefix, Set **ret) { + return generate_addresses(link, link->network->dhcp_pd_tokens, &DHCP_PD_APP_ID, prefix, 64, ret); +} +/* Wrapper around generate_addresses() using the ndisc_tokens of the network, NDISC_APP_ID and the prefix length given by the caller. */ +int ndisc_generate_addresses(Link *link, const struct in6_addr *prefix, uint8_t prefixlen, Set **ret) { + return generate_addresses(link, link->network->ndisc_tokens, &NDISC_APP_ID, prefix, prefixlen, ret); +} +/* Wrapper around generate_addresses() using the tokens and prefix length given by the caller and RADV_APP_ID. */ +int radv_generate_addresses(Link *link, Set *tokens, const struct in6_addr *prefix, uint8_t prefixlen, Set **ret) { + return generate_addresses(link, tokens, &RADV_APP_ID, prefix, prefixlen, ret); +} +/* Feeds all three IPv6Token fields (type, address, secret key) into the hash state. */ +static void ipv6_token_hash_func(const IPv6Token *p, struct siphash *state) { + siphash24_compress(&p->type, sizeof(p->type), state); + siphash24_compress(&p->address, sizeof(p->address), state); + id128_hash_func(&p->secret_key, state); +} +/* Orders tokens by type first, then by the raw address bytes, then by secret key. */ +static int ipv6_token_compare_func(const IPv6Token *a, const IPv6Token *b) { + int r; + + r = CMP(a->type, b->type); + if (r != 0) + return r; + + r = memcmp(&a->address, &b->address, sizeof(struct in6_addr)); + if (r != 0) + return r; + + return id128_compare_func(&a->secret_key, &b->secret_key); +} +/* Hash ops for sets keyed by IPv6Token; free() is registered as the key destructor. */ +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + ipv6_token_hash_ops, + IPv6Token, + ipv6_token_hash_func, +
ipv6_token_compare_func, + free); +/* Allocates an IPv6Token filled from the arguments and inserts it into *tokens through set_ensure_consume(), whose result is returned. Returns -ENOMEM when the allocation fails. */ +static int ipv6_token_add(Set **tokens, AddressGenerationType type, const struct in6_addr *addr, const sd_id128_t *secret_key) { + IPv6Token *p; + + assert(tokens); + assert(type >= 0 && type < _ADDRESS_GENERATION_TYPE_MAX); + assert(addr); + assert(secret_key); + + p = new(IPv6Token, 1); + if (!p) + return -ENOMEM; + + *p = (IPv6Token) { + .type = type, + .address = *addr, + .secret_key = *secret_key, + }; + + return set_ensure_consume(tokens, &ipv6_token_hash_ops, p); +} +/* Config parser that adds one IPv6 token to the Set that data points to. Accepted values, as parsed below: an empty value (frees the whole set), eui64, prefixstable optionally followed by :ADDRESS and/or ,SECRET_KEY, and static:ADDRESS or a bare ADDRESS. Values that fail to parse are logged with log_syntax() and ignored by returning 0. */ +int config_parse_address_generation_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *addr_alloc = NULL; + sd_id128_t secret_key = SD_ID128_NULL; + union in_addr_union buffer = {}; + AddressGenerationType type; + Set **tokens = ASSERT_PTR(data); + const char *addr; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *tokens = set_free(*tokens); + return 0; + } + + if ((addr = startswith(rvalue, "prefixstable"))) { + const char *comma; + + type = ADDRESS_GENERATION_PREFIXSTABLE; + + if (*addr == ':') { + addr++; + + comma = strchr(addr, ','); + if (comma) { + addr_alloc = strndup(addr, comma - addr); + if (!addr_alloc) + return log_oom(); + + addr = addr_alloc; + } + } else if (*addr == ',') + comma = TAKE_PTR(addr); + else if (*addr == '\0') { + comma = NULL; + addr = NULL; + } else { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid IPv6 token mode in %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (comma) { + r = id128_from_string_nonzero(comma + 1, &secret_key); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + r == -ENXIO ?
"Secret key in %s= cannot be null, ignoring assignment: %s" + : "Failed to parse secret key in %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + } + + } else if (streq(rvalue, "eui64")) { + type = ADDRESS_GENERATION_EUI64; + addr = NULL; + } else { + type = ADDRESS_GENERATION_STATIC; + + addr = startswith(rvalue, "static:"); + if (!addr) + addr = rvalue; + } + + if (addr) { + r = in_addr_from_string(AF_INET6, addr, &buffer); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse IP address in %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + } + + switch (type) { + case ADDRESS_GENERATION_EUI64: + assert(in6_addr_is_null(&buffer.in6)); + break; + + case ADDRESS_GENERATION_STATIC: + /* Only last 64 bits are used. */ + memzero(buffer.in6.s6_addr, 8); + + if (in6_addr_is_null(&buffer.in6)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "IPv6 address in %s= cannot be the ANY address, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + break; + + case ADDRESS_GENERATION_PREFIXSTABLE: + /* At most, the initial 64 bits are used. 
*/ + (void) in6_addr_mask(&buffer.in6, 64); + break; + + default: + assert_not_reached(); + } + + r = ipv6_token_add(tokens, type, &buffer.in6, &secret_key); + if (r < 0) + return log_oom(); + + return 0; +} diff --git a/src/network/networkd-address-generation.h b/src/network/networkd-address-generation.h new file mode 100644 index 0000000..901b2ec --- /dev/null +++ b/src/network/networkd-address-generation.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "set.h" + +typedef struct Link Link; + +int dhcp_pd_generate_addresses(Link *link, const struct in6_addr *prefix, Set **ret); +int ndisc_generate_addresses(Link *link, const struct in6_addr *prefix, uint8_t prefixlen, Set **ret); +int radv_generate_addresses(Link *link, Set *tokens, const struct in6_addr *prefix, uint8_t prefixlen, Set **ret); + +CONFIG_PARSER_PROTOTYPE(config_parse_address_generation_type); diff --git a/src/network/networkd-address-label.c b/src/network/networkd-address-label.c new file mode 100644 index 0000000..745b959 --- /dev/null +++ b/src/network/networkd-address-label.c @@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <net/if.h> +#include <linux/if_addrlabel.h> + +#include "alloc-util.h" +#include "netlink-util.h" +#include "networkd-address-label.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "parse-util.h" +/* Releases label: a label attached to a network is first removed from address_labels_by_section, then its section and the label itself are freed. A NULL label is a no-op that returns NULL. */ +AddressLabel *address_label_free(AddressLabel *label) { + if (!label) + return NULL; + + if (label->network) { + assert(label->section); + hashmap_remove(label->network->address_labels_by_section, label->section); + } + + config_section_free(label->section); + return mfree(label); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(AddressLabel, address_label_free); +/* Returns in ret the AddressLabel registered in network for the section identified by filename and section_line. When none exists yet, a new one is allocated with label preset to UINT32_MAX and registered in address_labels_by_section. */ +static int address_label_new_static(Network *network, const char *filename, unsigned section_line, AddressLabel **ret) { +
_cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(address_label_freep) AddressLabel *label = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + label = hashmap_get(network->address_labels_by_section, n); + if (label) { + *ret = TAKE_PTR(label); + return 0; + } + + label = new(AddressLabel, 1); + if (!label) + return -ENOMEM; + + *label = (AddressLabel) { + .network = network, + .section = TAKE_PTR(n), + .label = UINT32_MAX, + }; + + r = hashmap_ensure_put(&network->address_labels_by_section, &config_section_hash_ops, label->section, label); + if (r < 0) + return r; + + *ret = TAKE_PTR(label); + return 0; +} +/* Netlink reply handler for an address label request. Any error other than -EEXIST is logged and puts the link into the failed state. When link->static_address_label_messages is 0 the labels are marked as configured and link readiness is checked again. Always returns 1. */ +static int address_label_configure_handler( + sd_netlink *rtnl, + sd_netlink_message *m, + Request *req, + Link *link, + void *userdata) { + + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not set address label"); + link_enter_failed(link); + return 1; + } + + if (link->static_address_label_messages == 0) { + log_link_debug(link, "Addresses label set"); + link->static_address_labels_configured = true; + link_check_ready(link); + } + + return 1; +} +/* Builds an RTM_NEWADDRLABEL message for AF_INET6 on the ifindex of link, fills in prefix length, label value and prefix from label, and sends it asynchronously on behalf of req. */ +static int address_label_configure(AddressLabel *label, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(label); + assert(link); + assert(link->ifindex > 0); + assert(link->manager); + assert(link->manager->rtnl); + assert(req); + + r = sd_rtnl_message_new_addrlabel(link->manager->rtnl, &m, RTM_NEWADDRLABEL, + link->ifindex, AF_INET6); + if (r < 0) + return r; + + r = sd_rtnl_message_addrlabel_set_prefixlen(m, label->prefixlen); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, IFAL_LABEL, label->label); + if (r < 0) + return r; + + r =
sd_netlink_message_append_in6_addr(m, IFA_ADDRESS, &label->prefix); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} +/* Request processing callback for an address label (passed as userdata). Returns 0 while the link is not ready to be configured, 1 after the netlink message was issued, and a negative value if building or sending it failed. */ +static int address_label_process_request(Request *req, Link *link, void *userdata) { + AddressLabel *label = ASSERT_PTR(userdata); + int r; + + assert(req); + assert(link); + + if (!link_is_ready_to_configure(link, false)) + return 0; + + r = address_label_configure(label, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure address label: %m"); + + return 1; +} +/* Queues one REQUEST_TYPE_ADDRESS_LABEL request for every entry of address_labels_by_section of the network assigned to the link. When nothing is pending afterwards the labels are flagged as configured right away; otherwise the link is moved to the configuring state. 
*/ +int link_request_static_address_labels(Link *link) { + AddressLabel *label; + int r; + + assert(link); + assert(link->network); + + link->static_address_labels_configured = false; + + HASHMAP_FOREACH(label, link->network->address_labels_by_section) { + r = link_queue_request_full(link, REQUEST_TYPE_ADDRESS_LABEL, + label, NULL, trivial_hash_func, trivial_compare_func, + address_label_process_request, + &link->static_address_label_messages, + address_label_configure_handler, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request address label: %m"); + } + + if (link->static_address_label_messages == 0) { + link->static_address_labels_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Setting address labels."); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} +/* Validates one parsed [IPv6AddressLabel] section: returns 0 when it is usable and a negative value when the section is invalid, has no prefix (prefix_set is false) or has no label (label is still UINT32_MAX). */ +static int address_label_section_verify(AddressLabel *label) { + assert(label); + assert(label->section); + + if (section_is_invalid(label->section)) + return -EINVAL; + + if (!label->prefix_set) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: [IPv6AddressLabel] section without Prefix= setting specified. " + "Ignoring [IPv6AddressLabel] section from line %u.", + label->section->filename, label->section->line); + + if (label->label == UINT32_MAX) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: [IPv6AddressLabel] section without Label= setting specified. 
" + "Ignoring [IPv6AddressLabel] section from line %u.", + label->section->filename, label->section->line); + + return 0; +} +/* Frees every address label of the network that does not pass address_label_section_verify(); address_label_free() also removes the entry from address_labels_by_section while the map is being iterated. */ +void network_drop_invalid_address_labels(Network *network) { + AddressLabel *label; + + assert(network); + + HASHMAP_FOREACH(label, network->address_labels_by_section) + if (address_label_section_verify(label) < 0) + address_label_free(label); +} +/* Config parser for the prefix of an address label: parses rvalue as an IPv6 prefix with prefix length and stores both in the AddressLabel of the current section. Unparsable or unacceptable values are logged and ignored, in which case 0 is returned. */ +int config_parse_address_label_prefix( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(address_label_free_or_set_invalidp) AddressLabel *n = NULL; + Network *network = userdata; + unsigned char prefixlen; + union in_addr_union a; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_label_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = in_addr_prefix_from_string(rvalue, AF_INET6, &a, &prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid prefix for address label, ignoring assignment: %s", rvalue); + return 0; + } + if (in6_addr_is_ipv4_mapped_address(&a.in6) && prefixlen > 96) { + /* See ip6addrlbl_alloc() in net/ipv6/addrlabel.c of kernel. 
*/ + log_syntax(unit, LOG_WARNING, filename, line, 0, + "The prefix length of IPv4 mapped address for address label must be equal to or smaller than 96, " + "ignoring assignment: %s", rvalue); + return 0; + } + + n->prefix = a.in6; + n->prefixlen = prefixlen; + n->prefix_set = true; + + TAKE_PTR(n); /* keep the entry: disarm the cleanup handler declared above */ + return 0; +} +/* Config parser for the numeric value of an address label: parses rvalue as an unsigned 32 bit integer and stores it in the AddressLabel of the current section. Unparsable values and the reserved value UINT32_MAX are logged and ignored, in which case 0 is returned. */ +int config_parse_address_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(address_label_free_or_set_invalidp) AddressLabel *n = NULL; + Network *network = userdata; + uint32_t k; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_label_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address label, ignoring: %s", rvalue); + return 0; + } + /* UINT32_MAX is the marker for an unset label (see address_label_new_static() and address_label_section_verify()), so it cannot be accepted as a value. Compare against the constant that matches the uint32_t type rather than UINT_MAX. 
*/ + if (k == UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Address label is invalid, ignoring: %s", rvalue); + return 0; + } + + n->label = k; + TAKE_PTR(n); /* keep the entry: disarm the cleanup handler declared above */ + + return 0; +} diff --git a/src/network/networkd-address-label.h b/src/network/networkd-address-label.h new file mode 100644 index 0000000..1e2ee70 --- /dev/null +++ b/src/network/networkd-address-label.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Network Network; +/* One parsed [IPv6AddressLabel] section of a network configuration. */ +typedef struct AddressLabel { + Network *network; + ConfigSection *section; + + uint32_t label; /* UINT32_MAX while no label value has been configured */ + struct in6_addr prefix; + unsigned char prefixlen; + bool prefix_set; /* set to true by config_parse_address_label_prefix() */ +} AddressLabel; + +AddressLabel *address_label_free(AddressLabel *label); + +void network_drop_invalid_address_labels(Network *network); + +int 
link_request_static_address_labels(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_address_label); +CONFIG_PARSER_PROTOTYPE(config_parse_address_label_prefix); diff --git a/src/network/networkd-address-pool.c b/src/network/networkd-address-pool.c new file mode 100644 index 0000000..d9ac78a --- /dev/null +++ b/src/network/networkd-address-pool.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "networkd-address-pool.h" +#include "networkd-address.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "string-util.h" +/* Upper bound on the number of random candidate prefixes tried per pool in address_pool_acquire_one(). */ +#define RANDOM_PREFIX_TRIAL_MAX 1024 +/* Allocates an AddressPool describing the range u/prefixlen of the given family and adds it to m->address_pools. Returns 0 on success, a negative errno-style value on failure. */ +static int address_pool_new( + Manager *m, + int family, + const union in_addr_union *u, + unsigned prefixlen) { + + _cleanup_free_ AddressPool *p = NULL; + int r; + + assert(m); + assert(u); + + p = new(AddressPool, 1); + if (!p) + return -ENOMEM; + + *p = (AddressPool) { + .manager = m, + .family = family, + .prefixlen = prefixlen, + .in_addr = *u, + }; + + r = ordered_set_ensure_put(&m->address_pools, NULL, p); + if (r < 0) + return r; + + TAKE_PTR(p); /* now referenced by m->address_pools: disarm the cleanup handler */ + return 0; +} +/* Same as address_pool_new(), but the base address is given as a string and parsed with in_addr_from_string() first. 
*/ +static int address_pool_new_from_string( + Manager *m, + int family, + const char *p, + unsigned prefixlen) { + + union in_addr_union u; + int r; + + assert(m); + assert(p); + + r = in_addr_from_string(family, p, &u); + if (r < 0) + return r; + + return address_pool_new(m, family, &u, prefixlen); +} +/* Registers the built-in pools fd00::/8, 192.168.0.0/16, 172.16.0.0/12 and 10.0.0.0/8, in that order. Stops at and returns the first error. */ +int address_pool_setup_default(Manager *m) { + int r; + + assert(m); + + /* Add in the well-known private address ranges. 
*/ + r = address_pool_new_from_string(m, AF_INET6, "fd00::", 8); + if (r < 0) + return r; + + r = address_pool_new_from_string(m, AF_INET, "192.168.0.0", 16); + if (r < 0) + return r; + + r = address_pool_new_from_string(m, AF_INET, "172.16.0.0", 12); + if (r < 0) + return r; + + r = address_pool_new_from_string(m, AF_INET, "10.0.0.0", 8); + if (r < 0) + return r; + + return 0; +} +/* Returns true if the address a belongs to the given family and its prefix intersects the range u/prefixlen according to in_addr_prefix_intersect(). */ +static bool address_intersect( + const Address *a, + int family, + const union in_addr_union *u, + unsigned prefixlen) { + + assert(a); + assert(u); + + if (a->family != family) + return false; + + return in_addr_prefix_intersect(family, u, prefixlen, &a->in_addr, a->prefixlen); +} +/* Returns true if the candidate range u/prefixlen intersects an address found on any link, in any network configuration, or in any queued address request of the manager owning the pool. */ +static bool address_pool_prefix_is_taken( + AddressPool *p, + const union in_addr_union *u, + unsigned prefixlen) { + + Address *a; + Link *l; + Network *n; + Request *req; + + assert(p); + assert(u); + + /* Don't clash with assigned addresses. 
*/ + HASHMAP_FOREACH(l, p->manager->links_by_index) + SET_FOREACH(a, l->addresses) + if (address_intersect(a, p->family, u, prefixlen)) + return true; + + /* And don't clash with configured but un-assigned addresses either. */ + ORDERED_HASHMAP_FOREACH(n, p->manager->networks) + ORDERED_HASHMAP_FOREACH(a, n->addresses_by_section) + if (address_intersect(a, p->family, u, prefixlen)) + return true; + + /* Also check queued addresses. 
*/ + ORDERED_SET_FOREACH(req, p->manager->request_queue) { + if (req->type != REQUEST_TYPE_ADDRESS) + continue; + /* For REQUEST_TYPE_ADDRESS requests the userdata is used as an Address here. */ + if (address_intersect(req->userdata, p->family, u, prefixlen)) + return true; + } + + return false; +} +/* Tries to find an unused range of length prefixlen inside the pool p. Returns 1 and stores the range in *found on success, 0 if the pool does not match the family, is not larger than the requested prefix, or no free range was found within RANDOM_PREFIX_TRIAL_MAX attempts, and a negative value on error. */ +static int address_pool_acquire_one(AddressPool *p, int family, unsigned prefixlen, union in_addr_union *found) { + union in_addr_union u; + int r; + + assert(p); + assert(prefixlen > 0); + assert(found); + + if (p->family != family) + return 0; + + if (p->prefixlen >= prefixlen) /* the requested range must be strictly smaller than the pool */ + return 0; + + u = p->in_addr; + + for (unsigned i = 0; i < RANDOM_PREFIX_TRIAL_MAX; i++) { + r = in_addr_random_prefix(p->family, &u, p->prefixlen, prefixlen); + if (r <= 0) /* zero and errors are both passed straight to the caller */ + return r; + + if (!address_pool_prefix_is_taken(p, &u, prefixlen)) { + log_debug("Found range %s", IN_ADDR_PREFIX_TO_STRING(p->family, &u, prefixlen)); + + *found = u; + return 1; + } + } + + return 0; +} +/* Walks the pools of the manager in set order and returns the first non-zero result of address_pool_acquire_one(): 1 with *found filled in, or a negative error. Returns 0 when no pool yields a range. 
*/ +int address_pool_acquire(Manager *m, int family, unsigned prefixlen, union in_addr_union *found) { + AddressPool *p; + int r; + + assert(m); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(prefixlen > 0); + assert(found); + + ORDERED_SET_FOREACH(p, m->address_pools) { + r = address_pool_acquire_one(p, family, prefixlen, found); + if (r != 0) + return r; + } + + return 0; +} diff --git a/src/network/networkd-address-pool.h b/src/network/networkd-address-pool.h new file mode 100644 index 0000000..93bdec8 --- /dev/null +++ b/src/network/networkd-address-pool.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "in-addr-util.h" + +typedef struct Manager Manager; +/* An address range (in_addr/prefixlen of the given family) from which sub-ranges are handed out by address_pool_acquire(). */ +typedef struct AddressPool { + Manager *manager; + + int family; + unsigned prefixlen; + union in_addr_union in_addr; +} AddressPool; + +int address_pool_setup_default(Manager *m); +int address_pool_acquire(Manager *m, int family, unsigned prefixlen, union in_addr_union *found); diff --git a/src/network/networkd-address.c b/src/network/networkd-address.c new file mode 100644 index 
0000000..0e4d87b --- /dev/null +++ b/src/network/networkd-address.c @@ -0,0 +1,2566 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "firewall-util.h" +#include "logarithm.h" +#include "memory-util.h" +#include "netlink-util.h" +#include "networkd-address-pool.h" +#include "networkd-address.h" +#include "networkd-dhcp-server.h" +#include "networkd-ipv4acd.h" +#include "networkd-manager.h" +#include "networkd-netlabel.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "networkd-route-util.h" +#include "networkd-route.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +/* Size limits. STATIC_ADDRESSES_PER_NETWORK_MAX caps the addresses_by_section map in address_new_static(). */ +#define ADDRESSES_PER_LINK_MAX 2048U +#define STATIC_ADDRESSES_PER_NETWORK_MAX 1024U +/* The IFA_F_* flags that have a name in address_flags_to_string_alloc(). */ +#define KNOWN_FLAGS \ + (IFA_F_SECONDARY | \ + IFA_F_NODAD | \ + IFA_F_OPTIMISTIC | \ + IFA_F_DADFAILED | \ + IFA_F_HOMEADDRESS | \ + IFA_F_DEPRECATED | \ + IFA_F_TENTATIVE | \ + IFA_F_PERMANENT | \ + IFA_F_MANAGETEMPADDR | \ + IFA_F_NOPREFIXROUTE | \ + IFA_F_MCAUTOJOIN | \ + IFA_F_STABLE_PRIVACY) + +/* From net/ipv4/devinet.c */ +#define IPV6ONLY_FLAGS \ + (IFA_F_NODAD | \ + IFA_F_OPTIMISTIC | \ + IFA_F_DADFAILED | \ + IFA_F_HOMEADDRESS | \ + IFA_F_TENTATIVE | \ + IFA_F_MANAGETEMPADDR | \ + IFA_F_STABLE_PRIVACY) + +/* We do not control the following flags. */ +#define UNMANAGED_FLAGS \ + (IFA_F_SECONDARY | \ + IFA_F_DADFAILED | \ + IFA_F_DEPRECATED | \ + IFA_F_TENTATIVE | \ + IFA_F_PERMANENT | \ + IFA_F_STABLE_PRIVACY) +/* Renders the named bits set in flags as a comma separated list in a newly allocated string stored in *ret. Returns 0 on success and -ENOMEM if the string cannot be extended. 
*/ +int address_flags_to_string_alloc(uint32_t flags, int family, char **ret) { + _cleanup_free_ char *str = NULL; + static const char* map[] = { + [LOG2U(IFA_F_SECONDARY)] = "secondary", /* This is also called "temporary" for ipv6. 
*/ + [LOG2U(IFA_F_NODAD)] = "nodad", + [LOG2U(IFA_F_OPTIMISTIC)] = "optimistic", + [LOG2U(IFA_F_DADFAILED)] = "dadfailed", + [LOG2U(IFA_F_HOMEADDRESS)] = "home-address", + [LOG2U(IFA_F_DEPRECATED)] = "deprecated", + [LOG2U(IFA_F_TENTATIVE)] = "tentative", + [LOG2U(IFA_F_PERMANENT)] = "permanent", + [LOG2U(IFA_F_MANAGETEMPADDR)] = "manage-temporary-address", + [LOG2U(IFA_F_NOPREFIXROUTE)] = "no-prefixroute", + [LOG2U(IFA_F_MCAUTOJOIN)] = "auto-join", + [LOG2U(IFA_F_STABLE_PRIVACY)] = "stable-privacy", + }; + + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(ret); + /* Bit i of flags is described by map[i]; bits without an entry in the table are skipped. */ + for (size_t i = 0; i < ELEMENTSOF(map); i++) + if (FLAGS_SET(flags, 1 << i) && map[i]) + if (!strextend_with_separator( + &str, ",", + family == AF_INET6 && (1 << i) == IFA_F_SECONDARY ? "temporary" : map[i])) + return -ENOMEM; + /* str is still NULL at this point when no named flag was set. */ + *ret = TAKE_PTR(str); + return 0; +} +/* Maps an address scope value to a LinkAddressState: below RT_SCOPE_SITE is routable, below RT_SCOPE_HOST is degraded, everything else is off. */ +static LinkAddressState address_state_from_scope(uint8_t scope) { + if (scope < RT_SCOPE_SITE) + /* universally accessible addresses found */ + return LINK_ADDRESS_STATE_ROUTABLE; + + if (scope < RT_SCOPE_HOST) + /* only link or site local addresses found */ + return LINK_ADDRESS_STATE_DEGRADED; + + /* no useful addresses found */ + return LINK_ADDRESS_STATE_OFF; +} +/* Derives the IPv4, IPv6 and combined address states of the link from the smallest scope value among its ready addresses. Each of the three output pointers may be NULL. 
*/ +void link_get_address_states( + Link *link, + LinkAddressState *ret_ipv4, + LinkAddressState *ret_ipv6, + LinkAddressState *ret_all) { + + uint8_t ipv4_scope = RT_SCOPE_NOWHERE, ipv6_scope = RT_SCOPE_NOWHERE; + Address *address; + + assert(link); + + SET_FOREACH(address, link->addresses) { + if (!address_is_ready(address)) + continue; + + if (address->family == AF_INET) + ipv4_scope = MIN(ipv4_scope, address->scope); + + if (address->family == AF_INET6) + ipv6_scope = MIN(ipv6_scope, address->scope); + } + + if (ret_ipv4) + *ret_ipv4 = address_state_from_scope(ipv4_scope); + if (ret_ipv6) + *ret_ipv6 = address_state_from_scope(ipv6_scope); + if (ret_all) + *ret_all = address_state_from_scope(MIN(ipv4_scope, ipv6_scope)); +} +/* Allocates an Address with default field values and stores it in *ret. Returns 0 on success, -ENOMEM on allocation failure. */ +int address_new(Address **ret) { +
_cleanup_(address_freep) Address *address = NULL; + + address = new(Address, 1); + if (!address) + return -ENOMEM; + + *address = (Address) { + .family = AF_UNSPEC, + .scope = RT_SCOPE_UNIVERSE, + .lifetime_valid_usec = USEC_INFINITY, + .lifetime_preferred_usec = USEC_INFINITY, + .set_broadcast = -1, /* negative: neither explicitly enabled nor disabled, see address_get_broadcast() */ + }; + + *ret = TAKE_PTR(address); + + return 0; +} +/* Returns in *ret the Address registered for the config section (filename, section_line) of the network, creating and registering a new one with source NETWORK_CONFIG_SOURCE_STATIC when none exists. Fails with -E2BIG once the network already holds STATIC_ADDRESSES_PER_NETWORK_MAX entries. 
*/ +int address_new_static(Network *network, const char *filename, unsigned section_line, Address **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(address_freep) Address *address = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + /* Reuse the address already registered for this section, if there is one. */ + address = ordered_hashmap_get(network->addresses_by_section, n); + if (address) { + *ret = TAKE_PTR(address); + return 0; + } + + if (ordered_hashmap_size(network->addresses_by_section) >= STATIC_ADDRESSES_PER_NETWORK_MAX) + return -E2BIG; + + r = address_new(&address); + if (r < 0) + return r; + + address->network = network; + address->section = TAKE_PTR(n); + address->source = NETWORK_CONFIG_SOURCE_STATIC; + /* This will be adjusted in address_section_verify(). 
*/ + address->duplicate_address_detection = _ADDRESS_FAMILY_INVALID; + + r = ordered_hashmap_ensure_put(&network->addresses_by_section, &config_section_hash_ops, address->section, address); + if (r < 0) + return r; + + *ret = TAKE_PTR(address); + return 0; +} +/* Detaches the address from its network (by-section map) and from its link (address set, cached IPv6 link-local address if it equals this one, IPv4 ACD), then releases all owned members and the object itself. A NULL argument is accepted and yields NULL; otherwise the result of mfree() is returned. */ +Address *address_free(Address *address) { + if (!address) + return NULL; + + if (address->network) { + assert(address->section); + ordered_hashmap_remove(address->network->addresses_by_section, address->section); + } + + if (address->link) { + set_remove(address->link->addresses, address); + + if (address->family == AF_INET6 && + in6_addr_equal(&address->in_addr.in6, &address->link->ipv6ll_address)) + memzero(&address->link->ipv6ll_address, sizeof(struct in6_addr)); + + ipv4acd_detach(address->link, address); + } + + config_section_free(address->section); + free(address->label); + free(address->netlabel); + nft_set_context_clear(&address->nft_set_context); + return mfree(address); +} +/* Returns true while the valid lifetime is infinite or lies after the current CLOCK_BOOTTIME time. */ +static bool address_lifetime_is_valid(const Address *a) { + assert(a); + + return + a->lifetime_valid_usec == USEC_INFINITY || + a->lifetime_valid_usec > now(CLOCK_BOOTTIME); +} +/* Returns true if ipv4acd_bound() holds for the address, it is not tentative, it is not being removed, it is in the configured state and its valid lifetime has not expired. The address must be attached to a link. 
*/ +bool address_is_ready(const Address *a) { + assert(a); + assert(a->link); + + if (!ipv4acd_bound(a->link, a)) + return false; + + if (FLAGS_SET(a->flags, IFA_F_TENTATIVE)) + return false; + + if (FLAGS_SET(a->state, NETWORK_CONFIG_STATE_REMOVING)) + return false; + + if (!FLAGS_SET(a->state, NETWORK_CONFIG_STATE_CONFIGURED)) + return false; + + return address_lifetime_is_valid(a); +} +/* A negative source value selects the addresses of every source; otherwise only addresses with exactly that source are considered. */ +bool link_check_addresses_ready(Link *link, NetworkConfigSource source) { + Address *a; + bool has = false; + + assert(link); + + /* Check if all addresses on the interface are ready. If there is no address, this will return false. 
*/ + + SET_FOREACH(a, link->addresses) { + if (source >= 0 && a->source != source) + continue; + if (address_is_marked(a)) + continue; + if (!address_exists(a)) + continue; + if (!address_is_ready(a)) + return false; + has = true; /* at least one considered address exists and is ready */ + } + + return has; +} +/* Calls address_mark() on every address of the link whose source equals the given one. */ +void link_mark_addresses(Link *link, NetworkConfigSource source) { + Address *a; + + assert(link); + + SET_FOREACH(a, link->addresses) { + if (a->source != source) + continue; + + address_mark(a); + } +} +/* Determines the IPv4 broadcast address to use for a on link. If ret is not NULL it receives the result (the null address when there is none); ret is left untouched on the -EAGAIN path. */ +static int address_get_broadcast(const Address *a, Link *link, struct in_addr *ret) { + struct in_addr b_addr = {}; + + assert(a); + assert(link); + + /* Returns 0 when broadcast address is null, 1 when non-null broadcast address, -EAGAIN when the main + * address is null. 
*/ + + /* broadcast is only for IPv4. */ + if (a->family != AF_INET) + goto finalize; + + /* broadcast address cannot be used when peer address is specified. */ + if (in4_addr_is_set(&a->in_addr_peer.in)) + goto finalize; + + /* A /31 or /32 IPv4 address does not have a broadcast address. + * See https://tools.ietf.org/html/rfc3021 */ + if (a->prefixlen > 30) + goto finalize; + + /* If explicitly configured, use the address as is. */ + if (in4_addr_is_set(&a->broadcast)) { + b_addr = a->broadcast; + goto finalize; + } + + /* If explicitly disabled, then return null address. */ + if (a->set_broadcast == 0) + goto finalize; + + /* For wireguard interfaces, broadcast is disabled by default. */ + if (a->set_broadcast < 0 && streq_ptr(link->kind, "wireguard")) + goto finalize; + + /* If the main address is null, e.g. Address=0.0.0.0/24, the broadcast address will be automatically + * determined after an address is acquired. */ + if (!in4_addr_is_set(&a->in_addr.in)) + return -EAGAIN; + + /* Otherwise, generate a broadcast address from the main address and prefix length. 
*/ + b_addr.s_addr = a->in_addr.in.s_addr | htobe32(UINT32_C(0xffffffff) >> a->prefixlen); + +finalize: + if (ret) + *ret = b_addr; + + return in4_addr_is_set(&b_addr); +} +/* Stores the result of address_get_broadcast() in a->broadcast. The assert_se() aborts if that call fails, which in the visible code happens only with -EAGAIN for a null main address. */ +static void address_set_broadcast(Address *a, Link *link) { + assert(a); + assert_se(address_get_broadcast(a, link, &a->broadcast) >= 0); +} +/* Fills cinfo from the valid and preferred lifetimes of the address via usec_to_sec(), using the CLOCK_BOOTTIME timestamp of the event loop of the manager. */ +static void address_set_cinfo(Manager *m, const Address *a, struct ifa_cacheinfo *cinfo) { + usec_t now_usec; + + assert(m); + assert(a); + assert(cinfo); + + assert_se(sd_event_now(m->event, CLOCK_BOOTTIME, &now_usec) >= 0); + + *cinfo = (struct ifa_cacheinfo) { + .ifa_valid = usec_to_sec(a->lifetime_valid_usec, now_usec), + .ifa_prefered = usec_to_sec(a->lifetime_preferred_usec, now_usec), + }; +} +/* Counterpart of address_set_cinfo(): sets the valid and preferred lifetimes of the address from cinfo via sec_to_usec(), using the CLOCK_BOOTTIME timestamp of the event loop of the manager. */ +static void address_set_lifetime(Manager *m, Address *a, const struct ifa_cacheinfo *cinfo) { + usec_t now_usec; + + assert(m); + assert(a); + assert(cinfo); + + assert_se(sd_event_now(m->event, CLOCK_BOOTTIME, &now_usec) >= 0); + + a->lifetime_valid_usec = sec_to_usec(cinfo->ifa_valid, now_usec); + a->lifetime_preferred_usec = sec_to_usec(cinfo->ifa_prefered, now_usec); +} +/* Returns true if the address belongs to a network configuration and was requested as a null address; in that case its in_addr is asserted to be unset. 
*/ +static bool address_is_static_null(const Address *address) { + assert(address); + + if (!address->network) + return false; + + if (!address->requested_as_null) + return false; + + assert(!in_addr_is_set(address->family, &address->in_addr)); + return true; +} +/* Computes the IPv4 prefix of the address: the peer address if one is set, otherwise the local address, masked with in4_addr_mask() to prefixlen. Returns 0 and stores the prefix in *ret, or the negative error from in4_addr_mask(). */ +static int address_ipv4_prefix(const Address *a, struct in_addr *ret) { + struct in_addr p; + int r; + + assert(a); + assert(a->family == AF_INET); + assert(ret); + + p = in4_addr_is_set(&a->in_addr_peer.in) ? 
a->in_addr_peer.in : a->in_addr.in; + r = in4_addr_mask(&p, a->prefixlen); + if (r < 0) + return r; + + *ret = p; + return 0; +} +/* Hashes the same fields that address_compare_func() compares: the family; for IPv4 the prefix length, the masked prefix and the address; for IPv6 the address, plus the prefix length only when the address is null. */ +static void address_hash_func(const Address *a, struct siphash *state) { + assert(a); + + siphash24_compress(&a->family, sizeof(a->family), state); + + switch (a->family) { + case AF_INET: { + struct in_addr prefix; + + siphash24_compress(&a->prefixlen, sizeof(a->prefixlen), state); + + assert_se(address_ipv4_prefix(a, &prefix) >= 0); + siphash24_compress(&prefix, sizeof(prefix), state); + + siphash24_compress(&a->in_addr.in, sizeof(a->in_addr.in), state); + break; + } + case AF_INET6: + siphash24_compress(&a->in_addr.in6, sizeof(a->in_addr.in6), state); + + if (in6_addr_is_null(&a->in_addr.in6)) + siphash24_compress(&a->prefixlen, sizeof(a->prefixlen), state); + break; + + default: + /* treat any other address family as AF_UNSPEC */ + break; + } +} +/* Comparison function paired with address_hash_func(). Returns 0 when both addresses count as the same address, a non-zero ordering value otherwise. 
*/ +static int address_compare_func(const Address *a1, const Address *a2) { + int r; + + r = CMP(a1->family, a2->family); + if (r != 0) + return r; + + switch (a1->family) { + case AF_INET: { + struct in_addr p1, p2; + + /* See kernel's find_matching_ifa() in net/ipv4/devinet.c */ + r = CMP(a1->prefixlen, a2->prefixlen); + if (r != 0) + return r; + + assert_se(address_ipv4_prefix(a1, &p1) >= 0); + assert_se(address_ipv4_prefix(a2, &p2) >= 0); + r = memcmp(&p1, &p2, sizeof(p1)); + if (r != 0) + return r; + + return memcmp(&a1->in_addr.in, &a2->in_addr.in, sizeof(a1->in_addr.in)); + } + case AF_INET6: + /* See kernel's ipv6_get_ifaddr() in net/ipv6/addrconf.c */ + r = memcmp(&a1->in_addr.in6, &a2->in_addr.in6, sizeof(a1->in_addr.in6)); + if (r != 0) + return r; + + /* To distinguish IPv6 null addresses with different prefixlen, e.g. ::/48 vs ::/64, let's + * compare the prefix length. 
*/ + if (in6_addr_is_null(&a1->in_addr.in6)) + r = CMP(a1->prefixlen, a2->prefixlen); + + return r; + + default: + /* treat any other address family as AF_UNSPEC */ + return 0; + } +} +/* Hash operations for sets and maps keyed by Address, built from the two functions above. */ +DEFINE_HASH_OPS( + address_hash_ops, + Address, + address_hash_func, + address_compare_func); +/* Same hash and compare functions, with address_free() passed as the key destructor. */ +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + address_hash_ops_free, + Address, + address_hash_func, + address_compare_func, + address_free); +/* Decides whether the existing address la (attached to a link) can be updated in place to match the requested address na (taken from a network configuration), following the table below. 
*/ +static bool address_can_update(const Address *la, const Address *na) { + assert(la); + assert(la->link); + assert(na); + assert(na->network); + + /* + * property | IPv4 | IPv6 + * ----------------------------------------- + * family | ✗ | ✗ + * prefixlen | ✗ | ✗ + * address | ✗ | ✗ + * scope | ✗ | - + * label | ✗ | - + * broadcast | ✗ | - + * peer | ✗ | ✓ + * flags | ✗ | ✓ + * lifetime | ✓ | ✓ + * route metric | ✓ | ✓ + * protocol | ✓ | ✓ + * + * ✗ : cannot be changed + * ✓ : can be changed + * - : unused + * + * IPv4 : See inet_rtm_newaddr() in net/ipv4/devinet.c. + * IPv6 : See inet6_addr_modify() in net/ipv6/addrconf.c. + */ + + if (la->family != na->family) + return false; + + if (la->prefixlen != na->prefixlen) + return false; + + /* When a null address is requested, the address to be assigned/updated will be determined later. */ + if (!address_is_static_null(na) && + in_addr_equal(la->family, &la->in_addr, &na->in_addr) <= 0) + return false; + + switch (la->family) { + case AF_INET: { + struct in_addr bcast; + + if (la->scope != na->scope) + return false; + /* Only flags that are known, apply to IPv4 and are managed here take part in the comparison. */ + if (((la->flags ^ na->flags) & KNOWN_FLAGS & ~IPV6ONLY_FLAGS & ~UNMANAGED_FLAGS) != 0) + return false; + if (!streq_ptr(la->label, na->label)) + return false; + if (!in4_addr_equal(&la->in_addr_peer.in, &na->in_addr_peer.in)) + return false; + if (address_get_broadcast(na, la->link, &bcast) >= 0) { + /* If the broadcast address can be determined now, check if they match. 
*/ + if (!in4_addr_equal(&la->broadcast, &bcast)) + return false; + } else { + /* When a null address is requested, then the broadcast address will be + * automatically calculated from the acquired address, e.g. + * 192.168.0.10/24 -> 192.168.0.255 + * So, here let's only check if the broadcast is the last address in the range, e.g. + * 0.0.0.0/24 -> 0.0.0.255 */ + if (!FLAGS_SET(la->broadcast.s_addr, htobe32(UINT32_C(0xffffffff) >> la->prefixlen))) + return false; + } + break; + } + case AF_INET6: + break; + + default: + assert_not_reached(); + } + + return true; +} +/* Creates a copy of src in *ret that is not attached to any network, section or link. The label (IPv4 only), the netlabel and the NFT set context are duplicated; all other fields are copied as they are. Returns 0 on success, a negative errno-style value on failure. */ +int address_dup(const Address *src, Address **ret) { + _cleanup_(address_freep) Address *dest = NULL; + int r; + + assert(src); + assert(ret); + + dest = newdup(Address, src, 1); /* NOTE(review): presumably a byte-wise copy, so the pointer members still refer to the data of src until they are reset below */ + if (!dest) + return -ENOMEM; + + /* clear all pointers */ + dest->network = NULL; + dest->section = NULL; + dest->link = NULL; + dest->label = NULL; + dest->netlabel = NULL; + dest->nft_set_context.sets = NULL; + dest->nft_set_context.n_sets = 0; + + if (src->family == AF_INET) { + r = free_and_strdup(&dest->label, src->label); + if (r < 0) + return r; + } + + r = free_and_strdup(&dest->netlabel, src->netlabel); + if (r < 0) + return r; + + r = nft_set_context_dup(&src->nft_set_context, &dest->nft_set_context); + if (r < 0) + return r; + + *ret = TAKE_PTR(dest); + return 0; +} +/* Adds (add is true) or removes the masquerade entry for the masked prefix of the address via fw_add_masquerade(). 
Does nothing and returns 0 when the link has no network, masquerading is not enabled for the address family, the scope is RT_SCOPE_LINK or higher, or ip_masquerade_done already equals add. */ +static int address_set_masquerade(Address *address, bool add) { + union in_addr_union masked; + int r; + + assert(address); + assert(address->link); + + if (!address->link->network) + return 0; + + if (address->family == AF_INET && + !FLAGS_SET(address->link->network->ip_masquerade, ADDRESS_FAMILY_IPV4)) + return 0; + + if (address->family == AF_INET6 && + !FLAGS_SET(address->link->network->ip_masquerade, ADDRESS_FAMILY_IPV6)) + return 0; + + if (address->scope >= RT_SCOPE_LINK) + return 0; + + if (address->ip_masquerade_done == add) + return 0; + + masked = address->in_addr; + r = in_addr_mask(address->family, &masked, address->prefixlen); + if 
(r < 0) + return r; + + r = fw_add_masquerade(&address->link->manager->fw_ctx, add, address->family, &masked, address->prefixlen); + if (r < 0) + return r; + + address->ip_masquerade_done = add; + + return 0; +} +/* Adds or removes an element in every NFT set listed in nft_set_context. Depending on the source of each set the element is the address, the address with its prefix length, or the interface index of the link. Failures are logged and otherwise ignored. 
*/ +static void address_modify_nft_set_context(Address *address, bool add, NFTSetContext *nft_set_context) { + int r; + + assert(address); + assert(address->link); + assert(address->link->manager); + assert(nft_set_context); + /* Create the firewall context of the manager on first use; give up silently if that fails. */ + if (!address->link->manager->fw_ctx) { + r = fw_ctx_new_full(&address->link->manager->fw_ctx, /* init_tables= */ false); + if (r < 0) + return; + } + + FOREACH_ARRAY(nft_set, nft_set_context->sets, nft_set_context->n_sets) { + uint32_t ifindex; + + assert(nft_set); + + switch (nft_set->source) { + case NFT_SET_SOURCE_ADDRESS: + r = nft_set_element_modify_ip(address->link->manager->fw_ctx, add, nft_set->nfproto, address->family, nft_set->table, nft_set->set, + &address->in_addr); + break; + case NFT_SET_SOURCE_PREFIX: + r = nft_set_element_modify_iprange(address->link->manager->fw_ctx, add, nft_set->nfproto, address->family, nft_set->table, nft_set->set, + &address->in_addr, address->prefixlen); + break; + case NFT_SET_SOURCE_IFINDEX: + ifindex = address->link->ifindex; + r = nft_set_element_modify_any(address->link->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, + &ifindex, sizeof(ifindex)); + break; + default: + assert_not_reached(); + } + + if (r < 0) + log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, IP address %s, ignoring", + add? "add" : "delete", + nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); + else + log_debug("%s NFT set: family %s, table %s, set %s, IP address %s", + add ? 
"Added" : "Deleted", + nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); + } +} +/* Picks the NFT set context that matches the source of the address and applies the change to it: the DHCPv4, DHCPv6, DHCP-PD and NDisc contexts are taken from the network of the link, the context of a static address from the address itself. Other sources, other families and links without a network are ignored. */ +static void address_modify_nft_set(Address *address, bool add) { + assert(address); + assert(address->link); + + if (!IN_SET(address->family, AF_INET, AF_INET6)) + return; + + if (!address->link->network) + return; + + switch (address->source) { + case NETWORK_CONFIG_SOURCE_DHCP4: + return address_modify_nft_set_context(address, add, &address->link->network->dhcp_nft_set_context); + case NETWORK_CONFIG_SOURCE_DHCP6: + return address_modify_nft_set_context(address, add, &address->link->network->dhcp6_nft_set_context); + case NETWORK_CONFIG_SOURCE_DHCP_PD: + return address_modify_nft_set_context(address, add, &address->link->network->dhcp_pd_nft_set_context); + case NETWORK_CONFIG_SOURCE_NDISC: + return address_modify_nft_set_context(address, add, &address->link->network->ndisc_nft_set_context); + case NETWORK_CONFIG_SOURCE_STATIC: + return address_modify_nft_set_context(address, add, &address->nft_set_context); + default: + return; + } +} +/* Inserts the address into link->addresses and records the link in the address. Returns 0 on success, -EEXIST when set_ensure_put() reports that nothing was inserted, or its negative error. 
*/ +static int address_add(Link *link, Address *address) { + int r; + + assert(link); + assert(address); + + r = set_ensure_put(&link->addresses, &address_hash_ops_free, address); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + address->link = link; + return 0; +} +/* Follow-up work for an address attached to a link: records a ready IPv6 link-local address on the link if none is stored yet, and, unless the link is failed or lingering, enables masquerading, adds the netlabel and NFT set elements, invokes the callback of a ready address, and finally refreshes the operational state and readiness of the link. */ +static int address_update(Address *address) { + Link *link = ASSERT_PTR(ASSERT_PTR(address)->link); + int r; + + if (address_is_ready(address) && + address->family == AF_INET6 && + in6_addr_is_link_local(&address->in_addr.in6) && + in6_addr_is_null(&link->ipv6ll_address)) { + + link->ipv6ll_address = address->in_addr.in6; + + r = link_ipv6ll_gained(link); + if (r < 0) + return r; + } + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + r = address_set_masquerade(address, /* add = */ true); + if (r < 0) + return log_link_warning_errno(link, r, 
"Could not enable IP masquerading: %m"); + + address_add_netlabel(address); + + address_modify_nft_set(address, /* add = */ true); + + if (address_is_ready(address) && address->callback) { + r = address->callback(address); + if (r < 0) + return r; + } + + link_update_operstate(link, /* also_update_master = */ true); + link_check_ready(link); + return 0; +} +/* Reverts masquerading, NFT set elements and the netlabel for the address, frees it, and then refreshes the operational state and readiness of its link. A masquerading failure is only logged. Always returns 0. */ +static int address_drop(Address *address) { + Link *link = ASSERT_PTR(ASSERT_PTR(address)->link); + int r; + + r = address_set_masquerade(address, /* add = */ false); + if (r < 0) + log_link_warning_errno(link, r, "Failed to disable IP masquerading, ignoring: %m"); + + address_modify_nft_set(address, /* add = */ false); + + address_del_netlabel(address); + + address_free(address); /* the link pointer was saved in a local above, so it can still be used below */ + + link_update_operstate(link, /* also_update_master = */ true); + link_check_ready(link); + return 0; +} +/* Returns true if a was requested as a null address, has source foreign or static, and has the same family and prefix length as null_address. */ +static bool address_match_null(const Address *a, const Address *null_address) { + assert(a); + assert(null_address); + + if (!a->requested_as_null) + return false; + + /* Currently, null address is supported only by static addresses. Note that static + * address may be set as foreign during reconfiguring the interface. 
*/ + if (!IN_SET(a->source, NETWORK_CONFIG_SOURCE_FOREIGN, NETWORK_CONFIG_SOURCE_STATIC)) + return false; + + if (a->family != null_address->family) + return false; + + if (a->prefixlen != null_address->prefixlen) + return false; + + return true; +} +/* Finds the queued REQUEST_TYPE_ADDRESS request of this link that matches the address. If the direct lookup fails and the address is a static null address, the queue is scanned with address_match_null() instead. Returns 0 and stores the request in *ret (which may be NULL), or -ENOENT. */ +static int address_get_request(Link *link, const Address *address, Request **ret) { + Request *req; + + assert(link); + assert(link->manager); + assert(address); + /* Look up with a temporary Request used as key, carrying the address hash and compare functions. */ + req = ordered_set_get( + link->manager->request_queue, + &(Request) { + .link = link, + .type = REQUEST_TYPE_ADDRESS, + .userdata = (void*) address, + .hash_func = (hash_func_t) address_hash_func, + .compare_func = (compare_func_t) address_compare_func, + }); + if (req) { + if (ret) + *ret = req; + return 0; + } + + if (address_is_static_null(address)) + ORDERED_SET_FOREACH(req, link->manager->request_queue) { + if (req->link != link) + continue; + if (req->type != REQUEST_TYPE_ADDRESS) + continue; + + if (!address_match_null(req->userdata, address)) + continue; + + if (ret) + *ret = req; + + return 0; + } + + return -ENOENT; +} +/* Looks up the address on the link that equals in according to the hash operations of link->addresses. Returns 0 and stores it in *ret (which may be NULL), or -ENOENT when nothing matches. */ +int address_get(Link *link, const Address *in, Address **ret) { + Address *a; + + assert(link); + assert(in); + + a = set_get(link->addresses, in); + if (a) { + if (ret) + *ret = a; + return 0; + } + + /* Find a matching address that was originally requested as a null address. 
*/ + if (address_is_static_null(in)) + SET_FOREACH(a, link->addresses) { + if (!address_match_null(a, in)) + continue; + + if (ret) + *ret = a; + return 0; + } + + return -ENOENT; +} + /* Like address_get(), but when the link has no matching Address object it falls back to the Address stored as userdata of a matching queued request. Returns 0 on success, or the error from address_get_request(). */ +int address_get_harder(Link *link, const Address *in, Address **ret) { + Request *req; + int r; + + assert(link); + assert(in); + + if (address_get(link, in, ret) >= 0) + return 0; + + r = address_get_request(link, in, &req); + if (r < 0) + return r; + + if (ret) + *ret = ASSERT_PTR(req->userdata); + + return 0; +} + /* Looks up an address on the link by family, address and prefix length; see the comment in the body for the matching rules. Returns 0 on success (storing the object in *ret when ret is non-NULL), or a negative errno such as -ENOENT. */ +int link_get_address(Link *link, int family, const union in_addr_union *address, unsigned char prefixlen, Address **ret) { + Address *a; + int r; + + assert(link); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + /* This finds an Address object on the link which matches the given address and prefix length + * and does not have a peer address. When the prefixlen is zero, then an Address object with an + * arbitrary prefixlen will be returned. */ + + if (family == AF_INET6 || prefixlen != 0) { + _cleanup_(address_freep) Address *tmp = NULL; + + /* In this case, we can use address_get(). */ + + r = address_new(&tmp); + if (r < 0) + return r; + /* Build a temporary lookup key; the peer address is never assigned on it. */ + tmp->family = family; + tmp->in_addr = *address; + tmp->prefixlen = prefixlen; + + r = address_get(link, tmp, &a); + if (r < 0) + return r; + + if (family == AF_INET6) { + /* IPv6 addresses are managed without peer address and prefix length. Hence, we need + * to check them explicitly. 
*/ + if (in_addr_is_set(family, &a->in_addr_peer)) + return -ENOENT; + if (prefixlen != 0 && a->prefixlen != prefixlen) + return -ENOENT; + } + + if (ret) + *ret = a; + + return 0; + } + /* Only reached for AF_INET with prefixlen == 0: take the first address without peer whose family and address match, whatever its prefix length. */ + SET_FOREACH(a, link->addresses) { + if (a->family != family) + continue; + + if (!in_addr_equal(family, &a->in_addr, address)) + continue; + + if (in_addr_is_set(family, &a->in_addr_peer)) + continue; + + if (ret) + *ret = a; + + return 0; + } + + return -ENOENT; +} + /* Runs link_get_address() on every link in LINK_STATE_CONFIGURING or LINK_STATE_CONFIGURED and returns 0 on the first hit; -ENOENT when no such link has the address. */ +int manager_get_address(Manager *manager, int family, const union in_addr_union *address, unsigned char prefixlen, Address **ret) { + Link *link; + + assert(manager); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + HASHMAP_FOREACH(link, manager->links_by_index) { + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + continue; + + if (link_get_address(link, family, address, prefixlen, ret) >= 0) + return 0; + } + + return -ENOENT; +} + /* Returns whether manager_get_address() finds the address (looked up with prefixlen 0) and the found object passes a state check: address_is_ready() when check_ready is true, otherwise address_exists() together with address_lifetime_is_valid(). */ +bool manager_has_address(Manager *manager, int family, const union in_addr_union *address, bool check_ready) { + Address *a; + + assert(manager); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + if (manager_get_address(manager, family, address, 0, &a) < 0) + return false; + + return check_ready ? 
address_is_ready(a) : (address_exists(a) && address_lifetime_is_valid(a)); +} + /* Formats a lifetime for log output. Returns the literal "forever" for USEC_INFINITY; otherwise writes "for " followed by the formatted span between now(CLOCK_BOOTTIME) and lifetime_usec into buf and returns buf. The buffer length l must be greater than 4. */ +const char* format_lifetime(char *buf, size_t l, usec_t lifetime_usec) { + assert(buf); + assert(l > 4); + + if (lifetime_usec == USEC_INFINITY) + return "forever"; + /* The four characters of the prefix are why the timespan is written at buf + 4 with l - 4 bytes available. */ + sprintf(buf, "for "); + /* format_timespan() never fails */ + assert_se(format_timespan(buf + 4, l - 4, usec_sub_unsigned(lifetime_usec, now(CLOCK_BOOTTIME)), USEC_PER_SEC)); + return buf; +} + /* Emits one debug log line describing the address: the caller's verb 'str', source, state, address with optional peer, prefix length, optional broadcast, both lifetimes, flags, scope and, for AF_INET only, the label. Returns early when DEBUG_LOGGING is false. */ +static void log_address_debug(const Address *address, const char *str, const Link *link) { + _cleanup_free_ char *state = NULL, *flags_str = NULL, *scope_str = NULL; + + assert(address); + assert(str); + assert(link); + + if (!DEBUG_LOGGING) + return; + /* Failures of the *_to_string_alloc() helpers are ignored; strna() is applied to the results in the log call below. */ + (void) network_config_state_to_string_alloc(address->state, &state); + + const char *peer = in_addr_is_set(address->family, &address->in_addr_peer) ? + IN_ADDR_TO_STRING(address->family, &address->in_addr_peer) : NULL; + + const char *broadcast = (address->family == AF_INET && in4_addr_is_set(&address->broadcast)) ? + IN4_ADDR_TO_STRING(&address->broadcast) : NULL; + + (void) address_flags_to_string_alloc(address->flags, address->family, &flags_str); + (void) route_scope_to_string_alloc(address->scope, &scope_str); + + log_link_debug(link, "%s %s address (%s): %s%s%s/%u%s%s (valid %s, preferred %s), flags: %s, scope: %s%s%s", + str, strna(network_config_source_to_string(address->source)), strna(state), + IN_ADDR_TO_STRING(address->family, &address->in_addr), + peer ? " peer " : "", strempty(peer), address->prefixlen, + broadcast ? " broadcast " : "", strempty(broadcast), + FORMAT_LIFETIME(address->lifetime_valid_usec), + FORMAT_LIFETIME(address->lifetime_preferred_usec), + strna(flags_str), strna(scope_str), + address->family == AF_INET ? ", label: " : "", + address->family == AF_INET ? 
strna(address->label) : ""); +} + /* Fills in the message fields used by both address_remove() and address_configure(): prefix length, flags (without IFA_F_TENTATIVE) and the local address as IFA_LOCAL. Returns 0 on success or the negative errno of the failing call. */ +static int address_set_netlink_message(const Address *address, sd_netlink_message *m, Link *link) { + uint32_t flags; + int r; + + assert(address); + assert(m); + assert(link); + + r = sd_rtnl_message_addr_set_prefixlen(m, address->prefixlen); + if (r < 0) + return r; + + /* On remove, only the IFA_F_MANAGETEMPADDR flag for IPv6 addresses is used. But anyway, set all + * flags except tentative flag here unconditionally. Without setting the flag, the template + * addresses generated by kernel will not be removed automatically when the main address is + * removed. */ + flags = address->flags & ~IFA_F_TENTATIVE; + r = sd_rtnl_message_addr_set_flags(m, flags & 0xff); + if (r < 0) + return r; + /* Only the low 8 bits were passed above; when higher bits are set, the full 32-bit value is sent as an IFA_FLAGS attribute as well. */ + if ((flags & ~0xff) != 0) { + r = sd_netlink_message_append_u32(m, IFA_FLAGS, flags); + if (r < 0) + return r; + } + + r = netlink_message_append_in_addr_union(m, IFA_LOCAL, address->family, &address->in_addr); + if (r < 0) + return r; + + return 0; +} + /* Reply handler for the RTM_DELADDR request sent by address_remove(). Returns 0 without inspecting the reply when the link is in LINK_STATE_FAILED or LINK_STATE_LINGER; otherwise logs a warning for any error other than -EADDRNOTAVAIL and returns 1. */ +static int address_remove_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + assert(link); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EADDRNOTAVAIL) + log_link_message_warning_errno(link, m, r, "Could not drop address"); + + return 1; +} + /* Sends an asynchronous RTM_DELADDR request for the address, then marks the address, and the Address of a matching queued request if there is one, as removing. Returns 0 on success or a negative errno after logging a warning. */ +int address_remove(Address *address) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + Request *req; + Link *link; + int r; + + assert(address); + assert(IN_SET(address->family, AF_INET, AF_INET6)); + assert(address->link); + assert(address->link->ifindex > 0); + assert(address->link->manager); + assert(address->link->manager->rtnl); + + link = address->link; + + log_address_debug(address, "Removing", link); + + r = sd_rtnl_message_new_addr(link->manager->rtnl, &m, RTM_DELADDR, + link->ifindex, address->family); + if (r < 0) + return log_link_warning_errno(link, r, "Could not allocate RTM_DELADDR 
message: %m"); + + r = address_set_netlink_message(address, m, link); + if (r < 0) + return log_link_warning_errno(link, r, "Could not set netlink attributes: %m"); + + r = netlink_call_async(link->manager->rtnl, NULL, m, + address_remove_handler, + link_netlink_destroy_callback, link); + if (r < 0) + return log_link_warning_errno(link, r, "Could not send rtnetlink message: %m"); + /* NOTE(review): this reference presumably belongs to the pending async call and is released by link_netlink_destroy_callback - confirm. */ + link_ref(link); + /* Mark the address, and the Address of a matching queued request if any, as removing. */ + address_enter_removing(address); + if (address_get_request(link, address, &req) >= 0) + address_enter_removing(req->userdata); + + /* The operational state is determined by address state and carrier state. Hence, if we remove + * an address, the operational state may be changed. */ + link_update_operstate(link, /* also_update_master = */ true); + return 0; +} + /* NULL-safe removal helper: cancels a pending request for the address, then calls address_remove() when address_exists() is true and address_drop() otherwise. Returns the result of whichever of the two was called, or 0 for NULL. */ +int address_remove_and_drop(Address *address) { + if (!address) + return 0; + + address_cancel_request(address); + + if (address_exists(address)) + return address_remove(address); + + return address_drop(address); +} + /* Returns true when the address has a finite preferred lifetime, or when an existing foreign route with protocol RTPROT_DHCP and the same family has this address as its prefsrc. */ +bool link_address_is_dynamic(const Link *link, const Address *address) { + Route *route; + + assert(link); + assert(address); + + if (address->lifetime_preferred_usec != USEC_INFINITY) + return true; + + /* Even when the address is leased from a DHCP server, networkd assigns the address + * without lifetime when KeepConfiguration=dhcp. So, let's check that we have + * corresponding routes with RTPROT_DHCP. */ + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* The route is not assigned yet, or already removed. Ignoring. 
*/ + if (!route_exists(route)) + continue; + + if (route->protocol != RTPROT_DHCP) + continue; + + if (address->family != route->family) + continue; + + if (in_addr_equal(address->family, &address->in_addr, &route->prefsrc)) + return true; + } + /* No existing foreign RTPROT_DHCP route has this address as its preferred source. */ + return false; +} + /* Requests an AF_INET6 address dump over netlink and calls address_remove() for every link-local address without peer that belongs to this link. Returns 0 immediately when link_may_have_ipv6ll() is true. Returns 0 on success or a negative errno. */ +int link_drop_ipv6ll_addresses(Link *link) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + int r; + + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + + /* IPv6LL address may be in the tentative state, and in that case networkd has not received it. + * So, we need to dump all IPv6 addresses. */ + /* Leave the addresses alone when the link may have an IPv6LL address. */ + if (link_may_have_ipv6ll(link, /* check_multicast = */ false)) + return 0; + + r = sd_rtnl_message_new_addr(link->manager->rtnl, &req, RTM_GETADDR, link->ifindex, AF_INET6); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(link->manager->rtnl, req, 0, &reply); + if (r < 0) + return r; + /* Walk every message of the reply; malformed entries are logged at debug level and skipped. */ + for (sd_netlink_message *addr = reply; addr; addr = sd_netlink_message_next(addr)) { + _cleanup_(address_freep) Address *a = NULL; + unsigned char flags, prefixlen; + struct in6_addr address; + Address *existing; + int ifindex; + + /* NETLINK_GET_STRICT_CHK socket option is supported since kernel 4.20. To support + * older kernels, we need to check ifindex here. 
*/ + r = sd_rtnl_message_addr_get_ifindex(addr, &ifindex); + if (r < 0) { + log_link_debug_errno(link, r, "rtnl: received address message without valid ifindex, ignoring: %m"); + continue; + } else if (link->ifindex != ifindex) + continue; + + r = sd_rtnl_message_addr_get_flags(addr, &flags); + if (r < 0) { + log_link_debug_errno(link, r, "rtnl: received address message without valid flags, ignoring: %m"); + continue; + } + + r = sd_rtnl_message_addr_get_prefixlen(addr, &prefixlen); + if (r < 0) { + log_link_debug_errno(link, r, "rtnl: received address message without prefixlen, ignoring: %m"); + continue; + } + /* A readable IFA_LOCAL attribute is taken as the sign of an address with peer. */ + if (sd_netlink_message_read_in6_addr(addr, IFA_LOCAL, NULL) >= 0) + /* address with peer, ignoring. */ + continue; + + r = sd_netlink_message_read_in6_addr(addr, IFA_ADDRESS, &address); + if (r < 0) { + log_link_debug_errno(link, r, "rtnl: received address message without valid address, ignoring: %m"); + continue; + } + + if (!in6_addr_is_link_local(&address)) + continue; + /* Build an Address object describing the dumped entry. */ + r = address_new(&a); + if (r < 0) + return -ENOMEM; + + a->family = AF_INET6; + a->in_addr.in6 = address; + a->prefixlen = prefixlen; + a->flags = flags; + /* If the link does not track this address yet, add it so that there is an object to hand to address_remove(). */ + if (address_get(link, a, &existing) < 0) { + r = address_add(link, a); + if (r < 0) + return r; + + existing = TAKE_PTR(a); + } + + r = address_remove(existing); + if (r < 0) + return r; + } + + return 0; +} + /* Removes foreign addresses of the link in three passes: mark the candidates, unmark those that match an address from the network configuration, then call address_remove() on everything still marked. Results of address_remove() are collected with RET_GATHER() into the return value. */ +int link_drop_foreign_addresses(Link *link) { + Address *address; + int r = 0; + + assert(link); + assert(link->network); + + /* First, mark all addresses. */ + SET_FOREACH(address, link->addresses) { + /* We consider IPv6LL addresses to be managed by the kernel, or dropped in link_drop_ipv6ll_addresses() */ + if (address->family == AF_INET6 && in6_addr_is_link_local(&address->in_addr.in6)) + continue; + + /* Do not remove localhost address (127.0.0.1 and ::1) */ + if (link->flags & IFF_LOOPBACK && in_addr_is_localhost_one(address->family, &address->in_addr) > 0) + continue; + + /* Ignore addresses we configured. 
*/ + if (address->source != NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore addresses not assigned yet or already removing. */ + if (!address_exists(address)) + continue; + + /* link_address_is_dynamic() is slightly heavy. Let's call the function only when KeepConfiguration= is set. */ + if (IN_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP, KEEP_CONFIGURATION_STATIC) && + link_address_is_dynamic(link, address) == (link->network->keep_configuration == KEEP_CONFIGURATION_DHCP)) + continue; + /* Reaching this point means the check above did not keep the address: with KEEP_CONFIGURATION_DHCP dynamic addresses are skipped, with KEEP_CONFIGURATION_STATIC non-dynamic ones are. */ + address_mark(address); + } + + /* Then, unmark requested addresses. */ + ORDERED_HASHMAP_FOREACH(address, link->network->addresses_by_section) { + Address *existing; + + if (address_get(link, address, &existing) < 0) + continue; + + if (!address_can_update(existing, address)) + continue; + + /* Found matching static configuration. Keep the existing address. */ + address_unmark(existing); + } + + /* Finally, remove all marked addresses. */ + SET_FOREACH(address, link->addresses) { + if (!address_is_marked(address)) + continue; + + RET_GATHER(r, address_remove(address)); + } + + return r; +} + /* Calls address_remove() on every address of the link that is not foreign and for which address_exists() is true. Results are collected with RET_GATHER() into the return value. */ +int link_drop_managed_addresses(Link *link) { + Address *address; + int r = 0; + + assert(link); + + SET_FOREACH(address, link->addresses) { + /* Do not touch addresses managed by kernel or other tools. */ + if (address->source == NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore addresses not assigned yet or already removing. 
*/ + if (!address_exists(address)) + continue; + + RET_GATHER(r, address_remove(address)); + } + + return r; +} + /* Sets the source of every address on the link to NETWORK_CONFIG_SOURCE_FOREIGN. */ +void link_foreignize_addresses(Link *link) { + Address *address; + + assert(link); + + SET_FOREACH(address, link->addresses) + address->source = NETWORK_CONFIG_SOURCE_FOREIGN; +} + /* Stores in *ret a copy of 'original' that carries a concrete address: the configured one when it is set, otherwise one obtained from the address pool with its lowest bit set. Returns 0 on success, -EBUSY when address_pool_acquire() returns 0, or another negative errno. */ +static int address_acquire(Link *link, const Address *original, Address **ret) { + _cleanup_(address_freep) Address *na = NULL; + union in_addr_union in_addr; + int r; + + assert(link); + assert(original); + assert(ret); + + /* Something useful was configured? just use it */ + if (in_addr_is_set(original->family, &original->in_addr)) + return address_dup(original, ret); + + /* The address is configured to be 0.0.0.0 or [::] by the user? + * Then let's acquire something more useful from the pool. */ + r = address_pool_acquire(link->manager, original->family, original->prefixlen, &in_addr); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; + + /* Pick first address in range for ourselves. */ + if (original->family == AF_INET) + in_addr.in.s_addr = in_addr.in.s_addr | htobe32(1); + else if (original->family == AF_INET6) + in_addr.in6.s6_addr[15] |= 1; + /* Copy every other setting from the original and substitute the acquired address. */ + r = address_dup(original, &na); + if (r < 0) + return r; + + na->in_addr = in_addr; + + *ret = TAKE_PTR(na); + return 0; +} + /* Shared reply check for address configuration requests. -EEXIST is not treated as a failure. On any other error the message is logged with error_msg, the link enters the failed state and 0 is returned; otherwise 1 is returned. */ +int address_configure_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, const char *error_msg) { + int r; + + assert(rtnl); + assert(m); + assert(link); + assert(error_msg); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, error_msg); + link_enter_failed(link); + return 0; + } + + return 1; +} + /* Builds an address update message from 'address' and the cache info 'c' and sends it asynchronously on behalf of 'req'. Returns the result of request_call_netlink_async(), or the negative errno of the call that failed while building the message. */ +static int address_configure(const Address *address, const struct ifa_cacheinfo *c, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(address); + assert(IN_SET(address->family, AF_INET, AF_INET6)); + assert(c); + assert(link); + assert(link->ifindex > 
0); + assert(link->manager); + assert(link->manager->rtnl); + assert(req); + + log_address_debug(address, "Configuring", link); + + r = sd_rtnl_message_new_addr_update(link->manager->rtnl, &m, link->ifindex, address->family); + if (r < 0) + return r; + /* Prefix length, flags and IFA_LOCAL are filled by the helper shared with address_remove(). */ + r = address_set_netlink_message(address, m, link); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_scope(m, address->scope); + if (r < 0) + return r; + /* IFA_ADDRESS carries the peer address: always for AF_INET6, for AF_INET only when a peer is set. An AF_INET address without peer gets IFA_BROADCAST instead, if a broadcast address is set. */ + if (address->family == AF_INET6 || in_addr_is_set(address->family, &address->in_addr_peer)) { + r = netlink_message_append_in_addr_union(m, IFA_ADDRESS, address->family, &address->in_addr_peer); + if (r < 0) + return r; + } else if (in4_addr_is_set(&address->broadcast)) { + r = sd_netlink_message_append_in_addr(m, IFA_BROADCAST, &address->broadcast); + if (r < 0) + return r; + } + /* The label is only sent for AF_INET addresses. */ + if (address->family == AF_INET && address->label) { + r = sd_netlink_message_append_string(m, IFA_LABEL, address->label); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_cache_info(m, IFA_CACHEINFO, c); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, IFA_RT_PRIORITY, address->route_metric); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + /* Returns true once link_is_ready_to_configure() and ipv4acd_bound() both hold and the link tracks fewer than ADDRESSES_PER_LINK_MAX addresses. */ +static bool address_is_ready_to_configure(Link *link, const Address *address) { + assert(link); + assert(address); + + if (!link_is_ready_to_configure(link, false)) + return false; + + if (!ipv4acd_bound(link, address)) + return false; + + /* Refuse adding more than the limit */ + if (set_size(link->addresses) >= ADDRESSES_PER_LINK_MAX) + return false; + + return true; +} + /* Processing callback registered for address requests by link_request_address(). Returns 0 while the address is not ready to be configured, 1 once the request has been handled (sent via address_configure(), or abandoned because the valid lifetime is zero), and a negative errno when address_configure() fails. */ +static int address_process_request(Request *req, Link *link, Address *address) { + struct Address *existing; + struct ifa_cacheinfo c; + int r; + + assert(req); + assert(link); + assert(address); + + if (!address_is_ready_to_configure(link, address)) + return 0; + + address_set_cinfo(link->manager, address, &c); + if (c.ifa_valid == 0) { + log_link_debug(link, "Refuse to configure %s 
address %s, as its valid lifetime is zero.", + network_config_source_to_string(address->source), + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); + /* Cancel the requesting state on the request's Address and on the matching Address of the link, if any. */ + address_cancel_requesting(address); + if (address_get(link, address, &existing) >= 0) + address_cancel_requesting(existing); + return 1; + } + + r = address_configure(address, &c, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure address: %m"); + /* Mirror the state change on the matching Address of the link, if any. */ + address_enter_configuring(address); + if (address_get(link, address, &existing) >= 0) + address_enter_configuring(existing); + + return 1; +} + /* Queues a request to configure 'address' (which must not be foreign) on the link. The queue receives a private copy; for an address not yet on the link the copy comes from address_acquire(). Returns 1 when a request was queued, 0 when nothing was queued (zero valid lifetime, already requested, or link_queue_request_safe() returned 0), or a negative errno. */ +int link_request_address( + Link *link, + const Address *address, + unsigned *message_counter, + address_netlink_handler_t netlink_handler, + Request **ret) { + + _cleanup_(address_freep) Address *tmp = NULL; + Address *existing = NULL; + int r; + + assert(link); + assert(address); + assert(address->source != NETWORK_CONFIG_SOURCE_FOREIGN); + + if (address->lifetime_valid_usec == 0) + /* The requested address is outdated. Let's ignore the request. */ + return 0; + + if (address_get(link, address, &existing) < 0) { + if (address_get_request(link, address, NULL) >= 0) + return 0; /* already requested, skipping. */ + + r = address_acquire(link, address, &tmp); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to acquire an address from pool: %m"); + + /* Consider address tentative until we get the real flags from the kernel */ + tmp->flags |= IFA_F_TENTATIVE; + + } else { + r = address_dup(address, &tmp); + if (r < 0) + return log_oom(); + + /* Copy already assigned address when it is requested as a null address. */ + if (address_is_static_null(address)) + tmp->in_addr = existing->in_addr; + + /* Copy state for logging below. 
*/ + tmp->state = existing->state; + } + + address_set_broadcast(tmp, link); + + r = ipv4acd_configure(link, tmp); + if (r < 0) + return r; + + log_address_debug(tmp, "Requesting", link); + r = link_queue_request_safe(link, REQUEST_TYPE_ADDRESS, + tmp, + address_free, + address_hash_func, + address_compare_func, + address_process_request, + message_counter, netlink_handler, ret); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request address: %m"); + if (r == 0) + return 0; + /* Mark the queued copy and, if the address is already on the link, the existing object as requesting. */ + address_enter_requesting(tmp); + if (existing) + address_enter_requesting(existing); + /* 'tmp' was handed to the queue together with address_free, so take it out of the local cleanup variable. */ + TAKE_PTR(tmp); + return 1; +} + /* Reply handler for static address requests. After address_configure_handler_internal() reports success, and once link->static_address_messages is zero, marks the static addresses as configured and re-checks link readiness. */ +static int static_address_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Address *address) { + int r; + + assert(link); + + r = address_configure_handler_internal(rtnl, m, link, "Failed to set static address"); + if (r <= 0) + return r; + + if (link->static_address_messages == 0) { + log_link_debug(link, "Addresses set"); + link->static_addresses_configured = true; + link_check_ready(link); + } + + return 1; +} + /* Requests a single static address, passing link->static_address_messages as the message counter and static_address_handler as the reply handler. */ +int link_request_static_address(Link *link, const Address *address) { + assert(link); + assert(address); + assert(address->source == NETWORK_CONFIG_SOURCE_STATIC); + + return link_request_address(link, address, &link->static_address_messages, + static_address_handler, NULL); +} + /* Requests every address in link->network->addresses_by_section, followed by link_request_radv_addresses(). If no static address message is outstanding afterwards the addresses are marked configured and link readiness is re-checked; otherwise the link is put into LINK_STATE_CONFIGURING. Returns 0 on success or the first negative errno. */ +int link_request_static_addresses(Link *link) { + Address *a; + int r; + + assert(link); + assert(link->network); + + link->static_addresses_configured = false; + + ORDERED_HASHMAP_FOREACH(a, link->network->addresses_by_section) { + r = link_request_static_address(link, a); + if (r < 0) + return r; + } + + r = link_request_radv_addresses(link); + if (r < 0) + return r; + + if (link->static_address_messages == 0) { + link->static_addresses_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Setting addresses"); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + /* Detaches the queued request belonging to this address, if the address is in the requesting state, and cancels that state. */ +void 
address_cancel_request(Address *address) { + Request req; + + assert(address); + assert(address->link); + + if (!address_is_requesting(address)) + return; + /* Build a key with the same fields that address_get_request() uses for its queue lookup. */ + req = (Request) { + .link = address->link, + .type = REQUEST_TYPE_ADDRESS, + .userdata = address, + .hash_func = (hash_func_t) address_hash_func, + .compare_func = (compare_func_t) address_compare_func, + }; + + request_detach(address->link->manager, &req); + address_cancel_requesting(address); +} + /* Handles an RTM_NEWADDR or RTM_DELADDR message: finds the link and the tracked Address and Request objects, then either forgets the address (RTM_DELADDR) or creates or updates the Address object from the message contents. Error replies, unexpected types and messages that cannot be parsed are logged and ignored with a return value of 0; a fully processed message returns 1. */ +int manager_rtnl_process_address(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { + _cleanup_(address_freep) Address *tmp = NULL; + struct ifa_cacheinfo cinfo; + Link *link; + uint16_t type; + Address *address = NULL; + Request *req = NULL; + bool is_new = false, update_dhcp4; + int ifindex, r; + + assert(rtnl); + assert(message); + assert(m); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: failed to receive address message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWADDR, RTM_DELADDR)) { + log_warning("rtnl: received unexpected message type %u when processing address, ignoring.", type); + return 0; + } + + r = sd_rtnl_message_addr_get_ifindex(message, &ifindex); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get ifindex from message, ignoring: %m"); + return 0; + } else if (ifindex <= 0) { + log_warning("rtnl: received address message with invalid ifindex %d, ignoring.", ifindex); + return 0; + } + + r = link_get_by_index(m, ifindex, &link); + if (r < 0) { + /* when enumerating we might be out of sync, but we will get the address again, so just + * ignore it */ + if (!m->enumerating) + log_warning("rtnl: received address for link '%d' we don't know about, ignoring.", ifindex); + return 0; + } + /* Scratch object that collects what is read from the message. */ + r = 
address_new(&tmp); + if (r < 0) + return log_oom(); + + /* First, read minimal information to make address_get() work below. */ + + r = sd_rtnl_message_addr_get_family(message, &tmp->family); + if (r < 0) { + log_link_warning(link, "rtnl: received address message without family, ignoring."); + return 0; + } else if (!IN_SET(tmp->family, AF_INET, AF_INET6)) { + log_link_debug(link, "rtnl: received address message with invalid family '%i', ignoring.", tmp->family); + return 0; + } + + r = sd_rtnl_message_addr_get_prefixlen(message, &tmp->prefixlen); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received address message without prefixlen, ignoring: %m"); + return 0; + } + /* The two families use IFA_LOCAL and IFA_ADDRESS differently, see the cases below. */ + switch (tmp->family) { + case AF_INET: + r = sd_netlink_message_read_in_addr(message, IFA_LOCAL, &tmp->in_addr.in); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received address message without valid address, ignoring: %m"); + return 0; + } + /* For AF_INET, IFA_LOCAL is required and IFA_ADDRESS is read as the peer; a peer equal to the local address is stored as no peer. */ + r = sd_netlink_message_read_in_addr(message, IFA_ADDRESS, &tmp->in_addr_peer.in); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: could not get peer address from address message, ignoring: %m"); + return 0; + } else if (r >= 0) { + if (in4_addr_equal(&tmp->in_addr.in, &tmp->in_addr_peer.in)) + tmp->in_addr_peer = IN_ADDR_NULL; + } + + break; + /* For AF_INET6, a readable IFA_LOCAL means IFA_ADDRESS is the peer; when IFA_LOCAL is absent (-ENODATA), IFA_ADDRESS is the address itself. */ + case AF_INET6: + r = sd_netlink_message_read_in6_addr(message, IFA_LOCAL, &tmp->in_addr.in6); + if (r >= 0) { + /* Have peer address. */ + r = sd_netlink_message_read_in6_addr(message, IFA_ADDRESS, &tmp->in_addr_peer.in6); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: could not get peer address from address message, ignoring: %m"); + return 0; + } + } else if (r == -ENODATA) { + /* Does not have peer address. 
*/ + r = sd_netlink_message_read_in6_addr(message, IFA_ADDRESS, &tmp->in_addr.in6); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received address message without valid address, ignoring: %m"); + return 0; + } + } else { + log_link_warning_errno(link, r, "rtnl: could not get local address from address message, ignoring: %m"); + return 0; + } + + break; + + default: + assert_not_reached(); + } + /* Only AF_INET6 address changes are passed on to dhcp4_update_ipv6_connectivity() at the end of the function. */ + update_dhcp4 = tmp->family == AF_INET6; + + /* Then, find the managed Address and Request objects corresponding to the received address. */ + (void) address_get(link, tmp, &address); + (void) address_get_request(link, tmp, &req); + /* Either lookup may fail, in which case 'address' or 'req' keeps its initial NULL value. */ + if (type == RTM_DELADDR) { + if (address) { + address_enter_removed(address); + log_address_debug(address, "Forgetting removed", link); + (void) address_drop(address); + } else + log_address_debug(tmp, "Kernel removed unknown", link); + + if (req) + address_enter_removed(req->userdata); + + goto finalize; + } + /* From here on the message is an RTM_NEWADDR. */ + if (!address) { + /* If we did not know the address, then save it. */ + r = address_add(link, tmp); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to save received address %s, ignoring: %m", + IN_ADDR_PREFIX_TO_STRING(tmp->family, &tmp->in_addr, tmp->prefixlen)); + return 0; + } + address = TAKE_PTR(tmp); + + is_new = true; + + } else { + /* Otherwise, update the managed Address object with the netlink notification. */ + address->prefixlen = tmp->prefixlen; + address->in_addr_peer = tmp->in_addr_peer; + } + + /* Also update information that cannot be obtained through netlink notification. 
*/ + if (req && req->waiting_reply) { + Address *a = ASSERT_PTR(req->userdata); + /* Copy the settings of the requested Address over to the Address tracked on the link. Failures of the two duplicating helpers are ignored. */ + address->source = a->source; + address->provider = a->provider; + (void) free_and_strdup_warn(&address->netlabel, a->netlabel); + nft_set_context_clear(&address->nft_set_context); + (void) nft_set_context_dup(&a->nft_set_context, &address->nft_set_context); + address->requested_as_null = a->requested_as_null; + address->callback = a->callback; + } + + /* Then, update miscellaneous info. */ + r = sd_rtnl_message_addr_get_scope(message, &address->scope); + if (r < 0) + log_link_debug_errno(link, r, "rtnl: received address message without scope, ignoring: %m"); + + if (address->family == AF_INET) { + _cleanup_free_ char *label = NULL; + /* A label equal to the interface name is not stored. */ + r = sd_netlink_message_read_string_strdup(message, IFA_LABEL, &label); + if (r >= 0) { + if (!streq_ptr(label, link->ifname)) + free_and_replace(address->label, label); + } else if (r != -ENODATA) + log_link_debug_errno(link, r, "rtnl: could not get label from address message, ignoring: %m"); + + r = sd_netlink_message_read_in_addr(message, IFA_BROADCAST, &address->broadcast); + if (r < 0 && r != -ENODATA) + log_link_debug_errno(link, r, "rtnl: could not get broadcast from address message, ignoring: %m"); + } + /* Prefer the 32-bit IFA_FLAGS attribute; when it is absent, fall back to the unsigned char flags returned by sd_rtnl_message_addr_get_flags(). */ + r = sd_netlink_message_read_u32(message, IFA_FLAGS, &address->flags); + if (r == -ENODATA) { + unsigned char flags; + + /* For old kernels. 
*/ + r = sd_rtnl_message_addr_get_flags(message, &flags); + if (r >= 0) + address->flags = flags; + } else if (r < 0) + log_link_debug_errno(link, r, "rtnl: failed to read IFA_FLAGS attribute, ignoring: %m"); + /* A missing IFA_CACHEINFO or IFA_RT_PRIORITY attribute (-ENODATA) is silently accepted; other read errors are logged at debug level. */ + r = sd_netlink_message_read_cache_info(message, IFA_CACHEINFO, &cinfo); + if (r >= 0) + address_set_lifetime(m, address, &cinfo); + else if (r != -ENODATA) + log_link_debug_errno(link, r, "rtnl: failed to read IFA_CACHEINFO attribute, ignoring: %m"); + + r = sd_netlink_message_read_u32(message, IFA_RT_PRIORITY, &address->route_metric); + if (r < 0 && r != -ENODATA) + log_link_debug_errno(link, r, "rtnl: failed to read IFA_RT_PRIORITY attribute, ignoring: %m"); + /* Mark the address, and the Address of a pending request if any, as configured. */ + address_enter_configured(address); + if (req) + address_enter_configured(req->userdata); + + log_address_debug(address, is_new ? "Received new": "Received updated", link); + + /* address_update() logs internally, so we don't need to here. */ + r = address_update(address); + if (r < 0) + link_enter_failed(link); + /* Reached by both the RTM_DELADDR and the RTM_NEWADDR path. */ +finalize: + if (update_dhcp4) { + r = dhcp4_update_ipv6_connectivity(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to notify IPv6 connectivity to DHCPv4 client: %m"); + link_enter_failed(link); + } + } + + return 1; +} + /* Config parser for the broadcast setting of a static address. Accepts an empty value (set_broadcast becomes -1), a boolean (set_broadcast becomes that value), or an explicit IPv4 broadcast address, which also sets the family to AF_INET. Invalid values are logged and ignored with return value 0; only out-of-memory is returned as an error. */ +int config_parse_broadcast( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + union in_addr_union u; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + /* The broadcast address will be 
calculated based on Address=, and set if the link is + * not a wireguard interface. Here, we do not check or set n->family. */ + n->broadcast = (struct in_addr) {}; + n->set_broadcast = -1; + TAKE_PTR(n); + return 0; + } + /* A boolean value only switches the broadcast address on or off; the stored address itself is reset. */ + r = parse_boolean(rvalue); + if (r >= 0) { + /* The broadcast address will be calculated based on Address=. Here, we do not check or + * set n->family. */ + n->broadcast = (struct in_addr) {}; + n->set_broadcast = r; + TAKE_PTR(n); + return 0; + } + /* Anything else has to be an explicit, non-null IPv4 address. */ + if (n->family == AF_INET6) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Broadcast is not valid for IPv6 addresses, ignoring assignment: %s", rvalue); + return 0; + } + + r = in_addr_from_string(AF_INET, rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Broadcast is invalid, ignoring assignment: %s", rvalue); + return 0; + } + if (in4_addr_is_null(&u.in)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Broadcast cannot be ANY address, ignoring assignment: %s", rvalue); + return 0; + } + + n->broadcast = u.in; + n->set_broadcast = true; + n->family = AF_INET; + TAKE_PTR(n); + + return 0; +} + /* Config parser for an address with prefix length. With lvalue "Address" the parsed value is stored as the address itself (recording whether it was given as a null address); with any other lvalue it is stored as the peer address. A null address is accepted only with a prefix length of at least 8 for AF_INET and 64 for AF_INET6. Invalid values are logged and ignored with return value 0. */ +int config_parse_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + union in_addr_union buffer; + unsigned char prefixlen; + int r, f; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(section, "Network")) + /* we are not in an Address section, so use line number instead. 
*/ + r = address_new_static(network, filename, line, &n); + else + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + /* Address=address/prefixlen */ + r = in_addr_prefix_from_string_auto_internal(rvalue, PREFIXLEN_REFUSE, &f, &buffer, &prefixlen); + if (r == -ENOANO) { /* NOTE(review): judging by the warning below, -ENOANO presumably signals a missing prefix length - confirm. The value is parsed again with the variant that assumes one. */ + r = in_addr_prefix_from_string_auto(rvalue, &f, &buffer, &prefixlen); + if (r >= 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Address '%s' is specified without prefix length. Assuming the prefix length is %u. " + "Please specify the prefix length explicitly.", rvalue, prefixlen); + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid address '%s', ignoring assignment: %m", rvalue); + return 0; + } + /* The section may already carry a family from an earlier assignment; refuse a value of a different family. */ + if (n->family != AF_UNSPEC && f != n->family) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Address is incompatible, ignoring assignment: %s", rvalue); + return 0; + } + + if (in_addr_is_null(f, &buffer)) { + /* Will use address from address pool. Note that for ipv6 case, prefix of the address + * pool is 8, but 40 bit is used by the global ID and 16 bit by the subnet ID. So, + * let's limit the prefix length to 64 or larger. See RFC4193. 
*/ + if ((f == AF_INET && prefixlen < 8) || + (f == AF_INET6 && prefixlen < 64)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Null address with invalid prefixlen='%u', ignoring assignment: %s", + prefixlen, rvalue); + return 0; + } + } + + n->family = f; + n->prefixlen = prefixlen; + + if (streq(lvalue, "Address")) { + n->in_addr = buffer; + n->requested_as_null = !in_addr_is_set(n->family, &n->in_addr); + } else + n->in_addr_peer = buffer; + + TAKE_PTR(n); + return 0; +} + +int config_parse_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + n->label = mfree(n->label); + TAKE_PTR(n); + return 0; + } + + if (!address_label_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Interface label is too long or invalid, ignoring assignment: %s", rvalue); + return 0; + } + + r = free_and_strdup(&n->label, rvalue); + if (r < 0) + return log_oom(); + + TAKE_PTR(n); + return 0; +} + +int config_parse_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + usec_t k; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = 
address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + /* We accept only "forever", "infinity", empty, or "0". */ + if (STR_IN_SET(rvalue, "forever", "infinity", "")) + k = USEC_INFINITY; + else if (streq(rvalue, "0")) + k = 0; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid PreferredLifetime= value, ignoring: %s", rvalue); + return 0; + } + + n->lifetime_preferred_usec = k; + TAKE_PTR(n); + + return 0; +} + +int config_parse_address_flags( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + if (streq(lvalue, "AddPrefixRoute")) + r = !r; + + SET_FLAG(n->flags, ltype, r); + + TAKE_PTR(n); + return 0; +} + +int config_parse_address_scope( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + 
assert(rvalue); + assert(data); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + r = route_scope_from_string(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse address scope \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + n->scope = r; + n->scope_set = true; + TAKE_PTR(n); + return 0; +} + +int config_parse_address_route_metric( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + r = safe_atou32(rvalue, &n->route_metric); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_duplicate_address_detection( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + 
return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + r = parse_boolean(rvalue); + if (r >= 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "For historical reasons, %s=%s means %s=%s. " + "Please use 'both', 'ipv4', 'ipv6' or 'none' instead.", + lvalue, rvalue, lvalue, r ? "none" : "both"); + n->duplicate_address_detection = r ? ADDRESS_FAMILY_NO : ADDRESS_FAMILY_YES; + n = NULL; + return 0; + } + + AddressFamily a = duplicate_address_detection_address_family_from_string(rvalue); + if (a < 0) { + log_syntax(unit, LOG_WARNING, filename, line, a, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + n->duplicate_address_detection = a; + + TAKE_PTR(n); + return 0; +} + +int config_parse_address_netlabel( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + assert(network); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new address, ignoring assignment: %m"); + return 0; + } + + r = config_parse_string(unit, filename, line, section, section_line, + lvalue, CONFIG_PARSE_STRING_SAFE, rvalue, &n->netlabel, network); + if (r < 0) + return r; + + TAKE_PTR(n); + return 0; +} + +static void address_section_adjust_broadcast(Address *address) { + assert(address); + assert(address->section); + + if (!in4_addr_is_set(&address->broadcast)) + return; + + if (address->family == AF_INET6) + log_warning("%s: broadcast address is set for an IPv6 address. 
" + "Ignoring Broadcast= setting in the [Address] section from line %u.", + address->section->filename, address->section->line); + else if (address->prefixlen > 30) + log_warning("%s: broadcast address is set for an IPv4 address with prefix length larger than 30. " + "Ignoring Broadcast= setting in the [Address] section from line %u.", + address->section->filename, address->section->line); + else if (in4_addr_is_set(&address->in_addr_peer.in)) + log_warning("%s: broadcast address is set for an IPv4 address with peer address. " + "Ignoring Broadcast= setting in the [Address] section from line %u.", + address->section->filename, address->section->line); + else if (!in4_addr_is_set(&address->in_addr.in)) + log_warning("%s: broadcast address is set for an IPv4 address with null address. " + "Ignoring Broadcast= setting in the [Address] section from line %u.", + address->section->filename, address->section->line); + else + /* Otherwise, keep the specified broadcast address. */ + return; + + address->broadcast.s_addr = 0; +} + +int address_section_verify(Address *address) { + if (section_is_invalid(address->section)) + return -EINVAL; + + if (address->family == AF_UNSPEC) { + assert(address->section); + + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Address section without Address= field was configured. " + "Ignoring [Address] section from line %u.", + address->section->filename, address->section->line); + } + + if (address->family == AF_INET6 && !socket_ipv6_is_supported()) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: an IPv6 address was configured, but the kernel does not support IPv6. " + "Ignoring [Address] section from line %u.", + address->section->filename, address->section->line); + + assert(IN_SET(address->family, AF_INET, AF_INET6)); + + address_section_adjust_broadcast(address); + + if (address->family == AF_INET6 && address->label) { + log_warning("%s: address label is set for IPv6 address in the [Address] section from line %u. 
" + "Ignoring Label= setting.", + address->section->filename, address->section->line); + + address->label = mfree(address->label); + } + + if (!address->scope_set) { + if (in_addr_is_localhost(address->family, &address->in_addr) > 0) + address->scope = RT_SCOPE_HOST; + else if (in_addr_is_link_local(address->family, &address->in_addr) > 0) + address->scope = RT_SCOPE_LINK; + } + + if (address->duplicate_address_detection < 0) { + if (address->family == AF_INET6) + address->duplicate_address_detection = ADDRESS_FAMILY_IPV6; + else if (in4_addr_is_link_local(&address->in_addr.in)) + address->duplicate_address_detection = ADDRESS_FAMILY_IPV4; + else + address->duplicate_address_detection = ADDRESS_FAMILY_NO; + } else if (address->duplicate_address_detection == ADDRESS_FAMILY_IPV6 && address->family == AF_INET) + log_warning("%s: DuplicateAddressDetection=ipv6 is specified for IPv4 address, ignoring.", + address->section->filename); + else if (address->duplicate_address_detection == ADDRESS_FAMILY_IPV4 && address->family == AF_INET6) + log_warning("%s: DuplicateAddressDetection=ipv4 is specified for IPv6 address, ignoring.", + address->section->filename); + + if (address->family == AF_INET6 && + !FLAGS_SET(address->duplicate_address_detection, ADDRESS_FAMILY_IPV6)) + address->flags |= IFA_F_NODAD; + + uint32_t filtered_flags = address->family == AF_INET ? + address->flags & KNOWN_FLAGS & ~UNMANAGED_FLAGS & ~IPV6ONLY_FLAGS : + address->flags & KNOWN_FLAGS & ~UNMANAGED_FLAGS; + if (address->flags != filtered_flags) { + _cleanup_free_ char *str = NULL; + + (void) address_flags_to_string_alloc(address->flags ^ filtered_flags, address->family, &str); + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: unexpected address flags \"%s\" were configured. 
" + "Ignoring [Address] section from line %u.", + address->section->filename, strna(str), address->section->line); + } + + return 0; +} + +int network_drop_invalid_addresses(Network *network) { + _cleanup_set_free_ Set *addresses = NULL; + Address *address; + int r; + + assert(network); + + ORDERED_HASHMAP_FOREACH(address, network->addresses_by_section) { + Address *dup; + + if (address_section_verify(address) < 0) { + /* Drop invalid [Address] sections or Address= settings in [Network]. + * Note that address_free() will drop the address from addresses_by_section. */ + address_free(address); + continue; + } + + /* Always use the setting specified later. So, remove the previously assigned setting. */ + dup = set_remove(addresses, address); + if (dup) { + log_warning("%s: Duplicated address %s is specified at line %u and %u, " + "dropping the address setting specified at line %u.", + dup->section->filename, + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen), + address->section->line, + dup->section->line, dup->section->line); + /* address_free() will drop the address from addresses_by_section. */ + address_free(dup); + } + + /* Use address_hash_ops, instead of address_hash_ops_free. Otherwise, the Address objects + * will be freed. 
*/ + r = set_ensure_put(&addresses, &address_hash_ops, address); + if (r < 0) + return log_oom(); + assert(r > 0); + } + + r = network_adjust_dhcp_server(network, &addresses); + if (r < 0) + return r; + + return 0; +} + +int config_parse_address_ip_nft_set( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(address_free_or_set_invalidp) Address *n = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(network); + + r = address_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate a new address, ignoring assignment: %m"); + return 0; + } + + r = config_parse_nft_set(unit, filename, line, section, section_line, lvalue, ltype, rvalue, &n->nft_set_context, network); + if (r < 0) + return r; + + TAKE_PTR(n); + return 0; +} diff --git a/src/network/networkd-address.h b/src/network/networkd-address.h new file mode 100644 index 0000000..5be2f77 --- /dev/null +++ b/src/network/networkd-address.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "conf-parser.h" +#include "firewall-util.h" +#include "hash-funcs.h" +#include "in-addr-util.h" +#include "network-util.h" +#include "networkd-link.h" +#include "networkd-util.h" +#include "time-util.h" + +typedef struct Address Address; +typedef struct Manager Manager; +typedef struct Network Network; +typedef struct Request Request; +typedef int (*address_ready_callback_t)(Address *address); +typedef int (*address_netlink_handler_t)( + sd_netlink *rtnl, + sd_netlink_message *m, + Request *req, + Link *link, + Address *address); + +struct Address { + Link *link; + Network *network; + ConfigSection *section; + 
NetworkConfigSource source; + NetworkConfigState state; + union in_addr_union provider; /* DHCP server or router address */ + + int family; + unsigned char prefixlen; + unsigned char scope; + uint32_t flags; + uint32_t route_metric; /* route metric for prefix route */ + char *label, *netlabel; + + int set_broadcast; + struct in_addr broadcast; + + union in_addr_union in_addr; + union in_addr_union in_addr_peer; + + /* These are absolute points in time, and NOT timespans/durations. + * Must be specified with clock_boottime_or_monotonic(). */ + usec_t lifetime_valid_usec; + usec_t lifetime_preferred_usec; + + bool scope_set:1; + bool ip_masquerade_done:1; + bool requested_as_null:1; + + /* duplicate_address_detection is only used by static or IPv4 dynamic addresses. + * To control DAD for IPv6 dynamic addresses, set IFA_F_NODAD to flags. */ + AddressFamily duplicate_address_detection; + + /* Called when address become ready */ + address_ready_callback_t callback; + + NFTSetContext nft_set_context; +}; + +const char* format_lifetime(char *buf, size_t l, usec_t lifetime_usec) _warn_unused_result_; +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define FORMAT_LIFETIME(lifetime) \ + format_lifetime((char[FORMAT_TIMESPAN_MAX+STRLEN("for ")]){}, FORMAT_TIMESPAN_MAX+STRLEN("for "), lifetime) + +int address_flags_to_string_alloc(uint32_t flags, int family, char **ret); + +void link_get_address_states( + Link *link, + LinkAddressState *ret_ipv4, + LinkAddressState *ret_ipv6, + LinkAddressState *ret_all); + +extern const struct hash_ops address_hash_ops; + +int address_new(Address **ret); +int address_new_static(Network *network, const char *filename, unsigned section_line, Address **ret); +Address* address_free(Address *address); +int address_get(Link *link, const Address *in, Address **ret); +int address_get_harder(Link 
*link, const Address *in, Address **ret); +int address_configure_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, const char *error_msg); +int address_remove(Address *address); +int address_remove_and_drop(Address *address); +int address_dup(const Address *src, Address **ret); +bool address_is_ready(const Address *a); +bool link_check_addresses_ready(Link *link, NetworkConfigSource source); + +DEFINE_SECTION_CLEANUP_FUNCTIONS(Address, address_free); + +int link_drop_managed_addresses(Link *link); +int link_drop_foreign_addresses(Link *link); +int link_drop_ipv6ll_addresses(Link *link); +void link_foreignize_addresses(Link *link); +bool link_address_is_dynamic(const Link *link, const Address *address); +int link_get_address(Link *link, int family, const union in_addr_union *address, unsigned char prefixlen, Address **ret); +static inline int link_get_ipv6_address(Link *link, const struct in6_addr *address, unsigned char prefixlen, Address **ret) { + assert(address); + return link_get_address(link, AF_INET6, &(union in_addr_union) { .in6 = *address }, prefixlen, ret); +} +static inline int link_get_ipv4_address(Link *link, const struct in_addr *address, unsigned char prefixlen, Address **ret) { + assert(address); + return link_get_address(link, AF_INET, &(union in_addr_union) { .in = *address }, prefixlen, ret); +} +int manager_get_address(Manager *manager, int family, const union in_addr_union *address, unsigned char prefixlen, Address **ret); +bool manager_has_address(Manager *manager, int family, const union in_addr_union *address, bool check_ready); + +void address_cancel_request(Address *address); +int link_request_address( + Link *link, + const Address *address, + unsigned *message_counter, + address_netlink_handler_t netlink_handler, + Request **ret); +int link_request_static_address(Link *link, const Address *address); +int link_request_static_addresses(Link *link); + +int manager_rtnl_process_address(sd_netlink *nl, sd_netlink_message 
*message, Manager *m); + +int address_section_verify(Address *address); +int network_drop_invalid_addresses(Network *network); + +DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(Address, address); + +void link_mark_addresses(Link *link, NetworkConfigSource source); + +CONFIG_PARSER_PROTOTYPE(config_parse_address); +CONFIG_PARSER_PROTOTYPE(config_parse_broadcast); +CONFIG_PARSER_PROTOTYPE(config_parse_label); +CONFIG_PARSER_PROTOTYPE(config_parse_lifetime); +CONFIG_PARSER_PROTOTYPE(config_parse_address_flags); +CONFIG_PARSER_PROTOTYPE(config_parse_address_scope); +CONFIG_PARSER_PROTOTYPE(config_parse_address_route_metric); +CONFIG_PARSER_PROTOTYPE(config_parse_duplicate_address_detection); +CONFIG_PARSER_PROTOTYPE(config_parse_address_netlabel); +CONFIG_PARSER_PROTOTYPE(config_parse_address_ip_nft_set); diff --git a/src/network/networkd-bridge-fdb.c b/src/network/networkd-bridge-fdb.c new file mode 100644 index 0000000..803e27c --- /dev/null +++ b/src/network/networkd-bridge-fdb.c @@ -0,0 +1,535 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. +***/ + +#include +#include + +#include "alloc-util.h" +#include "bridge.h" +#include "netlink-util.h" +#include "networkd-bridge-fdb.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "networkd-util.h" +#include "parse-util.h" +#include "string-table.h" +#include "vlan-util.h" +#include "vxlan.h" + +#define STATIC_BRIDGE_FDB_ENTRIES_PER_NETWORK_MAX 1024U + +/* remove an FDB entry.
*/ +BridgeFDB *bridge_fdb_free(BridgeFDB *fdb) { + if (!fdb) + return NULL; + + /* Drop the entry from the owning network's per-section map before the section key is freed. */ + if (fdb->network) { + assert(fdb->section); + hashmap_remove(fdb->network->bridge_fdb_entries_by_section, fdb->section); + } + + config_section_free(fdb->section); + + free(fdb->outgoing_ifname); + return mfree(fdb); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(BridgeFDB, bridge_fdb_free); + +/* Create a new FDB entry for the section identified by filename/section_line, or get the existing one. + * On success returns 0 and stores the entry in *ret. Returns -E2BIG if the network already holds + * STATIC_BRIDGE_FDB_ENTRIES_PER_NETWORK_MAX entries and -ENOMEM if the entry cannot be allocated; + * errors from config_section_new() and hashmap_ensure_put() are passed through unchanged. */ +static int bridge_fdb_new_static( + Network *network, + const char *filename, + unsigned section_line, + BridgeFDB **ret) { + + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(bridge_fdb_freep) BridgeFDB *fdb = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + /* search entry in hashmap first. */ + fdb = hashmap_get(network->bridge_fdb_entries_by_section, n); + if (fdb) { + *ret = TAKE_PTR(fdb); + return 0; + } + + if (hashmap_size(network->bridge_fdb_entries_by_section) >= STATIC_BRIDGE_FDB_ENTRIES_PER_NETWORK_MAX) + return -E2BIG; + + /* allocate space for an FDB entry. */ + fdb = new(BridgeFDB, 1); + if (!fdb) + return -ENOMEM; + + /* init FDB structure. */ + *fdb = (BridgeFDB) { + .network = network, + .section = TAKE_PTR(n), + /* out-of-range default: NDA_VNI is only appended when vni <= VXLAN_VID_MAX. */ + .vni = VXLAN_VID_MAX + 1, + .ntf_flags = NEIGHBOR_CACHE_ENTRY_FLAGS_SELF, + }; + + r = hashmap_ensure_put(&network->bridge_fdb_entries_by_section, &config_section_hash_ops, fdb->section, fdb); + if (r < 0) + return r; + + /* return allocated FDB structure.
*/ + *ret = TAKE_PTR(fdb); + + return 0; +} + +static int bridge_fdb_configure_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not add bridge FDB entry"); + link_enter_failed(link); + return 0; + } + + if (link->static_bridge_fdb_messages == 0) { + log_link_debug(link, "Bridge FDB entries set"); + link->static_bridge_fdb_configured = true; + link_check_ready(link); + } + + return 0; +} + +/* send a request to the kernel to add a FDB entry in its static MAC table. */ +static int bridge_fdb_configure_message(const BridgeFDB *fdb, Link *link, sd_netlink_message *req) { + int r; + + assert(fdb); + assert(link); + + r = sd_rtnl_message_neigh_set_flags(req, fdb->ntf_flags); + if (r < 0) + return r; + + /* only NUD_PERMANENT state supported. */ + r = sd_rtnl_message_neigh_set_state(req, NUD_NOARP | NUD_PERMANENT); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, NDA_LLADDR, &fdb->mac_addr, sizeof(fdb->mac_addr)); + if (r < 0) + return r; + + /* VLAN Id is optional. We'll add VLAN Id only if it's specified. 
*/ + if (fdb->vlan_id > 0) { + r = sd_netlink_message_append_u16(req, NDA_VLAN, fdb->vlan_id); + if (r < 0) + return r; + } + + if (fdb->outgoing_ifindex > 0) { + r = sd_netlink_message_append_u32(req, NDA_IFINDEX, fdb->outgoing_ifindex); + if (r < 0) + return r; + } + + if (in_addr_is_set(fdb->family, &fdb->destination_addr)) { + r = netlink_message_append_in_addr_union(req, NDA_DST, fdb->family, &fdb->destination_addr); + if (r < 0) + return r; + } + + if (fdb->vni <= VXLAN_VID_MAX) { + r = sd_netlink_message_append_u32(req, NDA_VNI, fdb->vni); + if (r < 0) + return r; + } + + return 0; +} + +static int bridge_fdb_configure(BridgeFDB *fdb, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(fdb); + assert(link); + assert(link->manager); + assert(req); + + r = sd_rtnl_message_new_neigh(link->manager->rtnl, &m, RTM_NEWNEIGH, link->ifindex, AF_BRIDGE); + if (r < 0) + return r; + + r = bridge_fdb_configure_message(fdb, link, m); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static bool bridge_fdb_is_ready_to_configure(BridgeFDB *fdb, Link *link) { + Link *out = NULL; + + assert(fdb); + assert(link); + assert(link->manager); + + if (!link_is_ready_to_configure(link, false)) + return false; + + if (fdb->outgoing_ifname) { + if (link_get_by_name(link->manager, fdb->outgoing_ifname, &out) < 0) + return false; + + fdb->outgoing_ifindex = out->ifindex; + } else if (fdb->outgoing_ifindex > 0) { + if (link_get_by_index(link->manager, fdb->outgoing_ifindex, &out) < 0) + return false; + } + if (out && !link_is_ready_to_configure(out, false)) + return false; + + return true; +} + +static int bridge_fdb_process_request(Request *req, Link *link, void *userdata) { + BridgeFDB *fdb = ASSERT_PTR(userdata); + int r; + + assert(req); + assert(link); + + if (!bridge_fdb_is_ready_to_configure(fdb, link)) + return 0; + + r = bridge_fdb_configure(fdb, link, req); + if (r < 
0) + return log_link_warning_errno(link, r, "Failed to configure bridge FDB: %m"); + + return 1; +} + +int link_request_static_bridge_fdb(Link *link) { + BridgeFDB *fdb; + int r; + + assert(link); + assert(link->network); + + link->static_bridge_fdb_configured = false; + + HASHMAP_FOREACH(fdb, link->network->bridge_fdb_entries_by_section) { + r = link_queue_request_full(link, REQUEST_TYPE_BRIDGE_FDB, + fdb, NULL, + trivial_hash_func, + trivial_compare_func, + bridge_fdb_process_request, + &link->static_bridge_fdb_messages, + bridge_fdb_configure_handler, + NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request static bridge FDB entry: %m"); + } + + if (link->static_bridge_fdb_messages == 0) { + link->static_bridge_fdb_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Setting bridge FDB entries"); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + +void network_drop_invalid_bridge_fdb_entries(Network *network) { + BridgeFDB *fdb; + + assert(network); + + HASHMAP_FOREACH(fdb, network->bridge_fdb_entries_by_section) + if (section_is_invalid(fdb->section)) + bridge_fdb_free(fdb); +} + +/* parse the HW address from config files. 
*/ +int config_parse_fdb_hwaddr( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_fdb_free_or_set_invalidp) BridgeFDB *fdb = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Only OOM is fatal; any other failure (e.g. -E2BIG when too many FDB entries are configured) drops just this assignment. */ + r = bridge_fdb_new_static(network, filename, section_line, &fdb); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new FDB entry, ignoring assignment: %m"); + return 0; + } + + r = parse_ether_addr(rvalue, &fdb->mac_addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Not a valid MAC address, ignoring assignment: %s", rvalue); + return 0; + } + + TAKE_PTR(fdb); + return 0; +} + +/* parse the VLAN Id from config files. */ +int config_parse_fdb_vlan_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_fdb_free_or_set_invalidp) BridgeFDB *fdb = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Only OOM is fatal; any other failure (e.g. -E2BIG when too many FDB entries are configured) drops just this assignment. */ + r = bridge_fdb_new_static(network, filename, section_line, &fdb); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new FDB entry, ignoring assignment: %m"); + return 0; + } + + r = config_parse_vlanid(unit, filename, line, section, + section_line, lvalue, ltype, + rvalue, &fdb->vlan_id, userdata); + if (r < 0) + return r; + + TAKE_PTR(fdb); + return 0; +} + +/* parse the destination IP address from config files. */ +int config_parse_fdb_destination( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_fdb_free_or_set_invalidp) BridgeFDB *fdb = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r =
bridge_fdb_new_static(network, filename, section_line, &fdb); + /* Only OOM is fatal; any other failure (e.g. -E2BIG when too many FDB entries are configured) drops just this assignment. */ + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new FDB entry, ignoring assignment: %m"); + return 0; + } + + r = in_addr_from_string_auto(rvalue, &fdb->family, &fdb->destination_addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "FDB destination IP address is invalid, ignoring assignment: %s", + rvalue); + return 0; + } + + TAKE_PTR(fdb); + return 0; +} + +/* parse the VXLAN Network Identifier (VNI) from config files; values above VXLAN_VID_MAX are rejected. */ +int config_parse_fdb_vxlan_vni( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_fdb_free_or_set_invalidp) BridgeFDB *fdb = NULL; + Network *network = userdata; + uint32_t vni; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Only OOM is fatal; any other failure (e.g. -E2BIG when too many FDB entries are configured) drops just this assignment. */ + r = bridge_fdb_new_static(network, filename, section_line, &fdb); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new FDB entry, ignoring assignment: %m"); + return 0; + } + + r = safe_atou32(rvalue, &vni); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse VXLAN Network Identifier (VNI), ignoring assignment: %s", + rvalue); + return 0; + } + + if (vni > VXLAN_VID_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "FDB invalid VXLAN Network Identifier (VNI), ignoring assignment: %s", + rvalue); + return 0; + } + + fdb->vni = vni; + + TAKE_PTR(fdb); + return 0; +} + +static const char* const ntf_flags_table[_NEIGHBOR_CACHE_ENTRY_FLAGS_MAX] = { + [NEIGHBOR_CACHE_ENTRY_FLAGS_USE] = "use", + [NEIGHBOR_CACHE_ENTRY_FLAGS_SELF] = "self", + [NEIGHBOR_CACHE_ENTRY_FLAGS_MASTER] = "master", + [NEIGHBOR_CACHE_ENTRY_FLAGS_ROUTER] = "router", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(ntf_flags, NeighborCacheEntryFlags); + +/* parse the neighbor cache entry flag (AssociatedWith=) from config files. */ +int config_parse_fdb_ntf_flags( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + +
_cleanup_(bridge_fdb_free_or_set_invalidp) BridgeFDB *fdb = NULL; + Network *network = userdata; + NeighborCacheEntryFlags f; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Only OOM is fatal; any other failure (e.g. -E2BIG when too many FDB entries are configured) drops just this assignment. */ + r = bridge_fdb_new_static(network, filename, section_line, &fdb); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new FDB entry, ignoring assignment: %m"); + return 0; + } + + f = ntf_flags_from_string(rvalue); + if (f < 0) { + log_syntax(unit, LOG_WARNING, filename, line, f, + "FDB failed to parse AssociatedWith=, ignoring assignment: %s", + rvalue); + return 0; + } + + fdb->ntf_flags = f; + + TAKE_PTR(fdb); + return 0; +} + +/* parse the outgoing interface from config files: empty resets it, otherwise an ifindex or an interface name. */ +int config_parse_fdb_interface( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_fdb_free_or_set_invalidp) BridgeFDB *fdb = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Only OOM is fatal; any other failure (e.g. -E2BIG when too many FDB entries are configured) drops just this assignment. */ + r = bridge_fdb_new_static(network, filename, section_line, &fdb); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate new FDB entry, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + fdb->outgoing_ifname = mfree(fdb->outgoing_ifname); + fdb->outgoing_ifindex = 0; + TAKE_PTR(fdb); + return 0; + } + + /* A valid ifindex takes precedence; anything else is treated as an interface name. */ + r = parse_ifindex(rvalue); + if (r > 0) { + fdb->outgoing_ifname = mfree(fdb->outgoing_ifname); + fdb->outgoing_ifindex = r; + TAKE_PTR(fdb); + return 0; + } + + if (!ifname_valid_full(rvalue, IFNAME_VALID_ALTERNATIVE)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid interface name in %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + r = free_and_strdup(&fdb->outgoing_ifname, rvalue); + if (r < 0) + return log_oom(); + fdb->outgoing_ifindex = 0; + + TAKE_PTR(fdb); + return 0; +} diff --git a/src/network/networkd-bridge-fdb.h b/src/network/networkd-bridge-fdb.h new file mode 100644 index 0000000..b59d673 --- /dev/null +++ b/src/network/networkd-bridge-fdb.h @@ -0,0 +1,54 @@ +/*
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2014 Intel Corporation. All rights reserved. +***/ + +#include +#include + +#include "conf-parser.h" +#include "ether-addr-util.h" +#include "in-addr-util.h" + +typedef struct Link Link; +typedef struct Network Network; + /* Flag that can be selected for a static FDB entry via AssociatedWith=; every value aliases the NTF_* constant of the same name. */ +typedef enum NeighborCacheEntryFlags { + NEIGHBOR_CACHE_ENTRY_FLAGS_USE = NTF_USE, + NEIGHBOR_CACHE_ENTRY_FLAGS_SELF = NTF_SELF, + NEIGHBOR_CACHE_ENTRY_FLAGS_MASTER = NTF_MASTER, + NEIGHBOR_CACHE_ENTRY_FLAGS_ROUTER = NTF_ROUTER, + _NEIGHBOR_CACHE_ENTRY_FLAGS_MAX, + _NEIGHBOR_CACHE_ENTRY_FLAGS_INVALID = -EINVAL, +} NeighborCacheEntryFlags; + /* One static bridge FDB entry, filled in field by field by the config_parse_fdb_*() parsers declared below. */ +typedef struct BridgeFDB { + Network *network; + ConfigSection *section; + + uint32_t vni; /* VXLAN Network Identifier, at most VXLAN_VID_MAX (see config_parse_fdb_vxlan_vni()) */ + + int family; /* address family of destination_addr, determined while parsing the address */ + uint16_t vlan_id; + + struct ether_addr mac_addr; + union in_addr_union destination_addr; + NeighborCacheEntryFlags ntf_flags; + char *outgoing_ifname; /* outgoing interface by name; config_parse_fdb_interface() keeps either this or outgoing_ifindex set, never both */ + int outgoing_ifindex; +} BridgeFDB; + +BridgeFDB *bridge_fdb_free(BridgeFDB *fdb); + +void network_drop_invalid_bridge_fdb_entries(Network *network); + +int link_request_static_bridge_fdb(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_fdb_hwaddr); +CONFIG_PARSER_PROTOTYPE(config_parse_fdb_vlan_id); +CONFIG_PARSER_PROTOTYPE(config_parse_fdb_destination); +CONFIG_PARSER_PROTOTYPE(config_parse_fdb_vxlan_vni); +CONFIG_PARSER_PROTOTYPE(config_parse_fdb_ntf_flags); +CONFIG_PARSER_PROTOTYPE(config_parse_fdb_interface); diff --git a/src/network/networkd-bridge-mdb.c b/src/network/networkd-bridge-mdb.c new file mode 100644 index 0000000..bd1a974 --- /dev/null +++ b/src/network/networkd-bridge-mdb.c @@ -0,0 +1,365 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "netlink-util.h" +#include "networkd-bridge-mdb.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "string-util.h" +#include "vlan-util.h" + /* Upper bound on the number of static MDB entries kept per network; bridge_mdb_new_static() fails with -E2BIG once it is reached. */ +#define STATIC_BRIDGE_MDB_ENTRIES_PER_NETWORK_MAX 1024U + 
+/* remove MDB entry. */ +BridgeMDB *bridge_mdb_free(BridgeMDB *mdb) { + if (!mdb) + return NULL; + + if (mdb->network) { + assert(mdb->section); + hashmap_remove(mdb->network->bridge_mdb_entries_by_section, mdb->section); + } + + config_section_free(mdb->section); + + return mfree(mdb); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(BridgeMDB, bridge_mdb_free); + +/* create a new MDB entry or get an existing one. */ +static int bridge_mdb_new_static( + Network *network, + const char *filename, + unsigned section_line, + BridgeMDB **ret) { + + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(bridge_mdb_freep) BridgeMDB *mdb = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + /* search entry in hashmap first. */ + mdb = hashmap_get(network->bridge_mdb_entries_by_section, n); + if (mdb) { + *ret = TAKE_PTR(mdb); + return 0; + } + + if (hashmap_size(network->bridge_mdb_entries_by_section) >= STATIC_BRIDGE_MDB_ENTRIES_PER_NETWORK_MAX) + return -E2BIG; + + /* allocate space for an MDB entry. */ + mdb = new(BridgeMDB, 1); + if (!mdb) + return -ENOMEM; + + /* init MDB structure. */ + *mdb = (BridgeMDB) { + .network = network, + .section = TAKE_PTR(n), + }; + + r = hashmap_ensure_put(&network->bridge_mdb_entries_by_section, &config_section_hash_ops, mdb->section, mdb); + if (r < 0) + return r; + + /* return allocated MDB structure. */ + *ret = TAKE_PTR(mdb); + return 0; +} + +static int bridge_mdb_configure_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r == -EINVAL && streq_ptr(link->kind, "bridge") && link->master_ifindex <= 0) { + /* To configure bridge MDB entries on bridge master, 1bc844ee0faa1b92e3ede00bdd948021c78d7088 (v5.4) is required. 
*/ + if (!link->manager->bridge_mdb_on_master_not_supported) { + log_link_warning_errno(link, r, "Kernel seems not to support bridge MDB entries on bridge master, ignoring: %m"); + link->manager->bridge_mdb_on_master_not_supported = true; + } + } else if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not add MDB entry"); + link_enter_failed(link); + return 1; + } + + if (link->static_bridge_mdb_messages == 0) { + link->static_bridge_mdb_configured = true; + link_check_ready(link); + } + + return 1; +} + +/* send a request to the kernel to add an MDB entry */ +static int bridge_mdb_configure(BridgeMDB *mdb, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + struct br_mdb_entry entry; + int r; + + assert(mdb); + assert(link); + assert(link->manager); + assert(req); + + if (DEBUG_LOGGING) + log_link_debug(link, "Configuring bridge MDB entry: MulticastGroupAddress=%s, VLANId=%u", + IN_ADDR_TO_STRING(mdb->family, &mdb->group_addr), mdb->vlan_id); + + entry = (struct br_mdb_entry) { + /* If MDB entry is added on bridge master, then the state must be MDB_TEMPORARY. + * See br_mdb_add_group() in net/bridge/br_mdb.c of kernel. */ + .state = link->master_ifindex <= 0 ? MDB_TEMPORARY : MDB_PERMANENT, + .ifindex = link->ifindex, + .vid = mdb->vlan_id, + }; + + switch (mdb->family) { + case AF_INET: + entry.addr.u.ip4 = mdb->group_addr.in.s_addr; + entry.addr.proto = htobe16(ETH_P_IP); + break; + + case AF_INET6: + entry.addr.u.ip6 = mdb->group_addr.in6; + entry.addr.proto = htobe16(ETH_P_IPV6); + break; + + default: + assert_not_reached(); + } + + r = sd_rtnl_message_new_mdb(link->manager->rtnl, &m, RTM_NEWMDB, + link->master_ifindex > 0 ? 
link->master_ifindex : link->ifindex); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(m, MDBA_SET_ENTRY, &entry, sizeof(entry)); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static bool bridge_mdb_is_ready_to_configure(Link *link) { + Link *master; + + assert(link); + + if (!link_is_ready_to_configure(link, false)) + return false; + + if (!link->master_set) + return false; + + if (link->master_ifindex <= 0 && streq_ptr(link->kind, "bridge")) + return true; /* The interface is bridge master. */ + + if (link_get_master(link, &master) < 0) + return false; + + if (!streq_ptr(master->kind, "bridge")) + return false; + + if (!IN_SET(master->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return false; + + if (master->set_flags_messages > 0) + return false; + + if (!link_has_carrier(master)) + return false; + + return true; +} + +static int bridge_mdb_process_request(Request *req, Link *link, void *userdata) { + BridgeMDB *mdb = ASSERT_PTR(userdata); + int r; + + assert(req); + assert(link); + + if (!bridge_mdb_is_ready_to_configure(link)) + return 0; + + r = bridge_mdb_configure(mdb, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure bridge MDB: %m"); + + return 1; +} + +int link_request_static_bridge_mdb(Link *link) { + BridgeMDB *mdb; + int r; + + assert(link); + assert(link->manager); + + link->static_bridge_mdb_configured = false; + + if (!link->network) + return 0; + + if (hashmap_isempty(link->network->bridge_mdb_entries_by_section)) + goto finish; + + HASHMAP_FOREACH(mdb, link->network->bridge_mdb_entries_by_section) { + r = link_queue_request_full(link, REQUEST_TYPE_BRIDGE_MDB, + mdb, NULL, + trivial_hash_func, + trivial_compare_func, + bridge_mdb_process_request, + &link->static_bridge_mdb_messages, + bridge_mdb_configure_handler, + NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request MDB entry to multicast group database: 
%m"); + } + +finish: + if (link->static_bridge_mdb_messages == 0) { + link->static_bridge_mdb_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Setting bridge MDB entries."); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + +static int bridge_mdb_verify(BridgeMDB *mdb) { + if (section_is_invalid(mdb->section)) + return -EINVAL; + + if (mdb->family == AF_UNSPEC) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: [BridgeMDB] section without MulticastGroupAddress= field configured. " + "Ignoring [BridgeMDB] section from line %u.", + mdb->section->filename, mdb->section->line); + + if (!in_addr_is_multicast(mdb->family, &mdb->group_addr)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: MulticastGroupAddress= is not a multicast address. " + "Ignoring [BridgeMDB] section from line %u.", + mdb->section->filename, mdb->section->line); + + if (mdb->family == AF_INET) { + if (in4_addr_is_local_multicast(&mdb->group_addr.in)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: MulticastGroupAddress= is a local multicast address. " + "Ignoring [BridgeMDB] section from line %u.", + mdb->section->filename, mdb->section->line); + } else { + if (in6_addr_is_link_local_all_nodes(&mdb->group_addr.in6)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: MulticastGroupAddress= is the multicast all nodes address. " + "Ignoring [BridgeMDB] section from line %u.", + mdb->section->filename, mdb->section->line); + } + + return 0; +} + +void network_drop_invalid_bridge_mdb_entries(Network *network) { + BridgeMDB *mdb; + + assert(network); + + HASHMAP_FOREACH(mdb, network->bridge_mdb_entries_by_section) + if (bridge_mdb_verify(mdb) < 0) + bridge_mdb_free(mdb); +} + +/* parse the VLAN Id from config files. 
*/ +int config_parse_mdb_vlan_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_mdb_free_or_set_invalidp) BridgeMDB *mdb = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = bridge_mdb_new_static(network, filename, section_line, &mdb); + if (r < 0) + return log_oom(); + + r = config_parse_vlanid(unit, filename, line, section, + section_line, lvalue, ltype, + rvalue, &mdb->vlan_id, userdata); + if (r < 0) + return r; + + TAKE_PTR(mdb); + return 0; +} + +/* parse the multicast group from config files. */ +int config_parse_mdb_group_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(bridge_mdb_free_or_set_invalidp) BridgeMDB *mdb = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = bridge_mdb_new_static(network, filename, section_line, &mdb); + if (r < 0) + return log_oom(); + + r = in_addr_from_string_auto(rvalue, &mdb->family, &mdb->group_addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Cannot parse multicast group address: %m"); + return 0; + } + + TAKE_PTR(mdb); + return 0; +} diff --git a/src/network/networkd-bridge-mdb.h b/src/network/networkd-bridge-mdb.h new file mode 100644 index 0000000..edea255 --- /dev/null +++ b/src/network/networkd-bridge-mdb.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Network Network; + +typedef struct 
BridgeMDB { + Network *network; + ConfigSection *section; + + int family; + union in_addr_union group_addr; + uint16_t vlan_id; +} BridgeMDB; + +BridgeMDB *bridge_mdb_free(BridgeMDB *mdb); + +void network_drop_invalid_bridge_mdb_entries(Network *network); + +int link_request_static_bridge_mdb(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_mdb_group_address); +CONFIG_PARSER_PROTOTYPE(config_parse_mdb_vlan_id); diff --git a/src/network/networkd-bridge-vlan.c b/src/network/networkd-bridge-vlan.c new file mode 100644 index 0000000..36e3610 --- /dev/null +++ b/src/network/networkd-bridge-vlan.c @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2016 BISDN GmbH. All rights reserved. +***/ + +#include +#include +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "networkd-bridge-vlan.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "parse-util.h" +#include "vlan-util.h" + +static bool is_bit_set(unsigned bit, uint32_t scope) { + assert(bit < sizeof(scope)*8); + return scope & (UINT32_C(1) << bit); +} + +static void set_bit(unsigned nr, uint32_t *addr) { + if (nr < BRIDGE_VLAN_BITMAP_MAX) + addr[nr / 32] |= (UINT32_C(1) << (nr % 32)); +} + +static int find_next_bit(int i, uint32_t x) { + int j; + + if (i >= 32) + return -1; + + /* find first bit */ + if (i < 0) + return BUILTIN_FFS_U32(x); + + /* mask off prior finds to get next */ + j = __builtin_ffs(x >> i); + return j ? 
j + i : 0; +} + +int bridge_vlan_append_info( + const Link *link, + sd_netlink_message *req, + uint16_t pvid, + const uint32_t *br_vid_bitmap, + const uint32_t *br_untagged_bitmap) { + + struct bridge_vlan_info br_vlan; + bool done, untagged = false; + uint16_t begin, end; + int r, cnt; + + assert(link); + assert(req); + assert(br_vid_bitmap); + assert(br_untagged_bitmap); + + cnt = 0; + + begin = end = UINT16_MAX; + for (int k = 0; k < BRIDGE_VLAN_BITMAP_LEN; k++) { + uint32_t untagged_map = br_untagged_bitmap[k]; + uint32_t vid_map = br_vid_bitmap[k]; + unsigned base_bit = k * 32; + int i = -1; + + done = false; + do { + int j = find_next_bit(i, vid_map); + if (j > 0) { + /* first hit of any bit */ + if (begin == UINT16_MAX && end == UINT16_MAX) { + begin = end = j - 1 + base_bit; + untagged = is_bit_set(j - 1, untagged_map); + goto next; + } + + /* this bit is a continuation of prior bits */ + if (j - 2 + base_bit == end && untagged == is_bit_set(j - 1, untagged_map) && (uint16_t)j - 1 + base_bit != pvid && (uint16_t)begin != pvid) { + end++; + goto next; + } + } else + done = true; + + if (begin != UINT16_MAX) { + cnt++; + if (done && k < BRIDGE_VLAN_BITMAP_LEN - 1) + break; + + br_vlan.flags = 0; + if (untagged) + br_vlan.flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + if (begin == end) { + br_vlan.vid = begin; + + if (begin == pvid) + br_vlan.flags |= BRIDGE_VLAN_INFO_PVID; + + r = sd_netlink_message_append_data(req, IFLA_BRIDGE_VLAN_INFO, &br_vlan, sizeof(br_vlan)); + if (r < 0) + return r; + } else { + br_vlan.vid = begin; + br_vlan.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + + r = sd_netlink_message_append_data(req, IFLA_BRIDGE_VLAN_INFO, &br_vlan, sizeof(br_vlan)); + if (r < 0) + return r; + + br_vlan.vid = end; + br_vlan.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; + br_vlan.flags |= BRIDGE_VLAN_INFO_RANGE_END; + + r = sd_netlink_message_append_data(req, IFLA_BRIDGE_VLAN_INFO, &br_vlan, sizeof(br_vlan)); + if (r < 0) + return r; + } + + if (done) + break; + } + if (j > 
0) { + begin = end = j - 1 + base_bit; + untagged = is_bit_set(j - 1, untagged_map); + } + + next: + i = j; + } while (!done); + } + + assert(cnt > 0); + return cnt; +} + +void network_adjust_bridge_vlan(Network *network) { + assert(network); + + if (!network->use_br_vlan) + return; + + /* pvid might not be in br_vid_bitmap yet */ + if (network->pvid) + set_bit(network->pvid, network->br_vid_bitmap); +} + +int config_parse_brvlan_pvid( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + uint16_t pvid; + int r; + + r = parse_vlanid(rvalue, &pvid); + if (r < 0) + return r; + + network->pvid = pvid; + network->use_br_vlan = true; + + return 0; +} + +int config_parse_brvlan_vlan( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + uint16_t vid, vid_end; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = parse_vid_range(rvalue, &vid, &vid_end); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse VLAN, ignoring: %s", rvalue); + return 0; + } + + for (; vid <= vid_end; vid++) + set_bit(vid, network->br_vid_bitmap); + + network->use_br_vlan = true; + return 0; +} + +int config_parse_brvlan_untagged( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + uint16_t vid, vid_end; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = parse_vid_range(rvalue, &vid, &vid_end); + if (r < 0) { + 
log_syntax(unit, LOG_WARNING, filename, line, r, "Could not parse VLAN: %s", rvalue); + return 0; + } + + for (; vid <= vid_end; vid++) { + set_bit(vid, network->br_vid_bitmap); + set_bit(vid, network->br_untagged_bitmap); + } + + network->use_br_vlan = true; + return 0; +} diff --git a/src/network/networkd-bridge-vlan.h b/src/network/networkd-bridge-vlan.h new file mode 100644 index 0000000..f44b810 --- /dev/null +++ b/src/network/networkd-bridge-vlan.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2016 BISDN GmbH. All rights reserved. +***/ + +#include + +#include "sd-netlink.h" + +#include "conf-parser.h" + +#define BRIDGE_VLAN_BITMAP_MAX 4096 +#define BRIDGE_VLAN_BITMAP_LEN (BRIDGE_VLAN_BITMAP_MAX / 32) + +typedef struct Link Link; +typedef struct Network Network; + +void network_adjust_bridge_vlan(Network *network); + +int bridge_vlan_append_info( + const Link * link, + sd_netlink_message *req, + uint16_t pvid, + const uint32_t *br_vid_bitmap, + const uint32_t *br_untagged_bitmap); + +CONFIG_PARSER_PROTOTYPE(config_parse_brvlan_pvid); +CONFIG_PARSER_PROTOTYPE(config_parse_brvlan_vlan); +CONFIG_PARSER_PROTOTYPE(config_parse_brvlan_untagged); diff --git a/src/network/networkd-can.c b/src/network/networkd-can.c new file mode 100644 index 0000000..b8a1871 --- /dev/null +++ b/src/network/networkd-can.c @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "networkd-can.h" +#include "networkd-link.h" +#include "networkd-network.h" +#include "networkd-setlink.h" +#include "parse-util.h" +#include "string-util.h" + +#define CAN_TERMINATION_DEFAULT_OHM_VALUE 120 + +int can_set_netlink_message(Link *link, sd_netlink_message *m) { + int r; + + assert(link); + assert(link->network); + assert(m); + + r = sd_netlink_message_set_flags(m, NLM_F_REQUEST | NLM_F_ACK); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return 
r; + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, link->kind); + if (r < 0) + return r; + + if (link->network->can_bitrate > 0) { + struct can_bittiming bt = { + .bitrate = link->network->can_bitrate, + .sample_point = link->network->can_sample_point, + .sjw = link->network->can_sync_jump_width, + }; + + log_link_debug(link, "Setting bitrate = %u bit/s", bt.bitrate); + if (link->network->can_sample_point > 0) + log_link_debug(link, "Setting sample point = %u.%u%%", bt.sample_point / 10, bt.sample_point % 10); + else + log_link_debug(link, "Using default sample point"); + + r = sd_netlink_message_append_data(m, IFLA_CAN_BITTIMING, &bt, sizeof(bt)); + if (r < 0) + return r; + } else if (link->network->can_time_quanta_ns > 0) { + struct can_bittiming bt = { + .tq = link->network->can_time_quanta_ns, + .prop_seg = link->network->can_propagation_segment, + .phase_seg1 = link->network->can_phase_buffer_segment_1, + .phase_seg2 = link->network->can_phase_buffer_segment_2, + .sjw = link->network->can_sync_jump_width, + }; + + log_link_debug(link, "Setting time quanta = %"PRIu32" nsec", bt.tq); + r = sd_netlink_message_append_data(m, IFLA_CAN_BITTIMING, &bt, sizeof(bt)); + if (r < 0) + return r; + } + + if (link->network->can_data_bitrate > 0) { + struct can_bittiming bt = { + .bitrate = link->network->can_data_bitrate, + .sample_point = link->network->can_data_sample_point, + .sjw = link->network->can_data_sync_jump_width, + }; + + log_link_debug(link, "Setting data bitrate = %u bit/s", bt.bitrate); + if (link->network->can_data_sample_point > 0) + log_link_debug(link, "Setting data sample point = %u.%u%%", bt.sample_point / 10, bt.sample_point % 10); + else + log_link_debug(link, "Using default data sample point"); + + r = sd_netlink_message_append_data(m, IFLA_CAN_DATA_BITTIMING, &bt, sizeof(bt)); + if (r < 0) + return r; + } else if (link->network->can_data_time_quanta_ns > 0) { + struct can_bittiming bt = { + .tq = 
link->network->can_data_time_quanta_ns, + .prop_seg = link->network->can_data_propagation_segment, + .phase_seg1 = link->network->can_data_phase_buffer_segment_1, + .phase_seg2 = link->network->can_data_phase_buffer_segment_2, + .sjw = link->network->can_data_sync_jump_width, + }; + + log_link_debug(link, "Setting data time quanta = %"PRIu32" nsec", bt.tq); + r = sd_netlink_message_append_data(m, IFLA_CAN_DATA_BITTIMING, &bt, sizeof(bt)); + if (r < 0) + return r; + } + + if (link->network->can_restart_us > 0) { + uint64_t restart_ms; + + if (link->network->can_restart_us == USEC_INFINITY) + restart_ms = 0; + else + restart_ms = DIV_ROUND_UP(link->network->can_restart_us, USEC_PER_MSEC); + + log_link_debug(link, "Setting restart = %s", FORMAT_TIMESPAN(restart_ms * 1000, MSEC_PER_SEC)); + r = sd_netlink_message_append_u32(m, IFLA_CAN_RESTART_MS, restart_ms); + if (r < 0) + return r; + } + + if (link->network->can_control_mode_mask != 0) { + struct can_ctrlmode cm = { + .mask = link->network->can_control_mode_mask, + .flags = link->network->can_control_mode_flags, + }; + + r = sd_netlink_message_append_data(m, IFLA_CAN_CTRLMODE, &cm, sizeof(cm)); + if (r < 0) + return r; + } + + if (link->network->can_termination_set) { + log_link_debug(link, "Setting can-termination to '%u'.", link->network->can_termination); + + r = sd_netlink_message_append_u16(m, IFLA_CAN_TERMINATION, link->network->can_termination); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + return 0; +} + +int config_parse_can_bitrate( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *br = ASSERT_PTR(data); + uint64_t sz; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_size(rvalue, 1000, 
&sz); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse can bitrate '%s', ignoring: %m", rvalue); + return 0; + } + + /* Linux uses __u32 for bitrates, so the value should not exceed that. */ + if (sz <= 0 || sz > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Bit rate out of permitted range 1...4294967295"); + return 0; + } + + *br = (uint32_t) sz; + + return 0; +} + +int config_parse_can_time_quanta( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + nsec_t val, *tq = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_nsec(rvalue, &val); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse can time quanta '%s', ignoring: %m", rvalue); + return 0; + } + + /* Linux uses __u32 for bitrates, so the value should not exceed that. 
*/ + if (val <= 0 || val > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Time quanta out of permitted range 1...4294967295"); + return 0; + } + + *tq = val; + return 0; +} + +int config_parse_can_restart_usec( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t usec, *restart_usec = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse CAN restart sec '%s', ignoring: %m", rvalue); + return 0; + } + + if (usec != USEC_INFINITY && + DIV_ROUND_UP(usec, USEC_PER_MSEC) > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "CAN RestartSec= must be in the range 0...%"PRIu32"ms, ignoring: %s", UINT32_MAX, rvalue); + return 0; + } + + *restart_usec = usec; + return 0; +} + +int config_parse_can_control_mode( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + uint32_t mask = ltype; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(mask != 0); + + if (isempty(rvalue)) { + network->can_control_mode_mask &= ~mask; + network->can_control_mode_flags &= ~mask; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse CAN control mode '%s', ignoring: %s", lvalue, rvalue); + return 0; + } + + network->can_control_mode_mask |= mask; + SET_FLAG(network->can_control_mode_flags, mask, r); + return 0; +} + +int config_parse_can_termination( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const 
char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + network->can_termination_set = false; + return 0; + } + + /* Note that 0 termination ohm value means no termination resistor, and there is no conflict + * between parse_boolean() and safe_atou16() when Termination=0. However, Termination=1 must be + * treated as 1 ohm, instead of true (and then the default ohm value). So, we need to parse the + * string with safe_atou16() at first. */ + + r = safe_atou16(rvalue, &network->can_termination); + if (r < 0) { + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse CAN termination value, ignoring: %s", rvalue); + return 0; + } + + network->can_termination = r ? CAN_TERMINATION_DEFAULT_OHM_VALUE : 0; + } + + network->can_termination_set = true; + return 0; +} diff --git a/src/network/networkd-can.h b/src/network/networkd-can.h new file mode 100644 index 0000000..3945082 --- /dev/null +++ b/src/network/networkd-can.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-netlink.h" + +#include "conf-parser.h" + +typedef struct Link Link; + +int can_set_netlink_message(Link *link, sd_netlink_message *m); + +CONFIG_PARSER_PROTOTYPE(config_parse_can_bitrate); +CONFIG_PARSER_PROTOTYPE(config_parse_can_time_quanta); +CONFIG_PARSER_PROTOTYPE(config_parse_can_restart_usec); +CONFIG_PARSER_PROTOTYPE(config_parse_can_control_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_can_termination); diff --git a/src/network/networkd-conf.c b/src/network/networkd-conf.c new file mode 100644 index 0000000..063732a --- /dev/null +++ b/src/network/networkd-conf.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Vinay Kulkarni + ***/ + +#include "conf-parser.h" +#include 
"constants.h" +#include "networkd-conf.h" +#include "networkd-manager.h" +#include "networkd-speed-meter.h" + +int manager_parse_config_file(Manager *m) { + int r; + + assert(m); + + r = config_parse_config_file("networkd.conf", + "Network\0" + "DHCPv4\0" + "DHCPv6\0" + "DHCP\0", + config_item_perf_lookup, networkd_gperf_lookup, + CONFIG_PARSE_WARN, + m); + if (r < 0) + return r; + + if (m->use_speed_meter && m->speed_meter_interval_usec < SPEED_METER_MINIMUM_TIME_INTERVAL) { + log_warning("SpeedMeterIntervalSec= is too small, using %s.", + FORMAT_TIMESPAN(SPEED_METER_MINIMUM_TIME_INTERVAL, USEC_PER_SEC)); + m->speed_meter_interval_usec = SPEED_METER_MINIMUM_TIME_INTERVAL; + } + + return 0; +} diff --git a/src/network/networkd-conf.h b/src/network/networkd-conf.h new file mode 100644 index 0000000..6f8612a --- /dev/null +++ b/src/network/networkd-conf.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2014 Vinay Kulkarni +***/ + +#include "conf-parser.h" + +typedef struct Manager Manager; + +int manager_parse_config_file(Manager *m); + +const struct ConfigPerfItem* networkd_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/network/networkd-dhcp-common.c b/src/network/networkd-dhcp-common.c new file mode 100644 index 0000000..080b153 --- /dev/null +++ b/src/network/networkd-dhcp-common.c @@ -0,0 +1,1489 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "bus-error.h" +#include "bus-locator.h" +#include "dhcp-identifier.h" +#include "dhcp-option.h" +#include "dhcp6-internal.h" +#include "escape.h" +#include "hexdecoct.h" +#include "in-addr-prefix-util.h" +#include "networkd-dhcp-common.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-route-util.h" +#include "parse-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "strv.h" +#include "vrf.h" + +static uint32_t 
link_get_vrf_table(Link *link) { + assert(link); + assert(link->network); + + return link->network->vrf ? VRF(link->network->vrf)->table : RT_TABLE_MAIN; +} + +uint32_t link_get_dhcp4_route_table(Link *link) { + assert(link); + assert(link->network); + + /* When the interface is part of an VRF use the VRFs routing table, unless + * another table is explicitly specified. */ + + if (link->network->dhcp_route_table_set) + return link->network->dhcp_route_table; + return link_get_vrf_table(link); +} + +uint32_t link_get_ipv6_accept_ra_route_table(Link *link) { + assert(link); + assert(link->network); + + if (link->network->ipv6_accept_ra_route_table_set) + return link->network->ipv6_accept_ra_route_table; + return link_get_vrf_table(link); +} + +bool link_dhcp_enabled(Link *link, int family) { + assert(link); + assert(IN_SET(family, AF_INET, AF_INET6)); + + /* Currently, sd-dhcp-client supports only ethernet and infiniband. */ + if (family == AF_INET && !IN_SET(link->iftype, ARPHRD_ETHER, ARPHRD_INFINIBAND)) + return false; + + if (family == AF_INET6 && !socket_ipv6_is_supported()) + return false; + + if (link->flags & IFF_LOOPBACK) + return false; + + if (link->iftype == ARPHRD_CAN) + return false; + + if (!link->network) + return false; + + return link->network->dhcp & (family == AF_INET ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6); +} + +void network_adjust_dhcp(Network *network) { + assert(network); + assert(network->dhcp >= 0); + + if (network->dhcp == ADDRESS_FAMILY_NO) + return; + + /* Bonding slave does not support addressing. */ + if (network->bond) { + log_warning("%s: Cannot enable DHCP= when Bond= is specified, disabling DHCP=.", + network->filename); + network->dhcp = ADDRESS_FAMILY_NO; + return; + } + + if (!FLAGS_SET(network->link_local, ADDRESS_FAMILY_IPV6) && + FLAGS_SET(network->dhcp, ADDRESS_FAMILY_IPV6)) { + log_warning("%s: DHCPv6 client is enabled but IPv6 link-local addressing is disabled. 
" + "Disabling DHCPv6 client.", network->filename); + SET_FLAG(network->dhcp, ADDRESS_FAMILY_IPV6, false); + } + + network_adjust_dhcp4(network); +} + +static bool duid_needs_product_uuid(const DUID *duid) { + assert(duid); + + return duid->type == DUID_TYPE_UUID && duid->raw_data_len == 0; +} + +static const struct DUID fallback_duid = { .type = DUID_TYPE_EN }; + +const DUID *link_get_duid(Link *link, int family) { + const DUID *duid; + + assert(link); + assert(IN_SET(family, AF_INET, AF_INET6)); + + if (link->network) { + duid = family == AF_INET ? &link->network->dhcp_duid : &link->network->dhcp6_duid; + if (duid->type != _DUID_TYPE_INVALID) { + if (duid_needs_product_uuid(duid)) + return &link->manager->duid_product_uuid; + else + return duid; + } + } + + duid = family == AF_INET ? &link->manager->dhcp_duid : &link->manager->dhcp6_duid; + if (link->hw_addr.length == 0 && IN_SET(duid->type, DUID_TYPE_LLT, DUID_TYPE_LL)) + /* Fallback to DUID that works without MAC address. + * This is useful for tunnel devices without MAC address. */ + return &fallback_duid; + + return duid; +} + +static int get_product_uuid_handler(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + Manager *manager = ASSERT_PTR(userdata); + const sd_bus_error *e; + const void *a; + size_t sz; + int r; + + assert(m); + + /* To avoid calling GetProductUUID() bus method so frequently, set the flag below + * even if the method fails. */ + manager->has_product_uuid = true; + + e = sd_bus_message_get_error(m); + if (e) { + r = sd_bus_error_get_errno(e); + log_warning_errno(r, "Could not get product UUID. Falling back to use machine-app-specific ID as DUID-UUID: %s", + bus_error_message(e, r)); + return 0; + } + + r = sd_bus_message_read_array(m, 'y', &a, &sz); + if (r < 0) { + log_warning_errno(r, "Failed to get product UUID. Falling back to use machine-app-specific ID as DUID-UUID: %m"); + return 0; + } + + if (sz != sizeof(sd_id128_t)) { + log_warning("Invalid product UUID. 
Falling back to use machine-app-specific ID as DUID-UUID."); + return 0; + } + + log_debug("Successfully obtained product UUID"); + + memcpy(&manager->duid_product_uuid.raw_data, a, sz); + manager->duid_product_uuid.raw_data_len = sz; + + return 0; +} + +int manager_request_product_uuid(Manager *m) { + static bool bus_method_is_called = false; + int r; + + assert(m); + + if (bus_method_is_called) + return 0; + + if (sd_bus_is_ready(m->bus) <= 0 && !m->product_uuid_requested) { + log_debug("Not connected to system bus, requesting product UUID later."); + m->product_uuid_requested = true; + return 0; + } + + m->product_uuid_requested = false; + + r = bus_call_method_async( + m->bus, + NULL, + bus_hostname, + "GetProductUUID", + get_product_uuid_handler, + m, + "b", + false); + if (r < 0) + return log_warning_errno(r, "Failed to get product UUID: %m"); + + log_debug("Requesting product UUID."); + + bus_method_is_called = true; + + return 0; +} + +int dhcp_configure_duid(Link *link, const DUID *duid) { + Manager *m; + int r; + + assert(link); + assert(link->manager); + assert(duid); + + m = link->manager; + + if (!duid_needs_product_uuid(duid)) + return 1; + + if (m->has_product_uuid) + return 1; + + r = manager_request_product_uuid(m); + if (r < 0) { + log_link_warning_errno(link, r, + "Failed to get product UUID. Falling back to use machine-app-specific ID as DUID-UUID: %m"); + + m->has_product_uuid = true; /* Do not request UUID again on failure. 
*/ + return 1; + } + + return 0; +} + +bool address_is_filtered(int family, const union in_addr_union *address, uint8_t prefixlen, Set *allow_list, Set *deny_list) { + struct in_addr_prefix *p; + + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + if (allow_list) { + SET_FOREACH(p, allow_list) + if (p->family == family && + p->prefixlen <= prefixlen && + in_addr_prefix_covers(family, &p->address, p->prefixlen, address) > 0) + return false; + + return true; + } + + SET_FOREACH(p, deny_list) + if (p->family == family && + in_addr_prefix_intersect(family, &p->address, p->prefixlen, address, prefixlen) > 0) + return true; + + return false; +} + +int link_get_captive_portal(Link *link, const char **ret) { + const char *dhcp4_cp = NULL, *dhcp6_cp = NULL, *ndisc_cp = NULL; + int r; + + assert(link); + + if (!link->network) { + *ret = NULL; + return 0; + } + + if (link->network->dhcp_use_captive_portal && link->dhcp_lease) { + r = sd_dhcp_lease_get_captive_portal(link->dhcp_lease, &dhcp4_cp); + if (r < 0 && r != -ENODATA) + return r; + } + + if (link->network->dhcp6_use_captive_portal && link->dhcp6_lease) { + r = sd_dhcp6_lease_get_captive_portal(link->dhcp6_lease, &dhcp6_cp); + if (r < 0 && r != -ENODATA) + return r; + } + + if (link->network->ipv6_accept_ra_use_captive_portal) { + NDiscCaptivePortal *cp; + usec_t usec = 0; + + /* Use the captive portal with the longest lifetime. 
*/ + + SET_FOREACH(cp, link->ndisc_captive_portals) { + if (cp->lifetime_usec < usec) + continue; + + ndisc_cp = cp->captive_portal; + usec = cp->lifetime_usec; + } + + if (set_size(link->ndisc_captive_portals) > 1) + log_link_debug(link, "Multiple captive portals obtained by IPv6RA, using \"%s\" and ignoring others.", + ndisc_cp); + } + + if (dhcp4_cp) { + if (dhcp6_cp && !streq(dhcp4_cp, dhcp6_cp)) + log_link_debug(link, "DHCPv6 captive portal (%s) does not match DHCPv4 (%s), ignoring DHCPv6 captive portal.", + dhcp6_cp, dhcp4_cp); + + if (ndisc_cp && !streq(dhcp4_cp, ndisc_cp)) + log_link_debug(link, "IPv6RA captive portal (%s) does not match DHCPv4 (%s), ignoring IPv6RA captive portal.", + ndisc_cp, dhcp4_cp); + + *ret = dhcp4_cp; + return 1; + } + + if (dhcp6_cp) { + if (ndisc_cp && !streq(dhcp6_cp, ndisc_cp)) + log_link_debug(link, "IPv6RA captive portal (%s) does not match DHCPv6 (%s), ignoring IPv6RA captive portal.", + ndisc_cp, dhcp6_cp); + + *ret = dhcp6_cp; + return 1; + } + + *ret = ndisc_cp; + return !!ndisc_cp; +} + +int config_parse_dhcp( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + AddressFamily *dhcp = data, s; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + /* Note that this is mostly like + * config_parse_address_family(), except that it + * understands some old names for the enum values */ + + s = address_family_from_string(rvalue); + if (s < 0) { + + /* Previously, we had a slightly different enum here, + * support its values for compatibility. 
*/ + + s = dhcp_deprecated_address_family_from_string(rvalue); + if (s < 0) { + log_syntax(unit, LOG_WARNING, filename, line, s, + "Failed to parse DHCP option, ignoring: %s", rvalue); + return 0; + } + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "DHCP=%s is deprecated, please use DHCP=%s instead.", + rvalue, address_family_to_string(s)); + } + + *dhcp = s; + return 0; +} + +int config_parse_dhcp_route_metric( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + uint32_t metric; + int r; + + assert(filename); + assert(lvalue); + assert(IN_SET(ltype, AF_UNSPEC, AF_INET)); + assert(rvalue); + assert(data); + + r = safe_atou32(rvalue, &metric); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse RouteMetric=%s, ignoring assignment: %m", rvalue); + return 0; + } + + switch (ltype) { + case AF_INET: + network->dhcp_route_metric = metric; + network->dhcp_route_metric_set = true; + break; + case AF_UNSPEC: + /* For backward compatibility. 
*/ + if (!network->dhcp_route_metric_set) + network->dhcp_route_metric = metric; + if (!network->ipv6_accept_ra_route_metric_set) { + network->ipv6_accept_ra_route_metric_high = metric; + network->ipv6_accept_ra_route_metric_medium = metric; + network->ipv6_accept_ra_route_metric_low = metric; + } + break; + default: + assert_not_reached(); + } + + return 0; +} + +int config_parse_ipv6_accept_ra_route_metric( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); /* parses the route metric(s) from rvalue into network; invalid input is logged and ignored (returns 0) */ + uint32_t metric_high, metric_medium, metric_low; + int r, s, t; + + assert(filename); + assert(rvalue); + + if (safe_atou32(rvalue, &metric_low) >= 0) /* a single number is used for all three (high, medium, low) metrics */ + metric_high = metric_medium = metric_low; + else { /* otherwise exactly three colon-separated numbers are expected, in the order high:medium:low */ + _cleanup_free_ char *high = NULL, *medium = NULL, *low = NULL; + const char *p = rvalue; + + r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &high, &medium, &low, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r != 3 || !isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, r < 0 ? r : 0, + "Failed to parse RouteMetric=%s, ignoring assignment: %m", rvalue); + return 0; + } + + r = safe_atou32(high, &metric_high); + s = safe_atou32(medium, &metric_medium); + t = safe_atou32(low, &metric_low); + if (r < 0 || s < 0 || t < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r < 0 ? r : s < 0 ? 
s : t, + "Failed to parse RouteMetric=%s, ignoring assignment: %m", rvalue); + return 0; + } + + if (metric_high >= metric_medium || metric_medium >= metric_low) { /* the three metrics must be strictly increasing from high to low */ + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid RouteMetric=%s, ignoring assignment.", rvalue); + return 0; + } + } + + network->ipv6_accept_ra_route_metric_high = metric_high; + network->ipv6_accept_ra_route_metric_medium = metric_medium; + network->ipv6_accept_ra_route_metric_low = metric_low; + network->ipv6_accept_ra_route_metric_set = true; + + return 0; +} + +int config_parse_dhcp_send_hostname( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(IN_SET(ltype, AF_UNSPEC, AF_INET, AF_INET6)); + assert(rvalue); + assert(data); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse SendHostname=%s, ignoring assignment: %m", rvalue); + return 0; + } + + switch (ltype) { + case AF_INET: + network->dhcp_send_hostname = r; + network->dhcp_send_hostname_set = true; + break; + case AF_INET6: + network->dhcp6_send_hostname = r; + network->dhcp6_send_hostname_set = true; + break; + case AF_UNSPEC: + /* For backward compatibility. 
*/ + if (!network->dhcp_send_hostname_set) + network->dhcp_send_hostname = r; + if (!network->dhcp6_send_hostname_set) + network->dhcp6_send_hostname = r; + break; + default: + assert_not_reached(); + } + + return 0; +} +int config_parse_dhcp_use_dns( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(IN_SET(ltype, AF_UNSPEC, AF_INET, AF_INET6)); + assert(rvalue); + assert(data); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse UseDNS=%s, ignoring assignment: %m", rvalue); + return 0; + } + + switch (ltype) { + case AF_INET: + network->dhcp_use_dns = r; + network->dhcp_use_dns_set = true; + break; + case AF_INET6: + network->dhcp6_use_dns = r; + network->dhcp6_use_dns_set = true; + break; + case AF_UNSPEC: + /* For backward compatibility. 
*/ + if (!network->dhcp_use_dns_set) + network->dhcp_use_dns = r; + if (!network->dhcp6_use_dns_set) + network->dhcp6_use_dns = r; + break; + default: + assert_not_reached(); + } + + return 0; +} + +int config_parse_dhcp_use_domains( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + DHCPUseDomains d; + + assert(filename); + assert(lvalue); + assert(IN_SET(ltype, AF_UNSPEC, AF_INET, AF_INET6)); + assert(rvalue); + assert(data); + + d = dhcp_use_domains_from_string(rvalue); + if (d < 0) { + log_syntax(unit, LOG_WARNING, filename, line, d, + "Failed to parse %s=%s, ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + switch (ltype) { + case AF_INET: + network->dhcp_use_domains = d; + network->dhcp_use_domains_set = true; + break; + case AF_INET6: + network->dhcp6_use_domains = d; + network->dhcp6_use_domains_set = true; + break; + case AF_UNSPEC: + /* For backward compatibility. 
*/ + if (!network->dhcp_use_domains_set) + network->dhcp_use_domains = d; + if (!network->dhcp6_use_domains_set) + network->dhcp6_use_domains = d; + break; + default: + assert_not_reached(); + } + + return 0; +} + +int config_parse_dhcp_use_ntp( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(IN_SET(ltype, AF_UNSPEC, AF_INET, AF_INET6)); + assert(rvalue); + assert(data); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse UseNTP=%s, ignoring assignment: %m", rvalue); + return 0; + } + + switch (ltype) { + case AF_INET: + network->dhcp_use_ntp = r; + network->dhcp_use_ntp_set = true; + break; + case AF_INET6: + network->dhcp6_use_ntp = r; + network->dhcp6_use_ntp_set = true; + break; + case AF_UNSPEC: + /* For backward compatibility. 
*/ + if (!network->dhcp_use_ntp_set) + network->dhcp_use_ntp = r; + if (!network->dhcp6_use_ntp_set) + network->dhcp6_use_ntp = r; + break; + default: + assert_not_reached(); + } + + return 0; +} + +int config_parse_dhcp_or_ra_route_table( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + uint32_t rt; + int r; + + assert(filename); + assert(lvalue); + assert(IN_SET(ltype, AF_INET, AF_INET6)); + assert(rvalue); + + r = manager_get_route_table_from_string(network->manager, rvalue, &rt); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse RouteTable=%s, ignoring assignment: %m", rvalue); + return 0; + } + + switch (ltype) { + case AF_INET: + network->dhcp_route_table = rt; + network->dhcp_route_table_set = true; + break; + case AF_INET6: + network->ipv6_accept_ra_route_table = rt; + network->ipv6_accept_ra_route_table_set = true; + break; + default: + assert_not_reached(); + } + + return 0; +} + +int config_parse_iaid( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + uint32_t iaid; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(IN_SET(ltype, AF_INET, AF_INET6)); + + r = safe_atou32(rvalue, &iaid); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Unable to read IAID, ignoring assignment: %s", rvalue); + return 0; + } + + if (ltype == AF_INET) { + network->dhcp_iaid = iaid; + network->dhcp_iaid_set = true; + if (!network->dhcp6_iaid_set_explicitly) { + /* Backward compatibility. Previously, IAID is shared by DHCPv4 and DHCPv6. 
+ * If DHCPv6 IAID is not specified explicitly, then use DHCPv4 IAID for DHCPv6. */ + network->dhcp6_iaid = iaid; + network->dhcp6_iaid_set = true; + } + } else { + assert(ltype == AF_INET6); + network->dhcp6_iaid = iaid; + network->dhcp6_iaid_set = true; + network->dhcp6_iaid_set_explicitly = true; + } + + return 0; +} + +int config_parse_dhcp_user_or_vendor_class( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***l = ASSERT_PTR(data); + int r; + + assert(lvalue); + assert(rvalue); + assert(IN_SET(ltype, AF_INET, AF_INET6)); + + if (isempty(rvalue)) { + *l = strv_free(*l); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + size_t len; + + r = extract_first_word(&p, &w, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to split user classes option, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + len = strlen(w); + if (ltype == AF_INET) { + if (len > UINT8_MAX || len == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s length is not in the range 1…255, ignoring.", w); + continue; + } + } else { + if (len > UINT16_MAX || len == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s length is not in the range 1…65535, ignoring.", w); + continue; + } + } + + r = strv_consume(l, TAKE_PTR(w)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_dhcp_send_option( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sd_dhcp_option_unrefp) sd_dhcp_option *opt4 = NULL; + _cleanup_(sd_dhcp6_option_unrefp) sd_dhcp6_option *opt6 = NULL; + _unused_ 
_cleanup_(sd_dhcp_option_unrefp) sd_dhcp_option *old4 = NULL; + _unused_ _cleanup_(sd_dhcp6_option_unrefp) sd_dhcp6_option *old6 = NULL; + uint32_t uint32_data, enterprise_identifier = 0; + _cleanup_free_ char *word = NULL, *q = NULL; + OrderedHashmap **options = ASSERT_PTR(data); + uint16_t u16, uint16_data; + union in_addr_union addr; + DHCPOptionDataType type; + uint8_t u8, uint8_data; + const void *udata; + const char *p; + ssize_t sz; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { /* empty assignment: drop all options stored so far */ + *options = ordered_hashmap_free(*options); + return 0; + } + + p = rvalue; + if (ltype == AF_INET6 && streq(lvalue, "SendVendorOption")) { /* DHCPv6 vendor options carry a leading enterprise identifier field */ + r = extract_first_word(&p, &word, ":", 0); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid DHCP option, ignoring assignment: %s", rvalue); + return 0; + } + + r = safe_atou32(word, &enterprise_identifier); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DHCPv6 enterprise identifier data, ignoring assignment: %s", rvalue); + return 0; + } + word = mfree(word); + } + + r = extract_first_word(&p, &word, ":", 0); /* next field: the option code */ + if (r == -ENOMEM) + return log_oom(); + if (r <= 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid DHCP option, ignoring assignment: %s", rvalue); + return 0; + } + + if (ltype == AF_INET6) { /* DHCPv6 option codes are parsed as 16 bit values, all others as 8 bit values */ + r = safe_atou16(word, &u16); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid DHCP option, ignoring assignment: %s", rvalue); + return 0; + } + if (u16 < 1 || u16 >= UINT16_MAX) { /* both 0 and UINT16_MAX are refused */ + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid DHCP option, valid range is 1-65534, ignoring assignment: %s", rvalue); + return 0; + } + } else { + r = safe_atou8(word, &u8); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid DHCP option, ignoring assignment: %s", rvalue); + return 0; + } + if (u8 < 1 || u8 >= 
UINT8_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid DHCP option, valid range is 1-254, ignoring assignment: %s", rvalue); + return 0; + } + } + + word = mfree(word); + r = extract_first_word(&p, &word, ":", 0); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0 || isempty(p)) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid DHCP option, ignoring assignment: %s", rvalue); + return 0; + } + + type = dhcp_option_data_type_from_string(word); + if (type < 0) { + log_syntax(unit, LOG_WARNING, filename, line, type, + "Invalid DHCP option data type, ignoring assignment: %s", p); + return 0; + } + + switch (type) { + case DHCP_OPTION_DATA_UINT8:{ + r = safe_atou8(p, &uint8_data); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DHCP uint8 data, ignoring assignment: %s", p); + return 0; + } + + udata = &uint8_data; + sz = sizeof(uint8_t); + break; + } + case DHCP_OPTION_DATA_UINT16:{ + uint16_t k; + + r = safe_atou16(p, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DHCP uint16 data, ignoring assignment: %s", p); + return 0; + } + + uint16_data = htobe16(k); + udata = &uint16_data; + sz = sizeof(uint16_t); + break; + } + case DHCP_OPTION_DATA_UINT32: { + uint32_t k; + + r = safe_atou32(p, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DHCP uint32 data, ignoring assignment: %s", p); + return 0; + } + + uint32_data = htobe32(k); + udata = &uint32_data; + sz = sizeof(uint32_t); + + break; + } + case DHCP_OPTION_DATA_IPV4ADDRESS: { + r = in_addr_from_string(AF_INET, p, &addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DHCP ipv4address data, ignoring assignment: %s", p); + return 0; + } + + udata = &addr.in; + sz = sizeof(addr.in.s_addr); + break; + } + case DHCP_OPTION_DATA_IPV6ADDRESS: { + r = in_addr_from_string(AF_INET6, p, &addr); + if (r < 0) { + log_syntax(unit, 
LOG_WARNING, filename, line, r, + "Failed to parse DHCP ipv6address data, ignoring assignment: %s", p); + return 0; + } + + udata = &addr.in6; + sz = sizeof(addr.in6.s6_addr); + break; + } + case DHCP_OPTION_DATA_STRING: + sz = cunescape(p, UNESCAPE_ACCEPT_NUL, &q); + if (sz < 0) { + log_syntax(unit, LOG_WARNING, filename, line, sz, + "Failed to decode DHCP option data, ignoring assignment: %s", p); + return 0; + } + + udata = q; + break; + default: + return -EINVAL; + } + + if (ltype == AF_INET6) { + r = sd_dhcp6_option_new(u16, udata, sz, enterprise_identifier, &opt6); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store DHCP option '%s', ignoring assignment: %m", rvalue); + return 0; + } + + r = ordered_hashmap_ensure_allocated(options, &dhcp6_option_hash_ops); + if (r < 0) + return log_oom(); + + /* Overwrite existing option */ + old6 = ordered_hashmap_get(*options, UINT_TO_PTR(u16)); + r = ordered_hashmap_replace(*options, UINT_TO_PTR(u16), opt6); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store DHCP option '%s', ignoring assignment: %m", rvalue); + return 0; + } + TAKE_PTR(opt6); + } else { + r = sd_dhcp_option_new(u8, udata, sz, &opt4); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store DHCP option '%s', ignoring assignment: %m", rvalue); + return 0; + } + + r = ordered_hashmap_ensure_allocated(options, &dhcp_option_hash_ops); + if (r < 0) + return log_oom(); + + /* Overwrite existing option */ + old4 = ordered_hashmap_get(*options, UINT_TO_PTR(u8)); + r = ordered_hashmap_replace(*options, UINT_TO_PTR(u8), opt4); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store DHCP option '%s', ignoring assignment: %m", rvalue); + return 0; + } + TAKE_PTR(opt4); + } + return 0; +} + +int config_parse_dhcp_request_options( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const 
char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + if (ltype == AF_INET) + network->dhcp_request_options = set_free(network->dhcp_request_options); + else + network->dhcp6_request_options = set_free(network->dhcp6_request_options); + + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *n = NULL; + uint32_t i; + + r = extract_first_word(&p, &n, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DHCP request option, ignoring assignment: %s", + rvalue); + return 0; + } + if (r == 0) + return 0; + + r = safe_atou32(n, &i); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "DHCP request option is invalid, ignoring assignment: %s", n); + continue; + } + + if (i < 1 || i >= UINT8_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "DHCP request option is invalid, valid range is 1-254, ignoring assignment: %s", n); + continue; + } + + r = set_ensure_put(ltype == AF_INET ? 
&network->dhcp_request_options : &network->dhcp6_request_options, + NULL, UINT32_TO_PTR(i)); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store DHCP request option '%s', ignoring assignment: %m", n); + } +} + +static const char* const dhcp_use_domains_table[_DHCP_USE_DOMAINS_MAX] = { + [DHCP_USE_DOMAINS_NO] = "no", + [DHCP_USE_DOMAINS_ROUTE] = "route", + [DHCP_USE_DOMAINS_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dhcp_use_domains, DHCPUseDomains, DHCP_USE_DOMAINS_YES); + +static const char * const dhcp_option_data_type_table[_DHCP_OPTION_DATA_MAX] = { + [DHCP_OPTION_DATA_UINT8] = "uint8", + [DHCP_OPTION_DATA_UINT16] = "uint16", + [DHCP_OPTION_DATA_UINT32] = "uint32", + [DHCP_OPTION_DATA_STRING] = "string", + [DHCP_OPTION_DATA_IPV4ADDRESS] = "ipv4address", + [DHCP_OPTION_DATA_IPV6ADDRESS] = "ipv6address", +}; + +DEFINE_STRING_TABLE_LOOKUP(dhcp_option_data_type, DHCPOptionDataType); + +static const char* const duid_type_table[_DUID_TYPE_MAX] = { + [DUID_TYPE_LLT] = "link-layer-time", + [DUID_TYPE_EN] = "vendor", + [DUID_TYPE_LL] = "link-layer", + [DUID_TYPE_UUID] = "uuid", +}; +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(duid_type, DUIDType); + +int config_parse_duid_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *type_string = NULL; + const char *p = ASSERT_PTR(rvalue); + bool force = ltype; + DUID *duid = ASSERT_PTR(data); + DUIDType type; + int r; + + assert(filename); + assert(lvalue); + + if (!force && duid->set) + return 0; + + r = extract_first_word(&p, &type_string, ":", 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to extract 
DUID type from '%s', ignoring.", rvalue); + return 0; + } + + type = duid_type_from_string(type_string); + if (type < 0) { + uint16_t t; + + r = safe_atou16(type_string, &t); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DUID type '%s', ignoring.", type_string); + return 0; + } + + type = t; + assert(type == t); /* Check if type can store uint16_t. */ + } + + if (!isempty(p)) { + usec_t u; + + if (type != DUID_TYPE_LLT) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = parse_timestamp(p, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse timestamp, ignoring: %s", p); + return 0; + } + + duid->llt_time = u; + } + + duid->type = type; + duid->set = force; + + return 0; +} + +int config_parse_manager_duid_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *manager = ASSERT_PTR(userdata); + int r; + + /* For backward compatibility. Setting both DHCPv4 and DHCPv6 DUID if they are not specified explicitly. 
*/ + + r = config_parse_duid_type(unit, filename, line, section, section_line, lvalue, false, rvalue, &manager->dhcp_duid, manager); + if (r < 0) + return r; + + return config_parse_duid_type(unit, filename, line, section, section_line, lvalue, false, rvalue, &manager->dhcp6_duid, manager); +} + +int config_parse_network_duid_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + int r; + + r = config_parse_duid_type(unit, filename, line, section, section_line, lvalue, true, rvalue, &network->dhcp_duid, network); + if (r < 0) + return r; + + /* For backward compatibility, also set DHCPv6 DUID if not specified explicitly. */ + return config_parse_duid_type(unit, filename, line, section, section_line, lvalue, false, rvalue, &network->dhcp6_duid, network); +} + +int config_parse_duid_rawdata( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint8_t raw_data[MAX_DUID_DATA_LEN]; + unsigned count = 0; + bool force = ltype; + DUID *duid = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (!force && duid->set) + return 0; + + /* RawData contains DUID in format "NN:NN:NN..." 
*/ + for (const char *p = rvalue;;) { + int n1, n2, len, r; + uint32_t byte; + _cleanup_free_ char *cbyte = NULL; + + r = extract_first_word(&p, &cbyte, ":", 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to read DUID, ignoring assignment: %s.", rvalue); + return 0; + } + if (r == 0) + break; + + if (count >= MAX_DUID_DATA_LEN) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Max DUID length exceeded, ignoring assignment: %s.", rvalue); + return 0; + } + + len = strlen(cbyte); + if (!IN_SET(len, 1, 2)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid length - DUID byte: %s, ignoring assignment: %s.", cbyte, rvalue); + return 0; + } + n1 = unhexchar(cbyte[0]); + if (len == 2) + n2 = unhexchar(cbyte[1]); + else + n2 = 0; + + if (n1 < 0 || n2 < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid DUID byte: %s. Ignoring assignment: %s.", cbyte, rvalue); + return 0; + } + + byte = ((uint8_t) n1 << (4 * (len-1))) | (uint8_t) n2; + raw_data[count++] = byte; + } + + assert_cc(sizeof(raw_data) == sizeof(duid->raw_data)); + memcpy(duid->raw_data, raw_data, count); + duid->raw_data_len = count; + duid->set = force; + + return 0; +} + +int config_parse_manager_duid_rawdata( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *manager = ASSERT_PTR(userdata); + int r; + + /* For backward compatibility. Setting both DHCPv4 and DHCPv6 DUID if they are not specified explicitly. 
*/ + + r = config_parse_duid_rawdata(unit, filename, line, section, section_line, lvalue, false, rvalue, &manager->dhcp_duid, manager); + if (r < 0) + return r; + + return config_parse_duid_rawdata(unit, filename, line, section, section_line, lvalue, false, rvalue, &manager->dhcp6_duid, manager); +} + +int config_parse_network_duid_rawdata( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + int r; + + r = config_parse_duid_rawdata(unit, filename, line, section, section_line, lvalue, true, rvalue, &network->dhcp_duid, network); + if (r < 0) + return r; + + /* For backward compatibility, also set DHCPv6 DUID if not specified explicitly. */ + return config_parse_duid_rawdata(unit, filename, line, section, section_line, lvalue, false, rvalue, &network->dhcp6_duid, network); +} + +int config_parse_uplink( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + bool accept_none = true; + int *index, r; + char **name; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + if (streq(section, "DHCPServer")) { + index = &network->dhcp_server_uplink_index; + name = &network->dhcp_server_uplink_name; + } else if (streq(section, "IPv6SendRA")) { + index = &network->router_uplink_index; + name = &network->router_uplink_name; + } else if (STR_IN_SET(section, "DHCPv6PrefixDelegation", "DHCPPrefixDelegation")) { + index = &network->dhcp_pd_uplink_index; + name = &network->dhcp_pd_uplink_name; + accept_none = false; + } else + assert_not_reached(); + + if (isempty(rvalue) || streq(rvalue, ":auto")) { + *index = UPLINK_INDEX_AUTO; + *name = mfree(*name); + return 0; + } + + if 
(accept_none && streq(rvalue, ":none")) { + *index = UPLINK_INDEX_NONE; + *name = mfree(*name); + return 0; + } + + if (!accept_none && streq(rvalue, ":self")) { + *index = UPLINK_INDEX_SELF; + *name = mfree(*name); + return 0; + } + + r = parse_ifindex(rvalue); + if (r > 0) { + *index = r; + *name = mfree(*name); + return 0; + } + + if (!ifname_valid_full(rvalue, IFNAME_VALID_ALTERNATIVE)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid interface name in %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + /* The interface name will be resolved later. */ + r = free_and_strdup_warn(name, rvalue); + if (r < 0) + return r; + + /* Note, if uplink_name is set, then uplink_index will be ignored. So, the below does not mean + * an uplink interface will be selected automatically. */ + *index = UPLINK_INDEX_AUTO; + return 0; +} diff --git a/src/network/networkd-dhcp-common.h b/src/network/networkd-dhcp-common.h new file mode 100644 index 0000000..6e3f3b2 --- /dev/null +++ b/src/network/networkd-dhcp-common.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "dhcp-identifier.h" +#include "in-addr-util.h" +#include "set.h" +#include "time-util.h" + +/* Special values for *_uplink_index. 
*/ +#define UPLINK_INDEX_AUTO 0 /* uplink will be selected automatically */ +#define UPLINK_INDEX_NONE -1 /* uplink will not be selected automatically */ +#define UPLINK_INDEX_SELF -2 /* the interface itself is uplink */ + +#define DHCP_ROUTE_METRIC 1024 +#define IPV6RA_ROUTE_METRIC_HIGH 512 +#define IPV6RA_ROUTE_METRIC_MEDIUM 1024 +#define IPV6RA_ROUTE_METRIC_LOW 2048 +#define DHCP6PD_ROUTE_METRIC 256 + +typedef struct Link Link; +typedef struct Manager Manager; +typedef struct Network Network; + +typedef enum DHCPUseDomains { + DHCP_USE_DOMAINS_NO, + DHCP_USE_DOMAINS_YES, + DHCP_USE_DOMAINS_ROUTE, + _DHCP_USE_DOMAINS_MAX, + _DHCP_USE_DOMAINS_INVALID = -EINVAL, +} DHCPUseDomains; + +typedef enum DHCPOptionDataType { + DHCP_OPTION_DATA_UINT8, + DHCP_OPTION_DATA_UINT16, + DHCP_OPTION_DATA_UINT32, + DHCP_OPTION_DATA_STRING, + DHCP_OPTION_DATA_IPV4ADDRESS, + DHCP_OPTION_DATA_IPV6ADDRESS, + _DHCP_OPTION_DATA_MAX, + _DHCP_OPTION_DATA_INVALID = -EINVAL, /* negative, like _DHCP_USE_DOMAINS_INVALID, so that "type < 0" checks can detect it */ +} DHCPOptionDataType; + +typedef struct DUID { + /* Value of Type in [DHCP] section */ + DUIDType type; + + uint8_t raw_data_len; + uint8_t raw_data[MAX_DUID_DATA_LEN]; + usec_t llt_time; + bool set; +} DUID; + +uint32_t link_get_dhcp4_route_table(Link *link); +uint32_t link_get_ipv6_accept_ra_route_table(Link *link); + +bool link_dhcp_enabled(Link *link, int family); +static inline bool link_dhcp4_enabled(Link *link) { + return link_dhcp_enabled(link, AF_INET); +} +static inline bool link_dhcp6_enabled(Link *link) { + return link_dhcp_enabled(link, AF_INET6); +} + +void network_adjust_dhcp(Network *network); + +const DUID *link_get_duid(Link *link, int family); +static inline const DUID *link_get_dhcp4_duid(Link *link) { + return link_get_duid(link, AF_INET); +} +static inline const DUID *link_get_dhcp6_duid(Link *link) { + return link_get_duid(link, AF_INET6); +} + +int dhcp_configure_duid(Link *link, const DUID *duid); +int manager_request_product_uuid(Manager *m); + +bool address_is_filtered(int family, const union 
in_addr_union *address, uint8_t prefixlen, Set *allow_list, Set *deny_list); +static inline bool in4_address_is_filtered(const struct in_addr *address, Set *allow_list, Set *deny_list) { + return address_is_filtered(AF_INET, &(union in_addr_union) { .in = *address }, 32, allow_list, deny_list); +} +static inline bool in6_prefix_is_filtered(const struct in6_addr *prefix, uint8_t prefixlen, Set *allow_list, Set *deny_list) { + return address_is_filtered(AF_INET6, &(union in_addr_union) { .in6 = *prefix }, prefixlen, allow_list, deny_list); +} + +int link_get_captive_portal(Link *link, const char **ret); + +const char* dhcp_use_domains_to_string(DHCPUseDomains p) _const_; +DHCPUseDomains dhcp_use_domains_from_string(const char *s) _pure_; + +const char *dhcp_option_data_type_to_string(DHCPOptionDataType d) _const_; +DHCPOptionDataType dhcp_option_data_type_from_string(const char *d) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_route_metric); +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_accept_ra_route_metric); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_send_hostname); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_use_dns); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_use_domains); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_use_ntp); +CONFIG_PARSER_PROTOTYPE(config_parse_iaid); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_or_ra_route_table); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_user_or_vendor_class); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_send_option); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_request_options); +CONFIG_PARSER_PROTOTYPE(config_parse_duid_type); +CONFIG_PARSER_PROTOTYPE(config_parse_manager_duid_type); +CONFIG_PARSER_PROTOTYPE(config_parse_network_duid_type); +CONFIG_PARSER_PROTOTYPE(config_parse_duid_rawdata); +CONFIG_PARSER_PROTOTYPE(config_parse_manager_duid_rawdata); +CONFIG_PARSER_PROTOTYPE(config_parse_network_duid_rawdata); +CONFIG_PARSER_PROTOTYPE(config_parse_uplink); diff --git 
a/src/network/networkd-dhcp-prefix-delegation.c b/src/network/networkd-dhcp-prefix-delegation.c new file mode 100644 index 0000000..af2fe9e --- /dev/null +++ b/src/network/networkd-dhcp-prefix-delegation.c @@ -0,0 +1,1257 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "dhcp6-lease-internal.h" +#include "hashmap.h" +#include "in-addr-prefix-util.h" +#include "networkd-address-generation.h" +#include "networkd-address.h" +#include "networkd-dhcp-prefix-delegation.h" +#include "networkd-dhcp6.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "networkd-radv.h" +#include "networkd-route.h" +#include "networkd-setlink.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" +#include "tunnel.h" + +bool link_dhcp_pd_is_enabled(Link *link) { + assert(link); + + if (!link->network) + return false; + + return link->network->dhcp_pd; +} + +bool dhcp_pd_is_uplink(Link *link, Link *target, bool accept_auto) { + assert(link); + assert(target); + + if (!link_dhcp_pd_is_enabled(link)) + return false; + + if (link->network->dhcp_pd_uplink_name) + return streq_ptr(target->ifname, link->network->dhcp_pd_uplink_name) || + strv_contains(target->alternative_names, link->network->dhcp_pd_uplink_name); + + if (link->network->dhcp_pd_uplink_index > 0) + return target->ifindex == link->network->dhcp_pd_uplink_index; + + if (link->network->dhcp_pd_uplink_index == UPLINK_INDEX_SELF) + return link == target; + + assert(link->network->dhcp_pd_uplink_index == UPLINK_INDEX_AUTO); + return accept_auto; +} + +static void link_remove_dhcp_pd_subnet_prefix(Link *link, const struct in6_addr *prefix) { + void *key; + + assert(link); + assert(link->manager); + assert(prefix); + + if (hashmap_get(link->manager->links_by_dhcp_pd_subnet_prefix, prefix) != link) + return; + + hashmap_remove2(link->manager->links_by_dhcp_pd_subnet_prefix, prefix, &key); + free(key); +} + +static int 
link_add_dhcp_pd_subnet_prefix(Link *link, const struct in6_addr *prefix) { + _cleanup_free_ struct in6_addr *copy = NULL; + int r; + + assert(link); + assert(prefix); + + copy = newdup(struct in6_addr, prefix, 1); + if (!copy) + return -ENOMEM; + + r = hashmap_ensure_put(&link->manager->links_by_dhcp_pd_subnet_prefix, &in6_addr_hash_ops_free, copy, link); + if (r < 0) + return r; + if (r > 0) + TAKE_PTR(copy); + + return 0; +} + +static int link_get_by_dhcp_pd_subnet_prefix(Manager *manager, const struct in6_addr *prefix, Link **ret) { + Link *link; + + assert(manager); + assert(prefix); + + link = hashmap_get(manager->links_by_dhcp_pd_subnet_prefix, prefix); + if (!link) + return -ENODEV; + + if (ret) + *ret = link; + return 0; +} + +static int dhcp_pd_get_assigned_subnet_prefix(Link *link, const struct in6_addr *pd_prefix, uint8_t pd_prefix_len, struct in6_addr *ret) { + assert(link); + assert(pd_prefix); + + if (!link_dhcp_pd_is_enabled(link)) + return -ENOENT; + + if (link->network->dhcp_pd_assign) { + Address *address; + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_DHCP_PD) + continue; + assert(address->family == AF_INET6); + + if (in6_addr_prefix_covers(pd_prefix, pd_prefix_len, &address->in_addr.in6) <= 0) + continue; + + if (ret) { + struct in6_addr prefix = address->in_addr.in6; + + in6_addr_mask(&prefix, 64); + *ret = prefix; + } + return 0; + } + } else { + Route *route; + + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_DHCP_PD) + continue; + assert(route->family == AF_INET6); + + if (in6_addr_prefix_covers(pd_prefix, pd_prefix_len, &route->dst.in6) > 0) { + if (ret) + *ret = route->dst.in6; + return 0; + } + } + } + + return -ENOENT; +} + +int dhcp_pd_remove(Link *link, bool only_marked) { + int k, r = 0; + + assert(link); + assert(link->manager); + + if (!link_dhcp_pd_is_enabled(link)) + return 0; + + if (!only_marked) + link->dhcp_pd_configured = false; + + if 
(!link->network->dhcp_pd_assign) { + Route *route; + + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_DHCP_PD) + continue; + if (only_marked && !route_is_marked(route)) + continue; + + if (link->radv) + sd_radv_remove_prefix(link->radv, &route->dst.in6, 64); + + link_remove_dhcp_pd_subnet_prefix(link, &route->dst.in6); + + k = route_remove(route); + if (k < 0) + r = k; + + route_cancel_request(route, link); + } + } else { + Address *address; + + SET_FOREACH(address, link->addresses) { + struct in6_addr prefix; + + if (address->source != NETWORK_CONFIG_SOURCE_DHCP_PD) + continue; + if (only_marked && !address_is_marked(address)) + continue; + + prefix = address->in_addr.in6; + in6_addr_mask(&prefix, 64); + + if (link->radv) + sd_radv_remove_prefix(link->radv, &prefix, 64); + + link_remove_dhcp_pd_subnet_prefix(link, &prefix); + + k = address_remove_and_drop(address); + if (k < 0) + r = k; + } + } + + return r; +} + +static int dhcp_pd_check_ready(Link *link); + +static int dhcp_pd_address_ready_callback(Address *address) { + Address *a; + + assert(address); + assert(address->link); + + SET_FOREACH(a, address->link->addresses) + if (a->source == NETWORK_CONFIG_SOURCE_DHCP_PD) + a->callback = NULL; + + return dhcp_pd_check_ready(address->link); +} + +static int dhcp_pd_check_ready(Link *link) { + int r; + + assert(link); + assert(link->network); + + if (link->dhcp_pd_messages > 0) { + log_link_debug(link, "%s(): DHCP-PD addresses and routes are not set.", __func__); + return 0; + } + + if (link->network->dhcp_pd_assign) { + bool has_ready = false; + Address *address; + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_DHCP_PD) + continue; + if (address_is_ready(address)) { + has_ready = true; + break; + } + } + + if (!has_ready) { + SET_FOREACH(address, link->addresses) + if (address->source == NETWORK_CONFIG_SOURCE_DHCP_PD) + address->callback = dhcp_pd_address_ready_callback; + + 
log_link_debug(link, "%s(): no DHCP-PD address is ready.", __func__); + return 0; + } + } + + link->dhcp_pd_configured = true; + + log_link_debug(link, "DHCP-PD addresses and routes set."); + + r = dhcp_pd_remove(link, /* only_marked = */ true); + if (r < 0) + return r; + + link_check_ready(link); + return 1; +} + +static int dhcp_pd_route_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Route *route) { + int r; + + assert(link); + + r = route_configure_handler_internal(rtnl, m, link, "Failed to add prefix route for DHCP delegated subnet prefix"); + if (r <= 0) + return r; + + r = dhcp_pd_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static int dhcp_pd_request_route(Link *link, const struct in6_addr *prefix, usec_t lifetime_usec) { + _cleanup_(route_freep) Route *route = NULL; + Route *existing; + int r; + + assert(link); + assert(link->network); + assert(prefix); + + if (link->network->dhcp_pd_assign) + return 0; + + r = route_new(&route); + if (r < 0) + return r; + + route->source = NETWORK_CONFIG_SOURCE_DHCP_PD; + route->family = AF_INET6; + route->dst.in6 = *prefix; + route->dst_prefixlen = 64; + route->protocol = RTPROT_DHCP; + route->priority = link->network->dhcp_pd_route_metric; + route->lifetime_usec = lifetime_usec; + + if (route_get(NULL, link, route, &existing) < 0) + link->dhcp_pd_configured = false; + else + route_unmark(existing); + + r = link_request_route(link, TAKE_PTR(route), true, &link->dhcp_pd_messages, + dhcp_pd_route_handler, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request DHCP-PD prefix route: %m"); + + return 0; +} + +static int dhcp_pd_address_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Address *address) { + int r; + + assert(link); + + r = address_configure_handler_internal(rtnl, m, link, "Could not set DHCP-PD address"); + if (r <= 0) + return r; + + r = dhcp_pd_check_ready(link); + if (r < 0) + 
link_enter_failed(link); + + return 1; +} + +/* Logs a DHCP-PD address and its lifetimes: at LOG_DEBUG when address_get_harder() finds the address, at LOG_INFO otherwise. */ +static void log_dhcp_pd_address(Link *link, const Address *address) { + assert(address); + assert(address->family == AF_INET6); + + int log_level = address_get_harder(link, address, NULL) >= 0 ? LOG_DEBUG : LOG_INFO; + + /* Return early only when the message would be filtered out, i.e. when its level is numerically above the maximum log level. */ + if (log_level > log_get_max_level()) + return; + + log_link_full(link, log_level, "DHCP-PD address %s (valid %s, preferred %s)", + IN6_ADDR_PREFIX_TO_STRING(&address->in_addr.in6, address->prefixlen), + FORMAT_LIFETIME(address->lifetime_valid_usec), + FORMAT_LIFETIME(address->lifetime_preferred_usec)); +} + +static int dhcp_pd_request_address( + Link *link, + const struct in6_addr *prefix, + usec_t lifetime_preferred_usec, + usec_t lifetime_valid_usec) { + + _cleanup_set_free_ Set *addresses = NULL; + struct in6_addr *a; + int r; + + assert(link); + assert(link->network); + assert(prefix); + + if (!link->network->dhcp_pd_assign) + return 0; + + r = dhcp_pd_generate_addresses(link, prefix, &addresses); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to generate addresses for acquired DHCP delegated prefix: %m"); + + SET_FOREACH(a, addresses) { + _cleanup_(address_freep) Address *address = NULL; + Address *existing; + + r = address_new(&address); + if (r < 0) + return log_link_error_errno(link, r, "Failed to allocate address for DHCP delegated prefix: %m"); + + address->source = NETWORK_CONFIG_SOURCE_DHCP_PD; + address->family = AF_INET6; + address->in_addr.in6 = *a; + address->prefixlen = 64; + address->lifetime_preferred_usec = lifetime_preferred_usec; + address->lifetime_valid_usec = lifetime_valid_usec; + SET_FLAG(address->flags, IFA_F_MANAGETEMPADDR, link->network->dhcp_pd_manage_temporary_address); + address->route_metric = link->network->dhcp_pd_route_metric; + + log_dhcp_pd_address(link, address); + + r = free_and_strdup_warn(&address->netlabel, link->network->dhcp_pd_netlabel); + if (r < 0) + return r; + + if (address_get(link, address, &existing) < 0) + link->dhcp_pd_configured = 
false; + else + address_unmark(existing); + + r = link_request_address(link, address, &link->dhcp_pd_messages, + dhcp_pd_address_handler, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request DHCP delegated prefix address: %m"); + } + + return 0; +} + +static int dhcp_pd_calculate_subnet_prefix( + const struct in6_addr *pd_prefix, + uint8_t pd_prefix_len, + uint64_t subnet_id, + struct in6_addr *ret) { + + struct in6_addr prefix; + + assert(pd_prefix); + assert(pd_prefix_len <= 64); + assert(ret); + + if (subnet_id >= UINT64_C(1) << (64 - pd_prefix_len)) + return -ERANGE; + + prefix = *pd_prefix; + + if (pd_prefix_len < 32) + prefix.s6_addr32[0] |= htobe32(subnet_id >> 32); + + prefix.s6_addr32[1] |= htobe32(subnet_id & 0xffffffff); + + *ret = prefix; + return 0; +} + +static int dhcp_pd_get_preferred_subnet_prefix( + Link *link, + const struct in6_addr *pd_prefix, + uint8_t pd_prefix_len, + struct in6_addr *ret) { + + struct in6_addr prefix; + Link *assigned_link; + int r; + + assert(link); + assert(link->manager); + assert(link->network); + assert(pd_prefix); + + if (link->network->dhcp_pd_subnet_id >= 0) { + /* If the link has a preference for a particular subnet id try to allocate that */ + + r = dhcp_pd_calculate_subnet_prefix(pd_prefix, pd_prefix_len, link->network->dhcp_pd_subnet_id, &prefix); + if (r < 0) + return log_link_warning_errno(link, r, + "subnet id %" PRIi64 " is out of range. Only have %" PRIu64 " subnets.", + link->network->dhcp_pd_subnet_id, UINT64_C(1) << (64 - pd_prefix_len)); + + *ret = prefix; + return 0; + } + + if (dhcp_pd_get_assigned_subnet_prefix(link, pd_prefix, pd_prefix_len, ret) >= 0) + return 0; + + for (uint64_t n = 0; ; n++) { + /* If we do not have an allocation preference just iterate + * through the address space and return the first free prefix. 
*/ + + r = dhcp_pd_calculate_subnet_prefix(pd_prefix, pd_prefix_len, n, &prefix); + if (r < 0) + return log_link_warning_errno(link, r, + "Couldn't find a suitable prefix. Ran out of address space."); + + /* Do not use explicitly requested subnet IDs. Note that the corresponding link may not + * appear yet. So, we need to check the ID is not used in any .network files. */ + if (set_contains(link->manager->dhcp_pd_subnet_ids, &n)) + continue; + + /* Check that the prefix is not assigned to another link. */ + if (link_get_by_dhcp_pd_subnet_prefix(link->manager, &prefix, &assigned_link) < 0 || + assigned_link == link) + break; + } + + r = link_add_dhcp_pd_subnet_prefix(link, &prefix); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to save acquired free subnet prefix: %m"); + + *ret = prefix; + return 0; +} + +static int dhcp_pd_assign_subnet_prefix( + Link *link, + const struct in6_addr *pd_prefix, + uint8_t pd_prefix_len, + usec_t lifetime_preferred_usec, + usec_t lifetime_valid_usec, + bool is_uplink) { + + struct in6_addr prefix; + int r; + + assert(link); + assert(link->network); + assert(pd_prefix); + + r = dhcp_pd_get_preferred_subnet_prefix(link, pd_prefix, pd_prefix_len, &prefix); + if (r < 0) + return r == -ERANGE ? 
0 : r; + + const char *pretty = IN6_ADDR_PREFIX_TO_STRING(&prefix, 64); + + if (link_radv_enabled(link) && link->network->dhcp_pd_announce) { + if (is_uplink) + log_link_debug(link, "Ignoring Announce= setting on upstream interface."); + else { + r = radv_add_prefix(link, &prefix, 64, lifetime_preferred_usec, lifetime_valid_usec); + if (r < 0) + return log_link_warning_errno(link, r, + "Failed to assign/update prefix %s to IPv6 Router Advertisement: %m", + pretty); + } + } + + r = dhcp_pd_request_route(link, &prefix, lifetime_valid_usec); + if (r < 0) + return log_link_warning_errno(link, r, + "Failed to assign/update route for prefix %s: %m", pretty); + + r = dhcp_pd_request_address(link, &prefix, lifetime_preferred_usec, lifetime_valid_usec); + if (r < 0) + return log_link_warning_errno(link, r, + "Failed to assign/update address for prefix %s: %m", pretty); + + log_link_debug(link, "Assigned prefix %s", pretty); + return 1; +} + +static int dhcp_pd_prepare(Link *link) { + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return 0; + + if (!link_dhcp_pd_is_enabled(link)) + return 0; + + if (link_radv_enabled(link) && link->network->dhcp_pd_announce && !link->radv) + return 0; + + link_mark_addresses(link, NETWORK_CONFIG_SOURCE_DHCP_PD); + link_mark_routes(link, NETWORK_CONFIG_SOURCE_DHCP_PD); + + return 1; +} + +static int dhcp_pd_finalize(Link *link) { + int r; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return 0; + + if (link->dhcp_pd_messages == 0) { + link->dhcp_pd_configured = false; + + r = dhcp_pd_remove(link, /* only_marked = */ true); + if (r < 0) + return r; + } + + if (!link->dhcp_pd_configured) + link_set_state(link, LINK_STATE_CONFIGURING); + + link_check_ready(link); + return 0; +} + +void dhcp_pd_prefix_lost(Link *uplink) { + Route *route; + Link *link; + int r; + + assert(uplink); + assert(uplink->manager); + + HASHMAP_FOREACH(link, uplink->manager->links_by_index) { + if 
(!dhcp_pd_is_uplink(link, uplink, /* accept_auto = */ true)) + continue; + + r = dhcp_pd_remove(link, /* only_marked = */ false); + if (r < 0) + link_enter_failed(link); + } + + SET_FOREACH(route, uplink->manager->routes) { + if (!IN_SET(route->source, NETWORK_CONFIG_SOURCE_DHCP4, NETWORK_CONFIG_SOURCE_DHCP6)) + continue; + if (route->family != AF_INET6) + continue; + if (route->type != RTN_UNREACHABLE) + continue; + if (!set_contains(uplink->dhcp_pd_prefixes, + &(struct in_addr_prefix) { + .family = AF_INET6, + .prefixlen = route->dst_prefixlen, + .address = route->dst })) + continue; + + (void) route_remove(route); + + route_cancel_request(route, uplink); + } + + set_clear(uplink->dhcp_pd_prefixes); +} + +void dhcp4_pd_prefix_lost(Link *uplink) { + Link *tunnel; + + dhcp_pd_prefix_lost(uplink); + + if (uplink->dhcp4_6rd_tunnel_name && + link_get_by_name(uplink->manager, uplink->dhcp4_6rd_tunnel_name, &tunnel) >= 0) + (void) link_remove(tunnel); +} + +static int dhcp4_unreachable_route_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Route *route) { + int r; + + assert(link); + + r = route_configure_handler_internal(rtnl, m, link, "Failed to set unreachable route for DHCPv4 delegated prefix"); + if (r <= 0) + return r; + + r = dhcp4_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static int dhcp6_unreachable_route_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Route *route) { + int r; + + assert(link); + + r = route_configure_handler_internal(rtnl, m, link, "Failed to set unreachable route for DHCPv6 delegated prefix"); + if (r <= 0) + return r; + + r = dhcp6_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static int dhcp_request_unreachable_route( + Link *link, + const struct in6_addr *addr, + uint8_t prefixlen, + usec_t lifetime_usec, + NetworkConfigSource source, + const union in_addr_union *server_address, + unsigned *counter, + 
route_netlink_handler_t callback, + bool *configured) { + + _cleanup_(route_freep) Route *route = NULL; + Route *existing; + int r; + + assert(link); + assert(addr); + assert(IN_SET(source, NETWORK_CONFIG_SOURCE_DHCP4, NETWORK_CONFIG_SOURCE_DHCP6)); + assert(server_address); + assert(counter); + assert(callback); + assert(configured); + + if (prefixlen >= 64) { + log_link_debug(link, "Not adding a blocking route for DHCP delegated prefix %s since the prefix has length >= 64.", + IN6_ADDR_PREFIX_TO_STRING(addr, prefixlen)); + return 0; + } + + r = route_new(&route); + if (r < 0) + return log_oom(); + + route->source = source; + route->provider = *server_address; + route->family = AF_INET6; + route->dst.in6 = *addr; + route->dst_prefixlen = prefixlen; + route->type = RTN_UNREACHABLE; + route->protocol = RTPROT_DHCP; + route->priority = IP6_RT_PRIO_USER; + route->lifetime_usec = lifetime_usec; + + if (route_get(link->manager, NULL, route, &existing) < 0) + *configured = false; + else + route_unmark(existing); + + r = link_request_route(link, TAKE_PTR(route), true, counter, callback, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request unreachable route for DHCP delegated prefix %s: %m", + IN6_ADDR_PREFIX_TO_STRING(addr, prefixlen)); + + return 0; +} + +static int dhcp4_request_unreachable_route( + Link *link, + const struct in6_addr *addr, + uint8_t prefixlen, + usec_t lifetime_usec, + const union in_addr_union *server_address) { + + return dhcp_request_unreachable_route(link, addr, prefixlen, lifetime_usec, + NETWORK_CONFIG_SOURCE_DHCP4, server_address, + &link->dhcp4_messages, dhcp4_unreachable_route_handler, + &link->dhcp4_configured); +} + +static int dhcp6_request_unreachable_route( + Link *link, + const struct in6_addr *addr, + uint8_t prefixlen, + usec_t lifetime_usec, + const union in_addr_union *server_address) { + + return dhcp_request_unreachable_route(link, addr, prefixlen, lifetime_usec, + NETWORK_CONFIG_SOURCE_DHCP6, 
server_address, + &link->dhcp6_messages, dhcp6_unreachable_route_handler, + &link->dhcp6_configured); +} + +static int dhcp_pd_prefix_add(Link *link, const struct in6_addr *prefix, uint8_t prefixlen) { + struct in_addr_prefix *p; + int r; + + assert(link); + assert(prefix); + + p = new(struct in_addr_prefix, 1); + if (!p) + return log_oom(); + + *p = (struct in_addr_prefix) { + .family = AF_INET6, + .prefixlen = prefixlen, + .address.in6 = *prefix, + }; + + int log_level = set_contains(link->dhcp_pd_prefixes, p) ? LOG_DEBUG : + prefixlen > 64 || prefixlen < 48 ? LOG_WARNING : LOG_INFO; + log_link_full(link, + log_level, + "DHCP: received delegated prefix %s%s", + IN6_ADDR_PREFIX_TO_STRING(prefix, prefixlen), + prefixlen > 64 ? " with prefix length > 64, ignoring." : + prefixlen < 48 ? " with prefix length < 48, looks unusual.": ""); + + /* Store PD prefix even if prefixlen > 64, not to make logged at warning level so frequently. */ + r = set_ensure_consume(&link->dhcp_pd_prefixes, &in_addr_prefix_hash_ops_free, p); + if (r < 0) + return log_link_error_errno(link, r, "Failed to store DHCP delegated prefix %s: %m", + IN6_ADDR_PREFIX_TO_STRING(prefix, prefixlen)); + return 0; +} + +static int dhcp4_pd_request_default_gateway_on_6rd_tunnel(Link *link, const struct in_addr *br_address, usec_t lifetime_usec) { + _cleanup_(route_freep) Route *route = NULL; + Route *existing; + int r; + + assert(link); + assert(br_address); + + r = route_new(&route); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to allocate default gateway for DHCP delegated prefix: %m"); + + route->source = NETWORK_CONFIG_SOURCE_DHCP_PD; + route->family = AF_INET6; + route->gw_family = AF_INET6; + route->gw.in6.s6_addr32[3] = br_address->s_addr; + route->scope = RT_SCOPE_UNIVERSE; + route->protocol = RTPROT_DHCP; + route->priority = IP6_RT_PRIO_USER; + route->lifetime_usec = lifetime_usec; + + if (route_get(NULL, link, route, &existing) < 0) /* This is a new route. 
*/ + link->dhcp_pd_configured = false; + else + route_unmark(existing); + + r = link_request_route(link, TAKE_PTR(route), true, &link->dhcp_pd_messages, + dhcp_pd_route_handler, NULL); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to request default gateway for DHCP delegated prefix: %m"); + + return 0; +} + +static void dhcp4_calculate_pd_prefix( + const struct in_addr *ipv4address, + uint8_t ipv4masklen, + const struct in6_addr *sixrd_prefix, + uint8_t sixrd_prefixlen, + struct in6_addr *ret_pd_prefix, + uint8_t *ret_pd_prefixlen) { + + struct in6_addr pd_prefix; + + assert(ipv4address); + assert(ipv4masklen <= 32); + assert(sixrd_prefix); + assert(32 - ipv4masklen + sixrd_prefixlen <= 128); + assert(ret_pd_prefix); + + pd_prefix = *sixrd_prefix; + for (unsigned i = 0; i < (unsigned) (32 - ipv4masklen); i++) + if (ipv4address->s_addr & htobe32(UINT32_C(1) << (32 - ipv4masklen - i - 1))) + pd_prefix.s6_addr[(i + sixrd_prefixlen) / 8] |= 1 << (7 - (i + sixrd_prefixlen) % 8); + + *ret_pd_prefix = pd_prefix; + if (ret_pd_prefixlen) + *ret_pd_prefixlen = 32 - ipv4masklen + sixrd_prefixlen; +} + +static int dhcp4_pd_assign_subnet_prefix(Link *link, Link *uplink) { + uint8_t ipv4masklen, sixrd_prefixlen, pd_prefixlen; + struct in6_addr sixrd_prefix, pd_prefix; + const struct in_addr *br_addresses; + struct in_addr ipv4address; + usec_t lifetime_usec; + int r; + + assert(link); + assert(uplink); + assert(uplink->manager); + assert(uplink->dhcp_lease); + + r = sd_dhcp_lease_get_address(uplink->dhcp_lease, &ipv4address); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get DHCPv4 address: %m"); + + r = sd_dhcp_lease_get_lifetime_timestamp(uplink->dhcp_lease, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get lifetime of DHCPv4 lease: %m"); + + r = sd_dhcp_lease_get_6rd(uplink->dhcp_lease, &ipv4masklen, &sixrd_prefixlen, &sixrd_prefix, &br_addresses, NULL); + if (r < 0) + return 
log_link_warning_errno(uplink, r, "Failed to get DHCPv4 6rd option: %m"); + + dhcp4_calculate_pd_prefix(&ipv4address, ipv4masklen, &sixrd_prefix, sixrd_prefixlen, &pd_prefix, &pd_prefixlen); + + if (pd_prefixlen > 64) + return 0; + + r = dhcp_pd_prepare(link); + if (r <= 0) + return r; + + if (streq_ptr(uplink->dhcp4_6rd_tunnel_name, link->ifname)) { + r = dhcp4_pd_request_default_gateway_on_6rd_tunnel(link, &br_addresses[0], lifetime_usec); + if (r < 0) + return r; + } + + r = dhcp_pd_assign_subnet_prefix(link, &pd_prefix, pd_prefixlen, lifetime_usec, lifetime_usec, /* is_uplink = */ false); + if (r < 0) + return r; + + return dhcp_pd_finalize(link); +} + +static int dhcp4_pd_6rd_tunnel_create_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + assert(link); + assert(link->manager); + assert(link->dhcp4_6rd_tunnel_name); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_link_message_warning_errno(link, m, r, "Failed to create tunnel device for DHCPv4 6rd"); + link_enter_failed(link); + return 0; + } + + return 0; +} + +int dhcp4_pd_prefix_acquired(Link *uplink) { + _cleanup_free_ char *tunnel_name = NULL; + uint8_t ipv4masklen, sixrd_prefixlen, pd_prefixlen; + struct in6_addr sixrd_prefix, pd_prefix; + struct in_addr ipv4address; + union in_addr_union server_address; + const struct in_addr *br_addresses; + usec_t lifetime_usec; + Link *link; + int r; + + assert(uplink); + assert(uplink->manager); + assert(uplink->dhcp_lease); + + r = sd_dhcp_lease_get_address(uplink->dhcp_lease, &ipv4address); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get DHCPv4 address: %m"); + + r = sd_dhcp_lease_get_lifetime_timestamp(uplink->dhcp_lease, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get lifetime of DHCPv4 lease: %m"); + + r = 
sd_dhcp_lease_get_server_identifier(uplink->dhcp_lease, &server_address.in); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get server address of DHCPv4 lease: %m"); + + r = sd_dhcp_lease_get_6rd(uplink->dhcp_lease, &ipv4masklen, &sixrd_prefixlen, &sixrd_prefix, &br_addresses, NULL); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get DHCPv4 6rd option: %m"); + + if (DEBUG_LOGGING) + log_link_debug(uplink, "DHCPv4: 6rd option is acquired: IPv4_masklen=%u, 6rd_prefix=%s, br_address="IPV4_ADDRESS_FMT_STR, + ipv4masklen, + IN6_ADDR_PREFIX_TO_STRING(&sixrd_prefix, sixrd_prefixlen), + IPV4_ADDRESS_FMT_VAL(*br_addresses)); + + /* Calculate PD prefix */ + dhcp4_calculate_pd_prefix(&ipv4address, ipv4masklen, &sixrd_prefix, sixrd_prefixlen, &pd_prefix, &pd_prefixlen); + + /* Register and log PD prefix */ + r = dhcp_pd_prefix_add(uplink, &pd_prefix, pd_prefixlen); + if (r < 0) + return r; + + /* Request unreachable route */ + r = dhcp4_request_unreachable_route(uplink, &pd_prefix, pd_prefixlen, lifetime_usec, &server_address); + if (r < 0) + return r; + + /* Generate 6rd SIT tunnel device name. */ + r = dhcp4_pd_create_6rd_tunnel_name(uplink, &tunnel_name); + if (r < 0) + return r; + + /* Remove old tunnel device if exists. */ + if (!streq_ptr(uplink->dhcp4_6rd_tunnel_name, tunnel_name)) { + Link *old_tunnel; + + if (uplink->dhcp4_6rd_tunnel_name && + link_get_by_name(uplink->manager, uplink->dhcp4_6rd_tunnel_name, &old_tunnel) >= 0) + (void) link_remove(old_tunnel); + + free_and_replace(uplink->dhcp4_6rd_tunnel_name, tunnel_name); + } + + /* Create 6rd SIT tunnel device if it does not exist yet. */ + if (link_get_by_name(uplink->manager, uplink->dhcp4_6rd_tunnel_name, NULL) < 0) { + r = dhcp4_pd_create_6rd_tunnel(uplink, dhcp4_pd_6rd_tunnel_create_handler); + if (r < 0) + return r; + } + + /* Then, assign subnet prefixes to downstream interfaces. 
*/ + HASHMAP_FOREACH(link, uplink->manager->links_by_index) { + if (!dhcp_pd_is_uplink(link, uplink, /* accept_auto = */ true)) + continue; + + r = dhcp4_pd_assign_subnet_prefix(link, uplink); + if (r < 0) { + /* When failed on the upstream interface (i.e., the case link == uplink), + * immediately abort the assignment of the prefixes. As, the all assigned + * prefixes will be dropped soon in link_enter_failed(), and it is meaningless + * to continue the assignment. */ + if (link == uplink) + return r; + + link_enter_failed(link); + } + } + + return 0; +} + +static int dhcp6_pd_assign_subnet_prefixes(Link *link, Link *uplink) { + int r; + + assert(link); + assert(uplink); + assert(uplink->dhcp6_lease); + + r = dhcp_pd_prepare(link); + if (r <= 0) + return r; + + FOREACH_DHCP6_PD_PREFIX(uplink->dhcp6_lease) { + usec_t lifetime_preferred_usec, lifetime_valid_usec; + struct in6_addr pd_prefix; + uint8_t pd_prefix_len; + + r = sd_dhcp6_lease_get_pd_prefix(uplink->dhcp6_lease, &pd_prefix, &pd_prefix_len); + if (r < 0) + return r; + + if (pd_prefix_len > 64) + continue; + + /* Mask prefix for safety. */ + r = in6_addr_mask(&pd_prefix, pd_prefix_len); + if (r < 0) + return r; + + r = sd_dhcp6_lease_get_pd_lifetime_timestamp(uplink->dhcp6_lease, CLOCK_BOOTTIME, + &lifetime_preferred_usec, &lifetime_valid_usec); + if (r < 0) + return r; + + r = dhcp_pd_assign_subnet_prefix(link, &pd_prefix, pd_prefix_len, + lifetime_preferred_usec, lifetime_valid_usec, + /* is_uplink = */ link == uplink); + if (r < 0) + return r; + } + + return dhcp_pd_finalize(link); +} + +int dhcp6_pd_prefix_acquired(Link *uplink) { + union in_addr_union server_address; + Link *link; + int r; + + assert(uplink); + assert(uplink->dhcp6_lease); + + r = sd_dhcp6_lease_get_server_address(uplink->dhcp6_lease, &server_address.in6); + if (r < 0) + return log_link_warning_errno(uplink, r, "Failed to get server address of DHCPv6 lease: %m"); + + /* First, logs acquired prefixes and request unreachable routes. 
*/ + FOREACH_DHCP6_PD_PREFIX(uplink->dhcp6_lease) { + usec_t lifetime_valid_usec; + struct in6_addr pd_prefix; + uint8_t pd_prefix_len; + + r = sd_dhcp6_lease_get_pd_prefix(uplink->dhcp6_lease, &pd_prefix, &pd_prefix_len); + if (r < 0) + return r; + + /* Mask prefix for safety. */ + r = in6_addr_mask(&pd_prefix, pd_prefix_len); + if (r < 0) + return log_link_error_errno(uplink, r, "Failed to mask DHCPv6 delegated prefix: %m"); + + r = dhcp_pd_prefix_add(uplink, &pd_prefix, pd_prefix_len); + if (r < 0) + return r; + + r = sd_dhcp6_lease_get_pd_lifetime_timestamp(uplink->dhcp6_lease, CLOCK_BOOTTIME, + NULL, &lifetime_valid_usec); + if (r < 0) + return r; + + r = dhcp6_request_unreachable_route(uplink, &pd_prefix, pd_prefix_len, + lifetime_valid_usec, &server_address); + if (r < 0) + return r; + } + + /* Then, assign subnet prefixes. */ + HASHMAP_FOREACH(link, uplink->manager->links_by_index) { + if (!dhcp_pd_is_uplink(link, uplink, /* accept_auto = */ true)) + continue; + + r = dhcp6_pd_assign_subnet_prefixes(link, uplink); + if (r < 0) { + /* When failed on the upstream interface (i.e., the case link == uplink), + * immediately abort the assignment of the prefixes. As, the all assigned + * prefixes will be dropped soon in link_enter_failed(), and it is meaningless + * to continue the assignment. 
*/ + if (link == uplink) + return r; + + link_enter_failed(link); + } + } + + return 0; +} + +static bool dhcp4_pd_uplink_is_ready(Link *link) { + assert(link); + + if (!link->network) + return false; + + if (!link->network->dhcp_use_6rd) + return false; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return false; + + if (!link->dhcp_client) + return false; + + if (sd_dhcp_client_is_running(link->dhcp_client) <= 0) + return false; + + return sd_dhcp_lease_has_6rd(link->dhcp_lease); +} + +static bool dhcp6_pd_uplink_is_ready(Link *link) { + assert(link); + + if (!link->network) + return false; + + if (!link->network->dhcp6_use_pd_prefix) + return false; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return false; + + if (!link->dhcp6_client) + return false; + + if (sd_dhcp6_client_is_running(link->dhcp6_client) <= 0) + return false; + + return sd_dhcp6_lease_has_pd_prefix(link->dhcp6_lease); +} + +int dhcp_pd_find_uplink(Link *link, Link **ret) { + Link *uplink = NULL; + int r = 0; + + assert(link); + assert(link->manager); + assert(link_dhcp_pd_is_enabled(link)); + assert(ret); + + if (link->network->dhcp_pd_uplink_name) + r = link_get_by_name(link->manager, link->network->dhcp_pd_uplink_name, &uplink); + else if (link->network->dhcp_pd_uplink_index > 0) + r = link_get_by_index(link->manager, link->network->dhcp_pd_uplink_index, &uplink); + else if (link->network->dhcp_pd_uplink_index == UPLINK_INDEX_SELF) + uplink = link; + if (r < 0) + return r; + + if (uplink) { + if (dhcp4_pd_uplink_is_ready(uplink)) { + *ret = uplink; + return AF_INET; + } + + if (dhcp6_pd_uplink_is_ready(uplink)) { + *ret = uplink; + return AF_INET6; + } + + return -EBUSY; + } + + HASHMAP_FOREACH(uplink, link->manager->links_by_index) { + /* Assume that there exists at most one link which acquired delegated prefixes. 
*/ + if (dhcp4_pd_uplink_is_ready(uplink)) { + *ret = uplink; + return AF_INET; + } + + if (dhcp6_pd_uplink_is_ready(uplink)) { + *ret = uplink; + return AF_INET6; + } + } + + return -ENODEV; +} + +int dhcp_request_prefix_delegation(Link *link) { + Link *uplink; + int r; + + assert(link); + + if (!link_dhcp_pd_is_enabled(link)) + return 0; + + r = dhcp_pd_find_uplink(link, &uplink); + if (r < 0) + return 0; + + log_link_debug(link, "Requesting subnets of delegated prefixes acquired by DHCPv%c client on %s", + r == AF_INET ? '4' : '6', uplink->ifname); + + return r == AF_INET ? + dhcp4_pd_assign_subnet_prefix(link, uplink) : + dhcp6_pd_assign_subnet_prefixes(link, uplink); +} + +int config_parse_dhcp_pd_subnet_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int64_t *p = ASSERT_PTR(data); + uint64_t t; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue) || streq(rvalue, "auto")) { + *p = -1; + return 0; + } + + r = safe_atoux64(rvalue, &t); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (t > INT64_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid subnet id '%s', ignoring assignment.", + rvalue); + return 0; + } + + *p = (int64_t) t; + + return 0; +} diff --git a/src/network/networkd-dhcp-prefix-delegation.h b/src/network/networkd-dhcp-prefix-delegation.h new file mode 100644 index 0000000..e591b8a --- /dev/null +++ b/src/network/networkd-dhcp-prefix-delegation.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-dhcp-lease.h" +#include "sd-dhcp6-lease.h" + +#include "conf-parser.h" + +typedef struct Link Link; + +bool link_dhcp_pd_is_enabled(Link *link); +bool dhcp_pd_is_uplink(Link *link, 
Link *target, bool accept_auto); +int dhcp_pd_find_uplink(Link *link, Link **ret); +int dhcp_pd_remove(Link *link, bool only_marked); +int dhcp_request_prefix_delegation(Link *link); +int dhcp4_pd_prefix_acquired(Link *uplink); +int dhcp6_pd_prefix_acquired(Link *uplink); +void dhcp_pd_prefix_lost(Link *uplink); +void dhcp4_pd_prefix_lost(Link *uplink); + +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_pd_subnet_id); diff --git a/src/network/networkd-dhcp-server-bus.c b/src/network/networkd-dhcp-server-bus.c new file mode 100644 index 0000000..e3397c3 --- /dev/null +++ b/src/network/networkd-dhcp-server-bus.c @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-util.h" +#include "dhcp-server-internal.h" +#include "networkd-dhcp-server-bus.h" +#include "networkd-link-bus.h" +#include "networkd-manager.h" +#include "strv.h" + +static int property_get_leases( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + sd_dhcp_server *s; + DHCPLease *lease; + int r; + + assert(reply); + + s = l->dhcp_server; + if (!s) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Link %s has no DHCP server.", l->ifname); + + if (sd_dhcp_server_is_in_relay_mode(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Link %s has DHCP relay agent active.", l->ifname); + + r = sd_bus_message_open_container(reply, 'a', "(uayayayayt)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(lease, s->bound_leases_by_client_id) { + r = sd_bus_message_open_container(reply, 'r', "uayayayayt"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "u", (uint32_t)AF_INET); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', lease->client_id.data, lease->client_id.length); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 
'y', &lease->address, sizeof(lease->address)); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &lease->gateway, sizeof(lease->gateway)); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &lease->chaddr, sizeof(lease->chaddr)); + if (r < 0) + return r; + + r = sd_bus_message_append_basic(reply, 't', &lease->expiration); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int dhcp_server_emit_changed(Link *link, const char *property, ...) { + _cleanup_free_ char *path = NULL; + char **l; + + assert(link); + + if (sd_bus_is_ready(link->manager->bus) <= 0) + return 0; + + path = link_bus_path(link); + if (!path) + return log_oom(); + + l = strv_from_stdarg_alloca(property); + + return sd_bus_emit_properties_changed_strv( + link->manager->bus, + path, + "org.freedesktop.network1.DHCPServer", + l); +} + +void dhcp_server_callback(sd_dhcp_server *s, uint64_t event, void *data) { + Link *l = ASSERT_PTR(data); + + if (event & SD_DHCP_SERVER_EVENT_LEASE_CHANGED) + (void) dhcp_server_emit_changed(l, "Leases", NULL); +} + +static const sd_bus_vtable dhcp_server_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Leases", "a(uayayayayt)", property_get_leases, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation dhcp_server_object = { + "/org/freedesktop/network1/link", + "org.freedesktop.network1.DHCPServer", + .fallback_vtables = BUS_FALLBACK_VTABLES({dhcp_server_vtable, link_object_find}), + .node_enumerator = link_node_enumerator, +}; diff --git a/src/network/networkd-dhcp-server-bus.h b/src/network/networkd-dhcp-server-bus.h new file mode 100644 index 0000000..f52be82 --- /dev/null +++ b/src/network/networkd-dhcp-server-bus.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-dhcp-server.h" + 
+#include "bus-object.h" + +extern const BusObjectImplementation dhcp_server_object; + +void dhcp_server_callback(sd_dhcp_server *server, uint64_t event, void *data); diff --git a/src/network/networkd-dhcp-server-static-lease.c b/src/network/networkd-dhcp-server-static-lease.c new file mode 100644 index 0000000..8e7eec6 --- /dev/null +++ b/src/network/networkd-dhcp-server-static-lease.c @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "hashmap.h" +#include "networkd-dhcp-server-static-lease.h" +#include "networkd-network.h" +#include "networkd-util.h" + +DEFINE_SECTION_CLEANUP_FUNCTIONS(DHCPStaticLease, dhcp_static_lease_free); + +DHCPStaticLease *dhcp_static_lease_free(DHCPStaticLease *static_lease) { + if (!static_lease) + return NULL; + + if (static_lease->network && static_lease->section) + hashmap_remove(static_lease->network->dhcp_static_leases_by_section, static_lease->section); + + config_section_free(static_lease->section); + free(static_lease->client_id); + return mfree(static_lease); +} + +static int dhcp_static_lease_new(DHCPStaticLease **ret) { + DHCPStaticLease *p; + + assert(ret); + + p = new0(DHCPStaticLease, 1); + if (!p) + return -ENOMEM; + + *ret = TAKE_PTR(p); + return 0; +} + +static int lease_new_static(Network *network, const char *filename, unsigned section_line, DHCPStaticLease **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(dhcp_static_lease_freep) DHCPStaticLease *static_lease = NULL; + int r; + + assert(network); + assert(filename); + assert(section_line > 0); + assert(ret); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + static_lease = hashmap_get(network->dhcp_static_leases_by_section, n); + if (static_lease) { + *ret = TAKE_PTR(static_lease); + return 0; + } + + r = dhcp_static_lease_new(&static_lease); + if (r < 0) + return r; + + static_lease->network = network; + 
static_lease->section = TAKE_PTR(n); + r = hashmap_ensure_put(&network->dhcp_static_leases_by_section, &config_section_hash_ops, static_lease->section, static_lease); + if (r < 0) + return r; + + *ret = TAKE_PTR(static_lease); + return 0; +} + +static int static_lease_verify(DHCPStaticLease *static_lease) { + if (section_is_invalid(static_lease->section)) + return -EINVAL; + + if (in4_addr_is_null(&static_lease->address)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: DHCP static lease without Address= field configured. " + "Ignoring [DHCPServerStaticLease] section from line %u.", + static_lease->section->filename, static_lease->section->line); + + /* TODO: check that the address is in the pool. */ + + if (static_lease->client_id_size == 0 || !static_lease->client_id) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: DHCP static lease without MACAddress= field configured. " + "Ignoring [DHCPServerStaticLease] section from line %u.", + static_lease->section->filename, static_lease->section->line); + + assert(static_lease->client_id_size == ETH_ALEN + 1); + + return 0; +} + +void network_drop_invalid_static_leases(Network *network) { + DHCPStaticLease *static_lease; + + assert(network); + + HASHMAP_FOREACH(static_lease, network->dhcp_static_leases_by_section) + if (static_lease_verify(static_lease) < 0) + dhcp_static_lease_free(static_lease); +} + +int config_parse_dhcp_static_lease_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(dhcp_static_lease_free_or_set_invalidp) DHCPStaticLease *lease = NULL; + Network *network = ASSERT_PTR(userdata); + union in_addr_union addr; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = lease_new_static(network, filename, section_line, &lease); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + 
lease->address.s_addr = 0; + TAKE_PTR(lease); + return 0; + } + + r = in_addr_from_string(AF_INET, rvalue, &addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse IPv4 address for DHCPv4 static lease, ignoring assignment: %s", rvalue); + return 0; + } + if (in4_addr_is_null(&addr.in)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "IPv4 address for DHCPv4 static lease cannot be the ANY address, ignoring assignment: %s", rvalue); + return 0; + } + + lease->address = addr.in; + + TAKE_PTR(lease); + return 0; +} + +int config_parse_dhcp_static_lease_hwaddr( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(dhcp_static_lease_free_or_set_invalidp) DHCPStaticLease *lease = NULL; + Network *network = ASSERT_PTR(userdata); + struct ether_addr hwaddr; + uint8_t *c; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = lease_new_static(network, filename, section_line, &lease); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + lease->client_id = mfree(lease->client_id); + lease->client_id_size = 0; + return 0; + } + + r = parse_ether_addr(rvalue, &hwaddr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse MAC address for DHCPv4 static lease, ignoring assignment: %s", rvalue); + return 0; + } + if (ether_addr_is_null(&hwaddr) || (hwaddr.ether_addr_octet[0] & 0x01)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "MAC address for DHCPv4 static lease cannot be null or multicast, ignoring assignment: %s", rvalue); + return 0; + } + + c = new(uint8_t, ETH_ALEN + 1); + if (!c) + return log_oom(); + + /* set client id type to 1: Ethernet Link-Layer (RFC 2132) */ + c[0] = 0x01; + memcpy(c + 1, &hwaddr, ETH_ALEN); + + free_and_replace(lease->client_id, c); + lease->client_id_size = ETH_ALEN + 1; + + 
TAKE_PTR(lease); + return 0; +} diff --git a/src/network/networkd-dhcp-server-static-lease.h b/src/network/networkd-dhcp-server-static-lease.h new file mode 100644 index 0000000..9b8e78b --- /dev/null +++ b/src/network/networkd-dhcp-server-static-lease.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include + +#include "conf-parser.h" +#include "in-addr-util.h" + +typedef struct Network Network; +typedef struct ConfigSection ConfigSection; + +typedef struct DHCPStaticLease { + Network *network; + ConfigSection *section; + + struct in_addr address; + uint8_t *client_id; + size_t client_id_size; +} DHCPStaticLease; + +DHCPStaticLease *dhcp_static_lease_free(DHCPStaticLease *lease); +void network_drop_invalid_static_leases(Network *network); + +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_static_lease_address); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_static_lease_hwaddr); diff --git a/src/network/networkd-dhcp-server.c b/src/network/networkd-dhcp-server.c new file mode 100644 index 0000000..607fe00 --- /dev/null +++ b/src/network/networkd-dhcp-server.c @@ -0,0 +1,779 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-dhcp-server.h" + +#include "dhcp-protocol.h" +#include "fd-util.h" +#include "fileio.h" +#include "network-common.h" +#include "networkd-address.h" +#include "networkd-dhcp-server-bus.h" +#include "networkd-dhcp-server-static-lease.h" +#include "networkd-dhcp-server.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "networkd-route-util.h" +#include "parse-util.h" +#include "socket-netlink.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" + +static bool link_dhcp4_server_enabled(Link *link) { + assert(link); + + if (link->flags & IFF_LOOPBACK) + return false; + + if (!link->network) + return false; + + if (link->iftype == ARPHRD_CAN) + return false; + + return 
link->network->dhcp_server; +} + +int network_adjust_dhcp_server(Network *network, Set **addresses) { + int r; + + assert(network); + assert(addresses); + + if (!network->dhcp_server) + return 0; + + if (network->bond) { + log_warning("%s: DHCPServer= is enabled for bond slave. Disabling DHCP server.", + network->filename); + network->dhcp_server = false; + return 0; + } + + assert(network->dhcp_server_address_prefixlen <= 32); + + if (network->dhcp_server_address_prefixlen == 0) { + Address *address; + + /* If the server address is not specified, then find suitable static address. */ + + ORDERED_HASHMAP_FOREACH(address, network->addresses_by_section) { + assert(!section_is_invalid(address->section)); + + if (address->family != AF_INET) + continue; + + if (in4_addr_is_localhost(&address->in_addr.in)) + continue; + + if (in4_addr_is_link_local(&address->in_addr.in)) + continue; + + if (in4_addr_is_set(&address->in_addr_peer.in)) + continue; + + /* TODO: check if the prefix length is small enough for the pool. */ + + network->dhcp_server_address = address; + break; + } + if (!network->dhcp_server_address) { + log_warning("%s: DHCPServer= is enabled, but no suitable static address configured. " + "Disabling DHCP server.", + network->filename); + network->dhcp_server = false; + return 0; + } + + } else { + _cleanup_(address_freep) Address *a = NULL; + Address *existing; + unsigned line; + + /* TODO: check if the prefix length is small enough for the pool. */ + + /* If an address is explicitly specified, then check if the corresponding [Address] section + * is configured, and add one if not. */ + + existing = set_get(*addresses, + &(Address) { + .family = AF_INET, + .in_addr.in = network->dhcp_server_address_in_addr, + .prefixlen = network->dhcp_server_address_prefixlen, + }); + if (existing) { + /* Corresponding [Address] section already exists. 
*/ + network->dhcp_server_address = existing; + return 0; + } + + r = ordered_hashmap_by_section_find_unused_line(network->addresses_by_section, network->filename, &line); + if (r < 0) + return log_warning_errno(r, "%s: Failed to find unused line number for DHCP server address: %m", + network->filename); + + r = address_new_static(network, network->filename, line, &a); + if (r < 0) + return log_warning_errno(r, "%s: Failed to add new static address object for DHCP server: %m", + network->filename); + + a->family = AF_INET; + a->prefixlen = network->dhcp_server_address_prefixlen; + a->in_addr.in = network->dhcp_server_address_in_addr; + a->requested_as_null = !in4_addr_is_set(&network->dhcp_server_address_in_addr); + + r = address_section_verify(a); + if (r < 0) + return r; + + r = set_ensure_put(addresses, &address_hash_ops, a); + if (r < 0) + return log_oom(); + assert(r > 0); + + network->dhcp_server_address = TAKE_PTR(a); + } + + return 0; +} + +static int dhcp_server_find_uplink(Link *link, Link **ret) { + assert(link); + + if (link->network->dhcp_server_uplink_name) + return link_get_by_name(link->manager, link->network->dhcp_server_uplink_name, ret); + + if (link->network->dhcp_server_uplink_index > 0) + return link_get_by_index(link->manager, link->network->dhcp_server_uplink_index, ret); + + if (link->network->dhcp_server_uplink_index == UPLINK_INDEX_AUTO) { + /* It is not necessary to propagate error in automatic selection. 
*/ + if (manager_find_uplink(link->manager, AF_INET, link, ret) < 0) + *ret = NULL; + return 0; + } + + *ret = NULL; + return 0; +} + +static int link_push_uplink_to_dhcp_server( + Link *link, + sd_dhcp_lease_server_type_t what, + sd_dhcp_server *s) { + + _cleanup_free_ struct in_addr *addresses = NULL; + bool use_dhcp_lease_data = true; + size_t n_addresses = 0; + + assert(link); + + if (!link->network) + return 0; + assert(link->network); + + log_link_debug(link, "Copying %s from link", dhcp_lease_server_type_to_string(what)); + + switch (what) { + + case SD_DHCP_LEASE_DNS: + /* For DNS we have a special case. We the data configured explicitly locally along with the + * data from the DHCP lease. */ + + for (unsigned i = 0; i < link->network->n_dns; i++) { + struct in_addr ia; + + /* Only look for IPv4 addresses */ + if (link->network->dns[i]->family != AF_INET) + continue; + + ia = link->network->dns[i]->address.in; + + /* Never propagate obviously borked data */ + if (in4_addr_is_null(&ia) || in4_addr_is_localhost(&ia)) + continue; + + if (!GREEDY_REALLOC(addresses, n_addresses + 1)) + return log_oom(); + + addresses[n_addresses++] = ia; + } + + use_dhcp_lease_data = link->network->dhcp_use_dns; + break; + + case SD_DHCP_LEASE_NTP: { + /* For NTP things are similar, but for NTP hostnames can be configured too, which we cannot + * propagate via DHCP. Hence let's only propagate those which are IP addresses. 
*/ + + STRV_FOREACH(i, link->network->ntp) { + union in_addr_union ia; + + if (in_addr_from_string(AF_INET, *i, &ia) < 0) + continue; + + /* Never propagate obviously borked data */ + if (in4_addr_is_null(&ia.in) || in4_addr_is_localhost(&ia.in)) + continue; + + if (!GREEDY_REALLOC(addresses, n_addresses + 1)) + return log_oom(); + + addresses[n_addresses++] = ia.in; + } + + use_dhcp_lease_data = link->network->dhcp_use_ntp; + break; + } + + case SD_DHCP_LEASE_SIP: + + /* For SIP we don't allow explicit, local configuration, but there's control whether to use the data */ + use_dhcp_lease_data = link->network->dhcp_use_sip; + break; + + case SD_DHCP_LEASE_POP3: + case SD_DHCP_LEASE_SMTP: + case SD_DHCP_LEASE_LPR: + /* For the other server types we currently do not allow local configuration of server data, + * since there are typically no local consumers of the data. */ + break; + + default: + assert_not_reached(); + } + + if (use_dhcp_lease_data && link->dhcp_lease) { + const struct in_addr *da; + + int n = sd_dhcp_lease_get_servers(link->dhcp_lease, what, &da); + if (n > 0) { + if (!GREEDY_REALLOC(addresses, n_addresses + n)) + return log_oom(); + + for (int j = 0; j < n; j++) + if (in4_addr_is_non_local(&da[j])) + addresses[n_addresses++] = da[j]; + } + } + + if (n_addresses <= 0) + return 0; + + return sd_dhcp_server_set_servers(s, what, addresses, n_addresses); +} + +static int dhcp4_server_parse_dns_server_string_and_warn( + const char *string, + struct in_addr **addresses, + size_t *n_addresses) { + + for (;;) { + _cleanup_free_ char *word = NULL, *server_name = NULL; + union in_addr_union address; + int family, r, ifindex = 0; + + r = extract_first_word(&string, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + r = in_addr_ifindex_name_from_string_auto(word, &family, &address, &ifindex, &server_name); + if (r < 0) { + log_warning_errno(r, "Failed to parse DNS server address '%s', ignoring: %m", word); + continue; + } + + /* Only look for 
IPv4 addresses */ + if (family != AF_INET) + continue; + + /* Never propagate obviously borked data */ + if (in4_addr_is_null(&address.in) || in4_addr_is_localhost(&address.in)) + continue; + + if (!GREEDY_REALLOC(*addresses, *n_addresses + 1)) + return log_oom(); + + (*addresses)[(*n_addresses)++] = address.in; + } + + return 0; +} + +static int dhcp4_server_set_dns_from_resolve_conf(Link *link) { + _cleanup_free_ struct in_addr *addresses = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t n_addresses = 0; + int r; + + f = fopen(PRIVATE_UPLINK_RESOLV_CONF, "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open " PRIVATE_UPLINK_RESOLV_CONF ": %m"); + } + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *a; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read " PRIVATE_UPLINK_RESOLV_CONF ": %m"); + if (r == 0) + break; + + if (IN_SET(*line, '#', ';', 0)) + continue; + + a = first_word(line, "nameserver"); + if (!a) + continue; + + r = dhcp4_server_parse_dns_server_string_and_warn(a, &addresses, &n_addresses); + if (r < 0) + log_warning_errno(r, "Failed to parse DNS server address '%s', ignoring.", a); + } + + if (n_addresses <= 0) + return 0; + + return sd_dhcp_server_set_dns(link->dhcp_server, addresses, n_addresses); +} + +static int dhcp4_server_configure(Link *link) { + bool acquired_uplink = false; + sd_dhcp_option *p; + DHCPStaticLease *static_lease; + Link *uplink = NULL; + Address *address; + bool bind_to_interface; + int r; + + assert(link); + assert(link->network); + assert(link->network->dhcp_server_address); + + log_link_debug(link, "Configuring DHCP Server."); + + if (link->dhcp_server) + return -EBUSY; + + r = sd_dhcp_server_new(&link->dhcp_server, link->ifindex); + if (r < 0) + return r; + + r = sd_dhcp_server_attach_event(link->dhcp_server, link->manager->event, 0); + if (r < 0) + return r; + + r = 
sd_dhcp_server_set_callback(link->dhcp_server, dhcp_server_callback, link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to set callback for DHCPv4 server instance: %m"); + + r = address_get(link, link->network->dhcp_server_address, &address); + if (r < 0) + return log_link_error_errno(link, r, "Failed to find suitable address for DHCPv4 server instance: %m"); + + /* use the server address' subnet as the pool */ + r = sd_dhcp_server_configure_pool(link->dhcp_server, &address->in_addr.in, address->prefixlen, + link->network->dhcp_server_pool_offset, link->network->dhcp_server_pool_size); + if (r < 0) + return log_link_error_errno(link, r, "Failed to configure address pool for DHCPv4 server instance: %m"); + + if (link->network->dhcp_server_max_lease_time_usec > 0) { + r = sd_dhcp_server_set_max_lease_time(link->dhcp_server, link->network->dhcp_server_max_lease_time_usec); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set maximum lease time for DHCPv4 server instance: %m"); + } + + if (link->network->dhcp_server_default_lease_time_usec > 0) { + r = sd_dhcp_server_set_default_lease_time(link->dhcp_server, link->network->dhcp_server_default_lease_time_usec); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set default lease time for DHCPv4 server instance: %m"); + } + + r = sd_dhcp_server_set_ipv6_only_preferred_usec(link->dhcp_server, link->network->dhcp_server_ipv6_only_preferred_usec); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set IPv6 only preferred time for DHCPv4 server instance: %m"); + + r = sd_dhcp_server_set_boot_server_address(link->dhcp_server, &link->network->dhcp_server_boot_server_address); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to set boot server address for DHCPv4 server instance: %m"); + + r = sd_dhcp_server_set_boot_server_name(link->dhcp_server, link->network->dhcp_server_boot_server_name); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to set 
boot server name for DHCPv4 server instance: %m"); + + r = sd_dhcp_server_set_boot_filename(link->dhcp_server, link->network->dhcp_server_boot_filename); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to set boot filename for DHCPv4 server instance: %m"); + + r = sd_dhcp_server_set_rapid_commit(link->dhcp_server, link->network->dhcp_server_rapid_commit); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to %s Rapid Commit support for DHCPv4 server instance: %m", + enable_disable(link->network->dhcp_server_rapid_commit)); + + for (sd_dhcp_lease_server_type_t type = 0; type < _SD_DHCP_LEASE_SERVER_TYPE_MAX; type ++) { + + if (!link->network->dhcp_server_emit[type].emit) + continue; + + if (link->network->dhcp_server_emit[type].n_addresses > 0) + /* Explicitly specified servers to emit */ + r = sd_dhcp_server_set_servers( + link->dhcp_server, + type, + link->network->dhcp_server_emit[type].addresses, + link->network->dhcp_server_emit[type].n_addresses); + else { + /* Emission is requested, but nothing explicitly configured. 
Let's find a suitable uplink */ + if (!acquired_uplink) { + (void) dhcp_server_find_uplink(link, &uplink); + acquired_uplink = true; + } + + if (uplink && uplink->network) + r = link_push_uplink_to_dhcp_server(uplink, type, link->dhcp_server); + else if (type == SD_DHCP_LEASE_DNS) + r = dhcp4_server_set_dns_from_resolve_conf(link); + else { + log_link_debug(link, + "Not emitting %s on link, couldn't find suitable uplink.", + dhcp_lease_server_type_to_string(type)); + continue; + } + } + + if (r < 0) + log_link_warning_errno(link, r, + "Failed to set %s for DHCP server, ignoring: %m", + dhcp_lease_server_type_to_string(type)); + } + + if (link->network->dhcp_server_emit_router) { + r = sd_dhcp_server_set_router(link->dhcp_server, &link->network->dhcp_server_router); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set router address for DHCP server: %m"); + } + + r = sd_dhcp_server_set_relay_target(link->dhcp_server, &link->network->dhcp_server_relay_target); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set relay target for DHCP server: %m"); + + bind_to_interface = sd_dhcp_server_is_in_relay_mode(link->dhcp_server) ? 
false : link->network->dhcp_server_bind_to_interface; + r = sd_dhcp_server_set_bind_to_interface(link->dhcp_server, bind_to_interface); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set interface binding for DHCP server: %m"); + + r = sd_dhcp_server_set_relay_agent_information(link->dhcp_server, link->network->dhcp_server_relay_agent_circuit_id, link->network->dhcp_server_relay_agent_remote_id); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set agent circuit/remote id for DHCP server: %m"); + + if (link->network->dhcp_server_emit_timezone) { + _cleanup_free_ char *buffer = NULL; + const char *tz = NULL; + + if (link->network->dhcp_server_timezone) + tz = link->network->dhcp_server_timezone; + else { + r = get_timezone(&buffer); + if (r < 0) + log_link_warning_errno(link, r, "Failed to determine timezone, not sending timezone: %m"); + else + tz = buffer; + } + + if (tz) { + r = sd_dhcp_server_set_timezone(link->dhcp_server, tz); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set timezone for DHCP server: %m"); + } + } + + ORDERED_HASHMAP_FOREACH(p, link->network->dhcp_server_send_options) { + r = sd_dhcp_server_add_option(link->dhcp_server, p); + if (r == -EEXIST) + continue; + if (r < 0) + return log_link_error_errno(link, r, "Failed to set DHCPv4 option: %m"); + } + + ORDERED_HASHMAP_FOREACH(p, link->network->dhcp_server_send_vendor_options) { + r = sd_dhcp_server_add_vendor_option(link->dhcp_server, p); + if (r == -EEXIST) + continue; + if (r < 0) + return log_link_error_errno(link, r, "Failed to set DHCPv4 option: %m"); + } + + HASHMAP_FOREACH(static_lease, link->network->dhcp_static_leases_by_section) { + r = sd_dhcp_server_set_static_lease(link->dhcp_server, &static_lease->address, static_lease->client_id, static_lease->client_id_size); + if (r < 0) + return log_link_error_errno(link, r, "Failed to set DHCPv4 static lease for DHCP server: %m"); + } + + r = sd_dhcp_server_start(link->dhcp_server); + if (r < 0) + 
return log_link_error_errno(link, r, "Could not start DHCPv4 server instance: %m"); + + log_link_debug(link, "Offering DHCPv4 leases"); + return 0; +} + +static bool dhcp_server_is_ready_to_configure(Link *link) { + Link *uplink = NULL; + Address *a; + + assert(link); + assert(link->network); + assert(link->network->dhcp_server_address); + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return false; + + if (!link_has_carrier(link)) + return false; + + if (!link->static_addresses_configured) + return false; + + if (address_get(link, link->network->dhcp_server_address, &a) < 0) + return false; + + if (!address_is_ready(a)) + return false; + + if (dhcp_server_find_uplink(link, &uplink) < 0) + return false; + + if (uplink && !uplink->network) + return false; + + return true; +} + +static int dhcp_server_process_request(Request *req, Link *link, void *userdata) { + int r; + + assert(link); + + if (!dhcp_server_is_ready_to_configure(link)) + return 0; + + r = dhcp4_server_configure(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure DHCP server: %m"); + + return 1; +} + +int link_request_dhcp_server(Link *link) { + int r; + + assert(link); + + if (!link_dhcp4_server_enabled(link)) + return 0; + + if (link->dhcp_server) + return 0; + + log_link_debug(link, "Requesting DHCP server."); + r = link_queue_request(link, REQUEST_TYPE_DHCP_SERVER, dhcp_server_process_request, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request configuration of DHCP server: %m"); + + return 0; +} + +int config_parse_dhcp_server_relay_agent_suboption( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **suboption_value = data; + char* p; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *suboption_value = 
mfree(*suboption_value); + return 0; + } + + p = startswith(rvalue, "string:"); + if (!p) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse %s=%s'. Invalid format, ignoring.", lvalue, rvalue); + return 0; + } + return free_and_strdup(suboption_value, empty_to_null(p)); +} + +int config_parse_dhcp_server_emit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + NetworkDHCPServerEmitAddress *emit = ASSERT_PTR(data); + + assert(rvalue); + + if (isempty(rvalue)) { + emit->addresses = mfree(emit->addresses); + emit->n_addresses = 0; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + union in_addr_union a; + int r; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract word, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + if (streq(w, "_server_address")) + a = IN_ADDR_NULL; /* null address will be converted to the server address. 
*/ + else { + r = in_addr_from_string(AF_INET, w, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s= address '%s', ignoring: %m", lvalue, w); + continue; + } + + if (in4_addr_is_null(&a.in)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Found a null address in %s=, ignoring.", lvalue); + continue; + } + } + + if (!GREEDY_REALLOC(emit->addresses, emit->n_addresses + 1)) + return log_oom(); + + emit->addresses[emit->n_addresses++] = a.in; + } +} + +int config_parse_dhcp_server_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + union in_addr_union a; + unsigned char prefixlen; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + network->dhcp_server_address_in_addr = (struct in_addr) {}; + network->dhcp_server_address_prefixlen = 0; + return 0; + } + + r = in_addr_prefix_from_string(rvalue, AF_INET, &a, &prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + if (in4_addr_is_localhost(&a.in) || in4_addr_is_link_local(&a.in)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "DHCP server address cannot be a localhost or link-local address, " + "ignoring assignment: %s", rvalue); + return 0; + } + + network->dhcp_server_address_in_addr = a.in; + network->dhcp_server_address_prefixlen = prefixlen; + return 0; +} + +int config_parse_dhcp_server_ipv6_only_preferred( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t t, *usec = ASSERT_PTR(data); + int r; + + assert(filename); + assert(section); + assert(lvalue); 
+ assert(rvalue); + + if (isempty(rvalue)) { + *usec = 0; + return 0; + } + + r = parse_sec(rvalue, &t); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse [%s] %s=, ignoring assignment: %s", section, lvalue, rvalue); + return 0; + } + + if (t < MIN_V6ONLY_WAIT_USEC && !network_test_mode_enabled()) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid [%s] %s=, ignoring assignment: %s", section, lvalue, rvalue); + return 0; + } + + *usec = t; + return 0; +} diff --git a/src/network/networkd-dhcp-server.h b/src/network/networkd-dhcp-server.h new file mode 100644 index 0000000..960232a --- /dev/null +++ b/src/network/networkd-dhcp-server.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "set.h" + +typedef struct Link Link; +typedef struct Network Network; + +int network_adjust_dhcp_server(Network *network, Set **addresses); + +int link_request_dhcp_server(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_server_relay_agent_suboption); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_server_emit); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_server_address); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_server_ipv6_only_preferred); diff --git a/src/network/networkd-dhcp4-bus.c b/src/network/networkd-dhcp4-bus.c new file mode 100644 index 0000000..e00aa03 --- /dev/null +++ b/src/network/networkd-dhcp4-bus.c @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-dhcp-client.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-util.h" +#include "dhcp-client-internal.h" +#include "networkd-dhcp4-bus.h" +#include "networkd-link-bus.h" +#include "networkd-manager.h" +#include "string-table.h" +#include "strv.h" + +static int property_get_dhcp_client_state( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + 
Link *l = ASSERT_PTR(userdata); + sd_dhcp_client *c; + + assert(reply); + + c = l->dhcp_client; + if (!c) + return sd_bus_message_append(reply, "s", "disabled"); + + return sd_bus_message_append(reply, "s", dhcp_state_to_string(dhcp_client_get_state(c))); +} + +static int dhcp_client_emit_changed(Link *link, const char *property, ...) { + _cleanup_free_ char *path = NULL; + char **l; + + assert(link); + + if (sd_bus_is_ready(link->manager->bus) <= 0) + return 0; + + path = link_bus_path(link); + if (!path) + return log_oom(); + + l = strv_from_stdarg_alloca(property); + + return sd_bus_emit_properties_changed_strv( + link->manager->bus, + path, + "org.freedesktop.network1.DHCPv4Client", + l); +} + +int dhcp_client_callback_bus(sd_dhcp_client *c, int event, void *userdata) { + Link *l = ASSERT_PTR(userdata); + + return dhcp_client_emit_changed(l, "State", NULL); +} + +static const sd_bus_vtable dhcp_client_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("State", "s", property_get_dhcp_client_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation dhcp_client_object = { + "/org/freedesktop/network1/link", + "org.freedesktop.network1.DHCPv4Client", + .fallback_vtables = BUS_FALLBACK_VTABLES({dhcp_client_vtable, link_object_find}), + .node_enumerator = link_node_enumerator, +}; diff --git a/src/network/networkd-dhcp4-bus.h b/src/network/networkd-dhcp4-bus.h new file mode 100644 index 0000000..482e824 --- /dev/null +++ b/src/network/networkd-dhcp4-bus.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-dhcp-client.h" + +#include "networkd-link-bus.h" + +extern const BusObjectImplementation dhcp_client_object; + +int dhcp_client_callback_bus(sd_dhcp_client *client, int event, void *userdata); diff --git a/src/network/networkd-dhcp4.c b/src/network/networkd-dhcp4.c new file mode 100644 index 0000000..49c452d --- /dev/null +++ b/src/network/networkd-dhcp4.c @@ -0,0 
+1,2025 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dhcp-client-internal.h" +#include "hostname-setup.h" +#include "hostname-util.h" +#include "parse-util.h" +#include "network-internal.h" +#include "networkd-address.h" +#include "networkd-dhcp-prefix-delegation.h" +#include "networkd-dhcp4-bus.h" +#include "networkd-dhcp4.h" +#include "networkd-ipv4acd.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-nexthop.h" +#include "networkd-queue.h" +#include "networkd-route.h" +#include "networkd-setlink.h" +#include "networkd-state-file.h" +#include "string-table.h" +#include "strv.h" +#include "sysctl-util.h" + +void network_adjust_dhcp4(Network *network) { + assert(network); + + if (!FLAGS_SET(network->dhcp, ADDRESS_FAMILY_IPV4)) + return; + + if (network->dhcp_use_gateway < 0) + network->dhcp_use_gateway = network->dhcp_use_routes; + + /* RFC7844 section 3.: MAY contain the Client Identifier option + * Section 3.5: clients MUST use client identifiers based solely on the link-layer address + * NOTE: Using MAC, as it does not reveal extra information, and some servers might not answer + * if this option is not sent */ + if (network->dhcp_anonymize && + network->dhcp_client_identifier >= 0 && + network->dhcp_client_identifier != DHCP_CLIENT_ID_MAC) { + log_warning("%s: ClientIdentifier= is set, although Anonymize=yes. Using ClientIdentifier=mac.", + network->filename); + network->dhcp_client_identifier = DHCP_CLIENT_ID_MAC; + } + + if (network->dhcp_client_identifier < 0) + network->dhcp_client_identifier = network->dhcp_anonymize ? DHCP_CLIENT_ID_MAC : DHCP_CLIENT_ID_DUID; + + /* By default, RapidCommit= is enabled when Anonymize=no and neither AllowList= nor DenyList= is specified. 
*/ + if (network->dhcp_use_rapid_commit < 0) + network->dhcp_use_rapid_commit = + !network->dhcp_anonymize && + set_isempty(network->dhcp_allow_listed_ip) && + set_isempty(network->dhcp_deny_listed_ip); +} + +static int dhcp4_prefix_covers( + Link *link, + const struct in_addr *in_prefix, + uint8_t in_prefixlen) { + + struct in_addr prefix; + uint8_t prefixlen; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(in_prefix); + + /* Return true if the input address or address range is in the assigned network. + * E.g. if the DHCP server provides 192.168.0.100/24, then this returns true for the address or + * address range in 192.168.0.0/24, and returns false otherwise. */ + + r = sd_dhcp_lease_get_prefix(link->dhcp_lease, &prefix, &prefixlen); + if (r < 0) + return r; + + return in4_addr_prefix_covers_full(&prefix, prefixlen, in_prefix, in_prefixlen); +} + +static int dhcp4_get_router(Link *link, struct in_addr *ret) { + const struct in_addr *routers; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(ret); + + r = sd_dhcp_lease_get_router(link->dhcp_lease, &routers); + if (r < 0) + return r; + + /* The router option may provide multiple routers; we only use the first non-null address. */ + + FOREACH_ARRAY(router, routers, r) { + if (in4_addr_is_null(router)) + continue; + + *ret = *router; + return 0; + } + + return -ENODATA; +} + +static int dhcp4_get_classless_static_or_static_routes(Link *link, sd_dhcp_route ***ret_routes, size_t *ret_num) { + _cleanup_free_ sd_dhcp_route **routes = NULL; + int r; + + assert(link); + assert(link->dhcp_lease); + + /* If the DHCP server returns both a Classless Static Routes option and a Static Routes option, + * the DHCP client MUST ignore the Static Routes option. 
*/ + + r = sd_dhcp_lease_get_classless_routes(link->dhcp_lease, &routes); + if (r >= 0) { + assert(r > 0); + if (ret_routes) + *ret_routes = TAKE_PTR(routes); + if (ret_num) + *ret_num = r; + return 1; /* classless */ + } else if (r != -ENODATA) + return r; + + r = sd_dhcp_lease_get_static_routes(link->dhcp_lease, &routes); + if (r < 0) + return r; + + assert(r > 0); + if (ret_routes) + *ret_routes = TAKE_PTR(routes); + if (ret_num) + *ret_num = r; + return 0; /* static */ +} + +static int dhcp4_find_gateway_for_destination( + Link *link, + const struct in_addr *destination, + uint8_t prefixlength, + bool allow_null, + struct in_addr *ret) { + + _cleanup_free_ sd_dhcp_route **routes = NULL; + size_t n_routes = 0; + bool is_classless, reachable; + uint8_t max_prefixlen = UINT8_MAX; + struct in_addr gw; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(destination); + assert(ret); + + /* This tries to find the most suitable gateway for an address or address range. + * E.g. if the server provides the default gateway 192.168.0.1 and a classless static route for + * 8.0.0.0/8 with gateway 192.168.0.2, then this returns 192.168.0.2 for 8.8.8.8/32, and 192.168.0.1 + * for 9.9.9.9/32. If 'allow_null' flag is set, and the input address or address range is in the + * assigned network, then the default gateway will be ignored and the null address will be returned + * unless a matching non-default gateway found. */ + + r = dhcp4_prefix_covers(link, destination, prefixlength); + if (r < 0) + return r; + reachable = r > 0; + + r = dhcp4_get_classless_static_or_static_routes(link, &routes, &n_routes); + if (r < 0 && r != -ENODATA) + return r; + is_classless = r > 0; + + /* First, find most suitable gateway. 
*/ + FOREACH_ARRAY(e, routes, n_routes) { + struct in_addr dst; + uint8_t len; + + r = sd_dhcp_route_get_destination(*e, &dst); + if (r < 0) + return r; + + r = sd_dhcp_route_get_destination_prefix_length(*e, &len); + if (r < 0) + return r; + + r = in4_addr_prefix_covers_full(&dst, len, destination, prefixlength); + if (r < 0) + return r; + if (r == 0) + continue; + + if (max_prefixlen != UINT8_MAX && max_prefixlen > len) + continue; + + r = sd_dhcp_route_get_gateway(*e, &gw); + if (r < 0) + return r; + + max_prefixlen = len; + } + + /* Found a suitable gateway in classless static routes or static routes. */ + if (max_prefixlen != UINT8_MAX) { + if (max_prefixlen == 0 && reachable && allow_null) + /* Do not return the default gateway, if the destination is in the assigned network. */ + *ret = (struct in_addr) {}; + else + *ret = gw; + return 0; + } + + /* When the destination is in the assigned network, return the null address if allowed. */ + if (reachable && allow_null) { + *ret = (struct in_addr) {}; + return 0; + } + + /* According to RFC 3442: If the DHCP server returns both a Classless Static Routes option and + * a Router option, the DHCP client MUST ignore the Router option. */ + if (!is_classless) { + r = dhcp4_get_router(link, ret); + if (r >= 0) + return 0; + if (r != -ENODATA) + return r; + } + + if (!reachable) + return -EHOSTUNREACH; /* Not in the same network, cannot reach the destination. */ + + assert(!allow_null); + return -ENODATA; /* No matching gateway found. 
*/ +} + +static int dhcp4_remove_address_and_routes(Link *link, bool only_marked) { + Address *address; + Route *route; + int k, r = 0; + + assert(link); + + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_DHCP4) + continue; + if (only_marked && !route_is_marked(route)) + continue; + + k = route_remove(route); + if (k < 0) + r = k; + + route_cancel_request(route, link); + } + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_DHCP4) + continue; + if (only_marked && !address_is_marked(address)) + continue; + + k = address_remove_and_drop(address); + if (k < 0) + r = k; + } + + return r; +} + +static int dhcp4_address_get(Link *link, Address **ret) { + Address *address; + + assert(link); + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_DHCP4) + continue; + if (address_is_marked(address)) + continue; + + if (ret) + *ret = address; + return 0; + } + + return -ENOENT; +} + +static int dhcp4_address_ready_callback(Address *address) { + assert(address); + assert(address->link); + + /* Do not call this again. */ + address->callback = NULL; + + return dhcp4_check_ready(address->link); +} + +int dhcp4_check_ready(Link *link) { + Address *address; + int r; + + assert(link); + + if (link->dhcp4_messages > 0) { + log_link_debug(link, "%s(): DHCPv4 address and routes are not set.", __func__); + return 0; + } + + if (dhcp4_address_get(link, &address) < 0) { + log_link_debug(link, "%s(): DHCPv4 address is not set.", __func__); + return 0; + } + + if (!address_is_ready(address)) { + log_link_debug(link, "%s(): DHCPv4 address is not ready.", __func__); + address->callback = dhcp4_address_ready_callback; + return 0; + } + + link->dhcp4_configured = true; + log_link_debug(link, "DHCPv4 address and routes set."); + + /* New address and routes are configured now. Let's release old lease. 
*/ + r = dhcp4_remove_address_and_routes(link, /* only_marked = */ true); + if (r < 0) + return r; + + r = sd_ipv4ll_stop(link->ipv4ll); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to drop IPv4 link-local address: %m"); + + link_check_ready(link); + return 0; +} + +static int dhcp4_route_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Route *route) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not set DHCPv4 route"); + link_enter_failed(link); + return 1; + } + + r = dhcp4_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static int dhcp4_request_route(Route *in, Link *link) { + _cleanup_(route_freep) Route *route = in; + struct in_addr server; + Route *existing; + int r; + + assert(route); + assert(link); + assert(link->network); + assert(link->dhcp_lease); + + r = sd_dhcp_lease_get_server_identifier(link->dhcp_lease, &server); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get DHCP server IP address: %m"); + + route->source = NETWORK_CONFIG_SOURCE_DHCP4; + route->provider.in = server; + route->family = AF_INET; + if (!route->protocol_set) + route->protocol = RTPROT_DHCP; + if (!route->priority_set) + route->priority = link->network->dhcp_route_metric; + if (!route->table_set) + route->table = link_get_dhcp4_route_table(link); + if (route->mtu == 0) + route->mtu = link->network->dhcp_route_mtu; + if (route->quickack < 0) + route->quickack = link->network->dhcp_quickack; + if (route->initcwnd == 0) + route->initcwnd = link->network->dhcp_initial_congestion_window; + if (route->initrwnd == 0) + route->initrwnd = link->network->dhcp_advertised_receive_window; + + if (route_get(NULL, link, route, &existing) < 0) /* This is a new route. 
*/ + link->dhcp4_configured = false; + else + route_unmark(existing); + + return link_request_route(link, TAKE_PTR(route), true, &link->dhcp4_messages, + dhcp4_route_handler, NULL); +} + +static bool link_prefixroute(Link *link) { + return !link->network->dhcp_route_table_set || + link->network->dhcp_route_table == RT_TABLE_MAIN; +} + +static int dhcp4_request_prefix_route(Link *link) { + _cleanup_(route_freep) Route *route = NULL; + int r; + + assert(link); + assert(link->dhcp_lease); + + if (link_prefixroute(link)) + /* When true, the route will be created by kernel. See dhcp4_update_address(). */ + return 0; + + r = route_new(&route); + if (r < 0) + return r; + + route->scope = RT_SCOPE_LINK; + + r = sd_dhcp_lease_get_prefix(link->dhcp_lease, &route->dst.in, &route->dst_prefixlen); + if (r < 0) + return r; + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &route->prefsrc.in); + if (r < 0) + return r; + + return dhcp4_request_route(TAKE_PTR(route), link); +} + +static int dhcp4_request_route_to_gateway(Link *link, const struct in_addr *gw) { + _cleanup_(route_freep) Route *route = NULL; + struct in_addr address; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(gw); + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &address); + if (r < 0) + return r; + + r = route_new(&route); + if (r < 0) + return r; + + route->dst.in = *gw; + route->dst_prefixlen = 32; + route->prefsrc.in = address; + route->scope = RT_SCOPE_LINK; + + return dhcp4_request_route(TAKE_PTR(route), link); +} + +static int dhcp4_request_route_auto( + Route *in, + Link *link, + const struct in_addr *gw) { + + _cleanup_(route_freep) Route *route = in; + struct in_addr address; + int r; + + assert(route); + assert(link); + assert(link->dhcp_lease); + assert(gw); + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &address); + if (r < 0) + return r; + + if (in4_addr_is_localhost(&route->dst.in)) { + if (in4_addr_is_set(gw)) + log_link_debug(link, "DHCP: requested route destination 
"IPV4_ADDRESS_FMT_STR"/%u is localhost, " + "ignoring gateway address "IPV4_ADDRESS_FMT_STR, + IPV4_ADDRESS_FMT_VAL(route->dst.in), route->dst_prefixlen, IPV4_ADDRESS_FMT_VAL(*gw)); + + route->scope = RT_SCOPE_HOST; + route->gw_family = AF_UNSPEC; + route->gw = IN_ADDR_NULL; + route->prefsrc = IN_ADDR_NULL; + + } else if (in4_addr_equal(&route->dst.in, &address)) { + if (in4_addr_is_set(gw)) + log_link_debug(link, "DHCP: requested route destination "IPV4_ADDRESS_FMT_STR"/%u is equivalent to the acquired address, " + "ignoring gateway address "IPV4_ADDRESS_FMT_STR, + IPV4_ADDRESS_FMT_VAL(route->dst.in), route->dst_prefixlen, IPV4_ADDRESS_FMT_VAL(*gw)); + + route->scope = RT_SCOPE_HOST; + route->gw_family = AF_UNSPEC; + route->gw = IN_ADDR_NULL; + route->prefsrc.in = address; + + } else if (in4_addr_is_null(gw)) { + r = dhcp4_prefix_covers(link, &route->dst.in, route->dst_prefixlen); + if (r < 0) + return r; + if (r == 0 && DEBUG_LOGGING) { + struct in_addr prefix; + uint8_t prefixlen; + + r = sd_dhcp_lease_get_prefix(link->dhcp_lease, &prefix, &prefixlen); + if (r < 0) + return r; + + log_link_debug(link, "DHCP: requested route destination "IPV4_ADDRESS_FMT_STR"/%u is not in the assigned network " + IPV4_ADDRESS_FMT_STR"/%u, but no gateway is specified, using 'link' scope.", + IPV4_ADDRESS_FMT_VAL(route->dst.in), route->dst_prefixlen, + IPV4_ADDRESS_FMT_VAL(prefix), prefixlen); + } + + route->scope = RT_SCOPE_LINK; + route->gw_family = AF_UNSPEC; + route->gw = IN_ADDR_NULL; + route->prefsrc.in = address; + + } else { + r = dhcp4_request_route_to_gateway(link, gw); + if (r < 0) + return r; + + route->scope = RT_SCOPE_UNIVERSE; + route->gw_family = AF_INET; + route->gw.in = *gw; + route->prefsrc.in = address; + } + + return dhcp4_request_route(TAKE_PTR(route), link); +} + +static int dhcp4_request_classless_static_or_static_routes(Link *link) { + _cleanup_free_ sd_dhcp_route **routes = NULL; + size_t n_routes; + int r; + + assert(link); + assert(link->dhcp_lease); + + 
/* Nothing to request unless the network enables use of DHCP-provided routes. */ if (!link->network->dhcp_use_routes) + return 0; + + r = dhcp4_get_classless_static_or_static_routes(link, &routes, &n_routes); + /* -ENODATA is treated as "nothing to request", not as a failure. */ if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + FOREACH_ARRAY(e, routes, n_routes) { + /* One Route object per entry; it is handed over to dhcp4_request_route_auto() via TAKE_PTR(), and freed by the cleanup handler on the earlier error paths. */ _cleanup_(route_freep) Route *route = NULL; + struct in_addr gw; + + r = route_new(&route); + if (r < 0) + return r; + + r = sd_dhcp_route_get_gateway(*e, &gw); + if (r < 0) + return r; + + r = sd_dhcp_route_get_destination(*e, &route->dst.in); + if (r < 0) + return r; + + r = sd_dhcp_route_get_destination_prefix_length(*e, &route->dst_prefixlen); + if (r < 0) + return r; + + r = dhcp4_request_route_auto(TAKE_PTR(route), link, &gw); + if (r < 0) + return r; + } + + return 0; +} + +/* Requests a host route to the router from the lease and then a route via that router with the leased address as preferred source. Returns 0 without doing anything when use of the DHCP gateway is disabled, when classless/static routes take precedence, or when no router is available; returns a negative errno on failure; otherwise returns the result of dhcp4_request_route(). */ static int dhcp4_request_default_gateway(Link *link) { + _cleanup_(route_freep) Route *route = NULL; + struct in_addr address, router; + int r; + + assert(link); + assert(link->dhcp_lease); + + if (!link->network->dhcp_use_gateway) + return 0; + + /* According to RFC 3442: If the DHCP server returns both a Classless Static Routes option and + * a Router option, the DHCP client MUST ignore the Router option. */ + if (link->network->dhcp_use_routes && + dhcp4_get_classless_static_or_static_routes(link, NULL, NULL) > 0) + return 0; + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &address); + if (r < 0) + return r; + + r = dhcp4_get_router(link, &router); + if (r == -ENODATA) { + log_link_debug(link, "DHCP: No valid router address received from DHCP server."); + return 0; + } + if (r < 0) + return r; + + /* The dhcp netmask may mask out the gateway. First, add an explicit route for the gateway host + * so that we can route no matter the netmask or existing kernel route tables. */ + r = dhcp4_request_route_to_gateway(link, &router); + if (r < 0) + return r; + + r = route_new(&route); + if (r < 0) + return r; + + /* Next, add a default gateway. 
*/ + route->gw_family = AF_INET; + route->gw.in = router; + route->prefsrc.in = address; + + return dhcp4_request_route(TAKE_PTR(route), link); +} + +/* For every configured route that takes its gateway from DHCP or RA (gateway_from_dhcp_or_ra) and has an IPv4 gateway family: looks up a gateway for the route's destination, requests a route to that gateway, and then requests a copy of the configured route with the gateway filled in. Routes for which no suitable gateway is found are logged and skipped. Returns 0 on success, negative errno on failure. */ static int dhcp4_request_semi_static_routes(Link *link) { + Route *rt; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(link->network); + + HASHMAP_FOREACH(rt, link->network->routes_by_section) { + _cleanup_(route_freep) Route *route = NULL; + struct in_addr gw; + + if (!rt->gateway_from_dhcp_or_ra) + continue; + + if (rt->gw_family != AF_INET) + continue; + + assert(rt->family == AF_INET); + + r = dhcp4_find_gateway_for_destination(link, &rt->dst.in, rt->dst_prefixlen, /* allow_null = */ false, &gw); + /* These two errors only disqualify this route; any other error aborts the whole loop. */ if (IN_SET(r, -EHOSTUNREACH, -ENODATA)) { + log_link_debug_errno(link, r, "DHCP: Cannot find suitable gateway for destination %s of semi-static route, ignoring: %m", + IN4_ADDR_PREFIX_TO_STRING(&rt->dst.in, rt->dst_prefixlen)); + continue; + } + if (r < 0) + return r; + + r = dhcp4_request_route_to_gateway(link, &gw); + if (r < 0) + return r; + + /* Work on a copy so that the configured route in routes_by_section is not modified. */ r = route_dup(rt, &route); + if (r < 0) + return r; + + route->gw.in = gw; + + r = dhcp4_request_route(TAKE_PTR(route), link); + if (r < 0) + return r; + } + + return 0; +} + +/* Requests a /32 route for each non-null address in servers[0..n_servers). Addresses for which no gateway can be found (-EHOSTUNREACH) are logged and skipped. Returns 0 on success, negative errno on failure. */ static int dhcp4_request_routes_to_servers( + Link *link, + const struct in_addr *servers, + size_t n_servers) { + + int r; + + assert(link); + assert(link->dhcp_lease); + assert(link->network); + assert(servers || n_servers == 0); + + FOREACH_ARRAY(dst, servers, n_servers) { + _cleanup_(route_freep) Route *route = NULL; + struct in_addr gw; + + if (in4_addr_is_null(dst)) + continue; + + r = dhcp4_find_gateway_for_destination(link, dst, 32, /* allow_null = */ true, &gw); + if (r == -EHOSTUNREACH) { + log_link_debug_errno(link, r, "DHCP: Cannot find suitable gateway for destination %s, ignoring: %m", + IN4_ADDR_PREFIX_TO_STRING(dst, 32)); + continue; + } + if (r < 0) + return r; + + r = route_new(&route); + if (r < 0) + return r; + + route->dst.in = *dst; + route->dst_prefixlen = 32; + + r 
= dhcp4_request_route_auto(TAKE_PTR(route), link, &gw); + if (r < 0) + return r; + } + + return 0; +} + +/* Requests routes to the DNS servers listed in the lease, if both use of DHCP-provided DNS servers and routes to them are enabled for the network. Returns 0 when there is nothing to do, negative errno on failure, otherwise the result of dhcp4_request_routes_to_servers(). */ static int dhcp4_request_routes_to_dns(Link *link) { + const struct in_addr *dns; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(link->network); + + if (!link->network->dhcp_use_dns || + !link->network->dhcp_routes_to_dns) + return 0; + + r = sd_dhcp_lease_get_dns(link->dhcp_lease, &dns); + /* A positive r is passed on below as the number of entries in dns; zero or -ENODATA means there is nothing to route to. */ if (IN_SET(r, 0, -ENODATA)) + return 0; + if (r < 0) + return r; + + return dhcp4_request_routes_to_servers(link, dns, r); +} + +/* Same as dhcp4_request_routes_to_dns(), but for the NTP servers listed in the lease and the corresponding NTP settings. */ static int dhcp4_request_routes_to_ntp(Link *link) { + const struct in_addr *ntp; + int r; + + assert(link); + assert(link->dhcp_lease); + assert(link->network); + + if (!link->network->dhcp_use_ntp || + !link->network->dhcp_routes_to_ntp) + return 0; + + r = sd_dhcp_lease_get_ntp(link->dhcp_lease, &ntp); + /* A positive r is passed on below as the number of entries in ntp. */ if (IN_SET(r, 0, -ENODATA)) + return 0; + if (r < 0) + return r; + + return dhcp4_request_routes_to_servers(link, ntp, r); +} + +/* Requests every kind of route derived from the lease, in this order: prefix route, default gateway, classless/static routes, routes with Gateway=_dhcp4, routes to DNS servers, routes to NTP servers. Stops at the first failure, logs it and returns the negative errno; returns 0 if all steps succeed. */ static int dhcp4_request_routes(Link *link) { + int r; + + assert(link); + assert(link->dhcp_lease); + + r = dhcp4_request_prefix_route(link); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not request prefix route: %m"); + + r = dhcp4_request_default_gateway(link); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not request default gateway: %m"); + + r = dhcp4_request_classless_static_or_static_routes(link); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not request static routes: %m"); + + r = dhcp4_request_semi_static_routes(link); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not request routes with Gateway=_dhcp4 setting: %m"); + + r = dhcp4_request_routes_to_dns(link); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not request routes to DNS servers: %m"); + + r = dhcp4_request_routes_to_ntp(link); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not request 
routes to NTP servers: %m"); + + return 0; +} + +/* Queues a request to set the link MTU back to link->original_mtu, but only if use of the MTU from DHCP is enabled. Returns 0 on success or when disabled, negative errno on failure. */ static int dhcp_reset_mtu(Link *link) { + int r; + + assert(link); + + if (!link->network->dhcp_use_mtu) + return 0; + + r = link_request_to_set_mtu(link, link->original_mtu); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Could not queue request to reset MTU: %m"); + + return 0; +} + +/* Clears the transient hostname (manager_set_hostname() with NULL) if use of DHCP hostnames is enabled and a hostname is either configured for the network or present in the lease. Returns 0 on success or when there is nothing to reset, negative errno on failure. */ static int dhcp_reset_hostname(Link *link) { + const char *hostname; + int r; + + assert(link); + + if (!link->network->dhcp_use_hostname) + return 0; + + hostname = link->network->dhcp_hostname; + /* Without a configured hostname, fall back to the one in the lease; a failure of the lookup is ignored and handled by the check below. */ if (!hostname) + (void) sd_dhcp_lease_get_hostname(link->dhcp_lease, &hostname); + + if (!hostname) + return 0; + + /* If a hostname was set due to the lease, then unset it now. */ + r = manager_set_hostname(link->manager, NULL); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: Failed to reset transient hostname: %m"); + + return 0; +} + +/* Undoes everything that was derived from the current lease: 6rd prefix, addresses and routes, MTU and hostname, then drops the link's lease reference. Every cleanup step runs even if an earlier one failed; r ends up holding the error of the last step that failed. Only if all steps succeeded are static nexthops and routes requested again. */ int dhcp4_lease_lost(Link *link) { + int k, r = 0; + + assert(link); + assert(link->dhcp_lease); + assert(link->network); + + log_link_info(link, "DHCP lease lost"); + + link->dhcp4_configured = false; + + if (link->network->dhcp_use_6rd && + sd_dhcp_lease_has_6rd(link->dhcp_lease)) + dhcp4_pd_prefix_lost(link); + + k = dhcp4_remove_address_and_routes(link, /* only_marked = */ false); + if (k < 0) + r = k; + + k = dhcp_reset_mtu(link); + if (k < 0) + r = k; + + k = dhcp_reset_hostname(link); + if (k < 0) + r = k; + + link->dhcp_lease = sd_dhcp_lease_unref(link->dhcp_lease); + link_dirty(link); + + /* If any of the above failed, do not request nexthops and routes. 
*/ + if (r < 0) + return r; + + r = link_request_static_nexthops(link, true); + if (r < 0) + return r; + + return link_request_static_routes(link, true); +} + +/* Reply handler for the DHCPv4 address request queued by dhcp4_request_address(). Delegates to address_configure_handler_internal() and returns its result unless it is positive; in that case the DHCPv4 readiness of the link is re-evaluated, the link enters the failed state if that check fails, and 1 is returned. */ static int dhcp4_address_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Address *address) { + int r; + + assert(link); + + r = address_configure_handler_internal(rtnl, m, link, "Could not set DHCPv4 address"); + if (r <= 0) + return r; + + r = dhcp4_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +/* Builds an Address object from the current lease (address, prefix length, server, lifetime, broadcast address) and the network settings (route metric, duplicate address detection, label, netlabel) and queues a request to configure it. If announce is true, the acquired address is additionally logged as a structured message. Returns 0 on success, negative errno on failure. */ static int dhcp4_request_address(Link *link, bool announce) { + _cleanup_(address_freep) Address *addr = NULL; + struct in_addr address, server; + uint8_t prefixlen; + Address *existing; + usec_t lifetime_usec; + int r; + + assert(link); + assert(link->manager); + assert(link->network); + assert(link->dhcp_lease); + + r = sd_dhcp_lease_get_address(link->dhcp_lease, &address); + if (r < 0) + return log_link_warning_errno(link, r, "DHCP error: no address: %m"); + + r = sd_dhcp_lease_get_prefix(link->dhcp_lease, NULL, &prefixlen); + if (r < 0) + return log_link_warning_errno(link, r, "DHCP error: no netmask: %m"); + + r = sd_dhcp_lease_get_server_identifier(link->dhcp_lease, &server); + if (r < 0) + return log_link_debug_errno(link, r, "DHCP error: failed to get DHCP server IP address: %m"); + + /* With KEEP_CONFIGURATION_DHCP set the address gets an infinite lifetime; otherwise the lifetime timestamp of the lease is used. */ if (!FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP)) { + r = sd_dhcp_lease_get_lifetime_timestamp(link->dhcp_lease, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "DHCP error: failed to get lifetime: %m"); + } else + lifetime_usec = USEC_INFINITY; + + if (announce) { + const struct in_addr *router; + + r = sd_dhcp_lease_get_router(link->dhcp_lease, &router); + if (r < 0 && r != -ENODATA) + return log_link_error_errno(link, r, "DHCP error: Could not get gateway: %m"); + + /* The gateway is mentioned in the message only if the lease carries a router whose first entry is set. */ if (r > 0 && in4_addr_is_set(&router[0])) + log_struct(LOG_INFO, + LOG_LINK_INTERFACE(link), + LOG_LINK_MESSAGE(link, "DHCPv4 address 
"IPV4_ADDRESS_FMT_STR"/%u, gateway "IPV4_ADDRESS_FMT_STR" acquired from "IPV4_ADDRESS_FMT_STR, + IPV4_ADDRESS_FMT_VAL(address), + prefixlen, + IPV4_ADDRESS_FMT_VAL(router[0]), + IPV4_ADDRESS_FMT_VAL(server)), + "ADDRESS="IPV4_ADDRESS_FMT_STR, IPV4_ADDRESS_FMT_VAL(address), + "PREFIXLEN=%u", prefixlen, + "GATEWAY="IPV4_ADDRESS_FMT_STR, IPV4_ADDRESS_FMT_VAL(router[0])); + else + log_struct(LOG_INFO, + LOG_LINK_INTERFACE(link), + LOG_LINK_MESSAGE(link, "DHCPv4 address "IPV4_ADDRESS_FMT_STR"/%u acquired from "IPV4_ADDRESS_FMT_STR, + IPV4_ADDRESS_FMT_VAL(address), + prefixlen, + IPV4_ADDRESS_FMT_VAL(server)), + "ADDRESS="IPV4_ADDRESS_FMT_STR, IPV4_ADDRESS_FMT_VAL(address), + "PREFIXLEN=%u", prefixlen); + } + + r = address_new(&addr); + if (r < 0) + return log_oom(); + + addr->source = NETWORK_CONFIG_SOURCE_DHCP4; + addr->provider.in = server; + addr->family = AF_INET; + addr->in_addr.in.s_addr = address.s_addr; + addr->lifetime_preferred_usec = lifetime_usec; + addr->lifetime_valid_usec = lifetime_usec; + addr->prefixlen = prefixlen; + r = sd_dhcp_lease_get_broadcast(link->dhcp_lease, &addr->broadcast); + /* A lease without broadcast address (-ENODATA) is acceptable. */ if (r < 0 && r != -ENODATA) + return log_link_warning_errno(link, r, "DHCP: failed to get broadcast address: %m"); + SET_FLAG(addr->flags, IFA_F_NOPREFIXROUTE, !link_prefixroute(link)); + addr->route_metric = link->network->dhcp_route_metric; + addr->duplicate_address_detection = link->network->dhcp_send_decline ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_NO; + + r = free_and_strdup_warn(&addr->label, link->network->dhcp_label); + if (r < 0) + return r; + + r = free_and_strdup_warn(&addr->netlabel, link->network->dhcp_netlabel); + if (r < 0) + return r; + + if (address_get(link, addr, &existing) < 0) /* The address is new. 
*/ + link->dhcp4_configured = false; + else + /* The address is already known on the link: clear its mark again. */ address_unmark(existing); + + r = link_request_address(link, addr, &link->dhcp4_messages, + dhcp4_address_handler, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request DHCPv4 address: %m"); + + return 0; +} + +/* Marks all addresses and routes of the link that have DHCPv4 as their source, then requests the address and the routes of the current lease. If the DHCPv4 configuration is not complete afterwards, the link is moved to the configuring state and its readiness is re-checked. Returns 0 on success, negative errno on failure. */ static int dhcp4_request_address_and_routes(Link *link, bool announce) { + int r; + + assert(link); + + link_mark_addresses(link, NETWORK_CONFIG_SOURCE_DHCP4); + link_mark_routes(link, NETWORK_CONFIG_SOURCE_DHCP4); + + r = dhcp4_request_address(link, announce); + if (r < 0) + return r; + + r = dhcp4_request_routes(link); + if (r < 0) + return r; + + if (!link->dhcp4_configured) { + link_set_state(link, LINK_STATE_CONFIGURING); + link_check_ready(link); + } + + return 0; +} + +/* Handles a renewed lease: replaces link->dhcp_lease with a new reference to the client's current lease, processes the 6rd option if its use is enabled, and requests address and routes again without announcing them. The previous lease is kept in old_lease for the 6rd comparison and unreferenced by the cleanup handler when the function returns. */ static int dhcp_lease_renew(sd_dhcp_client *client, Link *link) { + _cleanup_(sd_dhcp_lease_unrefp) sd_dhcp_lease *old_lease = NULL; + sd_dhcp_lease *lease; + int r; + + assert(link); + assert(link->network); + assert(client); + + r = sd_dhcp_client_get_lease(client, &lease); + if (r < 0) + return log_link_warning_errno(link, r, "DHCP error: no lease: %m"); + + old_lease = TAKE_PTR(link->dhcp_lease); + link->dhcp_lease = sd_dhcp_lease_ref(lease); + link_dirty(link); + + if (link->network->dhcp_use_6rd) { + if (sd_dhcp_lease_has_6rd(link->dhcp_lease)) { + r = dhcp4_pd_prefix_acquired(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to process 6rd option: %m"); + } else if (sd_dhcp_lease_has_6rd(old_lease)) + /* The old lease had the 6rd option but the new one does not. */ dhcp4_pd_prefix_lost(link); + } + + return dhcp4_request_address_and_routes(link, false); +} + +/* Handles a newly acquired lease: stores a reference to it in the link, applies MTU, hostname and timezone from it where enabled (failures of these steps are only logged), processes the 6rd option, and requests address and routes with announcement. Returns negative errno if the lease cannot be obtained, if processing the 6rd option fails, or if the final request fails. */ static int dhcp_lease_acquired(sd_dhcp_client *client, Link *link) { + sd_dhcp_lease *lease; + int r; + + assert(client); + assert(link); + + r = sd_dhcp_client_get_lease(client, &lease); + if (r < 0) + return log_link_error_errno(link, r, "DHCP error: No lease: %m"); + + sd_dhcp_lease_unref(link->dhcp_lease); + link->dhcp_lease = sd_dhcp_lease_ref(lease); + link_dirty(link); + + if 
(link->network->dhcp_use_mtu) { + uint16_t mtu; + + r = sd_dhcp_lease_get_mtu(lease, &mtu); + /* A lease without MTU is silently skipped; a failure to request the MTU change is logged but does not abort. */ if (r >= 0) { + r = link_request_to_set_mtu(link, mtu); + if (r < 0) + log_link_error_errno(link, r, "Failed to set MTU to %" PRIu16 ": %m", mtu); + } + } + + if (link->network->dhcp_use_hostname) { + const char *dhcpname = NULL; + _cleanup_free_ char *hostname = NULL; + + /* A hostname configured for the network takes precedence over the one in the lease. */ if (link->network->dhcp_hostname) + dhcpname = link->network->dhcp_hostname; + else + (void) sd_dhcp_lease_get_hostname(lease, &dhcpname); + + if (dhcpname) { + r = shorten_overlong(dhcpname, &hostname); + if (r < 0) + log_link_warning_errno(link, r, "Unable to shorten overlong DHCP hostname '%s', ignoring: %m", dhcpname); + if (r == 1) + log_link_notice(link, "Overlong DHCP hostname received, shortened from '%s' to '%s'", dhcpname, hostname); + } + + /* hostname is only set here if shorten_overlong() stored a result in it. */ if (hostname) { + r = manager_set_hostname(link->manager, hostname); + if (r < 0) + log_link_error_errno(link, r, "Failed to set transient hostname to '%s': %m", hostname); + } + } + + if (link->network->dhcp_use_timezone) { + const char *tz = NULL; + + (void) sd_dhcp_lease_get_timezone(link->dhcp_lease, &tz); + + if (tz) { + r = manager_set_timezone(link->manager, tz); + if (r < 0) + log_link_error_errno(link, r, "Failed to set timezone to '%s': %m", tz); + } + } + + /* Unlike the steps above, a failure to process the 6rd option aborts with an error. */ if (link->network->dhcp_use_6rd && + sd_dhcp_lease_has_6rd(link->dhcp_lease)) { + r = dhcp4_pd_prefix_acquired(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to process 6rd option: %m"); + } + + return dhcp4_request_address_and_routes(link, true); +} + +/* Treats a changed address like a new acquisition. If that fails, dhcp4_lease_lost() is called with its result ignored, and the error of the acquisition is returned. */ static int dhcp_lease_ip_change(sd_dhcp_client *client, Link *link) { + int r; + + r = dhcp_lease_acquired(client, link); + if (r < 0) + (void) dhcp4_lease_lost(link); + + return r; +} + +/* Checks the server identifier of the client's current lease against the allow and deny lists of the network. Returns true if the server is filtered out, false if it is acceptable, negative errno if the lease or the server identifier cannot be obtained. */ static int dhcp_server_is_filtered(Link *link, sd_dhcp_client *client) { + sd_dhcp_lease *lease; + struct in_addr addr; + int r; + + assert(link); + assert(link->network); + assert(client); + + r = sd_dhcp_client_get_lease(client, 
&lease); + if (r < 0) + return log_link_error_errno(link, r, "Failed to get DHCP lease: %m"); + + r = sd_dhcp_lease_get_server_identifier(lease, &addr); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get DHCP server IP address: %m"); + + if (in4_address_is_filtered(&addr, link->network->dhcp_allow_listed_ip, link->network->dhcp_deny_listed_ip)) { + if (DEBUG_LOGGING) { + /* The wording depends on whether an allow-list is configured. */ if (link->network->dhcp_allow_listed_ip) + log_link_debug(link, "DHCPv4 server IP address "IPV4_ADDRESS_FMT_STR" not found in allow-list, ignoring offer.", + IPV4_ADDRESS_FMT_VAL(addr)); + else + log_link_debug(link, "DHCPv4 server IP address "IPV4_ADDRESS_FMT_STR" found in deny-list, ignoring offer.", + IPV4_ADDRESS_FMT_VAL(addr)); + } + + return true; + } + + return false; +} + +/* Event callback registered with the DHCPv4 client (userdata is the Link). All events are ignored while the link is in the failed or linger state. Returns 0 when the event was handled or ignored, a negative errno on failure, and -ENOMSG for the SELECTING event when the offering server is filtered out. */ static int dhcp4_handler(sd_dhcp_client *client, int event, void *userdata) { + Link *link = ASSERT_PTR(userdata); + int r; + + assert(link->network); + assert(link->manager); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + switch (event) { + case SD_DHCP_CLIENT_EVENT_STOP: + /* If IPv4LL is set up for the link, start it, optionally from the configured start address. */ if (link->ipv4ll) { + log_link_debug(link, "DHCP client is stopped. Acquiring IPv4 link-local address"); + + if (in4_addr_is_set(&link->network->ipv4ll_start_address)) { + r = sd_ipv4ll_set_address(link->ipv4ll, &link->network->ipv4ll_start_address); + if (r < 0) + return log_link_warning_errno(link, r, "Could not set IPv4 link-local start address: %m"); + } + + r = sd_ipv4ll_start(link->ipv4ll); + if (r < 0) + return log_link_warning_errno(link, r, "Could not acquire IPv4 link-local address: %m"); + } + + /* With KEEP_CONFIGURATION_DHCP set, neither a RELEASE is sent nor the lease dropped. */ if (FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP)) { + log_link_notice(link, "DHCPv4 connection considered critical, ignoring request to reconfigure it."); + return 0; + } + + if (link->dhcp_lease) { + if (link->network->dhcp_send_release) { + r = sd_dhcp_client_send_release(client); + if (r < 0) + log_link_full_errno(link, + ERRNO_IS_DISCONNECT(r) ? 
LOG_DEBUG : LOG_WARNING, + r, "Failed to send DHCP RELEASE, ignoring: %m"); + } + + r = dhcp4_lease_lost(link); + if (r < 0) { + link_enter_failed(link); + return r; + } + } + + break; + case SD_DHCP_CLIENT_EVENT_EXPIRED: + /* With KEEP_CONFIGURATION_DHCP set, the existing configuration is left in place even though the lease expired. */ if (FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP)) { + log_link_notice(link, "DHCPv4 connection considered critical, ignoring request to reconfigure it."); + return 0; + } + + if (link->dhcp_lease) { + r = dhcp4_lease_lost(link); + if (r < 0) { + link_enter_failed(link); + return r; + } + } + + break; + case SD_DHCP_CLIENT_EVENT_IP_CHANGE: + /* With KEEP_CONFIGURATION_DHCP set, the address change is not applied. */ if (FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP)) { + log_link_notice(link, "DHCPv4 connection considered critical, ignoring request to reconfigure it."); + return 0; + } + + r = dhcp_lease_ip_change(client, link); + if (r < 0) { + link_enter_failed(link); + return r; + } + + break; + case SD_DHCP_CLIENT_EVENT_RENEW: + r = dhcp_lease_renew(client, link); + if (r < 0) { + link_enter_failed(link); + return r; + } + break; + case SD_DHCP_CLIENT_EVENT_IP_ACQUIRE: + r = dhcp_lease_acquired(client, link); + if (r < 0) { + link_enter_failed(link); + return r; + } + break; + case SD_DHCP_CLIENT_EVENT_SELECTING: + r = dhcp_server_is_filtered(link, client); + if (r < 0) { + link_enter_failed(link); + return r; + } + /* A positive result means the server is filtered out; this is reported to the client as -ENOMSG. */ if (r > 0) + return -ENOMSG; + break; + + case SD_DHCP_CLIENT_EVENT_TRANSIENT_FAILURE: + /* Start IPv4LL, unless it is not set up for the link or already running. */ if (link->ipv4ll && !sd_ipv4ll_is_running(link->ipv4ll)) { + log_link_debug(link, "Problems acquiring DHCP lease, acquiring IPv4 link-local address"); + + if (in4_addr_is_set(&link->network->ipv4ll_start_address)) { + r = sd_ipv4ll_set_address(link->ipv4ll, &link->network->ipv4ll_start_address); + if (r < 0) + return log_link_warning_errno(link, r, "Could not set IPv4 link-local start address: %m"); + } + + r = sd_ipv4ll_start(link->ipv4ll); + if (r < 0) + return log_link_warning_errno(link, r, "Could not acquire IPv4 link-local address: %m"); + } + break; + + default: + 
/* Negative event values are logged as errno-style errors; any other unhandled value is logged as an unknown event. Neither is treated as a failure. */ if (event < 0) + log_link_warning_errno(link, event, "DHCP error: Client failed: %m"); + else + log_link_warning(link, "DHCP unknown event: %i", event); + break; + } + + return 0; +} + +/* Sets the hostname the client sends: none if sending a hostname is disabled, otherwise the hostname configured for the network, otherwise the one returned by gethostname_strict(). Returns 0 on success, negative errno on failure. */ static int dhcp4_set_hostname(Link *link) { + _cleanup_free_ char *hostname = NULL; + const char *hn; + int r; + + assert(link); + + if (!link->network->dhcp_send_hostname) + hn = NULL; + else if (link->network->dhcp_hostname) + hn = link->network->dhcp_hostname; + else { + r = gethostname_strict(&hostname); + if (r < 0 && r != -ENXIO) /* ENXIO: no hostname set or hostname is "localhost" */ + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to get hostname: %m"); + + hn = hostname; + } + + r = sd_dhcp_client_set_hostname(link->dhcp_client, hn); + /* hostname is non-NULL only if it was obtained via gethostname_strict() above, so a configured hostname that is rejected remains an error. */ if (r == -EINVAL && hostname) + /* Ignore error when the machine's hostname is not suitable to send in DHCP packet. */ + log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set hostname from kernel hostname, ignoring: %m"); + else if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set hostname: %m"); + + return 0; +} + +/* Configures the client identifier according to link->network->dhcp_client_identifier: either IAID+DUID (derived from the DUID type when no raw DUID data is configured, raw data otherwise) or the hardware address of the link. Returns 0 on success, negative errno on failure. */ static int dhcp4_set_client_identifier(Link *link) { + int r; + + assert(link); + assert(link->network); + assert(link->dhcp_client); + + switch (link->network->dhcp_client_identifier) { + case DHCP_CLIENT_ID_DUID: { + /* If configured, apply user specified DUID and IAID */ + const DUID *duid = link_get_dhcp4_duid(link); + + if (duid->raw_data_len == 0) + switch (duid->type) { + case DUID_TYPE_LLT: + r = sd_dhcp_client_set_iaid_duid_llt(link->dhcp_client, + link->network->dhcp_iaid_set, + link->network->dhcp_iaid, + duid->llt_time); + break; + case DUID_TYPE_LL: + r = sd_dhcp_client_set_iaid_duid_ll(link->dhcp_client, + link->network->dhcp_iaid_set, + link->network->dhcp_iaid); + break; + case DUID_TYPE_EN: + r = sd_dhcp_client_set_iaid_duid_en(link->dhcp_client, + link->network->dhcp_iaid_set, + link->network->dhcp_iaid); + break; + case DUID_TYPE_UUID: + r = 
sd_dhcp_client_set_iaid_duid_uuid(link->dhcp_client, + link->network->dhcp_iaid_set, + link->network->dhcp_iaid); + break; + default: + /* Any other DUID type without raw data is passed through with an empty payload. */ r = sd_dhcp_client_set_iaid_duid_raw(link->dhcp_client, + link->network->dhcp_iaid_set, + link->network->dhcp_iaid, + duid->type, NULL, 0); + } + else + r = sd_dhcp_client_set_iaid_duid_raw(link->dhcp_client, + link->network->dhcp_iaid_set, + link->network->dhcp_iaid, + duid->type, duid->raw_data, duid->raw_data_len); + /* Log the failure here, like the hardware address branch below does, so that the debug log tells which identifier could not be set. */ if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set IAID+DUID: %m"); + break; + } + case DHCP_CLIENT_ID_MAC: { + const uint8_t *hw_addr = link->hw_addr.bytes; + size_t hw_addr_len = link->hw_addr.length; + + if (link->iftype == ARPHRD_INFINIBAND && hw_addr_len == INFINIBAND_ALEN) { + /* set_client_id expects only last 8 bytes of an IB address */ + hw_addr += INFINIBAND_ALEN - 8; + hw_addr_len -= INFINIBAND_ALEN - 8; + } + + r = sd_dhcp_client_set_client_id(link->dhcp_client, + link->iftype, + hw_addr, + hw_addr_len); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set client ID: %m"); + break; + } + default: + assert_not_reached(); + } + + return 0; +} + +/* If KEEP_CONFIGURATION_DHCP is set, searches the link's addresses for a foreign IPv4 address that link_address_is_dynamic() reports as dynamic. Returns true and stores the address in *ret if one is found, false otherwise (*ret is left untouched then). */ static int dhcp4_find_dynamic_address(Link *link, struct in_addr *ret) { + Address *a; + + assert(link); + assert(link->network); + assert(ret); + + if (!FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP)) + return false; + + SET_FOREACH(a, link->addresses) { + if (a->source != NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + if (a->family != AF_INET) + continue; + if (link_address_is_dynamic(link, a)) + break; + } + + /* NOTE(review): relies on SET_FOREACH leaving a == NULL when the loop ends without break; confirm against the macro definition. */ if (!a) + return false; + + *ret = a->in_addr.in; + return true; +} + +/* Makes the client request a specific address: the one configured for the network or, if that is unset, a dynamic address found on the link. Returns 0 without doing anything if neither is available. */ static int dhcp4_set_request_address(Link *link) { + struct in_addr a; + + assert(link); + assert(link->network); + assert(link->dhcp_client); + + a = link->network->dhcp_request_address; + + if (in4_addr_is_null(&a)) + (void) dhcp4_find_dynamic_address(link, &a); + + if (in4_addr_is_null(&a)) + return 0; + + log_link_debug(link, "DHCPv4 CLIENT: requesting %s.", 
IN4_ADDR_TO_STRING(&a)); + return sd_dhcp_client_set_request_address(link->dhcp_client, &a); +} + +/* Decides whether the client should set the broadcast flag in its requests; see the comment in the body for the lookup order. */ static bool link_needs_dhcp_broadcast(Link *link) { + const char *val; + int r; + + assert(link); + assert(link->network); + + /* Return the setting in DHCP[4].RequestBroadcast if specified. Otherwise return the device property + * ID_NET_DHCP_BROADCAST setting, which may be set for interfaces requiring that the DHCPOFFER message + * is being broadcast because they can't handle unicast messages while not fully configured. + * If neither is set or a failure occurs, return false, which is the default for this flag. + */ + r = link->network->dhcp_broadcast; + /* A negative dhcp_broadcast means the setting is unspecified; only then is the device property consulted. */ if (r < 0 && link->dev && sd_device_get_property_value(link->dev, "ID_NET_DHCP_BROADCAST", &val) >= 0) { + r = parse_boolean(val); + if (r < 0) + log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to parse ID_NET_DHCP_BROADCAST, ignoring: %m"); + else + log_link_debug(link, "DHCPv4 CLIENT: Detected ID_NET_DHCP_BROADCAST='%d'.", r); + + } + /* Any negative r (unspecified or parse failure) yields false here. */ return r == true; +} + +/* Tells whether IPv6-only mode is enabled for the link: the explicit network setting if it is non-negative, false otherwise. */ static bool link_dhcp4_ipv6_only_mode(Link *link) { + assert(link); + assert(link->network); + + /* If it is explicitly specified, then honor the setting. */ + if (link->network->dhcp_ipv6_only_mode >= 0) + return link->network->dhcp_ipv6_only_mode; + + /* Defaults to false, until we support 464XLAT. See issue #30891. 
*/ + return false; +} + +/* Allocates link->dhcp_client and applies the DHCPv4 settings of the network to it. Fails with -EBUSY if the link already has a client. Returns the result of dhcp4_set_client_identifier() when everything else succeeded, otherwise the negative errno of the first step that failed. */ static int dhcp4_configure(Link *link) { + sd_dhcp_option *send_option; + void *request_options; + int r; + + assert(link); + assert(link->network); + + if (link->dhcp_client) + return log_link_debug_errno(link, SYNTHETIC_ERRNO(EBUSY), "DHCPv4 client is already configured."); + + r = sd_dhcp_client_new(&link->dhcp_client, link->network->dhcp_anonymize); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to allocate DHCPv4 client: %m"); + + r = sd_dhcp_client_attach_event(link->dhcp_client, link->manager->event, 0); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to attach event to DHCPv4 client: %m"); + + r = sd_dhcp_client_attach_device(link->dhcp_client, link->dev); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to attach device: %m"); + + r = sd_dhcp_client_set_rapid_commit(link->dhcp_client, link->network->dhcp_use_rapid_commit); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set rapid commit: %m"); + + /* The broadcast address is passed only if the link has one. */ r = sd_dhcp_client_set_mac(link->dhcp_client, + link->hw_addr.bytes, + link->bcast_addr.length > 0 ? 
link->bcast_addr.bytes : NULL, + link->hw_addr.length, link->iftype); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set MAC address: %m"); + + r = sd_dhcp_client_set_ifindex(link->dhcp_client, link->ifindex); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set ifindex: %m"); + + r = sd_dhcp_client_set_callback(link->dhcp_client, dhcp4_handler, link); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set callback: %m"); + + r = sd_dhcp_client_set_request_broadcast(link->dhcp_client, link_needs_dhcp_broadcast(link)); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for broadcast: %m"); + + r = dhcp_client_set_state_callback(link->dhcp_client, dhcp_client_callback_bus, link); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set state change callback: %m"); + + if (link->mtu > 0) { + r = sd_dhcp_client_set_mtu(link->dhcp_client, link->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set MTU: %m"); + } + + /* Everything in this block (requested address, requested options, send options, hostname, vendor class, MUD URL, user class) is skipped in anonymize mode. */ if (!link->network->dhcp_anonymize) { + r = dhcp4_set_request_address(link); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set initial DHCPv4 address: %m"); + + if (link->network->dhcp_use_mtu) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_MTU_INTERFACE); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for MTU: %m"); + } + + /* Both the static route and the classless static route option are requested. */ if (link->network->dhcp_use_routes) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_STATIC_ROUTE); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for static route: %m"); + + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_CLASSLESS_STATIC_ROUTE); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for 
classless static route: %m"); + } + + if (link->network->dhcp_use_domains != DHCP_USE_DOMAINS_NO) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_DOMAIN_SEARCH); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for domain search list: %m"); + } + + if (link->network->dhcp_use_ntp) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_NTP_SERVER); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for NTP server: %m"); + } + + if (link->network->dhcp_use_sip) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_SIP_SERVER); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for SIP server: %m"); + } + if (link->network->dhcp_use_captive_portal) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_DHCP_CAPTIVE_PORTAL); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for captive portal: %m"); + } + + if (link->network->dhcp_use_timezone) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_TZDB_TIMEZONE); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for timezone: %m"); + } + + if (link->network->dhcp_use_6rd) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_6RD); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for 6rd: %m"); + } + + if (link_dhcp4_ipv6_only_mode(link)) { + r = sd_dhcp_client_set_request_option(link->dhcp_client, SD_DHCP_OPTION_IPV6_ONLY_PREFERRED); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for IPv6-only preferred option: %m"); + } + + /* Additional option codes requested by configuration; the set stores them as pointer-encoded integers. */ SET_FOREACH(request_options, link->network->dhcp_request_options) { + uint32_t option = PTR_TO_UINT32(request_options); + + r = 
sd_dhcp_client_set_request_option(link->dhcp_client, option); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set request flag for '%u': %m", option); + } + + ORDERED_HASHMAP_FOREACH(send_option, link->network->dhcp_client_send_options) { + r = sd_dhcp_client_add_option(link->dhcp_client, send_option); + /* -EEXIST is not treated as an error; the entry is skipped. */ if (r == -EEXIST) + continue; + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set send option: %m"); + } + + ORDERED_HASHMAP_FOREACH(send_option, link->network->dhcp_client_send_vendor_options) { + r = sd_dhcp_client_add_vendor_option(link->dhcp_client, send_option); + /* Same handling as for the plain send options above. */ if (r == -EEXIST) + continue; + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set send option: %m"); + } + + r = dhcp4_set_hostname(link); + if (r < 0) + return r; + + if (link->network->dhcp_vendor_class_identifier) { + r = sd_dhcp_client_set_vendor_class_identifier(link->dhcp_client, + link->network->dhcp_vendor_class_identifier); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set vendor class identifier: %m"); + } + + if (link->network->dhcp_mudurl) { + r = sd_dhcp_client_set_mud_url(link->dhcp_client, link->network->dhcp_mudurl); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set MUD URL: %m"); + } + + if (link->network->dhcp_user_class) { + r = sd_dhcp_client_set_user_class(link->dhcp_client, link->network->dhcp_user_class); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set user class: %m"); + } + } + + /* The remaining settings are applied only when they are set in the network configuration. */ if (link->network->dhcp_client_port > 0) { + r = sd_dhcp_client_set_client_port(link->dhcp_client, link->network->dhcp_client_port); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set listen port: %m"); + } + + if (link->network->dhcp_max_attempts > 0) { + r = sd_dhcp_client_set_max_attempts(link->dhcp_client, link->network->dhcp_max_attempts); + if (r < 0) + return 
log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set max attempts: %m"); + } + + if (link->network->dhcp_ip_service_type >= 0) { + r = sd_dhcp_client_set_service_type(link->dhcp_client, link->network->dhcp_ip_service_type); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set IP service type: %m"); + } + + if (link->network->dhcp_socket_priority_set) { + r = sd_dhcp_client_set_socket_priority(link->dhcp_client, link->network->dhcp_socket_priority); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set socket priority: %m"); + } + + if (link->network->dhcp_fallback_lease_lifetime_usec > 0) { + r = sd_dhcp_client_set_fallback_lease_lifetime(link->dhcp_client, link->network->dhcp_fallback_lease_lifetime_usec); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv4 CLIENT: Failed to set fallback lease lifetime: %m"); + } + + return dhcp4_set_client_identifier(link); +} + +/* Applies the link's current hardware and broadcast address to an existing client and refreshes the client identifier. The client is stopped first and started again afterwards if it had been running. Returns 0 on success or when the link has no client, negative errno on failure. */ int dhcp4_update_mac(Link *link) { + bool restart; + int r; + + assert(link); + + if (!link->dhcp_client) + return 0; + + /* Remember the running state before stopping, so that the client is restarted only if it was running. */ restart = sd_dhcp_client_is_running(link->dhcp_client); + + r = sd_dhcp_client_stop(link->dhcp_client); + if (r < 0) + return r; + + r = sd_dhcp_client_set_mac(link->dhcp_client, + link->hw_addr.bytes, + link->bcast_addr.length > 0 ? link->bcast_addr.bytes : NULL, + link->hw_addr.length, link->iftype); + if (r < 0) + return r; + + r = dhcp4_set_client_identifier(link); + if (r < 0) + return r; + + if (restart) { + r = dhcp4_start(link); + if (r < 0) + return r; + } + + return 0; +} + +/* Propagates the link's current IPv6 connectivity to the DHCPv4 client. Does nothing if the link has no network, IPv6-only mode is not enabled, or there is no client. A running client is told the current connectivity; a client that is not running is started if the link has no IPv6 connectivity. */ int dhcp4_update_ipv6_connectivity(Link *link) { + assert(link); + + if (!link->network) + return 0; + + /* dhcp_ipv6_only_mode may be negative (unset), which link_dhcp4_ipv6_only_mode() treats as disabled; a plain negation would treat it as enabled. */ if (!link_dhcp4_ipv6_only_mode(link)) + return 0; + + if (!link->dhcp_client) + return 0; + + /* If the client is running, set the current connectivity. 
*/ + if (sd_dhcp_client_is_running(link->dhcp_client)) + return sd_dhcp_client_set_ipv6_connectivity(link->dhcp_client, link_has_ipv6_connectivity(link)); + + /* If the client has been already stopped or not started yet, let's check the current connectivity + * and start the client if necessary. */ + if (link_has_ipv6_connectivity(link)) + return 0; + + return dhcp4_start_full(link, /* set_ipv6_connectivity = */ false); +} + +int dhcp4_start_full(Link *link, bool set_ipv6_connectivity) { + int r; + + assert(link); + assert(link->network); + + if (!link->dhcp_client) + return 0; + + if (!link_has_carrier(link)) + return 0; + + if (sd_dhcp_client_is_running(link->dhcp_client) > 0) + return 0; + + r = sd_dhcp_client_start(link->dhcp_client); + if (r < 0) + return r; + + if (set_ipv6_connectivity) { + r = dhcp4_update_ipv6_connectivity(link); + if (r < 0) + return r; + } + + return 1; +} + +int dhcp4_renew(Link *link) { + assert(link); + + if (!link->dhcp_client) + return 0; + + /* The DHCPv4 client may have been stopped by the IPv6 only mode. Let's unconditionally restart the + * client if it is not running. */ + if (!sd_dhcp_client_is_running(link->dhcp_client)) + return dhcp4_start(link); + + /* The client may be waiting for IPv6 connectivity. Let's restart the client in that case. */ + if (dhcp_client_get_state(link->dhcp_client) != DHCP_STATE_BOUND) + return sd_dhcp_client_interrupt_ipv6_only_mode(link->dhcp_client); + + /* Otherwise, send a RENEW command. 
*/ + return sd_dhcp_client_send_renew(link->dhcp_client); +} + +static int dhcp4_configure_duid(Link *link) { + assert(link); + assert(link->network); + + if (link->network->dhcp_client_identifier != DHCP_CLIENT_ID_DUID) + return 1; + + return dhcp_configure_duid(link, link_get_dhcp4_duid(link)); +} + +static int dhcp4_process_request(Request *req, Link *link, void *userdata) { + int r; + + assert(link); + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return 0; + + r = dhcp4_configure_duid(link); + if (r <= 0) + return r; + + r = dhcp4_configure(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure DHCPv4 client: %m"); + + r = dhcp4_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start DHCPv4 client: %m"); + + log_link_debug(link, "DHCPv4 client is configured%s.", + r > 0 ? ", acquiring DHCPv4 lease" : ""); + return 1; +} + +int link_request_dhcp4_client(Link *link) { + int r; + + assert(link); + + if (!link_dhcp4_enabled(link)) + return 0; + + if (link->dhcp_client) + return 0; + + r = link_queue_request(link, REQUEST_TYPE_DHCP4_CLIENT, dhcp4_process_request, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request configuring of the DHCPv4 client: %m"); + + log_link_debug(link, "Requested configuring of the DHCPv4 client."); + return 0; +} + +int config_parse_dhcp_max_attempts( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(data); + uint64_t a; + int r; + + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + network->dhcp_max_attempts = 0; + return 0; + } + + if (streq(rvalue, "infinity")) { + network->dhcp_max_attempts = UINT64_MAX; + return 0; + } + + r = safe_atou64(rvalue, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to 
parse DHCP maximum attempts, ignoring: %s", rvalue); + return 0; + } + + if (a == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s= must be positive integer or 'infinity', ignoring: %s", lvalue, rvalue); + return 0; + } + + network->dhcp_max_attempts = a; + + return 0; +} + +int config_parse_dhcp_ip_service_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *tos = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + *tos = -1; /* use sd_dhcp_client's default (currently, CS6). */ + else if (streq(rvalue, "none")) + *tos = 0; + else if (streq(rvalue, "CS4")) + *tos = IPTOS_CLASS_CS4; + else if (streq(rvalue, "CS6")) + *tos = IPTOS_CLASS_CS6; + else + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + + return 0; +} + +int config_parse_dhcp_socket_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(data); + int a, r; + + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + network->dhcp_socket_priority_set = false; + return 0; + } + + r = safe_atoi(rvalue, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse socket priority, ignoring: %s", rvalue); + return 0; + } + + network->dhcp_socket_priority_set = true; + network->dhcp_socket_priority = a; + + return 0; +} + +int config_parse_dhcp_fallback_lease_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + + 
assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + network->dhcp_fallback_lease_lifetime_usec = 0; + return 0; + } + + /* We accept only "forever" or "infinity". */ + if (!STR_IN_SET(rvalue, "forever", "infinity")) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid LeaseLifetime= value, ignoring: %s", rvalue); + return 0; + } + + network->dhcp_fallback_lease_lifetime_usec = USEC_INFINITY; + + return 0; +} + +int config_parse_dhcp_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **label = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *label = mfree(*label); + return 0; + } + + if (!address_label_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Address label is too long or invalid, ignoring assignment: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(label, rvalue); +} + +static const char* const dhcp_client_identifier_table[_DHCP_CLIENT_ID_MAX] = { + [DHCP_CLIENT_ID_MAC] = "mac", + [DHCP_CLIENT_ID_DUID] = "duid", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(dhcp_client_identifier, DHCPClientIdentifier); +DEFINE_CONFIG_PARSE_ENUM(config_parse_dhcp_client_identifier, dhcp_client_identifier, DHCPClientIdentifier, + "Failed to parse client identifier type"); diff --git a/src/network/networkd-dhcp4.h b/src/network/networkd-dhcp4.h new file mode 100644 index 0000000..b3fe027 --- /dev/null +++ b/src/network/networkd-dhcp4.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +typedef struct Link Link; +typedef struct Network Network; + +typedef enum DHCPClientIdentifier { + DHCP_CLIENT_ID_MAC, + DHCP_CLIENT_ID_DUID, + _DHCP_CLIENT_ID_MAX, + _DHCP_CLIENT_ID_INVALID = 
-EINVAL, +} DHCPClientIdentifier; + +void network_adjust_dhcp4(Network *network); +int dhcp4_update_mac(Link *link); +int dhcp4_update_ipv6_connectivity(Link *link); +int dhcp4_start_full(Link *link, bool set_ipv6_connectivity); +static inline int dhcp4_start(Link *link) { + return dhcp4_start_full(link, true); +} +int dhcp4_renew(Link *link); +int dhcp4_lease_lost(Link *link); +int dhcp4_check_ready(Link *link); + +int link_request_dhcp4_client(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_client_identifier); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_max_attempts); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_ip_service_type); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_socket_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_mud_url); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_fallback_lease_lifetime); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp_label); diff --git a/src/network/networkd-dhcp6-bus.c b/src/network/networkd-dhcp6-bus.c new file mode 100644 index 0000000..a225877 --- /dev/null +++ b/src/network/networkd-dhcp6-bus.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-util.h" +#include "dhcp6-client-internal.h" +#include "dhcp6-protocol.h" +#include "networkd-dhcp6-bus.h" +#include "networkd-link-bus.h" +#include "networkd-manager.h" +#include "string-table.h" +#include "strv.h" + +static int property_get_dhcp6_client_state( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Link *l = ASSERT_PTR(userdata); + sd_dhcp6_client *c; + + assert(reply); + + c = l->dhcp6_client; + if (!c) + return sd_bus_message_append(reply, "s", "disabled"); + + return sd_bus_message_append(reply, "s", dhcp6_state_to_string(dhcp6_client_get_state(c))); +} + +static int dhcp6_client_emit_changed(Link *link, const char *property, ...) 
{ + _cleanup_free_ char *path = NULL; + char **l; + + assert(link); + + if (sd_bus_is_ready(link->manager->bus) <= 0) + return 0; + + path = link_bus_path(link); + if (!path) + return log_oom(); + + l = strv_from_stdarg_alloca(property); + + return sd_bus_emit_properties_changed_strv( + link->manager->bus, + path, + "org.freedesktop.network1.DHCPv6Client", + l); +} + +void dhcp6_client_callback_bus(sd_dhcp6_client *c, int event, void *userdata) { + Link *l = ASSERT_PTR(userdata); + + dhcp6_client_emit_changed(l, "State", NULL); +} + +static const sd_bus_vtable dhcp6_client_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("State", "s", property_get_dhcp6_client_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation dhcp6_client_object = { + "/org/freedesktop/network1/link", + "org.freedesktop.network1.DHCPv6Client", + .fallback_vtables = BUS_FALLBACK_VTABLES({dhcp6_client_vtable, link_object_find}), + .node_enumerator = link_node_enumerator, +}; diff --git a/src/network/networkd-dhcp6-bus.h b/src/network/networkd-dhcp6-bus.h new file mode 100644 index 0000000..76a6b72 --- /dev/null +++ b/src/network/networkd-dhcp6-bus.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-dhcp6-client.h" + +#include "networkd-link-bus.h" + +extern const BusObjectImplementation dhcp6_client_object; + +void dhcp6_client_callback_bus(sd_dhcp6_client *client, int event, void *userdata); diff --git a/src/network/networkd-dhcp6.c b/src/network/networkd-dhcp6.c new file mode 100644 index 0000000..f499d03 --- /dev/null +++ b/src/network/networkd-dhcp6.c @@ -0,0 +1,892 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. 
+***/ + +#include "dhcp6-client-internal.h" +#include "dhcp6-lease-internal.h" +#include "hashmap.h" +#include "hostname-setup.h" +#include "hostname-util.h" +#include "networkd-address.h" +#include "networkd-dhcp-prefix-delegation.h" +#include "networkd-dhcp6-bus.h" +#include "networkd-dhcp6.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "networkd-route.h" +#include "networkd-state-file.h" +#include "string-table.h" +#include "string-util.h" + +bool link_dhcp6_with_address_enabled(Link *link) { + if (!link_dhcp6_enabled(link)) + return false; + + return link->network->dhcp6_use_address; +} + +static DHCP6ClientStartMode link_get_dhcp6_client_start_mode(Link *link) { + assert(link); + + if (!link->network) + return DHCP6_CLIENT_START_MODE_NO; + + /* When WithoutRA= is explicitly specified, then honor it. */ + if (link->network->dhcp6_client_start_mode >= 0) + return link->network->dhcp6_client_start_mode; + + /* When this interface itself is an uplink interface, then start dhcp6 client in solicit mode. */ + if (dhcp_pd_is_uplink(link, link, /* accept_auto = */ false)) + return DHCP6_CLIENT_START_MODE_SOLICIT; + + /* Otherwise, start dhcp6 client when RA is received. 
*/ + return DHCP6_CLIENT_START_MODE_NO; +} + +static int dhcp6_remove(Link *link, bool only_marked) { + Address *address; + Route *route; + int k, r = 0; + + assert(link); + + if (!only_marked) + link->dhcp6_configured = false; + + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_DHCP6) + continue; + if (only_marked && !route_is_marked(route)) + continue; + + k = route_remove(route); + if (k < 0) + r = k; + + route_cancel_request(route, link); + } + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_DHCP6) + continue; + if (only_marked && !address_is_marked(address)) + continue; + + k = address_remove_and_drop(address); + if (k < 0) + r = k; + } + + return r; +} + +static int dhcp6_address_ready_callback(Address *address) { + Address *a; + + assert(address); + assert(address->link); + + SET_FOREACH(a, address->link->addresses) + if (a->source == NETWORK_CONFIG_SOURCE_DHCP6) + a->callback = NULL; + + return dhcp6_check_ready(address->link); +} + +int dhcp6_check_ready(Link *link) { + int r; + + assert(link); + assert(link->network); + + if (link->dhcp6_messages > 0) { + log_link_debug(link, "%s(): DHCPv6 addresses and routes are not set.", __func__); + return 0; + } + + if (link->network->dhcp6_use_address && + sd_dhcp6_lease_has_address(link->dhcp6_lease) && + !link_check_addresses_ready(link, NETWORK_CONFIG_SOURCE_DHCP6)) { + Address *address; + + SET_FOREACH(address, link->addresses) + if (address->source == NETWORK_CONFIG_SOURCE_DHCP6) + address->callback = dhcp6_address_ready_callback; + + log_link_debug(link, "%s(): no DHCPv6 address is ready.", __func__); + return 0; + } + + link->dhcp6_configured = true; + log_link_debug(link, "DHCPv6 addresses and routes set."); + + r = dhcp6_remove(link, /* only_marked = */ true); + if (r < 0) + return r; + + link_check_ready(link); + return 0; +} + +static int dhcp6_address_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, 
Address *address) { + int r; + + assert(link); + + r = address_configure_handler_internal(rtnl, m, link, "Could not set DHCPv6 address"); + if (r <= 0) + return r; + + r = dhcp6_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static int verify_dhcp6_address(Link *link, const Address *address) { + bool by_ndisc = false; + Address *existing; + int log_level; + + assert(link); + assert(address); + assert(address->family == AF_INET6); + + const char *pretty = IN6_ADDR_TO_STRING(&address->in_addr.in6); + + if (address_get_harder(link, address, &existing) < 0) { + /* New address. */ + log_level = LOG_INFO; + goto simple_log; + } else + log_level = LOG_DEBUG; + + if (address->prefixlen == existing->prefixlen) + /* Currently, only conflict in prefix length is reported. */ + goto simple_log; + + if (existing->source == NETWORK_CONFIG_SOURCE_NDISC) + by_ndisc = true; + + log_link_warning(link, "Ignoring DHCPv6 address %s/%u (valid %s, preferred %s) which conflicts with %s/%u%s.", + pretty, address->prefixlen, + FORMAT_LIFETIME(address->lifetime_valid_usec), + FORMAT_LIFETIME(address->lifetime_preferred_usec), + pretty, existing->prefixlen, + by_ndisc ? 
" assigned by NDisc" : ""); + if (by_ndisc) + log_link_warning(link, "Hint: use IPv6Token= setting to change the address generated by NDisc or set UseAutonomousPrefix=no."); + + return -EEXIST; + +simple_log: + log_link_full(link, log_level, "DHCPv6 address %s/%u (valid %s, preferred %s)", + pretty, address->prefixlen, + FORMAT_LIFETIME(address->lifetime_valid_usec), + FORMAT_LIFETIME(address->lifetime_preferred_usec)); + return 0; +} + +static int dhcp6_request_address( + Link *link, + const struct in6_addr *server_address, + const struct in6_addr *ip6_addr, + usec_t lifetime_preferred_usec, + usec_t lifetime_valid_usec) { + + _cleanup_(address_freep) Address *addr = NULL; + Address *existing; + int r; + + r = address_new(&addr); + if (r < 0) + return log_oom(); + + addr->source = NETWORK_CONFIG_SOURCE_DHCP6; + addr->provider.in6 = *server_address; + addr->family = AF_INET6; + addr->in_addr.in6 = *ip6_addr; + addr->flags = IFA_F_NOPREFIXROUTE; + addr->prefixlen = 128; + addr->lifetime_preferred_usec = lifetime_preferred_usec; + addr->lifetime_valid_usec = lifetime_valid_usec; + + if (verify_dhcp6_address(link, addr) < 0) + return 0; + + r = free_and_strdup_warn(&addr->netlabel, link->network->dhcp6_netlabel); + if (r < 0) + return r; + + if (address_get(link, addr, &existing) < 0) + link->dhcp6_configured = false; + else + address_unmark(existing); + + r = link_request_address(link, addr, &link->dhcp6_messages, + dhcp6_address_handler, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request DHCPv6 address %s/128: %m", + IN6_ADDR_TO_STRING(ip6_addr)); + return 0; +} + +static int dhcp6_address_acquired(Link *link) { + struct in6_addr server_address; + int r; + + assert(link); + assert(link->network); + assert(link->dhcp6_lease); + + if (!link->network->dhcp6_use_address) + return 0; + + r = sd_dhcp6_lease_get_server_address(link->dhcp6_lease, &server_address); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get server address 
of DHCPv6 lease: %m"); + + FOREACH_DHCP6_ADDRESS(link->dhcp6_lease) { + usec_t lifetime_preferred_usec, lifetime_valid_usec; + struct in6_addr ip6_addr; + + r = sd_dhcp6_lease_get_address(link->dhcp6_lease, &ip6_addr); + if (r < 0) + return r; + + r = sd_dhcp6_lease_get_address_lifetime_timestamp(link->dhcp6_lease, CLOCK_BOOTTIME, + &lifetime_preferred_usec, &lifetime_valid_usec); + if (r < 0) + return r; + + r = dhcp6_request_address(link, &server_address, &ip6_addr, + lifetime_preferred_usec, + lifetime_valid_usec); + if (r < 0) + return r; + } + + if (link->network->dhcp6_use_hostname) { + const char *dhcpname = NULL; + _cleanup_free_ char *hostname = NULL; + + (void) sd_dhcp6_lease_get_fqdn(link->dhcp6_lease, &dhcpname); + + if (dhcpname) { + r = shorten_overlong(dhcpname, &hostname); + if (r < 0) + log_link_warning_errno(link, r, "Unable to shorten overlong DHCP hostname '%s', ignoring: %m", dhcpname); + if (r == 1) + log_link_notice(link, "Overlong DHCP hostname received, shortened from '%s' to '%s'", dhcpname, hostname); + } + if (hostname) { + r = manager_set_hostname(link->manager, hostname); + if (r < 0) + log_link_error_errno(link, r, "Failed to set transient hostname to '%s': %m", hostname); + } + } + + return 0; +} + +static int dhcp6_lease_ip_acquired(sd_dhcp6_client *client, Link *link) { + _cleanup_(sd_dhcp6_lease_unrefp) sd_dhcp6_lease *lease_old = NULL; + sd_dhcp6_lease *lease; + int r; + + link_mark_addresses(link, NETWORK_CONFIG_SOURCE_DHCP6); + link_mark_routes(link, NETWORK_CONFIG_SOURCE_DHCP6); + + r = sd_dhcp6_client_get_lease(client, &lease); + if (r < 0) + return log_link_error_errno(link, r, "Failed to get DHCPv6 lease: %m"); + + lease_old = TAKE_PTR(link->dhcp6_lease); + link->dhcp6_lease = sd_dhcp6_lease_ref(lease); + + r = dhcp6_address_acquired(link); + if (r < 0) + return r; + + if (sd_dhcp6_lease_has_pd_prefix(lease)) { + r = dhcp6_pd_prefix_acquired(link); + if (r < 0) + return r; + } else if 
(sd_dhcp6_lease_has_pd_prefix(lease_old)) + /* When we had PD prefixes but not now, we need to remove them. */ + dhcp_pd_prefix_lost(link); + + if (link->dhcp6_messages == 0) { + link->dhcp6_configured = true; + + r = dhcp6_remove(link, /* only_marked = */ true); + if (r < 0) + return r; + } else + log_link_debug(link, "Setting DHCPv6 addresses and routes"); + + if (!link->dhcp6_configured) + link_set_state(link, LINK_STATE_CONFIGURING); + + link_check_ready(link); + return 0; +} + +static int dhcp6_lease_information_acquired(sd_dhcp6_client *client, Link *link) { + sd_dhcp6_lease *lease; + int r; + + assert(client); + assert(link); + + r = sd_dhcp6_client_get_lease(client, &lease); + if (r < 0) + return log_link_error_errno(link, r, "Failed to get DHCPv6 lease: %m"); + + unref_and_replace_full(link->dhcp6_lease, lease, sd_dhcp6_lease_ref, sd_dhcp6_lease_unref); + + link_dirty(link); + return 0; +} + +static int dhcp6_lease_lost(Link *link) { + int r; + + assert(link); + assert(link->manager); + + log_link_info(link, "DHCPv6 lease lost"); + + if (sd_dhcp6_lease_has_pd_prefix(link->dhcp6_lease)) + dhcp_pd_prefix_lost(link); + + link->dhcp6_lease = sd_dhcp6_lease_unref(link->dhcp6_lease); + + r = dhcp6_remove(link, /* only_marked = */ false); + if (r < 0) + return r; + + return 0; +} + +static void dhcp6_handler(sd_dhcp6_client *client, int event, void *userdata) { + Link *link = ASSERT_PTR(userdata); + int r = 0; + + assert(link->network); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return; + + switch (event) { + case SD_DHCP6_CLIENT_EVENT_STOP: + case SD_DHCP6_CLIENT_EVENT_RESEND_EXPIRE: + case SD_DHCP6_CLIENT_EVENT_RETRANS_MAX: + r = dhcp6_lease_lost(link); + break; + + case SD_DHCP6_CLIENT_EVENT_IP_ACQUIRE: + r = dhcp6_lease_ip_acquired(client, link); + break; + + case SD_DHCP6_CLIENT_EVENT_INFORMATION_REQUEST: + r = dhcp6_lease_information_acquired(client, link); + break; + + default: + if (event < 0) + log_link_warning_errno(link, 
event, "DHCPv6 error, ignoring: %m"); + else + log_link_warning(link, "DHCPv6 unknown event: %d", event); + } + if (r < 0) + link_enter_failed(link); +} + +int dhcp6_start_on_ra(Link *link, bool information_request) { + int r; + + assert(link); + assert(link->dhcp6_client); + assert(link->network); + assert(in6_addr_is_link_local(&link->ipv6ll_address)); + + if (link_get_dhcp6_client_start_mode(link) != DHCP6_CLIENT_START_MODE_NO) + /* When WithoutRA= is specified, then the DHCPv6 client should be already running in + * the requested mode. Hence, ignore the requests by RA. */ + return 0; + + r = sd_dhcp6_client_is_running(link->dhcp6_client); + if (r < 0) + return r; + + if (r > 0) { + int inf_req; + + r = sd_dhcp6_client_get_information_request(link->dhcp6_client, &inf_req); + if (r < 0) + return r; + + if (inf_req == information_request) + /* The client is already running in the requested mode. */ + return 0; + + if (!inf_req) { + log_link_debug(link, + "The DHCPv6 client is already running in the managed mode, " + "refusing to start the client in the information requesting mode."); + return 0; + } + + log_link_debug(link, + "The DHCPv6 client is running in the information requesting mode. 
" + "Restarting the client in the managed mode."); + + r = sd_dhcp6_client_stop(link->dhcp6_client); + if (r < 0) + return r; + } else { + r = sd_dhcp6_client_set_local_address(link->dhcp6_client, &link->ipv6ll_address); + if (r < 0) + return r; + } + + r = sd_dhcp6_client_set_information_request(link->dhcp6_client, information_request); + if (r < 0) + return r; + + r = sd_dhcp6_client_start(link->dhcp6_client); + if (r < 0) + return r; + + return 0; +} + +int dhcp6_start(Link *link) { + DHCP6ClientStartMode start_mode; + int r; + + assert(link); + assert(link->network); + + if (!link->dhcp6_client) + return 0; + + if (!link_dhcp6_enabled(link)) + return 0; + + if (!link_has_carrier(link)) + return 0; + + if (sd_dhcp6_client_is_running(link->dhcp6_client) > 0) + return 0; + + if (!in6_addr_is_link_local(&link->ipv6ll_address)) { + log_link_debug(link, "IPv6 link-local address is not set, delaying to start DHCPv6 client."); + return 0; + } + + r = sd_dhcp6_client_set_local_address(link->dhcp6_client, &link->ipv6ll_address); + if (r < 0) + return r; + + start_mode = link_get_dhcp6_client_start_mode(link); + if (start_mode == DHCP6_CLIENT_START_MODE_NO) + return 0; + + r = sd_dhcp6_client_set_information_request(link->dhcp6_client, + start_mode == DHCP6_CLIENT_START_MODE_INFORMATION_REQUEST); + if (r < 0) + return r; + + r = sd_dhcp6_client_start(link->dhcp6_client); + if (r < 0) + return r; + + return 1; +} + +static int dhcp6_set_hostname(sd_dhcp6_client *client, Link *link) { + _cleanup_free_ char *hostname = NULL; + const char *hn; + int r; + + assert(link); + + if (!link->network->dhcp6_send_hostname) + hn = NULL; + else if (link->network->dhcp6_hostname) + hn = link->network->dhcp6_hostname; + else { + r = gethostname_strict(&hostname); + if (r < 0 && r != -ENXIO) /* ENXIO: no hostname set or hostname is "localhost" */ + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to get hostname: %m"); + + hn = hostname; + } + + r = 
sd_dhcp6_client_set_fqdn(client, hn); + if (r == -EINVAL && hostname) + /* Ignore error when the machine's hostname is not suitable to send in DHCP packet. */ + log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set hostname from kernel hostname, ignoring: %m"); + else if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set hostname: %m"); + + return 0; +} + +static int dhcp6_set_identifier(Link *link, sd_dhcp6_client *client) { + const DUID *duid; + int r; + + assert(link); + assert(link->network); + assert(client); + + r = sd_dhcp6_client_set_mac(client, link->hw_addr.bytes, link->hw_addr.length, link->iftype); + if (r < 0) + return r; + + if (link->network->dhcp6_iaid_set) { + r = sd_dhcp6_client_set_iaid(client, link->network->dhcp6_iaid); + if (r < 0) + return r; + } + + duid = link_get_dhcp6_duid(link); + + if (duid->raw_data_len == 0) + switch (duid->type) { + case DUID_TYPE_LLT: + r = sd_dhcp6_client_set_duid_llt(client, duid->llt_time); + break; + case DUID_TYPE_LL: + r = sd_dhcp6_client_set_duid_ll(client); + break; + case DUID_TYPE_EN: + r = sd_dhcp6_client_set_duid_en(client); + break; + case DUID_TYPE_UUID: + r = sd_dhcp6_client_set_duid_uuid(client); + break; + default: + r = sd_dhcp6_client_set_duid_raw(client, duid->type, NULL, 0); + } + else + r = sd_dhcp6_client_set_duid_raw(client, duid->type, duid->raw_data, duid->raw_data_len); + if (r < 0) + return r; + + return 0; +} + +static int dhcp6_configure(Link *link) { + _cleanup_(sd_dhcp6_client_unrefp) sd_dhcp6_client *client = NULL; + sd_dhcp6_option *vendor_option; + sd_dhcp6_option *send_option; + void *request_options; + int r; + + assert(link); + assert(link->network); + + if (link->dhcp6_client) + return log_link_debug_errno(link, SYNTHETIC_ERRNO(EBUSY), "DHCPv6 client is already configured."); + + r = sd_dhcp6_client_new(&client); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to create 
DHCPv6 client: %m"); + + r = sd_dhcp6_client_attach_event(client, link->manager->event, 0); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to attach event: %m"); + + r = sd_dhcp6_client_attach_device(client, link->dev); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to attach device: %m"); + + r = dhcp6_set_identifier(link, client); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set identifier: %m"); + + ORDERED_HASHMAP_FOREACH(send_option, link->network->dhcp6_client_send_options) { + r = sd_dhcp6_client_add_option(client, send_option); + if (r == -EEXIST) + continue; + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set option: %m"); + } + + r = dhcp6_set_hostname(client, link); + if (r < 0) + return r; + + r = sd_dhcp6_client_set_ifindex(client, link->ifindex); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set ifindex: %m"); + + if (link->network->dhcp6_mudurl) { + r = sd_dhcp6_client_set_request_mud_url(client, link->network->dhcp6_mudurl); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set MUD URL: %m"); + } + + if (link->network->dhcp6_use_dns) { + r = sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_DNS_SERVER); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to request DNS servers: %m"); + } + + if (link->network->dhcp6_use_domains > 0) { + r = sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_DOMAIN); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to request domains: %m"); + } + + if (link->network->dhcp6_use_captive_portal > 0) { + r = sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_CAPTIVE_PORTAL); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to request captive portal: %m"); + } + + if (link->network->dhcp6_use_ntp) { + r = sd_dhcp6_client_set_request_option(client, 
SD_DHCP6_OPTION_NTP_SERVER); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to request NTP servers: %m"); + + /* If the server does not provide NTP servers, then we fallback to use SNTP servers. */ + r = sd_dhcp6_client_set_request_option(client, SD_DHCP6_OPTION_SNTP_SERVER); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to request SNTP servers: %m"); + } + + SET_FOREACH(request_options, link->network->dhcp6_request_options) { + uint32_t option = PTR_TO_UINT32(request_options); + + r = sd_dhcp6_client_set_request_option(client, option); + if (r == -EEXIST) { + log_link_debug(link, "DHCPv6 CLIENT: Failed to set request flag for '%u' already exists, ignoring.", option); + continue; + } + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set request flag for '%u': %m", option); + } + + if (link->network->dhcp6_user_class) { + r = sd_dhcp6_client_set_request_user_class(client, link->network->dhcp6_user_class); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set user class: %m"); + } + + if (link->network->dhcp6_vendor_class) { + r = sd_dhcp6_client_set_request_vendor_class(client, link->network->dhcp6_vendor_class); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set vendor class: %m"); + } + + ORDERED_HASHMAP_FOREACH(vendor_option, link->network->dhcp6_client_send_vendor_options) { + r = sd_dhcp6_client_add_vendor_option(client, vendor_option); + if (r == -EEXIST) + continue; + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set vendor option: %m"); + } + + r = sd_dhcp6_client_set_callback(client, dhcp6_handler, link); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set callback: %m"); + + r = dhcp6_client_set_state_callback(client, dhcp6_client_callback_bus, link); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set state change callback: 
%m"); + + r = sd_dhcp6_client_set_prefix_delegation(client, link->network->dhcp6_use_pd_prefix); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to %s requesting prefixes to be delegated: %m", + enable_disable(link->network->dhcp6_use_pd_prefix)); + + /* Even if UseAddress=no, we need to request IA_NA, as the dhcp6 client may be started in solicit mode. */ + r = sd_dhcp6_client_set_address_request(client, link->network->dhcp6_use_pd_prefix ? link->network->dhcp6_use_address : true); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to %s requesting address: %m", + enable_disable(link->network->dhcp6_use_address)); + + if (link->network->dhcp6_pd_prefix_length > 0) { + r = sd_dhcp6_client_set_prefix_delegation_hint(client, + link->network->dhcp6_pd_prefix_length, + &link->network->dhcp6_pd_prefix_hint); + if (r < 0) + return log_link_debug_errno(link, r, "DHCPv6 CLIENT: Failed to set prefix delegation hint: %m"); + } + + r = sd_dhcp6_client_set_rapid_commit(client, link->network->dhcp6_use_rapid_commit); + if (r < 0) + return log_link_debug_errno(link, r, + "DHCPv6 CLIENT: Failed to %s rapid commit: %m", + enable_disable(link->network->dhcp6_use_rapid_commit)); + + r = sd_dhcp6_client_set_send_release(client, link->network->dhcp6_send_release); + if (r < 0) + return log_link_debug_errno(link, r, + "DHCPv6 CLIENT: Failed to %s sending release message on stop: %m", + enable_disable(link->network->dhcp6_send_release)); + + link->dhcp6_client = TAKE_PTR(client); + + return 0; +} + +int dhcp6_update_mac(Link *link) { + bool restart; + int r; + + assert(link); + + if (!link->dhcp6_client) + return 0; + + restart = sd_dhcp6_client_is_running(link->dhcp6_client) > 0; + + if (restart) { + r = sd_dhcp6_client_stop(link->dhcp6_client); + if (r < 0) + return r; + } + + r = dhcp6_set_identifier(link, link->dhcp6_client); + if (r < 0) + return r; + + if (restart) { + r = sd_dhcp6_client_start(link->dhcp6_client); + if (r < 0) + 
return log_link_warning_errno(link, r, "Could not restart DHCPv6 client: %m"); + } + + return 0; +} + +static int dhcp6_process_request(Request *req, Link *link, void *userdata) { + int r; + + assert(link); + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return 0; + + r = dhcp_configure_duid(link, link_get_dhcp6_duid(link)); + if (r <= 0) + return r; + + r = dhcp6_configure(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure DHCPv6 client: %m"); + + r = ndisc_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start IPv6 Router Discovery: %m"); + + r = dhcp6_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start DHCPv6 client: %m"); + + log_link_debug(link, "DHCPv6 client is configured%s.", + r > 0 ? ", acquiring DHCPv6 lease" : ""); + return 1; +} + +int link_request_dhcp6_client(Link *link) { + int r; + + assert(link); + + if (!link_dhcp6_enabled(link) && !link_ipv6_accept_ra_enabled(link)) + return 0; + + if (link->dhcp6_client) + return 0; + + r = link_queue_request(link, REQUEST_TYPE_DHCP6_CLIENT, dhcp6_process_request, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request configuring of the DHCPv6 client: %m"); + + log_link_debug(link, "Requested configuring of the DHCPv6 client."); + return 0; +} + +int link_serialize_dhcp6_client(Link *link, FILE *f) { + _cleanup_free_ char *duid = NULL; + uint32_t iaid; + int r; + + assert(link); + + if (!link->dhcp6_client) + return 0; + + r = sd_dhcp6_client_get_iaid(link->dhcp6_client, &iaid); + if (r >= 0) + fprintf(f, "DHCP6_CLIENT_IAID=0x%x\n", iaid); + + r = sd_dhcp6_client_duid_as_string(link->dhcp6_client, &duid); + if (r >= 0) + fprintf(f, "DHCP6_CLIENT_DUID=%s\n", duid); + + return 0; +} + +int config_parse_dhcp6_pd_prefix_hint( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + 
const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + union in_addr_union u; + unsigned char prefixlen; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = in_addr_prefix_from_string(rvalue, AF_INET6, &u, &prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=%s, ignoring assignment.", lvalue, rvalue); + return 0; + } + + if (prefixlen < 1 || prefixlen > 128) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid prefix length in %s=%s, ignoring assignment.", lvalue, rvalue); + return 0; + } + + network->dhcp6_pd_prefix_hint = u.in6; + network->dhcp6_pd_prefix_length = prefixlen; + + return 0; +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_dhcp6_client_start_mode, dhcp6_client_start_mode, DHCP6ClientStartMode, + "Failed to parse WithoutRA= setting"); + +static const char* const dhcp6_client_start_mode_table[_DHCP6_CLIENT_START_MODE_MAX] = { + [DHCP6_CLIENT_START_MODE_NO] = "no", + [DHCP6_CLIENT_START_MODE_INFORMATION_REQUEST] = "information-request", + [DHCP6_CLIENT_START_MODE_SOLICIT] = "solicit", +}; + +DEFINE_STRING_TABLE_LOOKUP(dhcp6_client_start_mode, DHCP6ClientStartMode); diff --git a/src/network/networkd-dhcp6.h b/src/network/networkd-dhcp6.h new file mode 100644 index 0000000..81267c2 --- /dev/null +++ b/src/network/networkd-dhcp6.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "macro.h" + +typedef enum DHCP6ClientStartMode { + DHCP6_CLIENT_START_MODE_NO, + DHCP6_CLIENT_START_MODE_INFORMATION_REQUEST, + DHCP6_CLIENT_START_MODE_SOLICIT, + _DHCP6_CLIENT_START_MODE_MAX, + _DHCP6_CLIENT_START_MODE_INVALID = -EINVAL, +} DHCP6ClientStartMode; + +typedef struct Link Link; + +bool link_dhcp6_with_address_enabled(Link *link); +int dhcp6_check_ready(Link *link); +int dhcp6_update_mac(Link *link); +int dhcp6_start(Link *link); +int dhcp6_start_on_ra(Link *link, bool 
information_request); + +int link_request_dhcp6_client(Link *link); + +int link_serialize_dhcp6_client(Link *link, FILE *f); + +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp6_pd_prefix_hint); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp6_mud_url); +CONFIG_PARSER_PROTOTYPE(config_parse_dhcp6_client_start_mode); + +const char* dhcp6_client_start_mode_to_string(DHCP6ClientStartMode i) _const_; +DHCP6ClientStartMode dhcp6_client_start_mode_from_string(const char *s) _pure_; diff --git a/src/network/networkd-gperf.gperf b/src/network/networkd-gperf.gperf new file mode 100644 index 0000000..8542ffa --- /dev/null +++ b/src/network/networkd-gperf.gperf @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "networkd-conf.h" +#include "networkd-dhcp-common.h" +#include "networkd-manager.h" +#include "networkd-route-util.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name networkd_gperf_hash +%define lookup-function-name networkd_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Network.SpeedMeter, config_parse_bool, 0, offsetof(Manager, use_speed_meter) +Network.SpeedMeterIntervalSec, config_parse_sec, 0, offsetof(Manager, speed_meter_interval_usec) +Network.ManageForeignRoutingPolicyRules, config_parse_bool, 0, offsetof(Manager, manage_foreign_rules) +Network.ManageForeignRoutes, config_parse_bool, 0, offsetof(Manager, manage_foreign_routes) +Network.RouteTable, config_parse_route_table_names, 0, 0 +Network.IPv6PrivacyExtensions, config_parse_ipv6_privacy_extensions, 0, offsetof(Manager, ipv6_privacy_extensions) +DHCPv4.DUIDType, config_parse_duid_type, 0, offsetof(Manager, dhcp_duid) +DHCPv4.DUIDRawData, config_parse_duid_rawdata, 0, offsetof(Manager, dhcp_duid) +DHCPv6.DUIDType, config_parse_duid_type, 0, offsetof(Manager, 
dhcp6_duid) +DHCPv6.DUIDRawData, config_parse_duid_rawdata, 0, offsetof(Manager, dhcp6_duid) +/* Deprecated */ +DHCP.DUIDType, config_parse_manager_duid_type, 0, 0 +DHCP.DUIDRawData, config_parse_manager_duid_rawdata, 0, 0 diff --git a/src/network/networkd-ipv4acd.c b/src/network/networkd-ipv4acd.c new file mode 100644 index 0000000..3d5e203 --- /dev/null +++ b/src/network/networkd-ipv4acd.c @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include /* IFF_LOOPBACK */ +#include /* ARPHRD_ETHER */ + +#include "sd-dhcp-client.h" +#include "sd-ipv4acd.h" + +#include "ipvlan.h" +#include "networkd-address.h" +#include "networkd-dhcp4.h" +#include "networkd-ipv4acd.h" +#include "networkd-link.h" +#include "networkd-manager.h" + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + ipv4acd_hash_ops, + void, trivial_hash_func, trivial_compare_func, + sd_ipv4acd, sd_ipv4acd_unref); + +bool link_ipv4acd_supported(Link *link) { + assert(link); + + if (link->flags & IFF_LOOPBACK) + return false; + + /* ARPHRD_INFINIBAND seems to potentially support IPv4ACD. + * But currently sd-ipv4acd only supports ARPHRD_ETHER. */ + if (link->iftype != ARPHRD_ETHER) + return false; + + if (link->hw_addr.length != ETH_ALEN) + return false; + + if (ether_addr_is_null(&link->hw_addr.ether)) + return false; + + if (streq_ptr(link->kind, "vrf")) + return false; + + /* L3 or L3S mode do not support ARP. */ + if (IN_SET(link_get_ipvlan_mode(link), NETDEV_IPVLAN_MODE_L3, NETDEV_IPVLAN_MODE_L3S)) + return false; + + return true; +} + +static bool address_ipv4acd_enabled(Link *link, const Address *address) { + assert(link); + assert(address); + + if (address->family != AF_INET) + return false; + + if (!FLAGS_SET(address->duplicate_address_detection, ADDRESS_FAMILY_IPV4)) + return false; + + /* Currently, only static and DHCP4 addresses are supported. 
*/ + if (!IN_SET(address->source, NETWORK_CONFIG_SOURCE_STATIC, NETWORK_CONFIG_SOURCE_DHCP4)) + return false; + + return link_ipv4acd_supported(link); +} + +bool ipv4acd_bound(Link *link, const Address *address) { + sd_ipv4acd *acd; + + assert(link); + assert(address); + + if (address->family != AF_INET) + return true; + + acd = hashmap_get(link->ipv4acd_by_address, IN4_ADDR_TO_PTR(&address->in_addr.in)); + if (!acd) + return true; + + return sd_ipv4acd_is_bound(acd) > 0; +} + +static int static_ipv4acd_address_remove(Link *link, Address *address, bool on_conflict) { + int r; + + assert(link); + assert(address); + + if (!address_exists(address)) + return 0; /* Not assigned. */ + + if (on_conflict) + log_link_warning(link, "Dropping address %s, as an address conflict was detected.", IN4_ADDR_TO_STRING(&address->in_addr.in)); + else + log_link_debug(link, "Removing address %s, as the ACD client is stopped.", IN4_ADDR_TO_STRING(&address->in_addr.in)); + + r = address_remove(address); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to remove address %s: %m", IN4_ADDR_TO_STRING(&address->in_addr.in)); + + return 0; +} + +static int dhcp4_address_on_conflict(Link *link) { + int r; + + assert(link); + assert(link->dhcp_client); + + r = sd_dhcp_client_send_decline(link->dhcp_client); + if (r < 0) + log_link_warning_errno(link, r, "Failed to send DHCP DECLINE, ignoring: %m"); + + if (!link->dhcp_lease) + /* Unlikely, but during probing the address, the lease may be lost. */ + return 0; + + log_link_warning(link, "Dropping DHCPv4 lease, as an address conflict was detected."); + r = dhcp4_lease_lost(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to drop DHCPv4 lease: %m"); + + /* It is not necessary to call address_remove() here, as dhcp4_lease_lost() removes it. 
*/ + return 0; +} + +/* sd-ipv4acd event callback registered by ipv4acd_configure(); userdata is the Link owning the ACD object. + * STOP removes a still-assigned static address, BIND only logs, CONFLICT removes the static address or drops the DHCPv4 lease. */ +static void on_acd(sd_ipv4acd *acd, int event, void *userdata) { + Link *link = ASSERT_PTR(userdata); + Address *address = NULL; + struct in_addr a; + int r; + + assert(acd); + + r = sd_ipv4acd_get_address(acd, &a); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to get address from IPv4ACD: %m"); + link_enter_failed(link); + /* 'a' was not filled in, so it must neither be looked up nor logged below. */ + return; + } + + (void) link_get_ipv4_address(link, &a, 0, &address); + + switch (event) { + case SD_IPV4ACD_EVENT_STOP: + if (!address) + break; + + if (address->source == NETWORK_CONFIG_SOURCE_STATIC) { + r = static_ipv4acd_address_remove(link, address, /* on_conflict = */ false); + if (r < 0) + link_enter_failed(link); + } + + /* We have nothing to do for DHCPv4 lease here, as the dhcp client is already stopped + * when stopping the ipv4acd client. See link_stop_engines(). */ + break; + + case SD_IPV4ACD_EVENT_BIND: + log_link_debug(link, "Successfully claimed address %s", IN4_ADDR_TO_STRING(&a)); + break; + + case SD_IPV4ACD_EVENT_CONFLICT: + if (!address) + break; + + log_link_warning(link, "Dropping address %s, as an address conflict was detected.", IN4_ADDR_TO_STRING(&a)); + + if (address->source == NETWORK_CONFIG_SOURCE_STATIC) + r = static_ipv4acd_address_remove(link, address, /* on_conflict = */ true); + else + r = dhcp4_address_on_conflict(link); + if (r < 0) + link_enter_failed(link); + break; + + default: + assert_not_reached(); + } +} + +static int ipv4acd_check_mac(sd_ipv4acd *acd, const struct ether_addr *mac, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + struct hw_addr_data hw_addr; + + assert(mac); + + hw_addr = (struct hw_addr_data) { + .length = ETH_ALEN, + .ether = *mac, + }; + + return link_get_by_hw_addr(m, &hw_addr, NULL) >= 0; +} + +static int ipv4acd_start_one(Link *link, sd_ipv4acd *acd) { + assert(link); + assert(acd); + + if (sd_ipv4acd_is_running(acd)) + return 0; + + if (!link_has_carrier(link)) + return 0; + + return sd_ipv4acd_start(acd, /* reset_conflicts = */ true); +} + +int 
ipv4acd_configure(Link *link, const Address *address) { + /* Creates, registers and starts an sd_ipv4acd object for the given address, or reuses the one already stored in link->ipv4acd_by_address. Non-IPv4 addresses are ignored (returns 0). */ _cleanup_(sd_ipv4acd_unrefp) sd_ipv4acd *acd = NULL; + sd_ipv4acd *existing; + int r; + + assert(link); + assert(link->manager); + assert(address); + + if (address->family != AF_INET) + return 0; + + existing = hashmap_get(link->ipv4acd_by_address, IN4_ADDR_TO_PTR(&address->in_addr.in)); + + /* ACD is not wanted for this address: stop whatever was found above. NOTE(review): existing may be NULL here, which presumably sd_ipv4acd_stop() tolerates; confirm. */ if (!address_ipv4acd_enabled(link, address)) + return sd_ipv4acd_stop(existing); + + /* Already configured earlier: only make sure it is running. */ if (existing) + return ipv4acd_start_one(link, existing); + + log_link_debug(link, "Configuring IPv4ACD for address %s.", IN4_ADDR_TO_STRING(&address->in_addr.in)); + + r = sd_ipv4acd_new(&acd); + if (r < 0) + return r; + + r = sd_ipv4acd_attach_event(acd, link->manager->event, 0); + if (r < 0) + return r; + + r = sd_ipv4acd_set_ifindex(acd, link->ifindex); + if (r < 0) + return r; + + r = sd_ipv4acd_set_mac(acd, &link->hw_addr.ether); + if (r < 0) + return r; + + r = sd_ipv4acd_set_address(acd, &address->in_addr.in); + if (r < 0) + return r; + + r = sd_ipv4acd_set_callback(acd, on_acd, link); + if (r < 0) + return r; + + r = sd_ipv4acd_set_check_mac_callback(acd, ipv4acd_check_mac, link->manager); + if (r < 0) + return r; + + /* After a successful put the hashmap keeps the object; TAKE_PTR() below disarms the _cleanup_ unref. */ r = hashmap_ensure_put(&link->ipv4acd_by_address, &ipv4acd_hash_ops, IN4_ADDR_TO_PTR(&address->in_addr.in), acd); + if (r < 0) + return r; + + return ipv4acd_start_one(link, TAKE_PTR(acd)); +} + +/* Removes the ACD object registered for an IPv4 address from link->ipv4acd_by_address and unrefs it; non-IPv4 addresses are ignored. */ void ipv4acd_detach(Link *link, const Address *address) { + assert(link); + assert(address); + + if (address->family != AF_INET) + return; + + sd_ipv4acd_unref(hashmap_remove(link->ipv4acd_by_address, IN4_ADDR_TO_PTR(&address->in_addr.in))); +} + +/* Sets the link's current hardware address on every registered ACD object. Does nothing (returns 0) unless the address is ETH_ALEN bytes long and not null; returns the first error from sd_ipv4acd_set_mac(). */ int ipv4acd_update_mac(Link *link) { + sd_ipv4acd *acd; + int r; + + assert(link); + + if (link->hw_addr.length != ETH_ALEN) + return 0; + if (ether_addr_is_null(&link->hw_addr.ether)) + return 0; + + HASHMAP_FOREACH(acd, link->ipv4acd_by_address) { + r = sd_ipv4acd_set_mac(acd, &link->hw_addr.ether); + if (r < 0) + return r; + } + + return 0; +} + +int ipv4acd_start(Link *link) 
{ + sd_ipv4acd *acd; + int r; + + assert(link); + + HASHMAP_FOREACH(acd, link->ipv4acd_by_address) { + r = ipv4acd_start_one(link, acd); + if (r < 0) + return r; + } + + return 0; +} + +int ipv4acd_stop(Link *link) { + sd_ipv4acd *acd; + int k, r = 0; + + assert(link); + + HASHMAP_FOREACH(acd, link->ipv4acd_by_address) { + k = sd_ipv4acd_stop(acd); + if (k < 0) + r = k; + } + + return r; +} + +int ipv4acd_set_ifname(Link *link) { + sd_ipv4acd *acd; + int r; + + assert(link); + + HASHMAP_FOREACH(acd, link->ipv4acd_by_address) { + r = sd_ipv4acd_set_ifname(acd, link->ifname); + if (r < 0) + return r; + } + + return 0; +} diff --git a/src/network/networkd-ipv4acd.h b/src/network/networkd-ipv4acd.h new file mode 100644 index 0000000..54da435 --- /dev/null +++ b/src/network/networkd-ipv4acd.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Address Address; +typedef struct Link Link; + +bool link_ipv4acd_supported(Link *link); +bool ipv4acd_bound(Link *link, const Address *address); +int ipv4acd_configure(Link *link, const Address *address); +void ipv4acd_detach(Link *link, const Address *address); +int ipv4acd_update_mac(Link *link); +int ipv4acd_start(Link *link); +int ipv4acd_stop(Link *link); +int ipv4acd_set_ifname(Link *link); diff --git a/src/network/networkd-ipv4ll.c b/src/network/networkd-ipv4ll.c new file mode 100644 index 0000000..c357382 --- /dev/null +++ b/src/network/networkd-ipv4ll.c @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "netif-util.h" +#include "networkd-address.h" +#include "networkd-ipv4acd.h" +#include "networkd-ipv4ll.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "parse-util.h" + +bool link_ipv4ll_enabled(Link *link) { + assert(link); + + if (!link_ipv4acd_supported(link)) + return false; + + if (!link->network) + return false; + + if (link->network->bond) + return false; + + return 
link->network->link_local & ADDRESS_FAMILY_IPV4; +} + +static int address_new_from_ipv4ll(Link *link, Address **ret) { + _cleanup_(address_freep) Address *address = NULL; + struct in_addr addr; + int r; + + assert(link); + assert(link->ipv4ll); + assert(ret); + + r = sd_ipv4ll_get_address(link->ipv4ll, &addr); + if (r < 0) + return r; + + r = address_new(&address); + if (r < 0) + return -ENOMEM; + + address->source = NETWORK_CONFIG_SOURCE_IPV4LL; + address->family = AF_INET; + address->in_addr.in = addr; + address->prefixlen = 16; + address->scope = RT_SCOPE_LINK; + address->route_metric = IPV4LL_ROUTE_METRIC; + + *ret = TAKE_PTR(address); + return 0; +} + +static int ipv4ll_address_lost(Link *link) { + _cleanup_(address_freep) Address *address = NULL; + Address *existing; + int r; + + assert(link); + + link->ipv4ll_address_configured = false; + + r = address_new_from_ipv4ll(link, &address); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + if (address_get(link, address, &existing) < 0) + return 0; + + if (existing->source != NETWORK_CONFIG_SOURCE_IPV4LL) + return 0; + + if (!address_exists(existing)) + return 0; + + log_link_debug(link, "IPv4 link-local release "IPV4_ADDRESS_FMT_STR, + IPV4_ADDRESS_FMT_VAL(address->in_addr.in)); + + return address_remove(existing); +} + +static int ipv4ll_address_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Address *address) { + int r; + + assert(link); + assert(!link->ipv4ll_address_configured); + + r = address_configure_handler_internal(rtnl, m, link, "Could not set ipv4ll address"); + if (r <= 0) + return r; + + link->ipv4ll_address_configured = true; + link_check_ready(link); + + return 1; +} + +static int ipv4ll_address_claimed(sd_ipv4ll *ll, Link *link) { + _cleanup_(address_freep) Address *address = NULL; + int r; + + assert(ll); + assert(link); + + link->ipv4ll_address_configured = false; + + r = address_new_from_ipv4ll(link, &address); + if (r == -ENOENT) + return 0; + if (r < 0) + 
return r; + + log_link_debug(link, "IPv4 link-local claim "IPV4_ADDRESS_FMT_STR, + IPV4_ADDRESS_FMT_VAL(address->in_addr.in)); + + return link_request_address(link, address, NULL, ipv4ll_address_handler, NULL); +} + +/* sd-ipv4ll event callback registered by ipv4ll_configure(); userdata is the owning Link. Events are ignored once the link is in the FAILED or LINGER state. */ +static void ipv4ll_handler(sd_ipv4ll *ll, int event, void *userdata) { + Link *link = ASSERT_PTR(userdata); + int r; + + assert(link->network); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return; + + switch (event) { + case SD_IPV4LL_EVENT_STOP: + r = ipv4ll_address_lost(link); + if (r < 0) { + link_enter_failed(link); + return; + } + break; + case SD_IPV4LL_EVENT_CONFLICT: + r = ipv4ll_address_lost(link); + if (r < 0) { + link_enter_failed(link); + return; + } + + r = sd_ipv4ll_restart(ll); + if (r < 0) { + log_link_warning_errno(link, r, "Could not acquire IPv4 link-local address: %m"); + link_enter_failed(link); + } + break; + case SD_IPV4LL_EVENT_BIND: + r = ipv4ll_address_claimed(ll, link); + if (r < 0) { + /* Pass r explicitly: %m must describe this failure, not a stale errno. */ + log_link_error_errno(link, r, "Failed to configure ipv4ll address: %m"); + link_enter_failed(link); + return; + } + break; + default: + log_link_warning(link, "IPv4 link-local unknown event: %d", event); + break; + } +} + +static int ipv4ll_check_mac(sd_ipv4ll *ll, const struct ether_addr *mac, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + struct hw_addr_data hw_addr; + + assert(mac); + + hw_addr = (struct hw_addr_data) { + .length = ETH_ALEN, + .ether = *mac, + }; + + return link_get_by_hw_addr(m, &hw_addr, NULL) >= 0; +} + +int ipv4ll_configure(Link *link) { + uint64_t seed; + int r; + + assert(link); + + if (!link_ipv4ll_enabled(link)) + return 0; + + if (link->ipv4ll) + return -EBUSY; + + r = sd_ipv4ll_new(&link->ipv4ll); + if (r < 0) + return r; + + r = sd_ipv4ll_attach_event(link->ipv4ll, link->manager->event, 0); + if (r < 0) + return r; + + if (link->dev && + net_get_unique_predictable_data(link->dev, true, &seed) >= 0) { + r = sd_ipv4ll_set_address_seed(link->ipv4ll, seed); + if (r < 0) + return r; + } + + r 
= sd_ipv4ll_set_mac(link->ipv4ll, &link->hw_addr.ether); + if (r < 0) + return r; + + r = sd_ipv4ll_set_ifindex(link->ipv4ll, link->ifindex); + if (r < 0) + return r; + + r = sd_ipv4ll_set_callback(link->ipv4ll, ipv4ll_handler, link); + if (r < 0) + return r; + + return sd_ipv4ll_set_check_mac_callback(link->ipv4ll, ipv4ll_check_mac, link->manager); +} + +int ipv4ll_update_mac(Link *link) { + assert(link); + + if (link->hw_addr.length != ETH_ALEN) + return 0; + if (ether_addr_is_null(&link->hw_addr.ether)) + return 0; + if (!link->ipv4ll) + return 0; + + return sd_ipv4ll_set_mac(link->ipv4ll, &link->hw_addr.ether); +} + +int config_parse_ipv4ll( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + AddressFamily *link_local = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* Note that this is mostly like + * config_parse_address_family(), except that it + * applies only to IPv4 */ + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=%s, ignoring assignment. 
" + "Note that the setting %s= is deprecated, please use LinkLocalAddressing= instead.", + lvalue, rvalue, lvalue); + return 0; + } + + SET_FLAG(*link_local, ADDRESS_FAMILY_IPV4, r); + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s=%s is deprecated, please use LinkLocalAddressing=%s instead.", + lvalue, rvalue, address_family_to_string(*link_local)); + + return 0; +} + +int config_parse_ipv4ll_address( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + union in_addr_union a; + struct in_addr *ipv4ll_address = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *ipv4ll_address = (struct in_addr) {}; + return 0; + } + + r = in_addr_from_string(AF_INET, rvalue, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + if (!in4_addr_is_link_local_dynamic(&a.in)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified address cannot be used as an IPv4 link local address, ignoring assignment: %s", + rvalue); + return 0; + } + + *ipv4ll_address = a.in; + return 0; +} diff --git a/src/network/networkd-ipv4ll.h b/src/network/networkd-ipv4ll.h new file mode 100644 index 0000000..fa53bd2 --- /dev/null +++ b/src/network/networkd-ipv4ll.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +#define IPV4LL_ROUTE_METRIC 2048 + +typedef struct Link Link; + +bool link_ipv4ll_enabled(Link *link); + +int ipv4ll_configure(Link *link); +int ipv4ll_update_mac(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_ipv4ll); +CONFIG_PARSER_PROTOTYPE(config_parse_ipv4ll_address); diff --git a/src/network/networkd-ipv6-proxy-ndp.c b/src/network/networkd-ipv6-proxy-ndp.c new file mode 100644 index 
0000000..edd369a --- /dev/null +++ b/src/network/networkd-ipv6-proxy-ndp.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "netlink-util.h" +#include "networkd-ipv6-proxy-ndp.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "socket-util.h" +#include "string-util.h" + +void network_adjust_ipv6_proxy_ndp(Network *network) { + assert(network); + + if (set_isempty(network->ipv6_proxy_ndp_addresses)) + return; + + if (!socket_ipv6_is_supported()) { + log_once(LOG_WARNING, + "%s: IPv6 proxy NDP addresses are set, but IPv6 is not supported by kernel, " + "Ignoring IPv6 proxy NDP addresses.", network->filename); + network->ipv6_proxy_ndp_addresses = set_free_free(network->ipv6_proxy_ndp_addresses); + } +} + +static int ipv6_proxy_ndp_address_configure_handler( + sd_netlink *rtnl, + sd_netlink_message *m, + Request *req, + Link *link, + struct in6_addr *address) { + + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0) + log_link_message_warning_errno(link, m, r, "Could not add IPv6 proxy ndp address entry, ignoring"); + + if (link->static_ipv6_proxy_ndp_messages == 0) { + log_link_debug(link, "IPv6 proxy NDP addresses set."); + link->static_ipv6_proxy_ndp_configured = true; + link_check_ready(link); + } + + return 1; +} + +/* send a request to the kernel to add an IPv6 Proxy entry to the neighbour table */ +static int ipv6_proxy_ndp_address_configure(const struct in6_addr *address, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(address); + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(req); + + /* create new netlink message */ + r = sd_rtnl_message_new_neigh(link->manager->rtnl, &m, RTM_NEWNEIGH, link->ifindex, AF_INET6); + if (r < 0) + return r; + + r = sd_rtnl_message_neigh_set_flags(m, 
NTF_PROXY); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(m, NDA_DST, address); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +/* Request-queue callback for one proxy NDP address. Returns 0 while the link is not ready to configure, 1 once ipv6_proxy_ndp_address_configure() succeeded, and a negative errno (logged as a warning) when it failed. */ static int ipv6_proxy_ndp_address_process_request(Request *req, Link *link, struct in6_addr *address) { + int r; + + assert(req); + assert(link); + assert(address); + + if (!link_is_ready_to_configure(link, false)) + return 0; + + r = ipv6_proxy_ndp_address_configure(address, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure IPv6 proxy NDP address: %m"); + + return 1; +} + +/* Queues one REQUEST_TYPE_IPV6_PROXY_NDP request per address in link->network->ipv6_proxy_ndp_addresses. Clears static_ipv6_proxy_ndp_configured first; returns a negative errno (logged) if queueing fails, 0 otherwise. */ int link_request_static_ipv6_proxy_ndp_addresses(Link *link) { + struct in6_addr *address; + int r; + + assert(link); + assert(link->network); + + link->static_ipv6_proxy_ndp_configured = false; + + SET_FOREACH(address, link->network->ipv6_proxy_ndp_addresses) { + r = link_queue_request_safe(link, REQUEST_TYPE_IPV6_PROXY_NDP, + address, NULL, + in6_addr_hash_func, + in6_addr_compare_func, + ipv6_proxy_ndp_address_process_request, + &link->static_ipv6_proxy_ndp_messages, + ipv6_proxy_ndp_address_configure_handler, + NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request IPv6 proxy NDP address: %m"); + } + + /* With a zero counter the link is marked configured right away; otherwise the link enters the CONFIGURING state and ipv6_proxy_ndp_address_configure_handler() sets the flag once it sees the counter at 0. NOTE(review): the counter is presumably maintained by the request queue; confirm. */ if (link->static_ipv6_proxy_ndp_messages == 0) { + link->static_ipv6_proxy_ndp_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Setting IPv6 proxy NDP addresses."); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + +/* Config parser: an empty value clears network->ipv6_proxy_ndp_addresses; otherwise one IPv6 address is parsed and added to that set. Unparsable or null addresses are logged and ignored. */ int config_parse_ipv6_proxy_ndp_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ struct in6_addr *address = NULL; + Network *network = ASSERT_PTR(userdata); + union in_addr_union buffer; + int r; + + assert(filename); + assert(rvalue); + + if (isempty(rvalue)) { + network->ipv6_proxy_ndp_addresses 
= set_free_free(network->ipv6_proxy_ndp_addresses); + return 0; + } + + r = in_addr_from_string(AF_INET6, rvalue, &buffer); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse IPv6 proxy NDP address, ignoring: %s", rvalue); + return 0; + } + + if (in_addr_is_null(AF_INET6, &buffer)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "IPv6 proxy NDP address cannot be the ANY address, ignoring: %s", rvalue); + return 0; + } + + address = newdup(struct in6_addr, &buffer.in6, 1); + if (!address) + return log_oom(); + + r = set_ensure_put(&network->ipv6_proxy_ndp_addresses, &in6_addr_hash_ops, address); + if (r < 0) + return log_oom(); + if (r > 0) + TAKE_PTR(address); + + return 0; +} diff --git a/src/network/networkd-ipv6-proxy-ndp.h b/src/network/networkd-ipv6-proxy-ndp.h new file mode 100644 index 0000000..e57d28f --- /dev/null +++ b/src/network/networkd-ipv6-proxy-ndp.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +typedef struct Link Link; +typedef struct Network Network; + +void network_adjust_ipv6_proxy_ndp(Network *network); + +int link_request_static_ipv6_proxy_ndp_addresses(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_proxy_ndp_address); diff --git a/src/network/networkd-ipv6ll.c b/src/network/networkd-ipv6ll.c new file mode 100644 index 0000000..32229a3 --- /dev/null +++ b/src/network/networkd-ipv6ll.c @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "in-addr-util.h" +#include "networkd-address.h" +#include "networkd-ipv6ll.h" +#include "networkd-link.h" +#include "networkd-network.h" +#include "networkd-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "strv.h" +#include "sysctl-util.h" + +bool link_ipv6ll_enabled(Link *link) { + assert(link); + + if (!socket_ipv6_is_supported()) + return false; + + if (link->flags & IFF_LOOPBACK) + return false; + + if (!link->network) 
+ return false; + + if (link->iftype == ARPHRD_CAN) + return false; + + if (STRPTR_IN_SET(link->kind, "vrf", "wireguard", "ipip", "gre", "sit", "vti", "nlmon")) + return false; + + if (link->network->bond) + return false; + + return link->network->link_local & ADDRESS_FAMILY_IPV6; +} + +bool link_may_have_ipv6ll(Link *link, bool check_multicast) { + assert(link); + + /* + * This is equivalent to link_ipv6ll_enabled() for non-WireGuard interfaces. + * + * For WireGuard interface, the kernel does not assign any IPv6LL addresses, but we can assign + * it manually. It is necessary to set an IPv6LL address manually to run NDisc or RADV on + * WireGuard interface. Note, also Multicast=yes must be set. See #17380. + * + * TODO: May be better to introduce GenerateIPv6LinkLocalAddress= setting, and use algorithms + * used in networkd-address-generation.c + */ + + if (link_ipv6ll_enabled(link)) + return true; + + /* IPv6LL address can be manually assigned on WireGuard interface. */ + if (streq_ptr(link->kind, "wireguard")) { + Address *a; + + if (!link->network) + return false; + + if (check_multicast && !FLAGS_SET(link->flags, IFF_MULTICAST) && link->network->multicast <= 0) + return false; + + ORDERED_HASHMAP_FOREACH(a, link->network->addresses_by_section) { + if (a->family != AF_INET6) + continue; + if (in6_addr_is_set(&a->in_addr_peer.in6)) + continue; + if (in6_addr_is_link_local(&a->in_addr.in6)) + return true; + } + } + + return false; +} + +IPv6LinkLocalAddressGenMode link_get_ipv6ll_addrgen_mode(Link *link) { + assert(link); + + if (!link_ipv6ll_enabled(link)) + return IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_NONE; + + if (link->network->ipv6ll_address_gen_mode >= 0) + return link->network->ipv6ll_address_gen_mode; + + if (in6_addr_is_set(&link->network->ipv6ll_stable_secret)) + return IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY; + + return IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_EUI64; +} + +int ipv6ll_addrgen_mode_fill_message(sd_netlink_message *message, 
IPv6LinkLocalAddressGenMode mode) { + int r; + + assert(message); + assert(mode >= 0 && mode < _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX); + + r = sd_netlink_message_open_container(message, IFLA_AF_SPEC); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(message, AF_INET6); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(message, IFLA_INET6_ADDR_GEN_MODE, mode); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(message); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(message); + if (r < 0) + return r; + + return 0; +} + +int link_update_ipv6ll_addrgen_mode(Link *link, sd_netlink_message *message) { + uint8_t mode; + int family, r; + + assert(link); + assert(message); + + r = sd_rtnl_message_get_family(message, &family); + if (r < 0) + return r; + + if (family != AF_UNSPEC) + return 0; + + r = sd_netlink_message_enter_container(message, IFLA_AF_SPEC); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + r = sd_netlink_message_enter_container(message, AF_INET6); + if (r == -ENODATA) + return sd_netlink_message_exit_container(message); + if (r < 0) + return r; + + mode = (uint8_t) link->ipv6ll_address_gen_mode; + r = sd_netlink_message_read_u8(message, IFLA_INET6_ADDR_GEN_MODE, &mode); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_netlink_message_exit_container(message); + if (r < 0) + return r; + + r = sd_netlink_message_exit_container(message); + if (r < 0) + return r; + + if (mode == (uint8_t) link->ipv6ll_address_gen_mode) + return 0; + + if (mode >= _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX) { + log_link_debug(link, "Received invalid IPv6 link-local address generation mode (%u), ignoring.", mode); + return 0; + } + + if (link->ipv6ll_address_gen_mode < 0) + log_link_debug(link, "Saved IPv6 link-local address generation mode: %s", + ipv6_link_local_address_gen_mode_to_string(mode)); + else + log_link_debug(link, "IPv6 link-local address generation mode is changed: %s -> %s", + 
ipv6_link_local_address_gen_mode_to_string(link->ipv6ll_address_gen_mode), + ipv6_link_local_address_gen_mode_to_string(mode)); + + link->ipv6ll_address_gen_mode = mode; + return 0; +} + +#define STABLE_SECRET_APP_ID_1 SD_ID128_MAKE(aa,05,1d,94,43,68,45,07,b9,73,f1,e8,e4,b7,34,52) +#define STABLE_SECRET_APP_ID_2 SD_ID128_MAKE(52,c4,40,a0,9f,2f,48,58,a9,3a,f6,29,25,ba,7a,7d) + +int link_set_ipv6ll_stable_secret(Link *link) { + struct in6_addr a; + int r; + + assert(link); + assert(link->network); + + if (link->network->ipv6ll_address_gen_mode != IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY) + return 0; + + if (in6_addr_is_set(&link->network->ipv6ll_stable_secret)) + a = link->network->ipv6ll_stable_secret; + else { + sd_id128_t key; + le64_t v; + + /* Generate a stable secret address from machine-ID and the interface name. */ + + r = sd_id128_get_machine_app_specific(STABLE_SECRET_APP_ID_1, &key); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to generate key: %m"); + + v = htole64(siphash24_string(link->ifname, key.bytes)); + memcpy(a.s6_addr, &v, sizeof(v)); + + r = sd_id128_get_machine_app_specific(STABLE_SECRET_APP_ID_2, &key); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to generate key: %m"); + + v = htole64(siphash24_string(link->ifname, key.bytes)); + assert_cc(sizeof(v) * 2 == sizeof(a.s6_addr)); + memcpy(a.s6_addr + sizeof(v), &v, sizeof(v)); + } + + return sysctl_write_ip_property(AF_INET6, link->ifname, "stable_secret", + IN6_ADDR_TO_STRING(&a)); +} + +int link_set_ipv6ll_addrgen_mode(Link *link, IPv6LinkLocalAddressGenMode mode) { + assert(link); + assert(mode >= 0 && mode < _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX); + + if (mode == link->ipv6ll_address_gen_mode) + return 0; + + return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "addr_gen_mode", mode); +} + +static const char* const ipv6_link_local_address_gen_mode_table[_IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX] = { + [IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_EUI64] = 
"eui64", + [IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_NONE] = "none", + [IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY] = "stable-privacy", + [IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_RANDOM] = "random", +}; + +DEFINE_STRING_TABLE_LOOKUP(ipv6_link_local_address_gen_mode, IPv6LinkLocalAddressGenMode); +DEFINE_CONFIG_PARSE_ENUM( + config_parse_ipv6_link_local_address_gen_mode, + ipv6_link_local_address_gen_mode, + IPv6LinkLocalAddressGenMode, + "Failed to parse IPv6 link-local address generation mode"); diff --git a/src/network/networkd-ipv6ll.h b/src/network/networkd-ipv6ll.h new file mode 100644 index 0000000..2759eed --- /dev/null +++ b/src/network/networkd-ipv6ll.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "macro.h" + +typedef struct Link Link; + +typedef enum IPv6LinkLocalAddressGenMode { + IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_EUI64 = IN6_ADDR_GEN_MODE_EUI64, + IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_NONE = IN6_ADDR_GEN_MODE_NONE, + IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY = IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_RANDOM = IN6_ADDR_GEN_MODE_RANDOM, + _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX, + _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_INVALID = -EINVAL, +} IPv6LinkLocalAddressGenMode; + +bool link_ipv6ll_enabled(Link *link); +bool link_may_have_ipv6ll(Link *link, bool check_multicast); + +IPv6LinkLocalAddressGenMode link_get_ipv6ll_addrgen_mode(Link *link); +int ipv6ll_addrgen_mode_fill_message(sd_netlink_message *message, IPv6LinkLocalAddressGenMode mode); +int link_update_ipv6ll_addrgen_mode(Link *link, sd_netlink_message *message); + +int link_set_ipv6ll_stable_secret(Link *link); +int link_set_ipv6ll_addrgen_mode(Link *link, IPv6LinkLocalAddressGenMode mode); + +const char* ipv6_link_local_address_gen_mode_to_string(IPv6LinkLocalAddressGenMode s) _const_; +IPv6LinkLocalAddressGenMode 
ipv6_link_local_address_gen_mode_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_link_local_address_gen_mode); diff --git a/src/network/networkd-json.c b/src/network/networkd-json.c new file mode 100644 index 0000000..eed8d9f --- /dev/null +++ b/src/network/networkd-json.c @@ -0,0 +1,1434 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "dhcp-server-internal.h" +#include "dhcp6-internal.h" +#include "dhcp6-lease-internal.h" +#include "dns-domain.h" +#include "ip-protocol-list.h" +#include "netif-util.h" +#include "networkd-address.h" +#include "networkd-dhcp-common.h" +#include "networkd-json.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-neighbor.h" +#include "networkd-network.h" +#include "networkd-nexthop.h" +#include "networkd-route-util.h" +#include "networkd-route.h" +#include "networkd-routing-policy-rule.h" +#include "sort-util.h" +#include "udev-util.h" +#include "user-util.h" +#include "wifi-util.h" + +static int address_build_json(Address *address, JsonVariant **ret) { + _cleanup_free_ char *scope = NULL, *flags = NULL, *state = NULL; + int r; + + assert(address); + assert(ret); + + r = route_scope_to_string_alloc(address->scope, &scope); + if (r < 0) + return r; + + r = address_flags_to_string_alloc(address->flags, address->family, &flags); + if (r < 0) + return r; + + r = network_config_state_to_string_alloc(address->state, &state); + if (r < 0) + return r; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_INTEGER("Family", address->family), + JSON_BUILD_PAIR_IN_ADDR("Address", &address->in_addr, address->family), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("Peer", &address->in_addr_peer, address->family), + JSON_BUILD_PAIR_IN4_ADDR_NON_NULL("Broadcast", &address->broadcast), + JSON_BUILD_PAIR_UNSIGNED("PrefixLength", address->prefixlen), + JSON_BUILD_PAIR_UNSIGNED("Scope", address->scope), + JSON_BUILD_PAIR_STRING("ScopeString", scope), + 
JSON_BUILD_PAIR_UNSIGNED("Flags", address->flags), + JSON_BUILD_PAIR_STRING("FlagsString", flags), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Label", address->label), + JSON_BUILD_PAIR_FINITE_USEC("PreferredLifetimeUSec", address->lifetime_preferred_usec), + JSON_BUILD_PAIR_FINITE_USEC("PreferredLifetimeUsec", address->lifetime_preferred_usec), /* for backward compat */ + JSON_BUILD_PAIR_FINITE_USEC("ValidLifetimeUSec", address->lifetime_valid_usec), + JSON_BUILD_PAIR_FINITE_USEC("ValidLifetimeUsec", address->lifetime_valid_usec), /* for backward compat */ + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(address->source)), + JSON_BUILD_PAIR_STRING("ConfigState", state), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ConfigProvider", &address->provider, address->family))); +} + +static int addresses_append_json(Set *addresses, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + Address *address; + int r; + + assert(v); + + SET_FOREACH(address, addresses) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = address_build_json(address, &e); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "Addresses", array); +} + +static int neighbor_build_json(Neighbor *n, JsonVariant **ret) { + _cleanup_free_ char *state = NULL; + int r; + + assert(n); + assert(ret); + + r = network_config_state_to_string_alloc(n->state, &state); + if (r < 0) + return r; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_INTEGER("Family", n->family), + JSON_BUILD_PAIR_IN_ADDR("Destination", &n->in_addr, n->family), + JSON_BUILD_PAIR_HW_ADDR("LinkLayerAddress", &n->ll_addr), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(n->source)), + JSON_BUILD_PAIR_STRING("ConfigState", state))); +} + +static int neighbors_append_json(Set *neighbors, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + 
Neighbor *neighbor; + int r; + + assert(v); + + SET_FOREACH(neighbor, neighbors) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = neighbor_build_json(neighbor, &e); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "Neighbors", array); +} + +static int nexthop_group_build_json(NextHop *nexthop, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + struct nexthop_grp *g; + int r; + + assert(nexthop); + assert(ret); + + HASHMAP_FOREACH(g, nexthop->group) { + r = json_variant_append_arrayb( + &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("ID", g->id), + JSON_BUILD_PAIR_UNSIGNED("Weight", g->weight+1))); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(array); + return 0; +} + +static int nexthop_build_json(NextHop *n, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *group = NULL; + _cleanup_free_ char *flags = NULL, *protocol = NULL, *state = NULL; + int r; + + assert(n); + assert(ret); + + r = route_flags_to_string_alloc(n->flags, &flags); + if (r < 0) + return r; + + r = route_protocol_to_string_alloc(n->protocol, &protocol); + if (r < 0) + return r; + + r = network_config_state_to_string_alloc(n->state, &state); + if (r < 0) + return r; + + r = nexthop_group_build_json(n, &group); + if (r < 0) + return r; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("ID", n->id), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("Gateway", &n->gw, n->family), + JSON_BUILD_PAIR_UNSIGNED("Flags", n->flags), + JSON_BUILD_PAIR_STRING("FlagsString", strempty(flags)), + JSON_BUILD_PAIR_UNSIGNED("Protocol", n->protocol), + JSON_BUILD_PAIR_STRING("ProtocolString", protocol), + JSON_BUILD_PAIR_BOOLEAN("Blackhole", n->blackhole), + JSON_BUILD_PAIR_VARIANT_NON_NULL("Group", group), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(n->source)), + JSON_BUILD_PAIR_STRING("ConfigState", 
state))); +} + +static int nexthops_append_json(Set *nexthops, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + NextHop *nexthop; + int r; + + assert(v); + + SET_FOREACH(nexthop, nexthops) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = nexthop_build_json(nexthop, &e); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "NextHops", array); +} + +static int route_build_json(Route *route, JsonVariant **ret) { + _cleanup_free_ char *scope = NULL, *protocol = NULL, *table = NULL, *flags = NULL, *state = NULL; + Manager *manager; + int r; + + assert(route); + assert(ret); + + manager = route->link ? route->link->manager : route->manager; + + assert(manager); + + r = route_scope_to_string_alloc(route->scope, &scope); + if (r < 0) + return r; + + r = route_protocol_to_string_alloc(route->protocol, &protocol); + if (r < 0) + return r; + + r = manager_get_route_table_to_string(manager, route->table, /* append_num = */ false, &table); + if (r < 0) + return r; + + r = route_flags_to_string_alloc(route->flags, &flags); + if (r < 0) + return r; + + r = network_config_state_to_string_alloc(route->state, &state); + if (r < 0) + return r; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_INTEGER("Family", route->family), + JSON_BUILD_PAIR_IN_ADDR("Destination", &route->dst, route->family), + JSON_BUILD_PAIR_UNSIGNED("DestinationPrefixLength", route->dst_prefixlen), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("Gateway", &route->gw, route->gw_family), + JSON_BUILD_PAIR_CONDITION(route->src_prefixlen > 0, + "Source", JSON_BUILD_IN_ADDR(&route->src, route->family)), + JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("SourcePrefixLength", route->src_prefixlen), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("PreferredSource", &route->prefsrc, route->family), + JSON_BUILD_PAIR_UNSIGNED("Scope", route->scope), + JSON_BUILD_PAIR_STRING("ScopeString", scope), + 
JSON_BUILD_PAIR_UNSIGNED("Protocol", route->protocol), + JSON_BUILD_PAIR_STRING("ProtocolString", protocol), + JSON_BUILD_PAIR_UNSIGNED("Type", route->type), + JSON_BUILD_PAIR_STRING("TypeString", route_type_to_string(route->type)), + JSON_BUILD_PAIR_UNSIGNED("Priority", route->priority), + JSON_BUILD_PAIR_UNSIGNED("Table", route->table), + JSON_BUILD_PAIR_STRING("TableString", table), + JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("MTU", route->mtu), + JSON_BUILD_PAIR_UNSIGNED("Preference", route->pref), + JSON_BUILD_PAIR_UNSIGNED("Flags", route->flags), + JSON_BUILD_PAIR_STRING("FlagsString", strempty(flags)), + JSON_BUILD_PAIR_FINITE_USEC("LifetimeUSec", route->lifetime_usec), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(route->source)), + JSON_BUILD_PAIR_STRING("ConfigState", state), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ConfigProvider", &route->provider, route->family))); +} + +static int routes_append_json(Set *routes, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + Route *route; + int r; + + assert(v); + + SET_FOREACH(route, routes) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = route_build_json(route, &e); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "Routes", array); +} + +static int routing_policy_rule_build_json(RoutingPolicyRule *rule, JsonVariant **ret) { + _cleanup_free_ char *table = NULL, *protocol = NULL, *state = NULL; + int r; + + assert(rule); + assert(rule->manager); + assert(ret); + + r = manager_get_route_table_to_string(rule->manager, rule->table, /* append_num = */ false, &table); + if (r < 0 && r != -EINVAL) + return r; + + r = route_protocol_to_string_alloc(rule->protocol, &protocol); + if (r < 0) + return r; + + r = network_config_state_to_string_alloc(rule->state, &state); + if (r < 0) + return r; + + return json_build(ret, JSON_BUILD_OBJECT( + 
JSON_BUILD_PAIR_INTEGER("Family", rule->family), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("FromPrefix", &rule->from, rule->family), + JSON_BUILD_PAIR_CONDITION(in_addr_is_set(rule->family, &rule->from), + "FromPrefixLength", JSON_BUILD_UNSIGNED(rule->from_prefixlen)), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ToPrefix", &rule->to, rule->family), + JSON_BUILD_PAIR_CONDITION(in_addr_is_set(rule->family, &rule->to), + "ToPrefixLength", JSON_BUILD_UNSIGNED(rule->to_prefixlen)), + JSON_BUILD_PAIR_UNSIGNED("Protocol", rule->protocol), + JSON_BUILD_PAIR_STRING("ProtocolString", protocol), + JSON_BUILD_PAIR_UNSIGNED("TOS", rule->tos), + JSON_BUILD_PAIR_UNSIGNED("Type", rule->type), + JSON_BUILD_PAIR_STRING("TypeString", fr_act_type_full_to_string(rule->type)), + JSON_BUILD_PAIR_UNSIGNED("IPProtocol", rule->ipproto), + JSON_BUILD_PAIR_STRING("IPProtocolString", ip_protocol_to_name(rule->ipproto)), + JSON_BUILD_PAIR_UNSIGNED("Priority", rule->priority), + JSON_BUILD_PAIR_UNSIGNED("FirewallMark", rule->fwmark), + JSON_BUILD_PAIR_UNSIGNED("FirewallMask", rule->fwmask), + JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("Table", rule->table), + JSON_BUILD_PAIR_STRING_NON_EMPTY("TableString", table), + JSON_BUILD_PAIR_BOOLEAN("Invert", rule->invert_rule), + JSON_BUILD_PAIR_CONDITION(rule->suppress_prefixlen >= 0, + "SuppressPrefixLength", JSON_BUILD_UNSIGNED(rule->suppress_prefixlen)), + JSON_BUILD_PAIR_CONDITION(rule->suppress_ifgroup >= 0, + "SuppressInterfaceGroup", JSON_BUILD_UNSIGNED(rule->suppress_ifgroup)), + JSON_BUILD_PAIR_CONDITION(rule->sport.start != 0 || rule->sport.end != 0, "SourcePort", + JSON_BUILD_ARRAY(JSON_BUILD_UNSIGNED(rule->sport.start), JSON_BUILD_UNSIGNED(rule->sport.end))), + JSON_BUILD_PAIR_CONDITION(rule->dport.start != 0 || rule->dport.end != 0, "DestinationPort", + JSON_BUILD_ARRAY(JSON_BUILD_UNSIGNED(rule->dport.start), JSON_BUILD_UNSIGNED(rule->dport.end))), + JSON_BUILD_PAIR_CONDITION(rule->uid_range.start != UID_INVALID && rule->uid_range.end != UID_INVALID, "User", + 
JSON_BUILD_ARRAY(JSON_BUILD_UNSIGNED(rule->uid_range.start), JSON_BUILD_UNSIGNED(rule->uid_range.end))), + JSON_BUILD_PAIR_STRING_NON_EMPTY("IncomingInterface", rule->iif), + JSON_BUILD_PAIR_STRING_NON_EMPTY("OutgoingInterface", rule->oif), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(rule->source)), + JSON_BUILD_PAIR_STRING("ConfigState", state))); +} + +static int routing_policy_rules_append_json(Set *rules, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + RoutingPolicyRule *rule; + int r; + + assert(v); + + SET_FOREACH(rule, rules) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = routing_policy_rule_build_json(rule, &e); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "RoutingPolicyRules", array); +} + +static int network_append_json(Network *network, JsonVariant **v) { + assert(v); + + if (!network) + return 0; + + return json_variant_merge_objectb( + v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("NetworkFile", network->filename), + JSON_BUILD_PAIR_STRV("NetworkFileDropins", network->dropins), + JSON_BUILD_PAIR_BOOLEAN("RequiredForOnline", network->required_for_online), + JSON_BUILD_PAIR("RequiredOperationalStateForOnline", + JSON_BUILD_ARRAY(JSON_BUILD_STRING(link_operstate_to_string(network->required_operstate_for_online.min)), + JSON_BUILD_STRING(link_operstate_to_string(network->required_operstate_for_online.max)))), + JSON_BUILD_PAIR_STRING("RequiredFamilyForOnline", + link_required_address_family_to_string(network->required_family_for_online)), + JSON_BUILD_PAIR_STRING("ActivationPolicy", + activation_policy_to_string(network->activation_policy)))); +} + +static int device_append_json(sd_device *device, JsonVariant **v) { + _cleanup_strv_free_ char **link_dropins = NULL; + const char *link = NULL, *path = NULL, *vendor = NULL, *model = NULL, *joined; + int r; + + assert(v); + + if 
(!device) + return 0; + + (void) sd_device_get_property_value(device, "ID_NET_LINK_FILE", &link); + + if (sd_device_get_property_value(device, "ID_NET_LINK_FILE_DROPINS", &joined) >= 0) { + r = strv_split_full(&link_dropins, joined, ":", EXTRACT_CUNESCAPE); + if (r < 0) + return r; + } + + (void) sd_device_get_property_value(device, "ID_PATH", &path); + + (void) device_get_vendor_string(device, &vendor); + (void) device_get_model_string(device, &model); + + return json_variant_merge_objectb( + v, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING_NON_EMPTY("LinkFile", link), + JSON_BUILD_PAIR_STRV_NON_EMPTY("LinkFileDropins", link_dropins), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Path", path), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Vendor", vendor), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Model", model))); +} + +static int dns_append_json_one(Link *link, const struct in_addr_full *a, NetworkConfigSource s, const union in_addr_union *p, JsonVariant **array) { + assert(link); + assert(a); + assert(array); + + if (a->ifindex != 0 && a->ifindex != link->ifindex) + return 0; + + return json_variant_append_arrayb( + array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_INTEGER("Family", a->family), + JSON_BUILD_PAIR_IN_ADDR("Address", &a->address, a->family), + JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("Port", a->port), + JSON_BUILD_PAIR_CONDITION(a->ifindex != 0, "InterfaceIndex", JSON_BUILD_INTEGER(a->ifindex)), + JSON_BUILD_PAIR_STRING_NON_EMPTY("ServerName", a->server_name), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(s)), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ConfigProvider", p, a->family))); +} + +static int dns_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + int r; + + assert(link); + assert(v); + + if (!link->network) + return 0; + + if (link->n_dns != UINT_MAX) + for (unsigned i = 0; i < link->n_dns; i++) { + r = dns_append_json_one(link, link->dns[i], NETWORK_CONFIG_SOURCE_RUNTIME, NULL, &array); + if (r < 0) 
+ return r; + } + else { + for (unsigned i = 0; i < link->network->n_dns; i++) { + r = dns_append_json_one(link, link->network->dns[i], NETWORK_CONFIG_SOURCE_STATIC, NULL, &array); + if (r < 0) + return r; + } + + if (link->dhcp_lease && link->network->dhcp_use_dns) { + const struct in_addr *dns; + union in_addr_union s; + int n_dns; + + r = sd_dhcp_lease_get_server_identifier(link->dhcp_lease, &s.in); + if (r < 0) + return r; + + n_dns = sd_dhcp_lease_get_dns(link->dhcp_lease, &dns); + for (int i = 0; i < n_dns; i++) { + r = dns_append_json_one(link, + &(struct in_addr_full) { .family = AF_INET, .address.in = dns[i], }, + NETWORK_CONFIG_SOURCE_DHCP4, + &s, + &array); + if (r < 0) + return r; + } + } + + if (link->dhcp6_lease && link->network->dhcp6_use_dns) { + const struct in6_addr *dns; + union in_addr_union s; + int n_dns; + + r = sd_dhcp6_lease_get_server_address(link->dhcp6_lease, &s.in6); + if (r < 0) + return r; + + n_dns = sd_dhcp6_lease_get_dns(link->dhcp6_lease, &dns); + for (int i = 0; i < n_dns; i++) { + r = dns_append_json_one(link, + &(struct in_addr_full) { .family = AF_INET6, .address.in6 = dns[i], }, + NETWORK_CONFIG_SOURCE_DHCP6, + &s, + &array); + if (r < 0) + return r; + } + } + + if (link->network->ipv6_accept_ra_use_dns) { + NDiscRDNSS *a; + + SET_FOREACH(a, link->ndisc_rdnss) { + r = dns_append_json_one(link, + &(struct in_addr_full) { .family = AF_INET6, .address.in6 = a->address, }, + NETWORK_CONFIG_SOURCE_NDISC, + &(union in_addr_union) { .in6 = a->router }, + &array); + if (r < 0) + return r; + } + } + } + + return json_variant_set_field_non_null(v, "DNS", array); +} + +static int server_append_json_one_addr(int family, const union in_addr_union *a, NetworkConfigSource s, const union in_addr_union *p, JsonVariant **array) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(a); + assert(array); + + r = json_build(&v, JSON_BUILD_OBJECT( + 
JSON_BUILD_PAIR_INTEGER("Family", family), + JSON_BUILD_PAIR_IN_ADDR("Address", a, family), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(s)), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ConfigProvider", p, family))); + if (r < 0) + return r; + + return json_variant_append_array(array, v); +} + +static int server_append_json_one_fqdn(int family, const char *fqdn, NetworkConfigSource s, const union in_addr_union *p, JsonVariant **array) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert(IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6)); + assert(fqdn); + assert(array); + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("Server", fqdn), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(s)), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ConfigProvider", p, family))); + if (r < 0) + return r; + + return json_variant_append_array(array, v); +} + +static int server_append_json_one_string(const char *str, NetworkConfigSource s, JsonVariant **array) { + union in_addr_union a; + int family; + + assert(str); + + if (in_addr_from_string_auto(str, &family, &a) >= 0) + return server_append_json_one_addr(family, &a, s, NULL, array); + + return server_append_json_one_fqdn(AF_UNSPEC, str, s, NULL, array); +} + +static int ntp_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + int r; + + assert(link); + assert(v); + + if (!link->network) + return 0; + + STRV_FOREACH(p, link->ntp ?: link->network->ntp) { + r = server_append_json_one_string(*p, NETWORK_CONFIG_SOURCE_RUNTIME, &array); + if (r < 0) + return r; + } + + if (!link->ntp) { + if (link->dhcp_lease && link->network->dhcp_use_ntp) { + const struct in_addr *ntp; + union in_addr_union s; + int n_ntp; + + r = sd_dhcp_lease_get_server_identifier(link->dhcp_lease, &s.in); + if (r < 0) + return r; + + n_ntp = sd_dhcp_lease_get_ntp(link->dhcp_lease, &ntp); + for (int i = 0; i < n_ntp; i++) { + r = 
server_append_json_one_addr(AF_INET, + &(union in_addr_union) { .in = ntp[i], }, + NETWORK_CONFIG_SOURCE_DHCP4, + &s, + &array); + if (r < 0) + return r; + } + } + + if (link->dhcp6_lease && link->network->dhcp6_use_ntp) { + const struct in6_addr *ntp_addr; + union in_addr_union s; + char **ntp_fqdn; + int n_ntp; + + r = sd_dhcp6_lease_get_server_address(link->dhcp6_lease, &s.in6); + if (r < 0) + return r; + + n_ntp = sd_dhcp6_lease_get_ntp_addrs(link->dhcp6_lease, &ntp_addr); + for (int i = 0; i < n_ntp; i++) { + r = server_append_json_one_addr(AF_INET6, + &(union in_addr_union) { .in6 = ntp_addr[i], }, + NETWORK_CONFIG_SOURCE_DHCP6, + &s, + &array); + if (r < 0) + return r; + } + + n_ntp = sd_dhcp6_lease_get_ntp_fqdn(link->dhcp6_lease, &ntp_fqdn); + for (int i = 0; i < n_ntp; i++) { + r = server_append_json_one_fqdn(AF_INET6, + ntp_fqdn[i], + NETWORK_CONFIG_SOURCE_DHCP6, + &s, + &array); + if (r < 0) + return r; + } + } + } + + return json_variant_set_field_non_null(v, "NTP", array); +} + +static int sip_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + const struct in_addr *sip; + union in_addr_union s; + int n_sip, r; + + assert(link); + assert(v); + + if (!link->network || !link->network->dhcp_use_sip || !link->dhcp_lease) + return 0; + + n_sip = sd_dhcp_lease_get_sip(link->dhcp_lease, &sip); + if (n_sip <= 0) + return 0; + + r = sd_dhcp_lease_get_server_identifier(link->dhcp_lease, &s.in); + if (r < 0) + return r; + + for (int i = 0; i < n_sip; i++) { + r = server_append_json_one_addr(AF_INET, + &(union in_addr_union) { .in = sip[i], }, + NETWORK_CONFIG_SOURCE_DHCP4, + &s, + &array); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "SIP", array); +} + +static int domain_append_json(int family, const char *domain, NetworkConfigSource s, const union in_addr_union *p, JsonVariant **array) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert(IN_SET(family, 
AF_UNSPEC, AF_INET, AF_INET6)); + assert(domain); + assert(array); + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("Domain", domain), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(s)), + JSON_BUILD_PAIR_IN_ADDR_NON_NULL("ConfigProvider", p, family))); + if (r < 0) + return r; + + return json_variant_append_array(array, v); +} + +static int domains_append_json(Link *link, bool is_route, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + OrderedSet *link_domains, *network_domains; + DHCPUseDomains use_domains; + union in_addr_union s; + char **domains; + const char *domain; + int r; + + assert(link); + assert(v); + + if (!link->network) + return 0; + + link_domains = is_route ? link->route_domains : link->search_domains; + network_domains = is_route ? link->network->route_domains : link->network->search_domains; + use_domains = is_route ? DHCP_USE_DOMAINS_ROUTE : DHCP_USE_DOMAINS_YES; + + ORDERED_SET_FOREACH(domain, link_domains ?: network_domains) { + r = domain_append_json(AF_UNSPEC, domain, + link_domains ? 
NETWORK_CONFIG_SOURCE_RUNTIME : NETWORK_CONFIG_SOURCE_STATIC, + NULL, &array); + if (r < 0) + return r; + } + + if (!link_domains) { + if (link->dhcp_lease && + link->network->dhcp_use_domains == use_domains) { + r = sd_dhcp_lease_get_server_identifier(link->dhcp_lease, &s.in); + if (r < 0) + return r; + + if (sd_dhcp_lease_get_domainname(link->dhcp_lease, &domain) >= 0) { + r = domain_append_json(AF_INET, domain, NETWORK_CONFIG_SOURCE_DHCP4, &s, &array); + if (r < 0) + return r; + } + + if (sd_dhcp_lease_get_search_domains(link->dhcp_lease, &domains) >= 0) + STRV_FOREACH(p, domains) { + r = domain_append_json(AF_INET, *p, NETWORK_CONFIG_SOURCE_DHCP4, &s, &array); + if (r < 0) + return r; + } + } + + if (link->dhcp6_lease && + link->network->dhcp6_use_domains == use_domains) { + r = sd_dhcp6_lease_get_server_address(link->dhcp6_lease, &s.in6); + if (r < 0) + return r; + + if (sd_dhcp6_lease_get_domains(link->dhcp6_lease, &domains) >= 0) + STRV_FOREACH(p, domains) { + r = domain_append_json(AF_INET6, *p, NETWORK_CONFIG_SOURCE_DHCP6, &s, &array); + if (r < 0) + return r; + } + } + + if (link->network->ipv6_accept_ra_use_domains == use_domains) { + NDiscDNSSL *a; + + SET_FOREACH(a, link->ndisc_dnssl) { + r = domain_append_json(AF_INET6, NDISC_DNSSL_DOMAIN(a), NETWORK_CONFIG_SOURCE_NDISC, + &(union in_addr_union) { .in6 = a->router }, + &array); + if (r < 0) + return r; + } + } + } + + return json_variant_set_field_non_null(v, is_route ? 
"RouteDomains" : "SearchDomains", array); +} + +static int nta_append_json(const char *nta, NetworkConfigSource s, JsonVariant **array) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert(nta); + assert(array); + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("DNSSECNegativeTrustAnchor", nta), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(s)))); + if (r < 0) + return r; + + return json_variant_append_array(array, v); +} + +static int ntas_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + const char *nta; + int r; + + assert(link); + assert(v); + + if (!link->network) + return 0; + + SET_FOREACH(nta, link->dnssec_negative_trust_anchors ?: link->network->dnssec_negative_trust_anchors) { + r = nta_append_json(nta, + link->dnssec_negative_trust_anchors ? NETWORK_CONFIG_SOURCE_RUNTIME : NETWORK_CONFIG_SOURCE_STATIC, + &array); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "DNSSECNegativeTrustAnchors", array); +} + +static int dns_misc_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + ResolveSupport resolve_support; + NetworkConfigSource source; + DnsOverTlsMode mode; + int t, r; + + assert(link); + assert(v); + + if (!link->network) + return 0; + + resolve_support = link->llmnr >= 0 ? link->llmnr : link->network->llmnr; + if (resolve_support >= 0) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + source = link->llmnr >= 0 ? NETWORK_CONFIG_SOURCE_RUNTIME : NETWORK_CONFIG_SOURCE_STATIC; + + r = json_build(&e, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("LLMNR", resolve_support_to_string(resolve_support)), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(source)))); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + resolve_support = link->mdns >= 0 ? 
link->mdns : link->network->mdns; + if (resolve_support >= 0) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + source = link->mdns >= 0 ? NETWORK_CONFIG_SOURCE_RUNTIME : NETWORK_CONFIG_SOURCE_STATIC; + + r = json_build(&e, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("MDNS", resolve_support_to_string(resolve_support)), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(source)))); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + t = link->dns_default_route >= 0 ? link->dns_default_route : link->network->dns_default_route; + if (t >= 0) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + source = link->dns_default_route >= 0 ? NETWORK_CONFIG_SOURCE_RUNTIME : NETWORK_CONFIG_SOURCE_STATIC; + + r = json_build(&e, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_BOOLEAN("DNSDefaultRoute", t), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(source)))); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + mode = link->dns_over_tls_mode >= 0 ? link->dns_over_tls_mode : link->network->dns_over_tls_mode; + if (mode >= 0) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + source = link->dns_over_tls_mode >= 0 ? 
NETWORK_CONFIG_SOURCE_RUNTIME : NETWORK_CONFIG_SOURCE_STATIC; + + r = json_build(&e, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("DNSOverTLS", dns_over_tls_mode_to_string(mode)), + JSON_BUILD_PAIR_STRING("ConfigSource", network_config_source_to_string(source)))); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "DNSSettings", array); +} +/* Adds a "CaptivePortal" string to *v. The result of link_get_captive_portal() is returned as-is when it is zero or negative; only a positive result merges the field. */ +static int captive_portal_append_json(Link *link, JsonVariant **v) { + const char *captive_portal; + int r; + + assert(link); + assert(v); + + r = link_get_captive_portal(link, &captive_portal); + if (r <= 0) + return r; + + return json_variant_merge_objectb(v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("CaptivePortal", captive_portal))); +} +/* Appends one object per entry of link->ndisc_pref64 (Prefix, PrefixLength, LifetimeUSec, ConfigProvider) to a "PREF64" array and stores that array under "NDisc" in *v. Returns 0 early when the link has no network or ipv6_accept_ra_use_pref64 is off. */ +static int pref64_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL, *w = NULL; + NDiscPREF64 *i; + int r; + + assert(link); + assert(v); + + if (!link->network || !link->network->ipv6_accept_ra_use_pref64) + return 0; + + SET_FOREACH(i, link->ndisc_pref64) { + r = json_variant_append_arrayb(&array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_IN6_ADDR_NON_NULL("Prefix", &i->prefix), + JSON_BUILD_PAIR_UNSIGNED("PrefixLength", i->prefix_len), + JSON_BUILD_PAIR_FINITE_USEC("LifetimeUSec", i->lifetime_usec), + JSON_BUILD_PAIR_IN6_ADDR_NON_NULL("ConfigProvider", &i->router))); + if (r < 0) + return r; + } +/* Wrap the array in the intermediate object w so that it ends up as NDisc.PREF64. NOTE(review): presumably the _non_null setter skips a NULL value, so an empty set adds nothing - confirm. */ + r = json_variant_set_field_non_null(&w, "PREF64", array); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "NDisc", w); +} +/* Appends a "Leases" array to *v with one object (ClientId, Address, Hostname, ExpirationUSec) per entry of link->dhcp_server->bound_leases_by_client_id. Returns 0 without doing anything when link->dhcp_server is NULL. */ +static int dhcp_server_offered_leases_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + DHCPLease *lease; + int r; + + assert(link); + assert(v); + + if (!link->dhcp_server) + return 0; + + HASHMAP_FOREACH(lease, link->dhcp_server->bound_leases_by_client_id) { + struct in_addr address = { .s_addr = lease->address }; + + r = json_variant_append_arrayb( 
+ &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_BYTE_ARRAY( + "ClientId", + lease->client_id.data, + lease->client_id.length), + JSON_BUILD_PAIR_IN4_ADDR_NON_NULL("Address", &address), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Hostname", lease->hostname), + JSON_BUILD_PAIR_FINITE_USEC( + "ExpirationUSec", lease->expiration))); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "Leases", array); +} +/* Appends a "StaticLeases" array to *v with one object (ClientId, Address) per entry of link->dhcp_server->static_leases_by_client_id. Returns 0 without doing anything when link->dhcp_server is NULL. */ +static int dhcp_server_static_leases_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + DHCPLease *lease; + int r; + + assert(link); + assert(v); + + if (!link->dhcp_server) + return 0; + + HASHMAP_FOREACH(lease, link->dhcp_server->static_leases_by_client_id) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + struct in_addr address = { .s_addr = lease->address }; + + r = json_build(&e, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_BYTE_ARRAY( + "ClientId", + lease->client_id.data, + lease->client_id.length), + JSON_BUILD_PAIR_IN4_ADDR_NON_NULL("Address", &address))); + if (r < 0) + return r; + + r = json_variant_append_array(&array, e); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "StaticLeases", array); +} +/* Builds the "DHCPServer" object (PoolOffset, PoolSize, then the offered and static lease arrays) and attaches it to *v. Returns 0 early when link->dhcp_server is NULL. */ +static int dhcp_server_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + int r; + + assert(link); + assert(v); + + if (!link->dhcp_server) + return 0; + + r = json_build(&w, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("PoolOffset", link->dhcp_server->pool_offset), + JSON_BUILD_PAIR_UNSIGNED("PoolSize", link->dhcp_server->pool_size))); + if (r < 0) + return r; + + r = dhcp_server_offered_leases_append_json(link, &w); + if (r < 0) + return r; + + r = dhcp_server_static_leases_append_json(link, &w); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "DHCPServer", w); +} +/* Appends a "VendorSpecificOptions" array to *v with one object (EnterpriseId, SubOptionCode, SubOptionData as hex) per option reported by sd_dhcp6_lease_get_vendor_options(). Returns 0 early when link->dhcp6_lease is NULL. */ +static int dhcp6_client_vendor_options_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) 
JsonVariant *array = NULL; + sd_dhcp6_option **options = NULL; + int r, n_vendor_options; + + assert(link); + assert(v); + + if (!link->dhcp6_lease) + return 0; +/* NOTE(review): the return value is used directly as the element count and is never tested for < 0 here; presumably FOREACH_ARRAY treats a non-positive count as empty - confirm. */ + n_vendor_options = sd_dhcp6_lease_get_vendor_options(link->dhcp6_lease, &options); + + FOREACH_ARRAY(option, options, n_vendor_options) { + r = json_variant_append_arrayb(&array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("EnterpriseId", (*option)->enterprise_identifier), + JSON_BUILD_PAIR_UNSIGNED("SubOptionCode", (*option)->option), + JSON_BUILD_PAIR_HEX("SubOptionData", (*option)->data, (*option)->length))); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "VendorSpecificOptions", array); +} +/* Builds the "Lease" object (Timeout1USec, Timeout2USec, LeaseTimestampUSec, all read with CLOCK_BOOTTIME) and attaches it to *v. -ENODATA from a getter is not treated as an error; the value then keeps its USEC_INFINITY initializer. Returns 0 early when link->dhcp6_lease is NULL. */ +static int dhcp6_client_lease_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + usec_t ts = USEC_INFINITY, t1 = USEC_INFINITY, t2 = USEC_INFINITY; + int r; + + assert(link); + assert(v); + + if (!link->dhcp6_lease) + return 0; + + r = sd_dhcp6_lease_get_timestamp(link->dhcp6_lease, CLOCK_BOOTTIME, &ts); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_dhcp6_lease_get_t1_timestamp(link->dhcp6_lease, CLOCK_BOOTTIME, &t1); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_dhcp6_lease_get_t2_timestamp(link->dhcp6_lease, CLOCK_BOOTTIME, &t2); + if (r < 0 && r != -ENODATA) + return r; + + r = json_build(&w, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_FINITE_USEC("Timeout1USec", t1), + JSON_BUILD_PAIR_FINITE_USEC("Timeout2USec", t2), + JSON_BUILD_PAIR_FINITE_USEC("LeaseTimestampUSec", ts))); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "Lease", w); +} +/* Appends a "Prefixes" array to *v with one object (Prefix, PrefixLength, PreferredLifetimeUSec, ValidLifetimeUSec) per delegated prefix visited by FOREACH_DHCP6_PD_PREFIX. Asserts link->network. Returns 0 early when dhcp6_use_pd_prefix is off or sd_dhcp6_lease_has_pd_prefix() is false. */ +static int dhcp6_client_pd_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + int r; + + assert(link); + assert(link->network); + assert(v); + + if (!link->network->dhcp6_use_pd_prefix || + !sd_dhcp6_lease_has_pd_prefix(link->dhcp6_lease)) + return 0; + + FOREACH_DHCP6_PD_PREFIX(link->dhcp6_lease) { + usec_t 
lifetime_preferred_usec, lifetime_valid_usec; + struct in6_addr prefix; + uint8_t prefix_len; + + r = sd_dhcp6_lease_get_pd_prefix(link->dhcp6_lease, &prefix, &prefix_len); + if (r < 0) + return r; + + r = sd_dhcp6_lease_get_pd_lifetime_timestamp(link->dhcp6_lease, CLOCK_BOOTTIME, + &lifetime_preferred_usec, &lifetime_valid_usec); + if (r < 0) + return r; + + r = json_variant_append_arrayb(&array, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_IN6_ADDR("Prefix", &prefix), + JSON_BUILD_PAIR_UNSIGNED("PrefixLength", prefix_len), + JSON_BUILD_PAIR_FINITE_USEC("PreferredLifetimeUSec", lifetime_preferred_usec), + JSON_BUILD_PAIR_FINITE_USEC("ValidLifetimeUSec", lifetime_valid_usec))); + if (r < 0) + return r; + } + + return json_variant_set_field_non_null(v, "Prefixes", array); +} +/* Builds the "DHCPv6Client" object from the lease, delegated-prefix and vendor-option helpers and attaches it to *v. Returns 0 early when link->dhcp6_client is NULL. */ +static int dhcp6_client_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + int r; + + assert(link); + assert(v); + + if (!link->dhcp6_client) + return 0; + + r = dhcp6_client_lease_append_json(link, &w); + if (r < 0) + return r; + + r = dhcp6_client_pd_append_json(link, &w); + if (r < 0) + return r; + + r = dhcp6_client_vendor_options_append_json(link, &w); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "DHCPv6Client", w); +} +/* Builds the "Lease" object (LeaseTimestampUSec, Timeout1USec, Timeout2USec, all read with CLOCK_BOOTTIME) and attaches it to *v. -ENODATA from a getter is tolerated and leaves the USEC_INFINITY initializer in place. Returns 0 early when link->dhcp_client or link->dhcp_lease is NULL. */ +static int dhcp_client_lease_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + usec_t lease_timestamp_usec = USEC_INFINITY, t1 = USEC_INFINITY, t2 = USEC_INFINITY; + int r; + + assert(link); + assert(v); + + if (!link->dhcp_client || !link->dhcp_lease) + return 0; + + r = sd_dhcp_lease_get_timestamp(link->dhcp_lease, CLOCK_BOOTTIME, &lease_timestamp_usec); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_dhcp_lease_get_t1_timestamp(link->dhcp_lease, CLOCK_BOOTTIME, &t1); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_dhcp_lease_get_t2_timestamp(link->dhcp_lease, CLOCK_BOOTTIME, &t2); + if (r < 0 && r != -ENODATA) + return r; + + r = 
json_build(&w, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_FINITE_USEC("LeaseTimestampUSec", lease_timestamp_usec), + JSON_BUILD_PAIR_FINITE_USEC("Timeout1USec", t1), + JSON_BUILD_PAIR_FINITE_USEC("Timeout2USec", t2))); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "Lease", w); +} +/* Builds the "6rdPrefix" object (Prefix, PrefixLength, IPv4MaskLength, BorderRouters) from the values returned by sd_dhcp_lease_get_6rd() and attaches it to *v. Asserts link->network. Returns 0 early when dhcp_use_6rd is off or sd_dhcp_lease_has_6rd() is false. */ +static int dhcp_client_pd_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *addresses = NULL, *array = NULL; + uint8_t ipv4masklen, sixrd_prefixlen; + struct in6_addr sixrd_prefix; + const struct in_addr *br_addresses; + size_t n_br_addresses = 0; + int r; + + assert(link); + assert(link->network); + assert(v); + + if (!link->network->dhcp_use_6rd || !sd_dhcp_lease_has_6rd(link->dhcp_lease)) + return 0; + + r = sd_dhcp_lease_get_6rd(link->dhcp_lease, &ipv4masklen, &sixrd_prefixlen, &sixrd_prefix, &br_addresses, &n_br_addresses); + if (r < 0) + return r; + + FOREACH_ARRAY(br_address, br_addresses, n_br_addresses) { + r = json_variant_append_arrayb(&addresses, JSON_BUILD_IN4_ADDR(br_address)); + if (r < 0) + return r; + } +/* Despite its name, array holds a single JSON object here; the actual list of border routers is in addresses. */ + r = json_build(&array, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_IN6_ADDR("Prefix", &sixrd_prefix), + JSON_BUILD_PAIR_UNSIGNED("PrefixLength", sixrd_prefixlen), + JSON_BUILD_PAIR_UNSIGNED("IPv4MaskLength", ipv4masklen), + JSON_BUILD_PAIR_VARIANT_NON_NULL("BorderRouters", addresses))); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "6rdPrefix", array); +} +/* Builds the "DHCPv4Client" object from the lease and 6rd helpers and attaches it to *v. Returns 0 early when link->dhcp_client is NULL. */ +static int dhcp_client_append_json(Link *link, JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + int r; + + assert(link); + assert(v); + + if (!link->dhcp_client) + return 0; + + r = dhcp_client_lease_append_json(link, &w); + if (r < 0) + return r; + + r = dhcp_client_pd_append_json(link, &w); + if (r < 0) + return r; + + return json_variant_set_field_non_null(v, "DHCPv4Client", w); +} +/* Builds the JSON object describing a link: a fixed set of basic, wlan and state fields, followed by the sections added by the individual append helpers called below. On success the object is moved into *ret and 0 is returned; a negative value from any step is returned unchanged. */ +int link_build_json(Link *link, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_free_ 
char *type = NULL, *flags = NULL; + int r; + + assert(link); + assert(ret); +/* Only -ENOMEM aborts here; for any other result the code carries on with whatever type holds (still NULL unless the call set it). */ + r = net_get_type_string(link->dev, link->iftype, &type); + if (r == -ENOMEM) + return r; + + r = link_flags_to_string_alloc(link->flags, &flags); + if (r < 0) + return r; + + r = json_build(&v, JSON_BUILD_OBJECT( + /* basic information */ + JSON_BUILD_PAIR_INTEGER("Index", link->ifindex), + JSON_BUILD_PAIR_STRING("Name", link->ifname), + JSON_BUILD_PAIR_STRV_NON_EMPTY("AlternativeNames", link->alternative_names), + JSON_BUILD_PAIR_CONDITION(link->master_ifindex > 0, + "MasterInterfaceIndex", JSON_BUILD_INTEGER(link->master_ifindex)), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Kind", link->kind), + JSON_BUILD_PAIR_STRING("Type", type), + JSON_BUILD_PAIR_STRING_NON_EMPTY("Driver", link->driver), + JSON_BUILD_PAIR_UNSIGNED("Flags", link->flags), + JSON_BUILD_PAIR_STRING("FlagsString", flags), + JSON_BUILD_PAIR_UNSIGNED("KernelOperationalState", link->kernel_operstate), + JSON_BUILD_PAIR_STRING("KernelOperationalStateString", kernel_operstate_to_string(link->kernel_operstate)), + JSON_BUILD_PAIR_UNSIGNED("MTU", link->mtu), + JSON_BUILD_PAIR_UNSIGNED("MinimumMTU", link->min_mtu), + JSON_BUILD_PAIR_UNSIGNED("MaximumMTU", link->max_mtu), + JSON_BUILD_PAIR_HW_ADDR_NON_NULL("HardwareAddress", &link->hw_addr), + JSON_BUILD_PAIR_HW_ADDR_NON_NULL("PermanentHardwareAddress", &link->permanent_hw_addr), + JSON_BUILD_PAIR_HW_ADDR_NON_NULL("BroadcastAddress", &link->bcast_addr), + JSON_BUILD_PAIR_IN6_ADDR_NON_NULL("IPv6LinkLocalAddress", &link->ipv6ll_address), + /* wlan information */ + JSON_BUILD_PAIR_CONDITION(link->wlan_iftype > 0, "WirelessLanInterfaceType", + JSON_BUILD_UNSIGNED(link->wlan_iftype)), + JSON_BUILD_PAIR_CONDITION(link->wlan_iftype > 0, "WirelessLanInterfaceTypeString", + JSON_BUILD_STRING(nl80211_iftype_to_string(link->wlan_iftype))), + JSON_BUILD_PAIR_STRING_NON_EMPTY("SSID", link->ssid), + JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL("BSSID", &link->bssid), + /* link state */ + 
JSON_BUILD_PAIR_STRING("AdministrativeState", link_state_to_string(link->state)), + JSON_BUILD_PAIR_STRING("OperationalState", link_operstate_to_string(link->operstate)), + JSON_BUILD_PAIR_STRING("CarrierState", link_carrier_state_to_string(link->carrier_state)), + JSON_BUILD_PAIR_STRING("AddressState", link_address_state_to_string(link->address_state)), + JSON_BUILD_PAIR_STRING("IPv4AddressState", link_address_state_to_string(link->ipv4_address_state)), + JSON_BUILD_PAIR_STRING("IPv6AddressState", link_address_state_to_string(link->ipv6_address_state)), + JSON_BUILD_PAIR_STRING("OnlineState", link_online_state_to_string(link->online_state)))); + if (r < 0) + return r; + + r = network_append_json(link->network, &v); + if (r < 0) + return r; + + r = device_append_json(link->dev, &v); + if (r < 0) + return r; + + r = dns_append_json(link, &v); + if (r < 0) + return r; + + r = ntp_append_json(link, &v); + if (r < 0) + return r; + + r = sip_append_json(link, &v); + if (r < 0) + return r; + + r = domains_append_json(link, /* is_route = */ false, &v); + if (r < 0) + return r; + + r = domains_append_json(link, /* is_route = */ true, &v); + if (r < 0) + return r; + + r = ntas_append_json(link, &v); + if (r < 0) + return r; + + r = dns_misc_append_json(link, &v); + if (r < 0) + return r; + + r = captive_portal_append_json(link, &v); + if (r < 0) + return r; + + r = pref64_append_json(link, &v); + if (r < 0) + return r; + + r = addresses_append_json(link->addresses, &v); + if (r < 0) + return r; + + r = neighbors_append_json(link->neighbors, &v); + if (r < 0) + return r; + + r = nexthops_append_json(link->nexthops, &v); + if (r < 0) + return r; + + r = routes_append_json(link->routes, &v); + if (r < 0) + return r; + + r = dhcp_server_append_json(link, &v); + if (r < 0) + return r; + + r = dhcp_client_append_json(link, &v); + if (r < 0) + return r; + + r = dhcp6_client_append_json(link, &v); + if (r < 0) + return r; +/* All sections were added successfully: hand the finished object to the caller through *ret. */ + *ret = TAKE_PTR(v); + return 0; +} + +static int 
/* Builds the JSON description of the whole manager and hands it to the caller through *ret.
 *
 * The document is assembled in a fixed order: interfaces, then nexthops, routes and routing policy
 * rules taken from the manager. On any failure the partially built variant is released and the
 * negative error is returned; *ret is only written on success. */
int manager_build_json(Manager *manager, JsonVariant **ret) {
        _cleanup_(json_variant_unrefp) JsonVariant *doc = NULL;
        int r;

        assert(manager);
        assert(ret);

        /* Per-link information */
        r = links_append_json(manager, &doc);
        if (r < 0)
                return r;

        /* Manager-wide routing state */
        r = nexthops_append_json(manager->nexthops, &doc);
        if (r < 0)
                return r;

        r = routes_append_json(manager->routes, &doc);
        if (r < 0)
                return r;

        r = routing_policy_rules_append_json(manager->rules, &doc);
        if (r < 0)
                return r;

        /* Ownership moves to the caller. */
        *ret = TAKE_PTR(doc);
        return 0;
}
"bus-polkit.h" +#include "dns-domain.h" +#include "networkd-dhcp4.h" +#include "networkd-json.h" +#include "networkd-link-bus.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-state-file.h" +#include "parse-util.h" +#include "resolve-util.h" +#include "socket-netlink.h" +#include "strv.h" +#include "user-util.h" + +BUS_DEFINE_PROPERTY_GET_ENUM(property_get_operational_state, link_operstate, LinkOperationalState); +BUS_DEFINE_PROPERTY_GET_ENUM(property_get_carrier_state, link_carrier_state, LinkCarrierState); +BUS_DEFINE_PROPERTY_GET_ENUM(property_get_address_state, link_address_state, LinkAddressState); +BUS_DEFINE_PROPERTY_GET_ENUM(property_get_online_state, link_online_state, LinkOnlineState); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_administrative_state, link_state, LinkState); + +static int property_get_bit_rates( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Link *link = ASSERT_PTR(userdata); + Manager *manager; + double interval_sec; + uint64_t tx, rx; + + assert(bus); + assert(reply); + + manager = link->manager; + + if (!manager->use_speed_meter || + manager->speed_meter_usec_old == 0 || + !link->stats_updated) + return sd_bus_message_append(reply, "(tt)", UINT64_MAX, UINT64_MAX); + + assert(manager->speed_meter_usec_new > manager->speed_meter_usec_old); + interval_sec = (manager->speed_meter_usec_new - manager->speed_meter_usec_old) / USEC_PER_SEC; + + if (link->stats_new.tx_bytes > link->stats_old.tx_bytes) + tx = (uint64_t) ((link->stats_new.tx_bytes - link->stats_old.tx_bytes) / interval_sec); + else + tx = (uint64_t) ((UINT64_MAX - (link->stats_old.tx_bytes - link->stats_new.tx_bytes)) / interval_sec); + + if (link->stats_new.rx_bytes > link->stats_old.rx_bytes) + rx = (uint64_t) ((link->stats_new.rx_bytes - link->stats_old.rx_bytes) / interval_sec); + else + rx = (uint64_t) ((UINT64_MAX - 
(link->stats_old.rx_bytes - link->stats_new.rx_bytes)) / interval_sec); + + return sd_bus_message_append(reply, "(tt)", tx, rx); +} + +static int verify_managed_link(Link *l, sd_bus_error *error) { + assert(l); + + if (l->flags & IFF_LOOPBACK) + return sd_bus_error_setf(error, BUS_ERROR_LINK_BUSY, "Link %s is loopback device.", l->ifname); + + return 0; +} + +int bus_link_method_set_ntp_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **ntp = NULL; + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &ntp); + if (r < 0) + return r; + + STRV_FOREACH(i, ntp) { + r = dns_name_is_valid_or_address(*i); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NTP server: %s", *i); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-ntp-servers", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + strv_free_and_replace(l->ntp, ntp); + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +static int bus_link_method_set_dns_servers_internal(sd_bus_message *message, void *userdata, sd_bus_error *error, bool extended) { + struct in_addr_full **dns; + Link *l = ASSERT_PTR(userdata); + size_t n; + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = bus_message_read_dns_servers(message, error, extended, &dns, &n); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-dns-servers", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + goto finalize; + if (r == 0) { + r = 1; /* Polkit will call us 
back */ + goto finalize; + } + + if (l->n_dns != UINT_MAX) + for (unsigned i = 0; i < l->n_dns; i++) + in_addr_full_free(l->dns[i]); + + free_and_replace(l->dns, dns); + l->n_dns = n; + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); + +finalize: + for (size_t i = 0; i < n; i++) + in_addr_full_free(dns[i]); + free(dns); + + return r; +} + +int bus_link_method_set_dns_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_link_method_set_dns_servers_internal(message, userdata, error, false); +} + +int bus_link_method_set_dns_servers_ex(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_link_method_set_dns_servers_internal(message, userdata, error, true); +} + +int bus_link_method_set_domains(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_ordered_set_free_ OrderedSet *search_domains = NULL, *route_domains = NULL; + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "(sb)"); + if (r < 0) + return r; + + search_domains = ordered_set_new(&string_hash_ops_free); + if (!search_domains) + return -ENOMEM; + + route_domains = ordered_set_new(&string_hash_ops_free); + if (!route_domains) + return -ENOMEM; + + for (;;) { + _cleanup_free_ char *str = NULL; + const char *name; + int route_only; + + r = sd_bus_message_read(message, "(sb)", &name, &route_only); + if (r < 0) + return r; + if (r == 0) + break; + + r = dns_name_is_valid(name); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid search domain %s", name); + if (!route_only && dns_name_is_root(name)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Root domain is not suitable as search domain"); + + r = dns_name_normalize(name, 0, &str); + if (r < 
0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid search domain %s", name); + + r = ordered_set_consume(route_only ? route_domains : search_domains, TAKE_PTR(str)); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-domains", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + ordered_set_free(l->search_domains); + ordered_set_free(l->route_domains); + l->search_domains = TAKE_PTR(search_domains); + l->route_domains = TAKE_PTR(route_domains); + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_default_route(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r, b; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-default-route", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + if (l->dns_default_route != b) { + l->dns_default_route = b; + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_llmnr(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + ResolveSupport mode; + const char *llmnr; + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &llmnr); + if 
(r < 0) + return r; + + if (isempty(llmnr)) + mode = RESOLVE_SUPPORT_YES; + else { + mode = resolve_support_from_string(llmnr); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid LLMNR setting: %s", llmnr); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-llmnr", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + if (l->llmnr != mode) { + l->llmnr = mode; + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_mdns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + ResolveSupport mode; + const char *mdns; + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &mdns); + if (r < 0) + return r; + + if (isempty(mdns)) + mode = RESOLVE_SUPPORT_NO; + else { + mode = resolve_support_from_string(mdns); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid MulticastDNS setting: %s", mdns); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-mdns", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + if (l->mdns != mode) { + l->mdns = mode; + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_dns_over_tls(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + const char *dns_over_tls; + DnsOverTlsMode mode; + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = 
sd_bus_message_read(message, "s", &dns_over_tls); + if (r < 0) + return r; + + if (isempty(dns_over_tls)) + mode = _DNS_OVER_TLS_MODE_INVALID; + else { + mode = dns_over_tls_mode_from_string(dns_over_tls); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid DNSOverTLS setting: %s", dns_over_tls); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-dns-over-tls", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + if (l->dns_over_tls_mode != mode) { + l->dns_over_tls_mode = mode; + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_dnssec(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + const char *dnssec; + DnssecMode mode; + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &dnssec); + if (r < 0) + return r; + + if (isempty(dnssec)) + mode = _DNSSEC_MODE_INVALID; + else { + mode = dnssec_mode_from_string(dnssec); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid DNSSEC setting: %s", dnssec); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-dnssec", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + if (l->dnssec_mode != mode) { + l->dnssec_mode = mode; + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_dnssec_negative_trust_anchors(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_set_free_free_ Set *ns 
= NULL; + _cleanup_strv_free_ char **ntas = NULL; + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read_strv(message, &ntas); + if (r < 0) + return r; + + STRV_FOREACH(i, ntas) { + r = dns_name_is_valid(*i); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid negative trust anchor domain: %s", *i); + } + + ns = set_new(&dns_name_hash_ops); + if (!ns) + return -ENOMEM; + + STRV_FOREACH(i, ntas) { + r = set_put_strdup(&ns, *i); + if (r < 0) + return r; + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.set-dnssec-negative-trust-anchors", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + set_free_free(l->dnssec_negative_trust_anchors); + l->dnssec_negative_trust_anchors = TAKE_PTR(ns); + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_revert_ntp(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_managed_link(l, error); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.revert-ntp", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + link_ntp_settings_clear(l); + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_revert_dns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_managed_link(l, error); + 
if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.revert-dns", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + link_dns_settings_clear(l); + + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_force_renew(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r; + + if (!l->network) + return sd_bus_error_setf(error, BUS_ERROR_UNMANAGED_INTERFACE, + "Interface %s is not managed by systemd-networkd", + l->ifname); + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.forcerenew", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + if (sd_dhcp_server_is_running(l->dhcp_server)) { + r = sd_dhcp_server_forcerenew(l->dhcp_server); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_renew(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r; + + if (!l->network) + return sd_bus_error_setf(error, BUS_ERROR_UNMANAGED_INTERFACE, + "Interface %s is not managed by systemd-networkd", + l->ifname); + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.renew", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + r = dhcp4_renew(l); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_reconfigure(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = 
bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.reconfigure", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + r = link_reconfigure(l, /* force = */ true); + if (r < 0) + return r; + if (r > 0) { + link_set_state(l, LINK_STATE_INITIALIZED); + r = link_save_and_clean_full(l, /* also_save_manager = */ true); + if (r < 0) + return r; + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_describe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_free_ char *text = NULL; + Link *link = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = link_build_json(link, &v); + if (r < 0) + return log_link_error_errno(link, r, "Failed to build JSON data: %m"); + + r = json_variant_format(v, 0, &text); + if (r < 0) + return log_link_error_errno(link, r, "Failed to format JSON data: %m"); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", text); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static const sd_bus_vtable link_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("OperationalState", "s", property_get_operational_state, offsetof(Link, operstate), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CarrierState", "s", property_get_carrier_state, offsetof(Link, carrier_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AddressState", "s", property_get_address_state, offsetof(Link, address_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IPv4AddressState", "s", property_get_address_state, offsetof(Link, ipv4_address_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IPv6AddressState", "s", 
property_get_address_state, offsetof(Link, ipv6_address_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("OnlineState", "s", property_get_online_state, offsetof(Link, online_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AdministrativeState", "s", property_get_administrative_state, offsetof(Link, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("BitRates", "(tt)", property_get_bit_rates, 0, 0), + + SD_BUS_METHOD_WITH_ARGS("SetNTP", + SD_BUS_ARGS("as", servers), + SD_BUS_NO_RESULT, + bus_link_method_set_ntp_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNS", + SD_BUS_ARGS("a(iay)", addresses), + SD_BUS_NO_RESULT, + bus_link_method_set_dns_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSEx", + SD_BUS_ARGS("a(iayqs)", addresses), + SD_BUS_NO_RESULT, + bus_link_method_set_dns_servers_ex, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDomains", + SD_BUS_ARGS("a(sb)", domains), + SD_BUS_NO_RESULT, + bus_link_method_set_domains, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDefaultRoute", + SD_BUS_ARGS("b", enable), + SD_BUS_NO_RESULT, + bus_link_method_set_default_route, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLLMNR", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_llmnr, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetMulticastDNS", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_mdns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSOverTLS", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_dns_over_tls, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSSEC", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_dnssec, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSSECNegativeTrustAnchors", + SD_BUS_ARGS("as", names), + SD_BUS_NO_RESULT, + bus_link_method_set_dnssec_negative_trust_anchors, + 
SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RevertNTP", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_link_method_revert_ntp, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RevertDNS", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_link_method_revert_dns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Renew", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_link_method_renew, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ForceRenew", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_link_method_force_renew, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Reconfigure", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_link_method_reconfigure, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Describe", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", json), + bus_link_method_describe, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +char *link_bus_path(Link *link) { + _cleanup_free_ char *ifindex = NULL; + char *p; + int r; + + assert(link); + assert(link->ifindex > 0); + + if (asprintf(&ifindex, "%d", link->ifindex) < 0) + return NULL; + + r = sd_bus_path_encode("/org/freedesktop/network1/link", ifindex, &p); + if (r < 0) + return NULL; + + return p; +} + +int link_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = ASSERT_PTR(userdata); + unsigned c = 0; + Link *link; + + assert(bus); + assert(path); + assert(nodes); + + l = new0(char*, hashmap_size(m->links_by_index) + 1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(link, m->links_by_index) { + char *p; + + p = link_bus_path(link); + if (!p) + return -ENOMEM; + + l[c++] = p; + } + + l[c] = NULL; + *nodes = TAKE_PTR(l); + + return 1; +} + +int link_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + _cleanup_free_ char *identifier = NULL; + Manager *m = ASSERT_PTR(userdata); + Link *link; + int 
ifindex, r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = sd_bus_path_decode(path, "/org/freedesktop/network1/link", &identifier); + if (r <= 0) + return 0; + + ifindex = parse_ifindex(identifier); + if (ifindex < 0) + return 0; + + r = link_get_by_index(m, ifindex, &link); + if (r < 0) + return 0; + + if (streq(interface, "org.freedesktop.network1.DHCPServer") && + (!link->dhcp_server || sd_dhcp_server_is_in_relay_mode(link->dhcp_server))) + return 0; + + if (streq(interface, "org.freedesktop.network1.DHCPv4Client") && !link->dhcp_client) + return 0; + + if (streq(interface, "org.freedesktop.network1.DHCPv6Client") && !link->dhcp6_client) + return 0; + + *found = link; + + return 1; +} + +int link_send_changed_strv(Link *link, char **properties) { + _cleanup_free_ char *p = NULL; + + assert(link); + assert(link->manager); + assert(properties); + + if (sd_bus_is_ready(link->manager->bus) <= 0) + return 0; + + p = link_bus_path(link); + if (!p) + return -ENOMEM; + + return sd_bus_emit_properties_changed_strv( + link->manager->bus, + p, + "org.freedesktop.network1.Link", + properties); +} + +int link_send_changed(Link *link, const char *property, ...) 
{ + char **properties; + + properties = strv_from_stdarg_alloca(property); + + return link_send_changed_strv(link, properties); +} + +const BusObjectImplementation link_object = { + "/org/freedesktop/network1/link", + "org.freedesktop.network1.Link", + .fallback_vtables = BUS_FALLBACK_VTABLES({link_vtable, link_object_find}), + .node_enumerator = link_node_enumerator, +}; diff --git a/src/network/networkd-link-bus.h b/src/network/networkd-link-bus.h new file mode 100644 index 0000000..924d997 --- /dev/null +++ b/src/network/networkd-link-bus.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" +#include "macro.h" + +typedef struct Link Link; + +extern const BusObjectImplementation link_object; + +char *link_bus_path(Link *link); +int link_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error); +int link_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error); +int link_send_changed_strv(Link *link, char **properties); +int link_send_changed(Link *link, const char *property, ...) 
_sentinel_; + +int property_get_operational_state(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int property_get_carrier_state(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int property_get_address_state(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int property_get_online_state(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + +int bus_link_method_set_ntp_servers(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dns_servers(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dns_servers_ex(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_domains(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_default_route(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_llmnr(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_mdns(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dns_over_tls(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dnssec(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dnssec_negative_trust_anchors(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_revert_ntp(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_revert_dns(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_renew(sd_bus_message *message, void *userdata, sd_bus_error *error); +int 
bus_link_method_force_renew(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_reconfigure(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_describe(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/network/networkd-link.c b/src/network/networkd-link.c new file mode 100644 index 0000000..4ef1be4 --- /dev/null +++ b/src/network/networkd-link.c @@ -0,0 +1,2773 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "arphrd-util.h" +#include "batadv.h" +#include "bond.h" +#include "bridge.h" +#include "bus-util.h" +#include "device-private.h" +#include "device-util.h" +#include "dhcp-identifier.h" +#include "dhcp-lease-internal.h" +#include "env-file.h" +#include "ethtool-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "logarithm.h" +#include "missing_network.h" +#include "netlink-util.h" +#include "network-internal.h" +#include "networkd-address-label.h" +#include "networkd-address.h" +#include "networkd-bridge-fdb.h" +#include "networkd-bridge-mdb.h" +#include "networkd-can.h" +#include "networkd-dhcp-prefix-delegation.h" +#include "networkd-dhcp-server.h" +#include "networkd-dhcp4.h" +#include "networkd-dhcp6.h" +#include "networkd-ipv4acd.h" +#include "networkd-ipv4ll.h" +#include "networkd-ipv6-proxy-ndp.h" +#include "networkd-link-bus.h" +#include "networkd-link.h" +#include "networkd-lldp-tx.h" +#include "networkd-manager.h" +#include "networkd-ndisc.h" +#include "networkd-neighbor.h" +#include "networkd-nexthop.h" +#include "networkd-queue.h" +#include "networkd-radv.h" +#include "networkd-route-util.h" +#include "networkd-route.h" +#include "networkd-routing-policy-rule.h" +#include "networkd-setlink.h" +#include "networkd-sriov.h" 
+#include "networkd-state-file.h" +#include "networkd-sysctl.h" +#include "networkd-wifi.h" +#include "set.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "tc.h" +#include "tmpfile-util.h" +#include "tuntap.h" +#include "udev-util.h" +#include "vrf.h" + +bool link_ipv6_enabled(Link *link) { + assert(link); + + if (!socket_ipv6_is_supported()) + return false; + + if (link->iftype == ARPHRD_CAN) + return false; + + if (!link->network) + return false; + + if (link->network->bond) + return false; + + if (link_may_have_ipv6ll(link, /* check_multicast = */ false)) + return true; + + if (network_has_static_ipv6_configurations(link->network)) + return true; + + return false; +} + +bool link_has_ipv6_connectivity(Link *link) { + LinkAddressState ipv6_address_state; + + assert(link); + + link_get_address_states(link, NULL, &ipv6_address_state, NULL); + + switch (ipv6_address_state) { + case LINK_ADDRESS_STATE_ROUTABLE: + /* If the interface has a routable IPv6 address, then we assume yes. */ + return true; + + case LINK_ADDRESS_STATE_DEGRADED: + /* If the interface has only degraded IPv6 address (mostly, link-local address), then let's check + * there is an IPv6 default gateway. */ + return link_has_default_gateway(link, AF_INET6); + + case LINK_ADDRESS_STATE_OFF: + /* No IPv6 address. 
*/ + return false; + + default: + assert_not_reached(); + } +} + +static bool link_is_ready_to_configure_one(Link *link, bool allow_unmanaged) { + assert(link); + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED, LINK_STATE_UNMANAGED)) + return false; + + if (!link->network) + return allow_unmanaged; + + if (!link->network->configure_without_carrier) { + if (link->set_flags_messages > 0) + return false; + + if (!link_has_carrier(link)) + return false; + } + + if (link->set_link_messages > 0) + return false; + + if (!link->activated) + return false; + + return true; +} + +bool link_is_ready_to_configure(Link *link, bool allow_unmanaged) { + return check_ready_for_all_sr_iov_ports(link, allow_unmanaged, link_is_ready_to_configure_one); +} + +void link_ntp_settings_clear(Link *link) { + link->ntp = strv_free(link->ntp); +} + +void link_dns_settings_clear(Link *link) { + if (link->n_dns != UINT_MAX) + for (unsigned i = 0; i < link->n_dns; i++) + in_addr_full_free(link->dns[i]); + link->dns = mfree(link->dns); + link->n_dns = UINT_MAX; + + link->search_domains = ordered_set_free(link->search_domains); + link->route_domains = ordered_set_free(link->route_domains); + + link->dns_default_route = -1; + link->llmnr = _RESOLVE_SUPPORT_INVALID; + link->mdns = _RESOLVE_SUPPORT_INVALID; + link->dnssec_mode = _DNSSEC_MODE_INVALID; + link->dns_over_tls_mode = _DNS_OVER_TLS_MODE_INVALID; + + link->dnssec_negative_trust_anchors = set_free_free(link->dnssec_negative_trust_anchors); +} + +static void link_free_engines(Link *link) { + if (!link) + return; + + link->dhcp_server = sd_dhcp_server_unref(link->dhcp_server); + + link->dhcp_client = sd_dhcp_client_unref(link->dhcp_client); + link->dhcp_lease = sd_dhcp_lease_unref(link->dhcp_lease); + link->dhcp4_6rd_tunnel_name = mfree(link->dhcp4_6rd_tunnel_name); + + link->lldp_rx = sd_lldp_rx_unref(link->lldp_rx); + link->lldp_tx = sd_lldp_tx_unref(link->lldp_tx); + + link->ipv4acd_by_address = 
hashmap_free(link->ipv4acd_by_address); + + link->ipv4ll = sd_ipv4ll_unref(link->ipv4ll); + + link->dhcp6_client = sd_dhcp6_client_unref(link->dhcp6_client); + link->dhcp6_lease = sd_dhcp6_lease_unref(link->dhcp6_lease); + + link->ndisc = sd_ndisc_unref(link->ndisc); + link->ndisc_expire = sd_event_source_disable_unref(link->ndisc_expire); + ndisc_flush(link); + + link->radv = sd_radv_unref(link->radv); +} + +static Link *link_free(Link *link) { + assert(link); + + link_ntp_settings_clear(link); + link_dns_settings_clear(link); + + link->routes = set_free(link->routes); + link->nexthops = set_free(link->nexthops); + link->neighbors = set_free(link->neighbors); + link->addresses = set_free(link->addresses); + link->qdiscs = set_free(link->qdiscs); + link->tclasses = set_free(link->tclasses); + + link->dhcp_pd_prefixes = set_free(link->dhcp_pd_prefixes); + + link_free_engines(link); + + set_free(link->sr_iov_virt_port_ifindices); + free(link->ifname); + strv_free(link->alternative_names); + free(link->kind); + free(link->ssid); + free(link->previous_ssid); + free(link->driver); + + unlink_and_free(link->lease_file); + unlink_and_free(link->lldp_file); + unlink_and_free(link->state_file); + + sd_device_unref(link->dev); + netdev_unref(link->netdev); + + hashmap_free(link->bound_to_links); + hashmap_free(link->bound_by_links); + + set_free_with_destructor(link->slaves, link_unref); + + network_unref(link->network); + + sd_event_source_disable_unref(link->carrier_lost_timer); + + return mfree(link); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(Link, link, link_free); + +int link_get_by_index(Manager *m, int ifindex, Link **ret) { + Link *link; + + assert(m); + assert(ifindex > 0); + + link = hashmap_get(m->links_by_index, INT_TO_PTR(ifindex)); + if (!link) + return -ENODEV; + + if (ret) + *ret = link; + return 0; +} + +int link_get_by_name(Manager *m, const char *ifname, Link **ret) { + Link *link; + + assert(m); + assert(ifname); + + link = hashmap_get(m->links_by_name, 
ifname); + if (!link) + return -ENODEV; + + if (ret) + *ret = link; + return 0; +} + +int link_get_by_hw_addr(Manager *m, const struct hw_addr_data *hw_addr, Link **ret) { + Link *link; + + assert(m); + assert(hw_addr); + + link = hashmap_get(m->links_by_hw_addr, hw_addr); + if (!link) + return -ENODEV; + + if (ret) + *ret = link; + return 0; +} + +int link_get_master(Link *link, Link **ret) { + assert(link); + assert(link->manager); + assert(ret); + + if (link->master_ifindex <= 0 || link->master_ifindex == link->ifindex) + return -ENODEV; + + return link_get_by_index(link->manager, link->master_ifindex, ret); +} + +void link_set_state(Link *link, LinkState state) { + assert(link); + + if (link->state == state) + return; + + log_link_debug(link, "State changed: %s -> %s", + link_state_to_string(link->state), + link_state_to_string(state)); + + link->state = state; + + link_send_changed(link, "AdministrativeState", NULL); + link_dirty(link); +} + +int link_stop_engines(Link *link, bool may_keep_dhcp) { + int r = 0, k; + + assert(link); + assert(link->manager); + assert(link->manager->event); + + bool keep_dhcp = may_keep_dhcp && + link->network && + !link->network->dhcp_send_decline && /* IPv4 ACD for the DHCPv4 address is running. 
*/ + (link->manager->restarting || + FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP_ON_STOP)); + + if (!keep_dhcp) { + k = sd_dhcp_client_stop(link->dhcp_client); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop DHCPv4 client: %m"); + } + + k = sd_dhcp_server_stop(link->dhcp_server); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop DHCPv4 server: %m"); + + k = sd_lldp_rx_stop(link->lldp_rx); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop LLDP Rx: %m"); + + k = sd_lldp_tx_stop(link->lldp_tx); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop LLDP Tx: %m"); + + k = sd_ipv4ll_stop(link->ipv4ll); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop IPv4 link-local: %m"); + + k = ipv4acd_stop(link); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop IPv4 ACD client: %m"); + + k = sd_dhcp6_client_stop(link->dhcp6_client); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop DHCPv6 client: %m"); + + k = dhcp_pd_remove(link, /* only_marked = */ false); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not remove DHCPv6 PD addresses and routes: %m"); + + k = ndisc_stop(link); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop IPv6 Router Discovery: %m"); + + ndisc_flush(link); + + k = sd_radv_stop(link->radv); + if (k < 0) + r = log_link_warning_errno(link, k, "Could not stop IPv6 Router Advertisement: %m"); + + return r; +} + +void link_enter_failed(Link *link) { + assert(link); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return; + + log_link_warning(link, "Failed"); + + link_set_state(link, LINK_STATE_FAILED); + + (void) link_stop_engines(link, false); +} + +void link_check_ready(Link *link) { + Address *a; + + assert(link); + + if (link->state == LINK_STATE_CONFIGURED) + return; + + if (link->state != LINK_STATE_CONFIGURING) + return (void) log_link_debug(link, "%s(): link is in %s 
state.", __func__, link_state_to_string(link->state)); + + if (!link->network) + return (void) log_link_debug(link, "%s(): link is unmanaged.", __func__); + + if (!link->tc_configured) + return (void) log_link_debug(link, "%s(): traffic controls are not configured.", __func__); + + if (link->set_link_messages > 0) + return (void) log_link_debug(link, "%s(): link layer is configuring.", __func__); + + if (!link->activated) + return (void) log_link_debug(link, "%s(): link is not activated.", __func__); + + if (link->iftype == ARPHRD_CAN) { + /* let's shortcut things for CAN which doesn't need most of checks below. */ + link_set_state(link, LINK_STATE_CONFIGURED); + return; + } + + if (!link->stacked_netdevs_created) + return (void) log_link_debug(link, "%s(): stacked netdevs are not created.", __func__); + + if (!link->static_addresses_configured) + return (void) log_link_debug(link, "%s(): static addresses are not configured.", __func__); + + if (!link->static_address_labels_configured) + return (void) log_link_debug(link, "%s(): static address labels are not configured.", __func__); + + if (!link->static_bridge_fdb_configured) + return (void) log_link_debug(link, "%s(): static bridge FDB entries are not configured.", __func__); + + if (!link->static_bridge_mdb_configured) + return (void) log_link_debug(link, "%s(): static bridge MDB entries are not configured.", __func__); + + if (!link->static_ipv6_proxy_ndp_configured) + return (void) log_link_debug(link, "%s(): static IPv6 proxy NDP addresses are not configured.", __func__); + + if (!link->static_neighbors_configured) + return (void) log_link_debug(link, "%s(): static neighbors are not configured.", __func__); + + if (!link->static_nexthops_configured) + return (void) log_link_debug(link, "%s(): static nexthops are not configured.", __func__); + + if (!link->static_routes_configured) + return (void) log_link_debug(link, "%s(): static routes are not configured.", __func__); + + if 
(!link->static_routing_policy_rules_configured) + return (void) log_link_debug(link, "%s(): static routing policy rules are not configured.", __func__); + + if (!link->sr_iov_configured) + return (void) log_link_debug(link, "%s(): SR-IOV is not configured.", __func__); + + /* IPv6LL is assigned after the link gains its carrier. */ + if (!link->network->configure_without_carrier && + link_ipv6ll_enabled(link) && + !in6_addr_is_set(&link->ipv6ll_address)) + return (void) log_link_debug(link, "%s(): IPv6LL is not configured yet.", __func__); + + /* All static addresses must be ready. */ + bool has_static_address = false; + SET_FOREACH(a, link->addresses) { + if (a->source != NETWORK_CONFIG_SOURCE_STATIC) + continue; + if (!address_is_ready(a)) + return (void) log_link_debug(link, "%s(): static address %s is not ready.", __func__, + IN_ADDR_PREFIX_TO_STRING(a->family, &a->in_addr, a->prefixlen)); + has_static_address = true; + } + + /* If at least one static address is requested, do not request that dynamic addressing protocols are finished. */ + if (has_static_address) + goto ready; + + /* If no dynamic addressing protocol enabled, assume the interface is ready. + * Note, ignore NDisc when ConfigureWithoutCarrier= is enabled, as IPv6AcceptRA= is enabled by default. 
*/ + if (!link_ipv4ll_enabled(link) && !link_dhcp4_enabled(link) && + !link_dhcp6_enabled(link) && !link_dhcp_pd_is_enabled(link) && + (link->network->configure_without_carrier || !link_ipv6_accept_ra_enabled(link))) + goto ready; + + bool ipv4ll_ready = + link_ipv4ll_enabled(link) && link->ipv4ll_address_configured && + link_check_addresses_ready(link, NETWORK_CONFIG_SOURCE_IPV4LL); + bool dhcp4_ready = + link_dhcp4_enabled(link) && link->dhcp4_configured && + link_check_addresses_ready(link, NETWORK_CONFIG_SOURCE_DHCP4); + bool dhcp6_ready = + link_dhcp6_enabled(link) && link->dhcp6_configured && + (!link->network->dhcp6_use_address || + link_check_addresses_ready(link, NETWORK_CONFIG_SOURCE_DHCP6)); + bool dhcp_pd_ready = + link_dhcp_pd_is_enabled(link) && link->dhcp_pd_configured && + (!link->network->dhcp_pd_assign || + link_check_addresses_ready(link, NETWORK_CONFIG_SOURCE_DHCP_PD)); + bool ndisc_ready = + link_ipv6_accept_ra_enabled(link) && link->ndisc_configured && + (!link->network->ipv6_accept_ra_use_autonomous_prefix || + link_check_addresses_ready(link, NETWORK_CONFIG_SOURCE_NDISC)); + + /* If the uplink for PD is self, then request the corresponding DHCP protocol is also ready. 
*/ + if (dhcp_pd_is_uplink(link, link, /* accept_auto = */ false)) { + if (link_dhcp4_enabled(link) && link->network->dhcp_use_6rd && + sd_dhcp_lease_has_6rd(link->dhcp_lease)) { + if (!link->dhcp4_configured) + return (void) log_link_debug(link, "%s(): DHCPv4 6rd prefix is assigned, but DHCPv4 protocol is not finished yet.", __func__); + if (!dhcp_pd_ready) + return (void) log_link_debug(link, "%s(): DHCPv4 is finished, but prefix acquired by DHCPv4-6rd is not assigned yet.", __func__); + } + + if (link_dhcp6_enabled(link) && link->network->dhcp6_use_pd_prefix && + sd_dhcp6_lease_has_pd_prefix(link->dhcp6_lease)) { + if (!link->dhcp6_configured) + return (void) log_link_debug(link, "%s(): DHCPv6 IA_PD prefix is assigned, but DHCPv6 protocol is not finished yet.", __func__); + if (!dhcp_pd_ready) + return (void) log_link_debug(link, "%s(): DHCPv6 is finished, but prefix acquired by DHCPv6 IA_PD is not assigned yet.", __func__); + } + } + + /* At least one dynamic addressing protocol is finished. 
*/ + if (!ipv4ll_ready && !dhcp4_ready && !dhcp6_ready && !dhcp_pd_ready && !ndisc_ready) + return (void) log_link_debug(link, "%s(): dynamic addressing protocols are enabled but none of them finished yet.", __func__); + + log_link_debug(link, "%s(): IPv4LL:%s DHCPv4:%s DHCPv6:%s DHCP-PD:%s NDisc:%s", + __func__, + yes_no(ipv4ll_ready), + yes_no(dhcp4_ready), + yes_no(dhcp6_ready), + yes_no(dhcp_pd_ready), + yes_no(ndisc_ready)); + +ready: + link_set_state(link, LINK_STATE_CONFIGURED); +} + +static int link_request_static_configs(Link *link) { + int r; + + assert(link); + assert(link->network); + assert(link->state != _LINK_STATE_INVALID); + + r = link_request_static_addresses(link); + if (r < 0) + return r; + + r = link_request_static_address_labels(link); + if (r < 0) + return r; + + r = link_request_static_bridge_fdb(link); + if (r < 0) + return r; + + r = link_request_static_bridge_mdb(link); + if (r < 0) + return r; + + r = link_request_static_ipv6_proxy_ndp_addresses(link); + if (r < 0) + return r; + + r = link_request_static_neighbors(link); + if (r < 0) + return r; + + r = link_request_static_nexthops(link, false); + if (r < 0) + return r; + + r = link_request_static_routes(link, false); + if (r < 0) + return r; + + r = link_request_static_routing_policy_rules(link); + if (r < 0) + return r; + + return 0; +} + +static int link_request_stacked_netdevs(Link *link) { + NetDev *netdev; + int r; + + assert(link); + + link->stacked_netdevs_created = false; + + HASHMAP_FOREACH(netdev, link->network->stacked_netdevs) { + r = link_request_stacked_netdev(link, netdev); + if (r < 0) + return r; + } + + if (link->create_stacked_netdev_messages == 0) { + link->stacked_netdevs_created = true; + link_check_ready(link); + } + + return 0; +} + +static int link_acquire_dynamic_ipv6_conf(Link *link) { + int r; + + assert(link); + + r = radv_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start IPv6 Router Advertisement engine: %m"); + + r = 
ndisc_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start IPv6 Router Discovery: %m"); + + r = dhcp6_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start DHCPv6 client: %m"); + + return 0; +} + +static int link_acquire_dynamic_ipv4_conf(Link *link) { + int r; + + assert(link); + assert(link->manager); + assert(link->manager->event); + + if (link->dhcp_client) { + r = dhcp4_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start DHCPv4 client: %m"); + + log_link_debug(link, "Acquiring DHCPv4 lease."); + + } else if (link->ipv4ll) { + if (in4_addr_is_set(&link->network->ipv4ll_start_address)) { + r = sd_ipv4ll_set_address(link->ipv4ll, &link->network->ipv4ll_start_address); + if (r < 0) + return log_link_warning_errno(link, r, "Could not set IPv4 link-local start address: %m"); + } + + r = sd_ipv4ll_start(link->ipv4ll); + if (r < 0) + return log_link_warning_errno(link, r, "Could not acquire IPv4 link-local address: %m"); + + log_link_debug(link, "Acquiring IPv4 link-local address."); + } + + if (link->dhcp_server) { + r = sd_dhcp_server_start(link->dhcp_server); + if (r < 0) + return log_link_warning_errno(link, r, "Could not start DHCP server: %m"); + } + + r = ipv4acd_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Could not start IPv4 ACD client: %m"); + + return 0; +} + +static int link_acquire_dynamic_conf(Link *link) { + int r; + + assert(link); + assert(link->network); + + r = link_acquire_dynamic_ipv4_conf(link); + if (r < 0) + return r; + + if (in6_addr_is_set(&link->ipv6ll_address)) { + r = link_acquire_dynamic_ipv6_conf(link); + if (r < 0) + return r; + } + + if (!link_radv_enabled(link) || !link->network->dhcp_pd_announce) { + /* DHCPv6PD downstream does not require IPv6LL address. But may require RADV to be + * configured, and RADV may not be configured yet here. 
Only acquire subnet prefix when + * RADV is disabled, or the announcement of the prefix is disabled. Otherwise, the + * below will be called in radv_start(). */ + r = dhcp_request_prefix_delegation(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request DHCP delegated subnet prefix: %m"); + } + + if (link->lldp_tx) { + r = sd_lldp_tx_start(link->lldp_tx); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start LLDP transmission: %m"); + } + + if (link->lldp_rx) { + r = sd_lldp_rx_start(link->lldp_rx); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start LLDP client: %m"); + } + + return 0; +} + +int link_ipv6ll_gained(Link *link) { + int r; + + assert(link); + + log_link_info(link, "Gained IPv6LL"); + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return 0; + + r = link_acquire_dynamic_ipv6_conf(link); + if (r < 0) + return r; + + link_check_ready(link); + return 0; +} + +int link_handle_bound_to_list(Link *link) { + bool required_up = false; + bool link_is_up = false; + Link *l; + + assert(link); + + /* If at least one interface in bound_to_links has carrier, then make this interface up. + * If all interfaces in bound_to_links do not, then make this interface down. */ + + if (hashmap_isempty(link->bound_to_links)) + return 0; + + if (link->flags & IFF_UP) + link_is_up = true; + + HASHMAP_FOREACH(l, link->bound_to_links) + if (link_has_carrier(l)) { + required_up = true; + break; + } + + if (!required_up && link_is_up) + return link_request_to_bring_up_or_down(link, /* up = */ false); + if (required_up && !link_is_up) + return link_request_to_bring_up_or_down(link, /* up = */ true); + + return 0; +} + +static int link_handle_bound_by_list(Link *link) { + Link *l; + int r; + + assert(link); + + /* Update up or down state of interfaces which depend on this interface's carrier state. 
*/ + + if (hashmap_isempty(link->bound_by_links)) + return 0; + + HASHMAP_FOREACH(l, link->bound_by_links) { + r = link_handle_bound_to_list(l); + if (r < 0) + return r; + } + + return 0; +} + +static int link_put_carrier(Link *link, Link *carrier, Hashmap **h) { + int r; + + assert(link); + assert(carrier); + + if (link == carrier) + return 0; + + if (hashmap_get(*h, INT_TO_PTR(carrier->ifindex))) + return 0; + + r = hashmap_ensure_put(h, NULL, INT_TO_PTR(carrier->ifindex), carrier); + if (r < 0) + return r; + + link_dirty(link); + + return 0; +} + +static int link_new_bound_by_list(Link *link) { + Manager *m; + Link *carrier; + int r; + + assert(link); + assert(link->manager); + + m = link->manager; + + HASHMAP_FOREACH(carrier, m->links_by_index) { + if (!carrier->network) + continue; + + if (strv_isempty(carrier->network->bind_carrier)) + continue; + + if (strv_fnmatch(carrier->network->bind_carrier, link->ifname)) { + r = link_put_carrier(link, carrier, &link->bound_by_links); + if (r < 0) + return r; + } + } + + HASHMAP_FOREACH(carrier, link->bound_by_links) { + r = link_put_carrier(carrier, link, &carrier->bound_to_links); + if (r < 0) + return r; + } + + return 0; +} + +static int link_new_bound_to_list(Link *link) { + Manager *m; + Link *carrier; + int r; + + assert(link); + assert(link->manager); + + if (!link->network) + return 0; + + if (strv_isempty(link->network->bind_carrier)) + return 0; + + m = link->manager; + + HASHMAP_FOREACH(carrier, m->links_by_index) { + if (strv_fnmatch(link->network->bind_carrier, carrier->ifname)) { + r = link_put_carrier(link, carrier, &link->bound_to_links); + if (r < 0) + return r; + } + } + + HASHMAP_FOREACH(carrier, link->bound_to_links) { + r = link_put_carrier(carrier, link, &carrier->bound_by_links); + if (r < 0) + return r; + } + + return 0; +} + +static void link_free_bound_to_list(Link *link) { + bool updated = false; + Link *bound_to; + + assert(link); + + while ((bound_to = 
hashmap_steal_first(link->bound_to_links))) { + updated = true; + + if (hashmap_remove(bound_to->bound_by_links, INT_TO_PTR(link->ifindex))) + link_dirty(bound_to); + } + + if (updated) + link_dirty(link); +} + +static void link_free_bound_by_list(Link *link) { + bool updated = false; + Link *bound_by; + + assert(link); + + while ((bound_by = hashmap_steal_first(link->bound_by_links))) { + updated = true; + + if (hashmap_remove(bound_by->bound_to_links, INT_TO_PTR(link->ifindex))) { + link_dirty(bound_by); + link_handle_bound_to_list(bound_by); + } + } + + if (updated) + link_dirty(link); +} + +static int link_append_to_master(Link *link) { + Link *master; + int r; + + assert(link); + + /* - The link may have no master. + * - RTM_NEWLINK message about master interface may not be received yet. */ + if (link_get_master(link, &master) < 0) + return 0; + + r = set_ensure_put(&master->slaves, NULL, link); + if (r <= 0) + return r; + + link_ref(link); + return 0; +} + +static void link_drop_from_master(Link *link) { + Link *master; + + assert(link); + + if (!link->manager) + return; + + if (link_get_master(link, &master) < 0) + return; + + link_unref(set_remove(master->slaves, link)); +} + +static void link_drop_requests(Link *link) { + Request *req; + + assert(link); + assert(link->manager); + + ORDERED_SET_FOREACH(req, link->manager->request_queue) + if (req->link == link) + request_detach(link->manager, req); +} + +static Link *link_drop(Link *link) { + if (!link) + return NULL; + + assert(link->manager); + + link_set_state(link, LINK_STATE_LINGER); + + /* Drop all references from other links and manager. Note that async netlink calls may have + * references to the link, and they will be dropped when we receive replies. 
*/ + + link_drop_requests(link); + + link_free_bound_to_list(link); + link_free_bound_by_list(link); + + link_clear_sr_iov_ifindices(link); + + link_drop_from_master(link); + + if (link->state_file) + (void) unlink(link->state_file); + + link_clean(link); + + STRV_FOREACH(n, link->alternative_names) + hashmap_remove(link->manager->links_by_name, *n); + hashmap_remove(link->manager->links_by_name, link->ifname); + + /* bonding master and its slaves have the same hardware address. */ + hashmap_remove_value(link->manager->links_by_hw_addr, &link->hw_addr, link); + + /* The following must be called at last. */ + assert_se(hashmap_remove(link->manager->links_by_index, INT_TO_PTR(link->ifindex)) == link); + return link_unref(link); +} + +static int link_drop_foreign_config(Link *link) { + int r; + + assert(link); + assert(link->manager); + + /* Drop foreign config, but ignore unmanaged, loopback, or critical interfaces. We do not want + * to remove loopback address or addresses used for root NFS. 
*/ + + if (IN_SET(link->state, LINK_STATE_UNMANAGED, LINK_STATE_PENDING, LINK_STATE_INITIALIZED)) + return 0; + if (FLAGS_SET(link->flags, IFF_LOOPBACK)) + return 0; + if (link->network->keep_configuration == KEEP_CONFIGURATION_YES) + return 0; + + r = link_drop_foreign_routes(link); + + RET_GATHER(r, link_drop_foreign_nexthops(link)); + RET_GATHER(r, link_drop_foreign_addresses(link)); + RET_GATHER(r, link_drop_foreign_neighbors(link)); + RET_GATHER(r, manager_drop_foreign_routing_policy_rules(link->manager)); + + return r; +} + +static int link_drop_managed_config(Link *link) { + int r; + + assert(link); + assert(link->manager); + + r = link_drop_managed_routes(link); + + RET_GATHER(r, link_drop_managed_nexthops(link)); + RET_GATHER(r, link_drop_managed_addresses(link)); + RET_GATHER(r, link_drop_managed_neighbors(link)); + RET_GATHER(r, link_drop_managed_routing_policy_rules(link)); + + return r; +} + +static void link_foreignize_config(Link *link) { + assert(link); + assert(link->manager); + + link_foreignize_routes(link); + link_foreignize_nexthops(link); + link_foreignize_addresses(link); + link_foreignize_neighbors(link); + link_foreignize_routing_policy_rules(link); +} + +static int link_configure(Link *link) { + int r; + + assert(link); + assert(link->network); + assert(link->state == LINK_STATE_INITIALIZED); + + link_set_state(link, LINK_STATE_CONFIGURING); + + r = link_new_bound_to_list(link); + if (r < 0) + return r; + + r = link_request_traffic_control(link); + if (r < 0) + return r; + + r = link_configure_mtu(link); + if (r < 0) + return r; + + if (link->iftype == ARPHRD_CAN) { + /* let's shortcut things for CAN which doesn't need most of what's done below. 
*/ + r = link_request_to_set_can(link); + if (r < 0) + return r; + + return link_request_to_activate(link); + } + + r = link_request_sr_iov_vfs(link); + if (r < 0) + return r; + + r = link_set_sysctl(link); + if (r < 0) + return r; + + r = link_request_to_set_mac(link, /* allow_retry = */ true); + if (r < 0) + return r; + + r = link_request_to_set_ipoib(link); + if (r < 0) + return r; + + r = link_request_to_set_flags(link); + if (r < 0) + return r; + + r = link_request_to_set_group(link); + if (r < 0) + return r; + + r = link_request_to_set_addrgen_mode(link); + if (r < 0) + return r; + + r = link_request_to_set_master(link); + if (r < 0) + return r; + + r = link_request_stacked_netdevs(link); + if (r < 0) + return r; + + r = link_request_to_set_bond(link); + if (r < 0) + return r; + + r = link_request_to_set_bridge(link); + if (r < 0) + return r; + + r = link_request_to_set_bridge_vlan(link); + if (r < 0) + return r; + + r = link_request_to_activate(link); + if (r < 0) + return r; + + r = ipv4ll_configure(link); + if (r < 0) + return r; + + r = link_request_dhcp4_client(link); + if (r < 0) + return r; + + r = link_request_dhcp6_client(link); + if (r < 0) + return r; + + r = link_request_ndisc(link); + if (r < 0) + return r; + + r = link_request_dhcp_server(link); + if (r < 0) + return r; + + r = link_request_radv(link); + if (r < 0) + return r; + + r = link_lldp_rx_configure(link); + if (r < 0) + return r; + + r = link_lldp_tx_configure(link); + if (r < 0) + return r; + + r = link_drop_foreign_config(link); + if (r < 0) + return r; + + r = link_request_static_configs(link); + if (r < 0) + return r; + + if (!link_has_carrier(link)) + return 0; + + return link_acquire_dynamic_conf(link); +} + +static int link_get_network(Link *link, Network **ret) { + Network *network; + int r; + + assert(link); + assert(link->manager); + assert(ret); + + ORDERED_HASHMAP_FOREACH(network, link->manager->networks) { + bool warn = false; + + r = net_match_config( + &network->match, + 
link->dev, + &link->hw_addr, + &link->permanent_hw_addr, + link->driver, + link->iftype, + link->kind, + link->ifname, + link->alternative_names, + link->wlan_iftype, + link->ssid, + &link->bssid); + if (r < 0) + return r; + if (r == 0) + continue; + + if (network->match.ifname && link->dev) { + uint8_t name_assign_type = NET_NAME_UNKNOWN; + const char *attr; + + if (sd_device_get_sysattr_value(link->dev, "name_assign_type", &attr) >= 0) + (void) safe_atou8(attr, &name_assign_type); + + warn = name_assign_type == NET_NAME_ENUM; + } + + log_link_full(link, warn ? LOG_WARNING : LOG_DEBUG, + "found matching network '%s'%s.", + network->filename, + warn ? ", based on potentially unpredictable interface name" : ""); + + if (network->unmanaged) + return -ENOENT; + + *ret = network; + return 0; + } + + return -ENOENT; +} + +int link_reconfigure_impl(Link *link, bool force) { + Network *network = NULL; + NetDev *netdev = NULL; + int r; + + assert(link); + + if (IN_SET(link->state, LINK_STATE_PENDING, LINK_STATE_LINGER)) + return 0; + + r = netdev_get(link->manager, link->ifname, &netdev); + if (r < 0 && r != -ENOENT) + return r; + + r = link_get_network(link, &network); + if (r < 0 && r != -ENOENT) + return r; + + if (link->state != LINK_STATE_UNMANAGED && !network) + /* If link is in initialized state, then link->network is also NULL. */ + force = true; + + if (link->network == network && !force) + return 0; + + if (network) { + if (link->state == LINK_STATE_INITIALIZED) + log_link_info(link, "Configuring with %s.", network->filename); + else + log_link_info(link, "Reconfiguring with %s.", network->filename); + } else + log_link_full(link, link->state == LINK_STATE_INITIALIZED ? 
LOG_DEBUG : LOG_INFO, + "Unmanaging interface."); + + /* Dropping old .network file */ + r = link_stop_engines(link, false); + if (r < 0) + return r; + + link_drop_requests(link); + + if (network && !force && network->keep_configuration != KEEP_CONFIGURATION_YES) + /* When a new/updated .network file is assigned, first make all configs (addresses, + * routes, and so on) foreign, and then drop unnecessary configs later by + * link_drop_foreign_config() in link_configure(). + * Note, when KeepConfiguration=yes, link_drop_foreign_config() does nothing. Hence, + * here we need to drop the configs such as addresses, routes, and so on configured by + * the previously assigned .network file. */ + link_foreignize_config(link); + else { + /* Remove all managed configs. Note, foreign configs are removed in later by + * link_configure() -> link_drop_foreign_config() if the link is managed by us. */ + r = link_drop_managed_config(link); + if (r < 0) + return r; + } + + /* The bound_to map depends on .network file, hence it needs to be freed. But, do not free the + * bound_by map. Otherwise, if a link enters unmanaged state below, then its carrier state will + * not propagated to other interfaces anymore. Moreover, it is not necessary to recreate the + * map here, as it depends on .network files assigned to other links. 
*/ + link_free_bound_to_list(link); + + link_free_engines(link); + link->network = network_unref(link->network); + + netdev_unref(link->netdev); + link->netdev = netdev_ref(netdev); + + if (!network) { + link_set_state(link, LINK_STATE_UNMANAGED); + return 0; + } + + /* Then, apply new .network file */ + link->network = network_ref(network); + link_update_operstate(link, true); + link_dirty(link); + + link_set_state(link, LINK_STATE_INITIALIZED); + link->activated = false; + + r = link_configure(link); + if (r < 0) + return r; + + return 1; +} +/* Common GETLINK reply handler: after link_getlink_handler_internal() succeeds, runs link_reconfigure_impl(); a failure there moves the link to failed state and is reported as 0. */ +static int link_reconfigure_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, bool force) { + int r; + + assert(link); + + r = link_getlink_handler_internal(rtnl, m, link, "Failed to update link state"); + if (r <= 0) + return r; + + r = link_reconfigure_impl(link, force); + if (r < 0) { + link_enter_failed(link); + return 0; + } + + return r; +} +/* Reply handler variant that reconfigures without force. */ +static int link_reconfigure_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + return link_reconfigure_handler_internal(rtnl, m, link, /* force = */ false); +} +/* Reply handler variant that reconfigures with force. */ +static int link_force_reconfigure_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + return link_reconfigure_handler_internal(rtnl, m, link, /* force = */ true); +} +/* Requests a reconfiguration through link_call_getlink(); the actual work happens in the reply handler selected by force. */ +int link_reconfigure(Link *link, bool force) { + int r; + + assert(link); + + /* When the link is in the pending or initialized state, link_configure() will be called. To prevent + * the function from being called multiple times simultaneously, refuse to reconfigure the + * interface in these cases. */ + if (IN_SET(link->state, LINK_STATE_PENDING, LINK_STATE_INITIALIZED, LINK_STATE_LINGER)) + return 0; /* 0 means no-op. */ + + r = link_call_getlink(link, force ? link_force_reconfigure_handler : link_reconfigure_handler); + if (r < 0) + return r; + + return 1; /* 1 means the interface will be reconfigured. 
*/ +} +/* Promotes a pending link to initialized, setting up and handling its bound_by list, and then runs link_reconfigure_impl() without force; in test mode the link is marked unmanaged instead. */ +static int link_initialized_and_synced(Link *link) { + int r; + + assert(link); + assert(link->manager); + + if (link->manager->test_mode) { + log_link_debug(link, "Running in test mode, refusing to enter initialized state."); + link_set_state(link, LINK_STATE_UNMANAGED); + return 0; + } + + if (link->state == LINK_STATE_PENDING) { + log_link_debug(link, "Link state is up-to-date"); + link_set_state(link, LINK_STATE_INITIALIZED); + + r = link_new_bound_by_list(link); + if (r < 0) + return r; + + r = link_handle_bound_by_list(link); + if (r < 0) + return r; + } + + return link_reconfigure_impl(link, /* force = */ false); +} +/* Reply handler for the GETLINK issued by link_initialized(); a failure of link_initialized_and_synced() moves the link to failed state. */ +static int link_initialized_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + r = link_getlink_handler_internal(rtnl, m, link, "Failed to wait for the interface to be initialized"); + if (r <= 0) + return r; + + r = link_initialized_and_synced(link); + if (r < 0) + link_enter_failed(link); + + return 0; +} +/* Takes over the sd_device for the link, then reconfigures a non-pending link or, for a pending one, issues a GETLINK so that state is synced first. */ +static int link_initialized(Link *link, sd_device *device) { + int r; + + assert(link); + assert(device); + + /* Always replace with the new sd_device object. As the sysname (and possibly other properties + * or sysattrs) may be outdated. 
*/ + device_unref_and_replace(link->dev, device); + + if (link->dhcp_client) { + r = sd_dhcp_client_attach_device(link->dhcp_client, link->dev); + if (r < 0) + log_link_warning_errno(link, r, "Failed to attach device to DHCPv4 client, ignoring: %m"); + } + + if (link->dhcp6_client) { + r = sd_dhcp6_client_attach_device(link->dhcp6_client, link->dev); + if (r < 0) + log_link_warning_errno(link, r, "Failed to attach device to DHCPv6 client, ignoring: %m"); + } + + r = link_set_sr_iov_ifindices(link); + if (r < 0) + log_link_warning_errno(link, r, "Failed to manage SR-IOV PF and VF ports, ignoring: %m"); + + if (link->state != LINK_STATE_PENDING) + return link_reconfigure(link, /* force = */ false); + + log_link_debug(link, "udev initialized link"); + + /* udev has initialized the link, but we don't know if we have yet + * processed the NEWLINK messages with the latest state. Do a GETLINK, + * when it returns we know that the pending NEWLINKs have already been + * processed and that we are up-to-date */ + + return link_call_getlink(link, link_initialized_handler); +} +/* Checks through udev whether the device behind the link is initialized and not being renamed, and if so calls link_initialized(); returns 0 while still waiting. Without udev it goes straight to link_initialized_and_synced(). */ +static int link_check_initialized(Link *link) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + int r; + + assert(link); + + if (!udev_available()) + return link_initialized_and_synced(link); + + /* udev should be around */ + r = sd_device_new_from_ifindex(&device, link->ifindex); + if (r < 0) { + log_link_debug_errno(link, r, "Could not find device, waiting for device initialization: %m"); + return 0; + } + + r = sd_device_get_is_initialized(device); + if (r < 0) + return log_link_warning_errno(link, r, "Could not determine whether the device is initialized: %m"); + if (r == 0) { + /* not yet ready */ + log_link_debug(link, "link pending udev initialization..."); + return 0; + } + + r = device_is_renaming(device); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to determine the device is being renamed: %m"); + if (r > 0) { + log_link_debug(link, "Interface is being renamed, 
pending initialization."); + return 0; + } + + return link_initialized(link, device); +} + +int manager_udev_process_link(Manager *m, sd_device *device, sd_device_action_t action) { + int r, ifindex; + const char *s; + Link *link; + + assert(m); + assert(device); + + r = sd_device_get_ifindex(device, &ifindex); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get ifindex: %m"); + + r = link_get_by_index(m, ifindex, &link); + if (r < 0) { + /* This error is not critical, as the corresponding rtnl message may be received later. */ + log_device_debug_errno(device, r, "Failed to get link from ifindex %i, ignoring: %m", ifindex); + return 0; + } + + /* Let's unref the sd-device object assigned to the corresponding Link object, but keep the Link + * object here. It will be removed only when rtnetlink says so. */ + if (action == SD_DEVICE_REMOVE) { + link->dev = sd_device_unref(link->dev); + return 0; + } + + r = device_is_renaming(device); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to determine if the device is renaming or not: %m"); + if (r > 0) { + log_device_debug(device, "Device is renaming, waiting for the interface to be renamed."); + /* TODO: + * What happens when a device is initialized, then soon renamed after that? When we detect + * such, maybe we should cancel or postpone all queued requests for the interface. 
*/ + return 0; + } + + r = sd_device_get_property_value(device, "ID_NET_MANAGED_BY", &s); + if (r < 0 && r != -ENOENT) + log_device_debug_errno(device, r, "Failed to get ID_NET_MANAGED_BY udev property, ignoring: %m"); + if (r >= 0 && !streq(s, "io.systemd.Network")) { + log_device_debug(device, "Interface is requested to be managed by '%s', not managing the interface.", s); + link_set_state(link, LINK_STATE_UNMANAGED); /* another manager claimed the interface through ID_NET_MANAGED_BY */ + return 0; + } + + r = link_initialized(link, device); + if (r < 0) + link_enter_failed(link); + + return 0; +} +/* Handles carrier gain: disables the carrier lost timer, reconfigures the link if needed, and otherwise restarts dynamic and static configuration. */ +static int link_carrier_gained(Link *link) { + bool force_reconfigure; + int r; + + assert(link); + + r = event_source_disable(link->carrier_lost_timer); + if (r < 0) + log_link_warning_errno(link, r, "Failed to disable carrier lost timer, ignoring: %m"); + + /* If a wireless interface was connected to an access point, and the SSID is changed (that is, + * both previous_ssid and ssid are non-NULL), then the connected wireless network could be + * changed. So, always reconfigure the link. Which means e.g. the DHCP client will be + * restarted, and the correct network information will be gained. + * + * However, do not reconfigure the wireless interface forcibly if it was not connected to any + * access points previously (previous_ssid is NULL in this case). As, a .network file may be + * already assigned to the interface (in that case, the .network file does not have the SSID= + * setting in the [Match] section), and the interface is already being configured. Of course, + * there may exist another .network file with higher priority and a matching SSID= setting. But + * in that case, link_reconfigure_impl() can handle that without the force_reconfigure flag. + * + * For non-wireless interfaces, we have no way to detect the connected network change. So, + * setting force_reconfigure = false. Note, both ssid and previous_ssid are NULL in that case. 
*/ + force_reconfigure = link->previous_ssid && !streq_ptr(link->previous_ssid, link->ssid); + link->previous_ssid = mfree(link->previous_ssid); + + /* AP and P2P-GO interfaces may have a new SSID - update the link properties in case a new .network + * profile wants to match on it with SSID= in its [Match] section. + */ + if (IN_SET(link->wlan_iftype, NL80211_IFTYPE_AP, NL80211_IFTYPE_P2P_GO)) { + r = link_get_wlan_interface(link); + if (r < 0) + return r; + } + + /* At this stage, both wlan and link information should be up-to-date. Hence, it is not necessary to + * call RTM_GETLINK, NL80211_CMD_GET_INTERFACE, or NL80211_CMD_GET_STATION commands, and simply call + * link_reconfigure_impl(). Note, link_reconfigure_impl() returns 1 when the link is reconfigured. */ + r = link_reconfigure_impl(link, force_reconfigure); + if (r != 0) + return r; /* negative: error; positive: already reconfigured, so the steps below are not needed */ + + r = link_handle_bound_by_list(link); + if (r < 0) + return r; + + if (link->iftype == ARPHRD_CAN) + /* let's shortcut things for CAN which doesn't need most of what's done below. 
*/ + return 0; + + if (IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) { + r = link_acquire_dynamic_conf(link); + if (r < 0) + return r; + + r = link_request_static_configs(link); + if (r < 0) + return r; + } + + return 0; +} +/* Performs the carrier loss teardown: stops engines and drops managed configs. Both steps are attempted and the first error, if any, is returned. */ +static int link_carrier_lost_impl(Link *link) { + int r, ret = 0; + + assert(link); + + link->previous_ssid = mfree(link->previous_ssid); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + if (!link->network) + return 0; + + r = link_stop_engines(link, false); + if (r < 0) + ret = r; + + r = link_drop_managed_config(link); + if (r < 0 && ret >= 0) + ret = r; + + return ret; +} +/* Timer callback for delayed carrier loss handling; on failure the link is moved to failed state. */ +static int link_carrier_lost_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Link *link = ASSERT_PTR(userdata); + int r; + + r = link_carrier_lost_impl(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to process carrier lost event: %m"); + link_enter_failed(link); + } + + return 0; +} +/* Handles carrier loss: picks a grace period and then tears down immediately (0), never (USEC_INFINITY), or when the carrier lost timer fires. */ +static int link_carrier_lost(Link *link) { + uint16_t dhcp_mtu; + usec_t usec; + int r; + + assert(link); + + r = link_handle_bound_by_list(link); + if (r < 0) + return r; + + if (link->iftype == ARPHRD_CAN) + /* let's shortcut things for CAN which doesn't need most of what's done below. */ + return 0; + + if (!link->network) + return 0; + + if (link->network->ignore_carrier_loss_set) + /* If IgnoreCarrierLoss= is explicitly specified, then use the specified value. */ + usec = link->network->ignore_carrier_loss_usec; + + else if (link->network->bond && link->wlan_iftype > 0) + /* Enslaving wlan interface to a bond disconnects from the connected AP, and causes its + * carrier to be lost. See #19832. */ + usec = 3 * USEC_PER_SEC; + + else if (link->network->dhcp_use_mtu && + link->dhcp_lease && + sd_dhcp_lease_get_mtu(link->dhcp_lease, &dhcp_mtu) >= 0 && + dhcp_mtu != link->original_mtu) + /* Some drivers reset interfaces when changing MTU. 
Resetting interfaces by the static + * MTU should not cause any issues, as MTU is changed only once. However, setting MTU + * through DHCP lease causes an infinite loop of resetting the interface. See #18738. */ + usec = 5 * USEC_PER_SEC; + + else + /* Otherwise, use the implied default value. */ + usec = link->network->ignore_carrier_loss_usec; + + if (usec == USEC_INFINITY) + return 0; + + if (usec == 0) + return link_carrier_lost_impl(link); + + return event_reset_time_relative(link->manager->event, + &link->carrier_lost_timer, + CLOCK_BOOTTIME, + usec, + 0, + link_carrier_lost_handler, + link, + 0, + "link-carrier-loss", + true); +} +/* Reacts to the link being set administratively up: enforces an always-down activation policy on an activated link, otherwise re-applies the IPv6 MTU. */ +static int link_admin_state_up(Link *link) { + int r; + + assert(link); + + /* This is called every time an interface admin state changes to up; + * specifically, when IFF_UP flag changes from unset to set. */ + + if (!link->network) + return 0; + + if (link->activated && link->network->activation_policy == ACTIVATION_POLICY_ALWAYS_DOWN) { + log_link_info(link, "Activation policy is \"always-down\", forcing link down."); + return link_request_to_bring_up_or_down(link, /* up = */ false); + } + + /* We set the ipv6 mtu after the device mtu, but the kernel resets + * ipv6 mtu on NETDEV_UP, so we need to reset it. 
*/ + r = link_set_ipv6_mtu(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv6 MTU, ignoring: %m"); + + return 0; +} +/* Reacts to the link being set administratively down: enforces an always-up activation policy on an activated link. */ +static int link_admin_state_down(Link *link) { + assert(link); + + if (!link->network) + return 0; + + if (link->activated && link->network->activation_policy == ACTIVATION_POLICY_ALWAYS_UP) { + log_link_info(link, "Activation policy is \"always-up\", forcing link up."); + return link_request_to_bring_up_or_down(link, /* up = */ true); + } + + return 0; +} +/* True if the link has IFF_SLAVE set or a master ifindex recorded. */ +static bool link_is_enslaved(Link *link) { + if (link->flags & IFF_SLAVE) + return true; + + if (link->master_ifindex > 0) + return true; + + return false; +} +/* Recomputes the carrier, address, operational and online states of the link, stores any changes and marks the link dirty, and optionally refreshes the master as well. */ +void link_update_operstate(Link *link, bool also_update_master) { + LinkOperationalState operstate; + LinkCarrierState carrier_state; + LinkAddressState ipv4_address_state, ipv6_address_state, address_state; + LinkOnlineState online_state; + _cleanup_strv_free_ char **p = NULL; + bool changed = false; + + assert(link); + + if (link->kernel_operstate == IF_OPER_DORMANT) + carrier_state = LINK_CARRIER_STATE_DORMANT; + else if (link_has_carrier(link)) { + if (link_is_enslaved(link)) + carrier_state = LINK_CARRIER_STATE_ENSLAVED; + else + carrier_state = LINK_CARRIER_STATE_CARRIER; + } else if (link->flags & IFF_UP) + carrier_state = LINK_CARRIER_STATE_NO_CARRIER; + else + carrier_state = LINK_CARRIER_STATE_OFF; + + if (carrier_state >= LINK_CARRIER_STATE_CARRIER) { + Link *slave; + + SET_FOREACH(slave, link->slaves) { + link_update_operstate(slave, false); + + if (slave->carrier_state < LINK_CARRIER_STATE_CARRIER) + carrier_state = LINK_CARRIER_STATE_DEGRADED_CARRIER; + } + } + + link_get_address_states(link, &ipv4_address_state, &ipv6_address_state, &address_state); + + /* Mapping of address and carrier state vs operational state + * carrier state + * | off | no-carrier | dormant | degraded-carrier | carrier | enslaved + * ------------------------------------------------------------------------------ + * off | off | 
no-carrier | dormant | degraded-carrier | carrier | enslaved + * address_state degraded | off | no-carrier | dormant | degraded | degraded | enslaved + * routable | off | no-carrier | dormant | routable | routable | routable + */ + + if (carrier_state == LINK_CARRIER_STATE_DEGRADED_CARRIER && address_state == LINK_ADDRESS_STATE_ROUTABLE) + operstate = LINK_OPERSTATE_ROUTABLE; + else if (carrier_state == LINK_CARRIER_STATE_DEGRADED_CARRIER && address_state == LINK_ADDRESS_STATE_DEGRADED) + operstate = LINK_OPERSTATE_DEGRADED; + else if (carrier_state < LINK_CARRIER_STATE_CARRIER || address_state == LINK_ADDRESS_STATE_OFF) + operstate = (LinkOperationalState) carrier_state; /* NOTE(review): the cast assumes both enums use the same values for these states; confirm against their definitions */ + else if (address_state == LINK_ADDRESS_STATE_ROUTABLE) + operstate = LINK_OPERSTATE_ROUTABLE; + else if (carrier_state == LINK_CARRIER_STATE_CARRIER) + operstate = LINK_OPERSTATE_DEGRADED; + else + operstate = LINK_OPERSTATE_ENSLAVED; + + /* Only determine online state for managed links with RequiredForOnline=yes */ + if (!link->network || !link->network->required_for_online) + online_state = _LINK_ONLINE_STATE_INVALID; + else if (operstate < link->network->required_operstate_for_online.min || + operstate > link->network->required_operstate_for_online.max) + online_state = LINK_ONLINE_STATE_OFFLINE; + else { + AddressFamily required_family = link->network->required_family_for_online; + bool needs_ipv4 = required_family & ADDRESS_FAMILY_IPV4; + bool needs_ipv6 = required_family & ADDRESS_FAMILY_IPV6; + + /* The operational state is within the range required for online. + * If a particular address family is also required, we might revert + * to offline in the blocks below. 
*/ + online_state = LINK_ONLINE_STATE_ONLINE; + + if (link->network->required_operstate_for_online.min >= LINK_OPERSTATE_DEGRADED) { + if (needs_ipv4 && ipv4_address_state < LINK_ADDRESS_STATE_DEGRADED) + online_state = LINK_ONLINE_STATE_OFFLINE; + if (needs_ipv6 && ipv6_address_state < LINK_ADDRESS_STATE_DEGRADED) + online_state = LINK_ONLINE_STATE_OFFLINE; + } + + if (link->network->required_operstate_for_online.min >= LINK_OPERSTATE_ROUTABLE) { + if (needs_ipv4 && ipv4_address_state < LINK_ADDRESS_STATE_ROUTABLE) + online_state = LINK_ONLINE_STATE_OFFLINE; + if (needs_ipv6 && ipv6_address_state < LINK_ADDRESS_STATE_ROUTABLE) + online_state = LINK_ONLINE_STATE_OFFLINE; + } + } + + if (link->carrier_state != carrier_state) { + link->carrier_state = carrier_state; + changed = true; + if (strv_extend(&p, "CarrierState") < 0) + log_oom(); + } + + if (link->address_state != address_state) { + link->address_state = address_state; + changed = true; + if (strv_extend(&p, "AddressState") < 0) + log_oom(); + } + + if (link->ipv4_address_state != ipv4_address_state) { + link->ipv4_address_state = ipv4_address_state; + changed = true; + if (strv_extend(&p, "IPv4AddressState") < 0) + log_oom(); + } + + if (link->ipv6_address_state != ipv6_address_state) { + link->ipv6_address_state = ipv6_address_state; + changed = true; + if (strv_extend(&p, "IPv6AddressState") < 0) + log_oom(); + } + + if (link->operstate != operstate) { + link->operstate = operstate; + changed = true; + if (strv_extend(&p, "OperationalState") < 0) + log_oom(); + } + + if (link->online_state != online_state) { + link->online_state = online_state; + changed = true; + if (strv_extend(&p, "OnlineState") < 0) + log_oom(); + } + + if (p) + link_send_changed_strv(link, p); + if (changed) + link_dirty(link); + + if (also_update_master) { + Link *master; + + if (link_get_master(link, &master) >= 0) + link_update_operstate(master, true); + } +} +/* Yields the flag name prefixed with a space and a minus sign if the flag was cleared, a plus sign if it was set, or an empty string if it did not change between old and new. */ +#define FLAG_STRING(string, flag, old, new) \ + (((old ^ new) & flag) 
\ + ? ((old & flag) ? (" -" string) : (" +" string)) \ + : "") +/* Applies the interface flags and kernel operstate carried by a link message, logs what changed, and dispatches the admin up/down and carrier gained/lost handlers. Returns 0 on success (including when nothing changed), negative errno on failure. */ +static int link_update_flags(Link *link, sd_netlink_message *message) { + bool link_was_admin_up, had_carrier; + uint8_t operstate; + unsigned flags; + int r; + + assert(link); + assert(message); + + r = sd_rtnl_message_link_get_flags(message, &flags); + if (r < 0) + return log_link_debug_errno(link, r, "rtnl: failed to read link flags: %m"); + + r = sd_netlink_message_read_u8(message, IFLA_OPERSTATE, &operstate); + if (r == -ENODATA) + /* If we got a message without operstate, assume the state was unchanged. */ + operstate = link->kernel_operstate; + else if (r < 0) + return log_link_debug_errno(link, r, "rtnl: failed to read operational state: %m"); + + if (link->flags == flags && link->kernel_operstate == operstate) + return 0; + + if (link->flags != flags) { + unsigned unknown_flags, unknown_flags_added, unknown_flags_removed; + + log_link_debug(link, "Flags change:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + FLAG_STRING("LOOPBACK", IFF_LOOPBACK, link->flags, flags), + FLAG_STRING("MASTER", IFF_MASTER, link->flags, flags), + FLAG_STRING("SLAVE", IFF_SLAVE, link->flags, flags), + FLAG_STRING("UP", IFF_UP, link->flags, flags), + FLAG_STRING("DORMANT", IFF_DORMANT, link->flags, flags), + FLAG_STRING("LOWER_UP", IFF_LOWER_UP, link->flags, flags), + FLAG_STRING("RUNNING", IFF_RUNNING, link->flags, flags), + FLAG_STRING("MULTICAST", IFF_MULTICAST, link->flags, flags), + FLAG_STRING("BROADCAST", IFF_BROADCAST, link->flags, flags), + FLAG_STRING("POINTOPOINT", IFF_POINTOPOINT, link->flags, flags), + FLAG_STRING("PROMISC", IFF_PROMISC, link->flags, flags), + FLAG_STRING("ALLMULTI", IFF_ALLMULTI, link->flags, flags), + FLAG_STRING("PORTSEL", IFF_PORTSEL, link->flags, flags), + FLAG_STRING("AUTOMEDIA", IFF_AUTOMEDIA, link->flags, flags), + FLAG_STRING("DYNAMIC", IFF_DYNAMIC, link->flags, flags), + FLAG_STRING("NOARP", IFF_NOARP, link->flags, flags), + FLAG_STRING("NOTRAILERS", IFF_NOTRAILERS, link->flags, 
flags), + FLAG_STRING("DEBUG", IFF_DEBUG, link->flags, flags), + FLAG_STRING("ECHO", IFF_ECHO, link->flags, flags)); + + unknown_flags = ~(IFF_LOOPBACK | IFF_MASTER | IFF_SLAVE | IFF_UP | + IFF_DORMANT | IFF_LOWER_UP | IFF_RUNNING | + IFF_MULTICAST | IFF_BROADCAST | IFF_POINTOPOINT | + IFF_PROMISC | IFF_ALLMULTI | IFF_PORTSEL | + IFF_AUTOMEDIA | IFF_DYNAMIC | IFF_NOARP | + IFF_NOTRAILERS | IFF_DEBUG | IFF_ECHO); + unknown_flags_added = ((link->flags ^ flags) & flags & unknown_flags); + unknown_flags_removed = ((link->flags ^ flags) & link->flags & unknown_flags); + + if (unknown_flags_added) + log_link_debug(link, "Unknown link flags gained, ignoring: %#.5x", unknown_flags_added); + + if (unknown_flags_removed) + log_link_debug(link, "Unknown link flags lost, ignoring: %#.5x", unknown_flags_removed); + } + + link_was_admin_up = link->flags & IFF_UP; + had_carrier = link_has_carrier(link); /* sampled before the new flags are stored, for the transition checks below */ + + link->flags = flags; + link->kernel_operstate = operstate; + + link_update_operstate(link, true); + + if (!link_was_admin_up && (link->flags & IFF_UP)) { + log_link_info(link, "Link UP"); + + r = link_admin_state_up(link); + if (r < 0) + return r; + } else if (link_was_admin_up && !(link->flags & IFF_UP)) { + log_link_info(link, "Link DOWN"); + + r = link_admin_state_down(link); + if (r < 0) + return r; + } + + if (!had_carrier && link_has_carrier(link)) { + log_link_info(link, "Gained carrier"); + + r = link_carrier_gained(link); + if (r < 0) + return r; + } else if (had_carrier && !link_has_carrier(link)) { + log_link_info(link, "Lost carrier"); + + r = link_carrier_lost(link); + if (r < 0) + return r; + } + + return 0; +} +/* Reads IFLA_MASTER and, when the master changed, calls link_drop_from_master() before recording the new index; a master equal to the link itself counts as none. */ +static int link_update_master(Link *link, sd_netlink_message *message) { + int master_ifindex, r; + + assert(link); + assert(message); + + r = sd_netlink_message_read_u32(message, IFLA_MASTER, (uint32_t*) &master_ifindex); + if (r == -ENODATA) + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "rtnl: failed to read master ifindex: %m"); + + 
if (master_ifindex == link->ifindex) + master_ifindex = 0; + + if (master_ifindex != link->master_ifindex) { + if (link->master_ifindex == 0) + log_link_debug(link, "Attached to master interface: %i", master_ifindex); + else if (master_ifindex == 0) + log_link_debug(link, "Detached from master interface: %i", link->master_ifindex); + else + log_link_debug(link, "Master interface changed: %i %s %i", link->master_ifindex, + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), master_ifindex); + + link_drop_from_master(link); + link->master_ifindex = master_ifindex; + } + + r = link_append_to_master(link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to append link to master: %m"); + + return 0; +} +/* Queries the driver name through ethtool at most once per link and, for the dsa driver, reads the DSA master ifindex from IFLA_LINK. Returns 1 (needs reconfigure) after a successful read, 0 if skipped or ethtool failed, negative errno if IFLA_LINK cannot be read. */ +static int link_update_driver(Link *link, sd_netlink_message *message) { + int r; + + assert(link); + assert(link->manager); + assert(message); + + /* Driver is already read. Assuming the driver is never changed. */ + if (link->ethtool_driver_read) + return 0; + + /* When udevd is running, read the driver after the interface is initialized by udevd. + * Otherwise, ethtool may not work correctly. See issue #22538. + * When udevd is not running, read the value when the interface is detected. 
*/ + if (udev_available() && !link->dev) + return 0; + + link->ethtool_driver_read = true; /* set before the query, so a failed ethtool call is not retried */ + + r = ethtool_get_driver(&link->manager->ethtool_fd, link->ifname, &link->driver); + if (r < 0) { + log_link_debug_errno(link, r, "Failed to get driver, continuing without: %m"); + return 0; + } + + log_link_debug(link, "Found driver: %s", strna(link->driver)); + + if (streq_ptr(link->driver, "dsa")) { + uint32_t dsa_master_ifindex = 0; + + r = sd_netlink_message_read_u32(message, IFLA_LINK, &dsa_master_ifindex); + if (r < 0 && r != -ENODATA) + return log_link_debug_errno(link, r, "rtnl: failed to read ifindex of the DSA master interface: %m"); + + if (dsa_master_ifindex > INT_MAX) { + log_link_debug(link, "rtnl: received too large DSA master ifindex (%"PRIu32" > INT_MAX), ignoring.", + dsa_master_ifindex); + dsa_master_ifindex = 0; + } + + link->dsa_master_ifindex = (int) dsa_master_ifindex; + } + + return 1; /* needs reconfigure */ +} +/* Fallback that asks ethtool for the permanent hardware address, at most once per link and only if the message carries a hardware address at all. A failed ethtool query is logged and ignored. */ +static int link_update_permanent_hardware_address_from_ethtool(Link *link, sd_netlink_message *message) { + int r; + + assert(link); + assert(link->manager); + assert(message); + + if (link->ethtool_permanent_hw_addr_read) + return 0; + + /* When udevd is running, read the permanent hardware address after the interface is + * initialized by udevd. Otherwise, ethtool may not work correctly. See issue #22538. + * When udevd is not running, read the value when the interface is detected. */ + if (udev_available() && !link->dev) + return 0; + + /* If the interface does not have a hardware address, then it will not have a permanent address either. 
*/ + r = netlink_message_read_hw_addr(message, IFLA_ADDRESS, NULL); + if (r == -ENODATA) + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "Failed to read IFLA_ADDRESS attribute: %m"); + + link->ethtool_permanent_hw_addr_read = true; + + r = ethtool_get_permanent_hw_addr(&link->manager->ethtool_fd, link->ifname, &link->permanent_hw_addr); + if (r < 0) + log_link_debug_errno(link, r, "Permanent hardware address not found, continuing without: %m"); + + return 0; +} +/* Fills in the permanent hardware address from IFLA_PERM_ADDRESS, falling back to ethtool. Returns 0 if it was already known, otherwise 1 (needs reconfigure) or a negative errno. */ +static int link_update_permanent_hardware_address(Link *link, sd_netlink_message *message) { + int r; + + assert(link); + assert(link->manager); + assert(message); + + if (link->permanent_hw_addr.length > 0) + return 0; + + r = netlink_message_read_hw_addr(message, IFLA_PERM_ADDRESS, &link->permanent_hw_addr); + if (r < 0) { + if (r != -ENODATA) + return log_link_debug_errno(link, r, "Failed to read IFLA_PERM_ADDRESS attribute: %m"); + + /* Fallback to ethtool for older kernels. */ + r = link_update_permanent_hardware_address_from_ethtool(link, message); + if (r < 0) + return r; + } + + if (link->permanent_hw_addr.length > 0) + log_link_debug(link, "Saved permanent hardware address: %s", HW_ADDR_TO_STR(&link->permanent_hw_addr)); + + return 1; /* needs reconfigure; NOTE(review): also returned while the address is still unknown, so each later message asks for a reconfigure again - confirm this is intended */ +} +/* Tracks the broadcast and hardware addresses from a link message; when the hardware address changed, re-indexes the link by address and pushes the new MAC to the protocol engines. Returns 1 (needs reconfigure) on change, 0 otherwise. */ +static int link_update_hardware_address(Link *link, sd_netlink_message *message) { + struct hw_addr_data addr; + int r; + + assert(link); + assert(message); + + r = netlink_message_read_hw_addr(message, IFLA_BROADCAST, &link->bcast_addr); + if (r < 0 && r != -ENODATA) + return log_link_debug_errno(link, r, "rtnl: failed to read broadcast address: %m"); + + r = netlink_message_read_hw_addr(message, IFLA_ADDRESS, &addr); + if (r == -ENODATA) + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "rtnl: failed to read hardware address: %m"); + + if (hw_addr_equal(&link->hw_addr, &addr)) + return 0; + + if (link->hw_addr.length == 0) + log_link_debug(link, "Saved hardware address: %s", HW_ADDR_TO_STR(&addr)); + else 
{ + log_link_debug(link, "Hardware address is changed: %s %s %s", + HW_ADDR_TO_STR(&link->hw_addr), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + HW_ADDR_TO_STR(&addr)); + + hashmap_remove_value(link->manager->links_by_hw_addr, &link->hw_addr, link); /* drop the index entry keyed by the old address before it is overwritten */ + } + + link->hw_addr = addr; + + if (!hw_addr_is_null(&link->hw_addr)) { + r = hashmap_ensure_put(&link->manager->links_by_hw_addr, &hw_addr_hash_ops, &link->hw_addr, link); + if (r == -EEXIST && streq_ptr(link->kind, "bond")) + /* bonding master and its slaves have the same hardware address. */ + r = hashmap_replace(link->manager->links_by_hw_addr, &link->hw_addr, link); + if (r < 0) + log_link_debug_errno(link, r, "Failed to manage link by its new hardware address, ignoring: %m"); + } + + r = ipv4acd_update_mac(link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC address in IPv4 ACD client: %m"); + + r = ipv4ll_update_mac(link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC address in IPv4LL client: %m"); + + r = dhcp4_update_mac(link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC address in DHCP client: %m"); + + r = dhcp6_update_mac(link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC address in DHCPv6 client: %m"); + + r = radv_update_mac(link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC address for Router Advertisement: %m"); + + if (link->ndisc && link->hw_addr.length == ETH_ALEN) { + r = sd_ndisc_set_mac(link->ndisc, &link->hw_addr.ether); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC for NDisc: %m"); + } + + if (link->lldp_rx) { /* NOTE(review): unlike the NDisc branch above, the address length is not checked against ETH_ALEN before hw_addr.ether is used here and in the LLDP Tx branch; confirm this is intended */ + r = sd_lldp_rx_set_filter_address(link->lldp_rx, &link->hw_addr.ether); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MAC address for LLDP Rx: %m"); + } + + if (link->lldp_tx) { + r = sd_lldp_tx_set_hwaddr(link->lldp_tx, &link->hw_addr.ether); + if (r < 0) + return 
log_link_debug_errno(link, r, "Could not update MAC address for LLDP Tx: %m"); + } + + return 1; /* needs reconfigure */ +} +/* Records the MTU and its limits from a link message, remembers the first MTU seen as the original one, and forwards MTU changes to the DHCPv4 client and the Router Advertisement engine. */ +static int link_update_mtu(Link *link, sd_netlink_message *message) { + uint32_t mtu, min_mtu = 0, max_mtu = UINT32_MAX; + int r; + + assert(link); + assert(message); + + r = sd_netlink_message_read_u32(message, IFLA_MTU, &mtu); + if (r == -ENODATA) + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "rtnl: failed to read MTU in RTM_NEWLINK message: %m"); + if (mtu == 0) + return 0; + + r = sd_netlink_message_read_u32(message, IFLA_MIN_MTU, &min_mtu); + if (r < 0 && r != -ENODATA) + return log_link_debug_errno(link, r, "rtnl: failed to read minimum MTU in RTM_NEWLINK message: %m"); + + r = sd_netlink_message_read_u32(message, IFLA_MAX_MTU, &max_mtu); + if (r < 0 && r != -ENODATA) + return log_link_debug_errno(link, r, "rtnl: failed to read maximum MTU in RTM_NEWLINK message: %m"); + + if (max_mtu == 0) + max_mtu = UINT32_MAX; /* a reported maximum of 0 is treated as no upper limit */ + + link->min_mtu = min_mtu; + link->max_mtu = max_mtu; + + if (link->original_mtu == 0) { + link->original_mtu = mtu; + log_link_debug(link, "Saved original MTU %" PRIu32" (min: %"PRIu32", max: %"PRIu32")", + link->original_mtu, link->min_mtu, link->max_mtu); + } + + if (link->mtu == mtu) + return 0; + + if (link->mtu != 0) + log_link_debug(link, "MTU is changed: %"PRIu32" %s %"PRIu32" (min: %"PRIu32", max: %"PRIu32")", + link->mtu, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), mtu, + link->min_mtu, link->max_mtu); + + link->mtu = mtu; + + if (link->dhcp_client) { + r = sd_dhcp_client_set_mtu(link->dhcp_client, link->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Could not update MTU in DHCP client: %m"); + } + + if (link->radv) { + r = sd_radv_set_mtu(link->radv, link->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Could not set MTU for Router Advertisement: %m"); + } + + return 0; +} +/* Syncs the stored alternative names and their links_by_name entries with IFLA_PROP_LIST. Returns 1 (needs reconfigure) when the names changed, 0 otherwise. */ +static int link_update_alternative_names(Link *link, sd_netlink_message *message) { + 
_cleanup_strv_free_ char **altnames = NULL; + int r; + + assert(link); + assert(message); + + r = sd_netlink_message_read_strv(message, IFLA_PROP_LIST, IFLA_ALT_IFNAME, &altnames); + if (r == -ENODATA) + /* The message does not have IFLA_PROP_LIST container attribute. It does not mean the + * interface has no alternative name. */ + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "rtnl: failed to read alternative names: %m"); + + if (strv_equal(altnames, link->alternative_names)) + return 0; + + STRV_FOREACH(n, link->alternative_names) /* drop the old names from the lookup table before the list is replaced */ + hashmap_remove(link->manager->links_by_name, *n); + + strv_free_and_replace(link->alternative_names, altnames); + + STRV_FOREACH(n, link->alternative_names) { + r = hashmap_ensure_put(&link->manager->links_by_name, &string_hash_ops, *n, link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to manage link by its new alternative names: %m"); + } + + return 1; /* needs reconfigure */ +} + +static int link_update_name(Link *link, sd_netlink_message *message) { /* Applies an interface rename reported in the message. The new name is accepted only when it equals what format_ifname() reports for the ifindex; then the links_by_name entry and the interface name held by every existing engine are updated. Returns 1 when renamed, 0 when nothing changed or the name was ignored, negative on error. */ + char ifname_from_index[IF_NAMESIZE]; + const char *ifname; + int r; + + assert(link); + assert(message); + + r = sd_netlink_message_read_string(message, IFLA_IFNAME, &ifname); + if (r == -ENODATA) + /* Hmm?? But ok. */ + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "Failed to read interface name in RTM_NEWLINK message: %m"); + + if (streq(ifname, link->ifname)) + return 0; + + r = format_ifname(link->ifindex, ifname_from_index); + if (r < 0) + return log_link_debug_errno(link, r, "Could not get interface name for index %i.", link->ifindex); + + if (!streq(ifname, ifname_from_index)) { + log_link_debug(link, "New interface name '%s' received from the kernel does not correspond " + "with the name currently configured on the actual interface '%s'. 
Ignoring.", + ifname, ifname_from_index); + return 0; + } + + log_link_info(link, "Interface name change detected, renamed to %s.", ifname); + + hashmap_remove(link->manager->links_by_name, link->ifname); /* unregister the old name before link->ifname is replaced */ + + r = free_and_strdup(&link->ifname, ifname); + if (r < 0) + return log_oom_debug(); + + r = hashmap_ensure_put(&link->manager->links_by_name, &string_hash_ops, link->ifname, link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to manage link by its new name: %m"); + + if (link->dhcp_client) { /* from here on: propagate the new name to each engine that exists */ + r = sd_dhcp_client_set_ifname(link->dhcp_client, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in DHCP client: %m"); + } + + if (link->dhcp6_client) { + r = sd_dhcp6_client_set_ifname(link->dhcp6_client, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in DHCP6 client: %m"); + } + + if (link->ndisc) { + r = sd_ndisc_set_ifname(link->ndisc, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in NDisc: %m"); + } + + if (link->dhcp_server) { + r = sd_dhcp_server_set_ifname(link->dhcp_server, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in DHCP server: %m"); + } + + if (link->radv) { + r = sd_radv_set_ifname(link->radv, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in Router Advertisement: %m"); + } + + if (link->lldp_rx) { + r = sd_lldp_rx_set_ifname(link->lldp_rx, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in LLDP Rx: %m"); + } + + if (link->lldp_tx) { + r = sd_lldp_tx_set_ifname(link->lldp_tx, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in LLDP Tx: %m"); + } + + if (link->ipv4ll) { + r = sd_ipv4ll_set_ifname(link->ipv4ll, link->ifname); + if (r < 0) + return log_link_debug_errno(link, r, 
"Failed to update interface name in IPv4LL client: %m"); + } + + r = ipv4acd_set_ifname(link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to update interface name in IPv4ACD client: %m"); + + return 1; /* needs reconfigure */ +} + +static int link_update(Link *link, sd_netlink_message *message) { /* Runs every link_update_*() helper on the message in a fixed order and stops at the first error. Returns a positive value when the name, alternative names, driver, permanent hardware address or hardware address helper returned positive; the results of the MTU, master, IPv6LL addrgen mode and flags helpers are only checked for errors. */ + bool needs_reconfigure = false; + int r; + + assert(link); + assert(message); + + r = link_update_name(link, message); + if (r < 0) + return r; + needs_reconfigure = needs_reconfigure || r > 0; + + r = link_update_alternative_names(link, message); + if (r < 0) + return r; + needs_reconfigure = needs_reconfigure || r > 0; + + r = link_update_mtu(link, message); + if (r < 0) + return r; + + r = link_update_driver(link, message); + if (r < 0) + return r; + needs_reconfigure = needs_reconfigure || r > 0; + + r = link_update_permanent_hardware_address(link, message); + if (r < 0) + return r; + needs_reconfigure = needs_reconfigure || r > 0; + + r = link_update_hardware_address(link, message); + if (r < 0) + return r; + needs_reconfigure = needs_reconfigure || r > 0; + + r = link_update_master(link, message); + if (r < 0) + return r; + + r = link_update_ipv6ll_addrgen_mode(link, message); + if (r < 0) + return r; + + r = link_update_flags(link, message); + if (r < 0) + return r; + + return needs_reconfigure; +} + +static Link *link_drop_or_unref(Link *link) { /* Cleanup helper for link_new(): a link not yet attached to a manager is only unreferenced, an attached one goes through link_drop(). */ + if (!link) + return NULL; + if (!link->manager) + return link_unref(link); + return link_drop(link); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_drop_or_unref); + +static int link_new(Manager *manager, sd_netlink_message *message, Link **ret) { /* Allocates a Link from an rtnl link message, registers it in the manager by ifindex and by name, and hands it to the caller through ret. Returns 0 on success, negative on error. */ + _cleanup_free_ char *ifname = NULL, *kind = NULL, *state_file = NULL, *lease_file = NULL, *lldp_file = NULL; + _cleanup_(link_drop_or_unrefp) Link *link = NULL; + unsigned short iftype; + int r, ifindex; + + assert(manager); + assert(message); + assert(ret); + + r = sd_rtnl_message_link_get_ifindex(message, &ifindex); + if (r < 0) + return log_debug_errno(r, 
"rtnl: failed to read ifindex from link message: %m"); + else if (ifindex <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "rtnl: received link message without valid ifindex."); + + r = sd_rtnl_message_link_get_type(message, &iftype); + if (r < 0) + return log_debug_errno(r, "rtnl: failed to read interface type from link message: %m"); + + r = sd_netlink_message_read_string_strdup(message, IFLA_IFNAME, &ifname); + if (r < 0) + return log_debug_errno(r, "rtnl: failed to read interface name from link message: %m"); + + /* check for link kind */ + r = sd_netlink_message_enter_container(message, IFLA_LINKINFO); + if (r >= 0) { /* when the container cannot be entered, kind simply stays NULL */ + r = sd_netlink_message_read_string_strdup(message, IFLA_INFO_KIND, &kind); + if (r < 0 && r != -ENODATA) + return log_debug_errno(r, "rtnl: failed to read interface kind from link message: %m"); + r = sd_netlink_message_exit_container(message); + if (r < 0) + return log_debug_errno(r, "rtnl: failed to exit IFLA_LINKINFO container: %m"); + } + + if (!manager->test_mode) { + /* Do not update state files when running in test mode. 
*/ + if (asprintf(&state_file, "/run/systemd/netif/links/%d", ifindex) < 0) + return log_oom_debug(); + + if (asprintf(&lease_file, "/run/systemd/netif/leases/%d", ifindex) < 0) + return log_oom_debug(); + + if (asprintf(&lldp_file, "/run/systemd/netif/lldp/%d", ifindex) < 0) + return log_oom_debug(); + } + + link = new(Link, 1); + if (!link) + return -ENOMEM; + + *link = (Link) { /* the compound literal takes ownership of the strings gathered above */ + .n_ref = 1, + .state = LINK_STATE_PENDING, + .online_state = _LINK_ONLINE_STATE_INVALID, + .ifindex = ifindex, + .iftype = iftype, + .ifname = TAKE_PTR(ifname), + .kind = TAKE_PTR(kind), + + .ipv6ll_address_gen_mode = _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_INVALID, + + .state_file = TAKE_PTR(state_file), + .lease_file = TAKE_PTR(lease_file), + .lldp_file = TAKE_PTR(lldp_file), + + .n_dns = UINT_MAX, + .dns_default_route = -1, + .llmnr = _RESOLVE_SUPPORT_INVALID, + .mdns = _RESOLVE_SUPPORT_INVALID, + .dnssec_mode = _DNSSEC_MODE_INVALID, + .dns_over_tls_mode = _DNS_OVER_TLS_MODE_INVALID, + }; + + r = hashmap_ensure_put(&manager->links_by_index, NULL, INT_TO_PTR(link->ifindex), link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to store link into manager: %m"); + + link->manager = manager; /* from here on the cleanup handler calls link_drop() instead of link_unref() */ + + r = hashmap_ensure_put(&manager->links_by_name, &string_hash_ops, link->ifname, link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to manage link by its interface name: %m"); + + log_link_debug(link, "Saved new link: ifindex=%i, iftype=%s(%u), kind=%s", + link->ifindex, strna(arphrd_to_name(link->iftype)), link->iftype, strna(link->kind)); + + /* If contained in this set, the link is wireless and the corresponding NL80211_CMD_NEW_INTERFACE + * message arrived too early. Request the wireless link information again. 
+ */ + if (set_remove(manager->new_wlan_ifindices, INT_TO_PTR(link->ifindex))) { + r = link_get_wlan_interface(link); + if (r < 0) + log_link_warning_errno(link, r, "Failed to get wireless interface, ignoring: %m"); + } + + *ret = TAKE_PTR(link); + return 0; +} + +int manager_rtnl_process_link(sd_netlink *rtnl, sd_netlink_message *message, Manager *manager) { /* rtnl callback for link messages: validates the message, then creates or updates the Link on RTM_NEWLINK and drops the Link and NetDev on RTM_DELLINK. Failures are logged and swallowed; returns 0 when the message was ignored or processing failed, 1 otherwise. */ + Link *link = NULL; + NetDev *netdev = NULL; + uint16_t type; + const char *name; + int r, ifindex; + + assert(rtnl); + assert(message); + assert(manager); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: Could not receive link message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: Could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWLINK, RTM_DELLINK)) { + log_warning("rtnl: Received unexpected message type %u when processing link, ignoring.", type); + return 0; + } + + r = sd_rtnl_message_link_get_ifindex(message, &ifindex); + if (r < 0) { + log_warning_errno(r, "rtnl: Could not get ifindex from link message, ignoring: %m"); + return 0; + } else if (ifindex <= 0) { + log_warning("rtnl: received link message with invalid ifindex %d, ignoring.", ifindex); + return 0; + } + + r = sd_netlink_message_read_string(message, IFLA_IFNAME, &name); + if (r < 0) { + log_warning_errno(r, "rtnl: Received link message without ifname, ignoring: %m"); + return 0; + } + + (void) link_get_by_index(manager, ifindex, &link); /* lookup results are ignored on purpose: link and netdev start out NULL and are tested below */ + (void) netdev_get(manager, name, &netdev); + + switch (type) { + case RTM_NEWLINK: + if (netdev) { + /* netdev exists, so make sure the ifindex matches */ + r = netdev_set_ifindex(netdev, message); + if (r < 0) { + log_netdev_warning_errno(netdev, r, "Could not process new link message for netdev, ignoring: %m"); + return 0; + } + } + + if (!link) { + /* link is new, so add it */ + r = 
link_new(manager, message, &link); + if (r < 0) { + log_warning_errno(r, "Could not process new link message: %m"); + return 0; + } + + r = link_update(link, message); /* a positive result is not acted upon for a freshly created link */ + if (r < 0) { + log_link_warning_errno(link, r, "Could not process link message: %m"); + link_enter_failed(link); + return 0; + } + + r = link_check_initialized(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to check link is initialized: %m"); + link_enter_failed(link); + return 0; + } + } else { + r = link_update(link, message); + if (r < 0) { + log_link_warning_errno(link, r, "Could not process link message: %m"); + link_enter_failed(link); + return 0; + } + if (r > 0) { /* link_update() reported a change that needs a reconfigure */ + r = link_reconfigure_impl(link, /* force = */ false); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to reconfigure interface: %m"); + link_enter_failed(link); + return 0; + } + } + } + break; + + case RTM_DELLINK: /* NOTE(review): link and netdev can still be NULL here; presumably both drop functions accept NULL - confirm */ + link_drop(link); + netdev_drop(netdev); + break; + + default: + assert_not_reached(); + } + + return 1; +} + +int link_getlink_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, const char *error_msg) { /* Shared body for RTM_GETLINK reply handlers: on a netlink error it logs error_msg and fails the link, otherwise it feeds an RTM_NEWLINK reply into link_update(). Returns 1 when the link was updated, 0 in every other case. The rtnl argument is not used. */ + uint16_t message_type; + int r; + + assert(m); + assert(link); + assert(error_msg); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) /* nothing to do for failed or lingering links */ + return 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_link_message_warning_errno(link, m, r, error_msg); + link_enter_failed(link); + return 0; + } + + r = sd_netlink_message_get_type(m, &message_type); + if (r < 0) { + log_link_debug_errno(link, r, "rtnl: failed to read link message type, ignoring: %m"); + return 0; + } + if (message_type != RTM_NEWLINK) { + log_link_debug(link, "rtnl: received invalid link message type, ignoring."); + return 0; + } + + r = link_update(link, m); + if (r < 0) { + link_enter_failed(link); + return 0; + } + + return 1; +} + +int link_call_getlink(Link *link, link_netlink_message_handler_t callback) { /* Queues an asynchronous RTM_GETLINK request for the link with callback as reply handler and link_netlink_destroy_callback as destructor. Returns 0 on success, negative on error. */ + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; 
+ int r; + + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(callback); + + r = sd_rtnl_message_new_link(link->manager->rtnl, &req, RTM_GETLINK, link->ifindex); + if (r < 0) + return r; + + r = netlink_call_async(link->manager->rtnl, NULL, req, callback, + link_netlink_destroy_callback, link); + if (r < 0) + return r; + + link_ref(link); /* reference for the pending request, taken only after it was queued successfully */ + return 0; +} + +static const char* const link_state_table[_LINK_STATE_MAX] = { /* names used by link_state_to_string() and link_state_from_string() */ + [LINK_STATE_PENDING] = "pending", + [LINK_STATE_INITIALIZED] = "initialized", + [LINK_STATE_CONFIGURING] = "configuring", + [LINK_STATE_CONFIGURED] = "configured", + [LINK_STATE_UNMANAGED] = "unmanaged", + [LINK_STATE_FAILED] = "failed", + [LINK_STATE_LINGER] = "linger", +}; + +DEFINE_STRING_TABLE_LOOKUP(link_state, LinkState); + +int link_flags_to_string_alloc(uint32_t flags, char **ret) { /* Builds a comma-separated list of the names of the IFF_* bits set in flags; bits without a table entry are skipped. On success returns 0 and stores the newly allocated string in *ret (NULL when no known bit is set); returns -ENOMEM on allocation failure. */ + _cleanup_free_ char *str = NULL; + static const char* map[] = { /* indexed by bit position */ + [LOG2U(IFF_UP)] = "up", /* interface is up. */ + [LOG2U(IFF_BROADCAST)] = "broadcast", /* broadcast address valid. */ + [LOG2U(IFF_DEBUG)] = "debug", /* turn on debugging. */ + [LOG2U(IFF_LOOPBACK)] = "loopback", /* interface is a loopback net. */ + [LOG2U(IFF_POINTOPOINT)] = "point-to-point", /* interface has p-p link. */ + [LOG2U(IFF_NOTRAILERS)] = "no-trailers", /* avoid use of trailers. */ + [LOG2U(IFF_RUNNING)] = "running", /* interface RFC2863 OPER_UP. */ + [LOG2U(IFF_NOARP)] = "no-arp", /* no ARP protocol. */ + [LOG2U(IFF_PROMISC)] = "promiscuous", /* receive all packets. */ + [LOG2U(IFF_ALLMULTI)] = "all-multicast", /* receive all multicast packets. */ + [LOG2U(IFF_MASTER)] = "master", /* master of a load balancer. */ + [LOG2U(IFF_SLAVE)] = "slave", /* slave of a load balancer. */ + [LOG2U(IFF_MULTICAST)] = "multicast", /* supports multicast. */ + [LOG2U(IFF_PORTSEL)] = "portsel", /* can set media type. */ + [LOG2U(IFF_AUTOMEDIA)] = "auto-media", /* auto media select active. */ + [LOG2U(IFF_DYNAMIC)] = "dynamic", /* dialup device with changing addresses. 
*/ + [LOG2U(IFF_LOWER_UP)] = "lower-up", /* driver signals L1 up. */ + [LOG2U(IFF_DORMANT)] = "dormant", /* driver signals dormant. */ + [LOG2U(IFF_ECHO)] = "echo", /* echo sent packets. */ + }; + + assert(ret); + + for (size_t i = 0; i < ELEMENTSOF(map); i++) /* i is a bit position; holes in the table are NULL and skipped */ + if (FLAGS_SET(flags, 1 << i) && map[i]) + if (!strextend_with_separator(&str, ",", map[i])) + return -ENOMEM; + + *ret = TAKE_PTR(str); + return 0; +} + +static const char * const kernel_operstate_table[] = { /* names returned by kernel_operstate_to_string() */ + [IF_OPER_UNKNOWN] = "unknown", + [IF_OPER_NOTPRESENT] = "not-present", + [IF_OPER_DOWN] = "down", + [IF_OPER_LOWERLAYERDOWN] = "lower-layer-down", + [IF_OPER_TESTING] = "testing", + [IF_OPER_DORMANT] = "dormant", + [IF_OPER_UP] = "up", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(kernel_operstate, int); diff --git a/src/network/networkd-link.h b/src/network/networkd-link.h new file mode 100644 index 0000000..938bbf4 --- /dev/null +++ b/src/network/networkd-link.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include /* NOTE(review): the header names of these two includes are missing in this copy - restore from upstream */ + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-dhcp-client.h" +#include "sd-dhcp-server.h" +#include "sd-dhcp6-client.h" +#include "sd-ipv4acd.h" +#include "sd-ipv4ll.h" +#include "sd-lldp-rx.h" +#include "sd-lldp-tx.h" +#include "sd-ndisc.h" +#include "sd-radv.h" +#include "sd-netlink.h" + +#include "ether-addr-util.h" +#include "log-link.h" +#include "netif-util.h" +#include "network-util.h" +#include "networkd-ipv6ll.h" +#include "networkd-util.h" +#include "ordered-set.h" +#include "resolve-util.h" +#include "set.h" + +typedef enum LinkState { + LINK_STATE_PENDING, /* udev has not initialized the link */ + LINK_STATE_INITIALIZED, /* udev has initialized the link */ + LINK_STATE_CONFIGURING, /* configuring addresses, routes, etc. 
*/ + LINK_STATE_CONFIGURED, /* everything is configured */ + LINK_STATE_UNMANAGED, /* Unmanaged=yes is set */ + LINK_STATE_FAILED, /* at least one configuration process failed */ + LINK_STATE_LINGER, /* RTM_DELLINK for the link has been received */ + _LINK_STATE_MAX, + _LINK_STATE_INVALID = -EINVAL, +} LinkState; + +typedef struct Manager Manager; +typedef struct Network Network; +typedef struct NetDev NetDev; +typedef struct DUID DUID; + +typedef struct Link { /* per-interface state; instances are created by link_new() in networkd-link.c */ + Manager *manager; /* set once the link is stored in manager->links_by_index */ + + unsigned n_ref; /* reference count, starts at 1; see link_ref() and link_unref() */ + + int ifindex; + int master_ifindex; + int dsa_master_ifindex; + int sr_iov_phys_port_ifindex; + Set *sr_iov_virt_port_ifindices; + + char *ifname; /* key in manager->links_by_name */ + char **alternative_names; /* also registered in manager->links_by_name */ + char *kind; /* IFLA_INFO_KIND, NULL when the message carried none */ + unsigned short iftype; + char *state_file; /* left unset by link_new() in test mode */ + struct hw_addr_data hw_addr; + struct hw_addr_data bcast_addr; + struct hw_addr_data permanent_hw_addr; + struct hw_addr_data requested_hw_addr; + struct in6_addr ipv6ll_address; + uint32_t mtu; + uint32_t min_mtu; + uint32_t max_mtu; + uint32_t original_mtu; /* first non-zero MTU reported by the kernel */ + sd_device *dev; + char *driver; + + /* to prevent multiple ethtool calls */ + bool ethtool_driver_read; + bool ethtool_permanent_hw_addr_read; + + /* link-local addressing */ + IPv6LinkLocalAddressGenMode ipv6ll_address_gen_mode; + + /* wlan */ + enum nl80211_iftype wlan_iftype; + char *ssid; + char *previous_ssid; + struct ether_addr bssid; + + unsigned flags; + uint8_t kernel_operstate; + + sd_event_source *carrier_lost_timer; + + Network *network; + NetDev *netdev; + + LinkState state; + LinkOperationalState operstate; + LinkCarrierState carrier_state; + LinkAddressState address_state; + LinkAddressState ipv4_address_state; + LinkAddressState ipv6_address_state; + LinkOnlineState online_state; + + unsigned static_address_messages; + unsigned static_address_label_messages; + unsigned static_bridge_fdb_messages; + unsigned static_bridge_mdb_messages; + unsigned static_ipv6_proxy_ndp_messages; + unsigned static_neighbor_messages; + unsigned static_nexthop_messages; + unsigned 
static_route_messages; + unsigned static_routing_policy_rule_messages; + unsigned tc_messages; + unsigned sr_iov_messages; + unsigned set_link_messages; + unsigned set_flags_messages; + unsigned create_stacked_netdev_messages; + + Set *addresses; + Set *neighbors; + Set *routes; + Set *nexthops; + Set *qdiscs; + Set *tclasses; + + sd_dhcp_client *dhcp_client; + sd_dhcp_lease *dhcp_lease; + char *lease_file; /* left unset by link_new() in test mode */ + unsigned dhcp4_messages; + bool dhcp4_configured; + char *dhcp4_6rd_tunnel_name; + + Hashmap *ipv4acd_by_address; + + sd_ipv4ll *ipv4ll; + bool ipv4ll_address_configured:1; + + bool static_addresses_configured:1; + bool static_address_labels_configured:1; + bool static_bridge_fdb_configured:1; + bool static_bridge_mdb_configured:1; + bool static_ipv6_proxy_ndp_configured:1; + bool static_neighbors_configured:1; + bool static_nexthops_configured:1; + bool static_routes_configured:1; + bool static_routing_policy_rules_configured:1; + bool tc_configured:1; + bool sr_iov_configured:1; + bool activated:1; + bool master_set:1; + bool stacked_netdevs_created:1; + + sd_dhcp_server *dhcp_server; + + sd_ndisc *ndisc; + sd_event_source *ndisc_expire; + Set *ndisc_rdnss; + Set *ndisc_dnssl; + Set *ndisc_captive_portals; + Set *ndisc_pref64; + unsigned ndisc_messages; + bool ndisc_configured:1; + + sd_radv *radv; + + sd_dhcp6_client *dhcp6_client; + sd_dhcp6_lease *dhcp6_lease; + unsigned dhcp6_messages; + bool dhcp6_configured; + + Set *dhcp_pd_prefixes; + unsigned dhcp_pd_messages; + bool dhcp_pd_configured; + + /* This is about LLDP reception */ + sd_lldp_rx *lldp_rx; + char *lldp_file; /* left unset by link_new() in test mode; link_lldp_save() is then a no-op */ + + /* This is about LLDP transmission */ + sd_lldp_tx *lldp_tx; + + Hashmap *bound_by_links; + Hashmap *bound_to_links; + Set *slaves; + + /* For speed meter */ + struct rtnl_link_stats64 stats_old, stats_new; + bool stats_updated; + + /* All kinds of DNS configuration the user configured via D-Bus */ + struct in_addr_full **dns; + unsigned n_dns; /* initialised to UINT_MAX by link_new() */ + OrderedSet *search_domains, 
*route_domains; + + int dns_default_route; /* initialised to -1 by link_new() */ + ResolveSupport llmnr; + ResolveSupport mdns; + DnssecMode dnssec_mode; + DnsOverTlsMode dns_over_tls_mode; + Set *dnssec_negative_trust_anchors; + + /* Similar, but NTP server configuration */ + char **ntp; +} Link; + +typedef int (*link_netlink_message_handler_t)(sd_netlink*, sd_netlink_message*, Link*); /* reply handler type accepted by link_call_getlink() */ + +bool link_is_ready_to_configure(Link *link, bool allow_unmanaged); + +void link_ntp_settings_clear(Link *link); +void link_dns_settings_clear(Link *link); +Link *link_unref(Link *link); +Link *link_ref(Link *link); +DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_unref); +DEFINE_TRIVIAL_DESTRUCTOR(link_netlink_destroy_callback, Link, link_unref); /* destructor passed to netlink_call_async() by link_call_getlink() */ + +int link_get_by_index(Manager *m, int ifindex, Link **ret); +int link_get_by_name(Manager *m, const char *ifname, Link **ret); +int link_get_by_hw_addr(Manager *m, const struct hw_addr_data *hw_addr, Link **ret); +int link_get_master(Link *link, Link **ret); + +int link_getlink_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, const char *error_msg); +int link_call_getlink(Link *link, link_netlink_message_handler_t callback); +int link_handle_bound_to_list(Link *link); + +void link_enter_failed(Link *link); +void link_set_state(Link *link, LinkState state); +void link_check_ready(Link *link); + +void link_update_operstate(Link *link, bool also_update_bond_master); + +static inline bool link_has_carrier(Link *link) { /* carrier state as computed by netif_has_carrier() from the kernel operstate and the interface flags */ + assert(link); + return netif_has_carrier(link->kernel_operstate, link->flags); +} + +bool link_ipv6_enabled(Link *link); +int link_ipv6ll_gained(Link *link); +bool link_has_ipv6_connectivity(Link *link); + +int link_stop_engines(Link *link, bool may_keep_dhcp); + +const char* link_state_to_string(LinkState s) _const_; +LinkState link_state_from_string(const char *s) _pure_; + +int link_reconfigure_impl(Link *link, bool force); +int link_reconfigure(Link *link, bool force); + +int manager_udev_process_link(Manager *m, sd_device *device, 
sd_device_action_t action); +int manager_rtnl_process_link(sd_netlink *rtnl, sd_netlink_message *message, Manager *m); + +int link_flags_to_string_alloc(uint32_t flags, char **ret); +const char *kernel_operstate_to_string(int t) _const_; diff --git a/src/network/networkd-lldp-rx.c b/src/network/networkd-lldp-rx.c new file mode 100644 index 0000000..3a59884 --- /dev/null +++ b/src/network/networkd-lldp-rx.c @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include /* NOTE(review): the header names of these three includes are missing in this copy - restore from upstream */ + +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "networkd-link.h" +#include "networkd-lldp-rx.h" +#include "networkd-lldp-tx.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" + +DEFINE_CONFIG_PARSE_ENUM(config_parse_lldp_mode, lldp_mode, LLDPMode, "Failed to parse LLDP= setting."); + +static const char* const lldp_mode_table[_LLDP_MODE_MAX] = { + [LLDP_MODE_NO] = "no", + [LLDP_MODE_YES] = "yes", + [LLDP_MODE_ROUTERS_ONLY] = "routers-only", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(lldp_mode, LLDPMode, LLDP_MODE_YES); + +static bool link_lldp_rx_enabled(Link *link) { /* True when LLDP reception applies to this link: not loopback, ARPHRD_ETHER, has a Network, is not a bridge or bond, and the configured LLDP mode is not LLDP_MODE_NO. */ + assert(link); + + if (link->flags & IFF_LOOPBACK) + return false; + + if (link->iftype != ARPHRD_ETHER) + return false; + + if (!link->network) + return false; + + /* LLDP should be handled on bridge and bond slaves as those have a direct connection to their peers, + * not on the bridge/bond master. Linux doesn't even (by default) forward lldp packets to the bridge + * master. 
*/ + if (link->kind && STR_IN_SET(link->kind, "bridge", "bond")) + return false; + + return link->network->lldp_mode != LLDP_MODE_NO; +} + +static void lldp_rx_handler(sd_lldp_rx *lldp_rx, sd_lldp_rx_event_t event, sd_lldp_neighbor *n, void *userdata) { /* LLDP Rx event callback (userdata is the Link): rewrites the on-disk neighbor file on every event and restarts LLDP transmission when a neighbor was added. */ + Link *link = ASSERT_PTR(userdata); + int r; + + (void) link_lldp_save(link); + + if (link->lldp_tx && event == SD_LLDP_RX_EVENT_ADDED) { + /* If we received information about a new neighbor, restart the LLDP "fast" logic */ + + log_link_debug(link, "Received LLDP datagram from previously unknown neighbor, restarting 'fast' LLDP transmission."); + + (void) sd_lldp_tx_stop(link->lldp_tx); + r = sd_lldp_tx_start(link->lldp_tx); + if (r < 0) + log_link_warning_errno(link, r, "Failed to restart LLDP transmission: %m"); + } +} + +int link_lldp_rx_configure(Link *link) { /* Creates and configures link->lldp_rx when LLDP reception is enabled for the link. Returns 0 on success or when disabled, -EBUSY when a receiver already exists, other negative values on error. */ + int r; + + if (!link_lldp_rx_enabled(link)) + return 0; + + if (link->lldp_rx) + return -EBUSY; + + r = sd_lldp_rx_new(&link->lldp_rx); + if (r < 0) + return r; + + r = sd_lldp_rx_attach_event(link->lldp_rx, link->manager->event, 0); + if (r < 0) + return r; + + r = sd_lldp_rx_set_ifindex(link->lldp_rx, link->ifindex); + if (r < 0) + return r; + + r = sd_lldp_rx_match_capabilities(link->lldp_rx, + link->network->lldp_mode == LLDP_MODE_ROUTERS_ONLY ? + SD_LLDP_SYSTEM_CAPABILITIES_ALL_ROUTERS : + SD_LLDP_SYSTEM_CAPABILITIES_ALL); + if (r < 0) + return r; + + r = sd_lldp_rx_set_filter_address(link->lldp_rx, &link->hw_addr.ether); + if (r < 0) + return r; + + r = sd_lldp_rx_set_callback(link->lldp_rx, lldp_rx_handler, link); + if (r < 0) + return r; + + return 0; +} + +int link_lldp_save(Link *link) { /* Writes the raw data of all neighbors known to link->lldp_rx into link->lldp_file, each record prefixed with its size as a little-endian 64-bit integer. The file is removed when there is no receiver or no neighbor. The data goes to a temporary file first and is then moved into place. */ + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + sd_lldp_neighbor **l = NULL; + int n = 0, r, i; + + assert(link); + + if (isempty(link->lldp_file)) + return 0; /* Do not update state file when running in test mode. 
*/ + + if (!link->lldp_rx) { + (void) unlink(link->lldp_file); + return 0; + } + + r = sd_lldp_rx_get_neighbors(link->lldp_rx, &l); + if (r < 0) + return r; + if (r == 0) { + (void) unlink(link->lldp_file); + return 0; + } + + n = r; /* a positive return value is used as the number of entries in l */ + + r = fopen_temporary(link->lldp_file, &f, &temp_path); + if (r < 0) + goto finish; + + (void) fchmod(fileno(f), 0644); + + for (i = 0; i < n; i++) { + const void *p; + le64_t u; + size_t sz; + + r = sd_lldp_neighbor_get_raw(l[i], &p, &sz); + if (r < 0) + goto finish; + + u = htole64(sz); + (void) fwrite(&u, 1, sizeof(u), f); + (void) fwrite(p, 1, sz, f); /* fwrite results are ignored here; presumably fflush_and_check() below reports stream errors - confirm */ + } + + r = fflush_and_check(f); + if (r < 0) + goto finish; + + r = conservative_rename(temp_path, link->lldp_file); + if (r < 0) + goto finish; + +finish: + if (r < 0) + log_link_error_errno(link, r, "Failed to save LLDP data to %s: %m", link->lldp_file); + + if (l) { /* each array element is unreferenced, then the array itself is freed */ + for (i = 0; i < n; i++) + sd_lldp_neighbor_unref(l[i]); + free(l); + } + + return r; +} diff --git a/src/network/networkd-lldp-rx.h b/src/network/networkd-lldp-rx.h new file mode 100644 index 0000000..22f6602 --- /dev/null +++ b/src/network/networkd-lldp-rx.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +typedef struct Link Link; + +typedef enum LLDPMode { + LLDP_MODE_NO = 0, + LLDP_MODE_YES = 1, + LLDP_MODE_ROUTERS_ONLY = 2, + _LLDP_MODE_MAX, + _LLDP_MODE_INVALID = -EINVAL, +} LLDPMode; + +int link_lldp_rx_configure(Link *link); +int link_lldp_save(Link *link); + +const char* lldp_mode_to_string(LLDPMode m) _const_; +LLDPMode lldp_mode_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_lldp_mode); diff --git a/src/network/networkd-lldp-tx.c b/src/network/networkd-lldp-tx.c new file mode 100644 index 0000000..fc9196f --- /dev/null +++ b/src/network/networkd-lldp-tx.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include /* NOTE(review): the header names of these two includes are missing in this copy - restore from upstream */ + +#include "sd-lldp-tx.h" + +#include "networkd-link.h" +#include 
"networkd-lldp-tx.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" + +static bool link_lldp_tx_enabled(Link *link) { /* True for non-loopback ARPHRD_ETHER links with a Network that are not a bridge or bond and whose lldp_multicast_mode lies within the valid range; a negative mode therefore disables transmission. */ + assert(link); + + if (link->flags & IFF_LOOPBACK) + return false; + + if (link->iftype != ARPHRD_ETHER) + return false; + + if (!link->network) + return false; + + if (link->kind && STR_IN_SET(link->kind, "bridge", "bond")) + return false; + + return link->network->lldp_multicast_mode >= 0 && + link->network->lldp_multicast_mode < _SD_LLDP_MULTICAST_MODE_MAX; +} + +int link_lldp_tx_configure(Link *link) { /* Creates and configures link->lldp_tx when LLDP transmission is enabled for the link. Returns 0 on success or when disabled, -EBUSY when a transmitter already exists, other negative values on error. */ + int r; + + assert(link); + + if (!link_lldp_tx_enabled(link)) + return 0; + + if (link->lldp_tx) + return -EBUSY; + + r = sd_lldp_tx_new(&link->lldp_tx); + if (r < 0) + return r; + + r = sd_lldp_tx_attach_event(link->lldp_tx, link->manager->event, 0); + if (r < 0) + return r; + + r = sd_lldp_tx_set_ifindex(link->lldp_tx, link->ifindex); + if (r < 0) + return r; + + r = sd_lldp_tx_set_hwaddr(link->lldp_tx, &link->hw_addr.ether); + if (r < 0) + return r; + + assert(link->network); + + r = sd_lldp_tx_set_multicast_mode(link->lldp_tx, link->network->lldp_multicast_mode); + if (r < 0) + return r; + + r = sd_lldp_tx_set_capabilities(link->lldp_tx, /* second argument: station, bridge and router; third: router when IP forwarding is configured, station otherwise */ + SD_LLDP_SYSTEM_CAPABILITIES_STATION | + SD_LLDP_SYSTEM_CAPABILITIES_BRIDGE | + SD_LLDP_SYSTEM_CAPABILITIES_ROUTER, + (link->network->ip_forward != ADDRESS_FAMILY_NO) ? 
+ SD_LLDP_SYSTEM_CAPABILITIES_ROUTER : + SD_LLDP_SYSTEM_CAPABILITIES_STATION); + if (r < 0) + return r; + + r = sd_lldp_tx_set_port_description(link->lldp_tx, link->network->description); + if (r < 0) + return r; + + r = sd_lldp_tx_set_mud_url(link->lldp_tx, link->network->lldp_mudurl); + if (r < 0) + return r; + + return 0; +} + +static const char * const lldp_multicast_mode_table[_SD_LLDP_MULTICAST_MODE_MAX] = { /* names accepted by config_parse_lldp_multicast_mode() in addition to booleans */ + [SD_LLDP_MULTICAST_MODE_NEAREST_BRIDGE] = "nearest-bridge", + [SD_LLDP_MULTICAST_MODE_NON_TPMR_BRIDGE] = "non-tpmr-bridge", + [SD_LLDP_MULTICAST_MODE_CUSTOMER_BRIDGE] = "customer-bridge", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(lldp_multicast_mode, sd_lldp_multicast_mode_t); + +int config_parse_lldp_multicast_mode( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { /* Config parser storing an LLDP multicast mode into *data: an empty value or a false boolean stores the invalid mode, a true boolean selects nearest-bridge, otherwise the value must be one of the table names. Unparsable values are logged and ignored. Always returns 0. */ + + sd_lldp_multicast_mode_t m, *mode = ASSERT_PTR(data); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *mode = _SD_LLDP_MULTICAST_MODE_INVALID; + return 0; + } + + r = parse_boolean(rvalue); + if (r >= 0) { + *mode = r == 0 ? 
_SD_LLDP_MULTICAST_MODE_INVALID : SD_LLDP_MULTICAST_MODE_NEAREST_BRIDGE; + return 0; + } + + m = lldp_multicast_mode_from_string(rvalue); + if (m < 0) { /* unknown name: warn and leave *mode untouched */ + log_syntax(unit, LOG_WARNING, filename, line, m, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + *mode = m; + return 0; +} diff --git a/src/network/networkd-lldp-tx.h b/src/network/networkd-lldp-tx.h new file mode 100644 index 0000000..73757f1 --- /dev/null +++ b/src/network/networkd-lldp-tx.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +typedef struct Link Link; + +int link_lldp_tx_configure(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_lldp_multicast_mode); diff --git a/src/network/networkd-manager-bus.c b/src/network/networkd-manager-bus.c new file mode 100644 index 0000000..aecbc1d --- /dev/null +++ b/src/network/networkd-manager-bus.c @@ -0,0 +1,425 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include /* NOTE(review): the header names of these three includes are missing in this copy - restore from upstream */ + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-message-util.h" +#include "bus-polkit.h" +#include "networkd-dhcp-server-bus.h" +#include "networkd-dhcp4-bus.h" +#include "networkd-dhcp6-bus.h" +#include "networkd-json.h" +#include "networkd-link-bus.h" +#include "networkd-link.h" +#include "networkd-manager-bus.h" +#include "networkd-manager.h" +#include "networkd-network-bus.h" +#include "path-util.h" +#include "strv.h" +#include "user-util.h" + +static int method_list_links(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* D-Bus method handler (userdata is the Manager): replies with an array of (ifindex, ifname, object path) structs, one per entry of manager->links_by_index. */ + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Manager *manager = userdata; + Link *link; + int r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(iso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(link, manager->links_by_index) { + _cleanup_free_ char *path = NULL; + + path = link_bus_path(link); + 
if (!path) + return -ENOMEM; + + r = sd_bus_message_append( + reply, "(iso)", + link->ifindex, + link->ifname, + empty_to_root(path)); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_link_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* D-Bus method handler (userdata is the Manager): reads an interface name and replies with the ifindex and object path of the matching link, or sets BUS_ERROR_NO_SUCH_LINK when the lookup fails. */ + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *path = NULL; + Manager *manager = userdata; + const char *name; + Link *link; + int r; + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + if (link_get_by_name(manager, name, &link) < 0) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_LINK, "Link %s not known", name); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + path = link_bus_path(link); + if (!path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "io", link->ifindex, empty_to_root(path)); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_get_link_by_index(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* D-Bus method handler (userdata is the Manager): reads an ifindex and replies with the interface name and object path of the matching link, or sets BUS_ERROR_NO_SUCH_LINK when the lookup fails. */ + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *path = NULL; + Manager *manager = userdata; + int ifindex, r; + Link *link; + + r = bus_message_read_ifindex(message, error, &ifindex); + if (r < 0) + return r; + + r = link_get_by_index(manager, ifindex, &link); + if (r < 0) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_LINK, "Link %i not known", ifindex); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + path = link_bus_path(link); + if (!path) + return -ENOMEM; + + r = sd_bus_message_append(reply, "so", link->ifname, empty_to_root(path)); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int call_link_method(Manager *m, sd_bus_message *message, sd_bus_message_handler_t handler, sd_bus_error *error) { /* Reads an ifindex from the message, resolves it to a Link and forwards the call to handler with that Link as userdata; sets BUS_ERROR_NO_SUCH_LINK when the lookup fails. */ + 
int ifindex, r; + Link *l; + + assert(m); + assert(message); + assert(handler); + + r = bus_message_read_ifindex(message, error, &ifindex); + if (r < 0) + return r; + + r = link_get_by_index(m, ifindex, &l); + if (r < 0) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_LINK, "Link %i not known", ifindex); + + return handler(message, l, error); +} + /* Thin wrappers that expose the bus_link_method_*() handlers on the manager object. Each one forwards to call_link_method(), which resolves the link from the leading ifindex argument. */ +static int bus_method_set_link_ntp_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_ntp_servers, error); +} + +static int bus_method_set_link_dns_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dns_servers, error); +} + +static int bus_method_set_link_dns_servers_ex(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dns_servers_ex, error); +} + +static int bus_method_set_link_domains(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_domains, error); +} + +static int bus_method_set_link_default_route(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_default_route, error); +} + +static int bus_method_set_link_llmnr(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_llmnr, error); +} + +static int bus_method_set_link_mdns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_mdns, error); +} + +static int bus_method_set_link_dns_over_tls(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dns_over_tls, error); +} + +static int bus_method_set_link_dnssec(sd_bus_message *message, void
*userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dnssec, error); +} + +static int bus_method_set_link_dnssec_negative_trust_anchors(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dnssec_negative_trust_anchors, error); +} + +static int bus_method_revert_link_ntp(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_revert_ntp, error); +} + +static int bus_method_revert_link_dns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_revert_dns, error); +} + +static int bus_method_renew_link(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_renew, error); +} + +static int bus_method_force_renew_link(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_force_renew, error); +} + +static int bus_method_reconfigure_link(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_reconfigure, error); +} + /* Reload(): authorizes the caller through polkit (action "org.freedesktop.network1.reload", capability CAP_NET_ADMIN), then calls manager_reload() and sends an empty reply. While the polkit query is still pending it returns 1 without replying. */ +static int bus_method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *manager = userdata; + int r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.network1.reload", + NULL, true, UID_INVALID, + &manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + r = manager_reload(manager); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} + /* DescribeLink(): per-link variant, dispatched through call_link_method() like the other link methods. */ +static int bus_method_describe_link(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_describe, error); +} + +static int
bus_method_describe(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_free_ char *text = NULL; + Manager *manager = ASSERT_PTR(userdata); + int r; + /* Describe(): builds the manager's JSON representation with manager_build_json(), formats it to text and returns that text as the single string result. */ + assert(message); + + r = manager_build_json(manager, &v); + if (r < 0) + return log_error_errno(r, "Failed to build JSON data: %m"); + + r = json_variant_format(v, 0, &text); + if (r < 0) + return log_error_errno(r, "Failed to format JSON data: %m"); + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", text); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + /* Property getter for NamespaceId; userdata is not used. */ +static int property_get_namespace_id( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t id = 0; + struct stat st; + + assert(bus); + assert(reply); + + /* Returns our own network namespace ID, i.e. the inode number of /proc/self/ns/net. This allows + * unprivileged clients to determine whether they are in the same network namespace as us (note that + * access to that path is restricted, thus they can't check directly unless privileged).
*/ + + if (stat("/proc/self/ns/net", &st) < 0) { + log_warning_errno(errno, "Failed to stat network namespace, ignoring: %m"); + id = 0; + } else + id = st.st_ino; + /* A stat() failure is not propagated: the property then reads as 0. */ + return sd_bus_message_append(reply, "t", id); +} + /* Vtable of the manager bus object: state properties (all emitting change signals except the constant NamespaceId), link lookup methods, and the per-link methods that take an ifindex as first argument. Every method is marked SD_BUS_VTABLE_UNPRIVILEGED. */ +static const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("OperationalState", "s", property_get_operational_state, offsetof(Manager, operational_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CarrierState", "s", property_get_carrier_state, offsetof(Manager, carrier_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("AddressState", "s", property_get_address_state, offsetof(Manager, address_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IPv4AddressState", "s", property_get_address_state, offsetof(Manager, ipv4_address_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("IPv6AddressState", "s", property_get_address_state, offsetof(Manager, ipv6_address_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("OnlineState", "s", property_get_online_state, offsetof(Manager, online_state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NamespaceId", "t", property_get_namespace_id, 0, SD_BUS_VTABLE_PROPERTY_CONST), + + SD_BUS_METHOD_WITH_ARGS("ListLinks", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(iso)", links), + method_list_links, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetLinkByName", + SD_BUS_ARGS("s", name), + SD_BUS_RESULT("i", ifindex, "o", path), + method_get_link_by_name, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetLinkByIndex", + SD_BUS_ARGS("i", ifindex), + SD_BUS_RESULT("s", name, "o", path), + method_get_link_by_index, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkNTP", + SD_BUS_ARGS("i", ifindex, "as", servers), + SD_BUS_NO_RESULT, + bus_method_set_link_ntp_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNS", + SD_BUS_ARGS("i", ifindex, "a(iay)",
addresses), + SD_BUS_NO_RESULT, + bus_method_set_link_dns_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + /* Extended variant of SetLinkDNS: each array entry carries two more fields ("q" and "s") - presumably port and server name; confirm against bus_link_method_set_dns_servers_ex(). */ SD_BUS_METHOD_WITH_ARGS("SetLinkDNSEx", + SD_BUS_ARGS("i", ifindex, "a(iayqs)", addresses), + SD_BUS_NO_RESULT, + bus_method_set_link_dns_servers_ex, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDomains", + SD_BUS_ARGS("i", ifindex, "a(sb)", domains), + SD_BUS_NO_RESULT, + bus_method_set_link_domains, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDefaultRoute", + SD_BUS_ARGS("i", ifindex, "b", enable), + SD_BUS_NO_RESULT, + bus_method_set_link_default_route, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkLLMNR", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_llmnr, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkMulticastDNS", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_mdns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSOverTLS", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_dns_over_tls, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSSEC", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_dnssec, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSSECNegativeTrustAnchors", + SD_BUS_ARGS("i", ifindex, "as", names), + SD_BUS_NO_RESULT, + bus_method_set_link_dnssec_negative_trust_anchors, + SD_BUS_VTABLE_UNPRIVILEGED), + /* The remaining per-link methods take only the interface index and return nothing. */ SD_BUS_METHOD_WITH_ARGS("RevertLinkNTP", + SD_BUS_ARGS("i", ifindex), + SD_BUS_NO_RESULT, + bus_method_revert_link_ntp, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RevertLinkDNS", + SD_BUS_ARGS("i", ifindex), + SD_BUS_NO_RESULT, + bus_method_revert_link_dns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RenewLink", + SD_BUS_ARGS("i", ifindex), + SD_BUS_NO_RESULT, + bus_method_renew_link, + SD_BUS_VTABLE_UNPRIVILEGED), +
SD_BUS_METHOD_WITH_ARGS("ForceRenewLink", + SD_BUS_ARGS("i", ifindex), + SD_BUS_NO_RESULT, + bus_method_force_renew_link, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReconfigureLink", + SD_BUS_ARGS("i", ifindex), + SD_BUS_NO_RESULT, + bus_method_reconfigure_link, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Reload", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_method_reload, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DescribeLink", + SD_BUS_ARGS("i", ifindex), + SD_BUS_RESULT("s", json), + bus_method_describe_link, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Describe", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", json), + bus_method_describe, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + /* Emits a PropertiesChanged signal for the named properties of the manager object. Returns 0 without sending anything while the bus connection is not ready. */ +int manager_send_changed_strv(Manager *manager, char **properties) { + assert(manager); + assert(properties); + + if (sd_bus_is_ready(manager->bus) <= 0) + return 0; + + return sd_bus_emit_properties_changed_strv( + manager->bus, + "/org/freedesktop/network1", + "org.freedesktop.network1.Manager", + properties); +} + /* Bus object description of the manager: object path, interface name, its vtable, and the child implementations registered along with it. */ +const BusObjectImplementation manager_object = { + "/org/freedesktop/network1", + "org.freedesktop.network1.Manager", + .vtables = BUS_VTABLES(manager_vtable), + .children = BUS_IMPLEMENTATIONS( + &link_object, /* This is the main implementation for /org/freedesktop/network1/link, + * and must be earlier than the dhcp objects below.
*/ + &dhcp_server_object, + &dhcp_client_object, + &dhcp6_client_object, + &network_object), +}; diff --git a/src/network/networkd-manager-bus.h b/src/network/networkd-manager-bus.h new file mode 100644 index 0000000..5cd7f16 --- /dev/null +++ b/src/network/networkd-manager-bus.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" + +typedef struct Manager Manager; + +extern const BusObjectImplementation manager_object; + +int manager_send_changed_strv(Manager *m, char **properties); diff --git a/src/network/networkd-manager.c b/src/network/networkd-manager.c new file mode 100644 index 0000000..c09dcfb --- /dev/null +++ b/src/network/networkd-manager.c @@ -0,0 +1,1108 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-log-control-api.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "common-signal.h" +#include "conf-parser.h" +#include "constants.h" +#include "daemon-util.h" +#include "device-private.h" +#include "device-util.h" +#include "dns-domain.h" +#include "fd-util.h" +#include "fileio.h" +#include "firewall-util.h" +#include "fs-util.h" +#include "initrd-util.h" +#include "local-addresses.h" +#include "netlink-util.h" +#include "network-internal.h" +#include "networkd-address-pool.h" +#include "networkd-address.h" +#include "networkd-dhcp-server-bus.h" +#include "networkd-dhcp6.h" +#include "networkd-link-bus.h" +#include "networkd-manager-bus.h" +#include "networkd-manager.h" +#include "networkd-neighbor.h" +#include "networkd-network-bus.h" +#include "networkd-nexthop.h" +#include "networkd-queue.h" +#include "networkd-route.h" +#include "networkd-routing-policy-rule.h" +#include "networkd-speed-meter.h" +#include "networkd-state-file.h" +#include "networkd-wifi.h" +#include
"networkd-wiphy.h" +#include "ordered-set.h" +#include "path-lookup.h" +#include "path-util.h" +#include "qdisc.h" +#include "selinux-util.h" +#include "set.h" +#include "signal-util.h" +#include "stat-util.h" +#include "strv.h" +#include "sysctl-util.h" +#include "tclass.h" +#include "tmpfile-util.h" +#include "tuntap.h" +#include "udev-util.h" + +/* use 128 MB for receive socket kernel queue. */ +#define RCVBUF_SIZE (128*1024*1024) + /* Handler for the PrepareForSleep signal. A true argument is ignored; on false (coming back from suspend) every known link is force-reconfigured, and a link that fails to reconfigure is put into the failed state. Always returns 0. */ +static int match_prepare_for_sleep(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + Manager *m = ASSERT_PTR(userdata); + Link *link; + int b, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (b) + return 0; + + log_debug("Coming back from suspend, reconfiguring all connections..."); + + HASHMAP_FOREACH(link, m->links_by_index) { + r = link_reconfigure(link, /* force = */ true); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to reconfigure interface: %m"); + link_enter_failed(link); + } + } + + return 0; +} + /* Handler for the local Connected signal: re-issues the hostname, timezone and product UUID requests recorded in the Manager; failures are ignored. */ +static int on_connected(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + Manager *m = ASSERT_PTR(userdata); + + assert(message); + + /* Did we get a timezone or transient hostname from DHCP while D-Bus wasn't up yet?
*/ + if (m->dynamic_hostname) + (void) manager_set_hostname(m, m->dynamic_hostname); + if (m->dynamic_timezone) + (void) manager_set_timezone(m, m->dynamic_timezone); + if (m->product_uuid_requested) + (void) manager_request_product_uuid(m); + + return 0; +} + /* Opens the system bus connection, registers the manager object tree and the log control API, requests the org.freedesktop.network1 name, attaches the bus to the event loop and subscribes to the Connected and PrepareForSleep signals. A failed PrepareForSleep subscription is only logged. */ +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-network"); + if (r < 0) + return log_error_errno(r, "Failed to connect to bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.network1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + r = sd_bus_match_signal_async( + m->bus, + NULL, + "org.freedesktop.DBus.Local", + NULL, + "org.freedesktop.DBus.Local", + "Connected", + on_connected, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to request match on Connected signal: %m"); + + r = bus_match_signal_async( + m->bus, + NULL, + bus_login_mgr, + "PrepareForSleep", + match_prepare_for_sleep, NULL, m); + if (r < 0) + log_warning_errno(r, "Failed to request match for PrepareForSleep, ignoring: %m"); + + return 0; +} + /* Device monitor callback: dispatches a uevent to the link, wiphy or rfkill handler according to the device's subsystem. Devices of any other subsystem are ignored, and handler failures are logged but not propagated. */ +static int manager_process_uevent(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + sd_device_action_t action; + const char *s; + int r; + + assert(device); + + r = sd_device_get_action(device, &action); + if (r < 0) + return log_device_warning_errno(device, r, "Failed to get udev action, ignoring: %m"); + + r = sd_device_get_subsystem(device, &s); + if (r < 0) + return log_device_warning_errno(device, r, "Failed to get subsystem, ignoring: %m"); + + if (streq(s, "net")) + r =
manager_udev_process_link(m, device, action); + else if (streq(s, "ieee80211")) + r = manager_udev_process_wiphy(m, device, action); + else if (streq(s, "rfkill")) + r = manager_udev_process_rfkill(m, device, action); + else { + log_device_debug(device, "Received device with unexpected subsystem \"%s\", ignoring.", s); + return 0; + } + if (r < 0) + log_device_warning_errno(device, r, "Failed to process \"%s\" uevent, ignoring: %m", + device_action_to_string(action)); + + return 0; +} + /* Creates a device monitor filtered to the net, ieee80211 and rfkill subsystems, attaches it to the event loop and starts it with manager_process_uevent() as callback. Returns 0 without doing anything when udev is not available. */ +static int manager_connect_udev(Manager *m) { + int r; + + /* udev does not initialize devices inside containers, so we rely on them being already + * initialized before entering the container. */ + if (!udev_available()) + return 0; + + r = sd_device_monitor_new(&m->device_monitor); + if (r < 0) + return log_error_errno(r, "Failed to initialize device monitor: %m"); + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_monitor, "net", NULL); + if (r < 0) + return log_error_errno(r, "Could not add device monitor filter for net subsystem: %m"); + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_monitor, "ieee80211", NULL); + if (r < 0) + return log_error_errno(r, "Could not add device monitor filter for ieee80211 subsystem: %m"); + + r = sd_device_monitor_filter_add_match_subsystem_devtype(m->device_monitor, "rfkill", NULL); + if (r < 0) + return log_error_errno(r, "Could not add device monitor filter for rfkill subsystem: %m"); + + r = sd_device_monitor_attach_event(m->device_monitor, m->event); + if (r < 0) + return log_error_errno(r, "Failed to attach event to device monitor: %m"); + + r = sd_device_monitor_start(m->device_monitor, manager_process_uevent, m); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + + return 0; +} + /* Sorts the file descriptors obtained from sd_listen_fds_with_names(). The first AF_NETLINK raw socket is stored in *ret_rtnl_fd (-EBADF when there is none) and further netlink sockets are closed. Every other fd is offered to manager_add_tuntap_fd() and closed if that fails. */ +static int manager_listen_fds(Manager *m, int *ret_rtnl_fd) { + _cleanup_strv_free_ char **names = NULL; + int n, rtnl_fd = -EBADF; + + assert(m); + assert(ret_rtnl_fd); + + n =
sd_listen_fds_with_names(/* unset_environment = */ true, &names); + if (n < 0) + return n; + + if (strv_length(names) != (size_t) n) + return -EINVAL; + + for (int i = 0; i < n; i++) { + int fd = i + SD_LISTEN_FDS_START; + + if (sd_is_socket(fd, AF_NETLINK, SOCK_RAW, -1) > 0) { + if (rtnl_fd >= 0) { + log_debug("Received multiple netlink socket, ignoring."); + safe_close(fd); + continue; + } + + rtnl_fd = fd; + continue; + } + + if (manager_add_tuntap_fd(m, fd, names[i]) >= 0) + continue; + + if (m->test_mode) + safe_close(fd); + else + close_and_notify_warn(fd, names[i]); + } + + *ret_rtnl_fd = rtnl_fd; + return 0; +} + /* Opens the generic netlink socket, tries to enlarge its receive buffer (failure is only logged), attaches it to the event loop and subscribes to the nl80211 config and MLME multicast groups. -EOPNOTSUPP from either subscription is tolerated. */ +static int manager_connect_genl(Manager *m) { + int r; + + assert(m); + + r = sd_genl_socket_open(&m->genl); + if (r < 0) + return r; + + r = sd_netlink_increase_rxbuf(m->genl, RCVBUF_SIZE); + if (r < 0) + log_warning_errno(r, "Failed to increase receive buffer size for general netlink socket, ignoring: %m"); + + r = sd_netlink_attach_event(m->genl, m->event, 0); + if (r < 0) + return r; + + r = genl_add_match(m->genl, NULL, NL80211_GENL_NAME, NL80211_MULTICAST_GROUP_CONFIG, 0, + &manager_genl_process_nl80211_config, NULL, m, "network-genl_process_nl80211_config"); + if (r < 0 && r != -EOPNOTSUPP) + return r; + + r = genl_add_match(m->genl, NULL, NL80211_GENL_NAME, NL80211_MULTICAST_GROUP_MLME, 0, + &manager_genl_process_nl80211_mlme, NULL, m, "network-genl_process_nl80211_mlme"); + if (r < 0 && r != -EOPNOTSUPP) + return r; + + return 0; +} + /* Attaches a classic BPF socket filter to the rtnl socket. Going by the per-instruction comments it rejects truncated messages, accepts multipart messages and all non-neighbor message types, and lets RTM_NEWNEIGH/RTM_DELNEIGH through only when the neighbor state has NUD_PERMANENT set. */ +static int manager_setup_rtnl_filter(Manager *manager) { + struct sock_filter filter[] = { + /* Check the packet length. */ + BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0), /* A <- packet length */ + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, sizeof(struct nlmsghdr), 1, 0), /* A (packet length) >= sizeof(struct nlmsghdr) ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* reject */ + /* Always accept multipart message.
*/ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct nlmsghdr, nlmsg_flags)), /* A <- message flags */ + BPF_JUMP(BPF_JMP + BPF_JSET + BPF_K, htobe16(NLM_F_MULTI), 0, 1), /* message flags has NLM_F_MULTI ? */ + BPF_STMT(BPF_RET + BPF_K, UINT32_MAX), /* accept */ + /* Accept all message types except for RTM_NEWNEIGH or RTM_DELNEIGH. */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct nlmsghdr, nlmsg_type)), /* A <- message type */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, htobe16(RTM_NEWNEIGH), 2, 0), /* message type == RTM_NEWNEIGH ? */ + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, htobe16(RTM_DELNEIGH), 1, 0), /* message type == RTM_DELNEIGH ? */ + BPF_STMT(BPF_RET + BPF_K, UINT32_MAX), /* accept */ + /* Check the packet length. */ + BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0), /* A <- packet length */ + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, sizeof(struct nlmsghdr) + sizeof(struct ndmsg), 1, 0), + /* packet length >= sizeof(struct nlmsghdr) + sizeof(struct ndmsg) ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* reject */ + /* Reject the message when the neighbor state does not have NUD_PERMANENT flag. */ + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, sizeof(struct nlmsghdr) + offsetof(struct ndmsg, ndm_state)), + /* A <- neighbor state */ + BPF_JUMP(BPF_JMP + BPF_JSET + BPF_K, htobe16(NUD_PERMANENT), 1, 0), /* neighbor state has NUD_PERMANENT ? */ + BPF_STMT(BPF_RET + BPF_K, 0), /* reject */ + BPF_STMT(BPF_RET + BPF_K, UINT32_MAX), /* accept */ + }; + /* NOTE(review): the 16-bit constants are wrapped in htobe16(), presumably because BPF_LD + BPF_H loads convert from network byte order while nlmsghdr fields are in host order - confirm against the classic BPF documentation. */ + assert(manager); + assert(manager->rtnl); + + return sd_netlink_attach_filter(manager->rtnl, ELEMENTSOF(filter), filter); +} + /* Sets up the rtnl socket, adopting fd when it is >= 0 and opening a new one otherwise, attaches it to the event loop, registers the RTM_* handlers and finally installs the socket filter. fd is closed automatically if opening fails. */ +static int manager_connect_rtnl(Manager *m, int fd) { + _unused_ _cleanup_close_ int fd_close = fd; + int r; + + assert(m); + + /* This takes input fd.
*/ + + if (fd < 0) + r = sd_netlink_open(&m->rtnl); + else + r = sd_netlink_open_fd(&m->rtnl, fd); + if (r < 0) + return r; + TAKE_FD(fd_close); + + /* Bump receiver buffer, but only if we are not called via socket activation, as in that + * case systemd sets the receive buffer size for us, and the value in the .socket unit + * should take full effect. */ + if (fd < 0) { + r = sd_netlink_increase_rxbuf(m->rtnl, RCVBUF_SIZE); + if (r < 0) + log_warning_errno(r, "Failed to increase receive buffer size for rtnl socket, ignoring: %m"); + } + + r = sd_netlink_attach_event(m->rtnl, m->event, 0); + if (r < 0) + return r; + /* One match per message type follows; the NEW and DEL variant of each object kind share the same callback. */ + r = netlink_add_match(m->rtnl, NULL, RTM_NEWLINK, &manager_rtnl_process_link, NULL, m, "network-rtnl_process_link"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELLINK, &manager_rtnl_process_link, NULL, m, "network-rtnl_process_link"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWQDISC, &manager_rtnl_process_qdisc, NULL, m, "network-rtnl_process_qdisc"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELQDISC, &manager_rtnl_process_qdisc, NULL, m, "network-rtnl_process_qdisc"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWTCLASS, &manager_rtnl_process_tclass, NULL, m, "network-rtnl_process_tclass"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELTCLASS, &manager_rtnl_process_tclass, NULL, m, "network-rtnl_process_tclass"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWADDR, &manager_rtnl_process_address, NULL, m, "network-rtnl_process_address"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELADDR, &manager_rtnl_process_address, NULL, m, "network-rtnl_process_address"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWNEIGH, &manager_rtnl_process_neighbor, NULL, m, "network-rtnl_process_neighbor"); + if (r < 0) + return r; + + r =
netlink_add_match(m->rtnl, NULL, RTM_DELNEIGH, &manager_rtnl_process_neighbor, NULL, m, "network-rtnl_process_neighbor"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWROUTE, &manager_rtnl_process_route, NULL, m, "network-rtnl_process_route"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELROUTE, &manager_rtnl_process_route, NULL, m, "network-rtnl_process_route"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWRULE, &manager_rtnl_process_rule, NULL, m, "network-rtnl_process_rule"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELRULE, &manager_rtnl_process_rule, NULL, m, "network-rtnl_process_rule"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_NEWNEXTHOP, &manager_rtnl_process_nexthop, NULL, m, "network-rtnl_process_nexthop"); + if (r < 0) + return r; + + r = netlink_add_match(m->rtnl, NULL, RTM_DELNEXTHOP, &manager_rtnl_process_nexthop, NULL, m, "network-rtnl_process_nexthop"); + if (r < 0) + return r; + + return manager_setup_rtnl_filter(m); +} + /* Event source handler: saves the manager state when m->dirty is set and calls link_save_and_clean() on every link in m->dirty_links. Failures are only logged; always returns 1. */ +static int manager_dirty_handler(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Link *link; + int r; + + if (m->dirty) { + r = manager_save(m); + if (r < 0) + log_warning_errno(r, "Failed to update state file %s, ignoring: %m", m->state_file); + } + + SET_FOREACH(link, m->dirty_links) { + r = link_save_and_clean(link); + if (r < 0) + log_link_warning_errno(link, r, "Failed to update link state file %s, ignoring: %m", link->state_file); + } + + return 1; +} + /* Signal handler: clears m->restarting and asks the event loop to exit with code 0. */ +static int signal_terminate_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + m->restarting = false; + + log_debug("Terminate operation initiated."); + + return sd_event_exit(sd_event_source_get_event(s), 0); +} + /* Signal handler: sets m->restarting and asks the event loop to exit with code 0. */ +static int signal_restart_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m =
ASSERT_PTR(userdata); + + m->restarting = true; + + log_debug("Restart operation initiated."); + + return sd_event_exit(sd_event_source_get_event(s), 0); +} + /* Signal handler: triggers a configuration reload. The result of manager_reload() is deliberately discarded (hence the explicit void cast) and 0 is returned either way. */ +static int signal_reload_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + (void) manager_reload(m); + + return 0; +} + /* Chooses the default keep-configuration mode: DHCP-on-stop when running in the initrd, "yes" when the root file system is a network file system, and the invalid (unset) value otherwise. */ +static int manager_set_keep_configuration(Manager *m) { + int r; + + assert(m); + + if (in_initrd()) { + log_debug("Running in initrd, keep DHCPv4 addresses on stopping networkd by default."); + m->keep_configuration = KEEP_CONFIGURATION_DHCP_ON_STOP; + return 0; + } + + r = path_is_network_fs("/"); + if (r < 0) + return log_error_errno(r, "Failed to detect if root is network filesystem: %m"); + if (r == 0) { + m->keep_configuration = _KEEP_CONFIGURATION_INVALID; + return 0; + } + + log_debug("Running on network filesystem, enabling KeepConfiguration= by default."); + m->keep_configuration = KEEP_CONFIGURATION_YES; + return 0; +} + /* Prepares a freshly allocated Manager: event loop, signal handlers, memory pressure and post event sources, passed-in file descriptors, rtnl and genl sockets. In test mode setup stops after the netlink sockets; otherwise the bus, udev monitor, resolver, default address pools, keep-configuration default and state file path are set up as well. */ +int manager_setup(Manager *m) { + _cleanup_close_ int rtnl_fd = -EBADF; + int r; + + assert(m); + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_set_watchdog(m->event, true); + (void) sd_event_add_signal(m->event, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, signal_terminate_callback, m); + (void) sd_event_add_signal(m->event, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, signal_terminate_callback, m); + (void) sd_event_add_signal(m->event, NULL, SIGUSR2 | SD_EVENT_SIGNAL_PROCMASK, signal_restart_callback, m); + (void) sd_event_add_signal(m->event, NULL, SIGHUP | SD_EVENT_SIGNAL_PROCMASK, signal_reload_callback, m); + (void) sd_event_add_signal(m->event, NULL, (SIGRTMIN+18) | SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL); + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed to allocate memory pressure event source, ignoring: %m"); + + r = sd_event_add_post(m->event, NULL, manager_dirty_handler, m); + if (r < 0) +
return r; + + r = sd_event_add_post(m->event, NULL, manager_process_requests, m); + if (r < 0) + return r; + + r = manager_listen_fds(m, &rtnl_fd); + if (r < 0) + return r; + + r = manager_connect_rtnl(m, TAKE_FD(rtnl_fd)); + if (r < 0) + return r; + + r = manager_connect_genl(m); + if (r < 0) + return r; + + if (m->test_mode) + return 0; + + r = manager_connect_bus(m); + if (r < 0) + return r; + + r = manager_connect_udev(m); + if (r < 0) + return r; + + r = sd_resolve_default(&m->resolve); + if (r < 0) + return r; + + r = sd_resolve_attach_event(m->resolve, m->event, 0); + if (r < 0) + return r; + + r = address_pool_setup_default(m); + if (r < 0) + return r; + + r = manager_set_keep_configuration(m); + if (r < 0) + return r; + + m->state_file = strdup("/run/systemd/netif/state"); + if (!m->state_file) + return -ENOMEM; + + return 0; +} + /* Allocates a Manager, fills in the default settings listed in the initializer and hands it to the caller through *ret. Returns -ENOMEM when the allocation fails, 0 otherwise. */ +int manager_new(Manager **ret, bool test_mode) { + _cleanup_(manager_freep) Manager *m = NULL; + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .keep_configuration = _KEEP_CONFIGURATION_INVALID, + .ipv6_privacy_extensions = IPV6_PRIVACY_EXTENSIONS_NO, + .test_mode = test_mode, + .speed_meter_interval_usec = SPEED_METER_DEFAULT_TIME_INTERVAL, + .online_state = _LINK_ONLINE_STATE_INVALID, + .manage_foreign_routes = true, + .manage_foreign_rules = true, + .ethtool_fd = -EBADF, + .dhcp_duid.type = DUID_TYPE_EN, + .dhcp6_duid.type = DUID_TYPE_EN, + .duid_product_uuid.type = DUID_TYPE_UUID, + }; + + *ret = TAKE_PTR(m); + return 0; +} + /* Releases the Manager together with the objects it holds. A NULL argument is accepted and yields NULL. */ +Manager* manager_free(Manager *m) { + Link *link; + + if (!m) + return NULL; + + free(m->state_file); + + HASHMAP_FOREACH(link, m->links_by_index) + (void) link_stop_engines(link, true); + + m->request_queue = ordered_set_free(m->request_queue); + + m->dirty_links = set_free_with_destructor(m->dirty_links, link_unref); + m->new_wlan_ifindices = set_free(m->new_wlan_ifindices); + m->links_by_name = hashmap_free(m->links_by_name); + m->links_by_hw_addr =
hashmap_free(m->links_by_hw_addr); + m->links_by_dhcp_pd_subnet_prefix = hashmap_free(m->links_by_dhcp_pd_subnet_prefix); + m->links_by_index = hashmap_free_with_destructor(m->links_by_index, link_unref); + + m->dhcp_pd_subnet_ids = set_free(m->dhcp_pd_subnet_ids); + m->networks = ordered_hashmap_free_with_destructor(m->networks, network_unref); + + m->netdevs = hashmap_free_with_destructor(m->netdevs, netdev_unref); + + m->tuntap_fds_by_name = hashmap_free(m->tuntap_fds_by_name); + + m->wiphy_by_name = hashmap_free(m->wiphy_by_name); + m->wiphy_by_index = hashmap_free_with_destructor(m->wiphy_by_index, wiphy_free); + + ordered_set_free_free(m->address_pools); + + hashmap_free(m->route_table_names_by_number); + hashmap_free(m->route_table_numbers_by_name); + + set_free(m->rules); + /* Release the netlink and resolver connections first; the comment after these calls explains why the route and nexthop sets are only freed afterwards. */ + sd_netlink_unref(m->rtnl); + sd_netlink_unref(m->genl); + sd_resolve_unref(m->resolve); + + /* reject (e.g. unreachable) type routes are managed by Manager, but may be referenced by a + * link. E.g., DHCP6 with prefix delegation creates unreachable routes, and they are referenced + * by the upstream link. And the links may be referenced by netlink slots. Hence, two + * set_free() must be called after the above sd_netlink_unref().
*/ + m->routes = set_free(m->routes); + + m->nexthops = set_free(m->nexthops); + m->nexthops_by_id = hashmap_free(m->nexthops_by_id); + + sd_event_source_unref(m->speed_meter_event_source); + sd_event_unref(m->event); + + sd_device_monitor_unref(m->device_monitor); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + sd_bus_flush_close_unref(m->bus); + + free(m->dynamic_timezone); + free(m->dynamic_hostname); + + safe_close(m->ethtool_fd); + + m->fw_ctx = fw_ctx_free(m->fw_ctx); + + return mfree(m); +} + /* Starts the speed meter and writes the initial manager and per-link state files. Only a speed meter failure is returned; failures to save state are logged and ignored. */ +int manager_start(Manager *m) { + Link *link; + int r; + + assert(m); + + r = manager_start_speed_meter(m); + if (r < 0) + return log_error_errno(r, "Failed to initialize speed meter: %m"); + + /* The dirty handler will deal with future serialization, but the first one + must be done explicitly. */ + + r = manager_save(m); + if (r < 0) + log_warning_errno(r, "Failed to update state file %s, ignoring: %m", m->state_file); + + HASHMAP_FOREACH(link, m->links_by_index) { + r = link_save_and_clean(link); + if (r < 0) + log_link_warning_errno(link, r, "Failed to update link state file %s, ignoring: %m", link->state_file); + } + + return 0; +} + /* Loads the netdev configuration, calls manager_clear_unmanaged_tuntap_fds(), loads the network configuration into m->networks and then rebuilds the DHCP-PD subnet IDs. */ +int manager_load_config(Manager *m) { + int r; + + r = netdev_load(m, false); + if (r < 0) + return r; + + manager_clear_unmanaged_tuntap_fds(m); + + r = network_load(m, &m->networks); + if (r < 0) + return r; + + return manager_build_dhcp_pd_subnet_ids(m); +} + /* Marks req as a dump request, sends it on nl and passes every message of the reply to process(), with m->enumerating set for the duration of the loop. Results of process() are combined with RET_GATHER instead of aborting at the first failure. */ +int manager_enumerate_internal( + Manager *m, + sd_netlink *nl, + sd_netlink_message *req, + int (*process)(sd_netlink *, sd_netlink_message *, Manager *)) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *reply = NULL; + int r; + + assert(m); + assert(nl); + assert(req); + assert(process); + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(nl, req, 0, &reply); + if (r < 0) + return r; + + m->enumerating = true; + for (sd_netlink_message *reply_one = reply; reply_one; reply_one =
sd_netlink_message_next(reply_one)) + RET_GATHER(r, process(nl, reply_one, m)); + m->enumerating = false; + + return r; +} + +static int manager_enumerate_links(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + r = sd_rtnl_message_new_link(m->rtnl, &req, RTM_GETLINK, 0); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_link); +} + +static int manager_enumerate_qdisc(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + r = sd_rtnl_message_new_traffic_control(m->rtnl, &req, RTM_GETQDISC, 0, 0, 0); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_qdisc); +} + +static int manager_enumerate_tclass(Manager *m) { + Link *link; + int r = 0; + + assert(m); + assert(m->rtnl); + + /* TC class can be enumerated only per link. See tc_dump_tclass() in net/sched/sched_api.c. 
*/ + + HASHMAP_FOREACH(link, m->links_by_index) + RET_GATHER(r, link_enumerate_tclass(link, 0)); + + return r; +} + +static int manager_enumerate_addresses(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + r = sd_rtnl_message_new_addr(m->rtnl, &req, RTM_GETADDR, 0, 0); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_address); +} + +static int manager_enumerate_neighbors(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + r = sd_rtnl_message_new_neigh(m->rtnl, &req, RTM_GETNEIGH, 0, AF_UNSPEC); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_neighbor); +} + +static int manager_enumerate_routes(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + if (!m->manage_foreign_routes) + return 0; + + r = sd_rtnl_message_new_route(m->rtnl, &req, RTM_GETROUTE, 0, 0); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_route); +} + +static int manager_enumerate_rules(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + if (!m->manage_foreign_rules) + return 0; + + r = sd_rtnl_message_new_routing_policy_rule(m->rtnl, &req, RTM_GETRULE, 0); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_rule); +} + +static int manager_enumerate_nexthop(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->rtnl); + + r = sd_rtnl_message_new_nexthop(m->rtnl, &req, RTM_GETNEXTHOP, 0, 0); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->rtnl, req, manager_rtnl_process_nexthop); +} + +static int 
manager_enumerate_nl80211_wiphy(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->genl); + + r = sd_genl_message_new(m->genl, NL80211_GENL_NAME, NL80211_CMD_GET_WIPHY, &req); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->genl, req, manager_genl_process_nl80211_wiphy); +} + +static int manager_enumerate_nl80211_config(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(m); + assert(m->genl); + + r = sd_genl_message_new(m->genl, NL80211_GENL_NAME, NL80211_CMD_GET_INTERFACE, &req); + if (r < 0) + return r; + + return manager_enumerate_internal(m, m->genl, req, manager_genl_process_nl80211_config); +} + +static int manager_enumerate_nl80211_mlme(Manager *m) { + Link *link; + int r; + + assert(m); + assert(m->genl); + + HASHMAP_FOREACH(link, m->links_by_index) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + + if (link->wlan_iftype != NL80211_IFTYPE_STATION) + continue; + + r = sd_genl_message_new(m->genl, NL80211_GENL_NAME, NL80211_CMD_GET_STATION, &req); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(req, NL80211_ATTR_IFINDEX, link->ifindex); + if (r < 0) + return r; + + r = manager_enumerate_internal(m, m->genl, req, manager_genl_process_nl80211_mlme); + if (r < 0) + return r; + } + + return 0; +} + +int manager_enumerate(Manager *m) { + int r; + + r = manager_enumerate_links(m); + if (r < 0) + return log_error_errno(r, "Could not enumerate links: %m"); + + r = manager_enumerate_qdisc(m); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "Could not enumerate QDiscs, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate QDisc: %m"); + + r = manager_enumerate_tclass(m); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "Could not enumerate TClasses, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate TClass: %m"); + + r = 
manager_enumerate_addresses(m); + if (r < 0) + return log_error_errno(r, "Could not enumerate addresses: %m"); + + r = manager_enumerate_neighbors(m); + if (r < 0) + return log_error_errno(r, "Could not enumerate neighbors: %m"); + + /* NextHop support is added in kernel v5.3 (65ee00a9409f751188a8cdc0988167858eb4a536), + * and older kernels return -EOPNOTSUPP, or -EINVAL if SELinux is enabled. */ + r = manager_enumerate_nexthop(m); + if (r == -EOPNOTSUPP || (r == -EINVAL && mac_selinux_enforcing())) + log_debug_errno(r, "Could not enumerate nexthops, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate nexthops: %m"); + + r = manager_enumerate_routes(m); + if (r < 0) + return log_error_errno(r, "Could not enumerate routes: %m"); + + /* If kernel is built with CONFIG_FIB_RULES=n, it returns -EOPNOTSUPP. */ + r = manager_enumerate_rules(m); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "Could not enumerate routing policy rules, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate routing policy rules: %m"); + + r = manager_enumerate_nl80211_wiphy(m); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "Could not enumerate wireless LAN phy, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate wireless LAN phy: %m"); + + r = manager_enumerate_nl80211_config(m); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "Could not enumerate wireless LAN interfaces, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate wireless LAN interfaces: %m"); + + r = manager_enumerate_nl80211_mlme(m); + if (r == -EOPNOTSUPP) + log_debug_errno(r, "Could not enumerate wireless LAN stations, ignoring: %m"); + else if (r < 0) + return log_error_errno(r, "Could not enumerate wireless LAN stations: %m"); + + return 0; +} + +static int set_hostname_handler(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + const sd_bus_error *e; + int r; + + assert(m); + + e = 
sd_bus_message_get_error(m); + if (e) { + r = sd_bus_error_get_errno(e); + log_warning_errno(r, "Could not set hostname: %s", bus_error_message(e, r)); + } + + return 1; +} + +int manager_set_hostname(Manager *m, const char *hostname) { + int r; + + log_debug("Setting transient hostname: '%s'", strna(hostname)); + + r = free_and_strdup_warn(&m->dynamic_hostname, hostname); + if (r < 0) + return r; + + if (sd_bus_is_ready(m->bus) <= 0) { + log_debug("Not connected to system bus, setting system hostname later."); + return 0; + } + + r = bus_call_method_async( + m->bus, + NULL, + bus_hostname, + "SetHostname", + set_hostname_handler, + m, + "sb", + hostname, + false); + if (r < 0) + return log_error_errno(r, "Could not set transient hostname: %m"); + + return 0; +} + +static int set_timezone_handler(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + const sd_bus_error *e; + int r; + + assert(m); + + e = sd_bus_message_get_error(m); + if (e) { + r = sd_bus_error_get_errno(e); + log_warning_errno(r, "Could not set timezone: %s", bus_error_message(e, r)); + } + + return 1; +} + +int manager_set_timezone(Manager *m, const char *tz) { + int r; + + assert(m); + assert(tz); + + log_debug("Setting system timezone: '%s'", tz); + r = free_and_strdup_warn(&m->dynamic_timezone, tz); + if (r < 0) + return r; + + if (sd_bus_is_ready(m->bus) <= 0) { + log_debug("Not connected to system bus, setting system timezone later."); + return 0; + } + + r = bus_call_method_async( + m->bus, + NULL, + bus_timedate, + "SetTimezone", + set_timezone_handler, + m, + "sb", + tz, + false); + if (r < 0) + return log_error_errno(r, "Could not set timezone: %m"); + + return 0; +} + +int manager_reload(Manager *m) { + Link *link; + int r; + + assert(m); + + (void) sd_notifyf(/* unset= */ false, + "RELOADING=1\n" + "STATUS=Reloading configuration...\n" + "MONOTONIC_USEC=" USEC_FMT, now(CLOCK_MONOTONIC)); + + r = netdev_load(m, /* reload= */ true); + if (r < 0) + goto finish; + + r = 
network_reload(m); + if (r < 0) + goto finish; + + HASHMAP_FOREACH(link, m->links_by_index) { + r = link_reconfigure(link, /* force = */ false); + if (r < 0) + goto finish; + } + + r = 0; +finish: + (void) sd_notify(/* unset= */ false, NOTIFY_READY); + return r; +} diff --git a/src/network/networkd-manager.h b/src/network/networkd-manager.h new file mode 100644 index 0000000..fbef528 --- /dev/null +++ b/src/network/networkd-manager.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-event.h" +#include "sd-id128.h" +#include "sd-netlink.h" +#include "sd-resolve.h" + +#include "dhcp-identifier.h" +#include "firewall-util.h" +#include "hashmap.h" +#include "networkd-link.h" +#include "networkd-network.h" +#include "networkd-sysctl.h" +#include "ordered-set.h" +#include "set.h" +#include "time-util.h" + +struct Manager { + sd_netlink *rtnl; + /* lazy initialized */ + sd_netlink *genl; + sd_event *event; + sd_resolve *resolve; + sd_bus *bus; + sd_device_monitor *device_monitor; + Hashmap *polkit_registry; + int ethtool_fd; + + KeepConfiguration keep_configuration; + IPv6PrivacyExtensions ipv6_privacy_extensions; + + bool test_mode; + bool enumerating; + bool dirty; + bool restarting; + bool manage_foreign_routes; + bool manage_foreign_rules; + + Set *dirty_links; + Set *new_wlan_ifindices; + + char *state_file; + LinkOperationalState operational_state; + LinkCarrierState carrier_state; + LinkAddressState address_state; + LinkAddressState ipv4_address_state; + LinkAddressState ipv6_address_state; + LinkOnlineState online_state; + + Hashmap *links_by_index; + Hashmap *links_by_name; + Hashmap *links_by_hw_addr; + Hashmap *links_by_dhcp_pd_subnet_prefix; + Hashmap *netdevs; + OrderedHashmap *networks; + OrderedSet *address_pools; + Set *dhcp_pd_subnet_ids; + + DUID dhcp_duid; + DUID dhcp6_duid; + DUID duid_product_uuid; + bool has_product_uuid; + bool product_uuid_requested; + + 
char* dynamic_hostname; + char* dynamic_timezone; + + Set *rules; + + /* Manage nexthops by id. */ + Hashmap *nexthops_by_id; + + /* Manager stores nexthops without RTA_OIF attribute. */ + Set *nexthops; + + /* Manager stores routes without RTA_OIF attribute. */ + unsigned route_remove_messages; + Set *routes; + + /* Route table name */ + Hashmap *route_table_numbers_by_name; + Hashmap *route_table_names_by_number; + + /* Wiphy */ + Hashmap *wiphy_by_index; + Hashmap *wiphy_by_name; + + /* For link speed meter */ + bool use_speed_meter; + sd_event_source *speed_meter_event_source; + usec_t speed_meter_interval_usec; + usec_t speed_meter_usec_new; + usec_t speed_meter_usec_old; + + bool bridge_mdb_on_master_not_supported; + + FirewallContext *fw_ctx; + + OrderedSet *request_queue; + + Hashmap *tuntap_fds_by_name; +}; + +int manager_new(Manager **ret, bool test_mode); +Manager* manager_free(Manager *m); + +int manager_setup(Manager *m); +int manager_start(Manager *m); + +int manager_load_config(Manager *m); + +int manager_enumerate_internal( + Manager *m, + sd_netlink *nl, + sd_netlink_message *req, + int (*process)(sd_netlink *, sd_netlink_message *, Manager *)); +int manager_enumerate(Manager *m); + +int manager_set_hostname(Manager *m, const char *hostname); +int manager_set_timezone(Manager *m, const char *timezone); + +int manager_reload(Manager *m); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); diff --git a/src/network/networkd-ndisc.c b/src/network/networkd-ndisc.c new file mode 100644 index 0000000..840ccb1 --- /dev/null +++ b/src/network/networkd-ndisc.c @@ -0,0 +1,1531 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Intel Corporation. All rights reserved. 
+***/ + +#include +#include +#include +#include + +#include "sd-ndisc.h" + +#include "event-util.h" +#include "missing_network.h" +#include "networkd-address-generation.h" +#include "networkd-address.h" +#include "networkd-dhcp6.h" +#include "networkd-manager.h" +#include "networkd-ndisc.h" +#include "networkd-queue.h" +#include "networkd-route.h" +#include "networkd-state-file.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "sysctl-util.h" + +#define NDISC_DNSSL_MAX 64U +#define NDISC_RDNSS_MAX 64U +/* Not defined in the RFC, but let's set an upper limit so that we do not consume too much memory. + * This should be safe as typically there should be at most 1 portal per network. */ +#define NDISC_CAPTIVE_PORTAL_MAX 64U +/* Not defined in the RFC either. Just for safety. Otherwise, malformed messages can make clients trigger OOM. + * Not sure if the threshold is high enough. Let's adjust later if not. */ +#define NDISC_PREF64_MAX 64U + +bool link_ipv6_accept_ra_enabled(Link *link) { + assert(link); + + if (!socket_ipv6_is_supported()) + return false; + + if (link->flags & IFF_LOOPBACK) + return false; + + if (link->iftype == ARPHRD_CAN) + return false; + + if (!link->network) + return false; + + if (!link_may_have_ipv6ll(link, /* check_multicast = */ true)) + return false; + + assert(link->network->ipv6_accept_ra >= 0); + return link->network->ipv6_accept_ra; +} + +void network_adjust_ipv6_accept_ra(Network *network) { + assert(network); + + if (!FLAGS_SET(network->link_local, ADDRESS_FAMILY_IPV6)) { + if (network->ipv6_accept_ra > 0) + log_warning("%s: IPv6AcceptRA= is enabled but IPv6 link-local addressing is disabled or not supported. 
" + "Disabling IPv6AcceptRA=.", network->filename); + network->ipv6_accept_ra = false; + } + + if (network->ipv6_accept_ra < 0) + /* default to accept RA if ip_forward is disabled and ignore RA if ip_forward is enabled */ + network->ipv6_accept_ra = !FLAGS_SET(network->ip_forward, ADDRESS_FAMILY_IPV6); + + /* When RouterAllowList=, PrefixAllowList= or RouteAllowList= are specified, then + * RouterDenyList=, PrefixDenyList= or RouteDenyList= are ignored, respectively. */ + if (!set_isempty(network->ndisc_allow_listed_router)) + network->ndisc_deny_listed_router = set_free_free(network->ndisc_deny_listed_router); + if (!set_isempty(network->ndisc_allow_listed_prefix)) + network->ndisc_deny_listed_prefix = set_free_free(network->ndisc_deny_listed_prefix); + if (!set_isempty(network->ndisc_allow_listed_route_prefix)) + network->ndisc_deny_listed_route_prefix = set_free_free(network->ndisc_deny_listed_route_prefix); +} + +static int ndisc_check_ready(Link *link); + +static int ndisc_address_ready_callback(Address *address) { + Address *a; + + assert(address); + assert(address->link); + + SET_FOREACH(a, address->link->addresses) + if (a->source == NETWORK_CONFIG_SOURCE_NDISC) + a->callback = NULL; + + return ndisc_check_ready(address->link); +} + +static int ndisc_check_ready(Link *link) { + bool found = false, ready = false; + Address *address; + + assert(link); + + if (link->ndisc_messages > 0) { + log_link_debug(link, "%s(): SLAAC addresses and routes are not set.", __func__); + return 0; + } + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_NDISC) + continue; + + found = true; + + if (address_is_ready(address)) { + ready = true; + break; + } + } + + if (found && !ready) { + SET_FOREACH(address, link->addresses) + if (address->source == NETWORK_CONFIG_SOURCE_NDISC) + address->callback = ndisc_address_ready_callback; + + log_link_debug(link, "%s(): no SLAAC address is ready.", __func__); + return 0; + } + + 
link->ndisc_configured = true; + log_link_debug(link, "SLAAC addresses and routes set."); + + link_check_ready(link); + return 0; +} + +static int ndisc_route_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Route *route) { + int r; + + assert(link); + + r = route_configure_handler_internal(rtnl, m, link, "Could not set NDisc route"); + if (r <= 0) + return r; + + r = ndisc_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static void ndisc_set_route_priority(Link *link, Route *route) { + assert(link); + assert(route); + + if (route->priority_set) + return; /* explicitly configured. */ + + switch (route->pref) { + case SD_NDISC_PREFERENCE_LOW: + route->priority = link->network->ipv6_accept_ra_route_metric_low; + break; + case SD_NDISC_PREFERENCE_MEDIUM: + route->priority = link->network->ipv6_accept_ra_route_metric_medium; + break; + case SD_NDISC_PREFERENCE_HIGH: + route->priority = link->network->ipv6_accept_ra_route_metric_high; + break; + default: + assert_not_reached(); + } +} + +static int ndisc_request_route(Route *in, Link *link, sd_ndisc_router *rt) { + _cleanup_(route_freep) Route *route = in; + struct in6_addr router; + uint8_t hop_limit = 0; + uint32_t mtu = 0; + bool is_new; + int r; + + assert(route); + assert(link); + assert(link->network); + assert(rt); + + r = sd_ndisc_router_get_address(rt, &router); + if (r < 0) + return r; + + if (link->network->ipv6_accept_ra_use_mtu) { + r = sd_ndisc_router_get_mtu(rt, &mtu); + if (r < 0 && r != -ENODATA) + return log_link_warning_errno(link, r, "Failed to get default router MTU from RA: %m"); + } + + if (link->network->ipv6_accept_ra_use_hop_limit) { + r = sd_ndisc_router_get_hop_limit(rt, &hop_limit); + if (r < 0 && r != -ENODATA) + return log_link_warning_errno(link, r, "Failed to get default router hop limit from RA: %m"); + } + + route->source = NETWORK_CONFIG_SOURCE_NDISC; + route->provider.in6 = router; + if (!route->table_set) + route->table = 
link_get_ipv6_accept_ra_route_table(link); + ndisc_set_route_priority(link, route); + if (!route->protocol_set) + route->protocol = RTPROT_RA; + if (route->quickack < 0) + route->quickack = link->network->ipv6_accept_ra_quickack; + if (route->mtu == 0) + route->mtu = mtu; + if (route->hop_limit == 0) + route->hop_limit = hop_limit; + + is_new = route_get(NULL, link, route, NULL) < 0; + + r = link_request_route(link, TAKE_PTR(route), true, &link->ndisc_messages, + ndisc_route_handler, NULL); + if (r < 0) + return r; + if (r > 0 && is_new) + link->ndisc_configured = false; + + return 0; +} + +static int ndisc_address_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Address *address) { + int r; + + assert(link); + + r = address_configure_handler_internal(rtnl, m, link, "Could not set NDisc address"); + if (r <= 0) + return r; + + r = ndisc_check_ready(link); + if (r < 0) + link_enter_failed(link); + + return 1; +} + +static int ndisc_request_address(Address *in, Link *link, sd_ndisc_router *rt) { + _cleanup_(address_freep) Address *address = in; + struct in6_addr router; + bool is_new; + int r; + + assert(address); + assert(link); + assert(rt); + + r = sd_ndisc_router_get_address(rt, &router); + if (r < 0) + return r; + + address->source = NETWORK_CONFIG_SOURCE_NDISC; + address->provider.in6 = router; + + r = free_and_strdup_warn(&address->netlabel, link->network->ndisc_netlabel); + if (r < 0) + return r; + + is_new = address_get(link, address, NULL) < 0; + + r = link_request_address(link, address, &link->ndisc_messages, + ndisc_address_handler, NULL); + if (r < 0) + return r; + if (r > 0 && is_new) + link->ndisc_configured = false; + + return 0; +} + +static int ndisc_router_process_default(Link *link, sd_ndisc_router *rt) { + usec_t lifetime_usec; + struct in6_addr gateway; + unsigned preference; + int r; + + assert(link); + assert(link->network); + assert(rt); + + if (!link->network->ipv6_accept_ra_use_gateway && + 
hashmap_isempty(link->network->routes_by_section)) + return 0; + + r = sd_ndisc_router_get_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get gateway lifetime from RA: %m"); + + r = sd_ndisc_router_get_address(rt, &gateway); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get gateway address from RA: %m"); + + if (link_get_ipv6_address(link, &gateway, 0, NULL) >= 0) { + if (DEBUG_LOGGING) + log_link_debug(link, "No NDisc route added, gateway %s matches local address", + IN6_ADDR_TO_STRING(&gateway)); + return 0; + } + + r = sd_ndisc_router_get_preference(rt, &preference); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get default router preference from RA: %m"); + + if (link->network->ipv6_accept_ra_use_gateway) { + _cleanup_(route_freep) Route *route = NULL; + + r = route_new(&route); + if (r < 0) + return log_oom(); + + route->family = AF_INET6; + route->pref = preference; + route->gw_family = AF_INET6; + route->gw.in6 = gateway; + route->lifetime_usec = lifetime_usec; + + r = ndisc_request_route(TAKE_PTR(route), link, rt); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request default route: %m"); + } + + Route *route_gw; + HASHMAP_FOREACH(route_gw, link->network->routes_by_section) { + _cleanup_(route_freep) Route *route = NULL; + + if (!route_gw->gateway_from_dhcp_or_ra) + continue; + + if (route_gw->gw_family != AF_INET6) + continue; + + r = route_dup(route_gw, &route); + if (r < 0) + return r; + + route->gw.in6 = gateway; + if (!route->pref_set) + route->pref = preference; + route->lifetime_usec = lifetime_usec; + + r = ndisc_request_route(TAKE_PTR(route), link, rt); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request gateway: %m"); + } + + return 0; +} +/* Writes the ICMPv6 rate limit read from the RA, rounded up to milliseconds, to the IPv6 "icmp/ratelimit" sysctl. Does nothing if the feature is disabled for the network, or if the value is unset or does not fit into an int. All failures are logged and ignored, hence this always returns 0. */ +static int ndisc_router_process_icmp6_ratelimit(Link *link, sd_ndisc_router *rt) { + usec_t icmp6_ratelimit, msec; + int r; + + assert(link); + 
assert(link->network); + assert(rt); + + if (!link->network->ipv6_accept_ra_use_icmp6_ratelimit) + return 0; + + r = sd_ndisc_router_get_icmp6_ratelimit(rt, &icmp6_ratelimit); + if (r < 0) { + log_link_debug_errno(link, r, "Failed to get ICMP6 ratelimit from RA, ignoring: %m"); + return 0; + } + + /* We do not allow 0 here. */ + if (!timestamp_is_set(icmp6_ratelimit)) + return 0; + + msec = DIV_ROUND_UP(icmp6_ratelimit, USEC_PER_MSEC); + if (msec <= 0 || msec > INT_MAX) + return 0; + + /* Limit the maximal rates for sending ICMPv6 packets. 0 to disable any limiting, otherwise the + * minimal space between responses in milliseconds. Default: 1000. */ + r = sysctl_write_ip_property_int(AF_INET6, NULL, "icmp/ratelimit", (int) msec); + if (r < 0) + log_link_warning_errno(link, r, "Failed to apply ICMP6 ratelimit, ignoring: %m"); + + return 0; +} + +static int ndisc_router_process_autonomous_prefix(Link *link, sd_ndisc_router *rt) { + usec_t lifetime_valid_usec, lifetime_preferred_usec; + _cleanup_set_free_ Set *addresses = NULL; + struct in6_addr prefix, *a; + unsigned prefixlen; + int r; + + assert(link); + assert(link->network); + assert(rt); + + if (!link->network->ipv6_accept_ra_use_autonomous_prefix) + return 0; + + r = sd_ndisc_router_prefix_get_address(rt, &prefix); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix address: %m"); + + r = sd_ndisc_router_prefix_get_prefixlen(rt, &prefixlen); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix length: %m"); + + /* ndisc_generate_addresses() below requires the prefix length <= 64. 
*/ + if (prefixlen > 64) { + log_link_debug(link, "Prefix is longer than 64, ignoring autonomous prefix %s.", + IN6_ADDR_PREFIX_TO_STRING(&prefix, prefixlen)); + return 0; + } + + r = sd_ndisc_router_prefix_get_valid_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_valid_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix valid lifetime: %m"); + + r = sd_ndisc_router_prefix_get_preferred_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_preferred_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix preferred lifetime: %m"); + + /* The preferred lifetime is never greater than the valid lifetime */ + if (lifetime_preferred_usec > lifetime_valid_usec) + return 0; + + r = ndisc_generate_addresses(link, &prefix, prefixlen, &addresses); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to generate SLAAC addresses: %m"); + + SET_FOREACH(a, addresses) { + _cleanup_(address_freep) Address *address = NULL; + + r = address_new(&address); + if (r < 0) + return log_oom(); + + address->family = AF_INET6; + address->in_addr.in6 = *a; + address->prefixlen = prefixlen; + address->flags = IFA_F_NOPREFIXROUTE|IFA_F_MANAGETEMPADDR; + address->lifetime_valid_usec = lifetime_valid_usec; + address->lifetime_preferred_usec = lifetime_preferred_usec; + + r = ndisc_request_address(TAKE_PTR(address), link, rt); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request SLAAC address: %m"); + } + + return 0; +} +/* Requests an IPv6 route whose destination is the prefix read from the RA, using the prefix's valid lifetime and the router's preference. Does nothing if on-link prefix handling is disabled for the network. Returns 0 on success or a negative errno-style value on failure. */ +static int ndisc_router_process_onlink_prefix(Link *link, sd_ndisc_router *rt) { + _cleanup_(route_freep) Route *route = NULL; + unsigned prefixlen, preference; + usec_t lifetime_usec; + struct in6_addr prefix; + int r; + + assert(link); + assert(link->network); + assert(rt); + + if (!link->network->ipv6_accept_ra_use_onlink_prefix) + return 0; + + r = sd_ndisc_router_prefix_get_valid_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, 
"Failed to get prefix lifetime: %m"); + + r = sd_ndisc_router_prefix_get_address(rt, &prefix); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix address: %m"); + + r = sd_ndisc_router_prefix_get_prefixlen(rt, &prefixlen); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix length: %m"); + + /* Prefix Information option does not have preference, hence we use the 'main' preference here. Bail out on failure, as 'preference' would otherwise be used uninitialized below. */ + r = sd_ndisc_router_get_preference(rt, &preference); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get default router preference from RA: %m"); + + r = route_new(&route); + if (r < 0) + return log_oom(); + + route->family = AF_INET6; + route->dst.in6 = prefix; + route->dst_prefixlen = prefixlen; + route->pref = preference; + route->lifetime_usec = lifetime_usec; + + r = ndisc_request_route(TAKE_PTR(route), link, rt); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request prefix route: %m"); + + return 0; +} + +static int ndisc_router_process_prefix(Link *link, sd_ndisc_router *rt) { + unsigned prefixlen; + struct in6_addr a; + uint8_t flags; + int r; + + assert(link); + assert(link->network); + assert(rt); + + r = sd_ndisc_router_prefix_get_address(rt, &a); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix address: %m"); + + /* RFC 4861 Section 4.6.2: + * A router SHOULD NOT send a prefix option for the link-local prefix and a host SHOULD ignore such + * a prefix option. 
*/ + if (in6_addr_is_link_local(&a)) { + log_link_debug(link, "Received link-local prefix, ignoring autonomous prefix."); + return 0; + } + + r = sd_ndisc_router_prefix_get_prefixlen(rt, &prefixlen); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get prefix length: %m"); + + if (in6_prefix_is_filtered(&a, prefixlen, link->network->ndisc_allow_listed_prefix, link->network->ndisc_deny_listed_prefix)) { + if (DEBUG_LOGGING) + log_link_debug(link, "Prefix '%s' is %s, ignoring", + !set_isempty(link->network->ndisc_allow_listed_prefix) ? "not in allow list" + : "in deny list", + IN6_ADDR_PREFIX_TO_STRING(&a, prefixlen)); + return 0; + } + + r = sd_ndisc_router_prefix_get_flags(rt, &flags); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get RA prefix flags: %m"); + + if (FLAGS_SET(flags, ND_OPT_PI_FLAG_ONLINK)) { + r = ndisc_router_process_onlink_prefix(link, rt); + if (r < 0) + return r; + } + + if (FLAGS_SET(flags, ND_OPT_PI_FLAG_AUTO)) { + r = ndisc_router_process_autonomous_prefix(link, rt); + if (r < 0) + return r; + } + + return 0; +} + +static int ndisc_router_process_route(Link *link, sd_ndisc_router *rt) { + _cleanup_(route_freep) Route *route = NULL; + unsigned preference, prefixlen; + struct in6_addr gateway, dst; + usec_t lifetime_usec; + int r; + + assert(link); + + if (!link->network->ipv6_accept_ra_use_route_prefix) + return 0; + + r = sd_ndisc_router_route_get_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get route lifetime from RA: %m"); + + r = sd_ndisc_router_route_get_address(rt, &dst); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get route destination address: %m"); + + r = sd_ndisc_router_route_get_prefixlen(rt, &prefixlen); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get route prefix length: %m"); + + if (in6_addr_is_null(&dst) && prefixlen == 0) { + log_link_debug(link, "Route prefix is ::/0, 
ignoring"); + return 0; + } + + if (in6_prefix_is_filtered(&dst, prefixlen, + link->network->ndisc_allow_listed_route_prefix, + link->network->ndisc_deny_listed_route_prefix)) { + + if (DEBUG_LOGGING) + log_link_debug(link, "Route prefix %s is %s, ignoring", + !set_isempty(link->network->ndisc_allow_listed_route_prefix) ? "not in allow list" + : "in deny list", + IN6_ADDR_PREFIX_TO_STRING(&dst, prefixlen)); + return 0; + } + + r = sd_ndisc_router_get_address(rt, &gateway); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get gateway address from RA: %m"); + + if (link_get_ipv6_address(link, &gateway, 0, NULL) >= 0) { + if (DEBUG_LOGGING) + log_link_debug(link, "Advertised route gateway %s is local to the link, ignoring route", + IN6_ADDR_TO_STRING(&gateway)); + return 0; + } + + r = sd_ndisc_router_route_get_preference(rt, &preference); + if (r == -ENOTSUP) { + log_link_debug_errno(link, r, "Received route prefix with unsupported preference, ignoring: %m"); + return 0; + } + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get default router preference from RA: %m"); + + r = route_new(&route); + if (r < 0) + return log_oom(); + + route->family = AF_INET6; + route->pref = preference; + route->gw.in6 = gateway; + route->gw_family = AF_INET6; + route->dst.in6 = dst; + route->dst_prefixlen = prefixlen; + route->lifetime_usec = lifetime_usec; + + r = ndisc_request_route(TAKE_PTR(route), link, rt); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request additional route: %m"); + + return 0; +} + +static void ndisc_rdnss_hash_func(const NDiscRDNSS *x, struct siphash *state) { + siphash24_compress(&x->address, sizeof(x->address), state); +} + +static int ndisc_rdnss_compare_func(const NDiscRDNSS *a, const NDiscRDNSS *b) { + return memcmp(&a->address, &b->address, sizeof(a->address)); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + ndisc_rdnss_hash_ops, + NDiscRDNSS, + ndisc_rdnss_hash_func, + ndisc_rdnss_compare_func, + 
free); + +static int ndisc_router_process_rdnss(Link *link, sd_ndisc_router *rt) { + usec_t lifetime_usec; + const struct in6_addr *a; + struct in6_addr router; + bool updated = false, logged_about_too_many = false; + int n, r; + + assert(link); + assert(link->network); + assert(rt); + + if (!link->network->ipv6_accept_ra_use_dns) + return 0; + + r = sd_ndisc_router_get_address(rt, &router); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get router address from RA: %m"); + + r = sd_ndisc_router_rdnss_get_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get RDNSS lifetime: %m"); + + n = sd_ndisc_router_rdnss_get_addresses(rt, &a); + if (n < 0) + return log_link_warning_errno(link, n, "Failed to get RDNSS addresses: %m"); + + for (int j = 0; j < n; j++) { + _cleanup_free_ NDiscRDNSS *x = NULL; + NDiscRDNSS *rdnss, d = { + .address = a[j], + }; + + if (lifetime_usec == 0) { + /* The entry is outdated. */ + free(set_remove(link->ndisc_rdnss, &d)); + updated = true; + continue; + } + + rdnss = set_get(link->ndisc_rdnss, &d); + if (rdnss) { + rdnss->router = router; + rdnss->lifetime_usec = lifetime_usec; + continue; + } + + if (set_size(link->ndisc_rdnss) >= NDISC_RDNSS_MAX) { + if (!logged_about_too_many) + log_link_warning(link, "Too many RDNSS records per link. 
Only first %u records will be used.", NDISC_RDNSS_MAX); + logged_about_too_many = true; + continue; + } + + x = new(NDiscRDNSS, 1); + if (!x) + return log_oom(); + + *x = (NDiscRDNSS) { + .address = a[j], + .router = router, + .lifetime_usec = lifetime_usec, + }; + + r = set_ensure_consume(&link->ndisc_rdnss, &ndisc_rdnss_hash_ops, TAKE_PTR(x)); + if (r < 0) + return log_oom(); + assert(r > 0); + + updated = true; + } + + if (updated) + link_dirty(link); + + return 0; +} + +/* DNSSL entries are hashed by their domain name, which is stored right behind the struct + * (see NDISC_DNSSL_DOMAIN()). */ +static void ndisc_dnssl_hash_func(const NDiscDNSSL *x, struct siphash *state) { + siphash24_compress_string(NDISC_DNSSL_DOMAIN(x), state); +} + +/* DNSSL entries are compared by domain name only, matching the hash function. */ +static int ndisc_dnssl_compare_func(const NDiscDNSSL *a, const NDiscDNSSL *b) { + return strcmp(NDISC_DNSSL_DOMAIN(a), NDISC_DNSSL_DOMAIN(b)); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + ndisc_dnssl_hash_ops, + NDiscDNSSL, + ndisc_dnssl_hash_func, + ndisc_dnssl_compare_func, + free); + +/* Processes one DNSSL option of a Router Advertisement. Each advertised search domain is dropped + * when its lifetime is zero, refreshed when it is already stored, and added otherwise as long as + * fewer than NDISC_DNSSL_MAX entries exist. Returns 0 on success (also when + * ipv6_accept_ra_use_domains is DHCP_USE_DOMAINS_NO), or the negative value produced by the + * failing call. */ +static int ndisc_router_process_dnssl(Link *link, sd_ndisc_router *rt) { + _cleanup_strv_free_ char **l = NULL; + usec_t lifetime_usec; + struct in6_addr router; + bool updated = false, logged_about_too_many = false; + int r; + + assert(link); + assert(link->network); + assert(rt); + + if (link->network->ipv6_accept_ra_use_domains == DHCP_USE_DOMAINS_NO) + return 0; + + r = sd_ndisc_router_get_address(rt, &router); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get router address from RA: %m"); + + r = sd_ndisc_router_dnssl_get_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get DNSSL lifetime: %m"); + + r = sd_ndisc_router_dnssl_get_domains(rt, &l); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get DNSSL domains: %m"); + + STRV_FOREACH(j, l) { + _cleanup_free_ NDiscDNSSL *s = NULL; + NDiscDNSSL *dnssl; + + /* Allocate the struct and the NUL-terminated domain name in a single chunk. */ + s = malloc0(ALIGN(sizeof(NDiscDNSSL)) + strlen(*j) + 1); + if (!s) + return log_oom(); + + strcpy(NDISC_DNSSL_DOMAIN(s), *j); + + if (lifetime_usec 
== 0) { + /* The entry is outdated. */ + free(set_remove(link->ndisc_dnssl, s)); + updated = true; + continue; + } + + dnssl = set_get(link->ndisc_dnssl, s); + if (dnssl) { + dnssl->router = router; + dnssl->lifetime_usec = lifetime_usec; + continue; + } + + if (set_size(link->ndisc_dnssl) >= NDISC_DNSSL_MAX) { + if (!logged_about_too_many) + log_link_warning(link, "Too many DNSSL records per link. Only first %u records will be used.", NDISC_DNSSL_MAX); + logged_about_too_many = true; + continue; + } + + s->router = router; + s->lifetime_usec = lifetime_usec; + + r = set_ensure_consume(&link->ndisc_dnssl, &ndisc_dnssl_hash_ops, TAKE_PTR(s)); + if (r < 0) + return log_oom(); + assert(r > 0); + + updated = true; + } + + if (updated) + link_dirty(link); + + return 0; +} + +static NDiscCaptivePortal* ndisc_captive_portal_free(NDiscCaptivePortal *x) { + if (!x) + return NULL; + + free(x->captive_portal); + return mfree(x); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(NDiscCaptivePortal*, ndisc_captive_portal_free); + +static void ndisc_captive_portal_hash_func(const NDiscCaptivePortal *x, struct siphash *state) { + assert(x); + siphash24_compress_string(x->captive_portal, state); +} + +static int ndisc_captive_portal_compare_func(const NDiscCaptivePortal *a, const NDiscCaptivePortal *b) { + assert(a); + assert(b); + return strcmp_ptr(a->captive_portal, b->captive_portal); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + ndisc_captive_portal_hash_ops, + NDiscCaptivePortal, + ndisc_captive_portal_hash_func, + ndisc_captive_portal_compare_func, + ndisc_captive_portal_free); + +static int ndisc_router_process_captive_portal(Link *link, sd_ndisc_router *rt) { + _cleanup_(ndisc_captive_portal_freep) NDiscCaptivePortal *new_entry = NULL; + _cleanup_free_ char *captive_portal = NULL; + usec_t lifetime_usec; + NDiscCaptivePortal *exist; + struct in6_addr router; + const char *uri; + size_t len; + int r; + + assert(link); + assert(link->network); + assert(rt); + + if 
(!link->network->ipv6_accept_ra_use_captive_portal) + return 0; + + r = sd_ndisc_router_get_address(rt, &router); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get router address from RA: %m"); + + /* RFC 4861 section 4.2. states that the lifetime in the message header should be used only for the + * default gateway, but the captive portal option does not have a lifetime field, hence, we use the + * main lifetime for the portal. */ + r = sd_ndisc_router_get_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get lifetime of RA message: %m"); + + r = sd_ndisc_router_captive_portal_get_uri(rt, &uri, &len); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get captive portal from RA: %m"); + + if (len == 0) + return log_link_warning_errno(link, SYNTHETIC_ERRNO(EBADMSG), "Received empty captive portal, ignoring."); + + r = make_cstring(uri, len, MAKE_CSTRING_REFUSE_TRAILING_NUL, &captive_portal); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to convert captive portal URI: %m"); + + if (!in_charset(captive_portal, URI_VALID)) + return log_link_warning_errno(link, SYNTHETIC_ERRNO(EBADMSG), "Received invalid captive portal, ignoring."); + + if (lifetime_usec == 0) { + /* Drop the portal with zero lifetime. */ + ndisc_captive_portal_free(set_remove(link->ndisc_captive_portals, + &(NDiscCaptivePortal) { + .captive_portal = captive_portal, + })); + return 0; + } + + exist = set_get(link->ndisc_captive_portals, + &(NDiscCaptivePortal) { + .captive_portal = captive_portal, + }); + if (exist) { + /* update existing entry */ + exist->router = router; + exist->lifetime_usec = lifetime_usec; + return 1; + } + + if (set_size(link->ndisc_captive_portals) >= NDISC_CAPTIVE_PORTAL_MAX) { + NDiscCaptivePortal *c, *target = NULL; + + /* Find the portal who has the minimal lifetime and drop it to store new one. 
*/ + SET_FOREACH(c, link->ndisc_captive_portals) + if (!target || c->lifetime_usec < target->lifetime_usec) + target = c; + + assert(target); + /* The removal is a side effect that must always be executed, hence assert_se() and not assert(). */ + assert_se(set_remove(link->ndisc_captive_portals, target) == target); + ndisc_captive_portal_free(target); + } + + new_entry = new(NDiscCaptivePortal, 1); + if (!new_entry) + return log_oom(); + + *new_entry = (NDiscCaptivePortal) { + .router = router, + .lifetime_usec = lifetime_usec, + .captive_portal = TAKE_PTR(captive_portal), + }; + + r = set_ensure_put(&link->ndisc_captive_portals, &ndisc_captive_portal_hash_ops, new_entry); + if (r < 0) + return log_oom(); + assert(r > 0); + /* The set owns the entry now; disarm the cleanup handler. */ + TAKE_PTR(new_entry); + + link_dirty(link); + return 1; +} + +/* PREF64 entries are hashed by prefix length and prefix. */ +static void ndisc_pref64_hash_func(const NDiscPREF64 *x, struct siphash *state) { + assert(x); + + siphash24_compress(&x->prefix_len, sizeof(x->prefix_len), state); + siphash24_compress(&x->prefix, sizeof(x->prefix), state); +} + +/* PREF64 entries are ordered by prefix length first, then by the prefix bytes. */ +static int ndisc_pref64_compare_func(const NDiscPREF64 *a, const NDiscPREF64 *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->prefix_len, b->prefix_len); + if (r != 0) + return r; + + return memcmp(&a->prefix, &b->prefix, sizeof(a->prefix)); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + ndisc_pref64_hash_ops, + NDiscPREF64, + ndisc_pref64_hash_func, + ndisc_pref64_compare_func, + mfree); + +/* Processes one PREF64 option of a Router Advertisement: reads the prefix, its length and its + * lifetime, then updates link->ndisc_pref64. Returns 0 when ipv6_accept_ra_use_pref64 is off. */ +static int ndisc_router_process_pref64(Link *link, sd_ndisc_router *rt) { + _cleanup_free_ NDiscPREF64 *new_entry = NULL; + usec_t lifetime_usec; + struct in6_addr a, router; + unsigned prefix_len; + NDiscPREF64 *exist; + int r; + + assert(link); + assert(link->network); + assert(rt); + + if (!link->network->ipv6_accept_ra_use_pref64) + return 0; + + r = sd_ndisc_router_get_address(rt, &router); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get router address from RA: %m"); + + r = sd_ndisc_router_prefix64_get_prefix(rt, &a); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get pref64 prefix: %m"); + + r = 
sd_ndisc_router_prefix64_get_prefixlen(rt, &prefix_len); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get pref64 prefix length: %m"); + + r = sd_ndisc_router_prefix64_get_lifetime_timestamp(rt, CLOCK_BOOTTIME, &lifetime_usec); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get pref64 prefix lifetime: %m"); + + if (lifetime_usec == 0) { + free(set_remove(link->ndisc_pref64, + &(NDiscPREF64) { + .prefix = a, + .prefix_len = prefix_len + })); + return 0; + } + + exist = set_get(link->ndisc_pref64, + &(NDiscPREF64) { + .prefix = a, + .prefix_len = prefix_len + }); + if (exist) { + /* update existing entry */ + exist->router = router; + exist->lifetime_usec = lifetime_usec; + return 0; + } + + if (set_size(link->ndisc_pref64) >= NDISC_PREF64_MAX) { + log_link_debug(link, "Too many PREF64 records received. Only first %u records will be used.", NDISC_PREF64_MAX); + return 0; + } + + new_entry = new(NDiscPREF64, 1); + if (!new_entry) + return log_oom(); + + *new_entry = (NDiscPREF64) { + .router = router, + .lifetime_usec = lifetime_usec, + .prefix = a, + .prefix_len = prefix_len, + }; + + r = set_ensure_put(&link->ndisc_pref64, &ndisc_pref64_hash_ops, new_entry); + if (r < 0) + return log_oom(); + + assert(r > 0); + TAKE_PTR(new_entry); + + return 0; +} + +static int ndisc_router_process_options(Link *link, sd_ndisc_router *rt) { + size_t n_captive_portal = 0; + int r; + + assert(link); + assert(link->network); + assert(rt); + + for (r = sd_ndisc_router_option_rewind(rt); ; r = sd_ndisc_router_option_next(rt)) { + uint8_t type; + + if (r < 0) + return log_link_warning_errno(link, r, "Failed to iterate through options: %m"); + if (r == 0) /* EOF */ + return 0; + + r = sd_ndisc_router_option_get_type(rt, &type); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get RA option type: %m"); + + switch (type) { + case SD_NDISC_OPTION_PREFIX_INFORMATION: + r = ndisc_router_process_prefix(link, rt); + break; + + case 
SD_NDISC_OPTION_ROUTE_INFORMATION: + r = ndisc_router_process_route(link, rt); + break; + + case SD_NDISC_OPTION_RDNSS: + r = ndisc_router_process_rdnss(link, rt); + break; + + case SD_NDISC_OPTION_DNSSL: + r = ndisc_router_process_dnssl(link, rt); + break; + case SD_NDISC_OPTION_CAPTIVE_PORTAL: + if (n_captive_portal > 0) { + if (n_captive_portal == 1) + log_link_notice(link, "Received RA with multiple captive portals, only using the first one."); + + n_captive_portal++; + continue; + } + r = ndisc_router_process_captive_portal(link, rt); + if (r > 0) + n_captive_portal++; + break; + case SD_NDISC_OPTION_PREF64: + r = ndisc_router_process_pref64(link, rt); + break; + } + if (r < 0 && r != -EBADMSG) + return r; + } +} + +static int ndisc_drop_outdated(Link *link, usec_t timestamp_usec) { + bool updated = false; + NDiscDNSSL *dnssl; + NDiscRDNSS *rdnss; + NDiscCaptivePortal *cp; + NDiscPREF64 *p64; + Address *address; + Route *route; + int r = 0, k; + + assert(link); + + /* If an address or friends is already assigned, but not valid anymore, then refuse to update it, + * and let's immediately remove it. + * See RFC4862, section 5.5.3.e. But the following logic is deviated from RFC4862 by honoring all + * valid lifetimes to improve the reaction of SLAAC to renumbering events. + * See draft-ietf-6man-slaac-renum-02, section 4.2. 
*/ + + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_NDISC) + continue; + + if (route->lifetime_usec >= timestamp_usec) + continue; /* the route is still valid */ + + k = route_remove_and_drop(route); + if (k < 0) + r = log_link_warning_errno(link, k, "Failed to remove outdated SLAAC route, ignoring: %m"); + } + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_NDISC) + continue; + + if (address->lifetime_valid_usec >= timestamp_usec) + continue; /* the address is still valid */ + + k = address_remove_and_drop(address); + if (k < 0) + r = log_link_warning_errno(link, k, "Failed to remove outdated SLAAC address, ignoring: %m"); + } + + SET_FOREACH(rdnss, link->ndisc_rdnss) { + if (rdnss->lifetime_usec >= timestamp_usec) + continue; /* the DNS server is still valid */ + + free(set_remove(link->ndisc_rdnss, rdnss)); + updated = true; + } + + SET_FOREACH(dnssl, link->ndisc_dnssl) { + if (dnssl->lifetime_usec >= timestamp_usec) + continue; /* the DNS domain is still valid */ + + free(set_remove(link->ndisc_dnssl, dnssl)); + updated = true; + } + + SET_FOREACH(cp, link->ndisc_captive_portals) { + if (cp->lifetime_usec >= timestamp_usec) + continue; /* the captive portal is still valid */ + + ndisc_captive_portal_free(set_remove(link->ndisc_captive_portals, cp)); + updated = true; + } + + SET_FOREACH(p64, link->ndisc_pref64) { + if (p64->lifetime_usec >= timestamp_usec) + continue; /* the pref64 prefix is still valid */ + + free(set_remove(link->ndisc_pref64, p64)); + /* The pref64 prefix is not exported through the state file, hence it is not necessary to set + * the 'updated' flag. 
*/ + } + + if (updated) + link_dirty(link); + + return r; +} + +static int ndisc_setup_expire(Link *link); + +static int ndisc_expire_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Link *link = ASSERT_PTR(userdata); + usec_t now_usec; + + assert(link->manager); + + assert_se(sd_event_now(link->manager->event, CLOCK_BOOTTIME, &now_usec) >= 0); + + (void) ndisc_drop_outdated(link, now_usec); + (void) ndisc_setup_expire(link); + return 0; +} + +static int ndisc_setup_expire(Link *link) { + usec_t lifetime_usec = USEC_INFINITY; + NDiscCaptivePortal *cp; + NDiscDNSSL *dnssl; + NDiscRDNSS *rdnss; + NDiscPREF64 *p64; + Address *address; + Route *route; + int r; + + assert(link); + assert(link->manager); + + SET_FOREACH(route, link->routes) { + if (route->source != NETWORK_CONFIG_SOURCE_NDISC) + continue; + + if (!route_exists(route)) + continue; + + lifetime_usec = MIN(lifetime_usec, route->lifetime_usec); + } + + SET_FOREACH(address, link->addresses) { + if (address->source != NETWORK_CONFIG_SOURCE_NDISC) + continue; + + if (!address_exists(address)) + continue; + + lifetime_usec = MIN(lifetime_usec, address->lifetime_valid_usec); + } + + SET_FOREACH(rdnss, link->ndisc_rdnss) + lifetime_usec = MIN(lifetime_usec, rdnss->lifetime_usec); + + SET_FOREACH(dnssl, link->ndisc_dnssl) + lifetime_usec = MIN(lifetime_usec, dnssl->lifetime_usec); + + SET_FOREACH(cp, link->ndisc_captive_portals) + lifetime_usec = MIN(lifetime_usec, cp->lifetime_usec); + + SET_FOREACH(p64, link->ndisc_pref64) + lifetime_usec = MIN(lifetime_usec, p64->lifetime_usec); + + if (lifetime_usec == USEC_INFINITY) + return 0; + + r = event_reset_time(link->manager->event, &link->ndisc_expire, CLOCK_BOOTTIME, + lifetime_usec, 0, ndisc_expire_handler, link, 0, "ndisc-expiration", true); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to update expiration timer for ndisc: %m"); + + return 0; +} + +static int ndisc_start_dhcp6_client(Link *link, sd_ndisc_router *rt) { + int r; + + 
assert(link); + assert(link->network); + + switch (link->network->ipv6_accept_ra_start_dhcp6_client) { + case IPV6_ACCEPT_RA_START_DHCP6_CLIENT_NO: + return 0; + + case IPV6_ACCEPT_RA_START_DHCP6_CLIENT_YES: { + uint64_t flags; + + r = sd_ndisc_router_get_flags(rt, &flags); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get RA flags: %m"); + + if ((flags & (ND_RA_FLAG_MANAGED | ND_RA_FLAG_OTHER)) == 0) + return 0; + + /* (re)start DHCPv6 client in stateful or stateless mode according to RA flags. + * Note, if both "managed" and "other configuration" bits are set, then ignore + * "other configuration" bit. See RFC 4861. */ + r = dhcp6_start_on_ra(link, !(flags & ND_RA_FLAG_MANAGED)); + break; + } + case IPV6_ACCEPT_RA_START_DHCP6_CLIENT_ALWAYS: + /* When IPv6AcceptRA.DHCPv6Client=always, start dhcp6 client in solicit mode + * even if the router flags have neither M nor O flags. */ + r = dhcp6_start_on_ra(link, /* information_request = */ false); + break; + + default: + assert_not_reached(); + } + + if (r < 0) + return log_link_warning_errno(link, r, "Could not acquire DHCPv6 lease on NDisc request: %m"); + + log_link_debug(link, "Acquiring DHCPv6 lease on NDisc request"); + return 0; +} + +static int ndisc_router_handler(Link *link, sd_ndisc_router *rt) { + struct in6_addr router; + usec_t timestamp_usec; + int r; + + assert(link); + assert(link->network); + assert(link->manager); + assert(rt); + + r = sd_ndisc_router_get_address(rt, &router); + if (r == -ENODATA) { + log_link_debug(link, "Received RA without router address, ignoring."); + return 0; + } + if (r < 0) + return log_link_warning_errno(link, r, "Failed to get router address from RA: %m"); + + if (in6_prefix_is_filtered(&router, 128, link->network->ndisc_allow_listed_router, link->network->ndisc_deny_listed_router)) { + if (DEBUG_LOGGING) { + if (!set_isempty(link->network->ndisc_allow_listed_router)) + log_link_debug(link, "Router %s is not in allow list, ignoring.", 
IN6_ADDR_TO_STRING(&router)); + else + log_link_debug(link, "Router %s is in deny list, ignoring.", IN6_ADDR_TO_STRING(&router)); + } + return 0; + } + + r = sd_ndisc_router_get_timestamp(rt, CLOCK_BOOTTIME, ×tamp_usec); + if (r == -ENODATA) { + log_link_debug(link, "Received RA without timestamp, ignoring."); + return 0; + } + if (r < 0) + return r; + + r = ndisc_drop_outdated(link, timestamp_usec); + if (r < 0) + return r; + + r = ndisc_start_dhcp6_client(link, rt); + if (r < 0) + return r; + + r = ndisc_router_process_default(link, rt); + if (r < 0) + return r; + + r = ndisc_router_process_icmp6_ratelimit(link, rt); + if (r < 0) + return r; + + r = ndisc_router_process_options(link, rt); + if (r < 0) + return r; + + r = ndisc_setup_expire(link); + if (r < 0) + return r; + + if (link->ndisc_messages == 0) + link->ndisc_configured = true; + else + log_link_debug(link, "Setting SLAAC addresses and router."); + + if (!link->ndisc_configured) + link_set_state(link, LINK_STATE_CONFIGURING); + + link_check_ready(link); + return 0; +} + +static void ndisc_handler(sd_ndisc *nd, sd_ndisc_event_t event, sd_ndisc_router *rt, void *userdata) { + Link *link = ASSERT_PTR(userdata); + int r; + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return; + + switch (event) { + + case SD_NDISC_EVENT_ROUTER: + r = ndisc_router_handler(link, rt); + if (r < 0 && r != -EBADMSG) { + link_enter_failed(link); + return; + } + break; + + case SD_NDISC_EVENT_TIMEOUT: + log_link_debug(link, "NDisc handler get timeout event"); + if (link->ndisc_messages == 0) { + link->ndisc_configured = true; + link_check_ready(link); + } + break; + default: + assert_not_reached(); + } +} + +static int ndisc_configure(Link *link) { + int r; + + assert(link); + + if (!link_ipv6_accept_ra_enabled(link)) + return 0; + + if (link->ndisc) + return -EBUSY; /* Already configured. 
*/ + + r = sd_ndisc_new(&link->ndisc); + if (r < 0) + return r; + + r = sd_ndisc_attach_event(link->ndisc, link->manager->event, 0); + if (r < 0) + return r; + /* The MAC address is only passed on when the hardware address has Ethernet length. */ + if (link->hw_addr.length == ETH_ALEN) { + r = sd_ndisc_set_mac(link->ndisc, &link->hw_addr.ether); + if (r < 0) + return r; + } + + r = sd_ndisc_set_ifindex(link->ndisc, link->ifindex); + if (r < 0) + return r; + + r = sd_ndisc_set_callback(link->ndisc, ndisc_handler, link); + if (r < 0) + return r; + + return 0; +} +/* Starts router discovery on the link. Returns 1 when started; 0 when skipped because the sd_ndisc object or the DHCPv6 client is missing, the link has no carrier, or the IPv6 link-local address is still unset; and the negative result of sd_ndisc_start() on failure. */ +int ndisc_start(Link *link) { + int r; + + assert(link); + + if (!link->ndisc || !link->dhcp6_client) + return 0; + + if (!link_has_carrier(link)) + return 0; + + if (in6_addr_is_null(&link->ipv6ll_address)) + return 0; + + log_link_debug(link, "Discovering IPv6 routers"); + + r = sd_ndisc_start(link->ndisc); + if (r < 0) + return r; + + return 1; +} +/* Request callback for REQUEST_TYPE_NDISC: configures and starts NDisc. Returns 0 while the link is not ready to be configured, 1 once NDisc is configured, or a negative value on failure. */ +static int ndisc_process_request(Request *req, Link *link, void *userdata) { + int r; + + assert(link); + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return 0; + + r = ndisc_configure(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure IPv6 Router Discovery: %m"); + + r = ndisc_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start IPv6 Router Discovery: %m"); + + log_link_debug(link, "IPv6 Router Discovery is configured%s.", + r > 0 ? 
" and started" : ""); + return 1; +} +/* Queues a request to configure NDisc, unless RA handling is disabled for the link or the sd_ndisc object already exists. */ +int link_request_ndisc(Link *link) { + int r; + + assert(link); + + if (!link_ipv6_accept_ra_enabled(link)) + return 0; + + if (link->ndisc) + return 0; + + r = link_queue_request(link, REQUEST_TYPE_NDISC, ndisc_process_request, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request configuring of the IPv6 Router Discovery: %m"); + + log_link_debug(link, "Requested configuring of the IPv6 Router Discovery."); + return 0; +} +/* Disables and releases the expiration timer, then stops the sd_ndisc object; returns the result of sd_ndisc_stop(). */ +int ndisc_stop(Link *link) { + assert(link); + + link->ndisc_expire = sd_event_source_disable_unref(link->ndisc_expire); + + return sd_ndisc_stop(link->ndisc); +} + +/* Frees every set of NDisc-provided entries kept on the link and resets the pointers. */ +void ndisc_flush(Link *link) { + assert(link); + + /* Remove all RDNSS, DNSSL, Captive Portal, and PREF64 entries, without exception. */ + + link->ndisc_rdnss = set_free(link->ndisc_rdnss); + link->ndisc_dnssl = set_free(link->ndisc_dnssl); + link->ndisc_captive_portals = set_free(link->ndisc_captive_portals); + link->ndisc_pref64 = set_free(link->ndisc_pref64); +} + +static const char* const ipv6_accept_ra_start_dhcp6_client_table[_IPV6_ACCEPT_RA_START_DHCP6_CLIENT_MAX] = { + [IPV6_ACCEPT_RA_START_DHCP6_CLIENT_NO] = "no", + [IPV6_ACCEPT_RA_START_DHCP6_CLIENT_ALWAYS] = "always", + [IPV6_ACCEPT_RA_START_DHCP6_CLIENT_YES] = "yes", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(ipv6_accept_ra_start_dhcp6_client, IPv6AcceptRAStartDHCP6Client, IPV6_ACCEPT_RA_START_DHCP6_CLIENT_YES); + +DEFINE_CONFIG_PARSE_ENUM(config_parse_ipv6_accept_ra_use_domains, dhcp_use_domains, DHCPUseDomains, + "Failed to parse UseDomains= setting"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ipv6_accept_ra_start_dhcp6_client, ipv6_accept_ra_start_dhcp6_client, IPv6AcceptRAStartDHCP6Client, + "Failed to parse DHCPv6Client= setting"); diff --git a/src/network/networkd-ndisc.h b/src/network/networkd-ndisc.h new file mode 100644 index 0000000..a463f42 --- /dev/null +++ b/src/network/networkd-ndisc.h @@ -0,0 +1,66 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "time-util.h" + +typedef struct Link Link; +typedef struct Network Network; + +typedef enum IPv6AcceptRAStartDHCP6Client { + IPV6_ACCEPT_RA_START_DHCP6_CLIENT_NO, + IPV6_ACCEPT_RA_START_DHCP6_CLIENT_ALWAYS, + IPV6_ACCEPT_RA_START_DHCP6_CLIENT_YES, + _IPV6_ACCEPT_RA_START_DHCP6_CLIENT_MAX, + _IPV6_ACCEPT_RA_START_DHCP6_CLIENT_INVALID = -EINVAL, +} IPv6AcceptRAStartDHCP6Client; + +typedef struct NDiscRDNSS { + struct in6_addr router; + /* This is an absolute point in time, and NOT a timespan/duration. + * Must be specified with CLOCK_BOOTTIME. */ + usec_t lifetime_usec; + struct in6_addr address; +} NDiscRDNSS; + +typedef struct NDiscDNSSL { + struct in6_addr router; + /* This is an absolute point in time, and NOT a timespan/duration. + * Must be specified with CLOCK_BOOTTIME. */ + usec_t lifetime_usec; + /* The domain name follows immediately. */ +} NDiscDNSSL; + +typedef struct NDiscCaptivePortal { + struct in6_addr router; + /* This is an absolute point in time, and NOT a timespan/duration. + * Must be specified with CLOCK_BOOTTIME. */ + usec_t lifetime_usec; + char *captive_portal; +} NDiscCaptivePortal; + +typedef struct NDiscPREF64 { + struct in6_addr router; + /* This is an absolute point in time, and NOT a timespan/duration. + * Must be specified with CLOCK_BOOTTIME. 
*/ + usec_t lifetime_usec; + uint8_t prefix_len; + struct in6_addr prefix; +} NDiscPREF64; + +static inline char* NDISC_DNSSL_DOMAIN(const NDiscDNSSL *n) { + return ((char*) n) + ALIGN(sizeof(NDiscDNSSL)); +} + +bool link_ipv6_accept_ra_enabled(Link *link); + +void network_adjust_ipv6_accept_ra(Network *network); + +int ndisc_start(Link *link); +int ndisc_stop(Link *link); +void ndisc_flush(Link *link); + +int link_request_ndisc(Link *link); + +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_accept_ra_start_dhcp6_client); +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_accept_ra_use_domains); diff --git a/src/network/networkd-neighbor.c b/src/network/networkd-neighbor.c new file mode 100644 index 0000000..8321831 --- /dev/null +++ b/src/network/networkd-neighbor.c @@ -0,0 +1,756 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "hashmap.h" +#include "netlink-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-neighbor.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "set.h" + +Neighbor *neighbor_free(Neighbor *neighbor) { + if (!neighbor) + return NULL; + + if (neighbor->network) { + assert(neighbor->section); + ordered_hashmap_remove(neighbor->network->neighbors_by_section, neighbor->section); + } + + config_section_free(neighbor->section); + + if (neighbor->link) + set_remove(neighbor->link->neighbors, neighbor); + + return mfree(neighbor); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(Neighbor, neighbor_free); + +static int neighbor_new_static(Network *network, const char *filename, unsigned section_line, Neighbor **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(neighbor_freep) Neighbor *neighbor = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + neighbor = ordered_hashmap_get(network->neighbors_by_section, n); + if 
(neighbor) { + *ret = TAKE_PTR(neighbor); + return 0; + } + + neighbor = new(Neighbor, 1); + if (!neighbor) + return -ENOMEM; + + *neighbor = (Neighbor) { + .network = network, + .family = AF_UNSPEC, + .section = TAKE_PTR(n), + .source = NETWORK_CONFIG_SOURCE_STATIC, + }; + + r = ordered_hashmap_ensure_put(&network->neighbors_by_section, &config_section_hash_ops, neighbor->section, neighbor); + if (r < 0) + return r; + + *ret = TAKE_PTR(neighbor); + return 0; +} + +static int neighbor_dup(const Neighbor *neighbor, Neighbor **ret) { + _cleanup_(neighbor_freep) Neighbor *dest = NULL; + + assert(neighbor); + assert(ret); + + dest = newdup(Neighbor, neighbor, 1); + if (!dest) + return -ENOMEM; + + /* Unset all pointers */ + dest->link = NULL; + dest->network = NULL; + dest->section = NULL; + + *ret = TAKE_PTR(dest); + return 0; +} + +static void neighbor_hash_func(const Neighbor *neighbor, struct siphash *state) { + assert(neighbor); + + siphash24_compress(&neighbor->family, sizeof(neighbor->family), state); + + if (!IN_SET(neighbor->family, AF_INET, AF_INET6)) + /* treat any other address family as AF_UNSPEC */ + return; + + /* Equality of neighbors are given by the destination address. + * See neigh_lookup() in the kernel. 
*/ + siphash24_compress(&neighbor->in_addr, FAMILY_ADDRESS_SIZE(neighbor->family), state); +} + +static int neighbor_compare_func(const Neighbor *a, const Neighbor *b) { + int r; + + r = CMP(a->family, b->family); + if (r != 0) + return r; + + if (!IN_SET(a->family, AF_INET, AF_INET6)) + /* treat any other address family as AF_UNSPEC */ + return 0; + + return memcmp(&a->in_addr, &b->in_addr, FAMILY_ADDRESS_SIZE(a->family)); +} + +DEFINE_PRIVATE_HASH_OPS( + neighbor_hash_ops, + Neighbor, + neighbor_hash_func, + neighbor_compare_func); + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + neighbor_hash_ops_free, + Neighbor, + neighbor_hash_func, + neighbor_compare_func, + neighbor_free); + +static int neighbor_get_request(Link *link, const Neighbor *neighbor, Request **ret) { + Request *req; + + assert(link); + assert(link->manager); + assert(neighbor); + + req = ordered_set_get( + link->manager->request_queue, + &(Request) { + .link = link, + .type = REQUEST_TYPE_NEIGHBOR, + .userdata = (void*) neighbor, + .hash_func = (hash_func_t) neighbor_hash_func, + .compare_func = (compare_func_t) neighbor_compare_func, + }); + if (!req) + return -ENOENT; + + if (ret) + *ret = req; + return 0; +} + +static int neighbor_get(Link *link, const Neighbor *in, Neighbor **ret) { + Neighbor *existing; + + assert(link); + assert(in); + + existing = set_get(link->neighbors, in); + if (!existing) + return -ENOENT; + + if (ret) + *ret = existing; + return 0; +} + +static int neighbor_add(Link *link, Neighbor *neighbor) { + int r; + + assert(link); + assert(neighbor); + + r = set_ensure_put(&link->neighbors, &neighbor_hash_ops_free, neighbor); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + neighbor->link = link; + return 0; +} + +static void log_neighbor_debug(const Neighbor *neighbor, const char *str, const Link *link) { + _cleanup_free_ char *state = NULL; + + assert(neighbor); + assert(str); + + if (!DEBUG_LOGGING) + return; + + (void) 
network_config_state_to_string_alloc(neighbor->state, &state); + + log_link_debug(link, + "%s %s neighbor (%s): lladdr: %s, dst: %s", + str, strna(network_config_source_to_string(neighbor->source)), strna(state), + HW_ADDR_TO_STR(&neighbor->ll_addr), + IN_ADDR_TO_STRING(neighbor->family, &neighbor->in_addr)); +} + +static int neighbor_configure(Neighbor *neighbor, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(neighbor); + assert(link); + assert(link->ifindex > 0); + assert(link->manager); + assert(link->manager->rtnl); + assert(req); + + log_neighbor_debug(neighbor, "Configuring", link); + + r = sd_rtnl_message_new_neigh(link->manager->rtnl, &m, RTM_NEWNEIGH, + link->ifindex, neighbor->family); + if (r < 0) + return r; + + r = sd_rtnl_message_neigh_set_state(m, NUD_PERMANENT); + if (r < 0) + return r; + + r = netlink_message_append_hw_addr(m, NDA_LLADDR, &neighbor->ll_addr); + if (r < 0) + return r; + + r = netlink_message_append_in_addr_union(m, NDA_DST, neighbor->family, &neighbor->in_addr); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static int neighbor_process_request(Request *req, Link *link, Neighbor *neighbor) { + Neighbor *existing; + int r; + + assert(req); + assert(link); + assert(neighbor); + + if (!link_is_ready_to_configure(link, false)) + return 0; + + r = neighbor_configure(neighbor, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure neighbor: %m"); + + neighbor_enter_configuring(neighbor); + if (neighbor_get(link, neighbor, &existing) >= 0) + neighbor_enter_configuring(existing); + + return 1; +} + +static int static_neighbor_configure_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Neighbor *neighbor) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, 
"Could not set neighbor"); + link_enter_failed(link); + return 1; + } + + if (link->static_neighbor_messages == 0) { + log_link_debug(link, "Neighbors set"); + link->static_neighbors_configured = true; + link_check_ready(link); + } + + return 1; +} + +static int link_request_neighbor(Link *link, const Neighbor *neighbor) { + _cleanup_(neighbor_freep) Neighbor *tmp = NULL; + Neighbor *existing = NULL; + int r; + + assert(link); + assert(neighbor); + assert(neighbor->source != NETWORK_CONFIG_SOURCE_FOREIGN); + + if (neighbor->ll_addr.length != link->hw_addr.length) { + log_link_debug(link, + "The link layer address length (%zu) for neighbor %s does not match with " + "the hardware address length (%zu), ignoring the setting.", + neighbor->ll_addr.length, + IN_ADDR_TO_STRING(neighbor->family, &neighbor->in_addr), + link->hw_addr.length); + return 0; + } + + r = neighbor_dup(neighbor, &tmp); + if (r < 0) + return r; + + if (neighbor_get(link, neighbor, &existing) >= 0) + /* Copy state for logging below. 
*/ + tmp->state = existing->state; + + log_neighbor_debug(tmp, "Requesting", link); + r = link_queue_request_safe(link, REQUEST_TYPE_NEIGHBOR, + tmp, + neighbor_free, + neighbor_hash_func, + neighbor_compare_func, + neighbor_process_request, + &link->static_neighbor_messages, + static_neighbor_configure_handler, + NULL); + if (r <= 0) + return r; + + neighbor_enter_requesting(tmp); + if (existing) + neighbor_enter_requesting(existing); + + TAKE_PTR(tmp); + return 1; +} + +int link_request_static_neighbors(Link *link) { + Neighbor *neighbor; + int r; + + assert(link); + assert(link->network); + assert(link->state != _LINK_STATE_INVALID); + + link->static_neighbors_configured = false; + + ORDERED_HASHMAP_FOREACH(neighbor, link->network->neighbors_by_section) { + r = link_request_neighbor(link, neighbor); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request neighbor: %m"); + } + + if (link->static_neighbor_messages == 0) { + link->static_neighbors_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Requesting neighbors"); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + +static int neighbor_remove_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + assert(link); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 1; + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -ESRCH) + /* Neighbor may not exist because it already got deleted, ignore that. 
*/ + log_link_message_warning_errno(link, m, r, "Could not remove neighbor"); + + return 1; +} + /* Send an asynchronous RTM_DELNEIGH for the neighbor (identified by the link ifindex, the address family and the NDA_DST address), then move the neighbor and any matching queued request to the removing state. Returns 0 on success or a negative errno if the message could not be built or sent. */ +static int neighbor_remove(Neighbor *neighbor) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + Request *req; + Link *link; + int r; + + assert(neighbor); + assert(neighbor->link); + assert(neighbor->link->manager); + assert(neighbor->link->manager->rtnl); + + link = neighbor->link; + + log_neighbor_debug(neighbor, "Removing", link); + + r = sd_rtnl_message_new_neigh(link->manager->rtnl, &m, RTM_DELNEIGH, + link->ifindex, neighbor->family); + if (r < 0) + return log_link_error_errno(link, r, "Could not allocate RTM_DELNEIGH message: %m"); + + r = netlink_message_append_in_addr_union(m, NDA_DST, neighbor->family, &neighbor->in_addr); + if (r < 0) + return log_link_error_errno(link, r, "Could not append NDA_DST attribute: %m"); + + r = netlink_call_async(link->manager->rtnl, NULL, m, neighbor_remove_handler, + link_netlink_destroy_callback, link); + if (r < 0) + return log_link_error_errno(link, r, "Could not send rtnetlink message: %m"); + /* NOTE(review): this reference is presumably dropped again by link_netlink_destroy_callback once the reply has been handled - confirm. */ + link_ref(link); + + neighbor_enter_removing(neighbor); + if (neighbor_get_request(neighbor->link, neighbor, &req) >= 0) + neighbor_enter_removing(req->userdata); + + return 0; +} + /* Remove foreign neighbors (source is NETWORK_CONFIG_SOURCE_FOREIGN) that currently exist on the link, except those matching an entry of the [Neighbor] configuration of the network. Errors of the individual removals are collected with RET_GATHER() and returned. */ +int link_drop_foreign_neighbors(Link *link) { + Neighbor *neighbor; + int r = 0; + + assert(link); + assert(link->network); + + /* First, mark all neighbors. */ + SET_FOREACH(neighbor, link->neighbors) { + /* Do not remove neighbors we configured. */ + if (neighbor->source != NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore neighbors not assigned yet or already removing. */ + if (!neighbor_exists(neighbor)) + continue; + + neighbor_mark(neighbor); + } + + /* Next, unmark requested neighbors. They will be configured later. 
*/ + ORDERED_HASHMAP_FOREACH(neighbor, link->network->neighbors_by_section) { + Neighbor *existing; + + if (neighbor_get(link, neighbor, &existing) >= 0) + neighbor_unmark(existing); + } + + SET_FOREACH(neighbor, link->neighbors) { + if (!neighbor_is_marked(neighbor)) + continue; + + RET_GATHER(r, neighbor_remove(neighbor)); + } + + return r; +} + /* Remove every existing neighbor on the link whose source is not NETWORK_CONFIG_SOURCE_FOREIGN, i.e. the ones this daemon configured. Errors of the individual removals are collected with RET_GATHER() and returned. */ +int link_drop_managed_neighbors(Link *link) { + Neighbor *neighbor; + int r = 0; + + assert(link); + + SET_FOREACH(neighbor, link->neighbors) { + /* Do not touch neighbors managed by kernel or other tools. */ + if (neighbor->source == NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore neighbors not assigned yet or already removing. */ + if (!neighbor_exists(neighbor)) + continue; + + RET_GATHER(r, neighbor_remove(neighbor)); + } + + return r; +} + /* Re-label every neighbor tracked on the link as foreign, so that link_drop_managed_neighbors() will skip them from now on. Nothing is sent to the kernel. */ +void link_foreignize_neighbors(Link *link) { + Neighbor *neighbor; + + assert(link); + + SET_FOREACH(neighbor, link->neighbors) + neighbor->source = NETWORK_CONFIG_SOURCE_FOREIGN; +} + /* rtnetlink callback for RTM_NEWNEIGH and RTM_DELNEIGH messages. Only neighbors with NUD_PERMANENT set, an AF_INET or AF_INET6 family and a known link are tracked; malformed or unrelated messages are logged and ignored with return value 0. Returns 1 after a neighbor was remembered or refreshed, 0 when the message was ignored or handled as a removal, and the result of log_oom() on allocation failure. */ +int manager_rtnl_process_neighbor(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { + _cleanup_(neighbor_freep) Neighbor *tmp = NULL; + Neighbor *neighbor = NULL; + Request *req = NULL; + uint16_t type, state; + bool is_new = false; + int ifindex, r; + Link *link; + + assert(rtnl); + assert(message); + assert(m); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: failed to receive neighbor message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWNEIGH, RTM_DELNEIGH)) { + log_warning("rtnl: received unexpected message type %u when processing neighbor, ignoring.", type); + return 0; + } + + r = sd_rtnl_message_neigh_get_state(message, &state); + if (r < 0) { + log_warning_errno(r, "rtnl: received neighbor message 
with invalid state, ignoring: %m"); + return 0; + } else if (!FLAGS_SET(state, NUD_PERMANENT)) { + log_debug("rtnl: received non-static neighbor, ignoring."); + return 0; + } + + r = sd_rtnl_message_neigh_get_ifindex(message, &ifindex); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get ifindex from message, ignoring: %m"); + return 0; + } else if (ifindex <= 0) { + log_warning("rtnl: received neighbor message with invalid ifindex %d, ignoring.", ifindex); + return 0; + } + + r = link_get_by_index(m, ifindex, &link); + if (r < 0) { + /* when enumerating we might be out of sync, but we will get the neighbor again. Also, + * kernel sends messages about neighbors after a link is removed. So, just ignore it. */ + log_debug("rtnl: received neighbor for link '%d' we don't know about, ignoring.", ifindex); + return 0; + } + + tmp = new0(Neighbor, 1); + if (!tmp) + return log_oom(); + + /* First, retrieve the fundamental information about the neighbor. */ + r = sd_rtnl_message_neigh_get_family(message, &tmp->family); + if (r < 0) { + log_link_warning(link, "rtnl: received neighbor message without family, ignoring."); + return 0; + } else if (!IN_SET(tmp->family, AF_INET, AF_INET6)) { + log_link_debug(link, "rtnl: received neighbor message with invalid family '%i', ignoring.", tmp->family); + return 0; + } + + r = netlink_message_read_in_addr_union(message, NDA_DST, tmp->family, &tmp->in_addr); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received neighbor message without valid address, ignoring: %m"); + return 0; + } + + /* Then, find the managed Neighbor and Request objects corresponding to the netlink notification. 
*/ + (void) neighbor_get(link, tmp, &neighbor); + (void) neighbor_get_request(link, tmp, &req); + + if (type == RTM_DELNEIGH) { + if (neighbor) { + neighbor_enter_removed(neighbor); + log_neighbor_debug(neighbor, "Forgetting removed", link); + neighbor_free(neighbor); + } else + log_neighbor_debug(tmp, "Kernel removed unknown", link); + + if (req) + neighbor_enter_removed(req->userdata); + + return 0; + } + + /* If we did not know the neighbor, then save it. */ + if (!neighbor) { + r = neighbor_add(link, tmp); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to save received neighbor, ignoring: %m"); + return 0; + } + neighbor = TAKE_PTR(tmp); + is_new = true; + } + + /* Also update information that cannot be obtained through netlink notification. */ + if (req && req->waiting_reply) { + Neighbor *n = ASSERT_PTR(req->userdata); + + neighbor->source = n->source; + } + + /* Then, update miscellaneous info. */ + r = netlink_message_read_hw_addr(message, NDA_LLADDR, &neighbor->ll_addr); + if (r < 0 && r != -ENODATA) + log_link_debug_errno(link, r, "rtnl: received neighbor message without valid link layer address, ignoring: %m"); + + neighbor_enter_configured(neighbor); + if (req) + neighbor_enter_configured(req->userdata); + + log_neighbor_debug(neighbor, is_new ? "Remembering" : "Received remembered", link); + return 1; +} + /* Validate one parsed [Neighbor] section. Rejects sections already flagged invalid, sections without an address, IPv6 addresses when socket_ipv6_is_supported() is false, and sections without a link layer address. Returns 0 when the section is usable and a negative errno-style value otherwise. */ +static int neighbor_section_verify(Neighbor *neighbor) { + if (section_is_invalid(neighbor->section)) + return -EINVAL; + + if (neighbor->family == AF_UNSPEC) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Neighbor section without Address= configured. " + "Ignoring [Neighbor] section from line %u.", + neighbor->section->filename, neighbor->section->line); + + if (neighbor->family == AF_INET6 && !socket_ipv6_is_supported()) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Neighbor section with an IPv6 destination address configured, " + "but the kernel does not support IPv6. 
" + "Ignoring [Neighbor] section from line %u.", + neighbor->section->filename, neighbor->section->line); + + if (neighbor->ll_addr.length == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Neighbor section without LinkLayerAddress= configured. " + "Ignoring [Neighbor] section from line %u.", + neighbor->section->filename, neighbor->section->line); + + return 0; +} + /* Drop the [Neighbor] sections of the network that fail neighbor_section_verify() and de-duplicate the remaining ones, always keeping the section specified later. A temporary set that does not own its members is used to detect duplicates. Returns 0, or the result of log_oom() when that set cannot grow. */ +int network_drop_invalid_neighbors(Network *network) { + _cleanup_set_free_ Set *neighbors = NULL; + Neighbor *neighbor; + int r; + + assert(network); + + ORDERED_HASHMAP_FOREACH(neighbor, network->neighbors_by_section) { + Neighbor *dup; + + if (neighbor_section_verify(neighbor) < 0) { + /* Drop invalid [Neighbor] sections. Note that neighbor_free() will drop the + * neighbor from neighbors_by_section. */ + neighbor_free(neighbor); + continue; + } + + /* Always use the setting specified later. So, remove the previously assigned setting. */ + dup = set_remove(neighbors, neighbor); + if (dup) { + log_warning("%s: Duplicated neighbor settings for %s is specified at line %u and %u, " + "dropping the neighbor setting specified at line %u.", + dup->section->filename, + IN_ADDR_TO_STRING(neighbor->family, &neighbor->in_addr), + neighbor->section->line, + dup->section->line, dup->section->line); + /* neighbor_free() will drop the neighbor from neighbors_by_section. */ + neighbor_free(dup); + } + + /* Use neighbor_hash_ops, instead of neighbor_hash_ops_free. Otherwise, the Neighbor objects + * will be freed. 
*/ + r = set_ensure_put(&neighbors, &neighbor_hash_ops, neighbor); + if (r < 0) + return log_oom(); + assert(r > 0); + } + + return 0; +} + + /* Config parser for Address= in a [Neighbor] section. Obtains the Neighbor object of this section via neighbor_new_static() and stores the parsed address and its family in it. An empty value resets both to AF_UNSPEC and IN_ADDR_NULL; an invalid value is logged and ignored. Returns 0, or the result of log_oom() when neighbor_new_static() fails. NOTE(review): on the invalid-value path the object is left to the neighbor_free_or_set_invalidp cleanup handler, which presumably frees it or flags the section invalid - confirm. */ +int config_parse_neighbor_address( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(neighbor_free_or_set_invalidp) Neighbor *n = NULL; + Network *network = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = neighbor_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + n->family = AF_UNSPEC; + n->in_addr = IN_ADDR_NULL; + TAKE_PTR(n); + return 0; + } + + r = in_addr_from_string_auto(rvalue, &n->family, &n->in_addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Neighbor Address is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + /* Config parser for the link layer address keys of a [Neighbor] section (the key name is taken from lvalue for the log message). Obtains the Neighbor object of this section via neighbor_new_static() and stores the result of parse_hw_addr() in ll_addr. An empty value resets it to HW_ADDR_NULL; an invalid value is logged and ignored. Returns 0, or the result of log_oom() when neighbor_new_static() fails. */ +int config_parse_neighbor_lladdr( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(neighbor_free_or_set_invalidp) Neighbor *n = NULL; + Network *network = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = neighbor_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + n->ll_addr = HW_ADDR_NULL; + TAKE_PTR(n); + return 0; + } + + r = parse_hw_addr(rvalue, &n->ll_addr); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Neighbor %s= is invalid, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} diff --git a/src/network/networkd-neighbor.h b/src/network/networkd-neighbor.h new file mode 100644 index 
0000000..683a310 --- /dev/null +++ b/src/network/networkd-neighbor.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + /* NOTE(review): the header name of the #include directive above is missing; it looks like an angle-bracket system header was stripped when this patch was extracted (presumably stdbool.h) - confirm against upstream before applying. */ +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "ether-addr-util.h" +#include "in-addr-util.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Manager Manager; +typedef struct Network Network; + /* One neighbor table entry: an IP address (family plus in_addr) mapped to a link layer address (ll_addr), together with pointers to the Network, Link and config section it is associated with, and its configuration source and state. */ +typedef struct Neighbor { + Network *network; + Link *link; + ConfigSection *section; + NetworkConfigSource source; + NetworkConfigState state; + + int family; + union in_addr_union in_addr; + struct hw_addr_data ll_addr; +} Neighbor; + +Neighbor *neighbor_free(Neighbor *neighbor); + +int network_drop_invalid_neighbors(Network *network); + +int link_drop_managed_neighbors(Link *link); +int link_drop_foreign_neighbors(Link *link); +void link_foreignize_neighbors(Link *link); + +int link_request_static_neighbors(Link *link); + +int manager_rtnl_process_neighbor(sd_netlink *rtnl, sd_netlink_message *message, Manager *m); + +DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(Neighbor, neighbor); + +CONFIG_PARSER_PROTOTYPE(config_parse_neighbor_address); +CONFIG_PARSER_PROTOTYPE(config_parse_neighbor_lladdr); diff --git a/src/network/networkd-netlabel.c b/src/network/networkd-netlabel.c new file mode 100644 index 0000000..94bf8f5 --- /dev/null +++ b/src/network/networkd-netlabel.c @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "escape.h" +#include "netlink-util.h" +#include "networkd-address.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-netlabel.h" +#include "networkd-network.h" + /* Reply handler for the generic netlink request sent by netlabel_command(). It only logs whether the operation failed or succeeded and always returns 1. */ +static int netlabel_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert_se(rtnl); + assert_se(m); + assert_se(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + log_link_message_warning_errno(link, m, r, "NetLabel operation failed, ignoring"); + return 1; + } + + log_link_debug(link, "NetLabel 
operation successful"); + + return 1; +} + /* Build and asynchronously send one NetLabel unlabeled-traffic command (NLBL_UNLABEL_C_*) for the given address over generic netlink. The message carries the interface name, the security context label (only for NLBL_UNLABEL_C_STATICADD), and the masked address together with its netmask. Returns 0 once the request was sent, or a negative errno. */ +static int netlabel_command(uint16_t command, const char *label, const Address *address) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(command != NLBL_UNLABEL_C_UNSPEC && command < __NLBL_UNLABEL_C_MAX); + assert(address); + assert(address->link); + assert(address->link->ifname); + assert(address->link->manager); + assert(address->link->manager->genl); + assert(IN_SET(address->family, AF_INET, AF_INET6)); + + r = sd_genl_message_new(address->link->manager->genl, NETLBL_NLTYPE_UNLABELED_NAME, command, &m); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NLBL_UNLABEL_A_IFACE, address->link->ifname); + if (r < 0) + return r; + + if (command == NLBL_UNLABEL_C_STATICADD) { + assert(label); + r = sd_netlink_message_append_string(m, NLBL_UNLABEL_A_SECCTX, label); + if (r < 0) + return r; + } + + union in_addr_union netmask, masked_addr; + r = in_addr_prefixlen_to_netmask(address->family, &netmask, address->prefixlen); + if (r < 0) + return r; + + /* + * When adding rules, kernel adds the address to its hash table _applying also the netmask_, but on + * removal, an exact match is required _without netmask applied_, so apply the mask on both + * operations. 
+ */ + masked_addr = address->in_addr; + r = in_addr_mask(address->family, &masked_addr, address->prefixlen); + if (r < 0) + return r; + + if (address->family == AF_INET) { + r = sd_netlink_message_append_in_addr(m, NLBL_UNLABEL_A_IPV4ADDR, &masked_addr.in); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(m, NLBL_UNLABEL_A_IPV4MASK, &netmask.in); + } else if (address->family == AF_INET6) { + r = sd_netlink_message_append_in6_addr(m, NLBL_UNLABEL_A_IPV6ADDR, &masked_addr.in6); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(m, NLBL_UNLABEL_A_IPV6MASK, &netmask.in6); + } + if (r < 0) + return r; + + r = netlink_call_async(address->link->manager->genl, NULL, m, netlabel_handler, link_netlink_destroy_callback, + address->link); + if (r < 0) + return r; + + link_ref(address->link); + return 0; +} + +void address_add_netlabel(const Address *address) { + int r; + + assert(address); + + if (!address->netlabel) + return; + + r = netlabel_command(NLBL_UNLABEL_C_STATICADD, address->netlabel, address); + if (r < 0) + log_link_warning_errno(address->link, r, "Adding NetLabel %s for IP address %s failed, ignoring", address->netlabel, + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); + else + log_link_debug(address->link, "Adding NetLabel %s for IP address %s", address->netlabel, + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); +} + +void address_del_netlabel(const Address *address) { + int r; + + assert(address); + + if (!address->netlabel) + return; + + r = netlabel_command(NLBL_UNLABEL_C_STATICREMOVE, address->netlabel, address); + if (r < 0) + log_link_warning_errno(address->link, r, "Deleting NetLabel %s for IP address %s failed, ignoring", address->netlabel, + IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); + else + log_link_debug(address->link, "Deleting NetLabel %s for IP address %s", address->netlabel, + 
IN_ADDR_PREFIX_TO_STRING(address->family, &address->in_addr, address->prefixlen)); +} diff --git a/src/network/networkd-netlabel.h b/src/network/networkd-netlabel.h new file mode 100644 index 0000000..2f30b8f --- /dev/null +++ b/src/network/networkd-netlabel.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + /* NOTE(review): this header uses the Address type without declaring or including it, so it relies on the includer (networkd-netlabel.c includes networkd-address.h first). */ +void address_add_netlabel(const Address *address); +void address_del_netlabel(const Address *address); diff --git a/src/network/networkd-network-bus.c b/src/network/networkd-network-bus.c new file mode 100644 index 0000000..0c40326 --- /dev/null +++ b/src/network/networkd-network-bus.c @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "networkd-manager.h" +#include "networkd-network-bus.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" + /* sd-bus property getter: userdata points to a Set pointer holding hw_addr_data entries, which are appended to the reply as an array of strings formatted with HW_ADDR_TO_STR(). Returns the result of closing the container, or the first negative errno. */ +static int property_get_hw_addrs( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const struct hw_addr_data *p; + Set *s; + int r; + + assert(bus); + assert(reply); + assert(userdata); + + s = *(Set **) userdata; + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + SET_FOREACH(p, s) { + r = sd_bus_message_append(reply, "s", HW_ADDR_TO_STR(p)); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + /* Property table of the network bus objects. Every property is declared constant and read straight from the Network structure; only MatchMAC needs a custom getter. */ +static const sd_bus_vtable network_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Description", "s", NULL, offsetof(Network, description), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Network, filename), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MatchMAC", "as", property_get_hw_addrs, offsetof(Network, match.hw_addr), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MatchPath", "as", NULL, offsetof(Network, match.path), SD_BUS_VTABLE_PROPERTY_CONST), + 
SD_BUS_PROPERTY("MatchDriver", "as", NULL, offsetof(Network, match.driver), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MatchType", "as", NULL, offsetof(Network, match.iftype), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MatchName", "as", NULL, offsetof(Network, match.ifname), SD_BUS_VTABLE_PROPERTY_CONST), + + SD_BUS_VTABLE_END +}; + +static char *network_bus_path(Network *network) { + _cleanup_free_ char *name = NULL, *networkname= NULL; + char *d, *path; + int r; + + assert(network); + assert(network->filename); + + name = strdup(network->filename); + if (!name) + return NULL; + + r = path_extract_filename(name, &networkname); + if (r < 0) + return NULL; + + d = strrchr(networkname, '.'); + if (!d) + return NULL; + + assert(streq(d, ".network")); + + *d = '\0'; + + r = sd_bus_path_encode("/org/freedesktop/network1/network", networkname, &path); + if (r < 0) + return NULL; + + return path; +} + +int network_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = ASSERT_PTR(userdata); + Network *network; + int r; + + assert(bus); + assert(path); + assert(nodes); + + ORDERED_HASHMAP_FOREACH(network, m->networks) { + char *p; + + p = network_bus_path(network); + if (!p) + return -ENOMEM; + + r = strv_consume(&l, p); + if (r < 0) + return r; + } + + *nodes = TAKE_PTR(l); + + return 1; +} + +int network_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Network *network; + _cleanup_free_ char *name = NULL; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = sd_bus_path_decode(path, "/org/freedesktop/network1/network", &name); + if (r < 0) + return 0; + + r = network_get_by_name(m, name, &network); + if (r < 0) + return 0; + + *found = network; + + return 1; +} + +const BusObjectImplementation network_object = { + 
"/org/freedesktop/network1/network", + "org.freedesktop.network1.Network", + .fallback_vtables = BUS_FALLBACK_VTABLES({network_vtable, network_object_find}), + .node_enumerator = network_node_enumerator, +}; diff --git a/src/network/networkd-network-bus.h b/src/network/networkd-network-bus.h new file mode 100644 index 0000000..68ed951 --- /dev/null +++ b/src/network/networkd-network-bus.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" + +typedef struct Link Link; + +extern const BusObjectImplementation network_object; + +int network_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error); +int network_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error); diff --git a/src/network/networkd-network-gperf.gperf b/src/network/networkd-network-gperf.gperf new file mode 100644 index 0000000..a6593a0 --- /dev/null +++ b/src/network/networkd-network-gperf.gperf @@ -0,0 +1,627 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "in-addr-prefix-util.h" +#include "netem.h" +#include "net-condition.h" +#include "networkd-address-generation.h" +#include "networkd-address-label.h" +#include "networkd-address.h" +#include "networkd-bridge-fdb.h" +#include "networkd-bridge-mdb.h" +#include "networkd-can.h" +#include "networkd-dhcp-common.h" +#include "networkd-dhcp-prefix-delegation.h" +#include "networkd-dhcp-server-static-lease.h" +#include "networkd-dhcp-server.h" +#include "networkd-dhcp4.h" +#include "networkd-dhcp6.h" +#include "networkd-ipv4ll.h" +#include "networkd-ipv6-proxy-ndp.h" +#include "networkd-ipv6ll.h" +#include "networkd-lldp-tx.h" +#include "networkd-ndisc.h" +#include "networkd-netlabel.h" +#include "networkd-network.h" +#include 
"networkd-neighbor.h" +#include "networkd-nexthop.h" +#include "networkd-radv.h" +#include "networkd-route.h" +#include "networkd-routing-policy-rule.h" +#include "networkd-sriov.h" +#include "qdisc.h" +#include "tclass.h" +#include "vlan-util.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name network_network_gperf_hash +%define lookup-function-name network_network_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Match.MACAddress, config_parse_hw_addrs, 0, offsetof(Network, match.hw_addr) +Match.PermanentMACAddress, config_parse_hw_addrs, 0, offsetof(Network, match.permanent_hw_addr) +Match.Path, config_parse_match_strv, 0, offsetof(Network, match.path) +Match.Driver, config_parse_match_strv, 0, offsetof(Network, match.driver) +Match.Type, config_parse_match_strv, 0, offsetof(Network, match.iftype) +Match.Kind, config_parse_match_strv, 0, offsetof(Network, match.kind) +Match.WLANInterfaceType, config_parse_match_strv, 0, offsetof(Network, match.wlan_iftype) +Match.SSID, config_parse_match_strv, 0, offsetof(Network, match.ssid) +Match.BSSID, config_parse_ether_addrs, 0, offsetof(Network, match.bssid) +Match.Name, config_parse_match_ifnames, IFNAME_VALID_ALTERNATIVE, offsetof(Network, match.ifname) +Match.Property, config_parse_match_property, 0, offsetof(Network, match.property) +Match.Host, config_parse_net_condition, CONDITION_HOST, offsetof(Network, conditions) +Match.Virtualization, config_parse_net_condition, CONDITION_VIRTUALIZATION, offsetof(Network, conditions) +Match.KernelCommandLine, config_parse_net_condition, CONDITION_KERNEL_COMMAND_LINE, offsetof(Network, conditions) +Match.KernelVersion, config_parse_net_condition, CONDITION_KERNEL_VERSION, offsetof(Network, conditions) +Match.Credential, config_parse_net_condition, CONDITION_CREDENTIAL, offsetof(Network, conditions) +Match.Architecture, config_parse_net_condition, CONDITION_ARCHITECTURE, 
offsetof(Network, conditions) +Match.Firmware, config_parse_net_condition, CONDITION_FIRMWARE, offsetof(Network, conditions) +Link.MACAddress, config_parse_hw_addr, 0, offsetof(Network, hw_addr) +Link.MTUBytes, config_parse_mtu, AF_UNSPEC, offsetof(Network, mtu) +Link.Group, config_parse_link_group, 0, 0 +Link.ARP, config_parse_tristate, 0, offsetof(Network, arp) +Link.Multicast, config_parse_tristate, 0, offsetof(Network, multicast) +Link.AllMulticast, config_parse_tristate, 0, offsetof(Network, allmulticast) +Link.Promiscuous, config_parse_tristate, 0, offsetof(Network, promiscuous) +Link.Unmanaged, config_parse_bool, 0, offsetof(Network, unmanaged) +Link.ActivationPolicy, config_parse_activation_policy, 0, offsetof(Network, activation_policy) +Link.RequiredForOnline, config_parse_required_for_online, 0, 0 +Link.RequiredFamilyForOnline, config_parse_required_family_for_online, 0, offsetof(Network, required_family_for_online) +SR-IOV.VirtualFunction, config_parse_sr_iov_uint32, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.VLANId, config_parse_sr_iov_uint32, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.QualityOfService, config_parse_sr_iov_uint32, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.VLANProtocol, config_parse_sr_iov_vlan_proto, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.MACSpoofCheck, config_parse_sr_iov_boolean, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.QueryReceiveSideScaling, config_parse_sr_iov_boolean, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.Trust, config_parse_sr_iov_boolean, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.LinkState, config_parse_sr_iov_link_state, 0, offsetof(Network, sr_iov_by_section) +SR-IOV.MACAddress, config_parse_sr_iov_mac, 0, offsetof(Network, sr_iov_by_section) +Network.Description, config_parse_string, 0, offsetof(Network, description) +Network.KeepMaster, config_parse_bool, 0, offsetof(Network, keep_master) +Network.BatmanAdvanced, config_parse_ifname, 0, offsetof(Network, batadv_name) 
+Network.Bond, config_parse_ifname, 0, offsetof(Network, bond_name) +Network.Bridge, config_parse_ifname, 0, offsetof(Network, bridge_name) +Network.VRF, config_parse_ifname, 0, offsetof(Network, vrf_name) +Network.IPoIB, config_parse_stacked_netdev, NETDEV_KIND_IPOIB, offsetof(Network, stacked_netdev_names) +Network.IPVLAN, config_parse_stacked_netdev, NETDEV_KIND_IPVLAN, offsetof(Network, stacked_netdev_names) +Network.IPVTAP, config_parse_stacked_netdev, NETDEV_KIND_IPVTAP, offsetof(Network, stacked_netdev_names) +Network.L2TP, config_parse_warn_compat, DISABLED_LEGACY, 0 +Network.MACsec, config_parse_stacked_netdev, NETDEV_KIND_MACSEC, offsetof(Network, stacked_netdev_names) +Network.MACVLAN, config_parse_stacked_netdev, NETDEV_KIND_MACVLAN, offsetof(Network, stacked_netdev_names) +Network.MACVTAP, config_parse_stacked_netdev, NETDEV_KIND_MACVTAP, offsetof(Network, stacked_netdev_names) +Network.Tunnel, config_parse_stacked_netdev, _NETDEV_KIND_TUNNEL, offsetof(Network, stacked_netdev_names) +Network.VLAN, config_parse_stacked_netdev, NETDEV_KIND_VLAN, offsetof(Network, stacked_netdev_names) +Network.VXLAN, config_parse_stacked_netdev, NETDEV_KIND_VXLAN, offsetof(Network, stacked_netdev_names) +Network.Xfrm, config_parse_stacked_netdev, NETDEV_KIND_XFRM, offsetof(Network, stacked_netdev_names) +Network.DHCP, config_parse_dhcp, 0, offsetof(Network, dhcp) +Network.DHCPServer, config_parse_bool, 0, offsetof(Network, dhcp_server) +Network.LinkLocalAddressing, config_parse_link_local_address_family, 0, offsetof(Network, link_local) +Network.IPv6LinkLocalAddressGenerationMode, config_parse_ipv6_link_local_address_gen_mode, 0, offsetof(Network, ipv6ll_address_gen_mode) +Network.IPv6StableSecretAddress, config_parse_in_addr_non_null, AF_INET6, offsetof(Network, ipv6ll_stable_secret) +Network.IPv4LLStartAddress, config_parse_ipv4ll_address, 0, offsetof(Network, ipv4ll_start_address) +Network.IPv4LLRoute, config_parse_bool, 0, offsetof(Network, ipv4ll_route) 
+Network.DefaultRouteOnDevice, config_parse_bool, 0, offsetof(Network, default_route_on_device) +Network.LLDP, config_parse_lldp_mode, 0, offsetof(Network, lldp_mode) +Network.EmitLLDP, config_parse_lldp_multicast_mode, 0, offsetof(Network, lldp_multicast_mode) +Network.Address, config_parse_address, 0, 0 +Network.Gateway, config_parse_gateway, 0, 0 +Network.Domains, config_parse_domains, 0, 0 +Network.DNS, config_parse_dns, 0, 0 +Network.DNSDefaultRoute, config_parse_tristate, 0, offsetof(Network, dns_default_route) +Network.LLMNR, config_parse_resolve_support, 0, offsetof(Network, llmnr) +Network.MulticastDNS, config_parse_resolve_support, 0, offsetof(Network, mdns) +Network.DNSOverTLS, config_parse_dns_over_tls_mode, 0, offsetof(Network, dns_over_tls_mode) +Network.DNSSEC, config_parse_dnssec_mode, 0, offsetof(Network, dnssec_mode) +Network.DNSSECNegativeTrustAnchors, config_parse_dnssec_negative_trust_anchors, 0, offsetof(Network, dnssec_negative_trust_anchors) +Network.NTP, config_parse_ntp, 0, offsetof(Network, ntp) +Network.IPForward, config_parse_address_family_with_kernel, 0, offsetof(Network, ip_forward) +Network.IPMasquerade, config_parse_ip_masquerade, 0, offsetof(Network, ip_masquerade) +Network.IPv6PrivacyExtensions, config_parse_ipv6_privacy_extensions, 0, offsetof(Network, ipv6_privacy_extensions) +Network.IPv6AcceptRA, config_parse_tristate, 0, offsetof(Network, ipv6_accept_ra) +Network.IPv6AcceptRouterAdvertisements, config_parse_tristate, 0, offsetof(Network, ipv6_accept_ra) +Network.IPv6DuplicateAddressDetection, config_parse_int, 0, offsetof(Network, ipv6_dad_transmits) +Network.IPv6HopLimit, config_parse_uint8, 0, offsetof(Network, ipv6_hop_limit) +Network.IPv6ProxyNDP, config_parse_tristate, 0, offsetof(Network, ipv6_proxy_ndp) +Network.IPv6MTUBytes, config_parse_mtu, AF_INET6, offsetof(Network, ipv6_mtu) +Network.IPv4AcceptLocal, config_parse_tristate, 0, offsetof(Network, ipv4_accept_local) +Network.IPv4RouteLocalnet, config_parse_tristate, 
0, offsetof(Network, ipv4_route_localnet) +Network.ActiveSlave, config_parse_bool, 0, offsetof(Network, active_slave) +Network.PrimarySlave, config_parse_bool, 0, offsetof(Network, primary_slave) +Network.IPv4ProxyARP, config_parse_tristate, 0, offsetof(Network, proxy_arp) +Network.ProxyARP, config_parse_tristate, 0, offsetof(Network, proxy_arp) +Network.IPv6ProxyNDPAddress, config_parse_ipv6_proxy_ndp_address, 0, 0 +Network.IPv4ReversePathFilter, config_parse_ip_reverse_path_filter, 0, offsetof(Network, ipv4_rp_filter) +Network.BindCarrier, config_parse_strv, 0, offsetof(Network, bind_carrier) +Network.ConfigureWithoutCarrier, config_parse_bool, 0, offsetof(Network, configure_without_carrier) +Network.IgnoreCarrierLoss, config_parse_ignore_carrier_loss, 0, 0 +Network.KeepConfiguration, config_parse_keep_configuration, 0, offsetof(Network, keep_configuration) +Network.IPv6SendRA, config_parse_router_prefix_delegation, 0, offsetof(Network, router_prefix_delegation) +Network.DHCPPrefixDelegation, config_parse_tristate, 0, offsetof(Network, dhcp_pd) +Address.Address, config_parse_address, 0, 0 +Address.Peer, config_parse_address, 0, 0 +Address.Broadcast, config_parse_broadcast, 0, 0 +Address.Label, config_parse_label, 0, 0 +Address.PreferredLifetime, config_parse_lifetime, 0, 0 +Address.HomeAddress, config_parse_address_flags, IFA_F_HOMEADDRESS, 0 +Address.ManageTemporaryAddress, config_parse_address_flags, IFA_F_MANAGETEMPADDR, 0 +Address.PrefixRoute, config_parse_address_flags, IFA_F_NOPREFIXROUTE, 0 /* deprecated */ +Address.AddPrefixRoute, config_parse_address_flags, IFA_F_NOPREFIXROUTE, 0 +Address.AutoJoin, config_parse_address_flags, IFA_F_MCAUTOJOIN, 0 +Address.DuplicateAddressDetection, config_parse_duplicate_address_detection, 0, 0 +Address.Scope, config_parse_address_scope, 0, 0 +Address.RouteMetric, config_parse_address_route_metric, 0, 0 +Address.NetLabel, config_parse_address_netlabel, 0, 0 +Address.NFTSet, config_parse_address_ip_nft_set, 
NFT_SET_PARSE_NETWORK, 0 +IPv6AddressLabel.Prefix, config_parse_address_label_prefix, 0, 0 +IPv6AddressLabel.Label, config_parse_address_label, 0, 0 +Neighbor.Address, config_parse_neighbor_address, 0, 0 +Neighbor.LinkLayerAddress, config_parse_neighbor_lladdr, 0, 0 +Neighbor.MACAddress, config_parse_neighbor_lladdr, 0, 0 /* deprecated */ +RoutingPolicyRule.TypeOfService, config_parse_routing_policy_rule_tos, 0, 0 +RoutingPolicyRule.Priority, config_parse_routing_policy_rule_priority, 0, 0 +RoutingPolicyRule.Table, config_parse_routing_policy_rule_table, 0, 0 +RoutingPolicyRule.FirewallMark, config_parse_routing_policy_rule_fwmark_mask, 0, 0 +RoutingPolicyRule.From, config_parse_routing_policy_rule_prefix, 0, 0 +RoutingPolicyRule.To, config_parse_routing_policy_rule_prefix, 0, 0 +RoutingPolicyRule.IncomingInterface, config_parse_routing_policy_rule_device, 0, 0 +RoutingPolicyRule.OutgoingInterface, config_parse_routing_policy_rule_device, 0, 0 +RoutingPolicyRule.IPProtocol, config_parse_routing_policy_rule_ip_protocol, 0, 0 +RoutingPolicyRule.SourcePort, config_parse_routing_policy_rule_port_range, 0, 0 +RoutingPolicyRule.DestinationPort, config_parse_routing_policy_rule_port_range, 0, 0 +RoutingPolicyRule.InvertRule, config_parse_routing_policy_rule_invert, 0, 0 +RoutingPolicyRule.Family, config_parse_routing_policy_rule_family, 0, 0 +RoutingPolicyRule.User, config_parse_routing_policy_rule_uid_range, 0, 0 +RoutingPolicyRule.SuppressInterfaceGroup, config_parse_routing_policy_rule_suppress_ifgroup, 0, 0 +RoutingPolicyRule.SuppressPrefixLength, config_parse_routing_policy_rule_suppress_prefixlen, 0, 0 +RoutingPolicyRule.Type, config_parse_routing_policy_rule_type, 0, 0 +Route.Gateway, config_parse_gateway, 0, 0 +Route.Destination, config_parse_destination, 0, 0 +Route.Source, config_parse_destination, 0, 0 +Route.Metric, config_parse_route_priority, 0, 0 +Route.Scope, config_parse_route_scope, 0, 0 +Route.PreferredSource, config_parse_preferred_src, 0, 0 
+Route.Table, config_parse_route_table, 0, 0 +Route.MTUBytes, config_parse_route_mtu, AF_UNSPEC, 0 +Route.GatewayOnLink, config_parse_route_boolean, 0, 0 +Route.GatewayOnlink, config_parse_route_boolean, 0, 0 +Route.IPv6Preference, config_parse_ipv6_route_preference, 0, 0 +Route.Protocol, config_parse_route_protocol, 0, 0 +Route.Type, config_parse_route_type, 0, 0 +Route.TCPRetransmissionTimeoutSec, config_parse_route_tcp_rto, 0, 0 +Route.HopLimit, config_parse_route_hop_limit, 0, 0 +Route.InitialCongestionWindow, config_parse_route_tcp_window, 0, 0 +Route.InitialAdvertisedReceiveWindow, config_parse_route_tcp_window, 0, 0 +Route.TCPAdvertisedMaximumSegmentSize, config_parse_tcp_advmss, 0, 0 +Route.TCPCongestionControlAlgorithm, config_parse_tcp_congestion, 0, 0 +Route.QuickAck, config_parse_route_boolean, 0, 0 +Route.FastOpenNoCookie, config_parse_route_boolean, 0, 0 +Route.TTLPropagate, config_parse_route_boolean, 0, 0 +Route.MultiPathRoute, config_parse_multipath_route, 0, 0 +Route.NextHop, config_parse_route_nexthop, 0, 0 +NextHop.Id, config_parse_nexthop_id, 0, 0 +NextHop.Gateway, config_parse_nexthop_gateway, 0, 0 +NextHop.Family, config_parse_nexthop_family, 0, 0 +NextHop.OnLink, config_parse_nexthop_onlink, 0, 0 +NextHop.Blackhole, config_parse_nexthop_blackhole, 0, 0 +NextHop.Group, config_parse_nexthop_group, 0, 0 +DHCPv4.RequestAddress, config_parse_in_addr_non_null, AF_INET, offsetof(Network, dhcp_request_address) +DHCPv4.ClientIdentifier, config_parse_dhcp_client_identifier, 0, offsetof(Network, dhcp_client_identifier) +DHCPv4.UseDNS, config_parse_dhcp_use_dns, AF_INET, 0 +DHCPv4.RoutesToDNS, config_parse_bool, 0, offsetof(Network, dhcp_routes_to_dns) +DHCPv4.UseNTP, config_parse_dhcp_use_ntp, AF_INET, 0 +DHCPv4.RoutesToNTP, config_parse_bool, 0, offsetof(Network, dhcp_routes_to_ntp) +DHCPv4.UseSIP, config_parse_bool, 0, offsetof(Network, dhcp_use_sip) +DHCPv4.UseCaptivePortal, config_parse_bool, 0, offsetof(Network, dhcp_use_captive_portal) 
+DHCPv4.UseMTU, config_parse_bool, 0, offsetof(Network, dhcp_use_mtu) +DHCPv4.UseHostname, config_parse_bool, 0, offsetof(Network, dhcp_use_hostname) +DHCPv4.UseDomains, config_parse_dhcp_use_domains, AF_INET, 0 +DHCPv4.UseRoutes, config_parse_bool, 0, offsetof(Network, dhcp_use_routes) +DHCPv4.UseGateway, config_parse_tristate, 0, offsetof(Network, dhcp_use_gateway) +DHCPv4.QuickAck, config_parse_bool, 0, offsetof(Network, dhcp_quickack) +DHCPv4.RequestOptions, config_parse_dhcp_request_options, AF_INET, 0 +DHCPv4.Anonymize, config_parse_bool, 0, offsetof(Network, dhcp_anonymize) +DHCPv4.SendHostname, config_parse_dhcp_send_hostname, AF_INET, 0 +DHCPv4.Hostname, config_parse_hostname, 0, offsetof(Network, dhcp_hostname) +DHCPv4.Label, config_parse_dhcp_label, 0, offsetof(Network, dhcp_label) +DHCPv4.RequestBroadcast, config_parse_tristate, 0, offsetof(Network, dhcp_broadcast) +DHCPv4.VendorClassIdentifier, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Network, dhcp_vendor_class_identifier) +DHCPv4.MUDURL, config_parse_mud_url, 0, offsetof(Network, dhcp_mudurl) +DHCPv4.MaxAttempts, config_parse_dhcp_max_attempts, 0, 0 +DHCPv4.UserClass, config_parse_dhcp_user_or_vendor_class, AF_INET, offsetof(Network, dhcp_user_class) +DHCPv4.IAID, config_parse_iaid, AF_INET, 0 +DHCPv4.DUIDType, config_parse_network_duid_type, 0, 0 +DHCPv4.DUIDRawData, config_parse_network_duid_rawdata, 0, 0 +DHCPv4.RouteMetric, config_parse_dhcp_route_metric, AF_INET, 0 +DHCPv4.RouteTable, config_parse_dhcp_or_ra_route_table, AF_INET, 0 +DHCPv4.UseTimezone, config_parse_bool, 0, offsetof(Network, dhcp_use_timezone) +DHCPv4.ListenPort, config_parse_uint16, 0, offsetof(Network, dhcp_client_port) +DHCPv4.SendRelease, config_parse_bool, 0, offsetof(Network, dhcp_send_release) +DHCPv4.SendDecline, config_parse_bool, 0, offsetof(Network, dhcp_send_decline) +DHCPv4.DenyList, config_parse_in_addr_prefixes, AF_INET, offsetof(Network, dhcp_deny_listed_ip) +DHCPv4.AllowList, 
config_parse_in_addr_prefixes, AF_INET, offsetof(Network, dhcp_allow_listed_ip) +DHCPv4.IPServiceType, config_parse_dhcp_ip_service_type, 0, offsetof(Network, dhcp_ip_service_type) +DHCPv4.SocketPriority, config_parse_dhcp_socket_priority, 0, 0 +DHCPv4.SendOption, config_parse_dhcp_send_option, AF_INET, offsetof(Network, dhcp_client_send_options) +DHCPv4.SendVendorOption, config_parse_dhcp_send_option, 0, offsetof(Network, dhcp_client_send_vendor_options) +DHCPv4.RouteMTUBytes, config_parse_mtu, AF_INET, offsetof(Network, dhcp_route_mtu) +DHCPv4.InitialCongestionWindow, config_parse_tcp_window, 0, offsetof(Network, dhcp_initial_congestion_window) +DHCPv4.InitialAdvertisedReceiveWindow, config_parse_tcp_window, 0, offsetof(Network, dhcp_advertised_receive_window) +DHCPv4.FallbackLeaseLifetimeSec, config_parse_dhcp_fallback_lease_lifetime, 0, 0 +DHCPv4.Use6RD, config_parse_bool, 0, offsetof(Network, dhcp_use_6rd) +DHCPv4.IPv6OnlyMode, config_parse_tristate, 0, offsetof(Network, dhcp_ipv6_only_mode) +DHCPv4.NetLabel, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Network, dhcp_netlabel) +DHCPv4.NFTSet, config_parse_nft_set, NFT_SET_PARSE_NETWORK, offsetof(Network, dhcp_nft_set_context) +DHCPv4.RapidCommit, config_parse_tristate, 0, offsetof(Network, dhcp_use_rapid_commit) +DHCPv6.UseAddress, config_parse_bool, 0, offsetof(Network, dhcp6_use_address) +DHCPv6.UseDelegatedPrefix, config_parse_bool, 0, offsetof(Network, dhcp6_use_pd_prefix) +DHCPv6.UseDNS, config_parse_dhcp_use_dns, AF_INET6, 0 +DHCPv6.UseHostname, config_parse_bool, 0, offsetof(Network, dhcp6_use_hostname) +DHCPv6.UseDomains, config_parse_dhcp_use_domains, AF_INET6, 0 +DHCPv6.UseNTP, config_parse_dhcp_use_ntp, AF_INET6, 0 +DHCPv6.UseCaptivePortal, config_parse_bool, 0, offsetof(Network, dhcp6_use_captive_portal) +DHCPv6.MUDURL, config_parse_mud_url, 0, offsetof(Network, dhcp6_mudurl) +DHCPv6.SendHostname, config_parse_dhcp_send_hostname, AF_INET6, 0 +DHCPv6.Hostname, config_parse_hostname, 0, 
offsetof(Network, dhcp6_hostname) +DHCPv6.RequestOptions, config_parse_dhcp_request_options, AF_INET6, 0 +DHCPv6.UserClass, config_parse_dhcp_user_or_vendor_class, AF_INET6, offsetof(Network, dhcp6_user_class) +DHCPv6.VendorClass, config_parse_dhcp_user_or_vendor_class, AF_INET6, offsetof(Network, dhcp6_vendor_class) +DHCPv6.SendVendorOption, config_parse_dhcp_send_option, AF_INET6, offsetof(Network, dhcp6_client_send_vendor_options) +DHCPv6.PrefixDelegationHint, config_parse_dhcp6_pd_prefix_hint, 0, 0 +DHCPv6.WithoutRA, config_parse_dhcp6_client_start_mode, 0, offsetof(Network, dhcp6_client_start_mode) +DHCPv6.SendOption, config_parse_dhcp_send_option, AF_INET6, offsetof(Network, dhcp6_client_send_options) +DHCPv6.IAID, config_parse_iaid, AF_INET6, 0 +DHCPv6.DUIDType, config_parse_duid_type, 0, offsetof(Network, dhcp6_duid) +DHCPv6.DUIDRawData, config_parse_duid_rawdata, 0, offsetof(Network, dhcp6_duid) +DHCPv6.RapidCommit, config_parse_bool, 0, offsetof(Network, dhcp6_use_rapid_commit) +DHCPv6.NetLabel, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Network, dhcp6_netlabel) +DHCPv6.SendRelease, config_parse_bool, 0, offsetof(Network, dhcp6_send_release) +DHCPv6.NFTSet, config_parse_nft_set, NFT_SET_PARSE_NETWORK, offsetof(Network, dhcp6_nft_set_context) +IPv6AcceptRA.UseGateway, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_gateway) +IPv6AcceptRA.UseRoutePrefix, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_route_prefix) +IPv6AcceptRA.UseAutonomousPrefix, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_autonomous_prefix) +IPv6AcceptRA.UseOnLinkPrefix, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_onlink_prefix) +IPv6AcceptRA.UsePREF64, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_pref64) +IPv6AcceptRA.UseDNS, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_dns) +IPv6AcceptRA.UseDomains, config_parse_ipv6_accept_ra_use_domains, 0, offsetof(Network, ipv6_accept_ra_use_domains) 
+IPv6AcceptRA.UseMTU, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_mtu) +IPv6AcceptRA.UseHopLimit, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_hop_limit) +IPv6AcceptRA.UseICMP6RateLimit, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_icmp6_ratelimit) +IPv6AcceptRA.DHCPv6Client, config_parse_ipv6_accept_ra_start_dhcp6_client, 0, offsetof(Network, ipv6_accept_ra_start_dhcp6_client) +IPv6AcceptRA.RouteTable, config_parse_dhcp_or_ra_route_table, AF_INET6, 0 +IPv6AcceptRA.RouteMetric, config_parse_ipv6_accept_ra_route_metric, 0, 0 +IPv6AcceptRA.QuickAck, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_quickack) +IPv6AcceptRA.UseCaptivePortal, config_parse_bool, 0, offsetof(Network, ipv6_accept_ra_use_captive_portal) +IPv6AcceptRA.RouterAllowList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_allow_listed_router) +IPv6AcceptRA.RouterDenyList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_deny_listed_router) +IPv6AcceptRA.PrefixAllowList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_allow_listed_prefix) +IPv6AcceptRA.PrefixDenyList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_deny_listed_prefix) +IPv6AcceptRA.RouteAllowList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_allow_listed_route_prefix) +IPv6AcceptRA.RouteDenyList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_deny_listed_route_prefix) +IPv6AcceptRA.Token, config_parse_address_generation_type, 0, offsetof(Network, ndisc_tokens) +IPv6AcceptRA.NetLabel, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Network, ndisc_netlabel) +IPv6AcceptRA.NFTSet, config_parse_nft_set, NFT_SET_PARSE_NETWORK, offsetof(Network, ndisc_nft_set_context) +DHCPServer.ServerAddress, config_parse_dhcp_server_address, 0, 0 +DHCPServer.UplinkInterface, config_parse_uplink, 0, 0 +DHCPServer.RelayTarget, config_parse_in_addr_non_null, AF_INET, offsetof(Network, 
dhcp_server_relay_target) +DHCPServer.RelayAgentCircuitId, config_parse_dhcp_server_relay_agent_suboption, 0, offsetof(Network, dhcp_server_relay_agent_circuit_id) +DHCPServer.RelayAgentRemoteId, config_parse_dhcp_server_relay_agent_suboption, 0, offsetof(Network, dhcp_server_relay_agent_remote_id) +DHCPServer.MaxLeaseTimeSec, config_parse_sec, 0, offsetof(Network, dhcp_server_max_lease_time_usec) +DHCPServer.DefaultLeaseTimeSec, config_parse_sec, 0, offsetof(Network, dhcp_server_default_lease_time_usec) +DHCPServer.IPv6OnlyPreferredSec, config_parse_dhcp_server_ipv6_only_preferred, 0, offsetof(Network, dhcp_server_ipv6_only_preferred_usec) +DHCPServer.EmitDNS, config_parse_bool, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_DNS].emit) +DHCPServer.DNS, config_parse_dhcp_server_emit, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_DNS]) +DHCPServer.EmitNTP, config_parse_bool, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_NTP].emit) +DHCPServer.NTP, config_parse_dhcp_server_emit, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_NTP]) +DHCPServer.EmitSIP, config_parse_bool, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_SIP].emit) +DHCPServer.SIP, config_parse_dhcp_server_emit, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_SIP]) +DHCPServer.EmitPOP3, config_parse_bool, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_POP3].emit) +DHCPServer.POP3, config_parse_dhcp_server_emit, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_POP3]) +DHCPServer.EmitSMTP, config_parse_bool, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_SMTP].emit) +DHCPServer.SMTP, config_parse_dhcp_server_emit, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_SMTP]) +DHCPServer.EmitLPR, config_parse_bool, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_LPR].emit) +DHCPServer.LPR, config_parse_dhcp_server_emit, 0, offsetof(Network, dhcp_server_emit[SD_DHCP_LEASE_LPR]) +DHCPServer.EmitRouter, config_parse_bool, 0, offsetof(Network, dhcp_server_emit_router) 
+DHCPServer.Router, config_parse_in_addr_non_null, AF_INET, offsetof(Network, dhcp_server_router) +DHCPServer.EmitTimezone, config_parse_bool, 0, offsetof(Network, dhcp_server_emit_timezone) +DHCPServer.Timezone, config_parse_timezone, 0, offsetof(Network, dhcp_server_timezone) +DHCPServer.PoolOffset, config_parse_uint32, 0, offsetof(Network, dhcp_server_pool_offset) +DHCPServer.PoolSize, config_parse_uint32, 0, offsetof(Network, dhcp_server_pool_size) +DHCPServer.SendVendorOption, config_parse_dhcp_send_option, 0, offsetof(Network, dhcp_server_send_vendor_options) +DHCPServer.SendOption, config_parse_dhcp_send_option, 0, offsetof(Network, dhcp_server_send_options) +DHCPServer.BindToInterface, config_parse_bool, 0, offsetof(Network, dhcp_server_bind_to_interface) +DHCPServer.BootServerAddress, config_parse_in_addr_non_null, AF_INET, offsetof(Network, dhcp_server_boot_server_address) +DHCPServer.BootServerName, config_parse_dns_name, 0, offsetof(Network, dhcp_server_boot_server_name) +DHCPServer.BootFilename, config_parse_string, CONFIG_PARSE_STRING_SAFE_AND_ASCII, offsetof(Network, dhcp_server_boot_filename) +DHCPServer.RapidCommit, config_parse_bool, 0, offsetof(Network, dhcp_server_rapid_commit) +DHCPServerStaticLease.Address, config_parse_dhcp_static_lease_address, 0, 0 +DHCPServerStaticLease.MACAddress, config_parse_dhcp_static_lease_hwaddr, 0, 0 +Bridge.Cost, config_parse_uint32, 0, offsetof(Network, cost) +Bridge.UseBPDU, config_parse_tristate, 0, offsetof(Network, use_bpdu) +Bridge.HairPin, config_parse_tristate, 0, offsetof(Network, hairpin) +Bridge.Isolated, config_parse_tristate, 0, offsetof(Network, isolated) +Bridge.FastLeave, config_parse_tristate, 0, offsetof(Network, fast_leave) +Bridge.AllowPortToBeRoot, config_parse_tristate, 0, offsetof(Network, allow_port_to_be_root) +Bridge.UnicastFlood, config_parse_tristate, 0, offsetof(Network, unicast_flood) +Bridge.MulticastFlood, config_parse_tristate, 0, offsetof(Network, multicast_flood) 
+Bridge.MulticastToUnicast, config_parse_tristate, 0, offsetof(Network, multicast_to_unicast) +Bridge.NeighborSuppression, config_parse_tristate, 0, offsetof(Network, neighbor_suppression) +Bridge.Learning, config_parse_tristate, 0, offsetof(Network, learning) +Bridge.ProxyARP, config_parse_tristate, 0, offsetof(Network, bridge_proxy_arp) +Bridge.ProxyARPWiFi, config_parse_tristate, 0, offsetof(Network, bridge_proxy_arp_wifi) +Bridge.Priority, config_parse_bridge_port_priority, 0, offsetof(Network, priority) +Bridge.MulticastRouter, config_parse_multicast_router, 0, offsetof(Network, multicast_router) +BridgeFDB.MACAddress, config_parse_fdb_hwaddr, 0, 0 +BridgeFDB.VLANId, config_parse_fdb_vlan_id, 0, 0 +BridgeFDB.Destination, config_parse_fdb_destination, 0, 0 +BridgeFDB.VNI, config_parse_fdb_vxlan_vni, 0, 0 +BridgeFDB.AssociatedWith, config_parse_fdb_ntf_flags, 0, 0 +BridgeFDB.OutgoingInterface, config_parse_fdb_interface, 0, 0 +BridgeMDB.MulticastGroupAddress, config_parse_mdb_group_address, 0, 0 +BridgeMDB.VLANId, config_parse_mdb_vlan_id, 0, 0 +BridgeVLAN.PVID, config_parse_brvlan_pvid, 0, 0 +BridgeVLAN.VLAN, config_parse_brvlan_vlan, 0, 0 +BridgeVLAN.EgressUntagged, config_parse_brvlan_untagged, 0, 0 +DHCPPrefixDelegation.UplinkInterface, config_parse_uplink, 0, 0 +DHCPPrefixDelegation.SubnetId, config_parse_dhcp_pd_subnet_id, 0, offsetof(Network, dhcp_pd_subnet_id) +DHCPPrefixDelegation.Announce, config_parse_bool, 0, offsetof(Network, dhcp_pd_announce) +DHCPPrefixDelegation.Assign, config_parse_bool, 0, offsetof(Network, dhcp_pd_assign) +DHCPPrefixDelegation.ManageTemporaryAddress, config_parse_bool, 0, offsetof(Network, dhcp_pd_manage_temporary_address) +DHCPPrefixDelegation.Token, config_parse_address_generation_type, 0, offsetof(Network, dhcp_pd_tokens) +DHCPPrefixDelegation.RouteMetric, config_parse_uint32, 0, offsetof(Network, dhcp_pd_route_metric) +DHCPPrefixDelegation.NetLabel, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Network, 
dhcp_pd_netlabel) +DHCPPrefixDelegation.NFTSet, config_parse_nft_set, NFT_SET_PARSE_NETWORK, offsetof(Network, dhcp_pd_nft_set_context) +IPv6SendRA.RouterLifetimeSec, config_parse_router_lifetime, 0, offsetof(Network, router_lifetime_usec) +IPv6SendRA.RetransmitSec, config_parse_router_retransmit, 0, offsetof(Network, router_retransmit_usec) +IPv6SendRA.Managed, config_parse_bool, 0, offsetof(Network, router_managed) +IPv6SendRA.OtherInformation, config_parse_bool, 0, offsetof(Network, router_other_information) +IPv6SendRA.RouterPreference, config_parse_router_preference, 0, 0 +IPv6SendRA.HopLimit, config_parse_uint8, 0, offsetof(Network, router_hop_limit) +IPv6SendRA.EmitDNS, config_parse_bool, 0, offsetof(Network, router_emit_dns) +IPv6SendRA.DNS, config_parse_radv_dns, 0, 0 +IPv6SendRA.EmitDomains, config_parse_bool, 0, offsetof(Network, router_emit_domains) +IPv6SendRA.Domains, config_parse_radv_search_domains, 0, 0 +IPv6SendRA.DNSLifetimeSec, config_parse_sec, 0, offsetof(Network, router_dns_lifetime_usec) +IPv6SendRA.UplinkInterface, config_parse_uplink, 0, 0 +IPv6SendRA.HomeAgent, config_parse_bool, 0, offsetof(Network, router_home_agent_information) +IPv6SendRA.HomeAgentLifetimeSec, config_parse_router_home_agent_lifetime, 0, offsetof(Network, home_agent_lifetime_usec) +IPv6SendRA.HomeAgentPreference, config_parse_uint16, 0, offsetof(Network, router_home_agent_preference) +IPv6Prefix.Prefix, config_parse_prefix, 0, 0 +IPv6Prefix.OnLink, config_parse_prefix_boolean, 0, 0 +IPv6Prefix.AddressAutoconfiguration, config_parse_prefix_boolean, 0, 0 +IPv6Prefix.ValidLifetimeSec, config_parse_prefix_lifetime, 0, 0 +IPv6Prefix.PreferredLifetimeSec, config_parse_prefix_lifetime, 0, 0 +IPv6Prefix.Assign, config_parse_prefix_boolean, 0, 0 +IPv6Prefix.RouteMetric, config_parse_prefix_metric, 0, 0 +IPv6Prefix.Token, config_parse_prefix_token, 0, 0 +IPv6RoutePrefix.Route, config_parse_route_prefix, 0, 0 +IPv6RoutePrefix.LifetimeSec, config_parse_route_prefix_lifetime, 0, 0 
+IPv6PREF64Prefix.Prefix, config_parse_pref64_prefix, 0, 0 +IPv6PREF64Prefix.LifetimeSec, config_parse_pref64_prefix_lifetime, 0, 0 +LLDP.MUDURL, config_parse_mud_url, 0, offsetof(Network, lldp_mudurl) +CAN.BitRate, config_parse_can_bitrate, 0, offsetof(Network, can_bitrate) +CAN.SamplePoint, config_parse_permille, 0, offsetof(Network, can_sample_point) +CAN.TimeQuantaNSec, config_parse_can_time_quanta, 0, offsetof(Network, can_time_quanta_ns) +CAN.PropagationSegment, config_parse_uint32, 0, offsetof(Network, can_propagation_segment) +CAN.PhaseBufferSegment1, config_parse_uint32, 0, offsetof(Network, can_phase_buffer_segment_1) +CAN.PhaseBufferSegment2, config_parse_uint32, 0, offsetof(Network, can_phase_buffer_segment_2) +CAN.SyncJumpWidth, config_parse_uint32, 0, offsetof(Network, can_sync_jump_width) +CAN.DataBitRate, config_parse_can_bitrate, 0, offsetof(Network, can_data_bitrate) +CAN.DataSamplePoint, config_parse_permille, 0, offsetof(Network, can_data_sample_point) +CAN.DataTimeQuantaNSec, config_parse_can_time_quanta, 0, offsetof(Network, can_data_time_quanta_ns) +CAN.DataPropagationSegment, config_parse_uint32, 0, offsetof(Network, can_data_propagation_segment) +CAN.DataPhaseBufferSegment1, config_parse_uint32, 0, offsetof(Network, can_data_phase_buffer_segment_1) +CAN.DataPhaseBufferSegment2, config_parse_uint32, 0, offsetof(Network, can_data_phase_buffer_segment_2) +CAN.DataSyncJumpWidth, config_parse_uint32, 0, offsetof(Network, can_data_sync_jump_width) +CAN.RestartSec, config_parse_can_restart_usec, 0, offsetof(Network, can_restart_us) +CAN.Loopback, config_parse_can_control_mode, CAN_CTRLMODE_LOOPBACK, 0 +CAN.ListenOnly, config_parse_can_control_mode, CAN_CTRLMODE_LISTENONLY, 0 +CAN.TripleSampling, config_parse_can_control_mode, CAN_CTRLMODE_3_SAMPLES, 0 +CAN.OneShot, config_parse_can_control_mode, CAN_CTRLMODE_ONE_SHOT, 0 +CAN.BusErrorReporting, config_parse_can_control_mode, CAN_CTRLMODE_BERR_REPORTING, 0 +CAN.FDMode, config_parse_can_control_mode, 
CAN_CTRLMODE_FD, 0 +CAN.PresumeACK, config_parse_can_control_mode, CAN_CTRLMODE_PRESUME_ACK, 0 +CAN.FDNonISO, config_parse_can_control_mode, CAN_CTRLMODE_FD_NON_ISO, 0 +CAN.ClassicDataLengthCode, config_parse_can_control_mode, CAN_CTRLMODE_CC_LEN8_DLC, 0 +CAN.Termination, config_parse_can_termination, 0, 0 +IPoIB.Mode, config_parse_ipoib_mode, 0, offsetof(Network, ipoib_mode) +IPoIB.IgnoreUserspaceMulticastGroups, config_parse_tristate, 0, offsetof(Network, ipoib_umcast) +QDisc.Parent, config_parse_qdisc_parent, _QDISC_KIND_INVALID, 0 +QDisc.Handle, config_parse_qdisc_handle, _QDISC_KIND_INVALID, 0 +BFIFO.Parent, config_parse_qdisc_parent, QDISC_KIND_BFIFO, 0 +BFIFO.Handle, config_parse_qdisc_handle, QDISC_KIND_BFIFO, 0 +BFIFO.LimitBytes, config_parse_bfifo_size, QDISC_KIND_BFIFO, 0 +CAKE.Parent, config_parse_qdisc_parent, QDISC_KIND_CAKE, 0 +CAKE.Handle, config_parse_qdisc_handle, QDISC_KIND_CAKE, 0 +CAKE.Bandwidth, config_parse_cake_bandwidth, QDISC_KIND_CAKE, 0 +CAKE.AutoRateIngress, config_parse_cake_tristate, QDISC_KIND_CAKE, 0 +CAKE.OverheadBytes, config_parse_cake_overhead, QDISC_KIND_CAKE, 0 +CAKE.MPUBytes, config_parse_cake_mpu, QDISC_KIND_CAKE, 0 +CAKE.CompensationMode, config_parse_cake_compensation_mode, QDISC_KIND_CAKE, 0 +CAKE.UseRawPacketSize, config_parse_cake_tristate, QDISC_KIND_CAKE, 0 +CAKE.FlowIsolationMode, config_parse_cake_flow_isolation_mode, QDISC_KIND_CAKE, 0 +CAKE.NAT, config_parse_cake_tristate, QDISC_KIND_CAKE, 0 +CAKE.PriorityQueueingPreset, config_parse_cake_priority_queueing_preset, QDISC_KIND_CAKE, 0 +CAKE.FirewallMark, config_parse_cake_fwmark, QDISC_KIND_CAKE, 0 +CAKE.Wash, config_parse_cake_tristate, QDISC_KIND_CAKE, 0 +CAKE.SplitGSO, config_parse_cake_tristate, QDISC_KIND_CAKE, 0 +CAKE.RTTSec, config_parse_cake_rtt, QDISC_KIND_CAKE, 0 +CAKE.AckFilter, config_parse_cake_ack_filter, QDISC_KIND_CAKE, 0 +ControlledDelay.Parent, config_parse_qdisc_parent, QDISC_KIND_CODEL, 0 +ControlledDelay.Handle, config_parse_qdisc_handle, 
QDISC_KIND_CODEL, 0 +ControlledDelay.PacketLimit, config_parse_controlled_delay_u32, QDISC_KIND_CODEL, 0 +ControlledDelay.TargetSec, config_parse_controlled_delay_usec, QDISC_KIND_CODEL, 0 +ControlledDelay.IntervalSec, config_parse_controlled_delay_usec, QDISC_KIND_CODEL, 0 +ControlledDelay.CEThresholdSec, config_parse_controlled_delay_usec, QDISC_KIND_CODEL, 0 +ControlledDelay.ECN, config_parse_controlled_delay_bool, QDISC_KIND_CODEL, 0 +DeficitRoundRobinScheduler.Parent, config_parse_qdisc_parent, QDISC_KIND_DRR, 0 +DeficitRoundRobinScheduler.Handle, config_parse_qdisc_handle, QDISC_KIND_DRR, 0 +DeficitRoundRobinSchedulerClass.Parent, config_parse_tclass_parent, TCLASS_KIND_DRR, 0 +DeficitRoundRobinSchedulerClass.ClassId, config_parse_tclass_classid, TCLASS_KIND_DRR, 0 +DeficitRoundRobinSchedulerClass.QuantumBytes, config_parse_drr_size, TCLASS_KIND_DRR, 0 +EnhancedTransmissionSelection.Parent, config_parse_qdisc_parent, QDISC_KIND_ETS, 0 +EnhancedTransmissionSelection.Handle, config_parse_qdisc_handle, QDISC_KIND_ETS, 0 +EnhancedTransmissionSelection.Bands, config_parse_ets_u8, QDISC_KIND_ETS, 0 +EnhancedTransmissionSelection.StrictBands, config_parse_ets_u8, QDISC_KIND_ETS, 0 +EnhancedTransmissionSelection.QuantumBytes, config_parse_ets_quanta, QDISC_KIND_ETS, 0 +EnhancedTransmissionSelection.PriorityMap, config_parse_ets_prio, QDISC_KIND_ETS, 0 +PFIFO.Parent, config_parse_qdisc_parent, QDISC_KIND_PFIFO, 0 +PFIFO.Handle, config_parse_qdisc_handle, QDISC_KIND_PFIFO, 0 +PFIFO.PacketLimit, config_parse_pfifo_size, QDISC_KIND_PFIFO, 0 +PFIFOFast.Parent, config_parse_qdisc_parent, QDISC_KIND_PFIFO_FAST, 0 +PFIFOFast.Handle, config_parse_qdisc_handle, QDISC_KIND_PFIFO_FAST, 0 +PFIFOHeadDrop.Parent, config_parse_qdisc_parent, QDISC_KIND_PFIFO_HEAD_DROP, 0 +PFIFOHeadDrop.Handle, config_parse_qdisc_handle, QDISC_KIND_PFIFO_HEAD_DROP, 0 +PFIFOHeadDrop.PacketLimit, config_parse_pfifo_size, QDISC_KIND_PFIFO_HEAD_DROP, 0 +QuickFairQueueing.Parent, config_parse_qdisc_parent, 
QDISC_KIND_QFQ, 0 +QuickFairQueueing.Handle, config_parse_qdisc_handle, QDISC_KIND_QFQ, 0 +QuickFairQueueingClass.Parent, config_parse_tclass_parent, TCLASS_KIND_QFQ, 0 +QuickFairQueueingClass.ClassId, config_parse_tclass_classid, TCLASS_KIND_QFQ, 0 +QuickFairQueueingClass.Weight, config_parse_quick_fair_queueing_weight, TCLASS_KIND_QFQ, 0 +QuickFairQueueingClass.MaxPacketBytes, config_parse_quick_fair_queueing_max_packet, TCLASS_KIND_QFQ, 0 +FairQueueing.Parent, config_parse_qdisc_parent, QDISC_KIND_FQ, 0 +FairQueueing.Handle, config_parse_qdisc_handle, QDISC_KIND_FQ, 0 +FairQueueing.PacketLimit, config_parse_fair_queueing_u32, QDISC_KIND_FQ, 0 +FairQueueing.FlowLimit, config_parse_fair_queueing_u32, QDISC_KIND_FQ, 0 +FairQueueing.QuantumBytes, config_parse_fair_queueing_size, QDISC_KIND_FQ, 0 +FairQueueing.InitialQuantumBytes, config_parse_fair_queueing_size, QDISC_KIND_FQ, 0 +FairQueueing.MaximumRate, config_parse_fair_queueing_max_rate, QDISC_KIND_FQ, 0 +FairQueueing.Buckets, config_parse_fair_queueing_u32, QDISC_KIND_FQ, 0 +FairQueueing.OrphanMask, config_parse_fair_queueing_u32, QDISC_KIND_FQ, 0 +FairQueueing.Pacing, config_parse_fair_queueing_bool, QDISC_KIND_FQ, 0 +FairQueueing.CEThresholdSec, config_parse_fair_queueing_usec, QDISC_KIND_FQ, 0 +FairQueueingControlledDelay.Parent, config_parse_qdisc_parent, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.Handle, config_parse_qdisc_handle, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.PacketLimit, config_parse_fair_queueing_controlled_delay_u32, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.MemoryLimitBytes, config_parse_fair_queueing_controlled_delay_size, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.Flows, config_parse_fair_queueing_controlled_delay_u32, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.QuantumBytes, config_parse_fair_queueing_controlled_delay_size, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.TargetSec, config_parse_fair_queueing_controlled_delay_usec, 
QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.IntervalSec, config_parse_fair_queueing_controlled_delay_usec, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.CEThresholdSec, config_parse_fair_queueing_controlled_delay_usec, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.ECN, config_parse_fair_queueing_controlled_delay_bool, QDISC_KIND_FQ_CODEL, 0 +FlowQueuePIE.Parent, config_parse_qdisc_parent, QDISC_KIND_FQ_PIE, 0 +FlowQueuePIE.Handle, config_parse_qdisc_handle, QDISC_KIND_FQ_PIE, 0 +FlowQueuePIE.PacketLimit, config_parse_fq_pie_packet_limit, QDISC_KIND_FQ_PIE, 0 +GenericRandomEarlyDetection.Parent, config_parse_qdisc_parent, QDISC_KIND_GRED, 0 +GenericRandomEarlyDetection.Handle, config_parse_qdisc_handle, QDISC_KIND_GRED, 0 +GenericRandomEarlyDetection.VirtualQueues, config_parse_generic_random_early_detection_u32, QDISC_KIND_GRED, 0 +GenericRandomEarlyDetection.DefaultVirtualQueue, config_parse_generic_random_early_detection_u32, QDISC_KIND_GRED, 0 +GenericRandomEarlyDetection.GenericRIO, config_parse_generic_random_early_detection_bool, QDISC_KIND_GRED, 0 +HeavyHitterFilter.Parent, config_parse_qdisc_parent, QDISC_KIND_HHF, 0 +HeavyHitterFilter.Handle, config_parse_qdisc_handle, QDISC_KIND_HHF, 0 +HeavyHitterFilter.PacketLimit, config_parse_heavy_hitter_filter_packet_limit, QDISC_KIND_HHF, 0 +HierarchyTokenBucket.Parent, config_parse_qdisc_parent, QDISC_KIND_HTB, 0 +HierarchyTokenBucket.Handle, config_parse_qdisc_handle, QDISC_KIND_HTB, 0 +HierarchyTokenBucket.DefaultClass, config_parse_hierarchy_token_bucket_default_class, QDISC_KIND_HTB, 0 +HierarchyTokenBucket.RateToQuantum, config_parse_hierarchy_token_bucket_u32, QDISC_KIND_HTB, 0 +HierarchyTokenBucketClass.Parent, config_parse_tclass_parent, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.ClassId, config_parse_tclass_classid, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.Priority, config_parse_hierarchy_token_bucket_class_u32, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.QuantumBytes, 
config_parse_hierarchy_token_bucket_class_size, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.MTUBytes, config_parse_hierarchy_token_bucket_class_size, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.OverheadBytes, config_parse_hierarchy_token_bucket_class_size, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.Rate, config_parse_hierarchy_token_bucket_class_rate, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.CeilRate, config_parse_hierarchy_token_bucket_class_rate, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.BufferBytes, config_parse_hierarchy_token_bucket_class_size, TCLASS_KIND_HTB, 0 +HierarchyTokenBucketClass.CeilBufferBytes, config_parse_hierarchy_token_bucket_class_size, TCLASS_KIND_HTB, 0 +NetworkEmulator.Parent, config_parse_qdisc_parent, QDISC_KIND_NETEM, 0 +NetworkEmulator.Handle, config_parse_qdisc_handle, QDISC_KIND_NETEM, 0 +NetworkEmulator.DelaySec, config_parse_network_emulator_delay, QDISC_KIND_NETEM, 0 +NetworkEmulator.DelayJitterSec, config_parse_network_emulator_delay, QDISC_KIND_NETEM, 0 +NetworkEmulator.LossRate, config_parse_network_emulator_rate, QDISC_KIND_NETEM, 0 +NetworkEmulator.DuplicateRate, config_parse_network_emulator_rate, QDISC_KIND_NETEM, 0 +NetworkEmulator.PacketLimit, config_parse_network_emulator_packet_limit, QDISC_KIND_NETEM, 0 +PIE.Parent, config_parse_qdisc_parent, QDISC_KIND_PIE, 0 +PIE.Handle, config_parse_qdisc_handle, QDISC_KIND_PIE, 0 +PIE.PacketLimit, config_parse_pie_packet_limit, QDISC_KIND_PIE, 0 +StochasticFairBlue.Parent, config_parse_qdisc_parent, QDISC_KIND_SFB, 0 +StochasticFairBlue.Handle, config_parse_qdisc_handle, QDISC_KIND_SFB, 0 +StochasticFairBlue.PacketLimit, config_parse_stochastic_fair_blue_u32, QDISC_KIND_SFB, 0 +StochasticFairnessQueueing.Parent, config_parse_qdisc_parent, QDISC_KIND_SFQ, 0 +StochasticFairnessQueueing.Handle, config_parse_qdisc_handle, QDISC_KIND_SFQ, 0 +StochasticFairnessQueueing.PerturbPeriodSec, config_parse_stochastic_fairness_queueing_perturb_period, QDISC_KIND_SFQ, 0 
+TokenBucketFilter.Parent, config_parse_qdisc_parent, QDISC_KIND_TBF, 0 +TokenBucketFilter.Handle, config_parse_qdisc_handle, QDISC_KIND_TBF, 0 +TokenBucketFilter.Rate, config_parse_token_bucket_filter_rate, QDISC_KIND_TBF, 0 +TokenBucketFilter.BurstBytes, config_parse_token_bucket_filter_size, QDISC_KIND_TBF, 0 +TokenBucketFilter.LimitBytes, config_parse_token_bucket_filter_size, QDISC_KIND_TBF, 0 +TokenBucketFilter.MTUBytes, config_parse_token_bucket_filter_size, QDISC_KIND_TBF, 0 +TokenBucketFilter.MPUBytes, config_parse_token_bucket_filter_size, QDISC_KIND_TBF, 0 +TokenBucketFilter.PeakRate, config_parse_token_bucket_filter_rate, QDISC_KIND_TBF, 0 +TokenBucketFilter.LatencySec, config_parse_token_bucket_filter_latency, QDISC_KIND_TBF, 0 +TrivialLinkEqualizer.Parent, config_parse_qdisc_parent, QDISC_KIND_TEQL, 0 +TrivialLinkEqualizer.Handle, config_parse_qdisc_handle, QDISC_KIND_TEQL, 0 +TrivialLinkEqualizer.Id, config_parse_trivial_link_equalizer_id, QDISC_KIND_TEQL, 0 +/* backwards compatibility: do not add new entries to this section */ +Network.IPv4LL, config_parse_ipv4ll, 0, offsetof(Network, link_local) +Network.IPv6Token, config_parse_address_generation_type, 0, offsetof(Network, ndisc_tokens) +Network.IPv6PrefixDelegation, config_parse_router_prefix_delegation, 0, offsetof(Network, router_prefix_delegation) +Network.DHCPv6PrefixDelegation, config_parse_tristate, 0, offsetof(Network, dhcp_pd) +IPv6PrefixDelegation.RouterLifetimeSec, config_parse_sec, 0, offsetof(Network, router_lifetime_usec) +IPv6PrefixDelegation.Managed, config_parse_bool, 0, offsetof(Network, router_managed) +IPv6PrefixDelegation.OtherInformation, config_parse_bool, 0, offsetof(Network, router_other_information) +IPv6PrefixDelegation.RouterPreference, config_parse_router_preference, 0, 0 +IPv6PrefixDelegation.EmitDNS, config_parse_bool, 0, offsetof(Network, router_emit_dns) +IPv6PrefixDelegation.DNS, config_parse_radv_dns, 0, 0 +IPv6PrefixDelegation.EmitDomains, config_parse_bool, 0, 
offsetof(Network, router_emit_domains) +IPv6PrefixDelegation.Domains, config_parse_radv_search_domains, 0, 0 +IPv6PrefixDelegation.DNSLifetimeSec, config_parse_sec, 0, offsetof(Network, router_dns_lifetime_usec) +DHCPv4.BlackList, config_parse_in_addr_prefixes, AF_INET, offsetof(Network, dhcp_deny_listed_ip) +DHCP.ClientIdentifier, config_parse_dhcp_client_identifier, 0, offsetof(Network, dhcp_client_identifier) +DHCP.UseDNS, config_parse_dhcp_use_dns, AF_UNSPEC, 0 +DHCP.UseNTP, config_parse_dhcp_use_ntp, AF_UNSPEC, 0 +DHCP.UseMTU, config_parse_bool, 0, offsetof(Network, dhcp_use_mtu) +DHCP.UseHostname, config_parse_bool, 0, offsetof(Network, dhcp_use_hostname) +DHCP.UseDomains, config_parse_dhcp_use_domains, AF_UNSPEC, 0 +DHCP.UseDomainName, config_parse_dhcp_use_domains, AF_UNSPEC, 0 +DHCP.UseRoutes, config_parse_bool, 0, offsetof(Network, dhcp_use_routes) +DHCP.Anonymize, config_parse_bool, 0, offsetof(Network, dhcp_anonymize) +DHCP.SendHostname, config_parse_dhcp_send_hostname, AF_UNSPEC, 0 +DHCP.Hostname, config_parse_hostname, 0, offsetof(Network, dhcp_hostname) +DHCP.RequestBroadcast, config_parse_tristate, 0, offsetof(Network, dhcp_broadcast) +DHCP.CriticalConnection, config_parse_tristate, 0, offsetof(Network, dhcp_critical) +DHCP.VendorClassIdentifier, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Network, dhcp_vendor_class_identifier) +DHCP.UserClass, config_parse_dhcp_user_or_vendor_class, AF_INET, offsetof(Network, dhcp_user_class) +DHCP.IAID, config_parse_iaid, AF_INET, 0 +DHCP.DUIDType, config_parse_network_duid_type, 0, 0 +DHCP.DUIDRawData, config_parse_network_duid_rawdata, 0, 0 +DHCP.RouteMetric, config_parse_dhcp_route_metric, AF_UNSPEC, 0 +DHCP.RouteTable, config_parse_dhcp_or_ra_route_table, AF_INET, 0 +DHCP.UseTimezone, config_parse_bool, 0, offsetof(Network, dhcp_use_timezone) +DHCP.ListenPort, config_parse_uint16, 0, offsetof(Network, dhcp_client_port) +DHCP.RapidCommit, config_parse_bool, 0, offsetof(Network, 
dhcp6_use_rapid_commit) +DHCP.ForceDHCPv6PDOtherInformation, config_parse_warn_compat, DISABLED_LEGACY, 0 +DHCPv4.UseDomainName, config_parse_dhcp_use_domains, AF_INET, 0 +DHCPv4.CriticalConnection, config_parse_tristate, 0, offsetof(Network, dhcp_critical) +DHCPv6.RouteMetric, config_parse_ipv6_accept_ra_route_metric, AF_INET6, 0 +DHCPv6.ForceDHCPv6PDOtherInformation, config_parse_warn_compat, DISABLED_LEGACY, 0 +DHCPv6PrefixDelegation.SubnetId, config_parse_dhcp_pd_subnet_id, 0, offsetof(Network, dhcp_pd_subnet_id) +DHCPv6PrefixDelegation.Announce, config_parse_bool, 0, offsetof(Network, dhcp_pd_announce) +DHCPv6PrefixDelegation.Assign, config_parse_bool, 0, offsetof(Network, dhcp_pd_assign) +DHCPv6PrefixDelegation.ManageTemporaryAddress, config_parse_bool, 0, offsetof(Network, dhcp_pd_manage_temporary_address) +DHCPv6PrefixDelegation.Token, config_parse_address_generation_type, 0, offsetof(Network, dhcp_pd_tokens) +DHCPv6PrefixDelegation.RouteMetric, config_parse_uint32, 0, offsetof(Network, dhcp_pd_route_metric) +IPv6AcceptRA.DenyList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_deny_listed_prefix) +IPv6AcceptRA.BlackList, config_parse_in_addr_prefixes, AF_INET6, offsetof(Network, ndisc_deny_listed_prefix) +TrafficControlQueueingDiscipline.Parent, config_parse_qdisc_parent, _QDISC_KIND_INVALID, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorDelaySec, config_parse_network_emulator_delay, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorDelayJitterSec, config_parse_network_emulator_delay, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorLossRate, config_parse_network_emulator_rate, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorDuplicateRate, config_parse_network_emulator_rate, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorPacketLimit, config_parse_network_emulator_packet_limit, 0, 0 +FairQueueing.Quantum, config_parse_fair_queueing_size, QDISC_KIND_FQ, 0 +FairQueueing.InitialQuantum, 
config_parse_fair_queueing_size, QDISC_KIND_FQ, 0 +FairQueueingControlledDelay.MemoryLimit, config_parse_fair_queueing_controlled_delay_size, QDISC_KIND_FQ_CODEL, 0 +FairQueueingControlledDelay.Quantum, config_parse_fair_queueing_controlled_delay_size, QDISC_KIND_FQ_CODEL, 0 +TokenBucketFilter.Burst, config_parse_token_bucket_filter_size, QDISC_KIND_TBF, 0 +TokenBucketFilter.LimitSize, config_parse_token_bucket_filter_size, QDISC_KIND_TBF, 0 diff --git a/src/network/networkd-network.c b/src/network/networkd-network.c new file mode 100644 index 0000000..dcd3e5a --- /dev/null +++ b/src/network/networkd-network.c @@ -0,0 +1,1349 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "dns-domain.h" +#include "fd-util.h" +#include "hostname-util.h" +#include "in-addr-util.h" +#include "net-condition.h" +#include "netdev/macvlan.h" +#include "networkd-address-label.h" +#include "networkd-address.h" +#include "networkd-bridge-fdb.h" +#include "networkd-bridge-mdb.h" +#include "networkd-dhcp-common.h" +#include "networkd-dhcp-server-static-lease.h" +#include "networkd-ipv6-proxy-ndp.h" +#include "networkd-manager.h" +#include "networkd-ndisc.h" +#include "networkd-neighbor.h" +#include "networkd-network.h" +#include "networkd-nexthop.h" +#include "networkd-radv.h" +#include "networkd-route.h" +#include "networkd-routing-policy-rule.h" +#include "networkd-sriov.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "qdisc.h" +#include "radv-internal.h" +#include "set.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tclass.h" + +/* Let's assume that anything above this number is a user misconfiguration. 
*/ +#define MAX_NTP_SERVERS 128U + +/* Looks up the NetDev called 'name' via netdev_get() and checks that its kind matches 'kind' (for _NETDEV_KIND_TUNNEL any of the tunnel kinds listed in the IN_SET() below is accepted). Returns 0 without touching *ret when name is NULL, 1 with a new reference stored in *ret on success, and the result of log_warning_errno() otherwise; network_resolve_stacked_netdevs() treats any value <= 0 as "nothing resolved". */ +static int network_resolve_netdev_one(Network *network, const char *name, NetDevKind kind, NetDev **ret) { + const char *kind_string; + NetDev *netdev; + int r; + + /* For test-networkd-conf, the check must be earlier than the assertions. */ + if (!name) + return 0; + + assert(network); + assert(network->manager); + assert(network->filename); + assert(ret); + + if (kind == _NETDEV_KIND_TUNNEL) + kind_string = "tunnel"; + else { + kind_string = netdev_kind_to_string(kind); + if (!kind_string) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Invalid NetDev kind of %s, ignoring assignment.", + network->filename, name); + } + + r = netdev_get(network->manager, name, &netdev); + if (r < 0) + return log_warning_errno(r, "%s: %s NetDev could not be found, ignoring assignment.", + network->filename, name); + + if (netdev->kind != kind && !(kind == _NETDEV_KIND_TUNNEL && + IN_SET(netdev->kind, + NETDEV_KIND_ERSPAN, + NETDEV_KIND_GRE, + NETDEV_KIND_GRETAP, + NETDEV_KIND_IP6GRE, + NETDEV_KIND_IP6GRETAP, + NETDEV_KIND_IP6TNL, + NETDEV_KIND_IPIP, + NETDEV_KIND_SIT, + NETDEV_KIND_VTI, + NETDEV_KIND_VTI6))) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: NetDev %s is not a %s, ignoring assignment", + network->filename, name, kind_string); + + *ret = netdev_ref(netdev); + return 1; +} + +/* Resolves every kind/name pair recorded in network->stacked_netdev_names and stores the resulting NetDev objects in network->stacked_netdevs, keyed by netdev->ifname. Entries that cannot be resolved or stored are skipped with a warning; only -ENOMEM aborts (via log_oom()). Returns 0 otherwise. */ +static int network_resolve_stacked_netdevs(Network *network) { + void *name, *kind; + int r; + + assert(network); + + HASHMAP_FOREACH_KEY(kind, name, network->stacked_netdev_names) { + _cleanup_(netdev_unrefp) NetDev *netdev = NULL; + + if (network_resolve_netdev_one(network, name, PTR_TO_INT(kind), &netdev) <= 0) + continue; + + r = hashmap_ensure_put(&network->stacked_netdevs, &string_hash_ops, netdev->ifname, netdev); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_warning_errno(r, "%s: Failed to add NetDev '%s' to network, ignoring: %m", + network->filename, (const char *) name); + + /* Hand our reference over only if the hashmap really took the entry (r > 0, same convention as config_parse_stacked_netdev()); on failure or duplicate the cleanup handler drops it instead of leaking it. */ + if (r > 0) + TAKE_PTR(netdev); + } + + return 0; +} + +int 
network_verify(Network *network) { + int r; + + assert(network); + assert(network->manager); + assert(network->filename); + + if (net_match_is_empty(&network->match) && !network->conditions) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: No valid settings found in the [Match] section, ignoring file. " + "To match all interfaces, add Name=* in the [Match] section.", + network->filename); + + /* skip out early if configuration does not match the environment */ + if (!condition_test_list(network->conditions, environ, NULL, NULL, NULL)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Conditions in the file do not match the system environment, skipping.", + network->filename); + + if (network->keep_master) { + if (network->batadv_name) + log_warning("%s: BatmanAdvanced= set with KeepMaster= enabled, ignoring BatmanAdvanced=.", + network->filename); + if (network->bond_name) + log_warning("%s: Bond= set with KeepMaster= enabled, ignoring Bond=.", + network->filename); + if (network->bridge_name) + log_warning("%s: Bridge= set with KeepMaster= enabled, ignoring Bridge=.", + network->filename); + if (network->vrf_name) + log_warning("%s: VRF= set with KeepMaster= enabled, ignoring VRF=.", + network->filename); + + network->batadv_name = mfree(network->batadv_name); + network->bond_name = mfree(network->bond_name); + network->bridge_name = mfree(network->bridge_name); + network->vrf_name = mfree(network->vrf_name); + } + + (void) network_resolve_netdev_one(network, network->batadv_name, NETDEV_KIND_BATADV, &network->batadv); + (void) network_resolve_netdev_one(network, network->bond_name, NETDEV_KIND_BOND, &network->bond); + (void) network_resolve_netdev_one(network, network->bridge_name, NETDEV_KIND_BRIDGE, &network->bridge); + (void) network_resolve_netdev_one(network, network->vrf_name, NETDEV_KIND_VRF, &network->vrf); + r = network_resolve_stacked_netdevs(network); + if (r < 0) + return r; + + /* Free unnecessary entries. 
*/ + network->batadv_name = mfree(network->batadv_name); + network->bond_name = mfree(network->bond_name); + network->bridge_name = mfree(network->bridge_name); + network->vrf_name = mfree(network->vrf_name); + network->stacked_netdev_names = hashmap_free_free_key(network->stacked_netdev_names); + + if (network->bond) { + /* Bonding slave does not support addressing. */ + if (network->link_local >= 0 && network->link_local != ADDRESS_FAMILY_NO) { + log_warning("%s: Cannot enable LinkLocalAddressing= when Bond= is specified, disabling LinkLocalAddressing=.", + network->filename); + network->link_local = ADDRESS_FAMILY_NO; + } + if (!ordered_hashmap_isempty(network->addresses_by_section)) + log_warning("%s: Cannot set addresses when Bond= is specified, ignoring addresses.", + network->filename); + if (!hashmap_isempty(network->routes_by_section)) + log_warning("%s: Cannot set routes when Bond= is specified, ignoring routes.", + network->filename); + + network->addresses_by_section = ordered_hashmap_free_with_destructor(network->addresses_by_section, address_free); + network->routes_by_section = hashmap_free_with_destructor(network->routes_by_section, route_free); + } + + if (network->link_local < 0) { + network->link_local = ADDRESS_FAMILY_IPV6; + + if (network->keep_master || network->bridge) + network->link_local = ADDRESS_FAMILY_NO; + else { + NetDev *netdev; + + HASHMAP_FOREACH(netdev, network->stacked_netdevs) { + MacVlan *m; + + if (netdev->kind == NETDEV_KIND_MACVLAN) + m = MACVLAN(netdev); + else if (netdev->kind == NETDEV_KIND_MACVTAP) + m = MACVTAP(netdev); + else + continue; + + if (m->mode == NETDEV_MACVLAN_MODE_PASSTHRU) + network->link_local = ADDRESS_FAMILY_NO; + + /* There won't be a passthru MACVLAN/MACVTAP if there's already one in another mode */ + break; + } + } + } + + if (network->ipv6ll_address_gen_mode == IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_NONE) + SET_FLAG(network->link_local, ADDRESS_FAMILY_IPV6, false); + + if 
(in6_addr_is_set(&network->ipv6ll_stable_secret) && + network->ipv6ll_address_gen_mode < 0) + network->ipv6ll_address_gen_mode = IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY; + + /* IPMasquerade implies IPForward */ + network->ip_forward |= network->ip_masquerade; + + network_adjust_ipv6_proxy_ndp(network); + network_adjust_ipv6_accept_ra(network); + network_adjust_dhcp(network); + network_adjust_radv(network); + network_adjust_bridge_vlan(network); + + if (network->mtu > 0 && network->dhcp_use_mtu) { + log_warning("%s: MTUBytes= in [Link] section and UseMTU= in [DHCP] section are set. " + "Disabling UseMTU=.", network->filename); + network->dhcp_use_mtu = false; + } + + if (network->dhcp_critical >= 0) { + if (network->keep_configuration >= 0) { + if (network->manager->keep_configuration < 0) + log_warning("%s: Both KeepConfiguration= and deprecated CriticalConnection= are set. " + "Ignoring CriticalConnection=.", network->filename); + } else if (network->dhcp_critical) + /* CriticalConnection=yes also preserve foreign static configurations. */ + network->keep_configuration = KEEP_CONFIGURATION_YES; + else + network->keep_configuration = KEEP_CONFIGURATION_NO; + } + + if (!strv_isempty(network->bind_carrier)) { + if (!IN_SET(network->activation_policy, _ACTIVATION_POLICY_INVALID, ACTIVATION_POLICY_BOUND)) + log_warning("%s: ActivationPolicy=bound is required with BindCarrier=. " + "Setting ActivationPolicy=bound.", network->filename); + network->activation_policy = ACTIVATION_POLICY_BOUND; + } else if (network->activation_policy == ACTIVATION_POLICY_BOUND) { + log_warning("%s: ActivationPolicy=bound requires BindCarrier=. 
" + "Ignoring ActivationPolicy=bound.", network->filename); + network->activation_policy = ACTIVATION_POLICY_UP; + } + + if (network->activation_policy == _ACTIVATION_POLICY_INVALID) + network->activation_policy = ACTIVATION_POLICY_UP; + + if (network->activation_policy == ACTIVATION_POLICY_ALWAYS_UP) { + if (network->ignore_carrier_loss_set && network->ignore_carrier_loss_usec < USEC_INFINITY) + log_warning("%s: IgnoreCarrierLoss=no or finite timespan conflicts with ActivationPolicy=always-up. " + "Setting IgnoreCarrierLoss=yes.", network->filename); + network->ignore_carrier_loss_set = true; + network->ignore_carrier_loss_usec = USEC_INFINITY; + } + + if (!network->ignore_carrier_loss_set) /* Set implied default. */ + network->ignore_carrier_loss_usec = network->configure_without_carrier ? USEC_INFINITY : 0; + + if (IN_SET(network->activation_policy, ACTIVATION_POLICY_DOWN, ACTIVATION_POLICY_ALWAYS_DOWN, ACTIVATION_POLICY_MANUAL)) { + if (network->required_for_online < 0 || + (network->required_for_online == true && network->activation_policy == ACTIVATION_POLICY_ALWAYS_DOWN)) { + log_debug("%s: Setting RequiredForOnline=no because ActivationPolicy=%s.", network->filename, + activation_policy_to_string(network->activation_policy)); + network->required_for_online = false; + } else if (network->required_for_online == true) + log_warning("%s: RequiredForOnline=yes and ActivationPolicy=%s, " + "this may cause a delay at boot.", network->filename, + activation_policy_to_string(network->activation_policy)); + } + + if (network->required_for_online < 0) + network->required_for_online = true; + + if (network->keep_configuration < 0) + network->keep_configuration = KEEP_CONFIGURATION_NO; + + if (network->ipv6_proxy_ndp == 0 && !set_isempty(network->ipv6_proxy_ndp_addresses)) { + log_warning("%s: IPv6ProxyNDP= is disabled. 
Ignoring IPv6ProxyNDPAddress=.", network->filename); + network->ipv6_proxy_ndp_addresses = set_free_free(network->ipv6_proxy_ndp_addresses); + } + + r = network_drop_invalid_addresses(network); + if (r < 0) + return r; /* network_drop_invalid_addresses() logs internally. */ + network_drop_invalid_routes(network); + network_drop_invalid_nexthops(network); + network_drop_invalid_bridge_fdb_entries(network); + network_drop_invalid_bridge_mdb_entries(network); + r = network_drop_invalid_neighbors(network); + if (r < 0) + return r; + network_drop_invalid_address_labels(network); + network_drop_invalid_prefixes(network); + network_drop_invalid_route_prefixes(network); + network_drop_invalid_routing_policy_rules(network); + network_drop_invalid_qdisc(network); + network_drop_invalid_tclass(network); + r = sr_iov_drop_invalid_sections(UINT32_MAX, network->sr_iov_by_section); + if (r < 0) + return r; /* sr_iov_drop_invalid_sections() logs internally. */ + network_drop_invalid_static_leases(network); + + return 0; +} + +int network_load_one(Manager *manager, OrderedHashmap **networks, const char *filename) { + _cleanup_free_ char *fname = NULL, *name = NULL; + _cleanup_(network_unrefp) Network *network = NULL; + const char *dropin_dirname; + char *d; + int r; + + assert(manager); + assert(filename); + + r = null_or_empty_path(filename); + if (r < 0) + return log_warning_errno(r, "Failed to check if \"%s\" is empty: %m", filename); + if (r > 0) { + log_debug("Skipping empty file: %s", filename); + return 0; + } + + fname = strdup(filename); + if (!fname) + return log_oom(); + + name = strdup(basename(filename)); + if (!name) + return log_oom(); + + d = strrchr(name, '.'); + if (!d) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid file name: %s", filename); + + *d = '\0'; + + dropin_dirname = strjoina(name, ".network.d"); + + network = new(Network, 1); + if (!network) + return log_oom(); + + *network = (Network) { + .filename = TAKE_PTR(fname), + .name = 
TAKE_PTR(name), + + .manager = manager, + .n_ref = 1, + + .required_for_online = -1, + .required_operstate_for_online = LINK_OPERSTATE_RANGE_DEFAULT, + .activation_policy = _ACTIVATION_POLICY_INVALID, + .group = -1, + .arp = -1, + .multicast = -1, + .allmulticast = -1, + .promiscuous = -1, + + .keep_configuration = manager->keep_configuration, + + .dhcp_duid.type = _DUID_TYPE_INVALID, + .dhcp_critical = -1, + .dhcp_use_ntp = true, + .dhcp_routes_to_ntp = true, + .dhcp_use_sip = true, + .dhcp_use_captive_portal = true, + .dhcp_use_dns = true, + .dhcp_routes_to_dns = true, + .dhcp_use_hostname = true, + .dhcp_use_routes = true, + .dhcp_use_gateway = -1, + .dhcp_send_hostname = true, + .dhcp_send_release = true, + .dhcp_route_metric = DHCP_ROUTE_METRIC, + .dhcp_use_rapid_commit = -1, + .dhcp_client_identifier = _DHCP_CLIENT_ID_INVALID, + .dhcp_route_table = RT_TABLE_MAIN, + .dhcp_ip_service_type = -1, + .dhcp_broadcast = -1, + .dhcp_ipv6_only_mode = -1, + + .dhcp6_use_address = true, + .dhcp6_use_pd_prefix = true, + .dhcp6_use_dns = true, + .dhcp6_use_hostname = true, + .dhcp6_use_ntp = true, + .dhcp6_use_captive_portal = true, + .dhcp6_use_rapid_commit = true, + .dhcp6_send_hostname = true, + .dhcp6_duid.type = _DUID_TYPE_INVALID, + .dhcp6_client_start_mode = _DHCP6_CLIENT_START_MODE_INVALID, + .dhcp6_send_release = true, + + .dhcp_pd = -1, + .dhcp_pd_announce = true, + .dhcp_pd_assign = true, + .dhcp_pd_manage_temporary_address = true, + .dhcp_pd_subnet_id = -1, + .dhcp_pd_route_metric = DHCP6PD_ROUTE_METRIC, + + .dhcp_server_bind_to_interface = true, + .dhcp_server_emit[SD_DHCP_LEASE_DNS].emit = true, + .dhcp_server_emit[SD_DHCP_LEASE_NTP].emit = true, + .dhcp_server_emit[SD_DHCP_LEASE_SIP].emit = true, + .dhcp_server_emit_router = true, + .dhcp_server_emit_timezone = true, + .dhcp_server_rapid_commit = true, + + .router_lifetime_usec = RADV_DEFAULT_ROUTER_LIFETIME_USEC, + .router_dns_lifetime_usec = RADV_DEFAULT_VALID_LIFETIME_USEC, + .router_emit_dns = true, + 
.router_emit_domains = true, + + .use_bpdu = -1, + .hairpin = -1, + .isolated = -1, + .fast_leave = -1, + .allow_port_to_be_root = -1, + .unicast_flood = -1, + .multicast_flood = -1, + .multicast_to_unicast = -1, + .neighbor_suppression = -1, + .learning = -1, + .bridge_proxy_arp = -1, + .bridge_proxy_arp_wifi = -1, + .priority = LINK_BRIDGE_PORT_PRIORITY_INVALID, + .multicast_router = _MULTICAST_ROUTER_INVALID, + + .lldp_mode = LLDP_MODE_ROUTERS_ONLY, + .lldp_multicast_mode = _SD_LLDP_MULTICAST_MODE_INVALID, + + .dns_default_route = -1, + .llmnr = RESOLVE_SUPPORT_YES, + .mdns = RESOLVE_SUPPORT_NO, + .dnssec_mode = _DNSSEC_MODE_INVALID, + .dns_over_tls_mode = _DNS_OVER_TLS_MODE_INVALID, + + /* If LinkLocalAddressing= is not set, then set to ADDRESS_FAMILY_IPV6 later. */ + .link_local = _ADDRESS_FAMILY_INVALID, + .ipv6ll_address_gen_mode = _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_INVALID, + + .ipv4_accept_local = -1, + .ipv4_route_localnet = -1, + .ipv6_privacy_extensions = _IPV6_PRIVACY_EXTENSIONS_INVALID, + .ipv6_dad_transmits = -1, + .ipv6_proxy_ndp = -1, + .proxy_arp = -1, + .ipv4_rp_filter = _IP_REVERSE_PATH_FILTER_INVALID, + + .ipv6_accept_ra = -1, + .ipv6_accept_ra_use_dns = true, + .ipv6_accept_ra_use_gateway = true, + .ipv6_accept_ra_use_captive_portal = true, + .ipv6_accept_ra_use_route_prefix = true, + .ipv6_accept_ra_use_autonomous_prefix = true, + .ipv6_accept_ra_use_onlink_prefix = true, + .ipv6_accept_ra_use_mtu = true, + .ipv6_accept_ra_use_hop_limit = true, + .ipv6_accept_ra_use_icmp6_ratelimit = true, + .ipv6_accept_ra_route_table = RT_TABLE_MAIN, + .ipv6_accept_ra_route_metric_high = IPV6RA_ROUTE_METRIC_HIGH, + .ipv6_accept_ra_route_metric_medium = IPV6RA_ROUTE_METRIC_MEDIUM, + .ipv6_accept_ra_route_metric_low = IPV6RA_ROUTE_METRIC_LOW, + .ipv6_accept_ra_start_dhcp6_client = IPV6_ACCEPT_RA_START_DHCP6_CLIENT_YES, + + .can_termination = -1, + + .ipoib_mode = _IP_OVER_INFINIBAND_MODE_INVALID, + .ipoib_umcast = -1, + }; + + r = config_parse_many( + 
STRV_MAKE_CONST(filename), NETWORK_DIRS, dropin_dirname, /* root = */ NULL, + "Match\0" + "Link\0" + "SR-IOV\0" + "Network\0" + "Address\0" + "Neighbor\0" + "IPv6AddressLabel\0" + "RoutingPolicyRule\0" + "Route\0" + "NextHop\0" + "DHCP\0" /* compat */ + "DHCPv4\0" + "DHCPv6\0" + "DHCPv6PrefixDelegation\0" /* compat */ + "DHCPPrefixDelegation\0" + "DHCPServer\0" + "DHCPServerStaticLease\0" + "IPv6AcceptRA\0" + "IPv6NDPProxyAddress\0" + "Bridge\0" + "BridgeFDB\0" + "BridgeMDB\0" + "BridgeVLAN\0" + "IPv6SendRA\0" + "IPv6PrefixDelegation\0" + "IPv6Prefix\0" + "IPv6RoutePrefix\0" + "IPv6PREF64Prefix\0" + "LLDP\0" + "TrafficControlQueueingDiscipline\0" + "CAN\0" + "QDisc\0" + "BFIFO\0" + "CAKE\0" + "ControlledDelay\0" + "DeficitRoundRobinScheduler\0" + "DeficitRoundRobinSchedulerClass\0" + "EnhancedTransmissionSelection\0" + "FairQueueing\0" + "FairQueueingControlledDelay\0" + "FlowQueuePIE\0" + "GenericRandomEarlyDetection\0" + "HeavyHitterFilter\0" + "HierarchyTokenBucket\0" + "HierarchyTokenBucketClass\0" + "NetworkEmulator\0" + "PFIFO\0" + "PFIFOFast\0" + "PFIFOHeadDrop\0" + "PIE\0" + "QuickFairQueueing\0" + "QuickFairQueueingClass\0" + "StochasticFairBlue\0" + "StochasticFairnessQueueing\0" + "TokenBucketFilter\0" + "TrivialLinkEqualizer\0", + config_item_perf_lookup, network_network_gperf_lookup, + CONFIG_PARSE_WARN, + network, + &network->stats_by_path, + &network->dropins); + if (r < 0) + return r; /* config_parse_many() logs internally. */ + + r = network_add_ipv4ll_route(network); + if (r < 0) + return log_warning_errno(r, "%s: Failed to add IPv4LL route: %m", network->filename); + + r = network_add_default_route_on_device(network); + if (r < 0) + return log_warning_errno(r, "%s: Failed to add default route on device: %m", + network->filename); + + r = network_verify(network); + if (r < 0) + return r; /* network_verify() logs internally. 
*/ + + r = ordered_hashmap_ensure_put(networks, &string_hash_ops, network->name, network); + if (r < 0) + return log_warning_errno(r, "%s: Failed to store configuration into hashmap: %m", filename); + + TAKE_PTR(network); + return 0; +} + +int network_load(Manager *manager, OrderedHashmap **networks) { + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(manager); + + ordered_hashmap_clear_with_destructor(*networks, network_unref); + + r = conf_files_list_strv(&files, ".network", NULL, 0, NETWORK_DIRS); + if (r < 0) + return log_error_errno(r, "Failed to enumerate network files: %m"); + + STRV_FOREACH(f, files) + (void) network_load_one(manager, networks, *f); + + return 0; +} + +int network_reload(Manager *manager) { + OrderedHashmap *new_networks = NULL; + Network *n, *old; + int r; + + assert(manager); + + r = network_load(manager, &new_networks); + if (r < 0) + goto failure; + + ORDERED_HASHMAP_FOREACH(n, new_networks) { + r = network_get_by_name(manager, n->name, &old); + if (r < 0) { + log_debug("Found new .network file: %s", n->filename); + continue; + } + + if (!stats_by_path_equal(n->stats_by_path, old->stats_by_path)) { + log_debug("Found updated .network file: %s", n->filename); + continue; + } + + r = ordered_hashmap_replace(new_networks, old->name, old); + if (r < 0) + goto failure; + + network_ref(old); + network_unref(n); + } + + ordered_hashmap_free_with_destructor(manager->networks, network_unref); + manager->networks = new_networks; + + return manager_build_dhcp_pd_subnet_ids(manager); + +failure: + ordered_hashmap_free_with_destructor(new_networks, network_unref); + + return r; +} + +int manager_build_dhcp_pd_subnet_ids(Manager *manager) { + Network *n; + int r; + + assert(manager); + + set_clear(manager->dhcp_pd_subnet_ids); + + ORDERED_HASHMAP_FOREACH(n, manager->networks) { + if (n->unmanaged) + continue; + + if (!n->dhcp_pd) + continue; + + if (n->dhcp_pd_subnet_id < 0) + continue; + + r = 
set_ensure_put(&manager->dhcp_pd_subnet_ids, &uint64_hash_ops, &n->dhcp_pd_subnet_id); + if (r < 0) + return r; + } + + return 0; +} + +/* Destructor used by DEFINE_TRIVIAL_REF_UNREF_FUNC() below: releases every string, set, hashmap and NetDev reference owned by the Network object and then the object itself. Accepts NULL. Returns NULL for NULL input, otherwise the result of mfree(). */ +static Network *network_free(Network *network) { + if (!network) + return NULL; + + free(network->name); + free(network->filename); + free(network->description); + strv_free(network->dropins); + hashmap_free(network->stats_by_path); + + /* conditions */ + net_match_clear(&network->match); + condition_free_list(network->conditions); + + /* link settings */ + strv_free(network->bind_carrier); + + /* NTP */ + strv_free(network->ntp); + + /* DNS */ + for (unsigned i = 0; i < network->n_dns; i++) + in_addr_full_free(network->dns[i]); + free(network->dns); + ordered_set_free(network->search_domains); + ordered_set_free(network->route_domains); + set_free_free(network->dnssec_negative_trust_anchors); + + /* DHCP server */ + free(network->dhcp_server_relay_agent_circuit_id); + free(network->dhcp_server_relay_agent_remote_id); + free(network->dhcp_server_boot_server_name); + free(network->dhcp_server_boot_filename); + free(network->dhcp_server_timezone); + free(network->dhcp_server_uplink_name); + for (sd_dhcp_lease_server_type_t t = 0; t < _SD_DHCP_LEASE_SERVER_TYPE_MAX; t++) + free(network->dhcp_server_emit[t].addresses); + ordered_hashmap_free(network->dhcp_server_send_options); + ordered_hashmap_free(network->dhcp_server_send_vendor_options); + + /* DHCP client */ + free(network->dhcp_vendor_class_identifier); + free(network->dhcp_mudurl); + free(network->dhcp_hostname); + free(network->dhcp_label); + set_free(network->dhcp_deny_listed_ip); + set_free(network->dhcp_allow_listed_ip); + strv_free(network->dhcp_user_class); + set_free(network->dhcp_request_options); + ordered_hashmap_free(network->dhcp_client_send_options); + ordered_hashmap_free(network->dhcp_client_send_vendor_options); + free(network->dhcp_netlabel); + nft_set_context_clear(&network->dhcp_nft_set_context); + + /* DHCPv6 client */ + free(network->dhcp6_mudurl); + 
free(network->dhcp6_hostname); + strv_free(network->dhcp6_user_class); + strv_free(network->dhcp6_vendor_class); + set_free(network->dhcp6_request_options); + ordered_hashmap_free(network->dhcp6_client_send_options); + ordered_hashmap_free(network->dhcp6_client_send_vendor_options); + free(network->dhcp6_netlabel); + nft_set_context_clear(&network->dhcp6_nft_set_context); + + /* DHCP PD */ + free(network->dhcp_pd_uplink_name); + set_free(network->dhcp_pd_tokens); + free(network->dhcp_pd_netlabel); + nft_set_context_clear(&network->dhcp_pd_nft_set_context); + + /* Router advertisement */ + ordered_set_free(network->router_search_domains); + free(network->router_dns); + free(network->router_uplink_name); + + /* NDisc */ + set_free(network->ndisc_deny_listed_router); + set_free(network->ndisc_allow_listed_router); + set_free(network->ndisc_deny_listed_prefix); + set_free(network->ndisc_allow_listed_prefix); + set_free(network->ndisc_deny_listed_route_prefix); + set_free(network->ndisc_allow_listed_route_prefix); + set_free(network->ndisc_tokens); + free(network->ndisc_netlabel); + nft_set_context_clear(&network->ndisc_nft_set_context); + + /* LLDP */ + free(network->lldp_mudurl); + + /* netdev */ + free(network->batadv_name); + free(network->bridge_name); + free(network->bond_name); + free(network->vrf_name); + hashmap_free_free_key(network->stacked_netdev_names); + /* network_verify() stores a netdev_ref()'d pointer in ->batadv just like for bridge/bond/vrf, so it has to be dropped here too. */ + netdev_unref(network->batadv); + netdev_unref(network->bridge); + netdev_unref(network->bond); + netdev_unref(network->vrf); + hashmap_free_with_destructor(network->stacked_netdevs, netdev_unref); + + /* static configs */ + set_free_free(network->ipv6_proxy_ndp_addresses); + ordered_hashmap_free_with_destructor(network->addresses_by_section, address_free); + hashmap_free_with_destructor(network->routes_by_section, route_free); + hashmap_free_with_destructor(network->nexthops_by_section, nexthop_free); + hashmap_free_with_destructor(network->bridge_fdb_entries_by_section, bridge_fdb_free); + 
hashmap_free_with_destructor(network->bridge_mdb_entries_by_section, bridge_mdb_free); + ordered_hashmap_free_with_destructor(network->neighbors_by_section, neighbor_free); + hashmap_free_with_destructor(network->address_labels_by_section, address_label_free); + hashmap_free_with_destructor(network->prefixes_by_section, prefix_free); + hashmap_free_with_destructor(network->route_prefixes_by_section, route_prefix_free); + hashmap_free_with_destructor(network->pref64_prefixes_by_section, pref64_prefix_free); + hashmap_free_with_destructor(network->rules_by_section, routing_policy_rule_free); + hashmap_free_with_destructor(network->dhcp_static_leases_by_section, dhcp_static_lease_free); + ordered_hashmap_free_with_destructor(network->sr_iov_by_section, sr_iov_free); + hashmap_free_with_destructor(network->qdiscs_by_section, qdisc_free); + hashmap_free_with_destructor(network->tclasses_by_section, tclass_free); + + return mfree(network); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(Network, network, network_free); + +int network_get_by_name(Manager *manager, const char *name, Network **ret) { + Network *network; + + assert(manager); + assert(name); + assert(ret); + + network = ordered_hashmap_get(manager->networks, name); + if (!network) + return -ENOENT; + + *ret = network; + + return 0; +} + +bool network_has_static_ipv6_configurations(Network *network) { + Address *address; + Route *route; + BridgeFDB *fdb; + BridgeMDB *mdb; + Neighbor *neighbor; + + assert(network); + + ORDERED_HASHMAP_FOREACH(address, network->addresses_by_section) + if (address->family == AF_INET6) + return true; + + HASHMAP_FOREACH(route, network->routes_by_section) + if (route->family == AF_INET6) + return true; + + HASHMAP_FOREACH(fdb, network->bridge_fdb_entries_by_section) + if (fdb->family == AF_INET6) + return true; + + HASHMAP_FOREACH(mdb, network->bridge_mdb_entries_by_section) + if (mdb->family == AF_INET6) + return true; + + ORDERED_HASHMAP_FOREACH(neighbor, network->neighbors_by_section) + if 
(neighbor->family == AF_INET6) + return true; + + if (!hashmap_isempty(network->address_labels_by_section)) + return true; + + if (!hashmap_isempty(network->prefixes_by_section)) + return true; + + if (!hashmap_isempty(network->route_prefixes_by_section)) + return true; + + if (!hashmap_isempty(network->pref64_prefixes_by_section)) + return true; + + return false; +} + +int config_parse_stacked_netdev( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *name = NULL; + NetDevKind kind = ltype; + Hashmap **h = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(IN_SET(kind, + NETDEV_KIND_IPOIB, + NETDEV_KIND_IPVLAN, + NETDEV_KIND_IPVTAP, + NETDEV_KIND_MACSEC, + NETDEV_KIND_MACVLAN, + NETDEV_KIND_MACVTAP, + NETDEV_KIND_VLAN, + NETDEV_KIND_VXLAN, + NETDEV_KIND_XFRM, + _NETDEV_KIND_TUNNEL)); + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid netdev name in %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + name = strdup(rvalue); + if (!name) + return log_oom(); + + r = hashmap_ensure_put(h, &string_hash_ops, name, INT_TO_PTR(kind)); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Cannot add NetDev '%s' to network, ignoring assignment: %m", name); + else if (r == 0) + log_syntax(unit, LOG_DEBUG, filename, line, r, + "NetDev '%s' specified twice, ignoring.", name); + else + TAKE_PTR(name); + + return 0; +} + +int config_parse_domains( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *n = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if 
(isempty(rvalue)) { + n->search_domains = ordered_set_free(n->search_domains); + n->route_domains = ordered_set_free(n->route_domains); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL, *normalized = NULL; + const char *domain; + bool is_route; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract search or route domain, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + is_route = w[0] == '~'; + domain = is_route ? w + 1 : w; + + if (dns_name_is_root(domain) || streq(domain, "*")) { + /* If the root domain appears as is, or the special token "*" is found, we'll + * consider this as routing domain, unconditionally. */ + is_route = true; + domain = "."; /* make sure we don't allow empty strings, thus write the root + * domain as "." */ + } else { + r = dns_name_normalize(domain, 0, &normalized); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "'%s' is not a valid domain name, ignoring.", domain); + continue; + } + + domain = normalized; + + if (is_localhost(domain)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "'localhost' domain may not be configured as search or route domain, ignoring assignment: %s", + domain); + continue; + } + } + + OrderedSet **set = is_route ? 
&n->route_domains : &n->search_domains; + r = ordered_set_put_strdup(set, domain); + if (r == -EEXIST) + continue; + if (r < 0) + return log_oom(); + } +} + +int config_parse_timezone( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **tz = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *tz = mfree(*tz); + return 0; + } + + r = verify_timezone(rvalue, LOG_WARNING); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Timezone is not valid, ignoring assignment: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(tz, rvalue); +} + +int config_parse_dns( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *n = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + for (unsigned i = 0; i < n->n_dns; i++) + in_addr_full_free(n->dns[i]); + n->dns = mfree(n->dns); + n->n_dns = 0; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_(in_addr_full_freep) struct in_addr_full *dns = NULL; + _cleanup_free_ char *w = NULL; + struct in_addr_full **m; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = in_addr_full_new_from_string(w, &dns); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse dns server address, ignoring: %s", w); + continue; + } + + if (IN_SET(dns->port, 53, 853)) + dns->port = 0; + + m = reallocarray(n->dns, n->n_dns + 1, sizeof(struct in_addr_full*)); + if (!m) + 
return log_oom(); + + m[n->n_dns++] = TAKE_PTR(dns); + n->dns = m; + } +} + +int config_parse_dnssec_negative_trust_anchors( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **nta = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *nta = set_free_free(*nta); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract negative trust anchor domain, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = dns_name_is_valid(w); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "%s is not a valid domain name, ignoring.", w); + continue; + } + + r = set_ensure_consume(nta, &dns_name_hash_ops, TAKE_PTR(w)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_ntp( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***l = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *l = strv_free(*l); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract NTP server name, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = dns_name_is_valid_or_address(w); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "%s is not a valid domain name or IP address, ignoring.", w); + 
continue; + } + + if (strv_length(*l) > MAX_NTP_SERVERS) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "More than %u NTP servers specified, ignoring \"%s\" and any subsequent entries.", + MAX_NTP_SERVERS, w); + return 0; + } + + r = strv_consume(l, TAKE_PTR(w)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_required_for_online( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + LinkOperationalStateRange range; + bool required = true; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + network->required_for_online = -1; + network->required_operstate_for_online = LINK_OPERSTATE_RANGE_DEFAULT; + return 0; + } + + r = parse_operational_state_range(rvalue, &range); + if (r < 0) { + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s= setting, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + required = r; + range = LINK_OPERSTATE_RANGE_DEFAULT; + } + + network->required_for_online = required; + network->required_operstate_for_online = range; + + return 0; +} + +int config_parse_link_group( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + int r; + int32_t group; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + network->group = -1; + return 0; + } + + r = safe_atoi32(rvalue, &group); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse Group=, ignoring assignment: %s", rvalue); + return 0; + } + + if (group < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Value 
of Group= must be in the range 0…2147483647, ignoring assignment: %s", rvalue); + return 0; + } + + network->group = group; + return 0; +} + +int config_parse_ignore_carrier_loss( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = ASSERT_PTR(userdata); + usec_t usec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + network->ignore_carrier_loss_set = false; + return 0; + } + + r = parse_boolean(rvalue); + if (r >= 0) { + network->ignore_carrier_loss_set = true; + network->ignore_carrier_loss_usec = r > 0 ? USEC_INFINITY : 0; + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + network->ignore_carrier_loss_set = true; + network->ignore_carrier_loss_usec = usec; + return 0; +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_required_family_for_online, link_required_address_family, AddressFamily, + "Failed to parse RequiredFamilyForOnline= setting"); + +DEFINE_CONFIG_PARSE_ENUM(config_parse_keep_configuration, keep_configuration, KeepConfiguration, + "Failed to parse KeepConfiguration= setting"); + +static const char* const keep_configuration_table[_KEEP_CONFIGURATION_MAX] = { + [KEEP_CONFIGURATION_NO] = "no", + [KEEP_CONFIGURATION_DHCP_ON_STOP] = "dhcp-on-stop", + [KEEP_CONFIGURATION_DHCP] = "dhcp", + [KEEP_CONFIGURATION_STATIC] = "static", + [KEEP_CONFIGURATION_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(keep_configuration, KeepConfiguration, KEEP_CONFIGURATION_YES); + +static const char* const activation_policy_table[_ACTIVATION_POLICY_MAX] = { + [ACTIVATION_POLICY_UP] = "up", + [ACTIVATION_POLICY_ALWAYS_UP] = "always-up", + [ACTIVATION_POLICY_MANUAL] = "manual", + 
[ACTIVATION_POLICY_ALWAYS_DOWN] = "always-down", + [ACTIVATION_POLICY_DOWN] = "down", + [ACTIVATION_POLICY_BOUND] = "bound", +}; + +DEFINE_STRING_TABLE_LOOKUP(activation_policy, ActivationPolicy); +DEFINE_CONFIG_PARSE_ENUM(config_parse_activation_policy, activation_policy, ActivationPolicy, "Failed to parse activation policy"); diff --git a/src/network/networkd-network.h b/src/network/networkd-network.h new file mode 100644 index 0000000..03131b7 --- /dev/null +++ b/src/network/networkd-network.h @@ -0,0 +1,440 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-lldp-tx.h" + +#include "bridge.h" +#include "condition.h" +#include "conf-parser.h" +#include "firewall-util.h" +#include "hashmap.h" +#include "ipoib.h" +#include "net-condition.h" +#include "netdev.h" +#include "networkd-address.h" +#include "networkd-bridge-vlan.h" +#include "networkd-dhcp-common.h" +#include "networkd-dhcp4.h" +#include "networkd-dhcp6.h" +#include "networkd-ipv6ll.h" +#include "networkd-lldp-rx.h" +#include "networkd-ndisc.h" +#include "networkd-radv.h" +#include "networkd-sysctl.h" +#include "networkd-util.h" +#include "ordered-set.h" +#include "resolve-util.h" +#include "socket-netlink.h" + +typedef enum KeepConfiguration { + KEEP_CONFIGURATION_NO = 0, + KEEP_CONFIGURATION_DHCP_ON_START = 1 << 0, + KEEP_CONFIGURATION_DHCP_ON_STOP = 1 << 1, + KEEP_CONFIGURATION_DHCP = KEEP_CONFIGURATION_DHCP_ON_START | KEEP_CONFIGURATION_DHCP_ON_STOP, + KEEP_CONFIGURATION_STATIC = 1 << 2, + KEEP_CONFIGURATION_YES = KEEP_CONFIGURATION_DHCP | KEEP_CONFIGURATION_STATIC, + _KEEP_CONFIGURATION_MAX, + _KEEP_CONFIGURATION_INVALID = -EINVAL, +} KeepConfiguration; + +typedef enum ActivationPolicy { + ACTIVATION_POLICY_UP, + ACTIVATION_POLICY_ALWAYS_UP, + ACTIVATION_POLICY_MANUAL, + ACTIVATION_POLICY_ALWAYS_DOWN, + ACTIVATION_POLICY_DOWN, + ACTIVATION_POLICY_BOUND, + _ACTIVATION_POLICY_MAX, + _ACTIVATION_POLICY_INVALID = 
-EINVAL, +} ActivationPolicy; + +typedef struct Manager Manager; + +typedef struct NetworkDHCPServerEmitAddress { + bool emit; + struct in_addr *addresses; + size_t n_addresses; +} NetworkDHCPServerEmitAddress; + +struct Network { + Manager *manager; + + unsigned n_ref; + + char *name; + char *filename; + char **dropins; + Hashmap *stats_by_path; + char *description; + + /* [Match] section */ + NetMatch match; + LIST_HEAD(Condition, conditions); + + /* Master or stacked netdevs */ + bool keep_master; + NetDev *batadv; + NetDev *bridge; + NetDev *bond; + NetDev *vrf; + NetDev *xfrm; + Hashmap *stacked_netdevs; + char *batadv_name; + char *bridge_name; + char *bond_name; + char *vrf_name; + Hashmap *stacked_netdev_names; + + /* [Link] section */ + struct hw_addr_data hw_addr; + uint32_t mtu; + int32_t group; + int arp; + int multicast; + int allmulticast; + int promiscuous; + bool unmanaged; + int required_for_online; /* Is this network required to be considered online? */ + LinkOperationalStateRange required_operstate_for_online; + AddressFamily required_family_for_online; + ActivationPolicy activation_policy; + + /* misc settings */ + bool configure_without_carrier; + bool ignore_carrier_loss_set; + usec_t ignore_carrier_loss_usec; /* timespan */ + KeepConfiguration keep_configuration; + char **bind_carrier; + bool default_route_on_device; + AddressFamily ip_masquerade; + + /* DHCP Client Support */ + AddressFamily dhcp; + struct in_addr dhcp_request_address; + DHCPClientIdentifier dhcp_client_identifier; + DUID dhcp_duid; + uint32_t dhcp_iaid; + bool dhcp_iaid_set; + char *dhcp_vendor_class_identifier; + char *dhcp_mudurl; + char **dhcp_user_class; + char *dhcp_hostname; + char *dhcp_label; + uint64_t dhcp_max_attempts; + uint32_t dhcp_route_metric; + bool dhcp_route_metric_set; + uint32_t dhcp_route_table; + bool dhcp_route_table_set; + usec_t dhcp_fallback_lease_lifetime_usec; + uint32_t dhcp_route_mtu; + uint16_t dhcp_client_port; + int dhcp_critical; + int 
dhcp_ip_service_type; + int dhcp_socket_priority; + bool dhcp_socket_priority_set; + bool dhcp_anonymize; + bool dhcp_send_hostname; + bool dhcp_send_hostname_set; + int dhcp_broadcast; + int dhcp_ipv6_only_mode; + int dhcp_use_rapid_commit; + bool dhcp_use_dns; + bool dhcp_use_dns_set; + bool dhcp_routes_to_dns; + bool dhcp_use_ntp; + bool dhcp_use_ntp_set; + bool dhcp_routes_to_ntp; + bool dhcp_use_sip; + bool dhcp_use_captive_portal; + bool dhcp_use_mtu; + bool dhcp_use_routes; + int dhcp_use_gateway; + bool dhcp_quickack; + uint32_t dhcp_initial_congestion_window; + uint32_t dhcp_advertised_receive_window; + bool dhcp_use_timezone; + bool dhcp_use_hostname; + bool dhcp_use_6rd; + bool dhcp_send_release; + bool dhcp_send_decline; + DHCPUseDomains dhcp_use_domains; + bool dhcp_use_domains_set; + Set *dhcp_deny_listed_ip; + Set *dhcp_allow_listed_ip; + Set *dhcp_request_options; + OrderedHashmap *dhcp_client_send_options; + OrderedHashmap *dhcp_client_send_vendor_options; + char *dhcp_netlabel; + NFTSetContext dhcp_nft_set_context; + + /* DHCPv6 Client support */ + bool dhcp6_use_address; + bool dhcp6_use_pd_prefix; + bool dhcp6_send_hostname; + bool dhcp6_send_hostname_set; + bool dhcp6_use_dns; + bool dhcp6_use_dns_set; + bool dhcp6_use_hostname; + bool dhcp6_use_ntp; + bool dhcp6_use_ntp_set; + bool dhcp6_use_captive_portal; + bool dhcp6_use_rapid_commit; + DHCPUseDomains dhcp6_use_domains; + bool dhcp6_use_domains_set; + uint32_t dhcp6_iaid; + bool dhcp6_iaid_set; + bool dhcp6_iaid_set_explicitly; + DUID dhcp6_duid; + uint8_t dhcp6_pd_prefix_length; + struct in6_addr dhcp6_pd_prefix_hint; + char *dhcp6_hostname; + char *dhcp6_mudurl; + char **dhcp6_user_class; + char **dhcp6_vendor_class; + DHCP6ClientStartMode dhcp6_client_start_mode; + OrderedHashmap *dhcp6_client_send_options; + OrderedHashmap *dhcp6_client_send_vendor_options; + Set *dhcp6_request_options; + char *dhcp6_netlabel; + bool dhcp6_send_release; + NFTSetContext dhcp6_nft_set_context; + + /* DHCP 
Server Support */ + bool dhcp_server; + bool dhcp_server_bind_to_interface; + unsigned char dhcp_server_address_prefixlen; + struct in_addr dhcp_server_address_in_addr; + const Address *dhcp_server_address; + int dhcp_server_uplink_index; + char *dhcp_server_uplink_name; + struct in_addr dhcp_server_relay_target; + char *dhcp_server_relay_agent_circuit_id; + char *dhcp_server_relay_agent_remote_id; + NetworkDHCPServerEmitAddress dhcp_server_emit[_SD_DHCP_LEASE_SERVER_TYPE_MAX]; + bool dhcp_server_emit_router; + struct in_addr dhcp_server_router; + bool dhcp_server_emit_timezone; + char *dhcp_server_timezone; + usec_t dhcp_server_default_lease_time_usec, dhcp_server_max_lease_time_usec; + uint32_t dhcp_server_pool_offset; + uint32_t dhcp_server_pool_size; + OrderedHashmap *dhcp_server_send_options; + OrderedHashmap *dhcp_server_send_vendor_options; + struct in_addr dhcp_server_boot_server_address; + char *dhcp_server_boot_server_name; + char *dhcp_server_boot_filename; + usec_t dhcp_server_ipv6_only_preferred_usec; + bool dhcp_server_rapid_commit; + + /* link-local addressing support */ + AddressFamily link_local; + IPv6LinkLocalAddressGenMode ipv6ll_address_gen_mode; + struct in6_addr ipv6ll_stable_secret; + struct in_addr ipv4ll_start_address; + bool ipv4ll_route; + + /* IPv6 RA support */ + RADVPrefixDelegation router_prefix_delegation; + usec_t router_lifetime_usec; + uint8_t router_preference; + usec_t router_retransmit_usec; + uint8_t router_hop_limit; + bool router_managed; + bool router_other_information; + bool router_emit_dns; + bool router_emit_domains; + usec_t router_dns_lifetime_usec; + struct in6_addr *router_dns; + unsigned n_router_dns; + OrderedSet *router_search_domains; + int router_uplink_index; + char *router_uplink_name; + /* Mobile IPv6 Home Agent */ + bool router_home_agent_information; + uint16_t router_home_agent_preference; + usec_t home_agent_lifetime_usec; + + /* DHCP Prefix Delegation support */ + int dhcp_pd; + bool dhcp_pd_announce; 
+ bool dhcp_pd_assign; + bool dhcp_pd_manage_temporary_address; + int64_t dhcp_pd_subnet_id; + uint32_t dhcp_pd_route_metric; + Set *dhcp_pd_tokens; + int dhcp_pd_uplink_index; + char *dhcp_pd_uplink_name; + char *dhcp_pd_netlabel; + NFTSetContext dhcp_pd_nft_set_context; + + /* Bridge Support */ + int use_bpdu; + int hairpin; + int isolated; + int fast_leave; + int allow_port_to_be_root; + int unicast_flood; + int multicast_flood; + int multicast_to_unicast; + int neighbor_suppression; + int learning; + int bridge_proxy_arp; + int bridge_proxy_arp_wifi; + uint32_t cost; + uint16_t priority; + MulticastRouter multicast_router; + + /* Bridge VLAN */ + bool use_br_vlan; + uint16_t pvid; + uint32_t br_vid_bitmap[BRIDGE_VLAN_BITMAP_LEN]; + uint32_t br_untagged_bitmap[BRIDGE_VLAN_BITMAP_LEN]; + + /* CAN support */ + uint32_t can_bitrate; + unsigned can_sample_point; + nsec_t can_time_quanta_ns; + uint32_t can_propagation_segment; + uint32_t can_phase_buffer_segment_1; + uint32_t can_phase_buffer_segment_2; + uint32_t can_sync_jump_width; + uint32_t can_data_bitrate; + unsigned can_data_sample_point; + nsec_t can_data_time_quanta_ns; + uint32_t can_data_propagation_segment; + uint32_t can_data_phase_buffer_segment_1; + uint32_t can_data_phase_buffer_segment_2; + uint32_t can_data_sync_jump_width; + usec_t can_restart_us; + uint32_t can_control_mode_mask; + uint32_t can_control_mode_flags; + uint16_t can_termination; + bool can_termination_set; + + /* IPoIB support */ + IPoIBMode ipoib_mode; + int ipoib_umcast; + + /* sysctl settings */ + AddressFamily ip_forward; + int ipv4_accept_local; + int ipv4_route_localnet; + int ipv6_dad_transmits; + uint8_t ipv6_hop_limit; + int proxy_arp; + uint32_t ipv6_mtu; + IPv6PrivacyExtensions ipv6_privacy_extensions; + IPReversePathFilter ipv4_rp_filter; + int ipv6_proxy_ndp; + Set *ipv6_proxy_ndp_addresses; + + /* IPv6 accept RA */ + int ipv6_accept_ra; + bool ipv6_accept_ra_use_dns; + bool ipv6_accept_ra_use_gateway; + bool 
ipv6_accept_ra_use_route_prefix; + bool ipv6_accept_ra_use_autonomous_prefix; + bool ipv6_accept_ra_use_onlink_prefix; + bool ipv6_accept_ra_use_mtu; + bool ipv6_accept_ra_use_hop_limit; + bool ipv6_accept_ra_use_icmp6_ratelimit; + bool ipv6_accept_ra_quickack; + bool ipv6_accept_ra_use_captive_portal; + bool ipv6_accept_ra_use_pref64; + bool active_slave; + bool primary_slave; + DHCPUseDomains ipv6_accept_ra_use_domains; + IPv6AcceptRAStartDHCP6Client ipv6_accept_ra_start_dhcp6_client; + uint32_t ipv6_accept_ra_route_table; + bool ipv6_accept_ra_route_table_set; + uint32_t ipv6_accept_ra_route_metric_high; + uint32_t ipv6_accept_ra_route_metric_medium; + uint32_t ipv6_accept_ra_route_metric_low; + bool ipv6_accept_ra_route_metric_set; + Set *ndisc_deny_listed_router; + Set *ndisc_allow_listed_router; + Set *ndisc_deny_listed_prefix; + Set *ndisc_allow_listed_prefix; + Set *ndisc_deny_listed_route_prefix; + Set *ndisc_allow_listed_route_prefix; + Set *ndisc_tokens; + char *ndisc_netlabel; + NFTSetContext ndisc_nft_set_context; + + /* LLDP support */ + LLDPMode lldp_mode; /* LLDP reception */ + sd_lldp_multicast_mode_t lldp_multicast_mode; /* LLDP transmission */ + char *lldp_mudurl; /* LLDP MUD URL */ + + OrderedHashmap *addresses_by_section; + Hashmap *routes_by_section; + Hashmap *nexthops_by_section; + Hashmap *bridge_fdb_entries_by_section; + Hashmap *bridge_mdb_entries_by_section; + OrderedHashmap *neighbors_by_section; + Hashmap *address_labels_by_section; + Hashmap *prefixes_by_section; + Hashmap *route_prefixes_by_section; + Hashmap *pref64_prefixes_by_section; + Hashmap *rules_by_section; + Hashmap *dhcp_static_leases_by_section; + Hashmap *qdiscs_by_section; + Hashmap *tclasses_by_section; + OrderedHashmap *sr_iov_by_section; + + /* All kinds of DNS configuration */ + struct in_addr_full **dns; + unsigned n_dns; + OrderedSet *search_domains, *route_domains; + int dns_default_route; + ResolveSupport llmnr; + ResolveSupport mdns; + DnssecMode dnssec_mode; + 
DnsOverTlsMode dns_over_tls_mode; + Set *dnssec_negative_trust_anchors; + + /* NTP */ + char **ntp; +}; + +Network *network_ref(Network *network); +Network *network_unref(Network *network); +DEFINE_TRIVIAL_CLEANUP_FUNC(Network*, network_unref); + +int network_load(Manager *manager, OrderedHashmap **networks); +int network_reload(Manager *manager); +int network_load_one(Manager *manager, OrderedHashmap **networks, const char *filename); +int network_verify(Network *network); + +int manager_build_dhcp_pd_subnet_ids(Manager *manager); + +int network_get_by_name(Manager *manager, const char *name, Network **ret); +void network_apply_anonymize_if_set(Network *network); + +bool network_has_static_ipv6_configurations(Network *network); + +CONFIG_PARSER_PROTOTYPE(config_parse_stacked_netdev); +CONFIG_PARSER_PROTOTYPE(config_parse_tunnel); +CONFIG_PARSER_PROTOTYPE(config_parse_domains); +CONFIG_PARSER_PROTOTYPE(config_parse_dns); +CONFIG_PARSER_PROTOTYPE(config_parse_timezone); +CONFIG_PARSER_PROTOTYPE(config_parse_dnssec_negative_trust_anchors); +CONFIG_PARSER_PROTOTYPE(config_parse_ntp); +CONFIG_PARSER_PROTOTYPE(config_parse_required_for_online); +CONFIG_PARSER_PROTOTYPE(config_parse_required_family_for_online); +CONFIG_PARSER_PROTOTYPE(config_parse_keep_configuration); +CONFIG_PARSER_PROTOTYPE(config_parse_activation_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_link_group); +CONFIG_PARSER_PROTOTYPE(config_parse_ignore_carrier_loss); + +const struct ConfigPerfItem* network_network_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +const char* keep_configuration_to_string(KeepConfiguration i) _const_; +KeepConfiguration keep_configuration_from_string(const char *s) _pure_; + +const char* activation_policy_to_string(ActivationPolicy i) _const_; +ActivationPolicy activation_policy_from_string(const char *s) _pure_; diff --git a/src/network/networkd-nexthop.c b/src/network/networkd-nexthop.c new file mode 100644 index 0000000..e2ded28 --- /dev/null +++ 
b/src/network/networkd-nexthop.c @@ -0,0 +1,1384 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. + */ + +#include +#include + +#include "alloc-util.h" +#include "netlink-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-nexthop.h" +#include "networkd-queue.h" +#include "networkd-route-util.h" +#include "parse-util.h" +#include "set.h" +#include "stdio-util.h" +#include "string-util.h" + +NextHop *nexthop_free(NextHop *nexthop) { + if (!nexthop) + return NULL; + + if (nexthop->network) { + assert(nexthop->section); + hashmap_remove(nexthop->network->nexthops_by_section, nexthop->section); + } + + config_section_free(nexthop->section); + + if (nexthop->link) { + set_remove(nexthop->link->nexthops, nexthop); + + if (nexthop->link->manager && nexthop->id > 0) + hashmap_remove(nexthop->link->manager->nexthops_by_id, UINT32_TO_PTR(nexthop->id)); + } + + if (nexthop->manager) { + set_remove(nexthop->manager->nexthops, nexthop); + + if (nexthop->id > 0) + hashmap_remove(nexthop->manager->nexthops_by_id, UINT32_TO_PTR(nexthop->id)); + } + + hashmap_free_free(nexthop->group); + + return mfree(nexthop); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(NextHop, nexthop_free); + +static int nexthop_new(NextHop **ret) { + _cleanup_(nexthop_freep) NextHop *nexthop = NULL; + + nexthop = new(NextHop, 1); + if (!nexthop) + return -ENOMEM; + + *nexthop = (NextHop) { + .family = AF_UNSPEC, + .onlink = -1, + }; + + *ret = TAKE_PTR(nexthop); + + return 0; +} + +static int nexthop_new_static(Network *network, const char *filename, unsigned section_line, NextHop **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(nexthop_freep) NextHop *nexthop = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + nexthop = 
hashmap_get(network->nexthops_by_section, n); + if (nexthop) { + *ret = TAKE_PTR(nexthop); + return 0; + } + + r = nexthop_new(&nexthop); + if (r < 0) + return r; + + nexthop->protocol = RTPROT_STATIC; + nexthop->network = network; + nexthop->section = TAKE_PTR(n); + nexthop->source = NETWORK_CONFIG_SOURCE_STATIC; + + r = hashmap_ensure_put(&network->nexthops_by_section, &config_section_hash_ops, nexthop->section, nexthop); + if (r < 0) + return r; + + *ret = TAKE_PTR(nexthop); + return 0; +} + +static void nexthop_hash_func(const NextHop *nexthop, struct siphash *state) { + assert(nexthop); + + siphash24_compress(&nexthop->protocol, sizeof(nexthop->protocol), state); + siphash24_compress(&nexthop->id, sizeof(nexthop->id), state); + siphash24_compress(&nexthop->blackhole, sizeof(nexthop->blackhole), state); + siphash24_compress(&nexthop->family, sizeof(nexthop->family), state); + + switch (nexthop->family) { + case AF_INET: + case AF_INET6: + siphash24_compress(&nexthop->gw, FAMILY_ADDRESS_SIZE(nexthop->family), state); + + break; + default: + /* treat any other address family as AF_UNSPEC */ + break; + } +} + +static int nexthop_compare_func(const NextHop *a, const NextHop *b) { + int r; + + r = CMP(a->protocol, b->protocol); + if (r != 0) + return r; + + r = CMP(a->id, b->id); + if (r != 0) + return r; + + r = CMP(a->blackhole, b->blackhole); + if (r != 0) + return r; + + r = CMP(a->family, b->family); + if (r != 0) + return r; + + if (IN_SET(a->family, AF_INET, AF_INET6)) + return memcmp(&a->gw, &b->gw, FAMILY_ADDRESS_SIZE(a->family)); + + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + nexthop_hash_ops, + NextHop, + nexthop_hash_func, + nexthop_compare_func, + nexthop_free); + +static bool nexthop_equal(const NextHop *a, const NextHop *b) { + if (a == b) + return true; + + if (!a || !b) + return false; + + return nexthop_compare_func(a, b) == 0; +} + +static int nexthop_dup(const NextHop *src, NextHop **ret) { + _cleanup_(nexthop_freep) NextHop 
*dest = NULL; + struct nexthop_grp *nhg; + int r; + + assert(src); + assert(ret); + + dest = newdup(NextHop, src, 1); + if (!dest) + return -ENOMEM; + + /* unset all pointers */ + dest->manager = NULL; + dest->link = NULL; + dest->network = NULL; + dest->section = NULL; + dest->group = NULL; + + HASHMAP_FOREACH(nhg, src->group) { + _cleanup_free_ struct nexthop_grp *g = NULL; + + g = newdup(struct nexthop_grp, nhg, 1); + if (!g) + return -ENOMEM; + + r = hashmap_ensure_put(&dest->group, NULL, UINT32_TO_PTR(g->id), g); + if (r < 0) + return r; + if (r > 0) + TAKE_PTR(g); + } + + *ret = TAKE_PTR(dest); + return 0; +} + +int manager_get_nexthop_by_id(Manager *manager, uint32_t id, NextHop **ret) { + NextHop *nh; + + assert(manager); + + if (id == 0) + return -EINVAL; + + nh = hashmap_get(manager->nexthops_by_id, UINT32_TO_PTR(id)); + if (!nh) + return -ENOENT; + + if (ret) + *ret = nh; + return 0; +} + +static bool nexthop_owned_by_link(const NextHop *nexthop) { + return !nexthop->blackhole && hashmap_isempty(nexthop->group); +} + +static int nexthop_get(Manager *manager, Link *link, NextHop *in, NextHop **ret) { + NextHop *nexthop; + Set *nexthops; + + assert(in); + + if (nexthop_owned_by_link(in)) { + if (!link) + return -ENOENT; + + nexthops = link->nexthops; + } else { + if (!manager) + return -ENOENT; + + nexthops = manager->nexthops; + } + + nexthop = set_get(nexthops, in); + if (nexthop) { + if (ret) + *ret = nexthop; + return 0; + } + + if (in->id > 0) + return -ENOENT; + + /* Also find nexthop configured without ID. 
*/ + SET_FOREACH(nexthop, nexthops) { + uint32_t id; + bool found; + + id = nexthop->id; + nexthop->id = 0; + found = nexthop_equal(nexthop, in); + nexthop->id = id; + + if (!found) + continue; + + if (ret) + *ret = nexthop; + return 0; + } + + return -ENOENT; +} + +static int nexthop_add(Manager *manager, Link *link, NextHop *nexthop) { + int r; + + assert(nexthop); + assert(nexthop->id > 0); + + if (nexthop_owned_by_link(nexthop)) { + assert(link); + + r = set_ensure_put(&link->nexthops, &nexthop_hash_ops, nexthop); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + nexthop->link = link; + + manager = link->manager; + } else { + assert(manager); + + r = set_ensure_put(&manager->nexthops, &nexthop_hash_ops, nexthop); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + nexthop->manager = manager; + } + + return hashmap_ensure_put(&manager->nexthops_by_id, NULL, UINT32_TO_PTR(nexthop->id), nexthop); +} + +static int nexthop_acquire_id(Manager *manager, NextHop *nexthop) { + _cleanup_set_free_ Set *ids = NULL; + Network *network; + uint32_t id; + int r; + + assert(manager); + assert(nexthop); + + if (nexthop->id > 0) + return 0; + + /* Find the lowest unused ID. */ + + ORDERED_HASHMAP_FOREACH(network, manager->networks) { + NextHop *tmp; + + HASHMAP_FOREACH(tmp, network->nexthops_by_section) { + if (tmp->id == 0) + continue; + + r = set_ensure_put(&ids, NULL, UINT32_TO_PTR(tmp->id)); + if (r < 0) + return r; + } + } + + for (id = 1; id < UINT32_MAX; id++) { + if (manager_get_nexthop_by_id(manager, id, NULL) >= 0) + continue; + if (set_contains(ids, UINT32_TO_PTR(id))) + continue; + break; + } + + nexthop->id = id; + return 0; +} + +static void log_nexthop_debug(const NextHop *nexthop, const char *str, const Link *link) { + _cleanup_free_ char *state = NULL, *group = NULL, *flags = NULL; + struct nexthop_grp *nhg; + + assert(nexthop); + assert(str); + + /* link may be NULL. 
*/ + + if (!DEBUG_LOGGING) + return; + + (void) network_config_state_to_string_alloc(nexthop->state, &state); + (void) route_flags_to_string_alloc(nexthop->flags, &flags); + + HASHMAP_FOREACH(nhg, nexthop->group) + (void) strextendf_with_separator(&group, ",", "%"PRIu32":%"PRIu32, nhg->id, nhg->weight+1u); + + log_link_debug(link, "%s %s nexthop (%s): id: %"PRIu32", gw: %s, blackhole: %s, group: %s, flags: %s", + str, strna(network_config_source_to_string(nexthop->source)), strna(state), + nexthop->id, + IN_ADDR_TO_STRING(nexthop->family, &nexthop->gw), + yes_no(nexthop->blackhole), strna(group), strna(flags)); +} + +static int nexthop_remove_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + + /* link may be NULL. */ + + if (link && IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 1; + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -ENOENT) + log_link_message_warning_errno(link, m, r, "Could not drop nexthop, ignoring"); + + return 1; +} + +static int nexthop_remove(NextHop *nexthop) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + Manager *manager; + Link *link; + int r; + + assert(nexthop); + assert(nexthop->manager || (nexthop->link && nexthop->link->manager)); + + /* link may be NULL. */ + link = nexthop->link; + manager = nexthop->manager ?: nexthop->link->manager; + + if (nexthop->id == 0) { + log_link_debug(link, "Cannot remove nexthop without valid ID, ignoring."); + return 0; + } + + log_nexthop_debug(nexthop, "Removing", link); + + r = sd_rtnl_message_new_nexthop(manager->rtnl, &m, RTM_DELNEXTHOP, AF_UNSPEC, RTPROT_UNSPEC); + if (r < 0) + return log_link_error_errno(link, r, "Could not create RTM_DELNEXTHOP message: %m"); + + r = sd_netlink_message_append_u32(m, NHA_ID, nexthop->id); + if (r < 0) + return log_link_error_errno(link, r, "Could not append NHA_ID attribute: %m"); + + r = netlink_call_async(manager->rtnl, NULL, m, nexthop_remove_handler, + link ? 
link_netlink_destroy_callback : NULL, link); + if (r < 0) + return log_link_error_errno(link, r, "Could not send rtnetlink message: %m"); + + link_ref(link); /* link may be NULL, link_ref() is OK with that */ + + nexthop_enter_removing(nexthop); + return 0; +} + +static int nexthop_configure(NextHop *nexthop, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(nexthop); + assert(IN_SET(nexthop->family, AF_UNSPEC, AF_INET, AF_INET6)); + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(link->ifindex > 0); + assert(req); + + log_nexthop_debug(nexthop, "Configuring", link); + + r = sd_rtnl_message_new_nexthop(link->manager->rtnl, &m, RTM_NEWNEXTHOP, nexthop->family, nexthop->protocol); + if (r < 0) + return r; + + if (nexthop->id > 0) { + r = sd_netlink_message_append_u32(m, NHA_ID, nexthop->id); + if (r < 0) + return r; + } + + if (!hashmap_isempty(nexthop->group)) { + _cleanup_free_ struct nexthop_grp *group = NULL; + struct nexthop_grp *p, *nhg; + + group = new(struct nexthop_grp, hashmap_size(nexthop->group)); + if (!group) + return log_oom(); + + p = group; + HASHMAP_FOREACH(nhg, nexthop->group) + *p++ = *nhg; + + r = sd_netlink_message_append_data(m, NHA_GROUP, group, sizeof(struct nexthop_grp) * hashmap_size(nexthop->group)); + if (r < 0) + return r; + + } else if (nexthop->blackhole) { + r = sd_netlink_message_append_flag(m, NHA_BLACKHOLE); + if (r < 0) + return r; + } else { + r = sd_netlink_message_append_u32(m, NHA_OIF, link->ifindex); + if (r < 0) + return r; + + if (in_addr_is_set(nexthop->family, &nexthop->gw)) { + r = netlink_message_append_in_addr_union(m, NHA_GATEWAY, nexthop->family, &nexthop->gw); + if (r < 0) + return r; + + r = sd_rtnl_message_nexthop_set_flags(m, nexthop->flags & RTNH_F_ONLINK); + if (r < 0) + return r; + } + } + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static int static_nexthop_handler(sd_netlink *rtnl, 
sd_netlink_message *m, Request *req, Link *link, NextHop *nexthop) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not set nexthop"); + link_enter_failed(link); + return 1; + } + + if (link->static_nexthop_messages == 0) { + log_link_debug(link, "Nexthops set"); + link->static_nexthops_configured = true; + link_check_ready(link); + } + + return 1; +} + +static bool nexthop_is_ready_to_configure(Link *link, const NextHop *nexthop) { + struct nexthop_grp *nhg; + + assert(link); + assert(nexthop); + + if (!link_is_ready_to_configure(link, false)) + return false; + + if (nexthop_owned_by_link(nexthop)) { + /* TODO: fdb nexthop does not require IFF_UP. The conditions below needs to be updated + * when fdb nexthop support is added. See rtm_to_nh_config() in net/ipv4/nexthop.c of + * kernel. */ + if (link->set_flags_messages > 0) + return false; + if (!FLAGS_SET(link->flags, IFF_UP)) + return false; + } + + /* All group members must be configured first. */ + HASHMAP_FOREACH(nhg, nexthop->group) { + NextHop *g; + + if (manager_get_nexthop_by_id(link->manager, nhg->id, &g) < 0) + return false; + + if (!nexthop_exists(g)) + return false; + } + + if (nexthop->id == 0) { + Request *req; + + ORDERED_SET_FOREACH(req, link->manager->request_queue) { + if (req->type != REQUEST_TYPE_NEXTHOP) + continue; + if (((NextHop*) req->userdata)->id != 0) + return false; /* first configure nexthop with id. 
*/ + } + } + + return gateway_is_ready(link, FLAGS_SET(nexthop->flags, RTNH_F_ONLINK), nexthop->family, &nexthop->gw); +} + +static int nexthop_process_request(Request *req, Link *link, NextHop *nexthop) { + int r; + + assert(req); + assert(link); + assert(nexthop); + + if (!nexthop_is_ready_to_configure(link, nexthop)) + return 0; + + r = nexthop_configure(nexthop, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure nexthop"); + + nexthop_enter_configuring(nexthop); + return 1; +} + +static int link_request_nexthop(Link *link, NextHop *nexthop) { + NextHop *existing; + int r; + + assert(link); + assert(nexthop); + assert(nexthop->source != NETWORK_CONFIG_SOURCE_FOREIGN); + + if (nexthop_get(link->manager, link, nexthop, &existing) < 0) { + _cleanup_(nexthop_freep) NextHop *tmp = NULL; + + r = nexthop_dup(nexthop, &tmp); + if (r < 0) + return r; + + r = nexthop_acquire_id(link->manager, tmp); + if (r < 0) + return r; + + r = nexthop_add(link->manager, link, tmp); + if (r < 0) + return r; + + existing = TAKE_PTR(tmp); + } else + existing->source = nexthop->source; + + log_nexthop_debug(existing, "Requesting", link); + r = link_queue_request_safe(link, REQUEST_TYPE_NEXTHOP, + existing, NULL, + nexthop_hash_func, + nexthop_compare_func, + nexthop_process_request, + &link->static_nexthop_messages, + static_nexthop_handler, + NULL); + if (r <= 0) + return r; + + nexthop_enter_requesting(existing); + return 1; +} + +int link_request_static_nexthops(Link *link, bool only_ipv4) { + NextHop *nh; + int r; + + assert(link); + assert(link->network); + + link->static_nexthops_configured = false; + + HASHMAP_FOREACH(nh, link->network->nexthops_by_section) { + if (only_ipv4 && nh->family != AF_INET) + continue; + + r = link_request_nexthop(link, nh); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request nexthop: %m"); + } + + if (link->static_nexthop_messages == 0) { + link->static_nexthops_configured = true; + 
link_check_ready(link); + } else { + log_link_debug(link, "Requesting nexthops"); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + +static void manager_mark_nexthops(Manager *manager, bool foreign, const Link *except) { + NextHop *nexthop; + Link *link; + + assert(manager); + + /* First, mark all nexthops. */ + SET_FOREACH(nexthop, manager->nexthops) { + /* do not touch nexthop created by the kernel */ + if (nexthop->protocol == RTPROT_KERNEL) + continue; + + /* When 'foreign' is true, mark only foreign nexthops, and vice versa. */ + if (foreign != (nexthop->source == NETWORK_CONFIG_SOURCE_FOREIGN)) + continue; + + /* Ignore nexthops not assigned yet or already removed. */ + if (!nexthop_exists(nexthop)) + continue; + + nexthop_mark(nexthop); + } + + /* Then, unmark all nexthops requested by active links. */ + HASHMAP_FOREACH(link, manager->links_by_index) { + if (link == except) + continue; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + continue; + + HASHMAP_FOREACH(nexthop, link->network->nexthops_by_section) { + NextHop *existing; + + if (nexthop_get(manager, NULL, nexthop, &existing) >= 0) + nexthop_unmark(existing); + } + } +} + +static int manager_drop_marked_nexthops(Manager *manager) { + NextHop *nexthop; + int r = 0; + + assert(manager); + + SET_FOREACH(nexthop, manager->nexthops) { + if (!nexthop_is_marked(nexthop)) + continue; + + RET_GATHER(r, nexthop_remove(nexthop)); + } + + return r; +} + +int link_drop_foreign_nexthops(Link *link) { + NextHop *nexthop; + int r = 0; + + assert(link); + assert(link->manager); + assert(link->network); + + /* First, mark all nexthops. */ + SET_FOREACH(nexthop, link->nexthops) { + /* do not touch nexthop created by the kernel */ + if (nexthop->protocol == RTPROT_KERNEL) + continue; + + /* Do not remove nexthops we configured. */ + if (nexthop->source != NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore nexthops not assigned yet or already removed. 
*/ + if (!nexthop_exists(nexthop)) + continue; + + nexthop_mark(nexthop); + } + + /* Then, unmark all nexthops requested by active links. */ + HASHMAP_FOREACH(nexthop, link->network->nexthops_by_section) { + NextHop *existing; + + if (nexthop_get(NULL, link, nexthop, &existing) >= 0) + nexthop_unmark(existing); + } + + /* Finally, remove all marked rules. */ + SET_FOREACH(nexthop, link->nexthops) { + if (!nexthop_is_marked(nexthop)) + continue; + + RET_GATHER(r, nexthop_remove(nexthop)); + } + + manager_mark_nexthops(link->manager, /* foreign = */ true, NULL); + + return RET_GATHER(r, manager_drop_marked_nexthops(link->manager)); +} + +int link_drop_managed_nexthops(Link *link) { + NextHop *nexthop; + int r = 0; + + assert(link); + assert(link->manager); + + SET_FOREACH(nexthop, link->nexthops) { + /* do not touch nexthop created by the kernel */ + if (nexthop->protocol == RTPROT_KERNEL) + continue; + + /* Do not touch addresses managed by kernel or other tools. */ + if (nexthop->source == NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore nexthops not assigned yet or already removing. 
*/ + if (!nexthop_exists(nexthop)) + continue; + + RET_GATHER(r, nexthop_remove(nexthop)); + } + + manager_mark_nexthops(link->manager, /* foreign = */ false, link); + + return RET_GATHER(r, manager_drop_marked_nexthops(link->manager)); +} + +void link_foreignize_nexthops(Link *link) { + NextHop *nexthop; + + assert(link); + + SET_FOREACH(nexthop, link->nexthops) + nexthop->source = NETWORK_CONFIG_SOURCE_FOREIGN; + + manager_mark_nexthops(link->manager, /* foreign = */ false, link); + + SET_FOREACH(nexthop, link->manager->nexthops) { + if (!nexthop_is_marked(nexthop)) + continue; + + nexthop->source = NETWORK_CONFIG_SOURCE_FOREIGN; + } +} + +int manager_rtnl_process_nexthop(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { + _cleanup_(nexthop_freep) NextHop *tmp = NULL; + _cleanup_free_ void *raw_group = NULL; + NextHop *nexthop = NULL; + size_t raw_group_size; + uint32_t ifindex; + uint16_t type; + Link *link = NULL; + int r; + + assert(rtnl); + assert(message); + assert(m); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: failed to receive rule message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWNEXTHOP, RTM_DELNEXTHOP)) { + log_warning("rtnl: received unexpected message type %u when processing nexthop, ignoring.", type); + return 0; + } + + r = sd_netlink_message_read_u32(message, NHA_OIF, &ifindex); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get NHA_OIF attribute, ignoring: %m"); + return 0; + } else if (r >= 0) { + if (ifindex <= 0) { + log_warning("rtnl: received nexthop message with invalid ifindex %"PRIu32", ignoring.", ifindex); + return 0; + } + + r = link_get_by_index(m, ifindex, &link); + if (r < 0) { + if (!m->enumerating) + 
log_warning("rtnl: received nexthop message for link (%"PRIu32") we do not know about, ignoring", ifindex); + return 0; + } + } + + r = nexthop_new(&tmp); + if (r < 0) + return log_oom(); + + r = sd_rtnl_message_get_family(message, &tmp->family); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: could not get nexthop family, ignoring: %m"); + return 0; + } else if (!IN_SET(tmp->family, AF_UNSPEC, AF_INET, AF_INET6)) { + log_link_debug(link, "rtnl: received nexthop message with invalid family %d, ignoring.", tmp->family); + return 0; + } + + r = sd_rtnl_message_nexthop_get_protocol(message, &tmp->protocol); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: could not get nexthop protocol, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_nexthop_get_flags(message, &tmp->flags); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: could not get nexthop flags, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_data(message, NHA_GROUP, &raw_group_size, &raw_group); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: could not get NHA_GROUP attribute, ignoring: %m"); + return 0; + } else if (r >= 0) { + struct nexthop_grp *group = raw_group; + size_t n_group; + + if (raw_group_size == 0 || raw_group_size % sizeof(struct nexthop_grp) != 0) { + log_link_warning(link, "rtnl: received nexthop message with invalid nexthop group size, ignoring."); + return 0; + } + + assert((uintptr_t) group % alignof(struct nexthop_grp) == 0); + + n_group = raw_group_size / sizeof(struct nexthop_grp); + for (size_t i = 0; i < n_group; i++) { + _cleanup_free_ struct nexthop_grp *nhg = NULL; + + if (group[i].id == 0) { + log_link_warning(link, "rtnl: received nexthop message with invalid ID in group, ignoring."); + return 0; + } + if (group[i].weight > 254) { + log_link_warning(link, "rtnl: received nexthop message with invalid weight in group, ignoring."); + return 0; + } + + nhg = newdup(struct nexthop_grp, group + i, 1); + if (!nhg) + 
return log_oom(); + + r = hashmap_ensure_put(&tmp->group, NULL, UINT32_TO_PTR(nhg->id), nhg); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to store nexthop group, ignoring: %m"); + return 0; + } + if (r > 0) + TAKE_PTR(nhg); + } + } + + if (tmp->family != AF_UNSPEC) { + r = netlink_message_read_in_addr_union(message, NHA_GATEWAY, tmp->family, &tmp->gw); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: could not get NHA_GATEWAY attribute, ignoring: %m"); + return 0; + } + } + + r = sd_netlink_message_has_flag(message, NHA_BLACKHOLE); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: could not get NHA_BLACKHOLE attribute, ignoring: %m"); + return 0; + } + tmp->blackhole = r; + + r = sd_netlink_message_read_u32(message, NHA_ID, &tmp->id); + if (r == -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received nexthop message without NHA_ID attribute, ignoring: %m"); + return 0; + } else if (r < 0) { + log_link_warning_errno(link, r, "rtnl: could not get NHA_ID attribute, ignoring: %m"); + return 0; + } else if (tmp->id == 0) { + log_link_warning(link, "rtnl: received nexthop message with invalid nexthop ID, ignoring: %m"); + return 0; + } + + /* All blackhole or group nexthops are managed by Manager. Note that the linux kernel does not + * set NHA_OID attribute when NHA_BLACKHOLE or NHA_GROUP is set. Just for safety. 
*/ + if (!nexthop_owned_by_link(tmp)) + link = NULL; + + (void) nexthop_get(m, link, tmp, &nexthop); + + switch (type) { + case RTM_NEWNEXTHOP: + if (nexthop) { + nexthop->flags = tmp->flags; + nexthop_enter_configured(nexthop); + log_nexthop_debug(tmp, "Received remembered", link); + } else { + nexthop_enter_configured(tmp); + log_nexthop_debug(tmp, "Remembering", link); + + r = nexthop_add(m, link, tmp); + if (r < 0) { + log_link_warning_errno(link, r, "Could not remember foreign nexthop, ignoring: %m"); + return 0; + } + + TAKE_PTR(tmp); + } + + break; + case RTM_DELNEXTHOP: + if (nexthop) { + nexthop_enter_removed(nexthop); + if (nexthop->state == 0) { + log_nexthop_debug(nexthop, "Forgetting", link); + nexthop_free(nexthop); + } else + log_nexthop_debug(nexthop, "Removed", link); + } else + log_nexthop_debug(tmp, "Kernel removed unknown", link); + break; + + default: + assert_not_reached(); + } + + return 1; +} + +static int nexthop_section_verify(NextHop *nh) { + if (section_is_invalid(nh->section)) + return -EINVAL; + + if (!hashmap_isempty(nh->group)) { + if (in_addr_is_set(nh->family, &nh->gw)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: nexthop group cannot have gateway address. " + "Ignoring [NextHop] section from line %u.", + nh->section->filename, nh->section->line); + + if (nh->family != AF_UNSPEC) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: nexthop group cannot have Family= setting. " + "Ignoring [NextHop] section from line %u.", + nh->section->filename, nh->section->line); + + if (nh->blackhole && in_addr_is_set(nh->family, &nh->gw)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: nexthop group cannot be a blackhole. " + "Ignoring [NextHop] section from line %u.", + nh->section->filename, nh->section->line); + } else if (nh->family == AF_UNSPEC) + /* When neither Family=, Gateway=, nor Group= is specified, assume IPv4. 
*/ + nh->family = AF_INET; + + if (nh->blackhole && in_addr_is_set(nh->family, &nh->gw)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: blackhole nexthop cannot have gateway address. " + "Ignoring [NextHop] section from line %u.", + nh->section->filename, nh->section->line); + + if (nh->onlink < 0 && in_addr_is_set(nh->family, &nh->gw) && + ordered_hashmap_isempty(nh->network->addresses_by_section)) { + /* If no address is configured, in most cases the gateway cannot be reachable. + * TODO: we may need to improve the condition above. */ + log_warning("%s: Gateway= without static address configured. " + "Enabling OnLink= option.", + nh->section->filename); + nh->onlink = true; + } + + if (nh->onlink >= 0) + SET_FLAG(nh->flags, RTNH_F_ONLINK, nh->onlink); + + return 0; +} + +void network_drop_invalid_nexthops(Network *network) { + NextHop *nh; + + assert(network); + + HASHMAP_FOREACH(nh, network->nexthops_by_section) + if (nexthop_section_verify(nh) < 0) + nexthop_free(nh); +} + +int config_parse_nexthop_id( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(nexthop_free_or_set_invalidp) NextHop *n = NULL; + Network *network = userdata; + uint32_t id; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = nexthop_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + n->id = 0; + TAKE_PTR(n); + return 0; + } + + r = safe_atou32(rvalue, &id); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse nexthop id \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + if (id == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid nexthop id \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + n->id = id; + TAKE_PTR(n); + return 0; +} + 
+int config_parse_nexthop_gateway( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(nexthop_free_or_set_invalidp) NextHop *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = nexthop_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + n->family = AF_UNSPEC; + n->gw = IN_ADDR_NULL; + + TAKE_PTR(n); + return 0; + } + + r = in_addr_from_string_auto(rvalue, &n->family, &n->gw); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid %s='%s', ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_nexthop_family( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(nexthop_free_or_set_invalidp) NextHop *n = NULL; + Network *network = userdata; + AddressFamily a; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = nexthop_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue) && + !in_addr_is_set(n->family, &n->gw)) { + /* Accept an empty string only when Gateway= is null or not specified. 
*/ + n->family = AF_UNSPEC; + TAKE_PTR(n); + return 0; + } + + a = nexthop_address_family_from_string(rvalue); + if (a < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid %s='%s', ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + if (in_addr_is_set(n->family, &n->gw) && + ((a == ADDRESS_FAMILY_IPV4 && n->family == AF_INET6) || + (a == ADDRESS_FAMILY_IPV6 && n->family == AF_INET))) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified family '%s' conflicts with the family of the previously specified Gateway=, " + "ignoring assignment.", rvalue); + return 0; + } + + switch (a) { + case ADDRESS_FAMILY_IPV4: + n->family = AF_INET; + break; + case ADDRESS_FAMILY_IPV6: + n->family = AF_INET6; + break; + default: + assert_not_reached(); + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_nexthop_onlink( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(nexthop_free_or_set_invalidp) NextHop *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = nexthop_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = parse_tristate(rvalue, &n->onlink); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_nexthop_blackhole( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(nexthop_free_or_set_invalidp) NextHop *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + 
assert(data); + + r = nexthop_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + n->blackhole = r; + + TAKE_PTR(n); + return 0; +} + +int config_parse_nexthop_group( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(nexthop_free_or_set_invalidp) NextHop *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = nexthop_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + n->group = hashmap_free_free(n->group); + TAKE_PTR(n); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ struct nexthop_grp *nhg = NULL; + _cleanup_free_ char *word = NULL; + uint32_t w; + char *sep; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + if (r == 0) + break; + + nhg = new0(struct nexthop_grp, 1); + if (!nhg) + return log_oom(); + + sep = strchr(word, ':'); + if (sep) { + *sep++ = '\0'; + r = safe_atou32(sep, &w); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse weight for nexthop group, ignoring assignment: %s:%s", + word, sep); + continue; + } + if (w == 0 || w > 256) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid weight for nexthop group, ignoring assignment: %s:%s", + word, sep); + continue; + } + /* See comments in config_parse_multipath_route(). 
*/ + nhg->weight = w - 1; + } + + r = safe_atou32(word, &nhg->id); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse nexthop ID in %s=, ignoring assignment: %s%s%s", + lvalue, word, sep ? ":" : "", strempty(sep)); + continue; + } + if (nhg->id == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Nexthop ID in %s= must be positive, ignoring assignment: %s%s%s", + lvalue, word, sep ? ":" : "", strempty(sep)); + continue; + } + + r = hashmap_ensure_put(&n->group, NULL, UINT32_TO_PTR(nhg->id), nhg); + if (r == -ENOMEM) + return log_oom(); + if (r == -EEXIST) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Nexthop ID %"PRIu32" is specified multiple times in %s=, ignoring assignment: %s%s%s", + nhg->id, lvalue, word, sep ? ":" : "", strempty(sep)); + continue; + } + assert(r > 0); + TAKE_PTR(nhg); + } + + TAKE_PTR(n); + return 0; +} diff --git a/src/network/networkd-nexthop.h b/src/network/networkd-nexthop.h new file mode 100644 index 0000000..6f2aa6f --- /dev/null +++ b/src/network/networkd-nexthop.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. + */ + +#pragma once + +#include + +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "hashmap.h" +#include "in-addr-util.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Manager Manager; +typedef struct Network Network; + +typedef struct NextHop { + Network *network; + Manager *manager; + Link *link; + ConfigSection *section; + NetworkConfigSource source; + NetworkConfigState state; + + uint8_t protocol; + + uint32_t id; + bool blackhole; + int family; + union in_addr_union gw; + uint8_t flags; + int onlink; /* Only used in conf parser and nexthop_section_verify(). 
*/ + Hashmap *group; +} NextHop; + +NextHop *nexthop_free(NextHop *nexthop); + +void network_drop_invalid_nexthops(Network *network); + +int link_drop_managed_nexthops(Link *link); +int link_drop_foreign_nexthops(Link *link); +void link_foreignize_nexthops(Link *link); + +int link_request_static_nexthops(Link *link, bool only_ipv4); + +int manager_get_nexthop_by_id(Manager *manager, uint32_t id, NextHop **ret); +int manager_rtnl_process_nexthop(sd_netlink *rtnl, sd_netlink_message *message, Manager *m); + +DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(NextHop, nexthop); + +CONFIG_PARSER_PROTOTYPE(config_parse_nexthop_id); +CONFIG_PARSER_PROTOTYPE(config_parse_nexthop_gateway); +CONFIG_PARSER_PROTOTYPE(config_parse_nexthop_family); +CONFIG_PARSER_PROTOTYPE(config_parse_nexthop_onlink); +CONFIG_PARSER_PROTOTYPE(config_parse_nexthop_blackhole); +CONFIG_PARSER_PROTOTYPE(config_parse_nexthop_group); diff --git a/src/network/networkd-queue.c b/src/network/networkd-queue.c new file mode 100644 index 0000000..1128987 --- /dev/null +++ b/src/network/networkd-queue.c @@ -0,0 +1,333 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "netdev.h" +#include "netlink-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "string-table.h" + +#define REPLY_CALLBACK_COUNT_THRESHOLD 128 + +static Request *request_free(Request *req) { + if (!req) + return NULL; + + /* To prevent from triggering assertions in the hash and compare functions, remove this request + * from the set before freeing userdata below. */ + if (req->manager) + ordered_set_remove(req->manager->request_queue, req); + + if (req->free_func) + req->free_func(req->userdata); + + if (req->counter) + (*req->counter)--; + + link_unref(req->link); /* link may be NULL, but link_unref() can handle it gracefully. 
*/ + + return mfree(req); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(Request, request, request_free); + +void request_detach(Manager *manager, Request *req) { + assert(manager); + + if (!req) + return; + + req = ordered_set_remove(manager->request_queue, req); + if (!req) + return; + + req->manager = NULL; + request_unref(req); +} + +static void request_destroy_callback(Request *req) { + assert(req); + + if (req->manager) + request_detach(req->manager, req); + + request_unref(req); +} + +static void request_hash_func(const Request *req, struct siphash *state) { + assert(req); + assert(state); + + siphash24_compress_boolean(req->link, state); + if (req->link) + siphash24_compress(&req->link->ifindex, sizeof(req->link->ifindex), state); + + siphash24_compress(&req->type, sizeof(req->type), state); + + siphash24_compress(&req->hash_func, sizeof(req->hash_func), state); + siphash24_compress(&req->compare_func, sizeof(req->compare_func), state); + + if (req->hash_func) + req->hash_func(req->userdata, state); +} + +static int request_compare_func(const struct Request *a, const struct Request *b) { + int r; + + assert(a); + assert(b); + + r = CMP(!!a->link, !!b->link); + if (r != 0) + return r; + + if (a->link) { + r = CMP(a->link->ifindex, b->link->ifindex); + if (r != 0) + return r; + } + + r = CMP(a->type, b->type); + if (r != 0) + return r; + + r = CMP(PTR_TO_UINT64(a->hash_func), PTR_TO_UINT64(b->hash_func)); + if (r != 0) + return r; + + r = CMP(PTR_TO_UINT64(a->compare_func), PTR_TO_UINT64(b->compare_func)); + if (r != 0) + return r; + + if (a->compare_func) + return a->compare_func(a->userdata, b->userdata); + + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + request_hash_ops, + Request, + request_hash_func, + request_compare_func, + request_unref); + +static int request_new( + Manager *manager, + Link *link, + RequestType type, + void *userdata, + mfree_func_t free_func, + hash_func_t hash_func, + compare_func_t compare_func, + request_process_func_t 
process, + unsigned *counter, + request_netlink_handler_t netlink_handler, + Request **ret) { + + _cleanup_(request_unrefp) Request *req = NULL; + Request *existing; + int r; + + assert(manager); + assert(process); + + req = new(Request, 1); + if (!req) + return -ENOMEM; + + *req = (Request) { + .n_ref = 1, + .link = link_ref(link), /* link may be NULL, but link_ref() handles it gracefully. */ + .type = type, + .userdata = userdata, + .hash_func = hash_func, + .compare_func = compare_func, + .process = process, + .netlink_handler = netlink_handler, + }; + + existing = ordered_set_get(manager->request_queue, req); + if (existing) { + if (ret) + *ret = existing; + return 0; + } + + r = ordered_set_ensure_put(&manager->request_queue, &request_hash_ops, req); + if (r < 0) + return r; + + req->manager = manager; + req->free_func = free_func; + req->counter = counter; + if (req->counter) + (*req->counter)++; + + if (ret) + *ret = req; + + TAKE_PTR(req); + return 1; +} + +int netdev_queue_request( + NetDev *netdev, + request_process_func_t process, + Request **ret) { + + int r; + + assert(netdev); + + r = request_new(netdev->manager, NULL, REQUEST_TYPE_NETDEV_INDEPENDENT, + netdev, (mfree_func_t) netdev_unref, + trivial_hash_func, trivial_compare_func, + process, NULL, NULL, ret); + if (r <= 0) + return r; + + netdev_ref(netdev); + return 1; +} + +int link_queue_request_full( + Link *link, + RequestType type, + void *userdata, + mfree_func_t free_func, + hash_func_t hash_func, + compare_func_t compare_func, + request_process_func_t process, + unsigned *counter, + request_netlink_handler_t netlink_handler, + Request **ret) { + + assert(link); + + return request_new(link->manager, link, type, + userdata, free_func, hash_func, compare_func, + process, counter, netlink_handler, ret); +} + +int manager_process_requests(sd_event_source *s, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + int r; + + for (;;) { + bool processed = false; + Request *req; + + 
ORDERED_SET_FOREACH(req, manager->request_queue) { + _cleanup_(link_unrefp) Link *link = link_ref(req->link); + + assert(req->process); + + if (req->waiting_reply) + continue; /* Waiting for netlink reply. */ + + /* Typically, requests send netlink message asynchronously. If there are many requests + * queued, then this event may make reply callback queue in sd-netlink full. */ + if (netlink_get_reply_callback_count(manager->rtnl) >= REPLY_CALLBACK_COUNT_THRESHOLD || + netlink_get_reply_callback_count(manager->genl) >= REPLY_CALLBACK_COUNT_THRESHOLD || + fw_ctx_get_reply_callback_count(manager->fw_ctx) >= REPLY_CALLBACK_COUNT_THRESHOLD) + return 0; + + r = req->process(req, link, req->userdata); + if (r == 0) + continue; + + processed = true; + + /* If the request sends netlink message, e.g. for Address or so, the Request object + * is referenced by the netlink slot, and will be detached later by its destroy callback. + * Otherwise, e.g. for DHCP client or so, detach the request from queue now. */ + if (!req->waiting_reply) + request_detach(manager, req); + + if (r < 0 && link) { + link_enter_failed(link); + /* link_enter_failed() may remove multiple requests, + * hence we need to exit from the loop. */ + break; + } + } + + /* When at least one request is processed, then another request may be ready now. */ + if (!processed) + break; + } + + return 0; +} + +static int request_netlink_handler(sd_netlink *nl, sd_netlink_message *m, Request *req) { + assert(req); + + if (req->counter) { + assert(*req->counter > 0); + (*req->counter)--; + req->counter = NULL; /* To prevent double decrement on free. 
*/ + } + + if (req->link && IN_SET(req->link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + if (req->netlink_handler) + return req->netlink_handler(nl, m, req, req->link, req->userdata); + + return 0; +} + +int request_call_netlink_async(sd_netlink *nl, sd_netlink_message *m, Request *req) { + int r; + + assert(nl); + assert(m); + assert(req); + + r = netlink_call_async(nl, NULL, m, request_netlink_handler, request_destroy_callback, req); + if (r < 0) + return r; + + request_ref(req); + req->waiting_reply = true; + return 0; +} + +static const char *const request_type_table[_REQUEST_TYPE_MAX] = { + [REQUEST_TYPE_ACTIVATE_LINK] = "activate link", + [REQUEST_TYPE_ADDRESS] = "address", + [REQUEST_TYPE_ADDRESS_LABEL] = "address label", + [REQUEST_TYPE_BRIDGE_FDB] = "bridge FDB", + [REQUEST_TYPE_BRIDGE_MDB] = "bridge MDB", + [REQUEST_TYPE_DHCP_SERVER] = "DHCP server", + [REQUEST_TYPE_DHCP4_CLIENT] = "DHCPv4 client", + [REQUEST_TYPE_DHCP6_CLIENT] = "DHCPv6 client", + [REQUEST_TYPE_IPV6_PROXY_NDP] = "IPv6 proxy NDP", + [REQUEST_TYPE_NDISC] = "NDisc", + [REQUEST_TYPE_NEIGHBOR] = "neighbor", + [REQUEST_TYPE_NETDEV_INDEPENDENT] = "independent netdev", + [REQUEST_TYPE_NETDEV_STACKED] = "stacked netdev", + [REQUEST_TYPE_NEXTHOP] = "nexthop", + [REQUEST_TYPE_RADV] = "RADV", + [REQUEST_TYPE_ROUTE] = "route", + [REQUEST_TYPE_ROUTING_POLICY_RULE] = "routing policy rule", + [REQUEST_TYPE_SET_LINK_ADDRESS_GENERATION_MODE] = "IPv6LL address generation mode", + [REQUEST_TYPE_SET_LINK_BOND] = "bond configurations", + [REQUEST_TYPE_SET_LINK_BRIDGE] = "bridge configurations", + [REQUEST_TYPE_SET_LINK_BRIDGE_VLAN] = "bridge VLAN configurations", + [REQUEST_TYPE_SET_LINK_CAN] = "CAN interface configurations", + [REQUEST_TYPE_SET_LINK_FLAGS] = "link flags", + [REQUEST_TYPE_SET_LINK_GROUP] = "interface group", + [REQUEST_TYPE_SET_LINK_IPOIB] = "IPoIB configurations", + [REQUEST_TYPE_SET_LINK_MAC] = "MAC address", + [REQUEST_TYPE_SET_LINK_MASTER] = "master interface", + 
[REQUEST_TYPE_SET_LINK_MTU] = "MTU", + [REQUEST_TYPE_SRIOV] = "SR-IOV", + [REQUEST_TYPE_TC_QDISC] = "QDisc", + [REQUEST_TYPE_TC_CLASS] = "TClass", + [REQUEST_TYPE_UP_DOWN] = "bring link up or down", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(request_type, RequestType); diff --git a/src/network/networkd-queue.h b/src/network/networkd-queue.h new file mode 100644 index 0000000..e58d1be --- /dev/null +++ b/src/network/networkd-queue.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "hash-funcs.h" + +typedef struct Link Link; +typedef struct NetDev NetDev; +typedef struct Manager Manager; +typedef struct Request Request; + +typedef int (*request_process_func_t)(Request *req, Link *link, void *userdata); +typedef int (*request_netlink_handler_t)(sd_netlink *nl, sd_netlink_message *m, Request *req, Link *link, void *userdata); + +typedef enum RequestType { + REQUEST_TYPE_ACTIVATE_LINK, + REQUEST_TYPE_ADDRESS, + REQUEST_TYPE_ADDRESS_LABEL, + REQUEST_TYPE_BRIDGE_FDB, + REQUEST_TYPE_BRIDGE_MDB, + REQUEST_TYPE_DHCP_SERVER, + REQUEST_TYPE_DHCP4_CLIENT, + REQUEST_TYPE_DHCP6_CLIENT, + REQUEST_TYPE_IPV6_PROXY_NDP, + REQUEST_TYPE_NDISC, + REQUEST_TYPE_NEIGHBOR, + REQUEST_TYPE_NETDEV_INDEPENDENT, + REQUEST_TYPE_NETDEV_STACKED, + REQUEST_TYPE_NEXTHOP, + REQUEST_TYPE_RADV, + REQUEST_TYPE_ROUTE, + REQUEST_TYPE_ROUTING_POLICY_RULE, + REQUEST_TYPE_SET_LINK_ADDRESS_GENERATION_MODE, /* Setting IPv6LL address generation mode. */ + REQUEST_TYPE_SET_LINK_BOND, /* Setting bond configs. */ + REQUEST_TYPE_SET_LINK_BRIDGE, /* Setting bridge configs. */ + REQUEST_TYPE_SET_LINK_BRIDGE_VLAN, /* Setting bridge VLAN configs. */ + REQUEST_TYPE_SET_LINK_CAN, /* Setting CAN interface configs. */ + REQUEST_TYPE_SET_LINK_FLAGS, /* Setting IFF_NOARP or friends. */ + REQUEST_TYPE_SET_LINK_GROUP, /* Setting interface group. */ + REQUEST_TYPE_SET_LINK_IPOIB, /* Setting IPoIB configs. 
*/ + REQUEST_TYPE_SET_LINK_MAC, /* Setting MAC address. */ + REQUEST_TYPE_SET_LINK_MASTER, /* Setting IFLA_MASTER. */ + REQUEST_TYPE_SET_LINK_MTU, /* Setting MTU. */ + REQUEST_TYPE_SRIOV, + REQUEST_TYPE_TC_CLASS, + REQUEST_TYPE_TC_QDISC, + REQUEST_TYPE_UP_DOWN, + _REQUEST_TYPE_MAX, + _REQUEST_TYPE_INVALID = -EINVAL, +} RequestType; + +struct Request { + unsigned n_ref; + + Manager *manager; /* must be non-NULL */ + Link *link; /* can be NULL */ + + RequestType type; + + /* Target object, e.g. Address, Route, NetDev, and so on. */ + void *userdata; + /* freeing userdata when the request is completed or failed. */ + mfree_func_t free_func; + + /* hash and compare functions for userdata, used for dedup requests. */ + hash_func_t hash_func; + compare_func_t compare_func; + + /* Checks the request dependencies, and then processes this request, e.g. call address_configure(). + * Return 1 when processed, 0 when its dependencies not resolved, and negative errno on failure. */ + request_process_func_t process; + + /* incremented when requested, decremented when request is completed or failed. */ + unsigned *counter; + /* called in netlink handler, the 'counter' is decremented before this is called. + * If this is specified, then the 'process' function must increment the reference of this + * request, and pass this request to the netlink_call_async(), and set the destroy function + * to the slot. 
*/ + request_netlink_handler_t netlink_handler; + + bool waiting_reply; +}; + +Request *request_ref(Request *req); +Request *request_unref(Request *req); +DEFINE_TRIVIAL_CLEANUP_FUNC(Request*, request_unref); + +void request_detach(Manager *manager, Request *req); + +int netdev_queue_request( + NetDev *netdev, + request_process_func_t process, + Request **ret); + +int link_queue_request_full( + Link *link, + RequestType type, + void *userdata, + mfree_func_t free_func, + hash_func_t hash_func, + compare_func_t compare_func, + request_process_func_t process, + unsigned *counter, + request_netlink_handler_t netlink_handler, + Request **ret); + +static inline int link_queue_request( + Link *link, + RequestType type, + request_process_func_t process, + Request **ret) { + + return link_queue_request_full(link, type, NULL, NULL, NULL, NULL, + process, NULL, NULL, ret); +} + +#define link_queue_request_safe(link, type, userdata, free_func, hash_func, compare_func, process, counter, netlink_handler, ret) \ + ({ \ + typeof(userdata) (*_f)(typeof(userdata)) = (free_func); \ + void (*_h)(const typeof(*userdata)*, struct siphash*) = (hash_func); \ + int (*_c)(const typeof(*userdata)*, const typeof(*userdata)*) = (compare_func); \ + int (*_p)(Request*, Link*, typeof(userdata)) = (process); \ + int (*_n)(sd_netlink*, sd_netlink_message*, Request*, Link*, typeof(userdata)) = (netlink_handler); \ + \ + link_queue_request_full(link, type, userdata, \ + (mfree_func_t) _f, \ + (hash_func_t) _h, \ + (compare_func_t) _c, \ + (request_process_func_t) _p, \ + counter, \ + (request_netlink_handler_t) _n, \ + ret); \ + }) + +int manager_process_requests(sd_event_source *s, void *userdata); +int request_call_netlink_async(sd_netlink *nl, sd_netlink_message *m, Request *req); + +const char* request_type_to_string(RequestType t) _const_; diff --git a/src/network/networkd-radv.c b/src/network/networkd-radv.c new file mode 100644 index 0000000..fc36a00 --- /dev/null +++ 
b/src/network/networkd-radv.c @@ -0,0 +1,1619 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2017 Intel Corporation. All rights reserved. +***/ + +#include +#include + +#include "dns-domain.h" +#include "networkd-address-generation.h" +#include "networkd-address.h" +#include "networkd-dhcp-prefix-delegation.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "networkd-radv.h" +#include "networkd-route-util.h" +#include "parse-util.h" +#include "radv-internal.h" +#include "string-util.h" +#include "string-table.h" +#include "strv.h" + +void network_adjust_radv(Network *network) { + assert(network); + + /* After this function is called, network->router_prefix_delegation can be treated as a boolean. */ + + if (network->dhcp_pd < 0) + /* For backward compatibility. */ + network->dhcp_pd = FLAGS_SET(network->router_prefix_delegation, RADV_PREFIX_DELEGATION_DHCP6); + + if (!FLAGS_SET(network->link_local, ADDRESS_FAMILY_IPV6)) { + if (network->router_prefix_delegation != RADV_PREFIX_DELEGATION_NONE) + log_warning("%s: IPv6PrefixDelegation= is enabled but IPv6 link-local addressing is disabled. 
" + "Disabling IPv6PrefixDelegation=.", network->filename); + + network->router_prefix_delegation = RADV_PREFIX_DELEGATION_NONE; + } + + if (network->router_prefix_delegation == RADV_PREFIX_DELEGATION_NONE) { + network->n_router_dns = 0; + network->router_dns = mfree(network->router_dns); + network->router_search_domains = ordered_set_free(network->router_search_domains); + } + + if (!FLAGS_SET(network->router_prefix_delegation, RADV_PREFIX_DELEGATION_STATIC)) { + network->prefixes_by_section = hashmap_free_with_destructor(network->prefixes_by_section, prefix_free); + network->route_prefixes_by_section = hashmap_free_with_destructor(network->route_prefixes_by_section, route_prefix_free); + network->pref64_prefixes_by_section = hashmap_free_with_destructor(network->pref64_prefixes_by_section, pref64_prefix_free); + } +} + +bool link_radv_enabled(Link *link) { + assert(link); + + if (!link_may_have_ipv6ll(link, /* check_multicast = */ true)) + return false; + + if (link->hw_addr.length != ETH_ALEN) + return false; + + return link->network->router_prefix_delegation; +} + +Prefix *prefix_free(Prefix *prefix) { + if (!prefix) + return NULL; + + if (prefix->network) { + assert(prefix->section); + hashmap_remove(prefix->network->prefixes_by_section, prefix->section); + } + + config_section_free(prefix->section); + set_free(prefix->tokens); + + return mfree(prefix); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(Prefix, prefix_free); + +static int prefix_new_static(Network *network, const char *filename, unsigned section_line, Prefix **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(prefix_freep) Prefix *prefix = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + prefix = hashmap_get(network->prefixes_by_section, n); + if (prefix) { + *ret = TAKE_PTR(prefix); + return 0; + } + + prefix = new(Prefix, 1); + if (!prefix) + 
return -ENOMEM; + + *prefix = (Prefix) { + .network = network, + .section = TAKE_PTR(n), + + .preferred_lifetime = RADV_DEFAULT_PREFERRED_LIFETIME_USEC, + .valid_lifetime = RADV_DEFAULT_VALID_LIFETIME_USEC, + .onlink = true, + .address_auto_configuration = true, + }; + + r = hashmap_ensure_put(&network->prefixes_by_section, &config_section_hash_ops, prefix->section, prefix); + if (r < 0) + return r; + + *ret = TAKE_PTR(prefix); + return 0; +} + +RoutePrefix *route_prefix_free(RoutePrefix *prefix) { + if (!prefix) + return NULL; + + if (prefix->network) { + assert(prefix->section); + hashmap_remove(prefix->network->route_prefixes_by_section, prefix->section); + } + + config_section_free(prefix->section); + + return mfree(prefix); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(RoutePrefix, route_prefix_free); + +static int route_prefix_new_static(Network *network, const char *filename, unsigned section_line, RoutePrefix **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(route_prefix_freep) RoutePrefix *prefix = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + prefix = hashmap_get(network->route_prefixes_by_section, n); + if (prefix) { + *ret = TAKE_PTR(prefix); + return 0; + } + + prefix = new(RoutePrefix, 1); + if (!prefix) + return -ENOMEM; + + *prefix = (RoutePrefix) { + .network = network, + .section = TAKE_PTR(n), + + .lifetime = RADV_DEFAULT_VALID_LIFETIME_USEC, + }; + + r = hashmap_ensure_put(&network->route_prefixes_by_section, &config_section_hash_ops, prefix->section, prefix); + if (r < 0) + return r; + + *ret = TAKE_PTR(prefix); + return 0; +} + +pref64Prefix *pref64_prefix_free(pref64Prefix *prefix) { + if (!prefix) + return NULL; + + if (prefix->network) { + assert(prefix->section); + hashmap_remove(prefix->network->pref64_prefixes_by_section, prefix->section); + } + + 
config_section_free(prefix->section); + + return mfree(prefix); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(pref64Prefix, pref64_prefix_free); + +static int pref64_prefix_new_static(Network *network, const char *filename, unsigned section_line, pref64Prefix **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(pref64_prefix_freep) pref64Prefix *prefix = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + prefix = hashmap_get(network->pref64_prefixes_by_section, n); + if (prefix) { + *ret = TAKE_PTR(prefix); + return 0; + } + + prefix = new(pref64Prefix, 1); + if (!prefix) + return -ENOMEM; + + *prefix = (pref64Prefix) { + .network = network, + .section = TAKE_PTR(n), + + .lifetime = RADV_PREF64_DEFAULT_LIFETIME_USEC, + }; + + r = hashmap_ensure_put(&network->pref64_prefixes_by_section, &config_section_hash_ops, prefix->section, prefix); + if (r < 0) + return r; + + *ret = TAKE_PTR(prefix); + return 0; +} + +int link_request_radv_addresses(Link *link) { + Prefix *p; + int r; + + assert(link); + + if (!link_radv_enabled(link)) + return 0; + + HASHMAP_FOREACH(p, link->network->prefixes_by_section) { + _cleanup_set_free_ Set *addresses = NULL; + struct in6_addr *a; + + if (!p->assign) + continue; + + /* radv_generate_addresses() below requires the prefix length <= 64. 
*/ + if (p->prefixlen > 64) + continue; + + r = radv_generate_addresses(link, p->tokens, &p->prefix, p->prefixlen, &addresses); + if (r < 0) + return r; + + SET_FOREACH(a, addresses) { + _cleanup_(address_freep) Address *address = NULL; + + r = address_new(&address); + if (r < 0) + return -ENOMEM; + + address->source = NETWORK_CONFIG_SOURCE_STATIC; + address->family = AF_INET6; + address->in_addr.in6 = *a; + address->prefixlen = p->prefixlen; + address->route_metric = p->route_metric; + + r = link_request_static_address(link, address); + if (r < 0) + return r; + } + } + + return 0; +} + +static int radv_set_prefix(Link *link, Prefix *prefix) { + _cleanup_(sd_radv_prefix_unrefp) sd_radv_prefix *p = NULL; + int r; + + assert(link); + assert(link->radv); + assert(prefix); + + r = sd_radv_prefix_new(&p); + if (r < 0) + return r; + + r = sd_radv_prefix_set_prefix(p, &prefix->prefix, prefix->prefixlen); + if (r < 0) + return r; + + r = sd_radv_prefix_set_preferred_lifetime(p, prefix->preferred_lifetime, USEC_INFINITY); + if (r < 0) + return r; + + r = sd_radv_prefix_set_valid_lifetime(p, prefix->valid_lifetime, USEC_INFINITY); + if (r < 0) + return r; + + r = sd_radv_prefix_set_onlink(p, prefix->onlink); + if (r < 0) + return r; + + r = sd_radv_prefix_set_address_autoconfiguration(p, prefix->address_auto_configuration); + if (r < 0) + return r; + + return sd_radv_add_prefix(link->radv, p); +} + +static int radv_set_route_prefix(Link *link, RoutePrefix *prefix) { + _cleanup_(sd_radv_route_prefix_unrefp) sd_radv_route_prefix *p = NULL; + int r; + + assert(link); + assert(link->radv); + assert(prefix); + + r = sd_radv_route_prefix_new(&p); + if (r < 0) + return r; + + r = sd_radv_route_prefix_set_prefix(p, &prefix->prefix, prefix->prefixlen); + if (r < 0) + return r; + + r = sd_radv_route_prefix_set_lifetime(p, prefix->lifetime, USEC_INFINITY); + if (r < 0) + return r; + + return sd_radv_add_route_prefix(link->radv, p); +} + +static int radv_set_pref64_prefix(Link *link, 
pref64Prefix *prefix) { + _cleanup_(sd_radv_pref64_prefix_unrefp) sd_radv_pref64_prefix *p = NULL; + int r; + + assert(link); + assert(link->radv); + assert(prefix); + + r = sd_radv_pref64_prefix_new(&p); + if (r < 0) + return r; + + r = sd_radv_pref64_prefix_set_prefix(p, &prefix->prefix, prefix->prefixlen, prefix->lifetime); + if (r < 0) + return r; + + return sd_radv_add_pref64_prefix(link->radv, p); +} + +static int network_get_ipv6_dns(Network *network, struct in6_addr **ret_addresses, size_t *ret_size) { + _cleanup_free_ struct in6_addr *addresses = NULL; + size_t n_addresses = 0; + + assert(network); + assert(ret_addresses); + assert(ret_size); + + for (size_t i = 0; i < network->n_dns; i++) { + union in_addr_union *addr; + + if (network->dns[i]->family != AF_INET6) + continue; + + addr = &network->dns[i]->address; + + if (in_addr_is_null(AF_INET6, addr) || + in_addr_is_link_local(AF_INET6, addr) || + in_addr_is_localhost(AF_INET6, addr)) + continue; + + if (!GREEDY_REALLOC(addresses, n_addresses + 1)) + return -ENOMEM; + + addresses[n_addresses++] = addr->in6; + } + + *ret_addresses = TAKE_PTR(addresses); + *ret_size = n_addresses; + + return n_addresses; +} + +static int radv_set_dns(Link *link, Link *uplink) { + _cleanup_free_ struct in6_addr *dns = NULL; + size_t n_dns; + int r; + + if (!link->network->router_emit_dns) + return 0; + + if (link->network->router_dns) { + struct in6_addr *p; + + dns = new(struct in6_addr, link->network->n_router_dns); + if (!dns) + return -ENOMEM; + + p = dns; + for (size_t i = 0; i < link->network->n_router_dns; i++) + if (in6_addr_is_null(&link->network->router_dns[i])) { + if (in6_addr_is_set(&link->ipv6ll_address)) + *(p++) = link->ipv6ll_address; + } else + *(p++) = link->network->router_dns[i]; + + n_dns = p - dns; + + goto set_dns; + } + + r = network_get_ipv6_dns(link->network, &dns, &n_dns); + if (r > 0) + goto set_dns; + + if (uplink) { + assert(uplink->network); + + r = network_get_ipv6_dns(uplink->network, &dns, 
&n_dns); + if (r > 0) + goto set_dns; + } + + return 0; + +set_dns: + return sd_radv_set_rdnss(link->radv, + link->network->router_dns_lifetime_usec, + dns, n_dns); +} + +static int radv_set_domains(Link *link, Link *uplink) { + _cleanup_free_ char **s = NULL; /* just free() because the strings are owned by the set */ + OrderedSet *search_domains; + + if (!link->network->router_emit_domains) + return 0; + + search_domains = link->network->router_search_domains; + + if (search_domains) + goto set_domains; + + search_domains = link->network->search_domains; + if (search_domains) + goto set_domains; + + if (uplink) { + assert(uplink->network); + + search_domains = uplink->network->search_domains; + if (search_domains) + goto set_domains; + } + + return 0; + +set_domains: + s = ordered_set_get_strv(search_domains); + if (!s) + return log_oom(); + + return sd_radv_set_dnssl(link->radv, + link->network->router_dns_lifetime_usec, + s); + +} + +static int radv_find_uplink(Link *link, Link **ret) { + int r; + + assert(link); + + if (link->network->router_uplink_name) + return link_get_by_name(link->manager, link->network->router_uplink_name, ret); + + if (link->network->router_uplink_index > 0) + return link_get_by_index(link->manager, link->network->router_uplink_index, ret); + + if (link->network->router_uplink_index == UPLINK_INDEX_AUTO) { + if (link_dhcp_pd_is_enabled(link)) + r = dhcp_pd_find_uplink(link, ret); /* When DHCP-PD is enabled, use its uplink. */ + else + r = manager_find_uplink(link->manager, AF_INET6, link, ret); + if (r < 0) + /* It is not necessary to propagate error in automatic selection. 
*/ + *ret = NULL; + return 0; + } + + *ret = NULL; + return 0; +} + +static int radv_configure(Link *link) { + Link *uplink = NULL; + RoutePrefix *q; + pref64Prefix *n; + Prefix *p; + int r; + + assert(link); + assert(link->network); + + if (link->radv) + return -EBUSY; + + r = sd_radv_new(&link->radv); + if (r < 0) + return r; + + r = sd_radv_attach_event(link->radv, link->manager->event, 0); + if (r < 0) + return r; + + if (link->hw_addr.length == ETH_ALEN) { + r = sd_radv_set_mac(link->radv, &link->hw_addr.ether); + if (r < 0) + return r; + } + + r = sd_radv_set_ifindex(link->radv, link->ifindex); + if (r < 0) + return r; + + r = sd_radv_set_managed_information(link->radv, link->network->router_managed); + if (r < 0) + return r; + + r = sd_radv_set_other_information(link->radv, link->network->router_other_information); + if (r < 0) + return r; + + r = sd_radv_set_router_lifetime(link->radv, link->network->router_lifetime_usec); + if (r < 0) + return r; + + r = sd_radv_set_hop_limit(link->radv, link->network->router_hop_limit); + if (r < 0) + return r; + + if (link->network->router_lifetime_usec > 0) { + r = sd_radv_set_preference(link->radv, link->network->router_preference); + if (r < 0) + return r; + } + + if (link->network->router_retransmit_usec > 0) { + r = sd_radv_set_retransmit(link->radv, link->network->router_retransmit_usec); + if (r < 0) + return r; + } + + HASHMAP_FOREACH(p, link->network->prefixes_by_section) { + r = radv_set_prefix(link, p); + if (r < 0 && r != -EEXIST) + return r; + } + + HASHMAP_FOREACH(q, link->network->route_prefixes_by_section) { + r = radv_set_route_prefix(link, q); + if (r < 0 && r != -EEXIST) + return r; + } + + HASHMAP_FOREACH(n, link->network->pref64_prefixes_by_section) { + r = radv_set_pref64_prefix(link, n); + if (r < 0 && r != -EEXIST) + return r; + } + + (void) radv_find_uplink(link, &uplink); + + r = radv_set_dns(link, uplink); + if (r < 0) + return log_link_debug_errno(link, r, "Could not set RA DNS: %m"); + + r = 
radv_set_domains(link, uplink); + if (r < 0) + return log_link_debug_errno(link, r, "Could not set RA Domains: %m"); + + r = sd_radv_set_home_agent_information(link->radv, link->network->router_home_agent_information); + if (r < 0) + return r; + + r = sd_radv_set_home_agent_preference(link->radv, link->network->router_home_agent_preference); + if (r < 0) + return r; + + r = sd_radv_set_home_agent_lifetime(link->radv, link->network->home_agent_lifetime_usec); + if (r < 0) + return r; + + return 0; +} + +int radv_update_mac(Link *link) { + bool restart; + int r; + + assert(link); + + if (!link->radv) + return 0; + + if (link->hw_addr.length != ETH_ALEN) + return 0; + + restart = sd_radv_is_running(link->radv); + + r = sd_radv_stop(link->radv); + if (r < 0) + return r; + + r = sd_radv_set_mac(link->radv, &link->hw_addr.ether); + if (r < 0) + return r; + + if (restart) { + r = sd_radv_start(link->radv); + if (r < 0) + return r; + } + + return 0; +} + +static int radv_is_ready_to_configure(Link *link) { + bool needs_uplink = false; + int r; + + assert(link); + assert(link->network); + + if (!link_is_ready_to_configure(link, /* allow_unmanaged = */ false)) + return false; + + if (in6_addr_is_null(&link->ipv6ll_address)) + return false; + + if (link->hw_addr.length != ETH_ALEN || hw_addr_is_null(&link->hw_addr)) + return false; + + if (link->network->router_emit_dns && !link->network->router_dns) { + _cleanup_free_ struct in6_addr *dns = NULL; + size_t n_dns; + + r = network_get_ipv6_dns(link->network, &dns, &n_dns); + if (r < 0) + return r; + + needs_uplink = r == 0; + } + + if (link->network->router_emit_domains && + !link->network->router_search_domains && + !link->network->search_domains) + needs_uplink = true; + + if (needs_uplink) { + Link *uplink = NULL; + + if (radv_find_uplink(link, &uplink) < 0) + return false; + + if (uplink && !uplink->network) + return false; + } + + return true; +} + +static int radv_process_request(Request *req, Link *link, void *userdata) 
{ + int r; + + assert(link); + + r = radv_is_ready_to_configure(link); + if (r <= 0) + return r; + + r = radv_configure(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure IPv6 Router Advertisement engine: %m"); + + if (link_has_carrier(link)) { + r = radv_start(link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to start IPv6 Router Advertisement engine: %m"); + } + + log_link_debug(link, "IPv6 Router Advertisement engine is configured%s.", + link_has_carrier(link) ? " and started" : ""); + return 1; +} + +int link_request_radv(Link *link) { + int r; + + assert(link); + + if (!link_radv_enabled(link)) + return 0; + + if (link->radv) + return 0; + + r = link_queue_request(link, REQUEST_TYPE_RADV, radv_process_request, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request configuring of the IPv6 Router Advertisement engine: %m"); + + log_link_debug(link, "Requested configuring of the IPv6 Router Advertisement engine."); + return 0; +} + +int radv_start(Link *link) { + int r; + + assert(link); + assert(link->network); + + if (!link->radv) + return 0; + + if (!link_has_carrier(link)) + return 0; + + if (in6_addr_is_null(&link->ipv6ll_address)) + return 0; + + if (sd_radv_is_running(link->radv)) + return 0; + + if (link->network->dhcp_pd_announce) { + r = dhcp_request_prefix_delegation(link); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to request DHCP delegated subnet prefix: %m"); + } + + log_link_debug(link, "Starting IPv6 Router Advertisements"); + return sd_radv_start(link->radv); +} + +int radv_add_prefix( + Link *link, + const struct in6_addr *prefix, + uint8_t prefix_len, + usec_t lifetime_preferred_usec, + usec_t lifetime_valid_usec) { + + _cleanup_(sd_radv_prefix_unrefp) sd_radv_prefix *p = NULL; + int r; + + assert(link); + + if (!link->radv) + return 0; + + r = sd_radv_prefix_new(&p); + if (r < 0) + return r; + + r = sd_radv_prefix_set_prefix(p, prefix, prefix_len); + if 
(r < 0) + return r; + + r = sd_radv_prefix_set_preferred_lifetime(p, RADV_DEFAULT_PREFERRED_LIFETIME_USEC, lifetime_preferred_usec); + if (r < 0) + return r; + + r = sd_radv_prefix_set_valid_lifetime(p, RADV_DEFAULT_VALID_LIFETIME_USEC, lifetime_valid_usec); + if (r < 0) + return r; + + r = sd_radv_add_prefix(link->radv, p); + if (r < 0 && r != -EEXIST) + return r; + + return 0; +} + +static int prefix_section_verify(Prefix *p) { + assert(p); + + if (section_is_invalid(p->section)) + return -EINVAL; + + if (in6_addr_is_null(&p->prefix)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: [IPv6Prefix] section without Prefix= field configured, " + "or specified prefix is the null address. " + "Ignoring [IPv6Prefix] section from line %u.", + p->section->filename, p->section->line); + + if (p->prefixlen < 3 || p->prefixlen > 128) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Invalid prefix length %u is specified in [IPv6Prefix] section. " + "Valid range is 3…128. Ignoring [IPv6Prefix] section from line %u.", + p->section->filename, p->prefixlen, p->section->line); + + if (p->prefixlen > 64) { + log_info("%s:%u: Unusual prefix length %u (> 64) is specified in [IPv6Prefix] section from line %s%s.", + p->section->filename, p->section->line, + p->prefixlen, + p->assign ? ", refusing to assign an address in " : "", + p->assign ? IN6_ADDR_PREFIX_TO_STRING(&p->prefix, p->prefixlen) : ""); + + p->assign = false; + } + + if (p->valid_lifetime == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: The valid lifetime of prefix cannot be zero. " + "Ignoring [IPv6Prefix] section from line %u.", + p->section->filename, p->section->line); + + if (p->preferred_lifetime > p->valid_lifetime) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: The preferred lifetime %s is longer than the valid lifetime %s. 
" + "Ignoring [IPv6Prefix] section from line %u.", + p->section->filename, + FORMAT_TIMESPAN(p->preferred_lifetime, USEC_PER_SEC), + FORMAT_TIMESPAN(p->valid_lifetime, USEC_PER_SEC), + p->section->line); + + return 0; +} + +void network_drop_invalid_prefixes(Network *network) { + Prefix *p; + + assert(network); + + HASHMAP_FOREACH(p, network->prefixes_by_section) + if (prefix_section_verify(p) < 0) + prefix_free(p); +} + +static int route_prefix_section_verify(RoutePrefix *p) { + if (section_is_invalid(p->section)) + return -EINVAL; + + if (p->prefixlen > 128) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Invalid prefix length %u is specified in [IPv6RoutePrefix] section. " + "Valid range is 0…128. Ignoring [IPv6RoutePrefix] section from line %u.", + p->section->filename, p->prefixlen, p->section->line); + + if (p->lifetime == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: The lifetime of route cannot be zero. " + "Ignoring [IPv6RoutePrefix] section from line %u.", + p->section->filename, p->section->line); + + return 0; +} + +void network_drop_invalid_route_prefixes(Network *network) { + RoutePrefix *p; + + assert(network); + + HASHMAP_FOREACH(p, network->route_prefixes_by_section) + if (route_prefix_section_verify(p) < 0) + route_prefix_free(p); +} + +void network_drop_invalid_pref64_prefixes(Network *network) { + pref64Prefix *p; + + assert(network); + + HASHMAP_FOREACH(p, network->pref64_prefixes_by_section) + if (section_is_invalid(p->section)) + pref64_prefix_free(p); +} + +int config_parse_prefix( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(prefix_free_or_set_invalidp) Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + union in_addr_union a; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = 
prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = in_addr_prefix_from_string(rvalue, AF_INET6, &a, &p->prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Prefix is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + (void) in6_addr_mask(&a.in6, p->prefixlen); + p->prefix = a.in6; + + TAKE_PTR(p); + return 0; +} + +int config_parse_prefix_boolean( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(prefix_free_or_set_invalidp) Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + if (streq(lvalue, "OnLink")) + p->onlink = r; + else if (streq(lvalue, "AddressAutoconfiguration")) + p->address_auto_configuration = r; + else if (streq(lvalue, "Assign")) + p->assign = r; + else + assert_not_reached(); + + TAKE_PTR(p); + return 0; +} + +int config_parse_prefix_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(prefix_free_or_set_invalidp) Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + usec_t usec; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 
r, + "Lifetime is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + if (usec != USEC_INFINITY && DIV_ROUND_UP(usec, USEC_PER_SEC) >= UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Lifetime is too long, ignoring assignment: %s", rvalue); + return 0; + } + + if (streq(lvalue, "PreferredLifetimeSec")) + p->preferred_lifetime = usec; + else if (streq(lvalue, "ValidLifetimeSec")) + p->valid_lifetime = usec; + else + assert_not_reached(); + + TAKE_PTR(p); + return 0; +} + +int config_parse_prefix_metric( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(prefix_free_or_set_invalidp) Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = safe_atou32(rvalue, &p->route_metric); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(p); + return 0; +} + +int config_parse_prefix_token( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(prefix_free_or_set_invalidp) Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = config_parse_address_generation_type(unit, filename, line, section, section_line, + lvalue, ltype, rvalue, &p->tokens, userdata); + if (r < 0) + return r; + + TAKE_PTR(p); + return 0; +} + +int 
config_parse_route_prefix( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(route_prefix_free_or_set_invalidp) RoutePrefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + union in_addr_union a; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = route_prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = in_addr_prefix_from_string(rvalue, AF_INET6, &a, &p->prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Route prefix is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + (void) in6_addr_mask(&a.in6, p->prefixlen); + p->prefix = a.in6; + + TAKE_PTR(p); + return 0; +} +/* Parses rvalue with parse_sec() and stores it in the lifetime field of the RoutePrefix. A finite value whose rounded-up number of seconds is >= UINT32_MAX is logged as too long and ignored; parse failures are logged and ignored as well. */ +int config_parse_route_prefix_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(route_prefix_free_or_set_invalidp) RoutePrefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + usec_t usec; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = route_prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Route lifetime is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + if (usec != USEC_INFINITY && DIV_ROUND_UP(usec, USEC_PER_SEC) >= UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Lifetime is too long, ignoring assignment: %s", rvalue); + return 0; + } + + p->lifetime = usec; + + TAKE_PTR(p); + return 0; +} +/* Parses rvalue as an IPv6 address with prefix length for a pref64Prefix. Only prefix lengths 96, 64, 56, 48, 40 and 32 are accepted; the address is masked with in6_addr_mask() before being stored. Invalid values are logged and ignored (returns 0). */ +int config_parse_pref64_prefix( + const char *unit, + const char *filename, + unsigned line, + const char 
*section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(pref64_prefix_free_or_set_invalidp) pref64Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + union in_addr_union a; + uint8_t prefixlen; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = pref64_prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = in_addr_prefix_from_string(rvalue, AF_INET6, &a, &prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "PREF64 prefix is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + if (!IN_SET(prefixlen, 96, 64, 56, 48, 40, 32)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "PREF64 prefixlen is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + (void) in6_addr_mask(&a.in6,prefixlen); + p->prefix = a.in6; + p->prefixlen = prefixlen; + + TAKE_PTR(p); + return 0; +} +/* Parses rvalue with parse_sec() and stores it in the lifetime field of the pref64Prefix. Unlike the other lifetime parsers here, USEC_INFINITY is rejected, as is any value whose count of 8-second units (rounded up) is >= 1 << 13; rejected and unparsable values are logged and ignored. */ +int config_parse_pref64_prefix_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(pref64_prefix_free_or_set_invalidp) pref64Prefix *p = NULL; + Network *network = ASSERT_PTR(userdata); + usec_t usec; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + r = pref64_prefix_new_static(network, filename, section_line, &p); + if (r < 0) + return log_oom(); + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "PREF64 lifetime is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + if (usec == USEC_INFINITY || DIV_ROUND_UP(usec, 8 * USEC_PER_SEC) >= UINT64_C(1) << 13) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "PREF64 lifetime is too long, ignoring assignment: %s", rvalue); + return 0; + } + + p->lifetime 
= usec; + + TAKE_PTR(p); + return 0; +} +/* Parses a whitespace-separated list of DNS server addresses into n->router_dns / n->n_router_dns (data is the Network). An empty rvalue clears the list. The word "_link_local" is stored as IN_ADDR_NULL; other words must parse as non-null IPv6 addresses, otherwise they are logged and skipped. The array is grown by one element per accepted word. */ +int config_parse_radv_dns( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *n = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + n->n_router_dns = 0; + n->router_dns = mfree(n->router_dns); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + union in_addr_union a; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract word, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + if (streq(w, "_link_local")) + a = IN_ADDR_NULL; + else { + r = in_addr_from_string(AF_INET6, w, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DNS server address, ignoring: %s", w); + continue; + } + + if (in_addr_is_null(AF_INET6, &a)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "DNS server address is null, ignoring: %s", w); + continue; + } + } + + struct in6_addr *m; + m = reallocarray(n->router_dns, n->n_router_dns + 1, sizeof(struct in6_addr)); + if (!m) + return log_oom(); + + m[n->n_router_dns++] = a.in6; + n->router_dns = m; + } +} +/* Parses a whitespace-separated list of search domains into the ordered set n->router_search_domains (data is the Network). An empty rvalue frees the set. Each word is passed through dns_name_apply_idna(); when that returns 0 the original word is stored instead, and on failure the word is logged and skipped. */ +int config_parse_radv_search_domains( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *n = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + n->router_search_domains = ordered_set_free(n->router_search_domains); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL, *idna = NULL; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == 
-ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract word, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = dns_name_apply_idna(w, &idna); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to apply IDNA to domain name '%s', ignoring: %m", w); + continue; + } else if (r == 0) + /* transfer ownership to simplify subsequent operations */ + idna = TAKE_PTR(w); + + r = ordered_set_ensure_allocated(&n->router_search_domains, &string_hash_ops_free); + if (r < 0) + return log_oom(); + + r = ordered_set_consume(n->router_search_domains, TAKE_PTR(idna)); + if (r < 0) + return log_oom(); + } +} +/* String names for RADVPrefixDelegation values; the lookup functions are generated by the macro below, with RADV_PREFIX_DELEGATION_BOTH passed as its third argument (presumably the value used for a boolean "true" - confirm against string-table.h). */ +static const char * const radv_prefix_delegation_table[_RADV_PREFIX_DELEGATION_MAX] = { + [RADV_PREFIX_DELEGATION_NONE] = "no", + [RADV_PREFIX_DELEGATION_STATIC] = "static", + [RADV_PREFIX_DELEGATION_DHCP6] = "dhcpv6", + [RADV_PREFIX_DELEGATION_BOTH] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN( + radv_prefix_delegation, + RADVPrefixDelegation, + RADV_PREFIX_DELEGATION_BOTH); +/* Sets the RADVPrefixDelegation pointed to by data. For lvalue "IPv6SendRA" the value is a boolean mapped to STATIC (true) or NONE (false); for any other lvalue the value is looked up with radv_prefix_delegation_from_string(). Invalid values are logged and ignored (returns 0). */ +int config_parse_router_prefix_delegation( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + RADVPrefixDelegation val, *ra = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (streq(lvalue, "IPv6SendRA")) { + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid %s= setting, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + /* When IPv6SendRA= is enabled, only static prefixes are sent by default, and users + * need to explicitly enable DHCPv6PrefixDelegation=. */ + *ra = r ? 
RADV_PREFIX_DELEGATION_STATIC : RADV_PREFIX_DELEGATION_NONE; + return 0; + } + + /* For backward compatibility */ + val = radv_prefix_delegation_from_string(rvalue); + if (val < 0) { + log_syntax(unit, LOG_WARNING, filename, line, val, + "Invalid %s= setting, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + *ra = val; + return 0; +} +/* Parses the router lifetime into the usec_t pointed to by data. An empty rvalue selects RADV_DEFAULT_ROUTER_LIFETIME_USEC. A non-zero value outside [RADV_MIN_ROUTER_LIFETIME_USEC, RADV_MAX_ROUTER_LIFETIME_USEC] is clamped to the nearer bound with a warning; zero is stored unchanged. */ +int config_parse_router_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t usec, *lifetime = ASSERT_PTR(data); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *lifetime = RADV_DEFAULT_ROUTER_LIFETIME_USEC; + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse router lifetime, ignoring assignment: %s", rvalue); + return 0; + } + if (usec > 0) { + if (usec < RADV_MIN_ROUTER_LIFETIME_USEC) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Router lifetime %s is too short, using %s.", + FORMAT_TIMESPAN(usec, USEC_PER_SEC), + FORMAT_TIMESPAN(RADV_MIN_ROUTER_LIFETIME_USEC, USEC_PER_SEC)); + usec = RADV_MIN_ROUTER_LIFETIME_USEC; + } else if (usec > RADV_MAX_ROUTER_LIFETIME_USEC) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Router lifetime %s is too large, using %s.", + FORMAT_TIMESPAN(usec, USEC_PER_SEC), + FORMAT_TIMESPAN(RADV_MAX_ROUTER_LIFETIME_USEC, USEC_PER_SEC)); + usec = RADV_MAX_ROUTER_LIFETIME_USEC; + } + } + + *lifetime = usec; + return 0; +} +/* Parses the retransmit time into the usec_t pointed to by data. An empty rvalue stores 0. A finite value above RADV_MAX_RETRANSMIT_USEC is logged and ignored; USEC_INFINITY passes the range check and is stored as is. */ +int config_parse_router_retransmit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t usec, *router_retransmit_usec = ASSERT_PTR(data); + int r; + + assert(filename); + assert(section); + 
assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *router_retransmit_usec = 0; + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + if (usec != USEC_INFINITY && + usec > RADV_MAX_RETRANSMIT_USEC) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid [%s] %s=, ignoring assignment: %s", section, lvalue, rvalue); + return 0; + } + + *router_retransmit_usec = usec; + return 0; +} +/* Maps rvalue to network->router_preference (the Network comes from userdata): "high" -> SD_NDISC_PREFERENCE_HIGH, "medium"/"normal"/"default" -> SD_NDISC_PREFERENCE_MEDIUM, "low" -> SD_NDISC_PREFERENCE_LOW. Any other string is logged and leaves the field untouched. Always returns 0. */ +int config_parse_router_preference( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(rvalue, "high")) + network->router_preference = SD_NDISC_PREFERENCE_HIGH; + else if (STR_IN_SET(rvalue, "medium", "normal", "default")) + network->router_preference = SD_NDISC_PREFERENCE_MEDIUM; + else if (streq(rvalue, "low")) + network->router_preference = SD_NDISC_PREFERENCE_LOW; + else + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid router preference, ignoring assignment: %s", rvalue); + + return 0; +} +/* Parses the home agent lifetime into the usec_t pointed to by data. An empty rvalue stores 0. A parsed value for which timestamp_is_set() is false, or which exceeds RADV_HOME_AGENT_MAX_LIFETIME_USEC, is logged and ignored. */ +int config_parse_router_home_agent_lifetime( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + usec_t usec, *home_agent_lifetime_usec = ASSERT_PTR(data); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *home_agent_lifetime_usec = 0; + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + 
return 0; + } + + if (!timestamp_is_set(usec) || + usec > RADV_HOME_AGENT_MAX_LIFETIME_USEC) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid [%s] %s=, ignoring assignment: %s", section, lvalue, rvalue); + return 0; + } + + *home_agent_lifetime_usec = usec; + return 0; +} diff --git a/src/network/networkd-radv.h b/src/network/networkd-radv.h new file mode 100644 index 0000000..48677b5 --- /dev/null +++ b/src/network/networkd-radv.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2017 Intel Corporation. All rights reserved. +***/ + +#include +#include + +#include "sd-radv.h" + +#include "in-addr-util.h" +#include "conf-parser.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Network Network; +/* Bit flags selecting which prefixes are announced: STATIC and DHCP6 are single bits, BOTH is their bitwise OR. */ +typedef enum RADVPrefixDelegation { + RADV_PREFIX_DELEGATION_NONE = 0, + RADV_PREFIX_DELEGATION_STATIC = 1 << 0, + RADV_PREFIX_DELEGATION_DHCP6 = 1 << 1, + RADV_PREFIX_DELEGATION_BOTH = RADV_PREFIX_DELEGATION_STATIC | RADV_PREFIX_DELEGATION_DHCP6, + _RADV_PREFIX_DELEGATION_MAX, + _RADV_PREFIX_DELEGATION_INVALID = -EINVAL, +} RADVPrefixDelegation; +/* Per-section prefix settings; the fields are filled in by the config_parse_prefix*() parsers declared below. */ +typedef struct Prefix { + Network *network; + ConfigSection *section; + + struct in6_addr prefix; + uint8_t prefixlen; + usec_t preferred_lifetime; + usec_t valid_lifetime; + + bool onlink; + bool address_auto_configuration; + + bool assign; + uint32_t route_metric; + Set *tokens; +} Prefix; +/* Per-section route prefix settings; filled in by the config_parse_route_prefix*() parsers declared below. */ +typedef struct RoutePrefix { + Network *network; + ConfigSection *section; + + struct in6_addr prefix; + uint8_t prefixlen; + usec_t lifetime; +} RoutePrefix; +/* Per-section PREF64 prefix settings; filled in by the config_parse_pref64_prefix*() parsers declared below. */ +typedef struct pref64Prefix { + Network *network; + ConfigSection *section; + + struct in6_addr prefix; + uint8_t prefixlen; + usec_t lifetime; +} pref64Prefix; + +Prefix *prefix_free(Prefix *prefix); +RoutePrefix *route_prefix_free(RoutePrefix *prefix); +pref64Prefix *pref64_prefix_free(pref64Prefix *prefix); + +void network_drop_invalid_prefixes(Network *network); +void 
network_drop_invalid_route_prefixes(Network *network); +void network_drop_invalid_pref64_prefixes(Network *network); +void network_adjust_radv(Network *network); + +int link_request_radv_addresses(Link *link); + +bool link_radv_enabled(Link *link); +int radv_start(Link *link); +int radv_update_mac(Link *link); +int radv_add_prefix(Link *link, const struct in6_addr *prefix, uint8_t prefix_len, + usec_t lifetime_preferred_usec, usec_t lifetime_valid_usec); + +int link_request_radv(Link *link); + +const char* radv_prefix_delegation_to_string(RADVPrefixDelegation i) _const_; +RADVPrefixDelegation radv_prefix_delegation_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_router_prefix_delegation); +CONFIG_PARSER_PROTOTYPE(config_parse_router_lifetime); +CONFIG_PARSER_PROTOTYPE(config_parse_router_retransmit); +CONFIG_PARSER_PROTOTYPE(config_parse_router_preference); +CONFIG_PARSER_PROTOTYPE(config_parse_prefix); +CONFIG_PARSER_PROTOTYPE(config_parse_prefix_boolean); +CONFIG_PARSER_PROTOTYPE(config_parse_prefix_lifetime); +CONFIG_PARSER_PROTOTYPE(config_parse_prefix_metric); +CONFIG_PARSER_PROTOTYPE(config_parse_prefix_token); +CONFIG_PARSER_PROTOTYPE(config_parse_radv_dns); +CONFIG_PARSER_PROTOTYPE(config_parse_radv_search_domains); +CONFIG_PARSER_PROTOTYPE(config_parse_route_prefix); +CONFIG_PARSER_PROTOTYPE(config_parse_route_prefix_lifetime); +CONFIG_PARSER_PROTOTYPE(config_parse_pref64_prefix); +CONFIG_PARSER_PROTOTYPE(config_parse_pref64_prefix_lifetime); +CONFIG_PARSER_PROTOTYPE(config_parse_router_home_agent_lifetime); diff --git a/src/network/networkd-route-util.c b/src/network/networkd-route-util.c new file mode 100644 index 0000000..d49a0b9 --- /dev/null +++ b/src/network/networkd-route-util.c @@ -0,0 +1,586 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "logarithm.h" +#include "missing_threads.h" +#include "networkd-address.h" +#include "networkd-link.h" +#include 
"networkd-manager.h" +#include "networkd-route-util.h" +#include "networkd-route.h" +#include "parse-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "sysctl-util.h" + +#define ROUTES_DEFAULT_MAX_PER_FAMILY 4096U +/* Returns the maximum number of routes, computed once per thread and cached in a thread_local variable. The "route/max_size" sysctl is read for AF_INET and AF_INET6; an IPv4 value of 2147483647 is replaced by the default. The result is MAX(default, IPv4 value) + MAX(default, IPv6 value), so it is never below 2 * ROUTES_DEFAULT_MAX_PER_FAMILY. */ +unsigned routes_max(void) { + static thread_local unsigned cached = 0; + _cleanup_free_ char *s4 = NULL, *s6 = NULL; + unsigned val4 = ROUTES_DEFAULT_MAX_PER_FAMILY, val6 = ROUTES_DEFAULT_MAX_PER_FAMILY; + + if (cached > 0) + return cached; + + if (sysctl_read_ip_property(AF_INET, NULL, "route/max_size", &s4) >= 0) + if (safe_atou(s4, &val4) >= 0 && val4 == 2147483647U) + /* This is the default "no limit" value in the kernel */ + val4 = ROUTES_DEFAULT_MAX_PER_FAMILY; + + if (sysctl_read_ip_property(AF_INET6, NULL, "route/max_size", &s6) >= 0) + (void) safe_atou(s6, &val6); + + cached = MAX(ROUTES_DEFAULT_MAX_PER_FAMILY, val4) + + MAX(ROUTES_DEFAULT_MAX_PER_FAMILY, val6); + return cached; +} +/* Returns true when the route's lifetime_usec is USEC_INFINITY or lies after now(CLOCK_BOOTTIME). */ +static bool route_lifetime_is_valid(const Route *route) { + assert(route); + + return + route->lifetime_usec == USEC_INFINITY || + route->lifetime_usec > now(CLOCK_BOOTTIME); +} +/* Scans link->routes for default-gateway routes: existing, matching family (unless AF_UNSPEC), zero destination and source prefix length, main table, unicast, universe scope, gateway set. With gw == NULL it returns true on the first match. Otherwise *gw is replaced only when it is unset, or when the candidate's gw_weight is not greater and its priority value is strictly lower than those of *gw; returns whether *gw was updated. */ +bool link_find_default_gateway(Link *link, int family, Route **gw) { + bool found = false; + Route *route; + + assert(link); + + SET_FOREACH(route, link->routes) { + if (!route_exists(route)) + continue; + if (family != AF_UNSPEC && route->family != family) + continue; + if (route->dst_prefixlen != 0) + continue; + if (route->src_prefixlen != 0) + continue; + if (route->table != RT_TABLE_MAIN) + continue; + if (route->type != RTN_UNICAST) + continue; + if (route->scope != RT_SCOPE_UNIVERSE) + continue; + if (!in_addr_is_set(route->gw_family, &route->gw)) + continue; + + /* Found a default gateway. */ + if (!gw) + return true; + + /* If we have already found another gw, then let's compare their weight and priority. 
*/ + if (*gw) { + if (route->gw_weight > (*gw)->gw_weight) + continue; + if (route->priority >= (*gw)->priority) + continue; + } + + *gw = route; + found = true; + } + + return found; +} +/* Runs link_find_default_gateway() over every link in m->links_by_index that is not exclude and is in LINK_STATE_CONFIGURED, accumulating the preferred gateway route in one variable. Returns -ENOENT when no gateway was found; otherwise 0, and the owning link is stored in *ret if ret is non-NULL. */ +int manager_find_uplink(Manager *m, int family, Link *exclude, Link **ret) { + Route *gw = NULL; + Link *link; + + assert(m); + assert(IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6)); + + /* Looks for a suitable "uplink", via black magic: an interface that is up and where the + * default route with the highest priority points to. */ + + HASHMAP_FOREACH(link, m->links_by_index) { + if (link == exclude) + continue; + + if (link->state != LINK_STATE_CONFIGURED) + continue; + + link_find_default_gateway(link, family, &gw); + } + + if (!gw) + return -ENOENT; + + if (ret) { + assert(gw->link); + *ret = gw->link; + } + + return 0; +} +/* Decides whether gateway gw can be used on link. Returns true immediately for onlink gateways, an unset gw, or an IPv6 link-local gw. Otherwise it returns true if an existing, lifetime-valid route of the same family with a non-default destination covers gw. If none does and the manager manages foreign routes, it returns false; else the link's ready addresses (same family, without IFA_F_NOPREFIXROUTE) are checked for a covering prefix, using the peer address when set. */ +bool gateway_is_ready(Link *link, bool onlink, int family, const union in_addr_union *gw) { + Route *route; + Address *a; + + assert(link); + assert(link->manager); + + if (onlink) + return true; + + if (!gw || !in_addr_is_set(family, gw)) + return true; + + if (family == AF_INET6 && in6_addr_is_link_local(&gw->in6)) + return true; + + SET_FOREACH(route, link->routes) { + if (!route_exists(route)) + continue; + if (!route_lifetime_is_valid(route)) + continue; + if (route->family != family) + continue; + if (!in_addr_is_set(route->family, &route->dst) && route->dst_prefixlen == 0) + continue; + if (in_addr_prefix_covers(family, &route->dst, route->dst_prefixlen, gw) > 0) + return true; + } + + if (link->manager->manage_foreign_routes) + return false; + + /* If we do not manage foreign routes, then there may exist a prefix route we do not know, + * which was created on configuring an address. Hence, also check the addresses. 
*/ + SET_FOREACH(a, link->addresses) { + if (!address_is_ready(a)) + continue; + if (a->family != family) + continue; + if (FLAGS_SET(a->flags, IFA_F_NOPREFIXROUTE)) + continue; + if (in_addr_prefix_covers(a->family, + in_addr_is_set(a->family, &a->in_addr_peer) ? &a->in_addr_peer : &a->in_addr, + a->prefixlen, gw) > 0) + return true; + } + + return false; +} +/* Searches link->routes for an existing, lifetime-valid unicast route of the given family whose destination prefix covers address. A route is skipped when both prefsrc and the route's prefsrc are set and differ. Among the matches, the one with the lowest priority value wins (the first seen on ties). Returns -ENOENT if nothing matches, otherwise 0 with the route stored in *ret when ret is non-NULL. */ +static int link_address_is_reachable_internal( + Link *link, + int family, + const union in_addr_union *address, + const union in_addr_union *prefsrc, /* optional */ + Route **ret) { + + Route *route, *found = NULL; + + assert(link); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + SET_FOREACH(route, link->routes) { + if (!route_exists(route)) + continue; + + if (!route_lifetime_is_valid(route)) + continue; + + if (route->type != RTN_UNICAST) + continue; + + if (route->family != family) + continue; + + if (in_addr_prefix_covers(family, &route->dst, route->dst_prefixlen, address) <= 0) + continue; + + if (prefsrc && + in_addr_is_set(family, prefsrc) && + in_addr_is_set(family, &route->prefsrc) && + !in_addr_equal(family, prefsrc, &route->prefsrc)) + continue; + + if (found && found->priority <= route->priority) + continue; + + found = route; + } + + if (!found) + return -ENOENT; + + if (ret) + *ret = found; + + return 0; +} +/* Per-link reachability check built on link_address_is_reachable_internal(). If the selected route has no preferred source, *ret is set to NULL and 0 is returned. Otherwise the preferred source is looked up with link_get_address() (errors are propagated), -EBUSY is returned when that address is not ready, and on success it is stored in *ret. */ +int link_address_is_reachable( + Link *link, + int family, + const union in_addr_union *address, + const union in_addr_union *prefsrc, /* optional */ + Address **ret) { + + Route *route; + Address *a; + int r; + + assert(link); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + /* This checks if the address is reachable, and optionally return the Address object of the + * preferred source to access the address. 
*/ + + r = link_address_is_reachable_internal(link, family, address, prefsrc, &route); + if (r < 0) + return r; + + if (!in_addr_is_set(route->family, &route->prefsrc)) { + if (ret) + *ret = NULL; + return 0; + } + + r = link_get_address(link, route->family, &route->prefsrc, 0, &a); + if (r < 0) + return r; + + if (!address_is_ready(a)) + return -EBUSY; + + if (ret) + *ret = a; + + return 0; +} +/* Manager-wide variant of link_address_is_reachable(): runs link_address_is_reachable_internal() on every link in LINK_STATE_CONFIGURING or LINK_STATE_CONFIGURED and keeps the route with the lowest priority value. Returns -ENOENT when no link has a matching route; the preferred-source handling (NULL result, link_get_address() errors, -EBUSY) mirrors the per-link function. */ +int manager_address_is_reachable( + Manager *manager, + int family, + const union in_addr_union *address, + const union in_addr_union *prefsrc, /* optional */ + Address **ret) { + + Route *route, *found = NULL; + Address *a; + Link *link; + int r; + + assert(manager); + + HASHMAP_FOREACH(link, manager->links_by_index) { + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + continue; + + if (link_address_is_reachable_internal(link, family, address, prefsrc, &route) < 0) + continue; + + if (found && found->priority <= route->priority) + continue; + + found = route; + } + + if (!found) + return -ENOENT; + + if (!in_addr_is_set(found->family, &found->prefsrc)) { + if (ret) + *ret = NULL; + return 0; + } + + r = link_get_address(found->link, found->family, &found->prefsrc, 0, &a); + if (r < 0) + return r; + + if (!address_is_ready(a)) + return -EBUSY; + + if (ret) + *ret = a; + + return 0; +} +/* Names for the RTN_* route types, indexed by type value; the lookup functions are generated by DEFINE_STRING_TABLE_LOOKUP below. */ +static const char * const route_type_table[__RTN_MAX] = { + [RTN_UNICAST] = "unicast", + [RTN_LOCAL] = "local", + [RTN_BROADCAST] = "broadcast", + [RTN_ANYCAST] = "anycast", + [RTN_MULTICAST] = "multicast", + [RTN_BLACKHOLE] = "blackhole", + [RTN_UNREACHABLE] = "unreachable", + [RTN_PROHIBIT] = "prohibit", + [RTN_THROW] = "throw", + [RTN_NAT] = "nat", + [RTN_XRESOLVE] = "xresolve", +}; + +assert_cc(__RTN_MAX <= UCHAR_MAX); +DEFINE_STRING_TABLE_LOOKUP(route_type, int); + +static const char * const route_scope_table[] = { + [RT_SCOPE_UNIVERSE] = "global", + [RT_SCOPE_SITE] = "site", + [RT_SCOPE_LINK] = "link", + [RT_SCOPE_HOST] = "host", + [RT_SCOPE_NOWHERE] = 
"nowhere", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(route_scope, int, UINT8_MAX); + +static const char * const route_protocol_table[] = { + [RTPROT_KERNEL] = "kernel", + [RTPROT_BOOT] = "boot", + [RTPROT_STATIC] = "static", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(route_protocol, int, UINT8_MAX); + +static const char * const route_protocol_full_table[] = { + [RTPROT_REDIRECT] = "redirect", + [RTPROT_KERNEL] = "kernel", + [RTPROT_BOOT] = "boot", + [RTPROT_STATIC] = "static", + [RTPROT_GATED] = "gated", + [RTPROT_RA] = "ra", + [RTPROT_MRT] = "mrt", + [RTPROT_ZEBRA] = "zebra", + [RTPROT_BIRD] = "bird", + [RTPROT_DNROUTED] = "dnrouted", + [RTPROT_XORP] = "xorp", + [RTPROT_NTK] = "ntk", + [RTPROT_DHCP] = "dhcp", + [RTPROT_MROUTED] = "mrouted", + [RTPROT_BABEL] = "babel", + [RTPROT_BGP] = "bgp", + [RTPROT_ISIS] = "isis", + [RTPROT_OSPF] = "ospf", + [RTPROT_RIP] = "rip", + [RTPROT_EIGRP] = "eigrp", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(route_protocol_full, int, UINT8_MAX); +/* Builds a comma-separated list of the names of all RTNH_F_* bits set in flags and stores the newly allocated string in *ret. The map is indexed by bit position (LOG2U of the flag). When no known flag is set, *ret is set to NULL and 0 is still returned. Returns -ENOMEM on allocation failure. */ +int route_flags_to_string_alloc(uint32_t flags, char **ret) { + _cleanup_free_ char *str = NULL; + static const char* map[] = { + [LOG2U(RTNH_F_DEAD)] = "dead", /* Nexthop is dead (used by multipath) */ + [LOG2U(RTNH_F_PERVASIVE)] = "pervasive", /* Do recursive gateway lookup */ + [LOG2U(RTNH_F_ONLINK)] = "onlink" , /* Gateway is forced on link */ + [LOG2U(RTNH_F_OFFLOAD)] = "offload", /* Nexthop is offloaded */ + [LOG2U(RTNH_F_LINKDOWN)] = "linkdown", /* carrier-down on nexthop */ + [LOG2U(RTNH_F_UNRESOLVED)] = "unresolved", /* The entry is unresolved (ipmr) */ + [LOG2U(RTNH_F_TRAP)] = "trap", /* Nexthop is trapping packets */ + }; + + assert(ret); + + for (size_t i = 0; i < ELEMENTSOF(map); i++) + if (FLAGS_SET(flags, 1 << i) && map[i]) + if (!strextend_with_separator(&str, ",", map[i])) + return -ENOMEM; + + *ret = TAKE_PTR(str); + return 0; +} +/* Names of the predefined routing tables; user-defined names live in the Manager's hashmaps instead. */ +static const char * const route_table_table[] = { + [RT_TABLE_DEFAULT] = "default", + [RT_TABLE_MAIN] = "main", + 
[RT_TABLE_LOCAL] = "local", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(route_table, int); +/* Resolves a routing table given as string s to its number. Tried in order: the predefined names (route_table_from_string()), the user-defined names in m->route_table_numbers_by_name, and finally a plain number via safe_atou32(). A numeric 0 is rejected with -ERANGE; parse errors are returned as is. */ +int manager_get_route_table_from_string(const Manager *m, const char *s, uint32_t *ret) { + uint32_t t; + int r; + + assert(m); + assert(s); + assert(ret); + + r = route_table_from_string(s); + if (r >= 0) { + *ret = (uint32_t) r; + return 0; + } + + t = PTR_TO_UINT32(hashmap_get(m->route_table_numbers_by_name, s)); + if (t != 0) { + *ret = t; + return 0; + } + + r = safe_atou32(s, &t); + if (r < 0) + return r; + + if (t == 0) + return -ERANGE; + + *ret = t; + return 0; +} +/* Formats routing table number table as a newly allocated string in *ret. The name is taken from the predefined table or, failing that, from m->route_table_names_by_number. With a name and append_num false, the result is the name alone; with a name and append_num true it is "name(number)"; without a name it is the bare number. Returns -ENOMEM on allocation failure. */ +int manager_get_route_table_to_string(const Manager *m, uint32_t table, bool append_num, char **ret) { + _cleanup_free_ char *str = NULL; + const char *s; + + assert(m); + assert(ret); + + /* Unlike manager_get_route_table_from_string(), this accepts 0, as the kernel may create routes with + * table 0. See issue #25089. */ + + s = route_table_to_string(table); + if (!s) + s = hashmap_get(m->route_table_names_by_number, UINT32_TO_PTR(table)); + + if (s && !append_num) { + str = strdup(s); + if (!str) + return -ENOMEM; + + } else if (asprintf(&str, "%s%s%" PRIu32 "%s", + strempty(s), + s ? "(" : "", + table, + s ? 
")" : "") < 0) + return -ENOMEM; + + *ret = TAKE_PTR(str); + return 0; +} +/* Parses a whitespace-separated list of "name:number" pairs into the Manager's two hashmaps (name -> number and number -> name); userdata is the Manager. An empty rvalue frees both maps. A pair is logged and skipped when the name is empty, purely numeric or predefined, when the number is unparsable, zero or already predefined, or when it conflicts with a stored entry. If inserting into the second map fails, the entry just added to the first map is removed again. */ +int config_parse_route_table_names( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + m->route_table_names_by_number = hashmap_free(m->route_table_names_by_number); + m->route_table_numbers_by_name = hashmap_free(m->route_table_numbers_by_name); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *name = NULL; + uint32_t table; + char *num; + + r = extract_first_word(&p, &name, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid RouteTable=, ignoring assignment: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + num = strchr(name, ':'); + if (!num) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid route table name and number pair, ignoring assignment: %s", name); + continue; + } + + *num++ = '\0'; + + if (isempty(name)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Route table name cannot be empty. Ignoring assignment: %s:%s", name, num); + continue; + } + if (in_charset(name, DIGITS)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Route table name cannot be numeric. Ignoring assignment: %s:%s", name, num); + continue; + } + if (route_table_from_string(name) >= 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Route table name %s is predefined for %i. 
Ignoring assignment: %s:%s", + name, route_table_from_string(name), name, num); + continue; + } + + r = safe_atou32(num, &table); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse route table number '%s', ignoring assignment: %s:%s", num, name, num); + continue; + } + if (table == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid route table number, ignoring assignment: %s:%s", name, num); + continue; + } + if (route_table_to_string(table)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Route table name for %s is predefined (%s). Ignoring assignment: %s:%s", + num, route_table_to_string(table), name, num); + continue; + } + + r = hashmap_ensure_put(&m->route_table_numbers_by_name, &string_hash_ops_free, name, UINT32_TO_PTR(table)); + if (r == -ENOMEM) + return log_oom(); + if (r == -EEXIST) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Specified route table name and number pair conflicts with others, ignoring assignment: %s:%s", name, num); + continue; + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store route table name and number pair, ignoring assignment: %s:%s", name, num); + continue; + } + if (r == 0) + /* The entry is duplicated. It should not be added to route_table_names_by_number hashmap. 
*/ + continue; + + r = hashmap_ensure_put(&m->route_table_names_by_number, NULL, UINT32_TO_PTR(table), name); + if (r < 0) { + hashmap_remove(m->route_table_numbers_by_name, name); + + if (r == -ENOMEM) + return log_oom(); + if (r == -EEXIST) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Specified route table name and number pair conflicts with others, ignoring assignment: %s:%s", name, num); + else + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store route table name and number pair, ignoring assignment: %s:%s", name, num); + continue; + } + assert(r > 0); + + TAKE_PTR(name); + } +} diff --git a/src/network/networkd-route-util.h b/src/network/networkd-route-util.h new file mode 100644 index 0000000..f326888 --- /dev/null +++ b/src/network/networkd-route-util.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "conf-parser.h" + +typedef struct Link Link; +typedef struct Manager Manager; +typedef struct Address Address; +typedef struct Route Route; + +unsigned routes_max(void); +/* link_has_default_gateway() below is the gw == NULL form of link_find_default_gateway(): it only reports whether a matching route exists. */ +bool link_find_default_gateway(Link *link, int family, Route **gw); +static inline bool link_has_default_gateway(Link *link, int family) { + return link_find_default_gateway(link, family, NULL); +} + +int manager_find_uplink(Manager *m, int family, Link *exclude, Link **ret); + +bool gateway_is_ready(Link *link, bool onlink, int family, const union in_addr_union *gw); + +int link_address_is_reachable( + Link *link, + int family, + const union in_addr_union *address, + const union in_addr_union *prefsrc, /* optional */ + Address **ret); + +int manager_address_is_reachable( + Manager *manager, + int family, + const union in_addr_union *address, + const union in_addr_union *prefsrc, /* optional */ + Address **ret); + +int route_type_from_string(const char *s) _pure_; +const char *route_type_to_string(int t) _const_; + +int route_scope_from_string(const char *s); +int route_scope_to_string_alloc(int t, 
char **ret); + +int route_protocol_from_string(const char *s); +int route_protocol_to_string_alloc(int t, char **ret); +int route_protocol_full_from_string(const char *s); +int route_protocol_full_to_string_alloc(int t, char **ret); + +int route_flags_to_string_alloc(uint32_t flags, char **ret); + +int manager_get_route_table_from_string(const Manager *m, const char *table, uint32_t *ret); +int manager_get_route_table_to_string(const Manager *m, uint32_t table, bool append_num, char **ret); + +CONFIG_PARSER_PROTOTYPE(config_parse_route_table_names); diff --git a/src/network/networkd-route.c b/src/network/networkd-route.c new file mode 100644 index 0000000..eb502ae --- /dev/null +++ b/src/network/networkd-route.c @@ -0,0 +1,3148 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "event-util.h" +#include "netlink-util.h" +#include "networkd-address.h" +#include "networkd-ipv4ll.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-nexthop.h" +#include "networkd-queue.h" +#include "networkd-route-util.h" +#include "networkd-route.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" +#include "vrf.h" +#include "wireguard.h" + +int route_new(Route **ret) { + _cleanup_(route_freep) Route *route = NULL; + + route = new(Route, 1); + if (!route) + return -ENOMEM; + + *route = (Route) { + .family = AF_UNSPEC, + .scope = RT_SCOPE_UNIVERSE, + .protocol = RTPROT_UNSPEC, + .type = RTN_UNICAST, + .table = RT_TABLE_MAIN, + .lifetime_usec = USEC_INFINITY, + .quickack = -1, + .fast_open_no_cookie = -1, + .gateway_onlink = -1, + .ttl_propagate = -1, + }; + + *ret = TAKE_PTR(route); + + return 0; +} + +static int route_new_static(Network *network, const char *filename, unsigned section_line, Route **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(route_freep) Route *route = NULL; + int r; + + assert(network); + assert(ret); + 
assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + route = hashmap_get(network->routes_by_section, n); + if (route) { + *ret = TAKE_PTR(route); + return 0; + } + + if (hashmap_size(network->routes_by_section) >= routes_max()) + return -E2BIG; + + r = route_new(&route); + if (r < 0) + return r; + + route->protocol = RTPROT_STATIC; + route->network = network; + route->section = TAKE_PTR(n); + route->source = NETWORK_CONFIG_SOURCE_STATIC; + + r = hashmap_ensure_put(&network->routes_by_section, &config_section_hash_ops, route->section, route); + if (r < 0) + return r; + + *ret = TAKE_PTR(route); + return 0; +} + +Route *route_free(Route *route) { + if (!route) + return NULL; + + if (route->network) { + assert(route->section); + hashmap_remove(route->network->routes_by_section, route->section); + } + + config_section_free(route->section); + + if (route->link) + set_remove(route->link->routes, route); + + if (route->manager) + set_remove(route->manager->routes, route); + + ordered_set_free_with_destructor(route->multipath_routes, multipath_route_free); + + sd_event_source_disable_unref(route->expire); + + free(route->tcp_congestion_control_algo); + + return mfree(route); +} + +static void route_hash_func(const Route *route, struct siphash *state) { + assert(route); + + siphash24_compress(&route->family, sizeof(route->family), state); + + switch (route->family) { + case AF_INET: + case AF_INET6: + siphash24_compress(&route->dst_prefixlen, sizeof(route->dst_prefixlen), state); + siphash24_compress(&route->dst, FAMILY_ADDRESS_SIZE(route->family), state); + + siphash24_compress(&route->src_prefixlen, sizeof(route->src_prefixlen), state); + siphash24_compress(&route->src, FAMILY_ADDRESS_SIZE(route->family), state); + + siphash24_compress(&route->gw_family, sizeof(route->gw_family), state); + if (IN_SET(route->gw_family, AF_INET, AF_INET6)) { + siphash24_compress(&route->gw, 
FAMILY_ADDRESS_SIZE(route->gw_family), state); + siphash24_compress(&route->gw_weight, sizeof(route->gw_weight), state); + } + + siphash24_compress(&route->prefsrc, FAMILY_ADDRESS_SIZE(route->family), state); + + siphash24_compress(&route->tos, sizeof(route->tos), state); + siphash24_compress(&route->priority, sizeof(route->priority), state); + siphash24_compress(&route->table, sizeof(route->table), state); + siphash24_compress(&route->protocol, sizeof(route->protocol), state); + siphash24_compress(&route->scope, sizeof(route->scope), state); + siphash24_compress(&route->type, sizeof(route->type), state); + + siphash24_compress(&route->initcwnd, sizeof(route->initcwnd), state); + siphash24_compress(&route->initrwnd, sizeof(route->initrwnd), state); + + siphash24_compress(&route->advmss, sizeof(route->advmss), state); + siphash24_compress(&route->nexthop_id, sizeof(route->nexthop_id), state); + + break; + default: + /* treat any other address family as AF_UNSPEC */ + break; + } +} + +static int route_compare_func(const Route *a, const Route *b) { + int r; + + r = CMP(a->family, b->family); + if (r != 0) + return r; + + switch (a->family) { + case AF_INET: + case AF_INET6: + r = CMP(a->dst_prefixlen, b->dst_prefixlen); + if (r != 0) + return r; + + r = memcmp(&a->dst, &b->dst, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + r = CMP(a->src_prefixlen, b->src_prefixlen); + if (r != 0) + return r; + + r = memcmp(&a->src, &b->src, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + r = CMP(a->gw_family, b->gw_family); + if (r != 0) + return r; + + if (IN_SET(a->gw_family, AF_INET, AF_INET6)) { + r = memcmp(&a->gw, &b->gw, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + r = CMP(a->gw_weight, b->gw_weight); + if (r != 0) + return r; + } + + r = memcmp(&a->prefsrc, &b->prefsrc, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + r = CMP(a->tos, b->tos); + if (r != 0) + return r; + + r = CMP(a->priority, b->priority); + if 
(r != 0) + return r; + + r = CMP(a->table, b->table); + if (r != 0) + return r; + + r = CMP(a->protocol, b->protocol); + if (r != 0) + return r; + + r = CMP(a->scope, b->scope); + if (r != 0) + return r; + + r = CMP(a->type, b->type); + if (r != 0) + return r; + + r = CMP(a->initcwnd, b->initcwnd); + if (r != 0) + return r; + + r = CMP(a->initrwnd, b->initrwnd); + if (r != 0) + return r; + + r = CMP(a->advmss, b->advmss); + if (r != 0) + return r; + + r = CMP(a->nexthop_id, b->nexthop_id); + if (r != 0) + return r; + + return 0; + default: + /* treat any other address family as AF_UNSPEC */ + return 0; + } +} + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + route_hash_ops, + Route, + route_hash_func, + route_compare_func, + route_free); + +static bool route_type_is_reject(const Route *route) { + assert(route); + + return IN_SET(route->type, RTN_UNREACHABLE, RTN_PROHIBIT, RTN_BLACKHOLE, RTN_THROW); +} + +static bool route_needs_convert(const Route *route) { + assert(route); + + return route->nexthop_id > 0 || !ordered_set_isempty(route->multipath_routes); +} + +static int route_add(Manager *manager, Link *link, Route *route) { + int r; + + assert(route); + + if (route_type_is_reject(route)) { + assert(manager); + + r = set_ensure_put(&manager->routes, &route_hash_ops, route); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + route->manager = manager; + } else { + assert(link); + + r = set_ensure_put(&link->routes, &route_hash_ops, route); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + route->link = link; + } + + return 0; +} + +int route_get(Manager *manager, Link *link, const Route *in, Route **ret) { + Route *route; + + assert(in); + + if (route_type_is_reject(in)) { + if (!manager) + return -ENOENT; + + route = set_get(manager->routes, in); + } else { + if (!link) + return -ENOENT; + + route = set_get(link->routes, in); + } + if (!route) + return -ENOENT; + + if (ret) + *ret = route; + + return 0; +} + +int route_dup(const Route *src, Route 
**ret) { + _cleanup_(route_freep) Route *dest = NULL; + int r; + + /* This does not copy mulipath routes. */ + + assert(src); + assert(ret); + + dest = newdup(Route, src, 1); + if (!dest) + return -ENOMEM; + + /* Unset all pointers */ + dest->network = NULL; + dest->section = NULL; + dest->link = NULL; + dest->manager = NULL; + dest->multipath_routes = NULL; + dest->expire = NULL; + dest->tcp_congestion_control_algo = NULL; + + r = free_and_strdup(&dest->tcp_congestion_control_algo, src->tcp_congestion_control_algo); + if (r < 0) + return r; + + *ret = TAKE_PTR(dest); + return 0; +} + +static void route_apply_nexthop(Route *route, const NextHop *nh, uint8_t nh_weight) { + assert(route); + assert(nh); + assert(hashmap_isempty(nh->group)); + + route->gw_family = nh->family; + route->gw = nh->gw; + + if (nh_weight != UINT8_MAX) + route->gw_weight = nh_weight; + + if (nh->blackhole) + route->type = RTN_BLACKHOLE; +} + +static void route_apply_multipath_route(Route *route, const MultipathRoute *m) { + assert(route); + assert(m); + + route->gw_family = m->gateway.family; + route->gw = m->gateway.address; + route->gw_weight = m->weight; +} + +static int multipath_route_get_link(Manager *manager, const MultipathRoute *m, Link **ret) { + int r; + + assert(manager); + assert(m); + + if (m->ifname) { + r = link_get_by_name(manager, m->ifname, ret); + return r < 0 ? r : 1; + + } else if (m->ifindex > 0) { /* Always ignore ifindex if ifname is set. */ + r = link_get_by_index(manager, m->ifindex, ret); + return r < 0 ? 
r : 1; + } + + if (ret) + *ret = NULL; + return 0; +} + +typedef struct ConvertedRoutes { + size_t n; + Route **routes; + Link **links; +} ConvertedRoutes; + +static ConvertedRoutes *converted_routes_free(ConvertedRoutes *c) { + if (!c) + return NULL; + + for (size_t i = 0; i < c->n; i++) + route_free(c->routes[i]); + + free(c->routes); + free(c->links); + + return mfree(c); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(ConvertedRoutes*, converted_routes_free); + +static int converted_routes_new(size_t n, ConvertedRoutes **ret) { + _cleanup_(converted_routes_freep) ConvertedRoutes *c = NULL; + _cleanup_free_ Route **routes = NULL; + _cleanup_free_ Link **links = NULL; + + assert(n > 0); + assert(ret); + + routes = new0(Route*, n); + if (!routes) + return -ENOMEM; + + links = new0(Link*, n); + if (!links) + return -ENOMEM; + + c = new(ConvertedRoutes, 1); + if (!c) + return -ENOMEM; + + *c = (ConvertedRoutes) { + .n = n, + .routes = TAKE_PTR(routes), + .links = TAKE_PTR(links), + }; + + *ret = TAKE_PTR(c); + return 0; +} + +static int route_convert(Manager *manager, const Route *route, ConvertedRoutes **ret) { + _cleanup_(converted_routes_freep) ConvertedRoutes *c = NULL; + int r; + + assert(manager); + assert(route); + assert(ret); + + if (!route_needs_convert(route)) { + *ret = NULL; + return 0; + } + + if (route->nexthop_id > 0) { + struct nexthop_grp *nhg; + NextHop *nh; + + r = manager_get_nexthop_by_id(manager, route->nexthop_id, &nh); + if (r < 0) + return r; + + if (hashmap_isempty(nh->group)) { + r = converted_routes_new(1, &c); + if (r < 0) + return r; + + r = route_dup(route, &c->routes[0]); + if (r < 0) + return r; + + route_apply_nexthop(c->routes[0], nh, UINT8_MAX); + c->links[0] = nh->link; + + *ret = TAKE_PTR(c); + return 1; + } + + r = converted_routes_new(hashmap_size(nh->group), &c); + if (r < 0) + return r; + + size_t i = 0; + HASHMAP_FOREACH(nhg, nh->group) { + NextHop *h; + + r = manager_get_nexthop_by_id(manager, nhg->id, &h); + if (r < 0) + return r; + + 
r = route_dup(route, &c->routes[i]); + if (r < 0) + return r; + + route_apply_nexthop(c->routes[i], h, nhg->weight); + c->links[i] = h->link; + + i++; + } + + *ret = TAKE_PTR(c); + return 1; + + } + + assert(!ordered_set_isempty(route->multipath_routes)); + + r = converted_routes_new(ordered_set_size(route->multipath_routes), &c); + if (r < 0) + return r; + + size_t i = 0; + MultipathRoute *m; + ORDERED_SET_FOREACH(m, route->multipath_routes) { + r = route_dup(route, &c->routes[i]); + if (r < 0) + return r; + + route_apply_multipath_route(c->routes[i], m); + + r = multipath_route_get_link(manager, m, &c->links[i]); + if (r < 0) + return r; + + i++; + } + + *ret = TAKE_PTR(c); + return 1; +} + +void link_mark_routes(Link *link, NetworkConfigSource source) { + Route *route; + + assert(link); + + SET_FOREACH(route, link->routes) { + if (route->source != source) + continue; + + route_mark(route); + } +} + +static void log_route_debug(const Route *route, const char *str, const Link *link, const Manager *manager) { + _cleanup_free_ char *state = NULL, *gw_alloc = NULL, *prefsrc = NULL, + *table = NULL, *scope = NULL, *proto = NULL, *flags = NULL; + const char *gw = NULL, *dst, *src; + + assert(route); + assert(str); + assert(manager); + + /* link may be NULL. */ + + if (!DEBUG_LOGGING) + return; + + (void) network_config_state_to_string_alloc(route->state, &state); + + dst = in_addr_is_set(route->family, &route->dst) || route->dst_prefixlen > 0 ? + IN_ADDR_PREFIX_TO_STRING(route->family, &route->dst, route->dst_prefixlen) : NULL; + src = in_addr_is_set(route->family, &route->src) || route->src_prefixlen > 0 ? 
+ IN_ADDR_PREFIX_TO_STRING(route->family, &route->src, route->src_prefixlen) : NULL; + + if (in_addr_is_set(route->gw_family, &route->gw)) { + (void) in_addr_to_string(route->gw_family, &route->gw, &gw_alloc); + gw = gw_alloc; + } else if (route->gateway_from_dhcp_or_ra) { + if (route->gw_family == AF_INET) + gw = "_dhcp4"; + else if (route->gw_family == AF_INET6) + gw = "_ipv6ra"; + } else { + MultipathRoute *m; + + ORDERED_SET_FOREACH(m, route->multipath_routes) { + _cleanup_free_ char *buf = NULL; + union in_addr_union a = m->gateway.address; + + (void) in_addr_to_string(m->gateway.family, &a, &buf); + (void) strextend_with_separator(&gw_alloc, ",", strna(buf)); + if (m->ifname) + (void) strextend(&gw_alloc, "@", m->ifname); + else if (m->ifindex > 0) + (void) strextendf(&gw_alloc, "@%i", m->ifindex); + /* See comments in config_parse_multipath_route(). */ + (void) strextendf(&gw_alloc, ":%"PRIu32, m->weight + 1); + } + gw = gw_alloc; + } + if (in_addr_is_set(route->family, &route->prefsrc)) + (void) in_addr_to_string(route->family, &route->prefsrc, &prefsrc); + (void) route_scope_to_string_alloc(route->scope, &scope); + (void) manager_get_route_table_to_string(manager, route->table, /* append_num = */ true, &table); + (void) route_protocol_full_to_string_alloc(route->protocol, &proto); + (void) route_flags_to_string_alloc(route->flags, &flags); + + log_link_debug(link, + "%s %s route (%s): dst: %s, src: %s, gw: %s, prefsrc: %s, scope: %s, table: %s, " + "proto: %s, type: %s, nexthop: %"PRIu32", priority: %"PRIu32", flags: %s", + str, strna(network_config_source_to_string(route->source)), strna(state), + strna(dst), strna(src), strna(gw), strna(prefsrc), + strna(scope), strna(table), strna(proto), + strna(route_type_to_string(route->type)), + route->nexthop_id, route->priority, strna(flags)); +} + +static int route_set_netlink_message(const Route *route, sd_netlink_message *req, Link *link) { + int r; + + assert(route); + assert(req); + + /* link may be NULL */ 
+ + if (in_addr_is_set(route->gw_family, &route->gw) && route->nexthop_id == 0) { + if (route->gw_family == route->family) { + r = netlink_message_append_in_addr_union(req, RTA_GATEWAY, route->gw_family, &route->gw); + if (r < 0) + return r; + } else { + RouteVia rtvia = { + .family = route->gw_family, + .address = route->gw, + }; + + r = sd_netlink_message_append_data(req, RTA_VIA, &rtvia, sizeof(rtvia)); + if (r < 0) + return r; + } + } + + if (route->dst_prefixlen > 0) { + r = netlink_message_append_in_addr_union(req, RTA_DST, route->family, &route->dst); + if (r < 0) + return r; + + r = sd_rtnl_message_route_set_dst_prefixlen(req, route->dst_prefixlen); + if (r < 0) + return r; + } + + if (route->src_prefixlen > 0) { + r = netlink_message_append_in_addr_union(req, RTA_SRC, route->family, &route->src); + if (r < 0) + return r; + + r = sd_rtnl_message_route_set_src_prefixlen(req, route->src_prefixlen); + if (r < 0) + return r; + } + + if (in_addr_is_set(route->family, &route->prefsrc)) { + r = netlink_message_append_in_addr_union(req, RTA_PREFSRC, route->family, &route->prefsrc); + if (r < 0) + return r; + } + + r = sd_rtnl_message_route_set_scope(req, route->scope); + if (r < 0) + return r; + + r = sd_rtnl_message_route_set_flags(req, route->flags & RTNH_F_ONLINK); + if (r < 0) + return r; + + if (route->table < 256) { + r = sd_rtnl_message_route_set_table(req, route->table); + if (r < 0) + return r; + } else { + r = sd_rtnl_message_route_set_table(req, RT_TABLE_UNSPEC); + if (r < 0) + return r; + + /* Table attribute to allow more than 256. 
*/ + r = sd_netlink_message_append_u32(req, RTA_TABLE, route->table); + if (r < 0) + return r; + } + + if (!route_type_is_reject(route) && + route->nexthop_id == 0 && + ordered_set_isempty(route->multipath_routes)) { + assert(link); /* Those routes must be attached to a specific link */ + + r = sd_netlink_message_append_u32(req, RTA_OIF, link->ifindex); + if (r < 0) + return r; + } + + if (route->nexthop_id > 0) { + r = sd_netlink_message_append_u32(req, RTA_NH_ID, route->nexthop_id); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u8(req, RTA_PREF, route->pref); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(req, RTA_PRIORITY, route->priority); + if (r < 0) + return r; + + return 0; +} + +static int route_remove_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + + /* link may be NULL. */ + + if (link && IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -ESRCH) + log_link_message_warning_errno(link, m, r, "Could not drop route, ignoring"); + + return 1; +} + +int route_remove(Route *route) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + unsigned char type; + Manager *manager; + Link *link; + int r; + + assert(route); + assert(route->manager || (route->link && route->link->manager)); + assert(IN_SET(route->family, AF_INET, AF_INET6)); + + link = route->link; + manager = route->manager ?: link->manager; + + log_route_debug(route, "Removing", link, manager); + + r = sd_rtnl_message_new_route(manager->rtnl, &req, + RTM_DELROUTE, route->family, + route->protocol); + if (r < 0) + return log_link_error_errno(link, r, "Could not create netlink message: %m"); + + if (route->family == AF_INET && route->nexthop_id > 0 && route->type == RTN_BLACKHOLE) + /* When IPv4 route has nexthop id and the nexthop type is blackhole, even though kernel + * sends RTM_NEWROUTE netlink message with blackhole type, 
kernel's internal route type + * fib_rt_info::type may not be blackhole. Thus, we cannot know the internal value. + * Moreover, on route removal, the matching is done with the hidden value if we set + * non-zero type in RTM_DELROUTE message. Note, sd_rtnl_message_new_route() sets + * RTN_UNICAST by default. So, we need to clear the type here. */ + type = RTN_UNSPEC; + else + type = route->type; + + r = sd_rtnl_message_route_set_type(req, type); + if (r < 0) + return log_link_error_errno(link, r, "Could not set route type: %m"); + + r = route_set_netlink_message(route, req, link); + if (r < 0) + return log_error_errno(r, "Could not fill netlink message: %m"); + + r = netlink_call_async(manager->rtnl, NULL, req, route_remove_handler, + link ? link_netlink_destroy_callback : NULL, link); + if (r < 0) + return log_link_error_errno(link, r, "Could not send netlink message: %m"); + + link_ref(link); + + route_enter_removing(route); + return 0; +} + +int route_remove_and_drop(Route *route) { + if (!route) + return 0; + + route_cancel_request(route, NULL); + + if (route_exists(route)) + return route_remove(route); + + if (route->state == 0) + route_free(route); + + return 0; +} + +static void manager_mark_routes(Manager *manager, bool foreign, const Link *except) { + Route *route; + Link *link; + int r; + + assert(manager); + + /* First, mark all routes. */ + SET_FOREACH(route, manager->routes) { + /* Do not touch routes managed by the kernel. */ + if (route->protocol == RTPROT_KERNEL) + continue; + + /* When 'foreign' is true, mark only foreign routes, and vice versa. */ + if (foreign != (route->source == NETWORK_CONFIG_SOURCE_FOREIGN)) + continue; + + /* Do not touch dynamic routes. They will removed by dhcp_pd_prefix_lost() */ + if (IN_SET(route->source, NETWORK_CONFIG_SOURCE_DHCP4, NETWORK_CONFIG_SOURCE_DHCP6)) + continue; + + /* Ignore routes not assigned yet or already removed. 
*/ + if (!route_exists(route)) + continue; + + route_mark(route); + } + + /* Then, unmark all routes requested by active links. */ + HASHMAP_FOREACH(link, manager->links_by_index) { + if (link == except) + continue; + + if (!link->network) + continue; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + continue; + + HASHMAP_FOREACH(route, link->network->routes_by_section) { + _cleanup_(converted_routes_freep) ConvertedRoutes *converted = NULL; + Route *existing; + + r = route_convert(manager, route, &converted); + if (r < 0) + continue; + if (r == 0) { + if (route_get(manager, NULL, route, &existing) >= 0) + route_unmark(existing); + continue; + } + + for (size_t i = 0; i < converted->n; i++) + if (route_get(manager, NULL, converted->routes[i], &existing) >= 0) + route_unmark(existing); + } + } +} + +static int manager_drop_marked_routes(Manager *manager) { + Route *route; + int r = 0; + + assert(manager); + + SET_FOREACH(route, manager->routes) { + if (!route_is_marked(route)) + continue; + + RET_GATHER(r, route_remove(route)); + } + + return r; +} + +static bool route_by_kernel(const Route *route) { + assert(route); + + if (route->protocol == RTPROT_KERNEL) + return true; + + /* The kernels older than a826b04303a40d52439aa141035fca5654ccaccd (v5.11) create the IPv6 + * multicast with RTPROT_BOOT. Do not touch it. 
*/ + if (route->protocol == RTPROT_BOOT && + route->family == AF_INET6 && + route->dst_prefixlen == 8 && + in6_addr_equal(&route->dst.in6, & (struct in6_addr) {{{ 0xff,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 }}})) + return true; + + return false; +} + +static void link_unmark_wireguard_routes(Link *link) { + assert(link); + + if (!link->netdev || link->netdev->kind != NETDEV_KIND_WIREGUARD) + return; + + Route *route, *existing; + Wireguard *w = WIREGUARD(link->netdev); + + SET_FOREACH(route, w->routes) + if (route_get(NULL, link, route, &existing) >= 0) + route_unmark(existing); +} + +int link_drop_foreign_routes(Link *link) { + Route *route; + int r; + + assert(link); + assert(link->manager); + assert(link->network); + + SET_FOREACH(route, link->routes) { + /* do not touch routes managed by the kernel */ + if (route_by_kernel(route)) + continue; + + /* Do not remove routes we configured. */ + if (route->source != NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + /* Ignore routes not assigned yet or already removed. 
*/ + if (!route_exists(route)) + continue; + + if (route->protocol == RTPROT_STATIC && + FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_STATIC)) + continue; + + if (route->protocol == RTPROT_DHCP && + FLAGS_SET(link->network->keep_configuration, KEEP_CONFIGURATION_DHCP)) + continue; + + route_mark(route); + } + + HASHMAP_FOREACH(route, link->network->routes_by_section) { + _cleanup_(converted_routes_freep) ConvertedRoutes *converted = NULL; + Route *existing; + + r = route_convert(link->manager, route, &converted); + if (r < 0) + continue; + if (r == 0) { + if (route_get(NULL, link, route, &existing) >= 0) + route_unmark(existing); + continue; + } + + for (size_t i = 0; i < converted->n; i++) + if (route_get(NULL, link, converted->routes[i], &existing) >= 0) + route_unmark(existing); + } + + link_unmark_wireguard_routes(link); + + r = 0; + SET_FOREACH(route, link->routes) { + if (!route_is_marked(route)) + continue; + + RET_GATHER(r, route_remove(route)); + } + + manager_mark_routes(link->manager, /* foreign = */ true, NULL); + + return RET_GATHER(r, manager_drop_marked_routes(link->manager)); +} + +int link_drop_managed_routes(Link *link) { + Route *route; + int r = 0; + + assert(link); + + SET_FOREACH(route, link->routes) { + /* do not touch routes managed by the kernel */ + if (route_by_kernel(route)) + continue; + + /* Do not touch routes managed by kernel or other tools. 
*/ + if (route->source == NETWORK_CONFIG_SOURCE_FOREIGN) + continue; + + if (!route_exists(route)) + continue; + + RET_GATHER(r, route_remove(route)); + } + + manager_mark_routes(link->manager, /* foreign = */ false, link); + + return RET_GATHER(r, manager_drop_marked_routes(link->manager)); +} + +void link_foreignize_routes(Link *link) { + Route *route; + + assert(link); + + SET_FOREACH(route, link->routes) + route->source = NETWORK_CONFIG_SOURCE_FOREIGN; + + manager_mark_routes(link->manager, /* foreign = */ false, link); + + SET_FOREACH(route, link->manager->routes) { + if (!route_is_marked(route)) + continue; + + route->source = NETWORK_CONFIG_SOURCE_FOREIGN; + } +} + +static int route_expire_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Route *route = ASSERT_PTR(userdata); + Link *link; + int r; + + assert(route->manager || (route->link && route->link->manager)); + + link = route->link; /* This may be NULL. */ + + r = route_remove(route); + if (r < 0) { + log_link_warning_errno(link, r, "Could not remove route: %m"); + if (link) + link_enter_failed(link); + } + + return 1; +} + +static int route_setup_timer(Route *route, const struct rta_cacheinfo *cacheinfo) { + Manager *manager; + int r; + + assert(route); + assert(route->manager || (route->link && route->link->manager)); + + manager = route->manager ?: route->link->manager; + + if (route->lifetime_usec == USEC_INFINITY) + return 0; + + if (cacheinfo && cacheinfo->rta_expires != 0) + /* Assume that non-zero rta_expires means kernel will handle the route expiration. 
*/ + return 0; + + r = event_reset_time(manager->event, &route->expire, CLOCK_BOOTTIME, + route->lifetime_usec, 0, route_expire_handler, route, 0, "route-expiration", true); + if (r < 0) + return r; + + return 1; +} + +static int append_nexthop_one(const Link *link, const Route *route, const MultipathRoute *m, struct rtattr **rta, size_t offset) { + struct rtnexthop *rtnh; + struct rtattr *new_rta; + int r; + + assert(route); + assert(m); + assert(rta); + assert(*rta); + + new_rta = realloc(*rta, RTA_ALIGN((*rta)->rta_len) + RTA_SPACE(sizeof(struct rtnexthop))); + if (!new_rta) + return -ENOMEM; + *rta = new_rta; + + rtnh = (struct rtnexthop *)((uint8_t *) *rta + offset); + *rtnh = (struct rtnexthop) { + .rtnh_len = sizeof(*rtnh), + .rtnh_ifindex = m->ifindex > 0 ? m->ifindex : link->ifindex, + .rtnh_hops = m->weight, + }; + + (*rta)->rta_len += sizeof(struct rtnexthop); + + if (route->family == m->gateway.family) { + r = rtattr_append_attribute(rta, RTA_GATEWAY, &m->gateway.address, FAMILY_ADDRESS_SIZE(m->gateway.family)); + if (r < 0) + goto clear; + rtnh = (struct rtnexthop *)((uint8_t *) *rta + offset); + rtnh->rtnh_len += RTA_SPACE(FAMILY_ADDRESS_SIZE(m->gateway.family)); + } else { + r = rtattr_append_attribute(rta, RTA_VIA, &m->gateway, FAMILY_ADDRESS_SIZE(m->gateway.family) + sizeof(m->gateway.family)); + if (r < 0) + goto clear; + rtnh = (struct rtnexthop *)((uint8_t *) *rta + offset); + rtnh->rtnh_len += RTA_SPACE(FAMILY_ADDRESS_SIZE(m->gateway.family) + sizeof(m->gateway.family)); + } + + return 0; + +clear: + (*rta)->rta_len -= sizeof(struct rtnexthop); + return r; +} + +static int append_nexthops(const Link *link, const Route *route, sd_netlink_message *req) { + _cleanup_free_ struct rtattr *rta = NULL; + struct rtnexthop *rtnh; + MultipathRoute *m; + size_t offset; + int r; + + assert(link); + assert(route); + assert(req); + + if (ordered_set_isempty(route->multipath_routes)) + return 0; + + rta = new(struct rtattr, 1); + if (!rta) + return -ENOMEM; + 
+ *rta = (struct rtattr) { + .rta_type = RTA_MULTIPATH, + .rta_len = RTA_LENGTH(0), + }; + offset = (uint8_t *) RTA_DATA(rta) - (uint8_t *) rta; + + ORDERED_SET_FOREACH(m, route->multipath_routes) { + r = append_nexthop_one(link, route, m, &rta, offset); + if (r < 0) + return r; + + rtnh = (struct rtnexthop *)((uint8_t *) rta + offset); + offset = (uint8_t *) RTNH_NEXT(rtnh) - (uint8_t *) rta; + } + + r = sd_netlink_message_append_data(req, RTA_MULTIPATH, RTA_DATA(rta), RTA_PAYLOAD(rta)); + if (r < 0) + return r; + + return 0; +} + +int route_configure_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, const char *error_msg) { + int r; + + assert(m); + assert(link); + assert(error_msg); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not set route"); + link_enter_failed(link); + return 0; + } + + return 1; +} + +static int route_configure(const Route *route, uint32_t lifetime_sec, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(route); + assert(IN_SET(route->family, AF_INET, AF_INET6)); + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(link->ifindex > 0); + assert(req); + + log_route_debug(route, "Configuring", link, link->manager); + + r = sd_rtnl_message_new_route(link->manager->rtnl, &m, RTM_NEWROUTE, route->family, route->protocol); + if (r < 0) + return r; + + r = sd_rtnl_message_route_set_type(m, route->type); + if (r < 0) + return r; + + r = route_set_netlink_message(route, m, link); + if (r < 0) + return r; + + if (lifetime_sec != UINT32_MAX) { + r = sd_netlink_message_append_u32(m, RTA_EXPIRES, lifetime_sec); + if (r < 0) + return r; + } + + if (route->ttl_propagate >= 0) { + r = sd_netlink_message_append_u8(m, RTA_TTL_PROPAGATE, route->ttl_propagate); + if (r < 0) + return r; + } + + r = sd_netlink_message_open_container(m, RTA_METRICS); + if (r < 0) + return r; + 
+ if (route->mtu > 0) { + r = sd_netlink_message_append_u32(m, RTAX_MTU, route->mtu); + if (r < 0) + return r; + } + + if (route->initcwnd > 0) { + r = sd_netlink_message_append_u32(m, RTAX_INITCWND, route->initcwnd); + if (r < 0) + return r; + } + + if (route->initrwnd > 0) { + r = sd_netlink_message_append_u32(m, RTAX_INITRWND, route->initrwnd); + if (r < 0) + return r; + } + + if (route->quickack >= 0) { + r = sd_netlink_message_append_u32(m, RTAX_QUICKACK, route->quickack); + if (r < 0) + return r; + } + + if (route->fast_open_no_cookie >= 0) { + r = sd_netlink_message_append_u32(m, RTAX_FASTOPEN_NO_COOKIE, route->fast_open_no_cookie); + if (r < 0) + return r; + } + + if (route->advmss > 0) { + r = sd_netlink_message_append_u32(m, RTAX_ADVMSS, route->advmss); + if (r < 0) + return r; + } + + if (!isempty(route->tcp_congestion_control_algo)) { + r = sd_netlink_message_append_string(m, RTAX_CC_ALGO, route->tcp_congestion_control_algo); + if (r < 0) + return r; + } + + if (route->hop_limit > 0) { + r = sd_netlink_message_append_u32(m, RTAX_HOPLIMIT, route->hop_limit); + if (r < 0) + return r; + } + + if (route->tcp_rto_usec > 0) { + r = sd_netlink_message_append_u32(m, RTAX_RTO_MIN, DIV_ROUND_UP(route->tcp_rto_usec, USEC_PER_MSEC)); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + if (!ordered_set_isempty(route->multipath_routes)) { + assert(route->nexthop_id == 0); + assert(!in_addr_is_set(route->gw_family, &route->gw)); + + r = append_nexthops(link, route, m); + if (r < 0) + return r; + } + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static int route_is_ready_to_configure(const Route *route, Link *link) { + int r; + + assert(route); + assert(link); + + if (!link_is_ready_to_configure(link, false)) + return false; + + if (set_size(link->routes) >= routes_max()) + return false; + + if (route->nexthop_id > 0) { + struct nexthop_grp *nhg; + NextHop *nh; + + if 
(manager_get_nexthop_by_id(link->manager, route->nexthop_id, &nh) < 0) + return false; + + if (!nexthop_exists(nh)) + return false; + + HASHMAP_FOREACH(nhg, nh->group) { + NextHop *g; + + if (manager_get_nexthop_by_id(link->manager, nhg->id, &g) < 0) + return false; + + if (!nexthop_exists(g)) + return false; + } + } + + if (in_addr_is_set(route->family, &route->prefsrc) > 0) { + r = manager_has_address(link->manager, route->family, &route->prefsrc, route->family == AF_INET6); + if (r <= 0) + return r; + } + + if (!gateway_is_ready(link, FLAGS_SET(route->flags, RTNH_F_ONLINK), route->gw_family, &route->gw)) + return false; + + MultipathRoute *m; + ORDERED_SET_FOREACH(m, route->multipath_routes) { + union in_addr_union a = m->gateway.address; + Link *l = NULL; + + r = multipath_route_get_link(link->manager, m, &l); + if (r < 0) + return false; + if (r > 0) { + if (!link_is_ready_to_configure(l, /* allow_unmanaged = */ true) || + !link_has_carrier(l)) + return false; + + m->ifindex = l->ifindex; + } + + if (!gateway_is_ready(l ?: link, FLAGS_SET(route->flags, RTNH_F_ONLINK), m->gateway.family, &a)) + return false; + } + + return true; +} + +static int route_process_request(Request *req, Link *link, Route *route) { + _cleanup_(converted_routes_freep) ConvertedRoutes *converted = NULL; + int r; + + assert(req); + assert(link); + assert(link->manager); + assert(route); + + r = route_is_ready_to_configure(route, link); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to check if route is ready to configure: %m"); + if (r == 0) + return 0; + + if (route_needs_convert(route)) { + r = route_convert(link->manager, route, &converted); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to convert route: %m"); + + assert(r > 0); + assert(converted); + + for (size_t i = 0; i < converted->n; i++) { + Route *existing; + + if (route_get(link->manager, converted->links[i] ?: link, converted->routes[i], &existing) < 0) { + _cleanup_(route_freep) Route *tmp = 
NULL; + + r = route_dup(converted->routes[i], &tmp); + if (r < 0) + return log_oom(); + + r = route_add(link->manager, converted->links[i] ?: link, tmp); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to add route: %m"); + + TAKE_PTR(tmp); + } else { + existing->source = converted->routes[i]->source; + existing->provider = converted->routes[i]->provider; + } + } + } + /* sec is computed from route->lifetime_usec and the current CLOCK_BOOTTIME timestamp of the event loop. When it is zero nothing is configured; the requesting state of the route, or of every converted route, is cancelled instead. */ + usec_t now_usec; + assert_se(sd_event_now(link->manager->event, CLOCK_BOOTTIME, &now_usec) >= 0); + uint32_t sec = usec_to_sec(route->lifetime_usec, now_usec); + if (sec == 0) { + log_link_debug(link, "Refuse to configure %s route with zero lifetime.", + network_config_source_to_string(route->source)); + + if (converted) + for (size_t i = 0; i < converted->n; i++) { + Route *existing; + + assert_se(route_get(link->manager, converted->links[i] ?: link, converted->routes[i], &existing) >= 0); + route_cancel_requesting(existing); + } + else + route_cancel_requesting(route); + + return 1; + } + /* Hand the route to route_configure(); on success the route, or every converted route, enters the configuring state. */ + r = route_configure(route, sec, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure route: %m"); + + if (converted) + for (size_t i = 0; i < converted->n; i++) { + Route *existing; + + assert_se(route_get(link->manager, converted->links[i] ?: link, converted->routes[i], &existing) >= 0); + route_enter_configuring(existing); + } + else + route_enter_configuring(route); + + return 1; +} + /* Queues a request to configure route on link. The route must not need conversion and its source must not be NETWORK_CONFIG_SOURCE_FOREIGN (both asserted). When consume_object is true this function takes over route: it is either stored through route_add() or freed. A route whose lifetime_usec is zero is not requested; the matching entry found by route_get(), if any, is passed to route_remove_and_drop() and that result is returned. Otherwise returns a negative value on failure, the result of link_queue_request_safe() when that is zero or negative, and 1 after the route was marked as requesting. */ +int link_request_route( + Link *link, + Route *route, + bool consume_object, + unsigned *message_counter, + route_netlink_handler_t netlink_handler, + Request **ret) { + + Route *existing = NULL; + int r; + + assert(link); + assert(link->manager); + assert(route); + assert(route->source != NETWORK_CONFIG_SOURCE_FOREIGN); + assert(!route_needs_convert(route)); + + (void) route_get(link->manager, link, route, &existing); + + if (route->lifetime_usec == 0) { + if (consume_object) + route_free(route); + + /* The requested route is outdated. Let's remove it.
*/ + return route_remove_and_drop(existing); + } + /* No matching entry yet: store route itself when it is consumed, a duplicate otherwise. With a matching entry, copy source, provider and lifetime onto it. */ + if (!existing) { + _cleanup_(route_freep) Route *tmp = NULL; + + if (consume_object) + tmp = route; + else { + r = route_dup(route, &tmp); + if (r < 0) + return r; + } + + r = route_add(link->manager, link, tmp); + if (r < 0) + return r; + + existing = TAKE_PTR(tmp); + } else { + existing->source = route->source; + existing->provider = route->provider; + existing->lifetime_usec = route->lifetime_usec; + if (consume_object) + route_free(route); + + if (existing->expire) { + /* When re-configuring an existing route, kernel does not send RTM_NEWROUTE + * message, so we need to update the timer here. */ + r = route_setup_timer(existing, NULL); + if (r < 0) + log_link_warning_errno(link, r, "Failed to update expiration timer for route, ignoring: %m"); + if (r > 0) + log_route_debug(existing, "Updated expiration timer for", link, link->manager); + } + } + + log_route_debug(existing, "Requesting", link, link->manager); + r = link_queue_request_safe(link, REQUEST_TYPE_ROUTE, + existing, NULL, + route_hash_func, + route_compare_func, + route_process_request, + message_counter, netlink_handler, ret); + if (r <= 0) + return r; + + route_enter_requesting(existing); + return 1; +} + /* Reply handler used for static route requests. Returns the result of route_configure_handler_internal() when that is zero or negative. Otherwise, if link->static_route_messages is zero, the static routes are flagged as configured and link_check_ready() is called; returns 1. */ +static int static_route_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, Route *route) { + int r; + + assert(link); + + r = route_configure_handler_internal(rtnl, m, link, "Could not set route"); + if (r <= 0) + return r; + + if (link->static_route_messages == 0) { + log_link_debug(link, "Routes set"); + link->static_routes_configured = true; + link_check_ready(link); + } + + return 1; +} + /* Requests a single static route. A route that needs no conversion goes through link_request_route() without being consumed; a route that needs conversion is queued directly with route itself and is converted later in route_process_request(). */ +static int link_request_static_route(Link *link, Route *route) { + assert(link); + assert(link->manager); + assert(route); + + if (!route_needs_convert(route)) + return link_request_route(link, route, false, &link->static_route_messages, + static_route_handler, NULL); + + log_route_debug(route, "Requesting", link, link->manager); +
return link_queue_request_safe(link, REQUEST_TYPE_ROUTE, + route, NULL, route_hash_func, route_compare_func, + route_process_request, + &link->static_route_messages, static_route_handler, NULL); +} + /* Requests the routes kept in the Wireguard netdev looked up by link->ifname. Returns 0 without doing anything when link->kind is not "wireguard" or netdev_get() fails. With only_ipv4 set, routes whose family is not AF_INET are skipped. Returns a negative value as soon as one request fails. */ +static int link_request_wireguard_routes(Link *link, bool only_ipv4) { + NetDev *netdev; + Route *route; + int r; + + assert(link); + + if (!streq_ptr(link->kind, "wireguard")) + return 0; + + if (netdev_get(link->manager, link->ifname, &netdev) < 0) + return 0; + + Wireguard *w = WIREGUARD(netdev); + + SET_FOREACH(route, w->routes) { + if (only_ipv4 && route->family != AF_INET) + continue; + + r = link_request_static_route(link, route); + if (r < 0) + return r; + } + + return 0; +} + /* Requests every route in link->network->routes_by_section, skipping entries with gateway_from_dhcp_or_ra set, and then the wireguard routes. With only_ipv4 set, only AF_INET routes are requested. If link->static_route_messages is zero afterwards the static routes are flagged as configured and link_check_ready() is called; otherwise the link state is set to LINK_STATE_CONFIGURING. Returns 0 on success and a negative value on failure. */ +int link_request_static_routes(Link *link, bool only_ipv4) { + Route *route; + int r; + + assert(link); + assert(link->network); + + link->static_routes_configured = false; + + HASHMAP_FOREACH(route, link->network->routes_by_section) { + if (route->gateway_from_dhcp_or_ra) + continue; + + if (only_ipv4 && route->family != AF_INET) + continue; + + r = link_request_static_route(link, route); + if (r < 0) + return r; + } + + r = link_request_wireguard_routes(link, only_ipv4); + if (r < 0) + return r; + + if (link->static_route_messages == 0) { + link->static_routes_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Requesting routes"); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + /* Cancels the request for route if route_is_requesting() reports one. route->link takes precedence over the link argument; one of the two must be non-NULL (asserted). A Request built on the stack with the same link, type, userdata, hash and compare functions is handed to request_detach(), then the requesting state of the route is cancelled. */ +void route_cancel_request(Route *route, Link *link) { + Request req; + + assert(route); + + link = route->link ?: link; + + assert(link); + + if (!route_is_requesting(route)) + return; + + req = (Request) { + .link = link, + .type = REQUEST_TYPE_ROUTE, + .userdata = route, + .hash_func = (hash_func_t) route_hash_func, + .compare_func = (compare_func_t) route_compare_func, + }; + + request_detach(link->manager, &req); + route_cancel_requesting(route); +} + /* Applies one route notification of type RTM_NEWROUTE or RTM_DELROUTE to the routes tracked by manager. Takes ownership of in; link may be NULL. Returns 0 when a new foreign route could not be stored with route_add(), and 1 in every other case. */ +static int process_route_one( + Manager *manager, + Link *link, + uint16_t type, + Route *in, + const
struct rta_cacheinfo *cacheinfo) { + + _cleanup_(route_freep) Route *tmp = in; + Route *route = NULL; + bool update_dhcp4; + int r; + + assert(manager); + assert(tmp); + assert(IN_SET(type, RTM_NEWROUTE, RTM_DELROUTE)); + + /* link may be NULL. This consumes 'in'. */ + + update_dhcp4 = link && tmp->family == AF_INET6 && tmp->dst_prefixlen == 0; + + (void) route_get(manager, link, tmp, &route); + + switch (type) { + case RTM_NEWROUTE: + if (route) { + route->flags = tmp->flags; + route_enter_configured(route); + log_route_debug(route, "Received remembered", link, manager); + + r = route_setup_timer(route, cacheinfo); + if (r < 0) + log_link_warning_errno(link, r, "Failed to configure expiration timer for route, ignoring: %m"); + if (r > 0) + log_route_debug(route, "Configured expiration timer for", link, manager); + + } else if (!manager->manage_foreign_routes) { + route_enter_configured(tmp); + log_route_debug(tmp, "Ignoring received", link, manager); + + } else { + /* A route appeared that we did not request */ + route_enter_configured(tmp); + log_route_debug(tmp, "Received new", link, manager); + r = route_add(manager, link, tmp); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to remember foreign route, ignoring: %m"); + return 0; + } + TAKE_PTR(tmp); + } + + break; + + case RTM_DELROUTE: + if (route) { + route_enter_removed(route); + if (route->state == 0) { + log_route_debug(route, "Forgetting", link, manager); + route_free(route); + } else + log_route_debug(route, "Removed", link, manager); + } else + log_route_debug(tmp, + manager->manage_foreign_routes ? 
"Kernel removed unknown" : "Ignoring received", + link, manager); + + break; + + default: + assert_not_reached(); + } + + if (update_dhcp4) { + r = dhcp4_update_ipv6_connectivity(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to notify IPv6 connectivity to DHCPv4 client: %m"); + link_enter_failed(link); + } + } + + return 1; +} + +int manager_rtnl_process_route(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { + _cleanup_(converted_routes_freep) ConvertedRoutes *converted = NULL; + _cleanup_(route_freep) Route *tmp = NULL; + _cleanup_free_ void *rta_multipath = NULL; + struct rta_cacheinfo cacheinfo; + bool has_cacheinfo; + Link *link = NULL; + uint32_t ifindex; + uint16_t type; + size_t rta_len; + int r; + + assert(rtnl); + assert(message); + assert(m); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: failed to receive route message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWROUTE, RTM_DELROUTE)) { + log_warning("rtnl: received unexpected message type %u when processing route, ignoring.", type); + return 0; + } + + r = sd_netlink_message_read_u32(message, RTA_OIF, &ifindex); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get ifindex from route message, ignoring: %m"); + return 0; + } else if (r >= 0) { + if (ifindex <= 0) { + log_warning("rtnl: received route message with invalid ifindex %u, ignoring.", ifindex); + return 0; + } + + r = link_get_by_index(m, ifindex, &link); + if (r < 0) { + /* when enumerating we might be out of sync, but we will + * get the route again, so just ignore it */ + if (!m->enumerating) + log_warning("rtnl: received route message for link (%u) we do not know about, ignoring", ifindex); + return 0; + } + } + + 
r = route_new(&tmp); + if (r < 0) + return log_oom(); + + r = sd_rtnl_message_route_get_family(message, &tmp->family); + if (r < 0) { + log_link_warning(link, "rtnl: received route message without family, ignoring"); + return 0; + } else if (!IN_SET(tmp->family, AF_INET, AF_INET6)) { + log_link_debug(link, "rtnl: received route message with invalid family '%i', ignoring", tmp->family); + return 0; + } + + r = sd_rtnl_message_route_get_protocol(message, &tmp->protocol); + if (r < 0) { + log_warning_errno(r, "rtnl: received route message without route protocol, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_route_get_flags(message, &tmp->flags); + if (r < 0) { + log_warning_errno(r, "rtnl: received route message without route flags, ignoring: %m"); + return 0; + } + + r = netlink_message_read_in_addr_union(message, RTA_DST, tmp->family, &tmp->dst); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message without valid destination, ignoring: %m"); + return 0; + } + + r = netlink_message_read_in_addr_union(message, RTA_GATEWAY, tmp->family, &tmp->gw); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message without valid gateway, ignoring: %m"); + return 0; + } else if (r >= 0) + tmp->gw_family = tmp->family; + else if (tmp->family == AF_INET) { + RouteVia via; + + r = sd_netlink_message_read(message, RTA_VIA, sizeof(via), &via); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message without valid gateway, ignoring: %m"); + return 0; + } else if (r >= 0) { + tmp->gw_family = via.family; + tmp->gw = via.address; + } + } + + r = netlink_message_read_in_addr_union(message, RTA_SRC, tmp->family, &tmp->src); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message without valid source, ignoring: %m"); + return 0; + } + + r = netlink_message_read_in_addr_union(message, RTA_PREFSRC, tmp->family, &tmp->prefsrc); 
+ if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message without valid preferred source, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_route_get_dst_prefixlen(message, &tmp->dst_prefixlen); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid destination prefixlen, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_route_get_src_prefixlen(message, &tmp->src_prefixlen); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid source prefixlen, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_route_get_scope(message, &tmp->scope); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid scope, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_route_get_tos(message, &tmp->tos); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid tos, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_route_get_type(message, &tmp->type); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid type, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, RTA_TABLE, &tmp->table); + if (r == -ENODATA) { + unsigned char table; + + r = sd_rtnl_message_route_get_table(message, &table); + if (r >= 0) + tmp->table = table; + } + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid table, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, RTA_PRIORITY, &tmp->priority); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid priority, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, RTA_NH_ID, &tmp->nexthop_id); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid nexthop id, ignoring: %m"); + return 0; + } + + r = 
sd_netlink_message_enter_container(message, RTA_METRICS); + if (r < 0 && r != -ENODATA) { + log_link_error_errno(link, r, "rtnl: Could not enter RTA_METRICS container, ignoring: %m"); + return 0; + } + if (r >= 0) { + r = sd_netlink_message_read_u32(message, RTAX_INITCWND, &tmp->initcwnd); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid initcwnd, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, RTAX_INITRWND, &tmp->initrwnd); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid initrwnd, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, RTAX_ADVMSS, &tmp->advmss); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: received route message with invalid advmss, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_exit_container(message); + if (r < 0) { + log_link_error_errno(link, r, "rtnl: Could not exit from RTA_METRICS container, ignoring: %m"); + return 0; + } + } + + r = sd_netlink_message_read_data(message, RTA_MULTIPATH, &rta_len, &rta_multipath); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: failed to read RTA_MULTIPATH attribute, ignoring: %m"); + return 0; + } else if (r >= 0) { + r = rtattr_read_nexthop(rta_multipath, rta_len, tmp->family, &tmp->multipath_routes); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: failed to parse RTA_MULTIPATH attribute, ignoring: %m"); + return 0; + } + } + + r = sd_netlink_message_read(message, RTA_CACHEINFO, sizeof(cacheinfo), &cacheinfo); + if (r < 0 && r != -ENODATA) { + log_link_warning_errno(link, r, "rtnl: failed to read RTA_CACHEINFO attribute, ignoring: %m"); + return 0; + } + has_cacheinfo = r >= 0; + + /* IPv6 routes with reject type are always assigned to the loopback interface. See kernel's + * fib6_nh_init() in net/ipv6/route.c. However, we'd like to manage them by Manager. 
Hence, set + * link to NULL here. */ + if (route_type_is_reject(tmp)) + link = NULL; + + if (!route_needs_convert(tmp)) + return process_route_one(m, link, type, TAKE_PTR(tmp), has_cacheinfo ? &cacheinfo : NULL); + + r = route_convert(m, tmp, &converted); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: failed to convert received route, ignoring: %m"); + return 0; + } + + assert(r > 0); + assert(converted); + + for (size_t i = 0; i < converted->n; i++) + (void) process_route_one(m, + converted->links[i] ?: link, + type, + TAKE_PTR(converted->routes[i]), + has_cacheinfo ? &cacheinfo : NULL); + + return 1; +} + +int network_add_ipv4ll_route(Network *network) { + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + unsigned section_line; + int r; + + assert(network); + + if (!network->ipv4ll_route) + return 0; + + r = hashmap_by_section_find_unused_line(network->routes_by_section, network->filename, §ion_line); + if (r < 0) + return r; + + /* IPv4LLRoute= is in [Network] section. */ + r = route_new_static(network, network->filename, section_line, &n); + if (r < 0) + return r; + + r = in_addr_from_string(AF_INET, "169.254.0.0", &n->dst); + if (r < 0) + return r; + + n->family = AF_INET; + n->dst_prefixlen = 16; + n->scope = RT_SCOPE_LINK; + n->scope_set = true; + n->table_set = true; + n->priority = IPV4LL_ROUTE_METRIC; + n->protocol = RTPROT_STATIC; + + TAKE_PTR(n); + return 0; +} + +int network_add_default_route_on_device(Network *network) { + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + unsigned section_line; + int r; + + assert(network); + + if (!network->default_route_on_device) + return 0; + + r = hashmap_by_section_find_unused_line(network->routes_by_section, network->filename, §ion_line); + if (r < 0) + return r; + + /* DefaultRouteOnDevice= is in [Network] section. 
*/ + r = route_new_static(network, network->filename, section_line, &n); + if (r < 0) + return r; + + n->family = AF_INET; + n->scope = RT_SCOPE_LINK; + n->scope_set = true; + n->protocol = RTPROT_STATIC; + + TAKE_PTR(n); + return 0; +} + +int config_parse_gateway( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (streq(section, "Network")) { + /* we are not in an Route section, so use line number instead */ + r = route_new_static(network, filename, line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + } else { + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + n->gateway_from_dhcp_or_ra = false; + n->gw_family = AF_UNSPEC; + n->gw = IN_ADDR_NULL; + TAKE_PTR(n); + return 0; + } + + if (streq(rvalue, "_dhcp")) { + n->gateway_from_dhcp_or_ra = true; + TAKE_PTR(n); + return 0; + } + + if (streq(rvalue, "_dhcp4")) { + n->gw_family = AF_INET; + n->gateway_from_dhcp_or_ra = true; + TAKE_PTR(n); + return 0; + } + + if (streq(rvalue, "_ipv6ra")) { + n->gw_family = AF_INET6; + n->gateway_from_dhcp_or_ra = true; + TAKE_PTR(n); + return 0; + } + } + + r = in_addr_from_string_auto(rvalue, &n->gw_family, &n->gw); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid %s='%s', ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + n->gateway_from_dhcp_or_ra = 
false; + TAKE_PTR(n); + return 0; +} + +int config_parse_preferred_src( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + if (n->family == AF_UNSPEC) + r = in_addr_from_string_auto(rvalue, &n->family, &n->prefsrc); + else + r = in_addr_from_string(n->family, rvalue, &n->prefsrc); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, EINVAL, + "Invalid %s='%s', ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_destination( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + union in_addr_union *buffer; + unsigned char *prefixlen; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + if (streq(lvalue, "Destination")) { + buffer = &n->dst; + prefixlen = &n->dst_prefixlen; + } else if (streq(lvalue, "Source")) { + buffer = &n->src; + prefixlen = &n->src_prefixlen; + } else + assert_not_reached(); + + 
if (n->family == AF_UNSPEC) + r = in_addr_prefix_from_string_auto(rvalue, &n->family, buffer, prefixlen); + else + r = in_addr_prefix_from_string(rvalue, n->family, buffer, prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, EINVAL, + "Invalid %s='%s', ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + (void) in_addr_mask(n->family, buffer, *prefixlen); + + TAKE_PTR(n); + return 0; +} + +int config_parse_route_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + r = safe_atou32(rvalue, &n->priority); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse route priority \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + n->priority_set = true; + TAKE_PTR(n); + return 0; +} + +int config_parse_route_scope( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: 
%m"); + return 0; + } + + r = route_scope_from_string(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Unknown route scope: %s", rvalue); + return 0; + } + + n->scope = r; + n->scope_set = true; + TAKE_PTR(n); + return 0; +} + +int config_parse_route_nexthop( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + uint32_t id; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + n->nexthop_id = 0; + TAKE_PTR(n); + return 0; + } + + r = safe_atou32(rvalue, &id); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse nexthop ID, ignoring assignment: %s", rvalue); + return 0; + } + if (id == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid nexthop ID, ignoring assignment: %s", rvalue); + return 0; + } + + n->nexthop_id = id; + TAKE_PTR(n); + return 0; +} + +int config_parse_route_table( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to 
allocate route, ignoring assignment: %m"); + return 0; + } + + r = manager_get_route_table_from_string(network->manager, rvalue, &n->table); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse route table \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + n->table_set = true; + TAKE_PTR(n); + return 0; +} + +int config_parse_route_boolean( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse %s=\"%s\", ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + if (STR_IN_SET(lvalue, "GatewayOnLink", "GatewayOnlink")) + n->gateway_onlink = r; + else if (streq(lvalue, "QuickAck")) + n->quickack = r; + else if (streq(lvalue, "FastOpenNoCookie")) + n->fast_open_no_cookie = r; + else if (streq(lvalue, "TTLPropagate")) + n->ttl_propagate = r; + else + assert_not_reached(); + + TAKE_PTR(n); + return 0; +} + +int config_parse_ipv6_route_preference( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + int r; + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); 
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        if (streq(rvalue, "low"))
+                n->pref = ICMPV6_ROUTER_PREF_LOW;
+        else if (streq(rvalue, "medium"))
+                n->pref = ICMPV6_ROUTER_PREF_MEDIUM;
+        else if (streq(rvalue, "high"))
+                n->pref = ICMPV6_ROUTER_PREF_HIGH;
+        else {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown route preference: %s", rvalue);
+                return 0;
+        }
+
+        n->pref_set = true;
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser: translates rvalue with route_protocol_from_string() and stores the result in the
+ * protocol field of the Route that route_new_static() returns for (filename, section_line).
+ *
+ * An unparsable value is logged and ignored (returns 0); only an out-of-memory condition is returned as
+ * an error, via log_oom(). */
+int config_parse_route_protocol(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Network *network = userdata;
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        int r;
+
+        r = route_new_static(network, filename, section_line, &n);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        r = route_protocol_from_string(rvalue);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse route protocol \"%s\", ignoring assignment: %m", rvalue);
+                return 0;
+        }
+
+        n->protocol = r;
+
+        /* Success: disarm the cleanup handler so the route is kept. */
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser: translates rvalue with route_type_from_string() and stores the result in the type field
+ * of the Route that route_new_static() returns for (filename, section_line).
+ *
+ * An unparsable value is logged and ignored (returns 0); only an out-of-memory condition is returned as
+ * an error, via log_oom(). */
+int config_parse_route_type(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Network *network = userdata;
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        int t, r;
+
+        r = route_new_static(network, filename, section_line, &n);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        t = route_type_from_string(rvalue);
+        if (t < 0) {
+                /* Report the parser's own error code: r is non-negative at this point, so passing it
+                 * would make %m describe the wrong (non-)error. */
+                log_syntax(unit, LOG_WARNING, filename, line, t,
+                           "Could not parse route type \"%s\", ignoring assignment: %m", rvalue);
+                return 0;
+        }
+
+        n->type = (unsigned char) t;
+
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser for the per-route hop limit. An empty rvalue resets hop_limit to 0; otherwise the value
+ * must be an unsigned integer in the range 1...255.
+ *
+ * Invalid values are logged and ignored (returns 0); only an out-of-memory condition is returned as an
+ * error, via log_oom(). */
+int config_parse_route_hop_limit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        Network *network = userdata;
+        uint32_t k;
+        int r;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        r = route_new_static(network, filename, section_line, &n);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        if (isempty(rvalue)) {
+                n->hop_limit = 0;
+                TAKE_PTR(n);
+                return 0;
+        }
+
+        r = safe_atou32(rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Could not parse per route hop limit, ignoring assignment: %s", rvalue);
+                return 0;
+        }
+        /* The range checks below have no errno to report, so the messages must not use %m. */
+        if (k > 255) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Specified per route hop limit \"%s\" is too large, ignoring assignment.", rvalue);
+                return 0;
+        }
+        if (k == 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid per route hop limit \"%s\", ignoring assignment.", rvalue);
+                return 0;
+        }
+
+        n->hop_limit = k;
+
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser: hands rvalue to config_parse_string() to fill the route's tcp_congestion_control_algo
+ * field. */
+int config_parse_tcp_congestion(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Network *network = userdata;
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        int r;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        r =
route_new_static(network, filename, section_line, &n);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        r = config_parse_string(unit, filename, line, section, section_line, lvalue, ltype,
+                                rvalue, &n->tcp_congestion_control_algo, userdata);
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser for TCPAdvertisedMaximumSegmentSize=. An empty rvalue resets advmss to 0; otherwise the
+ * value is parsed with parse_size() (base 1024) and must lie in the range 1...UINT32_MAX.
+ *
+ * Invalid values are logged and ignored (returns 0); only an out-of-memory condition is returned as an
+ * error, via log_oom(). */
+int config_parse_tcp_advmss(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        Network *network = userdata;
+        uint64_t u;
+        int r;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        r = route_new_static(network, filename, section_line, &n);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        if (isempty(rvalue)) {
+                n->advmss = 0;
+                TAKE_PTR(n);
+                return 0;
+        }
+
+        r = parse_size(rvalue, 1024, &u);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Could not parse TCPAdvertisedMaximumSegmentSize= \"%s\", ignoring assignment: %m", rvalue);
+                return 0;
+        }
+
+        if (u == 0 || u > UINT32_MAX) {
+                /* A range violation carries no errno, so the message must not use %m. */
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid TCPAdvertisedMaximumSegmentSize= \"%s\", ignoring assignment.", rvalue);
+                return 0;
+        }
+
+        n->advmss = u;
+
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser for a TCP window setting: parses rvalue as an unsigned integer and stores it in the
+ * uint32_t that data points to. Accepted range is 1...1023.
+ *
+ * Invalid values are logged and ignored; the function always returns 0 and leaves *data untouched
+ * unless the value was accepted. */
+int config_parse_tcp_window(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint32_t *window = ASSERT_PTR(data);
+        uint32_t k;
+        int r;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        r = safe_atou32(rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Could not parse TCP %s \"%s\", ignoring assignment: %m", lvalue, rvalue);
+                return 0;
+        }
+        /* The range checks below have no errno to report, so the messages must not use %m. */
+        if (k >= 1024) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Specified TCP %s \"%s\" is too large, ignoring assignment.", lvalue, rvalue);
+                return 0;
+        }
+        if (k == 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid TCP %s \"%s\", ignoring assignment.", lvalue, rvalue);
+                return 0;
+        }
+
+        *window = k;
+        return 0;
+}
+
+/* Config parser for the per-route InitialCongestionWindow= and InitialAdvertisedReceiveWindow= settings.
+ * Selects the matching field (initcwnd or initrwnd) of the Route that route_new_static() returns for
+ * (filename, section_line) and lets config_parse_tcp_window() validate and store the value.
+ *
+ * Any other lvalue is a programming error and trips assert_not_reached(). */
+int config_parse_route_tcp_window(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        Network *network = userdata;
+        uint32_t *d;
+        int r;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        r = route_new_static(network, filename, section_line, &n);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to allocate route, ignoring assignment: %m");
+                return 0;
+        }
+
+        if (streq(lvalue, "InitialCongestionWindow"))
+                d = &n->initcwnd;
+        else if (streq(lvalue, "InitialAdvertisedReceiveWindow"))
+                d = &n->initrwnd;
+        else
+                assert_not_reached();
+
+        r = config_parse_tcp_window(unit, filename, line, section, section_line, lvalue, ltype, rvalue, d, userdata);
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(n);
+        return 0;
+}
+
+/* Config parser: hands rvalue to config_parse_mtu() to fill the route's mtu field. */
+int config_parse_route_mtu(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Network *network = userdata;
+        _cleanup_(route_free_or_set_invalidp) Route *n = NULL;
+        int r;
+
+        assert(filename);
+        assert(section);
+
assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + r = config_parse_mtu(unit, filename, line, section, section_line, lvalue, ltype, rvalue, &n->mtu, userdata); + if (r < 0) + return r; + + TAKE_PTR(n); + return 0; +} + +int config_parse_route_tcp_rto( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Network *network = userdata; + _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + usec_t usec; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse route TCP retransmission timeout (RTO), ignoring assignment: %s", rvalue); + return 0; + } + + if (IN_SET(usec, 0, USEC_INFINITY) || + DIV_ROUND_UP(usec, USEC_PER_MSEC) > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Route TCP retransmission timeout (RTO) must be in the range 0…%"PRIu32"ms, ignoring assignment: %s", UINT32_MAX, rvalue); + return 0; + } + + n->tcp_rto_usec = usec; + + TAKE_PTR(n); + return 0; +} + +int config_parse_multipath_route( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(multipath_route_freep) MultipathRoute *m = NULL; 
+ _cleanup_(route_free_or_set_invalidp) Route *n = NULL; + _cleanup_free_ char *word = NULL; + Network *network = userdata; + union in_addr_union a; + int family, r; + const char *p; + char *dev; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = route_new_static(network, filename, section_line, &n); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to allocate route, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + n->multipath_routes = ordered_set_free_with_destructor(n->multipath_routes, multipath_route_free); + TAKE_PTR(n); + return 0; + } + + m = new0(MultipathRoute, 1); + if (!m) + return log_oom(); + + p = rvalue; + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid multipath route option, ignoring assignment: %s", rvalue); + return 0; + } + + dev = strchr(word, '@'); + if (dev) { + *dev++ = '\0'; + + r = parse_ifindex(dev); + if (r > 0) + m->ifindex = r; + else { + if (!ifname_valid_full(dev, IFNAME_VALID_ALTERNATIVE)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid interface name '%s' in %s=, ignoring: %s", dev, lvalue, rvalue); + return 0; + } + + m->ifname = strdup(dev); + if (!m->ifname) + return log_oom(); + } + } + + r = in_addr_from_string_auto(word, &family, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid multipath route gateway '%s', ignoring assignment: %m", rvalue); + return 0; + } + m->gateway.address = a; + m->gateway.family = family; + + if (!isempty(p)) { + r = safe_atou32(p, &m->weight); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid multipath route weight, ignoring assignment: %s", p); + return 0; + } + /* ip command takes weight in the range 1…255, while kernel takes the value in the + * range 0…254. 
MultiPathRoute= setting also takes weight in the same range which ip + * command uses, then networkd decreases by one and stores it to match the range which + * kernel uses. */ + if (m->weight == 0 || m->weight > 256) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid multipath route weight, ignoring assignment: %s", p); + return 0; + } + m->weight--; + } + + r = ordered_set_ensure_put(&n->multipath_routes, NULL, m); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store multipath route, ignoring assignment: %m"); + return 0; + } + + TAKE_PTR(m); + TAKE_PTR(n); + return 0; +} + +static int route_section_verify(Route *route, Network *network) { + if (section_is_invalid(route->section)) + return -EINVAL; + + /* Currently, we do not support static route with finite lifetime. */ + assert(route->lifetime_usec == USEC_INFINITY); + + if (route->gateway_from_dhcp_or_ra) { + if (route->gw_family == AF_UNSPEC) { + /* When deprecated Gateway=_dhcp is set, then assume gateway family based on other settings. */ + switch (route->family) { + case AF_UNSPEC: + log_warning("%s: Deprecated value \"_dhcp\" is specified for Gateway= in [Route] section from line %u. " + "Please use \"_dhcp4\" or \"_ipv6ra\" instead. Assuming \"_dhcp4\".", + route->section->filename, route->section->line); + route->family = AF_INET; + break; + case AF_INET: + case AF_INET6: + log_warning("%s: Deprecated value \"_dhcp\" is specified for Gateway= in [Route] section from line %u. " + "Assuming \"%s\" based on Destination=, Source=, or PreferredSource= setting.", + route->section->filename, route->section->line, route->family == AF_INET ? "_dhcp4" : "_ipv6ra"); + break; + default: + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Invalid route family. 
Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + } + route->gw_family = route->family; + } + + if (route->gw_family == AF_INET && !FLAGS_SET(network->dhcp, ADDRESS_FAMILY_IPV4)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Gateway=\"_dhcp4\" is specified but DHCPv4 client is disabled. " + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + + if (route->gw_family == AF_INET6 && !network->ipv6_accept_ra) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Gateway=\"_ipv6ra\" is specified but IPv6AcceptRA= is disabled. " + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + } + + /* When only Gateway= is specified, assume the route family based on the Gateway address. */ + if (route->family == AF_UNSPEC) + route->family = route->gw_family; + + if (route->family == AF_UNSPEC) { + assert(route->section); + + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Route section without Gateway=, Destination=, Source=, " + "or PreferredSource= field configured. " + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + } + + if (route->family == AF_INET6 && route->gw_family == AF_INET) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: IPv4 gateway is configured for IPv6 route. 
" + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + + if (!route->table_set && network->vrf) { + route->table = VRF(network->vrf)->table; + route->table_set = true; + } + + if (!route->table_set && IN_SET(route->type, RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, RTN_NAT)) + route->table = RT_TABLE_LOCAL; + + if (!route->scope_set && route->family != AF_INET6) { + if (IN_SET(route->type, RTN_LOCAL, RTN_NAT)) + route->scope = RT_SCOPE_HOST; + else if (IN_SET(route->type, RTN_BROADCAST, RTN_ANYCAST, RTN_MULTICAST)) + route->scope = RT_SCOPE_LINK; + else if (IN_SET(route->type, RTN_UNICAST, RTN_UNSPEC) && + !route->gateway_from_dhcp_or_ra && + !in_addr_is_set(route->gw_family, &route->gw) && + ordered_set_isempty(route->multipath_routes) && + route->nexthop_id == 0) + route->scope = RT_SCOPE_LINK; + } + + if (route->scope != RT_SCOPE_UNIVERSE && route->family == AF_INET6) { + log_warning("%s: Scope= is specified for IPv6 route. It will be ignored.", route->section->filename); + route->scope = RT_SCOPE_UNIVERSE; + } + + if (route->family == AF_INET6 && route->priority == 0) + route->priority = IP6_RT_PRIO_USER; + + if (route->gateway_onlink < 0 && in_addr_is_set(route->gw_family, &route->gw) && + ordered_hashmap_isempty(network->addresses_by_section)) { + /* If no address is configured, in most cases the gateway cannot be reachable. + * TODO: we may need to improve the condition above. */ + log_warning("%s: Gateway= without static address configured. " + "Enabling GatewayOnLink= option.", + network->filename); + route->gateway_onlink = true; + } + + if (route->gateway_onlink >= 0) + SET_FLAG(route->flags, RTNH_F_ONLINK, route->gateway_onlink); + + if (route->family == AF_INET6) { + MultipathRoute *m; + + ORDERED_SET_FOREACH(m, route->multipath_routes) + if (m->gateway.family == AF_INET) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: IPv4 multipath route is specified for IPv6 route. 
" + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + } + + if ((route->gateway_from_dhcp_or_ra || + in_addr_is_set(route->gw_family, &route->gw)) && + !ordered_set_isempty(route->multipath_routes)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Gateway= cannot be specified with MultiPathRoute=. " + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + + if (route->nexthop_id > 0 && + (route->gateway_from_dhcp_or_ra || + in_addr_is_set(route->gw_family, &route->gw) || + !ordered_set_isempty(route->multipath_routes))) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: NextHopId= cannot be specified with Gateway= or MultiPathRoute=. " + "Ignoring [Route] section from line %u.", + route->section->filename, route->section->line); + + return 0; +} + +void network_drop_invalid_routes(Network *network) { + Route *route; + + assert(network); + + HASHMAP_FOREACH(route, network->routes_by_section) + if (route_section_verify(route, network) < 0) + route_free(route); +} diff --git a/src/network/networkd-route.h b/src/network/networkd-route.h new file mode 100644 index 0000000..3d85889 --- /dev/null +++ b/src/network/networkd-route.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "networkd-link.h" +#include "networkd-util.h" + +typedef struct Manager Manager; +typedef struct Network Network; +typedef struct Request Request; +typedef struct Route Route; +typedef int (*route_netlink_handler_t)( + sd_netlink *rtnl, + sd_netlink_message *m, + Request *req, + Link *link, + Route *route); + +struct Route { + Link *link; + Manager *manager; + Network *network; + ConfigSection *section; + NetworkConfigSource source; + NetworkConfigState state; + union in_addr_union provider; /* DHCP server or router address */ + + int family; + int 
gw_family; + uint32_t gw_weight; + int quickack; + int fast_open_no_cookie; + int ttl_propagate; + + unsigned char dst_prefixlen; + unsigned char src_prefixlen; + unsigned char scope; + unsigned char protocol; /* RTPROT_* */ + unsigned char type; /* RTN_* */ + unsigned char tos; + uint32_t priority; /* note that ip(8) calls this 'metric' */ + uint32_t table; + uint32_t mtu; + uint32_t initcwnd; + uint32_t initrwnd; + uint32_t advmss; + uint32_t hop_limit; + char *tcp_congestion_control_algo; + unsigned char pref; + unsigned flags; + int gateway_onlink; /* Only used in conf parser and route_section_verify(). */ + uint32_t nexthop_id; + usec_t tcp_rto_usec; + + bool scope_set:1; + bool table_set:1; + bool priority_set:1; + bool protocol_set:1; + bool pref_set:1; + bool gateway_from_dhcp_or_ra:1; + + union in_addr_union gw; + union in_addr_union dst; + union in_addr_union src; + union in_addr_union prefsrc; + OrderedSet *multipath_routes; + + /* This is an absolute point in time, and NOT a timespan/duration. + * Must be specified with clock_boottime_or_monotonic(). */ + usec_t lifetime_usec; + /* Used when kernel does not support RTA_EXPIRES attribute. 
*/ + sd_event_source *expire; +}; + +extern const struct hash_ops route_hash_ops; + +int route_new(Route **ret); +Route *route_free(Route *route); +DEFINE_SECTION_CLEANUP_FUNCTIONS(Route, route_free); +int route_dup(const Route *src, Route **ret); + +int route_configure_handler_internal(sd_netlink *rtnl, sd_netlink_message *m, Link *link, const char *error_msg); +int route_remove(Route *route); +int route_remove_and_drop(Route *route); + +int route_get(Manager *manager, Link *link, const Route *in, Route **ret); + +int link_drop_managed_routes(Link *link); +int link_drop_foreign_routes(Link *link); +void link_foreignize_routes(Link *link); + +void route_cancel_request(Route *route, Link *link); +int link_request_route( + Link *link, + Route *route, + bool consume_object, + unsigned *message_counter, + route_netlink_handler_t netlink_handler, + Request **ret); +int link_request_static_routes(Link *link, bool only_ipv4); + +int manager_rtnl_process_route(sd_netlink *rtnl, sd_netlink_message *message, Manager *m); + +int network_add_ipv4ll_route(Network *network); +int network_add_default_route_on_device(Network *network); +void network_drop_invalid_routes(Network *network); + +DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(Route, route); +void link_mark_routes(Link *link, NetworkConfigSource source); + +CONFIG_PARSER_PROTOTYPE(config_parse_gateway); +CONFIG_PARSER_PROTOTYPE(config_parse_preferred_src); +CONFIG_PARSER_PROTOTYPE(config_parse_destination); +CONFIG_PARSER_PROTOTYPE(config_parse_route_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_route_scope); +CONFIG_PARSER_PROTOTYPE(config_parse_route_table); +CONFIG_PARSER_PROTOTYPE(config_parse_route_boolean); +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_route_preference); +CONFIG_PARSER_PROTOTYPE(config_parse_route_protocol); +CONFIG_PARSER_PROTOTYPE(config_parse_route_type); +CONFIG_PARSER_PROTOTYPE(config_parse_route_tcp_window); +CONFIG_PARSER_PROTOTYPE(config_parse_route_hop_limit); 
+CONFIG_PARSER_PROTOTYPE(config_parse_tcp_window); +CONFIG_PARSER_PROTOTYPE(config_parse_route_tcp_rto); +CONFIG_PARSER_PROTOTYPE(config_parse_route_mtu); +CONFIG_PARSER_PROTOTYPE(config_parse_multipath_route); +CONFIG_PARSER_PROTOTYPE(config_parse_tcp_congestion); +CONFIG_PARSER_PROTOTYPE(config_parse_tcp_advmss); +CONFIG_PARSER_PROTOTYPE(config_parse_route_nexthop); diff --git a/src/network/networkd-routing-policy-rule.c b/src/network/networkd-routing-policy-rule.c new file mode 100644 index 0000000..0cb5831 --- /dev/null +++ b/src/network/networkd-routing-policy-rule.c @@ -0,0 +1,1754 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "conf-parser.h" +#include "fileio.h" +#include "format-util.h" +#include "hashmap.h" +#include "ip-protocol-list.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "networkd-route-util.h" +#include "networkd-routing-policy-rule.h" +#include "networkd-util.h" +#include "parse-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static const char *const fr_act_type_table[__FR_ACT_MAX] = { + [FR_ACT_BLACKHOLE] = "blackhole", + [FR_ACT_UNREACHABLE] = "unreachable", + [FR_ACT_PROHIBIT] = "prohibit", +}; + +static const char *const fr_act_type_full_table[__FR_ACT_MAX] = { + [FR_ACT_TO_TBL] = "table", + [FR_ACT_GOTO] = "goto", + [FR_ACT_NOP] = "nop", + [FR_ACT_BLACKHOLE] = "blackhole", + [FR_ACT_UNREACHABLE] = "unreachable", + [FR_ACT_PROHIBIT] = "prohibit", +}; + +assert_cc(__FR_ACT_MAX <= UINT8_MAX); +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(fr_act_type, int); +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(fr_act_type_full, int); + +RoutingPolicyRule *routing_policy_rule_free(RoutingPolicyRule *rule) { + if (!rule) + return NULL; + + if (rule->network) { + assert(rule->section); + hashmap_remove(rule->network->rules_by_section, 
rule->section); + } + + if (rule->manager) + set_remove(rule->manager->rules, rule); + + config_section_free(rule->section); + free(rule->iif); + free(rule->oif); + + return mfree(rule); +} + +DEFINE_SECTION_CLEANUP_FUNCTIONS(RoutingPolicyRule, routing_policy_rule_free); + +static int routing_policy_rule_new(RoutingPolicyRule **ret) { + RoutingPolicyRule *rule; + + rule = new(RoutingPolicyRule, 1); + if (!rule) + return -ENOMEM; + + *rule = (RoutingPolicyRule) { + .table = RT_TABLE_MAIN, + .uid_range.start = UID_INVALID, + .uid_range.end = UID_INVALID, + .suppress_prefixlen = -1, + .suppress_ifgroup = -1, + .protocol = RTPROT_UNSPEC, + .type = FR_ACT_TO_TBL, + }; + + *ret = rule; + return 0; +} + +static int routing_policy_rule_new_static(Network *network, const char *filename, unsigned section_line, RoutingPolicyRule **ret) { + _cleanup_(routing_policy_rule_freep) RoutingPolicyRule *rule = NULL; + _cleanup_(config_section_freep) ConfigSection *n = NULL; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + rule = hashmap_get(network->rules_by_section, n); + if (rule) { + *ret = TAKE_PTR(rule); + return 0; + } + + r = routing_policy_rule_new(&rule); + if (r < 0) + return r; + + rule->network = network; + rule->section = TAKE_PTR(n); + rule->source = NETWORK_CONFIG_SOURCE_STATIC; + rule->protocol = RTPROT_STATIC; + + r = hashmap_ensure_put(&network->rules_by_section, &config_section_hash_ops, rule->section, rule); + if (r < 0) + return r; + + *ret = TAKE_PTR(rule); + return 0; +} + +static int routing_policy_rule_dup(const RoutingPolicyRule *src, RoutingPolicyRule **ret) { + _cleanup_(routing_policy_rule_freep) RoutingPolicyRule *dest = NULL; + + assert(src); + assert(ret); + + dest = newdup(RoutingPolicyRule, src, 1); + if (!dest) + return -ENOMEM; + + /* Unset all pointers */ + dest->manager = NULL; + dest->network = NULL; + dest->section = 
NULL; + dest->iif = dest->oif = NULL; + + if (src->iif) { + dest->iif = strdup(src->iif); + if (!dest->iif) + return -ENOMEM; + } + + if (src->oif) { + dest->oif = strdup(src->oif); + if (!dest->oif) + return -ENOMEM; + } + + *ret = TAKE_PTR(dest); + return 0; +} + +static void routing_policy_rule_hash_func(const RoutingPolicyRule *rule, struct siphash *state) { + assert(rule); + + siphash24_compress(&rule->family, sizeof(rule->family), state); + + switch (rule->family) { + case AF_INET: + case AF_INET6: + siphash24_compress(&rule->from, FAMILY_ADDRESS_SIZE(rule->family), state); + siphash24_compress(&rule->from_prefixlen, sizeof(rule->from_prefixlen), state); + + siphash24_compress(&rule->to, FAMILY_ADDRESS_SIZE(rule->family), state); + siphash24_compress(&rule->to_prefixlen, sizeof(rule->to_prefixlen), state); + + siphash24_compress_boolean(rule->invert_rule, state); + + siphash24_compress(&rule->tos, sizeof(rule->tos), state); + siphash24_compress(&rule->type, sizeof(rule->type), state); + siphash24_compress(&rule->fwmark, sizeof(rule->fwmark), state); + siphash24_compress(&rule->fwmask, sizeof(rule->fwmask), state); + siphash24_compress(&rule->priority, sizeof(rule->priority), state); + siphash24_compress(&rule->table, sizeof(rule->table), state); + siphash24_compress(&rule->suppress_prefixlen, sizeof(rule->suppress_prefixlen), state); + siphash24_compress(&rule->suppress_ifgroup, sizeof(rule->suppress_ifgroup), state); + + siphash24_compress(&rule->ipproto, sizeof(rule->ipproto), state); + siphash24_compress(&rule->protocol, sizeof(rule->protocol), state); + siphash24_compress(&rule->sport, sizeof(rule->sport), state); + siphash24_compress(&rule->dport, sizeof(rule->dport), state); + siphash24_compress(&rule->uid_range, sizeof(rule->uid_range), state); + + siphash24_compress_string(rule->iif, state); + siphash24_compress_string(rule->oif, state); + + break; + default: + /* treat any other address family as AF_UNSPEC */ + break; + } +} + +static int 
routing_policy_rule_compare_func(const RoutingPolicyRule *a, const RoutingPolicyRule *b) { + int r; + + r = CMP(a->family, b->family); + if (r != 0) + return r; + + switch (a->family) { + case AF_INET: + case AF_INET6: + r = CMP(a->from_prefixlen, b->from_prefixlen); + if (r != 0) + return r; + + r = memcmp(&a->from, &b->from, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + r = CMP(a->to_prefixlen, b->to_prefixlen); + if (r != 0) + return r; + + r = memcmp(&a->to, &b->to, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + r = CMP(a->invert_rule, b->invert_rule); + if (r != 0) + return r; + + r = CMP(a->tos, b->tos); + if (r != 0) + return r; + + r = CMP(a->type, b->type); + if (r != 0) + return r; + + r = CMP(a->fwmark, b->fwmark); + if (r != 0) + return r; + + r = CMP(a->fwmask, b->fwmask); + if (r != 0) + return r; + + r = CMP(a->priority, b->priority); + if (r != 0) + return r; + + r = CMP(a->table, b->table); + if (r != 0) + return r; + + r = CMP(a->suppress_prefixlen, b->suppress_prefixlen); + if (r != 0) + return r; + + r = CMP(a->suppress_ifgroup, b->suppress_ifgroup); + if (r != 0) + return r; + + r = CMP(a->ipproto, b->ipproto); + if (r != 0) + return r; + + r = CMP(a->protocol, b->protocol); + if (r != 0) + return r; + + r = memcmp(&a->sport, &b->sport, sizeof(a->sport)); + if (r != 0) + return r; + + r = memcmp(&a->dport, &b->dport, sizeof(a->dport)); + if (r != 0) + return r; + + r = memcmp(&a->uid_range, &b->uid_range, sizeof(a->uid_range)); + if (r != 0) + return r; + + r = strcmp_ptr(a->iif, b->iif); + if (r != 0) + return r; + + r = strcmp_ptr(a->oif, b->oif); + if (r != 0) + return r; + + return 0; + default: + /* treat any other address family as AF_UNSPEC */ + return 0; + } +} + +static bool routing_policy_rule_equal(const RoutingPolicyRule *rule1, const RoutingPolicyRule *rule2) { + if (rule1 == rule2) + return true; + + if (!rule1 || !rule2) + return false; + + return routing_policy_rule_compare_func(rule1, rule2) == 
0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + routing_policy_rule_hash_ops, + RoutingPolicyRule, + routing_policy_rule_hash_func, + routing_policy_rule_compare_func, + routing_policy_rule_free); + +static int routing_policy_rule_get(Manager *m, const RoutingPolicyRule *in, RoutingPolicyRule **ret) { + RoutingPolicyRule *rule; + + assert(m); + assert(in); + + rule = set_get(m->rules, in); + if (rule) { + if (ret) + *ret = rule; + return 0; + } + + if (in->priority_set) + return -ENOENT; + + /* Also find rules configured without priority. */ + SET_FOREACH(rule, m->rules) { + uint32_t priority; + bool found; + + if (rule->priority_set) + /* The rule is configured with priority. */ + continue; + + priority = rule->priority; + rule->priority = 0; + found = routing_policy_rule_equal(rule, in); + rule->priority = priority; + + if (found) { + if (ret) + *ret = rule; + return 0; + } + } + + return -ENOENT; +} + +static int routing_policy_rule_add(Manager *m, RoutingPolicyRule *rule) { + int r; + + assert(m); + assert(rule); + assert(IN_SET(rule->family, AF_INET, AF_INET6)); + + r = set_ensure_put(&m->rules, &routing_policy_rule_hash_ops, rule); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + rule->manager = m; + return 0; +} + +static int routing_policy_rule_acquire_priority(Manager *manager, RoutingPolicyRule *rule) { + _cleanup_set_free_ Set *priorities = NULL; + RoutingPolicyRule *tmp; + uint32_t priority; + Network *network; + int r; + + assert(manager); + assert(rule); + assert(IN_SET(rule->family, AF_INET, AF_INET6)); + + if (rule->priority_set) + return 0; + + /* Find the highest unused priority. Note that 32766 is already used by kernel. + * See kernel_rules[] below. 
*/ + + SET_FOREACH(tmp, manager->rules) { + if (tmp->family != rule->family) + continue; + if (tmp->priority == 0 || tmp->priority > 32765) + continue; + r = set_ensure_put(&priorities, NULL, UINT32_TO_PTR(tmp->priority)); + if (r < 0) + return r; + } + + ORDERED_HASHMAP_FOREACH(network, manager->networks) + HASHMAP_FOREACH(tmp, network->rules_by_section) { + if (tmp->family != AF_UNSPEC && tmp->family != rule->family) + continue; + if (!tmp->priority_set) + continue; + if (tmp->priority == 0 || tmp->priority > 32765) + continue; + r = set_ensure_put(&priorities, NULL, UINT32_TO_PTR(tmp->priority)); + if (r < 0) + return r; + } + + for (priority = 32765; priority > 0; priority--) + if (!set_contains(priorities, UINT32_TO_PTR(priority))) + break; + + rule->priority = priority; + return 0; +} + +static void log_routing_policy_rule_debug(const RoutingPolicyRule *rule, const char *str, const Link *link, const Manager *m) { + _cleanup_free_ char *state = NULL, *table = NULL; + + assert(rule); + assert(IN_SET(rule->family, AF_INET, AF_INET6)); + assert(str); + assert(m); + + /* link may be NULL. */ + + if (!DEBUG_LOGGING) + return; + + (void) network_config_state_to_string_alloc(rule->state, &state); + (void) manager_get_route_table_to_string(m, rule->table, /* append_num = */ true, &table); + + log_link_debug(link, + "%s %s routing policy rule (%s): priority: %"PRIu32", %s -> %s, iif: %s, oif: %s, table: %s", + str, strna(network_config_source_to_string(rule->source)), strna(state), + rule->priority, + IN_ADDR_PREFIX_TO_STRING(rule->family, &rule->from, rule->from_prefixlen), + IN_ADDR_PREFIX_TO_STRING(rule->family, &rule->to, rule->to_prefixlen), + strna(rule->iif), strna(rule->oif), strna(table)); +} + +static int routing_policy_rule_set_netlink_message(const RoutingPolicyRule *rule, sd_netlink_message *m, Link *link) { + int r; + + assert(rule); + assert(m); + + /* link may be NULL. 
*/ + + if (rule->from_prefixlen > 0) { + r = netlink_message_append_in_addr_union(m, FRA_SRC, rule->family, &rule->from); + if (r < 0) + return r; + + r = sd_rtnl_message_routing_policy_rule_set_fib_src_prefixlen(m, rule->from_prefixlen); + if (r < 0) + return r; + } + + if (rule->to_prefixlen > 0) { + r = netlink_message_append_in_addr_union(m, FRA_DST, rule->family, &rule->to); + if (r < 0) + return r; + + r = sd_rtnl_message_routing_policy_rule_set_fib_dst_prefixlen(m, rule->to_prefixlen); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u32(m, FRA_PRIORITY, rule->priority); + if (r < 0) + return r; + + if (rule->tos > 0) { + r = sd_rtnl_message_routing_policy_rule_set_tos(m, rule->tos); + if (r < 0) + return r; + } + + if (rule->table < 256) { + r = sd_rtnl_message_routing_policy_rule_set_table(m, rule->table); + if (r < 0) + return r; + } else { + r = sd_rtnl_message_routing_policy_rule_set_table(m, RT_TABLE_UNSPEC); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, FRA_TABLE, rule->table); + if (r < 0) + return r; + } + + if (rule->fwmark > 0) { + r = sd_netlink_message_append_u32(m, FRA_FWMARK, rule->fwmark); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, FRA_FWMASK, rule->fwmask); + if (r < 0) + return r; + } + + if (rule->iif) { + r = sd_netlink_message_append_string(m, FRA_IIFNAME, rule->iif); + if (r < 0) + return r; + } + + if (rule->oif) { + r = sd_netlink_message_append_string(m, FRA_OIFNAME, rule->oif); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u8(m, FRA_IP_PROTO, rule->ipproto); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(m, FRA_PROTOCOL, rule->protocol); + if (r < 0) + return r; + + if (rule->sport.start != 0 || rule->sport.end != 0) { + r = sd_netlink_message_append_data(m, FRA_SPORT_RANGE, &rule->sport, sizeof(rule->sport)); + if (r < 0) + return r; + } + + if (rule->dport.start != 0 || rule->dport.end != 0) { + r = sd_netlink_message_append_data(m, 
FRA_DPORT_RANGE, &rule->dport, sizeof(rule->dport)); + if (r < 0) + return r; + } + + if (rule->uid_range.start != UID_INVALID && rule->uid_range.end != UID_INVALID) { + r = sd_netlink_message_append_data(m, FRA_UID_RANGE, &rule->uid_range, sizeof(rule->uid_range)); + if (r < 0) + return r; + } + + if (rule->invert_rule) { + r = sd_rtnl_message_routing_policy_rule_set_flags(m, FIB_RULE_INVERT); + if (r < 0) + return r; + } + + if (rule->suppress_prefixlen >= 0) { + r = sd_netlink_message_append_u32(m, FRA_SUPPRESS_PREFIXLEN, (uint32_t) rule->suppress_prefixlen); + if (r < 0) + return r; + } + + if (rule->suppress_ifgroup >= 0) { + r = sd_netlink_message_append_u32(m, FRA_SUPPRESS_IFGROUP, (uint32_t) rule->suppress_ifgroup); + if (r < 0) + return r; + } + + r = sd_rtnl_message_routing_policy_rule_set_fib_type(m, rule->type); + if (r < 0) + return r; + + return 0; +} + +static int routing_policy_rule_remove_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { + int r; + + assert(m); + + r = sd_netlink_message_get_errno(m); + if (r < 0) + log_message_warning_errno(m, r, "Could not drop routing policy rule"); + + return 1; +} + +static int routing_policy_rule_remove(RoutingPolicyRule *rule) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(rule); + assert(rule->manager); + assert(rule->manager->rtnl); + assert(IN_SET(rule->family, AF_INET, AF_INET6)); + + log_routing_policy_rule_debug(rule, "Removing", NULL, rule->manager); + + r = sd_rtnl_message_new_routing_policy_rule(rule->manager->rtnl, &m, RTM_DELRULE, rule->family); + if (r < 0) + return log_warning_errno(r, "Could not allocate netlink message: %m"); + + r = routing_policy_rule_set_netlink_message(rule, m, NULL); + if (r < 0) + return log_warning_errno(r, "Could not create netlink message: %m"); + + r = netlink_call_async(rule->manager->rtnl, NULL, m, + routing_policy_rule_remove_handler, + NULL, NULL); + if (r < 0) + return log_warning_errno(r, "Could not 
send netlink message: %m"); + + routing_policy_rule_enter_removing(rule); + return 0; +} + +static int routing_policy_rule_configure(RoutingPolicyRule *rule, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(rule); + assert(IN_SET(rule->family, AF_INET, AF_INET6)); + assert(link); + assert(link->ifindex > 0); + assert(link->manager); + assert(link->manager->rtnl); + assert(req); + + log_routing_policy_rule_debug(rule, "Configuring", link, link->manager); + + r = sd_rtnl_message_new_routing_policy_rule(link->manager->rtnl, &m, RTM_NEWRULE, rule->family); + if (r < 0) + return r; + + r = routing_policy_rule_set_netlink_message(rule, m, link); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static void manager_mark_routing_policy_rules(Manager *m, bool foreign, const Link *except) { + RoutingPolicyRule *rule; + Link *link; + + assert(m); + + /* First, mark all existing rules. */ + SET_FOREACH(rule, m->rules) { + /* Do not touch rules managed by kernel. */ + if (rule->protocol == RTPROT_KERNEL) + continue; + + /* When 'foreign' is true, mark only foreign rules, and vice versa. */ + if (foreign != (rule->source == NETWORK_CONFIG_SOURCE_FOREIGN)) + continue; + + /* Ignore rules not assigned yet or already removing. */ + if (!routing_policy_rule_exists(rule)) + continue; + + routing_policy_rule_mark(rule); + } + + /* Then, unmark all rules requested by active links. */ + HASHMAP_FOREACH(link, m->links_by_index) { + if (link == except) + continue; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + continue; + + HASHMAP_FOREACH(rule, link->network->rules_by_section) { + RoutingPolicyRule *existing; + + if (IN_SET(rule->family, AF_INET, AF_INET6)) { + if (routing_policy_rule_get(m, rule, &existing) >= 0) + routing_policy_rule_unmark(existing); + } else { + /* The case Family=both. 
*/ + rule->family = AF_INET; + if (routing_policy_rule_get(m, rule, &existing) >= 0) + routing_policy_rule_unmark(existing); + + rule->family = AF_INET6; + if (routing_policy_rule_get(m, rule, &existing) >= 0) + routing_policy_rule_unmark(existing); + + rule->family = AF_UNSPEC; + } + } + } +} + +int manager_drop_routing_policy_rules_internal(Manager *m, bool foreign, const Link *except) { + RoutingPolicyRule *rule; + int r = 0; + + assert(m); + + manager_mark_routing_policy_rules(m, foreign, except); + + SET_FOREACH(rule, m->rules) { + if (!routing_policy_rule_is_marked(rule)) + continue; + + RET_GATHER(r, routing_policy_rule_remove(rule)); + } + + return r; +} + +void link_foreignize_routing_policy_rules(Link *link) { + RoutingPolicyRule *rule; + + assert(link); + assert(link->manager); + + manager_mark_routing_policy_rules(link->manager, /* foreign = */ false, link); + + SET_FOREACH(rule, link->manager->rules) { + if (!routing_policy_rule_is_marked(rule)) + continue; + + rule->source = NETWORK_CONFIG_SOURCE_FOREIGN; + } +} + +static int routing_policy_rule_process_request(Request *req, Link *link, RoutingPolicyRule *rule) { + int r; + + assert(req); + assert(link); + assert(rule); + + if (!link_is_ready_to_configure(link, false)) + return 0; + + r = routing_policy_rule_configure(rule, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure routing policy rule: %m"); + + routing_policy_rule_enter_configuring(rule); + return 1; +} + +static int static_routing_policy_rule_configure_handler( + sd_netlink *rtnl, + sd_netlink_message *m, + Request *req, + Link *link, + RoutingPolicyRule *rule) { + + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_warning_errno(link, m, r, "Could not add routing policy rule"); + link_enter_failed(link); + return 1; + } + + if (link->static_routing_policy_rule_messages == 0) { + log_link_debug(link, "Routing policy rule 
configured"); + link->static_routing_policy_rules_configured = true; + link_check_ready(link); + } + + return 1; +} + +static int link_request_routing_policy_rule(Link *link, RoutingPolicyRule *rule) { + RoutingPolicyRule *existing; + int r; + + assert(link); + assert(link->manager); + assert(rule); + assert(rule->source != NETWORK_CONFIG_SOURCE_FOREIGN); + + if (routing_policy_rule_get(link->manager, rule, &existing) < 0) { + _cleanup_(routing_policy_rule_freep) RoutingPolicyRule *tmp = NULL; + + r = routing_policy_rule_dup(rule, &tmp); + if (r < 0) + return r; + + r = routing_policy_rule_acquire_priority(link->manager, tmp); + if (r < 0) + return r; + + r = routing_policy_rule_add(link->manager, tmp); + if (r < 0) + return r; + + existing = TAKE_PTR(tmp); + } else + existing->source = rule->source; + + log_routing_policy_rule_debug(existing, "Requesting", link, link->manager); + r = link_queue_request_safe(link, REQUEST_TYPE_ROUTING_POLICY_RULE, + existing, NULL, + routing_policy_rule_hash_func, + routing_policy_rule_compare_func, + routing_policy_rule_process_request, + &link->static_routing_policy_rule_messages, + static_routing_policy_rule_configure_handler, + NULL); + if (r <= 0) + return r; + + routing_policy_rule_enter_requesting(existing); + return 1; +} + +static int link_request_static_routing_policy_rule(Link *link, RoutingPolicyRule *rule) { + int r; + + if (IN_SET(rule->family, AF_INET, AF_INET6)) + return link_request_routing_policy_rule(link, rule); + + rule->family = AF_INET; + r = link_request_routing_policy_rule(link, rule); + if (r < 0) { + rule->family = AF_UNSPEC; + return r; + } + + rule->family = AF_INET6; + r = link_request_routing_policy_rule(link, rule); + rule->family = AF_UNSPEC; + return r; +} + +int link_request_static_routing_policy_rules(Link *link) { + RoutingPolicyRule *rule; + int r; + + assert(link); + assert(link->network); + + link->static_routing_policy_rules_configured = false; + + HASHMAP_FOREACH(rule, 
link->network->rules_by_section) { + r = link_request_static_routing_policy_rule(link, rule); + if (r < 0) + return log_link_warning_errno(link, r, "Could not request routing policy rule: %m"); + } + + if (link->static_routing_policy_rule_messages == 0) { + link->static_routing_policy_rules_configured = true; + link_check_ready(link); + } else { + log_link_debug(link, "Requesting routing policy rules"); + link_set_state(link, LINK_STATE_CONFIGURING); + } + + return 0; +} + +static const RoutingPolicyRule kernel_rules[] = { + { .family = AF_INET, .priority_set = true, .priority = 0, .table = RT_TABLE_LOCAL, .type = FR_ACT_TO_TBL, .uid_range.start = UID_INVALID, .uid_range.end = UID_INVALID, .suppress_prefixlen = -1, .suppress_ifgroup = -1, }, + { .family = AF_INET, .priority_set = true, .priority = 32766, .table = RT_TABLE_MAIN, .type = FR_ACT_TO_TBL, .uid_range.start = UID_INVALID, .uid_range.end = UID_INVALID, .suppress_prefixlen = -1, .suppress_ifgroup = -1, }, + { .family = AF_INET, .priority_set = true, .priority = 32767, .table = RT_TABLE_DEFAULT, .type = FR_ACT_TO_TBL, .uid_range.start = UID_INVALID, .uid_range.end = UID_INVALID, .suppress_prefixlen = -1, .suppress_ifgroup = -1, }, + { .family = AF_INET6, .priority_set = true, .priority = 0, .table = RT_TABLE_LOCAL, .type = FR_ACT_TO_TBL, .uid_range.start = UID_INVALID, .uid_range.end = UID_INVALID, .suppress_prefixlen = -1, .suppress_ifgroup = -1, }, + { .family = AF_INET6, .priority_set = true, .priority = 32766, .table = RT_TABLE_MAIN, .type = FR_ACT_TO_TBL, .uid_range.start = UID_INVALID, .uid_range.end = UID_INVALID, .suppress_prefixlen = -1, .suppress_ifgroup = -1, }, +}; + +static bool routing_policy_rule_is_created_by_kernel(const RoutingPolicyRule *rule) { + assert(rule); + + if (rule->l3mdev > 0) + /* Currently, [RoutingPolicyRule] does not explicitly set FRA_L3MDEV. So, if the flag + * is set, it is safe to treat the rule as created by kernel. 
*/ + return true; + + for (size_t i = 0; i < ELEMENTSOF(kernel_rules); i++) + if (routing_policy_rule_equal(rule, &kernel_rules[i])) + return true; + + return false; +} + +int manager_rtnl_process_rule(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { + _cleanup_(routing_policy_rule_freep) RoutingPolicyRule *tmp = NULL; + RoutingPolicyRule *rule = NULL; + bool adjust_protocol = false; + uint16_t type; + int r; + + assert(rtnl); + assert(message); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: failed to receive rule message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWRULE, RTM_DELRULE)) { + log_warning("rtnl: received unexpected message type %u when processing rule, ignoring.", type); + return 0; + } + + r = routing_policy_rule_new(&tmp); + if (r < 0) { + log_oom(); + return 0; + } + + r = sd_rtnl_message_get_family(message, &tmp->family); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get rule family, ignoring: %m"); + return 0; + } else if (!IN_SET(tmp->family, AF_INET, AF_INET6)) { + log_debug("rtnl: received rule message with invalid family %d, ignoring.", tmp->family); + return 0; + } + + r = netlink_message_read_in_addr_union(message, FRA_SRC, tmp->family, &tmp->from); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_SRC attribute, ignoring: %m"); + return 0; + } else if (r >= 0) { + r = sd_rtnl_message_routing_policy_rule_get_fib_src_prefixlen(message, &tmp->from_prefixlen); + if (r < 0) { + log_warning_errno(r, "rtnl: received rule message without valid source prefix length, ignoring: %m"); + return 0; + } + } + + r = netlink_message_read_in_addr_union(message, FRA_DST, tmp->family, &tmp->to); + if (r < 0 && r != -ENODATA) { + 
log_warning_errno(r, "rtnl: could not get FRA_DST attribute, ignoring: %m"); + return 0; + } else if (r >= 0) { + r = sd_rtnl_message_routing_policy_rule_get_fib_dst_prefixlen(message, &tmp->to_prefixlen); + if (r < 0) { + log_warning_errno(r, "rtnl: received rule message without valid destination prefix length, ignoring: %m"); + return 0; + } + } + + unsigned flags; + r = sd_rtnl_message_routing_policy_rule_get_flags(message, &flags); + if (r < 0) { + log_warning_errno(r, "rtnl: received rule message without valid flag, ignoring: %m"); + return 0; + } + tmp->invert_rule = flags & FIB_RULE_INVERT; + + r = sd_netlink_message_read_u32(message, FRA_FWMARK, &tmp->fwmark); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_FWMARK attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, FRA_FWMASK, &tmp->fwmask); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_FWMASK attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, FRA_PRIORITY, &tmp->priority); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_PRIORITY attribute, ignoring: %m"); + return 0; + } + /* The kernel does not send priority if priority is zero. So, the flag below must be always set + * even if the message does not contain FRA_PRIORITY. 
*/ + tmp->priority_set = true; + + r = sd_netlink_message_read_u32(message, FRA_TABLE, &tmp->table); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_TABLE attribute, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_routing_policy_rule_get_tos(message, &tmp->tos); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FIB rule TOS, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_routing_policy_rule_get_fib_type(message, &tmp->type); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FIB rule type, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_string_strdup(message, FRA_IIFNAME, &tmp->iif); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_IIFNAME attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_string_strdup(message, FRA_OIFNAME, &tmp->oif); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_OIFNAME attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u8(message, FRA_IP_PROTO, &tmp->ipproto); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_IP_PROTO attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u8(message, FRA_PROTOCOL, &tmp->protocol); + if (r == -ENODATA) + /* If FRA_PROTOCOL is supported by kernel, then the attribute is always appended. + * When the received message does not have FRA_PROTOCOL, then we need to adjust the + * protocol of the rule later. 
*/ + adjust_protocol = true; + else if (r < 0) { + log_warning_errno(r, "rtnl: could not get FRA_PROTOCOL attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u8(message, FRA_L3MDEV, &tmp->l3mdev); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_L3MDEV attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read(message, FRA_SPORT_RANGE, sizeof(tmp->sport), &tmp->sport); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_SPORT_RANGE attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read(message, FRA_DPORT_RANGE, sizeof(tmp->dport), &tmp->dport); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_DPORT_RANGE attribute, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read(message, FRA_UID_RANGE, sizeof(tmp->uid_range), &tmp->uid_range); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_UID_RANGE attribute, ignoring: %m"); + return 0; + } + + uint32_t suppress_prefixlen; + r = sd_netlink_message_read_u32(message, FRA_SUPPRESS_PREFIXLEN, &suppress_prefixlen); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_SUPPRESS_PREFIXLEN attribute, ignoring: %m"); + return 0; + } + if (r >= 0) + tmp->suppress_prefixlen = (int32_t) suppress_prefixlen; + + uint32_t suppress_ifgroup; + r = sd_netlink_message_read_u32(message, FRA_SUPPRESS_IFGROUP, &suppress_ifgroup); + if (r < 0 && r != -ENODATA) { + log_warning_errno(r, "rtnl: could not get FRA_SUPPRESS_IFGROUP attribute, ignoring: %m"); + return 0; + } + if (r >= 0) + tmp->suppress_ifgroup = (int32_t) suppress_ifgroup; + + if (adjust_protocol) + /* As .network files does not have setting to specify protocol, we can assume the + * protocol of the received rule is RTPROT_KERNEL or RTPROT_STATIC. */ + tmp->protocol = routing_policy_rule_is_created_by_kernel(tmp) ? 
RTPROT_KERNEL : RTPROT_STATIC; + + (void) routing_policy_rule_get(m, tmp, &rule); + + switch (type) { + case RTM_NEWRULE: + if (rule) { + routing_policy_rule_enter_configured(rule); + log_routing_policy_rule_debug(rule, "Received remembered", NULL, m); + } else if (!m->manage_foreign_rules) { + routing_policy_rule_enter_configured(tmp); + log_routing_policy_rule_debug(tmp, "Ignoring received", NULL, m); + } else { + routing_policy_rule_enter_configured(tmp); + log_routing_policy_rule_debug(tmp, "Remembering", NULL, m); + r = routing_policy_rule_add(m, tmp); + if (r < 0) { + log_warning_errno(r, "Could not remember foreign rule, ignoring: %m"); + return 0; + } + TAKE_PTR(tmp); + } + break; + case RTM_DELRULE: + if (rule) { + routing_policy_rule_enter_removed(rule); + if (rule->state == 0) { + log_routing_policy_rule_debug(rule, "Forgetting", NULL, m); + routing_policy_rule_free(rule); + } else + log_routing_policy_rule_debug(rule, "Removed", NULL, m); + } else + log_routing_policy_rule_debug(tmp, "Kernel removed unknown", NULL, m); + break; + + default: + assert_not_reached(); + } + + return 1; +} + +static int parse_fwmark_fwmask(const char *s, uint32_t *ret_fwmark, uint32_t *ret_fwmask) { + _cleanup_free_ char *fwmark_str = NULL; + uint32_t fwmark, fwmask = 0; + const char *slash; + int r; + + assert(s); + assert(ret_fwmark); + assert(ret_fwmask); + + slash = strchr(s, '/'); + if (slash) { + fwmark_str = strndup(s, slash - s); + if (!fwmark_str) + return -ENOMEM; + } + + r = safe_atou32(fwmark_str ?: s, &fwmark); + if (r < 0) + return r; + + if (fwmark > 0) { + if (slash) { + r = safe_atou32(slash + 1, &fwmask); + if (r < 0) + return r; + } else + fwmask = UINT32_MAX; + } + + *ret_fwmark = fwmark; + *ret_fwmask = fwmask; + + return 0; +} + +int config_parse_routing_policy_rule_tos( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + 
void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = safe_atou8(rvalue, &n->tos); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse RPDB rule TOS, ignoring: %s", rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_priority( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + n->priority = 0; + n->priority_set = false; + TAKE_PTR(n); + return 0; + } + + r = safe_atou32(rvalue, &n->priority); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse RPDB rule priority, ignoring: %s", rvalue); + return 0; + } + n->priority_set = true; + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_table( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, 
filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = manager_get_route_table_from_string(network->manager, rvalue, &n->table); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Could not parse RPDB rule route table \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_fwmark_mask( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = parse_fwmark_fwmask(rvalue, &n->fwmark, &n->fwmask); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse RPDB rule firewall mark or mask, ignoring: %s", rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_prefix( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + union in_addr_union *buffer; + uint8_t *prefixlen; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (streq(lvalue, "To")) { + buffer = &n->to; + prefixlen = &n->to_prefixlen; + } else { + buffer = &n->from; + prefixlen = &n->from_prefixlen; + } + + if (n->family == AF_UNSPEC) + r = 
in_addr_prefix_from_string_auto(rvalue, &n->family, buffer, prefixlen); + else + r = in_addr_prefix_from_string(rvalue, n->family, buffer, prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "RPDB rule prefix is invalid, ignoring assignment: %s", rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_device( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid interface name '%s' in %s=, ignoring assignment.", rvalue, lvalue); + return 0; + } + + r = free_and_strdup(streq(lvalue, "IncomingInterface") ? 
&n->iif : &n->oif, rvalue); + if (r < 0) + return log_oom(); + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_port_range( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + uint16_t low, high; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = parse_ip_port_range(rvalue, &low, &high); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse routing policy rule port range '%s'", rvalue); + return 0; + } + + if (streq(lvalue, "SourcePort")) { + n->sport.start = low; + n->sport.end = high; + } else { + n->dport.start = low; + n->dport.end = high; + } + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_ip_protocol( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = parse_ip_protocol(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse IP protocol '%s' for routing policy rule, ignoring: %m", rvalue); + return 0; + } + + n->ipproto = r; + + TAKE_PTR(n); + return 0; +} + +int config_parse_routing_policy_rule_invert( + const char *unit, + const char *filename, + 
unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + /* parse_boolean() yields a negative value on failure; any non-negative result is stored as the flag. */ + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse RPDB rule invert, ignoring: %s", rvalue); + return 0; + } + + n->invert_rule = r; + + TAKE_PTR(n); + return 0; +} + /* Config parser for the address family of a routing policy rule: rvalue is mapped with routing_policy_rule_address_family_from_string() and stored in n->address_family; a negative result is logged and 0 is returned without reaching TAKE_PTR(n). */ +int config_parse_routing_policy_rule_family( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + AddressFamily a; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + a = routing_policy_rule_address_family_from_string(rvalue); + if (a < 0) { + log_syntax(unit, LOG_WARNING, filename, line, a, + "Invalid address family '%s', ignoring.", rvalue); + return 0; + } + + n->address_family = a; + + TAKE_PTR(n); + return 0; +} + /* Config parser for the UID range of a routing policy rule: rvalue is first passed to get_user_creds() (on success the range is that single UID), otherwise to parse_uid_range(); the result is stored in n->uid_range. */ +int config_parse_routing_policy_rule_uid_range( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + uid_t start, end; + int r; + + assert(filename); + assert(section);
+ assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + /* Try rvalue with get_user_creds() first; only when that fails is it parsed as a range. */ + r = get_user_creds(&rvalue, &start, NULL, NULL, NULL, 0); + if (r >= 0) + end = start; + else { + r = parse_uid_range(rvalue, &start, &end); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid uid or uid range '%s', ignoring: %m", rvalue); + return 0; + } + } + + n->uid_range.start = start; + n->uid_range.end = end; + + TAKE_PTR(n); + return 0; +} + /* Config parser for the suppress-prefix-length of a routing policy rule: parse_ip_prefix_length() writes directly into n->suppress_prefixlen; -ERANGE and other failures are logged with distinct messages and 0 is returned without reaching TAKE_PTR(n). */ +int config_parse_routing_policy_rule_suppress_prefixlen( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + r = parse_ip_prefix_length(rvalue, &n->suppress_prefixlen); + if (r == -ERANGE) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Prefix length outside of valid range 0-128, ignoring: %s", rvalue); + return 0; + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse RPDB rule suppress_prefixlen, ignoring: %s", rvalue); + return 0; + } + + TAKE_PTR(n); + return 0; +} + /* Config parser for SuppressInterfaceGroup=: an empty rvalue sets n->suppress_ifgroup to -1; otherwise rvalue is parsed with safe_atoi32() and negative values are rejected with a warning. */ +int config_parse_routing_policy_rule_suppress_ifgroup( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int32_t suppress_ifgroup; + int r; + + assert(filename); + assert(section); +
assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + if (isempty(rvalue)) { + /* An empty value resets the setting to -1. That is a successful assignment, so release n with TAKE_PTR(); otherwise the _cleanup_ handler runs exactly as on the rejected-value paths below (NOTE(review): presumably flagging the whole section invalid - confirm against routing_policy_rule_free_or_set_invalid()). */ + n->suppress_ifgroup = -1; + TAKE_PTR(n); + return 0; + } + + r = safe_atoi32(rvalue, &suppress_ifgroup); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse SuppressInterfaceGroup=, ignoring assignment: %s", rvalue); + return 0; + } + if (suppress_ifgroup < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Value of SuppressInterfaceGroup= must be in the range 0…2147483647, ignoring assignment: %s", rvalue); + return 0; + } + n->suppress_ifgroup = suppress_ifgroup; + TAKE_PTR(n); + return 0; +} + /* Config parser for the rule type: the result of fr_act_type_from_string(rvalue) is stored in n->type after a cast to uint8_t; a negative result is logged and 0 is returned without reaching TAKE_PTR(n). */ +int config_parse_routing_policy_rule_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(routing_policy_rule_free_or_set_invalidp) RoutingPolicyRule *n = NULL; + Network *network = userdata; + int r, t; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + r = routing_policy_rule_new_static(network, filename, section_line, &n); + if (r < 0) + return log_oom(); + + t = fr_act_type_from_string(rvalue); + if (t < 0) { + log_syntax(unit, LOG_WARNING, filename, line, t, + "Could not parse FIB rule type \"%s\", ignoring assignment: %m", rvalue); + return 0; + } + + n->type = (uint8_t) t; + + TAKE_PTR(n); + return 0; +} + /* Validates one parsed [RoutingPolicyRule] section: returns -EINVAL when the section is flagged invalid or when Family= conflicts with the family derived from To=/From=; otherwise derives rule->family from rule->address_family while it is still AF_UNSPEC and returns 0. */ +static int routing_policy_rule_section_verify(RoutingPolicyRule *rule) { + if (section_is_invalid(rule->section)) + return -EINVAL; + + if ((rule->family == AF_INET && FLAGS_SET(rule->address_family, ADDRESS_FAMILY_IPV6)) || + (rule->family == AF_INET6 && FLAGS_SET(rule->address_family, ADDRESS_FAMILY_IPV4))) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: address family specified by Family= conflicts with the address " + "specified
by To= or From=. Ignoring [RoutingPolicyRule] section from line %u.", + rule->section->filename, rule->section->line); + /* With no family derived from To=/From=, fall back to Family=: IPv4 (also the choice when Family= is unset) or IPv6. */ + if (rule->family == AF_UNSPEC) { + if (IN_SET(rule->address_family, ADDRESS_FAMILY_IPV4, ADDRESS_FAMILY_NO)) + rule->family = AF_INET; + else if (rule->address_family == ADDRESS_FAMILY_IPV6) + rule->family = AF_INET6; + /* rule->family can be AF_UNSPEC only when Family=both. */ + } + + /* Currently, [RoutingPolicyRule] does not have a setting to set FRA_L3MDEV flag. Please also + * update routing_policy_rule_is_created_by_kernel() when a new setting which sets the flag is + * added in the future. */ + if (rule->l3mdev > 0) + assert_not_reached(); + + return 0; +} + /* Frees every rule in network->rules_by_section for which routing_policy_rule_section_verify() returns a negative value. */ +void network_drop_invalid_routing_policy_rules(Network *network) { + RoutingPolicyRule *rule; + + assert(network); + + HASHMAP_FOREACH(rule, network->rules_by_section) + if (routing_policy_rule_section_verify(rule) < 0) + routing_policy_rule_free(rule); +} diff --git a/src/network/networkd-routing-policy-rule.h b/src/network/networkd-routing-policy-rule.h new file mode 100644 index 0000000..b6ce2fa --- /dev/null +++ b/src/network/networkd-routing-policy-rule.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + /* Header names restored (the operands had been stripped): fixed-width integers, struct fib_rule_port_range / fib_rule_uid_range, and bool. */ +#include <inttypes.h> +#include <linux/fib_rules.h> +#include <stdbool.h> + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Manager Manager; +typedef struct Network Network; + /* One routing policy rule, as filled in by the config_parse_routing_policy_rule_*() parsers. */ +typedef struct RoutingPolicyRule { + Manager *manager; + Network *network; + ConfigSection *section; + NetworkConfigSource source; + NetworkConfigState state; + + bool invert_rule; + bool priority_set; + + uint8_t tos; + uint8_t type; + uint8_t ipproto; /* FRA_IP_PROTO */ + uint8_t protocol; /* FRA_PROTOCOL */ + uint8_t to_prefixlen; + uint8_t from_prefixlen; + uint8_t l3mdev; /* FRA_L3MDEV */ + + uint32_t table; + uint32_t fwmark; + uint32_t fwmask; + uint32_t priority; + + AddressFamily address_family; /* Specified by Family= */ + int
family; /* Automatically determined by From= or To= */ + /* Interface names, stored by config_parse_routing_policy_rule_device(). */ + char *iif; + char *oif; + + union in_addr_union to; + union in_addr_union from; + /* Port and UID ranges, stored by the port_range and uid_range config parsers. */ + struct fib_rule_port_range sport; + struct fib_rule_port_range dport; + struct fib_rule_uid_range uid_range; + + int suppress_prefixlen; + int32_t suppress_ifgroup; /* -1 when SuppressInterfaceGroup= is assigned an empty value */ +} RoutingPolicyRule; + +const char *fr_act_type_full_to_string(int t) _const_; + +RoutingPolicyRule *routing_policy_rule_free(RoutingPolicyRule *rule); + /* Frees the rules of a network whose section fails verification. */ +void network_drop_invalid_routing_policy_rules(Network *network); + +int link_request_static_routing_policy_rules(Link *link); + /* The two inline wrappers below only fix the 'foreign' and 'except' arguments of manager_drop_routing_policy_rules_internal(). */ +int manager_rtnl_process_rule(sd_netlink *rtnl, sd_netlink_message *message, Manager *m); +int manager_drop_routing_policy_rules_internal(Manager *m, bool foreign, const Link *except); +static inline int manager_drop_foreign_routing_policy_rules(Manager *m) { + return manager_drop_routing_policy_rules_internal(m, true, NULL); +} +static inline int link_drop_managed_routing_policy_rules(Link *link) { + assert(link); + return manager_drop_routing_policy_rules_internal(link->manager, false, link); +} +void link_foreignize_routing_policy_rules(Link *link); + +DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(RoutingPolicyRule, routing_policy_rule); + /* Prototypes of the config parsers that fill in a RoutingPolicyRule. */ +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_tos); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_table); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_fwmark_mask); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_prefix); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_priority); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_device); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_port_range); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_ip_protocol); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_invert); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_family); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_uid_range);
+CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_suppress_prefixlen); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_suppress_ifgroup); +CONFIG_PARSER_PROTOTYPE(config_parse_routing_policy_rule_type); diff --git a/src/network/networkd-setlink.c b/src/network/networkd-setlink.c new file mode 100644 index 0000000..011ea1f --- /dev/null +++ b/src/network/networkd-setlink.c @@ -0,0 +1,1309 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "missing_network.h" +#include "netif-util.h" +#include "netlink-util.h" +#include "networkd-address.h" +#include "networkd-can.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "networkd-setlink.h" +#include "networkd-sriov.h" +#include "networkd-wiphy.h" + +static int get_link_default_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + return link_getlink_handler_internal(rtnl, m, link, "Failed to sync link information"); +} + +static int get_link_master_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + if (get_link_default_handler(rtnl, m, link) > 0) + link->master_set = true; + return 0; +} + +static int get_link_update_flag_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + assert(link); + assert(link->set_flags_messages > 0); + + link->set_flags_messages--; + + return get_link_default_handler(rtnl, m, link); +} + +static int set_link_handler_internal( + sd_netlink *rtnl, + sd_netlink_message *m, + Request *req, + Link *link, + bool ignore, + link_netlink_message_handler_t get_link_handler) { + + int r; + + assert(m); + assert(req); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0) { + const char *error_msg; + + error_msg = strjoina("Failed to set ", request_type_to_string(req->type), ignore ? 
", ignoring" : ""); + log_link_message_warning_errno(link, m, r, error_msg); + + if (!ignore) + link_enter_failed(link); + return 0; + } + + log_link_debug(link, "%s set.", request_type_to_string(req->type)); + + if (get_link_handler) { + r = link_call_getlink(link, get_link_handler); + if (r < 0) { + link_enter_failed(link); + return 0; + } + } + + if (link->set_link_messages == 0) + link_check_ready(link); + + return 1; +} + +static int link_set_addrgen_mode_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + int r; + + r = set_link_handler_internal(rtnl, m, req, link, /* ignore = */ true, NULL); + if (r <= 0) + return r; + + r = link_drop_ipv6ll_addresses(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to drop IPv6LL addresses: %m"); + link_enter_failed(link); + } + + return 0; +} + +static int link_set_bond_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ false, NULL); +} + +static int link_set_bridge_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ true, NULL); +} + +static int link_set_bridge_vlan_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ false, NULL); +} + +static int link_set_can_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ false, NULL); +} + +static int link_set_flags_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ false, get_link_default_handler); +} + +static int link_set_group_handler(sd_netlink *rtnl, sd_netlink_message *m, 
Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ false, NULL); +} + /* Reply handler for the IPoIB request: netlink errors are ignored. */ +static int link_set_ipoib_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ true, NULL); +} + /* Reply handler for the MAC address request: netlink errors are ignored and the link state is re-read after success. */ +static int link_set_mac_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ true, get_link_default_handler); +} + /* Reply handler for the first MAC address attempt: on -EBUSY queues one more request with allow_retry = false (failing the link if queuing fails) and returns 0; any other result is passed on to link_set_mac_handler(). */ +static int link_set_mac_allow_retry_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r == -EBUSY) { + /* Most real network devices refuse to set its hardware address with -EBUSY when its + * operstate is not down. See, eth_prepare_mac_addr_change() in net/ethernet/eth.c + * of kernel. */ + + log_link_message_debug_errno(link, m, r, "Failed to set MAC address, retrying again: %m"); + + r = link_request_to_set_mac(link, /* allow_retry = */ false); + if (r < 0) + link_enter_failed(link); + + return 0; + } + + return link_set_mac_handler(rtnl, m, req, link, userdata); +} + /* Reply handler for setting the master: netlink errors fail the link; after success the link state is re-read with get_link_master_handler(). */ +static int link_set_master_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ false, get_link_master_handler); +} + /* Reply handler for unsetting the master: same as link_set_master_handler() except that netlink errors are ignored. */ +static int link_unset_master_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + /* Some devices do not support setting master ifindex. Let's ignore error on unsetting master ifindex.
*/ + return set_link_handler_internal(rtnl, m, req, link, /* ignore = */ true, get_link_master_handler); +} + /* Reply handler for the MTU request: netlink errors are ignored; after success the link state is re-read and the IPv6 MTU is applied again (a failure there is only logged). Always returns <= 0. */ +static int link_set_mtu_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, void *userdata) { + int r; + + r = set_link_handler_internal(rtnl, m, req, link, /* ignore = */ true, get_link_default_handler); + if (r <= 0) + return r; + + /* The kernel resets ipv6 mtu after changing device mtu; + * we must set this here, after we've set device mtu */ + r = link_set_ipv6_mtu(link); + if (r < 0) + log_link_warning_errno(link, r, "Failed to set IPv6 MTU, ignoring: %m"); + + return 0; +} + /* Fills the netlink message 'req' with the attributes for the given request type. Values come from link->network, from fields of link, or from 'userdata' (decoded with PTR_TO_UINT8()/PTR_TO_UINT32() for the address generation mode, master and MTU requests). Returns 0 on success or the first negative error from a helper; an unknown type hits assert_not_reached(). */ +static int link_configure_fill_message( + Link *link, + sd_netlink_message *req, + RequestType type, + void *userdata) { + int r; + + switch (type) { + case REQUEST_TYPE_SET_LINK_ADDRESS_GENERATION_MODE: + r = ipv6ll_addrgen_mode_fill_message(req, PTR_TO_UINT8(userdata)); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_BOND: + r = sd_netlink_message_set_flags(req, NLM_F_REQUEST | NLM_F_ACK); + if (r < 0) + return r; + /* Attributes are nested as IFLA_LINKINFO > IFLA_INFO_DATA("bond"); both containers are closed again below. */ + r = sd_netlink_message_open_container(req, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_open_container_union(req, IFLA_INFO_DATA, "bond"); + if (r < 0) + return r; + /* Both bond attributes carry this link's own ifindex. */ + if (link->network->active_slave) { + r = sd_netlink_message_append_u32(req, IFLA_BOND_ACTIVE_SLAVE, link->ifindex); + if (r < 0) + return r; + } + + if (link->network->primary_slave) { + r = sd_netlink_message_append_u32(req, IFLA_BOND_PRIMARY, link->ifindex); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + break; + case REQUEST_TYPE_SET_LINK_BRIDGE: + r = sd_rtnl_message_link_set_family(req, AF_BRIDGE); + if (r < 0) + return r; + /* Bridge port attributes go into an IFLA_PROTINFO container; each one is appended only when the corresponding setting is configured. */ + r = sd_netlink_message_open_container(req, IFLA_PROTINFO); + if (r < 0) + return r; + + if (link->network->use_bpdu >= 0) { + r = sd_netlink_message_append_u8(req,
IFLA_BRPORT_GUARD, !link->network->use_bpdu); + if (r < 0) + return r; + } + /* Each block below appends one bridge port attribute, and only when its setting is >= 0. */ + if (link->network->hairpin >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_MODE, link->network->hairpin); + if (r < 0) + return r; + } + + if (link->network->isolated >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_ISOLATED, link->network->isolated); + if (r < 0) + return r; + } + + if (link->network->fast_leave >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_FAST_LEAVE, link->network->fast_leave); + if (r < 0) + return r; + } + /* IFLA_BRPORT_PROTECT carries the negation of allow_port_to_be_root. */ + if (link->network->allow_port_to_be_root >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_PROTECT, !link->network->allow_port_to_be_root); + if (r < 0) + return r; + } + + if (link->network->unicast_flood >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_UNICAST_FLOOD, link->network->unicast_flood); + if (r < 0) + return r; + } + + if (link->network->multicast_flood >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_MCAST_FLOOD, link->network->multicast_flood); + if (r < 0) + return r; + } + + if (link->network->multicast_to_unicast >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_MCAST_TO_UCAST, link->network->multicast_to_unicast); + if (r < 0) + return r; + } + + if (link->network->neighbor_suppression >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_NEIGH_SUPPRESS, link->network->neighbor_suppression); + if (r < 0) + return r; + } + + if (link->network->learning >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_LEARNING, link->network->learning); + if (r < 0) + return r; + } + + if (link->network->bridge_proxy_arp >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_PROXYARP, link->network->bridge_proxy_arp); + if (r < 0) + return r; + } + + if (link->network->bridge_proxy_arp_wifi >= 0) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_PROXYARP_WIFI, link->network->bridge_proxy_arp_wifi); + if (r < 0) + return r; + } + /* The path cost is sent only when it is non-zero. */ + if (link->network->cost !=
0) { + r = sd_netlink_message_append_u32(req, IFLA_BRPORT_COST, link->network->cost); + if (r < 0) + return r; + } + /* Priority and multicast router are sent only when they differ from their *_INVALID markers. */ + if (link->network->priority != LINK_BRIDGE_PORT_PRIORITY_INVALID) { + r = sd_netlink_message_append_u16(req, IFLA_BRPORT_PRIORITY, link->network->priority); + if (r < 0) + return r; + } + + if (link->network->multicast_router != _MULTICAST_ROUTER_INVALID) { + r = sd_netlink_message_append_u8(req, IFLA_BRPORT_MULTICAST_ROUTER, link->network->multicast_router); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_BRIDGE_VLAN: + r = sd_rtnl_message_link_set_family(req, AF_BRIDGE); + if (r < 0) + return r; + /* VLAN information goes into an IFLA_AF_SPEC container. */ + r = sd_netlink_message_open_container(req, IFLA_AF_SPEC); + if (r < 0) + return r; + + if (link->master_ifindex <= 0) { + /* master needs BRIDGE_FLAGS_SELF flag */ + r = sd_netlink_message_append_u16(req, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF); + if (r < 0) + return r; + } + /* Append the PVID and the VLAN bitmaps taken from link->network. */ + r = bridge_vlan_append_info(link, req, link->network->pvid, link->network->br_vid_bitmap, link->network->br_untagged_bitmap); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + break; + case REQUEST_TYPE_SET_LINK_CAN: + r = can_set_netlink_message(link, req); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_FLAGS: { + unsigned ifi_change = 0, ifi_flags = 0; + /* For every configured setting (>= 0), mark its flag in ifi_change and set or clear it in ifi_flags; IFF_NOARP is set when arp is 0. */ + if (link->network->arp >= 0) { + ifi_change |= IFF_NOARP; + SET_FLAG(ifi_flags, IFF_NOARP, link->network->arp == 0); + } + + if (link->network->multicast >= 0) { + ifi_change |= IFF_MULTICAST; + SET_FLAG(ifi_flags, IFF_MULTICAST, link->network->multicast); + } + + if (link->network->allmulticast >= 0) { + ifi_change |= IFF_ALLMULTI; + SET_FLAG(ifi_flags, IFF_ALLMULTI, link->network->allmulticast); + } + + if (link->network->promiscuous >= 0) { + ifi_change |= IFF_PROMISC; + SET_FLAG(ifi_flags, IFF_PROMISC, link->network->promiscuous); + } + + r =
sd_rtnl_message_link_set_flags(req, ifi_flags, ifi_change); + if (r < 0) + return r; + + break; + } + case REQUEST_TYPE_SET_LINK_GROUP: + r = sd_netlink_message_append_u32(req, IFLA_GROUP, (uint32_t) link->network->group); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_MAC: + r = netlink_message_append_hw_addr(req, IFLA_ADDRESS, &link->requested_hw_addr); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_IPOIB: + r = ipoib_set_netlink_message(link, req); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_MASTER: + r = sd_netlink_message_append_u32(req, IFLA_MASTER, PTR_TO_UINT32(userdata)); + if (r < 0) + return r; + break; + case REQUEST_TYPE_SET_LINK_MTU: + r = sd_netlink_message_append_u32(req, IFLA_MTU, PTR_TO_UINT32(userdata)); + if (r < 0) + return r; + break; + default: + assert_not_reached(); + } + + return 0; +} + /* Builds and sends the netlink message for one set-link request. Bond requests are RTM_NEWLINK messages addressed to link->master_ifindex, CAN and IPoIB requests are RTM_NEWLINK messages for the link itself, and everything else is RTM_SETLINK for the link itself. Returns the result of request_call_netlink_async() or an earlier negative error. */ +static int link_configure(Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(link); + assert(link->manager); + assert(req); + + log_link_debug(link, "Setting %s", request_type_to_string(req->type)); + + if (req->type == REQUEST_TYPE_SET_LINK_BOND) + r = sd_rtnl_message_new_link(link->manager->rtnl, &m, RTM_NEWLINK, link->master_ifindex); + else if (IN_SET(req->type, REQUEST_TYPE_SET_LINK_CAN, REQUEST_TYPE_SET_LINK_IPOIB)) + r = sd_rtnl_message_new_link(link->manager->rtnl, &m, RTM_NEWLINK, link->ifindex); + else + r = sd_rtnl_message_new_link(link->manager->rtnl, &m, RTM_SETLINK, link->ifindex); + if (r < 0) + return r; + + r = link_configure_fill_message(link, m, req->type, req->userdata); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + /* A netdev counts as ready when its state is NETDEV_STATE_READY and its ifindex is non-zero. */ +static bool netdev_is_ready(NetDev *netdev) { + assert(netdev); + + if (netdev->state != NETDEV_STATE_READY) + return false; + if (netdev->ifindex == 0) + return false; + + return true; +} + /* Decides whether a queued set-link request can be sent now: false means not yet, true means ready, -EALREADY means the requested master is already set, any other negative value is an error. */ +static int link_is_ready_to_set_link(Link *link, Request *req) {
+ int r; + + assert(link); + assert(link->manager); + assert(link->network); + assert(req); + /* Requests are only processed while the link is in the CONFIGURING or CONFIGURED state. */ + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return false; + + switch (req->type) { + case REQUEST_TYPE_SET_LINK_BOND: + case REQUEST_TYPE_SET_LINK_BRIDGE: + if (!link->master_set) + return false; + + if (link->network->keep_master && link->master_ifindex <= 0) + return false; + break; + /* Same conditions as above, except that a link whose own kind is "bridge" does not need a master. */ + case REQUEST_TYPE_SET_LINK_BRIDGE_VLAN: + if (!link->master_set) + return false; + + if (link->network->keep_master && link->master_ifindex <= 0 && !streq_ptr(link->kind, "bridge")) + return false; + + break; + + case REQUEST_TYPE_SET_LINK_CAN: + /* Do not check link->set_flags_messages here, as it is ok even if link->flags + * is outdated, and checking the counter causes a deadlock. */ + if (FLAGS_SET(link->flags, IFF_UP)) { + /* The CAN interface must be down to configure bitrate, etc... */ + r = link_down_now(link); + if (r < 0) + return r; + } + break; + + case REQUEST_TYPE_SET_LINK_MAC: + if (req->netlink_handler == link_set_mac_handler) { + /* This is the second attempt to set hardware address. On the first attempt + * req->netlink_handler points to link_set_mac_allow_retry_handler(). + * The first attempt failed as the interface was up. */ + r = link_down_now(link); + if (r < 0) + return r; + + /* If the kind of the link is "bond", we need + * set the slave link down as well.
*/ + if (streq_ptr(link->kind, "bond")) { + r = link_down_slave_links(link); + if (r < 0) + return r; + } + } + break; + + case REQUEST_TYPE_SET_LINK_MASTER: { + uint32_t m = 0; + Request req_mac = { + .link = link, + .type = REQUEST_TYPE_SET_LINK_MAC, + }; + /* Pick the master's ifindex from the first configured netdev (batadv, bond, bridge, vrf); m stays 0 when none is configured. The netdev must be ready, and for bond and bridge the request additionally waits while the queue contains an entry matching req_mac. */ + if (link->network->batadv) { + if (!netdev_is_ready(link->network->batadv)) + return false; + m = link->network->batadv->ifindex; + } else if (link->network->bond) { + if (ordered_set_contains(link->manager->request_queue, &req_mac)) + return false; + if (!netdev_is_ready(link->network->bond)) + return false; + m = link->network->bond->ifindex; + } else if (link->network->bridge) { + if (ordered_set_contains(link->manager->request_queue, &req_mac)) + return false; + if (!netdev_is_ready(link->network->bridge)) + return false; + m = link->network->bridge->ifindex; + } else if (link->network->vrf) { + if (!netdev_is_ready(link->network->vrf)) + return false; + m = link->network->vrf->ifindex; + } + + if (m == (uint32_t) link->master_ifindex) { + /* The requested master is already set. */ + link->master_set = true; + return -EALREADY; /* indicate to cancel the request. */ + } + + /* Do not check link->set_flags_messages here, as it is ok even if link->flags is outdated, + * and checking the counter causes a deadlock. */ + if (link->network->bond && FLAGS_SET(link->flags, IFF_UP)) { + /* link must be down when joining to bond master. */ + r = link_down_now(link); + if (r < 0) + return r; + } + /* Hand the chosen ifindex to link_configure_fill_message() through the request's userdata. */ + req->userdata = UINT32_TO_PTR(m); + break; + } + case REQUEST_TYPE_SET_LINK_MTU: { + if (ordered_set_contains(link->manager->request_queue, + &(const Request) { + .link = link, + .type = REQUEST_TYPE_SET_LINK_IPOIB, + })) + return false; + + /* Changing FD mode may affect MTU.
*/ + if (ordered_set_contains(link->manager->request_queue, + &(const Request) { + .link = link, + .type = REQUEST_TYPE_SET_LINK_CAN, + })) + return false; + break; /* explicit, instead of falling through into 'default' */ + } + default: + break; + } + + return true; +} + /* Request-queue callback for set-link requests. Returns 1 when the request is finished (sent, or cancelled because nothing needs to change), 0 when it is not ready yet, and a negative errno on failure. */ +static int link_process_set_link(Request *req, Link *link, void *userdata) { + int r; + + assert(req); + assert(link); + + r = link_is_ready_to_set_link(link, req); + if (r == -EALREADY) + return 1; /* Cancel the request. */ + if (r <= 0) + return r; + + r = link_configure(link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to set %s: %m", request_type_to_string(req->type)); + + return 1; +} + /* Queues a set-link request of the given type with link_process_set_link() as its processor and link->set_link_messages as its counter. On success returns 0 and, when 'ret' is non-NULL, stores the request there; on failure logs a warning and returns the negative errno. */ +static int link_request_set_link( + Link *link, + RequestType type, + request_netlink_handler_t netlink_handler, + Request **ret) { + + Request *req; + int r; + + assert(link); + + r = link_queue_request_full(link, type, NULL, NULL, NULL, NULL, + link_process_set_link, + &link->set_link_messages, + netlink_handler, + &req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request to set %s: %m", + request_type_to_string(type)); + + log_link_debug(link, "Requested to set %s", request_type_to_string(type)); + + if (ret) + *ret = req; + return 0; +} + /* Requests a change of the IPv6 link-local address generation mode. Does nothing when IPv6 is unsupported or the mode is unchanged. */ +int link_request_to_set_addrgen_mode(Link *link) { + IPv6LinkLocalAddressGenMode mode; + Request *req; + int r; + + assert(link); + assert(link->network); + + if (!socket_ipv6_is_supported()) + return 0; + + mode = link_get_ipv6ll_addrgen_mode(link); + + if (mode == link->ipv6ll_address_gen_mode) + return 0; + + /* If the link is already up, then changing the mode by netlink does not take effect until the + * link goes down. Hence, we need to reset the interface. However, setting the mode by sysctl + * does not need that. Let's use the sysctl interface when the link is already up. + * See also issue #22424.
*/ + if (mode != IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_NONE && + FLAGS_SET(link->flags, IFF_UP)) { + r = link_set_ipv6ll_addrgen_mode(link, mode); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv6 address generation mode, ignoring: %m"); + + return 0; + } + /* Otherwise queue a netlink request and pass the mode through the request's userdata. */ + r = link_request_set_link(link, REQUEST_TYPE_SET_LINK_ADDRESS_GENERATION_MODE, + link_set_addrgen_mode_handler, + &req); + if (r < 0) + return r; + + req->userdata = UINT8_TO_PTR(mode); + return 0; +} + /* Queues a bond request when a bond is configured for the link, or when keep_master is set and the current master's kind is "bond"; otherwise returns 0 without queuing. */ +int link_request_to_set_bond(Link *link) { + assert(link); + assert(link->network); + + if (!link->network->bond) { + Link *master; + + if (!link->network->keep_master) + return 0; + + if (link_get_master(link, &master) < 0) + return 0; + + if (!streq_ptr(master->kind, "bond")) + return 0; + } + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_BOND, + link_set_bond_handler, NULL); +} + /* Queues a bridge port request when a bridge is configured for the link, or when keep_master is set and the current master's kind is "bridge"; otherwise returns 0 without queuing. */ +int link_request_to_set_bridge(Link *link) { + assert(link); + assert(link->network); + + if (!link->network->bridge) { + Link *master; + + if (!link->network->keep_master) + return 0; + + if (link_get_master(link, &master) < 0) + return 0; + + if (!streq_ptr(master->kind, "bridge")) + return 0; + } + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_BRIDGE, + link_set_bridge_handler, + NULL); +} + /* Queues a bridge VLAN request when use_br_vlan is set and the link is a bridge, has a bridge configured, or (with keep_master) currently has a master of kind "bridge". */ +int link_request_to_set_bridge_vlan(Link *link) { + assert(link); + assert(link->network); + + if (!link->network->use_br_vlan) + return 0; + + if (!link->network->bridge && !streq_ptr(link->kind, "bridge")) { + Link *master; + + if (!link->network->keep_master) + return 0; + + if (link_get_master(link, &master) < 0) + return 0; + + if (!streq_ptr(master->kind, "bridge")) + return 0; + } + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_BRIDGE_VLAN, + link_set_bridge_vlan_handler, + NULL); +} + /* Queues a CAN request only for links with iftype ARPHRD_CAN and kind "can". */ +int link_request_to_set_can(Link *link) { + assert(link); + assert(link->network); + + if (link->iftype != ARPHRD_CAN) + return 0; + + if (!streq_ptr(link->kind, "can")) + return 0; + + return
link_request_set_link(link, REQUEST_TYPE_SET_LINK_CAN, + link_set_can_handler, + NULL); +} + /* Queues a flags request unless all of arp, multicast, allmulticast and promiscuous are unset (< 0). */ +int link_request_to_set_flags(Link *link) { + assert(link); + assert(link->network); + + if (link->network->arp < 0 && + link->network->multicast < 0 && + link->network->allmulticast < 0 && + link->network->promiscuous < 0) + return 0; + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_FLAGS, + link_set_flags_handler, + NULL); +} + /* Queues a group request unless the configured group is negative. */ +int link_request_to_set_group(Link *link) { + assert(link); + assert(link->network); + + if (link->network->group < 0) + return 0; + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_GROUP, + link_set_group_handler, + NULL); +} + /* Queues a MAC address request. Does nothing when no address is configured or when the verified address already equals link->hw_addr. With allow_retry the reply goes to link_set_mac_allow_retry_handler(), otherwise to link_set_mac_handler(). */ +int link_request_to_set_mac(Link *link, bool allow_retry) { + int r; + + assert(link); + assert(link->network); + + if (link->network->hw_addr.length == 0) + return 0; + /* Work on a copy in link->requested_hw_addr; that copy is what link_configure_fill_message() sends. */ + link->requested_hw_addr = link->network->hw_addr; + r = net_verify_hardware_address(link->ifname, /* is_static = */ true, + link->iftype, &link->hw_addr, &link->requested_hw_addr); + if (r < 0) + return r; + + if (hw_addr_equal(&link->hw_addr, &link->requested_hw_addr)) + return 0; + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_MAC, + allow_retry ? link_set_mac_allow_retry_handler : link_set_mac_handler, + NULL); +} + /* Queues an IPoIB request only for ARPHRD_INFINIBAND links that have ipoib_mode or ipoib_umcast configured (>= 0). */ +int link_request_to_set_ipoib(Link *link) { + assert(link); + assert(link->network); + + if (link->iftype != ARPHRD_INFINIBAND) + return 0; + + if (link->network->ipoib_mode < 0 && + link->network->ipoib_umcast < 0) + return 0; + + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_IPOIB, + link_set_ipoib_handler, + NULL); +} + /* Queues a request to set or unset the master and keeps link->master_set in sync: it is true when nothing has to change and false while a request is pending. */ +int link_request_to_set_master(Link *link) { + assert(link); + assert(link->network); + + if (link->network->keep_master) { + /* When KeepMaster=yes, BatmanAdvanced=, Bond=, Bridge=, and VRF= are ignored.
*/ + link->master_set = true; + return 0; + + } else if (link->network->batadv || link->network->bond || link->network->bridge || link->network->vrf) { + link->master_set = false; + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_MASTER, + link_set_master_handler, + NULL); + + } else if (link->master_ifindex != 0) { + /* Unset master only when it is set. */ + link->master_set = false; + return link_request_set_link(link, REQUEST_TYPE_SET_LINK_MASTER, + link_unset_master_handler, + NULL); + + } else { + /* Nothing we need to do. */ + link->master_set = true; + return 0; + } +} + +int link_request_to_set_mtu(Link *link, uint32_t mtu) { + const char *origin; + uint32_t min_mtu, max_mtu; + Request *req; + int r; + + assert(link); + assert(link->network); + + min_mtu = link->min_mtu; + origin = "the minimum MTU of the interface"; + if (link_ipv6_enabled(link)) { + /* IPv6 protocol requires a minimum MTU of IPV6_MTU_MIN(1280) bytes on the interface. Bump up + * MTU bytes to IPV6_MTU_MIN. */ + if (min_mtu < IPV6_MIN_MTU) { + min_mtu = IPV6_MIN_MTU; + origin = "the minimum IPv6 MTU"; + } + if (min_mtu < link->network->ipv6_mtu) { + min_mtu = link->network->ipv6_mtu; + origin = "the requested IPv6 MTU in IPv6MTUBytes="; + } + } + + if (mtu < min_mtu) { + log_link_warning(link, "Bumping the requested MTU %"PRIu32" to %s (%"PRIu32")", + mtu, origin, min_mtu); + mtu = min_mtu; + } + + max_mtu = link->max_mtu; + if (link->iftype == ARPHRD_CAN) + /* The maximum MTU may be changed when FD mode is changed. + * See https://docs.kernel.org/networking/can.html#can-fd-flexible-data-rate-driver-support + * MTU = 16 (CAN_MTU) => Classical CAN device + * MTU = 72 (CANFD_MTU) => CAN FD capable device + * So, even if the current maximum is 16, we should not reduce the requested value now. 
*/ + max_mtu = MAX(max_mtu, 72u); + + if (mtu > max_mtu) { + log_link_warning(link, "Reducing the requested MTU %"PRIu32" to the interface's maximum MTU %"PRIu32".", + mtu, max_mtu); + mtu = max_mtu; + } + + if (link->mtu == mtu) + return 0; + + r = link_request_set_link(link, REQUEST_TYPE_SET_LINK_MTU, + link_set_mtu_handler, + &req); + if (r < 0) + return r; + + req->userdata = UINT32_TO_PTR(mtu); + return 0; +} + +static bool link_reduces_vlan_mtu(Link *link) { + /* See netif_reduces_vlan_mtu() in kernel. */ + return streq_ptr(link->kind, "macsec"); +} + +static uint32_t link_get_requested_mtu_by_stacked_netdevs(Link *link) { + uint32_t mtu = 0; + NetDev *dev; + + HASHMAP_FOREACH(dev, link->network->stacked_netdevs) + if (dev->kind == NETDEV_KIND_VLAN && dev->mtu > 0) + /* See vlan_dev_change_mtu() in kernel. */ + mtu = MAX(mtu, link_reduces_vlan_mtu(link) ? dev->mtu + 4 : dev->mtu); + + else if (dev->kind == NETDEV_KIND_MACVLAN && dev->mtu > mtu) + /* See macvlan_change_mtu() in kernel. */ + mtu = dev->mtu; + + return mtu; +} + +int link_configure_mtu(Link *link) { + uint32_t mtu; + + assert(link); + assert(link->network); + + if (link->network->mtu > 0) + return link_request_to_set_mtu(link, link->network->mtu); + + mtu = link_get_requested_mtu_by_stacked_netdevs(link); + if (link->mtu >= mtu) + return 0; + + log_link_notice(link, "Bumping MTU bytes from %"PRIu32" to %"PRIu32" because of stacked device. " + "If it is not desired, then please explicitly specify MTUBytes= setting.", + link->mtu, mtu); + + return link_request_to_set_mtu(link, mtu); +} + +static int link_up_dsa_slave(Link *link) { + Link *master; + int r; + + assert(link); + + /* For older kernels (specifically, older than 9d5ef190e5615a7b63af89f88c4106a5bc127974, kernel-5.12), + * it is necessary to bring up a DSA slave that its master interface is already up. And bringing up + * the slave fails with -ENETDOWN. 
/* Maps the requested administrative state to the word used in log messages. */
static const char *up_or_down(bool up) {
        if (up)
                return "up";

        return "down";
}
Please also update test_bond_active_slave() in + * test/test-network/systemd-networkd-tests.py. when the log message below is modified. */ + log_link_debug(link, "Bringing link %s", up_or_down(up)); + + r = sd_rtnl_message_new_link(link->manager->rtnl, &m, RTM_SETLINK, link->ifindex); + if (r < 0) + return r; + + r = sd_rtnl_message_link_set_flags(m, up ? IFF_UP : 0, IFF_UP); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static bool link_is_ready_to_activate_one(Link *link, bool allow_unmanaged) { + assert(link); + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED, LINK_STATE_UNMANAGED)) + return false; + + if (!link->network) + return allow_unmanaged; + + if (link->set_link_messages > 0) + return false; + + return true; +} + + static bool link_is_ready_to_activate(Link *link, bool up) { + assert(link); + + if (!check_ready_for_all_sr_iov_ports(link, /* allow_unmanaged = */ false, + link_is_ready_to_activate_one)) + return false; + + if (up && link_rfkilled(link) > 0) + return false; + + return true; +} + +static int link_process_activation(Request *req, Link *link, void *userdata) { + bool up = PTR_TO_INT(userdata); + int r; + + assert(req); + assert(link); + + if (!link_is_ready_to_activate(link, up)) + return 0; + + r = link_up_or_down(link, up, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to activate link: %m"); + + return 1; +} + +int link_request_to_activate(Link *link) { + bool up; + int r; + + assert(link); + assert(link->network); + + switch (link->network->activation_policy) { + case ACTIVATION_POLICY_BOUND: + r = link_handle_bound_to_list(link); + if (r < 0) + return r; + _fallthrough_; + case ACTIVATION_POLICY_MANUAL: + link->activated = true; + link_check_ready(link); + return 0; + case ACTIVATION_POLICY_UP: + case ACTIVATION_POLICY_ALWAYS_UP: + up = true; + break; + case ACTIVATION_POLICY_DOWN: + case ACTIVATION_POLICY_ALWAYS_DOWN: + up = false; + break; 
+ default: + assert_not_reached(); + } + + link->activated = false; + + r = link_queue_request_full(link, REQUEST_TYPE_ACTIVATE_LINK, + INT_TO_PTR(up), NULL, NULL, NULL, + link_process_activation, + &link->set_flags_messages, + link_up_or_down_handler, NULL); + if (r < 0) + return log_link_error_errno(link, r, "Failed to request to activate link: %m"); + + log_link_debug(link, "Requested to activate link"); + return 0; +} + +static bool link_is_ready_to_bring_up_or_down(Link *link, bool up) { + assert(link); + + if (up && link->dsa_master_ifindex > 0) { + Link *master; + + /* The master interface must be up. See comments in link_up_dsa_slave(). */ + + if (link_get_by_index(link->manager, link->dsa_master_ifindex, &master) < 0) + return false; + + if (!FLAGS_SET(master->flags, IFF_UP)) + return false; + } + + if (link->state == LINK_STATE_UNMANAGED) + return true; + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return false; + + if (link->set_link_messages > 0) + return false; + + if (!link->activated) + return false; + + if (up && link_rfkilled(link) > 0) + return false; + + return true; +} + +static int link_process_up_or_down(Request *req, Link *link, void *userdata) { + bool up = PTR_TO_INT(userdata); + int r; + + assert(req); + assert(link); + + if (!link_is_ready_to_bring_up_or_down(link, up)) + return 0; + + r = link_up_or_down(link, up, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to bring link %s: %m", up_or_down(up)); + + return 1; +} + +int link_request_to_bring_up_or_down(Link *link, bool up) { + int r; + + assert(link); + + r = link_queue_request_full(link, REQUEST_TYPE_UP_DOWN, + INT_TO_PTR(up), NULL, NULL, NULL, + link_process_up_or_down, + &link->set_flags_messages, + link_up_or_down_handler, NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request to bring link %s: %m", + up_or_down(up)); + + log_link_debug(link, "Requested to bring link %s", up_or_down(up)); + return 0; +} 
+ +static int link_down_now_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + assert(link); + assert(link->set_flags_messages > 0); + + link->set_flags_messages--; + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0) + log_link_message_warning_errno(link, m, r, "Could not bring down interface, ignoring"); + + r = link_call_getlink(link, get_link_update_flag_handler); + if (r < 0) { + link_enter_failed(link); + return 0; + } + + link->set_flags_messages++; + return 0; +} + +int link_down_now(Link *link) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + + log_link_debug(link, "Bringing link down"); + + r = sd_rtnl_message_new_link(link->manager->rtnl, &req, RTM_SETLINK, link->ifindex); + if (r < 0) + return log_link_warning_errno(link, r, "Could not allocate RTM_SETLINK message: %m"); + + r = sd_rtnl_message_link_set_flags(req, 0, IFF_UP); + if (r < 0) + return log_link_warning_errno(link, r, "Could not set link flags: %m"); + + r = netlink_call_async(link->manager->rtnl, NULL, req, link_down_now_handler, + link_netlink_destroy_callback, link); + if (r < 0) + return log_link_warning_errno(link, r, "Could not send rtnetlink message: %m"); + + link->set_flags_messages++; + link_ref(link); + return 0; +} + +int link_down_slave_links(Link *link) { + Link *slave; + int r; + + assert(link); + + SET_FOREACH(slave, link->slaves) { + r = link_down_now(slave); + if (r < 0) + return r; + } + + return 0; +} + +static int link_remove_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(m); + assert(link); + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 0; + + r = sd_netlink_message_get_errno(m); + if (r < 0) + log_link_message_warning_errno(link, m, r, "Could not remove interface, ignoring"); + + 
return 0; +} + +int link_remove(Link *link) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + + log_link_debug(link, "Removing link."); + + r = sd_rtnl_message_new_link(link->manager->rtnl, &req, RTM_DELLINK, link->ifindex); + if (r < 0) + return log_link_debug_errno(link, r, "Could not allocate RTM_DELLINK message: %m"); + + r = netlink_call_async(link->manager->rtnl, NULL, req, link_remove_handler, + link_netlink_destroy_callback, link); + if (r < 0) + return log_link_debug_errno(link, r, "Could not send rtnetlink message: %m"); + + link_ref(link); + + return 0; +} diff --git a/src/network/networkd-setlink.h b/src/network/networkd-setlink.h new file mode 100644 index 0000000..841e5ee --- /dev/null +++ b/src/network/networkd-setlink.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +typedef struct Link Link; + +int link_request_to_set_addrgen_mode(Link *link); +int link_request_to_set_bond(Link *link); +int link_request_to_set_bridge(Link *link); +int link_request_to_set_bridge_vlan(Link *link); +int link_request_to_set_can(Link *link); +int link_request_to_set_flags(Link *link); +int link_request_to_set_group(Link *link); +int link_request_to_set_mac(Link *link, bool allow_retry); +int link_request_to_set_ipoib(Link *link); +int link_request_to_set_master(Link *link); +int link_request_to_set_mtu(Link *link, uint32_t mtu); + +int link_configure_mtu(Link *link); + +int link_request_to_activate(Link *link); + +int link_request_to_bring_up_or_down(Link *link, bool up); + +int link_down_now(Link *link); +int link_down_slave_links(Link *link); +int link_remove(Link *link); diff --git a/src/network/networkd-speed-meter.c b/src/network/networkd-speed-meter.c new file mode 100644 index 0000000..cf8294e --- /dev/null +++ b/src/network/networkd-speed-meter.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: 
/* Handles one message of an RTM_GETLINK dump reply for the speed meter.
 *
 * Messages that are not RTM_NEWLINK are ignored (returns 0). For a known link the previous
 * statistics sample is moved to stats_old, the IFLA_STATS64 attribute is read into stats_new and
 * stats_updated is set. Returns 0 on success or the negative value of the first failing call. */
static int process_message(Manager *manager, sd_netlink_message *message) {
        uint16_t type;
        int ifindex, r;
        Link *link;

        r = sd_netlink_message_get_type(message, &type);
        if (r < 0)
                return r;

        if (type != RTM_NEWLINK)
                return 0;

        r = sd_rtnl_message_link_get_ifindex(message, &ifindex);
        if (r < 0)
                return r;

        /* Links we do not track are reported as an error to the caller. */
        r = link_get_by_index(manager, ifindex, &link);
        if (r < 0)
                return r;

        /* Keep the previous sample before overwriting it.
         * NOTE(review): if the read below fails, stats_old has already been replaced while
         * stats_updated stays unset. */
        link->stats_old = link->stats_new;

        r = sd_netlink_message_read(message, IFLA_STATS64, sizeof link->stats_new, &link->stats_new);
        if (r < 0)
                return r;

        link->stats_updated = true;

        return 0;
}
return 0; + } + + for (sd_netlink_message *i = reply; i; i = sd_netlink_message_next(i)) + (void) process_message(manager, i); + + return 0; +} + +int manager_start_speed_meter(Manager *manager) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(manager); + assert(manager->event); + + if (!manager->use_speed_meter) + return 0; + + r = sd_event_add_time(manager->event, &s, CLOCK_MONOTONIC, 0, 0, speed_meter_handler, manager); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_ON); + if (r < 0) + return r; + + manager->speed_meter_event_source = TAKE_PTR(s); + return 0; +} diff --git a/src/network/networkd-speed-meter.h b/src/network/networkd-speed-meter.h new file mode 100644 index 0000000..4dd024b --- /dev/null +++ b/src/network/networkd-speed-meter.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* Default interval is 10sec. The speed meter periodically make networkd + * to be woke up. So, too small interval value is not desired. + * We set the minimum value 100msec = 0.1sec. */ +#define SPEED_METER_DEFAULT_TIME_INTERVAL (10 * USEC_PER_SEC) +#define SPEED_METER_MINIMUM_TIME_INTERVAL (100 * USEC_PER_MSEC) + +typedef struct Manager Manager; + +int manager_start_speed_meter(Manager *m); diff --git a/src/network/networkd-sriov.c b/src/network/networkd-sriov.c new file mode 100644 index 0000000..78d8cef --- /dev/null +++ b/src/network/networkd-sriov.c @@ -0,0 +1,352 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ + +#include "device-enumerator-private.h" +#include "device-util.h" +#include "fd-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-queue.h" +#include "networkd-sriov.h" + +static int sr_iov_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, SRIOV *sr_iov) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_error_errno(link, m, r, "Could not set up SR-IOV"); + link_enter_failed(link); + return 1; + } + + if (link->sr_iov_messages == 0) { + log_link_debug(link, "SR-IOV configured"); + link->sr_iov_configured = true; + link_check_ready(link); + } + + return 1; +} + +static int sr_iov_configure(SRIOV *sr_iov, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(sr_iov); + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(link->ifindex > 0); + assert(req); + + log_link_debug(link, "Setting SR-IOV virtual function %"PRIu32".", sr_iov->vf); + + r = sd_rtnl_message_new_link(link->manager->rtnl, &m, RTM_SETLINK, link->ifindex); + if (r < 0) + return r; + + r = sr_iov_set_netlink_message(sr_iov, m); + if (r < 0) + return r; + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + +static int sr_iov_process_request(Request *req, Link *link, SRIOV *sr_iov) { + int r; + + assert(req); + assert(link); + assert(sr_iov); + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return 0; + + r = sr_iov_configure(sr_iov, link, req); + if (r < 0) + return log_link_warning_errno(link, r, + "Failed to configure SR-IOV virtual function %"PRIu32": %m", + sr_iov->vf); + + return 1; +} + +int link_request_sr_iov_vfs(Link *link) { + SRIOV *sr_iov; + int r; + + assert(link); + assert(link->network); + + link->sr_iov_configured = false; + + ORDERED_HASHMAP_FOREACH(sr_iov, link->network->sr_iov_by_section) { + r = 
/* Finds the network interface below the given PCI device whose "dev_port" sysfs attribute equals
 * dev_port.
 *
 * Returns the (positive) ifindex on success, -ENODEV if no device matches, -ENXIO if more than
 * one device matches, or another negative value propagated from the sd-device calls. */
static int find_ifindex_from_pci_dev_port(sd_device *pci_dev, const char *dev_port) {
        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
        sd_device *dev;
        int ifindex, r;

        assert(pci_dev);
        assert(dev_port);

        r = sd_device_enumerator_new(&e);
        if (r < 0)
                return r;

        /* Also consider devices that are not marked as initialized yet. */
        r = sd_device_enumerator_allow_uninitialized(e);
        if (r < 0)
                return r;

        /* Restrict the search to children of the PCI device ... */
        r = sd_device_enumerator_add_match_parent(e, pci_dev);
        if (r < 0)
                return r;

        /* ... in the "net" subsystem ... */
        r = sd_device_enumerator_add_match_subsystem(e, "net", true);
        if (r < 0)
                return r;

        /* ... with the requested dev_port value. */
        r = sd_device_enumerator_add_match_sysattr(e, "dev_port", dev_port, true);
        if (r < 0)
                return r;

        dev = sd_device_enumerator_get_device_first(e);
        if (!dev)
                return -ENODEV; /* no device found */

        if (sd_device_enumerator_get_device_next(e))
                return -ENXIO; /* multiple devices found */

        r = sd_device_get_ifindex(dev, &ifindex);
        if (r < 0)
                return r;

        assert(ifindex > 0);
        return ifindex;
}
*/ + + r = link_get_by_index(manager, phys_port_ifindex, &phys_link); + if (r < 0) + return r; + + r = link_get_by_index(manager, virt_port_ifindex, &virt_link); + if (r < 0) + return r; + + /* update VF ifindex in PF */ + r = set_ensure_put(&phys_link->sr_iov_virt_port_ifindices, NULL, INT_TO_PTR(virt_port_ifindex)); + if (r < 0) + return r; + + log_link_debug(phys_link, + "Found SR-IOV VF port %s(%i).", + virt_link ? virt_link->ifname : "n/a", virt_port_ifindex); + + /* update PF ifindex in VF */ + if (virt_link->sr_iov_phys_port_ifindex > 0 && virt_link->sr_iov_phys_port_ifindex != phys_port_ifindex) { + Link *old_phys_link; + + if (link_get_by_index(manager, virt_link->sr_iov_phys_port_ifindex, &old_phys_link) >= 0) + set_remove(old_phys_link->sr_iov_virt_port_ifindices, INT_TO_PTR(virt_port_ifindex)); + } + + virt_link->sr_iov_phys_port_ifindex = phys_port_ifindex; + + log_link_debug(virt_link, + "Found SR-IOV PF port %s(%i).", + phys_link ? phys_link->ifname : "n/a", phys_port_ifindex); + + return 0; +} + +static int link_set_sr_iov_phys_port(Link *link, sd_device *pci_dev, const char *dev_port) { + _cleanup_(sd_device_unrefp) sd_device *pci_physfn_dev = NULL; + int r; + + assert(link); + assert(link->manager); + assert(pci_dev); + assert(dev_port); + + if (link->sr_iov_phys_port_ifindex > 0) + return 0; + + r = sd_device_new_child(&pci_physfn_dev, pci_dev, "physfn"); + if (r < 0) + return r; + + r = find_ifindex_from_pci_dev_port(pci_physfn_dev, dev_port); + if (r < 0) + return r; + + return manager_update_sr_iov_ifindices(link->manager, r, link->ifindex); +} + +static int link_set_sr_iov_virt_ports(Link *link, sd_device *pci_dev, const char *dev_port) { + const char *name; + int r; + + assert(link); + assert(link->manager); + assert(pci_dev); + assert(dev_port); + + set_clear(link->sr_iov_virt_port_ifindices); + + FOREACH_DEVICE_CHILD_WITH_SUFFIX(pci_dev, child, name) { + const char *n; + + /* Accept name prefixed with "virtfn", but refuse "virtfn" itself. 
*/ + n = startswith(name, "virtfn"); + if (isempty(n) || !in_charset(n, DIGITS)) + continue; + + r = find_ifindex_from_pci_dev_port(child, dev_port); + if (r < 0) + continue; + + if (manager_update_sr_iov_ifindices(link->manager, link->ifindex, r) < 0) + continue; + } + + return 0; +} + +int link_set_sr_iov_ifindices(Link *link) { + const char *dev_port; + sd_device *pci_dev; + int r; + + assert(link); + + if (!link->dev) + return -ENODEV; + + r = sd_device_get_parent_with_subsystem_devtype(link->dev, "pci", NULL, &pci_dev); + if (ERRNO_IS_NEG_DEVICE_ABSENT(r)) + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get parent PCI device: %m"); + + /* This may return -EINVAL or -ENODEV, instead of -ENOENT, if the device has been removed or is being + * removed. Let's ignore the error codes here. */ + r = sd_device_get_sysattr_value(link->dev, "dev_port", &dev_port); + if (ERRNO_IS_NEG_DEVICE_ABSENT(r) || r == -EINVAL) + return 0; + if (r < 0) + return log_link_debug_errno(link, r, "Failed to get 'dev_port' sysfs attribute: %m"); + + r = link_set_sr_iov_phys_port(link, pci_dev, dev_port); + if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r)) + return log_link_debug_errno(link, r, "Failed to set SR-IOV physical port: %m"); + + r = link_set_sr_iov_virt_ports(link, pci_dev, dev_port); + if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r)) + return log_link_debug_errno(link, r, "Failed to set SR-IOV virtual ports: %m"); + + return 0; +} + +void link_clear_sr_iov_ifindices(Link *link) { + void *v; + + assert(link); + assert(link->manager); + + if (link->sr_iov_phys_port_ifindex > 0) { + Link *phys_link; + + if (link_get_by_index(link->manager, link->sr_iov_phys_port_ifindex, &phys_link) >= 0) + set_remove(phys_link->sr_iov_virt_port_ifindices, INT_TO_PTR(link->ifindex)); + + link->sr_iov_phys_port_ifindex = 0; + } + + while ((v = set_steal_first(link->sr_iov_virt_port_ifindices))) { + Link *virt_link; + + if (link_get_by_index(link->manager, PTR_TO_INT(v), &virt_link) >= 0) 
/* Applies check_one() to the given link and to all SR-IOV ports related to it.
 *
 * The main target is checked with the caller's allow_unmanaged value; the PF port (when the
 * target is a VF) and all other VF ports of that PF are checked with allow_unmanaged = true.
 * Returns true only if every check passes and every referenced ifindex resolves to a known
 * link. */
bool check_ready_for_all_sr_iov_ports(
                Link *link,
                bool allow_unmanaged, /* for the main target */
                bool (check_one)(Link *link, bool allow_unmanaged)) {

        Link *phys_link;
        void *v;

        assert(link);
        assert(link->manager);
        assert(check_one);

        /* Some drivers make VF ports become down when their PF port becomes down, and may fail to configure
         * VF ports. Also, when a VF port becomes up/down, its PF port and other VF ports may become down.
         * See issue #23315. */

        /* First, check the main target. */
        if (!check_one(link, allow_unmanaged))
                return false;

        /* If this is a VF port, then also check the PF port. */
        if (link->sr_iov_phys_port_ifindex > 0) {
                if (link_get_by_index(link->manager, link->sr_iov_phys_port_ifindex, &phys_link) < 0 ||
                    !check_one(phys_link, /* allow_unmanaged = */ true))
                        return false;
        } else
                /* No PF recorded for this link: iterate over its own VF set below. */
                phys_link = link;

        /* Also check all VF ports. */
        SET_FOREACH(v, phys_link->sr_iov_virt_port_ifindices) {
                int ifindex = PTR_TO_INT(v);
                Link *virt_link;

                if (ifindex == link->ifindex)
                        continue; /* The main target link is a VF port, and its state is already checked. */

                if (link_get_by_index(link->manager, ifindex, &virt_link) < 0)
                        return false;

                if (!check_one(virt_link, /* allow_unmanaged = */ true))
                        return false;
        }

        return true;
}
*/ +#pragma once + +#include "netif-sriov.h" + +typedef struct Link Link; + +int link_request_sr_iov_vfs(Link *link); + +int link_set_sr_iov_ifindices(Link *link); +void link_clear_sr_iov_ifindices(Link *link); + +bool check_ready_for_all_sr_iov_ports( + Link *link, + bool allow_unmanaged, /* for the main target */ + bool (check_one)(Link *link, bool allow_unmanaged)); diff --git a/src/network/networkd-state-file.c b/src/network/networkd-state-file.c new file mode 100644 index 0000000..3a95ba8 --- /dev/null +++ b/src/network/networkd-state-file.c @@ -0,0 +1,863 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "dns-domain.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "network-internal.h" +#include "networkd-dhcp-common.h" +#include "networkd-link.h" +#include "networkd-manager-bus.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-state-file.h" +#include "ordered-set.h" +#include "set.h" +#include "strv.h" +#include "tmpfile-util.h" + +static int ordered_set_put_dns_servers(OrderedSet **s, int ifindex, struct in_addr_full **dns, unsigned n) { + int r; + + assert(s); + assert(dns || n == 0); + + FOREACH_ARRAY(a, dns, n) { + const char *p; + + if ((*a)->ifindex != 0 && (*a)->ifindex != ifindex) + return 0; + + p = in_addr_full_to_string(*a); + if (!p) + return 0; + + r = ordered_set_put_strdup(s, p); + if (r < 0) + return r; + } + + return 0; +} + +static int ordered_set_put_in4_addrv( + OrderedSet **s, + const struct in_addr *addresses, + size_t n, + bool (*predicate)(const struct in_addr *addr)) { + + int r; + + assert(s); + assert(n == 0 || addresses); + + FOREACH_ARRAY(a, addresses, n) { + if (predicate && !predicate(a)) + continue; + + r = ordered_set_put_strdup(s, IN4_ADDR_TO_STRING(a)); + if (r < 0) + return r; + } + + return 0; +} + +static int ordered_set_put_in6_addrv( + OrderedSet **s, + const struct in6_addr 
*addresses, + size_t n) { + + int r; + + assert(s); + assert(n == 0 || addresses); + + FOREACH_ARRAY(a, addresses, n) { + r = ordered_set_put_strdup(s, IN6_ADDR_TO_STRING(a)); + if (r < 0) + return r; + } + + return 0; +} + +static int link_put_dns(Link *link, OrderedSet **s) { + int r; + + assert(link); + assert(link->network); + assert(s); + + if (link->n_dns != UINT_MAX) + return ordered_set_put_dns_servers(s, link->ifindex, link->dns, link->n_dns); + + r = ordered_set_put_dns_servers(s, link->ifindex, link->network->dns, link->network->n_dns); + if (r < 0) + return r; + + if (link->dhcp_lease && link->network->dhcp_use_dns) { + const struct in_addr *addresses; + + r = sd_dhcp_lease_get_dns(link->dhcp_lease, &addresses); + if (r >= 0) { + r = ordered_set_put_in4_addrv(s, addresses, r, in4_addr_is_non_local); + if (r < 0) + return r; + } + } + + if (link->dhcp6_lease && link->network->dhcp6_use_dns) { + const struct in6_addr *addresses; + + r = sd_dhcp6_lease_get_dns(link->dhcp6_lease, &addresses); + if (r >= 0) { + r = ordered_set_put_in6_addrv(s, addresses, r); + if (r < 0) + return r; + } + } + + if (link->network->ipv6_accept_ra_use_dns) { + NDiscRDNSS *a; + + SET_FOREACH(a, link->ndisc_rdnss) { + r = ordered_set_put_in6_addrv(s, &a->router, 1); + if (r < 0) + return r; + } + } + + return 0; +} + +static int link_put_ntp(Link *link, OrderedSet **s) { + int r; + + assert(link); + assert(link->network); + assert(s); + + if (link->ntp) + return ordered_set_put_strdupv(s, link->ntp); + + r = ordered_set_put_strdupv(s, link->network->ntp); + if (r < 0) + return r; + + if (link->dhcp_lease && link->network->dhcp_use_ntp) { + const struct in_addr *addresses; + + r = sd_dhcp_lease_get_ntp(link->dhcp_lease, &addresses); + if (r >= 0) { + r = ordered_set_put_in4_addrv(s, addresses, r, in4_addr_is_non_local); + if (r < 0) + return r; + } + } + + if (link->dhcp6_lease && link->network->dhcp6_use_ntp) { + const struct in6_addr *addresses; + char **fqdn; + + r = 
sd_dhcp6_lease_get_ntp_addrs(link->dhcp6_lease, &addresses); + if (r >= 0) { + r = ordered_set_put_in6_addrv(s, addresses, r); + if (r < 0) + return r; + } + + r = sd_dhcp6_lease_get_ntp_fqdn(link->dhcp6_lease, &fqdn); + if (r >= 0) { + r = ordered_set_put_strdupv(s, fqdn); + if (r < 0) + return r; + } + } + + return 0; +} + +static int link_put_sip(Link *link, OrderedSet **s) { + int r; + + assert(link); + assert(link->network); + assert(s); + + if (link->dhcp_lease && link->network->dhcp_use_ntp) { + const struct in_addr *addresses; + + r = sd_dhcp_lease_get_sip(link->dhcp_lease, &addresses); + if (r >= 0) { + r = ordered_set_put_in4_addrv(s, addresses, r, in4_addr_is_non_local); + if (r < 0) + return r; + } + } + + return 0; +} + +static int link_put_domains(Link *link, bool is_route, OrderedSet **s) { + OrderedSet *link_domains, *network_domains; + DHCPUseDomains use_domains; + int r; + + assert(link); + assert(link->network); + assert(s); + + link_domains = is_route ? link->route_domains : link->search_domains; + network_domains = is_route ? link->network->route_domains : link->network->search_domains; + use_domains = is_route ? 
DHCP_USE_DOMAINS_ROUTE : DHCP_USE_DOMAINS_YES; + + if (link_domains) + return ordered_set_put_string_set(s, link_domains); + + r = ordered_set_put_string_set(s, network_domains); + if (r < 0) + return r; + + if (link->dhcp_lease && link->network->dhcp_use_domains == use_domains) { + const char *domainname; + char **domains; + + r = sd_dhcp_lease_get_domainname(link->dhcp_lease, &domainname); + if (r >= 0) { + r = ordered_set_put_strdup(s, domainname); + if (r < 0) + return r; + } + + r = sd_dhcp_lease_get_search_domains(link->dhcp_lease, &domains); + if (r >= 0) { + r = ordered_set_put_strdupv(s, domains); + if (r < 0) + return r; + } + } + + if (link->dhcp6_lease && link->network->dhcp6_use_domains == use_domains) { + char **domains; + + r = sd_dhcp6_lease_get_domains(link->dhcp6_lease, &domains); + if (r >= 0) { + r = ordered_set_put_strdupv(s, domains); + if (r < 0) + return r; + } + } + + if (link->network->ipv6_accept_ra_use_domains == use_domains) { + NDiscDNSSL *a; + + SET_FOREACH(a, link->ndisc_dnssl) { + r = ordered_set_put_strdup(s, NDISC_DNSSL_DOMAIN(a)); + if (r < 0) + return r; + } + } + + return 0; +} + +int manager_save(Manager *m) { + _cleanup_ordered_set_free_ OrderedSet *dns = NULL, *ntp = NULL, *sip = NULL, *search_domains = NULL, *route_domains = NULL; + const char *operstate_str, *carrier_state_str, *address_state_str, *ipv4_address_state_str, *ipv6_address_state_str, *online_state_str; + LinkOperationalState operstate = LINK_OPERSTATE_OFF; + LinkCarrierState carrier_state = LINK_CARRIER_STATE_OFF; + LinkAddressState ipv4_address_state = LINK_ADDRESS_STATE_OFF, ipv6_address_state = LINK_ADDRESS_STATE_OFF, + address_state = LINK_ADDRESS_STATE_OFF; + LinkOnlineState online_state; + size_t links_offline = 0, links_online = 0; + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_strv_free_ char **p = NULL; + _cleanup_fclose_ FILE *f = NULL; + Link *link; + int r; + + assert(m); + + if (isempty(m->state_file)) + return 0; /* Do not 
update state file when running in test mode. */ + + HASHMAP_FOREACH(link, m->links_by_index) { + if (link->flags & IFF_LOOPBACK) + continue; + + operstate = MAX(operstate, link->operstate); + carrier_state = MAX(carrier_state, link->carrier_state); + address_state = MAX(address_state, link->address_state); + ipv4_address_state = MAX(ipv4_address_state, link->ipv4_address_state); + ipv6_address_state = MAX(ipv6_address_state, link->ipv6_address_state); + + if (!link->network) + continue; + + if (link->network->required_for_online) { + if (link->online_state == LINK_ONLINE_STATE_OFFLINE) + links_offline++; + else if (link->online_state == LINK_ONLINE_STATE_ONLINE) + links_online++; + } + + r = link_put_dns(link, &dns); + if (r < 0) + return r; + + r = link_put_ntp(link, &ntp); + if (r < 0) + return r; + + r = link_put_sip(link, &sip); + if (r < 0) + return r; + + r = link_put_domains(link, /* is_route = */ false, &search_domains); + if (r < 0) + return r; + + r = link_put_domains(link, /* is_route = */ true, &route_domains); + if (r < 0) + return r; + } + + if (carrier_state >= LINK_CARRIER_STATE_ENSLAVED) + carrier_state = LINK_CARRIER_STATE_CARRIER; + + online_state = links_online > 0 ? + (links_offline > 0 ? LINK_ONLINE_STATE_PARTIAL : LINK_ONLINE_STATE_ONLINE) : + (links_offline > 0 ? 
LINK_ONLINE_STATE_OFFLINE : _LINK_ONLINE_STATE_INVALID); + + operstate_str = link_operstate_to_string(operstate); + assert(operstate_str); + + carrier_state_str = link_carrier_state_to_string(carrier_state); + assert(carrier_state_str); + + address_state_str = link_address_state_to_string(address_state); + assert(address_state_str); + + ipv4_address_state_str = link_address_state_to_string(ipv4_address_state); + assert(ipv4_address_state_str); + + ipv6_address_state_str = link_address_state_to_string(ipv6_address_state); + assert(ipv6_address_state_str); + + r = fopen_temporary(m->state_file, &f, &temp_path); + if (r < 0) + return r; + + (void) fchmod(fileno(f), 0644); + + fprintf(f, + "# This is private data. Do not parse.\n" + "OPER_STATE=%s\n" + "CARRIER_STATE=%s\n" + "ADDRESS_STATE=%s\n" + "IPV4_ADDRESS_STATE=%s\n" + "IPV6_ADDRESS_STATE=%s\n", + operstate_str, carrier_state_str, address_state_str, ipv4_address_state_str, ipv6_address_state_str); + + online_state_str = link_online_state_to_string(online_state); + if (online_state_str) + fprintf(f, "ONLINE_STATE=%s\n", online_state_str); + + ordered_set_print(f, "DNS=", dns); + ordered_set_print(f, "NTP=", ntp); + ordered_set_print(f, "SIP=", sip); + ordered_set_print(f, "DOMAINS=", search_domains); + ordered_set_print(f, "ROUTE_DOMAINS=", route_domains); + + r = fflush_and_check(f); + if (r < 0) + return r; + + r = conservative_rename(temp_path, m->state_file); + if (r < 0) + return r; + + temp_path = mfree(temp_path); + + if (m->operational_state != operstate) { + m->operational_state = operstate; + if (strv_extend(&p, "OperationalState") < 0) + log_oom(); + } + + if (m->carrier_state != carrier_state) { + m->carrier_state = carrier_state; + if (strv_extend(&p, "CarrierState") < 0) + log_oom(); + } + + if (m->address_state != address_state) { + m->address_state = address_state; + if (strv_extend(&p, "AddressState") < 0) + log_oom(); + } + + if (m->ipv4_address_state != ipv4_address_state) { + 
m->ipv4_address_state = ipv4_address_state; + if (strv_extend(&p, "IPv4AddressState") < 0) + log_oom(); + } + + if (m->ipv6_address_state != ipv6_address_state) { + m->ipv6_address_state = ipv6_address_state; + if (strv_extend(&p, "IPv6AddressState") < 0) + log_oom(); + } + + if (m->online_state != online_state) { + m->online_state = online_state; + if (strv_extend(&p, "OnlineState") < 0) + log_oom(); + } + + if (p) { + r = manager_send_changed_strv(m, p); + if (r < 0) + log_warning_errno(r, "Could not emit changed properties, ignoring: %m"); + } + + m->dirty = false; + + return 0; +} + +static void print_link_hashmap(FILE *f, const char *prefix, Hashmap* h) { + bool space = false; + Link *link; + + assert(f); + assert(prefix); + + if (hashmap_isempty(h)) + return; + + fputs(prefix, f); + HASHMAP_FOREACH(link, h) { + if (space) + fputc(' ', f); + + fprintf(f, "%i", link->ifindex); + space = true; + } + + fputc('\n', f); +} + +static void link_save_dns(Link *link, FILE *f, struct in_addr_full **dns, unsigned n_dns, bool *space) { + bool _space = false; + + if (!space) + space = &_space; + + for (unsigned j = 0; j < n_dns; j++) { + const char *str; + + if (dns[j]->ifindex != 0 && dns[j]->ifindex != link->ifindex) + continue; + + str = in_addr_full_to_string(dns[j]); + if (!str) + continue; + + if (*space) + fputc(' ', f); + fputs(str, f); + *space = true; + } +} + +static void serialize_addresses( + FILE *f, + const char *lvalue, + bool *space, + char **addresses, + sd_dhcp_lease *lease, + bool conditional, + sd_dhcp_lease_server_type_t what, + sd_dhcp6_lease *lease6, + bool conditional6, + int (*lease6_get_addr)(sd_dhcp6_lease*, const struct in6_addr**), + int (*lease6_get_fqdn)(sd_dhcp6_lease*, char ***)) { + + bool _space = false; + int r; + + if (!space) + space = &_space; + + if (lvalue) + fprintf(f, "%s=", lvalue); + fputstrv(f, addresses, NULL, space); + + if (lease && conditional) { + const struct in_addr *lease_addresses; + + r = 
sd_dhcp_lease_get_servers(lease, what, &lease_addresses); + if (r > 0) + serialize_in_addrs(f, lease_addresses, r, space, in4_addr_is_non_local); + } + + if (lease6 && conditional6 && lease6_get_addr) { + const struct in6_addr *in6_addrs; + + r = lease6_get_addr(lease6, &in6_addrs); + if (r > 0) + serialize_in6_addrs(f, in6_addrs, r, space); + } + + if (lease6 && conditional6 && lease6_get_fqdn) { + char **in6_hosts; + + r = lease6_get_fqdn(lease6, &in6_hosts); + if (r > 0) + fputstrv(f, in6_hosts, NULL, space); + } + + if (lvalue) + fputc('\n', f); +} + +static void link_save_domains(Link *link, FILE *f, OrderedSet *static_domains, DHCPUseDomains use_domains) { + bool space = false; + const char *p; + + assert(link); + assert(link->network); + assert(f); + + ORDERED_SET_FOREACH(p, static_domains) + fputs_with_space(f, p, NULL, &space); + + if (use_domains == DHCP_USE_DOMAINS_NO) + return; + + if (link->dhcp_lease && link->network->dhcp_use_domains == use_domains) { + const char *domainname; + char **domains; + + if (sd_dhcp_lease_get_domainname(link->dhcp_lease, &domainname) >= 0) + fputs_with_space(f, domainname, NULL, &space); + if (sd_dhcp_lease_get_search_domains(link->dhcp_lease, &domains) >= 0) + fputstrv(f, domains, NULL, &space); + } + + if (link->dhcp6_lease && link->network->dhcp6_use_domains == use_domains) { + char **domains; + + if (sd_dhcp6_lease_get_domains(link->dhcp6_lease, &domains) >= 0) + fputstrv(f, domains, NULL, &space); + } + + if (link->network->ipv6_accept_ra_use_domains == use_domains) { + NDiscDNSSL *dd; + + SET_FOREACH(dd, link->ndisc_dnssl) + fputs_with_space(f, NDISC_DNSSL_DOMAIN(dd), NULL, &space); + } +} + +static int link_save(Link *link) { + const char *admin_state, *oper_state, *carrier_state, *address_state, *ipv4_address_state, *ipv6_address_state, + *captive_portal; + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(link); + assert(link->manager); + + if 
(isempty(link->state_file)) + return 0; /* Do not update state files when running in test mode. */ + + if (link->state == LINK_STATE_LINGER) + return 0; + + link_lldp_save(link); + + admin_state = link_state_to_string(link->state); + assert(admin_state); + + oper_state = link_operstate_to_string(link->operstate); + assert(oper_state); + + carrier_state = link_carrier_state_to_string(link->carrier_state); + assert(carrier_state); + + address_state = link_address_state_to_string(link->address_state); + assert(address_state); + + ipv4_address_state = link_address_state_to_string(link->ipv4_address_state); + assert(ipv4_address_state); + + ipv6_address_state = link_address_state_to_string(link->ipv6_address_state); + assert(ipv6_address_state); + + r = fopen_temporary(link->state_file, &f, &temp_path); + if (r < 0) + return r; + + (void) fchmod(fileno(f), 0644); + + fprintf(f, + "# This is private data. Do not parse.\n" + "ADMIN_STATE=%s\n" + "OPER_STATE=%s\n" + "CARRIER_STATE=%s\n" + "ADDRESS_STATE=%s\n" + "IPV4_ADDRESS_STATE=%s\n" + "IPV6_ADDRESS_STATE=%s\n", + admin_state, oper_state, carrier_state, address_state, ipv4_address_state, ipv6_address_state); + + if (link->network) { + const char *online_state; + bool space = false; + + online_state = link_online_state_to_string(link->online_state); + if (online_state) + fprintf(f, "ONLINE_STATE=%s\n", online_state); + + fprintf(f, "REQUIRED_FOR_ONLINE=%s\n", + yes_no(link->network->required_for_online)); + + LinkOperationalStateRange st = link->network->required_operstate_for_online; + fprintf(f, "REQUIRED_OPER_STATE_FOR_ONLINE=%s%s%s\n", + strempty(link_operstate_to_string(st.min)), + st.max != LINK_OPERSTATE_RANGE_DEFAULT.max ? ":" : "", + st.max != LINK_OPERSTATE_RANGE_DEFAULT.max ? 
strempty(link_operstate_to_string(st.max)) : ""); + + fprintf(f, "REQUIRED_FAMILY_FOR_ONLINE=%s\n", + link_required_address_family_to_string(link->network->required_family_for_online)); + + fprintf(f, "ACTIVATION_POLICY=%s\n", + activation_policy_to_string(link->network->activation_policy)); + + fprintf(f, "NETWORK_FILE=%s\n", link->network->filename); + + fputs("NETWORK_FILE_DROPINS=\"", f); + STRV_FOREACH(d, link->network->dropins) { + _cleanup_free_ char *escaped = NULL; + + escaped = xescape(*d, ":"); + if (!escaped) + return -ENOMEM; + + fputs_with_space(f, escaped, ":", &space); + } + fputs("\"\n", f); + + /************************************************************/ + + fputs("DNS=", f); + if (link->n_dns != UINT_MAX) + link_save_dns(link, f, link->dns, link->n_dns, NULL); + else { + space = false; + link_save_dns(link, f, link->network->dns, link->network->n_dns, &space); + + serialize_addresses(f, NULL, &space, + NULL, + link->dhcp_lease, + link->network->dhcp_use_dns, + SD_DHCP_LEASE_DNS, + link->dhcp6_lease, + link->network->dhcp6_use_dns, + sd_dhcp6_lease_get_dns, + NULL); + + if (link->network->ipv6_accept_ra_use_dns) { + NDiscRDNSS *dd; + + SET_FOREACH(dd, link->ndisc_rdnss) + serialize_in6_addrs(f, &dd->address, 1, &space); + } + } + + fputc('\n', f); + + /************************************************************/ + + if (link->ntp) { + fputs("NTP=", f); + fputstrv(f, link->ntp, NULL, NULL); + fputc('\n', f); + } else + serialize_addresses(f, "NTP", NULL, + link->network->ntp, + link->dhcp_lease, + link->network->dhcp_use_ntp, + SD_DHCP_LEASE_NTP, + link->dhcp6_lease, + link->network->dhcp6_use_ntp, + sd_dhcp6_lease_get_ntp_addrs, + sd_dhcp6_lease_get_ntp_fqdn); + + serialize_addresses(f, "SIP", NULL, + NULL, + link->dhcp_lease, + link->network->dhcp_use_sip, + SD_DHCP_LEASE_SIP, + NULL, false, NULL, NULL); + + /************************************************************/ + + r = link_get_captive_portal(link, &captive_portal); + if (r < 0) + 
return r; + + if (captive_portal) + fprintf(f, "CAPTIVE_PORTAL=%s\n", captive_portal); + + /************************************************************/ + + fputs("DOMAINS=", f); + if (link->search_domains) + link_save_domains(link, f, link->search_domains, DHCP_USE_DOMAINS_NO); + else + link_save_domains(link, f, link->network->search_domains, DHCP_USE_DOMAINS_YES); + fputc('\n', f); + + /************************************************************/ + + fputs("ROUTE_DOMAINS=", f); + if (link->route_domains) + link_save_domains(link, f, link->route_domains, DHCP_USE_DOMAINS_NO); + else + link_save_domains(link, f, link->network->route_domains, DHCP_USE_DOMAINS_ROUTE); + fputc('\n', f); + + /************************************************************/ + + fprintf(f, "LLMNR=%s\n", + resolve_support_to_string(link->llmnr >= 0 ? link->llmnr : link->network->llmnr)); + + /************************************************************/ + + fprintf(f, "MDNS=%s\n", + resolve_support_to_string(link->mdns >= 0 ? link->mdns : link->network->mdns)); + + /************************************************************/ + + int dns_default_route = + link->dns_default_route >= 0 ? link->dns_default_route : + link->network->dns_default_route; + if (dns_default_route >= 0) + fprintf(f, "DNS_DEFAULT_ROUTE=%s\n", yes_no(dns_default_route)); + + /************************************************************/ + + DnsOverTlsMode dns_over_tls_mode = + link->dns_over_tls_mode != _DNS_OVER_TLS_MODE_INVALID ? link->dns_over_tls_mode : + link->network->dns_over_tls_mode; + if (dns_over_tls_mode != _DNS_OVER_TLS_MODE_INVALID) + fprintf(f, "DNS_OVER_TLS=%s\n", dns_over_tls_mode_to_string(dns_over_tls_mode)); + + /************************************************************/ + + DnssecMode dnssec_mode = + link->dnssec_mode != _DNSSEC_MODE_INVALID ? 
link->dnssec_mode : + link->network->dnssec_mode; + if (dnssec_mode != _DNSSEC_MODE_INVALID) + fprintf(f, "DNSSEC=%s\n", dnssec_mode_to_string(dnssec_mode)); + + /************************************************************/ + + Set *nta_anchors = link->dnssec_negative_trust_anchors; + if (set_isempty(nta_anchors)) + nta_anchors = link->network->dnssec_negative_trust_anchors; + + if (!set_isempty(nta_anchors)) { + const char *n; + + fputs("DNSSEC_NTA=", f); + space = false; + SET_FOREACH(n, nta_anchors) + fputs_with_space(f, n, NULL, &space); + fputc('\n', f); + } + } + + print_link_hashmap(f, "CARRIER_BOUND_TO=", link->bound_to_links); + print_link_hashmap(f, "CARRIER_BOUND_BY=", link->bound_by_links); + + if (link->dhcp_lease) { + r = dhcp_lease_save(link->dhcp_lease, link->lease_file); + if (r < 0) + return r; + + fprintf(f, + "DHCP_LEASE=%s\n", + link->lease_file); + } else + (void) unlink(link->lease_file); + + r = link_serialize_dhcp6_client(link, f); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return r; + + r = conservative_rename(temp_path, link->state_file); + if (r < 0) + return r; + + temp_path = mfree(temp_path); + + return 0; +} + +void link_dirty(Link *link) { + int r; + + assert(link); + assert(link->manager); + + /* The serialized state in /run is no longer up-to-date. 
*/ + + /* Also mark manager dirty as link is dirty */ + link->manager->dirty = true; + + r = set_ensure_put(&link->manager->dirty_links, NULL, link); + if (r <= 0) + /* Ignore allocation errors and don't take another ref if the link was already dirty */ + return; + link_ref(link); +} + +void link_clean(Link *link) { + assert(link); + assert(link->manager); + + /* The serialized state in /run is up-to-date */ + + link_unref(set_remove(link->manager->dirty_links, link)); +} + +int link_save_and_clean_full(Link *link, bool also_save_manager) { + int r, k = 0; + + assert(link); + assert(link->manager); + + if (also_save_manager) + k = manager_save(link->manager); + + r = link_save(link); + if (r < 0) + return r; + + link_clean(link); + return k; +} diff --git a/src/network/networkd-state-file.h b/src/network/networkd-state-file.h new file mode 100644 index 0000000..684f0d1 --- /dev/null +++ b/src/network/networkd-state-file.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct Link Link; +typedef struct Manager Manager; + +void link_dirty(Link *link); +void link_clean(Link *link); +int link_save_and_clean_full(Link *link, bool also_save_manager); +static inline int link_save_and_clean(Link *link) { + return link_save_and_clean_full(link, false); +} + +int manager_save(Manager *m); diff --git a/src/network/networkd-sysctl.c b/src/network/networkd-sysctl.c new file mode 100644 index 0000000..2b226b2 --- /dev/null +++ b/src/network/networkd-sysctl.c @@ -0,0 +1,335 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "missing_network.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-sysctl.h" +#include "socket-util.h" +#include "string-table.h" +#include "sysctl-util.h" + +static bool link_is_configured_for_family(Link *link, int family) { + assert(link); + + if (!link->network) + return false; + + if (link->flags & 
IFF_LOOPBACK) + return false; + + /* CAN devices do not support IP layer. Most of the functions below are never called for CAN devices, + * but link_set_ipv6_mtu() may be called after setting interface MTU, and warn about the failure. For + * safety, let's unconditionally check if the interface is not a CAN device. */ + if (IN_SET(family, AF_INET, AF_INET6) && link->iftype == ARPHRD_CAN) + return false; + + if (family == AF_INET6 && !socket_ipv6_is_supported()) + return false; + + return true; +} + +static int link_update_ipv6_sysctl(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + if (!link_ipv6_enabled(link)) + return 0; + + return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "disable_ipv6", false); +} + +static int link_set_proxy_arp(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET)) + return 0; + + if (link->network->proxy_arp < 0) + return 0; + + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp", link->network->proxy_arp > 0); +} + +static bool link_ip_forward_enabled(Link *link, int family) { + assert(link); + assert(IN_SET(family, AF_INET, AF_INET6)); + + if (!link_is_configured_for_family(link, family)) + return false; + + return link->network->ip_forward & (family == AF_INET ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6); +} + +static int link_set_ipv4_forward(Link *link) { + assert(link); + + if (!link_ip_forward_enabled(link, AF_INET)) + return 0; + + /* We propagate the forwarding flag from one interface to the + * global setting one way. This means: as long as at least one + * interface was configured at any time that had IP forwarding + * enabled the setting will stay on for good. We do this + * primarily to keep IPv4 and IPv6 packet forwarding behaviour + * somewhat in sync (see below). 
*/ + + return sysctl_write_ip_property(AF_INET, NULL, "ip_forward", "1"); +} + +static int link_set_ipv6_forward(Link *link) { + assert(link); + + if (!link_ip_forward_enabled(link, AF_INET6)) + return 0; + + /* On Linux, the IPv6 stack does not know a per-interface + * packet forwarding setting: either packet forwarding is on + * for all, or off for all. We hence don't bother with a + * per-interface setting, but simply propagate the interface + * flag, if it is set, to the global flag, one-way. Note that + * while IPv4 would allow a per-interface flag, we expose the + * same behaviour there and also propagate the setting from + * one to all, to keep things simple (see above). */ + + return sysctl_write_ip_property(AF_INET6, "all", "forwarding", "1"); +} + +static int link_set_ipv4_rp_filter(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET)) + return 0; + + if (link->network->ipv4_rp_filter < 0) + return 0; + + return sysctl_write_ip_property_int(AF_INET, link->ifname, "rp_filter", link->network->ipv4_rp_filter); +} + +static int link_set_ipv6_privacy_extensions(Link *link) { + IPv6PrivacyExtensions val; + + assert(link); + assert(link->manager); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + val = link->network->ipv6_privacy_extensions; + if (val < 0) /* If not specified, then use the global setting. */ + val = link->manager->ipv6_privacy_extensions; + + /* When "kernel", do not update the setting. 
*/ + if (val == IPV6_PRIVACY_EXTENSIONS_KERNEL) + return 0; + + return sysctl_write_ip_property_int(AF_INET6, link->ifname, "use_tempaddr", (int) val); +} + +static int link_set_ipv6_accept_ra(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + return sysctl_write_ip_property(AF_INET6, link->ifname, "accept_ra", "0"); +} + +static int link_set_ipv6_dad_transmits(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + if (link->network->ipv6_dad_transmits < 0) + return 0; + + return sysctl_write_ip_property_int(AF_INET6, link->ifname, "dad_transmits", link->network->ipv6_dad_transmits); +} + +static int link_set_ipv6_hop_limit(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + if (link->network->ipv6_hop_limit <= 0) + return 0; + + return sysctl_write_ip_property_int(AF_INET6, link->ifname, "hop_limit", link->network->ipv6_hop_limit); +} + +static int link_set_ipv6_proxy_ndp(Link *link) { + bool v; + + assert(link); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + if (link->network->ipv6_proxy_ndp >= 0) + v = link->network->ipv6_proxy_ndp; + else + v = !set_isempty(link->network->ipv6_proxy_ndp_addresses); + + return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "proxy_ndp", v); +} + +int link_set_ipv6_mtu(Link *link) { + uint32_t mtu; + + assert(link); + + if (!link_is_configured_for_family(link, AF_INET6)) + return 0; + + if (link->network->ipv6_mtu == 0) + return 0; + + mtu = link->network->ipv6_mtu; + if (mtu > link->max_mtu) { + log_link_warning(link, "Reducing requested IPv6 MTU %"PRIu32" to the interface's maximum MTU %"PRIu32".", + mtu, link->max_mtu); + mtu = link->max_mtu; + } + + return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "mtu", mtu); +} + +static int link_set_ipv4_accept_local(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, 
AF_INET)) + return 0; + + if (link->network->ipv4_accept_local < 0) + return 0; + + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "accept_local", link->network->ipv4_accept_local > 0); +} + +static int link_set_ipv4_route_localnet(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET)) + return 0; + + if (link->network->ipv4_route_localnet < 0) + return 0; + + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "route_localnet", link->network->ipv4_route_localnet > 0); +} + +static int link_set_ipv4_promote_secondaries(Link *link) { + assert(link); + + if (!link_is_configured_for_family(link, AF_INET)) + return 0; + + /* If promote_secondaries is not set, DHCP will work only as long as the IP address does not + * change between leases. The kernel will remove all secondary IP addresses of an interface + * otherwise. The way systemd-networkd works is that the new IP of a lease is added as a + * secondary IP and when the primary one expires it relies on the kernel to promote the + * secondary IP. 
See also https://github.com/systemd/systemd/issues/7163 */ + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "promote_secondaries", true); +} + +int link_set_sysctl(Link *link) { + int r; + + assert(link); + + /* If IPv6 configured that is static IPv6 address and IPv6LL autoconfiguration is enabled + * for this interface, then enable IPv6 */ + r = link_update_ipv6_sysctl(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot enable IPv6, ignoring: %m"); + + r = link_set_proxy_arp(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot configure proxy ARP for interface, ignoring: %m"); + + r = link_set_ipv4_forward(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot turn on IPv4 packet forwarding, ignoring: %m"); + + r = link_set_ipv6_forward(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot configure IPv6 packet forwarding, ignoring: %m"); + + r = link_set_ipv6_privacy_extensions(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot configure IPv6 privacy extensions for interface, ignoring: %m"); + + r = link_set_ipv6_accept_ra(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot disable kernel IPv6 accept_ra for interface, ignoring: %m"); + + r = link_set_ipv6_dad_transmits(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv6 dad transmits for interface, ignoring: %m"); + + r = link_set_ipv6_hop_limit(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv6 hop limit for interface, ignoring: %m"); + + r = link_set_ipv6_proxy_ndp(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv6 proxy NDP, ignoring: %m"); + + r = link_set_ipv6_mtu(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv6 MTU, ignoring: %m"); + + r = link_set_ipv6ll_stable_secret(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set stable secret address for IPv6 link-local address: %m"); + + r = link_set_ipv4_accept_local(link); + if (r < 0) + 
log_link_warning_errno(link, r, "Cannot set IPv4 accept_local flag for interface, ignoring: %m"); + + r = link_set_ipv4_route_localnet(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv4 route_localnet flag for interface, ignoring: %m"); + + r = link_set_ipv4_rp_filter(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot set IPv4 reverse path filtering for interface, ignoring: %m"); + + r = link_set_ipv4_promote_secondaries(link); + if (r < 0) + log_link_warning_errno(link, r, "Cannot enable promote_secondaries for interface, ignoring: %m"); + + return 0; +} + +static const char* const ipv6_privacy_extensions_table[_IPV6_PRIVACY_EXTENSIONS_MAX] = { + [IPV6_PRIVACY_EXTENSIONS_NO] = "no", + [IPV6_PRIVACY_EXTENSIONS_PREFER_PUBLIC] = "prefer-public", + [IPV6_PRIVACY_EXTENSIONS_YES] = "yes", + [IPV6_PRIVACY_EXTENSIONS_KERNEL] = "kernel", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(ipv6_privacy_extensions, IPv6PrivacyExtensions, + IPV6_PRIVACY_EXTENSIONS_YES); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ipv6_privacy_extensions, ipv6_privacy_extensions, IPv6PrivacyExtensions, + "Failed to parse IPv6 privacy extensions option"); + +static const char* const ip_reverse_path_filter_table[_IP_REVERSE_PATH_FILTER_MAX] = { + [IP_REVERSE_PATH_FILTER_NO] = "no", + [IP_REVERSE_PATH_FILTER_STRICT] = "strict", + [IP_REVERSE_PATH_FILTER_LOOSE] = "loose", +}; + +DEFINE_STRING_TABLE_LOOKUP(ip_reverse_path_filter, IPReversePathFilter); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ip_reverse_path_filter, ip_reverse_path_filter, IPReversePathFilter, + "Failed to parse IP reverse path filter option"); diff --git a/src/network/networkd-sysctl.h b/src/network/networkd-sysctl.h new file mode 100644 index 0000000..0644384 --- /dev/null +++ b/src/network/networkd-sysctl.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" + +typedef struct Link Link; + +typedef enum IPv6PrivacyExtensions { + /* These values 
map to the kernel's /proc/sys/net/ipv6/conf/xxx/use_tempaddr values. Do not reorder! */ + IPV6_PRIVACY_EXTENSIONS_NO, + IPV6_PRIVACY_EXTENSIONS_PREFER_PUBLIC, + IPV6_PRIVACY_EXTENSIONS_YES, /* aka prefer-temporary */ + IPV6_PRIVACY_EXTENSIONS_KERNEL, /* keep the kernel's default value */ + _IPV6_PRIVACY_EXTENSIONS_MAX, + _IPV6_PRIVACY_EXTENSIONS_INVALID = -EINVAL, +} IPv6PrivacyExtensions; + +typedef enum IPReversePathFilter { + /* These values map to the kernel's /proc/sys/net/ipv4/conf/xxx/rp_filter values. Do not reorder! */ + IP_REVERSE_PATH_FILTER_NO, + IP_REVERSE_PATH_FILTER_STRICT, + IP_REVERSE_PATH_FILTER_LOOSE, + _IP_REVERSE_PATH_FILTER_MAX, + _IP_REVERSE_PATH_FILTER_INVALID = -EINVAL, +} IPReversePathFilter; + +int link_set_sysctl(Link *link); +int link_set_ipv6_mtu(Link *link); + +const char* ipv6_privacy_extensions_to_string(IPv6PrivacyExtensions i) _const_; +IPv6PrivacyExtensions ipv6_privacy_extensions_from_string(const char *s) _pure_; + +const char* ip_reverse_path_filter_to_string(IPReversePathFilter i) _const_; +IPReversePathFilter ip_reverse_path_filter_from_string(const char *s) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_ipv6_privacy_extensions); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_reverse_path_filter); diff --git a/src/network/networkd-util.c b/src/network/networkd-util.c new file mode 100644 index 0000000..33352ba --- /dev/null +++ b/src/network/networkd-util.c @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "condition.h" +#include "conf-parser.h" +#include "escape.h" +#include "logarithm.h" +#include "networkd-link.h" +#include "networkd-util.h" +#include "parse-util.h" +#include "string-table.h" +#include "string-util.h" +#include "web-util.h" + +/* This is used in log messages, and never used in parsing settings. So, upper cases are OK. 
*/ +static const char * const network_config_source_table[_NETWORK_CONFIG_SOURCE_MAX] = { + [NETWORK_CONFIG_SOURCE_FOREIGN] = "foreign", + [NETWORK_CONFIG_SOURCE_STATIC] = "static", + [NETWORK_CONFIG_SOURCE_IPV4LL] = "IPv4LL", + [NETWORK_CONFIG_SOURCE_DHCP4] = "DHCPv4", + [NETWORK_CONFIG_SOURCE_DHCP6] = "DHCPv6", + [NETWORK_CONFIG_SOURCE_DHCP_PD] = "DHCP-PD", + [NETWORK_CONFIG_SOURCE_NDISC] = "NDisc", + [NETWORK_CONFIG_SOURCE_RUNTIME] = "runtime", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(network_config_source, NetworkConfigSource); + +int network_config_state_to_string_alloc(NetworkConfigState s, char **ret) { + static const char* states[] = { + [LOG2U(NETWORK_CONFIG_STATE_REQUESTING)] = "requesting", + [LOG2U(NETWORK_CONFIG_STATE_CONFIGURING)] = "configuring", + [LOG2U(NETWORK_CONFIG_STATE_CONFIGURED)] = "configured", + [LOG2U(NETWORK_CONFIG_STATE_MARKED)] = "marked", + [LOG2U(NETWORK_CONFIG_STATE_REMOVING)] = "removing", + }; + _cleanup_free_ char *buf = NULL; + + assert(ret); + + for (size_t i = 0; i < ELEMENTSOF(states); i++) + if (FLAGS_SET(s, 1 << i)) { + assert(states[i]); + + if (!strextend_with_separator(&buf, ",", states[i])) + return -ENOMEM; + } + + *ret = TAKE_PTR(buf); + return 0; +} + +static const char * const address_family_table[_ADDRESS_FAMILY_MAX] = { + [ADDRESS_FAMILY_NO] = "no", + [ADDRESS_FAMILY_YES] = "yes", + [ADDRESS_FAMILY_IPV4] = "ipv4", + [ADDRESS_FAMILY_IPV6] = "ipv6", +}; + +static const char * const routing_policy_rule_address_family_table[_ADDRESS_FAMILY_MAX] = { + [ADDRESS_FAMILY_YES] = "both", + [ADDRESS_FAMILY_IPV4] = "ipv4", + [ADDRESS_FAMILY_IPV6] = "ipv6", +}; + +static const char * const nexthop_address_family_table[_ADDRESS_FAMILY_MAX] = { + [ADDRESS_FAMILY_IPV4] = "ipv4", + [ADDRESS_FAMILY_IPV6] = "ipv6", +}; + +static const char * const duplicate_address_detection_address_family_table[_ADDRESS_FAMILY_MAX] = { + [ADDRESS_FAMILY_NO] = "none", + [ADDRESS_FAMILY_YES] = "both", + [ADDRESS_FAMILY_IPV4] = "ipv4", + 
[ADDRESS_FAMILY_IPV6] = "ipv6", +}; + +static const char * const dhcp_deprecated_address_family_table[_ADDRESS_FAMILY_MAX] = { + [ADDRESS_FAMILY_NO] = "none", + [ADDRESS_FAMILY_YES] = "both", + [ADDRESS_FAMILY_IPV4] = "v4", + [ADDRESS_FAMILY_IPV6] = "v6", +}; + +static const char * const ip_masquerade_address_family_table[_ADDRESS_FAMILY_MAX] = { + [ADDRESS_FAMILY_NO] = "no", + [ADDRESS_FAMILY_YES] = "both", + [ADDRESS_FAMILY_IPV4] = "ipv4", + [ADDRESS_FAMILY_IPV6] = "ipv6", +}; + +static const char * const dhcp_lease_server_type_table[_SD_DHCP_LEASE_SERVER_TYPE_MAX] = { + [SD_DHCP_LEASE_DNS] = "DNS servers", + [SD_DHCP_LEASE_NTP] = "NTP servers", + [SD_DHCP_LEASE_SIP] = "SIP servers", + [SD_DHCP_LEASE_POP3] = "POP3 servers", + [SD_DHCP_LEASE_SMTP] = "SMTP servers", + [SD_DHCP_LEASE_LPR] = "LPR servers", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(address_family, AddressFamily, ADDRESS_FAMILY_YES); + +AddressFamily link_local_address_family_from_string(const char *s) { + if (streq_ptr(s, "fallback")) /* compat name */ + return ADDRESS_FAMILY_YES; + if (streq_ptr(s, "fallback-ipv4")) /* compat name */ + return ADDRESS_FAMILY_IPV4; + return address_family_from_string(s); +} + +DEFINE_STRING_TABLE_LOOKUP(routing_policy_rule_address_family, AddressFamily); +DEFINE_STRING_TABLE_LOOKUP(nexthop_address_family, AddressFamily); +DEFINE_STRING_TABLE_LOOKUP(duplicate_address_detection_address_family, AddressFamily); +DEFINE_CONFIG_PARSE_ENUM(config_parse_link_local_address_family, link_local_address_family, + AddressFamily, "Failed to parse option"); +DEFINE_STRING_TABLE_LOOKUP_FROM_STRING(dhcp_deprecated_address_family, AddressFamily); +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(ip_masquerade_address_family, AddressFamily); +DEFINE_STRING_TABLE_LOOKUP(dhcp_lease_server_type, sd_dhcp_lease_server_type_t); + +int config_parse_address_family_with_kernel( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const 
char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + AddressFamily *fwd = data, s; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + /* This function is mostly obsolete now. It simply redirects + * "kernel" to "no". In older networkd versions we used to + * distinguish IPForward=off from IPForward=kernel, where the + * former would explicitly turn off forwarding while the + * latter would simply not touch the setting. But that logic + * is gone, hence silently accept the old setting, but turn it + * to "no". */ + + s = address_family_from_string(rvalue); + if (s < 0) { + if (streq(rvalue, "kernel")) + s = ADDRESS_FAMILY_NO; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse IPForward= option, ignoring: %s", rvalue); + return 0; + } + } + + *fwd = s; + + return 0; +} + +int config_parse_ip_masquerade( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + AddressFamily a, *ret = data; + int r; + + if (isempty(rvalue)) { + *ret = ADDRESS_FAMILY_NO; + return 0; + } + + r = parse_boolean(rvalue); + if (r >= 0) { + if (r) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "IPMasquerade=%s is deprecated, and it is handled as \"ipv4\" instead of \"both\". " + "Please use \"ipv4\" or \"both\".", + rvalue); + + *ret = r ? 
ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_NO; + return 0; + } + + a = ip_masquerade_address_family_from_string(rvalue); + if (a < 0) { + log_syntax(unit, LOG_WARNING, filename, line, a, + "Failed to parse IPMasquerade= setting, ignoring assignment: %s", rvalue); + return 0; + } + + *ret = a; + return 0; +} + +int config_parse_mud_url( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *unescaped = NULL; + char **url = ASSERT_PTR(data); + ssize_t l; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *url = mfree(*url); + return 0; + } + + l = cunescape(rvalue, 0, &unescaped); + if (l < 0) { + log_syntax(unit, LOG_WARNING, filename, line, l, + "Failed to unescape MUD URL, ignoring: %s", rvalue); + return 0; + } + + if (l > UINT8_MAX || !http_url_is_valid(unescaped)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid MUD URL, ignoring: %s", rvalue); + return 0; + } + + return free_and_replace(*url, unescaped); +} + +int log_link_message_full_errno(Link *link, sd_netlink_message *m, int level, int err, const char *msg) { + const char *err_msg = NULL; + + /* link may be NULL. */ + + (void) sd_netlink_message_read_string(m, NLMSGERR_ATTR_MSG, &err_msg); + return log_link_full_errno(link, level, err, + "%s: %s%s%s%m", + msg, + strempty(err_msg), + err_msg && !endswith(err_msg, ".") ? "." : "", + err_msg ? 
" " : ""); +} diff --git a/src/network/networkd-util.h b/src/network/networkd-util.h new file mode 100644 index 0000000..9c360f5 --- /dev/null +++ b/src/network/networkd-util.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-dhcp-lease.h" +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "hashmap.h" +#include "log.h" +#include "macro.h" +#include "network-util.h" +#include "string-util.h" + +typedef struct Link Link; + +typedef enum NetworkConfigSource { + NETWORK_CONFIG_SOURCE_FOREIGN, /* configured by kernel */ + NETWORK_CONFIG_SOURCE_STATIC, + NETWORK_CONFIG_SOURCE_IPV4LL, + NETWORK_CONFIG_SOURCE_DHCP4, + NETWORK_CONFIG_SOURCE_DHCP6, + NETWORK_CONFIG_SOURCE_DHCP_PD, + NETWORK_CONFIG_SOURCE_NDISC, + NETWORK_CONFIG_SOURCE_RUNTIME, /* through D-Bus method */ + _NETWORK_CONFIG_SOURCE_MAX, + _NETWORK_CONFIG_SOURCE_INVALID = -EINVAL, +} NetworkConfigSource; + +typedef enum NetworkConfigState { + NETWORK_CONFIG_STATE_REQUESTING = 1 << 0, /* request is queued */ + NETWORK_CONFIG_STATE_CONFIGURING = 1 << 1, /* e.g. address_configure() is called, but no response is received yet */ + NETWORK_CONFIG_STATE_CONFIGURED = 1 << 2, /* e.g. address_configure() is called and received a response from kernel. + * Note that address may not be ready yet, so please use address_is_ready() + * to check whether the address can be usable or not. */ + NETWORK_CONFIG_STATE_MARKED = 1 << 3, /* used GC'ing the old config */ + NETWORK_CONFIG_STATE_REMOVING = 1 << 4, /* e.g. address_remove() is called, but no response is received yet */ +} NetworkConfigState; + +static inline usec_t sec_to_usec(uint32_t sec, usec_t timestamp_usec) { + return + sec == 0 ? 0 : + sec == UINT32_MAX ? USEC_INFINITY : + usec_add(timestamp_usec, sec * USEC_PER_SEC); +} + +static inline usec_t sec16_to_usec(uint16_t sec, usec_t timestamp_usec) { + return sec_to_usec(sec == UINT16_MAX ? 
UINT32_MAX : (uint32_t) sec, timestamp_usec); +} + +static inline uint32_t usec_to_sec(usec_t usec, usec_t now_usec) { + return MIN(DIV_ROUND_UP(usec_sub_unsigned(usec, now_usec), USEC_PER_SEC), UINT32_MAX); +} + +CONFIG_PARSER_PROTOTYPE(config_parse_link_local_address_family); +CONFIG_PARSER_PROTOTYPE(config_parse_address_family_with_kernel); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_masquerade); +CONFIG_PARSER_PROTOTYPE(config_parse_mud_url); + +const char *network_config_source_to_string(NetworkConfigSource s) _const_; + +int network_config_state_to_string_alloc(NetworkConfigState s, char **ret); + +#define DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(type, name) \ + static inline void name##_update_state( \ + type *t, \ + NetworkConfigState mask, \ + NetworkConfigState value) { \ + \ + assert(t); \ + \ + t->state = (t->state & ~mask) | (value & mask); \ + } \ + static inline bool name##_exists(const type *t) { \ + assert(t); \ + \ + if ((t->state & (NETWORK_CONFIG_STATE_CONFIGURING | \ + NETWORK_CONFIG_STATE_CONFIGURED)) == 0) \ + return false; /* Not assigned yet. */ \ + if (FLAGS_SET(t->state, NETWORK_CONFIG_STATE_REMOVING)) \ + return false; /* Already removing. 
*/ \ + return true; \ + } \ + static inline void name##_enter_requesting(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_REQUESTING, \ + NETWORK_CONFIG_STATE_REQUESTING); \ + } \ + static inline void name##_cancel_requesting(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_REQUESTING, \ + 0); \ + } \ + static inline bool name##_is_requesting(const type *t) { \ + assert(t); \ + return FLAGS_SET(t->state, NETWORK_CONFIG_STATE_REQUESTING); \ + } \ + static inline void name##_enter_configuring(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_REQUESTING | \ + NETWORK_CONFIG_STATE_CONFIGURING | \ + NETWORK_CONFIG_STATE_REMOVING, \ + NETWORK_CONFIG_STATE_CONFIGURING); \ + } \ + static inline void name##_enter_configured(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_CONFIGURING | \ + NETWORK_CONFIG_STATE_CONFIGURED, \ + NETWORK_CONFIG_STATE_CONFIGURED); \ + } \ + static inline void name##_mark(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_MARKED, \ + NETWORK_CONFIG_STATE_MARKED); \ + } \ + static inline void name##_unmark(type *t) { \ + name##_update_state(t, NETWORK_CONFIG_STATE_MARKED, 0); \ + } \ + static inline bool name##_is_marked(const type *t) { \ + assert(t); \ + return FLAGS_SET(t->state, NETWORK_CONFIG_STATE_MARKED); \ + } \ + static inline void name##_enter_removing(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_MARKED | \ + NETWORK_CONFIG_STATE_REMOVING, \ + NETWORK_CONFIG_STATE_REMOVING); \ + } \ + static inline void name##_enter_removed(type *t) { \ + name##_update_state(t, \ + NETWORK_CONFIG_STATE_CONFIGURED | \ + NETWORK_CONFIG_STATE_REMOVING, \ + 0); \ + } + +const char *address_family_to_string(AddressFamily b) _const_; +AddressFamily address_family_from_string(const char *s) _pure_; + +AddressFamily link_local_address_family_from_string(const char *s) _pure_; + +const char *routing_policy_rule_address_family_to_string(AddressFamily b) _const_; +AddressFamily 
routing_policy_rule_address_family_from_string(const char *s) _pure_; + +const char *nexthop_address_family_to_string(AddressFamily b) _const_; +AddressFamily nexthop_address_family_from_string(const char *s) _pure_; + +const char *duplicate_address_detection_address_family_to_string(AddressFamily b) _const_; +AddressFamily duplicate_address_detection_address_family_from_string(const char *s) _pure_; + +AddressFamily dhcp_deprecated_address_family_from_string(const char *s) _pure_; + +const char *dhcp_lease_server_type_to_string(sd_dhcp_lease_server_type_t t) _const_; +sd_dhcp_lease_server_type_t dhcp_lease_server_type_from_string(const char *s) _pure_; + +int log_link_message_full_errno(Link *link, sd_netlink_message *m, int level, int err, const char *msg); +#define log_link_message_error_errno(link, m, err, msg) log_link_message_full_errno(link, m, LOG_ERR, err, msg) +#define log_link_message_warning_errno(link, m, err, msg) log_link_message_full_errno(link, m, LOG_WARNING, err, msg) +#define log_link_message_notice_errno(link, m, err, msg) log_link_message_full_errno(link, m, LOG_NOTICE, err, msg) +#define log_link_message_info_errno(link, m, err, msg) log_link_message_full_errno(link, m, LOG_INFO, err, msg) +#define log_link_message_debug_errno(link, m, err, msg) log_link_message_full_errno(link, m, LOG_DEBUG, err, msg) +#define log_message_full_errno(m, level, err, msg) log_link_message_full_errno(NULL, m, level, err, msg) +#define log_message_error_errno(m, err, msg) log_message_full_errno(m, LOG_ERR, err, msg) +#define log_message_warning_errno(m, err, msg) log_message_full_errno(m, LOG_WARNING, err, msg) +#define log_message_notice_errno(m, err, msg) log_message_full_errno(m, LOG_NOTICE, err, msg) +#define log_message_info_errno(m, err, msg) log_message_full_errno(m, LOG_INFO, err, msg) +#define log_message_debug_errno(m, err, msg) log_message_full_errno(m, LOG_DEBUG, err, msg) diff --git a/src/network/networkd-wifi.c b/src/network/networkd-wifi.c new file 
mode 100644 index 0000000..98e7a72 --- /dev/null +++ b/src/network/networkd-wifi.c @@ -0,0 +1,345 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "ether-addr-util.h" +#include "netlink-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-wifi.h" +#include "networkd-wiphy.h" +#include "string-util.h" +#include "wifi-util.h" + +int link_get_wlan_interface(Link *link) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + int r; + + assert(link); + + r = sd_genl_message_new(link->manager->genl, NL80211_GENL_NAME, NL80211_CMD_GET_INTERFACE, &req); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to create generic netlink message: %m"); + + r = sd_netlink_message_append_u32(req, NL80211_ATTR_IFINDEX, link->ifindex); + if (r < 0) + return log_link_debug_errno(link, r, "Could not append NL80211_ATTR_IFINDEX attribute: %m"); + + r = sd_netlink_call(link->manager->genl, req, 0, &reply); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to request information about wlan interface: %m"); + if (!reply) { + log_link_debug(link, "No reply received to request for information about wifi interface, ignoring."); + return 0; + } + + return manager_genl_process_nl80211_config(link->manager->genl, reply, link->manager); +} + +int manager_genl_process_nl80211_config(sd_netlink *genl, sd_netlink_message *message, Manager *manager) { + _cleanup_free_ char *ssid = NULL; + uint32_t ifindex, wlan_iftype; + const char *family, *ifname; + uint8_t cmd; + size_t len; + Link *link; + int r; + + assert(genl); + assert(message); + assert(manager); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "nl80211: received error message, ignoring"); + + return 0; + } + + r = sd_genl_message_get_family_name(genl, message, &family); + if (r < 0) { + log_debug_errno(r, "nl80211: failed to 
determine genl family, ignoring: %m"); + return 0; + } + if (!streq(family, NL80211_GENL_NAME)) { + log_debug("nl80211: received message of unexpected genl family '%s', ignoring.", family); + return 0; + } + + r = sd_genl_message_get_command(genl, message, &cmd); + if (r < 0) { + log_debug_errno(r, "nl80211: failed to determine genl message command, ignoring: %m"); + return 0; + } + if (IN_SET(cmd, NL80211_CMD_NEW_WIPHY, NL80211_CMD_DEL_WIPHY)) + return manager_genl_process_nl80211_wiphy(genl, message, manager); + if (!IN_SET(cmd, NL80211_CMD_SET_INTERFACE, NL80211_CMD_NEW_INTERFACE, NL80211_CMD_DEL_INTERFACE)) { + log_debug("nl80211: ignoring nl80211 %s(%u) message.", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + r = sd_netlink_message_read_u32(message, NL80211_ATTR_IFINDEX, &ifindex); + if (r < 0) { + log_debug_errno(r, "nl80211: received %s(%u) message without valid ifindex, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + r = link_get_by_index(manager, ifindex, &link); + if (r < 0) { + log_debug_errno(r, "nl80211: received %s(%u) message for link '%"PRIu32"' we don't know about, ignoring.", + strna(nl80211_cmd_to_string(cmd)), cmd, ifindex); + + /* The NL80211_CMD_NEW_INTERFACE message might arrive before RTM_NEWLINK, in which case a + * link will not have been created yet. Store the interface index such that the wireless + * properties of the link (such as wireless interface type) are queried again after the link + * is created. 
+ */ + if (cmd == NL80211_CMD_NEW_INTERFACE) { + r = set_ensure_put(&manager->new_wlan_ifindices, NULL, INT_TO_PTR(ifindex)); + if (r < 0) + log_warning_errno(r, "Failed to add new wireless interface index to set, ignoring: %m"); + } else if (cmd == NL80211_CMD_DEL_INTERFACE) + set_remove(manager->new_wlan_ifindices, INT_TO_PTR(ifindex)); + + return 0; + } + + r = sd_netlink_message_read_string(message, NL80211_ATTR_IFNAME, &ifname); + if (r < 0) { + log_link_debug_errno(link, r, "nl80211: received %s(%u) message without valid interface name, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + if (!streq(ifname, link->ifname)) { + log_link_debug(link, "nl80211: received %s(%u) message with invalid interface name '%s', ignoring.", /* no errno to report here, hence no %m */ + strna(nl80211_cmd_to_string(cmd)), cmd, ifname); + return 0; + } + + r = sd_netlink_message_read_u32(message, NL80211_ATTR_IFTYPE, &wlan_iftype); + if (r < 0) { + log_link_debug_errno(link, r, "nl80211: received %s(%u) message without valid wlan interface type, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + r = sd_netlink_message_read_data_suffix0(message, NL80211_ATTR_SSID, &len, (void**) &ssid); + if (r < 0 && r != -ENODATA) { + log_link_debug_errno(link, r, "nl80211: received %s(%u) message without valid SSID, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + if (r >= 0) { + if (len == 0) { + log_link_debug(link, "nl80211: received SSID has zero length, ignoring it."); /* not an errno failure, hence no %m */ + ssid = mfree(ssid); + } else if (strlen_ptr(ssid) != len) { + log_link_debug(link, "nl80211: received SSID contains NUL characters, ignoring it."); + ssid = mfree(ssid); + } + } + + log_link_debug(link, "nl80211: received %s(%u) message: iftype=%s, ssid=%s", + strna(nl80211_cmd_to_string(cmd)), cmd, + strna(nl80211_iftype_to_string(wlan_iftype)), strna(ssid)); + + switch (cmd) { + case NL80211_CMD_SET_INTERFACE: + case NL80211_CMD_NEW_INTERFACE: + link->wlan_iftype = wlan_iftype; 
+ free_and_replace(link->ssid, ssid); + break; + + case NL80211_CMD_DEL_INTERFACE: + link->wlan_iftype = NL80211_IFTYPE_UNSPECIFIED; + link->ssid = mfree(link->ssid); + break; + + default: + assert_not_reached(); + } + + return 0; +} + +int manager_genl_process_nl80211_mlme(sd_netlink *genl, sd_netlink_message *message, Manager *manager) { + const char *family; + uint32_t ifindex; + uint8_t cmd; + Link *link; + int r; + + assert(genl); + assert(message); + assert(manager); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "nl80211: received error message, ignoring"); + + return 0; + } + + r = sd_genl_message_get_family_name(genl, message, &family); + if (r < 0) { + log_debug_errno(r, "nl80211: failed to determine genl family, ignoring: %m"); + return 0; + } + if (!streq(family, NL80211_GENL_NAME)) { + log_debug("nl80211: Received message of unexpected genl family '%s', ignoring.", family); + return 0; + } + + r = sd_genl_message_get_command(genl, message, &cmd); + if (r < 0) { + log_debug_errno(r, "nl80211: failed to determine genl message command, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, NL80211_ATTR_IFINDEX, &ifindex); + if (r < 0) { + log_debug_errno(r, "nl80211: received %s(%u) message without valid ifindex, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + r = link_get_by_index(manager, ifindex, &link); + if (r < 0) { + log_debug_errno(r, "nl80211: received %s(%u) message for link '%"PRIu32"' we don't know about, ignoring.", + strna(nl80211_cmd_to_string(cmd)), cmd, ifindex); + return 0; + } + + switch (cmd) { + case NL80211_CMD_NEW_STATION: + case NL80211_CMD_DEL_STATION: { + struct ether_addr bssid; + + r = sd_netlink_message_read_ether_addr(message, NL80211_ATTR_MAC, &bssid); + if (r < 0) { + log_link_debug_errno(link, r, "nl80211: received %s(%u) message without valid BSSID, ignoring: %m", + 
strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + log_link_debug(link, "nl80211: received %s(%u) message: bssid=%s", + strna(nl80211_cmd_to_string(cmd)), cmd, ETHER_ADDR_TO_STR(&bssid)); + + if (cmd == NL80211_CMD_DEL_STATION) { + link->bssid = ETHER_ADDR_NULL; + return 0; + } + + link->bssid = bssid; + + if (manager->enumerating && + link->wlan_iftype == NL80211_IFTYPE_STATION && link->ssid) + log_link_info(link, "Connected WiFi access point: %s (%s)", + link->ssid, ETHER_ADDR_TO_STR(&link->bssid)); + break; + } + case NL80211_CMD_CONNECT: { + struct ether_addr bssid = ETHER_ADDR_NULL; /* NL80211_ATTR_MAC may be absent: -ENODATA is tolerated below, so start from a defined value */ + uint16_t status_code; + + r = sd_netlink_message_read_ether_addr(message, NL80211_ATTR_MAC, &bssid); + if (r < 0 && r != -ENODATA) { + log_link_debug_errno(link, r, "nl80211: received %s(%u) message without valid BSSID, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + r = sd_netlink_message_read_u16(message, NL80211_ATTR_STATUS_CODE, &status_code); + if (r < 0) { + log_link_debug_errno(link, r, "nl80211: received %s(%u) message without valid status code, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + log_link_debug(link, "nl80211: received %s(%u) message: status=%u, bssid=%s", + strna(nl80211_cmd_to_string(cmd)), cmd, status_code, ETHER_ADDR_TO_STR(&bssid)); + + if (status_code != 0) + return 0; + + link->bssid = bssid; + + if (!manager->enumerating) { + r = link_get_wlan_interface(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to update wireless LAN interface: %m"); + link_enter_failed(link); + return 0; + } + } + + if (link->wlan_iftype == NL80211_IFTYPE_STATION && link->ssid) + log_link_info(link, "Connected WiFi access point: %s (%s)", + link->ssid, ETHER_ADDR_TO_STR(&link->bssid)); + + /* Sometimes, RTM_NEWLINK message with carrier is received earlier than NL80211_CMD_CONNECT. + * To make SSID= or other WiFi related settings in [Match] section work, let's try to + * reconfigure the interface. 
*/ + if (link->ssid && link_has_carrier(link)) { + r = link_reconfigure_impl(link, /* force = */ false); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to reconfigure interface: %m"); + link_enter_failed(link); + return 0; + } + } + break; + } + case NL80211_CMD_DISCONNECT: + log_link_debug(link, "nl80211: received %s(%u) message.", + strna(nl80211_cmd_to_string(cmd)), cmd); + + link->bssid = ETHER_ADDR_NULL; + free_and_replace(link->previous_ssid, link->ssid); + break; + + case NL80211_CMD_START_AP: { + log_link_debug(link, "nl80211: received %s(%u) message.", + strna(nl80211_cmd_to_string(cmd)), cmd); + + /* No need to reconfigure during enumeration */ + if (manager->enumerating) + break; + + /* If there is no carrier, let the link get configured on + * carrier gain instead */ + if (!link_has_carrier(link)) + break; + + /* AP start event may indicate different properties (e.g. SSID) */ + r = link_get_wlan_interface(link); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to update wireless LAN interface: %m"); + link_enter_failed(link); + return 0; + } + + /* If necessary, reconfigure based on those new properties */ + r = link_reconfigure_impl(link, /* force = */ false); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to reconfigure interface: %m"); + link_enter_failed(link); + return 0; + } + + break; + } + + default: + log_link_debug(link, "nl80211: received %s(%u) message.", + strna(nl80211_cmd_to_string(cmd)), cmd); + } + + return 0; +} diff --git a/src/network/networkd-wifi.h b/src/network/networkd-wifi.h new file mode 100644 index 0000000..2ef0d30 --- /dev/null +++ b/src/network/networkd-wifi.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-netlink.h" + +typedef struct Link Link; +typedef struct Manager Manager; + +int manager_genl_process_nl80211_config(sd_netlink *genl, sd_netlink_message *message, Manager *manager); +int manager_genl_process_nl80211_mlme(sd_netlink *genl, 
sd_netlink_message *message, Manager *manager); +int link_get_wlan_interface(Link *link); diff --git a/src/network/networkd-wiphy.c b/src/network/networkd-wiphy.c new file mode 100644 index 0000000..13f2d72 --- /dev/null +++ b/src/network/networkd-wiphy.c @@ -0,0 +1,495 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "device-private.h" +#include "device-util.h" +#include "networkd-manager.h" +#include "networkd-wiphy.h" +#include "parse-util.h" +#include "path-util.h" +#include "udev-util.h" +#include "wifi-util.h" + +Wiphy *wiphy_free(Wiphy *w) { + if (!w) + return NULL; + + if (w->manager) { + hashmap_remove_value(w->manager->wiphy_by_index, UINT32_TO_PTR(w->index), w); + if (w->name) + hashmap_remove_value(w->manager->wiphy_by_name, w->name, w); + } + + sd_device_unref(w->dev); + sd_device_unref(w->rfkill); + + free(w->name); + return mfree(w); +} + +static int wiphy_new(Manager *manager, sd_netlink_message *message, Wiphy **ret) { + _cleanup_(wiphy_freep) Wiphy *w = NULL; + _cleanup_free_ char *name = NULL; + uint32_t index; + int r; + + assert(manager); + assert(message); + + r = sd_netlink_message_read_u32(message, NL80211_ATTR_WIPHY, &index); + if (r < 0) + return r; + + r = sd_netlink_message_read_string_strdup(message, NL80211_ATTR_WIPHY_NAME, &name); + if (r < 0) + return r; + + w = new(Wiphy, 1); + if (!w) + return -ENOMEM; + + *w = (Wiphy) { + .manager = manager, + .index = index, + .name = TAKE_PTR(name), + }; + + r = hashmap_ensure_put(&manager->wiphy_by_index, NULL, UINT32_TO_PTR(w->index), w); + if (r < 0) + return r; + + r = hashmap_ensure_put(&w->manager->wiphy_by_name, &string_hash_ops, w->name, w); + if (r < 0) + return r; + + log_wiphy_debug(w, "Saved new wiphy: index=%"PRIu32, w->index); + + if (ret) + *ret = w; + + TAKE_PTR(w); + return 0; +} + +int wiphy_get_by_index(Manager *manager, uint32_t index, Wiphy **ret) { + Wiphy *w; + + assert(manager); + + w = hashmap_get(manager->wiphy_by_index, 
UINT32_TO_PTR(index)); + if (!w) + return -ENODEV; + + if (ret) + *ret = w; + + return 0; +} + +int wiphy_get_by_name(Manager *manager, const char *name, Wiphy **ret) { + Wiphy *w; + + assert(manager); + assert(name); + + w = hashmap_get(manager->wiphy_by_name, name); + if (!w) + return -ENODEV; + + if (ret) + *ret = w; + + return 0; +} + +static int link_get_wiphy(Link *link, Wiphy **ret) { + _cleanup_(sd_device_unrefp) sd_device *phy = NULL; + const char *s; + int r; + + assert(link); + assert(link->manager); + + if (link->iftype != ARPHRD_ETHER) + return -EOPNOTSUPP; + + if (!link->dev) + return -ENODEV; + + r = sd_device_get_devtype(link->dev, &s); + if (r < 0) + return r; + + if (!streq_ptr(s, "wlan")) + return -EOPNOTSUPP; + + r = sd_device_new_child(&phy, link->dev, "phy80211"); + if (r < 0) + return r; + + r = sd_device_get_sysname(phy, &s); + if (r < 0) + return r; + + /* TODO: + * Maybe, it is better to cache the found Wiphy object in the Link object. + * To support that, we need to investigate what happens when the _phy_ is renamed. */ + + return wiphy_get_by_name(link->manager, s, ret); +} + +static int rfkill_get_state(sd_device *dev) { + int r; + + assert(dev); + + /* The previous values may be outdated. Let's clear cache and re-read the values. */ + device_clear_sysattr_cache(dev); + + r = device_get_sysattr_bool(dev, "soft"); + if (r < 0 && r != -ENOENT) + return r; + if (r > 0) + return RFKILL_SOFT; + + r = device_get_sysattr_bool(dev, "hard"); + if (r < 0 && r != -ENOENT) + return r; + if (r > 0) + return RFKILL_HARD; + + return RFKILL_UNBLOCKED; +} + +static int wiphy_rfkilled(Wiphy *w) { + int r; + + assert(w); + + if (!udev_available()) { + if (w->rfkill_state != RFKILL_UNBLOCKED) { + log_wiphy_debug(w, "Running in container, assuming the radio transmitter is unblocked."); + w->rfkill_state = RFKILL_UNBLOCKED; /* To suppress the above log message, cache the state. 
*/ + } + return false; + } + + if (!w->rfkill) { + if (w->rfkill_state != RFKILL_UNBLOCKED) { + log_wiphy_debug(w, "No rfkill device found, assuming the radio transmitter is unblocked."); + w->rfkill_state = RFKILL_UNBLOCKED; /* To suppress the above log message, cache the state. */ + } + return false; + } + + r = rfkill_get_state(w->rfkill); + if (r < 0) + return log_wiphy_debug_errno(w, r, "Could not get rfkill state: %m"); + + if (w->rfkill_state != r) + switch (r) { + case RFKILL_UNBLOCKED: + log_wiphy_debug(w, "The radio transmitter is unblocked."); + break; + case RFKILL_SOFT: + log_wiphy_debug(w, "The radio transmitter is turned off by software."); + break; + case RFKILL_HARD: + log_wiphy_debug(w, "The radio transmitter is forced off by something outside of the driver's control."); + break; + default: + assert_not_reached(); + } + + w->rfkill_state = r; /* Cache the state to suppress the above log messages. */ + return r != RFKILL_UNBLOCKED; +} + +int link_rfkilled(Link *link) { + Wiphy *w; + int r; + + assert(link); + + r = link_get_wiphy(link, &w); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || ERRNO_IS_NEG_DEVICE_ABSENT(r)) + return false; /* Typically, non-wifi interface or running in container */ + if (r < 0) + return log_link_debug_errno(link, r, "Could not get phy: %m"); + + return wiphy_rfkilled(w); +} + +static int wiphy_update_name(Wiphy *w, sd_netlink_message *message) { + const char *name; + int r; + + assert(w); + assert(w->manager); + assert(message); + + r = sd_netlink_message_read_string(message, NL80211_ATTR_WIPHY_NAME, &name); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + if (streq(w->name, name)) + return 0; + + log_wiphy_debug(w, "Wiphy name change detected, renamed to %s.", name); + + hashmap_remove_value(w->manager->wiphy_by_name, w->name, w); + + r = free_and_strdup(&w->name, name); + if (r < 0) + return r; + + r = hashmap_ensure_put(&w->manager->wiphy_by_name, &string_hash_ops, w->name, w); + if (r < 0) + return r; + + return 
1; /* updated */ +} + +static int wiphy_update_device(Wiphy *w) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + int r; + + assert(w); + assert(w->name); + + if (!udev_available()) + return 0; + + w->dev = sd_device_unref(w->dev); + + r = sd_device_new_from_subsystem_sysname(&dev, "ieee80211", w->name); + if (r < 0) + return r; + + if (DEBUG_LOGGING) { + const char *s = NULL; + + (void) sd_device_get_syspath(dev, &s); + log_wiphy_debug(w, "Found device: %s", strna(s)); + } + + w->dev = TAKE_PTR(dev); + return 0; +} + +static int wiphy_update_rfkill(Wiphy *w) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *rfkill; + int r; + + assert(w); + + if (!udev_available()) + return 0; + + w->rfkill = sd_device_unref(w->rfkill); + + if (!w->dev) + return 0; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "rfkill", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_parent(e, w->dev); + if (r < 0) + return r; + + rfkill = sd_device_enumerator_get_device_first(e); + if (!rfkill) + /* rfkill device may not be detected by the kernel yet, and may appear later. 
*/ + return -ENODEV; + + if (sd_device_enumerator_get_device_next(e)) + return -ENXIO; /* multiple devices found */ + + w->rfkill = sd_device_ref(rfkill); + + if (DEBUG_LOGGING) { + const char *s = NULL; + + (void) sd_device_get_syspath(rfkill, &s); + log_wiphy_debug(w, "Found rfkill device: %s", strna(s)); + } + + return 0; +} + +static int wiphy_update(Wiphy *w) { + int r; + + assert(w); + + r = wiphy_update_device(w); + if (ERRNO_IS_NEG_DEVICE_ABSENT(r)) + log_wiphy_debug_errno(w, r, "Failed to update wiphy device, ignoring: %m"); + else if (r < 0) + return log_wiphy_warning_errno(w, r, "Failed to update wiphy device: %m"); + + r = wiphy_update_rfkill(w); + if (ERRNO_IS_NEG_DEVICE_ABSENT(r)) + log_wiphy_debug_errno(w, r, "Failed to update rfkill device, ignoring: %m"); + else if (r < 0) + return log_wiphy_warning_errno(w, r, "Failed to update rfkill device: %m"); + + return 0; +} + +int manager_genl_process_nl80211_wiphy(sd_netlink *genl, sd_netlink_message *message, Manager *manager) { + const char *family; + uint32_t index; + uint8_t cmd; + Wiphy *w = NULL; + int r; + + assert(genl); + assert(message); + assert(manager); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "nl80211: received error message, ignoring"); + + return 0; + } + + r = sd_genl_message_get_family_name(genl, message, &family); + if (r < 0) { + log_debug_errno(r, "nl80211: failed to determine genl family, ignoring: %m"); + return 0; + } + if (!streq(family, NL80211_GENL_NAME)) { + log_debug("nl80211: Received message of unexpected genl family '%s', ignoring.", family); + return 0; + } + + r = sd_genl_message_get_command(genl, message, &cmd); + if (r < 0) { + log_debug_errno(r, "nl80211: failed to determine genl message command, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_u32(message, NL80211_ATTR_WIPHY, &index); + if (r < 0) { + log_debug_errno(r, "nl80211: received %s(%u) 
message without valid index, ignoring: %m", + strna(nl80211_cmd_to_string(cmd)), cmd); + return 0; + } + + (void) wiphy_get_by_index(manager, index, &w); + + switch (cmd) { + case NL80211_CMD_NEW_WIPHY: { + + if (!w) { + r = wiphy_new(manager, message, &w); + if (r < 0) { + log_warning_errno(r, "Failed to save new wiphy, ignoring: %m"); + return 0; + } + } else { + r = wiphy_update_name(w, message); + if (r < 0) { + log_wiphy_warning_errno(w, r, "Failed to update wiphy name, ignoring: %m"); + return 0; + } + if (r == 0) + return 0; + } + + r = wiphy_update(w); + if (r < 0) + log_wiphy_warning_errno(w, r, "Failed to update wiphy, ignoring: %m"); + + break; + } + case NL80211_CMD_DEL_WIPHY: + + if (!w) { + log_debug("The kernel removes wiphy we do not know, ignoring: %m"); + return 0; + } + + log_wiphy_debug(w, "Removed."); + wiphy_free(w); + break; + + default: + log_wiphy_debug(w, "nl80211: received %s(%u) message.", + strna(nl80211_cmd_to_string(cmd)), cmd); + } + + return 0; +} + +int manager_udev_process_wiphy(Manager *m, sd_device *device, sd_device_action_t action) { + const char *name; + Wiphy *w; + int r; + + assert(m); + assert(device); + + r = sd_device_get_sysname(device, &name); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get sysname: %m"); + + r = wiphy_get_by_name(m, name, &w); + if (r < 0) { + /* This error is not critical, as the corresponding genl message may be received later. */ + log_device_debug_errno(device, r, "Failed to get Wiphy object, ignoring: %m"); + return 0; + } + + return device_unref_and_replace(w->dev, action == SD_DEVICE_REMOVE ? 
NULL : device); +} + +int manager_udev_process_rfkill(Manager *m, sd_device *device, sd_device_action_t action) { + _cleanup_free_ char *parent_path = NULL, *parent_name = NULL; + const char *s; + Wiphy *w; + int r; + + assert(m); + assert(device); + + r = sd_device_get_syspath(device, &s); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get syspath: %m"); + + /* Do not use sd_device_get_parent() here, as this might be a 'remove' uevent. */ + r = path_extract_directory(s, &parent_path); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get parent syspath: %m"); + + r = path_extract_filename(parent_path, &parent_name); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get parent name: %m"); + + r = wiphy_get_by_name(m, parent_name, &w); + if (r < 0) { + /* This error is not critical, as the corresponding genl message may be received later. */ + log_device_debug_errno(device, r, "Failed to get Wiphy object: %m"); + return 0; + } + + return device_unref_and_replace(w->rfkill, action == SD_DEVICE_REMOVE ? NULL : device); +} diff --git a/src/network/networkd-wiphy.h b/src/network/networkd-wiphy.h new file mode 100644 index 0000000..b9056e8 --- /dev/null +++ b/src/network/networkd-wiphy.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-device.h" + +#include "macro.h" + +typedef struct Link Link; +typedef struct Manager Manager; + +/* The following values are different from the ones defined in linux/rfkill.h. 
*/
typedef enum RFKillState {
        RFKILL_UNKNOWN,
        RFKILL_UNBLOCKED,
        RFKILL_SOFT,
        RFKILL_HARD,
        _RFKILL_STATE_MAX,
        _RFKILL_STATE_INVALID = -EINVAL,
} RFKillState;

/* Per-wiphy state tracked by the manager. */
typedef struct Wiphy {
        Manager *manager;

        /* Index and name identifying this wiphy; lookups exist for both (see below). */
        uint32_t index;
        char *name;

        /* Device objects associated with this wiphy: the wiphy device and its rfkill device. */
        sd_device *dev;
        sd_device *rfkill;
        RFKillState rfkill_state;
} Wiphy;

Wiphy *wiphy_free(Wiphy *w);
DEFINE_TRIVIAL_CLEANUP_FUNC(Wiphy*, wiphy_free);

/* Lookup helpers: find a Wiphy object by index or by name. */
int wiphy_get_by_index(Manager *manager, uint32_t index, Wiphy **ret);
int wiphy_get_by_name(Manager *manager, const char *name, Wiphy **ret);

int link_rfkilled(Link *link);

/* Event entry points for nl80211 genl messages and for udev events of wiphy/rfkill devices. */
int manager_genl_process_nl80211_wiphy(sd_netlink *genl, sd_netlink_message *message, Manager *manager);
int manager_udev_process_wiphy(Manager *m, sd_device *device, sd_device_action_t action);
int manager_udev_process_rfkill(Manager *m, sd_device *device, sd_device_action_t action);

/* Logging helpers. All of them forward to log_interface_full_errno_zerook(), passing the wiphy
 * name, or NULL when the Wiphy pointer itself is NULL, so a NULL wiphy is a valid argument. */
#define log_wiphy_full_errno_zerook(w, level, error, ...)                                       \
        ({                                                                                      \
                const Wiphy *_w = (w);                                                          \
                log_interface_full_errno_zerook(_w ? _w->name : NULL, level, error, __VA_ARGS__); \
        })

/* Same as above, but checks via ASSERT_NON_ZERO() that an actual (non-zero) error is passed. */
#define log_wiphy_full_errno(w, level, error, ...)                              \
        ({                                                                      \
                int _error = (error);                                           \
                ASSERT_NON_ZERO(_error);                                        \
                log_wiphy_full_errno_zerook(w, level, _error, __VA_ARGS__);     \
        })

/* Variant without an error code: passes 0 as error and discards the result. */
#define log_wiphy_full(w, level, ...) (void) log_wiphy_full_errno_zerook(w, level, 0, __VA_ARGS__)

/* Per-level shortcuts without an error code. */
#define log_wiphy_debug(w, ...)   log_wiphy_full(w, LOG_DEBUG, __VA_ARGS__)
#define log_wiphy_info(w, ...)    log_wiphy_full(w, LOG_INFO, __VA_ARGS__)
#define log_wiphy_notice(w, ...)  log_wiphy_full(w, LOG_NOTICE, __VA_ARGS__)
#define log_wiphy_warning(w, ...) log_wiphy_full(w, LOG_WARNING, __VA_ARGS__)
#define log_wiphy_error(w, ...)   log_wiphy_full(w, LOG_ERR, __VA_ARGS__)

/* Per-level shortcuts with an error code. */
#define log_wiphy_debug_errno(w, error, ...)   log_wiphy_full_errno(w, LOG_DEBUG, error, __VA_ARGS__)
#define log_wiphy_info_errno(w, error, ...)    log_wiphy_full_errno(w, LOG_INFO, error, __VA_ARGS__)
#define log_wiphy_notice_errno(w, error, ...)  log_wiphy_full_errno(w, LOG_NOTICE, error, __VA_ARGS__)
#define log_wiphy_warning_errno(w, error, ...) log_wiphy_full_errno(w, LOG_WARNING, error, __VA_ARGS__)
#define log_wiphy_error_errno(w, error, ...)   log_wiphy_full_errno(w, LOG_ERR, error, __VA_ARGS__)
diff --git a/src/network/networkd.c b/src/network/networkd.c
new file mode 100644
index 0000000..46c2c74
--- /dev/null
+++ b/src/network/networkd.c
@@ -0,0 +1,119 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */

/* NOTE(review): the header names of the following three #include lines were lost when this text
 * was extracted; restore them from the upstream file. */
#include
#include
#include

#include "sd-daemon.h"
#include "sd-event.h"

#include "bus-log-control-api.h"
#include "capability-util.h"
#include "daemon-util.h"
#include "firewall-util.h"
#include "main-func.h"
#include "mkdir-label.h"
#include "networkd-conf.h"
#include "networkd-manager-bus.h"
#include "networkd-manager.h"
#include "service-util.h"
#include "signal-util.h"
#include "user-util.h"

/* Entry point of systemd-networkd (wrapped by DEFINE_MAIN_FUNCTION() below).
 *
 * Sequence: set up logging, parse the command line, optionally drop privileges to the
 * "systemd-network" user, create the runtime directories, create/set up the manager, load the
 * configuration, enumerate, start the manager, and finally run the event loop.
 *
 * Returns the result of service_parse_argv() if that is <= 0, a negative errno on any fatal
 * failure, and 0 once the event loop has finished without error. Failures to create runtime
 * directories or to parse the main configuration file are only logged as warnings. */
static int run(int argc, char *argv[]) {
        _cleanup_(manager_freep) Manager *m = NULL;
        _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = NULL;
        int r;

        log_setup();

        r = service_parse_argv("systemd-networkd.service",
                               "Manage and configure network devices, create virtual network devices",
                               BUS_IMPLEMENTATIONS(&manager_object, &log_control_object),
                               argc, argv);
        if (r <= 0)
                return r;

        umask(0022);

        if (argc != 1)
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");

        /* Drop privileges, but only if we have been started as root. If we are not running as root we assume all
         * privileges are already dropped and we can't create our runtime directory. */
        if (geteuid() == 0) {
                const char *user = "systemd-network";
                uid_t uid;
                gid_t gid;

                r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Cannot resolve user name %s: %m", user);

                /* Create runtime directory. This is not necessary when networkd is
                 * started with "RuntimeDirectory=systemd/netif", or after
                 * systemd-tmpfiles-setup.service. */
                r = mkdir_safe_label("/run/systemd/netif", 0755, uid, gid, MKDIR_WARN_MODE);
                if (r < 0)
                        log_warning_errno(r, "Could not create runtime directory: %m");

                /* Only the four network-related capabilities listed here are passed on. */
                r = drop_privileges(uid, gid,
                                    (1ULL << CAP_NET_ADMIN) |
                                    (1ULL << CAP_NET_BIND_SERVICE) |
                                    (1ULL << CAP_NET_BROADCAST) |
                                    (1ULL << CAP_NET_RAW));
                if (r < 0)
                        return log_error_errno(r, "Failed to drop privileges: %m");
        }

        /* Always create the directories people can create inotify watches in.
         * It is necessary to create the following subdirectories after drop_privileges()
         * to support old kernels not supporting AmbientCapabilities=. */
        r = mkdir_safe_label("/run/systemd/netif/links", 0755, UID_INVALID, GID_INVALID, MKDIR_WARN_MODE);
        if (r < 0)
                log_warning_errno(r, "Could not create runtime directory 'links': %m");

        r = mkdir_safe_label("/run/systemd/netif/leases", 0755, UID_INVALID, GID_INVALID, MKDIR_WARN_MODE);
        if (r < 0)
                log_warning_errno(r, "Could not create runtime directory 'leases': %m");

        r = mkdir_safe_label("/run/systemd/netif/lldp", 0755, UID_INVALID, GID_INVALID, MKDIR_WARN_MODE);
        if (r < 0)
                log_warning_errno(r, "Could not create runtime directory 'lldp': %m");

        r = manager_new(&m, /* test_mode = */ false);
        if (r < 0)
                return log_error_errno(r, "Could not create manager: %m");

        r = manager_setup(m);
        if (r < 0)
                return log_error_errno(r, "Could not set up manager: %m");

        /* A broken main configuration file is not fatal. */
        r = manager_parse_config_file(m);
        if (r < 0)
                log_warning_errno(r, "Failed to parse configuration file: %m");

        r = manager_load_config(m);
        if (r < 0)
                return log_error_errno(r, "Could not load configuration files: %m");

        /* NOTE(review): no message is logged here on failure — presumably manager_enumerate() logs
         * on its own; confirm. */
        r = manager_enumerate(m);
        if (r < 0)
                return r;

        r = manager_start(m);
        if (r < 0)
                return log_error_errno(r, "Could not start manager: %m");

        log_info("Enumeration completed");

        notify_message = notify_start(NOTIFY_READY, NOTIFY_STOPPING);

        r = sd_event_loop(m->event);
        if (r < 0)
                return log_error_errno(r, "Event loop failed: %m");

        return 0;
}

DEFINE_MAIN_FUNCTION(run);
diff --git a/src/network/networkd.conf b/src/network/networkd.conf
new file mode 100644
index 0000000..e5a5e88
--- /dev/null
+++ b/src/network/networkd.conf
@@ -0,0 +1,33 @@
# This file is part of systemd.
#
# systemd is free software; you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 2.1 of the License, or (at your option)
# any later version.
#
# Entries in this file show the compile time defaults. Local configuration
# should be created by either modifying this file (or a copy of it placed in
# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
# the /etc/systemd/networkd.conf.d/ directory. The latter is generally
# recommended. Defaults can be restored by simply deleting the main
# configuration file and all drop-ins located in /etc/.
#
# Use 'systemd-analyze cat-config systemd/networkd.conf' to display the full config.
#
# See networkd.conf(5) for details.
+ +[Network] +#SpeedMeter=no +#SpeedMeterIntervalSec=10sec +#ManageForeignRoutingPolicyRules=yes +#ManageForeignRoutes=yes +#RouteTable= +#IPv6PrivacyExtensions=no + +[DHCPv4] +#DUIDType=vendor +#DUIDRawData= + +[DHCPv6] +#DUIDType=vendor +#DUIDRawData= diff --git a/src/network/org.freedesktop.network1.conf b/src/network/org.freedesktop.network1.conf new file mode 100644 index 0000000..5bd796d --- /dev/null +++ b/src/network/org.freedesktop.network1.conf @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/network/org.freedesktop.network1.policy b/src/network/org.freedesktop.network1.policy new file mode 100644 index 0000000..1e2d8d7 --- /dev/null +++ b/src/network/org.freedesktop.network1.policy @@ -0,0 +1,186 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Set NTP servers + Authentication is required to set NTP servers. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Set DNS servers + Authentication is required to set DNS servers. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Set domains + Authentication is required to set domains. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Set default route + Authentication is required to set default route. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Enable/disable LLMNR + Authentication is required to enable or disable LLMNR. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Enable/disable multicast DNS + Authentication is required to enable or disable multicast DNS. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Enable/disable DNS over TLS + Authentication is required to enable or disable DNS over TLS. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Enable/disable DNSSEC + Authentication is required to enable or disable DNSSEC. 
+ + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Set DNSSEC Negative Trust Anchors + Authentication is required to set DNSSEC Negative Trust Anchors. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Revert NTP settings + Authentication is required to reset NTP settings. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Revert DNS settings + Authentication is required to reset DNS settings. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + DHCP server sends force renew message + Authentication is required to send force renew message. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Renew dynamic addresses + Authentication is required to renew dynamic addresses. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Reload network settings + Authentication is required to reload network settings. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + + Reconfigure network interface + Authentication is required to reconfigure network interface. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-network + + + diff --git a/src/network/org.freedesktop.network1.service b/src/network/org.freedesktop.network1.service new file mode 100644 index 0000000..ddbf3eb --- /dev/null +++ b/src/network/org.freedesktop.network1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[D-BUS Service] +Name=org.freedesktop.network1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.network1.service diff --git a/src/network/systemd-networkd.pkla b/src/network/systemd-networkd.pkla new file mode 100644 index 0000000..c56ea1b --- /dev/null +++ b/src/network/systemd-networkd.pkla @@ -0,0 +1,7 @@ +# This file is part of systemd. +# See systemd-networkd.service(8) and polkit(8) for more information. + +[Allow systemd-networkd to set timezone and transient hostname] +Identity=unix-user:systemd-network +Action=org.freedesktop.hostname1.set-hostname;org.freedesktop.hostname1.get-product-uuid;org.freedesktop.timedate1.set-timezone; +ResultAny=yes diff --git a/src/network/systemd-networkd.rules b/src/network/systemd-networkd.rules new file mode 100644 index 0000000..86cc849 --- /dev/null +++ b/src/network/systemd-networkd.rules @@ -0,0 +1,13 @@ +// This file is part of systemd. +// See systemd-networkd.service(8) and polkit(8) for more information. + +// Allow systemd-networkd to set timezone, get product UUID, +// and transient hostname +polkit.addRule(function(action, subject) { + if ((action.id == "org.freedesktop.hostname1.set-hostname" || + action.id == "org.freedesktop.hostname1.get-product-uuid" || + action.id == "org.freedesktop.timedate1.set-timezone") && + subject.user == "systemd-network") { + return polkit.Result.YES; + } +}); diff --git a/src/network/tc/cake.c b/src/network/tc/cake.c new file mode 100644 index 0000000..c495faf --- /dev/null +++ b/src/network/tc/cake.c @@ -0,0 +1,737 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/

/* NOTE(review): the header name of the following #include was lost when this text was extracted;
 * restore it from the upstream file. */
#include

#include "alloc-util.h"
#include "cake.h"
#include "conf-parser.h"
#include "netlink-util.h"
#include "parse-util.h"
#include "qdisc.h"
#include "string-table.h"
#include "string-util.h"

/* Marks every tristate and enum setting of a CAKE qdisc as "unset": tristates become -1 and enums
 * become their _INVALID value. cake_fill_message() only sends settings that are not unset. */
static int cake_init(QDisc *qdisc) {
        CommonApplicationsKeptEnhanced *c;

        assert(qdisc);

        c = CAKE(qdisc);

        c->autorate = -1;
        c->compensation_mode = _CAKE_COMPENSATION_MODE_INVALID;
        c->raw = -1;
        c->flow_isolation_mode = _CAKE_FLOW_ISOLATION_MODE_INVALID;
        c->nat = -1;
        c->preset = _CAKE_PRESET_INVALID;
        c->wash = -1;
        c->split_gso = -1;
        c->ack_filter = _CAKE_ACK_FILTER_INVALID;

        return 0;
}

/* Appends the configured CAKE settings to a netlink request, inside a TCA_OPTIONS container of
 * kind "cake". A setting is only appended when it has been configured: numeric values when > 0,
 * tristates and enums when >= 0, the overhead when overhead_set is true.
 *
 * Returns 0 on success, or the negative error of the first failing netlink call. */
static int cake_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) {
        CommonApplicationsKeptEnhanced *c;
        int r;

        assert(link);
        assert(qdisc);
        assert(req);

        assert_se(c = CAKE(qdisc));

        r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "cake");
        if (r < 0)
                return r;

        if (c->bandwidth > 0) {
                r = sd_netlink_message_append_u64(req, TCA_CAKE_BASE_RATE64, c->bandwidth);
                if (r < 0)
                        return r;
        }

        if (c->autorate >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_AUTORATE, c->autorate);
                if (r < 0)
                        return r;
        }

        if (c->overhead_set) {
                r = sd_netlink_message_append_s32(req, TCA_CAKE_OVERHEAD, c->overhead);
                if (r < 0)
                        return r;
        }

        if (c->mpu > 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_MPU, c->mpu);
                if (r < 0)
                        return r;
        }

        if (c->compensation_mode >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_ATM, c->compensation_mode);
                if (r < 0)
                        return r;
        }

        /* Unlike the other tristates, this one is only sent when true (> 0), and always with value 0. */
        if (c->raw > 0) {
                /* TCA_CAKE_RAW attribute is mostly a flag, not boolean. */
                r = sd_netlink_message_append_u32(req, TCA_CAKE_RAW, 0);
                if (r < 0)
                        return r;
        }

        if (c->flow_isolation_mode >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_FLOW_MODE, c->flow_isolation_mode);
                if (r < 0)
                        return r;
        }

        if (c->nat >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_NAT, c->nat);
                if (r < 0)
                        return r;
        }

        if (c->preset >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_DIFFSERV_MODE, c->preset);
                if (r < 0)
                        return r;
        }

        if (c->fwmark > 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_FWMARK, c->fwmark);
                if (r < 0)
                        return r;
        }

        if (c->wash >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_WASH, c->wash);
                if (r < 0)
                        return r;
        }

        if (c->split_gso >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_SPLIT_GSO, c->split_gso);
                if (r < 0)
                        return r;
        }

        /* rtt is a usec_t sent as u32; config_parse_cake_rtt() rejects values above UINT32_MAX. */
        if (c->rtt > 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_RTT, c->rtt);
                if (r < 0)
                        return r;
        }

        if (c->ack_filter >= 0) {
                r = sd_netlink_message_append_u32(req, TCA_CAKE_ACK_FILTER, c->ack_filter);
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_close_container(req);
        if (r < 0)
                return r;

        return 0;
}

/* All config_parse_cake_*() functions below share one structure:
 *   - obtain the CAKE qdisc of the section via qdisc_new_static(); -ENOMEM is reported through
 *     log_oom(), any other failure is logged as a warning and the assignment is ignored;
 *   - an empty value resets the setting to its "unset" state;
 *   - an unparsable or out-of-range value is logged as a warning and ignored;
 *   - TAKE_PTR(qdisc) is called only on the paths that accept the assignment, so on the other
 *     paths the qdisc_free_or_set_invalidp cleanup handler runs on the qdisc.
 *     NOTE(review): presumably that handler frees the qdisc or marks its section invalid — confirm
 *     against qdisc.h.
 * Apart from the log_oom() path they all return 0. */

/* Parses the bandwidth setting with parse_size() (base 1000) and stores the result divided by 8.
 * NOTE(review): the division presumably converts bits/s to bytes/s — confirm. */
int config_parse_cake_bandwidth(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        uint64_t k;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->bandwidth = 0;

                TAKE_PTR(qdisc);
                return 0;
        }

        r = parse_size(rvalue, 1000, &k);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->bandwidth = k/8;
        TAKE_PTR(qdisc);

        return 0;
}

/* Parses the overhead setting as a signed 32-bit integer; accepted range is -64…256. An empty
 * value clears overhead_set, so that no overhead attribute is sent. */
int config_parse_cake_overhead(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        int32_t v;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->overhead_set = false;
                TAKE_PTR(qdisc);
                return 0;
        }

        r = safe_atoi32(rvalue, &v);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }
        if (v < -64 || v > 256) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Invalid '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->overhead = v;
        c->overhead_set = true;
        TAKE_PTR(qdisc);
        return 0;
}

/* Parses the MPU setting as an unsigned 32-bit integer; accepted range is 1…256. An empty value
 * resets it to 0 (unset). */
int config_parse_cake_mpu(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        uint32_t v;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->mpu = 0;
                TAKE_PTR(qdisc);
                return 0;
        }

        r = safe_atou32(rvalue, &v);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }
        if (v <= 0 || v > 256) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Invalid '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->mpu = v;
        TAKE_PTR(qdisc);
        return 0;
}

/* Shared parser for the tristate settings. The destination field is selected by the setting name
 * (lvalue): AutoRateIngress, UseRawPacketSize, NAT, Wash or SplitGSO. Any other name is a
 * programming error (assert_not_reached()). The value is parsed with parse_tristate(). */
int config_parse_cake_tristate(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        int *dest, r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (streq(lvalue, "AutoRateIngress"))
                dest = &c->autorate;
        else if (streq(lvalue, "UseRawPacketSize"))
                dest = &c->raw;
        else if (streq(lvalue, "NAT"))
                dest = &c->nat;
        else if (streq(lvalue, "Wash"))
                dest = &c->wash;
        else if (streq(lvalue, "SplitGSO"))
                dest = &c->split_gso;
        else
                assert_not_reached();

        r = parse_tristate(rvalue, dest);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        TAKE_PTR(qdisc);
        return 0;
}

/* Configuration-file names of the compensation modes. */
static const char * const cake_compensation_mode_table[_CAKE_COMPENSATION_MODE_MAX] = {
        [CAKE_COMPENSATION_MODE_NONE] = "none",
        [CAKE_COMPENSATION_MODE_ATM]  = "atm",
        [CAKE_COMPENSATION_MODE_PTM]  = "ptm",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(cake_compensation_mode, CakeCompensationMode);

/* Parses the compensation mode from one of the names in cake_compensation_mode_table. An empty
 * value resets it to _CAKE_COMPENSATION_MODE_INVALID (unset). */
int config_parse_cake_compensation_mode(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        CakeCompensationMode mode;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->compensation_mode = _CAKE_COMPENSATION_MODE_INVALID;
                TAKE_PTR(qdisc);
                return 0;
        }

        mode = cake_compensation_mode_from_string(rvalue);
        if (mode < 0) {
                /* The negative lookup result doubles as the error code for the log message. */
                log_syntax(unit, LOG_WARNING, filename, line, mode,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->compensation_mode = mode;
        TAKE_PTR(qdisc);
        return 0;
}

/* Configuration-file names of the flow isolation modes. */
static const char * const cake_flow_isolation_mode_table[_CAKE_FLOW_ISOLATION_MODE_MAX] = {
        [CAKE_FLOW_ISOLATION_MODE_NONE]     = "none",
        [CAKE_FLOW_ISOLATION_MODE_SRC_IP]   = "src-host",
        [CAKE_FLOW_ISOLATION_MODE_DST_IP]   = "dst-host",
        [CAKE_FLOW_ISOLATION_MODE_HOSTS]    = "hosts",
        [CAKE_FLOW_ISOLATION_MODE_FLOWS]    = "flows",
        [CAKE_FLOW_ISOLATION_MODE_DUAL_SRC] = "dual-src-host",
        [CAKE_FLOW_ISOLATION_MODE_DUAL_DST] = "dual-dst-host",
        [CAKE_FLOW_ISOLATION_MODE_TRIPLE]   = "triple",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(cake_flow_isolation_mode, CakeFlowIsolationMode);

/* Parses the flow isolation mode from one of the names in cake_flow_isolation_mode_table. An
 * empty value resets it to _CAKE_FLOW_ISOLATION_MODE_INVALID (unset). */
int config_parse_cake_flow_isolation_mode(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        CakeFlowIsolationMode mode;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->flow_isolation_mode = _CAKE_FLOW_ISOLATION_MODE_INVALID;
                TAKE_PTR(qdisc);
                return 0;
        }

        mode = cake_flow_isolation_mode_from_string(rvalue);
        if (mode < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, mode,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->flow_isolation_mode = mode;
        TAKE_PTR(qdisc);
        return 0;
}

/* Configuration-file names of the priority queueing presets. */
static const char * const cake_priority_queueing_preset_table[_CAKE_PRESET_MAX] = {
        [CAKE_PRESET_DIFFSERV3]  = "diffserv3",
        [CAKE_PRESET_DIFFSERV4]  = "diffserv4",
        [CAKE_PRESET_DIFFSERV8]  = "diffserv8",
        [CAKE_PRESET_BESTEFFORT] = "besteffort",
        [CAKE_PRESET_PRECEDENCE] = "precedence",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(cake_priority_queueing_preset, CakePriorityQueueingPreset);

/* Parses the priority queueing preset from one of the names in
 * cake_priority_queueing_preset_table. An empty value resets it to _CAKE_PRESET_INVALID (unset). */
int config_parse_cake_priority_queueing_preset(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        CakePriorityQueueingPreset preset;
        Network *network = ASSERT_PTR(data);
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->preset = _CAKE_PRESET_INVALID;
                TAKE_PTR(qdisc);
                return 0;
        }

        preset = cake_priority_queueing_preset_from_string(rvalue);
        if (preset < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, preset,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->preset = preset;
        TAKE_PTR(qdisc);
        return 0;
}

/* Parses the firewall mark as an unsigned 32-bit integer; 0 is rejected. An empty value resets
 * it to 0 (unset). */
int config_parse_cake_fwmark(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        uint32_t fwmark;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->fwmark = 0;
                TAKE_PTR(qdisc);
                return 0;
        }

        r = safe_atou32(rvalue, &fwmark);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }
        if (fwmark <= 0) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Invalid '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->fwmark = fwmark;
        TAKE_PTR(qdisc);
        return 0;
}

/* Parses the RTT setting with parse_sec(). The result must be non-zero and must not exceed
 * UINT32_MAX, because cake_fill_message() sends it as a u32. An empty value resets it to 0
 * (unset). */
int config_parse_cake_rtt(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        Network *network = ASSERT_PTR(data);
        usec_t t;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->rtt = 0;
                TAKE_PTR(qdisc);
                return 0;
        }

        r = parse_sec(rvalue, &t);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }
        if (t <= 0 || t > UINT32_MAX) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Invalid '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->rtt = t;
        TAKE_PTR(qdisc);
        return 0;
}

/* Configuration-file names of the ACK filter modes. */
static const char * const cake_ack_filter_table[_CAKE_ACK_FILTER_MAX] = {
        [CAKE_ACK_FILTER_NO]         = "no",
        [CAKE_ACK_FILTER_YES]        = "yes",
        [CAKE_ACK_FILTER_AGGRESSIVE] = "aggressive",
};

/* NOTE(review): the _WITH_BOOLEAN variant presumably also accepts generic boolean spellings,
 * mapping "true" to CAKE_ACK_FILTER_YES — confirm against string-table.h. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(cake_ack_filter, CakeAckFilter, CAKE_ACK_FILTER_YES);

/* Parses the ACK filter mode via cake_ack_filter_from_string(). An empty value resets it to
 * _CAKE_ACK_FILTER_INVALID (unset). */
int config_parse_cake_ack_filter(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
        CommonApplicationsKeptEnhanced *c;
        CakeAckFilter ack_filter;
        Network *network = ASSERT_PTR(data);
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = qdisc_new_static(QDISC_KIND_CAKE, network, filename, section_line, &qdisc);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "More than one kind of queueing discipline, ignoring assignment: %m");
                return 0;
        }

        c = CAKE(qdisc);

        if (isempty(rvalue)) {
                c->ack_filter = _CAKE_ACK_FILTER_INVALID;
                TAKE_PTR(qdisc);
                return 0;
        }

        ack_filter = cake_ack_filter_from_string(rvalue);
        if (ack_filter < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, ack_filter,
                           "Failed to parse '%s=', ignoring assignment: %s",
                           lvalue, rvalue);
                return 0;
        }

        c->ack_filter = ack_filter;
        TAKE_PTR(qdisc);
        return 0;
}

/* Hooks the CAKE implementation into the generic qdisc code: object size, kernel kind name, and
 * the init/fill callbacks defined above. */
const QDiscVTable cake_vtable = {
        .object_size = sizeof(CommonApplicationsKeptEnhanced),
        .tca_kind = "cake",
        .init = cake_init,
        .fill_message = cake_fill_message,
};
diff --git a/src/network/tc/cake.h b/src/network/tc/cake.h
new file mode 100644
index 0000000..5ca6dc6
--- /dev/null
+++ b/src/network/tc/cake.h
@@ -0,0 +1,90 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later
 * Copyright © 2020 VMware, Inc.
*/
#pragma once

/* NOTE(review): the header name of the following #include was lost when this text was extracted;
 * restore it from the upstream file. */
#include

#include "conf-parser.h"
#include "qdisc.h"

/* The enums below reuse the numeric values of the corresponding CAKE_* constants, so they are
 * passed to netlink unchanged. In each of them the _INVALID value (-EINVAL) means "not
 * configured". */

typedef enum CakeCompensationMode {
        CAKE_COMPENSATION_MODE_NONE = CAKE_ATM_NONE,
        CAKE_COMPENSATION_MODE_ATM  = CAKE_ATM_ATM,
        CAKE_COMPENSATION_MODE_PTM  = CAKE_ATM_PTM,
        _CAKE_COMPENSATION_MODE_MAX,
        _CAKE_COMPENSATION_MODE_INVALID = -EINVAL,
} CakeCompensationMode;

typedef enum CakeFlowIsolationMode {
        CAKE_FLOW_ISOLATION_MODE_NONE     = CAKE_FLOW_NONE,
        CAKE_FLOW_ISOLATION_MODE_SRC_IP   = CAKE_FLOW_SRC_IP,
        CAKE_FLOW_ISOLATION_MODE_DST_IP   = CAKE_FLOW_DST_IP,
        CAKE_FLOW_ISOLATION_MODE_HOSTS    = CAKE_FLOW_HOSTS,
        CAKE_FLOW_ISOLATION_MODE_FLOWS    = CAKE_FLOW_FLOWS,
        CAKE_FLOW_ISOLATION_MODE_DUAL_SRC = CAKE_FLOW_DUAL_SRC,
        CAKE_FLOW_ISOLATION_MODE_DUAL_DST = CAKE_FLOW_DUAL_DST,
        CAKE_FLOW_ISOLATION_MODE_TRIPLE   = CAKE_FLOW_TRIPLE,
        _CAKE_FLOW_ISOLATION_MODE_MAX,
        _CAKE_FLOW_ISOLATION_MODE_INVALID = -EINVAL,
} CakeFlowIsolationMode;

typedef enum CakePriorityQueueingPreset {
        CAKE_PRESET_DIFFSERV3  = CAKE_DIFFSERV_DIFFSERV3,
        CAKE_PRESET_DIFFSERV4  = CAKE_DIFFSERV_DIFFSERV4,
        CAKE_PRESET_DIFFSERV8  = CAKE_DIFFSERV_DIFFSERV8,
        CAKE_PRESET_BESTEFFORT = CAKE_DIFFSERV_BESTEFFORT,
        CAKE_PRESET_PRECEDENCE = CAKE_DIFFSERV_PRECEDENCE,
        _CAKE_PRESET_MAX,
        _CAKE_PRESET_INVALID = -EINVAL,
} CakePriorityQueueingPreset;

typedef enum CakeAckFilter {
        CAKE_ACK_FILTER_NO         = CAKE_ACK_NONE,
        CAKE_ACK_FILTER_YES        = CAKE_ACK_FILTER,
        CAKE_ACK_FILTER_AGGRESSIVE = CAKE_ACK_AGGRESSIVE,
        _CAKE_ACK_FILTER_MAX,
        _CAKE_ACK_FILTER_INVALID = -EINVAL,
} CakeAckFilter;

/* Settings of one CAKE qdisc. The int fields are used as tristates (the parsers store the result
 * of parse_tristate() in them). */
typedef struct CommonApplicationsKeptEnhanced {
        QDisc meta;

        /* Shaper parameters */
        int autorate;
        uint64_t bandwidth;

        /* Overhead compensation parameters */
        bool overhead_set; /* whether 'overhead' carries a configured value */
        int overhead;
        uint32_t mpu;
        CakeCompensationMode compensation_mode;
        int raw;

        /* Flow isolation parameters */
        CakeFlowIsolationMode flow_isolation_mode;
        int nat;

        /* Priority queue parameters */
        CakePriorityQueueingPreset preset;
        uint32_t fwmark;

        /* Other parameters */
        int wash;
        int split_gso;
        usec_t rtt;
        CakeAckFilter ack_filter;
} CommonApplicationsKeptEnhanced;

DEFINE_QDISC_CAST(CAKE, CommonApplicationsKeptEnhanced);
extern const QDiscVTable cake_vtable;

/* Configuration parsers, one per CAKE setting (the tristate settings share one parser). */
CONFIG_PARSER_PROTOTYPE(config_parse_cake_bandwidth);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_overhead);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_mpu);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_tristate);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_compensation_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_flow_isolation_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_priority_queueing_preset);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_fwmark);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_rtt);
CONFIG_PARSER_PROTOTYPE(config_parse_cake_ack_filter);
diff --git a/src/network/tc/codel.c b/src/network/tc/codel.c
new file mode 100644
index 0000000..e212523
--- /dev/null
+++ b/src/network/tc/codel.c
@@ -0,0 +1,244 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later
 * Copyright © 2019 VMware, Inc.
*/ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "qdisc.h" +#include "string-util.h" + /* Init callback: sets the CE threshold to USEC_INFINITY and ecn to -1. The fill_message callback below does not send either field while it holds that value. */ +static int controlled_delay_init(QDisc *qdisc) { + ControlledDelay *cd; + + assert(qdisc); + + cd = CODEL(qdisc); + + cd->ce_threshold_usec = USEC_INFINITY; + cd->ecn = -1; + + return 0; +} + /* Appends the codel options to req inside a TCA_OPTIONS container. Packet limit, interval and target are sent only when greater than zero, ecn only when non-negative, and the CE threshold only when it is not USEC_INFINITY. Returns 0 on success or the first negative error from the netlink helpers. NOTE(review): the usec_t fields are handed to the u32 append helper; if usec_t is wider than 32 bits, large values would presumably be truncated - confirm. */ +static int controlled_delay_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + ControlledDelay *cd; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(cd = CODEL(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "codel"); + if (r < 0) + return r; + + if (cd->packet_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_CODEL_LIMIT, cd->packet_limit); + if (r < 0) + return r; + } + + if (cd->interval_usec > 0) { + r = sd_netlink_message_append_u32(req, TCA_CODEL_INTERVAL, cd->interval_usec); + if (r < 0) + return r; + } + + if (cd->target_usec > 0) { + r = sd_netlink_message_append_u32(req, TCA_CODEL_TARGET, cd->target_usec); + if (r < 0) + return r; + } + + if (cd->ecn >= 0) { + r = sd_netlink_message_append_u32(req, TCA_CODEL_ECN, cd->ecn); + if (r < 0) + return r; + } + + if (cd->ce_threshold_usec != USEC_INFINITY) { + r = sd_netlink_message_append_u32(req, TCA_CODEL_CE_THRESHOLD, cd->ce_threshold_usec); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + /* Config parser for the codel packet limit. An empty value resets the limit to 0; any other value is parsed with safe_atou32() straight into the field, and a parse failure is logged and ignored. Returns log_oom() when qdisc_new_static() reports -ENOMEM and 0 in every other case. */ +int config_parse_controlled_delay_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + ControlledDelay *cd; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_CODEL, network, filename, section_line, &qdisc); + if (r == 
-ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + cd = CODEL(qdisc); + + if (isempty(rvalue)) { + cd->packet_limit = 0; + /* NOTE(review): clearing the local pointer presumably keeps the cleanup handler from discarding the qdisc on the success path - confirm against qdisc_free_or_set_invalidp. */ + qdisc = NULL; + return 0; + } + + r = safe_atou32(rvalue, &cd->packet_limit); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + qdisc = NULL; + + return 0; +} + /* Config parser for the three time-valued codel settings. The target field is selected by lvalue (TargetSec, IntervalSec or CEThresholdSec); any other lvalue reaches assert_not_reached(). An empty value resets the field to USEC_INFINITY for CEThresholdSec and to 0 for the other two. Other values are parsed with parse_sec(); a parse failure is logged and ignored. Returns log_oom() on -ENOMEM from qdisc_new_static() and 0 otherwise. */ +int config_parse_controlled_delay_usec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + ControlledDelay *cd; + Network *network = ASSERT_PTR(data); + usec_t *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_CODEL, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + cd = CODEL(qdisc); + + if (streq(lvalue, "TargetSec")) + p = &cd->target_usec; + else if (streq(lvalue, "IntervalSec")) + p = &cd->interval_usec; + else if (streq(lvalue, "CEThresholdSec")) + p = &cd->ce_threshold_usec; + else + assert_not_reached(); + + if (isempty(rvalue)) { + if (streq(lvalue, "CEThresholdSec")) + *p = USEC_INFINITY; + else + *p = 0; + + qdisc = NULL; + return 0; + } + + r = parse_sec(rvalue, p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + qdisc = NULL; + + return 0; +} + /* Config parser for the codel ecn field. The value is handed to parse_tristate(), which writes into the int field; a parse failure is logged and ignored. There is no separate empty-value branch here. Returns log_oom() on -ENOMEM from qdisc_new_static() and 0 otherwise. */ +int config_parse_controlled_delay_bool( + const char *unit, + const char *filename, + unsigned line, + const char *section, + 
unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + ControlledDelay *cd; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_CODEL, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + cd = CODEL(qdisc); + + r = parse_tristate(rvalue, &cd->ecn); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + /* Vtable for the codel qdisc kind: object size, kind string, and the init and fill_message callbacks defined above. */ +const QDiscVTable codel_vtable = { + .object_size = sizeof(ControlledDelay), + .tca_kind = "codel", + .init = controlled_delay_init, + .fill_message = controlled_delay_fill_message, +}; diff --git a/src/network/tc/codel.h b/src/network/tc/codel.h new file mode 100644 index 0000000..4fe5283 --- /dev/null +++ b/src/network/tc/codel.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. 
*/ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" +#include "time-util.h" + /* Settings for one codel qdisc. The generic QDisc part (meta) is the first member. */ +typedef struct ControlledDelay { + QDisc meta; + + uint32_t packet_limit; + usec_t interval_usec; + usec_t target_usec; + usec_t ce_threshold_usec; + int ecn; +} ControlledDelay; + +DEFINE_QDISC_CAST(CODEL, ControlledDelay); +extern const QDiscVTable codel_vtable; + /* Config parser callbacks for the codel settings, one per value type. */ +CONFIG_PARSER_PROTOTYPE(config_parse_controlled_delay_u32); +CONFIG_PARSER_PROTOTYPE(config_parse_controlled_delay_usec); +CONFIG_PARSER_PROTOTYPE(config_parse_controlled_delay_bool); diff --git a/src/network/tc/drr.c b/src/network/tc/drr.c new file mode 100644 index 0000000..373911b --- /dev/null +++ b/src/network/tc/drr.c @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "drr.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "string-util.h" + /* Vtable for the drr qdisc kind. Only the object size and the kind string are set; no callbacks are provided. */ +const QDiscVTable drr_vtable = { + .object_size = sizeof(DeficitRoundRobinScheduler), + .tca_kind = "drr", +}; + /* Appends the drr class options to req inside a TCA_OPTIONS container. The quantum is sent only when it is greater than zero. Returns 0 on success or the first negative error from the netlink helpers. */ +static int drr_class_fill_message(Link *link, TClass *tclass, sd_netlink_message *req) { + DeficitRoundRobinSchedulerClass *drr; + int r; + + assert(link); + assert(tclass); + assert(req); + + assert_se(drr = TCLASS_TO_DRR(tclass)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "drr"); + if (r < 0) + return r; + + if (drr->quantum > 0) { + r = sd_netlink_message_append_u32(req, TCA_DRR_QUANTUM, drr->quantum); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + /* Config parser for the drr class quantum. An empty value resets the quantum to 0. Other values are parsed with parse_size() using base 1024 into a 64-bit temporary; parse failures and values above UINT32_MAX are logged and ignored. Returns log_oom() when tclass_new_static() reports -ENOMEM and 0 in every other case. */ +int config_parse_drr_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL; + DeficitRoundRobinSchedulerClass *drr; + Network *network = ASSERT_PTR(data); + 
uint64_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tclass_new_static(TCLASS_KIND_DRR, network, filename, section_line, &tclass); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create traffic control class, ignoring assignment: %m"); + return 0; + } + + drr = TCLASS_TO_DRR(tclass); + + if (isempty(rvalue)) { + drr->quantum = 0; + + TAKE_PTR(tclass); + return 0; + } + + r = parse_size(rvalue, 1024, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (u > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + /* The range check above makes this narrowing cast lossless. */ + drr->quantum = (uint32_t) u; + + TAKE_PTR(tclass); + return 0; +} + /* Vtable for the drr traffic class kind: object size, kind string, and the fill_message callback defined above. */ +const TClassVTable drr_tclass_vtable = { + .object_size = sizeof(DeficitRoundRobinSchedulerClass), + .tca_kind = "drr", + .fill_message = drr_class_fill_message, +}; diff --git a/src/network/tc/drr.h b/src/network/tc/drr.h new file mode 100644 index 0000000..c96cc4d --- /dev/null +++ b/src/network/tc/drr.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ +#pragma once + +#include "qdisc.h" + +typedef struct DeficitRoundRobinScheduler { + QDisc meta; +} DeficitRoundRobinScheduler; + +DEFINE_QDISC_CAST(DRR, DeficitRoundRobinScheduler); +extern const QDiscVTable drr_vtable; + +typedef struct DeficitRoundRobinSchedulerClass { + TClass meta; + + uint32_t quantum; +} DeficitRoundRobinSchedulerClass; + +DEFINE_TCLASS_CAST(DRR, DeficitRoundRobinSchedulerClass); +extern const TClassVTable drr_tclass_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_drr_size); diff --git a/src/network/tc/ets.c b/src/network/tc/ets.c new file mode 100644 index 0000000..730b0a1 --- /dev/null +++ b/src/network/tc/ets.c @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "ets.h" +#include "extract-word.h" +#include "memory-util.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "qdisc.h" +#include "string-util.h" +#include "tc-util.h" + +static int enhanced_transmission_selection_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + EnhancedTransmissionSelection *ets; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(ets = ETS(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "ets"); + if (r < 0) + return r; + + r = sd_netlink_message_append_u8(req, TCA_ETS_NBANDS, ets->n_bands); + if (r < 0) + return r; + + if (ets->n_strict > 0) { + r = sd_netlink_message_append_u8(req, TCA_ETS_NSTRICT, ets->n_strict); + if (r < 0) + return r; + } + + if (ets->n_quanta > 0) { + r = sd_netlink_message_open_container(req, TCA_ETS_QUANTA); + if (r < 0) + return r; + + for (unsigned i = 0; i < ets->n_quanta; i++) { + r = sd_netlink_message_append_u32(req, TCA_ETS_QUANTA_BAND, ets->quanta[i]); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + } + + if (ets->n_prio > 0) { + r = sd_netlink_message_open_container(req, TCA_ETS_PRIOMAP); + if (r < 0) 
+ return r; + + for (unsigned i = 0; i < ets->n_prio; i++) { + r = sd_netlink_message_append_u8(req, TCA_ETS_PRIOMAP_BAND, ets->prio[i]); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_ets_u8( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + EnhancedTransmissionSelection *ets; + Network *network = ASSERT_PTR(data); + uint8_t v, *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_ETS, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + ets = ETS(qdisc); + if (streq(lvalue, "Bands")) + p = &ets->n_bands; + else if (streq(lvalue, "StrictBands")) + p = &ets->n_strict; + else + assert_not_reached(); + + if (isempty(rvalue)) { + *p = 0; + + qdisc = NULL; + return 0; + } + + r = safe_atou8(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (v > TCQ_ETS_MAX_BANDS) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s='. 
The value must be <= %d, ignoring assignment: %s", + lvalue, TCQ_ETS_MAX_BANDS, rvalue); + return 0; + } + + *p = v; + qdisc = NULL; + + return 0; +} + +int config_parse_ets_quanta( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + EnhancedTransmissionSelection *ets; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_ETS, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + ets = ETS(qdisc); + + if (isempty(rvalue)) { + memzero(ets->quanta, sizeof(uint32_t) * TCQ_ETS_MAX_BANDS); + ets->n_quanta = 0; + + qdisc = NULL; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + uint64_t v; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract next value, ignoring: %m"); + break; + } + if (r == 0) + break; + + r = parse_size(word, 1024, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, word); + continue; + } + if (v == 0 || v > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s=', ignoring assignment: %s", + lvalue, word); + continue; + } + if (ets->n_quanta >= TCQ_ETS_MAX_BANDS) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Too many quanta in '%s=', ignoring assignment: %s", + lvalue, word); + continue; + } + + ets->quanta[ets->n_quanta++] = v; + } + + qdisc = NULL; + + return 0; +} + +int config_parse_ets_prio( + 
const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + EnhancedTransmissionSelection *ets; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_ETS, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + ets = ETS(qdisc); + + if (isempty(rvalue)) { + memzero(ets->prio, sizeof(uint8_t) * (TC_PRIO_MAX + 1)); + ets->n_prio = 0; + + qdisc = NULL; + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + uint8_t v; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract next value, ignoring: %m"); + break; + } + if (r == 0) + break; + + r = safe_atou8(word, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, word); + continue; + } + if (ets->n_prio > TC_PRIO_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Too many priomap in '%s=', ignoring assignment: %s", + lvalue, word); + continue; + } + + ets->prio[ets->n_prio++] = v; + } + + qdisc = NULL; + + return 0; +} + +static int enhanced_transmission_selection_verify(QDisc *qdisc) { + EnhancedTransmissionSelection *ets; + + assert(qdisc); + + ets = ETS(qdisc); + + if (ets->n_bands == 0) + ets->n_bands = ets->n_strict + ets->n_quanta; + + if (ets->n_bands == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: At least one of Band=, Strict=, or Quanta= must be specified. 
" + "Ignoring [EnhancedTransmissionSelection] section from line %u.", + qdisc->section->filename, qdisc->section->line); + + if (ets->n_bands < ets->n_strict + ets->n_quanta) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Not enough total bands to cover all the strict bands and quanta. " + "Ignoring [EnhancedTransmissionSelection] section from line %u.", + qdisc->section->filename, qdisc->section->line); + + for (unsigned i = 0; i < ets->n_prio; i++) + if (ets->prio[i] >= ets->n_bands) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: PriorityMap= element is out of bands. " + "Ignoring [EnhancedTransmissionSelection] section from line %u.", + qdisc->section->filename, qdisc->section->line); + + return 0; +} + +const QDiscVTable ets_vtable = { + .object_size = sizeof(EnhancedTransmissionSelection), + .tca_kind = "ets", + .fill_message = enhanced_transmission_selection_fill_message, + .verify = enhanced_transmission_selection_verify, +}; diff --git a/src/network/tc/ets.h b/src/network/tc/ets.h new file mode 100644 index 0000000..b6dd428 --- /dev/null +++ b/src/network/tc/ets.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "qdisc.h" + +typedef struct EnhancedTransmissionSelection { + QDisc meta; + + uint8_t n_bands; + uint8_t n_strict; + unsigned n_quanta; + uint32_t quanta[TCQ_ETS_MAX_BANDS]; + unsigned n_prio; + uint8_t prio[TC_PRIO_MAX + 1]; +} EnhancedTransmissionSelection; + +DEFINE_QDISC_CAST(ETS, EnhancedTransmissionSelection); +extern const QDiscVTable ets_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_ets_u8); +CONFIG_PARSER_PROTOTYPE(config_parse_ets_quanta); +CONFIG_PARSER_PROTOTYPE(config_parse_ets_prio); diff --git a/src/network/tc/fifo.c b/src/network/tc/fifo.c new file mode 100644 index 0000000..940fa00 --- /dev/null +++ b/src/network/tc/fifo.c @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "fifo.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "string-util.h" + +static int fifo_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + FirstInFirstOut *fifo; + int r; + + assert(link); + assert(qdisc); + assert(req); + + switch (qdisc->kind) { + case QDISC_KIND_PFIFO: + assert_se(fifo = PFIFO(qdisc)); + break; + case QDISC_KIND_BFIFO: + assert_se(fifo = BFIFO(qdisc)); + break; + case QDISC_KIND_PFIFO_HEAD_DROP: + assert_se(fifo = PFIFO_HEAD_DROP(qdisc)); + break; + default: + assert_not_reached(); + } + + const struct tc_fifo_qopt opt = { .limit = fifo->limit }; + r = sd_netlink_message_append_data(req, TCA_OPTIONS, &opt, sizeof(opt)); + if (r < 0) + return r; + + return 0; +} + +int config_parse_pfifo_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + FirstInFirstOut *fifo; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(ltype, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + switch (qdisc->kind) { + case QDISC_KIND_PFIFO: + fifo = PFIFO(qdisc); + break; + case QDISC_KIND_PFIFO_HEAD_DROP: + fifo = PFIFO_HEAD_DROP(qdisc); + break; + default: + assert_not_reached(); + } + + if (isempty(rvalue)) { + fifo->limit = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, &fifo->limit); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + 
return 0; +} + +int config_parse_bfifo_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + FirstInFirstOut *fifo; + uint64_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_BFIFO, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fifo = BFIFO(qdisc); + + if (isempty(rvalue)) { + fifo->limit = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_size(rvalue, 1024, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (u > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + fifo->limit = (uint32_t) u; + + TAKE_PTR(qdisc); + return 0; +} + +const QDiscVTable pfifo_vtable = { + .object_size = sizeof(FirstInFirstOut), + .tca_kind = "pfifo", + .fill_message = fifo_fill_message, +}; + +const QDiscVTable bfifo_vtable = { + .object_size = sizeof(FirstInFirstOut), + .tca_kind = "bfifo", + .fill_message = fifo_fill_message, +}; + +const QDiscVTable pfifo_head_drop_vtable = { + .object_size = sizeof(FirstInFirstOut), + .tca_kind = "pfifo_head_drop", + .fill_message = fifo_fill_message, +}; + +const QDiscVTable pfifo_fast_vtable = { + .object_size = sizeof(FirstInFirstOut), + .tca_kind = "pfifo_fast", +}; diff --git a/src/network/tc/fifo.h b/src/network/tc/fifo.h new file mode 100644 index 0000000..b9bbd09 --- /dev/null +++ b/src/network/tc/fifo.h @@ -0,0 +1,25 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + +typedef struct FirstInFirstOut { + QDisc meta; + + uint32_t limit; +} FirstInFirstOut; + +DEFINE_QDISC_CAST(PFIFO, FirstInFirstOut); +DEFINE_QDISC_CAST(BFIFO, FirstInFirstOut); +DEFINE_QDISC_CAST(PFIFO_HEAD_DROP, FirstInFirstOut); +DEFINE_QDISC_CAST(PFIFO_FAST, FirstInFirstOut); + +extern const QDiscVTable pfifo_vtable; +extern const QDiscVTable bfifo_vtable; +extern const QDiscVTable pfifo_head_drop_vtable; +extern const QDiscVTable pfifo_fast_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_pfifo_size); +CONFIG_PARSER_PROTOTYPE(config_parse_bfifo_size); diff --git a/src/network/tc/fq-codel.c b/src/network/tc/fq-codel.c new file mode 100644 index 0000000..124faf7 --- /dev/null +++ b/src/network/tc/fq-codel.c @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "qdisc.h" +#include "string-util.h" +#include "strv.h" + +static int fair_queueing_controlled_delay_init(QDisc *qdisc) { + FairQueueingControlledDelay *fqcd; + + assert(qdisc); + + fqcd = FQ_CODEL(qdisc); + + fqcd->memory_limit = UINT32_MAX; + fqcd->ce_threshold_usec = USEC_INFINITY; + fqcd->ecn = -1; + + return 0; +} + +static int fair_queueing_controlled_delay_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + FairQueueingControlledDelay *fqcd; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(fqcd = FQ_CODEL(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "fq_codel"); + if (r < 0) + return r; + + if (fqcd->packet_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_LIMIT, fqcd->packet_limit); + if (r < 0) + return r; + } + + if (fqcd->flows > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_FLOWS, 
fqcd->flows); + if (r < 0) + return r; + } + + if (fqcd->quantum > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_QUANTUM, fqcd->quantum); + if (r < 0) + return r; + } + + if (fqcd->interval_usec > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_INTERVAL, fqcd->interval_usec); + if (r < 0) + return r; + } + + if (fqcd->target_usec > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_TARGET, fqcd->target_usec); + if (r < 0) + return r; + } + + if (fqcd->ecn >= 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_ECN, fqcd->ecn); + if (r < 0) + return r; + } + + if (fqcd->ce_threshold_usec != USEC_INFINITY) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_CE_THRESHOLD, fqcd->ce_threshold_usec); + if (r < 0) + return r; + } + + if (fqcd->memory_limit != UINT32_MAX) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CODEL_MEMORY_LIMIT, fqcd->memory_limit); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_fair_queueing_controlled_delay_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueingControlledDelay *fqcd; + Network *network = ASSERT_PTR(data); + uint32_t *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ_CODEL, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fqcd = FQ_CODEL(qdisc); + + if (streq(lvalue, "PacketLimit")) + p = &fqcd->packet_limit; + else if (streq(lvalue, "Flows")) + p = &fqcd->flows; + else + assert_not_reached(); + + if (isempty(rvalue)) { + *p 
= 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_fair_queueing_controlled_delay_usec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueingControlledDelay *fqcd; + Network *network = ASSERT_PTR(data); + usec_t *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ_CODEL, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fqcd = FQ_CODEL(qdisc); + + if (streq(lvalue, "TargetSec")) + p = &fqcd->target_usec; + else if (streq(lvalue, "IntervalSec")) + p = &fqcd->interval_usec; + else if (streq(lvalue, "CEThresholdSec")) + p = &fqcd->ce_threshold_usec; + else + assert_not_reached(); + + if (isempty(rvalue)) { + if (streq(lvalue, "CEThresholdSec")) + *p = USEC_INFINITY; + else + *p = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_sec(rvalue, p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_fair_queueing_controlled_delay_bool( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + 
FairQueueingControlledDelay *fqcd; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ_CODEL, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fqcd = FQ_CODEL(qdisc); + + r = parse_tristate(rvalue, &fqcd->ecn); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_fair_queueing_controlled_delay_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueingControlledDelay *fqcd; + Network *network = ASSERT_PTR(data); + uint64_t sz; + uint32_t *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ_CODEL, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fqcd = FQ_CODEL(qdisc); + + if (STR_IN_SET(lvalue, "MemoryLimitBytes", "MemoryLimit")) + p = &fqcd->memory_limit; + else if (STR_IN_SET(lvalue, "QuantumBytes", "Quantum")) + p = &fqcd->quantum; + else + assert_not_reached(); + + if (isempty(rvalue)) { + if (STR_IN_SET(lvalue, "MemoryLimitBytes", "MemoryLimit")) + *p = UINT32_MAX; + else + *p = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_size(rvalue, 1024, &sz); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring 
assignment: %s", + lvalue, rvalue); + return 0; + } + if (sz >= UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified '%s=' is too large, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + *p = sz; + TAKE_PTR(qdisc); + + return 0; +} + +const QDiscVTable fq_codel_vtable = { + .object_size = sizeof(FairQueueingControlledDelay), + .tca_kind = "fq_codel", + .init = fair_queueing_controlled_delay_init, + .fill_message = fair_queueing_controlled_delay_fill_message, +}; diff --git a/src/network/tc/fq-codel.h b/src/network/tc/fq-codel.h new file mode 100644 index 0000000..2553c59 --- /dev/null +++ b/src/network/tc/fq-codel.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" +#include "time-util.h" + +typedef struct FairQueueingControlledDelay { + QDisc meta; + + uint32_t packet_limit; + uint32_t flows; + uint32_t quantum; + uint32_t memory_limit; + usec_t target_usec; + usec_t interval_usec; + usec_t ce_threshold_usec; + int ecn; +} FairQueueingControlledDelay; + +DEFINE_QDISC_CAST(FQ_CODEL, FairQueueingControlledDelay); +extern const QDiscVTable fq_codel_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_controlled_delay_u32); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_controlled_delay_usec); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_controlled_delay_bool); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_controlled_delay_size); diff --git a/src/network/tc/fq-pie.c b/src/network/tc/fq-pie.c new file mode 100644 index 0000000..c8b2e7b --- /dev/null +++ b/src/network/tc/fq-pie.c @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "fq-pie.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "string-util.h" + +static int fq_pie_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + FlowQueuePIE *fq_pie; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(fq_pie = FQ_PIE(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "fq_pie"); + if (r < 0) + return r; + + if (fq_pie->packet_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_PIE_LIMIT, fq_pie->packet_limit); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_fq_pie_packet_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FlowQueuePIE *fq_pie; + Network *network = ASSERT_PTR(data); + uint32_t val; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ_PIE, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + + fq_pie = FQ_PIE(qdisc); + + if (isempty(rvalue)) { + fq_pie->packet_limit = 0; + + qdisc = NULL; + return 0; + } + + r = safe_atou32(rvalue, &val); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (val == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + fq_pie->packet_limit = val; + qdisc = NULL; + + return 0; +} + +const QDiscVTable fq_pie_vtable = { + 
.object_size = sizeof(FlowQueuePIE), + .tca_kind = "fq_pie", + .fill_message = fq_pie_fill_message, +}; diff --git a/src/network/tc/fq-pie.h b/src/network/tc/fq-pie.h new file mode 100644 index 0000000..51fb626 --- /dev/null +++ b/src/network/tc/fq-pie.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + +typedef struct FlowQueuePIE { + QDisc meta; + + uint32_t packet_limit; +} FlowQueuePIE; + +DEFINE_QDISC_CAST(FQ_PIE, FlowQueuePIE); +extern const QDiscVTable fq_pie_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_fq_pie_packet_limit); diff --git a/src/network/tc/fq.c b/src/network/tc/fq.c new file mode 100644 index 0000000..74785c9 --- /dev/null +++ b/src/network/tc/fq.c @@ -0,0 +1,409 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "fq.h" +#include "logarithm.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" + +static int fair_queueing_init(QDisc *qdisc) { + FairQueueing *fq; + + assert(qdisc); + + fq = FQ(qdisc); + + fq->pacing = -1; + fq->ce_threshold_usec = USEC_INFINITY; + + return 0; +} + +static int fair_queueing_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + FairQueueing *fq; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(fq = FQ(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "fq"); + if (r < 0) + return r; + + if (fq->packet_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_PLIMIT, fq->packet_limit); + if (r < 0) + return r; + } + + if (fq->flow_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_FLOW_PLIMIT, fq->flow_limit); + if (r < 0) + return r; + } + + if (fq->quantum > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_QUANTUM, fq->quantum); + if (r < 0) + return r; + } + + if 
(fq->initial_quantum > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_INITIAL_QUANTUM, fq->initial_quantum); + if (r < 0) + return r; + } + + if (fq->pacing >= 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_RATE_ENABLE, fq->pacing); + if (r < 0) + return r; + } + + if (fq->max_rate > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_FLOW_MAX_RATE, fq->max_rate); + if (r < 0) + return r; + } + + if (fq->buckets > 0) { + uint32_t l; + + l = log2u(fq->buckets); + r = sd_netlink_message_append_u32(req, TCA_FQ_BUCKETS_LOG, l); + if (r < 0) + return r; + } + + if (fq->orphan_mask > 0) { + r = sd_netlink_message_append_u32(req, TCA_FQ_ORPHAN_MASK, fq->orphan_mask); + if (r < 0) + return r; + } + + if (fq->ce_threshold_usec != USEC_INFINITY) { + r = sd_netlink_message_append_u32(req, TCA_FQ_CE_THRESHOLD, fq->ce_threshold_usec); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_fair_queueing_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueing *fq; + Network *network = ASSERT_PTR(data); + uint32_t *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fq = FQ(qdisc); + + if (streq(lvalue, "PacketLimit")) + p = &fq->packet_limit; + else if (streq(lvalue, "FlowLimit")) + p = &fq->flow_limit; + else if (streq(lvalue, "Buckets")) + p = &fq->buckets; + else if (streq(lvalue, "OrphanMask")) + p = &fq->orphan_mask; + else + assert_not_reached(); + + if 
(isempty(rvalue)) { + *p = 0; + + qdisc = NULL; + return 0; + } + + r = safe_atou32(rvalue, p); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + qdisc = NULL; + + return 0; +} + +int config_parse_fair_queueing_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueing *fq; + Network *network = ASSERT_PTR(data); + uint64_t sz; + uint32_t *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fq = FQ(qdisc); + + if (STR_IN_SET(lvalue, "QuantumBytes", "Quantum")) + p = &fq->quantum; + else if (STR_IN_SET(lvalue, "InitialQuantumBytes", "InitialQuantum")) + p = &fq->initial_quantum; + else + assert_not_reached(); + + if (isempty(rvalue)) { + *p = 0; + + qdisc = NULL; + return 0; + } + + r = parse_size(rvalue, 1024, &sz); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (sz > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified '%s=' is too large, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + *p = sz; + qdisc = NULL; + + return 0; +} + +int config_parse_fair_queueing_bool( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + 
_cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueing *fq; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fq = FQ(qdisc); + + r = parse_tristate(rvalue, &fq->pacing); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + fq->pacing = r; + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_fair_queueing_usec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueing *fq; + Network *network = ASSERT_PTR(data); + usec_t sec; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fq = FQ(qdisc); + + if (isempty(rvalue)) { + fq->ce_threshold_usec = USEC_INFINITY; + + qdisc = NULL; + return 0; + } + + r = parse_sec(rvalue, &sec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (sec > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified '%s=' is too large, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + fq->ce_threshold_usec = sec; + qdisc = NULL; + + return 0; 
+} + +int config_parse_fair_queueing_max_rate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + FairQueueing *fq; + Network *network = ASSERT_PTR(data); + uint64_t sz; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_FQ, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + fq = FQ(qdisc); + + if (isempty(rvalue)) { + fq->max_rate = 0; + + qdisc = NULL; + return 0; + } + + r = parse_size(rvalue, 1000, &sz); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + if (sz / 8 > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified '%s=' is too large, ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + fq->max_rate = sz / 8; + qdisc = NULL; + + return 0; +} + +const QDiscVTable fq_vtable = { + .init = fair_queueing_init, + .object_size = sizeof(FairQueueing), + .tca_kind = "fq", + .fill_message = fair_queueing_fill_message, +}; diff --git a/src/network/tc/fq.h b/src/network/tc/fq.h new file mode 100644 index 0000000..77469c4 --- /dev/null +++ b/src/network/tc/fq.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. 
*/ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + +typedef struct FairQueueing { + QDisc meta; + + uint32_t packet_limit; + uint32_t flow_limit; + uint32_t quantum; + uint32_t initial_quantum; + uint32_t max_rate; + uint32_t buckets; + uint32_t orphan_mask; + int pacing; + usec_t ce_threshold_usec; +} FairQueueing; + +DEFINE_QDISC_CAST(FQ, FairQueueing); +extern const QDiscVTable fq_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_u32); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_size); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_bool); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_usec); +CONFIG_PARSER_PROTOTYPE(config_parse_fair_queueing_max_rate); diff --git a/src/network/tc/gred.c b/src/network/tc/gred.c new file mode 100644 index 0000000..2efb02c --- /dev/null +++ b/src/network/tc/gred.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "qdisc.h" +#include "string-util.h" + +static int generic_random_early_detection_init(QDisc *qdisc) { + GenericRandomEarlyDetection *gred; + + assert(qdisc); + + gred = GRED(qdisc); + + gred->grio = -1; + + return 0; +} + +static int generic_random_early_detection_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + GenericRandomEarlyDetection *gred; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(gred = GRED(qdisc)); + + const struct tc_gred_sopt opt = { + .DPs = gred->virtual_queues, + .def_DP = gred->default_virtual_queue, + .grio = gred->grio, + }; + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "gred"); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_GRED_DPS, &opt, sizeof(opt)); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +static int 
generic_random_early_detection_verify(QDisc *qdisc) { + GenericRandomEarlyDetection *gred = GRED(qdisc); + + if (gred->default_virtual_queue >= gred->virtual_queues) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: DefaultVirtualQueue= must be less than VirtualQueues=. " + "Ignoring [GenericRandomEarlyDetection] section from line %u.", + qdisc->section->filename, qdisc->section->line); + + return 0; +} + +/* Config parser shared by VirtualQueues= and DefaultVirtualQueue=; lvalue selects which field of + * GenericRandomEarlyDetection is written. + * An empty rvalue resets the field to 0; values that fail safe_atou32() or exceed MAX_DPs are + * logged and ignored. Returns log_oom() when qdisc_new_static() reports -ENOMEM and 0 in every + * other case. */ +int config_parse_generic_random_early_detection_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + GenericRandomEarlyDetection *gred; + Network *network = ASSERT_PTR(data); + uint32_t *p; + uint32_t v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_GRED, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + gred = GRED(qdisc); + + if (streq(lvalue, "VirtualQueues")) + p = &gred->virtual_queues; + else if (streq(lvalue, "DefaultVirtualQueue")) + p = &gred->default_virtual_queue; + else + assert_not_reached(); + + if (isempty(rvalue)) { + *p = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (v > MAX_DPs) { + /* Actually ignore the assignment, as the message says: without this return the + * out-of-range value was still stored below. */ + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + *p = v; + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_generic_random_early_detection_bool( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned
section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + GenericRandomEarlyDetection *gred; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_GRED, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + gred = GRED(qdisc); + + r = parse_tristate(rvalue, &gred->grio); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +const QDiscVTable gred_vtable = { + .object_size = sizeof(GenericRandomEarlyDetection), + .tca_kind = "gred", + .init = generic_random_early_detection_init, + .fill_message = generic_random_early_detection_fill_message, + .verify = generic_random_early_detection_verify, +}; diff --git a/src/network/tc/gred.h b/src/network/tc/gred.h new file mode 100644 index 0000000..c084ff1 --- /dev/null +++ b/src/network/tc/gred.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + /* Settings of one GRED qdisc section; grio is a tristate filled by parse_tristate() in gred.c. */ +typedef struct GenericRandomEarlyDetection { + QDisc meta; + + uint32_t virtual_queues; + uint32_t default_virtual_queue; + int grio; +} GenericRandomEarlyDetection; + +DEFINE_QDISC_CAST(GRED, GenericRandomEarlyDetection); +extern const QDiscVTable gred_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_generic_random_early_detection_u32); +CONFIG_PARSER_PROTOTYPE(config_parse_generic_random_early_detection_bool); diff --git a/src/network/tc/hhf.c b/src/network/tc/hhf.c new file mode 100644 index 0000000..d44522f --- /dev/null +++ b/src/network/tc/hhf.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ + +#include <linux/pkt_sched.h> + +#include "alloc-util.h" +#include "conf-parser.h" +#include "hhf.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "string-util.h" + /* fill_message hook of hhf_vtable: opens a TCA_OPTIONS container of kind "hhf" and appends TCA_HHF_BACKLOG_LIMIT only when packet_limit is non-zero. */ +static int heavy_hitter_filter_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + HeavyHitterFilter *hhf; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(hhf = HHF(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "hhf"); + if (r < 0) + return r; + + if (hhf->packet_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_HHF_BACKLOG_LIMIT, hhf->packet_limit); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + /* Config parser for the HHF packet limit: an empty value resets it to 0, otherwise the value is parsed with safe_atou32(); unparsable values are logged and ignored. */ +int config_parse_heavy_hitter_filter_packet_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + HeavyHitterFilter *hhf; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_HHF, network, filename, section_line, &qdisc); + if (r == -ENOMEM) +
return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + hhf = HHF(qdisc); + + if (isempty(rvalue)) { + hhf->packet_limit = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, &hhf->packet_limit); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + /* Hooks and kernel kind name ("hhf") for the heavy-hitter filter qdisc. */ +const QDiscVTable hhf_vtable = { + .object_size = sizeof(HeavyHitterFilter), + .tca_kind = "hhf", + .fill_message = heavy_hitter_filter_fill_message, +}; diff --git a/src/network/tc/hhf.h b/src/network/tc/hhf.h new file mode 100644 index 0000000..04caaa8 --- /dev/null +++ b/src/network/tc/hhf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + /* Settings of one HHF qdisc section; packet_limit == 0 means the attribute is not sent (see heavy_hitter_filter_fill_message()). */ +typedef struct HeavyHitterFilter { + QDisc meta; + + uint32_t packet_limit; +} HeavyHitterFilter; + +DEFINE_QDISC_CAST(HHF, HeavyHitterFilter); +extern const QDiscVTable hhf_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_heavy_hitter_filter_packet_limit); diff --git a/src/network/tc/htb.c b/src/network/tc/htb.c new file mode 100644 index 0000000..eb2c8cf --- /dev/null +++ b/src/network/tc/htb.c @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <linux/pkt_sched.h> + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "networkd-link.h" +#include "parse-util.h" +#include "qdisc.h" +#include "htb.h" +#include "string-util.h" +#include "tc-util.h" + +#define HTB_DEFAULT_RATE_TO_QUANTUM 10 +#define HTB_DEFAULT_MTU 1600 /* Ethernet packet length */ + /* fill_message hook of htb_vtable: appends a struct tc_htb_glob (version 3, rate2quantum, defcls) as TCA_HTB_INIT inside a TCA_OPTIONS container of kind "htb". */ +static int hierarchy_token_bucket_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + HierarchyTokenBucket *htb; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(htb =
HTB(qdisc)); + + struct tc_htb_glob opt = { + .version = 3, + .rate2quantum = htb->rate_to_quantum, + .defcls = htb->default_class, + }; + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "htb"); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_HTB_INIT, &opt, sizeof(opt)); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + return 0; +} + /* Config parser for the HTB default class: the value is parsed with safe_atou32_full() in base 16; an empty value resets default_class to 0. */ +int config_parse_hierarchy_token_bucket_default_class( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + HierarchyTokenBucket *htb; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_HTB, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + htb = HTB(qdisc); + + if (isempty(rvalue)) { + htb->default_class = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32_full(rvalue, 16, &htb->default_class); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + /* Config parser for rate_to_quantum: an empty value restores HTB_DEFAULT_RATE_TO_QUANTUM, otherwise the value is parsed with safe_atou32(). */ +int config_parse_hierarchy_token_bucket_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + HierarchyTokenBucket *htb; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r =
qdisc_new_static(QDISC_KIND_HTB, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + htb = HTB(qdisc); + + if (isempty(rvalue)) { + htb->rate_to_quantum = HTB_DEFAULT_RATE_TO_QUANTUM; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, &htb->rate_to_quantum); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + /* init hook of htb_vtable: starts rate_to_quantum at HTB_DEFAULT_RATE_TO_QUANTUM. */ +static int hierarchy_token_bucket_init(QDisc *qdisc) { + HierarchyTokenBucket *htb; + + assert(qdisc); + + htb = HTB(qdisc); + + htb->rate_to_quantum = HTB_DEFAULT_RATE_TO_QUANTUM; + + return 0; +} + /* Hooks and kernel kind name ("htb") for the HTB qdisc. */ +const QDiscVTable htb_vtable = { + .object_size = sizeof(HierarchyTokenBucket), + .tca_kind = "htb", + .fill_message = hierarchy_token_bucket_fill_message, + .init = hierarchy_token_bucket_init, +}; + /* fill_message hook of htb_tclass_vtable: builds struct tc_htb_opt plus rate/ceil tables and appends them inside a TCA_OPTIONS container of kind "htb". Rates of 2^32 or more are stored as ~0U in the 32-bit ratespec and additionally sent as TCA_HTB_RATE64 / TCA_HTB_CEIL64. */ +static int hierarchy_token_bucket_class_fill_message(Link *link, TClass *tclass, sd_netlink_message *req) { + HierarchyTokenBucketClass *htb; + uint32_t rtab[256], ctab[256]; + int r; + + assert(link); + assert(tclass); + assert(req); + + assert_se(htb = TCLASS_TO_HTB(tclass)); + + struct tc_htb_opt opt = { + .prio = htb->priority, + .quantum = htb->quantum, + .rate.rate = (htb->rate >= (1ULL << 32)) ? ~0U : htb->rate, + .ceil.rate = (htb->ceil_rate >= (1ULL << 32)) ?
~0U : htb->ceil_rate, + .rate.overhead = htb->overhead, + .ceil.overhead = htb->overhead, + }; + + r = tc_transmit_time(htb->rate, htb->buffer, &opt.buffer); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate buffer size: %m"); + + r = tc_transmit_time(htb->ceil_rate, htb->ceil_buffer, &opt.cbuffer); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate ceil buffer size: %m"); + + r = tc_fill_ratespec_and_table(&opt.rate, rtab, htb->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate rate table: %m"); + + r = tc_fill_ratespec_and_table(&opt.ceil, ctab, htb->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate ceil rate table: %m"); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "htb"); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_HTB_PARMS, &opt, sizeof(opt)); + if (r < 0) + return r; + /* The 64-bit attributes carry the real rate whenever it did not fit into the 32-bit ratespec above. */ + if (htb->rate >= (1ULL << 32)) { + r = sd_netlink_message_append_u64(req, TCA_HTB_RATE64, htb->rate); + if (r < 0) + return r; + } + + if (htb->ceil_rate >= (1ULL << 32)) { + r = sd_netlink_message_append_u64(req, TCA_HTB_CEIL64, htb->ceil_rate); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_data(req, TCA_HTB_RTAB, rtab, sizeof(rtab)); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_HTB_CTAB, ctab, sizeof(ctab)); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + /* Config parser for the HTB class priority: an empty value resets it to 0, otherwise the value is parsed with safe_atou32(). */ +int config_parse_hierarchy_token_bucket_class_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL; + HierarchyTokenBucketClass *htb; + Network *network = ASSERT_PTR(data); + uint32_t v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r =
tclass_new_static(TCLASS_KIND_HTB, network, filename, section_line, &tclass); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create traffic control class, ignoring assignment: %m"); + return 0; + } + + htb = TCLASS_TO_HTB(tclass); + + if (isempty(rvalue)) { + htb->priority = 0; + TAKE_PTR(tclass); + return 0; + } + + r = safe_atou32(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + htb->priority = v; + TAKE_PTR(tclass); + + return 0; +} + /* Config parser shared by QuantumBytes=, MTUBytes=, OverheadBytes=, BufferBytes= and CeilBufferBytes=. Values are parsed with parse_size() in base 1024 and rejected above UINT32_MAX (UINT16_MAX for OverheadBytes=). An empty value resets the field to 0, or to HTB_DEFAULT_MTU for MTUBytes=. */ +int config_parse_hierarchy_token_bucket_class_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL; + HierarchyTokenBucketClass *htb; + Network *network = ASSERT_PTR(data); + uint64_t v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tclass_new_static(TCLASS_KIND_HTB, network, filename, section_line, &tclass); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create traffic control class, ignoring assignment: %m"); + return 0; + } + + htb = TCLASS_TO_HTB(tclass); + + if (isempty(rvalue)) { + if (streq(lvalue, "QuantumBytes")) + htb->quantum = 0; + else if (streq(lvalue, "MTUBytes")) + htb->mtu = HTB_DEFAULT_MTU; + else if (streq(lvalue, "OverheadBytes")) + htb->overhead = 0; + else if (streq(lvalue, "BufferBytes")) + htb->buffer = 0; + else if (streq(lvalue, "CeilBufferBytes")) + htb->ceil_buffer = 0; + else + assert_not_reached(); + + TAKE_PTR(tclass); + return 0; + } + + r = parse_size(rvalue, 1024, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue);
+ return 0; + } + if ((streq(lvalue, "OverheadBytes") && v > UINT16_MAX) || v > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (streq(lvalue, "QuantumBytes")) + htb->quantum = v; + else if (streq(lvalue, "OverheadBytes")) + htb->overhead = v; + else if (streq(lvalue, "MTUBytes")) + htb->mtu = v; + else if (streq(lvalue, "BufferBytes")) + htb->buffer = v; + else if (streq(lvalue, "CeilBufferBytes")) + htb->ceil_buffer = v; + else + assert_not_reached(); + + TAKE_PTR(tclass); + + return 0; +} + /* Config parser shared by Rate= and CeilRate=: the value is parsed with parse_size() in base 1000 and then divided by 8 (presumably bits to bytes per second - TODO confirm). An empty value resets the field to 0. */ +int config_parse_hierarchy_token_bucket_class_rate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL; + HierarchyTokenBucketClass *htb; + Network *network = ASSERT_PTR(data); + uint64_t *v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tclass_new_static(TCLASS_KIND_HTB, network, filename, section_line, &tclass); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create traffic control class, ignoring assignment: %m"); + return 0; + } + + htb = TCLASS_TO_HTB(tclass); + if (streq(lvalue, "Rate")) + v = &htb->rate; + else if (streq(lvalue, "CeilRate")) + v = &htb->ceil_rate; + else + assert_not_reached(); + + if (isempty(rvalue)) { + *v = 0; + + TAKE_PTR(tclass); + return 0; + } + + r = parse_size(rvalue, 1000, v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + *v /= 8; + TAKE_PTR(tclass); + + return 0; +} + /* init hook of htb_tclass_vtable: starts mtu at HTB_DEFAULT_MTU. */ +static int hierarchy_token_bucket_class_init(TClass *tclass) { + HierarchyTokenBucketClass *htb; + + assert(tclass); + + htb = TCLASS_TO_HTB(tclass); + + htb->mtu =
HTB_DEFAULT_MTU; + + return 0; +} + /* verify hook of htb_tclass_vtable: Rate= is mandatory; a missing CeilRate= falls back to Rate=, and missing buffer sizes are derived as rate / hz + mtu with hz obtained from tc_init(). */ +static int hierarchy_token_bucket_class_verify(TClass *tclass) { + HierarchyTokenBucketClass *htb; + uint32_t hz; + int r; + + assert(tclass); + + htb = TCLASS_TO_HTB(tclass); + + if (htb->rate == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: Rate= is mandatory. " + "Ignoring [HierarchyTokenBucketClass] section from line %u.", + tclass->section->filename, tclass->section->line); + + /* if CeilRate= setting is missing, use the same as Rate= */ + if (htb->ceil_rate == 0) + htb->ceil_rate = htb->rate; + + r = tc_init(NULL, &hz); + if (r < 0) + return log_error_errno(r, "Failed to read /proc/net/psched: %m"); + + if (htb->buffer == 0) + htb->buffer = htb->rate / hz + htb->mtu; + if (htb->ceil_buffer == 0) + htb->ceil_buffer = htb->ceil_rate / hz + htb->mtu; + + return 0; +} + /* Hooks and kernel kind name ("htb") for the HTB traffic class. */ +const TClassVTable htb_tclass_vtable = { + .object_size = sizeof(HierarchyTokenBucketClass), + .tca_kind = "htb", + .fill_message = hierarchy_token_bucket_class_fill_message, + .init = hierarchy_token_bucket_class_init, + .verify = hierarchy_token_bucket_class_verify, +}; diff --git a/src/network/tc/htb.h b/src/network/tc/htb.h new file mode 100644 index 0000000..55644db --- /dev/null +++ b/src/network/tc/htb.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" +#include "tclass.h" + /* Qdisc-level HTB settings, sent to the kernel as struct tc_htb_glob by htb.c. */ +typedef struct HierarchyTokenBucket { + QDisc meta; + + uint32_t default_class; + uint32_t rate_to_quantum; +} HierarchyTokenBucket; + +DEFINE_QDISC_CAST(HTB, HierarchyTokenBucket); +extern const QDiscVTable htb_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_hierarchy_token_bucket_default_class); +CONFIG_PARSER_PROTOTYPE(config_parse_hierarchy_token_bucket_u32); + /* Per-class HTB settings; rate and ceil_rate are 64-bit, the remaining fields are sized to fit struct tc_htb_opt as filled in htb.c. */ +typedef struct HierarchyTokenBucketClass { + TClass meta; + + uint32_t priority; + uint32_t quantum; + uint32_t mtu; + uint16_t overhead; + uint64_t rate; + uint32_t buffer; + uint64_t ceil_rate; + uint32_t ceil_buffer; +}
HierarchyTokenBucketClass; + +DEFINE_TCLASS_CAST(HTB, HierarchyTokenBucketClass); +extern const TClassVTable htb_tclass_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_hierarchy_token_bucket_class_u32); +CONFIG_PARSER_PROTOTYPE(config_parse_hierarchy_token_bucket_class_size); +CONFIG_PARSER_PROTOTYPE(config_parse_hierarchy_token_bucket_class_rate); diff --git a/src/network/tc/netem.c b/src/network/tc/netem.c new file mode 100644 index 0000000..6a63221 --- /dev/null +++ b/src/network/tc/netem.c @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include <linux/pkt_sched.h> + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netem.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "qdisc.h" +#include "strv.h" +#include "tc-util.h" + /* fill_message hook of netem_vtable: appends a struct tc_netem_qopt as TCA_OPTIONS. A limit of 0 is sent as 1000; delay and jitter are converted with tc_time_to_tick() unless they are USEC_INFINITY. */ +static int network_emulator_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + NetworkEmulator *ne; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(ne = NETEM(qdisc)); + + struct tc_netem_qopt opt = { + .limit = ne->limit > 0 ?
ne->limit : 1000, + .loss = ne->loss, + .duplicate = ne->duplicate, + }; + + if (ne->delay != USEC_INFINITY) { + r = tc_time_to_tick(ne->delay, &opt.latency); + if (r < 0) + return log_link_error_errno(link, r, "Failed to calculate latency in TCA_OPTION: %m"); + } + + if (ne->jitter != USEC_INFINITY) { + r = tc_time_to_tick(ne->jitter, &opt.jitter); + if (r < 0) + return log_link_error_errno(link, r, "Failed to calculate jitter in TCA_OPTION: %m"); + } + + r = sd_netlink_message_append_data(req, TCA_OPTIONS, &opt, sizeof(opt)); + if (r < 0) + return r; + + return 0; +} + /* Config parser shared by the delay and delay-jitter settings (both the short and the NetworkEmulator-prefixed names): parses a time span with parse_sec(); an empty value resets the field to USEC_INFINITY. */ +int config_parse_network_emulator_delay( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + NetworkEmulator *ne; + usec_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_NETEM, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + ne = NETEM(qdisc); + + if (isempty(rvalue)) { + if (STR_IN_SET(lvalue, "DelaySec", "NetworkEmulatorDelaySec")) + ne->delay = USEC_INFINITY; + else if (STR_IN_SET(lvalue, "DelayJitterSec", "NetworkEmulatorDelayJitterSec")) + ne->jitter = USEC_INFINITY; + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_sec(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (STR_IN_SET(lvalue, "DelaySec", "NetworkEmulatorDelaySec")) + ne->delay = u; + else if (STR_IN_SET(lvalue, "DelayJitterSec", "NetworkEmulatorDelayJitterSec")) + ne->jitter = u; + + TAKE_PTR(qdisc); + + return 0;
+} + /* Config parser shared by the loss-rate and duplicate-rate settings: parses the value with parse_tc_percent(); an empty value resets the field to 0. */ +int config_parse_network_emulator_rate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + NetworkEmulator *ne; + uint32_t rate; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_NETEM, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + ne = NETEM(qdisc); + + if (isempty(rvalue)) { + if (STR_IN_SET(lvalue, "LossRate", "NetworkEmulatorLossRate")) + ne->loss = 0; + else if (STR_IN_SET(lvalue, "DuplicateRate", "NetworkEmulatorDuplicateRate")) + ne->duplicate = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_tc_percent(rvalue, &rate); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (STR_IN_SET(lvalue, "LossRate", "NetworkEmulatorLossRate")) + ne->loss = rate; + else if (STR_IN_SET(lvalue, "DuplicateRate", "NetworkEmulatorDuplicateRate")) + ne->duplicate = rate; + + TAKE_PTR(qdisc); + return 0; +} + /* Config parser for the netem packet limit: an empty value resets it to 0 (sent as 1000 by network_emulator_fill_message()), otherwise the value is parsed with safe_atou(). */ +int config_parse_network_emulator_packet_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + NetworkEmulator *ne; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_NETEM, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r
< 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + ne = NETEM(qdisc); + + if (isempty(rvalue)) { + ne->limit = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou(rvalue, &ne->limit); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + return 0; +} + /* Hooks and kernel kind name ("netem") for the network emulator qdisc. */ +const QDiscVTable netem_vtable = { + .object_size = sizeof(NetworkEmulator), + .tca_kind = "netem", + .fill_message = network_emulator_fill_message, +}; diff --git a/src/network/tc/netem.h b/src/network/tc/netem.h new file mode 100644 index 0000000..d58d5ac --- /dev/null +++ b/src/network/tc/netem.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" +#include "time-util.h" + /* Settings of one netem qdisc section; netem.c skips delay and jitter when they are USEC_INFINITY and sends a limit of 0 as 1000. */ +typedef struct NetworkEmulator { + QDisc meta; + + usec_t delay; + usec_t jitter; + + uint32_t limit; + uint32_t loss; + uint32_t duplicate; +} NetworkEmulator; + +DEFINE_QDISC_CAST(NETEM, NetworkEmulator); +extern const QDiscVTable netem_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_network_emulator_delay); +CONFIG_PARSER_PROTOTYPE(config_parse_network_emulator_rate); +CONFIG_PARSER_PROTOTYPE(config_parse_network_emulator_packet_limit); diff --git a/src/network/tc/pie.c b/src/network/tc/pie.c new file mode 100644 index 0000000..c9b171b --- /dev/null +++ b/src/network/tc/pie.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc.
*/ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "pie.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "string-util.h" + +static int pie_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + ProportionalIntegralControllerEnhanced *pie; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(pie = PIE(qdisc)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "pie"); + if (r < 0) + return r; + + if (pie->packet_limit > 0) { + r = sd_netlink_message_append_u32(req, TCA_PIE_LIMIT, pie->packet_limit); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_pie_packet_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + ProportionalIntegralControllerEnhanced *pie; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_PIE, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + pie = PIE(qdisc); + + if (isempty(rvalue)) { + pie->packet_limit = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, &pie->packet_limit); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +const QDiscVTable pie_vtable = { + .object_size = sizeof(ProportionalIntegralControllerEnhanced), + .tca_kind = "pie", + .fill_message = pie_fill_message, +}; diff --git a/src/network/tc/pie.h 
b/src/network/tc/pie.h new file mode 100644 index 0000000..40a114e --- /dev/null +++ b/src/network/tc/pie.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + /* Settings of one PIE qdisc section. */ +typedef struct ProportionalIntegralControllerEnhanced { + QDisc meta; + + uint32_t packet_limit; +} ProportionalIntegralControllerEnhanced; + +DEFINE_QDISC_CAST(PIE, ProportionalIntegralControllerEnhanced); +extern const QDiscVTable pie_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_pie_packet_limit); diff --git a/src/network/tc/qdisc.c b/src/network/tc/qdisc.c new file mode 100644 index 0000000..f20f410 --- /dev/null +++ b/src/network/tc/qdisc.c @@ -0,0 +1,715 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include <linux/pkt_sched.h> + +#include "alloc-util.h" +#include "conf-parser.h" +#include "in-addr-util.h" +#include "netlink-util.h" +#include "networkd-link.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "networkd-queue.h" +#include "parse-util.h" +#include "qdisc.h" +#include "set.h" +#include "string-util.h" +#include "strv.h" +#include "tc-util.h" + /* Per-kind vtable lookup, indexed by QDiscKind. */ +const QDiscVTable * const qdisc_vtable[_QDISC_KIND_MAX] = { + [QDISC_KIND_BFIFO] = &bfifo_vtable, + [QDISC_KIND_CAKE] = &cake_vtable, + [QDISC_KIND_CODEL] = &codel_vtable, + [QDISC_KIND_DRR] = &drr_vtable, + [QDISC_KIND_ETS] = &ets_vtable, + [QDISC_KIND_FQ] = &fq_vtable, + [QDISC_KIND_FQ_CODEL] = &fq_codel_vtable, + [QDISC_KIND_FQ_PIE] = &fq_pie_vtable, + [QDISC_KIND_GRED] = &gred_vtable, + [QDISC_KIND_HHF] = &hhf_vtable, + [QDISC_KIND_HTB] = &htb_vtable, + [QDISC_KIND_NETEM] = &netem_vtable, + [QDISC_KIND_PIE] = &pie_vtable, + [QDISC_KIND_QFQ] = &qfq_vtable, + [QDISC_KIND_PFIFO] = &pfifo_vtable, + [QDISC_KIND_PFIFO_FAST] = &pfifo_fast_vtable, + [QDISC_KIND_PFIFO_HEAD_DROP] = &pfifo_head_drop_vtable, + [QDISC_KIND_SFB] = &sfb_vtable, + [QDISC_KIND_SFQ] = &sfq_vtable, + [QDISC_KIND_TBF] =
&tbf_vtable, + [QDISC_KIND_TEQL] = &teql_vtable, +}; + /* Allocates a QDisc of the given kind with parent TC_H_ROOT. For _QDISC_KIND_INVALID a plain QDisc is allocated; otherwise a zeroed object of the kind's object_size is allocated and the kind's init hook, if any, is run. Returns 0 and stores the object in *ret, or a negative error. */ +static int qdisc_new(QDiscKind kind, QDisc **ret) { + _cleanup_(qdisc_freep) QDisc *qdisc = NULL; + int r; + + if (kind == _QDISC_KIND_INVALID) { + qdisc = new(QDisc, 1); + if (!qdisc) + return -ENOMEM; + + *qdisc = (QDisc) { + .parent = TC_H_ROOT, + .kind = kind, + }; + } else { + assert(kind >= 0 && kind < _QDISC_KIND_MAX); + qdisc = malloc0(qdisc_vtable[kind]->object_size); + if (!qdisc) + return -ENOMEM; + + qdisc->parent = TC_H_ROOT; + qdisc->kind = kind; + + if (QDISC_VTABLE(qdisc)->init) { + r = QDISC_VTABLE(qdisc)->init(qdisc); + if (r < 0) + return r; + } + } + + *ret = TAKE_PTR(qdisc); + + return 0; +} + /* Returns the QDisc registered in network->qdiscs_by_section for (filename, section_line), creating it when missing. An existing entry is reused when its kind matches or when the caller passes _QDISC_KIND_INVALID; an existing entry of invalid kind is replaced by a new object of the requested kind that inherits its handle, parent and tca_kind. Returns -EINVAL when the section already holds a different valid kind. */ +int qdisc_new_static(QDiscKind kind, Network *network, const char *filename, unsigned section_line, QDisc **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(qdisc_freep) QDisc *qdisc = NULL; + QDisc *existing; + int r; + + assert(network); + assert(ret); + assert(filename); + assert(section_line > 0); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + existing = hashmap_get(network->qdiscs_by_section, n); + if (existing) { + if (existing->kind != _QDISC_KIND_INVALID && + kind != _QDISC_KIND_INVALID && + existing->kind != kind) + return -EINVAL; + + if (existing->kind == kind || kind == _QDISC_KIND_INVALID) { + *ret = existing; + return 0; + } + } + + r = qdisc_new(kind, &qdisc); + if (r < 0) + return r; + + if (existing) { + qdisc->handle = existing->handle; + qdisc->parent = existing->parent; + qdisc->tca_kind = TAKE_PTR(existing->tca_kind); + + qdisc_free(existing); + } + + qdisc->network = network; + qdisc->section = TAKE_PTR(n); + qdisc->source = NETWORK_CONFIG_SOURCE_STATIC; + + r = hashmap_ensure_put(&network->qdiscs_by_section, &config_section_hash_ops, qdisc->section, qdisc); + if (r < 0) + return r; + + *ret = TAKE_PTR(qdisc); + return 0; +} + /* Unregisters the qdisc from its network's section map and from its link's set (when set), then frees the section, tca_kind and the object. Accepts NULL; always returns NULL. */ +QDisc* qdisc_free(QDisc *qdisc) { + if (!qdisc) + return NULL; + + if (qdisc->network
&& qdisc->section) + hashmap_remove(qdisc->network->qdiscs_by_section, qdisc->section); + + config_section_free(qdisc->section); + + if (qdisc->link) + set_remove(qdisc->link->qdiscs, qdisc); + + free(qdisc->tca_kind); + return mfree(qdisc); +} + /* Returns the kind string from the vtable when the qdisc has one with tca_kind set, otherwise the qdisc's own tca_kind field. */ +static const char *qdisc_get_tca_kind(const QDisc *qdisc) { + assert(qdisc); + + return (QDISC_VTABLE(qdisc) && QDISC_VTABLE(qdisc)->tca_kind) ? + QDISC_VTABLE(qdisc)->tca_kind : qdisc->tca_kind; +} + /* Hashes the identity of a qdisc: handle, parent and kind string. Must stay in sync with qdisc_compare_func(). */ +static void qdisc_hash_func(const QDisc *qdisc, struct siphash *state) { + assert(qdisc); + assert(state); + + siphash24_compress(&qdisc->handle, sizeof(qdisc->handle), state); + siphash24_compress(&qdisc->parent, sizeof(qdisc->parent), state); + siphash24_compress_string(qdisc_get_tca_kind(qdisc), state); +} + /* Orders qdiscs by handle, then parent, then kind string. */ +static int qdisc_compare_func(const QDisc *a, const QDisc *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->handle, b->handle); + if (r != 0) + return r; + + r = CMP(a->parent, b->parent); + if (r != 0) + return r; + + return strcmp_ptr(qdisc_get_tca_kind(a), qdisc_get_tca_kind(b)); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + qdisc_hash_ops, + QDisc, + qdisc_hash_func, + qdisc_compare_func, + qdisc_free); + /* Looks up the entry of link->qdiscs that compares equal to 'in'. Returns 0 and optionally stores it in *ret, or -ENOENT. */ +static int qdisc_get(Link *link, const QDisc *in, QDisc **ret) { + QDisc *existing; + + assert(link); + assert(in); + + existing = set_get(link->qdiscs, in); + if (!existing) + return -ENOENT; + + if (ret) + *ret = existing; + return 0; +} + /* Inserts the qdisc into link->qdiscs and records the link on it. Returns -EEXIST when an equal entry is already present. */ +static int qdisc_add(Link *link, QDisc *qdisc) { + int r; + + assert(link); + assert(qdisc); + + r = set_ensure_put(&link->qdiscs, &qdisc_hash_ops, qdisc); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + + qdisc->link = link; + return 0; +} + /* Duplicates a qdisc (using the kind's object_size when a vtable exists). The copy is detached: network, section and link are cleared and tca_kind is deep-copied. */ +static int qdisc_dup(const QDisc *src, QDisc **ret) { + _cleanup_(qdisc_freep) QDisc *dst = NULL; + + assert(src); + assert(ret); + + if (QDISC_VTABLE(src)) + dst = memdup(src, QDISC_VTABLE(src)->object_size); + else + dst = newdup(QDisc, src, 1); + if (!dst) + return -ENOMEM; + + /* clear all pointers */ +
dst->network = NULL; + dst->section = NULL; + dst->link = NULL; + dst->tca_kind = NULL; + + if (src->tca_kind) { + dst->tca_kind = strdup(src->tca_kind); + if (!dst->tca_kind) + return -ENOMEM; + } + + *ret = TAKE_PTR(dst); + return 0; +} + /* Emits one debug line describing the qdisc (source, state, handle, parent, kind) prefixed by 'str'; does nothing unless debug logging is enabled. */ +static void log_qdisc_debug(QDisc *qdisc, Link *link, const char *str) { + _cleanup_free_ char *state = NULL; + + assert(qdisc); + assert(str); + + if (!DEBUG_LOGGING) + return; + + (void) network_config_state_to_string_alloc(qdisc->state, &state); + + log_link_debug(link, "%s %s QDisc (%s): handle=%"PRIx32":%"PRIx32", parent=%"PRIx32":%"PRIx32", kind=%s", + str, strna(network_config_source_to_string(qdisc->source)), strna(state), + TC_H_MAJ(qdisc->handle) >> 16, TC_H_MIN(qdisc->handle), + TC_H_MAJ(qdisc->parent) >> 16, TC_H_MIN(qdisc->parent), + strna(qdisc_get_tca_kind(qdisc))); +} + /* Finds a qdisc on the link with the given handle for which qdisc_exists() holds and, when 'kind' is non-NULL, whose kind string matches. Returns 0 and optionally stores it in *ret, or -ENOENT. */ +int link_find_qdisc(Link *link, uint32_t handle, const char *kind, QDisc **ret) { + QDisc *qdisc; + + assert(link); + + SET_FOREACH(qdisc, link->qdiscs) { + if (qdisc->handle != handle) + continue; + + if (!qdisc_exists(qdisc)) + continue; + + if (kind && !streq_ptr(kind, qdisc_get_tca_kind(qdisc))) + continue; + + if (ret) + *ret = qdisc; + return 0; + } + + return -ENOENT; +} + /* Marks the qdisc as removed, after dropping the classes whose major classid equals its handle. The object is freed (and NULL returned) only when its state is 0 afterwards; otherwise it is returned unchanged. */ +QDisc* qdisc_drop(QDisc *qdisc) { + TClass *tclass; + Link *link; + + assert(qdisc); + + link = ASSERT_PTR(qdisc->link); + + /* also drop all child classes assigned to the qdisc.
*/ + SET_FOREACH(tclass, link->tclasses) { + if (TC_H_MAJ(tclass->classid) != qdisc->handle) + continue; + + tclass_drop(tclass); + } + + qdisc_enter_removed(qdisc); + + if (qdisc->state == 0) { + log_qdisc_debug(qdisc, link, "Forgetting"); + qdisc = qdisc_free(qdisc); + } else + log_qdisc_debug(qdisc, link, "Removed"); + + return qdisc; +} + /* Netlink reply handler for a qdisc request: any error other than -EEXIST fails the link. Once link->tc_messages has dropped to 0, traffic control is marked configured. Always returns 1. */ +static int qdisc_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, QDisc *qdisc) { + int r; + + assert(m); + assert(link); + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_message_error_errno(link, m, r, "Could not set QDisc"); + link_enter_failed(link); + return 1; + } + + if (link->tc_messages == 0) { + log_link_debug(link, "Traffic control configured"); + link->tc_configured = true; + link_check_ready(link); + } + + return 1; +} + /* Builds an RTM_NEWQDISC message (TCA_KIND plus the kind's fill_message payload, if any) and submits it asynchronously for 'req'. */ +static int qdisc_configure(QDisc *qdisc, Link *link, Request *req) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(qdisc); + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(link->ifindex > 0); + assert(req); + + log_qdisc_debug(qdisc, link, "Configuring"); + + r = sd_rtnl_message_new_traffic_control(link->manager->rtnl, &m, RTM_NEWQDISC, + link->ifindex, qdisc->handle, qdisc->parent); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, TCA_KIND, qdisc_get_tca_kind(qdisc)); + if (r < 0) + return r; + + if (QDISC_VTABLE(qdisc) && QDISC_VTABLE(qdisc)->fill_message) { + r = QDISC_VTABLE(qdisc)->fill_message(link, qdisc, m); + if (r < 0) + return r; + } + + return request_call_netlink_async(link->manager->rtnl, m, req); +} + /* A qdisc can be configured once the link is configuring or configured, its parent (a qdisc when the parent's minor number is 0, a class otherwise) is known on the link unless the parent is TC_H_ROOT or TC_H_CLSACT, and the kind's is_ready hook, if any, returns a positive value. */ +static bool qdisc_is_ready_to_configure(QDisc *qdisc, Link *link) { + assert(qdisc); + assert(link); + + if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED)) + return false; + + /* TC_H_CLSACT == TC_H_INGRESS */ + if (!IN_SET(qdisc->parent, TC_H_ROOT, TC_H_CLSACT)) { + if (TC_H_MIN(qdisc->parent) == 0) { + if
(link_find_qdisc(link, qdisc->parent, NULL, NULL) < 0) + return false; + } else { + if (link_find_tclass(link, qdisc->parent, NULL) < 0) + return false; + } + } + + if (QDISC_VTABLE(qdisc) && + QDISC_VTABLE(qdisc)->is_ready && + QDISC_VTABLE(qdisc)->is_ready(qdisc, link) <= 0) + return false; + + return true; +} + /* Request callback: returns 0 while the qdisc is not ready to configure, 1 after the netlink request has been sent, or a negative error when configuring failed. */ +static int qdisc_process_request(Request *req, Link *link, QDisc *qdisc) { + int r; + + assert(req); + assert(link); + assert(qdisc); + + if (!qdisc_is_ready_to_configure(qdisc, link)) + return 0; + + r = qdisc_configure(qdisc, link, req); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to configure QDisc: %m"); + + qdisc_enter_configuring(qdisc); + return 1; +} + /* Queues a configuration request for the qdisc on the link. When the link does not yet track an equal qdisc, a detached copy is stored on the link first; otherwise only the source of the tracked entry is updated. Returns 1 when a request was queued, 0 when link_queue_request_safe() returned 0, or a negative error. */ +int link_request_qdisc(Link *link, QDisc *qdisc) { + QDisc *existing; + int r; + + assert(link); + assert(qdisc); + + if (qdisc_get(link, qdisc, &existing) < 0) { + _cleanup_(qdisc_freep) QDisc *tmp = NULL; + + r = qdisc_dup(qdisc, &tmp); + if (r < 0) + return log_oom(); + + r = qdisc_add(link, tmp); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to store QDisc: %m"); + + existing = TAKE_PTR(tmp); + } else + existing->source = qdisc->source; + + log_qdisc_debug(existing, link, "Requesting"); + r = link_queue_request_safe(link, REQUEST_TYPE_TC_QDISC, + existing, NULL, + qdisc_hash_func, + qdisc_compare_func, + qdisc_process_request, + &link->tc_messages, + qdisc_handler, + NULL); + if (r < 0) + return log_link_warning_errno(link, r, "Failed to request QDisc: %m"); + if (r == 0) + return 0; + + qdisc_enter_requesting(existing); + return 1; +} + +int manager_rtnl_process_qdisc(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { + _cleanup_(qdisc_freep) QDisc *tmp = NULL; + QDisc *qdisc = NULL; + Link *link; + uint16_t type; + int ifindex, r; + + assert(rtnl); + assert(message); + assert(m); + + if (sd_netlink_message_is_error(message)) { + r = sd_netlink_message_get_errno(message); + if (r < 0) + log_message_warning_errno(message, r, "rtnl: failed to receive
QDisc message, ignoring"); + + return 0; + } + + r = sd_netlink_message_get_type(message, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get message type, ignoring: %m"); + return 0; + } else if (!IN_SET(type, RTM_NEWQDISC, RTM_DELQDISC)) { + log_warning("rtnl: received unexpected message type %u when processing QDisc, ignoring.", type); + return 0; + } + + r = sd_rtnl_message_traffic_control_get_ifindex(message, &ifindex); + if (r < 0) { + log_warning_errno(r, "rtnl: could not get ifindex from message, ignoring: %m"); + return 0; + } else if (ifindex <= 0) { + log_warning("rtnl: received QDisc message with invalid ifindex %d, ignoring.", ifindex); + return 0; + } + + if (link_get_by_index(m, ifindex, &link) < 0) { + if (!m->enumerating) + log_warning("rtnl: received QDisc for link '%d' we don't know about, ignoring.", ifindex); + return 0; + } + + r = qdisc_new(_QDISC_KIND_INVALID, &tmp); + if (r < 0) + return log_oom(); + + r = sd_rtnl_message_traffic_control_get_handle(message, &tmp->handle); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received QDisc message without handle, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_traffic_control_get_parent(message, &tmp->parent); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received QDisc message without parent, ignoring: %m"); + return 0; + } + + r = sd_netlink_message_read_string_strdup(message, TCA_KIND, &tmp->tca_kind); + if (r < 0) { + log_link_warning_errno(link, r, "rtnl: received QDisc message without kind, ignoring: %m"); + return 0; + } + + (void) qdisc_get(link, tmp, &qdisc); + + switch (type) { + case RTM_NEWQDISC: + if (qdisc) { + qdisc_enter_configured(qdisc); + log_qdisc_debug(qdisc, link, "Received remembered"); + } else { + qdisc_enter_configured(tmp); + log_qdisc_debug(tmp, link, "Received new"); + + r = qdisc_add(link, tmp); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to remember QDisc, ignoring: %m"); + return 0; + } + + qdisc = 
TAKE_PTR(tmp); + } + + if (!m->enumerating) { + /* Some kind of QDisc (e.g. tbf) also create an implicit class under the qdisc, but + * the kernel may not notify about the class. Hence, we need to enumerate classes. */ + r = link_enumerate_tclass(link, qdisc->handle); + if (r < 0) + log_link_warning_errno(link, r, "Failed to enumerate TClass, ignoring: %m"); + } + + break; + + case RTM_DELQDISC: + if (qdisc) + qdisc_drop(qdisc); + else + log_qdisc_debug(tmp, link, "Kernel removed unknown"); + + break; + + default: + assert_not_reached(); + } + + return 1; +} + +static int qdisc_section_verify(QDisc *qdisc, bool *has_root, bool *has_clsact) { + int r; + + assert(qdisc); + assert(has_root); + assert(has_clsact); + + if (section_is_invalid(qdisc->section)) + return -EINVAL; + + if (QDISC_VTABLE(qdisc) && QDISC_VTABLE(qdisc)->verify) { + r = QDISC_VTABLE(qdisc)->verify(qdisc); + if (r < 0) + return r; + } + + if (qdisc->parent == TC_H_ROOT) { + if (*has_root) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: More than one root qdisc section is defined. " + "Ignoring the qdisc section from line %u.", + qdisc->section->filename, qdisc->section->line); + *has_root = true; + } else if (qdisc->parent == TC_H_CLSACT) { /* TC_H_CLSACT == TC_H_INGRESS */ + if (*has_clsact) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: More than one clsact or ingress qdisc section is defined. 
" + "Ignoring the qdisc section from line %u.", + qdisc->section->filename, qdisc->section->line); + *has_clsact = true; + } + + return 0; +} + +void network_drop_invalid_qdisc(Network *network) { + bool has_root = false, has_clsact = false; + QDisc *qdisc; + + assert(network); + + HASHMAP_FOREACH(qdisc, network->qdiscs_by_section) + if (qdisc_section_verify(qdisc, &has_root, &has_clsact) < 0) + qdisc_free(qdisc); +} + +int config_parse_qdisc_parent( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(ltype, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + if (streq(rvalue, "root")) + qdisc->parent = TC_H_ROOT; + else if (streq(rvalue, "clsact")) { + qdisc->parent = TC_H_CLSACT; + qdisc->handle = TC_H_MAKE(TC_H_CLSACT, 0); + } else if (streq(rvalue, "ingress")) { + qdisc->parent = TC_H_INGRESS; + qdisc->handle = TC_H_MAKE(TC_H_INGRESS, 0); + } else { + r = parse_handle(rvalue, &qdisc->parent); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse 'Parent=', ignoring assignment: %s", + rvalue); + return 0; + } + } + + if (STR_IN_SET(rvalue, "clsact", "ingress")) { + r = free_and_strdup(&qdisc->tca_kind, rvalue); + if (r < 0) + return log_oom(); + } else + qdisc->tca_kind = mfree(qdisc->tca_kind); + + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_qdisc_handle( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + 
const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + uint16_t n; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(ltype, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + if (isempty(rvalue)) { + qdisc->handle = TC_H_UNSPEC; + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou16_full(rvalue, 16, &n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse 'Handle=', ignoring assignment: %s", + rvalue); + return 0; + } + + qdisc->handle = (uint32_t) n << 16; + TAKE_PTR(qdisc); + + return 0; +} diff --git a/src/network/tc/qdisc.h b/src/network/tc/qdisc.h new file mode 100644 index 0000000..a62b941 --- /dev/null +++ b/src/network/tc/qdisc.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. 
*/ +#pragma once + +#include "conf-parser.h" +#include "networkd-util.h" + +typedef struct Link Link; +typedef struct Manager Manager; +typedef struct Network Network; + +typedef enum QDiscKind { + QDISC_KIND_BFIFO, + QDISC_KIND_CAKE, + QDISC_KIND_CODEL, + QDISC_KIND_DRR, + QDISC_KIND_ETS, + QDISC_KIND_FQ, + QDISC_KIND_FQ_CODEL, + QDISC_KIND_FQ_PIE, + QDISC_KIND_GRED, + QDISC_KIND_HHF, + QDISC_KIND_HTB, + QDISC_KIND_NETEM, + QDISC_KIND_PFIFO, + QDISC_KIND_PFIFO_FAST, + QDISC_KIND_PFIFO_HEAD_DROP, + QDISC_KIND_PIE, + QDISC_KIND_QFQ, + QDISC_KIND_SFB, + QDISC_KIND_SFQ, + QDISC_KIND_TBF, + QDISC_KIND_TEQL, + _QDISC_KIND_MAX, + _QDISC_KIND_INVALID = -EINVAL, +} QDiscKind; + +typedef struct QDisc { + Link *link; + Network *network; + ConfigSection *section; + NetworkConfigSource source; + NetworkConfigState state; + + uint32_t handle; + uint32_t parent; + + char *tca_kind; + QDiscKind kind; +} QDisc; + +typedef struct QDiscVTable { + size_t object_size; + const char *tca_kind; + /* called in qdisc_new() */ + int (*init)(QDisc *qdisc); + int (*fill_message)(Link *link, QDisc *qdisc, sd_netlink_message *m); + int (*verify)(QDisc *qdisc); + int (*is_ready)(QDisc *qdisc, Link *link); +} QDiscVTable; + +extern const QDiscVTable * const qdisc_vtable[_QDISC_KIND_MAX]; + +#define QDISC_VTABLE(q) ((q)->kind != _QDISC_KIND_INVALID ? 
qdisc_vtable[(q)->kind] : NULL) + +/* For casting a qdisc into the various qdisc kinds */ +#define DEFINE_QDISC_CAST(UPPERCASE, MixedCase) \ + static inline MixedCase* UPPERCASE(QDisc *q) { \ + if (_unlikely_(!q || q->kind != QDISC_KIND_##UPPERCASE)) \ + return NULL; \ + \ + return (MixedCase*) q; \ + } + +DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(QDisc, qdisc); + +QDisc* qdisc_free(QDisc *qdisc); +int qdisc_new_static(QDiscKind kind, Network *network, const char *filename, unsigned section_line, QDisc **ret); + +QDisc* qdisc_drop(QDisc *qdisc); + +int link_find_qdisc(Link *link, uint32_t handle, const char *kind, QDisc **qdisc); + +int link_request_qdisc(Link *link, QDisc *qdisc); + +void network_drop_invalid_qdisc(Network *network); + +int manager_rtnl_process_qdisc(sd_netlink *rtnl, sd_netlink_message *message, Manager *m); + +DEFINE_SECTION_CLEANUP_FUNCTIONS(QDisc, qdisc_free); + +CONFIG_PARSER_PROTOTYPE(config_parse_qdisc_parent); +CONFIG_PARSER_PROTOTYPE(config_parse_qdisc_handle); + +#include "cake.h" +#include "codel.h" +#include "ets.h" +#include "fifo.h" +#include "fq-codel.h" +#include "fq-pie.h" +#include "fq.h" +#include "gred.h" +#include "hhf.h" +#include "htb.h" +#include "pie.h" +#include "qfq.h" +#include "netem.h" +#include "drr.h" +#include "sfb.h" +#include "sfq.h" +#include "tbf.h" +#include "teql.h" diff --git a/src/network/tc/qfq.c b/src/network/tc/qfq.c new file mode 100644 index 0000000..7702e6f --- /dev/null +++ b/src/network/tc/qfq.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. 
*/ + +#include + +#include "parse-util.h" +#include "qdisc.h" +#include "qfq.h" +#include "string-util.h" + +#define QFQ_MAX_WEIGHT (1 << 10) +#define QFQ_MIN_MAX_PACKET 512 +#define QFQ_MAX_MAX_PACKET (1 << 16) + +const QDiscVTable qfq_vtable = { + .object_size = sizeof(QuickFairQueueing), + .tca_kind = "qfq", +}; + +static int quick_fair_queueing_class_fill_message(Link *link, TClass *tclass, sd_netlink_message *req) { + QuickFairQueueingClass *qfq; + int r; + + assert(link); + assert(tclass); + assert(req); + + assert_se(qfq = TCLASS_TO_QFQ(tclass)); + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "qfq"); + if (r < 0) + return r; + + if (qfq->weight > 0) { + r = sd_netlink_message_append_u32(req, TCA_QFQ_WEIGHT, qfq->weight); + if (r < 0) + return r; + } + + if (qfq->max_packet > 0) { + r = sd_netlink_message_append_u32(req, TCA_QFQ_LMAX, qfq->max_packet); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_quick_fair_queueing_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL; + QuickFairQueueingClass *qfq; + Network *network = ASSERT_PTR(data); + uint32_t v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tclass_new_static(TCLASS_KIND_QFQ, network, filename, section_line, &tclass); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create traffic control class, ignoring assignment: %m"); + return 0; + } + + qfq = TCLASS_TO_QFQ(tclass); + + if (isempty(rvalue)) { + qfq->weight = 0; + TAKE_PTR(tclass); + return 0; + } + + r = safe_atou32(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', 
ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (v == 0 || v > QFQ_MAX_WEIGHT) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + qfq->weight = v; + TAKE_PTR(tclass); + + return 0; +} + +int config_parse_quick_fair_queueing_max_packet( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL; + QuickFairQueueingClass *qfq; + Network *network = ASSERT_PTR(data); + uint64_t v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tclass_new_static(TCLASS_KIND_QFQ, network, filename, section_line, &tclass); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to create traffic control class, ignoring assignment: %m"); + return 0; + } + + qfq = TCLASS_TO_QFQ(tclass); + + if (isempty(rvalue)) { + qfq->max_packet = 0; + TAKE_PTR(tclass); + return 0; + } + + r = parse_size(rvalue, 1024, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (v < QFQ_MIN_MAX_PACKET || v > QFQ_MAX_MAX_PACKET) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + qfq->max_packet = (uint32_t) v; + TAKE_PTR(tclass); + + return 0; +} + +const TClassVTable qfq_tclass_vtable = { + .object_size = sizeof(QuickFairQueueingClass), + .tca_kind = "qfq", + .fill_message = quick_fair_queueing_class_fill_message, +}; diff --git a/src/network/tc/qfq.h b/src/network/tc/qfq.h new file mode 100644 index 0000000..0f013a9 --- /dev/null +++ b/src/network/tc/qfq.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * 
Copyright © 2020 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + +typedef struct QuickFairQueueing { + QDisc meta; +} QuickFairQueueing; + +DEFINE_QDISC_CAST(QFQ, QuickFairQueueing); +extern const QDiscVTable qfq_vtable; + +typedef struct QuickFairQueueingClass { + TClass meta; + + uint32_t weight; + uint32_t max_packet; +} QuickFairQueueingClass; + +DEFINE_TCLASS_CAST(QFQ, QuickFairQueueingClass); +extern const TClassVTable qfq_tclass_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_quick_fair_queueing_weight); +CONFIG_PARSER_PROTOTYPE(config_parse_quick_fair_queueing_max_packet); diff --git a/src/network/tc/sfb.c b/src/network/tc/sfb.c new file mode 100644 index 0000000..861c5fe --- /dev/null +++ b/src/network/tc/sfb.c @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "qdisc.h" +#include "sfb.h" +#include "string-util.h" + +static int stochastic_fair_blue_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + StochasticFairBlue *sfb; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(sfb = SFB(qdisc)); + + const struct tc_sfb_qopt opt = { + .rehash_interval = 600*1000, + .warmup_time = 60*1000, + .penalty_rate = 10, + .penalty_burst = 20, + .increment = (SFB_MAX_PROB + 1000) / 2000, + .decrement = (SFB_MAX_PROB + 10000) / 20000, + .max = 25, + .bin_size = 20, + .limit = sfb->packet_limit, + }; + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "sfb"); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_SFB_PARMS, &opt, sizeof(opt)); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_stochastic_fair_blue_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned 
section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + StochasticFairBlue *sfb; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_SFB, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + sfb = SFB(qdisc); + + if (isempty(rvalue)) { + sfb->packet_limit = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = safe_atou32(rvalue, &sfb->packet_limit); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +const QDiscVTable sfb_vtable = { + .object_size = sizeof(StochasticFairBlue), + .tca_kind = "sfb", + .fill_message = stochastic_fair_blue_fill_message, +}; diff --git a/src/network/tc/sfb.h b/src/network/tc/sfb.h new file mode 100644 index 0000000..628df35 --- /dev/null +++ b/src/network/tc/sfb.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2020 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" + +typedef struct StochasticFairBlue { + QDisc meta; + + uint32_t packet_limit; +} StochasticFairBlue; + +DEFINE_QDISC_CAST(SFB, StochasticFairBlue); +extern const QDiscVTable sfb_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_stochastic_fair_blue_u32); diff --git a/src/network/tc/sfq.c b/src/network/tc/sfq.c new file mode 100644 index 0000000..92dbae1 --- /dev/null +++ b/src/network/tc/sfq.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. 
*/ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "qdisc.h" +#include "sfq.h" +#include "string-util.h" + +static int stochastic_fairness_queueing_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + StochasticFairnessQueueing *sfq; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(sfq = SFQ(qdisc)); + + const struct tc_sfq_qopt_v1 opt = { + .v0.perturb_period = sfq->perturb_period / USEC_PER_SEC, + }; + + r = sd_netlink_message_append_data(req, TCA_OPTIONS, &opt, sizeof(opt)); + if (r < 0) + return r; + + return 0; +} + +int config_parse_stochastic_fairness_queueing_perturb_period( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + StochasticFairnessQueueing *sfq; + Network *network = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_SFQ, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + sfq = SFQ(qdisc); + + if (isempty(rvalue)) { + sfq->perturb_period = 0; + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_sec(rvalue, &sfq->perturb_period); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + TAKE_PTR(qdisc); + + return 0; +} + +const QDiscVTable sfq_vtable = { + .object_size = sizeof(StochasticFairnessQueueing), + .tca_kind = "sfq", + .fill_message = stochastic_fairness_queueing_fill_message, +}; diff --git a/src/network/tc/sfq.h b/src/network/tc/sfq.h new file mode 100644 
index 0000000..1626775 --- /dev/null +++ b/src/network/tc/sfq.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" +#include "time-util.h" + +typedef struct StochasticFairnessQueueing { + QDisc meta; + + usec_t perturb_period; +} StochasticFairnessQueueing; + +DEFINE_QDISC_CAST(SFQ, StochasticFairnessQueueing); +extern const QDiscVTable sfq_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_stochastic_fairness_queueing_perturb_period); diff --git a/src/network/tc/tbf.c b/src/network/tc/tbf.c new file mode 100644 index 0000000..647fc8c --- /dev/null +++ b/src/network/tc/tbf.c @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "netem.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "qdisc.h" +#include "string-util.h" +#include "strv.h" +#include "tc-util.h" + +static int token_bucket_filter_fill_message(Link *link, QDisc *qdisc, sd_netlink_message *req) { + uint32_t rtab[256], ptab[256]; + TokenBucketFilter *tbf; + int r; + + assert(link); + assert(qdisc); + assert(req); + + assert_se(tbf = TBF(qdisc)); + + struct tc_tbf_qopt opt = { + .rate.rate = tbf->rate >= (1ULL << 32) ? ~0U : tbf->rate, + .peakrate.rate = tbf->peak_rate >= (1ULL << 32) ? 
~0U : tbf->peak_rate, + .rate.mpu = tbf->mpu, + }; + + if (tbf->limit > 0) + opt.limit = tbf->limit; + else { + double lim, lim2; + + lim = tbf->rate * (double) tbf->latency / USEC_PER_SEC + tbf->burst; + if (tbf->peak_rate > 0) { + lim2 = tbf->peak_rate * (double) tbf->latency / USEC_PER_SEC + tbf->mtu; + lim = MIN(lim, lim2); + } + opt.limit = lim; + } + + r = tc_fill_ratespec_and_table(&opt.rate, rtab, tbf->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate ratespec: %m"); + + r = tc_transmit_time(opt.rate.rate, tbf->burst, &opt.buffer); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate buffer size: %m"); + + if (opt.peakrate.rate > 0) { + opt.peakrate.mpu = tbf->mpu; + + r = tc_fill_ratespec_and_table(&opt.peakrate, ptab, tbf->mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate ratespec: %m"); + + r = tc_transmit_time(opt.peakrate.rate, tbf->mtu, &opt.mtu); + if (r < 0) + return log_link_debug_errno(link, r, "Failed to calculate mtu size: %m"); + } + + r = sd_netlink_message_open_container_union(req, TCA_OPTIONS, "tbf"); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_TBF_PARMS, &opt, sizeof(opt)); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_TBF_BURST, &tbf->burst, sizeof(tbf->burst)); + if (r < 0) + return r; + + if (tbf->rate >= (1ULL << 32)) { + r = sd_netlink_message_append_u64(req, TCA_TBF_RATE64, tbf->rate); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_data(req, TCA_TBF_RTAB, rtab, sizeof(rtab)); + if (r < 0) + return r; + + if (opt.peakrate.rate > 0) { + if (tbf->peak_rate >= (1ULL << 32)) { + r = sd_netlink_message_append_u64(req, TCA_TBF_PRATE64, tbf->peak_rate); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u32(req, TCA_TBF_PBURST, tbf->mtu); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, TCA_TBF_PTAB, ptab, sizeof(ptab)); + if (r < 0) + return r; + } + + r = 
sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int config_parse_token_bucket_filter_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; + Network *network = ASSERT_PTR(data); + TokenBucketFilter *tbf; + uint64_t k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = qdisc_new_static(QDISC_KIND_TBF, network, filename, section_line, &qdisc); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "More than one kind of queueing discipline, ignoring assignment: %m"); + return 0; + } + + tbf = TBF(qdisc); + + if (isempty(rvalue)) { + if (STR_IN_SET(lvalue, "BurstBytes", "Burst")) + tbf->burst = 0; + else if (STR_IN_SET(lvalue, "LimitBytes", "LimitSize")) + tbf->limit = 0; + else if (streq(lvalue, "MTUBytes")) + tbf->mtu = 0; + else if (streq(lvalue, "MPUBytes")) + tbf->mpu = 0; + else + assert_not_reached(); + + TAKE_PTR(qdisc); + return 0; + } + + r = parse_size(rvalue, 1024, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (STR_IN_SET(lvalue, "BurstBytes", "Burst")) + tbf->burst = k; + else if (STR_IN_SET(lvalue, "LimitBytes", "LimitSize")) + tbf->limit = k; + else if (streq(lvalue, "MPUBytes")) + tbf->mpu = k; + else if (streq(lvalue, "MTUBytes")) + tbf->mtu = k; + else + assert_not_reached(); + + TAKE_PTR(qdisc); + + return 0; +} + +int config_parse_token_bucket_filter_rate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = 
NULL;
+        Network *network = ASSERT_PTR(data);
+        TokenBucketFilter *tbf;
+        uint64_t k, *p;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = qdisc_new_static(QDISC_KIND_TBF, network, filename, section_line, &qdisc);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "More than one kind of queueing discipline, ignoring assignment: %m");
+                return 0;
+        }
+
+        tbf = TBF(qdisc);
+        if (streq(lvalue, "Rate"))
+                p = &tbf->rate;
+        else if (streq(lvalue, "PeakRate"))
+                p = &tbf->peak_rate;
+        else
+                assert_not_reached();
+
+        if (isempty(rvalue)) {
+                *p = 0;
+
+                TAKE_PTR(qdisc);
+                return 0;
+        }
+
+        r = parse_size(rvalue, 1000, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse '%s=', ignoring assignment: %s",
+                           lvalue, rvalue);
+                return 0;
+        }
+
+        *p = k / 8;
+
+        qdisc = NULL;
+
+        return 0;
+}
+
+int config_parse_token_bucket_filter_latency( /* Config parser storing a time span in the TBF qdisc's latency field. */
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data, /* must be the Network being parsed (checked with ASSERT_PTR) */
+                void *userdata) {
+
+        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL; /* cleanup handler runs on every return where qdisc is still set */
+        Network *network = ASSERT_PTR(data);
+        TokenBucketFilter *tbf;
+        usec_t u;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = qdisc_new_static(QDISC_KIND_TBF, network, filename, section_line, &qdisc); /* qdisc object for this file/section line */
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) { /* any other failure is logged and the assignment is ignored */
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "More than one kind of queueing discipline, ignoring assignment: %m");
+                return 0;
+        }
+
+        tbf = TBF(qdisc);
+
+        if (isempty(rvalue)) { /* empty value resets the latency to 0 */
+                tbf->latency = 0;
+
+                qdisc = NULL; /* presumably disarms the cleanup handler, like TAKE_PTR() in the sibling parser — confirm */
+                return 0;
+        }
+
+        r = parse_sec(rvalue, &u);
+        if (r < 0) { /* returning with qdisc still set leaves it to qdisc_free_or_set_invalidp */
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse '%s=', ignoring assignment: %s",
+                           lvalue, rvalue);
+                return 0;
+        }
+
+        tbf->latency = u;
+
+        qdisc = NULL;
+
+        return 0;
+}
+
+static int token_bucket_filter_verify(QDisc *qdisc) { /* Checks the collected TBF settings; logs a warning and returns its result on the first violated rule, 0 otherwise. */
+        TokenBucketFilter *tbf = TBF(qdisc);
+
+        if (tbf->limit > 0 && tbf->latency > 0) /* the two settings are mutually exclusive ... */
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "%s: Specifying both LimitBytes= and LatencySec= is not allowed. "
+                                         "Ignoring [TokenBucketFilter] section from line %u.",
+                                         qdisc->section->filename, qdisc->section->line);
+
+        if (tbf->limit == 0 && tbf->latency == 0) /* ... but exactly one of them must be set */
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "%s: Either LimitBytes= or LatencySec= is required. "
+                                         "Ignoring [TokenBucketFilter] section from line %u.",
+                                         qdisc->section->filename, qdisc->section->line);
+
+        if (tbf->rate == 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "%s: Rate= is mandatory. "
+                                         "Ignoring [TokenBucketFilter] section from line %u.",
+                                         qdisc->section->filename, qdisc->section->line);
+
+        if (tbf->burst == 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "%s: BurstBytes= is mandatory. "
+                                         "Ignoring [TokenBucketFilter] section from line %u.",
+                                         qdisc->section->filename, qdisc->section->line);
+
+        if (tbf->peak_rate > 0 && tbf->mtu == 0) /* a peak rate is only accepted together with an MTU */
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "%s: MTUBytes= is mandatory when PeakRate= is specified. "
+                                         "Ignoring [TokenBucketFilter] section from line %u.",
+                                         qdisc->section->filename, qdisc->section->line);
+
+        return 0;
+}
+
+const QDiscVTable tbf_vtable = { /* Hooks and object size registered for the "tbf" qdisc kind. */
+        .object_size = sizeof(TokenBucketFilter),
+        .tca_kind = "tbf",
+        .fill_message = token_bucket_filter_fill_message,
+        .verify = token_bucket_filter_verify
+};
diff --git a/src/network/tc/tbf.h b/src/network/tc/tbf.h
new file mode 100644
index 0000000..6b4b017
--- /dev/null
+++ b/src/network/tc/tbf.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later
+ * Copyright © 2019 VMware, Inc.
*/ +#pragma once + +#include "conf-parser.h" +#include "qdisc.h" +#include "time-util.h" + +typedef struct TokenBucketFilter { + QDisc meta; + + uint64_t rate; + uint64_t peak_rate; + uint32_t burst; + uint32_t mtu; + usec_t latency; + size_t limit; + size_t mpu; +} TokenBucketFilter; + +DEFINE_QDISC_CAST(TBF, TokenBucketFilter); +extern const QDiscVTable tbf_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_token_bucket_filter_latency); +CONFIG_PARSER_PROTOTYPE(config_parse_token_bucket_filter_size); +CONFIG_PARSER_PROTOTYPE(config_parse_token_bucket_filter_rate); diff --git a/src/network/tc/tc-util.c b/src/network/tc/tc-util.c new file mode 100644 index 0000000..3781182 --- /dev/null +++ b/src/network/tc/tc-util.c @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. */ + +#include "alloc-util.h" +#include "extract-word.h" +#include "fileio.h" +#include "parse-util.h" +#include "percent-util.h" +#include "tc-util.h" +#include "time-util.h" + +int tc_init(double *ret_ticks_in_usec, uint32_t *ret_hz) { + static double ticks_in_usec = -1; + static uint32_t hz; + + if (ticks_in_usec < 0) { + uint32_t clock_resolution, ticks_to_usec, usec_to_ticks; + _cleanup_free_ char *line = NULL; + double clock_factor; + int r; + + r = read_one_line_file("/proc/net/psched", &line); + if (r < 0) + return r; + + r = sscanf(line, "%08x%08x%08x%08x", &ticks_to_usec, &usec_to_ticks, &clock_resolution, &hz); + if (r < 4) + return -EIO; + + clock_factor = (double) clock_resolution / USEC_PER_SEC; + ticks_in_usec = (double) ticks_to_usec / usec_to_ticks * clock_factor; + } + + if (ret_ticks_in_usec) + *ret_ticks_in_usec = ticks_in_usec; + if (ret_hz) + *ret_hz = hz; + + return 0; +} + +int tc_time_to_tick(usec_t t, uint32_t *ret) { + double ticks_in_usec; + usec_t a; + int r; + + assert(ret); + + r = tc_init(&ticks_in_usec, NULL); + if (r < 0) + return r; + + a = t * ticks_in_usec; + if (a > UINT32_MAX) + return -ERANGE; + + *ret = a; + return 0; 
+} + +int parse_tc_percent(const char *s, uint32_t *ret_fraction) { + int r; + + assert(s); + assert(ret_fraction); + + r = parse_permyriad(s); + if (r < 0) + return r; + + *ret_fraction = (double) r / 10000 * UINT32_MAX; + return 0; +} + +int tc_transmit_time(uint64_t rate, uint32_t size, uint32_t *ret) { + return tc_time_to_tick(USEC_PER_SEC * ((double)size / (double)rate), ret); +} + +int tc_fill_ratespec_and_table(struct tc_ratespec *rate, uint32_t *rtab, uint32_t mtu) { + uint32_t cell_log = 0; + int r; + + if (mtu == 0) + mtu = 2047; + + while ((mtu >> cell_log) > 255) + cell_log++; + + for (size_t i = 0; i < 256; i++) { + uint32_t sz; + + sz = (i + 1) << cell_log; + if (sz < rate->mpu) + sz = rate->mpu; + r = tc_transmit_time(rate->rate, sz, &rtab[i]); + if (r < 0) + return r; + } + + rate->cell_align = -1; + rate->cell_log = cell_log; + rate->linklayer = TC_LINKLAYER_ETHERNET; + return 0; +} + +int parse_handle(const char *t, uint32_t *ret) { + _cleanup_free_ char *word = NULL; + uint16_t major, minor; + int r; + + assert(t); + assert(ret); + + /* Extract the major number. */ + r = extract_first_word(&t, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + if (!t) + return -EINVAL; + + r = safe_atou16_full(word, 16, &major); + if (r < 0) + return r; + + r = safe_atou16_full(t, 16, &minor); + if (r < 0) + return r; + + *ret = ((uint32_t) major << 16) | minor; + return 0; +} diff --git a/src/network/tc/tc-util.h b/src/network/tc/tc-util.h new file mode 100644 index 0000000..83bad8e --- /dev/null +++ b/src/network/tc/tc-util.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * Copyright © 2019 VMware, Inc. 
 */
+#pragma once
+
+#include /* NOTE(review): header name is missing here, presumably an angle-bracket include lost in extraction — restore from upstream */
+
+#include "time-util.h"
+
+int tc_init(double *ret_ticks_in_usec, uint32_t *ret_hz);
+int tc_time_to_tick(usec_t t, uint32_t *ret);
+int parse_tc_percent(const char *s, uint32_t *percent);
+int tc_transmit_time(uint64_t rate, uint32_t size, uint32_t *ret);
+int tc_fill_ratespec_and_table(struct tc_ratespec *rate, uint32_t *rtab, uint32_t mtu);
+int parse_handle(const char *t, uint32_t *ret);
diff --git a/src/network/tc/tc.c b/src/network/tc/tc.c
new file mode 100644
index 0000000..8a1c5b3
--- /dev/null
+++ b/src/network/tc/tc.c
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "macro.h"
+#include "networkd-link.h"
+#include "networkd-network.h"
+#include "qdisc.h"
+#include "tc.h"
+#include "tclass.h"
+
+int link_request_traffic_control(Link *link) { /* Requests every qdisc and tclass of link->network; returns the first negative error, else 0. */
+        TClass *tclass;
+        QDisc *qdisc;
+        int r;
+
+        assert(link);
+        assert(link->network);
+
+        link->tc_configured = false; /* set to true again once no request is outstanding */
+
+        HASHMAP_FOREACH(qdisc, link->network->qdiscs_by_section) {
+                r = link_request_qdisc(link, qdisc);
+                if (r < 0)
+                        return r;
+        }
+
+        HASHMAP_FOREACH(tclass, link->network->tclasses_by_section) {
+                r = link_request_tclass(link, tclass);
+                if (r < 0)
+                        return r;
+        }
+
+        if (link->tc_messages == 0) { /* nothing pending: traffic control counts as configured right away */
+                link->tc_configured = true;
+                link_check_ready(link);
+        } else {
+                log_link_debug(link, "Setting traffic control");
+                link_set_state(link, LINK_STATE_CONFIGURING);
+        }
+
+        return 0;
+}
diff --git a/src/network/tc/tc.h b/src/network/tc/tc.h
new file mode 100644
index 0000000..6226578
--- /dev/null
+++ b/src/network/tc/tc.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Link Link;
+
+int link_request_traffic_control(Link *link);
diff --git a/src/network/tc/tclass.c b/src/network/tc/tclass.c
new file mode 100644
index 0000000..0a5fec0
--- /dev/null
+++ b/src/network/tc/tclass.c
@@ -0,0 +1,639 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later
+ * Copyright © 2019 VMware, Inc.
 */
+
+#include /* NOTE(review): header name is missing here, presumably an angle-bracket include lost in extraction — restore from upstream */
+
+#include "alloc-util.h"
+#include "conf-parser.h"
+#include "in-addr-util.h"
+#include "netlink-util.h"
+#include "networkd-link.h"
+#include "networkd-manager.h"
+#include "networkd-network.h"
+#include "networkd-queue.h"
+#include "parse-util.h"
+#include "set.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tc-util.h"
+#include "tclass.h"
+
+const TClassVTable * const tclass_vtable[_TCLASS_KIND_MAX] = { /* Per-kind vtables, indexed by TClassKind. */
+        [TCLASS_KIND_DRR] = &drr_tclass_vtable,
+        [TCLASS_KIND_HTB] = &htb_tclass_vtable,
+        [TCLASS_KIND_QFQ] = &qfq_tclass_vtable,
+};
+
+static int tclass_new(TClassKind kind, TClass **ret) { /* Allocates a TClass of the given kind with parent TC_H_ROOT; returns -ENOMEM or the init hook's error. */
+        _cleanup_(tclass_freep) TClass *tclass = NULL;
+        int r;
+
+        if (kind == _TCLASS_KIND_INVALID) { /* no vtable for this kind: allocate the plain base object */
+                tclass = new(TClass, 1);
+                if (!tclass)
+                        return -ENOMEM;
+
+                *tclass = (TClass) {
+                        .parent = TC_H_ROOT,
+                        .kind = kind,
+                };
+        } else {
+                assert(kind >= 0 && kind < _TCLASS_KIND_MAX);
+                tclass = malloc0(tclass_vtable[kind]->object_size); /* kind-specific object size from the vtable */
+                if (!tclass)
+                        return -ENOMEM;
+
+                tclass->parent = TC_H_ROOT;
+                tclass->kind = kind;
+
+                if (TCLASS_VTABLE(tclass)->init) {
+                        r = TCLASS_VTABLE(tclass)->init(tclass);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        *ret = TAKE_PTR(tclass);
+
+        return 0;
+}
+
+int tclass_new_static(TClassKind kind, Network *network, const char *filename, unsigned section_line, TClass **ret) { /* Returns the TClass stored for this config section, creating and registering it if needed. */
+        _cleanup_(config_section_freep) ConfigSection *n = NULL;
+        _cleanup_(tclass_freep) TClass *tclass = NULL;
+        TClass *existing;
+        int r;
+
+        assert(network);
+        assert(ret);
+        assert(filename);
+        assert(section_line > 0);
+
+        r = config_section_new(filename, section_line, &n);
+        if (r < 0)
+                return r;
+
+        existing = hashmap_get(network->tclasses_by_section, n);
+        if (existing) {
+                if (existing->kind != kind) /* same section already holds a class of a different kind */
+                        return -EINVAL;
+
+                *ret = existing;
+                return 0;
+        }
+
+        r = tclass_new(kind, &tclass);
+        if (r < 0)
+                return r;
+
+        tclass->network = network;
+        tclass->section = TAKE_PTR(n); /* the new object takes over the section key */
+        tclass->source = NETWORK_CONFIG_SOURCE_STATIC;
+
+        r = hashmap_ensure_put(&network->tclasses_by_section, &config_section_hash_ops, tclass->section, tclass);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(tclass);
+        return 0;
+}
+
+TClass* tclass_free(TClass *tclass) { /* Unregisters the object from its network's hashmap and its link's set, frees it; accepts NULL. */
+        if (!tclass)
+                return NULL;
+
+        if (tclass->network && tclass->section)
+                hashmap_remove(tclass->network->tclasses_by_section, tclass->section);
+
+        config_section_free(tclass->section);
+
+        if (tclass->link)
+                set_remove(tclass->link->tclasses, tclass);
+
+        free(tclass->tca_kind);
+        return mfree(tclass);
+}
+
+static const char *tclass_get_tca_kind(const TClass *tclass) { /* Kind string: the vtable's tca_kind if there is one, otherwise the object's own tca_kind. */
+        assert(tclass);
+
+        return (TCLASS_VTABLE(tclass) && TCLASS_VTABLE(tclass)->tca_kind) ?
+                TCLASS_VTABLE(tclass)->tca_kind : tclass->tca_kind;
+}
+
+static void tclass_hash_func(const TClass *tclass, struct siphash *state) { /* Hashes classid, parent and kind string — the same fields tclass_compare_func() compares. */
+        assert(tclass);
+        assert(state);
+
+        siphash24_compress(&tclass->classid, sizeof(tclass->classid), state);
+        siphash24_compress(&tclass->parent, sizeof(tclass->parent), state);
+        siphash24_compress_string(tclass_get_tca_kind(tclass), state);
+}
+
+static int tclass_compare_func(const TClass *a, const TClass *b) { /* Orders by classid, then parent, then kind string. */
+        int r;
+
+        assert(a);
+        assert(b);
+
+        r = CMP(a->classid, b->classid);
+        if (r != 0)
+                return r;
+
+        r = CMP(a->parent, b->parent);
+        if (r != 0)
+                return r;
+
+        return strcmp_ptr(tclass_get_tca_kind(a), tclass_get_tca_kind(b));
+}
+
+DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+                tclass_hash_ops,
+                TClass,
+                tclass_hash_func,
+                tclass_compare_func,
+                tclass_free);
+
+static int tclass_get(Link *link, const TClass *in, TClass **ret) { /* Looks up the entry of link->tclasses equal to "in"; -ENOENT if none. ret is optional. */
+        TClass *existing;
+
+        assert(link);
+        assert(in);
+
+        existing = set_get(link->tclasses, in);
+        if (!existing)
+                return -ENOENT;
+
+        if (ret)
+                *ret = existing;
+        return 0;
+}
+
+static int tclass_add(Link *link, TClass *tclass) { /* Stores tclass in link->tclasses and sets its back pointer; -EEXIST if an equal entry is already stored. */
+        int r;
+
+        assert(link);
+        assert(tclass);
+
+        r = set_ensure_put(&link->tclasses, &tclass_hash_ops, tclass);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EEXIST;
+
+        tclass->link = link;
+        return 0;
+}
+
+static int tclass_dup(const TClass *src, TClass **ret) { /* Copies src (kind-specific size if it has a vtable) with detached pointers and its own tca_kind string. */
+        _cleanup_(tclass_freep) TClass *dst = NULL;
+
+        assert(src);
+        assert(ret);
+
+        if (TCLASS_VTABLE(src))
+                dst = memdup(src, TCLASS_VTABLE(src)->object_size);
+        else
+                dst = newdup(TClass, src, 1);
+        if (!dst)
+                return -ENOMEM;
+
+        /* clear all pointers */
+        dst->network = NULL;
+        dst->section = NULL;
+        dst->link = NULL;
+        dst->tca_kind = NULL;
+
+        if (src->tca_kind) {
+                dst->tca_kind = strdup(src->tca_kind);
+                if (!dst->tca_kind)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(dst);
+        return 0;
+}
+
+int link_find_tclass(Link *link, uint32_t classid, TClass **ret) { /* Finds the first class of the link with this classid for which tclass_exists() holds; -ENOENT otherwise. */
+        TClass *tclass;
+
+        assert(link);
+
+        SET_FOREACH(tclass, link->tclasses) {
+                if (tclass->classid != classid)
+                        continue;
+
+                if (!tclass_exists(tclass))
+                        continue;
+
+                if (ret)
+                        *ret = tclass;
+                return 0;
+        }
+
+        return -ENOENT;
+}
+
+static void log_tclass_debug(TClass *tclass, Link *link, const char *str) { /* Debug-logs source, state, classid, parent and kind of the class, prefixed by str. */
+        _cleanup_free_ char *state = NULL;
+
+        assert(tclass);
+        assert(str);
+
+        if (!DEBUG_LOGGING) /* skip the string formatting when debug logging is off */
+                return;
+
+        (void) network_config_state_to_string_alloc(tclass->state, &state);
+
+        log_link_debug(link, "%s %s TClass (%s): classid=%"PRIx32":%"PRIx32", parent=%"PRIx32":%"PRIx32", kind=%s",
+                       str, strna(network_config_source_to_string(tclass->source)), strna(state),
+                       TC_H_MAJ(tclass->classid) >> 16, TC_H_MIN(tclass->classid),
+                       TC_H_MAJ(tclass->parent) >> 16, TC_H_MIN(tclass->parent),
+                       strna(tclass_get_tca_kind(tclass)));
+}
+
+TClass* tclass_drop(TClass *tclass) { /* Marks the class removed and drops its child qdiscs; frees it (returning NULL) when no state flag remains. */
+        QDisc *qdisc;
+        Link *link;
+
+        assert(tclass);
+
+        link = ASSERT_PTR(tclass->link);
+
+        /* Also drop all child qdiscs assigned to the class. */
+        SET_FOREACH(qdisc, link->qdiscs) {
+                if (qdisc->parent != tclass->classid)
+                        continue;
+
+                qdisc_drop(qdisc);
+        }
+
+        tclass_enter_removed(tclass);
+
+        if (tclass->state == 0) {
+                log_tclass_debug(tclass, link, "Forgetting");
+                tclass = tclass_free(tclass);
+        } else
+                log_tclass_debug(tclass, link, "Removed");
+
+        return tclass;
+}
+
+static int tclass_handler(sd_netlink *rtnl, sd_netlink_message *m, Request *req, Link *link, TClass *tclass) { /* Reply handler for the RTM_NEWTCLASS request; always returns 1. */
+        int r;
+
+        assert(m);
+        assert(link);
+
+        r = sd_netlink_message_get_errno(m);
+        if (r < 0 && r != -EEXIST) { /* -EEXIST is not treated as a failure */
+                log_link_message_error_errno(link, m, r, "Could not set TClass");
+                link_enter_failed(link);
+                return 1;
+        }
+
+        if (link->tc_messages == 0) { /* no traffic control request left outstanding */
+                log_link_debug(link, "Traffic control configured");
+                link->tc_configured = true;
+                link_check_ready(link);
+        }
+
+        return 1;
+}
+
+static int tclass_configure(TClass *tclass, Link *link, Request *req) { /* Builds the RTM_NEWTCLASS message for the class and sends it asynchronously for req. */
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        assert(tclass);
+        assert(link);
+        assert(link->manager);
+        assert(link->manager->rtnl);
+        assert(link->ifindex > 0);
+        assert(req);
+
+        log_tclass_debug(tclass, link, "Configuring");
+
+        r = sd_rtnl_message_new_traffic_control(link->manager->rtnl, &m, RTM_NEWTCLASS,
+                                                link->ifindex, tclass->classid, tclass->parent);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_string(m, TCA_KIND, TCLASS_VTABLE(tclass)->tca_kind); /* dereferences the vtable, so the class must have a valid kind here */
+        if (r < 0)
+                return r;
+
+        if (TCLASS_VTABLE(tclass)->fill_message) { /* kind-specific attributes, if the kind provides a hook */
+                r = TCLASS_VTABLE(tclass)->fill_message(link, tclass, m);
+                if (r < 0)
+                        return r;
+        }
+
+        return request_call_netlink_async(link->manager->rtnl, m, req);
+}
+
+static bool tclass_is_ready_to_configure(TClass *tclass, Link *link) { /* True if the link is configuring/configured and link_find_qdisc() finds a qdisc for the class's major handle and kind. */
+        assert(tclass);
+        assert(link);
+
+        if (!IN_SET(link->state, LINK_STATE_CONFIGURING, LINK_STATE_CONFIGURED))
+                return false;
+
+        return link_find_qdisc(link, TC_H_MAJ(tclass->classid), tclass_get_tca_kind(tclass), NULL) >= 0;
+}
+
+static int tclass_process_request(Request *req, Link *link, TClass *tclass) { /* Request callback: 0 if not ready yet, 1 once the class was sent, negative on failure. */
+        int r;
+
+        assert(req);
+        assert(link);
+        assert(tclass);
+
+        if (!tclass_is_ready_to_configure(tclass, link))
+                return 0;
+
+        r = tclass_configure(tclass, link, req);
+        if (r < 0)
+                return log_link_warning_errno(link, r, "Failed to configure TClass: %m");
+
+        tclass_enter_configuring(tclass);
+        return 1;
+}
+
+int link_request_tclass(Link *link, TClass *tclass) { /* Queues a request to configure the class on the link, working on the link's own copy of it. */
+        TClass *existing;
+        int r;
+
+        assert(link);
+        assert(tclass);
+
+        if (tclass_get(link, tclass, &existing) < 0) { /* not known on this link yet: store a duplicate */
+                _cleanup_(tclass_freep) TClass *tmp = NULL;
+
+                r = tclass_dup(tclass, &tmp);
+                if (r < 0)
+                        return log_oom();
+
+                r = tclass_add(link, tmp);
+                if (r < 0)
+                        return log_link_warning_errno(link, r, "Failed to store TClass: %m");
+
+                existing = TAKE_PTR(tmp); /* now owned by link->tclasses */
+        } else
+                existing->source = tclass->source;
+
+        log_tclass_debug(existing, link, "Requesting");
+        r = link_queue_request_safe(link, REQUEST_TYPE_TC_CLASS,
+                                    existing, NULL,
+                                    tclass_hash_func,
+                                    tclass_compare_func,
+                                    tclass_process_request,
+                                    &link->tc_messages,
+                                    tclass_handler,
+                                    NULL);
+        if (r < 0)
+                return log_link_warning_errno(link, r, "Failed to request TClass: %m");
+        if (r == 0) /* NOTE(review): presumably an equal request is already queued — confirm against link_queue_request_safe() */
+                return 0;
+
+        tclass_enter_requesting(existing);
+        return 1;
+}
+
+int manager_rtnl_process_tclass(sd_netlink *rtnl, sd_netlink_message *message, Manager *m) { /* Handles RTM_NEWTCLASS/RTM_DELTCLASS messages: 1 if processed, 0 if ignored. */
+        _cleanup_(tclass_freep) TClass *tmp = NULL;
+        TClass *tclass = NULL;
+        Link *link;
+        uint16_t type;
+        int ifindex, r;
+
+        assert(rtnl);
+        assert(message);
+        assert(m);
+
+        if (sd_netlink_message_is_error(message)) {
+                r = sd_netlink_message_get_errno(message);
+                if (r < 0)
+                        log_message_warning_errno(message, r, "rtnl: failed to receive TClass message, ignoring");
+
+                return 0;
+        }
+
+        r = sd_netlink_message_get_type(message, &type);
+        if (r < 0) {
+                log_warning_errno(r, "rtnl: could not get message type, ignoring: %m");
+                return 0;
+        } else if (!IN_SET(type, RTM_NEWTCLASS, RTM_DELTCLASS)) {
+                log_warning("rtnl: received unexpected message type %u when processing TClass, ignoring.", type);
+                return 0;
+        }
+
+        r = sd_rtnl_message_traffic_control_get_ifindex(message, &ifindex);
+        if (r < 0) {
+                log_warning_errno(r, "rtnl: could not get ifindex from message, ignoring: %m");
+                return 0;
+        } else if (ifindex <= 0) {
+                log_warning("rtnl: received TClass message with invalid ifindex %d, ignoring.", ifindex);
+                return 0;
+        }
+
+        if (link_get_by_index(m, ifindex, &link) < 0) {
+                if (!m->enumerating) /* stay quiet about unknown links while enumerating */
+                        log_warning("rtnl: received TClass for link '%d' we don't know about, ignoring.", ifindex);
+                return 0;
+        }
+
+        r = tclass_new(_TCLASS_KIND_INVALID, &tmp); /* temporary object holding the identifying fields of the message */
+        if (r < 0)
+                return log_oom();
+
+        r = sd_rtnl_message_traffic_control_get_handle(message, &tmp->classid);
+        if (r < 0) {
+                log_link_warning_errno(link, r, "rtnl: received TClass message without handle, ignoring: %m");
+                return 0;
+        }
+
+        r = sd_rtnl_message_traffic_control_get_parent(message, &tmp->parent);
+        if (r < 0) {
+                log_link_warning_errno(link, r, "rtnl: received TClass message without parent, ignoring: %m");
+                return 0;
+        }
+
+        r = sd_netlink_message_read_string_strdup(message, TCA_KIND, &tmp->tca_kind);
+        if (r < 0) {
+                log_link_warning_errno(link, r, "rtnl: received TClass message without kind, ignoring: %m");
+                return 0;
+        }
+
+        (void) tclass_get(link, tmp, &tclass); /* tclass stays NULL if the link has no matching entry */
+
+        switch (type) {
+        case RTM_NEWTCLASS:
+                if (tclass) {
+                        tclass_enter_configured(tclass);
+                        log_tclass_debug(tclass, link, "Received remembered");
+                } else {
+                        tclass_enter_configured(tmp);
+                        log_tclass_debug(tmp, link, "Received new");
+
+                        r = tclass_add(link, tmp);
+                        if (r < 0) {
+                                log_link_warning_errno(link, r, "Failed to remember TClass, ignoring: %m");
+                                return 0;
+                        }
+
+                        tclass = TAKE_PTR(tmp); /* the link's set owns it now */
+                }
+
+                break;
+
+        case RTM_DELTCLASS:
+                if (tclass)
+                        (void) tclass_drop(tclass);
+                else
+                        log_tclass_debug(tmp, link, "Kernel removed unknown");
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 1;
+}
+
+int link_enumerate_tclass(Link *link, uint32_t parent) { /* Sends RTM_GETTCLASS for the link and parent; replies go to manager_rtnl_process_tclass(). */
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL;
+        int r;
+
+        assert(link);
+        assert(link->manager);
+        assert(link->manager->rtnl);
+
+        r = sd_rtnl_message_new_traffic_control(link->manager->rtnl, &req, RTM_GETTCLASS, link->ifindex, 0, parent);
+        if (r < 0)
+                return r;
+
+        return manager_enumerate_internal(link->manager, link->manager->rtnl, req, manager_rtnl_process_tclass);
+}
+
+static int tclass_section_verify(TClass *tclass) { /* -EINVAL for a section flagged invalid, otherwise the result of the kind's verify hook (0 if none). */
+        int r;
+
+        assert(tclass);
+
+        if (section_is_invalid(tclass->section))
+                return -EINVAL;
+
+        if (TCLASS_VTABLE(tclass)->verify) {
+                r = TCLASS_VTABLE(tclass)->verify(tclass);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+void network_drop_invalid_tclass(Network *network) { /* Frees every class of the network that fails tclass_section_verify(). */
+        TClass *tclass;
+
+        assert(network);
+
+        HASHMAP_FOREACH(tclass, network->tclasses_by_section)
+                if (tclass_section_verify(tclass) < 0)
+                        tclass_free(tclass); /* tclass_free() also removes the entry from the hashmap being iterated */
+}
+
+int config_parse_tclass_parent( /* Config parser for a class's parent handle: "root" or a value accepted by parse_handle(). */
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype, /* passed to tclass_new_static() as the class kind */
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL;
+        Network *network = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = tclass_new_static(ltype, network, filename, section_line, &tclass);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to create traffic control class, ignoring assignment: %m");
+                return 0;
+        }
+
+        if (streq(rvalue, "root"))
+                tclass->parent = TC_H_ROOT;
+        else {
+                r = parse_handle(rvalue, &tclass->parent);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse 'Parent=', ignoring assignment: %s",
+                                   rvalue);
+                        return 0;
+                }
+        }
+
+        TAKE_PTR(tclass); /* success: keep the object, bypassing the cleanup handler */
+
+        return 0;
+}
+
+int config_parse_tclass_classid( /* Config parser for a class's own handle; an empty value resets it to TC_H_UNSPEC. */
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype, /* passed to tclass_new_static() as the class kind */
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(tclass_free_or_set_invalidp) TClass *tclass = NULL;
+        Network *network = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = tclass_new_static(ltype, network, filename, section_line, &tclass);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to create traffic control class, ignoring assignment: %m");
+                return 0;
+        }
+
+        if (isempty(rvalue)) {
+                tclass->classid = TC_H_UNSPEC;
+                TAKE_PTR(tclass);
+                return 0;
+        }
+
+        r = parse_handle(rvalue, &tclass->classid);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse 'ClassId=', ignoring assignment: %s",
+                           rvalue);
+                return 0;
+        }
+
+        TAKE_PTR(tclass); /* success: keep the object, bypassing the cleanup handler */
+
+        return 0;
+}
diff --git a/src/network/tc/tclass.h b/src/network/tc/tclass.h
new file mode 100644
index 0000000..e73e23c
--- /dev/null
+++ b/src/network/tc/tclass.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later
+ * Copyright © 2019 VMware, Inc.
 */
+#pragma once
+
+#include "conf-parser.h"
+#include "networkd-util.h"
+
+typedef struct Link Link;
+typedef struct Manager Manager;
+typedef struct Network Network;
+
+typedef enum TClassKind { /* Class kinds that have a vtable entry in tclass_vtable[]. */
+        TCLASS_KIND_DRR,
+        TCLASS_KIND_HTB,
+        TCLASS_KIND_QFQ,
+        _TCLASS_KIND_MAX, /* number of kinds; size of tclass_vtable[] */
+        _TCLASS_KIND_INVALID = -EINVAL, /* marks an object without a vtable, see TCLASS_VTABLE() */
+} TClassKind;
+
+typedef struct TClass { /* Generic part of a traffic control class object. */
+        Link *link;
+        Network *network;
+        ConfigSection *section;
+        NetworkConfigSource source;
+        NetworkConfigState state;
+
+        uint32_t classid;
+        uint32_t parent;
+
+        TClassKind kind;
+        char *tca_kind; /* heap-allocated kind string; the prototypes below include tclass_free(), which releases it — TODO confirm ownership */
+} TClass;
+
+typedef struct TClassVTable { /* Per-kind description: object size, kind string and optional hooks. */
+        size_t object_size;
+        const char *tca_kind;
+        /* called in tclass_new() */
+        int (*init)(TClass *tclass);
+        int (*fill_message)(Link *link, TClass *tclass, sd_netlink_message *m);
+        int (*verify)(TClass *tclass);
+} TClassVTable;
+
+extern const TClassVTable * const tclass_vtable[_TCLASS_KIND_MAX];
+
+#define TCLASS_VTABLE(t) ((t)->kind != _TCLASS_KIND_INVALID ? tclass_vtable[(t)->kind] : NULL) /* vtable of t, or NULL when its kind is _TCLASS_KIND_INVALID */
+
+/* For casting a tclass into the various tclass kinds */
+#define DEFINE_TCLASS_CAST(UPPERCASE, MixedCase)                          \
+        static inline MixedCase* TCLASS_TO_##UPPERCASE(TClass *t) {       \
+                if (_unlikely_(!t || t->kind != TCLASS_KIND_##UPPERCASE)) \
+                        return NULL;                                      \
+                                                                          \
+                return (MixedCase*) t;                                    \
+        }
+
+DEFINE_NETWORK_CONFIG_STATE_FUNCTIONS(TClass, tclass);
+
+TClass* tclass_free(TClass *tclass);
+int tclass_new_static(TClassKind kind, Network *network, const char *filename, unsigned section_line, TClass **ret);
+
+TClass* tclass_drop(TClass *tclass);
+
+int link_find_tclass(Link *link, uint32_t classid, TClass **ret);
+
+int link_request_tclass(Link *link, TClass *tclass);
+
+void network_drop_invalid_tclass(Network *network);
+
+int manager_rtnl_process_tclass(sd_netlink *rtnl, sd_netlink_message *message, Manager *m);
+int link_enumerate_tclass(Link *link, uint32_t parent);
+
+DEFINE_SECTION_CLEANUP_FUNCTIONS(TClass, tclass_free);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_tclass_parent);
+CONFIG_PARSER_PROTOTYPE(config_parse_tclass_classid);
+
+#include "drr.h"
+#include "htb.h"
+#include "qfq.h"
diff --git a/src/network/tc/teql.c b/src/network/tc/teql.c
new file mode 100644
index 0000000..dcb149d
--- /dev/null
+++ b/src/network/tc/teql.c
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "macro.h"
+#include "networkd-link.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "teql.h"
+
+static int trivial_link_equalizer_verify(QDisc *qdisc) { /* Sets qdisc->tca_kind to "teql<id>", replacing any previous string. */
+        _cleanup_free_ char *tca_kind = NULL;
+        TrivialLinkEqualizer *teql;
+
+        teql = TEQL(ASSERT_PTR(qdisc));
+
+        if (asprintf(&tca_kind, "teql%u", teql->id) < 0)
+                return log_oom();
+
+        return free_and_replace(qdisc->tca_kind, tca_kind);
+}
+
+static int trivial_link_equalizer_is_ready(QDisc *qdisc, Link *link) { /* Ready once a link named like the kind string ("teql<id>") exists and is ready to configure. */
+        Link *teql;
+
+        assert(qdisc);
+        assert(qdisc->tca_kind);
+        assert(link);
+        assert(link->manager);
+
+        if (link_get_by_name(link->manager, qdisc->tca_kind, &teql) < 0)
+                return false;
+
+        return link_is_ready_to_configure(teql, /* allow_unmanaged = */ true);
+}
+
+const QDiscVTable teql_vtable = { /* No static tca_kind: the kind string is generated in the verify hook. */
+        .object_size = sizeof(TrivialLinkEqualizer),
+        .verify = trivial_link_equalizer_verify,
+        .is_ready = trivial_link_equalizer_is_ready,
+};
+
+/* Config parser for the numeric ID of a TEQL qdisc. An empty value resets the ID to 0. A value that
+ * cannot be parsed, or that is larger than INT_MAX, is logged and ignored. Returns 0 in all of these
+ * cases; only an allocation failure in qdisc_new_static() is propagated, via log_oom(). */
+int config_parse_trivial_link_equalizer_id(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(qdisc_free_or_set_invalidp) QDisc *qdisc = NULL;
+        TrivialLinkEqualizer *teql;
+        Network *network = ASSERT_PTR(data);
+        unsigned id;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = qdisc_new_static(QDISC_KIND_TEQL, network, filename, section_line, &qdisc);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "More than one kind of queueing discipline, ignoring assignment: %m");
+                return 0;
+        }
+
+        teql = TEQL(qdisc);
+
+        if (isempty(rvalue)) {
+                teql->id = 0;
+
+                TAKE_PTR(qdisc);
+                return 0;
+        }
+
+        r = safe_atou(rvalue, &id);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse '%s=', ignoring assignment: %s",
+                           lvalue, rvalue);
+                return 0;
+        }
+        if (id > INT_MAX) {
+                /* Bail out so that the out-of-range value is really ignored, as the message says. As on
+                 * the parse-failure path above, qdisc is left to its cleanup handler. */
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "'%s=' is too large, ignoring assignment: %s",
+                           lvalue, rvalue);
+                return 0;
+        }
+
+        teql->id = id;
+
+        TAKE_PTR(qdisc);
+        return 0;
+}
diff --git a/src/network/tc/teql.h b/src/network/tc/teql.h
new file mode 100644
index 0000000..8d0085e
--- /dev/null
+++ b/src/network/tc/teql.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "qdisc.h"
+
+typedef struct TrivialLinkEqualizer { /* TEQL qdisc object: the generic QDisc plus the numeric ID. */
+        QDisc meta;
+
+        unsigned id;
+} TrivialLinkEqualizer;
+
+DEFINE_QDISC_CAST(TEQL, TrivialLinkEqualizer);
+extern const QDiscVTable teql_vtable;
+
+CONFIG_PARSER_PROTOTYPE(config_parse_trivial_link_equalizer_id);
diff --git a/src/network/test-network-tables.c b/src/network/test-network-tables.c
new file mode 100644
index 0000000..564ca09
--- /dev/null
+++ b/src/network/test-network-tables.c
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bond.h"
+#include "dhcp6-internal.h"
+#include "dhcp6-protocol.h"
+#include "ethtool-util.h"
+#include "ipvlan.h"
+#include "lldp-rx-internal.h"
+#include "macvlan.h"
+#include "ndisc-internal.h"
+#include "networkd-link.h"
+#include "networkd-network.h"
+#include "networkd-util.h"
+#include "test-tables.h"
+#include "tests.h"
+#include "tunnel.h"
+
+int main(int argc, char **argv) { /* Runs test_table()/test_table_sparse() over the networkd string tables. */
+        test_setup_logging(LOG_DEBUG);
+
+        test_table(bond_ad_select, NETDEV_BOND_AD_SELECT);
+        test_table(bond_arp_all_targets, NETDEV_BOND_ARP_ALL_TARGETS);
+        test_table(bond_arp_validate, NETDEV_BOND_ARP_VALIDATE);
+        test_table(bond_fail_over_mac, NETDEV_BOND_FAIL_OVER_MAC);
+        test_table(bond_lacp_rate, NETDEV_BOND_LACP_RATE);
+        test_table(bond_mode, NETDEV_BOND_MODE);
+        test_table(bond_primary_reselect, NETDEV_BOND_PRIMARY_RESELECT);
+        test_table(bond_xmit_hash_policy, NETDEV_BOND_XMIT_HASH_POLICY);
+        test_table(dhcp6_message_status, DHCP6_STATUS);
+        test_table_sparse(dhcp6_message_type, DHCP6_MESSAGE_TYPE); /* enum starts from 1 */
+        test_table(dhcp_use_domains, DHCP_USE_DOMAINS);
+        test_table(duplex, DUP);
+        test_table(ip6tnl_mode, NETDEV_IP6_TNL_MODE);
+        test_table(ipv6_privacy_extensions, IPV6_PRIVACY_EXTENSIONS);
+        test_table(ipvlan_flags, NETDEV_IPVLAN_FLAGS);
+        test_table(link_operstate, LINK_OPERSTATE);
+        /* test_table(link_state, LINK_STATE); — not a reversible mapping */
+        test_table(lldp_mode, LLDP_MODE);
+        test_table(netdev_kind, NETDEV_KIND);
+        test_table(radv_prefix_delegation, RADV_PREFIX_DELEGATION);
+        test_table(lldp_rx_event, SD_LLDP_RX_EVENT);
+        test_table(ndisc_event, SD_NDISC_EVENT);
+        test_table(dhcp_lease_server_type, SD_DHCP_LEASE_SERVER_TYPE);
+
+        test_table_sparse(ipvlan_mode, NETDEV_IPVLAN_MODE);
+        test_table_sparse(macvlan_mode, NETDEV_MACVLAN_MODE);
+        test_table_sparse(address_family, ADDRESS_FAMILY);
+
+        assert_cc(sizeof(sd_lldp_rx_event_t) == sizeof(int64_t));
+        assert_cc(sizeof(sd_ndisc_event_t) == sizeof(int64_t));
+        assert_cc(sizeof(sd_dhcp_lease_server_type_t) == sizeof(int64_t));
+
+        return EXIT_SUCCESS;
+}
diff --git a/src/network/test-network.c b/src/network/test-network.c
new file mode 100644
index 0000000..5f3b4c0
--- /dev/null
+++ b/src/network/test-network.c
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include /* NOTE(review): header name is missing here and on the next line, presumably angle-bracket includes lost in extraction — restore from upstream */
+#include
+
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "dhcp-lease-internal.h"
+#include "ether-addr-util.h"
+#include "hostname-setup.h"
+#include "network-internal.h"
+#include "networkd-address.h"
+#include "networkd-manager.h"
+#include "networkd-route-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tests.h"
+
+static void test_deserialize_in_addr(void) { /* One mixed IPv4/IPv6 string must yield exactly the three addresses of each family, in order. */
+        _cleanup_free_ struct in_addr *addresses = NULL;
+        _cleanup_free_ struct in6_addr *addresses6 = NULL;
+        union in_addr_union a, b, c, d, e, f;
+        int size;
+        const char *addresses_string = "192.168.0.1 0:0:0:0:0:FFFF:204.152.189.116 192.168.0.2 ::1 192.168.0.3 1:0:0:0:0:0:0:8";
+
+        assert_se(in_addr_from_string(AF_INET, "0:0:0:0:0:FFFF:204.152.189.116", &a) < 0); /* parsing with the wrong family must fail */
+        assert_se(in_addr_from_string(AF_INET6, "192.168.0.1", &d) < 0);
+
+        assert_se(in_addr_from_string(AF_INET, "192.168.0.1", &a) >= 0);
+        assert_se(in_addr_from_string(AF_INET, "192.168.0.2", &b) >= 0);
+        assert_se(in_addr_from_string(AF_INET, "192.168.0.3", &c) >= 0);
+        assert_se(in_addr_from_string(AF_INET6, "0:0:0:0:0:FFFF:204.152.189.116", &d) >= 0);
+        assert_se(in_addr_from_string(AF_INET6, "::1", &e) >= 0);
+        assert_se(in_addr_from_string(AF_INET6, "1:0:0:0:0:0:0:8", &f) >= 0);
+
+        assert_se((size = deserialize_in_addrs(&addresses, addresses_string)) >= 0);
+        assert_se(size == 3);
+        assert_se(in4_addr_equal(&a.in, &addresses[0]));
+        assert_se(in4_addr_equal(&b.in, &addresses[1]));
+        assert_se(in4_addr_equal(&c.in, &addresses[2]));
+
+        assert_se((size = deserialize_in6_addrs(&addresses6, addresses_string)) >= 0);
+        assert_se(size == 3);
+        assert_se(in6_addr_equal(&d.in6, &addresses6[0]));
+        assert_se(in6_addr_equal(&e.in6, &addresses6[1]));
+        assert_se(in6_addr_equal(&f.in6, &addresses6[2]));
+}
+
+static void test_deserialize_dhcp_routes(void) {
+        size_t size;
+
+        {
+                _cleanup_free_ struct sd_dhcp_route *routes = NULL;
+                assert_se(deserialize_dhcp_routes(&routes, &size, "") >= 0);
+                assert_se(size == 0);
+        }
+
+        {
+                /* no errors */
+                _cleanup_free_ struct sd_dhcp_route *routes = NULL;
+                const char *routes_string = "192.168.0.0/16,192.168.0.1 10.1.2.0/24,10.1.2.1 0.0.0.0/0,10.0.1.1";
+
+                assert_se(deserialize_dhcp_routes(&routes, &size, routes_string) >= 0);
+
+                assert_se(size == 3);
+                assert_se(routes[0].dst_addr.s_addr == inet_addr("192.168.0.0"));
+                assert_se(routes[0].gw_addr.s_addr == inet_addr("192.168.0.1"));
+
assert_se(routes[0].dst_prefixlen == 16); + + assert_se(routes[1].dst_addr.s_addr == inet_addr("10.1.2.0")); + assert_se(routes[1].gw_addr.s_addr == inet_addr("10.1.2.1")); + assert_se(routes[1].dst_prefixlen == 24); + + assert_se(routes[2].dst_addr.s_addr == inet_addr("0.0.0.0")); + assert_se(routes[2].gw_addr.s_addr == inet_addr("10.0.1.1")); + assert_se(routes[2].dst_prefixlen == 0); + } + + { + /* error in second word */ + _cleanup_free_ struct sd_dhcp_route *routes = NULL; + const char *routes_string = "192.168.0.0/16,192.168.0.1 10.1.2.0#24,10.1.2.1 0.0.0.0/0,10.0.1.1"; + + assert_se(deserialize_dhcp_routes(&routes, &size, routes_string) >= 0); + + assert_se(size == 2); + assert_se(routes[0].dst_addr.s_addr == inet_addr("192.168.0.0")); + assert_se(routes[0].gw_addr.s_addr == inet_addr("192.168.0.1")); + assert_se(routes[0].dst_prefixlen == 16); + + assert_se(routes[1].dst_addr.s_addr == inet_addr("0.0.0.0")); + assert_se(routes[1].gw_addr.s_addr == inet_addr("10.0.1.1")); + assert_se(routes[1].dst_prefixlen == 0); + } + + { + /* error in every word */ + _cleanup_free_ struct sd_dhcp_route *routes = NULL; + const char *routes_string = "192.168.0.0/55,192.168.0.1 10.1.2.0#24,10.1.2.1 0.0.0.0/0,10.0.1.X"; + + assert_se(deserialize_dhcp_routes(&routes, &size, routes_string) >= 0); + assert_se(size == 0); + } +} + +static void test_route_tables_one(Manager *manager, const char *name, uint32_t number) { + _cleanup_free_ char *str = NULL, *expected = NULL, *num_str = NULL; + uint32_t t; + + if (!STR_IN_SET(name, "default", "main", "local")) { + assert_se(streq(hashmap_get(manager->route_table_names_by_number, UINT32_TO_PTR(number)), name)); + assert_se(PTR_TO_UINT32(hashmap_get(manager->route_table_numbers_by_name, name)) == number); + } + + assert_se(asprintf(&expected, "%s(%" PRIu32 ")", name, number) >= 0); + assert_se(manager_get_route_table_to_string(manager, number, /* append_num = */ true, &str) >= 0); + assert_se(streq(str, expected)); + + str = mfree(str); 
+ + assert_se(manager_get_route_table_to_string(manager, number, /* append_num = */ false, &str) >= 0); + assert_se(streq(str, name)); + + assert_se(manager_get_route_table_from_string(manager, name, &t) >= 0); + assert_se(t == number); + + assert_se(asprintf(&num_str, "%" PRIu32, number) >= 0); + assert_se(manager_get_route_table_from_string(manager, num_str, &t) >= 0); + assert_se(t == number); +} + +static void test_route_tables(Manager *manager) { + assert_se(config_parse_route_table_names("manager", "filename", 1, "section", 1, "RouteTable", 0, "hoge:123 foo:456 aaa:111", manager, manager) >= 0); + assert_se(config_parse_route_table_names("manager", "filename", 1, "section", 1, "RouteTable", 0, "bbb:11111 ccc:22222", manager, manager) >= 0); + assert_se(config_parse_route_table_names("manager", "filename", 1, "section", 1, "RouteTable", 0, "ddd:22222", manager, manager) >= 0); + + test_route_tables_one(manager, "hoge", 123); + test_route_tables_one(manager, "foo", 456); + test_route_tables_one(manager, "aaa", 111); + test_route_tables_one(manager, "bbb", 11111); + test_route_tables_one(manager, "ccc", 22222); + + assert_se(!hashmap_get(manager->route_table_numbers_by_name, "ddd")); + + test_route_tables_one(manager, "default", 253); + test_route_tables_one(manager, "main", 254); + test_route_tables_one(manager, "local", 255); + + assert_se(config_parse_route_table_names("manager", "filename", 1, "section", 1, "RouteTable", 0, "", manager, manager) >= 0); + assert_se(!manager->route_table_names_by_number); + assert_se(!manager->route_table_numbers_by_name); + + /* Invalid pairs */ + assert_se(config_parse_route_table_names("manager", "filename", 1, "section", 1, "RouteTable", 0, "main:123 default:333 local:999", manager, manager) >= 0); + assert_se(config_parse_route_table_names("manager", "filename", 1, "section", 1, "RouteTable", 0, "xxx:253 yyy:254 local:255", manager, manager) >= 0); + assert_se(config_parse_route_table_names("manager", "filename", 1, 
"section", 1, "RouteTable", 0, "1234:321 :567 hoge:foo aaa:-888", manager, manager) >= 0); + assert_se(!manager->route_table_names_by_number); + assert_se(!manager->route_table_numbers_by_name); + + test_route_tables_one(manager, "default", 253); + test_route_tables_one(manager, "main", 254); + test_route_tables_one(manager, "local", 255); +} + +static int test_load_config(Manager *manager) { + int r; +/* TODO: should_reload, is false if the config dirs do not exist, so + * so we can't do this test here, move it to a test for paths_check_timestamps + * directly + * + * assert_se(network_should_reload(manager) == true); +*/ + + r = manager_load_config(manager); + if (r == -EPERM) + return r; + assert_se(r >= 0); + + return 0; +} + +static void test_dhcp_hostname_shorten_overlong(void) { + int r; + + { + /* simple hostname, no actions, no errors */ + _cleanup_free_ char *shortened = NULL; + r = shorten_overlong("name1", &shortened); + assert_se(r == 0); + assert_se(streq("name1", shortened)); + } + + { + /* simple fqdn, no actions, no errors */ + _cleanup_free_ char *shortened = NULL; + r = shorten_overlong("name1.example.com", &shortened); + assert_se(r == 0); + assert_se(streq("name1.example.com", shortened)); + } + + { + /* overlong fqdn, cut to first dot, no errors */ + _cleanup_free_ char *shortened = NULL; + r = shorten_overlong("name1.test-dhcp-this-one-here-is-a-very-very-long-domain.example.com", &shortened); + assert_se(r == 1); + assert_se(streq("name1", shortened)); + } + + { + /* overlong hostname, cut to HOST_MAX_LEN, no errors */ + _cleanup_free_ char *shortened = NULL; + r = shorten_overlong("test-dhcp-this-one-here-is-a-very-very-long-hostname-without-domainname", &shortened); + assert_se(r == 1); + assert_se(streq("test-dhcp-this-one-here-is-a-very-very-long-hostname-without-dom", shortened)); + } + + { + /* overlong fqdn, cut to first dot, empty result error */ + _cleanup_free_ char *shortened = NULL; + r = 
shorten_overlong(".test-dhcp-this-one-here-is-a-very-very-long-hostname.example.com", &shortened); + assert_se(r == -EDOM); + assert_se(shortened == NULL); + } + +} + +int main(void) { + _cleanup_(manager_freep) Manager *manager = NULL; + int r; + + test_setup_logging(LOG_INFO); + + test_deserialize_in_addr(); + test_deserialize_dhcp_routes(); + test_dhcp_hostname_shorten_overlong(); + + assert_se(manager_new(&manager, /* test_mode = */ true) >= 0); + assert_se(manager_setup(manager) >= 0); + + test_route_tables(manager); + + r = test_load_config(manager); + if (r == -EPERM) + log_debug("Cannot load configuration, ignoring."); + else + assert_se(r == 0); + + assert_se(manager_enumerate(manager) >= 0); + return 0; +} diff --git a/src/network/test-networkd-address.c b/src/network/test-networkd-address.c new file mode 100644 index 0000000..a40c571 --- /dev/null +++ b/src/network/test-networkd-address.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "networkd-address.h" +#include "tests.h" +#include "time-util.h" + +static void test_FORMAT_LIFETIME_one(usec_t lifetime, const char *expected) { + const char *t = FORMAT_LIFETIME(lifetime); + + log_debug(USEC_FMT " → \"%s\" (expected \"%s\")", lifetime, t, expected); + assert_se(streq(t, expected)); +} + +TEST(FORMAT_LIFETIME) { + usec_t now_usec; + + now_usec = now(CLOCK_BOOTTIME); + + test_FORMAT_LIFETIME_one(now_usec, "for 0"); + test_FORMAT_LIFETIME_one(usec_add(now_usec, 2 * USEC_PER_SEC - 1), "for 1s"); + test_FORMAT_LIFETIME_one(usec_add(now_usec, 3 * USEC_PER_WEEK + USEC_PER_SEC - 1), "for 3w"); + test_FORMAT_LIFETIME_one(USEC_INFINITY, "forever"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/network/test-networkd-conf.c b/src/network/test-networkd-conf.c new file mode 100644 index 0000000..808db99 --- /dev/null +++ b/src/network/test-networkd-conf.c @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "hexdecoct.h" +#include "log.h" +#include 
"macro.h" +#include "net-condition.h" +#include "networkd-address.h" +#include "networkd-conf.h" +#include "networkd-manager.h" +#include "networkd-network.h" +#include "strv.h" +#include "tests.h" + +static void test_config_parse_duid_type_one(const char *rvalue, int ret, DUIDType expected, usec_t expected_time) { + DUID actual = {}; + int r; + + r = config_parse_duid_type("network", "filename", 1, "section", 1, "lvalue", 0, rvalue, &actual, NULL); + log_info_errno(r, "\"%s\" → %d (%m)", rvalue, actual.type); + assert_se(r == ret); + assert_se(expected == actual.type); + if (expected == DUID_TYPE_LLT) + assert_se(expected_time == actual.llt_time); +} + +TEST(config_parse_duid_type) { + test_config_parse_duid_type_one("", 0, 0, 0); + test_config_parse_duid_type_one("link-layer-time", 0, DUID_TYPE_LLT, 0); + test_config_parse_duid_type_one("link-layer-time:2000-01-01 00:00:00 UTC", 0, DUID_TYPE_LLT, (usec_t) 946684800000000); + test_config_parse_duid_type_one("vendor", 0, DUID_TYPE_EN, 0); + test_config_parse_duid_type_one("vendor:2000-01-01 00:00:00 UTC", 0, 0, 0); + test_config_parse_duid_type_one("link-layer", 0, DUID_TYPE_LL, 0); + test_config_parse_duid_type_one("link-layer:2000-01-01 00:00:00 UTC", 0, 0, 0); + test_config_parse_duid_type_one("uuid", 0, DUID_TYPE_UUID, 0); + test_config_parse_duid_type_one("uuid:2000-01-01 00:00:00 UTC", 0, 0, 0); + test_config_parse_duid_type_one("foo", 0, 0, 0); + test_config_parse_duid_type_one("foo:2000-01-01 00:00:00 UTC", 0, 0, 0); +} + +static void test_config_parse_duid_rawdata_one(const char *rvalue, int ret, const DUID* expected) { + DUID actual = {}; + int r; + _cleanup_free_ char *d = NULL; + + r = config_parse_duid_rawdata("network", "filename", 1, "section", 1, "lvalue", 0, rvalue, &actual, NULL); + d = hexmem(actual.raw_data, actual.raw_data_len); + log_info_errno(r, "\"%s\" → \"%s\" (%m)", + rvalue, strnull(d)); + assert_se(r == ret); + if (expected) { + assert_se(actual.raw_data_len == expected->raw_data_len); 
+ assert_se(memcmp(actual.raw_data, expected->raw_data, expected->raw_data_len) == 0); + } +} + +static void test_config_parse_ether_addr_one(const char *rvalue, int ret, const struct ether_addr* expected) { + struct ether_addr *actual = NULL; + int r; + + r = config_parse_ether_addr("network", "filename", 1, "section", 1, "lvalue", 0, rvalue, &actual, NULL); + assert_se(ret == r); + if (expected) { + assert_se(actual); + assert_se(ether_addr_equal(expected, actual)); + } else + assert_se(actual == NULL); + + free(actual); +} + +static void test_config_parse_ether_addrs_one(const char *rvalue, const struct ether_addr* list, size_t n) { + _cleanup_set_free_free_ Set *s = NULL; + + assert_se(config_parse_ether_addrs("network", "filename", 1, "section", 1, "lvalue", 0, rvalue, &s, NULL) == 0); + assert_se(set_size(s) == n); + + for (size_t m = 0; m < n; m++) { + _cleanup_free_ struct ether_addr *q = NULL; + + assert_se(q = set_remove(s, &list[m])); + } + + assert_se(set_size(s) == 0); +} + +#define STR_OK \ + "00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:" \ + "10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f:" \ + "20:21:22:23:24:25:26:27:28:29:2a:2b:2c:2d:2e:2f:" \ + "30:31:32:33:34:35:36:37:38:39:3a:3b:3c:3d:3e:3f:" \ + "40:41:42:43:44:45:46:47:48:49:4a:4b:4c:4d:4e:4f:" \ + "50:51:52:53:54:55:56:57:58:59:5a:5b:5c:5d:5e:5f:" \ + "60:61:62:63:64:65:66:67:68:69:6a:6b:6c:6d:6e:6f:" \ + "70:71:72:73:74:75:76:77:78:79:7a:7b:7c:7d:7e:7f" +#define STR_TOO_LONG \ + "00:01:02:03:04:05:06:07:08:09:0a:0b:0c:0d:0e:0f:" \ + "10:11:12:13:14:15:16:17:18:19:1a:1b:1c:1d:1e:1f:" \ + "20:21:22:23:24:25:26:27:28:29:2a:2b:2c:2d:2e:2f:" \ + "30:31:32:33:34:35:36:37:38:39:3a:3b:3c:3d:3e:3f:" \ + "40:41:42:43:44:45:46:47:48:49:4a:4b:4c:4d:4e:4f:" \ + "50:51:52:53:54:55:56:57:58:59:5a:5b:5c:5d:5e:5f:" \ + "60:61:62:63:64:65:66:67:68:69:6a:6b:6c:6d:6e:6f:" \ + "70:71:72:73:74:75:76:77:78:79:7a:7b:7c:7d:7e:7f:" \ + "80" + +#define BYTES_OK { \ + 
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \ + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, \ + 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, \ + 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, \ + 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f, \ + 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, \ + 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, \ + 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, \ +} + +TEST(config_parse_duid_rawdata) { + test_config_parse_duid_rawdata_one("", 0, &(DUID){}); + test_config_parse_duid_rawdata_one("00:11:22:33:44:55:66:77", 0, + &(DUID){0, 8, {0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77}}); + test_config_parse_duid_rawdata_one("00:11:22:", 0, + &(DUID){0, 3, {0x00,0x11,0x22}}); + test_config_parse_duid_rawdata_one("000:11:22", 0, &(DUID){}); /* error, output is all zeros */ + test_config_parse_duid_rawdata_one("00:111:22", 0, &(DUID){}); + test_config_parse_duid_rawdata_one("0:1:2:3:4:5:6:7", 0, + &(DUID){0, 8, {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7}}); + test_config_parse_duid_rawdata_one("11::", 0, &(DUID){0, 1, {0x11}}); /* FIXME: should this be an error? 
*/ + test_config_parse_duid_rawdata_one("abcdef", 0, &(DUID){}); + test_config_parse_duid_rawdata_one(STR_TOO_LONG, 0, &(DUID){}); + test_config_parse_duid_rawdata_one(STR_OK, 0, &(DUID){0, 128, BYTES_OK}); +} + +TEST(config_parse_ether_addr) { + const struct ether_addr t[] = { + { .ether_addr_octet = { 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff } }, + { .ether_addr_octet = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab } }, + }; + + test_config_parse_ether_addr_one("", 0, NULL); + test_config_parse_ether_addr_one("no:ta:ma:ca:dd:re", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:dd:ee:fx", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:dd:ee:ff", 0, &t[0]); + test_config_parse_ether_addr_one(" aa:bb:cc:dd:ee:ff", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:dd:ee:ff \t\n", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:dd:ee:ff \t\nxxx", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc: dd:ee:ff", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:d d:ee:ff", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:dd:ee", 0, NULL); + test_config_parse_ether_addr_one("9:aa:bb:cc:dd:ee:ff", 0, NULL); + test_config_parse_ether_addr_one("aa:bb:cc:dd:ee:ff:gg", 0, NULL); + test_config_parse_ether_addr_one("aa:Bb:CC:dd:ee:ff", 0, &t[0]); + test_config_parse_ether_addr_one("01:23:45:67:89:aB", 0, &t[1]); + test_config_parse_ether_addr_one("1:23:45:67:89:aB", 0, &t[1]); + test_config_parse_ether_addr_one("aa-bb-cc-dd-ee-ff", 0, &t[0]); + test_config_parse_ether_addr_one("AA-BB-CC-DD-EE-FF", 0, &t[0]); + test_config_parse_ether_addr_one("01-23-45-67-89-ab", 0, &t[1]); + test_config_parse_ether_addr_one("aabb.ccdd.eeff", 0, &t[0]); + test_config_parse_ether_addr_one("0123.4567.89ab", 0, &t[1]); + test_config_parse_ether_addr_one("123.4567.89ab.", 0, NULL); + test_config_parse_ether_addr_one("aabbcc.ddeeff", 0, NULL); + test_config_parse_ether_addr_one("aabbccddeeff", 0, NULL); + test_config_parse_ether_addr_one("aabbccddee:ff", 0, NULL); + 
test_config_parse_ether_addr_one("012345.6789ab", 0, NULL); + test_config_parse_ether_addr_one("123.4567.89ab", 0, &t[1]); + + test_config_parse_ether_addrs_one("", t, 0); + test_config_parse_ether_addrs_one("no:ta:ma:ca:dd:re", t, 0); + test_config_parse_ether_addrs_one("aa:bb:cc:dd:ee:fx", t, 0); + test_config_parse_ether_addrs_one("aa:bb:cc:dd:ee:ff", t, 1); + test_config_parse_ether_addrs_one(" aa:bb:cc:dd:ee:ff", t, 1); + test_config_parse_ether_addrs_one("aa:bb:cc:dd:ee:ff \t\n", t, 1); + test_config_parse_ether_addrs_one("aa:bb:cc:dd:ee:ff \t\nxxx", t, 1); + test_config_parse_ether_addrs_one("aa:bb:cc: dd:ee:ff", t, 0); + test_config_parse_ether_addrs_one("aa:bb:cc:d d:ee:ff", t, 0); + test_config_parse_ether_addrs_one("aa:bb:cc:dd:ee", t, 0); + test_config_parse_ether_addrs_one("9:aa:bb:cc:dd:ee:ff", t, 0); + test_config_parse_ether_addrs_one("aa:bb:cc:dd:ee:ff:gg", t, 0); + test_config_parse_ether_addrs_one("aa:Bb:CC:dd:ee:ff", t, 1); + test_config_parse_ether_addrs_one("01:23:45:67:89:aB", &t[1], 1); + test_config_parse_ether_addrs_one("1:23:45:67:89:aB", &t[1], 1); + test_config_parse_ether_addrs_one("aa-bb-cc-dd-ee-ff", t, 1); + test_config_parse_ether_addrs_one("AA-BB-CC-DD-EE-FF", t, 1); + test_config_parse_ether_addrs_one("01-23-45-67-89-ab", &t[1], 1); + test_config_parse_ether_addrs_one("aabb.ccdd.eeff", t, 1); + test_config_parse_ether_addrs_one("0123.4567.89ab", &t[1], 1); + test_config_parse_ether_addrs_one("123.4567.89ab.", t, 0); + test_config_parse_ether_addrs_one("aabbcc.ddeeff", t, 0); + test_config_parse_ether_addrs_one("aabbccddeeff", t, 0); + test_config_parse_ether_addrs_one("aabbccddee:ff", t, 0); + test_config_parse_ether_addrs_one("012345.6789ab", t, 0); + test_config_parse_ether_addrs_one("123.4567.89ab", &t[1], 1); + + test_config_parse_ether_addrs_one("123.4567.89ab aa:bb:cc:dd:ee:ff 01-23-45-67-89-ab aa:Bb:CC:dd:ee:ff", t, 2); + test_config_parse_ether_addrs_one("123.4567.89ab aa:bb:cc:dd:ee:fx hogehoge 01-23-45-67-89-ab aaaa 
aa:Bb:CC:dd:ee:ff", t, 2); +} + +static void test_config_parse_address_one(const char *rvalue, int family, unsigned n_addresses, const union in_addr_union *u, unsigned char prefixlen) { + _cleanup_(manager_freep) Manager *manager = NULL; + _cleanup_(network_unrefp) Network *network = NULL; + + assert_se(manager_new(&manager, /* test_mode = */ true) >= 0); + assert_se(network = new0(Network, 1)); + network->n_ref = 1; + network->manager = manager; + assert_se(network->filename = strdup("hogehoge.network")); + + assert_se(config_parse_match_ifnames("network", "filename", 1, "section", 1, "Name", 0, "*", &network->match.ifname, network) == 0); + assert_se(config_parse_address("network", "filename", 1, "section", 1, "Address", 0, rvalue, network, network) == 0); + assert_se(ordered_hashmap_size(network->addresses_by_section) == 1); + assert_se(network_verify(network) >= 0); + assert_se(ordered_hashmap_size(network->addresses_by_section) == n_addresses); + if (n_addresses > 0) { + Address *a; + + assert_se(a = ordered_hashmap_first(network->addresses_by_section)); + assert_se(a->prefixlen == prefixlen); + assert_se(a->family == family); + assert_se(in_addr_equal(family, &a->in_addr, u)); + /* TODO: check Address.in_addr and Address.broadcast */ + } +} + +TEST(config_parse_address) { + test_config_parse_address_one("", AF_INET, 0, NULL, 0); + test_config_parse_address_one("/", AF_INET, 0, NULL, 0); + test_config_parse_address_one("/8", AF_INET, 0, NULL, 0); + test_config_parse_address_one("1.2.3.4", AF_INET, 1, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 32); + test_config_parse_address_one("1.2.3.4/0", AF_INET, 1, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 0); + test_config_parse_address_one("1.2.3.4/1", AF_INET, 1, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 1); + test_config_parse_address_one("1.2.3.4/2", AF_INET, 1, &(union in_addr_union) { .in 
= (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 2); + test_config_parse_address_one("1.2.3.4/32", AF_INET, 1, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 32); + test_config_parse_address_one("1.2.3.4/33", AF_INET, 0, NULL, 0); + test_config_parse_address_one("1.2.3.4/-1", AF_INET, 0, NULL, 0); + + test_config_parse_address_one("", AF_INET6, 0, NULL, 0); + test_config_parse_address_one("/", AF_INET6, 0, NULL, 0); + test_config_parse_address_one("/8", AF_INET6, 0, NULL, 0); + test_config_parse_address_one("::1", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 128); + test_config_parse_address_one("::1/0", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 0); + test_config_parse_address_one("::1/1", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 1); + test_config_parse_address_one("::1/2", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 2); + test_config_parse_address_one("::1/32", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 32); + test_config_parse_address_one("::1/33", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 33); + test_config_parse_address_one("::1/64", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 64); + test_config_parse_address_one("::1/128", AF_INET6, 1, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 128); + test_config_parse_address_one("::1/129", AF_INET6, 0, NULL, 0); + test_config_parse_address_one("::1/-1", AF_INET6, 0, NULL, 0); +} + +TEST(config_parse_match_ifnames) { + _cleanup_strv_free_ char **names = NULL; + + assert_se(config_parse_match_ifnames("network", "filename", 1, "section", 1, "Name", 0, "!hoge hogehoge foo", &names, NULL) == 0); + assert_se(config_parse_match_ifnames("network", "filename", 1, "section", 1, "Name", 0, "!baz", &names, NULL) == 0); + assert_se(config_parse_match_ifnames("network", "filename", 1, 
"section", 1, "Name", 0, "aaa bbb ccc", &names, NULL) == 0); + + assert_se(strv_equal(names, STRV_MAKE("!hoge", "!hogehoge", "!foo", "!baz", "aaa", "bbb", "ccc"))); +} + +TEST(config_parse_match_strv) { + _cleanup_strv_free_ char **names = NULL; + + assert_se(config_parse_match_strv("network", "filename", 1, "section", 1, "Name", 0, "!hoge hogehoge foo", &names, NULL) == 0); + assert_se(config_parse_match_strv("network", "filename", 1, "section", 1, "Name", 0, "!baz", &names, NULL) == 0); + assert_se(config_parse_match_strv("network", "filename", 1, "section", 1, "Name", 0, + "KEY=val \"KEY2=val with space\" \"KEY3=val with \\\"quotation\\\"\"", &names, NULL) == 0); + + assert_se(strv_equal(names, + STRV_MAKE("!hoge", + "!hogehoge", + "!foo", + "!baz", + "KEY=val", + "KEY2=val with space", + "KEY3=val with \\quotation\\"))); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/network/test-networkd-util.c b/src/network/test-networkd-util.c new file mode 100644 index 0000000..f29ca2c --- /dev/null +++ b/src/network/test-networkd-util.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "networkd-util.h" +#include "tests.h" + +TEST(network_config_state_to_string_alloc) { + for (unsigned i = 1; i <= NETWORK_CONFIG_STATE_REMOVING; i <<= 1) { + _cleanup_free_ char *x; + + assert_se(network_config_state_to_string_alloc(i, &x) == 0); + log_debug("%u → %s", i, x); + } + + _cleanup_free_ char *x; + assert_se(network_config_state_to_string_alloc(~0u, &x) == 0); + log_debug("%u → %s", ~0u, x); +}; + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/network/wait-online/link.c b/src/network/wait-online/link.c new file mode 100644 index 0000000..a8ab7f5 --- /dev/null +++ b/src/network/wait-online/link.c @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-network.h" + +#include "alloc-util.h" +#include "format-util.h" +#include "hashmap.h" +#include "link.h" +#include "manager.h" +#include "string-util.h" +#include "strv.h" + 
+int link_new(Manager *m, Link **ret, int ifindex, const char *ifname) { + _cleanup_(link_freep) Link *l = NULL; + _cleanup_free_ char *n = NULL; + int r; + + assert(m); + assert(ifindex > 0); + assert(ifname); + + n = strdup(ifname); + if (!n) + return -ENOMEM; + + l = new(Link, 1); + if (!l) + return -ENOMEM; + + *l = (Link) { + .manager = m, + .ifname = TAKE_PTR(n), + .ifindex = ifindex, + .required_operstate = LINK_OPERSTATE_RANGE_DEFAULT, + }; + + r = hashmap_ensure_put(&m->links_by_index, NULL, INT_TO_PTR(ifindex), l); + if (r < 0) + return r; + + r = hashmap_ensure_put(&m->links_by_name, &string_hash_ops, l->ifname, l); + if (r < 0) + return r; + + if (ret) + *ret = l; + + TAKE_PTR(l); + return 0; +} + +Link *link_free(Link *l) { + + if (!l) + return NULL; + + if (l->manager) { + hashmap_remove(l->manager->links_by_index, INT_TO_PTR(l->ifindex)); + hashmap_remove(l->manager->links_by_name, l->ifname); + + STRV_FOREACH(n, l->altnames) + hashmap_remove(l->manager->links_by_name, *n); + } + + free(l->state); + free(l->ifname); + strv_free(l->altnames); + return mfree(l); +} + +static int link_update_name(Link *l, sd_netlink_message *m) { + char ifname_from_index[IF_NAMESIZE]; + const char *ifname; + int r; + + assert(l); + assert(l->manager); + assert(m); + + r = sd_netlink_message_read_string(m, IFLA_IFNAME, &ifname); + if (r == -ENODATA) + /* Hmm? But ok. */ + return 0; + if (r < 0) + return r; + + if (streq(ifname, l->ifname)) + return 0; + + /* The kernel sometimes sends wrong ifname change. Let's confirm the received name. */ + r = format_ifname(l->ifindex, ifname_from_index); + if (r < 0) + return r; + + if (!streq(ifname, ifname_from_index)) { + log_link_debug(l, "New interface name '%s' received from the kernel does not correspond " + "with the name currently configured on the actual interface '%s'. 
Ignoring.", + ifname, ifname_from_index); + return 0; + } + + hashmap_remove(l->manager->links_by_name, l->ifname); + + r = free_and_strdup(&l->ifname, ifname); + if (r < 0) + return r; + + r = hashmap_ensure_put(&l->manager->links_by_name, &string_hash_ops, l->ifname, l); + if (r < 0) + return r; + + return 0; +} + +static int link_update_altnames(Link *l, sd_netlink_message *m) { + _cleanup_strv_free_ char **altnames = NULL; + int r; + + assert(l); + assert(l->manager); + assert(m); + + r = sd_netlink_message_read_strv(m, IFLA_PROP_LIST, IFLA_ALT_IFNAME, &altnames); + if (r == -ENODATA) + /* The message does not have IFLA_PROP_LIST container attribute. It does not mean the + * interface has no alternative name. */ + return 0; + if (r < 0) + return r; + + if (strv_equal(altnames, l->altnames)) + return 0; + + STRV_FOREACH(n, l->altnames) + hashmap_remove(l->manager->links_by_name, *n); + + strv_free_and_replace(l->altnames, altnames); + + STRV_FOREACH(n, l->altnames) { + r = hashmap_ensure_put(&l->manager->links_by_name, &string_hash_ops, *n, l); + if (r < 0) + return r; + } + + return 0; +} + +int link_update_rtnl(Link *l, sd_netlink_message *m) { + int r; + + assert(l); + assert(l->manager); + assert(m); + + r = sd_rtnl_message_link_get_flags(m, &l->flags); + if (r < 0) + return r; + + r = link_update_name(l, m); + if (r < 0) + return r; + + r = link_update_altnames(l, m); + if (r < 0) + return r; + + return 0; +} + +int link_update_monitor(Link *l) { + _cleanup_free_ char *required_operstate = NULL, *required_family = NULL, + *ipv4_address_state = NULL, *ipv6_address_state = NULL, *state = NULL; + int r, ret = 0; + + assert(l); + assert(l->ifname); + + r = sd_network_link_get_required_for_online(l->ifindex); + if (r < 0 && r != -ENODATA) + ret = log_link_debug_errno(l, r, "Failed to determine whether the link is required for online or not, " + "assuming required: %m"); + l->required_for_online = r != 0; + + r = 
sd_network_link_get_required_operstate_for_online(l->ifindex, &required_operstate); + if (r < 0 && r != -ENODATA) + ret = log_link_debug_errno(l, r, "Failed to get required operational state, ignoring: %m"); + + if (isempty(required_operstate)) + l->required_operstate = LINK_OPERSTATE_RANGE_DEFAULT; + else { + r = parse_operational_state_range(required_operstate, &l->required_operstate); + if (r < 0) + ret = log_link_debug_errno(l, SYNTHETIC_ERRNO(EINVAL), + "Failed to parse required operational state, ignoring: %m"); + } + + r = network_link_get_operational_state(l->ifindex, &l->operational_state); + if (r < 0) + ret = log_link_debug_errno(l, r, "Failed to get operational state, ignoring: %m"); + + r = sd_network_link_get_required_family_for_online(l->ifindex, &required_family); + if (r < 0 && r != -ENODATA) + ret = log_link_debug_errno(l, r, "Failed to get required address family, ignoring: %m"); + + if (isempty(required_family)) + l->required_family = ADDRESS_FAMILY_NO; + else { + AddressFamily f; + + f = link_required_address_family_from_string(required_family); + if (f < 0) + ret = log_link_debug_errno(l, f, "Failed to parse required address family, ignoring: %m"); + else + l->required_family = f; + } + + r = sd_network_link_get_ipv4_address_state(l->ifindex, &ipv4_address_state); + if (r < 0) + ret = log_link_debug_errno(l, r, "Failed to get IPv4 address state, ignoring: %m"); + else { + LinkAddressState s; + + s = link_address_state_from_string(ipv4_address_state); + if (s < 0) + ret = log_link_debug_errno(l, s, "Failed to parse IPv4 address state, ignoring: %m"); + else + l->ipv4_address_state = s; + } + + r = sd_network_link_get_ipv6_address_state(l->ifindex, &ipv6_address_state); + if (r < 0) + ret = log_link_debug_errno(l, r, "Failed to get IPv6 address state, ignoring: %m"); + else { + LinkAddressState s; + + s = link_address_state_from_string(ipv6_address_state); + if (s < 0) + ret = log_link_debug_errno(l, s, "Failed to parse IPv6 address state, 
ignoring: %m"); + else + l->ipv6_address_state = s; + } + + r = sd_network_link_get_setup_state(l->ifindex, &state); + if (r < 0) + ret = log_link_debug_errno(l, r, "Failed to get setup state, ignoring: %m"); + else + free_and_replace(l->state, state); + + return ret; +} diff --git a/src/network/wait-online/link.h b/src/network/wait-online/link.h new file mode 100644 index 0000000..5dc26d9 --- /dev/null +++ b/src/network/wait-online/link.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-netlink.h" + +#include "log-link.h" +#include "network-util.h" + +typedef struct Link Link; +typedef struct Manager Manager; + +struct Link { + Manager *manager; + + int ifindex; + char *ifname; + char **altnames; + unsigned flags; + + bool required_for_online; + LinkOperationalStateRange required_operstate; + LinkOperationalState operational_state; + AddressFamily required_family; + LinkAddressState ipv4_address_state; + LinkAddressState ipv6_address_state; + char *state; +}; + +int link_new(Manager *m, Link **ret, int ifindex, const char *ifname); +Link *link_free(Link *l); +int link_update_rtnl(Link *l, sd_netlink_message *m); +int link_update_monitor(Link *l); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_free); diff --git a/src/network/wait-online/manager.c b/src/network/wait-online/manager.c new file mode 100644 index 0000000..40a9fba --- /dev/null +++ b/src/network/wait-online/manager.c @@ -0,0 +1,441 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "link.h" +#include "manager.h" +#include "netlink-util.h" +#include "strv.h" +#include "time-util.h" + +static bool link_in_command_line_interfaces(Link *link, Manager *m) { + assert(link); + assert(m); + + if (hashmap_contains(m->command_line_interfaces_by_name, link->ifname)) + return true; + + STRV_FOREACH(n, link->altnames) + if (hashmap_contains(m->command_line_interfaces_by_name, *n)) + return true; + + 
return false; +} + +static bool manager_ignore_link(Manager *m, Link *link) { + assert(m); + assert(link); + + /* always ignore the loopback interface */ + if (link->flags & IFF_LOOPBACK) + return true; + + /* if interfaces are given on the command line, ignore all others */ + if (m->command_line_interfaces_by_name && + !link_in_command_line_interfaces(link, m)) + return true; + + if (!link->required_for_online) + return true; + + /* ignore interfaces we explicitly are asked to ignore */ + if (strv_fnmatch(m->ignored_interfaces, link->ifname)) + return true; + + STRV_FOREACH(n, link->altnames) + if (strv_fnmatch(m->ignored_interfaces, *n)) + return true; + + return false; +} + +static int manager_link_is_online(Manager *m, Link *l, LinkOperationalStateRange s) { + AddressFamily required_family; + bool needs_ipv4; + bool needs_ipv6; + + assert(m); + assert(l); + + /* This returns the following: + * -EAGAIN : not processed by udev + * -EBUSY : being processed by networkd + * -EADDRNOTAVAIL: requested conditions (operstate and/or addresses) are not satisfied + * false : unmanaged + * true : online */ + + if (!l->state || streq(l->state, "pending")) + /* If no state string exists, networkd (and possibly also udevd) has not detected the + * interface yet, that mean we cannot determine whether the interface is managed or + * not. Hence, return negative value. + * If the link is in pending state, then udevd has not processed the link, and networkd + * has not tried to find .network file for the link. Hence, return negative value. */ + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EAGAIN), + "link has not yet been processed by udev: setup state is %s.", + strna(l->state)); + + if (streq(l->state, "unmanaged")) { + /* If the link is in unmanaged state, then ignore the interface unless the interface is + * specified in '--interface/-i' option. 
*/ + if (!link_in_command_line_interfaces(l, m)) { + log_link_debug(l, "link is not managed by networkd."); + return false; + } + + } else if (!streq(l->state, "configured")) + /* If the link is in non-configured state, return negative value here. */ + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EBUSY), + "link is being processed by networkd: setup state is %s.", + l->state); + + if (s.min < 0) + s.min = m->required_operstate.min >= 0 ? m->required_operstate.min + : l->required_operstate.min; + + if (s.max < 0) + s.max = m->required_operstate.max >= 0 ? m->required_operstate.max + : l->required_operstate.max; + + if (l->operational_state < s.min || l->operational_state > s.max) + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "Operational state '%s' is not in range ['%s':'%s']", + link_operstate_to_string(l->operational_state), + link_operstate_to_string(s.min), link_operstate_to_string(s.max)); + + required_family = m->required_family > 0 ? m->required_family : l->required_family; + needs_ipv4 = required_family & ADDRESS_FAMILY_IPV4; + needs_ipv6 = required_family & ADDRESS_FAMILY_IPV6; + + if (s.min < LINK_OPERSTATE_ROUTABLE) { + if (needs_ipv4 && l->ipv4_address_state < LINK_ADDRESS_STATE_DEGRADED) + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "No routable or link-local IPv4 address is configured."); + + if (needs_ipv6 && l->ipv6_address_state < LINK_ADDRESS_STATE_DEGRADED) + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "No routable or link-local IPv6 address is configured."); + } else { + if (needs_ipv4 && l->ipv4_address_state < LINK_ADDRESS_STATE_ROUTABLE) + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "No routable IPv4 address is configured."); + + if (needs_ipv6 && l->ipv6_address_state < LINK_ADDRESS_STATE_ROUTABLE) + return log_link_debug_errno(l, SYNTHETIC_ERRNO(EADDRNOTAVAIL), + "No routable IPv6 address is configured."); + } + + log_link_debug(l, "link is configured by 
networkd and online."); + return true; +} + +bool manager_configured(Manager *m) { + Link *l; + int r; + + if (!hashmap_isempty(m->command_line_interfaces_by_name)) { + LinkOperationalStateRange *range; + const char *ifname; + + /* wait for all the links given on the command line to appear */ + HASHMAP_FOREACH_KEY(range, ifname, m->command_line_interfaces_by_name) { + + l = hashmap_get(m->links_by_name, ifname); + if (!l) { + if (range->min == LINK_OPERSTATE_MISSING) { + if (m->any) + return true; + } else { + log_debug("still waiting for %s", ifname); + if (!m->any) + return false; + } + continue; + } + + r = manager_link_is_online(m, l, *range); + if (r <= 0 && !m->any) + return false; + if (r > 0 && m->any) + return true; + } + + /* With '--any' : no interface is ready → return false + * Without '--any': all interfaces are ready → return true */ + return !m->any; + } + + /* wait for all links networkd manages */ + bool has_online = false; + HASHMAP_FOREACH(l, m->links_by_index) { + if (manager_ignore_link(m, l)) { + log_link_debug(l, "link is ignored"); + continue; + } + + r = manager_link_is_online(m, l, + (LinkOperationalStateRange) { _LINK_OPERSTATE_INVALID, + _LINK_OPERSTATE_INVALID }); + /* Unlike the above loop, unmanaged interfaces are ignored here. Also, Configured but offline + * interfaces are ignored. See issue #29506. */ + if (r < 0 && r != -EADDRNOTAVAIL && !m->any) + return false; + if (r > 0) { + if (m->any) + return true; + has_online = true; + } + } + + /* With '--any' : no interface is ready → return false + * Without '--any': all interfaces are ready or unmanaged + * + * In this stage, drivers for interfaces may not be loaded yet, and there may be only lo. + * To avoid that wait-online exits earlier than that drivers are loaded, let's request at least one + * managed online interface exists. See issue #27822. 
*/ + return !m->any && has_online; +} + +static int manager_process_link(sd_netlink *rtnl, sd_netlink_message *mm, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + uint16_t type; + Link *l; + const char *ifname; + int ifindex, r; + + assert(rtnl); + assert(mm); + + r = sd_netlink_message_get_type(mm, &type); + if (r < 0) { + log_warning_errno(r, "rtnl: Could not get message type, ignoring: %m"); + return 0; + } + + r = sd_rtnl_message_link_get_ifindex(mm, &ifindex); + if (r < 0) { + log_warning_errno(r, "rtnl: Could not get ifindex from link, ignoring: %m"); + return 0; + } else if (ifindex <= 0) { + log_warning("rtnl: received link message with invalid ifindex %d, ignoring", ifindex); + return 0; + } + + r = sd_netlink_message_read_string(mm, IFLA_IFNAME, &ifname); + if (r < 0) { + log_warning_errno(r, "rtnl: Received link message without ifname, ignoring: %m"); + return 0; + } + + l = hashmap_get(m->links_by_index, INT_TO_PTR(ifindex)); + + switch (type) { + + case RTM_NEWLINK: + if (!l) { + log_debug("Found link %s(%i)", ifname, ifindex); + + r = link_new(m, &l, ifindex, ifname); + if (r < 0) { + log_warning_errno(r, "Failed to create link object for %s(%i), ignoring: %m", ifname, ifindex); + return 0; + } + } + + r = link_update_rtnl(l, mm); + if (r < 0) + log_link_warning_errno(l, r, "Failed to process RTNL link message, ignoring: %m"); + + r = link_update_monitor(l); + if (r < 0) + log_link_full_errno(l, IN_SET(r, -ENODATA, -ENOENT) ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to update link state, ignoring: %m"); + + break; + + case RTM_DELLINK: + if (l) { + log_link_debug(l, "Removing link"); + link_free(l); + } + + break; + } + + return 0; +} + +static int on_rtnl_event(sd_netlink *rtnl, sd_netlink_message *mm, void *userdata) { /* Match callback registered by manager_rtnl_listen() for RTM_NEWLINK and RTM_DELLINK: updates the link table from the message, then asks the event loop to exit with code 0 once manager_configured() reports true. */ + Manager *m = userdata; + int r; + + r = manager_process_link(rtnl, mm, m); + if (r < 0) + return r; + + if (manager_configured(m)) + sd_event_exit(m->event, 0); + + return 1; +} + +static int manager_rtnl_listen(Manager *m) { /* Opens the rtnetlink connection, attaches it to m->event, subscribes to link add/remove notifications, then dumps the links that already exist and feeds every reply message to manager_process_link(). Returns a negative errno-style value from the first call that fails. */ + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + int r; + + assert(m); + + /* First, subscribe to interfaces coming and going */ + r = sd_netlink_open(&m->rtnl); + if (r < 0) + return r; + + r = sd_netlink_attach_event(m->rtnl, m->event, 0); + if (r < 0) + return r; + + r = sd_netlink_add_match(m->rtnl, NULL, RTM_NEWLINK, on_rtnl_event, NULL, m, "wait-online-on-NEWLINK"); + if (r < 0) + return r; + + r = sd_netlink_add_match(m->rtnl, NULL, RTM_DELLINK, on_rtnl_event, NULL, m, "wait-online-on-DELLINK"); + if (r < 0) + return r; + + /* Then, enumerate all links */ + r = sd_rtnl_message_new_link(m->rtnl, &req, RTM_GETLINK, 0); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(m->rtnl, req, 0, &reply); + if (r < 0) + return r; + + for (sd_netlink_message *i = reply; i; i = sd_netlink_message_next(i)) { /* the dump reply is a chain of messages, one per link */ + r = manager_process_link(m->rtnl, i, m); + if (r < 0) + return r; + } + + return r; /* non-negative here: either the result of sd_netlink_call() or of the last manager_process_link() */ +} + +static int on_network_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { /* I/O callback for the network monitor fd (registered by manager_network_monitor_listen()): calls link_update_monitor() on every known link, only logging failures, and exits the event loop with code 0 once manager_configured() reports true. */ + Manager *m = ASSERT_PTR(userdata); + Link *l; + int r; + + sd_network_monitor_flush(m->network_monitor); + + HASHMAP_FOREACH(l, m->links_by_index) { + r = link_update_monitor(l); + if (r < 0) + log_link_full_errno(l, IN_SET(r, -ENODATA, -ENOENT) ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to update link state, ignoring: %m"); + } + + if (manager_configured(m)) + sd_event_exit(m->event, 0); + + return 0; +} + +static int manager_network_monitor_listen(Manager *m) { /* Creates m->network_monitor and registers its file descriptor, with the event mask the monitor reports, as an I/O source on m->event that dispatches to on_network_event(). Returns 0 on success or a negative errno-style value on failure. */ + int r, fd, events; + + assert(m); + + r = sd_network_monitor_new(&m->network_monitor, NULL); + if (r < 0) + return r; + + fd = sd_network_monitor_get_fd(m->network_monitor); + if (fd < 0) + return fd; + + events = sd_network_monitor_get_events(m->network_monitor); + if (events < 0) + return events; + + r = sd_event_add_io(m->event, &m->network_monitor_event_source, + fd, events, &on_network_event, m); + if (r < 0) + return r; + + return 0; +} + +int manager_new(Manager **ret, + Hashmap *command_line_interfaces_by_name, + char **ignored_interfaces, + LinkOperationalStateRange required_operstate, + AddressFamily required_family, + bool any, + usec_t timeout) { /* Allocates a Manager, records the given settings, sets up the event loop (signals, optional timeout, watchdog) and starts listening on the network monitor and on rtnetlink. On success stores the object in *ret and returns 0; on failure returns a negative errno-style value and the partially set-up object is released through the manager_freep cleanup handler. */ + + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .command_line_interfaces_by_name = command_line_interfaces_by_name, /* borrowed from the caller: manager_free() does not release it */ + .ignored_interfaces = ignored_interfaces, /* borrowed from the caller as well */ + .required_operstate = required_operstate, + .required_family = required_family, + .any = any, + }; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); /* best effort, result deliberately ignored. NOTE(review): a NULL handler presumably makes the signal terminate the loop - confirm against sd_event_add_signal(3) */ + (void) sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + + if (timeout > 0) { /* a timeout of 0 arms no timer at all */ + r = sd_event_add_time_relative(m->event, NULL, CLOCK_BOOTTIME, timeout, 0, NULL, INT_TO_PTR(-ETIMEDOUT)); /* NOTE(review): with a NULL handler the userdata presumably becomes the loop exit code, which run() then reports as a timeout - confirm against sd_event_add_time(3) */ + if (r < 0 && r != -EOVERFLOW) /* -EOVERFLOW is tolerated on purpose; presumably the deadline is not representable and we simply wait without one - TODO confirm */ + return r; + } + + sd_event_set_watchdog(m->event, true); + + r = manager_network_monitor_listen(m); + if (r < 0) + return r; + + r = manager_rtnl_listen(m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + + return 0; +} + +Manager* manager_free(Manager *m) { /* Releases a Manager and the objects it owns; a NULL argument is accepted and yields NULL. */ + if (!m) + return NULL; + + hashmap_free_with_destructor(m->links_by_index, link_free); /* the Link objects are destroyed through this map ... */ + hashmap_free(m->links_by_name); /* ... so only the container is released here */ + + 
sd_event_source_unref(m->network_monitor_event_source); + sd_network_monitor_unref(m->network_monitor); + sd_event_source_unref(m->rtnl_event_source); + sd_netlink_unref(m->rtnl); + sd_event_unref(m->event); + + return mfree(m); +} diff --git a/src/network/wait-online/manager.h b/src/network/wait-online/manager.h new file mode 100644 index 0000000..01ad18f --- /dev/null +++ b/src/network/wait-online/manager.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" +#include "sd-netlink.h" +#include "sd-network.h" + +#include "hashmap.h" +#include "network-util.h" +#include "time-util.h" + +typedef struct Manager Manager; +typedef struct Link Link; + +struct Manager { /* State of one wait-online run: the known links, the conditions requested on the command line, and the event sources that keep both up to date. */ + Hashmap *links_by_index; /* looked up with INT_TO_PTR(ifindex); manager_free() destroys the Link objects through this map */ + Hashmap *links_by_name; /* looked up by interface name; manager_free() releases only the container */ + + /* Do not free the two members below. */ + Hashmap *command_line_interfaces_by_name; /* interface name -> LinkOperationalStateRange*, as iterated in manager_configured() */ + char **ignored_interfaces; /* patterns matched with strv_fnmatch() against names and altnames */ + + LinkOperationalStateRange required_operstate; /* global range; a bound >= 0 takes precedence over the per-link bound */ + AddressFamily required_family; /* global requirement; a value > 0 takes precedence over the per-link one */ + bool any; /* true: one online link is enough; false: every considered link must be ready */ + + sd_netlink *rtnl; + sd_event_source *rtnl_event_source; /* NOTE(review): only unref'd in manager_free(); no assignment is visible in manager.c - possibly vestigial, confirm */ + + sd_network_monitor *network_monitor; + sd_event_source *network_monitor_event_source; + + sd_event *event; +}; + +Manager* manager_free(Manager *m); +int manager_new(Manager **ret, Hashmap *command_line_interfaces_by_name, char **ignored_interfaces, + LinkOperationalStateRange required_operstate, + AddressFamily required_family, + bool any, usec_t timeout); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +bool manager_configured(Manager *m); diff --git a/src/network/wait-online/wait-online.c b/src/network/wait-online/wait-online.c new file mode 100644 index 0000000..5328bba --- /dev/null +++ b/src/network/wait-online/wait-online.c @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-daemon.h" + +#include "build.h" +#include "daemon-util.h" +#include "main-func.h" +#include "manager.h" +#include "pretty-print.h" +#include "signal-util.h" +#include 
"socket-util.h" +#include "strv.h" + +static bool arg_quiet = false; +static usec_t arg_timeout = 120 * USEC_PER_SEC; +static Hashmap *arg_interfaces = NULL; +static char **arg_ignore = NULL; +static LinkOperationalStateRange arg_required_operstate = { _LINK_OPERSTATE_INVALID, _LINK_OPERSTATE_INVALID }; +static AddressFamily arg_required_family = ADDRESS_FAMILY_NO; +static bool arg_any = false; + +STATIC_DESTRUCTOR_REGISTER(arg_interfaces, hashmap_free_free_freep); +STATIC_DESTRUCTOR_REGISTER(arg_ignore, strv_freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-networkd-wait-online.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "Block until network is configured.\n\n" + " -h --help Show this help\n" + " --version Print version string\n" + " -q --quiet Do not show status information\n" + " -i --interface=INTERFACE[:MIN_OPERSTATE[:MAX_OPERSTATE]]\n" + " Block until at least these interfaces have appeared\n" + " --ignore=INTERFACE Don't take these interfaces into account\n" + " -o --operational-state=MIN_OPERSTATE[:MAX_OPERSTATE]\n" + " Required operational state\n" + " -4 --ipv4 Requires at least one IPv4 address\n" + " -6 --ipv6 Requires at least one IPv6 address\n" + " --any Wait until at least one of the interfaces is online\n" + " --timeout=SECS Maximum time to wait for network connectivity\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_interface_with_operstate_range(const char *str) { /* Parses one --interface= value of the form INTERFACE[:MIN_OPERSTATE[:MAX_OPERSTATE]] and registers the interface name together with its operational state range in arg_interfaces. Returns 0 on success and a negative errno-style value (already logged) on failure, in which case no entry is added. */ + _cleanup_free_ char *ifname = NULL; + _cleanup_free_ LinkOperationalStateRange *range = NULL; + const char *p; + int r; + + assert(str); + + range = new(LinkOperationalStateRange, 1); + if (!range) + return log_oom(); + + p = strchr(str, ':'); + if (p) { + r = parse_operational_state_range(p + 1, range); + if (r < 0) + return log_error_errno(r, "Invalid operational state range '%s'", p + 1); /* never store a range that failed to parse */ + + ifname = 
strndup(str, p - str); /* the name is everything in front of the first ':' of the argument we were given */ + } else { + range->min = _LINK_OPERSTATE_INVALID; + range->max = _LINK_OPERSTATE_INVALID; + ifname = strdup(str); + } + if (!ifname) + return log_oom(); + + if (!ifname_valid(ifname)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid interface name '%s'", ifname); + + r = hashmap_ensure_put(&arg_interfaces, &string_hash_ops, ifname, range); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to store interface name: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Interface name %s is already specified", ifname); + + TAKE_PTR(ifname); TAKE_PTR(range); /* arg_interfaces holds both allocations now; on every error path above the cleanup handlers still free them */ + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_IGNORE, + ARG_ANY, + ARG_TIMEOUT, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "quiet", no_argument, NULL, 'q' }, + { "interface", required_argument, NULL, 'i' }, + { "ignore", required_argument, NULL, ARG_IGNORE }, + { "operational-state", required_argument, NULL, 'o' }, + { "ipv4", no_argument, NULL, '4' }, + { "ipv6", no_argument, NULL, '6' }, + { "any", no_argument, NULL, ARG_ANY }, + { "timeout", required_argument, NULL, ARG_TIMEOUT }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hi:qo:46", options, NULL)) >= 0) + + switch (c) { + + case 'h': + help(); + return 0; + + case 'q': + arg_quiet = true; + break; + + case ARG_VERSION: + return version(); + + case 'i': + r = parse_interface_with_operstate_range(optarg); + if (r < 0) + return r; + break; + + case ARG_IGNORE: + if (strv_extend(&arg_ignore, optarg) < 0) + return log_oom(); + + break; + + case 'o': { + LinkOperationalStateRange range; + + r = parse_operational_state_range(optarg, &range); + if (r < 0) + return log_error_errno(r, "Invalid operational state range '%s'", optarg); + + arg_required_operstate = 
range; + + break; + } + + case '4': + arg_required_family |= ADDRESS_FAMILY_IPV4; + break; + + case '6': + arg_required_family |= ADDRESS_FAMILY_IPV6; + break; + + case ARG_ANY: + arg_any = true; + break; + + case ARG_TIMEOUT: + r = parse_sec(optarg, &arg_timeout); + if (r < 0) + return r; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *m = NULL; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = NULL; + int r; + + log_setup(); + + umask(0022); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_quiet) + log_set_max_level(LOG_ERR); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + r = manager_new(&m, arg_interfaces, arg_ignore, arg_required_operstate, arg_required_family, arg_any, arg_timeout); + if (r < 0) + return log_error_errno(r, "Could not create manager: %m"); + + if (manager_configured(m)) + goto success; + + notify_message = notify_start("READY=1\n" + "STATUS=Waiting for network connections...", + "STATUS=Failed to wait for network connectivity..."); + + r = sd_event_loop(m->event); + if (r == -ETIMEDOUT) + return log_error_errno(r, "Timeout occurred while waiting for network connectivity."); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + +success: + notify_message = "STATUS=All interfaces configured..."; + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/notify/meson.build b/src/notify/meson.build new file mode 100644 index 0000000..3baa086 --- /dev/null +++ b/src/notify/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-notify', + 'public' : true, + 'sources' : files('notify.c'), + }, +] diff --git a/src/notify/notify.c b/src/notify/notify.c new file mode 100644 index 0000000..f63ec8b --- /dev/null +++ b/src/notify/notify.c @@ -0,0 +1,473 
@@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "build.h" +#include "env-util.h" +#include "fd-util.h" +#include "fdset.h" +#include "format-util.h" +#include "log.h" +#include "main-func.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "time-util.h" +#include "user-util.h" + +static bool arg_ready = false; +static bool arg_reloading = false; +static bool arg_stopping = false; +static pid_t arg_pid = 0; +static const char *arg_status = NULL; +static bool arg_booted = false; +static uid_t arg_uid = UID_INVALID; +static gid_t arg_gid = GID_INVALID; +static bool arg_no_block = false; +static char **arg_env = NULL; +static char **arg_exec = NULL; +static FDSet *arg_fds = NULL; +static char *arg_fdname = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_env, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_exec, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_fds, fdset_freep); +STATIC_DESTRUCTOR_REGISTER(arg_fdname, freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-notify", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [VARIABLE=VALUE...]\n" + "%s [OPTIONS...] --exec [VARIABLE=VALUE...] 
; CMDLINE...\n" + "\n%sNotify the init system about service status updates.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --ready Inform the service manager about service start-up/reload\n" + " completion\n" + " --reloading Inform the service manager about configuration reloading\n" + " --stopping Inform the service manager about service shutdown\n" + " --pid[=PID] Set main PID of daemon\n" + " --uid=USER Set user to send from\n" + " --status=TEXT Set status text\n" + " --booted Check if the system was booted up with systemd\n" + " --no-block Do not wait until operation finished\n" + " --exec Execute command line separated by ';' once done\n" + " --fd=FD Pass specified file descriptor along with message\n" + " --fdname=NAME Name to assign to passed file descriptor(s)\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static pid_t manager_pid(void) { + const char *e; + pid_t pid; + int r; + + /* If we run as a service managed by systemd --user the $MANAGERPID environment variable points to + * the service manager's PID. 
*/ + e = getenv("MANAGERPID"); + if (!e) + return 0; + + r = parse_pid(e, &pid); + if (r < 0) { + log_warning_errno(r, "$MANAGERPID is set to an invalid PID, ignoring: %s", e); + return 0; + } + + return pid; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_READY = 0x100, + ARG_RELOADING, + ARG_STOPPING, + ARG_VERSION, + ARG_PID, + ARG_STATUS, + ARG_BOOTED, + ARG_UID, + ARG_NO_BLOCK, + ARG_EXEC, + ARG_FD, + ARG_FDNAME, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "ready", no_argument, NULL, ARG_READY }, + { "reloading", no_argument, NULL, ARG_RELOADING }, + { "stopping", no_argument, NULL, ARG_STOPPING }, + { "pid", optional_argument, NULL, ARG_PID }, + { "status", required_argument, NULL, ARG_STATUS }, + { "booted", no_argument, NULL, ARG_BOOTED }, + { "uid", required_argument, NULL, ARG_UID }, + { "no-block", no_argument, NULL, ARG_NO_BLOCK }, + { "exec", no_argument, NULL, ARG_EXEC }, + { "fd", required_argument, NULL, ARG_FD }, + { "fdname", required_argument, NULL, ARG_FDNAME }, + {} + }; + + _cleanup_fdset_free_ FDSet *passed = NULL; + bool do_exec = false; + int c, r, n_env; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) { + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_READY: + arg_ready = true; + break; + + case ARG_RELOADING: + arg_reloading = true; + break; + + case ARG_STOPPING: + arg_stopping = true; + break; + + case ARG_PID: + if (isempty(optarg) || streq(optarg, "auto")) { + arg_pid = getppid(); + + if (arg_pid <= 1 || + arg_pid == manager_pid()) /* Don't send from PID 1 or the service + * manager's PID (which might be distinct from + * 1, if we are a --user instance), that'd just + * be confusing for the service manager */ + arg_pid = getpid_cached(); + } else if (streq(optarg, "parent")) + arg_pid = getppid(); + else if 
(streq(optarg, "self")) + arg_pid = getpid_cached(); + else { + r = parse_pid(optarg, &arg_pid); + if (r < 0) + return log_error_errno(r, "Failed to parse PID %s.", optarg); + } + + break; + + case ARG_STATUS: + arg_status = optarg; + break; + + case ARG_BOOTED: + arg_booted = true; + break; + + case ARG_UID: { + const char *u = optarg; + + r = get_user_creds(&u, &arg_uid, &arg_gid, NULL, NULL, 0); + if (r == -ESRCH) /* If the user doesn't exist, then accept it anyway as numeric */ + r = parse_uid(u, &arg_uid); + if (r < 0) + return log_error_errno(r, "Can't resolve user %s: %m", optarg); + + break; + } + + case ARG_NO_BLOCK: + arg_no_block = true; + break; + + case ARG_EXEC: + do_exec = true; + break; + + case ARG_FD: { + _cleanup_close_ int owned_fd = -EBADF; + int fdnr; + + fdnr = parse_fd(optarg); + if (fdnr < 0) + return log_error_errno(fdnr, "Failed to parse file descriptor: %s", optarg); + + if (!passed) { + /* Take possession of all passed fds */ + r = fdset_new_fill(/* filter_cloexec= */ 0, &passed); + if (r < 0) + return log_error_errno(r, "Failed to take possession of passed file descriptors: %m"); + } + + if (fdnr < 3) { + /* For stdin/stdout/stderr we want to keep the fd, too, hence make a copy */ + owned_fd = fcntl(fdnr, F_DUPFD_CLOEXEC, 3); + if (owned_fd < 0) + return log_error_errno(errno, "Failed to duplicate file descriptor: %m"); + } else { + /* Otherwise, move the fd over */ + owned_fd = fdset_remove(passed, fdnr); + if (owned_fd < 0) + return log_error_errno(owned_fd, "Specified file descriptor '%i' not passed or specified more than once: %m", fdnr); + } + + if (!arg_fds) { + arg_fds = fdset_new(); + if (!arg_fds) + return log_oom(); + } + + r = fdset_consume(arg_fds, TAKE_FD(owned_fd)); + if (r < 0) + return log_error_errno(r, "Failed to add file descriptor to set: %m"); + break; + } + + case ARG_FDNAME: + if (!fdname_is_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File descriptor name invalid: %s", optarg); + + if 
(free_and_strdup(&arg_fdname, optarg) < 0) + return log_oom(); + + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + } + + if (optind >= argc && + !arg_ready && + !arg_stopping && + !arg_reloading && + !arg_status && + !arg_pid && + !arg_booted && + fdset_isempty(arg_fds)) { + help(); + return -EINVAL; + } + + if (arg_fdname && fdset_isempty(arg_fds)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No file descriptors passed, but --fdname= set, refusing."); + + if (do_exec) { + int i; + + for (i = optind; i < argc; i++) + if (streq(argv[i], ";")) + break; + + if (i >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "If --exec is used argument list must contain ';' separator, refusing."); + if (i+1 == argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Empty command line specified after ';' separator, refusing"); + + arg_exec = strv_copy_n(argv + i + 1, argc - i - 1); + if (!arg_exec) + return log_oom(); + + n_env = i - optind; + } else + n_env = argc - optind; + + if (n_env > 0) { + arg_env = strv_copy_n(argv + optind, n_env); + if (!arg_env) + return log_oom(); + } + + if (!fdset_isempty(passed)) + log_warning("Warning: %u more file descriptors passed than referenced with --fd=.", fdset_size(passed)); + + return 1; +} + +static int run(int argc, char* argv[]) { + _cleanup_free_ char *status = NULL, *cpid = NULL, *n = NULL, *monotonic_usec = NULL, *fdn = NULL; + _cleanup_strv_free_ char **final_env = NULL; + char* our_env[9]; + size_t i = 0; + pid_t source_pid; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_booted) { + r = sd_booted(); + if (r < 0) + log_debug_errno(r, "Failed to determine whether we are booted with systemd, assuming we aren't: %m"); + else + log_debug("The system %s booted with systemd.", r ? 
"was" : "was not"); + + return r <= 0; + } + + if (arg_reloading) { + our_env[i++] = (char*) "RELOADING=1"; + + if (asprintf(&monotonic_usec, "MONOTONIC_USEC=" USEC_FMT, now(CLOCK_MONOTONIC)) < 0) + return log_oom(); + + our_env[i++] = monotonic_usec; + } + + if (arg_ready) + our_env[i++] = (char*) "READY=1"; + + if (arg_stopping) + our_env[i++] = (char*) "STOPPING=1"; + + if (arg_status) { + status = strjoin("STATUS=", arg_status); + if (!status) + return log_oom(); + + our_env[i++] = status; + } + + if (arg_pid > 0) { + if (asprintf(&cpid, "MAINPID="PID_FMT, arg_pid) < 0) + return log_oom(); + + our_env[i++] = cpid; + } + + if (!fdset_isempty(arg_fds)) { + our_env[i++] = (char*) "FDSTORE=1"; + + if (arg_fdname) { + fdn = strjoin("FDNAME=", arg_fdname); + if (!fdn) + return log_oom(); + + our_env[i++] = fdn; + } + } + + our_env[i++] = NULL; + + final_env = strv_env_merge(our_env, arg_env); + if (!final_env) + return log_oom(); + + if (strv_isempty(final_env)) + return 0; + + n = strv_join(final_env, "\n"); + if (!n) + return log_oom(); + + /* If this is requested change to the requested UID/GID. Note that we only change the real UID here, and leave + the effective UID in effect (which is 0 for this to work). That's because we want the privileges to fake the + ucred data, and sd_pid_notify() uses the real UID for filling in ucred. */ + + if (arg_gid != GID_INVALID && + setregid(arg_gid, GID_INVALID) < 0) + return log_error_errno(errno, "Failed to change GID: %m"); + + if (arg_uid != UID_INVALID && + setreuid(arg_uid, UID_INVALID) < 0) + return log_error_errno(errno, "Failed to change UID: %m"); + + if (arg_pid > 0) + source_pid = arg_pid; + else { + /* Pretend the message originates from our parent, given that we are typically called from a + * shell script, i.e. we are not the main process of a service but only a child of it. 
*/ + source_pid = getppid(); + if (source_pid <= 1 || + source_pid == manager_pid()) /* safety check: don't claim we'd send anything from PID 1 + * or the service manager itself */ + source_pid = 0; + } + + if (fdset_isempty(arg_fds)) + r = sd_pid_notify(source_pid, /* unset_environment= */ false, n); + else { + _cleanup_free_ int *a = NULL; + int k; + + k = fdset_to_array(arg_fds, &a); + if (k < 0) + return log_error_errno(k, "Failed to convert file descriptor set to array: %m"); + + r = sd_pid_notify_with_fds(source_pid, /* unset_environment= */ false, n, a, k); + + } + if (r < 0) + return log_error_errno(r, "Failed to notify init system: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "No status data could be sent: $NOTIFY_SOCKET was not set"); + + arg_fds = fdset_free(arg_fds); /* Close before we execute anything */ + + if (!arg_no_block) { + r = sd_pid_notify_barrier(source_pid, /* unset_environment= */ false, 5 * USEC_PER_SEC); + if (r < 0) + return log_error_errno(r, "Failed to invoke barrier: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "No status data could be sent: $NOTIFY_SOCKET was not set"); + } + + if (arg_exec) { + _cleanup_free_ char *cmdline = NULL; + + execvp(arg_exec[0], arg_exec); + r = -errno; /* execvp() only returns on failure; save errno before strv_join() allocates and may clobber it */ + + cmdline = strv_join(arg_exec, " "); + if (!cmdline) + return log_oom(); + + return log_error_errno(r, "Failed to execute command line: %s", cmdline); + } + + /* The DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() boilerplate will send the exit status via + * sd_notify(). Which is normally fine, but very confusing in systemd-notify, whose purpose is to + * send user-controllable notification messages, and not implicit ones. Let's turn it off, by + * unsetting the $NOTIFY_SOCKET environment variable. 
*/ + (void) unsetenv("NOTIFY_SOCKET"); + return 0; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/nspawn/fuzz-nspawn-oci.c b/src/nspawn/fuzz-nspawn-oci.c new file mode 100644 index 0000000..daa478e --- /dev/null +++ b/src/nspawn/fuzz-nspawn-oci.c @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "fuzz.h" +#include "nspawn-oci.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(settings_freep) Settings *s = NULL; + + if (outside_size_range(size, 0, 65536)) + return 0; + + f = data_to_file(data, size); + assert_se(f); + + fuzz_setup_logging(); + + (void) oci_load(f, "/dev/null", &s); + + return 0; +} diff --git a/src/nspawn/fuzz-nspawn-oci.options b/src/nspawn/fuzz-nspawn-oci.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/nspawn/fuzz-nspawn-oci.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/nspawn/fuzz-nspawn-settings.c b/src/nspawn/fuzz-nspawn-settings.c new file mode 100644 index 0000000..e45bfd8 --- /dev/null +++ b/src/nspawn/fuzz-nspawn-settings.c @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "fuzz.h" +#include "nspawn-settings.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(settings_freep) Settings *s = NULL; + + if (outside_size_range(size, 0, 65536)) + return 0; + + f = data_to_file(data, size); + assert_se(f); + + fuzz_setup_logging(); + + (void) settings_load(f, "/dev/null", &s); + + return 0; +} diff --git a/src/nspawn/fuzz-nspawn-settings.options b/src/nspawn/fuzz-nspawn-settings.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/nspawn/fuzz-nspawn-settings.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/nspawn/meson.build b/src/nspawn/meson.build 
new file mode 100644 index 0000000..2a913b1 --- /dev/null +++ b/src/nspawn/meson.build @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +libnspawn_core_sources = files( + 'nspawn-bind-user.c', + 'nspawn-cgroup.c', + 'nspawn-expose-ports.c', + 'nspawn-mount.c', + 'nspawn-network.c', + 'nspawn-oci.c', + 'nspawn-patch-uid.c', + 'nspawn-register.c', + 'nspawn-seccomp.c', + 'nspawn-settings.c', + 'nspawn-setuid.c', + 'nspawn-stub-pid1.c', + 'nspawn-util.c', +) + +nspawn_gperf_c = custom_target( + 'nspawn-gperf.c', + input : 'nspawn-gperf.gperf', + output : 'nspawn-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +libnspawn_core_sources += [nspawn_gperf_c] + +libnspawn_core = static_library( + 'nspawn-core', + libnspawn_core_sources, + include_directories : includes, + dependencies : [libacl, + libseccomp, + libselinux, + userspace], + build_by_default : false) + +nspawn_libs = [ + libnspawn_core, + libshared, +] + +nspawn_common_template = { + 'link_with' : nspawn_libs, + 'dependencies' : libseccomp, +} +nspawn_test_template = test_template + nspawn_common_template +nspawn_fuzz_template = fuzz_template + nspawn_common_template + +executables += [ + executable_template + { + 'name' : 'systemd-nspawn', + 'public' : true, + 'sources' : files('nspawn.c'), + 'link_with' : nspawn_libs, + 'dependencies' : [ + libblkid, + libseccomp, + ], + }, + nspawn_test_template + { + 'sources' : files('test-nspawn-tables.c'), + }, + nspawn_test_template + { + 'sources' : files('test-nspawn-util.c'), + }, + test_template + { + 'sources' : files('test-patch-uid.c'), + 'link_with' : nspawn_libs, + 'dependencies' : libacl, + 'type' : 'manual', + }, + nspawn_fuzz_template + { + 'sources' : files('fuzz-nspawn-settings.c'), + }, + nspawn_fuzz_template + { + 'sources' : files('fuzz-nspawn-oci.c'), + }, +] diff --git a/src/nspawn/nspawn-bind-user.c b/src/nspawn/nspawn-bind-user.c new file mode 100644 index 0000000..61d8d30 --- /dev/null +++ 
b/src/nspawn/nspawn-bind-user.c @@ -0,0 +1,474 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "chase.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "nspawn-bind-user.h" +#include "nspawn.h" +#include "path-util.h" +#include "user-util.h" +#include "userdb.h" + +static int check_etc_passwd_collisions( + const char *directory, + const char *name, + uid_t uid) { + + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(directory); + assert(name || uid_is_valid(uid)); + + r = chase_and_fopen_unlocked("/etc/passwd", directory, CHASE_PREFIX_ROOT, "re", NULL, &f); + if (r == -ENOENT) + return 0; /* no user database? then no user, hence no collision */ + if (r < 0) + return log_error_errno(r, "Failed to open /etc/passwd of container: %m"); + + for (;;) { + struct passwd *pw; + + r = fgetpwent_sane(f, &pw); + if (r < 0) + return log_error_errno(r, "Failed to iterate through /etc/passwd of container: %m"); + if (r == 0) /* EOF */ + return 0; /* no collision */ + + if (name && streq_ptr(pw->pw_name, name)) + return 1; /* name collision */ + if (uid_is_valid(uid) && pw->pw_uid == uid) + return 1; /* UID collision */ + } +} + +static int check_etc_group_collisions( + const char *directory, + const char *name, + gid_t gid) { + + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(directory); + assert(name || gid_is_valid(gid)); + + r = chase_and_fopen_unlocked("/etc/group", directory, CHASE_PREFIX_ROOT, "re", NULL, &f); + if (r == -ENOENT) + return 0; /* no group database? 
then no group, hence no collision */ + if (r < 0) + return log_error_errno(r, "Failed to open /etc/group of container: %m"); + + for (;;) { + struct group *gr; + + r = fgetgrent_sane(f, &gr); + if (r < 0) + return log_error_errno(r, "Failed to iterate through /etc/group of container: %m"); + if (r == 0) + return 0; /* no collision */ + + if (name && streq_ptr(gr->gr_name, name)) + return 1; /* name collision */ + if (gid_is_valid(gid) && gr->gr_gid == gid) + return 1; /* gid collision */ + } +} + +static int convert_user( + const char *directory, + UserRecord *u, + GroupRecord *g, + uid_t allocate_uid, + UserRecord **ret_converted_user, + GroupRecord **ret_converted_group) { + + _cleanup_(group_record_unrefp) GroupRecord *converted_group = NULL; + _cleanup_(user_record_unrefp) UserRecord *converted_user = NULL; + _cleanup_free_ char *h = NULL; + JsonVariant *p, *hp = NULL; + int r; + + assert(u); + assert(g); + assert(u->gid == g->gid); + + r = check_etc_passwd_collisions(directory, u->user_name, UID_INVALID); + if (r < 0) + return r; + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "Sorry, the user '%s' already exists in the container.", u->user_name); + + r = check_etc_group_collisions(directory, g->group_name, GID_INVALID); + if (r < 0) + return r; + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "Sorry, the group '%s' already exists in the container.", g->group_name); + + h = path_join("/run/host/home/", u->user_name); + if (!h) + return log_oom(); + + /* Acquire the source hashed password array as-is, so that it retains the JSON_VARIANT_SENSITIVE flag */ + p = json_variant_by_key(u->json, "privileged"); + if (p) + hp = json_variant_by_key(p, "hashedPassword"); + + r = user_record_build( + &converted_user, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(u->user_name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(allocate_uid)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(allocate_uid)), + 
JSON_BUILD_PAIR_CONDITION(u->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(u->disposition))), + JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_STRING(h)), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn")), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(u->hashed_password), "privileged", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_VARIANT(hp)))))); + if (r < 0) + return log_error_errno(r, "Failed to build container user record: %m"); + + r = group_record_build( + &converted_group, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(g->group_name)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(allocate_uid)), + JSON_BUILD_PAIR_CONDITION(g->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(g->disposition))), + JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NSpawn")))); + if (r < 0) + return log_error_errno(r, "Failed to build container group record: %m"); + + *ret_converted_user = TAKE_PTR(converted_user); + *ret_converted_group = TAKE_PTR(converted_group); + + return 0; +} + +static int find_free_uid(const char *directory, uid_t max_uid, uid_t *current_uid) { + int r; + + assert(directory); + assert(current_uid); + + for (;; (*current_uid) ++) { + if (*current_uid > MAP_UID_MAX || *current_uid > max_uid) + return log_error_errno( + SYNTHETIC_ERRNO(EBUSY), + "No suitable available UID in range " UID_FMT "…" UID_FMT " in container detected, can't map user.", + MAP_UID_MIN, MAP_UID_MAX); + + r = check_etc_passwd_collisions(directory, NULL, *current_uid); + if (r < 0) + return r; + if (r > 0) /* already used */ + continue; + + /* We want to use the UID also as GID, hence check for it in /etc/group too */ + r = check_etc_group_collisions(directory, NULL, (gid_t) *current_uid); + if (r <= 0) + return r; + } +} + +BindUserContext* bind_user_context_free(BindUserContext *c) { + if (!c) + return NULL; + + assert(c->n_data == 0 || c->data); + + 
for (size_t i = 0; i < c->n_data; i++) { + user_record_unref(c->data[i].host_user); + group_record_unref(c->data[i].host_group); + user_record_unref(c->data[i].payload_user); + group_record_unref(c->data[i].payload_group); + } + + return mfree(c); +} + +int bind_user_prepare( + const char *directory, + char **bind_user, + uid_t uid_shift, + uid_t uid_range, + CustomMount **custom_mounts, + size_t *n_custom_mounts, + BindUserContext **ret) { + + _cleanup_(bind_user_context_freep) BindUserContext *c = NULL; + uid_t current_uid = MAP_UID_MIN; + int r; + + assert(custom_mounts); + assert(n_custom_mounts); + assert(ret); + + /* This resolves the users specified in 'bind_user', generates a minimalized JSON user + group record + * for it to stick in the container, allocates a UID/GID for it, and updates the custom mount table, + * to include an appropriate bind mount mapping. + * + * This extends the passed custom_mounts/n_custom_mounts with the home directories, and allocates a + * new BindUserContext for the user records */ + + if (strv_isempty(bind_user)) { + *ret = NULL; + return 0; + } + + c = new0(BindUserContext, 1); + if (!c) + return log_oom(); + + STRV_FOREACH(n, bind_user) { + _cleanup_(user_record_unrefp) UserRecord *u = NULL, *cu = NULL; + _cleanup_(group_record_unrefp) GroupRecord *g = NULL, *cg = NULL; + _cleanup_free_ char *sm = NULL, *sd = NULL; + CustomMount *cm; + + r = userdb_by_name(*n, USERDB_DONT_SYNTHESIZE, &u); + if (r < 0) + return log_error_errno(r, "Failed to resolve user '%s': %m", *n); + + /* For now, let's refuse mapping the root/nobody users explicitly. The records we generate + * are strictly additive, nss-systemd is typically placed last in /etc/nsswitch.conf. Thus + * even if we wanted, we couldn't override the root or nobody user records. 
Note we also + * check for name conflicts in /etc/passwd + /etc/group later on, which would usually filter + * out root/nobody too, hence these checks might appear redundant — but they actually are + * not, as we want to support environments where /etc/passwd and /etc/group are non-existent, + * and the user/group databases are fully synthesized at runtime. Moreover, the + * user/group name of the "nobody" account differs between distros, hence a check by numeric + * UID is safer. */ + if (u->uid == 0 || streq(u->user_name, "root")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'root' user not supported, sorry."); + if (u->uid == UID_NOBODY || STR_IN_SET(u->user_name, NOBODY_USER_NAME, "nobody")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'nobody' user not supported, sorry."); + + if (u->uid >= uid_shift && u->uid < uid_shift + uid_range) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID of user '%s' to map is already in container UID range, refusing.", u->user_name); + + r = groupdb_by_gid(u->gid, USERDB_DONT_SYNTHESIZE, &g); + if (r < 0) + return log_error_errno(r, "Failed to resolve group of user '%s': %m", u->user_name); + + if (g->gid >= uid_shift && g->gid < uid_shift + uid_range) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "GID of group '%s' to map is already in container GID range, refusing.", g->group_name); + + /* We want to synthesize exactly one user + group from the host into the container. This only + * makes sense if the user on the host has its own private group. We can't reasonably check + * this, so we just check if the names of user and group match. + * + * One of these days we might want to support users in a shared/common group too, but it's + * not clear to me how this would have to be mapped, precisely given that the common group + * probably already exists in the container. 
*/ + if (!streq(u->user_name, g->group_name)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Sorry, mapping users without private groups is currently not supported."); + + r = find_free_uid(directory, uid_range, &current_uid); /* advances current_uid to the next UID/GID unused in the container */ + if (r < 0) + return r; + + r = convert_user(directory, u, g, current_uid, &cu, &cg); + if (r < 0) + return r; + + if (!GREEDY_REALLOC(c->data, c->n_data + 1)) + return log_oom(); + + sm = strdup(u->home_directory); + if (!sm) + return log_oom(); + + sd = strdup(cu->home_directory); + if (!sd) + return log_oom(); + + cm = reallocarray(*custom_mounts, *n_custom_mounts + 1, sizeof(CustomMount)); + if (!cm) + return log_oom(); + + *custom_mounts = cm; + + (*custom_mounts)[(*n_custom_mounts)++] = (CustomMount) { + .type = CUSTOM_MOUNT_BIND, + .source = TAKE_PTR(sm), + .destination = TAKE_PTR(sd), + }; + + c->data[c->n_data++] = (BindUserData) { + .host_user = TAKE_PTR(u), + .host_group = TAKE_PTR(g), + .payload_user = TAKE_PTR(cu), + .payload_group = TAKE_PTR(cg), + }; + + current_uid++; + } + + *ret = TAKE_PTR(c); + return 1; +} + +static int write_and_symlink( + const char *root, + JsonVariant *v, + const char *name, + uid_t uid, + const char *suffix, + WriteStringFileFlags extra_flags) { + + _cleanup_free_ char *j = NULL, *f = NULL, *p = NULL, *q = NULL; + int r; + + assert(root); + assert(v); + assert(name); + assert(uid_is_valid(uid)); + assert(suffix); + + r = json_variant_format(v, JSON_FORMAT_NEWLINE, &j); + if (r < 0) + return log_error_errno(r, "Failed to format user record JSON: %m"); + + f = strjoin(name, suffix); + if (!f) + return log_oom(); + + p = path_join(root, "/run/host/userdb/", f); + if (!p) + return log_oom(); + + if (asprintf(&q, "%s/run/host/userdb/" UID_FMT "%s", root, uid, suffix) < 0) + return log_oom(); + + if (symlink(f, q) < 0) + return log_error_errno(errno, "Failed to create symlink '%s': %m", q); + + r = userns_lchown(q, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to adjust access mode of '%s': 
%m", q); + + r = write_string_file(p, j, WRITE_STRING_FILE_CREATE|extra_flags); + if (r < 0) + return log_error_errno(r, "Failed to write %s: %m", p); + + r = userns_lchown(p, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to adjust access mode of '%s': %m", p); + + return 0; +} + +int bind_user_setup( + const BindUserContext *c, + const char *root) { + + static const UserRecordLoadFlags strip_flags = /* Removes privileged info */ + USER_RECORD_REQUIRE_REGULAR| + USER_RECORD_STRIP_PRIVILEGED| + USER_RECORD_ALLOW_PER_MACHINE| + USER_RECORD_ALLOW_BINDING| + USER_RECORD_ALLOW_SIGNATURE| + USER_RECORD_PERMISSIVE; + static const UserRecordLoadFlags shadow_flags = /* Extracts privileged info */ + USER_RECORD_STRIP_REGULAR| + USER_RECORD_ALLOW_PRIVILEGED| + USER_RECORD_STRIP_PER_MACHINE| + USER_RECORD_STRIP_BINDING| + USER_RECORD_STRIP_SIGNATURE| + USER_RECORD_EMPTY_OK| + USER_RECORD_PERMISSIVE; + int r; + + assert(root); + + if (!c || c->n_data == 0) + return 0; + + r = userns_mkdir(root, "/run/host", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host: %m"); + + r = userns_mkdir(root, "/run/host/home", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host/home: %m"); + + r = userns_mkdir(root, "/run/host/userdb", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host/userdb: %m"); + + for (size_t i = 0; i < c->n_data; i++) { + _cleanup_(group_record_unrefp) GroupRecord *stripped_group = NULL, *shadow_group = NULL; + _cleanup_(user_record_unrefp) UserRecord *stripped_user = NULL, *shadow_user = NULL; + const BindUserData *d = c->data + i; + + /* First, write shadow (i.e. 
privileged) data for group record */ + r = group_record_clone(d->payload_group, shadow_flags, &shadow_group); + if (r < 0) + return log_error_errno(r, "Failed to extract privileged information from group record: %m"); + + if (!json_variant_is_blank_object(shadow_group->json)) { + r = write_and_symlink( + root, + shadow_group->json, + d->payload_group->group_name, + d->payload_group->gid, + ".group-privileged", + WRITE_STRING_FILE_MODE_0600); + if (r < 0) + return r; + } + + /* Second, write main part of group record. */ + r = group_record_clone(d->payload_group, strip_flags, &stripped_group); + if (r < 0) + return log_error_errno(r, "Failed to strip privileged information from group record: %m"); + + r = write_and_symlink( + root, + stripped_group->json, + d->payload_group->group_name, + d->payload_group->gid, + ".group", + 0); + if (r < 0) + return r; + + /* Third, write out user shadow data. i.e. extract privileged info from user record */ + r = user_record_clone(d->payload_user, shadow_flags, &shadow_user); + if (r < 0) + return log_error_errno(r, "Failed to extract privileged information from user record: %m"); + + if (!json_variant_is_blank_object(shadow_user->json)) { + r = write_and_symlink( + root, + shadow_user->json, + d->payload_user->user_name, + d->payload_user->uid, + ".user-privileged", + WRITE_STRING_FILE_MODE_0600); + if (r < 0) + return r; + } + + /* Finally write out the main part of the user record */ + r = user_record_clone(d->payload_user, strip_flags, &stripped_user); + if (r < 0) + return log_error_errno(r, "Failed to strip privileged information from user record: %m"); + + r = write_and_symlink( + root, + stripped_user->json, + d->payload_user->user_name, + d->payload_user->uid, + ".user", + 0); + if (r < 0) + return r; + } + + return 1; +} diff --git a/src/nspawn/nspawn-bind-user.h b/src/nspawn/nspawn-bind-user.h new file mode 100644 index 0000000..4352ce0 --- /dev/null +++ b/src/nspawn/nspawn-bind-user.h @@ -0,0 +1,29 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "user-record.h" +#include "group-record.h" +#include "nspawn-mount.h" + +typedef struct BindUserData { + /* The host's user/group records */ + UserRecord *host_user; + GroupRecord *host_group; + + /* The mapped records to place into the container */ + UserRecord *payload_user; + GroupRecord *payload_group; +} BindUserData; + +typedef struct BindUserContext { + BindUserData *data; + size_t n_data; +} BindUserContext; + +BindUserContext* bind_user_context_free(BindUserContext *c); + +DEFINE_TRIVIAL_CLEANUP_FUNC(BindUserContext*, bind_user_context_free); + +int bind_user_prepare(const char *directory, char **bind_user, uid_t uid_shift, uid_t uid_range, CustomMount **custom_mounts, size_t *n_custom_mounts, BindUserContext **ret); + +int bind_user_setup(const BindUserContext *c, const char *root); diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c new file mode 100644 index 0000000..a500243 --- /dev/null +++ b/src/nspawn/nspawn-cgroup.c @@ -0,0 +1,621 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "cgroup-setup.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "nspawn-cgroup.h" +#include "nspawn-mount.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static int chown_cgroup_path(const char *path, uid_t uid_shift) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd < 0) + return -errno; + + FOREACH_STRING(fn, + ".", + "cgroup.clone_children", + "cgroup.controllers", + "cgroup.events", + "cgroup.procs", + "cgroup.stat", + "cgroup.subtree_control", + "cgroup.threads", + "memory.oom.group", + "memory.reclaim", + "notify_on_release", + "tasks") + if (fchownat(fd, fn, uid_shift, uid_shift, 0) 
< 0) + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to chown \"%s/%s\", ignoring: %m", path, fn); + + return 0; +} + +int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { + _cleanup_free_ char *path = NULL, *fs = NULL; + int r; + + r = cg_pid_get_path(NULL, pid, &path); + if (r < 0) + return log_error_errno(r, "Failed to get container cgroup path: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + r = chown_cgroup_path(fs, uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs); + + if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) { + _cleanup_free_ char *lfs = NULL; + /* Always propagate access rights from unified to legacy controller */ + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + r = chown_cgroup_path(lfs, uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs); + } + + return 0; +} + +int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { + _cleanup_free_ char *cgroup = NULL; + char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; + bool undo_mount = false; + const char *fn; + int r, unified_controller; + + unified_controller = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (unified_controller < 0) + return log_error_errno(unified_controller, "Failed to determine whether the systemd hierarchy is unified: %m"); + if ((unified_controller > 0) == (unified_requested >= CGROUP_UNIFIED_SYSTEMD)) + return 0; + + /* When the host uses the legacy cgroup setup, but the + * container shall use the unified hierarchy, let's make sure + * we 
copy the path from the name=systemd hierarchy into the + * unified hierarchy. Similar for the reverse situation. */ + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid); + + /* In order to access the unified hierarchy we need to mount it */ + if (!mkdtemp(tree)) + return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m"); + + if (unified_controller > 0) + r = mount_nofollow_verbose(LOG_ERR, "cgroup", tree, "cgroup", + MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); + else + r = mount_nofollow_verbose(LOG_ERR, "cgroup", tree, "cgroup2", + MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) + goto finish; + + undo_mount = true; + + /* If nspawn dies abruptly the cgroup hierarchy created below + * its unit isn't cleaned up. So, let's remove it + * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */ + fn = strjoina(tree, cgroup); + (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES); + + fn = strjoina(tree, cgroup, "/cgroup.procs"); + + sprintf(pid_string, PID_FMT, pid); + r = write_string_file(fn, pid_string, WRITE_STRING_FILE_DISABLE_BUFFER|WRITE_STRING_FILE_MKDIR_0755); + if (r < 0) { + log_error_errno(r, "Failed to move process: %m"); + goto finish; + } + + fn = strjoina(tree, cgroup); + r = chown_cgroup_path(fn, uid_shift); + if (r < 0) + log_error_errno(r, "Failed to chown() cgroup %s: %m", fn); +finish: + if (undo_mount) + (void) umount_verbose(LOG_ERR, tree, UMOUNT_NOFOLLOW); + + (void) rmdir(tree); + return r; +} + +int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) { + _cleanup_free_ char *cgroup = NULL, *payload = NULL; + CGroupMask supported; + char *e; + int r; + + assert(pid > 1); + + /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. 
Hence, if we are running in + * the unified hierarchy and the container does the same, and we did not create a scope unit for the container, + * move us and the container into two separate subcgroups. + * + * Moreover, container payloads such as systemd try to manage the cgroup they run in full (i.e. including + * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a + * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the + * host systemd directly to the payload, the host and payload systemd might fight for the cgroup + * attributes. Hence, let's insert an intermediary cgroup to cover that case too. + * + * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup + * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the + * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't + * do it. */ + + r = cg_mask_supported(&supported); + if (r < 0) + return log_error_errno(r, "Failed to determine supported controllers: %m"); + + if (keep_unit) + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + else + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get our control group: %m"); + + /* If the service manager already placed us in the supervisor cgroup, let's handle that.
*/ + e = endswith(cgroup, "/supervisor"); + if (e) + *e = 0; /* chop off, we want the main path delegated to us */ + + payload = path_join(cgroup, "payload"); + if (!payload) + return log_oom(); + + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", payload); + + if (keep_unit) { + _cleanup_free_ char *supervisor = NULL; + + supervisor = path_join(cgroup, "supervisor"); + if (!supervisor) + return log_oom(); + + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor); + } + + /* Try to enable as many controllers as possible for the new payload. */ + (void) cg_enable_everywhere(supported, supported, cgroup, NULL); + return 0; +} + +/* Retrieve existing subsystems. This function is called in a new cgroup + * namespace. + */ +static int get_process_controllers(Set **ret) { + _cleanup_set_free_ Set *controllers = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(ret); + + f = fopen("/proc/self/cgroup", "re"); + if (!f) + return errno == ENOENT ? 
-ESRCH : -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + char *e, *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + l = strchr(line, ':'); + if (!l) + continue; + + l++; + e = strchr(l, ':'); + if (!e) + continue; + + *e = 0; + + if (STR_IN_SET(l, "", "name=systemd", "name=unified")) + continue; + + r = set_put_strdup(&controllers, l); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(controllers); + + return 0; +} + +static int mount_legacy_cgroup_hierarchy( + const char *dest, + const char *controller, + const char *hierarchy, + bool read_only) { + + const char *to, *fstype, *opts; + int r; + + to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy); + + r = path_is_mount_point(to, dest, 0); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); + if (r > 0) + return 0; + + (void) mkdir_p(to, 0755); + + /* The superblock mount options of the mount point need to be + * identical to the hosts', and hence writable... */ + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) { + fstype = "cgroup2"; + opts = NULL; + } else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) { + fstype = "cgroup"; + opts = "none,name=systemd,xattr"; + } else { + fstype = "cgroup"; + opts = controller; + } + + r = mount_nofollow_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + if (r < 0) + return r; + + /* ... hence let's only make the bind mount read-only, not the superblock. */ + if (read_only) { + r = mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, + MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + if (r < 0) + return r; + } + + return 1; +} + +/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. 
*/ +static int mount_legacy_cgns_supported( + const char *dest, + CGroupUnified unified_requested, + bool userns, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_set_free_ Set *controllers = NULL; + const char *cgroup_root = "/sys/fs/cgroup", *c; + int r; + + (void) mkdir_p(cgroup_root, 0755); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + /* When cgroup namespaces are enabled and user namespaces are + * used then the mount of the cgroupfs is done *inside* the new + * user namespace. We're root in the new user namespace and the + * kernel will happily translate our uid/gid to the correct + * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply + * pass uid 0 and not uid_shift to tmpfs_patch_options(). + */ + r = tmpfs_patch_options("mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP, 0, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); + if (r < 0) + return r; + } + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + goto skip_controllers; + + r = get_process_controllers(&controllers); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ const char *controller = NULL; + + controller = set_steal_first(controllers); + if (!controller) + break; + + r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns); + if (r < 0) + return r; + + /* When multiple hierarchies are co-mounted, make their + * constituting individual hierarchies a symlink to the + * co-mount. 
+ */ + c = controller; + for (;;) { + _cleanup_free_ char *target = NULL, *tok = NULL; + + r = extract_first_word(&c, &tok, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m"); + if (r == 0) + break; + + if (streq(controller, tok)) + break; + + target = path_join("/sys/fs/cgroup/", tok); + if (!target) + return log_oom(); + + r = symlink_idempotent(controller, target, false); + if (r == -EINVAL) + return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); + if (r < 0) + return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); + } + } + +skip_controllers: + if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { + r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false); + if (r < 0) + return r; + } + + r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false); + if (r < 0) + return r; + + if (!userns) + return mount_nofollow_verbose(LOG_ERR, NULL, cgroup_root, NULL, + MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, + "mode=0755"); + + return 0; +} + +/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */ +static int mount_legacy_cgns_unsupported( + const char *dest, + CGroupUnified unified_requested, + bool userns, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_set_free_ Set *controllers = NULL; + const char *cgroup_root; + int r; + + cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + + (void) mkdir_p(cgroup_root, 0755); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + r = tmpfs_patch_options("mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP, + uid_shift == 0 ? 
UID_INVALID : uid_shift, + selinux_apifs_context, + &options); + if (r < 0) + return log_oom(); + + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); + if (r < 0) + return r; + } + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + goto skip_controllers; + + r = cg_kernel_controllers(&controllers); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; + + controller = set_steal_first(controllers); + if (!controller) + break; + + origin = path_join("/sys/fs/cgroup/", controller); + if (!origin) + return log_oom(); + + r = readlink_malloc(origin, &combined); + if (r == -EINVAL) { + /* Not a symbolic link, but directly a single cgroup hierarchy */ + + r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); + if (r < 0) + return r; + + } else if (r < 0) + return log_error_errno(r, "Failed to read link %s: %m", origin); + else { + _cleanup_free_ char *target = NULL; + + target = path_join(dest, origin); + if (!target) + return log_oom(); + + /* A symbolic link, a combination of controllers in one hierarchy */ + + if (!filename_is_valid(combined)) { + log_warning("Ignoring invalid combined hierarchy %s.", combined); + continue; + } + + r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); + if (r < 0) + return r; + + r = symlink_idempotent(combined, target, false); + if (r == -EINVAL) + return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); + if (r < 0) + return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); + } + } + +skip_controllers: + if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { + r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false); + if (r < 0) + return r; + } + + r = mount_legacy_cgroup_hierarchy(dest, 
SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false); + if (r < 0) + return r; + + return mount_nofollow_verbose(LOG_ERR, NULL, cgroup_root, NULL, + MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, + "mode=0755"); +} + +static int mount_unified_cgroups(const char *dest) { + const char *p; + int r; + + assert(dest); + + p = prefix_roota(dest, "/sys/fs/cgroup"); + + (void) mkdir_p(p, 0755); + + r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); + if (r > 0) { + p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs"); + if (access(p, F_OK) >= 0) + return 0; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + } + + return mount_nofollow_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); +} + +int mount_cgroups( + const char *dest, + CGroupUnified unified_requested, + bool userns, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context, + bool use_cgns) { + + if (unified_requested >= CGROUP_UNIFIED_ALL) + return mount_unified_cgroups(dest); + if (use_cgns) + return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context); + + return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context); +} + +static int mount_systemd_cgroup_writable_one(const char *root, const char *own) { + int r; + + assert(root); + assert(own); + + /* Make our own cgroup a (writable) bind mount */ + r = mount_nofollow_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL); + if (r < 0) + return r; + + /* And then remount the systemd cgroup root read-only */ + return mount_nofollow_verbose(LOG_ERR, NULL, root, NULL, + 
MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); +} + +int mount_systemd_cgroup_writable( + const char *dest, + CGroupUnified unified_requested) { + + _cleanup_free_ char *own_cgroup_path = NULL; + const char *root, *own; + int r; + + assert(dest); + + r = cg_pid_get_path(NULL, 0, &own_cgroup_path); + if (r < 0) + return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + + /* If we are living in the top-level, then there's nothing to do... */ + if (path_equal(own_cgroup_path, "/")) + return 0; + + if (unified_requested >= CGROUP_UNIFIED_ALL) { + + root = prefix_roota(dest, "/sys/fs/cgroup"); + own = strjoina(root, own_cgroup_path); + + } else { + + if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) { + root = prefix_roota(dest, "/sys/fs/cgroup/unified"); + own = strjoina(root, own_cgroup_path); + + r = mount_systemd_cgroup_writable_one(root, own); + if (r < 0) + return r; + } + + root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + own = strjoina(root, own_cgroup_path); + } + + return mount_systemd_cgroup_writable_one(root, own); +} diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h new file mode 100644 index 0000000..3f5ba62 --- /dev/null +++ b/src/nspawn/nspawn-cgroup.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "cgroup-util.h" + +int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); +int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); +int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested); + +int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); +int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested); diff --git a/src/nspawn/nspawn-def.h b/src/nspawn/nspawn-def.h new file mode 100644 index 0000000..32a20aa --- /dev/null 
+++ b/src/nspawn/nspawn-def.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* While we are chmod()ing a directory tree, we set the top-level UID base to this "busy" base, so that we can always + * recognize trees we were chmod()ing recursively and got interrupted in */ +#define UID_BUSY_BASE ((uid_t) UINT32_C(0xFFFE0000)) +#define UID_BUSY_MASK ((uid_t) UINT32_C(0xFFFF0000)) diff --git a/src/nspawn/nspawn-expose-ports.c b/src/nspawn/nspawn-expose-ports.c new file mode 100644 index 0000000..5644068 --- /dev/null +++ b/src/nspawn/nspawn-expose-ports.c @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-netlink.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "fd-util.h" +#include "firewall-util.h" +#include "in-addr-util.h" +#include "local-addresses.h" +#include "netlink-util.h" +#include "nspawn-expose-ports.h" +#include "parse-util.h" +#include "socket-util.h" +#include "string-util.h" + +int expose_port_parse(ExposePort **l, const char *s) { + const char *split, *e; + uint16_t container_port, host_port; + ExposePort *port; + int protocol; + int r; + + assert(l); + assert(s); + + if ((e = startswith(s, "tcp:"))) + protocol = IPPROTO_TCP; + else if ((e = startswith(s, "udp:"))) + protocol = IPPROTO_UDP; + else { + e = s; + protocol = IPPROTO_TCP; + } + + split = strchr(e, ':'); + if (split) { + char v[split - e + 1]; + + memcpy(v, e, split - e); + v[split - e] = 0; + + r = parse_ip_port(v, &host_port); + if (r < 0) + return -EINVAL; + + r = parse_ip_port(split + 1, &container_port); + } else { + r = parse_ip_port(e, &container_port); + host_port = container_port; + } + + if (r < 0) + return r; + + LIST_FOREACH(ports, p, *l) + if (p->protocol == protocol && p->host_port == host_port) + return -EEXIST; + + port = new(ExposePort, 1); + if (!port) + return -ENOMEM; + + *port = (ExposePort) { + .protocol = protocol, + .host_port = host_port, + .container_port =
container_port, + }; + + LIST_PREPEND(ports, *l, port); + + return 0; +} + +void expose_port_free_all(ExposePort *p) { + LIST_CLEAR(ports, p, free); +} + +int expose_port_flush(FirewallContext **fw_ctx, ExposePort* l, int af, union in_addr_union *exposed) { + int r; + + assert(exposed); + + if (!l) + return 0; + + if (!in_addr_is_set(af, exposed)) + return 0; + + log_debug("Lost IP address."); + + LIST_FOREACH(ports, p, l) { + r = fw_add_local_dnat(fw_ctx, + false, + af, + p->protocol, + p->host_port, + exposed, + p->container_port, + NULL); + if (r < 0) + log_warning_errno(r, "Failed to modify %s firewall: %m", af_to_name(af)); + } + + *exposed = IN_ADDR_NULL; + return 0; +} + +int expose_port_execute(sd_netlink *rtnl, FirewallContext **fw_ctx, ExposePort *l, int af, union in_addr_union *exposed) { + _cleanup_free_ struct local_address *addresses = NULL; + union in_addr_union new_exposed; + bool add; + int r; + + assert(exposed); + + /* Invoked each time an address is added or removed inside the + * container */ + + if (!l) + return 0; + + r = local_addresses(rtnl, 0, af, &addresses); + if (r < 0) + return log_error_errno(r, "Failed to enumerate local addresses: %m"); + + add = r > 0 && + addresses[0].family == af && + addresses[0].scope < RT_SCOPE_LINK; + + if (!add) + return expose_port_flush(fw_ctx, l, af, exposed); + + new_exposed = addresses[0].address; + if (in_addr_equal(af, exposed, &new_exposed)) + return 0; + + log_debug("New container IP is %s.", IN_ADDR_TO_STRING(af, &new_exposed)); + + LIST_FOREACH(ports, p, l) { + r = fw_add_local_dnat(fw_ctx, + true, + af, + p->protocol, + p->host_port, + &new_exposed, + p->container_port, + in_addr_is_set(af, exposed) ? 
exposed : NULL); + if (r < 0) + log_warning_errno(r, "Failed to modify %s firewall: %m", af_to_name(af)); + } + + *exposed = new_exposed; + return 0; +} + +int expose_port_send_rtnl(int send_fd) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(send_fd >= 0); + + fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate container netlink: %m"); + + /* Store away the fd in the socket, so that it stays open as + * long as we run the child */ + r = send_one_fd(send_fd, fd, 0); + if (r < 0) + return log_error_errno(r, "Failed to send netlink fd: %m"); + + return 0; +} + +int expose_port_watch_rtnl( + sd_event *event, + int recv_fd, + sd_netlink_message_handler_t handler, + void *userdata, + sd_netlink **ret) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int fd, r; + + assert(event); + assert(recv_fd >= 0); + assert(ret); + + fd = receive_one_fd(recv_fd, 0); + if (fd < 0) + return log_error_errno(fd, "Failed to recv netlink fd: %m"); + + r = sd_netlink_open_fd(&rtnl, fd); + if (r < 0) { + safe_close(fd); + return log_error_errno(r, "Failed to create rtnl object: %m"); + } + + r = sd_netlink_add_match(rtnl, NULL, RTM_NEWADDR, handler, NULL, userdata, "nspawn-NEWADDR"); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m"); + + r = sd_netlink_add_match(rtnl, NULL, RTM_DELADDR, handler, NULL, userdata, "nspawn-DELADDR"); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m"); + + r = sd_netlink_attach_event(rtnl, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to add to event loop: %m"); + + *ret = TAKE_PTR(rtnl); + + return 0; +} diff --git a/src/nspawn/nspawn-expose-ports.h b/src/nspawn/nspawn-expose-ports.h new file mode 100644 index 0000000..27cfccf --- /dev/null +++ b/src/nspawn/nspawn-expose-ports.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma 
once + +#include + +#include "sd-event.h" +#include "sd-netlink.h" + +#include "firewall-util.h" +#include "in-addr-util.h" +#include "list.h" + +typedef struct ExposePort { + int protocol; + uint16_t host_port; + uint16_t container_port; + LIST_FIELDS(struct ExposePort, ports); +} ExposePort; + +void expose_port_free_all(ExposePort *p); +int expose_port_parse(ExposePort **l, const char *s); + +int expose_port_watch_rtnl(sd_event *event, int recv_fd, sd_netlink_message_handler_t handler, void *userdata, sd_netlink **ret); +int expose_port_send_rtnl(int send_fd); + +int expose_port_execute(sd_netlink *rtnl, FirewallContext **fw_ctx, ExposePort *l, int af, union in_addr_union *exposed); +int expose_port_flush(FirewallContext **fw_ctx, ExposePort* l, int af, union in_addr_union *exposed); diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf new file mode 100644 index 0000000..9e1210f --- /dev/null +++ b/src/nspawn/nspawn-gperf.gperf @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "nspawn-settings.h" +#include "nspawn-expose-ports.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name nspawn_gperf_hash +%define lookup-function-name nspawn_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Exec.Boot, config_parse_boot, 0, 0 +Exec.Ephemeral, config_parse_tristate, 0, offsetof(Settings, ephemeral) +Exec.ProcessTwo, config_parse_pid2, 0, 0 +Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) +Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) +Exec.User, config_parse_string, CONFIG_PARSE_STRING_SAFE, offsetof(Settings, user) +Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability) +Exec.AmbientCapability, config_parse_capability, 0, 
offsetof(Settings, ambient_capability) +Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability) +Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal) +Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) +Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) +Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) +Exec.PivotRoot, config_parse_pivot_root, 0, 0 +Exec.PrivateUsers, config_parse_private_users, 0, 0 +Exec.NotifyReady, config_parse_tristate, 0, offsetof(Settings, notify_ready) +Exec.SystemCallFilter, config_parse_syscall_filter, 0, 0 +Exec.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof(Settings, rlimit) +Exec.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof(Settings, rlimit) +Exec.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof(Settings, rlimit) +Exec.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof(Settings, rlimit) +Exec.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof(Settings, rlimit) +Exec.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof(Settings, rlimit) +Exec.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof(Settings, rlimit) +Exec.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof(Settings, rlimit) +Exec.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof(Settings, rlimit) +Exec.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof(Settings, rlimit) +Exec.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof(Settings, rlimit) +Exec.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof(Settings, rlimit) +Exec.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof(Settings, rlimit) +Exec.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof(Settings, rlimit) +Exec.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof(Settings, rlimit) +Exec.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof(Settings, rlimit) +Exec.Hostname, config_parse_hostname, 0, 
offsetof(Settings, hostname) +Exec.NoNewPrivileges, config_parse_tristate, 0, offsetof(Settings, no_new_privileges) +Exec.OOMScoreAdjust, config_parse_oom_score_adjust, 0, 0 +Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0 +Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf) +Exec.LinkJournal, config_parse_link_journal, 0, 0 +Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone) +Exec.SuppressSync, config_parse_tristate, 0, offsetof(Settings, suppress_sync) +Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) +Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) +Files.Bind, config_parse_bind, 0, 0 +Files.BindReadOnly, config_parse_bind, 1, 0 +Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.Inaccessible, config_parse_inaccessible, 0, 0 +Files.Overlay, config_parse_overlay, 0, 0 +Files.OverlayReadOnly, config_parse_overlay, 1, 0 +Files.PrivateUsersChown, config_parse_userns_chown, 0, offsetof(Settings, userns_ownership) +Files.PrivateUsersOwnership, config_parse_userns_ownership, 0, offsetof(Settings, userns_ownership) +Files.BindUser, config_parse_bind_user, 0, offsetof(Settings, bind_user) +Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) +Network.Interface, config_parse_network_iface_pair, 0, offsetof(Settings, network_interfaces) +Network.MACVLAN, config_parse_macvlan_iface_pair, 0, offsetof(Settings, network_macvlan) +Network.IPVLAN, config_parse_ipvlan_iface_pair, 0, offsetof(Settings, network_ipvlan) +Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) +Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0 +Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge) +Network.Zone, config_parse_network_zone, 0, 0 +Network.Port, config_parse_expose_port, 0, 0 diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c new file mode 100644 index 0000000..470f477 --- 
/dev/null +++ b/src/nspawn/nspawn-mount.c @@ -0,0 +1,1406 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "label-util.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "nspawn-mount.h" +#include "parse-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "set.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "user-util.h" + +CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) { + CustomMount *c, *ret; + + assert(l); + assert(n); + assert(t >= 0); + assert(t < _CUSTOM_MOUNT_TYPE_MAX); + + c = reallocarray(*l, *n + 1, sizeof(CustomMount)); + if (!c) + return NULL; + + *l = c; + ret = *l + *n; + (*n)++; + + *ret = (CustomMount) { + .type = t + }; + + return ret; +} + +void custom_mount_free_all(CustomMount *l, size_t n) { + for (size_t i = 0; i < n; i++) { + CustomMount *m = l + i; + + free(m->source); + free(m->destination); + free(m->options); + + if (m->work_dir) { + (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL); + free(m->work_dir); + } + + if (m->rm_rf_tmpdir) { + (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL); + free(m->rm_rf_tmpdir); + } + + strv_free(m->lower); + free(m->type_argument); + } + + free(l); +} + +static int custom_mount_compare(const CustomMount *a, const CustomMount *b) { + int r; + + r = path_compare(a->destination, b->destination); + if (r != 0) + return r; + + return CMP(a->type, b->type); +} + +static int source_path_parse(const char *p, char **ret) { + assert(p); + assert(ret); + + if (isempty(p)) + return -EINVAL; + + if (*p == '+') { + if (!path_is_absolute(p + 1)) + return -EINVAL; + + char *s = strdup(p); + if (!s) + return -ENOMEM; + + *ret = TAKE_PTR(s); + 
return 0; + } + + return path_make_absolute_cwd(p, ret); +} + +static int source_path_parse_nullable(const char *p, char **ret) { + assert(p); + assert(ret); + + if (isempty(p)) { + *ret = NULL; + return 0; + } + + return source_path_parse(p, ret); +} + +static char *resolve_source_path(const char *dest, const char *source) { + if (!source) + return NULL; + + if (source[0] == '+') + return path_join(dest, source + 1); + + return strdup(source); +} + +static int allocate_temporary_source(CustomMount *m) { + assert(m); + assert(!m->source); + assert(!m->rm_rf_tmpdir); + + m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX"); + if (!m->rm_rf_tmpdir) + return log_oom(); + + if (!mkdtemp(m->rm_rf_tmpdir)) { + m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir); + return log_error_errno(errno, "Failed to acquire temporary directory: %m"); + } + + m->source = path_join(m->rm_rf_tmpdir, "src"); + if (!m->source) + return log_oom(); + + if (mkdir(m->source, 0755) < 0) + return log_error_errno(errno, "Failed to create %s: %m", m->source); + + return 0; +} + +int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) { + int r; + + /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the + * parent process, so that we know the temporary directories to remove on exit before we fork off the + * children. */ + + assert(l || n == 0); + + /* Order the custom mounts, and make sure we have a working directory */ + typesafe_qsort(l, n, custom_mount_compare); + + for (size_t i = 0; i < n; i++) { + CustomMount *m = l + i; + + /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount + * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in + * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in + * the inner child, not the outer one. Determine this here.
*/ + m->in_userns = path_startswith(m->destination, "/proc"); + + if (m->type == CUSTOM_MOUNT_BIND) { + if (m->source) { + char *s; + + s = resolve_source_path(dest, m->source); + if (!s) + return log_oom(); + + free_and_replace(m->source, s); + } else { + /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */ + + r = allocate_temporary_source(m); + if (r < 0) + return r; + } + } + + if (m->type == CUSTOM_MOUNT_OVERLAY) { + STRV_FOREACH(j, m->lower) { + char *s; + + s = resolve_source_path(dest, *j); + if (!s) + return log_oom(); + + free_and_replace(*j, s); + } + + if (m->source) { + char *s; + + s = resolve_source_path(dest, m->source); + if (!s) + return log_oom(); + + free_and_replace(m->source, s); + } else { + r = allocate_temporary_source(m); + if (r < 0) + return r; + } + + if (m->work_dir) { + char *s; + + s = resolve_source_path(dest, m->work_dir); + if (!s) + return log_oom(); + + free_and_replace(m->work_dir, s); + } else { + r = tempfn_random(m->source, NULL, &m->work_dir); + if (r < 0) + return log_error_errno(r, "Failed to acquire working directory: %m"); + } + + (void) mkdir_label(m->work_dir, 0700); + } + } + + return 0; +} + +int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) { + _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL, *p = NULL; + CustomMount *m; + int r; + + assert(l); + assert(n); + + r = extract_many_words(&s, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + if (r == 1) { + destination = strdup(source[0] == '+' ? 
source+1 : source); + if (!destination) + return -ENOMEM; + } + if (r == 2 && !isempty(s)) { + opts = strdup(s); + if (!opts) + return -ENOMEM; + } + + r = source_path_parse_nullable(source, &p); + if (r < 0) + return r; + + if (!path_is_absolute(destination)) + return -EINVAL; + + m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND); + if (!m) + return -ENOMEM; + + m->source = TAKE_PTR(p); + m->destination = TAKE_PTR(destination); + m->read_only = read_only; + m->options = TAKE_PTR(opts); + + return 0; +} + +int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) { + _cleanup_free_ char *path = NULL, *opts = NULL; + const char *p = ASSERT_PTR(s); + CustomMount *m; + int r; + + assert(l); + assert(n); + + r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if (isempty(p)) + opts = strdup("mode=0755"); + else + opts = strdup(p); + if (!opts) + return -ENOMEM; + + if (!path_is_absolute(path)) + return -EINVAL; + + m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS); + if (!m) + return -ENOMEM; + + m->destination = TAKE_PTR(path); + m->options = TAKE_PTR(opts); + + return 0; +} + +int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) { + _cleanup_free_ char *upper = NULL, *destination = NULL; + _cleanup_strv_free_ char **lower = NULL; + CustomMount *m; + int r, k; + + k = strv_split_full(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (k < 0) + return k; + if (k < 2) + return -EADDRNOTAVAIL; + if (k == 2) { + _cleanup_free_ char *p = NULL; + + /* If two parameters are specified, the first one is the lower, the second one the upper directory. And + * we'll also define the destination mount point the same as the upper. 
*/ + + r = source_path_parse(lower[0], &p); + if (r < 0) + return r; + + free_and_replace(lower[0], p); + + r = source_path_parse(lower[1], &p); + if (r < 0) + return r; + + free_and_replace(lower[1], p); + + upper = TAKE_PTR(lower[1]); + + destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */ + if (!destination) + return -ENOMEM; + } else { + _cleanup_free_ char *p = NULL; + + /* If more than two parameters are specified, the last one is the destination, the second to last one + * the "upper", and all before that the "lower" directories. */ + + destination = lower[k - 1]; + upper = TAKE_PTR(lower[k - 2]); + + STRV_FOREACH(i, lower) { + r = source_path_parse(*i, &p); + if (r < 0) + return r; + + free_and_replace(*i, p); + } + + /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory + * in /var/tmp */ + r = source_path_parse_nullable(upper, &p); + if (r < 0) + return r; + + free_and_replace(upper, p); + + if (!path_is_absolute(destination)) + return -EINVAL; + } + + m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY); + if (!m) + return -ENOMEM; + + m->destination = TAKE_PTR(destination); + m->source = TAKE_PTR(upper); + m->lower = TAKE_PTR(lower); + m->read_only = read_only; + + return 0; +} + +int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) { + _cleanup_free_ char *path = NULL; + CustomMount *m; + + assert(l); + assert(n); + assert(s); + + if (!path_is_absolute(s)) + return -EINVAL; + + path = strdup(s); + if (!path) + return -ENOMEM; + + m = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE); + if (!m) + return -ENOMEM; + + m->destination = TAKE_PTR(path); + return 0; +} + +int tmpfs_patch_options( + const char *options, + uid_t uid_shift, + const char *selinux_apifs_context, + char **ret) { + + _cleanup_free_ char *buf = NULL; + + assert(ret); + + if (options) { + buf = strdup(options); + if (!buf) + return -ENOMEM; + } + + if (uid_shift != 
UID_INVALID) + if (strextendf_with_separator(&buf, ",", "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift) < 0) + return -ENOMEM; + +#if HAVE_SELINUX + if (selinux_apifs_context) + if (strextendf_with_separator(&buf, ",", "context=\"%s\"", selinux_apifs_context) < 0) + return -ENOMEM; +#endif + + *ret = TAKE_PTR(buf); + return !!*ret; +} + +int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { + const char *full, *top; + int r; + unsigned long extra_flags = 0; + + top = prefix_roota(dest, "/sys"); + r = path_is_fs_type(top, SYSFS_MAGIC); + if (r < 0) + return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top); + /* /sys might already be mounted as sysfs by the outer child in the + * !netns case. In this case, it's all good. Don't touch it because we + * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555. + */ + if (r > 0) + return 0; + + full = prefix_roota(top, "/full"); + + (void) mkdir(full, 0755); + + if (FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO)) + extra_flags |= MS_RDONLY; + + r = mount_nofollow_verbose(LOG_ERR, "sysfs", full, "sysfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL); + if (r < 0) + return r; + + FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") { + _cleanup_free_ char *from = NULL, *to = NULL; + + from = path_join(full, x); + if (!from) + return log_oom(); + + to = path_join(top, x); + if (!to) + return log_oom(); + + (void) mkdir(to, 0755); + + r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL); + if (r < 0) + return r; + + r = mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, + MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); + if (r < 0) + return r; + } + + r = umount_verbose(LOG_ERR, full, UMOUNT_NOFOLLOW); + if (r < 0) + return r; + + if (rmdir(full) < 0) + return log_error_errno(errno, "Failed to remove %s: %m", full); + + /* Create mountpoint for cgroups. 
Otherwise we are not allowed since we + * remount /sys read-only. + */ + const char *x = prefix_roota(top, "/fs/cgroup"); + (void) mkdir_p(x, 0755); + + return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL, + MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); +} + +#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV) +#define SYS_DEFAULT_MOUNT_FLAGS (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV) + +int mount_all(const char *dest, + MountSettingsMask mount_settings, + uid_t uid_shift, + const char *selinux_apifs_context) { + +#define PROC_INACCESSIBLE_REG(path) \ + { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \ + { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */ + +#define PROC_READ_ONLY(path) \ + { (path), (path), NULL, NULL, MS_BIND, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \ + { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */ + + typedef struct MountPoint { + const char *what; + const char *where; + const char *type; + const char *options; + unsigned long flags; + MountSettingsMask mount_settings; + } MountPoint; + + static const MountPoint mount_table[] = { + /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */ + { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */ + + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... 
*/ + + { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */ + + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */ + + /* Make these files inaccessible to container payloads: they potentially leak information about kernel + * internals or the host's execution environment to the container */ + PROC_INACCESSIBLE_REG("/proc/kallsyms"), + PROC_INACCESSIBLE_REG("/proc/kcore"), + PROC_INACCESSIBLE_REG("/proc/keys"), + PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"), + PROC_INACCESSIBLE_REG("/proc/timer_list"), + + /* Make these directories read-only to container payloads: they show hardware information, and in some + * cases contain tunables the container really shouldn't have access to. */ + PROC_READ_ONLY("/proc/acpi"), + PROC_READ_ONLY("/proc/apm"), + PROC_READ_ONLY("/proc/asound"), + PROC_READ_ONLY("/proc/bus"), + PROC_READ_ONLY("/proc/fs"), + PROC_READ_ONLY("/proc/irq"), + PROC_READ_ONLY("/proc/scsi"), + + { "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_IN_USERNS|MOUNT_MKDIR }, + + /* Then we list outer child mounts (i.e. 
mounts applied *before* entering user namespacing) */ + { "tmpfs", "/tmp", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME, + MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR }, + { "tmpfs", "/sys", "tmpfs", "mode=0555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR }, + { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS, + MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */ + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */ + { "tmpfs", "/dev", "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME, + MOUNT_FATAL|MOUNT_MKDIR }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME, + MOUNT_FATAL|MOUNT_MKDIR }, + { "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, + MOUNT_FATAL|MOUNT_MKDIR }, + { "/run/host", "/run/host", NULL, NULL, MS_BIND, + MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PREFIX_ROOT }, /* Prepare this so that we can make it read-only when we are done */ + { "/etc/os-release", "/run/host/os-release", NULL, NULL, MS_BIND, + MOUNT_TOUCH }, /* As per kernel interface requirements, bind mount first (creating mount points) and make read-only later */ + { "/usr/lib/os-release", "/run/host/os-release", NULL, NULL, MS_BIND, + MOUNT_FATAL }, /* If /etc/os-release doesn't exist use the version in /usr/lib as fallback */ + { NULL, "/run/host/os-release", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + MOUNT_FATAL }, + { NULL, "/run/host/os-release", NULL, NULL, MS_PRIVATE, + MOUNT_FATAL }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */ + { NULL, "/run/host", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + MOUNT_FATAL|MOUNT_IN_USERNS }, +#if HAVE_SELINUX + { "/sys/fs/selinux", 
"/sys/fs/selinux", NULL, NULL, MS_BIND, + MOUNT_MKDIR }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + 0 }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_PRIVATE, + 0 }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */ +#endif + }; + + bool use_userns = FLAGS_SET(mount_settings, MOUNT_USE_USERNS); + bool netns = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_NETNS); + bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO); + bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS); + bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP); + int r; + + for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) { + _cleanup_free_ char *where = NULL, *options = NULL, *prefixed = NULL; + bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL); + const char *o; + + if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS)) + continue; + + if (!netns && FLAGS_SET(mount_table[k].mount_settings, MOUNT_APPLY_APIVFS_NETNS)) + continue; + + if (!ro && FLAGS_SET(mount_table[k].mount_settings, MOUNT_APPLY_APIVFS_RO)) + continue; + + if (!tmpfs_tmp && FLAGS_SET(mount_table[k].mount_settings, MOUNT_APPLY_TMPFS_TMP)) + continue; + + r = chase(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s%s: %m", strempty(dest), mount_table[k].where); + + /* Skip this entry if it is not a remount. 
*/ + if (mount_table[k].what) { + r = path_is_mount_point(where, NULL, 0); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); + if (r > 0) + continue; + } + + if ((mount_table[k].mount_settings & (MOUNT_MKDIR|MOUNT_TOUCH)) != 0) { + uid_t u = (use_userns && !in_userns) ? uid_shift : UID_INVALID; + + if (FLAGS_SET(mount_table[k].mount_settings, MOUNT_TOUCH)) + r = mkdir_parents_safe(dest, where, 0755, u, u, 0); + else + r = mkdir_p_safe(dest, where, 0755, u, u, 0); + if (r < 0 && r != -EEXIST) { + if (fatal && r != -EROFS) + return log_error_errno(r, "Failed to create directory %s: %m", where); + + log_debug_errno(r, "Failed to create directory %s: %m", where); + + /* If we failed mkdir() or chown() due to the root directory being read only, + * attempt to mount this fs anyway and let mount_verbose log any errors */ + if (r != -EROFS) + continue; + } + } + + if (FLAGS_SET(mount_table[k].mount_settings, MOUNT_TOUCH)) { + r = touch(where); + if (r < 0 && r != -EEXIST) { + if (fatal && r != -EROFS) + return log_error_errno(r, "Failed to create file %s: %m", where); + + log_debug_errno(r, "Failed to create file %s: %m", where); + if (r != -EROFS) + continue; + } + } + + o = mount_table[k].options; + if (streq_ptr(mount_table[k].type, "tmpfs")) { + r = tmpfs_patch_options(o, in_userns ? 0 : uid_shift, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + if (r > 0) + o = options; + } + + if (FLAGS_SET(mount_table[k].mount_settings, MOUNT_PREFIX_ROOT)) { + /* Optionally prefix the mount source with the root dir. This is useful in bind + * mounts to be created within the container image before we transition into it. Note + * that MOUNT_IN_USERNS is run after we transitioned hence prefixing is not necessary + * for those. 
*/ + r = chase(mount_table[k].what, dest, CHASE_PREFIX_ROOT, &prefixed, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s%s: %m", strempty(dest), mount_table[k].what); + } + + r = mount_verbose_full( + fatal ? LOG_ERR : LOG_DEBUG, + prefixed ?: mount_table[k].what, + where, + mount_table[k].type, + mount_table[k].flags, + o, + FLAGS_SET(mount_table[k].mount_settings, MOUNT_FOLLOW_SYMLINKS)); + if (r < 0 && fatal) + return r; + } + + return 0; +} + +static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, RemountIdmapping *idmapping) { + unsigned long flags = *mount_flags; + char *opts = NULL; + RemountIdmapping new_idmapping = *idmapping; + int r; + + assert(options); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&options, &word, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to extract mount option: %m"); + if (r == 0) + break; + + if (streq(word, "rbind")) + flags |= MS_REC; + else if (streq(word, "norbind")) + flags &= ~MS_REC; + else if (streq(word, "idmap")) + new_idmapping = REMOUNT_IDMAPPING_HOST_ROOT; + else if (streq(word, "noidmap")) + new_idmapping = REMOUNT_IDMAPPING_NONE; + else if (streq(word, "rootidmap")) + new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid bind mount option: %s", word); + } + + *mount_flags = flags; + *idmapping = new_idmapping; + /* in the future mount_opts will hold string options for mount(2) */ + *mount_opts = opts; + + return 0; +} + +static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t uid_range) { + _cleanup_free_ char *mount_opts = NULL, *where = NULL; + unsigned long mount_flags = MS_BIND | MS_REC; + struct stat source_st, dest_st; + int r; + RemountIdmapping idmapping = REMOUNT_IDMAPPING_NONE; + + assert(dest); + assert(m); + + if (m->options) { + r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts, &idmapping); 
+ if (r < 0) + return r; + } + + /* If this is a bind mount from a temporary source, change ownership of the source to the container's + * root UID. Otherwise it would always show up as "nobody" if user namespacing is used. */ + if (m->rm_rf_tmpdir && chown(m->source, uid_shift, uid_shift) < 0) + return log_error_errno(errno, "Failed to chown %s: %m", m->source); + + if (stat(m->source, &source_st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", m->source); + + r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination); + if (r > 0) { /* Path exists already? */ + + if (stat(where, &dest_st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", where); + + if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot bind mount directory %s on file %s.", + m->source, where); + + if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot bind mount file %s on directory %s.", + m->source, where); + + } else { /* Path doesn't exist yet? */ + r = mkdir_parents_safe_label(dest, where, 0755, uid_shift, uid_shift, MKDIR_IGNORE_EXISTING); + if (r < 0) + return log_error_errno(r, "Failed to make parents of %s: %m", where); + + /* Create the mount point. Any non-directory file can be + * mounted on any non-directory file (regular, fifo, socket, + * char, block). 
+ */ + if (S_ISDIR(source_st.st_mode)) + r = mkdir_label(where, 0755); + else + r = touch(where); + if (r < 0) + return log_error_errno(r, "Failed to create mount point %s: %m", where); + + if (chown(where, uid_shift, uid_shift) < 0) + return log_error_errno(errno, "Failed to chown %s: %m", where); + } + + r = mount_nofollow_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts); + if (r < 0) + return r; + + if (m->read_only) { + r = bind_remount_recursive(where, MS_RDONLY, MS_RDONLY, NULL); + if (r < 0) + return log_error_errno(r, "Read-only bind mount failed: %m"); + } + + if (idmapping != REMOUNT_IDMAPPING_NONE) { + r = remount_idmap(STRV_MAKE(where), uid_shift, uid_range, source_st.st_uid, idmapping); + if (r < 0) + return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where); + } + + return 0; +} + +static int mount_tmpfs(const char *dest, CustomMount *m, uid_t uid_shift, const char *selinux_apifs_context) { + const char *options; + _cleanup_free_ char *buf = NULL, *where = NULL; + int r; + + assert(dest); + assert(m); + + r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination); + if (r == 0) { /* Doesn't exist yet? */ + r = mkdir_p_label(where, 0755); + if (r < 0) + return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where); + } + + r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); + if (r < 0) + return log_oom(); + options = r > 0 ? 
buf : m->options; + + return mount_nofollow_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options); +} + +static char *joined_and_escaped_lower_dirs(char **lower) { + _cleanup_strv_free_ char **sv = NULL; + + sv = strv_copy(lower); + if (!sv) + return NULL; + + strv_reverse(sv); + + if (!strv_shell_escape(sv, ",:")) + return NULL; + + return strv_join(sv, ":"); +} + +static int mount_overlay(const char *dest, CustomMount *m) { + _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL; + const char *options; + int r; + + assert(dest); + assert(m); + + r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination); + if (r == 0) { /* Doesn't exist yet? */ + r = mkdir_label(where, 0755); + if (r < 0) + return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where); + } + + (void) mkdir_p_label(m->source, 0755); + + lower = joined_and_escaped_lower_dirs(m->lower); + if (!lower) + return log_oom(); + + escaped_source = shell_escape(m->source, ",:"); + if (!escaped_source) + return log_oom(); + + if (m->read_only) + options = strjoina("lowerdir=", escaped_source, ":", lower); + else { + _cleanup_free_ char *escaped_work_dir = NULL; + + escaped_work_dir = shell_escape(m->work_dir, ",:"); + if (!escaped_work_dir) + return log_oom(); + + options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir); + } + + return mount_nofollow_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options); +} + +static int mount_inaccessible(const char *dest, CustomMount *m) { + _cleanup_free_ char *where = NULL, *source = NULL; + struct stat st; + int r; + + assert(dest); + assert(m); + + r = chase_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &where, &st); + if (r < 0) { + log_full_errno(m->graceful ? 
LOG_DEBUG : LOG_ERR, r, "Failed to resolve %s/%s: %m", dest, m->destination); + return m->graceful ? 0 : r; + } + + r = mode_to_inaccessible_node(NULL, st.st_mode, &source); + if (r < 0) + return m->graceful ? 0 : r; + + r = mount_nofollow_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, source, where, NULL, MS_BIND, NULL); + if (r < 0) + return m->graceful ? 0 : r; + + r = mount_nofollow_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, NULL, where, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL); + if (r < 0) { + (void) umount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, where, UMOUNT_NOFOLLOW); + return m->graceful ? 0 : r; + } + + return 0; +} + +static int mount_arbitrary(const char *dest, CustomMount *m) { + _cleanup_free_ char *where = NULL; + int r; + + assert(dest); + assert(m); + + r = chase(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination); + if (r == 0) { /* Doesn't exist yet? */ + r = mkdir_p_label(where, 0755); + if (r < 0) + return log_error_errno(r, "Creating mount point for mount %s failed: %m", where); + } + + return mount_nofollow_verbose(LOG_ERR, m->source, where, m->type_argument, 0, m->options); +} + +int mount_custom( + const char *dest, + CustomMount *mounts, size_t n, + uid_t uid_shift, + uid_t uid_range, + const char *selinux_apifs_context, + MountSettingsMask mount_settings) { + int r; + + assert(dest); + + for (size_t i = 0; i < n; i++) { + CustomMount *m = mounts + i; + + if (FLAGS_SET(mount_settings, MOUNT_IN_USERNS) != m->in_userns) + continue; + + if (FLAGS_SET(mount_settings, MOUNT_ROOT_ONLY) && !path_equal(m->destination, "/")) + continue; + + if (FLAGS_SET(mount_settings, MOUNT_NON_ROOT_ONLY) && path_equal(m->destination, "/")) + continue; + + switch (m->type) { + + case CUSTOM_MOUNT_BIND: + r = mount_bind(dest, m, uid_shift, uid_range); + break; + + case CUSTOM_MOUNT_TMPFS: + r = mount_tmpfs(dest, m, uid_shift, selinux_apifs_context); 
+ break; + + case CUSTOM_MOUNT_OVERLAY: + r = mount_overlay(dest, m); + break; + + case CUSTOM_MOUNT_INACCESSIBLE: + r = mount_inaccessible(dest, m); + break; + + case CUSTOM_MOUNT_ARBITRARY: + r = mount_arbitrary(dest, m); + break; + + default: + assert_not_reached(); + } + + if (r < 0) + return r; + } + + return 0; +} + +/* Returns true if at least one of the n custom mounts has the root directory ("/") as its destination. */ +bool has_custom_root_mount(const CustomMount *mounts, size_t n) { + for (size_t i = 0; i < n; i++) + if (path_equal(mounts[i].destination, "/")) + return true; + + return false; +} + +/* Implements --volatile=state: recursively remounts the tree at 'directory' read-only, makes sure + * <directory>/var exists and mounts a fresh tmpfs on top of it. Earlier failures are logged and returned; + * otherwise the result of the final tmpfs mount is returned. */ +static int setup_volatile_state(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) { + _cleanup_free_ char *buf = NULL; + const char *p, *options; + int r; + + assert(directory); + + /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */ + + r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL); + if (r < 0) + return log_error_errno(r, "Failed to remount %s read-only: %m", directory); + + p = prefix_roota(directory, "/var"); + r = mkdir(p, 0755); + if (r < 0 && errno != EEXIST) + /* Report the path we actually tried to create, not the image root */ + return log_error_errno(errno, "Failed to create %s: %m", p); + + options = "mode=0755" TMPFS_LIMITS_VOLATILE_STATE; + /* A shift of 0 needs no uid=/gid= options, hence pass UID_INVALID in that case */ + r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); + if (r < 0) + return log_oom(); + if (r > 0) + options = buf; + + return mount_nofollow_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options); +} + +static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) { + bool tmpfs_mounted = false, bind_mounted = false; + char template[] = "/tmp/nspawn-volatile-XXXXXX"; + _cleanup_free_ char *buf = NULL, *bindir = NULL; + const char *f, *t, *options; + struct stat st; + int r; + + assert(directory); + + /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and + * that read-only. 
Before we start setting this up let's validate if the image has the /usr merge + * implemented, and let's output a friendly log message if it hasn't. */ + + bindir = path_join(directory, "/bin"); + if (!bindir) + return log_oom(); + if (lstat(bindir, &st) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to stat /bin directory below image: %m"); + + /* ENOENT is fine, just means the image is probably just a naked /usr and we can create the + * rest. */ + } else if (S_ISDIR(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), + "Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. " + "Please work with your distribution and help them adopt the merged /usr scheme."); + else if (!S_ISLNK(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically)."); + + if (!mkdtemp(template)) + return log_error_errno(errno, "Failed to create temporary directory: %m"); + + options = "mode=0755" TMPFS_LIMITS_ROOTFS; + r = tmpfs_patch_options(options, uid_shift == 0 ? 
UID_INVALID : uid_shift, selinux_apifs_context, &buf); + if (r < 0) { + /* tmpfs_patch_options() only fails on allocation failure; log it like the other callers in this file do */ + r = log_oom(); + goto fail; + } + if (r > 0) + options = buf; + + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options); + if (r < 0) + goto fail; + + tmpfs_mounted = true; + + f = prefix_roota(directory, "/usr"); + t = prefix_roota(template, "/usr"); + + r = mkdir(t, 0755); + if (r < 0 && errno != EEXIST) { + r = log_error_errno(errno, "Failed to create %s: %m", t); + goto fail; + } + + r = mount_nofollow_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + goto fail; + + bind_mounted = true; + + r = bind_remount_recursive(t, MS_RDONLY, MS_RDONLY, NULL); + if (r < 0) { + log_error_errno(r, "Failed to remount %s read-only: %m", t); + goto fail; + } + + /* Finally, move the prepared tmpfs (with the read-only /usr inside) over the root directory */ + r = mount_nofollow_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL); + if (r < 0) + goto fail; + + (void) rmdir(template); + + return 0; + +fail: + /* Undo in reverse order whatever we managed to set up */ + if (bind_mounted) + (void) umount_verbose(LOG_ERR, t, UMOUNT_NOFOLLOW); + + if (tmpfs_mounted) + (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW); + + (void) rmdir(template); + return r; +} + +/* Implements --volatile=overlay: mounts a tmpfs on a temporary directory, creates "upper" and "work" + * directories in it and mounts an overlayfs on 'directory' that uses 'directory' itself as lower layer. The + * temporary mount point is unmounted and removed again before returning, on success and on failure. */ +static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) { + _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL; + char template[] = "/tmp/nspawn-volatile-XXXXXX"; + const char *upper, *work, *options; + bool tmpfs_mounted = false; + int r; + + assert(directory); + + /* --volatile=overlay means we mount an overlayfs to the root dir. */ + + if (!mkdtemp(template)) + return log_error_errno(errno, "Failed to create temporary directory: %m"); + + options = "mode=0755" TMPFS_LIMITS_ROOTFS; + r = tmpfs_patch_options(options, uid_shift == 0 ? 
UID_INVALID : uid_shift, selinux_apifs_context, &buf); + if (r < 0) + goto finish; + if (r > 0) + options = buf; + + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options); + if (r < 0) + goto finish; + + tmpfs_mounted = true; + + upper = strjoina(template, "/upper"); + work = strjoina(template, "/work"); + + if (mkdir(upper, 0755) < 0) { + r = log_error_errno(errno, "Failed to create %s: %m", upper); + goto finish; + } + if (mkdir(work, 0755) < 0) { + r = log_error_errno(errno, "Failed to create %s: %m", work); + goto finish; + } + + /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice + * that the kernel allows us to do that without going through some mount point rearrangements. */ + + escaped_directory = shell_escape(directory, ",:"); + escaped_upper = shell_escape(upper, ",:"); + escaped_work = shell_escape(work, ",:"); + if (!escaped_directory || !escaped_upper || !escaped_work) { + r = -ENOMEM; + goto finish; + } + + options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work); + r = mount_nofollow_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options); + +finish: + if (tmpfs_mounted) + (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW); + + (void) rmdir(template); + return r; +} + +int setup_volatile_mode( + const char *directory, + VolatileMode mode, + uid_t uid_shift, + const char *selinux_apifs_context) { + + switch (mode) { + + case VOLATILE_YES: + return setup_volatile_yes(directory, uid_shift, selinux_apifs_context); + + case VOLATILE_STATE: + return setup_volatile_state(directory, uid_shift, selinux_apifs_context); + + case VOLATILE_OVERLAY: + return setup_volatile_overlay(directory, uid_shift, selinux_apifs_context); + + default: + return 0; + } +} + +/* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. 
*/ +int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) { + _cleanup_free_ char *root_new = NULL, *root_old = NULL; + const char *p = s; + int r; + + assert(pivot_root_new); + assert(pivot_root_old); + + r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if (isempty(p)) + root_old = NULL; + else { + root_old = strdup(p); + if (!root_old) + return -ENOMEM; + } + + if (!path_is_absolute(root_new)) + return -EINVAL; + if (root_old && !path_is_absolute(root_old)) + return -EINVAL; + + free_and_replace(*pivot_root_new, root_new); + free_and_replace(*pivot_root_old, root_old); + + return 0; +} + +int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) { + _cleanup_free_ char *directory_pivot_root_new = NULL; + _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL; + char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX"; + bool remove_pivot_tmp = false; + int r; + + assert(directory); + + if (!pivot_root_new) + return 0; + + /* Pivot pivot_root_new to / and the existing / to pivot_root_old. + * If pivot_root_old is NULL, the existing / disappears. + * This requires a temporary directory, pivot_tmp, which is + * not a child of either. + * + * This is typically used for OSTree-style containers, where the root partition contains several + * sysroots which could be run. Normally, one would be chosen by the bootloader and pivoted to / by + * initrd. + * + * For example, for an OSTree deployment, pivot_root_new + * would be: /ostree/deploy/$os/deploy/$checksum. Note that this + * code doesn’t do the /var mount which OSTree expects: use + * --bind +/sysroot/ostree/deploy/$os/var:/var for that. 
+ * + * So in the OSTree case, we’ll end up with something like: + * - directory = /tmp/nspawn-root-123456 + * - pivot_root_new = /ostree/deploy/os/deploy/123abc + * - pivot_root_old = /sysroot + * - directory_pivot_root_new = + * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc + * - pivot_tmp = /tmp/nspawn-pivot-123456 + * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot + * + * Requires all file systems at directory and below to be mounted + * MS_PRIVATE or MS_SLAVE so they can be moved. + */ + directory_pivot_root_new = path_join(directory, pivot_root_new); + if (!directory_pivot_root_new) + return log_oom(); + + /* Remount directory_pivot_root_new to make it movable. */ + r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL); + if (r < 0) + goto done; + + if (pivot_root_old) { + if (!mkdtemp(pivot_tmp)) { + r = log_error_errno(errno, "Failed to create temporary directory: %m"); + goto done; + } + + remove_pivot_tmp = true; + pivot_tmp_pivot_root_old = path_join(pivot_tmp, pivot_root_old); + if (!pivot_tmp_pivot_root_old) { + r = log_oom(); + goto done; + } + + r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL); + if (r < 0) + goto done; + + r = mount_nofollow_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL); + if (r < 0) + goto done; + + r = mount_nofollow_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL); + if (r < 0) + goto done; + } else { + r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL); + if (r < 0) + goto done; + } + +done: + if (remove_pivot_tmp) + (void) rmdir(pivot_tmp); + + return r; +} + +#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc" +#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys" + +int pin_fully_visible_fs(void) { + int r; + + (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755); + (void) 
mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755); + + r = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL); + if (r < 0) + return r; + + r = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL); + if (r < 0) + return r; + + return 0; +} + +static int do_wipe_fully_visible_fs(void) { + if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH) < 0) + return log_error_errno(errno, "Failed to unmount temporary proc: %m"); + + if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS) < 0) + return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m"); + + if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH) < 0) + return log_error_errno(errno, "Failed to unmount temporary sys: %m"); + + if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS) < 0) + return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m"); + + return 0; +} + +int wipe_fully_visible_fs(int mntns_fd) { + _cleanup_close_ int orig_mntns_fd = -EBADF; + int r, rr; + + r = namespace_open(0, NULL, &orig_mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to pin originating mount namespace: %m"); + + r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF); + if (r < 0) + return log_error_errno(r, "Failed to enter mount namespace: %m"); + + rr = do_wipe_fully_visible_fs(); + + r = namespace_enter(-EBADF, orig_mntns_fd, -EBADF, -EBADF, -EBADF); + if (r < 0) + return log_error_errno(r, "Failed to enter original mount namespace: %m"); + + return rr; +} diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h new file mode 100644 index 0000000..bf5e47d --- /dev/null +++ b/src/nspawn/nspawn-mount.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "cgroup-util.h" +#include "volatile-util.h" + +typedef enum MountSettingsMask { + MOUNT_FATAL = 1 << 0, /* if set, a mount 
error is considered fatal */ + MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */ + MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */ + MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */ + MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write. + Works only if MOUNT_APPLY_APIVFS_RO is also set. */ + MOUNT_APPLY_TMPFS_TMP = 1 << 5, /* if set, /tmp will be mounted as tmpfs */ + MOUNT_ROOT_ONLY = 1 << 6, /* if set, only root mounts are mounted */ + MOUNT_NON_ROOT_ONLY = 1 << 7, /* if set, only non-root mounts are mounted */ + MOUNT_MKDIR = 1 << 8, /* if set, make directory to mount over first */ + MOUNT_TOUCH = 1 << 9, /* if set, touch file to mount over first */ + MOUNT_PREFIX_ROOT = 1 << 10,/* if set, prefix the source path with the container's root directory */ + MOUNT_FOLLOW_SYMLINKS = 1 << 11,/* if set, we'll follow symlinks for the mount target */ +} MountSettingsMask; + +typedef enum CustomMountType { + CUSTOM_MOUNT_BIND, + CUSTOM_MOUNT_TMPFS, + CUSTOM_MOUNT_OVERLAY, + CUSTOM_MOUNT_INACCESSIBLE, + CUSTOM_MOUNT_ARBITRARY, + _CUSTOM_MOUNT_TYPE_MAX, + _CUSTOM_MOUNT_TYPE_INVALID = -EINVAL, +} CustomMountType; + +typedef struct CustomMount { + CustomMountType type; + bool read_only; + char *source; /* for overlayfs this is the upper directory */ + char *destination; + char *options; + char *work_dir; + char **lower; + char *rm_rf_tmpdir; + char *type_argument; /* only for CUSTOM_MOUNT_ARBITRARY */ + bool graceful; + bool in_userns; +} CustomMount; + +CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t); +void custom_mount_free_all(CustomMount *l, size_t n); +int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n); + +int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); +int 
tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s); +int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); +int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s); + +int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, const char *selinux_apifs_context); +int mount_sysfs(const char *dest, MountSettingsMask mount_settings); + +int mount_custom(const char *dest, CustomMount *mounts, size_t n, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, MountSettingsMask mount_settings); +bool has_custom_root_mount(const CustomMount *mounts, size_t n); + +int setup_volatile_mode(const char *directory, VolatileMode mode, uid_t uid_shift, const char *selinux_apifs_context); + +int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s); +int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old); + +int tmpfs_patch_options(const char *options,uid_t uid_shift, const char *selinux_apifs_context, char **ret); +int pin_fully_visible_fs(void); +int wipe_fully_visible_fs(int mntns_fd); diff --git a/src/nspawn/nspawn-network.c b/src/nspawn/nspawn-network.c new file mode 100644 index 0000000..c661f1d --- /dev/null +++ b/src/nspawn/nspawn-network.c @@ -0,0 +1,815 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" +#include "sd-id128.h" +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "hexdecoct.h" +#include "lock-util.h" +#include "missing_network.h" +#include "netif-naming-scheme.h" +#include "netlink-util.h" +#include "nspawn-network.h" +#include "parse-util.h" +#include "siphash24.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "udev-util.h" + +#define HOST_HASH_KEY 
SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1) +#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2) +#define VETH_EXTRA_HOST_HASH_KEY SD_ID128_MAKE(48,c7,f6,b7,ea,9d,4c,9e,b7,28,d4,de,91,d5,bf,66) +#define VETH_EXTRA_CONTAINER_HASH_KEY SD_ID128_MAKE(af,50,17,61,ce,f9,4d,35,84,0d,2b,20,54,be,ce,59) +#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f) +#define SHORTEN_IFNAME_HASH_KEY SD_ID128_MAKE(e1,90,a4,04,a8,ef,4b,51,8c,cc,c3,3a,9f,11,fc,a2) + +static int remove_one_link(sd_netlink *rtnl, const char *name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + if (isempty(name)) + return 0; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_DELLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, name); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r == -ENODEV) /* Already gone */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to remove interface %s: %m", name); + + return 1; +} + +static int generate_mac( + const char *machine_name, + struct ether_addr *mac, + sd_id128_t hash_key, + uint64_t idx) { + + uint64_t result; + size_t l, sz; + uint8_t *v, *i; + int r; + + l = strlen(machine_name); + sz = sizeof(sd_id128_t) + l; + if (idx > 0) + sz += sizeof(idx); + + v = newa(uint8_t, sz); + + /* fetch some persistent data unique to the host */ + r = sd_id128_get_machine((sd_id128_t*) v); + if (r < 0) + return r; + + /* combine with some data unique (on this host) to this + * container instance */ + i = mempcpy(v + sizeof(sd_id128_t), machine_name, l); + if (idx > 0) { + idx = htole64(idx); + memcpy(i, &idx, sizeof(idx)); + } + + /* Let's hash the host machine ID plus the container name. We + * use a fixed, but originally randomly created hash key here. 
*/ + result = htole64(siphash24(v, sz, hash_key.bytes)); + + assert_cc(ETH_ALEN <= sizeof(result)); + memcpy(mac->ether_addr_octet, &result, ETH_ALEN); + + /* see eth_random_addr in the kernel */ + mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */ + mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */ + + return 0; +} + +static int set_alternative_ifname(sd_netlink *rtnl, const char *ifname, const char *altifname) { + int r; + + assert(rtnl); + assert(ifname); + + if (!altifname) + return 0; + + if (strlen(altifname) >= ALTIFNAMSIZ) + return log_warning_errno(SYNTHETIC_ERRNO(ERANGE), + "Alternative interface name '%s' for '%s' is too long, ignoring", + altifname, ifname); + + r = rtnl_set_link_alternative_names_by_ifname(&rtnl, ifname, STRV_MAKE(altifname)); + if (r < 0) + return log_warning_errno(r, + "Failed to set alternative interface name '%s' to '%s', ignoring: %m", + altifname, ifname); + + return 0; +} + +static int add_veth( + sd_netlink *rtnl, + pid_t pid, + const char *ifname_host, + const char *altifname_host, + const struct ether_addr *mac_host, + const char *ifname_container, + const struct ether_addr *mac_container) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(rtnl); + assert(ifname_host); + assert(mac_host); + assert(ifname_container); + assert(mac_container); + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = 
sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container(m, VETH_INFO_PEER); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_container); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_container); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new veth interfaces (%s:%s): %m", ifname_host, ifname_container); + + (void) set_alternative_ifname(rtnl, ifname_host, altifname_host); + + return 0; +} + +static int shorten_ifname(char *ifname) { + char new_ifname[IFNAMSIZ]; + + assert(ifname); + + if (strlen(ifname) < IFNAMSIZ) /* Name is short enough */ + return 0; + + if (naming_scheme_has(NAMING_NSPAWN_LONG_HASH)) { + uint64_t h; + + /* Calculate 64-bit hash value */ + h = siphash24(ifname, strlen(ifname), SHORTEN_IFNAME_HASH_KEY.bytes); + + /* Set the final four bytes (i.e. 
32-bit) to the lower 24bit of the hash, encoded in url-safe base64 */ + memcpy(new_ifname, ifname, IFNAMSIZ - 5); + new_ifname[IFNAMSIZ - 5] = urlsafe_base64char(h >> 18); + new_ifname[IFNAMSIZ - 4] = urlsafe_base64char(h >> 12); + new_ifname[IFNAMSIZ - 3] = urlsafe_base64char(h >> 6); + new_ifname[IFNAMSIZ - 2] = urlsafe_base64char(h); + } else + /* On old nspawn versions we just truncated the name, provide compatibility */ + memcpy(new_ifname, ifname, IFNAMSIZ-1); + + new_ifname[IFNAMSIZ - 1] = 0; + + /* Log the incident to make it more discoverable */ + log_warning("Network interface name '%s' has been changed to '%s' to fit length constraints.", ifname, new_ifname); + + strcpy(ifname, new_ifname); + return 1; +} + +int setup_veth(const char *machine_name, + pid_t pid, + char iface_name[IFNAMSIZ], + bool bridge, + const struct ether_addr *provided_mac) { + + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + struct ether_addr mac_host, mac_container; + unsigned u; + char *n, *a = NULL; + int r; + + assert(machine_name); + assert(pid > 0); + assert(iface_name); + + /* Use two different interface name prefixes depending whether + * we are in bridge mode or not. */ + n = strjoina(bridge ? "vb-" : "ve-", machine_name); + r = shorten_ifname(n); + if (r > 0) + a = strjoina(bridge ? 
"vb-" : "ve-", machine_name); + + if (ether_addr_is_null(provided_mac)){ + r = generate_mac(machine_name, &mac_container, CONTAINER_HASH_KEY, 0); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m"); + } else + mac_container = *provided_mac; + + r = generate_mac(machine_name, &mac_host, HOST_HASH_KEY, 0); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m"); + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = add_veth(rtnl, pid, n, a, &mac_host, "host0", &mac_container); + if (r < 0) + return r; + + u = if_nametoindex(n); /* We don't need to use rtnl_resolve_ifname() here because the + * name we assigned is always the main name. */ + if (u == 0) + return log_error_errno(errno, "Failed to resolve interface %s: %m", n); + + strcpy(iface_name, n); + return (int) u; +} + +int setup_veth_extra( + const char *machine_name, + pid_t pid, + char **pairs) { + + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + uint64_t idx = 0; + int r; + + assert(machine_name); + assert(pid > 0); + + if (strv_isempty(pairs)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + STRV_FOREACH_PAIR(a, b, pairs) { + struct ether_addr mac_host, mac_container; + + r = generate_mac(machine_name, &mac_container, VETH_EXTRA_CONTAINER_HASH_KEY, idx); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for container side of extra veth link: %m"); + + r = generate_mac(machine_name, &mac_host, VETH_EXTRA_HOST_HASH_KEY, idx); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for host side of extra veth link: %m"); + + r = add_veth(rtnl, pid, *a, NULL, &mac_host, *b, &mac_container); + if (r < 0) + return r; + + idx++; + } + + return 0; +} + +static int join_bridge(sd_netlink *rtnl, const 
char *veth_name, const char *bridge_name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r, bridge_ifi; + + assert(rtnl); + assert(veth_name); + assert(bridge_name); + + bridge_ifi = rtnl_resolve_interface(&rtnl, bridge_name); + if (bridge_ifi < 0) + return bridge_ifi; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0); + if (r < 0) + return r; + + r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge_ifi); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return r; + + return bridge_ifi; +} + +static int create_bridge(sd_netlink *rtnl, const char *bridge_name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, bridge_name); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "bridge"); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return r; + + return 0; +} + +int setup_bridge(const char *veth_name, const char *bridge_name, bool create) { + _cleanup_(release_lock_file) LockFile bridge_lock = LOCK_FILE_INIT; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r, bridge_ifi; + unsigned n = 0; + + assert(veth_name); + assert(bridge_name); + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + if (create) { + /* We take a system-wide lock here, so that we can safely check whether 
there's still a member in the + * bridge before removing it, without risking interference from other nspawn instances. */ + + r = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &bridge_lock); + if (r < 0) + return log_error_errno(r, "Failed to take network zone lock: %m"); + } + + for (;;) { + bridge_ifi = join_bridge(rtnl, veth_name, bridge_name); + if (bridge_ifi >= 0) + return bridge_ifi; + if (bridge_ifi != -ENODEV || !create || n > 10) + return log_error_errno(bridge_ifi, "Failed to add interface %s to bridge %s: %m", veth_name, bridge_name); + + /* Count attempts, so that we don't enter an endless loop here. */ + n++; + + /* The bridge doesn't exist yet. Let's create it */ + r = create_bridge(rtnl, bridge_name); + if (r < 0) + return log_error_errno(r, "Failed to create bridge interface %s: %m", bridge_name); + + /* Try again, now that the bridge exists */ + } +} + +int remove_bridge(const char *bridge_name) { + _cleanup_(release_lock_file) LockFile bridge_lock = LOCK_FILE_INIT; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + const char *path; + int r; + + /* Removes the specified bridge, but only if it is currently empty */ + + if (isempty(bridge_name)) + return 0; + + r = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &bridge_lock); + if (r < 0) + return log_error_errno(r, "Failed to take network zone lock: %m"); + + path = strjoina("/sys/class/net/", bridge_name, "/brif"); + + r = dir_is_empty(path, /* ignore_hidden_or_backup= */ false); + if (r == -ENOENT) /* Already gone? 
*/ + return 0; + if (r < 0) + return log_error_errno(r, "Can't detect if bridge %s is empty: %m", bridge_name); + if (r == 0) /* Still populated, leave it around */ + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + return remove_one_link(rtnl, bridge_name); +} + +static int test_network_interface_initialized(const char *name) { + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + int r; + + if (!udev_available()) + return 0; + + /* udev should be around. */ + + r = sd_device_new_from_ifname(&d, name); + if (r < 0) + return log_error_errno(r, "Failed to get device %s: %m", name); + + r = sd_device_get_is_initialized(d); + if (r < 0) + return log_error_errno(r, "Failed to determine whether interface %s is initialized: %m", name); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Network interface %s is not initialized yet.", name); + + r = device_is_renaming(d); + if (r < 0) + return log_error_errno(r, "Failed to determine the interface %s is being renamed: %m", name); + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Interface %s is being renamed.", name); + + return 0; +} + +int test_network_interfaces_initialized(char **iface_pairs) { + int r; + STRV_FOREACH_PAIR(a, b, iface_pairs) { + r = test_network_interface_initialized(*a); + if (r < 0) + return r; + } + return 0; +} + +int move_network_interfaces(int netns_fd, char **iface_pairs) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r; + + if (strv_isempty(iface_pairs)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + STRV_FOREACH_PAIR(i, b, iface_pairs) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int ifi; + + ifi = rtnl_resolve_interface_or_warn(&rtnl, *i); + if (ifi < 0) + return ifi; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi); + if (r < 0) + return log_error_errno(r, 
"Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd); + if (r < 0) + return log_error_errno(r, "Failed to append namespace fd to netlink message: %m"); + + if (!streq(*b, *i)) { + r = sd_netlink_message_append_string(m, IFLA_IFNAME, *b); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + } + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i); + } + + return 0; +} + +int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + unsigned idx = 0; + int r; + + if (strv_isempty(iface_pairs)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + STRV_FOREACH_PAIR(i, b, iface_pairs) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + _cleanup_free_ char *n = NULL; + int shortened, ifi; + struct ether_addr mac; + + ifi = rtnl_resolve_interface_or_warn(&rtnl, *i); + if (ifi < 0) + return ifi; + + r = generate_mac(machine_name, &mac, MACVLAN_HASH_KEY, idx++); + if (r < 0) + return log_error_errno(r, "Failed to create MACVLAN MAC address: %m"); + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface index: %m"); + + n = strdup(*b); + if (!n) + return log_oom(); + + shortened = shorten_ifname(n); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = 
sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE); + if (r < 0) + return log_error_errno(r, "Failed to append macvlan mode: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new macvlan interfaces: %m"); + + if (shortened > 0) + (void) set_alternative_ifname(rtnl, n, *b); + } + + return 0; +} + +int setup_ipvlan(const char *machine_name, pid_t pid, char **iface_pairs) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r; + + if (strv_isempty(iface_pairs)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + STRV_FOREACH_PAIR(i, b, iface_pairs) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + _cleanup_free_ char *n = NULL; + int shortened, ifi ; + + ifi = rtnl_resolve_interface_or_warn(&rtnl, *i); + if (ifi < 0) + return ifi; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface index: %m"); + + n = strdup(*b); + if (!n) + return log_oom(); + + shortened = 
shorten_ifname(n); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2); + if (r < 0) + return log_error_errno(r, "Failed to add ipvlan mode: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new ipvlan interfaces: %m"); + + if (shortened > 0) + (void) set_alternative_ifname(rtnl, n, *b); + } + + return 0; +} + +int veth_extra_parse(char ***l, const char *p) { + _cleanup_free_ char *a = NULL, *b = NULL; + int r; + + r = extract_first_word(&p, &a, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0 || !ifname_valid(a)) + return -EINVAL; + + r = extract_first_word(&p, &b, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0 || !ifname_valid(b)) { + r = free_and_strdup(&b, a); + if (r < 0) + return r; + } + + if (p) + return -EINVAL; + + r = strv_push_pair(l, a, b); + if (r < 0) + return -ENOMEM; + + a = b = NULL; + return 0; +} + +int remove_veth_links(const char *primary, char **pairs) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r; + + /* In some cases the kernel might pin the 
veth links between host and container even after the namespace + * died. Hence, let's better remove them explicitly too. */ + + if (isempty(primary) && strv_isempty(pairs)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + remove_one_link(rtnl, primary); + + STRV_FOREACH_PAIR(a, b, pairs) + remove_one_link(rtnl, *a); + + return 0; +} + +static int network_iface_pair_parse(const char* iftype, char ***l, const char *p, const char* ifprefix) { + int r; + + for (;;) { + _cleanup_free_ char *word = NULL, *a = NULL, *b = NULL; + const char *interface; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to parse interface name: %m"); + if (r == 0) + break; + + interface = word; + r = extract_first_word(&interface, &a, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to extract first word in %s parameter: %m", iftype); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Short read while reading %s parameter: %m", iftype); + if (!ifname_valid(a)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s, interface name not valid: %s", iftype, a); + + /* Here, we only check the validity of the specified second name. If it is not specified, + * the copied or prefixed name should be already valid, except for its length. If it is too + * long, then it will be shortened later. 
*/ + if (!isempty(interface)) { + if (!ifname_valid(interface)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s, interface name not valid: %s", iftype, interface); + + b = strdup(interface); + } else if (ifprefix) + b = strjoin(ifprefix, a); + else + b = strdup(a); + if (!b) + return log_oom(); + + r = strv_consume_pair(l, TAKE_PTR(a), TAKE_PTR(b)); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int interface_pair_parse(char ***l, const char *p) { + return network_iface_pair_parse("Network interface", l, p, NULL); +} + +int macvlan_pair_parse(char ***l, const char *p) { + return network_iface_pair_parse("MACVLAN network interface", l, p, "mv-"); +} + +int ipvlan_pair_parse(char ***l, const char *p) { + return network_iface_pair_parse("IPVLAN network interface", l, p, "iv-"); +} diff --git a/src/nspawn/nspawn-network.h b/src/nspawn/nspawn-network.h new file mode 100644 index 0000000..a785f8e --- /dev/null +++ b/src/nspawn/nspawn-network.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "ether-addr-util.h" + +int test_network_interfaces_initialized(char **iface_pairs); + +int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], bool bridge, const struct ether_addr *provided_mac); +int setup_veth_extra(const char *machine_name, pid_t pid, char **pairs); + +int setup_bridge(const char *veth_name, const char *bridge_name, bool create); +int remove_bridge(const char *bridge_name); + +int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs); +int setup_ipvlan(const char *machine_name, pid_t pid, char **iface_pairs); + +int move_network_interfaces(int netns_fd, char **iface_pairs); + +int veth_extra_parse(char ***l, const char *p); + +int remove_veth_links(const char *primary, char **pairs); + +int interface_pair_parse(char ***l, const char *p); +int macvlan_pair_parse(char ***l, const char *p); +int ipvlan_pair_parse(char ***l, const char 
*p); diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c new file mode 100644 index 0000000..8f1ac7c --- /dev/null +++ b/src/nspawn/nspawn-oci.c @@ -0,0 +1,2197 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "bus-util.h" +#include "cap-list.h" +#include "cpu-set-util.h" +#include "device-util.h" +#include "devnum-util.h" +#include "env-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "json.h" +#include "missing_sched.h" +#include "nspawn-oci.h" +#include "path-util.h" +#include "rlimit-util.h" +#include "seccomp-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +/* TODO: + * OCI runtime tool implementation + * hooks + * + * Spec issues: + * + * How is RLIM_INFINITY supposed to be encoded? + * configured effective caps is bullshit, as execv() corrupts it anyway + * pipes bind mounted is *very* different from pipes newly created, comments regarding bind mount or not are bogus + * annotation values structured? or string? + * configurable file system namespace path, but then also root path? wtf? + * apply sysctl inside of the container? or outside? + * how is unlimited pids tasks limit to be encoded? + * what are the defaults for caps if not specified? + * what are the default uid/gid mappings if one is missing but the other set, or when user ns is on but no namespace configured + * the source field of "mounts" is really weird, as it cannot realistically be relative to the bundle, since we never know if that's what the fs wants + * spec contradicts itself on the mount "type" field, as the example uses "bind" as type, but it's not listed in /proc/filesystem, and is something made up by /bin/mount + * if type of mount is left out, what shall be assumed? "bind"? + * readonly mounts is entirely redundant? + * should escaping be applied when joining mount options with ","? 
+ * devices cgroup support is bogus, "allow" and "deny" on the kernel level is about adding/removing entries, not about access + * spec needs to say that "rwm" devices cgroup combination can't be the empty string + * cgroupsv1 crap: kernel, kernelTCP, swappiness, disableOOMKiller, swap, devices, leafWeight + * general: it shouldn't leak lower level abstractions this obviously + * unmanageable cgroups stuff: realtimeRuntime/realtimePeriod + * needs to say what happens when some option is not specified, i.e. which defaults apply + * no architecture? no personality? + * seccomp example and logic is simply broken: there's no constant "SCMP_ACT_ERRNO". + * spec should say what to do with unknown props + * /bin/mount regarding NFS and FUSE required? + * what does terminal=false mean? + * sysctl inside or outside? allow-listing? + * swapiness typo -> swappiness + * + * Unsupported: + * + * apparmorProfile + * selinuxLabel + mountLabel + * hugepageLimits + * network + * rdma + * intelRdt + * swappiness, disableOOMKiller, kernel, kernelTCP, leafWeight (because it's dead, cgroupsv2 can't do it and hence systemd neither) + * + * Non-slice cgroup paths + * Propagation that is not slave + shared + * more than one uid/gid mapping, mappings with a container base != 0, or non-matching uid/gid mappings + * device cgroups access = false items that are not catchall + * device cgroups matches where minor is specified, but major isn't. similar where major is specified but char/block is not. also, any match that only has a type set that has less than "rwm" set. also, any entry that has none of rwm set. 
+ * + */ + +static int oci_unexpected(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unexpected OCI element '%s' of type '%s'.", name, json_variant_type_to_string(json_variant_type(v))); +} + +static int oci_dispatch(JsonVariant *v, const JsonDispatch table[], JsonDispatchFlags flags, void *userdata) { + return json_dispatch_full(v, table, oci_unexpected, flags, userdata, /* reterr_bad_field= */ NULL); +} + +static int oci_unsupported(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Unsupported OCI element '%s' of type '%s'.", name, json_variant_type_to_string(json_variant_type(v))); +} + +static int oci_terminal(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + + /* If not specified, or set to true, we'll default to either an interactive or a read-only + * console. If specified as false, we'll forcibly move to "pipe" mode though. */ + s->console_mode = json_variant_boolean(v) ? 
_CONSOLE_MODE_INVALID : CONSOLE_PIPE; + return 0; +} + +static int oci_console_dimension(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + unsigned *u = ASSERT_PTR(userdata); + uint64_t k; + + k = json_variant_unsigned(variant); + if (k == 0) + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "Console size field '%s' is too small.", strna(name)); + if (k > USHRT_MAX) /* TIOCSWINSZ's struct winsize uses "unsigned short" for width and height */ + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "Console size field '%s' is too large.", strna(name)); + + *u = (unsigned) k; + return 0; +} + +static int oci_console_size(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + + static const JsonDispatch table[] = { + { "height", JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_height), JSON_MANDATORY }, + { "width", JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_width), JSON_MANDATORY }, + {} + }; + + return oci_dispatch(v, table, flags, s); +} + +static int oci_absolute_path(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + char **p = ASSERT_PTR(userdata); + const char *n; + + n = json_variant_string(v); + + if (!path_is_absolute(n)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path in JSON field '%s' is not absolute: %s", strna(name), n); + + return free_and_strdup_warn(p, n); +} + +static int oci_env(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + char ***l = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Environment array contains non-string."); + + assert_se(n = json_variant_string(e)); + + if (!env_assignment_is_valid(n)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + 
"Environment assignment not valid: %s", n); + + r = strv_extend(l, n); + if (r < 0) + return log_oom(); + } + + return 0; +} + +static int oci_args(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + char ***value = ASSERT_PTR(userdata); + int r; + + r = json_variant_strv(v, &l); + if (r < 0) + return json_log(v, flags, r, "Cannot parse arguments as list of strings: %m"); + + if (strv_isempty(l)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Argument list empty, refusing."); + + if (isempty(l[0])) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Executable name is empty, refusing."); + + return strv_free_and_replace(*value, l); +} + +static int oci_rlimit_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + const char *z; + int *type = ASSERT_PTR(userdata); + int t; + + z = startswith(json_variant_string(v), "RLIMIT_"); + if (!z) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimit entry's name does not begin with 'RLIMIT_', refusing: %s", + json_variant_string(v)); + + t = rlimit_from_string(z); + if (t < 0) + return json_log(v, flags, t, + "rlimit name unknown: %s", json_variant_string(v)); + + *type = t; + return 0; +} + +static int oci_rlimit_value(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + rlim_t *value = ASSERT_PTR(userdata); + rlim_t z; + + if (json_variant_is_negative(v)) + z = RLIM_INFINITY; + else { + if (!json_variant_is_unsigned(v)) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "rlimits limit not unsigned, refusing."); + + z = (rlim_t) json_variant_unsigned(v); + + if ((uint64_t) z != json_variant_unsigned(v)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimits limit out of range, refusing."); + } + + *value = z; + return 0; +} + +static int oci_rlimits(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + 
JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + struct rlimit_data { + int type; + rlim_t soft; + rlim_t hard; + } data = { + .type = -1, + .soft = RLIM_INFINITY, + .hard = RLIM_INFINITY, + }; + + static const JsonDispatch table[] = { + { "soft", JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, soft), JSON_MANDATORY }, + { "hard", JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, hard), JSON_MANDATORY }, + { "type", JSON_VARIANT_STRING, oci_rlimit_type, offsetof(struct rlimit_data, type), JSON_MANDATORY }, + {} + }; + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + assert(data.type >= 0); + assert(data.type < _RLIMIT_MAX); + + if (s->rlimit[data.type]) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimits array contains duplicate entry, refusing."); + + s->rlimit[data.type] = new(struct rlimit, 1); + if (!s->rlimit[data.type]) + return log_oom(); + + *s->rlimit[data.type] = (struct rlimit) { + .rlim_cur = data.soft, + .rlim_max = data.hard, + }; + + } + return 0; +} + +static int oci_capability_array(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *mask = ASSERT_PTR(userdata); + uint64_t m = 0; + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + int cap; + + if (!json_variant_is_string(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Entry in capabilities array is not a string."); + + assert_se(n = json_variant_string(e)); + + cap = capability_from_name(n); + if (cap < 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown capability: %s", n); + + m |= UINT64_C(1) << cap; + } + + if (*mask == UINT64_MAX) + *mask = m; + else + *mask |= m; + + return 0; +} + +static int oci_capabilities(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "effective", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, 
effective) }, + { "bounding", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, bounding) }, + { "inheritable", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, inheritable) }, + { "permitted", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, permitted) }, + { "ambient", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, ambient) }, + {} + }; + + Settings *s = ASSERT_PTR(userdata); + int r; + + r = oci_dispatch(v, table, flags, &s->full_capabilities); + if (r < 0) + return r; + + if (s->full_capabilities.bounding != UINT64_MAX) { + s->capability = s->full_capabilities.bounding; + s->drop_capability = ~s->full_capabilities.bounding; + } + + return 0; +} + +static int oci_oom_score_adj(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + int64_t k; + + k = json_variant_integer(v); + if (k < OOM_SCORE_ADJ_MIN || k > OOM_SCORE_ADJ_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "oomScoreAdj value out of range: %" PRIi64, k); + + s->oom_score_adjust = (int) k; + s->oom_score_adjust_set = true; + + return 0; +} + +static int oci_uid_gid(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uid_t *uid = ASSERT_PTR(userdata); + uid_t u; + uint64_t k; + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + k = json_variant_unsigned(v); + u = (uid_t) k; + if ((uint64_t) u != k) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "UID/GID out of range: %" PRIu64, k); + + if (!uid_is_valid(u)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "UID/GID is not valid: " UID_FMT, u); + + *uid = u; + return 0; +} + +static int oci_supplementary_gids(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + gid_t gid, *a; + + if (!json_variant_is_unsigned(e)) + return 
json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Supplementary GID entry is not a UID."); + + r = oci_uid_gid(name, e, flags, &gid); + if (r < 0) + return r; + + a = reallocarray(s->supplementary_gids, s->n_supplementary_gids + 1, sizeof(gid_t)); + if (!a) + return log_oom(); + + s->supplementary_gids = a; + s->supplementary_gids[s->n_supplementary_gids++] = gid; + } + + return 0; +} + +static int oci_user(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "uid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(Settings, uid), JSON_MANDATORY }, + { "gid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(Settings, gid), JSON_MANDATORY }, + { "additionalGids", JSON_VARIANT_ARRAY, oci_supplementary_gids, 0, 0 }, + {} + }; + + return oci_dispatch(v, table, flags, userdata); +} + +static int oci_process(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "terminal", JSON_VARIANT_BOOLEAN, oci_terminal, 0, 0 }, + { "consoleSize", JSON_VARIANT_OBJECT, oci_console_size, 0, 0 }, + { "cwd", JSON_VARIANT_STRING, oci_absolute_path, offsetof(Settings, working_directory), 0 }, + { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(Settings, environment), 0 }, + { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(Settings, parameters), 0 }, + { "rlimits", JSON_VARIANT_ARRAY, oci_rlimits, 0, 0 }, + { "apparmorProfile", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + { "capabilities", JSON_VARIANT_OBJECT, oci_capabilities, 0, 0 }, + { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, no_new_privileges), 0 }, + { "oomScoreAdj", JSON_VARIANT_INTEGER, oci_oom_score_adj, 0, 0 }, + { "selinuxLabel", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + { "user", JSON_VARIANT_OBJECT, oci_user, 0, 0 }, + {} + }; + + return oci_dispatch(v, table, flags, userdata); +} + +static int oci_root(const char *name, 
JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + int r; + + static const JsonDispatch table[] = { + { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Settings, root) }, + { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, read_only) }, + {} + }; + + r = oci_dispatch(v, table, flags, s); + if (r < 0) + return r; + + if (s->root && !path_is_absolute(s->root)) { + char *joined; + + joined = path_join(s->bundle, s->root); + if (!joined) + return log_oom(); + + free_and_replace(s->root, joined); + } + + return 0; +} + +static int oci_hostname(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + const char *n; + + assert_se(n = json_variant_string(v)); + + if (!hostname_is_valid(n, 0)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Hostname string is not a valid hostname: %s", n); + + return free_and_strdup_warn(&s->hostname, n); +} + +static bool oci_exclude_mount(const char *path) { + + /* Returns "true" for all mounts we insist to mount on our own, and hence ignore the OCI data. 
*/ + + if (PATH_IN_SET(path, + "/dev", + "/dev/mqueue", + "/dev/pts", + "/dev/shm", + "/proc", + "/proc/acpi", + "/proc/apm", + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/kallsyms", + "/proc/kcore", + "/proc/keys", + "/proc/scsi", + "/proc/sys", + "/proc/sys/net", + "/proc/sysrq-trigger", + "/proc/timer_list", + "/run", + "/sys", + "/sys", + "/sys/fs/selinux", + "/tmp")) + return true; + + /* Similar, skip the whole /sys/fs/cgroups subtree */ + if (path_startswith(path, "/sys/fs/cgroup")) + return true; + + return false; +} + +typedef struct oci_mount_data { + char *destination; + char *source; + char *type; + char **options; +} oci_mount_data; + +static void oci_mount_data_done(oci_mount_data *data) { + assert(data); + + free(data->destination); + free(data->source); + free(data->type); + strv_free(data->options); +} + +static int oci_mounts(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const JsonDispatch table[] = { + { "destination", JSON_VARIANT_STRING, oci_absolute_path, offsetof(oci_mount_data, destination), JSON_MANDATORY }, + { "source", JSON_VARIANT_STRING, json_dispatch_string, offsetof(oci_mount_data, source), 0 }, + { "options", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(oci_mount_data, options), 0, }, + { "type", JSON_VARIANT_STRING, json_dispatch_string, offsetof(oci_mount_data, type), 0 }, + {} + }; + + _cleanup_free_ char *joined_options = NULL; + _cleanup_(oci_mount_data_done) oci_mount_data data = {}; + CustomMount *m; + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + if (!path_is_absolute(data.destination)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Mount destination not an absolute path: %s", data.destination); + + if (oci_exclude_mount(data.destination)) + continue; + + if (data.options) { + joined_options = strv_join(data.options, 
","); + if (!joined_options) + return log_oom(); + } + + if (!data.type || streq(data.type, "bind")) { + if (data.source && !path_is_absolute(data.source)) { + char *joined; + + joined = path_join(s->bundle, data.source); + if (!joined) + return log_oom(); + + free_and_replace(data.source, joined); + } + + data.type = mfree(data.type); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND); + } else + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_ARBITRARY); + if (!m) + return log_oom(); + + m->destination = TAKE_PTR(data.destination); + m->source = TAKE_PTR(data.source); + m->options = TAKE_PTR(joined_options); + m->type_argument = TAKE_PTR(data.type); + } + + return 0; +} + +static int oci_namespace_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + unsigned long *nsflags = ASSERT_PTR(userdata); + const char *n; + + assert_se(n = json_variant_string(v)); + + /* We don't use namespace_flags_from_string() here, as the OCI spec uses slightly different names than the + * kernel here. 
*/ + if (streq(n, "pid")) + *nsflags = CLONE_NEWPID; + else if (streq(n, "network")) + *nsflags = CLONE_NEWNET; + else if (streq(n, "mount")) + *nsflags = CLONE_NEWNS; + else if (streq(n, "ipc")) + *nsflags = CLONE_NEWIPC; + else if (streq(n, "uts")) + *nsflags = CLONE_NEWUTS; + else if (streq(n, "user")) + *nsflags = CLONE_NEWUSER; + else if (streq(n, "cgroup")) + *nsflags = CLONE_NEWCGROUP; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown namespace type, refusing: %s", n); + + return 0; +} + +struct namespace_data { + unsigned long type; + char *path; +}; + +static void namespace_data_done(struct namespace_data *data) { + assert(data); + + free(data->path); +} + +static int oci_namespaces(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + unsigned long n = 0; + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_(namespace_data_done) struct namespace_data data = {}; + + static const JsonDispatch table[] = { + { "type", JSON_VARIANT_STRING, oci_namespace_type, offsetof(struct namespace_data, type), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(struct namespace_data, path), 0 }, + {} + }; + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + if (data.path) { + if (data.type != CLONE_NEWNET) + return json_log(e, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Specifying namespace path for non-network namespace is not supported."); + + if (s->network_namespace_path) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Network namespace path specified more than once, refusing."); + + free_and_replace(s->network_namespace_path, data.path); + } + + if (FLAGS_SET(n, data.type)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Duplicate namespace specification, refusing."); + + n |= data.type; + } + + if (!FLAGS_SET(n, CLONE_NEWNS)) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Containers 
without a mount namespace aren't supported."); + + s->private_network = FLAGS_SET(n, CLONE_NEWNET); + s->userns_mode = FLAGS_SET(n, CLONE_NEWUSER) ? USER_NAMESPACE_FIXED : USER_NAMESPACE_NO; + s->use_cgns = FLAGS_SET(n, CLONE_NEWCGROUP); + + s->clone_ns_flags = n & (CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS); + + return 0; +} + +static int oci_uid_gid_range(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uid_t *uid = ASSERT_PTR(userdata); + uid_t u; + uint64_t k; + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + /* This is very much like oci_uid_gid(), except the checks are a bit different, as this is a UID range rather + * than a specific UID, and hence UID_INVALID has no special significance. OTOH a range of zero makes no + * sense. */ + + k = json_variant_unsigned(v); + u = (uid_t) k; + if ((uint64_t) u != k) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "UID/GID out of range: %" PRIu64, k); + if (u == 0) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "UID/GID range can't be zero."); + + *uid = u; + return 0; +} + +static int oci_uid_gid_mappings(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct mapping_data { + uid_t host_id; + uid_t container_id; + uid_t range; + } data = { + .host_id = UID_INVALID, + .container_id = UID_INVALID, + .range = 0, + }; + + static const JsonDispatch table[] = { + { "containerID", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(struct mapping_data, container_id), JSON_MANDATORY }, + { "hostID", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(struct mapping_data, host_id), JSON_MANDATORY }, + { "size", JSON_VARIANT_UNSIGNED, oci_uid_gid_range, offsetof(struct mapping_data, range), JSON_MANDATORY }, + {} + }; + + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + if (json_variant_elements(v) == 0) + return 0; + + if (json_variant_elements(v) > 1) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "UID/GID mappings with more than one 
entry are not supported."); + + assert_se(e = json_variant_by_index(v, 0)); + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + if (data.host_id + data.range < data.host_id || + data.container_id + data.range < data.container_id) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "UID/GID range goes beyond UID/GID validity range, refusing."); + + if (data.container_id != 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "UID/GID mappings with a non-zero container base are not supported."); + + if (data.range < 0x10000) + json_log(v, flags|JSON_WARNING, 0, + "UID/GID mapping with less than 65536 UID/GIDS set up, you are looking for trouble."); + + if (s->uid_range != UID_INVALID && + (s->uid_shift != data.host_id || s->uid_range != data.range)) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Non-matching UID and GID mappings are not supported."); + + s->uid_shift = data.host_id; + s->uid_range = data.range; + + return 0; +} + +static int oci_device_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = ASSERT_PTR(userdata); + const char *t; + + assert_se(t = json_variant_string(v)); + + if (STR_IN_SET(t, "c", "u")) + *mode = (*mode & ~S_IFMT) | S_IFCHR; + else if (streq(t, "b")) + *mode = (*mode & ~S_IFMT) | S_IFBLK; + else if (streq(t, "p")) + *mode = (*mode & ~S_IFMT) | S_IFIFO; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown device type: %s", t); + + return 0; +} + +static int oci_device_major(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + unsigned *u = ASSERT_PTR(userdata); + uint64_t k; + + k = json_variant_unsigned(v); + if (!DEVICE_MAJOR_VALID(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Device major %" PRIu64 " out of range.", k); + + *u = (unsigned) k; + return 0; +} + +static int oci_device_minor(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + unsigned *u = 
ASSERT_PTR(userdata); + uint64_t k; + + k = json_variant_unsigned(v); + if (!DEVICE_MINOR_VALID(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Device minor %" PRIu64 " out of range.", k); + + *u = (unsigned) k; + return 0; +} + +static int oci_device_file_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = ASSERT_PTR(userdata); + mode_t m; + uint64_t k; + + k = json_variant_unsigned(v); + m = (mode_t) k; + + if ((m & ~07777) != 0 || (uint64_t) m != k) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "fileMode out of range, refusing."); + + *mode = (*mode & ~07777) | m; + return 0; +} + +static int oci_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + static const JsonDispatch table[] = { + { "type", JSON_VARIANT_STRING, oci_device_type, offsetof(DeviceNode, mode), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(DeviceNode, path), JSON_MANDATORY }, + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(DeviceNode, major), 0 }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(DeviceNode, minor), 0 }, + { "fileMode", JSON_VARIANT_UNSIGNED, oci_device_file_mode, offsetof(DeviceNode, mode), 0 }, + { "uid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(DeviceNode, uid), 0 }, + { "gid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(DeviceNode, gid), 0 }, + {} + }; + + DeviceNode *node, *nodes; + + nodes = reallocarray(s->extra_nodes, s->n_extra_nodes + 1, sizeof(DeviceNode)); + if (!nodes) + return log_oom(); + + s->extra_nodes = nodes; + + node = nodes + s->n_extra_nodes; + *node = (DeviceNode) { + .uid = UID_INVALID, + .gid = GID_INVALID, + .major = UINT_MAX, + .minor = UINT_MAX, + .mode = 0644, + }; + + r = oci_dispatch(e, table, flags, node); + if (r < 0) + goto fail_element; + + if (S_ISCHR(node->mode) || 
S_ISBLK(node->mode)) { + _cleanup_free_ char *path = NULL; + + if (node->major == UINT_MAX || node->minor == UINT_MAX) { + r = json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Major/minor required when device node is device node"); + goto fail_element; + } + + /* Suppress a couple of implicit device nodes */ + r = devname_from_devnum(node->mode, makedev(node->major, node->minor), &path); + if (r < 0) + json_log(e, flags|JSON_DEBUG, r, "Failed to resolve device node %u:%u, ignoring: %m", node->major, node->minor); + else { + if (PATH_IN_SET(path, + "/dev/null", + "/dev/zero", + "/dev/full", + "/dev/random", + "/dev/urandom", + "/dev/tty", + "/dev/net/tun", + "/dev/ptmx", + "/dev/pts/ptmx", + "/dev/console")) { + + json_log(e, flags|JSON_DEBUG, 0, "Ignoring devices item for device '%s', as it is implicitly created anyway.", path); + free(node->path); + continue; + } + } + } + + s->n_extra_nodes++; + continue; + + fail_element: + free(node->path); + return r; + } + + return 0; +} + +static int oci_cgroups_path(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + _cleanup_free_ char *slice = NULL, *backwards = NULL; + Settings *s = ASSERT_PTR(userdata); + const char *p; + int r; + + assert_se(p = json_variant_string(v)); + + r = cg_path_get_slice(p, &slice); + if (r < 0) + return json_log(v, flags, r, "Couldn't derive slice unit name from path '%s': %m", p); + + r = cg_slice_to_path(slice, &backwards); + if (r < 0) + return json_log(v, flags, r, "Couldn't convert slice unit name '%s' back to path: %m", slice); + + if (!path_equal(backwards, p)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Control group path '%s' does not refer to slice unit, refusing.", p); + + free_and_replace(s->slice, slice); + return 0; +} + +static int oci_cgroup_device_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = ASSERT_PTR(userdata); + const char *n; + + assert_se(n = json_variant_string(v)); + + if 
(streq(n, "c")) + *mode = S_IFCHR; + else if (streq(n, "b")) + *mode = S_IFBLK; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Control group device type unknown: %s", n); + + return 0; +} + +struct device_data { + bool allow; + bool r; + bool w; + bool m; + mode_t type; + unsigned major; + unsigned minor; +}; + +static int oci_cgroup_device_access(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct device_data *d = ASSERT_PTR(userdata); + bool r = false, w = false, m = false; + const char *s; + size_t i; + + assert_se(s = json_variant_string(v)); + + for (i = 0; s[i]; i++) + if (s[i] == 'r') + r = true; + else if (s[i] == 'w') + w = true; + else if (s[i] == 'm') + m = true; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown device access character '%c'.", s[i]); + + d->r = r; + d->w = w; + d->m = m; + + return 0; +} + +static int oci_cgroup_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + _cleanup_free_ struct device_data *list = NULL; + Settings *s = ASSERT_PTR(userdata); + size_t n_list = 0, i; + bool noop = false; + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + struct device_data data = { + .major = UINT_MAX, + .minor = UINT_MAX, + }, *a; + + static const JsonDispatch table[] = { + { "allow", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct device_data, allow), JSON_MANDATORY }, + { "type", JSON_VARIANT_STRING, oci_cgroup_device_type, offsetof(struct device_data, type), 0 }, + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), 0 }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), 0 }, + { "access", JSON_VARIANT_STRING, oci_cgroup_device_access, 0, 0 }, + {} + }; + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + if (!data.allow) { + /* The fact that OCI allows 'deny' entries makes really no sense, as 'allow' + * vs. 
'deny' for the devices cgroup controller is really not about allow-listing and + * deny-listing but about adding and removing entries from the allow list. Since we + * always start out with an empty allow list we hence ignore the whole thing, as + * removing entries which don't exist make no sense. We'll log about this, since this + * is really borked in the spec, with one exception: the entry that's supposed to + * drop the kernel's default we ignore silently */ + + if (!data.r || !data.w || !data.m || data.type != 0 || data.major != UINT_MAX || data.minor != UINT_MAX) + json_log(v, flags|JSON_WARNING, 0, "Devices cgroup allow list with arbitrary 'allow' entries not supported, ignoring."); + + /* We ignore the 'deny' entry as for us that's implied */ + continue; + } + + if (!data.r && !data.w && !data.m) { + json_log(v, flags|LOG_WARNING, 0, "Device cgroup allow list entry with no effect found, ignoring."); + continue; + } + + if (data.minor != UINT_MAX && data.major == UINT_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Device cgroup allow list entries with minors but no majors not supported."); + + if (data.major != UINT_MAX && data.type == 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Device cgroup allow list entries with majors but no device node type not supported."); + + if (data.type == 0) { + if (data.r && data.w && data.m) /* a catchall allow list entry means we are looking at a noop */ + noop = true; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Device cgroup allow list entries with no type not supported."); + } + + a = reallocarray(list, n_list + 1, sizeof(struct device_data)); + if (!a) + return log_oom(); + + list = a; + list[n_list++] = data; + } + + if (noop) + return 0; + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_open_container(s->properties, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(s->properties, "s", 
"DeviceAllow"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(s->properties, 'v', "a(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(s->properties, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + for (i = 0; i < n_list; i++) { + _cleanup_free_ char *pattern = NULL; + char access[4]; + size_t n = 0; + + if (list[i].minor == UINT_MAX) { + const char *t; + + if (list[i].type == S_IFBLK) + t = "block"; + else { + assert(list[i].type == S_IFCHR); + t = "char"; + } + + if (list[i].major == UINT_MAX) { + pattern = strjoin(t, "-*"); + if (!pattern) + return log_oom(); + } else { + if (asprintf(&pattern, "%s-%u", t, list[i].major) < 0) + return log_oom(); + } + + } else { + assert(list[i].major != UINT_MAX); /* If a minor is specified, then a major also needs to be specified */ + + r = device_path_make_major_minor(list[i].type, makedev(list[i].major, list[i].minor), &pattern); + if (r < 0) + return log_oom(); + } + + if (list[i].r) + access[n++] = 'r'; + if (list[i].w) + access[n++] = 'w'; + if (list[i].m) + access[n++] = 'm'; + access[n] = 0; + + assert(n > 0); + + r = sd_bus_message_append(s->properties, "(ss)", pattern, access); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_cgroup_memory_limit(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *m = ASSERT_PTR(userdata); + uint64_t k; + + if (json_variant_is_negative(v)) { + *m = UINT64_MAX; + return 0; + } + + if (!json_variant_is_unsigned(v)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Memory limit is not an unsigned integer"); + + k 
= json_variant_unsigned(v); + if (k >= UINT64_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Memory limit too large: %" PRIu64, k); + + *m = (uint64_t) k; + return 0; +} + +static int oci_cgroup_memory(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + struct memory_data { + uint64_t limit; + uint64_t reservation; + uint64_t swap; + } data = { + .limit = UINT64_MAX, + .reservation = UINT64_MAX, + .swap = UINT64_MAX, + }; + + static const JsonDispatch table[] = { + { "limit", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, limit), 0 }, + { "reservation", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, reservation), 0 }, + { "swap", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, swap), 0 }, + { "kernel", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + { "kernelTCP", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + { "swapiness", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + { "disableOOMKiller", JSON_VARIANT_BOOLEAN, oci_unsupported, 0, JSON_PERMISSIVE }, + {} + }; + + Settings *s = ASSERT_PTR(userdata); + int r; + + r = oci_dispatch(v, table, flags, &data); + if (r < 0) + return r; + + if (data.swap != UINT64_MAX) { + if (data.limit == UINT64_MAX) + json_log(v, flags|LOG_WARNING, 0, "swap limit without memory limit is not supported, ignoring."); + else if (data.swap < data.limit) + json_log(v, flags|LOG_WARNING, 0, "swap limit is below memory limit, ignoring."); + else { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "MemorySwapMax", "t", data.swap - data.limit); + if (r < 0) + return bus_log_create_error(r); + } + } + + if (data.limit != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "MemoryMax", "t", data.limit); + if (r < 0) + return 
bus_log_create_error(r); + } + + if (data.reservation != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "MemoryLow", "t", data.reservation); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +struct cpu_data { + uint64_t shares; + uint64_t quota; + uint64_t period; + CPUSet cpu_set; +}; + +static int oci_cgroup_cpu_shares(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *u = ASSERT_PTR(userdata); + uint64_t k; + + k = json_variant_unsigned(v); + if (k < CGROUP_CPU_SHARES_MIN || k > CGROUP_CPU_SHARES_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "shares value out of range."); + + *u = (uint64_t) k; + return 0; +} + +static int oci_cgroup_cpu_quota(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *u = ASSERT_PTR(userdata); + uint64_t k; + + k = json_variant_unsigned(v); + if (k <= 0 || k >= UINT64_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "period/quota value out of range."); + + *u = (uint64_t) k; + return 0; +} + +static int oci_cgroup_cpu_cpus(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct cpu_data *data = ASSERT_PTR(userdata); + CPUSet set; + const char *n; + int r; + + assert_se(n = json_variant_string(v)); + + r = parse_cpu_set(n, &set); + if (r < 0) + return json_log(v, flags, r, "Failed to parse CPU set specification: %s", n); + + cpu_set_reset(&data->cpu_set); + data->cpu_set = set; + + return 0; +} + +static int oci_cgroup_cpu(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "shares", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_shares, offsetof(struct cpu_data, shares), 0 }, + { "quota", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, quota), 0 }, + { "period", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct 
cpu_data, period), 0 }, + { "realtimeRuntime", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 }, + { "realtimePeriod", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 }, + { "cpus", JSON_VARIANT_STRING, oci_cgroup_cpu_cpus, 0, 0 }, + { "mems", JSON_VARIANT_STRING, oci_unsupported, 0, 0 }, + {} + }; + + struct cpu_data data = { + .shares = UINT64_MAX, + .quota = UINT64_MAX, + .period = UINT64_MAX, + }; + + Settings *s = ASSERT_PTR(userdata); + int r; + + r = oci_dispatch(v, table, flags, &data); + if (r < 0) { + cpu_set_reset(&data.cpu_set); + return r; + } + + cpu_set_reset(&s->cpu_set); + s->cpu_set = data.cpu_set; + + if (data.shares != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "CPUShares", "t", data.shares); + if (r < 0) + return bus_log_create_error(r); + } + + if (data.quota != UINT64_MAX && data.period != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "CPUQuotaPerSecUSec", "t", (uint64_t) (data.quota * USEC_PER_SEC / data.period)); + if (r < 0) + return bus_log_create_error(r); + + } else if ((data.quota != UINT64_MAX) != (data.period != UINT64_MAX)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "CPU quota and period not used together."); + + return 0; +} + +static int oci_cgroup_block_io_weight(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + uint64_t k; + int r; + + k = json_variant_unsigned(v); + if (k < CGROUP_BLKIO_WEIGHT_MIN || k > CGROUP_BLKIO_WEIGHT_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Block I/O weight out of range."); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "BlockIOWeight", "t", (uint64_t) k); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_cgroup_block_io_weight_device(const 
char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + struct device_data { + unsigned major; + unsigned minor; + uint64_t weight; + } data = { + .major = UINT_MAX, + .minor = UINT_MAX, + .weight = UINT64_MAX, + }; + + static const JsonDispatch table[] = { + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), JSON_MANDATORY }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), JSON_MANDATORY }, + { "weight", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct device_data, weight), 0 }, + { "leafWeight", JSON_VARIANT_INTEGER, oci_unsupported, 0, JSON_PERMISSIVE }, + {} + }; + + _cleanup_free_ char *path = NULL; + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + if (data.weight == UINT64_MAX) + continue; + + if (data.weight < CGROUP_BLKIO_WEIGHT_MIN || data.weight > CGROUP_BLKIO_WEIGHT_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Block I/O device weight out of range."); + + r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path); + if (r < 0) + return json_log(v, flags, r, "Failed to build device path: %m"); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "BlockIODeviceWeight", "a(st)", 1, path, (uint64_t) data.weight); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int oci_cgroup_block_io_throttle(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + const char *pname; + JsonVariant *e; + int r; + + pname = streq(name, "throttleReadBpsDevice") ? "IOReadBandwidthMax" : + streq(name, "throttleWriteBpsDevice") ? "IOWriteBandwidthMax" : + streq(name, "throttleReadIOPSDevice") ? 
"IOReadIOPSMax" : + "IOWriteIOPSMax"; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + struct device_data { + unsigned major; + unsigned minor; + uint64_t rate; + } data = { + .major = UINT_MAX, + .minor = UINT_MAX, + }; + + static const JsonDispatch table[] = { + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), JSON_MANDATORY }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), JSON_MANDATORY }, + { "rate", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct device_data, rate), JSON_MANDATORY }, + {} + }; + + _cleanup_free_ char *path = NULL; + + r = oci_dispatch(e, table, flags, &data); + if (r < 0) + return r; + + if (data.rate >= UINT64_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Block I/O device rate out of range."); + + r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path); + if (r < 0) + return json_log(v, flags, r, "Failed to build device path: %m"); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", pname, "a(st)", 1, path, (uint64_t) data.rate); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int oci_cgroup_block_io(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "weight", JSON_VARIANT_UNSIGNED, oci_cgroup_block_io_weight, 0, 0 }, + { "leafWeight", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, JSON_PERMISSIVE }, + { "weightDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_weight_device, 0, 0 }, + { "throttleReadBpsDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleWriteBpsDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleReadIOPSDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleWriteIOPSDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + {} + }; + + return 
oci_dispatch(v, table, flags, userdata); +} + +static int oci_cgroup_pids(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "limit", JSON_VARIANT_NUMBER, json_dispatch_variant, 0, JSON_MANDATORY }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *k = NULL; + Settings *s = ASSERT_PTR(userdata); + uint64_t m; + int r; + + r = oci_dispatch(v, table, flags, &k); + if (r < 0) + return r; + + if (json_variant_is_negative(k)) + m = UINT64_MAX; + else { + if (!json_variant_is_unsigned(k)) + return json_log(k, flags, SYNTHETIC_ERRNO(EINVAL), + "pids limit not unsigned integer, refusing."); + + m = (uint64_t) json_variant_unsigned(k); + + if ((uint64_t) m != json_variant_unsigned(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "pids limit out of range, refusing."); + } + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "TasksMax", "t", m); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_resources(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "devices", JSON_VARIANT_ARRAY, oci_cgroup_devices, 0, 0 }, + { "memory", JSON_VARIANT_OBJECT, oci_cgroup_memory, 0, 0 }, + { "cpu", JSON_VARIANT_OBJECT, oci_cgroup_cpu, 0, 0 }, + { "blockIO", JSON_VARIANT_OBJECT, oci_cgroup_block_io, 0, 0 }, + { "hugepageLimits", JSON_VARIANT_ARRAY, oci_unsupported, 0, 0 }, + { "network", JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 }, + { "pids", JSON_VARIANT_OBJECT, oci_cgroup_pids, 0, 0 }, + { "rdma", JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 }, + {} + }; + + return oci_dispatch(v, table, flags, userdata); +} + +static bool sysctl_key_valid(const char *s) { + bool dot = true; + + /* Note that we are a bit stricter here than in systemd-sysctl, as that inherited semantics from the old sysctl + * tool, which were really weird (as it swaps 
/ and . in both ways) */ + + if (isempty(s)) + return false; + + for (; *s; s++) { + + if (*s <= ' ' || *s >= 127) + return false; + if (*s == '/') + return false; + if (*s == '.') { + + if (dot) /* Don't allow two dots next to each other (or at the beginning) */ + return false; + + dot = true; + } else + dot = false; + } + + if (dot) /* don't allow a dot at the end */ + return false; + + return true; +} + +static int oci_sysctl(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *w; + const char *k; + int r; + + JSON_VARIANT_OBJECT_FOREACH(k, w, v) { + const char *m; + + if (!json_variant_is_string(w)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "sysctl parameter is not a string, refusing."); + + assert_se(m = json_variant_string(w)); + + if (!sysctl_key_valid(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "sysctl key invalid, refusing: %s", k); + + r = strv_extend_strv(&s->sysctl, STRV_MAKE(k, m), false); + if (r < 0) + return log_oom(); + } + + return 0; +} + +#if HAVE_SECCOMP +static int oci_seccomp_action_from_string(const char *name, uint32_t *ret) { + + static const struct { + const char *name; + uint32_t action; + } table[] = { + { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW }, + { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */ + { "SCMP_ACT_KILL", SCMP_ACT_KILL }, +#ifdef SCMP_ACT_KILL_PROCESS + { "SCMP_ACT_KILL_PROCESS", SCMP_ACT_KILL_PROCESS }, +#endif +#ifdef SCMP_ACT_KILL_THREAD + { "SCMP_ACT_KILL_THREAD", SCMP_ACT_KILL_THREAD }, +#endif +#ifdef SCMP_ACT_LOG + { "SCMP_ACT_LOG", SCMP_ACT_LOG }, +#endif + { "SCMP_ACT_TRAP", SCMP_ACT_TRAP }, + + /* We don't support SCMP_ACT_TRACE because that requires a tracer, and that doesn't really make sense + * here */ + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(name, table[i].name)) { + *ret = 
table[i].action; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_arch_from_string(const char *name, uint32_t *ret) { + + static const struct { + const char *name; + uint32_t arch; + } table[] = { + { "SCMP_ARCH_AARCH64", SCMP_ARCH_AARCH64 }, + { "SCMP_ARCH_ARM", SCMP_ARCH_ARM }, +#ifdef SCMP_ARCH_LOONGARCH64 + { "SCMP_ARCH_LOONGARCH64", SCMP_ARCH_LOONGARCH64 }, +#endif + { "SCMP_ARCH_MIPS", SCMP_ARCH_MIPS }, + { "SCMP_ARCH_MIPS64", SCMP_ARCH_MIPS64 }, + { "SCMP_ARCH_MIPS64N32", SCMP_ARCH_MIPS64N32 }, + { "SCMP_ARCH_MIPSEL", SCMP_ARCH_MIPSEL }, + { "SCMP_ARCH_MIPSEL64", SCMP_ARCH_MIPSEL64 }, + { "SCMP_ARCH_MIPSEL64N32", SCMP_ARCH_MIPSEL64N32 }, + { "SCMP_ARCH_NATIVE", SCMP_ARCH_NATIVE }, +#ifdef SCMP_ARCH_PARISC + { "SCMP_ARCH_PARISC", SCMP_ARCH_PARISC }, +#endif +#ifdef SCMP_ARCH_PARISC64 + { "SCMP_ARCH_PARISC64", SCMP_ARCH_PARISC64 }, +#endif + { "SCMP_ARCH_PPC", SCMP_ARCH_PPC }, + { "SCMP_ARCH_PPC64", SCMP_ARCH_PPC64 }, + { "SCMP_ARCH_PPC64LE", SCMP_ARCH_PPC64LE }, +#ifdef SCMP_ARCH_RISCV64 + { "SCMP_ARCH_RISCV64", SCMP_ARCH_RISCV64 }, +#endif + { "SCMP_ARCH_S390", SCMP_ARCH_S390 }, + { "SCMP_ARCH_S390X", SCMP_ARCH_S390X }, + { "SCMP_ARCH_X32", SCMP_ARCH_X32 }, + { "SCMP_ARCH_X86", SCMP_ARCH_X86 }, + { "SCMP_ARCH_X86_64", SCMP_ARCH_X86_64 }, + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(table[i].name, name)) { + *ret = table[i].arch; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_compare_from_string(const char *name, enum scmp_compare *ret) { + + static const struct { + const char *name; + enum scmp_compare op; + } table[] = { + { "SCMP_CMP_NE", SCMP_CMP_NE }, + { "SCMP_CMP_LT", SCMP_CMP_LT }, + { "SCMP_CMP_LE", SCMP_CMP_LE }, + { "SCMP_CMP_EQ", SCMP_CMP_EQ }, + { "SCMP_CMP_GE", SCMP_CMP_GE }, + { "SCMP_CMP_GT", SCMP_CMP_GT }, + { "SCMP_CMP_MASKED_EQ", SCMP_CMP_MASKED_EQ }, + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(table[i].name, name)) { + *ret = table[i].op; 
+ return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_archs(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + scmp_filter_ctx *sc = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + uint32_t a; + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Architecture entry is not a string"); + + r = oci_seccomp_arch_from_string(json_variant_string(e), &a); + if (r < 0) + return json_log(e, flags, r, "Unknown architecture: %s", json_variant_string(e)); + + r = seccomp_arch_add(sc, a); + if (r == -EEXIST) + continue; + if (r < 0) + return json_log(e, flags, r, "Failed to add architecture to seccomp filter: %m"); + } + + return 0; +} + +struct syscall_rule { + char **names; + uint32_t action; + struct scmp_arg_cmp *arguments; + size_t n_arguments; +}; + +static void syscall_rule_done(struct syscall_rule *rule) { + assert(rule); + + strv_free(rule->names); + free(rule->arguments); +}; + +static int oci_seccomp_action(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint32_t *action = ASSERT_PTR(userdata); + int r; + + r = oci_seccomp_action_from_string(json_variant_string(v), action); + if (r < 0) + return json_log(v, flags, r, "Unknown system call action '%s': %m", json_variant_string(v)); + + return 0; +} + +static int oci_seccomp_op(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + enum scmp_compare *op = ASSERT_PTR(userdata); + int r; + + r = oci_seccomp_compare_from_string(json_variant_string(v), op); + if (r < 0) + return json_log(v, flags, r, "Unknown seccomp operator '%s': %m", json_variant_string(v)); + + return 0; +} + +static int oci_seccomp_args(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct syscall_rule *rule = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const struct JsonDispatch table[] = { + { 
"index", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(struct scmp_arg_cmp, arg), JSON_MANDATORY }, + { "value", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_a), JSON_MANDATORY }, + { "valueTwo", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_b), 0 }, + { "op", JSON_VARIANT_STRING, oci_seccomp_op, offsetof(struct scmp_arg_cmp, op), JSON_MANDATORY }, + {}, + }; + + struct scmp_arg_cmp *a, *p; + int expected; + + a = reallocarray(rule->arguments, rule->n_arguments + 1, sizeof(struct syscall_rule)); + if (!a) + return log_oom(); + + rule->arguments = a; + p = rule->arguments + rule->n_arguments; + + *p = (struct scmp_arg_cmp) { + .arg = 0, + .datum_a = 0, + .datum_b = 0, + .op = 0, + }; + + r = oci_dispatch(e, table, flags, p); + if (r < 0) + return r; + + expected = p->op == SCMP_CMP_MASKED_EQ ? 4 : 3; + if (r != expected) + json_log(e, flags|JSON_WARNING, 0, "Wrong number of system call arguments for JSON data, ignoring."); + + /* Note that we are a bit sloppy here and do not insist that SCMP_CMP_MASKED_EQ gets two datum values, + * and the other only one. That's because buildah for example by default calls things with + * SCMP_CMP_MASKED_EQ but only one argument. We use 0 when the value is not specified. 
*/ + + rule->n_arguments++; + } + + return 0; +} + +static int oci_seccomp_syscalls(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + scmp_filter_ctx *sc = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const JsonDispatch table[] = { + { "names", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(struct syscall_rule, names), JSON_MANDATORY }, + { "action", JSON_VARIANT_STRING, oci_seccomp_action, offsetof(struct syscall_rule, action), JSON_MANDATORY }, + { "args", JSON_VARIANT_ARRAY, oci_seccomp_args, 0, 0 }, + {} + }; + _cleanup_(syscall_rule_done) struct syscall_rule rule = { + .action = UINT32_MAX, + }; + + r = oci_dispatch(e, table, flags, &rule); + if (r < 0) + return r; + + if (strv_isempty(rule.names)) { + json_log(e, flags, 0, "System call name list is empty."); + return -EINVAL; + } + + STRV_FOREACH(i, rule.names) { + int nr; + + nr = seccomp_syscall_resolve_name(*i); + if (nr == __NR_SCMP_ERROR) { + log_debug("Unknown syscall %s, skipping.", *i); + continue; + } + + r = seccomp_rule_add_array(sc, rule.action, nr, rule.n_arguments, rule.arguments); + if (r < 0) + return r; + } + } + + return 0; +} +#endif + +static int oci_seccomp(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + +#if HAVE_SECCOMP + static const JsonDispatch table[] = { + { "defaultAction", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "architectures", JSON_VARIANT_ARRAY, oci_seccomp_archs, 0, 0 }, + { "syscalls", JSON_VARIANT_ARRAY, oci_seccomp_syscalls, 0, 0 }, + {} + }; + + _cleanup_(seccomp_releasep) scmp_filter_ctx sc = NULL; + Settings *s = ASSERT_PTR(userdata); + JsonVariant *def; + uint32_t d; + int r; + + def = json_variant_by_key(v, "defaultAction"); + if (!def) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "defaultAction element missing."); + + if (!json_variant_is_string(def)) + return json_log(def, flags, SYNTHETIC_ERRNO(EINVAL), "defaultAction is not a 
string."); + + r = oci_seccomp_action_from_string(json_variant_string(def), &d); + if (r < 0) + return json_log(def, flags, r, "Unknown default action: %s", json_variant_string(def)); + + sc = seccomp_init(d); + if (!sc) + return json_log(v, flags, SYNTHETIC_ERRNO(ENOMEM), "Couldn't allocate seccomp object."); + + r = oci_dispatch(v, table, flags, sc); + if (r < 0) + return r; + + seccomp_release(s->seccomp); + s->seccomp = TAKE_PTR(sc); + return 0; +#else + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), "libseccomp support not enabled, can't parse seccomp object."); +#endif +} + +static int oci_rootfs_propagation(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + const char *s; + + s = json_variant_string(v); + + if (streq(s, "shared")) + return 0; + + json_log(v, flags|JSON_DEBUG, 0, "Ignoring rootfsPropagation setting '%s'.", s); + return 0; +} + +static int oci_masked_paths(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *destination = NULL; + CustomMount *m; + const char *p; + + if (!json_variant_is_string(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not a string, refusing."); + + assert_se(p = json_variant_string(e)); + + if (!path_is_absolute(p)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not absolute, refusing: %s", p); + + if (oci_exclude_mount(p)) + continue; + + destination = strdup(p); + if (!destination) + return log_oom(); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_INACCESSIBLE); + if (!m) + return log_oom(); + + m->destination = TAKE_PTR(destination); + + /* The spec doesn't say this, but apparently pre-existing implementations are lenient towards + * non-existing paths to mask. Let's hence be too. 
*/ + m->graceful = true; + } + + return 0; +} + +static int oci_readonly_paths(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *source = NULL, *destination = NULL; + CustomMount *m; + const char *p; + + if (!json_variant_is_string(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not a string, refusing."); + + assert_se(p = json_variant_string(e)); + + if (!path_is_absolute(p)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not absolute, refusing: %s", p); + + if (oci_exclude_mount(p)) + continue; + + source = strjoin("+", p); + if (!source) + return log_oom(); + + destination = strdup(p); + if (!destination) + return log_oom(); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND); + if (!m) + return log_oom(); + + m->source = TAKE_PTR(source); + m->destination = TAKE_PTR(destination); + m->read_only = true; + } + + return 0; +} + +static int oci_linux(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "namespaces", JSON_VARIANT_ARRAY, oci_namespaces, 0, 0 }, + { "uidMappings", JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 }, + { "gidMappings", JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 }, + { "devices", JSON_VARIANT_ARRAY, oci_devices, 0, 0 }, + { "cgroupsPath", JSON_VARIANT_STRING, oci_cgroups_path, 0, 0 }, + { "resources", JSON_VARIANT_OBJECT, oci_resources, 0, 0 }, + { "intelRdt", JSON_VARIANT_OBJECT, oci_unsupported, 0, JSON_PERMISSIVE }, + { "sysctl", JSON_VARIANT_OBJECT, oci_sysctl, 0, 0 }, + { "seccomp", JSON_VARIANT_OBJECT, oci_seccomp, 0, 0 }, + { "rootfsPropagation", JSON_VARIANT_STRING, oci_rootfs_propagation, 0, 0 }, + { "maskedPaths", JSON_VARIANT_ARRAY, oci_masked_paths, 0, 0 }, + { "readonlyPaths", JSON_VARIANT_ARRAY, oci_readonly_paths, 0, 0 }, + { "mountLabel", 
JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + {} + }; + + return oci_dispatch(v, table, flags, userdata); +} + +static int oci_hook_timeout(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + usec_t *u = ASSERT_PTR(userdata); + uint64_t k; + + k = json_variant_unsigned(v); + if (k == 0 || k > (UINT64_MAX-1)/USEC_PER_SEC) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Hook timeout value out of range."); + + *u = k * USEC_PER_SEC; + return 0; +} + +static int oci_hooks_array(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + static const JsonDispatch table[] = { + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(OciHook, path), JSON_MANDATORY }, + { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(OciHook, args), 0 }, + { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(OciHook, env), 0 }, + { "timeout", JSON_VARIANT_UNSIGNED, oci_hook_timeout, offsetof(OciHook, timeout), 0 }, + {} + }; + + OciHook *a, **array, *new_item; + size_t *n_array; + + if (streq(name, "prestart")) { + array = &s->oci_hooks_prestart; + n_array = &s->n_oci_hooks_prestart; + } else if (streq(name, "poststart")) { + array = &s->oci_hooks_poststart; + n_array = &s->n_oci_hooks_poststart; + } else { + assert(streq(name, "poststop")); + array = &s->oci_hooks_poststop; + n_array = &s->n_oci_hooks_poststop; + } + + a = reallocarray(*array, *n_array + 1, sizeof(OciHook)); + if (!a) + return log_oom(); + + *array = a; + new_item = a + *n_array; + + *new_item = (OciHook) { + .timeout = USEC_INFINITY, + }; + + r = oci_dispatch(e, table, flags, new_item); + if (r < 0) { + free(new_item->path); + strv_free(new_item->args); + strv_free(new_item->env); + return r; + } + + (*n_array) ++; + } + + return 0; +} + +static int oci_hooks(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static 
const JsonDispatch table[] = { + { "prestart", JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 }, + { "poststart", JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 }, + { "poststop", JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 }, + {} + }; + + return oci_dispatch(v, table, flags, userdata); +} + +static int oci_annotations(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + JsonVariant *w; + const char *k; + + JSON_VARIANT_OBJECT_FOREACH(k, w, v) { + + if (isempty(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Annotation with empty key, refusing."); + + if (!json_variant_is_string(w)) + return json_log(w, flags, SYNTHETIC_ERRNO(EINVAL), + "Annotation has non-string value, refusing."); + + json_log(w, flags|JSON_DEBUG, 0, "Ignoring annotation '%s' with value '%s'.", k, json_variant_string(w)); + } + + return 0; +} + +int oci_load(FILE *f, const char *bundle, Settings **ret) { + + static const JsonDispatch table[] = { + { "ociVersion", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "process", JSON_VARIANT_OBJECT, oci_process, 0, 0 }, + { "root", JSON_VARIANT_OBJECT, oci_root, 0, 0 }, + { "hostname", JSON_VARIANT_STRING, oci_hostname, 0, 0 }, + { "mounts", JSON_VARIANT_ARRAY, oci_mounts, 0, 0 }, + { "linux", JSON_VARIANT_OBJECT, oci_linux, 0, 0 }, + { "hooks", JSON_VARIANT_OBJECT, oci_hooks, 0, 0 }, + { "annotations", JSON_VARIANT_OBJECT, oci_annotations, 0, 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *oci = NULL; + _cleanup_(settings_freep) Settings *s = NULL; + unsigned line = 0, column = 0; + JsonVariant *v; + const char *path; + int r; + + assert_se(bundle); + + path = strjoina(bundle, "/config.json"); + + r = json_parse_file(f, path, 0, &oci, &line, &column); + if (r < 0) { + if (line != 0 && column != 0) + return log_error_errno(r, "Failed to parse '%s' at %u:%u: %m", path, line, column); + else + return log_error_errno(r, "Failed to parse '%s': %m", path); + } + + v = json_variant_by_key(oci, "ociVersion"); + if 
(!v) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "JSON file '%s' is not an OCI bundle configuration file. Refusing.", + path); + if (!streq_ptr(json_variant_string(v), "1.0.0")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "OCI bundle version not supported: %s", + strna(json_variant_string(v))); + + // { + // _cleanup_free_ char *formatted = NULL; + // assert_se(json_variant_format(oci, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR, &formatted) >= 0); + // fputs(formatted, stdout); + // } + + s = settings_new(); + if (!s) + return log_oom(); + + s->start_mode = START_PID1; + s->resolv_conf = RESOLV_CONF_OFF; + s->link_journal = LINK_NO; + s->timezone = TIMEZONE_OFF; + + s->bundle = strdup(bundle); + if (!s->bundle) + return log_oom(); + + r = oci_dispatch(oci, table, 0, s); + if (r < 0) + return r; + + if (s->properties) { + r = sd_bus_message_seal(s->properties, 0, 0); + if (r < 0) + return log_error_errno(r, "Cannot seal properties bus message: %m"); + } + + *ret = TAKE_PTR(s); + return 0; +} diff --git a/src/nspawn/nspawn-oci.h b/src/nspawn/nspawn-oci.h new file mode 100644 index 0000000..ee72c91 --- /dev/null +++ b/src/nspawn/nspawn-oci.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "nspawn-settings.h" + +int oci_load(FILE *f, const char *path, Settings **ret); diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c new file mode 100644 index 0000000..b8918a2 --- /dev/null +++ b/src/nspawn/nspawn-patch-uid.c @@ -0,0 +1,477 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "acl-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "missing_magic.h" +#include "nspawn-def.h" +#include "nspawn-patch-uid.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +#if HAVE_ACL + +static int get_acl(int fd, const char *name, 
acl_type_t type, acl_t *ret) { + acl_t acl; + + assert(fd >= 0); + assert(ret); + + if (name) { + _cleanup_close_ int child_fd = -EBADF; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + acl = acl_get_file(FORMAT_PROC_FD_PATH(child_fd), type); + } else if (type == ACL_TYPE_ACCESS) + acl = acl_get_fd(fd); + else + acl = acl_get_file(FORMAT_PROC_FD_PATH(fd), type); + if (!acl) + return -errno; + + *ret = acl; + return 0; +} + +static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) { + int r; + + assert(fd >= 0); + assert(acl); + + if (name) { + _cleanup_close_ int child_fd = -EBADF; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + r = acl_set_file(FORMAT_PROC_FD_PATH(child_fd), type, acl); + } else if (type == ACL_TYPE_ACCESS) + r = acl_set_fd(fd, acl); + else + r = acl_set_file(FORMAT_PROC_FD_PATH(fd), type, acl); + if (r < 0) + return -errno; + + return 0; +} + +static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) { + _cleanup_(acl_freep) acl_t copy = NULL; + acl_entry_t i; + int r; + + assert(acl); + assert(ret); + + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + while (r > 0) { + uid_t *old_uid, new_uid; + bool modify = false; + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (IN_SET(tag, ACL_USER, ACL_GROUP)) { + + /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that + * this is actually OK */ + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + old_uid = acl_get_qualifier(i); + if (!old_uid) + return -errno; + + new_uid = shift | (*old_uid & UINT32_C(0xFFFF)); + if (!uid_is_valid(new_uid)) + return -EINVAL; + + modify = new_uid != *old_uid; + if (modify && !copy) { + int n; + + /* There's no copy of the ACL yet? 
if so, let's create one, and start the loop from the + * beginning, so that we copy all entries, starting from the first, this time. */ + + n = acl_entries(acl); + if (n < 0) + return -errno; + + copy = acl_init(n); + if (!copy) + return -errno; + + /* Seek back to the beginning */ + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + continue; + } + } + + if (copy) { + acl_entry_t new_entry; + + if (acl_create_entry(©, &new_entry) < 0) + return -errno; + + if (acl_copy_entry(new_entry, i) < 0) + return -errno; + + if (modify) + if (acl_set_qualifier(new_entry, &new_uid) < 0) + return -errno; + } + + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i); + if (r < 0) + return -errno; + } + + *ret = TAKE_PTR(copy); + + return !!*ret; +} + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + /* ACLs are not supported on symlinks, there's no point in trying */ + if (S_ISLNK(st->st_mode)) + return 0; + + r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl); + if (r == -EOPNOTSUPP) + return 0; + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted); + if (r < 0) + return r; + + changed = true; + } + + if (S_ISDIR(st->st_mode)) { + acl_free(acl); + + if (shifted) + acl_free(shifted); + + acl = shifted = NULL; + + r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl); + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted); + if (r < 0) + return r; + + changed = true; + } + } + + return changed; +} + +#else + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + return 0; +} + +#endif + +static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) { + uid_t 
new_uid; + gid_t new_gid; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + new_uid = shift | (st->st_uid & UINT32_C(0xFFFF)); + new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF)); + + if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid)) + return -EINVAL; + + if (st->st_uid != new_uid || st->st_gid != new_gid) { + if (name) + r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW); + else + r = fchown(fd, new_uid, new_gid); + if (r < 0) + return -errno; + + /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */ + if (name) { + if (!S_ISLNK(st->st_mode)) + r = fchmodat(fd, name, st->st_mode, 0); + else /* Changing the mode of a symlink is not supported by Linux kernel. Don't bother. */ + r = 0; + } else + r = fchmod(fd, st->st_mode); + if (r < 0) + return -errno; + + changed = true; + } + + r = patch_acls(fd, name, st, shift); + if (r < 0) + return r; + + return r > 0 || changed; +} + +/* + * Check if the filesystem is fully compatible with user namespaces or + * UID/GID patching. Some filesystems in this list can be fully mounted inside + * user namespaces, however their inodes may relate to host resources or only + * valid in the global user namespace, therefore no patching should be applied. 
+ */ +static int is_fs_fully_userns_compatible(const struct statfs *sfs) { + + assert(sfs); + + return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) || + F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC); +} + +static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) { + _cleanup_closedir_ DIR *d = NULL; + bool changed = false; + struct statfs sfs; + int r; + + assert(fd >= 0); + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably + * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion + * when we hit procfs, sysfs or some other special file systems. 
*/ + + r = is_fs_fully_userns_compatible(&sfs); + if (r < 0) + goto finish; + if (r > 0) { + r = 0; /* don't recurse */ + goto finish; + } + + /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */ + if ((sfs.f_flags & ST_RDONLY) || + access_fd(fd, W_OK) == -EROFS) + goto read_only; + + if (S_ISDIR(st->st_mode)) { + if (!donate_fd) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) { + r = -errno; + goto finish; + } + + fd = copy; + donate_fd = true; + } + + d = take_fdopendir(&fd); + if (!d) { + r = -errno; + goto finish; + } + + FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) { + struct stat fst; + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) { + r = -errno; + goto finish; + } + + if (S_ISDIR(fst.st_mode)) { + int subdir_fd; + + subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (subdir_fd < 0) { + r = -errno; + goto finish; + + } + + r = recurse_fd(subdir_fd, true, &fst, shift, false); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + + } else { + r = patch_fd(dirfd(d), de->d_name, &fst, shift); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + } + } + } + + /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level + * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the + * tree is properly chown()ed already. */ + r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift); + if (r == -EROFS) + goto read_only; + if (r > 0) + changed = true; + + r = changed; + goto finish; + +read_only: + if (!is_toplevel) { + _cleanup_free_ char *name = NULL; + + /* When we hit a ready-only subtree we simply skip it, but log about it. 
*/ + (void) fd_get_path(fd, &name); + log_debug("Skipping read-only file or directory %s.", strna(name)); + r = changed; + } + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) { + struct stat st; + int r; + + assert(fd >= 0); + + /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an + * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges + * following the concept that the upper 16-bit of a UID identify the container, and the lower 16-bit are the actual + * UID within the container. */ + + if ((shift & 0xFFFF) != 0) { + /* We only support containers where the shift starts at a 2^16 boundary */ + r = -EOPNOTSUPP; + goto finish; + } + + if (shift == UID_BUSY_BASE) { + r = -EINVAL; + goto finish; + } + + if (range != 0x10000) { + /* We only support containers with 16-bit UID ranges for the patching logic */ + r = -EOPNOTSUPP; + goto finish; + } + + if (fstat(fd, &st) < 0) { + r = -errno; + goto finish; + } + + if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) { + /* We only support containers where the uid/gid container ID match */ + r = -EBADE; + goto finish; + } + + /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume + * that if the top-level dir has the right upper 16-bit assigned, then everything below will have too... */ + if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) + return 0; + + /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy" + * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start + * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. 
*/ + + if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) { + if (fchown(fd, + UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK), + (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) { + r = -errno; + goto finish; + } + } + + return recurse_fd(fd, donate_fd, &st, shift, true); + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +int path_patch_uid(const char *path, uid_t shift, uid_t range) { + int fd; + + fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (fd < 0) + return -errno; + + return fd_patch_uid_internal(fd, true, shift, range); +} diff --git a/src/nspawn/nspawn-patch-uid.h b/src/nspawn/nspawn-patch-uid.h new file mode 100644 index 0000000..5c7349b --- /dev/null +++ b/src/nspawn/nspawn-patch-uid.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include + +int path_patch_uid(const char *path, uid_t shift, uid_t range); diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c new file mode 100644 index 0000000..66962d7 --- /dev/null +++ b/src/nspawn/nspawn-register.c @@ -0,0 +1,416 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "bus-wait-for-jobs.h" +#include "nspawn-register.h" +#include "nspawn-settings.h" +#include "special.h" +#include "stat-util.h" +#include "strv.h" + +static int append_machine_properties( + sd_bus_message *m, + CustomMount *mounts, + unsigned n_mounts, + int kill_signal, + bool coredump_receive) { + + unsigned j; + int r; + + assert(m); + + r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "closed"); + if (r < 0) + return bus_log_create_error(r); + + /* If you make changes here, also make sure to update systemd-nspawn@.service, to keep the device policies in + * sync regardless if we are run with or without the --keep-unit switch. 
*/ + r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2, + /* Allow the container to + * access and create the API + * device nodes, so that + * PrivateDevices= in the + * container can work + * fine */ + "/dev/net/tun", "rwm", + /* Allow the container + * access to ptys. However, + * do not permit the + * container to ever create + * these device nodes. */ + "char-pts", "rw"); + if (r < 0) + return bus_log_create_error(r); + + for (j = 0; j < n_mounts; j++) { + CustomMount *cm = mounts + j; + + if (cm->type != CUSTOM_MOUNT_BIND) + continue; + + r = is_device_node(cm->source); + if (r == -ENOENT) { + /* The bind source might only appear as the image is put together, hence don't complain */ + log_debug_errno(r, "Bind mount source %s not found, ignoring: %m", cm->source); + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to stat %s: %m", cm->source); + + if (r) { + r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1, + cm->source, cm->read_only ? "r" : "rw"); + if (r < 0) + return log_error_errno(r, "Failed to append message arguments: %m"); + } + } + + if (kill_signal != 0) { + r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", kill_signal); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed"); + if (r < 0) + return bus_log_create_error(r); + } + + if (coredump_receive) { + r = sd_bus_message_append(m, "(sv)", "CoredumpReceive", "b", true); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int append_controller_property(sd_bus *bus, sd_bus_message *m) { + const char *unique; + int r; + + assert(bus); + assert(m); + + r = sd_bus_get_unique_name(bus, &unique); + if (r < 0) + return log_error_errno(r, "Failed to get unique name: %m"); + + r = sd_bus_message_append(m, "(sv)", "Controller", "s", unique); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int can_set_coredump_receive(sd_bus *bus) { + 
_cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL; + _cleanup_free_ char *path = NULL; + int b, r; + + assert(bus); + + path = unit_dbus_path_from_name(SPECIAL_INIT_SCOPE); + if (!path) + return log_oom(); + + r = sd_bus_get_property_trivial( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Scope", + "CoredumpReceive", + &e, + 'b', &b); + if (r < 0 && !sd_bus_error_has_names(&e, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY)) + log_warning_errno(r, "Failed to determine if CoredumpReceive= can be set, assuming it cannot be: %s", + bus_error_message(&e, r)); + + return r >= 0; +} + +int register_machine( + sd_bus *bus, + const char *machine_name, + pid_t pid, + const char *directory, + sd_id128_t uuid, + int local_ifindex, + const char *slice, + CustomMount *mounts, + unsigned n_mounts, + int kill_signal, + char **properties, + sd_bus_message *properties_message, + bool keep_unit, + const char *service, + StartMode start_mode) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + + if (keep_unit) { + r = bus_call_method( + bus, + bus_machine_mgr, + "RegisterMachineWithNetwork", + &error, + NULL, + "sayssusai", + machine_name, + SD_BUS_MESSAGE_APPEND_ID128(uuid), + service, + "container", + (uint32_t) pid, + strempty(directory), + local_ifindex > 0 ? 1 : 0, local_ifindex); + } else { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + r = bus_message_new_method_call(bus, &m, bus_machine_mgr, "CreateMachineWithNetwork"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "sayssusai", + machine_name, + SD_BUS_MESSAGE_APPEND_ID128(uuid), + service, + "container", + (uint32_t) pid, + strempty(directory), + local_ifindex > 0 ? 
1 : 0, local_ifindex); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + if (!isempty(slice)) { + r = sd_bus_message_append(m, "(sv)", "Slice", "s", slice); + if (r < 0) + return bus_log_create_error(r); + } + + r = append_controller_property(bus, m); + if (r < 0) + return r; + + r = append_machine_properties( + m, + mounts, + n_mounts, + kill_signal, + start_mode == START_BOOT && can_set_coredump_receive(bus) > 0); + if (r < 0) + return r; + + if (properties_message) { + r = sd_bus_message_copy(m, properties_message, true); + if (r < 0) + return bus_log_create_error(r); + } + + r = bus_append_unit_property_assignment_many(m, UNIT_SERVICE, properties); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, NULL); + } + if (r < 0) + return log_error_errno(r, "Failed to register machine: %s", bus_error_message(&error, r)); + + return 0; +} + +int unregister_machine( + sd_bus *bus, + const char *machine_name) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + + r = bus_call_method(bus, bus_machine_mgr, "UnregisterMachine", &error, NULL, "s", machine_name); + if (r < 0) + log_debug("Failed to unregister machine: %s", bus_error_message(&error, r)); + + return 0; +} + +int allocate_scope( + sd_bus *bus, + const char *machine_name, + pid_t pid, + const char *slice, + CustomMount *mounts, + unsigned n_mounts, + int kill_signal, + char **properties, + sd_bus_message *properties_message, + bool allow_pidfd, + StartMode start_mode) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_free_ char *scope = NULL; + const char *description, *object; + int r; + + 
assert(bus); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch job: %m"); + + r = unit_name_mangle_with_suffix(machine_name, "as machine name", 0, ".scope", &scope); + if (r < 0) + return log_error_errno(r, "Failed to mangle scope name: %m"); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "ss", scope, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + description = strjoina("Container ", machine_name); + + if (allow_pidfd) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return log_error_errno(r, "Failed to allocate PID reference: %m"); + + r = bus_append_scope_pidref(m, &pidref); + } else + r = sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, pid); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)(sv)", + "Description", "s", description, + "Delegate", "b", 1, + "CollectMode", "s", "inactive-or-failed", + "AddRef", "b", 1, + "Slice", "s", isempty(slice) ? 
SPECIAL_MACHINE_SLICE : slice); + if (r < 0) + return bus_log_create_error(r); + + r = append_controller_property(bus, m); + if (r < 0) + return r; + + if (properties_message) { + r = sd_bus_message_copy(m, properties_message, true); + if (r < 0) + return bus_log_create_error(r); + } + + r = append_machine_properties( + m, + mounts, + n_mounts, + kill_signal, + start_mode == START_BOOT && can_set_coredump_receive(bus) > 0); + if (r < 0) + return r; + + r = bus_append_unit_property_assignment_many(m, UNIT_SCOPE, properties); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* No auxiliary units */ + r = sd_bus_message_append( + m, + "a(sa(sv))", + 0); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) { + /* If this failed with a property we couldn't write, this is quite likely because the server + * doesn't support PIDFDs yet, let's try without. */ + if (allow_pidfd && + sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY)) + return allocate_scope(bus, machine_name, pid, slice, mounts, n_mounts, kill_signal, properties, properties_message, /* allow_pidfd= */ false, start_mode); + + return log_error_errno(r, "Failed to allocate scope: %s", bus_error_message(&error, r)); + } + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, false, NULL); + if (r < 0) + return r; + + return 0; +} + +int terminate_scope( + sd_bus *bus, + const char *machine_name) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *scope = NULL; + int r; + + r = unit_name_mangle_with_suffix(machine_name, "to terminate", 0, ".scope", &scope); + if (r < 0) + return log_error_errno(r, "Failed to mangle scope name: %m"); + + r = bus_call_method(bus, bus_systemd_mgr, "AbandonScope", &error, NULL, "s", scope); + 
if (r < 0) { + log_debug_errno(r, "Failed to abandon scope '%s', ignoring: %s", scope, bus_error_message(&error, r)); + sd_bus_error_free(&error); + } + + r = bus_call_method( + bus, + bus_systemd_mgr, + "KillUnit", + &error, + NULL, + "ssi", + scope, + "all", + (int32_t) SIGKILL); + if (r < 0) { + log_debug_errno(r, "Failed to SIGKILL scope '%s', ignoring: %s", scope, bus_error_message(&error, r)); + sd_bus_error_free(&error); + } + + r = bus_call_method(bus, bus_systemd_mgr, "UnrefUnit", &error, NULL, "s", scope); + if (r < 0) + log_debug_errno(r, "Failed to drop reference to scope '%s', ignoring: %s", scope, bus_error_message(&error, r)); + + return 0; +} diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h new file mode 100644 index 0000000..4d16ac2 --- /dev/null +++ b/src/nspawn/nspawn-register.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-id128.h" + +#include "nspawn-mount.h" +#include "nspawn-settings.h" + +int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool keep_unit, const char *service, StartMode start_mode); +int unregister_machine(sd_bus *bus, const char *machine_name); + +int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool allow_pidfds, StartMode start_mode); +int terminate_scope(sd_bus *bus, const char *machine_name); diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c new file mode 100644 index 0000000..fa05a8a --- /dev/null +++ b/src/nspawn/nspawn-seccomp.c @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include 
"alloc-util.h" +#include "log.h" +#include "nspawn-seccomp.h" +#include "seccomp-util.h" +#include "string-util.h" +#include "strv.h" + +#if HAVE_SECCOMP + +static int add_syscall_filters( + scmp_filter_ctx ctx, + uint32_t arch, + uint64_t cap_list_retain, + char **syscall_allow_list, + char **syscall_deny_list) { + + static const struct { + uint64_t capability; + const char* name; + } allow_list[] = { + /* Let's use set names where we can */ + { 0, "@aio" }, + { 0, "@basic-io" }, + { 0, "@chown" }, + { 0, "@default" }, + { 0, "@file-system" }, + { 0, "@io-event" }, + { 0, "@ipc" }, + { 0, "@mount" }, + { 0, "@network-io" }, + { 0, "@process" }, + { 0, "@resources" }, + { 0, "@setuid" }, + { 0, "@signal" }, + { 0, "@sync" }, + { 0, "@timer" }, + + /* The following four are sets we optionally enable, n case the caps have been configured for it */ + { CAP_SYS_TIME, "@clock" }, + { CAP_SYS_MODULE, "@module" }, + { CAP_SYS_RAWIO, "@raw-io" }, + { CAP_IPC_LOCK, "@memlock" }, + + /* Plus a good set of additional syscalls which are not part of any of the groups above */ + { 0, "brk" }, + { 0, "capget" }, + { 0, "capset" }, + { 0, "copy_file_range" }, + { 0, "fadvise64" }, + { 0, "fadvise64_64" }, + { 0, "flock" }, + { 0, "get_mempolicy" }, + { 0, "getcpu" }, + { 0, "getpriority" }, + { 0, "getrandom" }, + { 0, "ioctl" }, + { 0, "ioprio_get" }, + { 0, "kcmp" }, + { 0, "madvise" }, + { 0, "mincore" }, + { 0, "mprotect" }, + { 0, "mremap" }, + { 0, "name_to_handle_at" }, + { 0, "oldolduname" }, + { 0, "olduname" }, + { 0, "personality" }, + { 0, "readahead" }, + { 0, "readdir" }, + { 0, "remap_file_pages" }, + { 0, "sched_get_priority_max" }, + { 0, "sched_get_priority_min" }, + { 0, "sched_getaffinity" }, + { 0, "sched_getattr" }, + { 0, "sched_getparam" }, + { 0, "sched_getscheduler" }, + { 0, "sched_rr_get_interval" }, + { 0, "sched_rr_get_interval_time64" }, + { 0, "sched_yield" }, + { 0, "seccomp" }, + { 0, "sendfile" }, + { 0, "sendfile64" }, + { 0, "setdomainname" }, 
+ { 0, "setfsgid" }, + { 0, "setfsgid32" }, + { 0, "setfsuid" }, + { 0, "setfsuid32" }, + { 0, "sethostname" }, + { 0, "setpgid" }, + { 0, "setsid" }, + { 0, "splice" }, + { 0, "sysinfo" }, + { 0, "tee" }, + { 0, "umask" }, + { 0, "uname" }, + { 0, "userfaultfd" }, + { 0, "vmsplice" }, + + /* The following individual syscalls are added depending on specified caps */ + { CAP_SYS_PACCT, "acct" }, + { CAP_SYS_PTRACE, "process_vm_readv" }, + { CAP_SYS_PTRACE, "process_vm_writev" }, + { CAP_SYS_PTRACE, "ptrace" }, + { CAP_SYS_BOOT, "reboot" }, + { CAP_SYSLOG, "syslog" }, + { CAP_SYS_TTY_CONFIG, "vhangup" }, + + /* + * The following syscalls and groups are knowingly excluded: + * + * @cpu-emulation + * @keyring (NB: keyring is not namespaced!) + * @obsolete + * @pkey + * @swap + * + * bpf + * fanotify_init + * fanotify_mark + * kexec_file_load + * kexec_load + * lookup_dcookie + * nfsservctl + * open_by_handle_at + * perf_event_open + * quotactl + */ + }; + + _cleanup_strv_free_ char **added = NULL; + int r; + + for (size_t i = 0; i < ELEMENTSOF(allow_list); i++) { + if (allow_list[i].capability != 0 && (cap_list_retain & (1ULL << allow_list[i].capability)) == 0) + continue; + + r = seccomp_add_syscall_filter_item(ctx, + allow_list[i].name, + SCMP_ACT_ALLOW, + syscall_deny_list, + false, + &added); + if (r < 0) + return log_error_errno(r, "Failed to add syscall filter item %s: %m", allow_list[i].name); + } + + STRV_FOREACH(p, syscall_allow_list) { + r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_deny_list, true, &added); + if (r < 0) + log_warning_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", + *p, seccomp_arch_to_string(arch)); + } + + /* The default action is ENOSYS. Respond with EPERM to all other "known" but not allow-listed + * syscalls. 
*/ + r = seccomp_add_syscall_filter_item(ctx, "@known", SCMP_ACT_ERRNO(EPERM), added, true, NULL); + if (r < 0) + log_warning_errno(r, "Failed to add rule for @known set on %s, ignoring: %m", + seccomp_arch_to_string(arch)); + +#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2 + /* We have a large filter here, so let's turn on the binary tree mode if possible. */ + r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_OPTIMIZE, 2); + if (r < 0) + log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m"); +#endif + + return 0; +} + +int setup_seccomp(uint64_t cap_list_retain, char **syscall_allow_list, char **syscall_deny_list) { + uint32_t arch; + int r; + + if (!is_seccomp_available()) { + log_debug("SECCOMP features not detected in the kernel or disabled at runtime, disabling SECCOMP filtering"); + return 0; + } + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Applying allow list on architecture: %s", seccomp_arch_to_string(arch)); + + /* We install ENOSYS as the default action, but it will only apply to syscalls which are not + * in the @known set, see above. 
*/ + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(ENOSYS)); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); + + r = add_syscall_filters(seccomp, arch, cap_list_retain, syscall_allow_list, syscall_deny_list); + if (r < 0) + return r; + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return log_error_errno(r, "Failed to install seccomp filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); + + /* + Audit is broken in containers, much of the userspace audit hookup will fail if running inside a + container. We don't care and just turn off creation of audit sockets. + + This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses + as indication that audit is disabled in the kernel. 
+ */ + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 2, + SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), + SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); + if (r < 0) { + log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return log_error_errno(r, "Failed to install seccomp audit filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} + +#else + +int setup_seccomp(uint64_t cap_list_retain, char **syscall_allow_list, char **syscall_deny_list) { + return 0; +} + +#endif diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h new file mode 100644 index 0000000..2690fba --- /dev/null +++ b/src/nspawn/nspawn-seccomp.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int setup_seccomp(uint64_t cap_list_retain, char **syscall_allow_list, char **syscall_deny_list); diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c new file mode 100644 index 0000000..161b1c1 --- /dev/null +++ b/src/nspawn/nspawn-settings.c @@ -0,0 +1,1015 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "cap-list.h" +#include "conf-parser.h" +#include "cpu-set-util.h" +#include "hostname-util.h" +#include "namespace-util.h" +#include "nspawn-network.h" +#include "nspawn-settings.h" +#include "parse-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +Settings *settings_new(void) { + Settings *s; + + s = new(Settings, 1); + if (!s) + return NULL; + + *s = (Settings) { + .start_mode = _START_MODE_INVALID, + .ephemeral = -1, + .personality = PERSONALITY_INVALID, + + .resolv_conf = _RESOLV_CONF_MODE_INVALID, + 
.link_journal = _LINK_JOURNAL_INVALID, + .timezone = _TIMEZONE_MODE_INVALID, + + .userns_mode = _USER_NAMESPACE_MODE_INVALID, + .userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID, + .uid_shift = UID_INVALID, + .uid_range = UID_INVALID, + + .no_new_privileges = -1, + + .read_only = -1, + .volatile_mode = _VOLATILE_MODE_INVALID, + + .private_network = -1, + .network_veth = -1, + + .full_capabilities = CAPABILITY_QUINTET_NULL, + + .uid = UID_INVALID, + .gid = GID_INVALID, + + .console_mode = _CONSOLE_MODE_INVALID, + .console_width = UINT_MAX, + .console_height = UINT_MAX, + + .clone_ns_flags = ULONG_MAX, + .use_cgns = -1, + + .notify_ready = -1, + .suppress_sync = -1, + }; + + return s; +} + +int settings_load(FILE *f, const char *path, Settings **ret) { + _cleanup_(settings_freep) Settings *s = NULL; + int r; + + assert(path); + assert(ret); + + s = settings_new(); + if (!s) + return -ENOMEM; + + r = config_parse(NULL, path, f, + "Exec\0" + "Network\0" + "Files\0", + config_item_perf_lookup, nspawn_gperf_lookup, + CONFIG_PARSE_WARN, + s, NULL); + if (r < 0) + return r; + + /* Make sure that if userns_mode is set, userns_chown is set to something appropriate, and vice versa. Either + * both fields shall be initialized or neither. */ + if (s->userns_mode >= 0 && s->userns_ownership < 0) + s->userns_ownership = s->userns_mode == USER_NAMESPACE_PICK ? 
USER_NAMESPACE_OWNERSHIP_CHOWN : USER_NAMESPACE_OWNERSHIP_OFF; + if (s->userns_ownership >= 0 && s->userns_mode < 0) + s->userns_mode = USER_NAMESPACE_NO; + + *ret = TAKE_PTR(s); + return 0; +} + +static void free_oci_hooks(OciHook *hooks, size_t n) { + assert(hooks || n == 0); + + FOREACH_ARRAY(hook, hooks, n) { + free(hook->path); + strv_free(hook->args); + strv_free(hook->env); + } + + free(hooks); +} + +void device_node_array_free(DeviceNode *nodes, size_t n) { + assert(nodes || n == 0); + + FOREACH_ARRAY(node, nodes, n) + free(node->path); + + free(nodes); +} + +Settings* settings_free(Settings *s) { + if (!s) + return NULL; + + strv_free(s->parameters); + strv_free(s->environment); + free(s->user); + free(s->pivot_root_new); + free(s->pivot_root_old); + free(s->working_directory); + strv_free(s->syscall_allow_list); + strv_free(s->syscall_deny_list); + rlimit_free_all(s->rlimit); + free(s->hostname); + cpu_set_reset(&s->cpu_set); + strv_free(s->bind_user); + + strv_free(s->network_interfaces); + strv_free(s->network_macvlan); + strv_free(s->network_ipvlan); + strv_free(s->network_veth_extra); + free(s->network_bridge); + free(s->network_zone); + expose_port_free_all(s->expose_ports); + + custom_mount_free_all(s->custom_mounts, s->n_custom_mounts); + + free(s->bundle); + free(s->root); + + free_oci_hooks(s->oci_hooks_prestart, s->n_oci_hooks_prestart); + free_oci_hooks(s->oci_hooks_poststart, s->n_oci_hooks_poststart); + free_oci_hooks(s->oci_hooks_poststop, s->n_oci_hooks_poststop); + + free(s->slice); + sd_bus_message_unref(s->properties); + + free(s->supplementary_gids); + device_node_array_free(s->extra_nodes, s->n_extra_nodes); + free(s->network_namespace_path); + + strv_free(s->sysctl); + +#if HAVE_SECCOMP + seccomp_release(s->seccomp); +#endif + + return mfree(s); +} + +bool settings_private_network(Settings *s) { + assert(s); + + /* Determines whether we shall open up our own private network */ + + return + s->private_network > 0 || + s->network_veth > 
0 || + s->network_bridge || + s->network_zone || + s->network_interfaces || + s->network_macvlan || + s->network_ipvlan || + s->network_veth_extra; +} + +bool settings_network_veth(Settings *s) { + assert(s); + + return + s->network_veth > 0 || + s->network_bridge || + s->network_zone; +} + +bool settings_network_configured(Settings *s) { + assert(s); + + /* Determines whether any network configuration setting was used. (i.e. in contrast to + * settings_private_network() above this might also indicate if private networking was explicitly + * turned off.) */ + + return + s->private_network >= 0 || + s->network_veth >= 0 || + s->network_bridge || + s->network_zone || + s->network_interfaces || + s->network_macvlan || + s->network_ipvlan || + s->network_veth_extra || + s->network_namespace_path; +} + +int settings_allocate_properties(Settings *s) { + _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL; + int r; + + assert(s); + + if (s->properties) + return 0; + + r = sd_bus_default_system(&bus); + if (r < 0) + return r; + + r = sd_bus_message_new(bus, &s->properties, SD_BUS_MESSAGE_METHOD_CALL); + if (r < 0) + return r; + + return 0; +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_volatile_mode, volatile_mode, VolatileMode, "Failed to parse volatile mode"); + +int config_parse_expose_port( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = expose_port_parse(&s->expose_ports, rvalue); + if (r == -EEXIST) + log_syntax(unit, LOG_WARNING, filename, line, r, "Duplicate port specification, ignoring: %s", rvalue); + else if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse host port %s: %m", rvalue); + + return 0; +} + +int config_parse_capability( + const char *unit, + const char *filename, + unsigned line, + const 
char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t u = 0, *result = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&rvalue, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract capability string, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + if (streq(word, "all")) + u = UINT64_MAX; + else { + r = capability_from_name(word); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse capability, ignoring: %s", word); + continue; + } + + u |= UINT64_C(1) << r; + } + } + + *result |= u; + return 0; +} + +int config_parse_pivot_root( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = pivot_root_parse(&settings->pivot_root_new, &settings->pivot_root_old, rvalue); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid pivot root mount specification %s: %m", rvalue); + + return 0; +} + +int config_parse_bind( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = bind_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue, ltype); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid bind mount specification %s: %m", rvalue); + + return 0; +} + +int config_parse_tmpfs( + const char *unit, + const 
char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tmpfs_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid temporary file system specification %s: %m", rvalue); + + return 0; +} + +int config_parse_inaccessible( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = inaccessible_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid inaccessible file system specification %s: %m", rvalue); + + return 0; +} + +int config_parse_overlay( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = overlay_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue, ltype); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid overlay file system specification %s, ignoring: %m", rvalue); + + return 0; +} + +int config_parse_veth_extra( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = 
veth_extra_parse(&settings->network_veth_extra, rvalue); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid extra virtual Ethernet link specification %s: %m", rvalue); + + return 0; +} + +int config_parse_network_iface_pair( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char*** l = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + + return interface_pair_parse(l, rvalue); +} + +int config_parse_macvlan_iface_pair( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char*** l = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + + return macvlan_pair_parse(l, rvalue); +} + +int config_parse_ipvlan_iface_pair( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char*** l = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + + return ipvlan_pair_parse(l, rvalue); +} + +/* Config parser for a network zone name: prefixes rvalue with "vz-", checks that the result is a valid + * interface name and stores it in settings->network_zone. An invalid name is logged and ignored + * (returns 0); an allocation failure is reported via log_oom(). */ +int config_parse_network_zone( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + _cleanup_free_ char *j = NULL; + + assert(filename); + assert(lvalue); + assert(rvalue); + + j = strjoin("vz-", rvalue); + if (!j) + /* Report allocation failure as such instead of passing NULL on to ifname_valid(). */ + return log_oom(); + + if (!ifname_valid(j)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid network zone name, ignoring: %s", rvalue); + return 0; + } + + return free_and_replace(settings->network_zone, j); +} + +int config_parse_boot( + const char *unit, + const char *filename, + unsigned line, + const char
*section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse Boot= parameter %s, ignoring: %m", rvalue); + return 0; + } + + if (r) { + if (settings->start_mode == START_PID2) + goto conflict; + + settings->start_mode = START_BOOT; + } else { + if (settings->start_mode == START_BOOT) + goto conflict; + + if (settings->start_mode < 0) + settings->start_mode = START_PID1; + } + + return 0; + +conflict: + log_syntax(unit, LOG_WARNING, filename, line, 0, "Conflicting Boot= or ProcessTwo= setting found. Ignoring."); + return 0; +} + +int config_parse_pid2( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse ProcessTwo= parameter %s, ignoring: %m", rvalue); + return 0; + } + + if (r) { + if (settings->start_mode == START_BOOT) + goto conflict; + + settings->start_mode = START_PID2; + } else { + if (settings->start_mode == START_PID2) + goto conflict; + + if (settings->start_mode < 0) + settings->start_mode = START_PID1; + } + + return 0; + +conflict: + log_syntax(unit, LOG_WARNING, filename, line, 0, "Conflicting Boot= or ProcessTwo= setting found. 
Ignoring."); + return 0; +} + +/* Config parser for the user namespacing setting; fills in settings->userns_mode, ->uid_shift and + * ->uid_range. Accepts a boolean, "pick", "identity", or an explicit SHIFT[:RANGE] specification. + * Invalid explicit values are logged and ignored (returns 0) without touching the settings. */ +int config_parse_private_users( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r == 0) { + /* no: User namespacing off */ + settings->userns_mode = USER_NAMESPACE_NO; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (streq(rvalue, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + settings->userns_mode = USER_NAMESPACE_PICK; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (streq(rvalue, "identity")) { + /* identity: User namespacing on, UID range is 0:65536 */ + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = 0; + settings->uid_range = UINT32_C(0x10000); + } else { + const char *range, *shift; + uid_t sh, rn; + + /* anything else: User namespacing on, UID range is explicitly configured */ + + range = strchr(rvalue, ':'); + if (range) { + shift = strndupa_safe(rvalue, range - rvalue); + range++; + + r = safe_atou32(range, &rn); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "UID/GID range invalid, ignoring: %s", range); + return 0; + } + } else { + shift = rvalue; + rn = UINT32_C(0x10000); + } + + r = parse_uid(shift, &sh); + if (r < 0) { + /* Log the shift string that failed to parse; 'range' is NULL when no ':' was given. */ + log_syntax(unit, LOG_WARNING, filename, line, r, "UID/GID shift invalid, ignoring: %s", shift); + return 0; + } + + if (!userns_shift_range_valid(sh, rn)) { + /* Log the whole value, since the combination is at fault and 'range' may be NULL. */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "UID/GID shift and range combination invalid,
ignoring: %s", rvalue); + return 0; + } + + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = sh; + settings->uid_range = rn; + } + + return 0; +} + +int config_parse_syscall_filter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + bool negative; + const char *items; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + negative = rvalue[0] == '~'; + items = negative ? rvalue + 1 : rvalue; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&items, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue); + return 0; + } + + if (negative) + r = strv_extend(&settings->syscall_deny_list, word); + else + r = strv_extend(&settings->syscall_allow_list, word); + if (r < 0) + return log_oom(); + } +} + +int config_parse_oom_score_adjust( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = ASSERT_PTR(data); + int oa, r; + + assert(rvalue); + + if (isempty(rvalue)) { + settings->oom_score_adjust_set = false; + return 0; + } + + r = parse_oom_score_adjust(rvalue, &oa); + if (r == -ERANGE) { + log_syntax(unit, LOG_WARNING, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue); + return 0; + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value, ignoring: %s", rvalue); + return 0; + } + + settings->oom_score_adjust = oa; + settings->oom_score_adjust_set = true; + + return 0; +} + +int config_parse_cpu_affinity( + const
char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = ASSERT_PTR(data); + + assert(rvalue); + + return parse_cpu_set_extend(rvalue, &settings->cpu_set, true, unit, filename, line, lvalue); +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_resolv_conf, resolv_conf_mode, ResolvConfMode, "Failed to parse resolv.conf mode"); + +static const char *const resolv_conf_mode_table[_RESOLV_CONF_MODE_MAX] = { + [RESOLV_CONF_OFF] = "off", + [RESOLV_CONF_COPY_HOST] = "copy-host", + [RESOLV_CONF_COPY_STATIC] = "copy-static", + [RESOLV_CONF_COPY_UPLINK] = "copy-uplink", + [RESOLV_CONF_COPY_STUB] = "copy-stub", + [RESOLV_CONF_REPLACE_HOST] = "replace-host", + [RESOLV_CONF_REPLACE_STATIC] = "replace-static", + [RESOLV_CONF_REPLACE_UPLINK] = "replace-uplink", + [RESOLV_CONF_REPLACE_STUB] = "replace-stub", + [RESOLV_CONF_BIND_HOST] = "bind-host", + [RESOLV_CONF_BIND_STATIC] = "bind-static", + [RESOLV_CONF_BIND_UPLINK] = "bind-uplink", + [RESOLV_CONF_BIND_STUB] = "bind-stub", + [RESOLV_CONF_DELETE] = "delete", + [RESOLV_CONF_AUTO] = "auto", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(resolv_conf_mode, ResolvConfMode, RESOLV_CONF_AUTO); + +int parse_link_journal(const char *s, LinkJournal *ret_mode, bool *ret_try) { + int r; + + assert(s); + assert(ret_mode); + assert(ret_try); + + if (streq(s, "auto")) { + *ret_mode = LINK_AUTO; + *ret_try = false; + } else if (streq(s, "guest")) { + *ret_mode = LINK_GUEST; + *ret_try = false; + } else if (streq(s, "host")) { + *ret_mode = LINK_HOST; + *ret_try = false; + } else if (streq(s, "try-guest")) { + *ret_mode = LINK_GUEST; + *ret_try = true; + } else if (streq(s, "try-host")) { + *ret_mode = LINK_HOST; + *ret_try = true; + } else { + /* Also support boolean values, to make things less confusing. 
*/ + r = parse_boolean(s); + if (r < 0) + return r; + + /* Let's consider "true" to be equivalent to "auto". */ + *ret_mode = r ? LINK_AUTO : LINK_NO; + *ret_try = false; + } + + return 0; +} + +int config_parse_link_journal( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = parse_link_journal(rvalue, &settings->link_journal, &settings->link_journal_try); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse link journal mode, ignoring: %s", rvalue); + + return 0; +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_timezone, timezone_mode, TimezoneMode, "Failed to parse timezone mode"); + +static const char *const timezone_mode_table[_TIMEZONE_MODE_MAX] = { + [TIMEZONE_OFF] = "off", + [TIMEZONE_COPY] = "copy", + [TIMEZONE_BIND] = "bind", + [TIMEZONE_SYMLINK] = "symlink", + [TIMEZONE_DELETE] = "delete", + [TIMEZONE_AUTO] = "auto", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(timezone_mode, TimezoneMode, TIMEZONE_AUTO); + +DEFINE_CONFIG_PARSE_ENUM(config_parse_userns_ownership, user_namespace_ownership, UserNamespaceOwnership, "Failed to parse user namespace ownership mode"); + +static const char *const user_namespace_ownership_table[_USER_NAMESPACE_OWNERSHIP_MAX] = { + [USER_NAMESPACE_OWNERSHIP_OFF] = "off", + [USER_NAMESPACE_OWNERSHIP_CHOWN] = "chown", + [USER_NAMESPACE_OWNERSHIP_MAP] = "map", + [USER_NAMESPACE_OWNERSHIP_AUTO] = "auto", +}; + +DEFINE_STRING_TABLE_LOOKUP(user_namespace_ownership, UserNamespaceOwnership); + +int config_parse_userns_chown( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + UserNamespaceOwnership *ownership = ASSERT_PTR(data); + int r; + + 
assert(rvalue); + + /* Compatibility support for UserNamespaceChown=, whose job has been taken over by UserNamespaceOwnership= */ + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse user namespace ownership mode, ignoring: %s", rvalue); + return 0; + } + + *ownership = r ? USER_NAMESPACE_OWNERSHIP_CHOWN : USER_NAMESPACE_OWNERSHIP_OFF; + return 0; +} + +int config_parse_bind_user( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***bind_user = ASSERT_PTR(data); + int r; + + assert(rvalue); + + if (isempty(rvalue)) { + *bind_user = strv_free(*bind_user); + return 0; + } + + for (const char* p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse BindUser= list, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + if (!valid_user_group_name(word, 0)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "User name '%s' not valid, ignoring.", word); + return 0; + } + + if (strv_consume(bind_user, TAKE_PTR(word)) < 0) + return log_oom(); + } + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h new file mode 100644 index 0000000..8edf8a3 --- /dev/null +++ b/src/nspawn/nspawn-settings.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-bus.h" +#include "sd-id128.h" + +#include "capability-util.h" +#include "conf-parser.h" +#include "cpu-set-util.h" +#include "macro.h" +#include "missing_resource.h" +#include "nspawn-expose-ports.h" +#include "nspawn-mount.h" +#include "seccomp-util.h" +#include "time-util.h" + +typedef enum StartMode { + START_PID1, /* Run parameters as 
/* Bit mask with one bit per setting. Bits 0…32 each name a single setting; starting with bit 33
 * (SETTING_RLIMIT_FIRST) there is one bit per resource limit, _RLIMIT_MAX bits in total, the last of
 * which is SETTING_RLIMIT_LAST. */
typedef enum SettingsMask {
        SETTING_START_MODE        = UINT64_C(1) << 0,
        SETTING_ENVIRONMENT       = UINT64_C(1) << 1,
        SETTING_USER              = UINT64_C(1) << 2,
        SETTING_CAPABILITY        = UINT64_C(1) << 3,
        SETTING_KILL_SIGNAL       = UINT64_C(1) << 4,
        SETTING_PERSONALITY       = UINT64_C(1) << 5,
        SETTING_MACHINE_ID        = UINT64_C(1) << 6,
        SETTING_NETWORK           = UINT64_C(1) << 7,
        SETTING_EXPOSE_PORTS      = UINT64_C(1) << 8,
        SETTING_READ_ONLY         = UINT64_C(1) << 9,
        SETTING_VOLATILE_MODE     = UINT64_C(1) << 10,
        SETTING_CUSTOM_MOUNTS     = UINT64_C(1) << 11,
        SETTING_WORKING_DIRECTORY = UINT64_C(1) << 12,
        SETTING_USERNS            = UINT64_C(1) << 13,
        SETTING_NOTIFY_READY      = UINT64_C(1) << 14,
        SETTING_PIVOT_ROOT        = UINT64_C(1) << 15,
        SETTING_SYSCALL_FILTER    = UINT64_C(1) << 16,
        SETTING_HOSTNAME          = UINT64_C(1) << 17,
        SETTING_NO_NEW_PRIVILEGES = UINT64_C(1) << 18,
        SETTING_OOM_SCORE_ADJUST  = UINT64_C(1) << 19,
        SETTING_CPU_AFFINITY      = UINT64_C(1) << 20,
        SETTING_RESOLV_CONF       = UINT64_C(1) << 21,
        SETTING_LINK_JOURNAL      = UINT64_C(1) << 22,
        SETTING_TIMEZONE          = UINT64_C(1) << 23,
        SETTING_EPHEMERAL         = UINT64_C(1) << 24,
        SETTING_SLICE             = UINT64_C(1) << 25,
        SETTING_DIRECTORY         = UINT64_C(1) << 26,
        SETTING_USE_CGNS          = UINT64_C(1) << 27,
        SETTING_CLONE_NS_FLAGS    = UINT64_C(1) << 28,
        SETTING_CONSOLE_MODE      = UINT64_C(1) << 29,
        SETTING_CREDENTIALS       = UINT64_C(1) << 30,
        SETTING_BIND_USER         = UINT64_C(1) << 31,
        SETTING_SUPPRESS_SYNC     = UINT64_C(1) << 32,
        SETTING_RLIMIT_FIRST      = UINT64_C(1) << 33, /* we define one bit per resource limit here */
        SETTING_RLIMIT_LAST       = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
        /* Every bit from 0 up to and including SETTING_RLIMIT_LAST */
        _SETTINGS_MASK_ALL        = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
        /* NOTE(review): presumably present only so that the enum's underlying type is 64 bits wide (the
         * header asserts sizeof(SettingsMask) == 8) — confirm */
        _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
} SettingsMask;
/* Collected container settings. The fields are grouped by the settings file section they belong to,
 * followed by fields only used for the OCI runtime case. Objects are created with settings_new() or
 * settings_load() and released with settings_free(). */
typedef struct Settings {
        /* [Exec] */
        StartMode start_mode;
        int ephemeral;
        char **parameters;
        char **environment;
        char *user;
        uint64_t capability;
        uint64_t drop_capability;
        uint64_t ambient_capability;
        int kill_signal;
        unsigned long personality;
        sd_id128_t machine_id;
        char *working_directory;
        char *pivot_root_new;
        char *pivot_root_old;
        /* The next three are written together by config_parse_private_users(). uid_shift is UID_INVALID
         * unless "identity" or an explicit shift was configured. */
        UserNamespaceMode userns_mode;
        uid_t uid_shift, uid_range;
        int notify_ready;
        /* Filled by config_parse_syscall_filter(): words of a value starting with '~' are appended to the
         * deny list, all others to the allow list. */
        char **syscall_allow_list;
        char **syscall_deny_list;
        struct rlimit *rlimit[_RLIMIT_MAX];
        char *hostname;
        int no_new_privileges;
        /* config_parse_oom_score_adjust() sets the flag whenever it stores a value and clears it on an
         * empty assignment. */
        int oom_score_adjust;
        bool oom_score_adjust_set;
        CPUSet cpu_set;
        ResolvConfMode resolv_conf;
        /* link_journal_try is true for the "try-guest"/"try-host" spellings, see parse_link_journal() */
        LinkJournal link_journal;
        bool link_journal_try;
        TimezoneMode timezone;
        int suppress_sync;

        /* [Files] */
        int read_only;
        VolatileMode volatile_mode;
        CustomMount *custom_mounts;
        size_t n_custom_mounts;
        UserNamespaceOwnership userns_ownership;
        char **bind_user;

        /* [Network] */
        int private_network;
        int network_veth;
        char *network_bridge;
        char *network_zone;
        char **network_interfaces;
        char **network_macvlan;
        char **network_ipvlan;
        char **network_veth_extra;
        ExposePort *expose_ports;

        /* Additional fields, that are specific to OCI runtime case */
        char *bundle;
        char *root;
        OciHook *oci_hooks_prestart, *oci_hooks_poststart, *oci_hooks_poststop;
        size_t n_oci_hooks_prestart, n_oci_hooks_poststart, n_oci_hooks_poststop;
        char *slice;
        sd_bus_message *properties;
        CapabilityQuintet full_capabilities;
        uid_t uid;
        gid_t gid;
        gid_t *supplementary_gids;
        size_t n_supplementary_gids;
        unsigned console_width, console_height;
        ConsoleMode console_mode;
        DeviceNode *extra_nodes;
        size_t n_extra_nodes;
        unsigned long clone_ns_flags;
        char *network_namespace_path;
        int use_cgns;
        char **sysctl;
#if HAVE_SECCOMP
        /* Only present when built with seccomp support */
        scmp_filter_ctx seccomp;
#endif
} Settings;
+CONFIG_PARSER_PROTOTYPE(config_parse_link_journal); +CONFIG_PARSER_PROTOTYPE(config_parse_timezone); +CONFIG_PARSER_PROTOTYPE(config_parse_userns_chown); +CONFIG_PARSER_PROTOTYPE(config_parse_userns_ownership); +CONFIG_PARSER_PROTOTYPE(config_parse_bind_user); + +const char *resolv_conf_mode_to_string(ResolvConfMode a) _const_; +ResolvConfMode resolv_conf_mode_from_string(const char *s) _pure_; + +const char *timezone_mode_to_string(TimezoneMode a) _const_; +TimezoneMode timezone_mode_from_string(const char *s) _pure_; + +const char *user_namespace_ownership_to_string(UserNamespaceOwnership a) _const_; +UserNamespaceOwnership user_namespace_ownership_from_string(const char *s) _pure_; + +int parse_link_journal(const char *s, LinkJournal *ret_mode, bool *ret_try); + +void device_node_array_free(DeviceNode *node, size_t n); diff --git a/src/nspawn/nspawn-setuid.c b/src/nspawn/nspawn-setuid.c new file mode 100644 index 0000000..2d67c3d --- /dev/null +++ b/src/nspawn/nspawn-setuid.c @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "constants.h" +#include "errno.h" +#include "fd-util.h" +#include "fileio.h" +#include "mkdir.h" +#include "nspawn-setuid.h" +#include "process-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static int spawn_getent(const char *database, const char *key, pid_t *rpid) { + int pipe_fds[2], r; + pid_t pid; + + assert(database); + assert(key); + assert(rpid); + + if (pipe2(pipe_fds, O_CLOEXEC) < 0) + return log_error_errno(errno, "Failed to allocate pipe: %m"); + + r = safe_fork_full("(getent)", + (int[]) { -EBADF, pipe_fds[1], -EBADF }, NULL, 0, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, + &pid); + if (r < 0) { + safe_close_pair(pipe_fds); + return r; + } + if (r == 0) { + execle("/usr/bin/getent", "getent", database, key, NULL, 
&(char*[1]){}); + execle("/bin/getent", "getent", database, key, NULL, &(char*[1]){}); + _exit(EXIT_FAILURE); + } + + pipe_fds[1] = safe_close(pipe_fds[1]); + + *rpid = pid; + + return pipe_fds[0]; +} + +int change_uid_gid_raw( + uid_t uid, + gid_t gid, + const gid_t *supplementary_gids, + size_t n_supplementary_gids, + bool chown_stdio) { + + if (!uid_is_valid(uid)) + uid = 0; + if (!gid_is_valid(gid)) + gid = 0; + + if (chown_stdio) { + (void) fchown(STDIN_FILENO, uid, gid); + (void) fchown(STDOUT_FILENO, uid, gid); + (void) fchown(STDERR_FILENO, uid, gid); + } + + if (setgroups(n_supplementary_gids, supplementary_gids) < 0) + return log_error_errno(errno, "Failed to set auxiliary groups: %m"); + + if (setresgid(gid, gid, gid) < 0) + return log_error_errno(errno, "setresgid() failed: %m"); + + if (setresuid(uid, uid, uid) < 0) + return log_error_errno(errno, "setresuid() failed: %m"); + + return 0; +} + +int change_uid_gid(const char *user, bool chown_stdio, char **ret_home) { + char *x, *u, *g, *h; + _cleanup_free_ gid_t *gids = NULL; + _cleanup_free_ char *home = NULL, *line = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -EBADF; + unsigned n_gids = 0; + uid_t uid; + gid_t gid; + pid_t pid; + int r; + + assert(ret_home); + + if (!user || STR_IN_SET(user, "root", "0")) { + /* Reset everything fully to 0, just in case */ + + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Failed to become root: %m"); + + *ret_home = NULL; + return 0; + } + + /* First, get user credentials */ + fd = spawn_getent("passwd", user, &pid); + if (fd < 0) + return fd; + + f = take_fdopen(&fd, "r"); + if (!f) + return log_oom(); + + r = read_line(f, LONG_LINE_MAX, &line); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "Failed to resolve user %s.", user); + if (r < 0) + return log_error_errno(r, "Failed to read from getent: %m"); + + (void) wait_for_terminate_and_check("getent passwd", pid, WAIT_LOG); + + x = strchr(line, ':'); + if 
(!x) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "/etc/passwd entry has invalid user field."); + + u = strchr(x+1, ':'); + if (!u) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "/etc/passwd entry has invalid password field."); + + u++; + g = strchr(u, ':'); + if (!g) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "/etc/passwd entry has invalid UID field."); + + *g = 0; + g++; + x = strchr(g, ':'); + if (!x) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "/etc/passwd entry has invalid GID field."); + + *x = 0; + h = strchr(x+1, ':'); + if (!h) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "/etc/passwd entry has invalid GECOS field."); + + h++; + x = strchr(h, ':'); + if (!x) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "/etc/passwd entry has invalid home directory field."); + + *x = 0; + + r = parse_uid(u, &uid); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to parse UID of user."); + + r = parse_gid(g, &gid); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to parse GID of user."); + + home = strdup(h); + if (!home) + return log_oom(); + + f = safe_fclose(f); + line = mfree(line); + + /* Second, get group memberships */ + fd = spawn_getent("initgroups", user, &pid); + if (fd < 0) + return fd; + + f = take_fdopen(&fd, "r"); + if (!f) + return log_oom(); + + r = read_line(f, LONG_LINE_MAX, &line); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "Failed to resolve user %s.", user); + if (r < 0) + return log_error_errno(r, "Failed to read from getent: %m"); + + (void) wait_for_terminate_and_check("getent initgroups", pid, WAIT_LOG); + + /* Skip over the username and subsequent separator whitespace */ + x = line; + x += strcspn(x, WHITESPACE); + x += strspn(x, WHITESPACE); + + for (const char *p = x;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to parse group data from getent: %m"); + if (r == 0) + 
/* Tells the kernel, via prctl(PR_SET_MM), that this process' environment block is the 'length' bytes
 * starting at 'new_environment'. The start address is set first, then the end address.
 *
 * Returns 0 on success, or -errno of the first prctl() call that fails. */
static int reset_environ(const char *new_environment, size_t length) {
        const unsigned long env_start = (unsigned long) new_environment;
        const struct {
                int option;
                unsigned long address;
        } bounds[] = {
                { PR_SET_MM_ENV_START, env_start          },
                { PR_SET_MM_ENV_END,   env_start + length },
        };

        for (size_t i = 0; i < sizeof(bounds) / sizeof(bounds[0]); i++)
                if (prctl(PR_SET_MM, bounds[i].option, bounds[i].address, 0, 0) < 0)
                        return -errno;

        return 0;
}
STATE_REBOOT, + STATE_POWEROFF, + } state = STATE_RUNNING; + + sigset_t fullmask, oldmask, waitmask; + usec_t quit_usec = USEC_INFINITY; + pid_t pid; + int r; + + /* The new environment we set up, on the stack. */ + char new_environment[] = + "container=systemd-nspawn\0" + "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; + + /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful + * for allowing arbitrary processes run in a container, and still have all zombies reaped. */ + + assert_se(sigfillset(&fullmask) >= 0); + assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0); + + pid = fork(); + if (pid < 0) + return log_error_errno(errno, "Failed to fork child pid: %m"); + + if (pid == 0) { + /* Return in the child */ + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0); + + if (setsid() < 0) + return log_error_errno(errno, "Failed to become session leader in payload process: %m"); + + return 0; + } + + reset_all_signal_handlers(); + + log_close(); + (void) close_all_fds(NULL, 0); + log_open(); + + if (ioctl(STDIN_FILENO, TIOCNOTTY) < 0) { + if (errno != ENOTTY) + log_warning_errno(errno, "Unexpected error from TIOCNOTTY ioctl in init stub process, ignoring: %m"); + } else + log_warning("Expected TIOCNOTTY to fail, but it succeeded in init stub process, ignoring."); + + /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also, + * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ + * find them set. 
*/ + sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX); + reset_environ(new_environment, sizeof(new_environment)); + + (void) rename_process("(sd-stubinit)"); + + assert_se(sigemptyset(&waitmask) >= 0); + assert_se(sigset_add_many(&waitmask, + SIGCHLD, /* posix: process died */ + SIGINT, /* sysv: ctrl-alt-del */ + SIGRTMIN+3, /* systemd: halt */ + SIGRTMIN+4, /* systemd: poweroff */ + SIGRTMIN+5, /* systemd: reboot */ + SIGRTMIN+6, /* systemd: kexec */ + SIGRTMIN+13, /* systemd: halt */ + SIGRTMIN+14, /* systemd: poweroff */ + SIGRTMIN+15, /* systemd: reboot */ + SIGRTMIN+16, /* systemd: kexec */ + -1) >= 0); + + /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't + * support reexec/reloading in this stub process. */ + + for (;;) { + siginfo_t si; + usec_t current_usec; + + si.si_pid = 0; + r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG); + if (r < 0) { + r = log_error_errno(errno, "Failed to reap children: %m"); + goto finish; + } + + current_usec = now(CLOCK_MONOTONIC); + + if (si.si_pid == pid || current_usec >= quit_usec) { + + /* The child we started ourselves died or we reached a timeout. */ + + if (state == STATE_REBOOT) { /* dispatch a queued reboot */ + (void) reboot(RB_AUTOBOOT); + r = log_error_errno(errno, "Failed to reboot: %m"); + goto finish; + + } else if (state == STATE_POWEROFF) + (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */ + + if (si.si_pid == pid && si.si_code == CLD_EXITED) + r = si.si_status; /* pass on exit code */ + else + r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */ + + goto finish; + } + if (si.si_pid != 0) + /* We reaped something. Retry until there's nothing more to reap. 
*/ + continue; + + if (quit_usec == USEC_INFINITY) + r = sigwaitinfo(&waitmask, &si); + else + r = sigtimedwait(&waitmask, &si, TIMESPEC_STORE(quit_usec - current_usec)); + if (r < 0) { + if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */ + continue; + if (errno == EAGAIN) /* timeout reached */ + continue; + + r = log_error_errno(errno, "Failed to wait for signal: %m"); + goto finish; + } + + if (si.si_signo == SIGCHLD) + continue; /* Let's reap this */ + + if (state != STATE_RUNNING) + continue; + + /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a + * constant… */ + + if (si.si_signo == SIGRTMIN+3 || + si.si_signo == SIGRTMIN+4 || + si.si_signo == SIGRTMIN+13 || + si.si_signo == SIGRTMIN+14) + + state = STATE_POWEROFF; + + else if (si.si_signo == SIGINT || + si.si_signo == SIGRTMIN+5 || + si.si_signo == SIGRTMIN+6 || + si.si_signo == SIGRTMIN+15 || + si.si_signo == SIGRTMIN+16) + + state = STATE_REBOOT; + else + assert_not_reached(); + + r = kill_and_sigcont(pid, SIGTERM); + + /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We + * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those + * processes which handle both. That's because services tend to bind configuration reload or something + * else to SIGHUP. */ + + if (r != -ESRCH) + (void) kill(pid, SIGHUP); + + quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC; + } + +finish: + _exit(r < 0 ? 
EXIT_FAILURE : r); +} diff --git a/src/nspawn/nspawn-stub-pid1.h b/src/nspawn/nspawn-stub-pid1.h new file mode 100644 index 0000000..e0810fe --- /dev/null +++ b/src/nspawn/nspawn-stub-pid1.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-id128.h" + +int stub_pid1(sd_id128_t uuid); diff --git a/src/nspawn/nspawn-util.c b/src/nspawn/nspawn-util.c new file mode 100644 index 0000000..6c38489 --- /dev/null +++ b/src/nspawn/nspawn-util.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "glob-util.h" +#include "log.h" +#include "nspawn-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "string-util.h" + +int systemd_installation_has_version(const char *root, const char *minimal_version) { + int r; + + /* Try to guess if systemd installation is later than the specified version. This + * is hacky and likely to yield false negatives, particularly if the installation + * is non-standard. False positives should be relatively rare. + */ + + FOREACH_STRING(pattern, + /* /lib works for systems without usr-merge, and for systems with a sane + * usr-merge, where /lib is a symlink to /usr/lib. /usr/lib is necessary + * for Gentoo which does a merge without making /lib a symlink. + * Also support multiarch paths von Debian/Ubuntu; *-linux-* is a small + * optimization based on the naming scheme of existing multiarch tuples. 
+ */ + "/lib/systemd/libsystemd-shared-*.so", + "/lib64/systemd/libsystemd-shared-*.so", + "/usr/lib/*-linux-*/systemd/libsystemd-shared-*.so", + "/usr/lib/systemd/libsystemd-shared-*.so", + "/usr/lib64/systemd/libsystemd-shared-*.so") { + + _cleanup_strv_free_ char **names = NULL; + _cleanup_free_ char *path = NULL; + char *c; + + path = path_join(root, pattern); + if (!path) + return -ENOMEM; + + r = glob_extend(&names, path, 0); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + + assert_se(c = endswith(path, "*.so")); + *c = '\0'; /* truncate the glob part */ + + STRV_FOREACH(name, names) { + _cleanup_free_ char *bn = NULL; + /* This is most likely to run only once, hence let's not optimize anything. */ + char *t, *t2; + + if (path_extract_filename(*name, &bn) < 0) + continue; + + t = startswith(bn, "libsystemd-shared-"); + if (!t) + continue; + + t2 = endswith(t, ".so"); + if (!t2) + continue; + *t2 = '\0'; + + r = strverscmp_improved(t, minimal_version); + log_debug("Found libsystemd shared at \"%s.so\", version %s (%s).", + *name, t, + r >= 0 ? 
"OK" : "too old"); + if (r >= 0) + return true; + } + } + + return false; +} diff --git a/src/nspawn/nspawn-util.h b/src/nspawn/nspawn-util.h new file mode 100644 index 0000000..e83cd56 --- /dev/null +++ b/src/nspawn/nspawn-util.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int systemd_installation_has_version(const char *root, const char *minimal_version); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c new file mode 100644 index 0000000..e46cc1c --- /dev/null +++ b/src/nspawn/nspawn.c @@ -0,0 +1,5870 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_BLKID +#endif +#include +#include +#include +#include +#if HAVE_SELINUX +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-daemon.h" +#include "sd-id128.h" + +#include "alloc-util.h" +#include "ether-addr-util.h" +#include "barrier.h" +#include "base-filesystem.h" +#include "blkid-util.h" +#include "btrfs-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "chase.h" +#include "common-signal.h" +#include "copy.h" +#include "cpu-set-util.h" +#include "creds-util.h" +#include "dev-setup.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "env-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fdset.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "gpt.h" +#include "hexdecoct.h" +#include "hostname-setup.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "log.h" +#include "loop-util.h" +#include "loopback-setup.h" +#include "machine-credential.h" +#include "macro.h" +#include "main-func.h" +#include "missing_sched.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include 
"netlink-util.h" +#include "nspawn-bind-user.h" +#include "nspawn-cgroup.h" +#include "nspawn-def.h" +#include "nspawn-expose-ports.h" +#include "nspawn-mount.h" +#include "nspawn-network.h" +#include "nspawn-oci.h" +#include "nspawn-patch-uid.h" +#include "nspawn-register.h" +#include "nspawn-seccomp.h" +#include "nspawn-settings.h" +#include "nspawn-setuid.h" +#include "nspawn-stub-pid1.h" +#include "nspawn-util.h" +#include "nspawn.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "ptyfwd.h" +#include "random-util.h" +#include "raw-clone.h" +#include "resolve-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "seccomp-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "sysctl-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "unit-name.h" +#include "user-util.h" + +/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */ +#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify" +#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming" + +#define EXIT_FORCE_RESTART 133 + +typedef enum ContainerStatus { + CONTAINER_TERMINATED, + CONTAINER_REBOOTED, +} ContainerStatus; + +static char *arg_directory = NULL; +static char *arg_template = NULL; +static char *arg_chdir = NULL; +static char *arg_pivot_root_new = NULL; +static char *arg_pivot_root_old = NULL; +static char *arg_user = NULL; +static uid_t arg_uid = UID_INVALID; +static gid_t arg_gid = GID_INVALID; +static gid_t* arg_supplementary_gids = NULL; +static size_t arg_n_supplementary_gids = 0; +static sd_id128_t arg_uuid = {}; +static char *arg_machine = NULL; /* The name used by the host to refer to this */ +static 
char *arg_hostname = NULL; /* The name the payload sees by default */ +static const char *arg_selinux_context = NULL; +static const char *arg_selinux_apifs_context = NULL; +static char *arg_slice = NULL; +static bool arg_private_network = false; +static bool arg_read_only = false; +static StartMode arg_start_mode = START_PID1; +static bool arg_ephemeral = false; +static LinkJournal arg_link_journal = LINK_AUTO; +static bool arg_link_journal_try = false; +static uint64_t arg_caps_retain = + (1ULL << CAP_AUDIT_CONTROL) | + (1ULL << CAP_AUDIT_WRITE) | + (1ULL << CAP_CHOWN) | + (1ULL << CAP_DAC_OVERRIDE) | + (1ULL << CAP_DAC_READ_SEARCH) | + (1ULL << CAP_FOWNER) | + (1ULL << CAP_FSETID) | + (1ULL << CAP_IPC_OWNER) | + (1ULL << CAP_KILL) | + (1ULL << CAP_LEASE) | + (1ULL << CAP_LINUX_IMMUTABLE) | + (1ULL << CAP_MKNOD) | + (1ULL << CAP_NET_BIND_SERVICE) | + (1ULL << CAP_NET_BROADCAST) | + (1ULL << CAP_NET_RAW) | + (1ULL << CAP_SETFCAP) | + (1ULL << CAP_SETGID) | + (1ULL << CAP_SETPCAP) | + (1ULL << CAP_SETUID) | + (1ULL << CAP_SYS_ADMIN) | + (1ULL << CAP_SYS_BOOT) | + (1ULL << CAP_SYS_CHROOT) | + (1ULL << CAP_SYS_NICE) | + (1ULL << CAP_SYS_PTRACE) | + (1ULL << CAP_SYS_RESOURCE) | + (1ULL << CAP_SYS_TTY_CONFIG); +static uint64_t arg_caps_ambient = 0; +static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL; +static CustomMount *arg_custom_mounts = NULL; +static size_t arg_n_custom_mounts = 0; +static char **arg_setenv = NULL; +static bool arg_quiet = false; +static bool arg_register = true; +static bool arg_keep_unit = false; +static char **arg_network_interfaces = NULL; +static char **arg_network_macvlan = NULL; +static char **arg_network_ipvlan = NULL; +static bool arg_network_veth = false; +static char **arg_network_veth_extra = NULL; +static char *arg_network_bridge = NULL; +static char *arg_network_zone = NULL; +static char *arg_network_namespace_path = NULL; +struct ether_addr arg_network_provided_mac = {}; +static PagerFlags arg_pager_flags = 0; 
+static unsigned long arg_personality = PERSONALITY_INVALID; +static char *arg_image = NULL; +static char *arg_oci_bundle = NULL; +static VolatileMode arg_volatile_mode = VOLATILE_NO; +static ExposePort *arg_expose_ports = NULL; +static char **arg_property = NULL; +static sd_bus_message *arg_property_message = NULL; +static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; +static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; +static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID; +static int arg_kill_signal = 0; +static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN; +static SettingsMask arg_settings_mask = 0; +static int arg_settings_trusted = -1; +static char **arg_parameters = NULL; +static const char *arg_container_service_name = "systemd-nspawn"; +static bool arg_notify_ready = false; +static bool arg_use_cgns = true; +static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS; +static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP; +static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT; +static char **arg_syscall_allow_list = NULL; +static char **arg_syscall_deny_list = NULL; +#if HAVE_SECCOMP +static scmp_filter_ctx arg_seccomp = NULL; +#endif +static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {}; +static bool arg_no_new_privileges = false; +static int arg_oom_score_adjust = 0; +static bool arg_oom_score_adjust_set = false; +static CPUSet arg_cpu_set = {}; +static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO; +static TimezoneMode arg_timezone = TIMEZONE_AUTO; +static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX; +static DeviceNode* arg_extra_nodes = NULL; +static size_t arg_n_extra_nodes = 0; +static char **arg_sysctl = NULL; +static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; +static MachineCredential *arg_credentials = NULL; +static size_t arg_n_credentials = 0; +static char 
**arg_bind_user = NULL; +static bool arg_suppress_sync = false; +static char *arg_settings_filename = NULL; +static Architecture arg_architecture = _ARCHITECTURE_INVALID; +static ImagePolicy *arg_image_policy = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); +STATIC_DESTRUCTOR_REGISTER(arg_template, freep); +STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep); +STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep); +STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep); +STATIC_DESTRUCTOR_REGISTER(arg_user, freep); +STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep); +STATIC_DESTRUCTOR_REGISTER(arg_machine, freep); +STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep); +STATIC_DESTRUCTOR_REGISTER(arg_slice, freep); +STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep); +STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep); +STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp); +STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done); +STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep); +#if HAVE_SECCOMP +STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep); +#endif +STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset); +STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, 
freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +/* Applies one --console= mode string to arg_console_mode. The word help prints the accepted mode names and returns 0; an accepted mode sets SETTING_CONSOLE_MODE in arg_settings_mask and returns 1; any other string is logged with EINVAL and the log_error_errno() result is returned. */ static int handle_arg_console(const char *arg) { + if (streq(arg, "help")) { + puts("autopipe\n" + "interactive\n" + "passive\n" + "pipe\n" + "read-only"); + return 0; + } + + if (streq(arg, "interactive")) + arg_console_mode = CONSOLE_INTERACTIVE; + else if (streq(arg, "read-only")) + arg_console_mode = CONSOLE_READ_ONLY; + else if (streq(arg, "passive")) + arg_console_mode = CONSOLE_PASSIVE; + else if (streq(arg, "pipe")) { + /* Advisory only: pipe mode is still selected below when stdin and stdout are both TTYs. */ if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0) + log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE, + "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. " + "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. " + "Proceeding anyway."); + + arg_console_mode = CONSOLE_PIPE; + } else if (streq(arg, "autopipe")) { + /* autopipe: interactive when stdin and stdout are both TTYs, pipe otherwise. */ if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0) + arg_console_mode = CONSOLE_INTERACTIVE; + else + arg_console_mode = CONSOLE_PIPE; + } else + /* Report the string this function was given, not the getopt global optarg. */ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", arg); + + arg_settings_mask |= SETTING_CONSOLE_MODE; + return 1; +} + +/* Opens the pager and prints the usage text; returns log_oom() if the man page link cannot be built. */ static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("systemd-nspawn", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...]
[PATH] [ARGUMENTS...]\n\n" + "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n" + " -h --help Show this help\n" + " --version Print version string\n" + " -q --quiet Do not show status information\n" + " --no-pager Do not pipe output into a pager\n" + " --settings=BOOLEAN Load additional settings from .nspawn file\n\n" + "%3$sImage:%4$s\n" + " -D --directory=PATH Root directory for the container\n" + " --template=PATH Initialize root directory from template directory,\n" + " if missing\n" + " -x --ephemeral Run container with snapshot of root directory, and\n" + " remove it after exit\n" + " -i --image=PATH Root file system disk image (or device node) for\n" + " the container\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + " --oci-bundle=PATH OCI bundle directory\n" + " --read-only Mount the root directory read-only\n" + " --volatile[=MODE] Run the system in volatile mode\n" + " --root-hash=HASH Specify verity root hash for root disk image\n" + " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n" + " as a DER encoded PKCS7, either as a path to a file\n" + " or as an ASCII base64 encoded string prefixed by\n" + " 'base64:'\n" + " --verity-data=PATH Specify hash device for verity\n" + " --pivot-root=PATH[:PATH]\n" + " Pivot root to given directory in the container\n\n" + "%3$sExecution:%4$s\n" + " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n" + " -b --boot Boot up full system (i.e. 
invoke init)\n" + " --chdir=PATH Set working directory in the container\n" + " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n" + " -u --user=USER Run the command under specified user or UID\n" + " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" + " --notify-ready=BOOLEAN Receive notifications from the child init process\n" + " --suppress-sync=BOOLEAN\n" + " Suppress any form of disk data synchronization\n\n" + "%3$sSystem Identity:%4$s\n" + " -M --machine=NAME Set the machine name for the container\n" + " --hostname=NAME Override the hostname for the container\n" + " --uuid=UUID Set a specific machine UUID for the container\n\n" + "%3$sProperties:%4$s\n" + " -S --slice=SLICE Place the container in the specified slice\n" + " --property=NAME=VALUE Set scope unit property\n" + " --register=BOOLEAN Register container as machine\n" + " --keep-unit Do not register a scope for the machine, reuse\n" + " the service unit nspawn is running in\n\n" + "%3$sUser Namespacing:%4$s\n" + " --private-users=no Run without user namespacing\n" + " --private-users=yes|pick|identity\n" + " Run within user namespace, autoselect UID/GID range\n" + " --private-users=UIDBASE[:NUIDS]\n" + " Similar, but with user configured UID/GID range\n" + " --private-users-ownership=MODE\n" + " Adjust ('chown') or map ('map') OS tree ownership\n" + " to private UID/GID range\n" + " -U Equivalent to --private-users=pick and\n" + " --private-users-ownership=auto\n\n" + "%3$sNetworking:%4$s\n" + " --private-network Disable network in container\n" + " --network-interface=HOSTIF[:CONTAINERIF]\n" + " Assign an existing network interface to the\n" + " container\n" + " --network-macvlan=HOSTIF[:CONTAINERIF]\n" + " Create a macvlan network interface based on an\n" + " existing network interface to the container\n" + " --network-ipvlan=HOSTIF[:CONTAINERIF]\n" + " Create an ipvlan network interface based on an\n" + " existing network interface to the container\n" + " -n 
--network-veth Add a virtual Ethernet connection between host\n" + " and container\n" + " --network-veth-extra=HOSTIF[:CONTAINERIF]\n" + " Add an additional virtual Ethernet link between\n" + " host and container\n" + " --network-bridge=INTERFACE\n" + " Add a virtual Ethernet connection to the container\n" + " and attach it to an existing bridge on the host\n" + " --network-zone=NAME Similar, but attach the new interface to an\n" + " automatically managed bridge interface\n" + " --network-namespace-path=PATH\n" + " Set network namespace to the one represented by\n" + " the specified kernel namespace file node\n" + " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" + " Expose a container IP port on the host\n\n" + "%3$sSecurity:%4$s\n" + " --capability=CAP In addition to the default, retain specified\n" + " capability\n" + " --drop-capability=CAP Drop the specified capability from the default set\n" + " --ambient-capability=CAP\n" + " Sets the specified capability for the started\n" + " process.
Not useful if booting a machine.\n" + " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n" + " --system-call-filter=LIST|~LIST\n" + " Permit/prohibit specific system calls\n" + " -Z --selinux-context=SECLABEL\n" + " Set the SELinux security context to be used by\n" + " processes in the container\n" + " -L --selinux-apifs-context=SECLABEL\n" + " Set the SELinux security context to be used by\n" + " API/tmpfs file systems in the container\n\n" + "%3$sResources:%4$s\n" + " --rlimit=NAME=LIMIT Set a resource limit for the payload\n" + " --oom-score-adjust=VALUE\n" + " Adjust the OOM score value for the payload\n" + " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n" + " --personality=ARCH Pick personality for this container\n\n" + "%3$sIntegration:%4$s\n" + " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n" + " --timezone=MODE Select mode of /etc/localtime initialization\n" + " --link-journal=MODE Link up guest journal, one of no, auto, guest,\n" + " host, try-guest, try-host\n" + " -j Equivalent to --link-journal=try-guest\n\n" + "%3$sMounts:%4$s\n" + " --bind=PATH[:PATH[:OPTIONS]]\n" + " Bind mount a file or directory from the host into\n" + " the container\n" + " --bind-ro=PATH[:PATH[:OPTIONS]]\n" + " Similar, but creates a read-only bind mount\n" + " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n" + " it\n" + " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n" + " --overlay=PATH[:PATH...]:PATH\n" + " Create an overlay mount from the host to\n" + " the container\n" + " --overlay-ro=PATH[:PATH...]:PATH\n" + " Similar, but creates a read-only overlay mount\n" + " --bind-user=NAME Bind user from host to container\n\n" + "%3$sInput/Output:%4$s\n" + " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n" + " set up for the container.\n" + " -P --pipe Equivalent to --console=pipe\n\n" + "%3$sCredentials:%4$s\n" + " --set-credential=ID:VALUE\n" + "
Pass a credential with literal value to container.\n" + " --load-credential=ID:PATH\n" + " Load credential to pass to container from file or\n" + " AF_UNIX stream socket.\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +/* Validates arg_custom_mounts against the user namespace settings: when arg_userns_mode is not USER_NAMESPACE_NO, a custom mount onto the root directory is rejected with EINVAL if arg_userns_ownership is anything other than USER_NAMESPACE_OWNERSHIP_OFF, or if arg_uid_shift is still UID_INVALID. Returns 0 when no entry is rejected. */ static int custom_mount_check_all(void) { + size_t i; + + for (i = 0; i < arg_n_custom_mounts; i++) { + CustomMount *m = &arg_custom_mounts[i]; + + if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) { + if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--private-users-ownership=own may not be combined with custom root mounts."); + if (arg_uid_shift == UID_INVALID) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--private-users with automatic UID shift may not be combined with custom root mounts."); + } + } + + return 0; +} + +/* Lets $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY, or $UNIFIED_CGROUP_HIERARCHY when the former is unset, choose arg_unified_cgroup_hierarchy: a true value selects CGROUP_UNIFIED_ALL, a false value CGROUP_UNIFIED_NONE, and an unset or empty variable changes nothing. Returns 0, or the log_error_errno() result when the value does not parse as a boolean. */ static int detect_unified_cgroup_hierarchy_from_environment(void) { + const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY"; + int r; + + /* Allow the user to control whether the unified hierarchy is used */ + + e = getenv(var); + if (!e) { + /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */ + var = "UNIFIED_CGROUP_HIERARCHY"; + e = getenv(var); + } + + if (!isempty(e)) { + r = parse_boolean(e); + if (r < 0) + return log_error_errno(r, "Failed to parse $%s: %m", var); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + } + + return 0; +} + +/* Chooses arg_unified_cgroup_hierarchy for the container from the cgroup mode of the host and the systemd version found under directory. */ static int detect_unified_cgroup_hierarchy_from_image(const char *directory) { + int r; + + /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd + * in the image actually supports.
*/ + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether we are in all unified mode."); + if (r > 0) { + /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection + * routine only detects 231, so we'll have a false negative here for 230. */ + r = systemd_installation_has_version(directory, "230"); + if (r < 0) + return log_error_errno(r, "Failed to determine systemd version in container: %m"); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { + /* Mixed cgroup hierarchy support was added in 233 */ + r = systemd_installation_has_version(directory, "233"); + if (r < 0) + return log_error_errno(r, "Failed to determine systemd version in container: %m"); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + } else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + + log_debug("Using %s hierarchy for container.", + arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" : + arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? 
"hybrid" : "unified"); + + return 0; +} + +static int parse_capability_spec(const char *spec, uint64_t *ret_mask) { + uint64_t mask = 0; + int r; + + for (;;) { + _cleanup_free_ char *t = NULL; + + r = extract_first_word(&spec, &t, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse capability %s.", t); + if (r == 0) + break; + + if (streq(t, "help")) { + for (int i = 0; i < capability_list_length(); i++) { + const char *name; + + name = capability_to_name(i); + if (name) + puts(name); + } + + return 0; /* quit */ + } + + if (streq(t, "all")) + mask = UINT64_MAX; + else { + r = capability_from_name(t); + if (r < 0) + return log_error_errno(r, "Failed to parse capability %s.", t); + + mask |= 1ULL << r; + } + } + + *ret_mask = mask; + return 1; /* continue */ +} + +static int parse_share_ns_env(const char *name, unsigned long ns_flag) { + int r; + + r = getenv_bool(name); + if (r == -ENXIO) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to parse $%s: %m", name); + + arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 
0 : ns_flag); + arg_settings_mask |= SETTING_CLONE_NS_FLAGS; + return 0; +} + +/* Adjusts arg_mount_settings from the environment. $SYSTEMD_NSPAWN_TMPFS_TMP is a boolean that is stored into MOUNT_APPLY_TMPFS_TMP. $SYSTEMD_NSPAWN_API_VFS_WRITABLE is either the word network, which ORs in MOUNT_APPLY_APIVFS_RO and MOUNT_APPLY_APIVFS_NETNS, or a boolean whose negation is stored into MOUNT_APPLY_APIVFS_RO while MOUNT_APPLY_APIVFS_NETNS is stored as false. A -ENXIO result for the first variable or a missing second variable leaves the flags alone. Returns 0, or the log_error_errno() result on a parse failure. NOTE(review): assumes SET_FLAG sets the flag when its last argument is true and clears it otherwise; confirm against its definition. */ static int parse_mount_settings_env(void) { + const char *e; + int r; + + r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP"); + if (r < 0 && r != -ENXIO) + return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m"); + if (r >= 0) + SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0); + + e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE"); + if (streq_ptr(e, "network")) + arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS; + + else if (e) { + r = parse_boolean(e); + if (r < 0) + return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m"); + + SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0); + SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false); + } + + return 0; +} + +/* Applies the environment variable overrides in sequence, starting with the namespace sharing variables and the mount settings; the first negative result from a helper is returned as is. */ static int parse_environment(void) { + const char *e; + int r; + + r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC); + if (r < 0) + return r; + r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID); + if (r < 0) + return r; + r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS); + if (r < 0) + return r; + r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS); + if (r < 0) + return r; + + r = parse_mount_settings_env(); + if (r < 0) + return r; + + /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use, + * even if it is supported. If not supported, it has no effect.
*/ + if (!cg_ns_supported()) + arg_use_cgns = false; + else { + r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS"); + if (r < 0) { + if (r != -ENXIO) + return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m"); + + arg_use_cgns = true; + } else { + arg_use_cgns = r > 0; + arg_settings_mask |= SETTING_USE_CGNS; + } + } + + e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE"); + if (e) + arg_container_service_name = e; + + e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC"); + if (e) { + r = parse_ether_addr(e, &arg_network_provided_mac); + if (r < 0) + return log_error_errno(r, "Failed to parse provided MAC address via environment variable"); + } + + r = getenv_bool("SYSTEMD_SUPPRESS_SYNC"); + if (r >= 0) + arg_suppress_sync = r; + else if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m"); + + return detect_unified_cgroup_hierarchy_from_environment(); +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_PRIVATE_NETWORK, + ARG_UUID, + ARG_READ_ONLY, + ARG_CAPABILITY, + ARG_AMBIENT_CAPABILITY, + ARG_DROP_CAPABILITY, + ARG_LINK_JOURNAL, + ARG_BIND, + ARG_BIND_RO, + ARG_TMPFS, + ARG_OVERLAY, + ARG_OVERLAY_RO, + ARG_INACCESSIBLE, + ARG_SHARE_SYSTEM, + ARG_REGISTER, + ARG_KEEP_UNIT, + ARG_NETWORK_INTERFACE, + ARG_NETWORK_MACVLAN, + ARG_NETWORK_IPVLAN, + ARG_NETWORK_BRIDGE, + ARG_NETWORK_ZONE, + ARG_NETWORK_VETH_EXTRA, + ARG_NETWORK_NAMESPACE_PATH, + ARG_PERSONALITY, + ARG_VOLATILE, + ARG_TEMPLATE, + ARG_PROPERTY, + ARG_PRIVATE_USERS, + ARG_KILL_SIGNAL, + ARG_SETTINGS, + ARG_CHDIR, + ARG_PIVOT_ROOT, + ARG_PRIVATE_USERS_CHOWN, + ARG_PRIVATE_USERS_OWNERSHIP, + ARG_NOTIFY_READY, + ARG_ROOT_HASH, + ARG_ROOT_HASH_SIG, + ARG_VERITY_DATA, + ARG_SYSTEM_CALL_FILTER, + ARG_RLIMIT, + ARG_HOSTNAME, + ARG_NO_NEW_PRIVILEGES, + ARG_OOM_SCORE_ADJUST, + ARG_CPU_AFFINITY, + ARG_RESOLV_CONF, + ARG_TIMEZONE, + ARG_CONSOLE, + ARG_PIPE, + ARG_OCI_BUNDLE, + ARG_NO_PAGER, + ARG_SET_CREDENTIAL, + ARG_LOAD_CREDENTIAL, + 
ARG_BIND_USER, + ARG_SUPPRESS_SYNC, + ARG_IMAGE_POLICY, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "directory", required_argument, NULL, 'D' }, + { "template", required_argument, NULL, ARG_TEMPLATE }, + { "ephemeral", no_argument, NULL, 'x' }, + { "user", required_argument, NULL, 'u' }, + { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK }, + { "as-pid2", no_argument, NULL, 'a' }, + { "boot", no_argument, NULL, 'b' }, + { "uuid", required_argument, NULL, ARG_UUID }, + { "read-only", no_argument, NULL, ARG_READ_ONLY }, + { "capability", required_argument, NULL, ARG_CAPABILITY }, + { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY }, + { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY }, + { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES }, + { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL }, + { "bind", required_argument, NULL, ARG_BIND }, + { "bind-ro", required_argument, NULL, ARG_BIND_RO }, + { "tmpfs", required_argument, NULL, ARG_TMPFS }, + { "overlay", required_argument, NULL, ARG_OVERLAY }, + { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, + { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE }, + { "machine", required_argument, NULL, 'M' }, + { "hostname", required_argument, NULL, ARG_HOSTNAME }, + { "slice", required_argument, NULL, 'S' }, + { "setenv", required_argument, NULL, 'E' }, + { "selinux-context", required_argument, NULL, 'Z' }, + { "selinux-apifs-context", required_argument, NULL, 'L' }, + { "quiet", no_argument, NULL, 'q' }, + { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */ + { "register", required_argument, NULL, ARG_REGISTER }, + { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT }, + { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE }, + { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN 
}, + { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN }, + { "network-veth", no_argument, NULL, 'n' }, + { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA }, + { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE }, + { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE }, + { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH }, + { "personality", required_argument, NULL, ARG_PERSONALITY }, + { "image", required_argument, NULL, 'i' }, + { "volatile", optional_argument, NULL, ARG_VOLATILE }, + { "port", required_argument, NULL, 'p' }, + { "property", required_argument, NULL, ARG_PROPERTY }, + { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, + { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */ + { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP}, + { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, + { "settings", required_argument, NULL, ARG_SETTINGS }, + { "chdir", required_argument, NULL, ARG_CHDIR }, + { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT }, + { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, + { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, + { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG }, + { "verity-data", required_argument, NULL, ARG_VERITY_DATA }, + { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, + { "rlimit", required_argument, NULL, ARG_RLIMIT }, + { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST }, + { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY }, + { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF }, + { "timezone", required_argument, NULL, ARG_TIMEZONE }, + { "console", required_argument, NULL, ARG_CONSOLE }, + { "pipe", no_argument, NULL, ARG_PIPE }, + { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE }, + { "no-pager", no_argument, NULL, 
ARG_NO_PAGER }, + { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL }, + { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL }, + { "bind-user", required_argument, NULL, ARG_BIND_USER }, + { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + {} + }; + + int c, r; + uint64_t plus = 0, minus = 0; + bool mask_all_settings = false, mask_no_settings = false; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'D': + r = parse_path_argument(optarg, false, &arg_directory); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + + case ARG_TEMPLATE: + r = parse_path_argument(optarg, false, &arg_template); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + + case 'i': + r = parse_path_argument(optarg, false, &arg_image); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + + case ARG_OCI_BUNDLE: + r = parse_path_argument(optarg, false, &arg_oci_bundle); + if (r < 0) + return r; + + break; + + case 'x': + arg_ephemeral = true; + arg_settings_mask |= SETTING_EPHEMERAL; + break; + + case 'u': + r = free_and_strdup(&arg_user, optarg); + if (r < 0) + return log_oom(); + + arg_settings_mask |= SETTING_USER; + break; + + case ARG_NETWORK_ZONE: { + _cleanup_free_ char *j = NULL; + + j = strjoin("vz-", optarg); + if (!j) + return log_oom(); + + if (!ifname_valid(j)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Network zone name not valid: %s", j); + + free_and_replace(arg_network_zone, j); + + arg_network_veth = true; + 
arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + } + + case ARG_NETWORK_BRIDGE: + + if (!ifname_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Bridge interface name not valid: %s", optarg); + + r = free_and_strdup(&arg_network_bridge, optarg); + if (r < 0) + return log_oom(); + + _fallthrough_; + case 'n': + arg_network_veth = true; + arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + + case ARG_NETWORK_VETH_EXTRA: + r = veth_extra_parse(&arg_network_veth_extra, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg); + + arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + + case ARG_NETWORK_INTERFACE: + r = interface_pair_parse(&arg_network_interfaces, optarg); + if (r < 0) + return r; + + arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + + case ARG_NETWORK_MACVLAN: + r = macvlan_pair_parse(&arg_network_macvlan, optarg); + if (r < 0) + return r; + + arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + + case ARG_NETWORK_IPVLAN: + r = ipvlan_pair_parse(&arg_network_ipvlan, optarg); + if (r < 0) + return r; + + _fallthrough_; + case ARG_PRIVATE_NETWORK: + arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + + case ARG_NETWORK_NAMESPACE_PATH: + r = parse_path_argument(optarg, false, &arg_network_namespace_path); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_NETWORK; + break; + + case 'b': + if (arg_start_mode == START_PID2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--boot and --as-pid2 may not be combined."); + + arg_start_mode = START_BOOT; + arg_settings_mask |= SETTING_START_MODE; + break; + + case 'a': + if (arg_start_mode == START_BOOT) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--boot and --as-pid2 may not be combined."); + + arg_start_mode = START_PID2; + arg_settings_mask |= 
SETTING_START_MODE; + break; + + case ARG_UUID: + r = id128_from_string_nonzero(optarg, &arg_uuid); + if (r == -ENXIO) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Machine UUID may not be all zeroes."); + if (r < 0) + return log_error_errno(r, "Invalid UUID: %s", optarg); + + arg_settings_mask |= SETTING_MACHINE_ID; + break; + + case 'S': { + _cleanup_free_ char *mangled = NULL; + + r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled); + if (r < 0) + return log_oom(); + + free_and_replace(arg_slice, mangled); + arg_settings_mask |= SETTING_SLICE; + break; + } + + case 'M': + if (isempty(optarg)) + arg_machine = mfree(arg_machine); + else { + if (!hostname_is_valid(optarg, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid machine name: %s", optarg); + + r = free_and_strdup(&arg_machine, optarg); + if (r < 0) + return log_oom(); + } + break; + + case ARG_HOSTNAME: + if (isempty(optarg)) + arg_hostname = mfree(arg_hostname); + else { + if (!hostname_is_valid(optarg, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid hostname: %s", optarg); + + r = free_and_strdup(&arg_hostname, optarg); + if (r < 0) + return log_oom(); + } + + arg_settings_mask |= SETTING_HOSTNAME; + break; + + case 'Z': + arg_selinux_context = optarg; + break; + + case 'L': + arg_selinux_apifs_context = optarg; + break; + + case ARG_READ_ONLY: + arg_read_only = true; + arg_settings_mask |= SETTING_READ_ONLY; + break; + + case ARG_AMBIENT_CAPABILITY: { + uint64_t m; + r = parse_capability_spec(optarg, &m); + if (r <= 0) + return r; + arg_caps_ambient |= m; + arg_settings_mask |= SETTING_CAPABILITY; + break; + } + case ARG_CAPABILITY: + case ARG_DROP_CAPABILITY: { + uint64_t m; + r = parse_capability_spec(optarg, &m); + if (r <= 0) + return r; + + if (c == ARG_CAPABILITY) + plus |= m; + else + minus |= m; + arg_settings_mask |= SETTING_CAPABILITY; + break; + } + case ARG_NO_NEW_PRIVILEGES: + r = parse_boolean(optarg); + if (r < 
0) + return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg); + + arg_no_new_privileges = r; + arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES; + break; + + case 'j': + arg_link_journal = LINK_GUEST; + arg_link_journal_try = true; + arg_settings_mask |= SETTING_LINK_JOURNAL; + break; + + case ARG_LINK_JOURNAL: + r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try); + if (r < 0) + return log_error_errno(r, "Failed to parse link journal mode %s", optarg); + + arg_settings_mask |= SETTING_LINK_JOURNAL; + break; + + case ARG_BIND: + case ARG_BIND_RO: + r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO); + if (r < 0) + return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; + break; + + case ARG_TMPFS: + r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; + break; + + case ARG_OVERLAY: + case ARG_OVERLAY_RO: + r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO); + if (r == -EADDRNOTAVAIL) + return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified."); + if (r < 0) + return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; + break; + + case ARG_INACCESSIBLE: + r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; + break; + + case 'E': + r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg); + if (r < 0) + return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg); + + arg_settings_mask |= 
SETTING_ENVIRONMENT; + break; + + case 'q': + arg_quiet = true; + break; + + case ARG_SHARE_SYSTEM: + /* We don't officially support this anymore, except for compat reasons. People should use the + * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */ + log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead."); + arg_clone_ns_flags = 0; + break; + + case ARG_REGISTER: + r = parse_boolean(optarg); + if (r < 0) { + log_error("Failed to parse --register= argument: %s", optarg); + return r; + } + + arg_register = r; + break; + + case ARG_KEEP_UNIT: + arg_keep_unit = true; + break; + + case ARG_PERSONALITY: + + arg_personality = personality_from_string(optarg); + if (arg_personality == PERSONALITY_INVALID) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown or unsupported personality '%s'.", optarg); + + arg_settings_mask |= SETTING_PERSONALITY; + break; + + case ARG_VOLATILE: + + if (!optarg) + arg_volatile_mode = VOLATILE_YES; + else if (streq(optarg, "help")) { + DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX); + return 0; + } else { + VolatileMode m; + + m = volatile_mode_from_string(optarg); + if (m < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse --volatile= argument: %s", optarg); + else + arg_volatile_mode = m; + } + + arg_settings_mask |= SETTING_VOLATILE_MODE; + break; + + case 'p': + r = expose_port_parse(&arg_expose_ports, optarg); + if (r == -EEXIST) + return log_error_errno(r, "Duplicate port specification: %s", optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse host port %s: %m", optarg); + + arg_settings_mask |= SETTING_EXPOSE_PORTS; + break; + + case ARG_PROPERTY: + if (strv_extend(&arg_property, optarg) < 0) + return log_oom(); + + break; + + case ARG_PRIVATE_USERS: { + int boolean; + + if (!optarg) + boolean = true; + else if (!in_charset(optarg, DIGITS)) + /* do *not* parse numbers as booleans */ + boolean = parse_boolean(optarg); + else + 
boolean = -1; + + if (boolean == 0) { + /* no: User namespacing off */ + arg_userns_mode = USER_NAMESPACE_NO; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (boolean > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + arg_userns_mode = USER_NAMESPACE_FIXED; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (streq(optarg, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is + * implied by USER_NAMESPACE_PICK + * further down. */ + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + } else if (streq(optarg, "identity")) { + /* identity: User namespaces on, UID range is map the 0…0xFFFF range to + * itself, i.e. we don't actually map anything, but do take benefit of + * isolation of capability sets. */ + arg_userns_mode = USER_NAMESPACE_FIXED; + arg_uid_shift = 0; + arg_uid_range = UINT32_C(0x10000); + } else { + _cleanup_free_ char *buffer = NULL; + const char *range, *shift; + + /* anything else: User namespacing on, UID range is explicitly configured */ + + range = strchr(optarg, ':'); + if (range) { + buffer = strndup(optarg, range - optarg); + if (!buffer) + return log_oom(); + shift = buffer; + + range++; + r = safe_atou32(range, &arg_uid_range); + if (r < 0) + return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range); + } else + shift = optarg; + + r = parse_uid(shift, &arg_uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg); + + arg_userns_mode = USER_NAMESPACE_FIXED; + + if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID); + } + + arg_settings_mask |= SETTING_USERNS; + break; + } + + case 'U': + if (userns_supported()) { + arg_userns_mode = USER_NAMESPACE_PICK; /* Note that 
arg_userns_ownership is + * implied by USER_NAMESPACE_PICK + * further down. */ + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + arg_settings_mask |= SETTING_USERNS; + } + + break; + + case ARG_PRIVATE_USERS_CHOWN: + arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN; + + arg_settings_mask |= SETTING_USERNS; + break; + + case ARG_PRIVATE_USERS_OWNERSHIP: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX); + return 0; + } + + arg_userns_ownership = user_namespace_ownership_from_string(optarg); + if (arg_userns_ownership < 0) + return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg); + + arg_settings_mask |= SETTING_USERNS; + break; + + case ARG_KILL_SIGNAL: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(signal, int, _NSIG); + return 0; + } + + arg_kill_signal = signal_from_string(optarg); + if (arg_kill_signal < 0) + return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg); + + arg_settings_mask |= SETTING_KILL_SIGNAL; + break; + + case ARG_SETTINGS: + + /* no → do not read files + * yes → read files, do not override cmdline, trust only subset + * override → read files, override cmdline, trust only subset + * trusted → read files, do not override cmdline, trust all + */ + + r = parse_boolean(optarg); + if (r < 0) { + if (streq(optarg, "trusted")) { + mask_all_settings = false; + mask_no_settings = false; + arg_settings_trusted = true; + + } else if (streq(optarg, "override")) { + mask_all_settings = false; + mask_no_settings = true; + arg_settings_trusted = -1; + } else + return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg); + } else if (r > 0) { + /* yes */ + mask_all_settings = false; + mask_no_settings = false; + arg_settings_trusted = -1; + } else { + /* no */ + mask_all_settings = true; + mask_no_settings = false; + arg_settings_trusted = false; + } + + break; 
+ + case ARG_CHDIR: + if (!path_is_absolute(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Working directory %s is not an absolute path.", optarg); + + r = free_and_strdup(&arg_chdir, optarg); + if (r < 0) + return log_oom(); + + arg_settings_mask |= SETTING_WORKING_DIRECTORY; + break; + + case ARG_PIVOT_ROOT: + r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_PIVOT_ROOT; + break; + + case ARG_NOTIFY_READY: + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg); + arg_notify_ready = r; + arg_settings_mask |= SETTING_NOTIFY_READY; + break; + + case ARG_ROOT_HASH: { + _cleanup_free_ void *k = NULL; + size_t l; + + r = unhexmem(optarg, strlen(optarg), &k, &l); + if (r < 0) + return log_error_errno(r, "Failed to parse root hash: %s", optarg); + if (l < sizeof(sd_id128_t)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg); + + free_and_replace(arg_verity_settings.root_hash, k); + arg_verity_settings.root_hash_size = l; + break; + } + + case ARG_ROOT_HASH_SIG: { + char *value; + size_t l; + void *p; + + if ((value = startswith(optarg, "base64:"))) { + r = unbase64mem(value, strlen(value), &p, &l); + if (r < 0) + return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg); + + } else { + r = read_full_file(optarg, (char**) &p, &l); + if (r < 0) + return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg); + } + + free_and_replace(arg_verity_settings.root_hash_sig, p); + arg_verity_settings.root_hash_sig_size = l; + break; + } + + case ARG_VERITY_DATA: + r = parse_path_argument(optarg, false, &arg_verity_settings.data_path); + if (r < 0) + return r; + break; + + case ARG_SYSTEM_CALL_FILTER: 
{ + bool negative; + const char *items; + + negative = optarg[0] == '~'; + items = negative ? optarg + 1 : optarg; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&items, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to parse system call filter: %m"); + + if (negative) + r = strv_extend(&arg_syscall_deny_list, word); + else + r = strv_extend(&arg_syscall_allow_list, word); + if (r < 0) + return log_oom(); + } + + arg_settings_mask |= SETTING_SYSCALL_FILTER; + break; + } + + case ARG_RLIMIT: { + const char *eq; + _cleanup_free_ char *name = NULL; + int rl; + + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX); + return 0; + } + + eq = strchr(optarg, '='); + if (!eq) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--rlimit= expects an '=' assignment."); + + name = strndup(optarg, eq - optarg); + if (!name) + return log_oom(); + + rl = rlimit_from_string_harder(name); + if (rl < 0) + return log_error_errno(rl, "Unknown resource limit: %s", name); + + if (!arg_rlimit[rl]) { + arg_rlimit[rl] = new0(struct rlimit, 1); + if (!arg_rlimit[rl]) + return log_oom(); + } + + r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]); + if (r < 0) + return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1); + + arg_settings_mask |= SETTING_RLIMIT_FIRST << rl; + break; + } + + case ARG_OOM_SCORE_ADJUST: + r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust); + if (r < 0) + return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg); + + arg_oom_score_adjust_set = true; + arg_settings_mask |= SETTING_OOM_SCORE_ADJUST; + break; + + case ARG_CPU_AFFINITY: { + CPUSet cpuset; + + r = parse_cpu_set(optarg, &cpuset); + if (r < 0) + return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg); + + cpu_set_reset(&arg_cpu_set); + arg_cpu_set = cpuset; + arg_settings_mask |= SETTING_CPU_AFFINITY; + 
break; + } + + case ARG_RESOLV_CONF: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX); + return 0; + } + + arg_resolv_conf = resolv_conf_mode_from_string(optarg); + if (arg_resolv_conf < 0) + return log_error_errno(arg_resolv_conf, + "Failed to parse /etc/resolv.conf mode: %s", optarg); + + arg_settings_mask |= SETTING_RESOLV_CONF; + break; + + case ARG_TIMEZONE: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX); + return 0; + } + + arg_timezone = timezone_mode_from_string(optarg); + if (arg_timezone < 0) + return log_error_errno(arg_timezone, + "Failed to parse /etc/localtime mode: %s", optarg); + + arg_settings_mask |= SETTING_TIMEZONE; + break; + + case ARG_CONSOLE: + r = handle_arg_console(optarg); + if (r <= 0) + return r; + break; + + case 'P': + case ARG_PIPE: + r = handle_arg_console("pipe"); + if (r <= 0) + return r; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_SET_CREDENTIAL: + r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_CREDENTIALS; + break; + + case ARG_LOAD_CREDENTIAL: + r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_CREDENTIALS; + break; + + case ARG_BIND_USER: + if (!valid_user_group_name(optarg, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg); + + if (strv_extend(&arg_bind_user, optarg) < 0) + return log_oom(); + + arg_settings_mask |= SETTING_BIND_USER; + break; + + case ARG_SUPPRESS_SYNC: + r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_SUPPRESS_SYNC; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case '?': 
+ return -EINVAL; + + default: + assert_not_reached(); + } + + if (argc > optind) { + strv_free(arg_parameters); + arg_parameters = strv_copy(argv + optind); + if (!arg_parameters) + return log_oom(); + + arg_settings_mask |= SETTING_START_MODE; + } + + if (arg_ephemeral && arg_template && !arg_directory) + /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically + * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's + * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral + * --directory=". */ + arg_directory = TAKE_PTR(arg_template); + + arg_caps_retain |= plus; + arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0; + arg_caps_retain &= ~minus; + + /* Make sure to parse environment before we reset the settings mask below */ + r = parse_environment(); + if (r < 0) + return r; + + /* Load all settings from .nspawn files */ + if (mask_no_settings) + arg_settings_mask = 0; + + /* Don't load any settings from .nspawn files */ + if (mask_all_settings) + arg_settings_mask = _SETTINGS_MASK_ALL; + + return 1; +} + +static int verify_arguments(void) { + int r; + + if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { + /* If we are running the stub init in the container, we don't need to look at what the init + * in the container supports, because we are not using it. Let's immediately pick the right + * setting based on the host system configuration. + * + * We only do this, if the user didn't use an environment variable to override the detection. 
+ */ + + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether we are in all unified mode."); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + } + + if (arg_userns_mode != USER_NAMESPACE_NO) + arg_mount_settings |= MOUNT_USE_USERNS; + + if (arg_private_network) + arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS; + + if (!(arg_clone_ns_flags & CLONE_NEWPID) || + !(arg_clone_ns_flags & CLONE_NEWUTS)) { + arg_register = false; + if (arg_start_mode != START_PID1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing."); + } + + if (arg_userns_ownership < 0) + arg_userns_ownership = + arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO : + USER_NAMESPACE_OWNERSHIP_OFF; + + if (arg_start_mode == START_BOOT && arg_kill_signal <= 0) + arg_kill_signal = SIGRTMIN+3; + + if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */ + arg_read_only = true; + + if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) + arg_read_only = true; + + if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) + /* Save the user from accidentally registering either user-$SESSION.scope or user@.service. + * The latter is not technically a user session, but we don't need to labour the point. 
*/ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session."); + + if (arg_directory && arg_image) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined."); + + if (arg_template && arg_image) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined."); + + if (arg_template && !(arg_directory || arg_machine)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=."); + + if (arg_ephemeral && arg_template) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined."); + + /* Permit --ephemeral with --link-journal=try-* to satisfy principle of the least astonishment + * (by common sense, "try" means "do not fail if not possible") */ + if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO) && !arg_link_journal_try) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal={host,guest} may not be combined."); + + if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support."); + + if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--read-only and --private-users-ownership=chown may not be combined."); + + /* We don't support --private-users-ownership=chown together with any of the volatile modes since we + * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a + * massive copy-up (in case of overlay) making the entire exercise pointless. 
*/ + if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined."); + + /* If --network-namespace-path is given with any other network-related option (except --private-network), + * we need to error out, to avoid conflicts between different network options. */ + if (arg_network_namespace_path && + (arg_network_interfaces || arg_network_macvlan || + arg_network_ipvlan || arg_network_veth_extra || + arg_network_bridge || arg_network_zone || + arg_network_veth)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options."); + + if (arg_network_bridge && arg_network_zone) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--network-bridge= and --network-zone= may not be combined."); + + if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. 
Mounting sysfs with --private-users requires --private-network."); + + if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts."); + + if (arg_expose_ports && !arg_private_network) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking."); + + if (arg_caps_ambient) { + if (arg_caps_ambient == UINT64_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all."); + + if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting."); + + if (arg_start_mode == START_BOOT) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode."); + } + + if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users"); + + /* Drop duplicate --bind-user= entries */ + strv_uniq(arg_bind_user); + + r = custom_mount_check_all(); + if (r < 0) + return r; + + return 0; +} + +static int verify_network_interfaces_initialized(void) { + int r; + r = test_network_interfaces_initialized(arg_network_interfaces); + if (r < 0) + return r; + + r = test_network_interfaces_initialized(arg_network_macvlan); + if (r < 0) + return r; + + r = test_network_interfaces_initialized(arg_network_ipvlan); + if (r < 0) + return r; + + return 0; +} + +int userns_lchown(const char *p, uid_t uid, gid_t gid) { + assert(p); + + if (arg_userns_mode == USER_NAMESPACE_NO) + return 0; + + if (uid == UID_INVALID && gid == GID_INVALID) + return 0; + + if (uid != UID_INVALID) { + uid += arg_uid_shift; + + if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range) + return -EOVERFLOW; + } + + if (gid != GID_INVALID) { + gid += 
(gid_t) arg_uid_shift; + + if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range)) + return -EOVERFLOW; + } + + return RET_NERRNO(lchown(p, uid, gid)); +} + +int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) { + const char *q; + int r; + + q = prefix_roota(root, path); + r = RET_NERRNO(mkdir(q, mode)); + if (r == -EEXIST) + return 0; + if (r < 0) + return r; + + return userns_lchown(q, uid, gid); +} + +static const char *timezone_from_path(const char *path) { + return PATH_STARTSWITH_SET( + path, + "../usr/share/zoneinfo/", + "/usr/share/zoneinfo/"); +} + +static bool etc_writable(void) { + return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY); +} + +static int setup_timezone(const char *dest) { + _cleanup_free_ char *p = NULL, *etc = NULL; + const char *where, *check; + TimezoneMode m; + int r; + + assert(dest); + + if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) { + r = readlink_malloc("/etc/localtime", &p); + if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO) + m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF; + else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */ + m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND; + else if (r < 0) { + log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m"); + /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data + * file. + * + * Example: + * ln -s /usr/share/zoneinfo/UTC /etc/localtime + */ + return 0; + } else if (arg_timezone == TIMEZONE_AUTO) + m = etc_writable() ? 
TIMEZONE_SYMLINK : TIMEZONE_BIND; + else + m = arg_timezone; + } else + m = arg_timezone; + + if (m == TIMEZONE_OFF) + return 0; + + r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL); + if (r < 0) { + log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m"); + return 0; + } + + where = strjoina(etc, "/localtime"); + + switch (m) { + + case TIMEZONE_DELETE: + if (unlink(where) < 0) + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where); + + return 0; + + case TIMEZONE_SYMLINK: { + _cleanup_free_ char *q = NULL; + const char *z, *what; + + z = timezone_from_path(p); + if (!z) { + log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone."); + return 0; + } + + r = readlink_malloc(where, &q); + if (r >= 0 && streq_ptr(timezone_from_path(q), z)) + return 0; /* Already pointing to the right place? Then do nothing .. */ + + check = strjoina(dest, "/usr/share/zoneinfo/", z); + r = chase(check, dest, 0, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z); + else { + if (unlink(where) < 0 && errno != ENOENT) { + log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */ + errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where); + return 0; + } + + what = strjoina("../usr/share/zoneinfo/", z); + if (symlink(what, where) < 0) { + log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? 
LOG_DEBUG : LOG_WARNING, + errno, "Failed to correct timezone of container, ignoring: %m"); + return 0; + } + + break; + } + + _fallthrough_; + } + + case TIMEZONE_BIND: { + _cleanup_free_ char *resolved = NULL; + int found; + + found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL); + if (found < 0) { + log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m"); + return 0; + } + + if (found == 0) /* missing? */ + (void) touch(resolved); + + r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL); + if (r >= 0) + return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL); + + _fallthrough_; + } + + case TIMEZONE_COPY: + /* If mounting failed, try to copy */ + r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE); + if (r < 0) { + log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to copy /etc/localtime to %s, ignoring: %m", where); + return 0; + } + + break; + + default: + assert_not_reached(); + } + + /* Fix permissions of the symlink or file copy we just created */ + r = userns_lchown(where, 0, 0); + if (r < 0) + log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m"); + + return 0; +} + +static int have_resolv_conf(const char *path) { + assert(path); + + if (access(path, F_OK) < 0) { + if (errno == ENOENT) + return 0; + + return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path); + } + + return 1; +} + +static int resolved_listening(void) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *dns_stub_listener_mode = NULL; + int r; + + /* Check if resolved is listening */ + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_debug_errno(r, "Failed to open system bus: %m"); + + r = bus_name_has_owner(bus, 
"org.freedesktop.resolve1", NULL); + if (r < 0) + return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m"); + if (r == 0) + return 0; + + r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode); + if (r < 0) + return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r)); + + return STR_IN_SET(dns_stub_listener_mode, "udp", "yes"); +} + +static int setup_resolv_conf(const char *dest) { + _cleanup_free_ char *etc = NULL; + const char *where, *what; + ResolvConfMode m; + int r; + + assert(dest); + + if (arg_resolv_conf == RESOLV_CONF_AUTO) { + if (arg_private_network) + m = RESOLV_CONF_OFF; + else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0) + m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB; + else if (have_resolv_conf("/etc/resolv.conf") > 0) + m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST; + else + m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF; + + } else + m = arg_resolv_conf; + + if (m == RESOLV_CONF_OFF) + return 0; + + r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL); + if (r < 0) { + log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m"); + return 0; + } + + where = strjoina(etc, "/resolv.conf"); + + if (m == RESOLV_CONF_DELETE) { + if (unlink(where) < 0) + log_full_errno(errno == ENOENT ? 
LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where); + + return 0; + } + + if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC)) + what = PRIVATE_STATIC_RESOLV_CONF; + else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK)) + what = PRIVATE_UPLINK_RESOLV_CONF; + else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB)) + what = PRIVATE_STUB_RESOLV_CONF; + else + what = "/etc/resolv.conf"; + + if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) { + _cleanup_free_ char *resolved = NULL; + int found; + + found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL); + if (found < 0) { + log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m"); + return 0; + } + + if (found == 0) /* missing? */ + (void) touch(resolved); + + r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL); + if (r >= 0) + return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL); + + /* If that didn't work, let's copy the file */ + } + + if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB)) + r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE); + else + r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK); + if (r < 0) { + /* If the file already exists as symlink, let's suppress the warning, under the assumption that + * resolved or something similar runs inside and the symlink points there. + * + * If the disk image is read-only, there's also no point in complaining. + */ + log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) && + IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where); + return 0; + } + + r = userns_lchown(where, 0, 0); + if (r < 0) + log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m"); + + return 0; +} + +static int setup_boot_id(void) { + _cleanup_(unlink_and_freep) char *from = NULL; + _cleanup_free_ char *path = NULL; + sd_id128_t rnd = SD_ID128_NULL; + const char *to; + int r; + + /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */ + + r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path); + if (r < 0) + return log_error_errno(r, "Failed to generate random boot ID path: %m"); + + r = sd_id128_randomize(&rnd); + if (r < 0) + return log_error_errno(r, "Failed to generate random boot id: %m"); + + r = id128_write(path, ID128_FORMAT_UUID, rnd); + if (r < 0) + return log_error_errno(r, "Failed to write boot id: %m"); + + from = TAKE_PTR(path); + to = "/proc/sys/kernel/random/boot_id"; + + r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL); + if (r < 0) + return r; + + return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); +} + +static int copy_devnodes(const char *dest) { + static const char devnodes[] = + "null\0" + "zero\0" + "full\0" + "random\0" + "urandom\0" + "tty\0" + "net/tun\0"; + + int r = 0; + + assert(dest); + + BLOCK_WITH_UMASK(0000); + + /* Create /dev/net, so that we can create /dev/net/tun in it */ + if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0) + return log_error_errno(r, "Failed to create /dev/net directory: %m"); + + NULSTR_FOREACH(d, devnodes) { + _cleanup_free_ char *from = NULL, *to = NULL; + struct stat st; + + from = path_join("/dev/", d); + if (!from) + return log_oom(); + + to = path_join(dest, from); + if (!to) + return log_oom(); + + if (stat(from, &st) < 0) { + + if (errno != ENOENT) + return log_error_errno(errno, "Failed to stat %s: %m", 
from); + + } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "%s is not a char or block device, cannot copy.", from); + else { + _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL; + + if (mknod(to, st.st_mode, st.st_rdev) < 0) { + /* Explicitly warn the user when /dev is already populated. */ + if (errno == EEXIST) + log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest); + if (errno != EPERM) + return log_error_errno(errno, "mknod(%s) failed: %m", to); + + /* Some systems abusively restrict mknod but allow bind mounts. */ + r = touch(to); + if (r < 0) + return log_error_errno(r, "touch (%s) failed: %m", to); + r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL); + if (r < 0) + return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to); + } + + r = userns_lchown(to, 0, 0); + if (r < 0) + return log_error_errno(r, "chown() of device node %s failed: %m", to); + + dn = path_join("/dev", S_ISCHR(st.st_mode) ? 
"char" : "block"); + if (!dn) + return log_oom(); + + r = userns_mkdir(dest, dn, 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create '%s': %m", dn); + + if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0) + return log_oom(); + + prefixed = path_join(dest, sl); + if (!prefixed) + return log_oom(); + + t = path_join("..", d); + if (!t) + return log_oom(); + + if (symlink(t, prefixed) < 0) + log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed); + } + } + + return r; +} + +static int make_extra_nodes(const char *dest) { + size_t i; + int r; + + BLOCK_WITH_UMASK(0000); + + for (i = 0; i < arg_n_extra_nodes; i++) { + _cleanup_free_ char *path = NULL; + DeviceNode *n = arg_extra_nodes + i; + + path = path_join(dest, n->path); + if (!path) + return log_oom(); + + if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0) + return log_error_errno(errno, "Failed to create device node '%s': %m", path); + + r = chmod_and_chown(path, n->mode, n->uid, n->gid); + if (r < 0) + return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path); + } + + return 0; +} + +static int setup_pts(const char *dest) { + _cleanup_free_ char *options = NULL; + const char *p; + int r; + +#if HAVE_SELINUX + if (arg_selinux_apifs_context) + (void) asprintf(&options, + "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"", + arg_uid_shift + TTY_GID, + arg_selinux_apifs_context); + else +#endif + (void) asprintf(&options, + "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT, + arg_uid_shift + TTY_GID); + + if (!options) + return log_oom(); + + /* Mount /dev/pts itself */ + p = prefix_roota(dest, "/dev/pts"); + r = RET_NERRNO(mkdir(p, 0755)); + if (r < 0) + return log_error_errno(r, "Failed to create /dev/pts: %m"); + + r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options); + if (r < 0) + return r; + r = userns_lchown(p, 0, 
0); + if (r < 0) + return log_error_errno(r, "Failed to chown /dev/pts: %m"); + + /* Create /dev/ptmx symlink */ + p = prefix_roota(dest, "/dev/ptmx"); + if (symlink("pts/ptmx", p) < 0) + return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m"); + r = userns_lchown(p, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to chown /dev/ptmx: %m"); + + /* And fix /dev/pts/ptmx ownership */ + p = prefix_roota(dest, "/dev/pts/ptmx"); + r = userns_lchown(p, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m"); + + return 0; +} + +static int setup_stdio_as_dev_console(void) { + _cleanup_close_ int terminal = -EBADF; + int r; + + /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later + * explicitly, if we are configured to. */ + terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY); + if (terminal < 0) + return log_error_errno(terminal, "Failed to open console: %m"); + + /* Make sure we can continue logging to the original stderr, even if + * stderr points elsewhere now */ + r = log_dup_console(); + if (r < 0) + return log_error_errno(r, "Failed to duplicate stderr: %m"); + + /* invalidates 'terminal' on success and failure */ + r = rearrange_stdio(terminal, terminal, terminal); + TAKE_FD(terminal); + if (r < 0) + return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m"); + + return 0; +} + +static int setup_dev_console(const char *console) { + _cleanup_free_ char *p = NULL; + int r; + + /* Create /dev/console symlink */ + r = path_make_relative("/dev", console, &p); + if (r < 0) + return log_error_errno(r, "Failed to create relative path: %m"); + + if (symlink(p, "/dev/console") < 0) + return log_error_errno(errno, "Failed to create /dev/console symlink: %m"); + + return 0; +} + +static int setup_keyring(void) { + key_serial_t keyring; + + /* Allocate a new session keyring for the container. 
This makes sure the keyring of the session + * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block + * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary, + * but in case people explicitly allow-list these system calls let's make sure we don't leak anything + * into the container. */ + + keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0); + if (keyring == -1) { + if (errno == ENOSYS) + log_debug_errno(errno, "Kernel keyring not supported, ignoring."); + else if (ERRNO_IS_PRIVILEGE(errno)) + log_debug_errno(errno, "Kernel keyring access prohibited, ignoring."); + else + return log_error_errno(errno, "Setting up kernel keyring failed: %m"); + } + + return 0; +} + +static int setup_credentials(const char *root) { + const char *q; + int r; + + if (arg_n_credentials <= 0) + return 0; + + r = userns_mkdir(root, "/run/host", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host: %m"); + + r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host/credentials: %m"); + + q = prefix_roota(root, "/run/host/credentials"); + r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700"); + if (r < 0) + return r; + + for (size_t i = 0; i < arg_n_credentials; i++) { + _cleanup_free_ char *j = NULL; + _cleanup_close_ int fd = -EBADF; + + j = path_join(q, arg_credentials[i].id); + if (!j) + return log_oom(); + + fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600); + if (fd < 0) + return log_error_errno(errno, "Failed to create credential file %s: %m", j); + + r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size); + if (r < 0) + return log_error_errno(r, "Failed to write credential to file %s: %m", j); + + if (fchmod(fd, 0400) < 0) + return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j); + + if (arg_userns_mode 
!= USER_NAMESPACE_NO) { + if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0) + return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j); + } + } + + if (chmod(q, 0500) < 0) + return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q); + + r = userns_lchown(q, 0, 0); + if (r < 0) + return r; + + /* Make both mount and superblock read-only now */ + r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) + return r; + + return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500"); +} + +static int setup_kmsg(int fd_inner_socket) { + _cleanup_(unlink_and_freep) char *from = NULL; + _cleanup_free_ char *fifo = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(fd_inner_socket >= 0); + + BLOCK_WITH_UMASK(0000); + + /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to + * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves + * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems + * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. 
*/ + + r = tempfn_random_child("/run", "proc-kmsg", &fifo); + if (r < 0) + return log_error_errno(r, "Failed to generate kmsg path: %m"); + + if (mkfifo(fifo, 0600) < 0) + return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m"); + + from = TAKE_PTR(fifo); + + r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL); + if (r < 0) + return r; + + fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open fifo: %m"); + + /* Store away the fd in the socket, so that it stays open as long as we run the child */ + r = send_one_fd(fd_inner_socket, fd, 0); + if (r < 0) + return log_error_errno(r, "Failed to send FIFO fd: %m"); + + return 0; +} + +struct ExposeArgs { + union in_addr_union address4; + union in_addr_union address6; + struct FirewallContext *fw_ctx; +}; + +static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { + struct ExposeArgs *args = ASSERT_PTR(userdata); + + assert(rtnl); + assert(m); + + (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4); + (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6); + return 0; +} + +static int setup_hostname(void) { + int r; + + if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0) + return 0; + + r = sethostname_idempotent(arg_hostname ?: arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to set hostname: %m"); + + return 0; +} + +static int setup_journal(const char *directory) { + _cleanup_free_ char *d = NULL; + const char *p, *q; + sd_id128_t this_id; + bool try; + int r; + + /* Don't link journals in ephemeral mode */ + if (arg_ephemeral) + return 0; + + if (arg_link_journal == LINK_NO) + return 0; + + try = arg_link_journal_try || arg_link_journal == LINK_AUTO; + + r = sd_id128_get_machine(&this_id); + if (r < 0) + return log_error_errno(r, "Failed to retrieve machine ID: %m"); + + if (sd_id128_equal(arg_uuid, this_id)) { + 
log_full(try ? LOG_WARNING : LOG_ERR, + "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid)); + if (try) + return 0; + return -EEXIST; + } + + FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") { + r = userns_mkdir(directory, dirname, 0755, 0, 0); + if (r < 0) { + bool ignore = r == -EROFS && try; + log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r, + "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : ""); + return ignore ? 0 : r; + } + } + + p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid)); + q = prefix_roota(directory, p); + + if (path_is_mount_point(p, NULL, 0) > 0) { + if (try) + return 0; + + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: already a mount point, refusing to use for journal", p); + } + + if (path_is_mount_point(q, NULL, 0) > 0) { + if (try) + return 0; + + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: already a mount point, refusing to use for journal", q); + } + + r = readlink_and_make_absolute(p, &d); + if (r >= 0) { + if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) && + path_equal(d, q)) { + + r = userns_mkdir(directory, p, 0755, 0, 0); + if (r < 0) + log_warning_errno(r, "Failed to create directory %s: %m", q); + return 0; + } + + if (unlink(p) < 0) + return log_error_errno(errno, "Failed to remove symlink %s: %m", p); + } else if (r == -EINVAL) { + + if (arg_link_journal == LINK_GUEST && + rmdir(p) < 0) { + + if (errno == ENOTDIR) { + log_error("%s already exists and is neither a symlink nor a directory", p); + return r; + } else + return log_error_errno(errno, "Failed to remove %s: %m", p); + } + } else if (r != -ENOENT) + return log_error_errno(r, "readlink(%s) failed: %m", p); + + if (arg_link_journal == LINK_GUEST) { + + if (symlink(q, p) < 0) { + if (try) { + log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p); + return 0; + } else + return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p); 
+ } + + r = userns_mkdir(directory, p, 0755, 0, 0); + if (r < 0) + log_warning_errno(r, "Failed to create directory %s: %m", q); + return 0; + } + + if (arg_link_journal == LINK_HOST) { + /* don't create parents here — if the host doesn't have + * permanent journal set up, don't force it here */ + + r = RET_NERRNO(mkdir(p, 0755)); + if (r < 0 && r != -EEXIST) { + if (try) { + log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p); + return 0; + } else + return log_error_errno(r, "Failed to create %s: %m", p); + } + + } else if (access(p, F_OK) < 0) + return 0; + + if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0) + log_warning("%s is not empty, proceeding anyway.", q); + + r = userns_mkdir(directory, p, 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s: %m", q); + + r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL); + if (r < 0) + return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m"); + + return 0; +} + +static int drop_capabilities(uid_t uid) { + CapabilityQuintet q; + + /* Let's initialize all five capability sets to something valid. If the quintet was configured via + * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from + * arg_caps_retain. */ + + if (capability_quintet_is_set(&arg_full_capabilities)) { + q = arg_full_capabilities; + + if (q.bounding == UINT64_MAX) + q.bounding = uid == 0 ? arg_caps_retain : 0; + + if (q.effective == UINT64_MAX) + q.effective = uid == 0 ? q.bounding : 0; + + if (q.inheritable == UINT64_MAX) + q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient; + + if (q.permitted == UINT64_MAX) + q.permitted = uid == 0 ? 
q.bounding : arg_caps_ambient; + + if (q.ambient == UINT64_MAX && ambient_capabilities_supported()) + q.ambient = arg_caps_ambient; + + if (capability_quintet_mangle(&q)) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set."); + + } else { + q = (CapabilityQuintet) { + .bounding = arg_caps_retain, + .effective = uid == 0 ? arg_caps_retain : 0, + .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient, + .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient, + .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX, + }; + + /* If we're not using OCI, proceed with mangled capabilities (so we don't error out) + * in order to maintain the same behavior as systemd < 242. */ + if (capability_quintet_mangle(&q)) + log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING, + "Some capabilities will not be set because they are not in the current bounding set."); + + } + + return capability_quintet_enforce(&q); +} + +static int reset_audit_loginuid(void) { + _cleanup_free_ char *p = NULL; + int r; + + if ((arg_clone_ns_flags & CLONE_NEWPID) == 0) + return 0; + + r = read_one_line_file("/proc/self/loginuid", &p); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to read /proc/self/loginuid: %m"); + + /* Already reset? */ + if (streq(p, "4294967295")) + return 0; + + r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) { + log_error_errno(r, + "Failed to reset audit login UID. This probably means that your kernel is too\n" + "old and you have audit enabled. Note that the auditing subsystem is known to\n" + "be incompatible with containers on old kernels. Please make sure to upgrade\n" + "your kernel or to off auditing with 'audit=0' on the kernel command line before\n" + "using systemd-nspawn. Sleeping for 5s... 
(%m)"); + + sleep(5); + } + + return 0; +} + +static int mount_tunnel_dig(const char *root) { + const char *p, *q; + int r; + + (void) mkdir_p("/run/systemd/nspawn/", 0755); + (void) mkdir_p("/run/systemd/nspawn/propagate", 0600); + p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); + (void) mkdir_p(p, 0600); + + r = userns_mkdir(root, "/run/host", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host: %m"); + + r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m"); + + q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL); + r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL); + if (r < 0) + return r; + + r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL); + if (r < 0) + return r; + + return 0; +} + +static int mount_tunnel_open(void) { + int r; + + r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL); + if (r < 0) + return r; + + return 0; +} + +static int setup_machine_id(const char *directory) { + int r; + + /* If the UUID in the container is already set, then that's what counts, and we use it. If it isn't set, and the + * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The + * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not + * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id + * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the + * container behaves nicely). 
*/ + + r = id128_get_machine(directory, &arg_uuid); + if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) { + /* If the file is missing, empty, or uninitialized, we don't mind */ + if (sd_id128_is_null(arg_uuid)) { + r = sd_id128_randomize(&arg_uuid); + if (r < 0) + return log_error_errno(r, "Failed to acquire randomized machine UUID: %m"); + } + } else if (r < 0) + return log_error_errno(r, "Failed to read machine ID from container image: %m"); + + return 0; +} + +static int recursive_chown(const char *directory, uid_t shift, uid_t range) { + int r; + + assert(directory); + + if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN) + return 0; + + r = path_patch_uid(directory, arg_uid_shift, arg_uid_range); + if (r == -EOPNOTSUPP) + return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16."); + if (r == -EBADE) + return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match."); + if (r < 0) + return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m"); + if (r == 0) + log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation."); + else + log_debug("Patched directory tree to match UID/GID range."); + + return r; +} + +/* + * Return values: + * < 0 : wait_for_terminate() failed to get the state of the + * container, the container was terminated by a signal, or + * failed for an unknown reason. No change is made to the + * container argument. + * > 0 : The program executed in the container terminated with an + * error. The exit code of the program executed in the + * container is returned. The container argument has been set + * to CONTAINER_TERMINATED. + * 0 : The container is being rebooted, has been shut down or exited + * successfully. The container argument has been set to either + * CONTAINER_TERMINATED or CONTAINER_REBOOTED. 
+ * + * That is, success is indicated by a return value of zero, and an + * error is indicated by a non-zero value. + */ +static int wait_for_container(pid_t pid, ContainerStatus *container) { + siginfo_t status; + int r; + + r = wait_for_terminate(pid, &status); + if (r < 0) + return log_warning_errno(r, "Failed to wait for container: %m"); + + switch (status.si_code) { + + case CLD_EXITED: + if (status.si_status == 0) + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine); + else + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status); + + *container = CONTAINER_TERMINATED; + return status.si_status; + + case CLD_KILLED: + if (status.si_status == SIGINT) { + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine); + *container = CONTAINER_TERMINATED; + return 0; + + } else if (status.si_status == SIGHUP) { + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine); + *container = CONTAINER_REBOOTED; + return 0; + } + + _fallthrough_; + case CLD_DUMPED: + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status)); + + default: + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Container %s failed due to unknown reason.", arg_machine); + } +} + +static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + pid_t pid; + + pid = PTR_TO_PID(userdata); + if (pid > 0) { + if (kill(pid, arg_kill_signal) >= 0) { + log_info("Trying to halt container. 
Send SIGTERM again to trigger immediate termination."); + sd_event_source_set_userdata(s, NULL); + return 0; + } + } + + sd_event_exit(sd_event_source_get_event(s), 0); + return 0; +} + +static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) { + pid_t pid; + + assert(s); + assert(ssi); + + pid = PTR_TO_PID(userdata); + + for (;;) { + siginfo_t si = {}; + + if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0) + return log_error_errno(errno, "Failed to waitid(): %m"); + if (si.si_pid == 0) /* No pending children. */ + break; + if (si.si_pid == pid) { + /* The main process we care for has exited. Return from + * signal handler but leave the zombie. */ + sd_event_exit(sd_event_source_get_event(s), 0); + break; + } + + /* Reap all other children. */ + (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED); + } + + return 0; +} + +static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) { + pid_t pid; + + assert(m); + + pid = PTR_TO_PID(userdata); + + if (arg_kill_signal > 0) { + log_info("Container termination requested. Attempting to halt container."); + (void) kill(pid, arg_kill_signal); + } else { + log_info("Container termination requested. Exiting."); + sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0); + } + + return 0; +} + +static int determine_names(void) { + int r; + + if (arg_template && !arg_directory && arg_machine) { + + /* If --template= was specified then we should not + * search for a machine, but instead create a new one + * in /var/lib/machines. 
*/ + + arg_directory = path_join("/var/lib/machines", arg_machine); + if (!arg_directory) + return log_oom(); + } + + if (!arg_image && !arg_directory) { + if (arg_machine) { + _cleanup_(image_unrefp) Image *i = NULL; + + r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i); + if (r == -ENOENT) + return log_error_errno(r, "No image for machine '%s'.", arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine); + + if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK)) + r = free_and_strdup(&arg_image, i->path); + else + r = free_and_strdup(&arg_directory, i->path); + if (r < 0) + return log_oom(); + + if (!arg_ephemeral) + arg_read_only = arg_read_only || i->read_only; + } else { + r = safe_getcwd(&arg_directory); + if (r < 0) + return log_error_errno(r, "Failed to determine current directory: %m"); + } + + if (!arg_directory && !arg_image) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i."); + } + + if (!arg_machine) { + if (arg_directory && path_equal(arg_directory, "/")) + arg_machine = gethostname_malloc(); + else if (arg_image) { + char *e; + + r = path_extract_filename(arg_image, &arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image); + + /* Truncate suffix if there is one */ + e = endswith(arg_machine, ".raw"); + if (e) + *e = 0; + } else { + r = path_extract_filename(arg_directory, &arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory); + } + + hostname_cleanup(arg_machine); + if (!hostname_is_valid(arg_machine, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M."); + + /* Copy the machine name before the random suffix is added below, otherwise we won't be able + * to match fixed config file names. 
*/ + arg_settings_filename = strjoin(arg_machine, ".nspawn"); + if (!arg_settings_filename) + return log_oom(); + + /* Add a random suffix when this is an ephemeral machine, so that we can run many + * instances at once without manually having to specify -M each time. */ + if (arg_ephemeral) + if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0) + return log_oom(); + } else { + arg_settings_filename = strjoin(arg_machine, ".nspawn"); + if (!arg_settings_filename) + return log_oom(); + } + + return 0; +} + +static int chase_and_update(char **p, unsigned flags) { + char *chased; + int r; + + assert(p); + + if (!*p) + return 0; + + r = chase(*p, NULL, flags, &chased, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve path %s: %m", *p); + + return free_and_replace(*p, chased); +} + +static int determine_uid_shift(const char *directory) { + + if (arg_userns_mode == USER_NAMESPACE_NO) { + arg_uid_shift = 0; + return 0; + } + + if (arg_uid_shift == UID_INVALID) { + struct stat st; + + /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */ + + if (stat(directory, &st) < 0) + return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory); + + arg_uid_shift = st.st_uid & UINT32_C(0xffff0000); + + if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "UID and GID base of %s don't match.", directory); + + arg_uid_range = UINT32_C(0x10000); + + if (arg_uid_shift != 0) { + /* If the image is shifted already, then we'll fall back to classic chowning, for + * compatibility (and simplicity), or refuse if mapping is explicitly requested. 
*/ + + if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) { + log_debug("UID base of %s is non-zero, not using UID mapping.", directory); + arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN; + } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "UID base of %s is not zero, UID mapping not supported.", directory); + } + } + + if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range."); + + return 0; +} + +static unsigned long effective_clone_ns_flags(void) { + unsigned long flags = arg_clone_ns_flags; + + if (arg_private_network) + flags |= CLONE_NEWNET; + if (arg_use_cgns) + flags |= CLONE_NEWCGROUP; + if (arg_userns_mode != USER_NAMESPACE_NO) + flags |= CLONE_NEWUSER; + + return flags; +} + +static int patch_sysctl(void) { + + /* This table is inspired by runc's sysctl() function */ + static const struct { + const char *key; + bool prefix; + unsigned long clone_flags; + } safe_sysctl[] = { + { "kernel.hostname", false, CLONE_NEWUTS }, + { "kernel.domainname", false, CLONE_NEWUTS }, + { "kernel.msgmax", false, CLONE_NEWIPC }, + { "kernel.msgmnb", false, CLONE_NEWIPC }, + { "kernel.msgmni", false, CLONE_NEWIPC }, + { "kernel.sem", false, CLONE_NEWIPC }, + { "kernel.shmall", false, CLONE_NEWIPC }, + { "kernel.shmmax", false, CLONE_NEWIPC }, + { "kernel.shmmni", false, CLONE_NEWIPC }, + { "fs.mqueue.", true, CLONE_NEWIPC }, + { "net.", true, CLONE_NEWNET }, + }; + + unsigned long flags; + int r; + + flags = effective_clone_ns_flags(); + + STRV_FOREACH_PAIR(k, v, arg_sysctl) { + bool good = false; + size_t i; + + for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) { + + if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags)) + continue; + + if (safe_sysctl[i].prefix) + good = startswith(*k, safe_sysctl[i].key); + else + good = streq(*k, safe_sysctl[i].key); + + if (good) + break; + } + + if (!good) + return 
log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k); + + r = sysctl_write(*k, *v); + if (r < 0) + return log_error_errno(r, "Failed to write sysctl '%s': %m", *k); + } + + return 0; +} + +static int inner_child( + Barrier *barrier, + int fd_inner_socket, + FDSet *fds, + char **os_release_pairs) { + + _cleanup_free_ char *home = NULL; + size_t n_env = 1; + char *envp[] = { + (char*) "PATH=" DEFAULT_PATH_COMPAT, + NULL, /* container */ + NULL, /* TERM */ + NULL, /* HOME */ + NULL, /* USER */ + NULL, /* LOGNAME */ + NULL, /* container_uuid */ + NULL, /* LISTEN_FDS */ + NULL, /* LISTEN_PID */ + NULL, /* NOTIFY_SOCKET */ + NULL, /* CREDENTIALS_DIRECTORY */ + NULL, /* LANG */ + NULL + }; + const char *exec_target; + _cleanup_strv_free_ char **env_use = NULL; + int r, which_failed; + + /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one + * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID, + * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace, + * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS + * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by + * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER + * namespace. + * + * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through + * unshare(). See below. */ + + assert(barrier); + assert(fd_inner_socket >= 0); + + log_debug("Inner child is initializing."); + + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* Tell the parent, that it now can write the UID map. 
*/ + (void) barrier_place(barrier); /* #1 */ + + /* Wait until the parent wrote the UID map */ + if (!barrier_place_and_sync(barrier)) /* #2 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early"); + + /* Become the new root user inside our namespace */ + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Couldn't become new root: %m"); + + /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them + * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect + * propagation, but simply create new peer groups for all our mounts). */ + r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL); + if (r < 0) + return r; + } + + r = mount_all(NULL, + arg_mount_settings | MOUNT_IN_USERNS, + arg_uid_shift, + arg_selinux_apifs_context); + if (r < 0) + return r; + + if (!arg_network_namespace_path && arg_private_network) { + r = unshare(CLONE_NEWNET); + if (r < 0) + return log_error_errno(errno, "Failed to unshare network namespace: %m"); + + /* Tell the parent that it can setup network interfaces. 
*/ + (void) barrier_place(barrier); /* #3 */ + } + + r = mount_sysfs(NULL, arg_mount_settings); + if (r < 0) + return r; + + /* Wait until we are cgroup-ified, so that we + * can mount the right cgroup path writable */ + if (!barrier_place_and_sync(barrier)) /* #4 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "Parent died too early"); + + if (arg_use_cgns) { + r = unshare(CLONE_NEWCGROUP); + if (r < 0) + return log_error_errno(errno, "Failed to unshare cgroup namespace: %m"); + r = mount_cgroups( + "", + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + true); + } else + r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); + if (r < 0) + return r; + + r = setup_boot_id(); + if (r < 0) + return r; + + r = setup_kmsg(fd_inner_socket); + if (r < 0) + return r; + + r = mount_custom( + "/", + arg_custom_mounts, + arg_n_custom_mounts, + 0, + 0, + arg_selinux_apifs_context, + MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS); + if (r < 0) + return r; + + if (setsid() < 0) + return log_error_errno(errno, "setsid() failed: %m"); + + if (arg_private_network) + (void) loopback_setup(); + + if (arg_expose_ports) { + r = expose_port_send_rtnl(fd_inner_socket); + if (r < 0) + return r; + } + + if (arg_console_mode != CONSOLE_PIPE) { + _cleanup_close_ int master = -EBADF; + _cleanup_free_ char *console = NULL; + + /* Allocate a pty and make it available as /dev/console. 
*/ + master = openpt_allocate(O_RDWR|O_NONBLOCK, &console); + if (master < 0) + return log_error_errno(master, "Failed to allocate a pty: %m"); + + r = setup_dev_console(console); + if (r < 0) + return log_error_errno(r, "Failed to set up /dev/console: %m"); + + r = send_one_fd(fd_inner_socket, master, 0); + if (r < 0) + return log_error_errno(r, "Failed to send master fd: %m"); + + r = setup_stdio_as_dev_console(); + if (r < 0) + return r; + } + + r = patch_sysctl(); + if (r < 0) + return r; + + if (arg_oom_score_adjust_set) { + r = set_oom_score_adjust(arg_oom_score_adjust); + if (r < 0) + return log_error_errno(r, "Failed to adjust OOM score: %m"); + } + + if (arg_cpu_set.set) + if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0) + return log_error_errno(errno, "Failed to set CPU affinity: %m"); + + (void) setup_hostname(); + + if (arg_personality != PERSONALITY_INVALID) { + r = safe_personality(arg_personality); + if (r < 0) + return log_error_errno(r, "personality() failed: %m"); +#ifdef ARCHITECTURE_SECONDARY + } else if (arg_architecture == ARCHITECTURE_SECONDARY) { + r = safe_personality(PER_LINUX32); + if (r < 0) + return log_error_errno(r, "personality() failed: %m"); +#endif + } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture()) + log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming " + "invocation with qemu userspace emulator (or equivalent) in effect.", + architecture_to_string(arg_architecture)); + + r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed); + if (r < 0) + return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + +#if HAVE_SECCOMP + if (arg_seccomp) { + + if (is_seccomp_available()) { + r = seccomp_load(arg_seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return log_error_errno(r, "Failed to install seccomp filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to 
install seccomp filter: %m"); + } + } else +#endif + { + r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list); + if (r < 0) + return r; + } + + if (arg_suppress_sync) { +#if HAVE_SECCOMP + r = seccomp_suppress_sync(); + if (r < 0) + log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m"); +#else + log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting."); +#endif + } + +#if HAVE_SELINUX + if (arg_selinux_context) + if (setexeccon(arg_selinux_context) < 0) + return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); +#endif + + /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps + * if we need to later on. */ + if (prctl(PR_SET_KEEPCAPS, 1) < 0) + return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m"); + + if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid)) + r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE); + else + r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home); + if (r < 0) + return r; + + r = drop_capabilities(getuid()); + if (r < 0) + return log_error_errno(r, "Dropping capabilities failed: %m"); + + if (arg_no_new_privileges) + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) + return log_error_errno(errno, "Failed to disable new privileges: %m"); + + /* LXC sets container=lxc, so follow the scheme here */ + envp[n_env++] = strjoina("container=", arg_container_service_name); + + envp[n_env] = strv_find_prefix(environ, "TERM="); + if (envp[n_env]) + n_env++; + + if (home || !uid_is_valid(arg_uid) || arg_uid == 0) + if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0) + return log_oom(); + + if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0) + if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 || + asprintf(envp + n_env++, 
"LOGNAME=%s", arg_user ?: "root") < 0) + return log_oom(); + + assert(!sd_id128_is_null(arg_uuid)); + + if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0) + return log_oom(); + + if (fdset_size(fds) > 0) { + r = fdset_cloexec(fds, false); + if (r < 0) + return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors."); + + if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) || + (asprintf(envp + n_env++, "LISTEN_PID=1") < 0)) + return log_oom(); + } + if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0) + return log_oom(); + + if (arg_n_credentials > 0) { + envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials"); + if (!envp[n_env]) + return log_oom(); + n_env++; + } + + if (arg_start_mode != START_BOOT) { + envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE); + if (!envp[n_env]) + return log_oom(); + n_env++; + } + + env_use = strv_env_merge(envp, os_release_pairs, arg_setenv); + if (!env_use) + return log_oom(); + + /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */ + if (!barrier_place_and_sync(barrier)) /* #5 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early"); + + if (arg_chdir) + if (chdir(arg_chdir) < 0) + return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir); + + if (arg_start_mode == START_PID2) { + r = stub_pid1(arg_uuid); + if (r < 0) + return r; + } + + if (arg_console_mode != CONSOLE_PIPE) { + /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we + * are configured for that. Acquire it as controlling tty. */ + if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0) + return log_error_errno(errno, "Failed to acquire controlling TTY: %m"); + } + + log_debug("Inner child completed, invoking payload."); + + /* Now, explicitly close the log, so that we then can close all remaining fds. 
Closing the log explicitly first + * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need + * it again. Note that the other fds closed here are at least the locking and barrier fds. */ + log_close(); + log_set_open_when_needed(true); + log_settle_target(); + + (void) fdset_close_others(fds); + + if (arg_start_mode == START_BOOT) { + char **a; + size_t m; + + /* Automatically search for the init system */ + + m = strv_length(arg_parameters); + a = newa(char*, m + 2); + memcpy_safe(a + 1, arg_parameters, m * sizeof(char*)); + a[1 + m] = NULL; + + FOREACH_STRING(init, + "/usr/lib/systemd/systemd", + "/lib/systemd/systemd", + "/sbin/init") { + a[0] = (char*) init; + execve(a[0], a, env_use); + } + + exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init"; + } else if (!strv_isempty(arg_parameters)) { + const char *dollar_path; + + exec_target = arg_parameters[0]; + + /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the + * binary. */ + dollar_path = strv_env_get(env_use, "PATH"); + if (dollar_path) { + if (setenv("PATH", dollar_path, 1) < 0) + return log_error_errno(errno, "Failed to update $PATH: %m"); + } + + execvpe(arg_parameters[0], arg_parameters, env_use); + } else { + if (!arg_chdir) + /* If we cannot change the directory, we'll end up in /, that is expected. 
*/ + (void) chdir(home ?: "/root"); + + execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use); + if (!streq(DEFAULT_USER_SHELL, "/bin/bash")) + execle("/bin/bash", "-bash", NULL, env_use); + if (!streq(DEFAULT_USER_SHELL, "/bin/sh")) + execle("/bin/sh", "-sh", NULL, env_use); + + exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh"; + } + + return log_error_errno(errno, "execv(%s) failed: %m", exec_target); +} + +static int setup_notify_child(void) { + _cleanup_close_ int fd = -EBADF; + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH, + }; + int r; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate notification socket: %m"); + + (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755); + (void) sockaddr_un_unlink(&sa.un); + + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) + return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m"); + + r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m"); + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + return TAKE_FD(fd); +} + +static int outer_child( + Barrier *barrier, + const char *directory, + DissectedImage *dissected_image, + int fd_outer_socket, + int fd_inner_socket, + FDSet *fds, + int netns_fd) { + + _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL; + _cleanup_strv_free_ char **os_release_pairs = NULL; + _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF; + bool idmap = false; + const char *p; + pid_t pid; + ssize_t l; + int r; + + /* This is the "outer" child process, i.e the one forked off by the container manager itself. It + * already has its own CLONE_NEWNS namespace (which was created by the clone()). 
It still lives in + * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET + * namespaces. After it completed a number of initializations a second child (the "inner" one) is + * forked off it, and it exits. */ + + assert(barrier); + assert(directory); + assert(fd_outer_socket >= 0); + assert(fd_inner_socket >= 0); + + log_debug("Outer child is initializing."); + + r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs); + if (r < 0) + log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m"); + + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) + return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); + + r = reset_audit_loginuid(); + if (r < 0) + return r; + + /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate + * mounts to the real root. */ + r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL); + if (r < 0) + return r; + + if (dissected_image) { + /* If we are operating on a disk image, then mount its root directory now, but leave out the + * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest, + * but then with the uid shift known. That way we can mount VFAT file systems shifted to the + * right place right away. This makes sure ESP partitions and userns are compatible. */ + + r = dissected_image_mount_and_warn( + dissected_image, + directory, + arg_uid_shift, + arg_uid_range, + /* userns_fd= */ -EBADF, + DISSECT_IMAGE_MOUNT_ROOT_ONLY| + DISSECT_IMAGE_DISCARD_ON_LOOP| + DISSECT_IMAGE_USR_NO_ROOT| + (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)| + (arg_start_mode == START_BOOT ? 
DISSECT_IMAGE_VALIDATE_OS : 0)); + if (r < 0) + return r; + } + + r = determine_uid_shift(directory); + if (r < 0) + return r; + + if (arg_userns_mode != USER_NAMESPACE_NO) { + r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to pin outer mount namespace: %m"); + + l = send_one_fd(fd_outer_socket, mntns_fd, 0); + if (l < 0) + return log_error_errno(l, "Failed to send outer mount namespace fd: %m"); + mntns_fd = safe_close(mntns_fd); + + /* Let the parent know which UID shift we read from the image */ + l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send UID shift: %m"); + if (l != sizeof(arg_uid_shift)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Short write while sending UID shift."); + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* When we are supposed to pick the UID shift, the parent will check now whether the + * UID shift we just read from the image is available. If yes, it will send the UID + * shift back to us, if not it will pick a different one, and send it back to us. */ + + l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) + return log_error_errno(errno, "Failed to recv UID shift: %m"); + if (l != sizeof(arg_uid_shift)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Short read while receiving UID shift."); + } + + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, + "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); + } + + if (path_equal(directory, "/")) { + /* If the directory we shall boot is the host, let's operate on a bind mount at a different + * place, so that we can make changes to its mount structure (for example, to implement + * --volatile=) without this interfering with our ability to access files such as + * /etc/localtime to copy into the container. 
Note that we use a fixed place for this + * (instead of a temporary directory, since we are living in our own mount namespace here + * already, and thus don't need to be afraid of colliding with anyone else's mounts). */ + (void) mkdir_p("/run/systemd/nspawn-root", 0755); + + r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + directory = "/run/systemd/nspawn-root"; + } + + /* Make sure we always have a mount that we can move to root later on. */ + r = make_mount_point(directory); + if (r < 0) + return r; + + /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host + * mount namespace. For the directory we are going to run our container let's turn this off, so that + * we'll live in our own little world from now on, and propagation from the host may only happen via + * the mount tunnel dir, or not at all. */ + r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL); + if (r < 0) + return r; + + r = setup_pivot_root( + directory, + arg_pivot_root_new, + arg_pivot_root_old); + if (r < 0) + return r; + + r = setup_volatile_mode( + directory, + arg_volatile_mode, + arg_uid_shift, + arg_selinux_apifs_context); + if (r < 0) + return r; + + r = bind_user_prepare( + directory, + arg_bind_user, + arg_uid_shift, + arg_uid_range, + &arg_custom_mounts, &arg_n_custom_mounts, + &bind_user_context); + if (r < 0) + return r; + + if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) { + /* Send the user maps we determined to the parent, so that it installs it in our user + * namespace UID map table */ + + for (size_t i = 0; i < bind_user_context->n_data; i++) { + uid_t map[] = { + bind_user_context->data[i].payload_user->uid, + bind_user_context->data[i].host_user->uid, + (uid_t) bind_user_context->data[i].payload_group->gid, + (uid_t) bind_user_context->data[i].host_group->gid, + }; + + l = send(fd_outer_socket, map, sizeof(map), 
MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send user UID map: %m"); + if (l != sizeof(map)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Short write while sending user UID map."); + } + } + + r = mount_custom( + directory, + arg_custom_mounts, + arg_n_custom_mounts, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + MOUNT_ROOT_ONLY); + if (r < 0) + return r; + + if (arg_userns_mode != USER_NAMESPACE_NO && + IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) && + arg_uid_shift != 0) { + _cleanup_free_ char *usr_subtree = NULL; + char *dirs[3]; + size_t i = 0; + + dirs[i++] = (char*) directory; + + if (dissected_image && dissected_image->partitions[PARTITION_USR].found) { + usr_subtree = path_join(directory, "/usr"); + if (!usr_subtree) + return log_oom(); + + dirs[i++] = usr_subtree; + } + + dirs[i] = NULL; + + r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT); + if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) { + /* This might fail because the kernel or file system doesn't support idmapping. We + * can't really distinguish this nicely, nor do we have any guarantees about the + * error codes we see, could be EOPNOTSUPP or EINVAL. */ + if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "ID mapped mounts are apparently not available, sorry."); + + log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing."); + arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN; + } else if (r < 0) + return log_error_errno(r, "Failed to set up ID mapped mounts: %m"); + else { + log_debug("ID mapped mounts available, making use of them."); + idmap = true; + } + } + + if (dissected_image) { + /* Now we know the uid shift, let's now mount everything else that might be in the image. 
*/ + r = dissected_image_mount( + dissected_image, + directory, + arg_uid_shift, + arg_uid_range, + /* userns_fd= */ -EBADF, + DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY| + DISSECT_IMAGE_DISCARD_ON_LOOP| + DISSECT_IMAGE_USR_NO_ROOT| + (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)| + (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0)); + if (r == -EUCLEAN) + return log_error_errno(r, "File system check for image failed: %m"); + if (r < 0) + return log_error_errno(r, "Failed to mount image file system: %m"); + } + + if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { + /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */ + + r = detect_unified_cgroup_hierarchy_from_image(directory); + if (r < 0) + return r; + + l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send cgroup mode: %m"); + if (l != sizeof(arg_unified_cgroup_hierarchy)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Short write while sending cgroup mode."); + } + + r = recursive_chown(directory, arg_uid_shift, arg_uid_range); + if (r < 0) + return r; + + r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift); + if (r < 0) + return r; + + if (arg_read_only && arg_volatile_mode == VOLATILE_NO && + !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) { + r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL); + if (r < 0) + return log_error_errno(r, "Failed to make tree read-only: %m"); + } + + r = mount_all(directory, + arg_mount_settings, + arg_uid_shift, + arg_selinux_apifs_context); + if (r < 0) + return r; + + r = copy_devnodes(directory); + if (r < 0) + return r; + + r = make_extra_nodes(directory); + if (r < 0) + return r; + + (void) dev_setup(directory, arg_uid_shift, arg_uid_shift); + + p = prefix_roota(directory, "/run/host"); + (void) make_inaccessible_nodes(p, 
arg_uid_shift, arg_uid_shift); + + r = setup_pts(directory); + if (r < 0) + return r; + + r = mount_tunnel_dig(directory); + if (r < 0) + return r; + + r = setup_keyring(); + if (r < 0) + return r; + + r = setup_credentials(directory); + if (r < 0) + return r; + + r = bind_user_setup(bind_user_context, directory); + if (r < 0) + return r; + + r = mount_custom( + directory, + arg_custom_mounts, + arg_n_custom_mounts, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + MOUNT_NON_ROOT_ONLY); + if (r < 0) + return r; + + r = setup_timezone(directory); + if (r < 0) + return r; + + r = setup_resolv_conf(directory); + if (r < 0) + return r; + + r = setup_machine_id(directory); + if (r < 0) + return r; + + r = setup_journal(directory); + if (r < 0) + return r; + + /* The same stuff as the $container env var, but nicely readable for the entire payload */ + p = prefix_roota(directory, "/run/host/container-manager"); + (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE); + + /* The same stuff as the $container_uuid env var */ + p = prefix_roota(directory, "/run/host/container-uuid"); + (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)); + + if (!arg_use_cgns) { + r = mount_cgroups( + directory, + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + false); + if (r < 0) + return r; + } + + /* Mark everything as shared so our mounts get propagated down. This is required to make new bind + * mounts available in systemd services inside the container that create a new mount namespace. See + * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this + * will inherit the shared propagation mode. + * + * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root + * directory mount to root later on. 
+ * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251 + */ + r = mount_switch_root(directory, MS_SHARED); + if (r < 0) + return log_error_errno(r, "Failed to move root directory: %m"); + + /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a + * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into + * the container. */ + r = mount_tunnel_open(); + if (r < 0) + return r; + + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* In order to mount procfs and sysfs in an unprivileged container the kernel + * requires that a fully visible instance is already present in the target mount + * namespace. Mount one here so the inner child can mount its own instances. Later + * we umount the temporary instances created here before we actually exec the + * payload. Since the rootfs is shared the umount will propagate into the container. + * Note, the inner child wouldn't be able to unmount the instances on its own since + * it doesn't own the originating mount namespace. IOW, the outer child needs to do + * this. */ + r = pin_fully_visible_fs(); + if (r < 0) + return r; + } + + fd = setup_notify_child(); + if (fd < 0) + return fd; + + pid = raw_clone(SIGCHLD|CLONE_NEWNS| + arg_clone_ns_flags | + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); + if (pid < 0) + return log_error_errno(errno, "Failed to fork inner child: %m"); + if (pid == 0) { + fd_outer_socket = safe_close(fd_outer_socket); + + /* The inner child has all namespaces that are requested, so that we all are owned by the + * user if user namespaces are turned on. 
*/ + + if (arg_network_namespace_path) { + r = namespace_enter(-1, -1, netns_fd, -1, -1); + if (r < 0) + return log_error_errno(r, "Failed to join network namespace: %m"); + } + + r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send PID: %m"); + if (l != sizeof(pid)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Short write while sending PID."); + + l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send machine ID: %m"); + if (l != sizeof(arg_uuid)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Short write while sending machine ID."); + + l = send_one_fd(fd_outer_socket, fd, 0); + if (l < 0) + return log_error_errno(l, "Failed to send notify fd: %m"); + + fd_outer_socket = safe_close(fd_outer_socket); + fd_inner_socket = safe_close(fd_inner_socket); + netns_fd = safe_close(netns_fd); + + return 0; +} + +static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { + bool tried_hashed = false; + unsigned n_tries = 100; + uid_t candidate; + int r; + + assert(shift); + assert(ret_lock_file); + assert(arg_userns_mode == USER_NAMESPACE_PICK); + assert(arg_uid_range == 0x10000U); + + candidate = *shift; + + (void) mkdir("/run/systemd/nspawn-uid", 0755); + + for (;;) { + char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT; + + if (--n_tries <= 0) + return -EBUSY; + + if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX) + goto next; + if ((candidate & UINT32_C(0xFFFF)) != 0) + goto next; + + xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate); + r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf); + if (r == -EBUSY) /* Range already taken by another nspawn 
instance */ + goto next; + if (r < 0) + return r; + + /* Make some superficial checks whether the range is currently known in the user database */ + if (getpwuid(candidate)) + goto next; + if (getpwuid(candidate + UINT32_C(0xFFFE))) + goto next; + if (getgrgid(candidate)) + goto next; + if (getgrgid(candidate + UINT32_C(0xFFFE))) + goto next; + + *ret_lock_file = lf; + lf = (struct LockFile) LOCK_FILE_INIT; + *shift = candidate; + return 0; + + next: + if (arg_machine && !tried_hashed) { + /* Try to hash the base from the container name */ + + static const uint8_t hash_key[] = { + 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf, + 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72 + }; + + candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key); + + tried_hashed = true; + } else + random_bytes(&candidate, sizeof(candidate)); + + candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN; + candidate &= (uid_t) UINT32_C(0xFFFF0000); + } +} + +static int add_one_uid_map( + char **p, + uid_t container_uid, + uid_t host_uid, + uid_t range) { + + return strextendf(p, + UID_FMT " " UID_FMT " " UID_FMT "\n", + container_uid, host_uid, range); +} + +static int make_uid_map_string( + const uid_t bind_user_uid[], + size_t n_bind_user_uid, + size_t offset, + char **ret) { + + _cleanup_free_ char *s = NULL; + uid_t previous_uid = 0; + int r; + + assert(n_bind_user_uid == 0 || bind_user_uid); + assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */ + assert(ret); + + /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one + * quadruplet, consisting of host and container UID + GID. 
*/ + + for (size_t i = 0; i < n_bind_user_uid; i++) { + uid_t payload_uid = bind_user_uid[i*4+offset], + host_uid = bind_user_uid[i*4+offset+1]; + + assert(previous_uid <= payload_uid); + assert(payload_uid < arg_uid_range); + + /* Add a range to close the gap to previous entry */ + if (payload_uid > previous_uid) { + r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid); + if (r < 0) + return r; + } + + /* Map this specific user */ + r = add_one_uid_map(&s, payload_uid, host_uid, 1); + if (r < 0) + return r; + + previous_uid = payload_uid + 1; + } + + /* And add a range to close the gap to finish the range */ + if (arg_uid_range > previous_uid) { + r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid); + if (r < 0) + return r; + } + + assert(s); + + *ret = TAKE_PTR(s); + return 0; +} + +static int setup_uid_map( + pid_t pid, + const uid_t bind_user_uid[], + size_t n_bind_user_uid) { + + char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_free_ char *s = NULL; + int r; + + assert(pid > 1); + + /* Build the UID map string */ + if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */ + return log_oom(); + + xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid); + r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write UID map: %m"); + + /* And now build the GID map string */ + s = mfree(s); + if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */ + return log_oom(); + + xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid); + r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write GID map: %m"); + + return 0; +} + +static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + 
char buf[NOTIFY_BUFFER_MAX+1]; + char *p = NULL; + struct iovec iovec = { + .iov_base = buf, + .iov_len = sizeof(buf)-1, + }; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct ucred *ucred; + ssize_t n; + pid_t inner_child_pid; + _cleanup_strv_free_ char **tags = NULL; + int r; + + assert(userdata); + + inner_child_pid = PTR_TO_PID(userdata); + + if (revents != EPOLLIN) { + log_warning("Got unexpected poll event for notify fd."); + return 0; + } + + n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; + else if (n == -EXFULL) { + log_warning("Got message with truncated control data (too many fds sent?), ignoring."); + return 0; + } else if (n < 0) + return log_warning_errno(n, "Couldn't read notification socket: %m"); + + cmsg_close_all(&msghdr); + + ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!ucred || ucred->pid != inner_child_pid) { + log_debug("Received notify message without valid credentials. Ignoring."); + return 0; + } + + if ((size_t) n >= sizeof(buf)) { + log_warning("Received notify message exceeded maximum size. 
Ignoring."); + return 0; + } + + buf[n] = 0; + tags = strv_split(buf, "\n\r"); + if (!tags) + return log_oom(); + + if (strv_contains(tags, "READY=1")) { + r = sd_notify(false, "READY=1\n"); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + } + + p = strv_find_startswith(tags, "STATUS="); + if (p) + (void) sd_notifyf(false, "STATUS=Container running: %s", p); + + return 0; +} + +/* Registers nspawn_dispatch_notify_fd() as the EPOLLIN handler for fd on event, stores the new source in *notify_event_source and sets its description to nspawn-notify. inner_child_pid is handed to sd_event_add_io() unchanged as the callback userdata. Returns 0 on success or the logged negative sd_event_add_io() error. */ static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) { + int r; + + r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + + (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify"); + + return 0; +} + +/* Applies the values found in settings to the global arg_* configuration. path is only used in the warning messages. Many of the settings are skipped with a warning when arg_settings_trusted is false. Returns 0. */ static int merge_settings(Settings *settings, const char *path) { + int rl; + + assert(settings); + assert(path); + + /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note + * that this steals the fields of the Settings* structure, and hence modifies it.
*/ + + if ((arg_settings_mask & SETTING_START_MODE) == 0 && + settings->start_mode >= 0) { + arg_start_mode = settings->start_mode; + strv_free_and_replace(arg_parameters, settings->parameters); + } + + if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 && + settings->ephemeral >= 0) + arg_ephemeral = settings->ephemeral; + + if ((arg_settings_mask & SETTING_DIRECTORY) == 0 && + settings->root) { + + if (!arg_settings_trusted) + log_warning("Ignoring root directory setting, file %s is not trusted.", path); + else + free_and_replace(arg_directory, settings->root); + } + + if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 && + settings->pivot_root_new) { + free_and_replace(arg_pivot_root_new, settings->pivot_root_new); + free_and_replace(arg_pivot_root_old, settings->pivot_root_old); + } + + if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 && + settings->working_directory) + free_and_replace(arg_chdir, settings->working_directory); + + if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 && + settings->environment) + strv_free_and_replace(arg_setenv, settings->environment); + + if ((arg_settings_mask & SETTING_USER) == 0) { + + if (settings->user) + free_and_replace(arg_user, settings->user); + + if (uid_is_valid(settings->uid)) + arg_uid = settings->uid; + if (gid_is_valid(settings->gid)) + arg_gid = settings->gid; + if (settings->n_supplementary_gids > 0) { + free_and_replace(arg_supplementary_gids, settings->supplementary_gids); + arg_n_supplementary_gids = settings->n_supplementary_gids; + } + } + + if ((arg_settings_mask & SETTING_CAPABILITY) == 0) { + uint64_t plus, minus; + uint64_t network_minus = 0; + uint64_t ambient; + + /* Note that we copy both the simple plus/minus caps here, and the full quintet from the + * Settings structure */ + + plus = settings->capability; + minus = settings->drop_capability; + + if ((arg_settings_mask & SETTING_NETWORK) == 0 && + settings_network_configured(settings)) { + if (settings_private_network(settings)) + plus |= 
UINT64_C(1) << CAP_NET_ADMIN; + else + network_minus |= UINT64_C(1) << CAP_NET_ADMIN; + } + + if (!arg_settings_trusted && plus != 0) { + if (settings->capability != 0) + log_warning("Ignoring Capability= setting, file %s is not trusted.", path); + } else { + arg_caps_retain &= ~network_minus; + arg_caps_retain |= plus; + } + + arg_caps_retain &= ~minus; + + /* Copy the full capabilities over too */ + if (capability_quintet_is_set(&settings->full_capabilities)) { + if (!arg_settings_trusted) + log_warning("Ignoring capability settings, file %s is not trusted.", path); + else + arg_full_capabilities = settings->full_capabilities; + } + + ambient = settings->ambient_capability; + if (!arg_settings_trusted && ambient != 0) + log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path); + else + arg_caps_ambient |= ambient; + } + + if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && + settings->kill_signal > 0) + arg_kill_signal = settings->kill_signal; + + if ((arg_settings_mask & SETTING_PERSONALITY) == 0 && + settings->personality != PERSONALITY_INVALID) + arg_personality = settings->personality; + + if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 && + !sd_id128_is_null(settings->machine_id)) { + + if (!arg_settings_trusted) + log_warning("Ignoring MachineID= setting, file %s is not trusted.", path); + else + arg_uuid = settings->machine_id; + } + + if ((arg_settings_mask & SETTING_READ_ONLY) == 0 && + settings->read_only >= 0) + arg_read_only = settings->read_only; + + if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 && + settings->volatile_mode != _VOLATILE_MODE_INVALID) + arg_volatile_mode = settings->volatile_mode; + + if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 && + settings->n_custom_mounts > 0) { + + if (!arg_settings_trusted) + log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path); + else { + custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); + 
arg_custom_mounts = TAKE_PTR(settings->custom_mounts); + arg_n_custom_mounts = settings->n_custom_mounts; + settings->n_custom_mounts = 0; + } + } + + if ((arg_settings_mask & SETTING_NETWORK) == 0 && + settings_network_configured(settings)) { + + if (!arg_settings_trusted) + log_warning("Ignoring network settings, file %s is not trusted.", path); + else { + arg_network_veth = settings_network_veth(settings); + arg_private_network = settings_private_network(settings); + + strv_free_and_replace(arg_network_interfaces, settings->network_interfaces); + strv_free_and_replace(arg_network_macvlan, settings->network_macvlan); + strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan); + strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra); + + free_and_replace(arg_network_bridge, settings->network_bridge); + free_and_replace(arg_network_zone, settings->network_zone); + + free_and_replace(arg_network_namespace_path, settings->network_namespace_path); + } + } + + if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 && + settings->expose_ports) { + + if (!arg_settings_trusted) + log_warning("Ignoring Port= setting, file %s is not trusted.", path); + else { + expose_port_free_all(arg_expose_ports); + arg_expose_ports = TAKE_PTR(settings->expose_ports); + } + } + + if ((arg_settings_mask & SETTING_USERNS) == 0 && + settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) { + + if (!arg_settings_trusted) + log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path); + else { + arg_userns_mode = settings->userns_mode; + arg_uid_shift = settings->uid_shift; + arg_uid_range = settings->uid_range; + arg_userns_ownership = settings->userns_ownership; + } + } + + if ((arg_settings_mask & SETTING_BIND_USER) == 0 && + !strv_isempty(settings->bind_user)) + strv_free_and_replace(arg_bind_user, settings->bind_user); + + if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 && + settings->notify_ready >= 0) + 
arg_notify_ready = settings->notify_ready; + + if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) { + + if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) { + if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list)) + log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path); + else { + strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list); + strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list); + } + } + +#if HAVE_SECCOMP + if (settings->seccomp) { + if (!arg_settings_trusted) + log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path); + else { + seccomp_release(arg_seccomp); + arg_seccomp = TAKE_PTR(settings->seccomp); + } + } +#endif + } + + for (rl = 0; rl < _RLIMIT_MAX; rl ++) { + if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl))) + continue; + + if (!settings->rlimit[rl]) + continue; + + if (!arg_settings_trusted) { + log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path); + continue; + } + + free_and_replace(arg_rlimit[rl], settings->rlimit[rl]); + } + + if ((arg_settings_mask & SETTING_HOSTNAME) == 0 && + settings->hostname) + free_and_replace(arg_hostname, settings->hostname); + + if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 && + settings->no_new_privileges >= 0) + arg_no_new_privileges = settings->no_new_privileges; + + if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 && + settings->oom_score_adjust_set) { + + if (!arg_settings_trusted) + log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path); + else { + arg_oom_score_adjust = settings->oom_score_adjust; + arg_oom_score_adjust_set = true; + } + } + + if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 && + settings->cpu_set.set) { + + if (!arg_settings_trusted) + log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path); + else { + 
cpu_set_reset(&arg_cpu_set); + arg_cpu_set = TAKE_STRUCT(settings->cpu_set); + } + } + + if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 && + settings->resolv_conf != _RESOLV_CONF_MODE_INVALID) + arg_resolv_conf = settings->resolv_conf; + + if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 && + settings->link_journal != _LINK_JOURNAL_INVALID) { + + if (!arg_settings_trusted) + log_warning("Ignoring journal link setting, file '%s' is not trusted.", path); + else { + arg_link_journal = settings->link_journal; + arg_link_journal_try = settings->link_journal_try; + } + } + + if ((arg_settings_mask & SETTING_TIMEZONE) == 0 && + settings->timezone != _TIMEZONE_MODE_INVALID) + arg_timezone = settings->timezone; + + if ((arg_settings_mask & SETTING_SLICE) == 0 && + settings->slice) { + + if (!arg_settings_trusted) + log_warning("Ignoring slice setting, file '%s' is not trusted.", path); + else + free_and_replace(arg_slice, settings->slice); + } + + if ((arg_settings_mask & SETTING_USE_CGNS) == 0 && + settings->use_cgns >= 0) { + + if (!arg_settings_trusted) + log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path); + else + arg_use_cgns = settings->use_cgns; + } + + if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 && + settings->clone_ns_flags != ULONG_MAX) { + + if (!arg_settings_trusted) + log_warning("Ignoring namespace setting, file '%s' is not trusted.", path); + else + arg_clone_ns_flags = settings->clone_ns_flags; + } + + if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 && + settings->console_mode >= 0) { + + if (!arg_settings_trusted) + log_warning("Ignoring console mode setting, file '%s' is not trusted.", path); + else + arg_console_mode = settings->console_mode; + } + + if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 && + settings->suppress_sync >= 0) + arg_suppress_sync = settings->suppress_sync; + + /* The following properties can only be set through the OCI settings logic, not from the command line, hence we + * 
don't consult arg_settings_mask for them. */ + + sd_bus_message_unref(arg_property_message); + arg_property_message = TAKE_PTR(settings->properties); + + arg_console_width = settings->console_width; + arg_console_height = settings->console_height; + + device_node_array_free(arg_extra_nodes, arg_n_extra_nodes); + arg_extra_nodes = TAKE_PTR(settings->extra_nodes); + arg_n_extra_nodes = settings->n_extra_nodes; + settings->n_extra_nodes = 0; + + return 0; +} + +/* Locates the settings file named arg_settings_filename and applies it. Nothing is done when an OCI bundle is configured or when every setting is masked. /etc/systemd/nspawn and /run/systemd/nspawn are searched first and a file found there is trusted unless arg_settings_trusted was already decided; otherwise a file next to the image or directory is tried, which is untrusted by default. Returns 0 when no file exists, else a negative settings_load() error or the result of merge_settings(). */ static int load_settings(void) { + _cleanup_(settings_freep) Settings *settings = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + int r; + + if (arg_oci_bundle) + return 0; + + /* If all settings are masked, there's no point in looking for + * the settings file */ + if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL)) + return 0; + + /* We first look in the admin's directories in /etc and /run */ + FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { + _cleanup_free_ char *j = NULL; + + j = path_join(i, arg_settings_filename); + if (!j) + return log_oom(); + + f = fopen(j, "re"); + if (f) { + p = TAKE_PTR(j); + + /* By default, we trust configuration from /etc and /run */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; + + break; + } + + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", j); + } + + if (!f) { + /* After that, let's look for a file next to the + * actual image we shall boot.
*/ + + if (arg_image) { + r = file_in_same_dir(arg_image, arg_settings_filename, &p); + if (r < 0) + return log_error_errno(r, "Failed to generate settings path from image path: %m"); + } else if (arg_directory) { + r = file_in_same_dir(arg_directory, arg_settings_filename, &p); + if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */ + return log_error_errno(r, "Failed to generate settings path from directory path: %m"); + } + + if (p) { + f = fopen(p, "re"); + if (!f && errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", p); + + /* By default, we do not trust configuration from /var/lib/machines */ + if (arg_settings_trusted < 0) + arg_settings_trusted = false; + } + } + + if (!f) + return 0; + + log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted)); + + r = settings_load(f, p, &settings); + if (r < 0) + return r; + + return merge_settings(settings, p); +} + +/* Counterpart of load_settings() for OCI bundles: returns 0 without doing anything unless arg_oci_bundle is set; otherwise the bundle is trusted unless arg_settings_trusted was already decided, loaded with oci_load() and applied through merge_settings(). */ static int load_oci_bundle(void) { + _cleanup_(settings_freep) Settings *settings = NULL; + int r; + + if (!arg_oci_bundle) + return 0; + + /* By default let's trust OCI bundles */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; + + r = oci_load(NULL, arg_oci_bundle, &settings); + if (r < 0) + return r; + + return merge_settings(settings, arg_oci_bundle); +} + +/* Runs one container instance. Returns a negative value on error, 0 when finished (the exit code is then stored in *ret) and 1 when the container rebooted and the caller shall loop again. Along the way *pid is set first to the outer and then to the inner child PID, *master receives the PTY master fd unless the console mode is CONSOLE_PIPE, and *veth_created is set to true once the veth links were set up and back to false after they were removed for a reboot. */ static int run_container( + DissectedImage *dissected_image, + FDSet *fds, + char veth_name[IFNAMSIZ], bool *veth_created, + struct ExposeArgs *expose_args, + int *master, pid_t *pid, int *ret) { + + static const struct sigaction sa = { + .sa_handler = nop_signal_handler, + .sa_flags = SA_NOCLDSTOP|SA_RESTART, + }; + + _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT; + _cleanup_close_ int etc_passwd_lock = -EBADF; + _cleanup_close_pair_ int + fd_inner_socket_pair[2] = EBADF_PAIR, + fd_outer_socket_pair[2] = EBADF_PAIR; + + _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF; + _cleanup_(barrier_destroy) Barrier barrier =
BARRIER_NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ uid_t *bind_user_uid = NULL; + size_t n_bind_user_uid = 0; + ContainerStatus container_status = 0; + int ifi = 0, r; + ssize_t l; + sigset_t mask_chld; + _cleanup_close_ int child_netns_fd = -EBADF; + + assert_se(sigemptyset(&mask_chld) == 0); + assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely + * check with getpwuid() if the specific user already exists. Note that /etc might be + * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we + * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are + * really just an extra safety net. We kinda assume that the UID range we allocate from is + * really ours. */ + + etc_passwd_lock = take_etc_passwd_lock(NULL); + if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) + return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m"); + } + + r = barrier_create(&barrier); + if (r < 0) + return log_error_errno(r, "Cannot initialize IPC barrier: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0) + return log_error_errno(errno, "Failed to create inner socket pair: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0) + return log_error_errno(errno, "Failed to create outer socket pair: %m"); + + /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt + * parent's blocking calls and give it a chance to call wait() and terminate. 
*/ + r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); + if (r < 0) + return log_error_errno(errno, "Failed to change the signal mask: %m"); + + r = sigaction(SIGCHLD, &sa, NULL); + if (r < 0) + return log_error_errno(errno, "Failed to install SIGCHLD handler: %m"); + + if (arg_network_namespace_path) { + child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (child_netns_fd < 0) + return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path); + + r = fd_is_ns(child_netns_fd, CLONE_NEWNET); + if (r == -EUCLEAN) + log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path); + else if (r < 0) + return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path); + else if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path); + } + + *pid = raw_clone(SIGCHLD|CLONE_NEWNS); + if (*pid < 0) + return log_error_errno(errno, "clone() failed%s: %m", + errno == EINVAL ? + ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : ""); + + if (*pid == 0) { + /* The outer child only has a file system namespace. 
*/ + barrier_set_role(&barrier, BARRIER_CHILD); + + fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]); + fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]); + + (void) reset_all_signal_handlers(); + (void) reset_signal_mask(); + + r = outer_child(&barrier, + arg_directory, + dissected_image, + fd_outer_socket_pair[1], + fd_inner_socket_pair[1], + fds, + child_netns_fd); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + barrier_set_role(&barrier, BARRIER_PARENT); + + fdset_close(fds); + + fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]); + fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]); + + if (arg_userns_mode != USER_NAMESPACE_NO) { + mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0); + if (mntns_fd < 0) + return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m"); + + /* The child just let us know the UID shift it might have read from the image. */ + l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read UID shift: %m"); + if (l != sizeof arg_uid_shift) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift."); + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* If we are supposed to pick the UID shift, let's try to use the shift read from the + * image, but if that's already in use, pick a new one, and report back to the child, + * which one we now picked. 
*/ + + r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock); + if (r < 0) + return log_error_errno(r, "Failed to pick suitable UID/GID range: %m"); + + l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send UID shift: %m"); + if (l != sizeof arg_uid_shift) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift."); + } + + n_bind_user_uid = strv_length(arg_bind_user); + if (n_bind_user_uid > 0) { + /* Right after the UID shift, we'll receive the list of UID mappings for the + * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */ + + bind_user_uid = new(uid_t, n_bind_user_uid*4); + if (!bind_user_uid) + return log_oom(); + + for (size_t i = 0; i < n_bind_user_uid; i++) { + l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read user UID map pair: %m"); + if (l != sizeof(uid_t)*4) + return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING, + SYNTHETIC_ERRNO(EIO), + "Short read while reading bind user UID pairs."); + } + } + } + + if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { + /* The child let us know the support cgroup mode it might have read from the image. */ + l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0); + if (l < 0) + return log_error_errno(errno, "Failed to read cgroup mode: %m"); + if (l != sizeof(arg_unified_cgroup_hierarchy)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s", + l, l == 0 ? " The child is most likely dead." : ""); + } + + /* Wait for the outer child. */ + r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL); + if (r < 0) + return r; + if (r != EXIT_SUCCESS) + return -EIO; + + /* And now retrieve the PID of the inner child. 
*/ + l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read inner child PID: %m"); + if (l != sizeof *pid) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID."); + + /* We also retrieve container UUID in case it was generated by outer child */ + l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read container machine ID: %m"); + if (l != sizeof(arg_uuid)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID."); + + /* We also retrieve the socket used for notifications generated by outer child */ + notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0); + if (notify_socket < 0) + return log_error_errno(notify_socket, + "Failed to receive notification socket from the outer child: %m"); + + log_debug("Init process invoked as PID "PID_FMT, *pid); + + if (arg_userns_mode != USER_NAMESPACE_NO) { + if (!barrier_place_and_sync(&barrier)) /* #1 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early."); + + r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid); + if (r < 0) + return r; + + (void) barrier_place(&barrier); /* #2 */ + } + + if (arg_private_network) { + if (!arg_network_namespace_path) { + /* Wait until the child has unshared its network namespace. */ + if (!barrier_place_and_sync(&barrier)) /* #3 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early"); + } + + if (child_netns_fd < 0) { + /* Make sure we have an open file descriptor to the child's network + * namespace so it stays alive even if the child exits. 
*/ + r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to open child network namespace: %m"); + } + + r = move_network_interfaces(child_netns_fd, arg_network_interfaces); + if (r < 0) + return r; + + if (arg_network_veth) { + r = setup_veth(arg_machine, *pid, veth_name, + arg_network_bridge || arg_network_zone, &arg_network_provided_mac); + if (r < 0) + return r; + else if (r > 0) + ifi = r; + + if (arg_network_bridge) { + /* Add the interface to a bridge */ + r = setup_bridge(veth_name, arg_network_bridge, false); + if (r < 0) + return r; + if (r > 0) + ifi = r; + } else if (arg_network_zone) { + /* Add the interface to a bridge, possibly creating it */ + r = setup_bridge(veth_name, arg_network_zone, true); + if (r < 0) + return r; + if (r > 0) + ifi = r; + } + } + + r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra); + if (r < 0) + return r; + + /* We created the primary and extra veth links now; let's remember this, so that we know to + remove them later on. Note that we don't bother with removing veth links that were created + here when their setup failed half-way, because in that case the kernel should be able to + remove them on its own, since they cannot be referenced by anything yet. 
*/ + *veth_created = true; + + r = setup_macvlan(arg_machine, *pid, arg_network_macvlan); + if (r < 0) + return r; + + r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan); + if (r < 0) + return r; + } + + if (arg_register || !arg_keep_unit) { + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); + + r = sd_bus_set_close_on_exit(bus, false); + if (r < 0) + return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m"); + } + + if (!arg_keep_unit) { + /* When a new scope is created for this container, then we'll be registered as its controller, in which + * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the + * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */ + + r = sd_bus_match_signal_async( + bus, + NULL, + "org.freedesktop.systemd1", + NULL, + "org.freedesktop.systemd1.Scope", + "RequestStop", + on_request_stop, NULL, PID_TO_PTR(*pid)); + if (r < 0) + return log_error_errno(r, "Failed to request RequestStop match: %m"); + } + + if (arg_register) { + r = register_machine( + bus, + arg_machine, + *pid, + arg_directory, + arg_uuid, + ifi, + arg_slice, + arg_custom_mounts, arg_n_custom_mounts, + arg_kill_signal, + arg_property, + arg_property_message, + arg_keep_unit, + arg_container_service_name, + arg_start_mode); + if (r < 0) + return r; + + } else if (!arg_keep_unit) { + r = allocate_scope( + bus, + arg_machine, + *pid, + arg_slice, + arg_custom_mounts, arg_n_custom_mounts, + arg_kill_signal, + arg_property, + arg_property_message, + /* allow_pidfds= */ true, + arg_start_mode); + if (r < 0) + return r; + + } else if (arg_slice || arg_property) + log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect."); + + r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy); + if (r < 0) + return r; + + r = sync_cgroup(*pid, 
arg_unified_cgroup_hierarchy, arg_uid_shift); + if (r < 0) + return r; + + r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift); + if (r < 0) + return r; + + /* Notify the child that the parent is ready with all + * its setup (including cgroup-ification), and that + * the child can now hand over control to the code to + * run inside the container. */ + (void) barrier_place(&barrier); /* #4 */ + + /* Block SIGCHLD here, before notifying child. + * process_pty() will handle it with the other signals. */ + assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0); + + /* Reset signal to default */ + r = default_signals(SIGCHLD); + if (r < 0) + return log_error_errno(r, "Failed to reset SIGCHLD: %m"); + + r = sd_event_new(&event); + if (r < 0) + return log_error_errno(r, "Failed to get default event source: %m"); + + (void) sd_event_set_watchdog(event, true); + + if (bus) { + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + } + + /* Watch the notify socket. The inner child PID travels in the userdata pointer, so that the + * dispatcher can compare it with the credentials of each incoming message. */ + r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source); + if (r < 0) + return r; + + /* Wait until the child is completely ready, and has mounted its own copies of procfs and so on, + * before we take the fully visible instances away. */ + if (!barrier_sync(&barrier)) /* #5.1 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early."); + + if (arg_userns_mode != USER_NAMESPACE_NO) { + r = wipe_fully_visible_fs(mntns_fd); + if (r < 0) + return r; + mntns_fd = safe_close(mntns_fd); + } + + /* And now let the child know that we completed removing the procfs instances, and it can start the + * payload. */ + if (!barrier_place(&barrier)) /* #5.2 */ + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early."); + + /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service + * will make them appear in getpwuid(), thus we can release the /etc/passwd lock.
*/ + etc_passwd_lock = safe_close(etc_passwd_lock); + + (void) sd_notifyf(false, + "STATUS=Container running.\n" + "X_NSPAWN_LEADER_PID=" PID_FMT, *pid); + if (!arg_notify_ready) { + r = sd_notify(false, "READY=1\n"); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + } + + if (arg_kill_signal > 0) { + /* Try to kill the init system on SIGINT or SIGTERM */ + (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid)); + (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid)); + } else { + /* Immediately exit */ + (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + } + + (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + + r = sd_event_add_memory_pressure(event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + /* Exit when the child exits */ + (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid)); + + /* Retrieve the kmsg fifo allocated by inner child */ + fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0); + if (fd_kmsg_fifo < 0) + return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m"); + + if (arg_expose_ports) { + r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl); + if (r < 0) + return r; + + (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4); + (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6); + } + + if (arg_console_mode != CONSOLE_PIPE) { + _cleanup_close_ int fd = -EBADF; + PTYForwardFlags flags = 0; + + /* Retrieve the master pty allocated by inner child */ + fd = receive_one_fd(fd_inner_socket_pair[0], 0); + if (fd < 0) + return log_error_errno(fd, "Failed to 
receive master pty from the inner child: %m"); + + switch (arg_console_mode) { + + case CONSOLE_READ_ONLY: + flags |= PTY_FORWARD_READ_ONLY; + + _fallthrough_; + + case CONSOLE_INTERACTIVE: + flags |= PTY_FORWARD_IGNORE_VHANGUP; + + r = pty_forward_new(event, fd, flags, &forward); + if (r < 0) + return log_error_errno(r, "Failed to create PTY forwarder: %m"); + + if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX) + (void) pty_forward_set_width_height(forward, + arg_console_width, + arg_console_height); + break; + + default: + assert(arg_console_mode == CONSOLE_PASSIVE); + } + + *master = TAKE_FD(fd); + } + + fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + if (forward) { + char last_char = 0; + + (void) pty_forward_get_last_char(forward, &last_char); + forward = pty_forward_free(forward); + + if (!arg_quiet && last_char != '\n') + putc('\n', stdout); + } + + /* Kill if it is not dead yet anyway */ + if (!arg_register && !arg_keep_unit && bus) + terminate_scope(bus, arg_machine); + + /* Normally redundant, but better safe than sorry */ + (void) kill(*pid, SIGKILL); + + fd_kmsg_fifo = safe_close(fd_kmsg_fifo); + + if (arg_private_network) { + /* Move network interfaces back to the parent network namespace. We use `safe_fork` + * to avoid having to move the parent to the child network namespace. 
*/ + r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL); + if (r < 0) + return r; + + if (r == 0) { + _cleanup_close_ int parent_netns_fd = -EBADF; + + r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL); + if (r < 0) { + log_error_errno(r, "Failed to open parent network namespace: %m"); + _exit(EXIT_FAILURE); + } + + r = namespace_enter(-1, -1, child_netns_fd, -1, -1); + if (r < 0) { + log_error_errno(r, "Failed to enter child network namespace: %m"); + _exit(EXIT_FAILURE); + } + + /* Reverse network interfaces pair list so that interfaces get their initial name back. + * This is about ensuring interfaces get their old name back when being moved back. */ + arg_network_interfaces = strv_reverse(arg_network_interfaces); + + r = move_network_interfaces(parent_netns_fd, arg_network_interfaces); + if (r < 0) + log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m"); + + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + } + + r = wait_for_container(TAKE_PID(*pid), &container_status); + + /* Tell machined that we are gone. */ + if (bus) + (void) unregister_machine(bus, arg_machine); + + if (r < 0) + /* We failed to wait for the container, or the container exited abnormally. */ + return r; + if (r > 0 || container_status == CONTAINER_TERMINATED) { + /* r > 0 → The container exited with a non-zero status. + * As a special case, we need to replace 133 with a different value, + * because 133 is special-cased in the service file to reboot the container. + * otherwise → The container exited with zero status and a reboot was not requested. 
+ */ + if (r == EXIT_FORCE_RESTART) + r = EXIT_FAILURE; /* replace 133 with the general failure code */ + *ret = r; + return 0; /* finito */ + } + + /* CONTAINER_REBOOTED, loop again */ + + if (arg_keep_unit) { + /* Special handling if we are running as a service: instead of simply + * restarting the machine we want to restart the entire service, so let's + * inform systemd about this with the special exit code 133. The service + * file uses RestartForceExitStatus=133 so that this results in a full + * nspawn restart. This is necessary since we might have cgroup parameters + * set we want to have flushed out. */ + *ret = EXIT_FORCE_RESTART; + return 0; /* finito */ + } + + expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4); + expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6); + + (void) remove_veth_links(veth_name, arg_network_veth_extra); + *veth_created = false; + return 1; /* loop again */ +} + +static int initialize_rlimits(void) { + /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload + * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and + * container execution environments. 
*/ + + static const struct rlimit kernel_defaults[_RLIMIT_MAX] = { + [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_CORE] = { 0, RLIM_INFINITY }, + [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK }, + [RLIMIT_MSGQUEUE] = { 819200, 819200 }, + [RLIMIT_NICE] = { 0, 0 }, + [RLIMIT_NOFILE] = { 1024, 4096 }, + [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_RTPRIO] = { 0, 0 }, + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_STACK] = { 8388608, RLIM_INFINITY }, + + /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of + * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them + * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original + * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note + * that PID 1 changes a number of other resource limits during early initialization which is why we + * don't read the other limits from PID 1 but prefer the static table above. */ + }; + + int rl; + + for (rl = 0; rl < _RLIMIT_MAX; rl++) { + /* Let's only fill in what the user hasn't explicitly configured anyway */ + if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) { + const struct rlimit *v; + struct rlimit buffer; + + if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) { + /* For these two let's read the limits off PID 1. See above for an explanation. 
*/ + + if (prlimit(1, rl, NULL, &buffer) < 0) + return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl)); + + v = &buffer; + } else if (rl == RLIMIT_NOFILE) { + /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all + * userspace. Given that nspawn containers are often run without our PID 1, + * let's grant the containers a raised RLIMIT_NOFILE hard limit by default, + * so that container userspace gets similar resources as host userspace + * gets. */ + buffer = kernel_defaults[rl]; + buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE); + v = &buffer; + } else + v = kernel_defaults + rl; + + arg_rlimit[rl] = newdup(struct rlimit, v, 1); + if (!arg_rlimit[rl]) + return log_oom(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *k = NULL; + + (void) rlimit_format(arg_rlimit[rl], &k); + log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k); + } + } + + return 0; +} + +static int cant_be_in_netns(void) { + _cleanup_close_ int fd = -EBADF; + struct ucred ucred; + int r; + + /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting + * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a + * nice message. */ + + if (!arg_image) /* only matters if --image= is used, i.e.
we actually need to use loopback devices */ + return 0; + + fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate udev control socket: %m"); + + r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control"); + if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev."); + if (r < 0) + return log_error_errno(r, "Failed to connect socket to udev control socket: %m"); + + r = getpeercred(fd, &ucred); + if (r < 0) + return log_error_errno(r, "Failed to determine peer of udev control socket: %m"); + + r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET); + if (r < 0) + return log_error_errno(r, "Failed to determine network namespace of udev: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK."); + return 0; +} + +static int run(int argc, char *argv[]) { + bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false; + _cleanup_close_ int master = -EBADF; + _cleanup_fdset_free_ FDSet *fds = NULL; + int r, n_fd_passed, ret = EXIT_SUCCESS; + char veth_name[IFNAMSIZ] = ""; + struct ExposeArgs expose_args = {}; + _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; + char tmprootdir[] = "/tmp/nspawn-root-XXXXXX"; + _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL; + pid_t pid = 0; + + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + goto finish; + + if (geteuid() != 0) { + r = log_warning_errno(SYNTHETIC_ERRNO(EPERM), + argc >= 2 ? "Need to be root." 
: + "Need to be root (and some arguments are usually required).\nHint: try --help"); + goto finish; + } + + r = cant_be_in_netns(); + if (r < 0) + goto finish; + + r = initialize_rlimits(); + if (r < 0) + goto finish; + + r = load_oci_bundle(); + if (r < 0) + goto finish; + + r = determine_names(); + if (r < 0) + goto finish; + + r = load_settings(); + if (r < 0) + goto finish; + + /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have + * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to + * indicate that. */ + if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0) + arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE); + + r = cg_unified(); + if (r < 0) { + log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + goto finish; + } + + r = verify_arguments(); + if (r < 0) + goto finish; + + r = verify_network_interfaces_initialized(); + if (r < 0) + goto finish; + + /* Reapply environment settings. */ + (void) detect_unified_cgroup_hierarchy_from_environment(); + + /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if + * the result is closed. Note that the container payload child will reset signal mask+handler anyway, + * so just turning this off here means we only turn it off in nspawn itself, not any children. */ + (void) ignore_signals(SIGPIPE); + + n_fd_passed = sd_listen_fds(false); + if (n_fd_passed > 0) { + r = fdset_new_listen_fds(&fds, false); + if (r < 0) { + log_error_errno(r, "Failed to collect file descriptors: %m"); + goto finish; + } + } + + /* The "default" umask. This is appropriate for most file and directory + * operations performed by nspawn, and is the umask that will be used for + * the child. Functions like copy_devnodes() change the umask temporarily. 
*/ + umask(0022); + + if (arg_directory) { + assert(!arg_image); + + /* Safety precaution: let's not allow running images from the live host OS image, as long as + * /var from the host will propagate into container dynamically (because bad things happen if + * two systems write to the same /var). Let's allow it for the special cases where /var is + * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */ + if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) { + r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state."); + goto finish; + } + + if (arg_ephemeral) { + _cleanup_free_ char *np = NULL; + + r = chase_and_update(&arg_directory, 0); + if (r < 0) + goto finish; + + /* If the specified path is a mount point we generate the new snapshot immediately + * inside it under a random name. However if the specified path is not a mount point we + * create the new snapshot in the parent directory, just next to it. */ + r = path_is_mount_point(arg_directory, NULL, 0); + if (r < 0) { + log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory); + goto finish; + } + if (r > 0) + r = tempfn_random_child(arg_directory, "machine.", &np); + else + r = tempfn_random(arg_directory, "machine.", &np); + if (r < 0) { + log_error_errno(r, "Failed to generate name for directory snapshot: %m"); + goto finish; + } + + /* We take an exclusive lock on this image, since it's our private, ephemeral copy + * only owned by us and no one else. */ + r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock); + if (r < 0) { + log_error_errno(r, "Failed to lock %s: %m", np); + goto finish; + } + + { + BLOCK_SIGNALS(SIGINT); + r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np, + (arg_read_only ?
BTRFS_SNAPSHOT_READ_ONLY : 0) | + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_QUOTA | + BTRFS_SNAPSHOT_SIGINT); + } + if (r == -EINTR) { + log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np); + goto finish; + } + if (r < 0) { + log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory); + goto finish; + } + + free_and_replace(arg_directory, np); + remove_directory = true; + } else { + r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0); + if (r < 0) + goto finish; + + r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + if (r == -EBUSY) { + log_error_errno(r, "Directory tree %s is currently busy.", arg_directory); + goto finish; + } + if (r < 0) { + log_error_errno(r, "Failed to lock %s: %m", arg_directory); + goto finish; + } + + if (arg_template) { + r = chase_and_update(&arg_template, 0); + if (r < 0) + goto finish; + + { + BLOCK_SIGNALS(SIGINT); + r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory, + (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_QUOTA | + BTRFS_SNAPSHOT_SIGINT); + } + if (r == -EEXIST) + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, + "Directory %s already exists, not populating from template %s.", arg_directory, arg_template); + else if (r == -EINTR) { + log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory); + goto finish; + } else if (r < 0) { + log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template); + goto finish; + } else + log_full(arg_quiet ? 
LOG_DEBUG : LOG_INFO, + "Populated %s from template %s.", arg_directory, arg_template); + } + } + + if (arg_start_mode == START_BOOT) { + _cleanup_free_ char *b = NULL; + const char *p; + int check_os_release, is_os_tree; + + if (arg_pivot_root_new) { + b = path_join(arg_directory, arg_pivot_root_new); + if (!b) { + r = log_oom(); + goto finish; + } + + p = b; + } else + p = arg_directory; + + check_os_release = getenv_bool("SYSTEMD_NSPAWN_CHECK_OS_RELEASE"); + if (check_os_release < 0 && check_os_release != -ENXIO) { + r = log_error_errno(check_os_release, "Failed to parse $SYSTEMD_NSPAWN_CHECK_OS_RELEASE: %m"); + goto finish; + } + + is_os_tree = path_is_os_tree(p); + if (is_os_tree == 0 && check_os_release == 0) + log_debug("Directory %s is missing an os-release file, continuing anyway.", p); + else if (is_os_tree <= 0) { + r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p); + goto finish; + } + } else { + _cleanup_free_ char *p = NULL; + + if (arg_pivot_root_new) + p = path_join(arg_directory, arg_pivot_root_new, "/usr/"); + else + p = path_join(arg_directory, "/usr/"); + if (!p) { + r = log_oom(); + goto finish; + } + + if (laccess(p, F_OK) < 0) { + r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). 
Refusing.", arg_directory); + goto finish; + } + } + + } else { + DissectImageFlags dissect_image_flags = + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES; + assert(arg_image); + assert(!arg_template); + + r = chase_and_update(&arg_image, 0); + if (r < 0) + goto finish; + + if (arg_ephemeral) { + _cleanup_free_ char *np = NULL; + + r = tempfn_random(arg_image, "machine.", &np); + if (r < 0) { + log_error_errno(r, "Failed to generate name for image snapshot: %m"); + goto finish; + } + + /* Always take an exclusive lock on our own ephemeral copy. */ + r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock); + if (r < 0) { + r = log_error_errno(r, "Failed to create image lock: %m"); + goto finish; + } + + { + BLOCK_SIGNALS(SIGINT); + r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, + FS_NOCOW_FL, FS_NOCOW_FL, + COPY_REFLINK|COPY_CRTIME|COPY_SIGINT, + NULL, NULL); + } + if (r == -EINTR) { + log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np); + goto finish; + } + if (r < 0) { + r = log_error_errno(r, "Failed to copy image file: %m"); + goto finish; + } + + free_and_replace(arg_image, np); + remove_image = true; + } else { + r = image_path_lock(arg_image, (arg_read_only ? 
LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + if (r == -EBUSY) { + r = log_error_errno(r, "Disk image %s is currently busy.", arg_image); + goto finish; + } + if (r < 0) { + r = log_error_errno(r, "Failed to create image lock: %m"); + goto finish; + } + + r = verity_settings_load( + &arg_verity_settings, + arg_image, NULL, NULL); + if (r < 0) { + log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image); + goto finish; + } + + if (arg_verity_settings.data_path) + dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE; + } + + if (!mkdtemp(tmprootdir)) { + r = log_error_errno(errno, "Failed to create temporary directory: %m"); + goto finish; + } + + remove_tmprootdir = true; + + arg_directory = strdup(tmprootdir); + if (!arg_directory) { + r = log_oom(); + goto finish; + } + + r = loop_device_make_by_path( + arg_image, + arg_read_only ? O_RDONLY : O_RDWR, + /* sector_size= */ UINT32_MAX, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &loop); + if (r < 0) { + log_error_errno(r, "Failed to set up loopback block device: %m"); + goto finish; + } + + r = dissect_loop_device_and_warn( + loop, + &arg_verity_settings, + /* mount_options=*/ NULL, + arg_image_policy ?: &image_policy_container, + dissect_image_flags, + &dissected_image); + if (r == -ENOPKG) { + /* dissect_loop_device_and_warn() already printed a brief error message.
Extend on that with more details */ + log_notice("Note that the disk image needs to\n" + " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n" + " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n" + " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n" + " d) or contain a file system without a partition table\n" + "in order to be bootable with systemd-nspawn."); + goto finish; + } + if (r < 0) + goto finish; + + r = dissected_image_load_verity_sig_partition( + dissected_image, + loop->fd, + &arg_verity_settings); + if (r < 0) + goto finish; + + if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig) + log_notice("Note: image %s contains verity information, but no root hash specified and no embedded " + "root hash signature found! Proceeding without integrity checking.", arg_image); + + r = dissected_image_decrypt_interactively( + dissected_image, + NULL, + &arg_verity_settings, + 0); + if (r < 0) + goto finish; + + /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */ + if (remove_image && unlink(arg_image) >= 0) + remove_image = false; + + if (arg_architecture < 0) + arg_architecture = dissected_image_architecture(dissected_image); + } + + r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts); + if (r < 0) + goto finish; + + if (arg_console_mode < 0) + arg_console_mode = + isatty(STDIN_FILENO) > 0 && + isatty(STDOUT_FILENO) > 0 ? 
CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY; + + if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ + arg_quiet = true; + + if (!arg_quiet) + log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.", + arg_machine, arg_image ?: arg_directory); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + + r = make_reaper_process(true); + if (r < 0) { + log_error_errno(r, "Failed to become subreaper: %m"); + goto finish; + } + + if (arg_expose_ports) { + r = fw_ctx_new(&fw_ctx); + if (r < 0) { + log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m"); + goto finish; + } + expose_args.fw_ctx = fw_ctx; + } + for (;;) { + r = run_container(dissected_image, + fds, + veth_name, &veth_created, + &expose_args, &master, + &pid, &ret); + if (r <= 0) + break; + } + +finish: + (void) sd_notify(false, + r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." 
: + "STOPPING=1\nSTATUS=Terminating..."); + + if (pid > 0) + (void) kill(pid, SIGKILL); + + /* Try to flush whatever is still queued in the pty */ + if (master >= 0) { + (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0); + master = safe_close(master); + } + + if (pid > 0) + (void) wait_for_terminate(pid, NULL); + + pager_close(); + + if (remove_directory && arg_directory) { + int k; + + k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME); + if (k < 0) + log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory); + } + + if (remove_image && arg_image) { + if (unlink(arg_image) < 0) + log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image); + } + + if (remove_tmprootdir) { + if (rmdir(tmprootdir) < 0) + log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir); + } + + if (arg_machine) { + const char *p; + + p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); + (void) rm_rf(p, REMOVE_ROOT); + } + + expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4); + expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6); + + if (veth_created) + (void) remove_veth_links(veth_name, arg_network_veth_extra); + (void) remove_bridge(arg_network_zone); + + custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); + expose_port_free_all(arg_expose_ports); + rlimit_free_all(arg_rlimit); + device_node_array_free(arg_extra_nodes, arg_n_extra_nodes); + machine_credential_free_all(arg_credentials, arg_n_credentials); + + if (r < 0) + return r; + + return ret; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/nspawn/nspawn.h b/src/nspawn/nspawn.h new file mode 100644 index 0000000..27fb0b4 --- /dev/null +++ b/src/nspawn/nspawn.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int userns_lchown(const char *p, uid_t uid, gid_t gid); +int userns_mkdir(const char *root, 
const char *path, mode_t mode, uid_t uid, gid_t gid); diff --git a/src/nspawn/test-nspawn-tables.c b/src/nspawn/test-nspawn-tables.c new file mode 100644 index 0000000..daea469 --- /dev/null +++ b/src/nspawn/test-nspawn-tables.c @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "nspawn-settings.h" +#include "test-tables.h" +#include "tests.h" + +int main(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + test_table(resolv_conf_mode, RESOLV_CONF_MODE); + test_table(timezone_mode, TIMEZONE_MODE); + + return 0; +} diff --git a/src/nspawn/test-nspawn-util.c b/src/nspawn/test-nspawn-util.c new file mode 100644 index 0000000..08c8050 --- /dev/null +++ b/src/nspawn/test-nspawn-util.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "nspawn-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +TEST(systemd_installation_has_version) { + int r; + + FOREACH_STRING(version, "0", "231", STRINGIFY(PROJECT_VERSION), "999") { + r = systemd_installation_has_version(saved_argv[1], version); + assert_se(r >= 0); + log_info("%s has systemd >= %s: %s", + saved_argv[1] ?: "Current installation", version, yes_no(r)); + } +} + +/* This program can be called with a path to an installation root. 
+ * For example: build/test-nspawn-util /var/lib/machines/rawhide + */ +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/nspawn/test-patch-uid.c b/src/nspawn/test-patch-uid.c new file mode 100644 index 0000000..f8f44b0 --- /dev/null +++ b/src/nspawn/test-patch-uid.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "log.h" +#include "nspawn-patch-uid.h" +#include "user-util.h" +#include "string-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + uid_t shift, range; + int r; + + test_setup_logging(LOG_DEBUG); + + if (argc != 4) { + log_error("Expected PATH SHIFT RANGE parameters."); + return EXIT_FAILURE; + } + + r = parse_uid(argv[2], &shift); + if (r < 0) { + log_error_errno(r, "Failed to parse UID shift %s.", argv[2]); + return EXIT_FAILURE; + } + + r = parse_gid(argv[3], &range); + if (r < 0) { + log_error_errno(r, "Failed to parse UID range %s.", argv[3]); + return EXIT_FAILURE; + } + + r = path_patch_uid(argv[1], shift, range); + if (r < 0) { + log_error_errno(r, "Failed to patch directory tree: %m"); + return EXIT_FAILURE; + } + + log_info("Changed: %s", yes_no(r)); + + return EXIT_SUCCESS; +} diff --git a/src/nss-myhostname/meson.build b/src/nss-myhostname/meson.build new file mode 100644 index 0000000..53d8bda --- /dev/null +++ b/src/nss-myhostname/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +modules += [ + nss_template + { + 'name' : 'nss_myhostname', + 'conditions' : ['ENABLE_NSS_MYHOSTNAME'], + 'sources' : files('nss-myhostname.c'), + 'version-script' : meson.current_source_dir() / 'nss-myhostname.sym', + }, +] diff --git a/src/nss-myhostname/nss-myhostname.c b/src/nss-myhostname/nss-myhostname.c new file mode 100644 index 0000000..ed41730 --- /dev/null +++ b/src/nss-myhostname/nss-myhostname.c @@ -0,0 +1,523 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" 
+#include "hostname-util.h" +#include "local-addresses.h" +#include "macro.h" +#include "nss-util.h" +#include "resolve-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "string-util.h" + +/* We use 127.0.0.2 as IPv4 address. This has the advantage over + * 127.0.0.1 that it can be translated back to the local hostname. For + * IPv6 we use ::1 which unfortunately will not translate back to the + * hostname but instead something like "localhost" or so. */ + +#define LOCALADDRESS_IPV4 (htobe32(INADDR_LOCALADDRESS)) +#define LOCALADDRESS_IPV6 &in6addr_loopback + +NSS_GETHOSTBYNAME_PROTOTYPES(myhostname); +NSS_GETHOSTBYADDR_PROTOTYPES(myhostname); + +enum nss_status _nss_myhostname_gethostbyname4_r( + const char *name, + struct gaih_addrtuple **pat, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp) { + + struct gaih_addrtuple *r_tuple, *r_tuple_prev = NULL; + _cleanup_free_ struct local_address *addresses = NULL; + _cleanup_free_ char *hn = NULL; + const char *canonical = NULL; + int n_addresses = 0; + uint32_t local_address_ipv4; + size_t l, idx, ms; + char *r_name; + + PROTECT_ERRNO; + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + assert(name); + assert(pat); + assert(buffer); + assert(errnop); + assert(h_errnop); + + if (is_localhost(name)) { + /* We respond to 'localhost', so that /etc/hosts is optional */ + + canonical = "localhost"; + local_address_ipv4 = htobe32(INADDR_LOOPBACK); + + } else if (is_gateway_hostname(name)) { + + n_addresses = local_gateways(NULL, 0, AF_UNSPEC, &addresses); + if (n_addresses <= 0) + goto not_found; + + canonical = "_gateway"; + + } else if (is_outbound_hostname(name)) { + + n_addresses = local_outbounds(NULL, 0, AF_UNSPEC, &addresses); + if (n_addresses <= 0) + goto not_found; + + canonical = "_outbound"; + + } else { + hn = gethostname_malloc(); + if (!hn) { + UNPROTECT_ERRNO; + *errnop = ENOMEM; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_TRYAGAIN; + } + + /* We respond to our local 
hostname, our hostname suffixed with a single dot. */ + if (!streq(name, hn) && !streq_ptr(startswith(name, hn), ".")) + goto not_found; + + n_addresses = local_addresses(NULL, 0, AF_UNSPEC, &addresses); + if (n_addresses < 0) + n_addresses = 0; + + canonical = hn; + local_address_ipv4 = LOCALADDRESS_IPV4; + } + + l = strlen(canonical); + ms = ALIGN(l+1) + ALIGN(sizeof(struct gaih_addrtuple)) * (n_addresses > 0 ? n_addresses : 1 + socket_ipv6_is_enabled()); + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, fill in hostname */ + r_name = buffer; + memcpy(r_name, canonical, l+1); + idx = ALIGN(l+1); + + assert(n_addresses >= 0); + if (n_addresses == 0) { + /* Second, fill in IPv6 tuple */ + if (socket_ipv6_is_enabled()) { + r_tuple = (struct gaih_addrtuple*) (buffer + idx); + r_tuple->next = r_tuple_prev; + r_tuple->name = r_name; + r_tuple->family = AF_INET6; + memcpy(r_tuple->addr, LOCALADDRESS_IPV6, 16); + r_tuple->scopeid = 0; + + idx += ALIGN(sizeof(struct gaih_addrtuple)); + r_tuple_prev = r_tuple; + } + + /* Third, fill in IPv4 tuple */ + r_tuple = (struct gaih_addrtuple*) (buffer + idx); + r_tuple->next = r_tuple_prev; + r_tuple->name = r_name; + r_tuple->family = AF_INET; + *(uint32_t*) r_tuple->addr = local_address_ipv4; + r_tuple->scopeid = 0; + + idx += ALIGN(sizeof(struct gaih_addrtuple)); + r_tuple_prev = r_tuple; + } + + /* Fourth, fill actual addresses in, but in backwards order */ + for (int i = n_addresses; i > 0; i--) { + struct local_address *a = addresses + i - 1; + + r_tuple = (struct gaih_addrtuple*) (buffer + idx); + r_tuple->next = r_tuple_prev; + r_tuple->name = r_name; + r_tuple->family = a->family; + r_tuple->scopeid = a->family == AF_INET6 && in6_addr_is_link_local(&a->address.in6) ? 
a->ifindex : 0; + memcpy(r_tuple->addr, &a->address, 16); + + idx += ALIGN(sizeof(struct gaih_addrtuple)); + r_tuple_prev = r_tuple; + } + + /* Verify the size matches */ + assert(idx == ms); + + /* Nscd expects us to store the first record in **pat. */ + if (*pat) + **pat = *r_tuple_prev; + else + *pat = r_tuple_prev; + + if (ttlp) + *ttlp = 0; + + /* Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; + +not_found: + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; +} + +static enum nss_status fill_in_hostent( + const char *canonical, const char *additional, + int af, + struct local_address *addresses, unsigned n_addresses, + uint32_t local_address_ipv4, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp, + char **canonp) { + + size_t l_canonical, l_additional, idx, ms, alen; + char *r_addr, *r_name, *r_aliases, *r_alias = NULL, *r_addr_list; + struct local_address *a; + unsigned n, c; + + assert(canonical); + assert(IN_SET(af, AF_INET, AF_INET6)); + assert(result); + assert(buffer); + assert(errnop); + assert(h_errnop); + + PROTECT_ERRNO; + + alen = FAMILY_ADDRESS_SIZE(af); + + for (a = addresses, n = 0, c = 0; n < n_addresses; a++, n++) + if (af == a->family) + c++; + + l_canonical = strlen(canonical); + l_additional = strlen_ptr(additional); + ms = ALIGN(l_canonical+1)+ + (additional ? ALIGN(l_additional+1) : 0) + + sizeof(char*) + + (additional ? sizeof(char*) : 0) + + (c > 0 ? c : af == AF_INET ? 1 : socket_ipv6_is_enabled()) * ALIGN(alen) + + (c > 0 ? c+1 : af == AF_INET ? 
2 : (unsigned) socket_ipv6_is_enabled() + 1) * sizeof(char*); + + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, fill in hostnames */ + r_name = buffer; + memcpy(r_name, canonical, l_canonical+1); + idx = ALIGN(l_canonical+1); + + if (additional) { + r_alias = buffer + idx; + memcpy(r_alias, additional, l_additional+1); + idx += ALIGN(l_additional+1); + } + + /* Second, create aliases array */ + r_aliases = buffer + idx; + if (additional) { + ((char**) r_aliases)[0] = r_alias; + ((char**) r_aliases)[1] = NULL; + idx += 2*sizeof(char*); + } else { + ((char**) r_aliases)[0] = NULL; + idx += sizeof(char*); + } + + /* Third, add addresses */ + r_addr = buffer + idx; + if (c > 0) { + unsigned i = 0; + + for (a = addresses, n = 0; n < n_addresses; a++, n++) { + if (af != a->family) + continue; + + memcpy(r_addr + i*ALIGN(alen), &a->address, alen); + i++; + } + + assert(i == c); + idx += c*ALIGN(alen); + + } else if (af == AF_INET) { + *(uint32_t*) r_addr = local_address_ipv4; + idx += ALIGN(alen); + } else if (socket_ipv6_is_enabled()) { + memcpy(r_addr, LOCALADDRESS_IPV6, 16); + idx += ALIGN(alen); + } + + /* Fourth, add address pointer array */ + r_addr_list = buffer + idx; + if (c > 0) { + unsigned i; + + for (i = 0; i < c; i++) + ((char**) r_addr_list)[i] = r_addr + i*ALIGN(alen); + + ((char**) r_addr_list)[i] = NULL; + idx += (c+1) * sizeof(char*); + + } else if (af == AF_INET || socket_ipv6_is_enabled()) { + ((char**) r_addr_list)[0] = r_addr; + ((char**) r_addr_list)[1] = NULL; + idx += 2 * sizeof(char*); + } else { + ((char**) r_addr_list)[0] = NULL; + idx += sizeof(char*); + } + + /* Verify the size matches */ + assert(idx == ms); + + result->h_name = r_name; + result->h_aliases = (char**) r_aliases; + result->h_addrtype = af; + result->h_length = alen; + result->h_addr_list = (char**) r_addr_list; + + if (ttlp) + *ttlp = 0; + + if (canonp) + *canonp = r_name; + + /* 
Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_myhostname_gethostbyname3_r( + const char *name, + int af, + struct hostent *host, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp, + char **canonp) { + + _cleanup_free_ struct local_address *addresses = NULL; + const char *canonical, *additional = NULL; + _cleanup_free_ char *hn = NULL; + uint32_t local_address_ipv4 = 0; + int n_addresses = 0; + + PROTECT_ERRNO; + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + assert(name); + assert(host); + assert(buffer); + assert(errnop); + assert(h_errnop); + + if (af == AF_UNSPEC) + af = AF_INET; + + if (!IN_SET(af, AF_INET, AF_INET6)) { + UNPROTECT_ERRNO; + *errnop = EAFNOSUPPORT; + *h_errnop = NO_DATA; + return NSS_STATUS_UNAVAIL; + } + + if (af == AF_INET6 && !socket_ipv6_is_enabled()) + goto not_found; + + if (is_localhost(name)) { + + canonical = "localhost"; + local_address_ipv4 = htobe32(INADDR_LOOPBACK); + + } else if (is_gateway_hostname(name)) { + + n_addresses = local_gateways(NULL, 0, af, &addresses); + if (n_addresses <= 0) + goto not_found; + + canonical = "_gateway"; + + } else if (is_outbound_hostname(name)) { + + n_addresses = local_outbounds(NULL, 0, af, &addresses); + if (n_addresses <= 0) + goto not_found; + + canonical = "_outbound"; + + } else { + hn = gethostname_malloc(); + if (!hn) { + UNPROTECT_ERRNO; + *errnop = ENOMEM; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_TRYAGAIN; + } + + if (!streq(name, hn) && !streq_ptr(startswith(name, hn), ".")) + goto not_found; + + n_addresses = local_addresses(NULL, 0, af, &addresses); + if (n_addresses < 0) + n_addresses = 0; + + canonical = hn; + additional = n_addresses <= 0 && af == AF_INET6 ? 
"localhost" : NULL; + local_address_ipv4 = LOCALADDRESS_IPV4; + } + + UNPROTECT_ERRNO; + + return fill_in_hostent( + canonical, additional, + af, + addresses, n_addresses, + local_address_ipv4, + host, + buffer, buflen, + errnop, h_errnop, + ttlp, + canonp); + +not_found: + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; +} + +enum nss_status _nss_myhostname_gethostbyaddr2_r( + const void* addr, socklen_t len, + int af, + struct hostent *host, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp) { + + const char *canonical = NULL, *additional = NULL; + uint32_t local_address_ipv4 = LOCALADDRESS_IPV4; + _cleanup_free_ struct local_address *addresses = NULL; + _cleanup_free_ char *hn = NULL; + int n_addresses = 0; + struct local_address *a; + bool additional_from_hostname = false; + unsigned n; + + PROTECT_ERRNO; + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + assert(addr); + assert(host); + assert(buffer); + assert(errnop); + assert(h_errnop); + + if (!IN_SET(af, AF_INET, AF_INET6)) { + UNPROTECT_ERRNO; + *errnop = EAFNOSUPPORT; + *h_errnop = NO_DATA; + return NSS_STATUS_UNAVAIL; + } + + if (len != FAMILY_ADDRESS_SIZE(af)) { + UNPROTECT_ERRNO; + *errnop = EINVAL; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; + } + + if (af == AF_INET) { + if ((*(uint32_t*) addr) == LOCALADDRESS_IPV4) + goto found; + + if ((*(uint32_t*) addr) == htobe32(INADDR_LOOPBACK)) { + canonical = "localhost"; + local_address_ipv4 = htobe32(INADDR_LOOPBACK); + goto found; + } + + } else { + assert(af == AF_INET6); + + if (!socket_ipv6_is_enabled()) + goto not_found; + + if (memcmp(addr, LOCALADDRESS_IPV6, 16) == 0) { + canonical = "localhost"; + additional_from_hostname = true; + goto found; + } + } + + n_addresses = local_addresses(NULL, 0, af, &addresses); + for (a = addresses, n = 0; (int) n < n_addresses; n++, a++) + if (memcmp(addr, &a->address, FAMILY_ADDRESS_SIZE(af)) == 0) + goto found; + + addresses = mfree(addresses); + + n_addresses = 
local_gateways(NULL, 0, af, &addresses); + for (a = addresses, n = 0; (int) n < n_addresses; n++, a++) + if (memcmp(addr, &a->address, FAMILY_ADDRESS_SIZE(af)) == 0) { + canonical = "_gateway"; + goto found; + } + +not_found: + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; + +found: + if (!canonical || additional_from_hostname) { + hn = gethostname_malloc(); + if (!hn) { + UNPROTECT_ERRNO; + *errnop = ENOMEM; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_TRYAGAIN; + } + + if (!canonical) + canonical = hn; + else + additional = hn; + } + + UNPROTECT_ERRNO; + return fill_in_hostent( + canonical, additional, + af, + addresses, n_addresses, + local_address_ipv4, + host, + buffer, buflen, + errnop, h_errnop, + ttlp, + NULL); +} + +NSS_GETHOSTBYNAME_FALLBACKS(myhostname); +NSS_GETHOSTBYADDR_FALLBACKS(myhostname); diff --git a/src/nss-myhostname/nss-myhostname.sym b/src/nss-myhostname/nss-myhostname.sym new file mode 100644 index 0000000..21ab637 --- /dev/null +++ b/src/nss-myhostname/nss-myhostname.sym @@ -0,0 +1,19 @@ +/*** + SPDX-License-Identifier: LGPL-2.1-or-later + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+***/ + +{ +global: + _nss_myhostname_gethostbyname_r; + _nss_myhostname_gethostbyname2_r; + _nss_myhostname_gethostbyname3_r; + _nss_myhostname_gethostbyname4_r; + _nss_myhostname_gethostbyaddr_r; + _nss_myhostname_gethostbyaddr2_r; +local: *; +}; diff --git a/src/nss-mymachines/meson.build b/src/nss-mymachines/meson.build new file mode 100644 index 0000000..3e10325 --- /dev/null +++ b/src/nss-mymachines/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +modules += [ + nss_template + { + 'name' : 'nss_mymachines', + 'conditions' : ['ENABLE_NSS_MYMACHINES'], + 'sources' : files('nss-mymachines.c'), + 'version-script' : meson.current_source_dir() / 'nss-mymachines.sym', + }, +] diff --git a/src/nss-mymachines/nss-mymachines.c b/src/nss-mymachines/nss-mymachines.c new file mode 100644 index 0000000..c64e79b --- /dev/null +++ b/src/nss-mymachines/nss-mymachines.c @@ -0,0 +1,440 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-bus.h" +#include "sd-login.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-locator.h" +#include "env-util.h" +#include "errno-util.h" +#include "format-util.h" +#include "hostname-util.h" +#include "in-addr-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "nss-util.h" +#include "signal-util.h" +#include "string-util.h" + +static void setup_logging_once(void) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + assert_se(pthread_once(&once, log_parse_environment_variables) == 0); +} + +#define NSS_ENTRYPOINT_BEGIN \ + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); \ + setup_logging_once() + +NSS_GETHOSTBYNAME_PROTOTYPES(mymachines); +NSS_GETPW_PROTOTYPES(mymachines); +NSS_GETGR_PROTOTYPES(mymachines); + +static int count_addresses(sd_bus_message *m, int af, unsigned *ret) { + unsigned c = 0; + int r; + + assert(m); + assert(ret); + + while ((r = sd_bus_message_enter_container(m, 'r', "iay")) > 0) { + int family; + + r = 
sd_bus_message_read(m, "i", &family); + if (r < 0) + return r; + + r = sd_bus_message_skip(m, "ay"); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + if (af != AF_UNSPEC && family != af) + continue; + + c++; + } + if (r < 0) + return r; + + r = sd_bus_message_rewind(m, false); + if (r < 0) + return r; + + *ret = c; + return 0; +} + +static bool avoid_deadlock(void) { + + /* Check whether this lookup might have a chance of deadlocking because we are called from the service manager + * code activating systemd-machined.service. After all, we shouldn't synchronously do lookups to + * systemd-machined if we are required to finish before it can be started. This of course won't detect all + * possible dead locks of this kind, but it should work for the most obvious cases. */ + + if (geteuid() != 0) /* Ignore the env vars unless we are privileged. */ + return false; + + return streq_ptr(getenv("SYSTEMD_ACTIVATION_UNIT"), "systemd-machined.service") && + streq_ptr(getenv("SYSTEMD_ACTIVATION_SCOPE"), "system"); +} + +enum nss_status _nss_mymachines_gethostbyname4_r( + const char *name, + struct gaih_addrtuple **pat, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp) { + + struct gaih_addrtuple *r_tuple, *r_tuple_first = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ int *ifindices = NULL; + _cleanup_free_ char *class = NULL; + size_t l, ms, idx; + unsigned i = 0, c = 0; + char *r_name; + int n_ifindices, r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(pat); + assert(buffer); + assert(errnop); + assert(h_errnop); + + r = sd_machine_get_class(name, &class); + if (r < 0) + goto fail; + if (!streq(class, "container")) { + r = -ENOTTY; + goto fail; + } + + n_ifindices = sd_machine_get_ifindices(name, &ifindices); + if (n_ifindices < 0) { + r = n_ifindices; + goto fail; + } + + if 
(avoid_deadlock()) { + r = -EDEADLK; + goto fail; + } + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = bus_call_method(bus, bus_machine_mgr, "GetMachineAddresses", NULL, &reply, "s", name); + if (r < 0) + goto fail; + + r = sd_bus_message_enter_container(reply, 'a', "(iay)"); + if (r < 0) + goto fail; + + r = count_addresses(reply, AF_UNSPEC, &c); + if (r < 0) + goto fail; + + if (c <= 0) { + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; + } + + l = strlen(name); + ms = ALIGN(l+1) + ALIGN(sizeof(struct gaih_addrtuple)) * c; + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, append name */ + r_name = buffer; + memcpy(r_name, name, l+1); + idx = ALIGN(l+1); + + /* Second, append addresses */ + r_tuple_first = (struct gaih_addrtuple*) (buffer + idx); + while ((r = sd_bus_message_enter_container(reply, 'r', "iay")) > 0) { + int family; + const void *a; + size_t sz; + + r = sd_bus_message_read(reply, "i", &family); + if (r < 0) + goto fail; + + r = sd_bus_message_read_array(reply, 'y', &a, &sz); + if (r < 0) + goto fail; + + r = sd_bus_message_exit_container(reply); + if (r < 0) + goto fail; + + if (!IN_SET(family, AF_INET, AF_INET6)) { + r = -EAFNOSUPPORT; + goto fail; + } + + if (sz != FAMILY_ADDRESS_SIZE(family)) { + r = -EINVAL; + goto fail; + } + + r_tuple = (struct gaih_addrtuple*) (buffer + idx); + r_tuple->next = i == c-1 ? NULL : (struct gaih_addrtuple*) ((char*) r_tuple + ALIGN(sizeof(struct gaih_addrtuple))); + r_tuple->name = r_name; + r_tuple->family = family; + r_tuple->scopeid = n_ifindices == 1 ? 
ifindices[0] : 0; + memcpy(r_tuple->addr, a, sz); + + idx += ALIGN(sizeof(struct gaih_addrtuple)); + i++; + } + + assert(i == c); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + goto fail; + + assert(idx == ms); + + if (*pat) + **pat = *r_tuple_first; + else + *pat = r_tuple_first; + + if (ttlp) + *ttlp = 0; + + /* Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; + +fail: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; +} + +enum nss_status _nss_mymachines_gethostbyname3_r( + const char *name, + int af, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp, + char **canonp) { /* Fetches the addresses of machine 'name' for family 'af' through the machine manager's GetMachineAddresses bus method and lays out a struct hostent inside the caller-supplied 'buffer'. */ + + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *class = NULL; + unsigned c = 0, i = 0; + char *r_name, *r_aliases, *r_addr, *r_addr_list; + size_t l, idx, ms, alen; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(result); + assert(buffer); + assert(errnop); + assert(h_errnop); + + if (af == AF_UNSPEC) + af = AF_INET; /* an unspecified family is treated as IPv4 */ + + if (af != AF_INET && af != AF_INET6) { + r = -EAFNOSUPPORT; + goto fail; + } + + r = sd_machine_get_class(name, &class); + if (r < 0) + goto fail; + if (!streq(class, "container")) { /* only machines whose class is "container" are answered */ + r = -ENOTTY; + goto fail; + } + + if (avoid_deadlock()) { + r = -EDEADLK; + goto fail; + } + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = bus_call_method(bus, bus_machine_mgr, "GetMachineAddresses", NULL, &reply, "s", name); + if (r < 0) + goto fail; + + r = sd_bus_message_enter_container(reply, 'a', "(iay)"); + if (r < 0) + goto fail; + + r = count_addresses(reply, af, &c); /* c = number of addresses of family af in the reply */ + if (r < 0) + goto fail; + + if (c <= 0) { + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; + } + + alen =
FAMILY_ADDRESS_SIZE(af); + l = strlen(name); + + ms = ALIGN(l+1) + c * ALIGN(alen) + (c+2) * sizeof(char*); /* name + c addresses + (1 alias-list terminator, c address pointers, 1 address-list terminator) */ + + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, append name */ + r_name = buffer; + memcpy(r_name, name, l+1); + idx = ALIGN(l+1); + + /* Second, create aliases array (empty: only the terminating NULL) */ + r_aliases = buffer + idx; + ((char**) r_aliases)[0] = NULL; + idx += sizeof(char*); + + /* Third, append addresses */ + r_addr = buffer + idx; + while ((r = sd_bus_message_enter_container(reply, 'r', "iay")) > 0) { + int family; + const void *a; + size_t sz; + + r = sd_bus_message_read(reply, "i", &family); + if (r < 0) + goto fail; + + r = sd_bus_message_read_array(reply, 'y', &a, &sz); + if (r < 0) + goto fail; + + r = sd_bus_message_exit_container(reply); + if (r < 0) + goto fail; + + if (family != af) /* skip other families; count_addresses() was given the same af when computing c */ + continue; + + if (sz != alen) { + r = -EINVAL; + goto fail; + } + + memcpy(r_addr + i*ALIGN(alen), a, alen); + i++; + } + + assert(i == c); + idx += c * ALIGN(alen); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + goto fail; + + /* Fourth, append address pointer array */ + r_addr_list = buffer + idx; + for (i = 0; i < c; i++) + ((char**) r_addr_list)[i] = r_addr + i*ALIGN(alen); + + ((char**) r_addr_list)[i] = NULL; + idx += (c+1) * sizeof(char*); + + assert(idx == ms); /* everything accounted for in ms has been written */ + + result->h_name = r_name; + result->h_aliases = (char**) r_aliases; + result->h_addrtype = af; + result->h_length = alen; + result->h_addr_list = (char**) r_addr_list; + + if (ttlp) + *ttlp = 0; + + if (canonp) + *canonp = r_name; + + /* Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; + +fail: /* r holds a negative errno-style code on every path that jumps here */ + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; +} + +NSS_GETHOSTBYNAME_FALLBACKS(mymachines); + +enum nss_status _nss_mymachines_getpwnam_r( 
+ const char *name, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop) { + + return NSS_STATUS_NOTFOUND; +} + +enum nss_status _nss_mymachines_getpwuid_r( + uid_t uid, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop) { + + return NSS_STATUS_NOTFOUND; +} + +enum nss_status _nss_mymachines_getgrnam_r( + const char *name, + struct group *gr, + char *buffer, size_t buflen, + int *errnop) { + + return NSS_STATUS_NOTFOUND; +} + +enum nss_status _nss_mymachines_getgrgid_r( + gid_t gid, + struct group *gr, + char *buffer, size_t buflen, + int *errnop) { + + return NSS_STATUS_NOTFOUND; +} diff --git a/src/nss-mymachines/nss-mymachines.sym b/src/nss-mymachines/nss-mymachines.sym new file mode 100644 index 0000000..258244e --- /dev/null +++ b/src/nss-mymachines/nss-mymachines.sym @@ -0,0 +1,21 @@ +/*** + SPDX-License-Identifier: LGPL-2.1-or-later + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+***/ + +{ +global: + _nss_mymachines_gethostbyname_r; + _nss_mymachines_gethostbyname2_r; + _nss_mymachines_gethostbyname3_r; + _nss_mymachines_gethostbyname4_r; + _nss_mymachines_getpwnam_r; + _nss_mymachines_getpwuid_r; + _nss_mymachines_getgrnam_r; + _nss_mymachines_getgrgid_r; +local: *; +}; diff --git a/src/nss-resolve/meson.build b/src/nss-resolve/meson.build new file mode 100644 index 0000000..1fae426 --- /dev/null +++ b/src/nss-resolve/meson.build @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +modules += [ + nss_template + { + 'name' : 'nss_resolve', + 'conditions' : ['ENABLE_NSS_RESOLVE'], + 'sources' : files('nss-resolve.c'), + 'version-script' : meson.current_source_dir() / 'nss-resolve.sym', + 'include_directories' : includes + + include_directories('../resolve'), + }, +] diff --git a/src/nss-resolve/nss-resolve.c b/src/nss-resolve/nss-resolve.c new file mode 100644 index 0000000..c4e02bc --- /dev/null +++ b/src/nss-resolve/nss-resolve.c @@ -0,0 +1,759 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "env-util.h" +#include "errno-util.h" +#include "glyph-util.h" +#include "in-addr-util.h" +#include "macro.h" +#include "nss-util.h" +#include "resolved-def.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "varlink.h" + +static JsonDispatchFlags json_dispatch_flags = 0; + +static void setup_logging(void) { + log_parse_environment_variables(); + + if (DEBUG_LOGGING) + json_dispatch_flags = JSON_LOG; +} + +static void setup_logging_once(void) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + assert_se(pthread_once(&once, setup_logging) == 0); +} + +#define NSS_ENTRYPOINT_BEGIN \ + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); \ + setup_logging_once() + +NSS_GETHOSTBYNAME_PROTOTYPES(resolve); +NSS_GETHOSTBYADDR_PROTOTYPES(resolve); + +static bool error_shall_fallback(const char *error_id) { + /* The Varlink errors where we 
shall signal "please fallback" back to the NSS stack, so that some + * fallback module can be loaded. (These are mostly all Varlink-internal errors, as apparently we + * then were unable to even do IPC with systemd-resolved.) */ + return STR_IN_SET(error_id, + VARLINK_ERROR_DISCONNECTED, + VARLINK_ERROR_TIMEOUT, + VARLINK_ERROR_PROTOCOL, + VARLINK_ERROR_INTERFACE_NOT_FOUND, + VARLINK_ERROR_METHOD_NOT_FOUND, + VARLINK_ERROR_METHOD_NOT_IMPLEMENTED); +} + +static bool error_shall_try_again(const char *error_id) { + /* The Varlink errors where we shall signal "can't answer now but might be able to later" back to the + * NSS stack. These are all errors that indicate lack of configuration or network problems. */ + return STR_IN_SET(error_id, + "io.systemd.Resolve.NoNameServers", + "io.systemd.Resolve.QueryTimedOut", + "io.systemd.Resolve.MaxAttemptsReached", + "io.systemd.Resolve.NetworkDown"); +} + +static int connect_to_resolved(Varlink **ret) { + _cleanup_(varlink_unrefp) Varlink *link = NULL; + int r; + + r = varlink_connect_address(&link, "/run/systemd/resolve/io.systemd.Resolve"); + if (r < 0) + return r; + + r = varlink_set_relative_timeout(link, SD_RESOLVED_QUERY_TIMEOUT_USEC); + if (r < 0) + return r; + + *ret = TAKE_PTR(link); + return 0; +} + +static uint32_t ifindex_to_scopeid(int family, const void *a, int ifindex) { + struct in6_addr in6; + + if (family != AF_INET6 || ifindex == 0) + return 0; + + /* Some apps can't deal with the scope ID attached to non-link-local addresses. Hence, let's suppress that. */ + + assert(sizeof(in6) == FAMILY_ADDRESS_SIZE(AF_INET6)); + memcpy(&in6, a, sizeof(struct in6_addr)); + + return in6_addr_is_link_local(&in6) ? 
ifindex : 0; +} + +static int json_dispatch_ifindex(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { /* Stores the integer JSON field as an interface index in the int that userdata points to; returns 0, or the json_log() result if the field is rejected. */ + int *ifi = ASSERT_PTR(userdata); + int64_t t; + + assert(variant); + + if (!json_variant_is_integer(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name)); + + t = json_variant_integer(variant); + if (t < 0 || t > INT_MAX) /* reject negative values too, as json_dispatch_family() does, so the cast below can never narrow */ + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is out of bounds for an interface index.", strna(name)); + + *ifi = (int) t; + return 0; +} + +static int json_dispatch_family(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + int *family = ASSERT_PTR(userdata); + int64_t t; + + assert(variant); + + if (!json_variant_is_integer(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name)); + + t = json_variant_integer(variant); + if (t < 0 || t > INT_MAX) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid family.", strna(name)); + + *family = (int) t; + return 0; +} + +typedef struct ResolveHostnameReply { + JsonVariant *addresses; + char *name; + uint64_t flags; +} ResolveHostnameReply; + +static void resolve_hostname_reply_destroy(ResolveHostnameReply *p) { + assert(p); + + json_variant_unref(p->addresses); + free(p->name); +} + +static const JsonDispatch resolve_hostname_reply_dispatch_table[] = { + { "addresses", JSON_VARIANT_ARRAY, json_dispatch_variant, offsetof(ResolveHostnameReply, addresses), JSON_MANDATORY }, + { "name", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ResolveHostnameReply, name), 0 }, + { "flags", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(ResolveHostnameReply, flags), 0 }, + {} +}; + +typedef struct AddressParameters { + int ifindex; + int family; + union in_addr_union address; + size_t address_size; +} AddressParameters; + +static int 
/* Maps one boolean environment variable to a query flag.
 *
 * Returns 'flag' if the variable 'name' parses to the boolean 'value', and 0 in every other case.
 * Parse errors are ignored; they are logged at debug level unless the error is -ENXIO
 * (NOTE(review): presumably "variable not set" - confirm against getenv_bool_secure()). */
static uint64_t query_flag(
                const char *name,
                const int value,
                uint64_t flag) {

        int b = getenv_bool_secure(name);

        if (b < 0) {
                if (b != -ENXIO)
                        log_debug_errno(b, "Failed to parse $%s, ignoring.", name);

                return 0;
        }

        if (b != value)
                return 0;

        return flag;
}
*/ + return query_flag("SYSTEMD_NSS_RESOLVE_VALIDATE", 0, SD_RESOLVED_NO_VALIDATE) | + query_flag("SYSTEMD_NSS_RESOLVE_SYNTHESIZE", 0, SD_RESOLVED_NO_SYNTHESIZE) | + query_flag("SYSTEMD_NSS_RESOLVE_CACHE", 0, SD_RESOLVED_NO_CACHE) | + query_flag("SYSTEMD_NSS_RESOLVE_ZONE", 0, SD_RESOLVED_NO_ZONE) | + query_flag("SYSTEMD_NSS_RESOLVE_TRUST_ANCHOR", 0, SD_RESOLVED_NO_TRUST_ANCHOR) | + query_flag("SYSTEMD_NSS_RESOLVE_NETWORK", 0, SD_RESOLVED_NO_NETWORK); +} + +enum nss_status _nss_resolve_gethostbyname4_r( + const char *name, + struct gaih_addrtuple **pat, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp) { + + _cleanup_(varlink_unrefp) Varlink *link = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *cparams = NULL; + _cleanup_(resolve_hostname_reply_destroy) ResolveHostnameReply p = {}; + JsonVariant *rparams, *entry; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(pat); + assert(buffer); + assert(errnop); + assert(h_errnop); + + r = connect_to_resolved(&link); + if (r < 0) + goto fail; + + r = json_build(&cparams, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(name)), + JSON_BUILD_PAIR("flags", JSON_BUILD_UNSIGNED(query_flags())))); + if (r < 0) + goto fail; + + /* Return NSS_STATUS_UNAVAIL when communication with systemd-resolved fails, allowing falling + * back to other nss modules. Treat all other error conditions as NOTFOUND. This includes + * DNSSEC errors and suchlike. (We don't use UNAVAIL in this case so that the nsswitch.conf + * configuration can distinguish such executed but negative replies from complete failure to + * talk to resolved). 
*/ + const char *error_id; + r = varlink_call(link, "io.systemd.Resolve.ResolveHostname", cparams, &rparams, &error_id, NULL); + if (r < 0) + goto fail; + if (!isempty(error_id)) { + if (error_shall_try_again(error_id)) + goto try_again; + if (error_shall_fallback(error_id)) + goto fail; + if (streq(error_id, "io.systemd.Resolve.NoSuchResourceRecord")) + goto no_data; + goto not_found; + } + + r = json_dispatch(rparams, resolve_hostname_reply_dispatch_table, json_dispatch_flags, &p); + if (r < 0) + goto fail; + if (json_variant_is_blank_object(p.addresses)) + goto not_found; + + size_t n_addresses = 0; + JSON_VARIANT_ARRAY_FOREACH(entry, p.addresses) { + AddressParameters q = {}; + + r = json_dispatch(entry, address_parameters_dispatch_table, json_dispatch_flags, &q); + if (r < 0) + goto fail; + + if (!IN_SET(q.family, AF_INET, AF_INET6)) + continue; + + if (q.address_size != FAMILY_ADDRESS_SIZE(q.family)) { + r = -EINVAL; + goto fail; + } + + n_addresses++; + } + + const char *canonical = p.name ?: name; + size_t l = strlen(canonical); + size_t idx, ms = ALIGN(l+1) + ALIGN(sizeof(struct gaih_addrtuple)) * n_addresses; + + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, append name */ + char *r_name = buffer; + memcpy(r_name, canonical, l + 1); + idx = ALIGN(l + 1); + + /* Second, append addresses */ + struct gaih_addrtuple *r_tuple = NULL, + *r_tuple_first = (struct gaih_addrtuple*) (buffer + idx); + + JSON_VARIANT_ARRAY_FOREACH(entry, p.addresses) { + AddressParameters q = {}; + + r = json_dispatch(entry, address_parameters_dispatch_table, json_dispatch_flags, &q); + if (r < 0) + goto fail; + + if (!IN_SET(q.family, AF_INET, AF_INET6)) + continue; + + r_tuple = (struct gaih_addrtuple*) (buffer + idx); + r_tuple->next = (struct gaih_addrtuple*) ((char*) r_tuple + ALIGN(sizeof(struct gaih_addrtuple))); + r_tuple->name = r_name; + r_tuple->family = q.family; + r_tuple->scopeid = 
ifindex_to_scopeid(q.family, &q.address, q.ifindex); + memcpy(r_tuple->addr, &q.address, q.address_size); + + idx += ALIGN(sizeof(struct gaih_addrtuple)); + } + + assert(r_tuple); /* We had at least one address, so r_tuple must be set */ + r_tuple->next = NULL; /* Override last next pointer */ + + assert(idx == ms); + + if (*pat) + **pat = *r_tuple_first; + else + *pat = r_tuple_first; + + if (ttlp) + *ttlp = 0; + + /* Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; + +fail: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; + +not_found: + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; + +no_data: + *h_errnop = NO_DATA; + return NSS_STATUS_NOTFOUND; + +try_again: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = TRY_AGAIN; + return NSS_STATUS_TRYAGAIN; +} + +enum nss_status _nss_resolve_gethostbyname3_r( + const char *name, + int af, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp, + char **canonp) { + + _cleanup_(varlink_unrefp) Varlink *link = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *cparams = NULL; + _cleanup_(resolve_hostname_reply_destroy) ResolveHostnameReply p = {}; + JsonVariant *rparams, *entry; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(result); + assert(buffer); + assert(errnop); + assert(h_errnop); + + if (af == AF_UNSPEC) + af = AF_INET; + + if (!IN_SET(af, AF_INET, AF_INET6)) { + r = -EAFNOSUPPORT; + goto fail; + } + + r = connect_to_resolved(&link); + if (r < 0) + goto fail; + + r = json_build(&cparams, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("name", JSON_BUILD_STRING(name)), + JSON_BUILD_PAIR("family", JSON_BUILD_INTEGER(af)), + JSON_BUILD_PAIR("flags", JSON_BUILD_UNSIGNED(query_flags())))); + if (r < 0) + goto fail; + + const char *error_id; + r = 
varlink_call(link, "io.systemd.Resolve.ResolveHostname", cparams, &rparams, &error_id, NULL); + if (r < 0) + goto fail; + if (!isempty(error_id)) { + if (error_shall_try_again(error_id)) + goto try_again; + if (error_shall_fallback(error_id)) + goto fail; + if (streq(error_id, "io.systemd.Resolve.NoSuchResourceRecord")) + goto no_data; + goto not_found; + } + + r = json_dispatch(rparams, resolve_hostname_reply_dispatch_table, json_dispatch_flags, &p); + if (r < 0) + goto fail; + if (json_variant_is_blank_object(p.addresses)) + goto not_found; + + size_t n_addresses = 0; + JSON_VARIANT_ARRAY_FOREACH(entry, p.addresses) { + AddressParameters q = {}; + + r = json_dispatch(entry, address_parameters_dispatch_table, json_dispatch_flags, &q); + if (r < 0) + goto fail; + + if (!IN_SET(q.family, AF_INET, AF_INET6)) + continue; + + if (q.address_size != FAMILY_ADDRESS_SIZE(q.family)) { + r = -EINVAL; + goto fail; + } + + n_addresses++; + } + + const char *canonical = p.name ?: name; + + size_t alen = FAMILY_ADDRESS_SIZE(af); + size_t l = strlen(canonical); + + size_t idx, ms = ALIGN(l + 1) + n_addresses * ALIGN(alen) + (n_addresses + 2) * sizeof(char*); + + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, append name */ + char *r_name = buffer; + memcpy(r_name, canonical, l+1); + idx = ALIGN(l+1); + + /* Second, create empty aliases array */ + char *r_aliases = buffer + idx; + ((char**) r_aliases)[0] = NULL; + idx += sizeof(char*); + + /* Third, append addresses */ + char *r_addr = buffer + idx; + + size_t i = 0; + JSON_VARIANT_ARRAY_FOREACH(entry, p.addresses) { + AddressParameters q = {}; + + r = json_dispatch(entry, address_parameters_dispatch_table, json_dispatch_flags, &q); + if (r < 0) + goto fail; + + if (q.family != af) + continue; + + if (q.address_size != alen) { + r = -EINVAL; + goto fail; + } + + memcpy(r_addr + i*ALIGN(alen), &q.address, alen); + i++; + } + + assert(i == 
n_addresses); + idx += n_addresses * ALIGN(alen); + + /* Fourth, append address pointer array */ + char *r_addr_list = buffer + idx; + for (i = 0; i < n_addresses; i++) + ((char**) r_addr_list)[i] = r_addr + i*ALIGN(alen); + + ((char**) r_addr_list)[i] = NULL; + idx += (n_addresses + 1) * sizeof(char*); + + assert(idx == ms); + + result->h_name = r_name; + result->h_aliases = (char**) r_aliases; + result->h_addrtype = af; + result->h_length = alen; + result->h_addr_list = (char**) r_addr_list; + + if (ttlp) + *ttlp = 0; + + if (canonp) + *canonp = r_name; + + /* Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; + +fail: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; + +not_found: + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; + +no_data: + *h_errnop = NO_DATA; + return NSS_STATUS_NOTFOUND; + +try_again: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = TRY_AGAIN; + return NSS_STATUS_TRYAGAIN; +} + +typedef struct ResolveAddressReply { + JsonVariant *names; + uint64_t flags; +} ResolveAddressReply; + +static void resolve_address_reply_destroy(ResolveAddressReply *p) { + assert(p); + + json_variant_unref(p->names); +} + +static const JsonDispatch resolve_address_reply_dispatch_table[] = { + { "names", JSON_VARIANT_ARRAY, json_dispatch_variant, offsetof(ResolveAddressReply, names), JSON_MANDATORY }, + { "flags", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(ResolveAddressReply, flags), 0 }, + {} +}; + +typedef struct NameParameters { + int ifindex; + char *name; +} NameParameters; + +static void name_parameters_destroy(NameParameters *p) { + assert(p); + + free(p->name); +} + +static const JsonDispatch name_parameters_dispatch_table[] = { + { "ifindex", JSON_VARIANT_INTEGER, json_dispatch_ifindex, offsetof(NameParameters, ifindex), 0 }, + { "name", 
JSON_VARIANT_STRING, json_dispatch_string, offsetof(NameParameters, name), JSON_MANDATORY }, + {} +}; + +enum nss_status _nss_resolve_gethostbyaddr2_r( + const void* addr, socklen_t len, + int af, + struct hostent *result, + char *buffer, size_t buflen, + int *errnop, int *h_errnop, + int32_t *ttlp) { + + _cleanup_(varlink_unrefp) Varlink *link = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *cparams = NULL; + _cleanup_(resolve_address_reply_destroy) ResolveAddressReply p = {}; + JsonVariant *rparams, *entry; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(addr); + assert(result); + assert(buffer); + assert(errnop); + assert(h_errnop); + + if (!IN_SET(af, AF_INET, AF_INET6)) { + UNPROTECT_ERRNO; + *errnop = EAFNOSUPPORT; + *h_errnop = NO_DATA; + return NSS_STATUS_UNAVAIL; + } + + if (len != FAMILY_ADDRESS_SIZE(af)) { + r = -EINVAL; + goto fail; + } + + r = connect_to_resolved(&link); + if (r < 0) + goto fail; + + r = json_build(&cparams, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("address", JSON_BUILD_BYTE_ARRAY(addr, len)), + JSON_BUILD_PAIR("family", JSON_BUILD_INTEGER(af)), + JSON_BUILD_PAIR("flags", JSON_BUILD_UNSIGNED(query_flags())))); + if (r < 0) + goto fail; + + const char* error_id; + r = varlink_call(link, "io.systemd.Resolve.ResolveAddress", cparams, &rparams, &error_id, NULL); + if (r < 0) + goto fail; + if (!isempty(error_id)) { + if (error_shall_try_again(error_id)) + goto try_again; + if (error_shall_fallback(error_id)) + goto fail; + goto not_found; + } + + r = json_dispatch(rparams, resolve_address_reply_dispatch_table, json_dispatch_flags, &p); + if (r < 0) + goto fail; + if (json_variant_is_blank_object(p.names)) + goto not_found; + + size_t ms = 0, idx; + + JSON_VARIANT_ARRAY_FOREACH(entry, p.names) { + _cleanup_(name_parameters_destroy) NameParameters q = {}; + + r = json_dispatch(entry, name_parameters_dispatch_table, json_dispatch_flags, &q); + if (r < 0) + goto fail; + + ms += ALIGN(strlen(q.name) + 1); + } + + size_t n_names = 
json_variant_elements(p.names); + ms += ALIGN(len) + /* the address */ + 2 * sizeof(char*) + /* pointer to the address, plus trailing NULL */ + n_names * sizeof(char*); /* pointers to aliases, plus trailing NULL */ + + if (buflen < ms) { + UNPROTECT_ERRNO; + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + + /* First, place address */ + char *r_addr = buffer; + memcpy(r_addr, addr, len); + idx = ALIGN(len); + + /* Second, place address list */ + char *r_addr_list = buffer + idx; + ((char**) r_addr_list)[0] = r_addr; + ((char**) r_addr_list)[1] = NULL; + idx += sizeof(char*) * 2; + + /* Third, reserve space for the aliases array, plus trailing NULL */ + char *r_aliases = buffer + idx; + idx += sizeof(char*) * n_names; + + /* Fourth, place aliases */ + char *r_name = buffer + idx; + + size_t i = 0; + JSON_VARIANT_ARRAY_FOREACH(entry, p.names) { + _cleanup_(name_parameters_destroy) NameParameters q = {}; + + r = json_dispatch(entry, name_parameters_dispatch_table, json_dispatch_flags, &q); + if (r < 0) + goto fail; + + size_t l = strlen(q.name); + char *z = buffer + idx; + memcpy(z, q.name, l + 1); + + if (i > 0) + ((char**) r_aliases)[i - 1] = z; + i++; + + idx += ALIGN(l + 1); + } + ((char**) r_aliases)[n_names - 1] = NULL; + + assert(idx == ms); + + result->h_name = r_name; + result->h_aliases = (char**) r_aliases; + result->h_addrtype = af; + result->h_length = len; + result->h_addr_list = (char**) r_addr_list; + + if (ttlp) + *ttlp = 0; + + /* Explicitly reset both *h_errnop and h_errno to work around + * https://bugzilla.redhat.com/show_bug.cgi?id=1125975 */ + *h_errnop = NETDB_SUCCESS; + h_errno = 0; + + return NSS_STATUS_SUCCESS; + +fail: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; + +not_found: + *h_errnop = HOST_NOT_FOUND; + return NSS_STATUS_NOTFOUND; + +try_again: + UNPROTECT_ERRNO; + *errnop = -r; + *h_errnop = TRY_AGAIN; + return NSS_STATUS_TRYAGAIN; +} + 
+NSS_GETHOSTBYNAME_FALLBACKS(resolve); +NSS_GETHOSTBYADDR_FALLBACKS(resolve); diff --git a/src/nss-resolve/nss-resolve.sym b/src/nss-resolve/nss-resolve.sym new file mode 100644 index 0000000..ecc958e --- /dev/null +++ b/src/nss-resolve/nss-resolve.sym @@ -0,0 +1,19 @@ +/*** + SPDX-License-Identifier: LGPL-2.1-or-later + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +***/ + +{ +global: + _nss_resolve_gethostbyname_r; + _nss_resolve_gethostbyname2_r; + _nss_resolve_gethostbyname3_r; + _nss_resolve_gethostbyname4_r; + _nss_resolve_gethostbyaddr_r; + _nss_resolve_gethostbyaddr2_r; +local: *; +}; diff --git a/src/nss-systemd/meson.build b/src/nss-systemd/meson.build new file mode 100644 index 0000000..ec4f780 --- /dev/null +++ b/src/nss-systemd/meson.build @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +modules += [ + nss_template + { + 'name' : 'nss_systemd', + 'conditions' : ['ENABLE_NSS_SYSTEMD'], + 'sources' : files( + 'nss-systemd.c', + 'userdb-glue.c', + ), + 'version-script' : meson.current_source_dir() / 'nss-systemd.sym', + }, +] diff --git a/src/nss-systemd/nss-systemd.c b/src/nss-systemd/nss-systemd.c new file mode 100644 index 0000000..1d6e253 --- /dev/null +++ b/src/nss-systemd/nss-systemd.c @@ -0,0 +1,1084 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "missing_threads.h" +#include "nss-systemd.h" +#include "nss-util.h" +#include "pthread-util.h" +#include "signal-util.h" +#include "strv.h" +#include "user-record-nss.h" +#include "user-util.h" +#include "userdb-glue.h" +#include "userdb.h" + +static const struct passwd root_passwd = { + .pw_name = (char*) "root", + .pw_passwd = 
(char*) PASSWORD_SEE_SHADOW, + .pw_uid = 0, + .pw_gid = 0, + .pw_gecos = (char*) "Super User", + .pw_dir = (char*) "/root", + .pw_shell = NULL, +}; + +static const struct spwd root_spwd = { + .sp_namp = (char*) "root", + .sp_pwdp = (char*) PASSWORD_LOCKED_AND_INVALID, + .sp_lstchg = -1, + .sp_min = -1, + .sp_max = -1, + .sp_warn = -1, + .sp_inact = -1, + .sp_expire = -1, + .sp_flag = ULONG_MAX, /* this appears to be what everybody does ... */ +}; + +static const struct passwd nobody_passwd = { + .pw_name = (char*) NOBODY_USER_NAME, + .pw_passwd = (char*) PASSWORD_LOCKED_AND_INVALID, + .pw_uid = UID_NOBODY, + .pw_gid = GID_NOBODY, + .pw_gecos = (char*) "Kernel Overflow User", + .pw_dir = (char*) "/", + .pw_shell = (char*) NOLOGIN, +}; + +static const struct spwd nobody_spwd = { + .sp_namp = (char*) NOBODY_USER_NAME, + .sp_pwdp = (char*) PASSWORD_LOCKED_AND_INVALID, + .sp_lstchg = -1, + .sp_min = -1, + .sp_max = -1, + .sp_warn = -1, + .sp_inact = -1, + .sp_expire = -1, + .sp_flag = ULONG_MAX, /* this appears to be what everybody does ... 
*/ +}; + +static const struct group root_group = { + .gr_name = (char*) "root", + .gr_gid = 0, + .gr_passwd = (char*) PASSWORD_SEE_SHADOW, + .gr_mem = (char*[]) { NULL }, +}; + +static const struct sgrp root_sgrp = { + .sg_namp = (char*) "root", + .sg_passwd = (char*) PASSWORD_LOCKED_AND_INVALID, +}; + +static const struct group nobody_group = { + .gr_name = (char*) NOBODY_GROUP_NAME, + .gr_gid = GID_NOBODY, + .gr_passwd = (char*) PASSWORD_LOCKED_AND_INVALID, + .gr_mem = (char*[]) { NULL }, +}; + +static const struct sgrp nobody_sgrp = { + .sg_namp = (char*) NOBODY_GROUP_NAME, + .sg_passwd = (char*) PASSWORD_LOCKED_AND_INVALID, +}; + +typedef struct GetentData { + /* As explained in NOTES section of getpwent_r(3) as 'getpwent_r() is not really reentrant since it + * shares the reading position in the stream with all other threads', we need to protect the data in + * UserDBIterator from multithreaded programs which may call setpwent(), getpwent_r(), or endpwent() + * simultaneously. So, each function locks the data by using the mutex below. */ + pthread_mutex_t mutex; + UserDBIterator *iterator; + + /* Applies to group iterations only: true while we iterate over groups defined through NSS, false + * otherwise. 
*/ + bool by_membership; +} GetentData; + +static GetentData getpwent_data = { + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +static GetentData getgrent_data = { + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +static GetentData getspent_data = { + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +static GetentData getsgent_data = { + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +static void setup_logging_once(void) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + assert_se(pthread_once(&once, log_parse_environment_variables) == 0); +} + +#define NSS_ENTRYPOINT_BEGIN \ + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); \ + setup_logging_once() + +NSS_GETPW_PROTOTYPES(systemd); +NSS_GETSP_PROTOTYPES(systemd); +NSS_GETGR_PROTOTYPES(systemd); +NSS_GETSG_PROTOTYPES(systemd); +NSS_PWENT_PROTOTYPES(systemd); +NSS_SPENT_PROTOTYPES(systemd); +NSS_GRENT_PROTOTYPES(systemd); +NSS_SGENT_PROTOTYPES(systemd); +NSS_INITGROUPS_PROTOTYPE(systemd); + +/* Since our NSS functions implement reentrant glibc APIs, we have to guarantee + * all the string pointers we return point into the buffer provided by the + * caller, not into our own static memory. 
/* Copies the synthesized passwd record 'src' into 'dest', placing every string in the caller's
 * 'buffer' so that nothing in the result points at our static data. If 'src' carries no shell,
 * 'fallback_shell' is used instead (one of the two must be non-NULL).
 *
 * Returns NSS_STATUS_SUCCESS, or NSS_STATUS_TRYAGAIN with *errnop = ERANGE if 'buflen' is too small
 * to hold all five strings including their NUL terminators. */
static enum nss_status copy_synthesized_passwd(
                struct passwd *dest,
                const struct passwd *src,
                const char *fallback_shell,
                char *buffer, size_t buflen,
                int *errnop) {

        assert(dest);
        assert(src);
        assert(src->pw_name);
        assert(src->pw_passwd);
        assert(src->pw_gecos);
        assert(src->pw_dir);

        const char *shell = ASSERT_PTR(src->pw_shell ?: fallback_shell);

        /* Sizes include the trailing NUL of each string. */
        const size_t name_sz = strlen(src->pw_name) + 1;
        const size_t passwd_sz = strlen(src->pw_passwd) + 1;
        const size_t gecos_sz = strlen(src->pw_gecos) + 1;
        const size_t dir_sz = strlen(src->pw_dir) + 1;
        const size_t shell_sz = strlen(shell) + 1;

        if (name_sz + passwd_sz + gecos_sz + dir_sz + shell_sz > buflen) {
                *errnop = ERANGE;
                return NSS_STATUS_TRYAGAIN;
        }

        assert(buffer);

        /* Take over the non-string fields, then redirect each string into the buffer, back to back. */
        *dest = *src;

        char *cursor = buffer;

        dest->pw_name = memcpy(cursor, src->pw_name, name_sz);
        cursor += name_sz;

        dest->pw_passwd = memcpy(cursor, src->pw_passwd, passwd_sz);
        cursor += passwd_sz;

        dest->pw_gecos = memcpy(cursor, src->pw_gecos, gecos_sz);
        cursor += gecos_sz;

        dest->pw_dir = memcpy(cursor, src->pw_dir, dir_sz);
        cursor += dir_sz;

        dest->pw_shell = memcpy(cursor, shell, shell_sz);

        return NSS_STATUS_SUCCESS;
}
/* Copies the synthesized shadow group record 'src' into 'dest', placing the name and password
 * strings in the caller's 'buffer'. All other fields are taken over from 'src' unchanged.
 *
 * Returns NSS_STATUS_SUCCESS, or NSS_STATUS_TRYAGAIN with *errnop = ERANGE if 'buflen' cannot hold
 * both strings including their NUL terminators. */
static enum nss_status copy_synthesized_sgrp(
                struct sgrp *dest,
                const struct sgrp *src,
                char *buffer, size_t buflen,
                int *errnop) {

        assert(dest);
        assert(src);
        assert(src->sg_namp);
        assert(src->sg_passwd);

        /* Sizes include the trailing NUL of each string. */
        const size_t name_sz = strlen(src->sg_namp) + 1;
        const size_t passwd_sz = strlen(src->sg_passwd) + 1;

        if (name_sz + passwd_sz > buflen) {
                *errnop = ERANGE;
                return NSS_STATUS_TRYAGAIN;
        }

        assert(buffer);

        *dest = *src;

        /* Name first, password directly behind it. */
        dest->sg_namp = memcpy(buffer, src->sg_namp, name_sz);
        dest->sg_passwd = memcpy(buffer + name_sz, src->sg_passwd, passwd_sz);

        return NSS_STATUS_SUCCESS;
}
*/ + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return NSS_STATUS_NOTFOUND; + + /* Synthesize entries for the root and nobody users, in case they are missing in /etc/passwd */ + if (getenv_bool_secure("SYSTEMD_NSS_BYPASS_SYNTHETIC") <= 0) { + + if (streq(name, root_passwd.pw_name)) + return copy_synthesized_passwd(pwd, &root_passwd, + default_root_shell(NULL), + buffer, buflen, errnop); + + if (streq(name, nobody_passwd.pw_name)) { + if (!synthesize_nobody()) + return NSS_STATUS_NOTFOUND; + + return copy_synthesized_passwd(pwd, &nobody_passwd, + NULL, + buffer, buflen, errnop); + } + + } else if (STR_IN_SET(name, root_passwd.pw_name, nobody_passwd.pw_name)) + return NSS_STATUS_NOTFOUND; + + status = userdb_getpwnam(name, pwd, buffer, buflen, &e); + if (IN_SET(status, NSS_STATUS_UNAVAIL, NSS_STATUS_TRYAGAIN)) { + UNPROTECT_ERRNO; + *errnop = e; + return status; + } + + return status; +} + +enum nss_status _nss_systemd_getpwuid_r( + uid_t uid, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop) { + + enum nss_status status; + int e; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(pwd); + assert(errnop); + + if (!uid_is_valid(uid)) + return NSS_STATUS_NOTFOUND; + + /* Synthesize data for the root user and for nobody in case they are missing from /etc/passwd */ + if (getenv_bool_secure("SYSTEMD_NSS_BYPASS_SYNTHETIC") <= 0) { + + if (uid == root_passwd.pw_uid) + return copy_synthesized_passwd(pwd, &root_passwd, + default_root_shell(NULL), + buffer, buflen, errnop); + + if (uid == nobody_passwd.pw_uid) { + if (!synthesize_nobody()) + return NSS_STATUS_NOTFOUND; + + return copy_synthesized_passwd(pwd, &nobody_passwd, + NULL, + buffer, buflen, errnop); + } + + } else if (uid == root_passwd.pw_uid || uid == nobody_passwd.pw_uid) + return NSS_STATUS_NOTFOUND; + + status = userdb_getpwuid(uid, pwd, buffer, buflen, &e); + if (IN_SET(status, NSS_STATUS_UNAVAIL, NSS_STATUS_TRYAGAIN)) { + UNPROTECT_ERRNO; + *errnop = e; + return status; + } + + 
return status; +} + +enum nss_status _nss_systemd_getspnam_r( + const char *name, + struct spwd *spwd, + char *buffer, size_t buflen, + int *errnop) { + + enum nss_status status; + int e; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(spwd); + assert(errnop); + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return NSS_STATUS_NOTFOUND; + + /* Synthesize entries for the root and nobody users, in case they are missing in /etc/passwd */ + if (getenv_bool_secure("SYSTEMD_NSS_BYPASS_SYNTHETIC") <= 0) { + + if (streq(name, root_spwd.sp_namp)) + return copy_synthesized_spwd(spwd, &root_spwd, buffer, buflen, errnop); + + if (streq(name, nobody_spwd.sp_namp)) { + if (!synthesize_nobody()) + return NSS_STATUS_NOTFOUND; + + return copy_synthesized_spwd(spwd, &nobody_spwd, buffer, buflen, errnop); + } + + } else if (STR_IN_SET(name, root_spwd.sp_namp, nobody_spwd.sp_namp)) + return NSS_STATUS_NOTFOUND; + + status = userdb_getspnam(name, spwd, buffer, buflen, &e); + if (IN_SET(status, NSS_STATUS_UNAVAIL, NSS_STATUS_TRYAGAIN)) { + UNPROTECT_ERRNO; + *errnop = e; + return status; + } + + return status; +} + +#pragma GCC diagnostic ignored "-Wsizeof-pointer-memaccess" + +enum nss_status _nss_systemd_getgrnam_r( + const char *name, + struct group *gr, + char *buffer, size_t buflen, + int *errnop) { + + enum nss_status status; + int e; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(gr); + assert(errnop); + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return NSS_STATUS_NOTFOUND; + + /* Synthesize records for root and nobody, in case they are missing from /etc/group */ + if (getenv_bool_secure("SYSTEMD_NSS_BYPASS_SYNTHETIC") <= 0) { + + if (streq(name, root_group.gr_name)) + return copy_synthesized_group(gr, &root_group, buffer, buflen, errnop); + + if (streq(name, nobody_group.gr_name)) { + if (!synthesize_nobody()) + return NSS_STATUS_NOTFOUND; + + return copy_synthesized_group(gr, &nobody_group, buffer, buflen, 
errnop); + } + + } else if (STR_IN_SET(name, root_group.gr_name, nobody_group.gr_name)) + return NSS_STATUS_NOTFOUND; + + status = userdb_getgrnam(name, gr, buffer, buflen, &e); + if (IN_SET(status, NSS_STATUS_UNAVAIL, NSS_STATUS_TRYAGAIN)) { + UNPROTECT_ERRNO; + *errnop = e; + return status; + } + + return status; +} + +enum nss_status _nss_systemd_getgrgid_r( + gid_t gid, + struct group *gr, + char *buffer, size_t buflen, + int *errnop) { + + enum nss_status status; + int e; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(gr); + assert(errnop); + + if (!gid_is_valid(gid)) + return NSS_STATUS_NOTFOUND; + + /* Synthesize records for root and nobody, in case they are missing from /etc/group */ + if (getenv_bool_secure("SYSTEMD_NSS_BYPASS_SYNTHETIC") <= 0) { + + if (gid == root_group.gr_gid) + return copy_synthesized_group(gr, &root_group, buffer, buflen, errnop); + + if (gid == nobody_group.gr_gid) { + if (!synthesize_nobody()) + return NSS_STATUS_NOTFOUND; + + return copy_synthesized_group(gr, &nobody_group, buffer, buflen, errnop); + } + + } else if (gid == root_group.gr_gid || gid == nobody_group.gr_gid) + return NSS_STATUS_NOTFOUND; + + status = userdb_getgrgid(gid, gr, buffer, buflen, &e); + if (IN_SET(status, NSS_STATUS_UNAVAIL, NSS_STATUS_TRYAGAIN)) { + UNPROTECT_ERRNO; + *errnop = e; + return status; + } + + return status; +} + +enum nss_status _nss_systemd_getsgnam_r( + const char *name, + struct sgrp *sgrp, + char *buffer, size_t buflen, + int *errnop) { + + enum nss_status status; + int e; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(name); + assert(sgrp); + assert(errnop); + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return NSS_STATUS_NOTFOUND; + + /* Synthesize records for root and nobody, in case they are missing from /etc/group */ + if (getenv_bool_secure("SYSTEMD_NSS_BYPASS_SYNTHETIC") <= 0) { + + if (streq(name, root_sgrp.sg_namp)) + return copy_synthesized_sgrp(sgrp, &root_sgrp, buffer, buflen, errnop); + + if 
(streq(name, nobody_sgrp.sg_namp)) { + if (!synthesize_nobody()) + return NSS_STATUS_NOTFOUND; + + return copy_synthesized_sgrp(sgrp, &nobody_sgrp, buffer, buflen, errnop); + } + + } else if (STR_IN_SET(name, root_sgrp.sg_namp, nobody_sgrp.sg_namp)) + return NSS_STATUS_NOTFOUND; + + status = userdb_getsgnam(name, sgrp, buffer, buflen, &e); + if (IN_SET(status, NSS_STATUS_UNAVAIL, NSS_STATUS_TRYAGAIN)) { + UNPROTECT_ERRNO; + *errnop = e; + return status; + } + + return status; +} + +static enum nss_status nss_systemd_endent(GetentData *p) { + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(p); + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&p->mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + p->iterator = userdb_iterator_free(p->iterator); + p->by_membership = false; + + return NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_endpwent(void) { + return nss_systemd_endent(&getpwent_data); +} + +enum nss_status _nss_systemd_endspent(void) { + return nss_systemd_endent(&getspent_data); +} + +enum nss_status _nss_systemd_endgrent(void) { + return nss_systemd_endent(&getgrent_data); +} + +enum nss_status _nss_systemd_endsgent(void) { + return nss_systemd_endent(&getsgent_data); +} + +enum nss_status _nss_systemd_setpwent(int stayopen) { + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getpwent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + getpwent_data.iterator = userdb_iterator_free(getpwent_data.iterator); + getpwent_data.by_membership = false; + + /* Don't synthesize root/nobody when iterating. Let nss-files take care of that. 
If the two records + * are missing there, then that's fine, after all getpwent() is known to be possibly incomplete + * (think: LDAP/NIS type situations), and our synthesizing of root/nobody is a robustness fallback + * only, which matters for getpwnam()/getpwuid() primarily, which are the main NSS entrypoints to the + * user database. */ + r = userdb_all(nss_glue_userdb_flags() | USERDB_DONT_SYNTHESIZE, &getpwent_data.iterator); + return r < 0 ? NSS_STATUS_UNAVAIL : NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_setgrent(int stayopen) { + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getgrent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + getgrent_data.iterator = userdb_iterator_free(getgrent_data.iterator); + getgrent_data.by_membership = false; + + /* See _nss_systemd_setpwent() for an explanation why we use USERDB_DONT_SYNTHESIZE here */ + r = groupdb_all(nss_glue_userdb_flags() | USERDB_DONT_SYNTHESIZE, &getgrent_data.iterator); + return r < 0 ? NSS_STATUS_UNAVAIL : NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_setspent(int stayopen) { + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getspent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + getspent_data.iterator = userdb_iterator_free(getspent_data.iterator); + getspent_data.by_membership = false; + + /* See _nss_systemd_setpwent() for an explanation why we use USERDB_DONT_SYNTHESIZE here */ + r = userdb_all(nss_glue_userdb_flags() | USERDB_DONT_SYNTHESIZE, &getspent_data.iterator); + return r < 0 ? 
NSS_STATUS_UNAVAIL : NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_setsgent(int stayopen) { + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getsgent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + getsgent_data.iterator = userdb_iterator_free(getsgent_data.iterator); + getsgent_data.by_membership = false; + + /* See _nss_systemd_setpwent() for an explanation why we use USERDB_DONT_SYNTHESIZE here */ + r = groupdb_all(nss_glue_userdb_flags() | USERDB_DONT_SYNTHESIZE, &getsgent_data.iterator); + return r < 0 ? NSS_STATUS_UNAVAIL : NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_getpwent_r( + struct passwd *result, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(result); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getpwent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. 
*/ + + if (!getpwent_data.iterator) { + UNPROTECT_ERRNO; + *errnop = EHOSTDOWN; + return NSS_STATUS_UNAVAIL; + } + + r = userdb_iterator_get(getpwent_data.iterator, &ur); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + r = nss_pack_user_record(ur, result, buffer, buflen); + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_getgrent_r( + struct group *result, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(group_record_unrefp) GroupRecord *gr = NULL; + _cleanup_free_ char **members = NULL; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(result); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getgrent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + if (!getgrent_data.iterator) { + UNPROTECT_ERRNO; + *errnop = EHOSTDOWN; + return NSS_STATUS_UNAVAIL; + } + + if (!getgrent_data.by_membership) { + r = groupdb_iterator_get(getgrent_data.iterator, &gr); + if (r == -ESRCH) { + /* So we finished iterating native groups now. Let's now continue with iterating + * native memberships, and generate additional group entries for any groups + * referenced there that are defined in NSS only. This means for those groups there + * will be two or more entries generated during iteration, but this is apparently how + * this is supposed to work, and what other implementations do too. Clients are + * supposed to merge the group records found during iteration automatically. 
*/ + getgrent_data.iterator = userdb_iterator_free(getgrent_data.iterator); + + r = membershipdb_all(nss_glue_userdb_flags(), &getgrent_data.iterator); + if (r < 0 && r != -ESRCH) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + getgrent_data.by_membership = true; + } else if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } else if (!STR_IN_SET(gr->group_name, root_group.gr_name, nobody_group.gr_name)) { + r = membershipdb_by_group_strv(gr->group_name, nss_glue_userdb_flags(), &members); + if (r < 0 && r != -ESRCH) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + } + } + + if (getgrent_data.by_membership) { + _cleanup_(_nss_systemd_unblockp) bool blocked = false; + + if (!getgrent_data.iterator) + return NSS_STATUS_NOTFOUND; + + for (;;) { + _cleanup_free_ char *user_name = NULL, *group_name = NULL; + + r = membershipdb_iterator_get(getgrent_data.iterator, &user_name, &group_name); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (STR_IN_SET(user_name, root_passwd.pw_name, nobody_passwd.pw_name)) + continue; + if (STR_IN_SET(group_name, root_group.gr_name, nobody_group.gr_name)) + continue; + + /* We are about to recursively call into NSS, let's make sure we disable recursion into our own code. */ + if (!blocked) { + r = _nss_systemd_block(true); + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + blocked = true; + } + + r = nss_group_record_by_name(group_name, false, &gr); + if (r == -ESRCH) + continue; + if (r < 0) { + log_debug_errno(r, "Failed to do NSS check for group '%s', ignoring: %m", group_name); + continue; + } + + members = strv_new(user_name); + if (!members) { + UNPROTECT_ERRNO; + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + /* Note that we currently generate one group entry per user that is part of a + * group. 
It's a bit ugly, but equivalent to generating a single entry with a set of + * members in them. */ + break; + } + } + + r = nss_pack_group_record(gr, members, result, buffer, buflen); + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_getspent_r( + struct spwd *result, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(result); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getspent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. */ + + if (!getspent_data.iterator) { + UNPROTECT_ERRNO; + *errnop = EHOSTDOWN; + return NSS_STATUS_UNAVAIL; + } + + for (;;) { + r = userdb_iterator_get(getspent_data.iterator, &ur); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (!ur->incomplete) /* don't synthesize shadow records for records where we couldn't read shadow data */ + break; + + ur = user_record_unref(ur); + } + + r = nss_pack_user_record_shadow(ur, result, buffer, buflen); + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_getsgent_r( + struct sgrp *result, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(group_record_unrefp) GroupRecord *gr = NULL; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(result); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + _cleanup_(pthread_mutex_unlock_assertp) pthread_mutex_t *_l = pthread_mutex_lock_assert(&getsgent_data.mutex); + (void) _l; /* make llvm shut up about _l not being used. 
*/ + + if (!getsgent_data.iterator) { + UNPROTECT_ERRNO; + *errnop = EHOSTDOWN; + return NSS_STATUS_UNAVAIL; + } + + for (;;) { + r = groupdb_iterator_get(getsgent_data.iterator, &gr); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (!gr->incomplete) /* don't synthesize shadow records for records where we couldn't read shadow data */ + break; + + gr = group_record_unref(gr); + } + + r = nss_pack_group_record_shadow(gr, result, buffer, buflen); + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +enum nss_status _nss_systemd_initgroups_dyn( + const char *user_name, + gid_t gid, + long *start, + long *size, + gid_t **groupsp, + long int limit, + int *errnop) { + + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + bool any = false; + int r; + + PROTECT_ERRNO; + NSS_ENTRYPOINT_BEGIN; + + assert(user_name); + assert(start); + assert(size); + assert(groupsp); + assert(errnop); + + if (!valid_user_group_name(user_name, VALID_USER_RELAX)) + return NSS_STATUS_NOTFOUND; + + /* Don't allow extending these two special users, the same as we won't resolve them via getpwnam() */ + if (STR_IN_SET(user_name, root_passwd.pw_name, nobody_passwd.pw_name)) + return NSS_STATUS_NOTFOUND; + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = membershipdb_by_user(user_name, nss_glue_userdb_flags(), &iterator); + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + for (;;) { + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + _cleanup_free_ char *group_name = NULL; + + r = membershipdb_iterator_get(iterator, NULL, &group_name); + if (r == -ESRCH) + break; + if (r < 0) { + UNPROTECT_ERRNO; + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + /* The group might be defined via traditional NSS only, hence let's do a full look-up without + * disabling NSS. 
This means we are operating recursively here. */ + + r = groupdb_by_name(group_name, (nss_glue_userdb_flags() & ~USERDB_EXCLUDE_NSS) | USERDB_SUPPRESS_SHADOW, &g); + if (r == -ESRCH) + continue; + if (r < 0) { + log_debug_errno(r, "Failed to resolve group '%s', ignoring: %m", group_name); + continue; + } + + if (g->gid == gid) + continue; + + if (*start >= *size) { + gid_t *new_groups; + long new_size; + + if (limit > 0 && *size >= limit) /* Reached the limit.? */ + break; + + if (*size > LONG_MAX/2) { /* Check for overflow */ + UNPROTECT_ERRNO; + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + new_size = *start * 2; + if (limit > 0 && new_size > limit) + new_size = limit; + + /* Enlarge buffer */ + new_groups = reallocarray(*groupsp, new_size, sizeof(**groupsp)); + if (!new_groups) { + UNPROTECT_ERRNO; + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + *groupsp = new_groups; + *size = new_size; + } + + (*groupsp)[(*start)++] = g->gid; + any = true; + } + + return any ? NSS_STATUS_SUCCESS : NSS_STATUS_NOTFOUND; +} + +static thread_local unsigned _blocked = 0; + +_public_ int _nss_systemd_block(bool b) { + + /* This blocks recursively: it's blocked for as many times this function is called with `true` until + * it is called an equal time with `false`. */ + + if (b) { + if (_blocked >= UINT_MAX) + return -EOVERFLOW; + + _blocked++; + } else { + if (_blocked <= 0) + return -EOVERFLOW; + + _blocked--; + } + + return b; /* Return what is passed in, i.e. 
/* For use with the _cleanup_() macro: if *b is true, takes back one _nss_systemd_block(true) call made
 * earlier in the scope; does nothing if *b is false. */
static inline void _nss_systemd_unblockp(bool *b) {
        if (!*b)
                return;

        assert_se(_nss_systemd_block(false) >= 0);
}
*/ + _nss_systemd_block; + _nss_systemd_is_blocked; +local: *; +}; diff --git a/src/nss-systemd/userdb-glue.c b/src/nss-systemd/userdb-glue.c new file mode 100644 index 0000000..c69667d --- /dev/null +++ b/src/nss-systemd/userdb-glue.c @@ -0,0 +1,478 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "env-util.h" +#include "fd-util.h" +#include "nss-systemd.h" +#include "strv.h" +#include "user-record-nss.h" +#include "user-record.h" +#include "user-util.h" +#include "userdb-glue.h" +#include "userdb.h" + +UserDBFlags nss_glue_userdb_flags(void) { + UserDBFlags flags = USERDB_EXCLUDE_NSS; + + /* Make sure that we don't go in circles when allocating a dynamic UID by checking our own database */ + if (getenv_bool_secure("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) + flags |= USERDB_EXCLUDE_DYNAMIC_USER; + + return flags; +} + +int nss_pack_user_record( + UserRecord *hr, + struct passwd *pwd, + char *buffer, + size_t buflen) { + + const char *rn, *hd, *shell; + size_t required; + + assert(hr); + assert(pwd); + + assert(hr->user_name); + required = strlen(hr->user_name) + 1; + + required += 2; /* strlen(PASSWORD_SEE_SHADOW) + 1 */ + + assert_se(rn = user_record_real_name(hr)); + required += strlen(rn) + 1; + + assert_se(hd = user_record_home_directory(hr)); + required += strlen(hd) + 1; + + assert_se(shell = user_record_shell(hr)); + required += strlen(shell) + 1; + + if (buflen < required) + return -ERANGE; + + *pwd = (struct passwd) { + .pw_name = buffer, + .pw_uid = hr->uid, + .pw_gid = user_record_gid(hr), + }; + + assert(buffer); + + pwd->pw_passwd = stpcpy(pwd->pw_name, hr->user_name) + 1; + pwd->pw_gecos = stpcpy(pwd->pw_passwd, PASSWORD_SEE_SHADOW) + 1; + pwd->pw_dir = stpcpy(pwd->pw_gecos, rn) + 1; + pwd->pw_shell = stpcpy(pwd->pw_dir, hd) + 1; + strcpy(pwd->pw_shell, shell); + + return 0; +} + +enum nss_status userdb_getpwnam( + const char *name, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(user_record_unrefp) 
UserRecord *hr = NULL; + int r; + + assert(pwd); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = userdb_by_name(name, nss_glue_userdb_flags()|USERDB_SUPPRESS_SHADOW, &hr); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + r = nss_pack_user_record(hr, pwd, buffer, buflen); + if (r < 0) { + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +enum nss_status userdb_getpwuid( + uid_t uid, + struct passwd *pwd, + char *buffer, + size_t buflen, + int *errnop) { + + _cleanup_(user_record_unrefp) UserRecord *hr = NULL; + int r; + + assert(pwd); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = userdb_by_uid(uid, nss_glue_userdb_flags()|USERDB_SUPPRESS_SHADOW, &hr); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + r = nss_pack_user_record(hr, pwd, buffer, buflen); + if (r < 0) { + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +int nss_pack_user_record_shadow( + UserRecord *hr, + struct spwd *spwd, + char *buffer, + size_t buflen) { + + const char *hashed; + size_t required; + + assert(hr); + assert(spwd); + + assert(hr->user_name); + required = strlen(hr->user_name) + 1; + + assert_se(hashed = strv_isempty(hr->hashed_password) ? PASSWORD_LOCKED_AND_INVALID : hr->hashed_password[0]); + required += strlen(hashed) + 1; + + if (buflen < required) + return -ERANGE; + + *spwd = (struct spwd) { + .sp_namp = buffer, + .sp_lstchg = hr->last_password_change_usec == 0 ? 1 : /* map 0 to 1, since 0 means please change password on next login */ + hr->last_password_change_usec == UINT64_MAX ? -1 : + (long int) (hr->last_password_change_usec / USEC_PER_DAY), + .sp_min = hr->password_change_min_usec != UINT64_MAX ? 
(long int) (hr->password_change_min_usec / USEC_PER_DAY) : -1, + .sp_max = hr->password_change_max_usec != UINT64_MAX ? (long int) (hr->password_change_max_usec / USEC_PER_DAY) : -1, + .sp_warn = hr->password_change_warn_usec != UINT64_MAX ? (long int) (hr->password_change_warn_usec / USEC_PER_DAY) : -1, + .sp_inact = hr->password_change_inactive_usec != UINT64_MAX ? (long int) (hr->password_change_inactive_usec / USEC_PER_DAY) : -1, + .sp_expire = hr->locked > 0 || hr->not_after_usec == 0 ? 1 : /* already expired/locked */ + hr->not_after_usec == UINT64_MAX ? -1 : + (long int) (hr->not_after_usec / USEC_PER_DAY), + .sp_flag = ULONG_MAX, + }; + + assert(buffer); + + spwd->sp_pwdp = stpcpy(spwd->sp_namp, hr->user_name) + 1; + strcpy(spwd->sp_pwdp, hashed); + + return 0; +} + +enum nss_status userdb_getspnam( + const char *name, + struct spwd *spwd, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(user_record_unrefp) UserRecord *hr = NULL; + int r; + + assert(spwd); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = userdb_by_name(name, nss_glue_userdb_flags(), &hr); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (hr->incomplete) /* protected records missing? 
*/ + return NSS_STATUS_NOTFOUND; + + r = nss_pack_user_record_shadow(hr, spwd, buffer, buflen); + if (r < 0) { + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +int nss_pack_group_record( + GroupRecord *g, + char **extra_members, + struct group *gr, + char *buffer, + size_t buflen) { + + char **array = NULL, *p; + size_t required, n = 0, i = 0; + + assert(g); + assert(gr); + + assert(g->group_name); + required = strlen(g->group_name) + 1; + + STRV_FOREACH(m, g->members) { + required += sizeof(char*); /* space for ptr array entry */ + required += strlen(*m) + 1; + n++; + } + STRV_FOREACH(m, extra_members) { + if (strv_contains(g->members, *m)) + continue; + + required += sizeof(char*); + required += strlen(*m) + 1; + n++; + } + + required += sizeof(char*); /* trailing NULL in ptr array entry */ + + if (buflen < required) + return -ERANGE; + + array = (char**) buffer; /* place ptr array at beginning of buffer, under assumption buffer is aligned */ + p = buffer + sizeof(void*) * (n + 1); /* place member strings right after the ptr array */ + + STRV_FOREACH(m, g->members) { + array[i++] = p; + p = stpcpy(p, *m) + 1; + } + STRV_FOREACH(m, extra_members) { + if (strv_contains(g->members, *m)) + continue; + + array[i++] = p; + p = stpcpy(p, *m) + 1; + } + + assert_se(i == n); + array[n] = NULL; + + *gr = (struct group) { + .gr_name = strcpy(p, g->group_name), + .gr_gid = g->gid, + .gr_passwd = (char*) PASSWORD_SEE_SHADOW, + .gr_mem = array, + }; + + return 0; +} + +enum nss_status userdb_getgrnam( + const char *name, + struct group *gr, + char *buffer, + size_t buflen, + int *errnop) { + + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + _cleanup_strv_free_ char **members = NULL; + int r; + + assert(gr); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = groupdb_by_name(name, nss_glue_userdb_flags()|USERDB_SUPPRESS_SHADOW, &g); + if (r < 0 && r != -ESRCH) { + *errnop = -r; + return 
NSS_STATUS_UNAVAIL; + } + + r = membershipdb_by_group_strv(name, nss_glue_userdb_flags()|USERDB_SUPPRESS_SHADOW, &members); + if (r < 0 && r != -ESRCH) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (!g) { + _unused_ _cleanup_(_nss_systemd_unblockp) bool blocked = false; + + if (strv_isempty(members)) + return NSS_STATUS_NOTFOUND; + + /* Grmbl, so we are supposed to extend a group entry, but the group entry itself is not + * accessible via non-NSS. Hence let's do what we have to do, and query NSS after all to + * acquire it, so that we can extend it (that's because glibc's group merging feature will + * merge groups only if both GID and name match and thus we need to have both first). It + * sucks behaving recursively likely this, but it's apparently what everybody does. We break + * the recursion for ourselves via the _nss_systemd_block_nss() lock. */ + + r = _nss_systemd_block(true); + if (r < 0) + return r; + + blocked = true; + + r = nss_group_record_by_name(name, false, &g); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + } + + r = nss_pack_group_record(g, members, gr, buffer, buflen); + if (r < 0) { + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +enum nss_status userdb_getgrgid( + gid_t gid, + struct group *gr, + char *buffer, + size_t buflen, + int *errnop) { + + + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + _cleanup_strv_free_ char **members = NULL; + bool from_nss; + int r; + + assert(gr); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = groupdb_by_gid(gid, nss_glue_userdb_flags()|USERDB_SUPPRESS_SHADOW, &g); + if (r < 0 && r != -ESRCH) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (!g) { + _unused_ _cleanup_(_nss_systemd_unblockp) bool blocked = false; + + /* So, quite possibly we have to extend an existing group record with additional members. 
But + * to do this we need to know the group name first. The group didn't exist via non-NSS + * queries though, hence let's try to acquire it here recursively via NSS. */ + + r = _nss_systemd_block(true); + if (r < 0) + return r; + + blocked = true; + + r = nss_group_record_by_gid(gid, false, &g); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + from_nss = true; + } else + from_nss = false; + + r = membershipdb_by_group_strv(g->group_name, nss_glue_userdb_flags()|USERDB_SUPPRESS_SHADOW, &members); + if (r < 0 && r != -ESRCH) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + /* If we acquired the record via NSS then there's no reason to respond unless we have to augment the + * list of members of the group */ + if (from_nss && strv_isempty(members)) + return NSS_STATUS_NOTFOUND; + + r = nss_pack_group_record(g, members, gr, buffer, buflen); + if (r < 0) { + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} + +int nss_pack_group_record_shadow( + GroupRecord *hr, + struct sgrp *sgrp, + char *buffer, + size_t buflen) { + + const char *hashed; + size_t required; + + assert(hr); + assert(sgrp); + + assert(hr->group_name); + required = strlen(hr->group_name) + 1; + + assert_se(hashed = strv_isempty(hr->hashed_password) ? 
PASSWORD_LOCKED_AND_INVALID : hr->hashed_password[0]); + required += strlen(hashed) + 1; + + if (buflen < required) + return -ERANGE; + + *sgrp = (struct sgrp) { + .sg_namp = buffer, + }; + + assert(buffer); + + sgrp->sg_passwd = stpcpy(sgrp->sg_namp, hr->group_name) + 1; + strcpy(sgrp->sg_passwd, hashed); + + return 0; +} + +enum nss_status userdb_getsgnam( + const char *name, + struct sgrp *sgrp, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(group_record_unrefp) GroupRecord *hr = NULL; + int r; + + assert(sgrp); + assert(errnop); + + if (_nss_systemd_is_blocked()) + return NSS_STATUS_NOTFOUND; + + r = groupdb_by_name(name, nss_glue_userdb_flags(), &hr); + if (r == -ESRCH) + return NSS_STATUS_NOTFOUND; + if (r < 0) { + *errnop = -r; + return NSS_STATUS_UNAVAIL; + } + + if (hr->incomplete) /* protected records missing? */ + return NSS_STATUS_NOTFOUND; + + r = nss_pack_group_record_shadow(hr, sgrp, buffer, buflen); + if (r < 0) { + *errnop = -r; + return NSS_STATUS_TRYAGAIN; + } + + return NSS_STATUS_SUCCESS; +} diff --git a/src/nss-systemd/userdb-glue.h b/src/nss-systemd/userdb-glue.h new file mode 100644 index 0000000..386cc88 --- /dev/null +++ b/src/nss-systemd/userdb-glue.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "userdb.h" + +UserDBFlags nss_glue_userdb_flags(void); + +int nss_pack_user_record(UserRecord *hr, struct passwd *pwd, char *buffer, size_t buflen); +int nss_pack_group_record(GroupRecord *g, char **extra_members, struct group *gr, char *buffer, size_t buflen); + +int nss_pack_user_record_shadow(UserRecord *hr, struct spwd *spwd, char *buffer, size_t buflen); +int nss_pack_group_record_shadow(GroupRecord *hr, struct sgrp *sgrp, char *buffer,size_t buflen); + +enum nss_status userdb_getpwnam(const char *name, struct passwd *pwd, char *buffer, size_t buflen, int *errnop); +enum nss_status userdb_getpwuid(uid_t uid, struct passwd *pwd, char 
*buffer, size_t buflen, int *errnop); + +enum nss_status userdb_getspnam(const char *name, struct spwd *spwd, char *buffer, size_t buflen, int *errnop); + +enum nss_status userdb_getgrnam(const char *name, struct group *gr, char *buffer, size_t buflen, int *errnop); +enum nss_status userdb_getgrgid(gid_t gid, struct group *gr, char *buffer, size_t buflen, int *errnop); + +enum nss_status userdb_getsgnam(const char *name, struct sgrp *sgrp, char *buffer, size_t buflen, int *errnop); diff --git a/src/oom/meson.build b/src/oom/meson.build new file mode 100644 index 0000000..690ed7a --- /dev/null +++ b/src/oom/meson.build @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_oomd_sources = files( + 'oomd-manager-bus.c', + 'oomd-manager.c', + 'oomd-util.c', + 'oomd.c', +) + +executables += [ + libexec_template + { + 'name' : 'systemd-oomd', + 'dbus' : true, + 'conditions' : ['ENABLE_OOMD'], + 'sources' : systemd_oomd_sources, + 'dependencies' : libatomic, + }, + executable_template + { + 'name' : 'oomctl', + 'public' : true, + 'conditions' : ['ENABLE_OOMD'], + 'sources' : files('oomctl.c'), + }, + test_template + { + 'sources' : files( + 'test-oomd-util.c', + 'oomd-util.c', + ), + 'dependencies' : libatomic, + }, +] + +if conf.get('ENABLE_OOMD') == 1 + install_data('org.freedesktop.oom1.conf', + install_dir : dbuspolicydir) + + install_data('org.freedesktop.oom1.service', + install_dir : dbussystemservicedir) + + if install_sysconfdir_samples + install_data('oomd.conf', + install_dir : pkgconfigfiledir) + endif +endif diff --git a/src/oom/oomctl.c b/src/oom/oomctl.c new file mode 100644 index 0000000..eb15f50 --- /dev/null +++ b/src/oom/oomctl.c @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "copy.h" +#include "main-func.h" +#include "pretty-print.h" +#include "terminal-util.h" +#include "verbs.h" + +static PagerFlags 
/* Verb handler for "dump": calls the DumpByFileDescriptor bus method on the system bus and copies
 * everything readable from the returned file descriptor to stdout.
 *
 * The argc/argv/userdata parameters are not used. Returns a negative errno-style value if connecting,
 * calling or parsing the reply fails, otherwise whatever copy_bytes() returns. */
static int dump_state(int argc, char *argv[], void *userdata) {
        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
        int fd = -EBADF;
        int r;

        r = sd_bus_open_system(&bus);
        if (r < 0)
                return log_error_errno(r, "Failed to connect system bus: %m");

        /* Start the pager (unless disabled via --no-pager) before any output is generated */
        pager_open(arg_pager_flags);

        r = bus_call_method(bus, bus_oom_mgr, "DumpByFileDescriptor", &error, &reply, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to dump context: %s", bus_error_message(&error, r));

        /* NOTE(review): fd is never closed in this function — presumably it remains owned by 'reply';
         * confirm against the sd_bus_message_read() documentation. */
        r = sd_bus_message_read(reply, "h", &fd);
        if (r < 0)
                return bus_log_parse_error(r);

        /* Flush stdio's buffer first, since the copy below writes to STDOUT_FILENO directly */
        fflush(stdout);
        return copy_bytes(fd, STDOUT_FILENO, UINT64_MAX, 0);
}
NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int run(int argc, char* argv[]) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "dump", VERB_ANY, 1, VERB_DEFAULT, dump_state }, + {} + }; + + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/oom/oomd-manager-bus.c b/src/oom/oomd-manager-bus.c new file mode 100644 index 0000000..0581d58 --- /dev/null +++ b/src/oom/oomd-manager-bus.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "bus-common-errors.h" +#include "bus-polkit.h" +#include "data-fd-util.h" +#include "fd-util.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "user-util.h" + +static int bus_method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *dump = NULL; + _cleanup_close_ int fd = -EBADF; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = manager_get_dump_string(m, &dump); + if (r < 0) + return r; + + fd = acquire_data_fd(dump, strlen(dump), 0); + if (fd < 0) + return fd; + + return sd_bus_reply_method_return(message, "h", fd); +} + +static const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD_WITH_NAMES("DumpByFileDescriptor", + NULL,, + "h", + SD_BUS_PARAM(fd), + bus_method_dump_by_fd, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_SIGNAL_WITH_NAMES("Killed", + "ss", + SD_BUS_PARAM(cgroup) + SD_BUS_PARAM(reason), + 0), + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + .vtables = BUS_VTABLES(manager_vtable), +}; diff --git 
a/src/oom/oomd-manager-bus.h b/src/oom/oomd-manager-bus.h new file mode 100644 index 0000000..7935b35 --- /dev/null +++ b/src/oom/oomd-manager-bus.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-object.h" + +typedef struct Manager Manager; + +extern const BusObjectImplementation manager_object; diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c new file mode 100644 index 0000000..6081254 --- /dev/null +++ b/src/oom/oomd-manager.c @@ -0,0 +1,851 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-daemon.h" + +#include "bus-log-control-api.h" +#include "bus-util.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "path-util.h" +#include "percent-util.h" +#include "varlink-io.systemd.oom.h" + +typedef struct ManagedOOMMessage { + ManagedOOMMode mode; + char *path; + char *property; + uint32_t limit; +} ManagedOOMMessage; + +static void managed_oom_message_destroy(ManagedOOMMessage *message) { + assert(message); + free(message->path); + free(message->property); +} + +static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + ManagedOOMMode *mode = userdata, m; + const char *s; + + assert(mode); + assert_se(s = json_variant_string(v)); + + m = managed_oom_mode_from_string(s); + if (m < 0) + return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s); + + *mode = m; + return 0; +} + +static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) { + JsonVariant *c, *cgroups; + int r; + + static const JsonDispatch dispatch_table[] = { + { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), 
JSON_MANDATORY }, + { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY }, + { "limit", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, + {}, + }; + + assert(m); + assert(parameters); + + cgroups = json_variant_by_key(parameters, "cgroups"); + if (!cgroups) + return -EINVAL; + + /* Skip malformed elements and keep processing in case the others are good */ + JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { + _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {}; + OomdCGroupContext *ctx; + Hashmap *monitor_hm; + loadavg_t limit; + + if (!json_variant_is_object(c)) + continue; + + r = json_dispatch(c, dispatch_table, 0, &message); + if (r == -ENOMEM) + return r; + if (r < 0) + continue; + + if (uid != 0) { + uid_t cg_uid; + + r = cg_path_get_owner_uid(message.path, &cg_uid); + if (r < 0) { + log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path); + continue; + } + + /* Let's not be lenient for permission errors and skip processing if we receive an + * update for a cgroup that doesn't belong to the user. */ + if (uid != cg_uid) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "cgroup path owner UID does not match sender uid " + "(" UID_FMT " != " UID_FMT ")", uid, cg_uid); + } + + monitor_hm = streq(message.property, "ManagedOOMSwap") ? 
+ m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; + + if (message.mode == MANAGED_OOM_AUTO) { + (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path))); + continue; + } + + limit = m->default_mem_pressure_limit; + + if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) { + int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit); + + r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit); + if (r < 0) + continue; + } + + r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); + if (r == -ENOMEM) + return r; + if (r < 0 && r != -EEXIST) + log_debug_errno(r, "Failed to insert message, ignoring: %m"); + + /* Always update the limit in case it was changed. For non-memory pressure detection the value is + * ignored so always updating it here is not a problem. */ + ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); + if (ctx) + ctx->mem_pressure_limit = limit; + } + + /* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */ + r = sd_event_source_set_enabled(m->swap_context_event_source, + hashmap_isempty(m->monitored_swap_cgroup_contexts) ? 
SD_EVENT_OFF : SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m"); + + return 0; +} + +static int process_managed_oom_request( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + Manager *m = ASSERT_PTR(userdata); + uid_t uid; + int r; + + r = varlink_get_peer_uid(link, &uid); + if (r < 0) + return log_error_errno(r, "Failed to get varlink peer uid: %m"); + + return process_managed_oom_message(m, uid, parameters); +} + +static int process_managed_oom_reply( + Varlink *link, + JsonVariant *parameters, + const char *error_id, + VarlinkReplyFlags flags, + void *userdata) { + Manager *m = ASSERT_PTR(userdata); + uid_t uid; + int r; + + if (error_id) { + r = -EIO; + log_debug("Error getting ManagedOOM cgroups: %s", error_id); + goto finish; + } + + r = varlink_get_peer_uid(link, &uid); + if (r < 0) { + log_error_errno(r, "Failed to get varlink peer uid: %m"); + goto finish; + } + + r = process_managed_oom_message(m, uid, parameters); + +finish: + if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + m->varlink_client = varlink_close_unref(link); + + return r; +} + +/* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible + * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1". + * + * This function ignores most errors in order to handle cgroups that may have been cleaned up while + * populating the hashmap. + * + * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */ +static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) { + _cleanup_free_ char *subpath = NULL; + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(new_h); + assert(path); + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) + return r; + + r = cg_read_subgroup(d, &subpath); + if (r < 0) + return r; + else if (r == 0) { /* No subgroups? 
We're a leaf node */ + r = oomd_insert_cgroup_context(NULL, new_h, path); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path); + return 0; + } + + do { + _cleanup_free_ char *cg_path = NULL; + bool oom_group; + + cg_path = path_join(empty_to_root(path), subpath); + if (!cg_path) + return -ENOMEM; + + subpath = mfree(subpath); + + r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group); + /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */ + if (r == -ENOMEM) + return r; + if (r < 0) { + log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path); + return 0; + } + + if (oom_group) + r = oomd_insert_cgroup_context(NULL, new_h, cg_path); + else + r = recursively_get_cgroup_context(new_h, cg_path); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path); + } while ((r = cg_read_subgroup(d, &subpath)) > 0); + + return 0; +} + +static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) { + _cleanup_hashmap_free_ Hashmap *new_base = NULL; + OomdCGroupContext *ctx; + int r; + + assert(monitored_cgroups); + + new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!new_base) + return -ENOMEM; + + HASHMAP_FOREACH(ctx, *monitored_cgroups) { + /* Skip most errors since the cgroup we're trying to update might not exist anymore. 
*/ + r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path); + if (r == -ENOMEM) + return r; + if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT)) + log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path); + } + + hashmap_free(*monitored_cgroups); + *monitored_cgroups = TAKE_PTR(new_base); + + return 0; +} + +static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + OomdCGroupContext *ctx; + int r; + + assert(monitored_cgroups); + assert(ret_candidates); + + candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!candidates) + return -ENOMEM; + + HASHMAP_FOREACH(ctx, monitored_cgroups) { + r = recursively_get_cgroup_context(candidates, ctx->path); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path); + } + + *ret_candidates = TAKE_PTR(candidates); + + return 0; +} + +static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) { + _cleanup_hashmap_free_ Hashmap *new_candidates = NULL; + int r; + + assert(monitored_cgroups); + assert(candidates); + assert(*candidates); + + r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates); + if (r < 0) + return log_debug_errno(r, "Failed to get candidate contexts: %m"); + + oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates); + + hashmap_free(*candidates); + *candidates = TAKE_PTR(new_candidates); + + return 0; +} + +static int acquire_managed_oom_connect(Manager *m) { + _cleanup_(varlink_close_unrefp) Varlink *link = NULL; + int r; + + assert(m); + assert(m->event); + + r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM); + if (r < 0) + return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m"); + + (void) varlink_set_userdata(link, m); + (void) 
varlink_set_description(link, "oomd"); + (void) varlink_set_relative_timeout(link, USEC_INFINITY); + + r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + r = varlink_bind_reply(link, process_managed_oom_reply); + if (r < 0) + return log_error_errno(r, "Failed to bind reply callback: %m"); + + r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL); + if (r < 0) + return log_error_errno(r, "Failed to observe varlink call: %m"); + + m->varlink_client = TAKE_PTR(link); + return 0; +} + +static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + usec_t usec_now; + int r; + + assert(s); + assert(!hashmap_isempty(m->monitored_swap_cgroup_contexts)); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer: %m"); + + r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer: %m"); + + /* Reconnect if our connection dropped */ + if (!m->varlink_client) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection: %m"); + } + + /* We still try to acquire system information for oomctl even if no units want swap monitoring */ + r = oomd_system_context_acquire("/proc/meminfo", &m->system_context); + /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */ + if (r < 0) + return log_error_errno(r, "Failed to acquire system context: %m"); + + /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the + * system context is used for deciding whether the swap threshold is hit. 
m->monitored_swap_cgroup_contexts + * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent + * nodes are the ones that matter). */ + + /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */ + if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) && + oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + _cleanup_free_ char *selected = NULL; + uint64_t threshold; + + log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and " + "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, + m->system_context.mem_used, m->system_context.mem_total, + m->system_context.swap_used, m->system_context.swap_total, + PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); + + r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); + + threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; + r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_notice_errno(r, "Failed to kill any cgroups based on swap: %m"); + else { + if (selected && r > 0) { + log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and " + "swap used (%"PRIu64") / total (%"PRIu64") being more than " + PERMYRIAD_AS_PERCENT_FORMAT_STR, + selected, + m->system_context.mem_used, m->system_context.mem_total, + m->system_context.swap_used, m->system_context.swap_total, + PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); + + /* send dbus signal */ + (void) sd_bus_emit_signal(m->bus, + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "Killed", 
+ "ss", + selected, + "memory-used"); + } + return 0; + } + } + + return 0; +} + +static void clear_candidate_hashmapp(Manager **m) { + if (*m) + hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates); +} + +static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we + * update the candidate data (in which case clear_candidates will be NULL). */ + _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata; + _cleanup_set_free_ Set *targets = NULL; + bool in_post_action_delay = false; + Manager *m = ASSERT_PTR(userdata); + usec_t usec_now; + int r; + + assert(s); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer: %m"); + + r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer: %m"); + + /* Reconnect if our connection dropped */ + if (!m->varlink_client) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection: %m"); + } + + /* Return early if nothing is requesting memory pressure monitoring */ + if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) + return 0; + + /* Update the cgroups used for detection/action */ + r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); + + /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale + * values and go on a kill storm. 
*/ + if (m->mem_pressure_post_action_delay_start > 0) { + if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) + in_post_action_delay = true; + else + m->mem_pressure_post_action_delay_start = 0; + } + + r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); + else if (r == 1 && !in_post_action_delay) { + OomdCGroupContext *t; + SET_FOREACH(t, targets) { + _cleanup_free_ char *selected = NULL; + + /* Check if there was reclaim activity in the given interval. The concern is the following case: + * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending + * cgroup. Even after this, well-behaved processes will fault in recently resident pages and + * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need + * to kill something (it won't help anyways). 
*/ + if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) + continue; + + log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", + t->path, + LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), + LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), + FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + else + clear_candidates = NULL; + + r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, + /* prefix= */ t->path, + /* dry_run= */ m->dry_run, + &selected); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path); + else { + /* Don't act on all the high pressure cgroups at once; return as soon as we kill one. + * If r == 0 then it means there were not eligible candidates, the candidate cgroup + * disappeared, or the candidate cgroup has no processes by the time we tried to kill + * it. In either case, go through the event loop again and select a new candidate if + * pressure is still high. 
*/ + m->mem_pressure_post_action_delay_start = usec_now; + if (selected && r > 0) { + log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" + " for > %s with reclaim activity", + selected, t->path, + LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), + LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), + FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + + /* send dbus signal */ + (void) sd_bus_emit_signal(m->bus, + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "Killed", + "ss", + selected, + "memory-pressure"); + } + return 0; + } + } + } else { + /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every + * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill + * might happen. + * Candidate cgroup data will continue to get updated during the post-action delay period in case + * pressure continues to be high after a kill. 
*/ + OomdCGroupContext *c; + HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) { + if (c->mem_pressure_limit_hit_start == 0) + continue; + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + else { + clear_candidates = NULL; + break; + } + } + } + + return 0; +} + +static int monitor_swap_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-swap-timer"); + + m->swap_context_event_source = TAKE_PTR(s); + return 0; +} + +static int monitor_memory_pressure_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_ON); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer"); + + m->mem_pressure_context_event_source = TAKE_PTR(s); + return 0; +} + +Manager* manager_free(Manager *m) { + assert(m); + + varlink_server_unref(m->varlink_server); + varlink_close_unref(m->varlink_client); + sd_event_source_unref(m->swap_context_event_source); + sd_event_source_unref(m->mem_pressure_context_event_source); + sd_event_unref(m->event); + 
+ bus_verify_polkit_async_registry_free(m->polkit_registry); + sd_bus_flush_close_unref(m->bus); + + hashmap_free(m->monitored_swap_cgroup_contexts); + hashmap_free(m->monitored_mem_pressure_cgroup_contexts); + hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); + + return mfree(m); +} + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new0(Manager, 1); + if (!m) + return -ENOMEM; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_set_watchdog(m->event, true); + + r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return r; + + m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_swap_cgroup_contexts) + return -ENOMEM; + + m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_mem_pressure_cgroup_contexts) + return -ENOMEM; + + m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_mem_pressure_cgroup_contexts_candidates) + return -ENOMEM; + + *ret = TAKE_PTR(m); + return 0; +} + +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom"); + if (r < 0) + return log_error_errno(r, "Failed to connect to bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + return 0; +} + +static int 
manager_varlink_init(Manager *m, int fd) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + assert(!m->varlink_server); + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_error_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_add_interface(s, &vl_interface_io_systemd_oom); + if (r < 0) + return log_error_errno(r, "Failed to add oom interface to varlink server: %m"); + + r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request); + if (r < 0) + return log_error_errno(r, "Failed to register varlink method: %m"); + + if (fd < 0) + r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666); + else + r = varlink_server_listen_fd(s, fd); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + log_debug("Initialized systemd-oomd varlink server"); + + m->varlink_server = TAKE_PTR(s); + return 0; +} + +int manager_start( + Manager *m, + bool dry_run, + int swap_used_limit_permyriad, + int mem_pressure_limit_permyriad, + usec_t mem_pressure_usec, + int fd) { + + unsigned long l, f; + int r; + + assert(m); + + m->dry_run = dry_run; + + m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? 
swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100; + assert(m->swap_used_limit_permyriad <= 10000); + + if (mem_pressure_limit_permyriad >= 0) { + assert(mem_pressure_limit_permyriad <= 10000); + + l = mem_pressure_limit_permyriad / 100; + f = mem_pressure_limit_permyriad % 100; + } else { + l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; + f = 0; + } + r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); + if (r < 0) + return r; + + m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; + + r = manager_connect_bus(m); + if (r < 0) + return r; + + r = acquire_managed_oom_connect(m); + if (r < 0) + return r; + + r = manager_varlink_init(m, fd); + if (r < 0) + return r; + + r = monitor_memory_pressure_contexts(m); + if (r < 0) + return r; + + r = monitor_swap_contexts(m); + if (r < 0) + return r; + + return 0; +} + +int manager_get_dump_string(Manager *m, char **ret) { + _cleanup_(memstream_done) MemStream ms = {}; + OomdCGroupContext *c; + FILE *f; + + assert(m); + assert(ret); + + f = memstream_init(&ms); + if (!f) + return -ENOMEM; + + fprintf(f, + "Dry Run: %s\n" + "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" + "Default Memory Pressure Limit: %lu.%02lu%%\n" + "Default Memory Pressure Duration: %s\n" + "System Context:\n", + yes_no(m->dry_run), + PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad), + LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit), + FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + oomd_dump_system_context(&m->system_context, f, "\t"); + + fprintf(f, "Swap Monitored CGroups:\n"); + HASHMAP_FOREACH(c, m->monitored_swap_cgroup_contexts) + oomd_dump_swap_cgroup_context(c, f, "\t"); + + fprintf(f, "Memory Pressure Monitored CGroups:\n"); + HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) + oomd_dump_memory_pressure_cgroup_context(c, f, "\t"); + + return memstream_finalize(&ms, 
ret, NULL); +} diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h new file mode 100644 index 0000000..8f0dd41 --- /dev/null +++ b/src/oom/oomd-manager.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-event.h" + +#include "conf-parser.h" +#include "oomd-util.h" +#include "varlink.h" + +/* Polling interval for monitoring stats */ +#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */ +/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */ +#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC) + +/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the + * percentage of time all tasks were delayed (i.e. unproductive). + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. */ +#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) +#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60 +#define DEFAULT_SWAP_USED_LIMIT_PERCENT 90 + +/* Only tackle candidates with large swap usage. */ +#define THRESHOLD_SWAP_USED_PERCENT 5 + +#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) +#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) + +typedef struct Manager Manager; + +struct Manager { + sd_bus *bus; + sd_event *event; + + Hashmap *polkit_registry; + + bool dry_run; + int swap_used_limit_permyriad; + loadavg_t default_mem_pressure_limit; + usec_t default_mem_pressure_duration_usec; + + /* k: cgroup paths -> v: OomdCGroupContext + * Used to detect when to take action. 
*/ + Hashmap *monitored_swap_cgroup_contexts; + Hashmap *monitored_mem_pressure_cgroup_contexts; + Hashmap *monitored_mem_pressure_cgroup_contexts_candidates; + + OomdSystemContext system_context; + + usec_t mem_pressure_post_action_delay_start; + + sd_event_source *swap_context_event_source; + sd_event_source *mem_pressure_context_event_source; + + /* This varlink object is used to manage the subscription from systemd-oomd to PID1 which it uses to + * listen for changes in ManagedOOM settings (oomd client - systemd server). */ + Varlink *varlink_client; + /* This varlink server object is used to manage systemd-oomd's varlink server which is used by user + * managers to report changes in ManagedOOM settings (oomd server - systemd client). */ + VarlinkServer *varlink_server; +}; + +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_new(Manager **ret); + +int manager_start(Manager *m, bool dry_run, int swap_used_limit_permyriad, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec, int fd); + +int manager_get_dump_string(Manager *m, char **ret); + +CONFIG_PARSER_PROTOTYPE(config_parse_oomd_default); diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c new file mode 100644 index 0000000..f9f0af2 --- /dev/null +++ b/src/oom/oomd-util.c @@ -0,0 +1,648 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "memstream-util.h" +#include "oomd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "procfs-util.h" +#include "signal-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "user-util.h" + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + oomd_cgroup_ctx_hash_ops, + char, + string_hash_func, + string_compare_func, + OomdCGroupContext, + oomd_cgroup_context_free); + +static int log_kill(const PidRef *pid, int sig, void *userdata) { + 
log_debug("oomd attempting to kill " PID_FMT " with %s", pid->pid, signal_to_string(sig)); + return 0; +} + +static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) { + _cleanup_free_ char *value = NULL; + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t curr_count = 0; + int r; + + assert(path); + assert(xattr); + + r = cg_get_xattr_malloc(path, xattr, &value); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + return r; + + if (!isempty(value)) { + r = safe_atou64(value, &curr_count); + if (r < 0) + return r; + } + + if (curr_count > UINT64_MAX - num_procs_killed) + return -EOVERFLOW; + + xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed); + r = cg_set_xattr(path, xattr, buf, strlen(buf), 0); + if (r < 0) + return r; + + return 0; +} + +OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { + if (!ctx) + return NULL; + + free(ctx->path); + return mfree(ctx); +} + +int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { + _cleanup_set_free_ Set *targets = NULL; + OomdCGroupContext *ctx; + char *key; + int r; + + assert(h); + assert(ret); + + targets = set_new(NULL); + if (!targets) + return -ENOMEM; + + HASHMAP_FOREACH_KEY(ctx, key, h) { + if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) { + usec_t diff; + + if (ctx->mem_pressure_limit_hit_start == 0) + ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC); + + diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start; + if (diff >= duration) { + r = set_put(targets, ctx); + if (r < 0) + return -ENOMEM; + } + } else + ctx->mem_pressure_limit_hit_start = 0; + } + + if (!set_isempty(targets)) { + *ret = TAKE_PTR(targets); + return 1; + } + + *ret = NULL; + return 0; +} + +uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) { + uint64_t last_pgscan; + + assert(c); + + /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. 
+ * pgscan is monotonic and in practice should not decrease (except in the recreation case). */ + last_pgscan = c->last_pgscan; + if (c->last_pgscan > c->pgscan) { + log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.", + c->last_pgscan, c->pgscan, c->path); + last_pgscan = 0; + } + + return c->pgscan - last_pgscan; +} + +bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad) { + uint64_t mem_threshold; + + assert(ctx); + assert(threshold_permyriad <= 10000); + + mem_threshold = ctx->mem_total * threshold_permyriad / (uint64_t) 10000; + return LESS_BY(ctx->mem_total, ctx->mem_used) < mem_threshold; +} + +bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) { + uint64_t swap_threshold; + + assert(ctx); + assert(threshold_permyriad <= 10000); + + swap_threshold = ctx->swap_total * threshold_permyriad / (uint64_t) 10000; + return (ctx->swap_total - ctx->swap_used) < swap_threshold; +} + +int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix) { + uid_t uid; + int r; + + assert(ctx); + + prefix = empty_to_root(prefix); + + if (!path_startswith(ctx->path, prefix)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s is not a descendant of %s", ctx->path, prefix); + + r = cg_get_owner(ctx->path, &uid); + if (r < 0) + return log_debug_errno(r, "Failed to get owner/group from %s: %m", ctx->path); + + if (uid != 0) { + uid_t prefix_uid; + + r = cg_get_owner(prefix, &prefix_uid); + if (r < 0) + return log_debug_errno(r, "Failed to get owner/group from %s: %m", prefix); + + if (uid != prefix_uid) { + ctx->preference = MANAGED_OOM_PREFERENCE_NONE; + return 0; + } + } + + /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used + * as an optional feature of systemd-oomd (and the system might not even support them). 
*/ + r = cg_get_xattr_bool(ctx->path, "user.oomd_avoid"); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + log_debug_errno(r, "Failed to get xattr user.oomd_avoid, ignoring: %m"); + ctx->preference = r > 0 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference; + + r = cg_get_xattr_bool(ctx->path, "user.oomd_omit"); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + log_debug_errno(r, "Failed to get xattr user.oomd_omit, ignoring: %m"); + ctx->preference = r > 0 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference; + + return 0; +} + +int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + OomdCGroupContext *item; + size_t k = 0; + int r; + + assert(h); + assert(compare_func); + assert(ret); + + sorted = new0(OomdCGroupContext*, hashmap_size(h)); + if (!sorted) + return -ENOMEM; + + HASHMAP_FOREACH(item, h) { + /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */ + if (item->path && prefix && !path_startswith(item->path, prefix)) + continue; + + r = oomd_fetch_cgroup_oom_preference(item, prefix); + if (r == -ENOMEM) + return r; + + if (item->preference == MANAGED_OOM_PREFERENCE_OMIT) + continue; + + sorted[k++] = item; + } + + typesafe_qsort(sorted, k, compare_func); + + *ret = TAKE_PTR(sorted); + + assert(k <= INT_MAX); + return (int) k; +} + +int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { + _cleanup_set_free_ Set *pids_killed = NULL; + int r; + + assert(path); + + if (dry_run) { + _cleanup_free_ char *cg_path = NULL; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path); + if (r < 0) + return r; + + log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse)); + return 0; + } + + pids_killed = set_new(NULL); + if (!pids_killed) + return -ENOMEM; + + r = 
increment_oomd_xattr(path, "user.oomd_ooms", 1); + if (r < 0) + log_debug_errno(r, "Failed to set user.oomd_ooms before kill: %m"); + + if (recurse) + r = cg_kill_recursive(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); + else + r = cg_kill(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); + + /* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before + * we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and + * was removed between picking candidates and coming into this function. In either case, let's log + * about it let the caller decide what to do once they know how many PIDs were killed. */ + if (IN_SET(r, -ENOENT, -ENODEV)) + log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path); + else if (r < 0) + return r; + + if (set_isempty(pids_killed)) + log_debug("Nothing killed when attempting to kill %s", path); + + r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed)); + if (r < 0) + log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m"); + + return set_size(pids_killed) != 0; +} + +typedef void (*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix); + +static int dump_kill_candidates(OomdCGroupContext **sorted, int n, int dump_until, dump_candidate_func dump_func) { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + /* Try dumping top offendors, ignoring any errors that might happen. 
*/ + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + fprintf(f, "Considered %d cgroups for killing, top candidates were:\n", n); + for (int i = 0; i < dump_until; i++) + dump_func(sorted[i], f, "\t"); + + return memstream_dump(LOG_INFO, &m); +} + +int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + int n, r, ret = 0; + int dump_until; + + assert(h); + assert(ret_selected); + + n = oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, prefix, &sorted); + if (n < 0) + return n; + + dump_until = MIN(n, DUMP_ON_KILL_COUNT); + for (int i = 0; i < n; i++) { + /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. + * Continue since there might be "avoid" cgroups at the end. */ + if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) + continue; + + r = oomd_cgroup_kill(sorted[i]->path, /* recurse= */ true, /* dry_run= */ dry_run); + if (r == -ENOMEM) + return r; /* Treat oom as a hard error */ + if (r < 0) { + if (ret == 0) + ret = r; + continue; /* Try to find something else to kill */ + } + + dump_until = MAX(dump_until, i + 1); + char *selected = strdup(sorted[i]->path); + if (!selected) + return -ENOMEM; + *ret_selected = selected; + ret = r; + break; + } + + dump_kill_candidates(sorted, n, dump_until, oomd_dump_memory_pressure_cgroup_context); + + return ret; +} + +int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + int n, r, ret = 0; + int dump_until; + + assert(h); + assert(ret_selected); + + n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted); + if (n < 0) + return n; + + dump_until = MIN(n, DUMP_ON_KILL_COUNT); + /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with + * no swap usage. 
Threshold killing only cgroups with more than threshold swap usage. */ + for (int i = 0; i < n; i++) { + /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid" + * cgroups at the end. */ + if (sorted[i]->swap_usage <= threshold_usage) + continue; + + r = oomd_cgroup_kill(sorted[i]->path, /* recurse= */ true, /* dry_run= */ dry_run); + if (r == -ENOMEM) + return r; /* Treat oom as a hard error */ + if (r < 0) { + if (ret == 0) + ret = r; + continue; /* Try to find something else to kill */ + } + + dump_until = MAX(dump_until, i + 1); + char *selected = strdup(sorted[i]->path); + if (!selected) + return -ENOMEM; + *ret_selected = selected; + ret = r; + break; + } + + dump_kill_candidates(sorted, n, dump_until, oomd_dump_swap_cgroup_context); + + return ret; +} + +int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *p = NULL, *val = NULL; + bool is_root; + int r; + + assert(path); + assert(ret); + + ctx = new0(OomdCGroupContext, 1); + if (!ctx) + return -ENOMEM; + + is_root = empty_or_root(path); + ctx->preference = MANAGED_OOM_PREFERENCE_NONE; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p); + if (r < 0) + return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path); + + r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure); + if (r < 0) + return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p); + + if (is_root) { + r = procfs_memory_get_used(&ctx->current_memory_usage); + if (r < 0) + return log_debug_errno(r, "Error getting memory used from procfs: %m"); + } else { + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage); + if (r < 0) + return log_debug_errno(r, "Error getting memory.current from %s: %m", path); + + r = 
cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min); + if (r < 0) + return log_debug_errno(r, "Error getting memory.min from %s: %m", path); + + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low); + if (r < 0) + return log_debug_errno(r, "Error getting memory.low from %s: %m", path); + + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage); + if (r == -ENODATA) + /* The kernel can be compiled without support for memory.swap.* files, + * or it can be disabled with boot param 'swapaccount=0' */ + log_once(LOG_WARNING, "No kernel support for memory.swap.current from %s (try boot param swapaccount=1), ignoring.", path); + else if (r < 0) + return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path); + + r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val); + if (r < 0) + return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path); + + r = safe_atou64(val, &ctx->pgscan); + if (r < 0) + return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m"); + } + + ctx->path = strdup(empty_to_root(path)); + if (!ctx->path) + return -ENOMEM; + + *ret = TAKE_PTR(ctx); + return 0; +} + +int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret) { + _cleanup_fclose_ FILE *f = NULL; + unsigned field_filled = 0; + OomdSystemContext ctx = {}; + uint64_t mem_available, swap_free; + int r; + + enum { + MEM_TOTAL = 1U << 0, + MEM_AVAILABLE = 1U << 1, + SWAP_TOTAL = 1U << 2, + SWAP_FREE = 1U << 3, + ALL = MEM_TOTAL|MEM_AVAILABLE|SWAP_TOTAL|SWAP_FREE, + }; + + assert(proc_meminfo_path); + assert(ret); + + f = fopen(proc_meminfo_path, "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + char *word; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + return 
-EINVAL; + + if ((word = startswith(line, "MemTotal:"))) { + field_filled |= MEM_TOTAL; + r = convert_meminfo_value_to_uint64_bytes(word, &ctx.mem_total); + } else if ((word = startswith(line, "MemAvailable:"))) { + field_filled |= MEM_AVAILABLE; + r = convert_meminfo_value_to_uint64_bytes(word, &mem_available); + } else if ((word = startswith(line, "SwapTotal:"))) { + field_filled |= SWAP_TOTAL; + r = convert_meminfo_value_to_uint64_bytes(word, &ctx.swap_total); + } else if ((word = startswith(line, "SwapFree:"))) { + field_filled |= SWAP_FREE; + r = convert_meminfo_value_to_uint64_bytes(word, &swap_free); + } else + continue; + + if (r < 0) + return log_debug_errno(r, "Error converting '%s' from %s to uint64_t: %m", line, proc_meminfo_path); + + if (field_filled == ALL) + break; + } + + if (field_filled != ALL) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "%s is missing expected fields", proc_meminfo_path); + + if (mem_available > ctx.mem_total) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "MemAvailable (%" PRIu64 ") cannot be greater than MemTotal (%" PRIu64 ") %m", + mem_available, + ctx.mem_total); + + if (swap_free > ctx.swap_total) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "SwapFree (%" PRIu64 ") cannot be greater than SwapTotal (%" PRIu64 ") %m", + swap_free, + ctx.swap_total); + + ctx.mem_used = ctx.mem_total - mem_available; + ctx.swap_used = ctx.swap_total - swap_free; + + *ret = ctx; + return 0; +} + +int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL; + OomdCGroupContext *old_ctx; + int r; + + assert(new_h); + assert(path); + + path = empty_to_root(path); + + r = oomd_cgroup_context_acquire(path, &curr_ctx); + if (r < 0) + return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path); + + assert_se(streq(path, curr_ctx->path)); + + old_ctx = hashmap_get(old_h, path); + if (old_ctx) { + curr_ctx->last_pgscan = 
old_ctx->pgscan; + curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; + curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + } + + if (oomd_pgscan_rate(curr_ctx) > 0) + curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); + + r = hashmap_put(new_h, curr_ctx->path, curr_ctx); + if (r < 0) + return r; + + TAKE_PTR(curr_ctx); + return 0; +} + +void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) { + OomdCGroupContext *ctx; + + assert(old_h); + assert(curr_h); + + HASHMAP_FOREACH(ctx, curr_h) { + OomdCGroupContext *old_ctx; + + old_ctx = hashmap_get(old_h, ctx->path); + if (!old_ctx) + continue; + + ctx->last_pgscan = old_ctx->pgscan; + ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; + ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + + if (oomd_pgscan_rate(ctx) > 0) + ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); + } +} + +void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { + assert(ctx); + assert(f); + + if (!empty_or_root(ctx->path)) + fprintf(f, + "%sPath: %s\n" + "%s\tSwap Usage: %s\n", + strempty(prefix), ctx->path, + strempty(prefix), FORMAT_BYTES(ctx->swap_usage)); + else + fprintf(f, + "%sPath: %s\n" + "%s\tSwap Usage: (see System Context)\n", + strempty(prefix), ctx->path, + strempty(prefix)); +} + +void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { + assert(ctx); + assert(f); + + fprintf(f, + "%sPath: %s\n" + "%s\tMemory Pressure Limit: %lu.%02lu%%\n" + "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" + "%s\tCurrent Memory Usage: %s\n", + strempty(prefix), ctx->path, + strempty(prefix), LOADAVG_INT_SIDE(ctx->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(ctx->mem_pressure_limit), + strempty(prefix), + 
LOADAVG_INT_SIDE(ctx->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg10), + LOADAVG_INT_SIDE(ctx->memory_pressure.avg60), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg60), + LOADAVG_INT_SIDE(ctx->memory_pressure.avg300), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg300), + FORMAT_TIMESPAN(ctx->memory_pressure.total, USEC_PER_SEC), + strempty(prefix), FORMAT_BYTES(ctx->current_memory_usage)); + + if (!empty_or_root(ctx->path)) + fprintf(f, + "%s\tMemory Min: %s\n" + "%s\tMemory Low: %s\n" + "%s\tPgscan: %" PRIu64 "\n" + "%s\tLast Pgscan: %" PRIu64 "\n", + strempty(prefix), FORMAT_BYTES_CGROUP_PROTECTION(ctx->memory_min), + strempty(prefix), FORMAT_BYTES_CGROUP_PROTECTION(ctx->memory_low), + strempty(prefix), ctx->pgscan, + strempty(prefix), ctx->last_pgscan); +} + +void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) { + assert(ctx); + assert(f); + + fprintf(f, + "%sMemory: Used: %s Total: %s\n" + "%sSwap: Used: %s Total: %s\n", + strempty(prefix), + FORMAT_BYTES(ctx->mem_used), + FORMAT_BYTES(ctx->mem_total), + strempty(prefix), + FORMAT_BYTES(ctx->swap_used), + FORMAT_BYTES(ctx->swap_total)); +} diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h new file mode 100644 index 0000000..f53e4c4 --- /dev/null +++ b/src/oom/oomd-util.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "cgroup-util.h" +#include "hashmap.h" +#include "psi-util.h" + +#define DUMP_ON_KILL_COUNT 10 +#define GROWING_SIZE_PERCENTILE 80 + +extern const struct hash_ops oomd_cgroup_ctx_hash_ops; + +typedef struct OomdCGroupContext OomdCGroupContext; +typedef struct OomdSystemContext OomdSystemContext; + +typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *); + +struct OomdCGroupContext { + char *path; + + ResourcePressure memory_pressure; + + uint64_t current_memory_usage; + + uint64_t memory_min; + uint64_t memory_low; + uint64_t swap_usage; + + 
/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
 * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
 * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
 * below the limit.
 * Returns 0 and sets `ret` to NULL if no entries exceeded limits for `duration`.
 * Returns -ENOMEM for allocation errors. */
int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
*/ +static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + uint64_t diff1, diff2; + int r; + + assert(c1); + assert(c2); + + r = CMP((*c1)->preference, (*c2)->preference); + if (r != 0) + return r; + + diff1 = oomd_pgscan_rate(*c1); + diff2 = oomd_pgscan_rate(*c2); + r = CMP(diff2, diff1); + if (r != 0) + return r; + + return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage); +} + +static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + int r; + + assert(c1); + assert(c2); + + r = CMP((*c1)->preference, (*c2)->preference); + if (r != 0) + return r; + + return CMP((*c2)->swap_usage, (*c1)->swap_usage); +} + +/* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`. + * If `prefix` is not NULL, only include OomdCGroupContexts whose paths start with prefix. Otherwise all paths are sorted. + * Returns the number of sorted items; negative on error. */ +int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret); + +/* If the cgroup is owned by root, or the cgroups represented by `ctx` and + * `prefix` are owned by the same user, then set `ctx->preference` using the + * `user.oomd_avoid` and `user.oomd_omit` xattrs. Otherwise, set + * `ctx->preference` to MANAGED_OOM_PREFERENCE_NONE. + * + * If `prefix` is NULL or the empty string, it is treated as root. If `prefix` + * does not specify an ancestor cgroup of `ctx`, -EINVAL is returned. Returns + * negative on all other errors. */ +int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix); + +/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */ +int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run); + +/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. 
*/ +/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise, + * everything in `h` is a candidate. + * Returns the killed cgroup in ret_selected. */ +int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected); +int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected); + +int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret); +int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret); + +/* Get the OomdCGroupContext of `path` and insert it into `new_h`. The key for the inserted context will be `path`. + * + * `old_h` is used to get data used to calculate prior interval information. `old_h` can be NULL in which case there + * was no prior data to reference. */ +int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path); + +/* Update each OomdCGroupContext in `curr_h` with prior interval information from `old_h`. 
*/ +void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h); + +void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix); +void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix); +void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix); diff --git a/src/oom/oomd.c b/src/oom/oomd.c new file mode 100644 index 0000000..ecc2eda --- /dev/null +++ b/src/oom/oomd.c @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "build.h" +#include "bus-log-control-api.h" +#include "bus-object.h" +#include "cgroup-util.h" +#include "conf-parser.h" +#include "daemon-util.h" +#include "fileio.h" +#include "log.h" +#include "main-func.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "psi-util.h" +#include "signal-util.h" + +static bool arg_dry_run = false; +static int arg_swap_used_limit_permyriad = -1; +static int arg_mem_pressure_limit_permyriad = -1; +static usec_t arg_mem_pressure_usec = 0; + +static int parse_config(void) { + static const ConfigTableItem items[] = { + { "OOM", "SwapUsedLimit", config_parse_permyriad, 0, &arg_swap_used_limit_permyriad }, + { "OOM", "DefaultMemoryPressureLimit", config_parse_permyriad, 0, &arg_mem_pressure_limit_permyriad }, + { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, + {} + }; + + return config_parse_config_file("oomd.conf", "OOM\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, NULL); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-oomd", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "Run the userspace out-of-memory (OOM) killer.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --dry-run Only print 
destructive actions instead of doing them\n" + " --bus-introspect=PATH Write D-Bus XML introspection data\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_DRY_RUN, + ARG_BUS_INTROSPECT, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "dry-run", no_argument, NULL, ARG_DRY_RUN }, + { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_DRY_RUN: + arg_dry_run = true; + break; + + case ARG_BUS_INTROSPECT: + return bus_introspect_implementations( + stdout, + optarg, + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object)); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program takes no arguments."); + + return 1; +} + +static int run(int argc, char *argv[]) { + _unused_ _cleanup_(notify_on_cleanup) const char *notify_msg = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_free_ char *swap = NULL; + unsigned long long s = 0; + CGroupMask mask; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = parse_config(); + if (r < 0) + return r; + + /* Do some basic requirement checks for running systemd-oomd. It's not exhaustive as some of the other + * requirements do not have a reliable means to check for in code. */ + + int n = sd_listen_fds(0); + if (n > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Received too many file descriptors"); + + int fd = n == 1 ? 
SD_LISTEN_FDS_START : -1; + + /* SwapTotal is always available in /proc/meminfo and defaults to 0, even on swap-disabled kernels. */ + r = get_proc_field("/proc/meminfo", "SwapTotal", WHITESPACE, &swap); + if (r < 0) + return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m"); + + r = safe_atollu(swap, &s); + if (r < 0 || s == 0) + log_warning("No swap; memory pressure usage will be degraded"); + + if (!is_pressure_supported()) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported"); + + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the unified cgroups hierarchy"); + + r = cg_mask_supported(&mask); + if (r < 0) + return log_error_errno(r, "Failed to get supported cgroup controllers: %m"); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the cgroup memory controller."); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s"); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + + r = manager_start( + m, + arg_dry_run, + arg_swap_used_limit_permyriad, + arg_mem_pressure_limit_permyriad, + arg_mem_pressure_usec, + fd); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + + notify_msg = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + log_debug("systemd-oomd started%s.", arg_dry_run ? 
" in dry run mode" : ""); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf new file mode 100644 index 0000000..1c8fa76 --- /dev/null +++ b/src/oom/oomd.conf @@ -0,0 +1,22 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/oomd.conf.d/ directory. The latter is generally recommended. +# Defaults can be restored by simply deleting the main configuration file and +# all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/oomd.conf' to display the full config. +# +# See oomd.conf(5) for details + +[OOM] +#SwapUsedLimit=90% +#DefaultMemoryPressureLimit=60% +#DefaultMemoryPressureDurationSec=30s diff --git a/src/oom/org.freedesktop.oom1.conf b/src/oom/org.freedesktop.oom1.conf new file mode 100644 index 0000000..d00bdcd --- /dev/null +++ b/src/oom/org.freedesktop.oom1.conf @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/oom/org.freedesktop.oom1.service b/src/oom/org.freedesktop.oom1.service new file mode 100644 index 0000000..4fd5138 --- /dev/null +++ b/src/oom/org.freedesktop.oom1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. 
+# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.oom1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.oom1.service diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c new file mode 100644 index 0000000..1aef603 --- /dev/null +++ b/src/oom/test-oomd-util.c @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "oomd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static int fork_and_sleep(unsigned sleep_min) { + usec_t n, timeout, ts; + + pid_t pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + timeout = sleep_min * USEC_PER_MINUTE; + ts = now(CLOCK_MONOTONIC); + for (;;) { + n = now(CLOCK_MONOTONIC); + if (ts + timeout < n) { + log_error("Child timed out waiting to be killed"); + abort(); + } + sleep(1); + } + } + + return pid; +} + +static void test_oomd_cgroup_kill(void) { + _cleanup_free_ char *cgroup_root = NULL, *cgroup = NULL; + int pid[2]; + int r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup_root) >= 0); + + /* Create another cgroup below this one for the pids we forked off. We need this to be managed + * by the test so that pid1 doesn't delete it before we can read the xattrs. 
*/ + cgroup = path_join(cgroup_root, "oomdkilltest"); + assert_se(cgroup); + assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, cgroup) >= 0); + + /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities */ + r = cg_set_xattr(cgroup, "user.oomd_test", "test", 4, 0); + if (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r)) + return (void) log_tests_skipped("Cannot set user xattrs"); + + /* Do this twice to also check the increment behavior on the xattrs */ + for (int i = 0; i < 2; i++) { + _cleanup_free_ char *v = NULL; + + for (int j = 0; j < 2; j++) { + pid[j] = fork_and_sleep(5); + assert_se(cg_attach(SYSTEMD_CGROUP_CONTROLLER, cgroup, pid[j]) >= 0); + } + + r = oomd_cgroup_kill(cgroup, false /* recurse */, false /* dry run */); + if (r <= 0) { + log_debug_errno(r, "Failed to kill processes under %s: %m", cgroup); + abort(); + } + + assert_se(cg_get_xattr_malloc(cgroup, "user.oomd_ooms", &v) >= 0); + assert_se(streq(v, i == 0 ? "1" : "2")); + v = mfree(v); + + /* Wait a bit since processes may take some time to be cleaned up. */ + sleep(2); + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true); + + assert_se(cg_get_xattr_malloc(cgroup, "user.oomd_kill", &v) >= 0); + assert_se(streq(v, i == 0 ? 
"2" : "4")); + } +} + +static void test_oomd_cgroup_context_acquire_and_insert(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL; + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + OomdCGroupContext *c1, *c2; + CGroupMask mask; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (!is_pressure_supported()) + return (void) log_tests_skipped("system does not support pressure"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_mask_supported(&mask) >= 0); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return (void) log_tests_skipped("cgroup memory controller is not available"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(streq(ctx->path, cgroup)); + assert_se(ctx->current_memory_usage > 0); + assert_se(ctx->memory_min == 0); + assert_se(ctx->memory_low == 0); + assert_se(ctx->swap_usage == 0); + assert_se(ctx->last_pgscan == 0); + assert_se(ctx->pgscan == 0); + ctx = oomd_cgroup_context_free(ctx); + + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + assert_se(streq(ctx->path, "/")); + assert_se(ctx->current_memory_usage > 0); + + /* Test hashmap inserts */ + assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); + assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == 0); + c1 = hashmap_get(h1, cgroup); + assert_se(c1); + assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST); + + /* make sure certain values from h1 get updated in h2 */ + c1->pgscan = UINT64_MAX; + c1->mem_pressure_limit = 6789; + c1->mem_pressure_limit_hit_start = 42; + c1->last_had_mem_reclaim = 888; + assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); + assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0); + c1 = hashmap_get(h1, cgroup); + c2 = hashmap_get(h2, cgroup); + assert_se(c1); + assert_se(c2); + 
assert_se(c1 != c2); + assert_se(c2->last_pgscan == UINT64_MAX); + assert_se(c2->mem_pressure_limit == 6789); + assert_se(c2->mem_pressure_limit_hit_start == 42); + assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */ +} + +static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { + _cleanup_hashmap_free_ Hashmap *h_old = NULL, *h_new = NULL; + OomdCGroupContext *c_old, *c_new; + char **paths = STRV_MAKE("/0.slice", + "/1.slice"); + + OomdCGroupContext ctx_old[2] = { + { .path = paths[0], + .mem_pressure_limit = 5, + .mem_pressure_limit_hit_start = 777, + .last_had_mem_reclaim = 888, + .pgscan = 57 }, + { .path = paths[1], + .mem_pressure_limit = 6, + .mem_pressure_limit_hit_start = 888, + .last_had_mem_reclaim = 888, + .pgscan = 42 }, + }; + + OomdCGroupContext ctx_new[2] = { + { .path = paths[0], + .pgscan = 57 }, + { .path = paths[1], + .pgscan = 101 }, + }; + + assert_se(h_old = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h_old, paths[0], &ctx_old[0]) >= 0); + assert_se(hashmap_put(h_old, paths[1], &ctx_old[1]) >= 0); + + assert_se(h_new = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h_new, paths[0], &ctx_new[0]) >= 0); + assert_se(hashmap_put(h_new, paths[1], &ctx_new[1]) >= 0); + + oomd_update_cgroup_contexts_between_hashmaps(h_old, h_new); + + assert_se(c_old = hashmap_get(h_old, "/0.slice")); + assert_se(c_new = hashmap_get(h_new, "/0.slice")); + assert_se(c_old->pgscan == c_new->last_pgscan); + assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); + assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim); + + assert_se(c_old = hashmap_get(h_old, "/1.slice")); + assert_se(c_new = hashmap_get(h_new, "/1.slice")); + assert_se(c_old->pgscan == c_new->last_pgscan); + assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); + 
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim); +} + +static void test_oomd_system_context_acquire(void) { + _cleanup_(unlink_tempfilep) char path[] = "/tmp/oomdgetsysctxtestXXXXXX"; + _cleanup_close_ int fd = -EBADF; + OomdSystemContext ctx; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + assert_se((fd = mkostemp_safe(path)) >= 0); + + assert_se(oomd_system_context_acquire("/verylikelynonexistentpath", &ctx) == -ENOENT); + + assert_se(oomd_system_context_acquire(path, &ctx) == -EINVAL); + + assert_se(write_string_file(path, "some\nwords\nacross\nmultiple\nlines", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == -EINVAL); + + assert_se(write_string_file(path, "MemTotal: 32495256 kB trailing\n" + "MemFree: 9880512 kB data\n" + "SwapTotal: 8388604 kB is\n" + "SwapFree: 7604 kB bad\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == -EINVAL); + + assert_se(write_string_file(path, "MemTotal: 32495256 kB\n" + "MemFree: 9880512 kB\n" + "MemAvailable: 21777088 kB\n" + "Buffers: 5968 kB\n" + "Cached: 14344796 kB\n" + "Unevictable: 740004 kB\n" + "Mlocked: 4484 kB\n" + "SwapTotal: 8388604 kB\n" + "SwapFree: 7604 kB\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == 0); + assert_se(ctx.mem_total == 33275142144); + assert_se(ctx.mem_used == 10975404032); + assert_se(ctx.swap_total == 8589930496); + assert_se(ctx.swap_used == 8582144000); +} + +static void test_oomd_pressure_above(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL; + _cleanup_set_free_ Set *t1 = NULL, *t2 = NULL, *t3 = NULL; + OomdCGroupContext ctx[2] = {}, *c; + loadavg_t threshold; + + assert_se(store_loadavg_fixed_point(80, 0, &threshold) == 0); + + /* /herp.slice */ + assert_se(store_loadavg_fixed_point(99, 99, 
&(ctx[0].memory_pressure.avg10)) == 0); + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0); + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0); + ctx[0].mem_pressure_limit = threshold; + + /* /derp.slice */ + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0); + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0); + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0); + ctx[1].mem_pressure_limit = threshold; + + /* High memory pressure */ + assert_se(h1 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0); + assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1); + assert_se(set_contains(t1, &ctx[0])); + assert_se(c = hashmap_get(h1, "/herp.slice")); + assert_se(c->mem_pressure_limit_hit_start > 0); + + /* Low memory pressure */ + assert_se(h2 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0); + assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0); + assert_se(!t2); + assert_se(c = hashmap_get(h2, "/derp.slice")); + assert_se(c->mem_pressure_limit_hit_start == 0); + + /* High memory pressure w/ multiple cgroups */ + assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0); + assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1); + assert_se(set_contains(t3, &ctx[0])); + assert_se(set_size(t3) == 1); + assert_se(c = hashmap_get(h1, "/herp.slice")); + assert_se(c->mem_pressure_limit_hit_start > 0); + assert_se(c = hashmap_get(h1, "/derp.slice")); + assert_se(c->mem_pressure_limit_hit_start == 0); +} + +static void test_oomd_mem_and_swap_free_below(void) { + OomdSystemContext ctx = (OomdSystemContext) { + .mem_total = UINT64_C(20971512) * 1024U, + .mem_used = UINT64_C(3310136) * 1024U, + .swap_total = UINT64_C(20971512) * 1024U, + .swap_used = UINT64_C(20971440) * 1024U, + }; + 
assert_se(oomd_mem_available_below(&ctx, 2000) == false); + assert_se(oomd_swap_free_below(&ctx, 2000) == true); + + ctx = (OomdSystemContext) { + .mem_total = UINT64_C(20971512) * 1024U, + .mem_used = UINT64_C(20971440) * 1024U, + .swap_total = UINT64_C(20971512) * 1024U, + .swap_used = UINT64_C(3310136) * 1024U, + }; + assert_se(oomd_mem_available_below(&ctx, 2000) == true); + assert_se(oomd_swap_free_below(&ctx, 2000) == false); + + ctx = (OomdSystemContext) { + .mem_total = 0, + .mem_used = 0, + .swap_total = 0, + .swap_used = 0, + }; + assert_se(oomd_mem_available_below(&ctx, 2000) == false); + assert_se(oomd_swap_free_below(&ctx, 2000) == false); +} + +static void test_oomd_sort_cgroups(void) { + _cleanup_hashmap_free_ Hashmap *h = NULL; + _cleanup_free_ OomdCGroupContext **sorted_cgroups = NULL; /* _cleanup_ variables must never be left indeterminate */ + char **paths = STRV_MAKE("/herp.slice", + "/herp.slice/derp.scope", + "/herp.slice/derp.scope/sheep.service", + "/zupa.slice", + "/boop.slice", + "/omitted.slice", + "/avoid.slice"); + + OomdCGroupContext ctx[7] = { + { .path = paths[0], + .swap_usage = 20, + .last_pgscan = 0, + .pgscan = 33, + .current_memory_usage = 10 }, + { .path = paths[1], + .swap_usage = 60, + .last_pgscan = 33, + .pgscan = 1, + .current_memory_usage = 20 }, + { .path = paths[2], + .swap_usage = 40, + .last_pgscan = 1, + .pgscan = 33, + .current_memory_usage = 40 }, + { .path = paths[3], + .swap_usage = 10, + .last_pgscan = 33, + .pgscan = 2, + .current_memory_usage = 10 }, + { .path = paths[4], + .swap_usage = 11, + .last_pgscan = 33, + .pgscan = 33, + .current_memory_usage = 10 }, + { .path = paths[5], + .swap_usage = 90, + .last_pgscan = 0, + .pgscan = UINT64_MAX, + .preference = MANAGED_OOM_PREFERENCE_OMIT }, + { .path = paths[6], + .swap_usage = 99, + .last_pgscan = 0, + .pgscan = UINT64_MAX, + .preference = MANAGED_OOM_PREFERENCE_AVOID }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); + + assert_se(hashmap_put(h, "/herp.slice", &ctx[0]) >= 0); + assert_se(hashmap_put(h, 
"/herp.slice/derp.scope", &ctx[1]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0); + assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0); + assert_se(hashmap_put(h, "/boop.slice", &ctx[4]) >= 0); + assert_se(hashmap_put(h, "/omitted.slice", &ctx[5]) >= 0); + assert_se(hashmap_put(h, "/avoid.slice", &ctx[6]) >= 0); + + assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 6); + assert_se(sorted_cgroups[0] == &ctx[1]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[0]); + assert_se(sorted_cgroups[3] == &ctx[4]); + assert_se(sorted_cgroups[4] == &ctx[3]); + assert_se(sorted_cgroups[5] == &ctx[6]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, NULL, &sorted_cgroups) == 6); + assert_se(sorted_cgroups[0] == &ctx[0]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[3]); + assert_se(sorted_cgroups[3] == &ctx[1]); + assert_se(sorted_cgroups[4] == &ctx[4]); + assert_se(sorted_cgroups[5] == &ctx[6]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2); + assert_se(sorted_cgroups[0] == &ctx[2]); + assert_se(sorted_cgroups[1] == &ctx[1]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); + assert_se(sorted_cgroups[4] == 0); + assert_se(sorted_cgroups[5] == 0); + assert_se(sorted_cgroups[6] == 0); + sorted_cgroups = mfree(sorted_cgroups); +} + +static void test_oomd_fetch_cgroup_oom_preference(void) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + ManagedOOMPreference root_pref; + CGroupMask mask; + bool test_xattrs; + int root_xattrs, r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (!is_pressure_supported()) + return (void) 
log_tests_skipped("system does not support pressure"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_mask_supported(&mask) >= 0); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return (void) log_tests_skipped("cgroup memory controller is not available"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities + * so skip the xattr portions of the test. */ + r = cg_set_xattr(cgroup, "user.oomd_test", "1", 1, 0); + test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r); + + if (test_xattrs) { + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(cg_set_xattr(cgroup, "user.oomd_omit", "1", 1, 0) >= 0); + assert_se(cg_set_xattr(cgroup, "user.oomd_avoid", "1", 1, 0) >= 0); + + /* omit takes precedence over avoid when both are set to true */ + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT); + } else { + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) < 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE); + } + ctx = oomd_cgroup_context_free(ctx); + + /* also check when only avoid is set to true */ + if (test_xattrs) { + assert_se(cg_set_xattr(cgroup, "user.oomd_omit", "0", 1, 0) >= 0); + assert_se(cg_set_xattr(cgroup, "user.oomd_avoid", "1", 1, 0) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID); + ctx = oomd_cgroup_context_free(ctx); + } + + /* Test the root cgroup */ + /* Root cgroup is live and not made on demand like the cgroup the test runs in. It can have varying + * xattrs set already so let's read in the booleans first to get the final preference value. 
*/ + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + root_xattrs = cg_get_xattr_bool("", "user.oomd_avoid"); + root_pref = root_xattrs > 0 ? MANAGED_OOM_PREFERENCE_AVOID : MANAGED_OOM_PREFERENCE_NONE; + root_xattrs = cg_get_xattr_bool("", "user.oomd_omit"); + root_pref = root_xattrs > 0 ? MANAGED_OOM_PREFERENCE_OMIT : root_pref; /* omit takes precedence over avoid, as asserted above */ + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == root_pref); + + assert_se(oomd_fetch_cgroup_oom_preference(ctx, "/herp.slice/derp.scope") == -EINVAL); + + /* Assert that avoid/omit are not set if the cgroup and prefix are not + * owned by the same user. */ + if (test_xattrs && !empty_or_root(cgroup)) { + ctx = oomd_cgroup_context_free(ctx); + assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 61183, 0) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE); + + assert_se(oomd_fetch_cgroup_oom_preference(ctx, ctx->path) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID); + } +} + +int main(void) { + int r; + + test_setup_logging(LOG_DEBUG); + + test_oomd_update_cgroup_contexts_between_hashmaps(); + test_oomd_system_context_acquire(); + test_oomd_pressure_above(); + test_oomd_mem_and_swap_free_below(); + test_oomd_sort_cgroups(); + + /* The following tests operate on live cgroups */ + + r = enter_cgroup_root(NULL); + if (r < 0) + return log_tests_skipped_errno(r, "failed to enter a test cgroup scope"); + + test_oomd_cgroup_kill(); + test_oomd_cgroup_context_acquire_and_insert(); + test_oomd_fetch_cgroup_oom_preference(); + + return 0; +} diff --git a/src/partition/definitions/confext.repart.d/10-root.conf b/src/partition/definitions/confext.repart.d/10-root.conf new file mode 100644 index 0000000..f728ab6 --- /dev/null +++ b/src/partition/definitions/confext.repart.d/10-root.conf @@ -0,0 +1,16 @@ +# 
SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root +Format=erofs +CopyFiles=/etc/ +Verity=data +VerityMatchKey=root +Minimize=best diff --git a/src/partition/definitions/confext.repart.d/20-root-verity.conf b/src/partition/definitions/confext.repart.d/20-root-verity.conf new file mode 100644 index 0000000..8179351 --- /dev/null +++ b/src/partition/definitions/confext.repart.d/20-root-verity.conf @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root-verity +Verity=hash +VerityMatchKey=root +Minimize=best diff --git a/src/partition/definitions/confext.repart.d/30-root-verity-sig.conf b/src/partition/definitions/confext.repart.d/30-root-verity-sig.conf new file mode 100644 index 0000000..df16015 --- /dev/null +++ b/src/partition/definitions/confext.repart.d/30-root-verity-sig.conf @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[Partition] +Type=root-verity-sig +Verity=signature +VerityMatchKey=root diff --git a/src/partition/definitions/portable.repart.d/10-root.conf b/src/partition/definitions/portable.repart.d/10-root.conf new file mode 100644 index 0000000..6f500d0 --- /dev/null +++ b/src/partition/definitions/portable.repart.d/10-root.conf @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root +Format=erofs +CopyFiles=/ +Verity=data +VerityMatchKey=root +Minimize=best diff --git a/src/partition/definitions/portable.repart.d/20-root-verity.conf b/src/partition/definitions/portable.repart.d/20-root-verity.conf new file mode 100644 index 0000000..8179351 --- /dev/null +++ b/src/partition/definitions/portable.repart.d/20-root-verity.conf @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root-verity +Verity=hash +VerityMatchKey=root +Minimize=best diff --git a/src/partition/definitions/portable.repart.d/30-root-verity-sig.conf b/src/partition/definitions/portable.repart.d/30-root-verity-sig.conf new file mode 100644 index 0000000..df16015 --- /dev/null +++ b/src/partition/definitions/portable.repart.d/30-root-verity-sig.conf @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. 
+# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root-verity-sig +Verity=signature +VerityMatchKey=root diff --git a/src/partition/definitions/sysext.repart.d/10-root.conf b/src/partition/definitions/sysext.repart.d/10-root.conf new file mode 100644 index 0000000..b8ef985 --- /dev/null +++ b/src/partition/definitions/sysext.repart.d/10-root.conf @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root +Format=erofs +CopyFiles=/usr/ +CopyFiles=/opt/ +Verity=data +VerityMatchKey=root +Minimize=best diff --git a/src/partition/definitions/sysext.repart.d/20-root-verity.conf b/src/partition/definitions/sysext.repart.d/20-root-verity.conf new file mode 100644 index 0000000..8179351 --- /dev/null +++ b/src/partition/definitions/sysext.repart.d/20-root-verity.conf @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[Partition] +Type=root-verity +Verity=hash +VerityMatchKey=root +Minimize=best diff --git a/src/partition/definitions/sysext.repart.d/30-root-verity-sig.conf b/src/partition/definitions/sysext.repart.d/30-root-verity-sig.conf new file mode 100644 index 0000000..df16015 --- /dev/null +++ b/src/partition/definitions/sysext.repart.d/30-root-verity-sig.conf @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Partition] +Type=root-verity-sig +Verity=signature +VerityMatchKey=root diff --git a/src/partition/growfs.c b/src/partition/growfs.c new file mode 100644 index 0000000..62f3ee6 --- /dev/null +++ b/src/partition/growfs.c @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +/* This needs to be included after sys/mount.h, as since [0] linux/btrfs.h + * includes linux/fs.h causing build errors + * See: https://github.com/systemd/systemd/issues/8507 + * [0] https://github.com/torvalds/linux/commit/a28135303a669917002f569aecebd5758263e4aa + */ +#include + +#include "sd-device.h" + +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "build.h" +#include "cryptsetup-util.h" +#include "device-nodes.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dissect-image.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "log.h" +#include "main-func.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "resize-fs.h" + +static const char *arg_target = NULL; +static bool arg_dry_run = false; + +#if HAVE_LIBCRYPTSETUP +static int resize_crypt_luks_device(dev_t devno, const char *fstype, 
dev_t main_devno) { + _cleanup_free_ char *devpath = NULL, *main_devpath = NULL; + _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL; + _cleanup_close_ int main_devfd = -EBADF; + uint64_t size; + int r; + + r = dlopen_cryptsetup(); + if (r < 0) + return log_error_errno(r, "Cannot resize LUKS device: %m"); + + main_devfd = r = device_open_from_devnum(S_IFBLK, main_devno, O_RDONLY|O_CLOEXEC, &main_devpath); + if (r < 0) + return log_error_errno(r, "Failed to open main block device " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(main_devno)); + + if (ioctl(main_devfd, BLKGETSIZE64, &size) != 0) + return log_error_errno(errno, "Failed to query size of \"%s\" (before resize): %m", + main_devpath); + + log_debug("%s is %"PRIu64" bytes", main_devpath, size); + + r = devname_from_devnum(S_IFBLK, devno, &devpath); + if (r < 0) + return log_error_errno(r, "Failed to get devpath of " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(devno)); + + r = sym_crypt_init(&cd, devpath); + if (r < 0) + return log_error_errno(r, "crypt_init(\"%s\") failed: %m", devpath); + + cryptsetup_enable_logging(cd); + + r = sym_crypt_load(cd, CRYPT_LUKS, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to load LUKS metadata for %s: %m", devpath); + + if (arg_dry_run) + return 0; + + r = sym_crypt_resize(cd, main_devpath, 0); + if (r < 0) + return log_error_errno(r, "crypt_resize() of %s failed: %m", devpath); + + if (ioctl(main_devfd, BLKGETSIZE64, &size) != 0) + log_warning_errno(errno, "Failed to query size of \"%s\" (after resize): %m", + main_devpath); /* the ioctl ran on main_devfd, so name that device */ + else + log_debug("%s is now %"PRIu64" bytes", main_devpath, size); + + return 1; +} +#endif + +static int maybe_resize_underlying_device( + int mountfd, + const char *mountpath, + dev_t main_devno) { + + _cleanup_free_ char *devpath = NULL, *fstype = NULL; + dev_t devno; + int r; + + assert(mountfd >= 0); + assert(mountpath); + +#if HAVE_LIBCRYPTSETUP + cryptsetup_enable_logging(NULL); +#endif + + r = get_block_device_harder_fd(mountfd, 
&devno); + if (r < 0) + return log_error_errno(r, "Failed to determine underlying block device of \"%s\": %m", + mountpath); + if (devno == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system \"%s\" not backed by block device.", mountpath); + + log_debug("Underlying device " DEVNUM_FORMAT_STR ", main dev " DEVNUM_FORMAT_STR ", %s", + DEVNUM_FORMAT_VAL(devno), + DEVNUM_FORMAT_VAL(main_devno), + devno == main_devno ? "same" : "different"); + if (devno == main_devno) + return 0; + + r = devname_from_devnum(S_IFBLK, devno, &devpath); + if (r < 0) + return log_error_errno(r, "Failed to get devpath for block device " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(devno)); + + r = probe_filesystem(devpath, &fstype); + if (r == -EUCLEAN) + return log_warning_errno(r, "Cannot reliably probe \"%s\", refusing to proceed.", devpath); + if (r < 0) + return log_warning_errno(r, "Failed to probe \"%s\": %m", devpath); + +#if HAVE_LIBCRYPTSETUP + if (streq_ptr(fstype, "crypto_LUKS")) + return resize_crypt_luks_device(devno, fstype, main_devno); +#endif + + log_debug("Don't know how to resize %s of type %s, ignoring.", devpath, strnull(fstype)); + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-growfs@.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
/path/to/mountpoint\n\n" + "Grow filesystem or encrypted payload to device size.\n\n" + "Options:\n" + " -h --help Show this help and exit\n" + " --version Print version string and exit\n" + " -n --dry-run Just print what would be done\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + }; + + int c; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version" , no_argument, NULL, ARG_VERSION }, + { "dry-run", no_argument, NULL, 'n' }, + {} + }; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hn", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'n': + arg_dry_run = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind + 1 != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s expects exactly one argument (the mount point).", + program_invocation_short_name); + + arg_target = argv[optind]; + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_close_ int mountfd = -EBADF, devfd = -EBADF; + _cleanup_free_ char *devpath = NULL; + uint64_t size, newsize; + dev_t devno; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = path_is_mount_point(arg_target, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to check if \"%s\" is a mount point: %m", arg_target); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "\"%s\" is not a mount point: %m", arg_target); + + mountfd = open(arg_target, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (mountfd < 0) + return log_error_errno(errno, "Failed to open \"%s\": %m", arg_target); + + r = get_block_device_fd(mountfd, &devno); + if (r == -EUCLEAN) + return btrfs_log_dev_root(LOG_ERR, r, arg_target); + if (r < 0) + return log_error_errno(r, "Failed to 
determine block device of \"%s\": %m", arg_target); + if (devno == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system \"%s\" not backed by block device.", arg_target); + + r = maybe_resize_underlying_device(mountfd, arg_target, devno); + if (r < 0) + log_warning_errno(r, "Unable to resize underlying device of \"%s\", proceeding anyway: %m", arg_target); + + devfd = r = device_open_from_devnum(S_IFBLK, devno, O_RDONLY|O_CLOEXEC, &devpath); + if (r < 0) + return log_error_errno(r, "Failed to open block device " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(devno)); + + if (ioctl(devfd, BLKGETSIZE64, &size) != 0) + return log_error_errno(errno, "Failed to query size of \"%s\": %m", devpath); + + log_debug("Resizing \"%s\" to %"PRIu64" bytes...", arg_target, size); + + if (arg_dry_run) + return 0; + + r = resize_fs(mountfd, size, &newsize); + if (r < 0) + return log_error_errno(r, "Failed to resize \"%s\" to %"PRIu64" bytes: %m", + arg_target, size); + if (newsize == size) + log_info("Successfully resized \"%s\" to %s bytes.", + arg_target, + FORMAT_BYTES(newsize)); + else + log_info("Successfully resized \"%s\" to %s bytes (%"PRIu64" bytes lost due to blocksize).", + arg_target, + FORMAT_BYTES(newsize), + size - newsize); + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/partition/makefs.c b/src/partition/makefs.c new file mode 100644 index 0000000..53439a4 --- /dev/null +++ b/src/partition/makefs.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "main-func.h" +#include "mkfs-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "string-util.h" + +static int run(int argc, char *argv[]) { + _cleanup_free_ char *device = NULL, *fstype = NULL, *detected = NULL, *label = NULL; + _cleanup_close_ int lock_fd = 
-EBADF; + sd_id128_t uuid; + struct stat st; + int r; + + log_setup(); + + if (argc != 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program expects two arguments."); + + /* type and device must be copied because makefs calls safe_fork, which clears argv[] */ + fstype = strdup(argv[1]); + if (!fstype) + return log_oom(); + + device = strdup(argv[2]); + if (!device) + return log_oom(); + + if (stat(device, &st) < 0) + return log_error_errno(errno, "Failed to stat \"%s\": %m", device); + + if (S_ISBLK(st.st_mode)) { + /* Lock the device so that udev doesn't interfere with our work */ + + lock_fd = lock_whole_block_device(st.st_rdev, LOCK_EX); + if (lock_fd < 0) + return log_error_errno(lock_fd, "Failed to lock whole block device of \"%s\": %m", device); + } else + log_debug("%s is not a block device, no need to lock.", device); + + r = probe_filesystem(device, &detected); + if (r == -EUCLEAN) + return log_error_errno(r, "Ambiguous results of probing for file system on \"%s\", refusing to proceed.", device); + if (r < 0) + return log_error_errno(r, "Failed to probe \"%s\": %m", device); + if (detected) { + log_info("'%s' is not empty (contains file system of type %s), exiting.", device, detected); + return 0; + } + + r = sd_id128_randomize(&uuid); + if (r < 0) + return log_error_errno(r, "Failed to generate UUID for file system: %m"); + + r = path_extract_filename(device, &label); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", device); + + return make_filesystem(device, + fstype, + label, + /* root = */ NULL, + uuid, + /* discard = */ true, + /* quiet = */ true, + /* sector_size = */ 0, + /* extra_mkfs_options = */ NULL); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/partition/meson.build b/src/partition/meson.build new file mode 100644 index 0000000..78cde2f --- /dev/null +++ b/src/partition/meson.build @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 
'name' : 'systemd-growfs', + 'sources' : files('growfs.c'), + }, + libexec_template + { + 'name' : 'systemd-makefs', + 'sources' : files('makefs.c'), + }, + executable_template + { + 'name' : 'systemd-repart', + 'public' : true, + 'conditions' : ['ENABLE_REPART'], + 'sources' : files('repart.c'), + 'link_with' : [ + libshared, + libshared_fdisk, + ], + 'dependencies' : [ + libblkid, + libfdisk, + libopenssl, + threads, + ], + }, + executable_template + { + 'name' : 'systemd-repart.standalone', + 'public' : have_standalone_binaries, + 'conditions' : ['ENABLE_REPART'], + 'sources' : files('repart.c'), + 'c_args' : '-DSTANDALONE', + 'link_with' : [ + libbasic, + libbasic_gcrypt, + libshared_fdisk, + libshared_static, + libsystemd_static, + ], + 'dependencies' : [ + libblkid, + libfdisk, + libopenssl, + threads, + ], + 'build_by_default' : have_standalone_binaries, + 'install' : have_standalone_binaries, + }, +] + +if conf.get('ENABLE_REPART') == 1 + install_data('definitions/confext.repart.d/10-root.conf', install_dir : repartdefinitionsdir / 'confext.repart.d') + install_data('definitions/confext.repart.d/20-root-verity.conf', install_dir : repartdefinitionsdir / 'confext.repart.d') + install_data('definitions/confext.repart.d/30-root-verity-sig.conf', install_dir : repartdefinitionsdir / 'confext.repart.d') + install_data('definitions/portable.repart.d/10-root.conf', install_dir : repartdefinitionsdir / 'portable.repart.d') + install_data('definitions/portable.repart.d/20-root-verity.conf', install_dir : repartdefinitionsdir / 'portable.repart.d') + install_data('definitions/portable.repart.d/30-root-verity-sig.conf', install_dir : repartdefinitionsdir / 'portable.repart.d') + install_data('definitions/sysext.repart.d/10-root.conf', install_dir : repartdefinitionsdir / 'sysext.repart.d') + install_data('definitions/sysext.repart.d/20-root-verity.conf', install_dir : repartdefinitionsdir / 'sysext.repart.d') + 
install_data('definitions/sysext.repart.d/30-root-verity-sig.conf', install_dir : repartdefinitionsdir / 'sysext.repart.d') +endif diff --git a/src/partition/repart.c b/src/partition/repart.c new file mode 100644 index 0000000..5487aaf --- /dev/null +++ b/src/partition/repart.c @@ -0,0 +1,7753 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_VALGRIND_MEMCHECK_H +#include <valgrind/memcheck.h> +#endif + +#include <fcntl.h> /* NOTE(review): the angle-bracket header names of the system includes were missing (stripped during extraction) and have been restored; confirm against the upstream file */ +#include <getopt.h> +#include <linux/fs.h> +#include <linux/loop.h> +#include <sys/file.h> +#include <sys/ioctl.h> +#include <sys/stat.h> + +#include "sd-device.h" +#include "sd-id128.h" + +#include "alloc-util.h" +#include "blkid-util.h" +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "build.h" +#include "chase.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "constants.h" +#include "cryptsetup-util.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "efivars.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fdisk-util.h" +#include "fileio.h" +#include "format-table.h" +#include "format-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "gpt.h" +#include "hexdecoct.h" +#include "hmac.h" +#include "id128-util.h" +#include "initrd-util.h" +#include "io-util.h" +#include "json.h" +#include "list.h" +#include "loop-util.h" +#include "main-func.h" +#include "mkdir.h" +#include "mkfs-util.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "openssl-util.h" +#include "parse-argument.h" +#include "parse-helpers.h" +#include "pretty-print.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "random-util.h" +#include "resize-fs.h" +#include "rm-rf.h" +#include "sort-util.h" +#include "specifier.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "sync-util.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "tpm2-pcr.h" +#include "tpm2-util.h" +#include "user-util.h" +#include "utf8.h" + +/* If not configured otherwise use a 
minimal partition size of 10M */ +#define DEFAULT_MIN_SIZE (10ULL*1024ULL*1024ULL) + +/* Hard lower limit for new partition sizes */ +#define HARD_MIN_SIZE 4096ULL + +/* We know up front we're never going to put more than this in a verity sig partition. */ +#define VERITY_SIG_SIZE (HARD_MIN_SIZE*4ULL) + +/* libfdisk takes off slightly more than 1M of the disk size when creating a GPT disk label */ +#define GPT_METADATA_SIZE (1044ULL*1024ULL) + +/* LUKS2 takes off 16M of the partition size with its metadata by default */ +#define LUKS2_METADATA_SIZE (16ULL*1024ULL*1024ULL) + +/* To do LUKS2 offline encryption, we need to keep some extra free space at the end of the partition. */ +#define LUKS2_METADATA_KEEP_FREE (LUKS2_METADATA_SIZE*2ULL) + +/* LUKS2 volume key size. */ +#define VOLUME_KEY_SIZE (512ULL/8ULL) + +/* Use 4K as the default filesystem sector size because as long as the partitions are aligned to 4K, the + * filesystems will then also be compatible with sector sizes 512, 1024 and 2048. */ +#define DEFAULT_FILESYSTEM_SECTOR_SIZE 4096ULL + +#define APIVFS_TMP_DIRS_NULSTR "proc\0sys\0dev\0tmp\0run\0var/tmp\0" + +/* Note: When growing and placing new partitions we always align to 4K sector size. It's how newer hard disks + * are designed, and if everything is aligned to that performance is best. And for older hard disks with 512B + * sector size devices were generally assumed to have an even number of sectors, hence at the worst we'll + * waste 3K per partition, which is probably fine. 
*/ + +typedef enum EmptyMode { + EMPTY_UNSET, /* no choice has been made yet */ + EMPTY_REFUSE, /* refuse empty disks, never create a partition table */ + EMPTY_ALLOW, /* allow empty disks, create partition table if necessary */ + EMPTY_REQUIRE, /* require an empty disk, create a partition table */ + EMPTY_FORCE, /* make disk empty, erase everything, create a partition table always */ + EMPTY_CREATE, /* create disk as loopback file, create a partition table always */ + _EMPTY_MODE_MAX, + _EMPTY_MODE_INVALID = -EINVAL, +} EmptyMode; + +typedef enum FilterPartitionType { + FILTER_PARTITIONS_NONE, + FILTER_PARTITIONS_EXCLUDE, + FILTER_PARTITIONS_INCLUDE, + _FILTER_PARTITIONS_MAX, + _FILTER_PARTITIONS_INVALID = -EINVAL, +} FilterPartitionsType; + +static EmptyMode arg_empty = EMPTY_UNSET; +static bool arg_dry_run = true; +static const char *arg_node = NULL; +static char *arg_root = NULL; +static char *arg_image = NULL; +static char **arg_definitions = NULL; +static bool arg_discard = true; +static bool arg_can_factory_reset = false; +static int arg_factory_reset = -1; +static sd_id128_t arg_seed = SD_ID128_NULL; +static bool arg_randomize = false; +static int arg_pretty = -1; +static uint64_t arg_size = UINT64_MAX; +static bool arg_size_auto = false; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static void *arg_key = NULL; +static size_t arg_key_size = 0; +static EVP_PKEY *arg_private_key = NULL; +static X509 *arg_certificate = NULL; +static char *arg_tpm2_device = NULL; +static uint32_t arg_tpm2_seal_key_handle = 0; +static char *arg_tpm2_device_key = NULL; +static Tpm2PCRValue *arg_tpm2_hash_pcr_values = NULL; +static size_t arg_tpm2_n_hash_pcr_values = 0; +static char *arg_tpm2_public_key = NULL; +static uint32_t arg_tpm2_public_key_pcr_mask = 0; +static char *arg_tpm2_pcrlock = NULL; +static bool arg_split = false; +static GptPartitionType *arg_filter_partitions = NULL; 
+static size_t arg_n_filter_partitions = 0; +static FilterPartitionsType arg_filter_partitions_type = FILTER_PARTITIONS_NONE; +static GptPartitionType *arg_defer_partitions = NULL; +static size_t arg_n_defer_partitions = 0; +static uint64_t arg_sector_size = 0; +static ImagePolicy *arg_image_policy = NULL; +static Architecture arg_architecture = _ARCHITECTURE_INVALID; +static int arg_offline = -1; +static char **arg_copy_from = NULL; +static char *arg_copy_source = NULL; +static char *arg_make_ddi = NULL; + /* Cleanup registrations for the pointer-typed option values declared above. NOTE(review): presumably these run when the program exits; confirm in the macro's definition. */ +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_definitions, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_key, erase_and_freep); +STATIC_DESTRUCTOR_REGISTER(arg_private_key, EVP_PKEY_freep); +STATIC_DESTRUCTOR_REGISTER(arg_certificate, X509_freep); +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep); +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device_key, freep); +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_hash_pcr_values, freep); +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_public_key, freep); +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_pcrlock, freep); +STATIC_DESTRUCTOR_REGISTER(arg_filter_partitions, freep); +STATIC_DESTRUCTOR_REGISTER(arg_defer_partitions, freep); /* same GptPartitionType array type as arg_filter_partitions; it had no cleanup registration */ +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); +STATIC_DESTRUCTOR_REGISTER(arg_copy_from, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_copy_source, freep); +STATIC_DESTRUCTOR_REGISTER(arg_make_ddi, freep); + +typedef struct FreeArea FreeArea; + +typedef enum EncryptMode { + ENCRYPT_OFF, + ENCRYPT_KEY_FILE, + ENCRYPT_TPM2, + ENCRYPT_KEY_FILE_TPM2, + _ENCRYPT_MODE_MAX, + _ENCRYPT_MODE_INVALID = -EINVAL, +} EncryptMode; + +typedef enum VerityMode { + VERITY_OFF, + VERITY_DATA, + VERITY_HASH, + VERITY_SIG, + _VERITY_MODE_MAX, + _VERITY_MODE_INVALID = -EINVAL, +} VerityMode; + +typedef enum MinimizeMode { + MINIMIZE_OFF, + MINIMIZE_BEST, + MINIMIZE_GUESS, + _MINIMIZE_MODE_MAX, + _MINIMIZE_MODE_INVALID = -EINVAL, +} MinimizeMode; + +typedef struct Partition { + char *definition_path; + char 
**drop_in_files; + + GptPartitionType type; + sd_id128_t current_uuid, new_uuid; + bool new_uuid_is_set; + char *current_label, *new_label; + sd_id128_t fs_uuid, luks_uuid, verity_uuid; + uint8_t verity_salt[SHA256_DIGEST_SIZE]; + + bool dropped; + bool factory_reset; + int32_t priority; + + uint32_t weight, padding_weight; + + uint64_t current_size, new_size; + uint64_t size_min, size_max; + + uint64_t current_padding, new_padding; + uint64_t padding_min, padding_max; + + uint64_t partno; + uint64_t offset; + + struct fdisk_partition *current_partition; + struct fdisk_partition *new_partition; + FreeArea *padding_area; + FreeArea *allocated_to_area; + + char *copy_blocks_path; + bool copy_blocks_path_is_our_file; + bool copy_blocks_auto; + const char *copy_blocks_root; + int copy_blocks_fd; + uint64_t copy_blocks_offset; + uint64_t copy_blocks_size; + + char *format; + char **copy_files; + char **exclude_files_source; + char **exclude_files_target; + char **make_directories; + char **subvolumes; + EncryptMode encrypt; + VerityMode verity; + char *verity_match_key; + MinimizeMode minimize; + uint64_t verity_data_block_size; + uint64_t verity_hash_block_size; + + uint64_t gpt_flags; + int no_auto; + int read_only; + int growfs; + + struct iovec roothash; + + char *split_name_format; + char *split_path; + + struct Partition *siblings[_VERITY_MODE_MAX]; + + LIST_FIELDS(struct Partition, partitions); +} Partition; + +#define PARTITION_IS_FOREIGN(p) (!(p)->definition_path) +#define PARTITION_EXISTS(p) (!!(p)->current_partition) + +struct FreeArea { + Partition *after; + uint64_t size; + uint64_t allocated; +}; + +typedef struct Context { + LIST_HEAD(Partition, partitions); + size_t n_partitions; + + FreeArea **free_areas; + size_t n_free_areas; + + uint64_t start, end, total; + + struct fdisk_context *fdisk_context; + uint64_t sector_size, grain_size, fs_sector_size; + + sd_id128_t seed; + + char *node; + bool node_is_our_file; + int backing_fd; + + bool from_scratch; 
+} Context; + +static const char *empty_mode_table[_EMPTY_MODE_MAX] = { + [EMPTY_UNSET] = "unset", + [EMPTY_REFUSE] = "refuse", + [EMPTY_ALLOW] = "allow", + [EMPTY_REQUIRE] = "require", + [EMPTY_FORCE] = "force", + [EMPTY_CREATE] = "create", +}; + +static const char *encrypt_mode_table[_ENCRYPT_MODE_MAX] = { + [ENCRYPT_OFF] = "off", + [ENCRYPT_KEY_FILE] = "key-file", + [ENCRYPT_TPM2] = "tpm2", + [ENCRYPT_KEY_FILE_TPM2] = "key-file+tpm2", +}; + +static const char *verity_mode_table[_VERITY_MODE_MAX] = { + [VERITY_OFF] = "off", + [VERITY_DATA] = "data", + [VERITY_HASH] = "hash", + [VERITY_SIG] = "signature", +}; + +static const char *minimize_mode_table[_MINIMIZE_MODE_MAX] = { + [MINIMIZE_OFF] = "off", + [MINIMIZE_BEST] = "best", + [MINIMIZE_GUESS] = "guess", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(empty_mode, EmptyMode); +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(encrypt_mode, EncryptMode, ENCRYPT_KEY_FILE); +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(verity_mode, VerityMode); +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_BOOLEAN(minimize_mode, MinimizeMode, MINIMIZE_BEST); + +static uint64_t round_down_size(uint64_t v, uint64_t p) { + return (v / p) * p; +} + +static uint64_t round_up_size(uint64_t v, uint64_t p) { + + v = DIV_ROUND_UP(v, p); + + if (v > UINT64_MAX / p) + return UINT64_MAX; /* overflow */ + + return v * p; +} + +static Partition *partition_new(void) { + Partition *p; + + p = new(Partition, 1); + if (!p) + return NULL; + + *p = (Partition) { + .weight = 1000, + .padding_weight = 0, + .current_size = UINT64_MAX, + .new_size = UINT64_MAX, + .size_min = UINT64_MAX, + .size_max = UINT64_MAX, + .current_padding = UINT64_MAX, + .new_padding = UINT64_MAX, + .padding_min = UINT64_MAX, + .padding_max = UINT64_MAX, + .partno = UINT64_MAX, + .offset = UINT64_MAX, + .copy_blocks_fd = -EBADF, + .copy_blocks_offset = UINT64_MAX, + .copy_blocks_size = UINT64_MAX, + .no_auto = -1, + .read_only = -1, + .growfs = -1, + .verity_data_block_size = 
UINT64_MAX, + .verity_hash_block_size = UINT64_MAX, + }; + + return p; +} + +static Partition* partition_free(Partition *p) { + if (!p) + return NULL; + + free(p->current_label); + free(p->new_label); + free(p->definition_path); + strv_free(p->drop_in_files); + + if (p->current_partition) + fdisk_unref_partition(p->current_partition); + if (p->new_partition) + fdisk_unref_partition(p->new_partition); + + if (p->copy_blocks_path_is_our_file) + unlink_and_free(p->copy_blocks_path); + else + free(p->copy_blocks_path); + safe_close(p->copy_blocks_fd); + + free(p->format); + strv_free(p->copy_files); + strv_free(p->exclude_files_source); + strv_free(p->exclude_files_target); + strv_free(p->make_directories); + strv_free(p->subvolumes); + free(p->verity_match_key); + + iovec_done(&p->roothash); + + free(p->split_name_format); + unlink_and_free(p->split_path); + + return mfree(p); +} + +static void partition_foreignize(Partition *p) { + assert(p); + assert(PARTITION_EXISTS(p)); + + /* Reset several parameters set through definition file to make the partition foreign. 
*/ + + p->definition_path = mfree(p->definition_path); + p->drop_in_files = strv_free(p->drop_in_files); + + p->copy_blocks_path = mfree(p->copy_blocks_path); + p->copy_blocks_fd = safe_close(p->copy_blocks_fd); + p->copy_blocks_root = NULL; + + p->format = mfree(p->format); + p->copy_files = strv_free(p->copy_files); + p->exclude_files_source = strv_free(p->exclude_files_source); + p->exclude_files_target = strv_free(p->exclude_files_target); + p->make_directories = strv_free(p->make_directories); + p->subvolumes = strv_free(p->subvolumes); + p->verity_match_key = mfree(p->verity_match_key); + + p->priority = 0; + p->weight = 1000; + p->padding_weight = 0; + p->size_min = UINT64_MAX; + p->size_max = UINT64_MAX; + p->padding_min = UINT64_MAX; + p->padding_max = UINT64_MAX; + p->no_auto = -1; + p->read_only = -1; + p->growfs = -1; + p->verity = VERITY_OFF; +} + +static bool partition_type_exclude(const GptPartitionType *type) { + if (arg_filter_partitions_type == FILTER_PARTITIONS_NONE) + return false; + + for (size_t i = 0; i < arg_n_filter_partitions; i++) + if (sd_id128_equal(type->uuid, arg_filter_partitions[i].uuid)) + return arg_filter_partitions_type == FILTER_PARTITIONS_EXCLUDE; + + return arg_filter_partitions_type == FILTER_PARTITIONS_INCLUDE; +} + +static bool partition_type_defer(const GptPartitionType *type) { + for (size_t i = 0; i < arg_n_defer_partitions; i++) + if (sd_id128_equal(type->uuid, arg_defer_partitions[i].uuid)) + return true; + + return false; +} + +static Partition* partition_unlink_and_free(Context *context, Partition *p) { + if (!p) + return NULL; + + LIST_REMOVE(partitions, context->partitions, p); + + assert(context->n_partitions > 0); + context->n_partitions--; + + return partition_free(p); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Partition*, partition_free); + +static Context *context_new(sd_id128_t seed) { + Context *context; + + context = new(Context, 1); + if (!context) + return NULL; + + *context = (Context) { + .start = UINT64_MAX, + 
.end = UINT64_MAX, + .total = UINT64_MAX, + .seed = seed, + }; + + return context; +} + +static void context_free_free_areas(Context *context) { + assert(context); + + for (size_t i = 0; i < context->n_free_areas; i++) + free(context->free_areas[i]); + + context->free_areas = mfree(context->free_areas); + context->n_free_areas = 0; +} + +static Context *context_free(Context *context) { + if (!context) + return NULL; + + while (context->partitions) + partition_unlink_and_free(context, context->partitions); + assert(context->n_partitions == 0); + + context_free_free_areas(context); + + if (context->fdisk_context) + fdisk_unref_context(context->fdisk_context); + + safe_close(context->backing_fd); + if (context->node_is_our_file) + unlink_and_free(context->node); + else + free(context->node); + + return mfree(context); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Context*, context_free); + +static int context_add_free_area( + Context *context, + uint64_t size, + Partition *after) { + + FreeArea *a; + + assert(context); + assert(!after || !after->padding_area); + + if (!GREEDY_REALLOC(context->free_areas, context->n_free_areas + 1)) + return -ENOMEM; + + a = new(FreeArea, 1); + if (!a) + return -ENOMEM; + + *a = (FreeArea) { + .size = size, + .after = after, + }; + + context->free_areas[context->n_free_areas++] = a; + + if (after) + after->padding_area = a; + + return 0; +} + +static void partition_drop_or_foreignize(Partition *p) { + if (!p || p->dropped || PARTITION_IS_FOREIGN(p)) + return; + + if (PARTITION_EXISTS(p)) { + log_info("Can't grow existing partition %s of priority %" PRIi32 ", ignoring.", + strna(p->current_label ?: p->new_label), p->priority); + + /* Handle the partition as foreign. Do not set dropped flag. 
*/ + partition_foreignize(p); + } else { + log_info("Can't fit partition %s of priority %" PRIi32 ", dropping.", + p->definition_path, p->priority); + + p->dropped = true; + p->allocated_to_area = NULL; + } +} + +/* Determines the highest priority among the partitions that are not dropped yet and, if it is positive, hands every partition whose priority is not below that value (plus its verity siblings) to partition_drop_or_foreignize(). Returns true if it did so, false if the highest priority is zero or negative. */ static bool context_drop_or_foreignize_one_priority(Context *context) { + int32_t priority = 0; + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->dropped) + continue; + + priority = MAX(priority, p->priority); + } + + /* Refuse to drop partitions with 0 or negative priorities. NOTE(review): the old wording also excluded priorities with + * an existing partition, but no such check is visible here; partition_drop_or_foreignize() foreignizes those instead. */ + if (priority <= 0) + return false; + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->priority < priority) + continue; + + partition_drop_or_foreignize(p); + + /* We ensure that all verity sibling partitions have the same priority, so it's safe + * to drop all siblings here as well. */ + + for (VerityMode mode = VERITY_OFF + 1; mode < _VERITY_MODE_MAX; mode++) + partition_drop_or_foreignize(p->siblings[mode]); + } + + return true; +} + +static uint64_t partition_min_size(const Context *context, const Partition *p) { + uint64_t sz; + + assert(context); + assert(p); + + /* Calculate the disk space we really need at minimum for this partition. If the partition already + * exists the current size is what we really need. If it doesn't exist yet refuse to allocate less + * than 4K. + * + * DEFAULT_MIN_SIZE is the default SizeMin= we configure if nothing else is specified. */ + + if (PARTITION_IS_FOREIGN(p)) { + /* Don't allow changing size of partitions not managed by us */ + assert(p->current_size != UINT64_MAX); + return p->current_size; + } + + if (p->verity == VERITY_SIG) + return VERITY_SIG_SIZE; + + sz = p->current_size != UINT64_MAX ? 
p->current_size : HARD_MIN_SIZE; + + if (!PARTITION_EXISTS(p)) { + uint64_t d = 0; + + if (p->encrypt != ENCRYPT_OFF) + d += round_up_size(LUKS2_METADATA_KEEP_FREE, context->grain_size); + + if (p->copy_blocks_size != UINT64_MAX) + d += round_up_size(p->copy_blocks_size, context->grain_size); + else if (p->format || p->encrypt != ENCRYPT_OFF) { + uint64_t f; + + /* If we shall synthesize a file system, take minimal fs size into account (assumed to be 4K if not known) */ + f = p->format ? round_up_size(minimal_size_by_fs_name(p->format), context->grain_size) : UINT64_MAX; + d += f == UINT64_MAX ? context->grain_size : f; + } + + if (d > sz) + sz = d; + } + + return MAX(round_up_size(p->size_min != UINT64_MAX ? p->size_min : DEFAULT_MIN_SIZE, context->grain_size), sz); +} + +static uint64_t partition_max_size(const Context *context, const Partition *p) { + uint64_t sm; + + /* Calculate how large the partition may become at max. This is generally the configured maximum + * size, except when it already exists and is larger than that. In that case it's the existing size, + * since we never want to shrink partitions. */ + + assert(context); + assert(p); + + if (PARTITION_IS_FOREIGN(p)) { + /* Don't allow changing size of partitions not managed by us */ + assert(p->current_size != UINT64_MAX); + return p->current_size; + } + + if (p->verity == VERITY_SIG) + return VERITY_SIG_SIZE; + + if (p->size_max == UINT64_MAX) + return UINT64_MAX; + + sm = round_down_size(p->size_max, context->grain_size); + + if (p->current_size != UINT64_MAX) + sm = MAX(p->current_size, sm); + + return MAX(partition_min_size(context, p), sm); +} + +static uint64_t partition_min_padding(const Partition *p) { + assert(p); + return p->padding_min != UINT64_MAX ? 
p->padding_min : 0; +} + +static uint64_t partition_max_padding(const Partition *p) { + assert(p); + return p->padding_max; +} + +static uint64_t partition_min_size_with_padding(Context *context, const Partition *p) { + uint64_t sz; + + /* Calculate the disk space we need for this partition plus any free space coming after it. This + * takes user configured padding into account as well as any additional whitespace needed to align + * the next partition to 4K again. */ + + assert(context); + assert(p); + + sz = partition_min_size(context, p) + partition_min_padding(p); + + if (PARTITION_EXISTS(p)) { + /* If the partition wasn't aligned, add extra space so that any we might add will be aligned */ + assert(p->offset != UINT64_MAX); + return round_up_size(p->offset + sz, context->grain_size) - p->offset; + } + + /* If this is a new partition we'll place it aligned, hence we just need to round up the required size here */ + return round_up_size(sz, context->grain_size); +} + +static uint64_t free_area_available(const FreeArea *a) { + assert(a); + + /* Determines how much of this free area is not allocated yet */ + + assert(a->size >= a->allocated); + return a->size - a->allocated; +} + +static uint64_t free_area_current_end(Context *context, const FreeArea *a) { + assert(context); + assert(a); + + if (!a->after) + return free_area_available(a); + + assert(a->after->offset != UINT64_MAX); + assert(a->after->current_size != UINT64_MAX); + + /* Calculate where the free area ends, based on the offset of the partition preceding it. */ + return round_up_size(a->after->offset + a->after->current_size, context->grain_size) + free_area_available(a); +} + +static uint64_t free_area_min_end(Context *context, const FreeArea *a) { + assert(context); + assert(a); + + if (!a->after) + return 0; + + assert(a->after->offset != UINT64_MAX); + assert(a->after->current_size != UINT64_MAX); + + /* Calculate where the partition would end when we give it as much as it needs. 
*/ + return round_up_size(a->after->offset + partition_min_size_with_padding(context, a->after), context->grain_size); +} + +/* Returns the part of free area 'a' that is usable for new partitions: free_area_current_end() less free_area_min_end(), combined with LESS_BY(). */ static uint64_t free_area_available_for_new_partitions(Context *context, const FreeArea *a) { + assert(context); + assert(a); + + /* Similar to free_area_available(), but takes into account that the required size and padding of the + * preceding partition is honoured. */ + + return LESS_BY(free_area_current_end(context, a), free_area_min_end(context, a)); +} + +/* Comparison callback passed to typesafe_qsort_r() by context_allocate_partitions(): compares two free areas by free_area_available_for_new_partitions(). */ static int free_area_compare(FreeArea *const *a, FreeArea *const*b, Context *context) { + assert(context); + + return CMP(free_area_available_for_new_partitions(context, *a), + free_area_available_for_new_partitions(context, *b)); +} + +/* Returns what is left of 'total' after charging 'amount', rounded up to the grain size, against it. */ static uint64_t charge_size(Context *context, uint64_t total, uint64_t amount) { + assert(context); + /* Subtract the specified amount from total, rounding up to a multiple of context->grain_size if there's room (NOTE(review): LESS_BY() presumably yields 0 when there is not; confirm) */ + assert(amount <= total); + return LESS_BY(total, round_up_size(amount, context->grain_size)); +} + +/* Returns total minus amount; callers must pass amount <= total, which is asserted. */ static uint64_t charge_weight(uint64_t total, uint64_t amount) { + assert(amount <= total); + return total - amount; +} + +/* Assigns each partition that is neither dropped nor already existing to a free area large enough for its minimum size including padding. Returns true if all of them fit; returns false if one does not, or if an existing partition does not fit the area it precedes. If ret_largest_free_area is non-NULL it receives the space available in the largest free area. */ static bool context_allocate_partitions(Context *context, uint64_t *ret_largest_free_area) { + assert(context); + + /* This may be called multiple times. Reset previous assignments. */ + for (size_t i = 0; i < context->n_free_areas; i++) + context->free_areas[i]->allocated = 0; + + /* Sort free areas by size, putting smallest first */ + typesafe_qsort_r(context->free_areas, context->n_free_areas, free_area_compare, context); + + /* In any case return size of the largest free area (i.e. not the size of all free areas + * combined!) */ + if (ret_largest_free_area) + *ret_largest_free_area = + context->n_free_areas == 0 ? 0 : + free_area_available_for_new_partitions(context, context->free_areas[context->n_free_areas-1]); + + /* Check that each existing partition can fit its area. 
*/ + for (size_t i = 0; i < context->n_free_areas; i++) + if (free_area_current_end(context, context->free_areas[i]) < + free_area_min_end(context, context->free_areas[i])) + return false; + + /* A simple first-fit algorithm. We return true if we can fit the partitions in, otherwise false. */ + LIST_FOREACH(partitions, p, context->partitions) { + bool fits = false; + uint64_t required; + FreeArea *a = NULL; + + /* Skip partitions we already dropped or that already exist */ + if (p->dropped || PARTITION_EXISTS(p)) + continue; + + /* How much do we need to fit? */ + required = partition_min_size_with_padding(context, p); + assert(required % context->grain_size == 0); + + for (size_t i = 0; i < context->n_free_areas; i++) { + a = context->free_areas[i]; + + if (free_area_available_for_new_partitions(context, a) >= required) { + fits = true; + break; + } + } + + if (!fits) + return false; /* 😢 Oh no! We can't fit this partition into any free area! */ + + /* Assign the partition to this free area */ + p->allocated_to_area = a; + + /* Budget the minimal partition size */ + a->allocated += required; + } + + return true; +} + +static int context_sum_weights(Context *context, FreeArea *a, uint64_t *ret) { + uint64_t weight_sum = 0; + + assert(context); + assert(a); + assert(ret); + + /* Determine the sum of the weights of all partitions placed in or before the specified free area */ + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->padding_area != a && p->allocated_to_area != a) + continue; + + if (p->weight > UINT64_MAX - weight_sum) + goto overflow_sum; + weight_sum += p->weight; + + if (p->padding_weight > UINT64_MAX - weight_sum) + goto overflow_sum; + weight_sum += p->padding_weight; + } + + *ret = weight_sum; + return 0; + +overflow_sum: + return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "Combined weight of partition exceeds unsigned 64-bit range, refusing."); +} + +static uint64_t scale_by_weight(uint64_t value, uint64_t weight, uint64_t weight_sum) { 
+ assert(weight_sum >= weight); + + for (;;) { + if (weight == 0) + return 0; + if (weight == weight_sum) + return value; + /* The product value * weight fits into 64 bits here, so it can be computed directly */ if (value <= UINT64_MAX / weight) + return value * weight / weight_sum; + + /* Rescale weight and weight_sum so that the calculation does not overflow. To satisfy the + * following conditions, 'weight_sum' is rounded up but 'weight' is rounded down: + * - the sum of scale_by_weight() for all weights must not be larger than the input value, + * - scale_by_weight() must not be larger than the ideal value (i.e. calculated with uint128_t). */ + weight_sum = DIV_ROUND_UP(weight_sum, 2); + weight /= 2; + } +} + +/* The phases context_grow_partitions_on_free_area() steps through, in this order, when sizing the partitions of one free area. */ typedef enum GrowPartitionPhase { + /* The zeroth phase: do not touch foreign partitions (i.e. those we don't manage). */ + PHASE_FOREIGN, + + /* The first phase: we charge partitions which need more (according to constraints) than their weight-based share. */ + PHASE_OVERCHARGE, + + /* The second phase: we charge partitions which need less (according to constraints) than their weight-based share. */ + PHASE_UNDERCHARGE, + + /* The third phase: we distribute what remains among the remaining partitions, according to the weights */ + PHASE_DISTRIBUTE, + + _GROW_PARTITION_PHASE_MAX, +} GrowPartitionPhase; + +/* Runs a single phase of size and padding assignment for the partitions associated with free area 'a', charging what it assigns against *span and *weight_sum. Returns true if the phase is finished (the caller then moves on to the next phase) and false if the same phase has to be run again. */ static bool context_grow_partitions_phase( + Context *context, + FreeArea *a, + GrowPartitionPhase phase, + uint64_t *span, + uint64_t *weight_sum) { + + bool try_again = false; + + assert(context); + assert(a); + assert(span); + assert(weight_sum); + + /* Now let's look at the intended weights and adjust them taking the minimum space assignments into + * account. i.e. if a partition has a small weight but a high minimum space value set it should not + * get any additional room from the left-overs. Similar, if two partitions have the same weight they + * should get the same space if possible, even if one has a smaller minimum size than the other. 
*/ + LIST_FOREACH(partitions, p, context->partitions) { + + /* Look only at partitions associated with this free area, i.e. immediately + * preceding it, or allocated into it */ + if (p->allocated_to_area != a && p->padding_area != a) + continue; + + /* Only handle partitions whose new size is still unassigned (UINT64_MAX) */ if (p->new_size == UINT64_MAX) { + uint64_t share, rsz, xsz; + bool charge = false; + + /* Calculate how much space this partition needs if everyone would get + * the weight based share */ + share = scale_by_weight(*span, p->weight, *weight_sum); + /* rsz is the minimum and xsz the maximum size this partition may be assigned */ + rsz = partition_min_size(context, p); + xsz = partition_max_size(context, p); + + if (phase == PHASE_FOREIGN && PARTITION_IS_FOREIGN(p)) { + /* Never change the size of foreign partitions (i.e. those we don't manage) */ + + p->new_size = p->current_size; + charge = true; + + } else if (phase == PHASE_OVERCHARGE && rsz > share) { + /* This partition needs more than its calculated share. Let's assign + * it that, and take this partition out of all calculations and start + * again. */ + + p->new_size = rsz; + charge = try_again = true; + + } else if (phase == PHASE_UNDERCHARGE && xsz < share) { + /* This partition accepts less than its calculated + * share. Let's assign it that, and take this partition out + * of all calculations and start again. */ + + p->new_size = xsz; + charge = try_again = true; + + } else if (phase == PHASE_DISTRIBUTE) { + /* This partition can accept its calculated share. Let's + * assign it. There's no need to restart things here since + * assigning this shouldn't impact the shares of the other + * partitions. 
*/ + + assert(share >= rsz); + p->new_size = CLAMP(round_down_size(share, context->grain_size), rsz, xsz); + charge = true; + } + + if (charge) { + *span = charge_size(context, *span, p->new_size); + *weight_sum = charge_weight(*weight_sum, p->weight); + } + } + + if (p->new_padding == UINT64_MAX) { + uint64_t share, rsz, xsz; + bool charge = false; + + share = scale_by_weight(*span, p->padding_weight, *weight_sum); + + rsz = partition_min_padding(p); + xsz = partition_max_padding(p); + + if (phase == PHASE_OVERCHARGE && rsz > share) { + p->new_padding = rsz; + charge = try_again = true; + } else if (phase == PHASE_UNDERCHARGE && xsz < share) { + p->new_padding = xsz; + charge = try_again = true; + } else if (phase == PHASE_DISTRIBUTE) { + assert(share >= rsz); + p->new_padding = CLAMP(round_down_size(share, context->grain_size), rsz, xsz); + charge = true; + } + + if (charge) { + *span = charge_size(context, *span, p->new_padding); + *weight_sum = charge_weight(*weight_sum, p->padding_weight); + } + } + } + + return !try_again; +} + +static void context_grow_partition_one(Context *context, FreeArea *a, Partition *p, uint64_t *span) { + uint64_t m; + + assert(context); + assert(a); + assert(p); + assert(span); + + if (*span == 0) + return; + + if (p->allocated_to_area != a) + return; + + if (PARTITION_IS_FOREIGN(p)) + return; + + assert(p->new_size != UINT64_MAX); + + /* Calculate new size and align. */ + m = round_down_size(p->new_size + *span, context->grain_size); + /* But ensure this doesn't shrink the size. */ + m = MAX(m, p->new_size); + /* And ensure this doesn't exceed the maximum size. 
*/ + m = MIN(m, partition_max_size(context, p)); + + assert(m >= p->new_size); + + *span = charge_size(context, *span, m - p->new_size); + p->new_size = m; +} + +static int context_grow_partitions_on_free_area(Context *context, FreeArea *a) { + uint64_t weight_sum = 0, span; + int r; + + assert(context); + assert(a); + + r = context_sum_weights(context, a, &weight_sum); + if (r < 0) + return r; + + /* Let's calculate the total area covered by this free area and the partition before it */ + span = a->size; + if (a->after) { + assert(a->after->offset != UINT64_MAX); + assert(a->after->current_size != UINT64_MAX); + + span += round_up_size(a->after->offset + a->after->current_size, context->grain_size) - a->after->offset; + } + + for (GrowPartitionPhase phase = 0; phase < _GROW_PARTITION_PHASE_MAX;) + if (context_grow_partitions_phase(context, a, phase, &span, &weight_sum)) + phase++; /* go to the next phase */ + + /* We still have space left over? Donate to preceding partition if we have one */ + if (span > 0 && a->after) + context_grow_partition_one(context, a, a->after, &span); + + /* What? Even still some space left (maybe because there was no preceding partition, or it had a + * size limit), then let's donate it to whoever wants it. */ + if (span > 0) + LIST_FOREACH(partitions, p, context->partitions) { + context_grow_partition_one(context, a, p, &span); + if (span == 0) + break; + } + + /* Yuck, still no one? 
Then make it padding */ + if (span > 0 && a->after) { + assert(a->after->new_padding != UINT64_MAX); + a->after->new_padding += span; + } + + return 0; +} + +static int context_grow_partitions(Context *context) { + int r; + + assert(context); + + for (size_t i = 0; i < context->n_free_areas; i++) { + r = context_grow_partitions_on_free_area(context, context->free_areas[i]); + if (r < 0) + return r; + } + + /* All existing partitions that have no free space after them can't change size */ + LIST_FOREACH(partitions, p, context->partitions) { + if (p->dropped) + continue; + + if (!PARTITION_EXISTS(p) || p->padding_area) { + /* The algorithm above must have initialized this already */ + assert(p->new_size != UINT64_MAX); + continue; + } + + assert(p->new_size == UINT64_MAX); + p->new_size = p->current_size; + + assert(p->new_padding == UINT64_MAX); + p->new_padding = p->current_padding; + } + + return 0; +} + +static uint64_t find_first_unused_partno(Context *context) { + uint64_t partno = 0; + + assert(context); + + for (partno = 0;; partno++) { + bool found = false; + LIST_FOREACH(partitions, p, context->partitions) + if (p->partno != UINT64_MAX && p->partno == partno) + found = true; + if (!found) + break; + } + + return partno; +} + +static void context_place_partitions(Context *context) { + + assert(context); + + for (size_t i = 0; i < context->n_free_areas; i++) { + FreeArea *a = context->free_areas[i]; + _unused_ uint64_t left; + uint64_t start; + + if (a->after) { + assert(a->after->offset != UINT64_MAX); + assert(a->after->new_size != UINT64_MAX); + assert(a->after->new_padding != UINT64_MAX); + + start = a->after->offset + a->after->new_size + a->after->new_padding; + } else + start = context->start; + + start = round_up_size(start, context->grain_size); + left = a->size; + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->allocated_to_area != a) + continue; + + p->offset = start; + p->partno = find_first_unused_partno(context); + + assert(left >= 
p->new_size); + start += p->new_size; + left -= p->new_size; + + assert(left >= p->new_padding); + start += p->new_padding; + left -= p->new_padding; + } + } +} + +static int config_parse_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + GptPartitionType *type = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = gpt_partition_type_from_string(rvalue, type); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse partition type: %s", rvalue); + + if (arg_architecture >= 0) + *type = gpt_partition_type_override_architecture(*type, arg_architecture); + + return 0; +} + +static int config_parse_label( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + char **label = ASSERT_PTR(data); + int r; + + assert(rvalue); + + /* Nota bene: the empty label is a totally valid one. Let's hence not follow our usual rule of + * assigning the empty string to reset to default here, but really accept it as label to set. 
*/ + + r = specifier_printf(rvalue, GPT_LABEL_MAX, system_and_tmp_specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in Label=, ignoring: %s", rvalue); + return 0; + } + + if (!utf8_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Partition label not valid UTF-8, ignoring: %s", rvalue); + return 0; + } + + r = gpt_partition_label_valid(resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to check if string is valid as GPT partition label, ignoring: \"%s\" (from \"%s\")", + resolved, rvalue); + return 0; + } + if (!r) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Partition label too long for GPT table, ignoring: \"%s\" (from \"%s\")", + resolved, rvalue); + return 0; + } + + free_and_replace(*label, resolved); + return 0; +} + +static int config_parse_weight( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *w = ASSERT_PTR(data), v; + int r; + + assert(rvalue); + + r = safe_atou32(rvalue, &v); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse weight value, ignoring: %s", rvalue); + return 0; + } + + if (v > 1000U*1000U) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Weight needs to be in range 0…10000000, ignoring: %" PRIu32, v); + return 0; + } + + *w = v; + return 0; +} + +static int config_parse_size4096( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *sz = data, parsed; + int r; + + assert(rvalue); + assert(data); + + r = parse_size(rvalue, 1024, &parsed); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to 
parse size value: %s", rvalue); + + if (ltype > 0) + *sz = round_up_size(parsed, 4096); + else if (ltype < 0) + *sz = round_down_size(parsed, 4096); + else + *sz = parsed; + + if (*sz != parsed) + log_syntax(unit, LOG_NOTICE, filename, line, r, "Rounded %s= size %" PRIu64 " %s %" PRIu64 ", a multiple of 4096.", + lvalue, parsed, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), *sz); + + return 0; +} + +static int config_parse_block_size( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *blksz = ASSERT_PTR(data), parsed; + int r; + + assert(rvalue); + + r = parse_size(rvalue, 1024, &parsed); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse size value: %s", rvalue); + + if (parsed < 512 || parsed > 4096) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "Value not between 512 and 4096: %s", rvalue); + + if (!ISPOWEROF2(parsed)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "Value not a power of 2: %s", rvalue); + + *blksz = parsed; + return 0; +} + +static int config_parse_fstype( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **fstype = ASSERT_PTR(data); + const char *e; + + assert(rvalue); + + /* Let's provide an easy way to override the chosen fstype for file system partitions */ + e = secure_getenv("SYSTEMD_REPART_OVERRIDE_FSTYPE"); + if (e && !streq(rvalue, e)) { + log_syntax(unit, LOG_NOTICE, filename, line, 0, + "Overriding defined file system type '%s' with '%s'.", rvalue, e); + rvalue = e; + } + + if (!filename_is_valid(rvalue)) + return log_syntax(unit, LOG_ERR, filename, line, 0, + "File system type is not valid, refusing: %s", rvalue); + + return 
free_and_strdup_warn(fstype, rvalue); +} + +static int config_parse_copy_files( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *source = NULL, *buffer = NULL, *resolved_source = NULL, *resolved_target = NULL; + const char *p = rvalue, *target; + char ***copy_files = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = extract_first_word(&p, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to extract source path: %s", rvalue); + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "No argument specified: %s", rvalue); + return 0; + } + + r = extract_first_word(&p, &buffer, ":", EXTRACT_CUNESCAPE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to extract target path: %s", rvalue); + if (r == 0) + target = source; /* No target, then it's the same as the source */ + else + target = buffer; + + if (!isempty(p)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), "Too many arguments: %s", rvalue); + + r = specifier_printf(source, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &resolved_source); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in CopyFiles= source, ignoring: %s", rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved_source, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = specifier_printf(target, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &resolved_target); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in CopyFiles= target, ignoring: %s", resolved_target); + return 0; + } + + r = path_simplify_and_warn(resolved_target, 
PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = strv_consume_pair(copy_files, TAKE_PTR(resolved_source), TAKE_PTR(resolved_target)); + if (r < 0) + return log_oom(); + + return 0; +} + +static int config_parse_exclude_files( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + _cleanup_free_ char *resolved = NULL; + char ***exclude_files = ASSERT_PTR(data); + int r; + + if (isempty(rvalue)) { + *exclude_files = strv_free(*exclude_files); + return 0; + } + + r = specifier_printf(rvalue, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in ExcludeFiles= path, ignoring: %s", rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE|PATH_KEEP_TRAILING_SLASH, unit, filename, line, lvalue); + if (r < 0) + return 0; + + if (strv_consume(exclude_files, TAKE_PTR(resolved)) < 0) + return log_oom(); + + return 0; +} + +static int config_parse_copy_blocks( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *d = NULL; + Partition *partition = ASSERT_PTR(data); + int r; + + assert(rvalue); + + if (isempty(rvalue)) { + partition->copy_blocks_path = mfree(partition->copy_blocks_path); + partition->copy_blocks_auto = false; + return 0; + } + + if (streq(rvalue, "auto")) { + partition->copy_blocks_path = mfree(partition->copy_blocks_path); + partition->copy_blocks_auto = true; + partition->copy_blocks_root = arg_root; + return 0; + } + + r = specifier_printf(rvalue, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &d); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, 
line, r, + "Failed to expand specifiers in CopyBlocks= source path, ignoring: %s", rvalue); + return 0; + } + + r = path_simplify_and_warn(d, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + free_and_replace(partition->copy_blocks_path, d); + partition->copy_blocks_auto = false; + partition->copy_blocks_root = arg_root; + return 0; +} + +static int config_parse_make_dirs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***sv = ASSERT_PTR(data); + const char *p = ASSERT_PTR(rvalue); + int r; + + for (;;) { + _cleanup_free_ char *word = NULL, *d = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + r = specifier_printf(word, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &d); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in MakeDirectories= parameter, ignoring: %s", word); + continue; + } + + r = path_simplify_and_warn(d, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + continue; + + r = strv_consume(sv, TAKE_PTR(d)); + if (r < 0) + return log_oom(); + } +} + +static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_encrypt, encrypt_mode, EncryptMode, ENCRYPT_OFF, "Invalid encryption mode"); + +static int config_parse_gpt_flags( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *gpt_flags = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = safe_atou64(rvalue, gpt_flags); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, 
line, r, + "Failed to parse Flags= value, ignoring: %s", rvalue); + return 0; + } + + return 0; +} + +static int config_parse_uuid( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Partition *partition = ASSERT_PTR(data); + int r; + + if (isempty(rvalue)) { + partition->new_uuid = SD_ID128_NULL; + partition->new_uuid_is_set = false; + return 0; + } + + if (streq(rvalue, "null")) { + partition->new_uuid = SD_ID128_NULL; + partition->new_uuid_is_set = true; + return 0; + } + + r = sd_id128_from_string(rvalue, &partition->new_uuid); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse 128-bit ID/UUID, ignoring: %s", rvalue); + return 0; + } + + partition->new_uuid_is_set = true; + + return 0; +} + +static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_verity, verity_mode, VerityMode, VERITY_OFF, "Invalid verity mode"); +static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_minimize, minimize_mode, MinimizeMode, MINIMIZE_OFF, "Invalid minimize mode"); + +static int partition_read_definition(Partition *p, const char *path, const char *const *conf_file_dirs) { + + ConfigTableItem table[] = { + { "Partition", "Type", config_parse_type, 0, &p->type }, + { "Partition", "Label", config_parse_label, 0, &p->new_label }, + { "Partition", "UUID", config_parse_uuid, 0, p }, + { "Partition", "Priority", config_parse_int32, 0, &p->priority }, + { "Partition", "Weight", config_parse_weight, 0, &p->weight }, + { "Partition", "PaddingWeight", config_parse_weight, 0, &p->padding_weight }, + { "Partition", "SizeMinBytes", config_parse_size4096, -1, &p->size_min }, + { "Partition", "SizeMaxBytes", config_parse_size4096, 1, &p->size_max }, + { "Partition", "PaddingMinBytes", config_parse_size4096, -1, &p->padding_min }, + { "Partition", "PaddingMaxBytes", config_parse_size4096, 1, &p->padding_max }, 
+ { "Partition", "FactoryReset", config_parse_bool, 0, &p->factory_reset }, + { "Partition", "CopyBlocks", config_parse_copy_blocks, 0, p }, + { "Partition", "Format", config_parse_fstype, 0, &p->format }, + { "Partition", "CopyFiles", config_parse_copy_files, 0, &p->copy_files }, + { "Partition", "ExcludeFiles", config_parse_exclude_files, 0, &p->exclude_files_source }, + { "Partition", "ExcludeFilesTarget", config_parse_exclude_files, 0, &p->exclude_files_target }, + { "Partition", "MakeDirectories", config_parse_make_dirs, 0, &p->make_directories }, + { "Partition", "Encrypt", config_parse_encrypt, 0, &p->encrypt }, + { "Partition", "Verity", config_parse_verity, 0, &p->verity }, + { "Partition", "VerityMatchKey", config_parse_string, 0, &p->verity_match_key }, + { "Partition", "Flags", config_parse_gpt_flags, 0, &p->gpt_flags }, + { "Partition", "ReadOnly", config_parse_tristate, 0, &p->read_only }, + { "Partition", "NoAuto", config_parse_tristate, 0, &p->no_auto }, + { "Partition", "GrowFileSystem", config_parse_tristate, 0, &p->growfs }, + { "Partition", "SplitName", config_parse_string, 0, &p->split_name_format }, + { "Partition", "Minimize", config_parse_minimize, 0, &p->minimize }, + { "Partition", "Subvolumes", config_parse_make_dirs, 0, &p->subvolumes }, + { "Partition", "VerityDataBlockSizeBytes", config_parse_block_size, 0, &p->verity_data_block_size }, + { "Partition", "VerityHashBlockSizeBytes", config_parse_block_size, 0, &p->verity_hash_block_size }, + {} + }; + int r; + _cleanup_free_ char *filename = NULL; + const char* dropin_dirname; + + r = path_extract_filename(path, &filename); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", path); + + dropin_dirname = strjoina(filename, ".d"); + + r = config_parse_many( + STRV_MAKE_CONST(path), + conf_file_dirs, + dropin_dirname, + arg_definitions ? 
NULL : arg_root, + "Partition\0", + config_item_table_lookup, table, + CONFIG_PARSE_WARN, + p, + NULL, + &p->drop_in_files); + if (r < 0) + return r; + + if (partition_type_exclude(&p->type)) + return 0; + + if (p->size_min != UINT64_MAX && p->size_max != UINT64_MAX && p->size_min > p->size_max) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "SizeMinBytes= larger than SizeMaxBytes=, refusing."); + + if (p->padding_min != UINT64_MAX && p->padding_max != UINT64_MAX && p->padding_min > p->padding_max) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "PaddingMinBytes= larger than PaddingMaxBytes=, refusing."); + + if (sd_id128_is_null(p->type.uuid)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Type= not defined, refusing."); + + if ((p->copy_blocks_path || p->copy_blocks_auto) && + (p->format || !strv_isempty(p->copy_files) || !strv_isempty(p->make_directories))) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Format=/CopyFiles=/MakeDirectories= and CopyBlocks= cannot be combined, refusing."); + + if ((!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories)) && streq_ptr(p->format, "swap")) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Format=swap and CopyFiles= cannot be combined, refusing."); + + if (!p->format) { + const char *format = NULL; + + if (!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories) || (p->encrypt != ENCRYPT_OFF && !(p->copy_blocks_path || p->copy_blocks_auto))) + /* Pick "vfat" as file system for esp and xbootldr partitions, otherwise default to "ext4". */ + format = IN_SET(p->type.designator, PARTITION_ESP, PARTITION_XBOOTLDR) ? 
"vfat" : "ext4"; + else if (p->type.designator == PARTITION_SWAP) + format = "swap"; + + if (format) { + p->format = strdup(format); + if (!p->format) + return log_oom(); + } + } + + if (p->minimize != MINIMIZE_OFF && !p->format && p->verity != VERITY_HASH) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Minimize= can only be enabled if Format= or Verity=hash are set"); + + if (p->minimize == MINIMIZE_BEST && (p->format && !fstype_is_ro(p->format)) && p->verity != VERITY_HASH) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Minimize=best can only be used with read-only filesystems or Verity=hash"); + + if ((!strv_isempty(p->copy_files) || !strv_isempty(p->make_directories)) && !mkfs_supports_root_option(p->format) && geteuid() != 0) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EPERM), + "Need to be root to populate %s filesystems with CopyFiles=/MakeDirectories=", + p->format); + + if (p->format && fstype_is_ro(p->format) && strv_isempty(p->copy_files) && strv_isempty(p->make_directories)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Cannot format %s filesystem without source files, refusing", p->format); + + if (p->verity != VERITY_OFF || p->encrypt != ENCRYPT_OFF) { + r = dlopen_cryptsetup(); + if (r < 0) + return log_syntax(NULL, LOG_ERR, path, 1, r, + "libcryptsetup not found, Verity=/Encrypt= are not supported: %m"); + } + + if (p->verity != VERITY_OFF && !p->verity_match_key) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "VerityMatchKey= must be set if Verity=%s", verity_mode_to_string(p->verity)); + + if (p->verity == VERITY_OFF && p->verity_match_key) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "VerityMatchKey= can only be set if Verity= is not \"%s\"", + verity_mode_to_string(p->verity)); + + if (IN_SET(p->verity, VERITY_HASH, VERITY_SIG) && + (p->copy_files || p->copy_blocks_path || p->copy_blocks_auto || p->format 
|| p->make_directories)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "CopyBlocks=/CopyFiles=/Format=/MakeDirectories= cannot be used with Verity=%s", + verity_mode_to_string(p->verity)); + + if (p->verity != VERITY_OFF && p->encrypt != ENCRYPT_OFF) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Encrypting verity hash/data partitions is not supported"); + + if (p->verity == VERITY_SIG && !arg_private_key) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Verity signature partition requested but no private key provided (--private-key=)"); + + if (p->verity == VERITY_SIG && !arg_certificate) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Verity signature partition requested but no PEM certificate provided (--certificate=)"); + + if (p->verity == VERITY_SIG && (p->size_min != UINT64_MAX || p->size_max != UINT64_MAX)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "SizeMinBytes=/SizeMaxBytes= cannot be used with Verity=%s", + verity_mode_to_string(p->verity)); + + if (!strv_isempty(p->subvolumes) && arg_offline > 0) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Subvolumes= cannot be used with --offline=yes"); + + /* Verity partitions are read only, let's imply the RO flag hence, unless explicitly configured otherwise. 
*/ + if ((IN_SET(p->type.designator, + PARTITION_ROOT_VERITY, + PARTITION_USR_VERITY) || p->verity == VERITY_DATA) && p->read_only < 0) + p->read_only = true; + + /* Default to "growfs" on, unless read-only */ + if (gpt_partition_type_knows_growfs(p->type) && + p->read_only <= 0) + p->growfs = true; + + if (!p->split_name_format) { + char *s = strdup("%t"); + if (!s) + return log_oom(); + + p->split_name_format = s; + } else if (streq(p->split_name_format, "-")) + p->split_name_format = mfree(p->split_name_format); + + return 1; +} + +static int find_verity_sibling(Context *context, Partition *p, VerityMode mode, Partition **ret) { + Partition *s = NULL; + + assert(p); + assert(p->verity != VERITY_OFF); + assert(p->verity_match_key); + assert(mode != VERITY_OFF); + assert(p->verity != mode); + assert(ret); + + /* Try to find the matching sibling partition of the given type for a verity partition. For a data + * partition, this is the corresponding hash partition with the same verity name (and vice versa for + * the hash partition). 
*/ + + LIST_FOREACH(partitions, q, context->partitions) { + if (p == q) + continue; + + if (q->verity != mode) + continue; + + assert(q->verity_match_key); + + if (!streq(p->verity_match_key, q->verity_match_key)) + continue; + + if (s) + return -ENOTUNIQ; + + s = q; + } + + if (!s) + return -ENXIO; + + *ret = s; + + return 0; +} + +static int context_open_and_lock_backing_fd(const char *node, int operation, int *backing_fd) { + _cleanup_close_ int fd = -EBADF; + + assert(node); + assert(backing_fd); + + if (*backing_fd >= 0) + return 0; + + fd = open(node, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open device '%s': %m", node); + + /* Tell udev not to interfere while we are processing the device */ + if (flock(fd, operation) < 0) + return log_error_errno(errno, "Failed to lock device '%s': %m", node); + + log_debug("Device %s opened and locked.", node); + *backing_fd = TAKE_FD(fd); + return 1; +} + +static int determine_current_padding( + struct fdisk_context *c, + struct fdisk_table *t, + struct fdisk_partition *p, + uint64_t secsz, + uint64_t grainsz, + uint64_t *ret) { + + size_t n_partitions; + uint64_t offset, next = UINT64_MAX; + + assert(c); + assert(t); + assert(p); + assert(ret); + + if (!fdisk_partition_has_end(p)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Partition has no end!"); + + offset = fdisk_partition_get_end(p); + assert(offset < UINT64_MAX); + offset++; /* The end is one sector before the next partition or padding. 
*/ + assert(offset < UINT64_MAX / secsz); + offset *= secsz; + + n_partitions = fdisk_table_get_nents(t); + for (size_t i = 0; i < n_partitions; i++) { + struct fdisk_partition *q; + uint64_t start; + + q = fdisk_table_get_partition(t, i); + if (!q) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to read partition metadata: %m"); + + if (fdisk_partition_is_used(q) <= 0) + continue; + + if (!fdisk_partition_has_start(q)) + continue; + + start = fdisk_partition_get_start(q); + assert(start < UINT64_MAX / secsz); + start *= secsz; + + if (start >= offset && (next == UINT64_MAX || next > start)) + next = start; + } + + if (next == UINT64_MAX) { + /* No later partition? In that case check the end of the usable area */ + next = fdisk_get_last_lba(c); + assert(next < UINT64_MAX); + next++; /* The last LBA is one sector before the end */ + + assert(next < UINT64_MAX / secsz); + next *= secsz; + + if (offset > next) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Partition end beyond disk end."); + } + + assert(next >= offset); + offset = round_up_size(offset, grainsz); + next = round_down_size(next, grainsz); + + *ret = LESS_BY(next, offset); /* Saturated subtraction, rounding might have fucked things up */ + return 0; +} + +static int context_copy_from_one(Context *context, const char *src) { + _cleanup_close_ int fd = -EBADF; + _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL; + _cleanup_(fdisk_unref_tablep) struct fdisk_table *t = NULL; + Partition *last = NULL; + unsigned long secsz, grainsz; + size_t n_partitions; + int r; + + assert(src); + + r = context_open_and_lock_backing_fd(src, LOCK_SH, &fd); + if (r < 0) + return r; + + r = fd_verify_regular(fd); + if (r < 0) + return log_error_errno(r, "%s is not a file: %m", src); + + r = fdisk_new_context_at(fd, /* path = */ NULL, /* read_only = */ true, /* sector_size = */ UINT32_MAX, &c); + if (r < 0) + return log_error_errno(r, "Failed to create fdisk context: %m"); + + secsz = 
fdisk_get_sector_size(c); + grainsz = fdisk_get_grain_size(c); + + /* Insist on a power of two, and that it's a multiple of 512, i.e. the traditional sector size. */ + if (secsz < 512 || !ISPOWEROF2(secsz)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Sector size %lu is not a power of two larger than 512? Refusing.", secsz); + + if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT)) + return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), "Cannot copy from disk %s with no GPT disk label.", src); + + r = fdisk_get_partitions(c, &t); + if (r < 0) + return log_error_errno(r, "Failed to acquire partition table: %m"); + + n_partitions = fdisk_table_get_nents(t); + for (size_t i = 0; i < n_partitions; i++) { + _cleanup_(partition_freep) Partition *np = NULL; + _cleanup_free_ char *label_copy = NULL; + struct fdisk_partition *p; + const char *label; + uint64_t sz, start, padding; + sd_id128_t ptid, id; + GptPartitionType type; + + p = fdisk_table_get_partition(t, i); + if (!p) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to read partition metadata: %m"); + + if (fdisk_partition_is_used(p) <= 0) + continue; + + if (fdisk_partition_has_start(p) <= 0 || + fdisk_partition_has_size(p) <= 0 || + fdisk_partition_has_partno(p) <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Found a partition without a position, size or number."); + + r = fdisk_partition_get_type_as_id128(p, &ptid); + if (r < 0) + return log_error_errno(r, "Failed to query partition type UUID: %m"); + + type = gpt_partition_type_from_uuid(ptid); + + r = fdisk_partition_get_uuid_as_id128(p, &id); + if (r < 0) + return log_error_errno(r, "Failed to query partition UUID: %m"); + + label = fdisk_partition_get_name(p); + if (!isempty(label)) { + label_copy = strdup(label); + if (!label_copy) + return log_oom(); + } + + sz = fdisk_partition_get_size(p); + assert(sz <= UINT64_MAX/secsz); + sz *= secsz; + + start = fdisk_partition_get_start(p); + assert(start <= UINT64_MAX/secsz); + start *= secsz; + + if 
(partition_type_exclude(&type)) + continue; + + np = partition_new(); + if (!np) + return log_oom(); + + np->type = type; + np->new_uuid = id; + np->new_uuid_is_set = true; + np->size_min = np->size_max = sz; + np->new_label = TAKE_PTR(label_copy); + + np->definition_path = strdup(src); + if (!np->definition_path) + return log_oom(); + + r = determine_current_padding(c, t, p, secsz, grainsz, &padding); + if (r < 0) + return r; + + np->padding_min = np->padding_max = padding; + + np->copy_blocks_path = strdup(src); + if (!np->copy_blocks_path) + return log_oom(); + + np->copy_blocks_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (np->copy_blocks_fd < 0) + return log_error_errno(r, "Failed to duplicate file descriptor of %s: %m", src); + + np->copy_blocks_offset = start; + np->copy_blocks_size = sz; + + r = fdisk_partition_get_attrs_as_uint64(p, &np->gpt_flags); + if (r < 0) + return log_error_errno(r, "Failed to get partition flags: %m"); + + LIST_INSERT_AFTER(partitions, context->partitions, last, np); + last = TAKE_PTR(np); + context->n_partitions++; + } + + return 0; +} + +static int context_copy_from(Context *context) { + int r; + + assert(context); + + STRV_FOREACH(src, arg_copy_from) { + r = context_copy_from_one(context, *src); + if (r < 0) + return r; + } + + return 0; +} + +static int context_read_definitions(Context *context) { + _cleanup_strv_free_ char **files = NULL; + Partition *last = LIST_FIND_TAIL(partitions, context->partitions); + const char *const *dirs; + int r; + + assert(context); + + dirs = (const char* const*) (arg_definitions ?: CONF_PATHS_STRV("repart.d")); + + r = conf_files_list_strv(&files, ".conf", arg_definitions ? 
NULL : arg_root, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, dirs); + if (r < 0) + return log_error_errno(r, "Failed to enumerate *.conf files: %m"); + + STRV_FOREACH(f, files) { + _cleanup_(partition_freep) Partition *p = NULL; + + p = partition_new(); + if (!p) + return log_oom(); + + p->definition_path = strdup(*f); + if (!p->definition_path) + return log_oom(); + + r = partition_read_definition(p, *f, dirs); + if (r < 0) + return r; + if (r == 0) + continue; + + LIST_INSERT_AFTER(partitions, context->partitions, last, p); + last = TAKE_PTR(p); + context->n_partitions++; + } + + /* Check that each configured verity hash/data partition has a matching verity data/hash partition. */ + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->verity == VERITY_OFF) + continue; + + for (VerityMode mode = VERITY_OFF + 1; mode < _VERITY_MODE_MAX; mode++) { + Partition *q = NULL; + + if (p->verity == mode) + continue; + + if (p->siblings[mode]) + continue; + + r = find_verity_sibling(context, p, mode, &q); + if (r == -ENXIO) { + if (mode != VERITY_SIG) + return log_syntax(NULL, LOG_ERR, p->definition_path, 1, SYNTHETIC_ERRNO(EINVAL), + "Missing verity %s partition for verity %s partition with VerityMatchKey=%s", + verity_mode_to_string(mode), verity_mode_to_string(p->verity), p->verity_match_key); + } else if (r == -ENOTUNIQ) + return log_syntax(NULL, LOG_ERR, p->definition_path, 1, SYNTHETIC_ERRNO(EINVAL), + "Multiple verity %s partitions found for verity %s partition with VerityMatchKey=%s", + verity_mode_to_string(mode), verity_mode_to_string(p->verity), p->verity_match_key); + else if (r < 0) + return log_syntax(NULL, LOG_ERR, p->definition_path, 1, r, + "Failed to find verity %s partition for verity %s partition with VerityMatchKey=%s", + verity_mode_to_string(mode), verity_mode_to_string(p->verity), p->verity_match_key); + + if (q) { + if (q->priority != p->priority) + return log_syntax(NULL, LOG_ERR, p->definition_path, 1, SYNTHETIC_ERRNO(EINVAL), + "Priority 
mismatch (%i != %i) for verity sibling partitions with VerityMatchKey=%s", + p->priority, q->priority, p->verity_match_key); + + p->siblings[mode] = q; + } + } + } + + LIST_FOREACH(partitions, p, context->partitions) { + Partition *dp; + + if (p->verity != VERITY_HASH) + continue; + + if (p->minimize == MINIMIZE_OFF) + continue; + + assert_se(dp = p->siblings[VERITY_DATA]); + + if (dp->minimize == MINIMIZE_OFF && !(dp->copy_blocks_path || dp->copy_blocks_auto)) + return log_syntax(NULL, LOG_ERR, p->definition_path, 1, SYNTHETIC_ERRNO(EINVAL), + "Minimize= set for verity hash partition but data partition does " + "not set CopyBlocks= or Minimize="); + + } + + return 0; +} + +static int fdisk_ask_cb(struct fdisk_context *c, struct fdisk_ask *ask, void *data) { + _cleanup_free_ char *ids = NULL; + int r; + + if (fdisk_ask_get_type(ask) != FDISK_ASKTYPE_STRING) + return -EINVAL; + + ids = new(char, SD_ID128_UUID_STRING_MAX); + if (!ids) + return -ENOMEM; + + r = fdisk_ask_string_set_result(ask, sd_id128_to_uuid_string(*(sd_id128_t*) data, ids)); + if (r < 0) + return r; + + TAKE_PTR(ids); + return 0; +} + +static int fdisk_set_disklabel_id_by_uuid(struct fdisk_context *c, sd_id128_t id) { + int r; + + r = fdisk_set_ask(c, fdisk_ask_cb, &id); + if (r < 0) + return r; + + r = fdisk_set_disklabel_id(c); + if (r < 0) + return r; + + return fdisk_set_ask(c, NULL, NULL); +} + +static int derive_uuid(sd_id128_t base, const char *token, sd_id128_t *ret) { + union { + uint8_t md[SHA256_DIGEST_SIZE]; + sd_id128_t id; + } result; + + assert(token); + assert(ret); + + /* Derive a new UUID from the specified UUID in a stable and reasonably safe way. Specifically, we + * calculate the HMAC-SHA256 of the specified token string, keyed by the supplied base (typically the + * machine ID). We use the machine ID as key (and not as cleartext!) of the HMAC operation since it's + * the machine ID we don't want to leak. 
*/ + + hmac_sha256(base.bytes, sizeof(base.bytes), token, strlen(token), result.md); + + /* Take the first half, mark it as v4 UUID */ + assert_cc(sizeof(result.md) == sizeof(result.id) * 2); + *ret = id128_make_v4_uuid(result.id); + return 0; +} + +static void derive_salt(sd_id128_t base, const char *token, uint8_t ret[static SHA256_DIGEST_SIZE]) { + assert(token); + + hmac_sha256(base.bytes, sizeof(base.bytes), token, strlen(token), ret); +} + +static int context_load_partition_table(Context *context) { + _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL; + _cleanup_(fdisk_unref_tablep) struct fdisk_table *t = NULL; + uint64_t left_boundary = UINT64_MAX, first_lba, last_lba, nsectors; + _cleanup_free_ char *disk_uuid_string = NULL; + bool from_scratch = false; + sd_id128_t disk_uuid; + size_t n_partitions; + unsigned long secsz; + uint64_t grainsz, fs_secsz = DEFAULT_FILESYSTEM_SECTOR_SIZE; + int r; + + assert(context); + assert(!context->fdisk_context); + assert(!context->free_areas); + assert(context->start == UINT64_MAX); + assert(context->end == UINT64_MAX); + assert(context->total == UINT64_MAX); + + c = fdisk_new_context(); + if (!c) + return log_oom(); + + if (arg_sector_size > 0) { + fs_secsz = arg_sector_size; + r = fdisk_save_user_sector_size(c, /* phy= */ 0, arg_sector_size); + } else { + uint32_t ssz; + struct stat st; + + r = context_open_and_lock_backing_fd( + context->node, + arg_dry_run ? LOCK_SH : LOCK_EX, + &context->backing_fd); + if (r < 0) + return r; + + if (fstat(context->backing_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", context->node); + + if (IN_SET(arg_empty, EMPTY_REQUIRE, EMPTY_FORCE, EMPTY_CREATE) && S_ISREG(st.st_mode)) + /* Don't probe sector size from partition table if we are supposed to start from an empty disk */ + fs_secsz = ssz = 512; + else { + /* Auto-detect sector size if not specified. 
*/ + r = probe_sector_size_prefer_ioctl(context->backing_fd, &ssz); + if (r < 0) + return log_error_errno(r, "Failed to probe sector size of '%s': %m", context->node); + + /* If we found the sector size and we're operating on a block device, use it as the file + * system sector size as well, as we know it's the sector size of the actual block device and + * not just the offset at which we found the GPT header. */ + if (r > 0 && S_ISBLK(st.st_mode)) + fs_secsz = ssz; + } + + r = fdisk_save_user_sector_size(c, /* phy= */ 0, ssz); + } + if (r < 0) + return log_error_errno(r, "Failed to set sector size: %m"); + + /* libfdisk doesn't have an API to operate on arbitrary fds, hence reopen the fd going via the + * /proc/self/fd/ magic path if we have an existing fd. Open the original file otherwise. */ + r = fdisk_assign_device( + c, + context->backing_fd >= 0 ? FORMAT_PROC_FD_PATH(context->backing_fd) : context->node, + arg_dry_run); + if (r == -EINVAL && arg_size_auto) { + struct stat st; + + /* libfdisk returns EINVAL if opening a file of size zero. Let's check for that, and accept + * it if automatic sizing is requested. */ + + if (context->backing_fd < 0) + r = stat(context->node, &st); + else + r = fstat(context->backing_fd, &st); + if (r < 0) + return log_error_errno(errno, "Failed to stat block device '%s': %m", context->node); + + if (S_ISREG(st.st_mode) && st.st_size == 0) { + /* Use the fallback values if we have no better idea */ + context->sector_size = fdisk_get_sector_size(c); + context->fs_sector_size = fs_secsz; + context->grain_size = 4096; + return /* from_scratch = */ true; + } + + r = -EINVAL; + } + if (r < 0) + return log_error_errno(r, "Failed to open device '%s': %m", context->node); + + if (context->backing_fd < 0) { + /* If we have no fd referencing the device yet, make a copy of the fd now, so that we have one */ + r = context_open_and_lock_backing_fd(FORMAT_PROC_FD_PATH(fdisk_get_devfd(c)), + arg_dry_run ? 
LOCK_SH : LOCK_EX, + &context->backing_fd); + if (r < 0) + return r; + } + + /* The offsets/sizes libfdisk returns to us will be in multiple of the sector size of the + * device. This is typically 512, and sometimes 4096. Let's query libfdisk once for it, and then use + * it for all our needs. Note that the values we use ourselves always are in bytes though, thus mean + * the same thing universally. Also note that regardless what kind of sector size is in use we'll + * place partitions at multiples of 4K. */ + secsz = fdisk_get_sector_size(c); + + /* Insist on a power of two, and that it's a multiple of 512, i.e. the traditional sector size. */ + if (secsz < 512 || !ISPOWEROF2(secsz)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Sector size %lu is not a power of two larger than 512? Refusing.", secsz); + + /* Use at least 4K, and ensure it's a multiple of the sector size, regardless if that is smaller or + * larger */ + grainsz = secsz < 4096 ? 4096 : secsz; + + log_debug("Sector size of device is %lu bytes. 
Using grain size of %" PRIu64 ".", secsz, grainsz); + + switch (arg_empty) { + + case EMPTY_REFUSE: + /* Refuse empty disks, insist on an existing GPT partition table */ + if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT)) + return log_notice_errno(SYNTHETIC_ERRNO(EHWPOISON), "Disk %s has no GPT disk label, not repartitioning.", context->node); + + break; + + case EMPTY_REQUIRE: + /* Require an empty disk, refuse any existing partition table */ + r = fdisk_has_label(c); + if (r < 0) + return log_error_errno(r, "Failed to determine whether disk %s has a disk label: %m", context->node); + if (r > 0) + return log_notice_errno(SYNTHETIC_ERRNO(EHWPOISON), "Disk %s already has a disk label, refusing.", context->node); + + from_scratch = true; + break; + + case EMPTY_ALLOW: + /* Allow both an empty disk and an existing partition table, but only GPT */ + r = fdisk_has_label(c); + if (r < 0) + return log_error_errno(r, "Failed to determine whether disk %s has a disk label: %m", context->node); + if (r > 0) { + if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT)) + return log_notice_errno(SYNTHETIC_ERRNO(EHWPOISON), "Disk %s has non-GPT disk label, not repartitioning.", context->node); + } else + from_scratch = true; + + break; + + case EMPTY_FORCE: + case EMPTY_CREATE: + /* Always reinitialize the disk, don't consider what there was on the disk before */ + from_scratch = true; + break; + + default: + assert_not_reached(); + } + + if (from_scratch) { + r = fdisk_create_disklabel(c, "gpt"); + if (r < 0) + return log_error_errno(r, "Failed to create GPT disk label: %m"); + + r = derive_uuid(context->seed, "disk-uuid", &disk_uuid); + if (r < 0) + return log_error_errno(r, "Failed to acquire disk GPT uuid: %m"); + + r = fdisk_set_disklabel_id_by_uuid(c, disk_uuid); + if (r < 0) + return log_error_errno(r, "Failed to set GPT disk label: %m"); + + goto add_initial_free_area; + } + + r = fdisk_get_disklabel_id(c, &disk_uuid_string); + if (r < 0) + return log_error_errno(r, "Failed to get 
current GPT disk label UUID: %m"); + + r = id128_from_string_nonzero(disk_uuid_string, &disk_uuid); + if (r == -ENXIO) { + r = derive_uuid(context->seed, "disk-uuid", &disk_uuid); + if (r < 0) + return log_error_errno(r, "Failed to acquire disk GPT uuid: %m"); + + r = fdisk_set_disklabel_id_by_uuid(c, disk_uuid); /* Apply the UUID derived above: a bare fdisk_set_disklabel_id() obtains its value through the ask callback, and none is installed at this point */ + if (r < 0) + return log_error_errno(r, "Failed to set GPT disk label: %m"); + } else if (r < 0) + return log_error_errno(r, "Failed to parse current GPT disk label UUID: %m"); + + r = fdisk_get_partitions(c, &t); + if (r < 0) + return log_error_errno(r, "Failed to acquire partition table: %m"); + + n_partitions = fdisk_table_get_nents(t); + for (size_t i = 0; i < n_partitions; i++) { + _cleanup_free_ char *label_copy = NULL; + Partition *last = NULL; + struct fdisk_partition *p; + const char *label; + uint64_t sz, start; + bool found = false; + sd_id128_t ptid, id; + size_t partno; + + p = fdisk_table_get_partition(t, i); + if (!p) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to read partition metadata: %m"); + + if (fdisk_partition_is_used(p) <= 0) + continue; + + if (fdisk_partition_has_start(p) <= 0 || + fdisk_partition_has_size(p) <= 0 || + fdisk_partition_has_partno(p) <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Found a partition without a position, size or number."); + + r = fdisk_partition_get_type_as_id128(p, &ptid); + if (r < 0) + return log_error_errno(r, "Failed to query partition type UUID: %m"); + + r = fdisk_partition_get_uuid_as_id128(p, &id); + if (r < 0) + return log_error_errno(r, "Failed to query partition UUID: %m"); + + label = fdisk_partition_get_name(p); + if (!isempty(label)) { + label_copy = strdup(label); + if (!label_copy) + return log_oom(); + } + + sz = fdisk_partition_get_size(p); + assert(sz <= UINT64_MAX/secsz); + sz *= secsz; + + start = fdisk_partition_get_start(p); + assert(start <= UINT64_MAX/secsz); + start *= secsz; + + partno = fdisk_partition_get_partno(p); + + if (left_boundary == UINT64_MAX || left_boundary > 
start) + left_boundary = start; + + /* Assign this existing partition to the first partition of the right type that doesn't have + * an existing one assigned yet. */ + LIST_FOREACH(partitions, pp, context->partitions) { + last = pp; + + if (!sd_id128_equal(pp->type.uuid, ptid)) + continue; + + if (!pp->current_partition) { + pp->current_uuid = id; + pp->current_size = sz; + pp->offset = start; + pp->partno = partno; + pp->current_label = TAKE_PTR(label_copy); + + pp->current_partition = p; + fdisk_ref_partition(p); + + r = determine_current_padding(c, t, p, secsz, grainsz, &pp->current_padding); + if (r < 0) + return r; + + if (pp->current_padding > 0) { + r = context_add_free_area(context, pp->current_padding, pp); + if (r < 0) + return r; + } + + found = true; + break; + } + } + + /* If we have no matching definition, create a new one. */ + if (!found) { + _cleanup_(partition_freep) Partition *np = NULL; + + np = partition_new(); + if (!np) + return log_oom(); + + np->current_uuid = id; + np->type = gpt_partition_type_from_uuid(ptid); + np->current_size = sz; + np->offset = start; + np->partno = partno; + np->current_label = TAKE_PTR(label_copy); + + np->current_partition = p; + fdisk_ref_partition(p); + + r = determine_current_padding(c, t, p, secsz, grainsz, &np->current_padding); + if (r < 0) + return r; + + if (np->current_padding > 0) { + r = context_add_free_area(context, np->current_padding, np); + if (r < 0) + return r; + } + + LIST_INSERT_AFTER(partitions, context->partitions, last, TAKE_PTR(np)); + context->n_partitions++; + } + } + +add_initial_free_area: + nsectors = fdisk_get_nsectors(c); + assert(nsectors <= UINT64_MAX/secsz); + nsectors *= secsz; + + first_lba = fdisk_get_first_lba(c); + assert(first_lba <= UINT64_MAX/secsz); + first_lba *= secsz; + + last_lba = fdisk_get_last_lba(c); + assert(last_lba < UINT64_MAX); + last_lba++; + assert(last_lba <= UINT64_MAX/secsz); + last_lba *= secsz; + + assert(last_lba >= first_lba); + + if (left_boundary 
== UINT64_MAX) { + /* No partitions at all? Then the whole disk is up for grabs. */ + + first_lba = round_up_size(first_lba, grainsz); + last_lba = round_down_size(last_lba, grainsz); + + if (last_lba > first_lba) { + r = context_add_free_area(context, last_lba - first_lba, NULL); + if (r < 0) + return r; + } + } else { + /* Add space left of first partition */ + assert(left_boundary >= first_lba); + + first_lba = round_up_size(first_lba, grainsz); + left_boundary = round_down_size(left_boundary, grainsz); + last_lba = round_down_size(last_lba, grainsz); + + if (left_boundary > first_lba) { + r = context_add_free_area(context, left_boundary - first_lba, NULL); + if (r < 0) + return r; + } + } + + context->start = first_lba; + context->end = last_lba; + context->total = nsectors; + context->sector_size = secsz; + context->fs_sector_size = fs_secsz; + context->grain_size = grainsz; + context->fdisk_context = TAKE_PTR(c); + + return from_scratch; +} + +static void context_unload_partition_table(Context *context) { + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) { + + /* Entirely remove partitions that have no configuration */ + if (PARTITION_IS_FOREIGN(p)) { + partition_unlink_and_free(context, p); + continue; + } + + /* Otherwise drop all data we read off the block device and everything we might have + * calculated based on it */ + + p->dropped = false; + p->current_size = UINT64_MAX; + p->new_size = UINT64_MAX; + p->current_padding = UINT64_MAX; + p->new_padding = UINT64_MAX; + p->partno = UINT64_MAX; + p->offset = UINT64_MAX; + + if (p->current_partition) { + fdisk_unref_partition(p->current_partition); + p->current_partition = NULL; + } + + if (p->new_partition) { + fdisk_unref_partition(p->new_partition); + p->new_partition = NULL; + } + + p->padding_area = NULL; + p->allocated_to_area = NULL; + + p->current_uuid = SD_ID128_NULL; + p->current_label = mfree(p->current_label); + } + + context->start = UINT64_MAX; + context->end = UINT64_MAX; 
+ context->total = UINT64_MAX; + + if (context->fdisk_context) { + fdisk_unref_context(context->fdisk_context); + context->fdisk_context = NULL; + } + + context_free_free_areas(context); +} + +static int format_size_change(uint64_t from, uint64_t to, char **ret) { + char *t; + + if (from != UINT64_MAX) { + if (from == to || to == UINT64_MAX) + t = strdup(FORMAT_BYTES(from)); + else + t = strjoin(FORMAT_BYTES(from), " ", special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), " ", FORMAT_BYTES(to)); + } else if (to != UINT64_MAX) + t = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), " ", FORMAT_BYTES(to)); + else { + *ret = NULL; + return 0; + } + + if (!t) + return log_oom(); + + *ret = t; + return 1; +} + +static const char *partition_label(const Partition *p) { + assert(p); + + if (p->new_label) + return p->new_label; + + if (p->current_label) + return p->current_label; + + return gpt_partition_type_uuid_to_string(p->type.uuid); +} + +static int context_dump_partitions(Context *context) { + _cleanup_(table_unrefp) Table *t = NULL; + uint64_t sum_padding = 0, sum_size = 0; + int r; + const size_t roothash_col = 14, dropin_files_col = 15, split_path_col = 16; + bool has_roothash = false, has_dropin_files = false, has_split_path = false; + + if ((arg_json_format_flags & JSON_FORMAT_OFF) && context->n_partitions == 0) { + log_info("Empty partition table."); + return 0; + } + + t = table_new("type", + "label", + "uuid", + "partno", + "file", + "node", + "offset", + "old size", + "raw size", + "size", + "old padding", + "raw padding", + "padding", + "activity", + "roothash", + "drop-in files", + "split path"); + if (!t) + return log_oom(); + + if (!DEBUG_LOGGING) { + if (arg_json_format_flags & JSON_FORMAT_OFF) + (void) table_set_display(t, (size_t) 0, (size_t) 1, (size_t) 2, (size_t) 3, (size_t) 4, + (size_t) 8, (size_t) 9, (size_t) 12, roothash_col, dropin_files_col, + split_path_col); + else + (void) table_set_display(t, (size_t) 0, (size_t) 1, (size_t) 2, (size_t) 3, (size_t) 
4, + (size_t) 5, (size_t) 6, (size_t) 7, (size_t) 8, (size_t) 10, + (size_t) 11, (size_t) 13, roothash_col, dropin_files_col, + split_path_col); + } + + (void) table_set_align_percent(t, table_get_cell(t, 0, 5), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 6), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 7), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 8), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 9), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 10), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 11), 100); + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_free_ char *size_change = NULL, *padding_change = NULL, *partname = NULL, *rh = NULL; + char uuid_buffer[SD_ID128_UUID_STRING_MAX]; + const char *label, *activity = NULL; + + if (p->dropped) + continue; + + if (p->current_size == UINT64_MAX) + activity = "create"; + else if (p->current_size != p->new_size) + activity = "resize"; + + label = partition_label(p); + partname = p->partno != UINT64_MAX ? fdisk_partname(context->node, p->partno+1) : NULL; + + r = format_size_change(p->current_size, p->new_size, &size_change); + if (r < 0) + return r; + + r = format_size_change(p->current_padding, p->new_padding, &padding_change); + if (r < 0) + return r; + + if (p->new_size != UINT64_MAX) + sum_size += p->new_size; + if (p->new_padding != UINT64_MAX) + sum_padding += p->new_padding; + + if (p->verity != VERITY_OFF) { + Partition *hp = p->verity == VERITY_HASH ? p : p->siblings[VERITY_HASH]; + + rh = iovec_is_set(&hp->roothash) ? hexmem(hp->roothash.iov_base, hp->roothash.iov_len) : strdup("TBD"); + if (!rh) + return log_oom(); + } + + r = table_add_many( + t, + TABLE_STRING, gpt_partition_type_uuid_to_string_harder(p->type.uuid, uuid_buffer), + TABLE_STRING, empty_to_null(label) ?: "-", TABLE_SET_COLOR, empty_to_null(label) ? NULL : ansi_grey(), + TABLE_UUID, p->new_uuid_is_set ? 
p->new_uuid : p->current_uuid, + TABLE_UINT64, p->partno, + TABLE_PATH_BASENAME, p->definition_path, TABLE_SET_COLOR, p->definition_path ? NULL : ansi_grey(), + TABLE_STRING, partname ?: "-", TABLE_SET_COLOR, partname ? NULL : ansi_highlight(), + TABLE_UINT64, p->offset, + TABLE_UINT64, p->current_size == UINT64_MAX ? 0 : p->current_size, + TABLE_UINT64, p->new_size, + TABLE_STRING, size_change, TABLE_SET_COLOR, !p->partitions_next && sum_size > 0 ? ansi_underline() : NULL, + TABLE_UINT64, p->current_padding == UINT64_MAX ? 0 : p->current_padding, + TABLE_UINT64, p->new_padding, + TABLE_STRING, padding_change, TABLE_SET_COLOR, !p->partitions_next && sum_padding > 0 ? ansi_underline() : NULL, + TABLE_STRING, activity ?: "unchanged", + TABLE_STRING, rh, + TABLE_STRV, p->drop_in_files, + TABLE_STRING, empty_to_null(p->split_path) ?: "-"); + if (r < 0) + return table_log_add_error(r); + + has_roothash = has_roothash || !isempty(rh); + has_dropin_files = has_dropin_files || !strv_isempty(p->drop_in_files); + has_split_path = has_split_path || !isempty(p->split_path); + } + + if ((arg_json_format_flags & JSON_FORMAT_OFF) && (sum_padding > 0 || sum_size > 0)) { + const char *a, *b; + + a = strjoina(special_glyph(SPECIAL_GLYPH_SIGMA), " = ", FORMAT_BYTES(sum_size)); + b = strjoina(special_glyph(SPECIAL_GLYPH_SIGMA), " = ", FORMAT_BYTES(sum_padding)); + + r = table_add_many( + t, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_STRING, a, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_STRING, b, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_EMPTY); + if (r < 0) + return table_log_add_error(r); + } + + if (!has_roothash) { + r = table_hide_column_from_display(t, roothash_col); + if (r < 0) + return log_error_errno(r, "Failed to set columns to display: %m"); + } + + if (!has_dropin_files) { + r = table_hide_column_from_display(t, dropin_files_col); + if (r < 0) + return 
log_error_errno(r, "Failed to set columns to display: %m"); + } + + if (!has_split_path) { + r = table_hide_column_from_display(t, split_path_col); + if (r < 0) + return log_error_errno(r, "Failed to set columns to display: %m"); + } + + return table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend); +} + +static int context_bar_char_process_partition( + Context *context, + Partition *bar[], + size_t n, + Partition *p, + size_t **start_array, + size_t *n_start_array) { + + uint64_t from, to, total; + size_t x, y; + + assert(context); + assert(bar); + assert(n > 0); + assert(p); + assert(start_array); + assert(n_start_array); + + if (p->dropped) + return 0; + + assert(p->offset != UINT64_MAX); + assert(p->new_size != UINT64_MAX); + + from = p->offset; + to = from + p->new_size; + + assert(context->total > 0); + total = context->total; + + assert(from <= total); + x = from * n / total; + + assert(to <= total); + y = to * n / total; + + assert(x <= y); + assert(y <= n); + + for (size_t i = x; i < y; i++) + bar[i] = p; + + if (!GREEDY_REALLOC_APPEND(*start_array, *n_start_array, &x, 1)) + return log_oom(); + + return 1; +} + +static int partition_hint(const Partition *p, const char *node, char **ret) { + _cleanup_free_ char *buf = NULL; + const char *label; + sd_id128_t id; + + /* Tries really hard to find a suitable description for this partition */ + + if (p->definition_path) + return path_extract_filename(p->definition_path, ret); + + label = partition_label(p); + if (!isempty(label)) { + buf = strdup(label); + goto done; + } + + if (p->partno != UINT64_MAX) { + buf = fdisk_partname(node, p->partno+1); + goto done; + } + + if (p->new_uuid_is_set) + id = p->new_uuid; + else if (!sd_id128_is_null(p->current_uuid)) + id = p->current_uuid; + else + id = p->type.uuid; + + buf = strdup(SD_ID128_TO_UUID_STRING(id)); + +done: + if (!buf) + return -ENOMEM; + + *ret = TAKE_PTR(buf); + return 0; +} + +static int context_dump_partition_bar(Context 
*context) { + _cleanup_free_ Partition **bar = NULL; + _cleanup_free_ size_t *start_array = NULL; + size_t n_start_array = 0; + Partition *last = NULL; + bool z = false; + size_t c, j = 0; + int r; + + assert_se((c = columns()) >= 2); + c -= 2; /* We do not use the leftmost and rightmost character cell */ + + bar = new0(Partition*, c); + if (!bar) + return log_oom(); + + LIST_FOREACH(partitions, p, context->partitions) { + r = context_bar_char_process_partition(context, bar, c, p, &start_array, &n_start_array); + if (r < 0) + return r; + } + + putc(' ', stdout); + + for (size_t i = 0; i < c; i++) { + if (bar[i]) { + if (last != bar[i]) + z = !z; + + fputs(z ? ansi_green() : ansi_yellow(), stdout); + fputs(special_glyph(SPECIAL_GLYPH_DARK_SHADE), stdout); + } else { + fputs(ansi_normal(), stdout); + fputs(special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), stdout); + } + + last = bar[i]; + } + + fputs(ansi_normal(), stdout); + putc('\n', stdout); + + for (size_t i = 0; i < n_start_array; i++) { + _cleanup_free_ char **line = NULL; + + line = new0(char*, c); + if (!line) + return log_oom(); + + j = 0; + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_free_ char *d = NULL; + + if (p->dropped) + continue; + + j++; + + if (i < n_start_array - j) { + + if (line[start_array[j-1]]) { + const char *e; + + /* Upgrade final corner to the right with a branch to the right */ + e = startswith(line[start_array[j-1]], special_glyph(SPECIAL_GLYPH_TREE_RIGHT)); + if (e) { + d = strjoin(special_glyph(SPECIAL_GLYPH_TREE_BRANCH), e); + if (!d) + return log_oom(); + } + } + + if (!d) { + d = strdup(special_glyph(SPECIAL_GLYPH_TREE_VERTICAL)); + if (!d) + return log_oom(); + } + + } else if (i == n_start_array - j) { + _cleanup_free_ char *hint = NULL; + + (void) partition_hint(p, context->node, &hint); + + if (streq_ptr(line[start_array[j-1]], special_glyph(SPECIAL_GLYPH_TREE_VERTICAL))) + d = strjoin(special_glyph(SPECIAL_GLYPH_TREE_BRANCH), " ", strna(hint)); + else + d = 
strjoin(special_glyph(SPECIAL_GLYPH_TREE_RIGHT), " ", strna(hint)); + + if (!d) + return log_oom(); + } + + if (d) + free_and_replace(line[start_array[j-1]], d); + } + + putc(' ', stdout); + + j = 0; + while (j < c) { + if (line[j]) { + fputs(line[j], stdout); + j += utf8_console_width(line[j]); + } else { + putc(' ', stdout); + j++; + } + } + + putc('\n', stdout); + + for (j = 0; j < c; j++) + free(line[j]); + } + + return 0; +} + +static bool context_has_roothash(Context *context) { + LIST_FOREACH(partitions, p, context->partitions) + if (iovec_is_set(&p->roothash)) + return true; + + return false; +} + +static int context_dump(Context *context, bool late) { + int r; + + assert(context); + + if (arg_pretty == 0 && FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + return 0; + + /* If we're outputting JSON, only dump after doing all operations so we can include the roothashes + * in the output. */ + if (!late && !FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + return 0; + + /* If we're not outputting JSON, only dump again after doing all operations if there are any + * roothashes that we need to communicate to the user. */ + if (late && FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF) && !context_has_roothash(context)) + return 0; + + r = context_dump_partitions(context); + if (r < 0) + return r; + + /* Make sure we only write the partition bar once, even if we're writing the partition table twice to + * communicate roothashes. 
*/ + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF) && !late) { + putc('\n', stdout); + + r = context_dump_partition_bar(context); + if (r < 0) + return r; + + putc('\n', stdout); + } + + fflush(stdout); + + return 0; +} + + +static bool context_changed(const Context *context) { + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->dropped) + continue; + + if (p->allocated_to_area) + return true; + + if (p->new_size != p->current_size) + return true; + } + + return false; +} + +static int context_wipe_range(Context *context, uint64_t offset, uint64_t size) { + _cleanup_(blkid_free_probep) blkid_probe probe = NULL; + int r; + + assert(context); + assert(offset != UINT64_MAX); + assert(size != UINT64_MAX); + + probe = blkid_new_probe(); + if (!probe) + return log_oom(); + + errno = 0; + r = blkid_probe_set_device(probe, fdisk_get_devfd(context->fdisk_context), offset, size); + if (r < 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to allocate device probe for wiping."); + + errno = 0; + if (blkid_probe_enable_superblocks(probe, true) < 0 || + blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_MAGIC|BLKID_SUBLKS_BADCSUM) < 0 || + blkid_probe_enable_partitions(probe, true) < 0 || + blkid_probe_set_partitions_flags(probe, BLKID_PARTS_MAGIC) < 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to enable superblock and partition probing."); + + for (;;) { + errno = 0; + r = blkid_do_probe(probe); + if (r < 0) + return log_error_errno(errno_or_else(EIO), "Failed to probe for file systems."); + if (r > 0) + break; + + errno = 0; + if (blkid_do_wipe(probe, false) < 0) + return log_error_errno(errno_or_else(EIO), "Failed to wipe file system signature."); + } + + return 0; +} + +static int context_wipe_partition(Context *context, Partition *p) { + int r; + + assert(context); + assert(p); + assert(!PARTITION_EXISTS(p)); /* Safety check: never wipe existing partitions */ + + assert(p->offset != 
UINT64_MAX); + assert(p->new_size != UINT64_MAX); + + r = context_wipe_range(context, p->offset, p->new_size); + if (r < 0) + return r; + + log_info("Successfully wiped file system signatures from future partition %" PRIu64 ".", p->partno); + return 0; +} + +static int context_discard_range( + Context *context, + uint64_t offset, + uint64_t size) { + + struct stat st; + int fd; + + assert(context); + assert(offset != UINT64_MAX); + assert(size != UINT64_MAX); + + if (size <= 0) + return 0; + + assert_se((fd = fdisk_get_devfd(context->fdisk_context)) >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + if (S_ISREG(st.st_mode)) { + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, offset, size) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + return -EOPNOTSUPP; + + return -errno; + } + + return 1; + } + + if (S_ISBLK(st.st_mode)) { + uint64_t range[2], end; + + range[0] = round_up_size(offset, context->sector_size); + + if (offset > UINT64_MAX - size) + return -ERANGE; + + end = offset + size; + if (end <= range[0]) + return 0; + + range[1] = round_down_size(end - range[0], context->sector_size); + if (range[1] <= 0) + return 0; + + if (ioctl(fd, BLKDISCARD, range) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + return -EOPNOTSUPP; + + return -errno; + } + + return 1; + } + + return -EOPNOTSUPP; +} + +static int context_discard_partition(Context *context, Partition *p) { + int r; + + assert(context); + assert(p); + + assert(p->offset != UINT64_MAX); + assert(p->new_size != UINT64_MAX); + assert(!PARTITION_EXISTS(p)); /* Safety check: never discard existing partitions */ + + if (!arg_discard) + return 0; + + r = context_discard_range(context, p->offset, p->new_size); + if (r == -EOPNOTSUPP) { + log_info("Storage does not support discard, not discarding data in future partition %" PRIu64 ".", p->partno); + return 0; + } + if (r == -EBUSY) { + /* Let's handle this gracefully: https://bugzilla.kernel.org/show_bug.cgi?id=211167 */ + log_info("Block device is 
busy, not discarding partition %" PRIu64 " because it probably is mounted.", p->partno); + return 0; + } + if (r == 0) { + log_info("Partition %" PRIu64 " too short for discard, skipping.", p->partno); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to discard data for future partition %" PRIu64 ".", p->partno); + + log_info("Successfully discarded data from future partition %" PRIu64 ".", p->partno); + return 1; +} + +static int context_discard_gap_after(Context *context, Partition *p) { + uint64_t gap, next = UINT64_MAX; + int r; + + assert(context); + assert(!p || (p->offset != UINT64_MAX && p->new_size != UINT64_MAX)); + + if (!arg_discard) + return 0; + + if (p) + gap = p->offset + p->new_size; + else + /* The context start gets rounded up to grain_size, however + * existing partitions may be before that so ensure the gap + * starts at the first actually usable lba + */ + gap = fdisk_get_first_lba(context->fdisk_context) * context->sector_size; + + LIST_FOREACH(partitions, q, context->partitions) { + if (q->dropped) + continue; + + assert(q->offset != UINT64_MAX); + assert(q->new_size != UINT64_MAX); + + if (q->offset < gap) + continue; + + if (next == UINT64_MAX || q->offset < next) + next = q->offset; + } + + if (next == UINT64_MAX) { + next = (fdisk_get_last_lba(context->fdisk_context) + 1) * context->sector_size; + if (gap > next) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Partition end beyond disk end."); + } + + assert(next >= gap); + r = context_discard_range(context, gap, next - gap); + if (r == -EOPNOTSUPP) { + if (p) + log_info("Storage does not support discard, not discarding gap after partition %" PRIu64 ".", p->partno); + else + log_info("Storage does not support discard, not discarding gap at beginning of disk."); + return 0; + } + if (r == 0) /* Too short */ + return 0; + if (r < 0) { + if (p) + return log_error_errno(r, "Failed to discard gap after partition %" PRIu64 ".", p->partno); + else + return log_error_errno(r, 
"Failed to discard gap at beginning of disk."); + } + + if (p) + log_info("Successfully discarded gap after partition %" PRIu64 ".", p->partno); + else + log_info("Successfully discarded gap at beginning of disk."); + + return 0; +} + +static int context_wipe_and_discard(Context *context) { + int r; + + assert(context); + + if (arg_empty == EMPTY_CREATE) /* If we just created the image, no need to wipe */ + return 0; + + /* Wipe and discard the contents of all partitions we are about to create. We skip the discarding if + * we were supposed to start from scratch anyway, as in that case we just discard the whole block + * device in one go early on. */ + + LIST_FOREACH(partitions, p, context->partitions) { + + if (!p->allocated_to_area) + continue; + + if (partition_type_defer(&p->type)) + continue; + + r = context_wipe_partition(context, p); + if (r < 0) + return r; + + if (!context->from_scratch) { + r = context_discard_partition(context, p); + if (r < 0) + return r; + + r = context_discard_gap_after(context, p); + if (r < 0) + return r; + } + } + + if (!context->from_scratch) { + r = context_discard_gap_after(context, NULL); + if (r < 0) + return r; + } + + return 0; +} + +typedef struct DecryptedPartitionTarget { + int fd; + char *dm_name; + char *volume; + struct crypt_device *device; +} DecryptedPartitionTarget; + +static DecryptedPartitionTarget* decrypted_partition_target_free(DecryptedPartitionTarget *t) { +#if HAVE_LIBCRYPTSETUP + int r; + + if (!t) + return NULL; + + safe_close(t->fd); + + /* udev or so might access our block device in the background while we are done. Let's hence + * force detach the volume. We sync'ed before, hence this should be safe. 
*/ + r = sym_crypt_deactivate_by_name(t->device, t->dm_name, CRYPT_DEACTIVATE_FORCE); + if (r < 0) + log_warning_errno(r, "Failed to deactivate LUKS device, ignoring: %m"); + + sym_crypt_free(t->device); + free(t->dm_name); + free(t->volume); + free(t); +#endif + return NULL; +} + +typedef struct { + LoopDevice *loop; + int fd; + char *path; + int whole_fd; + DecryptedPartitionTarget *decrypted; +} PartitionTarget; + +static int partition_target_fd(PartitionTarget *t) { + assert(t); + assert(t->loop || t->fd >= 0 || t->whole_fd >= 0); + + if (t->decrypted) + return t->decrypted->fd; + + if (t->loop) + return t->loop->fd; + + if (t->fd >= 0) + return t->fd; + + return t->whole_fd; +} + +static const char* partition_target_path(PartitionTarget *t) { + assert(t); + assert(t->loop || t->path); + + if (t->decrypted) + return t->decrypted->volume; + + if (t->loop) + return t->loop->node; + + return t->path; +} + +static PartitionTarget *partition_target_free(PartitionTarget *t) { + if (!t) + return NULL; + + decrypted_partition_target_free(t->decrypted); + loop_device_unref(t->loop); + safe_close(t->fd); + unlink_and_free(t->path); + + return mfree(t); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(PartitionTarget*, partition_target_free); + +static int prepare_temporary_file(PartitionTarget *t, uint64_t size) { + _cleanup_(unlink_and_freep) char *temp = NULL; + _cleanup_close_ int fd = -EBADF; + const char *vt; + int r; + + assert(t); + + r = var_tmp_dir(&vt); + if (r < 0) + return log_error_errno(r, "Could not determine temporary directory: %m"); + + temp = path_join(vt, "repart-XXXXXX"); + if (!temp) + return log_oom(); + + fd = mkostemp_safe(temp); + if (fd < 0) + return log_error_errno(fd, "Failed to create temporary file: %m"); + + if (ftruncate(fd, size) < 0) + return log_error_errno(errno, "Failed to truncate temporary file to %s: %m", + FORMAT_BYTES(size)); + + t->fd = TAKE_FD(fd); + t->path = TAKE_PTR(temp); + + return 0; +} + +static int partition_target_prepare( + Context 
*context, + Partition *p, + uint64_t size, + bool need_path, + PartitionTarget **ret) { + + _cleanup_(partition_target_freep) PartitionTarget *t = NULL; + _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; + int whole_fd, r; + + assert(context); + assert(p); + assert(ret); + + assert_se((whole_fd = fdisk_get_devfd(context->fdisk_context)) >= 0); + + t = new(PartitionTarget, 1); + if (!t) + return log_oom(); + *t = (PartitionTarget) { + .fd = -EBADF, + .whole_fd = -EBADF, + }; + + if (!need_path) { + if (lseek(whole_fd, p->offset, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to partition offset: %m"); + + t->whole_fd = whole_fd; + *ret = TAKE_PTR(t); + return 0; + } + + /* Loopback block devices are not only useful to turn regular files into block devices, but + * also to cut out sections of block devices into new block devices. */ + + if (arg_offline <= 0) { + r = loop_device_make(whole_fd, O_RDWR, p->offset, size, context->sector_size, 0, LOCK_EX, &d); + if (r < 0 && (arg_offline == 0 || (r != -ENOENT && !ERRNO_IS_PRIVILEGE(r)) || !strv_isempty(p->subvolumes))) + return log_error_errno(r, "Failed to make loopback device of future partition %" PRIu64 ": %m", p->partno); + if (r >= 0) { + t->loop = TAKE_PTR(d); + *ret = TAKE_PTR(t); + return 0; + } + + log_debug_errno(r, "No access to loop devices, falling back to a regular file"); + } + + /* If we can't allocate a loop device, let's write to a regular file that we copy into the final + * image so we can run in containers and without needing root privileges. On filesystems with + * reflinking support, we can take advantage of this and just reflink the result into the image. 
+ */ + + r = prepare_temporary_file(t, size); + if (r < 0) + return r; + + *ret = TAKE_PTR(t); + + return 0; +} + +static int partition_target_grow(PartitionTarget *t, uint64_t size) { + int r; + + assert(t); + assert(!t->decrypted); + + if (t->loop) { + r = loop_device_refresh_size(t->loop, UINT64_MAX, size); + if (r < 0) + return log_error_errno(r, "Failed to refresh loopback device size: %m"); + } else if (t->fd >= 0) { + if (ftruncate(t->fd, size) < 0) + return log_error_errno(errno, "Failed to grow '%s' to %s by truncation: %m", + t->path, FORMAT_BYTES(size)); + } + + return 0; +} + +static int partition_target_sync(Context *context, Partition *p, PartitionTarget *t) { + int whole_fd, r; + + assert(context); + assert(p); + assert(t); + + assert_se((whole_fd = fdisk_get_devfd(context->fdisk_context)) >= 0); + + if (t->decrypted && fsync(t->decrypted->fd) < 0) + return log_error_errno(errno, "Failed to sync changes to '%s': %m", t->decrypted->volume); + + if (t->loop) { + r = loop_device_sync(t->loop); + if (r < 0) + return log_error_errno(r, "Failed to sync loopback device: %m"); + } else if (t->fd >= 0) { + struct stat st; + + if (lseek(whole_fd, p->offset, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to partition offset: %m"); + + if (lseek(t->fd, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to start of temporary file: %m"); + + if (fstat(t->fd, &st) < 0) + return log_error_errno(errno, "Failed to stat temporary file: %m"); + + if (st.st_size > (off_t) p->new_size) + return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Partition %" PRIu64 "'s contents (%s) don't fit in the partition (%s)", + p->partno, FORMAT_BYTES(st.st_size), FORMAT_BYTES(p->new_size)); + + r = copy_bytes(t->fd, whole_fd, UINT64_MAX, COPY_REFLINK|COPY_HOLES|COPY_FSYNC); + if (r < 0) + return log_error_errno(r, "Failed to copy bytes to partition: %m"); + } else { + if (fsync(t->whole_fd) < 0) + return log_error_errno(errno, "Failed to sync changes: %m"); 
+ } + + return 0; +} + +static int partition_encrypt(Context *context, Partition *p, PartitionTarget *target, bool offline) { +#if HAVE_LIBCRYPTSETUP && HAVE_CRYPT_SET_DATA_OFFSET && HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE && HAVE_CRYPT_REENCRYPT + const char *node = partition_target_path(target); + struct crypt_params_luks2 luks_params = { + .label = strempty(ASSERT_PTR(p)->new_label), + .sector_size = ASSERT_PTR(context)->fs_sector_size, + .data_device = offline ? node : NULL, + }; + struct crypt_params_reencrypt reencrypt_params = { + .mode = CRYPT_REENCRYPT_ENCRYPT, + .direction = CRYPT_REENCRYPT_BACKWARD, + .resilience = "datashift", + .data_shift = LUKS2_METADATA_SIZE / 512, + .luks2 = &luks_params, + .flags = CRYPT_REENCRYPT_INITIALIZE_ONLY|CRYPT_REENCRYPT_MOVE_FIRST_SEGMENT, + }; + _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL; + _cleanup_(erase_and_freep) char *base64_encoded = NULL; + _cleanup_fclose_ FILE *h = NULL; + _cleanup_free_ char *hp = NULL, *vol = NULL, *dm_name = NULL; + const char *passphrase = NULL; + size_t passphrase_size = 0; + const char *vt; + int r; + + assert(context); + assert(p); + assert(p->encrypt != ENCRYPT_OFF); + + r = dlopen_cryptsetup(); + if (r < 0) + return log_error_errno(r, "libcryptsetup not found, cannot encrypt: %m"); + + log_info("Encrypting future partition %" PRIu64 "...", p->partno); + + if (offline) { + r = var_tmp_dir(&vt); + if (r < 0) + return log_error_errno(r, "Failed to determine temporary files directory: %m"); + + r = fopen_temporary_child(vt, &h, &hp); + if (r < 0) + return log_error_errno(r, "Failed to create temporary LUKS header file: %m"); + + /* Weird cryptsetup requirement which requires the header file to be the size of at least one + * sector. 
*/ + if (ftruncate(fileno(h), luks_params.sector_size) < 0) + return log_error_errno(errno, "Failed to grow temporary LUKS header file: %m"); + } else { + if (asprintf(&dm_name, "luks-repart-%08" PRIx64, random_u64()) < 0) + return log_oom(); + + vol = path_join("/dev/mapper/", dm_name); + if (!vol) + return log_oom(); + } + + r = sym_crypt_init(&cd, offline ? hp : node); + if (r < 0) + return log_error_errno(r, "Failed to allocate libcryptsetup context for %s: %m", hp); + + cryptsetup_enable_logging(cd); + + if (offline) { + /* Disable kernel keyring usage by libcryptsetup as a workaround for + * https://gitlab.com/cryptsetup/cryptsetup/-/merge_requests/273. This makes sure that we can + * do offline encryption even when repart is running in a container. */ + r = sym_crypt_volume_key_keyring(cd, false); + if (r < 0) + return log_error_errno(r, "Failed to disable kernel keyring: %m"); + + r = sym_crypt_metadata_locking(cd, false); + if (r < 0) + return log_error_errno(r, "Failed to disable metadata locking: %m"); + + r = sym_crypt_set_data_offset(cd, LUKS2_METADATA_SIZE / 512); + if (r < 0) + return log_error_errno(r, "Failed to set data offset: %m"); + } + + r = sym_crypt_format( + cd, + CRYPT_LUKS2, + "aes", + "xts-plain64", + SD_ID128_TO_UUID_STRING(p->luks_uuid), + NULL, + VOLUME_KEY_SIZE, + &luks_params); + if (r < 0) + return log_error_errno(r, "Failed to LUKS2 format future partition: %m"); + + if (IN_SET(p->encrypt, ENCRYPT_KEY_FILE, ENCRYPT_KEY_FILE_TPM2)) { + r = sym_crypt_keyslot_add_by_volume_key( + cd, + CRYPT_ANY_SLOT, + NULL, + VOLUME_KEY_SIZE, + strempty(arg_key), + arg_key_size); + if (r < 0) + return log_error_errno(r, "Failed to add LUKS2 key: %m"); + + passphrase = strempty(arg_key); + passphrase_size = arg_key_size; + } + + if (IN_SET(p->encrypt, ENCRYPT_TPM2, ENCRYPT_KEY_FILE_TPM2)) { +#if HAVE_TPM2 + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(erase_and_freep) void *secret = NULL; + _cleanup_free_ void *pubkey = NULL; + 
_cleanup_free_ void *blob = NULL, *srk_buf = NULL; + size_t secret_size, blob_size, pubkey_size = 0, srk_buf_size = 0; + ssize_t base64_encoded_size; + int keyslot; + TPM2Flags flags = 0; + + if (arg_tpm2_public_key_pcr_mask != 0) { + r = tpm2_load_pcr_public_key(arg_tpm2_public_key, &pubkey, &pubkey_size); + if (r < 0) { + if (arg_tpm2_public_key || r != -ENOENT) + return log_error_errno(r, "Failed to read TPM PCR public key: %m"); + + log_debug_errno(r, "Failed to read TPM2 PCR public key, proceeding without: %m"); + arg_tpm2_public_key_pcr_mask = 0; + } + } + + TPM2B_PUBLIC public; + if (pubkey) { + r = tpm2_tpm2b_public_from_pem(pubkey, pubkey_size, &public); + if (r < 0) + return log_error_errno(r, "Could not convert public key to TPM2B_PUBLIC: %m"); + } + + _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy pcrlock_policy = {}; + if (arg_tpm2_pcrlock) { + r = tpm2_pcrlock_policy_load(arg_tpm2_pcrlock, &pcrlock_policy); + if (r < 0) + return r; + + flags |= TPM2_FLAGS_USE_PCRLOCK; + } + + _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL; + TPM2B_PUBLIC device_key_public = {}; + if (arg_tpm2_device_key) { + r = tpm2_load_public_key_file(arg_tpm2_device_key, &device_key_public); + if (r < 0) + return r; + + if (!tpm2_pcr_values_has_all_values(arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Must provide all PCR values when using TPM2 device key."); + } else { + r = tpm2_context_new(arg_tpm2_device, &tpm2_context); + if (r < 0) + return log_error_errno(r, "Failed to create TPM2 context: %m"); + + if (!tpm2_pcr_values_has_all_values(arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values)) { + r = tpm2_pcr_read_missing_values(tpm2_context, arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values); + if (r < 0) + return log_error_errno(r, "Could not read pcr values: %m"); + } + } + + uint16_t hash_pcr_bank = 0; + uint32_t hash_pcr_mask = 0; + if (arg_tpm2_n_hash_pcr_values > 0) { + size_t 
hash_count; + r = tpm2_pcr_values_hash_count(arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values, &hash_count); + if (r < 0) + return log_error_errno(r, "Could not get hash count: %m"); + + if (hash_count > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Multiple PCR banks selected."); + + hash_pcr_bank = arg_tpm2_hash_pcr_values[0].hash; + r = tpm2_pcr_values_to_mask(arg_tpm2_hash_pcr_values, arg_tpm2_n_hash_pcr_values, hash_pcr_bank, &hash_pcr_mask); + if (r < 0) + return log_error_errno(r, "Could not get hash mask: %m"); + } + + TPM2B_DIGEST policy = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + r = tpm2_calculate_sealing_policy( + arg_tpm2_hash_pcr_values, + arg_tpm2_n_hash_pcr_values, + pubkey ? &public : NULL, + /* use_pin= */ false, + arg_tpm2_pcrlock ? &pcrlock_policy : NULL, + &policy); + if (r < 0) + return log_error_errno(r, "Could not calculate sealing policy digest: %m"); + + if (arg_tpm2_device_key) + r = tpm2_calculate_seal( + arg_tpm2_seal_key_handle, + &device_key_public, + /* attributes= */ NULL, + /* secret= */ NULL, /* secret_size= */ 0, + &policy, + /* pin= */ NULL, + &secret, &secret_size, + &blob, &blob_size, + &srk_buf, &srk_buf_size); + else + r = tpm2_seal(tpm2_context, + arg_tpm2_seal_key_handle, + &policy, + /* pin= */ NULL, + &secret, &secret_size, + &blob, &blob_size, + /* ret_primary_alg= */ NULL, + &srk_buf, &srk_buf_size); + if (r < 0) + return log_error_errno(r, "Failed to seal to TPM2: %m"); + + base64_encoded_size = base64mem(secret, secret_size, &base64_encoded); + if (base64_encoded_size < 0) + return log_error_errno(base64_encoded_size, "Failed to base64 encode secret key: %m"); + + r = cryptsetup_set_minimal_pbkdf(cd); + if (r < 0) + return log_error_errno(r, "Failed to set minimal PBKDF: %m"); + + keyslot = sym_crypt_keyslot_add_by_volume_key( + cd, + CRYPT_ANY_SLOT, + /* volume_key= */ NULL, + /* volume_key_size= */ VOLUME_KEY_SIZE, + base64_encoded, + base64_encoded_size); + if (keyslot < 0) + return 
log_error_errno(keyslot, "Failed to add new TPM2 key: %m"); + + r = tpm2_make_luks2_json( + keyslot, + hash_pcr_mask, + hash_pcr_bank, + pubkey, pubkey_size, + arg_tpm2_public_key_pcr_mask, + /* primary_alg= */ 0, + blob, blob_size, + policy.buffer, policy.size, + NULL, 0, /* no salt because tpm2_seal has no pin */ + srk_buf, srk_buf_size, + flags, + &v); + if (r < 0) + return log_error_errno(r, "Failed to prepare TPM2 JSON token object: %m"); + + r = cryptsetup_add_token_json(cd, v); + if (r < 0) + return log_error_errno(r, "Failed to add TPM2 JSON token to LUKS2 header: %m"); + + passphrase = base64_encoded; + passphrase_size = strlen(base64_encoded); +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Support for TPM2 enrollment not enabled."); +#endif + } + + if (offline) { + r = sym_crypt_reencrypt_init_by_passphrase( + cd, + NULL, + passphrase, + passphrase_size, + CRYPT_ANY_SLOT, + 0, + sym_crypt_get_cipher(cd), + sym_crypt_get_cipher_mode(cd), + &reencrypt_params); + if (r < 0) + return log_error_errno(r, "Failed to prepare for reencryption: %m"); + + /* crypt_reencrypt_init_by_passphrase() doesn't actually put the LUKS header at the front, we + * have to do that ourselves. 
*/ + + sym_crypt_free(cd); + cd = NULL; + + r = sym_crypt_init(&cd, node); + if (r < 0) + return log_error_errno(r, "Failed to allocate libcryptsetup context for %s: %m", node); + + r = sym_crypt_header_restore(cd, CRYPT_LUKS2, hp); + if (r < 0) + return log_error_errno(r, "Failed to place new LUKS header at head of %s: %m", node); + + reencrypt_params.flags &= ~CRYPT_REENCRYPT_INITIALIZE_ONLY; + + r = sym_crypt_reencrypt_init_by_passphrase( + cd, + NULL, + passphrase, + passphrase_size, + CRYPT_ANY_SLOT, + 0, + NULL, + NULL, + &reencrypt_params); + if (r < 0) + return log_error_errno(r, "Failed to load reencryption context: %m"); + + r = sym_crypt_reencrypt(cd, NULL); + if (r < 0) + return log_error_errno(r, "Failed to encrypt %s: %m", node); + } else { + _cleanup_free_ DecryptedPartitionTarget *t = NULL; + _cleanup_close_ int dev_fd = -1; + + r = sym_crypt_activate_by_volume_key( + cd, + dm_name, + NULL, + VOLUME_KEY_SIZE, + arg_discard ? CRYPT_ACTIVATE_ALLOW_DISCARDS : 0); + if (r < 0) + return log_error_errno(r, "Failed to activate LUKS superblock: %m"); + + dev_fd = open(vol, O_RDWR|O_CLOEXEC|O_NOCTTY); + if (dev_fd < 0) + return log_error_errno(errno, "Failed to open LUKS volume '%s': %m", vol); + + if (flock(dev_fd, LOCK_EX) < 0) + return log_error_errno(errno, "Failed to lock '%s': %m", vol); + + t = new(DecryptedPartitionTarget, 1); + if (!t) + return log_oom(); + + *t = (DecryptedPartitionTarget) { + .fd = TAKE_FD(dev_fd), + .dm_name = TAKE_PTR(dm_name), + .volume = TAKE_PTR(vol), + .device = TAKE_PTR(cd), + }; + + target->decrypted = TAKE_PTR(t); + } + + log_info("Successfully encrypted future partition %" PRIu64 ".", p->partno); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "libcryptsetup is not supported or is missing required symbols, cannot encrypt: %m"); +#endif +} + +static int partition_format_verity_hash( + Context *context, + Partition *p, + const char *node, + const char *data_node) { + +#if HAVE_LIBCRYPTSETUP + 
Partition *dp; + _cleanup_(partition_target_freep) PartitionTarget *t = NULL; + _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL; + _cleanup_free_ char *hint = NULL; + int r; + + assert(context); + assert(p); + assert(p->verity == VERITY_HASH); + assert(data_node); + + if (p->dropped) + return 0; + + if (PARTITION_EXISTS(p)) /* Never format existing partitions */ + return 0; + + /* Minimized partitions will use the copy blocks logic so let's make sure to skip those here. */ + if (p->copy_blocks_fd >= 0) + return 0; + + assert_se(dp = p->siblings[VERITY_DATA]); + assert(!dp->dropped); + + (void) partition_hint(p, node, &hint); + + r = dlopen_cryptsetup(); + if (r < 0) + return log_error_errno(r, "libcryptsetup not found, cannot setup verity: %m"); + + if (!node) { + r = partition_target_prepare(context, p, p->new_size, /*need_path=*/ true, &t); + if (r < 0) + return r; + + node = partition_target_path(t); + } + + if (p->verity_data_block_size == UINT64_MAX) + p->verity_data_block_size = context->fs_sector_size; + if (p->verity_hash_block_size == UINT64_MAX) + p->verity_hash_block_size = context->fs_sector_size; + + r = sym_crypt_init(&cd, node); + if (r < 0) + return log_error_errno(r, "Failed to allocate libcryptsetup context for %s: %m", node); + + cryptsetup_enable_logging(cd); + + r = sym_crypt_format( + cd, CRYPT_VERITY, NULL, NULL, SD_ID128_TO_UUID_STRING(p->verity_uuid), NULL, 0, + &(struct crypt_params_verity){ + .data_device = data_node, + .flags = CRYPT_VERITY_CREATE_HASH, + .hash_name = "sha256", + .hash_type = 1, + .data_block_size = p->verity_data_block_size, + .hash_block_size = p->verity_hash_block_size, + .salt_size = sizeof(p->verity_salt), + .salt = (const char*)p->verity_salt, + }); + if (r < 0) { + /* libcryptsetup reports non-descriptive EIO errors for every I/O failure. Luckily, it + * doesn't clobber errno so let's check for ENOSPC so we can report a better error if the + * partition is too small. 
*/ + if (r == -EIO && errno == ENOSPC) + return log_error_errno(errno, + "Verity hash data does not fit in partition %s with size %s", + strna(hint), FORMAT_BYTES(p->new_size)); + + return log_error_errno(r, "Failed to setup verity hash data of partition %s: %m", strna(hint)); + } + + if (t) { + r = partition_target_sync(context, p, t); + if (r < 0) + return r; + } + + r = sym_crypt_get_volume_key_size(cd); + if (r < 0) + return log_error_errno(r, "Failed to determine verity root hash size of partition %s: %m", strna(hint)); + + _cleanup_(iovec_done) struct iovec rh = { + .iov_base = malloc(r), + .iov_len = r, + }; + if (!rh.iov_base) + return log_oom(); + + r = sym_crypt_volume_key_get(cd, CRYPT_ANY_SLOT, (char *) rh.iov_base, &rh.iov_len, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to get verity root hash of partition %s: %m", strna(hint)); + + assert(rh.iov_len >= sizeof(sd_id128_t) * 2); + + if (!dp->new_uuid_is_set) { + memcpy_safe(dp->new_uuid.bytes, rh.iov_base, sizeof(sd_id128_t)); + dp->new_uuid_is_set = true; + } + + if (!p->new_uuid_is_set) { + memcpy_safe(p->new_uuid.bytes, (uint8_t*) rh.iov_base + (rh.iov_len - sizeof(sd_id128_t)), sizeof(sd_id128_t)); + p->new_uuid_is_set = true; + } + + p->roothash = TAKE_STRUCT(rh); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libcryptsetup is not supported, cannot setup verity hashes: %m"); +#endif +} + +static int sign_verity_roothash( + const struct iovec *roothash, + struct iovec *ret_signature) { + +#if HAVE_OPENSSL + _cleanup_(BIO_freep) BIO *rb = NULL; + _cleanup_(PKCS7_freep) PKCS7 *p7 = NULL; + _cleanup_free_ char *hex = NULL; + _cleanup_free_ uint8_t *sig = NULL; + int sigsz; + + assert(roothash); + assert(iovec_is_set(roothash)); + assert(ret_signature); + + hex = hexmem(roothash->iov_base, roothash->iov_len); + if (!hex) + return log_oom(); + + rb = BIO_new_mem_buf(hex, -1); + if (!rb) + return log_oom(); + + p7 = PKCS7_sign(arg_certificate, arg_private_key, 
NULL, rb, PKCS7_DETACHED|PKCS7_NOATTR|PKCS7_BINARY); + if (!p7) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to calculate PKCS7 signature: %s", + ERR_error_string(ERR_get_error(), NULL)); + + sigsz = i2d_PKCS7(p7, &sig); + if (sigsz < 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to convert PKCS7 signature to DER: %s", + ERR_error_string(ERR_get_error(), NULL)); + + ret_signature->iov_base = TAKE_PTR(sig); + ret_signature->iov_len = sigsz; + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL is not supported, cannot setup verity signature: %m"); +#endif +} + +static int partition_format_verity_sig(Context *context, Partition *p) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(iovec_done) struct iovec sig = {}; + _cleanup_free_ char *text = NULL, *hint = NULL; + Partition *hp; + uint8_t fp[X509_FINGERPRINT_SIZE]; + int whole_fd, r; + + assert(p->verity == VERITY_SIG); + + if (p->dropped) + return 0; + + if (PARTITION_EXISTS(p)) + return 0; + + (void) partition_hint(p, context->node, &hint); + + assert_se(hp = p->siblings[VERITY_HASH]); + assert(!hp->dropped); + + assert(arg_certificate); + + assert_se((whole_fd = fdisk_get_devfd(context->fdisk_context)) >= 0); + + r = sign_verity_roothash(&hp->roothash, &sig); + if (r < 0) + return r; + + r = x509_fingerprint(arg_certificate, fp); + if (r < 0) + return log_error_errno(r, "Unable to calculate X509 certificate fingerprint: %m"); + + r = json_build(&v, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("rootHash", JSON_BUILD_HEX(hp->roothash.iov_base, hp->roothash.iov_len)), + JSON_BUILD_PAIR( + "certificateFingerprint", + JSON_BUILD_HEX(fp, sizeof(fp)) + ), + JSON_BUILD_PAIR("signature", JSON_BUILD_IOVEC_BASE64(&sig)) + ) + ); + if (r < 0) + return log_error_errno(r, "Failed to build verity signature JSON object: %m"); + + r = json_variant_format(v, 0, &text); + if (r < 0) + return log_error_errno(r, "Failed to format verity signature JSON object: %m"); + 
+ if (strlen(text)+1 > p->new_size) + return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "Verity signature too long for partition: %m"); + + r = strgrowpad0(&text, p->new_size); + if (r < 0) + return log_error_errno(r, "Failed to pad string to %s", FORMAT_BYTES(p->new_size)); + + if (lseek(whole_fd, p->offset, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to partition %s offset: %m", strna(hint)); + + r = loop_write(whole_fd, text, p->new_size); + if (r < 0) + return log_error_errno(r, "Failed to write verity signature to partition %s: %m", strna(hint)); + + if (fsync(whole_fd) < 0) + return log_error_errno(errno, "Failed to synchronize partition %s: %m", strna(hint)); + + return 0; +} + +static int context_copy_blocks(Context *context) { + int r; + + assert(context); + + /* Copy in file systems on the block level */ + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_(partition_target_freep) PartitionTarget *t = NULL; + + if (p->copy_blocks_fd < 0) + continue; + + if (p->dropped) + continue; + + if (PARTITION_EXISTS(p)) /* Never copy over existing partitions */ + continue; + + if (partition_type_defer(&p->type)) + continue; + + assert(p->new_size != UINT64_MAX); + assert(p->copy_blocks_size != UINT64_MAX); + assert(p->new_size >= p->copy_blocks_size + (p->encrypt != ENCRYPT_OFF ? 
LUKS2_METADATA_KEEP_FREE : 0)); + + usec_t start_timestamp = now(CLOCK_MONOTONIC); + + r = partition_target_prepare(context, p, p->new_size, + /*need_path=*/ p->encrypt != ENCRYPT_OFF || p->siblings[VERITY_HASH], + &t); + if (r < 0) + return r; + + if (p->encrypt != ENCRYPT_OFF && t->loop) { + r = partition_encrypt(context, p, t, /* offline = */ false); + if (r < 0) + return r; + } + + if (p->copy_blocks_offset == UINT64_MAX) + log_info("Copying in '%s' (%s) on block level into future partition %" PRIu64 ".", + p->copy_blocks_path, FORMAT_BYTES(p->copy_blocks_size), p->partno); + else { + log_info("Copying in '%s' @ %" PRIu64 " (%s) on block level into future partition %" PRIu64 ".", + p->copy_blocks_path, p->copy_blocks_offset, FORMAT_BYTES(p->copy_blocks_size), p->partno); + + if (lseek(p->copy_blocks_fd, p->copy_blocks_offset, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to copy blocks offset in %s: %m", p->copy_blocks_path); + } + + r = copy_bytes(p->copy_blocks_fd, partition_target_fd(t), p->copy_blocks_size, COPY_REFLINK); + if (r < 0) + return log_error_errno(r, "Failed to copy in data from '%s': %m", p->copy_blocks_path); + + log_info("Copying in of '%s' on block level completed.", p->copy_blocks_path); + + if (p->encrypt != ENCRYPT_OFF && !t->loop) { + r = partition_encrypt(context, p, t, /* offline = */ true); + if (r < 0) + return r; + } + + r = partition_target_sync(context, p, t); + if (r < 0) + return r; + + usec_t time_spent = usec_sub_unsigned(now(CLOCK_MONOTONIC), start_timestamp); + if (time_spent > 250 * USEC_PER_MSEC) /* Show throughput, but not if we spent too little time on it, since it's just noise then */ + log_info("Block level copying and synchronization of partition %" PRIu64 " complete in %s (%s/s).", + p->partno, FORMAT_TIMESPAN(time_spent, 0), FORMAT_BYTES((uint64_t) ((double) p->copy_blocks_size / time_spent * USEC_PER_SEC))); + else + log_info("Block level copying and synchronization of partition %" PRIu64 " 
complete in %s.", + p->partno, FORMAT_TIMESPAN(time_spent, 0)); + + if (p->siblings[VERITY_HASH] && !partition_type_defer(&p->siblings[VERITY_HASH]->type)) { + r = partition_format_verity_hash(context, p->siblings[VERITY_HASH], + /* node = */ NULL, partition_target_path(t)); + if (r < 0) + return r; + } + + if (p->siblings[VERITY_SIG] && !partition_type_defer(&p->siblings[VERITY_SIG]->type)) { + r = partition_format_verity_sig(context, p->siblings[VERITY_SIG]); + if (r < 0) + return r; + } + } + + return 0; +} + +static int add_exclude_path(const char *path, Hashmap **denylist, DenyType type) { + _cleanup_free_ struct stat *st = NULL; + int r; + + assert(path); + assert(denylist); + + st = new(struct stat, 1); + if (!st) + return log_oom(); + + r = chase_and_stat(path, arg_copy_source, CHASE_PREFIX_ROOT, NULL, st); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to stat source file '%s/%s': %m", strempty(arg_copy_source), path); + + r = hashmap_ensure_put(denylist, &inode_hash_ops, st, INT_TO_PTR(type)); + if (r == -EEXIST) + return 0; + if (r < 0) + return log_oom(); + if (r > 0) + TAKE_PTR(st); + + return 0; +} + +static int make_copy_files_denylist( + Context *context, + const Partition *p, + const char *source, + const char *target, + Hashmap **ret) { + + _cleanup_hashmap_free_ Hashmap *denylist = NULL; + int r; + + assert(context); + assert(p); + assert(source); + assert(target); + assert(ret); + + /* Always exclude the top level APIVFS and temporary directories since the contents of these + * directories are almost certainly not intended to end up in an image. */ + + NULSTR_FOREACH(s, APIVFS_TMP_DIRS_NULSTR) { + r = add_exclude_path(s, &denylist, DENY_CONTENTS); + if (r < 0) + return r; + } + + /* Add the user configured excludes. */ + + STRV_FOREACH(e, p->exclude_files_source) { + r = add_exclude_path(*e, &denylist, endswith(*e, "/") ? 
DENY_CONTENTS : DENY_INODE); + if (r < 0) + return r; + } + + STRV_FOREACH(e, p->exclude_files_target) { + _cleanup_free_ char *path = NULL; + + const char *s = path_startswith(*e, target); + if (!s) + continue; + + path = path_join(source, s); + if (!path) + return log_oom(); + + r = add_exclude_path(path, &denylist, endswith(*e, "/") ? DENY_CONTENTS : DENY_INODE); + if (r < 0) + return r; + } + + /* If we're populating a root partition, we don't want any files to end up under the APIVFS mount + * points. While we already exclude /proc, users could still do something such as + * "CopyFiles=/abc:/". Now, if /abc has a proc subdirectory with files in it, those will end up in + * the top level proc directory in the root partition, which we want to avoid. To deal with these + * cases, whenever we're populating a root partition and the target of CopyFiles= is the root + * directory of the root partition, we exclude all directories under the source that are named after + * APIVFS directories or named after mount points of other partitions that are also going to be part + * of the image. */ + + if (p->type.designator == PARTITION_ROOT && empty_or_root(target)) { + LIST_FOREACH(partitions, q, context->partitions) { + if (q->type.designator == PARTITION_ROOT) + continue; + + const char *sources = gpt_partition_type_mountpoint_nulstr(q->type); + if (!sources) + continue; + + NULSTR_FOREACH(s, sources) { + _cleanup_free_ char *path = NULL; + + /* Exclude only the children of partition mount points so that the nested + * partition mount point itself still ends up in the upper partition. 
*/ + + path = path_join(source, s); + if (!path) + return -ENOMEM; + + r = add_exclude_path(path, &denylist, DENY_CONTENTS); + if (r < 0) + return r; + } + } + + NULSTR_FOREACH(s, APIVFS_TMP_DIRS_NULSTR) { + _cleanup_free_ char *path = NULL; + + path = path_join(source, s); + if (!path) + return -ENOMEM; + + r = add_exclude_path(path, &denylist, DENY_CONTENTS); + if (r < 0) + return r; + } + } + + *ret = TAKE_PTR(denylist); + return 0; +} + +static int add_subvolume_path(const char *path, Set **subvolumes) { + _cleanup_free_ struct stat *st = NULL; + int r; + + assert(path); + assert(subvolumes); + + st = new(struct stat, 1); + if (!st) + return log_oom(); + + r = chase_and_stat(path, arg_copy_source, CHASE_PREFIX_ROOT, NULL, st); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to stat source file '%s/%s': %m", strempty(arg_copy_source), path); + + r = set_ensure_consume(subvolumes, &inode_hash_ops, TAKE_PTR(st)); + if (r < 0) + return log_oom(); + + return 0; +} + +static int make_subvolumes_set( + Context *context, + const Partition *p, + const char *source, + const char *target, + Set **ret) { + _cleanup_set_free_ Set *subvolumes = NULL; + int r; + + assert(context); + assert(p); + assert(target); + assert(ret); + + STRV_FOREACH(subvolume, p->subvolumes) { + _cleanup_free_ char *path = NULL; + + const char *s = path_startswith(*subvolume, target); + if (!s) + continue; + + path = path_join(source, s); + if (!path) + return log_oom(); + + r = add_subvolume_path(path, &subvolumes); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(subvolumes); + return 0; +} + +static int do_copy_files(Context *context, Partition *p, const char *root) { + int r; + + assert(p); + assert(root); + + /* copy_tree_at() automatically copies the permissions of source directories to target directories if + * it created them. However, the root directory is created by us, so we have to manually take care + * that it is initialized. 
We use the first source directory targeting "/" as the metadata source for + * the root directory. */ + STRV_FOREACH_PAIR(source, target, p->copy_files) { + _cleanup_close_ int rfd = -EBADF, sfd = -EBADF; + + if (!path_equal(*target, "/")) + continue; + + rfd = open(root, O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + if (rfd < 0) + return -errno; + + sfd = chase_and_open(*source, arg_copy_source, CHASE_PREFIX_ROOT, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOCTTY, NULL); + if (sfd < 0) + return log_error_errno(sfd, "Failed to open source file '%s%s': %m", strempty(arg_copy_source), *source); + + (void) copy_xattr(sfd, NULL, rfd, NULL, COPY_ALL_XATTRS); + (void) copy_access(sfd, rfd); + (void) copy_times(sfd, rfd, 0); + + break; + } + + STRV_FOREACH_PAIR(source, target, p->copy_files) { + _cleanup_hashmap_free_ Hashmap *denylist = NULL; + _cleanup_set_free_ Set *subvolumes_by_source_inode = NULL; + _cleanup_close_ int sfd = -EBADF, pfd = -EBADF, tfd = -EBADF; + + r = make_copy_files_denylist(context, p, *source, *target, &denylist); + if (r < 0) + return r; + + r = make_subvolumes_set(context, p, *source, *target, &subvolumes_by_source_inode); + if (r < 0) + return r; + + sfd = chase_and_open(*source, arg_copy_source, CHASE_PREFIX_ROOT, O_CLOEXEC|O_NOCTTY, NULL); + if (sfd == -ENOENT) { + log_notice_errno(sfd, "Failed to open source file '%s%s', skipping: %m", strempty(arg_copy_source), *source); + continue; + } + if (sfd < 0) + return log_error_errno(sfd, "Failed to open source file '%s%s': %m", strempty(arg_copy_source), *source); + + r = fd_verify_regular(sfd); + if (r < 0) { + if (r != -EISDIR) + return log_error_errno(r, "Failed to check type of source file '%s': %m", *source); + + /* We are looking at a directory */ + tfd = chase_and_open(*target, root, CHASE_PREFIX_ROOT, O_RDONLY|O_DIRECTORY|O_CLOEXEC, NULL); + if (tfd < 0) { + _cleanup_free_ char *dn = NULL, *fn = NULL; + + if (tfd != -ENOENT) + return log_error_errno(tfd, "Failed to open target directory '%s': %m", *target); + 
+ r = path_extract_filename(*target, &fn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from '%s': %m", *target); + + r = path_extract_directory(*target, &dn); + if (r < 0) + return log_error_errno(r, "Failed to extract directory from '%s': %m", *target); + + r = mkdir_p_root(root, dn, UID_INVALID, GID_INVALID, 0755, p->subvolumes); + if (r < 0) + return log_error_errno(r, "Failed to create parent directory '%s': %m", dn); + + pfd = chase_and_open(dn, root, CHASE_PREFIX_ROOT, O_RDONLY|O_DIRECTORY|O_CLOEXEC, NULL); + if (pfd < 0) + return log_error_errno(pfd, "Failed to open parent directory of target: %m"); + + r = copy_tree_at( + sfd, ".", + pfd, fn, + UID_INVALID, GID_INVALID, + COPY_REFLINK|COPY_HOLES|COPY_MERGE|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS|COPY_ALL_XATTRS|COPY_GRACEFUL_WARN|COPY_TRUNCATE, + denylist, subvolumes_by_source_inode); + } else + r = copy_tree_at( + sfd, ".", + tfd, ".", + UID_INVALID, GID_INVALID, + COPY_REFLINK|COPY_HOLES|COPY_MERGE|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS|COPY_ALL_XATTRS|COPY_GRACEFUL_WARN|COPY_TRUNCATE, + denylist, subvolumes_by_source_inode); + if (r < 0) + return log_error_errno(r, "Failed to copy '%s%s' to '%s%s': %m", + strempty(arg_copy_source), *source, strempty(root), *target); + } else { + _cleanup_free_ char *dn = NULL, *fn = NULL; + + /* We are looking at a regular file */ + + r = path_extract_filename(*target, &fn); + if (r == -EADDRNOTAVAIL || r == O_DIRECTORY) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), + "Target path '%s' refers to a directory, but source path '%s' refers to regular file, can't copy.", *target, *source); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from '%s': %m", *target); + + r = path_extract_directory(*target, &dn); + if (r < 0) + return log_error_errno(r, "Failed to extract directory from '%s': %m", *target); + + r = mkdir_p_root(root, dn, UID_INVALID, GID_INVALID, 0755, p->subvolumes); + if (r < 0) + return log_error_errno(r,
"Failed to create parent directory: %m"); + + pfd = chase_and_open(dn, root, CHASE_PREFIX_ROOT, O_RDONLY|O_DIRECTORY|O_CLOEXEC, NULL); + if (pfd < 0) + return log_error_errno(pfd, "Failed to open parent directory of target: %m"); + + tfd = openat(pfd, fn, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC, 0700); + if (tfd < 0) + return log_error_errno(errno, "Failed to create target file '%s': %m", *target); + + /* On failure, report the source below arg_copy_source and the target below root, exactly as the + * directory branch above does. */ + r = copy_bytes(sfd, tfd, UINT64_MAX, COPY_REFLINK|COPY_HOLES|COPY_SIGINT|COPY_TRUNCATE); + if (r < 0) + return log_error_errno(r, "Failed to copy '%s%s' to '%s%s': %m", + strempty(arg_copy_source), *source, strempty(root), *target); + + (void) copy_xattr(sfd, NULL, tfd, NULL, COPY_ALL_XATTRS); + (void) copy_access(sfd, tfd); + (void) copy_times(sfd, tfd, 0); + } + } + + return 0; +} + +static int do_make_directories(Partition *p, const char *root) { + int r; + + assert(p); + assert(root); + + STRV_FOREACH(d, p->make_directories) { + r = mkdir_p_root(root, *d, UID_INVALID, GID_INVALID, 0755, p->subvolumes); + if (r < 0) + return log_error_errno(r, "Failed to create directory '%s' in file system: %m", *d); + } + + return 0; +} + +static bool partition_needs_populate(Partition *p) { + assert(p); + return !strv_isempty(p->copy_files) || !strv_isempty(p->make_directories); +} + +static int partition_populate_directory(Context *context, Partition *p, char **ret) { + _cleanup_(rm_rf_physical_and_freep) char *root = NULL; + const char *vt; + int r; + + assert(ret); + + log_info("Populating %s filesystem.", p->format); + + r = var_tmp_dir(&vt); + if (r < 0) + return log_error_errno(r, "Could not determine temporary directory: %m"); + + r = tempfn_random_child(vt, "repart", &root); + if (r < 0) + return log_error_errno(r, "Failed to generate temporary directory: %m"); + + r = mkdir(root, 0755); + if (r < 0) + return log_error_errno(errno, "Failed to create temporary directory: %m"); + + r = do_copy_files(context, p, root); + if (r < 0) + return r; + + r = do_make_directories(p, root); + if (r < 0) +
return r; + + log_info("Successfully populated %s filesystem.", p->format); + + *ret = TAKE_PTR(root); + return 0; +} + +static int partition_populate_filesystem(Context *context, Partition *p, const char *node) { + int r; + + assert(p); + assert(node); + + log_info("Populating %s filesystem.", p->format); + + /* We copy in a child process, since we have to mount the fs for that, and we don't want that fs to + * appear in the host namespace. Hence we fork a child that has its own file system namespace and + * detached mount propagation. */ + + r = safe_fork("(sd-copy)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL); + if (r < 0) + return r; + if (r == 0) { + static const char fs[] = "/run/systemd/mount-root"; + /* This is a child process with its own mount namespace and propagation to host turned off */ + + r = mkdir_p(fs, 0700); + if (r < 0) { + log_error_errno(r, "Failed to create mount point: %m"); + _exit(EXIT_FAILURE); + } + + if (mount_nofollow_verbose(LOG_ERR, node, fs, p->format, MS_NOATIME|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL) < 0) + _exit(EXIT_FAILURE); + + if (do_copy_files(context, p, fs) < 0) + _exit(EXIT_FAILURE); + + if (do_make_directories(p, fs) < 0) + _exit(EXIT_FAILURE); + + r = syncfs_path(AT_FDCWD, fs); + if (r < 0) { + log_error_errno(r, "Failed to synchronize written files: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + log_info("Successfully populated %s filesystem.", p->format); + return 0; +} + +static int context_mkfs(Context *context) { + int r; + + assert(context); + + /* Make a file system */ + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_(rm_rf_physical_and_freep) char *root = NULL; + _cleanup_(partition_target_freep) PartitionTarget *t = NULL; + _cleanup_strv_free_ char **extra_mkfs_options = NULL; + + if (p->dropped) + continue; + + if (PARTITION_EXISTS(p)) /* Never format existing partitions */ + continue; + + if (!p->format) + continue; + + /* Minimized 
partitions will use the copy blocks logic so let's make sure to skip those here. */ + if (p->copy_blocks_fd >= 0) + continue; + + if (partition_type_defer(&p->type)) + continue; + + assert(p->offset != UINT64_MAX); + assert(p->new_size != UINT64_MAX); + assert(p->new_size >= (p->encrypt != ENCRYPT_OFF ? LUKS2_METADATA_KEEP_FREE : 0)); + + /* If we're doing encryption, we make sure we keep free space at the end which is required + * for cryptsetup's offline encryption. */ + r = partition_target_prepare(context, p, + p->new_size - (p->encrypt != ENCRYPT_OFF ? LUKS2_METADATA_KEEP_FREE : 0), + /*need_path=*/ true, + &t); + if (r < 0) + return r; + + if (p->encrypt != ENCRYPT_OFF && t->loop) { + r = partition_target_grow(t, p->new_size); + if (r < 0) + return r; + + r = partition_encrypt(context, p, t, /* offline = */ false); + if (r < 0) + return log_error_errno(r, "Failed to encrypt device: %m"); + } + + log_info("Formatting future partition %" PRIu64 ".", p->partno); + + /* If we're not writing to a loop device or if we're populating a read-only filesystem, we + * have to populate using the filesystem's mkfs's --root (or equivalent) option. To do that, + * we need to set up the final directory tree beforehand. 
*/ + + if (partition_needs_populate(p) && (!t->loop || fstype_is_ro(p->format))) { + if (!mkfs_supports_root_option(p->format)) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), + "Loop device access is required to populate %s filesystems.", + p->format); + + r = partition_populate_directory(context, p, &root); + if (r < 0) + return r; + } + + r = mkfs_options_from_env("REPART", p->format, &extra_mkfs_options); + if (r < 0) + return log_error_errno(r, + "Failed to determine mkfs command line options for '%s': %m", + p->format); + + r = make_filesystem(partition_target_path(t), p->format, strempty(p->new_label), root, + p->fs_uuid, arg_discard, /* quiet = */ false, + context->fs_sector_size, extra_mkfs_options); + if (r < 0) + return r; + + /* The mkfs binary we invoked might have removed our temporary file when we're not operating + * on a loop device, so let's make sure we open the file again to make sure our file + * descriptor points to any potential new file. */ + + if (t->fd >= 0 && t->path && !t->loop) { + safe_close(t->fd); + t->fd = open(t->path, O_RDWR|O_CLOEXEC); + if (t->fd < 0) + return log_error_errno(errno, "Failed to reopen temporary file: %m"); + } + + log_info("Successfully formatted future partition %" PRIu64 ".", p->partno); + + /* If we're writing to a loop device, we can now mount the empty filesystem and populate it. */ + if (partition_needs_populate(p) && !root) { + assert(t->loop); + + r = partition_populate_filesystem(context, p, partition_target_path(t)); + if (r < 0) + return r; + } + + if (p->encrypt != ENCRYPT_OFF && !t->loop) { + r = partition_target_grow(t, p->new_size); + if (r < 0) + return r; + + r = partition_encrypt(context, p, t, /* offline = */ true); + if (r < 0) + return log_error_errno(r, "Failed to encrypt device: %m"); + } + + /* Note that we always sync explicitly here, since mkfs.fat doesn't do that on its own, and + * if we don't sync before detaching a block device the in-flight sectors possibly won't hit + * the disk. 
*/ + + r = partition_target_sync(context, p, t); + if (r < 0) + return r; + + if (p->siblings[VERITY_HASH] && !partition_type_defer(&p->siblings[VERITY_HASH]->type)) { + r = partition_format_verity_hash(context, p->siblings[VERITY_HASH], + /* node = */ NULL, partition_target_path(t)); + if (r < 0) + return r; + } + + if (p->siblings[VERITY_SIG] && !partition_type_defer(&p->siblings[VERITY_SIG]->type)) { + r = partition_format_verity_sig(context, p->siblings[VERITY_SIG]); + if (r < 0) + return r; + } + } + + return 0; +} + +static int parse_x509_certificate(const char *certificate, size_t certificate_size, X509 **ret) { +#if HAVE_OPENSSL + _cleanup_(X509_freep) X509 *cert = NULL; + _cleanup_(BIO_freep) BIO *cb = NULL; + + assert(certificate); + assert(certificate_size > 0); + assert(ret); + + cb = BIO_new_mem_buf(certificate, certificate_size); + if (!cb) + return log_oom(); + + cert = PEM_read_bio_X509(cb, NULL, NULL, NULL); + if (!cert) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to parse X.509 certificate: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (ret) + *ret = TAKE_PTR(cert); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL is not supported, cannot parse X509 certificate."); +#endif +} + +static int parse_private_key(const char *key, size_t key_size, EVP_PKEY **ret) { +#if HAVE_OPENSSL + _cleanup_(BIO_freep) BIO *kb = NULL; + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pk = NULL; + + assert(key); + assert(key_size > 0); + assert(ret); + + kb = BIO_new_mem_buf(key, key_size); + if (!kb) + return log_oom(); + + pk = PEM_read_bio_PrivateKey(kb, NULL, NULL, NULL); + if (!pk) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse PEM private key: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (ret) + *ret = TAKE_PTR(pk); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL is not supported, cannot parse private key."); +#endif +} + +static int 
partition_acquire_uuid(Context *context, Partition *p, sd_id128_t *ret) { + struct { + sd_id128_t type_uuid; + uint64_t counter; + } _packed_ plaintext = {}; + union { + uint8_t md[SHA256_DIGEST_SIZE]; + sd_id128_t id; + } result; + + uint64_t k = 0; + int r; + + assert(context); + assert(p); + assert(ret); + + /* Calculate a good UUID for the indicated partition. We want a certain degree of reproducibility, + * hence we won't generate the UUIDs randomly. Instead we use a cryptographic hash (precisely: + * HMAC-SHA256) to derive them from a single seed. The seed is generally the machine ID of the + * installation we are processing, but if random behaviour is desired can be random, too. We use the + * seed value as key for the HMAC (since the machine ID is something we generally don't want to leak) + * and the partition type as plaintext. The partition type is suffixed with a counter (only for the + * second and later partition of the same type) if we have more than one partition of the same + * type. Or in other words: + * + * With: + * SEED := /etc/machine-id + * + * If first partition instance of type TYPE_UUID: + * PARTITION_UUID := HMAC-SHA256(SEED, TYPE_UUID) + * + * For all later partition instances of type TYPE_UUID with INSTANCE being the LE64 encoded instance number: + * PARTITION_UUID := HMAC-SHA256(SEED, TYPE_UUID || INSTANCE) + */ + + /* Count the partitions of the same type that precede p in the list; k is p's instance number. */ + LIST_FOREACH(partitions, q, context->partitions) { + if (p == q) + break; + + if (!sd_id128_equal(p->type.uuid, q->type.uuid)) + continue; + + k++; + } + + plaintext.type_uuid = p->type.uuid; + plaintext.counter = htole64(k); + + hmac_sha256(context->seed.bytes, sizeof(context->seed.bytes), + &plaintext, + k == 0 ?
sizeof(sd_id128_t) : sizeof(plaintext), + result.md); + + /* Take the first half, mark it as v4 UUID */ + assert_cc(sizeof(result.md) == sizeof(result.id) * 2); + result.id = id128_make_v4_uuid(result.id); + + /* Ensure this partition UUID is actually unique, and there's no remaining partition from an earlier run? */ + LIST_FOREACH(partitions, q, context->partitions) { + if (p == q) + continue; + + if (sd_id128_in_set(result.id, q->current_uuid, q->new_uuid)) { + log_warning("Partition UUID calculated from seed for partition %" PRIu64 " already used, reverting to randomized UUID.", p->partno); + + r = sd_id128_randomize(&result.id); + if (r < 0) + return log_error_errno(r, "Failed to generate randomized UUID: %m"); + + break; + } + } + + *ret = result.id; + return 0; +} + +static int partition_acquire_label(Context *context, Partition *p, char **ret) { + _cleanup_free_ char *label = NULL; + const char *prefix; + unsigned k = 1; + + assert(context); + assert(p); + assert(ret); + + prefix = gpt_partition_type_uuid_to_string(p->type.uuid); + if (!prefix) + prefix = "linux"; + + for (;;) { + const char *ll = label ?: prefix; + bool retry = false; + + LIST_FOREACH(partitions, q, context->partitions) { + if (p == q) + break; + + if (streq_ptr(ll, q->current_label) || + streq_ptr(ll, q->new_label)) { + retry = true; + break; + } + } + + if (!retry) + break; + + label = mfree(label); + if (asprintf(&label, "%s-%u", prefix, ++k) < 0) + return log_oom(); + } + + if (!label) { + label = strdup(prefix); + if (!label) + return log_oom(); + } + + *ret = TAKE_PTR(label); + return 0; +} + +static int context_acquire_partition_uuids_and_labels(Context *context) { + int r; + + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) { + sd_id128_t uuid; + + /* Never touch foreign partitions */ + if (PARTITION_IS_FOREIGN(p)) { + p->new_uuid = p->current_uuid; + + if (p->current_label) { + r = free_and_strdup_warn(&p->new_label, strempty(p->current_label)); + if (r < 0) + 
return r; + } + + continue; + } + + if (!sd_id128_is_null(p->current_uuid)) + p->new_uuid = uuid = p->current_uuid; /* Never change initialized UUIDs */ + else if (p->new_uuid_is_set) + uuid = p->new_uuid; + else { + /* Not explicitly set by user! */ + r = partition_acquire_uuid(context, p, &uuid); + if (r < 0) + return r; + + /* The final verity hash/data UUIDs can only be determined after formatting the + * verity hash partition. However, we still want to use the generated partition UUID + * to derive other UUIDs to keep things unique and reproducible, so we always + * generate a UUID if none is set, but we only use it as the actual partition UUID if + * verity is not configured. */ + if (!IN_SET(p->verity, VERITY_DATA, VERITY_HASH)) { + p->new_uuid = uuid; + p->new_uuid_is_set = true; + } + } + + /* Calculate the UUID for the file system as HMAC-SHA256 of the string "file-system-uuid", + * keyed off the partition UUID. */ + r = derive_uuid(uuid, "file-system-uuid", &p->fs_uuid); + if (r < 0) + return r; + + if (p->encrypt != ENCRYPT_OFF) { + r = derive_uuid(uuid, "luks-uuid", &p->luks_uuid); + if (r < 0) + return r; + } + + /* Derive the verity salt and verity superblock UUID from the seed to keep them reproducible */ + if (p->verity == VERITY_HASH) { + derive_salt(context->seed, "verity-salt", p->verity_salt); + + r = derive_uuid(context->seed, "verity-uuid", &p->verity_uuid); + if (r < 0) + return log_error_errno(r, "Failed to acquire verity uuid: %m"); + } + + if (!isempty(p->current_label)) { + /* never change initialized labels */ + r = free_and_strdup_warn(&p->new_label, p->current_label); + if (r < 0) + return r; + } else if (!p->new_label) { + /* Not explicitly set by user! 
*/ + + r = partition_acquire_label(context, p, &p->new_label); + if (r < 0) + return r; + } + } + + return 0; +} + +static int set_gpt_flags(struct fdisk_partition *q, uint64_t flags) { + _cleanup_free_ char *a = NULL; + + for (unsigned i = 0; i < sizeof(flags) * 8; i++) { + uint64_t bit = UINT64_C(1) << i; + char buf[DECIMAL_STR_MAX(unsigned)+1]; + + if (!FLAGS_SET(flags, bit)) + continue; + + xsprintf(buf, "%u", i); + if (!strextend_with_separator(&a, ",", buf)) + return -ENOMEM; + } + + return fdisk_partition_set_attrs(q, a); +} + +static uint64_t partition_merge_flags(Partition *p) { + uint64_t f; + + assert(p); + + f = p->gpt_flags; + + if (p->no_auto >= 0) { + if (gpt_partition_type_knows_no_auto(p->type)) + SET_FLAG(f, SD_GPT_FLAG_NO_AUTO, p->no_auto); + else { + char buffer[SD_ID128_UUID_STRING_MAX]; + log_warning("Configured NoAuto=%s for partition type '%s' that doesn't support it, ignoring.", + yes_no(p->no_auto), + gpt_partition_type_uuid_to_string_harder(p->type.uuid, buffer)); + } + } + + if (p->read_only >= 0) { + if (gpt_partition_type_knows_read_only(p->type)) + SET_FLAG(f, SD_GPT_FLAG_READ_ONLY, p->read_only); + else { + char buffer[SD_ID128_UUID_STRING_MAX]; + log_warning("Configured ReadOnly=%s for partition type '%s' that doesn't support it, ignoring.", + yes_no(p->read_only), + gpt_partition_type_uuid_to_string_harder(p->type.uuid, buffer)); + } + } + + if (p->growfs >= 0) { + if (gpt_partition_type_knows_growfs(p->type)) + SET_FLAG(f, SD_GPT_FLAG_GROWFS, p->growfs); + else { + char buffer[SD_ID128_UUID_STRING_MAX]; + log_warning("Configured GrowFileSystem=%s for partition type '%s' that doesn't support it, ignoring.", + yes_no(p->growfs), + gpt_partition_type_uuid_to_string_harder(p->type.uuid, buffer)); + } + } + + return f; +} + +static int context_mangle_partitions(Context *context) { + int r; + + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) { + if (p->dropped) + continue; + + if (partition_type_defer(&p->type)) + 
continue; + + assert(p->new_size != UINT64_MAX); + assert(p->offset != UINT64_MAX); + assert(p->partno != UINT64_MAX); + + if (PARTITION_EXISTS(p)) { + bool changed = false; + + assert(p->current_partition); + + if (p->new_size != p->current_size) { + assert(p->new_size >= p->current_size); + assert(p->new_size % context->sector_size == 0); + + r = fdisk_partition_size_explicit(p->current_partition, true); + if (r < 0) + return log_error_errno(r, "Failed to enable explicit sizing: %m"); + + r = fdisk_partition_set_size(p->current_partition, p->new_size / context->sector_size); + if (r < 0) + return log_error_errno(r, "Failed to grow partition: %m"); + + log_info("Growing existing partition %" PRIu64 ".", p->partno); + changed = true; + } + + if (!sd_id128_equal(p->new_uuid, p->current_uuid)) { + r = fdisk_partition_set_uuid(p->current_partition, SD_ID128_TO_UUID_STRING(p->new_uuid)); + if (r < 0) + return log_error_errno(r, "Failed to set partition UUID: %m"); + + log_info("Initializing UUID of existing partition %" PRIu64 ".", p->partno); + changed = true; + } + + if (!streq_ptr(p->new_label, p->current_label)) { + r = fdisk_partition_set_name(p->current_partition, strempty(p->new_label)); + if (r < 0) + return log_error_errno(r, "Failed to set partition label: %m"); + + log_info("Setting partition label of existing partition %" PRIu64 ".", p->partno); + changed = true; + } + + if (changed) { + assert(!PARTITION_IS_FOREIGN(p)); /* never touch foreign partitions */ + + r = fdisk_set_partition(context->fdisk_context, p->partno, p->current_partition); + if (r < 0) + return log_error_errno(r, "Failed to update partition: %m"); + } + } else { + _cleanup_(fdisk_unref_partitionp) struct fdisk_partition *q = NULL; + _cleanup_(fdisk_unref_parttypep) struct fdisk_parttype *t = NULL; + + assert(!p->new_partition); + assert(p->offset % context->sector_size == 0); + assert(p->new_size % context->sector_size == 0); + assert(p->new_label); + + t = fdisk_new_parttype(); + if (!t) 
+ return log_oom(); + + r = fdisk_parttype_set_typestr(t, SD_ID128_TO_UUID_STRING(p->type.uuid)); + if (r < 0) + return log_error_errno(r, "Failed to initialize partition type: %m"); + + q = fdisk_new_partition(); + if (!q) + return log_oom(); + + r = fdisk_partition_set_type(q, t); + if (r < 0) + return log_error_errno(r, "Failed to set partition type: %m"); + + r = fdisk_partition_size_explicit(q, true); + if (r < 0) + return log_error_errno(r, "Failed to enable explicit sizing: %m"); + + r = fdisk_partition_set_start(q, p->offset / context->sector_size); + if (r < 0) + return log_error_errno(r, "Failed to position partition: %m"); + + r = fdisk_partition_set_size(q, p->new_size / context->sector_size); + if (r < 0) + return log_error_errno(r, "Failed to grow partition: %m"); + + r = fdisk_partition_set_partno(q, p->partno); + if (r < 0) + return log_error_errno(r, "Failed to set partition number: %m"); + + r = fdisk_partition_set_uuid(q, SD_ID128_TO_UUID_STRING(p->new_uuid)); + if (r < 0) + return log_error_errno(r, "Failed to set partition UUID: %m"); + + r = fdisk_partition_set_name(q, strempty(p->new_label)); + if (r < 0) + return log_error_errno(r, "Failed to set partition label: %m"); + + /* Merge the no auto + read only + growfs setting with the literal flags, and set them for the partition */ + r = set_gpt_flags(q, partition_merge_flags(p)); + if (r < 0) + return log_error_errno(r, "Failed to set GPT partition flags: %m"); + + log_info("Adding new partition %" PRIu64 " to partition table.", p->partno); + + r = fdisk_add_partition(context->fdisk_context, q, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add partition: %m"); + + assert(!p->new_partition); + p->new_partition = TAKE_PTR(q); + } + } + + return 0; +} + +static int split_name_printf(Partition *p, char **ret) { + assert(p); + + const Specifier table[] = { + { 't', specifier_string, GPT_PARTITION_TYPE_UUID_TO_STRING_HARDER(p->type.uuid) }, + { 'T', specifier_id128, &p->type.uuid }, + { 
'U', specifier_id128, &p->new_uuid }, + { 'n', specifier_uint64, &p->partno }, + + COMMON_SYSTEM_SPECIFIERS, + {} + }; + + return specifier_printf(p->split_name_format, NAME_MAX, table, arg_root, p, ret); +} + +static int split_node(const char *node, char **ret_base, char **ret_ext) { + _cleanup_free_ char *base = NULL, *ext = NULL; + char *e; + int r; + + assert(node); + assert(ret_base); + assert(ret_ext); + + r = path_extract_filename(node, &base); + if (r == O_DIRECTORY || r == -EADDRNOTAVAIL) + return log_error_errno(r, "Device node %s cannot be a directory", node); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from %s: %m", node); + + e = endswith(base, ".raw"); + if (e) { + ext = strdup(e); + if (!ext) + return log_oom(); + + *e = 0; + } + + *ret_base = TAKE_PTR(base); + *ret_ext = TAKE_PTR(ext); + + return 0; +} + +static int split_name_resolve(Context *context) { + _cleanup_free_ char *parent = NULL, *base = NULL, *ext = NULL; + int r; + + assert(context); + + r = path_extract_directory(context->node, &parent); + if (r < 0 && r != -EDESTADDRREQ) + return log_error_errno(r, "Failed to extract directory from %s: %m", context->node); + + r = split_node(context->node, &base, &ext); + if (r < 0) + return r; + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_free_ char *resolved = NULL; + + if (p->dropped) + continue; + + if (!p->split_name_format) + continue; + + r = split_name_printf(p, &resolved); + if (r < 0) + return log_error_errno(r, "Failed to resolve specifiers in %s: %m", p->split_name_format); + + if (parent) + p->split_path = strjoin(parent, "/", base, ".", resolved, ext); + else + p->split_path = strjoin(base, ".", resolved, ext); + if (!p->split_path) + return log_oom(); + } + + LIST_FOREACH(partitions, p, context->partitions) { + if (!p->split_path) + continue; + + LIST_FOREACH(partitions, q, context->partitions) { + if (p == q) + continue; + + if (!q->split_path) + continue; + + if (!streq(p->split_path, 
q->split_path)) + continue; + + return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), + "%s and %s have the same resolved split name \"%s\", refusing", + p->definition_path, q->definition_path, p->split_path); + } + } + + return 0; +} + +/* If arg_split is set, resolves the split file names and copies every partition that is neither dropped nor + * deferred and has a split path from the disk image into its own, newly created file. */ +static int context_split(Context *context) { + int fd = -EBADF, r; + + if (!arg_split) + return 0; + + assert(context); + + /* We can't do resolution earlier because the partition UUIDs for verity partitions are only filled + * in after they've been generated. */ + + r = split_name_resolve(context); + if (r < 0) + return r; + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_close_ int fdt = -EBADF; + + if (p->dropped) + continue; + + if (!p->split_path) + continue; + + if (partition_type_defer(&p->type)) + continue; + + /* open() returns -1 on failure and reports the cause via errno, hence log errno, not fdt. */ + fdt = open(p->split_path, O_WRONLY|O_NOCTTY|O_CLOEXEC|O_NOFOLLOW|O_CREAT|O_EXCL, 0666); + if (fdt < 0) + return log_error_errno(errno, "Failed to open split partition file %s: %m", p->split_path); + + if (fd < 0) + assert_se((fd = fdisk_get_devfd(context->fdisk_context)) >= 0); + + if (lseek(fd, p->offset, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek to partition offset: %m"); + + r = copy_bytes(fd, fdt, p->new_size, COPY_REFLINK|COPY_HOLES|COPY_TRUNCATE); + if (r < 0) + return log_error_errno(r, "Failed to copy to split partition %s: %m", p->split_path); + } + + return 0; +} + +static int context_write_partition_table(Context *context) { + _cleanup_(fdisk_unref_tablep) struct fdisk_table *original_table = NULL; + int capable, r; + + assert(context); + + if (!context->from_scratch && !context_changed(context)) { + log_info("No changes."); + return 0; + } + + if (arg_dry_run) { + log_notice("Refusing to repartition, please re-run with --dry-run=no."); + return 0; + } + + log_info("Applying changes to %s.", context->node); + + if (context->from_scratch && arg_empty != EMPTY_CREATE) { + /* Erase everything if we operate from scratch, except if the image was just created anyway, and thus is
definitely empty. */ + r = context_wipe_range(context, 0, context->total); + if (r < 0) + return r; + + log_info("Wiped block device."); + + if (arg_discard) { + r = context_discard_range(context, 0, context->total); + if (r == -EOPNOTSUPP) + log_info("Storage does not support discard, not discarding entire block device data."); + else if (r < 0) + return log_error_errno(r, "Failed to discard entire block device: %m"); + else if (r > 0) + log_info("Discarded entire block device."); + } + } + + r = fdisk_get_partitions(context->fdisk_context, &original_table); + if (r < 0) + return log_error_errno(r, "Failed to acquire partition table: %m"); + + /* Wipe fs signatures and discard sectors where the new partitions are going to be placed and in the + * gaps between partitions, just to be sure. */ + r = context_wipe_and_discard(context); + if (r < 0) + return r; + + r = context_copy_blocks(context); + if (r < 0) + return r; + + r = context_mkfs(context); + if (r < 0) + return r; + + r = context_mangle_partitions(context); + if (r < 0) + return r; + + log_info("Writing new partition table."); + + r = fdisk_write_disklabel(context->fdisk_context); + if (r < 0) + return log_error_errno(r, "Failed to write partition table: %m"); + + capable = blockdev_partscan_enabled(fdisk_get_devfd(context->fdisk_context)); + if (capable == -ENOTBLK) + log_debug("Not telling kernel to reread partition table, since we are not operating on a block device."); + else if (capable < 0) + return log_error_errno(capable, "Failed to check if block device supports partition scanning: %m"); + else if (capable > 0) { + log_info("Telling kernel to reread partition table."); + + if (context->from_scratch) + r = fdisk_reread_partition_table(context->fdisk_context); + else + r = fdisk_reread_changes(context->fdisk_context, original_table); + if (r < 0) + return log_error_errno(r, "Failed to reread partition table: %m"); + } else + log_notice("Not telling kernel to reread partition table, because selected 
image does not support kernel partition block devices."); + + log_info("All done."); + + return 0; +} + +static int context_read_seed(Context *context, const char *root) { + int r; + + assert(context); + + if (!sd_id128_is_null(context->seed)) + return 0; + + if (!arg_randomize) { + r = id128_get_machine(root, &context->seed); + if (r >= 0) + return 0; + + if (!ERRNO_IS_MACHINE_ID_UNSET(r)) + return log_error_errno(r, "Failed to parse machine ID of image: %m"); + + log_info("No machine ID set, using randomized partition UUIDs."); + } + + r = sd_id128_randomize(&context->seed); + if (r < 0) + return log_error_errno(r, "Failed to generate randomized seed: %m"); + + return 0; +} + +static int context_factory_reset(Context *context) { + size_t n = 0; + int r; + + assert(context); + + if (arg_factory_reset <= 0) + return 0; + + if (context->from_scratch) /* Nothing to reset if we start from scratch */ + return 0; + + if (arg_dry_run) { + log_notice("Refusing to factory reset, please re-run with --dry-run=no."); + return 0; + } + + log_info("Applying factory reset."); + + LIST_FOREACH(partitions, p, context->partitions) { + + if (!p->factory_reset || !PARTITION_EXISTS(p)) + continue; + + assert(p->partno != UINT64_MAX); + + log_info("Removing partition %" PRIu64 " for factory reset.", p->partno); + + r = fdisk_delete_partition(context->fdisk_context, p->partno); + if (r < 0) + return log_error_errno(r, "Failed to remove partition %" PRIu64 ": %m", p->partno); + + n++; + } + + if (n == 0) { + log_info("Factory reset requested, but no partitions to delete found."); + return 0; + } + + r = fdisk_write_disklabel(context->fdisk_context); + if (r < 0) + return log_error_errno(r, "Failed to write disk label: %m"); + + log_info("Successfully deleted %zu partitions.", n); + return 1; +} + +static int context_can_factory_reset(Context *context) { + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) + if (p->factory_reset && PARTITION_EXISTS(p)) + return true; + 
+ return false; +} + +static int resolve_copy_blocks_auto_candidate( + dev_t partition_devno, + GptPartitionType partition_type, + dev_t restrict_devno, + sd_id128_t *ret_uuid) { + + _cleanup_(blkid_free_probep) blkid_probe b = NULL; + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *p = NULL; + const char *pttype, *t; + sd_id128_t pt_parsed, u; + blkid_partition pp; + dev_t whole_devno; + blkid_partlist pl; + int r; + + /* Checks if the specified partition has the specified GPT type UUID, and is located on the specified + * 'restrict_devno' device. The type check is particularly relevant if we have Verity volume which is + * backed by two separate partitions: the data and the hash partitions, and we need to find the right + * one of the two. */ + + r = block_get_whole_disk(partition_devno, &whole_devno); + if (r < 0) + return log_error_errno( + r, + "Unable to determine containing block device of partition %u:%u: %m", + major(partition_devno), minor(partition_devno)); + + if (restrict_devno != (dev_t) -1 && + restrict_devno != whole_devno) + return log_error_errno( + SYNTHETIC_ERRNO(EPERM), + "Partition %u:%u is located outside of block device %u:%u, refusing.", + major(partition_devno), minor(partition_devno), + major(restrict_devno), minor(restrict_devno)); + + fd = r = device_open_from_devnum(S_IFBLK, whole_devno, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &p); + if (r < 0) + return log_error_errno(r, "Failed to open block device " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(whole_devno)); + + b = blkid_new_probe(); + if (!b) + return log_oom(); + + errno = 0; + r = blkid_probe_set_device(b, fd, 0, 0); + if (r != 0) + return log_error_errno(errno_or_else(ENOMEM), "Failed to open block device '%s': %m", p); + + (void) blkid_probe_enable_partitions(b, 1); + (void) blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS); + + errno = 0; + r = blkid_do_safeprobe(b); + if (r == _BLKID_SAFEPROBE_ERROR) + return log_error_errno(errno_or_else(EIO), "Unable to probe 
for partition table of '%s': %m", p); + if (IN_SET(r, _BLKID_SAFEPROBE_AMBIGUOUS, _BLKID_SAFEPROBE_NOT_FOUND)) { + log_debug("Didn't find partition table on block device '%s'.", p); + return false; + } + + assert(r == _BLKID_SAFEPROBE_FOUND); + + (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL); + if (!streq_ptr(pttype, "gpt")) { + log_debug("Didn't find a GPT partition table on '%s'.", p); + return false; + } + + errno = 0; + pl = blkid_probe_get_partitions(b); + if (!pl) + return log_error_errno(errno_or_else(EIO), "Unable read partition table of '%s': %m", p); + + pp = blkid_partlist_devno_to_partition(pl, partition_devno); + if (!pp) { + log_debug("Partition %u:%u has no matching partition table entry on '%s'.", + major(partition_devno), minor(partition_devno), p); + return false; + } + + t = blkid_partition_get_type_string(pp); + if (isempty(t)) { + log_debug("Partition %u:%u has no type on '%s'.", + major(partition_devno), minor(partition_devno), p); + return false; + } + + r = sd_id128_from_string(t, &pt_parsed); + if (r < 0) { + log_debug_errno(r, "Failed to parse partition type \"%s\": %m", t); + return false; + } + + if (!sd_id128_equal(pt_parsed, partition_type.uuid)) { + log_debug("Partition %u:%u has non-matching partition type " SD_ID128_FORMAT_STR " (needed: " SD_ID128_FORMAT_STR "), ignoring.", + major(partition_devno), minor(partition_devno), + SD_ID128_FORMAT_VAL(pt_parsed), SD_ID128_FORMAT_VAL(partition_type.uuid)); + return false; + } + + r = blkid_partition_get_uuid_id128(pp, &u); + if (r == -ENXIO) { + log_debug_errno(r, "Partition " DEVNUM_FORMAT_STR " has no UUID.", DEVNUM_FORMAT_VAL(partition_devno)); + return false; + } + if (r < 0) { + log_debug_errno(r, "Failed to read partition UUID of " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(partition_devno)); + return false; + } + + log_debug("Automatically found partition " DEVNUM_FORMAT_STR " of right type " SD_ID128_FORMAT_STR ".", + DEVNUM_FORMAT_VAL(partition_devno), + 
SD_ID128_FORMAT_VAL(pt_parsed)); + + if (ret_uuid) + *ret_uuid = u; + + return true; +} + +static int find_backing_devno( + const char *path, + const char *root, + dev_t *ret) { + + _cleanup_free_ char *resolved = NULL; + int r; + + assert(path); + + r = chase(path, root, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r < 0) + return r; + + r = path_is_mount_point(resolved, NULL, 0); + if (r < 0) + return r; + if (r == 0) /* Not a mount point, then it's not a partition of its own, let's not automatically use it. */ + return -ENOENT; + + r = get_block_device(resolved, ret); + if (r < 0) + return r; + if (r == 0) /* Not backed by physical file system, we can't use this */ + return -ENOENT; + + return 0; +} + +static int resolve_copy_blocks_auto( + GptPartitionType type, + const char *root, + dev_t restrict_devno, + dev_t *ret_devno, + sd_id128_t *ret_uuid) { + + const char *try1 = NULL, *try2 = NULL; + char p[SYS_BLOCK_PATH_MAX("/slaves")]; + _cleanup_closedir_ DIR *d = NULL; + sd_id128_t found_uuid = SD_ID128_NULL; + dev_t devno, found = 0; + int r; + + /* Enforce some security restrictions: CopyBlocks=auto should not be an avenue to get outside of the + * --root=/--image= confinement. Specifically, refuse CopyBlocks= in combination with --root= at all, + * and restrict block device references in the --image= case to loopback block device we set up. + * + * restrict_devno contain the dev_t of the loop back device we operate on in case of --image=, and + * thus declares which device (and its partition subdevices) we shall limit access to. If + * restrict_devno is zero no device probing access shall be allowed at all (used for --root=) and if + * it is (dev_t) -1 then free access shall be allowed (if neither switch is used). 
*/ + + if (restrict_devno == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Automatic discovery of backing block devices not permitted in --root= mode, refusing."); + + /* Handles CopyBlocks=auto, and finds the right source partition to copy from. We look for matching + * partitions in the host, using the appropriate directory as key and ensuring that the partition + * type matches. */ + + if (type.designator == PARTITION_ROOT) + try1 = "/"; + else if (type.designator == PARTITION_USR) + try1 = "/usr/"; + else if (type.designator == PARTITION_ROOT_VERITY) + try1 = "/"; + else if (type.designator == PARTITION_USR_VERITY) + try1 = "/usr/"; + else if (type.designator == PARTITION_ESP) { + try1 = "/efi/"; + try2 = "/boot/"; + } else if (type.designator == PARTITION_XBOOTLDR) + try1 = "/boot/"; + else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Partition type " SD_ID128_FORMAT_STR " not supported from automatic source block device discovery.", + SD_ID128_FORMAT_VAL(type.uuid)); + + r = find_backing_devno(try1, root, &devno); + if (r == -ENOENT && try2) + r = find_backing_devno(try2, root, &devno); + if (r < 0) + return log_error_errno(r, "Failed to resolve automatic CopyBlocks= path for partition type " SD_ID128_FORMAT_STR ", sorry: %m", + SD_ID128_FORMAT_VAL(type.uuid)); + + xsprintf_sys_block_path(p, "/slaves", devno); + d = opendir(p); + if (d) { + struct dirent *de; + + for (;;) { + _cleanup_free_ char *q = NULL, *t = NULL; + sd_id128_t u; + dev_t sl; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + return log_error_errno(errno, "Failed to read directory '%s': %m", p); + + break; + } + + if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN)) + continue; + + q = path_join(p, de->d_name, "/dev"); + if (!q) + return log_oom(); + + r = read_one_line_file(q, &t); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", q); + + r = parse_devnum(t, &sl); + if (r < 0) { + log_debug_errno(r, "Failed to parse %s, ignoring: %m", q); 
+ continue; + } + if (major(sl) == 0) { + log_debug("Device backing %s is special, ignoring.", q); + continue; + } + + r = resolve_copy_blocks_auto_candidate(sl, type, restrict_devno, &u); + if (r < 0) + return r; + if (r > 0) { + /* We found a matching one! */ + if (found != 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), + "Multiple matching partitions found, refusing."); + + found = sl; + found_uuid = u; + } + } + } else if (errno != ENOENT) + return log_error_errno(errno, "Failed open %s: %m", p); + else { + r = resolve_copy_blocks_auto_candidate(devno, type, restrict_devno, &found_uuid); + if (r < 0) + return r; + if (r > 0) + found = devno; + } + + if (found == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), + "Unable to automatically discover suitable partition to copy blocks from."); + + if (ret_devno) + *ret_devno = found; + + if (ret_uuid) + *ret_uuid = found_uuid; + + return 0; +} + +static int context_open_copy_block_paths( + Context *context, + dev_t restrict_devno) { + + int r; + + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_close_ int source_fd = -EBADF; + _cleanup_free_ char *opened = NULL; + sd_id128_t uuid = SD_ID128_NULL; + uint64_t size; + struct stat st; + + if (p->copy_blocks_fd >= 0) + continue; + + assert(p->copy_blocks_size == UINT64_MAX); + + if (PARTITION_EXISTS(p)) /* Never copy over partitions that already exist! 
*/ + continue; + + if (p->copy_blocks_path) { + + source_fd = chase_and_open(p->copy_blocks_path, p->copy_blocks_root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &opened); + if (source_fd < 0) + return log_error_errno(source_fd, "Failed to open '%s': %m", p->copy_blocks_path); + + if (fstat(source_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat block copy file '%s': %m", opened); + + if (!S_ISREG(st.st_mode) && restrict_devno != (dev_t) -1) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Copying from block device node is not permitted in --image=/--root= mode, refusing."); + + } else if (p->copy_blocks_auto) { + dev_t devno = 0; /* Fake initialization to appease gcc. */ + + r = resolve_copy_blocks_auto(p->type, p->copy_blocks_root, restrict_devno, &devno, &uuid); + if (r < 0) + return r; + assert(devno != 0); + + source_fd = r = device_open_from_devnum(S_IFBLK, devno, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &opened); + if (r < 0) + return log_error_errno(r, "Failed to open automatically determined source block copy device " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(devno)); + + if (fstat(source_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat block copy file '%s': %m", opened); + } else + continue; + + if (S_ISDIR(st.st_mode)) { + _cleanup_free_ char *bdev = NULL; + dev_t devt; + + /* If the file is a directory, automatically find the backing block device */ + + if (major(st.st_dev) != 0) + devt = st.st_dev; + else { + /* Special support for btrfs */ + r = btrfs_get_block_device_fd(source_fd, &devt); + if (r == -EUCLEAN) + return btrfs_log_dev_root(LOG_ERR, r, opened); + if (r < 0) + return log_error_errno(r, "Unable to determine backing block device of '%s': %m", opened); + } + + safe_close(source_fd); + + source_fd = r = device_open_from_devnum(S_IFBLK, devt, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &bdev); + if (r < 0) + return log_error_errno(r, "Failed to open block device backing '%s': %m", opened); + + if (fstat(source_fd, &st) < 0) + 
return log_error_errno(errno, "Failed to stat block device '%s': %m", bdev); + } + + if (S_ISREG(st.st_mode)) + size = st.st_size; + else if (S_ISBLK(st.st_mode)) { + if (ioctl(source_fd, BLKGETSIZE64, &size) != 0) + return log_error_errno(errno, "Failed to determine size of block device to copy from: %m"); + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path to copy blocks from '%s' is not a regular file, block device or directory, refusing: %m", opened); + + if (size <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File to copy bytes from '%s' has zero size, refusing.", opened); + if (size % 512 != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "File to copy bytes from '%s' has size that is not multiple of 512, refusing.", opened); + + p->copy_blocks_fd = TAKE_FD(source_fd); + p->copy_blocks_size = size; + + free_and_replace(p->copy_blocks_path, opened); + + /* When copying from an existing partition copy that partitions UUID if none is configured explicitly */ + if (!p->new_uuid_is_set && !sd_id128_is_null(uuid)) { + p->new_uuid = uuid; + p->new_uuid_is_set = true; + } + } + + return 0; +} + +static int fd_apparent_size(int fd, uint64_t *ret) { + off_t initial = 0; + uint64_t size = 0; + + assert(fd >= 0); + assert(ret); + + initial = lseek(fd, 0, SEEK_CUR); + if (initial < 0) + return log_error_errno(errno, "Failed to get file offset: %m"); + + for (off_t off = 0;;) { + off_t r; + + r = lseek(fd, off, SEEK_DATA); + if (r < 0 && errno == ENXIO) + /* If errno == ENXIO, that means we've reached the final hole of the file and + * that hole isn't followed by more data. */ + break; + if (r < 0) + return log_error_errno(errno, "Failed to seek data in file from offset %"PRIi64": %m", off); + + off = r; /* Set the offset to the start of the data segment. */ + + /* After copying a potential hole, find the end of the data segment by looking for + * the next hole. If we get ENXIO, we're at EOF. 
*/ + r = lseek(fd, off, SEEK_HOLE); + if (r < 0) { + if (errno == ENXIO) + break; + return log_error_errno(errno, "Failed to seek hole in file from offset %"PRIi64": %m", off); + } + + size += r - off; + off = r; + } + + if (lseek(fd, initial, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to reset file offset: %m"); + + *ret = size; + + return 0; +} + +static int context_minimize(Context *context) { + const char *vt = NULL; + int r; + + assert(context); + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_(rm_rf_physical_and_freep) char *root = NULL; + _cleanup_(unlink_and_freep) char *temp = NULL; + _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; + _cleanup_strv_free_ char **extra_mkfs_options = NULL; + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *hint = NULL; + sd_id128_t fs_uuid; + struct stat st; + uint64_t fsz; + + if (p->dropped) + continue; + + if (PARTITION_EXISTS(p)) /* Never format existing partitions */ + continue; + + if (!p->format) + continue; + + if (p->copy_blocks_fd >= 0) + continue; + + if (p->minimize == MINIMIZE_OFF) + continue; + + if (!partition_needs_populate(p)) + continue; + + assert(!p->copy_blocks_path); + + (void) partition_hint(p, context->node, &hint); + + log_info("Pre-populating %s filesystem of partition %s twice to calculate minimal partition size", + p->format, strna(hint)); + + if (!vt) { + r = var_tmp_dir(&vt); + if (r < 0) + return log_error_errno(r, "Could not determine temporary directory: %m"); + } + + r = tempfn_random_child(vt, "repart", &temp); + if (r < 0) + return log_error_errno(r, "Failed to generate temporary file path: %m"); + + if (fstype_is_ro(p->format)) + fs_uuid = p->fs_uuid; + else { + fd = open(temp, O_CREAT|O_EXCL|O_CLOEXEC|O_RDWR|O_NOCTTY, 0600); + if (fd < 0) + return log_error_errno(errno, "Failed to open temporary file %s: %m", temp); + + /* This may seem huge but it will be created sparse so it doesn't take up any space + * on disk until written to. 
*/ + if (ftruncate(fd, 1024ULL * 1024ULL * 1024ULL * 1024ULL) < 0) + return log_error_errno(errno, "Failed to truncate temporary file to %s: %m", + FORMAT_BYTES(1024ULL * 1024ULL * 1024ULL * 1024ULL)); + + if (arg_offline <= 0) { + r = loop_device_make(fd, O_RDWR, 0, UINT64_MAX, context->sector_size, 0, LOCK_EX, &d); + if (r < 0 && (arg_offline == 0 || (r != -ENOENT && !ERRNO_IS_PRIVILEGE(r)) || !strv_isempty(p->subvolumes))) + return log_error_errno(r, "Failed to make loopback device of %s: %m", temp); + } + + /* We're going to populate this filesystem twice so use a random UUID the first time + * to avoid UUID conflicts. */ + r = sd_id128_randomize(&fs_uuid); + if (r < 0) + return r; + } + + if (!d || fstype_is_ro(p->format)) { + if (!mkfs_supports_root_option(p->format)) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), + "Loop device access is required to populate %s filesystems", + p->format); + + r = partition_populate_directory(context, p, &root); + if (r < 0) + return r; + } + + r = mkfs_options_from_env("REPART", p->format, &extra_mkfs_options); + if (r < 0) + return log_error_errno(r, + "Failed to determine mkfs command line options for '%s': %m", + p->format); + + r = make_filesystem(d ? d->node : temp, + p->format, + strempty(p->new_label), + root, + fs_uuid, + arg_discard, /* quiet = */ false, + context->fs_sector_size, + extra_mkfs_options); + if (r < 0) + return r; + + /* Read-only filesystems are minimal from the first try because they create and size the + * loopback file for us. 
*/ + if (fstype_is_ro(p->format)) { + assert(fd < 0); + + fd = open(temp, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return log_error_errno(errno, "Failed to open temporary file %s: %m", temp); + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat temporary file: %m"); + + log_info("Minimal partition size of %s filesystem of partition %s is %s", + p->format, strna(hint), FORMAT_BYTES(st.st_size)); + + p->copy_blocks_path = TAKE_PTR(temp); + p->copy_blocks_path_is_our_file = true; + p->copy_blocks_fd = TAKE_FD(fd); + p->copy_blocks_size = st.st_size; + continue; + } + + if (!root) { + assert(d); + + r = partition_populate_filesystem(context, p, d->node); + if (r < 0) + return r; + } + + /* Other filesystems need to be provided with a pre-sized loopback file and will adapt to + * fully occupy it. Because we gave the filesystem a 1T sparse file, we need to shrink the + * filesystem down to a reasonable size again to fit it in the disk image. While there are + * some filesystems that support shrinking, it doesn't always work properly (e.g. shrinking + * btrfs gives us a 2.0G filesystem regardless of what we put in it). Instead, let's populate + * the filesystem again, but this time, instead of providing the filesystem with a 1T sparse + * loopback file, let's size the loopback file based on the actual data used by the + * filesystem in the sparse file after the first attempt. This should be a good guess of the + * minimal amount of space needed in the filesystem to fit all the required data. + */ + r = fd_apparent_size(fd, &fsz); + if (r < 0) + return r; + + /* Massage the size a bit because just going by actual data used in the sparse file isn't + * fool-proof. */ + uint64_t heuristic = streq(p->format, "xfs") ? 
fsz : fsz / 2; + fsz = round_up_size(fsz + heuristic, context->grain_size); + if (minimal_size_by_fs_name(p->format) != UINT64_MAX) + fsz = MAX(minimal_size_by_fs_name(p->format), fsz); + + log_info("Minimal partition size of %s filesystem of partition %s is %s", + p->format, strna(hint), FORMAT_BYTES(fsz)); + + d = loop_device_unref(d); + + /* Erase the previous filesystem first. */ + if (ftruncate(fd, 0)) + return log_error_errno(errno, "Failed to erase temporary file: %m"); + + if (ftruncate(fd, fsz)) + return log_error_errno(errno, "Failed to truncate temporary file to %s: %m", FORMAT_BYTES(fsz)); + + if (arg_offline <= 0) { + r = loop_device_make(fd, O_RDWR, 0, UINT64_MAX, context->sector_size, 0, LOCK_EX, &d); + if (r < 0 && (arg_offline == 0 || (r != -ENOENT && !ERRNO_IS_PRIVILEGE(r)) || !strv_isempty(p->subvolumes))) + return log_error_errno(r, "Failed to make loopback device of %s: %m", temp); + } + + r = make_filesystem(d ? d->node : temp, + p->format, + strempty(p->new_label), + root, + p->fs_uuid, + arg_discard, + /* quiet = */ false, + context->fs_sector_size, + extra_mkfs_options); + if (r < 0) + return r; + + if (!root) { + assert(d); + + r = partition_populate_filesystem(context, p, d->node); + if (r < 0) + return r; + } + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat temporary file: %m"); + + p->copy_blocks_path = TAKE_PTR(temp); + p->copy_blocks_path_is_our_file = true; + p->copy_blocks_fd = TAKE_FD(fd); + p->copy_blocks_size = st.st_size; + } + + /* Now that we've done the data partitions, do the verity hash partitions. We do these in a separate + * step because they might depend on data generated in the previous step. 
*/ + + LIST_FOREACH(partitions, p, context->partitions) { + _cleanup_(unlink_and_freep) char *temp = NULL; + _cleanup_free_ char *hint = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + Partition *dp; + + if (p->dropped) + continue; + + if (PARTITION_EXISTS(p)) /* Never format existing partitions */ + continue; + + if (p->minimize == MINIMIZE_OFF) + continue; + + if (p->verity != VERITY_HASH) + continue; + + assert_se(dp = p->siblings[VERITY_DATA]); + assert(!dp->dropped); + assert(dp->copy_blocks_path); + + (void) partition_hint(p, context->node, &hint); + + log_info("Pre-populating verity hash data of partition %s to calculate minimal partition size", + strna(hint)); + + if (!vt) { + r = var_tmp_dir(&vt); + if (r < 0) + return log_error_errno(r, "Could not determine temporary directory: %m"); + } + + r = tempfn_random_child(vt, "repart", &temp); + if (r < 0) + return log_error_errno(r, "Failed to generate temporary file path: %m"); + + r = touch(temp); + if (r < 0) + return log_error_errno(r, "Failed to create temporary file: %m"); + + r = partition_format_verity_hash(context, p, temp, dp->copy_blocks_path); + if (r < 0) + return r; + + fd = open(temp, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return log_error_errno(errno, "Failed to open temporary file %s: %m", temp); + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat temporary file: %m"); + + log_info("Minimal partition size of verity hash partition %s is %s", + strna(hint), FORMAT_BYTES(st.st_size)); + + p->copy_blocks_path = TAKE_PTR(temp); + p->copy_blocks_path_is_our_file = true; + p->copy_blocks_fd = TAKE_FD(fd); + p->copy_blocks_size = st.st_size; + } + + return 0; +}
+
+/* Splits 'p' at commas with extract_first_word() and converts every word with
+ * gpt_partition_type_from_string(). Each resulting type is appended at index *n_partitions of the array
+ * *partitions (grown with GREEDY_REALLOC()), and *n_partitions is incremented accordingly, so entries
+ * already in the array stay in place.
+ *
+ * Returns 0 once extract_first_word() reports no further word. Returns the value of log_error_errno() if
+ * a word cannot be extracted or is not accepted as partition type, and log_oom() if growing the array
+ * fails. */
+static int parse_partition_types(const char *p, GptPartitionType **partitions, size_t *n_partitions) {
+        assert(partitions);
+        assert(n_partitions);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                GptPartitionType parsed;
+                int k;
+
+                k = extract_first_word(&p, &word, ",", EXTRACT_CUNESCAPE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to extract partition type identifier or GUID: %s", p);
+                if (k == 0)
+                        return 0; /* Input exhausted. */
+
+                k = gpt_partition_type_from_string(word, &parsed);
+                if (k < 0)
+                        return log_error_errno(k, "'%s' is not a valid partition type identifier or GUID", word);
+
+                /* Make room for one more element before storing it. */
+                if (!GREEDY_REALLOC(*partitions, *n_partitions + 1))
+                        return log_oom();
+
+                (*partitions)[*n_partitions] = parsed;
+                (*n_partitions)++;
+        }
+}
+
+static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-repart", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [DEVICE]\n" + "\n%sGrow and add partitions to partition table.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --dry-run=BOOL Whether to run dry-run operation\n" + " --empty=MODE One of refuse, allow, require, force, create; controls\n" + " how to handle empty disks lacking partition tables\n" + " --discard=BOOL Whether to discard backing blocks for new partitions\n" + " --pretty=BOOL Whether to show pretty summary before doing changes\n" + " --factory-reset=BOOL Whether to remove data partitions before recreating\n" + " them\n" + " --can-factory-reset Test whether factory reset is defined\n" + " --root=PATH Operate relative to root path\n" + " --image=PATH Operate relative to image file\n" + " --image-policy=POLICY\n" + " Specify disk image dissection policy\n" + " --definitions=DIR Find partition definitions in specified directory\n" + " --key-file=PATH Key to use when encrypting partitions\n" + " --private-key=PATH Private key to use when generating verity roothash\n" + " signatures\n" + " --certificate=PATH PEM certificate to use when generating verity\n" + " roothash signatures\n" + " --tpm2-device=PATH Path to TPM2 device node to use\n" + " 
--tpm2-device-key=PATH\n" + " Enroll a TPM2 device using its public key\n" + " --tpm2-seal-key-handle=HANDLE\n" + " Specify handle of key to use for sealing\n" + " --tpm2-pcrs=PCR1+PCR2+PCR3+…\n" + " TPM2 PCR indexes to use for TPM2 enrollment\n" + " --tpm2-public-key=PATH\n" + " Enroll signed TPM2 PCR policy against PEM public key\n" + " --tpm2-public-key-pcrs=PCR1+PCR2+PCR3+…\n" + " Enroll signed TPM2 PCR policy for specified TPM2 PCRs\n" + " --tpm2-pcrlock=PATH\n" + " Specify pcrlock policy to lock against\n" + " --seed=UUID 128-bit seed UUID to derive all UUIDs from\n" + " --size=BYTES Grow loopback file to specified size\n" + " --json=pretty|short|off\n" + " Generate JSON output\n" + " --split=BOOL Whether to generate split artifacts\n" + " --include-partitions=PARTITION1,PARTITION2,PARTITION3,…\n" + " Ignore partitions not of the specified types\n" + " --exclude-partitions=PARTITION1,PARTITION2,PARTITION3,…\n" + " Ignore partitions of the specified types\n" + " --defer-partitions=PARTITION1,PARTITION2,PARTITION3,…\n" + " Take partitions of the specified types into account\n" + " but don't populate them yet\n" + " --sector-size=SIZE Set the logical sector size for the image\n" + " --architecture=ARCH Set the generic architecture for the image\n" + " --offline=BOOL Whether to build the image offline\n" + " -s --copy-source=PATH Specify the primary source tree to copy files from\n" + " --copy-from=IMAGE Copy partitions from the given image(s)\n" + " -S --make-ddi=sysext Make a system extension DDI\n" + " -C --make-ddi=confext Make a configuration extension DDI\n" + " -P --make-ddi=portable Make a portable service DDI\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_DRY_RUN, + ARG_EMPTY, + ARG_DISCARD, + ARG_FACTORY_RESET, + ARG_CAN_FACTORY_RESET, + ARG_ROOT, 
+ ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_SEED, + ARG_PRETTY, + ARG_DEFINITIONS, + ARG_SIZE, + ARG_JSON, + ARG_KEY_FILE, + ARG_PRIVATE_KEY, + ARG_CERTIFICATE, + ARG_TPM2_DEVICE, + ARG_TPM2_DEVICE_KEY, + ARG_TPM2_SEAL_KEY_HANDLE, + ARG_TPM2_PCRS, + ARG_TPM2_PUBLIC_KEY, + ARG_TPM2_PUBLIC_KEY_PCRS, + ARG_TPM2_PCRLOCK, + ARG_SPLIT, + ARG_INCLUDE_PARTITIONS, + ARG_EXCLUDE_PARTITIONS, + ARG_DEFER_PARTITIONS, + ARG_SECTOR_SIZE, + ARG_SKIP_PARTITIONS, + ARG_ARCHITECTURE, + ARG_OFFLINE, + ARG_COPY_FROM, + ARG_MAKE_DDI, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "dry-run", required_argument, NULL, ARG_DRY_RUN }, + { "empty", required_argument, NULL, ARG_EMPTY }, + { "discard", required_argument, NULL, ARG_DISCARD }, + { "factory-reset", required_argument, NULL, ARG_FACTORY_RESET }, + { "can-factory-reset", no_argument, NULL, ARG_CAN_FACTORY_RESET }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "seed", required_argument, NULL, ARG_SEED }, + { "pretty", required_argument, NULL, ARG_PRETTY }, + { "definitions", required_argument, NULL, ARG_DEFINITIONS }, + { "size", required_argument, NULL, ARG_SIZE }, + { "json", required_argument, NULL, ARG_JSON }, + { "key-file", required_argument, NULL, ARG_KEY_FILE }, + { "private-key", required_argument, NULL, ARG_PRIVATE_KEY }, + { "certificate", required_argument, NULL, ARG_CERTIFICATE }, + { "tpm2-device", required_argument, NULL, ARG_TPM2_DEVICE }, + { "tpm2-device-key", required_argument, NULL, ARG_TPM2_DEVICE_KEY }, + { "tpm2-seal-key-handle", required_argument, NULL, ARG_TPM2_SEAL_KEY_HANDLE }, + { "tpm2-pcrs", required_argument, NULL, ARG_TPM2_PCRS }, + { "tpm2-public-key", required_argument, NULL, 
ARG_TPM2_PUBLIC_KEY }, + { "tpm2-public-key-pcrs", required_argument, NULL, ARG_TPM2_PUBLIC_KEY_PCRS }, + { "tpm2-pcrlock", required_argument, NULL, ARG_TPM2_PCRLOCK }, + { "split", required_argument, NULL, ARG_SPLIT }, + { "include-partitions", required_argument, NULL, ARG_INCLUDE_PARTITIONS }, + { "exclude-partitions", required_argument, NULL, ARG_EXCLUDE_PARTITIONS }, + { "defer-partitions", required_argument, NULL, ARG_DEFER_PARTITIONS }, + { "sector-size", required_argument, NULL, ARG_SECTOR_SIZE }, + { "architecture", required_argument, NULL, ARG_ARCHITECTURE }, + { "offline", required_argument, NULL, ARG_OFFLINE }, + { "copy-from", required_argument, NULL, ARG_COPY_FROM }, + { "copy-source", required_argument, NULL, 's' }, + { "make-ddi", required_argument, NULL, ARG_MAKE_DDI }, + {} + }; + + bool auto_hash_pcr_values = true, auto_public_key_pcr_mask = true, auto_pcrlock = true; + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hs:SCP", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_DRY_RUN: + r = parse_boolean_argument("--dry-run=", optarg, &arg_dry_run); + if (r < 0) + return r; + break; + + case ARG_EMPTY: + if (isempty(optarg)) { + arg_empty = EMPTY_UNSET; + break; + } + + arg_empty = empty_mode_from_string(optarg); + if (arg_empty < 0) + return log_error_errno(arg_empty, "Failed to parse --empty= parameter: %s", optarg); + + break; + + case ARG_DISCARD: + r = parse_boolean_argument("--discard=", optarg, &arg_discard); + if (r < 0) + return r; + break; + + case ARG_FACTORY_RESET: + r = parse_boolean_argument("--factory-reset=", optarg, NULL); + if (r < 0) + return r; + arg_factory_reset = r; + break; + + case ARG_CAN_FACTORY_RESET: + arg_can_factory_reset = true; + break; + + case ARG_ROOT: + r = 
parse_path_argument(optarg, /* suppress_root= */ false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case ARG_SEED: + if (isempty(optarg)) { + arg_seed = SD_ID128_NULL; + arg_randomize = false; + } else if (streq(optarg, "random")) + arg_randomize = true; + else { + r = sd_id128_from_string(optarg, &arg_seed); + if (r < 0) + return log_error_errno(r, "Failed to parse seed: %s", optarg); + + arg_randomize = false; + } + + break; + + case ARG_PRETTY: + r = parse_boolean_argument("--pretty=", optarg, NULL); + if (r < 0) + return r; + arg_pretty = r; + break; + + case ARG_DEFINITIONS: { + _cleanup_free_ char *path = NULL; + r = parse_path_argument(optarg, false, &path); + if (r < 0) + return r; + if (strv_consume(&arg_definitions, TAKE_PTR(path)) < 0) + return log_oom(); + break; + } + + case ARG_SIZE: { + uint64_t parsed, rounded; + + if (streq(optarg, "auto")) { + arg_size = UINT64_MAX; + arg_size_auto = true; + break; + } + + r = parse_size(optarg, 1024, &parsed); + if (r < 0) + return log_error_errno(r, "Failed to parse --size= parameter: %s", optarg); + + rounded = round_up_size(parsed, 4096); + if (rounded == 0) + return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Specified image size too small, refusing."); + if (rounded == UINT64_MAX) + return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Specified image size too large, refusing."); + + if (rounded != parsed) + log_warning("Specified size is not a multiple of 4096, rounding up automatically. 
(%" PRIu64 " %s %" PRIu64 ")", + parsed, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), rounded); + + arg_size = rounded; + arg_size_auto = false; + break; + } + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + + break; + + case ARG_KEY_FILE: { + _cleanup_(erase_and_freep) char *k = NULL; + size_t n = 0; + + r = read_full_file_full( + AT_FDCWD, optarg, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &k, &n); + if (r < 0) + return log_error_errno(r, "Failed to read key file '%s': %m", optarg); + + erase_and_free(arg_key); + arg_key = TAKE_PTR(k); + arg_key_size = n; + break; + } + + case ARG_PRIVATE_KEY: { + _cleanup_(erase_and_freep) char *k = NULL; + size_t n = 0; + + r = read_full_file_full( + AT_FDCWD, optarg, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE|READ_FULL_FILE_WARN_WORLD_READABLE|READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &k, &n); + if (r < 0) + return log_error_errno(r, "Failed to read key file '%s': %m", optarg); + + EVP_PKEY_free(arg_private_key); + arg_private_key = NULL; + r = parse_private_key(k, n, &arg_private_key); + if (r < 0) + return r; + break; + } + + case ARG_CERTIFICATE: { + _cleanup_free_ char *cert = NULL; + size_t n = 0; + + r = read_full_file_full( + AT_FDCWD, optarg, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &cert, &n); + if (r < 0) + return log_error_errno(r, "Failed to read certificate file '%s': %m", optarg); + + X509_free(arg_certificate); + arg_certificate = NULL; + r = parse_x509_certificate(cert, n, &arg_certificate); + if (r < 0) + return r; + break; + } + + case ARG_TPM2_DEVICE: { + _cleanup_free_ char *device = NULL; + + if (streq(optarg, "list")) + return tpm2_list_devices(); + + if (!streq(optarg, "auto")) { + device = strdup(optarg); + if (!device) + return log_oom(); + } + + free(arg_tpm2_device); + arg_tpm2_device = TAKE_PTR(device); + break; + } + + case 
ARG_TPM2_DEVICE_KEY: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_device_key); + if (r < 0) + return r; + + break; + + case ARG_TPM2_SEAL_KEY_HANDLE: + r = safe_atou32_full(optarg, 16, &arg_tpm2_seal_key_handle); + if (r < 0) + return log_error_errno(r, "Could not parse TPM2 seal key handle index '%s': %m", optarg); + + break; + + case ARG_TPM2_PCRS: + auto_hash_pcr_values = false; + r = tpm2_parse_pcr_argument_append(optarg, &arg_tpm2_hash_pcr_values, &arg_tpm2_n_hash_pcr_values); + if (r < 0) + return r; + + break; + + case ARG_TPM2_PUBLIC_KEY: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_public_key); + if (r < 0) + return r; + + break; + + case ARG_TPM2_PUBLIC_KEY_PCRS: + auto_public_key_pcr_mask = false; + r = tpm2_parse_pcr_argument_to_mask(optarg, &arg_tpm2_public_key_pcr_mask); + if (r < 0) + return r; + + break; + + case ARG_TPM2_PCRLOCK: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_tpm2_pcrlock); + if (r < 0) + return r; + + auto_pcrlock = false; + break; + + case ARG_SPLIT: + r = parse_boolean_argument("--split=", optarg, NULL); + if (r < 0) + return r; + + arg_split = r; + break; + + case ARG_INCLUDE_PARTITIONS: + if (arg_filter_partitions_type == FILTER_PARTITIONS_EXCLUDE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Combination of --include-partitions= and --exclude-partitions= is invalid."); + + r = parse_partition_types(optarg, &arg_filter_partitions, &arg_n_filter_partitions); + if (r < 0) + return r; + + arg_filter_partitions_type = FILTER_PARTITIONS_INCLUDE; + + break; + + case ARG_EXCLUDE_PARTITIONS: + if (arg_filter_partitions_type == FILTER_PARTITIONS_INCLUDE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Combination of --include-partitions= and --exclude-partitions= is invalid."); + + r = parse_partition_types(optarg, &arg_filter_partitions, &arg_n_filter_partitions); + if (r < 0) + return r; + + arg_filter_partitions_type = FILTER_PARTITIONS_EXCLUDE; + 
+ break; + + case ARG_DEFER_PARTITIONS: + r = parse_partition_types(optarg, &arg_defer_partitions, &arg_n_defer_partitions); + if (r < 0) + return r; + + break; + + case ARG_SECTOR_SIZE: + r = parse_sector_size(optarg, &arg_sector_size); + if (r < 0) + return r; + + break; + + case ARG_ARCHITECTURE: + r = architecture_from_string(optarg); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid architecture '%s'", optarg); + + arg_architecture = r; + break; + + case ARG_OFFLINE: + if (streq(optarg, "auto")) + arg_offline = -1; + else { + r = parse_boolean_argument("--offline=", optarg, NULL); + if (r < 0) + return r; + + arg_offline = r; + } + + break; + + case ARG_COPY_FROM: { + _cleanup_free_ char *p = NULL; + + r = parse_path_argument(optarg, /* suppress_root= */ false, &p); + if (r < 0) + return r; + + if (strv_consume(&arg_copy_from, TAKE_PTR(p)) < 0) + return log_oom(); + + break; + } + + case 's': + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_copy_source); + if (r < 0) + return r; + break; + + case ARG_MAKE_DDI: + if (!filename_is_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid DDI type: %s", optarg); + + r = free_and_strdup_warn(&arg_make_ddi, optarg); + if (r < 0) + return r; + break; + + case 'S': + r = free_and_strdup_warn(&arg_make_ddi, "sysext"); + if (r < 0) + return r; + break; + + case 'C': + r = free_and_strdup_warn(&arg_make_ddi, "confext"); + if (r < 0) + return r; + break; + + case 'P': + r = free_and_strdup_warn(&arg_make_ddi, "portable"); + if (r < 0) + return r; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (argc - optind > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Expected at most one argument, the path to the block device or image file."); + + if (arg_make_ddi) { + if (arg_definitions) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Combination of --make-ddi= and --definitions= is not supported."); + if 
(!IN_SET(arg_empty, EMPTY_UNSET, EMPTY_CREATE)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Combination of --make-ddi= and --empty=%s is not supported.", empty_mode_to_string(arg_empty)); + + /* Imply automatic sizing in DDI mode */ + if (arg_size == UINT64_MAX) + arg_size_auto = true; + + if (!arg_copy_source) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No --copy-source= specified, refusing."); + + r = dir_is_empty(arg_copy_source, /* ignore_hidden_or_backup= */ false); + if (r < 0) + return log_error_errno(r, "Failed to determine if '%s' is empty: %m", arg_copy_source); + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Source directory '%s' is empty, refusing to create empty image.", arg_copy_source); + + if (sd_id128_is_null(arg_seed) && !arg_randomize) { + /* We don't want that /etc/machine-id leaks into any image built this way, hence + * let's randomize the seed if not specified explicitly */ + log_notice("No seed value specified, randomizing generated UUIDs, resulting image will not be reproducible."); + arg_randomize = true; + } + + arg_empty = EMPTY_CREATE; + } + + if (arg_empty == EMPTY_UNSET) /* default to refuse mode, if not otherwise specified */ + arg_empty = EMPTY_REFUSE; + + if (arg_factory_reset > 0 && IN_SET(arg_empty, EMPTY_FORCE, EMPTY_REQUIRE, EMPTY_CREATE)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Combination of --factory-reset=yes and --empty=force/--empty=require/--empty=create is invalid."); + + if (arg_can_factory_reset) + arg_dry_run = true; /* When --can-factory-reset is specified we don't make changes, hence + * non-dry-run mode makes no sense. Thus, imply dry run mode so that we + * open things strictly read-only. */ + else if (arg_empty == EMPTY_CREATE) + arg_dry_run = false; /* Imply --dry-run=no if we create the loopback file anew. After all we + * cannot really break anyone's partition tables that way. */ + + /* Disable pager once we are not just reviewing, but doing things. 
*/ + if (!arg_dry_run) + arg_pager_flags |= PAGER_DISABLE; + + if (arg_empty == EMPTY_CREATE && arg_size == UINT64_MAX && !arg_size_auto) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "If --empty=create is specified, --size= must be specified, too."); + + if (arg_image && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + else if (!arg_image && !arg_root && in_initrd()) { + + /* By default operate on /sysusr/ or /sysroot/ when invoked in the initrd. We prefer the + * former, if it is mounted, so that we have deterministic behaviour on systems where /usr/ + * is vendor-supplied but the root fs formatted on first boot. */ + r = path_is_mount_point("/sysusr/usr", NULL, 0); + if (r <= 0) { + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Unable to determine whether /sysusr/usr is a mount point, assuming it is not: %m"); + + arg_root = strdup("/sysroot"); + } else + arg_root = strdup("/sysusr"); + if (!arg_root) + return log_oom(); + } + + arg_node = argc > optind ? 
argv[optind] : NULL; + + if (IN_SET(arg_empty, EMPTY_FORCE, EMPTY_REQUIRE, EMPTY_CREATE) && !arg_node && !arg_image) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "A path to a device node or image file must be specified when --make-ddi=, --empty=force, --empty=require or --empty=create are used."); + + if (arg_split && !arg_node) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "A path to an image file must be specified when --split is used."); + + if (auto_pcrlock) { + assert(!arg_tpm2_pcrlock); + + r = tpm2_pcrlock_search_file(NULL, NULL, &arg_tpm2_pcrlock); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Search for pcrlock.json failed, assuming it does not exist: %m"); + } else + log_debug("Automatically using pcrlock policy '%s'.", arg_tpm2_pcrlock); + } + + if (auto_public_key_pcr_mask) { + assert(arg_tpm2_public_key_pcr_mask == 0); + arg_tpm2_public_key_pcr_mask = INDEX_TO_MASK(uint32_t, TPM2_PCR_KERNEL_BOOT); + } + + if (auto_hash_pcr_values && !arg_tpm2_pcrlock) { /* Only lock to PCR 7 if no pcr policy is specified. 
*/ + assert(arg_tpm2_n_hash_pcr_values == 0); + + if (!GREEDY_REALLOC_APPEND( + arg_tpm2_hash_pcr_values, + arg_tpm2_n_hash_pcr_values, + &TPM2_PCR_VALUE_MAKE(TPM2_PCR_INDEX_DEFAULT, /* hash= */ 0, /* value= */ {}), + 1)) + return log_oom(); + } + + if (arg_pretty < 0 && isatty(STDOUT_FILENO)) + arg_pretty = true; + + if (arg_architecture >= 0) { + FOREACH_ARRAY(p, arg_filter_partitions, arg_n_filter_partitions) + *p = gpt_partition_type_override_architecture(*p, arg_architecture); + + FOREACH_ARRAY(p, arg_defer_partitions, arg_n_defer_partitions) + *p = gpt_partition_type_override_architecture(*p, arg_architecture); + } + + return 1; +} +/* Applies the boolean systemd.factory_reset kernel command line option to arg_factory_reset. Does nothing if arg_factory_reset is already set (>= 0) or if we are not running in the initrd; arg_factory_reset is only updated when proc_cmdline_get_bool() returns > 0. Returns 0 on success, a negative errno-style value if the option cannot be parsed. */ +static int parse_proc_cmdline_factory_reset(void) { + bool b; + int r; + + if (arg_factory_reset >= 0) /* Never override what is specified on the process command line */ + return 0; + + if (!in_initrd()) /* Never honour kernel command line factory reset request outside of the initrd */ + return 0; + + r = proc_cmdline_get_bool("systemd.factory_reset", /* flags = */ 0, &b); + if (r < 0) + return log_error_errno(r, "Failed to parse systemd.factory_reset kernel command line argument: %m"); + if (r > 0) { + arg_factory_reset = b; + + if (b) + log_notice("Honouring factory reset requested via kernel command line."); + } + + return 0; +} +/* Sets arg_factory_reset from the boolean stored in the systemd FactoryReset EFI variable. Skipped if arg_factory_reset is already set (>= 0) or if we are not running in the initrd. A missing variable (-ENOENT) or a not-supported error is treated as success. Returns 0 on success, a negative errno-style value if the variable cannot be read or parsed. */ +static int parse_efi_variable_factory_reset(void) { + _cleanup_free_ char *value = NULL; + int r; + + if (arg_factory_reset >= 0) /* Never override what is specified on the process command line */ + return 0; + + if (!in_initrd()) /* Never honour EFI variable factory reset request outside of the initrd */ + return 0; + + r = efi_get_variable_string(EFI_SYSTEMD_VARIABLE(FactoryReset), &value); + if (r < 0) { + if (r == -ENOENT || ERRNO_IS_NOT_SUPPORTED(r)) + return 0; + return log_error_errno(r, "Failed to read EFI variable FactoryReset: %m"); + } + + r = parse_boolean(value); + if (r < 0) + return log_error_errno(r, "Failed to parse EFI variable FactoryReset: %m"); + + arg_factory_reset = r; + if 
(r) + log_notice("Factory reset requested via EFI variable FactoryReset."); + + return 0; +} +/* Unsets the systemd FactoryReset EFI variable by writing NULL data of size 0 to it. A missing variable (-ENOENT) or a not-supported error is silently ignored. Returns 0 on success, a negative errno-style value otherwise. */ +static int remove_efi_variable_factory_reset(void) { + int r; + + r = efi_set_variable(EFI_SYSTEMD_VARIABLE(FactoryReset), NULL, 0); + if (r < 0) { + if (r == -ENOENT || ERRNO_IS_NOT_SUPPORTED(r)) + return 0; + return log_error_errno(r, "Failed to remove EFI variable FactoryReset: %m"); + } + + log_info("Successfully unset EFI variable FactoryReset."); + return 0; +} +/* Opens p (resolved relative to root via chase_and_open()) with the open flags given in mode and determines the node to operate on. For a regular file the resolved path is returned in *ret and the open fd in *ret_fd. For a block device (refused with -EPERM if root is set) or a directory, the backing block device is looked up, followed to its originating device and then to the whole disk (failures of these two steps are only logged at debug level), and the path of the resulting device node is returned in *ret. In that case *ret_fd receives the locked fd only if the opened node already is the resulting device, otherwise -EBADF. Returns 0 on success, a negative errno-style value on failure (-ENOTBLK for any other inode type). */ +static int acquire_root_devno( + const char *p, + const char *root, + int mode, + char **ret, + int *ret_fd) { + + _cleanup_free_ char *found_path = NULL, *node = NULL; + dev_t devno, fd_devno = MODE_INVALID; + _cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + assert(p); + assert(ret); + assert(ret_fd); + + fd = chase_and_open(p, root, CHASE_PREFIX_ROOT, mode, &found_path); + if (fd < 0) + return fd; + + if (fstat(fd, &st) < 0) + return -errno; +/* Regular image file: hand back the resolved path and the fd as they are. */ + if (S_ISREG(st.st_mode)) { + *ret = TAKE_PTR(found_path); + *ret_fd = TAKE_FD(fd); + return 0; + } + + if (S_ISBLK(st.st_mode)) { + /* Refuse referencing explicit block devices if a root dir is specified, after all we should + * not be able to leave the image the root path constrains us to. 
*/ + if (root) + return -EPERM; +/* Remember the device number of the node we opened, so that we can tell further down whether fd still refers to the device we end up with. */ + fd_devno = devno = st.st_rdev; + } else if (S_ISDIR(st.st_mode)) { +/* Directory: start from the device its file system lives on; if that has major number 0, ask btrfs for the block device instead. */ + devno = st.st_dev; + if (major(devno) == 0) { + r = btrfs_get_block_device_fd(fd, &devno); + if (r == -ENOTTY) /* not btrfs */ + return -ENODEV; + if (r < 0) + return r; + } + } else + return -ENOTBLK; + + /* From dm-crypt to backing partition */ + r = block_get_originating(devno, &devno); + if (r == -ENOENT) + log_debug_errno(r, "Device '%s' has no dm-crypt/dm-verity device, no need to look for underlying block device.", p); + else if (r < 0) + log_debug_errno(r, "Failed to find underlying block device for '%s', ignoring: %m", p); + + /* From partition to whole disk containing it */ + r = block_get_whole_disk(devno, &devno); + if (r < 0) + log_debug_errno(r, "Failed to find whole disk block device for '%s', ignoring: %m", p); +/* Look up the block device node path for the resulting device number; this is what gets returned in *ret. */ + r = devname_from_devnum(S_IFBLK, devno, &node); + if (r < 0) + return log_debug_errno(r, "Failed to determine canonical path for '%s': %m", p); + + /* Only if we still look at the same block device we can reuse the fd. Otherwise return an + * invalidated fd. */ + if (fd_devno != MODE_INVALID && fd_devno == devno) { + /* Tell udev not to interfere while we are processing the device */ + if (flock(fd, arg_dry_run ? 
LOCK_SH : LOCK_EX) < 0) + return log_error_errno(errno, "Failed to lock device '%s': %m", node); + + *ret_fd = TAKE_FD(fd); + } else + *ret_fd = -EBADF; + + *ret = TAKE_PTR(node); + return 0; +} +/* Determines the node to operate on and stores its path in context->node and, where available, an fd in context->backing_fd. If arg_node is set it is used: with EMPTY_CREATE the file is created anew (and context->node_is_our_file is set), otherwise it is resolved via acquire_root_devno() without a root directory. Without arg_node the target of the /run/systemd/volatile-root symlink is used if that symlink exists, otherwise the devices backing the root directory and then the usr directory below arg_root are tried. Returns 0 on success, a negative errno-style value on failure; an ENODEV error is returned after logging if no root block device could be discovered. */ +static int find_root(Context *context) { + _cleanup_free_ char *device = NULL; + int r; + + assert(context); + + if (arg_node) { + if (arg_empty == EMPTY_CREATE) { + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *s = NULL; + + s = strdup(arg_node); + if (!s) + return log_oom(); +/* O_CREAT together with O_EXCL: the image file must not exist yet, it is always created anew here. */ + fd = open(arg_node, O_RDONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOFOLLOW, 0666); + if (fd < 0) + return log_error_errno(errno, "Failed to create '%s': %m", arg_node); + + context->node = TAKE_PTR(s); + context->node_is_our_file = true; + context->backing_fd = TAKE_FD(fd); + return 0; + } + + /* Note that we don't specify a root argument here: if the user explicitly configured a node + * we'll take it relative to the host, not the image */ + r = acquire_root_devno(arg_node, NULL, O_RDONLY|O_CLOEXEC, &context->node, &context->backing_fd); + if (r == -EUCLEAN) + return btrfs_log_dev_root(LOG_ERR, r, arg_node); + if (r < 0) + return log_error_errno(r, "Failed to open file or determine backing device of %s: %m", arg_node); + + return 0; + } +/* No node was given on the command line, hence the root block device has to be discovered from here on. */ + assert(IN_SET(arg_empty, EMPTY_REFUSE, EMPTY_ALLOW)); + + /* If the root mount has been replaced by some form of volatile file system (overlayfs), the + * original root block device node is symlinked in /run/systemd/volatile-root. Let's read that + * here. */ + r = readlink_malloc("/run/systemd/volatile-root", &device); + if (r == -ENOENT) { /* volatile-root not found */ + /* Let's search for the root device. We look for two cases here: first in /, and then in /usr. 
The + * latter we check for cases where / is a tmpfs and only /usr is an actual persistent block device + * (think: volatile setups) */ + + FOREACH_STRING(p, "/", "/usr") { +/* -ENODEV from the lookup is not fatal here: the loop simply moves on to the next candidate path. */ + r = acquire_root_devno(p, arg_root, O_RDONLY|O_DIRECTORY|O_CLOEXEC, &context->node, + &context->backing_fd); + if (r < 0) { + if (r == -EUCLEAN) + return btrfs_log_dev_root(LOG_ERR, r, p); + if (r != -ENODEV) + return log_error_errno(r, "Failed to determine backing device of %s: %m", p); + } else + return 0; + } + } else if (r < 0) + return log_error_errno(r, "Failed to read symlink /run/systemd/volatile-root: %m"); + else { + r = acquire_root_devno(device, NULL, O_RDONLY|O_CLOEXEC, &context->node, &context->backing_fd); + if (r == -EUCLEAN) + return btrfs_log_dev_root(LOG_ERR, r, device); + if (r < 0) + return log_error_errno(r, "Failed to open file or determine backing device of %s: %m", device); + + return 0; + } + + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "Failed to discover root block device."); +} +/* Rewrites the partition table found on fd without changes, using libfdisk with the given sector size. Returns 1 if the table was written, 0 if the device carries no disk label, a negative errno-style value on failure. */ +static int resize_pt(int fd, uint64_t sector_size) { + _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL; + int r; + + /* After resizing the backing file we need to resize the partition table itself too, so that it takes + * possession of the enlarged backing file. For this it suffices to open the device with libfdisk and + * immediately write it again, with no changes. 
*/ + + r = fdisk_new_context_at(fd, /* path= */ NULL, /* read_only= */ false, sector_size, &c); + if (r < 0) + return log_error_errno(r, "Failed to open device '%s': %m", FORMAT_PROC_FD_PATH(fd)); + + r = fdisk_has_label(c); + if (r < 0) + return log_error_errno(r, "Failed to determine whether disk '%s' has a disk label: %m", FORMAT_PROC_FD_PATH(fd)); + if (r == 0) { + log_debug("Not resizing partition table, as there currently is none."); + return 0; + } + + r = fdisk_write_disklabel(c); + if (r < 0) + return log_error_errno(r, "Failed to write resized partition table: %m"); + + log_info("Resized partition table."); + return 1; +} +/* Grows the disk image to arg_size bytes if it is currently smaller. node is either a regular file or a loopback block device; in the latter case backing_file and loop_device must be set and the backing regular file is grown instead. The fd behind the fd pointer is opened read-only here if it is not open yet. Unless arg_discard is set the space is allocated with fallocate(), falling back to ftruncate() where fallocate() is not supported. Afterwards the partition table is rewritten via resize_pt() and, if loop_device is set, the loop device size is refreshed. Returns 1 if the image was grown, 0 if there was nothing to do, a negative errno-style value on failure. */ +static int resize_backing_fd( + const char *node, /* The primary way we access the disk image to operate on */ + int *fd, /* An O_RDONLY fd referring to that inode */ + const char *backing_file, /* If the above refers to a loopback device, the backing regular file for that, which we can grow */ + LoopDevice *loop_device, + uint64_t sector_size) { + + _cleanup_close_ int writable_fd = -EBADF; + uint64_t current_size; + struct stat st; + int r; + + assert(node); + assert(fd); + + if (arg_size == UINT64_MAX) /* Nothing to do */ + return 0; + + if (*fd < 0) { + /* Open the file if we haven't opened it yet. Note that we open it read-only here, just to + * keep a reference to the file we can pass around. 
*/ + *fd = open(node, O_RDONLY|O_CLOEXEC); + if (*fd < 0) + return log_error_errno(errno, "Failed to open '%s' in order to adjust size: %m", node); + } + + if (fstat(*fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", node); + + if (S_ISBLK(st.st_mode)) { + if (!backing_file) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), "Cannot resize block device '%s'.", node); + + assert(loop_device); + + if (ioctl(*fd, BLKGETSIZE64, &current_size) < 0) + return log_error_errno(errno, "Failed to determine size of block device %s: %m", node); + } else { + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Specified path '%s' is not a regular file or loopback block device, cannot resize: %m", node); + + assert(!backing_file); + assert(!loop_device); + current_size = st.st_size; + } + + if (current_size >= arg_size) { + log_info("File '%s' already is of requested size or larger, not growing. (%s >= %s)", + node, FORMAT_BYTES(current_size), FORMAT_BYTES(arg_size)); + return 0; + } + + if (S_ISBLK(st.st_mode)) { + assert(backing_file); + + /* This is a loopback device. We can't really grow those directly, but we can grow the + * backing file, hence let's do that. */ + + writable_fd = open(backing_file, O_WRONLY|O_CLOEXEC|O_NONBLOCK); + if (writable_fd < 0) + return log_error_errno(errno, "Failed to open backing file '%s': %m", backing_file); + + if (fstat(writable_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat() backing file '%s': %m", backing_file); + + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Backing file '%s' of block device is not a regular file: %m", backing_file); +/* Refuse if the loop device and its backing file disagree about the current size. The message names the backing file first and the loop device second. */ + if ((uint64_t) st.st_size != current_size) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Size of backing file '%s' of loopback block device '%s' don't match, refusing.", + backing_file, node); + } else { + assert(S_ISREG(st.st_mode)); + assert(!backing_file); + + /* The file descriptor is read-only. 
In order to grow the file we need to have a writable fd. We + * reopen the file for that temporarily. We keep the writable fd only open for this operation though, + * as fdisk can't accept it anyway. */ + + writable_fd = fd_reopen(*fd, O_WRONLY|O_CLOEXEC); + if (writable_fd < 0) + return log_error_errno(writable_fd, "Failed to reopen backing file '%s' writable: %m", node); + } + + if (!arg_discard) { + if (fallocate(writable_fd, 0, 0, arg_size) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return log_error_errno(errno, "Failed to grow '%s' from %s to %s by allocation: %m", + node, FORMAT_BYTES(current_size), FORMAT_BYTES(arg_size)); + + /* Fallback to truncation, if fallocate() is not supported. */ + log_debug("Backing file system does not support fallocate(), falling back to ftruncate()."); + } else { + if (current_size == 0) /* Likely regular file just created by us */ + log_info("Allocated %s for '%s'.", FORMAT_BYTES(arg_size), node); + else + log_info("File '%s' grown from %s to %s by allocation.", + node, FORMAT_BYTES(current_size), FORMAT_BYTES(arg_size)); + + goto done; + } + } + + if (ftruncate(writable_fd, arg_size) < 0) + return log_error_errno(errno, "Failed to grow '%s' from %s to %s by truncation: %m", + node, FORMAT_BYTES(current_size), FORMAT_BYTES(arg_size)); + + if (current_size == 0) /* Likely regular file just created by us */ + log_info("Sized '%s' to %s.", node, FORMAT_BYTES(arg_size)); + else + log_info("File '%s' grown from %s to %s by truncation.", + node, FORMAT_BYTES(current_size), FORMAT_BYTES(arg_size)); + +done: + r = resize_pt(writable_fd, sector_size); + if (r < 0) + return r; + + if (loop_device) { + r = loop_device_refresh_size(loop_device, UINT64_MAX, arg_size); + if (r < 0) + return log_error_errno(r, "Failed to update loop device size: %m"); + } + + return 1; +} +/* Computes the minimal image size as GPT_METADATA_SIZE rounded up to a multiple of 4096 plus partition_min_size_with_padding() of every partition that is not dropped, logs it and stores it in arg_size. Returns 0 on success, an EOVERFLOW error if the sum does not fit into uint64_t. */ +static int determine_auto_size(Context *c) { + uint64_t sum; + + assert(c); + + sum = round_up_size(GPT_METADATA_SIZE, 4096); + + LIST_FOREACH(partitions, p, 
c->partitions) { + uint64_t m; + + if (p->dropped) + continue; + + m = partition_min_size_with_padding(c, p); + if (m > UINT64_MAX - sum) + return log_error_errno(SYNTHETIC_ERRNO(EOVERFLOW), "Image would grow too large, refusing."); + + sum += m; + } + + if (c->total != UINT64_MAX) + /* Image already allocated? Then show its size. */ + log_info("Automatically determined minimal disk image size as %s, current image size is %s.", + FORMAT_BYTES(sum), FORMAT_BYTES(c->total)); + else + /* If the image is being created right now, then it has no previous size, suppress any comment about it hence. */ + log_info("Automatically determined minimal disk image size as %s.", + FORMAT_BYTES(sum)); + + arg_size = sum; + return 0; +} +/* Main routine. Parses the command line and the factory reset requests from kernel command line and EFI variable, optionally mounts arg_image read-only, reads the partition definitions, locates the node to operate on, grows it if a size was requested, loads the partition table, handles factory reset, then allocates, grows and places partitions and writes the partition table. Returns 0 on success and a negative errno-style value on failure. Special positive return values: 76 if no root block device was found, 77 if loading the partition table failed with -EHWPOISON (not GPT), EXIT_FAILURE if arg_can_factory_reset is set and context_can_factory_reset() returned 0. */ +static int run(int argc, char *argv[]) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + _cleanup_(context_freep) Context* context = NULL; + bool node_is_our_loop = false; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = parse_proc_cmdline_factory_reset(); + if (r < 0) + return r; + + r = parse_efi_variable_factory_reset(); + if (r < 0) + return r; + +#if HAVE_LIBCRYPTSETUP + cryptsetup_enable_logging(NULL); +#endif + + if (arg_image) { + assert(!arg_root); + + /* Mount this strictly read-only: we shall modify the partition table, not the file + * systems */ + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_MOUNT_READ_ONLY | + (arg_node ? 
DISSECT_IMAGE_DEVICE_READ_ONLY : 0) | /* If a different node to make changes to is specified let's open the device in read-only mode) */ + DISSECT_IMAGE_GPT_ONLY | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; +/* From here on the mount point of the image serves as the root directory. */ + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + + if (!arg_node) { + arg_node = strdup(loop_device->node); + if (!arg_node) + return log_oom(); + + /* Remember that the device we are about to manipulate is actually the one we + * allocated here, and thus to increase its backing file we know what to do */ + node_is_our_loop = true; + } + } + + if (!arg_copy_source && arg_root) { + /* If no explicit copy source is specified, then use --root=/--image= */ + arg_copy_source = strdup(arg_root); + if (!arg_copy_source) + return log_oom(); + } + + context = context_new(arg_seed); + if (!context) + return log_oom(); + + r = context_copy_from(context); + if (r < 0) + return r; + + if (arg_make_ddi) { + _cleanup_free_ char *d = NULL, *dp = NULL; + assert(!arg_definitions); +/* The definitions of a DDI type are looked up in a directory named after the type with a .repart.d suffix. */ + d = strjoin(arg_make_ddi, ".repart.d/"); + if (!d) + return log_oom(); + + r = search_and_access(d, F_OK, arg_root, CONF_PATHS_USR_STRV("systemd/repart/definitions"), &dp); + if (r < 0) + return log_error_errno(r, "DDI type '%s' is not defined: %m", arg_make_ddi); + + if (strv_consume(&arg_definitions, TAKE_PTR(dp)) < 0) + return log_oom(); + } else + strv_uniq(arg_definitions); + + r = context_read_definitions(context); + if (r < 0) + return r; + + r = find_root(context); + if (r == -ENODEV) + return 76; /* Special return value which means "Root block device not found, so not doing + * anything". This isn't really an error when called at boot. */ + if (r < 0) + return r; +/* An explicit size was requested: grow the image before the partition table is loaded. */ + if (arg_size != UINT64_MAX) { + r = resize_backing_fd( + context->node, + &context->backing_fd, + node_is_our_loop ? arg_image : NULL, + node_is_our_loop ? 
loop_device : NULL, + context->sector_size); + if (r < 0) + return r; + } + + r = context_load_partition_table(context); + if (r == -EHWPOISON) + return 77; /* Special return value which means "Not GPT, so not doing anything". This isn't + * really an error when called at boot. */ + if (r < 0) + return r; + context->from_scratch = r > 0; /* Starting from scratch */ +/* Query-only mode: report via the return value whether a factory reset is possible, and stop. */ + if (arg_can_factory_reset) { + r = context_can_factory_reset(context); + if (r < 0) + return r; + if (r == 0) + return EXIT_FAILURE; + + return 0; + } + + r = context_factory_reset(context); + if (r < 0) + return r; + if (r > 0) { + /* We actually did a factory reset! */ + r = remove_efi_variable_factory_reset(); + if (r < 0) + return r; + + /* Reload the reduced partition table */ + context_unload_partition_table(context); + r = context_load_partition_table(context); + if (r < 0) + return r; + } + + r = context_read_seed(context, arg_root); + if (r < 0) + return r; + + /* Make sure each partition has a unique UUID and unique label */ + r = context_acquire_partition_uuids_and_labels(context); + if (r < 0) + return r; + + /* Open all files to copy blocks from now, since we want to take their size into consideration */ + r = context_open_copy_block_paths( + context, + loop_device ? loop_device->devno : /* if --image= is specified, only allow partitions on the loopback device */ + arg_root && !arg_image ? 0 : /* if --root= is specified, don't accept any block device */ + (dev_t) -1); /* if neither is specified, make no restrictions */ + if (r < 0) + return r; + + r = context_minimize(context); + if (r < 0) + return r; + + if (arg_size_auto) { + r = determine_auto_size(context); + if (r < 0) + return r; + + /* Flush out everything again, and let's grow the file first, then start fresh */ + context_unload_partition_table(context); + + assert(arg_size != UINT64_MAX); + r = resize_backing_fd( + context->node, + &context->backing_fd, + node_is_our_loop ? arg_image : NULL, + node_is_our_loop ? 
loop_device : NULL, + context->sector_size); + if (r < 0) + return r; + + r = context_load_partition_table(context); + if (r < 0) + return r; + } + + /* First try to fit new partitions in, dropping by priority until it fits */ + for (;;) { + uint64_t largest_free_area; + + if (context_allocate_partitions(context, &largest_free_area)) + break; /* Success! */ + + if (!context_drop_or_foreignize_one_priority(context)) { + r = log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Can't fit requested partitions into available free space (%s), refusing.", + FORMAT_BYTES(largest_free_area)); +/* NOTE(review): the return value is ignored; the call is presumably made for its log message stating the minimal image size (it also overwrites arg_size). */ determine_auto_size(context); + return r; + } + } + + /* Now assign free space according to the weight logic */ + r = context_grow_partitions(context); + if (r < 0) + return r; + + /* Now calculate where each new partition gets placed */ + context_place_partitions(context); + + (void) context_dump(context, /*late=*/ false); + + r = context_write_partition_table(context); + if (r < 0) + return r; + + r = context_split(context); + if (r < 0) + return r; + + (void) context_dump(context, /*late=*/ true); + + context->node = mfree(context->node); + + LIST_FOREACH(partitions, p, context->partitions) + p->split_path = mfree(p->split_path); + + return 0; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/path/meson.build b/src/path/meson.build new file mode 100644 index 0000000..70d3dd0 --- /dev/null +++ b/src/path/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-path', + 'public' : true, + 'sources' : files('path.c'), + }, +] diff --git a/src/path/path.c b/src/path/path.c new file mode 100644 index 0000000..1e69c6a --- /dev/null +++ b/src/path/path.c @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-path.h" + +#include "alloc-util.h" +#include "build.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" 
+#include "pager.h" +#include "pretty-print.h" +#include "string-util.h" + +static const char *arg_suffix = NULL; +static PagerFlags arg_pager_flags = 0; +/* Names of the sd-path lookup types as accepted on the command line and shown in the listing, indexed by the SD_PATH_* value. */ +static const char* const path_table[_SD_PATH_MAX] = { + [SD_PATH_TEMPORARY] = "temporary", + [SD_PATH_TEMPORARY_LARGE] = "temporary-large", + + [SD_PATH_SYSTEM_BINARIES] = "system-binaries", + [SD_PATH_SYSTEM_INCLUDE] = "system-include", + [SD_PATH_SYSTEM_LIBRARY_PRIVATE] = "system-library-private", + [SD_PATH_SYSTEM_LIBRARY_ARCH] = "system-library-arch", + [SD_PATH_SYSTEM_SHARED] = "system-shared", + [SD_PATH_SYSTEM_CONFIGURATION_FACTORY] = "system-configuration-factory", + [SD_PATH_SYSTEM_STATE_FACTORY] = "system-state-factory", + + [SD_PATH_SYSTEM_CONFIGURATION] = "system-configuration", + [SD_PATH_SYSTEM_RUNTIME] = "system-runtime", + [SD_PATH_SYSTEM_RUNTIME_LOGS] = "system-runtime-logs", + [SD_PATH_SYSTEM_STATE_PRIVATE] = "system-state-private", + [SD_PATH_SYSTEM_STATE_LOGS] = "system-state-logs", + [SD_PATH_SYSTEM_STATE_CACHE] = "system-state-cache", + [SD_PATH_SYSTEM_STATE_SPOOL] = "system-state-spool", + + [SD_PATH_USER_BINARIES] = "user-binaries", + [SD_PATH_USER_LIBRARY_PRIVATE] = "user-library-private", + [SD_PATH_USER_LIBRARY_ARCH] = "user-library-arch", + [SD_PATH_USER_SHARED] = "user-shared", + + [SD_PATH_USER_CONFIGURATION] = "user-configuration", + [SD_PATH_USER_RUNTIME] = "user-runtime", + [SD_PATH_USER_STATE_CACHE] = "user-state-cache", + [SD_PATH_USER_STATE_PRIVATE] = "user-state-private", + + [SD_PATH_USER] = "user", + [SD_PATH_USER_DOCUMENTS] = "user-documents", + [SD_PATH_USER_MUSIC] = "user-music", + [SD_PATH_USER_PICTURES] = "user-pictures", + [SD_PATH_USER_VIDEOS] = "user-videos", + [SD_PATH_USER_DOWNLOAD] = "user-download", + [SD_PATH_USER_PUBLIC] = "user-public", + [SD_PATH_USER_TEMPLATES] = "user-templates", + [SD_PATH_USER_DESKTOP] = "user-desktop", + + [SD_PATH_SEARCH_BINARIES] = "search-binaries", + [SD_PATH_SEARCH_BINARIES_DEFAULT] = "search-binaries-default", + 
[SD_PATH_SEARCH_LIBRARY_PRIVATE] = "search-library-private", + [SD_PATH_SEARCH_LIBRARY_ARCH] = "search-library-arch", + [SD_PATH_SEARCH_SHARED] = "search-shared", + [SD_PATH_SEARCH_CONFIGURATION_FACTORY] = "search-configuration-factory", + [SD_PATH_SEARCH_STATE_FACTORY] = "search-state-factory", + [SD_PATH_SEARCH_CONFIGURATION] = "search-configuration", + + [SD_PATH_SYSTEMD_UTIL] = "systemd-util", + + [SD_PATH_SYSTEMD_SYSTEM_UNIT] = "systemd-system-unit", + [SD_PATH_SYSTEMD_SYSTEM_PRESET] = "systemd-system-preset", + [SD_PATH_SYSTEMD_SYSTEM_CONF] = "systemd-system-conf", + [SD_PATH_SYSTEMD_USER_UNIT] = "systemd-user-unit", + [SD_PATH_SYSTEMD_USER_PRESET] = "systemd-user-preset", + [SD_PATH_SYSTEMD_USER_CONF] = "systemd-user-conf", + + [SD_PATH_SYSTEMD_SEARCH_SYSTEM_UNIT] = "systemd-search-system-unit", + [SD_PATH_SYSTEMD_SEARCH_USER_UNIT] = "systemd-search-user-unit", + + [SD_PATH_SYSTEMD_SYSTEM_GENERATOR] = "systemd-system-generator", + [SD_PATH_SYSTEMD_USER_GENERATOR] = "systemd-user-generator", + [SD_PATH_SYSTEMD_SEARCH_SYSTEM_GENERATOR] = "systemd-search-system-generator", + [SD_PATH_SYSTEMD_SEARCH_USER_GENERATOR] = "systemd-search-user-generator", + + [SD_PATH_SYSTEMD_SLEEP] = "systemd-sleep", + [SD_PATH_SYSTEMD_SHUTDOWN] = "systemd-shutdown", + + [SD_PATH_TMPFILES] = "tmpfiles", + [SD_PATH_SYSUSERS] = "sysusers", + [SD_PATH_SYSCTL] = "sysctl", + [SD_PATH_BINFMT] = "binfmt", + [SD_PATH_MODULES_LOAD] = "modules-load", + [SD_PATH_CATALOG] = "catalog", + + [SD_PATH_SYSTEMD_SEARCH_NETWORK] = "systemd-search-network", + + [SD_PATH_SYSTEMD_SYSTEM_ENVIRONMENT_GENERATOR] = "systemd-system-environment-generator", + [SD_PATH_SYSTEMD_USER_ENVIRONMENT_GENERATOR] = "systemd-user-environment-generator", + [SD_PATH_SYSTEMD_SEARCH_SYSTEM_ENVIRONMENT_GENERATOR] = "systemd-search-system-environment-generator", + [SD_PATH_SYSTEMD_SEARCH_USER_ENVIRONMENT_GENERATOR] = "systemd-search-user-environment-generator", +}; +/* Prints name and resolved path of every path_table entry, after opening the pager. Failed lookups are logged and skipped; -ENXIO is only logged at debug level and does not affect the result, any other error is recorded in the return value via RET_GATHER(). Returns 0 if nothing failed. */ +static int list_paths(void) { + int r = 0; + + 
pager_open(arg_pager_flags); + + for (size_t i = 0; i < ELEMENTSOF(path_table); i++) { + _cleanup_free_ char *p = NULL; + int q; + + q = sd_path_lookup(i, arg_suffix, &p); + if (q < 0) { + log_full_errno(q == -ENXIO ? LOG_DEBUG : LOG_ERR, + q, "Failed to query %s: %m", path_table[i]); + if (q != -ENXIO) + RET_GATHER(r, q); + continue; + } + + printf("%s%s:%s %s\n", ansi_highlight(), path_table[i], ansi_normal(), p); + } + + return r; +} +/* Looks up the path type named n in path_table and prints its resolved path on a line of its own. Returns 0 on success, the negative error of sd_path_lookup() if the lookup fails, or an EOPNOTSUPP error if n is not a known name. NOTE(review): streq() is applied to every slot of path_table, which assumes the table has no unset (NULL) entries below _SD_PATH_MAX; confirm against the enum in sd-path.h. */ +static int print_path(const char *n) { + int r; + + for (size_t i = 0; i < ELEMENTSOF(path_table); i++) + if (streq(path_table[i], n)) { + _cleanup_free_ char *p = NULL; + + r = sd_path_lookup(i, arg_suffix, &p); + if (r < 0) + return log_error_errno(r, "Failed to query %s: %m", n); + + printf("%s\n", p); + return 0; + } + + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Path %s not known.", n); +} +/* Prints the usage text, including a reference to the systemd-path(1) man page obtained from terminal_urlify_man(). Returns 0 on success; any failure of terminal_urlify_man() is reported via log_oom(). */ +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-path", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
[NAME...]\n\n" + "Show system and user paths.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --suffix=SUFFIX Suffix to append to paths\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} +/* Parses the command line options into arg_suffix and arg_pager_flags. Returns 1 if the caller should continue, -EINVAL on an unrecognized option, and the result of help() or version() if one of those was requested. */ +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_SUFFIX, + ARG_NO_PAGER, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "suffix", required_argument, NULL, ARG_SUFFIX }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_SUFFIX: + arg_suffix = optarg; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} +/* Prints the path for each name given after the options, or lists all known paths if no name was given. Returns a negative errno-style value if option parsing or any lookup failed, a non-negative value otherwise. */ +static int run(int argc, char* argv[]) { + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (argc > optind) + for (int i = optind; i < argc; i++) + RET_GATHER(r, print_path(argv[i])); + else + r = list_paths(); + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/pcrextend/meson.build b/src/pcrextend/meson.build new file mode 100644 index 0000000..05c5350 --- /dev/null +++ b/src/pcrextend/meson.build @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-pcrextend', + 'conditions' : [ + 'HAVE_BLKID', + 'ENABLE_BOOTLOADER', + 'HAVE_OPENSSL', + 'HAVE_TPM2', + ], + 'sources' : files('pcrextend.c'), + 'dependencies' : [ + libblkid, + libopenssl, + tpm2, + ], + }, +] diff --git a/src/pcrextend/pcrextend.c 
b/src/pcrextend/pcrextend.c new file mode 100644 index 0000000..1295949 --- /dev/null +++ b/src/pcrextend/pcrextend.c @@ -0,0 +1,391 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include + +#include "build.h" +#include "efi-loader.h" +#include "escape.h" +#include "main-func.h" +#include "openssl-util.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "pcrextend-util.h" +#include "pretty-print.h" +#include "strv.h" +#include "tpm2-pcr.h" +#include "tpm2-util.h" +#include "varlink.h" +#include "varlink-io.systemd.PCRExtend.h" + +static bool arg_graceful = false; +static char *arg_tpm2_device = NULL; +static char **arg_banks = NULL; +static char *arg_file_system = NULL; +static bool arg_machine_id = false; +static unsigned arg_pcr_index = UINT_MAX; +static bool arg_varlink = false; + +STATIC_DESTRUCTOR_REGISTER(arg_banks, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep); +STATIC_DESTRUCTOR_REGISTER(arg_file_system, freep); + +#define EXTENSION_STRING_SAFE_LIMIT 1024 + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-pcrextend", "8", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] WORD\n" + "%1$s [OPTIONS...] --file-system=PATH\n" + "%1$s [OPTIONS...] 
--machine-id\n" + "\n%5$sExtend a TPM2 PCR with boot phase, machine ID, or file system ID.%6$s\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Print version\n" + " --bank=DIGEST Select TPM PCR bank (SHA1, SHA256)\n" + " --pcr=INDEX Select TPM PCR index (0…23)\n" + " --tpm2-device=PATH Use specified TPM2 device\n" + " --graceful Exit gracefully if no TPM2 device is found\n" + " --file-system=PATH Measure UUID/labels of file system into PCR 15\n" + " --machine-id Measure machine ID into PCR 15\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_BANK, + ARG_PCR, + ARG_TPM2_DEVICE, + ARG_GRACEFUL, + ARG_FILE_SYSTEM, + ARG_MACHINE_ID, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "bank", required_argument, NULL, ARG_BANK }, + { "pcr", required_argument, NULL, ARG_PCR }, + { "tpm2-device", required_argument, NULL, ARG_TPM2_DEVICE }, + { "graceful", no_argument, NULL, ARG_GRACEFUL }, + { "file-system", required_argument, NULL, ARG_FILE_SYSTEM }, + { "machine-id", no_argument, NULL, ARG_MACHINE_ID }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + help(0, NULL, NULL); + return 0; + + case ARG_VERSION: + return version(); + + case ARG_BANK: { + const EVP_MD *implementation; + + implementation = EVP_get_digestbyname(optarg); + if (!implementation) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown bank '%s', refusing.", optarg); + + if (strv_extend(&arg_banks, EVP_MD_name(implementation)) < 0) + return log_oom(); + + break; + } + + case ARG_PCR: + r = tpm2_pcr_index_from_string(optarg); + if (r < 0) + return log_error_errno(r, 
"Failed to parse PCR index: %s", optarg); + + arg_pcr_index = r; + break; + + case ARG_TPM2_DEVICE: { + _cleanup_free_ char *device = NULL; + + if (streq(optarg, "list")) + return tpm2_list_devices(); + + if (!streq(optarg, "auto")) { + device = strdup(optarg); + if (!device) + return log_oom(); + } + + free_and_replace(arg_tpm2_device, device); + break; + } + + case ARG_GRACEFUL: + arg_graceful = true; + break; + + case ARG_FILE_SYSTEM: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_file_system); + if (r < 0) + return r; + + break; + + case ARG_MACHINE_ID: + arg_machine_id = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_file_system && arg_machine_id) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--file-system= and --machine-id may not be combined."); + + r = varlink_invocation(VARLINK_ALLOW_ACCEPT); + if (r < 0) + return log_error_errno(r, "Failed to check if invoked in Varlink mode: %m"); + if (r > 0) + arg_varlink = true; + else if (arg_pcr_index == UINT_MAX) + arg_pcr_index = (arg_file_system || arg_machine_id) ? + TPM2_PCR_SYSTEM_IDENTITY : /* → PCR 15 */ + TPM2_PCR_KERNEL_BOOT; /* → PCR 11 */ + + return 1; +} + +static int determine_banks(Tpm2Context *c, unsigned target_pcr_nr) { + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(c); + + if (!strv_isempty(arg_banks)) /* Explicitly configured? Then use that */ + return 0; + + r = tpm2_get_good_pcr_banks_strv(c, UINT32_C(1) << target_pcr_nr, &l); + if (r < 0) + return log_error_errno(r, "Could not verify pcr banks: %m"); + + strv_free_and_replace(arg_banks, l); + return 0; +} + +static int extend_now(unsigned pcr, const void *data, size_t size, Tpm2UserspaceEventType event) { + _cleanup_(tpm2_context_unrefp) Tpm2Context *c = NULL; + int r; + + r = tpm2_context_new(arg_tpm2_device, &c); + if (r < 0) + return r; + + r = determine_banks(c, pcr); + if (r < 0) + return r; + if (strv_isempty(arg_banks)) /* Still none? 
*/ + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Found a TPM2 without enabled PCR banks. Can't operate."); + + _cleanup_free_ char *joined_banks = NULL; + joined_banks = strv_join(arg_banks, ", "); + if (!joined_banks) + return log_oom(); + + _cleanup_free_ char *safe = NULL; + if (size > EXTENSION_STRING_SAFE_LIMIT) { + safe = cescape_length(data, EXTENSION_STRING_SAFE_LIMIT); + if (!safe) + return log_oom(); + + if (!strextend(&safe, "...")) + return log_oom(); + } else { + safe = cescape_length(data, size); + if (!safe) + return log_oom(); + } + + log_debug("Measuring '%s' into PCR index %u, banks %s.", safe, pcr, joined_banks); + + r = tpm2_extend_bytes(c, arg_banks, pcr, data, size, /* secret= */ NULL, /* secret_size= */ 0, event, safe); + if (r < 0) + return log_error_errno(r, "Could not extend PCR: %m"); + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_TPM_PCR_EXTEND_STR, + LOG_MESSAGE("Extended PCR index %u with '%s' (banks %s).", pcr, safe, joined_banks), + "MEASURING=%s", safe, + "PCR=%u", pcr, + "BANKS=%s", joined_banks); + + return 0; +} + +typedef struct MethodExtendParameters { + unsigned pcr; + const char *text; + struct iovec data; +} MethodExtendParameters; + +static void method_extend_parameters_done(MethodExtendParameters *p) { + assert(p); + + iovec_done(&p->data); +} + +static int vl_method_extend(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "pcr", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(MethodExtendParameters, pcr), JSON_MANDATORY }, + { "text", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(MethodExtendParameters, text), 0 }, + { "data", JSON_VARIANT_STRING, json_dispatch_unbase64_iovec, offsetof(MethodExtendParameters, data), 0 }, + {} + }; + _cleanup_(method_extend_parameters_done) MethodExtendParameters p = { + .pcr = UINT_MAX, + }; + int r; + + assert(link); + + r = varlink_dispatch(link, parameters, 
dispatch_table, &p); + if (r != 0) + return r; + + if (!TPM2_PCR_INDEX_VALID(p.pcr)) + return varlink_errorb(link, VARLINK_ERROR_INVALID_PARAMETER, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("parameter", "pcr"))); + + if (p.text) { + /* Specifying both the text string and the binary data is not allowed */ + if (p.data.iov_base) + return varlink_errorb(link, VARLINK_ERROR_INVALID_PARAMETER, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("parameter", "data"))); + + r = extend_now(p.pcr, p.text, strlen(p.text), _TPM2_USERSPACE_EVENT_TYPE_INVALID); + } else if (p.data.iov_base) + r = extend_now(p.pcr, p.data.iov_base, p.data.iov_len, _TPM2_USERSPACE_EVENT_TYPE_INVALID); + else + return varlink_errorb(link, VARLINK_ERROR_INVALID_PARAMETER, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("parameter", "text"))); + if (r < 0) + return r; + + return varlink_reply(link, NULL); +} + +static int run(int argc, char *argv[]) { + _cleanup_free_ char *word = NULL; + Tpm2UserspaceEventType event; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_varlink) { + _cleanup_(varlink_server_unrefp) VarlinkServer *varlink_server = NULL; + + /* Invocation as Varlink service */ + + r = varlink_server_new(&varlink_server, VARLINK_SERVER_ROOT_ONLY); + if (r < 0) + return log_error_errno(r, "Failed to allocate Varlink server: %m"); + + r = varlink_server_add_interface(varlink_server, &vl_interface_io_systemd_PCRExtend); + if (r < 0) + return log_error_errno(r, "Failed to add Varlink interface: %m"); + + r = varlink_server_bind_method(varlink_server, "io.systemd.PCRExtend.Extend", vl_method_extend); + if (r < 0) + return log_error_errno(r, "Failed to bind Varlink method: %m"); + + r = varlink_server_loop_auto(varlink_server); + if (r < 0) + return log_error_errno(r, "Failed to run Varlink event loop: %m"); + + return EXIT_SUCCESS; + } + + if (arg_file_system) { + if (optind != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected no argument."); + + r = 
pcrextend_file_system_word(arg_file_system, &word, NULL); + if (r < 0) + return r; + + event = TPM2_EVENT_FILESYSTEM; + + } else if (arg_machine_id) { + + if (optind != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected no argument."); + + r = pcrextend_machine_id_word(&word); + if (r < 0) + return r; + + event = TPM2_EVENT_MACHINE_ID; + + } else { + if (optind+1 != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected a single argument."); + + word = strdup(argv[optind]); + if (!word) + return log_oom(); + + /* Refuse to measure an empty word. We want to be able to write the series of measured words + * separated by colons, where multiple separating colons are collapsed. Thus it makes sense to + * disallow an empty word to avoid ambiguities. */ + if (isempty(word)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "String to measure cannot be empty, refusing."); + + event = TPM2_EVENT_PHASE; + } + + if (arg_graceful && tpm2_support() != TPM2_SUPPORT_FULL) { + log_notice("No complete TPM2 support detected, exiting gracefully."); + return EXIT_SUCCESS; + } + + /* Skip logic if sd-stub is not used, after all PCR 11 might have a very different purpose then. 
*/ + r = efi_measured_uki(LOG_ERR); + if (r < 0) + return r; + if (r == 0) { + log_info("Kernel stub did not measure kernel image into PCR %i, skipping userspace measurement, too.", TPM2_PCR_KERNEL_BOOT); + return EXIT_SUCCESS; + } + + r = extend_now(arg_pcr_index, word, strlen(word), event); + if (r < 0) + return log_error_errno(r, "Failed to create TPM2 context: %m"); + + return EXIT_SUCCESS; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/pcrlock/meson.build b/src/pcrlock/meson.build new file mode 100644 index 0000000..a31b30b --- /dev/null +++ b/src/pcrlock/meson.build @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-pcrlock', + 'conditions' : [ + 'HAVE_OPENSSL', + 'HAVE_TPM2' + ], + 'sources' : files( + 'pcrlock.c', + 'pcrlock-firmware.c', + 'pehash.c', + ), + 'dependencies' : [ + libm, + libopenssl, + tpm2, + ], + }, +] + +if conf.get('HAVE_OPENSSL') == 1 and conf.get('HAVE_TPM2') == 1 + install_data('pcrlock.d/350-action-efi-application.pcrlock', install_dir : pcrlockdir) + install_data('pcrlock.d/400-secureboot-separator.pcrlock.d/300-0x00000000.pcrlock', install_dir : pcrlockdir / '400-secureboot-separator.pcrlock.d') + install_data('pcrlock.d/400-secureboot-separator.pcrlock.d/600-0xffffffff.pcrlock', install_dir : pcrlockdir / '400-secureboot-separator.pcrlock.d') + install_data('pcrlock.d/500-separator.pcrlock.d/300-0x00000000.pcrlock', install_dir : pcrlockdir / '500-separator.pcrlock.d') + install_data('pcrlock.d/500-separator.pcrlock.d/600-0xffffffff.pcrlock', install_dir : pcrlockdir / '500-separator.pcrlock.d') + install_data('pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/300-present.pcrlock', install_dir : pcrlockdir / '700-action-efi-exit-boot-services.pcrlock.d') + install_data('pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/600-absent.pcrlock', install_dir : pcrlockdir / '700-action-efi-exit-boot-services.pcrlock.d') + 
install_data('pcrlock.d/750-enter-initrd.pcrlock', install_dir : pcrlockdir) + install_data('pcrlock.d/800-leave-initrd.pcrlock', install_dir : pcrlockdir) + install_data('pcrlock.d/850-sysinit.pcrlock', install_dir : pcrlockdir) + install_data('pcrlock.d/900-ready.pcrlock', install_dir : pcrlockdir) + install_data('pcrlock.d/950-shutdown.pcrlock', install_dir : pcrlockdir) + install_data('pcrlock.d/990-final.pcrlock', install_dir : pcrlockdir) +endif diff --git a/src/pcrlock/pcrlock-firmware.c b/src/pcrlock/pcrlock-firmware.c new file mode 100644 index 0000000..73c68c2 --- /dev/null +++ b/src/pcrlock/pcrlock-firmware.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "pcrlock-firmware.h" +#include "unaligned.h" + +static int tcg_pcr_event2_digests_size( + const TCG_EfiSpecIdEventAlgorithmSize *algorithms, + size_t n_algorithms, + size_t *ret) { + + size_t m = 0; + + assert(algorithms || n_algorithms == 0); + assert(ret); + + FOREACH_ARRAY(a, algorithms, n_algorithms) { + + if (a->digestSize > UINT32_MAX - offsetof(TPMT_HA, digest) - m) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Accumulated hash size too large"); + + m += offsetof(TPMT_HA, digest) + a->digestSize; + } + + *ret = m; + return 0; +} + +int validate_firmware_event( + const TCG_PCR_EVENT2 *event, + size_t left, + const TCG_EfiSpecIdEventAlgorithmSize *algorithms, + size_t n_algorithms, + const TCG_PCR_EVENT2 **ret_next_event, + size_t *ret_left, + const void **ret_payload, + size_t *ret_payload_size) { + + size_t digests_size; + int r; + + assert(event); + assert(algorithms || n_algorithms == 0); + assert(ret_next_event); + assert(ret_left); + + if (left == 0) { + *ret_next_event = NULL; + *ret_left = 0; + return 0; + } + + r = tcg_pcr_event2_digests_size(algorithms, n_algorithms, &digests_size); + if (r < 0) + return r; + + if (left < (uint64_t) offsetof(TCG_PCR_EVENT2, digests.digests) + (uint64_t) digests_size + sizeof(uint32_t)) + return 
log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event header too short."); + + if (event->digests.count != n_algorithms) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Number of digests in event doesn't match log."); + + uint32_t eventSize = unaligned_read_ne32((const uint8_t*) &event->digests.digests + digests_size); + uint64_t size = (uint64_t) offsetof(TCG_PCR_EVENT2, digests.digests) + (uint64_t) digests_size + sizeof(uint32_t) + eventSize; + + if (size > left) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event header too short."); + + *ret_next_event = (const TCG_PCR_EVENT2*) ((const uint8_t*) event + size); + *ret_left = left - size; + + if (ret_payload) + *ret_payload = (const uint8_t*) &event->digests.digests + digests_size + sizeof(uint32_t); + if (ret_payload_size) + *ret_payload_size = eventSize; + + return 1; +} + +int validate_firmware_header( + const void *start, + size_t size, + const TCG_EfiSpecIdEventAlgorithmSize **ret_algorithms, + size_t *ret_n_algorithms, + const TCG_PCR_EVENT2 **ret_first, + size_t *ret_left) { + + assert(start || size == 0); + assert(ret_algorithms); + assert(ret_n_algorithms); + assert(ret_first); + assert(ret_left); + + if (size < offsetof(TCG_PCClientPCREvent, event)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log too short for TCG_PCClientPCREvent."); + + const TCG_PCClientPCREvent *h = start; + + if (size < (uint64_t) offsetof(TCG_PCClientPCREvent, event) + (uint64_t) h->eventDataSize) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log too short for TCG_PCClientPCREvent events data."); + + if (h->pcrIndex != 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log header has unexpected PCR index %" PRIu32, h->pcrIndex); + if (h->eventType != EV_NO_ACTION) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log header has unexpected event type 0x%" PRIx32, h->eventType); + if (!memeqzero(h->digest, sizeof(h->digest))) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event 
log header has unexpected non-zero digest."); + + if (h->eventDataSize < offsetof(TCG_EfiSpecIDEvent, digestSizes)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log header too short for TCG_EfiSpecIdEvent."); + + const TCG_EfiSpecIDEvent *id = (const TCG_EfiSpecIDEvent*) h->event; + + /* Signature as per "TCG PC Client Specific Platform Firmware Profile Specification" + * (https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/), + * section 10.4.5.1 "Specification ID Version Event" (at least in version 1.05 Revision 23 of the + * spec) */ + if (memcmp(id->signature, + (const uint8_t[]) { 0x53, 0x70, 0x65, 0x63, 0x20, 0x49, 0x44, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x30, 0x33, 0x00 }, + sizeof(id->signature)) != 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Missing TPM2 event log signature."); + + if (id->numberOfAlgorithms <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Number of advertised hash algorithms is zero."); + if (id->numberOfAlgorithms > UINT32_MAX / sizeof(TCG_EfiSpecIdEventAlgorithmSize)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Number of advertised hash algorithms too large."); + + log_debug("TPM PC Client Platform Firmware Profile: family %u.%u, revision %u.%u", + id->specVersionMajor, id->specVersionMinor, + id->specErrata / 100, id->specErrata % 100); + + if (h->eventDataSize < (uint64_t) offsetof(TCG_EfiSpecIDEvent, digestSizes) + (uint64_t) (id->numberOfAlgorithms * sizeof(TCG_EfiSpecIdEventAlgorithmSize)) + 1U) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log header doesn't fit all algorithms."); + + uint8_t vendorInfoSize = *((const uint8_t*) id + offsetof(TCG_EfiSpecIDEvent, digestSizes) + (id->numberOfAlgorithms * sizeof(TCG_EfiSpecIdEventAlgorithmSize))); + if (h->eventDataSize != offsetof(TCG_EfiSpecIDEvent, digestSizes) + (id->numberOfAlgorithms * sizeof(TCG_EfiSpecIdEventAlgorithmSize)) + 1U + vendorInfoSize) + return 
log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log header doesn't fit vendor info."); + + for (size_t i = 0; i < id->numberOfAlgorithms; i++) { + const EVP_MD *implementation; + const char *a; + + a = tpm2_hash_alg_to_string(id->digestSizes[i].algorithmId); + if (!a) { + log_notice("Event log advertises unknown hash algorithm 0x%4x, can't validate.", id->digestSizes[i].algorithmId); + continue; + } + + implementation = EVP_get_digestbyname(a); + if (!implementation) { + log_notice("Event log advertises hash algorithm '%s' we don't implement, can't validate.", a); + continue; + } + + if (EVP_MD_size(implementation) != id->digestSizes[i].digestSize) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Advertised digest size for '%s' is wrong, refusing.", a); + } + + *ret_algorithms = id->digestSizes; + *ret_n_algorithms = id->numberOfAlgorithms; + + size_t offset = offsetof(TCG_PCClientPCREvent, event) + h->eventDataSize; + *ret_first = (TCG_PCR_EVENT2*) ((const uint8_t*) h + offset); + *ret_left = size - offset; + + return 0; +} diff --git a/src/pcrlock/pcrlock-firmware.h b/src/pcrlock/pcrlock-firmware.h new file mode 100644 index 0000000..169666e --- /dev/null +++ b/src/pcrlock/pcrlock-firmware.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "tpm2-event-log.h" +#include "tpm2-util.h" + +int validate_firmware_event( + const TCG_PCR_EVENT2 *event, + size_t left, + const TCG_EfiSpecIdEventAlgorithmSize *algorithms, + size_t n_algorithms, + const TCG_PCR_EVENT2 **ret_next_event, + size_t *ret_left, + const void **ret_payload, + size_t *ret_payload_size); + +int validate_firmware_header( + const void *start, + size_t size, + const TCG_EfiSpecIdEventAlgorithmSize **ret_algorithms, + size_t *ret_n_algorithms, + const TCG_PCR_EVENT2 **ret_first, + size_t *ret_left); diff --git a/src/pcrlock/pcrlock.c b/src/pcrlock/pcrlock.c new file mode 100644 index 0000000..bdc6bbd --- /dev/null +++ b/src/pcrlock/pcrlock.c @@ 
-0,0 +1,5011 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "ask-password-api.h" +#include "blockdev-util.h" +#include "build.h" +#include "chase.h" +#include "conf-files.h" +#include "efi-api.h" +#include "env-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "format-util.h" +#include "fs-util.h" +#include "gpt.h" +#include "hash-funcs.h" +#include "hexdecoct.h" +#include "initrd-util.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "openssl-util.h" +#include "ordered-set.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pcrextend-util.h" +#include "pcrlock-firmware.h" +#include "pehash.h" +#include "pretty-print.h" +#include "proc-cmdline.h" +#include "random-util.h" +#include "recovery-key.h" +#include "sort-util.h" +#include "terminal-util.h" +#include "tpm2-util.h" +#include "unaligned.h" +#include "unit-name.h" +#include "utf8.h" +#include "verbs.h" + +static PagerFlags arg_pager_flags = 0; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF|JSON_FORMAT_NEWLINE; +static char **arg_components = NULL; +static uint32_t arg_pcr_mask = 0; +static char *arg_pcrlock_path = NULL; +static bool arg_pcrlock_auto = true; +static bool arg_raw_description = false; +static char *arg_location_start = NULL; +static char *arg_location_end = NULL; +static TPM2_HANDLE arg_nv_index = 0; +static bool arg_recovery_pin = false; +static char *arg_policy_path = NULL; +static bool arg_force = false; + +STATIC_DESTRUCTOR_REGISTER(arg_components, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_pcrlock_path, freep); +STATIC_DESTRUCTOR_REGISTER(arg_location_start, freep); +STATIC_DESTRUCTOR_REGISTER(arg_location_end, freep); +STATIC_DESTRUCTOR_REGISTER(arg_policy_path, freep); + +#define PCRLOCK_SECUREBOOT_POLICY_PATH 
"/var/lib/pcrlock.d/240-secureboot-policy.pcrlock.d/generated.pcrlock" +#define PCRLOCK_FIRMWARE_CODE_EARLY_PATH "/var/lib/pcrlock.d/250-firmware-code-early.pcrlock.d/generated.pcrlock" +#define PCRLOCK_FIRMWARE_CONFIG_EARLY_PATH "/var/lib/pcrlock.d/250-firmware-config-early.pcrlock.d/generated.pcrlock" +#define PCRLOCK_FIRMWARE_CODE_LATE_PATH "/var/lib/pcrlock.d/550-firmware-code-late.pcrlock.d/generated.pcrlock" +#define PCRLOCK_FIRMWARE_CONFIG_LATE_PATH "/var/lib/pcrlock.d/550-firmware-config-late.pcrlock.d/generated.pcrlock" +#define PCRLOCK_GPT_PATH "/var/lib/pcrlock.d/600-gpt.pcrlock.d/generated.pcrlock" +#define PCRLOCK_SECUREBOOT_AUTHORITY_PATH "/var/lib/pcrlock.d/620-secureboot-authority.pcrlock.d/generated.pcrlock" +#define PCRLOCK_KERNEL_CMDLINE_PATH "/var/lib/pcrlock.d/710-kernel-cmdline.pcrlock.d/generated.pcrlock" +#define PCRLOCK_KERNEL_INITRD_PATH "/var/lib/pcrlock.d/720-kernel-initrd.pcrlock.d/generated.pcrlock" +#define PCRLOCK_MACHINE_ID_PATH "/var/lib/pcrlock.d/820-machine-id.pcrlock" +#define PCRLOCK_ROOT_FILE_SYSTEM_PATH "/var/lib/pcrlock.d/830-root-file-system.pcrlock" +#define PCRLOCK_FILE_SYSTEM_PATH_PREFIX "/var/lib/pcrlock.d/840-file-system-" + +/* The default set of PCRs to lock to */ +#define DEFAULT_PCR_MASK \ + ((UINT32_C(1) << TPM2_PCR_PLATFORM_CODE) | \ + (UINT32_C(1) << TPM2_PCR_PLATFORM_CONFIG) | \ + (UINT32_C(1) << TPM2_PCR_EXTERNAL_CODE) | \ + (UINT32_C(1) << TPM2_PCR_EXTERNAL_CONFIG) | \ + (UINT32_C(1) << TPM2_PCR_BOOT_LOADER_CODE) | \ + (UINT32_C(1) << TPM2_PCR_BOOT_LOADER_CONFIG) | \ + (UINT32_C(1) << TPM2_PCR_SECURE_BOOT_POLICY) | \ + (UINT32_C(1) << TPM2_PCR_KERNEL_BOOT) | \ + (UINT32_C(1) << TPM2_PCR_KERNEL_CONFIG) | \ + (UINT32_C(1) << TPM2_PCR_SYSEXTS) | \ + (UINT32_C(1) << TPM2_PCR_SHIM_POLICY) | \ + (UINT32_C(1) << TPM2_PCR_SYSTEM_IDENTITY)) + +typedef struct EventLogRecordBank EventLogRecordBank; +typedef struct EventLogRecord EventLogRecord; +typedef struct EventLogRegisterBank EventLogRegisterBank; +typedef struct 
EventLogRegister EventLogRegister; +typedef struct EventLogComponentVariant EventLogComponentVariant; +typedef struct EventLogComponent EventLogComponent; +typedef struct EventLog EventLog; + +struct EventLogRecordBank { + uint16_t algorithm; + TPM2B_DIGEST hash; + LIST_FIELDS(EventLogRecordBank, banks); +}; + +typedef enum EventPayloadValid { + EVENT_PAYLOAD_VALID_YES, + EVENT_PAYLOAD_VALID_NO, + EVENT_PAYLOAD_VALID_DONT_KNOW, + _EVENT_PAYLOAD_VALID_MAX, + _EVENT_PAYLOAD_VALID_INVALID = -EINVAL, +} EventPayloadValid; + +struct EventLogRecord { + EventLog *event_log; + uint32_t pcr; + + const char *source; + char *description; + + /* Data for firmware events (i.e. "TCG PC Client Platform Firmware Profile Specification" events) */ + uint32_t firmware_event_type; + void *firmware_payload; + size_t firmware_payload_size; + + /* Data for userspace events (i.e. those generated by systemd in userspace) */ + Tpm2UserspaceEventType userspace_event_type; + JsonVariant *userspace_content; + + /* Validation result for the event payload itself, if the record contains enough information to validate the hash */ + EventPayloadValid event_payload_valid; + + /* If this record matches a variant of one of our defined components */ + EventLogComponentVariant **mapped; + size_t n_mapped; + + /* If this record is part of an EventLogComponentVariant */ + EventLogComponentVariant *owning_component_variant; + + LIST_HEAD(EventLogRecordBank, banks); +}; + +#define EVENT_LOG_RECORD_IS_FIRMWARE(record) ((record)->firmware_event_type != UINT32_MAX) +#define EVENT_LOG_RECORD_IS_USERSPACE(record) ((record)->userspace_event_type >= 0) + +struct EventLogRegisterBank { + TPM2B_DIGEST observed; + TPM2B_DIGEST calculated; +}; + +struct EventLogRegister { + char *color; + unsigned n_measurements; + bool fully_recognized; /* true if all measurements in this register have been recognized to match components */ + EventLogRegisterBank *banks; +}; + +struct EventLogComponentVariant { + EventLogComponent
*component; + + char *id; + char *path; + + EventLogRecord **records; + size_t n_records; +}; + +struct EventLogComponent { + char *id; + + EventLogComponentVariant **variants; + size_t n_variants; +}; + +struct EventLog { + EventLogRecord **records; + size_t n_records; + + uint16_t *algorithms; + size_t n_algorithms; + bool algorithms_locked; /* if algorithms were set explicitly by user, and we should not determine them automatically */ + + const EVP_MD **mds; + + /* The hash algorithm which we focus on for matching up components */ + uint16_t primary_algorithm; + + uint8_t startup_locality; + bool startup_locality_found; + + EventLogRegister registers[TPM2_PCRS_MAX]; + + EventLogComponent **components; + size_t n_components; + + /* Number of components which we couldn't find in the event log */ + size_t n_missing_components; + + /* PCRs mask indicating all PCRs touched by unrecognized components */ + uint32_t missing_component_pcrs; +}; + +static EventLogRecordBank *event_log_record_bank_free(EventLogRecordBank *bank) { + return mfree(bank); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EventLogRecordBank*, event_log_record_bank_free); + +static EventLogRecord *event_log_record_free(EventLogRecord *record) { + EventLogRecordBank *bank; + + if (!record) + return NULL; + + free(record->description); + free(record->firmware_payload); + json_variant_unref(record->userspace_content); + + while ((bank = LIST_POP(banks, record->banks))) + event_log_record_bank_free(bank); + + free(record->mapped); /* only the pointer array is released here; ->source is a const string and is not freed */ + + return mfree(record); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EventLogRecord*, event_log_record_free); + +static void event_log_register_done(EventLog *el, EventLogRegister *reg) { + assert(reg); + + free(reg->color); + free(reg->banks); +} + +static EventLogComponentVariant* event_log_component_variant_free(EventLogComponentVariant *variant) { + if (!variant) + return NULL; + + free(variant->id); + free(variant->path); + + FOREACH_ARRAY(record, variant->records, variant->n_records) +
event_log_record_free(*record); + + free(variant->records); + + return mfree(variant); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EventLogComponentVariant*, event_log_component_variant_free); + +static EventLogComponent* event_log_component_free(EventLogComponent *component) { + if (!component) + return NULL; + + FOREACH_ARRAY(variant, component->variants, component->n_variants) + event_log_component_variant_free(*variant); + free(component->variants); + + free(component->id); + + return mfree(component); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EventLogComponent*, event_log_component_free); + +static EventLog* event_log_free(EventLog *el) { + if (!el) + return NULL; + + FOREACH_ARRAY(p, el->registers, TPM2_PCRS_MAX) + event_log_register_done(el, p); + + FOREACH_ARRAY(rr, el->records, el->n_records) + event_log_record_free(*rr); + free(el->records); + + FOREACH_ARRAY(c, el->components, el->n_components) + event_log_component_free(*c); + free(el->components); + + free(el->algorithms); + free(el->mds); + + return mfree(el); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EventLog*, event_log_free); + +static EventLogRecord* event_log_record_new(EventLog *el) { + EventLogRecord *record; + + record = new(EventLogRecord, 1); + if (!record) + return NULL; + + *record = (EventLogRecord) { + .event_log = el, + .firmware_event_type = UINT32_MAX, + .userspace_event_type = _TPM2_USERSPACE_EVENT_TYPE_INVALID, + .event_payload_valid = _EVENT_PAYLOAD_VALID_INVALID, + }; + + return record; +} + +static int event_log_add_record( + EventLog *el, + EventLogRecord **ret) { + + _cleanup_(event_log_record_freep) EventLogRecord *record = NULL; + + assert(el); + + if (!GREEDY_REALLOC(el->records, el->n_records+1)) + return -ENOMEM; + + record = event_log_record_new(el); + if (!record) + return -ENOMEM; + + el->records[el->n_records++] = record; + + if (ret) + *ret = record; + + TAKE_PTR(record); + + return 0; +} + +static int event_log_add_algorithm(EventLog *el, uint16_t alg) { + assert(el); + + if 
(el->algorithms_locked) /* algorithms configured via env var, don't add any further automatically */ + return 0; + + if (typesafe_bsearch(&alg, el->algorithms, el->n_algorithms, cmp_uint16)) + return 0; + + if (!GREEDY_REALLOC(el->algorithms, el->n_algorithms+1)) + return -ENOMEM; + + el->algorithms[el->n_algorithms++] = alg; + + typesafe_qsort(el->algorithms, el->n_algorithms, cmp_uint16); + + return 1; +} + +static int event_log_add_algorithms_from_environment(EventLog *el) { + const char *e; + int r; + + assert(el); + + e = secure_getenv("SYSTEMD_TPM2_HASH_ALGORITHMS"); + if (!e) + return 0; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&e, &word, ":", 0); + if (r < 0) + return r; + if (r == 0) + break; + + r = tpm2_hash_alg_from_string(word); + if (r < 0) + return log_error_errno(r, "Unknown hash algorithm '%s'.", word); + + r = event_log_add_algorithm(el, r); + if (r < 0) + return log_error_errno(r, "Failed to add hash algorithm '%s'.", word); + } + + if (el->n_algorithms > 0) + el->algorithms_locked = true; + + return 0; +} + +static EventLogRecordBank *event_log_record_find_bank( + const EventLogRecord *record, + uint16_t alg) { + + assert(record); + + LIST_FOREACH(banks, i, record->banks) + if (i->algorithm == alg) + return i; + + return NULL; +} + +static int event_log_record_add_bank( + EventLogRecord *record, + uint16_t algorithm, + const void *hash, + size_t hash_size, + EventLogRecordBank **ret) { + + _cleanup_(event_log_record_bank_freep) EventLogRecordBank *bank = NULL; + _cleanup_free_ void *h = NULL; + + assert(record); + assert(hash || hash_size == 0); + + if (event_log_record_find_bank(record, algorithm)) + return -EEXIST; + + if (hash_size > sizeof_field(TPM2B_DIGEST, buffer)) + return -E2BIG; + + h = memdup(hash, hash_size); + if (!h) + return -ENOMEM; + + bank = new(EventLogRecordBank, 1); + if (!bank) + return -ENOMEM; + + *bank = (EventLogRecordBank) { + .algorithm = algorithm, + .hash = 
TPM2B_DIGEST_MAKE(hash, hash_size), + }; + + LIST_PREPEND(banks, record->banks, bank); + + if (ret) + *ret = bank; + + TAKE_PTR(bank); + + return 0; +} + +static bool event_log_record_is_stub(EventLogRecord *rec) { + assert(rec); + + /* Recognizes the special EV_IPL events systemd-stub generates. Since EV_IPL can be used by almost + * anything, we'll check for the PCR values, to see if it's one of ours. */ + + if (rec->firmware_event_type != EV_IPL) + return false; + + if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec)) + return false; + + if (!IN_SET(rec->pcr, + TPM2_PCR_KERNEL_BOOT, /* 11 */ + TPM2_PCR_KERNEL_CONFIG, /* 12 */ + TPM2_PCR_SYSEXTS)) /* 13 */ + return false; + + return true; +} + +static int event_log_record_parse_variable_data( + EventLogRecord *rec, + sd_id128_t *ret_variable_uuid, + char **ret_variable_name) { + + _cleanup_free_ char16_t *p16 = NULL; + _cleanup_free_ char *p = NULL; + + assert(rec); + assert(ret_variable_uuid); + assert(ret_variable_name); + + if (rec->firmware_payload_size < sizeof(UEFI_VARIABLE_DATA)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "EFI variable field too short."); + + const UEFI_VARIABLE_DATA *vdata = rec->firmware_payload; + + if (vdata->unicodeNameLength > (SIZE_MAX - offsetof(UEFI_VARIABLE_DATA, unicodeNameLength)) / 2) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Unicode name length too large."); + + size_t m = offsetof(UEFI_VARIABLE_DATA, unicodeName) + vdata->unicodeNameLength * 2; + + if (vdata->variableDataLength > SIZE_MAX - m) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Oversize EFI variable data size."); + + if (rec->firmware_payload_size != m + vdata->variableDataLength) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "EFI variable data has wrong size."); + + p16 = memdup(vdata->unicodeName, vdata->unicodeNameLength * 2); /* Copy out, to align properly */ + if (!p16) + return log_oom_debug(); + + p = utf16_to_utf8(p16, vdata->unicodeNameLength * 2); + if (!p) + return log_oom_debug(); + + 
if (!string_is_safe(p)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Unsafe EFI variable string in record."); + + *ret_variable_uuid = efi_guid_to_id128(vdata->variableName); + *ret_variable_name = TAKE_PTR(p); + + return 0; +} + +static int event_log_record_extract_firmware_description(EventLogRecord *rec) { + _cleanup_free_ char *fallback = NULL; + int r; + + assert(rec); + + if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec)) + return 0; + + if (arg_raw_description) + goto catchall; + + switch (rec->firmware_event_type) { + + case EV_EFI_VARIABLE_DRIVER_CONFIG: + case EV_EFI_VARIABLE_BOOT: + case EV_EFI_VARIABLE_BOOT2: + case EV_EFI_VARIABLE_AUTHORITY: { + _cleanup_free_ char *p = NULL; + sd_id128_t uuid; + + r = event_log_record_parse_variable_data(rec, &uuid, &p); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_warning_errno(r, "EFI variable data invalid, ignoring."); + goto invalid; + } + + if (asprintf(&rec->description, "%s: %s-" SD_ID128_UUID_FORMAT_STR, + rec->firmware_event_type == EV_EFI_VARIABLE_AUTHORITY ? 
"Authority" : "Variable", + p, + SD_ID128_FORMAT_VAL(uuid)) < 0) + return log_oom(); + + return 1; + } + + case EV_SEPARATOR: { + if (rec->firmware_payload_size != sizeof(uint32_t)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "EFI separator field has wrong size, ignoring."); + goto invalid; + } + + uint32_t val = unaligned_read_ne32(rec->firmware_payload); + + switch (val) { + + case 0: + case UINT32_C(0xffffffff): + (void) asprintf(&rec->description, "Separator: Success (0x%02" PRIx32 ")", val); + break; + + case 1: + rec->description = strdup("Separator: Error (0x01)"); + break; + + default: + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected separator payload %" PRIu32 ".", val); + goto invalid; + } + + if (!rec->description) + return log_oom(); + + return 1; + } + + case EV_EFI_ACTION: { + _cleanup_free_ char *d = NULL; + + r = make_cstring(rec->firmware_payload, rec->firmware_payload_size, MAKE_CSTRING_ALLOW_TRAILING_NUL, &d); + if (r < 0) + return log_error_errno(r, "Failed to make C string from EFI action string: %m"); + + if (!string_is_safe(d)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Unsafe EFI action string in record, ignoring."); + goto invalid; + } + + rec->description = strjoin("Action: ", d); + if (!rec->description) + return log_oom(); + return 1; + } + + case EV_EFI_GPT_EVENT: { + if (rec->firmware_payload_size < sizeof(GptHeader)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "GPT measurement too short, ignoring."); + goto invalid; + } + + const GptHeader *h = rec->firmware_payload; + + if (!gpt_header_has_signature(h)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "GPT measurement does not cover a GPT partition table header, ignoring."); + goto invalid; + } + + if (asprintf(&rec->description, "GPT: disk " SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(efi_guid_to_id128(h->disk_guid))) < 0) + return log_oom(); + + return 1; + } + + case EV_IPL: { + _cleanup_free_ char *d = NULL; + + /* EV_IPL can be anything, only try to parse 
the description on PCRs we "own" */ + if (!event_log_record_is_stub(rec)) + break; + + /* sd-stub always sets a description string as text for these */ + + d = utf16_to_utf8(rec->firmware_payload, rec->firmware_payload_size); + if (!d) + return log_oom(); + + if (string_has_cc(d, NULL)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Unsafe EFI action string in record, ignoring."); + goto invalid; + } + + rec->description = strjoin("String: ", d); + if (!rec->description) + return log_oom(); + + return 1; + } + + case EV_EVENT_TAG: { + TCG_PCClientTaggedEvent *tag = rec->firmware_payload; + size_t left = rec->firmware_payload_size; + + if (left == 0) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Empty tagged PC client event, ignoring."); + goto invalid; + } + + for (;;) { + uint64_t m; + + if (left < offsetof(TCG_PCClientTaggedEvent, taggedEventData)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Tagged PC client event too short, ignoring."); + goto invalid; + } + + m = offsetof(TCG_PCClientTaggedEvent, taggedEventData) + (uint64_t) tag->taggedEventDataSize; + if (left < m) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Tagged PC client event data too short, ignoring."); + goto invalid; + } + + switch (tag->taggedEventID) { + + /* Linux kernel's own measurements: */ + case INITRD_EVENT_TAG_ID: + /* The tagged event payload is just a constant string, hence don't show it */ + if (!strextend_with_separator(&rec->description, ", ", "Linux: initrd")) + return log_oom(); + break; + + case LOAD_OPTIONS_EVENT_TAG_ID: + /* As above. */ + if (!strextend_with_separator(&rec->description, ", ", "Linux: kernel command line")) + return log_oom(); + break; + + /* systemd's measurements: */ + case LOADER_CONF_EVENT_TAG_ID: + /* As above. 
*/ + if (!strextend_with_separator(&rec->description, ", ", "systemd-boot: loader.conf")) + return log_oom(); + break; + + case DEVICETREE_ADDON_EVENT_TAG_ID: { + _cleanup_free_ char *raw = NULL, *s = NULL; + + raw = utf16_to_utf8((const char16_t*) tag->taggedEventData, tag->taggedEventDataSize); + if (!raw) + return log_oom(); + + s = cescape(raw); + if (!s) + return log_oom(); + + r = strextendf_with_separator(&rec->description, ", ", "systemd-stub: devicetree addon %s", s); + if (r < 0) + return log_error_errno(r, "Failed to format EV_EVENT_TAG description string: %m"); + break; + } + + default: { + _cleanup_free_ char *s = NULL; + + s = cescape_length((char*) tag->taggedEventData, tag->taggedEventDataSize); + if (!s) + return log_oom(); + + r = strextendf_with_separator(&rec->description, ", ", "Tag 0x%" PRIx32 ": %s", tag->taggedEventID, s); + if (r < 0) + return log_error_errno(r, "Failed to format EV_EVENT_TAG description string: %m"); + + break; + }} + + tag = (TCG_PCClientTaggedEvent*) ((uint8_t*) tag + m); + left -= m; + + if (left == 0) + break; + } + + return 1; + } + + case EV_EFI_PLATFORM_FIRMWARE_BLOB: { + const UEFI_PLATFORM_FIRMWARE_BLOB *blob; + if (rec->firmware_payload_size != sizeof(UEFI_PLATFORM_FIRMWARE_BLOB)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "EV_EFI_PLATFORM_FIRMWARE_BLOB of wrong size, ignoring."); + goto invalid; + } + + blob = rec->firmware_payload; + if (asprintf(&rec->description, "Blob: %s @ 0x%" PRIx64, FORMAT_BYTES(blob->blobLength), blob->blobBase) < 0) + return log_oom(); + + return 1; + } + + case EV_EFI_BOOT_SERVICES_APPLICATION: + case EV_EFI_BOOT_SERVICES_DRIVER: + case EV_EFI_RUNTIME_SERVICES_DRIVER: { + const UEFI_IMAGE_LOAD_EVENT *load; + _cleanup_free_ char *fn = NULL; + bool end = false; + + if (rec->firmware_payload_size < offsetof(UEFI_IMAGE_LOAD_EVENT, devicePath)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Device path too short, ignoring."); + goto invalid; + } + + load = rec->firmware_payload; + 
if (load->lengthOfDevicePath != + rec->firmware_payload_size - offsetof(UEFI_IMAGE_LOAD_EVENT, devicePath)) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Device path size does not match, ignoring."); + goto invalid; + } + + const packed_EFI_DEVICE_PATH *dp = (const packed_EFI_DEVICE_PATH*) load->devicePath; + size_t left = load->lengthOfDevicePath; + + for (;;) { + if (left == 0) { + if (!end) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Garbage after device path end, ignoring."); + goto invalid; + } + + break; + } + + if (end) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Garbage after device path end, ignoring."); + goto invalid; + } + + if (left < offsetof(packed_EFI_DEVICE_PATH, path) || left < dp->length) { + log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Device path element too short, ignoring."); + goto invalid; + } + + if (dp->type == 4 && dp->subType == 4) { + /* Filename, store the last node of this type as description, it should contain the file name */ + + free(fn); + fn = utf16_to_utf8((void*) dp->path, dp->length - offsetof(packed_EFI_DEVICE_PATH, path)); + if (!fn) + return log_oom(); + + } else if (dp->type == 0x7F && dp->subType == 0xFF) + /* End of Hardware Device Path */ + end = true; + else + log_debug("Ignoring device path element type=0x%02x subtype=0x%02x", dp->type, dp->subType); + + left -= dp->length; + dp = (packed_EFI_DEVICE_PATH*) ((uint8_t*) dp + dp->length); + } + + if (fn) { + rec->description = strjoin("File: ", fn); + if (!rec->description) + return log_oom(); + + return 1; + } + + break; + }} + +catchall: + /* Catchall: show binary data */ + fallback = cescape_length(rec->firmware_payload, rec->firmware_payload_size); + if (!fallback) + return log_oom(); + + rec->description = strjoin("Raw: ", fallback); + if (!rec->description) + return log_oom(); + return 1; + + +invalid: + /* Mark the payload as invalid, so that we do not bother parsing/validating it any further */ + rec->event_payload_valid = EVENT_PAYLOAD_VALID_NO; + 
return 0; +} + +static int event_log_add_algorithms_from_record(EventLog *el, EventLogRecord *record) { + int r; + + assert(el); + assert(record); + + if (el->algorithms_locked) + return 0; + + LIST_FOREACH(banks, i, record->banks) { + r = event_log_add_algorithm(el, i->algorithm); + if (r < 0) + return r; + } + + return 0; +} + +static int event_log_load_firmware(EventLog *el) { + const TCG_EfiSpecIdEventAlgorithmSize *algorithms; + size_t bufsize = 0, n_algorithms = 0, left = 0; + _cleanup_free_ void *buf = NULL; + const TCG_PCR_EVENT2 *event; + const char *path; + int r; + + assert(el); + + path = tpm2_firmware_log_path(); + + r = read_full_file(path, (char**) &buf, &bufsize); + if (r < 0) + return log_error_errno(r, "Failed to open TPM2 event log '%s': %m", path); + + if (bufsize == 0) { + /* Sometimes it's useful to invoke things with SYSTEMD_MEASURE_LOG_FIRMWARE=/dev/null, let's allow that, and proceed */ + log_warning("Empty firmware event log file, not loading."); + return 0; + } + + r = validate_firmware_header(buf, bufsize, &algorithms, &n_algorithms, &event, &left); + if (r < 0) + return r; + + for (const TCG_PCR_EVENT2 *next_event = NULL;; event = next_event) { + EventLogRecord *record = NULL; + const void *payload; + size_t payload_size; + + r = validate_firmware_event( + event, + left, + algorithms, + n_algorithms, + &next_event, + &left, + &payload, + &payload_size); + if (r < 0) + return r; + if (r == 0) + break; + + if (event->eventType == EV_NO_ACTION && + event->pcrIndex == 0 && + payload_size == 17 && + memcmp(payload, "StartupLocality", sizeof("StartupLocality")) == 0) { + if (el->startup_locality_found) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "StartupLocality event found twice!"); + + el->startup_locality = ((const uint8_t*) payload)[sizeof("StartupLocality")]; + el->startup_locality_found = true; + log_debug("Found StartupLocality event: %u", el->startup_locality); + continue; + } + + if (event->eventType == EV_NO_ACTION) { /* 
Ignore pseudo events, that don't result in a measurement */ + log_debug("Skipping NO_ACTION event."); + continue; + } + + r = event_log_add_record(el, &record); + if (r < 0) + return log_error_errno(r, "Failed to add record to event log: %m"); + + record->pcr = event->pcrIndex; + record->source = path; + record->firmware_event_type = event->eventType; + record->firmware_payload = memdup(payload, payload_size); + if (!record->firmware_payload) + return log_oom(); + record->firmware_payload_size = payload_size; + + const void *ha, *ha_next = NULL; + ha = (const uint8_t*) event + offsetof(TCG_PCR_EVENT2, digests.digests); + assert(event->digests.count == n_algorithms); + + for (size_t i = 0; i < n_algorithms; i++, ha = ha_next) { + ha_next = (const uint8_t*) ha + offsetof(TPMT_HA, digest) + algorithms[i].digestSize; + + /* The TPMT_HA is not aligned in the record, hence read the hashAlg field via an unaligned read */ + assert_cc(__builtin_types_compatible_p(uint16_t, typeof(TPMI_ALG_HASH))); + uint16_t hash_alg = unaligned_read_ne16((const uint8_t*) ha + offsetof(TPMT_HA, hashAlg)); + + if (hash_alg != algorithms[i].algorithmId) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Hash algorithms in event log record don't match log."); + + if (!tpm2_hash_alg_to_string(algorithms[i].algorithmId)) + continue; + + r = event_log_record_add_bank( + record, + algorithms[i].algorithmId, + (const uint8_t*) ha + offsetof(TPMT_HA, digest), + algorithms[i].digestSize, + /* ret= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to add bank to event log record: %m"); + } + + /* Try to extract a descriptive text */ + r = event_log_record_extract_firmware_description(record); + if (r < 0) + return r; + + r = event_log_add_algorithms_from_record(el, record); + if (r < 0) + return r; + } + + return 0; +} + +static int event_log_record_parse_json(EventLogRecord *record, JsonVariant *j) { + const char *rectype = NULL; + JsonVariant *x, *k; + uint64_t u; + int r; + + 
assert(record); + assert(j); + + if (!json_variant_is_object(j)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "record object is not an object."); + + x = json_variant_by_key(j, "pcr"); + if (!x) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'pcr' field missing from TPM measurement log file entry."); + if (!json_variant_is_unsigned(x)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'pcr' field is not an integer."); + + u = json_variant_unsigned(x); + if (u >= TPM2_PCRS_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'pcr' field is out of range."); + record->pcr = json_variant_unsigned(x); + + x = json_variant_by_key(j, "digests"); + if (!x) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'digests' field missing from TPM measurement log file entry."); + if (!json_variant_is_array(x)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'digests' field is not an array."); + + JSON_VARIANT_ARRAY_FOREACH(k, x) { + _cleanup_free_ void *hash = NULL; + size_t hash_size; + JsonVariant *a, *h; + int na; + + a = json_variant_by_key(k, "hashAlg"); + if (!a) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'digests' field element lacks 'hashAlg' field."); + if (!json_variant_is_string(a)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'hashAlg' field is not a string."); + + na = tpm2_hash_alg_from_string(json_variant_string(a)); + if (na < 0) { + log_debug_errno(na, "Unsupported hash '%s' in userspace event log, ignoring: %m", json_variant_string(a)); + continue; + } + + h = json_variant_by_key(k, "digest"); + if (!h) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'digests' field lacks 'digest' field"); + + r = json_variant_unhex(h, &hash, &hash_size); + if (r < 0) + return log_error_errno(r, "Failed to decode digest: %m"); + + r = event_log_record_add_bank( + record, + na, + hash, + hash_size, + /* ret= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to add bank to event log record: %m"); + } + + x = json_variant_by_key(j, 
"content_type"); + if (!x) + log_debug("'content_type' missing from TPM measurement log file entry, ignoring."); + else { + if (!json_variant_is_string(x)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'content_type' field is not a string."); + + rectype = json_variant_string(x); + } + + if (streq_ptr(rectype, "systemd")) { + JsonVariant *y; + + x = json_variant_by_key(j, "content"); + if (!x) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'content' field missing from TPM measurement log file entry."); + if (!json_variant_is_object(x)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'content' sub-object is not an object."); + + y = json_variant_by_key(x, "string"); + if (y) { + if (!json_variant_is_string(y)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'string' field is not a string."); + + r = free_and_strdup_warn(&record->description, json_variant_string(y)); + if (r < 0) + return r; + } + + y = json_variant_by_key(x, "eventType"); + if (y) { + if (!json_variant_is_string(y)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'eventType' field is not a string."); + + record->userspace_event_type = tpm2_userspace_event_type_from_string(json_variant_string(y)); + if (record->userspace_event_type < 0) + log_debug_errno(record->userspace_event_type, "Unknown userspace event type '%s', ignoring.", json_variant_string(y)); + } + + json_variant_unref(record->userspace_content); + record->userspace_content = json_variant_ref(x); + } + + return 0; +} + +static int event_log_load_userspace(EventLog *el) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *b = NULL; + bool beginning = true; + const char *path; + size_t bn = 0; + int r; + + assert(el); + + path = tpm2_userspace_log_path(); + + f = fopen(path, "re"); + if (!f) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open userspace TPM measurement log file: %m"); + + return 0; + } + + if (flock(fileno(f), LOCK_SH) < 0) + return log_error_errno(errno, "Failed to lock 
userspace TPM measurement log file: %m"); + + for (;;) { + _cleanup_(json_variant_unrefp) JsonVariant *j = NULL; + EventLogRecord *record; + int ch; + + ch = fgetc(f); + if (ch == EOF) { + if (ferror(f)) + return log_error_errno(errno, "Failed to read local TPM measurement log file: %m"); + + if (beginning) + break; + } else if (ch != 0x1EU) { + if (!GREEDY_REALLOC(b, bn + 2)) + return log_oom(); + + b[bn++] = (char) ch; + continue; + } + + if (beginning) { + beginning = false; + continue; + } + + if (!GREEDY_REALLOC(b, bn + 1)) + return log_oom(); + + b[bn] = 0; /* Turn it into a string */ + + if (memchr(b, 0, bn)) { + log_warning("Found record with embedded NUL byte, skipping."); + continue; + } + + r = json_parse(b, 0, &j, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse local TPM measurement log file: %m"); + + r = event_log_add_record(el, &record); + if (r < 0) + return log_error_errno(r, "Failed to add record to event log: %m"); + + record->source = path; + + r = event_log_record_parse_json(record, j); + if (r < 0) + return r; + + r = event_log_add_algorithms_from_record(el, record); + if (r < 0) + return r; + + if (ch == EOF) + break; + + b = mfree(b); + bn = 0; + } + + return 0; +} + +static EventLog *event_log_new(void) { + _cleanup_(event_log_freep) EventLog *el = NULL; + + el = new(EventLog, 1); + if (!el) + return NULL; + + *el = (EventLog) { + .primary_algorithm = UINT16_MAX, + }; + + return TAKE_PTR(el); +} + +static int event_log_load(EventLog *el) { + int r; + + assert(el); + + r = event_log_load_firmware(el); + if (r < 0) + return r; + + r = event_log_load_userspace(el); + if (r < 0) + return r; + + return 0; +} + +static int event_log_read_pcrs(EventLog *el) { + _cleanup_(tpm2_context_unrefp) Tpm2Context *tc = NULL; + int r; + + assert(el); + + r = tpm2_context_new(NULL, &tc); + if (r < 0) + return r; + + FOREACH_ARRAY(rr, el->registers, TPM2_PCRS_MAX) { + if (rr->banks) + continue; + + rr->banks = new0(EventLogRegisterBank, 
el->n_algorithms); + if (!rr->banks) + return log_oom(); + } + + for (size_t a = 0; a < el->n_algorithms; a++) { + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values; + TPML_PCR_SELECTION selection; + + tpm2_tpml_pcr_selection_from_mask(TPM2_PCRS_MASK, el->algorithms[a], &selection); + r = tpm2_pcr_read(tc, &selection, &pcr_values, &n_pcr_values); + if (r < 0) + return r; + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) { + assert(v->hash == el->algorithms[a]); + el->registers[v->index].banks[a].observed = v->value; + } + } + + return 0; +} + +static void event_log_initial_pcr_state(EventLog *el, uint32_t pcr, size_t size, TPM2B_DIGEST *ret) { + assert(el); + assert(pcr < TPM2_PCRS_MAX); + assert(size > 0); + assert(size <= sizeof_field(TPM2B_DIGEST, buffer)); + assert(ret); + + ret->size = size; + + switch (pcr) { + + case 0: + memzero(ret->buffer, ret->size-1); + ((uint8_t*) ret->buffer)[ret->size-1] = el->startup_locality_found ? el->startup_locality : 0; + break; + + case 1 ... 16: + case 23: + memzero(ret->buffer, ret->size); + break; + + case 17 ... 
22: + memset(ret->buffer, 0xffu, ret->size); + break; + + default: + assert_not_reached(); + } +} + +static int event_log_calculate_pcrs(EventLog *el) { + assert(el); + + /* Iterates through the event log an calculates the expected hash values based on all listed records */ + + assert(!el->mds); + el->mds = new(const EVP_MD*, el->n_algorithms); + if (!el->mds) + return log_oom(); + + for (size_t i = 0; i < el->n_algorithms; i++) { + const EVP_MD *md; + const char *a; + + assert_se(a = tpm2_hash_alg_to_string(el->algorithms[i])); + assert_se(md = EVP_get_digestbyname(a)); + + el->mds[i] = md; + } + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) + for (size_t i = 0; i < el->n_algorithms; i++) { + EventLogRegisterBank *b = el->registers[pcr].banks + i; + event_log_initial_pcr_state(el, pcr, EVP_MD_size(el->mds[i]), &b->calculated); + } + + FOREACH_ARRAY(rr, el->records, el->n_records) { + EventLogRegister *reg = el->registers + (*rr)->pcr; + + for (size_t i = 0; i < el->n_algorithms; i++) { + const char *n = tpm2_hash_alg_to_string(el->algorithms[i]); + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *mc = NULL; + EventLogRegisterBank *reg_b; + EventLogRecordBank *rec_b; + unsigned sz; + + rec_b = event_log_record_find_bank(*rr, el->algorithms[i]); + if (!rec_b) { + log_warning_errno(SYNTHETIC_ERRNO(ENXIO), "Record with missing bank '%s', ignoring.", n); + continue; + } + + reg_b = reg->banks + i; + + mc = EVP_MD_CTX_new(); + if (!mc) + return log_oom(); + + if (EVP_DigestInit_ex(mc, el->mds[i], NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize %s message digest context.", n); + + if (EVP_DigestUpdate(mc, reg_b->calculated.buffer, reg_b->calculated.size) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to run digest."); + + if (EVP_DigestUpdate(mc, rec_b->hash.buffer, rec_b->hash.size) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to run digest."); + + if (EVP_DigestFinal_ex(mc, reg_b->calculated.buffer, 
&sz) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize hash context."); + + assert(sz == reg_b->calculated.size); + } + + reg->n_measurements++; + } + + return 0; +} + +static int event_log_record_validate_hash_firmware( + EventLogRecord *record, + EventLogRecordBank *bank, + const EVP_MD *md) { + + _cleanup_free_ void *hdata_alternative = NULL; + size_t hsz, hsz_alternative = 0; + bool strict = false; + const void *hdata; + + assert(record); + assert(bank); + assert(md); + + if (!EVENT_LOG_RECORD_IS_FIRMWARE(record)) + return 0; + + switch (record->firmware_event_type) { + + case EV_EFI_ACTION: + case EV_EFI_GPT_EVENT: + case EV_EFI_VARIABLE_BOOT2: + case EV_EFI_VARIABLE_DRIVER_CONFIG: + case EV_EFI_VARIABLE_AUTHORITY: + case EV_SEPARATOR: + case EV_S_CRTM_VERSION: + /* Here the extended hash value is the hash value of the event payload. Note that + * EV_PLATFORM_CONFIG_FLAGS (according to the TCG PC Client Platform Firmware Profile + * Specification) is also supposed to be like this. But ovmf doesn't follow this requirement, + * hence be lenient on that one, and don't include it here. */ + hdata = record->firmware_payload; + hsz = record->firmware_payload_size; + strict = true; + break; + + case EV_EFI_VARIABLE_BOOT: { + const UEFI_VARIABLE_DATA *vdata = record->firmware_payload; + size_t skip; + + /* Here the extended hash value is the hash value of the variable data (i.e. excluding the + * name). + * + * Note: we already checked the general validity of the UEFI_VARIABLE_DATA structure, hence + * no need to do so again. 
*/ + + assert(record->firmware_payload_size >= offsetof(UEFI_VARIABLE_DATA, unicodeName)); + skip = offsetof(UEFI_VARIABLE_DATA, unicodeName) + vdata->unicodeNameLength * 2; + + assert(record->firmware_payload_size >= skip); + hdata = (const uint8_t*) record->firmware_payload + skip; + hsz = record->firmware_payload_size - skip; + strict = true; + break; + } + + case EV_IPL: + if (event_log_record_is_stub(record)) { + /* The PE section names have a descriptive string in UTF-16 in the payload, but the + * hash is over the UTF-8 version (with suffixing 0), hence let's convert the payload + * into that format here, and see if it checks out. */ + hdata_alternative = utf16_to_utf8(record->firmware_payload, record->firmware_payload_size); + if (!hdata_alternative) + return log_oom(); + + hsz_alternative = strlen(hdata_alternative) + 1; /* with NUL byte */ + } + + _fallthrough_; + + default: + /* For the others check the data too, just in case. But usually this will not match, hence + * only report if the checksum matches, but don't complain if it does not. */ + hdata = record->firmware_payload; + hsz = record->firmware_payload_size; + strict = false; + break; + } + + int mdsz = EVP_MD_size(md); + assert(mdsz > 0); + assert((size_t) mdsz <= sizeof_field(TPM2B_DIGEST, buffer)); + + TPM2B_DIGEST payload_hash = { + .size = mdsz, + }; + + unsigned dsz = mdsz; + + if (EVP_Digest(hdata, hsz, payload_hash.buffer, &dsz, md, NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to calculate event payload hash."); + assert(dsz == (unsigned) mdsz); + + /* If this didn't match then let's try the alternative format here, if we have one, and check things then. 
*/ + if (memcmp_nn(bank->hash.buffer, bank->hash.size, payload_hash.buffer, payload_hash.size) != 0 && hdata_alternative) { + if (EVP_Digest(hdata_alternative, hsz_alternative, payload_hash.buffer, &dsz, md, NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to calculate event payload hash."); + assert(dsz == (unsigned) mdsz); + } + + if (memcmp_nn(bank->hash.buffer, bank->hash.size, payload_hash.buffer, payload_hash.size) != 0) { + if (strict) + record->event_payload_valid = EVENT_PAYLOAD_VALID_NO; + else if (record->event_payload_valid != EVENT_PAYLOAD_VALID_NO) + record->event_payload_valid = EVENT_PAYLOAD_VALID_DONT_KNOW; + } else if (record->event_payload_valid < 0) + record->event_payload_valid = EVENT_PAYLOAD_VALID_YES; + + return 1; +} + +static int event_log_record_validate_hash_userspace( + EventLogRecord *record, + EventLogRecordBank *bank, + const EVP_MD *md) { + + _cleanup_free_ unsigned char *payload_hash = NULL; + unsigned payload_hash_size; + JsonVariant *js; + const char *s; + int mdsz; + + assert(record); + assert(bank); + assert(md); + + if (!EVENT_LOG_RECORD_IS_USERSPACE(record)) + return 0; + + if (!record->userspace_content) + return 0; + + js = json_variant_by_key(record->userspace_content, "string"); + if (!js) + return 0; + + assert(json_variant_is_string(js)); + s = json_variant_string(js); + + mdsz = EVP_MD_size(md); + assert(mdsz > 0); + + payload_hash_size = mdsz; + payload_hash = malloc(payload_hash_size); + if (!payload_hash) + return log_oom(); + + if (EVP_Digest(s, strlen(s), payload_hash, &payload_hash_size, md, NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to calculate event payload hash."); + + assert((int) payload_hash_size == mdsz); + if (memcmp_nn(bank->hash.buffer, bank->hash.size, payload_hash, payload_hash_size) != 0) + record->event_payload_valid = EVENT_PAYLOAD_VALID_NO; + else if (record->event_payload_valid < 0) + record->event_payload_valid = EVENT_PAYLOAD_VALID_YES; + + 
return 0; +} + +static int event_log_validate_record_hashes(EventLog *el) { + int r; + + assert(el); + + /* For records which contain the full data to validate the hashes, do so. */ + + FOREACH_ARRAY(rr, el->records, el->n_records) { + + LIST_FOREACH(banks, bank, (*rr)->banks) { + const EVP_MD *md; + const char *a; + + assert_se(a = tpm2_hash_alg_to_string(bank->algorithm)); + assert_se(md = EVP_get_digestbyname(a)); + + r = event_log_record_validate_hash_firmware(*rr, bank, md); + if (r < 0) + return r; + + r = event_log_record_validate_hash_userspace(*rr, bank, md); + if (r < 0) + return r; + } + } + + return 0; +} + +static int event_log_component_cmp(EventLogComponent *const*a, EventLogComponent *const*b) { + const EventLogComponent *x = ASSERT_PTR(*ASSERT_PTR(a)), *y = ASSERT_PTR(*ASSERT_PTR(b)); + + return strcmp(x->id, y->id); +} + +static EventLogComponent *event_log_find_component(EventLog *el, const char *id) { + EventLogComponent k = { + .id = (char*) id, + }; + EventLogComponent *kk = &k, **found; + + assert(el); + assert(id); + + found = typesafe_bsearch( + &kk, + el->components, + el->n_components, + event_log_component_cmp); + if (!found) + return NULL; + + return *found; +} + +static int event_log_add_component(EventLog *el, const char *id, EventLogComponent **ret) { + _cleanup_(event_log_component_freep) EventLogComponent *component = NULL; + _cleanup_free_ char *id_copy = NULL; + EventLogComponent *found; + + assert(el); + assert(ret); + + found = event_log_find_component(el, id); + if (found) { + *ret = found; + return 0; + } + + if (!GREEDY_REALLOC(el->components, el->n_components+1)) + return log_oom(); + + id_copy = strdup(id); + if (!id_copy) + return log_oom(); + + component = new(EventLogComponent, 1); + if (!component) + return log_oom(); + + *component = (EventLogComponent) { + .id = TAKE_PTR(id_copy), + }; + + if (ret) + *ret = component; + + el->components[el->n_components++] = TAKE_PTR(component); + return 1; +} + +static int 
event_log_record_equal(const EventLogRecord *a, const EventLogRecord *b) { + EventLogRecordBank *x, *y; + + assert(a); + assert(a->event_log); + assert(b); + assert(b->event_log); + assert(a->event_log == b->event_log); + + if (a->pcr != b->pcr) + return false; + + x = event_log_record_find_bank(a, a->event_log->primary_algorithm); + y = event_log_record_find_bank(b, b->event_log->primary_algorithm); + if (!x || !y) + return false; + + assert(x->algorithm == a->event_log->primary_algorithm); + assert(y->algorithm == b->event_log->primary_algorithm); + + return memcmp_nn(x->hash.buffer, x->hash.size, y->hash.buffer, y->hash.size) == 0; +} + +static int event_log_add_component_file(EventLog *el, EventLogComponent *component, const char *path) { + _cleanup_(event_log_component_variant_freep) EventLogComponentVariant *variant = NULL; + _cleanup_free_ char *fname = NULL, *id = NULL, *path_copy = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *j = NULL; + JsonVariant *records; + const char *e; + int r; + + assert(el); + + r = path_extract_filename(path, &fname); + if (r < 0) + return log_error_errno(r, "Failed to extract basename from path %s: %m", path); + + e = endswith(fname, ".pcrlock"); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Bad suffix: %s", fname); + + id = strndup(fname, e - fname); + if (!id) + return log_oom(); + + if (!component) { + r = event_log_add_component(el, id, &component); + if (r < 0) + return r; + } + + if (!GREEDY_REALLOC(component->variants, component->n_variants+1)) + return log_oom(); + + r = json_parse_file( + /* f= */ NULL, + path, + /* flags= */ 0, + &j, + /* ret_line= */ NULL, + /* ret_column= */ NULL); + if (r < 0) { + log_warning_errno(r, "Failed to parse component file %s, ignoring: %m", path); + return 0; + } + + if (!json_variant_is_object(j)) { + log_warning_errno(r, "Component file %s does not contain JSON object, ignoring.", path); + return 0; + } + + path_copy = strdup(path); + if (!path_copy) + return 
log_oom(); + + variant = new(EventLogComponentVariant, 1); + if (!variant) + return log_oom(); + + *variant = (EventLogComponentVariant) { + .component = component, + .path = TAKE_PTR(path_copy), + .id = TAKE_PTR(id), + }; + + records = json_variant_by_key(j, "records"); + if (records) { + JsonVariant *rj; + + if (!json_variant_is_array(records)) { + log_warning_errno(r, "Component records field of file %s is not an array, ignoring.", path); + return 0; + } + + JSON_VARIANT_ARRAY_FOREACH(rj, records) { + _cleanup_(event_log_record_freep) EventLogRecord *record = NULL; + + if (!GREEDY_REALLOC(variant->records, variant->n_records+1)) + return log_oom(); + + record = event_log_record_new(el); + if (!record) + return log_oom(); + + r = event_log_record_parse_json(record, rj); + if (r < 0) + return r; + + record->owning_component_variant = variant; + variant->records[variant->n_records++] = TAKE_PTR(record); + } + } + + component->variants[component->n_variants++] = TAKE_PTR(variant); + return 1; +} + +static int event_log_add_component_dir(EventLog *el, const char *path, char **base_search) { + _cleanup_free_ char *fname = NULL, *id = NULL; + _cleanup_strv_free_ char **files = NULL; + EventLogComponent *component; + const char *e; + int r; + + assert(el); + + r = path_extract_filename(path, &fname); + if (r < 0) + return log_error_errno(r, "Failed to extract basename from path %s: %m", path); + + e = endswith(fname, ".pcrlock.d"); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Bad suffix: %s", fname); + + id = strndup(fname, e - fname); + if (!id) + return log_oom(); + + r = event_log_add_component(el, id, &component); + if (r < 0) + return r; + + _cleanup_strv_free_ char **search = NULL; + + STRV_FOREACH(b, base_search) { + _cleanup_free_ char *q = NULL; + + q = path_join(*b, fname); + if (!q) + return log_oom(); + + r = strv_consume(&search, TAKE_PTR(q)); + if (r < 0) + return log_oom(); + } + + r = conf_files_list_strv(&files, ".pcrlock", /* root= */ 
NULL, CONF_FILES_REGULAR, (const char*const*) search); + if (r < 0) + return log_error_errno(r, "Failed to enumerate .pcrlock files for component '%s': %m", id); + + STRV_FOREACH(f, files) { + r = event_log_add_component_file(el, component, *f); + if (r < 0) + return r; + } + + return 0; +} + /* Enumerates component definitions below the directories in arg_components, or below the built-in pcrlock.d list if that is unset, and loads them: entries ending in .pcrlock.d are handled as component directories, entries ending in .pcrlock as single component files, anything else is ignored. Returns 0 on success, negative on error. */ +static int event_log_load_components(EventLog *el) { + _cleanup_strv_free_ char **files = NULL; + char **dirs; + int r; + + assert(el); + + dirs = arg_components ?: + STRV_MAKE("/etc/pcrlock.d", + "/run/pcrlock.d", + "/var/lib/pcrlock.d", + "/usr/local/lib/pcrlock.d", + "/usr/lib/pcrlock.d"); + + r = conf_files_list_strv(&files, NULL, NULL, CONF_FILES_REGULAR|CONF_FILES_DIRECTORY|CONF_FILES_FILTER_MASKED, (const char*const*) dirs); + if (r < 0) + return log_error_errno(r, "Failed to enumerate .pcrlock files: %m"); + + STRV_FOREACH(f, files) { + if (endswith(*f, ".pcrlock.d")) + r = event_log_add_component_dir(el, *f, dirs); + else if (endswith(*f, ".pcrlock")) + r = event_log_add_component_file(el, NULL, *f); + else + continue; + if (r < 0) + return r; + } + + return 0; +} + /* For every PCR register, stores whether all event log records of that PCR were mapped to at least one component variant (n_mapped > 0). Only the first unmapped record of each PCR is logged. Always returns 0. */ +static int event_log_validate_fully_recognized(EventLog *el) { + + for (uint32_t pcr = 0; pcr < ELEMENTSOF(el->registers); pcr++) { + bool fully_recognized = true; + + FOREACH_ARRAY(rr, el->records, el->n_records) { + EventLogRecord *rec = *rr; + + if (rec->pcr != pcr) + continue; + + if (rec->n_mapped == 0) { + log_notice("Event log record %zu (PCR %" PRIu32 ", \"%s\") not matching any component.", + (size_t) (rr - el->records), rec->pcr, strna(rec->description)); + fully_recognized = false; + break; + } + } + + el->registers[pcr].fully_recognized = fully_recognized; + } + + return 0; +} + /* Recursively checks whether the records of the variant from index j onwards appear, in order but not necessarily adjacent, in the event log from index i onwards. If assign is true, every event log record that was matched gets the variant appended to its mapped array. Returns true on match, false on no match, negative on allocation failure. */ +static int event_log_match_component_variant( + EventLog *el, + size_t i, + EventLogComponentVariant *variant, + size_t j, + bool assign) { + + int r; + + assert(el); + assert(variant); + + /* It's OK to point immediately after the last record, but not further */ + assert(i <= el->n_records); + assert(j <=
variant->n_records); + + /* All entries in the variant checked out? Yippieh! */ + if (j == variant->n_records) + return true; + + /* If the remainder of the variant is longer than the remainder of the event log, it cannot possibly fit. */ + if (el->n_records - i < variant->n_records - j) + return false; + + /* Does this record match? If not, let's try at the next place in the logs. */ + if (!event_log_record_equal(el->records[i], variant->records[j])) + return event_log_match_component_variant(el, i + 1, variant, j, assign); /* Recursion! */ + + /* This one matches. Good. Let's see if the rest also matches. (Recursion!) */ + r = event_log_match_component_variant(el, i + 1, variant, j + 1, assign); + if (r <= 0) + return r; + /* NOTE(review): if the rest does not match after a hit at i, no later log position is tried for variant record j (no backtracking) */ + if (assign) { + /* Take ownership (Note we allow multiple components and variants to take ownership of the same record!) */ + if (!GREEDY_REALLOC(el->records[i]->mapped, el->records[i]->n_mapped+1)) + return log_oom(); + + el->records[i]->mapped[el->records[i]->n_mapped++] = variant; + } + + return true; +} + /* Builds a bitmask with bit N set for every PCR N that has a record in variant i. NOTE(review): assumes every record pcr is below 32, otherwise the shift is undefined; presumably enforced when records are parsed, confirm. */ +static uint32_t event_log_component_variant_pcrs(EventLogComponentVariant *i) { + uint32_t mask = 0; + + assert(i); + + /* returns mask of PCRs touched by this variant */ + + FOREACH_ARRAY(rr, i->records, i->n_records) + mask |= UINT32_C(1) << (*rr)->pcr; + + return mask; +} + /* ORs together the PCR masks of all variants of component c. */ +static uint32_t event_log_component_pcrs(EventLogComponent *c) { + uint32_t mask = 0; + + assert(c); + + /* Returns mask of PCRs touched by this component */ + + FOREACH_ARRAY(ii, c->variants, c->n_variants) + mask |= event_log_component_variant_pcrs(*ii); + + return mask; +} + /* Matches every loaded component against the event log. Components whose id sorts after arg_location_end are skipped. Of the remaining ones, a variant is assigned to the log records it matched only if no matching or empty variant of the same component was seen before it. Components without any matching or empty variant are counted as missing, unless their id sorts at or after arg_location_start. Finishes by computing the fully_recognized flag of each PCR. */ +static int event_log_map_components(EventLog *el) { + _cleanup_free_ char *skipped_ids = NULL; + unsigned n_skipped = 0; + int r; + + assert(el); + + FOREACH_ARRAY(cc, el->components, el->n_components) { + _cleanup_free_ char *matching_ids = NULL; + unsigned n_matching = 0, n_empty = 0; + EventLogComponent *c = *cc; + + if (arg_location_end && strcmp(c->id, arg_location_end) > 0) { +
n_skipped++; + + if (!strextend_with_separator(&skipped_ids, ", ", c->id)) + return log_oom(); + + continue; + } + + FOREACH_ARRAY(ii, c->variants, c->n_variants) { + EventLogComponentVariant *i = *ii; + + if (i->n_records == 0) { + /* The empty variant always matches */ + n_empty++; + continue; + } + /* Records are assigned only while nothing of this component has matched yet */ + r = event_log_match_component_variant(el, 0, i, 0, n_matching + n_empty == 0); + if (r < 0) + return r; + if (r > 0) { + n_matching++; + + if (!strextend_with_separator(&matching_ids, ", ", i->id)) + return log_oom(); + } + } + + if (n_matching + n_empty == 0) { + + if (arg_location_start && strcmp(c->id, arg_location_start) >= 0) + log_info("Didn't find component '%s' in event log, assuming system hasn't reached it yet.", c->id); + else { + log_notice("Couldn't find component '%s' in event log.", c->id); + el->n_missing_components++; + el->missing_component_pcrs |= event_log_component_pcrs(c); + } + } else if (n_matching > 1) + log_debug("Found %u possible variants of component '%s' in event log (%s). 
Proceeding.", n_matching, c->id, matching_ids); + } + + if (n_skipped > 0) + log_notice("Skipped %u components after location '%s' (%s).", n_skipped, arg_location_end, skipped_ids); + if (el->n_missing_components > 0) + log_notice("Unable to recognize %zu components in event log.", el->n_missing_components); + + return event_log_validate_fully_recognized(el); +} + /* Converts an HSV color into 8 bit RGB components. The hue h is in degrees, s and v are percentages and must lie in 0..100. A hue outside the range 0 to 300 is handled by the final else branch, the one otherwise reached for 300..360. */ +static void hsv_to_rgb( + double h, double s, double v, + uint8_t* ret_r, uint8_t *ret_g, uint8_t *ret_b) { + + double c, x, m, r, g, b; + + assert(s >= 0 && s <= 100); + assert(v >= 0 && v <= 100); + assert(ret_r); + assert(ret_g); + assert(ret_b); + + c = (s / 100.0) * (v / 100.0); + x = c * (1 - fabs(fmod(h / 60.0, 2) - 1)); + m = (v / 100) - c; + + if (h >= 0 && h < 60) + r = c, g = x, b = 0.0; + else if (h >= 60 && h < 120) + r = x, g = c, b = 0.0; + else if (h >= 120 && h < 180) + r = 0.0, g = c, b = x; + else if (h >= 180 && h < 240) + r = 0.0, g = x, b = c; + else if (h >= 240 && h < 300) + r = x, g = 0.0, b = c; + else + r = c, g = 0.0, b = x; + + *ret_r = (uint8_t) ((r + m) * 255); + *ret_g = (uint8_t) ((g + m) * 255); + *ret_b = (uint8_t) ((b + m) * 255); +} + /* Buffer size for the escape sequence written by ansi_true_color(): presumably 7 bytes of prefix, three decimal fields of up to 3 digits with two separators, and 2 bytes for the final m plus the NUL terminator. */ +#define ANSI_TRUE_COLOR_MAX (7U + 3U + 1U + 3U + 1U + 3U + 2U) + /* Formats the true color escape sequence for the given RGB value into ret and returns ret. */ +static const char *ansi_true_color(uint8_t r, uint8_t g, uint8_t b, char ret[static ANSI_TRUE_COLOR_MAX]) { + snprintf(ret, ANSI_TRUE_COLOR_MAX, "\x1B[38;2;%u;%u;%um", r, g, b); + return ret; +} + /* Returns the color escape sequence of a PCR. It is computed on first use, with the hue spread evenly over the PCR indexes, and cached as a heap copy in el->registers[pcr].color. Returns NULL if that copy could not be allocated. */ +static char *color_for_pcr(EventLog *el, uint32_t pcr) { + char color[ANSI_TRUE_COLOR_MAX]; + uint8_t r, g, b; + + assert(el); + assert(pcr < TPM2_PCRS_MAX); + + if (el->registers[pcr].color) + return el->registers[pcr].color; + + hsv_to_rgb(360.0 / (TPM2_PCRS_MAX - 1) * pcr, 100, 90, &r, &g, &b); + ansi_true_color(r, g, b, color); + + el->registers[pcr].color = strdup(color); + return el->registers[pcr].color; +} + /* Appends one header cell per hash algorithm of the event log to the table. The header is the algorithm name, preceded by prefix and a space if prefix is set. The JSON field name is the algorithm name, or json_field_prefix followed by the upper-cased algorithm name. When JSON output is off and a primary algorithm is set, the columns of all other algorithms are hidden from display. */ +static int add_algorithm_columns( + EventLog *el, + Table *table, + const char *prefix, + const char *json_field_prefix) { + + int r; + + assert(el); +
assert(table); + + FOREACH_ARRAY(alg, el->algorithms, el->n_algorithms) { + const char *n = tpm2_hash_alg_to_string(*alg); + _cleanup_free_ char *v = NULL; + + if (prefix) { + v = strjoin(prefix, " ", n); + if (!v) + return log_oom(); + } + /* Remember the column index before the cell is added, it is needed for the per-column settings below */ + size_t c = table_get_current_column(table); + + r = table_add_cell(table, NULL, TABLE_HEADER, v ?: n); + if (r < 0) + return table_log_add_error(r); + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF) && + el->primary_algorithm != UINT16_MAX && + *alg != el->primary_algorithm) + (void) table_hide_column_from_display(table, c); + + _cleanup_free_ char *j = NULL; + if (json_field_prefix) { + _cleanup_free_ char *m = strdup(n); + if (!m) + return log_oom(); + + j = strjoin(json_field_prefix, ascii_strupper(m)); + if (!j) + return log_oom(); + } + + (void) table_set_json_field_name(table, c, j ?: n); + } + + return 0; +} + /* Builds a table with one row per event log record: PCR index, color block, PCR name, event type, payload match, one digest column per algorithm, firmware or userspace marker, source, component and description. If ret_variant is set the table is converted to JSON and returned there, otherwise it is printed through the pager. */ +static int show_log_table(EventLog *el, JsonVariant **ret_variant) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(el); + /* 5 fixed leading columns, one column per algorithm, 4 trailing columns */ + table = table_new_raw(5 + el->n_algorithms + 4); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + r = table_add_many(table, + TABLE_HEADER, "pcr", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_HEADER, "", + TABLE_HEADER, "pcrname", + TABLE_HEADER, "event", + TABLE_HEADER, "match", + TABLE_SET_ALIGN_PERCENT, 100); + if (r < 0) + return table_log_add_error(r); + + r = add_algorithm_columns(el, table, NULL, NULL); + if (r < 0) + return r; + + size_t phase_column = table_get_current_column(table); + + r = table_add_many(table, + TABLE_HEADER, "F/U", + TABLE_HEADER, "source", + TABLE_HEADER, "component", + TABLE_HEADER, "description"); + if (r < 0) + return table_log_add_error(r); + + (void) table_hide_column_from_display(table, table_get_columns(table) - 3); /* hide source */ + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + (void) table_hide_column_from_display(table, (size_t) 1); /* hide color block column */
+ + (void) table_set_json_field_name(table, phase_column, "phase"); + + FOREACH_ARRAY(rr, el->records, el->n_records) { + EventLogRecord *record = *rr; + + r = table_add_many(table, + TABLE_UINT32, record->pcr, + TABLE_STRING, special_glyph(SPECIAL_GLYPH_FULL_BLOCK), + TABLE_SET_COLOR, color_for_pcr(el, record->pcr), + TABLE_STRING, tpm2_pcr_index_to_string(record->pcr)); + if (r < 0) + return table_log_add_error(r); + /* Event type column: symbolic name if known, raw hex value for unknown firmware event types, empty for records that are neither firmware nor userspace */ + if (EVENT_LOG_RECORD_IS_FIRMWARE(record)) { + const char *et; + + et = tpm2_log_event_type_to_string(record->firmware_event_type); + if (et) + r = table_add_cell(table, NULL, TABLE_STRING, et); + else + r = table_add_cell(table, NULL, TABLE_UINT32_HEX, &record->firmware_event_type); + } else if (EVENT_LOG_RECORD_IS_USERSPACE(record)) + r = table_add_cell(table, NULL, TABLE_STRING, tpm2_userspace_event_type_to_string(record->userspace_event_type)); + else + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + /* Match column: left empty while the payload validity is undetermined or unknown */ + if (record->event_payload_valid < 0 || record->event_payload_valid == EVENT_PAYLOAD_VALID_DONT_KNOW) + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + else + r = table_add_many(table, + TABLE_BOOLEAN_CHECKMARK, record->event_payload_valid == EVENT_PAYLOAD_VALID_YES, + TABLE_SET_COLOR, ansi_highlight_green_red(record->event_payload_valid == EVENT_PAYLOAD_VALID_YES)); + if (r < 0) + return table_log_add_error(r); + /* One digest cell per algorithm, empty if the record has no bank for it */ + FOREACH_ARRAY(alg, el->algorithms, el->n_algorithms) { + EventLogRecordBank *bank; + + bank = event_log_record_find_bank(record, *alg); + if (bank) { + _cleanup_free_ char *hex = NULL; + + hex = hexmem(bank->hash.buffer, bank->hash.size); + if (!hex) + return log_oom(); + + r = table_add_cell(table, NULL, TABLE_STRING, hex); + } else + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_STRING, EVENT_LOG_RECORD_IS_FIRMWARE(record) ? "F" : + EVENT_LOG_RECORD_IS_USERSPACE(record) ? 
"U" : NULL, + TABLE_PATH_BASENAME, record->source, + TABLE_PATH_BASENAME, record->n_mapped > 0 ? record->mapped[0]->component->id : NULL, + TABLE_STRING, record->description); + if (r < 0) + return table_log_add_error(r); + } + + if (ret_variant) { + r = table_to_json(table, ret_variant); + if (r < 0) + return log_error_errno(r, "Failed to format table to JSON: %m"); + + return 0; + } + + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, /* show_header= */true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + + return 0; +} + /* A PCR value that consists solely of 0x00 bytes or solely of 0xFF bytes counts as unset. */ +static bool is_unset_pcr(const void *value, size_t size) { + return memeqzero(value, size) || memeqbyte(0xffu, value, size); +} + /* Returns true if, for each of the el->n_algorithms banks of the register, the calculated value equals the observed one. */ +static bool event_log_pcr_checks_out(const EventLog *el, const EventLogRegister *reg) { + assert(el); + assert(reg); + + for (size_t i = 0; i < el->n_algorithms; i++) + if (memcmp_nn(reg->banks[i].calculated.buffer, reg->banks[i].calculated.size, + reg->banks[i].observed.buffer, reg->banks[i].observed.size) != 0) + return false; + + return true; +} + /* Builds a table with one row per PCR: index, color block, status emoji, name, measurement count, the three check columns h, r and c, and the calculated and observed digest per algorithm. If ret_variant is set the table is converted to JSON and returned there, otherwise it is printed, followed by a legend when JSON output is off. */ +static int show_pcr_table(EventLog *el, JsonVariant **ret_variant) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(el); + /* 8 fixed columns plus a calculated and an observed column per algorithm */ + table = table_new_raw(8 + el->n_algorithms*2); + if (!table) + return log_oom(); + + (void) table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + r = table_add_many(table, + TABLE_HEADER, "pcr", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_HEADER, "", /* color block column */ + TABLE_HEADER, "", /* emoji column */ + TABLE_HEADER, "pcrname", + TABLE_HEADER, "count", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_HEADER, "h", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_HEADER, "r", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_HEADER, "c", + TABLE_SET_ALIGN_PERCENT, 100); + if (r < 0) + return table_log_add_error(r); + + r = add_algorithm_columns(el, table, "Calculated", "calculated"); + if (r < 0) + return r; + + r = add_algorithm_columns(el, table, "Observed", "observed"); + if (r < 0) + return
r; + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + (void) table_hide_column_from_display(table, (size_t) 1, (size_t) 2); /* hide color block and emoji column */ + else if (!emoji_enabled()) + (void) table_hide_column_from_display(table, (size_t) 2); + /* Columns 5, 6 and 7 are the h, r and c headers added above */ + (void) table_set_json_field_name(table, 5, "hashMatchesEventLog"); + (void) table_set_json_field_name(table, 6, "allEventsMatched"); + (void) table_set_json_field_name(table, 7, "noMissingComponents"); + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + /* Check if the PCR hash value matches the event log data */ + bool hash_match = event_log_pcr_checks_out(el, el->registers + pcr); + + /* Whether all records in this PCR have a matching component */ + bool fully_recognized = el->registers[pcr].fully_recognized; + + /* Whether any unmatched components touch this PCR */ + bool missing_components = FLAGS_SET(el->missing_component_pcrs, UINT32_C(1) << pcr); + /* The first failing check in the order hash, records, components decides the smiley */ + const char *emoji = special_glyph( + !hash_match ? SPECIAL_GLYPH_DEPRESSED_SMILEY : + !fully_recognized ? SPECIAL_GLYPH_UNHAPPY_SMILEY : + missing_components ? 
SPECIAL_GLYPH_SLIGHTLY_HAPPY_SMILEY : + SPECIAL_GLYPH_HAPPY_SMILEY); + + r = table_add_many(table, + TABLE_UINT32, pcr, + TABLE_STRING, special_glyph(SPECIAL_GLYPH_FULL_BLOCK), + TABLE_SET_COLOR, color_for_pcr(el, pcr), + TABLE_STRING, emoji, + TABLE_STRING, tpm2_pcr_index_to_string(pcr)); + if (r < 0) + return table_log_add_error(r); + /* Count column: left empty for PCRs without measurements */ + if (el->registers[pcr].n_measurements > 0) + r = table_add_cell(table, NULL, TABLE_UINT, &el->registers[pcr].n_measurements); + else + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + r = table_add_many(table, + TABLE_BOOLEAN_CHECKMARK, hash_match, + TABLE_SET_COLOR, ansi_highlight_green_red(hash_match), + TABLE_BOOLEAN_CHECKMARK, fully_recognized, + TABLE_SET_COLOR, ansi_highlight_green_red(fully_recognized), + TABLE_BOOLEAN_CHECKMARK, !missing_components, + TABLE_SET_COLOR, ansi_highlight_green_red(!missing_components)); + if (r < 0) + return table_log_add_error(r); + /* Calculated digests, shown in grey while the value is unset */ + for (size_t i = 0; i < el->n_algorithms; i++) { + const char *color; + + color = is_unset_pcr(el->registers[pcr].banks[i].calculated.buffer, el->registers[pcr].banks[i].calculated.size) ? ANSI_GREY : NULL; + + if (el->registers[pcr].banks[i].calculated.size > 0) { + _cleanup_free_ char *hex = NULL; + + hex = hexmem(el->registers[pcr].banks[i].calculated.buffer, el->registers[pcr].banks[i].calculated.size); + if (!hex) + return log_oom(); + + r = table_add_many(table, + TABLE_STRING, hex, + TABLE_SET_COLOR, color); + } else + r = table_add_many(table, + TABLE_EMPTY, + TABLE_SET_COLOR, color); + if (r < 0) + return table_log_add_error(r); + } + /* Observed digests, shown in red if the hash check of this PCR failed, else in grey while unset */ + for (size_t i = 0; i < el->n_algorithms; i++) { + _cleanup_free_ char *hex = NULL; + const char *color; + + hex = hexmem(el->registers[pcr].banks[i].observed.buffer, el->registers[pcr].banks[i].observed.size); + if (!hex) + return log_oom(); + + color = !hash_match ?
ANSI_HIGHLIGHT_RED : + is_unset_pcr(el->registers[pcr].banks[i].observed.buffer, el->registers[pcr].banks[i].observed.size) ? ANSI_GREY : NULL; + + r = table_add_many(table, + TABLE_STRING, hex, + TABLE_SET_COLOR, color); + if (r < 0) + return table_log_add_error(r); + } + } + + if (ret_variant) { + r = table_to_json(table, ret_variant); + if (r < 0) + return log_error_errno(r, "Failed to format table to JSON: %m"); + + return 0; + } + + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, /* show_header= */ true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + printf("\n" + "%sLegend: H → PCR hash value matches event log%s\n" + "%s R → All event log records for this PCR have a matching component%s\n" + "%s C → No components that couldn't be matched with log records affect this PCR%s\n", + ansi_grey(), ansi_normal(), /* less on small screens automatically resets the color after long lines, hence we set it anew for each line */ + ansi_grey(), ansi_normal(), + ansi_grey(), ansi_normal()); + + return 0; +} + /* Selects the primary algorithm of the event log: SHA256 if no algorithm is loaded or if SHA256 is among the loaded ones, otherwise the last entry of el->algorithms. Always returns 0. */ +static int event_determine_primary_algorithm(EventLog *el) { + assert(el); + + if (el->n_algorithms == 0) { + /* Nothing loaded to make the decision on? Then pick SHA256 */ + el->primary_algorithm = TPM2_ALG_SHA256; + return 0; + } + + FOREACH_ARRAY(alg, el->algorithms, el->n_algorithms) { + /* If we have SHA256, focus on that */ + + if (*alg == TPM2_ALG_SHA256) { + el->primary_algorithm = *alg; + return 0; + } + } + + /* Otherwise show the "best" (i.e. 
the one with the highest id value) */ + el->primary_algorithm = el->algorithms[el->n_algorithms-1]; + return 0; +} + /* Allocates an event log and runs the whole pipeline on it, in this order: add algorithms from the environment, load the log, read the PCRs, calculate the PCRs, validate the record hashes, determine the primary algorithm, load the components and map them onto the log. On success the event log is handed over to *ret and 0 is returned; on failure the first negative result is returned. */ +static int event_log_load_and_process(EventLog **ret) { + _cleanup_(event_log_freep) EventLog *el = NULL; + int r; + + el = event_log_new(); + if (!el) + return log_oom(); + + r = event_log_add_algorithms_from_environment(el); + if (r < 0) + return r; + + r = event_log_load(el); + if (r < 0) + return r; + + r = event_log_read_pcrs(el); + if (r < 0) + return r; + + r = event_log_calculate_pcrs(el); + if (r < 0) + return r; + + r = event_log_validate_record_hashes(el); + if (r < 0) + return r; + + r = event_determine_primary_algorithm(el); + if (r < 0) + return r; + + r = event_log_load_components(el); + if (r < 0) + return r; + + r = event_log_map_components(el); + if (r < 0) + return r; + + *ret = TAKE_PTR(el); + return 0; +} + /* Verb: shows the event log table followed by the PCR table. In JSON mode both tables are collected and dumped as one object with the fields log and pcrs. */ +static int verb_show_log(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *log_table = NULL, *pcr_table = NULL; + _cleanup_(event_log_freep) EventLog *el = NULL; + bool want_json = !FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF); + int r; + + r = event_log_load_and_process(&el); + if (r < 0) + return r; + + if (!want_json) + putchar('\n'); + + r = show_log_table(el, want_json ? &log_table : NULL); + if (r < 0) + return r; + + if (!want_json) + putchar('\n'); + + r = show_pcr_table(el, want_json ?
&pcr_table : NULL); + if (r < 0) + return r; + + if (want_json) { + _cleanup_(json_variant_unrefp) JsonVariant *object = NULL; + + r = json_build(&object, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_VARIANT("log", log_table), + JSON_BUILD_PAIR_VARIANT("pcrs", pcr_table))); + if (r < 0) + return log_error_errno(r, "Failed to generate combined object: %m"); + + r = json_variant_dump(object, arg_json_format_flags, stdout, /* prefix= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to dump JSON object: %m"); + } + + return 0; +} + /* Verb: loads the event log and dumps it to stdout as a JSON array with one object per record, holding the PCR, a running record number, the digests and, where available, content type and content. No PCR reading or component matching is done here. */ +static int verb_show_cel(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + _cleanup_(event_log_freep) EventLog *el = NULL; + uint64_t recnum = 0; + int r; + + el = event_log_new(); + if (!el) + return log_oom(); + + r = event_log_load(el); + if (r < 0) + return r; + + /* Output the event log in TCG CEL-JSON. */ + + FOREACH_ARRAY(rr, el->records, el->n_records) { + _cleanup_(json_variant_unrefp) JsonVariant *ja = NULL, *fj = NULL; + EventLogRecord *record = *rr; + JsonVariant *cd = NULL; + const char *ct = NULL; + /* Collect one hashAlg and digest pair per bank of the record */ + LIST_FOREACH(banks, bank, record->banks) { + r = json_variant_append_arrayb( + &ja, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("hashAlg", tpm2_hash_alg_to_string(bank->algorithm)), + JSON_BUILD_PAIR_HEX("digest", bank->hash.buffer, bank->hash.size))); + if (r < 0) + return log_error_errno(r, "Failed to append CEL digest entry: %m"); + } + /* A record without banks still gets an (empty) digests array */ + if (!ja) { + r = json_variant_new_array(&ja, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate JSON array: %m"); + } + + if (EVENT_LOG_RECORD_IS_FIRMWARE(record)) { + _cleanup_free_ char *et = NULL; + const char *z; + /* Known event types are spelled EV_ plus the upper-cased name with dashes turned into underscores, unknown ones as decimal number */ + z = tpm2_log_event_type_to_string(record->firmware_event_type); + if (z) { + _cleanup_free_ char *b = NULL; + + b = strreplace(z, "-", "_"); + if (!b) + return log_oom(); + + et = strjoin("EV_", ascii_strupper(b)); + if (!et) + return log_oom(); + } else if (asprintf(&et, "%" PRIu32,
record->firmware_event_type) < 0) + return log_oom(); + + r = json_build(&fj, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("event_type", et), + JSON_BUILD_PAIR_HEX("event_data", record->firmware_payload, record->firmware_payload_size))); + if (r < 0) + return log_error_errno(r, "Failed to build firmware event data: %m"); + + cd = fj; + ct = "pcclient_std"; + } else if (EVENT_LOG_RECORD_IS_USERSPACE(record)) { + cd = record->userspace_content; + ct = "systemd"; + } + /* content_type and content are only emitted if the record is of a known kind and has content */ + r = json_variant_append_arrayb(&array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("pcr", record->pcr), + JSON_BUILD_PAIR_UNSIGNED("recnum", ++recnum), + JSON_BUILD_PAIR_VARIANT("digests", ja), + JSON_BUILD_PAIR_CONDITION(ct, "content_type", JSON_BUILD_STRING(ct)), + JSON_BUILD_PAIR_CONDITION(cd, "content", JSON_BUILD_VARIANT(cd)))); + if (r < 0) + return log_error_errno(r, "Failed to append CEL record: %m"); + } + + if (arg_json_format_flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO)) + pager_open(arg_pager_flags); + /* NOTE(review): the result of json_variant_dump() is not checked here, unlike in verb_show_log() */ + json_variant_dump(array, arg_json_format_flags|JSON_FORMAT_EMPTY_ARRAY, stdout, NULL); + return 0; +} + /* Verb: lists all component variants with component id and file path. When JSON output is off, marker rows are inserted where the configured start and end locations fall within the component list, and a summary line is printed at the end. */ +static int verb_list_components(int argc, char *argv[], void *userdata) { + _cleanup_(event_log_freep) EventLog *el = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + enum { + BEFORE_LOCATION, + BETWEEN_LOCATION, + AFTER_LOCATION, + } loc = BEFORE_LOCATION; + int r; + + el = event_log_new(); + if (!el) + return log_oom(); + + r = event_log_add_algorithms_from_environment(el); + if (r < 0) + return r; + + r = event_determine_primary_algorithm(el); + if (r < 0) + return r; + + r = event_log_load_components(el); + if (r < 0) + return r; + + table = table_new("id", "variants"); + if (!table) + return log_oom(); + + FOREACH_ARRAY(c, el->components, el->n_components) { + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) { + _cleanup_free_ char *marker = NULL; + /* Small state machine over the component list: before the start location, between start and end, after the end */ + switch (loc) { + + case BEFORE_LOCATION: + if (arg_location_end && strcmp((*c)->id, arg_location_end) >= 0)
{ + loc = AFTER_LOCATION; + marker = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), " location '", arg_location_end, "' ", special_glyph(SPECIAL_GLYPH_ARROW_LEFT)); + } else if (arg_location_start && strcmp((*c)->id, arg_location_start) >= 0) { + loc = BETWEEN_LOCATION; + marker = strjoin(special_glyph(SPECIAL_GLYPH_TREE_TOP), " start location '", arg_location_start, "' ", special_glyph(SPECIAL_GLYPH_ARROW_DOWN)); + } + + break; + + case BETWEEN_LOCATION: + if (arg_location_end && strcmp((*c)->id, arg_location_end) >= 0) { + loc = AFTER_LOCATION; + marker = strjoin(special_glyph(SPECIAL_GLYPH_TREE_RIGHT), " end location '", arg_location_end, "' ", special_glyph(SPECIAL_GLYPH_ARROW_UP)); + } + break; + + case AFTER_LOCATION: + break; + } + /* NOTE(review): a failed strjoin() above leaves marker NULL, in which case the marker row is silently omitted */ + if (marker) { + r = table_add_many(table, + TABLE_STRING, marker, + TABLE_SET_COLOR, ANSI_GREY, + TABLE_EMPTY); + if (r < 0) + return table_log_add_error(r); + } + } + + FOREACH_ARRAY(variant, (*c)->variants, (*c)->n_variants) { + r = table_add_many(table, + TABLE_STRING, (*c)->id, + TABLE_PATH, (*variant)->path); + if (r < 0) + return table_log_add_error(r); + } + } + + if (table_get_rows(table) > 1 || !FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) { + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, /* show_header= */ true); + if (r < 0) + return log_error_errno(r, "Failed to output table: %m"); + } + /* NOTE(review): the number printed below is the table row count minus the header, i.e. one per variant plus any marker rows, not the number of components */ + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) { + if (table_get_rows(table) > 1) + printf("\n%zu components listed.\n", table_get_rows(table) - 1); + else + printf("No components defined.\n"); + } + + return 0; +} + /* Checks for every PCR selected in mask that the calculated value matches the observed one. Returns 0 if all of them do, otherwise logs the first mismatching PCR and returns an EBADMSG error. */ +static int event_log_pcr_mask_checks_out(EventLog *el, uint32_t mask) { + assert(el); + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + + if (!FLAGS_SET(mask, UINT32_C(1) << pcr)) + continue; + + if (!event_log_pcr_checks_out(el, el->registers + pcr)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Event log for PCR %" PRIu32 " does not match PCR state, 
refusing.", pcr); + } + + return 0; +} + /* Builds a JSON record object with the fields pcr and digests, where digests holds the hash of the data under every algorithm listed in tpm2_hash_algorithms. A data_size of SIZE_MAX means that data is a NUL terminated string. The result is stored in *ret_record. Returns 0 on success, negative on error. */ +static int make_pcrlock_record( + uint32_t pcr, + const void *data, + size_t data_size, + JsonVariant **ret_record) { + + _cleanup_(json_variant_unrefp) JsonVariant *digests = NULL; + int r; + + assert(data || data_size == 0); + assert(ret_record); + + if (data_size == SIZE_MAX) + data_size = strlen(data); + + /* Generates a .pcrlock record for the given PCR and data/data size. This is a subset of TCG CEL. */ + + FOREACH_ARRAY(pa, tpm2_hash_algorithms, TPM2_N_HASH_ALGORITHMS) { + _cleanup_free_ unsigned char *hash = NULL; + int hash_ssize; + unsigned hash_usize; + const EVP_MD *md; + const char *a; + + assert_se(a = tpm2_hash_alg_to_string(*pa)); + assert_se(md = EVP_get_digestbyname(a)); + hash_ssize = EVP_MD_size(md); + assert_se(hash_ssize > 0); + hash_usize = hash_ssize; + + hash = malloc(hash_usize); + if (!hash) + return log_oom(); + + if (EVP_Digest(data, data_size, hash, &hash_usize, md, NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to hash data with algorithm '%s'.", a); + + r = json_variant_append_arrayb( + &digests, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashAlg", JSON_BUILD_STRING(a)), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(hash, hash_usize)))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON digest object: %m"); + } + + r = json_build(ret_record, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(pcr)), + JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(digests)))); + if (r < 0) + return log_error_errno(r, "Failed to build record object: %m"); + + return 0; +} + /* Picks the .pcrlock path to operate on: arg_pcrlock_path if set, else the given default if arg_pcrlock_auto is on, else NULL. */ +static const char *pcrlock_path(const char *default_pcrlock_path) { + return arg_pcrlock_path ?: arg_pcrlock_auto ?
default_pcrlock_path : NULL; +} + /* Wraps the given records array (an empty array if array is NULL) into an object with the single field records and writes it to the file chosen by pcrlock_path(), or to stdout if that yields no path. Failure to create the parent directories is ignored here and surfaces through the fopen() that follows. Returns 0 on success, negative on error. */ +static int write_pcrlock(JsonVariant *array, const char *default_pcrlock_path) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *a = NULL; + _cleanup_fclose_ FILE *f = NULL; + const char *p; + int r; + + if (!array) { + r = json_variant_new_array(&a, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate empty array: %m"); + + array = a; + } + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("records", JSON_BUILD_VARIANT(array)))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON object: %m"); + + p = pcrlock_path(default_pcrlock_path); + if (p) { + (void) mkdir_parents_label(p, 0755); + + f = fopen(p, "we"); + if (!f) + return log_error_errno(errno, "Failed to open %s for writing: %m", p); + } + + r = json_variant_dump(v, arg_json_format_flags, f ?: stdout, /* prefix= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to output JSON object: %m"); + + if (p) + log_info("%s written.", p); + + return 0; +} + /* Deletes the file chosen by pcrlock_path() and refuses if that yields no path. A file that is already gone is not an error. Afterwards rmdir_parents() is called on the path with /var/lib as second argument, presumably to remove now empty parent directories up to that point; its result is ignored. */ +static int unlink_pcrlock(const char *default_pcrlock_path) { + const char *p; + + p = pcrlock_path(default_pcrlock_path); + if (!p) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No .pcrlock path specified, refusing."); + + if (unlink(p) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to delete %s: %m", p); + + log_info("%s already deleted.", p); + } else + log_info("%s deleted.", p); + + (void) rmdir_parents(p, "/var/lib"); + + return 0; +} + /* Verb: reads data from the file named by the first argument, or from stdin if there is none, and writes a .pcrlock file with one record per PCR selected in arg_pcr_mask, each covering that data. There is no default output path for this verb. */ +static int verb_lock_raw(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + _cleanup_free_ char *data = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size; + int r; + + if (arg_pcr_mask == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No PCR specified, refusing."); + + if (argc >= 2) { + f = fopen(argv[1], "re"); + if (!f) + return log_error_errno(errno, "Failed to open '%s': %m", argv[1]); + } + + r = read_full_stream(f ?: stdin, &data,
&size); + if (r < 0) + return log_error_errno(r, "Failed to read data from stdin: %m"); + + for (uint32_t i = 0; i < TPM2_PCRS_MAX; i++) { + _cleanup_(json_variant_unrefp) JsonVariant *record = NULL; + + if (!FLAGS_SET(arg_pcr_mask, UINT32_C(1) << i)) + continue; + + r = make_pcrlock_record(i, data, size, &record); + if (r < 0) + return r; + + r = json_variant_append_array(&array, record); + if (r < 0) + return log_error_errno(r, "Failed to append to JSON array: %m"); + } + + return write_pcrlock(array, NULL); +} + +static int verb_unlock_simple(int argc, char *argv[], void *userdata) { + return unlink_pcrlock(NULL); +} + +static int verb_lock_secureboot_policy(int argc, char *argv[], void *userdata) { + static const struct { + sd_id128_t id; + const char *name; + int synthesize_empty; /* 0 → fail, > 0 → synthesize empty db, < 0 → skip */ + } variables[] = { + { EFI_VENDOR_GLOBAL, "SecureBoot", 0 }, + { EFI_VENDOR_GLOBAL, "PK", 1 }, + { EFI_VENDOR_GLOBAL, "KEK", 1 }, + { EFI_VENDOR_DATABASE, "db", 1 }, + { EFI_VENDOR_DATABASE, "dbx", 1 }, + { EFI_VENDOR_DATABASE, "dbt", -1 }, + { EFI_VENDOR_DATABASE, "dbr", -1 }, + }; + + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + int r; + + /* Generates expected records from the current SecureBoot state, as readable in the EFI variables + * right now. 
*/ + + FOREACH_ARRAY(vv, variables, ELEMENTSOF(variables)) { + _cleanup_(json_variant_unrefp) JsonVariant *record = NULL; + + _cleanup_free_ char *name = NULL; + if (asprintf(&name, "%s-" SD_ID128_UUID_FORMAT_STR, vv->name, SD_ID128_FORMAT_VAL(vv->id)) < 0) + return log_oom(); + + _cleanup_free_ void *data = NULL; + size_t data_size; + r = efi_get_variable(name, NULL, &data, &data_size); + if (r < 0) { + if (r != -ENOENT || vv->synthesize_empty == 0) + return log_error_errno(r, "Failed to read EFI variable '%s': %m", name); + if (vv->synthesize_empty < 0) + continue; + + /* If the main database variables are not set we don't consider this an error, but + * measure an empty database instead. */ + log_debug("EFI variable %s is not set, synthesizing empty variable for measurement.", name); + data_size = 0; + } + + _cleanup_free_ char16_t* name16 = utf8_to_utf16(vv->name, SIZE_MAX); + if (!name16) + return log_oom(); + size_t name16_bytes = char16_strlen(name16) * 2; + + size_t vdata_size = offsetof(UEFI_VARIABLE_DATA, unicodeName) + name16_bytes + data_size; + _cleanup_free_ UEFI_VARIABLE_DATA *vdata = malloc(vdata_size); + if (!vdata) + return log_oom(); + + *vdata = (UEFI_VARIABLE_DATA) { + .unicodeNameLength = name16_bytes / 2, + .variableDataLength = data_size, + }; + + efi_id128_to_guid(vv->id, vdata->variableName); + memcpy(mempcpy(vdata->unicodeName, name16, name16_bytes), data, data_size); + + r = make_pcrlock_record(TPM2_PCR_SECURE_BOOT_POLICY /* =7 */, vdata, vdata_size, &record); + if (r < 0) + return r; + + r = json_variant_append_array(&array, record); + if (r < 0) + return log_error_errno(r, "Failed to append to JSON array: %m"); + } + + return write_pcrlock(array, PCRLOCK_SECUREBOOT_POLICY_PATH); +} + +static int verb_unlock_secureboot_policy(int argc, char *argv[], void *userdata) { + return unlink_pcrlock(PCRLOCK_SECUREBOOT_POLICY_PATH); +} + +static int event_log_record_is_secureboot_variable(EventLogRecord *rec, sd_id128_t uuid, const char *name) { 
+ _cleanup_free_ char *found_name = NULL; + sd_id128_t found_uuid; + int r; + + assert(rec); + assert(name); + + if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec)) + return false; + + if (rec->pcr != TPM2_PCR_SECURE_BOOT_POLICY) + return false; + + if (rec->event_payload_valid != EVENT_PAYLOAD_VALID_YES) + return false; + + if (rec->firmware_event_type != EV_EFI_VARIABLE_DRIVER_CONFIG) + return false; + + r = event_log_record_parse_variable_data(rec, &found_uuid, &found_name); + if (r == -EBADMSG) + return false; + if (r < 0) + return r; + + if (!sd_id128_equal(found_uuid, uuid)) + return false; + + return streq(found_name, name); +} + +static bool event_log_record_is_secureboot_authority(EventLogRecord *rec) { + assert(rec); + + if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec)) + return false; + + if (rec->pcr != TPM2_PCR_SECURE_BOOT_POLICY) + return false; + + if (rec->event_payload_valid != EVENT_PAYLOAD_VALID_YES) + return false; + + return rec->firmware_event_type == EV_EFI_VARIABLE_AUTHORITY; +} + +static int event_log_ensure_secureboot_consistency(EventLog *el) { + static const struct { + sd_id128_t id; + const char *name; + bool required; + } table[] = { + { EFI_VENDOR_GLOBAL, "SecureBoot", true }, + { EFI_VENDOR_GLOBAL, "PK", true }, + { EFI_VENDOR_GLOBAL, "KEK", true }, + { EFI_VENDOR_DATABASE, "db", true }, + { EFI_VENDOR_DATABASE, "dbx", true }, + { EFI_VENDOR_DATABASE, "dbt", false }, + { EFI_VENDOR_DATABASE, "dbr", false }, + // FIXME: ensure we also find the separator here + }; + + EventLogRecord *records[ELEMENTSOF(table)] = {}; + EventLogRecord *first_authority = NULL; + + assert(el); + + /* Ensures that the PCR 7 records are complete and in order. Before we lock down PCR 7 we want to + * ensure its state is actually consistent. 
*/ + + FOREACH_ARRAY(rr, el->records, el->n_records) { + EventLogRecord *rec = *rr; + size_t found = SIZE_MAX; + + if (event_log_record_is_secureboot_authority(rec)) { + if (first_authority) + continue; + + first_authority = rec; + // FIXME: also check that each authority record's data is also listed in 'db' + continue; + } + + for (size_t i = 0; i < ELEMENTSOF(table); i++) + if (event_log_record_is_secureboot_variable(rec, table[i].id, table[i].name)) { + found = i; + break; + } + if (found == SIZE_MAX) + continue; + + /* Require the authority records always come *after* database measurements */ + if (first_authority) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "SecureBoot authority before variable, refusing."); + + /* Check for duplicates */ + if (records[found]) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Duplicate '%s' record, refusing.", rec->description); + + /* Check for order */ + for (size_t j = found + 1; j < ELEMENTSOF(table); j++) + if (records[j]) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "'%s' record before '%s' record, refusing.", records[j]->description, rec->description); + + records[found] = rec; + } + + /* Check for existence */ + for (size_t i = 0; i < ELEMENTSOF(table); i++) + if (table[i].required && !records[i]) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Required record '%s' not found, refusing.", table[i].name); + + /* At this point we know that all required variables have been measured, in the right order. */ + return 0; +} + +static int verb_lock_secureboot_authority(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + _cleanup_(event_log_freep) EventLog *el = NULL; + int r; + + /* Lock down the EV_EFI_VARIABLE_AUTHORITY records from the existing log. Note that there's not too + * much value in locking this down too much, since it stores only the result of the primary database + * checks, and that's what we should bind policy to. 
Moreover it's hard to predict, since extension + * card firmware validation will result in additional records here. */ + + if (!is_efi_secure_boot()) { + log_info("SecureBoot disabled, not generating authority .pcrlock file."); + return unlink_pcrlock(PCRLOCK_SECUREBOOT_AUTHORITY_PATH); + } + + el = event_log_new(); + if (!el) + return log_oom(); + + r = event_log_add_algorithms_from_environment(el); + if (r < 0) + return r; + + r = event_log_load(el); + if (r < 0) + return r; + + r = event_log_read_pcrs(el); + if (r < 0) + return r; + + r = event_log_calculate_pcrs(el); + if (r < 0) + return r; + + /* Before we base anything on the event log records, let's check that the event log state checks + * out. */ + + r = event_log_pcr_mask_checks_out(el, UINT32_C(1) << TPM2_PCR_SECURE_BOOT_POLICY); + if (r < 0) + return r; + + r = event_log_validate_record_hashes(el); + if (r < 0) + return r; + + r = event_log_ensure_secureboot_consistency(el); + if (r < 0) + return r; + + FOREACH_ARRAY(rr, el->records, el->n_records) { + _cleanup_(json_variant_unrefp) JsonVariant *digests = NULL; + EventLogRecord *rec = *rr; + + if (!event_log_record_is_secureboot_authority(rec)) + continue; + + log_debug("Locking down authority '%s'.", strna(rec->description)); + + LIST_FOREACH(banks, bank, rec->banks) { + r = json_variant_append_arrayb( + &digests, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashAlg", JSON_BUILD_STRING(tpm2_hash_alg_to_string(bank->algorithm))), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(bank->hash.buffer, bank->hash.size)))); + if (r < 0) + return log_error_errno(r, "Failed to build digests array: %m"); + } + + r = json_variant_append_arrayb( + &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(rec->pcr)), + JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(digests)))); + if (r < 0) + return log_error_errno(r, "Failed to build record array: %m"); + } + + return write_pcrlock(array, PCRLOCK_SECUREBOOT_AUTHORITY_PATH); +} + +static int 
verb_unlock_secureboot_authority(int argc, char *argv[], void *userdata) {
        return unlink_pcrlock(PCRLOCK_SECUREBOOT_AUTHORITY_PATH);
}

/* Verb: generates the PCR 5 record for the GPT partition table of a disk and writes it to PCRLOCK_GPT_PATH
 * (unless overridden).
 *
 * The whole-disk block device is looked up from argv[1], or from "/" if no argument is given. The hashed
 * blob consists of the GPT header, a 64bit little-endian count of used entries, and then all partition
 * entries whose type GUID is not all zeroes. */
static int verb_lock_gpt(int argc, char *argv[], void *userdata) {
        _cleanup_(json_variant_unrefp) JsonVariant *array = NULL, *record = NULL;
        _cleanup_(sd_device_unrefp) sd_device *d = NULL;
        uint8_t h[2 * 4096]; /* space for at least two 4K sectors. GPT header should definitely be in here */
        uint64_t start, n_members, member_size;
        _cleanup_close_ int fd = -EBADF;
        const GptHeader *p;
        size_t found = 0;
        ssize_t n;
        int r;

        r = block_device_new_from_path(
                        argc >= 2 ? argv[1] : "/",
                        BLOCK_DEVICE_LOOKUP_WHOLE_DISK|BLOCK_DEVICE_LOOKUP_BACKING|BLOCK_DEVICE_LOOKUP_ORIGINATING,
                        &d);
        if (r < 0)
                return log_error_errno(r, "Failed to determine root block device: %m");

        fd = sd_device_open(d, O_CLOEXEC|O_RDONLY|O_NOCTTY);
        if (fd < 0)
                return log_error_errno(fd, "Failed to open root block device: %m");

        n = pread(fd, &h, sizeof(h), 0);
        if (n < 0)
                return log_error_errno(errno, "Failed to read GPT header of block device: %m");
        if ((size_t) n != sizeof(h))
                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read trying to read GPT header: %m");

        /* Try a couple of sector sizes */
        for (size_t sz = 512; sz <= 4096; sz <<= 1) {
                assert(sizeof(h) >= sz * 2);
                p = (const GptHeader*) (h + sz); /* 2nd sector */

                if (!gpt_header_has_signature(p))
                        continue;

                if (found != 0)
                        return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
                                               "Disk has partition table for multiple sector sizes, refusing.");

                found = sz;
        }

        if (found == 0)
                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Disk does not have GPT partition table, refusing.");

        /* From here on 'found' is the sector size in bytes, and 'p' the header in the second sector */
        p = (const GptHeader*) (h + found);

        /* NOTE(review): only the upper bound of header_size is checked here, while sizeof(GptHeader) bytes
         * are copied into a buffer sized from header_size further down. Presumably
         * gpt_header_has_signature() already rejects headers shorter than sizeof(GptHeader) — confirm. */
        if (le32toh(p->header_size) > found)
                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "GPT header size over long (%" PRIu32 "), refusing.", le32toh(p->header_size));

        start = le64toh(p->partition_entry_lba);
        if (start > UINT64_MAX / found) /* start * found is used as byte offset below */
                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Partition table start offset overflow, refusing.");

        member_size = le32toh(p->size_of_partition_entry);
        if (member_size < sizeof(GptPartitionEntry))
                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Partition entry size too short, refusing.");

        n_members = le32toh(p->number_of_partition_entries);
        uint64_t member_bufsz = n_members * member_size; /* product of two 32bit values, fits into 64bit */
        if (member_bufsz > 1U*1024U*1024U)
                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Partition table size too large, refusing.");

        /* Read whole sectors */
        member_bufsz = ROUND_UP(member_bufsz, found);

        _cleanup_free_ void *members = malloc(member_bufsz);
        if (!members)
                return log_oom();

        n = pread(fd, members, member_bufsz, start * found);
        if (n < 0)
                return log_error_errno(errno, "Failed to read GPT partition table entries: %m");
        if ((size_t) n != member_bufsz)
                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading GPT partition table entries: %m");

        /* Upper bound for the blob; it is shrunk to what was actually used once the entries are copied */
        size_t vdata_size = le32toh(p->header_size) + sizeof(le64_t) + member_size * n_members;
        _cleanup_free_ void *vdata = malloc0(vdata_size);
        if (!vdata)
                return log_oom();

        void *n_measured_entries = mempcpy(vdata, p, sizeof(GptHeader)); /* n_measured_entries is a 64bit value */

        /* The entries follow right after the counter; 'qq' is the write cursor */
        void *qq = (uint8_t*) n_measured_entries + sizeof(le64_t);

        for (uint64_t i = 0; i < n_members; i++) {
                const GptPartitionEntry *entry = (const GptPartitionEntry*) ((const uint8_t*) members + (member_size * i));

                /* Entries with an all-zero type GUID are skipped and not counted */
                if (memeqzero(entry->partition_type_guid, sizeof(entry->partition_type_guid)))
                        continue;

                qq = mempcpy(qq, entry, member_size);
                unaligned_write_le64(n_measured_entries, unaligned_read_le64(n_measured_entries) + 1);
        }

        vdata_size = (uint8_t*) qq - (uint8_t*) vdata;

        r = make_pcrlock_record(TPM2_PCR_BOOT_LOADER_CONFIG /* =5 */, vdata, vdata_size, &record);
        if (r < 0)
                return r;

        r = json_variant_new_array(&array, &record, 1);
        if (r < 0)
                return log_error_errno(r, "Failed to append to JSON array: %m");

        return write_pcrlock(array, PCRLOCK_GPT_PATH);
}

/* Verb: removes the file written by verb_lock_gpt(). */
static int verb_unlock_gpt(int argc, char *argv[], void *userdata) {
        return unlink_pcrlock(PCRLOCK_GPT_PATH);
}

/* Returns true if 'rec' is a firmware EV_SEPARATOR record whose payload is marked valid. */
static bool event_log_record_is_separator(const EventLogRecord *rec) {
        assert(rec);

        /* Recognizes EV_SEPARATOR events */

        if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec))
                return false;

        if (rec->firmware_event_type != EV_SEPARATOR)
                return false;

        return rec->event_payload_valid == EVENT_PAYLOAD_VALID_YES; /* Insist the record is consistent */
}

/* Returns > 0 if 'rec' is the EV_EFI_ACTION record "Calling EFI Application from Boot Option" on the boot
 * loader code PCR, 0 if it is not, and a negative error if the payload cannot be turned into a string. */
static int event_log_record_is_action_calling_efi_app(const EventLogRecord *rec) {
        _cleanup_free_ char *d = NULL;
        int r;

        assert(rec);

        /* Recognizes the special EV_EFI_ACTION that is issued when the firmware passes control to the boot loader. */

        if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec))
                return false;

        if (rec->pcr != TPM2_PCR_BOOT_LOADER_CODE)
                return false;

        if (rec->firmware_event_type != EV_EFI_ACTION)
                return false;

        if (rec->event_payload_valid != EVENT_PAYLOAD_VALID_YES) /* Insist the record is consistent */
                return false;

        r = make_cstring(rec->firmware_payload, rec->firmware_payload_size, MAKE_CSTRING_ALLOW_TRAILING_NUL, &d);
        if (r < 0)
                return r;

        return streq(d, "Calling EFI Application from Boot Option");
}

static void enable_json_sse(void) {
        /* We shall write this to a single output stream?
We have to output two files, hence try to be smart
         * and enable JSON SSE */

        /* Automatic mode without an explicit path: each verb writes to its own default file, nothing to do */
        if (!arg_pcrlock_path && arg_pcrlock_auto)
                return;

        if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_SSE))
                return;

        log_notice("Enabling JSON_SEQ mode, since writing two .pcrlock files to single output.");
        arg_json_format_flags |= JSON_FORMAT_SSE;
}

/* Verb (shared by the "…firmware-code" and "…firmware-config" verbs, distinguished via argv[0]): turns the
 * firmware records of the current event log into two .pcrlock files, an "early" and a "late" one.
 *
 * Records of the PCRs in 'always_mask' are always included. Records of the PCR in 'separator_mask' are
 * included only until its separator record, or the "Calling EFI Application from Boot Option" action, has
 * been seen. The separator and action records themselves are never included. A record goes into the
 * "late" array if the separator for its PCR has been seen already, and into the "early" one otherwise. */
static int verb_lock_firmware(int argc, char *argv[], void *userdata) {
        _cleanup_(json_variant_unrefp) JsonVariant *array_early = NULL, *array_late = NULL;
        _cleanup_(event_log_freep) EventLog *el = NULL;
        uint32_t always_mask, separator_mask, separator_seen_mask = 0, action_seen_mask = 0;
        const char *default_pcrlock_early_path, *default_pcrlock_late_path;
        int r;

        enable_json_sse();

        /* The PCRs we intend to cover. Note that we measure firmware, external *and* boot loader code/config
         * here – but the latter only until the "separator" events are seen, which tell us where transition
         * into OS boot loader happens. This reflects the fact that on some systems the firmware already
         * measures some firmware-supplied apps into PCR 4. (e.g. Thinkpad X1 Gen9) */
        if (endswith(argv[0], "firmware-code")) {
                always_mask = (UINT32_C(1) << TPM2_PCR_PLATFORM_CODE) |   /* → 0 */
                        (UINT32_C(1) << TPM2_PCR_EXTERNAL_CODE);          /* → 2 */

                separator_mask = UINT32_C(1) << TPM2_PCR_BOOT_LOADER_CODE; /* → 4 */

                default_pcrlock_early_path = PCRLOCK_FIRMWARE_CODE_EARLY_PATH;
                default_pcrlock_late_path = PCRLOCK_FIRMWARE_CODE_LATE_PATH;
        } else {
                assert(endswith(argv[0], "firmware-config"));
                always_mask = (UINT32_C(1) << TPM2_PCR_PLATFORM_CONFIG) |   /* → 1 */
                        (UINT32_C(1) << TPM2_PCR_EXTERNAL_CONFIG);          /* → 3 */

                separator_mask = UINT32_C(1) << TPM2_PCR_BOOT_LOADER_CONFIG; /* → 5 */

                default_pcrlock_early_path = PCRLOCK_FIRMWARE_CONFIG_EARLY_PATH;
                default_pcrlock_late_path = PCRLOCK_FIRMWARE_CONFIG_LATE_PATH;
        }

        el = event_log_new();
        if (!el)
                return log_oom();

        r = event_log_add_algorithms_from_environment(el);
        if (r < 0)
                return r;

        r = event_log_load(el);
        if (r < 0)
                return r;

        r = event_log_read_pcrs(el);
        if (r < 0)
                return r;

        r = event_log_calculate_pcrs(el);
        if (r < 0)
                return r;

        r = event_log_validate_record_hashes(el);
        if (r < 0)
                return r;

        /* Before we base anything on the event log records for any of the selected PCRs, let's check that
         * the event log state checks out for them. */

        r = event_log_pcr_mask_checks_out(el, always_mask|separator_mask);
        if (r < 0)
                return r;

        // FIXME: before doing this, validate ahead-of-time that EV_SEPARATOR records exist for all entries,
        //        and exactly once

        FOREACH_ARRAY(rr, el->records, el->n_records) {
                _cleanup_(json_variant_unrefp) JsonVariant *digests = NULL;
                EventLogRecord *rec = *rr;
                uint32_t bit = UINT32_C(1) << rec->pcr; /* NOTE(review): assumes rec->pcr < 32 — confirm */

                if (!EVENT_LOG_RECORD_IS_FIRMWARE(rec))
                        continue;

                /* Skip PCRs we do not cover at all, as well as the separator PCR once we are past its
                 * separator or "calling EFI app" action */
                if (!FLAGS_SET(always_mask, bit) &&
                    !(FLAGS_SET(separator_mask, bit) && !FLAGS_SET(separator_seen_mask|action_seen_mask, bit)))
                        continue;

                /* If we hit the separator record, we stop processing the PCRs listed in `separator_mask` */
                if (event_log_record_is_separator(rec)) {
                        separator_seen_mask |= bit;
                        continue;
                }

                /* If we hit the special "Calling EFI Application from Boot Option" action we treat this the
                 * same as a separator here, as that's where firmware passes control to boot loader. Note
                 * that some EFI implementations forget to generate one of them. */
                r = event_log_record_is_action_calling_efi_app(rec);
                if (r < 0)
                        return log_error_errno(r, "Failed to check if event is 'Calling EFI Application from Boot Option' action: %m");
                if (r > 0) {
                        action_seen_mask |= bit;
                        continue;
                }

                LIST_FOREACH(banks, bank, rec->banks) {
                        r = json_variant_append_arrayb(
                                        &digests,
                                        JSON_BUILD_OBJECT(
                                                        JSON_BUILD_PAIR("hashAlg", JSON_BUILD_STRING(tpm2_hash_alg_to_string(bank->algorithm))),
                                                        JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(bank->hash.buffer, bank->hash.size))));
                        if (r < 0)
                                return log_error_errno(r, "Failed to build digests array: %m");
                }

                r = json_variant_append_arrayb(
                                FLAGS_SET(separator_seen_mask, bit) ? &array_late : &array_early,
                                JSON_BUILD_OBJECT(
                                                JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(rec->pcr)),
                                                JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(digests))));
                if (r < 0)
                        return log_error_errno(r, "Failed to build record array: %m");
        }

        r = write_pcrlock(array_early, default_pcrlock_early_path);
        if (r < 0)
                return r;

        return write_pcrlock(array_late, default_pcrlock_late_path);
}

/* Verb: removes the early and late files written by verb_lock_firmware(); which pair is selected depends
 * on argv[0], as in the lock verb. */
static int verb_unlock_firmware(int argc, char *argv[], void *userdata) {
        const char *default_pcrlock_early_path, *default_pcrlock_late_path;
        int r;

        if (endswith(argv[0], "firmware-code")) {
                default_pcrlock_early_path = PCRLOCK_FIRMWARE_CODE_EARLY_PATH;
                default_pcrlock_late_path = PCRLOCK_FIRMWARE_CODE_LATE_PATH;
        } else {
                default_pcrlock_early_path = PCRLOCK_FIRMWARE_CONFIG_EARLY_PATH;
                default_pcrlock_late_path = PCRLOCK_FIRMWARE_CONFIG_LATE_PATH;
        }

        r = unlink_pcrlock(default_pcrlock_early_path);
        if (r < 0)
                return r;

        if (arg_pcrlock_path) /* if the path is specified don't delete the same thing twice */
                return 0;

        r = unlink_pcrlock(default_pcrlock_late_path);
        if (r < 0)
                return r;

        return 0;
}

/* Verb: generates the PCR 15 record for the word returned by pcrextend_machine_id_word() and writes it to
 * PCRLOCK_MACHINE_ID_PATH (unless overridden). */
static int verb_lock_machine_id(int argc, char *argv[], void *userdata) {
        _cleanup_(json_variant_unrefp) JsonVariant *record = NULL, *array = NULL;
        _cleanup_free_ char *word = NULL;
        int r;

        r = pcrextend_machine_id_word(&word);
        if (r < 0)
                return r;

        /* SIZE_MAX: hash the string up to, but not including, its trailing NUL */
        r = make_pcrlock_record(TPM2_PCR_SYSTEM_IDENTITY /* = 15 */, word, SIZE_MAX, &record);
        if (r < 0)
                return r;

        r = json_variant_new_array(&array, &record, 1);
        if (r < 0)
                return log_error_errno(r, "Failed to create record array: %m");

        return write_pcrlock(array, PCRLOCK_MACHINE_ID_PATH);
}

/* Verb: removes the file written by verb_lock_machine_id(). */
static int verb_unlock_machine_id(int argc, char *argv[], void *userdata) {
        return unlink_pcrlock(PCRLOCK_MACHINE_ID_PATH);
}

/* Maps a normalized absolute file system path to the default .pcrlock file name used for it and returns
 * it as a newly allocated string in 'ret'. */
static int pcrlock_file_system_path(const char *normalized_path, char **ret) {
        _cleanup_free_ char *s = NULL;

assert(normalized_path); + + if (path_equal(normalized_path, "/")) + s = strdup(PCRLOCK_ROOT_FILE_SYSTEM_PATH); + else { + /* We reuse the escaping we use for turning paths into unit names */ + _cleanup_free_ char *escaped = NULL; + + assert(normalized_path[0] == '/'); + assert(normalized_path[1] != '/'); + + escaped = unit_name_escape(normalized_path + 1); + if (!escaped) + return log_oom(); + + s = strjoin(PCRLOCK_FILE_SYSTEM_PATH_PREFIX, escaped, ".pcrlock"); + } + if (!s) + return log_oom(); + + *ret = TAKE_PTR(s); + return 0; +} + +static int verb_lock_file_system(int argc, char *argv[], void *userdata) { + const char* paths[3] = {}; + int r; + + if (argc > 1) + paths[0] = argv[1]; + else { + dev_t a, b; + paths[0] = "/"; + + r = get_block_device("/", &a); + if (r < 0) + return log_error_errno(r, "Failed to get device of root file system: %m"); + + r = get_block_device("/var", &b); + if (r < 0) + return log_error_errno(r, "Failed to get device of /var/ file system: %m"); + + /* if backing device is distinct, then measure /var/ too */ + if (a != b) + paths[1] = "/var"; + + enable_json_sse(); + } + + STRV_FOREACH(p, paths) { + _cleanup_free_ char *word = NULL, *normalized_path = NULL, *pcrlock_file = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *record = NULL, *array = NULL; + + r = pcrextend_file_system_word(*p, &word, &normalized_path); + if (r < 0) + return r; + + r = pcrlock_file_system_path(normalized_path, &pcrlock_file); + if (r < 0) + return r; + + r = make_pcrlock_record(TPM2_PCR_SYSTEM_IDENTITY /* = 15 */, word, SIZE_MAX, &record); + if (r < 0) + return r; + + r = json_variant_new_array(&array, &record, 1); + if (r < 0) + return log_error_errno(r, "Failed to create record array: %m"); + + r = write_pcrlock(array, pcrlock_file); + if (r < 0) + return r; + } + + return 0; +} + +static int verb_unlock_file_system(int argc, char *argv[], void *userdata) { + const char* paths[3] = {}; + int r; + + if (argc > 1) + paths[0] = argv[1]; + else { + 
paths[0] = "/"; + paths[1] = "/var"; + } + + STRV_FOREACH(p, paths) { + _cleanup_free_ char *normalized_path = NULL, *pcrlock_file = NULL; + + r = chase(*p, NULL, 0, &normalized_path, NULL); + if (r < 0) + return log_error_errno(r, "Failed to normal path '%s': %m", argv[1]); + + r = pcrlock_file_system_path(normalized_path, &pcrlock_file); + if (r < 0) + return r; + + r = unlink_pcrlock(pcrlock_file); + if (r < 0) + return r; + } + + return 0; +} + +static int verb_lock_pe(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + // FIXME: Maybe also generate a matching EV_EFI_VARIABLE_AUTHORITY records here for each signature that + // covers this PE plus its hash, as alternatives under the same component name + + if (argc >= 2) { + fd = open(argv[1], O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", argv[1]); + } + + if (arg_pcr_mask == 0) + arg_pcr_mask = UINT32_C(1) << TPM2_PCR_BOOT_LOADER_CODE; + + for (uint32_t i = 0; i < TPM2_PCRS_MAX; i++) { + _cleanup_(json_variant_unrefp) JsonVariant *digests = NULL; + + if (!FLAGS_SET(arg_pcr_mask, UINT32_C(1) << i)) + continue; + + FOREACH_ARRAY(pa, tpm2_hash_algorithms, TPM2_N_HASH_ALGORITHMS) { + _cleanup_free_ void *hash = NULL; + size_t hash_size; + const EVP_MD *md; + const char *a; + + assert_se(a = tpm2_hash_alg_to_string(*pa)); + assert_se(md = EVP_get_digestbyname(a)); + + r = pe_hash(fd < 0 ? 
STDIN_FILENO : fd, md, &hash, &hash_size); + if (r < 0) + return log_error_errno(r, "Failed to hash PE binary: %m"); + + r = json_variant_append_arrayb(&digests, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashAlg", JSON_BUILD_STRING(a)), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(hash, hash_size)))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON digest object: %m"); + } + + r = json_variant_append_arrayb( + &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(i)), + JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(digests)))); + if (r < 0) + return log_error_errno(r, "Failed to append record object: %m"); + } + + return write_pcrlock(array, NULL); +} + +typedef void* SectionHashArray[_UNIFIED_SECTION_MAX * TPM2_N_HASH_ALGORITHMS]; + +static void section_hashes_array_done(SectionHashArray *array) { + assert(array); + + for (size_t i = 0; i < _UNIFIED_SECTION_MAX * TPM2_N_HASH_ALGORITHMS; i++) + free((*array)[i]); +} + +static int verb_lock_uki(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL, *pe_digests = NULL; + _cleanup_(section_hashes_array_done) SectionHashArray section_hashes = {}; + size_t hash_sizes[TPM2_N_HASH_ALGORITHMS]; + _cleanup_close_ int fd = -EBADF; + int r; + + if (arg_pcr_mask != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "PCR not configurable for UKI lock down."); + + if (argc >= 2) { + fd = open(argv[1], O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", argv[1]); + } + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) { + _cleanup_free_ void *peh = NULL; + const EVP_MD *md; + const char *a; + + assert_se(a = tpm2_hash_alg_to_string(tpm2_hash_algorithms[i])); + assert_se(md = EVP_get_digestbyname(a)); + + r = pe_hash(fd < 0 ? 
STDIN_FILENO : fd, md, &peh, hash_sizes + i); + if (r < 0) + return log_error_errno(r, "Failed to hash PE binary: %m"); + + r = json_variant_append_arrayb( + &pe_digests, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashAlg", JSON_BUILD_STRING(a)), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(peh, hash_sizes[i])))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON digest object: %m"); + + r = uki_hash(fd < 0 ? STDIN_FILENO : fd, md, section_hashes + (i * _UNIFIED_SECTION_MAX), hash_sizes + i); + if (r < 0) + return log_error_errno(r, "Failed to UKI hash PE binary: %m"); + } + + r = json_variant_append_arrayb( + &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(TPM2_PCR_BOOT_LOADER_CODE)), + JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(pe_digests)))); + if (r < 0) + return log_error_errno(r, "Failed to append record object: %m"); + + for (UnifiedSection section = 0; section < _UNIFIED_SECTION_MAX; section++) { + _cleanup_(json_variant_unrefp) JsonVariant *section_digests = NULL, *record = NULL; + + if (!unified_section_measure(section)) + continue; + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) { + const char *a; + void *hash; + + hash = section_hashes[i * _UNIFIED_SECTION_MAX + section]; + if (!hash) + continue; + + assert_se(a = tpm2_hash_alg_to_string(tpm2_hash_algorithms[i])); + + r = json_variant_append_arrayb( + §ion_digests, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashAlg", JSON_BUILD_STRING(a)), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(hash, hash_sizes[i])))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON digest object: %m"); + } + + if (!section_digests) + continue; + + /* So we have digests for this section, hence generate a record for the section name first. 
*/ + r = make_pcrlock_record(TPM2_PCR_KERNEL_BOOT /* =11 */, unified_sections[section], strlen(unified_sections[section]) + 1, &record); + if (r < 0) + return r; + + r = json_variant_append_array(&array, record); + if (r < 0) + return log_error_errno(r, "Failed to append JSON record array: %m"); + + /* And then append a record for the section contents digests as well */ + r = json_variant_append_arrayb( + &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(TPM2_PCR_KERNEL_BOOT /* =11 */)), + JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(section_digests)))); + if (r < 0) + return log_error_errno(r, "Failed to append record object: %m"); + } + + return write_pcrlock(array, NULL); +} + +static int event_log_reduce_to_safe_pcrs(EventLog *el, uint32_t *pcrs) { + _cleanup_free_ char *dropped = NULL, *kept = NULL; + + assert(el); + assert(pcrs); + + /* When we compile a new PCR policy we don't want to bind to PCRs which are fishy for one of three + * reasons: + * + * 1. The PCR value doesn't match the event log + * 2. The event log for the PCR contains measurements we don't know responsible components for + * 3. The event log for the PCR does not contain measurements for components we know + * + * This function checks for the three conditions and drops the PCR from the mask. + */ + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + + if (!FLAGS_SET(*pcrs, UINT32_C(1) << pcr)) + continue; + + if (!event_log_pcr_checks_out(el, el->registers + pcr)) { + log_notice("PCR %" PRIu32 " (%s) value does not match event log. Removing from set of PCRs.", pcr, strna(tpm2_pcr_index_to_string(pcr))); + goto drop; + } + + if (!el->registers[pcr].fully_recognized) { + log_notice("PCR %" PRIu32 " (%s) event log contains unrecognized measurements. 
Removing from set of PCRs.", pcr, strna(tpm2_pcr_index_to_string(pcr))); + goto drop; + } + + if (FLAGS_SET(el->missing_component_pcrs, UINT32_C(1) << pcr)) { + log_notice("PCR %" PRIu32 " (%s) is touched by component we can't find in event log. Removing from set of PCRs.", pcr, strna(tpm2_pcr_index_to_string(pcr))); + goto drop; + } + + log_info("PCR %" PRIu32 " (%s) matches event log and fully consists of recognized measurements. Including in set of PCRs.", pcr, strna(tpm2_pcr_index_to_string(pcr))); + + if (strextendf_with_separator(&kept, ", ", "%" PRIu32 " (%s)", pcr, tpm2_pcr_index_to_string(pcr)) < 0) + return log_oom(); + + continue; + + drop: + *pcrs &= ~(UINT32_C(1) << pcr); + + if (strextendf_with_separator(&dropped, ", ", "%" PRIu32 " (%s)", pcr, tpm2_pcr_index_to_string(pcr)) < 0) + return log_oom(); + } + + if (dropped) + log_notice("PCRs dropped from protection mask: %s", dropped); + else + log_debug("No PCRs dropped from protection mask."); + + if (kept) + log_notice("PCRs in protection mask: %s", kept); + else + log_notice("No PCRs kept in protection mask."); + + return 0; +} + +static int verb_lock_kernel_cmdline(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *record = NULL, *array = NULL; + _cleanup_free_ char *cmdline = NULL; + int r; + + if (argc > 1) { + if (empty_or_dash(argv[1])) + r = read_full_stream(stdin, &cmdline, NULL); + else + r = read_full_file(argv[1], &cmdline, NULL); + } else + r = proc_cmdline(&cmdline); + if (r < 0) + return log_error_errno(r, "Failed to read cmdline: %m"); + + delete_trailing_chars(cmdline, "\n"); + + _cleanup_free_ char16_t *u = NULL; + u = utf8_to_utf16(cmdline, SIZE_MAX); + if (!u) + return log_oom(); + + r = make_pcrlock_record(TPM2_PCR_KERNEL_INITRD /* = 9 */, u, char16_strlen(u)*2+2, &record); + if (r < 0) + return r; + + r = json_variant_new_array(&array, &record, 1); + if (r < 0) + return log_error_errno(r, "Failed to create record array: %m"); + + r = 
write_pcrlock(array, PCRLOCK_KERNEL_CMDLINE_PATH); + if (r < 0) + return r; + + return 0; +} + +static int verb_unlock_kernel_cmdline(int argc, char *argv[], void *userdata) { + return unlink_pcrlock(PCRLOCK_KERNEL_CMDLINE_PATH); +} + +static int verb_lock_kernel_initrd(int argc, char *argv[], void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *record = NULL, *array = NULL; + _cleanup_free_ void *data = NULL; + _cleanup_fclose_ FILE *f = NULL; + size_t size; + int r; + + if (argc >= 2) { + f = fopen(argv[1], "re"); + if (!f) + return log_error_errno(errno, "Failed to open '%s': %m", argv[1]); + } + + r = read_full_stream(f ?: stdin, (char**) &data, &size); + if (r < 0) + return log_error_errno(r, "Failed to read data from stdin: %m"); + + r = make_pcrlock_record(TPM2_PCR_KERNEL_INITRD /* = 9 */, data, size, &record); + if (r < 0) + return r; + + r = json_variant_new_array(&array, &record, 1); + if (r < 0) + return log_error_errno(r, "Failed to create record array: %m"); + + r = write_pcrlock(array, PCRLOCK_KERNEL_INITRD_PATH); + if (r < 0) + return r; + + return 0; +} + +static int verb_unlock_kernel_initrd(int argc, char *argv[], void *userdata) { + return unlink_pcrlock(PCRLOCK_KERNEL_INITRD_PATH); +} + +static int pcr_prediction_add_result( + Tpm2PCRPrediction *context, + Tpm2PCRPredictionResult *result, + uint32_t pcr, + const char *path, + size_t offset) { + + _cleanup_free_ Tpm2PCRPredictionResult *copy = NULL; + int r; + + assert(context); + assert(result); + + copy = newdup(Tpm2PCRPredictionResult, result, 1); + if (!copy) + return log_oom(); + + r = ordered_set_ensure_put(context->results + pcr, &tpm2_pcr_prediction_result_hash_ops, copy); + if (r == -EEXIST) /* Multiple identical results for the same PCR are totally expected */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to insert result into set: %m"); + + log_debug("Added prediction result %u for PCR %" PRIu32 " (path: %s)", ordered_set_size(context->results[pcr]), pcr, 
strempty(path)); + + TAKE_PTR(copy); + return 0; +} + +static const EVP_MD* evp_from_tpm2_alg(uint16_t alg) { + const char *name; + + name = tpm2_hash_alg_to_string(alg); + if (!name) + return NULL; + + return EVP_get_digestbyname(name); +} + +static int event_log_component_variant_calculate( + Tpm2PCRPrediction *context, + Tpm2PCRPredictionResult *result, + EventLogComponent *component, + EventLogComponentVariant *variant, + uint32_t pcr, + const char *path) { + + int r; + + assert(context); + assert(result); + assert(component); + assert(variant); + + FOREACH_ARRAY(rr, variant->records, variant->n_records) { + EventLogRecord *rec = *rr; + + if (rec->pcr != pcr) + continue; + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) { + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *md_ctx = NULL; + EventLogRecordBank *b; + + if (result->hash[i].size <= 0) /* already invalidated */ + continue; + + b = event_log_record_find_bank(rec, tpm2_hash_algorithms[i]); + if (!b) { + /* Can't calculate, hence invalidate */ + result->hash[i] = (TPM2B_DIGEST) {}; + continue; + } + + md_ctx = EVP_MD_CTX_new(); + if (!md_ctx) + return log_oom(); + + const EVP_MD *md = ASSERT_PTR(evp_from_tpm2_alg(tpm2_hash_algorithms[i])); + + int sz = EVP_MD_size(md); + assert(sz > 0); + assert((size_t) sz <= sizeof_field(TPM2B_DIGEST, buffer)); + + assert(sz == tpm2_hash_alg_to_size(tpm2_hash_algorithms[i])); + + assert(result->hash[i].size == (size_t) sz); + assert(b->hash.size == (size_t) sz); + + if (EVP_DigestInit_ex(md_ctx, md, NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to initialize message digest."); + + if (EVP_DigestUpdate(md_ctx, result->hash[i].buffer, sz) != 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to hash bank value."); + + if (EVP_DigestUpdate(md_ctx, b->hash.buffer, sz) != 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to hash data value."); + + unsigned l = (unsigned) sz; + if 
(EVP_DigestFinal_ex(md_ctx, result->hash[i].buffer, &l) != 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to finalize message digest."); + + assert(l == (unsigned) sz); + } + + /* This is a valid result once we hit the start location */ + if (arg_location_start && strcmp(component->id, arg_location_start) >= 0) { + r = pcr_prediction_add_result(context, result, pcr, path, rr - variant->records); + if (r < 0) + return r; + } + } + + return 0; +} + +static int event_log_predict_pcrs( + EventLog *el, + Tpm2PCRPrediction *context, + Tpm2PCRPredictionResult *parent_result, + size_t component_index, + uint32_t pcr, + const char *path) { + + EventLogComponent *component; + int count = 0, r; + + assert(el); + assert(context); + assert(parent_result); + + /* Check if we reached the end of the components, generate a result, and backtrack */ + if (component_index >= el->n_components || + (arg_location_end && strcmp(el->components[component_index]->id, arg_location_end) > 0)) { + r = pcr_prediction_add_result(context, parent_result, pcr, path, /* offset= */ 0); + if (r < 0) + return r; + + return 1; + } + + component = ASSERT_PTR(el->components[component_index]); + + FOREACH_ARRAY(ii, component->variants, component->n_variants) { + _cleanup_free_ Tpm2PCRPredictionResult *result = NULL; + EventLogComponentVariant *variant = *ii; + _cleanup_free_ char *subpath = NULL; + + /* Operate on a copy of the result */ + + if (path) + subpath = strjoin(path, ":", component->id); + else + subpath = strdup(component->id); + if (!subpath) + return log_oom(); + + if (!streq(component->id, variant->id)) + if (!strextend(&subpath, "@", variant->id)) + return log_oom(); + + result = newdup(Tpm2PCRPredictionResult, parent_result, 1); + if (!result) + return log_oom(); + + r = event_log_component_variant_calculate( + context, + result, + component, + variant, + pcr, + subpath); + if (r < 0) + return r; + + r = event_log_predict_pcrs( + el, + context, + result, + 
component_index + 1, /* Next component */ + pcr, + subpath); + if (r < 0) + return r; + + count += r; + } + + return count; +} + +static ssize_t event_log_calculate_component_combinations(EventLog *el) { + ssize_t count = 1; + assert(el); + + FOREACH_ARRAY(cc, el->components, el->n_components) { + EventLogComponent *c = *cc; + + /* Overflow check */ + if (c->n_variants > (size_t) (SSIZE_MAX/count)) + return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "Too many component combinations."); + + count *= c->n_variants; + } + + return count; +} + +static int event_log_show_predictions(Tpm2PCRPrediction *context, uint16_t alg) { + int r; + + assert(context); + + pager_open(arg_pager_flags); + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) { + _cleanup_(json_variant_unrefp) JsonVariant *j = NULL; + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) { + _cleanup_(json_variant_unrefp) JsonVariant *aj = NULL; + + r = tpm2_pcr_prediction_to_json( + context, + tpm2_hash_algorithms[i], + &aj); + if (r < 0) + return r; + + if (json_variant_elements(aj) == 0) + continue; + + r = json_variant_set_field( + &j, + tpm2_hash_alg_to_string(tpm2_hash_algorithms[i]), + aj); + if (r < 0) + return log_error_errno(r, "Failed to add prediction bank to object: %m"); + } + + if (!j) { + r = json_variant_new_object(&j, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocated empty object: %m"); + } + + json_variant_dump(j, arg_json_format_flags, /* f= */ NULL, /* prefix= */ NULL); + return 0; + } + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + Tpm2PCRPredictionResult *result; + if (!FLAGS_SET(context->pcrs, UINT32_C(1) << pcr)) + continue; + + if (ordered_set_isempty(context->results[pcr])) { + printf("No results for PCR %u (%s).\n", pcr, tpm2_pcr_index_to_string(pcr)); + continue; + } + + printf("%sResults for PCR %u (%s):%s\n", ansi_underline(), pcr, tpm2_pcr_index_to_string(pcr), ansi_normal()); + + ORDERED_SET_FOREACH(result, context->results[pcr]) { + + 
_cleanup_free_ char *aa = NULL, *h = NULL; + const char *a; + + TPM2B_DIGEST *hash = tpm2_pcr_prediction_result_get_hash(result, alg); + if (!hash) + continue; + + a = ASSERT_PTR(tpm2_hash_alg_to_string(alg)); + aa = strdup(a); + if (!aa) + return log_oom(); + + ascii_strlower(aa); + + h = hexmem(hash->buffer, hash->size); + if (!h) + return log_oom(); + + printf(" %s%-6s:%s %s\n", ansi_grey(), aa, ansi_normal(), h); + } + } + + return 0; +} + +static int tpm2_pcr_prediction_run( + EventLog *el, + Tpm2PCRPrediction *context) { + + int r; + + assert(el); + assert(context); + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + _cleanup_free_ Tpm2PCRPredictionResult *result = NULL; + + if (!FLAGS_SET(context->pcrs, UINT32_C(1) << pcr)) + continue; + + result = new0(Tpm2PCRPredictionResult, 1); + if (!result) + return log_oom(); + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) + event_log_initial_pcr_state(el, pcr, tpm2_hash_alg_to_size(tpm2_hash_algorithms[i]), result->hash + i); + + r = event_log_predict_pcrs( + el, + context, + result, + /* component_index= */ 0, + pcr, + /* path= */ NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int verb_predict(int argc, char *argv[], void *userdata) { + _cleanup_(tpm2_pcr_prediction_done) Tpm2PCRPrediction context = { + arg_pcr_mask != 0 ? 
arg_pcr_mask : DEFAULT_PCR_MASK, + }; + _cleanup_(event_log_freep) EventLog *el = NULL; + ssize_t count; + int r; + + r = event_log_load_and_process(&el); + if (r < 0) + return r; + + count = event_log_calculate_component_combinations(el); + if (count < 0) + return count; + + log_info("%zi combinations of components.", count); + + r = event_log_reduce_to_safe_pcrs(el, &context.pcrs); + if (r < 0) + return r; + + r = tpm2_pcr_prediction_run(el, &context); + if (r < 0) + return r; + + return event_log_show_predictions(&context, el->primary_algorithm); +} + +static int remove_policy_file(const char *path) { + assert(path); + + if (unlink(path) < 0) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to remove policy file '%s': %m", path); + } + + log_info("Removed policy file '%s'.", path); + return 1; +} + +static int verb_make_policy(int argc, char *argv[], void *userdata) { + int r; + + /* Here's how this all works: after predicting all possible PCR values for next boot (with + * alternatives) we'll calculate a policy from it as a combination of PolicyPCR + PolicyOR + * expressions. This is then stored in an NV index. When a component of the boot process is changed a + * new prediction is made and the NV index updated (which automatically invalidates any older + * policies). + * + * Whenever we want to lock an encrypted object (for example FDE) against this policy, we'll use a + * PolicyAuthorizeNV epxression that pins the NV index in the policy, and permits access to any + * policies matching the current NV index contents. + * + * We grant world-readable read access to the NV index. Write access is controlled by a PIN (which we + * either generate locally or which the user can provide us with) which can also be used for + * recovery. This PIN is sealed to the TPM and is locked via PolicyAuthorizeNV to the NV index it + * protects (i.e. we dogfood 🌭 🐶 hard here). This means in order to update such a policy we need + * the policy to pass. 
+ * + * Information about the used NV Index, the SRK of the TPM, the sealed PIN and the current PCR + * prediction data are stored in a JSON file in /var/lib/. In order to be able to unlock root disks + * this data must be also copied to the ESP so that it is available to the initrd. The data is not + * sensitive, as SRK and NV index are pinned by it, and the prediction data must match the NV index + * to be useful. */ + + usec_t start_usec = now(CLOCK_MONOTONIC); + + _cleanup_(event_log_freep) EventLog *el = NULL; + r = event_log_load_and_process(&el); + if (r < 0) + return r; + + _cleanup_(tpm2_pcr_prediction_done) Tpm2PCRPrediction new_prediction = { + arg_pcr_mask != 0 ? arg_pcr_mask : DEFAULT_PCR_MASK, + }; + r = event_log_reduce_to_safe_pcrs(el, &new_prediction.pcrs); + if (r < 0) + return r; + + usec_t predict_start_usec = now(CLOCK_MONOTONIC); + + r = tpm2_pcr_prediction_run(el, &new_prediction); + if (r < 0) + return r; + + log_info("Predicted future PCRs in %s.", FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), predict_start_usec), 1)); + + _cleanup_(json_variant_unrefp) JsonVariant *new_prediction_json = NULL; + r = tpm2_pcr_prediction_to_json(&new_prediction, el->primary_algorithm, &new_prediction_json); + if (r < 0) + return r; + + if (DEBUG_LOGGING) + (void) json_variant_dump(new_prediction_json, JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO, stderr, NULL); + + _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy old_policy = {}; + + r = tpm2_pcrlock_policy_load(arg_pcrlock_path, &old_policy); + if (r < 0) + return r; + + bool have_old_policy = r > 0; + + /* When we update the policy the old serializations for NV, SRK, PIN remain the same */ + _cleanup_(iovec_done) struct iovec + nv_blob = TAKE_STRUCT(old_policy.nv_handle), + nv_public_blob = TAKE_STRUCT(old_policy.nv_public), + srk_blob = TAKE_STRUCT(old_policy.srk_handle), + pin_public = TAKE_STRUCT(old_policy.pin_public), + pin_private = TAKE_STRUCT(old_policy.pin_private); + + if 
(have_old_policy) { + if (arg_nv_index != 0 && old_policy.nv_index != arg_nv_index) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Stored policy references different NV index (0x%x) than specified (0x%x), refusing.", old_policy.nv_index, arg_nv_index); + + if (!arg_force && + old_policy.algorithm == el->primary_algorithm && + tpm2_pcr_prediction_equal(&old_policy.prediction, &new_prediction, el->primary_algorithm)) { + log_info("Prediction is identical to current policy, skipping update."); + return EXIT_SUCCESS; + } + } + + _cleanup_(tpm2_context_unrefp) Tpm2Context *tc = NULL; + r = tpm2_context_new(NULL, &tc); + if (r < 0) + return log_error_errno(r, "Failed to allocate TPM2 context: %m"); + + if (!tpm2_supports_command(tc, TPM2_CC_PolicyAuthorizeNV)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 does not support PolicyAuthorizeNV command, refusing."); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *srk_handle = NULL; + + if (iovec_is_set(&srk_blob)) { + r = tpm2_deserialize( + tc, + srk_blob.iov_base, + srk_blob.iov_len, + &srk_handle); + if (r < 0) + return log_error_errno(r, "Failed to deserialize SRK TR: %m"); + } else { + r = tpm2_get_or_create_srk( + tc, + /* session= */ NULL, + /* ret_public= */ NULL, + /* ret_name= */ NULL, + /* ret_qname= */ NULL, + &srk_handle); + if (r < 0) + return log_error_errno(r, "Failed to install SRK: %m"); + } + + _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL; + r = tpm2_make_encryption_session( + tc, + srk_handle, + /* bind_key= */ &TPM2_HANDLE_NONE, + &encryption_session); + if (r < 0) + return log_error_errno(r, "Failed to allocate encryption session: %m"); + + /* Acquire a recovery PIN, either from the user, or create a randomized one */ + _cleanup_(erase_and_freep) char *pin = NULL; + if (arg_recovery_pin) { + r = getenv_steal_erase("PIN", &pin); + if (r < 0) + return log_error_errno(r, "Failed to acquire PIN from environment: %m"); + if (r == 0) { + _cleanup_(strv_free_erasep) char **l 
= NULL; + + r = ask_password_auto( + "Recovery PIN", + /* icon= */ NULL, + /* id= */ "pcrlock-recovery-pin", + /* key_name= */ NULL, + /* credential_name= */ "systemd-pcrlock.recovery-pin", + /* until= */ 0, + /* flags= */ 0, + &l); + if (r < 0) + return log_error_errno(r, "Failed to query for recovery PIN: %m"); + + if (strv_length(l) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected a single PIN only."); + + pin = TAKE_PTR(l[0]); + l = mfree(l); + } + + } else if (!have_old_policy) { + char rnd[256]; + + r = crypto_random_bytes(rnd, sizeof(rnd)); + if (r < 0) + return log_error_errno(r, "Failed to generate a randomized recovery PIN: %m"); + + (void) base64mem(rnd, sizeof(rnd), &pin); + explicit_bzero_safe(rnd, sizeof(rnd)); + if (!pin) + return log_oom(); + } + + _cleanup_(tpm2_handle_freep) Tpm2Handle *nv_handle = NULL; + TPM2_HANDLE nv_index = 0; + + if (iovec_is_set(&nv_blob)) { + r = tpm2_deserialize(tc, nv_blob.iov_base, nv_blob.iov_len, &nv_handle); + if (r < 0) + return log_error_errno(r, "Failed to deserialize NV index TR: %m"); + + nv_index = old_policy.nv_index; + } + + TPM2B_AUTH auth = {}; + CLEANUP_ERASE(auth); + + if (pin) { + r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &auth); + if (r < 0) + return log_error_errno(r, "Failed to hash PIN: %m"); + } else { + assert(iovec_is_set(&pin_public)); + assert(iovec_is_set(&pin_private)); + + log_debug("Retrieving PIN from sealed data."); + + usec_t pin_start_usec = now(CLOCK_MONOTONIC); + + _cleanup_(iovec_done_erase) struct iovec secret = {}; + for (unsigned attempt = 0;; attempt++) { + _cleanup_(tpm2_handle_freep) Tpm2Handle *policy_session = NULL; + + r = tpm2_make_policy_session( + tc, + srk_handle, + encryption_session, + &policy_session); + if (r < 0) + return log_error_errno(r, "Failed to allocate policy session: %m"); + + r = tpm2_policy_super_pcr( + tc, + policy_session, + &old_policy.prediction, + old_policy.algorithm); + if (r < 0) + return log_error_errno(r, "Failed to submit 
super PCR policy: %m"); + + r = tpm2_policy_authorize_nv( + tc, + policy_session, + nv_handle, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to submit AuthorizeNV policy: %m"); + + r = tpm2_unseal_data( + tc, + &pin_public, + &pin_private, + srk_handle, + policy_session, + encryption_session, + &secret); + if (r < 0 && (r != -ESTALE || attempt >= 16)) + return log_error_errno(r, "Failed to unseal PIN: %m"); + if (r == 0) + break; + + log_debug("Trying again (attempt %u), as PCR values changed during unlock attempt.", attempt+1); + } + + if (secret.iov_len > sizeof_field(TPM2B_AUTH, buffer)) + return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "Decrypted PIN too large."); + + auth = (TPM2B_AUTH) { + .size = secret.iov_len, + }; + + memcpy_safe(auth.buffer, secret.iov_base, secret.iov_len); + + log_info("Retrieved PIN from TPM2 in %s.", FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), pin_start_usec), 1)); + } + + TPM2B_NV_PUBLIC nv_public = {}; + + usec_t nv_index_start_usec = now(CLOCK_MONOTONIC); + + if (!iovec_is_set(&nv_blob)) { + TPM2B_DIGEST recovery_policy_digest = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + r = tpm2_calculate_policy_auth_value(&recovery_policy_digest); + if (r < 0) + return log_error_errno(r, "Failed to calculate authentication value policy: %m"); + + log_debug("Allocating NV index to write PCR policy to..."); + r = tpm2_define_policy_nv_index( + tc, + encryption_session, + arg_nv_index, + &recovery_policy_digest, + pin, + &auth, + &nv_index, + &nv_handle, + &nv_public); + if (r == -EEXIST) + return log_error_errno(r, "NV index 0x%" PRIx32 " already allocated.", arg_nv_index); + if (r < 0) + return log_error_errno(r, "Failed to allocate NV index: %m"); + } + + r = tpm2_set_auth_binary(tc, nv_handle, &auth); + if (r < 0) + return log_error_errno(r, "Failed to set authentication value on NV index: %m"); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *policy_session = NULL; + r = tpm2_make_policy_session( + tc, + srk_handle, 
+ encryption_session, + &policy_session); + if (r < 0) + return log_error_errno(r, "Failed to allocate policy session: %m"); + + r = tpm2_policy_auth_value( + tc, + policy_session, + /* ret_policy_digest= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to submit authentication value policy: %m"); + + log_debug("Calculating new PCR policy to write..."); + TPM2B_DIGEST new_super_pcr_policy_digest = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + + usec_t pcr_policy_start_usec = now(CLOCK_MONOTONIC); + + r = tpm2_calculate_policy_super_pcr( + &new_prediction, + el->primary_algorithm, + &new_super_pcr_policy_digest); + if (r < 0) + return log_error_errno(r, "Failed to calculate super PCR policy: %m"); + + log_info("Calculated new PCR policy in %s.", FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), pcr_policy_start_usec), 1)); + + log_debug("Writing new PCR policy to NV index..."); + r = tpm2_write_policy_nv_index( + tc, + policy_session, + nv_index, + nv_handle, + &new_super_pcr_policy_digest); + if (r < 0) + return log_error_errno(r, "Failed to write to NV index: %m"); + + log_info("Updated NV index in %s.", FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), nv_index_start_usec), 1)); + + assert(iovec_is_set(&pin_public) == iovec_is_set(&pin_private)); + if (!iovec_is_set(&pin_public)) { + TPM2B_DIGEST authnv_policy_digest = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + + r = tpm2_calculate_policy_authorize_nv(&nv_public, &authnv_policy_digest); + if (r < 0) + return log_error_errno(r, "Failed to calculate AuthorizeNV policy: %m"); + + struct iovec data = { + .iov_base = auth.buffer, + .iov_len = auth.size, + }; + + usec_t pin_seal_start_usec = now(CLOCK_MONOTONIC); + + log_debug("Sealing PIN to NV index policy..."); + r = tpm2_seal_data( + tc, + &data, + srk_handle, + encryption_session, + &authnv_policy_digest, + &pin_public, + &pin_private); + if (r < 0) + return log_error_errno(r, "Failed to seal PIN to NV auth policy: %m"); + + 
log_info("Sealed PIN in %s.", FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), pin_seal_start_usec), 1)); + } + + if (!iovec_is_set(&nv_blob)) { + r = tpm2_serialize(tc, nv_handle, &nv_blob.iov_base, &nv_blob.iov_len); + if (r < 0) + return log_error_errno(r, "Failed to serialize NV index TR: %m"); + } + + if (!iovec_is_set(&srk_blob)) { + r = tpm2_serialize(tc, srk_handle, &srk_blob.iov_base, &srk_blob.iov_len); + if (r < 0) + return log_error_errno(r, "Failed to serialize SRK index TR: %m"); + } + + if (!iovec_is_set(&nv_public_blob)) { + r = tpm2_marshal_nv_public(&nv_public, &nv_public_blob.iov_base, &nv_public_blob.iov_len); + if (r < 0) + return log_error_errno(r, "Failed to marshal NV public area: %m"); + } + + _cleanup_(json_variant_unrefp) JsonVariant *new_configuration_json = NULL; + r = json_build(&new_configuration_json, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("pcrBank", tpm2_hash_alg_to_string(el->primary_algorithm)), + JSON_BUILD_PAIR_VARIANT("pcrValues", new_prediction_json), + JSON_BUILD_PAIR_INTEGER("nvIndex", nv_index), + JSON_BUILD_PAIR_IOVEC_BASE64("nvHandle", &nv_blob), + JSON_BUILD_PAIR_IOVEC_BASE64("nvPublic", &nv_public_blob), + JSON_BUILD_PAIR_IOVEC_BASE64("srkHandle", &srk_blob), + JSON_BUILD_PAIR_IOVEC_BASE64("pinPublic", &pin_public), + JSON_BUILD_PAIR_IOVEC_BASE64("pinPrivate", &pin_private))); + if (r < 0) + return log_error_errno(r, "Failed to generate JSON: %m"); + + _cleanup_free_ char *text = NULL; + r = json_variant_format(new_configuration_json, 0, &text); + if (r < 0) + return log_error_errno(r, "Failed to format new configuration to JSON: %m"); + + const char *path = arg_pcrlock_path ?: (in_initrd() ? 
"/run/systemd/pcrlock.json" : "/var/lib/systemd/pcrlock.json"); + r = write_string_file(path, text, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_SYNC|WRITE_STRING_FILE_MKDIR_0755); + if (r < 0) + return log_error_errno(r, "Failed to write new configuration to '%s': %m", path); + + if (!arg_pcrlock_path && !in_initrd()) { + r = remove_policy_file("/run/systemd/pcrlock.json"); + if (r < 0) + return r; + } + + log_info("Written new policy to '%s' and digest to TPM2 NV index 0x%x.", path, nv_index); + + log_info("Overall time spent: %s", FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), start_usec), 1)); + + return 0; +} + +static int undefine_policy_nv_index( + uint32_t nv_index, + const struct iovec *nv_blob, + const struct iovec *srk_blob) { + int r; + + assert(nv_blob); + assert(srk_blob); + + _cleanup_(tpm2_context_unrefp) Tpm2Context *tc = NULL; + r = tpm2_context_new(NULL, &tc); + if (r < 0) + return r; + + _cleanup_(tpm2_handle_freep) Tpm2Handle *srk_handle = NULL; + r = tpm2_deserialize( + tc, + srk_blob->iov_base, + srk_blob->iov_len, + &srk_handle); + if (r < 0) + return log_error_errno(r, "Failed to deserialize SRK TR: %m"); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *nv_handle = NULL; + r = tpm2_deserialize( + tc, + nv_blob->iov_base, + nv_blob->iov_len, + &nv_handle); + if (r < 0) + return log_error_errno(r, "Failed to deserialize NV TR: %m"); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL; + r = tpm2_make_encryption_session( + tc, + srk_handle, + /* bind_key= */ &TPM2_HANDLE_NONE, + &encryption_session); + if (r < 0) + return r; + + r = tpm2_undefine_policy_nv_index( + tc, + encryption_session, + nv_index, + nv_handle); + if (r < 0) + return r; + + log_info("Removed NV index 0x%x", nv_index); + return 0; +} + +static int verb_remove_policy(int argc, char *argv[], void *userdata) { + int r; + + _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy policy = {}; + r = 
tpm2_pcrlock_policy_load(arg_policy_path, &policy); + if (r == 0) { + log_info("No policy found."); + return 0; + } + + if (r < 0) + log_notice("Failed to load old policy file, assuming it is corrupted, removing."); + else { + r = undefine_policy_nv_index(policy.nv_index, &policy.nv_handle, &policy.srk_handle); + if (r < 0) + log_notice("Failed to remove NV index, assuming data out of date, removing policy file."); + } + + if (arg_policy_path) { + r = remove_policy_file(arg_policy_path); + if (r < 0) + return r; + + return 0; + } else { + int ret = 0; + + RET_GATHER(ret, remove_policy_file("/var/lib/systemd/pcrlock.json")); + RET_GATHER(ret, remove_policy_file("/run/systemd/pcrlock.json")); + + return ret; + } +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-pcrlock", "8", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND ...\n" + "\n%5$sManage a TPM2 PCR lock.%6$s\n" + "\n%3$sCommands:%4$s\n" + " log Show measurement log\n" + " cel Show measurement log in TCG CEL-JSON format\n" + " list-components List defined .pcrlock components\n" + " predict Predict PCR values\n" + " make-policy Predict PCR values and generate TPM2 policy from it\n" + " remove-policy Remove TPM2 policy\n" + "\n%3$sProtections:%4$s\n" + " lock-firmware-code Generate a .pcrlock file from current firmware code\n" + " unlock-firmware-code Remove .pcrlock file for firmware code\n" + " lock-firmware-config Generate a .pcrlock file from current firmware configuration\n" + " unlock-firmware-config Remove .pcrlock file for firmware configuration\n" + " lock-secureboot-policy Generate a .pcrlock file from current SecureBoot policy\n" + " unlock-secureboot-policy Remove .pcrlock file for SecureBoot policy\n" + " lock-secureboot-authority Generate a .pcrlock file from current SecureBoot authority\n" + " unlock-secureboot-authority Remove .pcrlock file for SecureBoot authority\n" + " 
lock-gpt [DISK] Generate a .pcrlock file from GPT header\n" + " unlock-gpt Remove .pcrlock file for GPT header\n" + " lock-pe [BINARY] Generate a .pcrlock file from PE binary\n" + " unlock-pe Remove .pcrlock file for PE binary\n" + " lock-uki [UKI] Generate a .pcrlock file from UKI PE binary\n" + " unlock-uki Remove .pcrlock file for UKI PE binary\n" + " lock-machine-id Generate a .pcrlock file from current machine ID\n" + " unlock-machine-id Remove .pcrlock file for machine ID\n" + " lock-file-system [PATH] Generate a .pcrlock file from current root fs + /var/\n" + " unlock-file-system [PATH] Remove .pcrlock file for root fs + /var/\n" + " lock-kernel-cmdline [FILE] Generate a .pcrlock file from kernel command line\n" + " unlock-kernel-cmdline Remove .pcrlock file for kernel command line\n" + " lock-kernel-initrd FILE Generate a .pcrlock file from an initrd file\n" + " unlock-kernel-initrd Remove .pcrlock file for an initrd file\n" + " lock-raw [FILE] Generate a .pcrlock file from raw data\n" + " unlock-raw Remove .pcrlock file for raw data\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Print version\n" + " --no-pager Do not pipe output into a pager\n" + " --json=pretty|short|off Generate JSON output\n" + " --raw-description Show raw firmware record data as description in table\n" + " --pcr=NR Generate .pcrlock for specified PCR\n" + " --nv-index=NUMBER Use the specified NV index, instead of a random one\n" + " --components=PATH Directory to read .pcrlock files from\n" + " --location=STRING[:STRING]\n" + " Do not process components beyond this component name\n" + " --recovery-pin=yes Ask for a recovery PIN\n" + " --pcrlock=PATH .pcrlock file to write expected PCR measurement to\n" + " --policy=PATH JSON file to write policy output to\n" + " --force Write policy even if it matches existing policy\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + 
ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_JSON, + ARG_RAW_DESCRIPTION, + ARG_PCR, + ARG_NV_INDEX, + ARG_COMPONENTS, + ARG_LOCATION, + ARG_RECOVERY_PIN, + ARG_PCRLOCK, + ARG_POLICY, + ARG_FORCE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "json", required_argument, NULL, ARG_JSON }, + { "raw-description", no_argument, NULL, ARG_RAW_DESCRIPTION }, + { "pcr", required_argument, NULL, ARG_PCR }, + { "nv-index", required_argument, NULL, ARG_NV_INDEX }, + { "components", required_argument, NULL, ARG_COMPONENTS }, + { "location", required_argument, NULL, ARG_LOCATION }, + { "recovery-pin", required_argument, NULL, ARG_RECOVERY_PIN }, + { "pcrlock", required_argument, NULL, ARG_PCRLOCK }, + { "policy", required_argument, NULL, ARG_POLICY }, + { "force", no_argument, NULL, ARG_FORCE }, + {} + }; + + bool auto_location = true; + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + help(0, NULL, NULL); + return 0; + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + break; + + case ARG_RAW_DESCRIPTION: + arg_raw_description = true; + break; + + case ARG_PCR: { + r = tpm2_parse_pcr_argument_to_mask(optarg, &arg_pcr_mask); + if (r < 0) + return log_error_errno(r, "Failed to parse PCR specification: %s", optarg); + + break; + } + + case ARG_NV_INDEX: + if (isempty(optarg)) + arg_nv_index = 0; + else { + uint32_t u; + + r = safe_atou32_full(optarg, 16, &u); + if (r < 0) + return log_error_errno(r, "Failed to parse --nv-index= argument: %s", optarg); + + if (u < TPM2_NV_INDEX_FIRST || u > 
TPM2_NV_INDEX_LAST) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Argument for --nv-index= outside of valid range 0x%" PRIx32 "…0x%" PRIx32 ": 0x%" PRIx32, + TPM2_NV_INDEX_FIRST, TPM2_NV_INDEX_LAST, u); + + arg_nv_index = u; + } + break; + + case ARG_COMPONENTS: { + _cleanup_free_ char *p = NULL; + + r = parse_path_argument(optarg, /* suppress_root= */ false, &p); + if (r < 0) + return r; + + r = strv_consume(&arg_components, TAKE_PTR(p)); + if (r < 0) + return log_oom(); + + break; + } + + case ARG_LOCATION: { + _cleanup_free_ char *start = NULL, *end = NULL; + const char *e; + + auto_location = false; + + if (isempty(optarg)) { + arg_location_start = mfree(arg_location_start); + arg_location_end = mfree(arg_location_end); + break; + } + + e = strchr(optarg, ':'); + if (e) { + start = strndup(optarg, e - optarg); + if (!start) + return log_oom(); + + end = strdup(e + 1); + if (!end) + return log_oom(); + } else { + start = strdup(optarg); + if (!start) + return log_oom(); + + end = strdup(optarg); + if (!end) + return log_oom(); + } + + if (!filename_is_valid(start)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Location string invalid, refusing: %s", start); + if (!filename_is_valid(end)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Location string invalid, refusing: %s", end); + + free_and_replace(arg_location_start, start); + free_and_replace(arg_location_end, end); + break; + } + + case ARG_RECOVERY_PIN: + r = parse_boolean_argument("--recovery-pin", optarg, &arg_recovery_pin); + if (r < 0) + return r; + break; + + case ARG_PCRLOCK: + if (isempty(optarg) || streq(optarg, "-")) + arg_pcrlock_path = mfree(arg_pcrlock_path); + else { + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_pcrlock_path); + if (r < 0) + return r; + } + + arg_pcrlock_auto = false; + break; + + case ARG_POLICY: + if (isempty(optarg) || streq(optarg, "-")) + arg_policy_path = mfree(arg_policy_path); + else { + r = parse_path_argument(optarg, /* 
suppress_root= */ false, &arg_policy_path); + if (r < 0) + return r; + } + + break; + + case ARG_FORCE: + arg_force = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (auto_location) { + assert(!arg_location_start); + assert(!arg_location_end); + + arg_location_start = strdup("760-"); + if (!arg_location_start) + return log_oom(); + + arg_location_end = strdup("940-"); + if (!arg_location_end) + return log_oom(); + } + + return 1; +} + +static int pcrlock_main(int argc, char *argv[]) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "log", VERB_ANY, 1, VERB_DEFAULT, verb_show_log }, + { "cel", VERB_ANY, 1, 0, verb_show_cel }, + { "list-components", VERB_ANY, 1, 0, verb_list_components }, + { "predict", VERB_ANY, 1, 0, verb_predict }, + { "lock-firmware-code", VERB_ANY, 2, 0, verb_lock_firmware }, + { "unlock-firmware-code", VERB_ANY, 1, 0, verb_unlock_firmware }, + { "lock-firmware-config", VERB_ANY, 2, 0, verb_lock_firmware }, + { "unlock-firmware-config", VERB_ANY, 1, 0, verb_unlock_firmware }, + { "lock-secureboot-policy", VERB_ANY, 1, 0, verb_lock_secureboot_policy }, + { "unlock-secureboot-policy", VERB_ANY, 1, 0, verb_unlock_secureboot_policy }, + { "lock-secureboot-authority", VERB_ANY, 1, 0, verb_lock_secureboot_authority }, + { "unlock-secureboot-authority", VERB_ANY, 1, 0, verb_unlock_secureboot_authority }, + { "lock-gpt", VERB_ANY, 2, 0, verb_lock_gpt }, + { "unlock-gpt", VERB_ANY, 1, 0, verb_unlock_gpt }, + { "lock-pe", VERB_ANY, 2, 0, verb_lock_pe }, + { "unlock-pe", VERB_ANY, 1, 0, verb_unlock_simple }, + { "lock-uki", VERB_ANY, 2, 0, verb_lock_uki }, + { "unlock-uki", VERB_ANY, 1, 0, verb_unlock_simple }, + { "lock-machine-id", VERB_ANY, 1, 0, verb_lock_machine_id }, + { "unlock-machine-id", VERB_ANY, 1, 0, verb_unlock_machine_id }, + { "lock-file-system", VERB_ANY, 2, 0, verb_lock_file_system }, + { "unlock-file-system", VERB_ANY, 2, 0, verb_unlock_file_system }, + { 
"lock-kernel-cmdline", VERB_ANY, 2, 0, verb_lock_kernel_cmdline }, + { "unlock-kernel-cmdline", VERB_ANY, 1, 0, verb_unlock_kernel_cmdline }, + { "lock-kernel-initrd", VERB_ANY, 2, 0, verb_lock_kernel_initrd }, + { "unlock-kernel-initrd", VERB_ANY, 1, 0, verb_unlock_kernel_initrd }, + { "lock-raw", VERB_ANY, 2, 0, verb_lock_raw }, + { "unlock-raw", VERB_ANY, 1, 0, verb_unlock_simple }, + { "make-policy", VERB_ANY, 1, 0, verb_make_policy }, + { "remove-policy", VERB_ANY, 1, 0, verb_remove_policy }, + {} + }; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +static int run(int argc, char *argv[]) { + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return pcrlock_main(argc, argv); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/pcrlock/pcrlock.d/350-action-efi-application.pcrlock b/src/pcrlock/pcrlock.d/350-action-efi-application.pcrlock new file mode 100644 index 0000000..2baaa9c --- /dev/null +++ b/src/pcrlock/pcrlock.d/350-action-efi-application.pcrlock @@ -0,0 +1 @@ +{"records":[{"pcr":4,"digests":[{"hashAlg":"sha1","digest":"cd0fdb4531a6ec41be2753ba042637d6e5f7f256"},{"hashAlg":"sha256","digest":"3d6772b4f84ed47595d72a2c4c5ffd15f5bb72c7507fe26f2aaee2c69d5633ba"},{"hashAlg":"sha384","digest":"77a0dab2312b4e1e57a84d865a21e5b2ee8d677a21012ada819d0a98988078d3d740f6346bfe0abaa938ca20439a8d71"},{"hashAlg":"sha512","digest":"03020279c5ea3676d6630c82a9931343225e8eab81529b65c786aeb6a445d3852a34dd193178f938b6b47345a72d4b647df309c971f7c02f0ede296a136a1086"}]}]} diff --git a/src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/300-0x00000000.pcrlock b/src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/300-0x00000000.pcrlock new file mode 100644 index 0000000..c577c98 --- /dev/null +++ b/src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/300-0x00000000.pcrlock @@ -0,0 +1 @@ 
+{"records":[{"pcr":7,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]}]} diff --git a/src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/600-0xffffffff.pcrlock b/src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/600-0xffffffff.pcrlock new file mode 100644 index 0000000..2e86898 --- /dev/null +++ b/src/pcrlock/pcrlock.d/400-secureboot-separator.pcrlock.d/600-0xffffffff.pcrlock @@ -0,0 +1 @@ +{"records":[{"pcr":7,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]}]} diff --git a/src/pcrlock/pcrlock.d/500-separator.pcrlock.d/300-0x00000000.pcrlock b/src/pcrlock/pcrlock.d/500-separator.pcrlock.d/300-0x00000000.pcrlock new file mode 100644 index 0000000..f1e473f --- /dev/null +++ b/src/pcrlock/pcrlock.d/500-separator.pcrlock.d/300-0x00000000.pcrlock @@ -0,0 +1 @@ 
+{"records":[{"pcr":0,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]},{"pcr":1,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]},{"pcr":2,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]},{"pcr":3,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]},{"pcr":4,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a28517
3431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]},{"pcr":5,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]},{"pcr":6,"digests":[{"hashAlg":"sha1","digest":"9069ca78e7450a285173431b3e52c5c25299e473"},{"hashAlg":"sha256","digest":"df3f619804a92fdb4057192dc43dd748ea778adc52bc498ce80524c014b81119"},{"hashAlg":"sha384","digest":"394341b7182cd227c5c6b07ef8000cdfd86136c4292b8e576573ad7ed9ae41019f5818b4b971c9effc60e1ad9f1289f0"},{"hashAlg":"sha512","digest":"ec2d57691d9b2d40182ac565032054b7d784ba96b18bcb5be0bb4e70e3fb041eff582c8af66ee50256539f2181d7f9e53627c0189da7e75a4d5ef10ea93b20b3"}]}]} diff --git a/src/pcrlock/pcrlock.d/500-separator.pcrlock.d/600-0xffffffff.pcrlock b/src/pcrlock/pcrlock.d/500-separator.pcrlock.d/600-0xffffffff.pcrlock new file mode 100644 index 0000000..0b8d20b --- /dev/null +++ b/src/pcrlock/pcrlock.d/500-separator.pcrlock.d/600-0xffffffff.pcrlock @@ -0,0 +1 @@ 
+{"records":[{"pcr":0,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]},{"pcr":1,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]},{"pcr":2,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]},{"pcr":3,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]},{"pcr":4,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db58
66813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]},{"pcr":5,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]},{"pcr":6,"digests":[{"hashAlg":"sha1","digest":"d9be6524a5f5047db5866813acf3277892a7a30a"},{"hashAlg":"sha256","digest":"ad95131bc0b799c0b1af477fb14fcf26a6a9f76079e48bf090acb7e8367bfd0e"},{"hashAlg":"sha384","digest":"4a06b879c7eedbe01c945d46b5bd785b59203dce81ea6a1206c28091ca285365f760d9167778f0dc1763d4854aafd40a"},{"hashAlg":"sha512","digest":"ea71bb243b0b2db729b9eb88e3c55a3f490fbff23457825051224a1fe6e6d3f480590cfa3a4a6b12c622d6ac366feb03cd17004ed004cb3f0d52731626946679"}]}]} diff --git a/src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/300-present.pcrlock b/src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/300-present.pcrlock new file mode 100644 index 0000000..d7012df --- /dev/null +++ b/src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/300-present.pcrlock @@ -0,0 +1 @@ 
+{"records":[{"pcr":5,"digests":[{"hashAlg":"sha1","digest":"443a6b7b82b7af564f2e393cd9d5a388b7fa4a98"},{"hashAlg":"sha256","digest":"d8043d6b7b85ad358eb3b6ae6a873ab7ef23a26352c5dc4faa5aeedacf5eb41b"},{"hashAlg":"sha384","digest":"214b0bef1379756011344877743fdc2a5382bac6e70362d624ccf3f654407c1b4badf7d8f9295dd3dabdef65b27677e0"},{"hashAlg":"sha512","digest":"0fed3a4c9552021436534d27f3adb481e22b50b29e4b37a63f518540a651a174f149b69f500b0bdb2cb3bf4e0e21e0781451090af33e88f6bee4cbebd15c1668"}]},{"pcr":5,"digests":[{"hashAlg":"sha1","digest":"475545ddc978d7bfd036facc7e2e987f48189f0d"},{"hashAlg":"sha256","digest":"b54f7542cbd872a81a9d9dea839b2b8d747c7ebd5ea6615c40f42f44a6dbeba0"},{"hashAlg":"sha384","digest":"0a2e01c85deae718a530ad8c6d20a84009babe6c8989269e950d8cf440c6e997695e64d455c4174a652cd080f6230b74"},{"hashAlg":"sha512","digest":"1bb30cdbd6da78fe2a8a161ef51176e22d64dce305b40b47243673af64a2b16fca6182116433e3891be94773f6d7d411275721d5bf7d40ea51a274d5c891637c"}]}]} diff --git a/src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/600-absent.pcrlock b/src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/600-absent.pcrlock new file mode 100644 index 0000000..a16142b --- /dev/null +++ b/src/pcrlock/pcrlock.d/700-action-efi-exit-boot-services.pcrlock.d/600-absent.pcrlock @@ -0,0 +1 @@ +{"records":[]} diff --git a/src/pcrlock/pcrlock.d/750-enter-initrd.pcrlock b/src/pcrlock/pcrlock.d/750-enter-initrd.pcrlock new file mode 100644 index 0000000..a2332dc --- /dev/null +++ b/src/pcrlock/pcrlock.d/750-enter-initrd.pcrlock @@ -0,0 +1 @@ 
+{"records":[{"pcr":11,"digests":[{"hashAlg":"sha1","digest":"b1b01d5f73f321eb70e76f8a0e241ac0a3fa4a6e"},{"hashAlg":"sha256","digest":"51e6b92f405d1f98d96e3de343d61d420ad6923b25de21d766f9298192f14fed"},{"hashAlg":"sha384","digest":"687eef3a3a8c716439b5ed583657e8668401630c321f2f35d19b953ddf20b68a96474d0c2e5f0e1757bfa5ba70b9fc32"},{"hashAlg":"sha512","digest":"ab0ddfdabe43f1d06b3e58fbe17439a0f7f552e9e228d85665d485ececf7e733bae4cd7e0a17e5456e2ee7e412f5a0f37de05a782cce781e173ee26958de7f30"}]}]} diff --git a/src/pcrlock/pcrlock.d/800-leave-initrd.pcrlock b/src/pcrlock/pcrlock.d/800-leave-initrd.pcrlock new file mode 100644 index 0000000..bd8f436 --- /dev/null +++ b/src/pcrlock/pcrlock.d/800-leave-initrd.pcrlock @@ -0,0 +1 @@ +{"records":[{"pcr":11,"digests":[{"hashAlg":"sha1","digest":"865e1ff2cc5b8db815313b23fe3d8b561212f5d1"},{"hashAlg":"sha256","digest":"3be261aff7db92bf507eae947f4003ffa2bcad0bffe3524601d62d0bc8be7135"},{"hashAlg":"sha384","digest":"9c0743b7a2e1ee06c70b7137b763cd2205c26ced274149959b05bd5a51bfa96b4fedaa4f87398b5c88986d1ff0879910"},{"hashAlg":"sha512","digest":"01b8ca86b9f8fac967f383380aff7cdffd2ef0c496574517c25398f7c74aa611821dd469ba021b2aa9b9a7232865708ca45c79368f2e7fffda3dd6b308264008"}]}]} diff --git a/src/pcrlock/pcrlock.d/850-sysinit.pcrlock b/src/pcrlock/pcrlock.d/850-sysinit.pcrlock new file mode 100644 index 0000000..3bae445 --- /dev/null +++ b/src/pcrlock/pcrlock.d/850-sysinit.pcrlock @@ -0,0 +1 @@ +{"records":[{"pcr":11,"digests":[{"hashAlg":"sha1","digest":"aeabcf402223916e804cce79778a55d5a9276983"},{"hashAlg":"sha256","digest":"730bb5a583ba880c277e656d2dc8aba1a314a11b14d25b05153d2bab82567a48"},{"hashAlg":"sha384","digest":"955cc8939f81d862b3119aabe612fd36bf91668bb62397f5e4126085d79ba6d7cbfa4e3a2345747f0b476ce4b1cbc2c9"},{"hashAlg":"sha512","digest":"a9eb62cdd1cd8292b6325a8ee3770d6f1b613426a749e17ffba8f90bdd6c41806468fb79d01276de7cc791877dfebae165d4ed07585154acf96652c6db92acc1"}]}]} diff --git a/src/pcrlock/pcrlock.d/900-ready.pcrlock 
b/src/pcrlock/pcrlock.d/900-ready.pcrlock new file mode 100644 index 0000000..9a0e82f --- /dev/null +++ b/src/pcrlock/pcrlock.d/900-ready.pcrlock @@ -0,0 +1 @@ +{"records":[{"pcr":11,"digests":[{"hashAlg":"sha1","digest":"75c0533730caf1f78561c0883fb87bc8d98ef04b"},{"hashAlg":"sha256","digest":"b24d6d33736ecd5604a4b17bc9c6481039fac362bb7df044ef1c10a2bfd21db6"},{"hashAlg":"sha384","digest":"23ed5781da39fe6dc17f79478aeeb9eb2bca1d776061da188e10f9c85f7933fb39cfdba50f39af8aed24e5b45b80d006"},{"hashAlg":"sha512","digest":"ca6616f94a209e53f6fdc526b473172eb4b2157cf4809c31e36ad52db614ed352e68407be53c238ba17a561c4fde43f4a859aa8711f9781a0c934296d4d7571b"}]}]} diff --git a/src/pcrlock/pcrlock.d/950-shutdown.pcrlock b/src/pcrlock/pcrlock.d/950-shutdown.pcrlock new file mode 100644 index 0000000..1bc3f76 --- /dev/null +++ b/src/pcrlock/pcrlock.d/950-shutdown.pcrlock @@ -0,0 +1 @@ +{"records":[{"pcr":11,"digests":[{"hashAlg":"sha1","digest":"53669f193b2174641c72654b5c3e5b67950334ae"},{"hashAlg":"sha256","digest":"08434ba9cdf55a02284e2913400586cd289878e0f055f7bb0b07ce392caeb989"},{"hashAlg":"sha384","digest":"186e2d6603b9755221b7ef894dd52b1154b48ef4786aec06ab6f7709e639715e89bd59fa80736bb45f0ca88583c212c1"},{"hashAlg":"sha512","digest":"9e5549deb36fc48768cb80e03bc91c36cf549ff5921e05bab5b68faefda7fac8c8a0755db783cbf1c1b98c80dc22ef06ff3f4a0a16704749f5cd4acf40e42a94"}]}]} diff --git a/src/pcrlock/pcrlock.d/990-final.pcrlock b/src/pcrlock/pcrlock.d/990-final.pcrlock new file mode 100644 index 0000000..77081ae --- /dev/null +++ b/src/pcrlock/pcrlock.d/990-final.pcrlock @@ -0,0 +1 @@ 
+{"records":[{"pcr":11,"digests":[{"hashAlg":"sha1","digest":"d594c2cc0a53025004791399d80e20852af4c988"},{"hashAlg":"sha256","digest":"2443630b4620165c8b173e7265e17526fe2787ae594364dd6d839ad58f2fc007"},{"hashAlg":"sha384","digest":"90697eec39ed47f2b7ed278aa6fe6a1c073fcc7f3af54299fb95ac8a18c771acbac71e25b5a5639554943bfdfab76737"},{"hashAlg":"sha512","digest":"b3d9598ca0aa5da28be1c97a45d53cc5c72a80e61c439c8bf3e89c5c0661f49df8fa34019a21cd5e31261ae3a3a87ef4592d8010aad6a5ecdc9dbaae38cd1470"}]}]} diff --git a/src/pcrlock/pehash.c b/src/pcrlock/pehash.c new file mode 100644 index 0000000..06d1f6a --- /dev/null +++ b/src/pcrlock/pehash.c @@ -0,0 +1,246 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/stat.h>
#include <unistd.h>

#include "alloc-util.h"
#include "hexdecoct.h"
#include "pe-binary.h"
#include "pehash.h"
#include "sort-util.h"
#include "stat-util.h"
#include "string-table.h"

/* Implements:
 *
 * https://download.microsoft.com/download/9/c/5/9c5b2167-8017-4bae-9fde-d599bac8184a/authenticode_pe.docx
 * → Section "Calculating the PE Image Hash"
 */

#define IMAGE_DATA_DIRECTORY_INDEX_CERTIFICATION_TABLE 4U

/* Feeds exactly 'size' bytes of the file behind 'fd', starting at absolute position 'offset', into the
 * digest context 'md_ctx'. The data is read with pread() in chunks of at most sizeof(buffer) (64K).
 *
 * Returns 0 once all bytes were hashed. A failing pread(), a read that delivers fewer bytes than asked
 * for, and a failing EVP_DigestUpdate() each abort the loop and return the value produced by
 * log_debug_errno() (callers in this file treat values < 0 as failure). */
static int hash_file(int fd, EVP_MD_CTX *md_ctx, uint64_t offset, uint64_t size) {
        uint8_t buffer[64*1024];

        log_debug("Hashing %" PRIu64 " @ %" PRIu64 " → %" PRIu64, size, offset, offset + size);

        while (size > 0) {
                size_t m = MIN(size, sizeof(buffer));
                ssize_t n;

                n = pread(fd, buffer, m, offset);
                if (n < 0)
                        return log_debug_errno(errno, "Failed to read file for hashing: %m");
                /* A short read means the requested range is not fully backed by the file: refuse
                 * instead of retrying. */
                if ((size_t) n != m)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while hashing.");

                if (EVP_DigestUpdate(md_ctx, buffer, m) != 1)
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Unable to hash data.");

                offset += m;
                size -= m;
        }

        return 0;
}

/* Comparison callback for typesafe_qsort(): orders section headers by their PointerToRawData field,
 * i.e. by where their raw data starts in the file. Both arguments must be non-NULL. */
static int section_offset_cmp(const IMAGE_SECTION_HEADER *a, const IMAGE_SECTION_HEADER *b) {
        return CMP(ASSERT_PTR(a)->PointerToRawData, ASSERT_PTR(b)->PointerToRawData);
}

/* Calculates the PE image hash of the regular file 'fd' with digest algorithm 'md', following the
 * procedure referenced at the top of this file. Hashed, in this order:
 *
 *   1. everything from the start of the file up to the CheckSum field of the PE header,
 *   2. everything after the CheckSum field up to the data directory entry of the certification table,
 *   3. everything after that entry up to SizeOfHeaders,
 *   4. the raw data of every section, in order of file offset,
 *   5. whatever follows the hashed data, minus the trailing certificate table.
 *
 * On success returns 0, stores a newly allocated buffer holding the digest in *ret_hash (ownership passes
 * to the caller) and its length in *ret_hash_size. On failure the return parameters are left untouched
 * and the error from the failing step is returned. */
int pe_hash(int fd,
            const EVP_MD *md,
            void **ret_hash,
            size_t *ret_hash_size) {

        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *mdctx = NULL;
        _cleanup_free_ IMAGE_SECTION_HEADER *sections = NULL;
        _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL;
        _cleanup_free_ PeHeader *pe_header = NULL;
        const IMAGE_DATA_DIRECTORY *certificate_table;
        struct stat st;
        uint64_t p, q;
        int r;

        assert(fd >= 0);
        assert(md);
        assert(ret_hash_size);
        assert(ret_hash);

        if (fstat(fd, &st) < 0)
                return log_debug_errno(errno, "Failed to stat file: %m");
        r = stat_verify_regular(&st);
        if (r < 0)
                return log_debug_errno(r, "Not a regular file: %m");

        r = pe_load_headers(fd, &dos_header, &pe_header);
        if (r < 0)
                return r;

        r = pe_load_sections(fd, dos_header, pe_header, &sections);
        if (r < 0)
                return r;

        certificate_table = pe_header_get_data_directory(pe_header, IMAGE_DATA_DIRECTORY_INDEX_CERTIFICATION_TABLE);
        if (!certificate_table)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "File lacks certificate table.");

        mdctx = EVP_MD_CTX_new();
        if (!mdctx)
                return log_oom_debug();

        if (EVP_DigestInit_ex(mdctx, md, NULL) != 1)
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to allocate message digest.");

        /* Everything from beginning of file to CheckSum field in PE header */
        p = (uint64_t) dos_header->e_lfanew +
                offsetof(PeHeader, optional.CheckSum);
        r = hash_file(fd, mdctx, 0, p);
        if (r < 0)
                return r;
        p += sizeof(le32_t); /* skip the CheckSum field itself */

        /* Everything between the CheckSum field and the Image Data Directory Entry for the Certification Table */
        q = (uint64_t) dos_header->e_lfanew +
                PE_HEADER_OPTIONAL_FIELD_OFFSET(pe_header, DataDirectory[IMAGE_DATA_DIRECTORY_INDEX_CERTIFICATION_TABLE]);
        r = hash_file(fd, mdctx, p, q - p);
        if (r < 0)
                return r;
        q += sizeof(IMAGE_DATA_DIRECTORY); /* skip the certification table's directory entry */

        /* The rest of the header + the section table */
        p = pe_header->optional.SizeOfHeaders;
        if (p < q)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "SizeOfHeaders too short.");
        r = hash_file(fd, mdctx, q, p - q);
        if (r < 0)
                return r;

        /* Sort by location in file */
        typesafe_qsort(sections, pe_header->pe.NumberOfSections, section_offset_cmp);

        /* From here on 'p' counts the bytes accounted for so far: it starts out as SizeOfHeaders and grows
         * by the raw size of every section hashed. */
        FOREACH_ARRAY(section, sections, pe_header->pe.NumberOfSections) {
                r = hash_file(fd, mdctx, section->PointerToRawData, section->SizeOfRawData);
                if (r < 0)
                        return r;

                p += section->SizeOfRawData;
        }

        /* Trailing data after the accounted-for bytes: hash it, except for the certificate table at the
         * very end of the file. */
        if ((uint64_t) st.st_size > p) {

                /* No call has failed at this point, hence errno carries no meaningful value here. Report
                 * the malformed file with an explicit error code, like the other format checks above. */
                if (st.st_size - p < certificate_table->Size)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "No space for certificate table, refusing.");

                r = hash_file(fd, mdctx, p, st.st_size - p - certificate_table->Size);
                if (r < 0)
                        return r;
        }

        int hsz = EVP_MD_CTX_size(mdctx);
        if (hsz < 0)
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to get hash size.");

        unsigned hash_size = (unsigned) hsz;
        _cleanup_free_ void *hash = malloc(hsz);
        if (!hash)
                return log_oom_debug();

        if (EVP_DigestFinal_ex(mdctx, hash, &hash_size) != 1)
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to finalize hash function.");

        assert(hash_size == (unsigned) hsz);

        *ret_hash = TAKE_PTR(hash);
        *ret_hash_size = hash_size;

        return 0;
}

/* One digest buffer pointer per unified section index; an entry is NULL if no digest was stored for it. */
typedef void* SectionHashArray[_UNIFIED_SECTION_MAX];

/* Cleanup handler for SectionHashArray: frees every entry. Entries are not reset afterwards. */
static void section_hash_array_done(SectionHashArray *array) {
        assert(array);

        for (size_t i = 0; i < _UNIFIED_SECTION_MAX; i++)
                free((*array)[i]);
}

/* Calculates one digest (algorithm 'md') per recognized section of the PE file 'fd'. A section is
 * recognized if its name is found in the unified_sections string table; all others are skipped.
 *
 * On success returns 0, copies the digest pointers into ret_hashes[] (indexed by the string table index;
 * entries of sections not present are NULL; ownership of the buffers passes to the caller) and stores the
 * common digest length in *ret_hash_size. If the same recognized section name appears twice the file is
 * refused. On failure ret_hashes[] is not written and all buffers allocated so far are released. */
int uki_hash(int fd,
             const EVP_MD *md,
             void* ret_hashes[static _UNIFIED_SECTION_MAX],
             size_t *ret_hash_size) {

        _cleanup_(section_hash_array_done) SectionHashArray hashes = {};
        _cleanup_free_ IMAGE_SECTION_HEADER *sections = NULL;
        _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL;
        _cleanup_free_ PeHeader *pe_header = NULL;
        int r;

        assert(fd >= 0);
        assert(md);
        assert(ret_hashes);
        assert(ret_hash_size);

        r = pe_load_headers(fd, &dos_header, &pe_header);
        if (r < 0)
                return r;

        r = pe_load_sections(fd, dos_header, pe_header, &sections);
        if (r < 0)
                return r;

        int hsz = EVP_MD_size(md);
        if (hsz < 0)
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to get hash size.");

        FOREACH_ARRAY(section, sections, pe_header->pe.NumberOfSections) {
                _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *mdctx = NULL;
                _cleanup_free_ char *n = NULL;
                ssize_t i;

                /* The Name field is a fixed-size array; make a NUL-terminated copy for the lookup. */
                n = memdup_suffix0(section->Name, sizeof(section->Name));
                if (!n)
                        return log_oom_debug();

                i = string_table_lookup(unified_sections, _UNIFIED_SECTION_MAX, n);
                if (i < 0)
                        continue;

                if (hashes[i])
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Duplicate section");

                mdctx = EVP_MD_CTX_new();
                if (!mdctx)
                        return log_oom_debug();

                if (EVP_DigestInit_ex(mdctx, md, NULL) != 1)
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to allocate message digest.");

                /* NOTE(review): this reads VirtualSize bytes from the file, whereas pe_hash() above uses
                 * SizeOfRawData. If VirtualSize exceeds the raw data stored for the section, the read
                 * extends past it — confirm this is the intended behaviour. */
                r = hash_file(fd, mdctx, section->PointerToRawData, section->VirtualSize);
                if (r < 0)
                        return r;

                hashes[i] = malloc(hsz);
                if (!hashes[i])
                        return log_oom_debug();

                unsigned hash_size = (unsigned) hsz;
                if (EVP_DigestFinal_ex(mdctx, hashes[i], &hash_size) != 1)
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to finalize hash function.");

                assert(hash_size == (unsigned) hsz);

                if (DEBUG_LOGGING) {
                        _cleanup_free_ char *hs = NULL;

                        hs = hexmem(hashes[i], hsz);
                        log_debug("Section %s with %s is %s.", n, EVP_MD_name(md), strna(hs));
                }
        }

        /* Hand all buffers to the caller, then clear the local array so that the cleanup handler does
         * not free what the caller now owns. */
        memcpy(ret_hashes, hashes, sizeof(hashes));
        zero(hashes);
        *ret_hash_size = (unsigned) hsz;

        return 0;
}
diff --git a/src/pcrlock/pehash.h b/src/pcrlock/pehash.h new file mode 100644 index 0000000..26f2fb1 --- /dev/null +++ b/src/pcrlock/pehash.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "openssl-util.h" +#include "uki.h" + +int pe_hash(int fd, const EVP_MD *md, void **ret_hash, size_t *ret_hash_size); + +int uki_hash(int fd,
const EVP_MD *md, void *ret_hashes[static _UNIFIED_SECTION_MAX], size_t *ret_hash_size); diff --git a/src/portable/meson.build b/src/portable/meson.build new file mode 100644 index 0000000..210829b --- /dev/null +++ b/src/portable/meson.build @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_portabled_sources = files( + 'portable.c', + 'portabled-bus.c', + 'portabled-image-bus.c', + 'portabled-image.c', + 'portabled-operation.c', + 'portabled.c', +) + +if get_option('link-portabled-shared') + portabled_link_with = [libshared] +else + portabled_link_with = [ + libshared_static, + libsystemd_static, + ] +endif + +executables += [ + libexec_template + { + 'name' : 'systemd-portabled', + 'dbus' : true, + 'conditions' : ['ENABLE_PORTABLED'], + 'sources' : systemd_portabled_sources, + 'link_with' : portabled_link_with, + 'dependencies' : [ + libselinux, + threads, + ], + }, + executable_template + { + 'name' : 'portablectl', + 'public' : true, + 'conditions' : ['ENABLE_PORTABLED'], + 'sources' : files('portablectl.c'), + 'link_with' : portabled_link_with, + 'dependencies' : threads, + }, +] + +if conf.get('ENABLE_PORTABLED') == 1 + install_data('org.freedesktop.portable1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.portable1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.portable1.policy', + install_dir : polkitpolicydir) + + install_data('profile/default/service.conf', install_dir : profiledir / 'default') + install_data('profile/nonetwork/service.conf', install_dir : profiledir / 'nonetwork') + install_data('profile/strict/service.conf', install_dir : profiledir / 'strict') + install_data('profile/trusted/service.conf', install_dir : profiledir / 'trusted') +endif diff --git a/src/portable/org.freedesktop.portable1.conf b/src/portable/org.freedesktop.portable1.conf new file mode 100644 index 0000000..4899305 --- /dev/null +++ b/src/portable/org.freedesktop.portable1.conf @@ -0,0 +1,125 @@ 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/portable/org.freedesktop.portable1.policy b/src/portable/org.freedesktop.portable1.policy new file mode 100644 index 0000000..a26b00f --- /dev/null +++ b/src/portable/org.freedesktop.portable1.policy @@ -0,0 +1,43 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Inspect a portable service image + Authentication is required to inspect a portable service image. + + auth_admin + auth_admin + auth_admin_keep + + + + + Attach or detach a portable service image + Authentication is required to attach or detach a portable service image. + + auth_admin + auth_admin + auth_admin_keep + + org.freedesktop.systemd1.reload-daemon + + + + Delete or modify portable service image + Authentication is required to delete or modify a portable service image. + + auth_admin + auth_admin + auth_admin_keep + + + + diff --git a/src/portable/org.freedesktop.portable1.service b/src/portable/org.freedesktop.portable1.service new file mode 100644 index 0000000..873746e --- /dev/null +++ b/src/portable/org.freedesktop.portable1.service @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +[D-BUS Service] +Name=org.freedesktop.portable1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.portable1.service diff --git a/src/portable/portable.c b/src/portable/portable.c new file mode 100644 index 0000000..6054f0f --- /dev/null +++ b/src/portable/portable.c @@ -0,0 +1,2105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-messages.h" + +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "chase.h" +#include "conf-files.h" +#include "copy.h" +#include "data-fd-util.h" +#include "constants.h" +#include "dirent-util.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "env-file.h" +#include "env-util.h" 
+#include "errno-list.h" +#include "escape.h" +#include "extension-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "install.h" +#include "iovec-util.h" +#include "locale-util.h" +#include "loop-util.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "path-lookup.h" +#include "portable.h" +#include "process-util.h" +#include "selinux-util.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "sort-util.h" +#include "string-table.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "user-util.h" + +/* Markers used in the first line of our 20-portable.conf unit file drop-in to determine, that a) the unit file was + * dropped there by the portable service logic and b) for which image it was dropped there. */ +#define PORTABLE_DROPIN_MARKER_BEGIN "# Drop-in created for image '" +#define PORTABLE_DROPIN_MARKER_END "', do not edit." + +static bool prefix_match(const char *unit, const char *prefix) { /* Returns true if 'unit' starts with 'prefix' and the character right after the prefix is '-', '.', '@' or the end of the string. */ + const char *p; + + p = startswith(unit, prefix); + if (!p) + return false; + + /* Only respect prefixes followed by dash, dot or at sign, or when there's a complete match */ + return IN_SET(*p, '-', '.', '@', 0); +} + +static bool unit_match(const char *unit, char **matches) { /* Returns true if 'unit' ends in one of the suffixes listed below and, unless 'matches' is empty, at least one entry of 'matches' satisfies prefix_match(). Names without any '.' never match. */ + const char *dot; + + dot = strrchr(unit, '.'); + if (!dot) + return false; + + if (!STR_IN_SET(dot, ".service", ".socket", ".target", ".timer", ".path")) + return false; + + /* Empty match expression means: everything */ + if (strv_isempty(matches)) + return true; + + /* Otherwise, at least one needs to match */ + STRV_FOREACH(i, matches) + if (prefix_match(unit, *i)) + return true; + + return false; +} + +static PortableMetadata *portable_metadata_new(const char *name, const char *path, const char *selinux_label, int fd) { /* Allocates, via malloc0(), a PortableMetadata sized to hold 'name' inline at its end; returns NULL if an allocation fails. */ + PortableMetadata *m; + + m = malloc0(offsetof(PortableMetadata, name) + strlen(name) + 1); + if (!m) + return NULL; + + /* In case of a layered attach, we want to remember which image the 
unit came from */ + if (path) { + m->image_path = strdup(path); + if (!m->image_path) + return mfree(m); + } + + /* The metadata file might have SELinux labels, we need to carry them and reapply them */ + if (!isempty(selinux_label)) { + m->selinux_label = strdup(selinux_label); + if (!m->selinux_label) { + free(m->image_path); + return mfree(m); + } + } + + strcpy(m->name, name); + m->fd = fd; + + return TAKE_PTR(m); +} + +PortableMetadata *portable_metadata_unref(PortableMetadata *i) { /* Releases 'i': closes its fd and frees its source, image_path and selinux_label strings before freeing the object itself. A NULL argument is accepted and yields NULL; otherwise the return value is whatever mfree() returns (presumably NULL; confirm against alloc-util.h). */ + if (!i) + return NULL; + + safe_close(i->fd); + free(i->source); + free(i->image_path); + free(i->selinux_label); + + return mfree(i); +} + +static int compare_metadata(PortableMetadata *const *x, PortableMetadata *const *y) { /* Comparator ordering two entries by strcmp() of their names. */ + return strcmp((*x)->name, (*y)->name); +} + +int portable_metadata_hashmap_to_sorted_array(Hashmap *unit_files, PortableMetadata ***ret) { /* Stores in *ret a newly allocated array holding every item of 'unit_files', sorted with compare_metadata(). The item pointers are copied as they are, not duplicated. Returns 0 on success, -ENOMEM if the array cannot be allocated. */ + + _cleanup_free_ PortableMetadata **sorted = NULL; + PortableMetadata *item; + size_t k = 0; + + sorted = new(PortableMetadata*, hashmap_size(unit_files)); + if (!sorted) + return -ENOMEM; + + HASHMAP_FOREACH(item, unit_files) + sorted[k++] = item; + + assert(k == hashmap_size(unit_files)); + + typesafe_qsort(sorted, k, compare_metadata); + + *ret = TAKE_PTR(sorted); + return 0; +} + +static int send_one_fd_iov_with_data_fd( + int socket_fd, + const struct iovec *iov, + size_t iovlen, + int fd) { /* Sends 'iov' over 'socket_fd' together with the descriptor returned by copy_data_fd(fd) instead of 'fd' itself; that descriptor is held in a _cleanup_close_ variable. Returns the negative copy_data_fd() result on failure, otherwise the result of send_one_fd_iov(). */ + + _cleanup_close_ int data_fd = -EBADF; + + assert(iov || iovlen == 0); + assert(socket_fd >= 0); + assert(fd >= 0); + + data_fd = copy_data_fd(fd); + if (data_fd < 0) + return data_fd; + + return send_one_fd_iov(socket_fd, data_fd, iov, iovlen, 0); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(portable_metadata_hash_ops, char, string_hash_func, string_compare_func, + PortableMetadata, portable_metadata_unref); + +static int extract_now( + const char *where, + char **matches, + const char *image_name, + bool path_is_extension, + bool relax_extension_release_check, + int socket_fd, + PortableMetadata **ret_os_release, 
+ Hashmap **ret_unit_files) { + + _cleanup_hashmap_free_ Hashmap *unit_files = NULL; + _cleanup_(portable_metadata_unrefp) PortableMetadata *os_release = NULL; + _cleanup_(lookup_paths_free) LookupPaths paths = {}; + _cleanup_close_ int os_release_fd = -EBADF; + _cleanup_free_ char *os_release_path = NULL; + const char *os_release_id; + int r; + + /* Extracts the metadata from a directory tree 'where'. Extracts two kinds of information: the /etc/os-release + * data, and all unit files matching the specified expression. Note that this function is called in two very + * different but also similar contexts. When the tool gets invoked on a directory tree, we'll process it + * directly, and in-process, and thus can return the requested data directly, via 'ret_os_release' and + * 'ret_unit_files'. However, if the tool is invoked on a raw disk image — which needs to be mounted first — we + * are invoked in a child process with private mounts and then need to send the collected data to our + * parent. To handle both cases in one call this function also gets a 'socket_fd' parameter, which when >= 0 is + * used to send the data to the parent. */ + + assert(where); + + /* First, find os-release/extension-release and send it upstream (or just save it). */ + if (path_is_extension) { + ImageClass class = IMAGE_SYSEXT; + + r = open_extension_release(where, IMAGE_SYSEXT, image_name, relax_extension_release_check, &os_release_path, &os_release_fd); + if (r == -ENOENT) { + r = open_extension_release(where, IMAGE_CONFEXT, image_name, relax_extension_release_check, &os_release_path, &os_release_fd); + if (r >= 0) + class = IMAGE_CONFEXT; + } + if (r < 0) + return log_error_errno(r, "Failed to open extension release from '%s': %m", image_name); + + os_release_id = strjoina((class == IMAGE_SYSEXT) ? 
"/usr/lib" : "/etc", "/extension-release.d/extension-release.", image_name); + } else { + os_release_id = "/etc/os-release"; + r = open_os_release(where, &os_release_path, &os_release_fd); + } + if (r < 0) + log_debug_errno(r, + "Couldn't acquire %s file, ignoring: %m", + path_is_extension ? "extension-release " : "os-release"); + else { + if (socket_fd >= 0) { + struct iovec iov[] = { + IOVEC_MAKE_STRING(os_release_id), + IOVEC_MAKE((char *)"\0", sizeof(char)), + }; + + r = send_one_fd_iov_with_data_fd(socket_fd, iov, ELEMENTSOF(iov), os_release_fd); + if (r < 0) + return log_debug_errno(r, "Failed to send os-release file: %m"); + } + + if (ret_os_release) { + os_release = portable_metadata_new(os_release_id, NULL, NULL, os_release_fd); + if (!os_release) + return -ENOMEM; + + os_release_fd = -EBADF; + os_release->source = TAKE_PTR(os_release_path); + } + } + + /* Then, send unit file data to the parent (or/and add it to the hashmap). For that we use our usual unit + * discovery logic. Note that we force looking inside of /lib/systemd/system/ for units too, as the + * image might have a legacy split-usr layout. 
*/ + r = lookup_paths_init(&paths, RUNTIME_SCOPE_SYSTEM, LOOKUP_PATHS_SPLIT_USR, where); + if (r < 0) + return log_debug_errno(r, "Failed to acquire lookup paths: %m"); + + unit_files = hashmap_new(&portable_metadata_hash_ops); + if (!unit_files) + return -ENOMEM; + + STRV_FOREACH(i, paths.search_path) { + _cleanup_free_ char *resolved = NULL; + _cleanup_closedir_ DIR *d = NULL; + + r = chase_and_opendir(*i, where, 0, &resolved, &d); + if (r < 0) { + log_debug_errno(r, "Failed to open unit path '%s', ignoring: %m", *i); + continue; + } + + FOREACH_DIRENT(de, d, return log_debug_errno(errno, "Failed to read directory: %m")) { + _cleanup_(portable_metadata_unrefp) PortableMetadata *m = NULL; + _cleanup_(mac_selinux_freep) char *con = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + + if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY)) + continue; + + if (!unit_match(de->d_name, matches)) + continue; + + /* Filter out duplicates */ + if (hashmap_get(unit_files, de->d_name)) + continue; + + if (!IN_SET(de->d_type, DT_LNK, DT_REG)) + continue; + + fd = openat(dirfd(d), de->d_name, O_CLOEXEC|O_RDONLY); + if (fd < 0) { + log_debug_errno(errno, "Failed to open unit file '%s', ignoring: %m", de->d_name); + continue; + } + + /* Reject empty files, just in case */ + if (fstat(fd, &st) < 0) { + log_debug_errno(errno, "Failed to stat unit file '%s', ignoring: %m", de->d_name); + continue; + } + + if (st.st_size <= 0) { + log_debug("Unit file '%s' is empty, ignoring.", de->d_name); + continue; + } + +#if HAVE_SELINUX + /* The units will be copied on the host's filesystem, so if they had a SELinux label + * we have to preserve it. Copy it out so that it can be applied later. 
*/ + + r = fgetfilecon_raw(fd, &con); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(errno)) + log_debug_errno(errno, "Failed to get SELinux file context from '%s', ignoring: %m", de->d_name); +#endif + + if (socket_fd >= 0) { + struct iovec iov[] = { + IOVEC_MAKE_STRING(de->d_name), + IOVEC_MAKE((char *)"\0", sizeof(char)), + IOVEC_MAKE_STRING(strempty(con)), + }; + + r = send_one_fd_iov_with_data_fd(socket_fd, iov, ELEMENTSOF(iov), fd); + if (r < 0) + return log_debug_errno(r, "Failed to send unit metadata to parent: %m"); + } + + m = portable_metadata_new(de->d_name, where, con, fd); + if (!m) + return -ENOMEM; + fd = -EBADF; + + m->source = path_join(resolved, de->d_name); + if (!m->source) + return -ENOMEM; + + r = hashmap_put(unit_files, m->name, m); + if (r < 0) + return log_debug_errno(r, "Failed to add unit to hashmap: %m"); + m = NULL; + } + } + + if (ret_os_release) + *ret_os_release = TAKE_PTR(os_release); + if (ret_unit_files) + *ret_unit_files = TAKE_PTR(unit_files); + + return 0; +} + +static int portable_extract_by_path( + const char *path, + bool path_is_extension, + bool relax_extension_release_check, + char **matches, + const ImagePolicy *image_policy, + PortableMetadata **ret_os_release, + Hashmap **ret_unit_files, + sd_bus_error *error) { + + _cleanup_hashmap_free_ Hashmap *unit_files = NULL; + _cleanup_(portable_metadata_unrefp) PortableMetadata* os_release = NULL; + _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; + int r; + + assert(path); + + r = loop_device_make_by_path(path, O_RDONLY, /* sector_size= */ UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &d); + if (r == -EISDIR) { + _cleanup_free_ char *image_name = NULL; + + /* We can't turn this into a loop-back block device, and this returns EISDIR? Then this is a directory + * tree and not a raw device. It's easy then. 
*/ + + r = path_extract_filename(path, &image_name); + if (r < 0) + return log_error_errno(r, "Failed to extract image name from path '%s': %m", path); + + r = extract_now(path, matches, image_name, path_is_extension, /* relax_extension_release_check= */ false, -1, &os_release, &unit_files); + if (r < 0) + return r; + + } else if (r < 0) + return log_debug_errno(r, "Failed to set up loopback device for %s: %m", path); + else { + _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; + _cleanup_(rmdir_and_freep) char *tmpdir = NULL; + _cleanup_close_pair_ int seq[2] = EBADF_PAIR; + _cleanup_(sigkill_waitp) pid_t child = 0; + + /* We now have a loopback block device, let's fork off a child in its own mount namespace, mount it + * there, and extract the metadata we need. The metadata is sent from the child back to us. */ + + BLOCK_SIGNALS(SIGCHLD); + + r = mkdtemp_malloc("/tmp/inspect-XXXXXX", &tmpdir); + if (r < 0) + return log_debug_errno(r, "Failed to create temporary directory: %m"); + + r = dissect_loop_device( + d, + /* verity= */ NULL, + /* mount_options= */ NULL, + image_policy, + DISSECT_IMAGE_READ_ONLY | + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES, + &m); + if (r == -ENOPKG) + sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Couldn't identify a suitable partition table or file system in '%s'.", path); + else if (r == -EADDRNOTAVAIL) + sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "No root partition for specified root hash found in '%s'.", path); + else if (r == -ENOTUNIQ) + sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Multiple suitable root partitions found in image '%s'.", path); + else if (r == -ENXIO) + sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "No suitable root partition found in image '%s'.", path); + else if (r == 
-EPROTONOSUPPORT) + sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Device '%s' is loopback block device with partition scanning turned off, please turn it on.", path); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, seq) < 0) + return log_debug_errno(errno, "Failed to allocated SOCK_SEQPACKET socket: %m"); + + r = safe_fork("(sd-dissect)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_LOG, &child); + if (r < 0) + return r; + if (r == 0) { + DissectImageFlags flags = DISSECT_IMAGE_READ_ONLY; + + seq[0] = safe_close(seq[0]); + + if (path_is_extension) + flags |= DISSECT_IMAGE_VALIDATE_OS_EXT | (relax_extension_release_check ? DISSECT_IMAGE_RELAX_EXTENSION_CHECK : 0); + else + flags |= DISSECT_IMAGE_VALIDATE_OS; + + r = dissected_image_mount( + m, + tmpdir, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + flags); + if (r < 0) { + log_debug_errno(r, "Failed to mount dissected image: %m"); + goto child_finish; + } + + r = extract_now(tmpdir, matches, m->image_name, path_is_extension, relax_extension_release_check, seq[1], NULL, NULL); + + child_finish: + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + seq[1] = safe_close(seq[1]); + + unit_files = hashmap_new(&portable_metadata_hash_ops); + if (!unit_files) + return -ENOMEM; + + for (;;) { + _cleanup_(portable_metadata_unrefp) PortableMetadata *add = NULL; + _cleanup_close_ int fd = -EBADF; + /* We use NAME_MAX space for the SELinux label here. The kernel currently enforces no limit, but + * according to suggestions from the SELinux people this will change and it will probably be + * identical to NAME_MAX. For now we use that, but this should be updated one day when the final + * limit is known. 
*/ + char iov_buffer[PATH_MAX + NAME_MAX + 2]; + struct iovec iov = IOVEC_MAKE(iov_buffer, sizeof(iov_buffer)); + + ssize_t n = receive_one_fd_iov(seq[0], &iov, 1, 0, &fd); + if (n == -EIO) + break; + if (n < 0) + return log_debug_errno(n, "Failed to receive item: %m"); + iov_buffer[n] = 0; + + /* We can't really distinguish a zero-length datagram without any fds from EOF (both are signalled the + * same way by recvmsg()). Hence, accept either as end notification. */ + if (isempty(iov_buffer) && fd < 0) + break; + + if (isempty(iov_buffer) || fd < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid item sent from child."); + + /* Given recvmsg cannot be used with multiple io vectors if you don't know the size in advance, + * use a marker to separate the name and the optional SELinux context. */ + char *selinux_label = memchr(iov_buffer, 0, n); + assert(selinux_label); + selinux_label++; + + add = portable_metadata_new(iov_buffer, path, selinux_label, fd); + if (!add) + return -ENOMEM; + fd = -EBADF; + + /* Note that we do not initialize 'add->source' here, as the source path is not usable here as + * it refers to a path only valid in the short-living namespaced child process we forked + * here. */ + + if (PORTABLE_METADATA_IS_UNIT(add)) { + r = hashmap_put(unit_files, add->name, add); + if (r < 0) + return log_debug_errno(r, "Failed to add item to unit file list: %m"); + + add = NULL; + + } else if (PORTABLE_METADATA_IS_OS_RELEASE(add) || PORTABLE_METADATA_IS_EXTENSION_RELEASE(add)) { + + assert(!os_release); + os_release = TAKE_PTR(add); + } else + assert_not_reached(); + } + + r = wait_for_terminate_and_check("(sd-dissect)", child, 0); + if (r < 0) + return r; + child = 0; + } + + if (!os_release) + return sd_bus_error_setf(error, + SD_BUS_ERROR_INVALID_ARGS, + "Image '%s' lacks %s data, refusing.", + path, + path_is_extension ? 
"extension-release" : "os-release"); + + if (ret_unit_files) + *ret_unit_files = TAKE_PTR(unit_files); + + if (ret_os_release) + *ret_os_release = TAKE_PTR(os_release); + + return 0; +} + +static int extract_image_and_extensions( + const char *name_or_path, + char **matches, + char **extension_image_paths, + bool validate_extension, + bool relax_extension_release_check, + const ImagePolicy *image_policy, + Image **ret_image, + OrderedHashmap **ret_extension_images, + OrderedHashmap **ret_extension_releases, + PortableMetadata **ret_os_release, + Hashmap **ret_unit_files, + char ***ret_valid_prefixes, + sd_bus_error *error) { + + _cleanup_free_ char *id = NULL, *version_id = NULL, *sysext_level = NULL, *confext_level = NULL; + _cleanup_(portable_metadata_unrefp) PortableMetadata *os_release = NULL; + _cleanup_ordered_hashmap_free_ OrderedHashmap *extension_images = NULL, *extension_releases = NULL; + _cleanup_hashmap_free_ Hashmap *unit_files = NULL; + _cleanup_strv_free_ char **valid_prefixes = NULL; + _cleanup_(image_unrefp) Image *image = NULL; + Image *ext; + int r; + + assert(name_or_path); + + r = image_find_harder(IMAGE_PORTABLE, name_or_path, NULL, &image); + if (r < 0) + return r; + + if (!strv_isempty(extension_image_paths)) { + extension_images = ordered_hashmap_new(&image_hash_ops); + if (!extension_images) + return -ENOMEM; + + if (ret_extension_releases) { + extension_releases = ordered_hashmap_new(&portable_metadata_hash_ops); + if (!extension_releases) + return -ENOMEM; + } + + STRV_FOREACH(p, extension_image_paths) { + _cleanup_(image_unrefp) Image *new = NULL; + + r = image_find_harder(IMAGE_PORTABLE, *p, NULL, &new); + if (r < 0) + return r; + + r = ordered_hashmap_put(extension_images, new->name, new); + if (r < 0) + return r; + TAKE_PTR(new); + } + } + + r = portable_extract_by_path( + image->path, + /* path_is_extension= */ false, + /* relax_extension_release_check= */ false, + matches, + image_policy, + &os_release, + &unit_files, + error); + 
if (r < 0) + return r; + + /* If we are layering extension images on top of a runtime image, check that the os-release and + * extension-release metadata match, otherwise reject it immediately as invalid, or it will fail when + * the units are started. Also, collect valid portable prefixes if caller requested that. */ + if (validate_extension || ret_valid_prefixes) { + _cleanup_free_ char *prefixes = NULL; + + r = parse_env_file_fd(os_release->fd, os_release->name, + "ID", &id, + "VERSION_ID", &version_id, + "SYSEXT_LEVEL", &sysext_level, + "CONFEXT_LEVEL", &confext_level, + "PORTABLE_PREFIXES", &prefixes); + if (r < 0) + return r; + if (isempty(id)) + return sd_bus_error_set_errnof(error, SYNTHETIC_ERRNO(ESTALE), "Image %s os-release metadata lacks the ID field", name_or_path); + + if (prefixes) { + valid_prefixes = strv_split(prefixes, WHITESPACE); + if (!valid_prefixes) + return -ENOMEM; + } + } + + ORDERED_HASHMAP_FOREACH(ext, extension_images) { + _cleanup_(portable_metadata_unrefp) PortableMetadata *extension_release_meta = NULL; + _cleanup_hashmap_free_ Hashmap *extra_unit_files = NULL; + _cleanup_strv_free_ char **extension_release = NULL; + const char *e; + + r = portable_extract_by_path( + ext->path, + /* path_is_extension= */ true, + relax_extension_release_check, + matches, + image_policy, + &extension_release_meta, + &extra_unit_files, + error); + if (r < 0) + return r; + + r = hashmap_move(unit_files, extra_unit_files); + if (r < 0) + return r; + + if (!validate_extension && !ret_valid_prefixes && !ret_extension_releases) + continue; + + r = load_env_file_pairs_fd(extension_release_meta->fd, extension_release_meta->name, &extension_release); + if (r < 0) + return r; + + if (validate_extension) { + r = extension_release_validate(ext->path, id, version_id, sysext_level, "portable", extension_release, IMAGE_SYSEXT); + if (r < 0) + r = extension_release_validate(ext->path, id, version_id, confext_level, "portable", extension_release, IMAGE_CONFEXT); + + 
if (r == 0) + return sd_bus_error_set_errnof(error, SYNTHETIC_ERRNO(ESTALE), "Image %s extension-release metadata does not match the root's", ext->path); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to compare image %s extension-release metadata with the root's os-release: %m", ext->path); + } + + e = strv_env_pairs_get(extension_release, "PORTABLE_PREFIXES"); + if (e) { + _cleanup_strv_free_ char **l = NULL; + + l = strv_split(e, WHITESPACE); + if (!l) + return -ENOMEM; + + r = strv_extend_strv(&valid_prefixes, l, true); + if (r < 0) + return r; + } + + if (ret_extension_releases) { + r = ordered_hashmap_put(extension_releases, ext->name, extension_release_meta); + if (r < 0) + return r; + TAKE_PTR(extension_release_meta); + } + } + + strv_sort(valid_prefixes); + + if (ret_image) + *ret_image = TAKE_PTR(image); + if (ret_extension_images) + *ret_extension_images = TAKE_PTR(extension_images); + if (ret_extension_releases) + *ret_extension_releases = TAKE_PTR(extension_releases); + if (ret_os_release) + *ret_os_release = TAKE_PTR(os_release); + if (ret_unit_files) + *ret_unit_files = TAKE_PTR(unit_files); + if (ret_valid_prefixes) + *ret_valid_prefixes = TAKE_PTR(valid_prefixes); + + return 0; +} + +int portable_extract( + const char *name_or_path, + char **matches, + char **extension_image_paths, + const ImagePolicy *image_policy, + PortableFlags flags, + PortableMetadata **ret_os_release, + OrderedHashmap **ret_extension_releases, + Hashmap **ret_unit_files, + char ***ret_valid_prefixes, + sd_bus_error *error) { + + _cleanup_(portable_metadata_unrefp) PortableMetadata *os_release = NULL; + _cleanup_ordered_hashmap_free_ OrderedHashmap *extension_images = NULL, *extension_releases = NULL; + _cleanup_hashmap_free_ Hashmap *unit_files = NULL; + _cleanup_strv_free_ char **valid_prefixes = NULL; + _cleanup_(image_unrefp) Image *image = NULL; + int r; + + assert(name_or_path); + + r = extract_image_and_extensions( + name_or_path, + matches, + 
extension_image_paths, + /* validate_extension= */ false, + /* relax_extension_release_check= */ FLAGS_SET(flags, PORTABLE_FORCE_EXTENSION), + image_policy, + &image, + &extension_images, + &extension_releases, + &os_release, + &unit_files, + ret_valid_prefixes ? &valid_prefixes : NULL, + error); + if (r < 0) + return r; + + if (hashmap_isempty(unit_files)) { + _cleanup_free_ char *extensions = strv_join(extension_image_paths, ", "); + if (!extensions) + return -ENOMEM; + + return sd_bus_error_setf(error, + SD_BUS_ERROR_INVALID_ARGS, + "Couldn't find any matching unit files in image '%s%s%s', refusing.", + image->path, + isempty(extensions) ? "" : "' or any of its extensions '", + isempty(extensions) ? "" : extensions); + } + + if (ret_os_release) + *ret_os_release = TAKE_PTR(os_release); + if (ret_extension_releases) + *ret_extension_releases = TAKE_PTR(extension_releases); + if (ret_unit_files) + *ret_unit_files = TAKE_PTR(unit_files); + if (ret_valid_prefixes) + *ret_valid_prefixes = TAKE_PTR(valid_prefixes); + + return 0; +} + +static int unit_file_is_active( + sd_bus *bus, + const char *name, + sd_bus_error *error) { + + static const char *const active_states[] = { + "activating", + "active", + "reloading", + "deactivating", + NULL, + }; + int r; + + if (!bus) + return false; + + /* If we are looking at a plain or instance things are easy, we can just query the state */ + if (unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + _cleanup_free_ char *path = NULL, *buf = NULL; + + path = unit_dbus_path_from_name(name); + if (!path) + return -ENOMEM; + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "ActiveState", + error, + &buf); + if (r < 0) + return log_debug_errno(r, "Failed to retrieve unit state: %s", bus_error_message(error, r)); + + return strv_contains((char**) active_states, buf); + } + + /* Otherwise we need to enumerate. 
But let's build the most restricted query we can */ + if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + const char *at, *prefix, *joined; + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "ListUnitsByPatterns"); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(m, (char**) active_states); + if (r < 0) + return r; + + at = strchr(name, '@'); + assert(at); + + prefix = strndupa_safe(name, at + 1 - name); + joined = strjoina(prefix, "*", at + 1); + + r = sd_bus_message_append_strv(m, STRV_MAKE(joined)); + if (r < 0) + return r; + + r = sd_bus_call(bus, m, 0, error, &reply); + if (r < 0) + return log_debug_errno(r, "Failed to list units: %s", bus_error_message(error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_STRUCT, "ssssssouso"); + if (r < 0) + return r; + + return r > 0; + } + + return -EINVAL; +} + +static int portable_changes_add( + PortableChange **changes, + size_t *n_changes, + int type_or_errno, /* PORTABLE_COPY, PORTABLE_SYMLINK, … if positive, or errno if negative */ + const char *path, + const char *source) { /* Appends one entry to the *changes array, which is grown by one element with reallocarray(), and increments *n_changes. The entry stores 'type_or_errno' plus the strings that path_simplify_alloc() returns for 'path' and 'source'. Does nothing and returns 0 when 'changes' is NULL; returns -ENOMEM or the path_simplify_alloc() error on failure. */ + + _cleanup_free_ char *p = NULL, *s = NULL; + PortableChange *c; + int r; + + assert(path); + assert(!changes == !n_changes); + + if (type_or_errno >= 0) + assert(type_or_errno < _PORTABLE_CHANGE_TYPE_MAX); + else + assert(type_or_errno >= -ERRNO_MAX); + + if (!changes) + return 0; + + c = reallocarray(*changes, *n_changes + 1, sizeof(PortableChange)); + if (!c) + return -ENOMEM; + *changes = c; + + r = path_simplify_alloc(path, &p); + if (r < 0) + return r; + + r = path_simplify_alloc(source, &s); + if (r < 0) + return r; + + c[(*n_changes)++] = (PortableChange) { + .type_or_errno = type_or_errno, + .path = TAKE_PTR(p), + .source = TAKE_PTR(s), + }; + + return 0; +} + +static int portable_changes_add_with_prefix( + 
PortableChange **changes, + size_t *n_changes, + int type_or_errno, + const char *prefix, + const char *path, + const char *source) { + + _cleanup_free_ char *path_buf = NULL, *source_buf = NULL; + + assert(path); + assert(!changes == !n_changes); + + if (!changes) + return 0; + + if (prefix) { + path_buf = path_join(prefix, path); + if (!path_buf) + return -ENOMEM; + + path = path_buf; + + if (source) { + source_buf = path_join(prefix, source); + if (!source_buf) + return -ENOMEM; + + source = source_buf; + } + } + + return portable_changes_add(changes, n_changes, type_or_errno, path, source); +} + +void portable_changes_free(PortableChange *changes, size_t n_changes) { /* Frees the path and source strings of each of the first 'n_changes' entries, then the array itself. 'changes' may be NULL only if 'n_changes' is 0. */ + size_t i; + + assert(changes || n_changes == 0); + + for (i = 0; i < n_changes; i++) { + free(changes[i].path); + free(changes[i].source); + } + + free(changes); +} + +static const char *root_setting_from_image(ImageType type) { /* Returns "RootDirectory=" for directory and subvolume images, "RootImage=" for every other image type. */ + return IN_SET(type, IMAGE_DIRECTORY, IMAGE_SUBVOLUME) ? "RootDirectory=" : "RootImage="; +} + +static const char *extension_setting_from_image(ImageType type) { /* Returns "ExtensionDirectories=" for directory and subvolume images, "ExtensionImages=" for every other image type. */ + return IN_SET(type, IMAGE_DIRECTORY, IMAGE_SUBVOLUME) ? "ExtensionDirectories=" : "ExtensionImages="; +} + +static int make_marker_text(const char *image_path, OrderedHashmap *extension_images, char **ret_text) { + _cleanup_free_ char *text = NULL, *escaped_image_path = NULL; + Image *ext; + + assert(image_path); + assert(ret_text); + + escaped_image_path = xescape(image_path, ":"); + if (!escaped_image_path) + return -ENOMEM; + + /* If the image is layered, include all layers in the marker as a colon-separated + * list of paths, so that we can do exact matches on removal. 
*/ + text = strjoin(PORTABLE_DROPIN_MARKER_BEGIN, escaped_image_path); + if (!text) + return -ENOMEM; + + ORDERED_HASHMAP_FOREACH(ext, extension_images) { + _cleanup_free_ char *escaped = NULL; + + escaped = xescape(ext->path, ":"); + if (!escaped) + return -ENOMEM; + + if (!strextend(&text, ":", escaped)) + return -ENOMEM; + } + + if (!strextend(&text, PORTABLE_DROPIN_MARKER_END "\n")) + return -ENOMEM; + + *ret_text = TAKE_PTR(text); + return 0; +} + +static int append_release_log_fields( + char **text, + const PortableMetadata *release, + ImageClass type, + const char *field_name) { + + static const char *const field_versions[_IMAGE_CLASS_MAX][4]= { + [IMAGE_PORTABLE] = { "IMAGE_VERSION", "VERSION_ID", "BUILD_ID", NULL }, + [IMAGE_SYSEXT] = { "SYSEXT_IMAGE_VERSION", "SYSEXT_VERSION_ID", "SYSEXT_BUILD_ID", NULL }, + [IMAGE_CONFEXT] = { "CONFEXT_IMAGE_VERSION", "CONFEXT_VERSION_ID", "CONFEXT_BUILD_ID", NULL }, + }; + static const char *const field_ids[_IMAGE_CLASS_MAX][3]= { + [IMAGE_PORTABLE] = { "IMAGE_ID", "ID", NULL }, + [IMAGE_SYSEXT] = { "SYSEXT_IMAGE_ID", "SYSEXT_ID", NULL }, + [IMAGE_CONFEXT] = { "CONFEXT_IMAGE_ID", "CONFEXT_ID", NULL }, + }; + _cleanup_strv_free_ char **fields = NULL; + const char *id = NULL, *version = NULL; + int r; + + assert(IN_SET(type, IMAGE_PORTABLE, IMAGE_SYSEXT, IMAGE_CONFEXT)); + assert(!strv_isempty((char *const *)field_ids[type])); + assert(!strv_isempty((char *const *)field_versions[type])); + assert(field_name); + assert(text); + + if (!release) + return 0; /* Nothing to do. 
*/ + + r = load_env_file_pairs_fd(release->fd, release->name, &fields); + if (r < 0) + return log_debug_errno(r, "Failed to parse '%s': %m", release->name); + + /* Find an ID first, in order of preference from more specific to less specific: IMAGE_ID -> ID */ + id = strv_find_first_field((char *const *)field_ids[type], fields); + + /* Then the version, same logic, prefer the more specific one */ + version = strv_find_first_field((char *const *)field_versions[type], fields); + + /* If there's no valid version to be found, simply omit it. */ + if (!id && !version) + return 0; + + if (!strextend(text, + "LogExtraFields=", + field_name, + "=", + strempty(id), + id && version ? "_" : "", + strempty(version), + "\n")) + return -ENOMEM; + + return 0; +} + +static int install_chroot_dropin( + const char *image_path, + ImageType type, + OrderedHashmap *extension_images, + OrderedHashmap *extension_releases, + const PortableMetadata *m, + const PortableMetadata *os_release, + const char *dropin_dir, + PortableFlags flags, + char **ret_dropin, + PortableChange **changes, + size_t *n_changes) { + + _cleanup_free_ char *text = NULL, *dropin = NULL; + int r; + + assert(image_path); + assert(m); + assert(dropin_dir); + + dropin = path_join(dropin_dir, "20-portable.conf"); + if (!dropin) + return -ENOMEM; + + r = make_marker_text(image_path, extension_images, &text); + if (r < 0) + return log_debug_errno(r, "Failed to generate marker string for portable drop-in: %m"); + + if (endswith(m->name, ".service")) { + const char *root_type; + _cleanup_free_ char *base_name = NULL; + Image *ext; + + root_type = root_setting_from_image(type); + + r = path_extract_filename(m->image_path ?: image_path, &base_name); + if (r < 0) + return log_debug_errno(r, "Failed to extract basename from '%s': %m", m->image_path ?: image_path); + + if (!strextend(&text, + "\n" + "[Service]\n", + root_type, image_path, "\n" + "Environment=PORTABLE=", base_name, "\n" + "LogExtraFields=PORTABLE=", base_name, 
"\n")) + return -ENOMEM; + + /* If we have a single image then PORTABLE= will point to it, so we add + * PORTABLE_NAME_AND_VERSION= with the os-release fields and we are done. But if we have + * extensions, PORTABLE= will point to the image where the current unit was found in. So we + * also list PORTABLE_ROOT= and PORTABLE_ROOT_NAME_AND_VERSION= for the base image, and + * PORTABLE_EXTENSION= and PORTABLE_EXTENSION_NAME_AND_VERSION= for each extension, so that + * all needed metadata is available. */ + if (ordered_hashmap_isempty(extension_images)) + r = append_release_log_fields(&text, os_release, IMAGE_PORTABLE, "PORTABLE_NAME_AND_VERSION"); + else { + _cleanup_free_ char *root_base_name = NULL; + + r = path_extract_filename(image_path, &root_base_name); + if (r < 0) + return log_debug_errno(r, "Failed to extract basename from '%s': %m", image_path); + + if (!strextend(&text, + "Environment=PORTABLE_ROOT=", root_base_name, "\n", + "LogExtraFields=PORTABLE_ROOT=", root_base_name, "\n")) + return -ENOMEM; + + r = append_release_log_fields(&text, os_release, IMAGE_PORTABLE, "PORTABLE_ROOT_NAME_AND_VERSION"); + } + if (r < 0) + return r; + + if (m->image_path && !path_equal(m->image_path, image_path)) + ORDERED_HASHMAP_FOREACH(ext, extension_images) { + _cleanup_free_ char *extension_base_name = NULL; + + r = path_extract_filename(ext->path, &extension_base_name); + if (r < 0) + return log_debug_errno(r, "Failed to extract basename from '%s': %m", ext->path); + + if (!strextend(&text, + "\n", + extension_setting_from_image(ext->type), + ext->path, + /* With --force tell PID1 to avoid enforcing that the image and + * extension-release. have to match. */ + !IN_SET(type, IMAGE_DIRECTORY, IMAGE_SUBVOLUME) && + FLAGS_SET(flags, PORTABLE_FORCE_EXTENSION) ? 
+ ":x-systemd.relax-extension-release-check\n" : + "\n", + /* In PORTABLE= we list the 'main' image name for this unit + * (the image where the unit was extracted from), but we are + * stacking multiple images, so list those too. */ + "LogExtraFields=PORTABLE_EXTENSION=", extension_base_name, "\n")) + return -ENOMEM; + + /* Look for image/version identifiers in the extension release files. We + * look for all possible IDs, but typically only 1 or 2 will be set, so + * the number of fields added shouldn't be too large. We prefix the DDI + * name to the value, so that we can add the same field multiple times and + * still be able to identify what applies to what. */ + r = append_release_log_fields(&text, + ordered_hashmap_get(extension_releases, ext->name), + IMAGE_SYSEXT, + "PORTABLE_EXTENSION_NAME_AND_VERSION"); + if (r < 0) + return r; + + r = append_release_log_fields(&text, + ordered_hashmap_get(extension_releases, ext->name), + IMAGE_CONFEXT, + "PORTABLE_EXTENSION_NAME_AND_VERSION"); + if (r < 0) + return r; + } + } + + r = write_string_file(dropin, text, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_SYNC); + if (r < 0) + return log_debug_errno(r, "Failed to write '%s': %m", dropin); + + (void) portable_changes_add(changes, n_changes, PORTABLE_WRITE, dropin, NULL); + + if (ret_dropin) + *ret_dropin = TAKE_PTR(dropin); + + return 0; +} + +static int install_profile_dropin( + const char *image_path, + const PortableMetadata *m, + const char *dropin_dir, + const char *profile, + PortableFlags flags, + char **ret_dropin, + PortableChange **changes, + size_t *n_changes) { + + _cleanup_free_ char *dropin = NULL, *from = NULL; + int r; + + assert(image_path); + assert(m); + assert(dropin_dir); + + if (!profile) + return 0; + + r = find_portable_profile(profile, m->name, &from); + if (r < 0) { + if (r != -ENOENT) + return log_debug_errno(errno, "Profile '%s' is not accessible: %m", profile); + + log_debug_errno(errno, "Skipping link to profile 
'%s', as it does not exist: %m", profile); + return 0; + } + + dropin = path_join(dropin_dir, "10-profile.conf"); + if (!dropin) + return -ENOMEM; + + if (flags & PORTABLE_PREFER_COPY) { + + r = copy_file_atomic(from, dropin, 0644, COPY_REFLINK|COPY_FSYNC); + if (r < 0) + return log_debug_errno(r, "Failed to copy %s %s %s: %m", from, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), dropin); + + (void) portable_changes_add(changes, n_changes, PORTABLE_COPY, dropin, from); + + } else { + + if (symlink(from, dropin) < 0) + return log_debug_errno(errno, "Failed to link %s %s %s: %m", from, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), dropin); + + (void) portable_changes_add(changes, n_changes, PORTABLE_SYMLINK, dropin, from); + } + + if (ret_dropin) + *ret_dropin = TAKE_PTR(dropin); + + return 0; +} + +static const char *attached_path(const LookupPaths *paths, PortableFlags flags) { + const char *where; + + assert(paths); + + if (flags & PORTABLE_RUNTIME) + where = paths->runtime_attached; + else + where = paths->persistent_attached; + + assert(where); + return where; +} + +static int attach_unit_file( + const LookupPaths *paths, + const char *image_path, + ImageType type, + OrderedHashmap *extension_images, + OrderedHashmap *extension_releases, + const PortableMetadata *m, + const PortableMetadata *os_release, + const char *profile, + PortableFlags flags, + PortableChange **changes, + size_t *n_changes) { + + _cleanup_(unlink_and_freep) char *chroot_dropin = NULL, *profile_dropin = NULL; + _cleanup_(rmdir_and_freep) char *dropin_dir = NULL; + _cleanup_free_ char *path = NULL; + const char *where; + int r; + + assert(paths); + assert(image_path); + assert(m); + assert(PORTABLE_METADATA_IS_UNIT(m)); + + where = attached_path(paths, flags); + + (void) mkdir_parents(where, 0755); + if (mkdir(where, 0755) < 0) { + if (errno != EEXIST) + return log_debug_errno(errno, "Failed to create attach directory %s: %m", where); + } else + (void) portable_changes_add(changes, n_changes, 
PORTABLE_MKDIR, where, NULL); + + path = path_join(where, m->name); + if (!path) + return -ENOMEM; + + dropin_dir = strjoin(path, ".d"); + if (!dropin_dir) + return -ENOMEM; + + if (mkdir(dropin_dir, 0755) < 0) { + if (errno != EEXIST) + return log_debug_errno(errno, "Failed to create drop-in directory %s: %m", dropin_dir); + } else + (void) portable_changes_add(changes, n_changes, PORTABLE_MKDIR, dropin_dir, NULL); + + /* We install the drop-ins first, and the actual unit file last to achieve somewhat atomic behaviour if PID 1 + * is reloaded while we are creating things here: as long as only the drop-ins exist the unit doesn't exist at + * all for PID 1. */ + + r = install_chroot_dropin(image_path, type, extension_images, extension_releases, m, os_release, dropin_dir, flags, &chroot_dropin, changes, n_changes); + if (r < 0) + return r; + + r = install_profile_dropin(image_path, m, dropin_dir, profile, flags, &profile_dropin, changes, n_changes); + if (r < 0) + return r; + + if ((flags & PORTABLE_PREFER_SYMLINK) && m->source) { + + if (symlink(m->source, path) < 0) + return log_debug_errno(errno, "Failed to symlink unit file '%s': %m", path); + + (void) portable_changes_add(changes, n_changes, PORTABLE_SYMLINK, path, m->source); + + } else { + _cleanup_(unlink_and_freep) char *tmp = NULL; + _cleanup_close_ int fd = -EBADF; + + (void) mac_selinux_create_file_prepare_label(path, m->selinux_label); + + fd = open_tmpfile_linkable(path, O_WRONLY|O_CLOEXEC, &tmp); + mac_selinux_create_file_clear(); /* Clear immediately in case of errors */ + if (fd < 0) + return log_debug_errno(fd, "Failed to create unit file '%s': %m", path); + + r = copy_bytes(m->fd, fd, UINT64_MAX, COPY_REFLINK); + if (r < 0) + return log_debug_errno(r, "Failed to copy unit file '%s': %m", path); + + if (fchmod(fd, 0644) < 0) + return log_debug_errno(errno, "Failed to change unit file access mode for '%s': %m", path); + + r = link_tmpfile(fd, tmp, path, LINK_TMPFILE_SYNC); + if (r < 0) + return 
log_debug_errno(r, "Failed to install unit file '%s': %m", path); + + tmp = mfree(tmp); + + (void) portable_changes_add(changes, n_changes, PORTABLE_COPY, path, m->source); + } + + /* All is established now, now let's disable any rollbacks */ + chroot_dropin = mfree(chroot_dropin); + profile_dropin = mfree(profile_dropin); + dropin_dir = mfree(dropin_dir); + + return 0; +} + +static int image_symlink( + const char *image_path, + PortableFlags flags, + char **ret) { + + const char *fn, *where; + char *joined = NULL; + + assert(image_path); + assert(ret); + + fn = last_path_component(image_path); + + if (flags & PORTABLE_RUNTIME) + where = "/run/portables/"; + else + where = "/etc/portables/"; + + joined = strjoin(where, fn); + if (!joined) + return -ENOMEM; + + *ret = joined; + return 0; +} + +static int install_image_symlink( + const char *image_path, + PortableFlags flags, + PortableChange **changes, + size_t *n_changes) { + + _cleanup_free_ char *sl = NULL; + int r; + + assert(image_path); + + /* If the image is outside of the image search also link it into it, so that it can be found with short image + * names and is listed among the images. 
*/ + + if (image_in_search_path(IMAGE_PORTABLE, NULL, image_path)) + return 0; + + r = image_symlink(image_path, flags, &sl); + if (r < 0) + return log_debug_errno(r, "Failed to generate image symlink path: %m"); + + (void) mkdir_parents(sl, 0755); + + if (symlink(image_path, sl) < 0) + return log_debug_errno(errno, "Failed to link %s %s %s: %m", image_path, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), sl); + + (void) portable_changes_add(changes, n_changes, PORTABLE_SYMLINK, sl, image_path); + return 0; +} + +static int install_image_and_extensions_symlinks( + const Image *image, + OrderedHashmap *extension_images, + PortableFlags flags, + PortableChange **changes, + size_t *n_changes) { + + Image *ext; + int r; + + assert(image); + + ORDERED_HASHMAP_FOREACH(ext, extension_images) { + r = install_image_symlink(ext->path, flags, changes, n_changes); + if (r < 0) + return r; + } + + r = install_image_symlink(image->path, flags, changes, n_changes); + if (r < 0) + return r; + + return 0; +} + +static bool prefix_matches_compatible(char **matches, char **valid_prefixes) { + /* Checks if all 'matches' are included in the list of 'valid_prefixes' */ + + STRV_FOREACH(m, matches) + if (!strv_contains(valid_prefixes, *m)) + return false; + + return true; +} + +static void log_portable_verb( + const char *verb, + const char *message_id, + const char *image_path, + OrderedHashmap *extension_images, + char **extension_image_paths, + PortableFlags flags) { + + _cleanup_free_ char *root_base_name = NULL, *extensions_joined = NULL; + _cleanup_strv_free_ char **extension_base_names = NULL; + Image *ext; + int r; + + assert(verb); + assert(message_id); + assert(image_path); + assert(!extension_images || !extension_image_paths); + + /* Use the same structured metadata as it is attached to units via LogExtraFields=. The main image + * is logged as PORTABLE_ROOT= and extensions, if any, as individual PORTABLE_EXTENSION= fields. 
*/ + + r = path_extract_filename(image_path, &root_base_name); + if (r < 0) + log_debug_errno(r, "Failed to extract basename from '%s', ignoring: %m", image_path); + + ORDERED_HASHMAP_FOREACH(ext, extension_images) { + _cleanup_free_ char *extension_base_name = NULL; + + r = path_extract_filename(ext->path, &extension_base_name); + if (r < 0) { + log_debug_errno(r, "Failed to extract basename from '%s', ignoring: %m", ext->path); + continue; + } + + r = strv_extendf(&extension_base_names, "PORTABLE_EXTENSION=%s", extension_base_name); + if (r < 0) + log_oom_debug(); + + if (!strextend_with_separator(&extensions_joined, ", ", ext->path)) + log_oom_debug(); + } + + STRV_FOREACH(e, extension_image_paths) { + _cleanup_free_ char *extension_base_name = NULL; + + r = path_extract_filename(*e, &extension_base_name); + if (r < 0) { + log_debug_errno(r, "Failed to extract basename from '%s', ignoring: %m", *e); + continue; + } + + r = strv_extendf(&extension_base_names, "PORTABLE_EXTENSION=%s", extension_base_name); + if (r < 0) + log_oom_debug(); + + if (!strextend_with_separator(&extensions_joined, ", ", *e)) + log_oom_debug(); + } + + LOG_CONTEXT_PUSH_STRV(extension_base_names); + + log_struct(LOG_INFO, + LOG_MESSAGE("Successfully %s%s '%s%s%s'", + verb, + FLAGS_SET(flags, PORTABLE_RUNTIME) ? " ephemeral" : "", + image_path, + isempty(extensions_joined) ? 
"" : "' and its extension(s) '", + strempty(extensions_joined)), + message_id, + "PORTABLE_ROOT=%s", strna(root_base_name)); +} + +int portable_attach( + sd_bus *bus, + const char *name_or_path, + char **matches, + const char *profile, + char **extension_image_paths, + const ImagePolicy *image_policy, + PortableFlags flags, + PortableChange **changes, + size_t *n_changes, + sd_bus_error *error) { + + _cleanup_ordered_hashmap_free_ OrderedHashmap *extension_images = NULL, *extension_releases = NULL; + _cleanup_(portable_metadata_unrefp) PortableMetadata *os_release = NULL; + _cleanup_hashmap_free_ Hashmap *unit_files = NULL; + _cleanup_(lookup_paths_free) LookupPaths paths = {}; + _cleanup_strv_free_ char **valid_prefixes = NULL; + _cleanup_(image_unrefp) Image *image = NULL; + PortableMetadata *item; + int r; + + r = extract_image_and_extensions( + name_or_path, + matches, + extension_image_paths, + /* validate_extension= */ true, + /* relax_extension_release_check= */ FLAGS_SET(flags, PORTABLE_FORCE_EXTENSION), + image_policy, + &image, + &extension_images, + &extension_releases, + &os_release, + &unit_files, + &valid_prefixes, + error); + if (r < 0) + return r; + + if (valid_prefixes && !prefix_matches_compatible(matches, valid_prefixes)) { + _cleanup_free_ char *matches_joined = NULL, *extensions_joined = NULL, *valid_prefixes_joined = NULL; + + matches_joined = strv_join(matches, "', '"); + if (!matches_joined) + return -ENOMEM; + + extensions_joined = strv_join(extension_image_paths, ", "); + if (!extensions_joined) + return -ENOMEM; + + valid_prefixes_joined = strv_join(valid_prefixes, ", "); + if (!valid_prefixes_joined) + return -ENOMEM; + + return sd_bus_error_setf( + error, + SD_BUS_ERROR_INVALID_ARGS, + "Selected matches '%s' are not compatible with portable service image '%s%s%s', refusing. (Acceptable prefix matches are: %s)", + matches_joined, + image->path, + isempty(extensions_joined) ? 
"" : "' or any of its extensions '", + strempty(extensions_joined), + valid_prefixes_joined); + } + + if (hashmap_isempty(unit_files)) { + _cleanup_free_ char *extensions_joined = strv_join(extension_image_paths, ", "); + if (!extensions_joined) + return -ENOMEM; + + return sd_bus_error_setf( + error, + SD_BUS_ERROR_INVALID_ARGS, + "Couldn't find any matching unit files in image '%s%s%s', refusing.", + image->path, + isempty(extensions_joined) ? "" : "' or any of its extensions '", + strempty(extensions_joined)); + } + + r = lookup_paths_init(&paths, RUNTIME_SCOPE_SYSTEM, /* flags= */ 0, NULL); + if (r < 0) + return r; + + if (!FLAGS_SET(flags, PORTABLE_REATTACH) && !FLAGS_SET(flags, PORTABLE_FORCE_ATTACH)) + HASHMAP_FOREACH(item, unit_files) { + r = unit_file_exists(RUNTIME_SCOPE_SYSTEM, &paths, item->name); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether unit '%s' exists on the host: %m", item->name); + if (r > 0) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, "Unit file '%s' exists on the host already, refusing.", item->name); + + r = unit_file_is_active(bus, item->name, error); + if (r < 0) + return r; + if (r > 0) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, "Unit file '%s' is active already, refusing.", item->name); + } + + HASHMAP_FOREACH(item, unit_files) { + r = attach_unit_file(&paths, image->path, image->type, extension_images, extension_releases, + item, os_release, profile, flags, changes, n_changes); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to attach unit '%s': %m", item->name); + } + + /* We don't care too much for the image symlink, it's just a convenience thing, it's not necessary for proper + * operation otherwise. 
*/ + (void) install_image_and_extensions_symlinks(image, extension_images, flags, changes, n_changes); + + log_portable_verb( + "attached", + "MESSAGE_ID=" SD_MESSAGE_PORTABLE_ATTACHED_STR, + image->path, + extension_images, + /* extension_image_paths= */ NULL, + flags); + + return 0; +} + +static bool marker_matches_images(const char *marker, const char *name_or_path, char **extension_image_paths) { + _cleanup_strv_free_ char **root_and_extensions = NULL; + const char *a; + int r; + + assert(marker); + assert(name_or_path); + + /* If extensions were used when attaching, the marker will be a colon-separated + * list of images/paths. We enforce strict 1:1 matching, so that we are sure + * we are detaching exactly what was attached. + * For each image, starting with the root, we look for a token in the marker, + * and return a negative answer on any non-matching combination. */ + + root_and_extensions = strv_new(name_or_path); + if (!root_and_extensions) + return -ENOMEM; + + r = strv_extend_strv(&root_and_extensions, extension_image_paths, false); + if (r < 0) + return r; + + STRV_FOREACH(image_name_or_path, root_and_extensions) { + _cleanup_free_ char *image = NULL; + + r = extract_first_word(&marker, &image, ":", EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_debug_errno(r, "Failed to parse marker: %s", marker); + if (r == 0) + return false; + + a = last_path_component(image); + + if (image_name_is_valid(*image_name_or_path)) { + const char *e, *underscore; + + /* We shall match against an image name. In that case let's compare the last component, and optionally + * allow either a suffix of ".raw" or a series of "/". + * But allow matching on a different version of the same image, when a "_" is used as a separator. 
*/ + underscore = strchr(*image_name_or_path, '_'); + if (underscore) { + if (strneq(a, *image_name_or_path, underscore - *image_name_or_path)) + continue; + return false; + } + + e = startswith(a, *image_name_or_path); + if (!e) + return false; + + if(!(e[strspn(e, "/")] == 0 || streq(e, ".raw"))) + return false; + } else { + const char *b, *underscore; + size_t l; + + /* We shall match against a path. Let's ignore any prefix here though, as often there are many ways to + * reach the same file. However, in this mode, let's validate any file suffix. + * But also ensure that we don't fail if both components don't have a '/' at all + * (strcspn returns the full length of the string in that case, which might not + * match as the versions might differ). */ + + l = strcspn(a, "/"); + b = last_path_component(*image_name_or_path); + + if ((a[l] != '/') != !strchr(b, '/')) /* One is a directory, the other is not */ + return false; + + if (a[l] != 0 && strcspn(b, "/") != l) + return false; + + underscore = strchr(b, '_'); + if (underscore) + l = underscore - b; + else { /* Either component could be versioned */ + underscore = strchr(a, '_'); + if (underscore) + l = underscore - a; + } + + if (!strneq(a, b, l)) + return false; + } + } + + return true; +} + +static int test_chroot_dropin( + DIR *d, + const char *where, + const char *fname, + const char *name_or_path, + char **extension_image_paths, + char **ret_marker) { + + _cleanup_free_ char *line = NULL, *marker = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -EBADF; + const char *p, *e, *k; + int r; + + assert(d); + assert(where); + assert(fname); + + /* We recognize units created from portable images via the drop-in we created for them */ + + p = strjoina(fname, ".d/20-portable.conf"); + fd = openat(dirfd(d), p, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (errno == ENOENT) + return 0; + + return log_debug_errno(errno, "Failed to open %s/%s: %m", where, p); + } + + r = take_fdopen_unlocked(&fd, "r", 
&f); + if (r < 0) + return log_debug_errno(r, "Failed to convert file handle: %m"); + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_debug_errno(r, "Failed to read from %s/%s: %m", where, p); + + e = startswith(line, PORTABLE_DROPIN_MARKER_BEGIN); + if (!e) + return 0; + + k = endswith(e, PORTABLE_DROPIN_MARKER_END); + if (!k) + return 0; + + marker = strndup(e, k - e); + if (!marker) + return -ENOMEM; + + if (!name_or_path) + r = true; + else + r = marker_matches_images(marker, name_or_path, extension_image_paths); + + if (ret_marker) + *ret_marker = TAKE_PTR(marker); + + return r; +} + +int portable_detach( + sd_bus *bus, + const char *name_or_path, + char **extension_image_paths, + PortableFlags flags, + PortableChange **changes, + size_t *n_changes, + sd_bus_error *error) { + + _cleanup_(lookup_paths_free) LookupPaths paths = {}; + _cleanup_set_free_ Set *unit_files = NULL, *markers = NULL; + _cleanup_free_ char *extensions = NULL; + _cleanup_closedir_ DIR *d = NULL; + const char *where, *item; + int ret = 0; + int r; + + assert(name_or_path); + + r = lookup_paths_init(&paths, RUNTIME_SCOPE_SYSTEM, /* flags= */ 0, NULL); + if (r < 0) + return r; + + where = attached_path(&paths, flags); + + d = opendir(where); + if (!d) { + if (errno == ENOENT) + goto not_found; + + return log_debug_errno(errno, "Failed to open '%s' directory: %m", where); + } + + FOREACH_DIRENT(de, d, return log_debug_errno(errno, "Failed to enumerate '%s' directory: %m", where)) { + _cleanup_free_ char *marker = NULL, *unit_name = NULL; + const char *dot; + + /* When a portable service is enabled with "portablectl --copy=symlink --enable --now attach", + * and is disabled with "portablectl --enable --now detach", which calls DisableUnitFilesWithFlags + * DBus method, the main unit file is removed, but its drop-ins are not. Hence, here we need + * to list both main unit files and drop-in directories (without the main unit files). 
*/ + + dot = endswith(de->d_name, ".d"); + if (dot) + unit_name = strndup(de->d_name, dot - de->d_name); + else + unit_name = strdup(de->d_name); + if (!unit_name) + return -ENOMEM; + + if (!unit_name_is_valid(unit_name, UNIT_NAME_ANY)) + continue; + + /* Filter out duplicates */ + if (set_contains(unit_files, unit_name)) + continue; + + if (dot ? !IN_SET(de->d_type, DT_LNK, DT_DIR) : !IN_SET(de->d_type, DT_LNK, DT_REG)) + continue; + + r = test_chroot_dropin(d, where, unit_name, name_or_path, extension_image_paths, &marker); + if (r < 0) + return r; + if (r == 0) + continue; + + if (!FLAGS_SET(flags, PORTABLE_REATTACH) && !FLAGS_SET(flags, PORTABLE_FORCE_ATTACH)) { + r = unit_file_is_active(bus, unit_name, error); + if (r < 0) + return r; + if (r > 0) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, "Unit file '%s' is active, can't detach.", unit_name); + } + + r = set_ensure_consume(&unit_files, &string_hash_ops_free, TAKE_PTR(unit_name)); + if (r < 0) + return log_oom_debug(); + + for (const char *p = marker;;) { + _cleanup_free_ char *image = NULL; + + r = extract_first_word(&p, &image, ":", EXTRACT_UNESCAPE_SEPARATORS|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_debug_errno(r, "Failed to parse marker: %s", p); + if (r == 0) + break; + + if (path_is_absolute(image) && !image_in_search_path(IMAGE_PORTABLE, NULL, image)) { + r = set_ensure_consume(&markers, &path_hash_ops_free, TAKE_PTR(image)); + if (r < 0) + return r; + } + } + } + + if (set_isempty(unit_files)) + goto not_found; + + SET_FOREACH(item, unit_files) { + _cleanup_free_ char *md = NULL; + + if (unlinkat(dirfd(d), item, 0) < 0) { + log_debug_errno(errno, "Can't remove unit file %s/%s: %m", where, item); + + if (errno != ENOENT && ret >= 0) + ret = -errno; + } else + portable_changes_add_with_prefix(changes, n_changes, PORTABLE_UNLINK, where, item, NULL); + + FOREACH_STRING(suffix, ".d/10-profile.conf", ".d/20-portable.conf") { + _cleanup_free_ char *dropin = NULL; + + dropin = 
strjoin(item, suffix); + if (!dropin) + return -ENOMEM; + + if (unlinkat(dirfd(d), dropin, 0) < 0) { + log_debug_errno(errno, "Can't remove drop-in %s/%s: %m", where, dropin); + + if (errno != ENOENT && ret >= 0) + ret = -errno; + } else + portable_changes_add_with_prefix(changes, n_changes, PORTABLE_UNLINK, where, dropin, NULL); + } + + md = strjoin(item, ".d"); + if (!md) + return -ENOMEM; + + if (unlinkat(dirfd(d), md, AT_REMOVEDIR) < 0) { + log_debug_errno(errno, "Can't remove drop-in directory %s/%s: %m", where, md); + + if (errno != ENOENT && ret >= 0) + ret = -errno; + } else + portable_changes_add_with_prefix(changes, n_changes, PORTABLE_UNLINK, where, md, NULL); + } + + /* Now, also drop any image symlink, for images outside of the search path */ + SET_FOREACH(item, markers) { + _cleanup_free_ char *sl = NULL; + struct stat st; + + r = image_symlink(item, flags, &sl); + if (r < 0) { + log_debug_errno(r, "Failed to determine image symlink for '%s', ignoring: %m", item); + continue; + } + + if (lstat(sl, &st) < 0) { + log_debug_errno(errno, "Failed to stat '%s', ignoring: %m", sl); + continue; + } + + if (!S_ISLNK(st.st_mode)) { + log_debug("Image '%s' is not a symlink, ignoring.", sl); + continue; + } + + if (unlink(sl) < 0) { + log_debug_errno(errno, "Can't remove image symlink '%s': %m", sl); + + if (errno != ENOENT && ret >= 0) + ret = -errno; + } else + portable_changes_add(changes, n_changes, PORTABLE_UNLINK, sl, NULL); + } + + /* Try to remove the unit file directory, if we can */ + if (rmdir(where) >= 0) + portable_changes_add(changes, n_changes, PORTABLE_UNLINK, where, NULL); + + log_portable_verb( + "detached", + "MESSAGE_ID=" SD_MESSAGE_PORTABLE_DETACHED_STR, + name_or_path, + /* extension_images= */ NULL, + extension_image_paths, + flags); + + return ret; + +not_found: + extensions = strv_join(extension_image_paths, ", "); + if (!extensions) + return -ENOMEM; + + r = sd_bus_error_setf(error, + BUS_ERROR_NO_SUCH_UNIT, + "No unit files associated 
with '%s%s%s' found attached to the system. Image not attached?", + name_or_path, + isempty(extensions) ? "" : "' or any of its extensions '", + isempty(extensions) ? "" : extensions); + return log_debug_errno(r, "%s", error->message); +} + +static int portable_get_state_internal( + sd_bus *bus, + const char *name_or_path, + char **extension_image_paths, + PortableFlags flags, + PortableState *ret, + sd_bus_error *error) { + + _cleanup_(lookup_paths_free) LookupPaths paths = {}; + bool found_enabled = false, found_running = false; + _cleanup_set_free_ Set *unit_files = NULL; + _cleanup_closedir_ DIR *d = NULL; + const char *where; + int r; + + assert(name_or_path); + assert(ret); + + r = lookup_paths_init(&paths, RUNTIME_SCOPE_SYSTEM, /* flags= */ 0, NULL); + if (r < 0) + return r; + + where = attached_path(&paths, flags); + + d = opendir(where); + if (!d) { + if (errno == ENOENT) { + /* If the 'attached' directory doesn't exist at all, then we know for sure this image isn't attached. */ + *ret = PORTABLE_DETACHED; + return 0; + } + + return log_debug_errno(errno, "Failed to open '%s' directory: %m", where); + } + + FOREACH_DIRENT(de, d, return log_debug_errno(errno, "Failed to enumerate '%s' directory: %m", where)) { + UnitFileState state; + + if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY)) + continue; + + /* Filter out duplicates */ + if (set_contains(unit_files, de->d_name)) + continue; + + if (!IN_SET(de->d_type, DT_LNK, DT_REG)) + continue; + + r = test_chroot_dropin(d, where, de->d_name, name_or_path, extension_image_paths, NULL); + if (r < 0) + return r; + if (r == 0) + continue; + + r = unit_file_lookup_state(RUNTIME_SCOPE_SYSTEM, &paths, de->d_name, &state); + if (r < 0) + return log_debug_errno(r, "Failed to determine unit file state of '%s': %m", de->d_name); + if (!IN_SET(state, UNIT_FILE_STATIC, UNIT_FILE_DISABLED, UNIT_FILE_LINKED, UNIT_FILE_LINKED_RUNTIME)) + found_enabled = true; + + r = unit_file_is_active(bus, de->d_name, error); + if (r < 0) 
+ return r; + if (r > 0) + found_running = true; + + r = set_put_strdup(&unit_files, de->d_name); + if (r < 0) + return log_debug_errno(r, "Failed to add unit name '%s' to set: %m", de->d_name); + } + + *ret = found_running ? (!set_isempty(unit_files) && (flags & PORTABLE_RUNTIME) ? PORTABLE_RUNNING_RUNTIME : PORTABLE_RUNNING) : + found_enabled ? (flags & PORTABLE_RUNTIME ? PORTABLE_ENABLED_RUNTIME : PORTABLE_ENABLED) : + !set_isempty(unit_files) ? (flags & PORTABLE_RUNTIME ? PORTABLE_ATTACHED_RUNTIME : PORTABLE_ATTACHED) : PORTABLE_DETACHED; + + return 0; +} + +int portable_get_state( + sd_bus *bus, + const char *name_or_path, + char **extension_image_paths, + PortableFlags flags, + PortableState *ret, + sd_bus_error *error) { + + PortableState state; + int r; + + assert(name_or_path); + assert(ret); + + /* We look for matching units twice: once in the regular directories, and once in the runtime directories — but + * the latter only if we didn't find anything in the former. */ + + r = portable_get_state_internal(bus, name_or_path, extension_image_paths, flags & ~PORTABLE_RUNTIME, &state, error); + if (r < 0) + return r; + + if (state == PORTABLE_DETACHED) { + r = portable_get_state_internal(bus, name_or_path, extension_image_paths, flags | PORTABLE_RUNTIME, &state, error); + if (r < 0) + return r; + } + + *ret = state; + return 0; +} + +int portable_get_profiles(char ***ret) { + assert(ret); + + return conf_files_list_nulstr(ret, NULL, NULL, CONF_FILES_DIRECTORY|CONF_FILES_BASENAME|CONF_FILES_FILTER_MASKED, PORTABLE_PROFILE_DIRS); +} + +static const char* const portable_change_type_table[_PORTABLE_CHANGE_TYPE_MAX] = { + [PORTABLE_COPY] = "copy", + [PORTABLE_MKDIR] = "mkdir", + [PORTABLE_SYMLINK] = "symlink", + [PORTABLE_UNLINK] = "unlink", + [PORTABLE_WRITE] = "write", +}; + +DEFINE_STRING_TABLE_LOOKUP(portable_change_type, int); + +static const char* const portable_state_table[_PORTABLE_STATE_MAX] = { + [PORTABLE_DETACHED] = "detached", + [PORTABLE_ATTACHED] = 
"attached", + [PORTABLE_ATTACHED_RUNTIME] = "attached-runtime", + [PORTABLE_ENABLED] = "enabled", + [PORTABLE_ENABLED_RUNTIME] = "enabled-runtime", + [PORTABLE_RUNNING] = "running", + [PORTABLE_RUNNING_RUNTIME] = "running-runtime", +}; + +DEFINE_STRING_TABLE_LOOKUP(portable_state, PortableState); diff --git a/src/portable/portable.h b/src/portable/portable.h new file mode 100644 index 0000000..c4a9d51 --- /dev/null +++ b/src/portable/portable.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "dissect-image.h" +#include "hashmap.h" +#include "macro.h" +#include "set.h" +#include "string-util.h" + +typedef struct PortableMetadata { + int fd; + char *source; + char *image_path; + char *selinux_label; + char name[]; +} PortableMetadata; + +#define PORTABLE_METADATA_IS_OS_RELEASE(m) (streq((m)->name, "/etc/os-release")) +#define PORTABLE_METADATA_IS_EXTENSION_RELEASE(m) (startswith_strv((m)->name, STRV_MAKE("/usr/lib/extension-release.d/extension-release.", "/etc/extension-release.d/extension-release."))) +#define PORTABLE_METADATA_IS_UNIT(m) (!IN_SET((m)->name[0], 0, '/')) + +typedef enum PortableFlags { + PORTABLE_RUNTIME = 1 << 0, /* Public API via DBUS, do not change */ + PORTABLE_FORCE_ATTACH = 1 << 1, /* Public API via DBUS, do not change */ + PORTABLE_FORCE_EXTENSION = 1 << 2, /* Public API via DBUS, do not change */ + PORTABLE_PREFER_COPY = 1 << 3, + PORTABLE_PREFER_SYMLINK = 1 << 4, + PORTABLE_REATTACH = 1 << 5, + _PORTABLE_MASK_PUBLIC = PORTABLE_RUNTIME | PORTABLE_FORCE_ATTACH | PORTABLE_FORCE_EXTENSION, + _PORTABLE_TYPE_MAX, + _PORTABLE_TYPE_INVALID = -EINVAL, +} PortableFlags; + +/* This enum is anonymous, since we usually store it in an 'int', as we overload it with negative errno + * values. 
*/ +enum { + PORTABLE_COPY, + PORTABLE_SYMLINK, + PORTABLE_UNLINK, + PORTABLE_WRITE, + PORTABLE_MKDIR, + _PORTABLE_CHANGE_TYPE_MAX, + _PORTABLE_CHANGE_TYPE_INVALID = -EINVAL, +}; + +typedef enum PortableState { + PORTABLE_DETACHED, + PORTABLE_ATTACHED, + PORTABLE_ATTACHED_RUNTIME, + PORTABLE_ENABLED, + PORTABLE_ENABLED_RUNTIME, + PORTABLE_RUNNING, + PORTABLE_RUNNING_RUNTIME, + _PORTABLE_STATE_MAX, + _PORTABLE_STATE_INVALID = -EINVAL, +} PortableState; + +typedef struct PortableChange { + int type_or_errno; /* PORTABLE_COPY, PORTABLE_SYMLINK, … if positive, errno if negative */ + char *path; + char *source; +} PortableChange; + +PortableMetadata *portable_metadata_unref(PortableMetadata *i); +DEFINE_TRIVIAL_CLEANUP_FUNC(PortableMetadata*, portable_metadata_unref); + +int portable_metadata_hashmap_to_sorted_array(Hashmap *unit_files, PortableMetadata ***ret); + +int portable_extract(const char *image, char **matches, char **extension_image_paths, const ImagePolicy *image_policy, PortableFlags flags, PortableMetadata **ret_os_release, OrderedHashmap **ret_extension_releases, Hashmap **ret_unit_files, char ***ret_valid_prefixes, sd_bus_error *error); + +int portable_attach(sd_bus *bus, const char *name_or_path, char **matches, const char *profile, char **extension_images, const ImagePolicy* image_policy, PortableFlags flags, PortableChange **changes, size_t *n_changes, sd_bus_error *error); +int portable_detach(sd_bus *bus, const char *name_or_path, char **extension_image_paths, PortableFlags flags, PortableChange **changes, size_t *n_changes, sd_bus_error *error); + +int portable_get_state(sd_bus *bus, const char *name_or_path, char **extension_image_paths, PortableFlags flags, PortableState *ret, sd_bus_error *error); + +int portable_get_profiles(char ***ret); + +void portable_changes_free(PortableChange *changes, size_t n_changes); + +const char *portable_change_type_to_string(int t) _const_; +int portable_change_type_from_string(const char *t) _pure_; + +const char 
*portable_state_to_string(PortableState t) _const_; +PortableState portable_state_from_string(const char *t) _pure_; diff --git a/src/portable/portablectl.c b/src/portable/portablectl.c new file mode 100644 index 0000000..1588b17 --- /dev/null +++ b/src/portable/portablectl.c @@ -0,0 +1,1459 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-wait-for-jobs.h" +#include "chase.h" +#include "constants.h" +#include "dirent-util.h" +#include "env-file.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "fs-util.h" +#include "locale-util.h" +#include "main-func.h" +#include "os-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "portable.h" +#include "pretty-print.h" +#include "spawn-polkit-agent.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "verbs.h" + +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static bool arg_ask_password = true; +static bool arg_quiet = false; +static const char *arg_profile = "default"; +static const char* arg_copy_mode = NULL; +static bool arg_runtime = false; +static bool arg_reload = true; +static bool arg_cat = false; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static const char *arg_host = NULL; +static bool arg_enable = false; +static bool arg_now = false; +static bool arg_no_block = false; +static char **arg_extension_images = NULL; +static bool arg_force = false; + +STATIC_DESTRUCTOR_REGISTER(arg_extension_images, strv_freep); + +static bool is_portable_managed(const char *unit) { + return ENDSWITH_SET(unit, ".service", ".target", ".socket", ".path", ".timer"); +} + +static int determine_image(const char *image, bool permit_non_existing, char **ret) { + int r; + + /* If the 
specified name is a valid image name, we pass it as-is to portabled, which will search for it in the + * usual search directories. Otherwise we presume it's a path, and will normalize it on the client's side + * (among other things, to make the path independent of the client's working directory) before passing it + * over. */ + + if (image_name_is_valid(image)) { + char *c; + + if (!arg_quiet && laccess(image, F_OK) >= 0) + log_warning("Ambiguous invocation: current working directory contains file matching non-path argument '%s', ignoring. " + "Prefix argument with './' to force reference to file in current working directory.", image); + + c = strdup(image); + if (!c) + return log_oom(); + + *ret = c; + return 0; + } + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Operations on images by path not supported when connecting to remote systems."); + + r = chase(image, NULL, CHASE_TRAIL_SLASH | (permit_non_existing ? CHASE_NONEXISTENT : 0), ret, NULL); + if (r < 0) + return log_error_errno(r, "Cannot normalize specified image path '%s': %m", image); + + return 0; +} + +static int attach_extensions_to_message(sd_bus_message *m, const char *method, char **extensions) { + int r; + + assert(m); + assert(method); + + /* The new methods also have flags parameters that are independent of the extensions */ + if (strv_isempty(extensions) && !endswith(method, "WithExtensions")) + return 0; + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return bus_log_create_error(r); + + STRV_FOREACH(p, extensions) { + _cleanup_free_ char *resolved_extension_image = NULL; + + r = determine_image( + *p, + startswith_strv(method, STRV_MAKE("Get", "Detach")), + &resolved_extension_image); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "s", resolved_extension_image); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 
0; +} + +static int extract_prefix(const char *path, char **ret) { + _cleanup_free_ char *name = NULL, *bn = NULL; + const char *underscore; + size_t m; + int r; + + r = path_extract_filename(path, &bn); + if (r < 0) + return r; + + underscore = strchr(bn, '_'); + if (underscore) + m = underscore - bn; + else { + const char *e; + + e = endswith(bn, ".raw"); + if (!e) + e = strchr(bn, 0); + + m = e - bn; + } + + name = strndup(bn, m); + if (!name) + return -ENOMEM; + + /* A slightly reduced version of what's permitted in unit names. With ':' and '\' are removed, as well as '_' + * which we use as delimiter for the second part of the image string, which we ignore for now. */ + if (!in_charset(name, DIGITS LETTERS "-.")) + return -EINVAL; + + if (!filename_is_valid(name)) + return -EINVAL; + + *ret = TAKE_PTR(name); + return 0; +} + +static int determine_matches(const char *image, char **l, bool allow_any, char ***ret) { + _cleanup_strv_free_ char **k = NULL; + int r; + + /* Determine the matches to apply. If the list is empty we derive the match from the image name. If the list + * contains exactly the "-" we return a wildcard list (which is the empty list), but only if this is expressly + * permitted. 
*/ + + if (strv_isempty(l)) { + char *prefix; + + r = extract_prefix(image, &prefix); + if (r < 0) + return log_error_errno(r, "Failed to extract prefix of image name '%s': %m", image); + + if (!arg_quiet) + log_info("(Matching unit files with prefix '%s'.)", prefix); + + r = strv_consume(&k, prefix); + if (r < 0) + return log_oom(); + + } else if (strv_equal(l, STRV_MAKE("-"))) { + + if (!allow_any) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Refusing all unit file match."); + + if (!arg_quiet) + log_info("(Matching all unit files.)"); + } else { + + k = strv_copy(l); + if (!k) + return log_oom(); + + if (!arg_quiet) { + _cleanup_free_ char *joined = NULL; + + joined = strv_join(k, "', '"); + if (!joined) + return log_oom(); + + log_info("(Matching unit files with prefixes '%s'.)", joined); + } + } + + *ret = TAKE_PTR(k); + + return 0; +} + +static int acquire_bus(sd_bus **bus) { + int r; + + assert(bus); + + if (*bus) + return 0; + + r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + (void) sd_bus_set_allow_interactive_authorization(*bus, arg_ask_password); + + return 0; +} + +static int maybe_reload(sd_bus **bus) { + int r; + + if (!arg_reload) + return 0; + + r = acquire_bus(bus); + if (r < 0) + return r; + + return bus_service_manager_reload(*bus); +} + +static int get_image_metadata(sd_bus *bus, const char *image, char **matches, sd_bus_message **reply) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + uint64_t flags = arg_force ? PORTABLE_FORCE_EXTENSION : 0; + const char *method; + int r; + + assert(bus); + assert(reply); + + method = strv_isempty(arg_extension_images) && !arg_force ? 
"GetImageMetadata" : "GetImageMetadataWithExtensions"; + + r = bus_message_new_method_call(bus, &m, bus_portable_mgr, method); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", image); + if (r < 0) + return bus_log_create_error(r); + + r = attach_extensions_to_message(m, method, arg_extension_images); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(m, matches); + if (r < 0) + return bus_log_create_error(r); + + if (streq(method, "GetImageMetadataWithExtensions")) { + r = sd_bus_message_append(m, "t", flags); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_call(bus, m, 0, &error, reply); + if (r < 0) + return log_error_errno(r, "Failed to inspect image metadata: %s", bus_error_message(&error, r)); + + return 0; +} + +static int inspect_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_strv_free_ char **matches = NULL; + _cleanup_free_ char *image = NULL; + bool nl = false, header = false; + const char *path; + const void *data; + size_t sz; + int r; + + r = determine_image(argv[1], false, &image); + if (r < 0) + return r; + + r = determine_matches(argv[1], argv + 2, true, &matches); + if (r < 0) + return r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + r = get_image_metadata(bus, image, matches, &reply); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "s", &path); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &data, &sz); + if (r < 0) + return bus_log_parse_error(r); + + pager_open(arg_pager_flags); + + if (arg_cat) { + printf("%s-- OS Release: --%s\n", ansi_highlight(), ansi_normal()); + fwrite(data, sz, 1, stdout); + fflush(stdout); + nl = true; + } else { + _cleanup_free_ char *pretty_portable = NULL, *pretty_os = NULL; + _cleanup_fclose_ FILE *f = NULL; + + f = fmemopen_unlocked((void*) data, sz, 
"r"); + if (!f) + return log_error_errno(errno, "Failed to open /etc/os-release buffer: %m"); + + r = parse_env_file(f, "/etc/os-release", + "PORTABLE_PRETTY_NAME", &pretty_portable, + "PRETTY_NAME", &pretty_os); + if (r < 0) + return log_error_errno(r, "Failed to parse /etc/os-release: %m"); + + printf("Image:\n\t%s\n" + "Portable Service:\n\t%s\n" + "Operating System:\n\t%s\n", + path, + strna(pretty_portable), + strna(pretty_os)); + } + + if (!strv_isempty(arg_extension_images)) { + /* If we specified any extensions, we'll first get back exactly the paths (and + * extension-release content) for each one of the arguments. */ + + r = sd_bus_message_enter_container(reply, 'a', "{say}"); + if (r < 0) + return bus_log_parse_error(r); + + for (size_t i = 0; i < strv_length(arg_extension_images); ++i) { + const char *name; + + r = sd_bus_message_enter_container(reply, 'e', "say"); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = sd_bus_message_read(reply, "s", &name); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &data, &sz); + if (r < 0) + return bus_log_parse_error(r); + + if (arg_cat) { + if (nl) + fputc('\n', stdout); + + printf("%s-- Extension Release: %s --%s\n", ansi_highlight(), name, ansi_normal()); + fwrite(data, sz, 1, stdout); + fflush(stdout); + nl = true; + } else { + _cleanup_free_ char *pretty_portable = NULL, *sysext_pretty_os = NULL, + *sysext_level = NULL, *sysext_id = NULL, + *sysext_version_id = NULL, *sysext_scope = NULL, + *portable_prefixes = NULL, *id = NULL, *version_id = NULL, + *sysext_image_id = NULL, *sysext_image_version = NULL, + *sysext_build_id = NULL, *confext_pretty_os = NULL, + *confext_level = NULL, *confext_id = NULL, + *confext_version_id = NULL, *confext_scope = NULL, + *confext_image_id = NULL, *confext_image_version = NULL, + *confext_build_id = NULL; + _cleanup_fclose_ FILE *f = NULL; + + f = fmemopen_unlocked((void*) data, sz, "r"); + if (!f) + return 
log_error_errno(errno, "Failed to open extension-release buffer: %m"); + + r = parse_env_file(f, name, + "SYSEXT_ID", &sysext_id, + "SYSEXT_VERSION_ID", &sysext_version_id, + "SYSEXT_BUILD_ID", &sysext_build_id, + "SYSEXT_IMAGE_ID", &sysext_image_id, + "SYSEXT_IMAGE_VERSION", &sysext_image_version, + "SYSEXT_SCOPE", &sysext_scope, + "SYSEXT_LEVEL", &sysext_level, + "SYSEXT_PRETTY_NAME", &sysext_pretty_os, + "CONFEXT_ID", &confext_id, + "CONFEXT_VERSION_ID", &confext_version_id, + "CONFEXT_BUILD_ID", &confext_build_id, + "CONFEXT_IMAGE_ID", &confext_image_id, + "CONFEXT_IMAGE_VERSION", &confext_image_version, + "CONFEXT_SCOPE", &confext_scope, + "CONFEXT_LEVEL", &confext_level, + "CONFEXT_PRETTY_NAME", &confext_pretty_os, + "ID", &id, + "VERSION_ID", &version_id, + "PORTABLE_PRETTY_NAME", &pretty_portable, + "PORTABLE_PREFIXES", &portable_prefixes); + if (r < 0) + return log_error_errno(r, "Failed to parse extension release from '%s': %m", name); + + printf("Extension:\n\t%s\n" + "\tExtension Scope:\n\t\t%s\n" + "\tExtension Compatibility Level:\n\t\t%s\n" + "\tExtension Compatibility OS:\n\t\t%s\n" + "\tExtension Compatibility OS Version:\n\t\t%s\n" + "\tPortable Service:\n\t\t%s\n" + "\tPortable Prefixes:\n\t\t%s\n" + "\tExtension Image:\n\t\t%s%s%s %s%s%s\n", + name, + strna(sysext_scope ?: confext_scope), + strna(sysext_level ?: confext_level), + strna(id), + strna(version_id), + strna(pretty_portable), + strna(portable_prefixes), + strempty(sysext_pretty_os ?: confext_pretty_os), + (sysext_pretty_os ?: confext_pretty_os) ? " (" : "ID: ", + strna(sysext_id ?: sysext_image_id ?: confext_id ?: confext_image_id), + (sysext_pretty_os ?: confext_pretty_os) ? "" : "Version: ", + strna(sysext_version_id ?: sysext_image_version ?: sysext_build_id ?: confext_version_id ?: confext_image_version ?: confext_build_id), + (sysext_pretty_os ?: confext_pretty_os) ? 
")" : ""); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + } + + r = sd_bus_message_enter_container(reply, 'a', "{say}"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + const char *name; + + r = sd_bus_message_enter_container(reply, 'e', "say"); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = sd_bus_message_read(reply, "s", &name); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &data, &sz); + if (r < 0) + return bus_log_parse_error(r); + + if (arg_cat) { + if (nl) + fputc('\n', stdout); + + printf("%s-- Unit file: %s --%s\n", ansi_highlight(), name, ansi_normal()); + fwrite(data, sz, 1, stdout); + fflush(stdout); + nl = true; + } else { + if (!header) { + fputs("Unit files:\n", stdout); + header = true; + } + + fputc('\t', stdout); + fputs(name, stdout); + fputc('\n', stdout); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +static int print_changes(sd_bus_message *m) { + int r; + + if (arg_quiet) + return 0; + + r = sd_bus_message_enter_container(m, 'a', "(sss)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + const char *type, *path, *source; + + r = sd_bus_message_read(m, "(sss)", &type, &path, &source); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (streq(type, "symlink")) + log_info("Created symlink %s %s %s.", path, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), source); + else if (streq(type, "copy")) { + if (isempty(source)) + log_info("Copied %s.", path); + else + log_info("Copied %s %s %s.", source, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), path); + } else if (streq(type, "unlink")) + log_info("Removed %s.", path); + else 
if (streq(type, "write")) + log_info("Written %s.", path); + else if (streq(type, "mkdir")) + log_info("Created directory %s.", path); + else + log_error("Unexpected change: %s/%s/%s", type, path, source); + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int maybe_enable_disable(sd_bus *bus, const char *path, bool enable) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_strv_free_ char **names = NULL; + const uint64_t flags = UNIT_FILE_PORTABLE | (arg_runtime ? UNIT_FILE_RUNTIME : 0); + int r; + + if (!arg_enable) + return 0; + + names = strv_new(path, NULL); + if (!names) + return log_oom(); + + r = bus_message_new_method_call( + bus, + &m, + bus_systemd_mgr, + enable ? "EnableUnitFilesWithFlags" : "DisableUnitFilesWithFlags"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, names); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "t", flags); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to %s the portable service %s: %s", + enable ? 
"enable" : "disable", path, bus_error_message(&error, r)); + + if (enable) { + r = sd_bus_message_skip(reply, "b"); + if (r < 0) + return bus_log_parse_error(r); + } + + (void) bus_deserialize_and_dump_unit_file_changes(reply, arg_quiet); + + return 0; +} + +static int maybe_start_stop_restart(sd_bus *bus, const char *path, const char *method, BusWaitForJobs *wait) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *name = NULL; + const char *job = NULL; + int r; + + assert(STR_IN_SET(method, "StartUnit", "StopUnit", "RestartUnit")); + + if (!arg_now) + return 0; + + r = path_extract_filename(path, &name); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", path); + + r = bus_call_method( + bus, + bus_systemd_mgr, + method, + &error, + &reply, + "ss", name, "replace"); + if (r < 0) + return log_error_errno(r, "Failed to call %s on the portable service %s: %s", + method, + path, + bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &job); + if (r < 0) + return bus_log_parse_error(r); + + if (!arg_quiet) + log_info("Queued %s to call %s on portable service %s.", job, method, name); + + if (wait) { + r = bus_wait_for_jobs_add(wait, job); + if (r < 0) + return log_error_errno(r, "Failed to watch %s job to call %s on %s: %m", + job, method, name); + } + + return 0; +} + +static int maybe_enable_start(sd_bus *bus, sd_bus_message *reply) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *wait = NULL; + int r; + + if (!arg_enable && !arg_now) + return 0; + + if (!arg_no_block) { + r = bus_wait_for_jobs_new(bus, &wait); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + r = sd_bus_message_rewind(reply, true); + if (r < 0) + return r; + r = sd_bus_message_enter_container(reply, 'a', "(sss)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + char *type, *path, *source; + + 
r = sd_bus_message_read(reply, "(sss)", &type, &path, &source); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (STR_IN_SET(type, "symlink", "copy") && is_portable_managed(path)) { + (void) maybe_enable_disable(bus, path, true); + (void) maybe_start_stop_restart(bus, path, "StartUnit", wait); + } + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + if (!arg_no_block) { + r = bus_wait_for_jobs(wait, arg_quiet, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int maybe_stop_enable_restart(sd_bus *bus, sd_bus_message *reply) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *wait = NULL; + int r; + + if (!arg_enable && !arg_now) + return 0; + + if (!arg_no_block) { + r = bus_wait_for_jobs_new(bus, &wait); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + r = sd_bus_message_rewind(reply, true); + if (r < 0) + return r; + + /* First we get a list of units that were definitely removed, not just re-attached, + * so we can also stop them if the user asked us to. */ + r = sd_bus_message_enter_container(reply, 'a', "(sss)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + char *type, *path, *source; + + r = sd_bus_message_read(reply, "(sss)", &type, &path, &source); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (streq(type, "unlink") && is_portable_managed(path)) + (void) maybe_start_stop_restart(bus, path, "StopUnit", wait); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + /* Then we get a list of units that were either added or changed, so that we can + * enable them and/or restart them if the user asked us to. 
*/ + r = sd_bus_message_enter_container(reply, 'a', "(sss)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + char *type, *path, *source; + + r = sd_bus_message_read(reply, "(sss)", &type, &path, &source); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + if (STR_IN_SET(type, "symlink", "copy") && is_portable_managed(path)) { + (void) maybe_enable_disable(bus, path, true); + (void) maybe_start_stop_restart(bus, path, "RestartUnit", wait); + } + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + if (!arg_no_block) { + r = bus_wait_for_jobs(wait, arg_quiet, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int maybe_stop_disable(sd_bus *bus, char *image, char *argv[]) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *wait = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_strv_free_ char **matches = NULL; + int r; + + if (!arg_enable && !arg_now) + return 0; + + r = determine_matches(argv[1], argv + 2, true, &matches); + if (r < 0) + return r; + + r = bus_wait_for_jobs_new(bus, &wait); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + + r = get_image_metadata(bus, image, matches, &reply); + if (r < 0) + return r; + + r = sd_bus_message_skip(reply, "say"); + if (r < 0) + return bus_log_parse_error(r); + + /* If we specified any extensions or --force (which makes the request go through the new + * WithExtensions calls), we'll first get an array of extension-release metadata. 
*/ + if (!strv_isempty(arg_extension_images) || arg_force) { + r = sd_bus_message_skip(reply, "a{say}"); + if (r < 0) + return bus_log_parse_error(r); + } + + r = sd_bus_message_enter_container(reply, 'a', "{say}"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + const char *name; + + r = sd_bus_message_enter_container(reply, 'e', "say"); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = sd_bus_message_read(reply, "s", &name); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_skip(reply, "ay"); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + (void) maybe_start_stop_restart(bus, name, "StopUnit", wait); + (void) maybe_enable_disable(bus, name, false); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + /* Stopping must always block or the detach will fail if the unit is still running */ + r = bus_wait_for_jobs(wait, arg_quiet, NULL); + if (r < 0) + return r; + + return 0; +} + +static int attach_reattach_image(int argc, char *argv[], const char *method) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_strv_free_ char **matches = NULL; + _cleanup_free_ char *image = NULL; + int r; + + assert(method); + assert(STR_IN_SET(method, "AttachImage", "ReattachImage", "AttachImageWithExtensions", "ReattachImageWithExtensions")); + + r = determine_image(argv[1], false, &image); + if (r < 0) + return r; + + r = determine_matches(argv[1], argv + 2, false, &matches); + if (r < 0) + return r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_message_new_method_call(bus, &m, bus_portable_mgr, method); + if (r < 0) + return 
bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", image); + if (r < 0) + return bus_log_create_error(r); + + r = attach_extensions_to_message(m, method, arg_extension_images); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(m, matches); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", arg_profile); + if (r < 0) + return bus_log_create_error(r); + + if (STR_IN_SET(method, "AttachImageWithExtensions", "ReattachImageWithExtensions")) { + uint64_t flags = (arg_runtime ? PORTABLE_RUNTIME : 0) | (arg_force ? PORTABLE_FORCE_ATTACH | PORTABLE_FORCE_EXTENSION : 0); + + r = sd_bus_message_append(m, "st", arg_copy_mode, flags); + } else + r = sd_bus_message_append(m, "bs", arg_runtime, arg_copy_mode); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "%s failed: %s", method, bus_error_message(&error, r)); + + (void) maybe_reload(&bus); + + print_changes(reply); + + if (STR_IN_SET(method, "AttachImage", "AttachImageWithExtensions")) + (void) maybe_enable_start(bus, reply); + else { + /* ReattachImage returns 2 lists - removed units first, and changed/added second */ + print_changes(reply); + (void) maybe_stop_enable_restart(bus, reply); + } + + return 0; +} + +static int attach_image(int argc, char *argv[], void *userdata) { + return attach_reattach_image(argc, argv, strv_isempty(arg_extension_images) && !arg_force ? "AttachImage" : "AttachImageWithExtensions"); +} + +static int reattach_image(int argc, char *argv[], void *userdata) { + return attach_reattach_image(argc, argv, strv_isempty(arg_extension_images) && !arg_force ? 
"ReattachImage" : "ReattachImageWithExtensions"); +} + +static int detach_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *image = NULL; + const char *method; + int r; + + r = determine_image(argv[1], true, &image); + if (r < 0) + return r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + (void) maybe_stop_disable(bus, image, argv); + + method = strv_isempty(arg_extension_images) && !arg_force ? "DetachImage" : "DetachImageWithExtensions"; + + r = bus_message_new_method_call(bus, &m, bus_portable_mgr, method); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", image); + if (r < 0) + return bus_log_create_error(r); + + r = attach_extensions_to_message(m, method, arg_extension_images); + if (r < 0) + return r; + + if (streq(method, "DetachImage")) + r = sd_bus_message_append(m, "b", arg_runtime); + else { + uint64_t flags = (arg_runtime ? PORTABLE_RUNTIME : 0) | (arg_force ? 
PORTABLE_FORCE_ATTACH | PORTABLE_FORCE_EXTENSION : 0); + + r = sd_bus_message_append(m, "t", flags); + } + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "%s failed: %s", method, bus_error_message(&error, r)); + + (void) maybe_reload(&bus); + + print_changes(reply); + return 0; +} + +static int list_images(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_portable_mgr, "ListImages", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list images: %s", bus_error_message(&error, r)); + + table = table_new("name", "type", "ro", "crtime", "mtime", "usage", "state"); + if (!table) + return log_oom(); + + r = sd_bus_message_enter_container(reply, 'a', "(ssbtttso)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + const char *name, *type, *state; + uint64_t crtime, mtime, usage; + int ro_int; + + r = sd_bus_message_read(reply, "(ssbtttso)", &name, &type, &ro_int, &crtime, &mtime, &usage, &state, NULL); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = table_add_many(table, + TABLE_STRING, name, + TABLE_STRING, type, + TABLE_BOOLEAN, ro_int, + TABLE_SET_COLOR, ro_int ? ansi_highlight_red() : NULL, + TABLE_TIMESTAMP, crtime, + TABLE_TIMESTAMP, mtime, + TABLE_SIZE, usage, + TABLE_STRING, state, + TABLE_SET_COLOR, !streq(state, "detached") ? 
ansi_highlight_green() : NULL); + if (r < 0) + return table_log_add_error(r); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (table_get_rows(table) > 1) { + r = table_set_sort(table, (size_t) 0); + if (r < 0) + return table_log_sort_error(r); + + table_set_header(table, arg_legend); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend) { + if (table_get_rows(table) > 1) + printf("\n%zu images listed.\n", table_get_rows(table) - 1); + else + printf("No images.\n"); + } + + return 0; +} + +static int remove_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r, i; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (i = 1; i < argc; i++) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + r = bus_message_new_method_call(bus, &m, bus_portable_mgr, "RemoveImage"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", argv[i]); + if (r < 0) + return bus_log_create_error(r); + + /* This is a slow operation, hence turn off any method call timeouts */ + r = sd_bus_call(bus, m, USEC_INFINITY, &error, NULL); + if (r < 0) + return log_error_errno(r, "Could not remove image: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int read_only_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int b = true, r; + + if (argc > 2) { + b = parse_boolean(argv[2]); + if (b < 0) + return log_error_errno(b, "Failed to parse boolean argument: %s", argv[2]); + } + + r = acquire_bus(&bus); + if (r < 0) + return r; + + (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + 
+ r = bus_call_method(bus, bus_portable_mgr, "MarkImageReadOnly", &error, NULL, "sb", argv[1], b); + if (r < 0) + return log_error_errno(r, "Could not mark image read-only: %s", bus_error_message(&error, r)); + + return 0; +} + +static int set_limit(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + uint64_t limit; + int r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + (void) polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + if (STR_IN_SET(argv[argc-1], "-", "none", "infinity")) + limit = UINT64_MAX; + else { + r = parse_size(argv[argc-1], 1024, &limit); + if (r < 0) + return log_error_errno(r, "Failed to parse size: %s", argv[argc-1]); + } + + if (argc > 2) + /* With two arguments changes the quota limit of the specified image */ + r = bus_call_method(bus, bus_portable_mgr, "SetImageLimit", &error, NULL, "st", argv[1], limit); + else + /* With one argument changes the pool quota limit */ + r = bus_call_method(bus, bus_portable_mgr, "SetPoolLimit", &error, NULL, "t", limit); + + if (r < 0) + return log_error_errno(r, "Could not set limit: %s", bus_error_message(&error, r)); + + return 0; +} + +static int is_image_attached(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *image = NULL; + const char *state, *method; + int r; + + r = determine_image(argv[1], true, &image); + if (r < 0) + return r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + method = strv_isempty(arg_extension_images) ? 
"GetImageState" : "GetImageStateWithExtensions"; + + r = bus_message_new_method_call(bus, &m, bus_portable_mgr, method); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", image); + if (r < 0) + return bus_log_create_error(r); + + r = attach_extensions_to_message(m, method, arg_extension_images); + if (r < 0) + return r; + + if (!strv_isempty(arg_extension_images)) { + r = sd_bus_message_append(m, "t", UINT64_C(0)); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "%s failed: %s", method, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &state); + if (r < 0) + return r; + + if (!arg_quiet) + puts(state); + + return streq(state, "detached"); +} + +static int dump_profiles(void) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_strv_free_ char **l = NULL; + int r; + + r = acquire_bus(&bus); + if (r < 0) + return r; + + r = bus_get_property_strv(bus, bus_portable_mgr, "Profiles", &error, &l); + if (r < 0) + return log_error_errno(r, "Failed to acquire list of profiles: %s", bus_error_message(&error, r)); + + if (arg_legend) + log_info("Available unit profiles:"); + + STRV_FOREACH(i, l) { + fputs(*i, stdout); + fputc('\n', stdout); + } + + return 0; +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("portablectl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
COMMAND ...\n\n" + "%sAttach or detach portable services from the local system.%s\n" + "\nCommands:\n" + " list List available portable service images\n" + " attach NAME|PATH [PREFIX...]\n" + " Attach the specified portable service image\n" + " detach NAME|PATH [PREFIX...]\n" + " Detach the specified portable service image\n" + " reattach NAME|PATH [PREFIX...]\n" + " Reattach the specified portable service image\n" + " inspect NAME|PATH [PREFIX...]\n" + " Show details of specified portable service image\n" + " is-attached NAME|PATH Query if portable service image is attached\n" + " read-only NAME|PATH [BOOL] Mark or unmark portable service image read-only\n" + " remove NAME|PATH... Remove a portable service image\n" + " set-limit [NAME|PATH] Set image or pool size limit (disk quota)\n" + "\nOptions:\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --no-ask-password Do not ask for system passwords\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " -q --quiet Suppress informational messages\n" + " -p --profile=PROFILE Pick security profile for portable service\n" + " --copy=copy|auto|symlink Prefer copying or symlinks if possible\n" + " --runtime Attach portable service until next reboot only\n" + " --no-reload Don't reload the system and service manager\n" + " --cat When inspecting include unit and os-release file\n" + " contents\n" + " --enable Immediately enable/disable the portable service\n" + " after attach/detach\n" + " --now Immediately start/stop the portable service after\n" + " attach/before detach\n" + " --no-block Don't block waiting for attach --now to complete\n" + " --extension=PATH Extend the image with an overlay\n" + " --force Skip 'already active' check when attaching or\n" + " detaching an image (with extensions)\n" + "\nSee the %s for details.\n", + 
program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + int r; + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_NO_ASK_PASSWORD, + ARG_COPY, + ARG_RUNTIME, + ARG_NO_RELOAD, + ARG_CAT, + ARG_ENABLE, + ARG_NOW, + ARG_NO_BLOCK, + ARG_EXTENSION, + ARG_FORCE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "quiet", no_argument, NULL, 'q' }, + { "profile", required_argument, NULL, 'p' }, + { "copy", required_argument, NULL, ARG_COPY }, + { "runtime", no_argument, NULL, ARG_RUNTIME }, + { "no-reload", no_argument, NULL, ARG_NO_RELOAD }, + { "cat", no_argument, NULL, ARG_CAT }, + { "enable", no_argument, NULL, ARG_ENABLE }, + { "now", no_argument, NULL, ARG_NOW }, + { "no-block", no_argument, NULL, ARG_NO_BLOCK }, + { "extension", required_argument, NULL, ARG_EXTENSION }, + { "force", no_argument, NULL, ARG_FORCE }, + {} + }; + + assert(argc >= 0); + assert(argv); + + for (;;) { + int c; + + c = getopt_long(argc, argv, "hH:M:qp:", options, NULL); + if (c < 0) + break; + + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case 'q': + arg_quiet = true; + break; + + case 'p': + if (streq(optarg, "help")) + return 
dump_profiles(); + + if (!filename_is_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unit profile name not valid: %s", optarg); + + arg_profile = optarg; + break; + + case ARG_COPY: + if (streq(optarg, "auto")) + arg_copy_mode = NULL; + else if (STR_IN_SET(optarg, "copy", "symlink")) + arg_copy_mode = optarg; + else if (streq(optarg, "help")) { + puts("auto\n" + "copy\n" + "symlink"); + return 0; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse --copy= argument: %s", optarg); + + break; + + case ARG_RUNTIME: + arg_runtime = true; + break; + + case ARG_NO_RELOAD: + arg_reload = false; + break; + + case ARG_CAT: + arg_cat = true; + break; + + case ARG_ENABLE: + arg_enable = true; + break; + + case ARG_NOW: + arg_now = true; + break; + + case ARG_NO_BLOCK: + arg_no_block = true; + break; + + case ARG_EXTENSION: + r = strv_extend(&arg_extension_images, optarg); + if (r < 0) + return log_oom(); + break; + + case ARG_FORCE: + arg_force = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + } + + return 1; +} + +static int run(int argc, char *argv[]) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "list", VERB_ANY, 1, VERB_DEFAULT, list_images }, + { "attach", 2, VERB_ANY, 0, attach_image }, + { "detach", 2, VERB_ANY, 0, detach_image }, + { "inspect", 2, VERB_ANY, 0, inspect_image }, + { "is-attached", 2, 2, 0, is_image_attached }, + { "read-only", 2, 3, 0, read_only_image }, + { "remove", 2, VERB_ANY, 0, remove_image }, + { "set-limit", 3, 3, 0, set_limit }, + { "reattach", 2, VERB_ANY, 0, reattach_image }, + {} + }; + + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/portable/portabled-bus.c b/src/portable/portabled-bus.c new file mode 100644 index 0000000..0d55180 --- /dev/null +++ b/src/portable/portabled-bus.c @@ 
-0,0 +1,612 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "bus-common-errors.h" +#include "bus-object.h" +#include "bus-polkit.h" +#include "discover-image.h" +#include "fd-util.h" +#include "io-util.h" +#include "missing_capability.h" +#include "portable.h" +#include "portabled-bus.h" +#include "portabled-image-bus.h" +#include "portabled-image.h" +#include "portabled.h" +#include "strv.h" +#include "user-util.h" + +static int property_get_pool_path( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "s", "/var/lib/portables"); +} + +static int property_get_pool_usage( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_close_ int fd = -EBADF; + uint64_t usage = UINT64_MAX; + + assert(bus); + assert(reply); + + fd = open("/var/lib/portables", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd >= 0) { + BtrfsQuotaInfo q; + + if (btrfs_subvol_get_subtree_quota_fd(fd, 0, &q) >= 0) + usage = q.referenced; + } + + return sd_bus_message_append(reply, "t", usage); +} + +static int property_get_pool_limit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_close_ int fd = -EBADF; + uint64_t size = UINT64_MAX; + + assert(bus); + assert(reply); + + fd = open("/var/lib/portables", O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd >= 0) { + BtrfsQuotaInfo q; + + if (btrfs_subvol_get_subtree_quota_fd(fd, 0, &q) >= 0) + size = q.referenced_max; + } + + return sd_bus_message_append(reply, "t", size); +} + +static int property_get_profiles( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + 
sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_strv_free_ char **l = NULL; + int r; + + assert(bus); + assert(reply); + + r = portable_get_profiles(&l); + if (r < 0) + return r; + + return sd_bus_message_append_strv(reply, l); +} + +static int method_get_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *name; + Image *image; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_image_acquire(m, message, name, NULL, BUS_IMAGE_REFUSE_BY_PATH, NULL, &image, error); + if (r < 0) + return r; + + r = bus_image_path(image, &p); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int method_list_images(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_hashmap_free_ Hashmap *images = NULL; + Manager *m = ASSERT_PTR(userdata); + Image *image; + int r; + + assert(message); + + images = hashmap_new(&image_hash_ops); + if (!images) + return -ENOMEM; + + r = manager_image_cache_discover(m, images, error); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(ssbtttso)"); + if (r < 0) + return r; + + HASHMAP_FOREACH(image, images) { + _cleanup_(sd_bus_error_free) sd_bus_error error_state = SD_BUS_ERROR_NULL; + PortableState state = _PORTABLE_STATE_INVALID; + _cleanup_free_ char *p = NULL; + + r = bus_image_path(image, &p); + if (r < 0) + return r; + + r = portable_get_state( + sd_bus_message_get_bus(message), + image->path, + NULL, + 0, + &state, + &error_state); + if (r < 0) + log_debug_errno(r, "Failed to get state of image '%s', ignoring: %s", + image->path, bus_error_message(&error_state, r)); + + r = sd_bus_message_append(reply, "(ssbtttso)", + 
image->name, + image_type_to_string(image->type), + image->read_only, + image->crtime, + image->mtime, + image->usage, + portable_state_to_string(state), + p); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int redirect_method_to_image( + Manager *m, + sd_bus_message *message, + sd_bus_error *error, + int (*method)(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error* error)) { + + const char *name_or_path; + int r; + + assert(m); + assert(message); + assert(method); + + r = sd_bus_message_read(message, "s", &name_or_path); + if (r < 0) + return r; + + return method(m, message, name_or_path, NULL, error); +} + +static int method_get_image_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_get_os_release); +} + +static int method_get_image_metadata(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_get_metadata); +} + +static int method_get_image_state(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **extension_images = NULL; + const char *name_or_path; + PortableState state; + int r; + + assert(message); + + r = sd_bus_message_read(message, "s", &name_or_path); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "GetImageStateWithExtensions")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "t", &input_flags); + if (r < 0) + return r; + + /* No flags are supported by this method for now. 
*/ + if (input_flags != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + } + + r = portable_get_state( + sd_bus_message_get_bus(message), + name_or_path, + extension_images, + 0, + &state, + error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", portable_state_to_string(state)); +} + +static int method_attach_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_attach); +} + +static int method_detach_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **extension_images = NULL; + PortableChange *changes = NULL; + PortableFlags flags = 0; + Manager *m = ASSERT_PTR(userdata); + size_t n_changes = 0; + const char *name_or_path; + int r; + + assert(message); + + CLEANUP_ARRAY(changes, n_changes, portable_changes_free); + + /* Note that we do not redirect detaching to the image object here, because we want to allow that users can + * detach already deleted images too, in case the user already deleted an image before properly detaching + * it. 
*/ + + r = sd_bus_message_read(message, "s", &name_or_path); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "DetachImageWithExtensions")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "t", &input_flags); + if (r < 0) + return r; + + if ((input_flags & ~_PORTABLE_MASK_PUBLIC) != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + flags |= input_flags; + } else { + int runtime; + + r = sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + + if (runtime) + flags |= PORTABLE_RUNTIME; + } + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.portable1.attach-images", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = portable_detach( + sd_bus_message_get_bus(message), + name_or_path, + extension_images, + flags, + &changes, + &n_changes, + error); + if (r < 0) + return r; + + return reply_portable_changes(message, changes, n_changes); +} + +static int method_reattach_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_reattach); +} + +static int method_remove_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_remove); +} + +static int method_mark_image_read_only(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_mark_read_only); +} + +static int method_set_image_limit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return redirect_method_to_image(userdata, message, error, bus_image_common_set_limit); +} + +static int 
method_set_pool_limit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + uint64_t limit; + int r; + + assert(message); + + r = sd_bus_message_read(message, "t", &limit); + if (r < 0) + return r; + if (!FILE_SIZE_VALID_OR_INFINITY(limit)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "New limit out of range"); + + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.portable1.manage-images", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + (void) btrfs_qgroup_set_limit("/var/lib/portables", 0, limit); + + r = btrfs_subvol_set_subtree_quota_limit("/var/lib/portables", 0, limit); + if (r == -ENOTTY) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Quota is only supported on btrfs."); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to adjust quota limit: %m"); + + return sd_bus_reply_method_return(message, NULL); +} + +const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("PoolPath", "s", property_get_pool_path, 0, 0), + SD_BUS_PROPERTY("PoolUsage", "t", property_get_pool_usage, 0, 0), + SD_BUS_PROPERTY("PoolLimit", "t", property_get_pool_limit, 0, 0), + SD_BUS_PROPERTY("Profiles", "as", property_get_profiles, 0, 0), + SD_BUS_METHOD_WITH_ARGS("GetImage", + SD_BUS_ARGS("s", image), + SD_BUS_RESULT("o", object), + method_get_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListImages", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a(ssbtttso)", images), + method_list_images, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageOSRelease", + SD_BUS_ARGS("s", image), + SD_BUS_RESULT("a{ss}", os_release), + method_get_image_os_release, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageMetadata", + SD_BUS_ARGS("s", image, + "as", matches), + SD_BUS_RESULT("s", image, + "ay", os_release, + "a{say}", units), + 
method_get_image_metadata, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageMetadataWithExtensions", + SD_BUS_ARGS("s", image, + "as", extensions, + "as", matches, + "t", flags), + SD_BUS_RESULT("s", image, + "ay", os_release, + "a{say}", extensions, + "a{say}", units), + method_get_image_metadata, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageState", + SD_BUS_ARGS("s", image), + SD_BUS_RESULT("s", state), + method_get_image_state, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetImageStateWithExtensions", + SD_BUS_ARGS("s", image, + "as", extensions, + "t", flags), + SD_BUS_RESULT("s", state), + method_get_image_state, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AttachImage", + SD_BUS_ARGS("s", image, + "as", matches, + "s", profile, + "b", runtime, + "s", copy_mode), + SD_BUS_RESULT("a(sss)", changes), + method_attach_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AttachImageWithExtensions", + SD_BUS_ARGS("s", image, + "as", extensions, + "as", matches, + "s", profile, + "s", copy_mode, + "t", flags), + SD_BUS_RESULT("a(sss)", changes), + method_attach_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DetachImage", + SD_BUS_ARGS("s", image, + "b", runtime), + SD_BUS_RESULT("a(sss)", changes), + method_detach_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DetachImageWithExtensions", + SD_BUS_ARGS("s", image, + "as", extensions, + "t", flags), + SD_BUS_RESULT("a(sss)", changes), + method_detach_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReattachImage", + SD_BUS_ARGS("s", image, + "as", matches, + "s", profile, + "b", runtime, + "s", copy_mode), + SD_BUS_RESULT("a(sss)", changes_removed, + "a(sss)", changes_updated), + method_reattach_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReattachImageWithExtensions", + SD_BUS_ARGS("s", image, + "as", extensions, + "as", matches, + "s", profile, + "s", copy_mode, + "t", 
flags), + SD_BUS_RESULT("a(sss)", changes_removed, + "a(sss)", changes_updated), + method_reattach_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RemoveImage", + SD_BUS_ARGS("s", image), + SD_BUS_NO_RESULT, + method_remove_image, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MarkImageReadOnly", + SD_BUS_ARGS("s", image, + "b", read_only), + SD_BUS_NO_RESULT, + method_mark_image_read_only, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetImageLimit", + SD_BUS_ARGS("s", image, + "t", limit), + SD_BUS_NO_RESULT, + method_set_image_limit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetPoolLimit", + SD_BUS_ARGS("t", limit), + SD_BUS_NO_RESULT, + method_set_pool_limit, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/portable1", + "org.freedesktop.portable1.Manager", + .vtables = BUS_VTABLES(manager_vtable), + .children = BUS_IMPLEMENTATIONS(&image_object), +}; + +static int reply_portable_compose_message(sd_bus_message *reply, const PortableChange *changes, size_t n_changes) { + size_t i; + int r; + + assert(reply); + assert(changes || n_changes == 0); + + r = sd_bus_message_open_container(reply, 'a', "(sss)"); + if (r < 0) + return r; + + for (i = 0; i < n_changes; i++) { + if (changes[i].type_or_errno < 0) + continue; + + r = sd_bus_message_append(reply, "(sss)", + portable_change_type_to_string(changes[i].type_or_errno), + changes[i].path, + changes[i].source); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return 0; +} + +int reply_portable_changes(sd_bus_message *m, const PortableChange *changes, size_t n_changes) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(m); + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = reply_portable_compose_message(reply, changes, n_changes); + if (r < 0) + return r; + + 
return sd_bus_send(NULL, reply, NULL); +} + +int reply_portable_changes_pair( + sd_bus_message *m, + const PortableChange *changes_first, + size_t n_changes_first, + const PortableChange *changes_second, + size_t n_changes_second) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(m); + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = reply_portable_compose_message(reply, changes_first, n_changes_first); + if (r < 0) + return r; + + r = reply_portable_compose_message(reply, changes_second, n_changes_second); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} diff --git a/src/portable/portabled-bus.h b/src/portable/portabled-bus.h new file mode 100644 index 0000000..7da366c --- /dev/null +++ b/src/portable/portabled-bus.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "portable.h" + +extern const sd_bus_vtable manager_vtable[]; + +int reply_portable_changes(sd_bus_message *m, const PortableChange *changes, size_t n_changes); +int reply_portable_changes_pair(sd_bus_message *m, const PortableChange *changes_first, size_t n_changes_first, const PortableChange *changes_second, size_t n_changes_second); diff --git a/src/portable/portabled-image-bus.c b/src/portable/portabled-image-bus.c new file mode 100644 index 0000000..1f61c3b --- /dev/null +++ b/src/portable/portabled-image-bus.c @@ -0,0 +1,1191 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-label.h" +#include "bus-object.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "discover-image.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "missing_capability.h" +#include "os-util.h" +#include "portable.h" +#include "portabled-bus.h" +#include "portabled-image-bus.h" 
+#include "portabled-image.h" +#include "portabled.h" +#include "process-util.h" +#include "strv.h" +#include "user-util.h" + +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, image_type, ImageType); + +int bus_image_common_get_os_release( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + sd_bus_error *error) { + + int r; + + assert(name_or_path || image); + assert(message); + + if (!m) { + assert(image); + m = image->userdata; + } + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_BY_PATH, + "org.freedesktop.portable1.inspect-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) /* Will call us back */ + return 1; + + if (!image->metadata_valid) { + r = image_read_metadata(image, &image_policy_service); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to read image metadata: %m"); + } + + return bus_reply_pair_array(message, image->os_release); +} + +static int bus_image_method_get_os_release(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_get_os_release(NULL, message, NULL, userdata, error); +} + +static int append_fd(sd_bus_message *m, PortableMetadata *d) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *buf = NULL; + size_t n = 0; + int r; + + assert(m); + + if (d) { + assert(d->fd >= 0); + + r = fdopen_independent(d->fd, "r", &f); + if (r < 0) + return r; + + r = read_full_stream(f, &buf, &n); + if (r < 0) + return r; + } + + return sd_bus_message_append_array(m, 'y', buf, n); +} + +int bus_image_common_get_metadata( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + sd_bus_error *error) { + + _cleanup_ordered_hashmap_free_ OrderedHashmap *extension_releases = NULL; + _cleanup_(portable_metadata_unrefp) PortableMetadata *os_release = NULL; + _cleanup_strv_free_ char **matches = NULL, **extension_images = NULL; + _cleanup_hashmap_free_ Hashmap *unit_files = NULL; + 
_cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ PortableMetadata **sorted = NULL; + PortableFlags flags = 0; + int r; + + assert(name_or_path || image); + assert(message); + + if (!m) { + assert(image); + m = image->userdata; + } + + bool have_exti = sd_bus_message_is_method_call(message, NULL, "GetImageMetadataWithExtensions") || + sd_bus_message_is_method_call(message, NULL, "GetMetadataWithExtensions"); + + if (have_exti) { + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + } + + r = sd_bus_message_read_strv(message, &matches); + if (r < 0) + return r; + + if (have_exti) { + uint64_t input_flags = 0; + + r = sd_bus_message_read(message, "t", &input_flags); + if (r < 0) + return r; + + if ((input_flags & ~_PORTABLE_MASK_PUBLIC) != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + flags |= input_flags; + } + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_BY_PATH, + "org.freedesktop.portable1.inspect-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) /* Will call us back */ + return 1; + + r = portable_extract( + image->path, + matches, + extension_images, + /* image_policy= */ NULL, + flags, + &os_release, + &extension_releases, + &unit_files, + NULL, + error); + if (r < 0) + return r; + + r = portable_metadata_hashmap_to_sorted_array(unit_files, &sorted); + if (r < 0) + return r; + + r = sd_bus_message_new_method_return(message, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", image->path); + if (r < 0) + return r; + + r = append_fd(reply, os_release); + if (r < 0) + return r; + + /* If it was requested, also send back the extension path and the content + * of each extension-release file. Behind a flag, as it's an incompatible + * change. 
*/ + if (have_exti) { + PortableMetadata *extension_release; + + r = sd_bus_message_open_container(reply, 'a', "{say}"); + if (r < 0) + return r; + + ORDERED_HASHMAP_FOREACH(extension_release, extension_releases) { + + r = sd_bus_message_open_container(reply, 'e', "say"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", extension_release->image_path); + if (r < 0) + return r; + + r = append_fd(reply, extension_release); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + r = sd_bus_message_open_container(reply, 'a', "{say}"); + if (r < 0) + return r; + + for (size_t i = 0; i < hashmap_size(unit_files); i++) { + + r = sd_bus_message_open_container(reply, 'e', "say"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", sorted[i]->name); + if (r < 0) + return r; + + r = append_fd(reply, sorted[i]); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int bus_image_method_get_metadata(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_get_metadata(NULL, message, NULL, userdata, error); +} + +static int bus_image_method_get_state( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _cleanup_strv_free_ char **extension_images = NULL; + Image *image = ASSERT_PTR(userdata); + PortableState state; + int r; + + assert(message); + + if (sd_bus_message_is_method_call(message, NULL, "GetStateWithExtensions")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "t", &input_flags); + if (r < 0) + return r; + + /* No flags are supported by this method for now. 
*/ + if (input_flags != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + } + + r = portable_get_state( + sd_bus_message_get_bus(message), + image->path, + extension_images, + 0, + &state, + error); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, "s", portable_state_to_string(state)); +} + +int bus_image_common_attach( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + sd_bus_error *error) { + + _cleanup_strv_free_ char **matches = NULL, **extension_images = NULL; + PortableChange *changes = NULL; + PortableFlags flags = 0; + const char *profile, *copy_mode; + size_t n_changes = 0; + int r; + + assert(message); + assert(name_or_path || image); + + CLEANUP_ARRAY(changes, n_changes, portable_changes_free); + + if (!m) { + assert(image); + m = image->userdata; + } + + if (sd_bus_message_is_method_call(message, NULL, "AttachImageWithExtensions") || + sd_bus_message_is_method_call(message, NULL, "AttachWithExtensions")) { + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + } + + r = sd_bus_message_read_strv(message, &matches); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &profile); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "AttachImageWithExtensions") || + sd_bus_message_is_method_call(message, NULL, "AttachWithExtensions")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read(message, "st", ©_mode, &input_flags); + if (r < 0) + return r; + if ((input_flags & ~_PORTABLE_MASK_PUBLIC) != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + flags |= input_flags; + } else { + int runtime; + + r = sd_bus_message_read(message, "bs", &runtime, ©_mode); + if (r < 0) + return r; + + if (runtime) + flags |= PORTABLE_RUNTIME; + } + + if (streq(copy_mode, 
"symlink")) + flags |= PORTABLE_PREFER_SYMLINK; + else if (streq(copy_mode, "copy")) + flags |= PORTABLE_PREFER_COPY; + else if (!isempty(copy_mode)) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, "Unknown copy mode '%s'", copy_mode); + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_ALL, + "org.freedesktop.portable1.attach-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) /* Will call us back */ + return 1; + + r = portable_attach( + sd_bus_message_get_bus(message), + image->path, + matches, + profile, + extension_images, + /* image_policy= */ NULL, + flags, + &changes, + &n_changes, + error); + if (r < 0) + return r; + + return reply_portable_changes(message, changes, n_changes); +} + +static int bus_image_method_attach(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_attach(NULL, message, NULL, userdata, error); +} + +static int bus_image_method_detach( + sd_bus_message *message, + void *userdata, + sd_bus_error *error) { + + _cleanup_strv_free_ char **extension_images = NULL; + PortableChange *changes = NULL; + Image *image = ASSERT_PTR(userdata); + Manager *m = ASSERT_PTR(image->userdata); + PortableFlags flags = 0; + size_t n_changes = 0; + int r; + + assert(message); + + CLEANUP_ARRAY(changes, n_changes, portable_changes_free); + + if (sd_bus_message_is_method_call(message, NULL, "DetachWithExtensions")) { + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + } + + if (sd_bus_message_is_method_call(message, NULL, "DetachWithExtensions")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read(message, "t", &input_flags); + if (r < 0) + return r; + + if ((input_flags & ~_PORTABLE_MASK_PUBLIC) != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", + input_flags); + flags |= input_flags; + } else { + int runtime; + + r = 
sd_bus_message_read(message, "b", &runtime); + if (r < 0) + return r; + + if (runtime) + flags |= PORTABLE_RUNTIME; + } + /* Detaching is authorized under the attach-images action string; a zero return means the check is still pending. */ + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + "org.freedesktop.portable1.attach-images", + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = portable_detach( + sd_bus_message_get_bus(message), + image->path, + extension_images, + flags, + &changes, + &n_changes, + error); + if (r < 0) + return r; + + return reply_portable_changes(message, changes, n_changes); +} + /* Shared implementation of image removal. If m is NULL, image must be set and the manager is taken from image->userdata. Refuses when m->n_operations has reached OPERATIONS_MAX or when the image state is not PORTABLE_DETACHED; otherwise image_remove() is run in a forked child. */ +int bus_image_common_remove( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + sd_bus_error *error) { + + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + _cleanup_(sigkill_waitp) pid_t child = 0; + PortableState state; + int r; + + assert(message); + assert(name_or_path || image); + + if (!m) { + assert(image); + m = image->userdata; + } + + if (m->n_operations >= OPERATIONS_MAX) + return sd_bus_error_set(error, SD_BUS_ERROR_LIMITS_EXCEEDED, "Too many ongoing operations."); + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_ALL, + "org.freedesktop.portable1.manage-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + /* Query the state with no extension images and flags 0. */ + r = portable_get_state( + sd_bus_message_get_bus(message), + image->path, + NULL, + 0, + &state, + error); + if (r < 0) + return r; + + if (state != PORTABLE_DETACHED) + return sd_bus_error_set_errnof(error, EBUSY, "Image '%s' is not detached, refusing.", image->path); + /* The child writes the negative return value of a failed image_remove() into this pipe. */ + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) + return sd_bus_error_set_errnof(error, errno, "Failed to create pipe: %m"); + + r = safe_fork("(sd-imgrm)", FORK_RESET_SIGNALS, &child); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to fork(): %m"); + if (r == 0) { + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + r = 
image_remove(image); + if (r < 0) { + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + r = operation_new(m, child, message, errno_pipe_fd[0], NULL); + if (r < 0) + return r; + + child = 0; + errno_pipe_fd[0] = -EBADF; + + return 1; +} + +static int bus_image_method_remove(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_remove(NULL, message, NULL, userdata, error); +} + +/* Given two PortableChange arrays, return a new array that has all elements of the first that are + * not also present in the second, comparing the basename of the path values. */ +static int normalize_portable_changes( + const PortableChange *changes_attached, + size_t n_changes_attached, + const PortableChange *changes_detached, + size_t n_changes_detached, + PortableChange **ret_changes, + size_t *ret_n_changes) { + + PortableChange *changes = NULL; + size_t n_changes = 0; + + assert(ret_n_changes); + assert(ret_changes); + + if (n_changes_detached == 0) + return 0; /* Nothing to do */ + + changes = new0(PortableChange, n_changes_attached + n_changes_detached); + if (!changes) + return -ENOMEM; + + CLEANUP_ARRAY(changes, n_changes, portable_changes_free); + + /* Corner case: only detached, nothing attached */ + if (n_changes_attached == 0) { + memcpy(changes, changes_detached, sizeof(PortableChange) * n_changes_detached); + *ret_changes = TAKE_PTR(changes); + *ret_n_changes = n_changes_detached; + return 0; + } + + for (size_t i = 0; i < n_changes_detached; ++i) { + bool found = false; + + for (size_t j = 0; j < n_changes_attached; ++j) + if (streq(basename(changes_detached[i].path), basename(changes_attached[j].path))) { + found = true; + break; + } + + if (!found) { + _cleanup_free_ char *path = NULL, *source = NULL; + + path = strdup(changes_detached[i].path); + if (!path) + return -ENOMEM; + + if (changes_detached[i].source) { + source = 
strdup(changes_detached[i].source); + if (!source) + return -ENOMEM; + } + + changes[n_changes++] = (PortableChange) { + .type_or_errno = changes_detached[i].type_or_errno, + .path = TAKE_PTR(path), + .source = TAKE_PTR(source), + }; + } + } + + *ret_n_changes = n_changes; + *ret_changes = TAKE_PTR(changes); + + return 0; +} + +int bus_image_common_reattach( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + sd_bus_error *error) { + + PortableChange *changes_detached = NULL, *changes_attached = NULL, *changes_gone = NULL; + size_t n_changes_detached = 0, n_changes_attached = 0, n_changes_gone = 0; + _cleanup_strv_free_ char **matches = NULL, **extension_images = NULL; + PortableFlags flags = PORTABLE_REATTACH; + const char *profile, *copy_mode; + int r; + + assert(message); + assert(name_or_path || image); + + CLEANUP_ARRAY(changes_detached, n_changes_detached, portable_changes_free); + CLEANUP_ARRAY(changes_attached, n_changes_attached, portable_changes_free); + CLEANUP_ARRAY(changes_gone, n_changes_gone, portable_changes_free); + + if (!m) { + assert(image); + m = image->userdata; + } + + if (sd_bus_message_is_method_call(message, NULL, "ReattachImageWithExtensions") || + sd_bus_message_is_method_call(message, NULL, "ReattachWithExtensions")) { + r = sd_bus_message_read_strv(message, &extension_images); + if (r < 0) + return r; + } + + r = sd_bus_message_read_strv(message, &matches); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &profile); + if (r < 0) + return r; + + if (sd_bus_message_is_method_call(message, NULL, "ReattachImageWithExtensions") || + sd_bus_message_is_method_call(message, NULL, "ReattachWithExtensions")) { + uint64_t input_flags = 0; + + r = sd_bus_message_read(message, "st", ©_mode, &input_flags); + if (r < 0) + return r; + + if ((input_flags & ~_PORTABLE_MASK_PUBLIC) != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, + "Invalid 'flags' parameter '%" PRIu64 "'", 
+ input_flags); + flags |= input_flags; + } else { + int runtime; + + r = sd_bus_message_read(message, "bs", &runtime, ©_mode); + if (r < 0) + return r; + + if (runtime) + flags |= PORTABLE_RUNTIME; + } + + if (streq(copy_mode, "symlink")) + flags |= PORTABLE_PREFER_SYMLINK; + else if (streq(copy_mode, "copy")) + flags |= PORTABLE_PREFER_COPY; + else if (!isempty(copy_mode)) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, "Unknown copy mode '%s'", copy_mode); + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_ALL, + "org.freedesktop.portable1.attach-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) /* Will call us back */ + return 1; + + r = portable_detach( + sd_bus_message_get_bus(message), + image->path, + extension_images, + flags, + &changes_detached, + &n_changes_detached, + error); + if (r < 0) + return r; + + r = portable_attach( + sd_bus_message_get_bus(message), + image->path, + matches, + profile, + extension_images, + /* image_policy= */ NULL, + flags, + &changes_attached, + &n_changes_attached, + error); + if (r < 0) + return r; + + /* We want to return the list of units really removed by the detach, + * and not added again by the attach */ + r = normalize_portable_changes(changes_attached, n_changes_attached, + changes_detached, n_changes_detached, + &changes_gone, &n_changes_gone); + if (r < 0) + return r; + + /* First, return the units that are gone (so that the caller can stop them) + * Then, return the units that are changed/added (so that the caller can + * start/restart/enable them) */ + return reply_portable_changes_pair(message, + changes_gone, n_changes_gone, + changes_attached, n_changes_attached); +} + +static int bus_image_method_reattach(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_reattach(NULL, message, NULL, userdata, error); +} + +int bus_image_common_mark_read_only( + Manager *m, + sd_bus_message *message, + 
const char *name_or_path, + Image *image, + sd_bus_error *error) { + /* Reads one boolean from the message, authenticates via bus_image_acquire() and hands the boolean to image_read_only(). Returns 1 while the polkit query is pending. */ + int r, read_only; + + assert(message); + assert(name_or_path || image); + /* Invoked on an Image object without a Manager: recover the Manager stored in the image */ + if (!m) { + assert(image); + m = image->userdata; + } + + r = sd_bus_message_read(message, "b", &read_only); + if (r < 0) + return r; + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_ALL, + "org.freedesktop.portable1.manage-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = image_read_only(image, read_only); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} +/* Image-object entry point: no Manager and no name, the Image arrives as userdata */ +static int bus_image_method_mark_read_only(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_mark_read_only(NULL, message, NULL, userdata, error); +} +/* Reads a 64-bit limit, rejects values failing FILE_SIZE_VALID_OR_INFINITY() before any authentication, then authenticates and applies it with image_set_limit(). Returns 1 while the polkit query is pending. */ +int bus_image_common_set_limit( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + sd_bus_error *error) { + + uint64_t limit; + int r; + + assert(message); + assert(name_or_path || image); + /* Invoked on an Image object without a Manager: recover the Manager stored in the image */ + if (!m) { + assert(image); + m = image->userdata; + } + + r = sd_bus_message_read(message, "t", &limit); + if (r < 0) + return r; + if (!FILE_SIZE_VALID_OR_INFINITY(limit)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "New limit out of range"); + + r = bus_image_acquire(m, + message, + name_or_path, + image, + BUS_IMAGE_AUTHENTICATE_ALL, + "org.freedesktop.portable1.manage-images", + &image, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Will call us back */ + + r = image_set_limit(image, limit); + if (r < 0) + return r; + + return sd_bus_reply_method_return(message, NULL); +} +/* Image-object entry point: no Manager and no name, the Image arrives as userdata */ +static int bus_image_method_set_limit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_image_common_set_limit(NULL, message, NULL, userdata, error); +} +/* Bus vtable for image objects: properties read straight from Image fields, followed by the methods */ +const sd_bus_vtable image_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("Name", "s", NULL, offsetof(Image, name), 0), + 
SD_BUS_PROPERTY("Path", "s", NULL, offsetof(Image, path), 0), + SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Image, type), 0), + SD_BUS_PROPERTY("ReadOnly", "b", bus_property_get_bool, offsetof(Image, read_only), 0), + SD_BUS_PROPERTY("CreationTimestamp", "t", NULL, offsetof(Image, crtime), 0), + SD_BUS_PROPERTY("ModificationTimestamp", "t", NULL, offsetof(Image, mtime), 0), + SD_BUS_PROPERTY("Usage", "t", NULL, offsetof(Image, usage), 0), + SD_BUS_PROPERTY("Limit", "t", NULL, offsetof(Image, limit), 0), + SD_BUS_PROPERTY("UsageExclusive", "t", NULL, offsetof(Image, usage_exclusive), 0), + SD_BUS_PROPERTY("LimitExclusive", "t", NULL, offsetof(Image, limit_exclusive), 0), + SD_BUS_METHOD_WITH_ARGS("GetOSRelease", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("a{ss}", os_release), + bus_image_method_get_os_release, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetMetadata", + SD_BUS_ARGS("as", matches), + SD_BUS_RESULT("s", image, + "ay", os_release, + "a{say}", units), + bus_image_method_get_metadata, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetMetadataWithExtensions", + SD_BUS_ARGS("as", extensions, + "as", matches, + "t", flags), + SD_BUS_RESULT("s", image, + "ay", os_release, + "a{say}", extensions, + "a{say}", units), + bus_image_method_get_metadata, /* same handler as GetMetadata */ + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetState", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("s", state), + bus_image_method_get_state, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetStateWithExtensions", + SD_BUS_ARGS("as", extensions, + "t", flags), + SD_BUS_RESULT("s", state), + bus_image_method_get_state, /* same handler as GetState */ + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Attach", + SD_BUS_ARGS("as", matches, + "s", profile, + "b", runtime, + "s", copy_mode), + SD_BUS_RESULT("a(sss)", changes), + bus_image_method_attach, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("AttachWithExtensions", + SD_BUS_ARGS("as", extensions, + "as", matches, + "s", profile, + 
"s", copy_mode, + "t", flags), + SD_BUS_RESULT("a(sss)", changes), + bus_image_method_attach, /* same handler as Attach */ + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Detach", + SD_BUS_ARGS("b", runtime), + SD_BUS_RESULT("a(sss)", changes), + bus_image_method_detach, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("DetachWithExtensions", + SD_BUS_ARGS("as", extensions, + "t", flags), + SD_BUS_RESULT("a(sss)", changes), + bus_image_method_detach, /* same handler as Detach */ + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Reattach", + SD_BUS_ARGS("as", matches, + "s", profile, + "b", runtime, + "s", copy_mode), + SD_BUS_RESULT("a(sss)", changes_removed, + "a(sss)", changes_updated), + bus_image_method_reattach, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ReattachWithExtensions", + SD_BUS_ARGS("as", extensions, + "as", matches, + "s", profile, + "s", copy_mode, + "t", flags), + SD_BUS_RESULT("a(sss)", changes_removed, + "a(sss)", changes_updated), + bus_image_method_reattach, /* same handler as Reattach */ + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Remove", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_image_method_remove, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("MarkReadOnly", + SD_BUS_ARGS("b", read_only), + SD_BUS_NO_RESULT, + bus_image_method_mark_read_only, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLimit", + SD_BUS_ARGS("t", limit), + SD_BUS_NO_RESULT, + bus_image_method_set_limit, + SD_BUS_VTABLE_UNPRIVILEGED), + /* Deprecated silly typo */ + SD_BUS_METHOD_WITH_ARGS("ReattacheWithExtensions", + SD_BUS_ARGS("as", extensions, + "as", matches, + "s", profile, + "s", copy_mode, + "t", flags), + SD_BUS_RESULT("a(sss)", changes_removed, + "a(sss)", changes_updated), + bus_image_method_reattach, /* NOTE(review): bus_image_common_reattach() selects the extended argument layout by comparing the method name against "ReattachWithExtensions"/"ReattachImageWithExtensions"; this misspelled alias matches neither - confirm it is parsed as intended */ + SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN), + SD_BUS_VTABLE_END +}; +/* Builds the bus object path for an image from its name; refuses (-EINVAL) images that are not discoverable */ +int bus_image_path(Image *image, char **ret) { + assert(image); + assert(ret); + + if (!image->discoverable) + return -EINVAL; + + return 
sd_bus_path_encode("/org/freedesktop/portable1/image", image->name, ret); +} +/* Return convention: 1 with *ret set to the image, 0 with *ret = NULL while an asynchronous polkit query is pending, negative on error. polkit_action may only be NULL in BUS_IMAGE_REFUSE_BY_PATH mode. */ +int bus_image_acquire( + Manager *m, + sd_bus_message *message, + const char *name_or_path, + Image *image, + ImageAcquireMode mode, + const char *polkit_action, + Image **ret, + sd_bus_error *error) { + + _cleanup_(image_unrefp) Image *loaded = NULL; + Image *cached; + int r; + + assert(m); + assert(message); + assert(name_or_path || image); + assert(mode >= 0); + assert(mode < _BUS_IMAGE_ACQUIRE_MODE_MAX); + assert(polkit_action || mode == BUS_IMAGE_REFUSE_BY_PATH); + assert(ret); + + /* Acquires an 'Image' object if not acquired yet, and enforces necessary authentication while doing so. */ + /* In AUTHENTICATE_ALL mode the polkit check runs first, before any lookup */ + if (mode == BUS_IMAGE_AUTHENTICATE_ALL) { + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + polkit_action, + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) { /* Will call us back */ + *ret = NULL; + return 0; + } + } + + /* Already passed in? */ + if (image) { + *ret = image; + return 1; + } + + /* Let's see if this image is already cached? 
*/ + cached = manager_image_cache_get(m, name_or_path); + if (cached) { + *ret = cached; + return 1; + } + /* Not cached: load by short name, or - if the mode permits - by absolute, normalized path */ + if (image_name_is_valid(name_or_path)) { + + /* If it's a short name, let's search for it */ + r = image_find(IMAGE_PORTABLE, name_or_path, NULL, &loaded); + if (r == -ENOENT) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PORTABLE_IMAGE, + "No image '%s' found.", name_or_path); + + /* other errors are handled below… */ + } else { + /* Don't accept path if this is always forbidden */ + if (mode == BUS_IMAGE_REFUSE_BY_PATH) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Expected image name, not path in place of '%s'.", name_or_path); + + if (!path_is_absolute(name_or_path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Image name '%s' is not valid or not a valid path.", name_or_path); + + if (!path_is_normalized(name_or_path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Image path '%s' is not normalized.", name_or_path); + /* BY_PATH mode authenticates only here, i.e. only for path references */ + if (mode == BUS_IMAGE_AUTHENTICATE_BY_PATH) { + r = bus_verify_polkit_async( + message, + CAP_SYS_ADMIN, + polkit_action, + NULL, + false, + UID_INVALID, + &m->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) { /* Will call us back */ + *ret = NULL; + return 0; + } + } + + r = image_from_path(name_or_path, &loaded); + } + if (r == -EMEDIUMTYPE) { /* attach a descriptive bus error, but still return the original code */ + sd_bus_error_setf(error, BUS_ERROR_BAD_PORTABLE_IMAGE_TYPE, + "Type of image '%s' not recognized; supported image types are directories/btrfs subvolumes, block devices, and raw disk image files with suffix '.raw'.", + name_or_path); + return r; + } + if (r < 0) + return r; + + /* Add what we just loaded to the cache. This has as side-effect that the object stays in memory until the + * cache is purged again, i.e. at least for the current event loop iteration, which is all we need, and which + * means we don't actually need to ref the return object. 
*/ + r = manager_image_cache_add(m, loaded); + if (r < 0) + return r; + + *ret = loaded; + return 1; +} +/* Object-find callback for the image fallback vtable: decodes the image name from the object path and resolves it by name only (paths are refused). Returns 1 with *found set, 0 when there is no such object, negative on other errors. */ +int bus_image_object_find( + sd_bus *bus, + const char *path, + const char *interface, + void *userdata, + void **found, + sd_bus_error *error) { + + _cleanup_free_ char *e = NULL; + Manager *m = userdata; + Image *image = NULL; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = sd_bus_path_decode(path, "/org/freedesktop/portable1/image", &e); + if (r < 0) + return 0; + if (r == 0) + goto not_found; + if (isempty(e)) + /* The path is "/org/freedesktop/portable1/image" itself */ + goto not_found; + + r = bus_image_acquire(m, sd_bus_get_current_message(bus), e, NULL, BUS_IMAGE_REFUSE_BY_PATH, NULL, &image, error); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return r; + + *found = image; + return 1; + +not_found: + *found = NULL; + return 0; +} +/* Node enumerator: collects the bus object paths of all images reported by manager_image_cache_discover() into a NULL-terminated array handed over through *nodes */ +int bus_image_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_hashmap_free_ Hashmap *images = NULL; + _cleanup_strv_free_ char **l = NULL; + Manager *m = userdata; + size_t n = 0; + Image *image; + int r; + + assert(bus); + assert(path); + assert(nodes); + + images = hashmap_new(&image_hash_ops); + if (!images) + return -ENOMEM; + + r = manager_image_cache_discover(m, images, error); + if (r < 0) + return r; + + HASHMAP_FOREACH(image, images) { + char *p; + + r = bus_image_path(image, &p); + if (r < 0) + return r; + /* n+2: room for the new entry plus the terminating NULL */ + if (!GREEDY_REALLOC(l, n+2)) { + free(p); + return -ENOMEM; + } + + l[n++] = p; + l[n] = NULL; + } + + *nodes = TAKE_PTR(l); + + return 1; +} +/* Registers image_vtable as a fallback vtable below the image path, with the find and enumerate callbacks defined here */ +const BusObjectImplementation image_object = { + "/org/freedesktop/portable1/image", + "org.freedesktop.portable1.Image", + .fallback_vtables = BUS_FALLBACK_VTABLES({image_vtable, bus_image_object_find}), + .node_enumerator = bus_image_node_enumerator, +}; diff --git a/src/portable/portabled-image-bus.h b/src/portable/portabled-image-bus.h new file mode 
100644 index 0000000..763a089 --- /dev/null +++ b/src/portable/portabled-image-bus.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "discover-image.h" +#include "portabled.h" +/* Shared method implementations: each takes the image either as name_or_path or as an already-resolved Image */ +int bus_image_common_get_os_release(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); +int bus_image_common_get_metadata(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); +int bus_image_common_attach(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); +int bus_image_common_remove(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); +int bus_image_common_reattach(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); +int bus_image_common_mark_read_only(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); +int bus_image_common_set_limit(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, sd_bus_error *error); + +extern const sd_bus_vtable image_vtable[]; +extern const BusObjectImplementation image_object; + +int bus_image_path(Image *image, char **ret); + +/* So here's some complexity: some operations can take either an image name, or a fully qualified file system path + * to an image. We need to authenticate differently when processing these two: images referenced via simple image names + * mean the images are located in the image search path and thus safe for limited read access for unprivileged + * clients. For operations on images located anywhere else we need explicit authentication however, so that + * unprivileged clients can't make us open arbitrary files in the file system. + * + * The "Image" bus objects directly represent images in the image search path, but do not exist for path-referenced + * images. 
Hence, when requesting a bus object we need to refuse references by file system path, but still allow + * references by image name. Depending on the operation to execute potentially we need to authenticate in all cases. */ + +typedef enum ImageAcquireMode { + BUS_IMAGE_REFUSE_BY_PATH, /* allow by name + prohibit by path */ + BUS_IMAGE_AUTHENTICATE_BY_PATH, /* allow by name + polkit by path */ + BUS_IMAGE_AUTHENTICATE_ALL, /* polkit by name + polkit by path */ + _BUS_IMAGE_ACQUIRE_MODE_MAX, + _BUS_IMAGE_ACQUIRE_MODE_INVALID = -EINVAL, +} ImageAcquireMode; +/* Returns 1 with *ret set, 0 with *ret = NULL while a polkit query is pending, negative on error */ +int bus_image_acquire(Manager *m, sd_bus_message *message, const char *name_or_path, Image *image, ImageAcquireMode mode, const char *polkit_action, Image **ret, sd_bus_error *error); + +int bus_image_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error); +int bus_image_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error); diff --git a/src/portable/portabled-image.c b/src/portable/portabled-image.c new file mode 100644 index 0000000..6d28391 --- /dev/null +++ b/src/portable/portabled-image.c @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "portable.h" +#include "portabled-image.h" +#include "portabled.h" +/* Looks an image up in the manager's cache by whichever key it was stored under (path or short name); NULL if absent */ +Image *manager_image_cache_get(Manager *m, const char *name_or_path) { + assert(m); + + return hashmap_get(m->image_cache, name_or_path); +} +/* Deferred event callback: empties the image cache */ +static int image_cache_flush(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + hashmap_clear(m->image_cache); + return 0; +} +/* Makes sure the cache hashmap and its idle-priority flush event source exist, then (re-)arms the flush as a one-shot */ +static int manager_image_cache_initialize(Manager *m) { + int r; + + assert(m); + + r = hashmap_ensure_allocated(&m->image_cache, &image_hash_ops); + if (r < 0) + return r; + + /* We flush the cache as soon as we are idle again */ + if (!m->image_cache_defer_event) { + r = sd_event_add_defer(m->event, &m->image_cache_defer_event, image_cache_flush, m); + if (r 
< 0) + return r; + + r = sd_event_source_set_priority(m->image_cache_defer_event, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return r; + } + + r = sd_event_source_set_enabled(m->image_cache_defer_event, SD_EVENT_ONESHOT); + if (r < 0) + return r; + + return 0; +} +/* Inserts the image into the cache and records the Manager in image->userdata; a reference is taken after each hashmap_put() that does not fail */ +int manager_image_cache_add(Manager *m, Image *image) { + int r; + + assert(m); + + /* We add the specified image to the cache under two keys. + * + * 1. Always under its path + * + * 2. If the image was discovered in the search path (i.e. its discoverable boolean set) we'll also add it + * under its short name. + */ + + r = manager_image_cache_initialize(m); + if (r < 0) + return r; + + image->userdata = m; + + r = hashmap_put(m->image_cache, image->path, image); + if (r < 0) + return r; + + image_ref(image); + + if (image->discoverable) { + r = hashmap_put(m->image_cache, image->name, image); + if (r < 0) + return r; + + image_ref(image); + } + + return 0; +} +/* NOTE(review): the body only calls image_discover(); the portable_discover_attached() mention in the comment inside looks stale - confirm */ +int manager_image_cache_discover(Manager *m, Hashmap *images, sd_bus_error *error) { + Image *image; + int r; + + assert(m); + + /* A wrapper around image_discover() (for finding images in search path) and portable_discover_attached() (for + * finding attached images). 
*/ + + r = image_discover(IMAGE_PORTABLE, NULL, images); + if (r < 0) + return r; + /* Caching is best-effort: failures to add an image are deliberately ignored */ + HASHMAP_FOREACH(image, images) + (void) manager_image_cache_add(m, image); + + return 0; +} diff --git a/src/portable/portabled-image.h b/src/portable/portabled-image.h new file mode 100644 index 0000000..753f389 --- /dev/null +++ b/src/portable/portabled-image.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "discover-image.h" +#include "hashmap.h" +#include "portabled.h" + +Image *manager_image_cache_get(Manager *m, const char *name_or_path); + +int manager_image_cache_add(Manager *m, Image *image); + +int manager_image_cache_discover(Manager *m, Hashmap *images, sd_bus_error *error); diff --git a/src/portable/portabled-operation.c b/src/portable/portabled-operation.c new file mode 100644 index 0000000..53f33e5 --- /dev/null +++ b/src/portable/portabled-operation.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "portabled-operation.h" +#include "process-util.h" +/* Child-exit callback registered by operation_new(): turns the child's exit status (and, on failure, the error code read from errno_fd) into a bus reply, then frees the operation. Always returns 0. */ +static int operation_done(sd_event_source *s, const siginfo_t *si, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + Operation *o = ASSERT_PTR(userdata); + int r; + + assert(si); + + log_debug("Operation " PID_FMT " is now complete with code=%s status=%i", + o->pid, + sigchld_code_to_string(si->si_code), si->si_status); + /* Clear the pid so that operation_free() does not try to kill the already-exited child */ + o->pid = 0; + + if (si->si_code != CLD_EXITED) { + r = sd_bus_error_set(&error, SD_BUS_ERROR_FAILED, "Child died abnormally."); + goto fail; + } + + if (si->si_status == EXIT_SUCCESS) + r = 0; + else if (read(o->errno_fd, &r, sizeof(r)) != sizeof(r)) { /* Try to acquire error code for failed operation */ + r = sd_bus_error_set(&error, SD_BUS_ERROR_FAILED, "Child failed."); + goto fail; + } + + if (o->done) { + /* A completion routine is set for this operation, call it. 
*/ + r = o->done(o, r, &error); + if (r < 0) { + if (!sd_bus_error_is_set(&error)) + sd_bus_error_set_errno(&error, r); + + goto fail; + } + + } else { + /* The default operation when done is to simply return an error on failure or an empty success + * message on success. */ + if (r < 0) { + sd_bus_error_set_errno(&error, r); + goto fail; + } + + r = sd_bus_reply_method_return(o->message, NULL); + if (r < 0) + log_error_errno(r, "Failed to reply to message: %m"); + } + + operation_free(o); + return 0; + +fail: + r = sd_bus_reply_method_error(o->message, &error); + if (r < 0) + log_error_errno(r, "Failed to reply to message: %m"); + + operation_free(o); + return 0; +} +/* Creates an Operation tracking the child process: watches it for exit, references the message, stores errno_fd and links the operation into manager->operations. On failure nothing is taken over; ret is optional. */ +int operation_new(Manager *manager, pid_t child, sd_bus_message *message, int errno_fd, Operation **ret) { + Operation *o; + int r; + + assert(manager); + assert(child > 1); + assert(message); + assert(errno_fd >= 0); + + o = new0(Operation, 1); + if (!o) + return -ENOMEM; + /* new0() zeroed the struct; mark the extra fd as unset so operation_free() does not close fd 0 */ + o->extra_fd = -EBADF; + + r = sd_event_add_child(manager->event, &o->event_source, child, WEXITED, operation_done, o); + if (r < 0) { + free(o); + return r; + } + + o->pid = child; + o->message = sd_bus_message_ref(message); + o->errno_fd = errno_fd; + + LIST_PREPEND(operations, manager->operations, o); + manager->n_operations++; + o->manager = manager; + + log_debug("Started new operation " PID_FMT ".", child); + + /* At this point we took ownership of both the child and the errno file descriptor! 
*/ + + if (ret) + *ret = o; + + return 0; +} +/* Releases an operation: drops the event source, closes both fds, calls sigkill_wait() on a child that is still recorded (pid > 1), unrefs the message and unlinks from the manager. Accepts NULL; always returns NULL. */ +Operation *operation_free(Operation *o) { + if (!o) + return NULL; + + sd_event_source_unref(o->event_source); + + safe_close(o->errno_fd); + safe_close(o->extra_fd); + + if (o->pid > 1) + (void) sigkill_wait(o->pid); + + sd_bus_message_unref(o->message); + + if (o->manager) { + LIST_REMOVE(operations, o->manager->operations, o); + o->manager->n_operations--; + } + + return mfree(o); +} diff --git a/src/portable/portabled-operation.h b/src/portable/portabled-operation.h new file mode 100644 index 0000000..f64740e --- /dev/null +++ b/src/portable/portabled-operation.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" +#include "sd-event.h" + +#include "list.h" + +typedef struct Operation Operation; + +#include "portabled.h" + +#define OPERATIONS_MAX 64 +/* One in-flight operation carried out by a child process on behalf of a bus method call */ +struct Operation { + Manager *manager; + pid_t pid; + sd_bus_message *message; + int errno_fd; + int extra_fd; + sd_event_source *event_source; + int (*done)(Operation *o, int ret, sd_bus_error *error); + LIST_FIELDS(Operation, operations); +}; + +int operation_new(Manager *manager, pid_t child, sd_bus_message *message, int errno_fd, Operation **ret); +Operation *operation_free(Operation *o); diff --git a/src/portable/portabled.c b/src/portable/portabled.c new file mode 100644 index 0000000..136c5fa --- /dev/null +++ b/src/portable/portabled.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-log-control-api.h" +#include "bus-polkit.h" +#include "common-signal.h" +#include "constants.h" +#include "daemon-util.h" +#include "main-func.h" +#include "portabled-bus.h" +#include "portabled-image-bus.h" +#include "portabled.h" +#include "process-util.h" +#include "service-util.h" +#include "signal-util.h" + +static Manager* manager_unref(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unref); + 
+static int manager_new(Manager **ret) { + _cleanup_(manager_unrefp) Manager *m = NULL; + int r; + /* Allocates the Manager, binds it to the default event loop and adds SIGINT, SIGTERM and SIGRTMIN+18 event sources; memory-pressure and watchdog setup are best-effort. On success ownership moves to *ret. */ + assert(ret); + + m = new0(Manager, 1); + if (!m) + return -ENOMEM; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + if (r < 0) + return r; + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + (void) sd_event_set_watchdog(m->event, true); + + *ret = TAKE_PTR(m); + return 0; +} +/* Destroys the Manager: frees the image cache and its flush event, the polkit registry, then the bus connection and event loop. Returns NULL. */ +static Manager* manager_unref(Manager *m) { + assert(m); + + hashmap_free(m->image_cache); + + sd_event_source_unref(m->image_cache_defer_event); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + + sd_bus_flush_close_unref(m->bus); + sd_event_unref(m->event); + + return mfree(m); +} +/* Connects to the system bus, registers the manager and log-control objects, requests the service name asynchronously and attaches the bus to the event loop */ +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = sd_bus_default_system(&m->bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.portable1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + (void) sd_bus_set_exit_on_disconnect(m->bus, true); + + return 0; +} +/* Start-up currently consists only of the bus connection */ +static int manager_startup(Manager *m) { + int r; + + assert(m); + + r = manager_connect_bus(m); + if (r < 0) + return r; + + return 0; +} + +static bool check_idle(void *userdata) { 
+ Manager *m = userdata; + /* Idle means: no operation is queued on the manager */ + return !m->operations; +} +/* Runs the event loop via bus_event_loop_with_idle(), passing DEFAULT_EXIT_USEC and check_idle() as the idle criteria */ +static int manager_run(Manager *m) { + assert(m); + + return bus_event_loop_with_idle( + m->event, + m->bus, + "org.freedesktop.portable1", + DEFAULT_EXIT_USEC, + check_idle, m); +} +/* Daemon entry point: parses arguments, blocks the signals handled through the event loop, creates and starts the Manager, notifies readiness and runs until manager_run() returns */ +static int run(int argc, char *argv[]) { + _cleanup_(manager_unrefp) Manager *m = NULL; + int r; + + log_setup(); + + r = service_parse_argv("systemd-portabled.service", + "Manage registrations of portable images.", + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object), + argc, argv); + if (r <= 0) + return r; + + umask(0022); + + if (argc != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments."); + /* Must happen before manager_new() adds event sources for these signals */ + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to allocate manager object: %m"); + + r = manager_startup(m); + if (r < 0) + return log_error_errno(r, "Failed to fully start up daemon: %m"); + + log_debug("systemd-portabled running as pid " PID_FMT, getpid_cached()); + r = sd_notify(false, NOTIFY_READY); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + + r = manager_run(m); + + (void) sd_notify(false, NOTIFY_STOPPING); + log_debug("systemd-portabled stopped as pid " PID_FMT, getpid_cached()); + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/portable/portabled.h b/src/portable/portabled.h new file mode 100644 index 0000000..71ec41d --- /dev/null +++ b/src/portable/portabled.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-event.h" + +#include "bus-object.h" +#include "hashmap.h" +#include "list.h" + +typedef struct Manager Manager; + +#include "portabled-operation.h" +/* Daemon-wide state: event loop, bus connection, polkit registry, image cache (with its flush event) and the list of pending operations */ +struct Manager { + sd_event *event; + sd_bus *bus; + + Hashmap *polkit_registry; + + Hashmap *image_cache; + sd_event_source *image_cache_defer_event; + + LIST_HEAD(Operation, 
operations); + unsigned n_operations; +}; + +extern const BusObjectImplementation manager_object; diff --git a/src/portable/profile/default/service.conf b/src/portable/profile/default/service.conf new file mode 100644 index 0000000..230aa60 --- /dev/null +++ b/src/portable/profile/default/service.conf @@ -0,0 +1,30 @@ +# The "default" security profile for services, i.e. a number of useful restrictions + +[Service] +MountAPIVFS=yes +BindReadOnlyPaths=/dev/log /run/systemd/journal/socket /run/systemd/journal/stdout +BindReadOnlyPaths=/etc/machine-id +BindReadOnlyPaths=/etc/resolv.conf +BindReadOnlyPaths=/run/dbus/system_bus_socket +DynamicUser=yes +RemoveIPC=yes +CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_FOWNER \ + CAP_FSETID CAP_IPC_LOCK CAP_IPC_OWNER CAP_KILL CAP_MKNOD CAP_NET_ADMIN \ + CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_SETGID CAP_SETPCAP \ + CAP_SETUID CAP_SYS_ADMIN CAP_SYS_CHROOT CAP_SYS_NICE CAP_SYS_RESOURCE +PrivateTmp=yes +PrivateDevices=yes +PrivateUsers=yes +ProtectSystem=strict +ProtectHome=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 +LockPersonality=yes +MemoryDenyWriteExecute=yes +RestrictRealtime=yes +RestrictNamespaces=yes +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM +SystemCallArchitectures=native diff --git a/src/portable/profile/nonetwork/service.conf b/src/portable/profile/nonetwork/service.conf new file mode 100644 index 0000000..cd7f75c --- /dev/null +++ b/src/portable/profile/nonetwork/service.conf @@ -0,0 +1,30 @@ +# The "nonetwork" security profile for services, i.e. 
like "default" but without networking + +[Service] +MountAPIVFS=yes +BindReadOnlyPaths=/dev/log /run/systemd/journal/socket /run/systemd/journal/stdout +BindReadOnlyPaths=/etc/machine-id +BindReadOnlyPaths=/run/dbus/system_bus_socket +DynamicUser=yes +RemoveIPC=yes +CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_FOWNER \ + CAP_FSETID CAP_IPC_LOCK CAP_IPC_OWNER CAP_KILL CAP_MKNOD CAP_SETGID CAP_SETPCAP \ + CAP_SETUID CAP_SYS_ADMIN CAP_SYS_CHROOT CAP_SYS_NICE CAP_SYS_RESOURCE +PrivateTmp=yes +PrivateDevices=yes +PrivateUsers=yes +ProtectSystem=strict +ProtectHome=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +RestrictAddressFamilies=AF_UNIX AF_NETLINK +LockPersonality=yes +MemoryDenyWriteExecute=yes +RestrictRealtime=yes +RestrictNamespaces=yes +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM +SystemCallArchitectures=native +PrivateNetwork=yes +IPAddressDeny=any diff --git a/src/portable/profile/strict/service.conf b/src/portable/profile/strict/service.conf new file mode 100644 index 0000000..f924e10 --- /dev/null +++ b/src/portable/profile/strict/service.conf @@ -0,0 +1,29 @@ +# The "strict" security profile for services, all options turned on + +[Service] +MountAPIVFS=yes +BindReadOnlyPaths=/dev/log /run/systemd/journal/socket /run/systemd/journal/stdout +BindReadOnlyPaths=/etc/machine-id +DynamicUser=yes +RemoveIPC=yes +CapabilityBoundingSet= +PrivateTmp=yes +PrivateDevices=yes +PrivateUsers=yes +ProtectSystem=strict +ProtectHome=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +RestrictAddressFamilies=AF_UNIX +LockPersonality=yes +NoNewPrivileges=yes +MemoryDenyWriteExecute=yes +RestrictRealtime=yes +RestrictNamespaces=yes +SystemCallFilter=@system-service +SystemCallErrorNumber=EPERM +SystemCallArchitectures=native +PrivateNetwork=yes +IPAddressDeny=any +TasksMax=4 diff --git a/src/portable/profile/trusted/service.conf 
b/src/portable/profile/trusted/service.conf new file mode 100644 index 0000000..04deeb2 --- /dev/null +++ b/src/portable/profile/trusted/service.conf @@ -0,0 +1,8 @@ +# The "trusted" profile for services, i.e. no restrictions are applied apart from a private /tmp + +[Service] +MountAPIVFS=yes +PrivateTmp=yes +BindPaths=/run +BindReadOnlyPaths=/etc/machine-id +BindReadOnlyPaths=/etc/resolv.conf diff --git a/src/pstore/meson.build b/src/pstore/meson.build new file mode 100644 index 0000000..b6fda87 --- /dev/null +++ b/src/pstore/meson.build @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-pstore', + 'conditions' : ['ENABLE_PSTORE'], + 'sources' : files('pstore.c'), + 'dependencies' : [ + libacl, + liblz4, + libxz, + libzstd, + threads, + ], + }, +] + +if conf.get('ENABLE_PSTORE') == 1 and install_sysconfdir_samples + install_data('pstore.conf', + install_dir : pkgconfigfiledir) +endif diff --git a/src/pstore/pstore.c b/src/pstore/pstore.c new file mode 100644 index 0000000..8f32a0a --- /dev/null +++ b/src/pstore/pstore.c @@ -0,0 +1,367 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* Copyright © 2019 Oracle and/or its affiliates. */ + +/* Generally speaking, the pstore contains a small number of files + * that in turn contain a small amount of data. 
*/ +#include +#include +#include +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-journal.h" +#include "sd-login.h" +#include "sd-messages.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "capability-util.h" +#include "cgroup-util.h" +#include "compress.h" +#include "conf-parser.h" +#include "copy.h" +#include "dirent-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "journal-importer.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "mkdir.h" +#include "parse-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "special.h" +#include "sort-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "user-util.h" + +/* Command line argument handling */ +typedef enum PStoreStorage { + PSTORE_STORAGE_NONE, + PSTORE_STORAGE_EXTERNAL, + PSTORE_STORAGE_JOURNAL, + _PSTORE_STORAGE_MAX, + _PSTORE_STORAGE_INVALID = -EINVAL, +} PStoreStorage; +/* Names accepted for the Storage= setting, indexed by enum value */ +static const char* const pstore_storage_table[_PSTORE_STORAGE_MAX] = { + [PSTORE_STORAGE_NONE] = "none", + [PSTORE_STORAGE_EXTERNAL] = "external", + [PSTORE_STORAGE_JOURNAL] = "journal", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(pstore_storage, PStoreStorage); +static DEFINE_CONFIG_PARSE_ENUM(config_parse_pstore_storage, pstore_storage, PStoreStorage, "Failed to parse storage setting"); +/* Built-in defaults; Storage= and Unlink= in pstore.conf are parsed into arg_storage and arg_unlink */ +static PStoreStorage arg_storage = PSTORE_STORAGE_EXTERNAL; + +static bool arg_unlink = true; +static const char *arg_sourcedir = "/sys/fs/pstore"; +static const char *arg_archivedir = "/var/lib/systemd/pstore"; +/* Parses the [PStore] section of pstore.conf into the variables above */ +static int parse_config(void) { + static const ConfigTableItem items[] = { + { "PStore", "Unlink", config_parse_bool, 0, &arg_unlink }, + { "PStore", "Storage", config_parse_pstore_storage, 0, &arg_storage }, + {} + }; + + return config_parse_config_file("pstore.conf", "PStore\0", + 
config_item_table_lookup, items, + CONFIG_PARSE_WARN, NULL); +} + +/* File list handling - PStoreEntry is the struct and + * and PStoreEntry is the type that contains all info + * about a pstore entry. */ +typedef struct PStoreEntry { + struct dirent dirent; + bool is_binary; + bool handled; + char *content; + size_t content_size; +} PStoreEntry; + +typedef struct PStoreList { + PStoreEntry *entries; + size_t n_entries; +} PStoreList; + +static void pstore_entries_reset(PStoreList *list) { + for (size_t i = 0; i < list->n_entries; i++) + free(list->entries[i].content); + free(list->entries); + list->n_entries = 0; +} + +static int compare_pstore_entries(const PStoreEntry *a, const PStoreEntry *b) { + return strcmp(a->dirent.d_name, b->dirent.d_name); +} + +static int move_file(PStoreEntry *pe, const char *subdir1, const char *subdir2) { + _cleanup_free_ char *ifd_path = NULL, *ofd_path = NULL; + _cleanup_free_ void *field = NULL; + const char *suffix, *message; + struct iovec iovec[2]; + int n_iovec = 0, r; + + if (pe->handled) + return 0; + + ifd_path = path_join(arg_sourcedir, pe->dirent.d_name); + if (!ifd_path) + return log_oom(); + + ofd_path = path_join(arg_archivedir, subdir1, subdir2, pe->dirent.d_name); + if (!ofd_path) + return log_oom(); + + /* Always log to the journal */ + suffix = arg_storage == PSTORE_STORAGE_EXTERNAL ? 
strjoina(" moved to ", ofd_path) : (char *)"."; + message = strjoina("MESSAGE=PStore ", pe->dirent.d_name, suffix); + iovec[n_iovec++] = IOVEC_MAKE_STRING(message); + + if (pe->content_size > 0) { + size_t field_size; + + field_size = strlen("FILE=") + pe->content_size; + field = malloc(field_size); + if (!field) + return log_oom(); + memcpy(stpcpy(field, "FILE="), pe->content, pe->content_size); + iovec[n_iovec++] = IOVEC_MAKE(field, field_size); + } + + r = sd_journal_sendv(iovec, n_iovec); + if (r < 0) + return log_error_errno(r, "Failed to log pstore entry: %m"); + + if (arg_storage == PSTORE_STORAGE_EXTERNAL) { + /* Move file from pstore to external storage */ + r = mkdir_parents(ofd_path, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create directory %s: %m", ofd_path); + r = copy_file_atomic(ifd_path, ofd_path, 0600, COPY_REPLACE); + if (r < 0) + return log_error_errno(r, "Failed to copy_file_atomic: %s to %s", ifd_path, ofd_path); + } + + /* If file copied properly, remove it from pstore */ + if (arg_unlink) + (void) unlink(ifd_path); + + pe->handled = true; + + return 0; +} + +static int append_dmesg(PStoreEntry *pe, const char *subdir1, const char *subdir2) { + /* Append dmesg chunk to end, create if needed */ + _cleanup_free_ char *ofd_path = NULL; + _cleanup_close_ int ofd = -EBADF; + ssize_t wr; + + assert(pe); + + if (arg_storage != PSTORE_STORAGE_EXTERNAL) + return 0; + + if (pe->content_size == 0) + return 0; + + ofd_path = path_join(arg_archivedir, subdir1, subdir2, "dmesg.txt"); + if (!ofd_path) + return log_oom(); + + ofd = open(ofd_path, O_CREAT|O_NOFOLLOW|O_NOCTTY|O_CLOEXEC|O_APPEND|O_WRONLY, 0640); + if (ofd < 0) + return log_error_errno(ofd, "Failed to open file %s: %m", ofd_path); + wr = write(ofd, pe->content, pe->content_size); + if (wr < 0) + return log_error_errno(errno, "Failed to store dmesg to %s: %m", ofd_path); + if ((size_t)wr != pe->content_size) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to store dmesg to 
%s. %zu bytes are lost.", ofd_path, pe->content_size - wr); + + return 0; +} + +static int process_dmesg_files(PStoreList *list) { + /* Move files, reconstruct dmesg.txt */ + _cleanup_free_ char *erst_subdir = NULL; + unsigned long long last_record_id = 0; + + /* When dmesg is written into pstore, it is done so in small chunks, whatever the exchange buffer + * size is with the underlying pstore backend (ie. EFI may be ~2KiB), which means an example + * pstore with approximately 64KB of storage may have up to roughly 32 dmesg files, some likely + * related. + * + * Here we look at the dmesg filename and try to discern if files are part of a related group, + * meaning the same original dmesg. + * + * The dmesg- filename contains the backend-type and the Common Platform Error Record, CPER, + * record id, a 64-bit number. + * + * Files are processed in reverse lexigraphical order so as to properly reconstruct original dmesg. */ + + for (size_t n = list->n_entries; n > 0; n--) { + PStoreEntry *pe; + char *p; + + pe = &list->entries[n-1]; + + if (pe->handled) + continue; + if (endswith(pe->dirent.d_name, ".enc.z")) /* indicates a problem */ + continue; + if (!startswith(pe->dirent.d_name, "dmesg-")) + continue; + + /* The linux kernel changed the prefix from dmesg-efi- to dmesg-efi_pstore- + * so now we have to handle both cases. */ + if ((p = STARTSWITH_SET(pe->dirent.d_name, "dmesg-efi-", "dmesg-efi_pstore-"))) { + /* For the EFI backend, the 3 least significant digits of record id encodes a + * "count" number, the next 2 least significant digits for the dmesg part + * (chunk) number, and the remaining digits as the timestamp. See + * linux/drivers/firmware/efi/efi-pstore.c in efi_pstore_write(). 
*/ + _cleanup_free_ char *subdir1 = NULL, *subdir2 = NULL; + size_t plen = strlen(p); + + if (plen < 6) + continue; + + /* Extract base record id */ + subdir1 = strndup(p, plen - 5); + if (!subdir1) + return log_oom(); + /* Extract "count" field */ + subdir2 = strndup(p + plen - 3, 3); + if (!subdir2) + return log_oom(); + + /* Now move file from pstore to archive storage */ + (void) move_file(pe, subdir1, subdir2); + + /* Append to the dmesg */ + (void) append_dmesg(pe, subdir1, subdir2); + } else if ((p = startswith(pe->dirent.d_name, "dmesg-erst-"))) { + /* For the ERST backend, the record is a monotonically increasing number, seeded as + * a timestamp. See linux/drivers/acpi/apei/erst.c in erst_writer(). */ + unsigned long long record_id; + + if (safe_atollu_full(p, 10, &record_id) < 0) + continue; + if (last_record_id - 1 != record_id) + /* A discontinuity in the number has been detected, this current record id + * will become the directory name for all pieces of the dmesg in this + * series. 
*/ + if (free_and_strdup(&erst_subdir, p) < 0) + return log_oom(); + + /* Now move file from pstore to archive storage */ + (void) move_file(pe, erst_subdir, NULL); + + /* Append to the dmesg */ + (void) append_dmesg(pe, erst_subdir, NULL); + + /* Update, but keep erst_subdir for next file */ + last_record_id = record_id; + } else + log_debug("Unknown backend, ignoring \"%s\".", pe->dirent.d_name); + } + return 0; +} + +static int list_files(PStoreList *list, const char *sourcepath) { + _cleanup_closedir_ DIR *dirp = NULL; + int r; + + dirp = opendir(sourcepath); + if (!dirp) + return log_error_errno(errno, "Failed to opendir %s: %m", sourcepath); + + FOREACH_DIRENT(de, dirp, return log_error_errno(errno, "Failed to iterate through %s: %m", sourcepath)) { + _cleanup_free_ char *ifd_path = NULL; + + ifd_path = path_join(sourcepath, de->d_name); + if (!ifd_path) + return log_oom(); + + _cleanup_free_ char *buf = NULL; + size_t buf_size; + + /* Now read contents of pstore file */ + r = read_full_virtual_file(ifd_path, &buf, &buf_size); + if (r < 0) { + log_warning_errno(r, "Failed to read file %s, skipping: %m", ifd_path); + continue; + } + + if (!GREEDY_REALLOC(list->entries, list->n_entries + 1)) + return log_oom(); + + list->entries[list->n_entries++] = (PStoreEntry) { + .dirent = *de, + .content = TAKE_PTR(buf), + .content_size = buf_size, + .is_binary = true, + .handled = false, + }; + } + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(pstore_entries_reset) PStoreList list = {}; + int r; + + log_setup(); + + if (argc == 3) { + arg_sourcedir = argv[1]; + arg_archivedir = argv[2]; + } else if (argc > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program takes zero or two arguments."); + + /* Ignore all parse errors */ + (void) parse_config(); + + log_debug("Selected storage: %s.", pstore_storage_to_string(arg_storage)); + log_debug("Selected unlink: %s.", yes_no(arg_unlink)); + + if (arg_storage == PSTORE_STORAGE_NONE) + /* Do 
nothing, intentionally, leaving pstore untouched */ + return 0; + + /* Obtain list of files in pstore */ + r = list_files(&list, arg_sourcedir); + if (r < 0) + return r; + + /* Handle each pstore file */ + /* Sort files lexicographically ascending, generally needed by all */ + typesafe_qsort(list.entries, list.n_entries, compare_pstore_entries); + + /* Process known file types */ + (void) process_dmesg_files(&list); + + /* Move left over files out of pstore */ + for (size_t n = 0; n < list.n_entries; n++) + (void) move_file(&list.entries[n], NULL, NULL); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/pstore/pstore.conf b/src/pstore/pstore.conf new file mode 100644 index 0000000..22b18b5 --- /dev/null +++ b/src/pstore/pstore.conf @@ -0,0 +1,21 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/pstore.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/pstore.conf' to display the full config. +# +# See pstore.conf(5) for details. 
+
+[PStore]
+#Storage=external
+#Unlink=yes
diff --git a/src/quotacheck/meson.build b/src/quotacheck/meson.build
new file mode 100644
index 0000000..0a3c3e3
--- /dev/null
+++ b/src/quotacheck/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-quotacheck',
+                'conditions' : ['ENABLE_QUOTACHECK'],
+                'sources' : files('quotacheck.c'),
+        },
+]
diff --git a/src/quotacheck/quotacheck.c b/src/quotacheck/quotacheck.c
new file mode 100644
index 0000000..27a914d
--- /dev/null
+++ b/src/quotacheck/quotacheck.c
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* NOTE(review): the seven system header names below were missing from this copy of the file (only bare
+ * "#include" tokens were left). They are restored on the assumption that they match the upstream
+ * source - confirm before relying on them. */
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "main-func.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "signal-util.h"
+#include "string-util.h"
+
+/* Set from the kernel command line (and, with SysV compatibility, from a flag file). run() checks
+ * arg_force first, so a forced check takes precedence over a requested skip. */
+static bool arg_skip = false;
+static bool arg_force = false;
+
+/* Callback handed to proc_cmdline_parse(): evaluates quotacheck.mode=auto|force|skip and, if built with
+ * HAVE_SYSV_COMPAT, the legacy value-less "forcequotacheck" switch. Unknown keys are ignored and
+ * invalid mode values only trigger a warning. Always returns 0; the data argument is unused. */
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+
+        if (streq(key, "quotacheck.mode")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "auto"))
+                        arg_force = arg_skip = false;
+                else if (streq(value, "force"))
+                        arg_force = true;
+                else if (streq(value, "skip"))
+                        arg_skip = true;
+                else
+                        log_warning("Invalid quotacheck.mode= parameter '%s'. Ignoring.", value);
+        }
+
+#if HAVE_SYSV_COMPAT
+        else if (streq(key, "forcequotacheck") && !value) {
+                log_warning("Please use 'quotacheck.mode=force' rather than 'forcequotacheck' on the kernel command line.");
+                arg_force = true;
+        }
+#endif
+
+        return 0;
+}
+
+/* With HAVE_SYSV_COMPAT, treats the existence of /forcequotacheck as a request to force the check (and
+ * logs a hint to use the kernel command line instead). Without it this function does nothing. */
+static void test_files(void) {
+
+#if HAVE_SYSV_COMPAT
+        if (access("/forcequotacheck", F_OK) >= 0) {
+                log_error("Please pass 'quotacheck.mode=force' on the kernel command line rather than creating /forcequotacheck on the root file system.");
+                arg_force = true;
+        }
+#endif
+}
+
+/* Entry point: takes no arguments. Unless a check is forced, nothing is done when it was explicitly
+ * skipped or when /run/systemd/quotacheck does not exist. Otherwise QUOTACHECK is executed with "-anug"
+ * in a forked child; FORK_WAIT is passed to safe_fork(), so the parent presumably waits for the child
+ * before returning. Returns 0 on success or a negative error value. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        log_setup();
+
+        if (argc > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "This program takes no arguments.");
+
+        umask(0022);
+
+        r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+
+        test_files();
+
+        if (!arg_force) {
+                if (arg_skip)
+                        return 0;
+
+                if (access("/run/systemd/quotacheck", F_OK) < 0)
+                        return 0;
+        }
+
+        r = safe_fork("(quotacheck)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_WAIT|FORK_LOG, NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                static const char * const cmdline[] = {
+                        QUOTACHECK,
+                        "-anug",
+                        NULL
+                };
+
+                /* Child */
+
+                execv(cmdline[0], (char**) cmdline);
+
+                /* execv() only returns on failure; say why before giving up, otherwise the reason is
+                 * lost and only the exit status is visible to the parent. */
+                log_error_errno(errno, "Failed to execute %s: %m", cmdline[0]);
+                _exit(EXIT_FAILURE); /* Operational error */
+        }
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/random-seed/meson.build b/src/random-seed/meson.build
new file mode 100644
index 0000000..daa2eef
--- /dev/null
+++ b/src/random-seed/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-random-seed',
+                'conditions' : ['ENABLE_RANDOMSEED'],
+                'sources' : files('random-seed.c'),
+        },
+]
diff --git a/src/random-seed/random-seed.c b/src/random-seed/random-seed.c
new file mode 100644
index 0000000..bad18ad
--- /dev/null
+++ b/src/random-seed/random-seed.c
@@ -0,0 +1,457
@@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#if USE_SYS_RANDOM_H +# include +#endif +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "build.h" +#include "fd-util.h" +#include "fs-util.h" +#include "io-util.h" +#include "log.h" +#include "main-func.h" +#include "missing_random.h" +#include "missing_syscall.h" +#include "mkdir.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "random-util.h" +#include "string-table.h" +#include "string-util.h" +#include "sync-util.h" +#include "sha256.h" +#include "xattr-util.h" + +typedef enum SeedAction { + ACTION_LOAD, + ACTION_SAVE, + _ACTION_MAX, + _ACTION_INVALID = -EINVAL, +} SeedAction; + +typedef enum CreditEntropy { + CREDIT_ENTROPY_NO_WAY, + CREDIT_ENTROPY_YES_PLEASE, + CREDIT_ENTROPY_YES_FORCED, +} CreditEntropy; + +static SeedAction arg_action = _ACTION_INVALID; + +static CreditEntropy may_credit(int seed_fd) { + const char *e; + int r; + + assert(seed_fd >= 0); + + e = getenv("SYSTEMD_RANDOM_SEED_CREDIT"); + if (!e) { + log_debug("$SYSTEMD_RANDOM_SEED_CREDIT is not set, not crediting entropy."); + return CREDIT_ENTROPY_NO_WAY; + } + if (streq(e, "force")) { + log_debug("$SYSTEMD_RANDOM_SEED_CREDIT is set to 'force', crediting entropy."); + return CREDIT_ENTROPY_YES_FORCED; + } + + r = parse_boolean(e); + if (r <= 0) { + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_RANDOM_SEED_CREDIT, not crediting entropy: %m"); + else + log_debug("Crediting entropy is turned off via $SYSTEMD_RANDOM_SEED_CREDIT, not crediting entropy."); + + return CREDIT_ENTROPY_NO_WAY; + } + + /* Determine if the file is marked as creditable */ + r = getxattr_at_bool(seed_fd, /* path= */ NULL, "user.random-seed-creditable", /* flags= */ 0); + if (r < 0) { + if (ERRNO_IS_XATTR_ABSENT(r)) + log_debug_errno(r, "Seed file is not marked as creditable, not crediting."); + else + log_warning_errno(r, "Failed to read extended 
attribute, ignoring: %m"); + + return CREDIT_ENTROPY_NO_WAY; + } + if (r == 0) { + log_debug("Seed file is marked as not creditable, not crediting."); + return CREDIT_ENTROPY_NO_WAY; + } + + /* Don't credit the random seed if we are in first-boot mode, because we are supposed to start from + * scratch. This is a safety precaution for cases where people ship "golden" images with empty + * /etc but populated /var that contains a random seed. */ + r = RET_NERRNO(access("/run/systemd/first-boot", F_OK)); + if (r == -ENOENT) + /* All is good, we are not in first-boot mode. */ + return CREDIT_ENTROPY_YES_PLEASE; + if (r < 0) { + log_warning_errno(r, "Failed to check whether we are in first-boot mode, not crediting entropy: %m"); + return CREDIT_ENTROPY_NO_WAY; + } + + log_debug("Not crediting entropy, since booted in first-boot mode."); + return CREDIT_ENTROPY_NO_WAY; +} + +static int random_seed_size(int seed_fd, size_t *ret_size) { + struct stat st; + + assert(ret_size); + assert(seed_fd >= 0); + + if (fstat(seed_fd, &st) < 0) + return log_error_errno(errno, "Failed to stat() seed file " RANDOM_SEED ": %m"); + + /* If the seed file is larger than what the kernel expects, then honour the existing size and + * save/restore as much as it says */ + + *ret_size = CLAMP((uint64_t)st.st_size, random_pool_size(), RANDOM_POOL_SIZE_MAX); + return 0; +} + +static void load_machine_id(int urandom_fd) { + sd_id128_t mid; + int r; + + assert(urandom_fd >= 0); + + /* As an extra protection against "golden images" that are put together sloppily, i.e. images which + * are duplicated on multiple systems but where the random seed file is not properly + * reset. Frequently the machine ID is properly reset on those systems however (simply because it's + * easier to notice, if it isn't due to address clashes and so on, while random seed equivalence is + * generally not noticed easily), hence let's simply write the machine ID into the random pool + * too. 
*/ + r = sd_id128_get_machine(&mid); + if (r < 0) + return (void) log_debug_errno(r, "Failed to get machine ID, ignoring: %m"); + + r = random_write_entropy(urandom_fd, &mid, sizeof(mid), /* credit= */ false); + if (r < 0) + log_debug_errno(r, "Failed to write machine ID to /dev/urandom, ignoring: %m"); +} + +static int load_seed_file( + int seed_fd, + int urandom_fd, + size_t seed_size, + struct sha256_ctx **ret_hash_state) { + + _cleanup_free_ void *buf = NULL; + CreditEntropy lets_credit; + ssize_t k; + int r; + + assert(seed_fd >= 0); + assert(urandom_fd >= 0); + + buf = malloc(seed_size); + if (!buf) + return log_oom(); + + k = loop_read(seed_fd, buf, seed_size, false); + if (k < 0) { + log_warning_errno(k, "Failed to read seed from " RANDOM_SEED ": %m"); + return 0; + } + if (k == 0) { + log_debug("Seed file " RANDOM_SEED " not yet initialized, proceeding."); + return 0; + } + + /* If we're going to later write out a seed file, initialize a hash state with the contents of the + * seed file we just read, so that the new one can't regress in entropy. */ + if (ret_hash_state) { + struct sha256_ctx *hash_state; + + hash_state = new(struct sha256_ctx, 1); + if (!hash_state) + return log_oom(); + + sha256_init_ctx(hash_state); + sha256_process_bytes_and_size(buf, k, hash_state); /* Hash with length to distinguish from new seed. */ + + *ret_hash_state = hash_state; + } + + (void) lseek(seed_fd, 0, SEEK_SET); + + lets_credit = may_credit(seed_fd); + + /* Before we credit or use the entropy, let's make sure to securely drop the creditable xattr from + * the file, so that we never credit the same random seed again. Note that further down we'll write a + * new seed again, and likely mark it as creditable again, hence this is just paranoia to close the + * short time window between the time we upload the random seed into the kernel and download the new + * one from it. 
*/ + + if (fremovexattr(seed_fd, "user.random-seed-creditable") < 0) { + if (!ERRNO_IS_XATTR_ABSENT(errno)) + log_warning_errno(errno, "Failed to remove extended attribute, ignoring: %m"); + + /* Otherwise, there was no creditable flag set, which is OK. */ + } else { + r = fsync_full(seed_fd); + if (r < 0) { + log_warning_errno(r, "Failed to synchronize seed to disk, not crediting entropy: %m"); + + if (lets_credit == CREDIT_ENTROPY_YES_PLEASE) + lets_credit = CREDIT_ENTROPY_NO_WAY; + } + } + + r = random_write_entropy(urandom_fd, buf, k, + IN_SET(lets_credit, CREDIT_ENTROPY_YES_PLEASE, CREDIT_ENTROPY_YES_FORCED)); + if (r < 0) + log_warning_errno(r, "Failed to write seed to /dev/urandom: %m"); + + return 0; +} + +static int save_seed_file( + int seed_fd, + int urandom_fd, + size_t seed_size, + bool synchronous, + struct sha256_ctx *hash_state) { + + _cleanup_free_ void *buf = NULL; + bool getrandom_worked = false; + ssize_t k, l; + int r; + + assert(seed_fd >= 0); + assert(urandom_fd >= 0); + + /* This is just a safety measure. Given that we are root and most likely created the file ourselves + * the mode and owner should be correct anyway. */ + r = fchmod_and_chown(seed_fd, 0600, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to adjust seed file ownership and access mode: %m"); + + buf = malloc(seed_size); + if (!buf) + return log_oom(); + + k = getrandom(buf, seed_size, GRND_NONBLOCK); + if (k < 0 && errno == EAGAIN && synchronous) { + /* If we're asked to make ourselves a barrier for proper initialization of the random pool + * make this whole job synchronous by asking getrandom() to wait until the requested number + * of random bytes is available. 
*/ + log_notice("Kernel entropy pool is not initialized yet, waiting until it is."); + k = getrandom(buf, seed_size, 0); + } + if (k < 0) + log_debug_errno(errno, "Failed to read random data with getrandom(), falling back to /dev/urandom: %m"); + else if ((size_t) k < seed_size) + log_debug("Short read from getrandom(), falling back to /dev/urandom."); + else + getrandom_worked = true; + + if (!getrandom_worked) { + /* Retry with classic /dev/urandom */ + k = loop_read(urandom_fd, buf, seed_size, false); + if (k < 0) + return log_error_errno(k, "Failed to read new seed from /dev/urandom: %m"); + if (k == 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Got EOF while reading from /dev/urandom."); + } + + /* If we previously read in a seed file, then hash the new seed into the old one, and replace the + * last 32 bytes of the seed with the hash output, so that the new seed file can't regress in + * entropy. */ + if (hash_state) { + uint8_t hash[SHA256_DIGEST_SIZE]; + + sha256_process_bytes_and_size(buf, k, hash_state); /* Hash with length to distinguish from old seed. */ + sha256_finish_ctx(hash_state, hash); + l = MIN((size_t)k, sizeof(hash)); + memcpy((uint8_t *)buf + k - l, hash, l); + } + + r = loop_write(seed_fd, buf, (size_t) k); + if (r < 0) + return log_error_errno(r, "Failed to write new random seed file: %m"); + + if (ftruncate(seed_fd, k) < 0) + return log_error_errno(errno, "Failed to truncate random seed file: %m"); + + r = fsync_full(seed_fd); + if (r < 0) + return log_error_errno(r, "Failed to synchronize seed file: %m"); + + /* If we got this random seed data from getrandom() the data is suitable for crediting entropy later + * on. Let's keep that in mind by setting an extended attribute on the file. */ + if (getrandom_worked) + if (fsetxattr(seed_fd, "user.random-seed-creditable", "1", 1, 0) < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? 
LOG_DEBUG : LOG_WARNING, errno, + "Failed to mark seed file as creditable, ignoring: %m"); + return 0; +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-random-seed", "8", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND\n" + "\n%5$sLoad and save the system random seed at boot and shutdown.%6$s\n" + "\n%3$sCommands:%4$s\n" + " load Load a random seed saved on disk into the kernel entropy pool\n" + " save Save a new random seed on disk\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static const char* const seed_action_table[_ACTION_MAX] = { + [ACTION_LOAD] = "load", + [ACTION_SAVE] = "save", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(seed_action, SeedAction); + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(0, NULL, NULL); + case ARG_VERSION: + return version(); + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind + 1 != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program requires one argument."); + + arg_action = seed_action_from_string(argv[optind]); + if (arg_action < 0) + return log_error_errno(arg_action, "Unknown action '%s'", argv[optind]); + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_free_ struct sha256_ctx *hash_state = NULL; + _cleanup_close_ int seed_fd = -EBADF, random_fd = -EBADF; + bool 
read_seed_file, write_seed_file, synchronous; + size_t seed_size; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + umask(0022); + + r = mkdir_parents(RANDOM_SEED, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create directory " RANDOM_SEED_DIR ": %m"); + + random_fd = open("/dev/urandom", O_RDWR|O_CLOEXEC|O_NOCTTY); + if (random_fd < 0) + return log_error_errno(errno, "Failed to open /dev/urandom: %m"); + + /* When we load the seed we read it and write it to the device and then immediately update the saved + * seed with new data, to make sure the next boot gets seeded differently. */ + + switch (arg_action) { + case ACTION_LOAD: + /* First, let's write the machine ID into /dev/urandom, not crediting entropy. See + * load_machine_id() for an explanation why. */ + load_machine_id(random_fd); + + seed_fd = open(RANDOM_SEED, O_RDWR|O_CLOEXEC|O_NOCTTY|O_CREAT, 0600); + if (seed_fd < 0) { + int open_rw_error = -errno; + + write_seed_file = false; + + seed_fd = open(RANDOM_SEED, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (seed_fd < 0) { + bool missing = errno == ENOENT; + int level = missing ? LOG_DEBUG : LOG_ERR; + + log_full_errno(level, open_rw_error, "Failed to open " RANDOM_SEED " for writing: %m"); + log_full_errno(level, errno, "Failed to open " RANDOM_SEED " for reading: %m"); + return missing ? 
0 : -errno; + } + } else + write_seed_file = true; + + read_seed_file = true; + synchronous = true; /* make this invocation a synchronous barrier for random pool initialization */ + break; + + case ACTION_SAVE: + seed_fd = open(RANDOM_SEED, O_WRONLY|O_CLOEXEC|O_NOCTTY|O_CREAT, 0600); + if (seed_fd < 0) + return log_error_errno(errno, "Failed to open " RANDOM_SEED ": %m"); + + read_seed_file = false; + write_seed_file = true; + synchronous = false; + break; + + default: + assert_not_reached(); + } + + r = random_seed_size(seed_fd, &seed_size); + if (r < 0) + return r; + + if (read_seed_file) + r = load_seed_file(seed_fd, random_fd, seed_size, + write_seed_file ? &hash_state : NULL); + + if (r >= 0 && write_seed_file) + r = save_seed_file(seed_fd, random_fd, seed_size, synchronous, hash_state); + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/rc-local-generator/meson.build b/src/rc-local-generator/meson.build new file mode 100644 index 0000000..e74225c --- /dev/null +++ b/src/rc-local-generator/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + generator_template + { + 'name' : 'systemd-rc-local-generator', + 'conditions' : ['HAVE_SYSV_COMPAT'], + 'sources' : files('rc-local-generator.c'), + }, +] diff --git a/src/rc-local-generator/rc-local-generator.c b/src/rc-local-generator/rc-local-generator.c new file mode 100644 index 0000000..89cc5fa --- /dev/null +++ b/src/rc-local-generator/rc-local-generator.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "generator.h" +#include "initrd-util.h" +#include "log.h" +#include "mkdir-label.h" +#include "string-util.h" + +static const char *arg_dest = NULL; + +/* So you are reading this, and might wonder: why is this implemented as a generator rather than as a plain, statically + * enabled service that carries appropriate ConditionFileIsExecutable= lines? 
The answer is this: conditions bypass + * execution of a service's binary, but they have no influence on unit dependencies. Thus, a service that is + * conditioned out will still act as synchronization point in the dependency tree, and we'd rather not have that for + * these two legacy scripts. */ + +static int add_symlink(const char *service, const char *where) { + const char *from, *to; + + assert(service); + assert(where); + + from = strjoina(SYSTEM_DATA_UNIT_DIR "/", service); + to = strjoina(arg_dest, "/", where, ".wants/", service); + + (void) mkdir_parents_label(to, 0755); + + if (symlink(from, to) < 0) { + if (errno == EEXIST) + return 0; + + return log_error_errno(errno, "Failed to create symlink %s: %m", to); + } + + return 1; +} + +static int check_executable(const char *path) { + assert(path); + + if (access(path, X_OK) < 0) { + if (errno == ENOENT) + return log_debug_errno(errno, "%s does not exist, skipping.", path); + if (errno == EACCES) + return log_info_errno(errno, "%s is not marked executable, skipping.", path); + + return log_warning_errno(errno, "Couldn't determine if %s exists and is executable, skipping: %m", path); + } + + return 0; +} + +static int run(const char *dest, const char *dest_early, const char *dest_late) { + int r = 0, k = 0; + + assert_se(arg_dest = dest); + + if (in_initrd()) { + log_debug("Skipping generator, running in the initrd."); + return EXIT_SUCCESS; + } + + if (check_executable(RC_LOCAL_PATH) >= 0) { + log_debug("Automatically adding rc-local.service."); + + r = add_symlink("rc-local.service", "multi-user.target"); + } + + return r < 0 ? 
r : k; +} + +DEFINE_MAIN_GENERATOR_FUNCTION(run); diff --git a/src/remount-fs/meson.build b/src/remount-fs/meson.build new file mode 100644 index 0000000..8761d25 --- /dev/null +++ b/src/remount-fs/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-remount-fs', + 'sources' : files('remount-fs.c'), + }, +] diff --git a/src/remount-fs/remount-fs.c b/src/remount-fs/remount-fs.c new file mode 100644 index 0000000..37c7b38 --- /dev/null +++ b/src/remount-fs/remount-fs.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "env-util.h" +#include "exit-status.h" +#include "fstab-util.h" +#include "log.h" +#include "main-func.h" +#include "mount-setup.h" +#include "mount-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "strv.h" + +/* Goes through /etc/fstab and remounts all API file systems, applying options that are in /etc/fstab that systemd + * might not have respected */ + +static int track_pid(Hashmap **h, const char *path, pid_t pid) { + _cleanup_free_ char *c = NULL; + int r; + + assert(h); + assert(path); + assert(pid_is_valid(pid)); + + c = strdup(path); + if (!c) + return log_oom(); + + r = hashmap_ensure_put(h, NULL, PID_TO_PTR(pid), c); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to store pid " PID_FMT, pid); + + TAKE_PTR(c); + return 0; +} + +static int do_remount(const char *path, bool force_rw, Hashmap **pids) { + pid_t pid; + int r; + + log_debug("Remounting %s...", path); + + r = safe_fork(force_rw ? "(remount-rw)" : "(remount)", + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + execv(MOUNT_PATH, + STRV_MAKE(MOUNT_PATH, + path, + "-o", + force_rw ? 
"remount,rw" : "remount")); + log_error_errno(errno, "Failed to execute " MOUNT_PATH ": %m"); + _exit(EXIT_FAILURE); + } + + /* Parent */ + return track_pid(pids, path, pid); +} + +static int remount_by_fstab(Hashmap **ret_pids) { + _cleanup_hashmap_free_free_ Hashmap *pids = NULL; + _cleanup_endmntent_ FILE *f = NULL; + bool has_root = false; + struct mntent* me; + int r; + + assert(ret_pids); + + if (!fstab_enabled()) + return 0; + + f = setmntent(fstab_path(), "re"); + if (!f) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", fstab_path()); + + return 0; + } + + while ((me = getmntent(f))) { + /* Remount the root fs, /usr, and all API VFSs */ + if (!mount_point_is_api(me->mnt_dir) && + !PATH_IN_SET(me->mnt_dir, "/", "/usr")) + continue; + + if (path_equal(me->mnt_dir, "/")) + has_root = true; + + r = do_remount(me->mnt_dir, false, &pids); + if (r < 0) + return r; + } + + *ret_pids = TAKE_PTR(pids); + return has_root; +} + +static int run(int argc, char *argv[]) { + _cleanup_hashmap_free_free_ Hashmap *pids = NULL; + int r; + + log_setup(); + + if (argc > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program takes no arguments."); + + umask(0022); + + r = remount_by_fstab(&pids); + if (r < 0) + return r; + if (r == 0) { + /* The $SYSTEMD_REMOUNT_ROOT_RW environment variable is set by systemd-gpt-auto-generator to tell us + * whether to remount things. We honour it only if there's no explicit line in /etc/fstab configured + * which takes precedence. 
*/ + + r = getenv_bool("SYSTEMD_REMOUNT_ROOT_RW"); + if (r < 0 && r != -ENXIO) + log_warning_errno(r, "Failed to parse $SYSTEMD_REMOUNT_ROOT_RW, ignoring: %m"); + + if (r > 0) { + r = do_remount("/", true, &pids); + if (r < 0) + return r; + } + } + + r = 0; + while (!hashmap_isempty(pids)) { + _cleanup_free_ char *s = NULL; + siginfo_t si = {}; + + if (waitid(P_ALL, 0, &si, WEXITED) < 0) { + if (errno == EINTR) + continue; + + return log_error_errno(errno, "waitid() failed: %m"); + } + + s = hashmap_remove(pids, PID_TO_PTR(si.si_pid)); + if (s && + !is_clean_exit(si.si_code, si.si_status, EXIT_CLEAN_COMMAND, NULL)) { + if (si.si_code == CLD_EXITED) + log_error(MOUNT_PATH " for %s exited with exit status %i.", s, si.si_status); + else + log_error(MOUNT_PATH " for %s terminated by signal %s.", s, signal_to_string(si.si_status)); + + r = -ENOEXEC; + } + } + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/reply-password/meson.build b/src/reply-password/meson.build new file mode 100644 index 0000000..ace112c --- /dev/null +++ b/src/reply-password/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-reply-password', + 'sources' : files('reply-password.c'), + }, +] diff --git a/src/reply-password/reply-password.c b/src/reply-password/reply-password.c new file mode 100644 index 0000000..ce4582a --- /dev/null +++ b/src/reply-password/reply-password.c @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "main-func.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "socket-util.h" +#include "string-util.h" + +static int send_on_socket(int fd, const char *socket_name, const void *packet, size_t size) { + union sockaddr_union sa = {}; + int salen; + + assert(fd >= 0); + assert(socket_name); + assert(packet); + + salen = 
sockaddr_un_set_path(&sa.un, socket_name); + if (salen < 0) + return log_error_errno(salen, "Specified socket path for AF_UNIX socket invalid, refusing: %s", socket_name); + + if (sendto(fd, packet, size, MSG_NOSIGNAL, &sa.sa, salen) < 0) + return log_error_errno(errno, "Failed to send: %m"); + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(erase_and_freep) char *packet = NULL; + _cleanup_close_ int fd = -EBADF; + size_t length = 0; + int r; + + log_setup(); + + if (argc != 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Wrong number of arguments."); + + if (streq(argv[1], "1")) { + _cleanup_(erase_and_freep) char *line = NULL; + + r = read_line(stdin, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read password: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Got EOF while reading password."); + + packet = strjoin("+", line); + if (!packet) + return log_oom(); + + length = 1 + strlen(line) + 1; + + } else if (streq(argv[1], "0")) { + packet = strdup("-"); + if (!packet) + return log_oom(); + + length = 1; + + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid first argument %s", argv[1]); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "socket() failed: %m"); + + return send_on_socket(fd, argv[2], packet, length); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/resolve/RFCs b/src/resolve/RFCs new file mode 100644 index 0000000..7190c16 --- /dev/null +++ b/src/resolve/RFCs @@ -0,0 +1,60 @@ +Y = Comprehensively Implemented, to the point appropriate for resolved +D = Comprehensively Implemented, by a dependency of resolved +! = Missing and something we might want to implement +~ = Needs no explicit support or doesn't apply +? = Is this relevant today? 
+ = We are working on this + +Y https://tools.ietf.org/html/rfc1034 → DOMAIN NAMES - CONCEPTS AND FACILITIES +Y https://tools.ietf.org/html/rfc1035 → DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION +? https://tools.ietf.org/html/rfc1101 → DNS Encoding of Network Names and Other Types +Y https://tools.ietf.org/html/rfc1123 → Requirements for Internet Hosts — Application and Support +~ https://tools.ietf.org/html/rfc1464 → Using the Domain Name System To Store Arbitrary String Attributes +Y https://tools.ietf.org/html/rfc1536 → Common DNS Implementation Errors and Suggested Fixes +Y https://tools.ietf.org/html/rfc1876 → A Means for Expressing Location Information in the Domain Name System +Y https://tools.ietf.org/html/rfc2181 → Clarifications to the DNS Specification +Y https://tools.ietf.org/html/rfc2308 → Negative Caching of DNS Queries (DNS NCACHE) +Y https://tools.ietf.org/html/rfc2782 → A DNS RR for specifying the location of services (DNS SRV) +D https://tools.ietf.org/html/rfc3492 → Punycode: A Bootstring encoding of Unicode for Internationalized Domain Names in Applications (IDNA) +Y https://tools.ietf.org/html/rfc3596 → DNS Extensions to Support IP Version 6 +Y https://tools.ietf.org/html/rfc3597 → Handling of Unknown DNS Resource Record (RR) Types +Y https://tools.ietf.org/html/rfc4033 → DNS Security Introduction and Requirements +Y https://tools.ietf.org/html/rfc4034 → Resource Records for the DNS Security Extensions +Y https://tools.ietf.org/html/rfc4035 → Protocol Modifications for the DNS Security Extensions +! 
https://tools.ietf.org/html/rfc4183 → A Suggested Scheme for DNS Resolution of Networks and Gateways +Y https://tools.ietf.org/html/rfc4255 → Using DNS to Securely Publish Secure Shell (SSH) Key Fingerprints +Y https://tools.ietf.org/html/rfc4343 → Domain Name System (DNS) Case Insensitivity Clarification +~ https://tools.ietf.org/html/rfc4470 → Minimally Covering NSEC Records and DNSSEC On-line Signing +Y https://tools.ietf.org/html/rfc4501 → Domain Name System Uniform Resource Identifiers +Y https://tools.ietf.org/html/rfc4509 → Use of SHA-256 in DNSSEC Delegation Signer (DS) Resource Records (RRs) +~ https://tools.ietf.org/html/rfc4592 → The Role of Wildcards in the Domain Name System +~ https://tools.ietf.org/html/rfc4697 → Observed DNS Resolution Misbehavior +Y https://tools.ietf.org/html/rfc4795 → Link-Local Multicast Name Resolution (LLMNR) +Y https://tools.ietf.org/html/rfc5011 → Automated Updates of DNS Security (DNSSEC) Trust Anchors +Y https://tools.ietf.org/html/rfc5155 → DNS Security (DNSSEC) Hashed Authenticated Denial of Existence +Y https://tools.ietf.org/html/rfc5452 → Measures for Making DNS More Resilient against Forged Answers +Y https://tools.ietf.org/html/rfc5702 → Use of SHA-2 Algorithms with RSA in DNSKEY and RRSIG Resource Records for DNSSEC +Y https://tools.ietf.org/html/rfc5890 → Internationalized Domain Names for Applications (IDNA): Definitions and Document Framework +Y https://tools.ietf.org/html/rfc5891 → Internationalized Domain Names in Applications (IDNA): Protocol +Y https://tools.ietf.org/html/rfc5966 → DNS Transport over TCP - Implementation Requirements +Y https://tools.ietf.org/html/rfc6303 → Locally Served DNS Zones +Y https://tools.ietf.org/html/rfc6604 → xNAME RCODE and Status Bits Clarification +Y https://tools.ietf.org/html/rfc6605 → Elliptic Curve Digital Signature Algorithm (DSA) for DNSSEC + https://tools.ietf.org/html/rfc6672 → DNAME Redirection in the DNS +! 
https://tools.ietf.org/html/rfc6731 → Improved Recursive DNS Server Selection for Multi-Interfaced Nodes +Y https://tools.ietf.org/html/rfc6761 → Special-Use Domain Names + https://tools.ietf.org/html/rfc6762 → Multicast DNS + https://tools.ietf.org/html/rfc6763 → DNS-Based Service Discovery +~ https://tools.ietf.org/html/rfc6781 → DNSSEC Operational Practices, Version 2 +Y https://tools.ietf.org/html/rfc6840 → Clarifications and Implementation Notes for DNS Security (DNSSEC) +Y https://tools.ietf.org/html/rfc6891 → Extension Mechanisms for DNS (EDNS(0)) +Y https://tools.ietf.org/html/rfc6944 → Applicability Statement: DNS Security (DNSSEC) DNSKEY Algorithm Implementation Status +Y https://tools.ietf.org/html/rfc6975 → Signaling Cryptographic Algorithm Understanding in DNS Security Extensions (DNSSEC) +Y https://tools.ietf.org/html/rfc7129 → Authenticated Denial of Existence in the DNS +Y https://tools.ietf.org/html/rfc7646 → Definition and Use of DNSSEC Negative Trust Anchors +~ https://tools.ietf.org/html/rfc7719 → DNS Terminology +Y https://tools.ietf.org/html/rfc8080 → Edwards-Curve Digital Security Algorithm (EdDSA) for DNSSEC + +Also relevant: + + https://www.iab.org/documents/correspondence-reports-documents/2013-2/iab-statement-dotless-domains-considered-harmful/ diff --git a/src/resolve/dns-type.c b/src/resolve/dns-type.c new file mode 100644 index 0000000..da68b41 --- /dev/null +++ b/src/resolve/dns-type.c @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "dns-type.h" +#include "parse-util.h" +#include "string-util.h" + +typedef const struct { + uint16_t type; + const char *name; +} dns_type; + +static const struct dns_type_name * +lookup_dns_type (register const char *str, register GPERF_LEN_TYPE len); + +#include "dns_type-from-name.h" +#include "dns_type-to-name.h" + +int dns_type_from_string(const char *s) { + const struct dns_type_name *sc; + + assert(s); + + sc = lookup_dns_type(s, strlen(s)); + if 
(sc) + return sc->id; + + s = startswith_no_case(s, "TYPE"); + if (s) { + unsigned x; + + if (safe_atou(s, &x) >= 0 && + x <= UINT16_MAX) + return (int) x; + } + + return _DNS_TYPE_INVALID; +} + +bool dns_type_is_pseudo(uint16_t type) { + + /* Checks whether the specified type is a "pseudo-type". What + * a "pseudo-type" precisely is, is defined only very weakly, + * but apparently entails all RR types that are not actually + * stored as RRs on the server and should hence also not be + * cached. We use this list primarily to validate NSEC type + * bitfields, and to verify what to cache. */ + + return IN_SET(type, + 0, /* A Pseudo RR type, according to RFC 2931 */ + DNS_TYPE_ANY, + DNS_TYPE_AXFR, + DNS_TYPE_IXFR, + DNS_TYPE_OPT, + DNS_TYPE_TSIG, + DNS_TYPE_TKEY + ); +} + +bool dns_class_is_pseudo(uint16_t class) { + return class == DNS_CLASS_ANY; +} + +bool dns_type_is_valid_query(uint16_t type) { + + /* The types valid as questions in packets */ + + return !IN_SET(type, + 0, + DNS_TYPE_OPT, + DNS_TYPE_TSIG, + DNS_TYPE_TKEY, + + /* RRSIG are technically valid as questions, but we refuse doing explicit queries for them, as + * they aren't really payload, but signatures for payload, and cannot be validated on their + * own. After all they are the signatures, and have no signatures of their own validating + * them. */ + DNS_TYPE_RRSIG); +} + +bool dns_type_is_zone_transer(uint16_t type) { + + /* Zone transfers, either normal or incremental */ + + return IN_SET(type, + DNS_TYPE_AXFR, + DNS_TYPE_IXFR); +} + +bool dns_type_is_valid_rr(uint16_t type) { + + /* The types valid as RR in packets (but not necessarily + * stored on servers). */ + + return !IN_SET(type, + DNS_TYPE_ANY, + DNS_TYPE_AXFR, + DNS_TYPE_IXFR); +} + +bool dns_class_is_valid_rr(uint16_t class) { + return class != DNS_CLASS_ANY; +} + +bool dns_type_may_redirect(uint16_t type) { + /* The following record types should never be redirected using + * CNAME/DNAME RRs. See + * . 
*/ + + if (dns_type_is_pseudo(type)) + return false; + + return !IN_SET(type, + DNS_TYPE_CNAME, + DNS_TYPE_DNAME, + DNS_TYPE_NSEC3, + DNS_TYPE_NSEC, + DNS_TYPE_RRSIG, + DNS_TYPE_NXT, + DNS_TYPE_SIG, + DNS_TYPE_KEY); +} + +bool dns_type_may_wildcard(uint16_t type) { + + /* The following records may not be expanded from wildcard RRsets */ + + if (dns_type_is_pseudo(type)) + return false; + + return !IN_SET(type, + DNS_TYPE_NSEC3, + DNS_TYPE_SOA, + + /* Prohibited by https://tools.ietf.org/html/rfc4592#section-4.4 */ + DNS_TYPE_DNAME); +} + +bool dns_type_apex_only(uint16_t type) { + + /* Returns true for all RR types that may only appear signed in a zone apex */ + + return IN_SET(type, + DNS_TYPE_SOA, + DNS_TYPE_NS, /* this one can appear elsewhere, too, but not signed */ + DNS_TYPE_DNSKEY, + DNS_TYPE_NSEC3PARAM); +} + +bool dns_type_is_dnssec(uint16_t type) { + return IN_SET(type, + DNS_TYPE_DS, + DNS_TYPE_DNSKEY, + DNS_TYPE_RRSIG, + DNS_TYPE_NSEC, + DNS_TYPE_NSEC3, + DNS_TYPE_NSEC3PARAM); +} + +bool dns_type_is_obsolete(uint16_t type) { + return IN_SET(type, + /* Obsoleted by RFC 973 */ + DNS_TYPE_MD, + DNS_TYPE_MF, + DNS_TYPE_MAILA, + + /* Kinda obsoleted by RFC 2505 */ + DNS_TYPE_MB, + DNS_TYPE_MG, + DNS_TYPE_MR, + DNS_TYPE_MINFO, + DNS_TYPE_MAILB, + + /* RFC1127 kinda obsoleted this by recommending against its use */ + DNS_TYPE_WKS, + + /* Declared historical by RFC 6563 */ + DNS_TYPE_A6, + + /* Obsoleted by DNSSEC-bis */ + DNS_TYPE_NXT, + + /* RFC 1035 removed support for concepts that needed this from RFC 883 */ + DNS_TYPE_NULL); +} + +bool dns_type_needs_authentication(uint16_t type) { + + /* Returns true for all (non-obsolete) RR types where records are not useful if they aren't + * authenticated. I.e. everything that contains crypto keys. 
*/ + + return IN_SET(type, + DNS_TYPE_CERT, + DNS_TYPE_SSHFP, + DNS_TYPE_IPSECKEY, + DNS_TYPE_DS, + DNS_TYPE_DNSKEY, + DNS_TYPE_TLSA, + DNS_TYPE_CDNSKEY, + DNS_TYPE_OPENPGPKEY, + DNS_TYPE_CAA); +} + +int dns_type_to_af(uint16_t t) { + switch (t) { + + case DNS_TYPE_A: + return AF_INET; + + case DNS_TYPE_AAAA: + return AF_INET6; + + case DNS_TYPE_ANY: + return AF_UNSPEC; + + default: + return -EINVAL; + } +} + +const char *dns_class_to_string(uint16_t class) { + + switch (class) { + + case DNS_CLASS_IN: + return "IN"; + + case DNS_CLASS_ANY: + return "ANY"; + } + + return NULL; +} + +int dns_class_from_string(const char *s) { + + if (!s) + return _DNS_CLASS_INVALID; + + if (strcaseeq(s, "IN")) + return DNS_CLASS_IN; + else if (strcaseeq(s, "ANY")) + return DNS_CLASS_ANY; + + return _DNS_CLASS_INVALID; +} + +const char* tlsa_cert_usage_to_string(uint8_t cert_usage) { + + switch (cert_usage) { + + case 0: + return "CA constraint"; + + case 1: + return "Service certificate constraint"; + + case 2: + return "Trust anchor assertion"; + + case 3: + return "Domain-issued certificate"; + + case 4 ... 254: + return "Unassigned"; + + case 255: + return "Private use"; + } + + return NULL; /* clang cannot count that we covered everything */ +} + +const char* tlsa_selector_to_string(uint8_t selector) { + switch (selector) { + + case 0: + return "Full Certificate"; + + case 1: + return "SubjectPublicKeyInfo"; + + case 2 ... 254: + return "Unassigned"; + + case 255: + return "Private use"; + } + + return NULL; +} + +const char* tlsa_matching_type_to_string(uint8_t selector) { + + switch (selector) { + + case 0: + return "No hash used"; + + case 1: + return "SHA-256"; + + case 2: + return "SHA-512"; + + case 3 ... 
254: + return "Unassigned"; + + case 255: + return "Private use"; + } + + return NULL; +} diff --git a/src/resolve/dns-type.h b/src/resolve/dns-type.h new file mode 100644 index 0000000..c6be190 --- /dev/null +++ b/src/resolve/dns-type.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +/* DNS record types, taken from + * http://www.iana.org/assignments/dns-parameters/dns-parameters.xhtml. + */ +enum { + /* 0 is reserved */ + DNS_TYPE_A = 0x01, + DNS_TYPE_NS, + DNS_TYPE_MD, + DNS_TYPE_MF, + DNS_TYPE_CNAME, + DNS_TYPE_SOA, + DNS_TYPE_MB, + DNS_TYPE_MG, + DNS_TYPE_MR, + DNS_TYPE_NULL, + DNS_TYPE_WKS, + DNS_TYPE_PTR, + DNS_TYPE_HINFO, + DNS_TYPE_MINFO, + DNS_TYPE_MX, + DNS_TYPE_TXT, + DNS_TYPE_RP, + DNS_TYPE_AFSDB, + DNS_TYPE_X25, + DNS_TYPE_ISDN, + DNS_TYPE_RT, + DNS_TYPE_NSAP, + DNS_TYPE_NSAP_PTR, + DNS_TYPE_SIG, + DNS_TYPE_KEY, + DNS_TYPE_PX, + DNS_TYPE_GPOS, + DNS_TYPE_AAAA, + DNS_TYPE_LOC, + DNS_TYPE_NXT, + DNS_TYPE_EID, + DNS_TYPE_NIMLOC, + DNS_TYPE_SRV, + DNS_TYPE_ATMA, + DNS_TYPE_NAPTR, + DNS_TYPE_KX, + DNS_TYPE_CERT, + DNS_TYPE_A6, + DNS_TYPE_DNAME, + DNS_TYPE_SINK, + DNS_TYPE_OPT, /* EDNS0 option */ + DNS_TYPE_APL, + DNS_TYPE_DS, + DNS_TYPE_SSHFP, + DNS_TYPE_IPSECKEY, + DNS_TYPE_RRSIG, + DNS_TYPE_NSEC, + DNS_TYPE_DNSKEY, + DNS_TYPE_DHCID, + DNS_TYPE_NSEC3, + DNS_TYPE_NSEC3PARAM, + DNS_TYPE_TLSA, + DNS_TYPE_SMIMEA, /* RFC 8162 */ + /* 0x36 (54) is not assigned */ + DNS_TYPE_HIP = 0x37, + DNS_TYPE_NINFO, + DNS_TYPE_RKEY, + DNS_TYPE_TALINK, + DNS_TYPE_CDS, + DNS_TYPE_CDNSKEY, + DNS_TYPE_OPENPGPKEY, + DNS_TYPE_CSYNC, + DNS_TYPE_ZONEMD, + DNS_TYPE_SVCB, /* RFC 9460 */ + DNS_TYPE_HTTPS, /* RFC 9460 */ + /* 0x42…0x62 (66…98) are not assigned */ + DNS_TYPE_SPF = 0x63, + DNS_TYPE_UINFO, + DNS_TYPE_UID, + DNS_TYPE_GID, + DNS_TYPE_UNSPEC, + DNS_TYPE_NID, + DNS_TYPE_L32, + DNS_TYPE_L64, + DNS_TYPE_LP, + DNS_TYPE_EUI48, + DNS_TYPE_EUI64, + /* 0x6e…0xf8 (110…248) are not assigned */ + DNS_TYPE_TKEY = 0xF9, + 
DNS_TYPE_TSIG, + DNS_TYPE_IXFR, + DNS_TYPE_AXFR, + DNS_TYPE_MAILB, + DNS_TYPE_MAILA, + DNS_TYPE_ANY, + DNS_TYPE_URI, + DNS_TYPE_CAA, + DNS_TYPE_AVC, + DNS_TYPE_DOA, + DNS_TYPE_AMTRELAY, + DNS_TYPE_RESINFO, + /* 0x106…0x7fff (262…32767) are not assigned */ + DNS_TYPE_TA = 0x8000, + DNS_TYPE_DLV, + /* 32770…65279 are not assigned */ + /* 65280…65534 are for private use */ + /* 65535 is reserved */ + _DNS_TYPE_MAX, + _DNS_TYPE_INVALID = -EINVAL, +}; + +assert_cc(DNS_TYPE_SMIMEA == 53); +assert_cc(DNS_TYPE_HTTPS == 65); +assert_cc(DNS_TYPE_EUI64 == 109); +assert_cc(DNS_TYPE_RESINFO == 261); +assert_cc(DNS_TYPE_ANY == 255); + +/* DNS record classes, see RFC 1035 */ +enum { + DNS_CLASS_IN = 0x01, + DNS_CLASS_ANY = 0xFF, + + _DNS_CLASS_MAX, + _DNS_CLASS_INVALID = -EINVAL, +}; + +#define _DNS_CLASS_STRING_MAX (sizeof "CLASS" + DECIMAL_STR_MAX(uint16_t)) +#define _DNS_TYPE_STRING_MAX (sizeof "CLASS" + DECIMAL_STR_MAX(uint16_t)) + +bool dns_type_is_pseudo(uint16_t type); +bool dns_type_is_valid_query(uint16_t type); +bool dns_type_is_valid_rr(uint16_t type); +bool dns_type_may_redirect(uint16_t type); +bool dns_type_is_dnssec(uint16_t type); +bool dns_type_is_obsolete(uint16_t type); +bool dns_type_may_wildcard(uint16_t type); +bool dns_type_apex_only(uint16_t type); +bool dns_type_needs_authentication(uint16_t type); +bool dns_type_is_zone_transer(uint16_t type); +int dns_type_to_af(uint16_t type); + +bool dns_class_is_pseudo(uint16_t class); +bool dns_class_is_valid_rr(uint16_t class); + +/* TYPE?? 
follows http://tools.ietf.org/html/rfc3597#section-5 */ +const char *dns_type_to_string(int type); +int dns_type_from_string(const char *s); + +const char *dns_class_to_string(uint16_t class); +int dns_class_from_string(const char *name); + +/* https://tools.ietf.org/html/draft-ietf-dane-protocol-23#section-7.2 */ +const char *tlsa_cert_usage_to_string(uint8_t cert_usage); + +/* https://tools.ietf.org/html/draft-ietf-dane-protocol-23#section-7.3 */ +const char *tlsa_selector_to_string(uint8_t selector); + +/* https://tools.ietf.org/html/draft-ietf-dane-protocol-23#section-7.4 */ +const char *tlsa_matching_type_to_string(uint8_t selector); + +/* https://tools.ietf.org/html/rfc6844#section-5.1 */ +#define CAA_FLAG_CRITICAL (1u << 7) diff --git a/src/resolve/dns_type-to-name.awk b/src/resolve/dns_type-to-name.awk new file mode 100644 index 0000000..92187d5 --- /dev/null +++ b/src/resolve/dns_type-to-name.awk @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "const char *dns_type_to_string(int type) {" + print " switch (type) {" +} +{ + printf " case DNS_TYPE_%s: return ", $1; + sub(/_/, "-"); + printf "\"%s\";\n", $1 +} +END{ + print " default: return NULL;" + print " }" + print "}" +} diff --git a/src/resolve/fuzz-dns-packet.c b/src/resolve/fuzz-dns-packet.c new file mode 100644 index 0000000..a5b1fd6 --- /dev/null +++ b/src/resolve/fuzz-dns-packet.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fuzz.h" +#include "memory-util.h" +#include "resolved-dns-packet.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + + if (outside_size_range(size, 0, DNS_PACKET_SIZE_MAX)) + return 0; + + fuzz_setup_logging(); + + assert_se(dns_packet_new(&p, DNS_PROTOCOL_DNS, 0, DNS_PACKET_SIZE_MAX) >= 0); + p->size = 0; /* by default append starts after the header, undo that */ + assert_se(dns_packet_append_blob(p, data, size, NULL) >= 0); + if (size < 
DNS_PACKET_HEADER_SIZE) { + /* make sure we pad the packet back up to the minimum header size */ + assert_se(p->allocated >= DNS_PACKET_HEADER_SIZE); + memzero(DNS_PACKET_DATA(p) + size, DNS_PACKET_HEADER_SIZE - size); + p->size = DNS_PACKET_HEADER_SIZE; + } + (void) dns_packet_extract(p); + + return 0; +} diff --git a/src/resolve/fuzz-dns-packet.options b/src/resolve/fuzz-dns-packet.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/resolve/fuzz-dns-packet.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/resolve/fuzz-etc-hosts.c b/src/resolve/fuzz-etc-hosts.c new file mode 100644 index 0000000..9fb1ee1 --- /dev/null +++ b/src/resolve/fuzz-etc-hosts.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fuzz.h" +#include "resolved-etc-hosts.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(etc_hosts_clear) EtcHosts h = {}; + + fuzz_setup_logging(); + + f = data_to_file(data, size); + assert_se(f); + + (void) etc_hosts_parse(&h, f); + + return 0; +} diff --git a/src/resolve/fuzz-resource-record.c b/src/resolve/fuzz-resource-record.c new file mode 100644 index 0000000..358a5c7 --- /dev/null +++ b/src/resolve/fuzz-resource-record.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fuzz.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "resolved-dns-packet.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL, *copy = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + if (outside_size_range(size, 0, DNS_PACKET_SIZE_MAX)) + return 0; + + if (dns_resource_record_new_from_raw(&rr, data, size) < 0) + return 0; + + fuzz_setup_logging(); + + assert_se(copy = dns_resource_record_copy(rr)); + 
assert_se(dns_resource_record_equal(copy, rr) > 0); + + assert_se(f = memstream_init(&m)); + (void) fprintf(f, "%s", strna(dns_resource_record_to_string(rr))); + + if (dns_resource_record_to_json(rr, &v) < 0) + return 0; + + (void) json_variant_dump(v, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR|JSON_FORMAT_SOURCE, f, NULL); + (void) dns_resource_record_to_wire_format(rr, false); + (void) dns_resource_record_to_wire_format(rr, true); + + return 0; +} diff --git a/src/resolve/generate-dns_type-gperf.py b/src/resolve/generate-dns_type-gperf.py new file mode 100755 index 0000000..0d818fb --- /dev/null +++ b/src/resolve/generate-dns_type-gperf.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-2.1-or-later + +"""Generate %-from-name.gperf from %-list.txt +""" + +import sys + +name, prefix, input = sys.argv[1:] + +print("""\ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \\"-Wimplicit-fallthrough\\"") +#endif +%}""") +print("""\ +struct {}_name {{ const char* name; int id; }}; +%null-strings +%%""".format(name)) + +for line in open(input): + line = line.rstrip() + s = line.replace('_', '-') + print("{}, {}{}".format(s, prefix, line)) diff --git a/src/resolve/generate-dns_type-list.sed b/src/resolve/generate-dns_type-list.sed new file mode 100644 index 0000000..32af08c --- /dev/null +++ b/src/resolve/generate-dns_type-list.sed @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +s/.* DNS_TYPE_(\w+).*/\1/p diff --git a/src/resolve/meson.build b/src/resolve/meson.build new file mode 100644 index 0000000..e7867e2 --- /dev/null +++ b/src/resolve/meson.build @@ -0,0 +1,240 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +resolve_includes = [includes, include_directories('.')] + +basic_dns_sources = files( + 'resolved-dns-dnssec.c', + 'resolved-dns-packet.c', + 'resolved-dns-rr.c', + 'resolved-dns-answer.c', + 'resolved-dns-question.c', + 'resolved-util.c', + 'dns-type.c', +) + +systemd_resolved_sources = files( + 'resolved-bus.c', + 
'resolved-conf.c', + 'resolved-dns-cache.c', + 'resolved-dns-query.c', + 'resolved-dns-scope.c', + 'resolved-dns-search-domain.c', + 'resolved-dns-server.c', + 'resolved-dns-stream.c', + 'resolved-dns-stub.c', + 'resolved-dns-synthesize.c', + 'resolved-dns-transaction.c', + 'resolved-dns-trust-anchor.c', + 'resolved-dns-zone.c', + 'resolved-dnssd-bus.c', + 'resolved-dnssd.c', + 'resolved-etc-hosts.c', + 'resolved-link-bus.c', + 'resolved-link.c', + 'resolved-llmnr.c', + 'resolved-manager.c', + 'resolved-mdns.c', + 'resolved-resolv-conf.c', + 'resolved-socket-graveyard.c', + 'resolved-varlink.c', +) + +resolvectl_sources = files( + 'resolvconf-compat.c', + 'resolvectl.c', +) + +############################################################ + +dns_type_list_txt = custom_target( + 'dns_type-list.txt', + input : ['generate-dns_type-list.sed', 'dns-type.h'], + output : 'dns_type-list.txt', + command : [sed, '-n', '-r', '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +generate_dns_type_gperf = find_program('generate-dns_type-gperf.py') + +gperf_file = custom_target( + 'dns_type-from-name.gperf', + input : dns_type_list_txt, + output : 'dns_type-from-name.gperf', + command : [generate_dns_type_gperf, 'dns_type', 'DNS_TYPE_', '@INPUT@'], + capture : true) + +basic_dns_sources += custom_target( + 'dns_type-from-name.h', + input : gperf_file, + output : 'dns_type-from-name.h', + command : [gperf, + '-L', 'ANSI-C', '-t', '--ignore-case', + '-N', 'lookup_dns_type', + '-H', 'hash_dns_type_name', + '-p', '-C', + '@INPUT@'], + capture : true) + +basic_dns_sources += custom_target( + 'dns_type-to-name.h', + input : ['dns_type-to-name.awk', dns_type_list_txt], + output : 'dns_type-to-name.h', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +libsystemd_resolve_core = static_library( + 'systemd-resolve-core', + basic_dns_sources, + include_directories : includes, + dependencies : userspace, + build_by_default : false) + +systemd_resolved_sources += 
custom_target( + 'resolved_gperf.c', + input : 'resolved-gperf.gperf', + output : 'resolved-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +systemd_resolved_sources += custom_target( + 'resolved_dnssd_gperf.c', + input : 'resolved-dnssd-gperf.gperf', + output : 'resolved-dnssd-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +systemd_resolved_dependencies = [threads, libm] + [lib_openssl_or_gcrypt] +if conf.get('ENABLE_DNS_OVER_TLS') == 1 + if conf.get('DNS_OVER_TLS_USE_GNUTLS') == 1 + systemd_resolved_sources += files( + 'resolved-dnstls-gnutls.c', + ) + systemd_resolved_dependencies += libgnutls + elif conf.get('DNS_OVER_TLS_USE_OPENSSL') == 1 + systemd_resolved_sources += files( + 'resolved-dnstls-openssl.c', + ) + systemd_resolved_dependencies += libopenssl + else + error('unknown dependency for supporting DNS-over-TLS') + endif +endif + +link_with = [ + libbasic_gcrypt, + libshared, + libsystemd_resolve_core, +] + +resolve_common_template = { + 'link_with' : [ + libshared, + libsystemd_resolve_core, + ], + 'dependencies' : [ + lib_openssl_or_gcrypt, + libm, + ], +} +resolve_test_template = test_template + resolve_common_template +resolve_fuzz_template = fuzz_template + resolve_common_template + +executables += [ + libexec_template + { + 'name' : 'systemd-resolved', + 'dbus' : true, + 'conditions' : ['ENABLE_RESOLVE'], + 'sources' : systemd_resolved_sources + + files('resolved.c'), + 'include_directories' : resolve_includes, + 'link_with' : link_with, + 'dependencies' : systemd_resolved_dependencies, + }, + executable_template + { + 'name' : 'resolvectl', + 'public' : true, + 'conditions' : ['ENABLE_RESOLVE'], + 'sources' : resolvectl_sources, + 'link_with' : link_with, + 'dependencies' : [ + lib_openssl_or_gcrypt, + libidn, + libm, + threads, + ], + }, + resolve_test_template + { + 'sources' : files('test-resolve-tables.c'), + }, + resolve_test_template + { + 'sources' : files('test-dns-packet.c'), + }, + 
resolve_test_template + { + 'sources' : files( + 'test-resolved-etc-hosts.c', + 'resolved-etc-hosts.c', + ), + }, + resolve_test_template + { + 'sources' : files('test-resolved-packet.c'), + }, + resolve_test_template + { + 'sources' : files('test-dnssec.c'), + 'conditions' : ['HAVE_OPENSSL_OR_GCRYPT'], + }, + resolve_test_template + { + 'sources' : files('test-dnssec-complex.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : [ + files('test-resolved-stream.c'), + basic_dns_sources, + systemd_resolved_sources, + ], + 'dependencies' : [ + lib_openssl_or_gcrypt, + libm, + systemd_resolved_dependencies, + ], + 'include_directories' : resolve_includes, + }, + resolve_fuzz_template + { + 'sources' : files('fuzz-dns-packet.c'), + }, + resolve_fuzz_template + { + 'sources' : files( + 'fuzz-etc-hosts.c', + 'resolved-etc-hosts.c', + ), + }, + resolve_fuzz_template + { + 'sources' : files('fuzz-resource-record.c'), + }, +] + +if conf.get('ENABLE_RESOLVE') == 1 + install_data('org.freedesktop.resolve1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.resolve1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.resolve1.policy', + install_dir : polkitpolicydir) + install_data('resolv.conf', + install_dir : libexecdir) + + install_emptydir(sbindir) + meson.add_install_script(sh, '-c', + ln_s.format(bindir / 'resolvectl', + sbindir / 'resolvconf')) + + # symlink for backwards compatibility after rename + meson.add_install_script(sh, '-c', + ln_s.format(bindir / 'resolvectl', + bindir / 'systemd-resolve')) +endif + +custom_target( + 'resolved.conf', + input : 'resolved.conf.in', + output : 'resolved.conf', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : conf.get('ENABLE_RESOLVE') == 1 and install_sysconfdir_samples, + install_dir : pkgconfigfiledir) diff --git a/src/resolve/org.freedesktop.resolve1.conf b/src/resolve/org.freedesktop.resolve1.conf new file mode 100644 index 0000000..52ea558 --- 
/dev/null +++ b/src/resolve/org.freedesktop.resolve1.conf @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/resolve/org.freedesktop.resolve1.policy b/src/resolve/org.freedesktop.resolve1.policy new file mode 100644 index 0000000..502b975 --- /dev/null +++ b/src/resolve/org.freedesktop.resolve1.policy @@ -0,0 +1,142 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Register a DNS-SD service + Authentication is required to register a DNS-SD service + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Unregister a DNS-SD service + Authentication is required to unregister a DNS-SD service + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Set DNS servers + Authentication is required to set DNS servers. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Set domains + Authentication is required to set domains. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Set default route + Authentication is required to set default route. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Enable/disable LLMNR + Authentication is required to enable or disable LLMNR. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Enable/disable multicast DNS + Authentication is required to enable or disable multicast DNS. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Enable/disable DNS over TLS + Authentication is required to enable or disable DNS over TLS. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Enable/disable DNSSEC + Authentication is required to enable or disable DNSSEC. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Set DNSSEC Negative Trust Anchors + Authentication is required to set DNSSEC Negative Trust Anchors. 
+ + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + + Revert name resolution settings + Authentication is required to reset name resolution settings. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-resolve + + + diff --git a/src/resolve/org.freedesktop.resolve1.service b/src/resolve/org.freedesktop.resolve1.service new file mode 100644 index 0000000..32a04f3 --- /dev/null +++ b/src/resolve/org.freedesktop.resolve1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.resolve1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.resolve1.service diff --git a/src/resolve/resolv.conf b/src/resolve/resolv.conf new file mode 100644 index 0000000..b4e9a96 --- /dev/null +++ b/src/resolve/resolv.conf @@ -0,0 +1,19 @@ +# This file belongs to man:systemd-resolved(8). Do not edit. +# +# This is a static resolv.conf file for connecting local clients to the +# internal DNS stub resolver of systemd-resolved. This file lists no search +# domains. +# +# Run "resolvectl status" to see details about the uplink DNS servers +# currently in use. +# +# Third party programs must not access this file directly, but only through the +# symlink at /etc/resolv.conf. To manage man:resolv.conf(5) in a different way, +# replace this symlink by a static file or a different symlink. +# +# See man:systemd-resolved.service(8) for details about the supported modes of +# operation for /etc/resolv.conf. + +nameserver 127.0.0.53 +options edns0 trust-ad +search . 
diff --git a/src/resolve/resolvconf-compat.c b/src/resolve/resolvconf-compat.c new file mode 100644 index 0000000..bef95c0 --- /dev/null +++ b/src/resolve/resolvconf-compat.c @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "build.h" +#include "constants.h" +#include "dns-domain.h" +#include "extract-word.h" +#include "fileio.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "resolvconf-compat.h" +#include "resolvectl.h" +#include "resolved-def.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +/* Prints the usage text of the resolvconf compatibility mode to stdout. Returns 0, or the result of log_oom() if terminal_urlify_man() fails. */ static int resolvconf_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("resolvectl", "1", &link); + if (r < 0) + return log_oom(); + + /* %1$s is the invoked program name, %2$s the man page link built above */ printf("%1$s -a INTERFACE < FILE\n" + "%1$s -d INTERFACE\n" + "\n" + "Register DNS server and domain configuration with systemd-resolved.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -a Register per-interface DNS server and domain data\n" + " -d Unregister per-interface DNS server and domain data\n" + " -f Ignore if specified interface does not exist\n" + " -x Send DNS traffic preferably over this interface\n" + "\n" + "This is a compatibility alias for the resolvectl(1) tool, providing native\n" + "command line compatibility with the resolvconf(8) tool of various Linux\n" + "distributions and BSD systems. Some options supported by other implementations\n" + "are not supported and are ignored: -m, -p, -u. 
Various options supported by other\n" + "implementations are not supported and will cause the invocation to fail:\n" + "-I, -i, -l, -R, -r, -v, -V, --enable-updates, --disable-updates,\n" + "--updates-are-enabled.\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +/* Splits string into words with extract_first_word() (no extraction flags) and pushes each word onto arg_set_dns. Returns 0 once no words remain, the negative extract_first_word() result on failure, or the result of log_oom() if strv_push() fails. */ static int parse_nameserver(const char *string) { + int r; + + assert(string); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&string, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + if (strv_push(&arg_set_dns, word) < 0) + return log_oom(); + + word = NULL; /* presumably strv_push() stored the pointer itself, so it is cleared to keep _cleanup_free_ from releasing it; confirm against strv.h */ + } + + return 0; +} + +/* Same as parse_nameserver(), but extracts words with EXTRACT_UNQUOTE and pushes them onto arg_set_domain. */ static int parse_search_domain(const char *string) { + int r; + + assert(string); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&string, &word, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return r; + if (r == 0) + break; + + if (strv_push(&arg_set_domain, word) < 0) + return log_oom(); + + word = NULL; /* presumably now referenced by arg_set_domain, see parse_nameserver(); confirm against strv.h */ + } + + return 0; +} + +/* Parses a resolvconf-style command line: -a selects MODE_SET_LINK, -d selects MODE_REVERT_LINK, and exactly one positional argument is handed to ifname_resolvconf_mangle(). In MODE_SET_LINK the stdin stream is read line by line: "nameserver" lines go to parse_nameserver(), "domain" and "search" lines go to parse_search_domain(), other non-comment lines are logged and ignored. Returns 1 when there is work to do; otherwise returns the result of resolvconf_help() or version(), 0 for -u, the (<= 0) result of ifname_resolvconf_mangle(), -EINVAL for an option getopt_long() rejects, or whatever log_error_errno() or log_oom() return on the failure paths. */ int resolvconf_parse_argv(int argc, char *argv[]) { + + /* identifiers for long options without a short form; they start at 0x100, above any single-character option value */ enum { + ARG_VERSION = 0x100, + ARG_ENABLE_UPDATES, + ARG_DISABLE_UPDATES, + ARG_UPDATES_ARE_ENABLED, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + + /* The following are specific to Debian's original resolvconf */ + { "enable-updates", no_argument, NULL, ARG_ENABLE_UPDATES }, + { "disable-updates", no_argument, NULL, ARG_DISABLE_UPDATES }, + { "updates-are-enabled", no_argument, NULL, ARG_UPDATES_ARE_ENABLED }, + {} + }; + + enum { + TYPE_REGULAR, + TYPE_PRIVATE, /* -p: Not supported, treated identically to TYPE_REGULAR */ + TYPE_EXCLUSIVE, /* -x */ + } type = TYPE_REGULAR; + + int c, r; + + assert(argc >= 0); + assert(argv); + + /* openresolv checks these environment variables */ + if (getenv("IF_EXCLUSIVE")) + type = TYPE_EXCLUSIVE; + /* tested second, so IF_PRIVATE wins when both variables are set */ if (getenv("IF_PRIVATE")) + type = TYPE_PRIVATE; /* not actually supported */ + + /* reset the mode so that a missing -a/-d can be detected after option parsing */ arg_mode = 
_MODE_INVALID; + + while ((c = getopt_long(argc, argv, "hadxpfm:uIi:l:Rr:vV", options, NULL)) >= 0) + switch (c) { + + case 'h': + return resolvconf_help(); + + case ARG_VERSION: + return version(); + + /* -a and -d is what everybody can agree on */ + case 'a': + arg_mode = MODE_SET_LINK; + break; + + case 'd': + arg_mode = MODE_REVERT_LINK; + break; + + /* The exclusive/private/force stuff is an openresolv invention, we support in some skewed way */ + case 'x': + type = TYPE_EXCLUSIVE; + break; + + case 'p': + type = TYPE_PRIVATE; /* not actually supported */ + break; + + case 'f': + arg_ifindex_permissive = true; + break; + + /* The metrics stuff is an openresolv invention we ignore (and don't really need) */ + case 'm': + log_debug("Switch -%c ignored.", c); + break; + + /* -u supposedly should "update all subscribers". We have no subscribers, hence let's make + this a NOP, and exit immediately, cleanly. */ + case 'u': + log_info("Switch -%c ignored.", c); + return 0; + + /* The following options are openresolv inventions we don't support. */ + case 'I': + case 'i': + case 'l': + case 'R': + case 'r': + case 'v': + case 'V': + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Switch -%c not supported.", c); + + /* The Debian resolvconf commands we don't support. 
*/ + case ARG_ENABLE_UPDATES: + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Switch --enable-updates not supported."); + case ARG_DISABLE_UPDATES: + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Switch --disable-updates not supported."); + case ARG_UPDATES_ARE_ENABLED: + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Switch --updates-are-enabled not supported."); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_mode == _MODE_INVALID) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Expected either -a or -d on the command line."); + + /* exactly one positional argument must remain after the options */ if (optind+1 != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Expected interface name as argument."); + + r = ifname_resolvconf_mangle(argv[optind]); + if (r <= 0) + return r; + + optind++; + + if (arg_mode == MODE_SET_LINK) { + unsigned n = 0; /* count of lines read so far, passed to log_syntax() below */ + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *a; + + r = read_stripped_line(stdin, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read from stdin: %m"); + if (r == 0) + break; + + n++; + + /* skip lines whose first character is '#' or ';', and empty lines */ if (IN_SET(*line, '#', ';', 0)) + continue; + + a = first_word(line, "nameserver"); + if (a) { + (void) parse_nameserver(a); /* result deliberately discarded */ + continue; + } + + a = first_word(line, "domain"); + if (!a) + a = first_word(line, "search"); + if (a) { + (void) parse_search_domain(a); /* result deliberately discarded */ + continue; + } + + log_syntax(NULL, LOG_DEBUG, "stdin", n, 0, "Ignoring resolv.conf line: %s", line); + } + + if (type == TYPE_EXCLUSIVE) { + + /* If -x mode is selected, let's preferably route non-suffixed lookups to this interface. 
This + * somewhat matches the original -x behaviour */ + + r = strv_extend(&arg_set_domain, "~."); + if (r < 0) + return log_oom(); + + } else if (type == TYPE_PRIVATE) + log_debug("Private DNS server data not supported, ignoring."); + + if (!arg_set_dns) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "No DNS servers specified, refusing operation."); + } + + return 1; /* work to do */ +} diff --git a/src/resolve/resolvconf-compat.h b/src/resolve/resolvconf-compat.h new file mode 100644 index 0000000..33a5318 --- /dev/null +++ b/src/resolve/resolvconf-compat.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* Command line parser for the resolvconf compatibility mode; defined in resolvconf-compat.c, where the final return of 1 is marked "work to do". */ int resolvconf_parse_argv(int argc, char *argv[]); diff --git a/src/resolve/resolvectl.c b/src/resolve/resolvectl.c new file mode 100644 index 0000000..afa537f --- /dev/null +++ b/src/resolve/resolvectl.c @@ -0,0 +1,4076 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-bus.h" +#include "sd-netlink.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "build.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-message-util.h" +#include "dns-domain.h" +#include "errno-list.h" +#include "escape.h" +#include "format-table.h" +#include "format-util.h" +#include "gcrypt-util.h" +#include "hostname-util.h" +#include "json.h" +#include "main-func.h" +#include "missing_network.h" +#include "netlink-util.h" +#include "openssl-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "resolvconf-compat.h" +#include "resolve-util.h" +#include "resolvectl.h" +#include "resolved-def.h" +#include "resolved-dns-packet.h" +#include "resolved-util.h" +#include "socket-netlink.h" +#include "sort-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "terminal-util.h" +#include "utf8.h" 
+#include "varlink.h" +#include "verb-log-control.h" +#include "verbs.h" + +static int arg_family = AF_UNSPEC; +static int arg_ifindex = 0; +static char *arg_ifname = NULL; +static uint16_t arg_type = 0; +static uint16_t arg_class = 0; +static bool arg_legend = true; +static uint64_t arg_flags = 0; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; +static PagerFlags arg_pager_flags = 0; +bool arg_ifindex_permissive = false; /* If true, don't generate an error if the specified interface index doesn't exist */ +static const char *arg_service_family = NULL; + +typedef enum RawType { + RAW_NONE, + RAW_PAYLOAD, + RAW_PACKET, +} RawType; +static RawType arg_raw = RAW_NONE; + +ExecutionMode arg_mode = MODE_RESOLVE_HOST; + +char **arg_set_dns = NULL; +char **arg_set_domain = NULL; +static const char *arg_set_llmnr = NULL; +static const char *arg_set_mdns = NULL; +static const char *arg_set_dns_over_tls = NULL; +static const char *arg_set_dnssec = NULL; +static char **arg_set_nta = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_ifname, freep); +STATIC_DESTRUCTOR_REGISTER(arg_set_dns, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_set_domain, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_set_nta, strv_freep); + +typedef enum StatusMode { + STATUS_ALL, + STATUS_DNS, + STATUS_DOMAIN, + STATUS_DEFAULT_ROUTE, + STATUS_LLMNR, + STATUS_MDNS, + STATUS_PRIVATE, + STATUS_DNSSEC, + STATUS_NTA, +} StatusMode; + +typedef struct InterfaceInfo { + int index; + const char *name; +} InterfaceInfo; + +static int interface_info_compare(const InterfaceInfo *a, const InterfaceInfo *b) { + int r; + + r = CMP(a->index, b->index); + if (r != 0) + return r; + + return strcmp_ptr(a->name, b->name); +} + +int ifname_mangle_full(const char *s, bool drop_protocol_specifier) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_strv_free_ char **found = NULL; + int r; + + assert(s); + + if (drop_protocol_specifier) { + _cleanup_free_ char *buf = NULL; + int ifindex_longest_name = 
-ENODEV; + + /* When invoked as resolvconf, drop the protocol specifier(s) at the end. */ + + buf = strdup(s); + if (!buf) + return log_oom(); + + for (;;) { + r = rtnl_resolve_interface(&rtnl, buf); + if (r > 0) { + if (ifindex_longest_name <= 0) + ifindex_longest_name = r; + + r = strv_extend(&found, buf); + if (r < 0) + return log_oom(); + } + + char *dot = strrchr(buf, '.'); + if (!dot) + break; + + *dot = '\0'; + } + + unsigned n = strv_length(found); + if (n > 1) { + _cleanup_free_ char *joined = NULL; + + joined = strv_join(found, ", "); + log_warning("Found multiple interfaces (%s) matching with '%s'. Using '%s' (ifindex=%i).", + strna(joined), s, found[0], ifindex_longest_name); + + } else if (n == 1) { + const char *proto; + + proto = ASSERT_PTR(startswith(s, found[0])); + if (!isempty(proto)) + log_info("Dropped protocol specifier '%s' from '%s'. Using '%s' (ifindex=%i).", + proto, s, found[0], ifindex_longest_name); + } + + r = ifindex_longest_name; + } else + r = rtnl_resolve_interface(&rtnl, s); + if (r < 0) { + if (ERRNO_IS_DEVICE_ABSENT(r) && arg_ifindex_permissive) { + log_debug_errno(r, "Interface '%s' not found, but -f specified, ignoring: %m", s); + return 0; /* done */ + } + return log_error_errno(r, "Failed to resolve interface \"%s\": %m", s); + } + + if (arg_ifindex > 0 && arg_ifindex != r) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified multiple different interfaces. Refusing."); + + arg_ifindex = r; + return free_and_strdup_warn(&arg_ifname, found ? found[0] : s); /* found */ +} + +static void print_source(uint64_t flags, usec_t rtt) { + if (!arg_legend) + return; + + if (flags == 0) + return; + + printf("\n%s-- Information acquired via", ansi_grey()); + + printf(" protocol%s%s%s%s%s", + flags & SD_RESOLVED_DNS ? " DNS" :"", + flags & SD_RESOLVED_LLMNR_IPV4 ? " LLMNR/IPv4" : "", + flags & SD_RESOLVED_LLMNR_IPV6 ? " LLMNR/IPv6" : "", + flags & SD_RESOLVED_MDNS_IPV4 ? " mDNS/IPv4" : "", + flags & SD_RESOLVED_MDNS_IPV6 ? 
" mDNS/IPv6" : ""); + + printf(" in %s.%s\n" + "%s-- Data is authenticated: %s; Data was acquired via local or encrypted transport: %s%s\n", + FORMAT_TIMESPAN(rtt, 100), + ansi_normal(), + ansi_grey(), + yes_no(flags & SD_RESOLVED_AUTHENTICATED), + yes_no(flags & SD_RESOLVED_CONFIDENTIAL), + ansi_normal()); + + if ((flags & (SD_RESOLVED_FROM_MASK|SD_RESOLVED_SYNTHETIC)) != 0) + printf("%s-- Data from:%s%s%s%s%s%s\n", + ansi_grey(), + FLAGS_SET(flags, SD_RESOLVED_SYNTHETIC) ? " synthetic" : "", + FLAGS_SET(flags, SD_RESOLVED_FROM_CACHE) ? " cache" : "", + FLAGS_SET(flags, SD_RESOLVED_FROM_ZONE) ? " zone" : "", + FLAGS_SET(flags, SD_RESOLVED_FROM_TRUST_ANCHOR) ? " trust-anchor" : "", + FLAGS_SET(flags, SD_RESOLVED_FROM_NETWORK) ? " network" : "", + ansi_normal()); +} + +static void print_ifindex_comment(int printed_so_far, int ifindex) { + char ifname[IF_NAMESIZE]; + int r; + + if (ifindex <= 0) + return; + + r = format_ifname(ifindex, ifname); + if (r < 0) + return (void) log_warning_errno(r, "Failed to resolve interface name for index %i, ignoring: %m", ifindex); + + printf("%*s%s-- link: %s%s", + 60 > printed_so_far ? 
60 - printed_so_far : 0, " ", /* Align comment to the 60th column */ + ansi_grey(), ifname, ansi_normal()); +} + +static int resolve_host_error(const char *name, int r, const sd_bus_error *error) { + if (sd_bus_error_has_name(error, BUS_ERROR_DNS_NXDOMAIN)) + return log_error_errno(r, "%s: %s", name, bus_error_message(error, r)); + + return log_error_errno(r, "%s: resolve call failed: %s", name, bus_error_message(error, r)); +} + +static int resolve_host(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *canonical = NULL; + unsigned c = 0; + uint64_t flags; + usec_t ts; + int r; + + assert(name); + + log_debug("Resolving %s (family %s, interface %s).", name, af_to_name(arg_family) ?: "*", isempty(arg_ifname) ? "*" : arg_ifname); + + r = bus_message_new_method_call(bus, &req, bus_resolve_mgr, "ResolveHostname"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "isit", arg_ifindex, name, arg_family, arg_flags); + if (r < 0) + return bus_log_create_error(r); + + ts = now(CLOCK_MONOTONIC); + + r = sd_bus_call(bus, req, SD_RESOLVED_QUERY_TIMEOUT_USEC, &error, &reply); + if (r < 0) + return resolve_host_error(name, r, &error); + + ts = now(CLOCK_MONOTONIC) - ts; + + r = sd_bus_message_enter_container(reply, 'a', "(iiay)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_enter_container(reply, 'r', "iiay")) > 0) { + _cleanup_free_ char *pretty = NULL; + int ifindex, family, k; + union in_addr_union a; + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(reply, "i", &ifindex); + if (r < 0) + return bus_log_parse_error(r); + + sd_bus_error_free(&error); + r = bus_message_read_in_addr_auto(reply, &error, &family, &a); + if (r < 0 && !sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)) + return log_error_errno(r, "%s: systemd-resolved returned 
invalid result: %s", name, bus_error_message(&error, r)); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)) { + log_debug_errno(r, "%s: systemd-resolved returned invalid result, ignoring: %s", name, bus_error_message(&error, r)); + continue; + } + + r = in_addr_ifindex_to_string(family, &a, ifindex, &pretty); + if (r < 0) + return log_error_errno(r, "Failed to print address for %s: %m", name); + + k = printf("%*s%s %s%s%s", + (int) strlen(name), c == 0 ? name : "", c == 0 ? ":" : " ", + ansi_highlight(), pretty, ansi_normal()); + + print_ifindex_comment(k, ifindex); + fputc('\n', stdout); + + c++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(reply, "st", &canonical, &flags); + if (r < 0) + return bus_log_parse_error(r); + + if (!streq(name, canonical)) + printf("%*s%s (%s)\n", + (int) strlen(name), c == 0 ? name : "", c == 0 ? 
":" : " ", + canonical); + + if (c == 0) + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "%s: no addresses found", name); + + print_source(flags, ts); + + return 0; +} + +static int resolve_address(sd_bus *bus, int family, const union in_addr_union *address, int ifindex) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *pretty = NULL; + uint64_t flags; + unsigned c = 0; + usec_t ts; + int r; + + assert(bus); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(address); + + if (ifindex <= 0) + ifindex = arg_ifindex; + + r = in_addr_ifindex_to_string(family, address, ifindex, &pretty); + if (r < 0) + return log_oom(); + + log_debug("Resolving %s.", pretty); + + r = bus_message_new_method_call(bus, &req, bus_resolve_mgr, "ResolveAddress"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "ii", ifindex, family); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_array(req, 'y', address, FAMILY_ADDRESS_SIZE(family)); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "t", arg_flags); + if (r < 0) + return bus_log_create_error(r); + + ts = now(CLOCK_MONOTONIC); + + r = sd_bus_call(bus, req, SD_RESOLVED_QUERY_TIMEOUT_USEC, &error, &reply); + if (r < 0) + return log_error_errno(r, "%s: resolve call failed: %s", pretty, bus_error_message(&error, r)); + + ts = now(CLOCK_MONOTONIC) - ts; + + r = sd_bus_message_enter_container(reply, 'a', "(is)"); + if (r < 0) + return bus_log_create_error(r); + + while ((r = sd_bus_message_enter_container(reply, 'r', "is")) > 0) { + const char *n; + int k; + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(reply, "is", &ifindex, &n); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return r; + + k = printf("%*s%s %s%s%s", + (int) strlen(pretty), c == 0 ? 
pretty : "", + c == 0 ? ":" : " ", + ansi_highlight(), n, ansi_normal()); + + print_ifindex_comment(k, ifindex); + fputc('\n', stdout); + + c++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(reply, "t", &flags); + if (r < 0) + return bus_log_parse_error(r); + + if (c == 0) + return log_error_errno(SYNTHETIC_ERRNO(ESRCH), + "%s: no names found", pretty); + + print_source(flags, ts); + + return 0; +} + +static int output_rr_packet(const void *d, size_t l, int ifindex) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + int r; + + r = dns_resource_record_new_from_raw(&rr, d, l); + if (r < 0) + return log_error_errno(r, "Failed to parse RR: %m"); + + if (arg_raw == RAW_PAYLOAD) { + void *data; + ssize_t k; + + k = dns_resource_record_payload(rr, &data); + if (k < 0) + return log_error_errno(k, "Cannot dump RR: %m"); + fwrite(data, 1, k, stdout); + } else { + const char *s; + int k; + + s = dns_resource_record_to_string(rr); + if (!s) + return log_oom(); + + k = printf("%s", s); + print_ifindex_comment(k, ifindex); + fputc('\n', stdout); + } + + return 0; +} + +static int idna_candidate(const char *name, char **ret) { + _cleanup_free_ char *idnafied = NULL; + int r; + + assert(name); + assert(ret); + + r = dns_name_apply_idna(name, &idnafied); + if (r < 0) + return log_error_errno(r, "Failed to apply IDNA to name '%s': %m", name); + if (r > 0 && !streq(name, idnafied)) { + *ret = TAKE_PTR(idnafied); + return true; + } + + *ret = NULL; + return false; +} + +static bool single_label_nonsynthetic(const char *name) { + _cleanup_free_ char *first_label = NULL; + int r; + + if (!dns_name_is_single_label(name)) + return false; + + if (is_localhost(name) || + is_gateway_hostname(name) || + is_outbound_hostname(name) || + is_dns_stub_hostname(name) || + is_dns_proxy_stub_hostname(name)) + return false; + + r = 
resolve_system_hostname(NULL, &first_label); + if (r < 0) { + log_warning_errno(r, "Failed to determine the hostname: %m"); + return false; + } + + return !streq(name, first_label); +} + +static int resolve_record(sd_bus *bus, const char *name, uint16_t class, uint16_t type, bool warn_missing) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *idnafied = NULL; + bool needs_authentication = false; + unsigned n = 0; + uint64_t flags; + usec_t ts; + int r; + + assert(name); + + log_debug("Resolving %s %s %s (interface %s).", name, dns_class_to_string(class), dns_type_to_string(type), isempty(arg_ifname) ? "*" : arg_ifname); + + if (dns_name_dot_suffixed(name) == 0 && single_label_nonsynthetic(name)) + log_notice("(Note that search domains are not appended when --type= is specified. " + "Please specify fully qualified domain names, or remove --type= switch from invocation in order to request regular hostname resolution.)"); + + r = idna_candidate(name, &idnafied); + if (r < 0) + return r; + if (r > 0) + log_notice("(Note that IDNA translation is not applied when --type= is specified. " + "Please specify translated domain names — i.e. 
'%s' — when resolving raw records, or remove --type= switch from invocation in order to request regular hostname resolution.", + idnafied); + + r = bus_message_new_method_call(bus, &req, bus_resolve_mgr, "ResolveRecord"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "isqqt", arg_ifindex, name, class, type, arg_flags); + if (r < 0) + return bus_log_create_error(r); + + ts = now(CLOCK_MONOTONIC); + + r = sd_bus_call(bus, req, SD_RESOLVED_QUERY_TIMEOUT_USEC, &error, &reply); + if (r < 0) { + if (warn_missing || r != -ENXIO) + log_error("%s: resolve call failed: %s", name, bus_error_message(&error, r)); + return r; + } + + ts = now(CLOCK_MONOTONIC) - ts; + + r = sd_bus_message_enter_container(reply, 'a', "(iqqay)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_enter_container(reply, 'r', "iqqay")) > 0) { + uint16_t c, t; + int ifindex; + const void *d; + size_t l; + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(reply, "iqq", &ifindex, &c, &t); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(reply, 'y', &d, &l); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (arg_raw == RAW_PACKET) { + uint64_t u64 = htole64(l); + + fwrite(&u64, sizeof(u64), 1, stdout); + fwrite(d, 1, l, stdout); + } else { + r = output_rr_packet(d, l, ifindex); + if (r < 0) + return r; + } + + if (dns_type_needs_authentication(t)) + needs_authentication = true; + + n++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(reply, "t", &flags); + if (r < 0) + return bus_log_parse_error(r); + + if (n == 0) { + if (warn_missing) + log_error("%s: no records found", name); + return -ESRCH; + } + + print_source(flags, ts); + + if ((flags & SD_RESOLVED_AUTHENTICATED) == 0 
&& needs_authentication) { + fflush(stdout); + + fprintf(stderr, "\n%s" + "WARNING: The resources shown contain cryptographic key data which could not be\n" + " authenticated. It is not suitable to authenticate any communication.\n" + " This is usually indication that DNSSEC authentication was not enabled\n" + " or is not available for the selected protocol or DNS servers.%s\n", + ansi_highlight_red(), + ansi_normal()); + } + + return 0; +} + +static int resolve_rfc4501(sd_bus *bus, const char *name) { + uint16_t type = 0, class = 0; + const char *p, *q, *n; + int r; + + assert(bus); + assert(name); + assert(startswith(name, "dns:")); + + /* Parse RFC 4501 dns: URIs */ + + p = name + 4; + + if (p[0] == '/') { + const char *e; + + if (p[1] != '/') + goto invalid; + + e = strchr(p + 2, '/'); + if (!e) + goto invalid; + + if (e != p + 2) + log_warning("DNS authority specification not supported; ignoring specified authority."); + + p = e + 1; + } + + q = strchr(p, '?'); + if (q) { + n = strndupa_safe(p, q - p); + q++; + + for (;;) { + const char *f; + + f = startswith_no_case(q, "class="); + if (f) { + _cleanup_free_ char *t = NULL; + const char *e; + + if (class != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "DNS class specified twice."); + + e = strchrnul(f, ';'); + t = strndup(f, e - f); + if (!t) + return log_oom(); + + r = dns_class_from_string(t); + if (r < 0) + return log_error_errno(r, "Unknown DNS class %s.", t); + + class = r; + + if (*e == ';') { + q = e + 1; + continue; + } + + break; + } + + f = startswith_no_case(q, "type="); + if (f) { + _cleanup_free_ char *t = NULL; + const char *e; + + if (type != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "DNS type specified twice."); + + e = strchrnul(f, ';'); + t = strndup(f, e - f); + if (!t) + return log_oom(); + + r = dns_type_from_string(t); + if (r < 0) + return log_error_errno(r, "Unknown DNS type %s: %m", t); + + type = r; + + if (*e == ';') { + q = e + 1; + continue; + } + + break; + 
} + + goto invalid; + } + } else + n = p; + + if (class == 0) + class = arg_class ?: DNS_CLASS_IN; + if (type == 0) + type = arg_type ?: DNS_TYPE_A; + + return resolve_record(bus, n, class, type, true); + +invalid: + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid DNS URI: %s", name); +} + +static int verb_query(int argc, char **argv, void *userdata) { + sd_bus *bus = userdata; + int q, r = 0; + + if (arg_type != 0) + STRV_FOREACH(p, argv + 1) { + q = resolve_record(bus, *p, arg_class, arg_type, true); + if (q < 0) + r = q; + } + + else + STRV_FOREACH(p, argv + 1) { + if (startswith(*p, "dns:")) + q = resolve_rfc4501(bus, *p); + else { + int family, ifindex; + union in_addr_union a; + + q = in_addr_ifindex_from_string_auto(*p, &family, &a, &ifindex); + if (q >= 0) + q = resolve_address(bus, family, &a, ifindex); + else + q = resolve_host(bus, *p); + } + if (q < 0) + r = q; + } + + return r; +} + +static int resolve_service(sd_bus *bus, const char *name, const char *type, const char *domain) { + const char *canonical_name, *canonical_type, *canonical_domain; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + size_t indent, sz; + uint64_t flags; + const char *p; + unsigned c; + usec_t ts; + int r; + + assert(bus); + assert(domain); + + name = empty_to_null(name); + type = empty_to_null(type); + + if (name) + log_debug("Resolving service \"%s\" of type %s in %s (family %s, interface %s).", name, type, domain, af_to_name(arg_family) ?: "*", isempty(arg_ifname) ? "*" : arg_ifname); + else if (type) + log_debug("Resolving service type %s of %s (family %s, interface %s).", type, domain, af_to_name(arg_family) ?: "*", isempty(arg_ifname) ? "*" : arg_ifname); + else + log_debug("Resolving service type %s (family %s, interface %s).", domain, af_to_name(arg_family) ?: "*", isempty(arg_ifname) ? 
"*" : arg_ifname); + + r = bus_message_new_method_call(bus, &req, bus_resolve_mgr, "ResolveService"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "isssit", arg_ifindex, name, type, domain, arg_family, arg_flags); + if (r < 0) + return bus_log_create_error(r); + + ts = now(CLOCK_MONOTONIC); + + r = sd_bus_call(bus, req, SD_RESOLVED_QUERY_TIMEOUT_USEC, &error, &reply); + if (r < 0) + return log_error_errno(r, "Resolve call failed: %s", bus_error_message(&error, r)); + + ts = now(CLOCK_MONOTONIC) - ts; + + r = sd_bus_message_enter_container(reply, 'a', "(qqqsa(iiay)s)"); + if (r < 0) + return bus_log_parse_error(r); + + indent = + (name ? strlen(name) + 1 : 0) + + (type ? strlen(type) + 1 : 0) + + strlen(domain) + 2; + + c = 0; + while ((r = sd_bus_message_enter_container(reply, 'r', "qqqsa(iiay)s")) > 0) { + uint16_t priority, weight, port; + const char *hostname, *canonical; + + r = sd_bus_message_read(reply, "qqqs", &priority, &weight, &port, &hostname); + if (r < 0) + return bus_log_parse_error(r); + + if (name) + printf("%*s%s", (int) strlen(name), c == 0 ? name : "", c == 0 ? "/" : " "); + if (type) + printf("%*s%s", (int) strlen(type), c == 0 ? type : "", c == 0 ? "/" : " "); + + printf("%*s%s %s:%u [priority=%u, weight=%u]\n", + (int) strlen(domain), c == 0 ? domain : "", + c == 0 ? 
":" : " ", + hostname, port, + priority, weight); + + r = sd_bus_message_enter_container(reply, 'a', "(iiay)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_enter_container(reply, 'r', "iiay")) > 0) { + _cleanup_free_ char *pretty = NULL; + int ifindex, family, k; + union in_addr_union a; + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(reply, "i", &ifindex); + if (r < 0) + return bus_log_parse_error(r); + + sd_bus_error_free(&error); + r = bus_message_read_in_addr_auto(reply, &error, &family, &a); + if (r < 0 && !sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)) + return log_error_errno(r, "%s: systemd-resolved returned invalid result: %s", name, bus_error_message(&error, r)); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + if (sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)) { + log_debug_errno(r, "%s: systemd-resolved returned invalid result, ignoring: %s", name, bus_error_message(&error, r)); + continue; + } + + r = in_addr_ifindex_to_string(family, &a, ifindex, &pretty); + if (r < 0) + return log_error_errno(r, "Failed to print address for %s: %m", name); + + k = printf("%*s%s", (int) indent, "", pretty); + print_ifindex_comment(k, ifindex); + fputc('\n', stdout); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(reply, "s", &canonical); + if (r < 0) + return bus_log_parse_error(r); + + if (!streq(hostname, canonical)) + printf("%*s(%s)\n", (int) indent, "", canonical); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + c++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_enter_container(reply, 'a', "ay"); + if (r < 0) + return bus_log_parse_error(r); + + while 
((r = sd_bus_message_read_array(reply, 'y', (const void**) &p, &sz)) > 0) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape_length(p, sz); + if (!escaped) + return log_oom(); + + printf("%*s%s\n", (int) indent, "", escaped); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(reply, "ssst", &canonical_name, &canonical_type, &canonical_domain, &flags); + if (r < 0) + return bus_log_parse_error(r); + + canonical_name = empty_to_null(canonical_name); + canonical_type = empty_to_null(canonical_type); + + if (!streq_ptr(name, canonical_name) || + !streq_ptr(type, canonical_type) || + !streq_ptr(domain, canonical_domain)) { + + printf("%*s(", (int) indent, ""); + + if (canonical_name) + printf("%s/", canonical_name); + if (canonical_type) + printf("%s/", canonical_type); + + printf("%s)\n", canonical_domain); + } + + print_source(flags, ts); + + return 0; +} + +static int verb_service(int argc, char **argv, void *userdata) { + sd_bus *bus = userdata; + + if (argc == 2) + return resolve_service(bus, NULL, NULL, argv[1]); + else if (argc == 3) + return resolve_service(bus, NULL, argv[1], argv[2]); + else + return resolve_service(bus, argv[1], argv[2], argv[3]); +} + +static int resolve_openpgp(sd_bus *bus, const char *address) { + const char *domain, *full; + int r; + _cleanup_free_ char *hashed = NULL; + + assert(bus); + assert(address); + + domain = strrchr(address, '@'); + if (!domain) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Address does not contain '@': \"%s\"", address); + if (domain == address || domain[1] == '\0') + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Address starts or ends with '@': \"%s\"", address); + domain++; + + r = string_hashsum_sha256(address, domain - 1 - address, &hashed); + if (r < 0) + return log_error_errno(r, "Hashing failed: %m"); + + strshorten(hashed, 56); + + full = strjoina(hashed, 
"._openpgpkey.", domain); + log_debug("Looking up \"%s\".", full); + + r = resolve_record(bus, full, + arg_class ?: DNS_CLASS_IN, + arg_type ?: DNS_TYPE_OPENPGPKEY, false); + + if (IN_SET(r, -ENXIO, -ESRCH)) { /* NXDOMAIN or NODATA? */ + hashed = mfree(hashed); + r = string_hashsum_sha224(address, domain - 1 - address, &hashed); + if (r < 0) + return log_error_errno(r, "Hashing failed: %m"); + + full = strjoina(hashed, "._openpgpkey.", domain); + log_debug("Looking up \"%s\".", full); + + return resolve_record(bus, full, + arg_class ?: DNS_CLASS_IN, + arg_type ?: DNS_TYPE_OPENPGPKEY, true); + } + + return r; +} + +static int verb_openpgp(int argc, char **argv, void *userdata) { + sd_bus *bus = userdata; + int q, r = 0; + + STRV_FOREACH(p, argv + 1) { + q = resolve_openpgp(bus, *p); + if (q < 0) + r = q; + } + + return r; +} + +static int resolve_tlsa(sd_bus *bus, const char *family, const char *address) { + const char *port; + uint16_t port_num = 443; + _cleanup_free_ char *full = NULL; + int r; + + assert(bus); + assert(address); + + port = strrchr(address, ':'); + if (port) { + r = parse_ip_port(port + 1, &port_num); + if (r < 0) + return log_error_errno(r, "Invalid port \"%s\".", port + 1); + + address = strndupa_safe(address, port - address); + } + + r = asprintf(&full, "_%u._%s.%s", + port_num, + family, + address); + if (r < 0) + return log_oom(); + + log_debug("Looking up \"%s\".", full); + + return resolve_record(bus, full, + arg_class ?: DNS_CLASS_IN, + arg_type ?: DNS_TYPE_TLSA, true); +} + +static bool service_family_is_valid(const char *s) { + return STR_IN_SET(s, "tcp", "udp", "sctp"); +} + +static int verb_tlsa(int argc, char **argv, void *userdata) { + sd_bus *bus = userdata; + char **args = argv + 1; + const char *family = "tcp"; + int q, r = 0; + + if (service_family_is_valid(argv[1])) { + family = argv[1]; + args++; + } + + STRV_FOREACH(p, args) { + q = resolve_tlsa(bus, family, *p); + if (q < 0) + r = q; + } + + return r; +} + +static int 
show_statistics(int argc, char **argv, void *userdata) { + _cleanup_(table_unrefp) Table *table = NULL; + JsonVariant *reply = NULL; + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r; + + r = varlink_connect_address(&vl, "/run/systemd/resolve/io.systemd.Resolve.Monitor"); + if (r < 0) + return log_error_errno(r, "Failed to connect to query monitoring service /run/systemd/resolve/io.systemd.Resolve.Monitor: %m"); + + r = varlink_call(vl, "io.systemd.Resolve.Monitor.DumpStatistics", NULL, &reply, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to issue DumpStatistics() varlink call: %m"); + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + return json_variant_dump(reply, arg_json_format_flags, NULL, NULL); + + struct statistics { + JsonVariant *transactions; + JsonVariant *cache; + JsonVariant *dnssec; + } statistics; + + static const JsonDispatch statistics_dispatch_table[] = { + { "transactions", JSON_VARIANT_OBJECT, json_dispatch_variant_noref, offsetof(struct statistics, transactions), JSON_MANDATORY }, + { "cache", JSON_VARIANT_OBJECT, json_dispatch_variant_noref, offsetof(struct statistics, cache), JSON_MANDATORY }, + { "dnssec", JSON_VARIANT_OBJECT, json_dispatch_variant_noref, offsetof(struct statistics, dnssec), JSON_MANDATORY }, + {}, + }; + + r = json_dispatch(reply, statistics_dispatch_table, JSON_LOG, &statistics); + if (r < 0) + return r; + + struct transactions { + uint64_t n_current_transactions; + uint64_t n_transactions_total; + uint64_t n_timeouts_total; + uint64_t n_timeouts_served_stale_total; + uint64_t n_failure_responses_total; + uint64_t n_failure_responses_served_stale_total; + } transactions; + + static const JsonDispatch transactions_dispatch_table[] = { + { "currentTransactions", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct transactions, n_current_transactions), JSON_MANDATORY }, + { "totalTransactions", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct transactions, 
n_transactions_total), JSON_MANDATORY }, + { "totalTimeouts", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct transactions, n_timeouts_total), JSON_MANDATORY }, + { "totalTimeoutsServedStale", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct transactions, n_timeouts_served_stale_total), JSON_MANDATORY }, + { "totalFailedResponses", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct transactions, n_failure_responses_total), JSON_MANDATORY }, + { "totalFailedResponsesServedStale", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct transactions, n_failure_responses_served_stale_total), JSON_MANDATORY }, + {}, + }; + + r = json_dispatch(statistics.transactions, transactions_dispatch_table, JSON_LOG, &transactions); + if (r < 0) + return r; + + struct cache { + uint64_t cache_size; + uint64_t n_cache_hit; + uint64_t n_cache_miss; + } cache; + + static const JsonDispatch cache_dispatch_table[] = { + { "size", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct cache, cache_size), JSON_MANDATORY }, + { "hits", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct cache, n_cache_hit), JSON_MANDATORY }, + { "misses", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct cache, n_cache_miss), JSON_MANDATORY }, + {}, + }; + + r = json_dispatch(statistics.cache, cache_dispatch_table, JSON_LOG, &cache); + if (r < 0) + return r; + + struct dnsssec { + uint64_t n_dnssec_secure; + uint64_t n_dnssec_insecure; + uint64_t n_dnssec_bogus; + uint64_t n_dnssec_indeterminate; + } dnsssec; + + static const JsonDispatch dnssec_dispatch_table[] = { + { "secure", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct dnsssec, n_dnssec_secure), JSON_MANDATORY }, + { "insecure", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct dnsssec, n_dnssec_insecure), JSON_MANDATORY }, + { "bogus", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct dnsssec, 
n_dnssec_bogus), JSON_MANDATORY }, + { "indeterminate", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct dnsssec, n_dnssec_indeterminate), JSON_MANDATORY }, + {}, + }; + + r = json_dispatch(statistics.dnssec, dnssec_dispatch_table, JSON_LOG, &dnsssec); + if (r < 0) + return r; + + table = table_new_vertical(); + if (!table) + return log_oom(); + + r = table_add_many(table, + TABLE_STRING, "Transactions", + TABLE_SET_COLOR, ansi_highlight(), + TABLE_SET_ALIGN_PERCENT, 0, + TABLE_EMPTY, + TABLE_FIELD, "Current Transactions", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_UINT64, transactions.n_current_transactions, + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_FIELD, "Total Transactions", + TABLE_UINT64, transactions.n_transactions_total, + TABLE_EMPTY, TABLE_EMPTY, + TABLE_STRING, "Cache", + TABLE_SET_COLOR, ansi_highlight(), + TABLE_SET_ALIGN_PERCENT, 0, + TABLE_EMPTY, + TABLE_FIELD, "Current Cache Size", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_UINT64, cache.cache_size, + TABLE_FIELD, "Cache Hits", + TABLE_UINT64, cache.n_cache_hit, + TABLE_FIELD, "Cache Misses", + TABLE_UINT64, cache.n_cache_miss, + TABLE_EMPTY, TABLE_EMPTY, + TABLE_STRING, "Failure Transactions", + TABLE_SET_COLOR, ansi_highlight(), + TABLE_SET_ALIGN_PERCENT, 0, + TABLE_EMPTY, + TABLE_FIELD, "Total Timeouts", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_UINT64, transactions.n_timeouts_total, + TABLE_FIELD, "Total Timeouts (Stale Data Served)", + TABLE_UINT64, transactions.n_timeouts_served_stale_total, + TABLE_FIELD, "Total Failure Responses", + TABLE_UINT64, transactions.n_failure_responses_total, + TABLE_FIELD, "Total Failure Responses (Stale Data Served)", + TABLE_UINT64, transactions.n_failure_responses_served_stale_total, + TABLE_EMPTY, TABLE_EMPTY, + TABLE_STRING, "DNSSEC Verdicts", + TABLE_SET_COLOR, ansi_highlight(), + TABLE_SET_ALIGN_PERCENT, 0, + TABLE_EMPTY, + TABLE_FIELD, "Secure", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_UINT64, dnsssec.n_dnssec_secure, + TABLE_FIELD, "Insecure", + 
TABLE_UINT64, dnsssec.n_dnssec_insecure, + TABLE_FIELD, "Bogus", + TABLE_UINT64, dnsssec.n_dnssec_bogus, + TABLE_FIELD, "Indeterminate", + TABLE_UINT64, dnsssec.n_dnssec_indeterminate + ); + if (r < 0) + return table_log_add_error(r); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +static int reset_statistics(int argc, char **argv, void *userdata) { + JsonVariant *reply = NULL; + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r; + + r = varlink_connect_address(&vl, "/run/systemd/resolve/io.systemd.Resolve.Monitor"); + if (r < 0) + return log_error_errno(r, "Failed to connect to query monitoring service /run/systemd/resolve/io.systemd.Resolve.Monitor: %m"); + + r = varlink_call(vl, "io.systemd.Resolve.Monitor.ResetStatistics", NULL, &reply, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to issue ResetStatistics() varlink call: %m"); + + if (!FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + return json_variant_dump(reply, arg_json_format_flags, NULL, NULL); + + return 0; +} + +static int flush_caches(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + int r; + + r = bus_call_method(bus, bus_resolve_mgr, "FlushCaches", &error, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to flush caches: %s", bus_error_message(&error, r)); + + return 0; +} + +static int reset_server_features(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + int r; + + r = bus_call_method(bus, bus_resolve_mgr, "ResetServerFeatures", &error, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to reset server features: %s", bus_error_message(&error, r)); + + return 0; +} + +static int read_dns_server_one( + sd_bus_message *m, + bool with_ifindex, /* read "ifindex" reply that also carries an interface index */ + bool extended, /* 
read "extended" reply, i.e. with port number and server name */ + bool only_global, /* suppress entries with an (non-loopback) ifindex set (i.e. which are specific to some interface) */ + char **ret) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *pretty = NULL; + union in_addr_union a; + const char *name = NULL; + int32_t ifindex = 0; + int family, r, k; + uint16_t port = 0; + + assert(m); + assert(ret); + + r = sd_bus_message_enter_container( + m, + 'r', + with_ifindex ? (extended ? "iiayqs" : "iiay") : + (extended ? "iayqs" : "iay")); + if (r <= 0) + return r; + + if (with_ifindex) { + r = sd_bus_message_read(m, "i", &ifindex); + if (r < 0) + return r; + } + + k = bus_message_read_in_addr_auto(m, &error, &family, &a); + if (k < 0 && !sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)) + return k; + + if (extended) { + r = sd_bus_message_read(m, "q", &port); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "s", &name); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + if (k < 0) { + log_debug("Invalid DNS server, ignoring: %s", bus_error_message(&error, k)); + *ret = NULL; + return 1; + } + + if (only_global && ifindex > 0 && ifindex != LOOPBACK_IFINDEX) { + /* This one has an (non-loopback) ifindex set, and we were told to suppress those. Hence do so. */ + *ret = NULL; + return 1; + } + + r = in_addr_port_ifindex_name_to_string(family, &a, port, ifindex, name, &pretty); + if (r < 0) + return r; + + *ret = TAKE_PTR(pretty); + return 1; +} + +static int map_link_dns_servers_internal(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata, bool extended) { + char ***l = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', extended ? 
"(iayqs)" : "(iay)"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *pretty = NULL; + + r = read_dns_server_one(m, /* with_ifindex= */ false, extended, /* only_global= */ false, &pretty); + if (r < 0) + return r; + if (r == 0) + break; + + if (isempty(pretty)) + continue; + + r = strv_consume(l, TAKE_PTR(pretty)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int map_link_dns_servers(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + return map_link_dns_servers_internal(bus, member, m, error, userdata, false); +} + +static int map_link_dns_servers_ex(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + return map_link_dns_servers_internal(bus, member, m, error, userdata, true); +} + +static int map_link_current_dns_server(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + assert(m); + assert(userdata); + + return read_dns_server_one(m, /* with_ifindex= */ false, /* extended= */ false, /* only_global= */ false, userdata); +} + +static int map_link_current_dns_server_ex(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + assert(m); + assert(userdata); + + return read_dns_server_one(m, /* with_ifindex= */ false, /* extended= */ true, /* only_global= */ false, userdata); +} + +static int read_domain_one(sd_bus_message *m, bool with_ifindex, char **ret) { + _cleanup_free_ char *str = NULL; + int ifindex, route_only, r; + const char *domain; + + assert(m); + assert(ret); + + if (with_ifindex) + r = sd_bus_message_read(m, "(isb)", &ifindex, &domain, &route_only); + else + r = sd_bus_message_read(m, "(sb)", &domain, &route_only); + if (r <= 0) + return r; + + if (with_ifindex && ifindex != 0) { + /* only show the global ones here */ + *ret = NULL; + return 1; + } + + if (route_only) + str = strjoin("~", domain); + else 
+ str = strdup(domain); + if (!str) + return -ENOMEM; + + *ret = TAKE_PTR(str); + + return 1; +} + +static int map_link_domains(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + char ***l = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', "(sb)"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *pretty = NULL; + + r = read_domain_one(m, false, &pretty); + if (r < 0) + return r; + if (r == 0) + break; + + if (isempty(pretty)) + continue; + + r = strv_consume(l, TAKE_PTR(pretty)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int status_print_strv_ifindex(int ifindex, const char *ifname, char **p) { + const unsigned indent = strlen("Global: "); /* Use the same indentation everywhere to make things nice */ + int pos1, pos2; + + if (ifname) + printf("%s%nLink %i (%s)%n%s:", ansi_highlight(), &pos1, ifindex, ifname, &pos2, ansi_normal()); + else + printf("%s%nGlobal%n%s:", ansi_highlight(), &pos1, &pos2, ansi_normal()); + + size_t cols = columns(), position = pos2 - pos1 + 2; + + STRV_FOREACH(i, p) { + size_t our_len = utf8_console_width(*i); /* This returns -1 on invalid utf-8 (which shouldn't happen). + * If that happens, we'll just print one item per line. 
*/ + + if (position <= indent || size_add(size_add(position, 1), our_len) < cols) { + printf(" %s", *i); + position = size_add(size_add(position, 1), our_len); + } else { + printf("\n%*s%s", (int) indent, "", *i); + position = size_add(our_len, indent); + } + } + + printf("\n"); + + return 0; +} + +static int status_print_strv_global(char **p) { + return status_print_strv_ifindex(0, NULL, p); +} + +typedef struct LinkInfo { + uint64_t scopes_mask; + const char *llmnr; + const char *mdns; + const char *dns_over_tls; + const char *dnssec; + char *current_dns; + char *current_dns_ex; + char **dns; + char **dns_ex; + char **domains; + char **ntas; + bool dnssec_supported; + bool default_route; +} LinkInfo; + +typedef struct GlobalInfo { + char *current_dns; + char *current_dns_ex; + char **dns; + char **dns_ex; + char **fallback_dns; + char **fallback_dns_ex; + char **domains; + char **ntas; + const char *llmnr; + const char *mdns; + const char *dns_over_tls; + const char *dnssec; + const char *resolv_conf_mode; + bool dnssec_supported; +} GlobalInfo; + +static void link_info_clear(LinkInfo *p) { + free(p->current_dns); + free(p->current_dns_ex); + strv_free(p->dns); + strv_free(p->dns_ex); + strv_free(p->domains); + strv_free(p->ntas); +} + +static void global_info_clear(GlobalInfo *p) { + free(p->current_dns); + free(p->current_dns_ex); + strv_free(p->dns); + strv_free(p->dns_ex); + strv_free(p->fallback_dns); + strv_free(p->fallback_dns_ex); + strv_free(p->domains); + strv_free(p->ntas); +} + +static int dump_list(Table *table, const char *field, char * const *l) { + int r; + + if (strv_isempty(l)) + return 0; + + r = table_add_many(table, + TABLE_FIELD, field, + TABLE_STRV_WRAPPED, l); + if (r < 0) + return table_log_add_error(r); + + return 0; +} + +static int strv_extend_extended_bool(char ***strv, const char *name, const char *value) { + int r; + + if (value) { + r = parse_boolean(value); + if (r >= 0) + return strv_extendf(strv, "%s%s", plus_minus(r), name); + 
} + + return strv_extendf(strv, "%s=%s", name, value ?: "???"); +} + +static char** link_protocol_status(const LinkInfo *info) { + _cleanup_strv_free_ char **s = NULL; + + if (strv_extendf(&s, "%sDefaultRoute", plus_minus(info->default_route)) < 0) + return NULL; + + if (strv_extend_extended_bool(&s, "LLMNR", info->llmnr) < 0) + return NULL; + + if (strv_extend_extended_bool(&s, "mDNS", info->mdns) < 0) + return NULL; + + if (strv_extend_extended_bool(&s, "DNSOverTLS", info->dns_over_tls) < 0) + return NULL; + + if (strv_extendf(&s, "DNSSEC=%s/%s", + info->dnssec ?: "???", + info->dnssec_supported ? "supported" : "unsupported") < 0) + return NULL; + + return TAKE_PTR(s); +} + +static char** global_protocol_status(const GlobalInfo *info) { + _cleanup_strv_free_ char **s = NULL; + + if (strv_extend_extended_bool(&s, "LLMNR", info->llmnr) < 0) + return NULL; + + if (strv_extend_extended_bool(&s, "mDNS", info->mdns) < 0) + return NULL; + + if (strv_extend_extended_bool(&s, "DNSOverTLS", info->dns_over_tls) < 0) + return NULL; + + if (strv_extendf(&s, "DNSSEC=%s/%s", + info->dnssec ?: "???", + info->dnssec_supported ? 
"supported" : "unsupported") < 0) + return NULL; + + return TAKE_PTR(s); +} + +static int status_ifindex(sd_bus *bus, int ifindex, const char *name, StatusMode mode, bool *empty_line) { + static const struct bus_properties_map property_map[] = { + { "ScopesMask", "t", NULL, offsetof(LinkInfo, scopes_mask) }, + { "DNS", "a(iay)", map_link_dns_servers, offsetof(LinkInfo, dns) }, + { "DNSEx", "a(iayqs)", map_link_dns_servers_ex, offsetof(LinkInfo, dns_ex) }, + { "CurrentDNSServer", "(iay)", map_link_current_dns_server, offsetof(LinkInfo, current_dns) }, + { "CurrentDNSServerEx", "(iayqs)", map_link_current_dns_server_ex, offsetof(LinkInfo, current_dns_ex) }, + { "Domains", "a(sb)", map_link_domains, offsetof(LinkInfo, domains) }, + { "DefaultRoute", "b", NULL, offsetof(LinkInfo, default_route) }, + { "LLMNR", "s", NULL, offsetof(LinkInfo, llmnr) }, + { "MulticastDNS", "s", NULL, offsetof(LinkInfo, mdns) }, + { "DNSOverTLS", "s", NULL, offsetof(LinkInfo, dns_over_tls) }, + { "DNSSEC", "s", NULL, offsetof(LinkInfo, dnssec) }, + { "DNSSECNegativeTrustAnchors", "as", bus_map_strv_sort, offsetof(LinkInfo, ntas) }, + { "DNSSECSupported", "b", NULL, offsetof(LinkInfo, dnssec_supported) }, + {} + }; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(link_info_clear) LinkInfo link_info = {}; + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *p = NULL; + char ifi[DECIMAL_STR_MAX(int)], ifname[IF_NAMESIZE]; + int r; + + assert(bus); + assert(ifindex > 0); + + if (!name) { + r = format_ifname(ifindex, ifname); + if (r < 0) + return log_error_errno(r, "Failed to resolve interface name for %i: %m", ifindex); + + name = ifname; + } + + xsprintf(ifi, "%i", ifindex); + r = sd_bus_path_encode("/org/freedesktop/resolve1/link", ifi, &p); + if (r < 0) + return log_oom(); + + r = bus_map_all_properties(bus, + "org.freedesktop.resolve1", + p, + property_map, + 
BUS_MAP_BOOLEAN_AS_BOOL, + &error, + &m, + &link_info); + if (r < 0) + return log_error_errno(r, "Failed to get link data for %i: %s", ifindex, bus_error_message(&error, r)); + + pager_open(arg_pager_flags); + + if (mode == STATUS_DNS) + return status_print_strv_ifindex(ifindex, name, link_info.dns_ex ?: link_info.dns); + + if (mode == STATUS_DOMAIN) + return status_print_strv_ifindex(ifindex, name, link_info.domains); + + if (mode == STATUS_NTA) + return status_print_strv_ifindex(ifindex, name, link_info.ntas); + + if (mode == STATUS_DEFAULT_ROUTE) { + printf("%sLink %i (%s)%s: %s\n", + ansi_highlight(), ifindex, name, ansi_normal(), + yes_no(link_info.default_route)); + + return 0; + } + + if (mode == STATUS_LLMNR) { + printf("%sLink %i (%s)%s: %s\n", + ansi_highlight(), ifindex, name, ansi_normal(), + strna(link_info.llmnr)); + + return 0; + } + + if (mode == STATUS_MDNS) { + printf("%sLink %i (%s)%s: %s\n", + ansi_highlight(), ifindex, name, ansi_normal(), + strna(link_info.mdns)); + + return 0; + } + + if (mode == STATUS_PRIVATE) { + printf("%sLink %i (%s)%s: %s\n", + ansi_highlight(), ifindex, name, ansi_normal(), + strna(link_info.dns_over_tls)); + + return 0; + } + + if (mode == STATUS_DNSSEC) { + printf("%sLink %i (%s)%s: %s\n", + ansi_highlight(), ifindex, name, ansi_normal(), + strna(link_info.dnssec)); + + return 0; + } + + if (empty_line && *empty_line) + fputc('\n', stdout); + + printf("%sLink %i (%s)%s\n", + ansi_highlight(), ifindex, name, ansi_normal()); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + r = table_add_many(table, + TABLE_FIELD, "Current Scopes", + TABLE_SET_MINIMUM_WIDTH, 19); + if (r < 0) + return table_log_add_error(r); + + if (link_info.scopes_mask == 0) + r = table_add_cell(table, NULL, TABLE_STRING, "none"); + else { + _cleanup_free_ char *buf = NULL; + size_t len; + + if (asprintf(&buf, "%s%s%s%s%s", + link_info.scopes_mask & SD_RESOLVED_DNS ? 
"DNS " : "", + link_info.scopes_mask & SD_RESOLVED_LLMNR_IPV4 ? "LLMNR/IPv4 " : "", + link_info.scopes_mask & SD_RESOLVED_LLMNR_IPV6 ? "LLMNR/IPv6 " : "", + link_info.scopes_mask & SD_RESOLVED_MDNS_IPV4 ? "mDNS/IPv4 " : "", + link_info.scopes_mask & SD_RESOLVED_MDNS_IPV6 ? "mDNS/IPv6 " : "") < 0) + return log_oom(); + + len = strlen(buf); + assert(len > 0); + buf[len - 1] = '\0'; + + r = table_add_cell(table, NULL, TABLE_STRING, buf); + } + if (r < 0) + return table_log_add_error(r); + + _cleanup_strv_free_ char **pstatus = link_protocol_status(&link_info); + if (!pstatus) + return log_oom(); + + r = table_add_many(table, + TABLE_FIELD, "Protocols", + TABLE_STRV_WRAPPED, pstatus); + if (r < 0) + return table_log_add_error(r); + + if (link_info.current_dns) { + r = table_add_many(table, + TABLE_FIELD, "Current DNS Server", + TABLE_STRING, link_info.current_dns_ex ?: link_info.current_dns); + if (r < 0) + return table_log_add_error(r); + } + + r = dump_list(table, "DNS Servers", link_info.dns_ex ?: link_info.dns); + if (r < 0) + return r; + + r = dump_list(table, "DNS Domain", link_info.domains); + if (r < 0) + return r; + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (empty_line) + *empty_line = true; + + return 0; +} + +static int map_global_dns_servers_internal( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata, + bool extended) { + + char ***l = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', extended ? 
"(iiayqs)" : "(iiay)"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *pretty = NULL; + + r = read_dns_server_one(m, /* with_ifindex= */ true, extended, /* only_global= */ true, &pretty); + if (r < 0) + return r; + if (r == 0) + break; + + if (isempty(pretty)) + continue; + + r = strv_consume(l, TAKE_PTR(pretty)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int map_global_dns_servers(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + return map_global_dns_servers_internal(bus, member, m, error, userdata, /* extended= */ false); +} + +static int map_global_dns_servers_ex(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + return map_global_dns_servers_internal(bus, member, m, error, userdata, /* extended= */ true); +} + +static int map_global_current_dns_server(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + return read_dns_server_one(m, /* with_ifindex= */ true, /* extended= */ false, /* only_global= */ true, userdata); +} + +static int map_global_current_dns_server_ex(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + return read_dns_server_one(m, /* with_ifindex= */ true, /* extended= */ true, /* only_global= */ true, userdata); +} + +static int map_global_domains(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + char ***l = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(member); + assert(m); + + r = sd_bus_message_enter_container(m, 'a', "(isb)"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *pretty = NULL; + + r = read_domain_one(m, true, &pretty); + if (r < 0) + return r; + if (r == 0) + break; + + if (isempty(pretty)) + continue; + + r = strv_consume(l, TAKE_PTR(pretty)); + if (r < 0) + return r; + } + + r = 
sd_bus_message_exit_container(m); + if (r < 0) + return r; + + strv_sort(*l); + + return 0; +} + +static int status_global(sd_bus *bus, StatusMode mode, bool *empty_line) { + static const struct bus_properties_map property_map[] = { + { "DNS", "a(iiay)", map_global_dns_servers, offsetof(GlobalInfo, dns) }, + { "DNSEx", "a(iiayqs)", map_global_dns_servers_ex, offsetof(GlobalInfo, dns_ex) }, + { "FallbackDNS", "a(iiay)", map_global_dns_servers, offsetof(GlobalInfo, fallback_dns) }, + { "FallbackDNSEx", "a(iiayqs)", map_global_dns_servers_ex, offsetof(GlobalInfo, fallback_dns_ex) }, + { "CurrentDNSServer", "(iiay)", map_global_current_dns_server, offsetof(GlobalInfo, current_dns) }, + { "CurrentDNSServerEx", "(iiayqs)", map_global_current_dns_server_ex, offsetof(GlobalInfo, current_dns_ex) }, + { "Domains", "a(isb)", map_global_domains, offsetof(GlobalInfo, domains) }, + { "DNSSECNegativeTrustAnchors", "as", bus_map_strv_sort, offsetof(GlobalInfo, ntas) }, + { "LLMNR", "s", NULL, offsetof(GlobalInfo, llmnr) }, + { "MulticastDNS", "s", NULL, offsetof(GlobalInfo, mdns) }, + { "DNSOverTLS", "s", NULL, offsetof(GlobalInfo, dns_over_tls) }, + { "DNSSEC", "s", NULL, offsetof(GlobalInfo, dnssec) }, + { "DNSSECSupported", "b", NULL, offsetof(GlobalInfo, dnssec_supported) }, + { "ResolvConfMode", "s", NULL, offsetof(GlobalInfo, resolv_conf_mode) }, + {} + }; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(global_info_clear) GlobalInfo global_info = {}; + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(bus); + assert(empty_line); + + r = bus_map_all_properties(bus, + "org.freedesktop.resolve1", + "/org/freedesktop/resolve1", + property_map, + BUS_MAP_BOOLEAN_AS_BOOL, + &error, + &m, + &global_info); + if (r < 0) + return log_error_errno(r, "Failed to get global data: %s", bus_error_message(&error, r)); + + pager_open(arg_pager_flags); + + if (mode == STATUS_DNS) + 
return status_print_strv_global(global_info.dns_ex ?: global_info.dns); + + if (mode == STATUS_DOMAIN) + return status_print_strv_global(global_info.domains); + + if (mode == STATUS_NTA) + return status_print_strv_global(global_info.ntas); + + if (mode == STATUS_LLMNR) { + printf("%sGlobal%s: %s\n", ansi_highlight(), ansi_normal(), + strna(global_info.llmnr)); + + return 0; + } + + if (mode == STATUS_MDNS) { + printf("%sGlobal%s: %s\n", ansi_highlight(), ansi_normal(), + strna(global_info.mdns)); + + return 0; + } + + if (mode == STATUS_PRIVATE) { + printf("%sGlobal%s: %s\n", ansi_highlight(), ansi_normal(), + strna(global_info.dns_over_tls)); + + return 0; + } + + if (mode == STATUS_DNSSEC) { + printf("%sGlobal%s: %s\n", ansi_highlight(), ansi_normal(), + strna(global_info.dnssec)); + + return 0; + } + + printf("%sGlobal%s\n", ansi_highlight(), ansi_normal()); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + _cleanup_strv_free_ char **pstatus = global_protocol_status(&global_info); + if (!pstatus) + return log_oom(); + + r = table_add_many(table, + TABLE_FIELD, "Protocols", + TABLE_SET_MINIMUM_WIDTH, 19, + TABLE_STRV_WRAPPED, pstatus); + if (r < 0) + return table_log_add_error(r); + + if (global_info.resolv_conf_mode) { + r = table_add_many(table, + TABLE_FIELD, "resolv.conf mode", + TABLE_STRING, global_info.resolv_conf_mode); + if (r < 0) + return table_log_add_error(r); + } + + if (global_info.current_dns) { + r = table_add_many(table, + TABLE_FIELD, "Current DNS Server", + TABLE_STRING, global_info.current_dns_ex ?: global_info.current_dns); + if (r < 0) + return table_log_add_error(r); + } + + r = dump_list(table, "DNS Servers", global_info.dns_ex ?: global_info.dns); + if (r < 0) + return r; + + r = dump_list(table, "Fallback DNS Servers", global_info.fallback_dns_ex ?: global_info.fallback_dns); + if (r < 0) + return r; + + r = dump_list(table, "DNS Domain", global_info.domains); + if (r < 0) + return r; + + r = table_print(table, 
NULL); + if (r < 0) + return table_log_print_error(r); + + *empty_line = true; + + return 0; +} + +static int status_all(sd_bus *bus, StatusMode mode) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + bool empty_line = false; + int r; + + assert(bus); + + r = status_global(bus, mode, &empty_line); + if (r < 0) + return r; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, 0); + if (r < 0) + return rtnl_log_create_error(r); + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return rtnl_log_create_error(r); + + r = sd_netlink_call(rtnl, req, 0, &reply); + if (r < 0) + return log_error_errno(r, "Failed to enumerate links: %m"); + + _cleanup_free_ InterfaceInfo *infos = NULL; + size_t n_infos = 0; + + for (sd_netlink_message *i = reply; i; i = sd_netlink_message_next(i)) { + const char *name; + int ifindex; + uint16_t type; + + r = sd_netlink_message_get_type(i, &type); + if (r < 0) + return rtnl_log_parse_error(r); + + if (type != RTM_NEWLINK) + continue; + + r = sd_rtnl_message_link_get_ifindex(i, &ifindex); + if (r < 0) + return rtnl_log_parse_error(r); + + if (ifindex == LOOPBACK_IFINDEX) + continue; + + r = sd_netlink_message_read_string(i, IFLA_IFNAME, &name); + if (r < 0) + return rtnl_log_parse_error(r); + + if (!GREEDY_REALLOC(infos, n_infos + 1)) + return log_oom(); + + infos[n_infos++] = (InterfaceInfo) { ifindex, name }; + } + + typesafe_qsort(infos, n_infos, interface_info_compare); + + r = 0; + for (size_t i = 0; i < n_infos; i++) { + int q = status_ifindex(bus, infos[i].index, infos[i].name, mode, &empty_line); + if (q < 0 && r >= 0) + r = q; + } + + return r; +} + +static int verb_status(int argc, char **argv, void *userdata) { + sd_bus *bus = userdata; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r = 0; + + 
if (argc > 1) { + bool empty_line = false; + + STRV_FOREACH(ifname, argv + 1) { + int ifindex, q; + + ifindex = rtnl_resolve_interface(&rtnl, *ifname); + if (ifindex < 0) { + log_warning_errno(ifindex, "Failed to resolve interface \"%s\", ignoring: %m", *ifname); + continue; + } + + q = status_ifindex(bus, ifindex, NULL, STATUS_ALL, &empty_line); + if (q < 0) + r = q; + } + } else + r = status_all(bus, STATUS_ALL); + + return r; +} + +static int call_dns(sd_bus *bus, char **dns, const BusLocator *locator, sd_bus_error *error, bool extended) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + int r; + + r = bus_message_new_method_call(bus, &req, locator, extended ? "SetLinkDNSEx" : "SetLinkDNS"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "i", arg_ifindex); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(req, 'a', extended ? "(iayqs)" : "(iay)"); + if (r < 0) + return bus_log_create_error(r); + + /* If only argument is the empty string, then call SetLinkDNS() with an + * empty list, which will clear the list of domains for an interface. */ + if (!strv_equal(dns, STRV_MAKE(""))) + STRV_FOREACH(p, dns) { + _cleanup_free_ char *name = NULL; + struct in_addr_data data; + uint16_t port; + int ifindex; + + r = in_addr_port_ifindex_name_from_string_auto(*p, &data.family, &data.address, &port, &ifindex, &name); + if (r < 0) + return log_error_errno(r, "Failed to parse DNS server address: %s", *p); + + if (ifindex != 0 && ifindex != arg_ifindex) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid ifindex: %i", ifindex); + + r = sd_bus_message_open_container(req, 'r', extended ? 
"iayqs" : "iay"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "i", data.family); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_array(req, 'y', &data.address, FAMILY_ADDRESS_SIZE(data.family)); + if (r < 0) + return bus_log_create_error(r); + + if (extended) { + r = sd_bus_message_append(req, "q", port); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "s", name); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(req); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(req); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, req, 0, error, NULL); + if (r < 0 && extended && sd_bus_error_has_name(error, SD_BUS_ERROR_UNKNOWN_METHOD)) { + sd_bus_error_free(error); + return call_dns(bus, dns, locator, error, false); + } + return r; +} + +static int verb_dns(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_DNS); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_DNS, NULL); + + r = call_dns(bus, argv + 2, bus_resolve_mgr, &error, true); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = call_dns(bus, argv + 2, bus_network_mgr, &error, true); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set DNS configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int call_domain(sd_bus *bus, char **domain, const BusLocator *locator, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + int r; + + 
r = bus_message_new_method_call(bus, &req, locator, "SetLinkDomains"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "i", arg_ifindex); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(req, 'a', "(sb)"); + if (r < 0) + return bus_log_create_error(r); + + /* If only argument is the empty string, then call SetLinkDomains() with an + * empty list, which will clear the list of domains for an interface. */ + if (!strv_equal(domain, STRV_MAKE(""))) + STRV_FOREACH(p, domain) { + const char *n; + + n = **p == '~' ? *p + 1 : *p; + + r = dns_name_is_valid(n); + if (r < 0) + return log_error_errno(r, "Failed to validate specified domain %s: %m", n); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Domain not valid: %s", + n); + + r = sd_bus_message_append(req, "(sb)", n, **p == '~'); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(req); + if (r < 0) + return bus_log_create_error(r); + + return sd_bus_call(bus, req, 0, error, NULL); +} + +static int verb_domain(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_DOMAIN); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_DOMAIN, NULL); + + r = call_domain(bus, argv + 2, bus_resolve_mgr, &error); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = call_domain(bus, argv + 2, bus_network_mgr, &error); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set domain configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_default_route(int argc, 
char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r, b; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_DEFAULT_ROUTE); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_DEFAULT_ROUTE, NULL); + + b = parse_boolean(argv[2]); + if (b < 0) + return log_error_errno(b, "Failed to parse boolean argument: %s", argv[2]); + + r = bus_call_method(bus, bus_resolve_mgr, "SetLinkDefaultRoute", &error, NULL, "ib", arg_ifindex, b); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = bus_call_method(bus, bus_network_mgr, "SetLinkDefaultRoute", &error, NULL, "ib", arg_ifindex, b); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set default route configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_llmnr(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *global_llmnr_support_str = NULL; + ResolveSupport global_llmnr_support, llmnr_support; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_LLMNR); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_LLMNR, NULL); + + llmnr_support = resolve_support_from_string(argv[2]); + if (llmnr_support < 0) + return log_error_errno(llmnr_support, "Invalid LLMNR setting: %s", argv[2]); + + r = bus_get_property_string(bus, bus_resolve_mgr, "LLMNR", &error, &global_llmnr_support_str); + if (r < 0) + return log_error_errno(r, "Failed to get the global LLMNR support state: %s", bus_error_message(&error, r)); + 
+ global_llmnr_support = resolve_support_from_string(global_llmnr_support_str); + if (global_llmnr_support < 0) + return log_error_errno(global_llmnr_support, "Received invalid global LLMNR setting: %s", global_llmnr_support_str); + + if (global_llmnr_support < llmnr_support) + log_warning("Setting LLMNR support level \"%s\" for \"%s\", but the global support level is \"%s\".", + argv[2], arg_ifname, global_llmnr_support_str); + + r = bus_call_method(bus, bus_resolve_mgr, "SetLinkLLMNR", &error, NULL, "is", arg_ifindex, argv[2]); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = bus_call_method(bus, bus_network_mgr, "SetLinkLLMNR", &error, NULL, "is", arg_ifindex, argv[2]); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set LLMNR configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_mdns(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *global_mdns_support_str = NULL; + ResolveSupport global_mdns_support, mdns_support; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_MDNS); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_MDNS, NULL); + + mdns_support = resolve_support_from_string(argv[2]); + if (mdns_support < 0) + return log_error_errno(mdns_support, "Invalid mDNS setting: %s", argv[2]); + + r = bus_get_property_string(bus, bus_resolve_mgr, "MulticastDNS", &error, &global_mdns_support_str); + if (r < 0) + return log_error_errno(r, "Failed to get the global mDNS support state: %s", bus_error_message(&error, r)); + + global_mdns_support = resolve_support_from_string(global_mdns_support_str); + if (global_mdns_support < 0) 
+ return log_error_errno(global_mdns_support, "Received invalid global mDNS setting: %s", global_mdns_support_str); + + if (global_mdns_support < mdns_support) + log_warning("Setting mDNS support level \"%s\" for \"%s\", but the global support level is \"%s\".", + argv[2], arg_ifname, global_mdns_support_str); + + r = bus_call_method(bus, bus_resolve_mgr, "SetLinkMulticastDNS", &error, NULL, "is", arg_ifindex, argv[2]); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = bus_call_method( + bus, + bus_network_mgr, + "SetLinkMulticastDNS", + &error, + NULL, + "is", arg_ifindex, argv[2]); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set MulticastDNS configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_dns_over_tls(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_PRIVATE); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_PRIVATE, NULL); + + r = bus_call_method(bus, bus_resolve_mgr, "SetLinkDNSOverTLS", &error, NULL, "is", arg_ifindex, argv[2]); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = bus_call_method( + bus, + bus_network_mgr, + "SetLinkDNSOverTLS", + &error, + NULL, + "is", arg_ifindex, argv[2]); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set DNSOverTLS configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_dnssec(int argc, char **argv, void *userdata) { + 
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_DNSSEC); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_DNSSEC, NULL); + + r = bus_call_method(bus, bus_resolve_mgr, "SetLinkDNSSEC", &error, NULL, "is", arg_ifindex, argv[2]); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = bus_call_method(bus, bus_network_mgr, "SetLinkDNSSEC", &error, NULL, "is", arg_ifindex, argv[2]); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set DNSSEC configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int call_nta(sd_bus *bus, char **nta, const BusLocator *locator, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + int r; + + r = bus_message_new_method_call(bus, &req, locator, "SetLinkDNSSECNegativeTrustAnchors"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "i", arg_ifindex); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(req, nta); + if (r < 0) + return bus_log_create_error(r); + + return sd_bus_call(bus, req, 0, error, NULL); +} + +static int verb_nta(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + bool clear; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return status_all(bus, STATUS_NTA); + + if (argc < 3) + return status_ifindex(bus, arg_ifindex, NULL, STATUS_NTA, NULL); + + /* If only argument is the empty string, then call SetLinkDNSSECNegativeTrustAnchors() + * with an empty 
list, which will clear the list of domains for an interface. */ + clear = strv_equal(argv + 2, STRV_MAKE("")); + + if (!clear) + STRV_FOREACH(p, argv + 2) { + r = dns_name_is_valid(*p); + if (r < 0) + return log_error_errno(r, "Failed to validate specified domain %s: %m", *p); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Domain not valid: %s", + *p); + } + + r = call_nta(bus, clear ? NULL : argv + 2, bus_resolve_mgr, &error); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = call_nta(bus, clear ? NULL : argv + 2, bus_network_mgr, &error); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to set DNSSEC NTA configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_revert_link(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + if (argc >= 2) { + r = ifname_mangle(argv[1]); + if (r < 0) + return r; + } + + if (arg_ifindex <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Interface argument required."); + + r = bus_call_method(bus, bus_resolve_mgr, "RevertLink", &error, NULL, "i", arg_ifindex); + if (r < 0 && sd_bus_error_has_name(&error, BUS_ERROR_LINK_BUSY)) { + sd_bus_error_free(&error); + + r = bus_call_method(bus, bus_network_mgr, "RevertLinkDNS", &error, NULL, "i", arg_ifindex); + } + if (r < 0) { + if (arg_ifindex_permissive && + sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_LINK)) + return 0; + + return log_error_errno(r, "Failed to revert interface configuration: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int verb_log_level(int argc, char *argv[], void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + + assert(IN_SET(argc, 1, 2)); + + return verb_log_control_common(bus, "org.freedesktop.resolve1", 
argv[0], argc == 2 ? argv[1] : NULL); +} + +static int print_question(char prefix, const char *color, JsonVariant *question) { + JsonVariant *q = NULL; + int r; + + assert(color); + + JSON_VARIANT_ARRAY_FOREACH(q, question) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + char buf[DNS_RESOURCE_KEY_STRING_MAX]; + + r = dns_resource_key_from_json(q, &key); + if (r < 0) { + log_warning_errno(r, "Received monitor message with invalid question key, ignoring: %m"); + continue; + } + + printf("%s%s %c%s: %s\n", + color, + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + prefix, + ansi_normal(), + dns_resource_key_to_string(key, buf, sizeof(buf))); + } + + return 0; +} + +static int print_answer(JsonVariant *answer) { + JsonVariant *a; + int r; + + JSON_VARIANT_ARRAY_FOREACH(a, answer) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + _cleanup_free_ void *d = NULL; + JsonVariant *jraw; + const char *s; + size_t l; + + jraw = json_variant_by_key(a, "raw"); + if (!jraw) { + log_warning("Received monitor answer lacking valid raw data, ignoring."); + continue; + } + + r = json_variant_unbase64(jraw, &d, &l); + if (r < 0) { + log_warning_errno(r, "Failed to undo base64 encoding of monitor answer raw data, ignoring."); + continue; + } + + r = dns_resource_record_new_from_raw(&rr, d, l); + if (r < 0) { + log_warning_errno(r, "Failed to parse monitor answer RR, ignoring: %m"); + continue; + } + + s = dns_resource_record_to_string(rr); + if (!s) + return log_oom(); + + printf("%s%s A%s: %s\n", + ansi_highlight_yellow(), + special_glyph(SPECIAL_GLYPH_ARROW_LEFT), + ansi_normal(), + s); + } + + return 0; +} + +static void monitor_query_dump(JsonVariant *v) { + _cleanup_(json_variant_unrefp) JsonVariant *question = NULL, *answer = NULL, *collected_questions = NULL; + int rcode = -1, error = 0, r; + const char *state = NULL; + + assert(v); + + JsonDispatch dispatch_table[] = { + { "question", JSON_VARIANT_ARRAY, json_dispatch_variant, 
PTR_TO_SIZE(&question), JSON_MANDATORY }, + { "answer", JSON_VARIANT_ARRAY, json_dispatch_variant, PTR_TO_SIZE(&answer), 0 }, + { "collectedQuestions", JSON_VARIANT_ARRAY, json_dispatch_variant, PTR_TO_SIZE(&collected_questions), 0 }, + { "state", JSON_VARIANT_STRING, json_dispatch_const_string, PTR_TO_SIZE(&state), JSON_MANDATORY }, + { "rcode", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, PTR_TO_SIZE(&rcode), 0 }, + { "errno", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, PTR_TO_SIZE(&error), 0 }, + {} + }; + + r = json_dispatch(v, dispatch_table, 0, NULL); + if (r < 0) + return (void) log_warning("Received malformed monitor message, ignoring."); + + /* First show the current question */ + print_question('Q', ansi_highlight_cyan(), question); + + /* And then show the questions that led to this one in case this was a CNAME chain */ + print_question('C', ansi_highlight_grey(), collected_questions); + + printf("%s%s S%s: %s\n", + streq_ptr(state, "success") ? ansi_highlight_green() : ansi_highlight_red(), + special_glyph(SPECIAL_GLYPH_ARROW_LEFT), + ansi_normal(), + strna(streq_ptr(state, "errno") ? errno_to_name(error) : + streq_ptr(state, "rcode-failure") ? dns_rcode_to_string(rcode) : + state)); + + print_answer(answer); +} + +static int monitor_reply( + Varlink *link, + JsonVariant *parameters, + const char *error_id, + VarlinkReplyFlags flags, + void *userdata) { + + assert(link); + + if (error_id) { + bool disconnect; + + disconnect = streq(error_id, VARLINK_ERROR_DISCONNECTED); + if (disconnect) + log_info("Disconnected."); + else + log_error("Varlink error: %s", error_id); + + (void) sd_event_exit(ASSERT_PTR(varlink_get_event(link)), disconnect ? EXIT_SUCCESS : EXIT_FAILURE); + return 0; + } + + if (json_variant_by_key(parameters, "ready")) { + /* The first message coming in will just indicate that we are now subscribed. We let our + * caller know if they asked for it. 
Once the caller sees this they should know that we are + * not going to miss any queries anymore. */ + (void) sd_notify(/* unset_environment=false */ false, "READY=1"); + return 0; + } + + if (arg_json_format_flags & JSON_FORMAT_OFF) { + monitor_query_dump(parameters); + printf("\n"); + } else + json_variant_dump(parameters, arg_json_format_flags, NULL, NULL); + + fflush(stdout); + + return 0; +} + +static int verb_monitor(int argc, char *argv[], void *userdata) { + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r, c; + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get event loop: %m"); + + r = sd_event_set_signal_exit(event, true); + if (r < 0) + return log_error_errno(r, "Failed to enable exit on SIGINT/SIGTERM: %m"); + + r = varlink_connect_address(&vl, "/run/systemd/resolve/io.systemd.Resolve.Monitor"); + if (r < 0) + return log_error_errno(r, "Failed to connect to query monitoring service /run/systemd/resolve/io.systemd.Resolve.Monitor: %m"); + + r = varlink_set_relative_timeout(vl, USEC_INFINITY); /* We want the monitor to run basically forever */ + if (r < 0) + return log_error_errno(r, "Failed to set varlink time-out: %m"); + + r = varlink_attach_event(vl, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + r = varlink_bind_reply(vl, monitor_reply); + if (r < 0) + return log_error_errno(r, "Failed to bind reply callback to varlink connection: %m"); + + r = varlink_observe(vl, "io.systemd.Resolve.Monitor.SubscribeQueryResults", NULL); + if (r < 0) + return log_error_errno(r, "Failed to issue SubscribeQueryResults() varlink call: %m"); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + r = sd_event_get_exit_code(event, &c); + if (r < 0) + return log_error_errno(r, "Failed to get exit code: %m"); + + return c; +} + +static int 
dump_cache_item(JsonVariant *item) {
        /* Prints a single cache entry. The entry's "key" is mandatory. If a "type" string is present, one
         * line with the key and that type is printed; otherwise every element of "rrs" is decoded from its
         * base64 "raw" field and printed as a resource record.
         *
         * Returns the number of resource records printed (0 for the "type" case), or a negative
         * errno-style value on failure. */

        struct item_info {
                JsonVariant *key;
                JsonVariant *rrs;
                const char *type;
                uint64_t until;
        } item_info = {};

        static const JsonDispatch dispatch_table[] = {
                { "key",   JSON_VARIANT_OBJECT,        json_dispatch_variant_noref, offsetof(struct item_info, key),   JSON_MANDATORY },
                { "rrs",   JSON_VARIANT_ARRAY,         json_dispatch_variant_noref, offsetof(struct item_info, rrs),   0              },
                { "type",  JSON_VARIANT_STRING,        json_dispatch_const_string,  offsetof(struct item_info, type),  0              },
                { "until", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64,        offsetof(struct item_info, until), 0              },
                {},
        };

        _cleanup_(dns_resource_key_unrefp) DnsResourceKey *k = NULL;
        int r, c = 0;

        r = json_dispatch(item, dispatch_table, JSON_LOG, &item_info);
        if (r < 0)
                return r;

        r = dns_resource_key_from_json(item_info.key, &k);
        if (r < 0)
                return log_error_errno(r, "Failed to turn JSON data to resource key: %m");

        if (item_info.type)
                printf("%s %s%s%s\n", DNS_RESOURCE_KEY_TO_STRING(k), ansi_highlight_red(), item_info.type, ansi_normal());
        else {
                JsonVariant *i;

                JSON_VARIANT_ARRAY_FOREACH(i, item_info.rrs) {
                        _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL;
                        _cleanup_free_ void *data = NULL;
                        JsonVariant *raw;
                        const char *s;
                        size_t size;

                        raw = json_variant_by_key(i, "raw");
                        if (!raw)
                                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "raw field missing from RR JSON data.");

                        r = json_variant_unbase64(raw, &data, &size);
                        if (r < 0)
                                return log_error_errno(r, "Unable to decode raw RR JSON data: %m");

                        r = dns_resource_record_new_from_raw(&rr, data, size);
                        if (r < 0)
                                return log_error_errno(r, "Failed to parse DNS data: %m");

                        /* Formatting may fail and yield NULL (print_answer() treats that as OOM); never
                         * pass a NULL pointer to printf("%s"). */
                        s = dns_resource_record_to_string(rr);
                        if (!s)
                                return log_oom();

                        printf("%s\n", s);
                        c++;
                }
        }

        return c;
}

static int dump_cache_scope(JsonVariant *scope) {

        struct scope_info {
                const char *protocol;
                int family;
                int ifindex;
                const char *ifname;
                JsonVariant *cache;
        } scope_info =
{ + .family = AF_UNSPEC, + }; + JsonVariant *i; + int r, c = 0; + + static const JsonDispatch dispatch_table[] = { + { "protocol", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct scope_info, protocol), JSON_MANDATORY }, + { "family", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(struct scope_info, family), 0 }, + { "ifindex", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(struct scope_info, ifindex), 0 }, + { "ifname", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct scope_info, ifname), 0 }, + { "cache", JSON_VARIANT_ARRAY, json_dispatch_variant_noref, offsetof(struct scope_info, cache), JSON_MANDATORY }, + {}, + }; + + r = json_dispatch(scope, dispatch_table, JSON_LOG, &scope_info); + if (r < 0) + return r; + + printf("%sScope protocol=%s", ansi_underline(), scope_info.protocol); + + if (scope_info.family != AF_UNSPEC) + printf(" family=%s", af_to_name(scope_info.family)); + + if (scope_info.ifindex > 0) + printf(" ifindex=%i", scope_info.ifindex); + if (scope_info.ifname) + printf(" ifname=%s", scope_info.ifname); + + printf("%s\n", ansi_normal()); + + JSON_VARIANT_ARRAY_FOREACH(i, scope_info.cache) { + r = dump_cache_item(i); + if (r < 0) + return r; + + c += r; + } + + if (c == 0) + printf("%sNo entries.%s\n\n", ansi_grey(), ansi_normal()); + else + printf("\n"); + + return 0; +} + +static int verb_show_cache(int argc, char *argv[], void *userdata) { + JsonVariant *reply = NULL, *d = NULL; + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r; + + r = varlink_connect_address(&vl, "/run/systemd/resolve/io.systemd.Resolve.Monitor"); + if (r < 0) + return log_error_errno(r, "Failed to connect to query monitoring service /run/systemd/resolve/io.systemd.Resolve.Monitor: %m"); + + r = varlink_call(vl, "io.systemd.Resolve.Monitor.DumpCache", NULL, &reply, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to issue DumpCache() varlink call: %m"); + + d = json_variant_by_key(reply, "dump"); + if (!d) + 
return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "DumpCache() response is missing 'dump' key."); + + if (!json_variant_is_array(d)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "DumpCache() response 'dump' field not an array"); + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) { + JsonVariant *i; + + JSON_VARIANT_ARRAY_FOREACH(i, d) { + r = dump_cache_scope(i); + if (r < 0) + return r; + } + + return 0; + } + + return json_variant_dump(d, arg_json_format_flags, NULL, NULL); +} + +static int dump_server_state(JsonVariant *server) { + _cleanup_(table_unrefp) Table *table = NULL; + TableCell *cell; + + struct server_state { + const char *server_name; + const char *type; + const char *ifname; + int ifindex; + const char *verified_feature_level; + const char *possible_feature_level; + const char *dnssec_mode; + bool dnssec_supported; + size_t received_udp_fragment_max; + uint64_t n_failed_udp; + uint64_t n_failed_tcp; + bool packet_truncated; + bool packet_bad_opt; + bool packet_rrsig_missing; + bool packet_invalid; + bool packet_do_off; + } server_state = { + .ifindex = -1, + }; + + int r; + + static const JsonDispatch dispatch_table[] = { + { "Server", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct server_state, server_name), JSON_MANDATORY }, + { "Type", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct server_state, type), JSON_MANDATORY }, + { "Interface", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct server_state, ifname), 0 }, + { "InterfaceIndex", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(struct server_state, ifindex), 0 }, + { "VerifiedFeatureLevel", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct server_state, verified_feature_level), 0 }, + { "PossibleFeatureLevel", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(struct server_state, possible_feature_level), 0 }, + { "DNSSECMode", JSON_VARIANT_STRING, json_dispatch_const_string, 
offsetof(struct server_state, dnssec_mode), JSON_MANDATORY }, + { "DNSSECSupported", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct server_state, dnssec_supported), JSON_MANDATORY }, + { "ReceivedUDPFragmentMax", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct server_state, received_udp_fragment_max), JSON_MANDATORY }, + { "FailedUDPAttempts", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct server_state, n_failed_udp), JSON_MANDATORY }, + { "FailedTCPAttempts", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct server_state, n_failed_tcp), JSON_MANDATORY }, + { "PacketTruncated", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct server_state, packet_truncated), JSON_MANDATORY }, + { "PacketBadOpt", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct server_state, packet_bad_opt), JSON_MANDATORY }, + { "PacketRRSIGMissing", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct server_state, packet_rrsig_missing), JSON_MANDATORY }, + { "PacketInvalid", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct server_state, packet_invalid), JSON_MANDATORY }, + { "PacketDoOff", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct server_state, packet_do_off), JSON_MANDATORY }, + {}, + }; + + r = json_dispatch(server, dispatch_table, JSON_LOG|JSON_PERMISSIVE, &server_state); + if (r < 0) + return r; + + table = table_new_vertical(); + if (!table) + return log_oom(); + + assert_se(cell = table_get_cell(table, 0, 0)); + (void) table_set_ellipsize_percent(table, cell, 100); + (void) table_set_align_percent(table, cell, 0); + + r = table_add_cell_stringf(table, NULL, "Server: %s", server_state.server_name); + if (r < 0) + return table_log_add_error(r); + + r = table_add_many(table, + TABLE_EMPTY, + TABLE_FIELD, "Type", + TABLE_SET_ALIGN_PERCENT, 100, + TABLE_STRING, server_state.type); + if (r < 0) + return table_log_add_error(r); + + if (server_state.ifname) { + r = 
table_add_many(table, + TABLE_FIELD, "Interface", + TABLE_STRING, server_state.ifname); + if (r < 0) + return table_log_add_error(r); + } + + if (server_state.ifindex >= 0) { + r = table_add_many(table, + TABLE_FIELD, "Interface Index", + TABLE_INT, server_state.ifindex); + if (r < 0) + return table_log_add_error(r); + } + + if (server_state.verified_feature_level) { + r = table_add_many(table, + TABLE_FIELD, "Verified feature level", + TABLE_STRING, server_state.verified_feature_level); + if (r < 0) + return table_log_add_error(r); + } + + if (server_state.possible_feature_level) { + r = table_add_many(table, + TABLE_FIELD, "Possible feature level", + TABLE_STRING, server_state.possible_feature_level); + if (r < 0) + return table_log_add_error(r); + } + + r = table_add_many(table, + TABLE_FIELD, "DNSSEC Mode", + TABLE_STRING, server_state.dnssec_mode, + TABLE_FIELD, "DNSSEC Supported", + TABLE_STRING, yes_no(server_state.dnssec_supported), + TABLE_FIELD, "Maximum UDP fragment size received", + TABLE_UINT64, server_state.received_udp_fragment_max, + TABLE_FIELD, "Failed UDP attempts", + TABLE_UINT64, server_state.n_failed_udp, + TABLE_FIELD, "Failed TCP attempts", + TABLE_UINT64, server_state.n_failed_tcp, + TABLE_FIELD, "Seen truncated packet", + TABLE_STRING, yes_no(server_state.packet_truncated), + TABLE_FIELD, "Seen OPT RR getting lost", + TABLE_STRING, yes_no(server_state.packet_bad_opt), + TABLE_FIELD, "Seen RRSIG RR missing", + TABLE_STRING, yes_no(server_state.packet_rrsig_missing), + TABLE_FIELD, "Seen invalid packet", + TABLE_STRING, yes_no(server_state.packet_invalid), + TABLE_FIELD, "Server dropped DO flag", + TABLE_STRING, yes_no(server_state.packet_do_off), + TABLE_SET_ALIGN_PERCENT, 0, + TABLE_EMPTY, TABLE_EMPTY); + + if (r < 0) + return table_log_add_error(r); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +static int verb_show_server_state(int argc, char *argv[], void *userdata) { + JsonVariant 
*reply = NULL, *d = NULL; + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r; + + r = varlink_connect_address(&vl, "/run/systemd/resolve/io.systemd.Resolve.Monitor"); + if (r < 0) + return log_error_errno(r, "Failed to connect to query monitoring service /run/systemd/resolve/io.systemd.Resolve.Monitor: %m"); + + r = varlink_call(vl, "io.systemd.Resolve.Monitor.DumpServerState", NULL, &reply, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to issue DumpServerState() varlink call: %m"); + + d = json_variant_by_key(reply, "dump"); + if (!d) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "DumpCache() response is missing 'dump' key."); + + if (!json_variant_is_array(d)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "DumpCache() response 'dump' field not an array"); + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) { + JsonVariant *i; + + JSON_VARIANT_ARRAY_FOREACH(i, d) { + r = dump_server_state(i); + if (r < 0) + return r; + } + + return 0; + } + + return json_variant_dump(d, arg_json_format_flags, NULL, NULL); +} + +static void help_protocol_types(void) { + if (arg_legend) + puts("Known protocol types:"); + puts("dns\n" + "llmnr\n" + "llmnr-ipv4\n" + "llmnr-ipv6\n" + "mdns\n" + "mdns-ipv4\n" + "mdns-ipv6"); +} + +static void help_dns_types(void) { + if (arg_legend) + puts("Known DNS RR types:"); + + DUMP_STRING_TABLE(dns_type, int, _DNS_TYPE_MAX); +} + +static void help_dns_classes(void) { + if (arg_legend) + puts("Known DNS RR classes:"); + + DUMP_STRING_TABLE(dns_class, int, _DNS_CLASS_MAX); +} + +static int compat_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("resolvectl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] HOSTNAME|ADDRESS...\n" + "%1$s [OPTIONS...] --service [[NAME] TYPE] DOMAIN\n" + "%1$s [OPTIONS...] --openpgp EMAIL@DOMAIN...\n" + "%1$s [OPTIONS...] --statistics\n" + "%1$s [OPTIONS...] 
--reset-statistics\n" + "\n" + "%2$sResolve domain names, IPv4 and IPv6 addresses, DNS records, and services.%3$s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " -4 Resolve IPv4 addresses\n" + " -6 Resolve IPv6 addresses\n" + " -i --interface=INTERFACE Look on interface\n" + " -p --protocol=PROTO|help Look via protocol\n" + " -t --type=TYPE|help Query RR with DNS type\n" + " -c --class=CLASS|help Query RR with DNS class\n" + " --service Resolve service (SRV)\n" + " --service-address=BOOL Resolve address for services (default: yes)\n" + " --service-txt=BOOL Resolve TXT records for services (default: yes)\n" + " --openpgp Query OpenPGP public key\n" + " --tlsa Query TLS public key\n" + " --cname=BOOL Follow CNAME redirects (default: yes)\n" + " --search=BOOL Use search domains for single-label names\n" + " (default: yes)\n" + " --raw[=payload|packet] Dump the answer as binary data\n" + " --legend=BOOL Print headers and additional info (default: yes)\n" + " --statistics Show resolver statistics\n" + " --reset-statistics Reset resolver statistics\n" + " --status Show link and server status\n" + " --flush-caches Flush all local DNS caches\n" + " --reset-server-features\n" + " Forget learnt DNS server feature levels\n" + " --set-dns=SERVER Set per-interface DNS server address\n" + " --set-domain=DOMAIN Set per-interface search domain\n" + " --set-llmnr=MODE Set per-interface LLMNR mode\n" + " --set-mdns=MODE Set per-interface MulticastDNS mode\n" + " --set-dnsovertls=MODE Set per-interface DNS-over-TLS mode\n" + " --set-dnssec=MODE Set per-interface DNSSEC mode\n" + " --set-nta=DOMAIN Set per-interface DNSSEC NTA\n" + " --revert Revert per-interface configuration\n" + "\nSee the %4$s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int native_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = 
terminal_urlify_man("resolvectl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND ...\n" + "\n" + "%sSend control commands to the network name resolution manager, or%s\n" + "%sresolve domain names, IPv4 and IPv6 addresses, DNS records, and services.%s\n" + "\nCommands:\n" + " query HOSTNAME|ADDRESS... Resolve domain names, IPv4 and IPv6 addresses\n" + " service [[NAME] TYPE] DOMAIN Resolve service (SRV)\n" + " openpgp EMAIL@DOMAIN... Query OpenPGP public key\n" + " tlsa DOMAIN[:PORT]... Query TLS public key\n" + " status [LINK...] Show link and server status\n" + " statistics Show resolver statistics\n" + " reset-statistics Reset resolver statistics\n" + " flush-caches Flush all local DNS caches\n" + " reset-server-features Forget learnt DNS server feature levels\n" + " monitor Monitor DNS queries\n" + " show-cache Show cache contents\n" + " show-server-state Show servers state\n" + " dns [LINK [SERVER...]] Get/set per-interface DNS server address\n" + " domain [LINK [DOMAIN...]] Get/set per-interface search domain\n" + " default-route [LINK [BOOL]] Get/set per-interface default route flag\n" + " llmnr [LINK [MODE]] Get/set per-interface LLMNR mode\n" + " mdns [LINK [MODE]] Get/set per-interface MulticastDNS mode\n" + " dnsovertls [LINK [MODE]] Get/set per-interface DNS-over-TLS mode\n" + " dnssec [LINK [MODE]] Get/set per-interface DNSSEC mode\n" + " nta [LINK [DOMAIN...]] Get/set per-interface DNSSEC NTA\n" + " revert LINK Revert per-interface configuration\n" + " log-level [LEVEL] Get/set logging threshold for systemd-resolved\n" + "\nOptions:\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " -4 Resolve IPv4 addresses\n" + " -6 Resolve IPv6 addresses\n" + " -i --interface=INTERFACE Look on interface\n" + " -p --protocol=PROTO|help Look via protocol\n" + " -t --type=TYPE|help Query RR with DNS type\n" + " -c --class=CLASS|help Query RR with DNS class\n" 
+ " --service-address=BOOL Resolve address for services (default: yes)\n" + " --service-txt=BOOL Resolve TXT records for services (default: yes)\n" + " --cname=BOOL Follow CNAME redirects (default: yes)\n" + " --validate=BOOL Allow DNSSEC validation (default: yes)\n" + " --synthesize=BOOL Allow synthetic response (default: yes)\n" + " --cache=BOOL Allow response from cache (default: yes)\n" + " --stale-data=BOOL Allow response from cache with stale data (default: yes)\n" + " --zone=BOOL Allow response from locally registered mDNS/LLMNR\n" + " records (default: yes)\n" + " --trust-anchor=BOOL Allow response from local trust anchor (default:\n" + " yes)\n" + " --network=BOOL Allow response from network (default: yes)\n" + " --search=BOOL Use search domains for single-label names (default:\n" + " yes)\n" + " --raw[=payload|packet] Dump the answer as binary data\n" + " --legend=BOOL Print headers and additional info (default: yes)\n" + " --json=MODE Output as JSON\n" + " -j Same as --json=pretty on tty, --json=short\n" + " otherwise\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int verb_help(int argc, char **argv, void *userdata) { + return native_help(); +} + +static int compat_parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_LEGEND, + ARG_SERVICE, + ARG_CNAME, + ARG_SERVICE_ADDRESS, + ARG_SERVICE_TXT, + ARG_OPENPGP, + ARG_TLSA, + ARG_RAW, + ARG_SEARCH, + ARG_STATISTICS, + ARG_RESET_STATISTICS, + ARG_STATUS, + ARG_FLUSH_CACHES, + ARG_RESET_SERVER_FEATURES, + ARG_NO_PAGER, + ARG_SET_DNS, + ARG_SET_DOMAIN, + ARG_SET_LLMNR, + ARG_SET_MDNS, + ARG_SET_PRIVATE, + ARG_SET_DNSSEC, + ARG_SET_NTA, + ARG_REVERT_LINK, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "type", required_argument, NULL, 't' }, + { "class", 
required_argument, NULL, 'c' }, + { "legend", required_argument, NULL, ARG_LEGEND }, + { "interface", required_argument, NULL, 'i' }, + { "protocol", required_argument, NULL, 'p' }, + { "cname", required_argument, NULL, ARG_CNAME }, + { "service", no_argument, NULL, ARG_SERVICE }, + { "service-address", required_argument, NULL, ARG_SERVICE_ADDRESS }, + { "service-txt", required_argument, NULL, ARG_SERVICE_TXT }, + { "openpgp", no_argument, NULL, ARG_OPENPGP }, + { "tlsa", optional_argument, NULL, ARG_TLSA }, + { "raw", optional_argument, NULL, ARG_RAW }, + { "search", required_argument, NULL, ARG_SEARCH }, + { "statistics", no_argument, NULL, ARG_STATISTICS, }, + { "reset-statistics", no_argument, NULL, ARG_RESET_STATISTICS }, + { "status", no_argument, NULL, ARG_STATUS }, + { "flush-caches", no_argument, NULL, ARG_FLUSH_CACHES }, + { "reset-server-features", no_argument, NULL, ARG_RESET_SERVER_FEATURES }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "set-dns", required_argument, NULL, ARG_SET_DNS }, + { "set-domain", required_argument, NULL, ARG_SET_DOMAIN }, + { "set-llmnr", required_argument, NULL, ARG_SET_LLMNR }, + { "set-mdns", required_argument, NULL, ARG_SET_MDNS }, + { "set-dnsovertls", required_argument, NULL, ARG_SET_PRIVATE }, + { "set-dnssec", required_argument, NULL, ARG_SET_DNSSEC }, + { "set-nta", required_argument, NULL, ARG_SET_NTA }, + { "revert", no_argument, NULL, ARG_REVERT_LINK }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h46i:t:c:p:", options, NULL)) >= 0) + switch (c) { + + case 'h': + return compat_help(); + + case ARG_VERSION: + return version(); + + case '4': + arg_family = AF_INET; + break; + + case '6': + arg_family = AF_INET6; + break; + + case 'i': + r = ifname_mangle(optarg); + if (r < 0) + return r; + break; + + case 't': + if (streq(optarg, "help")) { + help_dns_types(); + return 0; + } + + r = dns_type_from_string(optarg); + if (r < 0) + return 
log_error_errno(r, "Failed to parse RR record type %s: %m", optarg); + + arg_type = (uint16_t) r; + assert((int) arg_type == r); + + arg_mode = MODE_RESOLVE_RECORD; + break; + + case 'c': + if (streq(optarg, "help")) { + help_dns_classes(); + return 0; + } + + r = dns_class_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse RR record class %s: %m", optarg); + + arg_class = (uint16_t) r; + assert((int) arg_class == r); + + break; + + case ARG_LEGEND: + r = parse_boolean_argument("--legend=", optarg, &arg_legend); + if (r < 0) + return r; + break; + + case 'p': + if (streq(optarg, "help")) { + help_protocol_types(); + return 0; + } else if (streq(optarg, "dns")) + arg_flags |= SD_RESOLVED_DNS; + else if (streq(optarg, "llmnr")) + arg_flags |= SD_RESOLVED_LLMNR; + else if (streq(optarg, "llmnr-ipv4")) + arg_flags |= SD_RESOLVED_LLMNR_IPV4; + else if (streq(optarg, "llmnr-ipv6")) + arg_flags |= SD_RESOLVED_LLMNR_IPV6; + else if (streq(optarg, "mdns")) + arg_flags |= SD_RESOLVED_MDNS; + else if (streq(optarg, "mdns-ipv4")) + arg_flags |= SD_RESOLVED_MDNS_IPV4; + else if (streq(optarg, "mdns-ipv6")) + arg_flags |= SD_RESOLVED_MDNS_IPV6; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown protocol specifier: %s", optarg); + + break; + + case ARG_SERVICE: + arg_mode = MODE_RESOLVE_SERVICE; + break; + + case ARG_OPENPGP: + arg_mode = MODE_RESOLVE_OPENPGP; + break; + + case ARG_TLSA: + arg_mode = MODE_RESOLVE_TLSA; + if (!optarg || service_family_is_valid(optarg)) + arg_service_family = optarg; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown service family \"%s\".", optarg); + break; + + case ARG_RAW: + if (on_tty()) + return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), + "Refusing to write binary data to tty."); + + if (optarg == NULL || streq(optarg, "payload")) + arg_raw = RAW_PAYLOAD; + else if (streq(optarg, "packet")) + arg_raw = RAW_PACKET; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown 
--raw specifier \"%s\".", + optarg); + + arg_legend = false; + break; + + case ARG_CNAME: + r = parse_boolean_argument("--cname=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_CNAME, r == 0); + break; + + case ARG_SERVICE_ADDRESS: + r = parse_boolean_argument("--service-address=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_ADDRESS, r == 0); + break; + + case ARG_SERVICE_TXT: + r = parse_boolean_argument("--service-txt=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_TXT, r == 0); + break; + + case ARG_SEARCH: + r = parse_boolean_argument("--search=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_SEARCH, r == 0); + break; + + case ARG_STATISTICS: + arg_mode = MODE_STATISTICS; + break; + + case ARG_RESET_STATISTICS: + arg_mode = MODE_RESET_STATISTICS; + break; + + case ARG_FLUSH_CACHES: + arg_mode = MODE_FLUSH_CACHES; + break; + + case ARG_RESET_SERVER_FEATURES: + arg_mode = MODE_RESET_SERVER_FEATURES; + break; + + case ARG_STATUS: + arg_mode = MODE_STATUS; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_SET_DNS: + r = strv_extend(&arg_set_dns, optarg); + if (r < 0) + return log_oom(); + + arg_mode = MODE_SET_LINK; + break; + + case ARG_SET_DOMAIN: + r = strv_extend(&arg_set_domain, optarg); + if (r < 0) + return log_oom(); + + arg_mode = MODE_SET_LINK; + break; + + case ARG_SET_LLMNR: + arg_set_llmnr = optarg; + arg_mode = MODE_SET_LINK; + break; + + case ARG_SET_MDNS: + arg_set_mdns = optarg; + arg_mode = MODE_SET_LINK; + break; + + case ARG_SET_PRIVATE: + arg_set_dns_over_tls = optarg; + arg_mode = MODE_SET_LINK; + break; + + case ARG_SET_DNSSEC: + arg_set_dnssec = optarg; + arg_mode = MODE_SET_LINK; + break; + + case ARG_SET_NTA: + r = strv_extend(&arg_set_nta, optarg); + if (r < 0) + return log_oom(); + + arg_mode = MODE_SET_LINK; + break; + + case ARG_REVERT_LINK: + arg_mode = MODE_REVERT_LINK; 
+ break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_type == 0 && arg_class != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--class= may only be used in conjunction with --type=."); + + if (arg_type != 0 && arg_mode == MODE_RESOLVE_SERVICE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--service and --type= may not be combined."); + + if (arg_type != 0 && arg_class == 0) + arg_class = DNS_CLASS_IN; + + if (arg_class != 0 && arg_type == 0) + arg_type = DNS_TYPE_A; + + if (IN_SET(arg_mode, MODE_SET_LINK, MODE_REVERT_LINK)) { + + if (arg_ifindex <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--set-dns=, --set-domain=, --set-llmnr=, --set-mdns=, --set-dnsovertls=, --set-dnssec=, --set-nta= and --revert require --interface=."); + } + + return 1 /* work to do */; +} + +static int native_parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_LEGEND, + ARG_CNAME, + ARG_VALIDATE, + ARG_SYNTHESIZE, + ARG_CACHE, + ARG_ZONE, + ARG_TRUST_ANCHOR, + ARG_NETWORK, + ARG_SERVICE_ADDRESS, + ARG_SERVICE_TXT, + ARG_RAW, + ARG_SEARCH, + ARG_NO_PAGER, + ARG_JSON, + ARG_STALE_DATA + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "type", required_argument, NULL, 't' }, + { "class", required_argument, NULL, 'c' }, + { "legend", required_argument, NULL, ARG_LEGEND }, + { "interface", required_argument, NULL, 'i' }, + { "protocol", required_argument, NULL, 'p' }, + { "cname", required_argument, NULL, ARG_CNAME }, + { "validate", required_argument, NULL, ARG_VALIDATE }, + { "synthesize", required_argument, NULL, ARG_SYNTHESIZE }, + { "cache", required_argument, NULL, ARG_CACHE }, + { "zone", required_argument, NULL, ARG_ZONE }, + { "trust-anchor", required_argument, NULL, ARG_TRUST_ANCHOR }, + { "network", required_argument, NULL, ARG_NETWORK }, + { "service-address", required_argument, NULL, ARG_SERVICE_ADDRESS 
}, + { "service-txt", required_argument, NULL, ARG_SERVICE_TXT }, + { "raw", optional_argument, NULL, ARG_RAW }, + { "search", required_argument, NULL, ARG_SEARCH }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "json", required_argument, NULL, ARG_JSON }, + { "stale-data", required_argument, NULL, ARG_STALE_DATA }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h46i:t:c:p:j", options, NULL)) >= 0) + switch (c) { + + case 'h': + return native_help(); + + case ARG_VERSION: + return version(); + + case '4': + arg_family = AF_INET; + break; + + case '6': + arg_family = AF_INET6; + break; + + case 'i': + r = ifname_mangle(optarg); + if (r < 0) + return r; + break; + + case 't': + if (streq(optarg, "help")) { + help_dns_types(); + return 0; + } + + r = dns_type_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse RR record type %s: %m", optarg); + + arg_type = (uint16_t) r; + assert((int) arg_type == r); + + break; + + case 'c': + if (streq(optarg, "help")) { + help_dns_classes(); + return 0; + } + + r = dns_class_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse RR record class %s: %m", optarg); + + arg_class = (uint16_t) r; + assert((int) arg_class == r); + + break; + + case ARG_LEGEND: + r = parse_boolean_argument("--legend=", optarg, &arg_legend); + if (r < 0) + return r; + break; + + case 'p': + if (streq(optarg, "help")) { + help_protocol_types(); + return 0; + } else if (streq(optarg, "dns")) + arg_flags |= SD_RESOLVED_DNS; + else if (streq(optarg, "llmnr")) + arg_flags |= SD_RESOLVED_LLMNR; + else if (streq(optarg, "llmnr-ipv4")) + arg_flags |= SD_RESOLVED_LLMNR_IPV4; + else if (streq(optarg, "llmnr-ipv6")) + arg_flags |= SD_RESOLVED_LLMNR_IPV6; + else if (streq(optarg, "mdns")) + arg_flags |= SD_RESOLVED_MDNS; + else if (streq(optarg, "mdns-ipv4")) + arg_flags |= SD_RESOLVED_MDNS_IPV4; + else if (streq(optarg, "mdns-ipv6")) + arg_flags |= 
SD_RESOLVED_MDNS_IPV6; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown protocol specifier: %s", + optarg); + + break; + + case ARG_RAW: + if (on_tty()) + return log_error_errno(SYNTHETIC_ERRNO(ENOTTY), + "Refusing to write binary data to tty."); + + if (optarg == NULL || streq(optarg, "payload")) + arg_raw = RAW_PAYLOAD; + else if (streq(optarg, "packet")) + arg_raw = RAW_PACKET; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown --raw specifier \"%s\".", + optarg); + + arg_legend = false; + break; + + case ARG_CNAME: + r = parse_boolean_argument("--cname=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_CNAME, r == 0); + break; + + case ARG_VALIDATE: + r = parse_boolean_argument("--validate=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_VALIDATE, r == 0); + break; + + case ARG_SYNTHESIZE: + r = parse_boolean_argument("--synthesize=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_SYNTHESIZE, r == 0); + break; + + case ARG_CACHE: + r = parse_boolean_argument("--cache=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_CACHE, r == 0); + break; + + case ARG_STALE_DATA: + r = parse_boolean_argument("--stale-data=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_STALE, r == 0); + break; + + case ARG_ZONE: + r = parse_boolean_argument("--zone=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_ZONE, r == 0); + break; + + case ARG_TRUST_ANCHOR: + r = parse_boolean_argument("--trust-anchor=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_TRUST_ANCHOR, r == 0); + break; + + case ARG_NETWORK: + r = parse_boolean_argument("--network=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_NETWORK, r == 0); + break; + + case ARG_SERVICE_ADDRESS: + r = parse_boolean_argument("--service-address=", optarg, NULL); 
+ if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_ADDRESS, r == 0); + break; + + case ARG_SERVICE_TXT: + r = parse_boolean_argument("--service-txt=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_TXT, r == 0); + break; + + case ARG_SEARCH: + r = parse_boolean_argument("--search=", optarg, NULL); + if (r < 0) + return r; + SET_FLAG(arg_flags, SD_RESOLVED_NO_SEARCH, r == 0); + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + + break; + + case 'j': + arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_type == 0 && arg_class != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--class= may only be used in conjunction with --type=."); + + if (arg_type != 0 && arg_class == 0) + arg_class = DNS_CLASS_IN; + + if (arg_class != 0 && arg_type == 0) + arg_type = DNS_TYPE_A; + + return 1 /* work to do */; +} + +static int native_main(int argc, char *argv[], sd_bus *bus) { + + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, verb_help }, + { "status", VERB_ANY, VERB_ANY, VERB_DEFAULT, verb_status }, + { "query", 2, VERB_ANY, 0, verb_query }, + { "service", 2, 4, 0, verb_service }, + { "openpgp", 2, VERB_ANY, 0, verb_openpgp }, + { "tlsa", 2, VERB_ANY, 0, verb_tlsa }, + { "statistics", VERB_ANY, 1, 0, show_statistics }, + { "reset-statistics", VERB_ANY, 1, 0, reset_statistics }, + { "flush-caches", VERB_ANY, 1, 0, flush_caches }, + { "reset-server-features", VERB_ANY, 1, 0, reset_server_features }, + { "dns", VERB_ANY, VERB_ANY, 0, verb_dns }, + { "domain", VERB_ANY, VERB_ANY, 0, verb_domain }, + { "default-route", VERB_ANY, 3, 0, verb_default_route }, + { "llmnr", VERB_ANY, 3, 0, verb_llmnr }, + { "mdns", VERB_ANY, 3, 0, verb_mdns }, + { "dnsovertls", VERB_ANY, 3, 0, 
verb_dns_over_tls }, + { "dnssec", VERB_ANY, 3, 0, verb_dnssec }, + { "nta", VERB_ANY, VERB_ANY, 0, verb_nta }, + { "revert", VERB_ANY, 2, 0, verb_revert_link }, + { "log-level", VERB_ANY, 2, 0, verb_log_level }, + { "monitor", VERB_ANY, 1, 0, verb_monitor }, + { "show-cache", VERB_ANY, 1, 0, verb_show_cache }, + { "show-server-state", VERB_ANY, 1, 0, verb_show_server_state}, + {} + }; + + return dispatch_verb(argc, argv, verbs, bus); +} + +static int translate(const char *verb, const char *single_arg, size_t num_args, char **args, sd_bus *bus) { + char **fake, **p; + size_t num; + + assert(verb); + assert(num_args == 0 || args); + + num = !!single_arg + num_args + 1; + + p = fake = newa0(char *, num + 1); + *p++ = (char *) verb; + if (single_arg) + *p++ = (char *) single_arg; + for (size_t i = 0; i < num_args; i++) + *p++ = args[i]; + + optind = 0; + return native_main((int) num, fake, bus); +} + +static int compat_main(int argc, char *argv[], sd_bus *bus) { + int r = 0; + + switch (arg_mode) { + case MODE_RESOLVE_HOST: + case MODE_RESOLVE_RECORD: + return translate("query", NULL, argc - optind, argv + optind, bus); + + case MODE_RESOLVE_SERVICE: + return translate("service", NULL, argc - optind, argv + optind, bus); + + case MODE_RESOLVE_OPENPGP: + return translate("openpgp", NULL, argc - optind, argv + optind, bus); + + case MODE_RESOLVE_TLSA: + return translate("tlsa", arg_service_family, argc - optind, argv + optind, bus); + + case MODE_STATISTICS: + return translate("statistics", NULL, 0, NULL, bus); + + case MODE_RESET_STATISTICS: + return translate("reset-statistics", NULL, 0, NULL, bus); + + case MODE_FLUSH_CACHES: + return translate("flush-caches", NULL, 0, NULL, bus); + + case MODE_RESET_SERVER_FEATURES: + return translate("reset-server-features", NULL, 0, NULL, bus); + + case MODE_STATUS: + return translate("status", NULL, argc - optind, argv + optind, bus); + + case MODE_SET_LINK: + assert(arg_ifname); + + if (arg_set_dns) { + r = translate("dns", 
arg_ifname, strv_length(arg_set_dns), arg_set_dns, bus); + if (r < 0) + return r; + } + + if (arg_set_domain) { + r = translate("domain", arg_ifname, strv_length(arg_set_domain), arg_set_domain, bus); + if (r < 0) + return r; + } + + if (arg_set_nta) { + r = translate("nta", arg_ifname, strv_length(arg_set_nta), arg_set_nta, bus); + if (r < 0) + return r; + } + + if (arg_set_llmnr) { + r = translate("llmnr", arg_ifname, 1, (char **) &arg_set_llmnr, bus); + if (r < 0) + return r; + } + + if (arg_set_mdns) { + r = translate("mdns", arg_ifname, 1, (char **) &arg_set_mdns, bus); + if (r < 0) + return r; + } + + if (arg_set_dns_over_tls) { + r = translate("dnsovertls", arg_ifname, 1, (char **) &arg_set_dns_over_tls, bus); + if (r < 0) + return r; + } + + if (arg_set_dnssec) { + r = translate("dnssec", arg_ifname, 1, (char **) &arg_set_dnssec, bus); + if (r < 0) + return r; + } + + return r; + + case MODE_REVERT_LINK: + assert(arg_ifname); + + return translate("revert", arg_ifname, 0, NULL, bus); + + case _MODE_INVALID: + assert_not_reached(); + } + + return 0; +} + +static int run(int argc, char **argv) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + bool compat = false; + int r; + + setlocale(LC_ALL, ""); + log_setup(); + + if (invoked_as(argv, "resolvconf")) { + compat = true; + r = resolvconf_parse_argv(argc, argv); + } else if (invoked_as(argv, "systemd-resolve")) { + compat = true; + r = compat_parse_argv(argc, argv); + } else + r = native_parse_argv(argc, argv); + if (r <= 0) + return r; + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "sd_bus_open_system: %m"); + + if (compat) + return compat_main(argc, argv, bus); + + return native_main(argc, argv, bus); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/resolve/resolvectl.h b/src/resolve/resolvectl.h new file mode 100644 index 0000000..3e404da --- /dev/null +++ b/src/resolve/resolvectl.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + 
/* Operation selected by the legacy command line parsers. compat_main() switches on this value and
 * translates it into the native verb noted next to each entry. */
typedef enum ExecutionMode {
        MODE_RESOLVE_HOST = 0,           /* "query" */
        MODE_RESOLVE_RECORD,             /* "query" as well; selected by --type= in the compat parser */
        MODE_RESOLVE_SERVICE,            /* "service" */
        MODE_RESOLVE_OPENPGP,            /* "openpgp" */
        MODE_RESOLVE_TLSA,               /* "tlsa" */
        MODE_STATISTICS,                 /* "statistics" */
        MODE_RESET_STATISTICS,           /* "reset-statistics" */
        MODE_FLUSH_CACHES,               /* "flush-caches" */
        MODE_RESET_SERVER_FEATURES,      /* "reset-server-features" */
        MODE_STATUS,                     /* "status" */
        MODE_SET_LINK,                   /* "dns", "domain", "nta", "llmnr", "mdns", "dnsovertls", "dnssec" */
        MODE_REVERT_LINK,                /* "revert" */
        _MODE_INVALID = -EINVAL,         /* negative marker, never a valid mode */
} ExecutionMode;
aborting query."); + dns_query_complete(q, DNS_TRANSACTION_ABORTED); + return 0; +} + +static int dns_query_bus_track(DnsQuery *q, sd_bus_message *m) { + int r; + + assert(q); + assert(m); + + if (!q->bus_track) { + r = sd_bus_track_new(sd_bus_message_get_bus(m), &q->bus_track, query_on_bus_track, q); + if (r < 0) + return r; + } + + r = sd_bus_track_add_sender(q->bus_track, m); + if (r < 0) + return r; + + return 0; +} + +static sd_bus_message *dns_query_steal_request(DnsQuery *q) { + assert(q); + + /* Find the main query, it's the one that owns the message */ + while (q->auxiliary_for) + q = q->auxiliary_for; + + /* Let's take the request message out of the DnsQuery object, so that we never send requests twice */ + return TAKE_PTR(q->bus_request); +} + +_sd_printf_(3, 4) static int reply_method_errorf( + DnsQuery *query, + const char *error_name, + const char *format, + ...) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + va_list ap; + int r; + + assert(query); + assert(format); + + req = dns_query_steal_request(query); + if (!req) /* No bus message set anymore? then we already replied already, let's not answer a second time */ + return 0; + + va_start(ap, format); + r = sd_bus_reply_method_errorfv(req, error_name, format, ap); + va_end(ap); + + return r; +} + +_sd_printf_(3, 4) static int reply_method_errnof( + DnsQuery *query, + int err, + const char *format, + ...) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + int r; + + assert(query); + + req = dns_query_steal_request(query); + if (!req) /* No bus message set anymore? 
then we already replied already, let's not answer a second time */ + return 0; + + if (format) { + va_list ap; + + va_start(ap, format); + r = sd_bus_reply_method_errnofv(req, err, format, ap); + va_end(ap); + } else + r = sd_bus_reply_method_errno(req, err, NULL); + + return r; +} + +static int reply_query_state(DnsQuery *q) { + assert(q); + + switch (q->state) { + + case DNS_TRANSACTION_NO_SERVERS: + return reply_method_errorf(q, BUS_ERROR_NO_NAME_SERVERS, "No appropriate name servers or networks for name found"); + + case DNS_TRANSACTION_TIMEOUT: + return reply_method_errorf(q, SD_BUS_ERROR_TIMEOUT, "Query timed out"); + + case DNS_TRANSACTION_ATTEMPTS_MAX_REACHED: + return reply_method_errorf(q, SD_BUS_ERROR_TIMEOUT, "All attempts to contact name servers or networks failed"); + + case DNS_TRANSACTION_INVALID_REPLY: + return reply_method_errorf(q, BUS_ERROR_INVALID_REPLY, "Received invalid reply"); + + case DNS_TRANSACTION_ERRNO: + return reply_method_errnof(q, q->answer_errno, "Lookup failed due to system error: %m"); + + case DNS_TRANSACTION_ABORTED: + return reply_method_errorf(q, BUS_ERROR_ABORTED, "Query aborted"); + + case DNS_TRANSACTION_DNSSEC_FAILED: + return reply_method_errorf(q, BUS_ERROR_DNSSEC_FAILED, "DNSSEC validation failed: %s", + dnssec_result_to_string(q->answer_dnssec_result)); + + case DNS_TRANSACTION_NO_TRUST_ANCHOR: + return reply_method_errorf(q, BUS_ERROR_NO_TRUST_ANCHOR, "No suitable trust anchor known"); + + case DNS_TRANSACTION_RR_TYPE_UNSUPPORTED: + return reply_method_errorf(q, BUS_ERROR_RR_TYPE_UNSUPPORTED, "Server does not support requested resource record type"); + + case DNS_TRANSACTION_NETWORK_DOWN: + return reply_method_errorf(q, BUS_ERROR_NETWORK_DOWN, "Network is down"); + + case DNS_TRANSACTION_NOT_FOUND: + /* We return this as NXDOMAIN. This is only generated when a host doesn't implement LLMNR/TCP, and we + * thus quickly know that we cannot resolve an in-addr.arpa or ip6.arpa address. 
*/ + return reply_method_errorf(q, BUS_ERROR_DNS_NXDOMAIN, "'%s' not found", dns_query_string(q)); + + case DNS_TRANSACTION_NO_SOURCE: + return reply_method_errorf(q, BUS_ERROR_NO_SOURCE, "All suitable resolution sources turned off"); + + case DNS_TRANSACTION_STUB_LOOP: + return reply_method_errorf(q, BUS_ERROR_STUB_LOOP, "Configured DNS server loops back to us"); + + case DNS_TRANSACTION_RCODE_FAILURE: { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + + req = dns_query_steal_request(q); + if (!req) /* No bus message set anymore? then we already replied already, let's not answer a second time */ + return 0; + + if (q->answer_rcode == DNS_RCODE_NXDOMAIN) + sd_bus_error_setf(&error, BUS_ERROR_DNS_NXDOMAIN, "Name '%s' not found", dns_query_string(q)); + else { + const char *rc, *n; + + rc = FORMAT_DNS_RCODE(q->answer_rcode); + n = strjoina(_BUS_ERROR_DNS, rc); + sd_bus_error_setf(&error, n, "Could not resolve '%s', server or network returned error %s", dns_query_string(q), rc); + } + + return sd_bus_reply_method_error(req, &error); + } + + case DNS_TRANSACTION_NULL: + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + case DNS_TRANSACTION_SUCCESS: + default: + assert_not_reached(); + } +} + +static int append_address(sd_bus_message *reply, DnsResourceRecord *rr, int ifindex) { + int r; + + assert(reply); + assert(rr); + + r = sd_bus_message_open_container(reply, 'r', "iiay"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", ifindex); + if (r < 0) + return r; + + if (rr->key->type == DNS_TYPE_A) { + r = sd_bus_message_append(reply, "i", AF_INET); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &rr->a.in_addr, sizeof(struct in_addr)); + + } else if (rr->key->type == DNS_TYPE_AAAA) { + r = sd_bus_message_append(reply, "i", AF_INET6); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &rr->aaaa.in6_addr, 
sizeof(struct in6_addr)); + } else + return -EAFNOSUPPORT; + + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return 0; +} + +static void bus_method_resolve_hostname_complete(DnsQuery *query) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *canonical = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = query; + _cleanup_free_ char *normalized = NULL; + DnsQuestion *question; + DnsResourceRecord *rr; + unsigned added = 0; + int ifindex, r; + + assert(q); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + r = reply_query_state(q); + goto finish; + } + + r = dns_query_process_cname_many(q); + if (r == -ELOOP) { + r = reply_method_errorf(q, BUS_ERROR_CNAME_LOOP, "CNAME loop detected, or CNAME resolving disabled on '%s'", dns_query_string(q)); + goto finish; + } + if (r < 0) + goto finish; + if (r == DNS_QUERY_CNAME) { + /* This was a cname, and the query was restarted. 
*/ + TAKE_PTR(q); + return; + } + + r = sd_bus_message_new_method_return(q->bus_request, &reply); + if (r < 0) + goto finish; + + r = sd_bus_message_open_container(reply, 'a', "(iiay)"); + if (r < 0) + goto finish; + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, q->answer) { + + r = dns_question_matches_rr(question, rr, DNS_SEARCH_DOMAIN_NAME(q->answer_search_domain)); + if (r < 0) + goto finish; + if (r == 0) + continue; + + r = append_address(reply, rr, ifindex); + if (r < 0) + goto finish; + + if (!canonical) + canonical = dns_resource_record_ref(rr); + + added++; + } + + if (added <= 0) { + r = reply_method_errorf(q, BUS_ERROR_NO_SUCH_RR, "'%s' does not have any RR of the requested type", dns_query_string(q)); + goto finish; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto finish; + + /* The key names are not necessarily normalized, make sure that they are when we return them to our + * bus clients. */ + assert(canonical); + r = dns_name_normalize(dns_resource_key_name(canonical->key), 0, &normalized); + if (r < 0) + goto finish; + + /* Return the precise spelling and uppercasing and CNAME target reported by the server */ + r = sd_bus_message_append( + reply, "st", + normalized, + dns_query_reply_flags_make(q)); + if (r < 0) + goto finish; + + q->bus_request = sd_bus_message_unref(q->bus_request); + r = sd_bus_send(q->manager->bus, reply, NULL); + +finish: + if (r < 0) { + log_error_errno(r, "Failed to send hostname reply: %m"); + (void) reply_method_errnof(q, r, NULL); + } +} + +static int validate_and_mangle_flags( + const char *name, + uint64_t *flags, + uint64_t ok, + sd_bus_error *error) { + + assert(flags); + + /* Checks that the client supplied interface index and flags parameter actually are valid and make + * sense in our method call context. Specifically: + * + * 1. Checks that the interface index is either 0 (meaning *all* interfaces) or positive + * + * 2. 
Only the protocols flags and a bunch of NO_XYZ flags are set, at most. Plus additional flags + * specific to our method, passed in the "ok" parameter. + * + * 3. If zero protocol flags are specified it is automatically turned into *all* protocols. This way + * clients can simply pass 0 as flags and all will work as it should. They can also use this so + * that clients don't have to know all the protocols resolved implements, but can just specify 0 + * to mean "all supported protocols". + */ + + if (*flags & ~(SD_RESOLVED_PROTOCOLS_ALL| + SD_RESOLVED_NO_CNAME| + SD_RESOLVED_NO_VALIDATE| + SD_RESOLVED_NO_SYNTHESIZE| + SD_RESOLVED_NO_CACHE| + SD_RESOLVED_NO_ZONE| + SD_RESOLVED_NO_TRUST_ANCHOR| + SD_RESOLVED_NO_NETWORK| + SD_RESOLVED_NO_STALE| + ok)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid flags parameter"); + + if ((*flags & SD_RESOLVED_PROTOCOLS_ALL) == 0) /* If no protocol is enabled, enable all */ + *flags |= SD_RESOLVED_PROTOCOLS_ALL; + + /* Imply SD_RESOLVED_NO_SEARCH if permitted and name is dot suffixed. */ + if (name && FLAGS_SET(ok, SD_RESOLVED_NO_SEARCH) && dns_name_dot_suffixed(name) > 0) + *flags |= SD_RESOLVED_NO_SEARCH; + + return 0; +} + +static int parse_as_address(sd_bus_message *m, int ifindex, const char *hostname, int family, uint64_t flags) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *canonical = NULL; + union in_addr_union parsed; + int r, ff, parsed_ifindex = 0; + + /* Check if the hostname is actually already an IP address formatted as string. In that case just parse it, + * let's not attempt to look it up. 
*/ + + r = in_addr_ifindex_from_string_auto(hostname, &ff, &parsed, &parsed_ifindex); + if (r < 0) /* not an address */ + return 0; + + if (family != AF_UNSPEC && ff != family) + return sd_bus_reply_method_errorf(m, BUS_ERROR_NO_SUCH_RR, "The specified address is not of the requested family."); + if (ifindex > 0 && parsed_ifindex > 0 && parsed_ifindex != ifindex) + return sd_bus_reply_method_errorf(m, BUS_ERROR_NO_SUCH_RR, "The specified address interface index does not match requested interface."); + + if (parsed_ifindex > 0) + ifindex = parsed_ifindex; + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(iiay)"); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'r', "iiay"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "ii", ifindex, ff); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &parsed, FAMILY_ADDRESS_SIZE(ff)); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + /* When an IP address is specified we just return it as canonical name, in order to avoid a DNS + * look-up. However, we reformat it to make sure it's in a truly canonical form (i.e. on IPv6 the inner + * omissions are always done the same way). 
*/ + r = in_addr_ifindex_to_string(ff, &parsed, ifindex, &canonical); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "st", canonical, + SD_RESOLVED_FLAGS_MAKE(dns_synthesize_protocol(flags), ff, true, true) | + SD_RESOLVED_SYNTHETIC); + if (r < 0) + return r; + + return sd_bus_send(sd_bus_message_get_bus(m), reply, NULL); +} + +void bus_client_log(sd_bus_message *m, const char *what) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + const char *comm = NULL; + uid_t uid = UID_INVALID; + pid_t pid = 0; + int r; + + assert(m); + assert(what); + + if (!DEBUG_LOGGING) + return; + + r = sd_bus_query_sender_creds(m, SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|SD_BUS_CREDS_COMM|SD_BUS_CREDS_AUGMENT, &creds); + if (r < 0) + return (void) log_debug_errno(r, "Failed to query client credentials, ignoring: %m"); + + (void) sd_bus_creds_get_uid(creds, &uid); + (void) sd_bus_creds_get_pid(creds, &pid); + (void) sd_bus_creds_get_comm(creds, &comm); + + log_debug("D-Bus %s request from client PID " PID_FMT " (%s) with UID " UID_FMT, + what, pid, strna(comm), uid); +} + +static int bus_method_resolve_hostname(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(dns_question_unrefp) DnsQuestion *question_idna = NULL, *question_utf8 = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + Manager *m = ASSERT_PTR(userdata); + const char *hostname; + int family, ifindex; + uint64_t flags; + int r; + + assert(message); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(message, "isit", &ifindex, &hostname, &family, &flags); + if (r < 0) + return r; + + if (ifindex < 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface index"); + + if (!IN_SET(family, AF_INET, AF_INET6, AF_UNSPEC)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown address family %i", family); + + r = validate_and_mangle_flags(hostname, &flags, SD_RESOLVED_NO_SEARCH, error); + if (r < 0) + return r; + + r = 
parse_as_address(message, ifindex, hostname, family, flags); + if (r != 0) + return r; + + r = dns_name_is_valid(hostname); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid hostname '%s'", hostname); + + r = dns_question_new_address(&question_utf8, family, hostname, false); + if (r < 0) + return r; + /* -EALREADY from the IDNA variant is tolerated: question_idna then presumably stays NULL (TODO confirm) and the UTF-8 question is used in its place below. */ + r = dns_question_new_address(&question_idna, family, hostname, true); + if (r < 0 && r != -EALREADY) + return r; + + bus_client_log(message, "hostname resolution"); + + r = dns_query_new(m, &q, question_utf8, question_idna ?: question_utf8, NULL, ifindex, flags); + if (r < 0) + return r; + + q->bus_request = sd_bus_message_ref(message); + q->request_family = family; + q->complete = bus_method_resolve_hostname_complete; + + r = dns_query_bus_track(q, message); + if (r < 0) + return r; + + r = dns_query_go(q); + if (r < 0) + return r; + + TAKE_PTR(q); + return 1; +} + /* Completion callback installed by bus_method_resolve_address(). Takes over query (handed to the dns_query_freep cleanup handler unless the query was restarted for a CNAME) and replies to q->bus_request with an array of (ifindex, normalized PTR name) entries followed by the reply flags, or with an error. */ +static void bus_method_resolve_address_complete(DnsQuery *query) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = query; + DnsQuestion *question; + DnsResourceRecord *rr; + unsigned added = 0; + int ifindex, r; + + assert(q); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + r = reply_query_state(q); + goto finish; + } + + r = dns_query_process_cname_many(q); + if (r == -ELOOP) { + r = reply_method_errorf(q, BUS_ERROR_CNAME_LOOP, "CNAME loop detected, or CNAME resolving disabled on '%s'", dns_query_string(q)); + goto finish; + } + if (r < 0) + goto finish; + if (r == DNS_QUERY_CNAME) { + /* This was a cname, and the query was restarted. 
*/ + TAKE_PTR(q); + return; + } + + r = sd_bus_message_new_method_return(q->bus_request, &reply); + if (r < 0) + goto finish; + + r = sd_bus_message_open_container(reply, 'a', "(is)"); + if (r < 0) + goto finish; + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, q->answer) { + _cleanup_free_ char *normalized = NULL; + + r = dns_question_matches_rr(question, rr, NULL); + if (r < 0) + goto finish; + if (r == 0) + continue; + + r = dns_name_normalize(rr->ptr.name, 0, &normalized); + if (r < 0) + goto finish; + + r = sd_bus_message_append(reply, "(is)", ifindex, normalized); + if (r < 0) + goto finish; + + added++; + } + + if (added <= 0) { + r = reply_method_errorf(q, BUS_ERROR_NO_SUCH_RR, + "Address %s does not have any RR of requested type", + IN_ADDR_TO_STRING(q->request_family, &q->request_address)); + goto finish; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto finish; + + r = sd_bus_message_append(reply, "t", dns_query_reply_flags_make(q)); + if (r < 0) + goto finish; + + q->bus_request = sd_bus_message_unref(q->bus_request); + r = sd_bus_send(q->manager->bus, reply, NULL); + +finish: + if (r < 0) { + log_error_errno(r, "Failed to send address reply: %m"); + (void) reply_method_errnof(q, r, NULL); + } +} + /* D-Bus method handler: reads an ifindex, then an address via bus_message_read_in_addr_auto(), then the flags; builds a reverse-lookup question for the address and starts a DnsQuery with SD_RESOLVED_NO_SEARCH added to the flags. Returns 1 once the query has been started, otherwise the result of the failing step. */ +static int bus_method_resolve_address(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + Manager *m = ASSERT_PTR(userdata); + union in_addr_union a; + int family, ifindex; + uint64_t flags; + int r; + + assert(message); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(message, "i", &ifindex); + if (r < 0) + return r; + + r = bus_message_read_in_addr_auto(message, error, &family, &a); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "t", &flags); + if (r < 0) + return r; + + if (ifindex < 0) + return 
sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface index"); + + r = validate_and_mangle_flags(NULL, &flags, 0, error); + if (r < 0) + return r; + + r = dns_question_new_reverse(&question, family, &a); + if (r < 0) + return r; + + bus_client_log(message, "address resolution"); + + r = dns_query_new(m, &q, question, question, NULL, ifindex, flags|SD_RESOLVED_NO_SEARCH); + if (r < 0) + return r; + + q->bus_request = sd_bus_message_ref(message); + q->request_family = family; + q->request_address = a; + q->complete = bus_method_resolve_address_complete; + + r = dns_query_bus_track(q, message); + if (r < 0) + return r; + + r = dns_query_go(q); + if (r < 0) + return r; + + TAKE_PTR(q); + return 1; +} + /* Appends one struct of signature iqqay to m: ifindex, RR class, RR type and the wire-format bytes produced by dns_resource_record_to_wire_format(). Returns the result of the first failing call, or else of closing the container. */ +static int bus_message_append_rr(sd_bus_message *m, DnsResourceRecord *rr, int ifindex) { + int r; + + assert(m); + assert(rr); + + r = sd_bus_message_open_container(m, 'r', "iqqay"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "iqq", + ifindex, + rr->key->class, + rr->key->type); + if (r < 0) + return r; + + r = dns_resource_record_to_wire_format(rr, false); + if (r < 0) + return r; + + r = sd_bus_message_append_array(m, 'y', rr->wire_format, rr->wire_format_size); + if (r < 0) + return r; + + return sd_bus_message_close_container(m); +} + /* Completion callback installed by bus_method_resolve_record(). Replies to q->bus_request with one (iqqay) entry per answer RR that matches the question, followed by the reply flags; replies with BUS_ERROR_NO_SUCH_RR if no RR matched. */ +static void bus_method_resolve_record_complete(DnsQuery *query) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = query; + DnsResourceRecord *rr; + DnsQuestion *question; + unsigned added = 0; + int ifindex; + int r; + + assert(q); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + r = reply_query_state(q); + goto finish; + } + + r = dns_query_process_cname_many(q); + if (r == -ELOOP) { + r = reply_method_errorf(q, BUS_ERROR_CNAME_LOOP, "CNAME loop detected, or CNAME resolving disabled on '%s'", dns_query_string(q)); + goto finish; + } + if (r < 0) + goto finish; + if (r == DNS_QUERY_CNAME) { + /* This was a cname, and the query was restarted. 
*/ + TAKE_PTR(q); + return; + } + + r = sd_bus_message_new_method_return(q->bus_request, &reply); + if (r < 0) + goto finish; + + r = sd_bus_message_open_container(reply, 'a', "(iqqay)"); + if (r < 0) + goto finish; + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, q->answer) { + r = dns_question_matches_rr(question, rr, NULL); + if (r < 0) + goto finish; + if (r == 0) + continue; + + r = bus_message_append_rr(reply, rr, ifindex); + if (r < 0) + goto finish; + + added++; + } + + if (added <= 0) { + r = reply_method_errorf(q, BUS_ERROR_NO_SUCH_RR, "Name '%s' does not have any RR of the requested type", dns_query_string(q)); + goto finish; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto finish; + + r = sd_bus_message_append(reply, "t", dns_query_reply_flags_make(q)); + if (r < 0) + goto finish; + + q->bus_request = sd_bus_message_unref(q->bus_request); + r = sd_bus_send(q->manager->bus, reply, NULL); + +finish: + if (r < 0) { + log_error_errno(r, "Failed to send record reply: %m"); + (void) reply_method_errnof(q, r, NULL); + } +} + /* D-Bus method handler: reads (ifindex, name, class, type, flags) with signature isqqt, rejects invalid names and RR types that may not be queried, are zone transfers or are obsolete, then starts a DnsQuery for the single requested key. Returns 1 once the query has been started, otherwise the result of the failing step. */ +static int bus_method_resolve_record(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + Manager *m = ASSERT_PTR(userdata); + uint16_t class, type; + const char *name; + int r, ifindex; + uint64_t flags; + + assert(message); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(message, "isqqt", &ifindex, &name, &class, &type, &flags); + if (r < 0) + return r; + + if (ifindex < 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface index"); + + r = dns_name_is_valid(name); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid name '%s'", name); + + if 
(!dns_type_is_valid_query(type)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Specified resource record type %" PRIu16 " may not be used in a query.", type); + if (dns_type_is_zone_transer(type)) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Zone transfers not permitted via this programming interface."); + if (dns_type_is_obsolete(type)) + return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Specified DNS resource record type %" PRIu16 " is obsolete.", type); + + r = validate_and_mangle_flags(name, &flags, 0, error); + if (r < 0) + return r; + /* Build a question that holds exactly the one requested class/type/name key. */ + question = dns_question_new(1); + if (!question) + return -ENOMEM; + + key = dns_resource_key_new(class, type, name); + if (!key) + return -ENOMEM; + + r = dns_question_add(question, key, 0); + if (r < 0) + return r; + + bus_client_log(message, "resource record resolution"); + + /* Setting SD_RESOLVED_CLAMP_TTL: let's request that the TTL is fixed up for locally cached entries, + * after all we return it in the wire format blob. 
*/ + r = dns_query_new(m, &q, question, question, NULL, ifindex, flags|SD_RESOLVED_NO_SEARCH|SD_RESOLVED_CLAMP_TTL); + if (r < 0) + return r; + + q->bus_request = sd_bus_message_ref(message); + q->complete = bus_method_resolve_record_complete; + + r = dns_query_bus_track(q, message); + if (r < 0) + return r; + + r = dns_query_go(q); + if (r < 0) + return r; + + TAKE_PTR(q); + return 1; +} + /* Appends one struct of signature qqqsa(iiay)s for rr to reply: priority, weight, port, normalized SRV target, the addresses found by matching successful auxiliary queries, and the canonical hostname (the SRV target again if none was looked up). Returns 0 without appending anything if rr is not an SRV record or if, with SD_RESOLVED_NO_ADDRESS unset, no successful auxiliary query matched it; returns 1 after appending; otherwise the result of the failing call. */ +static int append_srv(DnsQuery *q, sd_bus_message *reply, DnsResourceRecord *rr) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *canonical = NULL; + _cleanup_free_ char *normalized = NULL; + int r; + + assert(q); + assert(reply); + assert(rr); + assert(rr->key); + + if (rr->key->type != DNS_TYPE_SRV) + return 0; + + if ((q->flags & SD_RESOLVED_NO_ADDRESS) == 0) { + /* First, let's see if we could find an appropriate A or AAAA + * record for the SRV record */ + LIST_FOREACH(auxiliary_queries, aux, q->auxiliary_queries) { + DnsResourceRecord *zz; + DnsQuestion *question; + + if (aux->state != DNS_TRANSACTION_SUCCESS) + continue; + if (aux->auxiliary_result != 0) + continue; + + question = dns_query_question_for_protocol(aux, aux->answer_protocol); + + r = dns_name_equal(dns_question_first_name(question), rr->srv.name); + if (r < 0) + return r; + if (r == 0) + continue; + + DNS_ANSWER_FOREACH(zz, aux->answer) { + + r = dns_question_matches_rr(question, zz, NULL); + if (r < 0) + return r; + if (r == 0) + continue; + + canonical = dns_resource_record_ref(zz); + break; + } + + if (canonical) + break; + } + + /* Is there a successful A/AAAA lookup for this SRV RR? 
If not, don't add it */ + if (!canonical) + return 0; + } + + r = sd_bus_message_open_container(reply, 'r', "qqqsa(iiay)s"); + if (r < 0) + return r; + + r = dns_name_normalize(rr->srv.name, 0, &normalized); + if (r < 0) + return r; + + r = sd_bus_message_append( + reply, + "qqqs", + rr->srv.priority, rr->srv.weight, rr->srv.port, normalized); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "(iiay)"); + if (r < 0) + return r; + /* Second pass over the auxiliary queries: this time every matching record is appended, together with its ifindex. */ + if ((q->flags & SD_RESOLVED_NO_ADDRESS) == 0) { + LIST_FOREACH(auxiliary_queries, aux, q->auxiliary_queries) { + DnsResourceRecord *zz; + DnsQuestion *question; + int ifindex; + + if (aux->state != DNS_TRANSACTION_SUCCESS) + continue; + if (aux->auxiliary_result != 0) + continue; + + question = dns_query_question_for_protocol(aux, aux->answer_protocol); + + r = dns_name_equal(dns_question_first_name(question), rr->srv.name); + if (r < 0) + return r; + if (r == 0) + continue; + + DNS_ANSWER_FOREACH_IFINDEX(zz, ifindex, aux->answer) { + + r = dns_question_matches_rr(question, zz, NULL); + if (r < 0) + return r; + if (r == 0) + continue; + + r = append_address(reply, zz, ifindex); + if (r < 0) + return r; + } + } + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + if (canonical) { + normalized = mfree(normalized); + + r = dns_name_normalize(dns_resource_key_name(canonical->key), 0, &normalized); + if (r < 0) + return r; + } + + /* Note that above we appended the hostname as encoded in the + * SRV, and here the canonical hostname this maps to. 
*/ + r = sd_bus_message_append(reply, "s", normalized); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return 1; +} + /* Appends every non-empty item of a TXT record to reply as a byte array. Returns 0 if rr is not a TXT record, 1 otherwise, or the error of the failing append. */ +static int append_txt(sd_bus_message *reply, DnsResourceRecord *rr) { + int r; + + assert(reply); + assert(rr); + assert(rr->key); + + if (rr->key->type != DNS_TYPE_TXT) + return 0; + + LIST_FOREACH(items, i, rr->txt.items) { + + if (i->length <= 0) + continue; + + r = sd_bus_message_append_array(reply, 'y', i->data, i->length); + if (r < 0) + return r; + } + + return 1; +} + /* Sends the final service reply for query: the SRV entries, the TXT items, and the name/type/domain split of the key name of the first SRV record that was appended, plus the reply flags. Returns early via TAKE_PTR(q), i.e. without running the cleanup handler on the query, while q->block_all_complete is positive or while an auxiliary query is still pending. If no auxiliary query succeeded, the last failure seen is reported instead. */ +static void resolve_service_all_complete(DnsQuery *query) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *canonical = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *name = NULL, *type = NULL, *domain = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = query; + DnsQuestion *question; + DnsResourceRecord *rr; + unsigned added = 0; + int r; + + assert(q); + + if (q->block_all_complete > 0) { + TAKE_PTR(q); + return; + } + + if ((q->flags & SD_RESOLVED_NO_ADDRESS) == 0) { + DnsQuery *bad = NULL; + bool have_success = false; + + LIST_FOREACH(auxiliary_queries, aux, q->auxiliary_queries) { + + switch (aux->state) { + + case DNS_TRANSACTION_PENDING: + /* If an auxiliary query is still pending, let's wait */ + TAKE_PTR(q); + return; + + case DNS_TRANSACTION_SUCCESS: + if (aux->auxiliary_result == 0) + have_success = true; + else + bad = aux; + break; + + default: + bad = aux; + break; + } + } + + if (!have_success) { + /* We can only return one error, hence pick the last error we encountered */ + + assert(bad); + + if (bad->state == DNS_TRANSACTION_SUCCESS) { + assert(bad->auxiliary_result != 0); + + if (bad->auxiliary_result == -ELOOP) { + r = reply_method_errorf(q, BUS_ERROR_CNAME_LOOP, "CNAME loop detected, or CNAME resolving disabled on '%s'", dns_query_string(bad)); + goto finish; + } + + assert(bad->auxiliary_result < 0); + r = bad->auxiliary_result; + goto finish; + 
} + + r = reply_query_state(bad); + goto finish; + } + } + + r = sd_bus_message_new_method_return(q->bus_request, &reply); + if (r < 0) + goto finish; + + r = sd_bus_message_open_container(reply, 'a', "(qqqsa(iiay)s)"); + if (r < 0) + goto finish; + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH(rr, q->answer) { + r = dns_question_matches_rr(question, rr, NULL); + if (r < 0) + goto finish; + if (r == 0) + continue; + + r = append_srv(q, reply, rr); + if (r < 0) + goto finish; + if (r == 0) /* not an SRV record, or append_srv() found no successful address lookup for it */ + continue; + + if (!canonical) + canonical = dns_resource_record_ref(rr); + + added++; + } + + if (added <= 0) { + r = reply_method_errorf(q, BUS_ERROR_NO_SUCH_RR, "'%s' does not have any RR of the requested type", dns_query_string(q)); + goto finish; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto finish; + + r = sd_bus_message_open_container(reply, 'a', "ay"); + if (r < 0) + goto finish; + + DNS_ANSWER_FOREACH(rr, q->answer) { + r = dns_question_matches_rr(question, rr, NULL); + if (r < 0) + goto finish; + if (r == 0) + continue; + + r = append_txt(reply, rr); + if (r < 0) + goto finish; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + goto finish; + + assert(canonical); + r = dns_service_split(dns_resource_key_name(canonical->key), &name, &type, &domain); + if (r < 0) + goto finish; + + r = sd_bus_message_append( + reply, + "ssst", + name, type, domain, + dns_query_reply_flags_make(q)); + if (r < 0) + goto finish; + + q->bus_request = sd_bus_message_unref(q->bus_request); + r = sd_bus_send(q->manager->bus, reply, NULL); + +finish: + if (r < 0) { + log_error_errno(r, "Failed to send service reply: %m"); + (void) reply_method_errnof(q, r, NULL); + } +} + /* Completion callback of the auxiliary address lookups started by resolve_service_hostname(). If the lookup did not succeed, resolve_service_all_complete() is invoked on the primary query right away. Otherwise CNAMEs are processed and, unless the lookup was restarted for a CNAME, the result is stored in q->auxiliary_result before resolve_service_all_complete() is invoked. */ +static void resolve_service_hostname_complete(DnsQuery *q) { + int r; + + assert(q); + assert(q->auxiliary_for); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + resolve_service_all_complete(q->auxiliary_for); + return; 
+ } + + r = dns_query_process_cname_many(q); + if (r == DNS_QUERY_CNAME) /* This was a cname, and the query was restarted. */ + return; + + /* This auxiliary lookup is finished or failed, let's see if all are finished now. */ + q->auxiliary_result = r < 0 ? r : 0; + resolve_service_all_complete(q->auxiliary_for); +} + /* Starts an auxiliary address lookup for the target of the SRV record rr and attaches it to q. Returns 1 if the lookup was started, 0 if dns_query_make_auxiliary() returned -EAGAIN (the lookup is then silently skipped), otherwise the result of the failing call. */ +static int resolve_service_hostname(DnsQuery *q, DnsResourceRecord *rr, int ifindex) { + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + _cleanup_(dns_query_freep) DnsQuery *aux = NULL; + int r; + + assert(q); + assert(rr); + assert(rr->key); + assert(rr->key->type == DNS_TYPE_SRV); + + /* OK, we found an SRV record for the service. Let's resolve + * the hostname included in it */ + + r = dns_question_new_address(&question, q->request_family, rr->srv.name, false); + if (r < 0) + return r; + + r = dns_query_new(q->manager, &aux, question, question, NULL, ifindex, q->flags|SD_RESOLVED_NO_SEARCH); + if (r < 0) + return r; + + aux->request_family = q->request_family; + aux->complete = resolve_service_hostname_complete; + + r = dns_query_make_auxiliary(aux, q); + if (r == -EAGAIN) + /* Too many auxiliary lookups? If so, don't complain, + * let's just not add this one, we already have more + * than enough */ + return 0; + if (r < 0) + return r; + + /* Note that auxiliary queries do not track the original bus + * client, only the primary request does that. 
*/ + + r = dns_query_go(aux); + if (r < 0) + return r; + + TAKE_PTR(aux); + return 1; +} + /* Completion callback installed by bus_method_resolve_service(). Walks the SRV records of the answer, starts one auxiliary address lookup per non-root target unless SD_RESOLVED_NO_ADDRESS is set, and finally hands the query to resolve_service_all_complete(). q->block_all_complete is raised around each resolve_service_hostname() call; resolve_service_all_complete() returns early while that counter is positive. */ +static void bus_method_resolve_service_complete(DnsQuery *query) { + _cleanup_(dns_query_freep) DnsQuery *q = query; + bool has_root_domain = false; + DnsResourceRecord *rr; + DnsQuestion *question; + unsigned found = 0; + int ifindex, r; + + assert(q); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + r = reply_query_state(q); + goto finish; + } + + r = dns_query_process_cname_many(q); + if (r == -ELOOP) { + r = reply_method_errorf(q, BUS_ERROR_CNAME_LOOP, "CNAME loop detected, or CNAME resolving disabled on '%s'", dns_query_string(q)); + goto finish; + } + if (r < 0) + goto finish; + if (r == DNS_QUERY_CNAME) { + /* This was a cname, and the query was restarted. */ + TAKE_PTR(q); + return; + } + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, q->answer) { + r = dns_question_matches_rr(question, rr, NULL); + if (r < 0) + goto finish; + if (r == 0) + continue; + + if (rr->key->type != DNS_TYPE_SRV) + continue; + + if (dns_name_is_root(rr->srv.name)) { + has_root_domain = true; + continue; + } + + if ((q->flags & SD_RESOLVED_NO_ADDRESS) == 0) { + q->block_all_complete++; + r = resolve_service_hostname(q, rr, ifindex); + q->block_all_complete--; + + if (r < 0) + goto finish; + } + + found++; + } + + if (has_root_domain && found <= 0) { + /* If there's exactly one SRV RR and it uses the root domain as hostname, then the service is + * explicitly not offered on the domain. Report this as a recognizable error. See RFC 2782, + * Section "Usage Rules". */ + r = reply_method_errorf(q, BUS_ERROR_NO_SUCH_SERVICE, "'%s' does not provide the requested service", dns_query_string(q)); + goto finish; + } + + if (found <= 0) { + r = reply_method_errorf(q, BUS_ERROR_NO_SUCH_RR, "'%s' does not have any RR of the requested type", dns_query_string(q)); + goto finish; + } + + /* Maybe we are already finished? check now... 
*/ + resolve_service_all_complete(TAKE_PTR(q)); + return; + +finish: + if (r < 0) { + log_error_errno(r, "Failed to send service reply: %m"); + (void) reply_method_errnof(q, r, NULL); + } +} + /* D-Bus method handler: reads (ifindex, name, type, domain, family, flags) with signature isssit and validates them; an empty name or type is treated as unset, and a name without a type is rejected. Then starts a DnsQuery over the UTF-8 and IDNA service questions with SD_RESOLVED_NO_SEARCH added. Returns 1 once the query has been started, otherwise the result of the failing step. */ +static int bus_method_resolve_service(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(dns_question_unrefp) DnsQuestion *question_idna = NULL, *question_utf8 = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + const char *name, *type, *domain; + Manager *m = ASSERT_PTR(userdata); + int family, ifindex; + uint64_t flags; + int r; + + assert(message); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(message, "isssit", &ifindex, &name, &type, &domain, &family, &flags); + if (r < 0) + return r; + + if (ifindex < 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface index"); + + if (!IN_SET(family, AF_INET, AF_INET6, AF_UNSPEC)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown address family %i", family); + + if (isempty(name)) + name = NULL; + else if (!dns_service_name_is_valid(name)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid service name '%s'", name); + + if (isempty(type)) + type = NULL; + else if (!dns_srv_type_is_valid(type)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid SRV service type '%s'", type); + + r = dns_name_is_valid(domain); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid domain '%s'", domain); + + if (name && !type) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Service name cannot be specified without service type."); + + r = validate_and_mangle_flags(name, &flags, SD_RESOLVED_NO_TXT|SD_RESOLVED_NO_ADDRESS, error); + if (r < 0) + return r; + + r = dns_question_new_service(&question_utf8, name, type, domain, !(flags & SD_RESOLVED_NO_TXT), false); + if (r < 0) + return r; + + r = dns_question_new_service(&question_idna, name, type, 
domain, !(flags & SD_RESOLVED_NO_TXT), true); + if (r < 0) + return r; + + bus_client_log(message, "service resolution"); + + r = dns_query_new(m, &q, question_utf8, question_idna, NULL, ifindex, flags|SD_RESOLVED_NO_SEARCH); + if (r < 0) + return r; + + q->bus_request = sd_bus_message_ref(message); + q->request_family = family; + q->complete = bus_method_resolve_service_complete; + + r = dns_query_bus_track(q, message); + if (r < 0) + return r; + + r = dns_query_go(q); + if (r < 0) + return r; + + TAKE_PTR(q); + return 1; +} + /* Appends one DNS server struct to reply. The two flags select the layout: (iiay) or (iiayqs) when with_ifindex is set, (iay) or (iayqs) otherwise; the longer forms additionally carry port and server name. A NULL s is encoded with AF_UNSPEC and otherwise zero/NULL arguments. */ +int bus_dns_server_append( + sd_bus_message *reply, + DnsServer *s, + bool with_ifindex, /* include "ifindex" field */ + bool extended) { /* also include port number and server name */ + int r; + + assert(reply); + + if (!s) { + if (with_ifindex) { + if (extended) + return sd_bus_message_append(reply, "(iiayqs)", 0, AF_UNSPEC, 0, 0, NULL); + else + return sd_bus_message_append(reply, "(iiay)", 0, AF_UNSPEC, 0); + } else { + if (extended) + return sd_bus_message_append(reply, "(iayqs)", AF_UNSPEC, 0, 0, NULL); + else + return sd_bus_message_append(reply, "(iay)", AF_UNSPEC, 0); + } + } + + r = sd_bus_message_open_container( + reply, + 'r', + with_ifindex ? (extended ? "iiayqs" : "iiay") : + (extended ? 
"iayqs" : "iay")); + if (r < 0) + return r; + + if (with_ifindex) { + r = sd_bus_message_append(reply, "i", dns_server_ifindex(s)); + if (r < 0) + return r; + } + + r = sd_bus_message_append(reply, "i", s->family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &s->address, FAMILY_ADDRESS_SIZE(s->family)); + if (r < 0) + return r; + + if (extended) { + r = sd_bus_message_append(reply, "q", s->port); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "s", s->server_name); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + /* Property getter helper: emits the servers in m->dns_servers followed by the dns_servers of every link in m->links, as an array of (iiay) or, if extended is set, (iiayqs). */ +static int bus_property_get_dns_servers_internal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error, + bool extended) { + + Manager *m = ASSERT_PTR(userdata); + Link *l; + int r; + + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', extended ? "(iiayqs)" : "(iiay)"); + if (r < 0) + return r; + + LIST_FOREACH(servers, s, m->dns_servers) { + r = bus_dns_server_append(reply, s, true, extended); + if (r < 0) + return r; + } + + HASHMAP_FOREACH(l, m->links) + LIST_FOREACH(servers, s, l->dns_servers) { + r = bus_dns_server_append(reply, s, true, extended); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + /* Property getter: bus_property_get_dns_servers_internal() with extended = false. */ +static int bus_property_get_dns_servers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return bus_property_get_dns_servers_internal(bus, path, interface, property, reply, userdata, error, false); +} + /* Property getter: bus_property_get_dns_servers_internal() with extended = true. */ +static int bus_property_get_dns_servers_ex( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return bus_property_get_dns_servers_internal(bus, path, interface, property, reply, userdata, error, true); +} + /* Property getter helper: emits the server list that userdata points to as an array of (iiay) or, if extended is set, (iiayqs). */ +static int 
bus_property_get_fallback_dns_servers_internal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error, + bool extended) { + + DnsServer **f = ASSERT_PTR(userdata); + int r; + + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', extended ? "(iiayqs)" : "(iiay)"); + if (r < 0) + return r; + + LIST_FOREACH(servers, s, *f) { + r = bus_dns_server_append(reply, s, true, extended); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + /* Property getter: bus_property_get_fallback_dns_servers_internal() with extended = false. */ +static int bus_property_get_fallback_dns_servers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return bus_property_get_fallback_dns_servers_internal(bus, path, interface, property, reply, userdata, error, false); +} + /* Property getter: bus_property_get_fallback_dns_servers_internal() with extended = true. */ +static int bus_property_get_fallback_dns_servers_ex( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return bus_property_get_fallback_dns_servers_internal(bus, path, interface, property, reply, userdata, error, true); +} + /* Property getter helper: appends the single DnsServer that userdata points to via bus_dns_server_append(), always with the ifindex field; a NULL server is handled by bus_dns_server_append() itself. */ +static int bus_property_get_current_dns_server_internal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error, + bool extended) { + + DnsServer *s; + + assert(reply); + assert(userdata); + + s = *(DnsServer **) userdata; + + return bus_dns_server_append(reply, s, true, extended); +} + /* Property getter: bus_property_get_current_dns_server_internal() with extended = false. */ +static int bus_property_get_current_dns_server( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return bus_property_get_current_dns_server_internal(bus, path, interface, property, reply, userdata, error, false); +} + /* Property getter: bus_property_get_current_dns_server_internal() with extended = true. */ +static int bus_property_get_current_dns_server_ex( + sd_bus 
*bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return bus_property_get_current_dns_server_internal(bus, path, interface, property, reply, userdata, error, true); +} + /* Property getter: emits the search domains as an array of (isb) = (ifindex, name, route_only). Entries from m->search_domains use ifindex 0, entries of a link use the ifindex of that link. */ +static int bus_property_get_domains( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + Link *l; + int r; + + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(isb)"); + if (r < 0) + return r; + + LIST_FOREACH(domains, d, m->search_domains) { + r = sd_bus_message_append(reply, "(isb)", 0, d->name, d->route_only); + if (r < 0) + return r; + } + + HASHMAP_FOREACH(l, m->links) { + LIST_FOREACH(domains, d, l->search_domains) { + r = sd_bus_message_append(reply, "(isb)", l->ifindex, d->name, d->route_only); + if (r < 0) + return r; + } + } + + return sd_bus_message_close_container(reply); +} + /* Property getter: emits (tt) = (number of entries in m->dns_transactions, m->n_transactions_total). */ +static int bus_property_get_transaction_statistics( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(reply); + + return sd_bus_message_append(reply, "(tt)", + (uint64_t) hashmap_size(m->dns_transactions), + (uint64_t) m->n_transactions_total); +} + /* Property getter: emits (ttt) = (cache size, hits, misses), each summed over all scopes in m->dns_scopes. */ +static int bus_property_get_cache_statistics( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t size = 0, hit = 0, miss = 0; + Manager *m = ASSERT_PTR(userdata); + + assert(reply); + + LIST_FOREACH(scopes, s, m->dns_scopes) { + size += dns_cache_size(&s->cache); + hit += s->cache.n_hit; + miss += s->cache.n_miss; + } + + return sd_bus_message_append(reply, "(ttt)", size, hit, miss); +} + /* Property getter: emits (tttt) = the m->n_dnssec_verdict counters for DNSSEC_SECURE, DNSSEC_INSECURE, DNSSEC_BOGUS and DNSSEC_INDETERMINATE, in that order. */ +static int bus_property_get_dnssec_statistics( + sd_bus *bus, + 
const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + + assert(reply); + + return sd_bus_message_append(reply, "(tttt)", + (uint64_t) m->n_dnssec_verdict[DNSSEC_SECURE], + (uint64_t) m->n_dnssec_verdict[DNSSEC_INSECURE], + (uint64_t) m->n_dnssec_verdict[DNSSEC_BOGUS], + (uint64_t) m->n_dnssec_verdict[DNSSEC_INDETERMINATE]); +} + +static BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_dns_stub_listener_mode, dns_stub_listener_mode, DnsStubListenerMode); +static BUS_DEFINE_PROPERTY_GET(bus_property_get_dnssec_supported, "b", Manager, manager_dnssec_supported); +static BUS_DEFINE_PROPERTY_GET2(bus_property_get_dnssec_mode, "s", Manager, manager_get_dnssec_mode, dnssec_mode_to_string); +static BUS_DEFINE_PROPERTY_GET2(bus_property_get_dns_over_tls_mode, "s", Manager, manager_get_dns_over_tls_mode, dns_over_tls_mode_to_string); + +static int bus_property_get_resolv_conf_mode( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + int r; + + assert(reply); + + r = resolv_conf_mode(); + if (r < 0) { + log_warning_errno(r, "Failed to test /etc/resolv.conf mode, ignoring: %m"); + return sd_bus_message_append(reply, "s", NULL); + } + + return sd_bus_message_append(reply, "s", resolv_conf_mode_to_string(r)); +} + +static int bus_method_reset_statistics(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + + assert(message); + + bus_client_log(message, "statistics reset"); + + dns_manager_reset_statistics(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int get_any_link(Manager *m, int ifindex, Link **ret, sd_bus_error *error) { + Link *l; + + assert(m); + assert(ret); + + l = hashmap_get(m->links, INT_TO_PTR(ifindex)); + if (!l) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_LINK, 
"Link %i not known", ifindex); + + *ret = l; + return 0; +} + +static int call_link_method(Manager *m, sd_bus_message *message, sd_bus_message_handler_t handler, sd_bus_error *error) { + int ifindex, r; + Link *l; + + assert(m); + assert(message); + assert(handler); + + r = bus_message_read_ifindex(message, error, &ifindex); + if (r < 0) + return r; + + r = get_any_link(m, ifindex, &l, error); + if (r < 0) + return r; + + return handler(message, l, error); +} + +static int bus_method_set_link_dns_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dns_servers, error); +} + +static int bus_method_set_link_dns_servers_ex(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dns_servers_ex, error); +} + +static int bus_method_set_link_domains(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_domains, error); +} + +static int bus_method_set_link_default_route(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_default_route, error); +} + +static int bus_method_set_link_llmnr(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_llmnr, error); +} + +static int bus_method_set_link_mdns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_mdns, error); +} + +static int bus_method_set_link_dns_over_tls(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dns_over_tls, error); +} + +static int bus_method_set_link_dnssec(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, 
bus_link_method_set_dnssec, error); +} + +static int bus_method_set_link_dnssec_negative_trust_anchors(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_set_dnssec_negative_trust_anchors, error); +} + +static int bus_method_revert_link(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return call_link_method(userdata, message, bus_link_method_revert, error); +} + +static int bus_method_get_link(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *p = NULL; + Manager *m = ASSERT_PTR(userdata); + int r, ifindex; + Link *l; + + assert(message); + + r = bus_message_read_ifindex(message, error, &ifindex); + if (r < 0) + return r; + + r = get_any_link(m, ifindex, &l, error); + if (r < 0) + return r; + + p = link_bus_path(l); + if (!p) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", p); +} + +static int bus_method_flush_caches(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + + assert(message); + + bus_client_log(message, "cache flush"); + + manager_flush_caches(m, LOG_INFO); + + return sd_bus_reply_method_return(message, NULL); +} + +static int bus_method_reset_server_features(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + + assert(message); + + bus_client_log(message, "server feature reset"); + + manager_reset_server_features(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int dnssd_service_on_bus_track(sd_bus_track *t, void *userdata) { + DnssdService *s = ASSERT_PTR(userdata); + + assert(t); + + log_debug("Client of active request vanished, destroying DNS-SD service."); + dnssd_service_free(s); + + return 0; +} + +static int bus_method_register_service(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + 
_cleanup_(dnssd_service_freep) DnssdService *service = NULL; + _cleanup_(sd_bus_track_unrefp) sd_bus_track *bus_track = NULL; + const char *name, *name_template, *type; + _cleanup_free_ char *path = NULL; + DnssdService *s = NULL; + Manager *m = ASSERT_PTR(userdata); + uid_t euid; + int r; + + assert(message); + + if (m->mdns_support != RESOLVE_SUPPORT_YES) + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for MulticastDNS is disabled"); + + service = new0(DnssdService, 1); + if (!service) + return log_oom(); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_euid(creds, &euid); + if (r < 0) + return r; + service->originator = euid; + + r = sd_bus_message_read(message, "sssqqq", &name, &name_template, &type, + &service->port, &service->priority, + &service->weight); + if (r < 0) + return r; + + s = hashmap_get(m->dnssd_services, name); + if (s) + return sd_bus_error_setf(error, BUS_ERROR_DNSSD_SERVICE_EXISTS, "DNS-SD service '%s' exists already", name); + + if (!dnssd_srv_type_is_valid(type)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "DNS-SD service type '%s' is invalid", type); + + service->name = strdup(name); + if (!service->name) + return log_oom(); + + service->name_template = strdup(name_template); + if (!service->name_template) + return log_oom(); + + service->type = strdup(type); + if (!service->type) + return log_oom(); + + r = dnssd_render_instance_name(m, service, NULL); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, SD_BUS_TYPE_ARRAY, "a{say}"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(message, SD_BUS_TYPE_ARRAY, "{say}")) > 0) { + _cleanup_(dnssd_txtdata_freep) DnssdTxtData *txt_data = NULL; + DnsTxtItem *last = NULL; + + txt_data = new0(DnssdTxtData, 1); + if (!txt_data) + return log_oom(); + + while ((r = sd_bus_message_enter_container(message, SD_BUS_TYPE_DICT_ENTRY, "say")) > 0) { + 
const char *key; + const void *value; + size_t size; + DnsTxtItem *i; + + r = sd_bus_message_read(message, "s", &key); + if (r < 0) + return r; + + if (isempty(key)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Keys in DNS-SD TXT RRs can't be empty"); + + if (!ascii_is_valid(key)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "TXT key '%s' contains non-ASCII symbols", key); + + r = sd_bus_message_read_array(message, 'y', &value, &size); + if (r < 0) + return r; + + r = dnssd_txt_item_new_from_data(key, value, size, &i); + if (r < 0) + return r; + + LIST_INSERT_AFTER(items, txt_data->txts, last, i); + last = i; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (txt_data->txts) { + LIST_PREPEND(items, service->txt_data_items, txt_data); + txt_data = NULL; + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!service->txt_data_items) { + _cleanup_(dnssd_txtdata_freep) DnssdTxtData *txt_data = NULL; + + txt_data = new0(DnssdTxtData, 1); + if (!txt_data) + return log_oom(); + + r = dns_txt_item_new_empty(&txt_data->txts); + if (r < 0) + return r; + + LIST_PREPEND(items, service->txt_data_items, txt_data); + txt_data = NULL; + } + + r = sd_bus_path_encode("/org/freedesktop/resolve1/dnssd", service->name, &path); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_SYS_ADMIN, + "org.freedesktop.resolve1.register-service", + NULL, false, UID_INVALID, + &m->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + r = hashmap_ensure_put(&m->dnssd_services, &string_hash_ops, service->name, service); + if (r < 0) + return r; + + r = sd_bus_track_new(sd_bus_message_get_bus(message), &bus_track, dnssd_service_on_bus_track, service); + if (r < 0) + return r; + + r = 
sd_bus_track_add_sender(bus_track, message); + if (r < 0) + return r; + + service->manager = m; + + service = NULL; + + manager_refresh_rrs(m); + + return sd_bus_reply_method_return(message, "o", path); +} + +static int call_dnssd_method(Manager *m, sd_bus_message *message, sd_bus_message_handler_t handler, sd_bus_error *error) { + _cleanup_free_ char *name = NULL; + DnssdService *s = NULL; + const char *path; + int r; + + assert(m); + assert(message); + assert(handler); + + r = sd_bus_message_read(message, "o", &path); + if (r < 0) + return r; + + r = sd_bus_path_decode(path, "/org/freedesktop/resolve1/dnssd", &name); + if (r == 0) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DNSSD_SERVICE, "DNS-SD service with object path '%s' does not exist", path); + if (r < 0) + return r; + + s = hashmap_get(m->dnssd_services, name); + if (!s) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DNSSD_SERVICE, "DNS-SD service '%s' not known", name); + + return handler(message, s, error); +} + +static int bus_method_unregister_service(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + + assert(message); + + return call_dnssd_method(m, message, bus_dnssd_method_unregister, error); +} + +static const sd_bus_vtable resolve_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_PROPERTY("LLMNRHostname", "s", NULL, offsetof(Manager, llmnr_hostname), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("LLMNR", "s", bus_property_get_resolve_support, offsetof(Manager, llmnr_support), 0), + SD_BUS_PROPERTY("MulticastDNS", "s", bus_property_get_resolve_support, offsetof(Manager, mdns_support), 0), + SD_BUS_PROPERTY("DNSOverTLS", "s", bus_property_get_dns_over_tls_mode, 0, 0), + SD_BUS_PROPERTY("DNS", "a(iiay)", bus_property_get_dns_servers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("DNSEx", "a(iiayqs)", bus_property_get_dns_servers_ex, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FallbackDNS", 
"a(iiay)", bus_property_get_fallback_dns_servers, offsetof(Manager, fallback_dns_servers), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("FallbackDNSEx", "a(iiayqs)", bus_property_get_fallback_dns_servers_ex, offsetof(Manager, fallback_dns_servers), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CurrentDNSServer", "(iiay)", bus_property_get_current_dns_server, offsetof(Manager, current_dns_server), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CurrentDNSServerEx", "(iiayqs)", bus_property_get_current_dns_server_ex, offsetof(Manager, current_dns_server), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Domains", "a(isb)", bus_property_get_domains, 0, 0), + SD_BUS_PROPERTY("TransactionStatistics", "(tt)", bus_property_get_transaction_statistics, 0, 0), + SD_BUS_PROPERTY("CacheStatistics", "(ttt)", bus_property_get_cache_statistics, 0, 0), + SD_BUS_PROPERTY("DNSSEC", "s", bus_property_get_dnssec_mode, 0, 0), + SD_BUS_PROPERTY("DNSSECStatistics", "(tttt)", bus_property_get_dnssec_statistics, 0, 0), + SD_BUS_PROPERTY("DNSSECSupported", "b", bus_property_get_dnssec_supported, 0, 0), + SD_BUS_PROPERTY("DNSSECNegativeTrustAnchors", "as", bus_property_get_string_set, offsetof(Manager, trust_anchor.negative_by_name), 0), + SD_BUS_PROPERTY("DNSStubListener", "s", bus_property_get_dns_stub_listener_mode, offsetof(Manager, dns_stub_listener_mode), 0), + SD_BUS_PROPERTY("ResolvConfMode", "s", bus_property_get_resolv_conf_mode, 0, 0), + + SD_BUS_METHOD_WITH_ARGS("ResolveHostname", + SD_BUS_ARGS("i", ifindex, "s", name, "i", family, "t", flags), + SD_BUS_RESULT("a(iiay)", addresses, "s", canonical, "t", flags), + bus_method_resolve_hostname, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ResolveAddress", + SD_BUS_ARGS("i", ifindex, "i", family, "ay", address, "t", flags), + SD_BUS_RESULT("a(is)", names, "t", flags), + bus_method_resolve_address, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ResolveRecord", + SD_BUS_ARGS("i", ifindex, 
"s", name, "q", class, "q", type, "t", flags), + SD_BUS_RESULT("a(iqqay)", records, "t", flags), + bus_method_resolve_record, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ResolveService", + SD_BUS_ARGS("i", ifindex, + "s", name, + "s", type, + "s", domain, + "i", family, + "t", flags), + SD_BUS_RESULT("a(qqqsa(iiay)s)", srv_data, + "aay", txt_data, + "s", canonical_name, + "s", canonical_type, + "s", canonical_domain, + "t", flags), + bus_method_resolve_service, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("GetLink", + SD_BUS_ARGS("i", ifindex), + SD_BUS_RESULT("o", path), + bus_method_get_link, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNS", + SD_BUS_ARGS("i", ifindex, "a(iay)", addresses), + SD_BUS_NO_RESULT, + bus_method_set_link_dns_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSEx", + SD_BUS_ARGS("i", ifindex, "a(iayqs)", addresses), + SD_BUS_NO_RESULT, + bus_method_set_link_dns_servers_ex, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDomains", + SD_BUS_ARGS("i", ifindex, "a(sb)", domains), + SD_BUS_NO_RESULT, + bus_method_set_link_domains, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDefaultRoute", + SD_BUS_ARGS("i", ifindex, "b", enable), + SD_BUS_NO_RESULT, + bus_method_set_link_default_route, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkLLMNR", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_llmnr, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkMulticastDNS", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_mdns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSOverTLS", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + bus_method_set_link_dns_over_tls, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSSEC", + SD_BUS_ARGS("i", ifindex, "s", mode), + SD_BUS_NO_RESULT, + 
bus_method_set_link_dnssec, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLinkDNSSECNegativeTrustAnchors", + SD_BUS_ARGS("i", ifindex, "as", names), + SD_BUS_NO_RESULT, + bus_method_set_link_dnssec_negative_trust_anchors, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RevertLink", + SD_BUS_ARGS("i", ifindex), + SD_BUS_NO_RESULT, + bus_method_revert_link, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RegisterService", + SD_BUS_ARGS("s", name, + "s", name_template, + "s", type, + "q", service_port, + "q", service_priority, + "q", service_weight, + "aa{say}", txt_datas), + SD_BUS_RESULT("o", service_path), + bus_method_register_service, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("UnregisterService", + SD_BUS_ARGS("o", service_path), + SD_BUS_NO_RESULT, + bus_method_unregister_service, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ResetStatistics", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_method_reset_statistics, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("FlushCaches", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_method_flush_caches, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ResetServerFeatures", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_method_reset_server_features, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END, +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/resolve1", + "org.freedesktop.resolve1.Manager", + .vtables = BUS_VTABLES(resolve_vtable), + .children = BUS_IMPLEMENTATIONS(&link_object, + &dnssd_object), +}; + +static int match_prepare_for_sleep(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + Manager *m = ASSERT_PTR(userdata); + int b, r; + + assert(message); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + if (b) + return 0; + + log_debug("Coming back from suspend, verifying all RRs..."); + + manager_verify_all(m); + return 0; +} + +int 
manager_connect_bus(Manager *m) { + int r; + + assert(m); + + if (m->bus) + return 0; + + r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-resolve"); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.resolve1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + r = bus_match_signal_async( + m->bus, + NULL, + bus_login_mgr, + "PrepareForSleep", + match_prepare_for_sleep, + NULL, + m); + if (r < 0) + log_warning_errno(r, "Failed to request match for PrepareForSleep, ignoring: %m"); + + return 0; +} + +int _manager_send_changed(Manager *manager, const char *property, ...) { + assert(manager); + + if (sd_bus_is_ready(manager->bus) <= 0) + return 0; + + char **l = strv_from_stdarg_alloca(property); + + int r = sd_bus_emit_properties_changed_strv( + manager->bus, + "/org/freedesktop/resolve1", + "org.freedesktop.resolve1.Manager", + l); + if (r < 0) + log_notice_errno(r, "Failed to emit notification about changed property %s: %m", property); + return r; +} diff --git a/src/resolve/resolved-bus.h b/src/resolve/resolved-bus.h new file mode 100644 index 0000000..6c2bd26 --- /dev/null +++ b/src/resolve/resolved-bus.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-object.h" +#include "resolved-manager.h" + +extern const BusObjectImplementation manager_object; + +int manager_connect_bus(Manager *m); +int _manager_send_changed(Manager *manager, const char *property, ...) _sentinel_; +#define manager_send_changed(manager, ...) 
_manager_send_changed(manager, __VA_ARGS__, NULL) +int bus_dns_server_append(sd_bus_message *reply, DnsServer *s, bool with_ifindex, bool extended); +int bus_property_get_resolve_support(sd_bus *bus, const char *path, const char *interface, + const char *property, sd_bus_message *reply, + void *userdata, sd_bus_error *error); + +void bus_client_log(sd_bus_message *m, const char *what); diff --git a/src/resolve/resolved-conf.c b/src/resolve/resolved-conf.c new file mode 100644 index 0000000..2f08ed0 --- /dev/null +++ b/src/resolve/resolved-conf.c @@ -0,0 +1,603 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "conf-parser.h" +#include "constants.h" +#include "creds-util.h" +#include "dns-domain.h" +#include "extract-word.h" +#include "hexdecoct.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "resolved-conf.h" +#include "resolved-dns-search-domain.h" +#include "resolved-dns-stub.h" +#include "resolved-dnssd.h" +#include "resolved-manager.h" +#include "socket-netlink.h" +#include "specifier.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +DEFINE_CONFIG_PARSE_ENUM(config_parse_dns_stub_listener_mode, dns_stub_listener_mode, DnsStubListenerMode, "Failed to parse DNS stub listener mode setting"); + +static int manager_add_dns_server_by_string(Manager *m, DnsServerType type, const char *word) { + _cleanup_free_ char *server_name = NULL; + union in_addr_union address; + int family, r, ifindex = 0; + uint16_t port; + DnsServer *s; + + assert(m); + assert(word); + + r = in_addr_port_ifindex_name_from_string_auto(word, &family, &address, &port, &ifindex, &server_name); + if (r < 0) + return r; + + /* Silently filter out 0.0.0.0, 127.0.0.53, 127.0.0.54 (our own stub DNS listener) */ + if (!dns_server_address_valid(family, &address)) + return 0; + + /* By default, the port number is determined with the transaction feature level. 
+ * See dns_transaction_port() and dns_server_port(). */ + if (IN_SET(port, 53, 853)) + port = 0; + + /* Filter out duplicates */ + s = dns_server_find(manager_get_first_dns_server(m, type), family, &address, port, ifindex, server_name); + if (s) { + /* Drop the marker. This is used to find the servers that ceased to exist, see + * manager_mark_dns_servers() and manager_flush_marked_dns_servers(). */ + dns_server_move_back_and_unmark(s); + return 0; + } + + return dns_server_new(m, NULL, type, NULL, family, &address, port, ifindex, server_name); +} + +int manager_parse_dns_server_string_and_warn(Manager *m, DnsServerType type, const char *string) { + int r; + + assert(m); + assert(string); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&string, &word, NULL, 0); + if (r <= 0) + return r; + + r = manager_add_dns_server_by_string(m, type, word); + if (r < 0) + log_warning_errno(r, "Failed to add DNS server address '%s', ignoring: %m", word); + } +} + +static int manager_add_search_domain_by_string(Manager *m, const char *domain) { + DnsSearchDomain *d; + bool route_only; + int r; + + assert(m); + assert(domain); + + route_only = *domain == '~'; + if (route_only) + domain++; + + if (dns_name_is_root(domain) || streq(domain, "*")) { + route_only = true; + domain = "."; + } + + r = dns_search_domain_find(m->search_domains, domain, &d); + if (r < 0) + return r; + if (r > 0) + dns_search_domain_move_back_and_unmark(d); + else { + r = dns_search_domain_new(m, &d, DNS_SEARCH_DOMAIN_SYSTEM, NULL, domain); + if (r < 0) + return r; + } + + d->route_only = route_only; + return 0; +} + +int manager_parse_search_domains_and_warn(Manager *m, const char *string) { + int r; + + assert(m); + assert(string); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&string, &word, NULL, EXTRACT_UNQUOTE); + if (r <= 0) + return r; + + r = manager_add_search_domain_by_string(m, word); + if (r < 0) + log_warning_errno(r, "Failed to add 
search domain '%s', ignoring: %m", word); + } +} + +int config_parse_dns_servers( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + /* Empty assignment means clear the list */ + dns_server_unlink_all(manager_get_first_dns_server(m, ltype)); + else { + /* Otherwise, add to the list */ + r = manager_parse_dns_server_string_and_warn(m, ltype, rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse DNS server string '%s', ignoring.", rvalue); + return 0; + } + } + + /* If we have a manual setting, then we stop reading + * /etc/resolv.conf */ + if (ltype == DNS_SERVER_SYSTEM) + m->read_resolv_conf = false; + if (ltype == DNS_SERVER_FALLBACK) + m->need_builtin_fallbacks = false; + + return 0; +} + +int config_parse_search_domains( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + /* Empty assignment means clear the list */ + dns_search_domain_unlink_all(m->search_domains); + else { + /* Otherwise, add to the list */ + r = manager_parse_search_domains_and_warn(m, rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse search domains string '%s', ignoring.", rvalue); + return 0; + } + } + + /* If we have a manual setting, then we stop reading + * /etc/resolv.conf */ + m->read_resolv_conf = false; + + return 0; +} + +int config_parse_dnssd_service_name( + const char *unit, + const char *filename, + unsigned line, + const char *section, + 
unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + static const Specifier specifier_table[] = { + { 'a', specifier_architecture, NULL }, + { 'b', specifier_boot_id, NULL }, + { 'B', specifier_os_build_id, NULL }, + { 'H', specifier_hostname, NULL }, /* We will use specifier_dnssd_hostname(). */ + { 'm', specifier_machine_id, NULL }, + { 'o', specifier_os_id, NULL }, + { 'v', specifier_kernel_release, NULL }, + { 'w', specifier_os_version_id, NULL }, + { 'W', specifier_os_variant_id, NULL }, + {} + }; + DnssdService *s = ASSERT_PTR(userdata); + _cleanup_free_ char *name = NULL; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + s->name_template = mfree(s->name_template); + return 0; + } + + r = specifier_printf(rvalue, DNS_LABEL_MAX, specifier_table, NULL, NULL, &name); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid service instance name template '%s', ignoring assignment: %m", rvalue); + return 0; + } + + if (!dns_service_name_is_valid(name)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Service instance name template '%s' renders to invalid name '%s'. Ignoring assignment.", + rvalue, name); + return 0; + } + + return free_and_strdup_warn(&s->name_template, rvalue); +} + +int config_parse_dnssd_service_type( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + DnssdService *s = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + s->type = mfree(s->type); + return 0; + } + + if (!dnssd_srv_type_is_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Service type is invalid. 
Ignoring."); + return 0; + } + + r = free_and_strdup(&s->type, rvalue); + if (r < 0) + return log_oom(); + + return 0; +} + +int config_parse_dnssd_txt( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(dnssd_txtdata_freep) DnssdTxtData *txt_data = NULL; + DnssdService *s = ASSERT_PTR(userdata); + DnsTxtItem *last = NULL; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Flush out collected items */ + s->txt_data_items = dnssd_txtdata_free_all(s->txt_data_items); + return 0; + } + + txt_data = new0(DnssdTxtData, 1); + if (!txt_data) + return log_oom(); + + for (;;) { + _cleanup_free_ char *word = NULL, *key = NULL, *value = NULL; + _cleanup_free_ void *decoded = NULL; + size_t length = 0; + DnsTxtItem *i; + int r; + + r = extract_first_word(&rvalue, &word, NULL, + EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = split_pair(word, "=", &key, &value); + if (r == -ENOMEM) + return log_oom(); + if (r == -EINVAL) + key = TAKE_PTR(word); + + if (!ascii_is_valid(key)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid key, ignoring: %s", key); + continue; + } + + switch (ltype) { + + case DNS_TXT_ITEM_DATA: + if (value) { + r = unbase64mem(value, strlen(value), &decoded, &length); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid base64 encoding, ignoring: %s", value); + continue; + } + } + + r = dnssd_txt_item_new_from_data(key, decoded, length, &i); + if (r < 0) + return log_oom(); + break; + + case DNS_TXT_ITEM_TEXT: + r = dnssd_txt_item_new_from_string(key, value, &i); + if (r < 0) + return 
log_oom(); + break; + + default: + assert_not_reached(); + } + + LIST_INSERT_AFTER(items, txt_data->txts, last, i); + last = i; + } + + if (txt_data->txts) { + LIST_PREPEND(items, s->txt_data_items, txt_data); + TAKE_PTR(txt_data); + } + + return 0; +} + +int config_parse_dns_stub_listener_extra( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ DnsStubListenerExtra *stub = NULL; + Manager *m = userdata; + const char *p; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + m->dns_extra_stub_listeners = ordered_set_free(m->dns_extra_stub_listeners); + return 0; + } + + r = dns_stub_listener_extra_new(m, &stub); + if (r < 0) + return log_oom(); + + p = startswith(rvalue, "udp:"); + if (p) + stub->mode = DNS_STUB_LISTENER_UDP; + else { + p = startswith(rvalue, "tcp:"); + if (p) + stub->mode = DNS_STUB_LISTENER_TCP; + else { + stub->mode = DNS_STUB_LISTENER_YES; + p = rvalue; + } + } + + r = in_addr_port_ifindex_name_from_string_auto(p, &stub->family, &stub->address, &stub->port, NULL, NULL); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse address in %s=%s, ignoring assignment: %m", + lvalue, rvalue); + return 0; + } + + r = ordered_set_ensure_put(&m->dns_extra_stub_listeners, &dns_stub_listener_extra_hash_ops, stub); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to store %s=%s, ignoring assignment: %m", lvalue, rvalue); + return 0; + } + + TAKE_PTR(stub); + + return 0; +} + +static void read_credentials(Manager *m) { + _cleanup_free_ char *dns = NULL, *domains = NULL; + int r; + + assert(m); + + /* Hmm, if we aren't supposed to read /etc/resolv.conf because the DNS settings were already + * configured explicitly in our config file, we 
don't want to honour credentials either */ + if (!m->read_resolv_conf) + return; + + r = read_credential_strings_many("network.dns", &dns, + "network.search_domains", &domains); + if (r < 0) + log_warning_errno(r, "Failed to read credentials, ignoring: %m"); + + if (dns) { + r = manager_parse_dns_server_string_and_warn(m, DNS_SERVER_SYSTEM, dns); + if (r < 0) + log_warning_errno(r, "Failed to parse credential network.dns '%s', ignoring.", dns); + + m->read_resolv_conf = false; + } + + if (domains) { + r = manager_parse_search_domains_and_warn(m, domains); + if (r < 0) + log_warning_errno(r, "Failed to parse credential network.search_domains '%s', ignoring.", domains); + + m->read_resolv_conf = false; + } +} + +struct ProcCmdlineInfo { + Manager *manager; + + /* If there's a setting configured via /proc/cmdline we want to reset the configured lists, but only + * once, so that multiple nameserver= or domain= settings can be specified on the kernel command line + * and will be combined. These booleans will be set once we erase the list once. */ + bool dns_server_unlinked; + bool search_domain_unlinked; +}; + +static int proc_cmdline_callback(const char *key, const char *value, void *data) { + struct ProcCmdlineInfo *info = ASSERT_PTR(data); + int r; + + assert(key); + assert(info->manager); + + /* The kernel command line option names are chosen to be compatible with what various tools already + * interpret, for example dracut and SUSE Linux. 
*/ + + if (streq(key, "nameserver")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (!info->dns_server_unlinked) { + /* The kernel command line overrides any prior configuration */ + dns_server_unlink_all(manager_get_first_dns_server(info->manager, DNS_SERVER_SYSTEM)); + info->dns_server_unlinked = true; + } + + r = manager_parse_dns_server_string_and_warn(info->manager, DNS_SERVER_SYSTEM, value); + if (r < 0) + log_warning_errno(r, "Failed to parse DNS server string '%s', ignoring.", value); + + info->manager->read_resolv_conf = false; + + } else if (streq(key, "domain")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + if (!info->search_domain_unlinked) { + dns_search_domain_unlink_all(info->manager->search_domains); + info->search_domain_unlinked = true; + } + + r = manager_parse_search_domains_and_warn(info->manager, value); + if (r < 0) + log_warning_errno(r, "Failed to parse credential provided search domain string '%s', ignoring.", value); + + info->manager->read_resolv_conf = false; + } + + return 0; +} + +static void read_proc_cmdline(Manager *m) { + int r; + + assert(m); + + r = proc_cmdline_parse(proc_cmdline_callback, &(struct ProcCmdlineInfo) { .manager = m }, 0); + if (r < 0) + log_warning_errno(r, "Failed to read kernel command line, ignoring: %m"); +} + +int manager_parse_config_file(Manager *m) { + int r; + + assert(m); + + r = config_parse_config_file("resolved.conf", "Resolve\0", + config_item_perf_lookup, resolved_gperf_lookup, + CONFIG_PARSE_WARN, m); + if (r < 0) + return r; + + read_credentials(m); /* credentials are only used when nothing is explicitly configured … */ + read_proc_cmdline(m); /* … but kernel command line overrides local configuration. 
*/ + + if (m->need_builtin_fallbacks) { + r = manager_parse_dns_server_string_and_warn(m, DNS_SERVER_FALLBACK, DNS_SERVERS); + if (r < 0) + return r; + } + +#if !HAVE_OPENSSL_OR_GCRYPT + if (m->dnssec_mode != DNSSEC_NO) { + log_warning("DNSSEC option cannot be enabled or set to allow-downgrade when systemd-resolved is built without a cryptographic library. Turning off DNSSEC support."); + m->dnssec_mode = DNSSEC_NO; + } +#endif + +#if !ENABLE_DNS_OVER_TLS + if (m->dns_over_tls_mode != DNS_OVER_TLS_NO) { + log_warning("DNS-over-TLS option cannot be enabled or set to opportunistic when systemd-resolved is built without DNS-over-TLS support. Turning off DNS-over-TLS support."); + m->dns_over_tls_mode = DNS_OVER_TLS_NO; + } +#endif + return 0; + +} diff --git a/src/resolve/resolved-conf.h b/src/resolve/resolved-conf.h new file mode 100644 index 0000000..07ce259 --- /dev/null +++ b/src/resolve/resolved-conf.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" + +#include "resolved-dns-server.h" + +int manager_parse_config_file(Manager *m); + +int manager_parse_search_domains_and_warn(Manager *m, const char *string); +int manager_parse_dns_server_string_and_warn(Manager *m, DnsServerType type, const char *string); + +const struct ConfigPerfItem* resolved_gperf_lookup(const char *key, GPERF_LEN_TYPE length); +const struct ConfigPerfItem* resolved_dnssd_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +CONFIG_PARSER_PROTOTYPE(config_parse_dns_servers); +CONFIG_PARSER_PROTOTYPE(config_parse_search_domains); +CONFIG_PARSER_PROTOTYPE(config_parse_dns_stub_listener_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_dnssd_service_name); +CONFIG_PARSER_PROTOTYPE(config_parse_dnssd_service_type); +CONFIG_PARSER_PROTOTYPE(config_parse_dnssd_txt); +CONFIG_PARSER_PROTOTYPE(config_parse_dns_stub_listener_extra); diff --git a/src/resolve/resolved-def.h b/src/resolve/resolved-def.h new file mode 100644 index 0000000..b7a44f9 
--- /dev/null +++ b/src/resolve/resolved-def.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> + +#include "time-util.h" + +/* Input + Output: The various protocols we can use */ +#define SD_RESOLVED_DNS (UINT64_C(1) << 0) +#define SD_RESOLVED_LLMNR_IPV4 (UINT64_C(1) << 1) +#define SD_RESOLVED_LLMNR_IPV6 (UINT64_C(1) << 2) +#define SD_RESOLVED_MDNS_IPV4 (UINT64_C(1) << 3) +#define SD_RESOLVED_MDNS_IPV6 (UINT64_C(1) << 4) + +/* Input: Don't follow CNAMEs/DNAMEs */ +#define SD_RESOLVED_NO_CNAME (UINT64_C(1) << 5) + +/* Input: When doing service (SRV) resolving, don't resolve associated mDNS-style TXT records */ +#define SD_RESOLVED_NO_TXT (UINT64_C(1) << 6) + +/* Input: When doing service (SRV) resolving, don't resolve A/AAAA RR for included hostname */ +#define SD_RESOLVED_NO_ADDRESS (UINT64_C(1) << 7) + +/* Input: Don't apply search domain logic to request */ +#define SD_RESOLVED_NO_SEARCH (UINT64_C(1) << 8) + +/* Output: Result is authenticated */ +#define SD_RESOLVED_AUTHENTICATED (UINT64_C(1) << 9) + +/* Input: Don't DNSSEC validate request */ +#define SD_RESOLVED_NO_VALIDATE (UINT64_C(1) << 10) + +/* Input: Don't answer request from locally synthesized records (which includes /etc/hosts) */ +#define SD_RESOLVED_NO_SYNTHESIZE (UINT64_C(1) << 11) + +/* Input: Don't answer request from cache */ +#define SD_RESOLVED_NO_CACHE (UINT64_C(1) << 12) + +/* Input: Don't answer request from locally registered public LLMNR/mDNS RRs */ +#define SD_RESOLVED_NO_ZONE (UINT64_C(1) << 13) + +/* Input: Don't answer request from locally configured trust anchors. */ +#define SD_RESOLVED_NO_TRUST_ANCHOR (UINT64_C(1) << 14) + +/* Input: Don't go to network for this request */ +#define SD_RESOLVED_NO_NETWORK (UINT64_C(1) << 15) + +/* Input: Require that request is answered from a "primary" answer, i.e. 
not from RRs acquired as + * side-effect of a previous transaction */ +#define SD_RESOLVED_REQUIRE_PRIMARY (UINT64_C(1) << 16) + +/* Input: If reply is answered from cache, the TTLs will be adjusted by age of cache entry */ +#define SD_RESOLVED_CLAMP_TTL (UINT64_C(1) << 17) + +/* Output: Result was only sent via encrypted channels, or never left this system */ +#define SD_RESOLVED_CONFIDENTIAL (UINT64_C(1) << 18) + +/* Output: Result was (at least partially) synthesized locally */ +#define SD_RESOLVED_SYNTHETIC (UINT64_C(1) << 19) + +/* Output: Result was (at least partially) answered from cache */ +#define SD_RESOLVED_FROM_CACHE (UINT64_C(1) << 20) + +/* Output: Result was (at least partially) answered from local zone */ +#define SD_RESOLVED_FROM_ZONE (UINT64_C(1) << 21) + +/* Output: Result was (at least partially) answered from trust anchor */ +#define SD_RESOLVED_FROM_TRUST_ANCHOR (UINT64_C(1) << 22) + +/* Output: Result was (at least partially) answered from network */ +#define SD_RESOLVED_FROM_NETWORK (UINT64_C(1) << 23) + +/* Input: Don't answer request with stale data */ +#define SD_RESOLVED_NO_STALE (UINT64_C(1) << 24) + +#define SD_RESOLVED_LLMNR (SD_RESOLVED_LLMNR_IPV4|SD_RESOLVED_LLMNR_IPV6) +#define SD_RESOLVED_MDNS (SD_RESOLVED_MDNS_IPV4|SD_RESOLVED_MDNS_IPV6) +#define SD_RESOLVED_PROTOCOLS_ALL (SD_RESOLVED_MDNS|SD_RESOLVED_LLMNR|SD_RESOLVED_DNS) + +#define SD_RESOLVED_FROM_MASK (SD_RESOLVED_FROM_CACHE|SD_RESOLVED_FROM_ZONE|SD_RESOLVED_FROM_TRUST_ANCHOR|SD_RESOLVED_FROM_NETWORK) + +#define SD_RESOLVED_QUERY_TIMEOUT_USEC (120 * USEC_PER_SEC) diff --git a/src/resolve/resolved-dns-answer.c b/src/resolve/resolved-dns-answer.c new file mode 100644 index 0000000..bf023a7 --- /dev/null +++ b/src/resolve/resolved-dns-answer.c @@ -0,0 +1,862 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "alloc-util.h" +#include "dns-domain.h" +#include "random-util.h" +#include "resolved-dns-answer.h" +#include "resolved-dns-dnssec.h" +#include 
"string-util.h" + +static DnsAnswerItem *dns_answer_item_free(DnsAnswerItem *item) { + if (!item) + return NULL; + + dns_resource_record_unref(item->rr); + dns_resource_record_unref(item->rrsig); + + return mfree(item); +} + +DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(DnsAnswerItem, dns_answer_item, dns_answer_item_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsAnswerItem*, dns_answer_item_unref); + +static void dns_answer_item_hash_func(const DnsAnswerItem *a, struct siphash *state) { + assert(a); + assert(state); + + siphash24_compress(&a->ifindex, sizeof(a->ifindex), state); + + dns_resource_record_hash_func(a->rr, state); +} + +static int dns_answer_item_compare_func(const DnsAnswerItem *a, const DnsAnswerItem *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->ifindex, b->ifindex); + if (r != 0) + return r; + + return dns_resource_record_compare_func(a->rr, b->rr); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + dns_answer_item_hash_ops, + DnsAnswerItem, + dns_answer_item_hash_func, + dns_answer_item_compare_func, + dns_answer_item_unref); + +static int dns_answer_reserve_internal(DnsAnswer *a, size_t n) { + size_t m; + + assert(a); + assert(a->items); + + m = ordered_set_size(a->items); + assert(m <= UINT16_MAX); /* We can only place 64K RRs in an answer at max */ + + n = saturate_add(m, n, UINT16_MAX); + + /* Higher multipliers give slightly higher efficiency through hash collisions, but the gains + * quickly drop off after 2. 
*/ + return ordered_set_reserve(a->items, n * 2); +} +/* Allocates an empty answer with one reference and room reserved for 'n' items ('n' is capped at + * UINT16_MAX). Returns NULL if any allocation or the reservation fails. */ +DnsAnswer *dns_answer_new(size_t n) { + _cleanup_ordered_set_free_ OrderedSet *s = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *a = NULL; + + if (n > UINT16_MAX) + n = UINT16_MAX; + + s = ordered_set_new(&dns_answer_item_hash_ops); + if (!s) + return NULL; + + a = new(DnsAnswer, 1); + if (!a) + return NULL; + + *a = (DnsAnswer) { + .n_ref = 1, + .items = TAKE_PTR(s), + }; + + if (dns_answer_reserve_internal(a, n) < 0) + return NULL; + + return TAKE_PTR(a); +} +/* Destructor used by the ref/unref functions defined right below: frees the item set and the object. */ +static DnsAnswer *dns_answer_free(DnsAnswer *a) { + assert(a); + + ordered_set_free(a->items); + return mfree(a); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DnsAnswer, dns_answer, dns_answer_free); +/* Puts a new item for 'rr' into 'a' without any merging with an existing entry. Takes its own references on + * 'rr' and 'rrsig'. Returns 1 on success, -ENOSPC if 'a' is NULL or already holds UINT16_MAX items, + * -ENOMEM on allocation failure, or the negative error from ordered_set_put(). */ +static int dns_answer_add_raw( + DnsAnswer *a, + DnsResourceRecord *rr, + int ifindex, + DnsAnswerFlags flags, + DnsResourceRecord *rrsig) { + + _cleanup_(dns_answer_item_unrefp) DnsAnswerItem *item = NULL; + int r; + + assert(rr); + + if (!a) + return -ENOSPC; + + if (dns_answer_size(a) >= UINT16_MAX) + return -ENOSPC; + + item = new(DnsAnswerItem, 1); + if (!item) + return -ENOMEM; + + *item = (DnsAnswerItem) { + .n_ref = 1, + .rr = dns_resource_record_ref(rr), + .ifindex = ifindex, + .flags = flags, + .rrsig = dns_resource_record_ref(rrsig), + }; + + r = ordered_set_put(a->items, item); + if (r < 0) + return r; + + TAKE_PTR(item); + return 1; +} +/* Adds every item of 'source' to 'a' via dns_answer_add_raw(); stops at and returns the first error. */ +static int dns_answer_add_raw_all(DnsAnswer *a, DnsAnswer *source) { + DnsAnswerItem *item; + int r; + + DNS_ANSWER_FOREACH_ITEM(item, source) { + r = dns_answer_add_raw( + a, + item->rr, + item->ifindex, + item->flags, + item->rrsig); + if (r < 0) + return r; + } + + return 0; +} +/* Adds 'rr' to 'a'. Returns -ENOSPC if 'a' is NULL and -EBUSY if 'a' has more than one reference. */ +int dns_answer_add( + DnsAnswer *a, + DnsResourceRecord *rr, + int ifindex, + DnsAnswerFlags flags, + DnsResourceRecord *rrsig) { + + DnsAnswerItem tmp, *exist; + + assert(rr); + + if (!a) + return -ENOSPC; + if (a->n_ref > 1) + return -EBUSY; + + tmp = (DnsAnswerItem) { + .rr = rr, + .ifindex = ifindex, + }; + + exist = 
ordered_set_get(a->items, &tmp); + if (exist) { + /* There's already an RR of the same RRset in place! Let's see if the TTLs more or + * less match. RFC 2181, Section 5.2 suggests clients should reject RRsets + * containing RRs with differing TTLs. We are more tolerant of this situation except + * if one RR has a zero TTL and the other a nonzero TTL. In mDNS, zero TTLs are + * special, so we must error in that case. */ + if ((rr->ttl == 0) != (exist->rr->ttl == 0)) { + if ((exist->flags | flags) & DNS_ANSWER_REFUSE_TTL_NO_MATCH) + return log_debug_errno( + SYNTHETIC_ERRNO(EINVAL), + "Refusing to merge RRs with zero TTL and non-zero TTL: %s vs. %s", + dns_resource_record_to_string(rr), + dns_resource_record_to_string(exist->rr)); + + log_debug("Merging RRs with zero TTL and non-zero TTL (not RFC 2181/5.2 compliant): %s vs. %s", + dns_resource_record_to_string(rr), + dns_resource_record_to_string(exist->rr)); + } + + /* Entry already exists, keep the entry with the higher TTL. */ + if (rr->ttl > exist->rr->ttl) { + DNS_RR_REPLACE(exist->rr, dns_resource_record_ref(rr)); + + /* Update RRSIG and RR at the same time */ + if (rrsig) + DNS_RR_REPLACE(exist->rrsig, dns_resource_record_ref(rrsig)); + } + + exist->flags |= flags; + + if (rr->key->type == DNS_TYPE_RRSIG) { + /* If the rr is RRSIG, then move the rr to the end. 
*/ + assert_se(ordered_set_remove(a->items, exist) == exist); + assert_se(ordered_set_put(a->items, exist) == 1); + } + return 0; + } + + return dns_answer_add_raw(a, rr, ifindex, flags, rrsig); +} + +static int dns_answer_add_all(DnsAnswer *a, DnsAnswer *b) { + DnsAnswerItem *item; + int r; + + DNS_ANSWER_FOREACH_ITEM(item, b) { + r = dns_answer_add(a, item->rr, item->ifindex, item->flags, item->rrsig); + if (r < 0) + return r; + } + + return 0; +} + +int dns_answer_add_extend( + DnsAnswer **a, + DnsResourceRecord *rr, + int ifindex, + DnsAnswerFlags flags, + DnsResourceRecord *rrsig) { + + int r; + + assert(a); + assert(rr); + + r = dns_answer_reserve_or_clone(a, 1); + if (r < 0) + return r; + + return dns_answer_add(*a, rr, ifindex, flags, rrsig); +} + +int dns_answer_add_soa(DnsAnswer *a, const char *name, uint32_t ttl, int ifindex) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *soa = NULL; + + soa = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_SOA, name); + if (!soa) + return -ENOMEM; + + soa->ttl = ttl; + + soa->soa.mname = strdup(name); + if (!soa->soa.mname) + return -ENOMEM; + + soa->soa.rname = strjoin("root.", name); + if (!soa->soa.rname) + return -ENOMEM; + + soa->soa.serial = 1; + soa->soa.refresh = 1; + soa->soa.retry = 1; + soa->soa.expire = 1; + soa->soa.minimum = ttl; + + return dns_answer_add(a, soa, ifindex, DNS_ANSWER_AUTHENTICATED, NULL); +} + +int dns_answer_match_key(DnsAnswer *a, const DnsResourceKey *key, DnsAnswerFlags *ret_flags) { + DnsAnswerFlags flags = 0, i_flags; + DnsResourceRecord *i; + bool found = false; + int r; + + assert(key); + + DNS_ANSWER_FOREACH_FLAGS(i, i_flags, a) { + r = dns_resource_key_match_rr(key, i, NULL); + if (r < 0) + return r; + if (r == 0) + continue; + + if (!ret_flags) + return 1; + + if (found) + flags &= i_flags; + else { + flags = i_flags; + found = true; + } + } + + if (ret_flags) + *ret_flags = flags; + + return found; +} + +bool dns_answer_contains_nsec_or_nsec3(DnsAnswer *a) { + 
DnsResourceRecord *i; + + DNS_ANSWER_FOREACH(i, a) + if (IN_SET(i->key->type, DNS_TYPE_NSEC, DNS_TYPE_NSEC3)) + return true; + + return false; +} + +int dns_answer_contains_zone_nsec3(DnsAnswer *answer, const char *zone) { + DnsResourceRecord *rr; + int r; + + /* Checks whether the specified answer contains at least one NSEC3 RR in the specified zone */ + + DNS_ANSWER_FOREACH(rr, answer) { + const char *p; + + if (rr->key->type != DNS_TYPE_NSEC3) + continue; + + p = dns_resource_key_name(rr->key); + r = dns_name_parent(&p); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dns_name_equal(p, zone); + if (r != 0) + return r; + } + + return false; +} + +bool dns_answer_contains(DnsAnswer *answer, DnsResourceRecord *rr) { + DnsResourceRecord *i; + + DNS_ANSWER_FOREACH(i, answer) + if (dns_resource_record_equal(i, rr)) + return true; + + return false; +} + +int dns_answer_find_soa( + DnsAnswer *a, + const DnsResourceKey *key, + DnsResourceRecord **ret, + DnsAnswerFlags *ret_flags) { + + DnsResourceRecord *rr, *soa = NULL; + DnsAnswerFlags rr_flags, soa_flags = 0; + int r; + + assert(key); + + /* For a SOA record we can never find a matching SOA record */ + if (key->type == DNS_TYPE_SOA) + goto not_found; + + DNS_ANSWER_FOREACH_FLAGS(rr, rr_flags, a) { + r = dns_resource_key_match_soa(key, rr->key); + if (r < 0) + return r; + if (r > 0) { + + if (soa) { + r = dns_name_endswith(dns_resource_key_name(rr->key), dns_resource_key_name(soa->key)); + if (r < 0) + return r; + if (r > 0) + continue; + } + + soa = rr; + soa_flags = rr_flags; + } + } + + if (!soa) + goto not_found; + + if (ret) + *ret = soa; + if (ret_flags) + *ret_flags = soa_flags; + + return 1; + +not_found: + if (ret) + *ret = NULL; + if (ret_flags) + *ret_flags = 0; + + return 0; +} + +int dns_answer_find_cname_or_dname( + DnsAnswer *a, + const DnsResourceKey *key, + DnsResourceRecord **ret, + DnsAnswerFlags *ret_flags) { + + DnsResourceRecord *rr; + DnsAnswerFlags rr_flags; + int r; + + assert(key); 
+ + /* For a {C,D}NAME record we can never find a matching {C,D}NAME record */ + if (!dns_type_may_redirect(key->type)) + return 0; + + DNS_ANSWER_FOREACH_FLAGS(rr, rr_flags, a) { + r = dns_resource_key_match_cname_or_dname(key, rr->key, NULL); + if (r < 0) + return r; + if (r > 0) { + if (ret) + *ret = rr; + if (ret_flags) + *ret_flags = rr_flags; + return 1; + } + } + + if (ret) + *ret = NULL; + if (ret_flags) + *ret_flags = 0; + + return 0; +} + +int dns_answer_merge(DnsAnswer *a, DnsAnswer *b, DnsAnswer **ret) { + _cleanup_(dns_answer_unrefp) DnsAnswer *k = NULL; + int r; + + assert(ret); + + if (a == b) { + *ret = dns_answer_ref(a); + return 0; + } + + if (dns_answer_size(a) <= 0) { + *ret = dns_answer_ref(b); + return 0; + } + + if (dns_answer_size(b) <= 0) { + *ret = dns_answer_ref(a); + return 0; + } + + k = dns_answer_new(dns_answer_size(a) + dns_answer_size(b)); + if (!k) + return -ENOMEM; + + r = dns_answer_add_raw_all(k, a); + if (r < 0) + return r; + + r = dns_answer_add_all(k, b); + if (r < 0) + return r; + + *ret = TAKE_PTR(k); + + return 0; +} + +int dns_answer_extend(DnsAnswer **a, DnsAnswer *b) { + DnsAnswer *merged; + int r; + + assert(a); + + r = dns_answer_merge(*a, b, &merged); + if (r < 0) + return r; + + DNS_ANSWER_REPLACE(*a, merged); + return 0; +} + +int dns_answer_remove_by_key(DnsAnswer **a, const DnsResourceKey *key) { + DnsAnswerItem *item; + bool found = false; + int r; + + assert(a); + assert(key); + + /* Remove all entries matching the specified key from *a */ + + DNS_ANSWER_FOREACH_ITEM(item, *a) { + r = dns_resource_key_equal(item->rr->key, key); + if (r < 0) + return r; + if (r > 0) { + dns_answer_item_unref(ordered_set_remove((*a)->items, item)); + found = true; + } + } + + if (!found) + return 0; + + if (dns_answer_isempty(*a)) + *a = dns_answer_unref(*a); /* Return NULL for the empty answer */ + + return 1; +} + +int dns_answer_remove_by_rr(DnsAnswer **a, DnsResourceRecord *rr) { + _unused_ 
_cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr_ref = dns_resource_record_ref(rr); + DnsAnswerItem *item; + bool found = false; + int r; + + assert(a); + assert(rr); + + /* Remove all entries matching the specified RR from *a */ + + DNS_ANSWER_FOREACH_ITEM(item, *a) { + r = dns_resource_record_equal(item->rr, rr); + if (r < 0) + return r; + if (r > 0) { + dns_answer_item_unref(ordered_set_remove((*a)->items, item)); + found = true; + } + } + + if (!found) + return 0; + + if (dns_answer_isempty(*a)) + *a = dns_answer_unref(*a); /* Return NULL for the empty answer */ + + return 1; +} + +int dns_answer_remove_by_answer_keys(DnsAnswer **a, DnsAnswer *b) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *prev = NULL; + DnsAnswerItem *item; + int r; + + /* Removes all items from '*a' that have a matching key in 'b' */ + + DNS_ANSWER_FOREACH_ITEM(item, b) { + + if (prev && dns_resource_key_equal(item->rr->key, prev)) /* Skip this one, we already looked at it */ + continue; + + r = dns_answer_remove_by_key(a, item->rr->key); + if (r < 0) + return r; + if (!*a) + return 0; /* a is already empty. 
*/ + + /* Let's remember this entry's RR key, to optimize the loop a bit: if we have an RRset with + * more than one item then we don't need to remove the key multiple times */ + DNS_RESOURCE_KEY_REPLACE(prev, dns_resource_key_ref(item->rr->key)); + } + + return 0; +} + +int dns_answer_copy_by_key( + DnsAnswer **a, + DnsAnswer *source, + const DnsResourceKey *key, + DnsAnswerFlags or_flags, + DnsResourceRecord *rrsig) { + + DnsAnswerItem *item; + int r; + + assert(a); + assert(key); + + /* Copy all RRs matching the specified key from source into *a */ + + DNS_ANSWER_FOREACH_ITEM(item, source) { + + r = dns_resource_key_equal(item->rr->key, key); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dns_answer_add_extend(a, item->rr, item->ifindex, item->flags|or_flags, rrsig ?: item->rrsig); + if (r < 0) + return r; + } + + return 0; +} + +int dns_answer_move_by_key( + DnsAnswer **to, + DnsAnswer **from, + const DnsResourceKey *key, + DnsAnswerFlags or_flags, + DnsResourceRecord *rrsig) { + + int r; + + assert(to); + assert(from); + assert(key); + + r = dns_answer_copy_by_key(to, *from, key, or_flags, rrsig); + if (r < 0) + return r; + + return dns_answer_remove_by_key(from, key); +} + +void dns_answer_order_by_scope(DnsAnswer *a, bool prefer_link_local) { + _cleanup_free_ DnsAnswerItem **items = NULL; + DnsAnswerItem **p, *item; + size_t n; + + n = dns_answer_size(a); + if (n <= 1) + return; + + /* RFC 4795, Section 2.6 suggests we should order entries + * depending on whether the sender is a link-local address. 
*/ + + p = items = new(DnsAnswerItem*, n); + if (!items) + return (void) log_oom(); + + /* Order preferred address records and other records to the beginning of the array */ + DNS_ANSWER_FOREACH_ITEM(item, a) + if (dns_resource_record_is_link_local_address(item->rr) == prefer_link_local) + *p++ = dns_answer_item_ref(item); + + /* Order address records that are not preferred to the end of the array */ + DNS_ANSWER_FOREACH_ITEM(item, a) + if (dns_resource_record_is_link_local_address(item->rr) != prefer_link_local) + *p++ = dns_answer_item_ref(item); + + + assert((size_t) (p - items) == n); + + ordered_set_clear(a->items); + for (size_t i = 0; i < n; i++) + assert_se(ordered_set_put(a->items, items[i]) >= 0); +} + +int dns_answer_reserve(DnsAnswer **a, size_t n_free) { + assert(a); + + if (n_free <= 0) + return 0; + + if (!*a) { + DnsAnswer *n; + + n = dns_answer_new(n_free); + if (!n) + return -ENOMEM; + + *a = n; + return 0; + } + + if ((*a)->n_ref > 1) + return -EBUSY; + + return dns_answer_reserve_internal(*a, n_free); +} + +int dns_answer_reserve_or_clone(DnsAnswer **a, size_t n_free) { + _cleanup_(dns_answer_unrefp) DnsAnswer *n = NULL; + size_t ns; + int r; + + assert(a); + + r = dns_answer_reserve(a, n_free); + if (r != -EBUSY) + return r; + + ns = dns_answer_size(*a); + assert(ns <= UINT16_MAX); /* Maximum number of RRs we can stick into a DNS packet section */ + + ns = saturate_add(ns, n_free, UINT16_MAX); + + n = dns_answer_new(ns); + if (!n) + return -ENOMEM; + + r = dns_answer_add_raw_all(n, *a); + if (r < 0) + return r; + + DNS_ANSWER_REPLACE(*a, TAKE_PTR(n)); + return 0; +} + +/* + * This function is not used in the code base, but is useful when debugging. Do not delete. 
+ */ +void dns_answer_dump(DnsAnswer *answer, FILE *f) { + DnsAnswerItem *item; + + if (!f) + f = stdout; + + DNS_ANSWER_FOREACH_ITEM(item, answer) { + const char *t; + + fputc('\t', f); + + t = dns_resource_record_to_string(item->rr); + if (!t) { + log_oom(); + continue; + } + + fputs(t, f); + fputs("\t;", f); + fprintf(f, " ttl=%" PRIu32, item->rr->ttl); + + if (item->ifindex != 0) + fprintf(f, " ifindex=%i", item->ifindex); + if (item->rrsig) + fputs(" rrsig", f); + if (item->flags & DNS_ANSWER_AUTHENTICATED) + fputs(" authenticated", f); + if (item->flags & DNS_ANSWER_CACHEABLE) + fputs(" cacheable", f); + if (item->flags & DNS_ANSWER_SHARED_OWNER) + fputs(" shared-owner", f); + if (item->flags & DNS_ANSWER_CACHE_FLUSH) + fputs(" cache-flush", f); + if (item->flags & DNS_ANSWER_GOODBYE) + fputs(" goodbye", f); + if (item->flags & DNS_ANSWER_SECTION_ANSWER) + fputs(" section-answer", f); + if (item->flags & DNS_ANSWER_SECTION_AUTHORITY) + fputs(" section-authority", f); + if (item->flags & DNS_ANSWER_SECTION_ADDITIONAL) + fputs(" section-additional", f); + + fputc('\n', f); + } +} + +int dns_answer_has_dname_for_cname(DnsAnswer *a, DnsResourceRecord *cname) { + DnsResourceRecord *rr; + int r; + + assert(cname); + + /* Checks whether the answer contains a DNAME record that indicates that the specified CNAME record is + * synthesized from it */ + + if (cname->key->type != DNS_TYPE_CNAME) + return 0; + + DNS_ANSWER_FOREACH(rr, a) { + _cleanup_free_ char *n = NULL; + + if (rr->key->type != DNS_TYPE_DNAME) + continue; + if (rr->key->class != cname->key->class) + continue; + + r = dns_name_change_suffix(cname->cname.name, rr->dname.name, dns_resource_key_name(rr->key), &n); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dns_name_equal(n, dns_resource_key_name(cname->key)); + if (r < 0) + return r; + if (r > 0) + return 1; + } + + return 0; +} + +void dns_answer_randomize(DnsAnswer *a) { + _cleanup_free_ DnsAnswerItem **items = NULL; + DnsAnswerItem **p, 
*item; + size_t n; + + /* Permutes the answer list randomly (Knuth shuffle) */ + + n = dns_answer_size(a); + if (n <= 1) + return; + + p = items = new(DnsAnswerItem*, n); + if (!items) + return (void) log_oom(); + + DNS_ANSWER_FOREACH_ITEM(item, a) + *p++ = dns_answer_item_ref(item); + + assert((size_t) (p - items) == n); + + for (size_t i = 0; i < n; i++) { + size_t k; + + k = random_u64_range(n); + if (k == i) + continue; + + SWAP_TWO(items[i], items[k]); + } + + ordered_set_clear(a->items); + for (size_t i = 0; i < n; i++) + assert_se(ordered_set_put(a->items, items[i]) >= 0); +} + +uint32_t dns_answer_min_ttl(DnsAnswer *a) { + uint32_t ttl = UINT32_MAX; + DnsResourceRecord *rr; + + /* Return the smallest TTL of all RRs in this answer */ + + DNS_ANSWER_FOREACH(rr, a) { + /* Don't consider OPT (where the TTL field is used for other purposes than an actual TTL) */ + + if (dns_type_is_pseudo(rr->key->type) || + dns_class_is_pseudo(rr->key->class)) + continue; + + ttl = MIN(ttl, rr->ttl); + } + + return ttl; +} diff --git a/src/resolve/resolved-dns-answer.h b/src/resolve/resolved-dns-answer.h new file mode 100644 index 0000000..068803c --- /dev/null +++ b/src/resolve/resolved-dns-answer.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct DnsAnswer DnsAnswer; +typedef struct DnsAnswerItem DnsAnswerItem; + +#include "macro.h" +#include "ordered-set.h" +#include "resolved-dns-rr.h" + +/* A simple array of resource records. We keep track of the originating ifindex for each RR where that makes + * sense, so that we can qualify A and AAAA RRs referring to a local link with the right ifindex. + * + * Note that we usually encode the empty DnsAnswer object as a simple NULL. 
*/
+
+typedef enum DnsAnswerFlags {
+        DNS_ANSWER_AUTHENTICATED       = 1 << 0, /* Item has been authenticated */
+        DNS_ANSWER_CACHEABLE           = 1 << 1, /* Item is subject to caching */
+        DNS_ANSWER_SHARED_OWNER        = 1 << 2, /* For mDNS: RRset may be owned by multiple peers */
+        DNS_ANSWER_CACHE_FLUSH         = 1 << 3, /* For mDNS: sets cache-flush bit in the rrclass of response records */
+        DNS_ANSWER_GOODBYE             = 1 << 4, /* For mDNS: item is subject to disappear */
+        DNS_ANSWER_SECTION_ANSWER      = 1 << 5, /* When parsing: RR originates from answer section */
+        DNS_ANSWER_SECTION_AUTHORITY   = 1 << 6, /* When parsing: RR originates from authority section */
+        DNS_ANSWER_SECTION_ADDITIONAL  = 1 << 7, /* When parsing: RR originates from additional section */
+        DNS_ANSWER_REFUSE_TTL_NO_MATCH = 1 << 8, /* For mDNS: refuse to merge a zero TTL RR with a nonzero TTL RR */
+
+        /* All three "section of origin" bits combined */
+        DNS_ANSWER_MASK_SECTIONS       = DNS_ANSWER_SECTION_ANSWER|
+                                         DNS_ANSWER_SECTION_AUTHORITY|
+                                         DNS_ANSWER_SECTION_ADDITIONAL,
+} DnsAnswerFlags;
+
+struct DnsAnswerItem {
+        unsigned n_ref;
+        DnsResourceRecord *rr;
+        DnsResourceRecord *rrsig; /* Optionally, also store RRSIG RR that successfully validates this item */
+        int ifindex;              /* Originating interface index of the RR */
+        DnsAnswerFlags flags;
+};
+
+struct DnsAnswer {
+        unsigned n_ref;
+        OrderedSet *items;        /* The DnsAnswerItem objects of this answer */
+};
+
+DnsAnswer *dns_answer_new(size_t n);
+DnsAnswer *dns_answer_ref(DnsAnswer *a);
+DnsAnswer *dns_answer_unref(DnsAnswer *a);
+
+/* Drops the reference stored in 'a' and stores 'b' there instead. 'b' is evaluated before the old
+ * reference is dropped. */
+#define DNS_ANSWER_REPLACE(a, b)                \
+        do {                                    \
+                typeof(a)* _a = &(a);           \
+                typeof(b) _b = (b);             \
+                dns_answer_unref(*_a);          \
+                *_a = _b;                       \
+        } while(0)
+
+int dns_answer_add(DnsAnswer *a, DnsResourceRecord *rr, int ifindex, DnsAnswerFlags flags, DnsResourceRecord *rrsig);
+int dns_answer_add_extend(DnsAnswer **a, DnsResourceRecord *rr, int ifindex, DnsAnswerFlags flags, DnsResourceRecord *rrsig);
+int dns_answer_add_soa(DnsAnswer *a, const char *name, uint32_t ttl, int ifindex);
+
+int dns_answer_match_key(DnsAnswer *a, const DnsResourceKey *key, DnsAnswerFlags *ret_flags);
+bool
dns_answer_contains_nsec_or_nsec3(DnsAnswer *a); +int dns_answer_contains_zone_nsec3(DnsAnswer *answer, const char *zone); +bool dns_answer_contains(DnsAnswer *answer, DnsResourceRecord *rr); + +int dns_answer_find_soa(DnsAnswer *a, const DnsResourceKey *key, DnsResourceRecord **ret, DnsAnswerFlags *ret_flags); +int dns_answer_find_cname_or_dname(DnsAnswer *a, const DnsResourceKey *key, DnsResourceRecord **ret, DnsAnswerFlags *ret_flags); + +int dns_answer_merge(DnsAnswer *a, DnsAnswer *b, DnsAnswer **ret); +int dns_answer_extend(DnsAnswer **a, DnsAnswer *b); + +void dns_answer_order_by_scope(DnsAnswer *a, bool prefer_link_local); + +int dns_answer_reserve(DnsAnswer **a, size_t n_free); +int dns_answer_reserve_or_clone(DnsAnswer **a, size_t n_free); + +int dns_answer_remove_by_key(DnsAnswer **a, const DnsResourceKey *key); +int dns_answer_remove_by_rr(DnsAnswer **a, DnsResourceRecord *rr); +int dns_answer_remove_by_answer_keys(DnsAnswer **a, DnsAnswer *b); + +int dns_answer_copy_by_key(DnsAnswer **a, DnsAnswer *source, const DnsResourceKey *key, DnsAnswerFlags or_flags, DnsResourceRecord *rrsig); +int dns_answer_move_by_key(DnsAnswer **to, DnsAnswer **from, const DnsResourceKey *key, DnsAnswerFlags or_flags, DnsResourceRecord *rrsig); + +int dns_answer_has_dname_for_cname(DnsAnswer *a, DnsResourceRecord *cname); + +static inline size_t dns_answer_size(DnsAnswer *a) { + return a ? 
ordered_set_size(a->items) : 0; +} + +static inline bool dns_answer_isempty(DnsAnswer *a) { + return dns_answer_size(a) <= 0; +} + +void dns_answer_dump(DnsAnswer *answer, FILE *f); + +void dns_answer_randomize(DnsAnswer *a); + +uint32_t dns_answer_min_ttl(DnsAnswer *a); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsAnswer*, dns_answer_unref); + +typedef struct DnsAnswerIterator { + Iterator iterator; + DnsAnswer *answer; + DnsAnswerItem *item; +} DnsAnswerIterator; + +#define _DNS_ANSWER_FOREACH(kk, a, i) \ + for (DnsAnswerIterator i = { .iterator = ITERATOR_FIRST, .answer = (a) }; \ + i.answer && \ + ordered_set_iterate(i.answer->items, &i.iterator, (void**) &(i.item)) && \ + (kk = i.item->rr, true); ) + +#define DNS_ANSWER_FOREACH(rr, a) _DNS_ANSWER_FOREACH(rr, a, UNIQ_T(i, UNIQ)) + +#define _DNS_ANSWER_FOREACH_IFINDEX(kk, ifi, a, i) \ + for (DnsAnswerIterator i = { .iterator = ITERATOR_FIRST, .answer = (a) }; \ + i.answer && \ + ordered_set_iterate(i.answer->items, &i.iterator, (void**) &(i.item)) && \ + (kk = i.item->rr, ifi = i.item->ifindex, true); ) + +#define DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, a) _DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, a, UNIQ_T(i, UNIQ)) + +#define _DNS_ANSWER_FOREACH_FLAGS(kk, fl, a, i) \ + for (DnsAnswerIterator i = { .iterator = ITERATOR_FIRST, .answer = (a) }; \ + i.answer && \ + ordered_set_iterate(i.answer->items, &i.iterator, (void**) &(i.item)) && \ + (kk = i.item->rr, fl = i.item->flags, true); ) + +#define DNS_ANSWER_FOREACH_FLAGS(rr, flags, a) _DNS_ANSWER_FOREACH_FLAGS(rr, flags, a, UNIQ_T(i, UNIQ)) + +#define _DNS_ANSWER_FOREACH_ITEM(it, a, i) \ + for (DnsAnswerIterator i = { .iterator = ITERATOR_FIRST, .answer = (a) }; \ + i.answer && \ + ordered_set_iterate(i.answer->items, &i.iterator, (void**) &(i.item)) && \ + (it = i.item, true); ) + +#define DNS_ANSWER_FOREACH_ITEM(item, a) _DNS_ANSWER_FOREACH_ITEM(item, a, UNIQ_T(i, UNIQ)) diff --git a/src/resolve/resolved-dns-cache.c b/src/resolve/resolved-dns-cache.c new file mode 100644 
index 0000000..a9a6492 --- /dev/null +++ b/src/resolve/resolved-dns-cache.c @@ -0,0 +1,1486 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "dns-domain.h" +#include "format-util.h" +#include "resolved-dns-answer.h" +#include "resolved-dns-cache.h" +#include "resolved-dns-packet.h" +#include "string-util.h" + +/* Never cache more than 4K entries. RFC 1536, Section 5 suggests to + * leave DNS caches unbounded, but that's crazy. */ +#define CACHE_MAX 4096 + +/* We never keep any item longer than 2h in our cache unless StaleRetentionSec is greater than zero. */ +#define CACHE_TTL_MAX_USEC (2 * USEC_PER_HOUR) + +/* The max TTL for stale data is set to 30 seconds. See RFC 8767, Section 6. */ +#define CACHE_STALE_TTL_MAX_USEC (30 * USEC_PER_SEC) + +/* How long to cache strange rcodes, i.e. rcodes != SUCCESS and != NXDOMAIN (specifically: that's only SERVFAIL for + * now) */ +#define CACHE_TTL_STRANGE_RCODE_USEC (10 * USEC_PER_SEC) + +#define CACHEABLE_QUERY_FLAGS (SD_RESOLVED_AUTHENTICATED|SD_RESOLVED_CONFIDENTIAL) + +typedef enum DnsCacheItemType DnsCacheItemType; +typedef struct DnsCacheItem DnsCacheItem; + +enum DnsCacheItemType { + DNS_CACHE_POSITIVE, + DNS_CACHE_NODATA, + DNS_CACHE_NXDOMAIN, + DNS_CACHE_RCODE, /* "strange" RCODE (effective only SERVFAIL for now) */ +}; + +struct DnsCacheItem { + DnsCacheItemType type; + int rcode; + DnsResourceKey *key; /* The key for this item, i.e. the lookup key */ + DnsResourceRecord *rr; /* The RR for this item, i.e. the lookup value for positive queries */ + DnsAnswer *answer; /* The full validated answer, if this is an RRset acquired via a "primary" lookup */ + DnsPacket *full_packet; /* The full packet this information was acquired with */ + + usec_t until; /* If StaleRetentionSec is greater than zero, until is set to a duration of StaleRetentionSec from the time of TTL expiry. 
If StaleRetentionSec is zero, both until and until_valid are set to the time the TTL expires. */
+        usec_t until_valid;    /* The point in time at which the TTL expires. */
+        uint64_t query_flags;  /* SD_RESOLVED_AUTHENTICATED and/or SD_RESOLVED_CONFIDENTIAL */
+        DnssecResult dnssec_result;
+
+        int ifindex;
+        int owner_family;
+        union in_addr_union owner_address;
+
+        unsigned prioq_idx;                 /* Position of this item in the by_expiry priority queue */
+        LIST_FIELDS(DnsCacheItem, by_key);  /* Links all items that are stored under the same key */
+
+        bool shared_owner;
+};
+
+/* Returns true if this is a cache item created as result of an explicit lookup, and false if it was created
+ * as "side-effect" of another request. "Primary" entries will carry the full answer data (with NSEC, …) that
+ * can also prove wildcard expansion, non-existence and such, while entries that were created as
+ * "side-effect" just contain immediate RR data for the specified RR key, but nothing else. */
+#define DNS_CACHE_ITEM_IS_PRIMARY(item) (!!(item)->answer)
+
+/* Maps the item's type to a string for log output. For DNS_CACHE_RCODE items the string form of the stored
+ * rcode is returned instead of a fixed name. */
+static const char *dns_cache_item_type_to_string(DnsCacheItem *item) {
+        assert(item);
+
+        switch (item->type) {
+
+        case DNS_CACHE_POSITIVE:
+                return "POSITIVE";
+
+        case DNS_CACHE_NODATA:
+                return "NODATA";
+
+        case DNS_CACHE_NXDOMAIN:
+                return "NXDOMAIN";
+
+        case DNS_CACHE_RCODE:
+                return dns_rcode_to_string(item->rcode);
+        }
+
+        return NULL;
+}
+
+/* Releases the references the item holds (RR, key, answer, full packet) and frees the item itself. Does not
+ * touch the cache's hashmap or priority queue; the item must not be linked there anymore. Accepts NULL. */
+static DnsCacheItem* dns_cache_item_free(DnsCacheItem *i) {
+        if (!i)
+                return NULL;
+
+        dns_resource_record_unref(i->rr);
+        dns_resource_key_unref(i->key);
+        dns_answer_unref(i->answer);
+        dns_packet_unref(i->full_packet);
+        return mfree(i);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(DnsCacheItem*, dns_cache_item_free);
+
+/* Takes a single item out of the cache (per-key list, hashmap, priority queue) and frees it. Other items
+ * stored under the same key stay in place. Accepts a NULL item. */
+static void dns_cache_item_unlink_and_free(DnsCache *c, DnsCacheItem *i) {
+        DnsCacheItem *first;
+
+        assert(c);
+
+        if (!i)
+                return;
+
+        first = hashmap_get(c->by_key, i->key);
+        LIST_REMOVE(by_key, first, i);
+
+        /* 'first' now refers to whatever heads the list after the removal. If items remain, make the
+         * hashmap point to that head (and use its key object), otherwise drop the hashmap entry. */
+        if (first)
+                assert_se(hashmap_replace(c->by_key, first->key, first) >= 0);
+        else
+                hashmap_remove(c->by_key, i->key);
+
+        prioq_remove(c->by_expiry, i, &i->prioq_idx);
+
+        dns_cache_item_free(i);
+}
+
+/* Removes the cache item that carries exactly the specified RR, if there is one. Returns true if an item
+ * was removed, false if not (which includes the case that the RR comparison failed). */
+static bool dns_cache_remove_by_rr(DnsCache *c, DnsResourceRecord *rr) {
+        DnsCacheItem *first;
+        int r;
+
+        assert(c);
+        assert(rr);
+
+        first = hashmap_get(c->by_key, rr->key);
+        LIST_FOREACH(by_key, i, first) {
+                /* Negative entries are linked into the same per-key list but carry no RR, hence skip
+                 * them (same check as in dns_cache_get()). */
+                if (!i->rr)
+                        continue;
+
+                r = dns_resource_record_equal(i->rr, rr);
+                if (r < 0)
+                        /* A negative errno must not leak through the bool return type, where it would
+                         * read as "removed". Nothing was removed, hence say so. */
+                        return false;
+                if (r > 0) {
+                        dns_cache_item_unlink_and_free(c, i);
+                        return true;
+                }
+        }
+
+        return false;
+}
+
+/* Removes all cache items stored under the specified key. Returns true if there were any. */
+static bool dns_cache_remove_by_key(DnsCache *c, DnsResourceKey *key) {
+        DnsCacheItem *first;
+
+        assert(c);
+        assert(key);
+
+        first = hashmap_remove(c->by_key, key);
+        if (!first)
+                return false;
+
+        /* The whole list is gone from the hashmap already, now drop each item from the expiry queue
+         * and release it. */
+        LIST_FOREACH(by_key, i, first) {
+                prioq_remove(c->by_expiry, i, &i->prioq_idx);
+                dns_cache_item_free(i);
+        }
+
+        return true;
+}
+
+/* Empties the cache completely and releases the hashmap and the priority queue. */
+void dns_cache_flush(DnsCache *c) {
+        DnsResourceKey *key;
+
+        assert(c);
+
+        while ((key = hashmap_first_key(c->by_key)))
+                dns_cache_remove_by_key(c, key);
+
+        assert(hashmap_size(c->by_key) == 0);
+        assert(prioq_size(c->by_expiry) == 0);
+
+        c->by_key = hashmap_free(c->by_key);
+        c->by_expiry = prioq_free(c->by_expiry);
+}
+
+static void dns_cache_make_space(DnsCache *c, unsigned add) {
+        assert(c);
+
+        if (add <= 0)
+                return;
+
+        /* Makes space for 'add' new entries. Note that we actually allow
+         * the cache to grow beyond CACHE_MAX, but only when we shall
+         * add more RRs to the cache than CACHE_MAX at once. In that
+         * case the cache will be emptied completely otherwise.
*/ + + for (;;) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + DnsCacheItem *i; + + if (prioq_size(c->by_expiry) <= 0) + break; + + if (prioq_size(c->by_expiry) + add < CACHE_MAX) + break; + + i = prioq_peek(c->by_expiry); + assert(i); + + /* Take an extra reference to the key so that it + * doesn't go away in the middle of the remove call */ + key = dns_resource_key_ref(i->key); + dns_cache_remove_by_key(c, key); + } +} + +void dns_cache_prune(DnsCache *c) { + usec_t t = 0; + + assert(c); + + /* Remove all entries that are past their TTL */ + + for (;;) { + DnsCacheItem *i; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + i = prioq_peek(c->by_expiry); + if (!i) + break; + + if (t <= 0) + t = now(CLOCK_BOOTTIME); + + if (i->until > t) + break; + + /* Depending whether this is an mDNS shared entry + * either remove only this one RR or the whole RRset */ + log_debug("Removing %scache entry for %s (expired "USEC_FMT"s ago)", + i->shared_owner ? "shared " : "", + dns_resource_key_to_string(i->key, key_str, sizeof key_str), + (t - i->until) / USEC_PER_SEC); + + if (i->shared_owner) + dns_cache_item_unlink_and_free(c, i); + else { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + + /* Take an extra reference to the key so that it + * doesn't go away in the middle of the remove call */ + key = dns_resource_key_ref(i->key); + dns_cache_remove_by_key(c, key); + } + } +} + +static int dns_cache_item_prioq_compare_func(const void *a, const void *b) { + const DnsCacheItem *x = a, *y = b; + + return CMP(x->until, y->until); +} + +static int dns_cache_init(DnsCache *c) { + int r; + + assert(c); + + r = prioq_ensure_allocated(&c->by_expiry, dns_cache_item_prioq_compare_func); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&c->by_key, &dns_resource_key_hash_ops); + if (r < 0) + return r; + + return r; +} + +static int dns_cache_link_item(DnsCache *c, DnsCacheItem *i) { + DnsCacheItem *first; + int r; + + assert(c); + assert(i); + + 
r = prioq_put(c->by_expiry, i, &i->prioq_idx); + if (r < 0) + return r; + + first = hashmap_get(c->by_key, i->key); + if (first) { + _unused_ _cleanup_(dns_resource_key_unrefp) DnsResourceKey *k = NULL; + + /* Keep a reference to the original key, while we manipulate the list. */ + k = dns_resource_key_ref(first->key); + + /* Now, try to reduce the number of keys we keep */ + dns_resource_key_reduce(&first->key, &i->key); + + if (first->rr) + dns_resource_key_reduce(&first->rr->key, &i->key); + if (i->rr) + dns_resource_key_reduce(&i->rr->key, &i->key); + + LIST_PREPEND(by_key, first, i); + assert_se(hashmap_replace(c->by_key, first->key, first) >= 0); + } else { + r = hashmap_put(c->by_key, i->key, i); + if (r < 0) { + prioq_remove(c->by_expiry, i, &i->prioq_idx); + return r; + } + } + + return 0; +} + +static DnsCacheItem* dns_cache_get(DnsCache *c, DnsResourceRecord *rr) { + assert(c); + assert(rr); + + LIST_FOREACH(by_key, i, (DnsCacheItem*) hashmap_get(c->by_key, rr->key)) + if (i->rr && dns_resource_record_equal(i->rr, rr) > 0) + return i; + + return NULL; +} + +static usec_t calculate_until_valid( + DnsResourceRecord *rr, + uint32_t min_ttl, + uint32_t nsec_ttl, + usec_t timestamp, + bool use_soa_minimum) { + + uint32_t ttl; + usec_t u; + + assert(rr); + + ttl = MIN(min_ttl, nsec_ttl); + if (rr->key->type == DNS_TYPE_SOA && use_soa_minimum) { + /* If this is a SOA RR, and it is requested, clamp to the SOA's minimum field. This is used + * when we do negative caching, to determine the TTL for the negative caching entry. See RFC + * 2308, Section 5. 
*/ + + if (ttl > rr->soa.minimum) + ttl = rr->soa.minimum; + } + + u = ttl * USEC_PER_SEC; + if (u > CACHE_TTL_MAX_USEC) + u = CACHE_TTL_MAX_USEC; + + if (rr->expiry != USEC_INFINITY) { + usec_t left; + + /* Make use of the DNSSEC RRSIG expiry info, if we have it */ + + left = LESS_BY(rr->expiry, now(CLOCK_REALTIME)); + if (u > left) + u = left; + } + + return timestamp + u; +} + +static usec_t calculate_until( + usec_t until_valid, + usec_t stale_retention_usec) { + + return stale_retention_usec > 0 ? usec_add(until_valid, stale_retention_usec) : until_valid; +} + +static void dns_cache_item_update_positive( + DnsCache *c, + DnsCacheItem *i, + DnsResourceRecord *rr, + DnsAnswer *answer, + DnsPacket *full_packet, + uint32_t min_ttl, + uint64_t query_flags, + bool shared_owner, + DnssecResult dnssec_result, + usec_t timestamp, + int ifindex, + int owner_family, + const union in_addr_union *owner_address, + usec_t stale_retention_usec) { + + assert(c); + assert(i); + assert(rr); + assert(owner_address); + + i->type = DNS_CACHE_POSITIVE; + + if (!i->by_key_prev) + /* We are the first item in the list, we need to + * update the key used in the hashmap */ + + assert_se(hashmap_replace(c->by_key, rr->key, i) >= 0); + + DNS_RR_REPLACE(i->rr, dns_resource_record_ref(rr)); + + DNS_RESOURCE_KEY_REPLACE(i->key, dns_resource_key_ref(rr->key)); + + DNS_ANSWER_REPLACE(i->answer, dns_answer_ref(answer)); + + DNS_PACKET_REPLACE(i->full_packet, dns_packet_ref(full_packet)); + + i->until_valid = calculate_until_valid(rr, min_ttl, UINT32_MAX, timestamp, false); + i->until = calculate_until(i->until_valid, stale_retention_usec); + i->query_flags = query_flags & CACHEABLE_QUERY_FLAGS; + i->shared_owner = shared_owner; + i->dnssec_result = dnssec_result; + + i->ifindex = ifindex; + + i->owner_family = owner_family; + i->owner_address = *owner_address; + + prioq_reshuffle(c->by_expiry, i, &i->prioq_idx); +} + +static int dns_cache_put_positive( + DnsCache *c, + DnsProtocol protocol, + 
DnsResourceRecord *rr, + DnsAnswer *answer, + DnsPacket *full_packet, + uint64_t query_flags, + bool shared_owner, + DnssecResult dnssec_result, + usec_t timestamp, + int ifindex, + int owner_family, + const union in_addr_union *owner_address, + usec_t stale_retention_usec) { + + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + DnsCacheItem *existing; + uint32_t min_ttl; + int r; + + assert(c); + assert(rr); + assert(owner_address); + + /* Never cache pseudo RRs */ + if (dns_class_is_pseudo(rr->key->class)) + return 0; + if (dns_type_is_pseudo(rr->key->type)) + return 0; + + /* Determine the minimal TTL of all RRs in the answer plus the one by the main RR we are supposed to + * cache. Since we cache whole answers to questions we should never return answers where only some + * RRs are still valid, hence find the lowest here */ + min_ttl = MIN(dns_answer_min_ttl(answer), rr->ttl); + + /* New TTL is 0? Delete this specific entry... */ + if (min_ttl <= 0) { + r = dns_cache_remove_by_rr(c, rr); + log_debug("%s: %s", + r > 0 ? "Removed zero TTL entry from cache" : "Not caching zero TTL cache entry", + dns_resource_key_to_string(rr->key, key_str, sizeof key_str)); + return 0; + } + + /* Entry exists already? Update TTL, timestamp and owner */ + existing = dns_cache_get(c, rr); + if (existing) { + dns_cache_item_update_positive( + c, + existing, + rr, + answer, + full_packet, + min_ttl, + query_flags, + shared_owner, + dnssec_result, + timestamp, + ifindex, + owner_family, + owner_address, + stale_retention_usec); + return 0; + } + + /* Do not cache mDNS goodbye packet. 
*/ + if (protocol == DNS_PROTOCOL_MDNS && rr->ttl <= 1) + return 0; + + /* Otherwise, add the new RR */ + r = dns_cache_init(c); + if (r < 0) + return r; + + dns_cache_make_space(c, 1); + + _cleanup_(dns_cache_item_freep) DnsCacheItem *i = new(DnsCacheItem, 1); + if (!i) + return -ENOMEM; + + /* If StaleRetentionSec is greater than zero, the 'until' property is set to a duration + * of StaleRetentionSec from the time of TTL expiry. + * If StaleRetentionSec is zero, both the 'until' and 'until_valid' are set to the TTL duration, + * leading to the eviction of the record once the TTL expires.*/ + usec_t until_valid = calculate_until_valid(rr, min_ttl, UINT32_MAX, timestamp, false); + *i = (DnsCacheItem) { + .type = DNS_CACHE_POSITIVE, + .key = dns_resource_key_ref(rr->key), + .rr = dns_resource_record_ref(rr), + .answer = dns_answer_ref(answer), + .full_packet = dns_packet_ref(full_packet), + .until = calculate_until(until_valid, stale_retention_usec), + .until_valid = until_valid, + .query_flags = query_flags & CACHEABLE_QUERY_FLAGS, + .shared_owner = shared_owner, + .dnssec_result = dnssec_result, + .ifindex = ifindex, + .owner_family = owner_family, + .owner_address = *owner_address, + .prioq_idx = PRIOQ_IDX_NULL, + }; + + r = dns_cache_link_item(c, i); + if (r < 0) + return r; + + log_debug("Added positive %s %s%s cache entry for %s "USEC_FMT"s on %s/%s/%s", + FLAGS_SET(i->query_flags, SD_RESOLVED_AUTHENTICATED) ? "authenticated" : "unauthenticated", + FLAGS_SET(i->query_flags, SD_RESOLVED_CONFIDENTIAL) ? "confidential" : "non-confidential", + i->shared_owner ? " shared" : "", + dns_resource_key_to_string(i->key, key_str, sizeof key_str), + (i->until - timestamp) / USEC_PER_SEC, + i->ifindex == 0 ? 
"*" : FORMAT_IFNAME(i->ifindex), + af_to_name_short(i->owner_family), + IN_ADDR_TO_STRING(i->owner_family, &i->owner_address)); + + TAKE_PTR(i); + return 0; +} + +static int dns_cache_put_negative( + DnsCache *c, + DnsResourceKey *key, + int rcode, + DnsAnswer *answer, + DnsPacket *full_packet, + uint64_t query_flags, + DnssecResult dnssec_result, + uint32_t nsec_ttl, + usec_t timestamp, + DnsResourceRecord *soa, + int owner_family, + const union in_addr_union *owner_address) { + + _cleanup_(dns_cache_item_freep) DnsCacheItem *i = NULL; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + int r; + + assert(c); + assert(key); + assert(owner_address); + + /* Never cache pseudo RR keys. DNS_TYPE_ANY is particularly + * important to filter out as we use this as a pseudo-type for + * NXDOMAIN entries */ + if (dns_class_is_pseudo(key->class)) + return 0; + if (dns_type_is_pseudo(key->type)) + return 0; + + if (IN_SET(rcode, DNS_RCODE_SUCCESS, DNS_RCODE_NXDOMAIN)) { + if (!soa) + return 0; + + /* For negative replies, check if we have a TTL of a SOA */ + if (nsec_ttl <= 0 || soa->soa.minimum <= 0 || soa->ttl <= 0) { + log_debug("Not caching negative entry with zero SOA/NSEC/NSEC3 TTL: %s", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + return 0; + } + } else if (rcode != DNS_RCODE_SERVFAIL) + return 0; + + r = dns_cache_init(c); + if (r < 0) + return r; + + dns_cache_make_space(c, 1); + + i = new(DnsCacheItem, 1); + if (!i) + return -ENOMEM; + + *i = (DnsCacheItem) { + .type = + rcode == DNS_RCODE_SUCCESS ? DNS_CACHE_NODATA : + rcode == DNS_RCODE_NXDOMAIN ? DNS_CACHE_NXDOMAIN : DNS_CACHE_RCODE, + .query_flags = query_flags & CACHEABLE_QUERY_FLAGS, + .dnssec_result = dnssec_result, + .owner_family = owner_family, + .owner_address = *owner_address, + .prioq_idx = PRIOQ_IDX_NULL, + .rcode = rcode, + .answer = dns_answer_ref(answer), + .full_packet = dns_packet_ref(full_packet), + }; + + /* Determine how long to cache this entry. 
In case we have some RRs in the answer use the lowest TTL + * of any of them. Typically that's the SOA's TTL, which is OK, but could possibly be lower because + * of some other RR. Let's better take the lowest option here than a needlessly high one */ + i->until = i->until_valid = + i->type == DNS_CACHE_RCODE ? timestamp + CACHE_TTL_STRANGE_RCODE_USEC : + calculate_until_valid(soa, dns_answer_min_ttl(answer), nsec_ttl, timestamp, true); + + if (i->type == DNS_CACHE_NXDOMAIN) { + /* NXDOMAIN entries should apply equally to all types, so we use ANY as + * a pseudo type for this purpose here. */ + i->key = dns_resource_key_new(key->class, DNS_TYPE_ANY, dns_resource_key_name(key)); + if (!i->key) + return -ENOMEM; + + /* Make sure to remove any previous entry for this + * specific ANY key. (For non-ANY keys the cache data + * is already cleared by the caller.) Note that we + * don't bother removing positive or NODATA cache + * items in this case, because it would either be slow + * or require explicit indexing by name */ + dns_cache_remove_by_key(c, key); + } else + i->key = dns_resource_key_ref(key); + + r = dns_cache_link_item(c, i); + if (r < 0) + return r; + + log_debug("Added %s cache entry for %s "USEC_FMT"s", + dns_cache_item_type_to_string(i), + dns_resource_key_to_string(i->key, key_str, sizeof key_str), + (i->until - timestamp) / USEC_PER_SEC); + + i = NULL; + return 0; +} + +static void dns_cache_remove_previous( + DnsCache *c, + DnsResourceKey *key, + DnsAnswer *answer) { + + DnsResourceRecord *rr; + DnsAnswerFlags flags; + + assert(c); + + /* First, if we were passed a key (i.e. on LLMNR/DNS, but + * not on mDNS), delete all matching old RRs, so that we only + * keep complete by_key in place. */ + if (key) + dns_cache_remove_by_key(c, key); + + /* Second, flush all entries matching the answer, unless this + * is an RR that is explicitly marked to be "shared" between + * peers (i.e. mDNS RRs without the flush-cache bit set). 
*/ + DNS_ANSWER_FOREACH_FLAGS(rr, flags, answer) { + if ((flags & DNS_ANSWER_CACHEABLE) == 0) + continue; + + if (flags & DNS_ANSWER_SHARED_OWNER) + continue; + + dns_cache_remove_by_key(c, rr->key); + } +} + +static bool rr_eligible(DnsResourceRecord *rr) { + assert(rr); + + /* When we see an NSEC/NSEC3 RR, we'll only cache it if it is from the lower zone, not the upper zone, since + * that's where the interesting bits are (with exception of DS RRs). Of course, this way we cannot derive DS + * existence from any cached NSEC/NSEC3, but that should be fine. */ + + switch (rr->key->type) { + + case DNS_TYPE_NSEC: + return !bitmap_isset(rr->nsec.types, DNS_TYPE_NS) || + bitmap_isset(rr->nsec.types, DNS_TYPE_SOA); + + case DNS_TYPE_NSEC3: + return !bitmap_isset(rr->nsec3.types, DNS_TYPE_NS) || + bitmap_isset(rr->nsec3.types, DNS_TYPE_SOA); + + default: + return true; + } +} + +int dns_cache_put( + DnsCache *c, + DnsCacheMode cache_mode, + DnsProtocol protocol, + DnsResourceKey *key, + int rcode, + DnsAnswer *answer, + DnsPacket *full_packet, + uint64_t query_flags, + DnssecResult dnssec_result, + uint32_t nsec_ttl, + int owner_family, + const union in_addr_union *owner_address, + usec_t stale_retention_usec) { + + DnsResourceRecord *soa = NULL; + bool weird_rcode = false; + DnsAnswerItem *item; + DnsAnswerFlags flags; + unsigned cache_keys; + usec_t timestamp; + int r; + + assert(c); + assert(owner_address); + + dns_cache_remove_previous(c, key, answer); + + /* We only care for positive replies and NXDOMAINs, on all other replies we will simply flush the respective + * entries, and that's it. (Well, with one further exception: since some DNS zones (akamai!) return SERVFAIL + * consistently for some lookups, and forwarders tend to propagate that we'll cache that too, but only for a + * short time.) 
*/ + + if (IN_SET(rcode, DNS_RCODE_SUCCESS, DNS_RCODE_NXDOMAIN)) { + if (dns_answer_isempty(answer)) { + if (key) { + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + log_debug("Not caching negative entry without a SOA record: %s", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + } + + return 0; + } + + } else { + /* Only cache SERVFAIL as "weird" rcode for now. We can add more later, should that turn out to be + * beneficial. */ + if (rcode != DNS_RCODE_SERVFAIL) + return 0; + + weird_rcode = true; + } + + cache_keys = dns_answer_size(answer); + if (key) + cache_keys++; + + /* Make some space for our new entries */ + dns_cache_make_space(c, cache_keys); + + timestamp = now(CLOCK_BOOTTIME); + + /* Second, add in positive entries for all contained RRs */ + DNS_ANSWER_FOREACH_ITEM(item, answer) { + int primary = false; + + if (!FLAGS_SET(item->flags, DNS_ANSWER_CACHEABLE) || + !rr_eligible(item->rr)) + continue; + + if (key) { + /* We store the auxiliary RRs and packet data in the cache only if they were in + * direct response to the original query. If we cache an RR we also received, and + * that is just auxiliary information we can't use the data, hence don't. */ + + primary = dns_resource_key_match_rr(key, item->rr, NULL); + if (primary < 0) + return primary; + if (primary == 0) { + primary = dns_resource_key_match_cname_or_dname(key, item->rr->key, NULL); + if (primary < 0) + return primary; + } + } + + if (!primary) { + DnsCacheItem *first; + + /* Do not replace existing cache items for primary lookups with non-primary + * data. After all the primary lookup data is a lot more useful. */ + first = hashmap_get(c->by_key, item->rr->key); + if (first && DNS_CACHE_ITEM_IS_PRIMARY(first)) + return 0; + } + + r = dns_cache_put_positive( + c, + protocol, + item->rr, + primary ? answer : NULL, + primary ? full_packet : NULL, + ((item->flags & DNS_ANSWER_AUTHENTICATED) ? 
SD_RESOLVED_AUTHENTICATED : 0) | + (query_flags & SD_RESOLVED_CONFIDENTIAL), + item->flags & DNS_ANSWER_SHARED_OWNER, + dnssec_result, + timestamp, + item->ifindex, + owner_family, + owner_address, + stale_retention_usec); + if (r < 0) + goto fail; + } + + if (!key) /* mDNS doesn't know negative caching, really */ + return 0; + + /* Third, add in negative entries if the key has no RR */ + r = dns_answer_match_key(answer, key, NULL); + if (r < 0) + goto fail; + if (r > 0) + return 0; + + /* But not if it has a matching CNAME/DNAME (the negative caching will be done on the canonical name, + * not on the alias) */ + r = dns_answer_find_cname_or_dname(answer, key, NULL, NULL); + if (r < 0) + goto fail; + if (r > 0) + return 0; + + /* See https://tools.ietf.org/html/rfc2308, which say that a matching SOA record in the packet is used to + * enable negative caching. We apply one exception though: if we are about to cache a weird rcode we do so + * regardless of a SOA. */ + r = dns_answer_find_soa(answer, key, &soa, &flags); + if (r < 0) + goto fail; + if (r == 0 && !weird_rcode) + return 0; + if (r > 0) { + /* Refuse using the SOA data if it is unsigned, but the key is signed */ + if (FLAGS_SET(query_flags, SD_RESOLVED_AUTHENTICATED) && + (flags & DNS_ANSWER_AUTHENTICATED) == 0) + return 0; + } + + if (cache_mode == DNS_CACHE_MODE_NO_NEGATIVE) { + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + log_debug("Not caching negative entry for: %s, cache mode set to no-negative", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + return 0; + } + + r = dns_cache_put_negative( + c, + key, + rcode, + answer, + full_packet, + query_flags, + dnssec_result, + nsec_ttl, + timestamp, + soa, + owner_family, + owner_address); + if (r < 0) + goto fail; + + return 0; + +fail: + /* Adding all RRs failed. 
Let's clean up what we already + * added, just in case */ + + if (key) + dns_cache_remove_by_key(c, key); + + DNS_ANSWER_FOREACH_ITEM(item, answer) { + if ((item->flags & DNS_ANSWER_CACHEABLE) == 0) + continue; + + dns_cache_remove_by_key(c, item->rr->key); + } + + return r; +} + +static DnsCacheItem *dns_cache_get_by_key_follow_cname_dname_nsec(DnsCache *c, DnsResourceKey *k) { + DnsCacheItem *i; + const char *n; + int r; + + assert(c); + assert(k); + + /* If we hit some OOM error, or suchlike, we don't care too + * much, after all this is just a cache */ + + i = hashmap_get(c->by_key, k); + if (i) + return i; + + n = dns_resource_key_name(k); + + /* Check if we have an NXDOMAIN cache item for the name, notice that we use + * the pseudo-type ANY for NXDOMAIN cache items. */ + i = hashmap_get(c->by_key, &DNS_RESOURCE_KEY_CONST(k->class, DNS_TYPE_ANY, n)); + if (i && i->type == DNS_CACHE_NXDOMAIN) + return i; + + if (dns_type_may_redirect(k->type)) { + /* Check if we have a CNAME record instead */ + i = hashmap_get(c->by_key, &DNS_RESOURCE_KEY_CONST(k->class, DNS_TYPE_CNAME, n)); + if (i && i->type != DNS_CACHE_NODATA) + return i; + + /* OK, let's look for cached DNAME records. */ + for (;;) { + if (isempty(n)) + return NULL; + + i = hashmap_get(c->by_key, &DNS_RESOURCE_KEY_CONST(k->class, DNS_TYPE_DNAME, n)); + if (i && i->type != DNS_CACHE_NODATA) + return i; + + /* Jump one label ahead */ + r = dns_name_parent(&n); + if (r <= 0) + return NULL; + } + } + + if (k->type != DNS_TYPE_NSEC) { + /* Check if we have an NSEC record instead for the name. 
*/ + i = hashmap_get(c->by_key, &DNS_RESOURCE_KEY_CONST(k->class, DNS_TYPE_NSEC, n)); + if (i) + return i; + } + + return NULL; +} + +static int answer_add_clamp_ttl( + DnsAnswer **answer, + DnsResourceRecord *rr, + int ifindex, + DnsAnswerFlags answer_flags, + DnsResourceRecord *rrsig, + uint64_t query_flags, + usec_t until, + usec_t current) { + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *patched = NULL, *patched_rrsig = NULL; + int r; + + assert(answer); + assert(rr); + + if (FLAGS_SET(query_flags, SD_RESOLVED_CLAMP_TTL)) { + uint32_t left_ttl; + + assert(current > 0); + + /* Let's determine how much time is left for this cache entry. Note that we round down, but + * clamp this to be 1s at minimum, since we usually want records to remain cached better too + * short a time than too long a time, but otoh don't want to return 0 ever, since that has + * special semantics in various contexts — in particular in mDNS */ + + left_ttl = MAX(1U, LESS_BY(until, current) / USEC_PER_SEC); + + patched = dns_resource_record_ref(rr); + + r = dns_resource_record_clamp_ttl(&patched, left_ttl); + if (r < 0) + return r; + + rr = patched; + + if (rrsig) { + patched_rrsig = dns_resource_record_ref(rrsig); + r = dns_resource_record_clamp_ttl(&patched_rrsig, left_ttl); + if (r < 0) + return r; + + rrsig = patched_rrsig; + } + } + + r = dns_answer_add_extend(answer, rr, ifindex, answer_flags, rrsig); + if (r < 0) + return r; + + return 0; +} + +int dns_cache_lookup( + DnsCache *c, + DnsResourceKey *key, + uint64_t query_flags, + int *ret_rcode, + DnsAnswer **ret_answer, + DnsPacket **ret_full_packet, + uint64_t *ret_query_flags, + DnssecResult *ret_dnssec_result) { + + _cleanup_(dns_packet_unrefp) DnsPacket *full_packet = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + unsigned n = 0; + int r; + bool nxdomain = false; + DnsCacheItem *first, *nsec = NULL; + bool have_authenticated = false, have_non_authenticated 
= false, have_confidential = false, have_non_confidential = false; + usec_t current = 0; + int found_rcode = -1; + DnssecResult dnssec_result = -1; + int have_dnssec_result = -1; + + assert(c); + assert(key); + + if (key->type == DNS_TYPE_ANY || key->class == DNS_CLASS_ANY) { + /* If we have ANY lookups we don't use the cache, so that the caller refreshes via the + * network. */ + + log_debug("Ignoring cache for ANY lookup: %s", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + goto miss; + } + + first = dns_cache_get_by_key_follow_cname_dname_nsec(c, key); + if (!first) { + /* If one question cannot be answered we need to refresh */ + + log_debug("Cache miss for %s", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + goto miss; + } + + if ((query_flags & (SD_RESOLVED_CLAMP_TTL | SD_RESOLVED_NO_STALE)) != 0) { + /* 'current' is always passed to answer_add_clamp_ttl(), but is only used conditionally. + * We'll do the same assert there to make sure that it was initialized properly. + * 'current' is also used below when SD_RESOLVED_NO_STALE is set. */ + current = now(CLOCK_BOOTTIME); + assert(current > 0); + } + + LIST_FOREACH(by_key, j, first) { + /* If the caller doesn't allow us to answer questions from cache data learned from + * "side-effect", skip this entry. */ + if (FLAGS_SET(query_flags, SD_RESOLVED_REQUIRE_PRIMARY) && + !DNS_CACHE_ITEM_IS_PRIMARY(j)) { + log_debug("Primary answer was requested for cache lookup for %s, which we don't have.", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + + goto miss; + } + + /* Skip the next part if ttl is expired and requested with no stale flag. 
*/ + if (FLAGS_SET(query_flags, SD_RESOLVED_NO_STALE) && j->until_valid < current) { + log_debug("Requested with no stale and TTL expired for %s", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + + goto miss; + } + + if (j->type == DNS_CACHE_NXDOMAIN) + nxdomain = true; + else if (j->type == DNS_CACHE_RCODE) + found_rcode = j->rcode; + else if (j->rr) { + if (j->rr->key->type == DNS_TYPE_NSEC) + nsec = j; + + n++; + } + + if (FLAGS_SET(j->query_flags, SD_RESOLVED_AUTHENTICATED)) + have_authenticated = true; + else + have_non_authenticated = true; + + if (FLAGS_SET(j->query_flags, SD_RESOLVED_CONFIDENTIAL)) + have_confidential = true; + else + have_non_confidential = true; + + if (j->dnssec_result < 0) { + have_dnssec_result = false; /* an entry without dnssec result? then invalidate things for good */ + dnssec_result = _DNSSEC_RESULT_INVALID; + } else if (have_dnssec_result < 0) { + have_dnssec_result = true; /* So far no result seen, let's pick this one up */ + dnssec_result = j->dnssec_result; + } else if (have_dnssec_result > 0 && j->dnssec_result != dnssec_result) { + have_dnssec_result = false; /* conflicting result seen? then invalidate for good */ + dnssec_result = _DNSSEC_RESULT_INVALID; + } + + /* If the question is being resolved using stale data, the clamp TTL will be set to CACHE_STALE_TTL_MAX_USEC. */ + usec_t until = FLAGS_SET(query_flags, SD_RESOLVED_NO_STALE) ? j->until_valid + : usec_add(current, CACHE_STALE_TTL_MAX_USEC); + + /* Append the answer RRs to our answer. Ideally we have the answer object, which we + * preferably use. But if the cached entry was generated as "side-effect" of a reply, + * i.e. from validated auxiliary records rather than from the main reply, then we use the + * individual RRs only instead. */ + if (j->answer) { + + /* Minor optimization, if the full answer object of this and the previous RR is the + * same, don't bother adding it again. Typically we store a full RRset here, hence + * that should be the case. 
*/ + if (!j->by_key_prev || j->answer != j->by_key_prev->answer) { + DnsAnswerItem *item; + + DNS_ANSWER_FOREACH_ITEM(item, j->answer) { + r = answer_add_clamp_ttl( + &answer, + item->rr, + item->ifindex, + item->flags, + item->rrsig, + query_flags, + until, + current); + if (r < 0) + return r; + } + } + + } else if (j->rr) { + r = answer_add_clamp_ttl( + &answer, + j->rr, + j->ifindex, + FLAGS_SET(j->query_flags, SD_RESOLVED_AUTHENTICATED) ? DNS_ANSWER_AUTHENTICATED : 0, + NULL, + query_flags, + until, + current); + if (r < 0) + return r; + } + + /* We'll return any packet we have for this. Typically all cache entries for the same key + * should come from the same packet anyway, hence it doesn't really matter which packet we + * return here, they should all resolve to the same anyway. */ + if (!full_packet && j->full_packet) + full_packet = dns_packet_ref(j->full_packet); + } + + if (found_rcode >= 0) { + log_debug("RCODE %s cache hit for %s", + FORMAT_DNS_RCODE(found_rcode), + dns_resource_key_to_string(key, key_str, sizeof(key_str))); + + if (ret_rcode) + *ret_rcode = found_rcode; + if (ret_answer) + *ret_answer = TAKE_PTR(answer); + if (ret_full_packet) + *ret_full_packet = TAKE_PTR(full_packet); + if (ret_query_flags) + *ret_query_flags = 0; + if (ret_dnssec_result) + *ret_dnssec_result = dnssec_result; + + c->n_hit++; + return 1; + } + + if (nsec && !IN_SET(key->type, DNS_TYPE_NSEC, DNS_TYPE_DS)) { + /* Note that we won't derive information for DS RRs from an NSEC, because we only cache NSEC + * RRs from the lower-zone of a zone cut, but the DS RRs are on the upper zone. */ + + log_debug("NSEC NODATA cache hit for %s", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + + /* We only found an NSEC record that matches our name. If it says the type doesn't exist + * report NODATA. Otherwise report a cache miss. 
*/ + + if (ret_rcode) + *ret_rcode = DNS_RCODE_SUCCESS; + if (ret_answer) + *ret_answer = TAKE_PTR(answer); + if (ret_full_packet) + *ret_full_packet = TAKE_PTR(full_packet); + if (ret_query_flags) + *ret_query_flags = nsec->query_flags; + if (ret_dnssec_result) + *ret_dnssec_result = nsec->dnssec_result; + + if (!bitmap_isset(nsec->rr->nsec.types, key->type) && + !bitmap_isset(nsec->rr->nsec.types, DNS_TYPE_CNAME) && + !bitmap_isset(nsec->rr->nsec.types, DNS_TYPE_DNAME)) { + c->n_hit++; + return 1; + } + + c->n_miss++; + return 0; + } + + log_debug("%s cache hit for %s", + n > 0 ? "Positive" : + nxdomain ? "NXDOMAIN" : "NODATA", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + + if (n <= 0) { + c->n_hit++; + + if (ret_rcode) + *ret_rcode = nxdomain ? DNS_RCODE_NXDOMAIN : DNS_RCODE_SUCCESS; + if (ret_answer) + *ret_answer = TAKE_PTR(answer); + if (ret_full_packet) + *ret_full_packet = TAKE_PTR(full_packet); + if (ret_query_flags) + *ret_query_flags = + ((have_authenticated && !have_non_authenticated) ? SD_RESOLVED_AUTHENTICATED : 0) | + ((have_confidential && !have_non_confidential) ? SD_RESOLVED_CONFIDENTIAL : 0); + if (ret_dnssec_result) + *ret_dnssec_result = dnssec_result; + + return 1; + } + + c->n_hit++; + + if (ret_rcode) + *ret_rcode = DNS_RCODE_SUCCESS; + if (ret_answer) + *ret_answer = TAKE_PTR(answer); + if (ret_full_packet) + *ret_full_packet = TAKE_PTR(full_packet); + if (ret_query_flags) + *ret_query_flags = + ((have_authenticated && !have_non_authenticated) ? SD_RESOLVED_AUTHENTICATED : 0) | + ((have_confidential && !have_non_confidential) ? 
SD_RESOLVED_CONFIDENTIAL : 0); + if (ret_dnssec_result) + *ret_dnssec_result = dnssec_result; + + return n; + +miss: + if (ret_rcode) + *ret_rcode = DNS_RCODE_SUCCESS; + if (ret_answer) + *ret_answer = NULL; + if (ret_full_packet) + *ret_full_packet = NULL; + if (ret_query_flags) + *ret_query_flags = 0; + if (ret_dnssec_result) + *ret_dnssec_result = _DNSSEC_RESULT_INVALID; + + c->n_miss++; + return 0; +} + +int dns_cache_check_conflicts(DnsCache *cache, DnsResourceRecord *rr, int owner_family, const union in_addr_union *owner_address) { + DnsCacheItem *first; + bool same_owner = true; + + assert(cache); + assert(rr); + + dns_cache_prune(cache); + + /* See if there's a cache entry for the same key. If there + * isn't there's no conflict */ + first = hashmap_get(cache->by_key, rr->key); + if (!first) + return 0; + + /* See if the RR key is owned by the same owner, if so, there + * isn't a conflict either */ + LIST_FOREACH(by_key, i, first) { + if (i->owner_family != owner_family || + !in_addr_equal(owner_family, &i->owner_address, owner_address)) { + same_owner = false; + break; + } + } + if (same_owner) + return 0; + + /* See if there's the exact same RR in the cache. If yes, then + * there's no conflict. */ + if (dns_cache_get(cache, rr)) + return 0; + + /* There's a conflict */ + return 1; +} + +int dns_cache_export_shared_to_packet(DnsCache *cache, DnsPacket *p, usec_t ts, unsigned max_rr) { + unsigned ancount = 0; + DnsCacheItem *i; + int r; + + assert(cache); + assert(p); + assert(p->protocol == DNS_PROTOCOL_MDNS); + + HASHMAP_FOREACH(i, cache->by_key) + LIST_FOREACH(by_key, j, i) { + if (!j->rr) + continue; + + if (!j->shared_owner) + continue; + + /* Ignore cached goodby packet. See on_mdns_packet() and RFC 6762 section 10.1. */ + if (j->rr->ttl <= 1) + continue; + + /* RFC6762 7.1: Don't append records with less than half the TTL remaining + * as known answers. 
*/ + if (usec_sub_unsigned(j->until, ts) < j->rr->ttl * USEC_PER_SEC / 2) + continue; + + if (max_rr > 0 && ancount >= max_rr) { + DNS_PACKET_HEADER(p)->ancount = htobe16(ancount); + ancount = 0; + + r = dns_packet_new_query(&p->more, p->protocol, 0, true); + if (r < 0) + return r; + + p = p->more; + + max_rr = UINT_MAX; + } + + r = dns_packet_append_rr(p, j->rr, 0, NULL, NULL); + if (r == -EMSGSIZE) { + if (max_rr == 0) + /* If max_rr == 0, do not allocate more packets. */ + goto finalize; + + /* If we're unable to stuff all known answers into the given packet, allocate + * a new one, push the RR into that one and link it to the current one. */ + + DNS_PACKET_HEADER(p)->ancount = htobe16(ancount); + ancount = 0; + + r = dns_packet_new_query(&p->more, p->protocol, 0, true); + if (r < 0) + return r; + + /* continue with new packet */ + p = p->more; + r = dns_packet_append_rr(p, j->rr, 0, NULL, NULL); + } + + if (r < 0) + return r; + + ancount++; + } + +finalize: + DNS_PACKET_HEADER(p)->ancount = htobe16(ancount); + + return 0; +} + +void dns_cache_dump(DnsCache *cache, FILE *f) { + DnsCacheItem *i; + + if (!cache) + return; + + if (!f) + f = stdout; + + HASHMAP_FOREACH(i, cache->by_key) + LIST_FOREACH(by_key, j, i) { + + fputc('\t', f); + + if (j->rr) { + const char *t; + t = dns_resource_record_to_string(j->rr); + if (!t) { + log_oom(); + continue; + } + + fputs(t, f); + fputc('\n', f); + } else { + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + fputs(dns_resource_key_to_string(j->key, key_str, sizeof key_str), f); + fputs(" -- ", f); + fputs(dns_cache_item_type_to_string(j), f); + fputc('\n', f); + } + } +} + +int dns_cache_dump_to_json(DnsCache *cache, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *c = NULL; + DnsCacheItem *i; + int r; + + assert(cache); + assert(ret); + + HASHMAP_FOREACH(i, cache->by_key) { + _cleanup_(json_variant_unrefp) JsonVariant *d = NULL, *k = NULL; + + r = dns_resource_key_to_json(i->key, &k); + if (r < 0) + return r; 
+ + if (i->rr) { + _cleanup_(json_variant_unrefp) JsonVariant *l = NULL; + + LIST_FOREACH(by_key, j, i) { + _cleanup_(json_variant_unrefp) JsonVariant *rj = NULL; + + assert(j->rr); + + r = dns_resource_record_to_json(j->rr, &rj); + if (r < 0) + return r; + + r = dns_resource_record_to_wire_format(j->rr, /* canonical= */ false); /* don't use DNSSEC canonical format, since it removes casing, but we want that for DNS_SD compat */ + if (r < 0) + return r; + + r = json_variant_append_arrayb( + &l, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_VARIANT("rr", rj), + JSON_BUILD_PAIR_BASE64("raw", j->rr->wire_format, j->rr->wire_format_size))); + if (r < 0) + return r; + } + + if (!l) { + r = json_variant_new_array(&l, NULL, 0); + if (r < 0) + return r; + } + + r = json_build(&d, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_VARIANT("key", k), + JSON_BUILD_PAIR_VARIANT("rrs", l), + JSON_BUILD_PAIR_UNSIGNED("until", i->until))); + } else if (i->type == DNS_CACHE_NODATA) { + r = json_build(&d, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_VARIANT("key", k), + JSON_BUILD_PAIR_EMPTY_ARRAY("rrs"), + JSON_BUILD_PAIR_UNSIGNED("until", i->until))); + } else + r = json_build(&d, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_VARIANT("key", k), + JSON_BUILD_PAIR_STRING("type", dns_cache_item_type_to_string(i)), + JSON_BUILD_PAIR_UNSIGNED("until", i->until))); + if (r < 0) + return r; + + r = json_variant_append_array(&c, d); + if (r < 0) + return r; + } + + if (!c) + return json_variant_new_array(ret, NULL, 0); + + *ret = TAKE_PTR(c); + return 0; +} + +bool dns_cache_is_empty(DnsCache *cache) { + if (!cache) + return true; + + return hashmap_isempty(cache->by_key); +} + +unsigned dns_cache_size(DnsCache *cache) { + if (!cache) + return 0; + + return hashmap_size(cache->by_key); +} diff --git a/src/resolve/resolved-dns-cache.h b/src/resolve/resolved-dns-cache.h new file mode 100644 index 0000000..d078ae9 --- /dev/null +++ b/src/resolve/resolved-dns-cache.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" +#include "list.h" +#include "prioq.h" +#include "resolve-util.h" +#include "resolved-dns-dnssec.h" +#include "time-util.h" + +typedef struct DnsCache { + Hashmap *by_key; + Prioq *by_expiry; + unsigned n_hit; + unsigned n_miss; +} DnsCache; + +#include "resolved-dns-answer.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-question.h" +#include "resolved-dns-rr.h" + +void dns_cache_flush(DnsCache *c); +void dns_cache_prune(DnsCache *c); + +int dns_cache_put( + DnsCache *c, + DnsCacheMode cache_mode, + DnsProtocol protocol, + DnsResourceKey *key, + int rcode, + DnsAnswer *answer, + DnsPacket *full_packet, + uint64_t query_flags, + DnssecResult dnssec_result, + uint32_t nsec_ttl, + int owner_family, + const union in_addr_union *owner_address, + usec_t stale_retention_usec); + +int dns_cache_lookup( + DnsCache *c, + DnsResourceKey *key, + uint64_t query_flags, + int *ret_rcode, + DnsAnswer **ret_answer, + DnsPacket **ret_full_packet, + uint64_t *ret_query_flags, + DnssecResult *ret_dnssec_result); + +int dns_cache_check_conflicts(DnsCache *cache, DnsResourceRecord *rr, int owner_family, const union in_addr_union *owner_address); + +void dns_cache_dump(DnsCache *cache, FILE *f); +int dns_cache_dump_to_json(DnsCache *cache, JsonVariant **ret); + +bool dns_cache_is_empty(DnsCache *cache); + +unsigned dns_cache_size(DnsCache *cache); + +int dns_cache_export_shared_to_packet(DnsCache *cache, DnsPacket *p, usec_t ts, unsigned max_rr); diff --git a/src/resolve/resolved-dns-dnssec.c b/src/resolve/resolved-dns-dnssec.c new file mode 100644 index 0000000..a192d82 --- /dev/null +++ b/src/resolve/resolved-dns-dnssec.c @@ -0,0 +1,2589 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dns-domain.h" +#include "fd-util.h" +#include "fileio.h" +#include "gcrypt-util.h" +#include "hexdecoct.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "openssl-util.h" 
+#include "resolved-dns-dnssec.h" +#include "resolved-dns-packet.h" +#include "sort-util.h" +#include "string-table.h" + +#if PREFER_OPENSSL && OPENSSL_VERSION_MAJOR >= 3 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wdeprecated-declarations" +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(RSA*, RSA_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_KEY*, EC_KEY_free, NULL); +# pragma GCC diagnostic pop +#endif + +#define VERIFY_RRS_MAX 256 +#define MAX_KEY_SIZE (32*1024) + +/* Permit a maximum clock skew of 1h 10min. This should be enough to deal with DST confusion */ +#define SKEW_MAX (1*USEC_PER_HOUR + 10*USEC_PER_MINUTE) + +/* Maximum number of NSEC3 iterations we'll do. RFC5155 says 2500 shall be the maximum useful value, but + * RFC9276 § 3.2 says that we should reduce the acceptable iteration count */ +#define NSEC3_ITERATIONS_MAX 100 + +/* + * The DNSSEC Chain of trust: + * + * Normal RRs are protected via RRSIG RRs in combination with DNSKEY RRs, all in the same zone + * DNSKEY RRs are either protected like normal RRs, or via a DS from a zone "higher" up the tree + * DS RRs are protected like normal RRs + * + * Example chain: + * Normal RR → RRSIG/DNSKEY+ → DS → RRSIG/DNSKEY+ → DS → ... → DS → RRSIG/DNSKEY+ → DS + */ + +uint16_t dnssec_keytag(DnsResourceRecord *dnskey, bool mask_revoke) { + const uint8_t *p; + uint32_t sum, f; + + /* The algorithm from RFC 4034, Appendix B. */ + + assert(dnskey); + assert(dnskey->key->type == DNS_TYPE_DNSKEY); + + f = (uint32_t) dnskey->dnskey.flags; + + if (mask_revoke) + f &= ~DNSKEY_FLAG_REVOKE; + + sum = f + ((((uint32_t) dnskey->dnskey.protocol) << 8) + (uint32_t) dnskey->dnskey.algorithm); + + p = dnskey->dnskey.key; + + for (size_t i = 0; i < dnskey->dnskey.key_size; i++) + sum += (i & 1) == 0 ? 
(uint32_t) p[i] << 8 : (uint32_t) p[i]; + + sum += (sum >> 16) & UINT32_C(0xFFFF); + + return sum & UINT32_C(0xFFFF); +} + +#if HAVE_OPENSSL_OR_GCRYPT + +static int rr_compare(DnsResourceRecord * const *a, DnsResourceRecord * const *b) { + const DnsResourceRecord *x = *a, *y = *b; + size_t m; + int r; + + /* Let's order the RRs according to RFC 4034, Section 6.3 */ + + assert(x); + assert(x->wire_format); + assert(y); + assert(y->wire_format); + + m = MIN(DNS_RESOURCE_RECORD_RDATA_SIZE(x), DNS_RESOURCE_RECORD_RDATA_SIZE(y)); + + r = memcmp(DNS_RESOURCE_RECORD_RDATA(x), DNS_RESOURCE_RECORD_RDATA(y), m); + if (r != 0) + return r; + + return CMP(DNS_RESOURCE_RECORD_RDATA_SIZE(x), DNS_RESOURCE_RECORD_RDATA_SIZE(y)); +} + +static int dnssec_rsa_verify_raw( + hash_algorithm_t hash_algorithm, + const void *signature, size_t signature_size, + const void *data, size_t data_size, + const void *exponent, size_t exponent_size, + const void *modulus, size_t modulus_size) { + int r; + +#if PREFER_OPENSSL +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wdeprecated-declarations" + _cleanup_(RSA_freep) RSA *rpubkey = NULL; + _cleanup_(EVP_PKEY_freep) EVP_PKEY *epubkey = NULL; + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = NULL; + _cleanup_(BN_freep) BIGNUM *e = NULL, *m = NULL; + + assert(hash_algorithm); + + e = BN_bin2bn(exponent, exponent_size, NULL); + if (!e) + return -EIO; + + m = BN_bin2bn(modulus, modulus_size, NULL); + if (!m) + return -EIO; + + rpubkey = RSA_new(); + if (!rpubkey) + return -ENOMEM; + + if (RSA_set0_key(rpubkey, m, e, NULL) <= 0) + return -EIO; + e = m = NULL; + + assert((size_t) RSA_size(rpubkey) == signature_size); + + epubkey = EVP_PKEY_new(); + if (!epubkey) + return -ENOMEM; + + if (EVP_PKEY_assign_RSA(epubkey, RSAPublicKey_dup(rpubkey)) <= 0) + return -EIO; + + ctx = EVP_PKEY_CTX_new(epubkey, NULL); + if (!ctx) + return -ENOMEM; + + if (EVP_PKEY_verify_init(ctx) <= 0) + return -EIO; + + if (EVP_PKEY_CTX_set_rsa_padding(ctx, 
RSA_PKCS1_PADDING) <= 0) + return -EIO; + + if (EVP_PKEY_CTX_set_signature_md(ctx, hash_algorithm) <= 0) + return -EIO; + + r = EVP_PKEY_verify(ctx, signature, signature_size, data, data_size); + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Signature verification failed: 0x%lx", ERR_get_error()); + +# pragma GCC diagnostic pop +#else + gcry_sexp_t public_key_sexp = NULL, data_sexp = NULL, signature_sexp = NULL; + gcry_mpi_t n = NULL, e = NULL, s = NULL; + gcry_error_t ge; + + assert(hash_algorithm); + + ge = gcry_mpi_scan(&s, GCRYMPI_FMT_USG, signature, signature_size, NULL); + if (ge != 0) { + r = -EIO; + goto finish; + } + + ge = gcry_mpi_scan(&e, GCRYMPI_FMT_USG, exponent, exponent_size, NULL); + if (ge != 0) { + r = -EIO; + goto finish; + } + + ge = gcry_mpi_scan(&n, GCRYMPI_FMT_USG, modulus, modulus_size, NULL); + if (ge != 0) { + r = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&signature_sexp, + NULL, + "(sig-val (rsa (s %m)))", + s); + + if (ge != 0) { + r = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&data_sexp, + NULL, + "(data (flags pkcs1) (hash %s %b))", + hash_algorithm, + (int) data_size, + data); + if (ge != 0) { + r = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&public_key_sexp, + NULL, + "(public-key (rsa (n %m) (e %m)))", + n, + e); + if (ge != 0) { + r = -EIO; + goto finish; + } + + ge = gcry_pk_verify(signature_sexp, data_sexp, public_key_sexp); + if (gpg_err_code(ge) == GPG_ERR_BAD_SIGNATURE) + r = 0; + else if (ge != 0) + r = log_debug_errno(SYNTHETIC_ERRNO(EIO), + "RSA signature check failed: %s", gpg_strerror(ge)); + else + r = 1; + +finish: + if (e) + gcry_mpi_release(e); + if (n) + gcry_mpi_release(n); + if (s) + gcry_mpi_release(s); + + if (public_key_sexp) + gcry_sexp_release(public_key_sexp); + if (signature_sexp) + gcry_sexp_release(signature_sexp); + if (data_sexp) + gcry_sexp_release(data_sexp); +#endif + return r; +} + +static int dnssec_rsa_verify( + hash_algorithm_t hash_algorithm, + const void 
*hash, size_t hash_size, + DnsResourceRecord *rrsig, + DnsResourceRecord *dnskey) { + + size_t exponent_size, modulus_size; + void *exponent, *modulus; + + assert(hash_algorithm); + assert(hash); + assert(hash_size > 0); + assert(rrsig); + assert(dnskey); + + if (*(uint8_t*) dnskey->dnskey.key == 0) { + /* exponent is > 255 bytes long */ + + exponent = (uint8_t*) dnskey->dnskey.key + 3; + exponent_size = + ((size_t) (((uint8_t*) dnskey->dnskey.key)[1]) << 8) | + ((size_t) ((uint8_t*) dnskey->dnskey.key)[2]); + + if (exponent_size < 256) + return -EINVAL; + + if (3 + exponent_size >= dnskey->dnskey.key_size) + return -EINVAL; + + modulus = (uint8_t*) dnskey->dnskey.key + 3 + exponent_size; + modulus_size = dnskey->dnskey.key_size - 3 - exponent_size; + + } else { + /* exponent is <= 255 bytes long */ + + exponent = (uint8_t*) dnskey->dnskey.key + 1; + exponent_size = (size_t) ((uint8_t*) dnskey->dnskey.key)[0]; + + if (exponent_size <= 0) + return -EINVAL; + + if (1 + exponent_size >= dnskey->dnskey.key_size) + return -EINVAL; + + modulus = (uint8_t*) dnskey->dnskey.key + 1 + exponent_size; + modulus_size = dnskey->dnskey.key_size - 1 - exponent_size; + } + + return dnssec_rsa_verify_raw( + hash_algorithm, + rrsig->rrsig.signature, rrsig->rrsig.signature_size, + hash, hash_size, + exponent, exponent_size, + modulus, modulus_size); +} + +static int dnssec_ecdsa_verify_raw( + hash_algorithm_t hash_algorithm, + elliptic_curve_t curve, + const void *signature_r, size_t signature_r_size, + const void *signature_s, size_t signature_s_size, + const void *data, size_t data_size, + const void *key, size_t key_size) { + int k; + +#if PREFER_OPENSSL +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wdeprecated-declarations" + _cleanup_(EC_GROUP_freep) EC_GROUP *ec_group = NULL; + _cleanup_(EC_POINT_freep) EC_POINT *p = NULL; + _cleanup_(EC_KEY_freep) EC_KEY *eckey = NULL; + _cleanup_(BN_CTX_freep) BN_CTX *bctx = NULL; + _cleanup_(BN_freep) BIGNUM *r = NULL, *s = 
NULL; + _cleanup_(ECDSA_SIG_freep) ECDSA_SIG *sig = NULL; + + assert(hash_algorithm); + + ec_group = EC_GROUP_new_by_curve_name(curve); + if (!ec_group) + return -ENOMEM; + + p = EC_POINT_new(ec_group); + if (!p) + return -ENOMEM; + + bctx = BN_CTX_new(); + if (!bctx) + return -ENOMEM; + + if (EC_POINT_oct2point(ec_group, p, key, key_size, bctx) <= 0) + return -EIO; + + eckey = EC_KEY_new(); + if (!eckey) + return -ENOMEM; + + if (EC_KEY_set_group(eckey, ec_group) <= 0) + return -EIO; + + if (EC_KEY_set_public_key(eckey, p) <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "EC_POINT_bn2point failed: 0x%lx", ERR_get_error()); + + assert(EC_KEY_check_key(eckey) == 1); + + r = BN_bin2bn(signature_r, signature_r_size, NULL); + if (!r) + return -EIO; + + s = BN_bin2bn(signature_s, signature_s_size, NULL); + if (!s) + return -EIO; + + /* TODO: We should eventually use the EVP API once it supports ECDSA signature verification */ + + sig = ECDSA_SIG_new(); + if (!sig) + return -ENOMEM; + + if (ECDSA_SIG_set0(sig, r, s) <= 0) + return -EIO; + r = s = NULL; + + k = ECDSA_do_verify(data, data_size, sig, eckey); + if (k < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Signature verification failed: 0x%lx", ERR_get_error()); + +# pragma GCC diagnostic pop +#else + gcry_sexp_t public_key_sexp = NULL, data_sexp = NULL, signature_sexp = NULL; + gcry_mpi_t q = NULL, r = NULL, s = NULL; + gcry_error_t ge; + + assert(hash_algorithm); + + ge = gcry_mpi_scan(&r, GCRYMPI_FMT_USG, signature_r, signature_r_size, NULL); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_mpi_scan(&s, GCRYMPI_FMT_USG, signature_s, signature_s_size, NULL); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_mpi_scan(&q, GCRYMPI_FMT_USG, key, key_size, NULL); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&signature_sexp, + NULL, + "(sig-val (ecdsa (r %m) (s %m)))", + r, + s); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = 
gcry_sexp_build(&data_sexp, + NULL, + "(data (flags rfc6979) (hash %s %b))", + hash_algorithm, + (int) data_size, + data); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&public_key_sexp, + NULL, + "(public-key (ecc (curve %s) (q %m)))", + curve, + q); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_pk_verify(signature_sexp, data_sexp, public_key_sexp); + if (gpg_err_code(ge) == GPG_ERR_BAD_SIGNATURE) + k = 0; + else if (ge != 0) { + log_debug("ECDSA signature check failed: %s", gpg_strerror(ge)); + k = -EIO; + } else + k = 1; +finish: + if (r) + gcry_mpi_release(r); + if (s) + gcry_mpi_release(s); + if (q) + gcry_mpi_release(q); + + if (public_key_sexp) + gcry_sexp_release(public_key_sexp); + if (signature_sexp) + gcry_sexp_release(signature_sexp); + if (data_sexp) + gcry_sexp_release(data_sexp); +#endif + return k; +} + +static int dnssec_ecdsa_verify( + hash_algorithm_t hash_algorithm, + int algorithm, + const void *hash, size_t hash_size, + DnsResourceRecord *rrsig, + DnsResourceRecord *dnskey) { + + elliptic_curve_t curve; + size_t key_size; + uint8_t *q; + + assert(hash); + assert(hash_size); + assert(rrsig); + assert(dnskey); + + if (algorithm == DNSSEC_ALGORITHM_ECDSAP256SHA256) { + curve = OPENSSL_OR_GCRYPT(NID_X9_62_prime256v1, "NIST P-256"); /* NIST P-256 */ + key_size = 32; + } else if (algorithm == DNSSEC_ALGORITHM_ECDSAP384SHA384) { + curve = OPENSSL_OR_GCRYPT(NID_secp384r1, "NIST P-384"); /* NIST P-384 */ + key_size = 48; + } else + return -EOPNOTSUPP; + + if (dnskey->dnskey.key_size != key_size * 2) + return -EINVAL; + + if (rrsig->rrsig.signature_size != key_size * 2) + return -EINVAL; + + q = newa(uint8_t, key_size*2 + 1); + q[0] = 0x04; /* Prepend 0x04 to indicate an uncompressed key */ + memcpy(q+1, dnskey->dnskey.key, key_size*2); + + return dnssec_ecdsa_verify_raw( + hash_algorithm, + curve, + rrsig->rrsig.signature, key_size, + (uint8_t*) rrsig->rrsig.signature + key_size, key_size, + hash, hash_size, + 
q, key_size*2+1); +} + +static int dnssec_eddsa_verify_raw( + elliptic_curve_t curve, + const uint8_t *signature, size_t signature_size, + const uint8_t *data, size_t data_size, + const uint8_t *key, size_t key_size) { + +#if PREFER_OPENSSL + _cleanup_(EVP_PKEY_freep) EVP_PKEY *evkey = NULL; + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *pctx = NULL; + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = NULL; + int r; + + assert(curve == NID_ED25519); + assert(signature_size == key_size * 2); + + uint8_t *q = newa(uint8_t, signature_size + 1); + q[0] = 0x04; /* Prepend 0x04 to indicate an uncompressed key */ + memcpy(q+1, signature, signature_size); + + evkey = EVP_PKEY_new_raw_public_key(EVP_PKEY_ED25519, NULL, key, key_size); + if (!evkey) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "EVP_PKEY_new_raw_public_key failed: 0x%lx", ERR_get_error()); + + pctx = EVP_PKEY_CTX_new(evkey, NULL); + if (!pctx) + return -ENOMEM; + + ctx = EVP_MD_CTX_new(); + if (!ctx) + return -ENOMEM; + + /* This prevents EVP_DigestVerifyInit from managing pctx and complicating our free logic. */ + EVP_MD_CTX_set_pkey_ctx(ctx, pctx); + + /* One might be tempted to use EVP_PKEY_verify_init, but see Ed25519(7ssl). 
*/ + if (EVP_DigestVerifyInit(ctx, &pctx, NULL, NULL, evkey) <= 0) + return -EIO; + + r = EVP_DigestVerify(ctx, signature, signature_size, data, data_size); + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Signature verification failed: 0x%lx", ERR_get_error()); + + return r; + +#elif GCRYPT_VERSION_NUMBER >= 0x010600 + gcry_sexp_t public_key_sexp = NULL, data_sexp = NULL, signature_sexp = NULL; + gcry_error_t ge; + int k; + + assert(signature_size == key_size * 2); + + ge = gcry_sexp_build(&signature_sexp, + NULL, + "(sig-val (eddsa (r %b) (s %b)))", + (int) key_size, + signature, + (int) key_size, + signature + key_size); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&data_sexp, + NULL, + "(data (flags eddsa) (hash-algo sha512) (value %b))", + (int) data_size, + data); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_sexp_build(&public_key_sexp, + NULL, + "(public-key (ecc (curve %s) (flags eddsa) (q %b)))", + curve, + (int) key_size, + key); + if (ge != 0) { + k = -EIO; + goto finish; + } + + ge = gcry_pk_verify(signature_sexp, data_sexp, public_key_sexp); + if (gpg_err_code(ge) == GPG_ERR_BAD_SIGNATURE) + k = 0; + else if (ge != 0) + k = log_debug_errno(SYNTHETIC_ERRNO(EIO), + "EdDSA signature check failed: %s", gpg_strerror(ge)); + else + k = 1; +finish: + if (public_key_sexp) + gcry_sexp_release(public_key_sexp); + if (signature_sexp) + gcry_sexp_release(signature_sexp); + if (data_sexp) + gcry_sexp_release(data_sexp); + + return k; +#else + return -EOPNOTSUPP; +#endif +} + +static int dnssec_eddsa_verify( + int algorithm, + const void *data, size_t data_size, + DnsResourceRecord *rrsig, + DnsResourceRecord *dnskey) { + elliptic_curve_t curve; + size_t key_size; + + if (algorithm == DNSSEC_ALGORITHM_ED25519) { + curve = OPENSSL_OR_GCRYPT(NID_ED25519, "Ed25519"); + key_size = 32; + } else + return -EOPNOTSUPP; + + if (dnskey->dnskey.key_size != key_size) + return -EINVAL; + + if (rrsig->rrsig.signature_size 
!= key_size * 2) + return -EINVAL; + + return dnssec_eddsa_verify_raw( + curve, + rrsig->rrsig.signature, rrsig->rrsig.signature_size, + data, data_size, + dnskey->dnskey.key, key_size); +} + +static int md_add_uint8(hash_context_t ctx, uint8_t v) { +#if PREFER_OPENSSL + return EVP_DigestUpdate(ctx, &v, sizeof(v)); +#else + gcry_md_write(ctx, &v, sizeof(v)); + return 0; +#endif +} + +static int md_add_uint16(hash_context_t ctx, uint16_t v) { + v = htobe16(v); +#if PREFER_OPENSSL + return EVP_DigestUpdate(ctx, &v, sizeof(v)); +#else + gcry_md_write(ctx, &v, sizeof(v)); + return 0; +#endif +} + +static void fwrite_uint8(FILE *fp, uint8_t v) { + fwrite(&v, sizeof(v), 1, fp); +} + +static void fwrite_uint16(FILE *fp, uint16_t v) { + v = htobe16(v); + fwrite(&v, sizeof(v), 1, fp); +} + +static void fwrite_uint32(FILE *fp, uint32_t v) { + v = htobe32(v); + fwrite(&v, sizeof(v), 1, fp); +} + +static int dnssec_rrsig_prepare(DnsResourceRecord *rrsig) { + int n_key_labels, n_signer_labels; + const char *name; + int r; + + /* Checks whether the specified RRSIG RR is somewhat valid, and initializes the .n_skip_labels_source + * and .n_skip_labels_signer fields so that we can use them later on. 
*/
+
+        assert(rrsig);
+        assert(rrsig->key->type == DNS_TYPE_RRSIG);
+
+        /* Check if this RRSIG RR is already prepared */
+        if (rrsig->n_skip_labels_source != UINT8_MAX)
+                return 0;
+
+        /* An inception time after the expiration time is treated as malformed */
+        if (rrsig->rrsig.inception > rrsig->rrsig.expiration)
+                return -EINVAL;
+
+        name = dns_resource_key_name(rrsig->key);
+
+        /* The RRSIG's "labels" field may not exceed the number of labels of the owner name */
+        n_key_labels = dns_name_count_labels(name);
+        if (n_key_labels < 0)
+                return n_key_labels;
+        if (rrsig->rrsig.labels > n_key_labels)
+                return -EINVAL;
+
+        /* ... and the signer name may not have more labels than the "labels" field says */
+        n_signer_labels = dns_name_count_labels(rrsig->rrsig.signer);
+        if (n_signer_labels < 0)
+                return n_signer_labels;
+        if (n_signer_labels > rrsig->rrsig.labels)
+                return -EINVAL;
+
+        /* Drop the leading labels of the owner name, so that as many labels remain as the signer has */
+        r = dns_name_skip(name, n_key_labels - n_signer_labels, &name);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        /* Check if the signer is really a suffix of us */
+        r = dns_name_equal(name, rrsig->rrsig.signer);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        assert(n_key_labels < UINT8_MAX); /* UINT8_MAX/-1 means unsigned. */
+        rrsig->n_skip_labels_source = n_key_labels - rrsig->rrsig.labels;
+        rrsig->n_skip_labels_signer = n_key_labels - n_signer_labels;
+
+        return 0;
+}
+
+static int dnssec_rrsig_expired(DnsResourceRecord *rrsig, usec_t realtime) {
+        usec_t expiration, inception, skew;
+
+        /* Returns true if 'realtime' lies outside of the validity interval of the RRSIG (after the
+         * interval has been widened by the clock skew allowance computed below), false otherwise. If
+         * 'realtime' is USEC_INFINITY the current CLOCK_REALTIME time is used instead. */
+
+        assert(rrsig);
+        assert(rrsig->key->type == DNS_TYPE_RRSIG);
+
+        if (realtime == USEC_INFINITY)
+                realtime = now(CLOCK_REALTIME);
+
+        /* The RRSIG fields are multiplied by USEC_PER_SEC to make them comparable to 'realtime' */
+        expiration = rrsig->rrsig.expiration * USEC_PER_SEC;
+        inception = rrsig->rrsig.inception * USEC_PER_SEC;
+
+        /* Consider inverted validity intervals as expired */
+        if (inception > expiration)
+                return true;
+
+        /* Permit a certain amount of clock skew of 10% of the valid
+         * time range. This takes inspiration from unbound's
+         * resolver. */
+        skew = (expiration - inception) / 10;
+        if (skew > SKEW_MAX)
+                skew = SKEW_MAX;
+
+        /* Move the start of the interval back by the skew, but clamp at 0 instead of wrapping around */
+        if (inception < skew)
+                inception = 0;
+        else
+                inception -= skew;
+
+        /* Move the end of the interval forward by the skew, saturating if the addition would overflow */
+        if (expiration + skew < expiration)
+                expiration = USEC_INFINITY;
+        else
+                expiration += skew;
+
+        return realtime < inception || realtime > expiration;
+}
+
+static hash_md_t algorithm_to_implementation_id(uint8_t algorithm) {
+
+        /* Translates a DNSSEC signature algorithm into an openssl/gcrypt digest identifier.
+         *
+         * Note that we implement all algorithms listed as "Must implement" and "Recommended to Implement" in
+         * RFC6944. We don't implement any algorithms that are listed as "Optional" or "Must Not Implement".
+         * Specifically, we do not implement RSAMD5, DSASHA1, DH, DSA-NSEC3-SHA1, and GOST-ECC.
+         *
+         * Algorithms not listed below yield NULL in the OpenSSL build and -EOPNOTSUPP in the gcrypt build. */
+
+        switch (algorithm) {
+
+        case DNSSEC_ALGORITHM_RSASHA1:
+        case DNSSEC_ALGORITHM_RSASHA1_NSEC3_SHA1:
+                return OPENSSL_OR_GCRYPT(EVP_sha1(), GCRY_MD_SHA1);
+
+        case DNSSEC_ALGORITHM_RSASHA256:
+        case DNSSEC_ALGORITHM_ECDSAP256SHA256:
+                return OPENSSL_OR_GCRYPT(EVP_sha256(), GCRY_MD_SHA256);
+
+        case DNSSEC_ALGORITHM_ECDSAP384SHA384:
+                return OPENSSL_OR_GCRYPT(EVP_sha384(), GCRY_MD_SHA384);
+
+        case DNSSEC_ALGORITHM_RSASHA512:
+                return OPENSSL_OR_GCRYPT(EVP_sha512(), GCRY_MD_SHA512);
+
+        default:
+                return OPENSSL_OR_GCRYPT(NULL, -EOPNOTSUPP);
+        }
+}
+
+static void dnssec_fix_rrset_ttl(
+                DnsResourceRecord *list[],
+                unsigned n,
+                DnsResourceRecord *rrsig) {
+
+        /* Updates TTL, expiry and the label skip counters of the 'n' RRs in 'list' from the RRSIG
+         * that covers them. */
+
+        assert(list);
+        assert(n > 0);
+        assert(rrsig);
+
+        for (unsigned k = 0; k < n; k++) {
+                DnsResourceRecord *rr = list[k];
+
+                /* Pick the TTL as the minimum of the RR's TTL, the
+                 * RR's original TTL according to the RRSIG and the
+                 * RRSIG's own TTL, see RFC 4035, Section 5.3.3 */
+                rr->ttl = MIN3(rr->ttl, rrsig->rrsig.original_ttl, rrsig->ttl);
+                rr->expiry = rrsig->rrsig.expiration * USEC_PER_SEC;
+
+                /* Copy over information about the signer and wildcard source of synthesis */
+                rr->n_skip_labels_source = rrsig->n_skip_labels_source;
+
rr->n_skip_labels_signer = rrsig->n_skip_labels_signer;
+        }
+
+        /* The RRSIG itself gets the same expiry as the RRs it covers */
+        rrsig->expiry = rrsig->rrsig.expiration * USEC_PER_SEC;
+}
+
+static int dnssec_rrset_serialize_sig(
+                DnsResourceRecord *rrsig,
+                const char *source,
+                DnsResourceRecord **list,
+                size_t list_len,
+                bool wildcard,
+                char **ret_sig_data,
+                size_t *ret_sig_size) {
+
+        _cleanup_(memstream_done) MemStream m = {};
+        uint8_t wire_format_name[DNS_WIRE_FORMAT_HOSTNAME_MAX];
+        DnsResourceRecord *rr;
+        FILE *f;
+        int r;
+
+        /* Serializes the data the signature is calculated over into a memory stream: first the RRSIG
+         * fields written below (everything up to and including the signer name), then for each RR in
+         * 'list' the owner name 'source' (prefixed by a "*" label if 'wildcard' is set), type, class,
+         * the original TTL from the RRSIG, and the RDATA with its length. The resulting buffer and its
+         * size are handed out in *ret_sig_data and *ret_sig_size by memstream_finalize(). */
+
+        assert(rrsig);
+        assert(source);
+        assert(list || list_len == 0);
+        assert(ret_sig_data);
+        assert(ret_sig_size);
+
+        f = memstream_init(&m);
+        if (!f)
+                return -ENOMEM;
+
+        fwrite_uint16(f, rrsig->rrsig.type_covered);
+        fwrite_uint8(f, rrsig->rrsig.algorithm);
+        fwrite_uint8(f, rrsig->rrsig.labels);
+        fwrite_uint32(f, rrsig->rrsig.original_ttl);
+        fwrite_uint32(f, rrsig->rrsig.expiration);
+        fwrite_uint32(f, rrsig->rrsig.inception);
+        fwrite_uint16(f, rrsig->rrsig.key_tag);
+
+        /* On success 'r' is used as the number of bytes to write from wire_format_name */
+        r = dns_name_to_wire_format(rrsig->rrsig.signer, wire_format_name, sizeof(wire_format_name), true);
+        if (r < 0)
+                return r;
+        fwrite(wire_format_name, 1, r, f);
+
+        /* Convert the source of synthesis into wire format */
+        r = dns_name_to_wire_format(source, wire_format_name, sizeof(wire_format_name), true);
+        if (r < 0)
+                return r;
+
+        /* Note that 'r' keeps the length of the wire format source name for the whole loop below */
+        for (size_t k = 0; k < list_len; k++) {
+                size_t l;
+
+                rr = list[k];
+
+                /* Hash the source of synthesis. If this is a wildcard, then prefix it with the *. label */
+                if (wildcard)
+                        fwrite((uint8_t[]) { 1, '*'}, sizeof(uint8_t), 2, f);
+                fwrite(wire_format_name, 1, r, f);
+
+                fwrite_uint16(f, rr->key->type);
+                fwrite_uint16(f, rr->key->class);
+                fwrite_uint32(f, rrsig->rrsig.original_ttl);
+
+                /* The RDATA length is written as a 16 bit field, hence must fit into one */
+                l = DNS_RESOURCE_RECORD_RDATA_SIZE(rr);
+                assert(l <= 0xFFFF);
+
+                fwrite_uint16(f, (uint16_t) l);
+                fwrite(DNS_RESOURCE_RECORD_RDATA(rr), 1, l, f);
+        }
+
+        return memstream_finalize(&m, ret_sig_data, ret_sig_size);
+}
+
+static int dnssec_rrset_verify_sig(
+                DnsResourceRecord *rrsig,
+                DnsResourceRecord *dnskey,
+                const char *sig_data,
+                size_t sig_size) {
+
+        /* Verifies the signature of 'rrsig' over the serialized data in 'sig_data' with the key in
+         * 'dnskey'. Ed25519 is passed to dnssec_eddsa_verify() unhashed (where the build supports it),
+         * Ed448 yields -EOPNOTSUPP. For all other algorithms the data is hashed first and the digest is
+         * then handed to the RSA or ECDSA verification function. */
+
+        assert(rrsig);
+        assert(dnskey);
+        assert(sig_data);
+        assert(sig_size > 0);
+
+        hash_md_t md_algorithm;
+
+#if PREFER_OPENSSL
+        uint8_t hash[EVP_MAX_MD_SIZE];
+        unsigned hash_size;
+#else
+        _cleanup_(gcry_md_closep) gcry_md_hd_t md = NULL;
+        void *hash;
+        size_t hash_size;
+
+        initialize_libgcrypt(false);
+#endif
+
+        switch (rrsig->rrsig.algorithm) {
+        case DNSSEC_ALGORITHM_ED25519:
+#if PREFER_OPENSSL || GCRYPT_VERSION_NUMBER >= 0x010600
+                return dnssec_eddsa_verify(
+                                rrsig->rrsig.algorithm,
+                                sig_data, sig_size,
+                                rrsig,
+                                dnskey);
+#endif
+                /* If the #if above is false this case falls through to the -EOPNOTSUPP below */
+        case DNSSEC_ALGORITHM_ED448:
+                return -EOPNOTSUPP;
+        default:
+                /* OK, the RRs are now in canonical order.
Let's calculate the digest */
+                md_algorithm = algorithm_to_implementation_id(rrsig->rrsig.algorithm);
+#if PREFER_OPENSSL
+                /* NULL means the algorithm has no digest mapping, see algorithm_to_implementation_id() */
+                if (!md_algorithm)
+                        return -EOPNOTSUPP;
+
+                _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+                if (!ctx)
+                        return -ENOMEM;
+
+                if (EVP_DigestInit_ex(ctx, md_algorithm, NULL) <= 0)
+                        return -EIO;
+
+                if (EVP_DigestUpdate(ctx, sig_data, sig_size) <= 0)
+                        return -EIO;
+
+                if (EVP_DigestFinal_ex(ctx, hash, &hash_size) <= 0)
+                        return -EIO;
+
+                assert(hash_size > 0);
+
+#else
+                /* A negative value is the error code from algorithm_to_implementation_id() */
+                if (md_algorithm < 0)
+                        return md_algorithm;
+
+                gcry_error_t err = gcry_md_open(&md, md_algorithm, 0);
+                if (gcry_err_code(err) != GPG_ERR_NO_ERROR || !md)
+                        return -EIO;
+
+                hash_size = gcry_md_get_algo_dlen(md_algorithm);
+                assert(hash_size > 0);
+
+                gcry_md_write(md, sig_data, sig_size);
+
+                hash = gcry_md_read(md, 0);
+                if (!hash)
+                        return -EIO;
+#endif
+        }
+
+        /* Now hand the digest to the verification function that matches the key type */
+        switch (rrsig->rrsig.algorithm) {
+
+        case DNSSEC_ALGORITHM_RSASHA1:
+        case DNSSEC_ALGORITHM_RSASHA1_NSEC3_SHA1:
+        case DNSSEC_ALGORITHM_RSASHA256:
+        case DNSSEC_ALGORITHM_RSASHA512:
+                return dnssec_rsa_verify(
+                                OPENSSL_OR_GCRYPT(md_algorithm, gcry_md_algo_name(md_algorithm)),
+                                hash, hash_size,
+                                rrsig,
+                                dnskey);
+
+        case DNSSEC_ALGORITHM_ECDSAP256SHA256:
+        case DNSSEC_ALGORITHM_ECDSAP384SHA384:
+                return dnssec_ecdsa_verify(
+                                OPENSSL_OR_GCRYPT(md_algorithm, gcry_md_algo_name(md_algorithm)),
+                                rrsig->rrsig.algorithm,
+                                hash, hash_size,
+                                rrsig,
+                                dnskey);
+
+        default:
+                assert_not_reached();
+        }
+}
+
+int dnssec_verify_rrset(
+                DnsAnswer *a,
+                const DnsResourceKey *key,
+                DnsResourceRecord *rrsig,
+                DnsResourceRecord *dnskey,
+                usec_t realtime,
+                DnssecResult *result) {
+
+        DnsResourceRecord **list, *rr;
+        const char *source, *name;
+        _cleanup_free_ char *sig_data = NULL;
+        size_t sig_size = 0; /* avoid false maybe-uninitialized warning */
+        size_t n = 0;
+        bool wildcard;
+        int r;
+
+        assert(key);
+        assert(rrsig);
+        assert(dnskey);
+        assert(result);
+        assert(rrsig->key->type == DNS_TYPE_RRSIG);
+        assert(dnskey->key->type == DNS_TYPE_DNSKEY);
+
+        /* Verifies that the RRSet matches the specified "key" in "a",
+         * using the signature "rrsig" and the key "dnskey". It's
+         * assumed that RRSIG and DNSKEY match.
+         *
+         * The verdict is stored in *result, in which case 0 is returned. A negative errno-style value is
+         * returned on failure. Note that a malformed RRSIG sets *result to DNSSEC_INVALID and still
+         * returns -EINVAL. */
+
+        r = dnssec_rrsig_prepare(rrsig);
+        if (r == -EINVAL) {
+                *result = DNSSEC_INVALID;
+                return r;
+        }
+        if (r < 0)
+                return r;
+
+        r = dnssec_rrsig_expired(rrsig, realtime);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                *result = DNSSEC_SIGNATURE_EXPIRED;
+                return 0;
+        }
+
+        name = dns_resource_key_name(key);
+
+        /* Some keys may only appear signed in the zone apex, and are invalid anywhere else. (SOA, NS...) */
+        if (dns_type_apex_only(rrsig->rrsig.type_covered)) {
+                r = dns_name_equal(rrsig->rrsig.signer, name);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        *result = DNSSEC_INVALID;
+                        return 0;
+                }
+        }
+
+        /* OTOH DS RRs may not appear in the zone apex, but are valid everywhere else. */
+        if (rrsig->rrsig.type_covered == DNS_TYPE_DS) {
+                r = dns_name_equal(rrsig->rrsig.signer, name);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        *result = DNSSEC_INVALID;
+                        return 0;
+                }
+        }
+
+        /* Determine the "Source of Synthesis" and whether this is a wildcard RRSIG */
+        r = dns_name_suffix(name, rrsig->rrsig.labels, &source);
+        if (r < 0)
+                return r;
+        if (r > 0 && !dns_type_may_wildcard(rrsig->rrsig.type_covered)) {
+                /* We refuse to validate NSEC3 or SOA RRs that are synthesized from wildcards */
+                *result = DNSSEC_INVALID;
+                return 0;
+        }
+        if (r == 1) {
+                /* If we stripped a single label, then let's see if that maybe was "*". If so, we are not really
+                 * synthesized from a wildcard, we are the wildcard itself. Treat that like a normal name. */
+                r = dns_name_startswith(name, "*");
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        source = name;
+
+                /* 'r' now holds the dns_name_startswith() result: only a name that does not itself begin
+                 * with "*" counts as synthesized from a wildcard */
+                wildcard = r == 0;
+        } else
+                wildcard = r > 0;
+
+        /* Collect all relevant RRs in a single array, so that we can look at the RRset */
+        list = newa(DnsResourceRecord *, dns_answer_size(a));
+
+        DNS_ANSWER_FOREACH(rr, a) {
+                r = dns_resource_key_equal(key, rr->key);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        continue;
+
+                /* We need the wire format for ordering, and digest calculation */
+                r = dns_resource_record_to_wire_format(rr, true);
+                if (r < 0)
+                        return r;
+
+                list[n++] = rr;
+
+                /* Refuse RRsets with more than VERIFY_RRS_MAX members */
+                if (n > VERIFY_RRS_MAX)
+                        return -E2BIG;
+        }
+
+        if (n <= 0)
+                return -ENODATA;
+
+        /* Bring the RRs into canonical order */
+        typesafe_qsort(list, n, rr_compare);
+
+        r = dnssec_rrset_serialize_sig(rrsig, source, list, n, wildcard,
+                                       &sig_data, &sig_size);
+        if (r < 0)
+                return r;
+
+        r = dnssec_rrset_verify_sig(rrsig, dnskey, sig_data, sig_size);
+        if (r == -EOPNOTSUPP) {
+                *result = DNSSEC_UNSUPPORTED_ALGORITHM;
+                return 0;
+        }
+        if (r < 0)
+                return r;
+
+        /* Now, fix the ttl, expiry, and remember the synthesizing source and the signer */
+        if (r > 0)
+                dnssec_fix_rrset_ttl(list, n, rrsig);
+
+        if (r == 0)
+                *result = DNSSEC_INVALID;
+        else if (wildcard)
+                *result = DNSSEC_VALIDATED_WILDCARD;
+        else
+                *result = DNSSEC_VALIDATED;
+
+        return 0;
+}
+
+int dnssec_rrsig_match_dnskey(DnsResourceRecord *rrsig, DnsResourceRecord *dnskey, bool revoked_ok) {
+
+        assert(rrsig);
+        assert(dnskey);
+
+        /* Checks if the specified DNSKEY RR matches the key used for
+         * the signature in the specified RRSIG RR. Revoked keys only
+         * match if 'revoked_ok' is set. */
+
+        if (rrsig->key->type != DNS_TYPE_RRSIG)
+                return -EINVAL;
+
+        if (dnskey->key->type != DNS_TYPE_DNSKEY)
+                return 0;
+        if (dnskey->key->class != rrsig->key->class)
+                return 0;
+        if ((dnskey->dnskey.flags & DNSKEY_FLAG_ZONE_KEY) == 0)
+                return 0;
+        if (!revoked_ok && (dnskey->dnskey.flags & DNSKEY_FLAG_REVOKE))
+                return 0;
+        if (dnskey->dnskey.protocol != 3)
+                return 0;
+        if
(dnskey->dnskey.algorithm != rrsig->rrsig.algorithm)
+                return 0;
+
+        if (dnssec_keytag(dnskey, false) != rrsig->rrsig.key_tag)
+                return 0;
+
+        /* Finally, the owner of the DNSKEY must be the signer named in the RRSIG */
+        return dns_name_equal(dns_resource_key_name(dnskey->key), rrsig->rrsig.signer);
+}
+
+int dnssec_key_match_rrsig(const DnsResourceKey *key, DnsResourceRecord *rrsig) {
+        assert(key);
+        assert(rrsig);
+
+        /* Checks if the specified RRSIG RR protects the RRSet of the specified RR key, i.e. whether
+         * class, covered type and owner name agree. Returns the result of dns_name_equal() if class and
+         * type match, 0 otherwise. */
+
+        if (rrsig->key->type != DNS_TYPE_RRSIG)
+                return 0;
+        if (rrsig->key->class != key->class)
+                return 0;
+        if (rrsig->rrsig.type_covered != key->type)
+                return 0;
+
+        return dns_name_equal(dns_resource_key_name(rrsig->key), dns_resource_key_name(key));
+}
+
+int dnssec_verify_rrset_search(
+                DnsAnswer *a,
+                const DnsResourceKey *key,
+                DnsAnswer *validated_dnskeys,
+                usec_t realtime,
+                DnssecResult *result,
+                DnsResourceRecord **ret_rrsig) {
+
+        bool found_rrsig = false, found_invalid = false, found_expired_rrsig = false, found_unsupported_algorithm = false;
+        unsigned nvalidations = 0;
+        DnsResourceRecord *rrsig;
+        int r;
+
+        assert(key);
+        assert(result);
+
+        /* Verifies all RRs from "a" that match the key "key" against DNSKEYs in "validated_dnskeys".
+         *
+         * Every matching RRSIG is tried with every authenticated, matching DNSKEY until one combination
+         * validates. On success the number of dnssec_verify_rrset() calls made is returned, the verdict
+         * is stored in *result and, if ret_rrsig is non-NULL, the validating RRSIG (or NULL if there is
+         * none) in *ret_rrsig. A negative errno-style value is returned on failure. */
+
+        if (dns_answer_isempty(a))
+                return -ENODATA;
+
+        /* Iterate through each RRSIG RR. */
+        DNS_ANSWER_FOREACH(rrsig, a) {
+                DnsResourceRecord *dnskey;
+                DnsAnswerFlags flags;
+
+                /* Is this an RRSIG RR that applies to RRs matching our key? */
+                r = dnssec_key_match_rrsig(key, rrsig);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        continue;
+
+                found_rrsig = true;
+
+                /* Look for a matching key */
+                DNS_ANSWER_FOREACH_FLAGS(dnskey, flags, validated_dnskeys) {
+                        DnssecResult one_result;
+
+                        /* Only keys that are flagged as authenticated are considered */
+                        if ((flags & DNS_ANSWER_AUTHENTICATED) == 0)
+                                continue;
+
+                        /* Is this a DNSKEY RR that matches the key of our RRSIG? */
+                        r = dnssec_rrsig_match_dnskey(rrsig, dnskey, false);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                continue;
+
+                        /* Take the time here, if it isn't set yet, so
+                         * that we do all validations with the same
+                         * time. */
+                        if (realtime == USEC_INFINITY)
+                                realtime = now(CLOCK_REALTIME);
+
+                        /* Have we seen an unreasonable number of invalid signatures? */
+                        if (nvalidations > DNSSEC_INVALID_MAX) {
+                                if (ret_rrsig)
+                                        *ret_rrsig = NULL;
+                                *result = DNSSEC_TOO_MANY_VALIDATIONS;
+                                return (int) nvalidations;
+                        }
+
+                        /* Yay, we found a matching RRSIG with a matching
+                         * DNSKEY, awesome. Now let's verify all entries of
+                         * the RRSet against the RRSIG and DNSKEY
+                         * combination. */
+
+                        r = dnssec_verify_rrset(a, key, rrsig, dnskey, realtime, &one_result);
+                        if (r < 0)
+                                return r;
+
+                        nvalidations++;
+
+                        /* Note that "continue" in the cases below proceeds with the next DNSKEY */
+                        switch (one_result) {
+
+                        case DNSSEC_VALIDATED:
+                        case DNSSEC_VALIDATED_WILDCARD:
+                                /* Yay, the RR has been validated,
+                                 * return immediately, but fix up the expiry */
+                                if (ret_rrsig)
+                                        *ret_rrsig = rrsig;
+
+                                *result = one_result;
+                                return (int) nvalidations;
+
+                        case DNSSEC_INVALID:
+                                /* If the signature is invalid, let's try another
+                                   key and/or signature. After all the
+                                   key_tags and stuff are not unique, and
+                                   might be shared by multiple keys. */
+                                found_invalid = true;
+                                continue;
+
+                        case DNSSEC_UNSUPPORTED_ALGORITHM:
+                                /* If the key algorithm is
+                                   unsupported, try another
+                                   RRSIG/DNSKEY pair, but remember we
+                                   encountered this, so that we can
+                                   return a proper error when we
+                                   encounter nothing better.
*/ + found_unsupported_algorithm = true; + continue; + + case DNSSEC_SIGNATURE_EXPIRED: + /* If the signature is expired, try + another one, but remember it, so + that we can return this */ + found_expired_rrsig = true; + continue; + + default: + assert_not_reached(); + } + } + } + + if (found_expired_rrsig) + *result = DNSSEC_SIGNATURE_EXPIRED; + else if (found_unsupported_algorithm) + *result = DNSSEC_UNSUPPORTED_ALGORITHM; + else if (found_invalid) + *result = DNSSEC_INVALID; + else if (found_rrsig) + *result = DNSSEC_MISSING_KEY; + else + *result = DNSSEC_NO_SIGNATURE; + + if (ret_rrsig) + *ret_rrsig = NULL; + + return (int) nvalidations; +} + +int dnssec_has_rrsig(DnsAnswer *a, const DnsResourceKey *key) { + DnsResourceRecord *rr; + int r; + + /* Checks whether there's at least one RRSIG in 'a' that protects RRs of the specified key */ + + DNS_ANSWER_FOREACH(rr, a) { + r = dnssec_key_match_rrsig(key, rr); + if (r < 0) + return r; + if (r > 0) + return 1; + } + + return 0; +} + +static hash_md_t digest_to_hash_md(uint8_t algorithm) { + + /* Translates a DNSSEC digest algorithm into an openssl/gcrypt digest identifier */ + + switch (algorithm) { + + case DNSSEC_DIGEST_SHA1: + return OPENSSL_OR_GCRYPT(EVP_sha1(), GCRY_MD_SHA1); + + case DNSSEC_DIGEST_SHA256: + return OPENSSL_OR_GCRYPT(EVP_sha256(), GCRY_MD_SHA256); + + case DNSSEC_DIGEST_SHA384: + return OPENSSL_OR_GCRYPT(EVP_sha384(), GCRY_MD_SHA384); + + default: + return OPENSSL_OR_GCRYPT(NULL, -EOPNOTSUPP); + } +} + +int dnssec_verify_dnskey_by_ds(DnsResourceRecord *dnskey, DnsResourceRecord *ds, bool mask_revoke) { + uint8_t wire_format[DNS_WIRE_FORMAT_HOSTNAME_MAX]; + int r; + + assert(dnskey); + assert(ds); + + /* Implements DNSKEY verification by a DS, according to RFC 4035, section 5.2 */ + + if (dnskey->key->type != DNS_TYPE_DNSKEY) + return -EINVAL; + if (ds->key->type != DNS_TYPE_DS) + return -EINVAL; + if ((dnskey->dnskey.flags & DNSKEY_FLAG_ZONE_KEY) == 0) + return -EKEYREJECTED; + if (!mask_revoke 
&& (dnskey->dnskey.flags & DNSKEY_FLAG_REVOKE)) + return -EKEYREJECTED; + if (dnskey->dnskey.protocol != 3) + return -EKEYREJECTED; + + if (dnskey->dnskey.algorithm != ds->ds.algorithm) + return 0; + if (dnssec_keytag(dnskey, mask_revoke) != ds->ds.key_tag) + return 0; + + r = dns_name_to_wire_format(dns_resource_key_name(dnskey->key), wire_format, sizeof wire_format, true); + if (r < 0) + return r; + + hash_md_t md_algorithm = digest_to_hash_md(ds->ds.digest_type); + +#if PREFER_OPENSSL + if (!md_algorithm) + return -EOPNOTSUPP; + + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = NULL; + uint8_t result[EVP_MAX_MD_SIZE]; + + unsigned hash_size = EVP_MD_size(md_algorithm); + assert(hash_size > 0); + + if (ds->ds.digest_size != hash_size) + return 0; + + ctx = EVP_MD_CTX_new(); + if (!ctx) + return -ENOMEM; + + if (EVP_DigestInit_ex(ctx, md_algorithm, NULL) <= 0) + return -EIO; + + if (EVP_DigestUpdate(ctx, wire_format, r) <= 0) + return -EIO; + + if (mask_revoke) + md_add_uint16(ctx, dnskey->dnskey.flags & ~DNSKEY_FLAG_REVOKE); + else + md_add_uint16(ctx, dnskey->dnskey.flags); + + r = md_add_uint8(ctx, dnskey->dnskey.protocol); + if (r <= 0) + return r; + r = md_add_uint8(ctx, dnskey->dnskey.algorithm); + if (r <= 0) + return r; + if (EVP_DigestUpdate(ctx, dnskey->dnskey.key, dnskey->dnskey.key_size) <= 0) + return -EIO; + + if (EVP_DigestFinal_ex(ctx, result, NULL) <= 0) + return -EIO; + +#else + if (md_algorithm < 0) + return -EOPNOTSUPP; + + initialize_libgcrypt(false); + + _cleanup_(gcry_md_closep) gcry_md_hd_t md = NULL; + + size_t hash_size = gcry_md_get_algo_dlen(md_algorithm); + assert(hash_size > 0); + + if (ds->ds.digest_size != hash_size) + return 0; + + gcry_error_t err = gcry_md_open(&md, md_algorithm, 0); + if (gcry_err_code(err) != GPG_ERR_NO_ERROR || !md) + return -EIO; + + gcry_md_write(md, wire_format, r); + if (mask_revoke) + md_add_uint16(md, dnskey->dnskey.flags & ~DNSKEY_FLAG_REVOKE); + else + md_add_uint16(md, dnskey->dnskey.flags); + 
md_add_uint8(md, dnskey->dnskey.protocol); + md_add_uint8(md, dnskey->dnskey.algorithm); + gcry_md_write(md, dnskey->dnskey.key, dnskey->dnskey.key_size); + + void *result = gcry_md_read(md, 0); + if (!result) + return -EIO; +#endif + + return memcmp(result, ds->ds.digest, ds->ds.digest_size) == 0; +} + +int dnssec_verify_dnskey_by_ds_search(DnsResourceRecord *dnskey, DnsAnswer *validated_ds) { + DnsResourceRecord *ds; + DnsAnswerFlags flags; + int r; + + assert(dnskey); + + if (dnskey->key->type != DNS_TYPE_DNSKEY) + return 0; + + DNS_ANSWER_FOREACH_FLAGS(ds, flags, validated_ds) { + + if ((flags & DNS_ANSWER_AUTHENTICATED) == 0) + continue; + + if (ds->key->type != DNS_TYPE_DS) + continue; + if (ds->key->class != dnskey->key->class) + continue; + + r = dns_name_equal(dns_resource_key_name(dnskey->key), dns_resource_key_name(ds->key)); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dnssec_verify_dnskey_by_ds(dnskey, ds, false); + if (IN_SET(r, -EKEYREJECTED, -EOPNOTSUPP)) + return 0; /* The DNSKEY is revoked or otherwise invalid, or we don't support the digest algorithm */ + if (r < 0) + return r; + if (r > 0) + return 1; + } + + return 0; +} + +static hash_md_t nsec3_hash_to_hash_md(uint8_t algorithm) { + + /* Translates a DNSSEC NSEC3 hash algorithm into an openssl/gcrypt digest identifier */ + + switch (algorithm) { + + case NSEC3_ALGORITHM_SHA1: + return OPENSSL_OR_GCRYPT(EVP_sha1(), GCRY_MD_SHA1); + + default: + return OPENSSL_OR_GCRYPT(NULL, -EOPNOTSUPP); + } +} + +int dnssec_nsec3_hash(DnsResourceRecord *nsec3, const char *name, void *ret) { + uint8_t wire_format[DNS_WIRE_FORMAT_HOSTNAME_MAX]; + int r; + + assert(nsec3); + assert(name); + assert(ret); + + if (nsec3->key->type != DNS_TYPE_NSEC3) + return -EINVAL; + + if (nsec3->nsec3.iterations > NSEC3_ITERATIONS_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Ignoring NSEC3 RR %s with excessive number of iterations.", + dns_resource_record_to_string(nsec3)); + + hash_md_t algorithm 
= nsec3_hash_to_hash_md(nsec3->nsec3.algorithm);
+#if PREFER_OPENSSL
+        if (!algorithm)
+                return -EOPNOTSUPP;
+
+        size_t hash_size = EVP_MD_size(algorithm);
+        assert(hash_size > 0);
+
+        /* The "next hashed name" of the RR must have the size of a digest of this algorithm */
+        if (nsec3->nsec3.next_hashed_name_size != hash_size)
+                return -EINVAL;
+
+        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+        if (!ctx)
+                return -ENOMEM;
+
+        if (EVP_DigestInit_ex(ctx, algorithm, NULL) <= 0)
+                return -EIO;
+
+        r = dns_name_to_wire_format(name, wire_format, sizeof(wire_format), true);
+        if (r < 0)
+                return r;
+
+        /* First round: hash the name in wire format, followed by the salt */
+        if (EVP_DigestUpdate(ctx, wire_format, r) <= 0)
+                return -EIO;
+        if (EVP_DigestUpdate(ctx, nsec3->nsec3.salt, nsec3->nsec3.salt_size) <= 0)
+                return -EIO;
+
+        uint8_t result[EVP_MAX_MD_SIZE];
+        if (EVP_DigestFinal_ex(ctx, result, NULL) <= 0)
+                return -EIO;
+
+        /* Additional rounds: hash the previous digest, followed by the salt */
+        for (unsigned k = 0; k < nsec3->nsec3.iterations; k++) {
+                if (EVP_DigestInit_ex(ctx, algorithm, NULL) <= 0)
+                        return -EIO;
+                if (EVP_DigestUpdate(ctx, result, hash_size) <= 0)
+                        return -EIO;
+                if (EVP_DigestUpdate(ctx, nsec3->nsec3.salt, nsec3->nsec3.salt_size) <= 0)
+                        return -EIO;
+
+                if (EVP_DigestFinal_ex(ctx, result, NULL) <= 0)
+                        return -EIO;
+        }
+#else
+        if (algorithm < 0)
+                return algorithm;
+
+        initialize_libgcrypt(false);
+
+        unsigned hash_size = gcry_md_get_algo_dlen(algorithm);
+        assert(hash_size > 0);
+
+        /* The "next hashed name" of the RR must have the size of a digest of this algorithm */
+        if (nsec3->nsec3.next_hashed_name_size != hash_size)
+                return -EINVAL;
+
+        r = dns_name_to_wire_format(name, wire_format, sizeof(wire_format), true);
+        if (r < 0)
+                return r;
+
+        _cleanup_(gcry_md_closep) gcry_md_hd_t md = NULL;
+        gcry_error_t err = gcry_md_open(&md, algorithm, 0);
+        if (gcry_err_code(err) != GPG_ERR_NO_ERROR || !md)
+                return -EIO;
+
+        /* First round: hash the name in wire format, followed by the salt */
+        gcry_md_write(md, wire_format, r);
+        gcry_md_write(md, nsec3->nsec3.salt, nsec3->nsec3.salt_size);
+
+        void *result = gcry_md_read(md, 0);
+        if (!result)
+                return -EIO;
+
+        /* Additional rounds: hash the previous digest, followed by the salt */
+        for (unsigned k = 0; k < nsec3->nsec3.iterations; k++) {
+                /* The previous digest is copied out before the context is reset and written to again */
+                uint8_t tmp[hash_size];
+                memcpy(tmp, result, hash_size);
+
+                gcry_md_reset(md);
+                gcry_md_write(md, tmp, hash_size);
+                gcry_md_write(md, nsec3->nsec3.salt, nsec3->nsec3.salt_size);
+
+                result = gcry_md_read(md, 0);
+                if (!result)
+                        return -EIO;
+        }
+#endif
+
+        /* Hand out the digest; the return value is its size in bytes */
+        memcpy(ret, result, hash_size);
+        return (int) hash_size;
+}
+
+static int nsec3_is_good(DnsResourceRecord *rr, DnsResourceRecord *nsec3) {
+        const char *a, *b;
+        int r;
+
+        /* Checks whether 'rr' is an NSEC3 RR we are willing to use. If 'nsec3' is non-NULL the two RRs
+         * must in addition agree in class, algorithm, iterations, salt and parent name. Returns > 0 if
+         * the RR is usable, 0 if it shall be ignored, and a negative errno-style value on failure. */
+
+        assert(rr);
+
+        if (rr->key->type != DNS_TYPE_NSEC3)
+                return 0;
+
+        /* RFC 5155, Section 8.2 says we MUST ignore NSEC3 RRs with flags != 0 or 1 */
+        if (!IN_SET(rr->nsec3.flags, 0, 1))
+                return 0;
+
+        /* Ignore NSEC3 RRs whose algorithm we don't know */
+#if PREFER_OPENSSL
+        if (!nsec3_hash_to_hash_md(rr->nsec3.algorithm))
+                return 0;
+#else
+        if (nsec3_hash_to_hash_md(rr->nsec3.algorithm) < 0)
+                return 0;
+#endif
+
+        /* Ignore NSEC3 RRs with an excessive number of required iterations */
+        if (rr->nsec3.iterations > NSEC3_ITERATIONS_MAX)
+                return 0;
+
+        /* Ignore NSEC3 RRs generated from wildcards. If these NSEC3 RRs weren't correctly signed we can't make this
+         * check (since rr->n_skip_labels_source is -1), but that's OK, as we won't trust them anyway in that case. */
+        if (!IN_SET(rr->n_skip_labels_source, 0, UINT8_MAX))
+                return 0;
+        /* Ignore NSEC3 RRs that are located anywhere else than one label below the zone */
+        if (!IN_SET(rr->n_skip_labels_signer, 1, UINT8_MAX))
+                return 0;
+
+        if (!nsec3)
+                return 1;
+
+        /* If a second NSEC3 RR is specified, also check if they are from the same zone.
*/ + + if (nsec3 == rr) /* Shortcut */ + return 1; + + if (rr->key->class != nsec3->key->class) + return 0; + if (rr->nsec3.algorithm != nsec3->nsec3.algorithm) + return 0; + if (rr->nsec3.iterations != nsec3->nsec3.iterations) + return 0; + if (rr->nsec3.salt_size != nsec3->nsec3.salt_size) + return 0; + if (memcmp_safe(rr->nsec3.salt, nsec3->nsec3.salt, rr->nsec3.salt_size) != 0) + return 0; + + a = dns_resource_key_name(rr->key); + r = dns_name_parent(&a); /* strip off hash */ + if (r <= 0) + return r; + + b = dns_resource_key_name(nsec3->key); + r = dns_name_parent(&b); /* strip off hash */ + if (r <= 0) + return r; + + /* Make sure both have the same parent */ + return dns_name_equal(a, b); +} + +static int nsec3_hashed_domain_format(const uint8_t *hashed, size_t hashed_size, const char *zone, char **ret) { + _cleanup_free_ char *l = NULL; + char *j; + + assert(hashed); + assert(hashed_size > 0); + assert(zone); + assert(ret); + + l = base32hexmem(hashed, hashed_size, false); + if (!l) + return -ENOMEM; + + j = strjoin(l, ".", zone); + if (!j) + return -ENOMEM; + + *ret = j; + return (int) hashed_size; +} + +static int nsec3_hashed_domain_make(DnsResourceRecord *nsec3, const char *domain, const char *zone, char **ret) { + uint8_t hashed[DNSSEC_HASH_SIZE_MAX]; + int hashed_size; + + assert(nsec3); + assert(domain); + assert(zone); + assert(ret); + + hashed_size = dnssec_nsec3_hash(nsec3, domain, hashed); + if (hashed_size < 0) + return hashed_size; + + return nsec3_hashed_domain_format(hashed, (size_t) hashed_size, zone, ret); +} + +/* See RFC 5155, Section 8 + * First try to find a NSEC3 record that matches our query precisely, if that fails, find the closest + * enclosure. Secondly, find a proof that there is no closer enclosure and either a proof that there + * is no wildcard domain as a direct descendant of the closest enclosure, or find an NSEC3 record that + * matches the wildcard domain. 
+ *
+ * Based on this we can prove either the existence of the record in @key, or NXDOMAIN or NODATA, or
+ * that there is no proof either way. The latter is the case if a proof of non-existence of a given
+ * name uses an NSEC3 record with the opt-out bit set. Lastly, if we are given insufficient NSEC3 records
+ * to conclude anything we indicate this by returning NO_RR.
+ *
+ * The verdict is stored in *result and 0 is returned; a negative errno-style value is returned on
+ * failure. For all verdicts other than NO_RR and UNSUPPORTED_ALGORITHM, *authenticated and *ttl are
+ * filled in too, if the pointers are non-NULL. */
+static int dnssec_test_nsec3(DnsAnswer *answer, DnsResourceKey *key, DnssecNsecResult *result, bool *authenticated, uint32_t *ttl) {
+        _cleanup_free_ char *next_closer_domain = NULL, *wildcard_domain = NULL;
+        const char *zone, *p, *pp = NULL, *wildcard;
+        DnsResourceRecord *rr, *enclosure_rr, *zone_rr, *wildcard_rr = NULL;
+        DnsAnswerFlags flags;
+        int hashed_size, r;
+        bool a, no_closer = false, no_wildcard = false, optout = false;
+
+        assert(key);
+        assert(result);
+
+        /* First step, find the zone name and the NSEC3 parameters of the zone.
+         * it is sufficient to look for the longest common suffix we find with
+         * any NSEC3 RR in the response. Any NSEC3 record will do as all NSEC3
+         * records from a given zone in a response must use the same
+         * parameters. */
+        zone = dns_resource_key_name(key);
+        for (;;) {
+                DNS_ANSWER_FOREACH_FLAGS(zone_rr, flags, answer) {
+                        r = nsec3_is_good(zone_rr, NULL);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                continue;
+
+                        /* Compare the owner name of the NSEC3 RR, minus its first label, with the candidate zone */
+                        r = dns_name_equal_skip(dns_resource_key_name(zone_rr->key), 1, zone);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                goto found_zone;
+                }
+
+                /* Strip one label from the front */
+                r = dns_name_parent(&zone);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+        }
+
+        /* No usable NSEC3 RR for the name or any of its parents */
+        *result = DNSSEC_NSEC_NO_RR;
+        return 0;
+
+found_zone:
+        /* Second step, find the closest encloser NSEC3 RR in 'answer' that matches 'key' */
+        p = dns_resource_key_name(key);
+        for (;;) {
+                _cleanup_free_ char *hashed_domain = NULL;
+
+                hashed_size = nsec3_hashed_domain_make(zone_rr, p, zone, &hashed_domain);
+                if (hashed_size == -EOPNOTSUPP) {
+                        *result = DNSSEC_NSEC_UNSUPPORTED_ALGORITHM;
+                        return 0;
+                }
+                if (hashed_size < 0)
+                        return hashed_size;
+
+                DNS_ANSWER_FOREACH_FLAGS(enclosure_rr, flags, answer) {
+
+                        r = nsec3_is_good(enclosure_rr, zone_rr);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                continue;
+
+                        if (enclosure_rr->nsec3.next_hashed_name_size != (size_t) hashed_size)
+                                continue;
+
+                        r = dns_name_equal(dns_resource_key_name(enclosure_rr->key), hashed_domain);
+                        if (r < 0)
+                                return r;
+                        if (r > 0) {
+                                /* 'a' tracks whether every RR the proof is built from is authenticated */
+                                a = flags & DNS_ANSWER_AUTHENTICATED;
+                                goto found_closest_encloser;
+                        }
+                }
+
+                /* We didn't find the closest encloser with this name,
+                 * but let's remember this domain name, it might be
+                 * the next closer name */
+
+                pp = p;
+
+                /* Strip one label from the front */
+                r = dns_name_parent(&p);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+        }
+
+        *result = DNSSEC_NSEC_NO_RR;
+        return 0;
+
+found_closest_encloser:
+        /* We found a closest encloser in 'p'; next closer is 'pp' */
+
+        if (!pp) {
+                /* We have an exact match! If we are looking for a DS RR, then we must insist that we got the NSEC3 RR
+                 * from the parent. Otherwise the one from the child. Do so, by checking whether SOA and NS are
+                 * appropriately set. */
+
+                if (key->type == DNS_TYPE_DS) {
+                        if (bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_SOA))
+                                return -EBADMSG;
+                } else {
+                        if (bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_NS) &&
+                            !bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_SOA))
+                                return -EBADMSG;
+                }
+
+                /* No next closer NSEC3 RR. That means there's a direct NSEC3 RR for our key. */
+                if (bitmap_isset(enclosure_rr->nsec3.types, key->type))
+                        *result = DNSSEC_NSEC_FOUND;
+                else if (bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_CNAME))
+                        *result = DNSSEC_NSEC_CNAME;
+                else
+                        *result = DNSSEC_NSEC_NODATA;
+
+                if (authenticated)
+                        *authenticated = a;
+                if (ttl)
+                        *ttl = enclosure_rr->ttl;
+
+                return 0;
+        }
+
+        /* Ensure this is not a DNAME domain, see RFC5155, section 8.3. */
+        if (bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_DNAME))
+                return -EBADMSG;
+
+        /* Ensure that this data is from the delegated domain
+         * (i.e. originates from the "lower" DNS server), and isn't
+         * just glue records (i.e. doesn't originate from the "upper"
+         * DNS server). */
+        if (bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_NS) &&
+            !bitmap_isset(enclosure_rr->nsec3.types, DNS_TYPE_SOA))
+                return -EBADMSG;
+
+        /* Prove that there is no next closer and whether or not there is a wildcard domain. */
+
+        /* Hash the wildcard name below the closest encloser, and the next closer name. Both hashes must
+         * have the same size as the one calculated in the second step. */
+        wildcard = strjoina("*.", p);
+        r = nsec3_hashed_domain_make(enclosure_rr, wildcard, zone, &wildcard_domain);
+        if (r < 0)
+                return r;
+        if (r != hashed_size)
+                return -EBADMSG;
+
+        r = nsec3_hashed_domain_make(enclosure_rr, pp, zone, &next_closer_domain);
+        if (r < 0)
+                return r;
+        if (r != hashed_size)
+                return -EBADMSG;
+
+        DNS_ANSWER_FOREACH_FLAGS(rr, flags, answer) {
+                _cleanup_free_ char *next_hashed_domain = NULL;
+
+                r = nsec3_is_good(rr, zone_rr);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        continue;
+
+                r = nsec3_hashed_domain_format(rr->nsec3.next_hashed_name, rr->nsec3.next_hashed_name_size, zone, &next_hashed_domain);
+                if (r < 0)
+                        return r;
+
+                /* Does this RR cover the next closer name? */
+                r = dns_name_between(dns_resource_key_name(rr->key), next_closer_domain, next_hashed_domain);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        if (rr->nsec3.flags & 1)
+                                optout = true;
+
+                        a = a && (flags & DNS_ANSWER_AUTHENTICATED);
+
+                        no_closer = true;
+                }
+
+                /* Does this RR match the wildcard name? */
+                r = dns_name_equal(dns_resource_key_name(rr->key), wildcard_domain);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        a = a && (flags & DNS_ANSWER_AUTHENTICATED);
+
+                        wildcard_rr = rr;
+                }
+
+                /* Does this RR cover the wildcard name? */
+                r = dns_name_between(dns_resource_key_name(rr->key), wildcard_domain, next_hashed_domain);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        if (rr->nsec3.flags & 1)
+                                /* This only makes sense if we have a wildcard delegation, which is
+                                 * very unlikely, see RFC 4592, Section 4.2, but we cannot rely on
+                                 * this not happening, so hence cannot simply conclude NXDOMAIN as
+                                 * we would wish */
+                                optout = true;
+
+                        a = a && (flags & DNS_ANSWER_AUTHENTICATED);
+
+                        no_wildcard = true;
+                }
+        }
+
+        /* A wildcard cannot both exist and be proven not to exist */
+        if (wildcard_rr && no_wildcard)
+                return -EBADMSG;
+
+        if (!no_closer) {
+                *result = DNSSEC_NSEC_NO_RR;
+                return 0;
+        }
+
+        if (wildcard_rr) {
+                /* A wildcard exists that matches our query. */
+                if (optout)
+                        /* This is not specified in any RFC to the best of my knowledge, but
+                         * if the next closer enclosure is covered by an opt-out NSEC3 RR
+                         * it means that we cannot prove that the source of synthesis is
+                         * correct, as there may be a closer match. */
+                        *result = DNSSEC_NSEC_OPTOUT;
+                else if (bitmap_isset(wildcard_rr->nsec3.types, key->type))
+                        *result = DNSSEC_NSEC_FOUND;
+                else if (bitmap_isset(wildcard_rr->nsec3.types, DNS_TYPE_CNAME))
+                        *result = DNSSEC_NSEC_CNAME;
+                else
+                        *result = DNSSEC_NSEC_NODATA;
+        } else {
+                if (optout)
+                        /* The RFC only specifies that we have to care for optout for NODATA for
+                         * DS records. However, children of an insecure opt-out delegation should
+                         * also be considered opt-out, rather than verified NXDOMAIN.
+                         * Note that we do not require a proof of wildcard non-existence if the
+                         * next closer domain is covered by an opt-out, as that would not provide
+                         * any additional information. */
+                        *result = DNSSEC_NSEC_OPTOUT;
+                else if (no_wildcard)
+                        *result = DNSSEC_NSEC_NXDOMAIN;
+                else {
+                        *result = DNSSEC_NSEC_NO_RR;
+
+                        return 0;
+                }
+        }
+
+        if (authenticated)
+                *authenticated = a;
+
+        if (ttl)
+                *ttl = enclosure_rr->ttl;
+
+        return 0;
+}
+
+static int dnssec_nsec_wildcard_equal(DnsResourceRecord *rr, const char *name) {
+        char label[DNS_LABEL_MAX];
+        const char *n;
+        int r;
+
+        assert(rr);
+        assert(rr->key->type == DNS_TYPE_NSEC);
+
+        /* Checks whether the specified RR has a name beginning in "*.", and if the rest is a suffix of our name */
+
+        if (rr->n_skip_labels_source != 1)
+                return 0;
+
+        /* Take the first label off the owner name; afterwards 'n' points to the rest of the name */
+        n = dns_resource_key_name(rr->key);
+        r = dns_label_unescape(&n, label, sizeof label, 0);
+        if (r <= 0)
+                return r;
+        if (r != 1 || label[0] != '*')
+                return 0;
+
+        return dns_name_endswith(name, n);
+}
+
+static int dnssec_nsec_in_path(DnsResourceRecord *rr, const char *name) {
+        const char *nn, *common_suffix;
+        int r;
+
+        assert(rr);
+        assert(rr->key->type == DNS_TYPE_NSEC);
+
+        /* Checks whether the specified nsec RR
indicates that name is an empty non-terminal (ENT) + * + * A couple of examples: + * + * NSEC bar → waldo.foo.bar: indicates that foo.bar exists and is an ENT + * NSEC waldo.foo.bar → yyy.zzz.xoo.bar: indicates that xoo.bar and zzz.xoo.bar exist and are ENTs + * NSEC yyy.zzz.xoo.bar → bar: indicates pretty much nothing about ENTs + */ + + /* First, determine parent of next domain. */ + nn = rr->nsec.next_domain_name; + r = dns_name_parent(&nn); + if (r <= 0) + return r; + + /* If the name we just determined is not equal or child of the name we are interested in, then we can't say + * anything at all. */ + r = dns_name_endswith(nn, name); + if (r <= 0) + return r; + + /* If the name we are interested in is not a prefix of the common suffix of the NSEC RR's owner and next domain names, then we can't say anything either. */ + r = dns_name_common_suffix(dns_resource_key_name(rr->key), rr->nsec.next_domain_name, &common_suffix); + if (r < 0) + return r; + + return dns_name_endswith(name, common_suffix); +} + +static int dnssec_nsec_from_parent_zone(DnsResourceRecord *rr, const char *name) { + int r; + + assert(rr); + assert(rr->key->type == DNS_TYPE_NSEC); + + /* Checks whether this NSEC originates from the parent zone or the child zone. */ + + r = dns_name_parent(&name); + if (r <= 0) + return r; + + r = dns_name_equal(name, dns_resource_key_name(rr->key)); + if (r <= 0) + return r; + + /* DNAME, and NS without SOA is an indication for a delegation. */ + if (bitmap_isset(rr->nsec.types, DNS_TYPE_DNAME)) + return 1; + + if (bitmap_isset(rr->nsec.types, DNS_TYPE_NS) && !bitmap_isset(rr->nsec.types, DNS_TYPE_SOA)) + return 1; + + return 0; +} + +static int dnssec_nsec_covers(DnsResourceRecord *rr, const char *name) { + const char *signer; + int r; + + assert(rr); + assert(rr->key->type == DNS_TYPE_NSEC); + + /* Checks whether the name is covered by this NSEC RR. This means, that the name is somewhere below the NSEC's + * signer name, and between the NSEC's two names.
*/ + + r = dns_resource_record_signer(rr, &signer); + if (r < 0) + return r; + + r = dns_name_endswith(name, signer); /* this NSEC isn't suitable if the name is not in the signer's domain */ + if (r <= 0) + return r; + + return dns_name_between(dns_resource_key_name(rr->key), name, rr->nsec.next_domain_name); +} + +static int dnssec_nsec_generate_wildcard(DnsResourceRecord *rr, const char *name, char **wc) { + const char *common_suffix1, *common_suffix2, *signer; + int r, labels1, labels2; + + assert(rr); + assert(rr->key->type == DNS_TYPE_NSEC); + + /* Generates "Wildcard at the Closest Encloser" for the given name and NSEC RR. */ + + r = dns_resource_record_signer(rr, &signer); + if (r < 0) + return r; + + r = dns_name_endswith(name, signer); /* this NSEC isn't suitable if the name is not in the signer's domain */ + if (r <= 0) + return r; + + r = dns_name_common_suffix(name, dns_resource_key_name(rr->key), &common_suffix1); + if (r < 0) + return r; + + r = dns_name_common_suffix(name, rr->nsec.next_domain_name, &common_suffix2); + if (r < 0) + return r; + + labels1 = dns_name_count_labels(common_suffix1); + if (labels1 < 0) + return labels1; + + labels2 = dns_name_count_labels(common_suffix2); + if (labels2 < 0) + return labels2; + + if (labels1 > labels2) + r = dns_name_concat("*", common_suffix1, 0, wc); + else + r = dns_name_concat("*", common_suffix2, 0, wc); + + if (r < 0) + return r; + + return 0; +} + +int dnssec_nsec_test(DnsAnswer *answer, DnsResourceKey *key, DnssecNsecResult *result, bool *authenticated, uint32_t *ttl) { + bool have_nsec3 = false, covering_rr_authenticated = false, wildcard_rr_authenticated = false; + DnsResourceRecord *rr, *covering_rr = NULL, *wildcard_rr = NULL; + DnsAnswerFlags flags; + const char *name; + int r; + + assert(key); + assert(result); + + /* Look for any NSEC/NSEC3 RRs that say something about the specified key.
*/ + + name = dns_resource_key_name(key); + + DNS_ANSWER_FOREACH_FLAGS(rr, flags, answer) { + + if (rr->key->class != key->class) + continue; + + have_nsec3 = have_nsec3 || (rr->key->type == DNS_TYPE_NSEC3); + + if (rr->key->type != DNS_TYPE_NSEC) + continue; + + /* The following checks only make sense for NSEC RRs that are not expanded from a wildcard */ + r = dns_resource_record_is_synthetic(rr); + if (r == -ENODATA) /* No signing RR known. */ + continue; + if (r < 0) + return r; + if (r > 0) + continue; + + /* Check if this is a direct match. If so, we have encountered a NODATA case */ + r = dns_name_equal(dns_resource_key_name(rr->key), name); + if (r < 0) + return r; + if (r == 0) { + /* If it's not a direct match, maybe it's a wild card match? */ + r = dnssec_nsec_wildcard_equal(rr, name); + if (r < 0) + return r; + } + if (r > 0) { + if (key->type == DNS_TYPE_DS) { + /* If we look for a DS RR and the server sent us the NSEC RR of the child zone + * we have a problem. For DS RRs we want the NSEC RR from the parent */ + if (bitmap_isset(rr->nsec.types, DNS_TYPE_SOA)) + continue; + } else { + /* For all RR types, ensure that if NS is set SOA is set too, so that we know + * we got the child's NSEC. */ + if (bitmap_isset(rr->nsec.types, DNS_TYPE_NS) && + !bitmap_isset(rr->nsec.types, DNS_TYPE_SOA)) + continue; + } + + if (bitmap_isset(rr->nsec.types, key->type)) + *result = DNSSEC_NSEC_FOUND; + else if (bitmap_isset(rr->nsec.types, DNS_TYPE_CNAME)) + *result = DNSSEC_NSEC_CNAME; + else + *result = DNSSEC_NSEC_NODATA; + + if (authenticated) + *authenticated = flags & DNS_ANSWER_AUTHENTICATED; + if (ttl) + *ttl = rr->ttl; + + return 0; + } + + /* Check if the name we are looking for is an empty non-terminal within the owner or next name + * of the NSEC RR. 
*/ + r = dnssec_nsec_in_path(rr, name); + if (r < 0) + return r; + if (r > 0) { + *result = DNSSEC_NSEC_NODATA; + + if (authenticated) + *authenticated = flags & DNS_ANSWER_AUTHENTICATED; + if (ttl) + *ttl = rr->ttl; + + return 0; + } + + /* The following two "covering" checks, are not useful if the NSEC is from the parent */ + r = dnssec_nsec_from_parent_zone(rr, name); + if (r < 0) + return r; + if (r > 0) + continue; + + /* Check if this NSEC RR proves the absence of an explicit RR under this name */ + r = dnssec_nsec_covers(rr, name); + if (r < 0) + return r; + if (r > 0 && (!covering_rr || !covering_rr_authenticated)) { + covering_rr = rr; + covering_rr_authenticated = flags & DNS_ANSWER_AUTHENTICATED; + } + } + + if (covering_rr) { + _cleanup_free_ char *wc = NULL; + r = dnssec_nsec_generate_wildcard(covering_rr, name, &wc); + if (r < 0) + return r; + + DNS_ANSWER_FOREACH_FLAGS(rr, flags, answer) { + + if (rr->key->class != key->class) + continue; + + if (rr->key->type != DNS_TYPE_NSEC) + continue; + + /* Check if this NSEC RR proves the nonexistence of the wildcard */ + r = dnssec_nsec_covers(rr, wc); + if (r < 0) + return r; + if (r > 0 && (!wildcard_rr || !wildcard_rr_authenticated)) { + wildcard_rr = rr; + wildcard_rr_authenticated = flags & DNS_ANSWER_AUTHENTICATED; + } + } + } + + if (covering_rr && wildcard_rr) { + /* If we could prove that neither the name itself, nor the wildcard at the closest encloser exists, we + * proved the NXDOMAIN case. */ + *result = DNSSEC_NSEC_NXDOMAIN; + + if (authenticated) + *authenticated = covering_rr_authenticated && wildcard_rr_authenticated; + if (ttl) + *ttl = MIN(covering_rr->ttl, wildcard_rr->ttl); + + return 0; + } + + /* OK, this was not sufficient. Let's see if NSEC3 can help. */ + if (have_nsec3) + return dnssec_test_nsec3(answer, key, result, authenticated, ttl); + + /* No appropriate NSEC RR found, report this. 
*/ + *result = DNSSEC_NSEC_NO_RR; + return 0; +} + +static int dnssec_nsec_test_enclosed(DnsAnswer *answer, uint16_t type, const char *name, const char *zone, bool *authenticated) { + DnsResourceRecord *rr; + DnsAnswerFlags flags; + int r; + + assert(name); + assert(zone); + + /* Checks whether there's an NSEC/NSEC3 that proves that the specified 'name' is non-existing in the specified + * 'zone'. The 'zone' must be a suffix of the 'name'. */ + + DNS_ANSWER_FOREACH_FLAGS(rr, flags, answer) { + bool found = false; + + if (rr->key->type != type && type != DNS_TYPE_ANY) + continue; + + switch (rr->key->type) { + + case DNS_TYPE_NSEC: + + /* We only care for NSEC RRs from the indicated zone */ + r = dns_resource_record_is_signer(rr, zone); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dns_name_between(dns_resource_key_name(rr->key), name, rr->nsec.next_domain_name); + if (r < 0) + return r; + + found = r > 0; + break; + + case DNS_TYPE_NSEC3: { + _cleanup_free_ char *hashed_domain = NULL, *next_hashed_domain = NULL; + + /* We only care for NSEC3 RRs from the indicated zone */ + r = dns_resource_record_is_signer(rr, zone); + if (r < 0) + return r; + if (r == 0) + continue; + + r = nsec3_is_good(rr, NULL); + if (r < 0) + return r; + if (r == 0) + break; + + /* Format the domain we are testing with the NSEC3 RR's hash function */ + r = nsec3_hashed_domain_make( + rr, + name, + zone, + &hashed_domain); + if (r < 0) + return r; + if ((size_t) r != rr->nsec3.next_hashed_name_size) + break; + + /* Format the NSEC3's next hashed name as proper domain name */ + r = nsec3_hashed_domain_format( + rr->nsec3.next_hashed_name, + rr->nsec3.next_hashed_name_size, + zone, + &next_hashed_domain); + if (r < 0) + return r; + + r = dns_name_between(dns_resource_key_name(rr->key), hashed_domain, next_hashed_domain); + if (r < 0) + return r; + + found = r > 0; + break; + } + + default: + continue; + } + + if (found) { + if (authenticated) + *authenticated = flags & 
DNS_ANSWER_AUTHENTICATED; + return 1; + } + } + + return 0; +} + +static int dnssec_test_positive_wildcard_nsec3( + DnsAnswer *answer, + const char *name, + const char *source, + const char *zone, + bool *authenticated) { + + const char *next_closer = NULL; + int r; + + /* Run a positive NSEC3 wildcard proof. Specifically: + * + * A proof that the "next closer" of the generating wildcard does not exist. + * + * Note a key difference between the NSEC3 and NSEC versions of the proof. NSEC RRs don't have to exist for + * empty non-terminals. NSEC3 RRs however have to. This means it's sufficient to check if the next closer name + * exists for the NSEC3 RR and we are done. + * + * To prove that a.b.c.d.e.f is rightfully synthesized from a wildcard *.d.e.f all we have to check is that + * c.d.e.f does not exist. */ + + for (;;) { + next_closer = name; + r = dns_name_parent(&name); + if (r <= 0) + return r; + + r = dns_name_equal(name, source); + if (r < 0) + return r; + if (r > 0) + break; + } + + return dnssec_nsec_test_enclosed(answer, DNS_TYPE_NSEC3, next_closer, zone, authenticated); +} + +static int dnssec_test_positive_wildcard_nsec( + DnsAnswer *answer, + const char *name, + const char *source, + const char *zone, + bool *_authenticated) { + + bool authenticated = true; + int r; + + /* Run a positive NSEC wildcard proof. Specifically: + * + * A proof that there's neither a wildcard name nor a non-wildcard name that is a suffix of the name "name" and + * a prefix of the synthesizing source "source" in the zone "zone".
+ * + * See RFC 5155, Section 8.8 and RFC 4035, Section 5.3.4 + * + * Note that if we want to prove that a.b.c.d.e.f is rightfully synthesized from a wildcard *.d.e.f, then we + * have to prove that none of the following exist: + * + * 1) a.b.c.d.e.f + * 2) *.b.c.d.e.f + * 3) b.c.d.e.f + * 4) *.c.d.e.f + * 5) c.d.e.f + */ + + for (;;) { + _cleanup_free_ char *wc = NULL; + bool a = false; + + /* Check if there's an NSEC or NSEC3 RR that proves that the name we determined is really non-existing, + * i.e. between the owner name and the next name of an NSEC RR. */ + r = dnssec_nsec_test_enclosed(answer, DNS_TYPE_NSEC, name, zone, &a); + if (r <= 0) + return r; + + authenticated = authenticated && a; + + /* Strip one label off */ + r = dns_name_parent(&name); + if (r <= 0) + return r; + + /* Did we reach the source of synthesis? */ + r = dns_name_equal(name, source); + if (r < 0) + return r; + if (r > 0) { + /* Successful exit */ + *_authenticated = authenticated; + return 1; + } + + /* Safety check, that the source of synthesis is still our suffix */ + r = dns_name_endswith(name, source); + if (r < 0) + return r; + if (r == 0) + return -EBADMSG; + + /* Replace the label we stripped off with an asterisk */ + wc = strjoin("*.", name); + if (!wc) + return -ENOMEM; + + /* And check if the proof holds for the asterisk name, too */ + r = dnssec_nsec_test_enclosed(answer, DNS_TYPE_NSEC, wc, zone, &a); + if (r <= 0) + return r; + + authenticated = authenticated && a; + /* In the next iteration we'll check the non-asterisk-prefixed version */ + } +} + +int dnssec_test_positive_wildcard( + DnsAnswer *answer, + const char *name, + const char *source, + const char *zone, + bool *authenticated) { + + int r; + + assert(name); + assert(source); + assert(zone); + assert(authenticated); + + r = dns_answer_contains_zone_nsec3(answer, zone); + if (r < 0) + return r; + if (r > 0) + return dnssec_test_positive_wildcard_nsec3(answer, name, source, zone, authenticated); + else + return
dnssec_test_positive_wildcard_nsec(answer, name, source, zone, authenticated); +} + +#else + +int dnssec_verify_rrset( + DnsAnswer *a, + const DnsResourceKey *key, + DnsResourceRecord *rrsig, + DnsResourceRecord *dnskey, + usec_t realtime, + DnssecResult *result) { + + return -EOPNOTSUPP; +} + +int dnssec_rrsig_match_dnskey(DnsResourceRecord *rrsig, DnsResourceRecord *dnskey, bool revoked_ok) { + + return -EOPNOTSUPP; +} + +int dnssec_key_match_rrsig(const DnsResourceKey *key, DnsResourceRecord *rrsig) { + + return -EOPNOTSUPP; +} + +int dnssec_verify_rrset_search( + DnsAnswer *a, + const DnsResourceKey *key, + DnsAnswer *validated_dnskeys, + usec_t realtime, + DnssecResult *result, + DnsResourceRecord **ret_rrsig) { + + return -EOPNOTSUPP; +} + +int dnssec_has_rrsig(DnsAnswer *a, const DnsResourceKey *key) { + + return -EOPNOTSUPP; +} + +int dnssec_verify_dnskey_by_ds(DnsResourceRecord *dnskey, DnsResourceRecord *ds, bool mask_revoke) { + + return -EOPNOTSUPP; +} + +int dnssec_verify_dnskey_by_ds_search(DnsResourceRecord *dnskey, DnsAnswer *validated_ds) { + + return -EOPNOTSUPP; +} + +int dnssec_nsec3_hash(DnsResourceRecord *nsec3, const char *name, void *ret) { + + return -EOPNOTSUPP; +} + +int dnssec_nsec_test(DnsAnswer *answer, DnsResourceKey *key, DnssecNsecResult *result, bool *authenticated, uint32_t *ttl) { + + return -EOPNOTSUPP; +} + +int dnssec_test_positive_wildcard( + DnsAnswer *answer, + const char *name, + const char *source, + const char *zone, + bool *authenticated) { + + return -EOPNOTSUPP; +} + +#endif + +static const char* const dnssec_result_table[_DNSSEC_RESULT_MAX] = { + [DNSSEC_VALIDATED] = "validated", + [DNSSEC_VALIDATED_WILDCARD] = "validated-wildcard", + [DNSSEC_INVALID] = "invalid", + [DNSSEC_SIGNATURE_EXPIRED] = "signature-expired", + [DNSSEC_UNSUPPORTED_ALGORITHM] = "unsupported-algorithm", + [DNSSEC_NO_SIGNATURE] = "no-signature", + [DNSSEC_MISSING_KEY] = "missing-key", + [DNSSEC_UNSIGNED] = "unsigned", + [DNSSEC_FAILED_AUXILIARY] =
"failed-auxiliary", + [DNSSEC_NSEC_MISMATCH] = "nsec-mismatch", + [DNSSEC_INCOMPATIBLE_SERVER] = "incompatible-server", + [DNSSEC_TOO_MANY_VALIDATIONS] = "too-many-validations", +}; +DEFINE_STRING_TABLE_LOOKUP(dnssec_result, DnssecResult); + +static const char* const dnssec_verdict_table[_DNSSEC_VERDICT_MAX] = { + [DNSSEC_SECURE] = "secure", + [DNSSEC_INSECURE] = "insecure", + [DNSSEC_BOGUS] = "bogus", + [DNSSEC_INDETERMINATE] = "indeterminate", +}; +DEFINE_STRING_TABLE_LOOKUP(dnssec_verdict, DnssecVerdict); diff --git a/src/resolve/resolved-dns-dnssec.h b/src/resolve/resolved-dns-dnssec.h new file mode 100644 index 0000000..29b9013 --- /dev/null +++ b/src/resolve/resolved-dns-dnssec.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef enum DnssecResult DnssecResult; +typedef enum DnssecVerdict DnssecVerdict; + +#include "dns-domain.h" +#include "resolved-dns-answer.h" +#include "resolved-dns-rr.h" + +enum DnssecResult { + /* These six are returned by dnssec_verify_rrset() */ + DNSSEC_VALIDATED, + DNSSEC_VALIDATED_WILDCARD, /* Validated via a wildcard RRSIG, further NSEC/NSEC3 checks necessary */ + DNSSEC_INVALID, + DNSSEC_SIGNATURE_EXPIRED, + DNSSEC_UNSUPPORTED_ALGORITHM, + DNSSEC_TOO_MANY_VALIDATIONS, + + /* These two are added by dnssec_verify_rrset_search() */ + DNSSEC_NO_SIGNATURE, + DNSSEC_MISSING_KEY, + + /* These four are added by the DnsTransaction logic */ + DNSSEC_UNSIGNED, + DNSSEC_FAILED_AUXILIARY, + DNSSEC_NSEC_MISMATCH, + DNSSEC_INCOMPATIBLE_SERVER, + + _DNSSEC_RESULT_MAX, + _DNSSEC_RESULT_INVALID = -EINVAL, +}; + +enum DnssecVerdict { + DNSSEC_SECURE, + DNSSEC_INSECURE, + DNSSEC_BOGUS, + DNSSEC_INDETERMINATE, + + _DNSSEC_VERDICT_MAX, + _DNSSEC_VERDICT_INVALID = -EINVAL, +}; + +#define DNSSEC_CANONICAL_HOSTNAME_MAX (DNS_HOSTNAME_MAX + 2) + +/* The longest digest we'll ever generate, of all digest algorithms we support */ +#define DNSSEC_HASH_SIZE_MAX (MAX(20, 32)) + +/* The most invalid signatures we will
tolerate for a single rrset */ +#define DNSSEC_INVALID_MAX 5 + +/* The total number of signature validations we will tolerate for a single transaction */ +#define DNSSEC_VALIDATION_MAX 64 + +int dnssec_rrsig_match_dnskey(DnsResourceRecord *rrsig, DnsResourceRecord *dnskey, bool revoked_ok); +int dnssec_key_match_rrsig(const DnsResourceKey *key, DnsResourceRecord *rrsig); + +int dnssec_verify_rrset(DnsAnswer *answer, const DnsResourceKey *key, DnsResourceRecord *rrsig, DnsResourceRecord *dnskey, usec_t realtime, DnssecResult *result); +int dnssec_verify_rrset_search(DnsAnswer *answer, const DnsResourceKey *key, DnsAnswer *validated_dnskeys, usec_t realtime, DnssecResult *result, DnsResourceRecord **rrsig); + +int dnssec_verify_dnskey_by_ds(DnsResourceRecord *dnskey, DnsResourceRecord *ds, bool mask_revoke); +int dnssec_verify_dnskey_by_ds_search(DnsResourceRecord *dnskey, DnsAnswer *validated_ds); + +int dnssec_has_rrsig(DnsAnswer *a, const DnsResourceKey *key); + +uint16_t dnssec_keytag(DnsResourceRecord *dnskey, bool mask_revoke); + +int dnssec_nsec3_hash(DnsResourceRecord *nsec3, const char *name, void *ret); + +typedef enum DnssecNsecResult { + DNSSEC_NSEC_NO_RR, /* No suitable NSEC/NSEC3 RR found */ + DNSSEC_NSEC_CNAME, /* Didn't find what was asked for, but did find CNAME */ + DNSSEC_NSEC_UNSUPPORTED_ALGORITHM, + DNSSEC_NSEC_NXDOMAIN, + DNSSEC_NSEC_NODATA, + DNSSEC_NSEC_FOUND, + DNSSEC_NSEC_OPTOUT, +} DnssecNsecResult; + +int dnssec_nsec_test(DnsAnswer *answer, DnsResourceKey *key, DnssecNsecResult *result, bool *authenticated, uint32_t *ttl); + +int dnssec_test_positive_wildcard(DnsAnswer *a, const char *name, const char *source, const char *zone, bool *authenticated); + +const char* dnssec_result_to_string(DnssecResult m) _const_; +DnssecResult dnssec_result_from_string(const char *s) _pure_; + +const char* dnssec_verdict_to_string(DnssecVerdict m) _const_; +DnssecVerdict dnssec_verdict_from_string(const char *s) _pure_; diff --git 
a/src/resolve/resolved-dns-packet.c b/src/resolve/resolved-dns-packet.c new file mode 100644 index 0000000..426711b --- /dev/null +++ b/src/resolve/resolved-dns-packet.c @@ -0,0 +1,2686 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_GCRYPT +# include +#endif + +#include "alloc-util.h" +#include "dns-domain.h" +#include "memory-util.h" +#include "resolved-dns-packet.h" +#include "set.h" +#include "stdio-util.h" +#include "string-table.h" +#include "strv.h" +#include "unaligned.h" +#include "utf8.h" + +#define EDNS0_OPT_DO (1<<15) + +assert_cc(DNS_PACKET_SIZE_START > DNS_PACKET_HEADER_SIZE); + +typedef struct DnsPacketRewinder { + DnsPacket *packet; + size_t saved_rindex; +} DnsPacketRewinder; + +static void rewind_dns_packet(DnsPacketRewinder *rewinder) { + if (rewinder->packet) + dns_packet_rewind(rewinder->packet, rewinder->saved_rindex); +} + +#define REWINDER_INIT(p) { \ + .packet = (p), \ + .saved_rindex = (p)->rindex, \ + } +#define CANCEL_REWINDER(rewinder) do { (rewinder).packet = NULL; } while (0) + +int dns_packet_new( + DnsPacket **ret, + DnsProtocol protocol, + size_t min_alloc_dsize, + size_t max_size) { + + DnsPacket *p; + size_t a; + + assert(ret); + assert(max_size >= DNS_PACKET_HEADER_SIZE); + + if (max_size > DNS_PACKET_SIZE_MAX) + max_size = DNS_PACKET_SIZE_MAX; + + /* The caller may not check what is going to be truly allocated, so do not allow to + * allocate a DNS packet bigger than DNS_PACKET_SIZE_MAX. + */ + if (min_alloc_dsize > DNS_PACKET_SIZE_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EFBIG), + "Requested packet data size too big: %zu", + min_alloc_dsize); + + /* When dns_packet_new() is called with min_alloc_dsize == 0, allocate more than the + * absolute minimum (which is the dns packet header size), to avoid + * resizing immediately again after appending the first data to the packet. 
+ */ + if (min_alloc_dsize < DNS_PACKET_HEADER_SIZE) + a = DNS_PACKET_SIZE_START; + else + a = min_alloc_dsize; + + /* round up to next page size */ + a = PAGE_ALIGN(ALIGN(sizeof(DnsPacket)) + a) - ALIGN(sizeof(DnsPacket)); + + /* make sure we never allocate more than useful */ + if (a > max_size) + a = max_size; + + p = malloc0(ALIGN(sizeof(DnsPacket)) + a); + if (!p) + return -ENOMEM; + + *p = (DnsPacket) { + .n_ref = 1, + .protocol = protocol, + .size = DNS_PACKET_HEADER_SIZE, + .rindex = DNS_PACKET_HEADER_SIZE, + .allocated = a, + .max_size = max_size, + .opt_start = SIZE_MAX, + .opt_size = SIZE_MAX, + }; + + *ret = p; + + return 0; +} + +void dns_packet_set_flags(DnsPacket *p, bool dnssec_checking_disabled, bool truncated) { + + DnsPacketHeader *h; + + assert(p); + + h = DNS_PACKET_HEADER(p); + + switch (p->protocol) { + case DNS_PROTOCOL_LLMNR: + assert(!truncated); + + h->flags = htobe16(DNS_PACKET_MAKE_FLAGS(0 /* qr */, + 0 /* opcode */, + 0 /* c */, + 0 /* tc */, + 0 /* t */, + 0 /* ra */, + 0 /* ad */, + 0 /* cd */, + 0 /* rcode */)); + break; + + case DNS_PROTOCOL_MDNS: + h->flags = htobe16(DNS_PACKET_MAKE_FLAGS(0 /* qr */, + 0 /* opcode */, + 0 /* aa */, + truncated /* tc */, + 0 /* rd (ask for recursion) */, + 0 /* ra */, + 0 /* ad */, + 0 /* cd */, + 0 /* rcode */)); + break; + + default: + assert(!truncated); + + h->flags = htobe16(DNS_PACKET_MAKE_FLAGS(0 /* qr */, + 0 /* opcode */, + 0 /* aa */, + 0 /* tc */, + 1 /* rd (ask for recursion) */, + 0 /* ra */, + 0 /* ad */, + dnssec_checking_disabled /* cd */, + 0 /* rcode */)); + } +} + +int dns_packet_new_query(DnsPacket **ret, DnsProtocol protocol, size_t min_alloc_dsize, bool dnssec_checking_disabled) { + DnsPacket *p; + int r; + + assert(ret); + + r = dns_packet_new(&p, protocol, min_alloc_dsize, DNS_PACKET_SIZE_MAX); + if (r < 0) + return r; + + /* Always set the TC bit to 0 initially. + * If there are multiple packets later, we'll update the bit shortly before sending. 
+ */ + dns_packet_set_flags(p, dnssec_checking_disabled, false); + + *ret = p; + return 0; +} + +int dns_packet_dup(DnsPacket **ret, DnsPacket *p) { + DnsPacket *c; + int r; + + assert(ret); + assert(p); + + r = dns_packet_validate(p); + if (r < 0) + return r; + + c = malloc(ALIGN(sizeof(DnsPacket)) + p->size); + if (!c) + return -ENOMEM; + + *c = (DnsPacket) { + .n_ref = 1, + .protocol = p->protocol, + .size = p->size, + .rindex = DNS_PACKET_HEADER_SIZE, + .allocated = p->size, + .max_size = p->max_size, + .opt_start = SIZE_MAX, + .opt_size = SIZE_MAX, + }; + + memcpy(DNS_PACKET_DATA(c), DNS_PACKET_DATA(p), p->size); + + *ret = c; + return 0; +} + +DnsPacket *dns_packet_ref(DnsPacket *p) { + + if (!p) + return NULL; + + assert(!p->on_stack); + + assert(p->n_ref > 0); + p->n_ref++; + return p; +} + +static void dns_packet_free(DnsPacket *p) { + char *s; + + assert(p); + + dns_question_unref(p->question); + dns_answer_unref(p->answer); + dns_resource_record_unref(p->opt); + + while ((s = hashmap_steal_first_key(p->names))) + free(s); + hashmap_free(p->names); + + free(p->_data); + + if (!p->on_stack) + free(p); +} + +DnsPacket *dns_packet_unref(DnsPacket *p) { + if (!p) + return NULL; + + assert(p->n_ref > 0); + + dns_packet_unref(p->more); + + if (p->n_ref == 1) + dns_packet_free(p); + else + p->n_ref--; + + return NULL; +} + +int dns_packet_validate(DnsPacket *p) { + assert(p); + + if (p->size < DNS_PACKET_HEADER_SIZE) + return -EBADMSG; + + if (p->size > DNS_PACKET_SIZE_MAX) + return -EBADMSG; + + return 1; +} + +int dns_packet_validate_reply(DnsPacket *p) { + int r; + + assert(p); + + r = dns_packet_validate(p); + if (r < 0) + return r; + + if (DNS_PACKET_QR(p) != 1) + return 0; + + if (DNS_PACKET_OPCODE(p) != 0) + return -EBADMSG; + + switch (p->protocol) { + + case DNS_PROTOCOL_LLMNR: + /* RFC 4795, Section 2.1.1. 
says to discard all replies with QDCOUNT != 1 */ + if (DNS_PACKET_QDCOUNT(p) != 1) + return -EBADMSG; + + break; + + case DNS_PROTOCOL_MDNS: + /* RFC 6762, Section 18 */ + if (DNS_PACKET_RCODE(p) != 0) + return -EBADMSG; + + break; + + default: + break; + } + + return 1; +} + +int dns_packet_validate_query(DnsPacket *p) { + int r; + + assert(p); + + r = dns_packet_validate(p); + if (r < 0) + return r; + + if (DNS_PACKET_QR(p) != 0) + return 0; + + if (DNS_PACKET_OPCODE(p) != 0) + return -EBADMSG; + + switch (p->protocol) { + + case DNS_PROTOCOL_DNS: + if (DNS_PACKET_TC(p)) + return -EBADMSG; + + if (DNS_PACKET_QDCOUNT(p) != 1) + return -EBADMSG; + + if (DNS_PACKET_ANCOUNT(p) > 0) + return -EBADMSG; + + /* Note, in most cases, DNS query packet does not have authority section. But some query + * types, e.g. IXFR, have Authority sections. Hence, unlike the check for LLMNR, we do not + * check DNS_PACKET_NSCOUNT(p) here. */ + break; + + case DNS_PROTOCOL_LLMNR: + if (DNS_PACKET_TC(p)) + return -EBADMSG; + + /* RFC 4795, Section 2.1.1. says to discard all queries with QDCOUNT != 1 */ + if (DNS_PACKET_QDCOUNT(p) != 1) + return -EBADMSG; + + /* RFC 4795, Section 2.1.1. says to discard all queries with ANCOUNT != 0 */ + if (DNS_PACKET_ANCOUNT(p) > 0) + return -EBADMSG; + + /* RFC 4795, Section 2.1.1. says to discard all queries with NSCOUNT != 0 */ + if (DNS_PACKET_NSCOUNT(p) > 0) + return -EBADMSG; + + break; + + case DNS_PROTOCOL_MDNS: + /* Note, mDNS query may have truncation flag. So, unlike the check for DNS and LLMNR, + * we do not check DNS_PACKET_TC(p) here. */ + + /* RFC 6762, Section 18 specifies that messages with non-zero RCODE + * must be silently ignored, and that we must ignore the values of + * AA, RD, RA, AD, and CD bits. 
*/ + if (DNS_PACKET_RCODE(p) != 0) + return -EBADMSG; + + break; + + default: + break; + } + + return 1; +} + +static int dns_packet_extend(DnsPacket *p, size_t add, void **ret, size_t *start) { + assert(p); + + if (p->size + add > p->allocated) { + size_t a, ms; + + a = PAGE_ALIGN((p->size + add) * 2); + + ms = dns_packet_size_max(p); + if (a > ms) + a = ms; + + if (p->size + add > a) + return -EMSGSIZE; + + if (p->_data) { + void *d; + + d = realloc(p->_data, a); + if (!d) + return -ENOMEM; + + p->_data = d; + } else { + p->_data = malloc(a); + if (!p->_data) + return -ENOMEM; + + memcpy(p->_data, (uint8_t*) p + ALIGN(sizeof(DnsPacket)), p->size); + memzero((uint8_t*) p->_data + p->size, a - p->size); + } + + p->allocated = a; + } + + if (start) + *start = p->size; + + if (ret) + *ret = (uint8_t*) DNS_PACKET_DATA(p) + p->size; + + p->size += add; + return 0; +} + +void dns_packet_truncate(DnsPacket *p, size_t sz) { + char *s; + void *n; + + assert(p); + + if (p->size <= sz) + return; + + HASHMAP_FOREACH_KEY(n, s, p->names) { + + if (PTR_TO_SIZE(n) < sz) + continue; + + hashmap_remove(p->names, s); + free(s); + } + + p->size = sz; +} + +int dns_packet_append_blob(DnsPacket *p, const void *d, size_t l, size_t *start) { + void *q; + int r; + + assert(p); + + r = dns_packet_extend(p, l, &q, start); + if (r < 0) + return r; + + memcpy_safe(q, d, l); + return 0; +} + +int dns_packet_append_uint8(DnsPacket *p, uint8_t v, size_t *start) { + void *d; + int r; + + assert(p); + + r = dns_packet_extend(p, sizeof(uint8_t), &d, start); + if (r < 0) + return r; + + ((uint8_t*) d)[0] = v; + + return 0; +} + +int dns_packet_append_uint16(DnsPacket *p, uint16_t v, size_t *start) { + void *d; + int r; + + assert(p); + + r = dns_packet_extend(p, sizeof(uint16_t), &d, start); + if (r < 0) + return r; + + unaligned_write_be16(d, v); + + return 0; +} + +int dns_packet_append_uint32(DnsPacket *p, uint32_t v, size_t *start) { + void *d; + int r; + + assert(p); + + r = 
dns_packet_extend(p, sizeof(uint32_t), &d, start); + if (r < 0) + return r; + + unaligned_write_be32(d, v); + + return 0; +} + +int dns_packet_append_string(DnsPacket *p, const char *s, size_t *start) { + assert(p); + assert(s); + + return dns_packet_append_raw_string(p, s, strlen(s), start); +} + +int dns_packet_append_raw_string(DnsPacket *p, const void *s, size_t size, size_t *start) { + void *d; + int r; + + assert(p); + assert(s || size == 0); + + if (size > 255) + return -E2BIG; + + r = dns_packet_extend(p, 1 + size, &d, start); + if (r < 0) + return r; + + ((uint8_t*) d)[0] = (uint8_t) size; + + memcpy_safe(((uint8_t*) d) + 1, s, size); + + return 0; +} + +int dns_packet_append_label(DnsPacket *p, const char *d, size_t l, bool canonical_candidate, size_t *start) { + uint8_t *w; + int r; + + /* Append a label to a packet. Optionally, does this in DNSSEC + * canonical form, if this label is marked as a candidate for + * it, and the canonical form logic is enabled for the + * packet */ + + assert(p); + assert(d); + + if (l > DNS_LABEL_MAX) + return -E2BIG; + + r = dns_packet_extend(p, 1 + l, (void**) &w, start); + if (r < 0) + return r; + + *(w++) = (uint8_t) l; + + if (p->canonical_form && canonical_candidate) + /* Generate in canonical form, as defined by DNSSEC + * RFC 4034, Section 6.2, i.e. all lower-case. */ + for (size_t i = 0; i < l; i++) + w[i] = (uint8_t) ascii_tolower(d[i]); + else + /* Otherwise, just copy the string unaltered. This is + * essential for DNS-SD, where the casing of labels + * matters and needs to be retained. 
*/ + memcpy(w, d, l); + + return 0; +} + +int dns_packet_append_name( + DnsPacket *p, + const char *name, + bool allow_compression, + bool canonical_candidate, + size_t *start) { + + size_t saved_size; + int r; + + assert(p); + assert(name); + + if (p->refuse_compression) + allow_compression = false; + + saved_size = p->size; + + while (!dns_name_is_root(name)) { + const char *z = name; + char label[DNS_LABEL_MAX]; + size_t n = 0; + + if (allow_compression) + n = PTR_TO_SIZE(hashmap_get(p->names, name)); + if (n > 0) { + assert(n < p->size); + + if (n < 0x4000) { + r = dns_packet_append_uint16(p, 0xC000 | n, NULL); + if (r < 0) + goto fail; + + goto done; + } + } + + r = dns_label_unescape(&name, label, sizeof label, 0); + if (r < 0) + goto fail; + + r = dns_packet_append_label(p, label, r, canonical_candidate, &n); + if (r < 0) + goto fail; + + if (allow_compression) { + _cleanup_free_ char *s = NULL; + + s = strdup(z); + if (!s) { + r = -ENOMEM; + goto fail; + } + + r = hashmap_ensure_put(&p->names, &dns_name_hash_ops, s, SIZE_TO_PTR(n)); + if (r < 0) + goto fail; + + TAKE_PTR(s); + } + } + + r = dns_packet_append_uint8(p, 0, NULL); + if (r < 0) + return r; + +done: + if (start) + *start = saved_size; + + return 0; + +fail: + dns_packet_truncate(p, saved_size); + return r; +} + +int dns_packet_append_key(DnsPacket *p, const DnsResourceKey *k, const DnsAnswerFlags flags, size_t *start) { + size_t saved_size; + uint16_t class; + int r; + + assert(p); + assert(k); + + saved_size = p->size; + + r = dns_packet_append_name(p, dns_resource_key_name(k), true, true, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, k->type, NULL); + if (r < 0) + goto fail; + + class = flags & DNS_ANSWER_CACHE_FLUSH ? 
k->class | MDNS_RR_CACHE_FLUSH_OR_QU : k->class; + r = dns_packet_append_uint16(p, class, NULL); + if (r < 0) + goto fail; + + if (start) + *start = saved_size; + + return 0; + +fail: + dns_packet_truncate(p, saved_size); + return r; +} + +static int dns_packet_append_type_window(DnsPacket *p, uint8_t window, uint8_t length, const uint8_t *types, size_t *start) { + size_t saved_size; + int r; + + assert(p); + assert(types); + assert(length > 0); + + saved_size = p->size; + + r = dns_packet_append_uint8(p, window, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, length, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, types, length, NULL); + if (r < 0) + goto fail; + + if (start) + *start = saved_size; + + return 0; +fail: + dns_packet_truncate(p, saved_size); + return r; +} + +static int dns_packet_append_types(DnsPacket *p, Bitmap *types, size_t *start) { + uint8_t window = 0; + uint8_t entry = 0; + uint8_t bitmaps[32] = {}; + unsigned n; + size_t saved_size; + int r; + + assert(p); + + saved_size = p->size; + + BITMAP_FOREACH(n, types) { + assert(n <= 0xffff); + + if ((n >> 8) != window && bitmaps[entry / 8] != 0) { + r = dns_packet_append_type_window(p, window, entry / 8 + 1, bitmaps, NULL); + if (r < 0) + goto fail; + + zero(bitmaps); + } + + window = n >> 8; + entry = n & 255; + + bitmaps[entry / 8] |= 1 << (7 - (entry % 8)); + } + + if (bitmaps[entry / 8] != 0) { + r = dns_packet_append_type_window(p, window, entry / 8 + 1, bitmaps, NULL); + if (r < 0) + goto fail; + } + + if (start) + *start = saved_size; + + return 0; +fail: + dns_packet_truncate(p, saved_size); + return r; +} + +/* Append the OPT pseudo-RR described in RFC6891 */ +int dns_packet_append_opt( + DnsPacket *p, + uint16_t max_udp_size, + bool edns0_do, + bool include_rfc6975, + const char *nsid, + int rcode, + size_t *ret_start) { + + size_t saved_size; + int r; + + assert(p); + /* we must never advertise supported packet size smaller than the legacy max */ 
+ assert(max_udp_size >= DNS_PACKET_UNICAST_SIZE_MAX); + assert(rcode >= 0); + assert(rcode <= _DNS_RCODE_MAX); + + if (p->opt_start != SIZE_MAX) + return -EBUSY; + + assert(p->opt_size == SIZE_MAX); + + saved_size = p->size; + + /* empty name */ + r = dns_packet_append_uint8(p, 0, NULL); + if (r < 0) + return r; + + /* type */ + r = dns_packet_append_uint16(p, DNS_TYPE_OPT, NULL); + if (r < 0) + goto fail; + + /* class: maximum udp packet that can be received */ + r = dns_packet_append_uint16(p, max_udp_size, NULL); + if (r < 0) + goto fail; + + /* extended RCODE and VERSION */ + r = dns_packet_append_uint16(p, ((uint16_t) rcode & 0x0FF0) << 4, NULL); + if (r < 0) + goto fail; + + /* flags: DNSSEC OK (DO), see RFC3225 */ + r = dns_packet_append_uint16(p, edns0_do ? EDNS0_OPT_DO : 0, NULL); + if (r < 0) + goto fail; + + if (edns0_do && include_rfc6975) { + /* If DO is on and this is requested, also append RFC6975 Algorithm data. This is supposed to + * be done on queries, not on replies, hencer callers should turn this off when finishing off + * replies. 
*/ + + static const uint8_t rfc6975[] = { + + 0, 5, /* OPTION_CODE: DAU */ +#if PREFER_OPENSSL || (HAVE_GCRYPT && GCRYPT_VERSION_NUMBER >= 0x010600) + 0, 7, /* LIST_LENGTH */ +#else + 0, 6, /* LIST_LENGTH */ +#endif + DNSSEC_ALGORITHM_RSASHA1, + DNSSEC_ALGORITHM_RSASHA1_NSEC3_SHA1, + DNSSEC_ALGORITHM_RSASHA256, + DNSSEC_ALGORITHM_RSASHA512, + DNSSEC_ALGORITHM_ECDSAP256SHA256, + DNSSEC_ALGORITHM_ECDSAP384SHA384, +#if PREFER_OPENSSL || (HAVE_GCRYPT && GCRYPT_VERSION_NUMBER >= 0x010600) + DNSSEC_ALGORITHM_ED25519, +#endif + + 0, 6, /* OPTION_CODE: DHU */ + 0, 3, /* LIST_LENGTH */ + DNSSEC_DIGEST_SHA1, + DNSSEC_DIGEST_SHA256, + DNSSEC_DIGEST_SHA384, + + 0, 7, /* OPTION_CODE: N3U */ + 0, 1, /* LIST_LENGTH */ + NSEC3_ALGORITHM_SHA1, + }; + + r = dns_packet_append_uint16(p, sizeof(rfc6975), NULL); /* RDLENGTH */ + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rfc6975, sizeof(rfc6975), NULL); /* the payload, as defined above */ + + } else if (nsid) { + + if (strlen(nsid) > UINT16_MAX - 4) { + r = -E2BIG; + goto fail; + } + + r = dns_packet_append_uint16(p, 4 + strlen(nsid), NULL); /* RDLENGTH */ + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, 3, NULL); /* OPTION-CODE: NSID */ + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, strlen(nsid), NULL); /* OPTION-LENGTH */ + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, nsid, strlen(nsid), NULL); + } else + r = dns_packet_append_uint16(p, 0, NULL); + if (r < 0) + goto fail; + + DNS_PACKET_HEADER(p)->arcount = htobe16(DNS_PACKET_ARCOUNT(p) + 1); + + p->opt_start = saved_size; + p->opt_size = p->size - saved_size; + + if (ret_start) + *ret_start = saved_size; + + return 0; + +fail: + dns_packet_truncate(p, saved_size); + return r; +} + +int dns_packet_truncate_opt(DnsPacket *p) { + assert(p); + + if (p->opt_start == SIZE_MAX) { + assert(p->opt_size == SIZE_MAX); + return 0; + } + + assert(p->opt_size != SIZE_MAX); + assert(DNS_PACKET_ARCOUNT(p) > 0); + + if (p->opt_start + 
p->opt_size != p->size) + return -EBUSY; + + dns_packet_truncate(p, p->opt_start); + DNS_PACKET_HEADER(p)->arcount = htobe16(DNS_PACKET_ARCOUNT(p) - 1); + p->opt_start = p->opt_size = SIZE_MAX; + + return 1; +} + +int dns_packet_append_rr(DnsPacket *p, const DnsResourceRecord *rr, const DnsAnswerFlags flags, size_t *start, size_t *rdata_start) { + + size_t saved_size, rdlength_offset, end, rdlength, rds; + uint32_t ttl; + int r; + + assert(p); + assert(rr); + + saved_size = p->size; + + r = dns_packet_append_key(p, rr->key, flags, NULL); + if (r < 0) + goto fail; + + ttl = flags & DNS_ANSWER_GOODBYE ? 0 : rr->ttl; + r = dns_packet_append_uint32(p, ttl, NULL); + if (r < 0) + goto fail; + + /* Initially we write 0 here */ + r = dns_packet_append_uint16(p, 0, &rdlength_offset); + if (r < 0) + goto fail; + + rds = p->size - saved_size; + + switch (rr->unparsable ? _DNS_TYPE_INVALID : rr->key->type) { + + case DNS_TYPE_SRV: + r = dns_packet_append_uint16(p, rr->srv.priority, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, rr->srv.weight, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, rr->srv.port, NULL); + if (r < 0) + goto fail; + + /* RFC 2782 states "Unless and until permitted by future standards action, name compression + * is not to be used for this field." Hence we turn off compression here. 
*/ + r = dns_packet_append_name(p, rr->srv.name, /* allow_compression= */ false, /* canonical_candidate= */ true, NULL); + break; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + r = dns_packet_append_name(p, rr->ptr.name, true, true, NULL); + break; + + case DNS_TYPE_HINFO: + r = dns_packet_append_string(p, rr->hinfo.cpu, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_string(p, rr->hinfo.os, NULL); + break; + + case DNS_TYPE_SPF: /* exactly the same as TXT */ + case DNS_TYPE_TXT: + + if (!rr->txt.items) { + /* RFC 6763, section 6.1 suggests to generate + * single empty string for an empty array. */ + + r = dns_packet_append_raw_string(p, NULL, 0, NULL); + if (r < 0) + goto fail; + } else + LIST_FOREACH(items, i, rr->txt.items) { + r = dns_packet_append_raw_string(p, i->data, i->length, NULL); + if (r < 0) + goto fail; + } + + r = 0; + break; + + case DNS_TYPE_A: + r = dns_packet_append_blob(p, &rr->a.in_addr, sizeof(struct in_addr), NULL); + break; + + case DNS_TYPE_AAAA: + r = dns_packet_append_blob(p, &rr->aaaa.in6_addr, sizeof(struct in6_addr), NULL); + break; + + case DNS_TYPE_SOA: + r = dns_packet_append_name(p, rr->soa.mname, true, true, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_name(p, rr->soa.rname, true, true, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->soa.serial, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->soa.refresh, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->soa.retry, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->soa.expire, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->soa.minimum, NULL); + break; + + case DNS_TYPE_MX: + r = dns_packet_append_uint16(p, rr->mx.priority, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_name(p, rr->mx.exchange, true, true, NULL); + break; + + case DNS_TYPE_LOC: + r = dns_packet_append_uint8(p, 
rr->loc.version, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->loc.size, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->loc.horiz_pre, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->loc.vert_pre, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->loc.latitude, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->loc.longitude, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->loc.altitude, NULL); + break; + + case DNS_TYPE_DS: + r = dns_packet_append_uint16(p, rr->ds.key_tag, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->ds.algorithm, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->ds.digest_type, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->ds.digest, rr->ds.digest_size, NULL); + break; + + case DNS_TYPE_SSHFP: + r = dns_packet_append_uint8(p, rr->sshfp.algorithm, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->sshfp.fptype, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->sshfp.fingerprint, rr->sshfp.fingerprint_size, NULL); + break; + + case DNS_TYPE_DNSKEY: + r = dns_packet_append_uint16(p, rr->dnskey.flags, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->dnskey.protocol, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->dnskey.algorithm, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->dnskey.key, rr->dnskey.key_size, NULL); + break; + + case DNS_TYPE_RRSIG: + r = dns_packet_append_uint16(p, rr->rrsig.type_covered, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->rrsig.algorithm, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->rrsig.labels, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->rrsig.original_ttl, NULL); + if (r < 0) + goto fail; + + r = 
dns_packet_append_uint32(p, rr->rrsig.expiration, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint32(p, rr->rrsig.inception, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, rr->rrsig.key_tag, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_name(p, rr->rrsig.signer, false, true, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->rrsig.signature, rr->rrsig.signature_size, NULL); + break; + + case DNS_TYPE_NSEC: + r = dns_packet_append_name(p, rr->nsec.next_domain_name, false, false, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_types(p, rr->nsec.types, NULL); + if (r < 0) + goto fail; + + break; + + case DNS_TYPE_NSEC3: + r = dns_packet_append_uint8(p, rr->nsec3.algorithm, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->nsec3.flags, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint16(p, rr->nsec3.iterations, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->nsec3.salt_size, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->nsec3.salt, rr->nsec3.salt_size, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->nsec3.next_hashed_name_size, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->nsec3.next_hashed_name, rr->nsec3.next_hashed_name_size, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_types(p, rr->nsec3.types, NULL); + if (r < 0) + goto fail; + + break; + + case DNS_TYPE_TLSA: + r = dns_packet_append_uint8(p, rr->tlsa.cert_usage, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->tlsa.selector, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_uint8(p, rr->tlsa.matching_type, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->tlsa.data, rr->tlsa.data_size, NULL); + break; + + case DNS_TYPE_CAA: + r = dns_packet_append_uint8(p, rr->caa.flags, NULL); + if (r < 0) + goto fail; + + r = 
dns_packet_append_string(p, rr->caa.tag, NULL); + if (r < 0) + goto fail; + + r = dns_packet_append_blob(p, rr->caa.value, rr->caa.value_size, NULL); + break; + + case DNS_TYPE_OPT: + case DNS_TYPE_OPENPGPKEY: + case _DNS_TYPE_INVALID: /* unparsable */ + default: + + r = dns_packet_append_blob(p, rr->generic.data, rr->generic.data_size, NULL); + break; + } + if (r < 0) + goto fail; + + /* Let's calculate the actual data size and update the field */ + rdlength = p->size - rdlength_offset - sizeof(uint16_t); + if (rdlength > 0xFFFF) { + r = -ENOSPC; + goto fail; + } + + end = p->size; + p->size = rdlength_offset; + r = dns_packet_append_uint16(p, rdlength, NULL); + if (r < 0) + goto fail; + p->size = end; + + if (start) + *start = saved_size; + + if (rdata_start) + *rdata_start = rds; + + return 0; + +fail: + dns_packet_truncate(p, saved_size); + return r; +} + +int dns_packet_append_question(DnsPacket *p, DnsQuestion *q) { + DnsResourceKey *key; + int r; + + assert(p); + + DNS_QUESTION_FOREACH(key, q) { + r = dns_packet_append_key(p, key, 0, NULL); + if (r < 0) + return r; + } + + return 0; +} + +int dns_packet_append_answer(DnsPacket *p, DnsAnswer *a, unsigned *completed) { + DnsResourceRecord *rr; + DnsAnswerFlags flags; + int r; + + assert(p); + + DNS_ANSWER_FOREACH_FLAGS(rr, flags, a) { + r = dns_packet_append_rr(p, rr, flags, NULL, NULL); + if (r < 0) + return r; + + if (completed) + (*completed)++; + } + + return 0; +} + +int dns_packet_read(DnsPacket *p, size_t sz, const void **ret, size_t *start) { + assert(p); + assert(p->rindex <= p->size); + + if (sz > p->size - p->rindex) + return -EMSGSIZE; + + if (ret) + *ret = (uint8_t*) DNS_PACKET_DATA(p) + p->rindex; + + if (start) + *start = p->rindex; + + p->rindex += sz; + return 0; +} + +void dns_packet_rewind(DnsPacket *p, size_t idx) { + assert(p); + assert(idx <= p->size); + assert(idx >= DNS_PACKET_HEADER_SIZE); + + p->rindex = idx; +} + +int dns_packet_read_blob(DnsPacket *p, void *d, size_t sz, size_t 
*start) { + const void *q; + int r; + + assert(p); + assert(d); + + r = dns_packet_read(p, sz, &q, start); + if (r < 0) + return r; + + memcpy(d, q, sz); + return 0; +} + +static int dns_packet_read_memdup( + DnsPacket *p, size_t size, + void **ret, size_t *ret_size, + size_t *ret_start) { + + const void *src; + size_t start; + int r; + + assert(p); + assert(ret); + + r = dns_packet_read(p, size, &src, &start); + if (r < 0) + return r; + + if (size <= 0) + *ret = NULL; + else { + void *copy; + + copy = memdup(src, size); + if (!copy) + return -ENOMEM; + + *ret = copy; + } + + if (ret_size) + *ret_size = size; + if (ret_start) + *ret_start = start; + + return 0; +} + +int dns_packet_read_uint8(DnsPacket *p, uint8_t *ret, size_t *start) { + const void *d; + int r; + + assert(p); + + r = dns_packet_read(p, sizeof(uint8_t), &d, start); + if (r < 0) + return r; + + *ret = ((uint8_t*) d)[0]; + return 0; +} + +int dns_packet_read_uint16(DnsPacket *p, uint16_t *ret, size_t *start) { + const void *d; + int r; + + assert(p); + + r = dns_packet_read(p, sizeof(uint16_t), &d, start); + if (r < 0) + return r; + + if (ret) + *ret = unaligned_read_be16(d); + + return 0; +} + +int dns_packet_read_uint32(DnsPacket *p, uint32_t *ret, size_t *start) { + const void *d; + int r; + + assert(p); + + r = dns_packet_read(p, sizeof(uint32_t), &d, start); + if (r < 0) + return r; + + *ret = unaligned_read_be32(d); + + return 0; +} + +int dns_packet_read_string(DnsPacket *p, char **ret, size_t *start) { + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + _cleanup_free_ char *t = NULL; + const void *d; + uint8_t c; + int r; + + assert(p); + + r = dns_packet_read_uint8(p, &c, NULL); + if (r < 0) + return r; + + r = dns_packet_read(p, c, &d, NULL); + if (r < 0) + return r; + + r = make_cstring(d, c, MAKE_CSTRING_REFUSE_TRAILING_NUL, &t); + if (r < 0) + return r; + + if (!utf8_is_valid(t)) + return -EBADMSG; + + *ret = TAKE_PTR(t); + + if (start) + *start = 
rewinder.saved_rindex; + CANCEL_REWINDER(rewinder); + + return 0; +} + +int dns_packet_read_raw_string(DnsPacket *p, const void **ret, size_t *size, size_t *start) { + assert(p); + + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + uint8_t c; + int r; + + r = dns_packet_read_uint8(p, &c, NULL); + if (r < 0) + return r; + + r = dns_packet_read(p, c, ret, NULL); + if (r < 0) + return r; + + if (size) + *size = c; + if (start) + *start = rewinder.saved_rindex; + CANCEL_REWINDER(rewinder); + + return 0; +} + +int dns_packet_read_name( + DnsPacket *p, + char **ret, + bool allow_compression, + size_t *ret_start) { + + assert(p); + + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + size_t after_rindex = 0, jump_barrier = p->rindex; + _cleanup_free_ char *name = NULL; + bool first = true; + size_t n = 0; + int r; + + if (p->refuse_compression) + allow_compression = false; + + for (;;) { + uint8_t c, d; + + r = dns_packet_read_uint8(p, &c, NULL); + if (r < 0) + return r; + + if (c == 0) + /* End of name */ + break; + else if (c <= 63) { + const char *label; + + /* Literal label */ + r = dns_packet_read(p, c, (const void**) &label, NULL); + if (r < 0) + return r; + + if (!GREEDY_REALLOC(name, n + !first + DNS_LABEL_ESCAPED_MAX)) + return -ENOMEM; + + if (first) + first = false; + else + name[n++] = '.'; + + r = dns_label_escape(label, c, name + n, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + n += r; + continue; + } else if (allow_compression && FLAGS_SET(c, 0xc0)) { + uint16_t ptr; + + /* Pointer */ + r = dns_packet_read_uint8(p, &d, NULL); + if (r < 0) + return r; + + ptr = (uint16_t) (c & ~0xc0) << 8 | (uint16_t) d; + if (ptr < DNS_PACKET_HEADER_SIZE || ptr >= jump_barrier) + return -EBADMSG; + + if (after_rindex == 0) + after_rindex = p->rindex; + + /* Jumps are limited to a "prior occurrence" (RFC-1035 4.1.4) */ + jump_barrier = ptr; + p->rindex = ptr; + } else + return -EBADMSG; + } + + if 
(!GREEDY_REALLOC(name, n + 1)) + return -ENOMEM; + + name[n] = 0; + + if (after_rindex != 0) + p->rindex= after_rindex; + + if (ret) + *ret = TAKE_PTR(name); + if (ret_start) + *ret_start = rewinder.saved_rindex; + + CANCEL_REWINDER(rewinder); + + return 0; +} + +static int dns_packet_read_type_window(DnsPacket *p, Bitmap **types, size_t *start) { + assert(p); + assert(types); + + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + uint8_t window, length; + const uint8_t *bitmap; + uint8_t bit = 0; + bool found = false; + int r; + + r = bitmap_ensure_allocated(types); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &window, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &length, NULL); + if (r < 0) + return r; + + if (length == 0 || length > 32) + return -EBADMSG; + + r = dns_packet_read(p, length, (const void **)&bitmap, NULL); + if (r < 0) + return r; + + for (uint8_t i = 0; i < length; i++) { + uint8_t bitmask = 1 << 7; + + if (!bitmap[i]) { + found = false; + bit += 8; + continue; + } + + found = true; + + for (; bitmask; bit++, bitmask >>= 1) + if (bitmap[i] & bitmask) { + uint16_t n; + + n = (uint16_t) window << 8 | (uint16_t) bit; + + /* Ignore pseudo-types. 
see RFC4034 section 4.1.2 */ + if (dns_type_is_pseudo(n)) + continue; + + r = bitmap_set(*types, n); + if (r < 0) + return r; + } + } + + if (!found) + return -EBADMSG; + + if (start) + *start = rewinder.saved_rindex; + CANCEL_REWINDER(rewinder); + + return 0; +} + +static int dns_packet_read_type_windows(DnsPacket *p, Bitmap **types, size_t size, size_t *start) { + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + int r; + + while (p->rindex - rewinder.saved_rindex < size) { + r = dns_packet_read_type_window(p, types, NULL); + if (r < 0) + return r; + + assert(p->rindex >= rewinder.saved_rindex); + + /* don't read past end of current RR */ + if (p->rindex - rewinder.saved_rindex > size) + return -EBADMSG; + } + + if (p->rindex - rewinder.saved_rindex != size) + return -EBADMSG; + + if (start) + *start = rewinder.saved_rindex; + CANCEL_REWINDER(rewinder); + + return 0; +} + +int dns_packet_read_key( + DnsPacket *p, + DnsResourceKey **ret, + bool *ret_cache_flush_or_qu, + size_t *ret_start) { + + assert(p); + + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + _cleanup_free_ char *name = NULL; + bool cache_flush_or_qu = false; + uint16_t class, type; + int r; + + r = dns_packet_read_name(p, &name, true, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint16(p, &type, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint16(p, &class, NULL); + if (r < 0) + return r; + + if (p->protocol == DNS_PROTOCOL_MDNS) { + /* See RFC6762, sections 5.4 and 10.2 */ + + if (type != DNS_TYPE_OPT && (class & MDNS_RR_CACHE_FLUSH_OR_QU)) { + class &= ~MDNS_RR_CACHE_FLUSH_OR_QU; + cache_flush_or_qu = true; + } + } + + if (ret) { + DnsResourceKey *key; + + key = dns_resource_key_new_consume(class, type, name); + if (!key) + return -ENOMEM; + + TAKE_PTR(name); + *ret = key; + } + + if (ret_cache_flush_or_qu) + *ret_cache_flush_or_qu = cache_flush_or_qu; + if (ret_start) + *ret_start = rewinder.saved_rindex; + + 
CANCEL_REWINDER(rewinder); + return 0; +} + +static bool loc_size_ok(uint8_t size) { + uint8_t m = size >> 4, e = size & 0xF; + + return m <= 9 && e <= 9 && (m > 0 || e == 0); +} + +int dns_packet_read_rr( + DnsPacket *p, + DnsResourceRecord **ret, + bool *ret_cache_flush, + size_t *ret_start) { + + assert(p); + + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + size_t offset; + uint16_t rdlength; + bool cache_flush; + int r; + + r = dns_packet_read_key(p, &key, &cache_flush, NULL); + if (r < 0) + return r; + + if (!dns_class_is_valid_rr(key->class) || !dns_type_is_valid_rr(key->type)) + return -EBADMSG; + + rr = dns_resource_record_new(key); + if (!rr) + return -ENOMEM; + + r = dns_packet_read_uint32(p, &rr->ttl, NULL); + if (r < 0) + return r; + + /* RFC 2181, Section 8, suggests to + * treat a TTL with the MSB set as a zero TTL. */ + if (rr->ttl & UINT32_C(0x80000000)) + rr->ttl = 0; + + r = dns_packet_read_uint16(p, &rdlength, NULL); + if (r < 0) + return r; + + if (rdlength > p->size - p->rindex) + return -EBADMSG; + + offset = p->rindex; + + switch (rr->key->type) { + + case DNS_TYPE_SRV: + r = dns_packet_read_uint16(p, &rr->srv.priority, NULL); + if (r < 0) + return r; + r = dns_packet_read_uint16(p, &rr->srv.weight, NULL); + if (r < 0) + return r; + r = dns_packet_read_uint16(p, &rr->srv.port, NULL); + if (r < 0) + return r; + + /* RFC 2782 states "Unless and until permitted by future standards action, name compression + * is not to be used for this field." Nonetheless, we support it here, in the interest of + * increasing compatibility with implementations that do not implement this correctly. After + * all we didn't do this right once upon a time ourselves (see + * https://github.com/systemd/systemd/issues/9793). 
*/ + r = dns_packet_read_name(p, &rr->srv.name, /* allow_compression= */ true, NULL); + break; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + r = dns_packet_read_name(p, &rr->ptr.name, true, NULL); + break; + + case DNS_TYPE_HINFO: + r = dns_packet_read_string(p, &rr->hinfo.cpu, NULL); + if (r < 0) + return r; + + r = dns_packet_read_string(p, &rr->hinfo.os, NULL); + break; + + case DNS_TYPE_SPF: /* exactly the same as TXT */ + case DNS_TYPE_TXT: + if (rdlength <= 0) { + r = dns_txt_item_new_empty(&rr->txt.items); + if (r < 0) + return r; + } else { + DnsTxtItem *last = NULL; + + while (p->rindex - offset < rdlength) { + DnsTxtItem *i; + const void *data; + size_t sz; + + r = dns_packet_read_raw_string(p, &data, &sz, NULL); + if (r < 0) + return r; + + i = malloc0(offsetof(DnsTxtItem, data) + sz + 1); /* extra NUL byte at the end */ + if (!i) + return -ENOMEM; + + memcpy(i->data, data, sz); + i->length = sz; + + LIST_INSERT_AFTER(items, rr->txt.items, last, i); + last = i; + } + } + + r = 0; + break; + + case DNS_TYPE_A: + r = dns_packet_read_blob(p, &rr->a.in_addr, sizeof(struct in_addr), NULL); + break; + + case DNS_TYPE_AAAA: + r = dns_packet_read_blob(p, &rr->aaaa.in6_addr, sizeof(struct in6_addr), NULL); + break; + + case DNS_TYPE_SOA: + r = dns_packet_read_name(p, &rr->soa.mname, true, NULL); + if (r < 0) + return r; + + r = dns_packet_read_name(p, &rr->soa.rname, true, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->soa.serial, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->soa.refresh, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->soa.retry, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->soa.expire, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->soa.minimum, NULL); + break; + + case DNS_TYPE_MX: + r = dns_packet_read_uint16(p, &rr->mx.priority, NULL); + if (r < 0) + return r; + + r = 
dns_packet_read_name(p, &rr->mx.exchange, true, NULL); + break; + + case DNS_TYPE_LOC: { + uint8_t t; + size_t pos; + + r = dns_packet_read_uint8(p, &t, &pos); + if (r < 0) + return r; + + if (t == 0) { + rr->loc.version = t; + + r = dns_packet_read_uint8(p, &rr->loc.size, NULL); + if (r < 0) + return r; + + if (!loc_size_ok(rr->loc.size)) + return -EBADMSG; + + r = dns_packet_read_uint8(p, &rr->loc.horiz_pre, NULL); + if (r < 0) + return r; + + if (!loc_size_ok(rr->loc.horiz_pre)) + return -EBADMSG; + + r = dns_packet_read_uint8(p, &rr->loc.vert_pre, NULL); + if (r < 0) + return r; + + if (!loc_size_ok(rr->loc.vert_pre)) + return -EBADMSG; + + r = dns_packet_read_uint32(p, &rr->loc.latitude, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->loc.longitude, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->loc.altitude, NULL); + if (r < 0) + return r; + + break; + } else { + dns_packet_rewind(p, pos); + rr->unparsable = true; + goto unparsable; + } + } + + case DNS_TYPE_DS: + r = dns_packet_read_uint16(p, &rr->ds.key_tag, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->ds.algorithm, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->ds.digest_type, NULL); + if (r < 0) + return r; + + if (rdlength < 4) + return -EBADMSG; + + r = dns_packet_read_memdup(p, rdlength - 4, + &rr->ds.digest, &rr->ds.digest_size, + NULL); + if (r < 0) + return r; + + if (rr->ds.digest_size <= 0) + /* the accepted size depends on the algorithm, but for now + just ensure that the value is greater than zero */ + return -EBADMSG; + + break; + + case DNS_TYPE_SSHFP: + r = dns_packet_read_uint8(p, &rr->sshfp.algorithm, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->sshfp.fptype, NULL); + if (r < 0) + return r; + + if (rdlength < 2) + return -EBADMSG; + + r = dns_packet_read_memdup(p, rdlength - 2, + &rr->sshfp.fingerprint, &rr->sshfp.fingerprint_size, + NULL); + + if 
(rr->sshfp.fingerprint_size <= 0) + /* the accepted size depends on the algorithm, but for now + just ensure that the value is greater than zero */ + return -EBADMSG; + + break; + + case DNS_TYPE_DNSKEY: + r = dns_packet_read_uint16(p, &rr->dnskey.flags, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->dnskey.protocol, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->dnskey.algorithm, NULL); + if (r < 0) + return r; + + if (rdlength < 4) + return -EBADMSG; + + r = dns_packet_read_memdup(p, rdlength - 4, + &rr->dnskey.key, &rr->dnskey.key_size, + NULL); + + if (rr->dnskey.key_size <= 0) + /* the accepted size depends on the algorithm, but for now + just ensure that the value is greater than zero */ + return -EBADMSG; + + break; + + case DNS_TYPE_RRSIG: + r = dns_packet_read_uint16(p, &rr->rrsig.type_covered, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->rrsig.algorithm, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->rrsig.labels, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->rrsig.original_ttl, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->rrsig.expiration, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &rr->rrsig.inception, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint16(p, &rr->rrsig.key_tag, NULL); + if (r < 0) + return r; + + r = dns_packet_read_name(p, &rr->rrsig.signer, false, NULL); + if (r < 0) + return r; + + if (rdlength < p->rindex - offset) + return -EBADMSG; + + r = dns_packet_read_memdup(p, offset + rdlength - p->rindex, + &rr->rrsig.signature, &rr->rrsig.signature_size, + NULL); + + if (rr->rrsig.signature_size <= 0) + /* the accepted size depends on the algorithm, but for now + just ensure that the value is greater than zero */ + return -EBADMSG; + + break; + + case DNS_TYPE_NSEC: { + + /* + * RFC6762, section 18.14 explicitly states mDNS should use name compression. 
+ * This contradicts RFC3845, section 2.1.1 + */ + + bool allow_compressed = p->protocol == DNS_PROTOCOL_MDNS; + + r = dns_packet_read_name(p, &rr->nsec.next_domain_name, allow_compressed, NULL); + if (r < 0) + return r; + + if (rdlength < p->rindex - offset) + return -EBADMSG; + + r = dns_packet_read_type_windows(p, &rr->nsec.types, offset + rdlength - p->rindex, NULL); + + /* We accept empty NSEC bitmaps. The bit indicating the presence of the NSEC record itself + * is redundant and in e.g., RFC4956 this fact is used to define a use for NSEC records + * without the NSEC bit set. */ + + break; + } + case DNS_TYPE_NSEC3: { + uint8_t size; + + r = dns_packet_read_uint8(p, &rr->nsec3.algorithm, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->nsec3.flags, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint16(p, &rr->nsec3.iterations, NULL); + if (r < 0) + return r; + + /* this may be zero */ + r = dns_packet_read_uint8(p, &size, NULL); + if (r < 0) + return r; + + r = dns_packet_read_memdup(p, size, &rr->nsec3.salt, &rr->nsec3.salt_size, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &size, NULL); + if (r < 0) + return r; + + if (size <= 0) + return -EBADMSG; + + r = dns_packet_read_memdup(p, size, + &rr->nsec3.next_hashed_name, &rr->nsec3.next_hashed_name_size, + NULL); + if (r < 0) + return r; + + if (rdlength < p->rindex - offset) + return -EBADMSG; + + r = dns_packet_read_type_windows(p, &rr->nsec3.types, offset + rdlength - p->rindex, NULL); + + /* empty non-terminals can have NSEC3 records, so empty bitmaps are allowed */ + + break; + } + + case DNS_TYPE_TLSA: + r = dns_packet_read_uint8(p, &rr->tlsa.cert_usage, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->tlsa.selector, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint8(p, &rr->tlsa.matching_type, NULL); + if (r < 0) + return r; + + if (rdlength < 3) + return -EBADMSG; + + r = dns_packet_read_memdup(p, rdlength - 3, + 
&rr->tlsa.data, &rr->tlsa.data_size, + NULL); + + if (rr->tlsa.data_size <= 0) + /* the accepted size depends on the algorithm, but for now + just ensure that the value is greater than zero */ + return -EBADMSG; + + break; + + case DNS_TYPE_CAA: + r = dns_packet_read_uint8(p, &rr->caa.flags, NULL); + if (r < 0) + return r; + + r = dns_packet_read_string(p, &rr->caa.tag, NULL); + if (r < 0) + return r; + + if (rdlength < p->rindex - offset) + return -EBADMSG; + + r = dns_packet_read_memdup(p, + rdlength + offset - p->rindex, + &rr->caa.value, &rr->caa.value_size, NULL); + + break; + + case DNS_TYPE_OPT: /* we only care about the header of OPT for now. */ + case DNS_TYPE_OPENPGPKEY: + default: + unparsable: + r = dns_packet_read_memdup(p, rdlength, &rr->generic.data, &rr->generic.data_size, NULL); + + break; + } + if (r < 0) + return r; + if (p->rindex - offset != rdlength) + return -EBADMSG; + + if (ret) + *ret = TAKE_PTR(rr); + if (ret_cache_flush) + *ret_cache_flush = cache_flush; + if (ret_start) + *ret_start = rewinder.saved_rindex; + + CANCEL_REWINDER(rewinder); + return 0; +} + +static bool opt_is_good(DnsResourceRecord *rr, bool *rfc6975) { + const uint8_t* p; + bool found_dau_dhu_n3u = false; + size_t l; + + /* Checks whether the specified OPT RR is well-formed and whether it contains RFC6975 data (which is not OK in + * a reply). 
*/ + + assert(rr); + assert(rr->key->type == DNS_TYPE_OPT); + + /* Check that the version is 0 */ + if (((rr->ttl >> 16) & UINT32_C(0xFF)) != 0) { + *rfc6975 = false; + return true; /* if it's not version 0, it's OK, but we will ignore the OPT field contents */ + } + + p = rr->opt.data; + l = rr->opt.data_size; + while (l > 0) { + uint16_t option_code, option_length; + + /* At least four bytes for OPTION-CODE and OPTION-LENGTH are required */ + if (l < 4U) + return false; + + option_code = unaligned_read_be16(p); + option_length = unaligned_read_be16(p + 2); + + if (l < option_length + 4U) + return false; + + /* RFC 6975 DAU, DHU or N3U fields found. */ + if (IN_SET(option_code, 5, 6, 7)) + found_dau_dhu_n3u = true; + + p += option_length + 4U; + l -= option_length + 4U; + } + + *rfc6975 = found_dau_dhu_n3u; + return true; +} + +static int dns_packet_extract_question(DnsPacket *p, DnsQuestion **ret_question) { + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + unsigned n; + int r; + + n = DNS_PACKET_QDCOUNT(p); + if (n > 0) { + question = dns_question_new(n); + if (!question) + return -ENOMEM; + + _cleanup_set_free_ Set *keys = NULL; /* references to keys are kept by Question */ + + keys = set_new(&dns_resource_key_hash_ops); + if (!keys) + return log_oom(); + + r = set_reserve(keys, n * 2); /* Higher multipliers give slightly higher efficiency through + * hash collisions, but the gains quickly drop off after 2. */ + if (r < 0) + return r; + + for (unsigned i = 0; i < n; i++) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + bool qu; + + r = dns_packet_read_key(p, &key, &qu, NULL); + if (r < 0) + return r; + + if (!dns_type_is_valid_query(key->type)) + return -EBADMSG; + + r = set_put(keys, key); + if (r < 0) + return r; + if (r == 0) + /* Already in the Question, let's skip */ + continue; + + r = dns_question_add_raw(question, key, qu ? 
DNS_QUESTION_WANTS_UNICAST_REPLY : 0); + if (r < 0) + return r; + } + } + + *ret_question = TAKE_PTR(question); + + return 0; +} + +static int dns_packet_extract_answer(DnsPacket *p, DnsAnswer **ret_answer) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + unsigned n; + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *previous = NULL; + bool bad_opt = false; + int r; + + n = DNS_PACKET_RRCOUNT(p); + if (n == 0) + return 0; + + answer = dns_answer_new(n); + if (!answer) + return -ENOMEM; + + for (unsigned i = 0; i < n; i++) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + bool cache_flush = false; + size_t start; + + if (p->rindex == p->size && p->opt) { + /* If we reached the end of the packet already, but there are still more RRs + * declared, then that's a corrupt packet. Let's accept the packet anyway, since it's + * apparently a common bug in routers. Let's however suppress OPT support in this + * case, so that we force the rest of the logic into lowest DNS baseline support. Or + * to say this differently: if the DNS server doesn't even get the RR counts right, + * it's highly unlikely it gets EDNS right. */ + log_debug("More resource records declared in packet than included, suppressing OPT."); + bad_opt = true; + break; + } + + r = dns_packet_read_rr(p, &rr, &cache_flush, &start); + if (r < 0) + return r; + + /* Try to reduce memory usage a bit */ + if (previous) + dns_resource_key_reduce(&rr->key, &previous->key); + + if (rr->key->type == DNS_TYPE_OPT) { + bool has_rfc6975; + + if (p->opt || bad_opt) { + /* Multiple OPT RRs? if so, let's ignore all, because there's + * something wrong with the server, and if one is valid we wouldn't + * know which one. */ + log_debug("Multiple OPT RRs detected, ignoring all."); + bad_opt = true; + continue; + } + + if (!dns_name_is_root(dns_resource_key_name(rr->key))) { + /* If the OPT RR is not owned by the root domain, then it is bad, + * let's ignore it. 
*/ + log_debug("OPT RR is not owned by root domain, ignoring."); + bad_opt = true; + continue; + } + + if (i < DNS_PACKET_ANCOUNT(p) + DNS_PACKET_NSCOUNT(p)) { + /* OPT RR is in the wrong section? Some Belkin routers do this. This + * is a hint the EDNS implementation is borked, like the Belkin one + * is, hence ignore it. */ + log_debug("OPT RR in wrong section, ignoring."); + bad_opt = true; + continue; + } + + if (!opt_is_good(rr, &has_rfc6975)) { + log_debug("Malformed OPT RR, ignoring."); + bad_opt = true; + continue; + } + + if (DNS_PACKET_QR(p)) { + /* Additional checks for responses */ + + if (!DNS_RESOURCE_RECORD_OPT_VERSION_SUPPORTED(rr)) + /* If this is a reply and we don't know the EDNS version + * then something is weird... */ + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "EDNS version newer that our request, bad server."); + + if (has_rfc6975) { + /* If the OPT RR contains RFC6975 algorithm data, then this + * is indication that the server just copied the OPT it got + * from us (which contained that data) back into the reply. + * If so, then it doesn't properly support EDNS, as RFC6975 + * makes it very clear that the algorithm data should only + * be contained in questions, never in replies. Crappy + * Belkin routers copy the OPT data for example, hence let's + * detect this so that we downgrade early. */ + log_debug("OPT RR contains RFC6975 data, ignoring."); + bad_opt = true; + continue; + } + } + + p->opt = dns_resource_record_ref(rr); + p->opt_start = start; + assert(p->rindex >= start); + p->opt_size = p->rindex - start; + } else { + DnsAnswerFlags flags = 0; + + if (p->protocol == DNS_PROTOCOL_MDNS) { + flags |= DNS_ANSWER_REFUSE_TTL_NO_MATCH; + if (!cache_flush) + flags |= DNS_ANSWER_SHARED_OWNER; + } + + /* According to RFC 4795, section 2.9. only the RRs from the Answer section shall be + * cached. Hence mark only those RRs as cacheable by default, but not the ones from + * the Additional or Authority sections. 
+ * This restriction does not apply to mDNS records (RFC 6762). */ + if (i < DNS_PACKET_ANCOUNT(p)) + flags |= DNS_ANSWER_CACHEABLE|DNS_ANSWER_SECTION_ANSWER; + else if (i < DNS_PACKET_ANCOUNT(p) + DNS_PACKET_NSCOUNT(p)) + flags |= DNS_ANSWER_SECTION_AUTHORITY; + else { + flags |= DNS_ANSWER_SECTION_ADDITIONAL; + if (p->protocol == DNS_PROTOCOL_MDNS) + flags |= DNS_ANSWER_CACHEABLE; + } + + r = dns_answer_add(answer, rr, p->ifindex, flags, NULL); + if (r < 0) + return r; + } + + /* Remember this RR, so that we can potentially merge its ->key object with the + * next RR. Note that we only do this if we actually decided to keep the RR around. + */ + DNS_RR_REPLACE(previous, dns_resource_record_ref(rr)); + } + + if (bad_opt) { + p->opt = dns_resource_record_unref(p->opt); + p->opt_start = p->opt_size = SIZE_MAX; + } + + *ret_answer = TAKE_PTR(answer); + + return 0; +} + +int dns_packet_extract(DnsPacket *p) { + assert(p); + + if (p->extracted) + return 0; + + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + _unused_ _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + int r; + + dns_packet_rewind(p, DNS_PACKET_HEADER_SIZE); + + r = dns_packet_extract_question(p, &question); + if (r < 0) + return r; + + r = dns_packet_extract_answer(p, &answer); + if (r < 0) + return r; + + if (p->rindex < p->size) { + log_debug("Trailing garbage in packet, suppressing OPT."); + p->opt = dns_resource_record_unref(p->opt); + p->opt_start = p->opt_size = SIZE_MAX; + } + + p->question = TAKE_PTR(question); + p->answer = TAKE_PTR(answer); + p->extracted = true; + + /* no CANCEL, always rewind */ + return 0; +} + +int dns_packet_is_reply_for(DnsPacket *p, const DnsResourceKey *key) { + int r; + + assert(p); + assert(key); + + /* Checks if the specified packet is a reply for the specified + * key and the specified key is the only one in the question + * section. 
*/ + + if (DNS_PACKET_QR(p) != 1) + return 0; + + /* Let's unpack the packet, if that hasn't happened yet. */ + r = dns_packet_extract(p); + if (r < 0) + return r; + + if (!p->question) + return 0; + + if (p->question->n_keys != 1) + return 0; + + return dns_resource_key_equal(dns_question_first_key(p->question), key); +} + +int dns_packet_patch_max_udp_size(DnsPacket *p, uint16_t max_udp_size) { + assert(p); + assert(max_udp_size >= DNS_PACKET_UNICAST_SIZE_MAX); + + if (p->opt_start == SIZE_MAX) /* No OPT section, nothing to patch */ + return 0; + + assert(p->opt_size != SIZE_MAX); + assert(p->opt_size >= 5); + + unaligned_write_be16(DNS_PACKET_DATA(p) + p->opt_start + 3, max_udp_size); + return 1; +} + +static int patch_rr(DnsPacket *p, usec_t age) { + _cleanup_(rewind_dns_packet) DnsPacketRewinder rewinder = REWINDER_INIT(p); + size_t ttl_index; + uint32_t ttl; + uint16_t type, rdlength; + int r; + + /* Patches the RR at the current rindex, subtracts the specified time from the TTL */ + + r = dns_packet_read_name(p, NULL, true, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint16(p, &type, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint16(p, NULL, NULL); + if (r < 0) + return r; + + r = dns_packet_read_uint32(p, &ttl, &ttl_index); + if (r < 0) + return r; + + if (type != DNS_TYPE_OPT) { /* The TTL of the OPT field is not actually a TTL, skip it */ + ttl = LESS_BY(ttl * USEC_PER_SEC, age) / USEC_PER_SEC; + unaligned_write_be32(DNS_PACKET_DATA(p) + ttl_index, ttl); + } + + r = dns_packet_read_uint16(p, &rdlength, NULL); + if (r < 0) + return r; + + r = dns_packet_read(p, rdlength, NULL, NULL); + if (r < 0) + return r; + + CANCEL_REWINDER(rewinder); + return 0; +} + +int dns_packet_patch_ttls(DnsPacket *p, usec_t timestamp) { + assert(p); + assert(timestamp_is_set(timestamp)); + + /* Adjusts all TTLs in the packet by subtracting the time difference between now and the specified timestamp */ + + _unused_ _cleanup_(rewind_dns_packet) 
DnsPacketRewinder rewinder = REWINDER_INIT(p); + unsigned n; + usec_t k; + int r; + + k = now(CLOCK_BOOTTIME); + assert(k >= timestamp); + k -= timestamp; + + dns_packet_rewind(p, DNS_PACKET_HEADER_SIZE); + + n = DNS_PACKET_QDCOUNT(p); + for (unsigned i = 0; i < n; i++) { + r = dns_packet_read_key(p, NULL, NULL, NULL); + if (r < 0) + return r; + } + + n = DNS_PACKET_RRCOUNT(p); + for (unsigned i = 0; i < n; i++) { + + /* DNS servers suck, hence the RR count is in many servers off. If we reached the end + * prematurely, accept that, exit early */ + if (p->rindex == p->size) + break; + + r = patch_rr(p, k); + if (r < 0) + return r; + } + + return 0; +} + +static void dns_packet_hash_func(const DnsPacket *s, struct siphash *state) { + assert(s); + + siphash24_compress(&s->size, sizeof(s->size), state); + siphash24_compress(DNS_PACKET_DATA((DnsPacket*) s), s->size, state); +} + +static int dns_packet_compare_func(const DnsPacket *x, const DnsPacket *y) { + int r; + + r = CMP(x->size, y->size); + if (r != 0) + return r; + + return memcmp(DNS_PACKET_DATA((DnsPacket*) x), DNS_PACKET_DATA((DnsPacket*) y), x->size); +} + +DEFINE_HASH_OPS(dns_packet_hash_ops, DnsPacket, dns_packet_hash_func, dns_packet_compare_func); + +bool dns_packet_equal(const DnsPacket *a, const DnsPacket *b) { + return dns_packet_compare_func(a, b) == 0; +} + +int dns_packet_has_nsid_request(DnsPacket *p) { + bool has_nsid = false; + const uint8_t *d; + size_t l; + + assert(p); + + if (!p->opt) + return false; + + d = p->opt->opt.data; + l = p->opt->opt.data_size; + + while (l > 0) { + uint16_t code, length; + + if (l < 4U) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "EDNS0 variable part has invalid size."); + + code = unaligned_read_be16(d); + length = unaligned_read_be16(d + 2); + + if (l < 4U + length) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Truncated option in EDNS0 variable part."); + + if (code == 3) { + if (has_nsid) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + 
"Duplicate NSID option in EDNS0 variable part."); + + if (length != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Non-empty NSID option in DNS request."); + + has_nsid = true; + } + + d += 4U + length; + l -= 4U + length; + } + + return has_nsid; +} + +size_t dns_packet_size_unfragmented(DnsPacket *p) { + assert(p); + + if (p->fragsize == 0) /* Wasn't fragmented */ + return p->size; + + /* The fragment size (p->fragsize) covers the whole (fragmented) IP packet, while the regular packet + * size (p->size) only covers the DNS part. Thus, subtract the UDP header from the largest fragment + * size, in order to determine which size of DNS packet would have gone through without + * fragmenting. */ + + return LESS_BY(p->fragsize, udp_header_size(p->family)); +} + +static const char* const dns_rcode_table[_DNS_RCODE_MAX_DEFINED] = { + [DNS_RCODE_SUCCESS] = "SUCCESS", + [DNS_RCODE_FORMERR] = "FORMERR", + [DNS_RCODE_SERVFAIL] = "SERVFAIL", + [DNS_RCODE_NXDOMAIN] = "NXDOMAIN", + [DNS_RCODE_NOTIMP] = "NOTIMP", + [DNS_RCODE_REFUSED] = "REFUSED", + [DNS_RCODE_YXDOMAIN] = "YXDOMAIN", + [DNS_RCODE_YXRRSET] = "YRRSET", + [DNS_RCODE_NXRRSET] = "NXRRSET", + [DNS_RCODE_NOTAUTH] = "NOTAUTH", + [DNS_RCODE_NOTZONE] = "NOTZONE", + [DNS_RCODE_BADVERS] = "BADVERS", + [DNS_RCODE_BADKEY] = "BADKEY", + [DNS_RCODE_BADTIME] = "BADTIME", + [DNS_RCODE_BADMODE] = "BADMODE", + [DNS_RCODE_BADNAME] = "BADNAME", + [DNS_RCODE_BADALG] = "BADALG", + [DNS_RCODE_BADTRUNC] = "BADTRUNC", + [DNS_RCODE_BADCOOKIE] = "BADCOOKIE", +}; +DEFINE_STRING_TABLE_LOOKUP(dns_rcode, int); + +const char *format_dns_rcode(int i, char buf[static DECIMAL_STR_MAX(int)]) { + const char *p = dns_rcode_to_string(i); + if (p) + return p; + + return snprintf_ok(buf, DECIMAL_STR_MAX(int), "%i", i); +} + +static const char* const dns_protocol_table[_DNS_PROTOCOL_MAX] = { + [DNS_PROTOCOL_DNS] = "dns", + [DNS_PROTOCOL_MDNS] = "mdns", + [DNS_PROTOCOL_LLMNR] = "llmnr", +}; +DEFINE_STRING_TABLE_LOOKUP(dns_protocol, DnsProtocol); 
/* The fixed-size header at the start of every DNS packet. The layout is packed, and an assert_cc()
 * in this file pins sizeof(DnsPacketHeader) to 12 bytes. All fields except 'id' are declared as
 * be16_t and are read through be16toh() by the DNS_PACKET_xyz() accessors. */
struct DnsPacketHeader {
        uint16_t id;            /* declared as plain uint16_t, unlike the fields below */
        be16_t flags;           /* QR, OPCODE, AA, TC, RD, RA, AD, CD bits and the 4 bit RCODE */
        be16_t qdcount;         /* number of entries in the question section */
        be16_t ancount;         /* number of RRs in the answer section */
        be16_t nscount;         /* number of RRs in the authority section */
        be16_t arcount;         /* number of RRs in the additional section */
} _packed_;
*/ +#define DNS_PACKET_SIZE_START 512u + +/* RFC 1035 say 512 is the maximum, for classic unicast DNS */ +#define DNS_PACKET_UNICAST_SIZE_MAX 512u + +/* With EDNS0 we can use larger packets, default to 1232, which is what is commonly used */ +#define DNS_PACKET_UNICAST_SIZE_LARGE_MAX 1232u + +struct DnsPacket { + unsigned n_ref; + DnsProtocol protocol; + size_t size, allocated, rindex, max_size, fragsize; + void *_data; /* don't access directly, use DNS_PACKET_DATA()! */ + Hashmap *names; /* For name compression */ + size_t opt_start, opt_size; + + /* Parsed data */ + DnsQuestion *question; + DnsAnswer *answer; + DnsResourceRecord *opt; + + /* For support of truncated packets */ + DnsPacket *more; + + /* Packet reception metadata */ + usec_t timestamp; /* CLOCK_BOOTTIME (or CLOCK_MONOTONIC if the former doesn't exist) */ + int ifindex; + int family, ipproto; + union in_addr_union sender, destination; + uint16_t sender_port, destination_port; + uint32_t ttl; + + bool on_stack; + bool extracted; + bool refuse_compression; + bool canonical_form; + + /* Note: fields should be ordered to minimize alignment gaps. Use pahole! 
*/ +}; + +static inline uint8_t* DNS_PACKET_DATA(const DnsPacket *p) { + if (_unlikely_(!p)) + return NULL; + + if (p->_data) + return p->_data; + + return ((uint8_t*) p) + ALIGN(sizeof(DnsPacket)); +} + +#define DNS_PACKET_HEADER(p) ((DnsPacketHeader*) DNS_PACKET_DATA(p)) +#define DNS_PACKET_ID(p) DNS_PACKET_HEADER(p)->id +#define DNS_PACKET_QR(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 15) & 1) +#define DNS_PACKET_OPCODE(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 11) & 15) +#define DNS_PACKET_AA(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 10) & 1) +#define DNS_PACKET_TC(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 9) & 1) +#define DNS_PACKET_RD(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 8) & 1) +#define DNS_PACKET_RA(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 7) & 1) +#define DNS_PACKET_AD(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 5) & 1) +#define DNS_PACKET_CD(p) ((be16toh(DNS_PACKET_HEADER(p)->flags) >> 4) & 1) + +#define DNS_PACKET_FLAG_TC (UINT16_C(1) << 9) + +static inline uint16_t DNS_PACKET_RCODE(DnsPacket *p) { + uint16_t rcode; + + if (p->opt) + rcode = (uint16_t) (p->opt->ttl >> 24); + else + rcode = 0; + + return rcode | (be16toh(DNS_PACKET_HEADER(p)->flags) & 0xF); +} + +static inline uint16_t DNS_PACKET_PAYLOAD_SIZE_MAX(DnsPacket *p) { + + /* Returns the advertised maximum size for replies, or the DNS default if there's nothing defined. */ + + if (p->ipproto == IPPROTO_TCP) /* we ignore EDNS(0) size data on TCP, like everybody else */ + return DNS_PACKET_SIZE_MAX; + + if (p->opt) + return MAX(DNS_PACKET_UNICAST_SIZE_MAX, p->opt->key->class); + + return DNS_PACKET_UNICAST_SIZE_MAX; +} + +static inline bool DNS_PACKET_DO(DnsPacket *p) { + if (!p->opt) + return false; + + return !!(p->opt->ttl & (1U << 15)); +} + +static inline bool DNS_PACKET_VERSION_SUPPORTED(DnsPacket *p) { + /* Returns true if this packet is in a version we support. 
/* Assembles the 16 bit flags word of the DNS header from its individual fields. The single bit
 * fields are reduced to 0 or 1, 'opcode' and 'rcode' are masked to 4 bits each. The bit positions
 * are the same ones the DNS_PACKET_QR(), DNS_PACKET_OPCODE(), … accessors decode. */
#define DNS_PACKET_MAKE_FLAGS(qr, opcode, aa, tc, rd, ra, ad, cd, rcode) \
        (((uint16_t) ((rcode) & 15))             |                      \
         ((uint16_t) ((cd) ? 1 : 0) << 4)        |                      \
         ((uint16_t) ((ad) ? 1 : 0) << 5)        |                      \
         ((uint16_t) ((ra) ? 1 : 0) << 7)        |                      \
         ((uint16_t) ((rd) ? 1 : 0) << 8)        | /* on LLMNR: t */    \
         ((uint16_t) ((tc) ? 1 : 0) << 9)        |                      \
         ((uint16_t) ((aa) ? 1 : 0) << 10)       | /* on LLMNR: c */    \
         ((uint16_t) ((opcode) & 15) << 11)      |                      \
         ((uint16_t) ((qr) ? 1 : 0) << 15))
dns_packet_unref(*_a); \ + *_a = _b; \ + } while(0) + +int dns_packet_validate(DnsPacket *p); +int dns_packet_validate_reply(DnsPacket *p); +int dns_packet_validate_query(DnsPacket *p); + +int dns_packet_is_reply_for(DnsPacket *p, const DnsResourceKey *key); + +int dns_packet_append_blob(DnsPacket *p, const void *d, size_t sz, size_t *start); +int dns_packet_append_uint8(DnsPacket *p, uint8_t v, size_t *start); +int dns_packet_append_uint16(DnsPacket *p, uint16_t v, size_t *start); +int dns_packet_append_uint32(DnsPacket *p, uint32_t v, size_t *start); +int dns_packet_append_string(DnsPacket *p, const char *s, size_t *start); +int dns_packet_append_raw_string(DnsPacket *p, const void *s, size_t size, size_t *start); +int dns_packet_append_label(DnsPacket *p, const char *s, size_t l, bool canonical_candidate, size_t *start); +int dns_packet_append_name(DnsPacket *p, const char *name, bool allow_compression, bool canonical_candidate, size_t *start); +int dns_packet_append_key(DnsPacket *p, const DnsResourceKey *key, const DnsAnswerFlags flags, size_t *start); +int dns_packet_append_rr(DnsPacket *p, const DnsResourceRecord *rr, const DnsAnswerFlags flags, size_t *start, size_t *rdata_start); +int dns_packet_append_opt(DnsPacket *p, uint16_t max_udp_size, bool edns0_do, bool include_rfc6975, const char *nsid, int rcode, size_t *ret_start); +int dns_packet_append_question(DnsPacket *p, DnsQuestion *q); +int dns_packet_append_answer(DnsPacket *p, DnsAnswer *a, unsigned *completed); + +int dns_packet_patch_max_udp_size(DnsPacket *p, uint16_t max_udp_size); +int dns_packet_patch_ttls(DnsPacket *p, usec_t timestamp); + +void dns_packet_truncate(DnsPacket *p, size_t sz); +int dns_packet_truncate_opt(DnsPacket *p); + +int dns_packet_read(DnsPacket *p, size_t sz, const void **ret, size_t *start); +int dns_packet_read_blob(DnsPacket *p, void *d, size_t sz, size_t *start); +int dns_packet_read_uint8(DnsPacket *p, uint8_t *ret, size_t *start); +int 
dns_packet_read_uint16(DnsPacket *p, uint16_t *ret, size_t *start); +int dns_packet_read_uint32(DnsPacket *p, uint32_t *ret, size_t *start); +int dns_packet_read_string(DnsPacket *p, char **ret, size_t *start); +int dns_packet_read_raw_string(DnsPacket *p, const void **ret, size_t *size, size_t *start); +int dns_packet_read_name(DnsPacket *p, char **ret, bool allow_compression, size_t *start); +int dns_packet_read_key(DnsPacket *p, DnsResourceKey **ret, bool *ret_cache_flush_or_qu, size_t *start); +int dns_packet_read_rr(DnsPacket *p, DnsResourceRecord **ret, bool *ret_cache_flush, size_t *start); + +void dns_packet_rewind(DnsPacket *p, size_t idx); + +int dns_packet_skip_question(DnsPacket *p); +int dns_packet_extract(DnsPacket *p); + +bool dns_packet_equal(const DnsPacket *a, const DnsPacket *b); + +int dns_packet_has_nsid_request(DnsPacket *p); + +/* https://www.iana.org/assignments/dns-parameters/dns-parameters.xhtml#dns-parameters-6 */ +enum { + DNS_RCODE_SUCCESS = 0, + DNS_RCODE_FORMERR = 1, + DNS_RCODE_SERVFAIL = 2, + DNS_RCODE_NXDOMAIN = 3, + DNS_RCODE_NOTIMP = 4, + DNS_RCODE_REFUSED = 5, + DNS_RCODE_YXDOMAIN = 6, + DNS_RCODE_YXRRSET = 7, + DNS_RCODE_NXRRSET = 8, + DNS_RCODE_NOTAUTH = 9, + DNS_RCODE_NOTZONE = 10, + DNS_RCODE_BADVERS = 16, + DNS_RCODE_BADSIG = 16, /* duplicate value! 
*/ + DNS_RCODE_BADKEY = 17, + DNS_RCODE_BADTIME = 18, + DNS_RCODE_BADMODE = 19, + DNS_RCODE_BADNAME = 20, + DNS_RCODE_BADALG = 21, + DNS_RCODE_BADTRUNC = 22, + DNS_RCODE_BADCOOKIE = 23, + _DNS_RCODE_MAX_DEFINED, + _DNS_RCODE_MAX = 4095 /* 4 bit rcode in the header plus 8 bit rcode in OPT, makes 12 bit */ +}; + +const char* dns_rcode_to_string(int i) _const_; +int dns_rcode_from_string(const char *s) _pure_; +const char *format_dns_rcode(int i, char buf[static DECIMAL_STR_MAX(int)]); +#define FORMAT_DNS_RCODE(i) format_dns_rcode(i, (char [DECIMAL_STR_MAX(int)]) {}) + +const char* dns_protocol_to_string(DnsProtocol p) _const_; +DnsProtocol dns_protocol_from_string(const char *s) _pure_; + +#define LLMNR_MULTICAST_IPV4_ADDRESS ((struct in_addr) { .s_addr = htobe32(224U << 24 | 252U) }) +#define LLMNR_MULTICAST_IPV6_ADDRESS ((struct in6_addr) { .s6_addr = { 0xFF, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x03 } }) + +#define MDNS_MULTICAST_IPV4_ADDRESS ((struct in_addr) { .s_addr = htobe32(224U << 24 | 251U) }) +#define MDNS_MULTICAST_IPV6_ADDRESS ((struct in6_addr) { .s6_addr = { 0xFF, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfb } }) + +extern const struct hash_ops dns_packet_hash_ops; + +static inline uint64_t SD_RESOLVED_FLAGS_MAKE( + DnsProtocol protocol, + int family, + bool authenticated, + bool confidential) { + uint64_t f; + + /* Converts a protocol + family into a flags field as used in queries and responses */ + + f = (authenticated ? SD_RESOLVED_AUTHENTICATED : 0) | + (confidential ? SD_RESOLVED_CONFIDENTIAL : 0); + + switch (protocol) { + case DNS_PROTOCOL_DNS: + return f|SD_RESOLVED_DNS; + + case DNS_PROTOCOL_LLMNR: + return f|(family == AF_INET6 ? SD_RESOLVED_LLMNR_IPV6 : SD_RESOLVED_LLMNR_IPV4); + + case DNS_PROTOCOL_MDNS: + return f|(family == AF_INET6 ? 
SD_RESOLVED_MDNS_IPV6 : SD_RESOLVED_MDNS_IPV4); + + default: + return f; + } +} + +static inline size_t dns_packet_size_max(DnsPacket *p) { + assert(p); + + /* Why not insist on a fully initialized max_size during DnsPacket construction? Well, this way it's easy to + * allocate a transient, throw-away DnsPacket on the stack by simple zero initialization, without having to + * deal with explicit field initialization. */ + + return p->max_size != 0 ? p->max_size : DNS_PACKET_SIZE_MAX; +} + +static inline size_t udp_header_size(int af) { + + switch (af) { + case AF_INET: + return UDP4_PACKET_HEADER_SIZE; + case AF_INET6: + return UDP6_PACKET_HEADER_SIZE; + default: + assert_not_reached(); + } +} + +size_t dns_packet_size_unfragmented(DnsPacket *p); diff --git a/src/resolve/resolved-dns-query.c b/src/resolve/resolved-dns-query.c new file mode 100644 index 0000000..7eb6b97 --- /dev/null +++ b/src/resolve/resolved-dns-query.c @@ -0,0 +1,1299 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dns-domain.h" +#include "dns-type.h" +#include "event-util.h" +#include "glyph-util.h" +#include "hostname-util.h" +#include "local-addresses.h" +#include "resolved-dns-query.h" +#include "resolved-dns-synthesize.h" +#include "resolved-etc-hosts.h" +#include "string-util.h" + +#define QUERIES_MAX 2048 +#define AUXILIARY_QUERIES_MAX 64 +#define CNAME_REDIRECTS_MAX 16 + +assert_cc(AUXILIARY_QUERIES_MAX < UINT8_MAX); +assert_cc(CNAME_REDIRECTS_MAX < UINT8_MAX); + +static int dns_query_candidate_new(DnsQueryCandidate **ret, DnsQuery *q, DnsScope *s) { + DnsQueryCandidate *c; + + assert(ret); + assert(q); + assert(s); + + c = new(DnsQueryCandidate, 1); + if (!c) + return -ENOMEM; + + *c = (DnsQueryCandidate) { + .n_ref = 1, + .query = q, + .scope = s, + }; + + LIST_PREPEND(candidates_by_query, q->candidates, c); + LIST_PREPEND(candidates_by_scope, s->query_candidates, c); + + *ret = c; + return 0; +} + +static void 
/* Advances c->search_domain to the next search domain that is not route-only. Returns 1 if the
 * candidate now refers to a new search domain, and 0 if the end of the list was reached (in which
 * case c->search_domain is left as it was). */
static int dns_query_candidate_next_search_domain(DnsQueryCandidate *c) {
        DnsSearchDomain *d;

        assert(c);

        /* Continue behind the current domain if it is still linked, otherwise start over with the
         * first search domain of the scope. */
        d = c->search_domain && c->search_domain->linked
                ? c->search_domain->domains_next
                : dns_scope_get_search_domains(c->scope);

        /* Route-only domains are not searched, step over them */
        while (d && d->route_only)
                d = d->domains_next;

        if (!d)
                return 0;

        dns_search_domain_unref(c->search_domain);
        c->search_domain = dns_search_domain_ref(d);

        return 1;
}
lookup with a resource key */ + assert(!bypass); + + t = dns_scope_find_transaction(c->scope, key, c->query->flags); /* a transaction returned here is reused, a new one is only created when nothing was found */ + if (!t) { + r = dns_transaction_new(&t, c->scope, key, NULL, c->query->flags); + if (r < 0) + return r; + } else if (set_contains(c->transactions, t)) + return 0; /* the transaction found is already attached to this candidate */ + } else { + /* "Bypass" lookup with a query packet */ + assert(bypass); + + r = dns_transaction_new(&t, c->scope, NULL, bypass, c->query->flags); + if (r < 0) + return r; + } + + r = set_ensure_allocated(&t->notify_query_candidates_done, NULL); + if (r < 0) + return r; + + r = set_ensure_put(&t->notify_query_candidates, NULL, c); + if (r < 0) + return r; + + r = set_ensure_put(&c->transactions, NULL, t); + if (r < 0) { + (void) set_remove(t->notify_query_candidates, c); /* undo the registration made just above */ + return r; + } + + TAKE_PTR(t); /* the transaction is registered with the candidate now, so take it away from the _cleanup_ handler */ + return 1; +} + /* Starts every transaction of the candidate that is still in DNS_TRANSACTION_NULL state. If there was none to start, the candidate is notified right away. Returns 0 on success, or the negative return value of the first dns_transaction_go() call that fails. */ +static int dns_query_candidate_go(DnsQueryCandidate *c) { + _unused_ _cleanup_(dns_query_candidate_unrefp) DnsQueryCandidate *keep_c = NULL; + DnsTransaction *t; + int r; + unsigned n = 0; + + assert(c); + + /* Let's keep a reference to the candidate while we're operating */ + keep_c = dns_query_candidate_ref(c); + + /* Start the transactions that are not started yet */ + SET_FOREACH(t, c->transactions) { + if (t->state != DNS_TRANSACTION_NULL) + continue; + + r = dns_transaction_go(t); + if (r < 0) + return r; + + n++; + } + + /* If there was nothing to start, then let's proceed immediately */ + if (n == 0) + dns_query_candidate_notify(c); + + return 0; +} + /* Folds the states of all attached transactions into a single state: DNS_TRANSACTION_ERRNO if the candidate carries an error code; DNS_TRANSACTION_PENDING as soon as any transaction is in NULL, PENDING or VALIDATING state; DNS_TRANSACTION_SUCCESS if at least one transaction succeeded; otherwise the state of a failed transaction, or DNS_TRANSACTION_NO_SERVERS if there are no transactions at all. */ +static DnsTransactionState dns_query_candidate_state(DnsQueryCandidate *c) { + DnsTransactionState state = DNS_TRANSACTION_NO_SERVERS; + DnsTransaction *t; + + assert(c); + + if (c->error_code != 0) + return DNS_TRANSACTION_ERRNO; + + SET_FOREACH(t, c->transactions) + + switch (t->state) { + + case DNS_TRANSACTION_NULL: + /* If there's a NULL transaction pending, then + * this means not all transactions were + * started yet, and we were called from within + * the stackframe that is supposed to start + * remaining 
transactions. In this case, + * simply claim the candidate is pending. */ + + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + /* If there's one transaction currently in + * VALIDATING state, then this means there's + * also one in PENDING state, hence we can + * return PENDING immediately. */ + return DNS_TRANSACTION_PENDING; + + case DNS_TRANSACTION_SUCCESS: + state = t->state; + break; + + default: /* any other state is only recorded as long as no transaction has succeeded */ + if (state != DNS_TRANSACTION_SUCCESS) + state = t->state; + + break; + } + + return state; +} + /* (Re-)creates the transactions of the candidate. Transactions attached so far are detached first. For a bypass query a single transaction for the original packet is added; otherwise one transaction is requested per question key that the scope accepts, with the current search domain appended to the key if one is set. Returns the number of keys handed to dns_query_candidate_add_transaction() (1 in the bypass case), 0 if the scope accepted none, and a negative value on failure, in which case everything attached so far is detached again. */ +static int dns_query_candidate_setup_transactions(DnsQueryCandidate *c) { + DnsQuestion *question; + DnsResourceKey *key; + int n = 0, r; + + assert(c); + assert(c->query); /* We shan't add transactions to a candidate that has been detached already */ + + dns_query_candidate_stop(c); + + if (c->query->question_bypass) { + /* If this is a bypass query, then pass the original query packet along to the transaction */ + + assert(dns_question_size(c->query->question_bypass->question) == 1); + + if (!dns_scope_good_key(c->scope, dns_question_first_key(c->query->question_bypass->question))) + return 0; + + r = dns_query_candidate_add_transaction(c, NULL, c->query->question_bypass); + if (r < 0) + goto fail; + + return 1; + } + + question = dns_query_question_for_protocol(c->query, c->scope->protocol); + + /* Create one transaction per question key */ + DNS_QUESTION_FOREACH(key, question) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *new_key = NULL; + DnsResourceKey *qkey; + + if (c->search_domain) { + r = dns_resource_key_new_append_suffix(&new_key, key, c->search_domain->name); + if (r < 0) + goto fail; + + qkey = new_key; + } else + qkey = key; + + if (!dns_scope_good_key(c->scope, qkey)) + continue; + + r = dns_query_candidate_add_transaction(c, qkey, NULL); + if (r < 0) + goto fail; + + n++; + } + + return n; + +fail: + dns_query_candidate_stop(c); + return r; +} + /* Re-evaluates the candidate. Nothing happens for an abandoned candidate or while its aggregate state is still live. If the lookup did not succeed and a search domain is in use, the next search domain is tried; in every other case the owning query is informed through dns_query_ready(). */ +void dns_query_candidate_notify(DnsQueryCandidate *c) { + DnsTransactionState 
state; + int r; + + assert(c); + + if (!c->query) /* This candidate has been abandoned, do nothing. */ + return; + + state = dns_query_candidate_state(c); + + if (DNS_TRANSACTION_IS_LIVE(state)) + return; + + if (state != DNS_TRANSACTION_SUCCESS && c->search_domain) { + + r = dns_query_candidate_next_search_domain(c); + if (r < 0) + goto fail; + + if (r > 0) { + /* OK, there's another search domain to try, let's do so. */ + + r = dns_query_candidate_setup_transactions(c); + if (r < 0) + goto fail; + + if (r > 0) { + /* New transactions were queued. Start them and wait */ + + r = dns_query_candidate_go(c); + if (r < 0) + goto fail; + + return; + } + } + + } + + dns_query_ready(c->query); + return; + +fail: + c->error_code = log_warning_errno(r, "Failed to follow search domains: %m"); /* a non-zero error_code makes dns_query_candidate_state() report DNS_TRANSACTION_ERRNO */ + dns_query_ready(c->query); +} + /* Disables the timeout event source of the query and detaches the transactions of all its candidates. The candidates themselves stay linked to the query. */ +static void dns_query_stop(DnsQuery *q) { + assert(q); + + event_source_disable(q->timeout_event_source); + + LIST_FOREACH(candidates_by_query, c, q->candidates) + dns_query_candidate_stop(c); +} + /* Unlinks every candidate from the query (and from its scope) and drops one reference on each. The loop ends because unlinking removes the candidate from q->candidates. */ +static void dns_query_unlink_candidates(DnsQuery *q) { + assert(q); + + while (q->candidates) + /* Here we drop *our* references to each of the candidates. If we had the only reference, the + * DnsQueryCandidate object will be freed. 
*/ + dns_query_candidate_unref(dns_query_candidate_unlink(q->candidates)); +} + /* Drops the answer, the answer search domain and the full answer packet stored on the query, and puts the remaining answer_* fields back to zero, invalid or unspecified values. */ +static void dns_query_reset_answer(DnsQuery *q) { + assert(q); + + q->answer = dns_answer_unref(q->answer); + q->answer_rcode = 0; + q->answer_dnssec_result = _DNSSEC_RESULT_INVALID; + q->answer_errno = 0; + q->answer_query_flags = 0; + q->answer_protocol = _DNS_PROTOCOL_INVALID; + q->answer_family = AF_UNSPEC; + q->answer_search_domain = dns_search_domain_unref(q->answer_search_domain); + q->answer_full_packet = dns_packet_unref(q->answer_full_packet); +} + /* Releases the query together with everything it references, including all of its auxiliary queries. A NULL query is accepted. */ +DnsQuery *dns_query_free(DnsQuery *q) { + if (!q) + return NULL; + + q->timeout_event_source = sd_event_source_disable_unref(q->timeout_event_source); + + while (q->auxiliary_queries) /* each auxiliary query takes itself off this list while being freed (see the auxiliary_for branch right below), hence the loop terminates */ + dns_query_free(q->auxiliary_queries); + + if (q->auxiliary_for) { + assert(q->auxiliary_for->n_auxiliary_queries > 0); + q->auxiliary_for->n_auxiliary_queries--; + LIST_REMOVE(auxiliary_queries, q->auxiliary_for->auxiliary_queries, q); + } + + dns_query_unlink_candidates(q); + + dns_question_unref(q->question_idna); + dns_question_unref(q->question_utf8); + dns_packet_unref(q->question_bypass); + dns_question_unref(q->collected_questions); + + dns_query_reset_answer(q); + + sd_bus_message_unref(q->bus_request); + sd_bus_track_unref(q->bus_track); + + if (q->varlink_request) { + varlink_set_userdata(q->varlink_request, NULL); /* NOTE(review): presumably the userdata points at this query and must not dangle once we are gone; confirm against the varlink callers */ + varlink_unref(q->varlink_request); + } + + if (q->request_packet) /* queries for stub requests are indexed by their request packet, either on the extra stub listener or on the manager */ + hashmap_remove_value(q->stub_listener_extra ? + q->stub_listener_extra->queries_by_packet : + q->manager->stub_queries_by_packet, + q->request_packet, + q); + + dns_packet_unref(q->request_packet); + dns_answer_unref(q->reply_answer); + dns_answer_unref(q->reply_authoritative); + dns_answer_unref(q->reply_additional); + + if (q->request_stream) { + /* Detach the stream from our query, in case something else keeps a reference to it. 
*/ + (void) set_remove(q->request_stream->queries, q); + q->request_stream = dns_stream_unref(q->request_stream); + } + + free(q->request_address_string); /* string cached by dns_query_string() */ + + if (q->manager) { /* only set once dns_query_new() has linked the query into the manager */ + LIST_REMOVE(queries, q->manager->dns_queries, q); + q->manager->n_dns_queries--; + } + + return mfree(q); +} + /* Allocates a new query, either for a bypass packet or for a pair of UTF-8 and IDNA questions, and links it into the manager. Returns 0 on success and stores the query in *ret if ret is not NULL. Returns -EINVAL if a bypass packet is combined with regular questions, if a question is not valid for a query, or if no non-empty question was supplied; -EBUSY if the manager already tracks QUERIES_MAX queries; -ENOMEM if the allocation fails. Errors of the called question helpers are passed through. */ +int dns_query_new( + Manager *m, + DnsQuery **ret, + DnsQuestion *question_utf8, + DnsQuestion *question_idna, + DnsPacket *question_bypass, + int ifindex, + uint64_t flags) { + + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + DnsResourceKey *key; + int r; + + assert(m); + + if (question_bypass) { + /* It's either a "bypass" query, or a regular one, but can't be both. */ + if (question_utf8 || question_idna) + return -EINVAL; + + } else { + bool good = false; + + /* This (primarily) checks two things: + * + * 1. That the question is not empty + * 2. That all RR keys in the question objects are for the same domain + * + * Or in other words, a single DnsQuery object may be used to look up A+AAAA combination for + * the same domain name, or SRV+TXT (for DNS-SD services), but not for unrelated lookups. 
*/ + + if (dns_question_size(question_utf8) > 0) { + r = dns_question_is_valid_for_query(question_utf8); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + good = true; + } + + /* If the IDNA and UTF8 questions are the same, merge their references */ + r = dns_question_is_equal(question_idna, question_utf8); + if (r < 0) + return r; + if (r > 0) + question_idna = question_utf8; /* the UTF8 question was already validated above, so no second check is needed */ + else { + if (dns_question_size(question_idna) > 0) { + r = dns_question_is_valid_for_query(question_idna); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + good = true; + } + } + + if (!good) /* don't allow empty queries */ + return -EINVAL; + } + + if (m->n_dns_queries >= QUERIES_MAX) + return -EBUSY; /* the manager already tracks QUERIES_MAX queries */ + + q = new(DnsQuery, 1); + if (!q) + return -ENOMEM; + /* The new object takes its own references on the questions and on the bypass packet. Fields not named in the initializer below start out zeroed, which includes q->manager. */ + *q = (DnsQuery) { + .question_utf8 = dns_question_ref(question_utf8), + .question_idna = dns_question_ref(question_idna), + .question_bypass = dns_packet_ref(question_bypass), + .ifindex = ifindex, + .flags = flags, + .answer_dnssec_result = _DNSSEC_RESULT_INVALID, + .answer_protocol = _DNS_PROTOCOL_INVALID, + .answer_family = AF_UNSPEC, + }; + + if (question_bypass) { + DNS_QUESTION_FOREACH(key, question_bypass->question) + log_debug("Looking up bypass packet for %s.", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + } else { + /* First dump UTF8 question */ + DNS_QUESTION_FOREACH(key, question_utf8) + log_debug("Looking up RR for %s.", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + + /* And then dump the IDNA question, but only what hasn't been dumped already through the UTF8 question. 
*/ + DNS_QUESTION_FOREACH(key, question_idna) { + r = dns_question_contains_key(question_utf8, key); + if (r < 0) + return r; + if (r > 0) + continue; + + log_debug("Looking up IDNA RR for %s.", + dns_resource_key_to_string(key, key_str, sizeof key_str)); + } + } + + LIST_PREPEND(queries, m->dns_queries, q); + m->n_dns_queries++; + q->manager = m; /* from here on dns_query_free() takes the query off the manager list again */ + + if (ret) + *ret = q; + + TAKE_PTR(q); /* success: take the query away from the _cleanup_ handler so that it is not freed on return */ + return 0; +} + /* Registers q as an auxiliary query of auxiliary_for. Returns 0 on success and -EAGAIN if auxiliary_for already has AUXILIARY_QUERIES_MAX auxiliary queries. */ +int dns_query_make_auxiliary(DnsQuery *q, DnsQuery *auxiliary_for) { + assert(q); + assert(auxiliary_for); + + /* Ensure that the query is not auxiliary yet, and + * nothing else is auxiliary to it either */ + assert(!q->auxiliary_for); + assert(!q->auxiliary_queries); + + /* Ensure that the query we shall be made auxiliary for isn't + * auxiliary itself */ + assert(!auxiliary_for->auxiliary_for); + + if (auxiliary_for->n_auxiliary_queries >= AUXILIARY_QUERIES_MAX) + return -EAGAIN; + + LIST_PREPEND(auxiliary_queries, auxiliary_for->auxiliary_queries, q); + q->auxiliary_for = auxiliary_for; + + auxiliary_for->n_auxiliary_queries++; + return 0; +} + /* Moves a live query into the given final (non-live) state, reports the result through manager_monitor_send(), stops the query and finally invokes the completion callback if one is set. */ +void dns_query_complete(DnsQuery *q, DnsTransactionState state) { + assert(q); + assert(!DNS_TRANSACTION_IS_LIVE(state)); + assert(DNS_TRANSACTION_IS_LIVE(q->state)); + + /* Note that this call might invalidate the query. Callers should hence not attempt to access the + * query or transaction after calling this function. 
*/ + + q->state = state; + + (void) manager_monitor_send(q->manager, q->state, q->answer_rcode, q->answer_errno, q->question_idna, q->question_utf8, q->question_bypass, q->collected_questions, q->answer); + + dns_query_stop(q); + if (q->complete) + q->complete(q); +} + /* Timer callback registered by dns_query_go(): completes the query passed as userdata with DNS_TRANSACTION_TIMEOUT. */ +static int on_query_timeout(sd_event_source *s, usec_t usec, void *userdata) { + DnsQuery *q = ASSERT_PTR(userdata); + + assert(s); + + dns_query_complete(q, DNS_TRANSACTION_TIMEOUT); + return 0; +} + /* Creates a candidate for query q on scope s, selects an initial search domain if the scope wants one for the name, and sets up the transactions of the candidate without starting them. Returns 0 on success, negative on failure. */ +static int dns_query_add_candidate(DnsQuery *q, DnsScope *s) { + _cleanup_(dns_query_candidate_unrefp) DnsQueryCandidate *c = NULL; + int r; + + assert(q); + assert(s); + + r = dns_query_candidate_new(&c, q, s); + if (r < 0) + return r; + + /* If this is a single-label domain on DNS, we might append a suitable search domain first. */ + if (!FLAGS_SET(q->flags, SD_RESOLVED_NO_SEARCH) && + dns_scope_name_wants_search_domain(s, dns_question_first_name(q->question_idna))) { + /* OK, we want a search domain now. Let's find one for this scope */ + + r = dns_query_candidate_next_search_domain(c); + if (r < 0) + return r; + } + + r = dns_query_candidate_setup_transactions(c); + if (r < 0) + return r; + + TAKE_PTR(c); /* NOTE(review): presumably dns_query_candidate_new() linked the candidate into q and s, which keep it from here on; confirm there */ + return 0; +} + +static int dns_query_synthesize_reply(DnsQuery *q, DnsTransactionState *state) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + int r; + + assert(q); + assert(state); + + /* Tries to synthesize localhost RR replies (and others) where appropriate. Note that this is done *after* the + * normal lookup finished. The data from the network hence takes precedence over the data we + * synthesize. 
(But note that many scopes refuse to resolve certain domain names) */ + /* Returns 1 and sets *state to DNS_TRANSACTION_SUCCESS if an answer was synthesized and stored in q. Returns 0 if nothing was synthesized; in the -ENXIO case *state is additionally set to DNS_TRANSACTION_RCODE_FAILURE with an NXDOMAIN rcode. Returns a negative value on any other error. Only the failure states listed right below are considered at all. */ + if (!IN_SET(*state, + DNS_TRANSACTION_RCODE_FAILURE, + DNS_TRANSACTION_NO_SERVERS, + DNS_TRANSACTION_TIMEOUT, + DNS_TRANSACTION_ATTEMPTS_MAX_REACHED, + DNS_TRANSACTION_NETWORK_DOWN, + DNS_TRANSACTION_NOT_FOUND)) + return 0; + + if (FLAGS_SET(q->flags, SD_RESOLVED_NO_SYNTHESIZE)) + return 0; + + r = dns_synthesize_answer( + q->manager, + q->question_bypass ? q->question_bypass->question : q->question_utf8, + q->ifindex, + &answer); + if (r == -ENXIO) { + /* If we get ENXIO this tells us to generate NXDOMAIN unconditionally. */ + + dns_query_reset_answer(q); + q->answer_rcode = DNS_RCODE_NXDOMAIN; + q->answer_protocol = dns_synthesize_protocol(q->flags); + q->answer_family = dns_synthesize_family(q->flags); + q->answer_query_flags = SD_RESOLVED_AUTHENTICATED|SD_RESOLVED_CONFIDENTIAL|SD_RESOLVED_SYNTHETIC; + *state = DNS_TRANSACTION_RCODE_FAILURE; + + return 0; + } + if (r <= 0) + return r; + + dns_query_reset_answer(q); + + q->answer = TAKE_PTR(answer); + q->answer_rcode = DNS_RCODE_SUCCESS; + q->answer_protocol = dns_synthesize_protocol(q->flags); + q->answer_family = dns_synthesize_family(q->flags); + q->answer_query_flags = SD_RESOLVED_AUTHENTICATED|SD_RESOLVED_CONFIDENTIAL|SD_RESOLVED_SYNTHETIC; + + *state = DNS_TRANSACTION_SUCCESS; + + return 1; +} + /* Returns 1 if the lookup supplied an answer, which is then stored in q and flagged as synthetic; 0 if it did not or if SD_RESOLVED_NO_SYNTHESIZE is set; negative on failure. */ +static int dns_query_try_etc_hosts(DnsQuery *q) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + int r; + + assert(q); + + /* Looks in /etc/hosts for matching entries. Note that this is done *before* the normal lookup is + * done. The data from /etc/hosts hence takes precedence over the network. */ + + if (FLAGS_SET(q->flags, SD_RESOLVED_NO_SYNTHESIZE)) + return 0; + + r = manager_etc_hosts_lookup( + q->manager, + q->question_bypass ? 
q->question_bypass->question : q->question_utf8, + &answer); + if (r <= 0) + return r; + + dns_query_reset_answer(q); + + q->answer = TAKE_PTR(answer); + q->answer_rcode = DNS_RCODE_SUCCESS; + q->answer_protocol = dns_synthesize_protocol(q->flags); + q->answer_family = dns_synthesize_family(q->flags); + q->answer_query_flags = SD_RESOLVED_AUTHENTICATED|SD_RESOLVED_CONFIDENTIAL|SD_RESOLVED_SYNTHETIC; + + return 1; +} + /* Starts the query. Returns 0 if the query is not in DNS_TRANSACTION_NULL state, i.e. was started before; 1 if it was started, or completed on the spot via /etc/hosts or because no scope matched; negative on failure. */ +int dns_query_go(DnsQuery *q) { + DnsScopeMatch found = DNS_SCOPE_NO; + DnsScope *first = NULL; + int r; + + assert(q); + + if (q->state != DNS_TRANSACTION_NULL) + return 0; + + r = dns_query_try_etc_hosts(q); + if (r < 0) + return r; + if (r > 0) { + dns_query_complete(q, DNS_TRANSACTION_SUCCESS); + return 1; + } + + LIST_FOREACH(scopes, s, q->manager->dns_scopes) { + DnsScopeMatch match; + + match = dns_scope_good_domain(s, q); + assert(match >= 0); + if (match > found) { /* Does this match better? If so, remember how well it matched, and the first one + * that matches this well */ + found = match; + first = s; + } + } + + if (found == DNS_SCOPE_NO) { + DnsTransactionState state = DNS_TRANSACTION_NO_SERVERS; + + r = dns_query_synthesize_reply(q, &state); + if (r < 0) + return r; + + dns_query_complete(q, state); + return 1; + } + + r = dns_query_add_candidate(q, first); + if (r < 0) + goto fail; + /* Also add every later scope in the list that matches as well as the best match. Earlier scopes cannot qualify, since first is the first scope that reached the best match level. */ + LIST_FOREACH(scopes, s, first->scopes_next) { + DnsScopeMatch match; + + match = dns_scope_good_domain(s, q); + assert(match >= 0); + if (match < found) + continue; + + r = dns_query_add_candidate(q, s); + if (r < 0) + goto fail; + } + + dns_query_reset_answer(q); + + r = event_reset_time_relative( + q->manager->event, + &q->timeout_event_source, + CLOCK_BOOTTIME, + SD_RESOLVED_QUERY_TIMEOUT_USEC, + 0, on_query_timeout, q, + 0, "query-timeout", true); + if (r < 0) + goto fail; + + q->state = DNS_TRANSACTION_PENDING; + q->block_ready++; /* dns_query_ready() returns early while this counter is non-zero, which keeps the query from being completed while the candidates are still being started below */ + + /* Start the transactions */ + LIST_FOREACH(candidates_by_query, c, q->candidates) { + r = 
dns_query_candidate_go(c); + if (r < 0) { + q->block_ready--; + goto fail; + } + } + + q->block_ready--; + dns_query_ready(q); /* with the block lifted, evaluate the candidates once; some may have finished already */ + + return 1; + +fail: + dns_query_stop(q); + return r; +} + /* Copies the result of candidate c into the query and completes the query. With a NULL candidate only a synthesized reply is attempted. Successful transactions are merged into one answer; if none succeeded, the data of a failed transaction is stored instead, preferring authenticated over unauthenticated replies. */ +static void dns_query_accept(DnsQuery *q, DnsQueryCandidate *c) { + DnsTransactionState state = DNS_TRANSACTION_NO_SERVERS; + bool has_authenticated = false, has_non_authenticated = false, has_confidential = false, has_non_confidential = false; + DnssecResult dnssec_result_authenticated = _DNSSEC_RESULT_INVALID, dnssec_result_non_authenticated = _DNSSEC_RESULT_INVALID; + DnsTransaction *t; + int r; + + assert(q); + + if (!c) { + r = dns_query_synthesize_reply(q, &state); + if (r < 0) + goto fail; + + dns_query_complete(q, state); + return; + } + + if (c->error_code != 0) { + /* If the candidate had an error condition of its own, start with that. */ + state = DNS_TRANSACTION_ERRNO; + q->answer = dns_answer_unref(q->answer); + q->answer_rcode = 0; + q->answer_dnssec_result = _DNSSEC_RESULT_INVALID; + q->answer_query_flags = 0; + q->answer_errno = c->error_code; /* NOTE(review): error_code holds the return value of log_warning_errno(), while the fail path of this function stores -r; confirm that both follow the same sign convention */ + q->answer_full_packet = dns_packet_unref(q->answer_full_packet); + } + + SET_FOREACH(t, c->transactions) { + + switch (t->state) { + + case DNS_TRANSACTION_SUCCESS: { + /* We found a successful reply, merge it into the answer */ + + if (state == DNS_TRANSACTION_SUCCESS) { + r = dns_answer_extend(&q->answer, t->answer); + if (r < 0) + goto fail; + + q->answer_query_flags |= dns_transaction_source_to_query_flags(t->answer_source); + } else { + /* Override non-successful previous answers */ + DNS_ANSWER_REPLACE(q->answer, dns_answer_ref(t->answer)); + q->answer_query_flags = dns_transaction_source_to_query_flags(t->answer_source); + } + + q->answer_rcode = t->answer_rcode; + q->answer_errno = 0; + + DNS_PACKET_REPLACE(q->answer_full_packet, dns_packet_ref(t->received)); + + if (FLAGS_SET(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) { + has_authenticated = true; + dnssec_result_authenticated = t->answer_dnssec_result; + 
} else { + has_non_authenticated = true; + dnssec_result_non_authenticated = t->answer_dnssec_result; + } + + if (FLAGS_SET(t->answer_query_flags, SD_RESOLVED_CONFIDENTIAL)) + has_confidential = true; + else + has_non_confidential = true; + + state = DNS_TRANSACTION_SUCCESS; + break; + } + + case DNS_TRANSACTION_NULL: + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + case DNS_TRANSACTION_ABORTED: + /* Ignore transactions that didn't complete */ + continue; + + default: + /* Any kind of failure? Store the data away, if there's nothing stored yet. */ + if (state == DNS_TRANSACTION_SUCCESS) + continue; + + /* If there's already an authenticated negative reply stored, then prefer that over any unauthenticated one */ + if (FLAGS_SET(q->answer_query_flags, SD_RESOLVED_AUTHENTICATED) && + !FLAGS_SET(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + continue; + + DNS_ANSWER_REPLACE(q->answer, dns_answer_ref(t->answer)); + q->answer_rcode = t->answer_rcode; + q->answer_dnssec_result = t->answer_dnssec_result; + q->answer_query_flags = t->answer_query_flags | dns_transaction_source_to_query_flags(t->answer_source); + q->answer_errno = t->answer_errno; + DNS_PACKET_REPLACE(q->answer_full_packet, dns_packet_ref(t->received)); + + state = t->state; + break; + } + } + /* The merged answer only counts as authenticated, respectively confidential, if every successful transaction was. */ + if (state == DNS_TRANSACTION_SUCCESS) { + SET_FLAG(q->answer_query_flags, SD_RESOLVED_AUTHENTICATED, has_authenticated && !has_non_authenticated); + SET_FLAG(q->answer_query_flags, SD_RESOLVED_CONFIDENTIAL, has_confidential && !has_non_confidential); + q->answer_dnssec_result = FLAGS_SET(q->answer_query_flags, SD_RESOLVED_AUTHENTICATED) ? 
dnssec_result_authenticated : dnssec_result_non_authenticated; + } + + q->answer_protocol = c->scope->protocol; + q->answer_family = c->scope->family; + + dns_search_domain_unref(q->answer_search_domain); + q->answer_search_domain = dns_search_domain_ref(c->search_domain); + + r = dns_query_synthesize_reply(q, &state); /* only has an effect for the failure states that function accepts; otherwise state is left as is */ + if (r < 0) + goto fail; + + dns_query_complete(q, state); + return; + +fail: + q->answer_errno = -r; + dns_query_complete(q, DNS_TRANSACTION_ERRNO); +} + /* Checks whether the query can be completed: the first candidate found in DNS_TRANSACTION_SUCCESS state is accepted immediately; if any candidate is still in progress the function just returns; otherwise the last failed candidate seen (or NULL if there are no candidates) is accepted. Does nothing while q->block_ready is non-zero. */ +void dns_query_ready(DnsQuery *q) { + DnsQueryCandidate *bad = NULL; + bool pending = false; + + assert(q); + assert(DNS_TRANSACTION_IS_LIVE(q->state)); + + /* Note that this call might invalidate the query. Callers + * should hence not attempt to access the query or transaction + * after calling this function, unless the block_ready + * counter was explicitly bumped before doing so. */ + + if (q->block_ready > 0) + return; + + LIST_FOREACH(candidates_by_query, c, q->candidates) { + DnsTransactionState state; + + state = dns_query_candidate_state(c); + switch (state) { + + case DNS_TRANSACTION_SUCCESS: + /* One of the candidates is successful, + * let's use it, and copy its data out */ + dns_query_accept(q, c); + return; + + case DNS_TRANSACTION_NULL: + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + /* One of the candidates is still going on, + * let's maybe wait for it */ + pending = true; + break; + + default: + /* Any kind of failure */ + bad = c; + break; + } + } + + if (pending) + return; + + dns_query_accept(q, bad); +} + /* Merges the given question into q->collected_questions. An empty question is ignored. Returns 0 on success, negative if the merge fails. */ +static int dns_query_collect_question(DnsQuery *q, DnsQuestion *question) { + _cleanup_(dns_question_unrefp) DnsQuestion *merged = NULL; + int r; + + assert(q); + + if (dns_question_size(question) == 0) + return 0; + + /* When redirecting, save the first element in the chain, for informational purposes when monitoring */ + r = dns_question_merge(q->collected_questions, question, &merged); + if (r < 0) + return r; + + 
dns_question_unref(q->collected_questions); + q->collected_questions = TAKE_PTR(merged); + + return 0; +} + /* Rewrites the IDNA and UTF8 questions of the query so that they follow the given CNAME or DNAME record, records the previous questions in q->collected_questions, drops all candidates and puts the query back into DNS_TRANSACTION_NULL state. The query is not restarted here. Returns 0 on success, -ELOOP if CNAME_REDIRECTS_MAX redirects were already followed or if neither question was actually redirected, and other negative values if a helper fails. */ +static int dns_query_cname_redirect(DnsQuery *q, const DnsResourceRecord *cname) { + _cleanup_(dns_question_unrefp) DnsQuestion *nq_idna = NULL, *nq_utf8 = NULL; + int r, k; + + assert(q); + + if (q->n_cname_redirects >= CNAME_REDIRECTS_MAX) + return -ELOOP; + q->n_cname_redirects++; + + r = dns_question_cname_redirect(q->question_idna, cname, &nq_idna); + if (r < 0) + return r; + if (r > 0) + log_debug("Following CNAME/DNAME %s %s %s.", + dns_question_first_name(q->question_idna), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + dns_question_first_name(nq_idna)); + + k = dns_question_is_equal(q->question_idna, q->question_utf8); + if (k < 0) + return k; + if (k > 0) { + /* Same question? Shortcut new question generation */ + nq_utf8 = dns_question_ref(nq_idna); + k = r; /* the UTF8 question is redirected exactly like the IDNA one, so reuse its result */ + } else { + k = dns_question_cname_redirect(q->question_utf8, cname, &nq_utf8); + if (k < 0) + return k; + if (k > 0) + log_debug("Following UTF8 CNAME/DNAME %s %s %s.", + dns_question_first_name(q->question_utf8), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + dns_question_first_name(nq_utf8)); + } + + if (r == 0 && k == 0) /* No actual cname happened? */ + return -ELOOP; + + if (q->answer_protocol == DNS_PROTOCOL_DNS) + /* Don't permit CNAME redirects from unicast DNS to LLMNR or MulticastDNS, so that global resources + * cannot invade the local namespace. The opposite way we permit: local names may redirect to global + * ones. 
*/ + q->flags &= ~(SD_RESOLVED_LLMNR|SD_RESOLVED_MDNS); /* mask away the local protocols */ + + /* Turn off searching for the new name */ + q->flags |= SD_RESOLVED_NO_SEARCH; + + r = dns_query_collect_question(q, q->question_idna); + if (r < 0) + return r; + r = dns_query_collect_question(q, q->question_utf8); + if (r < 0) + return r; + + /* Install the redirected question */ + dns_question_unref(q->question_idna); + q->question_idna = TAKE_PTR(nq_idna); + + dns_question_unref(q->question_utf8); + q->question_utf8 = TAKE_PTR(nq_utf8); + + dns_query_unlink_candidates(q); /* the existing candidates were set up for the previous question */ + + /* Note that we do *not* reset the answer here, because the answer we previously got might already + * include everything we need, let's check that first */ + + q->state = DNS_TRANSACTION_NULL; + + return 0; +} + +int dns_query_process_cname_one(DnsQuery *q) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *cname = NULL; + DnsQuestion *question; + DnsResourceRecord *rr; + bool full_match = true; + DnsResourceKey *k; + int r; + + assert(q); + + /* Processes a CNAME redirect if there's one. Returns one of three values: + * + * DNS_QUERY_MATCH → direct RR match, caller should just use the RRs in this answer (and not + * bother with any CNAME/DNAME stuff) + * + * DNS_QUERY_NOMATCH → no match at all, neither direct nor CNAME/DNAME, caller might decide to + * restart query or take things as NODATA reply. + * + * DNS_QUERY_CNAME → no direct RR match, but a CNAME/DNAME match that we now followed for one step. + * + * The function might also return a failure, in particular -ELOOP if we encountered too many + * CNAMEs/DNAMEs in a chain or if following CNAMEs/DNAMEs was turned off. + * + * Note that this function doesn't actually restart the query. The caller can decide to do that in + * case of DNS_QUERY_CNAME, though. 
*/ + + if (!IN_SET(q->state, DNS_TRANSACTION_SUCCESS, DNS_TRANSACTION_NULL)) + return DNS_QUERY_NOMATCH; + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + /* Small reminder: our question will consist of one or more RR keys that match in name, but not in + * record type. Specifically, when we do an address lookup the question will typically consist of one + * A and one AAAA key lookup for the same domain name. When we get a response from a server we need + * to check if the answer answers all our questions to use it. Note that a response of CNAME/DNAME + * can answer both an A and the AAAA question for us, but an A/AAAA response only the relevant + * type. + * + * Hence we first check if the answers we collected are sufficient to answer all our questions + * directly. If one question wasn't answered we go on, waiting for more replies. However, if there's + * a CNAME/DNAME response we use it, and redirect to it, regardless if it was a response to the A or + * the AAAA query. */ + + DNS_QUESTION_FOREACH(k, question) { + bool match = false; + + DNS_ANSWER_FOREACH(rr, q->answer) { + r = dns_resource_key_match_rr(k, rr, DNS_SEARCH_DOMAIN_NAME(q->answer_search_domain)); + if (r < 0) + return r; + if (r > 0) { + match = true; /* Yay, we found an RR that matches the key we are looking for */ + break; + } + } + + if (!match) { + /* Hmm. :-( there's no response for this key. This doesn't match. */ + full_match = false; + break; + } + } + + if (full_match) + return DNS_QUERY_MATCH; /* The answer can answer our question in full, no need to follow CNAMEs/DNAMEs */ + + /* Let's see if there is a CNAME/DNAME to match. This case is simpler: we accept the CNAME/DNAME that + * matches any of our questions. 
*/ + DNS_ANSWER_FOREACH(rr, q->answer) { + r = dns_question_matches_cname_or_dname(question, rr, DNS_SEARCH_DOMAIN_NAME(q->answer_search_domain)); + if (r < 0) + return r; + if (r > 0 && !cname) /* only the first matching RR is kept; the loop still visits the remaining RRs */ + cname = dns_resource_record_ref(rr); + } + + if (!cname) + return DNS_QUERY_NOMATCH; /* No match and no CNAME/DNAME to follow */ + + if (q->flags & SD_RESOLVED_NO_CNAME) + return -ELOOP; /* following CNAMEs/DNAMEs was turned off for this query */ + /* Remember which guarantees the answer we are redirecting away from lacked; the dns_query_fully_*() helpers take these bits into account for the final result. */ + if (!FLAGS_SET(q->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + q->previous_redirect_unauthenticated = true; + if (!FLAGS_SET(q->answer_query_flags, SD_RESOLVED_CONFIDENTIAL)) + q->previous_redirect_non_confidential = true; + if (!FLAGS_SET(q->answer_query_flags, SD_RESOLVED_SYNTHETIC)) + q->previous_redirect_non_synthetic = true; + + /* OK, let's actually follow the CNAME */ + r = dns_query_cname_redirect(q, cname); + if (r < 0) + return r; + + return DNS_QUERY_CNAME; /* Tell caller that we did a single CNAME/DNAME redirection step */ +} + +int dns_query_process_cname_many(DnsQuery *q) { + int r; + + assert(q); + + /* Follows CNAMEs through the current packet: as long as the current packet can fulfill our + * redirected CNAME queries we keep going, and restart the query once the current packet isn't good + * enough anymore. It's a wrapper around dns_query_process_cname_one() and returns the same values, + * but with extended semantics. Specifically: + * + * DNS_QUERY_MATCH → as above + * + * DNS_QUERY_CNAME → we ran into a CNAME/DNAME redirect that we could not answer from the current + * message, and thus restarted the query to resolve it. + * + * DNS_QUERY_NOMATCH → we reached the end of CNAME/DNAME chain, and there are no direct matches nor a + * CNAME/DNAME match. i.e. this is a NODATA case. + * + * Note that this function will restart the query for the caller if needed, and that's the case + * in which DNS_QUERY_CNAME is returned. 
+ */ + + r = dns_query_process_cname_one(q); + if (r != DNS_QUERY_CNAME) + return r; /* The first redirect is special: if it doesn't answer the question that's no + * reason to restart the query, we just accept this as a NODATA answer. */ + + for (;;) { + r = dns_query_process_cname_one(q); + if (r < 0 || r == DNS_QUERY_MATCH) + return r; + if (r == DNS_QUERY_NOMATCH) { + /* OK, so we followed one or more CNAME/DNAME RR but the existing packet can't answer + * this. Let's restart the query hence, with the new question. Why the different + * handling than the first chain element? Because if the server answers a direct + * question with an empty answer then this is a NODATA response. But if it responds + * with a CNAME chain that ultimately is incomplete (i.e. a non-empty but truncated + * CNAME chain) then we better follow up ourselves and ask for the rest of the + * chain. This is particularly relevant since our cache will store CNAME/DNAME + * redirects that we learnt about for lookups of certain DNS types, but later on we + * can reuse this data even for other DNS types, but in that case need to follow up + * with the final lookup of the chain ourselves with the RR type we ourselves are + * interested in. */ + r = dns_query_go(q); + if (r < 0) + return r; + + return DNS_QUERY_CNAME; + } + + /* So we found a CNAME that the existing packet already answers, again via a CNAME, let's + * continue going then. 
*/ + assert(r == DNS_QUERY_CNAME); + } +} + /* Selects the question to use for the given protocol: the question of the bypass packet if the query has one; otherwise the IDNA question for classic DNS and the UTF8 question for mDNS and LLMNR. Any other protocol value yields NULL. */ +DnsQuestion* dns_query_question_for_protocol(DnsQuery *q, DnsProtocol protocol) { + assert(q); + + if (q->question_bypass) + return q->question_bypass->question; + + switch (protocol) { + + case DNS_PROTOCOL_DNS: + return q->question_idna; + + case DNS_PROTOCOL_MDNS: + case DNS_PROTOCOL_LLMNR: + return q->question_utf8; + + default: + return NULL; + } +} + +const char *dns_query_string(DnsQuery *q) { + const char *name; + int r; + + /* Returns a somewhat useful human-readable lookup key string for this query */ + + if (q->question_bypass) + return dns_question_first_name(q->question_bypass->question); + + if (q->request_address_string) /* formatted on an earlier call and cached on the query */ + return q->request_address_string; + + if (q->request_address_valid) { + r = in_addr_to_string(q->request_family, &q->request_address, &q->request_address_string); + if (r >= 0) + return q->request_address_string; + } + /* No usable address string: fall back to the first name of the UTF8 question, then of the IDNA question. */ + name = dns_question_first_name(q->question_utf8); + if (name) + return name; + + return dns_question_first_name(q->question_idna); +} + /* True if the current answer is flagged authenticated and no earlier CNAME/DNAME redirect went through an unauthenticated answer. */ +bool dns_query_fully_authenticated(DnsQuery *q) { + assert(q); + + return FLAGS_SET(q->answer_query_flags, SD_RESOLVED_AUTHENTICATED) && !q->previous_redirect_unauthenticated; +} + /* True if the current answer is flagged confidential and no earlier CNAME/DNAME redirect went through a non-confidential answer. */ +bool dns_query_fully_confidential(DnsQuery *q) { + assert(q); + + return FLAGS_SET(q->answer_query_flags, SD_RESOLVED_CONFIDENTIAL) && !q->previous_redirect_non_confidential; +} + +bool dns_query_fully_authoritative(DnsQuery *q) { + assert(q); + + /* We are authoritative for everything synthetic (except if a previous CNAME/DNAME wasn't + * synthetic). (Note: SD_RESOLVED_SYNTHETIC is reset on each CNAME/DNAME, hence the explicit check for + * previous synthetic DNAME/CNAME redirections.) */ + if ((q->answer_query_flags & SD_RESOLVED_SYNTHETIC) && !q->previous_redirect_non_synthetic) + return true; + + /* We are also authoritative for everything coming only from the trust anchor and the local + * zones. 
(Note: the SD_RESOLVED_FROM_xyz flags we merge on each redirect, hence no need to + * explicitly check previous redirects here.) */ + return (q->answer_query_flags & SD_RESOLVED_FROM_MASK & ~(SD_RESOLVED_FROM_TRUST_ANCHOR | SD_RESOLVED_FROM_ZONE)) == 0; +} diff --git a/src/resolve/resolved-dns-query.h b/src/resolve/resolved-dns-query.h new file mode 100644 index 0000000..2723299 --- /dev/null +++ b/src/resolve/resolved-dns-query.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "set.h" +#include "varlink.h" + +typedef struct DnsQueryCandidate DnsQueryCandidate; +typedef struct DnsQuery DnsQuery; +typedef struct DnsStubListenerExtra DnsStubListenerExtra; + +#include "resolved-dns-answer.h" +#include "resolved-dns-question.h" +#include "resolved-dns-search-domain.h" +#include "resolved-dns-transaction.h" + /* One attempt at resolving a DnsQuery on one particular DnsScope, optionally with a search domain applied. */ +struct DnsQueryCandidate { + unsigned n_ref; /* NOTE(review): presumably the counter behind dns_query_candidate_ref() and dns_query_candidate_unref(); confirm against the ref/unref macro */ + int error_code; /* set by dns_query_candidate_notify() when following search domains fails; a non-zero value makes the candidate count as DNS_TRANSACTION_ERRNO */ + + DnsQuery *query; + DnsScope *scope; /* query and scope are both reset to NULL by dns_query_candidate_unlink() */ + + DnsSearchDomain *search_domain; /* search domain whose name is appended to the question keys, NULL if none is in use */ + + Set *transactions; /* the DnsTransaction objects attached to this candidate */ + + LIST_FIELDS(DnsQueryCandidate, candidates_by_query); /* membership in the candidates list of the query */ + LIST_FIELDS(DnsQueryCandidate, candidates_by_scope); /* membership in the query_candidates list of the scope */ +}; + +struct DnsQuery { + Manager *manager; + + /* The question, formatted in IDNA for use on classic DNS, and as UTF8 for use in LLMNR or mDNS. Note + * that even on classic DNS some labels might use UTF8 encoding. Specifically, DNS-SD service names + * (in contrast to their domain suffixes) use UTF-8 encoding even on DNS. Thus, the difference + * between these two fields is mostly relevant only for explicit *hostname* lookups as well as the + * domain suffixes of service lookups. + * + * Note that questions may consist of multiple RR keys at once, but they must be for the same domain + * name. This is used for A+AAAA and TXT+SRV lookups: we'll allocate a single DnsQuery object for + * them instead of two separate ones. 
That allows us minor optimizations with response handling: + * CNAME/DNAMEs of the first reply we get can already be used to follow the CNAME/DNAME chain for + * both, and we can take benefit of server replies that oftentimes put A responses into AAAA queries + * and vice versa (in the additional section). */ + DnsQuestion *question_idna; + DnsQuestion *question_utf8; + + /* If this is not a question by ourselves, but a "bypass" request, we propagate the original packet + * here, and use that instead. */ + DnsPacket *question_bypass; + + /* When we follow a CNAME redirect, we save the original question here, for informational/monitoring + * purposes. We'll keep adding to this whenever we go one step in the redirect, so that in the end + * this will contain the complete set of CNAME questions. */ + DnsQuestion *collected_questions; + + uint64_t flags; + int ifindex; + + /* When resolving a service, we first create a TXT+SRV query, and then for the hostnames we discover + * auxiliary A+AAAA queries. This pointer always points from the auxiliary queries back to the + * TXT+SRV query. 
*/ + int auxiliary_result; + DnsQuery *auxiliary_for; + LIST_HEAD(DnsQuery, auxiliary_queries); + + LIST_HEAD(DnsQueryCandidate, candidates); + sd_event_source *timeout_event_source; + + /* Discovered data */ + DnsAnswer *answer; + int answer_rcode; + DnssecResult answer_dnssec_result; + uint64_t answer_query_flags; + DnsProtocol answer_protocol; + int answer_family; + DnsPacket *answer_full_packet; + DnsSearchDomain *answer_search_domain; + + DnsTransactionState state; + int answer_errno; /* if state is DNS_TRANSACTION_ERRNO */ + + unsigned block_ready; + + uint8_t n_auxiliary_queries; + uint8_t n_cname_redirects; + + bool previous_redirect_unauthenticated:1; + bool previous_redirect_non_confidential:1; + bool previous_redirect_non_synthetic:1; + bool request_address_valid:1; + + /* Bus + Varlink client information */ + sd_bus_message *bus_request; + Varlink *varlink_request; + int request_family; + union in_addr_union request_address; + unsigned block_all_complete; + char *request_address_string; + + /* DNS stub information */ + DnsPacket *request_packet; + DnsStream *request_stream; + DnsAnswer *reply_answer; + DnsAnswer *reply_authoritative; + DnsAnswer *reply_additional; + DnsStubListenerExtra *stub_listener_extra; + + /* Completion callback */ + void (*complete)(DnsQuery* q); + + sd_bus_track *bus_track; + + LIST_FIELDS(DnsQuery, queries); + LIST_FIELDS(DnsQuery, auxiliary_queries); + + /* Note: fields should be ordered to minimize alignment gaps. Use pahole! 
*/ +}; + +enum { + DNS_QUERY_MATCH, + DNS_QUERY_NOMATCH, + DNS_QUERY_CNAME, +}; + +DnsQueryCandidate* dns_query_candidate_ref(DnsQueryCandidate*); +DnsQueryCandidate* dns_query_candidate_unref(DnsQueryCandidate*); +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsQueryCandidate*, dns_query_candidate_unref); + +void dns_query_candidate_notify(DnsQueryCandidate *c); + +int dns_query_new(Manager *m, DnsQuery **q, DnsQuestion *question_utf8, DnsQuestion *question_idna, DnsPacket *question_bypass, int family, uint64_t flags); +DnsQuery *dns_query_free(DnsQuery *q); + +int dns_query_make_auxiliary(DnsQuery *q, DnsQuery *auxiliary_for); + +int dns_query_go(DnsQuery *q); +void dns_query_ready(DnsQuery *q); + +int dns_query_process_cname_one(DnsQuery *q); +int dns_query_process_cname_many(DnsQuery *q); + +void dns_query_complete(DnsQuery *q, DnsTransactionState state); + +DnsQuestion* dns_query_question_for_protocol(DnsQuery *q, DnsProtocol protocol); + +const char *dns_query_string(DnsQuery *q); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsQuery*, dns_query_free); + +bool dns_query_fully_authenticated(DnsQuery *q); +bool dns_query_fully_confidential(DnsQuery *q); +bool dns_query_fully_authoritative(DnsQuery *q); + +static inline uint64_t dns_query_reply_flags_make(DnsQuery *q) { + assert(q); + + return SD_RESOLVED_FLAGS_MAKE(q->answer_protocol, + q->answer_family, + dns_query_fully_authenticated(q), + dns_query_fully_confidential(q)) | + (q->answer_query_flags & (SD_RESOLVED_FROM_MASK|SD_RESOLVED_SYNTHETIC)); +} diff --git a/src/resolve/resolved-dns-question.c b/src/resolve/resolved-dns-question.c new file mode 100644 index 0000000..5754c85 --- /dev/null +++ b/src/resolve/resolved-dns-question.c @@ -0,0 +1,552 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dns-domain.h" +#include "dns-type.h" +#include "resolved-dns-question.h" +#include "socket-util.h" + +DnsQuestion *dns_question_new(size_t n) { + DnsQuestion *q; + + if (n > UINT16_MAX) /* We can only
place 64K keys in a question section at max */ + n = UINT16_MAX; + + q = malloc0(offsetof(DnsQuestion, items) + sizeof(DnsQuestionItem) * n); + if (!q) + return NULL; + + q->n_ref = 1; + q->n_allocated = n; + + return q; +} + +static DnsQuestion *dns_question_free(DnsQuestion *q) { + DnsResourceKey *key; + + assert(q); + + DNS_QUESTION_FOREACH(key, q) /* drop the reference each stored item holds on its key */ + dns_resource_key_unref(key); + + return mfree(q); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DnsQuestion, dns_question, dns_question_free); + +int dns_question_add_raw(DnsQuestion *q, DnsResourceKey *key, DnsQuestionFlags flags) { + /* Insert without checking for duplicates. */ + + assert(key); + assert(q); + + if (q->n_keys >= q->n_allocated) /* the items[] array is not grown here */ + return -ENOSPC; + + q->items[q->n_keys++] = (DnsQuestionItem) { + .key = dns_resource_key_ref(key), + .flags = flags, + }; + return 0; +} + +static int dns_question_add_raw_all(DnsQuestion *a, DnsQuestion *b) { + DnsQuestionItem *item; + int r; + + DNS_QUESTION_FOREACH_ITEM(item, b) { + r = dns_question_add_raw(a, item->key, item->flags); + if (r < 0) + return r; + } + + return 0; +} + +int dns_question_add(DnsQuestion *q, DnsResourceKey *key, DnsQuestionFlags flags) { + DnsQuestionItem *item; + int r; + + assert(key); + + if (!q) /* a NULL question has no room for any key */ + return -ENOSPC; + + + DNS_QUESTION_FOREACH_ITEM(item, q) { + r = dns_resource_key_equal(item->key, key); + if (r < 0) + return r; + if (r > 0 && item->flags == flags) /* same key with same flags is already present */ + return 0; + } + + return dns_question_add_raw(q, key, flags); +} + +static int dns_question_add_all(DnsQuestion *a, DnsQuestion *b) { + DnsQuestionItem *item; + int r; + + DNS_QUESTION_FOREACH_ITEM(item, b) { + r = dns_question_add(a, item->key, item->flags); + if (r < 0) + return r; + } + + return 0; +} + +int dns_question_matches_rr(DnsQuestion *q, DnsResourceRecord *rr, const char *search_domain) { + DnsResourceKey *key; + int r; + + assert(rr); + + if (!q) + return 0; + + DNS_QUESTION_FOREACH(key, q) { + r = dns_resource_key_match_rr(key, rr, search_domain); + if (r != 0) + return r; 
+ } + + return 0; +} + +int dns_question_matches_cname_or_dname(DnsQuestion *q, DnsResourceRecord *rr, const char *search_domain) { + DnsResourceKey *key; + int r; + + assert(rr); + + if (!q) + return 0; + + if (!IN_SET(rr->key->type, DNS_TYPE_CNAME, DNS_TYPE_DNAME)) + return 0; + + DNS_QUESTION_FOREACH(key, q) { + /* For a {C,D}NAME record we can never find a matching {C,D}NAME record */ + if (!dns_type_may_redirect(key->type)) + return 0; + + r = dns_resource_key_match_cname_or_dname(key, rr->key, search_domain); + if (r != 0) + return r; + } + + return 0; +} + +int dns_question_is_valid_for_query(DnsQuestion *q) { + const char *name; + size_t i; + int r; + + if (!q) + return 0; + + if (q->n_keys <= 0) + return 0; + + if (q->n_keys > 65535) + return 0; + + name = dns_resource_key_name(q->items[0].key); + if (!name) + return 0; + + /* Check that all keys in this question bear the same name */ + for (i = 0; i < q->n_keys; i++) { + assert(q->items[i].key); + + if (i > 0) { + r = dns_name_equal(dns_resource_key_name(q->items[i].key), name); + if (r <= 0) + return r; + } + + if (!dns_type_is_valid_query(q->items[i].key->type)) + return 0; + } + + return 1; +} + +int dns_question_contains_key(DnsQuestion *q, const DnsResourceKey *k) { + size_t j; + int r; + + assert(k); + + if (!q) + return 0; + + + for (j = 0; j < q->n_keys; j++) { + r = dns_resource_key_equal(q->items[j].key, k); + if (r != 0) + return r; + } + + return 0; +} + +static int dns_question_contains_item(DnsQuestion *q, const DnsQuestionItem *i) { + DnsQuestionItem *item; + int r; + + assert(i); + + DNS_QUESTION_FOREACH_ITEM(item, q) { + if (item->flags != i->flags) + continue; + r = dns_resource_key_equal(item->key, i->key); + if (r != 0) + return r; + } + + return false; +} + +int dns_question_is_equal(DnsQuestion *a, DnsQuestion *b) { + DnsQuestionItem *item; + int r; + + if (a == b) + return 1; + + if (!a) + return !b || b->n_keys == 0; + if (!b) + return a->n_keys == 0; + + /* Checks if all items in 
a are also contained b, and vice versa */ + + DNS_QUESTION_FOREACH_ITEM(item, a) { + r = dns_question_contains_item(b, item); + if (r <= 0) + return r; + } + DNS_QUESTION_FOREACH_ITEM(item, b) { + r = dns_question_contains_item(a, item); + if (r <= 0) + return r; + } + + return 1; +} + +int dns_question_cname_redirect(DnsQuestion *q, const DnsResourceRecord *cname, DnsQuestion **ret) { + _cleanup_(dns_question_unrefp) DnsQuestion *n = NULL; + DnsResourceKey *key; + bool same = true; + int r; + + assert(cname); + assert(ret); + assert(IN_SET(cname->key->type, DNS_TYPE_CNAME, DNS_TYPE_DNAME)); + + if (dns_question_size(q) <= 0) { + *ret = NULL; + return 0; + } + + DNS_QUESTION_FOREACH(key, q) { + _cleanup_free_ char *destination = NULL; + const char *d; + + if (cname->key->type == DNS_TYPE_CNAME) + d = cname->cname.name; + else { + r = dns_name_change_suffix(dns_resource_key_name(key), dns_resource_key_name(cname->key), cname->dname.name, &destination); + if (r < 0) + return r; + if (r == 0) + continue; + + d = destination; + } + + r = dns_name_equal(dns_resource_key_name(key), d); + if (r < 0) + return r; + + if (r == 0) { + same = false; + break; + } + } + + /* Fully the same, indicate we didn't do a thing */ + if (same) { + *ret = NULL; + return 0; + } + + n = dns_question_new(q->n_keys); + if (!n) + return -ENOMEM; + + /* Create a new question, and patch in the new name */ + DNS_QUESTION_FOREACH(key, q) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *k = NULL; + + k = dns_resource_key_new_redirect(key, cname); + if (!k) + return -ENOMEM; + + r = dns_question_add(n, k, 0); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(n); + + return 1; +} + +const char *dns_question_first_name(DnsQuestion *q) { + + if (!q) + return NULL; + + if (q->n_keys < 1) + return NULL; + + return dns_resource_key_name(q->items[0].key); +} + +int dns_question_new_address(DnsQuestion **ret, int family, const char *name, bool convert_idna) { + _cleanup_(dns_question_unrefp) 
DnsQuestion *q = NULL; + _cleanup_free_ char *buf = NULL; + int r; + + assert(ret); + assert(name); + + if (!IN_SET(family, AF_INET, AF_INET6, AF_UNSPEC)) + return -EAFNOSUPPORT; + + /* If IPv6 is off and the request has an unspecified lookup family, restrict it automatically to + * IPv4. */ + if (family == AF_UNSPEC && !socket_ipv6_is_enabled()) + family = AF_INET; + + if (convert_idna) { + r = dns_name_apply_idna(name, &buf); + if (r < 0) + return r; + if (r > 0 && !streq(name, buf)) + name = buf; + else + /* We did not manage to convert the name to IDNA, or the result is + * the same as the original name. We assume the caller already + * created an unconverted question, so let's not repeat work + * unnecessarily. */ + return -EALREADY; + } + + q = dns_question_new(family == AF_UNSPEC ? 2 : 1); /* room for both an A and an AAAA key when no family was picked */ + if (!q) + return -ENOMEM; + + if (family != AF_INET6) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + + key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_A, name); + if (!key) + return -ENOMEM; + + r = dns_question_add(q, key, 0); + if (r < 0) + return r; + } + + if (family != AF_INET) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + + key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_AAAA, name); + if (!key) + return -ENOMEM; + + r = dns_question_add(q, key, 0); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(q); + + return 0; +} + +int dns_question_new_reverse(DnsQuestion **ret, int family, const union in_addr_union *a) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + _cleanup_(dns_question_unrefp) DnsQuestion *q = NULL; + _cleanup_free_ char *reverse = NULL; + int r; + + assert(ret); + assert(a); + + if (!IN_SET(family, AF_INET, AF_INET6, AF_UNSPEC)) + return -EAFNOSUPPORT; + + r = dns_name_reverse(family, a, &reverse); + if (r < 0) + return r; + + q = dns_question_new(1); + if (!q) + return -ENOMEM; + + key = dns_resource_key_new_consume(DNS_CLASS_IN, DNS_TYPE_PTR, reverse); + if (!key) + return -ENOMEM; + + 
TAKE_PTR(reverse); /* the key created by dns_resource_key_new_consume() now holds this string; disarm our _cleanup_free_ */ + + r = dns_question_add(q, key, 0); + if (r < 0) + return r; + + *ret = TAKE_PTR(q); + + return 0; +} + +int dns_question_new_service( + DnsQuestion **ret, + const char *service, + const char *type, + const char *domain, + bool with_txt, + bool convert_idna) { + + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + _cleanup_(dns_question_unrefp) DnsQuestion *q = NULL; + _cleanup_free_ char *buf = NULL, *joined = NULL; + const char *name; + int r; + + assert(ret); + + /* We support three modes of invocation: + * + * 1. Only a domain is specified, in which case we assume a properly encoded SRV RR name, including service + * type and possibly a service name. If specified in this way we assume it's already IDNA converted if + * that's necessary. + * + * 2. Both service type and a domain specified, in which case a normal SRV RR is assumed, without a DNS-SD + * style prefix. In this case we'll IDNA convert the domain, if that's requested. + * + * 3. All three of service name, type and domain are specified, in which case a DNS-SD service is put + * together. The service name is never IDNA converted, and the domain is if requested. + * + * It's not supported to specify a service name without a type, or no domain name. 
+ */ + + if (!domain) + return -EINVAL; + + if (type) { + if (convert_idna) { + r = dns_name_apply_idna(domain, &buf); + if (r < 0) + return r; + if (r > 0) + domain = buf; + } + + r = dns_service_join(service, type, domain, &joined); + if (r < 0) + return r; + + name = joined; + } else { + if (service) + return -EINVAL; + + name = domain; + } + + q = dns_question_new(1 + with_txt); + if (!q) + return -ENOMEM; + + key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_SRV, name); + if (!key) + return -ENOMEM; + + r = dns_question_add(q, key, 0); + if (r < 0) + return r; + + if (with_txt) { + dns_resource_key_unref(key); + key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_TXT, name); + if (!key) + return -ENOMEM; + + r = dns_question_add(q, key, 0); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(q); + + return 0; +} + +/* + * This function is not used in the code base, but is useful when debugging. Do not delete. + */ +void dns_question_dump(DnsQuestion *question, FILE *f) { + DnsResourceKey *k; + + if (!f) + f = stdout; + + DNS_QUESTION_FOREACH(k, question) { + char buf[DNS_RESOURCE_KEY_STRING_MAX]; + + fputc('\t', f); + fputs(dns_resource_key_to_string(k, buf, sizeof(buf)), f); + fputc('\n', f); + } +} + +int dns_question_merge(DnsQuestion *a, DnsQuestion *b, DnsQuestion **ret) { + _cleanup_(dns_question_unrefp) DnsQuestion *k = NULL; + int r; + + assert(ret); + + if (a == b || dns_question_size(b) <= 0) { + *ret = dns_question_ref(a); + return 0; + } + + if (dns_question_size(a) <= 0) { + *ret = dns_question_ref(b); + return 0; + } + + k = dns_question_new(dns_question_size(a) + dns_question_size(b)); + if (!k) + return -ENOMEM; + + r = dns_question_add_raw_all(k, a); + if (r < 0) + return r; + + r = dns_question_add_all(k, b); + if (r < 0) + return r; + + *ret = TAKE_PTR(k); + return 0; +} diff --git a/src/resolve/resolved-dns-question.h b/src/resolve/resolved-dns-question.h new file mode 100644 index 0000000..b7dc60c --- /dev/null +++ 
b/src/resolve/resolved-dns-question.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct DnsQuestion DnsQuestion; +typedef struct DnsQuestionItem DnsQuestionItem; + +#include "macro.h" +#include "resolved-dns-rr.h" + +/* A simple array of resource keys */ + +typedef enum DnsQuestionFlags { + DNS_QUESTION_WANTS_UNICAST_REPLY = 1 << 0, /* For mDNS: sender is willing to accept unicast replies */ +} DnsQuestionFlags; + +struct DnsQuestionItem { + DnsResourceKey *key; + DnsQuestionFlags flags; +}; + +struct DnsQuestion { + unsigned n_ref; + size_t n_keys, n_allocated; + DnsQuestionItem items[]; +}; + +DnsQuestion *dns_question_new(size_t n); +DnsQuestion *dns_question_ref(DnsQuestion *q); +DnsQuestion *dns_question_unref(DnsQuestion *q); + +int dns_question_new_address(DnsQuestion **ret, int family, const char *name, bool convert_idna); +int dns_question_new_reverse(DnsQuestion **ret, int family, const union in_addr_union *a); +int dns_question_new_service(DnsQuestion **ret, const char *service, const char *type, const char *domain, bool with_txt, bool convert_idna); + +int dns_question_add_raw(DnsQuestion *q, DnsResourceKey *key, DnsQuestionFlags flags); +int dns_question_add(DnsQuestion *q, DnsResourceKey *key, DnsQuestionFlags flags); + +int dns_question_matches_rr(DnsQuestion *q, DnsResourceRecord *rr, const char *search_domain); +int dns_question_matches_cname_or_dname(DnsQuestion *q, DnsResourceRecord *rr, const char* search_domain); +int dns_question_is_valid_for_query(DnsQuestion *q); +int dns_question_contains_key(DnsQuestion *q, const DnsResourceKey *k); +int dns_question_is_equal(DnsQuestion *a, DnsQuestion *b); + +int dns_question_cname_redirect(DnsQuestion *q, const DnsResourceRecord *cname, DnsQuestion **ret); + +void dns_question_dump(DnsQuestion *q, FILE *f); + +const char *dns_question_first_name(DnsQuestion *q); + +static inline DnsResourceKey *dns_question_first_key(DnsQuestion *q) { + return (q && 
q->n_keys > 0) ? q->items[0].key : NULL; +} + +static inline size_t dns_question_size(DnsQuestion *q) { + return q ? q->n_keys : 0; +} + +static inline bool dns_question_isempty(DnsQuestion *q) { + return dns_question_size(q) <= 0; +} + +int dns_question_merge(DnsQuestion *a, DnsQuestion *b, DnsQuestion **ret); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsQuestion*, dns_question_unref); + +#define _DNS_QUESTION_FOREACH(u, k, q) \ + for (size_t UNIQ_T(i, u) = ({ \ + (k) = ((q) && (q)->n_keys > 0) ? (q)->items[0].key : NULL; \ + 0; \ + }); \ + (q) && (UNIQ_T(i, u) < (q)->n_keys); \ + UNIQ_T(i, u)++, (k) = (UNIQ_T(i, u) < (q)->n_keys ? (q)->items[UNIQ_T(i, u)].key : NULL)) + +#define DNS_QUESTION_FOREACH(key, q) _DNS_QUESTION_FOREACH(UNIQ, key, q) + +#define _DNS_QUESTION_FOREACH_ITEM(u, item, q) \ + for (size_t UNIQ_T(i, u) = ({ \ + (item) = dns_question_isempty(q) ? NULL : (q)->items; \ + 0; \ + }); \ + UNIQ_T(i, u) < dns_question_size(q); \ + UNIQ_T(i, u)++, (item) = (UNIQ_T(i, u) < dns_question_size(q) ? 
(q)->items + UNIQ_T(i, u) : NULL)) + +#define DNS_QUESTION_FOREACH_ITEM(item, q) _DNS_QUESTION_FOREACH_ITEM(UNIQ, item, q) diff --git a/src/resolve/resolved-dns-rr.c b/src/resolve/resolved-dns-rr.c new file mode 100644 index 0000000..00f7bea --- /dev/null +++ b/src/resolve/resolved-dns-rr.c @@ -0,0 +1,2159 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "dns-domain.h" +#include "dns-type.h" +#include "escape.h" +#include "hexdecoct.h" +#include "memory-util.h" +#include "resolved-dns-dnssec.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-rr.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +DnsResourceKey* dns_resource_key_new(uint16_t class, uint16_t type, const char *name) { + DnsResourceKey *k; + size_t l; + + assert(name); + + l = strlen(name); + k = malloc0(sizeof(DnsResourceKey) + l + 1); + if (!k) + return NULL; + + k->n_ref = 1; + k->class = class; + k->type = type; + + strcpy((char*) k + sizeof(DnsResourceKey), name); + + return k; +} + +DnsResourceKey* dns_resource_key_new_redirect(const DnsResourceKey *key, const DnsResourceRecord *cname) { + int r; + + assert(key); + assert(cname); + + assert(IN_SET(cname->key->type, DNS_TYPE_CNAME, DNS_TYPE_DNAME)); + + if (cname->key->type == DNS_TYPE_CNAME) + return dns_resource_key_new(key->class, key->type, cname->cname.name); + else { + _cleanup_free_ char *destination = NULL; + DnsResourceKey *k; + + r = dns_name_change_suffix(dns_resource_key_name(key), dns_resource_key_name(cname->key), cname->dname.name, &destination); + if (r < 0) + return NULL; + if (r == 0) + return dns_resource_key_ref((DnsResourceKey*) key); + + k = dns_resource_key_new_consume(key->class, key->type, destination); + if (!k) + return NULL; + + TAKE_PTR(destination); + return k; + } +} + +int dns_resource_key_new_append_suffix(DnsResourceKey **ret, DnsResourceKey *key, char *name) { + DnsResourceKey *new_key; + char 
*joined; + int r; + + assert(ret); + assert(key); + assert(name); + + if (dns_name_is_root(name)) { + *ret = dns_resource_key_ref(key); + return 0; + } + + r = dns_name_concat(dns_resource_key_name(key), name, 0, &joined); + if (r < 0) + return r; + + new_key = dns_resource_key_new_consume(key->class, key->type, joined); + if (!new_key) { + free(joined); + return -ENOMEM; + } + + *ret = new_key; + return 0; +} + +DnsResourceKey* dns_resource_key_new_consume(uint16_t class, uint16_t type, char *name) { + DnsResourceKey *k; + + assert(name); + + k = new(DnsResourceKey, 1); + if (!k) + return NULL; + + *k = (DnsResourceKey) { + .n_ref = 1, + .class = class, + .type = type, + ._name = name, + }; + + return k; +} + +DnsResourceKey* dns_resource_key_ref(DnsResourceKey *k) { + + if (!k) + return NULL; + + /* Static/const keys created with DNS_RESOURCE_KEY_CONST will + * set this to -1, they should not be reffed/unreffed */ + assert(k->n_ref != UINT_MAX); + + assert(k->n_ref > 0); + k->n_ref++; + + return k; +} + +DnsResourceKey* dns_resource_key_unref(DnsResourceKey *k) { + if (!k) + return NULL; + + assert(k->n_ref != UINT_MAX); + assert(k->n_ref > 0); + + if (k->n_ref == 1) { + free(k->_name); + free(k); + } else + k->n_ref--; + + return NULL; +} + +const char* dns_resource_key_name(const DnsResourceKey *key) { + const char *name; + + if (!key) + return NULL; + + if (key->_name) + name = key->_name; + else + name = (char*) key + sizeof(DnsResourceKey); + + if (dns_name_is_root(name)) + return "."; + else + return name; +} + +bool dns_resource_key_is_address(const DnsResourceKey *key) { + assert(key); + + /* Check if this is an A or AAAA resource key */ + + return key->class == DNS_CLASS_IN && IN_SET(key->type, DNS_TYPE_A, DNS_TYPE_AAAA); +} + +bool dns_resource_key_is_dnssd_ptr(const DnsResourceKey *key) { + assert(key); + + /* Check if this is a PTR resource key used in + Service Instance Enumeration as described in RFC6763 p4.1. 
*/ + + if (key->type != DNS_TYPE_PTR) + return false; + + return dns_name_endswith(dns_resource_key_name(key), "_tcp.local") || + dns_name_endswith(dns_resource_key_name(key), "_udp.local"); +} + +int dns_resource_key_equal(const DnsResourceKey *a, const DnsResourceKey *b) { + int r; + + if (a == b) + return 1; + + r = dns_name_equal(dns_resource_key_name(a), dns_resource_key_name(b)); + if (r <= 0) + return r; + + if (a->class != b->class) + return 0; + + if (a->type != b->type) + return 0; + + return 1; +} + +int dns_resource_key_match_rr(const DnsResourceKey *key, DnsResourceRecord *rr, const char *search_domain) { + int r; + + assert(key); + assert(rr); + + if (key == rr->key) + return 1; + + /* Checks if an rr matches the specified key. If a search + * domain is specified, it will also be checked if the key + * with the search domain suffixed might match the RR. */ + + if (rr->key->class != key->class && key->class != DNS_CLASS_ANY) + return 0; + + if (rr->key->type != key->type && key->type != DNS_TYPE_ANY) + return 0; + + r = dns_name_equal(dns_resource_key_name(rr->key), dns_resource_key_name(key)); + if (r != 0) + return r; + + if (search_domain) { + _cleanup_free_ char *joined = NULL; + + r = dns_name_concat(dns_resource_key_name(key), search_domain, 0, &joined); + if (r < 0) + return r; + + return dns_name_equal(dns_resource_key_name(rr->key), joined); + } + + return 0; +} + +int dns_resource_key_match_cname_or_dname(const DnsResourceKey *key, const DnsResourceKey *cname, const char *search_domain) { + int r; + + assert(key); + assert(cname); + + if (cname->class != key->class && key->class != DNS_CLASS_ANY) + return 0; + + if (!dns_type_may_redirect(key->type)) + return 0; + + if (cname->type == DNS_TYPE_CNAME) + r = dns_name_equal(dns_resource_key_name(key), dns_resource_key_name(cname)); + else if (cname->type == DNS_TYPE_DNAME) + r = dns_name_endswith(dns_resource_key_name(key), dns_resource_key_name(cname)); + else + return 0; + + if (r != 0) + 
return r; + + if (search_domain) { + _cleanup_free_ char *joined = NULL; + + r = dns_name_concat(dns_resource_key_name(key), search_domain, 0, &joined); + if (r < 0) + return r; + + if (cname->type == DNS_TYPE_CNAME) + return dns_name_equal(joined, dns_resource_key_name(cname)); + else if (cname->type == DNS_TYPE_DNAME) + return dns_name_endswith(joined, dns_resource_key_name(cname)); + } + + return 0; +} + +int dns_resource_key_match_soa(const DnsResourceKey *key, const DnsResourceKey *soa) { + assert(soa); + assert(key); + + /* Checks whether 'soa' is a SOA record for the specified key. */ + + if (soa->class != key->class) + return 0; + + if (soa->type != DNS_TYPE_SOA) + return 0; + + return dns_name_endswith(dns_resource_key_name(key), dns_resource_key_name(soa)); +} + +static void dns_resource_key_hash_func(const DnsResourceKey *k, struct siphash *state) { + assert(k); + + dns_name_hash_func(dns_resource_key_name(k), state); + siphash24_compress(&k->class, sizeof(k->class), state); + siphash24_compress(&k->type, sizeof(k->type), state); +} + +static int dns_resource_key_compare_func(const DnsResourceKey *x, const DnsResourceKey *y) { + int r; + + r = dns_name_compare_func(dns_resource_key_name(x), dns_resource_key_name(y)); + if (r != 0) + return r; + + r = CMP(x->type, y->type); + if (r != 0) + return r; + + return CMP(x->class, y->class); +} + +DEFINE_HASH_OPS(dns_resource_key_hash_ops, DnsResourceKey, dns_resource_key_hash_func, dns_resource_key_compare_func); + +char* dns_resource_key_to_string(const DnsResourceKey *key, char *buf, size_t buf_size) { + const char *c, *t; + char *ans = buf; + + /* If we cannot convert the CLASS/TYPE into a known string, + use the format recommended by RFC 3597, Section 5. */ + + c = dns_class_to_string(key->class); + t = dns_type_to_string(key->type); + + (void) snprintf(buf, buf_size, "%s %s%s%.0u %s%s%.0u", + dns_resource_key_name(key), + strempty(c), c ? "" : "CLASS", c ? 0u : key->class, + strempty(t), t ? 
"" : "TYPE", t ? 0u : key->type); + + return ans; +} + +bool dns_resource_key_reduce(DnsResourceKey **a, DnsResourceKey **b) { + assert(a); + assert(b); + + /* Try to replace one RR key by another if they are identical, thus saving a bit of memory. Note that we do + * this only for RR keys, not for RRs themselves, as they carry a lot of additional metadata (where they come + * from, validity data, and suchlike), and cannot be replaced so easily by other RRs that have the same + * superficial data. */ + + if (!*a) + return false; + if (!*b) + return false; + + /* We refuse merging const keys */ + if ((*a)->n_ref == UINT_MAX) + return false; + if ((*b)->n_ref == UINT_MAX) + return false; + + /* Already the same? */ + if (*a == *b) + return true; + + /* Are they really identical? */ + if (dns_resource_key_equal(*a, *b) <= 0) + return false; + + /* Keep the one which already has more references. */ + if ((*a)->n_ref > (*b)->n_ref) + DNS_RESOURCE_KEY_REPLACE(*b, dns_resource_key_ref(*a)); + else + DNS_RESOURCE_KEY_REPLACE(*a, dns_resource_key_ref(*b)); + + return true; +} + +DnsResourceRecord* dns_resource_record_new(DnsResourceKey *key) { + DnsResourceRecord *rr; + + rr = new(DnsResourceRecord, 1); + if (!rr) + return NULL; + + *rr = (DnsResourceRecord) { + .n_ref = 1, + .key = dns_resource_key_ref(key), + .expiry = USEC_INFINITY, + .n_skip_labels_signer = UINT8_MAX, + .n_skip_labels_source = UINT8_MAX, + }; + + return rr; +} + +DnsResourceRecord* dns_resource_record_new_full(uint16_t class, uint16_t type, const char *name) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + + key = dns_resource_key_new(class, type, name); + if (!key) + return NULL; + + return dns_resource_record_new(key); +} + +static DnsResourceRecord* dns_resource_record_free(DnsResourceRecord *rr) { + assert(rr); + + if (rr->key) { + switch (rr->key->type) { + + case DNS_TYPE_SRV: + free(rr->srv.name); + break; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + 
case DNS_TYPE_DNAME: + free(rr->ptr.name); + break; + + case DNS_TYPE_HINFO: + free(rr->hinfo.cpu); + free(rr->hinfo.os); + break; + + case DNS_TYPE_TXT: + case DNS_TYPE_SPF: + dns_txt_item_free_all(rr->txt.items); + break; + + case DNS_TYPE_SOA: + free(rr->soa.mname); + free(rr->soa.rname); + break; + + case DNS_TYPE_MX: + free(rr->mx.exchange); + break; + + case DNS_TYPE_DS: + free(rr->ds.digest); + break; + + case DNS_TYPE_SSHFP: + free(rr->sshfp.fingerprint); + break; + + case DNS_TYPE_DNSKEY: + free(rr->dnskey.key); + break; + + case DNS_TYPE_RRSIG: + free(rr->rrsig.signer); + free(rr->rrsig.signature); + break; + + case DNS_TYPE_NSEC: + free(rr->nsec.next_domain_name); + bitmap_free(rr->nsec.types); + break; + + case DNS_TYPE_NSEC3: + free(rr->nsec3.next_hashed_name); + free(rr->nsec3.salt); + bitmap_free(rr->nsec3.types); + break; + + case DNS_TYPE_LOC: + case DNS_TYPE_A: + case DNS_TYPE_AAAA: + break; + + case DNS_TYPE_TLSA: + free(rr->tlsa.data); + break; + + case DNS_TYPE_CAA: + free(rr->caa.tag); + free(rr->caa.value); + break; + + case DNS_TYPE_OPENPGPKEY: + default: + if (!rr->unparsable) + free(rr->generic.data); + } + + if (rr->unparsable) + free(rr->generic.data); + + free(rr->wire_format); + dns_resource_key_unref(rr->key); + } + + free(rr->to_string); + return mfree(rr); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DnsResourceRecord, dns_resource_record, dns_resource_record_free); + +int dns_resource_record_new_reverse(DnsResourceRecord **ret, int family, const union in_addr_union *address, const char *hostname) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + _cleanup_free_ char *ptr = NULL; + int r; + + assert(ret); + assert(address); + assert(hostname); + + r = dns_name_reverse(family, address, &ptr); + if (r < 0) + return r; + + key = dns_resource_key_new_consume(DNS_CLASS_IN, DNS_TYPE_PTR, ptr); + if (!key) + return -ENOMEM; + + ptr = NULL; + + rr = 
dns_resource_record_new(key); + if (!rr) + return -ENOMEM; + + rr->ptr.name = strdup(hostname); + if (!rr->ptr.name) + return -ENOMEM; + + *ret = TAKE_PTR(rr); + + return 0; +} + +int dns_resource_record_new_address(DnsResourceRecord **ret, int family, const union in_addr_union *address, const char *name) { + DnsResourceRecord *rr; + + assert(ret); + assert(address); + assert(family); + + if (family == AF_INET) { + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_A, name); + if (!rr) + return -ENOMEM; + + rr->a.in_addr = address->in; + + } else if (family == AF_INET6) { + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_AAAA, name); + if (!rr) + return -ENOMEM; + + rr->aaaa.in6_addr = address->in6; + } else + return -EAFNOSUPPORT; + + *ret = rr; + + return 0; +} + +#define FIELD_EQUAL(a, b, field) \ + ((a).field ## _size == (b).field ## _size && \ + memcmp_safe((a).field, (b).field, (a).field ## _size) == 0) + +int dns_resource_record_payload_equal(const DnsResourceRecord *a, const DnsResourceRecord *b) { + int r; + + /* Check if a and b are the same, but don't look at their keys */ + + if (a->unparsable != b->unparsable) + return 0; + + switch (a->unparsable ? 
_DNS_TYPE_INVALID : a->key->type) { + + case DNS_TYPE_SRV: + r = dns_name_equal(a->srv.name, b->srv.name); + if (r <= 0) + return r; + + return a->srv.priority == b->srv.priority && + a->srv.weight == b->srv.weight && + a->srv.port == b->srv.port; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + return dns_name_equal(a->ptr.name, b->ptr.name); + + case DNS_TYPE_HINFO: + return strcaseeq(a->hinfo.cpu, b->hinfo.cpu) && + strcaseeq(a->hinfo.os, b->hinfo.os); + + case DNS_TYPE_SPF: /* exactly the same as TXT */ + case DNS_TYPE_TXT: + return dns_txt_item_equal(a->txt.items, b->txt.items); + + case DNS_TYPE_A: + return memcmp(&a->a.in_addr, &b->a.in_addr, sizeof(struct in_addr)) == 0; + + case DNS_TYPE_AAAA: + return memcmp(&a->aaaa.in6_addr, &b->aaaa.in6_addr, sizeof(struct in6_addr)) == 0; + + case DNS_TYPE_SOA: + r = dns_name_equal(a->soa.mname, b->soa.mname); + if (r <= 0) + return r; + r = dns_name_equal(a->soa.rname, b->soa.rname); + if (r <= 0) + return r; + + return a->soa.serial == b->soa.serial && + a->soa.refresh == b->soa.refresh && + a->soa.retry == b->soa.retry && + a->soa.expire == b->soa.expire && + a->soa.minimum == b->soa.minimum; + + case DNS_TYPE_MX: + if (a->mx.priority != b->mx.priority) + return 0; + + return dns_name_equal(a->mx.exchange, b->mx.exchange); + + case DNS_TYPE_LOC: + assert(a->loc.version == b->loc.version); + + return a->loc.size == b->loc.size && + a->loc.horiz_pre == b->loc.horiz_pre && + a->loc.vert_pre == b->loc.vert_pre && + a->loc.latitude == b->loc.latitude && + a->loc.longitude == b->loc.longitude && + a->loc.altitude == b->loc.altitude; + + case DNS_TYPE_DS: + return a->ds.key_tag == b->ds.key_tag && + a->ds.algorithm == b->ds.algorithm && + a->ds.digest_type == b->ds.digest_type && + FIELD_EQUAL(a->ds, b->ds, digest); + + case DNS_TYPE_SSHFP: + return a->sshfp.algorithm == b->sshfp.algorithm && + a->sshfp.fptype == b->sshfp.fptype && + FIELD_EQUAL(a->sshfp, b->sshfp, fingerprint); + 
+ case DNS_TYPE_DNSKEY: + return a->dnskey.flags == b->dnskey.flags && + a->dnskey.protocol == b->dnskey.protocol && + a->dnskey.algorithm == b->dnskey.algorithm && + FIELD_EQUAL(a->dnskey, b->dnskey, key); + + case DNS_TYPE_RRSIG: + /* do the fast comparisons first */ + return a->rrsig.type_covered == b->rrsig.type_covered && + a->rrsig.algorithm == b->rrsig.algorithm && + a->rrsig.labels == b->rrsig.labels && + a->rrsig.original_ttl == b->rrsig.original_ttl && + a->rrsig.expiration == b->rrsig.expiration && + a->rrsig.inception == b->rrsig.inception && + a->rrsig.key_tag == b->rrsig.key_tag && + FIELD_EQUAL(a->rrsig, b->rrsig, signature) && + dns_name_equal(a->rrsig.signer, b->rrsig.signer); + + case DNS_TYPE_NSEC: + return dns_name_equal(a->nsec.next_domain_name, b->nsec.next_domain_name) && + bitmap_equal(a->nsec.types, b->nsec.types); + + case DNS_TYPE_NSEC3: + return a->nsec3.algorithm == b->nsec3.algorithm && + a->nsec3.flags == b->nsec3.flags && + a->nsec3.iterations == b->nsec3.iterations && + FIELD_EQUAL(a->nsec3, b->nsec3, salt) && + FIELD_EQUAL(a->nsec3, b->nsec3, next_hashed_name) && + bitmap_equal(a->nsec3.types, b->nsec3.types); + + case DNS_TYPE_TLSA: + return a->tlsa.cert_usage == b->tlsa.cert_usage && + a->tlsa.selector == b->tlsa.selector && + a->tlsa.matching_type == b->tlsa.matching_type && + FIELD_EQUAL(a->tlsa, b->tlsa, data); + + case DNS_TYPE_CAA: + return a->caa.flags == b->caa.flags && + streq(a->caa.tag, b->caa.tag) && + FIELD_EQUAL(a->caa, b->caa, value); + + case DNS_TYPE_OPENPGPKEY: + default: + return FIELD_EQUAL(a->generic, b->generic, data); + } +} + +
/* Compares two resource records.
 *
 * The same object compares equal right away (1). Otherwise the keys are compared first and any
 * result <= 0 of that comparison is handed back unchanged; only if the keys match is the verdict of
 * dns_resource_record_payload_equal() returned. */
int dns_resource_record_equal(const DnsResourceRecord *a, const DnsResourceRecord *b) {
        int r;

        assert(a);
        assert(b);

        if (a != b) {
                r = dns_resource_key_equal(a->key, b->key);
                if (r <= 0)
                        return r;

                return dns_resource_record_payload_equal(a, b);
        }

        return 1;
}

/* Formats the fields of a LOC record as
 *
 *     <deg> <min> <sec> N|S <deg> <min> <sec> E|W <altitude>m <size>m <horiz. precision>m <vert. precision>m
 *
 * latitude/longitude are counted in thousandths of an arc second around an origin of 2^31 (values at
 * or above the origin are printed as N resp. E). altitude is in centimeters with an offset of
 * 10000000. size, horiz_pre and vert_pre each carry a mantissa in the high nibble and a power-of-ten
 * exponent in the low nibble, in centimeters.
 *
 * Returns a newly allocated string, or NULL if asprintf() fails. */
static char* format_location(uint32_t latitude, uint32_t longitude, uint32_t altitude,
                             uint8_t size, uint8_t horiz_pre, uint8_t vert_pre) {
        const uint32_t origin = UINT32_C(1) << 31;
        const uint32_t altitude_base = 10000000u;
        bool north = latitude >= origin, east = longitude >= origin;
        char *s;

        /* Distance from the origin. This is kept unsigned on purpose: for a raw value of 0 the
         * distance is exactly 2^31, which does not fit into a signed int and used to show up as a
         * negative number of degrees. */
        unsigned lat = north ? latitude - origin : origin - latitude;
        unsigned lon = east ? longitude - origin : origin - longitude;

        double alt = altitude >= altitude_base ? altitude - altitude_base : -(double) (altitude_base - altitude);
        double siz = (size >> 4) * exp10((double) (size & 0xF));
        double hor = (horiz_pre >> 4) * exp10((double) (horiz_pre & 0xF));
        double ver = (vert_pre >> 4) * exp10((double) (vert_pre & 0xF));

        if (asprintf(&s, "%u %u %.3f %c %u %u %.3f %c %.2fm %.2fm %.2fm %.2fm",
                     (lat / 60000 / 60),
                     (lat / 60000) % 60,
                     (lat % 60000) / 1000.,
                     north ? 'N' : 'S',
                     (lon / 60000 / 60),
                     (lon / 60000) % 60,
                     (lon % 60000) / 1000.,
                     east ? 'E' : 'W',
                     alt / 100.,
                     siz / 100.,
                     hor / 100.,
                     ver / 100.) < 0)
                return NULL;

        return s;
}

/* Writes the UTC broken-down form of 'sec' into 'buf' using the fixed-width "YYYYMMDDHHmmSS" layout.
 * 'l' is the size of 'buf' and must be larger than the length of that pattern.
 *
 * Returns 0 on success and -EINVAL if the time cannot be broken down or does not fit the buffer. */
static int format_timestamp_dns(char *buf, size_t l, time_t sec) {
        struct tm tm;

        assert(buf);
        assert(l > STRLEN("YYYYMMDDHHmmSS"));

        if (!gmtime_r(&sec, &tm))
                return -EINVAL;

        /* strftime() returns a size_t; 0 is its only failure indication */
        if (strftime(buf, l, "%Y%m%d%H%M%S", &tm) == 0)
                return -EINVAL;

        return 0;
}

/* Turns a bitmap of RR types into "( A B C )". Types dns_type_to_string() has no name for are written
 * as "TYPE<number>". Returns a newly allocated string, or NULL if any of the allocations fails. */
static char *format_types(Bitmap *types) {
        _cleanup_strv_free_ char **names = NULL;
        _cleanup_free_ char *joined = NULL;
        unsigned type;

        BITMAP_FOREACH(type, types) {
                const char *known = dns_type_to_string(type);

                if (known) {
                        if (strv_extend(&names, known) < 0)
                                return NULL;
                } else {
                        char *generic;

                        if (asprintf(&generic, "TYPE%u", type) < 0)
                                return NULL;

                        /* strv_consume() takes over 'generic' */
                        if (strv_consume(&names, generic) < 0)
                                return NULL;
                }
        }

        joined = strv_join(names, " ");
        if (!joined)
                return NULL;

        return strjoin("( ", joined, " )");
}

+static char *format_txt(DnsTxtItem *first) { + size_t c = 1; + char *p, *s; + + LIST_FOREACH(items, i, first) + c += i->length * 4 + 3; + + p = s = new(char, c); + if (!s) + return NULL; + + LIST_FOREACH(items, i, first) { + if 
(i != first) + *(p++) = ' '; + + *(p++) = '"'; + + for (size_t j = 0; j < i->length; j++) { + if (i->data[j] < ' ' || i->data[j] == '"' || i->data[j] >= 127) { + *(p++) = '\\'; + *(p++) = '0' + (i->data[j] / 100); + *(p++) = '0' + ((i->data[j] / 10) % 10); + *(p++) = '0' + (i->data[j] % 10); + } else + *(p++) = i->data[j]; + } + + *(p++) = '"'; + } + + *p = 0; + return s; +} + +const char *dns_resource_record_to_string(DnsResourceRecord *rr) { + _cleanup_free_ char *s = NULL, *t = NULL; + char k[DNS_RESOURCE_KEY_STRING_MAX]; + int r; + + assert(rr); + + if (rr->to_string) + return rr->to_string; + + dns_resource_key_to_string(rr->key, k, sizeof(k)); + + switch (rr->unparsable ? _DNS_TYPE_INVALID : rr->key->type) { + + case DNS_TYPE_SRV: + r = asprintf(&s, "%s %u %u %u %s", + k, + rr->srv.priority, + rr->srv.weight, + rr->srv.port, + strna(rr->srv.name)); + if (r < 0) + return NULL; + break; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + s = strjoin(k, " ", rr->ptr.name); + if (!s) + return NULL; + + break; + + case DNS_TYPE_HINFO: + s = strjoin(k, " ", rr->hinfo.cpu, " ", rr->hinfo.os); + if (!s) + return NULL; + break; + + case DNS_TYPE_SPF: /* exactly the same as TXT */ + case DNS_TYPE_TXT: + t = format_txt(rr->txt.items); + if (!t) + return NULL; + + s = strjoin(k, " ", t); + if (!s) + return NULL; + break; + + case DNS_TYPE_A: + r = in_addr_to_string(AF_INET, (const union in_addr_union*) &rr->a.in_addr, &t); + if (r < 0) + return NULL; + + s = strjoin(k, " ", t); + if (!s) + return NULL; + break; + + case DNS_TYPE_AAAA: + r = in_addr_to_string(AF_INET6, (const union in_addr_union*) &rr->aaaa.in6_addr, &t); + if (r < 0) + return NULL; + + s = strjoin(k, " ", t); + if (!s) + return NULL; + break; + + case DNS_TYPE_SOA: + r = asprintf(&s, "%s %s %s %u %u %u %u %u", + k, + strna(rr->soa.mname), + strna(rr->soa.rname), + rr->soa.serial, + rr->soa.refresh, + rr->soa.retry, + rr->soa.expire, + rr->soa.minimum); + if (r < 0) 
+ return NULL; + break; + + case DNS_TYPE_MX: + r = asprintf(&s, "%s %u %s", + k, + rr->mx.priority, + rr->mx.exchange); + if (r < 0) + return NULL; + break; + + case DNS_TYPE_LOC: + assert(rr->loc.version == 0); + + t = format_location(rr->loc.latitude, + rr->loc.longitude, + rr->loc.altitude, + rr->loc.size, + rr->loc.horiz_pre, + rr->loc.vert_pre); + if (!t) + return NULL; + + s = strjoin(k, " ", t); + if (!s) + return NULL; + break; + + case DNS_TYPE_DS: + t = hexmem(rr->ds.digest, rr->ds.digest_size); + if (!t) + return NULL; + + r = asprintf(&s, "%s %u %u %u %s", + k, + rr->ds.key_tag, + rr->ds.algorithm, + rr->ds.digest_type, + t); + if (r < 0) + return NULL; + break; + + case DNS_TYPE_SSHFP: + t = hexmem(rr->sshfp.fingerprint, rr->sshfp.fingerprint_size); + if (!t) + return NULL; + + r = asprintf(&s, "%s %u %u %s", + k, + rr->sshfp.algorithm, + rr->sshfp.fptype, + t); + if (r < 0) + return NULL; + break; + + case DNS_TYPE_DNSKEY: { + _cleanup_free_ char *alg = NULL; + uint16_t key_tag; + + key_tag = dnssec_keytag(rr, true); + + r = dnssec_algorithm_to_string_alloc(rr->dnskey.algorithm, &alg); + if (r < 0) + return NULL; + + r = asprintf(&t, "%s %u %u %s", + k, + rr->dnskey.flags, + rr->dnskey.protocol, + alg); + if (r < 0) + return NULL; + + r = base64_append(&t, r, + rr->dnskey.key, rr->dnskey.key_size, + 8, columns()); + if (r < 0) + return NULL; + + r = asprintf(&s, "%s\n" + " -- Flags:%s%s%s\n" + " -- Key tag: %u", + t, + rr->dnskey.flags & DNSKEY_FLAG_SEP ? " SEP" : "", + rr->dnskey.flags & DNSKEY_FLAG_REVOKE ? " REVOKE" : "", + rr->dnskey.flags & DNSKEY_FLAG_ZONE_KEY ? 
" ZONE_KEY" : "", + key_tag); + if (r < 0) + return NULL; + + break; + } + + case DNS_TYPE_RRSIG: { + _cleanup_free_ char *alg = NULL; + char expiration[STRLEN("YYYYMMDDHHmmSS") + 1], inception[STRLEN("YYYYMMDDHHmmSS") + 1]; + const char *type; + + type = dns_type_to_string(rr->rrsig.type_covered); + + r = dnssec_algorithm_to_string_alloc(rr->rrsig.algorithm, &alg); + if (r < 0) + return NULL; + + r = format_timestamp_dns(expiration, sizeof(expiration), rr->rrsig.expiration); + if (r < 0) + return NULL; + + r = format_timestamp_dns(inception, sizeof(inception), rr->rrsig.inception); + if (r < 0) + return NULL; + + /* TYPE?? follows + * http://tools.ietf.org/html/rfc3597#section-5 */ + + r = asprintf(&s, "%s %s%.*u %s %u %u %s %s %u %s", + k, + type ?: "TYPE", + type ? 0 : 1, type ? 0u : (unsigned) rr->rrsig.type_covered, + alg, + rr->rrsig.labels, + rr->rrsig.original_ttl, + expiration, + inception, + rr->rrsig.key_tag, + rr->rrsig.signer); + if (r < 0) + return NULL; + + r = base64_append(&s, r, + rr->rrsig.signature, rr->rrsig.signature_size, + 8, columns()); + if (r < 0) + return NULL; + + break; + } + + case DNS_TYPE_NSEC: + t = format_types(rr->nsec.types); + if (!t) + return NULL; + + r = asprintf(&s, "%s %s %s", + k, + rr->nsec.next_domain_name, + t); + if (r < 0) + return NULL; + break; + + case DNS_TYPE_NSEC3: { + _cleanup_free_ char *salt = NULL, *hash = NULL; + + if (rr->nsec3.salt_size > 0) { + salt = hexmem(rr->nsec3.salt, rr->nsec3.salt_size); + if (!salt) + return NULL; + } + + hash = base32hexmem(rr->nsec3.next_hashed_name, rr->nsec3.next_hashed_name_size, false); + if (!hash) + return NULL; + + t = format_types(rr->nsec3.types); + if (!t) + return NULL; + + r = asprintf(&s, "%s %"PRIu8" %"PRIu8" %"PRIu16" %s %s %s", + k, + rr->nsec3.algorithm, + rr->nsec3.flags, + rr->nsec3.iterations, + rr->nsec3.salt_size > 0 ? 
salt : "-", + hash, + t); + if (r < 0) + return NULL; + + break; + } + + case DNS_TYPE_TLSA: + t = hexmem(rr->tlsa.data, rr->tlsa.data_size); + if (!t) + return NULL; + + r = asprintf(&s, + "%s %u %u %u %s\n" + " -- Cert. usage: %s\n" + " -- Selector: %s\n" + " -- Matching type: %s", + k, + rr->tlsa.cert_usage, + rr->tlsa.selector, + rr->tlsa.matching_type, + t, + tlsa_cert_usage_to_string(rr->tlsa.cert_usage), + tlsa_selector_to_string(rr->tlsa.selector), + tlsa_matching_type_to_string(rr->tlsa.matching_type)); + if (r < 0) + return NULL; + + break; + + case DNS_TYPE_CAA: + t = octescape(rr->caa.value, rr->caa.value_size); + if (!t) + return NULL; + + r = asprintf(&s, "%s %u %s \"%s\"%s%s%s%.0u", + k, + rr->caa.flags, + rr->caa.tag, + t, + rr->caa.flags ? "\n -- Flags:" : "", + rr->caa.flags & CAA_FLAG_CRITICAL ? " critical" : "", + rr->caa.flags & ~CAA_FLAG_CRITICAL ? " " : "", + rr->caa.flags & ~CAA_FLAG_CRITICAL); + if (r < 0) + return NULL; + + break; + + case DNS_TYPE_OPENPGPKEY: + r = asprintf(&s, "%s", k); + if (r < 0) + return NULL; + + r = base64_append(&s, r, + rr->generic.data, rr->generic.data_size, + 8, columns()); + if (r < 0) + return NULL; + break; + + default: + /* Format as documented in RFC 3597, Section 5 */ + if (rr->generic.data_size == 0) + r = asprintf(&s, "%s \\# 0", k); + else { + t = hexmem(rr->generic.data, rr->generic.data_size); + if (!t) + return NULL; + r = asprintf(&s, "%s \\# %zu %s", k, rr->generic.data_size, t); + } + if (r < 0) + return NULL; + break; + } + + rr->to_string = s; + return TAKE_PTR(s); +} + +ssize_t dns_resource_record_payload(DnsResourceRecord *rr, void **out) { + assert(rr); + assert(out); + + switch (rr->unparsable ? 
_DNS_TYPE_INVALID : rr->key->type) { + case DNS_TYPE_SRV: + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + case DNS_TYPE_HINFO: + case DNS_TYPE_SPF: + case DNS_TYPE_TXT: + case DNS_TYPE_A: + case DNS_TYPE_AAAA: + case DNS_TYPE_SOA: + case DNS_TYPE_MX: + case DNS_TYPE_LOC: + case DNS_TYPE_DS: + case DNS_TYPE_DNSKEY: + case DNS_TYPE_RRSIG: + case DNS_TYPE_NSEC: + case DNS_TYPE_NSEC3: + return -EINVAL; + + case DNS_TYPE_SSHFP: + *out = rr->sshfp.fingerprint; + return rr->sshfp.fingerprint_size; + + case DNS_TYPE_TLSA: + *out = rr->tlsa.data; + return rr->tlsa.data_size; + + case DNS_TYPE_OPENPGPKEY: + default: + *out = rr->generic.data; + return rr->generic.data_size; + } +} + +int dns_resource_record_to_wire_format(DnsResourceRecord *rr, bool canonical) { + + _cleanup_(dns_packet_unref) DnsPacket packet = { + .n_ref = 1, + .protocol = DNS_PROTOCOL_DNS, + .on_stack = true, + .refuse_compression = true, + .canonical_form = canonical, + }; + + size_t start, rds; + int r; + + assert(rr); + + /* Generates the RR in wire-format, optionally in the + * canonical form as discussed in the DNSSEC RFC 4034, Section + * 6.2. We allocate a throw-away DnsPacket object on the stack + * here, because we need some book-keeping for memory + * management, and can reuse the DnsPacket serializer, that + * can generate the canonical form, too, but also knows label + * compression and suchlike. 
*/ + + if (rr->wire_format && rr->wire_format_canonical == canonical) + return 0; + + r = dns_packet_append_rr(&packet, rr, 0, &start, &rds); + if (r < 0) + return r; + + assert(start == 0); + assert(packet._data); + + free(rr->wire_format); + rr->wire_format = TAKE_PTR(packet._data); + rr->wire_format_size = packet.size; + rr->wire_format_rdata_offset = rds; + rr->wire_format_canonical = canonical; + + return 0; +} + +
/* Common worker of dns_resource_record_signer() and dns_resource_record_source(): skips the first
 * 'n_skip' labels of the RR's key name and stores a pointer to the remainder in *ret.
 *
 * Returns -ENODATA if 'n_skip' is UINT8_MAX, -EINVAL if dns_name_skip() returns 0, any negative
 * dns_name_skip() result as is, and 0 on success. */
static int dns_resource_record_skip_labels(DnsResourceRecord *rr, unsigned n_skip, const char **ret) {
        const char *name;
        int r;

        if (n_skip == UINT8_MAX)
                return -ENODATA;

        name = dns_resource_key_name(rr->key);

        r = dns_name_skip(name, n_skip, &name);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        *ret = name;
        return 0;
}

/* Returns the RRset's signer, if it is known. */
int dns_resource_record_signer(DnsResourceRecord *rr, const char **ret) {
        assert(rr);
        assert(ret);

        return dns_resource_record_skip_labels(rr, rr->n_skip_labels_signer, ret);
}

/* Returns the RRset's synthesizing source, if it is known. */
int dns_resource_record_source(DnsResourceRecord *rr, const char **ret) {
        assert(rr);
        assert(ret);

        return dns_resource_record_skip_labels(rr, rr->n_skip_labels_source, ret);
}

/* Compares 'zone' against the signer of 'rr'. A failure to determine the signer is passed on;
 * otherwise the result of dns_name_equal() is returned. */
int dns_resource_record_is_signer(DnsResourceRecord *rr, const char *zone) {
        const char *signer;
        int r;

        assert(rr);

        r = dns_resource_record_signer(rr, &signer);

        return r < 0 ? r : dns_name_equal(zone, signer);
}

/* Returns > 0 if the RR is generated from a wildcard, and is not the asterisk name itself */
int dns_resource_record_is_synthetic(DnsResourceRecord *rr) {
        int r;

        assert(rr);

        switch (rr->n_skip_labels_source) {

        case UINT8_MAX:
                return -ENODATA;

        case 0:
                return 0;

        case 1:
                /* Exactly one label skipped: decided by the check for a leading "*" below */
                break;

        default:
                return 1;
        }

        r = dns_name_startswith(dns_resource_key_name(rr->key), "*");
        if (r < 0)
                return r;

        return !r;
}

+void dns_resource_record_hash_func(const DnsResourceRecord *rr, struct siphash *state) { + assert(rr); + + dns_resource_key_hash_func(rr->key, state); + + switch (rr->unparsable ? _DNS_TYPE_INVALID : rr->key->type) { + + case DNS_TYPE_SRV: + siphash24_compress(&rr->srv.priority, sizeof(rr->srv.priority), state); + siphash24_compress(&rr->srv.weight, sizeof(rr->srv.weight), state); + siphash24_compress(&rr->srv.port, sizeof(rr->srv.port), state); + dns_name_hash_func(rr->srv.name, state); + break; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + dns_name_hash_func(rr->ptr.name, state); + break; + + case DNS_TYPE_HINFO: + string_hash_func(rr->hinfo.cpu, state); + string_hash_func(rr->hinfo.os, state); + break; + + case DNS_TYPE_TXT: + case DNS_TYPE_SPF: { + LIST_FOREACH(items, j, rr->txt.items) { + siphash24_compress_safe(j->data, j->length, state); + + /* Add an extra NUL byte, so that "a" followed by "b" doesn't result in the same hash as "ab" + * followed by "". 
*/ + siphash24_compress_byte(0, state); + } + break; + } + + case DNS_TYPE_A: + siphash24_compress(&rr->a.in_addr, sizeof(rr->a.in_addr), state); + break; + + case DNS_TYPE_AAAA: + siphash24_compress(&rr->aaaa.in6_addr, sizeof(rr->aaaa.in6_addr), state); + break; + + case DNS_TYPE_SOA: + dns_name_hash_func(rr->soa.mname, state); + dns_name_hash_func(rr->soa.rname, state); + siphash24_compress(&rr->soa.serial, sizeof(rr->soa.serial), state); + siphash24_compress(&rr->soa.refresh, sizeof(rr->soa.refresh), state); + siphash24_compress(&rr->soa.retry, sizeof(rr->soa.retry), state); + siphash24_compress(&rr->soa.expire, sizeof(rr->soa.expire), state); + siphash24_compress(&rr->soa.minimum, sizeof(rr->soa.minimum), state); + break; + + case DNS_TYPE_MX: + siphash24_compress(&rr->mx.priority, sizeof(rr->mx.priority), state); + dns_name_hash_func(rr->mx.exchange, state); + break; + + case DNS_TYPE_LOC: + siphash24_compress(&rr->loc.version, sizeof(rr->loc.version), state); + siphash24_compress(&rr->loc.size, sizeof(rr->loc.size), state); + siphash24_compress(&rr->loc.horiz_pre, sizeof(rr->loc.horiz_pre), state); + siphash24_compress(&rr->loc.vert_pre, sizeof(rr->loc.vert_pre), state); + siphash24_compress(&rr->loc.latitude, sizeof(rr->loc.latitude), state); + siphash24_compress(&rr->loc.longitude, sizeof(rr->loc.longitude), state); + siphash24_compress(&rr->loc.altitude, sizeof(rr->loc.altitude), state); + break; + + case DNS_TYPE_SSHFP: + siphash24_compress(&rr->sshfp.algorithm, sizeof(rr->sshfp.algorithm), state); + siphash24_compress(&rr->sshfp.fptype, sizeof(rr->sshfp.fptype), state); + siphash24_compress_safe(rr->sshfp.fingerprint, rr->sshfp.fingerprint_size, state); + break; + + case DNS_TYPE_DNSKEY: + siphash24_compress(&rr->dnskey.flags, sizeof(rr->dnskey.flags), state); + siphash24_compress(&rr->dnskey.protocol, sizeof(rr->dnskey.protocol), state); + siphash24_compress(&rr->dnskey.algorithm, sizeof(rr->dnskey.algorithm), state); + 
siphash24_compress_safe(rr->dnskey.key, rr->dnskey.key_size, state); + break; + + case DNS_TYPE_RRSIG: + siphash24_compress(&rr->rrsig.type_covered, sizeof(rr->rrsig.type_covered), state); + siphash24_compress(&rr->rrsig.algorithm, sizeof(rr->rrsig.algorithm), state); + siphash24_compress(&rr->rrsig.labels, sizeof(rr->rrsig.labels), state); + siphash24_compress(&rr->rrsig.original_ttl, sizeof(rr->rrsig.original_ttl), state); + siphash24_compress(&rr->rrsig.expiration, sizeof(rr->rrsig.expiration), state); + siphash24_compress(&rr->rrsig.inception, sizeof(rr->rrsig.inception), state); + siphash24_compress(&rr->rrsig.key_tag, sizeof(rr->rrsig.key_tag), state); + dns_name_hash_func(rr->rrsig.signer, state); + siphash24_compress_safe(rr->rrsig.signature, rr->rrsig.signature_size, state); + break; + + case DNS_TYPE_NSEC: + dns_name_hash_func(rr->nsec.next_domain_name, state); + /* FIXME: we leave out the type bitmap here. Hash + * would be better if we'd take it into account + * too. */ + break; + + case DNS_TYPE_DS: + siphash24_compress(&rr->ds.key_tag, sizeof(rr->ds.key_tag), state); + siphash24_compress(&rr->ds.algorithm, sizeof(rr->ds.algorithm), state); + siphash24_compress(&rr->ds.digest_type, sizeof(rr->ds.digest_type), state); + siphash24_compress_safe(rr->ds.digest, rr->ds.digest_size, state); + break; + + case DNS_TYPE_NSEC3: + siphash24_compress(&rr->nsec3.algorithm, sizeof(rr->nsec3.algorithm), state); + siphash24_compress(&rr->nsec3.flags, sizeof(rr->nsec3.flags), state); + siphash24_compress(&rr->nsec3.iterations, sizeof(rr->nsec3.iterations), state); + siphash24_compress_safe(rr->nsec3.salt, rr->nsec3.salt_size, state); + siphash24_compress_safe(rr->nsec3.next_hashed_name, rr->nsec3.next_hashed_name_size, state); + /* FIXME: We leave the bitmaps out */ + break; + + case DNS_TYPE_TLSA: + siphash24_compress(&rr->tlsa.cert_usage, sizeof(rr->tlsa.cert_usage), state); + siphash24_compress(&rr->tlsa.selector, sizeof(rr->tlsa.selector), state); + 
siphash24_compress(&rr->tlsa.matching_type, sizeof(rr->tlsa.matching_type), state); + siphash24_compress_safe(rr->tlsa.data, rr->tlsa.data_size, state); + break; + + case DNS_TYPE_CAA: + siphash24_compress(&rr->caa.flags, sizeof(rr->caa.flags), state); + string_hash_func(rr->caa.tag, state); + siphash24_compress_safe(rr->caa.value, rr->caa.value_size, state); + break; + + case DNS_TYPE_OPENPGPKEY: + default: + siphash24_compress_safe(rr->generic.data, rr->generic.data_size, state); + break; + } +} + +int dns_resource_record_compare_func(const DnsResourceRecord *x, const DnsResourceRecord *y) { + int r; + + r = dns_resource_key_compare_func(x->key, y->key); + if (r != 0) + return r; + + if (dns_resource_record_payload_equal(x, y) > 0) + return 0; + + /* We still use CMP() here, even though don't implement proper + * ordering, since the hashtable doesn't need ordering anyway. */ + return CMP(x, y); +} + +DEFINE_HASH_OPS(dns_resource_record_hash_ops, DnsResourceRecord, dns_resource_record_hash_func, dns_resource_record_compare_func); + +DnsResourceRecord *dns_resource_record_copy(DnsResourceRecord *rr) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *copy = NULL; + DnsResourceRecord *t; + + assert(rr); + + copy = dns_resource_record_new(rr->key); + if (!copy) + return NULL; + + copy->ttl = rr->ttl; + copy->expiry = rr->expiry; + copy->n_skip_labels_signer = rr->n_skip_labels_signer; + copy->n_skip_labels_source = rr->n_skip_labels_source; + copy->unparsable = rr->unparsable; + + switch (rr->unparsable ? 
_DNS_TYPE_INVALID : rr->key->type) { + + case DNS_TYPE_SRV: + copy->srv.priority = rr->srv.priority; + copy->srv.weight = rr->srv.weight; + copy->srv.port = rr->srv.port; + copy->srv.name = strdup(rr->srv.name); + if (!copy->srv.name) + return NULL; + break; + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + copy->ptr.name = strdup(rr->ptr.name); + if (!copy->ptr.name) + return NULL; + break; + + case DNS_TYPE_HINFO: + copy->hinfo.cpu = strdup(rr->hinfo.cpu); + if (!copy->hinfo.cpu) + return NULL; + + copy->hinfo.os = strdup(rr->hinfo.os); + if (!copy->hinfo.os) + return NULL; + break; + + case DNS_TYPE_TXT: + case DNS_TYPE_SPF: + copy->txt.items = dns_txt_item_copy(rr->txt.items); + if (!copy->txt.items) + return NULL; + break; + + case DNS_TYPE_A: + copy->a = rr->a; + break; + + case DNS_TYPE_AAAA: + copy->aaaa = rr->aaaa; + break; + + case DNS_TYPE_SOA: + copy->soa.mname = strdup(rr->soa.mname); + if (!copy->soa.mname) + return NULL; + copy->soa.rname = strdup(rr->soa.rname); + if (!copy->soa.rname) + return NULL; + copy->soa.serial = rr->soa.serial; + copy->soa.refresh = rr->soa.refresh; + copy->soa.retry = rr->soa.retry; + copy->soa.expire = rr->soa.expire; + copy->soa.minimum = rr->soa.minimum; + break; + + case DNS_TYPE_MX: + copy->mx.priority = rr->mx.priority; + copy->mx.exchange = strdup(rr->mx.exchange); + if (!copy->mx.exchange) + return NULL; + break; + + case DNS_TYPE_LOC: + copy->loc = rr->loc; + break; + + case DNS_TYPE_SSHFP: + copy->sshfp.algorithm = rr->sshfp.algorithm; + copy->sshfp.fptype = rr->sshfp.fptype; + copy->sshfp.fingerprint = memdup(rr->sshfp.fingerprint, rr->sshfp.fingerprint_size); + if (!copy->sshfp.fingerprint) + return NULL; + copy->sshfp.fingerprint_size = rr->sshfp.fingerprint_size; + break; + + case DNS_TYPE_DNSKEY: + copy->dnskey.flags = rr->dnskey.flags; + copy->dnskey.protocol = rr->dnskey.protocol; + copy->dnskey.algorithm = rr->dnskey.algorithm; + copy->dnskey.key = 
memdup(rr->dnskey.key, rr->dnskey.key_size); + if (!copy->dnskey.key) + return NULL; + copy->dnskey.key_size = rr->dnskey.key_size; + break; + + case DNS_TYPE_RRSIG: + copy->rrsig.type_covered = rr->rrsig.type_covered; + copy->rrsig.algorithm = rr->rrsig.algorithm; + copy->rrsig.labels = rr->rrsig.labels; + copy->rrsig.original_ttl = rr->rrsig.original_ttl; + copy->rrsig.expiration = rr->rrsig.expiration; + copy->rrsig.inception = rr->rrsig.inception; + copy->rrsig.key_tag = rr->rrsig.key_tag; + copy->rrsig.signer = strdup(rr->rrsig.signer); + if (!copy->rrsig.signer) + return NULL; + copy->rrsig.signature = memdup(rr->rrsig.signature, rr->rrsig.signature_size); + if (!copy->rrsig.signature) + return NULL; + copy->rrsig.signature_size = rr->rrsig.signature_size; + break; + + case DNS_TYPE_NSEC: + copy->nsec.next_domain_name = strdup(rr->nsec.next_domain_name); + if (!copy->nsec.next_domain_name) + return NULL; + if (rr->nsec.types) { + copy->nsec.types = bitmap_copy(rr->nsec.types); + if (!copy->nsec.types) + return NULL; + } + break; + + case DNS_TYPE_DS: + copy->ds.key_tag = rr->ds.key_tag; + copy->ds.algorithm = rr->ds.algorithm; + copy->ds.digest_type = rr->ds.digest_type; + copy->ds.digest = memdup(rr->ds.digest, rr->ds.digest_size); + if (!copy->ds.digest) + return NULL; + copy->ds.digest_size = rr->ds.digest_size; + break; + + case DNS_TYPE_NSEC3: + copy->nsec3.algorithm = rr->nsec3.algorithm; + copy->nsec3.flags = rr->nsec3.flags; + copy->nsec3.iterations = rr->nsec3.iterations; + copy->nsec3.salt = memdup(rr->nsec3.salt, rr->nsec3.salt_size); + if (!copy->nsec3.salt) + return NULL; + copy->nsec3.salt_size = rr->nsec3.salt_size; + copy->nsec3.next_hashed_name = memdup(rr->nsec3.next_hashed_name, rr->nsec3.next_hashed_name_size); + if (!copy->nsec3.next_hashed_name) + return NULL; + copy->nsec3.next_hashed_name_size = rr->nsec3.next_hashed_name_size; + if (rr->nsec3.types) { + copy->nsec3.types = bitmap_copy(rr->nsec3.types); + if (!copy->nsec3.types) + 
return NULL; + } + break; + + case DNS_TYPE_TLSA: + copy->tlsa.cert_usage = rr->tlsa.cert_usage; + copy->tlsa.selector = rr->tlsa.selector; + copy->tlsa.matching_type = rr->tlsa.matching_type; + copy->tlsa.data = memdup(rr->tlsa.data, rr->tlsa.data_size); + if (!copy->tlsa.data) + return NULL; + copy->tlsa.data_size = rr->tlsa.data_size; + break; + + case DNS_TYPE_CAA: + copy->caa.flags = rr->caa.flags; + copy->caa.tag = strdup(rr->caa.tag); + if (!copy->caa.tag) + return NULL; + copy->caa.value = memdup(rr->caa.value, rr->caa.value_size); + if (!copy->caa.value) + return NULL; + copy->caa.value_size = rr->caa.value_size; + break; + + case DNS_TYPE_OPT: + default: + copy->generic.data = memdup(rr->generic.data, rr->generic.data_size); + if (!copy->generic.data) + return NULL; + copy->generic.data_size = rr->generic.data_size; + break; + } + + t = TAKE_PTR(copy); + + return t; +} + +
/* Limits the TTL of *rr to at most 'max_ttl'.
 *
 * Returns 0 if the TTL is already within the limit, 1 if it was lowered, -EINVAL for OPT RRs and
 * -ENOMEM if a copy was needed but could not be made. If the RR has a single reference it is patched
 * in place; otherwise *rr is replaced by a modified copy via DNS_RR_REPLACE(). */
int dns_resource_record_clamp_ttl(DnsResourceRecord **rr, uint32_t max_ttl) {
        DnsResourceRecord *old_rr, *new_rr;

        assert(rr);
        assert(*rr); /* dereferenced unconditionally below */

        old_rr = *rr;

        if (old_rr->key->type == DNS_TYPE_OPT)
                return -EINVAL;

        if (old_rr->ttl <= max_ttl)
                return 0; /* Nothing to clamp */

        if (old_rr->n_ref == 1) {
                /* Patch in place */
                old_rr->ttl = max_ttl;
                return 1;
        }

        new_rr = dns_resource_record_copy(old_rr);
        if (!new_rr)
                return -ENOMEM;

        new_rr->ttl = max_ttl;

        DNS_RR_REPLACE(*rr, new_rr);
        return 1;
}

/* Returns true if 'rr' is a class IN A or AAAA record whose address is link-local. */
bool dns_resource_record_is_link_local_address(DnsResourceRecord *rr) {
        assert(rr);

        if (rr->key->class != DNS_CLASS_IN)
                return false;

        switch (rr->key->type) {

        case DNS_TYPE_A:
                return in4_addr_is_link_local(&rr->a.in_addr);

        case DNS_TYPE_AAAA:
                return in6_addr_is_link_local(&rr->aaaa.in6_addr);

        default:
                return false;
        }
}

/* Checks if the RR `cname` is a CNAME/DNAME RR that matches the specified `key`. If so, returns 0 and
 * stores the newly allocated target domain in *ret. If not, returns -EUNATCH. Other negative values
 * are -ENOMEM or errors passed on from the name helpers. */
int dns_resource_record_get_cname_target(DnsResourceKey *key, DnsResourceRecord *cname, char **ret) {
        _cleanup_free_ char *d = NULL;
        int r;

        assert(key);
        assert(cname);
        assert(ret); /* written unconditionally on success */

        if (key->class != cname->key->class && key->class != DNS_CLASS_ANY)
                return -EUNATCH;

        if (!dns_type_may_redirect(key->type)) /* This key type is not subject to CNAME/DNAME redirection?
                                                * Then let's refuse right-away */
                return -EUNATCH;

        switch (cname->key->type) {

        case DNS_TYPE_CNAME:
                r = dns_name_equal(dns_resource_key_name(key),
                                   dns_resource_key_name(cname->key));
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EUNATCH; /* CNAME RR key doesn't actually match the original key */

                d = strdup(cname->cname.name);
                if (!d)
                        return -ENOMEM;
                break;

        case DNS_TYPE_DNAME:
                r = dns_name_change_suffix(
                                dns_resource_key_name(key),
                                dns_resource_key_name(cname->key),
                                cname->dname.name,
                                &d);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EUNATCH; /* DNAME RR key doesn't actually match the original key */
                break;

        default:
                return -EUNATCH; /* Not a CNAME/DNAME RR, hence doesn't match the proposition either */
        }

        *ret = TAKE_PTR(d);
        return 0;
}

/* Frees every item of the list starting at 'first'. Always returns NULL.
 *
 * NOTE(review): each item is freed from inside the loop body, so this presumably relies on
 * LIST_FOREACH() reading the successor before the body runs -- confirm against list.h. */
DnsTxtItem *dns_txt_item_free_all(DnsTxtItem *first) {
        LIST_FOREACH(items, i, first)
                free(i);

        return NULL;
}

/* Returns true if both lists hold the same number of items with pairwise identical data. */
bool dns_txt_item_equal(DnsTxtItem *a, DnsTxtItem *b) {
        DnsTxtItem *other = b;

        if (a == b)
                return true;

        LIST_FOREACH(items, item, a) {
                /* 'b' ran out first, or the payloads differ */
                if (!other ||
                    memcmp_nn(item->data, item->length, other->data, other->length) != 0)
                        return false;

                other = other->items_next;
        }

        /* Equal only if 'b' has no items left over */
        return !other;
}

/* Duplicates the list starting at 'first', keeping the order of the items. Returns the head of the new
 * list, or NULL if 'first' is empty or an allocation fails (what was copied so far is then freed). */
DnsTxtItem *dns_txt_item_copy(DnsTxtItem *first) {
        DnsTxtItem *head = NULL, *tail = NULL;

        LIST_FOREACH(items, i, first) {
                DnsTxtItem *dup;

                /* Struct header, payload and one more byte following the payload */
                dup = memdup(i, offsetof(DnsTxtItem, data) + i->length + 1);
                if (!dup)
                        return dns_txt_item_free_all(head);

                LIST_INSERT_AFTER(items, head, tail, dup);
                tail = dup;
        }

        return head;
}

/* Allocates a single zero-initialized TXT item and stores it in *ret. Returns 0 or -ENOMEM. */
int dns_txt_item_new_empty(DnsTxtItem **ret) {
        DnsTxtItem *i;

        assert(ret);

        /* RFC 6763, section 6.1 suggests to treat
         * empty TXT RRs as equivalent to a TXT record
         * with a single empty string. */

        i = malloc0(offsetof(DnsTxtItem, data) + 1); /* for safety reasons we add an extra NUL byte */
        if (!i)
                return -ENOMEM;

        *ret = i;
        return 0;
}

/* Parses 'size' bytes at 'data' as one RR: the blob is appended to a fresh DnsPacket that refuses
 * name compression and is then read back with dns_packet_read_rr(). Errors of the packet helpers are
 * passed on. */
int dns_resource_record_new_from_raw(DnsResourceRecord **ret, const void *data, size_t size) {
        _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL;
        int r;

        r = dns_packet_new(&p, DNS_PROTOCOL_DNS, 0, DNS_PACKET_SIZE_MAX);
        if (r < 0)
                return r;

        p->refuse_compression = true;

        r = dns_packet_append_blob(p, data, size, NULL);
        if (r < 0)
                return r;

        return dns_packet_read_rr(p, ret, NULL, NULL);
}

/* Serializes 'key' as a JSON object with the fields "class", "type" and "name". */
int dns_resource_key_to_json(DnsResourceKey *key, JsonVariant **ret) {
        assert(key);
        assert(ret);

        return json_build(ret,
                          JSON_BUILD_OBJECT(
                                          JSON_BUILD_PAIR("class", JSON_BUILD_INTEGER(key->class)),
                                          JSON_BUILD_PAIR("type", JSON_BUILD_INTEGER(key->type)),
                                          JSON_BUILD_PAIR("name", JSON_BUILD_STRING(dns_resource_key_name(key)))));
}

/* Builds a key from a JSON object carrying the mandatory fields "class", "type" and "name". Returns 0
 * and stores the new key in *ret, a json_dispatch() error, or -ENOMEM. */
int dns_resource_key_from_json(JsonVariant *v, DnsResourceKey **ret) {
        _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL;
        uint16_t type = 0, class = 0;
        const char *name = NULL;
        int r;

        /* json_dispatch() is called without a userdata base below; the addresses of the locals are
         * passed in the offset column instead. */
        JsonDispatch dispatch_table[] = {
                { "class", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint16,       PTR_TO_SIZE(&class), JSON_MANDATORY },
                { "type",  _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint16,       PTR_TO_SIZE(&type),  JSON_MANDATORY },
                { "name",  JSON_VARIANT_STRING,        json_dispatch_const_string, PTR_TO_SIZE(&name),  JSON_MANDATORY },
                {}
        };

        assert(v);
        assert(ret);

        r = json_dispatch(v, dispatch_table, 0, NULL);
        if (r < 0)
                return r;

        key = dns_resource_key_new(class, type, name);
        if (!key)
                return -ENOMEM;

        *ret = TAKE_PTR(key);
        return 0;
}

/* Converts a bitmap of RR types into a JSON array of unsigned numbers. If the loop adds nothing, an
 * empty array is created explicitly. */
static int type_bitmap_to_json(Bitmap *b, JsonVariant **ret) {
        _cleanup_(json_variant_unrefp) JsonVariant *array = NULL;
        unsigned t;
        int r;

        assert(ret);

        BITMAP_FOREACH(t, b) {
                _cleanup_(json_variant_unrefp) JsonVariant *number = NULL;

                r = json_variant_new_unsigned(&number, t);
                if (r < 0)
                        return r;

                r = json_variant_append_array(&array, number);
                if (r < 0)
                        return r;
        }

        if (!array)
                return json_variant_new_array(ret, NULL, 0);

        *ret = TAKE_PTR(array);
        return 0;
}

/* Converts a list of TXT items into a JSON array with one octescaped string per item. The temporary
 * element array and the references it holds are dropped on success and on failure alike. */
static int txt_to_json(DnsTxtItem *items, JsonVariant **ret) {
        JsonVariant **elements = NULL;
        size_t n_elements = 0;
        int r;

        assert(ret);

        LIST_FOREACH(items, i, items) {
                if (!GREEDY_REALLOC(elements, n_elements + 1)) {
                        r = -ENOMEM;
                        goto finalize;
                }

                r = json_variant_new_octescape(elements + n_elements, i->data, i->length);
                if (r < 0)
                        goto finalize;

                n_elements++;
        }

        r = json_variant_new_array(ret, elements, n_elements);

finalize:
        for (size_t k = 0; k < n_elements; k++)
                json_variant_unref(elements[k]);

        free(elements);
        return r;
}

+int dns_resource_record_to_json(DnsResourceRecord *rr, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *k = NULL; + int r; + + assert(rr); + assert(ret); + + r = dns_resource_key_to_json(rr->key, &k); + if (r < 0) + return r; + + switch (rr->unparsable ? 
_DNS_TYPE_INVALID : rr->key->type) { + + case DNS_TYPE_SRV: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("priority", JSON_BUILD_UNSIGNED(rr->srv.priority)), + JSON_BUILD_PAIR("weight", JSON_BUILD_UNSIGNED(rr->srv.weight)), + JSON_BUILD_PAIR("port", JSON_BUILD_UNSIGNED(rr->srv.port)), + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(rr->srv.name)))); + + case DNS_TYPE_PTR: + case DNS_TYPE_NS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(rr->ptr.name)))); + + case DNS_TYPE_HINFO: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("cpu", JSON_BUILD_STRING(rr->hinfo.cpu)), + JSON_BUILD_PAIR("os", JSON_BUILD_STRING(rr->hinfo.os)))); + + case DNS_TYPE_SPF: + case DNS_TYPE_TXT: { + _cleanup_(json_variant_unrefp) JsonVariant *l = NULL; + + r = txt_to_json(rr->txt.items, &l); + if (r < 0) + return r; + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("items", JSON_BUILD_VARIANT(l)))); + } + + case DNS_TYPE_A: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("address", JSON_BUILD_IN4_ADDR(&rr->a.in_addr)))); + + case DNS_TYPE_AAAA: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("address", JSON_BUILD_IN6_ADDR(&rr->aaaa.in6_addr)))); + + case DNS_TYPE_SOA: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("mname", JSON_BUILD_STRING(rr->soa.mname)), + JSON_BUILD_PAIR("rname", JSON_BUILD_STRING(rr->soa.rname)), + JSON_BUILD_PAIR("serial", JSON_BUILD_UNSIGNED(rr->soa.serial)), + JSON_BUILD_PAIR("refresh", JSON_BUILD_UNSIGNED(rr->soa.refresh)), + 
/* The SOA record keeps 'retry' and 'expire' as two distinct members; report each one under its own key instead of publishing the retry interval as "expire". */ JSON_BUILD_PAIR("retry", JSON_BUILD_UNSIGNED(rr->soa.retry)), JSON_BUILD_PAIR("expire", JSON_BUILD_UNSIGNED(rr->soa.expire)), + JSON_BUILD_PAIR("minimum", JSON_BUILD_UNSIGNED(rr->soa.minimum)))); + + case DNS_TYPE_MX: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("priority", JSON_BUILD_UNSIGNED(rr->mx.priority)), + JSON_BUILD_PAIR("exchange", JSON_BUILD_STRING(rr->mx.exchange)))); + case DNS_TYPE_LOC: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("version", JSON_BUILD_UNSIGNED(rr->loc.version)), + JSON_BUILD_PAIR("size", JSON_BUILD_UNSIGNED(rr->loc.size)), + JSON_BUILD_PAIR("horiz_pre", JSON_BUILD_UNSIGNED(rr->loc.horiz_pre)), + JSON_BUILD_PAIR("vert_pre", JSON_BUILD_UNSIGNED(rr->loc.vert_pre)), + JSON_BUILD_PAIR("latitude", JSON_BUILD_UNSIGNED(rr->loc.latitude)), + JSON_BUILD_PAIR("longitude", JSON_BUILD_UNSIGNED(rr->loc.longitude)), + JSON_BUILD_PAIR("altitude", JSON_BUILD_UNSIGNED(rr->loc.altitude)))); + + case DNS_TYPE_DS: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("keyTag", JSON_BUILD_UNSIGNED(rr->ds.key_tag)), + JSON_BUILD_PAIR("algorithm", JSON_BUILD_UNSIGNED(rr->ds.algorithm)), + JSON_BUILD_PAIR("digestType", JSON_BUILD_UNSIGNED(rr->ds.digest_type)), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(rr->ds.digest, rr->ds.digest_size)))); + + case DNS_TYPE_SSHFP: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("algorithm", JSON_BUILD_UNSIGNED(rr->sshfp.algorithm)), + JSON_BUILD_PAIR("fptype", JSON_BUILD_UNSIGNED(rr->sshfp.fptype)), + JSON_BUILD_PAIR("fingerprint", JSON_BUILD_HEX(rr->sshfp.fingerprint, rr->sshfp.fingerprint_size)))); + + case DNS_TYPE_DNSKEY: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("flags", JSON_BUILD_UNSIGNED(rr->dnskey.flags)), + JSON_BUILD_PAIR("protocol", 
JSON_BUILD_UNSIGNED(rr->dnskey.protocol)), + JSON_BUILD_PAIR("algorithm", JSON_BUILD_UNSIGNED(rr->dnskey.algorithm)), + JSON_BUILD_PAIR("dnskey", JSON_BUILD_BASE64(rr->dnskey.key, rr->dnskey.key_size)))); + + + case DNS_TYPE_RRSIG: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("signer", JSON_BUILD_STRING(rr->rrsig.signer)), + JSON_BUILD_PAIR("typeCovered", JSON_BUILD_UNSIGNED(rr->rrsig.type_covered)), + JSON_BUILD_PAIR("algorithm", JSON_BUILD_UNSIGNED(rr->rrsig.algorithm)), + JSON_BUILD_PAIR("labels", JSON_BUILD_UNSIGNED(rr->rrsig.labels)), + JSON_BUILD_PAIR("originalTtl", JSON_BUILD_UNSIGNED(rr->rrsig.original_ttl)), + JSON_BUILD_PAIR("expiration", JSON_BUILD_UNSIGNED(rr->rrsig.expiration)), + JSON_BUILD_PAIR("inception", JSON_BUILD_UNSIGNED(rr->rrsig.inception)), + JSON_BUILD_PAIR("keyTag", JSON_BUILD_UNSIGNED(rr->rrsig.key_tag)), + JSON_BUILD_PAIR("signature", JSON_BUILD_BASE64(rr->rrsig.signature, rr->rrsig.signature_size)))); + + case DNS_TYPE_NSEC: { + _cleanup_(json_variant_unrefp) JsonVariant *bm = NULL; + + r = type_bitmap_to_json(rr->nsec.types, &bm); + if (r < 0) + return r; + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("nextDomain", JSON_BUILD_STRING(rr->nsec.next_domain_name)), + JSON_BUILD_PAIR("types", JSON_BUILD_VARIANT(bm)))); + } + + case DNS_TYPE_NSEC3: { + _cleanup_(json_variant_unrefp) JsonVariant *bm = NULL; + + r = type_bitmap_to_json(rr->nsec3.types, &bm); + if (r < 0) + return r; + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("algorithm", JSON_BUILD_UNSIGNED(rr->nsec3.algorithm)), + JSON_BUILD_PAIR("flags", JSON_BUILD_UNSIGNED(rr->nsec3.flags)), + JSON_BUILD_PAIR("iterations", JSON_BUILD_UNSIGNED(rr->nsec3.iterations)), + JSON_BUILD_PAIR("salt", JSON_BUILD_HEX(rr->nsec3.salt, rr->nsec3.salt_size)), + JSON_BUILD_PAIR("hash", 
JSON_BUILD_BASE32HEX(rr->nsec3.next_hashed_name, rr->nsec3.next_hashed_name_size)), + JSON_BUILD_PAIR("types", JSON_BUILD_VARIANT(bm)))); + } + + case DNS_TYPE_TLSA: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("certUsage", JSON_BUILD_UNSIGNED(rr->tlsa.cert_usage)), + JSON_BUILD_PAIR("selector", JSON_BUILD_UNSIGNED(rr->tlsa.selector)), + JSON_BUILD_PAIR("matchingType", JSON_BUILD_UNSIGNED(rr->tlsa.matching_type)), + JSON_BUILD_PAIR("data", JSON_BUILD_HEX(rr->tlsa.data, rr->tlsa.data_size)))); + + case DNS_TYPE_CAA: + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("key", JSON_BUILD_VARIANT(k)), + JSON_BUILD_PAIR("flags", JSON_BUILD_UNSIGNED(rr->caa.flags)), + JSON_BUILD_PAIR("tag", JSON_BUILD_STRING(rr->caa.tag)), + JSON_BUILD_PAIR("value", JSON_BUILD_OCTESCAPE(rr->caa.value, rr->caa.value_size)))); + + default: + /* Can't provide broken-down format */ + *ret = NULL; + return 0; + } +} + +static const char* const dnssec_algorithm_table[_DNSSEC_ALGORITHM_MAX_DEFINED] = { + /* Mnemonics as listed on https://www.iana.org/assignments/dns-sec-alg-numbers/dns-sec-alg-numbers.xhtml */ + [DNSSEC_ALGORITHM_RSAMD5] = "RSAMD5", + [DNSSEC_ALGORITHM_DH] = "DH", + [DNSSEC_ALGORITHM_DSA] = "DSA", + [DNSSEC_ALGORITHM_ECC] = "ECC", + [DNSSEC_ALGORITHM_RSASHA1] = "RSASHA1", + [DNSSEC_ALGORITHM_DSA_NSEC3_SHA1] = "DSA-NSEC3-SHA1", + [DNSSEC_ALGORITHM_RSASHA1_NSEC3_SHA1] = "RSASHA1-NSEC3-SHA1", + [DNSSEC_ALGORITHM_RSASHA256] = "RSASHA256", + [DNSSEC_ALGORITHM_RSASHA512] = "RSASHA512", + [DNSSEC_ALGORITHM_ECC_GOST] = "ECC-GOST", + [DNSSEC_ALGORITHM_ECDSAP256SHA256] = "ECDSAP256SHA256", + [DNSSEC_ALGORITHM_ECDSAP384SHA384] = "ECDSAP384SHA384", + [DNSSEC_ALGORITHM_ED25519] = "ED25519", + [DNSSEC_ALGORITHM_ED448] = "ED448", + [DNSSEC_ALGORITHM_INDIRECT] = "INDIRECT", + [DNSSEC_ALGORITHM_PRIVATEDNS] = "PRIVATEDNS", + [DNSSEC_ALGORITHM_PRIVATEOID] = "PRIVATEOID", +}; 
+DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(dnssec_algorithm, int, 255); + +static const char* const dnssec_digest_table[_DNSSEC_DIGEST_MAX_DEFINED] = { + /* Names as listed on https://www.iana.org/assignments/ds-rr-types/ds-rr-types.xhtml */ + [DNSSEC_DIGEST_SHA1] = "SHA-1", + [DNSSEC_DIGEST_SHA256] = "SHA-256", + [DNSSEC_DIGEST_GOST_R_34_11_94] = "GOST_R_34.11-94", + [DNSSEC_DIGEST_SHA384] = "SHA-384", +}; +DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(dnssec_digest, int, 255); diff --git a/src/resolve/resolved-dns-rr.h b/src/resolve/resolved-dns-rr.h new file mode 100644 index 0000000..fd15cc3 --- /dev/null +++ b/src/resolve/resolved-dns-rr.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bitmap.h" +#include "dns-def.h" +#include "dns-type.h" +#include "hashmap.h" +#include "in-addr-util.h" +#include "json.h" +#include "list.h" +#include "string-util.h" +#include "time-util.h" + +typedef struct DnsResourceKey DnsResourceKey; +typedef struct DnsResourceRecord DnsResourceRecord; +typedef struct DnsTxtItem DnsTxtItem; + +/* DNSKEY RR flags */ +#define DNSKEY_FLAG_SEP (UINT16_C(1) << 0) +#define DNSKEY_FLAG_REVOKE (UINT16_C(1) << 7) +#define DNSKEY_FLAG_ZONE_KEY (UINT16_C(1) << 8) + +/* mDNS RR flags */ +#define MDNS_RR_CACHE_FLUSH_OR_QU (UINT16_C(1) << 15) + +/* DNSSEC algorithm identifiers, see + * http://tools.ietf.org/html/rfc4034#appendix-A.1 and + * https://www.iana.org/assignments/dns-sec-alg-numbers/dns-sec-alg-numbers.xhtml */ +enum { + DNSSEC_ALGORITHM_RSAMD5 = 1, + DNSSEC_ALGORITHM_DH, + DNSSEC_ALGORITHM_DSA, + DNSSEC_ALGORITHM_ECC, + DNSSEC_ALGORITHM_RSASHA1, + DNSSEC_ALGORITHM_DSA_NSEC3_SHA1, + DNSSEC_ALGORITHM_RSASHA1_NSEC3_SHA1, + DNSSEC_ALGORITHM_RSASHA256 = 8, /* RFC 5702 */ + DNSSEC_ALGORITHM_RSASHA512 = 10, /* RFC 5702 */ + DNSSEC_ALGORITHM_ECC_GOST = 12, /* RFC 5933 */ + DNSSEC_ALGORITHM_ECDSAP256SHA256 = 13, /* RFC 6605 */ + DNSSEC_ALGORITHM_ECDSAP384SHA384 = 14, /* RFC 6605 */ + 
DNSSEC_ALGORITHM_ED25519 = 15, /* RFC 8080 */ + DNSSEC_ALGORITHM_ED448 = 16, /* RFC 8080 */ + DNSSEC_ALGORITHM_INDIRECT = 252, + DNSSEC_ALGORITHM_PRIVATEDNS, + DNSSEC_ALGORITHM_PRIVATEOID, + _DNSSEC_ALGORITHM_MAX_DEFINED +}; + +/* DNSSEC digest identifiers, see + * https://www.iana.org/assignments/ds-rr-types/ds-rr-types.xhtml */ +enum { + DNSSEC_DIGEST_SHA1 = 1, + DNSSEC_DIGEST_SHA256 = 2, /* RFC 4509 */ + DNSSEC_DIGEST_GOST_R_34_11_94 = 3, /* RFC 5933 */ + DNSSEC_DIGEST_SHA384 = 4, /* RFC 6605 */ + _DNSSEC_DIGEST_MAX_DEFINED +}; + +/* DNSSEC NSEC3 hash algorithms, see + * https://www.iana.org/assignments/dnssec-nsec3-parameters/dnssec-nsec3-parameters.xhtml */ +enum { + NSEC3_ALGORITHM_SHA1 = 1, + _NSEC3_ALGORITHM_MAX_DEFINED +}; + +struct DnsResourceKey { + unsigned n_ref; /* (unsigned -1) for const keys, see below */ + uint16_t class, type; + char *_name; /* don't access directly, use dns_resource_key_name()! */ +}; + +/* Creates a temporary resource key. This is only useful to quickly + * look up something, without allocating a full DnsResourceKey object + * for it. Note that it is not OK to take references to this kind of + * resource key object. */ +#define DNS_RESOURCE_KEY_CONST(c, t, n) \ + ((DnsResourceKey) { \ + .n_ref = UINT_MAX, \ + .class = c, \ + .type = t, \ + ._name = (char*) n, \ + }) + +struct DnsTxtItem { + size_t length; + LIST_FIELDS(DnsTxtItem, items); + uint8_t data[]; +}; + +struct DnsResourceRecord { + unsigned n_ref; + uint32_t ttl; + usec_t expiry; /* RRSIG signature expiry */ + + DnsResourceKey *key; + + char *to_string; + + /* How many labels to strip to determine "signer" of the RRSIG (aka, the zone). -1 if not signed. */ + uint8_t n_skip_labels_signer; + /* How many labels to strip to determine "synthesizing source" of this RR, i.e. the wildcard's immediate parent. -1 if not signed. 
*/ + uint8_t n_skip_labels_source; + + bool unparsable; + bool wire_format_canonical; + + void *wire_format; + size_t wire_format_size; + size_t wire_format_rdata_offset; + + union { + struct { + void *data; + size_t data_size; + } generic, opt; + + struct { + char *name; + uint16_t priority; + uint16_t weight; + uint16_t port; + } srv; + + struct { + char *name; + } ptr, ns, cname, dname; + + struct { + char *cpu; + char *os; + } hinfo; + + struct { + DnsTxtItem *items; + } txt, spf; + + struct { + struct in_addr in_addr; + } a; + + struct { + struct in6_addr in6_addr; + } aaaa; + + struct { + char *mname; + char *rname; + uint32_t serial; + uint32_t refresh; + uint32_t retry; + uint32_t expire; + uint32_t minimum; + } soa; + + struct { + char *exchange; + uint16_t priority; + } mx; + + /* https://tools.ietf.org/html/rfc1876 */ + struct { + uint8_t version; + uint8_t size; + uint8_t horiz_pre; + uint8_t vert_pre; + uint32_t latitude; + uint32_t longitude; + uint32_t altitude; + } loc; + + /* https://tools.ietf.org/html/rfc4255#section-3.1 */ + struct { + void *fingerprint; + size_t fingerprint_size; + + uint8_t algorithm; + uint8_t fptype; + } sshfp; + + /* http://tools.ietf.org/html/rfc4034#section-2.1 */ + struct { + void* key; + size_t key_size; + + uint16_t flags; + uint8_t protocol; + uint8_t algorithm; + } dnskey; + + /* http://tools.ietf.org/html/rfc4034#section-3.1 */ + struct { + char *signer; + void *signature; + size_t signature_size; + + uint16_t type_covered; + uint8_t algorithm; + uint8_t labels; + uint32_t original_ttl; + uint32_t expiration; + uint32_t inception; + uint16_t key_tag; + } rrsig; + + /* https://tools.ietf.org/html/rfc4034#section-4.1 */ + struct { + char *next_domain_name; + Bitmap *types; + } nsec; + + /* https://tools.ietf.org/html/rfc4034#section-5.1 */ + struct { + void *digest; + size_t digest_size; + + uint16_t key_tag; + uint8_t algorithm; + uint8_t digest_type; + } ds; + + struct { + Bitmap *types; + void *salt; + size_t 
salt_size; + void *next_hashed_name; + size_t next_hashed_name_size; + + uint8_t algorithm; + uint8_t flags; + uint16_t iterations; + } nsec3; + + /* https://tools.ietf.org/html/draft-ietf-dane-protocol-23 */ + struct { + void *data; + size_t data_size; + + uint8_t cert_usage; + uint8_t selector; + uint8_t matching_type; + } tlsa; + + /* https://tools.ietf.org/html/rfc6844 */ + struct { + char *tag; + void *value; + size_t value_size; + + uint8_t flags; + } caa; + }; + + /* Note: fields should be ordered to minimize alignment gaps. Use pahole! */ +}; + +/* We use uint8_t for label counts above, and UINT8_MAX/-1 has special meaning. */ +assert_cc(DNS_N_LABELS_MAX < UINT8_MAX); + +static inline const void* DNS_RESOURCE_RECORD_RDATA(const DnsResourceRecord *rr) { + if (!rr) + return NULL; + + if (!rr->wire_format) + return NULL; + + assert(rr->wire_format_rdata_offset <= rr->wire_format_size); + return (uint8_t*) rr->wire_format + rr->wire_format_rdata_offset; +} + +static inline size_t DNS_RESOURCE_RECORD_RDATA_SIZE(const DnsResourceRecord *rr) { + if (!rr) + return 0; + if (!rr->wire_format) + return 0; + + assert(rr->wire_format_rdata_offset <= rr->wire_format_size); + return rr->wire_format_size - rr->wire_format_rdata_offset; +} + +static inline uint8_t DNS_RESOURCE_RECORD_OPT_VERSION_SUPPORTED(const DnsResourceRecord *rr) { + assert(rr); + assert(rr->key->type == DNS_TYPE_OPT); + + return ((rr->ttl >> 16) & 0xFF) == 0; +} + +DnsResourceKey* dns_resource_key_new(uint16_t class, uint16_t type, const char *name); +DnsResourceKey* dns_resource_key_new_redirect(const DnsResourceKey *key, const DnsResourceRecord *cname); +int dns_resource_key_new_append_suffix(DnsResourceKey **ret, DnsResourceKey *key, char *name); +DnsResourceKey* dns_resource_key_new_consume(uint16_t class, uint16_t type, char *name); +DnsResourceKey* dns_resource_key_ref(DnsResourceKey *key); +DnsResourceKey* dns_resource_key_unref(DnsResourceKey *key); + +#define DNS_RESOURCE_KEY_REPLACE(a, b) \ + 
do { \ + typeof(a)* _a = &(a); \ + typeof(b) _b = (b); \ + dns_resource_key_unref(*_a); \ + *_a = _b; \ + } while(0) + +const char* dns_resource_key_name(const DnsResourceKey *key); +bool dns_resource_key_is_address(const DnsResourceKey *key); +bool dns_resource_key_is_dnssd_ptr(const DnsResourceKey *key); +int dns_resource_key_equal(const DnsResourceKey *a, const DnsResourceKey *b); +int dns_resource_key_match_rr(const DnsResourceKey *key, DnsResourceRecord *rr, const char *search_domain); +int dns_resource_key_match_cname_or_dname(const DnsResourceKey *key, const DnsResourceKey *cname, const char *search_domain); +int dns_resource_key_match_soa(const DnsResourceKey *key, const DnsResourceKey *soa); + +/* _DNS_{CLASS,TYPE}_STRING_MAX include one byte for NUL, which we use for space instead below. + * DNS_HOSTNAME_MAX does not include the NUL byte, so we need to add 1. */ +#define DNS_RESOURCE_KEY_STRING_MAX (_DNS_CLASS_STRING_MAX + _DNS_TYPE_STRING_MAX + DNS_HOSTNAME_MAX + 1) + +char* dns_resource_key_to_string(const DnsResourceKey *key, char *buf, size_t buf_size); +ssize_t dns_resource_record_payload(DnsResourceRecord *rr, void **out); + +#define DNS_RESOURCE_KEY_TO_STRING(key) \ + dns_resource_key_to_string(key, (char[DNS_RESOURCE_KEY_STRING_MAX]) {}, DNS_RESOURCE_KEY_STRING_MAX) + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsResourceKey*, dns_resource_key_unref); + +static inline bool dns_key_is_shared(const DnsResourceKey *key) { + return key->type == DNS_TYPE_PTR; +} + +bool dns_resource_key_reduce(DnsResourceKey **a, DnsResourceKey **b); + +DnsResourceRecord* dns_resource_record_new(DnsResourceKey *key); +DnsResourceRecord* dns_resource_record_new_full(uint16_t class, uint16_t type, const char *name); +DnsResourceRecord* dns_resource_record_ref(DnsResourceRecord *rr); +DnsResourceRecord* dns_resource_record_unref(DnsResourceRecord *rr); + +#define DNS_RR_REPLACE(a, b) \ + do { \ + typeof(a)* _a = &(a); \ + typeof(b) _b = (b); \ + dns_resource_record_unref(*_a); \ + *_a 
= _b; \ + } while(0) + +int dns_resource_record_new_reverse(DnsResourceRecord **ret, int family, const union in_addr_union *address, const char *name); +int dns_resource_record_new_address(DnsResourceRecord **ret, int family, const union in_addr_union *address, const char *name); +int dns_resource_record_equal(const DnsResourceRecord *a, const DnsResourceRecord *b); +int dns_resource_record_payload_equal(const DnsResourceRecord *a, const DnsResourceRecord *b); + +const char* dns_resource_record_to_string(DnsResourceRecord *rr); +DnsResourceRecord *dns_resource_record_copy(DnsResourceRecord *rr); +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsResourceRecord*, dns_resource_record_unref); + +int dns_resource_record_to_wire_format(DnsResourceRecord *rr, bool canonical); + +int dns_resource_record_signer(DnsResourceRecord *rr, const char **ret); +int dns_resource_record_source(DnsResourceRecord *rr, const char **ret); +int dns_resource_record_is_signer(DnsResourceRecord *rr, const char *zone); +int dns_resource_record_is_synthetic(DnsResourceRecord *rr); + +int dns_resource_record_clamp_ttl(DnsResourceRecord **rr, uint32_t max_ttl); + +bool dns_resource_record_is_link_local_address(DnsResourceRecord *rr); + +int dns_resource_record_get_cname_target(DnsResourceKey *key, DnsResourceRecord *cname, char **ret); + +DnsTxtItem *dns_txt_item_free_all(DnsTxtItem *i); +bool dns_txt_item_equal(DnsTxtItem *a, DnsTxtItem *b); +DnsTxtItem *dns_txt_item_copy(DnsTxtItem *i); +int dns_txt_item_new_empty(DnsTxtItem **ret); + +int dns_resource_record_new_from_raw(DnsResourceRecord **ret, const void *data, size_t size); + +int dns_resource_key_to_json(DnsResourceKey *key, JsonVariant **ret); +int dns_resource_key_from_json(JsonVariant *v, DnsResourceKey **ret); +int dns_resource_record_to_json(DnsResourceRecord *rr, JsonVariant **ret); + +void dns_resource_record_hash_func(const DnsResourceRecord *i, struct siphash *state); +int dns_resource_record_compare_func(const DnsResourceRecord *x, const 
DnsResourceRecord *y); + +extern const struct hash_ops dns_resource_key_hash_ops; +extern const struct hash_ops dns_resource_record_hash_ops; + +int dnssec_algorithm_to_string_alloc(int i, char **ret); +int dnssec_algorithm_from_string(const char *s) _pure_; + +int dnssec_digest_to_string_alloc(int i, char **ret); +int dnssec_digest_from_string(const char *s) _pure_; diff --git a/src/resolve/resolved-dns-scope.c b/src/resolve/resolved-dns-scope.c new file mode 100644 index 0000000..2e8b3e5 --- /dev/null +++ b/src/resolve/resolved-dns-scope.c @@ -0,0 +1,1683 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "dns-domain.h" +#include "errno-util.h" +#include "fd-util.h" +#include "hostname-util.h" +#include "missing_network.h" +#include "random-util.h" +#include "resolved-dnssd.h" +#include "resolved-dns-scope.h" +#include "resolved-dns-zone.h" +#include "resolved-llmnr.h" +#include "resolved-mdns.h" +#include "socket-util.h" +#include "strv.h" + +#define MULTICAST_RATELIMIT_INTERVAL_USEC (1*USEC_PER_SEC) +#define MULTICAST_RATELIMIT_BURST 1000 + +/* After how much time to repeat LLMNR requests, see RFC 4795 Section 7 */ +#define MULTICAST_RESEND_TIMEOUT_MIN_USEC (100 * USEC_PER_MSEC) +#define MULTICAST_RESEND_TIMEOUT_MAX_USEC (1 * USEC_PER_SEC) + +int dns_scope_new(Manager *m, DnsScope **ret, Link *l, DnsProtocol protocol, int family) { + DnsScope *s; + + assert(m); + assert(ret); + + s = new(DnsScope, 1); + if (!s) + return -ENOMEM; + + *s = (DnsScope) { + .manager = m, + .link = l, + .protocol = protocol, + .family = family, + .resend_timeout = MULTICAST_RESEND_TIMEOUT_MIN_USEC, + }; + + if (protocol == DNS_PROTOCOL_DNS) { + /* Copy DNSSEC mode from the link if it is set there, + * otherwise take the manager's DNSSEC mode. Note that + * we copy this only at scope creation time, and do + * not update it from then on, even if the setting + * changes. 
*/ + + if (l) { + s->dnssec_mode = link_get_dnssec_mode(l); + s->dns_over_tls_mode = link_get_dns_over_tls_mode(l); + } else { + s->dnssec_mode = manager_get_dnssec_mode(m); + s->dns_over_tls_mode = manager_get_dns_over_tls_mode(m); + } + + } else { + s->dnssec_mode = DNSSEC_NO; + s->dns_over_tls_mode = DNS_OVER_TLS_NO; + } + + LIST_PREPEND(scopes, m->dns_scopes, s); + + dns_scope_llmnr_membership(s, true); + dns_scope_mdns_membership(s, true); + + log_debug("New scope on link %s, protocol %s, family %s", l ? l->ifname : "*", dns_protocol_to_string(protocol), family == AF_UNSPEC ? "*" : af_to_name(family)); + + /* Enforce ratelimiting for the multicast protocols */ + s->ratelimit = (const RateLimit) { MULTICAST_RATELIMIT_INTERVAL_USEC, MULTICAST_RATELIMIT_BURST }; + + *ret = s; + return 0; +} + +static void dns_scope_abort_transactions(DnsScope *s) { + assert(s); + + while (s->transactions) { + DnsTransaction *t = s->transactions; + + /* Abort the transaction, but make sure it is not + * freed while we still look at it */ + + t->block_gc++; + if (DNS_TRANSACTION_IS_LIVE(t->state)) + dns_transaction_complete(t, DNS_TRANSACTION_ABORTED); + t->block_gc--; + + dns_transaction_free(t); + } +} + +DnsScope* dns_scope_free(DnsScope *s) { + if (!s) + return NULL; + + log_debug("Removing scope on link %s, protocol %s, family %s", s->link ? s->link->ifname : "*", dns_protocol_to_string(s->protocol), s->family == AF_UNSPEC ? 
"*" : af_to_name(s->family)); + + dns_scope_llmnr_membership(s, false); + dns_scope_mdns_membership(s, false); + dns_scope_abort_transactions(s); + + while (s->query_candidates) + dns_query_candidate_unref(s->query_candidates); + + hashmap_free(s->transactions_by_key); + + ordered_hashmap_free_with_destructor(s->conflict_queue, dns_resource_record_unref); + sd_event_source_disable_unref(s->conflict_event_source); + + sd_event_source_disable_unref(s->announce_event_source); + + dns_cache_flush(&s->cache); + dns_zone_flush(&s->zone); + + LIST_REMOVE(scopes, s->manager->dns_scopes, s); + return mfree(s); +} + +DnsServer *dns_scope_get_dns_server(DnsScope *s) { + assert(s); + + if (s->protocol != DNS_PROTOCOL_DNS) + return NULL; + + if (s->link) + return link_get_dns_server(s->link); + else + return manager_get_dns_server(s->manager); +} + +unsigned dns_scope_get_n_dns_servers(DnsScope *s) { + unsigned n = 0; + DnsServer *i; + + assert(s); + + if (s->protocol != DNS_PROTOCOL_DNS) + return 0; + + if (s->link) + i = s->link->dns_servers; + else + i = s->manager->dns_servers; + + for (; i; i = i->servers_next) + n++; + + return n; +} + +void dns_scope_next_dns_server(DnsScope *s, DnsServer *if_current) { + assert(s); + + if (s->protocol != DNS_PROTOCOL_DNS) + return; + + /* Changes to the next DNS server in the list. If 'if_current' is passed will do so only if the + * current DNS server still matches it. 
*/ + + if (s->link) + link_next_dns_server(s->link, if_current); + else + manager_next_dns_server(s->manager, if_current); +} + +void dns_scope_packet_received(DnsScope *s, usec_t rtt) { + assert(s); + + if (rtt <= s->max_rtt) + return; + + s->max_rtt = rtt; + s->resend_timeout = MIN(MAX(MULTICAST_RESEND_TIMEOUT_MIN_USEC, s->max_rtt * 2), MULTICAST_RESEND_TIMEOUT_MAX_USEC); +} + +void dns_scope_packet_lost(DnsScope *s, usec_t usec) { + assert(s); + + if (s->resend_timeout <= usec) + s->resend_timeout = MIN(s->resend_timeout * 2, MULTICAST_RESEND_TIMEOUT_MAX_USEC); +} + +static int dns_scope_emit_one(DnsScope *s, int fd, int family, DnsPacket *p) { + int r; + + assert(s); + assert(p); + assert(p->protocol == s->protocol); + + if (family == AF_UNSPEC) { + if (s->family == AF_UNSPEC) + return -EAFNOSUPPORT; + + family = s->family; + } + + switch (s->protocol) { + + case DNS_PROTOCOL_DNS: { + size_t mtu, udp_size, min_mtu, socket_mtu = 0; + + assert(fd >= 0); + + if (DNS_PACKET_QDCOUNT(p) > 1) /* Classic DNS only allows one question per packet */ + return -EOPNOTSUPP; + + if (p->size > DNS_PACKET_UNICAST_SIZE_MAX) + return -EMSGSIZE; + + /* Determine the most accurate local MTU */ + if (s->link) + mtu = s->link->mtu; + else + mtu = manager_find_mtu(s->manager); + + /* Acquire the socket's path MTU (PMTU) */ + r = socket_get_mtu(fd, family, &socket_mtu); + if (r < 0 && !ERRNO_IS_DISCONNECT(r)) /* Will return ENOTCONN if no information is available yet */ + return log_debug_errno(r, "Failed to read socket MTU: %m"); + + /* Determine the appropriate UDP header size */ + udp_size = udp_header_size(family); + min_mtu = udp_size + DNS_PACKET_HEADER_SIZE; + + log_debug("Emitting UDP, link MTU is %zu, socket MTU is %zu, minimal MTU is %zu", + mtu, socket_mtu, min_mtu); + + /* Clamp by the kernel's idea of the (path) MTU */ + if (socket_mtu != 0 && socket_mtu < mtu) + mtu = socket_mtu; + + /* Put a lower limit, in case all MTU data we acquired was rubbish */ + if (mtu < min_mtu) + mtu 
= min_mtu; + + /* Now check our packet size against the MTU we determined */ + if (udp_size + p->size > mtu) + return -EMSGSIZE; /* This means: try TCP instead */ + + r = manager_write(s->manager, fd, p); + if (r < 0) + return r; + + break; + } + + case DNS_PROTOCOL_LLMNR: { + union in_addr_union addr; + + assert(fd < 0); + + if (DNS_PACKET_QDCOUNT(p) > 1) + return -EOPNOTSUPP; + + if (!ratelimit_below(&s->ratelimit)) + return -EBUSY; + + if (family == AF_INET) { + addr.in = LLMNR_MULTICAST_IPV4_ADDRESS; + fd = manager_llmnr_ipv4_udp_fd(s->manager); + } else if (family == AF_INET6) { + addr.in6 = LLMNR_MULTICAST_IPV6_ADDRESS; + fd = manager_llmnr_ipv6_udp_fd(s->manager); + } else + return -EAFNOSUPPORT; + if (fd < 0) + return fd; + + r = manager_send(s->manager, fd, s->link->ifindex, family, &addr, LLMNR_PORT, NULL, p); + if (r < 0) + return r; + + break; + } + + case DNS_PROTOCOL_MDNS: { + union in_addr_union addr; + assert(fd < 0); + + if (!ratelimit_below(&s->ratelimit)) + return -EBUSY; + + if (family == AF_INET) { + if (in4_addr_is_null(&p->destination.in)) + addr.in = MDNS_MULTICAST_IPV4_ADDRESS; + else + addr = p->destination; + fd = manager_mdns_ipv4_fd(s->manager); + } else if (family == AF_INET6) { + if (in6_addr_is_null(&p->destination.in6)) + addr.in6 = MDNS_MULTICAST_IPV6_ADDRESS; + else + addr = p->destination; + fd = manager_mdns_ipv6_fd(s->manager); + } else + return -EAFNOSUPPORT; + if (fd < 0) + return fd; + + r = manager_send(s->manager, fd, s->link->ifindex, family, &addr, p->destination_port ?: MDNS_PORT, NULL, p); + if (r < 0) + return r; + + break; + } + + default: + return -EAFNOSUPPORT; + } + + return 1; +} + +int dns_scope_emit_udp(DnsScope *s, int fd, int af, DnsPacket *p) { + int r; + + assert(s); + assert(p); + assert(p->protocol == s->protocol); + assert((s->protocol == DNS_PROTOCOL_DNS) == (fd >= 0)); + + do { + /* If there are multiple linked packets, set the TC bit in all but the last of them */ + if (p->more) { + assert(p->protocol 
== DNS_PROTOCOL_MDNS); + dns_packet_set_flags(p, true, true); + } + + r = dns_scope_emit_one(s, fd, af, p); + if (r < 0) + return r; + + p = p->more; + } while (p); + + return 0; +} + +static int dns_scope_socket( + DnsScope *s, + int type, + int family, + const union in_addr_union *address, + DnsServer *server, + uint16_t port, + union sockaddr_union *ret_socket_address) { + + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + socklen_t salen; + int r, ifindex; + + assert(s); + + if (server) { + assert(family == AF_UNSPEC); + assert(!address); + + ifindex = dns_server_ifindex(server); + + switch (server->family) { + case AF_INET: + sa = (union sockaddr_union) { + .in.sin_family = server->family, + .in.sin_port = htobe16(port), + .in.sin_addr = server->address.in, + }; + salen = sizeof(sa.in); + break; + case AF_INET6: + sa = (union sockaddr_union) { + .in6.sin6_family = server->family, + .in6.sin6_port = htobe16(port), + .in6.sin6_addr = server->address.in6, + .in6.sin6_scope_id = ifindex, + }; + salen = sizeof(sa.in6); + break; + default: + return -EAFNOSUPPORT; + } + } else { + assert(family != AF_UNSPEC); + assert(address); + + ifindex = s->link ? 
s->link->ifindex : 0; + + switch (family) { + case AF_INET: + sa = (union sockaddr_union) { + .in.sin_family = family, + .in.sin_port = htobe16(port), + .in.sin_addr = address->in, + }; + salen = sizeof(sa.in); + break; + case AF_INET6: + sa = (union sockaddr_union) { + .in6.sin6_family = family, + .in6.sin6_port = htobe16(port), + .in6.sin6_addr = address->in6, + .in6.sin6_scope_id = ifindex, + }; + salen = sizeof(sa.in6); + break; + default: + return -EAFNOSUPPORT; + } + } + + fd = socket(sa.sa.sa_family, type|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + if (type == SOCK_STREAM) { + r = setsockopt_int(fd, IPPROTO_TCP, TCP_NODELAY, true); + if (r < 0) + return r; + } + + if (ifindex != 0) { + r = socket_set_unicast_if(fd, sa.sa.sa_family, ifindex); + if (r < 0) + return r; + } + + if (s->protocol == DNS_PROTOCOL_LLMNR) { + /* RFC 4795, section 2.5 requires the TTL to be set to 1 */ + r = socket_set_ttl(fd, sa.sa.sa_family, 1); + if (r < 0) + return r; + } + + if (type == SOCK_DGRAM) { + /* Set IP_RECVERR or IPV6_RECVERR to get ICMP error feedback. See discussion in #10345. */ + r = socket_set_recverr(fd, sa.sa.sa_family, true); + if (r < 0) + return r; + + r = socket_set_recvpktinfo(fd, sa.sa.sa_family, true); + if (r < 0) + return r; + + /* Turn off path MTU discovery for security reasons (best effort: a failure is only logged) */ + r = socket_disable_pmtud(fd, sa.sa.sa_family); + if (r < 0) + log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m"); + + /* Learn about fragmentation taking place (best effort: a failure is only logged) */ + r = socket_set_recvfragsize(fd, sa.sa.sa_family, true); + if (r < 0) + log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m"); + } + + if (ret_socket_address) + *ret_socket_address = sa; + else { + bool bound = false; + + /* Let's temporarily bind the socket to the specified ifindex. The kernel currently takes + * only the SO_BINDTODEVICE/SO_BINDTOINDEX ifindex into account when making routing decisions + * in connect() — and not IP_UNICAST_IF. 
We don't really want any of the other semantics of + * SO_BINDTODEVICE/SO_BINDTOINDEX, hence we immediately unbind the socket after the fact + * again. + * + * As a special exception we don't do this if we notice that the specified IP address is on + * the local host. SO_BINDTODEVICE in combination with destination addresses on the local + * host result in EHOSTUNREACH, since Linux won't send the packets out of the specified + * interface, but delivers them directly to the local socket. */ + if (s->link && + !manager_find_link_address(s->manager, sa.sa.sa_family, sockaddr_in_addr(&sa.sa)) && + in_addr_is_localhost(sa.sa.sa_family, sockaddr_in_addr(&sa.sa)) == 0) { + r = socket_bind_to_ifindex(fd, ifindex); + if (r < 0) + return r; + + bound = true; + } + + r = connect(fd, &sa.sa, salen); + if (r < 0 && errno != EINPROGRESS) + return -errno; + + if (bound) { + r = socket_bind_to_ifindex(fd, 0); + if (r < 0) + return r; + } + } + + return TAKE_FD(fd); +} + +int dns_scope_socket_udp(DnsScope *s, DnsServer *server) { + return dns_scope_socket(s, SOCK_DGRAM, AF_UNSPEC, NULL, server, dns_server_port(server), NULL); +} + +int dns_scope_socket_tcp(DnsScope *s, int family, const union in_addr_union *address, DnsServer *server, uint16_t port, union sockaddr_union *ret_socket_address) { + /* If ret_socket_address is not NULL, the caller is responsible + * for calling connect() or sendmsg(). This is required by TCP + * Fast Open, to be able to send the initial SYN packet along + * with the first data packet. 
*/ + return dns_scope_socket(s, SOCK_STREAM, family, address, server, port, ret_socket_address); +} + +static DnsScopeMatch match_link_local_reverse_lookups(const char *domain) { + assert(domain); + + if (dns_name_endswith(domain, "254.169.in-addr.arpa") > 0) + return DNS_SCOPE_YES_BASE + 4; /* 4 labels match */ + + if (dns_name_endswith(domain, "8.e.f.ip6.arpa") > 0 || + dns_name_endswith(domain, "9.e.f.ip6.arpa") > 0 || + dns_name_endswith(domain, "a.e.f.ip6.arpa") > 0 || + dns_name_endswith(domain, "b.e.f.ip6.arpa") > 0) + return DNS_SCOPE_YES_BASE + 5; /* 5 labels match */ + + return _DNS_SCOPE_MATCH_INVALID; +} + +static DnsScopeMatch match_subnet_reverse_lookups( + DnsScope *s, + const char *domain, + bool exclude_own) { + + union in_addr_union ia; + int f, r; + + assert(s); + assert(domain); + + /* Checks whether the specified domain is a reverse address domain (i.e. in the .in-addr.arpa or + * .ip6.arpa area), and if so, whether the address matches any of the local subnets of the link the + * scope is associated with. If so, our scope should consider itself relevant for any lookup in the + * domain, since it apparently refers to hosts on this link's subnet. + * + * If 'exclude_own' is true this will return DNS_SCOPE_NO for any IP addresses assigned locally. This + * is useful for LLMNR/mDNS as we never want to look up our own hostname on LLMNR/mDNS but always use + * the locally synthesized one. 
*/ + + if (!s->link) + return _DNS_SCOPE_MATCH_INVALID; /* No link, hence no local addresses to check */ + + r = dns_name_address(domain, &f, &ia); + if (r < 0) + log_debug_errno(r, "Failed to determine whether '%s' is an address domain: %m", domain); + if (r <= 0) + return _DNS_SCOPE_MATCH_INVALID; + + if (s->family != AF_UNSPEC && f != s->family) + return _DNS_SCOPE_MATCH_INVALID; /* Don't look for IPv4 addresses on LLMNR/mDNS over IPv6 and vice versa */ + + if (in_addr_is_null(f, &ia)) + return DNS_SCOPE_NO; + + LIST_FOREACH(addresses, a, s->link->addresses) { + + if (a->family != f) + continue; + + /* Equals our own address? nah, let's not use this scope. The local synthesizer will pick it up for us. */ + if (exclude_own && + in_addr_equal(f, &a->in_addr, &ia) > 0) + return DNS_SCOPE_NO; + + if (a->prefixlen == UCHAR_MAX) /* don't know subnet mask */ + continue; + + /* Don't send mDNS queries for the IPv4 broadcast address */ + if (f == AF_INET && in_addr_equal(f, &a->in_addr_broadcast, &ia) > 0) + return DNS_SCOPE_NO; + + /* Check if the address is in the local subnet */ + r = in_addr_prefix_covers(f, &a->in_addr, a->prefixlen, &ia); + if (r < 0) + log_debug_errno(r, "Failed to determine whether link address covers lookup address '%s': %m", domain); + if (r > 0) + /* Note that we only claim zero labels match. This is so that this is at the same + * priority a DNS scope with "." as routing domain is. 
*/ + return DNS_SCOPE_YES_BASE + 0; + } + + return _DNS_SCOPE_MATCH_INVALID; +} + +DnsScopeMatch dns_scope_good_domain( + DnsScope *s, + DnsQuery *q) { + + DnsQuestion *question; + const char *domain; + uint64_t flags; + int ifindex; + + /* This returns the following return values: + * + * DNS_SCOPE_NO → This scope is not suitable for lookups of this domain, at all + * DNS_SCOPE_MAYBE → This scope is suitable, but only if nothing else wants it + * DNS_SCOPE_YES_BASE+n → This scope is suitable, and 'n' suffix labels match + * + * (The idea is that the caller will only use the scopes with the longest 'n' returned. If no scopes return + * DNS_SCOPE_YES_BASE+n, then it should use those which returned DNS_SCOPE_MAYBE. It should never use those + * which returned DNS_SCOPE_NO.) + */ + + assert(s); + assert(q); + + question = dns_query_question_for_protocol(q, s->protocol); + if (!question) + return DNS_SCOPE_NO; + + domain = dns_question_first_name(question); + if (!domain) + return DNS_SCOPE_NO; + + ifindex = q->ifindex; + flags = q->flags; + + /* Checks if the specified domain is something to look up on this scope. Note that this accepts + * non-qualified hostnames, i.e. those without any search path suffixed. */ + + if (ifindex != 0 && (!s->link || s->link->ifindex != ifindex)) + return DNS_SCOPE_NO; + + if ((SD_RESOLVED_FLAGS_MAKE(s->protocol, s->family, false, false) & flags) == 0) + return DNS_SCOPE_NO; + + /* Never resolve any loopback hostname or IP address via DNS, LLMNR or mDNS. Instead, always rely on + * synthesized RRs for these. 
*/ + if (is_localhost(domain) || + dns_name_endswith(domain, "127.in-addr.arpa") > 0 || + dns_name_equal(domain, "1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa") > 0) + return DNS_SCOPE_NO; + + /* Never respond to some of the domains listed in RFC6303 + RFC6761 */ + if (dns_name_dont_resolve(domain)) + return DNS_SCOPE_NO; + + /* Never go to network for the _gateway, _outbound, _localdnsstub, _localdnsproxy domain — they're something special, synthesized locally. */ + if (is_gateway_hostname(domain) || + is_outbound_hostname(domain) || + is_dns_stub_hostname(domain) || + is_dns_proxy_stub_hostname(domain)) + return DNS_SCOPE_NO; + + switch (s->protocol) { + + case DNS_PROTOCOL_DNS: { + bool has_search_domains = false; + DnsScopeMatch m; + int n_best = -1; + + if (dns_name_is_root(domain)) { + DnsResourceKey *t; + bool found = false; + + /* Refuse root name if only A and/or AAAA records are requested. */ + + DNS_QUESTION_FOREACH(t, question) + if (!IN_SET(t->type, DNS_TYPE_A, DNS_TYPE_AAAA)) { + found = true; + break; + } + + if (!found) + return DNS_SCOPE_NO; + } + + /* Never route things to scopes that lack DNS servers */ + if (!dns_scope_get_dns_server(s)) + return DNS_SCOPE_NO; + + /* Always honour search domains for routing queries, except if this scope lacks DNS servers. Note that + * we return DNS_SCOPE_YES here, rather than just DNS_SCOPE_MAYBE, which means other wildcard scopes + * won't be considered anymore. */ + LIST_FOREACH(domains, d, dns_scope_get_search_domains(s)) { + + if (!d->route_only && !dns_name_is_root(d->name)) + has_search_domains = true; + + if (dns_name_endswith(domain, d->name) > 0) { + int c; + + c = dns_name_count_labels(d->name); + if (c < 0) + continue; + + if (c > n_best) + n_best = c; + } + } + + /* If there's a true search domain defined for this scope, and the query is single-label, + * then let's resolve things here, preferably. 
Note that LLMNR considers itself + * authoritative for single-label names too, at the same preference, see below. */ + if (has_search_domains && dns_name_is_single_label(domain)) + return DNS_SCOPE_YES_BASE + 1; + + /* If ResolveUnicastSingleLabel=yes and the query is single-label, then bump match result + to prevent LLMNR monopoly among candidates. */ + if (s->manager->resolve_unicast_single_label && dns_name_is_single_label(domain)) + return DNS_SCOPE_YES_BASE + 1; + + /* Let's return the number of labels in the best matching result */ + if (n_best >= 0) { + assert(n_best <= DNS_SCOPE_YES_END - DNS_SCOPE_YES_BASE); + return DNS_SCOPE_YES_BASE + n_best; + } + + /* Exclude link-local IP ranges */ + if (match_link_local_reverse_lookups(domain) >= DNS_SCOPE_YES_BASE || + /* If networks use .local in their private setups, they are supposed to also add .local + * to their search domains, which we already checked above. Otherwise, we consider .local + * specific to mDNS and won't send such queries ordinary DNS servers. */ + dns_name_endswith(domain, "local") > 0) + return DNS_SCOPE_NO; + + /* If the IP address to look up matches the local subnet, then implicitly synthesizes + * DNS_SCOPE_YES_BASE + 0 on this interface, i.e. preferably resolve IP addresses via the DNS + * server belonging to this interface. */ + m = match_subnet_reverse_lookups(s, domain, false); + if (m >= 0) + return m; + + /* If there was no match at all, then see if this scope is suitable as default route. 
*/ + if (!dns_scope_is_default_route(s)) + return DNS_SCOPE_NO; + + return DNS_SCOPE_MAYBE; + } + + case DNS_PROTOCOL_MDNS: { + DnsScopeMatch m; + + m = match_link_local_reverse_lookups(domain); + if (m >= 0) + return m; + + m = match_subnet_reverse_lookups(s, domain, true); + if (m >= 0) + return m; + + if ((s->family == AF_INET && dns_name_endswith(domain, "in-addr.arpa") > 0) || + (s->family == AF_INET6 && dns_name_endswith(domain, "ip6.arpa") > 0)) + return DNS_SCOPE_MAYBE; + + if ((dns_name_endswith(domain, "local") > 0 && /* only resolve names ending in .local via mDNS */ + dns_name_equal(domain, "local") == 0 && /* but not the single-label "local" name itself */ + manager_is_own_hostname(s->manager, domain) <= 0)) /* never resolve the local hostname via mDNS */ + return DNS_SCOPE_YES_BASE + 1; /* Return +1, as the top-level .local domain matches, i.e. one label */ + + return DNS_SCOPE_NO; + } + + case DNS_PROTOCOL_LLMNR: { + DnsScopeMatch m; + + m = match_link_local_reverse_lookups(domain); + if (m >= 0) + return m; + + m = match_subnet_reverse_lookups(s, domain, true); + if (m >= 0) + return m; + + if ((s->family == AF_INET && dns_name_endswith(domain, "in-addr.arpa") > 0) || + (s->family == AF_INET6 && dns_name_endswith(domain, "ip6.arpa") > 0)) + return DNS_SCOPE_MAYBE; + + if ((dns_name_is_single_label(domain) && /* only resolve single label names via LLMNR */ + dns_name_equal(domain, "local") == 0 && /* don't resolve "local" with LLMNR, it's the top-level domain of mDNS after all, see above */ + manager_is_own_hostname(s->manager, domain) <= 0)) /* never resolve the local hostname via LLMNR */ + return DNS_SCOPE_YES_BASE + 1; /* Return +1, as we consider ourselves authoritative + * for single-label names, i.e. one label. This is + * particularly relevant as it means a "." route on some + * other scope won't pull all traffic away from + * us. (If people actually want to pull traffic away + * from us they should turn off LLMNR on the + * link). 
Note that unicast DNS scopes with search + * domains also consider themselves authoritative for + * single-label domains, at the same preference (see + * above). */ + + return DNS_SCOPE_NO; + } + + default: + assert_not_reached(); + } +} + +bool dns_scope_good_key(DnsScope *s, const DnsResourceKey *key) { + int key_family; + + assert(s); + assert(key); + + /* Check if it makes sense to resolve the specified key on this scope. Note that this call assumes a + * fully qualified name, i.e. the search suffixes already appended. */ + + if (!IN_SET(key->class, DNS_CLASS_IN, DNS_CLASS_ANY)) + return false; + + if (s->protocol == DNS_PROTOCOL_DNS) { + + /* On classic DNS, looking up non-address RRs is always fine. (Specifically, we want to + * permit looking up DNSKEY and DS records on the root and top-level domains.) */ + if (!dns_resource_key_is_address(key)) + return true; + + /* Unless explicitly overridden, we refuse to look up A and AAAA RRs on the root and + * single-label domains, under the assumption that those should be resolved via LLMNR or + * search path only, and should not be leaked onto the internet. 
*/ + const char* name = dns_resource_key_name(key); + + if (!s->manager->resolve_unicast_single_label && + dns_name_is_single_label(name)) + return false; + + return !dns_name_is_root(name); + } + + /* Never route DNSSEC RR queries to LLMNR/mDNS scopes */ + if (dns_type_is_dnssec(key->type)) + return false; + + /* On mDNS and LLMNR, send A and AAAA queries only on the respective scopes */ + + key_family = dns_type_to_af(key->type); + if (key_family < 0) + return true; + + return key_family == s->family; +} + +static int dns_scope_multicast_membership(DnsScope *s, bool b, struct in_addr in, struct in6_addr in6) { + int fd; + + assert(s); + assert(s->link); + + if (s->family == AF_INET) { + struct ip_mreqn mreqn = { + .imr_multiaddr = in, + .imr_ifindex = s->link->ifindex, + }; + + if (s->protocol == DNS_PROTOCOL_LLMNR) + fd = manager_llmnr_ipv4_udp_fd(s->manager); + else + fd = manager_mdns_ipv4_fd(s->manager); + + if (fd < 0) + return fd; + + /* Always first try to drop membership before we add + * one. This is necessary on some devices, such as + * veth. */ + if (b) + (void) setsockopt(fd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreqn, sizeof(mreqn)); + + if (setsockopt(fd, IPPROTO_IP, b ? IP_ADD_MEMBERSHIP : IP_DROP_MEMBERSHIP, &mreqn, sizeof(mreqn)) < 0) + return -errno; + + } else if (s->family == AF_INET6) { + struct ipv6_mreq mreq = { + .ipv6mr_multiaddr = in6, + .ipv6mr_interface = s->link->ifindex, + }; + + if (s->protocol == DNS_PROTOCOL_LLMNR) + fd = manager_llmnr_ipv6_udp_fd(s->manager); + else + fd = manager_mdns_ipv6_fd(s->manager); + + if (fd < 0) + return fd; + + if (b) + (void) setsockopt(fd, IPPROTO_IPV6, IPV6_DROP_MEMBERSHIP, &mreq, sizeof(mreq)); + + if (setsockopt(fd, IPPROTO_IPV6, b ? 
IPV6_ADD_MEMBERSHIP : IPV6_DROP_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) + return -errno; + } else + return -EAFNOSUPPORT; + + return 0; +} + +int dns_scope_llmnr_membership(DnsScope *s, bool b) { + assert(s); + + if (s->protocol != DNS_PROTOCOL_LLMNR) + return 0; + + return dns_scope_multicast_membership(s, b, LLMNR_MULTICAST_IPV4_ADDRESS, LLMNR_MULTICAST_IPV6_ADDRESS); +} + +int dns_scope_mdns_membership(DnsScope *s, bool b) { + assert(s); + + if (s->protocol != DNS_PROTOCOL_MDNS) + return 0; + + return dns_scope_multicast_membership(s, b, MDNS_MULTICAST_IPV4_ADDRESS, MDNS_MULTICAST_IPV6_ADDRESS); +} + +int dns_scope_make_reply_packet( + DnsScope *s, + uint16_t id, + int rcode, + DnsQuestion *q, + DnsAnswer *answer, + DnsAnswer *soa, + bool tentative, + DnsPacket **ret) { + + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + unsigned n_answer = 0, n_soa = 0; + int r; + bool c_or_aa; + + assert(s); + assert(ret); + + if (dns_question_isempty(q) && + dns_answer_isempty(answer) && + dns_answer_isempty(soa)) + return -EINVAL; + + r = dns_packet_new(&p, s->protocol, 0, DNS_PACKET_SIZE_MAX); + if (r < 0) + return r; + + /* mDNS answers must have the Authoritative Answer bit set, see RFC 6762, section 18.4. 
*/ + c_or_aa = s->protocol == DNS_PROTOCOL_MDNS; + + DNS_PACKET_HEADER(p)->id = id; + DNS_PACKET_HEADER(p)->flags = htobe16(DNS_PACKET_MAKE_FLAGS( + 1 /* qr */, + 0 /* opcode */, + c_or_aa, + 0 /* tc */, + tentative, + 0 /* (ra) */, + 0 /* (ad) */, + 0 /* (cd) */, + rcode)); + + r = dns_packet_append_question(p, q); + if (r < 0) + return r; + DNS_PACKET_HEADER(p)->qdcount = htobe16(dns_question_size(q)); + + r = dns_packet_append_answer(p, answer, &n_answer); + if (r < 0) + return r; + DNS_PACKET_HEADER(p)->ancount = htobe16(n_answer); + + r = dns_packet_append_answer(p, soa, &n_soa); + if (r < 0) + return r; + DNS_PACKET_HEADER(p)->arcount = htobe16(n_soa); + + *ret = TAKE_PTR(p); + + return 0; +} + +static void dns_scope_verify_conflicts(DnsScope *s, DnsPacket *p) { + DnsResourceRecord *rr; + DnsResourceKey *key; + + assert(s); + assert(p); + + DNS_QUESTION_FOREACH(key, p->question) + dns_zone_verify_conflicts(&s->zone, key); + + DNS_ANSWER_FOREACH(rr, p->answer) + dns_zone_verify_conflicts(&s->zone, rr->key); +} + +void dns_scope_process_query(DnsScope *s, DnsStream *stream, DnsPacket *p) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL, *soa = NULL; + _cleanup_(dns_packet_unrefp) DnsPacket *reply = NULL; + DnsResourceKey *key = NULL; + bool tentative = false; + int r; + + assert(s); + assert(p); + + if (p->protocol != DNS_PROTOCOL_LLMNR) + return; + + if (p->ipproto == IPPROTO_UDP) { + /* Don't accept UDP queries directed to anything but + * the LLMNR multicast addresses. See RFC 4795, + * section 2.5. 
*/ + + if (p->family == AF_INET && !in4_addr_equal(&p->destination.in, &LLMNR_MULTICAST_IPV4_ADDRESS)) + return; + + if (p->family == AF_INET6 && !in6_addr_equal(&p->destination.in6, &LLMNR_MULTICAST_IPV6_ADDRESS)) + return; + } + + r = dns_packet_extract(p); + if (r < 0) { + log_debug_errno(r, "Failed to extract resource records from incoming packet: %m"); + return; + } + + if (DNS_PACKET_LLMNR_C(p)) { + /* Somebody notified us about a possible conflict */ + dns_scope_verify_conflicts(s, p); + return; + } + + if (dns_question_size(p->question) != 1) + return (void) log_debug("Received LLMNR query without question or multiple questions, ignoring."); + + key = dns_question_first_key(p->question); + + r = dns_zone_lookup(&s->zone, key, 0, &answer, &soa, &tentative); + if (r < 0) { + log_debug_errno(r, "Failed to look up key: %m"); + return; + } + if (r == 0) + return; + + if (answer) + dns_answer_order_by_scope(answer, in_addr_is_link_local(p->family, &p->sender) > 0); + + r = dns_scope_make_reply_packet(s, DNS_PACKET_ID(p), DNS_RCODE_SUCCESS, p->question, answer, soa, tentative, &reply); + if (r < 0) { + log_debug_errno(r, "Failed to build reply packet: %m"); + return; + } + + if (stream) { + r = dns_stream_write_packet(stream, reply); + if (r < 0) { + log_debug_errno(r, "Failed to enqueue reply packet: %m"); + return; + } + + /* Let's take an extra reference on this stream, so that it stays around after returning. 
The reference + * will be dangling until the stream is disconnected, and the default completion handler of the stream + * will then unref the stream and destroy it */ + if (DNS_STREAM_QUEUED(stream)) + dns_stream_ref(stream); + } else { + int fd; + + if (!ratelimit_below(&s->ratelimit)) + return; + + if (p->family == AF_INET) + fd = manager_llmnr_ipv4_udp_fd(s->manager); + else if (p->family == AF_INET6) + fd = manager_llmnr_ipv6_udp_fd(s->manager); + else { + log_debug("Unknown protocol"); + return; + } + if (fd < 0) { + log_debug_errno(fd, "Failed to get reply socket: %m"); + return; + } + + /* Note that we always immediately reply to all LLMNR + * requests, and do not wait any time, since we + * verified uniqueness for all records. Also see RFC + * 4795, Section 2.7 */ + + r = manager_send(s->manager, fd, p->ifindex, p->family, &p->sender, p->sender_port, NULL, reply); + if (r < 0) { + log_debug_errno(r, "Failed to send reply packet: %m"); + return; + } + } +} + +DnsTransaction *dns_scope_find_transaction( + DnsScope *scope, + DnsResourceKey *key, + uint64_t query_flags) { + + DnsTransaction *first; + + assert(scope); + assert(key); + + /* Iterate through the list of transactions with a matching key */ + first = hashmap_get(scope->transactions_by_key, key); + LIST_FOREACH(transactions_by_key, t, first) { + + /* These four flags must match exactly: we cannot use a validated response for a + * non-validating client, and we cannot use a non-validated response for a validating + * client. Similar, if the sources don't match things aren't usable either. 
*/ + if (((query_flags ^ t->query_flags) & + (SD_RESOLVED_NO_VALIDATE| + SD_RESOLVED_NO_ZONE| + SD_RESOLVED_NO_TRUST_ANCHOR| + SD_RESOLVED_NO_NETWORK)) != 0) + continue; + + /* We can reuse a primary query if a regular one is requested, but not vice versa */ + if ((query_flags & SD_RESOLVED_REQUIRE_PRIMARY) && + !(t->query_flags & SD_RESOLVED_REQUIRE_PRIMARY)) + continue; + + /* Don't reuse a transaction that allowed caching when we got told not to use it */ + if ((query_flags & SD_RESOLVED_NO_CACHE) && + !(t->query_flags & SD_RESOLVED_NO_CACHE)) + continue; + + /* If we are asked to clamp ttls and the existing transaction doesn't do it, we can't + * reuse */ + if ((query_flags & SD_RESOLVED_CLAMP_TTL) && + !(t->query_flags & SD_RESOLVED_CLAMP_TTL)) + continue; + + return t; + } + + return NULL; +} + +static int dns_scope_make_conflict_packet( + DnsScope *s, + DnsResourceRecord *rr, + DnsPacket **ret) { + + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + int r; + + assert(s); + assert(rr); + assert(ret); + + r = dns_packet_new(&p, s->protocol, 0, DNS_PACKET_SIZE_MAX); + if (r < 0) + return r; + + DNS_PACKET_HEADER(p)->flags = htobe16(DNS_PACKET_MAKE_FLAGS( + 0 /* qr */, + 0 /* opcode */, + 1 /* conflict */, + 0 /* tc */, + 0 /* t */, + 0 /* (ra) */, + 0 /* (ad) */, + 0 /* (cd) */, + 0)); + + /* For mDNS, the transaction ID should always be 0 */ + if (s->protocol != DNS_PROTOCOL_MDNS) + random_bytes(&DNS_PACKET_HEADER(p)->id, sizeof(uint16_t)); + + DNS_PACKET_HEADER(p)->qdcount = htobe16(1); + DNS_PACKET_HEADER(p)->arcount = htobe16(1); + + r = dns_packet_append_key(p, rr->key, 0, NULL); + if (r < 0) + return r; + + r = dns_packet_append_rr(p, rr, 0, NULL, NULL); + if (r < 0) + return r; + + *ret = TAKE_PTR(p); + + return 0; +} + +static int on_conflict_dispatch(sd_event_source *es, usec_t usec, void *userdata) { + DnsScope *scope = ASSERT_PTR(userdata); + int r; + + assert(es); + + scope->conflict_event_source = 
sd_event_source_disable_unref(scope->conflict_event_source); + + for (;;) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + + key = ordered_hashmap_first_key(scope->conflict_queue); + if (!key) + break; + + rr = ordered_hashmap_remove(scope->conflict_queue, key); + assert(rr); + + r = dns_scope_make_conflict_packet(scope, rr, &p); + if (r < 0) { + log_error_errno(r, "Failed to make conflict packet: %m"); + return 0; + } + + r = dns_scope_emit_udp(scope, -1, AF_UNSPEC, p); + if (r < 0) + log_debug_errno(r, "Failed to send conflict packet: %m"); + } + + return 0; +} + +int dns_scope_notify_conflict(DnsScope *scope, DnsResourceRecord *rr) { + int r; + + assert(scope); + assert(rr); + + /* We don't send these queries immediately. Instead, we queue + * them, and send them after some jitter delay. */ + r = ordered_hashmap_ensure_allocated(&scope->conflict_queue, &dns_resource_key_hash_ops); + if (r < 0) { + log_oom(); + return r; + } + + /* We only place one RR per key in the conflict + * messages, not all of them. 
That should be enough to + * indicate where there might be a conflict */ + r = ordered_hashmap_put(scope->conflict_queue, rr->key, rr); + if (IN_SET(r, 0, -EEXIST)) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to queue conflicting RR: %m"); + + dns_resource_key_ref(rr->key); + dns_resource_record_ref(rr); + + if (scope->conflict_event_source) + return 0; + + r = sd_event_add_time_relative( + scope->manager->event, + &scope->conflict_event_source, + CLOCK_BOOTTIME, + random_u64_range(LLMNR_JITTER_INTERVAL_USEC), + 0, + on_conflict_dispatch, scope); + if (r < 0) + return log_debug_errno(r, "Failed to add conflict dispatch event: %m"); + + (void) sd_event_source_set_description(scope->conflict_event_source, "scope-conflict"); + + return 0; +} + +void dns_scope_check_conflicts(DnsScope *scope, DnsPacket *p) { + DnsResourceRecord *rr; + int r; + + assert(scope); + assert(p); + + if (!IN_SET(p->protocol, DNS_PROTOCOL_LLMNR, DNS_PROTOCOL_MDNS)) + return; + + if (DNS_PACKET_RRCOUNT(p) <= 0) + return; + + if (p->protocol == DNS_PROTOCOL_LLMNR) { + if (DNS_PACKET_LLMNR_C(p) != 0) + return; + + if (DNS_PACKET_LLMNR_T(p) != 0) + return; + } + + if (manager_packet_from_local_address(scope->manager, p)) + return; + + r = dns_packet_extract(p); + if (r < 0) { + log_debug_errno(r, "Failed to extract packet: %m"); + return; + } + + log_debug("Checking for conflicts..."); + + DNS_ANSWER_FOREACH(rr, p->answer) { + /* No conflict if it is DNS-SD RR used for service enumeration. */ + if (dns_resource_key_is_dnssd_ptr(rr->key)) + continue; + + /* Check for conflicts against the local zone. If we + * found one, we won't check any further */ + r = dns_zone_check_conflicts(&scope->zone, rr); + if (r != 0) + continue; + + /* Check for conflicts against the local cache. 
If so, + * send out an advisory query, to inform everybody */ + r = dns_cache_check_conflicts(&scope->cache, rr, p->family, &p->sender); + if (r <= 0) + continue; + + dns_scope_notify_conflict(scope, rr); + } +} + +void dns_scope_dump(DnsScope *s, FILE *f) { + assert(s); + + if (!f) + f = stdout; + + fputs("[Scope protocol=", f); + fputs(dns_protocol_to_string(s->protocol), f); + + if (s->link) { + fputs(" interface=", f); + fputs(s->link->ifname, f); + } + + if (s->family != AF_UNSPEC) { + fputs(" family=", f); + fputs(af_to_name(s->family), f); + } + + fputs("]\n", f); + + if (!dns_zone_is_empty(&s->zone)) { + fputs("ZONE:\n", f); + dns_zone_dump(&s->zone, f); + } + + if (!dns_cache_is_empty(&s->cache)) { + fputs("CACHE:\n", f); + dns_cache_dump(&s->cache, f); + } +} + +DnsSearchDomain *dns_scope_get_search_domains(DnsScope *s) { + assert(s); + + if (s->protocol != DNS_PROTOCOL_DNS) + return NULL; + + if (s->link) + return s->link->search_domains; + + return s->manager->search_domains; +} + +bool dns_scope_name_wants_search_domain(DnsScope *s, const char *name) { + assert(s); + + if (s->protocol != DNS_PROTOCOL_DNS) + return false; + + if (!dns_name_is_single_label(name)) + return false; + + /* If we allow single-label domain lookups on unicast DNS, and this scope has a search domain that matches + * _exactly_ this name, then do not use search domains. */ + if (s->manager->resolve_unicast_single_label) + LIST_FOREACH(domains, d, dns_scope_get_search_domains(s)) + if (dns_name_equal(name, d->name) > 0) + return false; + + return true; +} + +bool dns_scope_network_good(DnsScope *s) { + /* Checks whether the network is in good state for lookups on this scope. For mDNS/LLMNR/Classic DNS scopes + * bound to links this is easy, as they don't even exist if the link isn't in a suitable state. For the global + * DNS scope we check whether there are any links that are up and have an address. 
+ * + * Note that Linux routing is complex and even systems that superficially have no IPv4 address might + * be able to route IPv4 (and similar for IPv6), hence let's make a check here independent of address + * family. */ + + if (s->link) + return true; + + return manager_routable(s->manager); +} + +int dns_scope_ifindex(DnsScope *s) { + assert(s); + + if (s->link) + return s->link->ifindex; + + return 0; +} + +static int on_announcement_timeout(sd_event_source *s, usec_t usec, void *userdata) { + DnsScope *scope = userdata; + + assert(s); + + scope->announce_event_source = sd_event_source_disable_unref(scope->announce_event_source); + + (void) dns_scope_announce(scope, false); + return 0; +} + +int dns_scope_announce(DnsScope *scope, bool goodbye) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + _cleanup_set_free_ Set *types = NULL; + DnsZoneItem *z; + unsigned size = 0; + char *service_type; + int r; + + if (!scope) + return 0; + + if (scope->protocol != DNS_PROTOCOL_MDNS) + return 0; + + r = sd_event_get_state(scope->manager->event); + if (r < 0) + return log_debug_errno(r, "Failed to get event loop state: %m"); + + /* If this is called on exit, through manager_free() -> link_free(), then we cannot announce. */ + if (r == SD_EVENT_FINISHED) + return 0; + + /* Check if we're done with probing. */ + LIST_FOREACH(transactions_by_scope, t, scope->transactions) + if (t->probing && DNS_TRANSACTION_IS_LIVE(t->state)) + return 0; + + /* Check if there're services pending conflict resolution. */ + if (manager_next_dnssd_names(scope->manager)) + return 0; /* we reach this point only if changing hostname didn't help */ + + /* Calculate answer's size. 
*/ + HASHMAP_FOREACH(z, scope->zone.by_key) { + if (z->state != DNS_ZONE_ITEM_ESTABLISHED) + continue; + + if (z->rr->key->type == DNS_TYPE_PTR && + !dns_zone_contains_name(&scope->zone, z->rr->ptr.name)) { + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + log_debug("Skip PTR RR <%s> since its counterparts seem to be withdrawn", dns_resource_key_to_string(z->rr->key, key_str, sizeof key_str)); + z->state = DNS_ZONE_ITEM_WITHDRAWN; + continue; + } + + /* Collect service types for _services._dns-sd._udp.local RRs in a set */ + if (!scope->announced && + dns_resource_key_is_dnssd_ptr(z->rr->key)) { + if (!set_contains(types, dns_resource_key_name(z->rr->key))) { + r = set_ensure_put(&types, &dns_name_hash_ops, dns_resource_key_name(z->rr->key)); + if (r < 0) + return log_debug_errno(r, "Failed to add item to set: %m"); + } + } + + LIST_FOREACH(by_key, i, z) + size++; + } + + answer = dns_answer_new(size + set_size(types)); + if (!answer) + return log_oom(); + + /* Second iteration, actually add RRs to the answer. */ + HASHMAP_FOREACH(z, scope->zone.by_key) + LIST_FOREACH (by_key, i, z) { + DnsAnswerFlags flags; + + if (i->state != DNS_ZONE_ITEM_ESTABLISHED) + continue; + + if (dns_resource_key_is_dnssd_ptr(i->rr->key)) + flags = goodbye ? DNS_ANSWER_GOODBYE : 0; + else + flags = goodbye ? (DNS_ANSWER_GOODBYE|DNS_ANSWER_CACHE_FLUSH) : DNS_ANSWER_CACHE_FLUSH; + + r = dns_answer_add(answer, i->rr, 0, flags, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to add RR to announce: %m"); + } + + /* Since all the active services are in the zone make them discoverable now. 
*/ + SET_FOREACH(service_type, types) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_PTR, + "_services._dns-sd._udp.local"); + if (!rr) + return log_oom(); + + rr->ptr.name = strdup(service_type); + if (!rr->ptr.name) + return log_oom(); + + rr->ttl = MDNS_DEFAULT_TTL; + + r = dns_zone_put(&scope->zone, scope, rr, false); + if (r < 0) + log_warning_errno(r, "Failed to add DNS-SD PTR record to MDNS zone, ignoring: %m"); + + r = dns_answer_add(answer, rr, 0, 0, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to add RR to announce: %m"); + } + + if (dns_answer_isempty(answer)) + return 0; + + r = dns_scope_make_reply_packet(scope, 0, DNS_RCODE_SUCCESS, NULL, answer, NULL, false, &p); + if (r < 0) + return log_debug_errno(r, "Failed to build reply packet: %m"); + + r = dns_scope_emit_udp(scope, -1, AF_UNSPEC, p); + if (r < 0) + return log_debug_errno(r, "Failed to send reply packet: %m"); + + /* In section 8.3 of RFC6762: "The Multicast DNS responder MUST send at least two unsolicited + * responses, one second apart." 
*/ + if (!scope->announced) { + scope->announced = true; + + r = sd_event_add_time_relative( + scope->manager->event, + &scope->announce_event_source, + CLOCK_BOOTTIME, + MDNS_ANNOUNCE_DELAY, + 0, + on_announcement_timeout, scope); + if (r < 0) + return log_debug_errno(r, "Failed to schedule second announcement: %m"); + + (void) sd_event_source_set_description(scope->announce_event_source, "mdns-announce"); + } + + return 0; +} + +int dns_scope_add_dnssd_services(DnsScope *scope) { + DnssdService *service; + int r; + + assert(scope); + + if (hashmap_size(scope->manager->dnssd_services) == 0) + return 0; + + scope->announced = false; + + HASHMAP_FOREACH(service, scope->manager->dnssd_services) { + service->withdrawn = false; + + r = dns_zone_put(&scope->zone, scope, service->ptr_rr, false); + if (r < 0) + log_warning_errno(r, "Failed to add PTR record to MDNS zone: %m"); + + r = dns_zone_put(&scope->zone, scope, service->srv_rr, true); + if (r < 0) + log_warning_errno(r, "Failed to add SRV record to MDNS zone: %m"); + + LIST_FOREACH(items, txt_data, service->txt_data_items) { + r = dns_zone_put(&scope->zone, scope, txt_data->rr, true); + if (r < 0) + log_warning_errno(r, "Failed to add TXT record to MDNS zone: %m"); + } + } + + return 0; +} + +int dns_scope_remove_dnssd_services(DnsScope *scope) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + DnssdService *service; + int r; + + assert(scope); + + key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_PTR, + "_services._dns-sd._udp.local"); + if (!key) + return log_oom(); + + r = dns_zone_remove_rrs_by_key(&scope->zone, key); + if (r < 0) + return r; + + HASHMAP_FOREACH(service, scope->manager->dnssd_services) { + dns_zone_remove_rr(&scope->zone, service->ptr_rr); + dns_zone_remove_rr(&scope->zone, service->srv_rr); + LIST_FOREACH(items, txt_data, service->txt_data_items) + dns_zone_remove_rr(&scope->zone, txt_data->rr); + } + + return 0; +} + +static bool dns_scope_has_route_only_domains(DnsScope 
*scope) { + DnsSearchDomain *first; + bool route_only = false; + + assert(scope); + assert(scope->protocol == DNS_PROTOCOL_DNS); + + /* Returns 'true' if this scope is suitable for queries to specific domains only. For that we check + * if there are any route-only domains on this interface, as a heuristic to discern VPN-style links + * from non-VPN-style links. Returns 'false' for all other cases, i.e. if the scope is intended to + * take queries to arbitrary domains, i.e. has no routing domains set. */ + + if (scope->link) + first = scope->link->search_domains; + else + first = scope->manager->search_domains; + + LIST_FOREACH(domains, domain, first) { + /* "." means "any domain", thus the interface takes any kind of traffic. Thus, we exit early + * here, as it doesn't really matter whether this link has any route-only domains or not, + * "~." really trumps everything and clearly indicates that this interface shall receive all + * traffic it can get. */ + if (dns_name_is_root(DNS_SEARCH_DOMAIN_NAME(domain))) + return false; + + if (domain->route_only) + route_only = true; + } + + return route_only; +} + +bool dns_scope_is_default_route(DnsScope *scope) { + assert(scope); + + /* Only use DNS scopes as default routes */ + if (scope->protocol != DNS_PROTOCOL_DNS) + return false; + + /* The global DNS scope is always suitable as default route */ + if (!scope->link) + return true; + + /* Honour whatever is explicitly configured. This is really the best approach, and trumps any + * automatic logic. */ + if (scope->link->default_route >= 0) + return scope->link->default_route; + + /* Otherwise check if we have any route-only domains, as a sensible heuristic: if so, let's not + * volunteer as default route. 
*/ + return !dns_scope_has_route_only_domains(scope); +} + +int dns_scope_dump_cache_to_json(DnsScope *scope, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *cache = NULL; + int r; + + assert(scope); + assert(ret); + + r = dns_cache_dump_to_json(&scope->cache, &cache); + if (r < 0) + return r; + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("protocol", dns_protocol_to_string(scope->protocol)), + JSON_BUILD_PAIR_CONDITION(scope->family != AF_UNSPEC, "family", JSON_BUILD_INTEGER(scope->family)), + JSON_BUILD_PAIR_CONDITION(scope->link, "ifindex", JSON_BUILD_INTEGER(scope->link ? scope->link->ifindex : 0)), + JSON_BUILD_PAIR_CONDITION(scope->link, "ifname", JSON_BUILD_STRING(scope->link ? scope->link->ifname : NULL)), + JSON_BUILD_PAIR_VARIANT("cache", cache))); +} diff --git a/src/resolve/resolved-dns-scope.h b/src/resolve/resolved-dns-scope.h new file mode 100644 index 0000000..ca33fd0 --- /dev/null +++ b/src/resolve/resolved-dns-scope.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "list.h" +#include "ratelimit.h" + +typedef struct DnsQueryCandidate DnsQueryCandidate; +typedef struct DnsScope DnsScope; + +#include "resolved-dns-cache.h" +#include "resolved-dns-dnssec.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-query.h" +#include "resolved-dns-search-domain.h" +#include "resolved-dns-server.h" +#include "resolved-dns-stream.h" +#include "resolved-dns-zone.h" + +typedef enum DnsScopeMatch { + DNS_SCOPE_NO, + DNS_SCOPE_MAYBE, + DNS_SCOPE_YES_BASE, /* Add the number of matching labels to this */ + DNS_SCOPE_YES_END = DNS_SCOPE_YES_BASE + DNS_N_LABELS_MAX, + _DNS_SCOPE_MATCH_MAX, + _DNS_SCOPE_MATCH_INVALID = -EINVAL, +} DnsScopeMatch; + +struct DnsScope { + Manager *manager; + + DnsProtocol protocol; + int family; + + /* Copied at scope creation time from the link/manager */ + DnssecMode dnssec_mode; + DnsOverTlsMode dns_over_tls_mode; + + Link *link; + + 
DnsCache cache; + DnsZone zone; + + OrderedHashmap *conflict_queue; + sd_event_source *conflict_event_source; + + sd_event_source *announce_event_source; + + RateLimit ratelimit; + + usec_t resend_timeout; + usec_t max_rtt; + + LIST_HEAD(DnsQueryCandidate, query_candidates); + + /* Note that we keep track of ongoing transactions in two ways: once in a hashmap, indexed by the rr + * key, and once in a linked list. We use the hashmap to quickly find transactions we can reuse for a + * key. But note that there might be multiple transactions for the same key (because the associated + * query flags might differ in incompatible ways: e.g. we may not reuse a non-validating transaction + * as validating. Hence we maintain a per-key list of transactions, which we iterate through to find + * one we can reuse with matching flags. */ + Hashmap *transactions_by_key; + LIST_HEAD(DnsTransaction, transactions); + + LIST_FIELDS(DnsScope, scopes); + + bool announced; +}; + +int dns_scope_new(Manager *m, DnsScope **ret, Link *l, DnsProtocol p, int family); +DnsScope* dns_scope_free(DnsScope *s); + +void dns_scope_packet_received(DnsScope *s, usec_t rtt); +void dns_scope_packet_lost(DnsScope *s, usec_t usec); + +int dns_scope_emit_udp(DnsScope *s, int fd, int af, DnsPacket *p); +int dns_scope_socket_tcp(DnsScope *s, int family, const union in_addr_union *address, DnsServer *server, uint16_t port, union sockaddr_union *ret_socket_address); +int dns_scope_socket_udp(DnsScope *s, DnsServer *server); + +DnsScopeMatch dns_scope_good_domain(DnsScope *s, DnsQuery *q); +bool dns_scope_good_key(DnsScope *s, const DnsResourceKey *key); + +DnsServer *dns_scope_get_dns_server(DnsScope *s); +unsigned dns_scope_get_n_dns_servers(DnsScope *s); +void dns_scope_next_dns_server(DnsScope *s, DnsServer *if_current); + +int dns_scope_llmnr_membership(DnsScope *s, bool b); +int dns_scope_mdns_membership(DnsScope *s, bool b); + +int dns_scope_make_reply_packet(DnsScope *s, uint16_t id, int rcode, 
DnsQuestion *q, DnsAnswer *answer, DnsAnswer *soa, bool tentative, DnsPacket **ret); +void dns_scope_process_query(DnsScope *s, DnsStream *stream, DnsPacket *p); + +DnsTransaction *dns_scope_find_transaction(DnsScope *scope, DnsResourceKey *key, uint64_t query_flags); + +int dns_scope_notify_conflict(DnsScope *scope, DnsResourceRecord *rr); +void dns_scope_check_conflicts(DnsScope *scope, DnsPacket *p); + +void dns_scope_dump(DnsScope *s, FILE *f); + +DnsSearchDomain *dns_scope_get_search_domains(DnsScope *s); + +bool dns_scope_name_wants_search_domain(DnsScope *s, const char *name); + +bool dns_scope_network_good(DnsScope *s); + +int dns_scope_ifindex(DnsScope *s); + +int dns_scope_announce(DnsScope *scope, bool goodbye); + +int dns_scope_add_dnssd_services(DnsScope *scope); +int dns_scope_remove_dnssd_services(DnsScope *scope); + +bool dns_scope_is_default_route(DnsScope *scope); + +int dns_scope_dump_cache_to_json(DnsScope *scope, JsonVariant **ret); diff --git a/src/resolve/resolved-dns-search-domain.c b/src/resolve/resolved-dns-search-domain.c new file mode 100644 index 0000000..a11b213 --- /dev/null +++ b/src/resolve/resolved-dns-search-domain.c @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dns-domain.h" +#include "resolved-dns-search-domain.h" +#include "resolved-link.h" +#include "resolved-manager.h" + +int dns_search_domain_new( + Manager *m, + DnsSearchDomain **ret, + DnsSearchDomainType type, + Link *l, + const char *name) { + + _cleanup_free_ char *normalized = NULL; + DnsSearchDomain *d; + int r; + + assert(m); + assert((type == DNS_SEARCH_DOMAIN_LINK) == !!l); + assert(name); + + r = dns_name_normalize(name, 0, &normalized); + if (r < 0) + return r; + + if (l) { + if (l->n_search_domains >= LINK_SEARCH_DOMAINS_MAX) + return -E2BIG; + } else { + if (m->n_search_domains >= MANAGER_SEARCH_DOMAINS_MAX) + return -E2BIG; + } + + d = new(DnsSearchDomain, 1); + if (!d) + return -ENOMEM; + + *d = 
(DnsSearchDomain) { + .n_ref = 1, + .manager = m, + .type = type, + .name = TAKE_PTR(normalized), + }; + + switch (type) { + + case DNS_SEARCH_DOMAIN_LINK: + d->link = l; + LIST_APPEND(domains, l->search_domains, d); + l->n_search_domains++; + break; + + case DNS_SEARCH_DOMAIN_SYSTEM: + LIST_APPEND(domains, m->search_domains, d); + m->n_search_domains++; + break; + + default: + assert_not_reached(); + } + + d->linked = true; + + if (ret) + *ret = d; + + return 0; +} + +static DnsSearchDomain* dns_search_domain_free(DnsSearchDomain *d) { + assert(d); + + free(d->name); + return mfree(d); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DnsSearchDomain, dns_search_domain, dns_search_domain_free); + +void dns_search_domain_unlink(DnsSearchDomain *d) { + assert(d); + assert(d->manager); + + if (!d->linked) + return; + + switch (d->type) { + + case DNS_SEARCH_DOMAIN_LINK: + assert(d->link); + assert(d->link->n_search_domains > 0); + LIST_REMOVE(domains, d->link->search_domains, d); + d->link->n_search_domains--; + break; + + case DNS_SEARCH_DOMAIN_SYSTEM: + assert(d->manager->n_search_domains > 0); + LIST_REMOVE(domains, d->manager->search_domains, d); + d->manager->n_search_domains--; + break; + } + + d->linked = false; + + dns_search_domain_unref(d); +} + +void dns_search_domain_move_back_and_unmark(DnsSearchDomain *d) { + DnsSearchDomain *tail; + + assert(d); + + if (!d->marked) + return; + + d->marked = false; + + if (!d->linked || !d->domains_next) + return; + + switch (d->type) { + + case DNS_SEARCH_DOMAIN_LINK: + assert(d->link); + tail = LIST_FIND_TAIL(domains, d); + LIST_REMOVE(domains, d->link->search_domains, d); + LIST_INSERT_AFTER(domains, d->link->search_domains, tail, d); + break; + + case DNS_SEARCH_DOMAIN_SYSTEM: + tail = LIST_FIND_TAIL(domains, d); + LIST_REMOVE(domains, d->manager->search_domains, d); + LIST_INSERT_AFTER(domains, d->manager->search_domains, tail, d); + break; + + default: + assert_not_reached(); + } +} + +void 
dns_search_domain_unlink_all(DnsSearchDomain *first) { + DnsSearchDomain *next; + + if (!first) + return; + + next = first->domains_next; + dns_search_domain_unlink(first); + + dns_search_domain_unlink_all(next); +} + +bool dns_search_domain_unlink_marked(DnsSearchDomain *first) { + DnsSearchDomain *next; + bool changed; + + if (!first) + return false; + + next = first->domains_next; + + if (first->marked) { + dns_search_domain_unlink(first); + changed = true; + } else + changed = false; + + return dns_search_domain_unlink_marked(next) || changed; +} + +void dns_search_domain_mark_all(DnsSearchDomain *first) { + if (!first) + return; + + first->marked = true; + dns_search_domain_mark_all(first->domains_next); +} + +int dns_search_domain_find(DnsSearchDomain *first, const char *name, DnsSearchDomain **ret) { + int r; + + assert(name); + assert(ret); + + LIST_FOREACH(domains, d, first) { + + r = dns_name_equal(name, d->name); + if (r < 0) + return r; + if (r > 0) { + *ret = d; + return 1; + } + } + + *ret = NULL; + return 0; +} diff --git a/src/resolve/resolved-dns-search-domain.h b/src/resolve/resolved-dns-search-domain.h new file mode 100644 index 0000000..f0d96ac --- /dev/null +++ b/src/resolve/resolved-dns-search-domain.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "list.h" +#include "macro.h" + +typedef struct DnsSearchDomain DnsSearchDomain; +typedef struct Link Link; +typedef struct Manager Manager; + +typedef enum DnsSearchDomainType { + DNS_SEARCH_DOMAIN_SYSTEM, + DNS_SEARCH_DOMAIN_LINK, +} DnsSearchDomainType; + +struct DnsSearchDomain { + Manager *manager; + + unsigned n_ref; + + DnsSearchDomainType type; + Link *link; + + char *name; + + bool marked:1; + bool route_only:1; + + bool linked:1; + LIST_FIELDS(DnsSearchDomain, domains); +}; + +int dns_search_domain_new( + Manager *m, + DnsSearchDomain **ret, + DnsSearchDomainType type, + Link *link, + const char *name); + +DnsSearchDomain* 
dns_search_domain_ref(DnsSearchDomain *d); +DnsSearchDomain* dns_search_domain_unref(DnsSearchDomain *d); + +void dns_search_domain_unlink(DnsSearchDomain *d); +void dns_search_domain_move_back_and_unmark(DnsSearchDomain *d); + +void dns_search_domain_unlink_all(DnsSearchDomain *first); +bool dns_search_domain_unlink_marked(DnsSearchDomain *first); +void dns_search_domain_mark_all(DnsSearchDomain *first); + +int dns_search_domain_find(DnsSearchDomain *first, const char *name, DnsSearchDomain **ret); + +static inline const char* DNS_SEARCH_DOMAIN_NAME(DnsSearchDomain *d) { + return d ? d->name : NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsSearchDomain*, dns_search_domain_unref); diff --git a/src/resolve/resolved-dns-server.c b/src/resolve/resolved-dns-server.c new file mode 100644 index 0000000..b7db839 --- /dev/null +++ b/src/resolve/resolved-dns-server.c @@ -0,0 +1,1122 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "resolved-bus.h" +#include "resolved-dns-server.h" +#include "resolved-dns-stub.h" +#include "resolved-manager.h" +#include "resolved-resolv-conf.h" +#include "siphash24.h" +#include "string-table.h" +#include "string-util.h" + +/* The amount of time to wait before retrying with a full feature set */ +#define DNS_SERVER_FEATURE_GRACE_PERIOD_MAX_USEC (6 * USEC_PER_HOUR) +#define DNS_SERVER_FEATURE_GRACE_PERIOD_MIN_USEC (5 * USEC_PER_MINUTE) + +/* The number of times we will attempt a certain feature set before degrading */ +#define DNS_SERVER_FEATURE_RETRY_ATTEMPTS 3 + +int dns_server_new( + Manager *m, + DnsServer **ret, + DnsServerType type, + Link *l, + int family, + const union in_addr_union *in_addr, + uint16_t port, + int ifindex, + const char *server_name) { + + _cleanup_free_ char *name = NULL; + DnsServer *s; + + assert(m); + assert((type == DNS_SERVER_LINK) == !!l); + assert(in_addr); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + if (l) { + if 
(l->n_dns_servers >= LINK_DNS_SERVERS_MAX) + return -E2BIG; + } else { + if (m->n_dns_servers >= MANAGER_DNS_SERVERS_MAX) + return -E2BIG; + } + + if (!isempty(server_name)) { + name = strdup(server_name); + if (!name) + return -ENOMEM; + } + + s = new(DnsServer, 1); + if (!s) + return -ENOMEM; + + *s = (DnsServer) { + .n_ref = 1, + .manager = m, + .type = type, + .family = family, + .address = *in_addr, + .port = port, + .ifindex = ifindex, + .server_name = TAKE_PTR(name), + }; + + dns_server_reset_features(s); + + switch (type) { + + case DNS_SERVER_LINK: + s->link = l; + LIST_APPEND(servers, l->dns_servers, s); + l->n_dns_servers++; + break; + + case DNS_SERVER_SYSTEM: + LIST_APPEND(servers, m->dns_servers, s); + m->n_dns_servers++; + break; + + case DNS_SERVER_FALLBACK: + LIST_APPEND(servers, m->fallback_dns_servers, s); + m->n_dns_servers++; + break; + + default: + assert_not_reached(); + } + + s->linked = true; + + /* A new DNS server that isn't fallback is added and the one + * we used so far was a fallback one? Then let's try to pick + * the new one */ + if (type != DNS_SERVER_FALLBACK && + m->current_dns_server && + m->current_dns_server->type == DNS_SERVER_FALLBACK) + manager_set_dns_server(m, NULL); + + if (ret) + *ret = s; + + return 0; +} + +static DnsServer* dns_server_free(DnsServer *s) { + assert(s); + + dns_server_unref_stream(s); + +#if ENABLE_DNS_OVER_TLS + dnstls_server_free(s); +#endif + + free(s->server_string); + free(s->server_string_full); + free(s->server_name); + return mfree(s); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DnsServer, dns_server, dns_server_free); + +void dns_server_unlink(DnsServer *s) { + assert(s); + assert(s->manager); + + /* This removes the specified server from the linked list of + * servers, but any server might still stay around if it has + * refs, for example from an ongoing transaction. 
*/ + + if (!s->linked) + return; + + switch (s->type) { + + case DNS_SERVER_LINK: + assert(s->link); + assert(s->link->n_dns_servers > 0); + LIST_REMOVE(servers, s->link->dns_servers, s); + s->link->n_dns_servers--; + break; + + case DNS_SERVER_SYSTEM: + assert(s->manager->n_dns_servers > 0); + LIST_REMOVE(servers, s->manager->dns_servers, s); + s->manager->n_dns_servers--; + break; + + case DNS_SERVER_FALLBACK: + assert(s->manager->n_dns_servers > 0); + LIST_REMOVE(servers, s->manager->fallback_dns_servers, s); + s->manager->n_dns_servers--; + break; + default: + assert_not_reached(); + } + + s->linked = false; + + if (s->link && s->link->current_dns_server == s) + link_set_dns_server(s->link, NULL); + + if (s->manager->current_dns_server == s) + manager_set_dns_server(s->manager, NULL); + + /* No need to keep a default stream around anymore */ + dns_server_unref_stream(s); + + dns_server_unref(s); +} + +void dns_server_move_back_and_unmark(DnsServer *s) { + DnsServer *tail; + + assert(s); + + if (!s->marked) + return; + + s->marked = false; + + if (!s->linked || !s->servers_next) + return; + + /* Move us to the end of the list, so that the order is + * strictly kept, if we are not at the end anyway. 
*/ + + switch (s->type) { + + case DNS_SERVER_LINK: + assert(s->link); + tail = LIST_FIND_TAIL(servers, s); + LIST_REMOVE(servers, s->link->dns_servers, s); + LIST_INSERT_AFTER(servers, s->link->dns_servers, tail, s); + break; + + case DNS_SERVER_SYSTEM: + tail = LIST_FIND_TAIL(servers, s); + LIST_REMOVE(servers, s->manager->dns_servers, s); + LIST_INSERT_AFTER(servers, s->manager->dns_servers, tail, s); + break; + + case DNS_SERVER_FALLBACK: + tail = LIST_FIND_TAIL(servers, s); + LIST_REMOVE(servers, s->manager->fallback_dns_servers, s); + LIST_INSERT_AFTER(servers, s->manager->fallback_dns_servers, tail, s); + break; + + default: + assert_not_reached(); + } +} + +static void dns_server_verified(DnsServer *s, DnsServerFeatureLevel level) { + assert(s); + + if (s->verified_feature_level > level) + return; + + if (s->verified_feature_level != level) { + log_debug("Verified we get a response at feature level %s from DNS server %s.", + dns_server_feature_level_to_string(level), + strna(dns_server_string_full(s))); + s->verified_feature_level = level; + } + + assert_se(sd_event_now(s->manager->event, CLOCK_BOOTTIME, &s->verified_usec) >= 0); +} + +static void dns_server_reset_counters(DnsServer *s) { + assert(s); + + s->n_failed_udp = 0; + s->n_failed_tcp = 0; + s->n_failed_tls = 0; + s->packet_truncated = false; + s->packet_invalid = false; + s->verified_usec = 0; + + /* Note that we do not reset s->packet_bad_opt and s->packet_rrsig_missing here. We reset them only when the + * grace period ends, but not when lowering the possible feature level, as a lower level feature level should + * not make RRSIGs appear or OPT appear, but rather make them disappear. If the reappear anyway, then that's + * indication for a differently broken OPT/RRSIG implementation, and we really don't want to support that + * either. 
+ * + * This is particularly important to deal with certain Belkin routers which break OPT for certain lookups (A), + * but pass traffic through for others (AAAA). If we detect the broken behaviour on one lookup we should not + * re-enable it for another, because we cannot validate things anyway, given that the RRSIG/OPT data will be + * incomplete. */ +} + +void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t fragsize) { + assert(s); + + if (protocol == IPPROTO_UDP) { + if (s->possible_feature_level == level) + s->n_failed_udp = 0; + } else if (protocol == IPPROTO_TCP) { + if (DNS_SERVER_FEATURE_LEVEL_IS_TLS(level)) { + if (s->possible_feature_level == level) + s->n_failed_tls = 0; + } else { + if (s->possible_feature_level == level) + s->n_failed_tcp = 0; + + /* Successful TCP connections are only useful to verify the TCP feature level. */ + level = DNS_SERVER_FEATURE_LEVEL_TCP; + } + } + + /* If the RRSIG data is missing, then we can only validate EDNS0 at max */ + if (s->packet_rrsig_missing && level >= DNS_SERVER_FEATURE_LEVEL_DO) + level = DNS_SERVER_FEATURE_LEVEL_IS_TLS(level) ? DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN : DNS_SERVER_FEATURE_LEVEL_EDNS0; + + /* If the OPT RR got lost, then we can only validate UDP at max */ + if (s->packet_bad_opt && level >= DNS_SERVER_FEATURE_LEVEL_EDNS0) + level = DNS_SERVER_FEATURE_LEVEL_EDNS0 - 1; + + dns_server_verified(s, level); + + /* Remember the size of the largest UDP packet fragment we received from a server, we know that we + * can always announce support for packets with at least this size. 
*/ + if (protocol == IPPROTO_UDP && s->received_udp_fragment_max < fragsize) + s->received_udp_fragment_max = fragsize; +} + +void dns_server_packet_lost(DnsServer *s, int protocol, DnsServerFeatureLevel level) { + assert(s); + assert(s->manager); + + if (s->possible_feature_level != level) + return; + + if (protocol == IPPROTO_UDP) + s->n_failed_udp++; + else if (protocol == IPPROTO_TCP) { + if (DNS_SERVER_FEATURE_LEVEL_IS_TLS(level)) + s->n_failed_tls++; + else + s->n_failed_tcp++; + } +} + +void dns_server_packet_truncated(DnsServer *s, DnsServerFeatureLevel level) { + assert(s); + + /* Invoked whenever we get a packet with TC bit set. */ + + if (s->possible_feature_level != level) + return; + + s->packet_truncated = true; +} + +void dns_server_packet_rrsig_missing(DnsServer *s, DnsServerFeatureLevel level) { + assert(s); + + if (level < DNS_SERVER_FEATURE_LEVEL_DO) + return; + + /* If the RRSIG RRs are missing, we have to downgrade what we previously verified */ + if (s->verified_feature_level >= DNS_SERVER_FEATURE_LEVEL_DO) + s->verified_feature_level = DNS_SERVER_FEATURE_LEVEL_IS_TLS(level) ? DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN : DNS_SERVER_FEATURE_LEVEL_EDNS0; + + s->packet_rrsig_missing = true; +} + +void dns_server_packet_bad_opt(DnsServer *s, DnsServerFeatureLevel level) { + assert(s); + + if (level < DNS_SERVER_FEATURE_LEVEL_EDNS0) + return; + + /* If the OPT RR got lost, we have to downgrade what we previously verified */ + if (s->verified_feature_level >= DNS_SERVER_FEATURE_LEVEL_EDNS0) + s->verified_feature_level = DNS_SERVER_FEATURE_LEVEL_EDNS0-1; + + s->packet_bad_opt = true; +} + +void dns_server_packet_rcode_downgrade(DnsServer *s, DnsServerFeatureLevel level) { + assert(s); + + /* Invoked whenever we got a FORMERR, SERVFAIL or NOTIMP rcode from a server and downgrading the feature level + * for the transaction made it go away. In this case we immediately downgrade to the feature level that made + * things work. 
/* Re-evaluates which feature level shall be used when talking to this server, stores the result in
 * s->possible_feature_level and returns it.
 *
 * The function proceeds as follows:
 *
 *   1. The best level we would like to use is derived from the DNSSEC and DNS-over-TLS modes.
 *   2. If the current level is above that, it is clamped down.
 *   3. If the current level is below the best one and the grace period has expired, we go back to the
 *      best level, reset the counters and the bad-OPT/missing-RRSIG flags, and flush the cache.
 *   4. Otherwise, if the current level is not above the verified level, the verified level is used.
 *   5. Otherwise the failure counters and flags are checked in a fixed order, and the first rule that
 *      matches adjusts the level. If the level changed, the counters are reset and the change is logged.
 */
DnsServerFeatureLevel dns_server_possible_feature_level(DnsServer *s) {
        DnsServerFeatureLevel best;

        assert(s);

        /* Determine the best feature level we care about. If DNSSEC mode is off there's no point in using anything
         * better than EDNS0, hence don't even try. */
        if (dns_server_get_dnssec_mode(s) != DNSSEC_NO)
                best = dns_server_get_dns_over_tls_mode(s) == DNS_OVER_TLS_NO ?
                        DNS_SERVER_FEATURE_LEVEL_DO :
                        DNS_SERVER_FEATURE_LEVEL_TLS_DO;
        else
                best = dns_server_get_dns_over_tls_mode(s) == DNS_OVER_TLS_NO ?
                        DNS_SERVER_FEATURE_LEVEL_EDNS0 :
                        DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN;

        /* Clamp the feature level to the highest level we care about. The DNSSEC mode might have changed since the
         * last time, hence let's downgrade if we are still at a higher level. */
        if (s->possible_feature_level > best)
                s->possible_feature_level = best;

        if (s->possible_feature_level < best && dns_server_grace_period_expired(s)) {

                /* The grace period is over: give the full feature set another chance. */
                s->possible_feature_level = best;

                dns_server_reset_counters(s);

                /* These two flags are deliberately not touched by dns_server_reset_counters(), they are only
                 * cleared here, when the grace period ends. */
                s->packet_bad_opt = false;
                s->packet_rrsig_missing = false;

                log_info("Grace period over, resuming full feature set (%s) for DNS server %s.",
                         dns_server_feature_level_to_string(s->possible_feature_level),
                         strna(dns_server_string_full(s)));

                dns_server_flush_cache(s);

        } else if (s->possible_feature_level <= s->verified_feature_level)
                /* We already verified a level at least as good as the current one, stick to it. */
                s->possible_feature_level = s->verified_feature_level;
        else {
                /* Remember the level we started with, so that we can detect below whether it changed. */
                DnsServerFeatureLevel p = s->possible_feature_level;
                int log_level = LOG_WARNING;

                if (s->n_failed_tcp >= DNS_SERVER_FEATURE_RETRY_ATTEMPTS &&
                    s->possible_feature_level == DNS_SERVER_FEATURE_LEVEL_TCP) {

                        /* We are at the TCP (lowest) level, and we tried a couple of TCP connections, and it didn't
                         * work. Upgrade back to UDP again. */
                        log_debug("Reached maximum number of failed TCP connection attempts, trying UDP again...");
                        s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_UDP;

                } else if (s->n_failed_tls > 0 &&
                           DNS_SERVER_FEATURE_LEVEL_IS_TLS(s->possible_feature_level) &&
                           dns_server_get_dns_over_tls_mode(s) != DNS_OVER_TLS_YES) {

                        /* We tried to connect using DNS-over-TLS, and it didn't work. Downgrade to plaintext UDP
                         * if we don't require DNS-over-TLS */

                        log_debug("Server doesn't support DNS-over-TLS, downgrading protocol...");
                        s->possible_feature_level--;

                } else if (s->packet_invalid &&
                           s->possible_feature_level > DNS_SERVER_FEATURE_LEVEL_UDP &&
                           s->possible_feature_level != DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN) {

                        /* Downgrade from DO to EDNS0 + from EDNS0 to UDP, from TLS+DO to plain TLS. Or in
                         * other words, if we receive a packet we cannot parse jump to the next lower feature
                         * level that actually has an influence on the packet layout (and not just the
                         * transport). */

                        log_debug("Got invalid packet from server, downgrading protocol...");
                        s->possible_feature_level =
                                s->possible_feature_level == DNS_SERVER_FEATURE_LEVEL_TLS_DO ? DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN :
                                DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC(s->possible_feature_level) ? DNS_SERVER_FEATURE_LEVEL_EDNS0 :
                                DNS_SERVER_FEATURE_LEVEL_UDP;

                } else if (s->packet_bad_opt &&
                           DNS_SERVER_FEATURE_LEVEL_IS_EDNS0(s->possible_feature_level) &&
                           dns_server_get_dnssec_mode(s) != DNSSEC_YES &&
                           dns_server_get_dns_over_tls_mode(s) != DNS_OVER_TLS_YES) {

                        /* A reply to one of our EDNS0 queries didn't carry a valid OPT RR, then downgrade to
                         * below EDNS0 levels. After all, some servers generate different responses with and
                         * without OPT RR in the request. Example:
                         *
                         * https://open.nlnetlabs.nl/pipermail/dnssec-trigger/2014-November/000376.html
                         *
                         * If we are in strict DNSSEC or DoT mode, we don't do this kind of downgrade
                         * however, as both modes imply EDNS0 to work (DNSSEC strictly requires it, and DoT
                         * only in our implementation). */

                        log_debug("Server doesn't support EDNS(0) properly, downgrading feature level...");
                        s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_UDP;

                        /* Users often don't control the DNS server they use so let's not complain too loudly
                         * when we can't use EDNS because the DNS server doesn't support it. */
                        log_level = LOG_NOTICE;

                } else if (s->packet_do_off &&
                           DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC(s->possible_feature_level) &&
                           dns_server_get_dnssec_mode(s) != DNSSEC_YES) {

                        /* The server didn't copy the DO bit from request to response, thus DNSSEC is not
                         * correctly implemented, let's downgrade if that's allowed. */

                        log_debug("Detected server didn't copy DO flag from request to response, downgrading feature level...");
                        s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_IS_TLS(s->possible_feature_level) ? DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN :
                                                                                                                 DNS_SERVER_FEATURE_LEVEL_EDNS0;

                } else if (s->packet_rrsig_missing &&
                           DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC(s->possible_feature_level) &&
                           dns_server_get_dnssec_mode(s) != DNSSEC_YES) {

                        /* RRSIG data was missing on an EDNS0 packet with DO bit set. This means the server
                         * doesn't augment responses with DNSSEC RRs. If so, let's better not ask the server
                         * for it anymore, after all some servers generate different replies depending if an
                         * OPT RR is in the query or not. If we are in strict DNSSEC mode, don't allow such
                         * downgrades however, since a DNSSEC feature level is a requirement for strict
                         * DNSSEC mode. */

                        log_debug("Detected server responses lack RRSIG records, downgrading feature level...");
                        s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_IS_TLS(s->possible_feature_level) ? DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN :
                                                                                                                 DNS_SERVER_FEATURE_LEVEL_EDNS0;

                } else if (s->n_failed_udp >= DNS_SERVER_FEATURE_RETRY_ATTEMPTS &&
                           DNS_SERVER_FEATURE_LEVEL_IS_UDP(s->possible_feature_level) &&
                           ((s->possible_feature_level != DNS_SERVER_FEATURE_LEVEL_DO) || dns_server_get_dnssec_mode(s) != DNSSEC_YES)) {

                        /* We lost too many UDP packets in a row, and are on a UDP feature level. If the
                         * packets are lost, maybe the server cannot parse them, hence downgrading sounds
                         * like a good idea. We might downgrade all the way down to TCP this way.
                         *
                         * If strict DNSSEC mode is used we won't downgrade below DO level however, as packet loss
                         * might have many reasons, a broken DNSSEC implementation being only one reason. And if the
                         * user is strict on DNSSEC, then let's assume that DNSSEC is not the fault here. */

                        log_debug("Lost too many UDP packets, downgrading feature level...");
                        if (s->possible_feature_level == DNS_SERVER_FEATURE_LEVEL_DO) /* skip over TLS_PLAIN */
                                s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_EDNS0;
                        else
                                s->possible_feature_level--;

                } else if (s->n_failed_tcp >= DNS_SERVER_FEATURE_RETRY_ATTEMPTS &&
                           s->packet_truncated &&
                           s->possible_feature_level > DNS_SERVER_FEATURE_LEVEL_UDP &&
                           DNS_SERVER_FEATURE_LEVEL_IS_UDP(s->possible_feature_level) &&
                           (!DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC(s->possible_feature_level) || dns_server_get_dnssec_mode(s) != DNSSEC_YES)) {

                        /* We got too many TCP connection failures in a row, we had at least one truncated
                         * packet, and are on feature level above UDP. By downgrading things and getting rid
                         * of DNSSEC or EDNS0 data we hope to make the packet smaller, so that it still
                         * works via UDP given that TCP appears not to be a fallback. Note that if we are
                         * already at the lowest UDP level, we don't go further down, since that's TCP, and
                         * TCP failed too often after all. */

                        log_debug("Got too many failed TCP connection failures and truncated UDP packets, downgrading feature level...");

                        if (DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC(s->possible_feature_level))
                                s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_EDNS0; /* Go DNSSEC → EDNS0 */
                        else
                                s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_UDP; /* Go EDNS0 → UDP */
                }

                if (p != s->possible_feature_level) {

                        /* We changed the feature level, reset the counting */
                        dns_server_reset_counters(s);

                        log_full(log_level, "Using degraded feature set %s instead of %s for DNS server %s.",
                                 dns_server_feature_level_to_string(s->possible_feature_level),
                                 dns_server_feature_level_to_string(p), strna(dns_server_string_full(s)));
                }
        }

        return s->possible_feature_level;
}
No one appears + * to ever use such large sized on the Internet IRL, hence let's not either. */ + packet_size = MIN(packet_size, 4096U); + } + + /* Strictly speaking we quite possibly can receive larger datagrams than the MTU (since the + * MTU is for egress, not for ingress), but more often than not the value is symmetric, and + * we want something that does the right thing in the majority of cases, and not just in the + * theoretical edge case. */ + + /* Safety clamp, never advertise less than 512 or more than 65535 */ + packet_size = CLAMP(packet_size, + DNS_PACKET_UNICAST_SIZE_MAX, + DNS_PACKET_SIZE_MAX); + + log_debug("Announcing packet size %zu in egress EDNS(0) packet.", packet_size); + + return dns_packet_append_opt(packet, packet_size, edns_do, /* include_rfc6975 = */ true, NULL, 0, NULL); +} + +int dns_server_ifindex(const DnsServer *s) { + assert(s); + + /* For loopback addresses, go via the loopback interface, regardless which interface this is linked + * to. */ + if (in_addr_is_localhost(s->family, &s->address)) + return LOOPBACK_IFINDEX; + + /* The link ifindex always takes precedence */ + if (s->link) + return s->link->ifindex; + + if (s->ifindex > 0) + return s->ifindex; + + return 0; +} + +uint16_t dns_server_port(const DnsServer *s) { + assert(s); + + if (s->port > 0) + return s->port; + + return 53; +} + +const char *dns_server_string(DnsServer *server) { + assert(server); + + if (!server->server_string) + (void) in_addr_ifindex_to_string(server->family, &server->address, dns_server_ifindex(server), &server->server_string); + + return server->server_string; +} + +const char *dns_server_string_full(DnsServer *server) { + assert(server); + + if (!server->server_string_full) + (void) in_addr_port_ifindex_name_to_string( + server->family, + &server->address, + server->port, + dns_server_ifindex(server), + server->server_name, + &server->server_string_full); + + return server->server_string_full; +} + +bool dns_server_dnssec_supported(DnsServer 
*server) { + assert(server); + + /* Returns whether the server supports DNSSEC according to what we know about it */ + + if (dns_server_get_dnssec_mode(server) == DNSSEC_YES) /* If strict DNSSEC mode is enabled, always assume DNSSEC mode is supported. */ + return true; + + if (!DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC(server->possible_feature_level)) + return false; + + if (server->packet_bad_opt) + return false; + + if (server->packet_rrsig_missing) + return false; + + if (server->packet_do_off) + return false; + + /* DNSSEC servers need to support TCP properly (see RFC5966), if they don't, we assume DNSSEC is borked too */ + if (server->n_failed_tcp >= DNS_SERVER_FEATURE_RETRY_ATTEMPTS) + return false; + + return true; +} + +void dns_server_warn_downgrade(DnsServer *server) { + assert(server); + + if (server->warned_downgrade) + return; + + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_DNSSEC_DOWNGRADE_STR, + LOG_MESSAGE("Server %s does not support DNSSEC, downgrading to non-DNSSEC mode.", + strna(dns_server_string_full(server))), + "DNS_SERVER=%s", strna(dns_server_string_full(server)), + "DNS_SERVER_FEATURE_LEVEL=%s", dns_server_feature_level_to_string(server->possible_feature_level)); + + server->warned_downgrade = true; +} + +size_t dns_server_get_mtu(DnsServer *s) { + assert(s); + + if (s->link && s->link->mtu != 0) + return s->link->mtu; + + return manager_find_mtu(s->manager); +} + +static void dns_server_hash_func(const DnsServer *s, struct siphash *state) { + assert(s); + + siphash24_compress(&s->family, sizeof(s->family), state); + siphash24_compress(&s->address, FAMILY_ADDRESS_SIZE(s->family), state); + siphash24_compress(&s->port, sizeof(s->port), state); + siphash24_compress(&s->ifindex, sizeof(s->ifindex), state); + siphash24_compress_string(s->server_name, state); +} + +static int dns_server_compare_func(const DnsServer *x, const DnsServer *y) { + int r; + + r = CMP(x->family, y->family); + if (r != 0) + return r; + + r = memcmp(&x->address, 
&y->address, FAMILY_ADDRESS_SIZE(x->family)); + if (r != 0) + return r; + + r = CMP(x->port, y->port); + if (r != 0) + return r; + + r = CMP(x->ifindex, y->ifindex); + if (r != 0) + return r; + + return streq_ptr(x->server_name, y->server_name); +} + +DEFINE_HASH_OPS(dns_server_hash_ops, DnsServer, dns_server_hash_func, dns_server_compare_func); + +void dns_server_unlink_all(DnsServer *first) { + DnsServer *next; + + if (!first) + return; + + next = first->servers_next; + dns_server_unlink(first); + + dns_server_unlink_all(next); +} + +bool dns_server_unlink_marked(DnsServer *server) { + bool changed = false; + + while (server) { + DnsServer *next; + + next = server->servers_next; + + if (server->marked) { + dns_server_unlink(server); + changed = true; + } + + server = next; + } + + return changed; +} + +void dns_server_mark_all(DnsServer *server) { + while (server) { + server->marked = true; + server = server->servers_next; + } +} + +DnsServer *dns_server_find(DnsServer *first, int family, const union in_addr_union *in_addr, uint16_t port, int ifindex, const char *name) { + LIST_FOREACH(servers, s, first) + if (s->family == family && + in_addr_equal(family, &s->address, in_addr) > 0 && + s->port == port && + s->ifindex == ifindex && + streq_ptr(s->server_name, name)) + return s; + + return NULL; +} + +DnsServer *manager_get_first_dns_server(Manager *m, DnsServerType t) { + assert(m); + + switch (t) { + + case DNS_SERVER_SYSTEM: + return m->dns_servers; + + case DNS_SERVER_FALLBACK: + return m->fallback_dns_servers; + + default: + return NULL; + } +} + +DnsServer *manager_set_dns_server(Manager *m, DnsServer *s) { + assert(m); + + if (m->current_dns_server == s) + return s; + + /* Let's log about the server switch, at debug level. 
Except if we switch from a non-fallback server + * to a fallback server or back, since that is noteworthy and possibly a configuration issue */ + if (s) + log_full((s->type == DNS_SERVER_FALLBACK) != (m->current_dns_server && m->current_dns_server->type == DNS_SERVER_FALLBACK) ? LOG_NOTICE : LOG_DEBUG, + "Switching to %s DNS server %s.", dns_server_type_to_string(s->type), strna(dns_server_string_full(s))); + + dns_server_unref(m->current_dns_server); + m->current_dns_server = dns_server_ref(s); + + if (m->unicast_scope) + dns_cache_flush(&m->unicast_scope->cache); + + (void) manager_send_changed(m, "CurrentDNSServer"); + + return s; +} + +DnsServer *manager_get_dns_server(Manager *m) { + Link *l; + assert(m); + + /* Try to read updates resolv.conf */ + manager_read_resolv_conf(m); + + /* If no DNS server was chosen so far, pick the first one */ + if (!m->current_dns_server || + /* In case m->current_dns_server != m->dns_servers */ + manager_server_is_stub(m, m->current_dns_server)) + manager_set_dns_server(m, m->dns_servers); + + while (m->current_dns_server && + manager_server_is_stub(m, m->current_dns_server)) { + manager_next_dns_server(m, NULL); + if (m->current_dns_server == m->dns_servers) + manager_set_dns_server(m, NULL); + } + + if (!m->current_dns_server) { + bool found = false; + + /* No DNS servers configured, let's see if there are + * any on any links. 
If not, we use the fallback + * servers */ + + HASHMAP_FOREACH(l, m->links) + if (l->dns_servers) { + found = true; + break; + } + + if (!found) + manager_set_dns_server(m, m->fallback_dns_servers); + } + + return m->current_dns_server; +} + +void manager_next_dns_server(Manager *m, DnsServer *if_current) { + assert(m); + + /* If the DNS server is already a different one than the one specified in 'if_current' don't do anything */ + if (if_current && m->current_dns_server != if_current) + return; + + /* If there's currently no DNS server set, then the next manager_get_dns_server() will find one */ + if (!m->current_dns_server) + return; + + /* Change to the next one, but make sure to follow the linked list only if the server is still + * linked. */ + if (m->current_dns_server->linked && m->current_dns_server->servers_next) { + manager_set_dns_server(m, m->current_dns_server->servers_next); + return; + } + + /* If there was no next one, then start from the beginning of the list */ + if (m->current_dns_server->type == DNS_SERVER_FALLBACK) + manager_set_dns_server(m, m->fallback_dns_servers); + else + manager_set_dns_server(m, m->dns_servers); +} + +DnssecMode dns_server_get_dnssec_mode(DnsServer *s) { + assert(s); + + if (s->link) + return link_get_dnssec_mode(s->link); + + return manager_get_dnssec_mode(s->manager); +} + +DnsOverTlsMode dns_server_get_dns_over_tls_mode(DnsServer *s) { + assert(s); + + if (s->link) + return link_get_dns_over_tls_mode(s->link); + + return manager_get_dns_over_tls_mode(s->manager); +} + +void dns_server_flush_cache(DnsServer *s) { + DnsServer *current; + DnsScope *scope; + + assert(s); + + /* Flush the cache of the scope this server belongs to */ + + current = s->link ? s->link->current_dns_server : s->manager->current_dns_server; + if (current != s) + return; + + scope = s->link ? 
s->link->unicast_scope : s->manager->unicast_scope; + if (!scope) + return; + + dns_cache_flush(&scope->cache); +} + +void dns_server_reset_features(DnsServer *s) { + assert(s); + + s->verified_feature_level = _DNS_SERVER_FEATURE_LEVEL_INVALID; + s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_BEST; + + s->received_udp_fragment_max = DNS_PACKET_UNICAST_SIZE_MAX; + + s->packet_bad_opt = false; + s->packet_rrsig_missing = false; + s->packet_do_off = false; + + s->features_grace_period_usec = DNS_SERVER_FEATURE_GRACE_PERIOD_MIN_USEC; + + s->warned_downgrade = false; + + dns_server_reset_counters(s); + + /* Let's close the default stream, so that we reprobe with the new features */ + dns_server_unref_stream(s); +} + +void dns_server_reset_features_all(DnsServer *s) { + LIST_FOREACH(servers, i, s) + dns_server_reset_features(i); +} + +void dns_server_dump(DnsServer *s, FILE *f) { + assert(s); + + if (!f) + f = stdout; + + fputs("[Server ", f); + fputs(strna(dns_server_string_full(s)), f); + fputs(" type=", f); + fputs(dns_server_type_to_string(s->type), f); + + if (s->type == DNS_SERVER_LINK) { + assert(s->link); + + fputs(" interface=", f); + fputs(s->link->ifname, f); + } + + fputs("]\n", f); + + fputs("\tVerified feature level: ", f); + fputs(strna(dns_server_feature_level_to_string(s->verified_feature_level)), f); + fputc('\n', f); + + fputs("\tPossible feature level: ", f); + fputs(strna(dns_server_feature_level_to_string(s->possible_feature_level)), f); + fputc('\n', f); + + fputs("\tDNSSEC Mode: ", f); + fputs(strna(dnssec_mode_to_string(dns_server_get_dnssec_mode(s))), f); + fputc('\n', f); + + fputs("\tCan do DNSSEC: ", f); + fputs(yes_no(dns_server_dnssec_supported(s)), f); + fputc('\n', f); + + fprintf(f, + "\tMaximum UDP fragment size received: %zu\n" + "\tFailed UDP attempts: %u\n" + "\tFailed TCP attempts: %u\n" + "\tSeen truncated packet: %s\n" + "\tSeen OPT RR getting lost: %s\n" + "\tSeen RRSIG RR missing: %s\n" + "\tSeen invalid packet: %s\n" + 
"\tServer dropped DO flag: %s\n", + s->received_udp_fragment_max, + s->n_failed_udp, + s->n_failed_tcp, + yes_no(s->packet_truncated), + yes_no(s->packet_bad_opt), + yes_no(s->packet_rrsig_missing), + yes_no(s->packet_invalid), + yes_no(s->packet_do_off)); +} + +void dns_server_unref_stream(DnsServer *s) { + DnsStream *ref; + + assert(s); + + /* Detaches the default stream of this server. Some special care needs to be taken here, as that stream and + * this server reference each other. First, take the stream out of the server. It's destructor will check if it + * is registered with us, hence let's invalidate this separately, so that it is already unregistered. */ + ref = TAKE_PTR(s->stream); + + /* And then, unref it */ + dns_stream_unref(ref); +} + +DnsScope *dns_server_scope(DnsServer *s) { + assert(s); + assert((s->type == DNS_SERVER_LINK) == !!s->link); + + if (s->link) + return s->link->unicast_scope; + + return s->manager->unicast_scope; +} + +static const char* const dns_server_type_table[_DNS_SERVER_TYPE_MAX] = { + [DNS_SERVER_SYSTEM] = "system", + [DNS_SERVER_FALLBACK] = "fallback", + [DNS_SERVER_LINK] = "link", +}; +DEFINE_STRING_TABLE_LOOKUP(dns_server_type, DnsServerType); + +static const char* const dns_server_feature_level_table[_DNS_SERVER_FEATURE_LEVEL_MAX] = { + [DNS_SERVER_FEATURE_LEVEL_TCP] = "TCP", + [DNS_SERVER_FEATURE_LEVEL_UDP] = "UDP", + [DNS_SERVER_FEATURE_LEVEL_EDNS0] = "UDP+EDNS0", + [DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN] = "TLS+EDNS0", + [DNS_SERVER_FEATURE_LEVEL_DO] = "UDP+EDNS0+DO", + [DNS_SERVER_FEATURE_LEVEL_TLS_DO] = "TLS+EDNS0+DO", +}; +DEFINE_STRING_TABLE_LOOKUP(dns_server_feature_level, DnsServerFeatureLevel); + +int dns_server_dump_state_to_json(DnsServer *server, JsonVariant **ret) { + + assert(server); + assert(ret); + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("Server", strna(dns_server_string_full(server))), + JSON_BUILD_PAIR_STRING("Type", strna(dns_server_type_to_string(server->type))), + 
JSON_BUILD_PAIR_CONDITION(server->type == DNS_SERVER_LINK, "Interface", JSON_BUILD_STRING(server->link ? server->link->ifname : NULL)), + JSON_BUILD_PAIR_CONDITION(server->type == DNS_SERVER_LINK, "InterfaceIndex", JSON_BUILD_UNSIGNED(server->link ? server->link->ifindex : 0)), + JSON_BUILD_PAIR_STRING("VerifiedFeatureLevel", strna(dns_server_feature_level_to_string(server->verified_feature_level))), + JSON_BUILD_PAIR_STRING("PossibleFeatureLevel", strna(dns_server_feature_level_to_string(server->possible_feature_level))), + JSON_BUILD_PAIR_STRING("DNSSECMode", strna(dnssec_mode_to_string(dns_server_get_dnssec_mode(server)))), + JSON_BUILD_PAIR_BOOLEAN("DNSSECSupported", dns_server_dnssec_supported(server)), + JSON_BUILD_PAIR_UNSIGNED("ReceivedUDPFragmentMax", server->received_udp_fragment_max), + JSON_BUILD_PAIR_UNSIGNED("FailedUDPAttempts", server->n_failed_udp), + JSON_BUILD_PAIR_UNSIGNED("FailedTCPAttempts", server->n_failed_tcp), + JSON_BUILD_PAIR_BOOLEAN("PacketTruncated", server->packet_truncated), + JSON_BUILD_PAIR_BOOLEAN("PacketBadOpt", server->packet_bad_opt), + JSON_BUILD_PAIR_BOOLEAN("PacketRRSIGMissing", server->packet_rrsig_missing), + JSON_BUILD_PAIR_BOOLEAN("PacketInvalid", server->packet_invalid), + JSON_BUILD_PAIR_BOOLEAN("PacketDoOff", server->packet_do_off))); +} diff --git a/src/resolve/resolved-dns-server.h b/src/resolve/resolved-dns-server.h new file mode 100644 index 0000000..ed6560f --- /dev/null +++ b/src/resolve/resolved-dns-server.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "in-addr-util.h" +#include "json.h" +#include "list.h" +#include "resolve-util.h" +#include "time-util.h" + +typedef struct DnsScope DnsScope; +typedef struct DnsServer DnsServer; +typedef struct DnsStream DnsStream; +typedef struct DnsPacket DnsPacket; +typedef struct Link Link; +typedef struct Manager Manager; + +#include "resolved-dnstls.h" + +typedef enum DnsServerType { + DNS_SERVER_SYSTEM, + 
/* The feature levels that can be used when talking to a DNS server. The numeric order of the
 * entries is significant: DNS_SERVER_FEATURE_LEVEL_IS_EDNS0() and
 * DNS_SERVER_FEATURE_LEVEL_IS_DNSSEC() are ">=" comparisons against entries of this enum, and
 * DNS_SERVER_FEATURE_LEVEL_WORST/DNS_SERVER_FEATURE_LEVEL_BEST refer to the first and the last
 * regular entry. The quoted strings below are the names used by the string table lookup. */
typedef enum DnsServerFeatureLevel {
        DNS_SERVER_FEATURE_LEVEL_TCP,           /* "TCP" */
        DNS_SERVER_FEATURE_LEVEL_UDP,           /* "UDP" */
        DNS_SERVER_FEATURE_LEVEL_EDNS0,         /* "UDP+EDNS0" */
        DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN,     /* "TLS+EDNS0" */
        DNS_SERVER_FEATURE_LEVEL_DO,            /* "UDP+EDNS0+DO" */
        DNS_SERVER_FEATURE_LEVEL_TLS_DO,        /* "TLS+EDNS0+DO" */
        _DNS_SERVER_FEATURE_LEVEL_MAX,          /* Number of regular entries, sizes the string table */
        _DNS_SERVER_FEATURE_LEVEL_INVALID = -EINVAL,
} DnsServerFeatureLevel;
*/ + DnsStream *stream; + +#if ENABLE_DNS_OVER_TLS + DnsTlsServerData dnstls_data; +#endif + + DnsServerFeatureLevel verified_feature_level; + DnsServerFeatureLevel possible_feature_level; + + size_t received_udp_fragment_max; /* largest packet or fragment (without IP/UDP header) we saw so far */ + + unsigned n_failed_udp; + unsigned n_failed_tcp; + unsigned n_failed_tls; + + bool packet_truncated:1; /* Set when TC bit was set on reply */ + bool packet_bad_opt:1; /* Set when OPT was missing or otherwise bad on reply */ + bool packet_rrsig_missing:1; /* Set when RRSIG was missing */ + bool packet_invalid:1; /* Set when we failed to parse a reply */ + bool packet_do_off:1; /* Set when the server didn't copy DNSSEC DO flag from request to response */ + bool packet_fragmented:1; /* Set when we ever saw a fragmented packet */ + + usec_t verified_usec; + usec_t features_grace_period_usec; + + /* Whether we already warned about downgrading to non-DNSSEC mode for this server */ + bool warned_downgrade:1; + + /* Used when GC'ing old DNS servers when configuration changes. 
*/ + bool marked:1; + + /* If linked is set, then this server appears in the servers linked list */ + bool linked:1; + LIST_FIELDS(DnsServer, servers); +}; + +int dns_server_new( + Manager *m, + DnsServer **ret, + DnsServerType type, + Link *link, + int family, + const union in_addr_union *address, + uint16_t port, + int ifindex, + const char *server_string); + +DnsServer* dns_server_ref(DnsServer *s); +DnsServer* dns_server_unref(DnsServer *s); + +void dns_server_unlink(DnsServer *s); +void dns_server_move_back_and_unmark(DnsServer *s); + +void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t fragsize); +void dns_server_packet_lost(DnsServer *s, int protocol, DnsServerFeatureLevel level); +void dns_server_packet_truncated(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_rrsig_missing(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_bad_opt(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_rcode_downgrade(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_invalid(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_do_off(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_udp_fragmented(DnsServer *s, size_t fragsize); + +DnsServerFeatureLevel dns_server_possible_feature_level(DnsServer *s); + +int dns_server_adjust_opt(DnsServer *server, DnsPacket *packet, DnsServerFeatureLevel level); + +const char *dns_server_string(DnsServer *server); +const char *dns_server_string_full(DnsServer *server); +int dns_server_ifindex(const DnsServer *s); +uint16_t dns_server_port(const DnsServer *s); + +bool dns_server_dnssec_supported(DnsServer *server); + +void dns_server_warn_downgrade(DnsServer *server); + +DnsServer *dns_server_find(DnsServer *first, int family, const union in_addr_union *in_addr, uint16_t port, int ifindex, const char *name); + +void dns_server_unlink_all(DnsServer *first); +bool dns_server_unlink_marked(DnsServer 
*first); +void dns_server_mark_all(DnsServer *first); + +DnsServer *manager_get_first_dns_server(Manager *m, DnsServerType t); + +DnsServer *manager_set_dns_server(Manager *m, DnsServer *s); +DnsServer *manager_get_dns_server(Manager *m); +void manager_next_dns_server(Manager *m, DnsServer *if_current); + +DnssecMode dns_server_get_dnssec_mode(DnsServer *s); +DnsOverTlsMode dns_server_get_dns_over_tls_mode(DnsServer *s); + +size_t dns_server_get_mtu(DnsServer *s); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsServer*, dns_server_unref); + +extern const struct hash_ops dns_server_hash_ops; + +void dns_server_flush_cache(DnsServer *s); + +void dns_server_reset_features(DnsServer *s); +void dns_server_reset_features_all(DnsServer *s); + +void dns_server_dump(DnsServer *s, FILE *f); + +void dns_server_unref_stream(DnsServer *s); + +DnsScope *dns_server_scope(DnsServer *s); + +int dns_server_dump_state_to_json(DnsServer *server, JsonVariant **ret); diff --git a/src/resolve/resolved-dns-stream.c b/src/resolve/resolved-dns-stream.c new file mode 100644 index 0000000..ddd1db5 --- /dev/null +++ b/src/resolve/resolved-dns-stream.c @@ -0,0 +1,595 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "iovec-util.h" +#include "macro.h" +#include "missing_network.h" +#include "resolved-dns-stream.h" +#include "resolved-manager.h" + +#define DNS_STREAMS_MAX 128 + +#define DNS_QUERIES_PER_STREAM 32 + +static void dns_stream_stop(DnsStream *s) { + assert(s); + + s->io_event_source = sd_event_source_disable_unref(s->io_event_source); + s->timeout_event_source = sd_event_source_disable_unref(s->timeout_event_source); + s->fd = safe_close(s->fd); + + /* Disconnect us from the server object if we are now not usable anymore */ + dns_stream_detach(s); +} + +static int dns_stream_update_io(DnsStream *s) { + uint32_t f = 0; + + assert(s); + + if (s->write_packet && s->n_written < sizeof(s->write_size) + 
s->write_packet->size) + f |= EPOLLOUT; + else if (!ordered_set_isempty(s->write_queue)) { + dns_packet_unref(s->write_packet); + s->write_packet = ordered_set_steal_first(s->write_queue); + s->write_size = htobe16(s->write_packet->size); + s->n_written = 0; + f |= EPOLLOUT; + } + + /* Let's read a packet if we haven't queued any yet. Except if we already hit a limit of parallel + * queries for this connection. */ + if ((!s->read_packet || s->n_read < sizeof(s->read_size) + s->read_packet->size) && + set_size(s->queries) < DNS_QUERIES_PER_STREAM) + f |= EPOLLIN; + + s->requested_events = f; + +#if ENABLE_DNS_OVER_TLS + /* For handshake and clean closing purposes, TLS can override requested events */ + if (s->dnstls_events != 0) + f = s->dnstls_events; +#endif + + return sd_event_source_set_io_events(s->io_event_source, f); +} + +static int dns_stream_complete(DnsStream *s, int error) { + _cleanup_(dns_stream_unrefp) _unused_ DnsStream *ref = dns_stream_ref(s); /* Protect stream while we process it */ + + assert(s); + assert(error >= 0); + + /* Error is > 0 when the connection failed for some reason in the network stack. It's == 0 if we sent + * and received exactly one packet each (in the LLMNR client case). 
*/ + +#if ENABLE_DNS_OVER_TLS + if (s->encrypted) { + int r; + + r = dnstls_stream_shutdown(s, error); + if (r != -EAGAIN) + dns_stream_stop(s); + } else +#endif + dns_stream_stop(s); + + dns_stream_detach(s); + + if (s->complete) + s->complete(s, error); + else /* the default action if no completion function is set is to close the stream */ + dns_stream_unref(s); + + return 0; +} + +static int dns_stream_identify(DnsStream *s) { + CMSG_BUFFER_TYPE(CMSG_SPACE(MAXSIZE(struct in_pktinfo, struct in6_pktinfo)) + + CMSG_SPACE(int) + /* for the TTL */ + + EXTRA_CMSG_SPACE /* kernel appears to require extra space */) control; + struct msghdr mh = {}; + struct cmsghdr *cmsg; + socklen_t sl; + int r; + + assert(s); + + if (s->identified) + return 0; + + /* Query the local side */ + s->local_salen = sizeof(s->local); + r = getsockname(s->fd, &s->local.sa, &s->local_salen); + if (r < 0) + return -errno; + if (s->local.sa.sa_family == AF_INET6 && s->ifindex <= 0) + s->ifindex = s->local.in6.sin6_scope_id; + + /* Query the remote side */ + s->peer_salen = sizeof(s->peer); + r = getpeername(s->fd, &s->peer.sa, &s->peer_salen); + if (r < 0) + return -errno; + if (s->peer.sa.sa_family == AF_INET6 && s->ifindex <= 0) + s->ifindex = s->peer.in6.sin6_scope_id; + + /* Check consistency */ + assert(s->peer.sa.sa_family == s->local.sa.sa_family); + assert(IN_SET(s->peer.sa.sa_family, AF_INET, AF_INET6)); + + /* Query connection meta information */ + sl = sizeof(control); + if (s->peer.sa.sa_family == AF_INET) { + r = getsockopt(s->fd, IPPROTO_IP, IP_PKTOPTIONS, &control, &sl); + if (r < 0) + return -errno; + } else if (s->peer.sa.sa_family == AF_INET6) { + + r = getsockopt(s->fd, IPPROTO_IPV6, IPV6_2292PKTOPTIONS, &control, &sl); + if (r < 0) + return -errno; + } else + return -EAFNOSUPPORT; + + mh.msg_control = &control; + mh.msg_controllen = sl; + + CMSG_FOREACH(cmsg, &mh) { + + if (cmsg->cmsg_level == IPPROTO_IPV6) { + assert(s->peer.sa.sa_family == AF_INET6); + + switch 
(cmsg->cmsg_type) { + + case IPV6_PKTINFO: { + struct in6_pktinfo *i = CMSG_TYPED_DATA(cmsg, struct in6_pktinfo); + + if (s->ifindex <= 0) + s->ifindex = i->ipi6_ifindex; + break; + } + + case IPV6_HOPLIMIT: + s->ttl = *CMSG_TYPED_DATA(cmsg, int); + break; + } + + } else if (cmsg->cmsg_level == IPPROTO_IP) { + assert(s->peer.sa.sa_family == AF_INET); + + switch (cmsg->cmsg_type) { + + case IP_PKTINFO: { + struct in_pktinfo *i = CMSG_TYPED_DATA(cmsg, struct in_pktinfo); + + if (s->ifindex <= 0) + s->ifindex = i->ipi_ifindex; + break; + } + + case IP_TTL: + s->ttl = *CMSG_TYPED_DATA(cmsg, int); + break; + } + } + } + + /* The Linux kernel sets the interface index to the loopback + * device if the connection came from the local host since it + * avoids the routing table in such a case. Let's unset the + * interface index in such a case. */ + if (s->ifindex == LOOPBACK_IFINDEX) + s->ifindex = 0; + + /* If we don't know the interface index still, we look for the + * first local interface with a matching address. Yuck! 
*/ + if (s->ifindex <= 0) + s->ifindex = manager_find_ifindex(s->manager, s->local.sa.sa_family, sockaddr_in_addr(&s->local.sa)); + + if (s->protocol == DNS_PROTOCOL_LLMNR && s->ifindex > 0) { + /* Make sure all packets for this connection are sent on the same interface */ + r = socket_set_unicast_if(s->fd, s->local.sa.sa_family, s->ifindex); + if (r < 0) + log_debug_errno(errno, "Failed to invoke IP_UNICAST_IF/IPV6_UNICAST_IF: %m"); + } + + s->identified = true; + + return 0; +} + +ssize_t dns_stream_writev(DnsStream *s, const struct iovec *iov, size_t iovcnt, int flags) { + ssize_t m; + + assert(s); + assert(iov); + +#if ENABLE_DNS_OVER_TLS + if (s->encrypted && !(flags & DNS_STREAM_WRITE_TLS_DATA)) + return dnstls_stream_writev(s, iov, iovcnt); +#endif + + if (s->tfo_salen > 0) { + struct msghdr hdr = { + .msg_iov = (struct iovec*) iov, + .msg_iovlen = iovcnt, + .msg_name = &s->tfo_address.sa, + .msg_namelen = s->tfo_salen + }; + + m = sendmsg(s->fd, &hdr, MSG_FASTOPEN); + if (m < 0) { + if (errno == EOPNOTSUPP) { + s->tfo_salen = 0; + if (connect(s->fd, &s->tfo_address.sa, s->tfo_salen) < 0) + return -errno; + + return -EAGAIN; + } + if (errno == EINPROGRESS) + return -EAGAIN; + + return -errno; + } else + s->tfo_salen = 0; /* connection is made */ + } else { + m = writev(s->fd, iov, iovcnt); + if (m < 0) + return -errno; + } + + return m; +} + +static ssize_t dns_stream_read(DnsStream *s, void *buf, size_t count) { + ssize_t ss; + +#if ENABLE_DNS_OVER_TLS + if (s->encrypted) + ss = dnstls_stream_read(s, buf, count); + else +#endif + { + ss = read(s->fd, buf, count); + if (ss < 0) + return -errno; + } + + return ss; +} + +static int on_stream_timeout(sd_event_source *es, usec_t usec, void *userdata) { + DnsStream *s = ASSERT_PTR(userdata); + + return dns_stream_complete(s, ETIMEDOUT); +} + +static DnsPacket *dns_stream_take_read_packet(DnsStream *s) { + assert(s); + + /* Note, dns_stream_update() should be called after this is called. 
When this is called, the + * stream may be already full and the EPOLLIN flag is dropped from the stream IO event source. + * Even this makes a room to read in the stream, this does not call dns_stream_update(), hence + * EPOLLIN flag is not set automatically. So, to read further packets from the stream, + * dns_stream_update() must be called explicitly. Currently, this is only called from + * on_stream_io(), and there dns_stream_update() is called. */ + + if (!s->read_packet) + return NULL; + + if (s->n_read < sizeof(s->read_size)) + return NULL; + + if (s->n_read < sizeof(s->read_size) + be16toh(s->read_size)) + return NULL; + + s->n_read = 0; + return TAKE_PTR(s->read_packet); +} + +static int on_stream_io(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + _cleanup_(dns_stream_unrefp) DnsStream *s = dns_stream_ref(userdata); /* Protect stream while we process it */ + bool progressed = false; + int r; + + assert(s); + +#if ENABLE_DNS_OVER_TLS + if (s->encrypted) { + r = dnstls_stream_on_io(s, revents); + if (r == DNSTLS_STREAM_CLOSED) + return 0; + if (r == -EAGAIN) + return dns_stream_update_io(s); + if (r < 0) + return dns_stream_complete(s, -r); + + r = dns_stream_update_io(s); + if (r < 0) + return r; + } +#endif + + /* only identify after connecting */ + if (s->tfo_salen == 0) { + r = dns_stream_identify(s); + if (r < 0) + return dns_stream_complete(s, -r); + } + + if ((revents & EPOLLOUT) && + s->write_packet && + s->n_written < sizeof(s->write_size) + s->write_packet->size) { + + struct iovec iov[] = { + IOVEC_MAKE(&s->write_size, sizeof(s->write_size)), + IOVEC_MAKE(DNS_PACKET_DATA(s->write_packet), s->write_packet->size), + }; + + iovec_increment(iov, ELEMENTSOF(iov), s->n_written); + + ssize_t ss = dns_stream_writev(s, iov, ELEMENTSOF(iov), 0); + if (ss < 0) { + if (!ERRNO_IS_TRANSIENT(ss)) + return dns_stream_complete(s, -ss); + } else { + progressed = true; + s->n_written += ss; + } + + /* Are we done? 
If so, disable the event source for EPOLLOUT */ + if (s->n_written >= sizeof(s->write_size) + s->write_packet->size) { + r = dns_stream_update_io(s); + if (r < 0) + return dns_stream_complete(s, -r); + } + } + + while ((revents & (EPOLLIN|EPOLLHUP|EPOLLRDHUP)) && + (!s->read_packet || + s->n_read < sizeof(s->read_size) + s->read_packet->size)) { + + if (s->n_read < sizeof(s->read_size)) { + ssize_t ss; + + ss = dns_stream_read(s, (uint8_t*) &s->read_size + s->n_read, sizeof(s->read_size) - s->n_read); + if (ss < 0) { + if (!ERRNO_IS_TRANSIENT(ss)) + return dns_stream_complete(s, -ss); + break; + } else if (ss == 0) + return dns_stream_complete(s, ECONNRESET); + else { + progressed = true; + s->n_read += ss; + } + } + + if (s->n_read >= sizeof(s->read_size)) { + + if (be16toh(s->read_size) < DNS_PACKET_HEADER_SIZE) + return dns_stream_complete(s, EBADMSG); + + if (s->n_read < sizeof(s->read_size) + be16toh(s->read_size)) { + ssize_t ss; + + if (!s->read_packet) { + r = dns_packet_new(&s->read_packet, s->protocol, be16toh(s->read_size), DNS_PACKET_SIZE_MAX); + if (r < 0) + return dns_stream_complete(s, -r); + + s->read_packet->size = be16toh(s->read_size); + s->read_packet->ipproto = IPPROTO_TCP; + s->read_packet->family = s->peer.sa.sa_family; + s->read_packet->ttl = s->ttl; + s->read_packet->ifindex = s->ifindex; + s->read_packet->timestamp = now(CLOCK_BOOTTIME); + + if (s->read_packet->family == AF_INET) { + s->read_packet->sender.in = s->peer.in.sin_addr; + s->read_packet->sender_port = be16toh(s->peer.in.sin_port); + s->read_packet->destination.in = s->local.in.sin_addr; + s->read_packet->destination_port = be16toh(s->local.in.sin_port); + } else { + assert(s->read_packet->family == AF_INET6); + s->read_packet->sender.in6 = s->peer.in6.sin6_addr; + s->read_packet->sender_port = be16toh(s->peer.in6.sin6_port); + s->read_packet->destination.in6 = s->local.in6.sin6_addr; + s->read_packet->destination_port = be16toh(s->local.in6.sin6_port); + + if 
(s->read_packet->ifindex == 0) + s->read_packet->ifindex = s->peer.in6.sin6_scope_id; + if (s->read_packet->ifindex == 0) + s->read_packet->ifindex = s->local.in6.sin6_scope_id; + } + } + + ss = dns_stream_read(s, + (uint8_t*) DNS_PACKET_DATA(s->read_packet) + s->n_read - sizeof(s->read_size), + sizeof(s->read_size) + be16toh(s->read_size) - s->n_read); + if (ss < 0) { + if (!ERRNO_IS_TRANSIENT(ss)) + return dns_stream_complete(s, -ss); + break; + } else if (ss == 0) + return dns_stream_complete(s, ECONNRESET); + else + s->n_read += ss; + } + + /* Are we done? If so, call the packet handler and re-enable EPOLLIN for the + * event source if necessary. */ + _cleanup_(dns_packet_unrefp) DnsPacket *p = dns_stream_take_read_packet(s); + if (p) { + assert(s->on_packet); + r = s->on_packet(s, p); + if (r < 0) + return r; + + r = dns_stream_update_io(s); + if (r < 0) + return dns_stream_complete(s, -r); + + s->packet_received = true; + + /* If we just disabled the read event, stop reading */ + if (!FLAGS_SET(s->requested_events, EPOLLIN)) + break; + } + } + } + + /* Complete the stream if finished reading and writing one packet, and there's nothing + * else left to write. 
*/ + if (s->type == DNS_STREAM_LLMNR_SEND && s->packet_received && + !FLAGS_SET(s->requested_events, EPOLLOUT)) + return dns_stream_complete(s, 0); + + /* If we did something, let's restart the timeout event source */ + if (progressed && s->timeout_event_source) { + r = sd_event_source_set_time_relative(s->timeout_event_source, DNS_STREAM_ESTABLISHED_TIMEOUT_USEC); + if (r < 0) + log_warning_errno(errno, "Couldn't restart TCP connection timeout, ignoring: %m"); + } + + return 0; +} + +static DnsStream *dns_stream_free(DnsStream *s) { + DnsPacket *p; + + assert(s); + + dns_stream_stop(s); + + if (s->manager) { + LIST_REMOVE(streams, s->manager->dns_streams, s); + s->manager->n_dns_streams[s->type]--; + } + +#if ENABLE_DNS_OVER_TLS + if (s->encrypted) + dnstls_stream_free(s); +#endif + + ORDERED_SET_FOREACH(p, s->write_queue) + dns_packet_unref(ordered_set_remove(s->write_queue, p)); + + dns_packet_unref(s->write_packet); + dns_packet_unref(s->read_packet); + dns_server_unref(s->server); + + ordered_set_free(s->write_queue); + + return mfree(s); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DnsStream, dns_stream, dns_stream_free); + +int dns_stream_new( + Manager *m, + DnsStream **ret, + DnsStreamType type, + DnsProtocol protocol, + int fd, + const union sockaddr_union *tfo_address, + int (on_packet)(DnsStream*, DnsPacket*), + int (complete)(DnsStream*, int), /* optional */ + usec_t connect_timeout_usec) { + + _cleanup_(dns_stream_unrefp) DnsStream *s = NULL; + int r; + + assert(m); + assert(ret); + assert(type >= 0); + assert(type < _DNS_STREAM_TYPE_MAX); + assert(protocol >= 0); + assert(protocol < _DNS_PROTOCOL_MAX); + assert(fd >= 0); + assert(on_packet); + + if (m->n_dns_streams[type] > DNS_STREAMS_MAX) + return -EBUSY; + + s = new(DnsStream, 1); + if (!s) + return -ENOMEM; + + *s = (DnsStream) { + .n_ref = 1, + .fd = -EBADF, + .protocol = protocol, + .type = type, + }; + + r = ordered_set_ensure_allocated(&s->write_queue, &dns_packet_hash_ops); + if (r < 0) + return r; + 
+ r = sd_event_add_io(m->event, &s->io_event_source, fd, EPOLLIN, on_stream_io, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->io_event_source, "dns-stream-io"); + + r = sd_event_add_time_relative( + m->event, + &s->timeout_event_source, + CLOCK_BOOTTIME, + connect_timeout_usec, 0, + on_stream_timeout, s); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s->timeout_event_source, "dns-stream-timeout"); + + LIST_PREPEND(streams, m->dns_streams, s); + m->n_dns_streams[type]++; + s->manager = m; + + s->fd = fd; + s->on_packet = on_packet; + s->complete = complete; + + if (tfo_address) { + s->tfo_address = *tfo_address; + s->tfo_salen = tfo_address->sa.sa_family == AF_INET6 ? sizeof(tfo_address->in6) : sizeof(tfo_address->in); + } + + *ret = TAKE_PTR(s); + + return 0; +} + +int dns_stream_write_packet(DnsStream *s, DnsPacket *p) { + int r; + + assert(s); + assert(p); + + r = ordered_set_put(s->write_queue, p); + if (r < 0) + return r; + + dns_packet_ref(p); + + return dns_stream_update_io(s); +} + +void dns_stream_detach(DnsStream *s) { + assert(s); + + if (!s->server) + return; + + if (s->server->stream != s) + return; + + dns_server_unref_stream(s->server); +} diff --git a/src/resolve/resolved-dns-stream.h b/src/resolve/resolved-dns-stream.h new file mode 100644 index 0000000..ba4a59e --- /dev/null +++ b/src/resolve/resolved-dns-stream.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" + +#include "ordered-set.h" +#include "socket-util.h" + +typedef struct DnsServer DnsServer; +typedef struct DnsStream DnsStream; +typedef struct DnsTransaction DnsTransaction; +typedef struct Manager Manager; +typedef struct DnsStubListenerExtra DnsStubListenerExtra; + +#include "resolved-dns-packet.h" +#include "resolved-dnstls.h" + +/* Various timeouts for establishing TCP connections. First the default time-out for that. 
*/ +#define DNS_STREAM_DEFAULT_TIMEOUT_USEC (10 * USEC_PER_SEC) + +/* In the DNS stub, be more friendly for incoming connections, than we are to ourselves for outgoing ones */ +#define DNS_STREAM_STUB_TIMEOUT_USEC (30 * USEC_PER_SEC) + +/* In opportunistic TLS mode, lower timeouts */ +#define DNS_STREAM_OPPORTUNISTIC_TLS_TIMEOUT_USEC (3 * USEC_PER_SEC) + +/* Once connections are established apply this timeout once nothing happens anymore */ +#define DNS_STREAM_ESTABLISHED_TIMEOUT_USEC (10 * USEC_PER_SEC) + +typedef enum DnsStreamType { + DNS_STREAM_LOOKUP, /* Outgoing connection to a classic DNS server */ + DNS_STREAM_LLMNR_SEND, /* Outgoing LLMNR TCP lookup */ + DNS_STREAM_LLMNR_RECV, /* Incoming LLMNR TCP lookup */ + DNS_STREAM_STUB, /* Incoming DNS stub connection */ + _DNS_STREAM_TYPE_MAX, + _DNS_STREAM_TYPE_INVALID = -EINVAL, +} DnsStreamType; + +#define DNS_STREAM_WRITE_TLS_DATA 1 + +/* Streams are used by three subsystems: + * + * 1. The normal transaction logic when doing a DNS or LLMNR lookup via TCP + * 2. The LLMNR logic when accepting a TCP-based lookup + * 3. The DNS stub logic when accepting a TCP-based lookup + */ + +struct DnsStream { + Manager *manager; + unsigned n_ref; + + DnsStreamType type; + DnsProtocol protocol; + + int fd; + union sockaddr_union peer; + socklen_t peer_salen; + union sockaddr_union local; + socklen_t local_salen; + int ifindex; + uint32_t ttl; + bool identified; + bool packet_received; /* At least one packet is received. Used by LLMNR. 
*/ + uint32_t requested_events; + + /* only when using TCP fast open */ + union sockaddr_union tfo_address; + socklen_t tfo_salen; + +#if ENABLE_DNS_OVER_TLS + DnsTlsStreamData dnstls_data; + uint32_t dnstls_events; +#endif + + sd_event_source *io_event_source; + sd_event_source *timeout_event_source; + + be16_t write_size, read_size; + DnsPacket *write_packet, *read_packet; + size_t n_written, n_read; + OrderedSet *write_queue; + + int (*on_packet)(DnsStream *s, DnsPacket *p); + int (*complete)(DnsStream *s, int error); + + LIST_HEAD(DnsTransaction, transactions); /* when used by the transaction logic */ + DnsServer *server; /* when used by the transaction logic */ + Set *queries; /* when used by the DNS stub logic */ + + /* used when DNS-over-TLS is enabled */ + bool encrypted:1; + + DnsStubListenerExtra *stub_listener_extra; + + LIST_FIELDS(DnsStream, streams); +}; + +int dns_stream_new( + Manager *m, + DnsStream **ret, + DnsStreamType type, + DnsProtocol protocol, + int fd, + const union sockaddr_union *tfo_address, + int (on_packet)(DnsStream*, DnsPacket*), + int (complete)(DnsStream*, int), /* optional */ + usec_t connect_timeout_usec); +#if ENABLE_DNS_OVER_TLS +int dns_stream_connect_tls(DnsStream *s, void *tls_session); +#endif +DnsStream *dns_stream_unref(DnsStream *s); +DnsStream *dns_stream_ref(DnsStream *s); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsStream*, dns_stream_unref); + +int dns_stream_write_packet(DnsStream *s, DnsPacket *p); +ssize_t dns_stream_writev(DnsStream *s, const struct iovec *iov, size_t iovcnt, int flags); + +static inline bool DNS_STREAM_QUEUED(DnsStream *s) { + assert(s); + + if (s->fd < 0) /* already stopped? 
*/ + return false; + + return !!s->write_packet; +} + +void dns_stream_detach(DnsStream *s); diff --git a/src/resolve/resolved-dns-stub.c b/src/resolve/resolved-dns-stub.c new file mode 100644 index 0000000..c59e3b7 --- /dev/null +++ b/src/resolve/resolved-dns-stub.c @@ -0,0 +1,1427 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "capability-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "missing_network.h" +#include "missing_socket.h" +#include "resolved-dns-stub.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-table.h" + +/* The MTU of the loopback device is 64K on Linux, advertise that as maximum datagram size, but subtract the Ethernet, + * IP and UDP header sizes */ +#define ADVERTISE_DATAGRAM_SIZE_MAX (65536U-14U-20U-8U) + +/* On the extra stubs, use a more conservative choice */ +#define ADVERTISE_EXTRA_DATAGRAM_SIZE_MAX DNS_PACKET_UNICAST_SIZE_LARGE_MAX + +static int manager_dns_stub_fd_extra(Manager *m, DnsStubListenerExtra *l, int type); +static int manager_dns_stub_fd(Manager *m, int family, const union in_addr_union *listen_address, int type); + +static void dns_stub_listener_extra_hash_func(const DnsStubListenerExtra *a, struct siphash *state) { + assert(a); + + siphash24_compress(&a->mode, sizeof(a->mode), state); + siphash24_compress(&a->family, sizeof(a->family), state); + siphash24_compress(&a->address, FAMILY_ADDRESS_SIZE(a->family), state); + siphash24_compress(&a->port, sizeof(a->port), state); +} + +static int dns_stub_listener_extra_compare_func(const DnsStubListenerExtra *a, const DnsStubListenerExtra *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->mode, b->mode); + if (r != 0) + return r; + + r = CMP(a->family, b->family); + if (r != 0) + return r; + + r = memcmp(&a->address, &b->address, FAMILY_ADDRESS_SIZE(a->family)); + if (r != 0) + return r; + + return CMP(a->port, b->port); +} + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + 
dns_stub_listener_extra_hash_ops, + DnsStubListenerExtra, + dns_stub_listener_extra_hash_func, + dns_stub_listener_extra_compare_func, + dns_stub_listener_extra_free); + +int dns_stub_listener_extra_new( + Manager *m, + DnsStubListenerExtra **ret) { + + DnsStubListenerExtra *l; + + l = new(DnsStubListenerExtra, 1); + if (!l) + return -ENOMEM; + + *l = (DnsStubListenerExtra) { + .manager = m, + }; + + *ret = TAKE_PTR(l); + return 0; +} + +DnsStubListenerExtra *dns_stub_listener_extra_free(DnsStubListenerExtra *p) { + if (!p) + return NULL; + + p->udp_event_source = sd_event_source_disable_unref(p->udp_event_source); + p->tcp_event_source = sd_event_source_disable_unref(p->tcp_event_source); + + hashmap_free(p->queries_by_packet); + + return mfree(p); +} + +static void stub_packet_hash_func(const DnsPacket *p, struct siphash *state) { + assert(p); + + siphash24_compress(&p->protocol, sizeof(p->protocol), state); + siphash24_compress(&p->family, sizeof(p->family), state); + siphash24_compress(&p->sender, sizeof(p->sender), state); + siphash24_compress(&p->ipproto, sizeof(p->ipproto), state); + siphash24_compress(&p->sender_port, sizeof(p->sender_port), state); + siphash24_compress(DNS_PACKET_HEADER(p), sizeof(DnsPacketHeader), state); + + /* We don't bother hashing the full packet here, just the header */ +} + +static int stub_packet_compare_func(const DnsPacket *x, const DnsPacket *y) { + int r; + + r = CMP(x->protocol, y->protocol); + if (r != 0) + return r; + + r = CMP(x->family, y->family); + if (r != 0) + return r; + + r = memcmp(&x->sender, &y->sender, sizeof(x->sender)); + if (r != 0) + return r; + + r = CMP(x->ipproto, y->ipproto); + if (r != 0) + return r; + + r = CMP(x->sender_port, y->sender_port); + if (r != 0) + return r; + + return memcmp(DNS_PACKET_HEADER(x), DNS_PACKET_HEADER(y), sizeof(DnsPacketHeader)); +} + +DEFINE_HASH_OPS(stub_packet_hash_ops, DnsPacket, stub_packet_hash_func, stub_packet_compare_func); + +static int reply_add_with_rrsig( + 
DnsAnswer **reply, + DnsResourceRecord *rr, + int ifindex, + DnsAnswerFlags flags, + DnsResourceRecord *rrsig, + bool with_rrsig) { + int r; + + assert(reply); + assert(rr); + + r = dns_answer_add_extend(reply, rr, ifindex, flags, rrsig); + if (r < 0) + return r; + + if (with_rrsig && rrsig) { + r = dns_answer_add_extend(reply, rrsig, ifindex, flags, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_stub_collect_answer_by_question( + DnsAnswer **reply, + DnsAnswer *answer, + DnsQuestion *question, + bool with_rrsig) { /* Add RRSIG RR matching each RR */ + + DnsAnswerItem *item; + int r; + + assert(reply); + + /* Copies all RRs from 'answer' into 'reply', if they match 'question'. */ + + DNS_ANSWER_FOREACH_ITEM(item, answer) { + + /* We have a question, let's see if this RR matches it */ + r = dns_question_matches_rr(question, item->rr, NULL); + if (r < 0) + return r; + if (!r) { + /* Maybe there's a CNAME/DNAME in here? If so, that's an answer too */ + r = dns_question_matches_cname_or_dname(question, item->rr, NULL); + if (r < 0) + return r; + if (!r) + continue; + } + + /* Mask the section info, we want the primary answers to always go without section + * info, so that it is added to the answer section when we synthesize a reply. */ + + r = reply_add_with_rrsig( + reply, + item->rr, + item->ifindex, + item->flags & ~DNS_ANSWER_MASK_SECTIONS, + item->rrsig, + with_rrsig); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_stub_collect_answer_by_section( + DnsAnswer **reply, + DnsAnswer *answer, + DnsAnswerFlags section, + DnsAnswer *exclude1, + DnsAnswer *exclude2, + bool with_dnssec) { /* Include DNSSEC RRs. RRSIG, NSEC, … */ + + DnsAnswerItem *item; + int r; + + assert(reply); + + /* Copies all RRs from 'answer' into 'reply', if they originate from the specified section. Also, + * avoid any RRs listed in 'exclude'. 
*/ + + DNS_ANSWER_FOREACH_ITEM(item, answer) { + + if (dns_answer_contains(exclude1, item->rr) || + dns_answer_contains(exclude2, item->rr)) + continue; + + if (!with_dnssec && + dns_type_is_dnssec(item->rr->key->type)) + continue; + + if (((item->flags ^ section) & DNS_ANSWER_MASK_SECTIONS) != 0) + continue; + + r = reply_add_with_rrsig( + reply, + item->rr, + item->ifindex, + item->flags, + item->rrsig, + with_dnssec); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_stub_assign_sections( + DnsQuery *q, + DnsQuestion *question, + bool edns0_do) { + + int r; + + assert(q); + assert(question); + + /* Let's assign the 'answer' RRs we collected to their respective sections in the reply datagram. We + * try to reproduce a section assignment similar to what the upstream DNS server responded to us. We + * use the DNS_ANSWER_SECTION_xyz flags to match things up, which is where the original upstream's + * packet section assignment is stored in the DnsAnswer object. Not all RRs in the 'answer' objects + * come with section information though (for example, because they were synthesized locally, and not + * from a DNS packet). To deal with that we extend the assignment logic a bit: anything from the + * 'answer' object that directly matches the original question is always put in the ANSWER section, + * regardless if it carries section info, or what that section info says. Then, anything from the + * 'answer' objects that is from the ANSWER or AUTHORITY sections, and wasn't already added to the + * ANSWER section is placed in the AUTHORITY section. Everything else from either object is added to + * the ADDITIONAL section. 
*/ + + /* Include all RRs that directly answer the question in the answer section */ + r = dns_stub_collect_answer_by_question( + &q->reply_answer, + q->answer, + question, + edns0_do); + if (r < 0) + return r; + + /* Include all RRs that originate from the authority sections, and aren't already listed in the + * answer section, in the authority section */ + r = dns_stub_collect_answer_by_section( + &q->reply_authoritative, + q->answer, + DNS_ANSWER_SECTION_AUTHORITY, + q->reply_answer, NULL, + edns0_do); + if (r < 0) + return r; + + /* Include all RRs that originate from the answer or additional sections in the additional section + * (except if already listed in the other two sections). Also add all RRs with no section marking. */ + r = dns_stub_collect_answer_by_section( + &q->reply_additional, + q->answer, + DNS_ANSWER_SECTION_ANSWER, + q->reply_answer, q->reply_authoritative, + edns0_do); + if (r < 0) + return r; + r = dns_stub_collect_answer_by_section( + &q->reply_additional, + q->answer, + DNS_ANSWER_SECTION_ADDITIONAL, + q->reply_answer, q->reply_authoritative, + edns0_do); + if (r < 0) + return r; + r = dns_stub_collect_answer_by_section( + &q->reply_additional, + q->answer, + 0, + q->reply_answer, q->reply_authoritative, + edns0_do); + if (r < 0) + return r; + + return 0; +} + +static int dns_stub_make_reply_packet( + DnsPacket **ret, + size_t max_size, + DnsQuestion *q, + bool *ret_truncated) { + + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + bool tc = false; + int r; + + assert(ret); + + r = dns_packet_new(&p, DNS_PROTOCOL_DNS, 0, max_size); + if (r < 0) + return r; + + r = dns_packet_append_question(p, q); + if (r == -EMSGSIZE) + tc = true; + else if (r < 0) + return r; + + if (ret_truncated) + *ret_truncated = tc; + else if (tc) + return -EMSGSIZE; + + DNS_PACKET_HEADER(p)->qdcount = htobe16(dns_question_size(q)); + + *ret = TAKE_PTR(p); + return 0; +} + +static int dns_stub_add_reply_packet_body( + DnsPacket *p, + DnsAnswer *answer, + 
DnsAnswer *authoritative, + DnsAnswer *additional, + bool edns0_do, /* Client expects DNSSEC RRs? */ + bool *truncated) { + + unsigned n_answer = 0, n_authoritative = 0, n_additional = 0; + bool tc = false; + int r; + + assert(p); + + /* Add the three sections to the packet. If the answer section doesn't fit we'll signal that as + * truncation. If the authoritative section doesn't fit and we are in DNSSEC mode, also signal + * truncation. In all other cases where things don't fit don't signal truncation, as for those cases + * the dropped RRs should not be essential. */ + + r = dns_packet_append_answer(p, answer, &n_answer); + if (r == -EMSGSIZE) + tc = true; + else if (r < 0) + return r; + else { + r = dns_packet_append_answer(p, authoritative, &n_authoritative); + if (r == -EMSGSIZE) { + if (edns0_do) + tc = true; + } else if (r < 0) + return r; + else { + r = dns_packet_append_answer(p, additional, &n_additional); + if (r < 0 && r != -EMSGSIZE) + return r; + } + } + + if (tc) { + if (!truncated) + return -EMSGSIZE; + + *truncated = true; + } + + DNS_PACKET_HEADER(p)->ancount = htobe16(n_answer); + DNS_PACKET_HEADER(p)->nscount = htobe16(n_authoritative); + DNS_PACKET_HEADER(p)->arcount = htobe16(n_additional); + return 0; +} + +static const char *nsid_string(void) { + static char buffer[SD_ID128_STRING_MAX + STRLEN(".resolved.systemd.io")] = ""; + sd_id128_t id; + int r; + + /* Let's generate a string that we can use as RFC5001 NSID identifier. The string shall identify us + * as systemd-resolved, and return a different string for each resolved instance without leaking host + * identity. Hence let's use a fixed suffix that identifies resolved, and a prefix generated from the + * machine ID but from which the machine ID cannot be determined. + * + * Clients can use this to determine whether an answer is originating locally or is proxied from + * upstream. 
*/ + + if (!isempty(buffer)) + return buffer; + + r = sd_id128_get_machine_app_specific( + SD_ID128_MAKE(ed,d3,12,5d,16,b9,41,f9,a1,49,5f,ab,15,62,ab,27), + &id); + if (r < 0) { + log_debug_errno(r, "Failed to determine machine ID, ignoring: %m"); + return NULL; + } + + xsprintf(buffer, SD_ID128_FORMAT_STR ".resolved.systemd.io", SD_ID128_FORMAT_VAL(id)); + return buffer; +} + +static int dns_stub_finish_reply_packet( + DnsPacket *p, + uint16_t id, + int rcode, + bool tc, /* set the Truncated bit? */ + bool aa, /* set the Authoritative Answer bit? */ + bool rd, /* set the Recursion Desired bit? */ + bool add_opt, /* add an OPT RR to this packet? */ + bool edns0_do, /* set the EDNS0 DNSSEC OK bit? */ + bool ad, /* set the DNSSEC authenticated data bit? */ + bool cd, /* set the DNSSEC checking disabled bit? */ + uint16_t max_udp_size, /* The maximum UDP datagram size to advertise to clients */ + bool nsid) { /* whether to add NSID */ + + int r; + + assert(p); + + if (add_opt) { + r = dns_packet_append_opt(p, max_udp_size, edns0_do, /* include_rfc6975 = */ false, nsid ? nsid_string() : NULL, rcode, NULL); + if (r == -EMSGSIZE) /* Hit the size limit? 
then indicate truncation */ + tc = true; + else if (r < 0) + return r; + } else { + /* If the client can't do EDNS0, don't do DO either */ + edns0_do = false; + + /* If we don't do EDNS, clamp the rcode to 4 bit */ + if (rcode > 0xF) + rcode = DNS_RCODE_SERVFAIL; + } + + /* Don't set the CD bit unless DO is on, too */ + if (!edns0_do) + cd = false; + + /* Note that we allow the AD bit to be set even if client didn't signal DO, as per RFC 6840, section + * 5.7 */ + + DNS_PACKET_HEADER(p)->id = id; + + DNS_PACKET_HEADER(p)->flags = htobe16(DNS_PACKET_MAKE_FLAGS( + 1 /* qr */, + 0 /* opcode */, + aa /* aa */, + tc /* tc */, + rd /* rd */, + 1 /* ra */, + ad /* ad */, + cd /* cd */, + rcode)); + + return 0; +} + +static bool address_is_proxy(int family, const union in_addr_union *a) { + assert(a); + + /* Returns true if the specified address is the DNS "proxy" stub, i.e. where we unconditionally enable bypass mode */ + + if (family != AF_INET) + return false; + + return be32toh(a->in.s_addr) == INADDR_DNS_PROXY_STUB; +} + +static int find_socket_fd( + Manager *m, + DnsStubListenerExtra *l, + int family, + const union in_addr_union *listen_address, + int type) { + + assert(m); + + /* Finds the right socket to use for sending. If we know the extra listener, use its socket, otherwise go via the + * address to send from */ + if (l) + return manager_dns_stub_fd_extra(m, l, type); + + return manager_dns_stub_fd(m, family, listen_address, type); +} + +static int dns_stub_send( + Manager *m, + DnsStubListenerExtra *l, + DnsStream *s, + DnsPacket *p, + DnsPacket *reply) { + + int r; + + assert(m); + assert(p); + assert(reply); + + if (s) + r = dns_stream_write_packet(s, reply); + else { + int fd, ifindex; + + fd = find_socket_fd(m, l, p->family, &p->destination, SOCK_DGRAM); + if (fd < 0) + return fd; + + if (address_is_proxy(p->family, &p->destination)) + /* Force loopback iface if this is the loopback proxy stub + * and ifindex was normalized to 0 by manager_recv(). 
*/ + ifindex = p->ifindex ?: LOOPBACK_IFINDEX; + else + /* Force loopback iface if this is the main listener stub. */ + ifindex = l ? p->ifindex : LOOPBACK_IFINDEX; + + /* Note that it is essential here that we explicitly choose the source IP address for this + * packet. This is because otherwise the kernel will choose it automatically based on the + * routing table and will thus pick 127.0.0.1 rather than 127.0.0.53/54. */ + r = manager_send(m, + fd, + ifindex, + p->family, &p->sender, p->sender_port, &p->destination, + reply); + } + if (r < 0) + return log_debug_errno(r, "Failed to send reply packet: %m"); + + return 0; +} + +static int dns_stub_reply_with_edns0_do(DnsQuery *q) { + assert(q); + + /* Reply with DNSSEC DO set? Only if client supports it; and we did any DNSSEC verification + * ourselves, or consider the data fully authenticated because we generated it locally, or the client + * set cd */ + + return DNS_PACKET_DO(q->request_packet) && + (q->answer_dnssec_result >= 0 || /* we did proper DNSSEC validation … */ + dns_query_fully_authenticated(q) || /* … or we considered it authentic otherwise … */ + DNS_PACKET_CD(q->request_packet)); /* … or client set CD */ +} + +static void dns_stub_suppress_duplicate_section_rrs(DnsQuery *q) { + /* If we follow a CNAME/DNAME chain we might end up populating our sections with redundant RRs + * because we built up the sections from multiple reply packets (one from each CNAME/DNAME chain + * element). E.g. it could be that an RR that was included in the first reply's additional section + * ends up being relevant as main answer in a subsequent reply in the chain. Let's clean this up, and + * remove everything in the "higher priority" sections from the "lower priority" sections. + * + * Note that this removal matches by RR keys instead of the full RRs. This is because RRsets should + * always end up in one section fully or not at all, but never be split among sections. 
+ * + * Specifically: we remove ANSWER section RRs from the AUTHORITATIVE and ADDITIONAL sections, as well + * as AUTHORITATIVE section RRs from the ADDITIONAL section. */ + + dns_answer_remove_by_answer_keys(&q->reply_authoritative, q->reply_answer); + dns_answer_remove_by_answer_keys(&q->reply_additional, q->reply_answer); + dns_answer_remove_by_answer_keys(&q->reply_additional, q->reply_authoritative); +} + +static int dns_stub_send_reply( + DnsQuery *q, /* query whose request packet is answered from its collected reply sections */ + int rcode) { /* RCODE to place in the reply */ + + _cleanup_(dns_packet_unrefp) DnsPacket *reply = NULL; + bool truncated, edns0_do; + int r; + + assert(q); + + edns0_do = dns_stub_reply_with_edns0_do(q); /* let's check if we shall reply with EDNS0 DO? */ + + r = dns_stub_make_reply_packet( + &reply, + DNS_PACKET_PAYLOAD_SIZE_MAX(q->request_packet), + q->request_packet->question, + &truncated); + if (r < 0) + return log_debug_errno(r, "Failed to build reply packet: %m"); + + dns_stub_suppress_duplicate_section_rrs(q); + + r = dns_stub_add_reply_packet_body( + reply, + q->reply_answer, + q->reply_authoritative, + q->reply_additional, + edns0_do, + &truncated); + if (r < 0) + return log_debug_errno(r, "Failed to append reply packet body: %m"); + + r = dns_stub_finish_reply_packet( + reply, + DNS_PACKET_ID(q->request_packet), + rcode, + truncated, + dns_query_fully_authoritative(q), + DNS_PACKET_RD(q->request_packet), + !!q->request_packet->opt, + edns0_do, + (DNS_PACKET_AD(q->request_packet) || DNS_PACKET_DO(q->request_packet)) && dns_query_fully_authenticated(q), + DNS_PACKET_CD(q->request_packet), + q->stub_listener_extra ? 
ADVERTISE_EXTRA_DATAGRAM_SIZE_MAX : ADVERTISE_DATAGRAM_SIZE_MAX, + dns_packet_has_nsid_request(q->request_packet) > 0 && !q->stub_listener_extra); + if (r < 0) + return log_debug_errno(r, "Failed to finish reply packet: %m"); + + return dns_stub_send(q->manager, q->stub_listener_extra, q->request_stream, q->request_packet, reply); +} + +static int dns_stub_send_failure( + Manager *m, + DnsStubListenerExtra *l, + DnsStream *s, + DnsPacket *p, + int rcode, + bool authenticated) { + + _cleanup_(dns_packet_unrefp) DnsPacket *reply = NULL; + bool truncated; + int r; + + assert(m); + assert(p); + + r = dns_stub_make_reply_packet( + &reply, + DNS_PACKET_PAYLOAD_SIZE_MAX(p), + p->question, + &truncated); + if (r < 0) + return log_debug_errno(r, "Failed to make failure packet: %m"); + + r = dns_stub_finish_reply_packet( + reply, + DNS_PACKET_ID(p), + rcode, + truncated, + false, + DNS_PACKET_RD(p), + !!p->opt, + DNS_PACKET_DO(p), + (DNS_PACKET_AD(p) || DNS_PACKET_DO(p)) && authenticated, + DNS_PACKET_CD(p), + l ? ADVERTISE_EXTRA_DATAGRAM_SIZE_MAX : ADVERTISE_DATAGRAM_SIZE_MAX, + dns_packet_has_nsid_request(p) > 0 && !l); + if (r < 0) + return log_debug_errno(r, "Failed to build failure packet: %m"); + + return dns_stub_send(m, l, s, p, reply); +} + +static int dns_stub_patch_bypass_reply_packet( + DnsPacket **ret, /* Where to place the patched packet */ + DnsPacket *original, /* The packet to patch */ + DnsPacket *request) { /* The packet the patched packet shall look like a reply to */ + _cleanup_(dns_packet_unrefp) DnsPacket *c = NULL; + int r; + + assert(ret); + assert(original); + assert(request); + + r = dns_packet_dup(&c, original); + if (r < 0) + return r; + + /* Extract the packet, so that we know where the OPT field is */ + r = dns_packet_extract(c); + if (r < 0) + return r; + + /* Copy over the original client request ID, so that we can make the upstream query look like our own reply. 
*/ + DNS_PACKET_HEADER(c)->id = DNS_PACKET_HEADER(request)->id; + + /* Patch in our own maximum datagram size, if EDNS0 was on */ + r = dns_packet_patch_max_udp_size(c, ADVERTISE_DATAGRAM_SIZE_MAX); + if (r < 0) + return r; + + /* Lower all TTLs by the time passed since we received the datagram. */ + if (timestamp_is_set(original->timestamp)) { + r = dns_packet_patch_ttls(c, original->timestamp); + if (r < 0) + return r; + } + + /* Our upstream connection might have supported larger DNS requests than our downstream one, hence + * set the TC bit if our reply is larger than what the client supports, and truncate. */ + if (c->size > DNS_PACKET_PAYLOAD_SIZE_MAX(request)) { + log_debug("Artificially truncating stub response, as advertised size of client is smaller than upstream one."); + dns_packet_truncate(c, DNS_PACKET_PAYLOAD_SIZE_MAX(request)); + DNS_PACKET_HEADER(c)->flags = htobe16(be16toh(DNS_PACKET_HEADER(c)->flags) | DNS_PACKET_FLAG_TC); + } + + *ret = TAKE_PTR(c); + return 0; +} + +static void dns_stub_query_complete(DnsQuery *query) { + _cleanup_(dns_query_freep) DnsQuery *q = query; /* freed on every return path, except when the query is restarted below */ + int r; + + assert(q); + assert(q->request_packet); + + if (q->question_bypass) { + /* This is a bypass reply. If so, let's propagate the upstream packet, if we have it and it + * is regular DNS. (We can't do this if the upstream packet is LLMNR or mDNS, since the + * packets are not 100% compatible.) */ + + if (q->answer_full_packet && + q->answer_full_packet->protocol == DNS_PROTOCOL_DNS) { + _cleanup_(dns_packet_unrefp) DnsPacket *reply = NULL; + + r = dns_stub_patch_bypass_reply_packet(&reply, q->answer_full_packet, q->request_packet); + if (r < 0) + log_debug_errno(r, "Failed to patch bypass reply packet: %m"); + else + (void) dns_stub_send(q->manager, q->stub_listener_extra, q->request_stream, q->request_packet, reply); + + return; + } + } + + /* Take all data from the current reply, and merge it into the three reply sections we are building + * up. 
We do this before processing CNAME redirects, so that we gradually build up our sections, and + * keep adding all RRs in the CNAME chain. */ + r = dns_stub_assign_sections( + q, + dns_query_question_for_protocol(q, DNS_PROTOCOL_DNS), + dns_stub_reply_with_edns0_do(q)); + if (r < 0) + return (void) log_debug_errno(r, "Failed to assign sections: %m"); + + switch (q->state) { + + case DNS_TRANSACTION_SUCCESS: { + bool first = true; + + for (;;) { + int cname_result; + + cname_result = dns_query_process_cname_one(q); + if (cname_result == -ELOOP) { /* CNAME loop, let's send what we already have */ + log_debug("Detected CNAME loop, returning what we already have."); + (void) dns_stub_send_reply(q, q->answer_rcode); + return; /* not 'break': leaving only the loop would fall through to the RCODE_FAILURE case and send the reply a second time */ + } + if (cname_result < 0) { + log_debug_errno(cname_result, "Failed to process CNAME: %m"); + break; + } + + if (cname_result == DNS_QUERY_NOMATCH) { + /* This answer doesn't contain any RR that would answer our question + * positively, i.e. neither directly nor via CNAME. */ + + if (first) /* We never followed a CNAME and the answer doesn't match our + * question at all? Then this is final, the empty answer is the + * answer. */ + break; + + /* Otherwise, we already followed a CNAME once within this packet, and the + * packet doesn't answer our question. In that case let's restart the query, + * now with the redirected question. */ + r = dns_query_go(q); + if (r < 0) + return (void) log_debug_errno(r, "Failed to restart query: %m"); + + TAKE_PTR(q); + return; + } + + r = dns_stub_assign_sections( + q, + dns_query_question_for_protocol(q, DNS_PROTOCOL_DNS), + dns_stub_reply_with_edns0_do(q)); + if (r < 0) + return (void) log_debug_errno(r, "Failed to assign sections: %m"); + + if (cname_result == DNS_QUERY_MATCH) /* A match? Then we are done, let's return what we got */ + break; + + /* We followed a CNAME, and collected the RRs that answer the redirected question + * successfully. Let's not try to do this again. 
*/ + assert(cname_result == DNS_QUERY_CNAME); + first = false; + } + + _fallthrough_; + } + + case DNS_TRANSACTION_RCODE_FAILURE: + (void) dns_stub_send_reply(q, q->answer_rcode); + break; + + case DNS_TRANSACTION_NOT_FOUND: + (void) dns_stub_send_reply(q, DNS_RCODE_NXDOMAIN); + break; + + case DNS_TRANSACTION_TIMEOUT: + case DNS_TRANSACTION_ATTEMPTS_MAX_REACHED: + /* Propagate a timeout as no packet, i.e. that the client also gets a timeout */ + break; + + case DNS_TRANSACTION_NO_SERVERS: + case DNS_TRANSACTION_INVALID_REPLY: + case DNS_TRANSACTION_ERRNO: + case DNS_TRANSACTION_ABORTED: + case DNS_TRANSACTION_DNSSEC_FAILED: + case DNS_TRANSACTION_NO_TRUST_ANCHOR: + case DNS_TRANSACTION_RR_TYPE_UNSUPPORTED: + case DNS_TRANSACTION_NETWORK_DOWN: + case DNS_TRANSACTION_NO_SOURCE: + case DNS_TRANSACTION_STUB_LOOP: + (void) dns_stub_send_reply(q, DNS_RCODE_SERVFAIL); + break; + + case DNS_TRANSACTION_NULL: + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + default: + assert_not_reached(); + } +} + +static int dns_stub_stream_complete(DnsStream *s, int error) { + assert(s); + + log_debug_errno(error, "DNS TCP connection terminated, destroying queries: %m"); + + for (;;) { + DnsQuery *q; + + q = set_first(s->queries); + if (!q) + break; + + dns_query_free(q); + } + + /* This drops the implicit ref we keep around since it was allocated, as incoming stub connections + * should be kept as long as the client wants to. 
*/ + dns_stream_unref(s); + return 0; +} + +static void dns_stub_process_query(Manager *m, DnsStubListenerExtra *l, DnsStream *s, DnsPacket *p) { + uint64_t protocol_flags = SD_RESOLVED_PROTOCOLS_ALL; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + Hashmap **queries_by_packet; + DnsQuery *existing; + bool bypass = false; + int r; + + assert(m); + assert(p); + assert(p->protocol == DNS_PROTOCOL_DNS); + + if (!l && /* l == NULL if this is the main stub */ + !address_is_proxy(p->family, &p->destination) && /* don't restrict needlessly for 127.0.0.54 */ + (in_addr_is_localhost(p->family, &p->sender) <= 0 || + in_addr_is_localhost(p->family, &p->destination) <= 0)) { + log_warning("Got packet on unexpected (i.e. non-localhost) IP range, ignoring."); + return; + } + + if (manager_packet_from_our_transaction(m, p)) { + log_debug("Got our own packet looped back, ignoring."); + return; + } + + queries_by_packet = l ? &l->queries_by_packet : &m->stub_queries_by_packet; + existing = hashmap_get(*queries_by_packet, p); + if (existing && dns_packet_equal(existing->request_packet, p)) { + log_debug("Got repeat packet from client, ignoring."); + return; + } + + r = dns_packet_extract(p); + if (r < 0) { + log_debug_errno(r, "Failed to extract resources from incoming packet, ignoring packet: %m"); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_FORMERR, false); + return; + } + + if (!DNS_PACKET_VERSION_SUPPORTED(p)) { + log_debug("Got EDNS OPT field with unsupported version number."); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_BADVERS, false); + return; + } + + if (dns_type_is_obsolete(dns_question_first_key(p->question)->type)) { + log_debug("Got message with obsolete key type, refusing."); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_REFUSED, false); + return; + } + + if (dns_type_is_zone_transer(dns_question_first_key(p->question)->type)) { + log_debug("Got request for zone transfer, refusing."); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_REFUSED, false); + return; + } + 
+ if (!DNS_PACKET_RD(p)) { + /* If the "rd" bit is off (i.e. recursion was not requested), then refuse operation */ + log_debug("Got request with recursion disabled, refusing."); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_REFUSED, false); + return; + } + + r = hashmap_ensure_allocated(queries_by_packet, &stub_packet_hash_ops); + if (r < 0) { + log_oom(); + return; + } + + if (address_is_proxy(p->family, &p->destination)) { + _cleanup_free_ char *dipa = NULL; + + r = in_addr_to_string(p->family, &p->destination, &dipa); + if (r < 0) + return (void) log_error_errno(r, "Failed to format destination address: %m"); + + log_debug("Got request to DNS proxy address 127.0.0.54, enabling bypass logic."); + bypass = true; + protocol_flags = SD_RESOLVED_DNS|SD_RESOLVED_NO_ZONE; /* Turn off mDNS/LLMNR for proxy stub. */ + } else if ((DNS_PACKET_DO(p) && DNS_PACKET_CD(p))) { + log_debug("Got request with DNSSEC checking disabled, enabling bypass logic."); + bypass = true; + } + + if (bypass) + r = dns_query_new(m, &q, NULL, NULL, p, 0, + protocol_flags| + SD_RESOLVED_NO_CNAME| + SD_RESOLVED_NO_SEARCH| + SD_RESOLVED_NO_VALIDATE| + SD_RESOLVED_REQUIRE_PRIMARY| + SD_RESOLVED_CLAMP_TTL); + else + r = dns_query_new(m, &q, p->question, p->question, NULL, 0, + protocol_flags| + SD_RESOLVED_NO_SEARCH| + (DNS_PACKET_DO(p) ? 
SD_RESOLVED_REQUIRE_PRIMARY : 0)| + SD_RESOLVED_CLAMP_TTL); + if (r < 0) { + log_error_errno(r, "Failed to generate query object: %m"); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_SERVFAIL, false); + return; + } + + q->request_packet = dns_packet_ref(p); + q->request_stream = dns_stream_ref(s); /* make sure the stream stays around until we can send a reply through it */ + q->stub_listener_extra = l; + q->complete = dns_stub_query_complete; + + if (s) { + /* Remember which queries belong to this stream, so that we can cancel them when the stream + * is disconnected early */ + + r = set_ensure_put(&s->queries, NULL, q); + if (r < 0) { + log_oom(); + return; + } + assert(r > 0); + } + + /* Add the query to the hash table we use to determine repeat packets now. We don't care about + * failures here, since in the worst case we'll not recognize duplicate incoming requests, which + * isn't particularly bad. */ + (void) hashmap_put(*queries_by_packet, q->request_packet, q); + + r = dns_query_go(q); + if (r < 0) { + log_error_errno(r, "Failed to start query: %m"); + dns_stub_send_failure(m, l, s, p, DNS_RCODE_SERVFAIL, false); + return; + } + + log_debug("Processing query..."); + TAKE_PTR(q); +} + +static int on_dns_stub_packet_internal(sd_event_source *s, int fd, uint32_t revents, Manager *m, DnsStubListenerExtra *l) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + int r; + + r = manager_recv(m, fd, DNS_PROTOCOL_DNS, &p); + if (r <= 0) + return r; + + if (dns_packet_validate_query(p) > 0) { + log_debug("Got DNS stub UDP query packet for id %u", DNS_PACKET_ID(p)); + + dns_stub_process_query(m, l, NULL, p); + } else + log_debug("Invalid DNS stub UDP packet, ignoring."); + + return 0; +} + +static int on_dns_stub_packet(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + return on_dns_stub_packet_internal(s, fd, revents, userdata, NULL); +} + +static int on_dns_stub_packet_extra(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + 
DnsStubListenerExtra *l = ASSERT_PTR(userdata); + + return on_dns_stub_packet_internal(s, fd, revents, l->manager, l); +} + +static int on_dns_stub_stream_packet(DnsStream *s, DnsPacket *p) { + assert(s); + assert(s->manager); + assert(p); + + if (dns_packet_validate_query(p) > 0) { + log_debug("Got DNS stub TCP query packet for id %u", DNS_PACKET_ID(p)); + + dns_stub_process_query(s->manager, s->stub_listener_extra, s, p); + } else + log_debug("Invalid DNS stub TCP packet, ignoring."); + + return 0; +} + +static int on_dns_stub_stream_internal(sd_event_source *s, int fd, uint32_t revents, Manager *m, DnsStubListenerExtra *l) { + DnsStream *stream; + int cfd, r; + + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (cfd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return -errno; + } + + r = dns_stream_new(m, &stream, DNS_STREAM_STUB, DNS_PROTOCOL_DNS, cfd, NULL, + on_dns_stub_stream_packet, dns_stub_stream_complete, DNS_STREAM_STUB_TIMEOUT_USEC); + if (r < 0) { + safe_close(cfd); + return r; + } + + stream->stub_listener_extra = l; + + /* We let the reference to the stream dangle here, it will be dropped later by the complete callback. 
*/ + + return 0; +} + +static int on_dns_stub_stream(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + return on_dns_stub_stream_internal(s, fd, revents, userdata, NULL); +} + +static int on_dns_stub_stream_extra(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + DnsStubListenerExtra *l = ASSERT_PTR(userdata); + + return on_dns_stub_stream_internal(s, fd, revents, l->manager, l); +} + +static int set_dns_stub_common_socket_options(int fd, int family) { + int r; + + assert(fd >= 0); + assert(IN_SET(family, AF_INET, AF_INET6)); + + r = setsockopt_int(fd, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return r; + + r = socket_set_recvpktinfo(fd, family, true); + if (r < 0) + return r; + + r = socket_set_recvttl(fd, family, true); + if (r < 0) + return r; + + return 0; +} + +static int set_dns_stub_common_tcp_socket_options(int fd) { + int r; + + assert(fd >= 0); + + r = setsockopt_int(fd, IPPROTO_TCP, TCP_FASTOPEN, 5); /* Everybody appears to pick qlen=5, let's do the same here. */ + if (r < 0) + log_debug_errno(r, "Failed to enable TCP_FASTOPEN on TCP listening socket, ignoring: %m"); + + r = setsockopt_int(fd, IPPROTO_TCP, TCP_NODELAY, true); + if (r < 0) + log_debug_errno(r, "Failed to enable TCP_NODELAY mode, ignoring: %m"); + + return 0; +} + +static int manager_dns_stub_fd( + Manager *m, + int family, + const union in_addr_union *listen_addr, + int type) { + + sd_event_source **event_source; + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + int r; + + assert(m); + assert(listen_addr); + + if (type == SOCK_DGRAM) + event_source = address_is_proxy(family, listen_addr) ? &m->dns_proxy_stub_udp_event_source : &m->dns_stub_udp_event_source; + else if (type == SOCK_STREAM) + event_source = address_is_proxy(family, listen_addr) ? 
&m->dns_proxy_stub_tcp_event_source : &m->dns_stub_tcp_event_source; + else + return -EPROTONOSUPPORT; + + if (*event_source) + return sd_event_source_get_io_fd(*event_source); + + fd = socket(family, type | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + r = set_dns_stub_common_socket_options(fd, family); + if (r < 0) + return r; + + if (type == SOCK_STREAM) { + r = set_dns_stub_common_tcp_socket_options(fd); + if (r < 0) + return r; + } + + /* Set slightly different socket options for the non-proxy and the proxy binding. The former we want + * to be accessible only from the local host, for the latter it's OK if people use NAT redirects or + * so to redirect external traffic to it. */ + + if (!address_is_proxy(family, listen_addr)) { + /* Make sure no traffic from outside the local host can leak to onto this socket */ + r = socket_bind_to_ifindex(fd, LOOPBACK_IFINDEX); + if (r < 0) + return r; + + r = socket_set_ttl(fd, family, 1); + if (r < 0) + return r; + } else if (type == SOCK_DGRAM) { + /* Turn off Path MTU Discovery for UDP, for security reasons. See socket_disable_pmtud() for + * a longer discussion. (We only do this for sockets that are potentially externally + * accessible, i.e. the proxy stub one. For the non-proxy one we instead set the TTL to 1, + * see above, so that packets don't get routed at all.) */ + r = socket_disable_pmtud(fd, family); + if (r < 0) + log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m"); + + r = socket_set_recvfragsize(fd, family, true); + if (r < 0) + log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m"); + } + + r = sockaddr_set_in_addr(&sa, family, listen_addr, 53); + if (r < 0) + return r; + + if (bind(fd, &sa.sa, sizeof(sa.in)) < 0) + return -errno; + + if (type == SOCK_STREAM && + listen(fd, SOMAXCONN_DELUXE) < 0) + return -errno; + + r = sd_event_add_io(m->event, event_source, fd, EPOLLIN, + type == SOCK_DGRAM ? 
on_dns_stub_packet : on_dns_stub_stream, + m); + if (r < 0) + return r; + + r = sd_event_source_set_io_fd_own(*event_source, true); + if (r < 0) + return r; + + (void) sd_event_source_set_description(*event_source, + type == SOCK_DGRAM ? "dns-stub-udp" : "dns-stub-tcp"); + + return TAKE_FD(fd); +} + +static int manager_dns_stub_fd_extra(Manager *m, DnsStubListenerExtra *l, int type) { + _cleanup_free_ char *pretty = NULL; + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + int r; + + assert(m); + assert(l); + assert(IN_SET(type, SOCK_DGRAM, SOCK_STREAM)); + + sd_event_source **event_source = type == SOCK_DGRAM ? &l->udp_event_source : &l->tcp_event_source; + if (*event_source) + return sd_event_source_get_io_fd(*event_source); + + if (!have_effective_cap(CAP_NET_BIND_SERVICE) && dns_stub_listener_extra_port(l) < 1024) { + log_warning("Missing CAP_NET_BIND_SERVICE capability, not creating extra stub listener on port %hu.", + dns_stub_listener_extra_port(l)); + return 0; + } + + if (l->family == AF_INET) + sa = (union sockaddr_union) { + .in.sin_family = l->family, + .in.sin_port = htobe16(dns_stub_listener_extra_port(l)), + .in.sin_addr = l->address.in, + }; + else + sa = (union sockaddr_union) { + .in6.sin6_family = l->family, + .in6.sin6_port = htobe16(dns_stub_listener_extra_port(l)), + .in6.sin6_addr = l->address.in6, + }; + + fd = socket(l->family, type | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (fd < 0) { + r = -errno; + goto fail; + } + + r = set_dns_stub_common_socket_options(fd, l->family); + if (r < 0) + goto fail; + + if (type == SOCK_STREAM) { + r = set_dns_stub_common_tcp_socket_options(fd); + if (r < 0) + goto fail; + } + + /* Do not set IP_TTL for extra DNS stub listeners, as the address may not be local and in that case + * people may want ttl > 1. 
*/ + + r = socket_set_freebind(fd, l->family, true); + if (r < 0) + goto fail; + + if (type == SOCK_DGRAM) { + r = socket_disable_pmtud(fd, l->family); + if (r < 0) + log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m"); + + r = socket_set_recvfragsize(fd, l->family, true); + if (r < 0) + log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m"); + } + + r = RET_NERRNO(bind(fd, &sa.sa, SOCKADDR_LEN(sa))); + if (r < 0) + goto fail; + + if (type == SOCK_STREAM && + listen(fd, SOMAXCONN_DELUXE) < 0) { + r = -errno; + goto fail; + } + + r = sd_event_add_io(m->event, event_source, fd, EPOLLIN, + type == SOCK_DGRAM ? on_dns_stub_packet_extra : on_dns_stub_stream_extra, + l); + if (r < 0) + goto fail; + + r = sd_event_source_set_io_fd_own(*event_source, true); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(*event_source, + type == SOCK_DGRAM ? "dns-stub-udp-extra" : "dns-stub-tcp-extra"); + + if (DEBUG_LOGGING) { + (void) in_addr_port_to_string(l->family, &l->address, l->port, &pretty); + log_debug("Listening on %s socket %s.", + type == SOCK_DGRAM ? "UDP" : "TCP", + strnull(pretty)); + } + + return TAKE_FD(fd); + +fail: + assert(r < 0); + (void) in_addr_port_to_string(l->family, &l->address, l->port, &pretty); + return log_warning_errno(r, + r == -EADDRINUSE ? "Another process is already listening on %s socket %s: %m" : + "Failed to listen on %s socket %s: %m", + type == SOCK_DGRAM ? 
"UDP" : "TCP", + strnull(pretty)); +} + +int manager_dns_stub_start(Manager *m) { + int r; + + assert(m); + + if (m->dns_stub_listener_mode == DNS_STUB_LISTENER_NO) + log_debug("Not creating stub listener."); + else if (!have_effective_cap(CAP_NET_BIND_SERVICE)) + log_warning("Missing CAP_NET_BIND_SERVICE capability, not creating stub listener on port 53."); + else { + static const struct { + uint32_t addr; + int socket_type; + } stub_sockets[] = { + { INADDR_DNS_STUB, SOCK_DGRAM }, + { INADDR_DNS_STUB, SOCK_STREAM }, + { INADDR_DNS_PROXY_STUB, SOCK_DGRAM }, + { INADDR_DNS_PROXY_STUB, SOCK_STREAM }, + }; + + log_debug("Creating stub listener using %s.", + m->dns_stub_listener_mode == DNS_STUB_LISTENER_UDP ? "UDP" : + m->dns_stub_listener_mode == DNS_STUB_LISTENER_TCP ? "TCP" : + "UDP/TCP"); + + for (size_t i = 0; i < ELEMENTSOF(stub_sockets); i++) { + union in_addr_union a = { + .in.s_addr = htobe32(stub_sockets[i].addr), + }; + + if (m->dns_stub_listener_mode == DNS_STUB_LISTENER_UDP && stub_sockets[i].socket_type == SOCK_STREAM) + continue; + if (m->dns_stub_listener_mode == DNS_STUB_LISTENER_TCP && stub_sockets[i].socket_type == SOCK_DGRAM) + continue; + + r = manager_dns_stub_fd(m, AF_INET, &a, stub_sockets[i].socket_type); + if (r < 0) { + _cleanup_free_ char *busy_socket = NULL; + + if (asprintf(&busy_socket, + "%s socket " IPV4_ADDRESS_FMT_STR ":53", + stub_sockets[i].socket_type == SOCK_DGRAM ? "UDP" : "TCP", + IPV4_ADDRESS_FMT_VAL(a.in)) < 0) + return log_oom(); + + if (IN_SET(r, -EADDRINUSE, -EPERM)) { + log_warning_errno(r, + r == -EADDRINUSE ? "Another process is already listening on %s.\n" + "Turning off local DNS stub support." 
: + "Failed to listen on %s: %m.\n" + "Turning off local DNS stub support.", + busy_socket); + manager_dns_stub_stop(m); + break; + } + + return log_error_errno(r, "Failed to listen on %s: %m", busy_socket); + } + } + } + + if (!ordered_set_isempty(m->dns_extra_stub_listeners)) { + DnsStubListenerExtra *l; + + log_debug("Creating extra stub listeners."); + + ORDERED_SET_FOREACH(l, m->dns_extra_stub_listeners) { + if (FLAGS_SET(l->mode, DNS_STUB_LISTENER_UDP)) + (void) manager_dns_stub_fd_extra(m, l, SOCK_DGRAM); + if (FLAGS_SET(l->mode, DNS_STUB_LISTENER_TCP)) + (void) manager_dns_stub_fd_extra(m, l, SOCK_STREAM); + } + } + + return 0; +} + +void manager_dns_stub_stop(Manager *m) { + assert(m); + + m->dns_stub_udp_event_source = sd_event_source_disable_unref(m->dns_stub_udp_event_source); + m->dns_stub_tcp_event_source = sd_event_source_disable_unref(m->dns_stub_tcp_event_source); + m->dns_proxy_stub_udp_event_source = sd_event_source_disable_unref(m->dns_proxy_stub_udp_event_source); + m->dns_proxy_stub_tcp_event_source = sd_event_source_disable_unref(m->dns_proxy_stub_tcp_event_source); +} + +static const char* const dns_stub_listener_mode_table[_DNS_STUB_LISTENER_MODE_MAX] = { + [DNS_STUB_LISTENER_NO] = "no", + [DNS_STUB_LISTENER_UDP] = "udp", + [DNS_STUB_LISTENER_TCP] = "tcp", + [DNS_STUB_LISTENER_YES] = "yes", +}; +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dns_stub_listener_mode, DnsStubListenerMode, DNS_STUB_LISTENER_YES); diff --git a/src/resolve/resolved-dns-stub.h b/src/resolve/resolved-dns-stub.h new file mode 100644 index 0000000..3b9bf65 --- /dev/null +++ b/src/resolve/resolved-dns-stub.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hash-funcs.h" + +typedef struct DnsStubListenerExtra DnsStubListenerExtra; + +typedef enum DnsStubListenerMode { + DNS_STUB_LISTENER_NO, + DNS_STUB_LISTENER_UDP = 1 << 0, + DNS_STUB_LISTENER_TCP = 1 << 1, + DNS_STUB_LISTENER_YES = DNS_STUB_LISTENER_UDP | DNS_STUB_LISTENER_TCP, + 
_DNS_STUB_LISTENER_MODE_MAX, + _DNS_STUB_LISTENER_MODE_INVALID = -EINVAL, +} DnsStubListenerMode; + +#include "resolved-manager.h" + +struct DnsStubListenerExtra { + Manager *manager; + + DnsStubListenerMode mode; + + int family; + union in_addr_union address; + uint16_t port; + + sd_event_source *udp_event_source; + sd_event_source *tcp_event_source; + + Hashmap *queries_by_packet; +}; + +extern const struct hash_ops dns_stub_listener_extra_hash_ops; + +int dns_stub_listener_extra_new(Manager *m, DnsStubListenerExtra **ret); +DnsStubListenerExtra *dns_stub_listener_extra_free(DnsStubListenerExtra *p); +static inline uint16_t dns_stub_listener_extra_port(DnsStubListenerExtra *p) { + assert(p); + + return p->port > 0 ? p->port : 53; +} + +void manager_dns_stub_stop(Manager *m); +int manager_dns_stub_start(Manager *m); + +const char* dns_stub_listener_mode_to_string(DnsStubListenerMode p) _const_; +DnsStubListenerMode dns_stub_listener_mode_from_string(const char *s) _pure_; diff --git a/src/resolve/resolved-dns-synthesize.c b/src/resolve/resolved-dns-synthesize.c new file mode 100644 index 0000000..5bde29c --- /dev/null +++ b/src/resolve/resolved-dns-synthesize.c @@ -0,0 +1,571 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "env-util.h" +#include "hostname-util.h" +#include "local-addresses.h" +#include "missing_network.h" +#include "resolved-dns-synthesize.h" + +int dns_synthesize_family(uint64_t flags) { + + /* Picks an address family depending on set flags. This is + * purely for synthesized answers, where the family we return + * for the reply should match what was requested in the + * question, even though we are synthesizing the answer + * here. 
*/ + + if (!(flags & SD_RESOLVED_DNS)) { + if (flags & (SD_RESOLVED_LLMNR_IPV4|SD_RESOLVED_MDNS_IPV4)) + return AF_INET; + if (flags & (SD_RESOLVED_LLMNR_IPV6|SD_RESOLVED_MDNS_IPV6)) + return AF_INET6; + } + + return AF_UNSPEC; +} + +DnsProtocol dns_synthesize_protocol(uint64_t flags) { + + /* Similar as dns_synthesize_family() but does this for the + * protocol. If resolving via DNS was requested, we claim it + * was DNS. Similar, if nothing specific was + * requested. However, if only resolving via LLMNR was + * requested we return that. */ + + if (flags & SD_RESOLVED_DNS) + return DNS_PROTOCOL_DNS; + if (flags & SD_RESOLVED_LLMNR) + return DNS_PROTOCOL_LLMNR; + if (flags & SD_RESOLVED_MDNS) + return DNS_PROTOCOL_MDNS; + + return DNS_PROTOCOL_DNS; +} + +static int synthesize_localhost_rr(Manager *m, const DnsResourceKey *key, DnsAnswer **answer) { + int r; + + assert(m); + assert(key); + assert(answer); + + r = dns_answer_reserve(answer, 2); + if (r < 0) + return r; + + if (IN_SET(key->type, DNS_TYPE_A, DNS_TYPE_ANY)) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_A, dns_resource_key_name(key)); + if (!rr) + return -ENOMEM; + + rr->a.in_addr.s_addr = htobe32(INADDR_LOOPBACK); + + r = dns_answer_add(*answer, rr, LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return r; + } + + if (IN_SET(key->type, DNS_TYPE_AAAA, DNS_TYPE_ANY) && socket_ipv6_is_enabled()) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_AAAA, dns_resource_key_name(key)); + if (!rr) + return -ENOMEM; + + rr->aaaa.in6_addr = in6addr_loopback; + + r = dns_answer_add(*answer, rr, LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int answer_add_ptr(DnsAnswer **answer, const char *from, const char *to, int ifindex, DnsAnswerFlags flags) { + 
_cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_PTR, from); + if (!rr) + return -ENOMEM; + + rr->ptr.name = strdup(to); + if (!rr->ptr.name) + return -ENOMEM; + + return dns_answer_add(*answer, rr, ifindex, flags, NULL); +} + +static int synthesize_localhost_ptr(Manager *m, const DnsResourceKey *key, DnsAnswer **answer) { + int r; + + assert(m); + assert(key); + assert(answer); + + if (IN_SET(key->type, DNS_TYPE_PTR, DNS_TYPE_ANY)) { + r = dns_answer_reserve(answer, 1); + if (r < 0) + return r; + + r = answer_add_ptr(answer, dns_resource_key_name(key), "localhost", LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); + if (r < 0) + return r; + } + + return 0; +} + +static int answer_add_addresses_rr( + DnsAnswer **answer, + const char *name, + struct local_address *addresses, + unsigned n_addresses) { + + unsigned j; + int r; + + assert(answer); + assert(name); + + r = dns_answer_reserve(answer, n_addresses); + if (r < 0) + return r; + + for (j = 0; j < n_addresses; j++) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + r = dns_resource_record_new_address(&rr, addresses[j].family, &addresses[j].address, name); + if (r < 0) + return r; + + r = dns_answer_add(*answer, rr, addresses[j].ifindex, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int answer_add_addresses_ptr( + DnsAnswer **answer, + const char *name, + struct local_address *addresses, + unsigned n_addresses, + int af, const union in_addr_union *match) { + + bool added = false; + unsigned j; + int r; + + assert(answer); + assert(name); + + for (j = 0; j < n_addresses; j++) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + if (af != AF_UNSPEC) { + + if (addresses[j].family != af) + continue; + + if (match && !in_addr_equal(af, match, &addresses[j].address)) + continue; + } + + r = dns_answer_reserve(answer, 1); + if (r < 0) + return r; + + r = 
dns_resource_record_new_reverse(&rr, addresses[j].family, &addresses[j].address, name); + if (r < 0) + return r; + + r = dns_answer_add(*answer, rr, addresses[j].ifindex, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return r; + + added = true; + } + + return added; +} + +static int synthesize_system_hostname_rr(Manager *m, const DnsResourceKey *key, int ifindex, DnsAnswer **answer) { + _cleanup_free_ struct local_address *addresses = NULL; + int n = 0, af; + + assert(m); + assert(key); + assert(answer); + + af = dns_type_to_af(key->type); + if (af >= 0) { + n = local_addresses(m->rtnl, ifindex, af, &addresses); + if (n < 0) + return n; + + if (n == 0) { + struct local_address buffer[2]; + + /* If we have no local addresses then use ::1 and 127.0.0.2 as local ones. */ + + if (IN_SET(af, AF_INET, AF_UNSPEC)) + buffer[n++] = (struct local_address) { + .family = AF_INET, + .ifindex = LOOPBACK_IFINDEX, + .address.in.s_addr = htobe32(INADDR_LOCALADDRESS), + }; + + if (IN_SET(af, AF_INET6, AF_UNSPEC) && socket_ipv6_is_enabled()) + buffer[n++] = (struct local_address) { + .family = AF_INET6, + .ifindex = LOOPBACK_IFINDEX, + .address.in6 = in6addr_loopback, + }; + + return answer_add_addresses_rr(answer, + dns_resource_key_name(key), + buffer, n); + } + } + + return answer_add_addresses_rr(answer, dns_resource_key_name(key), addresses, n); +} + +static int synthesize_system_hostname_ptr(Manager *m, int af, const union in_addr_union *address, int ifindex, DnsAnswer **answer) { + _cleanup_free_ struct local_address *addresses = NULL; + bool added = false; + int n, r; + + assert(m); + assert(address); + assert(answer); + + if (af == AF_INET && address->in.s_addr == htobe32(INADDR_LOCALADDRESS)) { + + /* Always map the IPv4 address 127.0.0.2 to the local hostname, in addition to "localhost": */ + + r = dns_answer_reserve(answer, 4); + if (r < 0) + return r; + + r = answer_add_ptr(answer, "2.0.0.127.in-addr.arpa", m->full_hostname, LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); 
+ if (r < 0) + return r; + + r = answer_add_ptr(answer, "2.0.0.127.in-addr.arpa", m->llmnr_hostname, LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); + if (r < 0) + return r; + + r = answer_add_ptr(answer, "2.0.0.127.in-addr.arpa", m->mdns_hostname, LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); + if (r < 0) + return r; + + r = answer_add_ptr(answer, "2.0.0.127.in-addr.arpa", "localhost", LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); + if (r < 0) + return r; + + return 1; + } + + n = local_addresses(m->rtnl, ifindex, af, &addresses); + if (n <= 0) + return n; + + r = answer_add_addresses_ptr(answer, m->full_hostname, addresses, n, af, address); + if (r < 0) + return r; + if (r > 0) + added = true; + + r = answer_add_addresses_ptr(answer, m->llmnr_hostname, addresses, n, af, address); + if (r < 0) + return r; + if (r > 0) + added = true; + + r = answer_add_addresses_ptr(answer, m->mdns_hostname, addresses, n, af, address); + if (r < 0) + return r; + if (r > 0) + added = true; + + return added; +} + +static int synthesize_gateway_rr( + Manager *m, + const DnsResourceKey *key, + int ifindex, + int (*lookup)(sd_netlink *context, int ifindex, int af, struct local_address **ret), /* either local_gateways() or local_outbound() */ + DnsAnswer **answer) { + _cleanup_free_ struct local_address *addresses = NULL; + int n = 0, af, r; + + assert(m); + assert(key); + assert(lookup); + assert(answer); + + af = dns_type_to_af(key->type); + if (af >= 0) { + n = lookup(m->rtnl, ifindex, af, &addresses); + if (n < 0) /* < 0 means: error */ + return n; + + if (n == 0) { /* == 0 means we have no gateway */ + /* See if there's a gateway on the other protocol */ + if (af == AF_INET) + n = lookup(m->rtnl, ifindex, AF_INET6, NULL); + else { + assert(af == AF_INET6); + n = lookup(m->rtnl, ifindex, AF_INET, NULL); + } + if (n <= 0) /* error (if < 0) or really no gateway at all (if == 0) */ + return n; + + /* We have a gateway on the other protocol. 
Let's return > 0 without adding any RR to + * the answer, i.e. synthesize NODATA (and not NXDOMAIN!) */ + return 1; + } + } + + r = answer_add_addresses_rr(answer, dns_resource_key_name(key), addresses, n); + if (r < 0) + return r; + + return 1; /* > 0 means: we have some gateway */ +} + +static int synthesize_dns_stub_rr( + Manager *m, + const DnsResourceKey *key, + in_addr_t addr, + DnsAnswer **answer) { + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + int r; + + assert(m); + assert(key); + assert(answer); + + if (!IN_SET(key->type, DNS_TYPE_A, DNS_TYPE_ANY)) + return 1; /* we still consider ourselves the owner of this name */ + + r = dns_answer_reserve(answer, 1); + if (r < 0) + return r; + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_A, dns_resource_key_name(key)); + if (!rr) + return -ENOMEM; + + rr->a.in_addr.s_addr = htobe32(addr); + + r = dns_answer_add(*answer, rr, LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return r; + + return 1; +} + +static int synthesize_dns_stub_ptr( + Manager *m, + int af, + const union in_addr_union *address, + DnsAnswer **answer) { + + int r; + + assert(m); + assert(address); + assert(answer); + + if (af != AF_INET) + return 0; + + if (address->in.s_addr == htobe32(INADDR_DNS_STUB)) { + + r = dns_answer_reserve(answer, 1); + if (r < 0) + return r; + + r = answer_add_ptr(answer, "53.0.0.127.in-addr.arpa", "_localdnsstub", LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); + if (r < 0) + return r; + + return 1; + } + + if (address->in.s_addr == htobe32(INADDR_DNS_PROXY_STUB)) { + + r = dns_answer_reserve(answer, 1); + if (r < 0) + return r; + + r = answer_add_ptr(answer, "54.0.0.127.in-addr.arpa", "_localdnsproxy", LOOPBACK_IFINDEX, DNS_ANSWER_AUTHENTICATED); + if (r < 0) + return r; + + return 1; + } + + return 0; +} + +static int synthesize_gateway_ptr( + Manager *m, + int af, + const union in_addr_union *address, + int ifindex, + DnsAnswer **answer) { + + _cleanup_free_ 
struct local_address *addresses = NULL; + int n; + + assert(m); + assert(address); + assert(answer); + + n = local_gateways(m->rtnl, ifindex, af, &addresses); + if (n <= 0) + return n; + + return answer_add_addresses_ptr(answer, "_gateway", addresses, n, af, address); +} + +int dns_synthesize_answer( + Manager *m, + DnsQuestion *q, + int ifindex, + DnsAnswer **ret) { + + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnsResourceKey *key; + bool found = false, nxdomain = false; + int r; + + assert(m); + assert(q); + + DNS_QUESTION_FOREACH(key, q) { + union in_addr_union address; + const char *name; + int af; + + if (!IN_SET(key->class, DNS_CLASS_IN, DNS_CLASS_ANY)) + continue; + + name = dns_resource_key_name(key); + + if (dns_name_is_root(name)) { + /* Do nothing. */ + + } else if (dns_name_dont_resolve(name)) { + /* Synthesize NXDOMAIN for some of the domains in RFC6303 + RFC6761 */ + nxdomain = true; + continue; + + } else if (is_localhost(name)) { + + r = synthesize_localhost_rr(m, key, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize localhost RRs: %m"); + + } else if (manager_is_own_hostname(m, name)) { + + if (getenv_bool("SYSTEMD_RESOLVED_SYNTHESIZE_HOSTNAME") == 0) + continue; + r = synthesize_system_hostname_rr(m, key, ifindex, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize system hostname RRs: %m"); + + } else if (is_gateway_hostname(name)) { + + r = synthesize_gateway_rr(m, key, ifindex, local_gateways, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize gateway RRs: %m"); + if (r == 0) { /* if we have no gateway return NXDOMAIN */ + nxdomain = true; + continue; + } + + } else if (is_outbound_hostname(name)) { + + r = synthesize_gateway_rr(m, key, ifindex, local_outbounds, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize outbound RRs: %m"); + if (r == 0) { /* if we have no gateway return NXDOMAIN */ + nxdomain = true; + continue; + } + + } else if 
(is_dns_stub_hostname(name)) { + + r = synthesize_dns_stub_rr(m, key, INADDR_DNS_STUB, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize local DNS stub RRs: %m"); + + } else if (is_dns_proxy_stub_hostname(name)) { + + r = synthesize_dns_stub_rr(m, key, INADDR_DNS_PROXY_STUB, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize local DNS stub RRs: %m"); + + } else if ((dns_name_endswith(name, "127.in-addr.arpa") > 0 && + dns_name_equal(name, "2.0.0.127.in-addr.arpa") == 0 && + dns_name_equal(name, "53.0.0.127.in-addr.arpa") == 0 && + dns_name_equal(name, "54.0.0.127.in-addr.arpa") == 0) || + dns_name_equal(name, "1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa") > 0) { + + r = synthesize_localhost_ptr(m, key, &answer); + if (r < 0) + return log_error_errno(r, "Failed to synthesize localhost PTR RRs: %m"); + + } else if (dns_name_address(name, &af, &address) > 0) { + int v, w, u; + /* NOTE(review): this 'continue' also skips the gateway and stub PTR synthesis below, not just the + * system hostname one. Confirm that this is intended. */ + if (getenv_bool("SYSTEMD_RESOLVED_SYNTHESIZE_HOSTNAME") == 0) + continue; + + v = synthesize_system_hostname_ptr(m, af, &address, ifindex, &answer); + if (v < 0) + return log_error_errno(v, "Failed to synthesize system hostname PTR RR: %m"); + + w = synthesize_gateway_ptr(m, af, &address, ifindex, &answer); + if (w < 0) + return log_error_errno(w, "Failed to synthesize gateway hostname PTR RR: %m"); + + u = synthesize_dns_stub_ptr(m, af, &address, &answer); + if (u < 0) + return log_error_errno(u, "Failed to synthesize local stub hostname PTR RR: %m"); + + if (v == 0 && w == 0 && u == 0) /* This IP address is neither a local one, nor a gateway, nor a stub address */ + continue; + + /* Note that we never synthesize reverse PTR for _outbound, since those are local + * addresses and thus mapped to the local hostname anyway, hence they already have a + * mapping. 
*/ + + } else + continue; + + found = true; + } + + if (found) { + + if (ret) + *ret = TAKE_PTR(answer); + + return 1; + } else if (nxdomain) + return -ENXIO; + + return 0; +} diff --git a/src/resolve/resolved-dns-synthesize.h b/src/resolve/resolved-dns-synthesize.h new file mode 100644 index 0000000..bf271e8 --- /dev/null +++ b/src/resolve/resolved-dns-synthesize.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "resolved-dns-answer.h" +#include "resolved-dns-question.h" +#include "resolved-manager.h" + +int dns_synthesize_family(uint64_t flags); +DnsProtocol dns_synthesize_protocol(uint64_t flags); + +int dns_synthesize_answer(Manager *m, DnsQuestion *q, int ifindex, DnsAnswer **ret); diff --git a/src/resolve/resolved-dns-transaction.c b/src/resolve/resolved-dns-transaction.c new file mode 100644 index 0000000..8ff5653 --- /dev/null +++ b/src/resolve/resolved-dns-transaction.c @@ -0,0 +1,3670 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-messages.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "dns-domain.h" +#include "errno-list.h" +#include "errno-util.h" +#include "fd-util.h" +#include "glyph-util.h" +#include "random-util.h" +#include "resolved-dns-cache.h" +#include "resolved-dns-transaction.h" +#include "resolved-dnstls.h" +#include "resolved-llmnr.h" +#include "string-table.h" + +#define TRANSACTIONS_MAX 4096 +#define TRANSACTION_TCP_TIMEOUT_USEC (10U*USEC_PER_SEC) + +/* After how much time to repeat classic DNS requests */ +#define DNS_TIMEOUT_USEC (SD_RESOLVED_QUERY_TIMEOUT_USEC / DNS_TRANSACTION_ATTEMPTS_MAX) + +static void dns_transaction_reset_answer(DnsTransaction *t) { + assert(t); + + t->received = dns_packet_unref(t->received); + t->answer = dns_answer_unref(t->answer); + t->answer_rcode = 0; + t->answer_dnssec_result = _DNSSEC_RESULT_INVALID; + t->answer_source = _DNS_TRANSACTION_SOURCE_INVALID; + t->answer_query_flags = 0; + t->answer_nsec_ttl = UINT32_MAX; + 
t->answer_errno = 0; +} + +static void dns_transaction_flush_dnssec_transactions(DnsTransaction *t) { + DnsTransaction *z; + + assert(t); + + while ((z = set_steal_first(t->dnssec_transactions))) { + set_remove(z->notify_transactions, t); + set_remove(z->notify_transactions_done, t); + dns_transaction_gc(z); + } +} + +static void dns_transaction_close_connection( + DnsTransaction *t, + bool use_graveyard) { /* Set use_graveyard = false when you know the connection is already + * dead, for example because you got a connection error back from the + * kernel. In that case there's no point in keeping the fd around, + * hence don't. */ + int r; + + assert(t); + + if (t->stream) { + /* Let's detach the stream from our transaction, in case something else keeps a reference to it. */ + LIST_REMOVE(transactions_by_stream, t->stream->transactions, t); + + /* Remove packet in case it's still in the queue */ + dns_packet_unref(ordered_set_remove(t->stream->write_queue, t->sent)); + + t->stream = dns_stream_unref(t->stream); + } + + t->dns_udp_event_source = sd_event_source_disable_unref(t->dns_udp_event_source); + + /* If we have a UDP socket where we sent a packet, but never received one, then add it to the socket + * graveyard, instead of closing it right away. That way it will stick around for a moment longer, + * and the reply we might still get from the server will be eaten up instead of resulting in an ICMP + * port unreachable error message. 
*/ + + /* Skip the graveyard stuff when we're shutting down, since that requires running event loop */ + if (!t->scope->manager->event || sd_event_get_state(t->scope->manager->event) == SD_EVENT_FINISHED) + use_graveyard = false; + + if (use_graveyard && t->dns_udp_fd >= 0 && t->sent && !t->received) { + r = manager_add_socket_to_graveyard(t->scope->manager, t->dns_udp_fd); + if (r < 0) + log_debug_errno(r, "Failed to add UDP socket to graveyard, closing immediately: %m"); + else + TAKE_FD(t->dns_udp_fd); + } + + t->dns_udp_fd = safe_close(t->dns_udp_fd); +} + +static void dns_transaction_stop_timeout(DnsTransaction *t) { + assert(t); + + t->timeout_event_source = sd_event_source_disable_unref(t->timeout_event_source); +} + +DnsTransaction* dns_transaction_free(DnsTransaction *t) { + DnsQueryCandidate *c; + DnsZoneItem *i; + DnsTransaction *z; + + if (!t) + return NULL; + + log_debug("Freeing transaction %" PRIu16 ".", t->id); + + dns_transaction_close_connection(t, true); + dns_transaction_stop_timeout(t); + + dns_packet_unref(t->sent); + dns_transaction_reset_answer(t); + + dns_server_unref(t->server); + + if (t->scope) { + if (t->key) { + DnsTransaction *first; + + first = hashmap_get(t->scope->transactions_by_key, t->key); + LIST_REMOVE(transactions_by_key, first, t); + if (first) + hashmap_replace(t->scope->transactions_by_key, first->key, first); + else + hashmap_remove(t->scope->transactions_by_key, t->key); + } + + LIST_REMOVE(transactions_by_scope, t->scope->transactions, t); + + if (t->id != 0) + hashmap_remove(t->scope->manager->dns_transactions, UINT_TO_PTR(t->id)); + } + + while ((c = set_steal_first(t->notify_query_candidates))) + set_remove(c->transactions, t); + set_free(t->notify_query_candidates); + + while ((c = set_steal_first(t->notify_query_candidates_done))) + set_remove(c->transactions, t); + set_free(t->notify_query_candidates_done); + + while ((i = set_steal_first(t->notify_zone_items))) + i->probe_transaction = NULL; + 
set_free(t->notify_zone_items); + + while ((i = set_steal_first(t->notify_zone_items_done))) + i->probe_transaction = NULL; + set_free(t->notify_zone_items_done); + + while ((z = set_steal_first(t->notify_transactions))) + set_remove(z->dnssec_transactions, t); + set_free(t->notify_transactions); + + while ((z = set_steal_first(t->notify_transactions_done))) + set_remove(z->dnssec_transactions, t); + set_free(t->notify_transactions_done); + + dns_transaction_flush_dnssec_transactions(t); + set_free(t->dnssec_transactions); + + dns_answer_unref(t->validated_keys); + dns_resource_key_unref(t->key); + dns_packet_unref(t->bypass); + + return mfree(t); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsTransaction*, dns_transaction_free); + +DnsTransaction* dns_transaction_gc(DnsTransaction *t) { + assert(t); + + /* Returns !NULL if we can't gc yet. */ + + if (t->block_gc > 0) + return t; + + if (set_isempty(t->notify_query_candidates) && + set_isempty(t->notify_query_candidates_done) && + set_isempty(t->notify_zone_items) && + set_isempty(t->notify_zone_items_done) && + set_isempty(t->notify_transactions) && + set_isempty(t->notify_transactions_done)) + return dns_transaction_free(t); + + return t; +} + +static uint16_t pick_new_id(Manager *m) { + uint16_t new_id; + + /* Find a fresh, unused transaction id. Note that this loop is bounded because there's a limit on the + * number of transactions, and it's much lower than the space of IDs. 
*/ + + assert_cc(TRANSACTIONS_MAX < 0xFFFF); + + do + random_bytes(&new_id, sizeof(new_id)); + while (new_id == 0 || + hashmap_get(m->dns_transactions, UINT_TO_PTR(new_id))); + + return new_id; +} +/* Checks whether the specified key may be looked up on the specified scope. Returns 0 if so, -EINVAL if + * dns_type_is_valid_query() rejects the RR type, and -EOPNOTSUPP if dns_type_is_obsolete() flags the RR + * type, if the class is neither IN nor ANY, or if a DNSSEC RR type is requested on a scope whose protocol + * is not DNS_PROTOCOL_DNS. */ +static int key_ok( + DnsScope *scope, + DnsResourceKey *key) { + + /* Don't allow looking up invalid or pseudo RRs */ + if (!dns_type_is_valid_query(key->type)) + return -EINVAL; + if (dns_type_is_obsolete(key->type)) + return -EOPNOTSUPP; + + /* We only support the IN class (the ANY class is let through as well) */ + if (!IN_SET(key->class, DNS_CLASS_IN, DNS_CLASS_ANY)) + return -EOPNOTSUPP; + + /* Don't allow DNSSEC RRs to be looked up via LLMNR/mDNS. They don't really make sense + * there, and it speeds up our queries if we refuse this early */ + if (scope->protocol != DNS_PROTOCOL_DNS && + dns_type_is_dnssec(key->type)) + return -EOPNOTSUPP; + + return 0; +} + +int dns_transaction_new( + DnsTransaction **ret, + DnsScope *s, + DnsResourceKey *key, + DnsPacket *bypass, + uint64_t query_flags) { + + _cleanup_(dns_transaction_freep) DnsTransaction *t = NULL; + int r; + + assert(ret); + assert(s); + + if (key) { + assert(!bypass); + + r = key_ok(s, key); + if (r < 0) + return r; + } else { + DnsResourceKey *qk; + assert(bypass); + + r = dns_packet_validate_query(bypass); + if (r < 0) + return r; + + DNS_QUESTION_FOREACH(qk, bypass->question) { + r = key_ok(s, qk); + if (r < 0) + return r; + } + } + + if (hashmap_size(s->manager->dns_transactions) >= TRANSACTIONS_MAX) + return -EBUSY; + + r = hashmap_ensure_allocated(&s->manager->dns_transactions, NULL); + if (r < 0) + return r; + + if (key) { + r = hashmap_ensure_allocated(&s->transactions_by_key, &dns_resource_key_hash_ops); + if (r < 0) + return r; + } + + t = new(DnsTransaction, 1); + if (!t) + return -ENOMEM; + + *t = (DnsTransaction) { + .dns_udp_fd = -EBADF, + .answer_source = _DNS_TRANSACTION_SOURCE_INVALID, + .answer_dnssec_result = _DNSSEC_RESULT_INVALID, + .answer_nsec_ttl = UINT32_MAX, + .key = dns_resource_key_ref(key), + .query_flags = 
query_flags, + .bypass = dns_packet_ref(bypass), + .current_feature_level = _DNS_SERVER_FEATURE_LEVEL_INVALID, + .clamp_feature_level_servfail = _DNS_SERVER_FEATURE_LEVEL_INVALID, + .id = pick_new_id(s->manager), + }; + + r = hashmap_put(s->manager->dns_transactions, UINT_TO_PTR(t->id), t); + if (r < 0) { + t->id = 0; + return r; + } + + if (t->key) { + DnsTransaction *first; + + first = hashmap_get(s->transactions_by_key, t->key); + LIST_PREPEND(transactions_by_key, first, t); + + r = hashmap_replace(s->transactions_by_key, first->key, first); + if (r < 0) { + LIST_REMOVE(transactions_by_key, first, t); + return r; + } + } + + LIST_PREPEND(transactions_by_scope, s->transactions, t); + t->scope = s; + + s->manager->n_transactions_total++; + + if (ret) + *ret = t; + + TAKE_PTR(t); + return 0; +} + +static void dns_transaction_shuffle_id(DnsTransaction *t) { + uint16_t new_id; + assert(t); + + /* Pick a new ID for this transaction. */ + + new_id = pick_new_id(t->scope->manager); + assert_se(hashmap_remove_and_put(t->scope->manager->dns_transactions, UINT_TO_PTR(t->id), UINT_TO_PTR(new_id), t) >= 0); + + log_debug("Transaction %" PRIu16 " is now %" PRIu16 ".", t->id, new_id); + t->id = new_id; + + /* Make sure we generate a new packet with the new ID */ + t->sent = dns_packet_unref(t->sent); +} + +static void dns_transaction_tentative(DnsTransaction *t, DnsPacket *p) { + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + DnsZoneItem *z; + + assert(t); + assert(p); + assert(t->scope->protocol == DNS_PROTOCOL_LLMNR); + + if (manager_packet_from_local_address(t->scope->manager, p) != 0) + return; + + log_debug("Transaction %" PRIu16 " for <%s> on scope %s on %s/%s got tentative packet from %s.", + t->id, + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str), + dns_protocol_to_string(t->scope->protocol), + t->scope->link ? 
t->scope->link->ifname : "*", + af_to_name_short(t->scope->family), + IN_ADDR_TO_STRING(p->family, &p->sender)); + + /* RFC 4795, Section 4.1 says that the peer with the + * lexicographically smaller IP address loses. + * NOTE(review): the check and the log messages below treat the peer with the lexicographically + * larger address as the loser, which contradicts the sentence above. Confirm against the RFC + * which of the two is intended. */ + if (memcmp(&p->sender, &p->destination, FAMILY_ADDRESS_SIZE(p->family)) >= 0) { + log_debug("Peer has lexicographically larger IP address and thus lost in the conflict."); + return; + } + + log_debug("We have the lexicographically larger IP address and thus lost in the conflict."); + + t->block_gc++; + + while ((z = set_first(t->notify_zone_items))) { + /* First, make sure the zone item drops the reference + * to us */ + dns_zone_item_probe_stop(z); + + /* Secondly, report this as conflict, so that we might + * look for a different hostname */ + dns_zone_item_conflict(z); + } + t->block_gc--; + + dns_transaction_gc(t); +} + +void dns_transaction_complete(DnsTransaction *t, DnsTransactionState state) { + DnsQueryCandidate *c; + DnsZoneItem *z; + DnsTransaction *d; + const char *st; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + assert(t); + assert(!DNS_TRANSACTION_IS_LIVE(state)); + + if (state == DNS_TRANSACTION_DNSSEC_FAILED) { + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str); + + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_DNSSEC_FAILURE_STR, + LOG_MESSAGE("DNSSEC validation failed for question %s: %s", + key_str, dnssec_result_to_string(t->answer_dnssec_result)), + "DNS_TRANSACTION=%" PRIu16, t->id, + "DNS_QUESTION=%s", key_str, + "DNSSEC_RESULT=%s", dnssec_result_to_string(t->answer_dnssec_result), + "DNS_SERVER=%s", strna(dns_server_string_full(t->server)), + "DNS_SERVER_FEATURE_LEVEL=%s", dns_server_feature_level_to_string(t->server->possible_feature_level)); + } + + /* Note that this call might invalidate the query. Callers + * should hence not attempt to access the query or transaction + * after calling this function. 
*/ + + if (state == DNS_TRANSACTION_ERRNO) + st = errno_to_name(t->answer_errno); + else + st = dns_transaction_state_to_string(state); + + log_debug("%s transaction %" PRIu16 " for <%s> on scope %s on %s/%s now complete with <%s> from %s (%s; %s).", + t->bypass ? "Bypass" : "Regular", + t->id, + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str), + dns_protocol_to_string(t->scope->protocol), + t->scope->link ? t->scope->link->ifname : "*", + af_to_name_short(t->scope->family), + st, + t->answer_source < 0 ? "none" : dns_transaction_source_to_string(t->answer_source), + FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE) ? "not validated" : + (FLAGS_SET(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED) ? "authenticated" : "unsigned"), + FLAGS_SET(t->answer_query_flags, SD_RESOLVED_CONFIDENTIAL) ? "confidential" : "non-confidential"); + + t->state = state; + + dns_transaction_close_connection(t, true); + dns_transaction_stop_timeout(t); + + /* Notify all queries that are interested, but make sure the + * transaction isn't freed while we are still looking at it */ + t->block_gc++; + + SET_FOREACH_MOVE(c, t->notify_query_candidates_done, t->notify_query_candidates) + dns_query_candidate_notify(c); + SWAP_TWO(t->notify_query_candidates, t->notify_query_candidates_done); + + SET_FOREACH_MOVE(z, t->notify_zone_items_done, t->notify_zone_items) + dns_zone_item_notify(z); + SWAP_TWO(t->notify_zone_items, t->notify_zone_items_done); + if (t->probing && t->state == DNS_TRANSACTION_ATTEMPTS_MAX_REACHED) + (void) dns_scope_announce(t->scope, false); + + SET_FOREACH_MOVE(d, t->notify_transactions_done, t->notify_transactions) + dns_transaction_notify(d, t); + SWAP_TWO(t->notify_transactions, t->notify_transactions_done); + + t->block_gc--; + dns_transaction_gc(t); +} + +static void dns_transaction_complete_errno(DnsTransaction *t, int error) { + assert(t); + assert(error != 0); + + t->answer_errno = abs(error); + dns_transaction_complete(t, 
DNS_TRANSACTION_ERRNO); +} + +static int dns_transaction_pick_server(DnsTransaction *t) { + DnsServer *server; + + assert(t); + assert(t->scope->protocol == DNS_PROTOCOL_DNS); + + /* Pick a DNS server and a feature level for it. */ + + server = dns_scope_get_dns_server(t->scope); + if (!server) + return -ESRCH; + + /* If we changed the server invalidate the feature level clamping, as the new server might have completely + * different properties. */ + if (server != t->server) + t->clamp_feature_level_servfail = _DNS_SERVER_FEATURE_LEVEL_INVALID; + + t->current_feature_level = dns_server_possible_feature_level(server); + + /* Clamp the feature level if that is requested. */ + if (t->clamp_feature_level_servfail != _DNS_SERVER_FEATURE_LEVEL_INVALID && + t->current_feature_level > t->clamp_feature_level_servfail) + t->current_feature_level = t->clamp_feature_level_servfail; + + log_debug("Using feature level %s for transaction %u.", dns_server_feature_level_to_string(t->current_feature_level), t->id); + + if (server == t->server) + return 0; + + dns_server_unref(t->server); + t->server = dns_server_ref(server); + + t->n_picked_servers ++; + + log_debug("Using DNS server %s for transaction %u.", strna(dns_server_string_full(t->server)), t->id); + + return 1; +} + +static void dns_transaction_retry(DnsTransaction *t, bool next_server) { + int r; + + assert(t); + + /* Retries the transaction as it is, possibly on a different server */ + + if (next_server && t->scope->protocol == DNS_PROTOCOL_DNS) + log_debug("Retrying transaction %" PRIu16 ", after switching servers.", t->id); + else + log_debug("Retrying transaction %" PRIu16 ".", t->id); + + /* Before we try again, switch to a new server. 
*/ + if (next_server) + dns_scope_next_dns_server(t->scope, t->server); + + r = dns_transaction_go(t); + if (r < 0) + dns_transaction_complete_errno(t, r); +} + +static bool dns_transaction_limited_retry(DnsTransaction *t) { + assert(t); + + /* If we haven't tried all different servers yet, let's try again with a different server */ + + if (t->n_picked_servers >= dns_scope_get_n_dns_servers(t->scope)) + return false; + + dns_transaction_retry(t, /* next_server= */ true); + return true; +} + +static int dns_transaction_maybe_restart(DnsTransaction *t) { + int r; + + assert(t); + + /* Restarts the transaction, under a new ID if the feature level of the server changed since we first + * tried, without changing DNS server. Returns > 0 if the transaction was restarted, 0 if not. */ + + if (!t->server) + return 0; + + if (t->current_feature_level <= dns_server_possible_feature_level(t->server)) + return 0; + + /* The server's current feature level is lower than when we sent the original query. We learnt something from + the response or possibly an auxiliary DNSSEC response that we didn't know before. We take that as reason to + restart the whole transaction. This is a good idea to deal with servers that respond rubbish if we include + OPT RR or DO bit. One of these cases is documented here, for example: + https://open.nlnetlabs.nl/pipermail/dnssec-trigger/2014-November/000376.html */ + + log_debug("Server feature level is now lower than when we began our transaction. Restarting with new ID."); + dns_transaction_shuffle_id(t); + + r = dns_transaction_go(t); + if (r < 0) + return r; + + return 1; +} + +static void on_transaction_stream_error(DnsTransaction *t, int error) { + assert(t); + + dns_transaction_close_connection(t, true); + + if (ERRNO_IS_DISCONNECT(error)) { + if (t->scope->protocol == DNS_PROTOCOL_LLMNR) { + /* If the LLMNR/TCP connection failed, the host doesn't support LLMNR, and we cannot answer the + * question on this scope. 
*/ + dns_transaction_complete(t, DNS_TRANSACTION_NOT_FOUND); + return; + } + + dns_transaction_retry(t, true); + return; + } + if (error != 0) + dns_transaction_complete_errno(t, error); +} +/* Handles a reply packet p that arrived on stream s for transaction t. The connection is closed first; a + * packet that fails dns_packet_validate_reply() completes the transaction as INVALID_REPLY, anything + * else is handed to dns_transaction_process_reply(). Always returns 0. */ +static int dns_transaction_on_stream_packet(DnsTransaction *t, DnsStream *s, DnsPacket *p) { + bool encrypted; + + assert(t); + assert(s); + assert(p); + /* Remember this before closing the connection below, since that drops the transaction's reference + * on its stream (presumably s may not be valid anymore afterwards). */ + encrypted = s->encrypted; + + dns_transaction_close_connection(t, true); + + if (dns_packet_validate_reply(p) <= 0) { + log_debug("Invalid TCP reply packet."); + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return 0; + } + + dns_scope_check_conflicts(t->scope, p); + + t->block_gc++; + dns_transaction_process_reply(t, p, encrypted); + t->block_gc--; + + /* If the response wasn't useful, then complete the transaction + * now. After all, we are at the worst feature set now with TCP + * sockets, and there's really no point in retrying. */ + if (t->state == DNS_TRANSACTION_PENDING) + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + else + dns_transaction_gc(t); + + return 0; +} + +static int on_stream_complete(DnsStream *s, int error) { + assert(s); + + if (ERRNO_IS_DISCONNECT(error) && s->protocol != DNS_PROTOCOL_LLMNR) { + log_debug_errno(error, "Connection failure for DNS TCP stream: %m"); + + if (s->transactions) { + DnsTransaction *t; + + t = s->transactions; + dns_server_packet_lost(t->server, IPPROTO_TCP, t->current_feature_level); + } + } + + if (error != 0) { + /* First, detach the stream from the server. Otherwise, transactions attached to this stream + * may be restarted by on_transaction_stream_error() below with this stream. */ + dns_stream_detach(s); + + /* Do not use LIST_FOREACH() here, as + * on_transaction_stream_error() + * -> dns_transaction_complete_errno() + * -> dns_transaction_free() + * may free multiple transactions in the list. 
*/ + DnsTransaction *t; + while ((t = s->transactions)) + on_transaction_stream_error(t, error); + } + + return 0; +} + +static int on_stream_packet(DnsStream *s, DnsPacket *p) { + DnsTransaction *t; + + assert(s); + assert(s->manager); + assert(p); + + t = hashmap_get(s->manager->dns_transactions, UINT_TO_PTR(DNS_PACKET_ID(p))); + if (t && t->stream == s) /* Validate that the stream we got this on actually is the stream the + * transaction was using. */ + return dns_transaction_on_stream_packet(t, s, p); + + /* Ignore incorrect transaction id as an old transaction can have been canceled. */ + log_debug("Received unexpected TCP reply packet with id %" PRIu16 ", ignoring.", DNS_PACKET_ID(p)); + return 0; +} + +static uint16_t dns_transaction_port(DnsTransaction *t) { + assert(t); + + if (t->server->port > 0) + return t->server->port; + + return DNS_SERVER_FEATURE_LEVEL_IS_TLS(t->current_feature_level) ? 853 : 53; +} + +static int dns_transaction_emit_tcp(DnsTransaction *t) { + usec_t stream_timeout_usec = DNS_STREAM_DEFAULT_TIMEOUT_USEC; + _cleanup_(dns_stream_unrefp) DnsStream *s = NULL; + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + DnsStreamType type; + int r; + + assert(t); + assert(t->sent); + + dns_transaction_close_connection(t, true); + + switch (t->scope->protocol) { + + case DNS_PROTOCOL_DNS: + r = dns_transaction_pick_server(t); + if (r < 0) + return r; + + if (manager_server_is_stub(t->scope->manager, t->server)) + return -ELOOP; + + if (!t->bypass) { + if (!dns_server_dnssec_supported(t->server) && dns_type_is_dnssec(dns_transaction_key(t)->type)) + return -EOPNOTSUPP; + + r = dns_server_adjust_opt(t->server, t->sent, t->current_feature_level); + if (r < 0) + return r; + } + + if (t->server->stream && (DNS_SERVER_FEATURE_LEVEL_IS_TLS(t->current_feature_level) == t->server->stream->encrypted)) + s = dns_stream_ref(t->server->stream); + else + fd = dns_scope_socket_tcp(t->scope, AF_UNSPEC, NULL, t->server, dns_transaction_port(t), &sa); + 
+ /* Lower timeout in DNS-over-TLS opportunistic mode. In environments where DoT is blocked + * without ICMP response overly long delays when contacting DoT servers are nasty, in + * particular if multiple DNS servers are defined which we try in turn and all are + * blocked. Hence, substantially lower the timeout in that case. */ + if (DNS_SERVER_FEATURE_LEVEL_IS_TLS(t->current_feature_level) && + dns_server_get_dns_over_tls_mode(t->server) == DNS_OVER_TLS_OPPORTUNISTIC) + stream_timeout_usec = DNS_STREAM_OPPORTUNISTIC_TLS_TIMEOUT_USEC; + + type = DNS_STREAM_LOOKUP; + break; + + case DNS_PROTOCOL_LLMNR: + /* When we already received a reply to this (but it was truncated), send to its sender address */ + if (t->received) + fd = dns_scope_socket_tcp(t->scope, t->received->family, &t->received->sender, NULL, t->received->sender_port, &sa); + else { + union in_addr_union address; + int family = AF_UNSPEC; + + /* Otherwise, try to talk to the owner of + * the IP address, in case this is a reverse + * PTR lookup */ + + r = dns_name_address(dns_resource_key_name(dns_transaction_key(t)), &family, &address); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + if (family != t->scope->family) + return -ESRCH; + + fd = dns_scope_socket_tcp(t->scope, family, &address, NULL, LLMNR_PORT, &sa); + } + + type = DNS_STREAM_LLMNR_SEND; + break; + + default: + return -EAFNOSUPPORT; + } + + if (!s) { + if (fd < 0) + return fd; + + r = dns_stream_new(t->scope->manager, &s, type, t->scope->protocol, fd, &sa, + on_stream_packet, on_stream_complete, stream_timeout_usec); + if (r < 0) + return r; + /* NOTE(review): presumably the stream owns the fd from here on; resetting the variable keeps the + * _cleanup_close_ handler from closing it on return. Confirm against dns_stream_new(). */ + fd = -EBADF; + +#if ENABLE_DNS_OVER_TLS + if (t->scope->protocol == DNS_PROTOCOL_DNS && + DNS_SERVER_FEATURE_LEVEL_IS_TLS(t->current_feature_level)) { + + assert(t->server); + r = dnstls_stream_connect_tls(s, t->server); + if (r < 0) + return r; + } +#endif + + if (t->server) { + dns_server_unref_stream(t->server); + s->server = dns_server_ref(t->server); + t->server->stream = 
dns_stream_ref(s); + } + + /* The interface index is difficult to determine if we are + * connecting to the local host, hence fill this in right away + * instead of determining it from the socket */ + s->ifindex = dns_scope_ifindex(t->scope); + } + + t->stream = TAKE_PTR(s); + LIST_PREPEND(transactions_by_stream, t->stream->transactions, t); + + r = dns_stream_write_packet(t->stream, t->sent); + if (r < 0) { + dns_transaction_close_connection(t, /* use_graveyard= */ false); + return r; + } + + dns_transaction_reset_answer(t); + + t->tried_stream = true; + + return 0; +} + +static void dns_transaction_cache_answer(DnsTransaction *t) { + assert(t); + + /* For mDNS we cache whenever we get the packet, rather than + * in each transaction. */ + if (!IN_SET(t->scope->protocol, DNS_PROTOCOL_DNS, DNS_PROTOCOL_LLMNR)) + return; + + /* Caching disabled? */ + if (t->scope->manager->enable_cache == DNS_CACHE_MODE_NO) + return; + + /* If validation is turned off for this transaction, but DNSSEC is on, then let's not cache this */ + if (FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE) && t->scope->dnssec_mode != DNSSEC_NO) + return; + + /* Packet from localhost? */ + if (!t->scope->manager->cache_from_localhost && + in_addr_is_localhost(t->received->family, &t->received->sender) != 0) + return; + + dns_cache_put(&t->scope->cache, + t->scope->manager->enable_cache, + t->scope->protocol, + dns_transaction_key(t), + t->answer_rcode, + t->answer, + DNS_PACKET_CD(t->received) ? t->received : NULL, /* only cache full packets with CD on, + * since our use case for caching them + * is "bypass" mode which is only + * enabled for CD packets. 
*/ + t->answer_query_flags, + t->answer_dnssec_result, + t->answer_nsec_ttl, + t->received->family, + &t->received->sender, + t->scope->manager->stale_retention_usec); +} + +static bool dns_transaction_dnssec_is_live(DnsTransaction *t) { + DnsTransaction *dt; + + assert(t); + + SET_FOREACH(dt, t->dnssec_transactions) + if (DNS_TRANSACTION_IS_LIVE(dt->state)) + return true; + + return false; +} + +static int dns_transaction_dnssec_ready(DnsTransaction *t) { + DnsTransaction *dt; + int r; + + assert(t); + + /* Checks whether the auxiliary DNSSEC transactions of our transaction have completed, or are still + * ongoing. Returns 0, if we aren't ready for the DNSSEC validation, positive if we are. */ + + SET_FOREACH(dt, t->dnssec_transactions) { + + switch (dt->state) { + + case DNS_TRANSACTION_NULL: + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + /* Still ongoing */ + return 0; + + case DNS_TRANSACTION_RCODE_FAILURE: + if (!IN_SET(dt->answer_rcode, DNS_RCODE_NXDOMAIN, DNS_RCODE_SERVFAIL)) { + log_debug("Auxiliary DNSSEC RR query failed with rcode=%s.", FORMAT_DNS_RCODE(dt->answer_rcode)); + goto fail; + } + + /* Fall-through: NXDOMAIN/SERVFAIL is good enough for us. This is because some DNS servers + * erroneously return NXDOMAIN/SERVFAIL for empty non-terminals (Akamai...) or missing DS + * records (Facebook), and we need to handle that nicely, when asking for parent SOA or similar + * RRs to make unsigned proofs. */ + + case DNS_TRANSACTION_SUCCESS: + /* All good. 
*/ + break; + + case DNS_TRANSACTION_DNSSEC_FAILED: + /* We handle DNSSEC failures different from other errors, as we care about the DNSSEC + * validation result */ + + log_debug("Auxiliary DNSSEC RR query failed validation: %s", dnssec_result_to_string(dt->answer_dnssec_result)); + t->answer_dnssec_result = dt->answer_dnssec_result; /* Copy error code over */ + dns_transaction_complete(t, DNS_TRANSACTION_DNSSEC_FAILED); + return 0; + + default: + log_debug("Auxiliary DNSSEC RR query failed with %s", dns_transaction_state_to_string(dt->state)); + goto fail; + } + } + + /* All is ready, we can go and validate */ + return 1; + +fail: + /* Some auxiliary DNSSEC transaction failed for some reason. Maybe we learned something about the + * server due to this failure, and the feature level is now different? Let's see and restart the + * transaction if so. If not, let's propagate the auxiliary failure. + * + * This is particularly relevant if an auxiliary request figured out that DNSSEC doesn't work, and we + * are in permissive DNSSEC mode, and thus should restart things without DNSSEC magic. */ + r = dns_transaction_maybe_restart(t); + if (r < 0) + return r; + if (r > 0) + return 0; /* don't validate just yet, we restarted things */ + + t->answer_dnssec_result = DNSSEC_FAILED_AUXILIARY; + dns_transaction_complete(t, DNS_TRANSACTION_DNSSEC_FAILED); + return 0; +} + +static void dns_transaction_process_dnssec(DnsTransaction *t) { + int r; + + assert(t); + + /* Are there ongoing DNSSEC transactions? If so, let's wait for them. */ + r = dns_transaction_dnssec_ready(t); + if (r < 0) + goto fail; + if (r == 0) /* We aren't ready yet (or one of our auxiliary transactions failed, and we shouldn't validate now */ + return; + + /* See if we learnt things from the additional DNSSEC transactions, that we didn't know before, and better + * restart the lookup immediately. */ + r = dns_transaction_maybe_restart(t); + if (r < 0) + goto fail; + if (r > 0) /* Transaction got restarted... 
*/ + return; + + /* All our auxiliary DNSSEC transactions are complete now. Try + * to validate our RRset now. */ + r = dns_transaction_validate_dnssec(t); + if (r == -EBADMSG) { + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return; + } + if (r < 0) + goto fail; + + if (t->answer_dnssec_result == DNSSEC_INCOMPATIBLE_SERVER && + t->scope->dnssec_mode == DNSSEC_YES) { + + /* We are not in automatic downgrade mode, and the server is bad. Let's try a different server, maybe + * that works. */ + + if (dns_transaction_limited_retry(t)) + return; + + /* OK, let's give up, apparently all servers we tried didn't work. */ + dns_transaction_complete(t, DNS_TRANSACTION_DNSSEC_FAILED); + return; + } + + if (!IN_SET(t->answer_dnssec_result, + _DNSSEC_RESULT_INVALID, /* No DNSSEC validation enabled */ + DNSSEC_VALIDATED, /* Answer is signed and validated successfully */ + DNSSEC_UNSIGNED, /* Answer is right-fully unsigned */ + DNSSEC_INCOMPATIBLE_SERVER)) { /* Server does not do DNSSEC (Yay, we are downgrade attack vulnerable!) */ + dns_transaction_complete(t, DNS_TRANSACTION_DNSSEC_FAILED); + return; + } + + if (t->answer_dnssec_result == DNSSEC_INCOMPATIBLE_SERVER) + dns_server_warn_downgrade(t->server); + + dns_transaction_cache_answer(t); + + if (t->answer_rcode == DNS_RCODE_SUCCESS) + dns_transaction_complete(t, DNS_TRANSACTION_SUCCESS); + else + dns_transaction_complete(t, DNS_TRANSACTION_RCODE_FAILURE); + + return; + +fail: + dns_transaction_complete_errno(t, r); +} + +static int dns_transaction_has_positive_answer(DnsTransaction *t, DnsAnswerFlags *flags) { + int r; + + assert(t); + + /* Checks whether the answer is positive, i.e. 
either a direct + * answer to the question, or a CNAME/DNAME for it */ + + r = dns_answer_match_key(t->answer, dns_transaction_key(t), flags); + if (r != 0) + return r; + + r = dns_answer_find_cname_or_dname(t->answer, dns_transaction_key(t), NULL, flags); + if (r != 0) + return r; + + return false; +} + +static int dns_transaction_fix_rcode(DnsTransaction *t) { + int r; + + assert(t); + + /* Fix up the RCODE to SUCCESS if we get at least one matching RR in a response. Note that this contradicts the + * DNS RFCs a bit. Specifically, RFC 6604 Section 3 clarifies that the RCODE shall say something about a + * CNAME/DNAME chain element coming after the last chain element contained in the message, and not the first + * one included. However, it also indicates that not all DNS servers implement this correctly. Moreover, when + * using DNSSEC we usually only can prove the first element of a CNAME/DNAME chain anyway, hence let's settle + * on always processing the RCODE as referring to the immediate look-up we do, i.e. the first element of a + * CNAME/DNAME chain. This way, we uniformly handle CNAME/DNAME chains, regardless if the DNS server + * incorrectly implements RCODE, whether DNSSEC is in use, or whether the DNS server only supplied us with an + * incomplete CNAME/DNAME chain. + * + * Or in other words: if we get at least one positive reply in a message we patch NXDOMAIN to become SUCCESS, + * and then rely on the CNAME chasing logic to figure out that there's actually a CNAME error with a new + * lookup. 
*/ + + if (t->answer_rcode != DNS_RCODE_NXDOMAIN) + return 0; + + r = dns_transaction_has_positive_answer(t, NULL); + if (r <= 0) + return r; + + t->answer_rcode = DNS_RCODE_SUCCESS; + return 0; +} + +void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypted) { + bool retry_with_tcp = false; + int r; + + assert(t); + assert(p); + assert(t->scope); + assert(t->scope->manager); + + if (t->state != DNS_TRANSACTION_PENDING) + return; + + /* Increment the total failure counter only when it is the first attempt at querying and the upstream + * server returns a failure response code. This ensures a more accurate count of the number of queries + * that received a failure response code, as it doesn't consider retries. */ + + if (t->n_attempts == 1 && !IN_SET(DNS_PACKET_RCODE(p), DNS_RCODE_SUCCESS, DNS_RCODE_NXDOMAIN)) + t->scope->manager->n_failure_responses_total++; + + /* Note that this call might invalidate the query. Callers + * should hence not attempt to access the query or transaction + * after calling this function. */ + + log_debug("Processing incoming packet of size %zu on transaction %" PRIu16" (rcode=%s).", + p->size, + t->id, FORMAT_DNS_RCODE(DNS_PACKET_RCODE(p))); + + switch (t->scope->protocol) { + + case DNS_PROTOCOL_LLMNR: + /* For LLMNR we will not accept any packets from other interfaces */ + + if (p->ifindex != dns_scope_ifindex(t->scope)) + return; + + if (p->family != t->scope->family) + return; + + /* Tentative packets are not full responses but still + * useful for identifying uniqueness conflicts during + * probing. 
*/ + if (DNS_PACKET_LLMNR_T(p)) { + dns_transaction_tentative(t, p); + return; + } + + break; + + case DNS_PROTOCOL_MDNS: + /* For mDNS we will not accept any packets from other interfaces */ + + if (p->ifindex != dns_scope_ifindex(t->scope)) + return; + + if (p->family != t->scope->family) + return; + + break; + + case DNS_PROTOCOL_DNS: + /* Note that we do not need to verify the + * addresses/port numbers of incoming traffic, as we + * invoked connect() on our UDP socket in which case + * the kernel already does the needed verification for + * us. */ + break; + + default: + assert_not_reached(); + } + + if (t->received != p) + DNS_PACKET_REPLACE(t->received, dns_packet_ref(p)); + + t->answer_source = DNS_TRANSACTION_NETWORK; + + if (p->ipproto == IPPROTO_TCP) { + if (DNS_PACKET_TC(p)) { + /* Truncated via TCP? Somebody must be fucking with us */ + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return; + } + + if (DNS_PACKET_ID(p) != t->id) { + /* Not the reply to our query? Somebody must be fucking with us */ + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return; + } + } + + switch (t->scope->protocol) { + + case DNS_PROTOCOL_DNS: + assert(t->server); + + if (!t->bypass && + IN_SET(DNS_PACKET_RCODE(p), DNS_RCODE_FORMERR, DNS_RCODE_SERVFAIL, DNS_RCODE_NOTIMP)) { + + /* Request failed, immediately try again with reduced features */ + + if (t->current_feature_level <= DNS_SERVER_FEATURE_LEVEL_UDP) { + + /* This was already at UDP feature level? If so, it doesn't make sense to downgrade + * this transaction anymore, but let's see if it might make sense to send the request + * to a different DNS server instead. If not let's process the response, and accept the + * rcode. Note that we don't retry on TCP, since that's a suitable way to mitigate + * packet loss, but is not going to give us better rcodes should we actually have + * managed to get them already at UDP level. 
*/ + + if (dns_transaction_limited_retry(t)) + return; + + /* Give up, accept the rcode */ + log_debug("Server returned error: %s", FORMAT_DNS_RCODE(DNS_PACKET_RCODE(p))); + break; + } + + /* SERVFAIL can happen for many reasons and may be transient. + * To avoid unnecessary downgrades retry once with the initial level. + * Check for clamp_feature_level_servfail having an invalid value as a sign that this is the + * first attempt to downgrade. If so, clamp to the current value so that the transaction + * is retried without actually downgrading. If the next try also fails we will downgrade by + * hitting the else branch below. */ + if (DNS_PACKET_RCODE(p) == DNS_RCODE_SERVFAIL && + t->clamp_feature_level_servfail < 0) { + t->clamp_feature_level_servfail = t->current_feature_level; + log_debug("Server returned error %s, retrying transaction.", + FORMAT_DNS_RCODE(DNS_PACKET_RCODE(p))); + } else { + /* Reduce this feature level by one and try again. */ + switch (t->current_feature_level) { + case DNS_SERVER_FEATURE_LEVEL_TLS_DO: + t->clamp_feature_level_servfail = DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN; + break; + case DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN + 1: + /* Skip plain TLS when TLS is not supported */ + t->clamp_feature_level_servfail = DNS_SERVER_FEATURE_LEVEL_TLS_PLAIN - 1; + break; + default: + t->clamp_feature_level_servfail = t->current_feature_level - 1; + } + + log_debug("Server returned error %s, retrying transaction with reduced feature level %s.", + FORMAT_DNS_RCODE(DNS_PACKET_RCODE(p)), + dns_server_feature_level_to_string(t->clamp_feature_level_servfail)); + } + + dns_transaction_retry(t, false /* use the same server */); + return; + } + + if (DNS_PACKET_RCODE(p) == DNS_RCODE_REFUSED) { + /* This server refused our request? 
If so, try again, use a different server */ + log_debug("Server returned REFUSED, switching servers, and retrying."); + + if (dns_transaction_limited_retry(t)) + return; + + break; + } + + if (DNS_PACKET_TC(p)) + dns_server_packet_truncated(t->server, t->current_feature_level); + + break; + + case DNS_PROTOCOL_LLMNR: + case DNS_PROTOCOL_MDNS: + dns_scope_packet_received(t->scope, p->timestamp - t->start_usec); + break; + + default: + assert_not_reached(); + } + + if (DNS_PACKET_TC(p)) { + + /* Truncated packets for mDNS are not allowed. Give up immediately. */ + if (t->scope->protocol == DNS_PROTOCOL_MDNS) { + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return; + } + + /* Response was truncated, let's try again with good old TCP */ + log_debug("Reply truncated, retrying via TCP."); + retry_with_tcp = true; + + } else if (t->scope->protocol == DNS_PROTOCOL_DNS && + DNS_PACKET_IS_FRAGMENTED(p)) { + + /* Report the fragment size, so that we downgrade from LARGE to regular EDNS0 if needed */ + if (t->server) + dns_server_packet_udp_fragmented(t->server, dns_packet_size_unfragmented(p)); + + if (t->current_feature_level > DNS_SERVER_FEATURE_LEVEL_UDP) { + /* Packet was fragmented. Let's retry with TCP to avoid fragmentation attack + * issues. (We don't do that on the lowest feature level however, since crappy DNS + * servers often do not implement TCP, hence falling back to TCP on fragmentation is + * counter-productive there.) */ + + log_debug("Reply fragmented, retrying via TCP. (Largest fragment size: %zu; Datagram size: %zu)", + p->fragsize, p->size); + retry_with_tcp = true; + } + } + + if (retry_with_tcp) { + r = dns_transaction_emit_tcp(t); + if (r == -ESRCH) { + /* No servers found? Damn! 
*/ + dns_transaction_complete(t, DNS_TRANSACTION_NO_SERVERS); + return; + } + if (r == -EOPNOTSUPP) { + /* Tried to ask for DNSSEC RRs, on a server that doesn't do DNSSEC */ + dns_transaction_complete(t, DNS_TRANSACTION_RR_TYPE_UNSUPPORTED); + return; + } + if (r < 0) { + /* On LLMNR, if we cannot connect to the host, + * we immediately give up */ + if (t->scope->protocol != DNS_PROTOCOL_DNS) + goto fail; + + /* On DNS, couldn't send? Try immediately again, with a new server */ + if (dns_transaction_limited_retry(t)) + return; + + /* No new server to try, give up */ + dns_transaction_complete(t, DNS_TRANSACTION_ATTEMPTS_MAX_REACHED); + } + + return; + } + + /* After the superficial checks, actually parse the message. */ + r = dns_packet_extract(p); + if (r < 0) { + if (t->server) { + dns_server_packet_invalid(t->server, t->current_feature_level); + + r = dns_transaction_maybe_restart(t); + if (r < 0) + goto fail; + if (r > 0) /* Transaction got restarted... */ + return; + } + + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return; + } + + if (t->server) { + /* Report that we successfully received a valid packet with a good rcode after we initially got a bad + * rcode and subsequently downgraded the protocol */ + + if (IN_SET(DNS_PACKET_RCODE(p), DNS_RCODE_SUCCESS, DNS_RCODE_NXDOMAIN) && + t->clamp_feature_level_servfail != _DNS_SERVER_FEATURE_LEVEL_INVALID) + dns_server_packet_rcode_downgrade(t->server, t->clamp_feature_level_servfail); + + /* Report that the OPT RR was missing */ + if (!p->opt) + dns_server_packet_bad_opt(t->server, t->current_feature_level); + + /* Report that the server didn't copy our query DO bit from request to response */ + if (DNS_PACKET_DO(t->sent) && !DNS_PACKET_DO(t->received)) + dns_server_packet_do_off(t->server, t->current_feature_level); + + /* Report that we successfully received a packet. We keep track of the largest packet + * size/fragment size we got. 
Which is useful for announcing the EDNS(0) packet size we can + * receive to our server. */ + dns_server_packet_received(t->server, p->ipproto, t->current_feature_level, dns_packet_size_unfragmented(p)); + } + + /* See if we know things we didn't know before that indicate we better restart the lookup immediately. */ + r = dns_transaction_maybe_restart(t); + if (r < 0) + goto fail; + if (r > 0) /* Transaction got restarted... */ + return; + + /* When dealing with protocols other than mDNS only consider responses with equivalent query section + * to the request. For mDNS this check doesn't make sense, because the section 6 of RFC6762 states + * that "Multicast DNS responses MUST NOT contain any questions in the Question Section". */ + if (t->scope->protocol != DNS_PROTOCOL_MDNS) { + r = dns_packet_is_reply_for(p, dns_transaction_key(t)); + if (r < 0) + goto fail; + if (r == 0) { + dns_transaction_complete(t, DNS_TRANSACTION_INVALID_REPLY); + return; + } + } + + /* Install the answer as answer to the transaction. We ref the answer twice here: the main `answer` + * field is later replaced by the DNSSEC validated subset. The 'answer_auxiliary' field carries the + * original complete record set, including RRSIG and friends. We use this when passing data to + * clients that ask for DNSSEC metadata. */ + DNS_ANSWER_REPLACE(t->answer, dns_answer_ref(p->answer)); + t->answer_rcode = DNS_PACKET_RCODE(p); + t->answer_dnssec_result = _DNSSEC_RESULT_INVALID; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, false); + SET_FLAG(t->answer_query_flags, SD_RESOLVED_CONFIDENTIAL, encrypted); + + r = dns_transaction_fix_rcode(t); + if (r < 0) + goto fail; + + /* Block GC while starting requests for additional DNSSEC RRs */ + t->block_gc++; + r = dns_transaction_request_dnssec_keys(t); + t->block_gc--; + + /* Maybe the transaction is ready for GC'ing now? If so, free it and return. 
*/ + if (!dns_transaction_gc(t)) + return; + + /* Requesting additional keys might have resulted in this transaction to fail, since the auxiliary + * request failed for some reason. If so, we are not in pending state anymore, and we should exit + * quickly. */ + if (t->state != DNS_TRANSACTION_PENDING) + return; + if (r < 0) + goto fail; + if (r > 0) { + /* There are DNSSEC transactions pending now. Update the state accordingly. */ + t->state = DNS_TRANSACTION_VALIDATING; + dns_transaction_close_connection(t, true); + dns_transaction_stop_timeout(t); + return; + } + + dns_transaction_process_dnssec(t); + return; + +fail: + dns_transaction_complete_errno(t, r); +} + +static int on_dns_packet(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + DnsTransaction *t = ASSERT_PTR(userdata); + int r; + + assert(t->scope); + + r = manager_recv(t->scope->manager, fd, DNS_PROTOCOL_DNS, &p); + if (r < 0) { + if (ERRNO_IS_DISCONNECT(r)) { + usec_t usec; + + /* UDP connection failures get reported via ICMP and then are possibly delivered to us on the + * next recvmsg(). Treat this like a lost packet. 
*/ + + log_debug_errno(r, "Connection failure for DNS UDP packet: %m"); + assert_se(sd_event_now(t->scope->manager->event, CLOCK_BOOTTIME, &usec) >= 0); + dns_server_packet_lost(t->server, IPPROTO_UDP, t->current_feature_level); + + dns_transaction_close_connection(t, /* use_graveyard = */ false); + + if (dns_transaction_limited_retry(t)) /* Try a different server */ + return 0; + } + dns_transaction_complete_errno(t, r); + return 0; + } + if (r == 0) + /* Spurious wakeup without any data */ + return 0; + + r = dns_packet_validate_reply(p); + if (r < 0) { + log_debug_errno(r, "Received invalid DNS packet as response, ignoring: %m"); + return 0; + } + if (r == 0) { + log_debug("Received inappropriate DNS packet as response, ignoring."); + return 0; + } + + if (DNS_PACKET_ID(p) != t->id) { + log_debug("Received packet with incorrect transaction ID, ignoring."); + return 0; + } + + dns_transaction_process_reply(t, p, false); + return 0; +} + +static int dns_transaction_emit_udp(DnsTransaction *t) { + int r; + + assert(t); + + if (t->scope->protocol == DNS_PROTOCOL_DNS) { + + r = dns_transaction_pick_server(t); + if (r < 0) + return r; + + if (manager_server_is_stub(t->scope->manager, t->server)) + return -ELOOP; + + if (t->current_feature_level < DNS_SERVER_FEATURE_LEVEL_UDP || DNS_SERVER_FEATURE_LEVEL_IS_TLS(t->current_feature_level)) + return -EAGAIN; /* Sorry, can't do UDP, try TCP! */ + + if (!t->bypass && !dns_server_dnssec_supported(t->server) && dns_type_is_dnssec(dns_transaction_key(t)->type)) + return -EOPNOTSUPP; + + if (r > 0 || t->dns_udp_fd < 0) { /* Server changed, or no connection yet. 
*/ + int fd; + + dns_transaction_close_connection(t, true); + + /* Before we allocate a new UDP socket, let's process the graveyard a bit to free some fds */ + manager_socket_graveyard_process(t->scope->manager); + + fd = dns_scope_socket_udp(t->scope, t->server); + if (fd < 0) + return fd; + + r = sd_event_add_io(t->scope->manager->event, &t->dns_udp_event_source, fd, EPOLLIN, on_dns_packet, t); + if (r < 0) { + safe_close(fd); + return r; + } + + (void) sd_event_source_set_description(t->dns_udp_event_source, "dns-transaction-udp"); + t->dns_udp_fd = fd; + } + + if (!t->bypass) { + r = dns_server_adjust_opt(t->server, t->sent, t->current_feature_level); + if (r < 0) + return r; + } + } else + dns_transaction_close_connection(t, true); + + r = dns_scope_emit_udp(t->scope, t->dns_udp_fd, t->server ? t->server->family : AF_UNSPEC, t->sent); + if (r < 0) + return r; + + dns_transaction_reset_answer(t); + + return 0; +} + +static int on_transaction_timeout(sd_event_source *s, usec_t usec, void *userdata) { + DnsTransaction *t = ASSERT_PTR(userdata); + + assert(s); + + t->seen_timeout = true; + + if (t->initial_jitter_scheduled && !t->initial_jitter_elapsed) { + log_debug("Initial jitter phase for transaction %" PRIu16 " elapsed.", t->id); + t->initial_jitter_elapsed = true; + } else { + /* Timeout reached? Increase the timeout for the server used */ + switch (t->scope->protocol) { + + case DNS_PROTOCOL_DNS: + assert(t->server); + dns_server_packet_lost(t->server, t->stream ? 
IPPROTO_TCP : IPPROTO_UDP, t->current_feature_level); + break; + + case DNS_PROTOCOL_LLMNR: + case DNS_PROTOCOL_MDNS: + dns_scope_packet_lost(t->scope, usec - t->start_usec); + break; + + default: + assert_not_reached(); + } + + log_debug("Timeout reached on transaction %" PRIu16 ".", t->id); + } + + dns_transaction_retry(t, /* next_server= */ true); /* try a different server, but given this means + * packet loss, let's do so even if we already + * tried a bunch */ + return 0; +} + +static int dns_transaction_setup_timeout( + DnsTransaction *t, + usec_t timeout_usec /* relative */, + usec_t next_usec /* CLOCK_BOOTTIME */) { + + int r; + + assert(t); + + dns_transaction_stop_timeout(t); + + r = sd_event_add_time_relative( + t->scope->manager->event, + &t->timeout_event_source, + CLOCK_BOOTTIME, + timeout_usec, 0, + on_transaction_timeout, t); + if (r < 0) + return r; + + (void) sd_event_source_set_description(t->timeout_event_source, "dns-transaction-timeout"); + + t->next_attempt_after = next_usec; + t->state = DNS_TRANSACTION_PENDING; + return 0; +} + +static usec_t transaction_get_resend_timeout(DnsTransaction *t) { + assert(t); + assert(t->scope); + + switch (t->scope->protocol) { + + case DNS_PROTOCOL_DNS: + + /* When we do TCP, grant a much longer timeout, as in this case there's no need for us to quickly + * resend, as the kernel does that anyway for us, and we really don't want to interrupt it in that + * needlessly. */ + if (t->stream) + return TRANSACTION_TCP_TIMEOUT_USEC; + + return DNS_TIMEOUT_USEC; + + case DNS_PROTOCOL_MDNS: + if (t->probing) + return MDNS_PROBING_INTERVAL_USEC; + + /* See RFC 6762 Section 5.1 suggests that timeout should be a few seconds. 
*/ + assert(t->n_attempts > 0); + return (1 << (t->n_attempts - 1)) * USEC_PER_SEC; + + case DNS_PROTOCOL_LLMNR: + return t->scope->resend_timeout; + + default: + assert_not_reached(); + } +} + +static void dns_transaction_randomize_answer(DnsTransaction *t) { + int r; + + assert(t); + + /* Randomizes the order of the answer array. This is done for all cached responses, so that we return + * a different order each time. We do this only for DNS traffic, in order to do some minimal, crappy + * load balancing. We don't do this for LLMNR or mDNS, since the order (preferring link-local + * addresses, and such like) might have meaning there, and load balancing is pointless. */ + + if (t->scope->protocol != DNS_PROTOCOL_DNS) + return; + + /* No point in randomizing, if there's just one RR */ + if (dns_answer_size(t->answer) <= 1) + return; + + r = dns_answer_reserve_or_clone(&t->answer, 0); + if (r < 0) /* If this fails, just don't randomize, this is non-essential stuff after all */ + return (void) log_debug_errno(r, "Failed to clone answer record, not randomizing RR order of answer: %m"); + + dns_answer_randomize(t->answer); +} + +static int dns_transaction_prepare(DnsTransaction *t, usec_t ts) { + int r; + + assert(t); + + /* Returns 0 if dns_transaction_complete() has been called. In that case the transaction and query + * candidate objects may have been invalidated and must not be accessed. Returns 1 if the transaction + * has been prepared. */ + + dns_transaction_stop_timeout(t); + + if (t->n_attempts == 1 && t->seen_timeout) + t->scope->manager->n_timeouts_total++; + + if (!dns_scope_network_good(t->scope)) { + dns_transaction_complete(t, DNS_TRANSACTION_NETWORK_DOWN); + return 0; + } + + if (t->n_attempts >= TRANSACTION_ATTEMPTS_MAX(t->scope->protocol)) { + DnsTransactionState result; + + if (t->scope->protocol == DNS_PROTOCOL_LLMNR) + /* If we didn't find anything on LLMNR, it's not an error, but a failure to resolve + * the name. 
*/ + result = DNS_TRANSACTION_NOT_FOUND; + else + result = DNS_TRANSACTION_ATTEMPTS_MAX_REACHED; + + dns_transaction_complete(t, result); + return 0; + } + + if (t->scope->protocol == DNS_PROTOCOL_LLMNR && t->tried_stream) { + /* If we already tried via a stream, then we don't + * retry on LLMNR. See RFC 4795, Section 2.7. */ + dns_transaction_complete(t, DNS_TRANSACTION_ATTEMPTS_MAX_REACHED); + return 0; + } + + t->n_attempts++; + t->start_usec = ts; + + dns_transaction_reset_answer(t); + dns_transaction_flush_dnssec_transactions(t); + + /* Check the trust anchor. Do so only on classic DNS, since DNSSEC does not apply otherwise. */ + if (t->scope->protocol == DNS_PROTOCOL_DNS && + !FLAGS_SET(t->query_flags, SD_RESOLVED_NO_TRUST_ANCHOR)) { + r = dns_trust_anchor_lookup_positive(&t->scope->manager->trust_anchor, dns_transaction_key(t), &t->answer); + if (r < 0) + return r; + if (r > 0) { + t->answer_rcode = DNS_RCODE_SUCCESS; + t->answer_source = DNS_TRANSACTION_TRUST_ANCHOR; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED|SD_RESOLVED_CONFIDENTIAL, true); + dns_transaction_complete(t, DNS_TRANSACTION_SUCCESS); + return 0; + } + + if (dns_name_is_root(dns_resource_key_name(dns_transaction_key(t))) && + dns_transaction_key(t)->type == DNS_TYPE_DS) { + + /* Hmm, this is a request for the root DS? A DS RR doesn't exist in the root zone, + * and if our trust anchor didn't know it either, this means we cannot do any DNSSEC + * logic anymore. */ + + if (t->scope->dnssec_mode == DNSSEC_ALLOW_DOWNGRADE) { + /* We are in downgrade mode. In this case, synthesize an unsigned empty + * response, so that the any lookup depending on this one can continue + * assuming there was no DS, and hence the root zone was unsigned. 
*/ + + t->answer_rcode = DNS_RCODE_SUCCESS; + t->answer_source = DNS_TRANSACTION_TRUST_ANCHOR; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, false); + SET_FLAG(t->answer_query_flags, SD_RESOLVED_CONFIDENTIAL, true); + dns_transaction_complete(t, DNS_TRANSACTION_SUCCESS); + } else + /* If we are not in downgrade mode, then fail the lookup, because we cannot + * reasonably answer it. There might be DS RRs, but we don't know them, and + * the DNS server won't tell them to us (and even if it would, we couldn't + * validate and trust them. */ + dns_transaction_complete(t, DNS_TRANSACTION_NO_TRUST_ANCHOR); + + return 0; + } + } + + /* Check the zone. */ + if (!FLAGS_SET(t->query_flags, SD_RESOLVED_NO_ZONE)) { + r = dns_zone_lookup(&t->scope->zone, dns_transaction_key(t), dns_scope_ifindex(t->scope), &t->answer, NULL, NULL); + if (r < 0) + return r; + if (r > 0) { + t->answer_rcode = DNS_RCODE_SUCCESS; + t->answer_source = DNS_TRANSACTION_ZONE; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED|SD_RESOLVED_CONFIDENTIAL, true); + dns_transaction_complete(t, DNS_TRANSACTION_SUCCESS); + return 0; + } + } + + /* Check the cache. */ + if (!FLAGS_SET(t->query_flags, SD_RESOLVED_NO_CACHE)) { + + /* Before trying the cache, let's make sure we figured out a server to use. Should this cause + * a change of server this might flush the cache. */ + (void) dns_scope_get_dns_server(t->scope); + + /* Let's then prune all outdated entries */ + dns_cache_prune(&t->scope->cache); + + /* For the initial attempt or when no stale data is requested, disable serve stale + * and answer the question from the cache (honors ttl property). 
+ * On the second attempt, if StaleRetentionSec is greater than zero, + * try to answer the question using stale date (honors until property) */ + uint64_t query_flags = t->query_flags; + if (t->n_attempts == 1 || t->scope->manager->stale_retention_usec == 0) + query_flags |= SD_RESOLVED_NO_STALE; + + r = dns_cache_lookup( + &t->scope->cache, + dns_transaction_key(t), + query_flags, + &t->answer_rcode, + &t->answer, + &t->received, + &t->answer_query_flags, + &t->answer_dnssec_result); + if (r < 0) + return r; + if (r > 0) { + dns_transaction_randomize_answer(t); + + if (t->bypass && t->scope->protocol == DNS_PROTOCOL_DNS && !t->received) + /* When bypass mode is on, do not use cached data unless it came with a full + * packet. */ + dns_transaction_reset_answer(t); + else { + if (t->n_attempts > 1 && !FLAGS_SET(query_flags, SD_RESOLVED_NO_STALE)) { + + if (t->answer_rcode == DNS_RCODE_SUCCESS) { + if (t->seen_timeout) + t->scope->manager->n_timeouts_served_stale_total++; + else + t->scope->manager->n_failure_responses_served_stale_total++; + } + + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + log_debug("Serve Stale response rcode=%s for %s", + FORMAT_DNS_RCODE(t->answer_rcode), + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str)); + } + + t->answer_source = DNS_TRANSACTION_CACHE; + if (t->answer_rcode == DNS_RCODE_SUCCESS) + dns_transaction_complete(t, DNS_TRANSACTION_SUCCESS); + else + dns_transaction_complete(t, DNS_TRANSACTION_RCODE_FAILURE); + return 0; + } + } + } + + if (FLAGS_SET(t->query_flags, SD_RESOLVED_NO_NETWORK)) { + dns_transaction_complete(t, DNS_TRANSACTION_NO_SOURCE); + return 0; + } + + return 1; +} + +static int dns_packet_append_zone(DnsPacket *p, DnsTransaction *t, DnsResourceKey *k, unsigned *nscount) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + bool tentative; + int r; + + assert(p); + assert(t); + assert(k); + + if (k->type != DNS_TYPE_ANY) + return 0; + + r = dns_zone_lookup(&t->scope->zone, k, 
t->scope->link->ifindex, &answer, NULL, &tentative); + if (r < 0) + return r; + + return dns_packet_append_answer(p, answer, nscount); +} + +static int mdns_make_dummy_packet(DnsTransaction *t, DnsPacket **ret_packet, Set **ret_keys) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + _cleanup_set_free_ Set *keys = NULL; + bool add_known_answers = false; + unsigned qdcount; + usec_t ts; + int r; + + assert(t); + assert(t->scope); + assert(t->scope->protocol == DNS_PROTOCOL_MDNS); + assert(ret_packet); + assert(ret_keys); + + r = dns_packet_new_query(&p, t->scope->protocol, 0, false); + if (r < 0) + return r; + + r = dns_packet_append_key(p, dns_transaction_key(t), 0, NULL); + if (r < 0) + return r; + + qdcount = 1; + + if (dns_key_is_shared(dns_transaction_key(t))) + add_known_answers = true; + + r = dns_packet_append_zone(p, t, dns_transaction_key(t), NULL); + if (r < 0) + return r; + + /* Save appended keys */ + r = set_ensure_put(&keys, &dns_resource_key_hash_ops, dns_transaction_key(t)); + if (r < 0) + return r; + + assert_se(sd_event_now(t->scope->manager->event, CLOCK_BOOTTIME, &ts) >= 0); + + LIST_FOREACH(transactions_by_scope, other, t->scope->transactions) { + + /* Skip ourselves */ + if (other == t) + continue; + + if (other->state != DNS_TRANSACTION_PENDING) + continue; + + if (other->next_attempt_after > ts) + continue; + + if (!set_contains(keys, dns_transaction_key(other))) { + size_t saved_packet_size; + + r = dns_packet_append_key(p, dns_transaction_key(other), 0, &saved_packet_size); + /* If we can't stuff more questions into the packet, just give up. + * One of the 'other' transactions will fire later and take care of the rest. 
*/ + if (r == -EMSGSIZE) + break; + if (r < 0) + return r; + + r = dns_packet_append_zone(p, t, dns_transaction_key(other), NULL); + if (r == -EMSGSIZE) { + dns_packet_truncate(p, saved_packet_size); + break; + } + if (r < 0) + return r; + + r = set_ensure_put(&keys, &dns_resource_key_hash_ops, dns_transaction_key(other)); + if (r < 0) + return r; + } + + r = dns_transaction_prepare(other, ts); + if (r < 0) + return r; + if (r == 0) + /* In this case, not only this transaction, but multiple transactions may be + * freed. Hence, we need to restart the loop. */ + return -EAGAIN; + + usec_t timeout = transaction_get_resend_timeout(other); + r = dns_transaction_setup_timeout(other, timeout, usec_add(ts, timeout)); + if (r < 0) + return r; + + if (dns_key_is_shared(dns_transaction_key(other))) + add_known_answers = true; + + qdcount++; + if (qdcount >= UINT16_MAX) + break; + } + + DNS_PACKET_HEADER(p)->qdcount = htobe16(qdcount); + + /* Append known answers section if we're asking for any shared record */ + if (add_known_answers) { + r = dns_cache_export_shared_to_packet(&t->scope->cache, p, ts, 0); + if (r < 0) + return r; + } + + *ret_packet = TAKE_PTR(p); + *ret_keys = TAKE_PTR(keys); + return add_known_answers; +} + +static int dns_transaction_make_packet_mdns(DnsTransaction *t) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL, *dummy = NULL; + _cleanup_set_free_ Set *keys = NULL; + bool add_known_answers; + DnsResourceKey *k; + unsigned c; + int r; + + assert(t); + assert(t->scope->protocol == DNS_PROTOCOL_MDNS); + + /* Discard any previously prepared packet, so we can start over and coalesce again */ + t->sent = dns_packet_unref(t->sent); + + /* First, create a dummy packet to calculate the number of known answers to be appended in the first packet. */ + for (;;) { + r = mdns_make_dummy_packet(t, &dummy, &keys); + if (r == -EAGAIN) + continue; + if (r < 0) + return r; + + add_known_answers = r; + break; + } + + /* Then, create actual packet. 
*/ + r = dns_packet_new_query(&p, t->scope->protocol, 0, false); + if (r < 0) + return r; + + /* Questions */ + c = 0; + SET_FOREACH(k, keys) { + r = dns_packet_append_key(p, k, 0, NULL); + if (r < 0) + return r; + c++; + } + DNS_PACKET_HEADER(p)->qdcount = htobe16(c); + + /* Known answers */ + if (add_known_answers) { + usec_t ts; + + assert_se(sd_event_now(t->scope->manager->event, CLOCK_BOOTTIME, &ts) >= 0); + + r = dns_cache_export_shared_to_packet(&t->scope->cache, p, ts, be16toh(DNS_PACKET_HEADER(dummy)->ancount)); + if (r < 0) + return r; + } + + /* Authorities */ + c = 0; + SET_FOREACH(k, keys) { + r = dns_packet_append_zone(p, t, k, &c); + if (r < 0) + return r; + } + DNS_PACKET_HEADER(p)->nscount = htobe16(c); + + t->sent = TAKE_PTR(p); + return 0; +} + +static int dns_transaction_make_packet(DnsTransaction *t) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + int r; + + assert(t); + + if (t->scope->protocol == DNS_PROTOCOL_MDNS) + return dns_transaction_make_packet_mdns(t); + + if (t->sent) + return 0; + + if (t->bypass && t->bypass->protocol == t->scope->protocol) { + /* If bypass logic is enabled and the protocol if the original packet and our scope match, + * take the original packet, copy it, and patch in our new ID */ + r = dns_packet_dup(&p, t->bypass); + if (r < 0) + return r; + } else { + r = dns_packet_new_query( + &p, t->scope->protocol, + /* min_alloc_dsize = */ 0, + /* dnssec_cd = */ !FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE) && + t->scope->dnssec_mode != DNSSEC_NO); + if (r < 0) + return r; + + r = dns_packet_append_key(p, dns_transaction_key(t), 0, NULL); + if (r < 0) + return r; + + DNS_PACKET_HEADER(p)->qdcount = htobe16(1); + } + + DNS_PACKET_HEADER(p)->id = t->id; + + t->sent = TAKE_PTR(p); + return 0; +} + +int dns_transaction_go(DnsTransaction *t) { + usec_t ts; + int r; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + assert(t); + + /* Returns > 0 if the transaction is now pending, returns 0 if could be processed 
immediately and has + * finished now. In the latter case, the transaction and query candidate objects must not be accessed. + */ + + assert_se(sd_event_now(t->scope->manager->event, CLOCK_BOOTTIME, &ts) >= 0); + + r = dns_transaction_prepare(t, ts); + if (r <= 0) + return r; + + log_debug("Firing %s transaction %" PRIu16 " for <%s> scope %s on %s/%s (validate=%s).", + t->bypass ? "bypass" : "regular", + t->id, + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str), + dns_protocol_to_string(t->scope->protocol), + t->scope->link ? t->scope->link->ifname : "*", + af_to_name_short(t->scope->family), + yes_no(!FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE))); + + if (!t->initial_jitter_scheduled && + IN_SET(t->scope->protocol, DNS_PROTOCOL_LLMNR, DNS_PROTOCOL_MDNS)) { + usec_t jitter; + + /* RFC 4795 Section 2.7 suggests all LLMNR queries should be delayed by a random time from 0 to + * JITTER_INTERVAL. + * RFC 6762 Section 8.1 suggests initial probe queries should be delayed by a random time from + * 0 to 250ms. 
*/ + + t->initial_jitter_scheduled = true; + t->n_attempts = 0; + + switch (t->scope->protocol) { + + case DNS_PROTOCOL_LLMNR: + jitter = random_u64_range(LLMNR_JITTER_INTERVAL_USEC); + break; + + case DNS_PROTOCOL_MDNS: + if (t->probing) + jitter = random_u64_range(MDNS_PROBING_INTERVAL_USEC); + else + jitter = 0; + break; + default: + assert_not_reached(); + } + + r = dns_transaction_setup_timeout(t, jitter, ts); + if (r < 0) + return r; + + log_debug("Delaying %s transaction %" PRIu16 " for " USEC_FMT "us.", + dns_protocol_to_string(t->scope->protocol), + t->id, + jitter); + return 1; + } + + /* Otherwise, we need to ask the network */ + r = dns_transaction_make_packet(t); + if (r < 0) + return r; + + if (t->scope->protocol == DNS_PROTOCOL_LLMNR && + (dns_name_endswith(dns_resource_key_name(dns_transaction_key(t)), "in-addr.arpa") > 0 || + dns_name_endswith(dns_resource_key_name(dns_transaction_key(t)), "ip6.arpa") > 0)) { + + /* RFC 4795, Section 2.4. says reverse lookups shall + * always be made via TCP on LLMNR */ + r = dns_transaction_emit_tcp(t); + } else { + /* Try via UDP, and if that fails due to large size or lack of + * support try via TCP */ + r = dns_transaction_emit_udp(t); + if (r == -EMSGSIZE) + log_debug("Sending query via TCP since it is too large."); + else if (r == -EAGAIN) + log_debug("Sending query via TCP since UDP isn't supported or DNS-over-TLS is selected."); + else if (r == -EPERM) + log_debug("Sending query via TCP since UDP is blocked."); + if (IN_SET(r, -EMSGSIZE, -EAGAIN, -EPERM)) + r = dns_transaction_emit_tcp(t); + } + if (r == -ELOOP) { + if (t->scope->protocol != DNS_PROTOCOL_DNS) + return r; + + /* One of our own stub listeners */ + log_debug_errno(r, "Detected that specified DNS server is our own extra listener, switching DNS servers."); + + dns_scope_next_dns_server(t->scope, t->server); + + if (dns_scope_get_dns_server(t->scope) == t->server) { + log_debug_errno(r, "Still pointing to extra listener after switching DNS 
servers, refusing operation."); + dns_transaction_complete(t, DNS_TRANSACTION_STUB_LOOP); + return 0; + } + + return dns_transaction_go(t); + } + if (r == -ESRCH) { + /* No servers to send this to? */ + dns_transaction_complete(t, DNS_TRANSACTION_NO_SERVERS); + return 0; + } + if (r == -EOPNOTSUPP) { + /* Tried to ask for DNSSEC RRs, on a server that doesn't do DNSSEC */ + dns_transaction_complete(t, DNS_TRANSACTION_RR_TYPE_UNSUPPORTED); + return 0; + } + if (t->scope->protocol == DNS_PROTOCOL_LLMNR && ERRNO_IS_NEG_DISCONNECT(r)) { + /* On LLMNR, if we cannot connect to a host via TCP when doing reverse lookups. This means we cannot + * answer this request with this protocol. */ + dns_transaction_complete(t, DNS_TRANSACTION_NOT_FOUND); + return 0; + } + if (r < 0) { + if (t->scope->protocol != DNS_PROTOCOL_DNS) + return r; + + /* Couldn't send? Try immediately again, with a new server */ + dns_scope_next_dns_server(t->scope, t->server); + + return dns_transaction_go(t); + } + + usec_t timeout = transaction_get_resend_timeout(t); + r = dns_transaction_setup_timeout(t, timeout, usec_add(ts, timeout)); + if (r < 0) + return r; + + return 1; +} + +static int dns_transaction_find_cyclic(DnsTransaction *t, DnsTransaction *aux) { + DnsTransaction *n; + int r; + + assert(t); + assert(aux); + + /* Try to find cyclic dependencies between transaction objects */ + + if (t == aux) + return 1; + + SET_FOREACH(n, aux->dnssec_transactions) { + r = dns_transaction_find_cyclic(t, n); + if (r != 0) + return r; + } + + return 0; +} + +static int dns_transaction_add_dnssec_transaction(DnsTransaction *t, DnsResourceKey *key, DnsTransaction **ret) { + _cleanup_(dns_transaction_gcp) DnsTransaction *aux = NULL; + int r; + + assert(t); + assert(ret); + assert(key); + + aux = dns_scope_find_transaction(t->scope, key, t->query_flags); + if (!aux) { + r = dns_transaction_new(&aux, t->scope, key, NULL, t->query_flags); + if (r < 0) + return r; + } else { + if 
(set_contains(t->dnssec_transactions, aux)) { + *ret = aux; + return 0; + } + + r = dns_transaction_find_cyclic(t, aux); + if (r < 0) + return r; + if (r > 0) { + char s[DNS_RESOURCE_KEY_STRING_MAX], saux[DNS_RESOURCE_KEY_STRING_MAX]; + + return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), + "Potential cyclic dependency, refusing to add transaction %" PRIu16 " (%s) as dependency for %" PRIu16 " (%s).", + aux->id, + dns_resource_key_to_string(dns_transaction_key(t), s, sizeof s), + t->id, + dns_resource_key_to_string(dns_transaction_key(aux), saux, sizeof saux)); + } + } + + r = set_ensure_allocated(&aux->notify_transactions_done, NULL); + if (r < 0) + return r; + + r = set_ensure_put(&t->dnssec_transactions, NULL, aux); + if (r < 0) + return r; + + r = set_ensure_put(&aux->notify_transactions, NULL, t); + if (r < 0) { + (void) set_remove(t->dnssec_transactions, aux); + return r; + } + + *ret = TAKE_PTR(aux); + return 1; +} + +static int dns_transaction_request_dnssec_rr(DnsTransaction *t, DnsResourceKey *key) { + _cleanup_(dns_answer_unrefp) DnsAnswer *a = NULL; + DnsTransaction *aux; + int r; + + assert(t); + assert(key); + + /* Try to get the data from the trust anchor */ + r = dns_trust_anchor_lookup_positive(&t->scope->manager->trust_anchor, key, &a); + if (r < 0) + return r; + if (r > 0) { + r = dns_answer_extend(&t->validated_keys, a); + if (r < 0) + return r; + + return 0; + } + + /* This didn't work, ask for it via the network/cache then. */ + r = dns_transaction_add_dnssec_transaction(t, key, &aux); + if (r == -ELOOP) /* This would result in a cyclic dependency */ + return 0; + if (r < 0) + return r; + + if (aux->state == DNS_TRANSACTION_NULL) { + r = dns_transaction_go(aux); + if (r < 0) + return r; + } + + return 1; +} + +static int dns_transaction_negative_trust_anchor_lookup(DnsTransaction *t, const char *name) { + int r; + + assert(t); + + /* Check whether the specified name is in the NTA + * database, either in the global one, or the link-local + * one. 
*/ + + r = dns_trust_anchor_lookup_negative(&t->scope->manager->trust_anchor, name); + if (r != 0) + return r; + + if (!t->scope->link) + return 0; + + return link_negative_trust_anchor_lookup(t->scope->link, name); +} + +static int dns_transaction_has_negative_answer(DnsTransaction *t) { + int r; + + assert(t); + + /* Checks whether the answer is negative, and lacks NSEC/NSEC3 + * RRs to prove it */ + + r = dns_transaction_has_positive_answer(t, NULL); + if (r < 0) + return r; + if (r > 0) + return false; + + /* Is this key explicitly listed as a negative trust anchor? + * If so, it's nothing we need to care about */ + r = dns_transaction_negative_trust_anchor_lookup(t, dns_resource_key_name(dns_transaction_key(t))); + if (r < 0) + return r; + return !r; +} + +static int dns_transaction_is_primary_response(DnsTransaction *t, DnsResourceRecord *rr) { + int r; + + assert(t); + assert(rr); + + /* Check if the specified RR is the "primary" response, + * i.e. either matches the question precisely or is a + * CNAME/DNAME for it. */ + + r = dns_resource_key_match_rr(dns_transaction_key(t), rr, NULL); + if (r != 0) + return r; + + return dns_resource_key_match_cname_or_dname(dns_transaction_key(t), rr->key, NULL); +} + +static bool dns_transaction_dnssec_supported(DnsTransaction *t) { + assert(t); + + /* Checks whether our transaction's DNS server is assumed to be compatible with DNSSEC. Returns false as soon + * as we changed our mind about a server, and now believe it is incompatible with DNSSEC. */ + + if (t->scope->protocol != DNS_PROTOCOL_DNS) + return false; + + /* If we have picked no server, then we are working from the cache or some other source, and DNSSEC might well + * be supported, hence return true. 
*/ + if (!t->server) + return true; + + /* Note that we do not check the feature level actually used for the transaction but instead the feature level + * the server is known to support currently, as the transaction feature level might be lower than what the + * server actually supports, since we might have downgraded this transaction's feature level because we got a + * SERVFAIL earlier and wanted to check whether downgrading fixes it. */ + + return dns_server_dnssec_supported(t->server); +} + +static bool dns_transaction_dnssec_supported_full(DnsTransaction *t) { + DnsTransaction *dt; + + assert(t); + + /* Checks whether our transaction our any of the auxiliary transactions couldn't do DNSSEC. */ + + if (!dns_transaction_dnssec_supported(t)) + return false; + + SET_FOREACH(dt, t->dnssec_transactions) + if (!dns_transaction_dnssec_supported(dt)) + return false; + + return true; +} + +int dns_transaction_request_dnssec_keys(DnsTransaction *t) { + DnsResourceRecord *rr; + + int r; + + assert(t); + + /* + * Retrieve all auxiliary RRs for the answer we got, so that + * we can verify signatures or prove that RRs are rightfully + * unsigned. 
Specifically: + * + * - For RRSIG we get the matching DNSKEY + * - For DNSKEY we get the matching DS + * - For unsigned SOA/NS we get the matching DS + * - For unsigned CNAME/DNAME/DS we get the parent SOA RR + * - For other unsigned RRs we get the matching SOA RR + * - For SOA/NS queries with no matching response RR, and no NSEC/NSEC3, the DS RR + * - For DS queries with no matching response RRs, and no NSEC/NSEC3, the parent's SOA RR + * - For other queries with no matching response RRs, and no NSEC/NSEC3, the SOA RR + */ + + if (FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE) || t->scope->dnssec_mode == DNSSEC_NO) + return 0; + if (t->answer_source != DNS_TRANSACTION_NETWORK) + return 0; /* We only need to validate stuff from the network */ + if (!dns_transaction_dnssec_supported(t)) + return 0; /* If we can't do DNSSEC anyway there's no point in getting the auxiliary RRs */ + + DNS_ANSWER_FOREACH(rr, t->answer) { + + if (dns_type_is_pseudo(rr->key->type)) + continue; + + /* If this RR is in the negative trust anchor, we don't need to validate it. */ + r = dns_transaction_negative_trust_anchor_lookup(t, dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r > 0) + continue; + + switch (rr->key->type) { + + case DNS_TYPE_RRSIG: { + /* For each RRSIG we request the matching DNSKEY */ + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *dnskey = NULL; + + /* If this RRSIG is about a DNSKEY RR and the + * signer is the same as the owner, then we + * already have the DNSKEY, and we don't have + * to look for more. */ + if (rr->rrsig.type_covered == DNS_TYPE_DNSKEY) { + r = dns_name_equal(rr->rrsig.signer, dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r > 0) + continue; + } + + /* If the signer is not a parent of our + * original query, then this is about an + * auxiliary RRset, but not anything we asked + * for. 
In this case we aren't interested, + * because we don't want to request additional + * RRs for stuff we didn't really ask for, and + * also to avoid request loops, where + * additional RRs from one transaction result + * in another transaction whose additional RRs + * point back to the original transaction, and + * we deadlock. */ + r = dns_name_endswith(dns_resource_key_name(dns_transaction_key(t)), rr->rrsig.signer); + if (r < 0) + return r; + if (r == 0) + continue; + + dnskey = dns_resource_key_new(rr->key->class, DNS_TYPE_DNSKEY, rr->rrsig.signer); + if (!dnskey) + return -ENOMEM; + + log_debug("Requesting DNSKEY to validate transaction %" PRIu16" (%s, RRSIG with key tag: %" PRIu16 ").", + t->id, dns_resource_key_name(rr->key), rr->rrsig.key_tag); + r = dns_transaction_request_dnssec_rr(t, dnskey); + if (r < 0) + return r; + break; + } + + case DNS_TYPE_DNSKEY: { + /* For each DNSKEY we request the matching DS */ + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *ds = NULL; + + /* If the DNSKEY we are looking at is not for + * zone we are interested in, nor any of its + * parents, we aren't interested, and don't + * request it. After all, we don't want to end + * up in request loops, and want to keep + * additional traffic down. 
*/ + + r = dns_name_endswith(dns_resource_key_name(dns_transaction_key(t)), dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r == 0) + continue; + + ds = dns_resource_key_new(rr->key->class, DNS_TYPE_DS, dns_resource_key_name(rr->key)); + if (!ds) + return -ENOMEM; + + log_debug("Requesting DS to validate transaction %" PRIu16" (%s, DNSKEY with key tag: %" PRIu16 ").", + t->id, dns_resource_key_name(rr->key), dnssec_keytag(rr, false)); + r = dns_transaction_request_dnssec_rr(t, ds); + if (r < 0) + return r; + + break; + } + + case DNS_TYPE_SOA: + case DNS_TYPE_NS: { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *ds = NULL; + + /* For an unsigned SOA or NS, try to acquire + * the matching DS RR, as we are at a zone cut + * then, and whether a DS exists tells us + * whether the zone is signed. Do so only if + * this RR matches our original question, + * however. */ + + r = dns_resource_key_match_rr(dns_transaction_key(t), rr, NULL); + if (r < 0) + return r; + if (r == 0) { + /* Hmm, so this SOA RR doesn't match our original question. In this case, maybe this is + * a negative reply, and we need the SOA RR's TTL in order to cache a negative entry? + * If so, we need to validate it, too. */ + + r = dns_answer_match_key(t->answer, dns_transaction_key(t), NULL); + if (r < 0) + return r; + if (r > 0) /* positive reply, we won't need the SOA and hence don't need to validate + * it. */ + continue; + + /* Only bother with this if the SOA/NS RR we are looking at is actually a parent of + * what we are looking for, otherwise there's no value in it for us. 
*/ + r = dns_name_endswith(dns_resource_key_name(dns_transaction_key(t)), dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r == 0) + continue; + } + + r = dnssec_has_rrsig(t->answer, rr->key); + if (r < 0) + return r; + if (r > 0) + continue; + + ds = dns_resource_key_new(rr->key->class, DNS_TYPE_DS, dns_resource_key_name(rr->key)); + if (!ds) + return -ENOMEM; + + log_debug("Requesting DS to validate transaction %" PRIu16 " (%s, unsigned SOA/NS RRset).", + t->id, dns_resource_key_name(rr->key)); + r = dns_transaction_request_dnssec_rr(t, ds); + if (r < 0) + return r; + + break; + } + + case DNS_TYPE_DS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *soa = NULL; + const char *name; + + /* CNAMEs and DNAMEs cannot be located at a + * zone apex, hence ask for the parent SOA for + * unsigned CNAME/DNAME RRs, maybe that's the + * apex. But do all that only if this is + * actually a response to our original + * question. + * + * Similar for DS RRs, which are signed when + * the parent SOA is signed. 
*/ + + r = dns_transaction_is_primary_response(t, rr); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dnssec_has_rrsig(t->answer, rr->key); + if (r < 0) + return r; + if (r > 0) + continue; + + r = dns_answer_has_dname_for_cname(t->answer, rr); + if (r < 0) + return r; + if (r > 0) + continue; + + name = dns_resource_key_name(rr->key); + r = dns_name_parent(&name); + if (r < 0) + return r; + if (r == 0) + continue; + + soa = dns_resource_key_new(rr->key->class, DNS_TYPE_SOA, name); + if (!soa) + return -ENOMEM; + + log_debug("Requesting parent SOA to validate transaction %" PRIu16 " (%s, unsigned CNAME/DNAME/DS RRset).", + t->id, dns_resource_key_name(rr->key)); + r = dns_transaction_request_dnssec_rr(t, soa); + if (r < 0) + return r; + + break; + } + + default: { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *soa = NULL; + + /* For other unsigned RRsets (including + * NSEC/NSEC3!), look for proof the zone is + * unsigned, by requesting the SOA RR of the + * zone. However, do so only if they are + * directly relevant to our original + * question. */ + + r = dns_transaction_is_primary_response(t, rr); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dnssec_has_rrsig(t->answer, rr->key); + if (r < 0) + return r; + if (r > 0) + continue; + + soa = dns_resource_key_new(rr->key->class, DNS_TYPE_SOA, dns_resource_key_name(rr->key)); + if (!soa) + return -ENOMEM; + + log_debug("Requesting SOA to validate transaction %" PRIu16 " (%s, unsigned non-SOA/NS RRset <%s>).", + t->id, dns_resource_key_name(rr->key), dns_resource_record_to_string(rr)); + r = dns_transaction_request_dnssec_rr(t, soa); + if (r < 0) + return r; + break; + }} + } + + /* Above, we requested everything necessary to validate what + * we got. Now, let's request what we need to validate what we + * didn't get... 
*/ + + r = dns_transaction_has_negative_answer(t); + if (r < 0) + return r; + if (r > 0) { + const char *name, *signed_status; + uint16_t type = 0; + + name = dns_resource_key_name(dns_transaction_key(t)); + signed_status = dns_answer_contains_nsec_or_nsec3(t->answer) ? "signed" : "unsigned"; + + /* If this was a SOA or NS request, then check if there's a DS RR for the same domain. Note that this + * could also be used as indication that we are not at a zone apex, but in real world setups there are + * too many broken DNS servers (Hello, incapdns.net!) where non-terminal zones return NXDOMAIN even + * though they have further children. If this was a DS request, then it's signed when the parent zone + * is signed, hence ask the parent SOA in that case. If this was any other RR then ask for the SOA RR, + * to see if that is signed. */ + + if (dns_transaction_key(t)->type == DNS_TYPE_DS) { + r = dns_name_parent(&name); + if (r > 0) { + type = DNS_TYPE_SOA; + log_debug("Requesting parent SOA (%s %s) to validate transaction %" PRIu16 " (%s, %s empty DS response).", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, + dns_resource_key_name(dns_transaction_key(t)), signed_status); + } else + name = NULL; + + } else if (IN_SET(dns_transaction_key(t)->type, DNS_TYPE_SOA, DNS_TYPE_NS)) { + + type = DNS_TYPE_DS; + log_debug("Requesting DS (%s %s) to validate transaction %" PRIu16 " (%s, %s empty SOA/NS response).", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, name, signed_status); + + } else { + type = DNS_TYPE_SOA; + log_debug("Requesting SOA (%s %s) to validate transaction %" PRIu16 " (%s, %s empty non-SOA/NS/DS response).", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name, t->id, name, signed_status); + } + + if (name) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *soa = NULL; + + soa = dns_resource_key_new(dns_transaction_key(t)->class, type, name); + if (!soa) + return -ENOMEM; + + r = dns_transaction_request_dnssec_rr(t, soa); + if (r < 0) + 
return r; + } + } + + return dns_transaction_dnssec_is_live(t); +} + +void dns_transaction_notify(DnsTransaction *t, DnsTransaction *source) { + assert(t); + assert(source); + + /* Invoked whenever any of our auxiliary DNSSEC transactions completed its work. If the state is still PENDING, + we are still in the loop that adds further DNSSEC transactions, hence don't check if we are ready yet. If + the state is VALIDATING however, we should check if we are complete now. */ + + if (t->state == DNS_TRANSACTION_VALIDATING) + dns_transaction_process_dnssec(t); +} + +static int dns_transaction_validate_dnskey_by_ds(DnsTransaction *t) { + DnsAnswerItem *item; + int r; + + assert(t); + + /* Add all DNSKEY RRs from the answer that are validated by DS + * RRs from the list of validated keys to the list of + * validated keys. */ + + DNS_ANSWER_FOREACH_ITEM(item, t->answer) { + + r = dnssec_verify_dnskey_by_ds_search(item->rr, t->validated_keys); + if (r < 0) + return r; + if (r == 0) + continue; + + /* If so, the DNSKEY is validated too. */ + r = dns_answer_add_extend(&t->validated_keys, item->rr, item->ifindex, item->flags|DNS_ANSWER_AUTHENTICATED, item->rrsig); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_transaction_requires_rrsig(DnsTransaction *t, DnsResourceRecord *rr) { + int r; + + assert(t); + assert(rr); + + /* Checks if the RR we are looking for must be signed with an + * RRSIG. This is used for positive responses. */ + + if (t->scope->dnssec_mode == DNSSEC_NO) + return false; + + if (dns_type_is_pseudo(rr->key->type)) + return -EINVAL; + + r = dns_transaction_negative_trust_anchor_lookup(t, dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r > 0) + return false; + + switch (rr->key->type) { + + case DNS_TYPE_RRSIG: + /* RRSIGs are the signatures themselves, they need no signing. 
*/ + return false; + + case DNS_TYPE_SOA: + case DNS_TYPE_NS: { + DnsTransaction *dt; + + /* For SOA or NS RRs we look for a matching DS transaction */ + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (dns_transaction_key(dt)->class != rr->key->class) + continue; + if (dns_transaction_key(dt)->type != DNS_TYPE_DS) + continue; + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r == 0) + continue; + + /* We found a DS transactions for the SOA/NS + * RRs we are looking at. If it discovered signed DS + * RRs, then we need to be signed, too. */ + + if (!FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + return false; + + return dns_answer_match_key(dt->answer, dns_transaction_key(dt), NULL); + } + + /* We found nothing that proves this is safe to leave + * this unauthenticated, hence ask inist on + * authentication. */ + return true; + } + + case DNS_TYPE_DS: + case DNS_TYPE_CNAME: + case DNS_TYPE_DNAME: { + const char *parent = NULL; + DnsTransaction *dt; + + /* + * CNAME/DNAME RRs cannot be located at a zone apex, hence look directly for the parent SOA. + * + * DS RRs are signed if the parent is signed, hence also look at the parent SOA + */ + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (dns_transaction_key(dt)->class != rr->key->class) + continue; + if (dns_transaction_key(dt)->type != DNS_TYPE_SOA) + continue; + + if (!parent) { + parent = dns_resource_key_name(rr->key); + r = dns_name_parent(&parent); + if (r < 0) + return r; + if (r == 0) { + if (rr->key->type == DNS_TYPE_DS) + return true; + + /* A CNAME/DNAME without a parent? That's sooo weird. */ + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "Transaction %" PRIu16 " claims CNAME/DNAME at root. 
Refusing.", t->id); + } + } + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), parent); + if (r < 0) + return r; + if (r == 0) + continue; + + return FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED); + } + + return true; + } + + default: { + DnsTransaction *dt; + + /* Any other kind of RR (including DNSKEY/NSEC/NSEC3). Let's see if our SOA lookup was authenticated */ + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (dns_transaction_key(dt)->class != rr->key->class) + continue; + if (dns_transaction_key(dt)->type != DNS_TYPE_SOA) + continue; + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r == 0) + continue; + + /* We found the transaction that was supposed to find the SOA RR for us. It was + * successful, but found no RR for us. This means we are not at a zone cut. In this + * case, we require authentication if the SOA lookup was authenticated too. */ + return FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED); + } + + return true; + }} +} + +static int dns_transaction_in_private_tld(DnsTransaction *t, const DnsResourceKey *key) { + DnsTransaction *dt; + const char *tld; + int r; + + /* If DNSSEC downgrade mode is on, checks whether the + * specified RR is one level below a TLD we have proven not to + * exist. In such a case we assume that this is a private + * domain, and permit it. + * + * This detects cases like the Fritz!Box router networks. Each + * Fritz!Box router serves a private "fritz.box" zone, in the + * non-existing TLD "box". Requests for the "fritz.box" domain + * are served by the router itself, while requests for the + * "box" domain will result in NXDOMAIN. + * + * Note that this logic is unable to detect cases where a + * router serves a private DNS zone directly under + * non-existing TLD. 
In such a case we cannot detect whether + * the TLD is supposed to exist or not, as all requests we + * make for it will be answered by the router's zone, and not + * by the root zone. */ + + assert(t); + + if (t->scope->dnssec_mode != DNSSEC_ALLOW_DOWNGRADE) + return false; /* In strict DNSSEC mode what doesn't exist, doesn't exist */ + + tld = dns_resource_key_name(key); + r = dns_name_parent(&tld); + if (r < 0) + return r; + if (r == 0) + return false; /* Already the root domain */ + + if (!dns_name_is_single_label(tld)) + return false; + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (dns_transaction_key(dt)->class != key->class) + continue; + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), tld); + if (r < 0) + return r; + if (r == 0) + continue; + + /* We found an auxiliary lookup we did for the TLD. If + * that returned with NXDOMAIN, we know the TLD didn't + * exist, and hence this might be a private zone. */ + + return dt->answer_rcode == DNS_RCODE_NXDOMAIN; + } + + return false; +} + +static int dns_transaction_requires_nsec(DnsTransaction *t) { + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + DnsTransaction *dt; + const char *name; + uint16_t type = 0; + int r; + + assert(t); + + /* Checks if we need to insist on NSEC/NSEC3 RRs for proving + * this negative reply */ + + if (t->scope->dnssec_mode == DNSSEC_NO) + return false; + + if (dns_type_is_pseudo(dns_transaction_key(t)->type)) + return -EINVAL; + + r = dns_transaction_negative_trust_anchor_lookup(t, dns_resource_key_name(dns_transaction_key(t))); + if (r < 0) + return r; + if (r > 0) + return false; + + r = dns_transaction_in_private_tld(t, dns_transaction_key(t)); + if (r < 0) + return r; + if (r > 0) { + /* The lookup is from a TLD that is proven not to + * exist, and we are in downgrade mode, hence ignore + * that fact that we didn't get any NSEC RRs. 
*/ + + log_info("Detected a negative query %s in a private DNS zone, permitting unsigned response.", + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str)); + return false; + } + + name = dns_resource_key_name(dns_transaction_key(t)); + + if (dns_transaction_key(t)->type == DNS_TYPE_DS) { + + /* We got a negative reply for this DS lookup? DS RRs are signed when their parent zone is signed, + * hence check the parent SOA in this case. */ + + r = dns_name_parent(&name); + if (r < 0) + return r; + if (r == 0) + return true; + + type = DNS_TYPE_SOA; + + } else if (IN_SET(dns_transaction_key(t)->type, DNS_TYPE_SOA, DNS_TYPE_NS)) + /* We got a negative reply for this SOA/NS lookup? If so, check if there's a DS RR for this */ + type = DNS_TYPE_DS; + else + /* For all other negative replies, check for the SOA lookup */ + type = DNS_TYPE_SOA; + + /* For all other RRs we check the SOA on the same level to see + * if it's signed. */ + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (dns_transaction_key(dt)->class != dns_transaction_key(t)->class) + continue; + if (dns_transaction_key(dt)->type != type) + continue; + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), name); + if (r < 0) + return r; + if (r == 0) + continue; + + return FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED); + } + + /* If in doubt, require NSEC/NSEC3 */ + return true; +} + +static int dns_transaction_dnskey_authenticated(DnsTransaction *t, DnsResourceRecord *rr) { + DnsResourceRecord *rrsig; + bool found = false; + int r; + + /* Checks whether any of the DNSKEYs used for the RRSIGs for + * the specified RRset is authenticated (i.e. has a matching + * DS RR). 
*/ + + r = dns_transaction_negative_trust_anchor_lookup(t, dns_resource_key_name(rr->key)); + if (r < 0) + return r; + if (r > 0) + return false; + + DNS_ANSWER_FOREACH(rrsig, t->answer) { + DnsTransaction *dt; + + r = dnssec_key_match_rrsig(rr->key, rrsig); + if (r < 0) + return r; + if (r == 0) + continue; + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (dns_transaction_key(dt)->class != rr->key->class) + continue; + + if (dns_transaction_key(dt)->type == DNS_TYPE_DNSKEY) { + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), rrsig->rrsig.signer); + if (r < 0) + return r; + if (r == 0) + continue; + + /* OK, we found an auxiliary DNSKEY lookup. If that lookup is authenticated, + * report this. */ + + if (FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + return true; + + found = true; + + } else if (dns_transaction_key(dt)->type == DNS_TYPE_DS) { + + r = dns_name_equal(dns_resource_key_name(dns_transaction_key(dt)), rrsig->rrsig.signer); + if (r < 0) + return r; + if (r == 0) + continue; + + /* OK, we found an auxiliary DS lookup. If that lookup is authenticated and + * non-zero, we won! */ + + if (!FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + return false; + + return dns_answer_match_key(dt->answer, dns_transaction_key(dt), NULL); + } + } + } + + return found ? false : -ENXIO; +} + +static int dns_transaction_known_signed(DnsTransaction *t, DnsResourceRecord *rr) { + assert(t); + assert(rr); + + /* We know that the root domain is signed, hence if it appears + * not to be signed, there's a problem with the DNS server */ + + return rr->key->class == DNS_CLASS_IN && + dns_name_is_root(dns_resource_key_name(rr->key)); +} + +static int dns_transaction_check_revoked_trust_anchors(DnsTransaction *t) { + DnsResourceRecord *rr; + int r; + + assert(t); + + /* Maybe warn the user that we encountered a revoked DNSKEY + * for a key from our trust anchor. 
Note that we don't care + * whether the DNSKEY can be authenticated or not. It's + * sufficient if it is self-signed. */ + + DNS_ANSWER_FOREACH(rr, t->answer) { + r = dns_trust_anchor_check_revoked(&t->scope->manager->trust_anchor, rr, t->answer); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_transaction_invalidate_revoked_keys(DnsTransaction *t) { + bool changed; + int r; + + assert(t); + + /* Removes all DNSKEY/DS objects from t->validated_keys that + * our trust anchors database considers revoked. */ + + do { + DnsResourceRecord *rr; + + changed = false; + + DNS_ANSWER_FOREACH(rr, t->validated_keys) { + r = dns_trust_anchor_is_revoked(&t->scope->manager->trust_anchor, rr); + if (r < 0) + return r; + if (r > 0) { + r = dns_answer_remove_by_rr(&t->validated_keys, rr); + if (r < 0) + return r; + + assert(r > 0); + changed = true; + break; + } + } + } while (changed); + + return 0; +} + +static int dns_transaction_copy_validated(DnsTransaction *t) { + DnsTransaction *dt; + int r; + + assert(t); + + /* Copy all validated RRs from the auxiliary DNSSEC transactions into our set of validated RRs */ + + SET_FOREACH(dt, t->dnssec_transactions) { + + if (DNS_TRANSACTION_IS_LIVE(dt->state)) + continue; + + if (!FLAGS_SET(dt->answer_query_flags, SD_RESOLVED_AUTHENTICATED)) + continue; + + r = dns_answer_extend(&t->validated_keys, dt->answer); + if (r < 0) + return r; + } + + return 0; +} + +typedef enum { + DNSSEC_PHASE_DNSKEY, /* Phase #1, only validate DNSKEYs */ + DNSSEC_PHASE_NSEC, /* Phase #2, only validate NSEC+NSEC3 */ + DNSSEC_PHASE_ALL, /* Phase #3, validate everything else */ +} Phase; + +static int dnssec_validate_records( + DnsTransaction *t, + Phase phase, + bool *have_nsec, + unsigned *nvalidations, + DnsAnswer **validated) { + + DnsResourceRecord *rr; + int r; + + assert(nvalidations); + + /* Returns negative on error, 0 if validation failed, 1 to restart validation, 2 when finished. 
*/ + + DNS_ANSWER_FOREACH(rr, t->answer) { + _unused_ _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr_ref = dns_resource_record_ref(rr); + DnsResourceRecord *rrsig = NULL; + DnssecResult result; + + switch (rr->key->type) { + case DNS_TYPE_RRSIG: + continue; + + case DNS_TYPE_DNSKEY: + /* We validate DNSKEYs only in the DNSKEY and ALL phases */ + if (phase == DNSSEC_PHASE_NSEC) + continue; + break; + + case DNS_TYPE_NSEC: + case DNS_TYPE_NSEC3: + *have_nsec = true; + + /* We validate NSEC/NSEC3 only in the NSEC and ALL phases */ + if (phase == DNSSEC_PHASE_DNSKEY) + continue; + break; + + default: + /* We validate all other RRs only in the ALL phases */ + if (phase != DNSSEC_PHASE_ALL) + continue; + } + + r = dnssec_verify_rrset_search( + t->answer, + rr->key, + t->validated_keys, + USEC_INFINITY, + &result, + &rrsig); + if (r < 0) + return r; + *nvalidations += r; + + log_debug("Looking at %s: %s", strna(dns_resource_record_to_string(rr)), dnssec_result_to_string(result)); + + if (result == DNSSEC_VALIDATED) { + assert(rrsig); + + if (rr->key->type == DNS_TYPE_DNSKEY) { + /* If we just validated a DNSKEY RRset, then let's add these keys to + * the set of validated keys for this transaction. */ + + r = dns_answer_copy_by_key(&t->validated_keys, t->answer, rr->key, DNS_ANSWER_AUTHENTICATED, rrsig); + if (r < 0) + return r; + + /* Some of the DNSKEYs we just added might already have been revoked, + * remove them again in that case. */ + r = dns_transaction_invalidate_revoked_keys(t); + if (r < 0) + return r; + } + + /* Add the validated RRset to the new list of validated RRsets, and remove it from + * the unvalidated RRsets. We mark the RRset as authenticated and cacheable. 
*/ + r = dns_answer_move_by_key(validated, &t->answer, rr->key, DNS_ANSWER_AUTHENTICATED|DNS_ANSWER_CACHEABLE, rrsig); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, DNSSEC_SECURE, rr->key); + + /* Exit the loop, we dropped something from the answer, start from the beginning */ + return 1; + } + + /* If we haven't read all DNSKEYs yet a negative result of the validation is irrelevant, as + * there might be more DNSKEYs coming. Similarly, if we haven't read all NSEC/NSEC3 RRs yet, + * we cannot do positive wildcard proofs yet, as those require the NSEC/NSEC3 RRs. */ + if (phase != DNSSEC_PHASE_ALL) + continue; + + if (result == DNSSEC_VALIDATED_WILDCARD) { + bool authenticated = false; + const char *source; + + assert(rrsig); + + /* This RRset validated, but as a wildcard. This means we need + * to prove via NSEC/NSEC3 that no matching non-wildcard RR exists. */ + + /* First step, determine the source of synthesis */ + r = dns_resource_record_source(rrsig, &source); + if (r < 0) + return r; + + r = dnssec_test_positive_wildcard(*validated, + dns_resource_key_name(rr->key), + source, + rrsig->rrsig.signer, + &authenticated); + + /* Unless the NSEC proof showed that the key really doesn't exist something is off. NOTE(review): only r == 0 is tested here, so a negative (error) return from dnssec_test_positive_wildcard() would take the else branch below just like a successful proof; confirm whether it should be returned to the caller instead, as is done for the other calls in this function. */ + if (r == 0) + result = DNSSEC_INVALID; + else { + r = dns_answer_move_by_key( + validated, + &t->answer, + rr->key, + authenticated ? (DNS_ANSWER_AUTHENTICATED|DNS_ANSWER_CACHEABLE) : 0, + rrsig); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, authenticated ? DNSSEC_SECURE : DNSSEC_INSECURE, rr->key); + + /* Exit the loop, we dropped something from the answer, start from the beginning */ + return 1; + } + } + + if (result == DNSSEC_NO_SIGNATURE) { + r = dns_transaction_requires_rrsig(t, rr); + if (r < 0) + return r; + if (r == 0) { + /* Data does not require signing. In that case, just copy it over, + * but remember that this is by no means authenticated.
*/ + r = dns_answer_move_by_key( + validated, + &t->answer, + rr->key, + 0, + NULL); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, rr->key); + return 1; + } + + r = dns_transaction_known_signed(t, rr); + if (r < 0) + return r; + if (r > 0) { + /* This is an RR we know has to be signed. If it isn't this means + * the server is not attaching RRSIGs, hence complain. */ + + dns_server_packet_rrsig_missing(t->server, t->current_feature_level); + + if (t->scope->dnssec_mode == DNSSEC_ALLOW_DOWNGRADE) { + + /* Downgrading is OK? If so, just consider the information unsigned */ + + r = dns_answer_move_by_key(validated, &t->answer, rr->key, 0, NULL); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, rr->key); + return 1; + } + + /* Otherwise, fail */ + t->answer_dnssec_result = DNSSEC_INCOMPATIBLE_SERVER; + return 0; + } + + r = dns_transaction_in_private_tld(t, rr->key); + if (r < 0) + return r; + if (r > 0) { + char s[DNS_RESOURCE_KEY_STRING_MAX]; + + /* The data is from a TLD that is proven not to exist, and we are in downgrade + * mode, hence ignore the fact that this was not signed. 
*/ + + log_info("Detected RRset %s is in a private DNS zone, permitting unsigned RRs.", + dns_resource_key_to_string(rr->key, s, sizeof s)); + + r = dns_answer_move_by_key(validated, &t->answer, rr->key, 0, NULL); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, rr->key); + return 1; + } + } + + /* https://datatracker.ietf.org/doc/html/rfc6840#section-5.2 */ + if (result == DNSSEC_UNSUPPORTED_ALGORITHM) { + r = dns_answer_move_by_key(validated, &t->answer, rr->key, 0, NULL); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, rr->key); + return 1; + } + + if (IN_SET(result, + DNSSEC_MISSING_KEY, + DNSSEC_SIGNATURE_EXPIRED)) { + + r = dns_transaction_dnskey_authenticated(t, rr); + if (r < 0 && r != -ENXIO) + return r; + if (r == 0) { + /* The DNSKEY transaction was not authenticated, this means there's + * no DS for this, which means it's OK if no keys are found for this signature. */ + + r = dns_answer_move_by_key(validated, &t->answer, rr->key, 0, NULL); + if (r < 0) + return r; + + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, rr->key); + return 1; + } + } + + r = dns_transaction_is_primary_response(t, rr); + if (r < 0) + return r; + if (r > 0) { + /* Look for a matching DNAME for this CNAME */ + r = dns_answer_has_dname_for_cname(t->answer, rr); + if (r < 0) + return r; + if (r == 0) { + /* Also look among the stuff we already validated */ + r = dns_answer_has_dname_for_cname(*validated, rr); + if (r < 0) + return r; + } + + if (r == 0) { + if (IN_SET(result, + DNSSEC_INVALID, + DNSSEC_SIGNATURE_EXPIRED, + DNSSEC_NO_SIGNATURE)) + manager_dnssec_verdict(t->scope->manager, DNSSEC_BOGUS, rr->key); + else /* DNSSEC_MISSING_KEY, DNSSEC_UNSUPPORTED_ALGORITHM, + or DNSSEC_TOO_MANY_VALIDATIONS */ + manager_dnssec_verdict(t->scope->manager, DNSSEC_INDETERMINATE, rr->key); + + /* This is a primary response to our question, and it failed validation. + * That's fatal. 
*/ + t->answer_dnssec_result = result; + return 0; + } + + /* This is a primary response, but we do have a DNAME RR + * in the RR that can replay this CNAME, hence rely on + * that, and we can remove the CNAME in favour of it. */ + } + + /* This is just some auxiliary data. Just remove the RRset and continue. */ + r = dns_answer_remove_by_key(&t->answer, rr->key); + if (r < 0) + return r; + + /* We dropped something from the answer, start from the beginning. */ + return 1; + } + + return 2; /* Finito. */ +} + +int dns_transaction_validate_dnssec(DnsTransaction *t) { + _cleanup_(dns_answer_unrefp) DnsAnswer *validated = NULL; + Phase phase; + DnsAnswerFlags flags; + int r; + char key_str[DNS_RESOURCE_KEY_STRING_MAX]; + + assert(t); + + /* We have now collected all DS and DNSKEY RRs in t->validated_keys, let's see which RRs we can now + * authenticate with that. */ + + if (FLAGS_SET(t->query_flags, SD_RESOLVED_NO_VALIDATE) || t->scope->dnssec_mode == DNSSEC_NO) + return 0; + + /* Already validated */ + if (t->answer_dnssec_result != _DNSSEC_RESULT_INVALID) + return 0; + + /* Our own stuff needs no validation */ + if (IN_SET(t->answer_source, DNS_TRANSACTION_ZONE, DNS_TRANSACTION_TRUST_ANCHOR)) { + t->answer_dnssec_result = DNSSEC_VALIDATED; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, true); + return 0; + } + + /* Cached stuff is not affected by validation. */ + if (t->answer_source != DNS_TRANSACTION_NETWORK) + return 0; + + if (!dns_transaction_dnssec_supported_full(t)) { + /* The server does not support DNSSEC, or doesn't augment responses with RRSIGs. 
*/ + t->answer_dnssec_result = DNSSEC_INCOMPATIBLE_SERVER; + log_debug("Not validating response for %" PRIu16 ", used server feature level does not support DNSSEC.", t->id); + return 0; + } + + log_debug("Validating response from transaction %" PRIu16 " (%s).", + t->id, + dns_resource_key_to_string(dns_transaction_key(t), key_str, sizeof key_str)); + + /* First, see if this response contains any revoked trust + * anchors we care about */ + r = dns_transaction_check_revoked_trust_anchors(t); + if (r < 0) + return r; + + /* Second, copy all RRs we acquired successfully from auxiliary transactions over. */ + r = dns_transaction_copy_validated(t); + if (r < 0) + return r; + + /* Third, see if there are DNSKEYs we already know a + * validated DS for. */ + r = dns_transaction_validate_dnskey_by_ds(t); + if (r < 0) + return r; + + /* Fourth, remove all DNSKEY and DS RRs again that our trust + * anchor says are revoked. After all we might have marked + * some keys revoked above, but they might still be lingering + * in our validated_keys list. */ + r = dns_transaction_invalidate_revoked_keys(t); + if (r < 0) + return r; + + phase = DNSSEC_PHASE_DNSKEY; + for (unsigned nvalidations = 0;;) { + bool have_nsec = false; + + r = dnssec_validate_records(t, phase, &have_nsec, &nvalidations, &validated); + if (r <= 0) + return r; + + if (nvalidations > DNSSEC_VALIDATION_MAX) { + /* This reply requires an onerous number of signature validations to verify. Let's + * not waste our time trying, as this shouldn't happen for well-behaved domains + * anyway. */ + t->answer_dnssec_result = DNSSEC_TOO_MANY_VALIDATIONS; + return 0; + } + + /* Try again as long as we managed to achieve something */ + if (r == 1) + continue; + + if (phase == DNSSEC_PHASE_DNSKEY && have_nsec) { + /* OK, we processed all DNSKEYs, and there are NSEC/NSEC3 RRs, look at those now.
*/ + phase = DNSSEC_PHASE_NSEC; + continue; + } + + if (phase != DNSSEC_PHASE_ALL) { + /* OK, we processed all DNSKEYs and NSEC/NSEC3 RRs, look at all the rest now. + * Note that in this third phase we start to remove RRs we couldn't validate. */ + phase = DNSSEC_PHASE_ALL; + continue; + } + + /* We're done */ + break; + } + + DNS_ANSWER_REPLACE(t->answer, TAKE_PTR(validated)); + + /* At this point the answer only contains validated + * RRsets. Now, let's see if it actually answers the question + * we asked. If so, great! If it doesn't, then see if + * NSEC/NSEC3 can prove this. */ + r = dns_transaction_has_positive_answer(t, &flags); + if (r > 0) { + /* Yes, it answers the question! */ + + if (flags & DNS_ANSWER_AUTHENTICATED) { + /* The answer is fully authenticated, yay. */ + t->answer_dnssec_result = DNSSEC_VALIDATED; + t->answer_rcode = DNS_RCODE_SUCCESS; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, true); + } else { + /* The answer is not fully authenticated. */ + t->answer_dnssec_result = DNSSEC_UNSIGNED; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, false); + } + + } else if (r == 0) { + DnssecNsecResult nr; + bool authenticated = false; + + /* Bummer! Let's check NSEC/NSEC3 */ + r = dnssec_nsec_test(t->answer, dns_transaction_key(t), &nr, &authenticated, &t->answer_nsec_ttl); + if (r < 0) + return r; + + switch (nr) { + + case DNSSEC_NSEC_NXDOMAIN: + /* NSEC proves the domain doesn't exist. Very good. */ + log_debug("Proved NXDOMAIN via NSEC/NSEC3 for transaction %u (%s)", t->id, key_str); + t->answer_dnssec_result = DNSSEC_VALIDATED; + t->answer_rcode = DNS_RCODE_NXDOMAIN; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, authenticated); + + manager_dnssec_verdict(t->scope->manager, authenticated ? DNSSEC_SECURE : DNSSEC_INSECURE, dns_transaction_key(t)); + break; + + case DNSSEC_NSEC_NODATA: + /* NSEC proves that there's no data here, very good. 
*/ + log_debug("Proved NODATA via NSEC/NSEC3 for transaction %u (%s)", t->id, key_str); + t->answer_dnssec_result = DNSSEC_VALIDATED; + t->answer_rcode = DNS_RCODE_SUCCESS; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, authenticated); + + manager_dnssec_verdict(t->scope->manager, authenticated ? DNSSEC_SECURE : DNSSEC_INSECURE, dns_transaction_key(t)); + break; + + case DNSSEC_NSEC_OPTOUT: + /* NSEC3 says the data might not be signed */ + log_debug("Data is NSEC3 opt-out via NSEC/NSEC3 for transaction %u (%s)", t->id, key_str); + t->answer_dnssec_result = DNSSEC_UNSIGNED; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, false); + + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, dns_transaction_key(t)); + break; + + case DNSSEC_NSEC_NO_RR: + /* No NSEC data? Bummer! */ + + r = dns_transaction_requires_nsec(t); + if (r < 0) + return r; + if (r > 0) { + t->answer_dnssec_result = DNSSEC_NO_SIGNATURE; + manager_dnssec_verdict(t->scope->manager, DNSSEC_BOGUS, dns_transaction_key(t)); + } else { + t->answer_dnssec_result = DNSSEC_UNSIGNED; + SET_FLAG(t->answer_query_flags, SD_RESOLVED_AUTHENTICATED, false); + manager_dnssec_verdict(t->scope->manager, DNSSEC_INSECURE, dns_transaction_key(t)); + } + + break; + + case DNSSEC_NSEC_UNSUPPORTED_ALGORITHM: + /* We don't know the NSEC3 algorithm used? */ + t->answer_dnssec_result = DNSSEC_UNSUPPORTED_ALGORITHM; + manager_dnssec_verdict(t->scope->manager, DNSSEC_INDETERMINATE, dns_transaction_key(t)); + break; + + case DNSSEC_NSEC_FOUND: + case DNSSEC_NSEC_CNAME: + /* NSEC says it needs to be there, but we couldn't find it? Bummer! 
*/ + t->answer_dnssec_result = DNSSEC_NSEC_MISMATCH; + manager_dnssec_verdict(t->scope->manager, DNSSEC_BOGUS, dns_transaction_key(t)); + break; + + default: + assert_not_reached(); + } + } + + return 1; +} + +static const char* const dns_transaction_state_table[_DNS_TRANSACTION_STATE_MAX] = { + [DNS_TRANSACTION_NULL] = "null", + [DNS_TRANSACTION_PENDING] = "pending", + [DNS_TRANSACTION_VALIDATING] = "validating", + [DNS_TRANSACTION_RCODE_FAILURE] = "rcode-failure", + [DNS_TRANSACTION_SUCCESS] = "success", + [DNS_TRANSACTION_NO_SERVERS] = "no-servers", + [DNS_TRANSACTION_TIMEOUT] = "timeout", + [DNS_TRANSACTION_ATTEMPTS_MAX_REACHED] = "attempts-max-reached", + [DNS_TRANSACTION_INVALID_REPLY] = "invalid-reply", + [DNS_TRANSACTION_ERRNO] = "errno", + [DNS_TRANSACTION_ABORTED] = "aborted", + [DNS_TRANSACTION_DNSSEC_FAILED] = "dnssec-failed", + [DNS_TRANSACTION_NO_TRUST_ANCHOR] = "no-trust-anchor", + [DNS_TRANSACTION_RR_TYPE_UNSUPPORTED] = "rr-type-unsupported", + [DNS_TRANSACTION_NETWORK_DOWN] = "network-down", + [DNS_TRANSACTION_NOT_FOUND] = "not-found", + [DNS_TRANSACTION_NO_SOURCE] = "no-source", + [DNS_TRANSACTION_STUB_LOOP] = "stub-loop", +}; +DEFINE_STRING_TABLE_LOOKUP(dns_transaction_state, DnsTransactionState); + +static const char* const dns_transaction_source_table[_DNS_TRANSACTION_SOURCE_MAX] = { + [DNS_TRANSACTION_NETWORK] = "network", + [DNS_TRANSACTION_CACHE] = "cache", + [DNS_TRANSACTION_ZONE] = "zone", + [DNS_TRANSACTION_TRUST_ANCHOR] = "trust-anchor", +}; +DEFINE_STRING_TABLE_LOOKUP(dns_transaction_source, DnsTransactionSource); diff --git a/src/resolve/resolved-dns-transaction.h b/src/resolve/resolved-dns-transaction.h new file mode 100644 index 0000000..2fd8720 --- /dev/null +++ b/src/resolve/resolved-dns-transaction.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" +#include "in-addr-util.h" + +typedef struct DnsTransaction DnsTransaction; +typedef struct DnsTransactionFinder 
DnsTransactionFinder; +typedef enum DnsTransactionState DnsTransactionState; +typedef enum DnsTransactionSource DnsTransactionSource; + +#include "resolved-dns-answer.h" +#include "resolved-dns-dnssec.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-question.h" +#include "resolved-dns-server.h" + +enum DnsTransactionState { + DNS_TRANSACTION_NULL, + DNS_TRANSACTION_PENDING, + DNS_TRANSACTION_VALIDATING, + DNS_TRANSACTION_RCODE_FAILURE, + DNS_TRANSACTION_SUCCESS, + DNS_TRANSACTION_NO_SERVERS, + DNS_TRANSACTION_TIMEOUT, + DNS_TRANSACTION_ATTEMPTS_MAX_REACHED, + DNS_TRANSACTION_INVALID_REPLY, + DNS_TRANSACTION_ERRNO, + DNS_TRANSACTION_ABORTED, + DNS_TRANSACTION_DNSSEC_FAILED, + DNS_TRANSACTION_NO_TRUST_ANCHOR, + DNS_TRANSACTION_RR_TYPE_UNSUPPORTED, + DNS_TRANSACTION_NETWORK_DOWN, + DNS_TRANSACTION_NOT_FOUND, /* like NXDOMAIN, but when LLMNR/TCP connections fail */ + DNS_TRANSACTION_NO_SOURCE, /* All suitable DnsTransactionSource turned off */ + DNS_TRANSACTION_STUB_LOOP, + _DNS_TRANSACTION_STATE_MAX, + _DNS_TRANSACTION_STATE_INVALID = -EINVAL, +}; + +#define DNS_TRANSACTION_IS_LIVE(state) IN_SET((state), DNS_TRANSACTION_NULL, DNS_TRANSACTION_PENDING, DNS_TRANSACTION_VALIDATING) + +enum DnsTransactionSource { + DNS_TRANSACTION_NETWORK, + DNS_TRANSACTION_CACHE, + DNS_TRANSACTION_ZONE, + DNS_TRANSACTION_TRUST_ANCHOR, + _DNS_TRANSACTION_SOURCE_MAX, + _DNS_TRANSACTION_SOURCE_INVALID = -EINVAL, +}; + +struct DnsTransaction { + DnsScope *scope; + + DnsResourceKey *key; /* For regular lookups the RR key to look for */ + DnsPacket *bypass; /* For bypass lookups the full original request packet */ + + uint64_t query_flags; + + DnsPacket *sent, *received; + + DnsAnswer *answer; + int answer_rcode; + DnssecResult answer_dnssec_result; + DnsTransactionSource answer_source; + uint32_t answer_nsec_ttl; + int answer_errno; /* if state is DNS_TRANSACTION_ERRNO */ + + DnsTransactionState state; + + /* SD_RESOLVED_AUTHENTICATED here indicates whether the primary answer is 
authenticated, i.e. whether + * the RRs from answer which directly match the question are authenticated, or, if there are none, + * whether the NODATA or NXDOMAIN case is. It says nothing about additional RRs listed in the answer, + * however they have their own DNS_ANSWER_AUTHENTICATED flags. Note that this bit is defined differently + * than the AD bit in DNS packets, as that covers more than just the actual primary answer. */ + uint64_t answer_query_flags; + + /* Contains DNSKEY, DS, SOA RRs we already verified and need + * to authenticate this reply */ + DnsAnswer *validated_keys; + + usec_t start_usec; + usec_t next_attempt_after; + sd_event_source *timeout_event_source; + unsigned n_attempts; + + /* UDP connection logic, if we need it */ + int dns_udp_fd; + sd_event_source *dns_udp_event_source; + + /* TCP connection logic, if we need it */ + DnsStream *stream; + + /* The active server */ + DnsServer *server; + + /* The features of the DNS server at time of transaction start */ + DnsServerFeatureLevel current_feature_level; + + /* If we got SERVFAIL back, we retry the lookup, using a lower feature level than we used before. */ + DnsServerFeatureLevel clamp_feature_level_servfail; + + uint16_t id; + + bool tried_stream:1; + + bool initial_jitter_scheduled:1; + bool initial_jitter_elapsed:1; + + bool probing:1; + + bool seen_timeout:1; + + /* Query candidates this transaction is referenced by and that + * shall be notified about this specific transaction + * completing. */ + Set *notify_query_candidates, *notify_query_candidates_done; + + /* Zone items this transaction is referenced by and that shall + * be notified about completion. */ + Set *notify_zone_items, *notify_zone_items_done; + + /* Other transactions that this transaction is referenced by + * and that shall be notified about completion. This is used + * when transactions want to validate their RRsets, but need + * another DNSKEY or DS RR to do so.
*/ + Set *notify_transactions, *notify_transactions_done; + + /* The opposite direction: the transactions this transaction + * created in order to request DNSKEY or DS RRs. */ + Set *dnssec_transactions; + + unsigned n_picked_servers; + + unsigned block_gc; + + LIST_FIELDS(DnsTransaction, transactions_by_scope); + LIST_FIELDS(DnsTransaction, transactions_by_stream); + LIST_FIELDS(DnsTransaction, transactions_by_key); + + /* Note: fields should be ordered to minimize alignment gaps. Use pahole! */ +}; + +int dns_transaction_new(DnsTransaction **ret, DnsScope *s, DnsResourceKey *key, DnsPacket *bypass, uint64_t flags); +DnsTransaction* dns_transaction_free(DnsTransaction *t); + +DnsTransaction* dns_transaction_gc(DnsTransaction *t); +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsTransaction*, dns_transaction_gc); + +int dns_transaction_go(DnsTransaction *t); + +void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypted); +void dns_transaction_complete(DnsTransaction *t, DnsTransactionState state); + +void dns_transaction_notify(DnsTransaction *t, DnsTransaction *source); +int dns_transaction_validate_dnssec(DnsTransaction *t); +int dns_transaction_request_dnssec_keys(DnsTransaction *t); + +static inline DnsResourceKey *dns_transaction_key(DnsTransaction *t) { + assert(t); + + /* Return the lookup key of this transaction. Either takes the lookup key from the bypass packet if + * we are a bypass transaction. Or take the configured key for regular transactions. 
*/ + + if (t->key) + return t->key; + + assert(t->bypass); + + return dns_question_first_key(t->bypass->question); +} + +static inline uint64_t dns_transaction_source_to_query_flags(DnsTransactionSource s) { + + switch (s) { + + case DNS_TRANSACTION_NETWORK: + return SD_RESOLVED_FROM_NETWORK; + + case DNS_TRANSACTION_CACHE: + return SD_RESOLVED_FROM_CACHE; + + case DNS_TRANSACTION_ZONE: + return SD_RESOLVED_FROM_ZONE; + + case DNS_TRANSACTION_TRUST_ANCHOR: + return SD_RESOLVED_FROM_TRUST_ANCHOR; + + default: + return 0; + } +} + +const char* dns_transaction_state_to_string(DnsTransactionState p) _const_; +DnsTransactionState dns_transaction_state_from_string(const char *s) _pure_; + +const char* dns_transaction_source_to_string(DnsTransactionSource p) _const_; +DnsTransactionSource dns_transaction_source_from_string(const char *s) _pure_; + +/* LLMNR Jitter interval, see RFC 4795 Section 7 */ +#define LLMNR_JITTER_INTERVAL_USEC (100 * USEC_PER_MSEC) + +/* mDNS probing interval, see RFC 6762 Section 8.1 */ +#define MDNS_PROBING_INTERVAL_USEC (250 * USEC_PER_MSEC) + +/* Maximum attempts to send DNS requests, across all DNS servers */ +#define DNS_TRANSACTION_ATTEMPTS_MAX 24 + +/* Maximum attempts to send LLMNR requests, see RFC 4795 Section 2.7 */ +#define LLMNR_TRANSACTION_ATTEMPTS_MAX 3 + +/* Maximum attempts to send MDNS requests, see RFC 6762 Section 8.1 */ +#define MDNS_TRANSACTION_ATTEMPTS_MAX 3 + +#define TRANSACTION_ATTEMPTS_MAX(p) ((p) == DNS_PROTOCOL_LLMNR ? \ + LLMNR_TRANSACTION_ATTEMPTS_MAX : \ + (p) == DNS_PROTOCOL_MDNS ? 
\ + MDNS_TRANSACTION_ATTEMPTS_MAX : \ + DNS_TRANSACTION_ATTEMPTS_MAX) diff --git a/src/resolve/resolved-dns-trust-anchor.c b/src/resolve/resolved-dns-trust-anchor.c new file mode 100644 index 0000000..1703c43 --- /dev/null +++ b/src/resolve/resolved-dns-trust-anchor.c @@ -0,0 +1,779 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "conf-files.h" +#include "constants.h" +#include "dns-domain.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "resolved-dns-dnssec.h" +#include "resolved-dns-trust-anchor.h" +#include "set.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" + +static const char trust_anchor_dirs[] = CONF_PATHS_NULSTR("dnssec-trust-anchors.d"); + +/* The second DS RR from https://data.iana.org/root-anchors/root-anchors.xml, retrieved February 2017 */ +static const uint8_t root_digest2[] = + { 0xE0, 0x6D, 0x44, 0xB8, 0x0B, 0x8F, 0x1D, 0x39, 0xA9, 0x5C, 0x0B, 0x0D, 0x7C, 0x65, 0xD0, 0x84, + 0x58, 0xE8, 0x80, 0x40, 0x9B, 0xBC, 0x68, 0x34, 0x57, 0x10, 0x42, 0x37, 0xC7, 0xF8, 0xEC, 0x8D }; + +static bool dns_trust_anchor_knows_domain_positive(DnsTrustAnchor *d, const char *name) { + assert(d); + + /* Returns true if there's an entry for the specified domain + * name in our trust anchor */ + + return + hashmap_contains(d->positive_by_key, &DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_DNSKEY, name)) || + hashmap_contains(d->positive_by_key, &DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_DS, name)); +} + +static int add_root_ksk( + DnsAnswer *answer, + DnsResourceKey *key, + uint16_t key_tag, + uint8_t algorithm, + uint8_t digest_type, + const void *digest, + size_t digest_size) { + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + int r; + + rr = dns_resource_record_new(key); + if (!rr) + return -ENOMEM; + + rr->ds.key_tag = key_tag; + rr->ds.algorithm = algorithm; + 
rr->ds.digest_type = digest_type; + rr->ds.digest_size = digest_size; + rr->ds.digest = memdup(digest, rr->ds.digest_size); + if (!rr->ds.digest) + return -ENOMEM; + + r = dns_answer_add(answer, rr, 0, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return r; + + return 0; +} + +static int dns_trust_anchor_add_builtin_positive(DnsTrustAnchor *d) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + int r; + + assert(d); + + r = hashmap_ensure_allocated(&d->positive_by_key, &dns_resource_key_hash_ops); + if (r < 0) + return r; + + /* Only add the built-in trust anchor if there's neither a DS nor a DNSKEY defined for the root domain. That + * way users have an easy way to override the root domain DS/DNSKEY data. */ + if (dns_trust_anchor_knows_domain_positive(d, ".")) + return 0; + + key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_DS, ""); + if (!key) + return -ENOMEM; + + answer = dns_answer_new(2); + if (!answer) + return -ENOMEM; + + /* Add the currently valid RRs from https://data.iana.org/root-anchors/root-anchors.xml */ + r = add_root_ksk(answer, key, 20326, DNSSEC_ALGORITHM_RSASHA256, DNSSEC_DIGEST_SHA256, root_digest2, sizeof(root_digest2)); + if (r < 0) + return r; + + r = hashmap_put(d->positive_by_key, key, answer); + if (r < 0) + return r; + + answer = NULL; + return 0; +} + +static int dns_trust_anchor_add_builtin_negative(DnsTrustAnchor *d) { + + static const char private_domains[] = + /* RFC 6761 says that .test is a special domain for + * testing and not to be installed in the root zone */ + "test\0" + + /* RFC 6761 says that these reverse IP lookup ranges + * are for private addresses, and hence should not + * show up in the root zone */ + "10.in-addr.arpa\0" + "16.172.in-addr.arpa\0" + "17.172.in-addr.arpa\0" + "18.172.in-addr.arpa\0" + "19.172.in-addr.arpa\0" + "20.172.in-addr.arpa\0" + "21.172.in-addr.arpa\0" + "22.172.in-addr.arpa\0" + "23.172.in-addr.arpa\0" + 
"24.172.in-addr.arpa\0" + "25.172.in-addr.arpa\0" + "26.172.in-addr.arpa\0" + "27.172.in-addr.arpa\0" + "28.172.in-addr.arpa\0" + "29.172.in-addr.arpa\0" + "30.172.in-addr.arpa\0" + "31.172.in-addr.arpa\0" + "168.192.in-addr.arpa\0" + + /* The same, but for IPv6. */ + "d.f.ip6.arpa\0" + + /* RFC 6762 reserves the .local domain for Multicast + * DNS, it hence cannot appear in the root zone. (Note + * that we by default do not route .local traffic to + * DNS anyway, except when a configured search domain + * suggests so.) */ + "local\0" + + /* These two are well known, popular private zone + * TLDs, that are blocked from delegation, according + * to: + * http://icannwiki.com/Name_Collision#NGPC_Resolution + * + * There's also ongoing work on making this official + * in an RRC: + * https://www.ietf.org/archive/id/draft-chapin-additional-reserved-tlds-02.txt */ + "home\0" + "corp\0" + + /* The following four TLDs are suggested for private + * zones in RFC 6762, Appendix G, and are hence very + * unlikely to be made official TLDs any day soon */ + "lan\0" + "intranet\0" + "internal\0" + "private\0" + + /* Defined by RFC 8375. The most official choice. */ + "home.arpa\0" + + /* RFC 8880 says because the 'ipv4only.arpa' zone has to + * be an insecure delegation, DNSSEC cannot be used to + * protect these answers from tampering by malicious + * devices on the path */ + "ipv4only.arpa\0" + "170.0.0.192.in-addr.arpa\0" + "171.0.0.192.in-addr.arpa\0"; + + int r; + + assert(d); + + /* Only add the built-in trust anchor if there's no negative + * trust anchor defined at all. This enables easy overriding + * of negative trust anchors. */ + + if (set_size(d->negative_by_name) > 0) + return 0; + + r = set_ensure_allocated(&d->negative_by_name, &dns_name_hash_ops); + if (r < 0) + return r; + + /* We add a couple of domains as default negative trust + * anchors, where it's very unlikely they will be installed in + * the root zone. 
If they exist they must be private, and thus + * unsigned. */ + + NULSTR_FOREACH(name, private_domains) { + if (dns_trust_anchor_knows_domain_positive(d, name)) + continue; + + r = set_put_strdup(&d->negative_by_name, name); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_trust_anchor_load_positive(DnsTrustAnchor *d, const char *path, unsigned line, const char *s) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + _cleanup_free_ char *domain = NULL, *class = NULL, *type = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnsAnswer *old_answer = NULL; + const char *p = s; + int r; + + assert(d); + assert(line); + + r = extract_first_word(&p, &domain, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_warning_errno(r, "Unable to parse domain in line %s:%u: %m", path, line); + + r = dns_name_is_valid(domain); + if (r < 0) + return log_warning_errno(r, "Failed to check validity of domain name '%s', at line %s:%u, ignoring line: %m", domain, path, line); + if (r == 0) { + log_warning("Domain name %s is invalid, at line %s:%u, ignoring line.", domain, path, line); + return -EINVAL; + } + + r = extract_many_words(&p, NULL, 0, &class, &type, NULL); + if (r < 0) + return log_warning_errno(r, "Unable to parse class and type in line %s:%u: %m", path, line); + if (r != 2) { + log_warning("Missing class or type in line %s:%u", path, line); + return -EINVAL; + } + + if (!strcaseeq(class, "IN")) { + log_warning("RR class %s is not supported, ignoring line %s:%u.", class, path, line); + return -EINVAL; + } + + if (strcaseeq(type, "DS")) { + _cleanup_free_ char *key_tag = NULL, *algorithm = NULL, *digest_type = NULL; + _cleanup_free_ void *dd = NULL; + uint16_t kt; + int a, dt; + size_t l; + + r = extract_many_words(&p, NULL, 0, &key_tag, &algorithm, &digest_type, NULL); + if (r < 0) { + log_warning_errno(r, "Failed to parse DS parameters on line %s:%u: %m", path, line); + return -EINVAL; + } + if (r != 3) { + 
log_warning("Missing DS parameters on line %s:%u", path, line); + return -EINVAL; + } + + r = safe_atou16(key_tag, &kt); + if (r < 0) + return log_warning_errno(r, "Failed to parse DS key tag %s on line %s:%u: %m", key_tag, path, line); + + a = dnssec_algorithm_from_string(algorithm); + if (a < 0) { + log_warning("Failed to parse DS algorithm %s on line %s:%u", algorithm, path, line); + return -EINVAL; + } + + dt = dnssec_digest_from_string(digest_type); + if (dt < 0) { + log_warning("Failed to parse DS digest type %s on line %s:%u", digest_type, path, line); + return -EINVAL; + } + + if (isempty(p)) { + log_warning("Missing DS digest on line %s:%u", path, line); + return -EINVAL; + } + + r = unhexmem(p, strlen(p), &dd, &l); + if (r < 0) { + log_warning("Failed to parse DS digest %s on line %s:%u", p, path, line); + return -EINVAL; + } + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, domain); + if (!rr) + return log_oom(); + + rr->ds.key_tag = kt; + rr->ds.algorithm = a; + rr->ds.digest_type = dt; + rr->ds.digest_size = l; + rr->ds.digest = TAKE_PTR(dd); + + } else if (strcaseeq(type, "DNSKEY")) { + _cleanup_free_ char *flags = NULL, *protocol = NULL, *algorithm = NULL; + _cleanup_free_ void *k = NULL; + uint16_t f; + size_t l; + int a; + + r = extract_many_words(&p, NULL, 0, &flags, &protocol, &algorithm, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to parse DNSKEY parameters on line %s:%u: %m", path, line); + if (r != 3) { + log_warning("Missing DNSKEY parameters on line %s:%u", path, line); + return -EINVAL; + } + + if (!streq(protocol, "3")) { + log_warning("DNSKEY Protocol is not 3 on line %s:%u", path, line); + return -EINVAL; + } + + r = safe_atou16(flags, &f); + if (r < 0) + return log_warning_errno(r, "Failed to parse DNSKEY flags field %s on line %s:%u", flags, path, line); + if ((f & DNSKEY_FLAG_ZONE_KEY) == 0) { + log_warning("DNSKEY lacks zone key bit set on line %s:%u", path, line); + return -EINVAL; + } + if ((f & 
DNSKEY_FLAG_REVOKE)) { + log_warning("DNSKEY is already revoked on line %s:%u", path, line); + return -EINVAL; + } + + a = dnssec_algorithm_from_string(algorithm); + if (a < 0) { + log_warning("Failed to parse DNSKEY algorithm %s on line %s:%u", algorithm, path, line); + return -EINVAL; + } + + if (isempty(p)) { + log_warning("Missing DNSKEY key on line %s:%u", path, line); + return -EINVAL; + } + + r = unbase64mem(p, strlen(p), &k, &l); + if (r < 0) + return log_warning_errno(r, "Failed to parse DNSKEY key data %s on line %s:%u", p, path, line); + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, domain); + if (!rr) + return log_oom(); + + rr->dnskey.flags = f; + rr->dnskey.protocol = 3; + rr->dnskey.algorithm = a; + rr->dnskey.key_size = l; + rr->dnskey.key = TAKE_PTR(k); + + } else { + log_warning("RR type %s is not supported, ignoring line %s:%u.", type, path, line); + return -EINVAL; + } + + r = hashmap_ensure_allocated(&d->positive_by_key, &dns_resource_key_hash_ops); + if (r < 0) + return log_oom(); + + old_answer = hashmap_get(d->positive_by_key, rr->key); + answer = dns_answer_ref(old_answer); + + r = dns_answer_add_extend(&answer, rr, 0, DNS_ANSWER_AUTHENTICATED, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add trust anchor RR: %m"); + + r = hashmap_replace(d->positive_by_key, rr->key, answer); + if (r < 0) + return log_error_errno(r, "Failed to add answer to trust anchor: %m"); + + old_answer = dns_answer_unref(old_answer); + answer = NULL; + + return 0; +} + +static int dns_trust_anchor_load_negative(DnsTrustAnchor *d, const char *path, unsigned line, const char *s) { + _cleanup_free_ char *domain = NULL; + const char *p = s; + int r; + + assert(d); + assert(line); + + r = extract_first_word(&p, &domain, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_warning_errno(r, "Unable to parse line %s:%u: %m", path, line); + + r = dns_name_is_valid(domain); + if (r < 0) + return log_warning_errno(r, "Failed to check validity of 
domain name '%s', at line %s:%u, ignoring line: %m", domain, path, line); + if (r == 0) { + log_warning("Domain name %s is invalid, at line %s:%u, ignoring line.", domain, path, line); + return -EINVAL; + } + + if (!isempty(p)) { + log_warning("Trailing garbage at line %s:%u, ignoring line.", path, line); + return -EINVAL; + } + + r = set_ensure_consume(&d->negative_by_name, &dns_name_hash_ops, TAKE_PTR(domain)); + if (r < 0) + return log_oom(); + + return 0; +} + +static int dns_trust_anchor_load_files( + DnsTrustAnchor *d, + const char *suffix, + int (*loader)(DnsTrustAnchor *d, const char *path, unsigned n, const char *line)) { + + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(d); + assert(suffix); + assert(loader); + + r = conf_files_list_nulstr(&files, suffix, NULL, 0, trust_anchor_dirs); + if (r < 0) + return log_error_errno(r, "Failed to enumerate %s trust anchor files: %m", suffix); + + STRV_FOREACH(f, files) { + _cleanup_fclose_ FILE *g = NULL; + unsigned n = 0; + + g = fopen(*f, "re"); + if (!g) { + if (errno == ENOENT) + continue; + + log_warning_errno(errno, "Failed to open '%s', ignoring: %m", *f); + continue; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(g, LONG_LINE_MAX, &line); + if (r < 0) { + log_warning_errno(r, "Failed to read '%s', ignoring: %m", *f); + break; + } + if (r == 0) + break; + + n++; + + if (isempty(line)) + continue; + + if (*line == ';') + continue; + + (void) loader(d, *f, n, line); + } + } + + return 0; +} + +static int domain_name_cmp(char * const *a, char * const *b) { + return dns_name_compare_func(*a, *b); +} + +static int dns_trust_anchor_dump(DnsTrustAnchor *d) { + DnsAnswer *a; + + assert(d); + + if (hashmap_isempty(d->positive_by_key)) + log_info("No positive trust anchors defined."); + else { + log_info("Positive Trust Anchors:"); + HASHMAP_FOREACH(a, d->positive_by_key) { + DnsResourceRecord *rr; + + DNS_ANSWER_FOREACH(rr, a) + log_info("%s", 
dns_resource_record_to_string(rr)); + } + } + + if (set_isempty(d->negative_by_name)) + log_info("No negative trust anchors defined."); + else { + _cleanup_free_ char **l = NULL, *j = NULL; + + l = set_get_strv(d->negative_by_name); + if (!l) + return log_oom(); + + typesafe_qsort(l, set_size(d->negative_by_name), domain_name_cmp); + + j = strv_join(l, " "); + if (!j) + return log_oom(); + + log_info("Negative trust anchors: %s", j); + } + + return 0; +} + +int dns_trust_anchor_load(DnsTrustAnchor *d) { + int r; + + assert(d); + + /* If loading things from disk fails, we don't consider this fatal */ + (void) dns_trust_anchor_load_files(d, ".positive", dns_trust_anchor_load_positive); + (void) dns_trust_anchor_load_files(d, ".negative", dns_trust_anchor_load_negative); + + /* However, if the built-in DS fails, then we have a problem. */ + r = dns_trust_anchor_add_builtin_positive(d); + if (r < 0) + return log_error_errno(r, "Failed to add built-in positive trust anchor: %m"); + + r = dns_trust_anchor_add_builtin_negative(d); + if (r < 0) + return log_error_errno(r, "Failed to add built-in negative trust anchor: %m"); + + dns_trust_anchor_dump(d); + + return 0; +} + +void dns_trust_anchor_flush(DnsTrustAnchor *d) { + assert(d); + + d->positive_by_key = hashmap_free_with_destructor(d->positive_by_key, dns_answer_unref); + d->revoked_by_rr = set_free_with_destructor(d->revoked_by_rr, dns_resource_record_unref); + d->negative_by_name = set_free_free(d->negative_by_name); +} + +int dns_trust_anchor_lookup_positive(DnsTrustAnchor *d, const DnsResourceKey *key, DnsAnswer **ret) { + DnsAnswer *a; + + assert(d); + assert(key); + assert(ret); + + /* We only serve DS and DNSKEY RRs. 
*/ + if (!IN_SET(key->type, DNS_TYPE_DS, DNS_TYPE_DNSKEY)) + return 0; + + a = hashmap_get(d->positive_by_key, key); + if (!a) + return 0; + + *ret = dns_answer_ref(a); + return 1; +} + +int dns_trust_anchor_lookup_negative(DnsTrustAnchor *d, const char *name) { + int r; + + assert(d); + assert(name); + + for (;;) { + /* If the domain is listed as-is in the NTA database, then that counts */ + if (set_contains(d->negative_by_name, name)) + return true; + + /* If the domain isn't listed as NTA, but is listed as positive trust anchor (DS or DNSKEY), then that counts. See RFC + * 7646, section 1.1 */ + if (hashmap_contains(d->positive_by_key, &DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_DS, name))) + return false; + + if (hashmap_contains(d->positive_by_key, &DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_DNSKEY, name))) + return false; + + /* And now, let's look at the parent, and check that too */ + r = dns_name_parent(&name); + if (r < 0) + return r; + if (r == 0) + break; + } + + return false; +} + +static int dns_trust_anchor_revoked_put(DnsTrustAnchor *d, DnsResourceRecord *rr) { + int r; + + assert(d); + + r = set_ensure_put(&d->revoked_by_rr, &dns_resource_record_hash_ops, rr); + if (r < 0) + return r; + if (r > 0) + dns_resource_record_ref(rr); + + return r; +} + +static int dns_trust_anchor_remove_revoked(DnsTrustAnchor *d, DnsResourceRecord *rr) { + _cleanup_(dns_answer_unrefp) DnsAnswer *new_answer = NULL; + DnsAnswer *old_answer; + DnsAnswerItem *item; + int r; + + /* Remember that this is a revoked trust anchor RR */ + r = dns_trust_anchor_revoked_put(d, rr); + if (r < 0) + return r; + + /* Remove this from the positive trust anchor */ + old_answer = hashmap_get(d->positive_by_key, rr->key); + if (!old_answer) + return 0; + + new_answer = dns_answer_ref(old_answer); + + r = dns_answer_remove_by_rr(&new_answer, rr); + if (r <= 0) + return r; + + /* We found the key! 
Warn the user */ + log_struct(LOG_WARNING, + "MESSAGE_ID=" SD_MESSAGE_DNSSEC_TRUST_ANCHOR_REVOKED_STR, + LOG_MESSAGE("DNSSEC trust anchor %s has been revoked.\n" + "Please update the trust anchor, or upgrade your operating system.", + strna(dns_resource_record_to_string(rr))), + "TRUST_ANCHOR=%s", dns_resource_record_to_string(rr)); + + if (dns_answer_size(new_answer) <= 0) { + assert_se(hashmap_remove(d->positive_by_key, rr->key) == old_answer); + dns_answer_unref(old_answer); + return 1; + } + + item = ordered_set_first(new_answer->items); + r = hashmap_replace(d->positive_by_key, item->rr->key, new_answer); + if (r < 0) + return r; + + TAKE_PTR(new_answer); + dns_answer_unref(old_answer); + return 1; +} + +static int dns_trust_anchor_check_revoked_one(DnsTrustAnchor *d, DnsResourceRecord *revoked_dnskey) { + DnsAnswer *a; + int r; + + assert(d); + assert(revoked_dnskey); + assert(revoked_dnskey->key->type == DNS_TYPE_DNSKEY); + assert(revoked_dnskey->dnskey.flags & DNSKEY_FLAG_REVOKE); + + a = hashmap_get(d->positive_by_key, revoked_dnskey->key); + if (a) { + DnsResourceRecord *anchor; + + /* First, look for the precise DNSKEY in our trust anchor database */ + + DNS_ANSWER_FOREACH(anchor, a) { + + if (anchor->dnskey.protocol != revoked_dnskey->dnskey.protocol) + continue; + + if (anchor->dnskey.algorithm != revoked_dnskey->dnskey.algorithm) + continue; + + if (anchor->dnskey.key_size != revoked_dnskey->dnskey.key_size) + continue; + + /* Note that we allow the REVOKE bit to be + * different! 
It will be set in the revoked + * key, but unset in our version of it */ + if (((anchor->dnskey.flags ^ revoked_dnskey->dnskey.flags) | DNSKEY_FLAG_REVOKE) != DNSKEY_FLAG_REVOKE) + continue; + + if (memcmp(anchor->dnskey.key, revoked_dnskey->dnskey.key, anchor->dnskey.key_size) != 0) + continue; + + dns_trust_anchor_remove_revoked(d, anchor); + break; + } + } + + a = hashmap_get(d->positive_by_key, &DNS_RESOURCE_KEY_CONST(revoked_dnskey->key->class, DNS_TYPE_DS, dns_resource_key_name(revoked_dnskey->key))); + if (a) { + DnsResourceRecord *anchor; + + /* Second, look for DS RRs matching this DNSKEY in our trust anchor database */ + + DNS_ANSWER_FOREACH(anchor, a) { + + /* We set mask_revoke to true here, since our + * DS fingerprint will be the one of the + * unrevoked DNSKEY, but the one we got passed + * here has the bit set. */ + r = dnssec_verify_dnskey_by_ds(revoked_dnskey, anchor, true); + if (r < 0) + return r; + if (r == 0) + continue; + + dns_trust_anchor_remove_revoked(d, anchor); + break; + } + } + + return 0; +} + +int dns_trust_anchor_check_revoked(DnsTrustAnchor *d, DnsResourceRecord *dnskey, DnsAnswer *rrs) { + DnsResourceRecord *rrsig; + int r; + + assert(d); + assert(dnskey); + + /* Looks if "dnskey" is a self-signed RR that has been revoked + * and matches one of our trust anchor entries. If so, removes + * it from the trust anchor and returns > 0. */ + + if (dnskey->key->type != DNS_TYPE_DNSKEY) + return 0; + + /* Is this DNSKEY revoked? */ + if ((dnskey->dnskey.flags & DNSKEY_FLAG_REVOKE) == 0) + return 0; + + /* Could this be interesting to us at all? If not, + * there's no point in looking for and verifying a + * self-signed RRSIG. 
*/ + if (!dns_trust_anchor_knows_domain_positive(d, dns_resource_key_name(dnskey->key))) + return 0; + + /* Look for a self-signed RRSIG in the other rrs belonging to this DNSKEY */ + DNS_ANSWER_FOREACH(rrsig, rrs) { + DnssecResult result; + + if (rrsig->key->type != DNS_TYPE_RRSIG) + continue; + + r = dnssec_rrsig_match_dnskey(rrsig, dnskey, true); + if (r < 0) + return r; + if (r == 0) + continue; + + r = dnssec_verify_rrset(rrs, dnskey->key, rrsig, dnskey, USEC_INFINITY, &result); + if (r < 0) + return r; + if (result != DNSSEC_VALIDATED) + continue; + + /* Bingo! This is a revoked self-signed DNSKEY. Let's + * see if this precise one exists in our trust anchor + * database, too. */ + r = dns_trust_anchor_check_revoked_one(d, dnskey); + if (r < 0) + return r; + + return 1; + } + + return 0; +} + +int dns_trust_anchor_is_revoked(DnsTrustAnchor *d, DnsResourceRecord *rr) { + assert(d); + + if (!IN_SET(rr->key->type, DNS_TYPE_DS, DNS_TYPE_DNSKEY)) + return 0; + + return set_contains(d->revoked_by_rr, rr); +} diff --git a/src/resolve/resolved-dns-trust-anchor.h b/src/resolve/resolved-dns-trust-anchor.h new file mode 100644 index 0000000..14047ec --- /dev/null +++ b/src/resolve/resolved-dns-trust-anchor.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct DnsTrustAnchor DnsTrustAnchor; + +#include "hashmap.h" +#include "resolved-dns-answer.h" +#include "resolved-dns-rr.h" + +/* This contains a fixed database mapping domain names to DS or DNSKEY records. 
*/ + +struct DnsTrustAnchor { + Hashmap *positive_by_key; + Set *negative_by_name; + Set *revoked_by_rr; +}; + +int dns_trust_anchor_load(DnsTrustAnchor *d); +void dns_trust_anchor_flush(DnsTrustAnchor *d); + +int dns_trust_anchor_lookup_positive(DnsTrustAnchor *d, const DnsResourceKey* key, DnsAnswer **answer); +int dns_trust_anchor_lookup_negative(DnsTrustAnchor *d, const char *name); + +int dns_trust_anchor_check_revoked(DnsTrustAnchor *d, DnsResourceRecord *dnskey, DnsAnswer *rrs); +int dns_trust_anchor_is_revoked(DnsTrustAnchor *d, DnsResourceRecord *rr); diff --git a/src/resolve/resolved-dns-zone.c b/src/resolve/resolved-dns-zone.c new file mode 100644 index 0000000..f533f97 --- /dev/null +++ b/src/resolve/resolved-dns-zone.c @@ -0,0 +1,686 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dns-domain.h" +#include "list.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-zone.h" +#include "resolved-dnssd.h" +#include "resolved-manager.h" +#include "string-util.h" + +/* Never allow more than 1K entries */ +#define ZONE_MAX 1024 + +void dns_zone_item_probe_stop(DnsZoneItem *i) { + DnsTransaction *t; + assert(i); + + if (!i->probe_transaction) + return; + + t = TAKE_PTR(i->probe_transaction); + + set_remove(t->notify_zone_items, i); + set_remove(t->notify_zone_items_done, i); + dns_transaction_gc(t); +} + +static DnsZoneItem* dns_zone_item_free(DnsZoneItem *i) { + if (!i) + return NULL; + + dns_zone_item_probe_stop(i); + dns_resource_record_unref(i->rr); + + return mfree(i); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(DnsZoneItem*, dns_zone_item_free); + +static void dns_zone_item_remove_and_free(DnsZone *z, DnsZoneItem *i) { + DnsZoneItem *first; + + assert(z); + + if (!i) + return; + + first = hashmap_get(z->by_key, i->rr->key); + LIST_REMOVE(by_key, first, i); + if (first) + assert_se(hashmap_replace(z->by_key, first->rr->key, first) >= 0); + else + hashmap_remove(z->by_key, i->rr->key); + + first = hashmap_get(z->by_name, 
dns_resource_key_name(i->rr->key)); + LIST_REMOVE(by_name, first, i); + if (first) + assert_se(hashmap_replace(z->by_name, dns_resource_key_name(first->rr->key), first) >= 0); + else + hashmap_remove(z->by_name, dns_resource_key_name(i->rr->key)); + + dns_zone_item_free(i); +} + +void dns_zone_flush(DnsZone *z) { + DnsZoneItem *i; + + assert(z); + + while ((i = hashmap_first(z->by_key))) + dns_zone_item_remove_and_free(z, i); + + assert(hashmap_size(z->by_key) == 0); + assert(hashmap_size(z->by_name) == 0); + + z->by_key = hashmap_free(z->by_key); + z->by_name = hashmap_free(z->by_name); +} + +DnsZoneItem* dns_zone_get(DnsZone *z, DnsResourceRecord *rr) { + assert(z); + assert(rr); + + LIST_FOREACH(by_key, i, (DnsZoneItem*) hashmap_get(z->by_key, rr->key)) + if (dns_resource_record_equal(i->rr, rr) > 0) + return i; + + return NULL; +} + +void dns_zone_remove_rr(DnsZone *z, DnsResourceRecord *rr) { + DnsZoneItem *i; + + assert(z); + + if (!rr) + return; + + i = dns_zone_get(z, rr); + if (i) + dns_zone_item_remove_and_free(z, i); +} + +int dns_zone_remove_rrs_by_key(DnsZone *z, DnsResourceKey *key) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL, *soa = NULL; + DnsResourceRecord *rr; + bool tentative; + int r; + + r = dns_zone_lookup(z, key, 0, &answer, &soa, &tentative); + if (r < 0) + return r; + + DNS_ANSWER_FOREACH(rr, answer) + dns_zone_remove_rr(z, rr); + + return 0; +} + +static int dns_zone_init(DnsZone *z) { + int r; + + assert(z); + + r = hashmap_ensure_allocated(&z->by_key, &dns_resource_key_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&z->by_name, &dns_name_hash_ops); + if (r < 0) + return r; + + return 0; +} + +static int dns_zone_link_item(DnsZone *z, DnsZoneItem *i) { + DnsZoneItem *first; + int r; + + first = hashmap_get(z->by_key, i->rr->key); + if (first) { + LIST_PREPEND(by_key, first, i); + assert_se(hashmap_replace(z->by_key, first->rr->key, first) >= 0); + } else { + r = hashmap_put(z->by_key, i->rr->key, i); + 
if (r < 0) + return r; + } + + first = hashmap_get(z->by_name, dns_resource_key_name(i->rr->key)); + if (first) { + LIST_PREPEND(by_name, first, i); + assert_se(hashmap_replace(z->by_name, dns_resource_key_name(first->rr->key), first) >= 0); + } else { + r = hashmap_put(z->by_name, dns_resource_key_name(i->rr->key), i); + if (r < 0) + return r; + } + + return 0; +} + +static int dns_zone_item_probe_start(DnsZoneItem *i) { + _cleanup_(dns_transaction_gcp) DnsTransaction *t = NULL; + int r; + + assert(i); + + if (i->probe_transaction) + return 0; + + t = dns_scope_find_transaction( + i->scope, + &DNS_RESOURCE_KEY_CONST(i->rr->key->class, DNS_TYPE_ANY, dns_resource_key_name(i->rr->key)), + SD_RESOLVED_NO_CACHE|SD_RESOLVED_NO_ZONE); + if (!t) { + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + + key = dns_resource_key_new(i->rr->key->class, DNS_TYPE_ANY, dns_resource_key_name(i->rr->key)); + if (!key) + return -ENOMEM; + + r = dns_transaction_new(&t, i->scope, key, NULL, SD_RESOLVED_NO_CACHE|SD_RESOLVED_NO_ZONE); + if (r < 0) + return r; + } + + r = set_ensure_allocated(&t->notify_zone_items_done, NULL); + if (r < 0) + return r; + + r = set_ensure_put(&t->notify_zone_items, NULL, i); + if (r < 0) + return r; + + t->probing = true; + i->probe_transaction = TAKE_PTR(t); + + if (i->probe_transaction->state == DNS_TRANSACTION_NULL) { + i->block_ready++; + r = dns_transaction_go(i->probe_transaction); + i->block_ready--; + + if (r < 0) { + dns_zone_item_probe_stop(i); + return r; + } + } + + dns_zone_item_notify(i); + return 0; +} + +int dns_zone_put(DnsZone *z, DnsScope *s, DnsResourceRecord *rr, bool probe) { + _cleanup_(dns_zone_item_freep) DnsZoneItem *i = NULL; + DnsZoneItem *existing; + int r; + + assert(z); + assert(s); + assert(rr); + + if (dns_class_is_pseudo(rr->key->class)) + return -EINVAL; + if (dns_type_is_pseudo(rr->key->type)) + return -EINVAL; + + existing = dns_zone_get(z, rr); + if (existing) + return 0; + + r = dns_zone_init(z); + if (r 
< 0) + return r; + + i = new(DnsZoneItem, 1); + if (!i) + return -ENOMEM; + + *i = (DnsZoneItem) { + .scope = s, + .rr = dns_resource_record_ref(rr), + .probing_enabled = probe, + }; + + r = dns_zone_link_item(z, i); + if (r < 0) + return r; + + if (probe) { + bool established = false; + + /* Check if there's already an RR with the same name + * established. If so, it has been probed already, and + * we don't need to probe again. */ + + LIST_FOREACH_OTHERS(by_name, j, i) + if (j->state == DNS_ZONE_ITEM_ESTABLISHED) + established = true; + + if (established) + i->state = DNS_ZONE_ITEM_ESTABLISHED; + else { + i->state = DNS_ZONE_ITEM_PROBING; + + r = dns_zone_item_probe_start(i); + if (r < 0) { + dns_zone_item_remove_and_free(z, i); + i = NULL; + return r; + } + } + } else + i->state = DNS_ZONE_ITEM_ESTABLISHED; + + i = NULL; + return 0; +} + +static int dns_zone_add_authenticated_answer(DnsAnswer *a, DnsZoneItem *i, int ifindex) { + DnsAnswerFlags flags; + + /* From RFC 6762, Section 10.2 + * "They (the rules about when to set the cache-flush bit) apply to + * startup announcements as described in Section 8.3, "Announcing", + * and to responses generated as a result of receiving query messages." + * So, set the cache-flush bit for mDNS answers except for DNS-SD + * service enumeration PTRs described in RFC 6763, Section 4.1. */ + if (i->scope->protocol == DNS_PROTOCOL_MDNS && + !dns_resource_key_is_dnssd_ptr(i->rr->key)) + flags = DNS_ANSWER_AUTHENTICATED|DNS_ANSWER_CACHE_FLUSH; + else + flags = DNS_ANSWER_AUTHENTICATED; + + return dns_answer_add(a, i->rr, ifindex, flags, NULL); +} + +int dns_zone_lookup(DnsZone *z, DnsResourceKey *key, int ifindex, DnsAnswer **ret_answer, DnsAnswer **ret_soa, bool *ret_tentative) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL, *soa = NULL; + unsigned n_answer = 0; + DnsZoneItem *first; + bool tentative = true, need_soa = false; + int r; + + /* Note that we don't actually need the ifindex for anything. 
However when it is passed we'll initialize the + * ifindex field in the answer with it */ + + assert(z); + assert(key); + assert(ret_answer); + + /* First iteration, count what we have */ + + if (key->type == DNS_TYPE_ANY || key->class == DNS_CLASS_ANY) { + bool found = false, added = false; + int k; + + /* If this is a generic match, then we have to + * go through the list by the name and look + * for everything manually */ + + first = hashmap_get(z->by_name, dns_resource_key_name(key)); + LIST_FOREACH(by_name, j, first) { + if (!IN_SET(j->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + found = true; + + k = dns_resource_key_match_rr(key, j->rr, NULL); + if (k < 0) + return k; + if (k > 0) { + n_answer++; + added = true; + } + + } + + if (found && !added) + need_soa = true; + + } else { + bool found = false; + + /* If this is a specific match, then look for + * the right key immediately */ + + first = hashmap_get(z->by_key, key); + LIST_FOREACH(by_key, j, first) { + if (!IN_SET(j->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + found = true; + n_answer++; + } + + if (!found) { + first = hashmap_get(z->by_name, dns_resource_key_name(key)); + LIST_FOREACH(by_name, j, first) { + if (!IN_SET(j->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + need_soa = true; + break; + } + } + } + + if (n_answer <= 0 && !need_soa) + goto return_empty; + + if (n_answer > 0) { + answer = dns_answer_new(n_answer); + if (!answer) + return -ENOMEM; + } + + if (need_soa) { + soa = dns_answer_new(1); + if (!soa) + return -ENOMEM; + } + + /* Second iteration, actually add the RRs to the answers */ + if (key->type == DNS_TYPE_ANY || key->class == DNS_CLASS_ANY) { + bool found = false, added = false; + int k; + + first = hashmap_get(z->by_name, dns_resource_key_name(key)); + LIST_FOREACH(by_name, j, first) { + if (!IN_SET(j->state, 
DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + found = true; + + if (j->state != DNS_ZONE_ITEM_PROBING) + tentative = false; + + k = dns_resource_key_match_rr(key, j->rr, NULL); + if (k < 0) + return k; + if (k > 0) { + r = dns_zone_add_authenticated_answer(answer, j, ifindex); + if (r < 0) + return r; + + added = true; + } + } + + if (found && !added) { + r = dns_answer_add_soa(soa, dns_resource_key_name(key), LLMNR_DEFAULT_TTL, ifindex); + if (r < 0) + return r; + } + } else { + bool found = false; + + first = hashmap_get(z->by_key, key); + LIST_FOREACH(by_key, j, first) { + if (!IN_SET(j->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + found = true; + + if (j->state != DNS_ZONE_ITEM_PROBING) + tentative = false; + + r = dns_zone_add_authenticated_answer(answer, j, ifindex); + if (r < 0) + return r; + } + + if (!found) { + bool add_soa = false; + + first = hashmap_get(z->by_name, dns_resource_key_name(key)); + LIST_FOREACH(by_name, j, first) { + if (!IN_SET(j->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + if (j->state != DNS_ZONE_ITEM_PROBING) + tentative = false; + + add_soa = true; + } + + if (add_soa) { + r = dns_answer_add_soa(soa, dns_resource_key_name(key), LLMNR_DEFAULT_TTL, ifindex); + if (r < 0) + return r; + } + } + } + + /* If the caller sets ret_tentative to NULL, then use this as + * indication to not return tentative entries */ + + if (!ret_tentative && tentative) + goto return_empty; + + *ret_answer = TAKE_PTR(answer); + + if (ret_soa) + *ret_soa = TAKE_PTR(soa); + + if (ret_tentative) + *ret_tentative = tentative; + + return 1; + +return_empty: + *ret_answer = NULL; + + if (ret_soa) + *ret_soa = NULL; + + if (ret_tentative) + *ret_tentative = false; + + return 0; +} + +void dns_zone_item_conflict(DnsZoneItem *i) { + assert(i); + + if (!IN_SET(i->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_VERIFYING, 
DNS_ZONE_ITEM_ESTABLISHED)) + return; + + log_info("Detected conflict on %s", strna(dns_resource_record_to_string(i->rr))); + + dns_zone_item_probe_stop(i); + + /* Withdraw the conflict item */ + i->state = DNS_ZONE_ITEM_WITHDRAWN; + + (void) dnssd_signal_conflict(i->scope->manager, dns_resource_key_name(i->rr->key)); + + /* Maybe change the hostname */ + if (manager_is_own_hostname(i->scope->manager, dns_resource_key_name(i->rr->key)) > 0) + manager_next_hostname(i->scope->manager); +} + +void dns_zone_item_notify(DnsZoneItem *i) { + assert(i); + assert(i->probe_transaction); + + if (i->block_ready > 0) + return; + + if (IN_SET(i->probe_transaction->state, DNS_TRANSACTION_NULL, DNS_TRANSACTION_PENDING, DNS_TRANSACTION_VALIDATING)) + return; + + if (i->probe_transaction->state == DNS_TRANSACTION_SUCCESS) { + bool we_lost = false; + + /* The probe got a successful reply. If we so far + * weren't established we just give up. + * + * In LLMNR case if we already + * were established, and the peer has the + * lexicographically larger IP address we continue + * and defend it. 
*/ + + if (!IN_SET(i->state, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) { + log_debug("Got a successful probe for not yet established RR, we lost."); + we_lost = true; + } else if (i->probe_transaction->scope->protocol == DNS_PROTOCOL_LLMNR) { + assert(i->probe_transaction->received); + we_lost = memcmp(&i->probe_transaction->received->sender, &i->probe_transaction->received->destination, FAMILY_ADDRESS_SIZE(i->probe_transaction->received->family)) < 0; + if (we_lost) + log_debug("Got a successful probe reply for an established RR, and we have a lexicographically larger IP address and thus lost."); + } + + if (we_lost) { + dns_zone_item_conflict(i); + return; + } + + log_debug("Got a successful probe reply, but peer has lexicographically lower IP address and thus lost."); + } + + log_debug("Record %s successfully probed.", strna(dns_resource_record_to_string(i->rr))); + + dns_zone_item_probe_stop(i); + i->state = DNS_ZONE_ITEM_ESTABLISHED; +} + +static int dns_zone_item_verify(DnsZoneItem *i) { + int r; + + assert(i); + + if (i->state != DNS_ZONE_ITEM_ESTABLISHED) + return 0; + + log_debug("Verifying RR %s", strna(dns_resource_record_to_string(i->rr))); + + i->state = DNS_ZONE_ITEM_VERIFYING; + r = dns_zone_item_probe_start(i); + if (r < 0) { + log_error_errno(r, "Failed to start probing for verifying RR: %m"); + i->state = DNS_ZONE_ITEM_ESTABLISHED; + return r; + } + + return 0; +} + +int dns_zone_check_conflicts(DnsZone *zone, DnsResourceRecord *rr) { + DnsZoneItem *first; + int c = 0; + + assert(zone); + assert(rr); + + /* This checks whether a response RR we received from somebody + * else is one that we actually thought was uniquely ours. If + * so, we'll verify our RRs. */ + + /* No conflict if we don't have the name at all. 
*/ + first = hashmap_get(zone->by_name, dns_resource_key_name(rr->key)); + if (!first) + return 0; + + /* No conflict if we have the exact same RR */ + if (dns_zone_get(zone, rr)) + return 0; + + /* No conflict if it is DNS-SD RR used for service enumeration. */ + if (dns_resource_key_is_dnssd_ptr(rr->key)) + return 0; + + /* OK, somebody else has RRs for the same name. Yuck! Let's + * start probing again */ + + LIST_FOREACH(by_name, i, first) { + if (dns_resource_record_equal(i->rr, rr)) + continue; + + dns_zone_item_verify(i); + c++; + } + + return c; +} + +int dns_zone_verify_conflicts(DnsZone *zone, DnsResourceKey *key) { + DnsZoneItem *first; + int c = 0; + + assert(zone); + + /* Somebody else notified us about a possible conflict. Let's + * verify if that's true. */ + + first = hashmap_get(zone->by_name, dns_resource_key_name(key)); + if (!first) + return 0; + + LIST_FOREACH(by_name, i, first) { + dns_zone_item_verify(i); + c++; + } + + return c; +} + +void dns_zone_verify_all(DnsZone *zone) { + DnsZoneItem *i; + + assert(zone); + + HASHMAP_FOREACH(i, zone->by_key) + LIST_FOREACH(by_key, j, i) + dns_zone_item_verify(j); +} + +void dns_zone_dump(DnsZone *zone, FILE *f) { + DnsZoneItem *i; + + if (!zone) + return; + + if (!f) + f = stdout; + + HASHMAP_FOREACH(i, zone->by_key) + LIST_FOREACH(by_key, j, i) { + const char *t; + + t = dns_resource_record_to_string(j->rr); + if (!t) { + log_oom(); + continue; + } + + fputc('\t', f); + fputs(t, f); + fputc('\n', f); + } +} + +bool dns_zone_is_empty(DnsZone *zone) { + if (!zone) + return true; + + return hashmap_isempty(zone->by_key); +} + +bool dns_zone_contains_name(DnsZone *z, const char *name) { + DnsZoneItem *first; + + first = hashmap_get(z->by_name, name); + if (!first) + return false; + + LIST_FOREACH(by_name, i, first) { + if (!IN_SET(i->state, DNS_ZONE_ITEM_PROBING, DNS_ZONE_ITEM_ESTABLISHED, DNS_ZONE_ITEM_VERIFYING)) + continue; + + return true; + } + + return false; +} diff --git 
a/src/resolve/resolved-dns-zone.h b/src/resolve/resolved-dns-zone.h new file mode 100644 index 0000000..1f5a6e0 --- /dev/null +++ b/src/resolve/resolved-dns-zone.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" + +typedef struct DnsZone { + Hashmap *by_key; + Hashmap *by_name; +} DnsZone; + +typedef struct DnsZoneItem DnsZoneItem; +typedef enum DnsZoneItemState DnsZoneItemState; + +#include "resolved-dns-answer.h" +#include "resolved-dns-question.h" +#include "resolved-dns-rr.h" +#include "resolved-dns-transaction.h" + +/* RFC 4795 Section 2.8. suggests a TTL of 30s by default */ +#define LLMNR_DEFAULT_TTL (30) + +/* RFC 6762 Section 10. suggests a TTL of 120s by default */ +#define MDNS_DEFAULT_TTL (120) + +enum DnsZoneItemState { + DNS_ZONE_ITEM_PROBING, + DNS_ZONE_ITEM_ESTABLISHED, + DNS_ZONE_ITEM_VERIFYING, + DNS_ZONE_ITEM_WITHDRAWN, +}; + +struct DnsZoneItem { + DnsScope *scope; + DnsResourceRecord *rr; + + DnsZoneItemState state; + + unsigned block_ready; + + bool probing_enabled; + + LIST_FIELDS(DnsZoneItem, by_key); + LIST_FIELDS(DnsZoneItem, by_name); + + DnsTransaction *probe_transaction; +}; + +void dns_zone_flush(DnsZone *z); + +int dns_zone_put(DnsZone *z, DnsScope *s, DnsResourceRecord *rr, bool probe); +DnsZoneItem* dns_zone_get(DnsZone *z, DnsResourceRecord *rr); +void dns_zone_remove_rr(DnsZone *z, DnsResourceRecord *rr); +int dns_zone_remove_rrs_by_key(DnsZone *z, DnsResourceKey *key); + +int dns_zone_lookup(DnsZone *z, DnsResourceKey *key, int ifindex, DnsAnswer **answer, DnsAnswer **soa, bool *tentative); + +void dns_zone_item_conflict(DnsZoneItem *i); +void dns_zone_item_notify(DnsZoneItem *i); + +int dns_zone_check_conflicts(DnsZone *zone, DnsResourceRecord *rr); +int dns_zone_verify_conflicts(DnsZone *zone, DnsResourceKey *key); + +void dns_zone_verify_all(DnsZone *zone); + +void dns_zone_item_probe_stop(DnsZoneItem *i); + +void dns_zone_dump(DnsZone *zone, FILE *f); +bool 
dns_zone_is_empty(DnsZone *zone); +bool dns_zone_contains_name(DnsZone *z, const char *name); diff --git a/src/resolve/resolved-dnssd-bus.c b/src/resolve/resolved-dnssd-bus.c new file mode 100644 index 0000000..0f0d478 --- /dev/null +++ b/src/resolve/resolved-dnssd-bus.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-polkit.h" +#include "missing_capability.h" +#include "resolved-dnssd-bus.h" +#include "resolved-dnssd.h" +#include "resolved-link.h" +#include "resolved-manager.h" +#include "strv.h" +#include "user-util.h" + +int bus_dnssd_method_unregister(sd_bus_message *message, void *userdata, sd_bus_error *error) { + DnssdService *s = ASSERT_PTR(userdata); + Manager *m; + Link *l; + int r; + + assert(message); + + m = s->manager; + + r = bus_verify_polkit_async(message, CAP_SYS_ADMIN, + "org.freedesktop.resolve1.unregister-service", + NULL, false, s->originator, + &m->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + HASHMAP_FOREACH(l, m->links) { + if (l->mdns_ipv4_scope) { + r = dns_scope_announce(l->mdns_ipv4_scope, true); + if (r < 0) + log_warning_errno(r, "Failed to send goodbye messages in IPv4 scope: %m"); + + dns_zone_remove_rr(&l->mdns_ipv4_scope->zone, s->ptr_rr); + dns_zone_remove_rr(&l->mdns_ipv4_scope->zone, s->srv_rr); + LIST_FOREACH(items, txt_data, s->txt_data_items) + dns_zone_remove_rr(&l->mdns_ipv4_scope->zone, txt_data->rr); + } + + if (l->mdns_ipv6_scope) { + r = dns_scope_announce(l->mdns_ipv6_scope, true); + if (r < 0) + log_warning_errno(r, "Failed to send goodbye messages in IPv6 scope: %m"); + + dns_zone_remove_rr(&l->mdns_ipv6_scope->zone, s->ptr_rr); + dns_zone_remove_rr(&l->mdns_ipv6_scope->zone, s->srv_rr); + LIST_FOREACH(items, txt_data, s->txt_data_items) + dns_zone_remove_rr(&l->mdns_ipv6_scope->zone, txt_data->rr); + } + } + + dnssd_service_free(s); + + manager_refresh_rrs(m); + + return 
sd_bus_reply_method_return(message, NULL); +} + +static int dnssd_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + _cleanup_free_ char *name = NULL; + Manager *m = ASSERT_PTR(userdata); + DnssdService *service; + int r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = sd_bus_path_decode(path, "/org/freedesktop/resolve1/dnssd", &name); + if (r <= 0) + return 0; + + service = hashmap_get(m->dnssd_services, name); + if (!service) + return 0; + + *found = service; + return 1; +} + +static int dnssd_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = ASSERT_PTR(userdata); + DnssdService *service; + unsigned c = 0; + int r; + + assert(bus); + assert(path); + assert(nodes); + + l = new0(char*, hashmap_size(m->dnssd_services) + 1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(service, m->dnssd_services) { + char *p; + + r = sd_bus_path_encode("/org/freedesktop/resolve1/dnssd", service->name, &p); + if (r < 0) + return r; + + l[c++] = p; + } + + l[c] = NULL; + *nodes = TAKE_PTR(l); + + return 1; +} + +static const sd_bus_vtable dnssd_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_METHOD("Unregister", NULL, NULL, bus_dnssd_method_unregister, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_SIGNAL("Conflicted", NULL, 0), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation dnssd_object = { + "/org/freedesktop/resolve1/dnssd", + "org.freedesktop.resolve1.DnssdService", + .fallback_vtables = BUS_FALLBACK_VTABLES({dnssd_vtable, dnssd_object_find}), + .node_enumerator = dnssd_node_enumerator, +}; diff --git a/src/resolve/resolved-dnssd-bus.h b/src/resolve/resolved-dnssd-bus.h new file mode 100644 index 0000000..f396e23 --- /dev/null +++ b/src/resolve/resolved-dnssd-bus.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include "sd-bus.h" + 
+#include "bus-object.h" + +extern const BusObjectImplementation dnssd_object; + +int bus_dnssd_method_unregister(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/resolve/resolved-dnssd-gperf.gperf b/src/resolve/resolved-dnssd-gperf.gperf new file mode 100644 index 0000000..f10eae3 --- /dev/null +++ b/src/resolve/resolved-dnssd-gperf.gperf @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#include +#include "conf-parser.h" +#include "resolved-conf.h" +#include "resolved-dnssd.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name resolved_dnssd_gperf_hash +%define lookup-function-name resolved_dnssd_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Service.Name, config_parse_dnssd_service_name, 0, 0 +Service.Type, config_parse_dnssd_service_type, 0, 0 +Service.Port, config_parse_ip_port, 0, offsetof(DnssdService, port) +Service.Priority, config_parse_uint16, 0, offsetof(DnssdService, priority) +Service.Weight, config_parse_uint16, 0, offsetof(DnssdService, weight) +Service.TxtText, config_parse_dnssd_txt, DNS_TXT_ITEM_TEXT, 0 +Service.TxtData, config_parse_dnssd_txt, DNS_TXT_ITEM_DATA, 0 diff --git a/src/resolve/resolved-dnssd.c b/src/resolve/resolved-dnssd.c new file mode 100644 index 0000000..994771e --- /dev/null +++ b/src/resolve/resolved-dnssd.c @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-files.h" +#include "conf-parser.h" +#include "constants.h" +#include "resolved-dnssd.h" +#include "resolved-dns-rr.h" +#include "resolved-manager.h" +#include "resolved-conf.h" +#include "specifier.h" +#include "strv.h" + +#define DNSSD_SERVICE_DIRS ((const char* const*) CONF_PATHS_STRV("systemd/dnssd")) + +DnssdTxtData *dnssd_txtdata_free(DnssdTxtData *txt_data) { + if (!txt_data) + return NULL; + + dns_resource_record_unref(txt_data->rr); + 
dns_txt_item_free_all(txt_data->txts); + + return mfree(txt_data); +} + +DnssdTxtData *dnssd_txtdata_free_all(DnssdTxtData *txt_data) { + DnssdTxtData *next; + + if (!txt_data) + return NULL; + + next = txt_data->items_next; + + dnssd_txtdata_free(txt_data); + + return dnssd_txtdata_free_all(next); +} + +DnssdService *dnssd_service_free(DnssdService *service) { + if (!service) + return NULL; + + if (service->manager) + hashmap_remove(service->manager->dnssd_services, service->name); + + dns_resource_record_unref(service->ptr_rr); + dns_resource_record_unref(service->srv_rr); + + dnssd_txtdata_free_all(service->txt_data_items); + + free(service->filename); + free(service->name); + free(service->type); + free(service->name_template); + + return mfree(service); +} + +static int dnssd_service_load(Manager *manager, const char *filename) { + _cleanup_(dnssd_service_freep) DnssdService *service = NULL; + _cleanup_(dnssd_txtdata_freep) DnssdTxtData *txt_data = NULL; + char *d; + const char *dropin_dirname; + int r; + + assert(manager); + assert(filename); + + service = new0(DnssdService, 1); + if (!service) + return log_oom(); + + service->filename = strdup(filename); + if (!service->filename) + return log_oom(); + + service->name = strdup(basename(filename)); + if (!service->name) + return log_oom(); + + d = endswith(service->name, ".dnssd"); + if (!d) + return -EINVAL; + + assert(streq(d, ".dnssd")); + + *d = '\0'; + + dropin_dirname = strjoina(service->name, ".dnssd.d"); + + r = config_parse_many( + STRV_MAKE_CONST(filename), DNSSD_SERVICE_DIRS, dropin_dirname, /* root = */ NULL, + "Service\0", + config_item_perf_lookup, resolved_dnssd_gperf_lookup, + CONFIG_PARSE_WARN, + service, + NULL, + NULL); + if (r < 0) + return r; + + if (!service->name_template) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s doesn't define service instance name", + service->name); + + if (!service->type) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s doesn't define service 
type", + service->name); + + if (!service->txt_data_items) { + txt_data = new0(DnssdTxtData, 1); + if (!txt_data) + return log_oom(); + + r = dns_txt_item_new_empty(&txt_data->txts); + if (r < 0) + return r; + + LIST_PREPEND(items, service->txt_data_items, txt_data); + TAKE_PTR(txt_data); + } + + r = hashmap_ensure_put(&manager->dnssd_services, &string_hash_ops, service->name, service); + if (r < 0) + return r; + + service->manager = manager; + + r = dnssd_update_rrs(service); + if (r < 0) + return r; + + TAKE_PTR(service); + + return 0; +} + +static int specifier_dnssd_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Manager *m = ASSERT_PTR(userdata); + char *n; + + assert(m->llmnr_hostname); + + n = strdup(m->llmnr_hostname); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int dnssd_render_instance_name(Manager *m, DnssdService *s, char **ret) { + static const Specifier specifier_table[] = { + { 'a', specifier_architecture, NULL }, + { 'b', specifier_boot_id, NULL }, + { 'B', specifier_os_build_id, NULL }, + { 'H', specifier_dnssd_hostname, NULL }, + { 'm', specifier_machine_id, NULL }, + { 'o', specifier_os_id, NULL }, + { 'v', specifier_kernel_release, NULL }, + { 'w', specifier_os_version_id, NULL }, + { 'W', specifier_os_variant_id, NULL }, + {} + }; + _cleanup_free_ char *name = NULL; + int r; + + assert(m); + assert(s); + assert(s->name_template); + + r = specifier_printf(s->name_template, DNS_LABEL_MAX, specifier_table, NULL, m, &name); + if (r < 0) + return log_debug_errno(r, "Failed to replace specifiers: %m"); + + if (!dns_service_name_is_valid(name)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Service instance name '%s' is invalid.", + name); + + if (ret) + *ret = TAKE_PTR(name); + + return 0; +} + +int dnssd_load(Manager *manager) { + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(manager); + + if (manager->mdns_support != RESOLVE_SUPPORT_YES) + return 0; + + r 
= conf_files_list_strv(&files, ".dnssd", NULL, 0, DNSSD_SERVICE_DIRS); + if (r < 0) + return log_error_errno(r, "Failed to enumerate .dnssd files: %m"); + + STRV_FOREACH_BACKWARDS(f, files) { + r = dnssd_service_load(manager, *f); + if (r < 0) + log_warning_errno(r, "Failed to load '%s': %m", *f); + } + + return 0; +} + +int dnssd_update_rrs(DnssdService *s) { + _cleanup_free_ char *n = NULL, *service_name = NULL, *full_name = NULL; + int r; + + assert(s); + assert(s->txt_data_items); + assert(s->manager); + + s->ptr_rr = dns_resource_record_unref(s->ptr_rr); + s->srv_rr = dns_resource_record_unref(s->srv_rr); + LIST_FOREACH(items, txt_data, s->txt_data_items) + txt_data->rr = dns_resource_record_unref(txt_data->rr); + + r = dnssd_render_instance_name(s->manager, s, &n); + if (r < 0) + return r; + + r = dns_name_concat(s->type, "local", 0, &service_name); + if (r < 0) + return r; + r = dns_name_concat(n, service_name, 0, &full_name); + if (r < 0) + return r; + + LIST_FOREACH(items, txt_data, s->txt_data_items) { + txt_data->rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_TXT, + full_name); + if (!txt_data->rr) + goto oom; + + txt_data->rr->ttl = MDNS_DEFAULT_TTL; + txt_data->rr->txt.items = dns_txt_item_copy(txt_data->txts); + if (!txt_data->rr->txt.items) + goto oom; + } + + s->ptr_rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_PTR, + service_name); + if (!s->ptr_rr) + goto oom; + + s->ptr_rr->ttl = MDNS_DEFAULT_TTL; + s->ptr_rr->ptr.name = strdup(full_name); + if (!s->ptr_rr->ptr.name) + goto oom; + + s->srv_rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_SRV, + full_name); + if (!s->srv_rr) + goto oom; + + s->srv_rr->ttl = MDNS_DEFAULT_TTL; + s->srv_rr->srv.priority = s->priority; + s->srv_rr->srv.weight = s->weight; + s->srv_rr->srv.port = s->port; + s->srv_rr->srv.name = strdup(s->manager->mdns_hostname); + if (!s->srv_rr->srv.name) + goto oom; + + return 0; + +oom: + LIST_FOREACH(items, txt_data, s->txt_data_items) + txt_data->rr = 
dns_resource_record_unref(txt_data->rr); + s->ptr_rr = dns_resource_record_unref(s->ptr_rr); + s->srv_rr = dns_resource_record_unref(s->srv_rr); + return -ENOMEM; +} + +int dnssd_txt_item_new_from_string(const char *key, const char *value, DnsTxtItem **ret_item) { + size_t length; + DnsTxtItem *i; + + length = strlen(key); + + if (!isempty(value)) + length += strlen(value) + 1; /* length of value plus '=' */ + + i = malloc0(offsetof(DnsTxtItem, data) + length + 1); /* for safety reasons we add an extra NUL byte */ + if (!i) + return -ENOMEM; + + memcpy(i->data, key, strlen(key)); + if (!isempty(value)) { + memcpy(i->data + strlen(key), "=", 1); + memcpy(i->data + strlen(key) + 1, value, strlen(value)); + } + i->length = length; + + *ret_item = TAKE_PTR(i); + + return 0; +} + +int dnssd_txt_item_new_from_data(const char *key, const void *data, const size_t size, DnsTxtItem **ret_item) { + size_t length; + DnsTxtItem *i; + + length = strlen(key); + + if (size > 0) + length += size + 1; /* size of date plus '=' */ + + i = malloc0(offsetof(DnsTxtItem, data) + length + 1); /* for safety reasons we add an extra NUL byte */ + if (!i) + return -ENOMEM; + + memcpy(i->data, key, strlen(key)); + if (size > 0) { + memcpy(i->data + strlen(key), "=", 1); + memcpy(i->data + strlen(key) + 1, data, size); + } + i->length = length; + + *ret_item = TAKE_PTR(i); + + return 0; +} + +int dnssd_signal_conflict(Manager *manager, const char *name) { + DnssdService *s; + int r; + + if (sd_bus_is_ready(manager->bus) <= 0) + return 0; + + HASHMAP_FOREACH(s, manager->dnssd_services) { + if (s->withdrawn) + continue; + + if (dns_name_equal(dns_resource_key_name(s->srv_rr->key), name)) { + _cleanup_free_ char *path = NULL; + + s->withdrawn = true; + + r = sd_bus_path_encode("/org/freedesktop/resolve1/dnssd", s->name, &path); + if (r < 0) + return log_error_errno(r, "Can't get D-BUS object path: %m"); + + r = sd_bus_emit_signal(manager->bus, + path, + "org.freedesktop.resolve1.DnssdService", + 
"Conflicted", + NULL); + if (r < 0) + return log_error_errno(r, "Cannot emit signal: %m"); + + break; + } + } + + return 0; +} diff --git a/src/resolve/resolved-dnssd.h b/src/resolve/resolved-dnssd.h new file mode 100644 index 0000000..e978a0d --- /dev/null +++ b/src/resolve/resolved-dnssd.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include "list.h" + +typedef struct DnssdService DnssdService; +typedef struct DnssdTxtData DnssdTxtData; + +typedef struct Manager Manager; +typedef struct DnsResourceRecord DnsResourceRecord; +typedef struct DnsTxtItem DnsTxtItem; + +enum { + DNS_TXT_ITEM_TEXT, + DNS_TXT_ITEM_DATA, +}; + +struct DnssdTxtData { + DnsResourceRecord *rr; + + LIST_HEAD(DnsTxtItem, txts); + + LIST_FIELDS(DnssdTxtData, items); +}; + +struct DnssdService { + char *filename; + char *name; + char *name_template; + char *type; + uint16_t port; + uint16_t priority; + uint16_t weight; + + DnsResourceRecord *ptr_rr; + DnsResourceRecord *srv_rr; + + /* Section 6.8 of RFC 6763 allows having service + * instances with multiple TXT resource records. 
*/ + LIST_HEAD(DnssdTxtData, txt_data_items); + + Manager *manager; + + bool withdrawn:1; + uid_t originator; +}; + +DnssdService *dnssd_service_free(DnssdService *service); +DnssdTxtData *dnssd_txtdata_free(DnssdTxtData *txt_data); +DnssdTxtData *dnssd_txtdata_free_all(DnssdTxtData *txt_data); + +DEFINE_TRIVIAL_CLEANUP_FUNC(DnssdService*, dnssd_service_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(DnssdTxtData*, dnssd_txtdata_free); + +int dnssd_render_instance_name(Manager *m, DnssdService *s, char **ret); +int dnssd_load(Manager *manager); +int dnssd_txt_item_new_from_string(const char *key, const char *value, DnsTxtItem **ret_item); +int dnssd_txt_item_new_from_data(const char *key, const void *value, const size_t size, DnsTxtItem **ret_item); +int dnssd_update_rrs(DnssdService *s); +int dnssd_signal_conflict(Manager *manager, const char *name); diff --git a/src/resolve/resolved-dnstls-gnutls.c b/src/resolve/resolved-dnstls-gnutls.c new file mode 100644 index 0000000..6ac026e --- /dev/null +++ b/src/resolve/resolved-dnstls-gnutls.c @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if !ENABLE_DNS_OVER_TLS || !DNS_OVER_TLS_USE_GNUTLS +#error This source file requires DNS-over-TLS to be enabled and GnuTLS to be available. 
+#endif + +#include + +#include "iovec-util.h" +#include "resolved-dns-stream.h" +#include "resolved-dnstls.h" +#include "resolved-manager.h" + +#define TLS_PROTOCOL_PRIORITY "NORMAL:-VERS-ALL:+VERS-TLS1.3:+VERS-TLS1.2" +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(gnutls_session_t, gnutls_deinit, NULL); + +static ssize_t dnstls_stream_vec_push(gnutls_transport_ptr_t p, const giovec_t *iov, int iovcnt) { + int r; + + assert(p); + + r = dns_stream_writev((DnsStream*) p, (const struct iovec*) iov, iovcnt, DNS_STREAM_WRITE_TLS_DATA); + if (r < 0) { + errno = -r; + return -1; + } + + return r; +} + +int dnstls_stream_connect_tls(DnsStream *stream, DnsServer *server) { + _cleanup_(gnutls_deinitp) gnutls_session_t gs = NULL; + int r; + + assert(stream); + assert(server); + + r = gnutls_init(&gs, GNUTLS_CLIENT | GNUTLS_ENABLE_FALSE_START | GNUTLS_NONBLOCK); + if (r < 0) + return r; + + /* As DNS-over-TLS is a recent protocol, older TLS versions can be disabled */ + r = gnutls_priority_set_direct(gs, TLS_PROTOCOL_PRIORITY, NULL); + if (r < 0) + return r; + + r = gnutls_credentials_set(gs, GNUTLS_CRD_CERTIFICATE, stream->manager->dnstls_data.cert_cred); + if (r < 0) + return r; + + if (server->dnstls_data.session_data.size > 0) { + gnutls_session_set_data(gs, server->dnstls_data.session_data.data, server->dnstls_data.session_data.size); + + // Clear old session ticket + gnutls_free(server->dnstls_data.session_data.data); + server->dnstls_data.session_data.data = NULL; + server->dnstls_data.session_data.size = 0; + } + + if (server->manager->dns_over_tls_mode == DNS_OVER_TLS_YES) { + if (server->server_name) + gnutls_session_set_verify_cert(gs, server->server_name, 0); + else { + stream->dnstls_data.validation.type = GNUTLS_DT_IP_ADDRESS; + if (server->family == AF_INET) { + stream->dnstls_data.validation.data = (unsigned char*) &server->address.in.s_addr; + stream->dnstls_data.validation.size = 4; + } else { + stream->dnstls_data.validation.data = server->address.in6.s6_addr; + 
stream->dnstls_data.validation.size = 16; + } + gnutls_session_set_verify_cert2(gs, &stream->dnstls_data.validation, 1, 0); + } + } + + if (server->server_name) { + r = gnutls_server_name_set(gs, GNUTLS_NAME_DNS, server->server_name, strlen(server->server_name)); + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set server name: %s", gnutls_strerror(r)); + } + + gnutls_handshake_set_timeout(gs, GNUTLS_DEFAULT_HANDSHAKE_TIMEOUT); + + gnutls_transport_set_ptr2(gs, (gnutls_transport_ptr_t) (long) stream->fd, stream); + gnutls_transport_set_vec_push_function(gs, &dnstls_stream_vec_push); + + stream->encrypted = true; + stream->dnstls_data.handshake = gnutls_handshake(gs); + if (stream->dnstls_data.handshake < 0 && gnutls_error_is_fatal(stream->dnstls_data.handshake)) + return -ECONNREFUSED; + + stream->dnstls_data.session = TAKE_PTR(gs); + + return 0; +} + +void dnstls_stream_free(DnsStream *stream) { + assert(stream); + assert(stream->encrypted); + + if (stream->dnstls_data.session) + gnutls_deinit(stream->dnstls_data.session); +} + +int dnstls_stream_on_io(DnsStream *stream, uint32_t revents) { + int r; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.session); + + if (stream->dnstls_data.shutdown) { + r = gnutls_bye(stream->dnstls_data.session, GNUTLS_SHUT_RDWR); + if (r == GNUTLS_E_AGAIN) { + stream->dnstls_events = gnutls_record_get_direction(stream->dnstls_data.session) == 1 ? EPOLLOUT : EPOLLIN; + return -EAGAIN; + } else if (r < 0) + log_debug("Failed to invoke gnutls_bye: %s", gnutls_strerror(r)); + + stream->dnstls_events = 0; + stream->dnstls_data.shutdown = false; + dns_stream_unref(stream); + return DNSTLS_STREAM_CLOSED; + } else if (stream->dnstls_data.handshake < 0) { + stream->dnstls_data.handshake = gnutls_handshake(stream->dnstls_data.session); + if (stream->dnstls_data.handshake == GNUTLS_E_AGAIN) { + stream->dnstls_events = gnutls_record_get_direction(stream->dnstls_data.session) == 1 ? 
EPOLLOUT : EPOLLIN; + return -EAGAIN; + } else if (stream->dnstls_data.handshake < 0) { + log_debug("Failed to invoke gnutls_handshake: %s", gnutls_strerror(stream->dnstls_data.handshake)); + if (gnutls_error_is_fatal(stream->dnstls_data.handshake)) + return -ECONNREFUSED; + } + + stream->dnstls_events = 0; + } + + return 0; +} + +int dnstls_stream_shutdown(DnsStream *stream, int error) { + int r; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.session); + + /* Store TLS Ticket for faster successive TLS handshakes */ + if (stream->server && stream->server->dnstls_data.session_data.size == 0 && stream->dnstls_data.handshake == GNUTLS_E_SUCCESS) + gnutls_session_get_data2(stream->dnstls_data.session, &stream->server->dnstls_data.session_data); + + if (IN_SET(error, ETIMEDOUT, 0)) { + r = gnutls_bye(stream->dnstls_data.session, GNUTLS_SHUT_RDWR); + if (r == GNUTLS_E_AGAIN) { + if (!stream->dnstls_data.shutdown) { + stream->dnstls_data.shutdown = true; + dns_stream_ref(stream); + return -EAGAIN; + } + } else if (r < 0) + log_debug("Failed to invoke gnutls_bye: %s", gnutls_strerror(r)); + } + + return 0; +} + +ssize_t dnstls_stream_writev(DnsStream *stream, const struct iovec *iov, size_t iovcnt) { + ssize_t ss; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.session); + assert(iov); + assert(iovec_total_size(iov, iovcnt) > 0); + + gnutls_record_cork(stream->dnstls_data.session); + + for (size_t i = 0; i < iovcnt; i++) { + ss = gnutls_record_send( + stream->dnstls_data.session, + iov[i].iov_base, iov[i].iov_len); + if (ss < 0) + break; + } + + ss = gnutls_record_uncork(stream->dnstls_data.session, 0); + if (ss < 0) + switch (ss) { + case GNUTLS_E_INTERRUPTED: + return -EINTR; + case GNUTLS_E_AGAIN: + return -EAGAIN; + default: + return log_debug_errno(SYNTHETIC_ERRNO(EPIPE), + "Failed to invoke gnutls_record_send: %s", + gnutls_strerror(ss)); + } + + return ss; +} + +ssize_t dnstls_stream_read(DnsStream 
*stream, void *buf, size_t count) { + ssize_t ss; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.session); + assert(buf); + + ss = gnutls_record_recv(stream->dnstls_data.session, buf, count); + if (ss < 0) + switch (ss) { + case GNUTLS_E_INTERRUPTED: + return -EINTR; + case GNUTLS_E_AGAIN: + return -EAGAIN; + default: + return log_debug_errno(SYNTHETIC_ERRNO(EPIPE), + "Failed to invoke gnutls_record_recv: %s", + gnutls_strerror(ss)); + } + + return ss; +} + +void dnstls_server_free(DnsServer *server) { + assert(server); + + if (server->dnstls_data.session_data.data) + gnutls_free(server->dnstls_data.session_data.data); +} + +int dnstls_manager_init(Manager *manager) { + int r; + assert(manager); + + r = gnutls_certificate_allocate_credentials(&manager->dnstls_data.cert_cred); + if (r < 0) + return -ENOMEM; + + r = gnutls_certificate_set_x509_system_trust(manager->dnstls_data.cert_cred); + if (r < 0) + log_warning("Failed to load system trust store: %s", gnutls_strerror(r)); + + return 0; +} + +void dnstls_manager_free(Manager *manager) { + assert(manager); + + if (manager->dnstls_data.cert_cred) + gnutls_certificate_free_credentials(manager->dnstls_data.cert_cred); +} diff --git a/src/resolve/resolved-dnstls-gnutls.h b/src/resolve/resolved-dnstls-gnutls.h new file mode 100644 index 0000000..dc1255f --- /dev/null +++ b/src/resolve/resolved-dnstls-gnutls.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if !ENABLE_DNS_OVER_TLS || !DNS_OVER_TLS_USE_GNUTLS +#error This source file requires DNS-over-TLS to be enabled and GnuTLS to be available. 
+#endif + +#include +#include + +struct DnsTlsManagerData { + gnutls_certificate_credentials_t cert_cred; +}; + +struct DnsTlsServerData { + gnutls_datum_t session_data; +}; + +struct DnsTlsStreamData { + gnutls_session_t session; + gnutls_typed_vdata_st validation; + int handshake; + bool shutdown; +}; diff --git a/src/resolve/resolved-dnstls-openssl.c b/src/resolve/resolved-dnstls-openssl.c new file mode 100644 index 0000000..fbcee7f --- /dev/null +++ b/src/resolve/resolved-dnstls-openssl.c @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if !ENABLE_DNS_OVER_TLS || !DNS_OVER_TLS_USE_OPENSSL +#error This source file requires DNS-over-TLS to be enabled and OpenSSL to be available. +#endif + +#include +#include +#include + +#include "io-util.h" +#include "openssl-util.h" +#include "resolved-dns-stream.h" +#include "resolved-dnstls.h" +#include "resolved-manager.h" + +static char *dnstls_error_string(int ssl_error, char *buf, size_t count) { + assert(buf || count == 0); + if (ssl_error == SSL_ERROR_SSL) + ERR_error_string_n(ERR_get_error(), buf, count); + else + snprintf(buf, count, "SSL_get_error()=%d", ssl_error); + return buf; +} + +#define DNSTLS_ERROR_BUFSIZE 256 +#define DNSTLS_ERROR_STRING(error) \ + dnstls_error_string((error), (char[DNSTLS_ERROR_BUFSIZE]){}, DNSTLS_ERROR_BUFSIZE) + +static int dnstls_flush_write_buffer(DnsStream *stream) { + ssize_t ss; + + assert(stream); + assert(stream->encrypted); + + if (stream->dnstls_data.buffer_offset < stream->dnstls_data.write_buffer->length) { + assert(stream->dnstls_data.write_buffer->data); + + struct iovec iov[1]; + iov[0] = IOVEC_MAKE(stream->dnstls_data.write_buffer->data + stream->dnstls_data.buffer_offset, + stream->dnstls_data.write_buffer->length - stream->dnstls_data.buffer_offset); + ss = dns_stream_writev(stream, iov, 1, DNS_STREAM_WRITE_TLS_DATA); + if (ss < 0) { + if (ss == -EAGAIN) + stream->dnstls_events |= EPOLLOUT; + + return ss; + } else { + 
stream->dnstls_data.buffer_offset += ss; + + if (stream->dnstls_data.buffer_offset < stream->dnstls_data.write_buffer->length) { + stream->dnstls_events |= EPOLLOUT; + return -EAGAIN; + } else { + BIO_reset(SSL_get_wbio(stream->dnstls_data.ssl)); + stream->dnstls_data.buffer_offset = 0; + } + } + } + + return 0; +} + +int dnstls_stream_connect_tls(DnsStream *stream, DnsServer *server) { + _cleanup_(BIO_freep) BIO *rb = NULL, *wb = NULL; + _cleanup_(SSL_freep) SSL *s = NULL; + int error, r; + + assert(stream); + assert(stream->manager); + assert(server); + + rb = BIO_new_socket(stream->fd, 0); + if (!rb) + return -ENOMEM; + + wb = BIO_new(BIO_s_mem()); + if (!wb) + return -ENOMEM; + + BIO_get_mem_ptr(wb, &stream->dnstls_data.write_buffer); + stream->dnstls_data.buffer_offset = 0; + + s = SSL_new(stream->manager->dnstls_data.ctx); + if (!s) + return -ENOMEM; + + SSL_set_connect_state(s); + r = SSL_set_session(s, server->dnstls_data.session); + if (r == 0) + return -EIO; + SSL_set_bio(s, TAKE_PTR(rb), TAKE_PTR(wb)); + + if (server->manager->dns_over_tls_mode == DNS_OVER_TLS_YES) { + X509_VERIFY_PARAM *v; + + SSL_set_verify(s, SSL_VERIFY_PEER, NULL); + v = SSL_get0_param(s); + if (server->server_name) { + X509_VERIFY_PARAM_set_hostflags(v, X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS); + if (X509_VERIFY_PARAM_set1_host(v, server->server_name, 0) == 0) + return -ECONNREFUSED; + } else { + const unsigned char *ip; + ip = server->family == AF_INET ? 
(const unsigned char*) &server->address.in.s_addr : server->address.in6.s6_addr; + if (X509_VERIFY_PARAM_set1_ip(v, ip, FAMILY_ADDRESS_SIZE(server->family)) == 0) + return -ECONNREFUSED; + } + } + + if (server->server_name) { + r = SSL_set_tlsext_host_name(s, server->server_name); + if (r <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to set server name: %s", DNSTLS_ERROR_STRING(SSL_ERROR_SSL)); + } + + ERR_clear_error(); + stream->dnstls_data.handshake = SSL_do_handshake(s); + if (stream->dnstls_data.handshake <= 0) { + error = SSL_get_error(s, stream->dnstls_data.handshake); + if (!IN_SET(error, SSL_ERROR_WANT_READ, SSL_ERROR_WANT_WRITE)) + return log_debug_errno(SYNTHETIC_ERRNO(ECONNREFUSED), + "Failed to invoke SSL_do_handshake: %s", DNSTLS_ERROR_STRING(error)); + } + + stream->encrypted = true; + stream->dnstls_data.ssl = TAKE_PTR(s); + + r = dnstls_flush_write_buffer(stream); + if (r < 0 && r != -EAGAIN) { + SSL_free(TAKE_PTR(stream->dnstls_data.ssl)); + return r; + } + + return 0; +} + +void dnstls_stream_free(DnsStream *stream) { + assert(stream); + assert(stream->encrypted); + + if (stream->dnstls_data.ssl) + SSL_free(stream->dnstls_data.ssl); +} + +int dnstls_stream_on_io(DnsStream *stream, uint32_t revents) { + int error, r; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.ssl); + + /* Flush write buffer when requested by OpenSSL */ + if ((revents & EPOLLOUT) && (stream->dnstls_events & EPOLLOUT)) { + r = dnstls_flush_write_buffer(stream); + if (r < 0) + return r; + } + + if (stream->dnstls_data.shutdown) { + ERR_clear_error(); + r = SSL_shutdown(stream->dnstls_data.ssl); + if (r == 0) { + stream->dnstls_events = 0; + + r = dnstls_flush_write_buffer(stream); + if (r < 0) + return r; + + return -EAGAIN; + } else if (r < 0) { + error = SSL_get_error(stream->dnstls_data.ssl, r); + if (IN_SET(error, SSL_ERROR_WANT_READ, SSL_ERROR_WANT_WRITE)) { + stream->dnstls_events = error == SSL_ERROR_WANT_READ ? 
/* Prepares an encrypted stream for closing.
 *
 * If the stream has a server, the current TLS session is stored on it (replacing any older
 * one); dnstls_stream_connect_tls() hands that session to SSL_set_session() on the next connect.
 *
 * Only when 'error' is ETIMEDOUT is a TLS shutdown attempted. Returns 0 when no further work
 * is needed, -EAGAIN when the shutdown has to be continued from dnstls_stream_on_io(), or
 * another negative errno from flushing the write buffer. */
int dnstls_stream_shutdown(DnsStream *stream, int error) {
        int ssl_error, r;
        SSL_SESSION *s;

        assert(stream);
        assert(stream->encrypted);
        assert(stream->dnstls_data.ssl);

        if (stream->server) {
                s = SSL_get1_session(stream->dnstls_data.ssl);
                if (s) {
                        /* Drop the previously cached session before storing the new one. */
                        if (stream->server->dnstls_data.session)
                                SSL_SESSION_free(stream->server->dnstls_data.session);

                        stream->server->dnstls_data.session = s;
                }
        }

        if (error == ETIMEDOUT) {
                ERR_clear_error();
                r = SSL_shutdown(stream->dnstls_data.ssl);
                if (r == 0) {
                        /* Shutdown not finished yet. The extra reference keeps the stream alive;
                         * dnstls_stream_on_io() drops it when it reports DNSTLS_STREAM_CLOSED. */
                        if (!stream->dnstls_data.shutdown) {
                                stream->dnstls_data.shutdown = true;
                                dns_stream_ref(stream);
                        }

                        stream->dnstls_events = 0;

                        r = dnstls_flush_write_buffer(stream);
                        if (r < 0)
                                return r;

                        return -EAGAIN;
                } else if (r < 0) {
                        ssl_error = SSL_get_error(stream->dnstls_data.ssl, r);
                        if (IN_SET(ssl_error, SSL_ERROR_WANT_READ, SSL_ERROR_WANT_WRITE)) {
                                /* OpenSSL needs more I/O first: wait for the direction it asked for. */
                                stream->dnstls_events = ssl_error == SSL_ERROR_WANT_READ ? EPOLLIN : EPOLLOUT;
                                r = dnstls_flush_write_buffer(stream);
                                if (r < 0 && r != -EAGAIN)
                                        return r;

                                if (!stream->dnstls_data.shutdown) {
                                        stream->dnstls_data.shutdown = true;
                                        dns_stream_ref(stream);
                                }
                                return -EAGAIN;
                        } else if (ssl_error == SSL_ERROR_SYSCALL) {
                                /* Other shutdown failures are only logged; closing proceeds. */
                                if (errno > 0)
                                        log_debug_errno(errno, "Failed to invoke SSL_shutdown, ignoring: %m");
                        } else
                                log_debug("Failed to invoke SSL_shutdown, ignoring: %s", DNSTLS_ERROR_STRING(ssl_error));
                }

                /* Reached when SSL_shutdown() returned a positive value or failed with an ignored error. */
                stream->dnstls_events = 0;
                r = dnstls_flush_write_buffer(stream);
                if (r < 0)
                        return r;
        }

        return 0;
}
EPOLLIN : EPOLLOUT; + ss = -EAGAIN; + } else if (error == SSL_ERROR_ZERO_RETURN) { + stream->dnstls_events = 0; + ss = 0; + } else { + log_debug("Failed to invoke SSL_write: %s", DNSTLS_ERROR_STRING(error)); + stream->dnstls_events = 0; + ss = -EPIPE; + } + } else + stream->dnstls_events = 0; + + r = dnstls_flush_write_buffer(stream); + if (r < 0) + return r; + + return ss; +} + +ssize_t dnstls_stream_writev(DnsStream *stream, const struct iovec *iov, size_t iovcnt) { + _cleanup_free_ char *buf = NULL; + size_t count; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.ssl); + assert(iov); + assert(iovec_total_size(iov, iovcnt) > 0); + + if (iovcnt == 1) + return dnstls_stream_write(stream, iov[0].iov_base, iov[0].iov_len); + + /* As of now, OpenSSL cannot accumulate multiple writes, so join into a + single buffer. Suboptimal, but better than multiple SSL_write calls. */ + count = iovec_total_size(iov, iovcnt); + buf = new(char, count); + for (size_t i = 0, pos = 0; i < iovcnt; pos += iov[i].iov_len, i++) + memcpy(buf + pos, iov[i].iov_base, iov[i].iov_len); + + return dnstls_stream_write(stream, buf, count); +} + +ssize_t dnstls_stream_read(DnsStream *stream, void *buf, size_t count) { + int error, r; + ssize_t ss; + + assert(stream); + assert(stream->encrypted); + assert(stream->dnstls_data.ssl); + assert(buf); + + ERR_clear_error(); + ss = r = SSL_read(stream->dnstls_data.ssl, buf, count); + if (r <= 0) { + error = SSL_get_error(stream->dnstls_data.ssl, r); + if (IN_SET(error, SSL_ERROR_WANT_READ, SSL_ERROR_WANT_WRITE)) { + /* If we receive SSL_ERROR_WANT_READ here, there are two possible scenarios: + * OpenSSL needs to renegotiate (so we want to get an EPOLLIN event), or + * There is no more application data is available, so we can just return + And apparently there's no nice way to distinguish between the two. + To handle this, never set EPOLLIN and just continue as usual. 
+ If OpenSSL really wants to read due to renegotiation, it will tell us + again on SSL_write (at which point we will request EPOLLIN force a read); + or we will just eventually read data anyway while we wait for a packet */ + stream->dnstls_events = error == SSL_ERROR_WANT_READ ? 0 : EPOLLOUT; + ss = -EAGAIN; + } else if (error == SSL_ERROR_ZERO_RETURN) { + stream->dnstls_events = 0; + ss = 0; + } else { + log_debug("Failed to invoke SSL_read: %s", DNSTLS_ERROR_STRING(error)); + stream->dnstls_events = 0; + ss = -EPIPE; + } + } else + stream->dnstls_events = 0; + + /* Flush the write buffer in case of renegotiation */ + r = dnstls_flush_write_buffer(stream); + if (r < 0) + return r; + + return ss; +} + +void dnstls_server_free(DnsServer *server) { + assert(server); + + if (server->dnstls_data.session) + SSL_SESSION_free(server->dnstls_data.session); /* release the session kept in this server's TLS data */ +} + +int dnstls_manager_init(Manager *manager) { + int r; + + assert(manager); + + ERR_load_crypto_strings(); + SSL_load_error_strings(); + + manager->dnstls_data.ctx = SSL_CTX_new(TLS_client_method()); /* context stored on the manager; released by dnstls_manager_free() */ + if (!manager->dnstls_data.ctx) + return -ENOMEM; + + r = SSL_CTX_set_min_proto_version(manager->dnstls_data.ctx, TLS1_2_VERSION); /* refuse anything older than TLS 1.2 */ + if (r == 0) + return -EIO; + + (void) SSL_CTX_set_options(manager->dnstls_data.ctx, SSL_OP_NO_COMPRESSION); /* result deliberately ignored */ + + r = SSL_CTX_set_default_verify_paths(manager->dnstls_data.ctx); + if (r == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EIO), + "Failed to load system trust store: %s", + ERR_error_string(ERR_get_error(), NULL)); + + return 0; +} + +void dnstls_manager_free(Manager *manager) { + assert(manager); + + if (manager->dnstls_data.ctx) + SSL_CTX_free(manager->dnstls_data.ctx); /* counterpart of SSL_CTX_new() in dnstls_manager_init() */ +} diff --git a/src/resolve/resolved-dnstls-openssl.h b/src/resolve/resolved-dnstls-openssl.h new file mode 100644 index 0000000..a73b77b --- /dev/null +++ b/src/resolve/resolved-dnstls-openssl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if !ENABLE_DNS_OVER_TLS || 
!DNS_OVER_TLS_USE_OPENSSL +#error This source file requires DNS-over-TLS to be enabled and OpenSSL to be available. +#endif + +#include +#include + +struct DnsTlsManagerData { + SSL_CTX *ctx; +}; + +struct DnsTlsServerData { + SSL_SESSION *session; +}; + +struct DnsTlsStreamData { + int handshake; + bool shutdown; + SSL *ssl; + BUF_MEM *write_buffer; + size_t buffer_offset; +}; diff --git a/src/resolve/resolved-dnstls.h b/src/resolve/resolved-dnstls.h new file mode 100644 index 0000000..cda97e0 --- /dev/null +++ b/src/resolve/resolved-dnstls.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if ENABLE_DNS_OVER_TLS + +#include +#include + +typedef struct DnsServer DnsServer; +typedef struct DnsStream DnsStream; +typedef struct DnsTlsManagerData DnsTlsManagerData; +typedef struct DnsTlsServerData DnsTlsServerData; +typedef struct DnsTlsStreamData DnsTlsStreamData; +typedef struct Manager Manager; + +#if DNS_OVER_TLS_USE_GNUTLS +#include "resolved-dnstls-gnutls.h" +#elif DNS_OVER_TLS_USE_OPENSSL +#include "resolved-dnstls-openssl.h" +#else +#error Unknown dependency for supporting DNS-over-TLS +#endif + +#define DNSTLS_STREAM_CLOSED 1 + +int dnstls_stream_connect_tls(DnsStream *stream, DnsServer *server); +void dnstls_stream_free(DnsStream *stream); +int dnstls_stream_on_io(DnsStream *stream, uint32_t revents); +int dnstls_stream_shutdown(DnsStream *stream, int error); +ssize_t dnstls_stream_writev(DnsStream *stream, const struct iovec *iov, size_t iovcnt); +ssize_t dnstls_stream_read(DnsStream *stream, void *buf, size_t count); + +void dnstls_server_free(DnsServer *server); + +int dnstls_manager_init(Manager *manager); +void dnstls_manager_free(Manager *manager); + +#endif /* ENABLE_DNS_OVER_TLS */ diff --git a/src/resolve/resolved-etc-hosts.c b/src/resolve/resolved-etc-hosts.c new file mode 100644 index 0000000..6af160a --- /dev/null +++ b/src/resolve/resolved-etc-hosts.c @@ -0,0 +1,586 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include +#include +#include + +#include "fd-util.h" +#include "fileio.h" +#include "hostname-util.h" +#include "resolved-dns-synthesize.h" +#include "resolved-etc-hosts.h" +#include "socket-netlink.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +/* Recheck /etc/hosts at most once every 2s */ +#define ETC_HOSTS_RECHECK_USEC (2*USEC_PER_SEC) + +static EtcHostsItemByAddress *etc_hosts_item_by_address_free(EtcHostsItemByAddress *item) { + if (!item) + return NULL; + + set_free(item->names); + return mfree(item); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EtcHostsItemByAddress*, etc_hosts_item_by_address_free); + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + by_address_hash_ops, + struct in_addr_data, + in_addr_data_hash_func, + in_addr_data_compare_func, + EtcHostsItemByAddress, + etc_hosts_item_by_address_free); + +static EtcHostsItemByName *etc_hosts_item_by_name_free(EtcHostsItemByName *item) { + if (!item) + return NULL; + + free(item->name); + set_free(item->addresses); + return mfree(item); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(EtcHostsItemByName*, etc_hosts_item_by_name_free); + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + by_name_hash_ops, + char, + dns_name_hash_func, + dns_name_compare_func, + EtcHostsItemByName, + etc_hosts_item_by_name_free); + +void etc_hosts_clear(EtcHosts *hosts) { + assert(hosts); + + hosts->by_address = hashmap_free(hosts->by_address); + hosts->by_name = hashmap_free(hosts->by_name); + hosts->no_address = set_free(hosts->no_address); +} + +void manager_etc_hosts_flush(Manager *m) { + etc_hosts_clear(&m->etc_hosts); + m->etc_hosts_stat = (struct stat) {}; +} + +static int parse_line(EtcHosts *hosts, unsigned nr, const char *line) { + _cleanup_free_ char *address_str = NULL; + struct in_addr_data address = {}; + bool found = false; + EtcHostsItemByAddress *item; + int r; + + assert(hosts); + assert(line); + + r = extract_first_word(&line, &address_str, NULL, 
EXTRACT_RELAX); + if (r < 0) + return log_error_errno(r, "/etc/hosts:%u: failed to extract address: %m", nr); + assert(r > 0); /* We already checked that the line is not empty, so it should contain *something* */ + + r = in_addr_ifindex_from_string_auto(address_str, &address.family, &address.address, NULL); + if (r < 0) { + log_warning_errno(r, "/etc/hosts:%u: address '%s' is invalid, ignoring: %m", nr, address_str); + return 0; + } + + r = in_addr_data_is_null(&address); + if (r < 0) { + log_warning_errno(r, "/etc/hosts:%u: address '%s' is invalid, ignoring: %m", nr, address_str); + return 0; + } + if (r > 0) + /* This is an 0.0.0.0 or :: item, which we assume means that we shall map the specified hostname to + * nothing. */ + item = NULL; + else { + /* If this is a normal address, then simply add entry mapping it to the specified names */ + + item = hashmap_get(hosts->by_address, &address); + if (!item) { + _cleanup_(etc_hosts_item_by_address_freep) EtcHostsItemByAddress *new_item = NULL; + + new_item = new(EtcHostsItemByAddress, 1); + if (!new_item) + return log_oom(); + + *new_item = (EtcHostsItemByAddress) { + .address = address, + }; + + r = hashmap_ensure_put(&hosts->by_address, &by_address_hash_ops, &new_item->address, new_item); + if (r < 0) + return log_oom(); + + item = TAKE_PTR(new_item); + } + } + + for (;;) { + _cleanup_free_ char *name = NULL; + EtcHostsItemByName *bn; + + r = extract_first_word(&line, &name, NULL, EXTRACT_RELAX); + if (r < 0) + return log_error_errno(r, "/etc/hosts:%u: couldn't extract hostname: %m", nr); + if (r == 0) + break; + + r = dns_name_is_valid_ldh(name); + if (r <= 0) { + if (r < 0) + log_warning_errno(r, "/etc/hosts:%u: Failed to check the validity of hostname \"%s\", ignoring: %m", nr, name); + else + log_warning("/etc/hosts:%u: hostname \"%s\" is not valid, ignoring.", nr, name); + continue; + } + + found = true; + + if (!item) { + /* Optimize the case where we don't need to store any addresses, by storing + * only the 
name in a dedicated Set instead of the hashmap */ + + r = set_ensure_consume(&hosts->no_address, &dns_name_hash_ops_free, TAKE_PTR(name)); + if (r < 0) + return log_oom(); + + continue; + } + + bn = hashmap_get(hosts->by_name, name); + if (!bn) { + _cleanup_(etc_hosts_item_by_name_freep) EtcHostsItemByName *new_item = NULL; + _cleanup_free_ char *name_copy = NULL; + + name_copy = strdup(name); + if (!name_copy) + return log_oom(); + + new_item = new(EtcHostsItemByName, 1); + if (!new_item) + return log_oom(); + + *new_item = (EtcHostsItemByName) { + .name = TAKE_PTR(name_copy), + }; + + r = hashmap_ensure_put(&hosts->by_name, &by_name_hash_ops, new_item->name, new_item); + if (r < 0) + return log_oom(); + + bn = TAKE_PTR(new_item); + } + + if (!set_contains(bn->addresses, &address)) { + _cleanup_free_ struct in_addr_data *address_copy = NULL; + + address_copy = newdup(struct in_addr_data, &address, 1); + if (!address_copy) + return log_oom(); + + r = set_ensure_consume(&bn->addresses, &in_addr_data_hash_ops_free, TAKE_PTR(address_copy)); + if (r < 0) + return log_oom(); + } + + r = set_ensure_put(&item->names, &dns_name_hash_ops_free, name); + if (r < 0) + return log_oom(); + if (r == 0) /* the name is already listed */ + continue; + /* + * Keep track of the first name listed for this address. + * This name will be used in responses as the canonical name. 
+ */ + if (!item->canonical_name) + item->canonical_name = name; + TAKE_PTR(name); + } + + if (!found) + log_warning("/etc/hosts:%u: line is missing any valid hostnames", nr); + + return 0; +} + +static void strip_localhost(EtcHosts *hosts) { + static const struct in_addr_data local_in_addrs[] = { + { + .family = AF_INET, +#if __BYTE_ORDER == __LITTLE_ENDIAN + /* We want constant expressions here, that's why we don't use htole32() here */ + .address.in.s_addr = UINT32_C(0x0100007F), +#else + .address.in.s_addr = UINT32_C(0x7F000001), +#endif + }, + { + .family = AF_INET6, + .address.in6 = IN6ADDR_LOOPBACK_INIT, + }, + }; + + assert(hosts); + + /* Removes the 'localhost' entry from what we loaded. But only if the mapping is exclusively between + * 127.0.0.1 and localhost (or aliases to that we recognize). If there's any other name assigned to + * it, we leave the entry in. + * + * This way our regular synthesizing can take over, but only if it would result in the exact same + * mappings. */ + + for (size_t j = 0; j < ELEMENTSOF(local_in_addrs); j++) { + bool all_localhost, all_local_address; + EtcHostsItemByAddress *item; + const char *name; + + item = hashmap_get(hosts->by_address, local_in_addrs + j); + if (!item) + continue; + + /* Check whether all hostnames the loopback address points to are localhost ones */ + all_localhost = true; + SET_FOREACH(name, item->names) + if (!is_localhost(name)) { + all_localhost = false; + break; + } + + if (!all_localhost) /* Not all names are localhost, hence keep the entries for this address. */ + continue; + + /* Now check if the names listed for this address actually all point back just to this + * address (or the other loopback address). If not, let's stay away from this too. */ + all_local_address = true; + SET_FOREACH(name, item->names) { + EtcHostsItemByName *n; + struct in_addr_data *a; + + n = hashmap_get(hosts->by_name, name); + if (!n) /* No reverse entry? 
Then almost certainly the entry already got deleted from + * the previous iteration of this loop, i.e. via the other protocol */ + break; + + /* Now check if the addresses of this item are all localhost addresses */ + SET_FOREACH(a, n->addresses) + if (!in_addr_is_localhost(a->family, &a->address)) { + all_local_address = false; + break; + } + + if (!all_local_address) + break; + } + + if (!all_local_address) + continue; + + SET_FOREACH(name, item->names) + etc_hosts_item_by_name_free(hashmap_remove(hosts->by_name, name)); + + assert_se(hashmap_remove(hosts->by_address, local_in_addrs + j) == item); + etc_hosts_item_by_address_free(item); + } +} + +int etc_hosts_parse(EtcHosts *hosts, FILE *f) { + _cleanup_(etc_hosts_clear) EtcHosts t = {}; + unsigned nr = 0; + int r; + + assert(hosts); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *l; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read /etc/hosts: %m"); + if (r == 0) + break; + + nr++; + + l = strchr(line, '#'); + if (l) + *l = '\0'; + + l = strstrip(line); + if (isempty(l)) + continue; + + r = parse_line(&t, nr, l); + if (r < 0) + return r; + } + + strip_localhost(&t); + + etc_hosts_clear(hosts); + *hosts = TAKE_STRUCT(t); + return 0; +} + +static int manager_etc_hosts_read(Manager *m) { + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + usec_t ts; + int r; + + assert_se(sd_event_now(m->event, CLOCK_BOOTTIME, &ts) >= 0); + + /* See if we checked /etc/hosts recently already */ + if (m->etc_hosts_last != USEC_INFINITY && m->etc_hosts_last + ETC_HOSTS_RECHECK_USEC > ts) + return 0; + + m->etc_hosts_last = ts; + + if (m->etc_hosts_stat.st_mode != 0) { + if (stat("/etc/hosts", &st) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to stat /etc/hosts: %m"); + + manager_etc_hosts_flush(m); + return 0; + } + + /* Did the mtime or ino/dev change? If not, there's no point in re-reading the file. 
*/ + if (stat_inode_unmodified(&m->etc_hosts_stat, &st)) + return 0; + } + + f = fopen("/etc/hosts", "re"); + if (!f) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open /etc/hosts: %m"); + + manager_etc_hosts_flush(m); + return 0; + } + + /* Take the timestamp at the beginning of processing, so that any changes made later are read on the next + * invocation */ + r = fstat(fileno(f), &st); + if (r < 0) + return log_error_errno(errno, "Failed to fstat() /etc/hosts: %m"); + + r = etc_hosts_parse(&m->etc_hosts, f); + if (r < 0) + return r; + + m->etc_hosts_stat = st; + m->etc_hosts_last = ts; + + return 1; +} + +static int answer_add_ptr(DnsAnswer *answer, DnsResourceKey *key, const char *name) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + rr = dns_resource_record_new(key); + if (!rr) + return -ENOMEM; + + rr->ptr.name = strdup(name); + if (!rr->ptr.name) + return -ENOMEM; + + return dns_answer_add(answer, rr, 0, DNS_ANSWER_AUTHENTICATED, NULL); +} + +static int answer_add_cname(DnsAnswer *answer, const char *name, const char *cname) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_CNAME, name); + if (!rr) + return -ENOMEM; + + rr->cname.name = strdup(cname); + if (!rr->cname.name) + return -ENOMEM; + + return dns_answer_add(answer, rr, 0, DNS_ANSWER_AUTHENTICATED, NULL); +} + +static int answer_add_addr(DnsAnswer *answer, const char *name, const struct in_addr_data *a) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + int r; + + r = dns_resource_record_new_address(&rr, a->family, &a->address, name); + if (r < 0) + return r; + + return dns_answer_add(answer, rr, 0, DNS_ANSWER_AUTHENTICATED, NULL); +} + +static int etc_hosts_lookup_by_address( + EtcHosts *hosts, + DnsQuestion *q, + const char *name, + const struct in_addr_data *address, + DnsAnswer **answer) { + + DnsResourceKey *t, *found_ptr = NULL; + 
EtcHostsItemByAddress *item; + int r; + + assert(hosts); + assert(q); + assert(name); + assert(address); + assert(answer); + + item = hashmap_get(hosts->by_address, address); + if (!item) + return 0; + + /* We have an address in /etc/hosts that matches the queried name. Let's return successful. Actual data + * we'll only return if the request was for PTR. */ + + DNS_QUESTION_FOREACH(t, q) { + if (!IN_SET(t->type, DNS_TYPE_PTR, DNS_TYPE_ANY)) + continue; + if (!IN_SET(t->class, DNS_CLASS_IN, DNS_CLASS_ANY)) + continue; + + r = dns_name_equal(dns_resource_key_name(t), name); + if (r < 0) + return r; + if (r > 0) { + found_ptr = t; + break; + } + } + + if (found_ptr) { + const char *n; + + r = dns_answer_reserve(answer, set_size(item->names)); + if (r < 0) + return r; + + if (item->canonical_name) { + r = answer_add_ptr(*answer, found_ptr, item->canonical_name); + if (r < 0) + return r; + } + + SET_FOREACH(n, item->names) { + if (n == item->canonical_name) + continue; + + r = answer_add_ptr(*answer, found_ptr, n); + if (r < 0) + return r; + } + } + + return 1; +} + +static int etc_hosts_lookup_by_name( + EtcHosts *hosts, + DnsQuestion *q, + const char *name, + DnsAnswer **answer) { + + bool found_a = false, found_aaaa = false; + const struct in_addr_data *a; + EtcHostsItemByName *item; + DnsResourceKey *t; + int r; + + assert(hosts); + assert(q); + assert(name); + assert(answer); + + item = hashmap_get(hosts->by_name, name); + if (item) { + r = dns_answer_reserve(answer, set_size(item->addresses)); + if (r < 0) + return r; + } else { + /* Check if name was listed with no address. If yes, continue to return an answer. 
*/ + if (!set_contains(hosts->no_address, name)) + return 0; + } + + DNS_QUESTION_FOREACH(t, q) { + if (!IN_SET(t->type, DNS_TYPE_A, DNS_TYPE_AAAA, DNS_TYPE_ANY)) + continue; + if (!IN_SET(t->class, DNS_CLASS_IN, DNS_CLASS_ANY)) + continue; + + r = dns_name_equal(dns_resource_key_name(t), name); + if (r < 0) + return r; + if (r == 0) + continue; + + if (IN_SET(t->type, DNS_TYPE_A, DNS_TYPE_ANY)) + found_a = true; + if (IN_SET(t->type, DNS_TYPE_AAAA, DNS_TYPE_ANY)) + found_aaaa = true; + + if (found_a && found_aaaa) + break; + } + + SET_FOREACH(a, item ? item->addresses : NULL) { + EtcHostsItemByAddress *item_by_addr; + const char *canonical_name; + + if ((!found_a && a->family == AF_INET) || + (!found_aaaa && a->family == AF_INET6)) + continue; + + item_by_addr = hashmap_get(hosts->by_address, a); + if (item_by_addr && item_by_addr->canonical_name) + canonical_name = item_by_addr->canonical_name; + else + canonical_name = item->name; + + if (!streq(item->name, canonical_name)) { + r = answer_add_cname(*answer, item->name, canonical_name); + if (r < 0) + return r; + } + + r = answer_add_addr(*answer, canonical_name, a); + if (r < 0) + return r; + } + + return found_a || found_aaaa; +} + +int manager_etc_hosts_lookup(Manager *m, DnsQuestion *q, DnsAnswer **answer) { + struct in_addr_data k; + const char *name; + + assert(m); + assert(q); + assert(answer); + + if (!m->read_etc_hosts) + return 0; + + (void) manager_etc_hosts_read(m); + + name = dns_question_first_name(q); + if (!name) + return 0; + + if (dns_name_address(name, &k.family, &k.address) > 0) + return etc_hosts_lookup_by_address(&m->etc_hosts, q, name, &k, answer); + + return etc_hosts_lookup_by_name(&m->etc_hosts, q, name, answer); +} diff --git a/src/resolve/resolved-etc-hosts.h b/src/resolve/resolved-etc-hosts.h new file mode 100644 index 0000000..805a09b --- /dev/null +++ b/src/resolve/resolved-etc-hosts.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include 
"resolved-manager.h" +#include "resolved-dns-question.h" +#include "resolved-dns-answer.h" + +typedef struct EtcHostsItemByAddress { + struct in_addr_data address; + Set *names; + const char *canonical_name; +} EtcHostsItemByAddress; + +typedef struct EtcHostsItemByName { + char *name; + Set *addresses; +} EtcHostsItemByName; + +int etc_hosts_parse(EtcHosts *hosts, FILE *f); +void etc_hosts_clear(EtcHosts *hosts); + +void manager_etc_hosts_flush(Manager *m); +int manager_etc_hosts_lookup(Manager *m, DnsQuestion* q, DnsAnswer **answer); diff --git a/src/resolve/resolved-gperf.gperf b/src/resolve/resolved-gperf.gperf new file mode 100644 index 0000000..6883935 --- /dev/null +++ b/src/resolve/resolved-gperf.gperf @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "resolved-conf.h" +#include "resolved-manager.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name resolved_gperf_hash +%define lookup-function-name resolved_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Resolve.DNS, config_parse_dns_servers, DNS_SERVER_SYSTEM, 0 +Resolve.FallbackDNS, config_parse_dns_servers, DNS_SERVER_FALLBACK, 0 +Resolve.Domains, config_parse_search_domains, 0, 0 +Resolve.LLMNR, config_parse_resolve_support, 0, offsetof(Manager, llmnr_support) +Resolve.MulticastDNS, config_parse_resolve_support, 0, offsetof(Manager, mdns_support) +Resolve.DNSSEC, config_parse_dnssec_mode, 0, offsetof(Manager, dnssec_mode) +Resolve.DNSOverTLS, config_parse_dns_over_tls_mode, 0, offsetof(Manager, dns_over_tls_mode) +Resolve.Cache, config_parse_dns_cache_mode, DNS_CACHE_MODE_YES, offsetof(Manager, enable_cache) +Resolve.DNSStubListener, config_parse_dns_stub_listener_mode, 0, offsetof(Manager, dns_stub_listener_mode) +Resolve.ReadEtcHosts, 
config_parse_bool, 0, offsetof(Manager, read_etc_hosts) +Resolve.ResolveUnicastSingleLabel, config_parse_bool, 0, offsetof(Manager, resolve_unicast_single_label) +Resolve.DNSStubListenerExtra, config_parse_dns_stub_listener_extra, 0, offsetof(Manager, dns_extra_stub_listeners) +Resolve.CacheFromLocalhost, config_parse_bool, 0, offsetof(Manager, cache_from_localhost) +Resolve.StaleRetentionSec, config_parse_sec, 0, offsetof(Manager, stale_retention_usec) diff --git a/src/resolve/resolved-link-bus.c b/src/resolve/resolved-link-bus.c new file mode 100644 index 0000000..4f8f591 --- /dev/null +++ b/src/resolve/resolved-link-bus.c @@ -0,0 +1,907 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-get-properties.h" +#include "bus-message-util.h" +#include "bus-polkit.h" +#include "log-link.h" +#include "parse-util.h" +#include "resolve-util.h" +#include "resolved-bus.h" +#include "resolved-link-bus.h" +#include "resolved-resolv-conf.h" +#include "socket-netlink.h" +#include "stdio-util.h" +#include "strv.h" +#include "user-util.h" + +static BUS_DEFINE_PROPERTY_GET(property_get_dnssec_supported, "b", Link, link_dnssec_supported); +static BUS_DEFINE_PROPERTY_GET2(property_get_dnssec_mode, "s", Link, link_get_dnssec_mode, dnssec_mode_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_llmnr_support, "s", Link, link_get_llmnr_support, resolve_support_to_string); +static BUS_DEFINE_PROPERTY_GET2(property_get_mdns_support, "s", Link, link_get_mdns_support, resolve_support_to_string); + +static int property_get_dns_over_tls_mode( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Link *l = ASSERT_PTR(userdata); + + assert(reply); + + return sd_bus_message_append(reply, "s", dns_over_tls_mode_to_string(link_get_dns_over_tls_mode(l))); +} + +static int 
property_get_dns_internal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error, + bool extended) { + + Link *l = ASSERT_PTR(userdata); + int r; + + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', extended ? "(iayqs)" : "(iay)"); + if (r < 0) + return r; + + LIST_FOREACH(servers, s, l->dns_servers) { + r = bus_dns_server_append(reply, s, false, extended); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_dns( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return property_get_dns_internal(bus, path, interface, property, reply, userdata, error, false); +} + +static int property_get_dns_ex( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return property_get_dns_internal(bus, path, interface, property, reply, userdata, error, true); +} + +static int property_get_current_dns_server_internal( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error, + bool extended) { + + DnsServer *s; + + assert(reply); + assert(userdata); + + s = *(DnsServer **) userdata; + + return bus_dns_server_append(reply, s, false, extended); +} + +static int property_get_current_dns_server( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + return property_get_current_dns_server_internal(bus, path, interface, property, reply, userdata, error, false); +} + +static int property_get_current_dns_server_ex( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, 
+ sd_bus_error *error) { + return property_get_current_dns_server_internal(bus, path, interface, property, reply, userdata, error, true); +} + +static int property_get_domains( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Link *l = ASSERT_PTR(userdata); + int r; + + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "(sb)"); + if (r < 0) + return r; + + LIST_FOREACH(domains, d, l->search_domains) { + r = sd_bus_message_append(reply, "(sb)", d->name, d->route_only); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int property_get_default_route( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Link *l = ASSERT_PTR(userdata); + + assert(reply); + + /* Return what is configured, if there's something configured */ + if (l->default_route >= 0) + return sd_bus_message_append(reply, "b", l->default_route); + + /* Otherwise report what is in effect */ + if (l->unicast_scope) + return sd_bus_message_append(reply, "b", dns_scope_is_default_route(l->unicast_scope)); + + return sd_bus_message_append(reply, "b", false); +} + +static int property_get_scopes_mask( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Link *l = ASSERT_PTR(userdata); + uint64_t mask; + + assert(reply); + + mask = (l->unicast_scope ? SD_RESOLVED_DNS : 0) | + (l->llmnr_ipv4_scope ? SD_RESOLVED_LLMNR_IPV4 : 0) | + (l->llmnr_ipv6_scope ? SD_RESOLVED_LLMNR_IPV6 : 0) | + (l->mdns_ipv4_scope ? SD_RESOLVED_MDNS_IPV4 : 0) | + (l->mdns_ipv6_scope ? 
SD_RESOLVED_MDNS_IPV6 : 0); + + return sd_bus_message_append(reply, "t", mask); +} + +static int verify_unmanaged_link(Link *l, sd_bus_error *error) { + assert(l); + + if (l->flags & IFF_LOOPBACK) + return sd_bus_error_setf(error, BUS_ERROR_LINK_BUSY, "Link %s is loopback device.", l->ifname); + if (l->is_managed) + return sd_bus_error_setf(error, BUS_ERROR_LINK_BUSY, "Link %s is managed.", l->ifname); + + return 0; +} + +static int bus_link_method_set_dns_servers_internal(sd_bus_message *message, void *userdata, sd_bus_error *error, bool extended) { + _cleanup_free_ char *j = NULL; + struct in_addr_full **dns; + bool changed = false; + Link *l = ASSERT_PTR(userdata); + size_t n; + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = bus_message_read_dns_servers(message, error, extended, &dns, &n); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-dns-servers", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + goto finalize; + if (r == 0) { + r = 1; /* Polkit will call us back */ + goto finalize; + } + + for (size_t i = 0; i < n; i++) { + const char *s; + + s = in_addr_full_to_string(dns[i]); + if (!s) { + r = -ENOMEM; + goto finalize; + } + + if (!strextend_with_separator(&j, ", ", s)) { + r = -ENOMEM; + goto finalize; + } + } + + bus_client_log(message, "DNS server change"); + + dns_server_mark_all(l->dns_servers); + + for (size_t i = 0; i < n; i++) { + DnsServer *s; + + s = dns_server_find(l->dns_servers, dns[i]->family, &dns[i]->address, dns[i]->port, 0, dns[i]->server_name); + if (s) + dns_server_move_back_and_unmark(s); + else { + r = dns_server_new(l->manager, NULL, DNS_SERVER_LINK, l, dns[i]->family, &dns[i]->address, dns[i]->port, 0, dns[i]->server_name); + if (r < 0) { + dns_server_unlink_all(l->dns_servers); + goto finalize; + } + + changed = true; + } + + } + + changed = dns_server_unlink_marked(l->dns_servers) 
|| changed; + + if (changed) { + link_allocate_scopes(l); + + (void) link_save_user(l); + (void) manager_write_resolv_conf(l->manager); + (void) manager_send_changed(l->manager, "DNS"); + + if (j) + log_link_info(l, "Bus client set DNS server list to: %s", j); + else + log_link_info(l, "Bus client reset DNS server list."); + } + + r = sd_bus_reply_method_return(message, NULL); + +finalize: + for (size_t i = 0; i < n; i++) + in_addr_full_free(dns[i]); + free(dns); + + return r; +} + +int bus_link_method_set_dns_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_link_method_set_dns_servers_internal(message, userdata, error, false); +} + +int bus_link_method_set_dns_servers_ex(sd_bus_message *message, void *userdata, sd_bus_error *error) { + return bus_link_method_set_dns_servers_internal(message, userdata, error, true); +} + +int bus_link_method_set_domains(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *j = NULL; + Link *l = ASSERT_PTR(userdata); + bool changed = false; + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "(sb)"); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *prefixed = NULL; + const char *name; + int route_only; + + r = sd_bus_message_read(message, "(sb)", &name, &route_only); + if (r < 0) + return r; + if (r == 0) + break; + + r = dns_name_is_valid(name); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid search domain %s", name); + if (!route_only && dns_name_is_root(name)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Root domain is not suitable as search domain"); + + if (route_only) { + prefixed = strjoin("~", name); + if (!prefixed) + return -ENOMEM; + + name = prefixed; + } + + if (!strextend_with_separator(&j, ", ", name)) + return -ENOMEM; + } + + r = sd_bus_message_rewind(message, 
false); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-domains", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "dns domains change"); + + dns_search_domain_mark_all(l->search_domains); + + for (;;) { + DnsSearchDomain *d; + const char *name; + int route_only; + + r = sd_bus_message_read(message, "(sb)", &name, &route_only); + if (r < 0) + goto clear; + if (r == 0) + break; + + r = dns_search_domain_find(l->search_domains, name, &d); + if (r < 0) + goto clear; + + if (r > 0) + dns_search_domain_move_back_and_unmark(d); + else { + r = dns_search_domain_new(l->manager, &d, DNS_SEARCH_DOMAIN_LINK, l, name); + if (r < 0) + goto clear; + + changed = true; + } + + d->route_only = route_only; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + goto clear; + + changed = dns_search_domain_unlink_marked(l->search_domains) || changed; + + if (changed) { + (void) link_save_user(l); + (void) manager_write_resolv_conf(l->manager); + + if (j) + log_link_info(l, "Bus client set search domain list to: %s", j); + else + log_link_info(l, "Bus client reset search domain list."); + } + + return sd_bus_reply_method_return(message, NULL); + +clear: + dns_search_domain_unlink_all(l->search_domains); + return r; +} + +int bus_link_method_set_default_route(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r, b; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-default-route", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + 
bus_client_log(message, "dns default route change"); + + if (l->default_route != b) { + l->default_route = b; + + (void) link_save_user(l); + (void) manager_write_resolv_conf(l->manager); + + log_link_info(l, "Bus client set default route setting: %s", yes_no(b)); + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_llmnr(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + ResolveSupport mode; + const char *llmnr; + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &llmnr); + if (r < 0) + return r; + + if (isempty(llmnr)) + mode = RESOLVE_SUPPORT_YES; + else { + mode = resolve_support_from_string(llmnr); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid LLMNR setting: %s", llmnr); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-llmnr", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "LLMNR change"); + + if (l->llmnr_support != mode) { + l->llmnr_support = mode; + link_allocate_scopes(l); + link_add_rrs(l, false); + + (void) link_save_user(l); + + log_link_info(l, "Bus client set LLMNR setting: %s", resolve_support_to_string(mode)); + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_mdns(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + ResolveSupport mode; + const char *mdns; + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &mdns); + if (r < 0) + return r; + + if (isempty(mdns)) + mode = RESOLVE_SUPPORT_YES; + else { + mode = resolve_support_from_string(mdns); + if (mode < 0) + return sd_bus_error_setf(error, 
SD_BUS_ERROR_INVALID_ARGS, "Invalid MulticastDNS setting: %s", mdns); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-mdns", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "mDNS change"); + + if (l->mdns_support != mode) { + l->mdns_support = mode; + link_allocate_scopes(l); + link_add_rrs(l, false); + + (void) link_save_user(l); + + log_link_info(l, "Bus client set MulticastDNS setting: %s", resolve_support_to_string(mode)); + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_dns_over_tls(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + const char *dns_over_tls; + DnsOverTlsMode mode; + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &dns_over_tls); + if (r < 0) + return r; + + if (isempty(dns_over_tls)) + mode = _DNS_OVER_TLS_MODE_INVALID; + else { + mode = dns_over_tls_mode_from_string(dns_over_tls); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid DNSOverTLS setting: %s", dns_over_tls); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-dns-over-tls", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "D-o-T change"); + + if (l->dns_over_tls_mode != mode) { + link_set_dns_over_tls_mode(l, mode); + link_allocate_scopes(l); + + (void) link_save_user(l); + + log_link_info(l, "Bus client set DNSOverTLS setting: %s", + mode < 0 ? 
"default" : dns_over_tls_mode_to_string(mode)); + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_dnssec(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + const char *dnssec; + DnssecMode mode; + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &dnssec); + if (r < 0) + return r; + + if (isempty(dnssec)) + mode = _DNSSEC_MODE_INVALID; + else { + mode = dnssec_mode_from_string(dnssec); + if (mode < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid DNSSEC setting: %s", dnssec); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-dnssec", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "DNSSEC change"); + + if (l->dnssec_mode != mode) { + link_set_dnssec_mode(l, mode); + link_allocate_scopes(l); + + (void) link_save_user(l); + + log_link_info(l, "Bus client set DNSSEC setting: %s", + mode < 0 ? 
"default" : dnssec_mode_to_string(mode)); + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_set_dnssec_negative_trust_anchors(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_set_free_free_ Set *ns = NULL; + _cleanup_strv_free_ char **ntas = NULL; + _cleanup_free_ char *j = NULL; + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + ns = set_new(&dns_name_hash_ops); + if (!ns) + return -ENOMEM; + + r = sd_bus_message_read_strv(message, &ntas); + if (r < 0) + return r; + + STRV_FOREACH(i, ntas) { + r = dns_name_is_valid(*i); + if (r < 0) + return r; + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid negative trust anchor domain: %s", *i); + + r = set_put_strdup(&ns, *i); + if (r < 0) + return r; + + if (!strextend_with_separator(&j, ", ", *i)) + return -ENOMEM; + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.set-dnssec-negative-trust-anchors", + NULL, true, UID_INVALID, + &l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "DNSSEC NTA change"); + + if (!set_equal(ns, l->dnssec_negative_trust_anchors)) { + set_free_free(l->dnssec_negative_trust_anchors); + l->dnssec_negative_trust_anchors = TAKE_PTR(ns); + + (void) link_save_user(l); + + if (j) + log_link_info(l, "Bus client set NTA list to: %s", j); + else + log_link_info(l, "Bus client reset NTA list."); + } + + return sd_bus_reply_method_return(message, NULL); +} + +int bus_link_method_revert(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Link *l = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = verify_unmanaged_link(l, error); + if (r < 0) + return r; + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.resolve1.revert", + NULL, true, UID_INVALID, + 
&l->manager->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* Polkit will call us back */ + + bus_client_log(message, "revert"); + + link_flush_settings(l); + link_allocate_scopes(l); + link_add_rrs(l, false); + + (void) link_save_user(l); + (void) manager_write_resolv_conf(l->manager); + (void) manager_send_changed(l->manager, "DNS"); + + return sd_bus_reply_method_return(message, NULL); +} + +static int link_object_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) { + _cleanup_free_ char *e = NULL; + Manager *m = ASSERT_PTR(userdata); + Link *link; + int ifindex, r; + + assert(bus); + assert(path); + assert(interface); + assert(found); + + r = sd_bus_path_decode(path, "/org/freedesktop/resolve1/link", &e); + if (r <= 0) + return 0; + + ifindex = parse_ifindex(e); + if (ifindex < 0) + return 0; + + link = hashmap_get(m->links, INT_TO_PTR(ifindex)); + if (!link) + return 0; + + *found = link; + return 1; +} + +char *link_bus_path(const Link *link) { + char *p, ifindex[DECIMAL_STR_MAX(link->ifindex)]; + int r; + + assert(link); + + xsprintf(ifindex, "%i", link->ifindex); + + r = sd_bus_path_encode("/org/freedesktop/resolve1/link", ifindex, &p); + if (r < 0) + return NULL; + + return p; +} + +static int link_node_enumerator(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) { + _cleanup_strv_free_ char **l = NULL; + Manager *m = ASSERT_PTR(userdata); + Link *link; + unsigned c = 0; + + assert(bus); + assert(path); + assert(nodes); + + l = new0(char*, hashmap_size(m->links) + 1); + if (!l) + return -ENOMEM; + + HASHMAP_FOREACH(link, m->links) { + char *p; + + p = link_bus_path(link); + if (!p) + return -ENOMEM; + + l[c++] = p; + } + + l[c] = NULL; + *nodes = TAKE_PTR(l); + + return 1; +} + +static const sd_bus_vtable link_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("ScopesMask", "t", property_get_scopes_mask, 0, 0), + 
SD_BUS_PROPERTY("DNS", "a(iay)", property_get_dns, 0, 0), + SD_BUS_PROPERTY("DNSEx", "a(iayqs)", property_get_dns_ex, 0, 0), + SD_BUS_PROPERTY("CurrentDNSServer", "(iay)", property_get_current_dns_server, offsetof(Link, current_dns_server), 0), + SD_BUS_PROPERTY("CurrentDNSServerEx", "(iayqs)", property_get_current_dns_server_ex, offsetof(Link, current_dns_server), 0), + SD_BUS_PROPERTY("Domains", "a(sb)", property_get_domains, 0, 0), + SD_BUS_PROPERTY("DefaultRoute", "b", property_get_default_route, 0, 0), + SD_BUS_PROPERTY("LLMNR", "s", property_get_llmnr_support, 0, 0), + SD_BUS_PROPERTY("MulticastDNS", "s", property_get_mdns_support, 0, 0), + SD_BUS_PROPERTY("DNSOverTLS", "s", property_get_dns_over_tls_mode, 0, 0), + SD_BUS_PROPERTY("DNSSEC", "s", property_get_dnssec_mode, 0, 0), + SD_BUS_PROPERTY("DNSSECNegativeTrustAnchors", "as", bus_property_get_string_set, offsetof(Link, dnssec_negative_trust_anchors), 0), + SD_BUS_PROPERTY("DNSSECSupported", "b", property_get_dnssec_supported, 0, 0), + + SD_BUS_METHOD_WITH_ARGS("SetDNS", + SD_BUS_ARGS("a(iay)", addresses), + SD_BUS_NO_RESULT, + bus_link_method_set_dns_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSEx", + SD_BUS_ARGS("a(iayqs)", addresses), + SD_BUS_NO_RESULT, + bus_link_method_set_dns_servers_ex, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDomains", + SD_BUS_ARGS("a(sb)", domains), + SD_BUS_NO_RESULT, + bus_link_method_set_domains, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDefaultRoute", + SD_BUS_ARGS("b", enable), + SD_BUS_NO_RESULT, + bus_link_method_set_default_route, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLLMNR", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_llmnr, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetMulticastDNS", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_mdns, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSOverTLS", + 
SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_dns_over_tls, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSSEC", + SD_BUS_ARGS("s", mode), + SD_BUS_NO_RESULT, + bus_link_method_set_dnssec, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetDNSSECNegativeTrustAnchors", + SD_BUS_ARGS("as", names), + SD_BUS_NO_RESULT, + bus_link_method_set_dnssec_negative_trust_anchors, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("Revert", + SD_BUS_NO_ARGS, + SD_BUS_NO_RESULT, + bus_link_method_revert, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation link_object = { + "/org/freedesktop/resolve1/link", + "org.freedesktop.resolve1.Link", + .fallback_vtables = BUS_FALLBACK_VTABLES({link_vtable, link_object_find}), + .node_enumerator = link_node_enumerator, +}; diff --git a/src/resolve/resolved-link-bus.h b/src/resolve/resolved-link-bus.h new file mode 100644 index 0000000..b882df5 --- /dev/null +++ b/src/resolve/resolved-link-bus.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-util.h" +#include "resolved-link.h" + +extern const BusObjectImplementation link_object; + +char *link_bus_path(const Link *link); + +int bus_link_method_set_dns_servers(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dns_servers_ex(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_domains(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_default_route(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_llmnr(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_mdns(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dns_over_tls(sd_bus_message *message, void *userdata, sd_bus_error *error); +int 
bus_link_method_set_dnssec(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_set_dnssec_negative_trust_anchors(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_link_method_revert(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/resolve/resolved-link.c b/src/resolve/resolved-link.c new file mode 100644 index 0000000..dd5dadd --- /dev/null +++ b/src/resolve/resolved-link.c @@ -0,0 +1,1445 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-network.h" + +#include "alloc-util.h" +#include "env-file.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "log-link.h" +#include "mkdir.h" +#include "netif-util.h" +#include "parse-util.h" +#include "resolved-link.h" +#include "resolved-llmnr.h" +#include "resolved-mdns.h" +#include "socket-netlink.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" + +int link_new(Manager *m, Link **ret, int ifindex) { + _cleanup_(link_freep) Link *l = NULL; + int r; + + assert(m); + assert(ifindex > 0); + + l = new(Link, 1); + if (!l) + return -ENOMEM; + + *l = (Link) { + .ifindex = ifindex, + .default_route = -1, + .llmnr_support = RESOLVE_SUPPORT_YES, + .mdns_support = RESOLVE_SUPPORT_YES, + .dnssec_mode = _DNSSEC_MODE_INVALID, + .dns_over_tls_mode = _DNS_OVER_TLS_MODE_INVALID, + .operstate = IF_OPER_UNKNOWN, + }; + + if (asprintf(&l->state_file, "/run/systemd/resolve/netif/%i", ifindex) < 0) + return -ENOMEM; + + r = hashmap_ensure_put(&m->links, NULL, INT_TO_PTR(ifindex), l); + if (r < 0) + return r; + + l->manager = m; + + if (ret) + *ret = l; + TAKE_PTR(l); + + return 0; +} + +void link_flush_settings(Link *l) { + assert(l); + + l->default_route = -1; + l->llmnr_support = RESOLVE_SUPPORT_YES; + l->mdns_support = RESOLVE_SUPPORT_YES; + l->dnssec_mode = _DNSSEC_MODE_INVALID; + l->dns_over_tls_mode = _DNS_OVER_TLS_MODE_INVALID; + + 
dns_server_unlink_all(l->dns_servers); + dns_search_domain_unlink_all(l->search_domains); + + l->dnssec_negative_trust_anchors = set_free_free(l->dnssec_negative_trust_anchors); +} + +Link *link_free(Link *l) { + if (!l) + return NULL; + + /* Send goodbye messages. */ + dns_scope_announce(l->mdns_ipv4_scope, true); + dns_scope_announce(l->mdns_ipv6_scope, true); + + link_flush_settings(l); + + while (l->addresses) + (void) link_address_free(l->addresses); + + if (l->manager) + hashmap_remove(l->manager->links, INT_TO_PTR(l->ifindex)); + + dns_scope_free(l->unicast_scope); + dns_scope_free(l->llmnr_ipv4_scope); + dns_scope_free(l->llmnr_ipv6_scope); + dns_scope_free(l->mdns_ipv4_scope); + dns_scope_free(l->mdns_ipv6_scope); + + free(l->state_file); + free(l->ifname); + + return mfree(l); +} + +void link_allocate_scopes(Link *l) { + bool unicast_relevant; + int r; + + assert(l); + + /* If a link that used to be relevant is no longer, or a link that did not use to be relevant now becomes + * relevant, let's reinit the learnt global DNS server information, since we might talk to different servers + * now, even if they have the same addresses as before. */ + + unicast_relevant = link_relevant(l, AF_UNSPEC, false); + if (unicast_relevant != l->unicast_relevant) { + l->unicast_relevant = unicast_relevant; + + dns_server_reset_features_all(l->manager->fallback_dns_servers); + dns_server_reset_features_all(l->manager->dns_servers); + + /* Also, flush the global unicast scope, to deal with split horizon setups, where talking through one + * interface reveals different DNS zones than through others. 
*/ + if (l->manager->unicast_scope) + dns_cache_flush(&l->manager->unicast_scope->cache); + } + + /* And now, allocate all scopes that makes sense now if we didn't have them yet, and drop those which we don't + * need anymore */ + + if (unicast_relevant && l->dns_servers) { + if (!l->unicast_scope) { + dns_server_reset_features_all(l->dns_servers); + + r = dns_scope_new(l->manager, &l->unicast_scope, l, DNS_PROTOCOL_DNS, AF_UNSPEC); + if (r < 0) + log_link_warning_errno(l, r, "Failed to allocate DNS scope, ignoring: %m"); + } + } else + l->unicast_scope = dns_scope_free(l->unicast_scope); + + if (link_relevant(l, AF_INET, true) && + link_get_llmnr_support(l) != RESOLVE_SUPPORT_NO) { + if (!l->llmnr_ipv4_scope) { + r = dns_scope_new(l->manager, &l->llmnr_ipv4_scope, l, DNS_PROTOCOL_LLMNR, AF_INET); + if (r < 0) + log_link_warning_errno(l, r, "Failed to allocate LLMNR IPv4 scope, ignoring: %m"); + } + } else + l->llmnr_ipv4_scope = dns_scope_free(l->llmnr_ipv4_scope); + + if (link_relevant(l, AF_INET6, true) && + link_get_llmnr_support(l) != RESOLVE_SUPPORT_NO) { + if (!l->llmnr_ipv6_scope) { + r = dns_scope_new(l->manager, &l->llmnr_ipv6_scope, l, DNS_PROTOCOL_LLMNR, AF_INET6); + if (r < 0) + log_link_warning_errno(l, r, "Failed to allocate LLMNR IPv6 scope, ignoring: %m"); + } + } else + l->llmnr_ipv6_scope = dns_scope_free(l->llmnr_ipv6_scope); + + if (link_relevant(l, AF_INET, true) && + link_get_mdns_support(l) != RESOLVE_SUPPORT_NO) { + if (!l->mdns_ipv4_scope) { + r = dns_scope_new(l->manager, &l->mdns_ipv4_scope, l, DNS_PROTOCOL_MDNS, AF_INET); + if (r < 0) + log_link_warning_errno(l, r, "Failed to allocate mDNS IPv4 scope, ignoring: %m"); + } + } else + l->mdns_ipv4_scope = dns_scope_free(l->mdns_ipv4_scope); + + if (link_relevant(l, AF_INET6, true) && + link_get_mdns_support(l) != RESOLVE_SUPPORT_NO) { + if (!l->mdns_ipv6_scope) { + r = dns_scope_new(l->manager, &l->mdns_ipv6_scope, l, DNS_PROTOCOL_MDNS, AF_INET6); + if (r < 0) + log_link_warning_errno(l, 
r, "Failed to allocate mDNS IPv6 scope, ignoring: %m"); + } + } else + l->mdns_ipv6_scope = dns_scope_free(l->mdns_ipv6_scope); +} + +void link_add_rrs(Link *l, bool force_remove) { + int r; + + LIST_FOREACH(addresses, a, l->addresses) + link_address_add_rrs(a, force_remove); + + if (!force_remove && + link_get_mdns_support(l) == RESOLVE_SUPPORT_YES) { + + if (l->mdns_ipv4_scope) { + r = dns_scope_add_dnssd_services(l->mdns_ipv4_scope); + if (r < 0) + log_link_warning_errno(l, r, "Failed to add IPv4 DNS-SD services, ignoring: %m"); + } + + if (l->mdns_ipv6_scope) { + r = dns_scope_add_dnssd_services(l->mdns_ipv6_scope); + if (r < 0) + log_link_warning_errno(l, r, "Failed to add IPv6 DNS-SD services, ignoring: %m"); + } + + } else { + + if (l->mdns_ipv4_scope) { + r = dns_scope_remove_dnssd_services(l->mdns_ipv4_scope); + if (r < 0) + log_link_warning_errno(l, r, "Failed to remove IPv4 DNS-SD services, ignoring: %m"); + } + + if (l->mdns_ipv6_scope) { + r = dns_scope_remove_dnssd_services(l->mdns_ipv6_scope); + if (r < 0) + log_link_warning_errno(l, r, "Failed to remove IPv6 DNS-SD services, ignoring: %m"); + } + } +} + +int link_process_rtnl(Link *l, sd_netlink_message *m) { + const char *n = NULL; + int r; + + assert(l); + assert(m); + + r = sd_rtnl_message_link_get_flags(m, &l->flags); + if (r < 0) + return r; + + (void) sd_netlink_message_read_u32(m, IFLA_MTU, &l->mtu); + (void) sd_netlink_message_read_u8(m, IFLA_OPERSTATE, &l->operstate); + + if (sd_netlink_message_read_string(m, IFLA_IFNAME, &n) >= 0 && + !streq_ptr(l->ifname, n)) { + if (l->ifname) + log_link_debug(l, "Interface name change detected: %s -> %s", l->ifname, n); + + r = free_and_strdup(&l->ifname, n); + if (r < 0) + return r; + } + + return 0; +} + +static int link_update_dns_server_one(Link *l, const char *str) { + _cleanup_free_ char *name = NULL; + int family, ifindex, r; + union in_addr_union a; + DnsServer *s; + uint16_t port; + + assert(l); + assert(str); + + r = 
in_addr_port_ifindex_name_from_string_auto(str, &family, &a, &port, &ifindex, &name); + if (r < 0) + return r; + + if (ifindex != 0 && ifindex != l->ifindex) + return -EINVAL; + + /* By default, the port number is determined with the transaction feature level. + * See dns_transaction_port() and dns_server_port(). */ + if (IN_SET(port, 53, 853)) + port = 0; + + s = dns_server_find(l->dns_servers, family, &a, port, 0, name); + if (s) { + dns_server_move_back_and_unmark(s); + return 0; + } + + return dns_server_new(l->manager, NULL, DNS_SERVER_LINK, l, family, &a, port, 0, name); +} + +static int link_update_dns_servers(Link *l) { + _cleanup_strv_free_ char **nameservers = NULL; + int r; + + assert(l); + + r = sd_network_link_get_dns(l->ifindex, &nameservers); + if (r == -ENODATA) { + r = 0; + goto clear; + } + if (r < 0) + goto clear; + + dns_server_mark_all(l->dns_servers); + + STRV_FOREACH(nameserver, nameservers) { + r = link_update_dns_server_one(l, *nameserver); + if (r < 0) + goto clear; + } + + dns_server_unlink_marked(l->dns_servers); + return 0; + +clear: + dns_server_unlink_all(l->dns_servers); + return r; +} + +static int link_update_default_route(Link *l) { + int r; + + assert(l); + + r = sd_network_link_get_dns_default_route(l->ifindex); + if (r == -ENODATA) { + r = 0; + goto clear; + } + if (r < 0) + goto clear; + + l->default_route = r > 0; + return 0; + +clear: + l->default_route = -1; + return r; +} + +static int link_update_llmnr_support(Link *l) { + _cleanup_free_ char *b = NULL; + int r; + + assert(l); + + l->llmnr_support = RESOLVE_SUPPORT_YES; /* yes, yes, we set it twice which is ugly */ + + r = sd_network_link_get_llmnr(l->ifindex, &b); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + r = resolve_support_from_string(b); + if (r < 0) + return r; + + l->llmnr_support = r; + return 0; +} + +static int link_update_mdns_support(Link *l) { + _cleanup_free_ char *b = NULL; + int r; + + assert(l); + + l->mdns_support = 
RESOLVE_SUPPORT_YES; + + r = sd_network_link_get_mdns(l->ifindex, &b); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + r = resolve_support_from_string(b); + if (r < 0) + return r; + + l->mdns_support = r; + return 0; +} + +void link_set_dns_over_tls_mode(Link *l, DnsOverTlsMode mode) { + + assert(l); + +#if ! ENABLE_DNS_OVER_TLS + if (mode != DNS_OVER_TLS_NO) + log_link_warning(l, + "DNS-over-TLS option for the link cannot be enabled or set to opportunistic " + "when systemd-resolved is built without DNS-over-TLS support. " + "Turning off DNS-over-TLS support."); + return; +#endif + + l->dns_over_tls_mode = mode; + l->unicast_scope = dns_scope_free(l->unicast_scope); +} + +static int link_update_dns_over_tls_mode(Link *l) { + _cleanup_free_ char *b = NULL; + int r; + + assert(l); + + l->dns_over_tls_mode = _DNS_OVER_TLS_MODE_INVALID; + + r = sd_network_link_get_dns_over_tls(l->ifindex, &b); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + r = dns_over_tls_mode_from_string(b); + if (r < 0) + return r; + + l->dns_over_tls_mode = r; + return 0; +} + +void link_set_dnssec_mode(Link *l, DnssecMode mode) { + + assert(l); + +#if !HAVE_OPENSSL_OR_GCRYPT + if (IN_SET(mode, DNSSEC_YES, DNSSEC_ALLOW_DOWNGRADE)) + log_link_warning(l, + "DNSSEC option for the link cannot be enabled or set to allow-downgrade " + "when systemd-resolved is built without a cryptographic library. 
" + "Turning off DNSSEC support."); + return; +#endif + + if (l->dnssec_mode == mode) + return; + + l->dnssec_mode = mode; + l->unicast_scope = dns_scope_free(l->unicast_scope); +} + +static int link_update_dnssec_mode(Link *l) { + _cleanup_free_ char *m = NULL; + DnssecMode mode; + int r; + + assert(l); + + l->dnssec_mode = _DNSSEC_MODE_INVALID; + + r = sd_network_link_get_dnssec(l->ifindex, &m); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + mode = dnssec_mode_from_string(m); + if (mode < 0) + return mode; + + link_set_dnssec_mode(l, mode); + return 0; +} + +static int link_update_dnssec_negative_trust_anchors(Link *l) { + _cleanup_strv_free_ char **ntas = NULL; + _cleanup_set_free_free_ Set *ns = NULL; + int r; + + assert(l); + + l->dnssec_negative_trust_anchors = set_free_free(l->dnssec_negative_trust_anchors); + + r = sd_network_link_get_dnssec_negative_trust_anchors(l->ifindex, &ntas); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + ns = set_new(&dns_name_hash_ops); + if (!ns) + return -ENOMEM; + + r = set_put_strdupv(&ns, ntas); + if (r < 0) + return r; + + l->dnssec_negative_trust_anchors = TAKE_PTR(ns); + return 0; +} + +static int link_update_search_domain_one(Link *l, const char *name, bool route_only) { + DnsSearchDomain *d; + int r; + + assert(l); + assert(name); + + r = dns_search_domain_find(l->search_domains, name, &d); + if (r < 0) + return r; + if (r > 0) + dns_search_domain_move_back_and_unmark(d); + else { + r = dns_search_domain_new(l->manager, &d, DNS_SEARCH_DOMAIN_LINK, l, name); + if (r < 0) + return r; + } + + d->route_only = route_only; + return 0; +} + +static int link_update_search_domains(Link *l) { + _cleanup_strv_free_ char **sdomains = NULL, **rdomains = NULL; + int r, q; + + assert(l); + + r = sd_network_link_get_search_domains(l->ifindex, &sdomains); + if (r < 0 && r != -ENODATA) + goto clear; + + q = sd_network_link_get_route_domains(l->ifindex, &rdomains); + if (q < 0 && q != -ENODATA) { + r = q; + 
goto clear; + } + + if (r == -ENODATA && q == -ENODATA) { + /* networkd knows nothing about this interface, and that's fine. */ + r = 0; + goto clear; + } + + dns_search_domain_mark_all(l->search_domains); + + STRV_FOREACH(i, sdomains) { + r = link_update_search_domain_one(l, *i, false); + if (r < 0) + goto clear; + } + + STRV_FOREACH(i, rdomains) { + r = link_update_search_domain_one(l, *i, true); + if (r < 0) + goto clear; + } + + dns_search_domain_unlink_marked(l->search_domains); + return 0; + +clear: + dns_search_domain_unlink_all(l->search_domains); + return r; +} + +static int link_is_managed(Link *l) { + _cleanup_free_ char *state = NULL; + int r; + + assert(l); + + r = sd_network_link_get_setup_state(l->ifindex, &state); + if (r == -ENODATA) + return 0; + if (r < 0) + return r; + + return !STR_IN_SET(state, "pending", "initialized", "unmanaged"); +} + +static void link_enter_unmanaged(Link *l) { + assert(l); + + /* If this link used to be managed, but is now unmanaged, flush all our settings — but only once. */ + if (l->is_managed) + link_flush_settings(l); + + l->is_managed = false; +} + +static void link_read_settings(Link *l) { + struct stat st; + int r; + + assert(l); + + /* Read settings from networkd, except when networkd is not managing this interface. */ + + r = sd_network_link_get_stat(l->ifindex, &st); + if (r == -ENOENT) + return link_enter_unmanaged(l); + if (r < 0) + return (void) log_link_warning_errno(l, r, "Failed to stat() networkd's link state file, ignoring: %m"); + + if (stat_inode_unmodified(&l->networkd_state_file_stat, &st)) + /* The state file is unmodified. Not necessary to re-read settings. */ + return; + + /* Save the new stat for the next event. 
*/ + l->networkd_state_file_stat = st; + + r = link_is_managed(l); + if (r < 0) + return (void) log_link_warning_errno(l, r, "Failed to determine whether the interface is managed, ignoring: %m"); + if (r == 0) + return link_enter_unmanaged(l); + + l->is_managed = true; + + r = network_link_get_operational_state(l->ifindex, &l->networkd_operstate); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read networkd's link operational state, ignoring: %m"); + + r = link_update_dns_servers(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read DNS servers for the interface, ignoring: %m"); + + r = link_update_llmnr_support(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read LLMNR support for the interface, ignoring: %m"); + + r = link_update_mdns_support(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read mDNS support for the interface, ignoring: %m"); + + r = link_update_dns_over_tls_mode(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read DNS-over-TLS mode for the interface, ignoring: %m"); + + r = link_update_dnssec_mode(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read DNSSEC mode for the interface, ignoring: %m"); + + r = link_update_dnssec_negative_trust_anchors(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read DNSSEC negative trust anchors for the interface, ignoring: %m"); + + r = link_update_search_domains(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read search domains for the interface, ignoring: %m"); + + r = link_update_default_route(l); + if (r < 0) + log_link_warning_errno(l, r, "Failed to read default route setting for the interface, proceeding anyway: %m"); +} + +int link_update(Link *l) { + int r; + + assert(l); + + link_read_settings(l); + r = link_load_user(l); + if (r < 0) + return r; + + if (link_get_llmnr_support(l) != RESOLVE_SUPPORT_NO) { + r = manager_llmnr_start(l->manager); + if (r < 0) + return r; + } + + if (link_get_mdns_support(l) != 
RESOLVE_SUPPORT_NO) { + r = manager_mdns_start(l->manager); + if (r < 0) + return r; + } + + link_allocate_scopes(l); + link_add_rrs(l, false); + + return 0; +} + +bool link_relevant(Link *l, int family, bool local_multicast) { + assert(l); + + /* A link is relevant for local multicast traffic if it isn't a loopback device, has a link + * beat, can do multicast and has at least one link-local (or better) IP address. + * + * A link is relevant for non-multicast traffic if it isn't a loopback device, has a link beat, and has at + * least one routable address. */ + + if ((l->flags & (IFF_LOOPBACK | IFF_DORMANT)) != 0) + return false; + + if (!FLAGS_SET(l->flags, IFF_UP | IFF_LOWER_UP)) + return false; + + if (local_multicast && + !FLAGS_SET(l->flags, IFF_MULTICAST)) + return false; + + if (!netif_has_carrier(l->operstate, l->flags)) + return false; + + if (l->is_managed && + !IN_SET(l->networkd_operstate, LINK_OPERSTATE_DEGRADED_CARRIER, LINK_OPERSTATE_DEGRADED, LINK_OPERSTATE_ROUTABLE)) + return false; + + LIST_FOREACH(addresses, a, l->addresses) + if ((family == AF_UNSPEC || a->family == family) && link_address_relevant(a, local_multicast)) + return true; + + return false; +} + +LinkAddress *link_find_address(Link *l, int family, const union in_addr_union *in_addr) { + assert(l); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return NULL; + + if (!in_addr) + return NULL; + + LIST_FOREACH(addresses, a, l->addresses) + if (a->family == family && in_addr_equal(family, &a->in_addr, in_addr)) + return a; + + return NULL; +} + +DnsServer* link_set_dns_server(Link *l, DnsServer *s) { + assert(l); + + if (l->current_dns_server == s) + return s; + + if (s) + log_link_debug(l, "Switching to DNS server %s.", strna(dns_server_string_full(s))); + + dns_server_unref(l->current_dns_server); + l->current_dns_server = dns_server_ref(s); + + /* Skip flushing the cache if server stale feature is enabled. 
*/ + if (l->unicast_scope && l->manager->stale_retention_usec == 0) + dns_cache_flush(&l->unicast_scope->cache); + + return s; +} + +DnsServer *link_get_dns_server(Link *l) { + assert(l); + + if (!l->current_dns_server) + link_set_dns_server(l, l->dns_servers); + + return l->current_dns_server; +} + +void link_next_dns_server(Link *l, DnsServer *if_current) { + assert(l); + + /* If the current server of the transaction is specified, and we already are at a different one, + * don't do anything */ + if (if_current && l->current_dns_server != if_current) + return; + + /* If currently have no DNS server, then don't do anything, we'll pick it lazily the next time a DNS + * server is needed. */ + if (!l->current_dns_server) + return; + + /* Change to the next one, but make sure to follow the linked list only if this server is actually + * still linked. */ + if (l->current_dns_server->linked && l->current_dns_server->servers_next) { + link_set_dns_server(l, l->current_dns_server->servers_next); + return; + } + + /* Pick the first one again, after we reached the end */ + link_set_dns_server(l, l->dns_servers); +} + +DnsOverTlsMode link_get_dns_over_tls_mode(Link *l) { + assert(l); + + if (l->dns_over_tls_mode != _DNS_OVER_TLS_MODE_INVALID) + return l->dns_over_tls_mode; + + return manager_get_dns_over_tls_mode(l->manager); +} + +DnssecMode link_get_dnssec_mode(Link *l) { + assert(l); + + if (l->dnssec_mode != _DNSSEC_MODE_INVALID) + return l->dnssec_mode; + + return manager_get_dnssec_mode(l->manager); +} + +bool link_dnssec_supported(Link *l) { + DnsServer *server; + + assert(l); + + if (link_get_dnssec_mode(l) == DNSSEC_NO) + return false; + + server = link_get_dns_server(l); + if (server) + return dns_server_dnssec_supported(server); + + return true; +} + +ResolveSupport link_get_llmnr_support(Link *link) { + assert(link); + assert(link->manager); + + /* This provides the effective LLMNR support level for the link, instead of the 'internal' per-link setting. 
*/ + + return MIN(link->llmnr_support, link->manager->llmnr_support); +} + +ResolveSupport link_get_mdns_support(Link *link) { + assert(link); + assert(link->manager); + + /* This provides the effective mDNS support level for the link, instead of the 'internal' per-link setting. */ + + return MIN(link->mdns_support, link->manager->mdns_support); +} + +int link_address_new(Link *l, + LinkAddress **ret, + int family, + const union in_addr_union *in_addr, + const union in_addr_union *in_addr_broadcast) { + LinkAddress *a; + + assert(l); + assert(in_addr); + + /* Allocates a LinkAddress for the link, prepends it to l->addresses and counts it in l->n_addresses. Returns 0 on success and -ENOMEM if the allocation fails. The new object is stored in *ret only when ret is non-NULL. NOTE(review): in_addr_broadcast is dereferenced below without a matching assert, so callers have to pass a valid pointer. */ a = new(LinkAddress, 1); + if (!a) + return -ENOMEM; + + *a = (LinkAddress) { + .family = family, + .in_addr = *in_addr, + .in_addr_broadcast = *in_addr_broadcast, + .link = l, + .prefixlen = UCHAR_MAX, + }; + + LIST_PREPEND(addresses, l->addresses, a); + l->n_addresses++; + + if (ret) + *ret = a; + + return 0; +} + +LinkAddress *link_address_free(LinkAddress *a) { + if (!a) + return NULL; + + /* Detach from the owning link and withdraw every record this address has in the LLMNR and mDNS zone of its own family. */ if (a->link) { + LIST_REMOVE(addresses, a->link->addresses, a); + + assert(a->link->n_addresses > 0); + a->link->n_addresses--; + + if (a->llmnr_address_rr) { + if (a->family == AF_INET && a->link->llmnr_ipv4_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv4_scope->zone, a->llmnr_address_rr); + else if (a->family == AF_INET6 && a->link->llmnr_ipv6_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv6_scope->zone, a->llmnr_address_rr); + } + + if (a->llmnr_ptr_rr) { + if (a->family == AF_INET && a->link->llmnr_ipv4_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv4_scope->zone, a->llmnr_ptr_rr); + else if (a->family == AF_INET6 && a->link->llmnr_ipv6_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv6_scope->zone, a->llmnr_ptr_rr); + } + + if (a->mdns_address_rr) { + if (a->family == AF_INET && a->link->mdns_ipv4_scope) + dns_zone_remove_rr(&a->link->mdns_ipv4_scope->zone, a->mdns_address_rr); + else if (a->family == AF_INET6 && a->link->mdns_ipv6_scope) + dns_zone_remove_rr(&a->link->mdns_ipv6_scope->zone, a->mdns_address_rr); + } + +
if (a->mdns_ptr_rr) { + if (a->family == AF_INET && a->link->mdns_ipv4_scope) + dns_zone_remove_rr(&a->link->mdns_ipv4_scope->zone, a->mdns_ptr_rr); + else if (a->family == AF_INET6 && a->link->mdns_ipv6_scope) + dns_zone_remove_rr(&a->link->mdns_ipv6_scope->zone, a->mdns_ptr_rr); + } + } + + /* Drop our own references to the records, whether or not the address was still attached to a link. */ dns_resource_record_unref(a->llmnr_address_rr); + dns_resource_record_unref(a->llmnr_ptr_rr); + dns_resource_record_unref(a->mdns_address_rr); + dns_resource_record_unref(a->mdns_ptr_rr); + + return mfree(a); +} + +void link_address_add_rrs(LinkAddress *a, bool force_remove) { + int r; + + assert(a); + + /* Publishes or withdraws the address record (A or AAAA) and the PTR record of this address in the LLMNR and mDNS zones of its link. Records are published only if force_remove is false, the address is relevant for local multicast, the scope for the protocol and family exists, and the effective support level is RESOLVE_SUPPORT_YES. In every other case existing records are removed from the zone and unreferenced. Allocation failures abort the whole update and are only logged. */ if (a->family == AF_INET) { + + if (!force_remove && + link_address_relevant(a, true) && + a->link->llmnr_ipv4_scope && + link_get_llmnr_support(a->link) == RESOLVE_SUPPORT_YES) { + + /* The resource key is kept on the manager and created on first use. */ if (!a->link->manager->llmnr_host_ipv4_key) { + a->link->manager->llmnr_host_ipv4_key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_A, a->link->manager->llmnr_hostname); + if (!a->link->manager->llmnr_host_ipv4_key) { + r = -ENOMEM; + goto fail; + } + } + + if (!a->llmnr_address_rr) { + a->llmnr_address_rr = dns_resource_record_new(a->link->manager->llmnr_host_ipv4_key); + if (!a->llmnr_address_rr) { + r = -ENOMEM; + goto fail; + } + + a->llmnr_address_rr->a.in_addr = a->in_addr.in; + a->llmnr_address_rr->ttl = LLMNR_DEFAULT_TTL; + } + + if (!a->llmnr_ptr_rr) { + r = dns_resource_record_new_reverse(&a->llmnr_ptr_rr, a->family, &a->in_addr, a->link->manager->llmnr_hostname); + if (r < 0) + goto fail; + + a->llmnr_ptr_rr->ttl = LLMNR_DEFAULT_TTL; + } + + r = dns_zone_put(&a->link->llmnr_ipv4_scope->zone, a->link->llmnr_ipv4_scope, a->llmnr_address_rr, true); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add A record to LLMNR zone, ignoring: %m"); + + r = dns_zone_put(&a->link->llmnr_ipv4_scope->zone, a->link->llmnr_ipv4_scope, a->llmnr_ptr_rr, false); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add IPv4 PTR record to LLMNR zone, ignoring: %m"); + } else { + if
(a->llmnr_address_rr) { + if (a->link->llmnr_ipv4_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv4_scope->zone, a->llmnr_address_rr); + a->llmnr_address_rr = dns_resource_record_unref(a->llmnr_address_rr); + } + + if (a->llmnr_ptr_rr) { + if (a->link->llmnr_ipv4_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv4_scope->zone, a->llmnr_ptr_rr); + a->llmnr_ptr_rr = dns_resource_record_unref(a->llmnr_ptr_rr); + } + } + + /* Same publish-or-withdraw decision for the mDNS IPv4 scope. */ if (!force_remove && + link_address_relevant(a, true) && + a->link->mdns_ipv4_scope && + link_get_mdns_support(a->link) == RESOLVE_SUPPORT_YES) { + if (!a->link->manager->mdns_host_ipv4_key) { + a->link->manager->mdns_host_ipv4_key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_A, a->link->manager->mdns_hostname); + if (!a->link->manager->mdns_host_ipv4_key) { + r = -ENOMEM; + goto fail; + } + } + + if (!a->mdns_address_rr) { + a->mdns_address_rr = dns_resource_record_new(a->link->manager->mdns_host_ipv4_key); + if (!a->mdns_address_rr) { + r = -ENOMEM; + goto fail; + } + + a->mdns_address_rr->a.in_addr = a->in_addr.in; + a->mdns_address_rr->ttl = MDNS_DEFAULT_TTL; + } + + if (!a->mdns_ptr_rr) { + r = dns_resource_record_new_reverse(&a->mdns_ptr_rr, a->family, &a->in_addr, a->link->manager->mdns_hostname); + if (r < 0) + goto fail; + + a->mdns_ptr_rr->ttl = MDNS_DEFAULT_TTL; + } + + r = dns_zone_put(&a->link->mdns_ipv4_scope->zone, a->link->mdns_ipv4_scope, a->mdns_address_rr, true); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add A record to MDNS zone, ignoring: %m"); + + r = dns_zone_put(&a->link->mdns_ipv4_scope->zone, a->link->mdns_ipv4_scope, a->mdns_ptr_rr, false); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add IPv4 PTR record to MDNS zone, ignoring: %m"); + } else { + if (a->mdns_address_rr) { + if (a->link->mdns_ipv4_scope) + dns_zone_remove_rr(&a->link->mdns_ipv4_scope->zone, a->mdns_address_rr); + a->mdns_address_rr = dns_resource_record_unref(a->mdns_address_rr); + } + + if (a->mdns_ptr_rr) { + if
(a->link->mdns_ipv4_scope) + dns_zone_remove_rr(&a->link->mdns_ipv4_scope->zone, a->mdns_ptr_rr); + a->mdns_ptr_rr = dns_resource_record_unref(a->mdns_ptr_rr); + } + } + } + + /* IPv6 mirrors the IPv4 handling, using AAAA records and the IPv6 scopes. */ if (a->family == AF_INET6) { + + if (!force_remove && + link_address_relevant(a, true) && + a->link->llmnr_ipv6_scope && + link_get_llmnr_support(a->link) == RESOLVE_SUPPORT_YES) { + + if (!a->link->manager->llmnr_host_ipv6_key) { + a->link->manager->llmnr_host_ipv6_key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_AAAA, a->link->manager->llmnr_hostname); + if (!a->link->manager->llmnr_host_ipv6_key) { + r = -ENOMEM; + goto fail; + } + } + + if (!a->llmnr_address_rr) { + a->llmnr_address_rr = dns_resource_record_new(a->link->manager->llmnr_host_ipv6_key); + if (!a->llmnr_address_rr) { + r = -ENOMEM; + goto fail; + } + + a->llmnr_address_rr->aaaa.in6_addr = a->in_addr.in6; + a->llmnr_address_rr->ttl = LLMNR_DEFAULT_TTL; + } + + if (!a->llmnr_ptr_rr) { + r = dns_resource_record_new_reverse(&a->llmnr_ptr_rr, a->family, &a->in_addr, a->link->manager->llmnr_hostname); + if (r < 0) + goto fail; + + a->llmnr_ptr_rr->ttl = LLMNR_DEFAULT_TTL; + } + + r = dns_zone_put(&a->link->llmnr_ipv6_scope->zone, a->link->llmnr_ipv6_scope, a->llmnr_address_rr, true); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add AAAA record to LLMNR zone, ignoring: %m"); + + r = dns_zone_put(&a->link->llmnr_ipv6_scope->zone, a->link->llmnr_ipv6_scope, a->llmnr_ptr_rr, false); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add IPv6 PTR record to LLMNR zone, ignoring: %m"); + } else { + if (a->llmnr_address_rr) { + if (a->link->llmnr_ipv6_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv6_scope->zone, a->llmnr_address_rr); + a->llmnr_address_rr = dns_resource_record_unref(a->llmnr_address_rr); + } + + if (a->llmnr_ptr_rr) { + if (a->link->llmnr_ipv6_scope) + dns_zone_remove_rr(&a->link->llmnr_ipv6_scope->zone, a->llmnr_ptr_rr); + a->llmnr_ptr_rr = dns_resource_record_unref(a->llmnr_ptr_rr); + } + } + +
if (!force_remove && + link_address_relevant(a, true) && + a->link->mdns_ipv6_scope && + link_get_mdns_support(a->link) == RESOLVE_SUPPORT_YES) { + + if (!a->link->manager->mdns_host_ipv6_key) { + a->link->manager->mdns_host_ipv6_key = dns_resource_key_new(DNS_CLASS_IN, DNS_TYPE_AAAA, a->link->manager->mdns_hostname); + if (!a->link->manager->mdns_host_ipv6_key) { + r = -ENOMEM; + goto fail; + } + } + + if (!a->mdns_address_rr) { + a->mdns_address_rr = dns_resource_record_new(a->link->manager->mdns_host_ipv6_key); + if (!a->mdns_address_rr) { + r = -ENOMEM; + goto fail; + } + + a->mdns_address_rr->aaaa.in6_addr = a->in_addr.in6; + a->mdns_address_rr->ttl = MDNS_DEFAULT_TTL; + } + + if (!a->mdns_ptr_rr) { + r = dns_resource_record_new_reverse(&a->mdns_ptr_rr, a->family, &a->in_addr, a->link->manager->mdns_hostname); + if (r < 0) + goto fail; + + a->mdns_ptr_rr->ttl = MDNS_DEFAULT_TTL; + } + + r = dns_zone_put(&a->link->mdns_ipv6_scope->zone, a->link->mdns_ipv6_scope, a->mdns_address_rr, true); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add AAAA record to MDNS zone, ignoring: %m"); + + r = dns_zone_put(&a->link->mdns_ipv6_scope->zone, a->link->mdns_ipv6_scope, a->mdns_ptr_rr, false); + if (r < 0) + log_link_warning_errno(a->link, r, "Failed to add IPv6 PTR record to MDNS zone, ignoring: %m"); + } else { + if (a->mdns_address_rr) { + if (a->link->mdns_ipv6_scope) + dns_zone_remove_rr(&a->link->mdns_ipv6_scope->zone, a->mdns_address_rr); + a->mdns_address_rr = dns_resource_record_unref(a->mdns_address_rr); + } + + if (a->mdns_ptr_rr) { + if (a->link->mdns_ipv6_scope) + dns_zone_remove_rr(&a->link->mdns_ipv6_scope->zone, a->mdns_ptr_rr); + a->mdns_ptr_rr = dns_resource_record_unref(a->mdns_ptr_rr); + } + } + } + + return; + +fail: + /* Best effort: the failure is only logged at debug level and the remaining zones are not touched. */ log_link_debug_errno(a->link, r, "Failed to update address RRs, ignoring: %m"); +} + +int link_address_update_rtnl(LinkAddress *a, sd_netlink_message *m) { + int r; + + assert(a); + assert(m); + + r =
sd_rtnl_message_addr_get_flags(m, &a->flags); + if (r < 0) + return r; + + /* Failures to read the prefix length or the scope are deliberately ignored. */ (void) sd_rtnl_message_addr_get_prefixlen(m, &a->prefixlen); + (void) sd_rtnl_message_addr_get_scope(m, &a->scope); + + /* The changed flags or scope may alter relevance, so refresh the scopes and published records of the link. */ link_allocate_scopes(a->link); + link_add_rrs(a->link, false); + + return 0; +} + +bool link_address_relevant(LinkAddress *a, bool local_multicast) { + assert(a); + + /* Deprecated and tentative addresses are never relevant. Beyond that the scope decides: for local multicast the scope value has to be below RT_SCOPE_HOST, for everything else below RT_SCOPE_LINK. */ if (a->flags & (IFA_F_DEPRECATED|IFA_F_TENTATIVE)) + return false; + + if (a->scope >= (local_multicast ? RT_SCOPE_HOST : RT_SCOPE_LINK)) + return false; + + return true; +} + +static bool link_needs_save(Link *l) { + assert(l); + + /* Returns true if any of the settings were set differently from the default. Links managed by networkd are never saved. */ + + if (l->is_managed) + return false; + + if (l->llmnr_support != RESOLVE_SUPPORT_YES || + l->mdns_support != RESOLVE_SUPPORT_YES || + l->dnssec_mode != _DNSSEC_MODE_INVALID || + l->dns_over_tls_mode != _DNS_OVER_TLS_MODE_INVALID) + return true; + + if (l->dns_servers || + l->search_domains) + return true; + + if (!set_isempty(l->dnssec_negative_trust_anchors)) + return true; + + if (l->default_route >= 0) + return true; + + return false; +} + +int link_save_user(Link *l) { + _cleanup_(unlink_and_freep) char *temp_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + const char *v; + int r; + + assert(l); + assert(l->state_file); + + /* Nothing deviates from the defaults: remove any state file left over from earlier and report success. */ if (!link_needs_save(l)) { + (void) unlink(l->state_file); + return 0; + } + + r = mkdir_parents(l->state_file, 0700); + if (r < 0) + goto fail; + + /* Write to a temporary file first. */ r = fopen_temporary(l->state_file, &f, &temp_path); + if (r < 0) + goto fail; + + (void) fchmod(fileno(f), 0644); + + fputs("# This is private data.
Do not parse.\n", f); + + v = resolve_support_to_string(l->llmnr_support); + if (v) + fprintf(f, "LLMNR=%s\n", v); + + v = resolve_support_to_string(l->mdns_support); + if (v) + fprintf(f, "MDNS=%s\n", v); + + v = dnssec_mode_to_string(l->dnssec_mode); + if (v) + fprintf(f, "DNSSEC=%s\n", v); + + v = dns_over_tls_mode_to_string(l->dns_over_tls_mode); + if (v) + fprintf(f, "DNSOVERTLS=%s\n", v); + + if (l->default_route >= 0) + fprintf(f, "DEFAULT_ROUTE=%s\n", yes_no(l->default_route)); + + if (l->dns_servers) { + fputs("SERVERS=", f); + LIST_FOREACH(servers, server, l->dns_servers) { + + if (server != l->dns_servers) + fputc(' ', f); + + v = dns_server_string_full(server); + if (!v) { + r = -ENOMEM; + goto fail; + } + + fputs(v, f); + } + fputc('\n', f); + } + + if (l->search_domains) { + fputs("DOMAINS=", f); + LIST_FOREACH(domains, domain, l->search_domains) { + + if (domain != l->search_domains) + fputc(' ', f); + + if (domain->route_only) + fputc('~', f); + + fputs(DNS_SEARCH_DOMAIN_NAME(domain), f); + } + fputc('\n', f); + } + + if (!set_isempty(l->dnssec_negative_trust_anchors)) { + bool space = false; + char *nta; + + fputs("NTAS=", f); + SET_FOREACH(nta, l->dnssec_negative_trust_anchors) { + + if (space) + fputc(' ', f); + + fputs(nta, f); + space = true; + } + fputc('\n', f); + } + + r = fflush_and_check(f); + if (r < 0) + goto fail; + + if (rename(temp_path, l->state_file) < 0) { + r = -errno; + goto fail; + } + + temp_path = mfree(temp_path); + + return 0; + +fail: + (void) unlink(l->state_file); + + return log_link_error_errno(l, r, "Failed to save link data %s: %m", l->state_file); +} + +int link_load_user(Link *l) { + _cleanup_free_ char + *llmnr = NULL, + *mdns = NULL, + *dnssec = NULL, + *dns_over_tls = NULL, + *servers = NULL, + *domains = NULL, + *ntas = NULL, + *default_route = NULL; + + ResolveSupport s; + const char *p; + int r; + + assert(l); + assert(l->state_file); + + /* Try to load only a single time */ + if (l->loaded) + return 0; + 
l->loaded = true; + + if (l->is_managed) + return 0; /* if the device is managed, then networkd is our configuration source, not the bus API */ + + r = parse_env_file(NULL, l->state_file, + "LLMNR", &llmnr, + "MDNS", &mdns, + "DNSSEC", &dnssec, + "DNSOVERTLS", &dns_over_tls, + "SERVERS", &servers, + "DOMAINS", &domains, + "NTAS", &ntas, + "DEFAULT_ROUTE", &default_route); + /* A missing state file is not an error, there simply is nothing to load. */ if (r == -ENOENT) + return 0; + if (r < 0) + goto fail; + + /* Reset the link settings before applying what was read from the file. */ link_flush_settings(l); + + /* If we can't recognize the LLMNR or MDNS setting we don't override the default */ + s = resolve_support_from_string(llmnr); + if (s >= 0) + l->llmnr_support = s; + + s = resolve_support_from_string(mdns); + if (s >= 0) + l->mdns_support = s; + + /* An unparsable DEFAULT_ROUTE value leaves the field as it is. */ r = parse_boolean(default_route); + if (r >= 0) + l->default_route = r; + + /* If we can't recognize the DNSSEC setting, then set it to invalid, so that the daemon default is used. */ + l->dnssec_mode = dnssec_mode_from_string(dnssec); + + /* Same for DNSOverTLS */ + l->dns_over_tls_mode = dns_over_tls_mode_from_string(dns_over_tls); + + /* Servers that cannot be applied are logged and skipped. Only a tokenizer failure aborts the load. */ for (p = servers;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + goto fail; + if (r == 0) + break; + + r = link_update_dns_server_one(l, word); + if (r < 0) { + log_link_debug_errno(l, r, "Failed to load DNS server '%s', ignoring: %m", word); + continue; + } + } + + /* Search domains follow the same skip-on-error policy as the servers above. */ for (p = domains;;) { + _cleanup_free_ char *word = NULL; + const char *n; + bool is_route; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + goto fail; + if (r == 0) + break; + + /* A leading tilde marks a route-only domain, matching what link_save_user() writes. */ is_route = word[0] == '~'; + n = is_route ?
word + 1 : word; + + r = link_update_search_domain_one(l, n, is_route); + if (r < 0) { + log_link_debug_errno(l, r, "Failed to load search domain '%s', ignoring: %m", word); + continue; + } + } + + if (ntas) { + _cleanup_set_free_free_ Set *ns = NULL; + + ns = set_new(&dns_name_hash_ops); + if (!ns) { + r = -ENOMEM; + goto fail; + } + + r = set_put_strsplit(ns, ntas, NULL, 0); + if (r < 0) + goto fail; + + l->dnssec_negative_trust_anchors = TAKE_PTR(ns); + } + + return 0; + +fail: + return log_link_error_errno(l, r, "Failed to load link data %s: %m", l->state_file); +} + +void link_remove_user(Link *l) { + assert(l); + assert(l->state_file); + + /* Removes the saved state file of the link. A failure to unlink is ignored. */ (void) unlink(l->state_file); +} + +bool link_negative_trust_anchor_lookup(Link *l, const char *name) { + int r; + + assert(l); + assert(name); + + /* Checks whether the specified domain (or any of its parent domains) are listed as per-link NTA. Returns true on a match, and false if neither the name nor any parent is listed or if the name cannot be walked up to its parent. */ + + for (;;) { + if (set_contains(l->dnssec_negative_trust_anchors, name)) + return true; + + /* And now, let's look at the parent, and check that too */ + r = dns_name_parent(&name); + if (r < 0) + return false; /* This function returns bool: handing back the negative errno would be converted to true and wrongly report a match. */ + if (r == 0) + break; + } + + return false; +} diff --git a/src/resolve/resolved-link.h b/src/resolve/resolved-link.h new file mode 100644 index 0000000..0695a6f --- /dev/null +++ b/src/resolve/resolved-link.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-netlink.h" + +#include "in-addr-util.h" +#include "network-util.h" +#include "ratelimit.h" +#include "resolve-util.h" + +typedef struct Link Link; +typedef struct LinkAddress LinkAddress; + +#include "resolved-dns-rr.h" +#include "resolved-dns-scope.h" +#include "resolved-dns-search-domain.h" +#include "resolved-dns-server.h" + +#define LINK_SEARCH_DOMAINS_MAX 256 +#define LINK_DNS_SERVERS_MAX 256 + +struct LinkAddress { + Link *link; + + int family; + union in_addr_union in_addr; + union in_addr_union in_addr_broadcast; + unsigned char prefixlen; + + unsigned
char flags, scope; + + DnsResourceRecord *llmnr_address_rr; + DnsResourceRecord *llmnr_ptr_rr; + DnsResourceRecord *mdns_address_rr; + DnsResourceRecord *mdns_ptr_rr; + + LIST_FIELDS(LinkAddress, addresses); /* linkage for the addresses list headed in struct Link */ +}; +/* Per-link state kept by the resolver: addresses, DNS servers, search domains, per-protocol settings and DNS scopes. */ +struct Link { + Manager *manager; + + int ifindex; /* key under which the link is looked up in the links hashmap of the manager */ + unsigned flags; + + LIST_HEAD(LinkAddress, addresses); + unsigned n_addresses; + + LIST_HEAD(DnsServer, dns_servers); + DnsServer *current_dns_server; + unsigned n_dns_servers; + + LIST_HEAD(DnsSearchDomain, search_domains); + unsigned n_search_domains; + + int default_route; + + ResolveSupport llmnr_support; + ResolveSupport mdns_support; + DnsOverTlsMode dns_over_tls_mode; + DnssecMode dnssec_mode; + Set *dnssec_negative_trust_anchors; /* set of domain names, consulted by link_negative_trust_anchor_lookup() */ + + DnsScope *unicast_scope; + DnsScope *llmnr_ipv4_scope; + DnsScope *llmnr_ipv6_scope; + DnsScope *mdns_ipv4_scope; + DnsScope *mdns_ipv6_scope; + + struct stat networkd_state_file_stat; + LinkOperationalState networkd_operstate; + bool is_managed; + + char *ifname; + uint32_t mtu; + uint8_t operstate; + + bool loaded; + char *state_file; /* path that link_remove_user() unlinks */ + + bool unicast_relevant; +}; + +int link_new(Manager *m, Link **ret, int ifindex); +Link *link_free(Link *l); +int link_process_rtnl(Link *l, sd_netlink_message *m); +int link_update(Link *l); +bool link_relevant(Link *l, int family, bool local_multicast); +LinkAddress* link_find_address(Link *l, int family, const union in_addr_union *in_addr); +void link_add_rrs(Link *l, bool force_remove); + +void link_flush_settings(Link *l); +void link_set_dnssec_mode(Link *l, DnssecMode mode); +void link_set_dns_over_tls_mode(Link *l, DnsOverTlsMode mode); +void link_allocate_scopes(Link *l); + +DnsServer* link_set_dns_server(Link *l, DnsServer *s); +DnsServer* link_get_dns_server(Link *l); +void link_next_dns_server(Link *l, DnsServer *if_current); + +DnssecMode link_get_dnssec_mode(Link *l); +bool link_dnssec_supported(Link *l); + +DnsOverTlsMode link_get_dns_over_tls_mode(Link *l); + +ResolveSupport link_get_llmnr_support(Link *link);
+ResolveSupport link_get_mdns_support(Link *link); + +int link_save_user(Link *l); +int link_load_user(Link *l); +void link_remove_user(Link *l); + +int link_address_new(Link *l, + LinkAddress **ret, + int family, + const union in_addr_union *in_addr, + const union in_addr_union *in_addr_broadcast); +LinkAddress *link_address_free(LinkAddress *a); +int link_address_update_rtnl(LinkAddress *a, sd_netlink_message *m); +bool link_address_relevant(LinkAddress *l, bool local_multicast); +void link_address_add_rrs(LinkAddress *a, bool force_remove); +/* Looks up name and each of its parent domains in the negative trust anchor set of the link. */ +bool link_negative_trust_anchor_lookup(Link *l, const char *name); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_free); diff --git a/src/resolve/resolved-llmnr.c b/src/resolve/resolved-llmnr.c new file mode 100644 index 0000000..9469bda --- /dev/null +++ b/src/resolve/resolved-llmnr.c @@ -0,0 +1,471 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "errno-util.h" +#include "fd-util.h" +#include "resolved-llmnr.h" +#include "resolved-manager.h" +/* Tears down all four LLMNR listeners (IPv4 and IPv6, UDP and TCP): drops each event source and closes its socket. */ +void manager_llmnr_stop(Manager *m) { + assert(m); + + m->llmnr_ipv4_udp_event_source = sd_event_source_disable_unref(m->llmnr_ipv4_udp_event_source); + m->llmnr_ipv4_udp_fd = safe_close(m->llmnr_ipv4_udp_fd); + + m->llmnr_ipv6_udp_event_source = sd_event_source_disable_unref(m->llmnr_ipv6_udp_event_source); + m->llmnr_ipv6_udp_fd = safe_close(m->llmnr_ipv6_udp_fd); + + m->llmnr_ipv4_tcp_event_source = sd_event_source_disable_unref(m->llmnr_ipv4_tcp_event_source); + m->llmnr_ipv4_tcp_fd = safe_close(m->llmnr_ipv4_tcp_fd); + + m->llmnr_ipv6_tcp_event_source = sd_event_source_disable_unref(m->llmnr_ipv6_tcp_event_source); + m->llmnr_ipv6_tcp_fd = safe_close(m->llmnr_ipv6_tcp_fd); +} +/* Opens the LLMNR listening sockets unless LLMNR is disabled. If a socket reports -EADDRINUSE, LLMNR is switched off, the listeners are stopped and 0 is returned; any other error is returned to the caller. */ +int manager_llmnr_start(Manager *m) { + int r; + + assert(m); + + if (m->llmnr_support == RESOLVE_SUPPORT_NO) + return 0; /* nothing to open when LLMNR is turned off */ + + r = manager_llmnr_ipv4_udp_fd(m); + if (r == -EADDRINUSE) + goto eaddrinuse; + if (r < 0) + return r; + + r =
manager_llmnr_ipv4_tcp_fd(m); + if (r == -EADDRINUSE) + goto eaddrinuse; + if (r < 0) + return r; + + if (socket_ipv6_is_enabled()) { + r = manager_llmnr_ipv6_udp_fd(m); + if (r == -EADDRINUSE) + goto eaddrinuse; + if (r < 0) + return r; + + r = manager_llmnr_ipv6_tcp_fd(m); + if (r == -EADDRINUSE) + goto eaddrinuse; + if (r < 0) + return r; + } + + return 0; + +eaddrinuse: + log_warning("Another LLMNR responder prohibits binding the socket to the same port. Turning off LLMNR support."); + m->llmnr_support = RESOLVE_SUPPORT_NO; + manager_llmnr_stop(m); + + return 0; +} + +static int on_llmnr_packet(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + DnsTransaction *t = NULL; + Manager *m = ASSERT_PTR(userdata); + DnsScope *scope; + int r; + + assert(s); + assert(fd >= 0); + + r = manager_recv(m, fd, DNS_PROTOCOL_LLMNR, &p); + if (r <= 0) + return r; + + if (manager_packet_from_local_address(m, p)) + return 0; + + scope = manager_find_scope(m, p); + if (!scope) { + log_debug("Got LLMNR UDP packet on unknown scope. 
Ignoring."); + return 0; + } + + if (dns_packet_validate_reply(p) > 0) { + log_debug("Got LLMNR UDP reply packet for id %u", DNS_PACKET_ID(p)); + + dns_scope_check_conflicts(scope, p); + + t = hashmap_get(m->dns_transactions, UINT_TO_PTR(DNS_PACKET_ID(p))); + if (t) + dns_transaction_process_reply(t, p, false); + + } else if (dns_packet_validate_query(p) > 0) { + log_debug("Got LLMNR UDP query packet for id %u", DNS_PACKET_ID(p)); + + dns_scope_process_query(scope, NULL, p); + } else + log_debug("Invalid LLMNR UDP packet, ignoring."); + + return 0; +} + +static int set_llmnr_common_socket_options(int fd, int family) { + int r; + + r = socket_set_recvpktinfo(fd, family, true); + if (r < 0) + return r; + + r = socket_set_recvttl(fd, family, true); + if (r < 0) + return r; + + return 0; +} + +static int set_llmnr_common_udp_socket_options(int fd, int family) { + int r; + + /* RFC 4795, section 2.5 recommends setting the TTL of UDP packets to 255. */ + r = socket_set_ttl(fd, family, 255); + if (r < 0) + return r; + + return 0; +} + +int manager_llmnr_ipv4_udp_fd(Manager *m) { + union sockaddr_union sa = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(LLMNR_PORT), + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(m); + + if (m->llmnr_ipv4_udp_fd >= 0) + return m->llmnr_ipv4_udp_fd; + + s = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s < 0) + return log_error_errno(errno, "LLMNR-IPv4(UDP): Failed to create socket: %m"); + + r = set_llmnr_common_socket_options(s, AF_INET); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set common socket options: %m"); + + r = set_llmnr_common_udp_socket_options(s, AF_INET); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set common UDP socket options: %m"); + + r = setsockopt_int(s, IPPROTO_IP, IP_MULTICAST_TTL, 255); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set IP_MULTICAST_TTL: %m"); + + r = setsockopt_int(s, IPPROTO_IP, 
IP_MULTICAST_LOOP, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set IP_MULTICAST_LOOP: %m"); + + /* Disable Don't-Fragment bit in the IP header */ + r = setsockopt_int(s, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_DONT); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set IP_MTU_DISCOVER: %m"); + + /* first try to bind without SO_REUSEADDR to detect another LLMNR responder */ + r = bind(s, &sa.sa, sizeof(sa.in)); + if (r < 0) { + if (errno != EADDRINUSE) + return log_error_errno(errno, "LLMNR-IPv4(UDP): Failed to bind socket: %m"); + + log_warning("LLMNR-IPv4(UDP): There appears to be another LLMNR responder running, or previously systemd-resolved crashed with some outstanding transfers."); + + /* try again with SO_REUSEADDR */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set SO_REUSEADDR: %m"); + + r = bind(s, &sa.sa, sizeof(sa.in)); + if (r < 0) + return log_error_errno(errno, "LLMNR-IPv4(UDP): Failed to bind socket: %m"); + } else { + /* enable SO_REUSEADDR for the case that the user really wants multiple LLMNR responders */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to set SO_REUSEADDR: %m"); + } + + r = sd_event_add_io(m->event, &m->llmnr_ipv4_udp_event_source, s, EPOLLIN, on_llmnr_packet, m); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(UDP): Failed to create event source: %m"); + + (void) sd_event_source_set_description(m->llmnr_ipv4_udp_event_source, "llmnr-ipv4-udp"); + + return m->llmnr_ipv4_udp_fd = TAKE_FD(s); +} + +int manager_llmnr_ipv6_udp_fd(Manager *m) { + union sockaddr_union sa = { + .in6.sin6_family = AF_INET6, + .in6.sin6_port = htobe16(LLMNR_PORT), + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(m); + + if (m->llmnr_ipv6_udp_fd >= 0) + return m->llmnr_ipv6_udp_fd; + + s = socket(AF_INET6, 
SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s < 0) + return log_error_errno(errno, "LLMNR-IPv6(UDP): Failed to create socket: %m"); + + r = set_llmnr_common_socket_options(s, AF_INET6); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set common socket options: %m"); + + r = set_llmnr_common_udp_socket_options(s, AF_INET6); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set common UDP socket options: %m"); + + /* RFC 4795, section 2.5 recommends setting the TTL of UDP packets to 255. */ + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, 255); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set IPV6_MULTICAST_HOPS: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set IPV6_MULTICAST_LOOP: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_V6ONLY, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set IPV6_V6ONLY: %m"); + + /* first try to bind without SO_REUSEADDR to detect another LLMNR responder */ + r = bind(s, &sa.sa, sizeof(sa.in6)); + if (r < 0) { + if (errno != EADDRINUSE) + return log_error_errno(errno, "LLMNR-IPv6(UDP): Failed to bind socket: %m"); + + log_warning("LLMNR-IPv6(UDP): There appears to be another LLMNR responder running, or previously systemd-resolved crashed with some outstanding transfers."); + + /* try again with SO_REUSEADDR */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set SO_REUSEADDR: %m"); + + r = bind(s, &sa.sa, sizeof(sa.in6)); + if (r < 0) + return log_error_errno(errno, "LLMNR-IPv6(UDP): Failed to bind socket: %m"); + } else { + /* enable SO_REUSEADDR for the case that the user really wants multiple LLMNR responders */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to set SO_REUSEADDR: 
%m"); + } + + r = sd_event_add_io(m->event, &m->llmnr_ipv6_udp_event_source, s, EPOLLIN, on_llmnr_packet, m); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(UDP): Failed to create event source: %m"); + + (void) sd_event_source_set_description(m->llmnr_ipv6_udp_event_source, "llmnr-ipv6-udp"); + + return m->llmnr_ipv6_udp_fd = TAKE_FD(s); +} + +static int on_llmnr_stream_packet(DnsStream *s, DnsPacket *p) { + DnsScope *scope; + + assert(s); + assert(s->manager); + assert(p); + + scope = manager_find_scope(s->manager, p); + if (!scope) + log_debug("Got LLMNR TCP packet on unknown scope. Ignoring."); + else if (dns_packet_validate_query(p) > 0) { + log_debug("Got LLMNR TCP query packet for id %u", DNS_PACKET_ID(p)); + + dns_scope_process_query(scope, s, p); + } else + log_debug("Invalid LLMNR TCP packet, ignoring."); + + return 0; +} + +static int on_llmnr_stream(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + DnsStream *stream; + Manager *m = userdata; + int cfd, r; + + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (cfd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return -errno; + } + + /* We don't configure a "complete" handler here, we rely on the default handler, thus freeing it */ + r = dns_stream_new(m, &stream, DNS_STREAM_LLMNR_RECV, DNS_PROTOCOL_LLMNR, cfd, NULL, + on_llmnr_stream_packet, NULL, DNS_STREAM_DEFAULT_TIMEOUT_USEC); + if (r < 0) { + safe_close(cfd); + return r; + } + + return 0; +} + +static int set_llmnr_common_tcp_socket_options(int fd, int family) { + int r; + + /* RFC 4795, section 2.5. requires setting the TTL of TCP streams to 1 */ + r = socket_set_ttl(fd, family, 1); + if (r < 0) + return r; + + r = setsockopt_int(fd, IPPROTO_TCP, TCP_FASTOPEN, 5); /* Everybody appears to pick qlen=5, let's do the same here. 
*/ + if (r < 0) + log_debug_errno(r, "Failed to enable TCP_FASTOPEN on TCP listening socket, ignoring: %m"); + + r = setsockopt_int(fd, IPPROTO_TCP, TCP_NODELAY, true); + if (r < 0) + log_debug_errno(r, "Failed to enable TCP_NODELAY mode, ignoring: %m"); + + return 0; +} + +int manager_llmnr_ipv4_tcp_fd(Manager *m) { + union sockaddr_union sa = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(LLMNR_PORT), + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(m); + + if (m->llmnr_ipv4_tcp_fd >= 0) + return m->llmnr_ipv4_tcp_fd; + + s = socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s < 0) + return log_error_errno(errno, "LLMNR-IPv4(TCP): Failed to create socket: %m"); + + r = set_llmnr_common_socket_options(s, AF_INET); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(TCP): Failed to set common socket options: %m"); + + r = set_llmnr_common_tcp_socket_options(s, AF_INET); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(TCP): Failed to set common TCP socket options: %m"); + + /* Disable Don't-Fragment bit in the IP header */ + r = setsockopt_int(s, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_DONT); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(TCP): Failed to set IP_MTU_DISCOVER: %m"); + + /* first try to bind without SO_REUSEADDR to detect another LLMNR responder */ + r = bind(s, &sa.sa, sizeof(sa.in)); + if (r < 0) { + if (errno != EADDRINUSE) + return log_error_errno(errno, "LLMNR-IPv4(TCP): Failed to bind socket: %m"); + + log_warning("LLMNR-IPv4(TCP): There appears to be another LLMNR responder running, or previously systemd-resolved crashed with some outstanding transfers."); + + /* try again with SO_REUSEADDR */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(TCP): Failed to set SO_REUSEADDR: %m"); + + r = bind(s, &sa.sa, sizeof(sa.in)); + if (r < 0) + return log_error_errno(errno, "LLMNR-IPv4(TCP): Failed to bind socket: %m"); + } else { + /* enable 
SO_REUSEADDR for the case that the user really wants multiple LLMNR responders */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(TCP): Failed to set SO_REUSEADDR: %m"); + } + + r = listen(s, SOMAXCONN_DELUXE); + if (r < 0) + return log_error_errno(errno, "LLMNR-IPv4(TCP): Failed to listen the stream: %m"); + + r = sd_event_add_io(m->event, &m->llmnr_ipv4_tcp_event_source, s, EPOLLIN, on_llmnr_stream, m); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv4(TCP): Failed to create event source: %m"); + + (void) sd_event_source_set_description(m->llmnr_ipv4_tcp_event_source, "llmnr-ipv4-tcp"); + + return m->llmnr_ipv4_tcp_fd = TAKE_FD(s); +} + +int manager_llmnr_ipv6_tcp_fd(Manager *m) { + union sockaddr_union sa = { + .in6.sin6_family = AF_INET6, + .in6.sin6_port = htobe16(LLMNR_PORT), + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(m); + + if (m->llmnr_ipv6_tcp_fd >= 0) + return m->llmnr_ipv6_tcp_fd; + + s = socket(AF_INET6, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s < 0) + return log_error_errno(errno, "LLMNR-IPv6(TCP): Failed to create socket: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_V6ONLY, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(TCP): Failed to set IPV6_V6ONLY: %m"); + + r = set_llmnr_common_socket_options(s, AF_INET6); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(TCP): Failed to set common socket options: %m"); + + r = set_llmnr_common_tcp_socket_options(s, AF_INET6); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(TCP): Failed to set common TCP socket options: %m"); + + /* first try to bind without SO_REUSEADDR to detect another LLMNR responder */ + r = bind(s, &sa.sa, sizeof(sa.in6)); + if (r < 0) { + if (errno != EADDRINUSE) + return log_error_errno(errno, "LLMNR-IPv6(TCP): Failed to bind socket: %m"); + + log_warning("LLMNR-IPv6(TCP): There appears to be another LLMNR responder running, or previously systemd-resolved crashed with 
some outstanding transfers."); + + /* try again with SO_REUSEADDR */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(TCP): Failed to set SO_REUSEADDR: %m"); + + r = bind(s, &sa.sa, sizeof(sa.in6)); + if (r < 0) + return log_error_errno(errno, "LLMNR-IPv6(TCP): Failed to bind socket: %m"); + } else { + /* enable SO_REUSEADDR for the case that the user really wants multiple LLMNR responders */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(TCP): Failed to set SO_REUSEADDR: %m"); + } + + r = listen(s, SOMAXCONN_DELUXE); + if (r < 0) + return log_error_errno(errno, "LLMNR-IPv6(TCP): Failed to listen the stream: %m"); + + r = sd_event_add_io(m->event, &m->llmnr_ipv6_tcp_event_source, s, EPOLLIN, on_llmnr_stream, m); + if (r < 0) + return log_error_errno(r, "LLMNR-IPv6(TCP): Failed to create event source: %m"); + + (void) sd_event_source_set_description(m->llmnr_ipv6_tcp_event_source, "llmnr-ipv6-tcp"); + + return m->llmnr_ipv6_tcp_fd = TAKE_FD(s); +} diff --git a/src/resolve/resolved-llmnr.h b/src/resolve/resolved-llmnr.h new file mode 100644 index 0000000..4cdd260 --- /dev/null +++ b/src/resolve/resolved-llmnr.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "resolved-manager.h" + +#define LLMNR_PORT 5355 + +int manager_llmnr_ipv4_udp_fd(Manager *m); +int manager_llmnr_ipv6_udp_fd(Manager *m); +int manager_llmnr_ipv4_tcp_fd(Manager *m); +int manager_llmnr_ipv6_tcp_fd(Manager *m); + +void manager_llmnr_stop(Manager *m); +int manager_llmnr_start(Manager *m); diff --git a/src/resolve/resolved-manager.c b/src/resolve/resolved-manager.c new file mode 100644 index 0000000..b52619e --- /dev/null +++ b/src/resolve/resolved-manager.c @@ -0,0 +1,1860 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "af-list.h" +#include 
"alloc-util.h" +#include "bus-polkit.h" +#include "dirent-util.h" +#include "dns-domain.h" +#include "event-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "hostname-util.h" +#include "idn-util.h" +#include "io-util.h" +#include "iovec-util.h" +#include "memstream-util.h" +#include "missing_network.h" +#include "missing_socket.h" +#include "netlink-util.h" +#include "ordered-set.h" +#include "parse-util.h" +#include "random-util.h" +#include "resolved-bus.h" +#include "resolved-conf.h" +#include "resolved-dns-stub.h" +#include "resolved-dnssd.h" +#include "resolved-etc-hosts.h" +#include "resolved-llmnr.h" +#include "resolved-manager.h" +#include "resolved-mdns.h" +#include "resolved-resolv-conf.h" +#include "resolved-util.h" +#include "resolved-varlink.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "utf8.h" + +#define SEND_TIMEOUT_USEC (200 * USEC_PER_MSEC) + +static int manager_process_link(sd_netlink *rtnl, sd_netlink_message *mm, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + uint16_t type; + Link *l; + int ifindex, r; + + assert(rtnl); + assert(mm); + + r = sd_netlink_message_get_type(mm, &type); + if (r < 0) + goto fail; + + r = sd_rtnl_message_link_get_ifindex(mm, &ifindex); + if (r < 0) + goto fail; + + l = hashmap_get(m->links, INT_TO_PTR(ifindex)); + + switch (type) { + + case RTM_NEWLINK:{ + bool is_new = !l; + + if (!l) { + r = link_new(m, &l, ifindex); + if (r < 0) + goto fail; + } + + r = link_process_rtnl(l, mm); + if (r < 0) + goto fail; + + r = link_update(l); + if (r < 0) + goto fail; + + if (is_new) + log_debug("Found new link %i/%s", ifindex, l->ifname); + + break; + } + + case RTM_DELLINK: + if (l) { + log_debug("Removing link %i/%s", l->ifindex, l->ifname); + link_remove_user(l); + link_free(l); + } + + break; + } + + return 0; + +fail: + log_warning_errno(r, "Failed to process RTNL link message: %m"); + return 0; +} + +static int manager_process_address(sd_netlink *rtnl, 
sd_netlink_message *mm, void *userdata) { /* rtnetlink callback for RTM_NEWADDR and RTM_DELADDR: creates or refreshes, respectively frees, the LinkAddress matching the message. Messages for unknown links or other address families are ignored. Every failure is logged and swallowed, so this always returns 0. */ + Manager *m = ASSERT_PTR(userdata); + union in_addr_union address, broadcast = {}; + uint16_t type; + int r, ifindex, family; + LinkAddress *a; + Link *l; + + assert(rtnl); + assert(mm); + + r = sd_netlink_message_get_type(mm, &type); + if (r < 0) + goto fail; + + r = sd_rtnl_message_addr_get_ifindex(mm, &ifindex); + if (r < 0) + goto fail; + + l = hashmap_get(m->links, INT_TO_PTR(ifindex)); + if (!l) + return 0; + + r = sd_rtnl_message_addr_get_family(mm, &family); + if (r < 0) + goto fail; + + switch (family) { + + case AF_INET: + (void) sd_netlink_message_read_in_addr(mm, IFA_BROADCAST, &broadcast.in); /* best effort: on failure broadcast keeps its zero initializer */ + r = sd_netlink_message_read_in_addr(mm, IFA_LOCAL, &address.in); + if (r < 0) { + r = sd_netlink_message_read_in_addr(mm, IFA_ADDRESS, &address.in); + if (r < 0) + goto fail; + } + + break; + + case AF_INET6: + r = sd_netlink_message_read_in6_addr(mm, IFA_LOCAL, &address.in6); + if (r < 0) { + r = sd_netlink_message_read_in6_addr(mm, IFA_ADDRESS, &address.in6); + if (r < 0) + goto fail; + } + + break; + + default: + return 0; + } + + a = link_find_address(l, family, &address); + + switch (type) { + + case RTM_NEWADDR: + + if (!a) { + r = link_address_new(l, &a, family, &address, &broadcast); + if (r < 0) + goto fail; /* log and ignore like every other failure here, so one bad address cannot abort the dump loop in manager_rtnl_listen() */ + } + + r = link_address_update_rtnl(a, mm); + if (r < 0) + goto fail; + + break; + + case RTM_DELADDR: + link_address_free(a); + break; + } + + return 0; + +fail: + log_warning_errno(r, "Failed to process RTNL address message: %m"); + return 0; +} +/* Opens the rtnetlink connection, subscribes to link and address add/remove notifications, then enumerates the existing links and addresses through the same handlers. */ +static int manager_rtnl_listen(Manager *m) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + int r; + + assert(m); + + /* First, subscribe to interfaces coming and going */ + r = sd_netlink_open(&m->rtnl); + if (r < 0) + return r; + + r = sd_netlink_attach_event(m->rtnl, m->event, SD_EVENT_PRIORITY_IMPORTANT); + if (r < 0) + return r; + + r = sd_netlink_add_match(m->rtnl, NULL, RTM_NEWLINK, manager_process_link, NULL, m,
"resolve-NEWLINK"); + if (r < 0) + return r; + + r = sd_netlink_add_match(m->rtnl, NULL, RTM_DELLINK, manager_process_link, NULL, m, "resolve-DELLINK"); + if (r < 0) + return r; + + r = sd_netlink_add_match(m->rtnl, NULL, RTM_NEWADDR, manager_process_address, NULL, m, "resolve-NEWADDR"); + if (r < 0) + return r; + + r = sd_netlink_add_match(m->rtnl, NULL, RTM_DELADDR, manager_process_address, NULL, m, "resolve-DELADDR"); + if (r < 0) + return r; + + /* Then, enumerate all links */ + r = sd_rtnl_message_new_link(m->rtnl, &req, RTM_GETLINK, 0); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(m->rtnl, req, 0, &reply); + if (r < 0) + return r; + + for (sd_netlink_message *i = reply; i; i = sd_netlink_message_next(i)) { + r = manager_process_link(m->rtnl, i, m); + if (r < 0) + return r; + } + + req = sd_netlink_message_unref(req); + reply = sd_netlink_message_unref(reply); + + /* Finally, enumerate all addresses, too */ + r = sd_rtnl_message_new_addr(m->rtnl, &req, RTM_GETADDR, 0, AF_UNSPEC); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(m->rtnl, req, 0, &reply); + if (r < 0) + return r; + + for (sd_netlink_message *i = reply; i; i = sd_netlink_message_next(i)) { + r = manager_process_address(m->rtnl, i, m); + if (r < 0) + return r; + } + + return r; +} + +static int on_network_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + Link *l; + int r; + + sd_network_monitor_flush(m->network_monitor); + + HASHMAP_FOREACH(l, m->links) { + r = link_update(l); + if (r < 0) + log_warning_errno(r, "Failed to update monitor information for %i: %m", l->ifindex); + } + + (void) manager_write_resolv_conf(m); + (void) manager_send_changed(m, "DNS"); + + return 0; +} + +static int manager_network_monitor_listen(Manager *m) { + int r, fd, events; + + assert(m); + + r = 
sd_network_monitor_new(&m->network_monitor, NULL); + if (r < 0) + return r; + + fd = sd_network_monitor_get_fd(m->network_monitor); + if (fd < 0) + return fd; + + events = sd_network_monitor_get_events(m->network_monitor); + if (events < 0) + return events; + + r = sd_event_add_io(m->event, &m->network_event_source, fd, events, &on_network_event, m); + if (r < 0) + return r; + + r = sd_event_source_set_priority(m->network_event_source, SD_EVENT_PRIORITY_IMPORTANT+5); + if (r < 0) + return r; + + (void) sd_event_source_set_description(m->network_event_source, "network-monitor"); + + return 0; +} + +static int manager_clock_change_listen(Manager *m); + +static int on_clock_change(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + /* The clock has changed, let's flush all caches. Why that? That's because DNSSEC validation takes + * the system clock into consideration, and if the clock changes the old validations might have been + * wrong. Let's redo all validation with the new, correct time. + * + * (Also, this is triggered after system suspend, which is also a good reason to drop caches, since + * we might be connected to a different network now without this being visible in a dropped link + * carrier or so.) */ + + log_info("Clock change detected. Flushing caches."); + manager_flush_caches(m, LOG_DEBUG /* downgrade the functions own log message, since we already logged here at LOG_INFO level */); + + /* The clock change timerfd is unusable after it triggered once, create a new one. 
*/ + return manager_clock_change_listen(m); +} + +static int manager_clock_change_listen(Manager *m) { + int r; + + assert(m); + + m->clock_change_event_source = sd_event_source_disable_unref(m->clock_change_event_source); + + r = event_add_time_change(m->event, &m->clock_change_event_source, on_clock_change, m); + if (r < 0) + return log_error_errno(r, "Failed to create clock change event source: %m"); + + return 0; +} + +static int determine_hostnames(char **full_hostname, char **llmnr_hostname, char **mdns_hostname) { + _cleanup_free_ char *h = NULL, *n = NULL; + int r; + + assert(full_hostname); + assert(llmnr_hostname); + assert(mdns_hostname); + + r = resolve_system_hostname(&h, &n); + if (r < 0) + return r; + + r = dns_name_concat(n, "local", 0, mdns_hostname); + if (r < 0) + return log_error_errno(r, "Failed to determine mDNS hostname: %m"); + + *llmnr_hostname = TAKE_PTR(n); + *full_hostname = TAKE_PTR(h); + + return 0; +} + +static char* fallback_hostname(void) { + + /* Determine the fall back hostname. For exposing this system to the outside world, we cannot have it + * to be "localhost" even if that's the default hostname. In this case, let's revert to "linux" + * instead. 
*/ + + _cleanup_free_ char *n = get_default_hostname(); + if (!n) + return NULL; + + if (is_localhost(n)) + return strdup("linux"); + + return TAKE_PTR(n); +} + +static int make_fallback_hostnames(char **full_hostname, char **llmnr_hostname, char **mdns_hostname) { + _cleanup_free_ char *h = NULL, *n = NULL, *m = NULL; + char label[DNS_LABEL_MAX]; + const char *p; + int r; + + assert(full_hostname); + assert(llmnr_hostname); + assert(mdns_hostname); + + p = h = fallback_hostname(); + if (!h) + return log_oom(); + + r = dns_label_unescape(&p, label, sizeof label, 0); + if (r < 0) + return log_error_errno(r, "Failed to unescape fallback hostname: %m"); + + assert(r > 0); /* The fallback hostname must have at least one label */ + + r = dns_label_escape_new(label, r, &n); + if (r < 0) + return log_error_errno(r, "Failed to escape fallback hostname: %m"); + + r = dns_name_concat(n, "local", 0, &m); + if (r < 0) + return log_error_errno(r, "Failed to concatenate mDNS hostname: %m"); + + *llmnr_hostname = TAKE_PTR(n); + *mdns_hostname = TAKE_PTR(m); + *full_hostname = TAKE_PTR(h); + + return 0; +} + +static int on_hostname_change(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + _cleanup_free_ char *full_hostname = NULL, *llmnr_hostname = NULL, *mdns_hostname = NULL; + Manager *m = ASSERT_PTR(userdata); + bool llmnr_hostname_changed; + int r; + + r = determine_hostnames(&full_hostname, &llmnr_hostname, &mdns_hostname); + if (r < 0) { + log_warning_errno(r, "Failed to determine the local hostname and LLMNR/mDNS names, ignoring: %m"); + return 0; /* ignore invalid hostnames */ + } + + llmnr_hostname_changed = !streq(llmnr_hostname, m->llmnr_hostname); + if (streq(full_hostname, m->full_hostname) && + !llmnr_hostname_changed && + streq(mdns_hostname, m->mdns_hostname)) + return 0; + + log_info("System hostname changed to '%s'.", full_hostname); + + free_and_replace(m->full_hostname, full_hostname); + free_and_replace(m->llmnr_hostname, llmnr_hostname); + 
free_and_replace(m->mdns_hostname, mdns_hostname); + + manager_refresh_rrs(m); + (void) manager_send_changed(m, "LLMNRHostname"); + + return 0; +} +/* Opens /proc/sys/kernel/hostname, registers on_hostname_change() as its I/O handler and initializes the full, LLMNR and mDNS hostnames of the manager, using make_fallback_hostnames() if determine_hostnames() fails. */ +static int manager_watch_hostname(Manager *m) { + int r; + + assert(m); + + m->hostname_fd = open("/proc/sys/kernel/hostname", + O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (m->hostname_fd < 0) { + log_warning_errno(errno, "Failed to watch hostname: %m"); + return 0; /* NOTE(review): reports success before any hostname field is set - confirm later users tolerate that */ + } + + r = sd_event_add_io(m->event, &m->hostname_event_source, m->hostname_fd, 0, on_hostname_change, m); + if (r < 0) { + if (r == -EPERM) + /* kernels prior to 3.2 don't support polling this file. Ignore the failure. */ + m->hostname_fd = safe_close(m->hostname_fd); + else + return log_error_errno(r, "Failed to add hostname event source: %m"); + } + + (void) sd_event_source_set_description(m->hostname_event_source, "hostname"); + + r = determine_hostnames(&m->full_hostname, &m->llmnr_hostname, &m->mdns_hostname); + if (r < 0) { + _cleanup_free_ char *d = NULL; + + d = fallback_hostname(); /* fetched here only for the log message below */ + if (!d) + return log_oom(); + + log_info("Defaulting to hostname '%s'.", d); + + r = make_fallback_hostnames(&m->full_hostname, &m->llmnr_hostname, &m->mdns_hostname); + if (r < 0) + return r; + } else + log_info("Using system hostname '%s'.", m->full_hostname); + + return 0; +} +/* SIGUSR1 handler: dumps every DNS scope and every global, fallback and per-link DNS server into a memory stream and logs the result at LOG_INFO. */ +static int manager_sigusr1(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + _cleanup_(memstream_done) MemStream ms = {}; + Manager *m = ASSERT_PTR(userdata); + Link *l; + FILE *f; + + assert(s); + assert(si); + + f = memstream_init(&ms); + if (!f) + return log_oom(); + + LIST_FOREACH(scopes, scope, m->dns_scopes) + dns_scope_dump(scope, f); + + LIST_FOREACH(servers, server, m->dns_servers) + dns_server_dump(server, f); + LIST_FOREACH(servers, server, m->fallback_dns_servers) + dns_server_dump(server, f); + HASHMAP_FOREACH(l, m->links) + LIST_FOREACH(servers, server, l->dns_servers) + dns_server_dump(server, f); + + return memstream_dump(LOG_INFO, &ms); +} +
+static int manager_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { /* SIGUSR2 handler: flushes all caches, logging at LOG_INFO. */ + Manager *m = ASSERT_PTR(userdata); + + assert(s); + assert(si); + + manager_flush_caches(m, LOG_INFO); + + return 0; +} +/* SIGRTMIN+1 handler: calls manager_reset_server_features(). */ +static int manager_sigrtmin1(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + assert(si); + + manager_reset_server_features(m); + return 0; +} +/* Memory pressure callback: flushes all caches and then asks sd-event to trim memory. */ +static int manager_memory_pressure(sd_event_source *s, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + log_info("Under memory pressure, flushing caches."); + + manager_flush_caches(m, LOG_INFO); + sd_event_trim_memory(); + + return 0; +} +/* Installs the memory pressure event source. A failure is only logged (LOG_DEBUG for unsupported, privilege and -EHOSTDOWN errors, LOG_NOTICE otherwise) and never returned. */ +static int manager_memory_pressure_listen(Manager *m) { + int r; + + assert(m); + + r = sd_event_add_memory_pressure(m->event, NULL, manager_memory_pressure, m); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN )? LOG_DEBUG : LOG_NOTICE, r, + "Failed to install memory pressure event source, ignoring: %m"); + + return 0; +} +/* Allocates a Manager, fills in the defaults below and sets up its event loop, hostname watch, unicast scope, network and netlink monitoring, bus connection and signal handlers. On success stores the manager in *ret and returns 0; on failure returns a negative error and the partially built manager is released by the manager_freep cleanup. */ +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .llmnr_ipv4_udp_fd = -EBADF, + .llmnr_ipv6_udp_fd = -EBADF, + .llmnr_ipv4_tcp_fd = -EBADF, + .llmnr_ipv6_tcp_fd = -EBADF, + .mdns_ipv4_fd = -EBADF, + .mdns_ipv6_fd = -EBADF, + .hostname_fd = -EBADF, + + .llmnr_support = DEFAULT_LLMNR_MODE, + .mdns_support = DEFAULT_MDNS_MODE, + .dnssec_mode = DEFAULT_DNSSEC_MODE, + .dns_over_tls_mode = DEFAULT_DNS_OVER_TLS_MODE, + .enable_cache = DNS_CACHE_MODE_YES, + .dns_stub_listener_mode = DNS_STUB_LISTENER_YES, + .read_resolv_conf = true, + .need_builtin_fallbacks = true, + .etc_hosts_last = USEC_INFINITY, + .read_etc_hosts = true, + + .sigrtmin18_info.memory_pressure_handler = manager_memory_pressure, + .sigrtmin18_info.memory_pressure_userdata = m, + }; + + r =
dns_trust_anchor_load(&m->trust_anchor); + if (r < 0) + return r; + + r = manager_parse_config_file(m); + if (r < 0) + log_warning_errno(r, "Failed to parse configuration file: %m"); + +#if ENABLE_DNS_OVER_TLS + r = dnstls_manager_init(m); + if (r < 0) + return r; +#endif + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + (void) sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + + (void) sd_event_set_watchdog(m->event, true); + + r = manager_watch_hostname(m); + if (r < 0) + return r; + + r = dnssd_load(m); + if (r < 0) + log_warning_errno(r, "Failed to load DNS-SD configuration files: %m"); + + r = dns_scope_new(m, &m->unicast_scope, NULL, DNS_PROTOCOL_DNS, AF_UNSPEC); + if (r < 0) + return r; + + r = manager_network_monitor_listen(m); + if (r < 0) + return r; + + r = manager_rtnl_listen(m); + if (r < 0) + return r; + + r = manager_clock_change_listen(m); + if (r < 0) + return r; + + r = manager_memory_pressure_listen(m); + if (r < 0) + return r; + + r = manager_connect_bus(m); + if (r < 0) + return r; + + (void) sd_event_add_signal(m->event, &m->sigusr1_event_source, SIGUSR1, manager_sigusr1, m); + (void) sd_event_add_signal(m->event, &m->sigusr2_event_source, SIGUSR2, manager_sigusr2, m); + (void) sd_event_add_signal(m->event, &m->sigrtmin1_event_source, SIGRTMIN+1, manager_sigrtmin1, m); + (void) sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, &m->sigrtmin18_info); + + manager_cleanup_saved_user(m); + + *ret = TAKE_PTR(m); + + return 0; +} + +int manager_start(Manager *m) { + int r; + + assert(m); + + r = manager_dns_stub_start(m); + if (r < 0) + return r; + + r = manager_varlink_init(m); + if (r < 0) + return r; + + return 0; +} + +Manager *manager_free(Manager *m) { + Link *l; + DnssdService *s; + + if (!m) + return NULL; + + dns_server_unlink_all(m->dns_servers); + dns_server_unlink_all(m->fallback_dns_servers); + 
dns_search_domain_unlink_all(m->search_domains); + + while ((l = hashmap_first(m->links))) + link_free(l); + + while (m->dns_queries) + dns_query_free(m->dns_queries); + + m->stub_queries_by_packet = hashmap_free(m->stub_queries_by_packet); + + dns_scope_free(m->unicast_scope); + + /* At this point only orphaned streams should remain. All others should have been freed already by their + * owners */ + while (m->dns_streams) + dns_stream_unref(m->dns_streams); + +#if ENABLE_DNS_OVER_TLS + dnstls_manager_free(m); +#endif + + hashmap_free(m->links); + hashmap_free(m->dns_transactions); + + sd_event_source_unref(m->network_event_source); + sd_network_monitor_unref(m->network_monitor); + + sd_netlink_unref(m->rtnl); + sd_event_source_unref(m->rtnl_event_source); + sd_event_source_unref(m->clock_change_event_source); + + manager_llmnr_stop(m); + manager_mdns_stop(m); + manager_dns_stub_stop(m); + manager_varlink_done(m); + + manager_socket_graveyard_clear(m); + + ordered_set_free(m->dns_extra_stub_listeners); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + + sd_bus_flush_close_unref(m->bus); + + sd_event_source_unref(m->sigusr1_event_source); + sd_event_source_unref(m->sigusr2_event_source); + sd_event_source_unref(m->sigrtmin1_event_source); + + dns_resource_key_unref(m->llmnr_host_ipv4_key); + dns_resource_key_unref(m->llmnr_host_ipv6_key); + dns_resource_key_unref(m->mdns_host_ipv4_key); + dns_resource_key_unref(m->mdns_host_ipv6_key); + + sd_event_source_unref(m->hostname_event_source); + safe_close(m->hostname_fd); + + sd_event_unref(m->event); + + free(m->full_hostname); + free(m->llmnr_hostname); + free(m->mdns_hostname); + + while ((s = hashmap_first(m->dnssd_services))) + dnssd_service_free(s); + hashmap_free(m->dnssd_services); + + dns_trust_anchor_flush(&m->trust_anchor); + manager_etc_hosts_flush(m); + + return mfree(m); +} + +int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret) { + _cleanup_(dns_packet_unrefp) DnsPacket 
*p = NULL; + CMSG_BUFFER_TYPE(CMSG_SPACE(MAXSIZE(struct in_pktinfo, struct in6_pktinfo)) + + CMSG_SPACE(int) /* ttl/hoplimit */ + + EXTRA_CMSG_SPACE /* kernel appears to require extra buffer space */) control; + union sockaddr_union sa; + struct iovec iov; + struct msghdr mh = { + .msg_name = &sa.sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + ssize_t ms, l; + int r; + + assert(m); + assert(fd >= 0); + assert(ret); + + ms = next_datagram_size_fd(fd); + if (ms < 0) + return ms; + + r = dns_packet_new(&p, protocol, ms, DNS_PACKET_SIZE_MAX); + if (r < 0) + return r; + + iov = IOVEC_MAKE(DNS_PACKET_DATA(p), p->allocated); + + l = recvmsg_safe(fd, &mh, 0); + if (ERRNO_IS_NEG_TRANSIENT(l)) + return 0; + if (l <= 0) + return l; + + assert(!(mh.msg_flags & MSG_TRUNC)); + + p->size = (size_t) l; + + p->family = sa.sa.sa_family; + p->ipproto = IPPROTO_UDP; + if (p->family == AF_INET) { + p->sender.in = sa.in.sin_addr; + p->sender_port = be16toh(sa.in.sin_port); + } else if (p->family == AF_INET6) { + p->sender.in6 = sa.in6.sin6_addr; + p->sender_port = be16toh(sa.in6.sin6_port); + p->ifindex = sa.in6.sin6_scope_id; + } else + return -EAFNOSUPPORT; + + p->timestamp = now(CLOCK_BOOTTIME); + + CMSG_FOREACH(cmsg, &mh) { + + if (cmsg->cmsg_level == IPPROTO_IPV6) { + assert(p->family == AF_INET6); + + switch (cmsg->cmsg_type) { + + case IPV6_PKTINFO: { + struct in6_pktinfo *i = CMSG_TYPED_DATA(cmsg, struct in6_pktinfo); + + if (p->ifindex <= 0) + p->ifindex = i->ipi6_ifindex; + + p->destination.in6 = i->ipi6_addr; + break; + } + + case IPV6_HOPLIMIT: + p->ttl = *CMSG_TYPED_DATA(cmsg, int); + break; + + case IPV6_RECVFRAGSIZE: + p->fragsize = *CMSG_TYPED_DATA(cmsg, int); + break; + } + } else if (cmsg->cmsg_level == IPPROTO_IP) { + assert(p->family == AF_INET); + + switch (cmsg->cmsg_type) { + + case IP_PKTINFO: { + struct in_pktinfo *i = CMSG_TYPED_DATA(cmsg, 
struct in_pktinfo); + + if (p->ifindex <= 0) + p->ifindex = i->ipi_ifindex; + + p->destination.in = i->ipi_addr; + break; + } + + case IP_TTL: + p->ttl = *CMSG_TYPED_DATA(cmsg, int); + break; + + case IP_RECVFRAGSIZE: + p->fragsize = *CMSG_TYPED_DATA(cmsg, int); + break; + } + } + } + + /* The Linux kernel sets the interface index to the loopback + * device if the packet came from the local host since it + * avoids the routing table in such a case. Let's unset the + * interface index in such a case. */ + if (p->ifindex == LOOPBACK_IFINDEX) + p->ifindex = 0; + + if (protocol != DNS_PROTOCOL_DNS) { + /* If we don't know the interface index still, we look for the + * first local interface with a matching address. Yuck! */ + if (p->ifindex <= 0) + p->ifindex = manager_find_ifindex(m, p->family, &p->destination); + } + + log_debug("Received %s UDP packet of size %zu, ifindex=%i, ttl=%u, fragsize=%zu, sender=%s, destination=%s", + dns_protocol_to_string(protocol), p->size, p->ifindex, p->ttl, p->fragsize, + IN_ADDR_TO_STRING(p->family, &p->sender), + IN_ADDR_TO_STRING(p->family, &p->destination)); + + /* Ownership of the packet moves to the caller; the cleanup handler then sees NULL */ *ret = TAKE_PTR(p); + return 1; +} + +/* Call sendmsg() until it succeeds. EINTR is retried immediately; on EAGAIN the function waits for POLLOUT, with the deadline fixed at SEND_TIMEOUT_USEC after entry. Returns 0 on success, -ETIMEDOUT if the wait expires, or another negative errno. */ static int sendmsg_loop(int fd, struct msghdr *mh, int flags) { + usec_t end; + int r; + + assert(fd >= 0); + assert(mh); + + end = usec_add(now(CLOCK_MONOTONIC), SEND_TIMEOUT_USEC); + + for (;;) { + if (sendmsg(fd, mh, flags) >= 0) + return 0; + if (errno == EINTR) + continue; + if (errno != EAGAIN) + return -errno; + + /* Wait only for the time remaining until the deadline */ r = fd_wait_for_event(fd, POLLOUT, LESS_BY(end, now(CLOCK_MONOTONIC))); + if (ERRNO_IS_NEG_TRANSIENT(r)) + continue; + if (r < 0) + return r; + if (r == 0) + return -ETIMEDOUT; + } +} + +/* Same retry/timeout loop as sendmsg_loop(), but using write(). Any non-negative write() result counts as success, i.e. a short write is not continued. */ static int write_loop(int fd, void *message, size_t length) { + usec_t end; + int r; + + assert(fd >= 0); + assert(message); + + end = usec_add(now(CLOCK_MONOTONIC), SEND_TIMEOUT_USEC); + + for (;;) { + if (write(fd, message, length) >= 0) + return 0; + if (errno == EINTR) + continue; + if (errno != EAGAIN) + return -errno; + + r =
fd_wait_for_event(fd, POLLOUT, LESS_BY(end, now(CLOCK_MONOTONIC))); + if (ERRNO_IS_NEG_TRANSIENT(r)) + continue; + if (r < 0) + return r; + if (r == 0) + return -ETIMEDOUT; + } +} + +/* Log and write the raw data of packet p to fd via write_loop(). Returns 0 on success or a negative errno. NOTE(review): m is not used here and neither m nor p is asserted, unlike in the sibling send functions. */ int manager_write(Manager *m, int fd, DnsPacket *p) { + int r; + + log_debug("Sending %s%s packet with id %" PRIu16 " of size %zu.", + DNS_PACKET_TC(p) ? "truncated (!) " : "", + DNS_PACKET_QR(p) ? "response" : "query", + DNS_PACKET_ID(p), + p->size); + + r = write_loop(fd, DNS_PACKET_DATA(p), p->size); + if (r < 0) + return r; + + return 0; +} + +/* Send p as one datagram to destination:port over IPv4 using sendmsg_loop(). If ifindex > 0, an IP_PKTINFO control message carrying ipi_ifindex = ifindex is attached, and ipi_spec_dst is set to *source when source is non-NULL. Returns 0 or a negative errno. */ static int manager_ipv4_send( + Manager *m, + int fd, + int ifindex, + const struct in_addr *destination, + uint16_t port, + const struct in_addr *source, + DnsPacket *p) { + + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct in_pktinfo))) control = {}; + union sockaddr_union sa; + struct iovec iov; + struct msghdr mh = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_name = &sa.sa, + .msg_namelen = sizeof(sa.in), + }; + + assert(m); + assert(fd >= 0); + assert(destination); + assert(port > 0); + assert(p); + + iov = IOVEC_MAKE(DNS_PACKET_DATA(p), p->size); + + sa = (union sockaddr_union) { + .in.sin_family = AF_INET, + .in.sin_addr = *destination, + .in.sin_port = htobe16(port), + }; + + /* Without an interface index no control data is passed at all (msg_control stays unset) */ if (ifindex > 0) { + struct cmsghdr *cmsg; + struct in_pktinfo *pi; + + mh.msg_control = &control; + mh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&mh); + cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); + cmsg->cmsg_level = IPPROTO_IP; + cmsg->cmsg_type = IP_PKTINFO; + + pi = CMSG_TYPED_DATA(cmsg, struct in_pktinfo); + pi->ipi_ifindex = ifindex; + + if (source) + pi->ipi_spec_dst = *source; + } + + return sendmsg_loop(fd, &mh, 0); +} + +/* IPv6 counterpart of manager_ipv4_send(): sends p to destination:port with sin6_scope_id set to ifindex and, if ifindex > 0, an IPV6_PKTINFO control message (ipi6_addr is set to *source when source is non-NULL). Returns 0 or a negative errno. */ static int manager_ipv6_send( + Manager *m, + int fd, + int ifindex, + const struct in6_addr *destination, + uint16_t port, + const struct in6_addr *source, + DnsPacket *p) { + + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct in6_pktinfo))) control = {}; + union sockaddr_union sa; + struct iovec iov; + struct msghdr
mh = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_name = &sa.sa, + .msg_namelen = sizeof(sa.in6), + }; + + assert(m); + assert(fd >= 0); + assert(destination); + assert(port > 0); + assert(p); + + iov = IOVEC_MAKE(DNS_PACKET_DATA(p), p->size); + + sa = (union sockaddr_union) { + .in6.sin6_family = AF_INET6, + .in6.sin6_addr = *destination, + .in6.sin6_port = htobe16(port), + .in6.sin6_scope_id = ifindex, + }; + + /* Without an interface index no control data is passed at all (msg_control stays unset) */ if (ifindex > 0) { + struct cmsghdr *cmsg; + struct in6_pktinfo *pi; + + mh.msg_control = &control; + mh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&mh); + cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); + cmsg->cmsg_level = IPPROTO_IPV6; + cmsg->cmsg_type = IPV6_PKTINFO; + + pi = CMSG_TYPED_DATA(cmsg, struct in6_pktinfo); + pi->ipi6_ifindex = ifindex; + + if (source) + pi->ipi6_addr = *source; + } + + return sendmsg_loop(fd, &mh, 0); +} + +/* Convert every key of question q to JSON and collect the results in an array returned in *ret. *ret is set to NULL if the loop never appends anything. Returns 0 or a negative errno. */ static int dns_question_to_json(DnsQuestion *q, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *l = NULL; + DnsResourceKey *key; + int r; + + assert(ret); + + DNS_QUESTION_FOREACH(key, q) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + r = dns_resource_key_to_json(key, &v); + if (r < 0) + return r; + + r = json_variant_append_array(&l, v); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(l); + return 0; +} + +/* Send a notification about a query result to every varlink connection in m->varlink_subscription. The IDNA, UTF-8 and (if given) bypass-packet questions are merged into a single "question" array; collected_questions and answer are added when they produce JSON. Returns 0 immediately if there are no subscribers; failures while building the JSON are returned as negative errno, failures to notify an individual connection are only logged. */ int manager_monitor_send( + Manager *m, + int state, + int rcode, + int error, + DnsQuestion *question_idna, + DnsQuestion *question_utf8, + DnsPacket *question_bypass, + DnsQuestion *collected_questions, + DnsAnswer *answer) { + + _cleanup_(json_variant_unrefp) JsonVariant *jquestion = NULL, *jcollected_questions = NULL, *janswer = NULL; + _cleanup_(dns_question_unrefp) DnsQuestion *merged = NULL; + Varlink *connection; + DnsAnswerItem *rri; + int r; + + assert(m); + + if (set_isempty(m->varlink_subscription)) + return 0; + + /* Merge all questions into one */ + r = dns_question_merge(question_idna, question_utf8, &merged); + if (r < 0) + return
log_error_errno(r, "Failed to merge UTF8/IDNA questions: %m"); + + if (question_bypass) { + _cleanup_(dns_question_unrefp) DnsQuestion *merged2 = NULL; + + r = dns_question_merge(merged, question_bypass->question, &merged2); + if (r < 0) + return log_error_errno(r, "Failed to merge UTF8/IDNA questions and DNS packet question: %m"); + + /* Replace the first merge result by the combined one; the old reference is dropped explicitly */ dns_question_unref(merged); + merged = TAKE_PTR(merged2); + } + + /* Convert the current primary question to JSON */ + r = dns_question_to_json(merged, &jquestion); + if (r < 0) + return log_error_errno(r, "Failed to convert question to JSON: %m"); + + /* Generate a JSON array of the questions preceding the current one in the CNAME chain */ + r = dns_question_to_json(collected_questions, &jcollected_questions); + if (r < 0) + return log_error_errno(r, "Failed to convert question to JSON: %m"); + + /* Build the answer array: one object per RR with the parsed record ("rr", only if one was produced), its wire format ("raw", base64) and the interface index (only if > 0) */ DNS_ANSWER_FOREACH_ITEM(rri, answer) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + r = dns_resource_record_to_json(rri->rr, &v); + if (r < 0) + return log_error_errno(r, "Failed to convert answer resource record to JSON: %m"); + + r = dns_resource_record_to_wire_format(rri->rr, /* canonical= */ false); /* don't use DNSSEC canonical format, since it removes casing, but we want that for DNS_SD compat */ + if (r < 0) + return log_error_errno(r, "Failed to generate RR wire format: %m"); + + r = json_variant_append_arrayb( + &janswer, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_CONDITION(v, "rr", JSON_BUILD_VARIANT(v)), + JSON_BUILD_PAIR("raw", JSON_BUILD_BASE64(rri->rr->wire_format, rri->rr->wire_format_size)), + JSON_BUILD_PAIR_CONDITION(rri->ifindex > 0, "ifindex", JSON_BUILD_INTEGER(rri->ifindex)))); + if (r < 0) + /* NOTE(review): logged at debug level although the neighbouring failure paths use log_error_errno(); confirm this is intended */ return log_debug_errno(r, "Failed to append notification entry to array: %m"); + } + + SET_FOREACH(connection, m->varlink_subscription) { + r = varlink_notifyb(connection, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("state", JSON_BUILD_STRING(dns_transaction_state_to_string(state))), + JSON_BUILD_PAIR_CONDITION(state ==
DNS_TRANSACTION_RCODE_FAILURE, "rcode", JSON_BUILD_INTEGER(rcode)), + JSON_BUILD_PAIR_CONDITION(state == DNS_TRANSACTION_ERRNO, "errno", JSON_BUILD_INTEGER(error)), + JSON_BUILD_PAIR("question", JSON_BUILD_VARIANT(jquestion)), + JSON_BUILD_PAIR_CONDITION(jcollected_questions, "collectedQuestions", JSON_BUILD_VARIANT(jcollected_questions)), + JSON_BUILD_PAIR_CONDITION(janswer, "answer", JSON_BUILD_VARIANT(janswer)))); + if (r < 0) + /* A failing subscriber must not prevent the remaining ones from being notified */ log_debug_errno(r, "Failed to send monitor event, ignoring: %m"); + } + + return 0; +} + +/* Log and send packet p to destination:port, dispatching to manager_ipv4_send() or manager_ipv6_send() according to family. source may be NULL. Returns the result of the family-specific sender, or -EAFNOSUPPORT for any other family. */ int manager_send( + Manager *m, + int fd, + int ifindex, + int family, + const union in_addr_union *destination, + uint16_t port, + const union in_addr_union *source, + DnsPacket *p) { + + assert(m); + assert(fd >= 0); + assert(destination); + assert(port > 0); + assert(p); + + /* For mDNS, it is natural that the packet have truncated flag when we have many known answers. */ + bool truncated = DNS_PACKET_TC(p) && (p->protocol != DNS_PROTOCOL_MDNS || !p->more); + + log_debug("Sending %s%s packet with id %" PRIu16 " on interface %i/%s of size %zu.", + truncated ? "truncated (!) " : "", + DNS_PACKET_QR(p) ? "response" : "query", + DNS_PACKET_ID(p), + ifindex, af_to_name(family), + p->size); + + if (family == AF_INET) + return manager_ipv4_send(m, fd, ifindex, &destination->in, port, source ? &source->in : NULL, p); + if (family == AF_INET6) + return manager_ipv6_send(m, fd, ifindex, &destination->in6, port, source ? &source->in6 : NULL, p); + + return -EAFNOSUPPORT; +} + +/* Return the smallest MTU among the links that have at least one address and whose MTU is not below the minimum needed for a DNS header over UDP; returns 1500 if no link qualifies. */ uint32_t manager_find_mtu(Manager *m) { + uint32_t mtu = 0; + Link *l; + + /* If we don't know on which link a DNS packet would be delivered, let's find the largest MTU that + * works on all interfaces we know of that have an IP address associated */ + + HASHMAP_FOREACH(l, m->links) { + /* Let's filter out links without IP addresses (e.g.
AF_CAN links and suchlike) */ + if (!l->addresses) + continue; + + /* Safety check: MTU shorter than what we need for the absolutely shortest DNS request? Then + * let's ignore this link. */ + if (l->mtu < MIN(UDP4_PACKET_HEADER_SIZE + DNS_PACKET_HEADER_SIZE, + UDP6_PACKET_HEADER_SIZE + DNS_PACKET_HEADER_SIZE)) + continue; + + /* mtu == 0 means "nothing found yet"; otherwise keep the minimum */ if (mtu <= 0 || l->mtu < mtu) + mtu = l->mtu; + } + + if (mtu == 0) /* found nothing? then let's assume the typical Ethernet MTU for lack of anything more precise */ + return 1500; + + return mtu; +} + +/* Return the ifindex of the link that owns address in_addr, or 0 if family is neither AF_INET nor AF_INET6, in_addr is NULL, or no link has that address. */ int manager_find_ifindex(Manager *m, int family, const union in_addr_union *in_addr) { + LinkAddress *a; + + assert(m); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return 0; + + if (!in_addr) + return 0; + + a = manager_find_link_address(m, family, in_addr); + if (a) + return a->link->ifindex; + + return 0; +} + +/* Drop the four cached host resource keys, then call link_add_rrs(l, true) on every link, refresh the RRs of all DNS-SD services if mDNS support is RESOLVE_SUPPORT_YES, and finally call link_add_rrs(l, false) on every link. The meaning of the boolean is defined by link_add_rrs(), which is not visible here. */ void manager_refresh_rrs(Manager *m) { + Link *l; + DnssdService *s; + + assert(m); + + m->llmnr_host_ipv4_key = dns_resource_key_unref(m->llmnr_host_ipv4_key); + m->llmnr_host_ipv6_key = dns_resource_key_unref(m->llmnr_host_ipv6_key); + m->mdns_host_ipv4_key = dns_resource_key_unref(m->mdns_host_ipv4_key); + m->mdns_host_ipv6_key = dns_resource_key_unref(m->mdns_host_ipv6_key); + + HASHMAP_FOREACH(l, m->links) + link_add_rrs(l, true); + + if (m->mdns_support == RESOLVE_SUPPORT_YES) + HASHMAP_FOREACH(s, m->dnssd_services) + if (dnssd_update_rrs(s) < 0) + log_warning("Failed to refresh DNS-SD service '%s'", s->name); + + HASHMAP_FOREACH(l, m->links) + link_add_rrs(l, false); +} + +/* Derive a new name from old and return it, newly allocated, in *ret_new: the trailing run of ASCII digits is parsed as a number (taken as 1 if missing, unparsable or zero), a random value in 1..10 is added, and the result is appended to the non-numeric prefix. Returns 0 or -ENOMEM. */ static int manager_next_random_name(const char *old, char **ret_new) { + const char *p; + uint64_t u, a; + char *n; + + p = strchr(old, 0); + assert(p); + + /* Walk back from the terminating NUL over the trailing digits */ while (p > old) { + if (!ascii_isdigit(p[-1])) + break; + + p--; + } + + if (*p == 0 || safe_atou64(p, &u) < 0 || u <= 0) + u = 1; + + /* Add a random number to the old value.
This way we can avoid + * that two hosts pick the same hostname, win on IPv4 and lose + * on IPv6 (or vice versa), and pick the same hostname + * replacement hostname, ad infinitum. We still want the + * numbers to go up monotonically, hence we just add a random + * value 1..10 */ + + random_bytes(&a, sizeof(a)); + u += 1 + a % 10; + + /* Keep the first (p - old) bytes of the old name and append the new number */ if (asprintf(&n, "%.*s%" PRIu64, (int) (p - old), old, u) < 0) + return -ENOMEM; + + *ret_new = n; + + return 0; +} + +/* Replace the published hostnames after a conflict: a new LLMNR hostname is derived from the current one with manager_next_random_name(), the mDNS hostname is that name concatenated with "local", then the RRs are refreshed and manager_send_changed() is called for "LLMNRHostname" (its result is ignored). Returns 0 or a negative errno; on failure the stored hostnames are left unchanged. */ int manager_next_hostname(Manager *m) { + _cleanup_free_ char *h = NULL, *k = NULL; + int r; + + assert(m); + + r = manager_next_random_name(m->llmnr_hostname, &h); + if (r < 0) + return r; + + r = dns_name_concat(h, "local", 0, &k); + if (r < 0) + return r; + + log_info("Hostname conflict, changing published hostname from '%s' to '%s'.", m->llmnr_hostname, h); + + free_and_replace(m->llmnr_hostname, h); + free_and_replace(m->mdns_hostname, k); + + manager_refresh_rrs(m); + (void) manager_send_changed(m, "LLMNRHostname"); + + return 0; +} + +/* Search all links for address in_addr of the given family. Returns the first match, or NULL if family is neither AF_INET nor AF_INET6, in_addr is NULL, or no link has the address. */ LinkAddress* manager_find_link_address(Manager *m, int family, const union in_addr_union *in_addr) { + Link *l; + + assert(m); + + if (!IN_SET(family, AF_INET, AF_INET6)) + return NULL; + + if (!in_addr) + return NULL; + + HASHMAP_FOREACH(l, m->links) { + LinkAddress *a; + + a = link_find_address(l, family, in_addr); + if (a) + return a; + } + + return NULL; +} + +/* True if the sender address of p is configured on one of our links. */ bool manager_packet_from_local_address(Manager *m, DnsPacket *p) { + assert(m); + assert(p); + + /* Let's see if this packet comes from an IP address we have on any local interface */ + + return !!manager_find_link_address(m, p->family, &p->sender); +} + +/* True if a transaction with the same packet ID exists and the packet it sent is equal to p according to dns_packet_equal(). */ bool manager_packet_from_our_transaction(Manager *m, DnsPacket *p) { + DnsTransaction *t; + + assert(m); + assert(p); + + /* Let's see if we have a transaction with a query message with the exact same binary contents as the + * one we just got. If so, it's almost definitely a packet loop of some kind.
*/ + + t = hashmap_get(m->dns_transactions, UINT_TO_PTR(DNS_PACKET_ID(p))); + if (!t) + return false; + + return t->sent && dns_packet_equal(t->sent, p); +} + +/* Return the LLMNR or mDNS scope, matching the protocol and address family of p, of the link identified by p->ifindex. Returns NULL if the link is unknown, or for any other protocol or family. */ DnsScope* manager_find_scope(Manager *m, DnsPacket *p) { + Link *l; + + assert(m); + assert(p); + + l = hashmap_get(m->links, INT_TO_PTR(p->ifindex)); + if (!l) + return NULL; + + switch (p->protocol) { + case DNS_PROTOCOL_LLMNR: + if (p->family == AF_INET) + return l->llmnr_ipv4_scope; + else if (p->family == AF_INET6) + return l->llmnr_ipv6_scope; + + break; + + case DNS_PROTOCOL_MDNS: + if (p->family == AF_INET) + return l->mdns_ipv4_scope; + else if (p->family == AF_INET6) + return l->mdns_ipv6_scope; + + break; + + default: + break; + } + + return NULL; +} + +/* Run dns_zone_verify_all() on the zone of every scope. */ void manager_verify_all(Manager *m) { + assert(m); + + LIST_FOREACH(scopes, s, m->dns_scopes) + dns_zone_verify_all(&s->zone); +} + +/* Compare name against the LLMNR, mDNS and full hostnames, in that order, skipping those that are unset. Returns the first non-zero dns_name_equal() result (presumably > 0 for a match and < 0 for an error; confirm against dns_name_equal()), or 0 if none matched. */ int manager_is_own_hostname(Manager *m, const char *name) { + int r; + + assert(m); + assert(name); + + if (m->llmnr_hostname) { + r = dns_name_equal(name, m->llmnr_hostname); + if (r != 0) + return r; + } + + if (m->mdns_hostname) { + r = dns_name_equal(name, m->mdns_hostname); + if (r != 0) + return r; + } + + if (m->full_hostname) + return dns_name_equal(name, m->full_hostname); + + return 0; +} + +/* Collect DNS servers into the ordered set *dns (allocated here if needed): the system-wide servers first, then those of every link, and the fallback servers only if the set is still empty. Servers already in the set (-EEXIST) are skipped. The set stores the server pointers as they are; no reference is taken in this function. Returns 0 or a negative errno. */ int manager_compile_dns_servers(Manager *m, OrderedSet **dns) { + Link *l; + int r; + + assert(m); + assert(dns); + + r = ordered_set_ensure_allocated(dns, &dns_server_hash_ops); + if (r < 0) + return r; + + /* First add the system-wide servers and domains */ + LIST_FOREACH(servers, s, m->dns_servers) { + r = ordered_set_put(*dns, s); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + + /* Then, add the per-link servers */ + HASHMAP_FOREACH(l, m->links) { + LIST_FOREACH(servers, s, l->dns_servers) { + r = ordered_set_put(*dns, s); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + } + + /* If we found nothing, add the fallback servers */ + if (ordered_set_isempty(*dns)) { + LIST_FOREACH(servers,
s, m->fallback_dns_servers) { + r = ordered_set_put(*dns, s); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + } + + return 0; +} + +/* filter_route is a tri-state: + * < 0: no filtering + * = 0 or false: return only domains which should be used for searching + * > 0 or true: return only domains which are for routing only + */ +/* Collects the names of the system-wide and then the per-link search domains into the ordered set *domains (allocated here if needed), skipping duplicates. The set stores the d->name pointers themselves, not copies. Returns 0 or a negative errno. */ int manager_compile_search_domains(Manager *m, OrderedSet **domains, int filter_route) { + Link *l; + int r; + + assert(m); + assert(domains); + + r = ordered_set_ensure_allocated(domains, &dns_name_hash_ops); + if (r < 0) + return r; + + LIST_FOREACH(domains, d, m->search_domains) { + + if (filter_route >= 0 && + d->route_only != !!filter_route) + continue; + + r = ordered_set_put(*domains, d->name); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + + HASHMAP_FOREACH(l, m->links) { + + LIST_FOREACH(domains, d, l->search_domains) { + + if (filter_route >= 0 && + d->route_only != !!filter_route) + continue; + + r = ordered_set_put(*domains, d->name); + if (r == -EEXIST) + continue; + if (r < 0) + return r; + } + } + + return 0; +} + +/* Return the configured DNSSEC mode, or DNSSEC_NO if none is set. */ DnssecMode manager_get_dnssec_mode(Manager *m) { + assert(m); + + if (m->dnssec_mode != _DNSSEC_MODE_INVALID) + return m->dnssec_mode; + + return DNSSEC_NO; +} + +/* Returns false if the effective DNSSEC mode is DNSSEC_NO, if manager_get_dns_server() returns a server that does not support DNSSEC, or if any link reports no DNSSEC support; true otherwise. */ bool manager_dnssec_supported(Manager *m) { + DnsServer *server; + Link *l; + + assert(m); + + if (manager_get_dnssec_mode(m) == DNSSEC_NO) + return false; + + server = manager_get_dns_server(m); + if (server && !dns_server_dnssec_supported(server)) + return false; + + HASHMAP_FOREACH(l, m->links) + if (!link_dnssec_supported(l)) + return false; + + return true; +} + +/* Return the configured DNS-over-TLS mode, or DNS_OVER_TLS_NO if none is set. */ DnsOverTlsMode manager_get_dns_over_tls_mode(Manager *m) { + assert(m); + + if (m->dns_over_tls_mode != _DNS_OVER_TLS_MODE_INVALID) + return m->dns_over_tls_mode; + + return DNS_OVER_TLS_NO; +} + +/* Count one DNSSEC verdict in m->n_dnssec_verdict[] and, when debug logging is on, log it together with the key. NOTE(review): m is dereferenced without assert(m), unlike in most neighbouring functions. */ void manager_dnssec_verdict(Manager *m, DnssecVerdict verdict, const DnsResourceKey *key) { + + assert(verdict >= 0); + assert(verdict < _DNSSEC_VERDICT_MAX); + + if
(DEBUG_LOGGING) { + char s[DNS_RESOURCE_KEY_STRING_MAX]; + + log_debug("Found verdict for lookup %s: %s", + dns_resource_key_to_string(key, s, sizeof s), + dnssec_verdict_to_string(verdict)); + } + + /* The index is within bounds thanks to the two assertions at the top of the function */ m->n_dnssec_verdict[verdict]++; +} + +/* True as soon as link_relevant(l, AF_UNSPEC, false) holds for one link. */ bool manager_routable(Manager *m) { + Link *l; + + assert(m); + + /* Returns true if the host has at least one interface with a routable address (regardless if IPv4 or IPv6) */ + + HASHMAP_FOREACH(l, m->links) + if (link_relevant(l, AF_UNSPEC, false)) + return true; + + return false; +} + +/* Flush the cache of every scope, then log a message at log_level. */ void manager_flush_caches(Manager *m, int log_level) { + assert(m); + + LIST_FOREACH(scopes, scope, m->dns_scopes) + dns_cache_flush(&scope->cache); + + log_full(log_level, "Flushed all caches."); +} + +/* Reset the learnt feature levels of the system-wide, fallback and per-link DNS servers. NOTE(review): m is dereferenced without assert(m), unlike in most neighbouring functions. */ void manager_reset_server_features(Manager *m) { + Link *l; + + dns_server_reset_features_all(m->dns_servers); + dns_server_reset_features_all(m->fallback_dns_servers); + + HASHMAP_FOREACH(l, m->links) + dns_server_reset_features_all(l->dns_servers); + + log_info("Resetting learnt feature levels on all servers."); +} + +/* Remove stale per-link state files from /run/systemd/resolve/netif/. A file is deleted if its name is not a valid ifindex, if no link with that ifindex exists, or if the link is marked is_managed. Failures are logged or ignored; nothing is returned. */ void manager_cleanup_saved_user(Manager *m) { + _cleanup_closedir_ DIR *d = NULL; + + assert(m); + + /* Clean up all saved per-link files in /run/systemd/resolve/netif/ that don't have a matching interface + * anymore. These files are created to persist settings pushed in by the user via the bus, so that resolved can + * be restarted without losing this data. */ + + d = opendir("/run/systemd/resolve/netif/"); + if (!d) { + if (errno == ENOENT) + return; + + log_warning_errno(errno, "Failed to open interface directory: %m"); + return; + } + + FOREACH_DIRENT_ALL(de, d, log_error_errno(errno, "Failed to read interface directory: %m")) { + _cleanup_free_ char *p = NULL; + int ifindex; + Link *l; + + /* DT_UNKNOWN entries are not skipped here; they reach the ifindex checks below like regular files */ if (!IN_SET(de->d_type, DT_UNKNOWN, DT_REG)) + continue; + + if (dot_or_dot_dot(de->d_name)) + continue; + + ifindex = parse_ifindex(de->d_name); + if (ifindex < 0) /* Probably some temporary file from a previous run.
Delete it */ + goto rm; + + l = hashmap_get(m->links, INT_TO_PTR(ifindex)); + if (!l) /* link vanished */ + goto rm; + + if (l->is_managed) /* now managed by networkd, hence the bus settings are useless */ + goto rm; + + continue; + + rm: + p = path_join("/run/systemd/resolve/netif", de->d_name); + if (!p) { + log_oom(); + return; + } + + /* Best effort: a failing unlink() is deliberately ignored */ (void) unlink(p); + } +} + +/* For every DNS-SD service that is marked withdrawn, replace its name template by one produced by manager_next_random_name() and clear the withdrawn flag; services for which that fails are skipped with a warning. If at least one service was changed the RRs are refreshed. Returns whether any service was changed. */ bool manager_next_dnssd_names(Manager *m) { + DnssdService *s; + bool tried = false; + int r; + + assert(m); + + HASHMAP_FOREACH(s, m->dnssd_services) { + _cleanup_free_ char * new_name = NULL; + + if (!s->withdrawn) + continue; + + r = manager_next_random_name(s->name_template, &new_name); + if (r < 0) { + log_warning_errno(r, "Failed to get new name for service '%s': %m", s->name); + continue; + } + + free_and_replace(s->name_template, new_name); + + s->withdrawn = false; + + tried = true; + } + + if (tried) + manager_refresh_rrs(m); + + return tried; +} + +/* True if server s is the main stub (IPv4 address INADDR_DNS_STUB, port 53) or has the same family, address and port as one of the configured extra stub listeners. */ bool manager_server_is_stub(Manager *m, DnsServer *s) { + DnsStubListenerExtra *l; + + assert(m); + assert(s); + + /* Safety check: we generally already skip the main stub when parsing configuration. But let's be + * extra careful, and check here again */ + if (s->family == AF_INET && + s->address.in.s_addr == htobe32(INADDR_DNS_STUB) && + dns_server_port(s) == 53) + return true; + + /* Main reason to call this is to check server data against the extra listeners, and filter things + * out. */ + ORDERED_SET_FOREACH(l, m->dns_extra_stub_listeners) + if (s->family == l->family && + in_addr_equal(s->family, &s->address, &l->address) && + dns_server_port(s) == dns_stub_listener_extra_port(l)) + return true; + + return false; +} + +/* Turn off path MTU discovery on socket fd. af selects the address family; with AF_UNSPEC the family is read from the socket. Returns 0 on success, -EAFNOSUPPORT for families other than AF_INET/AF_INET6, or a negative errno. */ int socket_disable_pmtud(int fd, int af) { + int r; + + assert(fd >= 0); + + if (af == AF_UNSPEC) { + af = socket_get_family(fd); + if (af < 0) + return af; + } + + switch (af) { + + case AF_INET: { + /* Turn off path MTU discovery, let's rather fragment on the way than to open us up against + * PMTU forgery vulnerabilities.
+ * + * There appears to be no documentation about IP_PMTUDISC_OMIT, but it has the effect that + * the "Don't Fragment" bit in the IPv4 header is turned off, thus enforcing fragmentation if + * our datagram size exceeds the MTU of a router in the path, and turning off path MTU + * discovery. + * + * This helps mitigating the PMTUD vulnerability described here: + * + * https://blog.apnic.net/2019/07/12/its-time-to-consider-avoiding-ip-fragmentation-in-the-dns/ + * + * Similar logic is in place in most DNS servers. + * + * There are multiple conflicting goals: we want to allow the largest datagrams possible (for + * efficiency reasons), but not have fragmentation (for security reasons), nor use PMTUD (for + * security reasons, too). Our strategy to deal with this is: use large packets, turn off + * PMTUD, but watch fragmentation taking place, and then size our packets to the max of the + * fragments seen — and if we need larger packets always go to TCP. + */ + + /* A failing setsockopt is passed on to the caller unchanged */ r = setsockopt_int(fd, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_OMIT); + if (r < 0) + return r; + + return 0; + } + + case AF_INET6: { + /* On IPv6 fragmentation only is done by the sender — never by routers on the path. PMTUD is + * mandatory. If we want to turn off PMTUD, the only way is by sending with minimal MTU only, + * so that we apply maximum fragmentation locally already, and thus PMTUD doesn't happen + * because there's nothing that could be fragmented further anymore.
*/ + + /* Pin the socket MTU to IPV6_MIN_MTU; a failure is passed on to the caller unchanged */ r = setsockopt_int(fd, IPPROTO_IPV6, IPV6_MTU, IPV6_MIN_MTU); + if (r < 0) + return r; + + return 0; + } + + default: + return -EAFNOSUPPORT; + } +} + +/* Build a JSON object with three sections and return it in *ret: "transactions" (current count plus the manager's running totals), "cache" (size, hits and misses summed over all scopes) and "dnssec" (one counter per verdict). Returns the result of json_build(). */ int dns_manager_dump_statistics_json(Manager *m, JsonVariant **ret) { + uint64_t size = 0, hit = 0, miss = 0; + + assert(m); + assert(ret); + + LIST_FOREACH(scopes, s, m->dns_scopes) { + size += dns_cache_size(&s->cache); + hit += s->cache.n_hit; + miss += s->cache.n_miss; + } + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("transactions", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("currentTransactions", hashmap_size(m->dns_transactions)), + JSON_BUILD_PAIR_UNSIGNED("totalTransactions", m->n_transactions_total), + JSON_BUILD_PAIR_UNSIGNED("totalTimeouts", m->n_timeouts_total), + JSON_BUILD_PAIR_UNSIGNED("totalTimeoutsServedStale", m->n_timeouts_served_stale_total), + JSON_BUILD_PAIR_UNSIGNED("totalFailedResponses", m->n_failure_responses_total), + JSON_BUILD_PAIR_UNSIGNED("totalFailedResponsesServedStale", m->n_failure_responses_served_stale_total) + )), + JSON_BUILD_PAIR("cache", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("size", size), + JSON_BUILD_PAIR_UNSIGNED("hits", hit), + JSON_BUILD_PAIR_UNSIGNED("misses", miss) + )), + JSON_BUILD_PAIR("dnssec", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("secure", m->n_dnssec_verdict[DNSSEC_SECURE]), + JSON_BUILD_PAIR_UNSIGNED("insecure", m->n_dnssec_verdict[DNSSEC_INSECURE]), + JSON_BUILD_PAIR_UNSIGNED("bogus", m->n_dnssec_verdict[DNSSEC_BOGUS]), + JSON_BUILD_PAIR_UNSIGNED("indeterminate", m->n_dnssec_verdict[DNSSEC_INDETERMINATE]) + )))); +} + +/* Zero the per-scope cache hit/miss counters, the transaction/timeout/failure totals and the DNSSEC verdict counters. The cache contents and the set of running transactions are not touched. */ void dns_manager_reset_statistics(Manager *m) { + + assert(m); + + LIST_FOREACH(scopes, s, m->dns_scopes) + s->cache.n_hit = s->cache.n_miss = 0; + + m->n_transactions_total = 0; + m->n_timeouts_total = 0; + m->n_timeouts_served_stale_total = 0; + m->n_failure_responses_total = 0; + m->n_failure_responses_served_stale_total = 0; + zero(m->n_dnssec_verdict); +} diff --git
a/src/resolve/resolved-manager.h b/src/resolve/resolved-manager.h new file mode 100644 index 0000000..5cd5e83 --- /dev/null +++ b/src/resolve/resolved-manager.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-event.h" +#include "sd-netlink.h" +#include "sd-network.h" + +#include "common-signal.h" +#include "hashmap.h" +#include "list.h" +#include "ordered-set.h" +#include "resolve-util.h" +#include "varlink.h" + +/* Declared ahead of the includes below, presumably because those headers refer to Manager themselves */ typedef struct Manager Manager; + +#include "resolved-dns-query.h" +#include "resolved-dns-search-domain.h" +#include "resolved-dns-stream.h" +#include "resolved-dns-stub.h" +#include "resolved-dns-trust-anchor.h" +#include "resolved-link.h" +#include "resolved-socket-graveyard.h" + +#define MANAGER_SEARCH_DOMAINS_MAX 256 +#define MANAGER_DNS_SERVERS_MAX 256 + +/* Container for the data read from /etc/hosts; embedded in Manager as etc_hosts */ typedef struct EtcHosts { + Hashmap *by_address; + Hashmap *by_name; + Set *no_address; +} EtcHosts; + +/* Global state of the resolver: configuration, links, queries and transactions, DNS servers and search domains, scopes, LLMNR/mDNS sockets, hostnames, statistics counters, stub listeners and varlink servers */ struct Manager { + sd_event *event; + + ResolveSupport llmnr_support; + ResolveSupport mdns_support; + DnssecMode dnssec_mode; + DnsOverTlsMode dns_over_tls_mode; + DnsCacheMode enable_cache; + bool cache_from_localhost; + DnsStubListenerMode dns_stub_listener_mode; + usec_t stale_retention_usec; + +#if ENABLE_DNS_OVER_TLS + DnsTlsManagerData dnstls_data; +#endif + + /* Network */ + Hashmap *links; + + sd_netlink *rtnl; + sd_event_source *rtnl_event_source; + + sd_network_monitor *network_monitor; + sd_event_source *network_event_source; + + /* DNS query management */ + Hashmap *dns_transactions; + LIST_HEAD(DnsQuery, dns_queries); + unsigned n_dns_queries; + Hashmap *stub_queries_by_packet; + + LIST_HEAD(DnsStream, dns_streams); + unsigned n_dns_streams[_DNS_STREAM_TYPE_MAX]; + + /* Unicast dns */ + LIST_HEAD(DnsServer, dns_servers); + LIST_HEAD(DnsServer, fallback_dns_servers); + unsigned n_dns_servers; /* counts both main and fallback */ + DnsServer *current_dns_server; + + LIST_HEAD(DnsSearchDomain, search_domains); + unsigned
n_search_domains; + + bool need_builtin_fallbacks; + bool read_resolv_conf; + bool resolve_unicast_single_label; + + struct stat resolv_conf_stat; + + DnsTrustAnchor trust_anchor; + + LIST_HEAD(DnsScope, dns_scopes); + DnsScope *unicast_scope; + + /* LLMNR */ + int llmnr_ipv4_udp_fd; + int llmnr_ipv6_udp_fd; + int llmnr_ipv4_tcp_fd; + int llmnr_ipv6_tcp_fd; + + sd_event_source *llmnr_ipv4_udp_event_source; + sd_event_source *llmnr_ipv6_udp_event_source; + sd_event_source *llmnr_ipv4_tcp_event_source; + sd_event_source *llmnr_ipv6_tcp_event_source; + + /* mDNS */ + int mdns_ipv4_fd; + int mdns_ipv6_fd; + sd_event_source *mdns_ipv4_event_source; + sd_event_source *mdns_ipv6_event_source; + + /* DNS-SD */ + Hashmap *dnssd_services; + + /* dbus */ + sd_bus *bus; + + /* The hostname we publish on LLMNR and mDNS */ + char *full_hostname; + char *llmnr_hostname; + char *mdns_hostname; + DnsResourceKey *llmnr_host_ipv4_key; + DnsResourceKey *llmnr_host_ipv6_key; + DnsResourceKey *mdns_host_ipv4_key; + DnsResourceKey *mdns_host_ipv6_key; + + /* Watch the system hostname */ + int hostname_fd; + sd_event_source *hostname_event_source; + + sd_event_source *sigusr1_event_source; + sd_event_source *sigusr2_event_source; + sd_event_source *sigrtmin1_event_source; + + /* Statistics counters; reported by dns_manager_dump_statistics_json() and cleared by dns_manager_reset_statistics() */ unsigned n_transactions_total; + unsigned n_timeouts_total; + unsigned n_timeouts_served_stale_total; + unsigned n_failure_responses_total; + unsigned n_failure_responses_served_stale_total; + + /* One counter per DnssecVerdict value, incremented by manager_dnssec_verdict() */ unsigned n_dnssec_verdict[_DNSSEC_VERDICT_MAX]; + + /* Data from /etc/hosts */ + EtcHosts etc_hosts; + usec_t etc_hosts_last; + struct stat etc_hosts_stat; + bool read_etc_hosts; + + OrderedSet *dns_extra_stub_listeners; + + /* Local DNS stub on 127.0.0.53:53 */ + sd_event_source *dns_stub_udp_event_source; + sd_event_source *dns_stub_tcp_event_source; + + /* Local DNS proxy stub on 127.0.0.54:53 */ + sd_event_source *dns_proxy_stub_udp_event_source; + sd_event_source *dns_proxy_stub_tcp_event_source; + + Hashmap
*polkit_registry; + + VarlinkServer *varlink_server; + VarlinkServer *varlink_monitor_server; + + /* Varlink connections that manager_monitor_send() notifies */ Set *varlink_subscription; + + sd_event_source *clock_change_event_source; + + LIST_HEAD(SocketGraveyard, socket_graveyard); + SocketGraveyard *socket_graveyard_oldest; + size_t n_socket_graveyard; + + struct sigrtmin18_info sigrtmin18_info; +}; + +/* Manager */ + +int manager_new(Manager **ret); +/* Returns NULL after freeing (the implementation ends in "return mfree(m)") */ Manager* manager_free(Manager *m); + +int manager_start(Manager *m); + +uint32_t manager_find_mtu(Manager *m); + +int manager_monitor_send(Manager *m, int state, int rcode, int error, DnsQuestion *question_idna, DnsQuestion *question_utf8, DnsPacket *question_bypass, DnsQuestion *collected_questions, DnsAnswer *answer); + +int manager_write(Manager *m, int fd, DnsPacket *p); +int manager_send(Manager *m, int fd, int ifindex, int family, const union in_addr_union *destination, uint16_t port, const union in_addr_union *source, DnsPacket *p); +/* Returns 1 with a packet in *ret, 0 if nothing was received, negative errno on error */ int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret); + +int manager_find_ifindex(Manager *m, int family, const union in_addr_union *in_addr); +LinkAddress* manager_find_link_address(Manager *m, int family, const union in_addr_union *in_addr); + +void manager_refresh_rrs(Manager *m); +int manager_next_hostname(Manager *m); + +bool manager_packet_from_local_address(Manager *m, DnsPacket *p); +bool manager_packet_from_our_transaction(Manager *m, DnsPacket *p); + +DnsScope* manager_find_scope(Manager *m, DnsPacket *p); + +void manager_verify_all(Manager *m); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +/* For some reason we need some extra cmsg space on some kernels/archs.
One of those days we need to figure out why */ +#define EXTRA_CMSG_SPACE 1024 + +int manager_is_own_hostname(Manager *m, const char *name); + +/* Both functions add to the caller-provided set, allocating it if *servers / *domains is NULL */ int manager_compile_dns_servers(Manager *m, OrderedSet **servers); +int manager_compile_search_domains(Manager *m, OrderedSet **domains, int filter_route); + +DnssecMode manager_get_dnssec_mode(Manager *m); +bool manager_dnssec_supported(Manager *m); + +DnsOverTlsMode manager_get_dns_over_tls_mode(Manager *m); + +void manager_dnssec_verdict(Manager *m, DnssecVerdict verdict, const DnsResourceKey *key); + +bool manager_routable(Manager *m); + +void manager_flush_caches(Manager *m, int log_level); +void manager_reset_server_features(Manager *m); + +void manager_cleanup_saved_user(Manager *m); + +bool manager_next_dnssd_names(Manager *m); + +bool manager_server_is_stub(Manager *m, DnsServer *s); + +/* af may be AF_UNSPEC to use the socket's own family */ int socket_disable_pmtud(int fd, int af); + +int dns_manager_dump_statistics_json(Manager *m, JsonVariant **ret); + +void dns_manager_reset_statistics(Manager *m); diff --git a/src/resolve/resolved-mdns.c b/src/resolve/resolved-mdns.c new file mode 100644 index 0000000..3e6e83f --- /dev/null +++ b/src/resolve/resolved-mdns.c @@ -0,0 +1,614 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "resolved-manager.h" +#include "resolved-mdns.h" +#include "sort-util.h" + +/* Masks the MDNS_RR_CACHE_FLUSH_OR_QU bit(s) out of x */ #define CLEAR_CACHE_FLUSH(x) (~MDNS_RR_CACHE_FLUSH_OR_QU & (x)) + +/* Disable and release both mDNS event sources and close both mDNS sockets; the fields are overwritten with the values the release/close helpers return. */ void manager_mdns_stop(Manager *m) { + assert(m); + + m->mdns_ipv4_event_source = sd_event_source_disable_unref(m->mdns_ipv4_event_source); + m->mdns_ipv4_fd = safe_close(m->mdns_ipv4_fd); + + m->mdns_ipv6_event_source = sd_event_source_disable_unref(m->mdns_ipv6_event_source); + m->mdns_ipv6_fd = safe_close(m->mdns_ipv6_fd); +} + +/* Set up the mDNS IPv4 socket and, if IPv6 is enabled, the IPv6 socket. Does nothing if mDNS support is RESOLVE_SUPPORT_NO. If a socket cannot be bound because the address is in use (-EADDRINUSE), mDNS support is switched off, the sockets are closed again and 0 is returned. Other errors are returned as negative errno. */ int manager_mdns_start(Manager *m) { + int r; + + assert(m); + + if (m->mdns_support == RESOLVE_SUPPORT_NO) + return 0; + + r = manager_mdns_ipv4_fd(m); + if (r == -EADDRINUSE) + goto
eaddrinuse; + if (r < 0) + return r; + + if (socket_ipv6_is_enabled()) { + r = manager_mdns_ipv6_fd(m); + if (r == -EADDRINUSE) + goto eaddrinuse; + if (r < 0) + return r; + } + + return 0; + +eaddrinuse: + log_warning("Another mDNS responder prohibits binding the socket to the same port. Turning off mDNS support."); + m->mdns_support = RESOLVE_SUPPORT_NO; + manager_mdns_stop(m); + + return 0; +} + +static int mdns_rr_compare(DnsResourceRecord * const *a, DnsResourceRecord * const *b) { + DnsResourceRecord *x = *(DnsResourceRecord **) a, *y = *(DnsResourceRecord **) b; + size_t m; + int r; + + assert(x); + assert(y); + + r = CMP(CLEAR_CACHE_FLUSH(x->key->class), CLEAR_CACHE_FLUSH(y->key->class)); + if (r != 0) + return r; + + r = CMP(x->key->type, y->key->type); + if (r != 0) + return r; + + r = dns_resource_record_to_wire_format(x, false); + if (r < 0) { + log_warning_errno(r, "Can't wire-format RR: %m"); + return 0; + } + + r = dns_resource_record_to_wire_format(y, false); + if (r < 0) { + log_warning_errno(r, "Can't wire-format RR: %m"); + return 0; + } + + m = MIN(DNS_RESOURCE_RECORD_RDATA_SIZE(x), DNS_RESOURCE_RECORD_RDATA_SIZE(y)); + + r = memcmp(DNS_RESOURCE_RECORD_RDATA(x), DNS_RESOURCE_RECORD_RDATA(y), m); + if (r != 0) + return r; + + return CMP(DNS_RESOURCE_RECORD_RDATA_SIZE(x), DNS_RESOURCE_RECORD_RDATA_SIZE(y)); +} + +static int proposed_rrs_cmp(DnsResourceRecord **x, unsigned x_size, DnsResourceRecord **y, unsigned y_size) { + unsigned m; + int r; + + m = MIN(x_size, y_size); + for (unsigned i = 0; i < m; i++) { + r = mdns_rr_compare(&x[i], &y[i]); + if (r != 0) + return r; + } + + return CMP(x_size, y_size); +} + +static int mdns_packet_extract_matching_rrs(DnsPacket *p, DnsResourceKey *key, DnsResourceRecord ***ret_rrs) { + _cleanup_free_ DnsResourceRecord **list = NULL; + size_t i, n = 0, size = 0; + DnsResourceRecord *rr; + int r; + + assert(p); + assert(key); + assert(ret_rrs); + assert_return(DNS_PACKET_NSCOUNT(p) > 0, -EINVAL); + + i = 0; + 
DNS_ANSWER_FOREACH(rr, p->answer) { + if (i >= DNS_PACKET_ANCOUNT(p) && i < DNS_PACKET_ANCOUNT(p) + DNS_PACKET_NSCOUNT(p)) { + r = dns_resource_key_match_rr(key, rr, NULL); + if (r < 0) + return r; + if (r > 0) + size++; + } + i++; + } + + if (size == 0) { + *ret_rrs = NULL; + return 0; + } + + list = new(DnsResourceRecord *, size); + if (!list) + return -ENOMEM; + + i = 0; + DNS_ANSWER_FOREACH(rr, p->answer) { + if (i >= DNS_PACKET_ANCOUNT(p) && i < DNS_PACKET_ANCOUNT(p) + DNS_PACKET_NSCOUNT(p)) { + r = dns_resource_key_match_rr(key, rr, NULL); + if (r < 0) + return r; + if (r > 0) + list[n++] = rr; + } + i++; + } + + assert(n == size); + typesafe_qsort(list, size, mdns_rr_compare); + + *ret_rrs = TAKE_PTR(list); + + return size; +} + +static int mdns_do_tiebreak(DnsResourceKey *key, DnsAnswer *answer, DnsPacket *p) { + _cleanup_free_ DnsResourceRecord **our = NULL, **remote = NULL; + DnsResourceRecord *rr; + size_t i = 0, size; + int r; + + size = dns_answer_size(answer); + our = new(DnsResourceRecord *, size); + if (!our) + return -ENOMEM; + + DNS_ANSWER_FOREACH(rr, answer) + our[i++] = rr; + + typesafe_qsort(our, size, mdns_rr_compare); + + r = mdns_packet_extract_matching_rrs(p, key, &remote); + if (r < 0) + return r; + + if (proposed_rrs_cmp(remote, r, our, size) > 0) + return 1; + + return 0; +} + +static bool mdns_should_reply_using_unicast(DnsPacket *p) { + DnsQuestionItem *item; + + /* Work out if we should respond using multicast or unicast. 
*/ + + /* The query was a legacy "one-shot mDNS query", RFC 6762, sections 5.1 and 6.7 */ + if (p->sender_port != MDNS_PORT) + return true; + + /* The query was a "direct unicast query", RFC 6762, section 5.5 */ + switch (p->family) { + case AF_INET: + if (!in4_addr_equal(&p->destination.in, &MDNS_MULTICAST_IPV4_ADDRESS)) + return true; + break; + case AF_INET6: + if (!in6_addr_equal(&p->destination.in6, &MDNS_MULTICAST_IPV6_ADDRESS)) + return true; + break; + } + + /* All the questions in the query had a QU bit set, RFC 6762, section 5.4 */ + DNS_QUESTION_FOREACH_ITEM(item, p->question) + if (!FLAGS_SET(item->flags, DNS_QUESTION_WANTS_UNICAST_REPLY)) + return false; + + return true; +} + +static bool sender_on_local_subnet(DnsScope *s, DnsPacket *p) { + int r; + + /* Check whether the sender is on a local subnet. */ + + if (!s->link) + return false; + + LIST_FOREACH(addresses, a, s->link->addresses) { + if (a->family != p->family) + continue; + if (a->prefixlen == UCHAR_MAX) /* don't know subnet mask */ + continue; + + r = in_addr_prefix_covers(a->family, &a->in_addr, a->prefixlen, &p->sender); + if (r < 0) + log_debug_errno(r, "Failed to determine whether link address covers sender address: %m"); + if (r > 0) + return true; + } + + return false; +} + + +static int mdns_scope_process_query(DnsScope *s, DnsPacket *p) { + _cleanup_(dns_answer_unrefp) DnsAnswer *full_answer = NULL; + _cleanup_(dns_packet_unrefp) DnsPacket *reply = NULL; + DnsResourceKey *key = NULL; + DnsResourceRecord *rr; + bool tentative = false; + bool legacy_query = p->sender_port != MDNS_PORT; + bool unicast_reply; + int r; + + assert(s); + assert(p); + + r = dns_packet_extract(p); + if (r < 0) + return log_debug_errno(r, "Failed to extract resource records from incoming packet: %m"); + + /* TODO: Support Known-Answers only packets gracefully. 
*/ + if (dns_question_size(p->question) <= 0) + return 0; + + unicast_reply = mdns_should_reply_using_unicast(p); + if (unicast_reply && !sender_on_local_subnet(s, p)) { + /* RFC 6762, section 5.5 recommends silently ignoring unicast queries + * from senders outside the local network, so that we don't reveal our + * internal network structure to outsiders. */ + log_debug("Sender wants a unicast reply, but is not on a local subnet. Ignoring."); + return 0; + } + + DNS_QUESTION_FOREACH(key, p->question) { + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL, *soa = NULL; + DnsAnswerItem *item; + + r = dns_zone_lookup(&s->zone, key, 0, &answer, &soa, &tentative); + if (r < 0) + return log_debug_errno(r, "Failed to look up key: %m"); + + if (tentative && DNS_PACKET_NSCOUNT(p) > 0) { + /* + * A race condition detected with the probe packet from + * a remote host. + * Do simultaneous probe tiebreaking as described in + * RFC 6762, Section 8.2. In case we lost don't reply + * the question and withdraw conflicting RRs. + */ + r = mdns_do_tiebreak(key, answer, p); + if (r < 0) + return log_debug_errno(r, "Failed to do tiebreaking"); + + if (r > 0) { /* we lost */ + DNS_ANSWER_FOREACH(rr, answer) { + DnsZoneItem *i; + + i = dns_zone_get(&s->zone, rr); + if (i) + dns_zone_item_conflict(i); + } + + continue; + } + } + + if (dns_answer_isempty(answer)) + continue; + + /* Copy answer items from full_answer to answer, tweaking them if needed. */ + if (full_answer) { + r = dns_answer_reserve(&full_answer, dns_answer_size(answer)); + if (r < 0) + return log_debug_errno(r, "Failed to reserve space in answer"); + } else { + full_answer = dns_answer_new(dns_answer_size(answer)); + if (!full_answer) + return log_oom(); + } + + DNS_ANSWER_FOREACH_ITEM(item, answer) { + DnsAnswerFlags flags = item->flags | DNS_ANSWER_REFUSE_TTL_NO_MATCH; + /* The cache-flush bit must not be set in legacy unicast responses. + * See section 6.7 of RFC 6762. 
*/ + if (legacy_query) + flags &= ~DNS_ANSWER_CACHE_FLUSH; + r = dns_answer_add(full_answer, item->rr, item->ifindex, flags, item->rrsig); + if (r < 0) + return log_debug_errno(r, "Failed to extend answer: %m"); + } + } + + if (dns_answer_isempty(full_answer)) + return 0; + + r = dns_scope_make_reply_packet(s, DNS_PACKET_ID(p), DNS_RCODE_SUCCESS, + legacy_query ? p->question : NULL, full_answer, + NULL, false, &reply); + if (r < 0) + return log_debug_errno(r, "Failed to build reply packet: %m"); + + if (!ratelimit_below(&s->ratelimit)) + return 0; + + if (unicast_reply) { + reply->destination = p->sender; + reply->destination_port = p->sender_port; + } + r = dns_scope_emit_udp(s, -1, AF_UNSPEC, reply); + if (r < 0) + return log_debug_errno(r, "Failed to send reply packet: %m"); + + return 0; +} + +static int on_mdns_packet(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + Manager *m = userdata; + DnsScope *scope; + int r; + + r = manager_recv(m, fd, DNS_PROTOCOL_MDNS, &p); + if (r <= 0) + return r; + + if (manager_packet_from_local_address(m, p)) + return 0; + + scope = manager_find_scope(m, p); + if (!scope) { + log_debug("Got mDNS UDP packet on unknown scope. Ignoring."); + return 0; + } + + if (dns_packet_validate_reply(p) > 0) { + DnsResourceRecord *rr; + + log_debug("Got mDNS reply packet"); + + /* + * mDNS is different from regular DNS and LLMNR with regard to handling responses. + * While on other protocols, we can ignore every answer that doesn't match a question + * we broadcast earlier, RFC6762, section 18.1 recommends looking at and caching all + * incoming information, regardless of the DNS packet ID. + * + * Hence, extract the packet here, and try to find a transaction for answer the we got + * and complete it. Also store the new information in scope's cache. 
+ */ + r = dns_packet_extract(p); + if (r < 0) { + log_debug("mDNS packet extraction failed."); + return 0; + } + + dns_scope_check_conflicts(scope, p); + + DNS_ANSWER_FOREACH(rr, p->answer) { + const char *name; + + name = dns_resource_key_name(rr->key); + + /* If the received reply packet contains ANY record that is not .local + * or .in-addr.arpa or .ip6.arpa, we assume someone's playing tricks on + * us and discard the packet completely. */ + if (!(dns_name_endswith(name, "in-addr.arpa") > 0 || + dns_name_endswith(name, "ip6.arpa") > 0 || + dns_name_endswith(name, "local") > 0)) + return 0; + + if (rr->ttl == 0) { + log_debug("Got a goodbye packet"); + /* See the section 10.1 of RFC6762 */ + rr->ttl = 1; + } + } + + for (bool match = true; match;) { + match = false; + LIST_FOREACH(transactions_by_scope, t, scope->transactions) { + if (t->state != DNS_TRANSACTION_PENDING) + continue; + + r = dns_answer_match_key(p->answer, dns_transaction_key(t), NULL); + if (r <= 0) { + if (r < 0) + log_debug_errno(r, "Failed to match resource key, ignoring: %m"); + continue; + } + + /* This packet matches the transaction, let's pass it on as reply */ + dns_transaction_process_reply(t, p, false); + + /* The dns_transaction_process_reply() -> dns_transaction_complete() -> + * dns_query_candidate_stop() may free multiple transactions. Hence, restart + * the loop. 
*/ + match = true; + break; + } + } + + dns_cache_put( + &scope->cache, + scope->manager->enable_cache, + DNS_PROTOCOL_MDNS, + NULL, + DNS_PACKET_RCODE(p), + p->answer, + NULL, + false, + _DNSSEC_RESULT_INVALID, + UINT32_MAX, + p->family, + &p->sender, + scope->manager->stale_retention_usec); + + } else if (dns_packet_validate_query(p) > 0) { + log_debug("Got mDNS query packet for id %u", DNS_PACKET_ID(p)); + + r = mdns_scope_process_query(scope, p); + if (r < 0) { + log_debug_errno(r, "mDNS query processing failed: %m"); + return 0; + } + } else + log_debug("Invalid mDNS UDP packet."); + + return 0; +} + +int manager_mdns_ipv4_fd(Manager *m) { + union sockaddr_union sa = { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(MDNS_PORT), + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(m); + + if (m->mdns_ipv4_fd >= 0) + return m->mdns_ipv4_fd; + + s = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s < 0) + return log_error_errno(errno, "mDNS-IPv4: Failed to create socket: %m"); + + r = setsockopt_int(s, IPPROTO_IP, IP_TTL, 255); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set IP_TTL: %m"); + + r = setsockopt_int(s, IPPROTO_IP, IP_MULTICAST_TTL, 255); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set IP_MULTICAST_TTL: %m"); + + r = setsockopt_int(s, IPPROTO_IP, IP_MULTICAST_LOOP, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set IP_MULTICAST_LOOP: %m"); + + r = setsockopt_int(s, IPPROTO_IP, IP_PKTINFO, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set IP_PKTINFO: %m"); + + r = setsockopt_int(s, IPPROTO_IP, IP_RECVTTL, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set IP_RECVTTL: %m"); + + /* Disable Don't-Fragment bit in the IP header */ + r = setsockopt_int(s, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_DONT); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set IP_MTU_DISCOVER: %m"); + + /* See the section 15.1 of 
RFC6762 */ + /* first try to bind without SO_REUSEADDR to detect another mDNS responder */ + r = bind(s, &sa.sa, sizeof(sa.in)); + if (r < 0) { + if (errno != EADDRINUSE) + return log_error_errno(errno, "mDNS-IPv4: Failed to bind socket: %m"); + + log_warning("mDNS-IPv4: There appears to be another mDNS responder running, or previously systemd-resolved crashed with some outstanding transfers."); + + /* try again with SO_REUSEADDR */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set SO_REUSEADDR: %m"); + + r = bind(s, &sa.sa, sizeof(sa.in)); + if (r < 0) + return log_error_errno(errno, "mDNS-IPv4: Failed to bind socket: %m"); + } else { + /* enable SO_REUSEADDR for the case that the user really wants multiple mDNS responders */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to set SO_REUSEADDR: %m"); + } + + r = sd_event_add_io(m->event, &m->mdns_ipv4_event_source, s, EPOLLIN, on_mdns_packet, m); + if (r < 0) + return log_error_errno(r, "mDNS-IPv4: Failed to create event source: %m"); + + (void) sd_event_source_set_description(m->mdns_ipv4_event_source, "mdns-ipv4"); + + return m->mdns_ipv4_fd = TAKE_FD(s); +} + +int manager_mdns_ipv6_fd(Manager *m) { + union sockaddr_union sa = { + .in6.sin6_family = AF_INET6, + .in6.sin6_port = htobe16(MDNS_PORT), + }; + _cleanup_close_ int s = -EBADF; + int r; + + assert(m); + + if (m->mdns_ipv6_fd >= 0) + return m->mdns_ipv6_fd; + + s = socket(AF_INET6, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s < 0) + return log_error_errno(errno, "mDNS-IPv6: Failed to create socket: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, 255); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set IPV6_UNICAST_HOPS: %m"); + + /* RFC 6762, section 11 recommends setting the TTL of UDP packets to 255. 
*/ + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, 255); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set IPV6_MULTICAST_HOPS: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set IPV6_MULTICAST_LOOP: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_V6ONLY, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set IPV6_V6ONLY: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_RECVPKTINFO, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set IPV6_RECVPKTINFO: %m"); + + r = setsockopt_int(s, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set IPV6_RECVHOPLIMIT: %m"); + + /* See the section 15.1 of RFC6762 */ + /* first try to bind without SO_REUSEADDR to detect another mDNS responder */ + r = bind(s, &sa.sa, sizeof(sa.in6)); + if (r < 0) { + if (errno != EADDRINUSE) + return log_error_errno(errno, "mDNS-IPv6: Failed to bind socket: %m"); + + log_warning("mDNS-IPv6: There appears to be another mDNS responder running, or previously systemd-resolved crashed with some outstanding transfers."); + + /* try again with SO_REUSEADDR */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set SO_REUSEADDR: %m"); + + r = bind(s, &sa.sa, sizeof(sa.in6)); + if (r < 0) + return log_error_errno(errno, "mDNS-IPv6: Failed to bind socket: %m"); + } else { + /* enable SO_REUSEADDR for the case that the user really wants multiple mDNS responders */ + r = setsockopt_int(s, SOL_SOCKET, SO_REUSEADDR, true); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to set SO_REUSEADDR: %m"); + } + + r = sd_event_add_io(m->event, &m->mdns_ipv6_event_source, s, EPOLLIN, on_mdns_packet, m); + if (r < 0) + return log_error_errno(r, "mDNS-IPv6: Failed to create event source: %m"); + + (void) 
sd_event_source_set_description(m->mdns_ipv6_event_source, "mdns-ipv6"); + + return m->mdns_ipv6_fd = TAKE_FD(s); +} diff --git a/src/resolve/resolved-mdns.h b/src/resolve/resolved-mdns.h new file mode 100644 index 0000000..38ef180 --- /dev/null +++ b/src/resolve/resolved-mdns.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "resolved-manager.h" + +#define MDNS_PORT 5353 +#define MDNS_ANNOUNCE_DELAY (1 * USEC_PER_SEC) + +int manager_mdns_ipv4_fd(Manager *m); +int manager_mdns_ipv6_fd(Manager *m); + +void manager_mdns_stop(Manager *m); +int manager_mdns_start(Manager *m); diff --git a/src/resolve/resolved-resolv-conf.c b/src/resolve/resolved-resolv-conf.c new file mode 100644 index 0000000..2071e08 --- /dev/null +++ b/src/resolve/resolved-resolv-conf.c @@ -0,0 +1,434 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dns-domain.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "label-util.h" +#include "ordered-set.h" +#include "path-util.h" +#include "resolved-conf.h" +#include "resolved-dns-server.h" +#include "resolved-resolv-conf.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util-label.h" + +int manager_check_resolv_conf(const Manager *m) { + struct stat st, own; + + assert(m); + + /* This warns only when our stub listener is disabled and /etc/resolv.conf is a symlink to + * PRIVATE_STATIC_RESOLV_CONF. */ + + if (m->dns_stub_listener_mode != DNS_STUB_LISTENER_NO) + return 0; + + if (stat("/etc/resolv.conf", &st) < 0) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to stat /etc/resolv.conf: %m"); + } + + /* Is it symlinked to our own uplink file? 
*/ + if (stat(PRIVATE_STATIC_RESOLV_CONF, &own) >= 0 && + stat_inode_same(&st, &own)) + return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "DNSStubListener= is disabled, but /etc/resolv.conf is a symlink to " + PRIVATE_STATIC_RESOLV_CONF " which expects DNSStubListener= to be enabled."); + + return 0; +} + +static bool file_is_our_own(const struct stat *st) { + assert(st); + + FOREACH_STRING(path, + PRIVATE_UPLINK_RESOLV_CONF, + PRIVATE_STUB_RESOLV_CONF, + PRIVATE_STATIC_RESOLV_CONF) { + + struct stat own; + + /* Is it symlinked to our own uplink file? */ + if (stat(path, &own) >= 0 && + stat_inode_same(st, &own)) + return true; + } + + return false; +} + +int manager_read_resolv_conf(Manager *m) { + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + unsigned n = 0; + int r; + + assert(m); + + /* Reads the system /etc/resolv.conf, if it exists and is not + * symlinked to our own resolv.conf instance */ + + if (!m->read_resolv_conf) + return 0; + + r = stat("/etc/resolv.conf", &st); + if (r < 0) { + if (errno == ENOENT) + return 0; + + r = log_warning_errno(errno, "Failed to stat /etc/resolv.conf: %m"); + goto clear; + } + + /* Have we already seen the file? 
*/ + if (stat_inode_unmodified(&st, &m->resolv_conf_stat)) + return 0; + + if (file_is_our_own(&st)) + return 0; + + f = fopen("/etc/resolv.conf", "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + r = log_warning_errno(errno, "Failed to open /etc/resolv.conf: %m"); + goto clear; + } + + if (fstat(fileno(f), &st) < 0) { + r = log_error_errno(errno, "Failed to stat open file: %m"); + goto clear; + } + + if (file_is_our_own(&st)) + return 0; + + dns_server_mark_all(m->dns_servers); + dns_search_domain_mark_all(m->search_domains); + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *a; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) { + log_error_errno(r, "Failed to read /etc/resolv.conf: %m"); + goto clear; + } + if (r == 0) + break; + + n++; + + if (IN_SET(*line, '#', ';', 0)) + continue; + + a = first_word(line, "nameserver"); + if (a) { + r = manager_parse_dns_server_string_and_warn(m, DNS_SERVER_SYSTEM, a); + if (r < 0) + log_warning_errno(r, "Failed to parse DNS server address '%s', ignoring.", a); + + continue; + } + + a = first_word(line, "domain"); + if (!a) /* We treat "domain" lines, and "search" lines as equivalent, and add both to our list. */ + a = first_word(line, "search"); + if (a) { + r = manager_parse_search_domains_and_warn(m, a); + if (r < 0) + log_warning_errno(r, "Failed to parse search domain string '%s', ignoring.", a); + + continue; + } + + log_syntax(NULL, LOG_DEBUG, "/etc/resolv.conf", n, 0, "Ignoring resolv.conf line: %s", line); + } + + m->resolv_conf_stat = st; + + /* Flush out all servers and search domains that are still + * marked. Those are then ones that didn't appear in the new + * /etc/resolv.conf */ + dns_server_unlink_marked(m->dns_servers); + dns_search_domain_unlink_marked(m->search_domains); + + /* Whenever /etc/resolv.conf changes, start using the first + * DNS server of it. 
This is useful to deal with broken + * network managing implementations (like NetworkManager), + * that when connecting to a VPN place both the VPN DNS + * servers and the local ones in /etc/resolv.conf. Without + * resetting the DNS server to use back to the first entry we + * will continue to use the local one thus being unable to + * resolve VPN domains. */ + manager_set_dns_server(m, m->dns_servers); + + /* Unconditionally flush the cache when /etc/resolv.conf is + * modified, even if the data it contained was completely + * identical to the previous version we used. We do this + * because altering /etc/resolv.conf is typically done when + * the network configuration changes, and that should be + * enough to flush the global unicast DNS cache. */ + if (m->unicast_scope) + dns_cache_flush(&m->unicast_scope->cache); + + /* If /etc/resolv.conf changed, make sure to forget everything we learned about the DNS servers. After all we + * might now talk to a very different DNS server that just happens to have the same IP address as an old one + * (think 192.168.1.1). */ + dns_server_reset_features_all(m->dns_servers); + + return 0; + +clear: + dns_server_unlink_all(m->dns_servers); + dns_search_domain_unlink_all(m->search_domains); + return r; +} + +static void write_resolv_conf_server(DnsServer *s, FILE *f, unsigned *count) { + DnsScope *scope; + + assert(s); + assert(f); + assert(count); + + if (!dns_server_string(s)) { + log_warning("Out of memory, or invalid DNS address. 
Ignoring server."); + return; + } + + /* resolv.conf simply doesn't support any other ports than 53, hence there's nothing much we can + * do — we have to suppress these entries */ + if (dns_server_port(s) != 53) { + log_debug("DNS server %s with non-standard UDP port number, suppressing from generated resolv.conf.", dns_server_string(s)); + return; + } + + /* Check if the scope this DNS server belongs to is suitable as 'default' route for lookups; resolv.conf does + * not have a syntax to express that, so it must not appear as a global name server to avoid routing unrelated + * domains to it (which is a privacy violation, will most probably fail anyway, and adds unnecessary load) */ + scope = dns_server_scope(s); + if (scope && !dns_scope_is_default_route(scope)) { + log_debug("Scope of DNS server %s has only route-only domains, not using as global name server", dns_server_string(s)); + return; + } + + if (*count == MAXNS) + fputs("# Too many DNS servers configured, the following entries may be ignored.\n", f); + (*count)++; + + fprintf(f, "nameserver %s\n", dns_server_string(s)); +} + +static void write_resolv_conf_search( + OrderedSet *domains, + FILE *f) { + char *domain; + + assert(domains); + assert(f); + + fputs("search", f); + + ORDERED_SET_FOREACH(domain, domains) { + fputc(' ', f); + fputs(domain, f); + } + + fputs("\n", f); +} + +static int write_uplink_resolv_conf_contents(FILE *f, OrderedSet *dns, OrderedSet *domains) { + + fputs("# This is "PRIVATE_UPLINK_RESOLV_CONF" managed by man:systemd-resolved(8).\n" + "# Do not edit.\n" + "#\n" + "# This file might be symlinked as /etc/resolv.conf. If you're looking at\n" + "# /etc/resolv.conf and seeing this text, you have followed the symlink.\n" + "#\n" + "# This is a dynamic resolv.conf file for connecting local clients directly to\n" + "# all known uplink DNS servers. 
This file lists all configured search domains.\n" + "#\n" + "# Third party programs should typically not access this file directly, but only\n" + "# through the symlink at /etc/resolv.conf. To manage man:resolv.conf(5) in a\n" + "# different way, replace this symlink by a static file or a different symlink.\n" + "#\n" + "# See man:systemd-resolved.service(8) for details about the supported modes of\n" + "# operation for /etc/resolv.conf.\n" + "\n", f); + + if (ordered_set_isempty(dns)) + fputs("# No DNS servers known.\n", f); + else { + unsigned count = 0; + DnsServer *s; + + ORDERED_SET_FOREACH(s, dns) + write_resolv_conf_server(s, f, &count); + } + + if (ordered_set_isempty(domains)) + fputs("search .\n", f); /* Make sure that if the local hostname is chosen as fqdn this does not + * imply a search domain */ + else + write_resolv_conf_search(domains, f); + + return fflush_and_check(f); +} + +static int write_stub_resolv_conf_contents(FILE *f, OrderedSet *dns, OrderedSet *domains) { + fputs("# This is "PRIVATE_STUB_RESOLV_CONF" managed by man:systemd-resolved(8).\n" + "# Do not edit.\n" + "#\n" + "# This file might be symlinked as /etc/resolv.conf. If you're looking at\n" + "# /etc/resolv.conf and seeing this text, you have followed the symlink.\n" + "#\n" + "# This is a dynamic resolv.conf file for connecting local clients to the\n" + "# internal DNS stub resolver of systemd-resolved. This file lists all\n" + "# configured search domains.\n" + "#\n" + "# Run \"resolvectl status\" to see details about the uplink DNS servers\n" + "# currently in use.\n" + "#\n" + "# Third party programs should typically not access this file directly, but only\n" + "# through the symlink at /etc/resolv.conf. 
To manage man:resolv.conf(5) in a\n" + "# different way, replace this symlink by a static file or a different symlink.\n" + "#\n" + "# See man:systemd-resolved.service(8) for details about the supported modes of\n" + "# operation for /etc/resolv.conf.\n" + "\n" + "nameserver 127.0.0.53\n" + "options edns0 trust-ad\n", f); + + if (ordered_set_isempty(domains)) + fputs("search .\n", f); /* Make sure that if the local hostname is chosen as fqdn this does not + * imply a search domain */ + else + write_resolv_conf_search(domains, f); + + return fflush_and_check(f); +} + +int manager_write_resolv_conf(Manager *m) { + _cleanup_ordered_set_free_ OrderedSet *dns = NULL, *domains = NULL; + _cleanup_(unlink_and_freep) char *temp_path_uplink = NULL, *temp_path_stub = NULL; + _cleanup_fclose_ FILE *f_uplink = NULL, *f_stub = NULL; + int r; + + assert(m); + + /* Read the system /etc/resolv.conf first */ + (void) manager_read_resolv_conf(m); + + /* Add the full list to a set, to filter out duplicates */ + r = manager_compile_dns_servers(m, &dns); + if (r < 0) + return log_warning_errno(r, "Failed to compile list of DNS servers, ignoring: %m"); + + r = manager_compile_search_domains(m, &domains, false); + if (r < 0) + return log_warning_errno(r, "Failed to compile list of search domains, ignoring: %m"); + + r = fopen_temporary_label(PRIVATE_UPLINK_RESOLV_CONF, PRIVATE_UPLINK_RESOLV_CONF, &f_uplink, &temp_path_uplink); + if (r < 0) + return log_warning_errno(r, "Failed to open new %s for writing, ignoring: %m", PRIVATE_UPLINK_RESOLV_CONF); + + (void) fchmod(fileno(f_uplink), 0644); + + r = write_uplink_resolv_conf_contents(f_uplink, dns, domains); + if (r < 0) + return log_warning_errno(r, "Failed to write new %s, ignoring: %m", PRIVATE_UPLINK_RESOLV_CONF); + + if (m->dns_stub_listener_mode != DNS_STUB_LISTENER_NO) { + r = fopen_temporary_label(PRIVATE_STUB_RESOLV_CONF, PRIVATE_STUB_RESOLV_CONF, &f_stub, &temp_path_stub); + if (r < 0) + return log_warning_errno(r, "Failed to open 
new %s for writing, ignoring: %m", PRIVATE_STUB_RESOLV_CONF); + + (void) fchmod(fileno(f_stub), 0644); + + r = write_stub_resolv_conf_contents(f_stub, dns, domains); + if (r < 0) + return log_warning_errno(r, "Failed to write new %s, ignoring: %m", PRIVATE_STUB_RESOLV_CONF); + + r = conservative_rename(temp_path_stub, PRIVATE_STUB_RESOLV_CONF); + if (r < 0) + log_warning_errno(r, "Failed to move new %s into place, ignoring: %m", PRIVATE_STUB_RESOLV_CONF); + + temp_path_stub = mfree(temp_path_stub); /* free the string explicitly, so that we don't unlink anymore */ + } else { + _cleanup_free_ char *fname = NULL; + r = path_extract_filename(PRIVATE_UPLINK_RESOLV_CONF, &fname); + if (r < 0) + return log_warning_errno(r, "Failed to extract filename from path '" PRIVATE_UPLINK_RESOLV_CONF "', ignoring: %m"); + + r = symlink_atomic_label(fname, PRIVATE_STUB_RESOLV_CONF); + if (r < 0) + log_warning_errno(r, "Failed to symlink %s, ignoring: %m", PRIVATE_STUB_RESOLV_CONF); + } + + r = conservative_rename(temp_path_uplink, PRIVATE_UPLINK_RESOLV_CONF); + if (r < 0) + log_warning_errno(r, "Failed to move new %s into place: %m", PRIVATE_UPLINK_RESOLV_CONF); + + temp_path_uplink = mfree(temp_path_uplink); /* free the string explicitly, so that we don't unlink anymore */ + return r; +} + +int resolv_conf_mode(void) { + static const char * const table[_RESOLV_CONF_MODE_MAX] = { + [RESOLV_CONF_UPLINK] = PRIVATE_UPLINK_RESOLV_CONF, + [RESOLV_CONF_STUB] = PRIVATE_STUB_RESOLV_CONF, + [RESOLV_CONF_STATIC] = PRIVATE_STATIC_RESOLV_CONF, + }; + + struct stat system_st; + + if (stat("/etc/resolv.conf", &system_st) < 0) { + if (errno == ENOENT) + return RESOLV_CONF_MISSING; + + return -errno; + } + + for (ResolvConfMode m = 0; m < _RESOLV_CONF_MODE_MAX; m++) { + struct stat our_st; + + if (!table[m]) + continue; + + if (stat(table[m], &our_st) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to stat() %s, ignoring: %m", table[m]); + + continue; + } + + if 
(stat_inode_same(&system_st, &our_st)) + return m; + } + + return RESOLV_CONF_FOREIGN; +} + +static const char* const resolv_conf_mode_table[_RESOLV_CONF_MODE_MAX] = { + [RESOLV_CONF_UPLINK] = "uplink", + [RESOLV_CONF_STUB] = "stub", + [RESOLV_CONF_STATIC] = "static", + [RESOLV_CONF_MISSING] = "missing", + [RESOLV_CONF_FOREIGN] = "foreign", +}; +DEFINE_STRING_TABLE_LOOKUP(resolv_conf_mode, ResolvConfMode); diff --git a/src/resolve/resolved-resolv-conf.h b/src/resolve/resolved-resolv-conf.h new file mode 100644 index 0000000..8c0dee8 --- /dev/null +++ b/src/resolve/resolved-resolv-conf.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "resolved-manager.h" + +int manager_check_resolv_conf(const Manager *m); +int manager_read_resolv_conf(Manager *m); +int manager_write_resolv_conf(Manager *m); + +typedef enum ResolvConfMode { + RESOLV_CONF_UPLINK, + RESOLV_CONF_STUB, + RESOLV_CONF_STATIC, + RESOLV_CONF_FOREIGN, + RESOLV_CONF_MISSING, + _RESOLV_CONF_MODE_MAX, + _RESOLV_CONF_MODE_INVALID = -EINVAL, +} ResolvConfMode; + +int resolv_conf_mode(void); + +const char* resolv_conf_mode_to_string(ResolvConfMode m) _const_; +ResolvConfMode resolv_conf_mode_from_string(const char *s) _pure_; diff --git a/src/resolve/resolved-socket-graveyard.c b/src/resolve/resolved-socket-graveyard.c new file mode 100644 index 0000000..9605d72 --- /dev/null +++ b/src/resolve/resolved-socket-graveyard.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "resolved-socket-graveyard.h" + +#define SOCKET_GRAVEYARD_USEC (5 * USEC_PER_SEC) +#define SOCKET_GRAVEYARD_MAX 100 + +/* This implements a socket "graveyard" for UDP sockets. If a socket fd is added to the graveyard it is kept + * open for a couple of more seconds, expecting one reply. Once the reply is received the fd is closed + * immediately, or if none is received it is closed after the timeout. Why all this? 
So that if we contact a + * DNS server, and it doesn't reply instantly, and we lose interest in the response and thus close the fd, we + * don't end up sending back an ICMP error once the server responds but we aren't listening anymore. (See + * https://github.com/systemd/systemd/issues/17421 for further information.) + * + * Note that we don't allocate any timer event source to clear up the graveyard once the socket's timeout is + * reached. Instead we operate lazily: we close old entries when adding a new fd to the graveyard, or + * whenever any code runs manager_socket_graveyard_process() — which the DNS transaction code does right + * before allocating a new UDP socket. */ + +static SocketGraveyard* socket_graveyard_free(SocketGraveyard *g) { + if (!g) + return NULL; + + if (g->manager) { + assert(g->manager->n_socket_graveyard > 0); + g->manager->n_socket_graveyard--; + + if (g->manager->socket_graveyard_oldest == g) + g->manager->socket_graveyard_oldest = g->graveyard_prev; + + LIST_REMOVE(graveyard, g->manager->socket_graveyard, g); + + assert((g->manager->n_socket_graveyard > 0) == !!g->manager->socket_graveyard); + assert((g->manager->n_socket_graveyard > 0) == !!g->manager->socket_graveyard_oldest); + } + + if (g->io_event_source) { + log_debug("Closing graveyard socket fd %i", sd_event_source_get_io_fd(g->io_event_source)); + sd_event_source_disable_unref(g->io_event_source); + } + + return mfree(g); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(SocketGraveyard*, socket_graveyard_free); + +void manager_socket_graveyard_process(Manager *m) { + usec_t n = USEC_INFINITY; + + assert(m); + + while (m->socket_graveyard_oldest) { + SocketGraveyard *g = m->socket_graveyard_oldest; + + if (n == USEC_INFINITY) + assert_se(sd_event_now(m->event, CLOCK_BOOTTIME, &n) >= 0); + + if (g->deadline > n) + break; + + socket_graveyard_free(g); + } +} + +void manager_socket_graveyard_clear(Manager *m) { + assert(m); + + while (m->socket_graveyard) + 
socket_graveyard_free(m->socket_graveyard); +} + +static int on_io_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + SocketGraveyard *g = ASSERT_PTR(userdata); + + /* An IO event happened on the graveyard fd. We don't actually care which event that is, and we don't + * read any incoming packet off the socket. We just close the fd, that's enough to not trigger the + * ICMP unreachable port event */ + + socket_graveyard_free(g); + return 0; +} +/* Frees the oldest graveyard entries until fewer than SOCKET_GRAVEYARD_MAX remain, so that one more can be added. */ +static void manager_socket_graveyard_make_room(Manager *m) { + assert(m); + + while (m->n_socket_graveyard >= SOCKET_GRAVEYARD_MAX) + socket_graveyard_free(m->socket_graveyard_oldest); +} +/* Adds fd to the graveyard: expired entries are processed and room is made first, then a new entry with a deadline of now + SOCKET_GRAVEYARD_USEC is prepended to the list and an EPOLLIN IO source is attached that frees the entry on any event. On success the IO source is told to own the fd and 0 is returned; on failure a negative errno is returned and the half-built entry is freed again by the cleanup handler. */ +int manager_add_socket_to_graveyard(Manager *m, int fd) { + _cleanup_(socket_graveyard_freep) SocketGraveyard *g = NULL; + int r; + + assert(m); + assert(fd >= 0); + + manager_socket_graveyard_process(m); + manager_socket_graveyard_make_room(m); + + g = new(SocketGraveyard, 1); + if (!g) + return log_oom(); + + *g = (SocketGraveyard) { + .manager = m, + }; + + LIST_PREPEND(graveyard, m->socket_graveyard, g); + if (!m->socket_graveyard_oldest) + m->socket_graveyard_oldest = g; + + m->n_socket_graveyard++; + + assert_se(sd_event_now(m->event, CLOCK_BOOTTIME, &g->deadline) >= 0); + g->deadline += SOCKET_GRAVEYARD_USEC; + + r = sd_event_add_io(m->event, &g->io_event_source, fd, EPOLLIN, on_io_event, g); + if (r < 0) + return log_error_errno(r, "Failed to create graveyard IO source: %m"); + + r = sd_event_source_set_io_fd_own(g->io_event_source, true); + if (r < 0) + return log_error_errno(r, "Failed to enable graveyard IO source fd ownership: %m"); + + (void) sd_event_source_set_description(g->io_event_source, "graveyard"); + + log_debug("Added socket %i to graveyard", fd); + + TAKE_PTR(g); + return 0; +} diff --git a/src/resolve/resolved-socket-graveyard.h b/src/resolve/resolved-socket-graveyard.h new file mode 100644 index 0000000..50c6aad --- /dev/null +++ b/src/resolve/resolved-socket-graveyard.h @@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct SocketGraveyard SocketGraveyard; + +#include "resolved-manager.h" + +struct SocketGraveyard { + Manager *manager; + usec_t deadline; + sd_event_source *io_event_source; + LIST_FIELDS(SocketGraveyard, graveyard); +}; + +void manager_socket_graveyard_process(Manager *m); +void manager_socket_graveyard_clear(Manager *m); + +int manager_add_socket_to_graveyard(Manager *m, int fd); diff --git a/src/resolve/resolved-util.c b/src/resolve/resolved-util.c new file mode 100644 index 0000000..00abada --- /dev/null +++ b/src/resolve/resolved-util.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dns-def.h" +#include "dns-domain.h" +#include "hostname-util.h" +#include "idn-util.h" +#include "resolved-util.h" +#include "utf8.h" + +int resolve_system_hostname(char **full_hostname, char **first_label) { + _cleanup_free_ char *h = NULL, *n = NULL; +#if HAVE_LIBIDN2 + _cleanup_free_ char *utf8 = NULL; +#elif HAVE_LIBIDN + int k; +#endif + char label[DNS_LABEL_MAX]; + const char *p, *decoded; + int r; + + /* Return the full hostname in *full_hostname, if nonnull. + * + * Extract and normalize the first label of the locally configured hostname, check it's not + * "localhost", and return it in *first_label, if nonnull. 
*/ + + r = gethostname_strict(&h); + if (r < 0) + return log_debug_errno(r, "Can't determine system hostname: %m"); + + p = h; + r = dns_label_unescape(&p, label, sizeof label, 0); + if (r < 0) + return log_debug_errno(r, "Failed to unescape hostname: %m"); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Couldn't find a single label in hostname."); + +#if HAVE_LIBIDN || HAVE_LIBIDN2 + r = dlopen_idn(); + if (r < 0) { + log_debug_errno(r, "Failed to initialize IDN support, ignoring: %m"); + decoded = label; /* no decoding */ + } else +#endif + { +#if HAVE_LIBIDN2 + r = sym_idn2_to_unicode_8z8z(label, &utf8, 0); + if (r != IDN2_OK) + return log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), + "Failed to undo IDNA: %s", sym_idn2_strerror(r)); + assert(utf8_is_valid(utf8)); + + r = strlen(utf8); + decoded = utf8; +#elif HAVE_LIBIDN + k = dns_label_undo_idna(label, r, label, sizeof label); + if (k < 0) + return log_debug_errno(k, "Failed to undo IDNA: %m"); + if (k > 0) + r = k; + + if (!utf8_is_valid(label)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "System hostname is not UTF-8 clean."); + decoded = label; +#else + decoded = label; /* no decoding */ +#endif + } + + r = dns_label_escape_new(decoded, r, &n); + if (r < 0) + return log_debug_errno(r, "Failed to escape hostname: %m"); + + if (is_localhost(n)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "System hostname is 'localhost', ignoring."); + + if (full_hostname) + *full_hostname = TAKE_PTR(h); + if (first_label) + *first_label = TAKE_PTR(n); + return 0; +} diff --git a/src/resolve/resolved-util.h b/src/resolve/resolved-util.h new file mode 100644 index 0000000..446b7c9 --- /dev/null +++ b/src/resolve/resolved-util.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int resolve_system_hostname(char **full_hostname, char **first_label); diff --git a/src/resolve/resolved-varlink.c b/src/resolve/resolved-varlink.c new file mode 100644 index 0000000..3e178a6 
--- /dev/null +++ b/src/resolve/resolved-varlink.c @@ -0,0 +1,796 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "glyph-util.h" +#include "in-addr-util.h" +#include "resolved-dns-synthesize.h" +#include "resolved-varlink.h" +#include "socket-netlink.h" +#include "varlink-io.systemd.Resolve.h" +#include "varlink-io.systemd.Resolve.Monitor.h" + +typedef struct LookupParameters { + int ifindex; + uint64_t flags; + int family; + union in_addr_union address; + size_t address_size; + char *name; +} LookupParameters; + +static void lookup_parameters_destroy(LookupParameters *p) { + assert(p); + free(p->name); +} + +static int reply_query_state(DnsQuery *q) { + + assert(q); + assert(q->varlink_request); + + switch (q->state) { + + case DNS_TRANSACTION_NO_SERVERS: + return varlink_error(q->varlink_request, "io.systemd.Resolve.NoNameServers", NULL); + + case DNS_TRANSACTION_TIMEOUT: + return varlink_error(q->varlink_request, "io.systemd.Resolve.QueryTimedOut", NULL); + + case DNS_TRANSACTION_ATTEMPTS_MAX_REACHED: + return varlink_error(q->varlink_request, "io.systemd.Resolve.MaxAttemptsReached", NULL); + + case DNS_TRANSACTION_INVALID_REPLY: + return varlink_error(q->varlink_request, "io.systemd.Resolve.InvalidReply", NULL); + + case DNS_TRANSACTION_ERRNO: + return varlink_error_errno(q->varlink_request, q->answer_errno); + + case DNS_TRANSACTION_ABORTED: + return varlink_error(q->varlink_request, "io.systemd.Resolve.QueryAborted", NULL); + + case DNS_TRANSACTION_DNSSEC_FAILED: + return varlink_errorb(q->varlink_request, "io.systemd.Resolve.DNSSECValidationFailed", + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("result", JSON_BUILD_STRING(dnssec_result_to_string(q->answer_dnssec_result))))); + + case DNS_TRANSACTION_NO_TRUST_ANCHOR: + return varlink_error(q->varlink_request, "io.systemd.Resolve.NoTrustAnchor", NULL); + + case DNS_TRANSACTION_RR_TYPE_UNSUPPORTED: + return varlink_error(q->varlink_request, "io.systemd.Resolve.ResourceRecordTypeUnsupported", NULL); + + 
case DNS_TRANSACTION_NETWORK_DOWN: + return varlink_error(q->varlink_request, "io.systemd.Resolve.NetworkDown", NULL); + + case DNS_TRANSACTION_NO_SOURCE: + return varlink_error(q->varlink_request, "io.systemd.Resolve.NoSource", NULL); + + case DNS_TRANSACTION_STUB_LOOP: + return varlink_error(q->varlink_request, "io.systemd.Resolve.StubLoop", NULL); + + case DNS_TRANSACTION_NOT_FOUND: + /* We return this as NXDOMAIN. This is only generated when a host doesn't implement LLMNR/TCP, and we + * thus quickly know that we cannot resolve an in-addr.arpa or ip6.arpa address. */ + return varlink_errorb(q->varlink_request, "io.systemd.Resolve.DNSError", + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("rcode", JSON_BUILD_INTEGER(DNS_RCODE_NXDOMAIN)))); + + case DNS_TRANSACTION_RCODE_FAILURE: + return varlink_errorb(q->varlink_request, "io.systemd.Resolve.DNSError", + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("rcode", JSON_BUILD_INTEGER(q->answer_rcode)))); + + case DNS_TRANSACTION_NULL: + case DNS_TRANSACTION_PENDING: + case DNS_TRANSACTION_VALIDATING: + case DNS_TRANSACTION_SUCCESS: + default: + assert_not_reached(); + } +} +/* Disconnect hook of the main varlink server: if the vanished link still has a live query attached as userdata, complete that query as aborted. */ +static void vl_on_disconnect(VarlinkServer *s, Varlink *link, void *userdata) { + DnsQuery *q; + + assert(s); + assert(link); + + q = varlink_get_userdata(link); + if (!q) + return; + + if (!DNS_TRANSACTION_IS_LIVE(q->state)) + return; + + log_debug("Client of active query vanished, aborting query."); + dns_query_complete(q, DNS_TRANSACTION_ABORTED); +} +/* Disconnect hook of the monitor varlink server: remove the link from the subscription set and, if it was in there, release the reference held for it. */ +static void vl_on_notification_disconnect(VarlinkServer *s, Varlink *link, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + assert(link); + + Varlink *removed_link = set_remove(m->varlink_subscription, link); + if (removed_link) { + varlink_unref(removed_link); + log_debug("%u monitor clients remain active", set_size(m->varlink_subscription)); + } +} + +static bool validate_and_mangle_flags( + const char *name, + uint64_t *flags, + uint64_t ok) { + + assert(flags); + + /* This checks that the 
specified client-provided flags parameter actually makes sense, and mangles + * it slightly. Specifically: + * + * 1. We check that only the protocol flags and a bunch of NO_XYZ flags are on at most, plus the + * method-specific flags specified in 'ok'. + * + * 2. If no protocols are enabled we automatically convert that to "all protocols are enabled". + * + * The second rule means that clients can just pass 0 as flags for the common case, and all supported + * protocols are enabled. Moreover it's useful so that clients do not have to be aware of all + * protocols implemented in resolved, but can use 0 as protocols flags set as indicator for + * "everything". + */ + + if (*flags & ~(SD_RESOLVED_PROTOCOLS_ALL| + SD_RESOLVED_NO_CNAME| + SD_RESOLVED_NO_VALIDATE| + SD_RESOLVED_NO_SYNTHESIZE| + SD_RESOLVED_NO_CACHE| + SD_RESOLVED_NO_ZONE| + SD_RESOLVED_NO_TRUST_ANCHOR| + SD_RESOLVED_NO_NETWORK| + SD_RESOLVED_NO_STALE| + ok)) + return false; + + if ((*flags & SD_RESOLVED_PROTOCOLS_ALL) == 0) /* If no protocol is enabled, enable all */ + *flags |= SD_RESOLVED_PROTOCOLS_ALL; + + /* If the SD_RESOLVED_NO_SEARCH flag is acceptable, and the query name is dot-suffixed, turn off + * search domains. Note that DNS name normalization drops the dot suffix, hence we propagate this + * into the flags field as early as we can. 
*/ + if (name && FLAGS_SET(ok, SD_RESOLVED_NO_SEARCH) && dns_name_dot_suffixed(name) > 0) + *flags |= SD_RESOLVED_NO_SEARCH; + + return true; +} + +static void vl_method_resolve_hostname_complete(DnsQuery *query) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *canonical = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = query; + _cleanup_free_ char *normalized = NULL; + DnsResourceRecord *rr; + DnsQuestion *question; + int ifindex, r; + + assert(q); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + r = reply_query_state(q); + goto finish; + } + + r = dns_query_process_cname_many(q); + if (r == -ELOOP) { + r = varlink_error(q->varlink_request, "io.systemd.Resolve.CNAMELoop", NULL); + goto finish; + } + if (r < 0) + goto finish; + if (r == DNS_QUERY_CNAME) { + /* This was a cname, and the query was restarted. */ + TAKE_PTR(q); + return; + } + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, q->answer) { + _cleanup_(json_variant_unrefp) JsonVariant *entry = NULL; + int family; + const void *p; + + r = dns_question_matches_rr(question, rr, DNS_SEARCH_DOMAIN_NAME(q->answer_search_domain)); + if (r < 0) + goto finish; + if (r == 0) + continue; + + if (rr->key->type == DNS_TYPE_A) { + family = AF_INET; + p = &rr->a.in_addr; + } else if (rr->key->type == DNS_TYPE_AAAA) { + family = AF_INET6; + p = &rr->aaaa.in6_addr; + } else { + r = -EAFNOSUPPORT; + goto finish; + } + + r = json_build(&entry, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_CONDITION(ifindex > 0, "ifindex", JSON_BUILD_INTEGER(ifindex)), + JSON_BUILD_PAIR("family", JSON_BUILD_INTEGER(family)), + JSON_BUILD_PAIR("address", JSON_BUILD_BYTE_ARRAY(p, FAMILY_ADDRESS_SIZE(family))))); + if (r < 0) + goto finish; + + if (!canonical) + canonical = dns_resource_record_ref(rr); + + r = json_variant_append_array(&array, entry); + if (r < 0) + goto finish; + } + + if 
(json_variant_is_blank_object(array)) { + r = varlink_error(q->varlink_request, "io.systemd.Resolve.NoSuchResourceRecord", NULL); + goto finish; + } + + assert(canonical); + r = dns_name_normalize(dns_resource_key_name(canonical->key), 0, &normalized); + if (r < 0) + goto finish; + + r = varlink_replyb(q->varlink_request, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("addresses", JSON_BUILD_VARIANT(array)), + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(normalized)), + JSON_BUILD_PAIR("flags", JSON_BUILD_INTEGER(dns_query_reply_flags_make(q))))); +finish: + if (r < 0) { + log_full_errno(ERRNO_IS_DISCONNECT(r) ? LOG_DEBUG : LOG_ERR, r, "Failed to send hostname reply: %m"); + r = varlink_error_errno(q->varlink_request, r); + } +} + +static int parse_as_address(Varlink *link, LookupParameters *p) { + _cleanup_free_ char *canonical = NULL; + int r, ff, parsed_ifindex, ifindex; + union in_addr_union parsed; + + assert(link); + assert(p); + + /* Check if this parses as literal address. If so, just parse it and return that, do not involve networking */ + r = in_addr_ifindex_from_string_auto(p->name, &ff, &parsed, &parsed_ifindex); + if (r < 0) + return 0; /* not a literal address */ + + /* Make sure the data we parsed matches what is requested */ + if ((p->family != AF_UNSPEC && ff != p->family) || + (p->ifindex > 0 && parsed_ifindex > 0 && parsed_ifindex != p->ifindex)) + return varlink_error(link, "io.systemd.Resolve.NoSuchResourceRecord", NULL); + + ifindex = parsed_ifindex > 0 ? 
parsed_ifindex : p->ifindex; + + /* Reformat the address as string, to return as canonicalized name */ + r = in_addr_ifindex_to_string(ff, &parsed, ifindex, &canonical); + if (r < 0) + return r; + + return varlink_replyb( + link, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("addresses", + JSON_BUILD_ARRAY( + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_CONDITION(ifindex > 0, "ifindex", JSON_BUILD_INTEGER(ifindex)), + JSON_BUILD_PAIR("family", JSON_BUILD_INTEGER(ff)), + JSON_BUILD_PAIR("address", JSON_BUILD_BYTE_ARRAY(&parsed, FAMILY_ADDRESS_SIZE(ff)))))), + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(canonical)), + JSON_BUILD_PAIR("flags", JSON_BUILD_INTEGER(SD_RESOLVED_FLAGS_MAKE(dns_synthesize_protocol(p->flags), ff, true, true)| + SD_RESOLVED_SYNTHETIC)))); +} + +static int vl_method_resolve_hostname(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch dispatch_table[] = { + { "ifindex", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(LookupParameters, ifindex), 0 }, + { "name", JSON_VARIANT_STRING, json_dispatch_string, offsetof(LookupParameters, name), JSON_MANDATORY }, + { "family", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(LookupParameters, family), 0 }, + { "flags", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(LookupParameters, flags), 0 }, + {} + }; + + _cleanup_(dns_question_unrefp) DnsQuestion *question_idna = NULL, *question_utf8 = NULL; + _cleanup_(lookup_parameters_destroy) LookupParameters p = { + .family = AF_UNSPEC, + }; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + Manager *m; + int r; + + assert(link); + + m = varlink_server_get_userdata(varlink_get_server(link)); + assert(m); + + if (FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) + return -EINVAL; + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (p.ifindex < 0) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("ifindex")); + + r = 
dns_name_is_valid(p.name); + if (r < 0) + return r; + if (r == 0) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("name")); + + if (!IN_SET(p.family, AF_UNSPEC, AF_INET, AF_INET6)) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("family")); + + if (!validate_and_mangle_flags(p.name, &p.flags, SD_RESOLVED_NO_SEARCH)) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("flags")); + + r = parse_as_address(link, &p); + if (r != 0) + return r; + + r = dns_question_new_address(&question_utf8, p.family, p.name, false); + if (r < 0) + return r; + + r = dns_question_new_address(&question_idna, p.family, p.name, true); + if (r < 0 && r != -EALREADY) + return r; + + r = dns_query_new(m, &q, question_utf8, question_idna ?: question_utf8, NULL, p.ifindex, p.flags); + if (r < 0) + return r; + + q->varlink_request = varlink_ref(link); + varlink_set_userdata(link, q); + q->request_family = p.family; + q->complete = vl_method_resolve_hostname_complete; + + r = dns_query_go(q); + if (r < 0) + return r; + + TAKE_PTR(q); + return 1; +} + +static int json_dispatch_address(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + LookupParameters *p = ASSERT_PTR(userdata); + union in_addr_union buf = {}; + JsonVariant *i; + size_t n, k = 0; + + assert(variant); + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name)); + + n = json_variant_elements(variant); + if (!IN_SET(n, 4, 16)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is array of unexpected size.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(i, variant) { + int64_t b; + + if (!json_variant_is_integer(i)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "Element %zu of JSON field '%s' is not an integer.", k, strna(name)); + + b = json_variant_integer(i); + if (b < 0 || b > 0xff) + return 
json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), + "Element %zu of JSON field '%s' is out of range 0%s255.", + k, strna(name), special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + buf.bytes[k++] = (uint8_t) b; + } + + p->address = buf; + p->address_size = k; + + return 0; +} + +static void vl_method_resolve_address_complete(DnsQuery *query) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + _cleanup_(dns_query_freep) DnsQuery *q = query; + DnsQuestion *question; + DnsResourceRecord *rr; + int ifindex, r; + + assert(q); + + if (q->state != DNS_TRANSACTION_SUCCESS) { + r = reply_query_state(q); + goto finish; + } + + r = dns_query_process_cname_many(q); + if (r == -ELOOP) { + r = varlink_error(q->varlink_request, "io.systemd.Resolve.CNAMELoop", NULL); + goto finish; + } + if (r < 0) + goto finish; + if (r == DNS_QUERY_CNAME) { + /* This was a cname, and the query was restarted. */ + TAKE_PTR(q); + return; + } + + question = dns_query_question_for_protocol(q, q->answer_protocol); + + DNS_ANSWER_FOREACH_IFINDEX(rr, ifindex, q->answer) { + _cleanup_free_ char *normalized = NULL; + + r = dns_question_matches_rr(question, rr, NULL); + if (r < 0) + goto finish; + if (r == 0) + continue; + + r = dns_name_normalize(rr->ptr.name, 0, &normalized); + if (r < 0) + goto finish; + + r = json_variant_append_arrayb( + &array, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_CONDITION(ifindex > 0, "ifindex", JSON_BUILD_INTEGER(ifindex)), + JSON_BUILD_PAIR("name", JSON_BUILD_STRING(normalized)))); + if (r < 0) + goto finish; + } + + if (json_variant_is_blank_object(array)) { + r = varlink_error(q->varlink_request, "io.systemd.Resolve.NoSuchResourceRecord", NULL); + goto finish; + } + + r = varlink_replyb(q->varlink_request, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("names", JSON_BUILD_VARIANT(array)), + JSON_BUILD_PAIR("flags", JSON_BUILD_INTEGER(dns_query_reply_flags_make(q))))); +finish: + if (r < 0) { + log_full_errno(ERRNO_IS_DISCONNECT(r) ? 
LOG_DEBUG : LOG_ERR, r, "Failed to send address reply: %m"); + r = varlink_error_errno(q->varlink_request, r); + } +} + +static int vl_method_resolve_address(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + static const JsonDispatch dispatch_table[] = { + { "ifindex", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(LookupParameters, ifindex), 0 }, + { "family", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int, offsetof(LookupParameters, family), JSON_MANDATORY }, + { "address", JSON_VARIANT_ARRAY, json_dispatch_address, 0, JSON_MANDATORY }, + { "flags", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(LookupParameters, flags), 0 }, + {} + }; + + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + _cleanup_(lookup_parameters_destroy) LookupParameters p = { + .family = AF_UNSPEC, + }; + _cleanup_(dns_query_freep) DnsQuery *q = NULL; + Manager *m; + int r; + + assert(link); + + m = varlink_server_get_userdata(varlink_get_server(link)); + assert(m); + + if (FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) + return -EINVAL; + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + if (p.ifindex < 0) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("ifindex")); + + if (!IN_SET(p.family, AF_INET, AF_INET6)) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("family")); + + if (FAMILY_ADDRESS_SIZE(p.family) != p.address_size) + return varlink_error(link, "io.systemd.Resolve.BadAddressSize", NULL); + + if (!validate_and_mangle_flags(NULL, &p.flags, 0)) + return varlink_error_invalid_parameter(link, JSON_VARIANT_STRING_CONST("flags")); + + r = dns_question_new_reverse(&question, p.family, &p.address); + if (r < 0) + return r; + + r = dns_query_new(m, &q, question, question, NULL, p.ifindex, p.flags|SD_RESOLVED_NO_SEARCH); + if (r < 0) + return r; + + q->varlink_request = varlink_ref(link); + varlink_set_userdata(link, q); + + 
q->request_family = p.family; + q->request_address = p.address; + q->complete = vl_method_resolve_address_complete; + + r = dns_query_go(q); + if (r < 0) + return r; + + TAKE_PTR(q); + return 1; +} +/* Handler for io.systemd.Resolve.Monitor.SubscribeQueryResults: rejects calls that lack the 'more' flag or carry any parameters, sends a "ready" notification, then adds the link to the manager's subscription set. Returns 1 once subscribed. */ +static int vl_method_subscribe_query_results(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Manager *m; + int r; + + assert(link); + + m = ASSERT_PTR(varlink_server_get_userdata(varlink_get_server(link))); + + /* if the client didn't set the more flag, it is using us incorrectly */ + if (!FLAGS_SET(flags, VARLINK_METHOD_MORE)) + return varlink_error_invalid_parameter(link, NULL); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + /* Send a ready message to the connecting client, to indicate that we are now listening, and all + * queries issued after the point the client sees this will also be reported to the client. */ + r = varlink_notifyb(link, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("ready", JSON_BUILD_BOOLEAN(true)))); + if (r < 0) + return log_error_errno(r, "Failed to report monitor to be established: %m"); + + r = set_ensure_put(&m->varlink_subscription, NULL, link); + if (r < 0) + return log_error_errno(r, "Failed to add subscription to set: %m"); + varlink_ref(link); /* reference for the set entry; released in vl_on_notification_disconnect() */ + + log_debug("%u clients now attached for varlink notifications", set_size(m->varlink_subscription)); + + return 1; +} + +static int vl_method_dump_cache(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *list = NULL; + Manager *m; + int r; + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + m = ASSERT_PTR(varlink_server_get_userdata(varlink_get_server(link))); + + LIST_FOREACH(scopes, s, m->dns_scopes) { + _cleanup_(json_variant_unrefp) JsonVariant *j = NULL; + + r = dns_scope_dump_cache_to_json(s, &j); + if (r < 0) + return r; + + r = 
json_variant_append_array(&list, j); + if (r < 0) + return r; + } + + if (!list) { + r = json_variant_new_array(&list, NULL, 0); + if (r < 0) + return r; + } + + return varlink_replyb(link, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("dump", JSON_BUILD_VARIANT(list)))); +} + +static int dns_server_dump_state_to_json_list(DnsServer *server, JsonVariant **list) { + _cleanup_(json_variant_unrefp) JsonVariant *j = NULL; + int r; + + assert(list); + assert(server); + + r = dns_server_dump_state_to_json(server, &j); + if (r < 0) + return r; + + return json_variant_append_array(list, j); +} + +static int vl_method_dump_server_state(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *list = NULL; + Manager *m; + int r; + Link *l; + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + m = ASSERT_PTR(varlink_server_get_userdata(varlink_get_server(link))); + + LIST_FOREACH(servers, server, m->dns_servers) { + r = dns_server_dump_state_to_json_list(server, &list); + if (r < 0) + return r; + } + + LIST_FOREACH(servers, server, m->fallback_dns_servers) { + r = dns_server_dump_state_to_json_list(server, &list); + if (r < 0) + return r; + } + + HASHMAP_FOREACH(l, m->links) + LIST_FOREACH(servers, server, l->dns_servers) { + r = dns_server_dump_state_to_json_list(server, &list); + if (r < 0) + return r; + } + + if (!list) { + r = json_variant_new_array(&list, NULL, 0); + if (r < 0) + return r; + } + + return varlink_replyb(link, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("dump", JSON_BUILD_VARIANT(list)))); +} + +static int vl_method_dump_statistics(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *j = NULL; + Manager *m; + int r; + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + m = 
ASSERT_PTR(varlink_server_get_userdata(varlink_get_server(link))); + + r = dns_manager_dump_statistics_json(m, &j); + if (r < 0) + return r; + + return varlink_replyb(link, JSON_BUILD_VARIANT(j)); +} + +static int vl_method_reset_statistics(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + Manager *m; + + assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + m = ASSERT_PTR(varlink_server_get_userdata(varlink_get_server(link))); + + dns_manager_reset_statistics(m); + + return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT); +} + +static int varlink_monitor_server_init(Manager *m) { + _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL; + int r; + + assert(m); + + if (m->varlink_monitor_server) + return 0; + + r = varlink_server_new(&server, VARLINK_SERVER_ROOT_ONLY); + if (r < 0) + return log_error_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(server, m); + + r = varlink_server_add_interface(server, &vl_interface_io_systemd_Resolve_Monitor); + if (r < 0) + return log_error_errno(r, "Failed to add Resolve.Monitor interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + server, + "io.systemd.Resolve.Monitor.SubscribeQueryResults", vl_method_subscribe_query_results, + "io.systemd.Resolve.Monitor.DumpCache", vl_method_dump_cache, + "io.systemd.Resolve.Monitor.DumpServerState", vl_method_dump_server_state, + "io.systemd.Resolve.Monitor.DumpStatistics", vl_method_dump_statistics, + "io.systemd.Resolve.Monitor.ResetStatistics", vl_method_reset_statistics); + if (r < 0) + return log_error_errno(r, "Failed to register varlink methods: %m"); + + r = varlink_server_bind_disconnect(server, vl_on_notification_disconnect); + if (r < 0) + return log_error_errno(r, "Failed to register varlink disconnect handler: %m"); + + r = varlink_server_listen_address(server, 
"/run/systemd/resolve/io.systemd.Resolve.Monitor", 0600); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_attach_event(server, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + m->varlink_monitor_server = TAKE_PTR(server); + + return 0; +} + +static int varlink_main_server_init(Manager *m) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + + if (m->varlink_server) + return 0; + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID); + if (r < 0) + return log_error_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_add_interface(s, &vl_interface_io_systemd_Resolve); + if (r < 0) + return log_error_errno(r, "Failed to add Resolve interface to varlink server: %m"); + + r = varlink_server_bind_method_many( + s, + "io.systemd.Resolve.ResolveHostname", vl_method_resolve_hostname, + "io.systemd.Resolve.ResolveAddress", vl_method_resolve_address); + if (r < 0) + return log_error_errno(r, "Failed to register varlink methods: %m"); + + r = varlink_server_bind_disconnect(s, vl_on_disconnect); + if (r < 0) + return log_error_errno(r, "Failed to register varlink disconnect handler: %m"); + + r = varlink_server_listen_address(s, "/run/systemd/resolve/io.systemd.Resolve", 0666); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + m->varlink_server = TAKE_PTR(s); + return 0; +} + +int manager_varlink_init(Manager *m) { + int r; + + r = varlink_main_server_init(m); + if (r < 0) + return r; + + r = varlink_monitor_server_init(m); + if (r < 0) + return r; + + return 0; +} + +void manager_varlink_done(Manager *m) { + assert(m); + 
+ m->varlink_server = varlink_server_unref(m->varlink_server); + m->varlink_monitor_server = varlink_server_unref(m->varlink_monitor_server); +} diff --git a/src/resolve/resolved-varlink.h b/src/resolve/resolved-varlink.h new file mode 100644 index 0000000..57fdfe9 --- /dev/null +++ b/src/resolve/resolved-varlink.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "resolved-manager.h" + +int manager_varlink_init(Manager *m); +void manager_varlink_done(Manager *m); diff --git a/src/resolve/resolved.c b/src/resolve/resolved.c new file mode 100644 index 0000000..1625c51 --- /dev/null +++ b/src/resolve/resolved.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-event.h" + +#include "bus-log-control-api.h" +#include "capability-util.h" +#include "daemon-util.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "resolved-bus.h" +#include "resolved-conf.h" +#include "resolved-manager.h" +#include "resolved-resolv-conf.h" +#include "selinux-util.h" +#include "service-util.h" +#include "signal-util.h" +#include "user-util.h" + /* Body of the daemon, wrapped by DEFINE_MAIN_FUNCTION(run) at the end of the file. It parses the command line, when started as root creates /run/systemd/resolve and drops privileges, blocks the signals listed below, creates and starts the Manager, calls manager_write_resolv_conf() and manager_check_resolv_conf(), reduces the capability bounding set and then runs the event loop. Returns 0 when the loop ends without error; failures are returned, most of them after being logged. */ +static int run(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *m = NULL; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL; + int r; + + log_setup(); + + r = service_parse_argv("systemd-resolved.service", + "Provide name resolution with caching using DNS, mDNS, LLMNR.", + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object), + argc, argv); + if (r <= 0) /* NOTE(review): a result of 0 is returned as-is too; presumably it means nothing is left to do (e.g. help output). Confirm against service_parse_argv(). */ + return r; + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + + /* Drop privileges, but only if we have been started as root. If we are not running as root we assume most + * privileges are already dropped and we can't create our directory. 
*/ + if (getuid() == 0) { + const char *user = "systemd-resolve"; + uid_t uid; + gid_t gid; + + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, "Cannot resolve user name %s: %m", user); + + /* As we're root, we can create the directory where resolv.conf will live */ + r = mkdir_safe_label("/run/systemd/resolve", 0755, uid, gid, MKDIR_WARN_MODE); + if (r < 0) + return log_error_errno(r, "Could not create runtime directory: %m"); + + /* Drop privileges, but keep three caps. Note that we drop two of those too, later on (see below) */ + r = drop_privileges(uid, gid, + (UINT64_C(1) << CAP_NET_RAW)| /* needed for SO_BINDTODEVICE */ + (UINT64_C(1) << CAP_NET_BIND_SERVICE)| /* needed to bind on port 53 */ + (UINT64_C(1) << CAP_SETPCAP) /* needed in order to drop the caps later */); + if (r < 0) + return log_error_errno(r, "Failed to drop privileges: %m"); + } + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, SIGUSR1, SIGUSR2, SIGRTMIN+1, SIGRTMIN+18, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Could not create manager: %m"); + + r = manager_start(m); + if (r < 0) + return log_error_errno(r, "Failed to start manager: %m"); + + /* Write out a default resolv.conf right at startup to avoid a dangling symlink. The (void) casts here and on the next call ignore the results on purpose. */ + (void) manager_write_resolv_conf(m); + + (void) manager_check_resolv_conf(m); + + /* Let's drop the remaining caps now: of the three capabilities kept by drop_privileges() above only CAP_NET_RAW is named in this mask. NOTE(review): presumably the mask is the set to retain; confirm against capability_bounding_set_drop(). */ + r = capability_bounding_set_drop((UINT64_C(1) << CAP_NET_RAW), true); + if (r < 0) + return log_error_errno(r, "Failed to drop remaining caps: %m"); + + notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/resolve/resolved.conf.in b/src/resolve/resolved.conf.in new file mode 100644 index 0000000..0031b15 --- /dev/null +++ b/src/resolve/resolved.conf.in @@ -0,0 +1,37 @@ +# This file is part of systemd. 
+# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/resolved.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/resolved.conf' to display the full config. +# +# See resolved.conf(5) for details. + +[Resolve] +# Some examples of DNS servers which may be used for DNS= and FallbackDNS=: +# Cloudflare: 1.1.1.1#cloudflare-dns.com 1.0.0.1#cloudflare-dns.com 2606:4700:4700::1111#cloudflare-dns.com 2606:4700:4700::1001#cloudflare-dns.com +# Google: 8.8.8.8#dns.google 8.8.4.4#dns.google 2001:4860:4860::8888#dns.google 2001:4860:4860::8844#dns.google +# Quad9: 9.9.9.9#dns.quad9.net 149.112.112.112#dns.quad9.net 2620:fe::fe#dns.quad9.net 2620:fe::9#dns.quad9.net +#DNS= +#FallbackDNS={{DNS_SERVERS}} +#Domains= +#DNSSEC={{DEFAULT_DNSSEC_MODE_STR}} +#DNSOverTLS={{DEFAULT_DNS_OVER_TLS_MODE_STR}} +#MulticastDNS={{DEFAULT_MDNS_MODE_STR}} +#LLMNR={{DEFAULT_LLMNR_MODE_STR}} +#Cache=yes +#CacheFromLocalhost=no +#DNSStubListener=yes +#DNSStubListenerExtra= +#ReadEtcHosts=yes +#ResolveUnicastSingleLabel=no +#StaleRetentionSec=0 diff --git a/src/resolve/test-dns-packet.c b/src/resolve/test-dns-packet.c new file mode 100644 index 0000000..ca09b08 --- /dev/null +++ b/src/resolve/test-dns-packet.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "fileio.h" +#include "glob-util.h" 
+#include "log.h" +#include "macro.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-rr.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "unaligned.h" + +#define HASH_KEY SD_ID128_MAKE(d3,1e,48,90,4b,fa,4c,fe,af,9d,d5,a1,d7,2e,8a,b1) + /* Asserts that dns_resource_record_copy() yields a record that compares equal to rr and formats to the same string. */ +static void verify_rr_copy(DnsResourceRecord *rr) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *copy = NULL; + const char *a, *b; + + assert_se(copy = dns_resource_record_copy(rr)); + assert_se(dns_resource_record_equal(copy, rr) > 0); + + assert_se(a = dns_resource_record_to_string(rr)); + assert_se(b = dns_resource_record_to_string(copy)); + + assert_se(streq(a, b)); +} + /* Hashes rr with siphash24 keyed by the fixed HASH_KEY, so that the results of two calls can be compared. */ +static uint64_t hash(DnsResourceRecord *rr) { + struct siphash state; + + siphash24_init(&state, HASH_KEY.bytes); + dns_resource_record_hash_func(rr, &state); + return siphash24_finalize(&state); +} + /* Walks filename as a sequence of entries, each an 8-byte little-endian length followed by that many bytes which are parsed as one resource record. Every record is copied, printed and hashed, then serialized with dns_resource_record_to_wire_format(rr, canonical) and parsed a second time into rr2. */ +static void test_packet_from_file(const char* filename, bool canonical) { + _cleanup_free_ char *data = NULL; + size_t data_size, packet_size, offset; + + assert_se(read_full_file(filename, &data, &data_size) >= 0); + assert_se(data); + assert_se(data_size > 8); + + log_info("============== %s %s==============", filename, canonical ? 
"canonical " : ""); + + for (offset = 0; offset < data_size; offset += 8 + packet_size) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL, *p2 = NULL; + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL, *rr2 = NULL; + const char *s, *s2; + uint64_t hash1, hash2; + /* NOTE(review): the length is read before the bounds assertion two statements further down, so a file ending in fewer than 8 stray bytes is read past data_size before the assertion can fire. */ + packet_size = unaligned_read_le64(data + offset); + assert_se(packet_size > 0); + assert_se(offset + 8 + packet_size <= data_size); + + assert_se(dns_packet_new(&p, DNS_PROTOCOL_DNS, 0, DNS_PACKET_SIZE_MAX) >= 0); + + assert_se(dns_packet_append_blob(p, data + offset + 8, packet_size, NULL) >= 0); + assert_se(dns_packet_read_rr(p, &rr, NULL, NULL) >= 0); + + verify_rr_copy(rr); + + s = dns_resource_record_to_string(rr); + assert_se(s); + puts(s); + + hash1 = hash(rr); + + assert_se(dns_resource_record_to_wire_format(rr, canonical) >= 0); + + assert_se(dns_packet_new(&p2, DNS_PROTOCOL_DNS, 0, DNS_PACKET_SIZE_MAX) >= 0); + assert_se(dns_packet_append_blob(p2, rr->wire_format, rr->wire_format_size, NULL) >= 0); + assert_se(dns_packet_read_rr(p2, &rr2, NULL, NULL) >= 0); + /* NOTE(review): everything below inspects rr again; rr2, the record parsed back from the wire format, is never looked at, so the round trip itself is not compared. Possibly rr2 was intended; confirm before changing, since the canonical form may not render identically. */ + verify_rr_copy(rr); + + s2 = dns_resource_record_to_string(rr); + assert_se(s2); + assert_se(streq(s, s2)); + + hash2 = hash(rr); + assert_se(hash1 == hash2); + } +} + /* Checks dns_resource_record_get_cname_target() against a CNAME and a DNAME that both sit at "quux.foobar" and point to "wuff.wuff": the CNAME only answers for exactly "quux.foobar", the DNAME maps "yupp.quux.foobar" to "yupp.wuff.wuff", and every other name tried yields -EUNATCH. */ +static void test_dns_resource_record_get_cname_target(void) { + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *cname = NULL, *dname = NULL; + _cleanup_free_ char *target = NULL; + + assert_se(cname = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_CNAME, "quux.foobar")); + assert_se(cname->cname.name = strdup("wuff.wuff")); + + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "waldo"), cname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "foobar"), cname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "quux"), cname, &target) == -EUNATCH); + 
assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, ""), cname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "."), cname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "nope.quux.foobar"), cname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "quux.foobar"), cname, &target) == 0); + assert_se(streq(target, "wuff.wuff")); + target = mfree(target); + + assert_se(dname = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNAME, "quux.foobar")); + assert_se(dname->dname.name = strdup("wuff.wuff")); + + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "waldo"), dname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "foobar"), dname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "quux"), dname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, ""), dname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "."), dname, &target) == -EUNATCH); + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "yupp.quux.foobar"), dname, &target) == 0); + assert_se(streq(target, "yupp.wuff.wuff")); + target = mfree(target); + + assert_se(dns_resource_record_get_cname_target(&DNS_RESOURCE_KEY_CONST(DNS_CLASS_IN, DNS_TYPE_A, "quux.foobar"), cname, &target) == 0); + assert_se(streq(target, "wuff.wuff")); +} + /* Runs test_packet_from_file() twice per input file (first non-canonical, then canonical) and finally the CNAME/DNAME target test. Files are taken from the command line if any are given, otherwise from the test-resolve *.pkts glob in the test data directory. */ +int main(int argc, char **argv) { + int N; + _cleanup_globfree_ glob_t g = {}; + char **fnames; + + test_setup_logging(LOG_DEBUG); + + 
if (argc >= 2) { + N = argc - 1; + fnames = argv + 1; + } else { + _cleanup_free_ char *pkts_glob = NULL; + assert_se(get_testdata_dir("test-resolve/*.pkts", &pkts_glob) >= 0); + assert_se(glob(pkts_glob, GLOB_NOSORT, NULL, &g) == 0); + N = g.gl_pathc; + fnames = g.gl_pathv; + } + + for (int i = 0; i < N; i++) { + test_packet_from_file(fnames[i], false); + puts(""); + test_packet_from_file(fnames[i], true); + if (i + 1 < N) + puts(""); + } + + test_dns_resource_record_get_cname_target(); + + return EXIT_SUCCESS; +} diff --git a/src/resolve/test-dnssec-complex.c b/src/resolve/test-dnssec-complex.c new file mode 100644 index 0000000..05a5f07 --- /dev/null +++ b/src/resolve/test-dnssec-complex.c @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-bus.h" + +#include "af-list.h" +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-locator.h" +#include "dns-type.h" +#include "random-util.h" +#include "resolved-def.h" +#include "string-util.h" +#include "tests.h" +#include "time-util.h" + +static void prefix_random(const char *name, char **ret) { + uint64_t i, u; + char *m = NULL; + + u = 1 + (random_u64() & 3); + + for (i = 0; i < u; i++) { + _cleanup_free_ char *b = NULL; + char *x; + + assert_se(asprintf(&b, "x%" PRIu64 "x", random_u64())); + x = strjoin(b, ".", name); + assert_se(x); + + free(m); + m = x; + } + + *ret = m; + } + +static void test_rr_lookup(sd_bus *bus, const char *name, uint16_t type, const char *result) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *m = NULL; + int r; + + /* If the name starts with a dot, we prefix one to three random labels */ + if (startswith(name, ".")) { + prefix_random(name + 1, &m); + name = m; + } + + assert_se(bus_message_new_method_call(bus, &req, bus_resolve_mgr, "ResolveRecord") >= 0); + + assert_se(sd_bus_message_append(req, "isqqt", 
0, name, DNS_CLASS_IN, type, UINT64_C(0)) >= 0); + + r = sd_bus_call(bus, req, SD_RESOLVED_QUERY_TIMEOUT_USEC, &error, &reply); + /* a failure is only accepted when one was expected, and it has to carry the expected error name */ + if (r < 0) { + assert_se(result); + assert_se(sd_bus_error_has_name(&error, result)); + log_info("[OK] %s/%s resulted in <%s>.", name, dns_type_to_string(type), error.name); + } else { + assert_se(!result); + log_info("[OK] %s/%s succeeded.", name, dns_type_to_string(type)); + } +} + /* Calls ResolveHostname over the bus for name in the given address family. With result == NULL the call has to succeed; otherwise it has to fail with the bus error named by result. */ +static void test_hostname_lookup(sd_bus *bus, const char *name, int family, const char *result) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *m = NULL; + const char *af; + int r; + /* af is only used in the log lines below */ + af = family == AF_UNSPEC ? "AF_UNSPEC" : af_to_name(family); + + /* If the name starts with a dot, drop the dot and let prefix_random() put random labels in front of the rest */ + if (startswith(name, ".")) { + prefix_random(name + 1, &m); + name = m; + } + + assert_se(bus_message_new_method_call(bus, &req, bus_resolve_mgr, "ResolveHostname") >= 0); + + assert_se(sd_bus_message_append(req, "isit", 0, name, family, UINT64_C(0)) >= 0); + + r = sd_bus_call(bus, req, SD_RESOLVED_QUERY_TIMEOUT_USEC, &error, &reply); + + if (r < 0) { + assert_se(result); + assert_se(sd_bus_error_has_name(&error, result)); + log_info("[OK] %s/%s resulted in <%s>.", name, af, error.name); + } else { + assert_se(!result); + log_info("[OK] %s/%s succeeded.", name, af); + } + +} + +int main(int argc, char* argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + + /* Note that this is a manual test as it requires: + * + * Full network access + * A DNSSEC capable DNS server + * That zones contacted are still set up as they were when I wrote this. 
+ */ + + test_setup_logging(LOG_DEBUG); + + assert_se(sd_bus_open_system(&bus) >= 0); + /* Each call below names the expected bus error, or NULL when the lookup has to succeed. Names written with a leading "." get random labels put in front by the lookup helpers. */ + /* Normally signed */ + test_rr_lookup(bus, "www.eurid.eu", DNS_TYPE_A, NULL); + test_hostname_lookup(bus, "www.eurid.eu", AF_UNSPEC, NULL); + + test_rr_lookup(bus, "sigok.verteiltesysteme.net", DNS_TYPE_A, NULL); + test_hostname_lookup(bus, "sigok.verteiltesysteme.net", AF_UNSPEC, NULL); + + /* Normally signed, NODATA */ + test_rr_lookup(bus, "www.eurid.eu", DNS_TYPE_RP, BUS_ERROR_NO_SUCH_RR); + test_rr_lookup(bus, "sigok.verteiltesysteme.net", DNS_TYPE_RP, BUS_ERROR_NO_SUCH_RR); + + /* Invalid signature */ + test_rr_lookup(bus, "sigfail.verteiltesysteme.net", DNS_TYPE_A, BUS_ERROR_DNSSEC_FAILED); + test_hostname_lookup(bus, "sigfail.verteiltesysteme.net", AF_INET, BUS_ERROR_DNSSEC_FAILED); + + /* Invalid signature, RSA, wildcard */ + test_rr_lookup(bus, ".wilda.rhybar.0skar.cz", DNS_TYPE_A, BUS_ERROR_DNSSEC_FAILED); + test_hostname_lookup(bus, ".wilda.rhybar.0skar.cz", AF_INET, BUS_ERROR_DNSSEC_FAILED); + + /* Invalid signature, ECDSA, wildcard */ + test_rr_lookup(bus, ".wilda.rhybar.ecdsa.0skar.cz", DNS_TYPE_A, BUS_ERROR_DNSSEC_FAILED); + test_hostname_lookup(bus, ".wilda.rhybar.ecdsa.0skar.cz", AF_INET, BUS_ERROR_DNSSEC_FAILED); + + /* Missing DS for DNSKEY */ + test_rr_lookup(bus, "www.dnssec-bogus.sg", DNS_TYPE_A, BUS_ERROR_DNSSEC_FAILED); + test_hostname_lookup(bus, "www.dnssec-bogus.sg", AF_INET, BUS_ERROR_DNSSEC_FAILED); + + /* NXDOMAIN in NSEC domain */ + test_rr_lookup(bus, "hhh.nasa.gov", DNS_TYPE_A, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, "hhh.nasa.gov", AF_UNSPEC, BUS_ERROR_DNS_NXDOMAIN); + test_rr_lookup(bus, "_pgpkey-https._tcp.hkps.pool.sks-keyservers.net", DNS_TYPE_SRV, BUS_ERROR_DNS_NXDOMAIN); + + /* wildcard, NSEC zone */ + test_rr_lookup(bus, ".wilda.nsec.0skar.cz", DNS_TYPE_A, NULL); + test_hostname_lookup(bus, ".wilda.nsec.0skar.cz", AF_INET, NULL); + + /* wildcard, NSEC zone, NODATA */ + test_rr_lookup(bus, ".wilda.nsec.0skar.cz", 
DNS_TYPE_RP, BUS_ERROR_NO_SUCH_RR); + + /* wildcard, NSEC3 zone */ + test_rr_lookup(bus, ".wilda.0skar.cz", DNS_TYPE_A, NULL); + test_hostname_lookup(bus, ".wilda.0skar.cz", AF_INET, NULL); + + /* wildcard, NSEC3 zone, NODATA */ + test_rr_lookup(bus, ".wilda.0skar.cz", DNS_TYPE_RP, BUS_ERROR_NO_SUCH_RR); + + /* wildcard, NSEC zone, CNAME */ + test_rr_lookup(bus, ".wild.nsec.0skar.cz", DNS_TYPE_A, NULL); + test_hostname_lookup(bus, ".wild.nsec.0skar.cz", AF_UNSPEC, NULL); + test_hostname_lookup(bus, ".wild.nsec.0skar.cz", AF_INET, NULL); + + /* wildcard, NSEC zone, NODATA, CNAME */ + test_rr_lookup(bus, ".wild.nsec.0skar.cz", DNS_TYPE_RP, BUS_ERROR_NO_SUCH_RR); + + /* wildcard, NSEC3 zone, CNAME */ + test_rr_lookup(bus, ".wild.0skar.cz", DNS_TYPE_A, NULL); + test_hostname_lookup(bus, ".wild.0skar.cz", AF_UNSPEC, NULL); + test_hostname_lookup(bus, ".wild.0skar.cz", AF_INET, NULL); + + /* wildcard, NSEC3 zone, NODATA, CNAME */ + test_rr_lookup(bus, ".wild.0skar.cz", DNS_TYPE_RP, BUS_ERROR_NO_SUCH_RR); + + /* NODATA due to empty non-terminal in NSEC domain */ + test_rr_lookup(bus, "herndon.nasa.gov", DNS_TYPE_A, BUS_ERROR_NO_SUCH_RR); + test_hostname_lookup(bus, "herndon.nasa.gov", AF_UNSPEC, BUS_ERROR_NO_SUCH_RR); + test_hostname_lookup(bus, "herndon.nasa.gov", AF_INET, BUS_ERROR_NO_SUCH_RR); + test_hostname_lookup(bus, "herndon.nasa.gov", AF_INET6, BUS_ERROR_NO_SUCH_RR); + + /* NXDOMAIN in NSEC root zone: */ + test_rr_lookup(bus, "jasdhjas.kjkfgjhfjg", DNS_TYPE_A, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, "jasdhjas.kjkfgjhfjg", AF_UNSPEC, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, "jasdhjas.kjkfgjhfjg", AF_INET, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, "jasdhjas.kjkfgjhfjg", AF_INET6, BUS_ERROR_DNS_NXDOMAIN); + + /* NXDOMAIN in NSEC3 .com zone: */ + test_rr_lookup(bus, "kjkfgjhfjgsdfdsfd.com", DNS_TYPE_A, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, "kjkfgjhfjgsdfdsfd.com", AF_INET, BUS_ERROR_DNS_NXDOMAIN); + 
test_hostname_lookup(bus, "kjkfgjhfjgsdfdsfd.com", AF_INET6, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, "kjkfgjhfjgsdfdsfd.com", AF_UNSPEC, BUS_ERROR_DNS_NXDOMAIN); + + /* Unsigned A */ + test_rr_lookup(bus, "poettering.de", DNS_TYPE_A, NULL); + test_rr_lookup(bus, "poettering.de", DNS_TYPE_AAAA, NULL); + test_hostname_lookup(bus, "poettering.de", AF_UNSPEC, NULL); + test_hostname_lookup(bus, "poettering.de", AF_INET, NULL); + test_hostname_lookup(bus, "poettering.de", AF_INET6, NULL); + +#if HAVE_LIBIDN2 || HAVE_LIBIDN + /* Unsigned A with IDNA conversion necessary */ + test_hostname_lookup(bus, "pöttering.de", AF_UNSPEC, NULL); + test_hostname_lookup(bus, "pöttering.de", AF_INET, NULL); + test_hostname_lookup(bus, "pöttering.de", AF_INET6, NULL); +#endif + + /* DNAME, pointing to NXDOMAIN. NOTE(review): the test_rr_lookup() names below read "doesnexist" while the test_hostname_lookup() ones read "doesntexist"; both are expected to be NXDOMAIN, so the difference looks harmless. */ + test_rr_lookup(bus, ".ireallyhpoethisdoesnexist.xn--kprw13d.", DNS_TYPE_A, BUS_ERROR_DNS_NXDOMAIN); + test_rr_lookup(bus, ".ireallyhpoethisdoesnexist.xn--kprw13d.", DNS_TYPE_RP, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, ".ireallyhpoethisdoesntexist.xn--kprw13d.", AF_UNSPEC, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, ".ireallyhpoethisdoesntexist.xn--kprw13d.", AF_INET, BUS_ERROR_DNS_NXDOMAIN); + test_hostname_lookup(bus, ".ireallyhpoethisdoesntexist.xn--kprw13d.", AF_INET6, BUS_ERROR_DNS_NXDOMAIN); + + return 0; +} diff --git a/src/resolve/test-dnssec.c b/src/resolve/test-dnssec.c new file mode 100644 index 0000000..d325b53 --- /dev/null +++ b/src/resolve/test-dnssec.c @@ -0,0 +1,787 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#if HAVE_GCRYPT +# include +#endif + +#include "alloc-util.h" +#include "hexdecoct.h" +#include "resolved-dns-dnssec.h" +#include "resolved-dns-rr.h" +#include "string-util.h" +#include "tests.h" + +TEST(dnssec_verify_dns_key) { + static const uint8_t ds1_fprint[] = { + 0x46, 0x8B, 0xC8, 0xDD, 0xC7, 0xE8, 0x27, 0x03, 0x40, 0xBB, 0x8A, 0x1F, 0x3B, 0x2E, 0x45, 0x9D, + 0x80, 0x67, 
0x14, 0x01, + }; + static const uint8_t ds2_fprint[] = { + 0x8A, 0xEE, 0x80, 0x47, 0x05, 0x5F, 0x83, 0xD1, 0x48, 0xBA, 0x8F, 0xF6, 0xDD, 0xA7, 0x60, 0xCE, + 0x94, 0xF7, 0xC7, 0x5E, 0x52, 0x4C, 0xF2, 0xE9, 0x50, 0xB9, 0x2E, 0xCB, 0xEF, 0x96, 0xB9, 0x98, + }; + static const uint8_t dnskey_blob[] = { + 0x03, 0x01, 0x00, 0x01, 0xa8, 0x12, 0xda, 0x4f, 0xd2, 0x7d, 0x54, 0x14, 0x0e, 0xcc, 0x5b, 0x5e, + 0x45, 0x9c, 0x96, 0x98, 0xc0, 0xc0, 0x85, 0x81, 0xb1, 0x47, 0x8c, 0x7d, 0xe8, 0x39, 0x50, 0xcc, + 0xc5, 0xd0, 0xf2, 0x00, 0x81, 0x67, 0x79, 0xf6, 0xcc, 0x9d, 0xad, 0x6c, 0xbb, 0x7b, 0x6f, 0x48, + 0x97, 0x15, 0x1c, 0xfd, 0x0b, 0xfe, 0xd3, 0xd7, 0x7d, 0x9f, 0x81, 0x26, 0xd3, 0xc5, 0x65, 0x49, + 0xcf, 0x46, 0x62, 0xb0, 0x55, 0x6e, 0x47, 0xc7, 0x30, 0xef, 0x51, 0xfb, 0x3e, 0xc6, 0xef, 0xde, + 0x27, 0x3f, 0xfa, 0x57, 0x2d, 0xa7, 0x1d, 0x80, 0x46, 0x9a, 0x5f, 0x14, 0xb3, 0xb0, 0x2c, 0xbe, + 0x72, 0xca, 0xdf, 0xb2, 0xff, 0x36, 0x5b, 0x4f, 0xec, 0x58, 0x8e, 0x8d, 0x01, 0xe9, 0xa9, 0xdf, + 0xb5, 0x60, 0xad, 0x52, 0x4d, 0xfc, 0xa9, 0x3e, 0x8d, 0x35, 0x95, 0xb3, 0x4e, 0x0f, 0xca, 0x45, + 0x1b, 0xf7, 0xef, 0x3a, 0x88, 0x25, 0x08, 0xc7, 0x4e, 0x06, 0xc1, 0x62, 0x1a, 0xce, 0xd8, 0x77, + 0xbd, 0x02, 0x65, 0xf8, 0x49, 0xfb, 0xce, 0xf6, 0xa8, 0x09, 0xfc, 0xde, 0xb2, 0x09, 0x9d, 0x39, + 0xf8, 0x63, 0x9c, 0x32, 0x42, 0x7c, 0xa0, 0x30, 0x86, 0x72, 0x7a, 0x4a, 0xc6, 0xd4, 0xb3, 0x2d, + 0x24, 0xef, 0x96, 0x3f, 0xc2, 0xda, 0xd3, 0xf2, 0x15, 0x6f, 0xda, 0x65, 0x4b, 0x81, 0x28, 0x68, + 0xf4, 0xfe, 0x3e, 0x71, 0x4f, 0x50, 0x96, 0x72, 0x58, 0xa1, 0x89, 0xdd, 0x01, 0x61, 0x39, 0x39, + 0xc6, 0x76, 0xa4, 0xda, 0x02, 0x70, 0x3d, 0xc0, 0xdc, 0x8d, 0x70, 0x72, 0x04, 0x90, 0x79, 0xd4, + 0xec, 0x65, 0xcf, 0x49, 0x35, 0x25, 0x3a, 0x14, 0x1a, 0x45, 0x20, 0xeb, 0x31, 0xaf, 0x92, 0xba, + 0x20, 0xd3, 0xcd, 0xa7, 0x13, 0x44, 0xdc, 0xcf, 0xf0, 0x27, 0x34, 0xb9, 0xe7, 0x24, 0x6f, 0x73, + 0xe7, 0xea, 0x77, 0x03, + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *dnskey = NULL, *ds1 = NULL, *ds2 = NULL; 
+ + /* The two DS RRs in effect for nasa.gov on 2015-12-01. */ + ds1 = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, "nasa.gov"); + assert_se(ds1); + + ds1->ds.key_tag = 47857; + ds1->ds.algorithm = DNSSEC_ALGORITHM_RSASHA256; + ds1->ds.digest_type = DNSSEC_DIGEST_SHA1; + ds1->ds.digest_size = sizeof(ds1_fprint); + ds1->ds.digest = memdup(ds1_fprint, ds1->ds.digest_size); + assert_se(ds1->ds.digest); + + log_info("DS1: %s", strna(dns_resource_record_to_string(ds1))); + + ds2 = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, "NASA.GOV"); + assert_se(ds2); + + ds2->ds.key_tag = 47857; + ds2->ds.algorithm = DNSSEC_ALGORITHM_RSASHA256; + ds2->ds.digest_type = DNSSEC_DIGEST_SHA256; + ds2->ds.digest_size = sizeof(ds2_fprint); + ds2->ds.digest = memdup(ds2_fprint, ds2->ds.digest_size); + assert_se(ds2->ds.digest); + + log_info("DS2: %s", strna(dns_resource_record_to_string(ds2))); + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "nasa.GOV"); + assert_se(dnskey); + + dnskey->dnskey.flags = 257; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_RSASHA256; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + log_info("DNSKEY keytag: %u", dnssec_keytag(dnskey, false)); + + assert_se(dnssec_verify_dnskey_by_ds(dnskey, ds1, false) > 0); + assert_se(dnssec_verify_dnskey_by_ds(dnskey, ds2, false) > 0); +} + +TEST(dnssec_verify_rfc8080_ed25519_example1) { + static const uint8_t dnskey_blob[] = { + 0x97, 0x4d, 0x96, 0xa2, 0x2d, 0x22, 0x4b, 0xc0, 0x1a, 0xdb, 0x91, 0x50, 0x91, 0x47, 0x7d, + 0x44, 0xcc, 0xd9, 0x1c, 0x9a, 0x41, 0xa1, 0x14, 0x30, 0x01, 0x01, 0x17, 0xd5, 0x2c, 0x59, + 0x24, 0xe + }; + static const uint8_t ds_fprint[] = { + 0xdd, 0xa6, 0xb9, 0x69, 0xbd, 0xfb, 0x79, 0xf7, 0x1e, 0xe7, 0xb7, 0xfb, 0xdf, 0xb7, 0xdc, + 0xd7, 0xad, 0xbb, 0xd3, 
0x5d, 0xdf, 0x79, 0xed, 0x3b, 0x6d, 0xd7, 0xf6, 0xe3, 0x56, 0xdd, + 0xd7, 0x47, 0xf7, 0x6f, 0x5f, 0x7a, 0xe1, 0xa6, 0xf9, 0xe5, 0xce, 0xfc, 0x7b, 0xbf, 0x5a, + 0xdf, 0x4e, 0x1b + }; + static const uint8_t signature_blob[] = { + 0xa0, 0xbf, 0x64, 0xac, 0x9b, 0xa7, 0xef, 0x17, 0xc1, 0x38, 0x85, 0x9c, 0x18, 0x78, 0xbb, + 0x99, 0xa8, 0x39, 0xfe, 0x17, 0x59, 0xac, 0xa5, 0xb0, 0xd7, 0x98, 0xcf, 0x1a, 0xb1, 0xe9, + 0x8d, 0x07, 0x91, 0x02, 0xf4, 0xdd, 0xb3, 0x36, 0x8f, 0x0f, 0xe4, 0x0b, 0xb3, 0x77, 0xf1, + 0xf0, 0x0e, 0x0c, 0xdd, 0xed, 0xb7, 0x99, 0x16, 0x7d, 0x56, 0xb6, 0xe9, 0x32, 0x78, 0x30, + 0x72, 0xba, 0x8d, 0x02 + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *dnskey = NULL, *ds = NULL, *mx = NULL, + *rrsig = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "example.com."); + assert_se(dnskey); + + dnskey->dnskey.flags = 257; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_ED25519; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + + ds = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, "example.com."); + assert_se(ds); + + ds->ds.key_tag = 3613; + ds->ds.algorithm = DNSSEC_ALGORITHM_ED25519; + ds->ds.digest_type = DNSSEC_DIGEST_SHA256; + ds->ds.digest_size = sizeof(ds_fprint); + ds->ds.digest = memdup(ds_fprint, ds->ds.digest_size); + assert_se(ds->ds.digest); + + log_info("DS: %s", strna(dns_resource_record_to_string(ds))); + + mx = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_MX, "example.com."); + assert_se(mx); + + mx->mx.priority = 10; + mx->mx.exchange = strdup("mail.example.com."); + assert_se(mx->mx.exchange); + + log_info("MX: %s", strna(dns_resource_record_to_string(mx))); + + rrsig = 
dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "example.com."); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_MX; + rrsig->rrsig.algorithm = DNSSEC_ALGORITHM_ED25519; + rrsig->rrsig.labels = 2; + rrsig->rrsig.original_ttl = 3600; + rrsig->rrsig.expiration = 1440021600; + rrsig->rrsig.inception = 1438207200; + rrsig->rrsig.key_tag = 3613; + rrsig->rrsig.signer = strdup("example.com."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + assert_se(dnssec_key_match_rrsig(mx->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(1); + assert_se(answer); + assert_se(dns_answer_add(answer, mx, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + assert_se(dnssec_verify_rrset(answer, mx->key, rrsig, dnskey, + rrsig->rrsig.inception * USEC_PER_SEC, &result) >= 0); +#if PREFER_OPENSSL || GCRYPT_VERSION_NUMBER >= 0x010600 + assert_se(result == DNSSEC_VALIDATED); +#else + assert_se(result == DNSSEC_UNSUPPORTED_ALGORITHM); +#endif +} + +TEST(dnssec_verify_rfc8080_ed25519_example2) { + static const uint8_t dnskey_blob[] = { + 0xcc, 0xf9, 0xd9, 0xfd, 0x0c, 0x04, 0x7b, 0xb4, 0xbc, 0x0b, 0x94, 0x8f, 0xcf, 0x63, 0x9f, + 0x4b, 0x94, 0x51, 0xe3, 0x40, 0x13, 0x93, 0x6f, 0xeb, 0x62, 0x71, 0x3d, 0xc4, 0x72, 0x4, + 0x8a, 0x3b + }; + static const uint8_t ds_fprint[] = { + 0xe3, 0x4d, 0x7b, 0xf3, 0x56, 0xfd, 0xdf, 0x87, 0xb7, 0xf7, 0x67, 0x5e, 0xe3, 0xdd, 0x9e, + 0x73, 0xbe, 0xda, 0x7b, 0x67, 0xb5, 0xe5, 0xde, 0xf4, 0x7f, 0xae, 0x7b, 0xe5, 0xad, 0x5c, + 0xd1, 0xb7, 0x39, 0xf5, 0xce, 0x76, 0xef, 0x97, 0x34, 0xe1, 0xe6, 0xde, 0xf3, 0x47, 0x3a, + 0xeb, 0x5e, 0x1c + }; + static const uint8_t signature_blob[] = { + 0xcd, 0x74, 0x34, 0x6e, 0x46, 0x20, 0x41, 0x31, 0x05, 0xc9, 0xf2, 0xf2, 0x8b, 
0xd4, 0x28, + 0x89, 0x8e, 0x83, 0xf1, 0x97, 0x58, 0xa3, 0x8c, 0x32, 0x52, 0x15, 0x62, 0xa1, 0x86, 0x57, + 0x15, 0xd4, 0xf8, 0xd7, 0x44, 0x0f, 0x44, 0x84, 0xd0, 0x4a, 0xa2, 0x52, 0x9f, 0x34, 0x28, + 0x4a, 0x6e, 0x69, 0xa0, 0x9e, 0xe0, 0x0f, 0xb0, 0x10, 0x47, 0x43, 0xbb, 0x2a, 0xe2, 0x39, + 0x93, 0x6a, 0x5c, 0x06 + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *dnskey = NULL, *ds = NULL, *mx = NULL, + *rrsig = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "example.com."); + assert_se(dnskey); + + dnskey->dnskey.flags = 257; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_ED25519; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + + ds = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, "example.com."); + assert_se(ds); + + ds->ds.key_tag = 35217; + ds->ds.algorithm = DNSSEC_ALGORITHM_ED25519; + ds->ds.digest_type = DNSSEC_DIGEST_SHA256; + ds->ds.digest_size = sizeof(ds_fprint); + ds->ds.digest = memdup(ds_fprint, ds->ds.digest_size); + assert_se(ds->ds.digest); + + log_info("DS: %s", strna(dns_resource_record_to_string(ds))); + + mx = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_MX, "example.com."); + assert_se(mx); + + mx->mx.priority = 10; + mx->mx.exchange = strdup("mail.example.com."); + assert_se(mx->mx.exchange); + + log_info("MX: %s", strna(dns_resource_record_to_string(mx))); + + rrsig = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "example.com."); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_MX; + rrsig->rrsig.algorithm = DNSSEC_ALGORITHM_ED25519; + rrsig->rrsig.labels = 2; + rrsig->rrsig.original_ttl = 3600; + rrsig->rrsig.expiration = 1440021600; + rrsig->rrsig.inception = 1438207200; + 
rrsig->rrsig.key_tag = 35217; + rrsig->rrsig.signer = strdup("example.com."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + assert_se(dnssec_key_match_rrsig(mx->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(1); + assert_se(answer); + assert_se(dns_answer_add(answer, mx, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + assert_se(dnssec_verify_rrset(answer, mx->key, rrsig, dnskey, + rrsig->rrsig.inception * USEC_PER_SEC, &result) >= 0); +#if PREFER_OPENSSL || GCRYPT_VERSION_NUMBER >= 0x010600 + assert_se(result == DNSSEC_VALIDATED); +#else + assert_se(result == DNSSEC_UNSUPPORTED_ALGORITHM); +#endif +} + +TEST(dnssec_verify_rfc6605_example1) { + static const uint8_t signature_blob[] = { + 0xab, 0x1e, 0xb0, 0x2d, 0x8a, 0xa6, 0x87, 0xe9, 0x7d, 0xa0, 0x22, 0x93, 0x37, 0xaa, 0x88, 0x73, + 0xe6, 0xf0, 0xeb, 0x26, 0xbe, 0x28, 0x9f, 0x28, 0x33, 0x3d, 0x18, 0x3f, 0x5d, 0x3b, 0x7a, 0x95, + 0xc0, 0xc8, 0x69, 0xad, 0xfb, 0x74, 0x8d, 0xae, 0xe3, 0xc5, 0x28, 0x6e, 0xed, 0x66, 0x82, 0xc1, + 0x2e, 0x55, 0x33, 0x18, 0x6b, 0xac, 0xed, 0x9c, 0x26, 0xc1, 0x67, 0xa9, 0xeb, 0xae, 0x95, 0x0b, + }; + + static const uint8_t ds_fprint[] = { + 0x6f, 0x87, 0x3c, 0x73, 0x57, 0xde, 0xd9, 0xee, 0xf8, 0xef, 0xbd, 0x76, 0xed, 0xbd, 0xbb, 0xd7, + 0x5e, 0x7a, 0xe7, 0xa6, 0x9d, 0xeb, 0x6e, 0x7a, 0x7f, 0x8d, 0xb8, 0xeb, 0x6e, 0x5b, 0x7f, 0x97, + 0x35, 0x7b, 0x6e, 0xfb, 0xd1, 0xc7, 0xba, 0x77, 0xa7, 0xb7, 0xed, 0xd7, 0xfa, 0xd5, 0xdd, 0x7b, + }; + + static const uint8_t dnskey_blob[] = { + 0x1a, 0x88, 0xc8, 0x86, 0x15, 0xd4, 0x37, 0xfb, 0xb8, 0xbf, 0x9e, 0x19, 0x42, 0xa1, 0x92, 0x9f, + 0x28, 0x56, 0x27, 0x06, 0xae, 0x6c, 0x2b, 0xd3, 0x99, 0xe7, 0xb1, 0xbf, 0xb6, 0xd1, 0xe9, 0xe7, + 0x5b, 0x92, 0xb4, 0xaa, 
0x42, 0x91, 0x7a, 0xe1, 0xc6, 0x1b, 0x70, 0x1e, 0xf0, 0x35, 0xc3, 0xfe, + 0x7b, 0xe3, 0x00, 0x9c, 0xba, 0xfe, 0x5a, 0x2f, 0x71, 0x31, 0x6c, 0x90, 0x2d, 0xcf, 0x0d, 0x00, + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *dnskey = NULL, *ds = NULL, *a = NULL, + *rrsig = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "example.net."); + assert_se(dnskey); + + dnskey->dnskey.flags = 257; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_ECDSAP256SHA256; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + + ds = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, "example.net."); + assert_se(ds); + + ds->ds.key_tag = 55648; + ds->ds.algorithm = DNSSEC_ALGORITHM_ECDSAP256SHA256; + ds->ds.digest_type = DNSSEC_DIGEST_SHA256; + ds->ds.digest_size = sizeof(ds_fprint); + ds->ds.digest = memdup(ds_fprint, ds->ds.digest_size); + assert_se(ds->ds.digest); + + log_info("DS: %s", strna(dns_resource_record_to_string(ds))); + + a = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_A, "www.example.net"); + assert_se(a); + + a->a.in_addr.s_addr = inet_addr("192.0.2.1"); + + log_info("A: %s", strna(dns_resource_record_to_string(a))); + + rrsig = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "www.example.net."); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_A; + rrsig->rrsig.algorithm = DNSSEC_ALGORITHM_ECDSAP256SHA256; + rrsig->rrsig.labels = 3; + rrsig->rrsig.expiration = 1284026679; + rrsig->rrsig.inception = 1281607479; + rrsig->rrsig.key_tag = 55648; + rrsig->rrsig.original_ttl = 3600; + rrsig->rrsig.signer = strdup("example.net."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + 
rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + assert_se(dnssec_key_match_rrsig(a->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(1); + assert_se(answer); + assert_se(dns_answer_add(answer, a, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + assert_se(dnssec_verify_rrset(answer, a->key, rrsig, dnskey, + rrsig->rrsig.inception * USEC_PER_SEC, &result) >= 0); + assert_se(result == DNSSEC_VALIDATED); +} + +TEST(dnssec_verify_rfc6605_example2) { + static const uint8_t signature_blob[] = { + 0xfc, 0xbe, 0x61, 0x0c, 0xa2, 0x2f, 0x18, 0x3c, 0x88, 0xd5, 0xf7, 0x00, 0x45, 0x7d, 0xf3, 0xeb, + 0x9a, 0xab, 0x98, 0xfb, 0x15, 0xcf, 0xbd, 0xd0, 0x0f, 0x53, 0x2b, 0xe4, 0x21, 0x2a, 0x3a, 0x22, + 0xcf, 0xf7, 0x98, 0x71, 0x42, 0x8b, 0xae, 0xae, 0x81, 0x82, 0x79, 0x93, 0xaf, 0xcc, 0x56, 0xb1, + 0xb1, 0x3f, 0x06, 0x96, 0xbe, 0xf8, 0x85, 0xb6, 0xaf, 0x44, 0xa6, 0xb2, 0x24, 0xdb, 0xb2, 0x74, + 0x2b, 0xb3, 0x59, 0x34, 0x92, 0x3d, 0xdc, 0xfb, 0xc2, 0x7a, 0x97, 0x2f, 0x96, 0xdd, 0x70, 0x9c, + 0xee, 0xb1, 0xd9, 0xc8, 0xd1, 0x14, 0x8c, 0x44, 0xec, 0x71, 0xc0, 0x68, 0xa9, 0x59, 0xc2, 0x66, + + }; + + static const uint8_t ds_fprint[] = { + 0xef, 0x67, 0x7b, 0x6f, 0xad, 0xbd, 0xef, 0xa7, 0x1e, 0xd3, 0xae, 0x37, 0xf1, 0xef, 0x5c, 0xd1, + 0xb7, 0xf7, 0xd7, 0xdd, 0x35, 0xdd, 0xc7, 0xfc, 0xd3, 0x57, 0xf4, 0xf5, 0xe7, 0x1c, 0xf3, 0x86, + 0xfc, 0x77, 0xb7, 0xbd, 0xe3, 0xde, 0x5f, 0xdb, 0xb7, 0xb7, 0xd3, 0x97, 0x3a, 0x6b, 0xd6, 0xf4, + 0xe7, 0xad, 0xda, 0xf5, 0xbe, 0x5f, 0xe1, 0xdd, 0xbc, 0xf3, 0x8d, 0x39, 0x73, 0x7d, 0x34, 0xf1, + 0xaf, 0x78, 0xe9, 0xd7, 0xfd, 0xf3, 0x77, 0x7a, + }; + + static const uint8_t dnskey_blob[] = { + 0xc4, 0xa6, 0x1a, 0x36, 0x15, 0x9d, 0x18, 0xe7, 0xc9, 0xfa, 0x73, 0xeb, 0x2f, 0xcf, 0xda, 0xae, + 0x4c, 0x1f, 0xd8, 0x46, 0x37, 0x30, 0x32, 0x7e, 0x48, 0x4a, 0xca, 0x8a, 0xf0, 
0x55, 0x4a, 0xe9, + 0xb5, 0xc3, 0xf7, 0xa0, 0xb1, 0x7b, 0xd2, 0x00, 0x3b, 0x4d, 0x26, 0x1c, 0x9e, 0x9b, 0x94, 0x42, + 0x3a, 0x98, 0x10, 0xe8, 0xaf, 0x17, 0xd4, 0x34, 0x52, 0x12, 0x4a, 0xdb, 0x61, 0x0f, 0x8e, 0x07, + 0xeb, 0xfc, 0xfe, 0xe5, 0xf8, 0xe4, 0xd0, 0x70, 0x63, 0xca, 0xe9, 0xeb, 0x91, 0x7a, 0x1a, 0x5b, + 0xab, 0xf0, 0x8f, 0xe6, 0x95, 0x53, 0x60, 0x17, 0xa5, 0xbf, 0xa9, 0x32, 0x37, 0xee, 0x6e, 0x34, + }; + + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *dnskey = NULL, *ds = NULL, *a = NULL, + *rrsig = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "example.net."); + assert_se(dnskey); + + dnskey->dnskey.flags = 257; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_ECDSAP384SHA384; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + + ds = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DS, "example.net."); + assert_se(ds); + + ds->ds.key_tag = 10771; + ds->ds.algorithm = DNSSEC_ALGORITHM_ECDSAP384SHA384; + ds->ds.digest_type = DNSSEC_DIGEST_SHA384; + ds->ds.digest_size = sizeof(ds_fprint); + ds->ds.digest = memdup(ds_fprint, ds->ds.digest_size); + assert_se(ds->ds.digest); + + log_info("DS: %s", strna(dns_resource_record_to_string(ds))); + + a = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_A, "www.example.net"); + assert_se(a); + + a->a.in_addr.s_addr = inet_addr("192.0.2.1"); + + log_info("A: %s", strna(dns_resource_record_to_string(a))); + + rrsig = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "www.example.net."); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_A; + rrsig->rrsig.algorithm = DNSSEC_ALGORITHM_ECDSAP384SHA384; + rrsig->rrsig.labels = 3; + rrsig->rrsig.expiration = 1284027625; + 
rrsig->rrsig.inception = 1281608425; + rrsig->rrsig.key_tag = 10771; + rrsig->rrsig.original_ttl = 3600; + rrsig->rrsig.signer = strdup("example.net."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + assert_se(dnssec_key_match_rrsig(a->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(1); + assert_se(answer); + assert_se(dns_answer_add(answer, a, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + assert_se(dnssec_verify_rrset(answer, a->key, rrsig, dnskey, + rrsig->rrsig.inception * USEC_PER_SEC, &result) >= 0); + assert_se(result == DNSSEC_VALIDATED); +} + +TEST(dnssec_verify_rrset) { + static const uint8_t signature_blob[] = { + 0x7f, 0x79, 0xdd, 0x5e, 0x89, 0x79, 0x18, 0xd0, 0x34, 0x86, 0x8c, 0x72, 0x77, 0x75, 0x48, 0x4d, + 0xc3, 0x7d, 0x38, 0x04, 0xab, 0xcd, 0x9e, 0x4c, 0x82, 0xb0, 0x92, 0xca, 0xe9, 0x66, 0xe9, 0x6e, + 0x47, 0xc7, 0x68, 0x8c, 0x94, 0xf6, 0x69, 0xcb, 0x75, 0x94, 0xe6, 0x30, 0xa6, 0xfb, 0x68, 0x64, + 0x96, 0x1a, 0x84, 0xe1, 0xdc, 0x16, 0x4c, 0x83, 0x6c, 0x44, 0xf2, 0x74, 0x4d, 0x74, 0x79, 0x8f, + 0xf3, 0xf4, 0x63, 0x0d, 0xef, 0x5a, 0xe7, 0xe2, 0xfd, 0xf2, 0x2b, 0x38, 0x7c, 0x28, 0x96, 0x9d, + 0xb6, 0xcd, 0x5c, 0x3b, 0x57, 0xe2, 0x24, 0x78, 0x65, 0xd0, 0x9e, 0x77, 0x83, 0x09, 0x6c, 0xff, + 0x3d, 0x52, 0x3f, 0x6e, 0xd1, 0xed, 0x2e, 0xf9, 0xee, 0x8e, 0xa6, 0xbe, 0x9a, 0xa8, 0x87, 0x76, + 0xd8, 0x77, 0xcc, 0x96, 0xa0, 0x98, 0xa1, 0xd1, 0x68, 0x09, 0x43, 0xcf, 0x56, 0xd9, 0xd1, 0x66, + }; + + static const uint8_t dnskey_blob[] = { + 0x03, 0x01, 0x00, 0x01, 0x9b, 0x49, 0x9b, 0xc1, 0xf9, 0x9a, 0xe0, 0x4e, 0xcf, 0xcb, 0x14, 0x45, + 0x2e, 0xc9, 0xf9, 0x74, 0xa7, 0x18, 0xb5, 0xf3, 0xde, 0x39, 0x49, 0xdf, 0x63, 0x33, 0x97, 0x52, + 0xe0, 0x8e, 0xac, 0x50, 0x30, 0x8e, 
0x09, 0xd5, 0x24, 0x3d, 0x26, 0xa4, 0x49, 0x37, 0x2b, 0xb0, + 0x6b, 0x1b, 0xdf, 0xde, 0x85, 0x83, 0xcb, 0x22, 0x4e, 0x60, 0x0a, 0x91, 0x1a, 0x1f, 0xc5, 0x40, + 0xb1, 0xc3, 0x15, 0xc1, 0x54, 0x77, 0x86, 0x65, 0x53, 0xec, 0x10, 0x90, 0x0c, 0x91, 0x00, 0x5e, + 0x15, 0xdc, 0x08, 0x02, 0x4c, 0x8c, 0x0d, 0xc0, 0xac, 0x6e, 0xc4, 0x3e, 0x1b, 0x80, 0x19, 0xe4, + 0xf7, 0x5f, 0x77, 0x51, 0x06, 0x87, 0x61, 0xde, 0xa2, 0x18, 0x0f, 0x40, 0x8b, 0x79, 0x72, 0xfa, + 0x8d, 0x1a, 0x44, 0x47, 0x0d, 0x8e, 0x3a, 0x2d, 0xc7, 0x39, 0xbf, 0x56, 0x28, 0x97, 0xd9, 0x20, + 0x4f, 0x00, 0x51, 0x3b, + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *a = NULL, *rrsig = NULL, *dnskey = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + a = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_A, "nAsA.gov"); + assert_se(a); + + a->a.in_addr.s_addr = inet_addr("52.0.14.116"); + + log_info("A: %s", strna(dns_resource_record_to_string(a))); + + rrsig = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "NaSa.GOV."); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_A; + rrsig->rrsig.algorithm = DNSSEC_ALGORITHM_RSASHA256; + rrsig->rrsig.labels = 2; + rrsig->rrsig.original_ttl = 600; + rrsig->rrsig.expiration = 0x5683135c; + rrsig->rrsig.inception = 0x565b7da8; + rrsig->rrsig.key_tag = 63876; + rrsig->rrsig.signer = strdup("Nasa.Gov."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "nASA.gOV"); + assert_se(dnskey); + + dnskey->dnskey.flags = 256; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_RSASHA256; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, 
sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + log_info("DNSKEY keytag: %u", dnssec_keytag(dnskey, false)); + + assert_se(dnssec_key_match_rrsig(a->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(1); + assert_se(answer); + assert_se(dns_answer_add(answer, a, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + /* Validate the RR as it if was 2015-12-2 today */ + assert_se(dnssec_verify_rrset(answer, a->key, rrsig, dnskey, 1449092754*USEC_PER_SEC, &result) >= 0); + assert_se(result == DNSSEC_VALIDATED); +} + +TEST(dnssec_verify_rrset2) { + static const uint8_t signature_blob[] = { + 0x48, 0x45, 0xc8, 0x8b, 0xc0, 0x14, 0x92, 0xf5, 0x15, 0xc6, 0x84, 0x9d, 0x2f, 0xe3, 0x32, 0x11, + 0x7d, 0xf1, 0xe6, 0x87, 0xb9, 0x42, 0xd3, 0x8b, 0x9e, 0xaf, 0x92, 0x31, 0x0a, 0x53, 0xad, 0x8b, + 0xa7, 0x5c, 0x83, 0x39, 0x8c, 0x28, 0xac, 0xce, 0x6e, 0x9c, 0x18, 0xe3, 0x31, 0x16, 0x6e, 0xca, + 0x38, 0x31, 0xaf, 0xd9, 0x94, 0xf1, 0x84, 0xb1, 0xdf, 0x5a, 0xc2, 0x73, 0x22, 0xf6, 0xcb, 0xa2, + 0xe7, 0x8c, 0x77, 0x0c, 0x74, 0x2f, 0xc2, 0x13, 0xb0, 0x93, 0x51, 0xa9, 0x4f, 0xae, 0x0a, 0xda, + 0x45, 0xcc, 0xfd, 0x43, 0x99, 0x36, 0x9a, 0x0d, 0x21, 0xe0, 0xeb, 0x30, 0x65, 0xd4, 0xa0, 0x27, + 0x37, 0x3b, 0xe4, 0xc1, 0xc5, 0xa1, 0x2a, 0xd1, 0x76, 0xc4, 0x7e, 0x64, 0x0e, 0x5a, 0xa6, 0x50, + 0x24, 0xd5, 0x2c, 0xcc, 0x6d, 0xe5, 0x37, 0xea, 0xbd, 0x09, 0x34, 0xed, 0x24, 0x06, 0xa1, 0x22, + }; + + static const uint8_t dnskey_blob[] = { + 0x03, 0x01, 0x00, 0x01, 0xc3, 0x7f, 0x1d, 0xd1, 0x1c, 0x97, 0xb1, 0x13, 0x34, 0x3a, 0x9a, 0xea, + 0xee, 0xd9, 0x5a, 0x11, 0x1b, 0x17, 0xc7, 0xe3, 0xd4, 0xda, 0x20, 0xbc, 0x5d, 0xba, 0x74, 0xe3, + 0x37, 0x99, 0xec, 0x25, 0xce, 0x93, 0x7f, 0xbd, 0x22, 0x73, 0x7e, 0x14, 0x71, 0xe0, 0x60, 0x07, + 0xd4, 0x39, 0x8b, 0x5e, 0xe9, 0xba, 0x25, 0xe8, 0x49, 0xe9, 0x34, 0xef, 0xfe, 0x04, 0x5c, 0xa5, + 0x27, 0xcd, 0xa9, 0xda, 0x70, 0x05, 0x21, 
0xab, 0x15, 0x82, 0x24, 0xc3, 0x94, 0xf5, 0xd7, 0xb7, + 0xc4, 0x66, 0xcb, 0x32, 0x6e, 0x60, 0x2b, 0x55, 0x59, 0x28, 0x89, 0x8a, 0x72, 0xde, 0x88, 0x56, + 0x27, 0x95, 0xd9, 0xac, 0x88, 0x4f, 0x65, 0x2b, 0x68, 0xfc, 0xe6, 0x41, 0xc1, 0x1b, 0xef, 0x4e, + 0xd6, 0xc2, 0x0f, 0x64, 0x88, 0x95, 0x5e, 0xdd, 0x3a, 0x02, 0x07, 0x50, 0xa9, 0xda, 0xa4, 0x49, + 0x74, 0x62, 0xfe, 0xd7, + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *nsec = NULL, *rrsig = NULL, *dnskey = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + nsec = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_NSEC, "nasa.gov"); + assert_se(nsec); + + nsec->nsec.next_domain_name = strdup("3D-Printing.nasa.gov"); + assert_se(nsec->nsec.next_domain_name); + + nsec->nsec.types = bitmap_new(); + assert_se(nsec->nsec.types); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_A) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_NS) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_SOA) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_MX) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_TXT) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_RRSIG) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_NSEC) >= 0); + assert_se(bitmap_set(nsec->nsec.types, DNS_TYPE_DNSKEY) >= 0); + assert_se(bitmap_set(nsec->nsec.types, 65534) >= 0); + + log_info("NSEC: %s", strna(dns_resource_record_to_string(nsec))); + + rrsig = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "NaSa.GOV."); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_NSEC; + rrsig->rrsig.algorithm = DNSSEC_ALGORITHM_RSASHA256; + rrsig->rrsig.labels = 2; + rrsig->rrsig.original_ttl = 300; + rrsig->rrsig.expiration = 0x5689002f; + rrsig->rrsig.inception = 0x56617230; + rrsig->rrsig.key_tag = 30390; + rrsig->rrsig.signer = strdup("Nasa.Gov."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + 
rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "nASA.gOV"); + assert_se(dnskey); + + dnskey->dnskey.flags = 256; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_RSASHA256; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + log_info("DNSKEY keytag: %u", dnssec_keytag(dnskey, false)); + + assert_se(dnssec_key_match_rrsig(nsec->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(1); + assert_se(answer); + assert_se(dns_answer_add(answer, nsec, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + /* Validate the RR as it if was 2015-12-11 today */ + assert_se(dnssec_verify_rrset(answer, nsec->key, rrsig, dnskey, 1449849318*USEC_PER_SEC, &result) >= 0); + assert_se(result == DNSSEC_VALIDATED); +} + +TEST(dnssec_verify_rrset3) { + static const uint8_t signature_blob[] = { + 0x41, 0x09, 0x08, 0x67, 0x51, 0x6d, 0x02, 0xf2, 0x17, 0x1e, 0x61, 0x03, 0xc6, 0x80, 0x7a, 0x82, + 0x8f, 0x6c, 0x8c, 0x4c, 0x68, 0x6f, 0x1c, 0xaa, 0x4a, 0xe0, 0x9b, 0x72, 0xdf, 0x7f, 0x15, 0xfa, + 0x2b, 0xc5, 0x63, 0x6f, 0x52, 0xa2, 0x60, 0x59, 0x24, 0xb6, 0xc3, 0x43, 0x3d, 0x47, 0x38, 0xd8, + 0x0c, 0xcc, 0x6c, 0x10, 0x49, 0x92, 0x97, 0x6c, 0x7d, 0x32, 0xc2, 0x62, 0x83, 0x34, 0x96, 0xdf, + 0xbd, 0xf9, 0xcc, 0xcf, 0xd9, 0x4d, 0x8b, 0x8a, 0xa9, 0x3c, 0x1f, 0x89, 0xc4, 0xad, 0xd5, 0xbb, + 0x74, 0xf8, 0xee, 0x60, 0x54, 0x7a, 0xec, 0x36, 0x45, 0xf2, 0xec, 0xb9, 0x73, 0x66, 0xae, 0x57, + 0x2d, 0xd4, 0x91, 0x02, 0x99, 0xcd, 0xba, 0xbd, 0x6e, 0xfb, 0xa6, 0xf6, 0x34, 0xce, 0x4c, 0x44, + 0x0b, 0xd2, 0x66, 0xdb, 0x4e, 0x5e, 0x00, 0x72, 0x1b, 0xe5, 0x2f, 
0x24, 0xd2, 0xc8, 0x72, 0x37, + 0x97, 0x2b, 0xd0, 0xcd, 0xa9, 0x6b, 0x84, 0x32, 0x56, 0x7a, 0x89, 0x6e, 0x3d, 0x8f, 0x03, 0x9a, + 0x9d, 0x6d, 0xf7, 0xe5, 0x13, 0xd7, 0x4b, 0xbc, 0xe2, 0x6c, 0xd1, 0x18, 0x60, 0x0e, 0x1a, 0xe3, + 0xf9, 0xc0, 0x34, 0x4b, 0x1c, 0x82, 0x17, 0x5e, 0xdf, 0x81, 0x32, 0xd7, 0x5b, 0x30, 0x1d, 0xe0, + 0x29, 0x80, 0x6b, 0xb1, 0x69, 0xbf, 0x3f, 0x12, 0x56, 0xb0, 0x80, 0x91, 0x22, 0x1a, 0x31, 0xd5, + 0x5d, 0x3d, 0xdd, 0x70, 0x5e, 0xcb, 0xc7, 0x2d, 0xb8, 0x3e, 0x54, 0x34, 0xd3, 0x50, 0x89, 0x77, + 0x08, 0xc1, 0xf7, 0x11, 0x6e, 0x57, 0xd7, 0x09, 0x94, 0x20, 0x03, 0x38, 0xc3, 0x3a, 0xd3, 0x93, + 0x8f, 0xd0, 0x65, 0xc5, 0xa1, 0xe0, 0x69, 0x2c, 0xf6, 0x0a, 0xce, 0x01, 0xb6, 0x0d, 0x95, 0xa0, + 0x5d, 0x97, 0x94, 0xc3, 0xf1, 0xcd, 0x49, 0xea, 0x20, 0xd3, 0xa9, 0xa6, 0x67, 0x94, 0x64, 0x17 + }; + + static const uint8_t dnskey_blob[] = { + 0x03, 0x01, 0x00, 0x01, 0xbf, 0xdd, 0x24, 0x95, 0x21, 0x70, 0xa8, 0x5b, 0x19, 0xa6, 0x76, 0xd3, + 0x5b, 0x37, 0xcf, 0x59, 0x0d, 0x3c, 0xdb, 0x0c, 0xcf, 0xd6, 0x19, 0x02, 0xc7, 0x8e, 0x56, 0x4d, + 0x14, 0xb7, 0x9d, 0x71, 0xf4, 0xdd, 0x24, 0x36, 0xc8, 0x32, 0x1c, 0x63, 0xf7, 0xc0, 0xfc, 0xe3, + 0x83, 0xa6, 0x22, 0x8b, 0x6a, 0x34, 0x41, 0x72, 0xaa, 0x95, 0x98, 0x06, 0xac, 0x03, 0xec, 0xc3, + 0xa1, 0x6d, 0x8b, 0x1b, 0xfd, 0xa4, 0x05, 0x72, 0xe6, 0xe0, 0xb9, 0x98, 0x07, 0x54, 0x7a, 0xb2, + 0x55, 0x30, 0x96, 0xa3, 0x22, 0x3b, 0xe0, 0x9d, 0x61, 0xf6, 0xdc, 0x31, 0x2b, 0xc9, 0x2c, 0x12, + 0x06, 0x7f, 0x3c, 0x5d, 0x29, 0x76, 0x01, 0x62, 0xe3, 0x41, 0x41, 0x4f, 0xa6, 0x07, 0xfa, 0x2d, + 0x0c, 0x64, 0x88, 0xd1, 0x56, 0x18, 0x4b, 0x2b, 0xc2, 0x19, 0x7e, 0xd0, 0x1a, 0x8c, 0x2d, 0x8d, + 0x06, 0xdf, 0x4d, 0xaf, 0xd9, 0xe3, 0x31, 0x59, 0xbc, 0xc3, 0x36, 0x22, 0xe7, 0x15, 0xf9, 0xb2, + 0x44, 0x8a, 0x33, 0xd7, 0x6c, 0xf1, 0xcc, 0x37, 0x05, 0x69, 0x32, 0x71, 0x76, 0xd8, 0x50, 0x06, + 0xae, 0x27, 0xed, 0x3b, 0xdb, 0x1a, 0x97, 0x9b, 0xa3, 0x3e, 0x40, 0x42, 0x29, 0xaf, 0x75, 0x1c, + 0xff, 0x1d, 0xaf, 0x85, 0x02, 0xb3, 0x2e, 0x99, 0x67, 
0x08, 0x13, 0xd5, 0xda, 0x6d, 0x65, 0xb2, + 0x36, 0x6f, 0x2f, 0x64, 0xe0, 0xfa, 0xd3, 0x81, 0x86, 0x6b, 0x41, 0x3e, 0x91, 0xaa, 0x0a, 0xd3, + 0xb2, 0x92, 0xd9, 0x42, 0x36, 0x8a, 0x11, 0x0b, 0x5b, 0xb0, 0xea, 0xad, 0x76, 0xd5, 0xb4, 0x81, + 0x30, 0xca, 0x5c, 0x4f, 0xd9, 0xea, 0xe7, 0x4b, 0x10, 0x0a, 0x09, 0x4b, 0x73, 0x66, 0xed, 0x8e, + 0x84, 0xa2, 0x4f, 0x93, 0x7e, 0x29, 0xdc, 0x6a, 0xbd, 0x12, 0xa1, 0x3d, 0xd2, 0xd6, 0x2a, 0x67, + 0x99, 0x4d, 0xf3, 0x43 + }; + + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *mx1 = NULL, *mx2 = NULL, *mx3 = NULL, *mx4 = NULL, *rrsig = NULL, *dnskey = NULL; + _cleanup_(dns_answer_unrefp) DnsAnswer *answer = NULL; + DnssecResult result; + + mx1 = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_MX, "kodapan.se"); + assert_se(mx1); + + mx1->mx.priority = 1; + mx1->mx.exchange = strdup("ASPMX.L.GOOGLE.COM"); + assert_se(mx1->mx.exchange); + + log_info("MX: %s", strna(dns_resource_record_to_string(mx1))); + + mx2 = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_MX, "kodapan.se"); + assert_se(mx2); + + mx2->mx.priority = 5; + mx2->mx.exchange = strdup("ALT2.ASPMX.L.GOOGLE.COM"); + assert_se(mx2->mx.exchange); + + log_info("MX: %s", strna(dns_resource_record_to_string(mx2))); + + mx3 = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_MX, "kodapan.se"); + assert_se(mx3); + + mx3->mx.priority = 10; + mx3->mx.exchange = strdup("ASPMX2.GOOGLEMAIL.COM"); + assert_se(mx3->mx.exchange); + + log_info("MX: %s", strna(dns_resource_record_to_string(mx3))); + + mx4 = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_MX, "kodapan.se"); + assert_se(mx4); + + mx4->mx.priority = 10; + mx4->mx.exchange = strdup("ASPMX3.GOOGLEMAIL.COM"); + assert_se(mx4->mx.exchange); + + log_info("MX: %s", strna(dns_resource_record_to_string(mx4))); + + rrsig = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_RRSIG, "kodapan.se"); + assert_se(rrsig); + + rrsig->rrsig.type_covered = DNS_TYPE_MX; + rrsig->rrsig.algorithm = 
DNSSEC_ALGORITHM_RSASHA256; + rrsig->rrsig.labels = 2; + rrsig->rrsig.original_ttl = 900; + rrsig->rrsig.expiration = 0x5e608a84; + rrsig->rrsig.inception = 0x5e4e1584; + rrsig->rrsig.key_tag = 44028; + rrsig->rrsig.signer = strdup("kodapan.se."); + assert_se(rrsig->rrsig.signer); + rrsig->rrsig.signature_size = sizeof(signature_blob); + rrsig->rrsig.signature = memdup(signature_blob, rrsig->rrsig.signature_size); + assert_se(rrsig->rrsig.signature); + + log_info("RRSIG: %s", strna(dns_resource_record_to_string(rrsig))); + + dnskey = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_DNSKEY, "kodapan.se"); + assert_se(dnskey); + + dnskey->dnskey.flags = 256; + dnskey->dnskey.protocol = 3; + dnskey->dnskey.algorithm = DNSSEC_ALGORITHM_RSASHA256; + dnskey->dnskey.key_size = sizeof(dnskey_blob); + dnskey->dnskey.key = memdup(dnskey_blob, sizeof(dnskey_blob)); + assert_se(dnskey->dnskey.key); + + log_info("DNSKEY: %s", strna(dns_resource_record_to_string(dnskey))); + log_info("DNSKEY keytag: %u", dnssec_keytag(dnskey, false)); + + assert_se(dnssec_key_match_rrsig(mx1->key, rrsig) > 0); + assert_se(dnssec_key_match_rrsig(mx2->key, rrsig) > 0); + assert_se(dnssec_key_match_rrsig(mx3->key, rrsig) > 0); + assert_se(dnssec_key_match_rrsig(mx4->key, rrsig) > 0); + assert_se(dnssec_rrsig_match_dnskey(rrsig, dnskey, false) > 0); + + answer = dns_answer_new(4); + assert_se(answer); + assert_se(dns_answer_add(answer, mx1, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + assert_se(dns_answer_add(answer, mx2, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + assert_se(dns_answer_add(answer, mx3, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + assert_se(dns_answer_add(answer, mx4, 0, DNS_ANSWER_AUTHENTICATED, NULL) >= 0); + + /* Validate the RR as it if was 2020-02-24 today */ + assert_se(dnssec_verify_rrset(answer, mx1->key, rrsig, dnskey, 1582534685*USEC_PER_SEC, &result) >= 0); + assert_se(result == DNSSEC_VALIDATED); +} + +TEST(dnssec_nsec3_hash) { + static const uint8_t salt[] = { 0xB0, 
0x1D, 0xFA, 0xCE }; + static const uint8_t next_hashed_name[] = { 0x84, 0x10, 0x26, 0x53, 0xc9, 0xfa, 0x4d, 0x85, 0x6c, 0x97, 0x82, 0xe2, 0x8f, 0xdf, 0x2d, 0x5e, 0x87, 0x69, 0xc4, 0x52 }; + _cleanup_(dns_resource_record_unrefp) DnsResourceRecord *rr = NULL; + uint8_t h[DNSSEC_HASH_SIZE_MAX]; + _cleanup_free_ char *b = NULL; + int k; + + /* The NSEC3 RR for eurid.eu on 2015-12-14. */ + rr = dns_resource_record_new_full(DNS_CLASS_IN, DNS_TYPE_NSEC3, "PJ8S08RR45VIQDAQGE7EN3VHKNROTBMM.eurid.eu."); + assert_se(rr); + + rr->nsec3.algorithm = DNSSEC_DIGEST_SHA1; + rr->nsec3.flags = 1; + rr->nsec3.iterations = 1; + rr->nsec3.salt = memdup(salt, sizeof(salt)); + assert_se(rr->nsec3.salt); + rr->nsec3.salt_size = sizeof(salt); + rr->nsec3.next_hashed_name = memdup(next_hashed_name, sizeof(next_hashed_name)); + assert_se(rr->nsec3.next_hashed_name); + rr->nsec3.next_hashed_name_size = sizeof(next_hashed_name); + + log_info("NSEC3: %s", strna(dns_resource_record_to_string(rr))); + + k = dnssec_nsec3_hash(rr, "eurid.eu", &h); + assert_se(k >= 0); + + b = base32hexmem(h, k, false); + assert_se(b); + assert_se(strcasecmp(b, "PJ8S08RR45VIQDAQGE7EN3VHKNROTBMM") == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/resolve/test-resolve-tables.c b/src/resolve/test-resolve-tables.c new file mode 100644 index 0000000..6b86181 --- /dev/null +++ b/src/resolve/test-resolve-tables.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dns-type.h" +#include "resolved-dns-dnssec.h" +#include "resolved-dns-packet.h" +#include "test-tables.h" +#include "tests.h" + +int main(int argc, char **argv) { + uint16_t i; + + test_setup_logging(LOG_DEBUG); + + test_table(dns_protocol, DNS_PROTOCOL); + test_table(dnssec_result, DNSSEC_RESULT); + test_table(dnssec_verdict, DNSSEC_VERDICT); + + test_table_sparse(dns_rcode, DNS_RCODE); + test_table_sparse(dns_type, DNS_TYPE); + + log_info("/* DNS_TYPE */"); + for (i = 0; i < _DNS_TYPE_MAX; i++) { + const char *s; + + s = 
dns_type_to_string(i); + assert_se(s == NULL || strlen(s) < _DNS_TYPE_STRING_MAX); + + if (s) + log_info("%-*s %s%s%s%s%s%s%s%s%s", + (int) _DNS_TYPE_STRING_MAX - 1, s, + dns_type_is_pseudo(i) ? "pseudo " : "", + dns_type_is_valid_query(i) ? "valid_query " : "", + dns_type_is_valid_rr(i) ? "is_valid_rr " : "", + dns_type_may_redirect(i) ? "may_redirect " : "", + dns_type_is_dnssec(i) ? "dnssec " : "", + dns_type_is_obsolete(i) ? "obsolete " : "", + dns_type_may_wildcard(i) ? "wildcard " : "", + dns_type_apex_only(i) ? "apex_only " : "", + dns_type_needs_authentication(i) ? "needs_authentication" : ""); + } + + log_info("/* DNS_CLASS */"); + for (i = 0; i < _DNS_CLASS_MAX; i++) { + const char *s; + + s = dns_class_to_string(i); + assert_se(s == NULL || strlen(s) < _DNS_CLASS_STRING_MAX); + + if (s) + log_info("%-*s %s%s", + (int) _DNS_CLASS_STRING_MAX - 1, s, + dns_class_is_pseudo(i) ? "is_pseudo " : "", + dns_class_is_valid_rr(i) ? "is_valid_rr " : ""); + } + + return EXIT_SUCCESS; +} diff --git a/src/resolve/test-resolved-etc-hosts.c b/src/resolve/test-resolved-etc-hosts.c new file mode 100644 index 0000000..75f7db3 --- /dev/null +++ b/src/resolve/test-resolved-etc-hosts.c @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "log.h" +#include "resolved-etc-hosts.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(parse_etc_hosts_system) { + _cleanup_fclose_ FILE *f = NULL; + + f = fopen("/etc/hosts", "re"); + if (!f) { + assert_se(errno == ENOENT); + return; + } + + _cleanup_(etc_hosts_clear) EtcHosts hosts = {}; + assert_se(etc_hosts_parse(&hosts, f) == 0); +} + +#define in_addr_4(_address_str) \ + (&(struct in_addr_data) { .family = AF_INET, .address.in = { .s_addr = inet_addr(_address_str) } }) + +#define in_addr_6(...) 
\ + (&(struct in_addr_data) { .family = AF_INET6, .address.in6 = { .s6_addr = __VA_ARGS__ } }) + +#define has_4(_set, _address_str) \ + set_contains(_set, in_addr_4(_address_str)) + +#define has_6(_set, ...) \ + set_contains(_set, in_addr_6(__VA_ARGS__)) + +TEST(parse_etc_hosts) { + _cleanup_(unlink_tempfilep) char + t[] = "/tmp/test-resolved-etc-hosts.XXXXXX"; + + int fd; + _cleanup_fclose_ FILE *f = NULL; + + fd = mkostemp_safe(t); + assert_se(fd >= 0); + + f = fdopen(fd, "r+"); + assert_se(f); + fputs("1.2.3.4 some.where\n" + "1.2.3.5 some.where\n" + "1.2.3.6 dash dash-dash.where-dash\n" + "1.2.3.7 bad-dash- -bad-dash -bad-dash.bad-\n" + "1.2.3.8\n" + "1.2.3.9 before.comment # within.comment\n" + "1.2.3.10 before.comment#within.comment2\n" + "1.2.3.11 before.comment# within.comment3\n" + "1.2.3.12 before.comment#\n" + "1.2.3 short.address\n" + "1.2.3.4.5 long.address\n" + "1::2::3 multi.colon\n" + + "::0 some.where some.other\n" + "0.0.0.0 deny.listed\n" + "::5\t\t\t \tsome.where\tsome.other foobar.foo.foo\t\t\t\n" + " \n", f); + assert_se(fflush_and_check(f) >= 0); + rewind(f); + + _cleanup_(etc_hosts_clear) EtcHosts hosts = {}; + assert_se(etc_hosts_parse(&hosts, f) == 0); + + EtcHostsItemByName *bn; + assert_se(bn = hashmap_get(hosts.by_name, "some.where")); + assert_se(set_size(bn->addresses) == 3); + assert_se(has_4(bn->addresses, "1.2.3.4")); + assert_se(has_4(bn->addresses, "1.2.3.5")); + assert_se(has_6(bn->addresses, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5})); + + assert_se(bn = hashmap_get(hosts.by_name, "dash")); + assert_se(set_size(bn->addresses) == 1); + assert_se(has_4(bn->addresses, "1.2.3.6")); + + assert_se(bn = hashmap_get(hosts.by_name, "dash-dash.where-dash")); + assert_se(set_size(bn->addresses) == 1); + assert_se(has_4(bn->addresses, "1.2.3.6")); + + /* See https://tools.ietf.org/html/rfc1035#section-2.3.1 */ + FOREACH_STRING(s, "bad-dash-", "-bad-dash", "-bad-dash.bad-") + assert_se(!hashmap_get(hosts.by_name, s)); + + 
assert_se(bn = hashmap_get(hosts.by_name, "before.comment")); + assert_se(set_size(bn->addresses) == 4); + assert_se(has_4(bn->addresses, "1.2.3.9")); + assert_se(has_4(bn->addresses, "1.2.3.10")); + assert_se(has_4(bn->addresses, "1.2.3.11")); + assert_se(has_4(bn->addresses, "1.2.3.12")); + + assert_se(!hashmap_get(hosts.by_name, "within.comment")); + assert_se(!hashmap_get(hosts.by_name, "within.comment2")); + assert_se(!hashmap_get(hosts.by_name, "within.comment3")); + assert_se(!hashmap_get(hosts.by_name, "#")); + + assert_se(!hashmap_get(hosts.by_name, "short.address")); + assert_se(!hashmap_get(hosts.by_name, "long.address")); + assert_se(!hashmap_get(hosts.by_name, "multi.colon")); + assert_se(!set_contains(hosts.no_address, "short.address")); + assert_se(!set_contains(hosts.no_address, "long.address")); + assert_se(!set_contains(hosts.no_address, "multi.colon")); + + assert_se(bn = hashmap_get(hosts.by_name, "some.other")); + assert_se(set_size(bn->addresses) == 1); + assert_se(has_6(bn->addresses, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5})); + + EtcHostsItemByAddress *ba; + assert_se(ba = hashmap_get(hosts.by_address, in_addr_4("1.2.3.6"))); + assert_se(set_size(ba->names) == 2); + assert_se(set_contains(ba->names, "dash")); + assert_se(set_contains(ba->names, "dash-dash.where-dash")); + assert_se(streq(ba->canonical_name, "dash")); + + assert_se(ba = hashmap_get(hosts.by_address, in_addr_6({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5}))); + assert_se(set_size(ba->names) == 3); + assert_se(set_contains(ba->names, "some.where")); + assert_se(set_contains(ba->names, "some.other")); + assert_se(set_contains(ba->names, "foobar.foo.foo")); + assert_se(streq(ba->canonical_name, "some.where")); + + assert_se( set_contains(hosts.no_address, "some.where")); + assert_se( set_contains(hosts.no_address, "some.other")); + assert_se( set_contains(hosts.no_address, "deny.listed")); + assert_se(!set_contains(hosts.no_address, "foobar.foo.foo")); +} + +static 
void test_parse_file_one(const char *fname) { + _cleanup_(etc_hosts_clear) EtcHosts hosts = {}; + _cleanup_fclose_ FILE *f = NULL; + + log_info("/* %s(\"%s\") */", __func__, fname); + + assert_se(f = fopen(fname, "re")); + assert_se(etc_hosts_parse(&hosts, f) == 0); +} + +TEST(parse_file) { + for (int i = 1; i < saved_argc; i++) + test_parse_file_one(saved_argv[i]); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/resolve/test-resolved-packet.c b/src/resolve/test-resolved-packet.c new file mode 100644 index 0000000..dd8c969 --- /dev/null +++ b/src/resolve/test-resolved-packet.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "log.h" +#include "resolved-dns-packet.h" +#include "tests.h" + +TEST(dns_packet_new) { + size_t i; + _cleanup_(dns_packet_unrefp) DnsPacket *p2 = NULL; + + for (i = 0; i <= DNS_PACKET_SIZE_MAX; i++) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + + assert_se(dns_packet_new(&p, DNS_PROTOCOL_DNS, i, DNS_PACKET_SIZE_MAX) == 0); + + log_debug("dns_packet_new: %zu → %zu", i, p->allocated); + assert_se(p->allocated >= MIN(DNS_PACKET_SIZE_MAX, i)); + + if (i > DNS_PACKET_SIZE_START + 10 && i < DNS_PACKET_SIZE_MAX - 10) + i = MIN(i * 2, DNS_PACKET_SIZE_MAX - 10); + } + + assert_se(dns_packet_new(&p2, DNS_PROTOCOL_DNS, DNS_PACKET_SIZE_MAX + 1, DNS_PACKET_SIZE_MAX) == -EFBIG); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/resolve/test-resolved-stream.c b/src/resolve/test-resolved-stream.c new file mode 100644 index 0000000..847de04 --- /dev/null +++ b/src/resolve/test-resolved-stream.c @@ -0,0 +1,394 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "resolved-dns-packet.h" +#include "resolved-dns-question.h" +#include 
"resolved-dns-rr.h" +#if ENABLE_DNS_OVER_TLS +#include "resolved-dnstls.h" +#endif +#include "resolved-dns-server.h" +#include "resolved-dns-stream.h" +#include "resolved-manager.h" +#include "sd-event.h" +#include "sparse-endian.h" +#include "tests.h" + +static union sockaddr_union server_address; + +/* Bytes of the questions & answers used in the test, including TCP DNS 2-byte length prefix */ +static const uint8_t QUESTION_A[] = { + 0x00, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 'e', + 'x' , 'a' , 'm' , 'p' , 'l' , 'e' , 0x03, 'c' , 'o' , 'm' , 0x00, 0x00, 0x01, 0x00, 0x01 +}; +static const uint8_t QUESTION_AAAA[] = { + 0x00, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 'e', + 'x' , 'a' , 'm' , 'p' , 'l' , 'e' , 0x03, 'c' , 'o' , 'm' , 0x00, 0x00, 0x1C, 0x00, 0x01 +}; +static const uint8_t ANSWER_A[] = { + 0x00, 0x2D, 0x00, 0x00, 0x81, 0x80, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x07, 'e', + 'x' , 'a' , 'm' , 'p' , 'l' , 'e' , 0x03, 'c' , 'o' , 'm' , 0x00, 0x00, 0x01, 0x00, 0x01, 0xC0, + 0x0C, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x52, 0x8D, 0x00, 0x04, 0x5D, 0xB8, 0xD8, 0x22, +}; +static const uint8_t ANSWER_AAAA[] = { + 0x00, 0x39, 0x00, 0x00, 0x81, 0x80, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x07, 'e', + 'x' , 'a' , 'm' , 'p' , 'l' , 'e' , 0x03, 'c' , 'o' , 'm' , 0x00, 0x00, 0x1C, 0x00, 0x01, 0xC0, + 0x0C, 0x00, 0x1C, 0x00, 0x01, 0x00, 0x00, 0x54, 0x4B, 0x00, 0x10, 0x26, 0x06, 0x28, 0x00, 0x02, + 0x20, 0x00, 0x01, 0x02, 0x48, 0x18, 0x93, 0x25, 0xC8, 0x19, 0x46, +}; + +/** + * A mock TCP DNS server that asserts certain questions are received + * and replies with the same answer every time. 
+ */ +static void receive_and_check_question(int fd, const uint8_t *expected_question, + size_t question_size) { + uint8_t *actual_question; + size_t n_read = 0; + + actual_question = newa(uint8_t, question_size); + while (n_read < question_size) { /* read() may return short counts, so accumulate until the whole question arrived */ + ssize_t r = read(fd, actual_question + n_read, question_size - n_read); + assert_se(r > 0); /* 0 means EOF: the peer hung up early, so fail instead of spinning forever */ + n_read += (size_t)r; + } + assert_se(n_read == question_size); + + assert_se(memcmp(expected_question, actual_question, question_size) == 0); +} + +static void send_answer(int fd, const uint8_t *answer, size_t answer_size) { + assert_se(write(fd, answer, answer_size) == (ssize_t)answer_size); +} + +/* Sends two answers together in a single write operation, + * so they hopefully end up in a single TCP packet / TLS record */ +static void send_answers_together(int fd, + const uint8_t *answer1, size_t answer1_size, + const uint8_t *answer2, size_t answer2_size) { + uint8_t *answer; + size_t answer_size = answer1_size + answer2_size; + + answer = newa(uint8_t, answer_size); + memcpy(answer, answer1, answer1_size); + memcpy(answer + answer1_size, answer2, answer2_size); + assert_se(write(fd, answer, answer_size) == (ssize_t)answer_size); +} + +static void server_handle(int fd) { + receive_and_check_question(fd, QUESTION_A, sizeof(QUESTION_A)); + send_answer(fd, ANSWER_A, sizeof(ANSWER_A)); + + receive_and_check_question(fd, QUESTION_AAAA, sizeof(QUESTION_AAAA)); + send_answer(fd, ANSWER_AAAA, sizeof(ANSWER_AAAA)); + + receive_and_check_question(fd, QUESTION_A, sizeof(QUESTION_A)); + receive_and_check_question(fd, QUESTION_AAAA, sizeof(QUESTION_AAAA)); + send_answers_together(fd, ANSWER_A, sizeof(ANSWER_A), + ANSWER_AAAA, sizeof(ANSWER_AAAA)); +} + +static void *tcp_dns_server(void *p) { + _cleanup_close_ int bindfd = -EBADF, acceptfd = -EBADF; + + assert_se((bindfd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, 0)) >= 0); + assert_se(setsockopt(bindfd, SOL_SOCKET, SO_REUSEADDR, &(int){1}, sizeof(int)) >= 0); + assert_se(bind(bindfd, 
&server_address.sa, SOCKADDR_LEN(server_address)) >= 0); + assert_se(listen(bindfd, 1) >= 0); + assert_se((acceptfd = accept(bindfd, NULL, NULL)) >= 0); + server_handle(acceptfd); + return NULL; +} + +#if ENABLE_DNS_OVER_TLS +/* + * Spawns a DNS TLS server using the command line "openssl s_server" tool. + */ +static void *tls_dns_server(void *p) { + pid_t openssl_pid; + int r; + _cleanup_close_ int fd_server = -EBADF, fd_tls = -EBADF; + _cleanup_free_ char *cert_path = NULL, *key_path = NULL; + _cleanup_free_ char *bind_str = NULL; + + assert_se(get_testdata_dir("test-resolve/selfsigned.cert", &cert_path) >= 0); + assert_se(get_testdata_dir("test-resolve/selfsigned.key", &key_path) >= 0); + + assert_se(asprintf(&bind_str, "%s:%d", + IN_ADDR_TO_STRING(server_address.in.sin_family, + sockaddr_in_addr(&server_address.sa)), + be16toh(server_address.in.sin_port)) >= 0); + + /* We will hook one of the socketpair ends to OpenSSL's TLS server + * stdin/stdout, so we will be able to read and write plaintext + * from the other end's file descriptor like a usual TCP server */ + { + int fd[2]; + assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, fd) >= 0); + fd_server = fd[0]; + fd_tls = fd[1]; + } + + r = safe_fork_full("(test-resolved-stream-tls-openssl)", + (int[]) { fd_tls, fd_tls, STDOUT_FILENO }, + NULL, 0, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG|FORK_REOPEN_LOG, + &openssl_pid); + assert_se(r >= 0); + if (r == 0) { + /* Child */ + execlp("openssl", "openssl", "s_server", "-accept", bind_str, + "-key", key_path, "-cert", cert_path, + "-quiet", "-naccept", "1", NULL); + log_error("exec failed, is something wrong with the 'openssl' command?"); + _exit(EXIT_FAILURE); + } else { + pthread_mutex_t *server_lock = (pthread_mutex_t *)p; + + server_handle(fd_server); + + /* Once the test is done kill the TLS server to release the port */ + assert_se(pthread_mutex_lock(server_lock) == 0); + assert_se(kill(openssl_pid, SIGTERM) >= 0); 
+ assert_se(waitpid(openssl_pid, NULL, 0) >= 0); + assert_se(pthread_mutex_unlock(server_lock) == 0); + } + + return NULL; +} +#endif + +static const char *TEST_DOMAIN = "example.com"; +static const uint64_t EVENT_TIMEOUT_USEC = 5 * 1000 * 1000; + +static void send_simple_question(DnsStream *stream, uint16_t type) { + _cleanup_(dns_packet_unrefp) DnsPacket *p = NULL; + _cleanup_(dns_resource_key_unrefp) DnsResourceKey *key = NULL; + _cleanup_(dns_question_unrefp) DnsQuestion *question = NULL; + + assert_se(dns_packet_new(&p, DNS_PROTOCOL_DNS, 0, DNS_PACKET_SIZE_MAX) >= 0); + assert_se(question = dns_question_new(1)); + assert_se(key = dns_resource_key_new(DNS_CLASS_IN, type, TEST_DOMAIN)); + assert_se(dns_question_add(question, key, 0) >= 0); + assert_se(dns_packet_append_question(p, question) >= 0); + DNS_PACKET_HEADER(p)->qdcount = htobe16(dns_question_size(question)); + assert_se(dns_stream_write_packet(stream, p) >= 0); +} + +static const size_t MAX_RECEIVED_PACKETS = 2; +static DnsPacket *received_packets[2] = {}; +static size_t n_received_packets = 0; + +static int on_stream_packet(DnsStream *stream, DnsPacket *p) { + assert_se(n_received_packets < MAX_RECEIVED_PACKETS); + assert_se(received_packets[n_received_packets++] = dns_packet_ref(p)); + return 0; +} + +static int on_stream_complete_do_nothing(DnsStream *s, int error) { + return 0; +} + +static void test_dns_stream(bool tls) { + Manager manager = {}; + _cleanup_(dns_stream_unrefp) DnsStream *stream = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_close_ int clientfd = -EBADF; + int r; + + void *(*server_entrypoint)(void *); + pthread_t server_thread; + pthread_mutex_t server_lock; + + log_info("test-resolved-stream: Started %s test", tls ? "TLS" : "TCP"); + +#if ENABLE_DNS_OVER_TLS + if (tls) + /* For TLS mode, use DNS_OVER_TLS_OPPORTUNISTIC instead of DNS_OVER_TLS_YES, just to make + * certificate validation more lenient, allowing us to use self-signed certificates. 
We + * never downgrade, everything we test always goes over TLS */ + manager.dns_over_tls_mode = DNS_OVER_TLS_OPPORTUNISTIC; +#endif + + assert_se(sd_event_new(&event) >= 0); + manager.event = event; + + /* Set up a mock DNS (over TCP or TLS) server */ + server_entrypoint = tcp_dns_server; +#if ENABLE_DNS_OVER_TLS + if (tls) + server_entrypoint = tls_dns_server; +#endif + assert_se(pthread_mutex_init(&server_lock, NULL) == 0); + assert_se(pthread_mutex_lock(&server_lock) == 0); + assert_se(pthread_create(&server_thread, NULL, server_entrypoint, &server_lock) == 0); + + /* Create a socket client and connect to the TCP or TLS server + * The server may not be up immediately, so try to connect a few times before failing */ + assert_se((clientfd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, 0)) >= 0); + + for (int i = 0; i < 100; i++) { + r = connect(clientfd, &server_address.sa, SOCKADDR_LEN(server_address)); + if (r >= 0) + break; + usleep_safe(EVENT_TIMEOUT_USEC / 100); + } + assert_se(r >= 0); + + /* systemd-resolved uses (and requires) the socket to be in nonblocking mode */ + assert_se(fcntl(clientfd, F_SETFL, O_NONBLOCK) >= 0); + + /* Initialize DNS stream (disabling the default self-destruction + behaviour when no complete callback is set) */ + assert_se(dns_stream_new(&manager, &stream, DNS_STREAM_LOOKUP, DNS_PROTOCOL_DNS, + TAKE_FD(clientfd), NULL, on_stream_packet, on_stream_complete_do_nothing, + DNS_STREAM_DEFAULT_TIMEOUT_USEC) >= 0); +#if ENABLE_DNS_OVER_TLS + if (tls) { + DnsServer server = { + .manager = &manager, + .family = server_address.sa.sa_family, + .address = *sockaddr_in_addr(&server_address.sa), + }; + + assert_se(dnstls_manager_init(&manager) >= 0); + assert_se(dnstls_stream_connect_tls(stream, &server) >= 0); + } +#endif + + /* Test: Question of type A and associated answer */ + log_info("test-resolved-stream: A record"); + send_simple_question(stream, DNS_TYPE_A); + while (n_received_packets != 1) + assert_se(sd_event_run(event, 
EVENT_TIMEOUT_USEC) >= 1); + assert_se(DNS_PACKET_DATA(received_packets[0])); + assert_se(memcmp(DNS_PACKET_DATA(received_packets[0]), + ANSWER_A + 2, sizeof(ANSWER_A) - 2) == 0); + dns_packet_unref(TAKE_PTR(received_packets[0])); + n_received_packets = 0; + + /* Test: Question of type AAAA and associated answer */ + log_info("test-resolved-stream: AAAA record"); + send_simple_question(stream, DNS_TYPE_AAAA); + while (n_received_packets != 1) + assert_se(sd_event_run(event, EVENT_TIMEOUT_USEC) >= 1); + assert_se(DNS_PACKET_DATA(received_packets[0])); + assert_se(memcmp(DNS_PACKET_DATA(received_packets[0]), + ANSWER_AAAA + 2, sizeof(ANSWER_AAAA) - 2) == 0); + dns_packet_unref(TAKE_PTR(received_packets[0])); + n_received_packets = 0; + + /* Test: Question of type A and AAAA and associated answers + * Both answers are sent back in a single packet or TLS record + * (tests the fix of PR #22132: "Fix DoT timeout on multiple answer records") */ + log_info("test-resolved-stream: A + AAAA record"); + send_simple_question(stream, DNS_TYPE_A); + send_simple_question(stream, DNS_TYPE_AAAA); + + while (n_received_packets != 2) + assert_se(sd_event_run(event, EVENT_TIMEOUT_USEC) >= 1); + assert_se(DNS_PACKET_DATA(received_packets[0])); + assert_se(DNS_PACKET_DATA(received_packets[1])); + assert_se(memcmp(DNS_PACKET_DATA(received_packets[0]), + ANSWER_A + 2, sizeof(ANSWER_A) - 2) == 0); + assert_se(memcmp(DNS_PACKET_DATA(received_packets[1]), + ANSWER_AAAA + 2, sizeof(ANSWER_AAAA) - 2) == 0); + dns_packet_unref(TAKE_PTR(received_packets[0])); + dns_packet_unref(TAKE_PTR(received_packets[1])); + n_received_packets = 0; + +#if ENABLE_DNS_OVER_TLS + if (tls) + dnstls_manager_free(&manager); +#endif + + /* Stop the DNS server */ + assert_se(pthread_mutex_unlock(&server_lock) == 0); + assert_se(pthread_join(server_thread, NULL) == 0); + assert_se(pthread_mutex_destroy(&server_lock) == 0); + + log_info("test-resolved-stream: Finished %s test", tls ? 
"TLS" : "TCP"); +} + +static void try_isolate_network(void) { + _cleanup_close_ int socket_fd = -EBADF; + int r; + + /* First test if CLONE_NEWUSER/CLONE_NEWNET can actually work for us, i.e. we can open the namespaces + * and then still access the build dir we are run from. We do that in a child process since it's + * nasty if we have to go back from the namespace once we entered it and realized it cannot work. */ + r = safe_fork("(usernstest)", FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, NULL); + if (r == 0) { /* child */ + _cleanup_free_ char *rt = NULL, *d = NULL; + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { + log_warning_errno(errno, "test-resolved-stream: Can't create user and network ns, running on host: %m"); + _exit(EXIT_FAILURE); + } + + assert_se(get_process_exe(0, &rt) >= 0); + assert_se(path_extract_directory(rt, &d) >= 0); + + if (access(d, F_OK) < 0) { + log_warning_errno(errno, "test-resolved-stream: Can't access /proc/self/exe from user/network ns, running on host: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + if (r == -EPROTO) /* EPROTO means nonzero exit code of child, i.e. 
the tests in the child failed */ + return; + assert_se(r > 0); + + /* Now that we know that the unshare() is safe, let's actually do it */ + assert_se(unshare(CLONE_NEWUSER | CLONE_NEWNET) >= 0); + + /* Bring up the loopback interface on the newly created network namespace */ + struct ifreq req = { .ifr_ifindex = 1 }; + assert_se((socket_fd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, 0)) >= 0); + assert_se(ioctl(socket_fd, SIOCGIFNAME, &req) >= 0); + assert_se(ioctl(socket_fd, SIOCGIFFLAGS, &req) >= 0); + assert_se(FLAGS_SET(req.ifr_flags, IFF_LOOPBACK)); + req.ifr_flags |= IFF_UP; + assert_se(ioctl(socket_fd, SIOCSIFFLAGS, &req) >= 0); +} + +int main(int argc, char **argv) { + server_address = (union sockaddr_union) { + .in.sin_family = AF_INET, + .in.sin_port = htobe16(random_u64_range(UINT16_MAX - 1024) + 1024), + .in.sin_addr.s_addr = htobe32(INADDR_LOOPBACK) + }; + + test_setup_logging(LOG_DEBUG); + + try_isolate_network(); + + test_dns_stream(false); +#if ENABLE_DNS_OVER_TLS + if (system("openssl version >/dev/null 2>&1") != 0) + return log_tests_skipped("Skipping TLS test since the 'openssl' command does not seem to be available"); + test_dns_stream(true); +#endif + + return 0; +} diff --git a/src/rfkill/meson.build b/src/rfkill/meson.build new file mode 100644 index 0000000..aa13b00 --- /dev/null +++ b/src/rfkill/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-rfkill', + 'conditions' : ['ENABLE_RFKILL'], + 'sources' : files('rfkill.c'), + }, +] diff --git a/src/rfkill/rfkill.c b/src/rfkill/rfkill.c new file mode 100644 index 0000000..be2a7f8 --- /dev/null +++ b/src/rfkill/rfkill.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "escape.h" +#include "fd-util.h" +#include 
"fileio.h" +#include "io-util.h" +#include "list.h" +#include "main-func.h" +#include "mkdir.h" +#include "parse-util.h" +#include "reboot-util.h" +#include "string-table.h" +#include "string-util.h" +#include "udev-util.h" + +/* Note that any write is delayed until exit and the rfkill state will not be + * stored for rfkill indices that disappear after a change. */ +#define EXIT_USEC (5 * USEC_PER_SEC) + +typedef struct write_queue_item { + LIST_FIELDS(struct write_queue_item, queue); + int rfkill_idx; + char *file; + int state; +} write_queue_item; + +typedef struct Context { + LIST_HEAD(write_queue_item, write_queue); + int rfkill_fd; +} Context; + +static struct write_queue_item* write_queue_item_free(struct write_queue_item *item) { + if (!item) + return NULL; + + free(item->file); + return mfree(item); +} + +static const char* const rfkill_type_table[NUM_RFKILL_TYPES] = { + [RFKILL_TYPE_ALL] = "all", + [RFKILL_TYPE_WLAN] = "wlan", + [RFKILL_TYPE_BLUETOOTH] = "bluetooth", + [RFKILL_TYPE_UWB] = "uwb", + [RFKILL_TYPE_WIMAX] = "wimax", + [RFKILL_TYPE_WWAN] = "wwan", + [RFKILL_TYPE_GPS] = "gps", + [RFKILL_TYPE_FM] = "fm", + [RFKILL_TYPE_NFC] = "nfc", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(rfkill_type, int); + +static int find_device( + const struct rfkill_event *event, + sd_device **ret) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _cleanup_free_ char *sysname = NULL; + const char *name; + int r; + + assert(event); + assert(ret); + + if (asprintf(&sysname, "rfkill%u", event->idx) < 0) + return log_oom(); + + r = sd_device_new_from_subsystem_sysname(&device, "rfkill", sysname); + if (r < 0) + return log_full_errno(ERRNO_IS_DEVICE_ABSENT(r) ? 
LOG_DEBUG : LOG_ERR, r, + "Failed to open device '%s': %m", sysname); + + r = sd_device_get_sysattr_value(device, "name", &name); + if (r < 0) + return log_device_debug_errno(device, r, "Device has no name, ignoring: %m"); + + log_device_debug(device, "Operating on rfkill device '%s'.", name); + + *ret = TAKE_PTR(device); + return 0; +} + +static int determine_state_file( + const struct rfkill_event *event, + char **ret) { + + _cleanup_(sd_device_unrefp) sd_device *d = NULL, *device = NULL; + const char *path_id, *type; + char *state_file; + int r; + + assert(event); + assert(ret); + + r = find_device(event, &d); + if (r < 0) + return r; + + r = device_wait_for_initialization(d, "rfkill", USEC_INFINITY, &device); + if (r < 0) + return r; + + assert_se(type = rfkill_type_to_string(event->type)); + + if (sd_device_get_property_value(device, "ID_PATH", &path_id) >= 0) { + _cleanup_free_ char *escaped_path_id = NULL; + + escaped_path_id = cescape(path_id); + if (!escaped_path_id) + return log_oom(); + + state_file = strjoin("/var/lib/systemd/rfkill/", escaped_path_id, ":", type); + } else + state_file = strjoin("/var/lib/systemd/rfkill/", type); + + if (!state_file) + return log_oom(); + + *ret = state_file; + return 0; +} + +static int load_state(Context *c, const struct rfkill_event *event) { + _cleanup_free_ char *state_file = NULL, *value = NULL; + int b, r; + + assert(c); + assert(c->rfkill_fd >= 0); + assert(event); + + if (!shall_restore_state()) + return 0; + + r = determine_state_file(event, &state_file); + if (r < 0) + return r; + + r = read_one_line_file(state_file, &value); + if (IN_SET(r, -ENOENT, 0)) { + /* No state file or it's truncated? 
Then save the current state */ + + r = write_string_file(state_file, one_zero(event->soft), WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_MKDIR_0755); + if (r < 0) + return log_error_errno(r, "Failed to write state file %s: %m", state_file); + + log_debug("Saved state '%s' to %s.", one_zero(event->soft), state_file); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to read state file %s: %m", state_file); + + b = parse_boolean(value); + if (b < 0) + return log_error_errno(b, "Failed to parse state file %s: %m", state_file); + + struct rfkill_event we = { + .idx = event->idx, + .op = RFKILL_OP_CHANGE, + .soft = b, + }; + assert_cc(offsetof(struct rfkill_event, op) < RFKILL_EVENT_SIZE_V1); + assert_cc(offsetof(struct rfkill_event, soft) < RFKILL_EVENT_SIZE_V1); + + ssize_t l = write(c->rfkill_fd, &we, sizeof we); + if (l < 0) + return log_error_errno(errno, "Failed to restore rfkill state for %u: %m", event->idx); + if ((size_t)l < RFKILL_EVENT_SIZE_V1) /* l cannot be < 0 here. 
Cast to fix -Werror=sign-compare */ + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Couldn't write rfkill event structure, too short (wrote %zd of %zu bytes).", + l, sizeof we); + log_debug("Writing struct rfkill_event successful (%zd of %zu bytes).", l, sizeof we); + + log_debug("Loaded state '%s' from %s.", one_zero(b), state_file); + return 0; +} + +static void save_state_queue_remove(Context *c, int idx, const char *state_file) { + assert(c); + + LIST_FOREACH(queue, item, c->write_queue) + if ((state_file && streq(item->file, state_file)) || idx == item->rfkill_idx) { + log_debug("Canceled previous save state of '%s' to %s.", one_zero(item->state), item->file); + LIST_REMOVE(queue, c->write_queue, item); + write_queue_item_free(item); + } +} + +static int save_state_queue(Context *c, const struct rfkill_event *event) { + _cleanup_free_ char *state_file = NULL; + struct write_queue_item *item; + int r; + + assert(c); + assert(c->rfkill_fd >= 0); + assert(event); + + r = determine_state_file(event, &state_file); + if (r < 0) + return r; + + save_state_queue_remove(c, event->idx, state_file); /* drop any pending write for this index or state file before queueing the new one */ + + item = new0(struct write_queue_item, 1); + if (!item) + return log_oom(); /* log the failure: run() discards our return value, so a bare -ENOMEM would go unnoticed */ + + item->file = TAKE_PTR(state_file); + item->rfkill_idx = event->idx; + item->state = event->soft; + + LIST_APPEND(queue, c->write_queue, item); + + return 0; +} + +static int save_state_cancel(Context *c, const struct rfkill_event *event) { + _cleanup_free_ char *state_file = NULL; + int r; + + assert(c); + assert(c->rfkill_fd >= 0); + assert(event); + + r = determine_state_file(event, &state_file); + save_state_queue_remove(c, event->idx, state_file); + if (r < 0) + return r; + + return 0; +} + +static int save_state_write_one(struct write_queue_item *item) { + int r; + + r = write_string_file(item->file, one_zero(item->state), WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC|WRITE_STRING_FILE_MKDIR_0755); + if (r < 0) + return log_error_errno(r, "Failed to write state file %s: %m", 
item->file); + + log_debug("Saved state '%s' to %s.", one_zero(item->state), item->file); + return 0; +} + +static void context_save_and_clear(Context *c) { + struct write_queue_item *i; + + assert(c); + + while ((i = LIST_POP(queue, c->write_queue))) { + (void) save_state_write_one(i); + write_queue_item_free(i); + } + + safe_close(c->rfkill_fd); +} + +static int run(int argc, char *argv[]) { + _cleanup_(context_save_and_clear) Context c = { .rfkill_fd = -EBADF }; + bool ready = false; + int r, n; + + if (argc > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program requires no arguments."); + + log_setup(); + + umask(0022); + + n = sd_listen_fds(false); + if (n < 0) + return log_error_errno(n, "Failed to determine whether we got any file descriptors passed: %m"); + if (n > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got too many file descriptors."); + + if (n == 0) { + c.rfkill_fd = open("/dev/rfkill", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (c.rfkill_fd < 0) { + if (errno == ENOENT) { + log_debug_errno(errno, "Missing rfkill subsystem, or no device present, exiting."); + return 0; + } + + return log_error_errno(errno, "Failed to open /dev/rfkill: %m"); + } + } else { + c.rfkill_fd = SD_LISTEN_FDS_START; + + r = fd_nonblock(c.rfkill_fd, 1); + if (r < 0) + return log_error_errno(r, "Failed to make /dev/rfkill socket non-blocking: %m"); + } + + for (;;) { + struct rfkill_event event = {}; + + ssize_t l = read(c.rfkill_fd, &event, sizeof event); + if (l < 0) { + if (errno != EAGAIN) + return log_error_errno(errno, "Failed to read from /dev/rfkill: %m"); + + if (!ready) { + /* Notify manager that we are now finished with processing whatever was + * queued */ + r = sd_notify(false, "READY=1"); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + + ready = true; + } + + /* Hang around for a bit, maybe there's more coming */ + + r = fd_wait_for_event(c.rfkill_fd, POLLIN, EXIT_USEC); + if (r == -EINTR) + 
continue; + if (r < 0) + return log_error_errno(r, "Failed to poll() on device: %m"); + if (r > 0) + continue; + + log_debug("All events read and idle, exiting."); + break; + } + + if ((size_t)l < RFKILL_EVENT_SIZE_V1) /* l cannot be < 0 here. Cast to fix -Werror=sign-compare */ + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read of struct rfkill_event: (%zd < %zu)", + l, (size_t) RFKILL_EVENT_SIZE_V1); /* Casting necessary to make compiling with different kernel versions happy */ + log_debug("Reading struct rfkill_event: got %zd bytes.", l); + + /* The event structure has more fields. We only care about the first few, so it's OK if we + * don't read the full structure. */ + assert_cc(offsetof(struct rfkill_event, op) < RFKILL_EVENT_SIZE_V1); + assert_cc(offsetof(struct rfkill_event, type) < RFKILL_EVENT_SIZE_V1); + + const char *type = rfkill_type_to_string(event.type); + if (!type) { + log_debug("An rfkill device of unknown type %u discovered, ignoring.", event.type); + continue; + } + + switch (event.op) { + + case RFKILL_OP_ADD: + log_debug("A new rfkill device has been added with index %u and type %s.", event.idx, type); + (void) load_state(&c, &event); + break; + + case RFKILL_OP_DEL: + log_debug("An rfkill device has been removed with index %u and type %s", event.idx, type); + (void) save_state_cancel(&c, &event); + break; + + case RFKILL_OP_CHANGE: + log_debug("An rfkill device has changed state with index %u and type %s", event.idx, type); + (void) save_state_queue(&c, &event); + break; + + default: + log_debug("Unknown event %u from /dev/rfkill for index %u and type %s, ignoring.", event.op, event.idx, type); + break; + } + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/rpm/macros.systemd.in b/src/rpm/macros.systemd.in new file mode 100644 index 0000000..241e4b9 --- /dev/null +++ b/src/rpm/macros.systemd.in @@ -0,0 +1,199 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# 
This file is part of systemd. + +# RPM macros for packages installing systemd unit files + +%_systemd_util_dir {{LIBEXECDIR}} +%_unitdir {{SYSTEM_DATA_UNIT_DIR}} +%_userunitdir {{USER_DATA_UNIT_DIR}} +%_presetdir {{SYSTEM_PRESET_DIR}} +%_userpresetdir {{USER_PRESET_DIR}} +%_udevhwdbdir {{UDEV_HWDB_DIR}} +%_udevrulesdir {{UDEV_RULES_DIR}} +%_journalcatalogdir {{SYSTEMD_CATALOG_DIR}} +%_binfmtdir {{BINFMT_DIR}} +%_sysctldir {{SYSCTL_DIR}} +%_sysusersdir {{SYSUSERS_DIR}} +%_tmpfilesdir {{TMPFILES_DIR}} +%_user_tmpfilesdir {{USER_TMPFILES_DIR}} +%_environmentdir {{ENVIRONMENT_DIR}} +%_modulesloaddir {{MODULESLOAD_DIR}} +%_modprobedir {{MODPROBE_DIR}} +%_systemdgeneratordir {{SYSTEM_GENERATOR_DIR}} +%_systemdusergeneratordir {{USER_GENERATOR_DIR}} +%_systemd_system_env_generator_dir {{SYSTEM_ENV_GENERATOR_DIR}} +%_systemd_user_env_generator_dir {{USER_ENV_GENERATOR_DIR}} + +# Because we had one release with a typo... +# This is temporary (Remove after systemd 240 is released) +%_environmnentdir %{warn:Use %%_environmentdir instead}%_environmentdir + +%systemd_requires \ +Requires(post): systemd \ +Requires(preun): systemd \ +Requires(postun): systemd \ +%{nil} + +%systemd_ordering \ +OrderWithRequires(post): systemd \ +OrderWithRequires(preun): systemd \ +OrderWithRequires(postun): systemd \ +%{nil} + +%__systemd_someargs_0(:) %{error:The %%%1 macro requires some arguments} +%__systemd_twoargs_2() %{nil} + +%systemd_post() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_post}} \ +if [ $1 -eq 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Initial installation \ + {{SYSTEMD_UPDATE_HELPER_PATH}} install-system-units %{?*} || : \ +fi \ +%{nil} + +%systemd_user_post() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_post}} \ +if [ $1 -eq 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Initial installation \ + {{SYSTEMD_UPDATE_HELPER_PATH}} install-user-units %{?*} || : \ +fi \ +%{nil} + +%systemd_preun() 
\ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_preun}} \ +if [ $1 -eq 0 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package removal, not upgrade \ + {{SYSTEMD_UPDATE_HELPER_PATH}} remove-system-units %{?*} || : \ +fi \ +%{nil} + +%systemd_user_preun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_preun}} \ +if [ $1 -eq 0 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package removal, not upgrade \ + {{SYSTEMD_UPDATE_HELPER_PATH}} remove-user-units %{?*} || : \ +fi \ +%{nil} + +%systemd_postun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_postun}} \ +%{nil} + +%systemd_user_postun() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_postun}} \ +%{nil} + +%systemd_postun_with_restart() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_postun_with_restart}} \ +if [ $1 -ge 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package upgrade, not uninstall \ + {{SYSTEMD_UPDATE_HELPER_PATH}} mark-restart-system-units %{?*} || : \ +fi \ +%{nil} + +%systemd_user_postun_with_restart() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_postun_with_restart}} \ +if [ $1 -ge 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package upgrade, not uninstall \ + {{SYSTEMD_UPDATE_HELPER_PATH}} mark-restart-user-units %{?*} || : \ +fi \ +%{nil} + +%systemd_postun_with_reload() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_postun_with_reload}} \ +if [ $1 -ge 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package upgrade, not uninstall \ + {{SYSTEMD_UPDATE_HELPER_PATH}} mark-reload-system-units %{?*} || : \ +fi \ +%{nil} + +%systemd_user_postun_with_reload() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# systemd_user_postun_with_reload}} \ +if [ $1 -ge 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package upgrade, not uninstall \ + 
{{SYSTEMD_UPDATE_HELPER_PATH}} mark-reload-user-units %{?*} || : \ +fi \ +%{nil} + +%systemd_user_daemon_reexec() \ +if [ $1 -ge 1 ] && [ -x "{{SYSTEMD_UPDATE_HELPER_PATH}}" ]; then \ + # Package upgrade, not uninstall \ + {{SYSTEMD_UPDATE_HELPER_PATH}} user-reexec || : \ +fi \ +%{nil} + +%udev_hwdb_update() %{nil} + +%udev_rules_update() %{nil} + +%journal_catalog_update() %{nil} + +# Deprecated. Use %tmpfiles_create_package instead +%tmpfiles_create() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# tmpfiles_create}} \ +command -v systemd-tmpfiles >/dev/null && systemd-tmpfiles --create %{?*} || : \ +%{nil} + +# Deprecated. Use %sysusers_create_package instead +%sysusers_create() \ +%{expand:%%{?__systemd_someargs_%#:%%__systemd_someargs_%# sysusers_create}} \ +command -v systemd-sysusers >/dev/null && systemd-sysusers %{?*} || : \ +%{nil} + +%sysusers_create_inline() \ +command -v systemd-sysusers >/dev/null && systemd-sysusers - < 2 ? tuple[2] : '', + build_by_default : true) +endforeach diff --git a/src/rpm/systemd-update-helper.in b/src/rpm/systemd-update-helper.in new file mode 100755 index 0000000..c81e16c --- /dev/null +++ b/src/rpm/systemd-update-helper.in @@ -0,0 +1,141 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu +set -o pipefail + +command="${1:?}" +shift + +command -v systemctl >/dev/null || exit 0 + +case "$command" in + install-system-units) + systemctl --no-reload preset "$@" + ;; + + install-user-units) + systemctl --no-reload preset --global "$@" + ;; + + remove-system-units) + if [ -d /run/systemd/system ]; then + systemctl --no-reload disable --now --no-warn "$@" + else + systemctl --no-reload disable --no-warn "$@" + fi + ;; + + remove-user-units) + systemctl --global disable --no-warn "$@" + + [ -d /run/systemd/system ] || exit 0 + + users=$(systemctl list-units 'user@*' --legend=no | sed -n -r 's/.*user@([0-9]+).service.*/\1/p') + for user in $users; do + 
SYSTEMD_BUS_TIMEOUT={{UPDATE_HELPER_USER_TIMEOUT_SEC}}s \ + systemctl --user -M "$user@" disable --now --no-warn "$@" & + done + wait + ;; + + mark-restart-system-units) + [ -d /run/systemd/system ] || exit 0 + + for unit in "$@"; do + systemctl set-property "$unit" Markers=+needs-restart & + done + wait + ;; + + mark-reload-system-units) + [ -d /run/systemd/system ] || exit 0 + + for unit in "$@"; do + systemctl set-property "$unit" Markers=+needs-reload & + done + wait + ;; + + mark-restart-user-units) + [ -d /run/systemd/system ] || exit 0 + + users=$(systemctl list-units 'user@*' --legend=no | sed -n -r 's/.*user@([0-9]+).service.*/\1/p') + for user in $users; do + for unit in "$@"; do + SYSTEMD_BUS_TIMEOUT={{UPDATE_HELPER_USER_TIMEOUT_SEC}}s \ + systemctl --user -M "$user@" set-property "$unit" Markers=+needs-restart & + done + done + wait + ;; + + mark-reload-user-units) + [ -d /run/systemd/system ] || exit 0 + + users=$(systemctl list-units 'user@*' --legend=no | sed -n -r 's/.*user@([0-9]+).service.*/\1/p') + for user in $users; do + for unit in "$@"; do + SYSTEMD_BUS_TIMEOUT={{UPDATE_HELPER_USER_TIMEOUT_SEC}}s \ + systemctl --user -M "$user@" set-property "$unit" Markers=+needs-reload & + done + done + wait + ;; + + system-reload-restart|system-reload|system-restart) + if [ -n "$*" ]; then + echo "Unexpected arguments for '$command': $*" + exit 2 + fi + + [ -d /run/systemd/system ] || exit 0 + + if [[ "$command" =~ reload ]]; then + systemctl daemon-reload + fi + + if [[ "$command" =~ restart ]]; then + systemctl reload-or-restart --marked + fi + ;; + + user-reload-restart|user-reload|user-restart|user-reexec) + if [ -n "$*" ]; then + echo "Unexpected arguments for '$command': $*" + exit 2 + fi + + [ -d /run/systemd/system ] || exit 0 + + users=$(systemctl list-units 'user@*' --legend=no | sed -n -r 's/.*user@([0-9]+).service.*/\1/p') + + if [[ "$command" =~ reexec ]]; then + for user in $users; do + SYSTEMD_BUS_TIMEOUT={{UPDATE_HELPER_USER_TIMEOUT_SEC}}s 
\ + systemctl --user -M "$user@" daemon-reexec & + done + wait + fi + + if [[ "$command" =~ reload ]]; then + for user in $users; do + SYSTEMD_BUS_TIMEOUT={{UPDATE_HELPER_USER_TIMEOUT_SEC}}s \ + systemctl --user -M "$user@" daemon-reload & + done + wait + fi + + if [[ "$command" =~ restart ]]; then + for user in $users; do + SYSTEMD_BUS_TIMEOUT={{UPDATE_HELPER_USER_TIMEOUT_SEC}}s \ + systemctl --user -M "$user@" reload-or-restart --marked & + done + wait + fi + ;; + + *) + echo "Unknown verb '$command'" + exit 3 + ;; +esac diff --git a/src/rpm/triggers.systemd.in b/src/rpm/triggers.systemd.in new file mode 100644 index 0000000..d480ab8 --- /dev/null +++ b/src/rpm/triggers.systemd.in @@ -0,0 +1,82 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# Copyright © 2018 Neal Gompa + +# The contents of this are an example to be copied into systemd.spec. +# +# Minimum rpm version supported: 4.14.0 + +%transfiletriggerin -P 900900 -p <lua> -- {{SYSTEM_DATA_UNIT_DIR}} /etc/systemd/system +-- This script will run after any package is initially installed or +-- upgraded. We care about the case where a package is initially +-- installed, because other cases are covered by the *un scriptlets, +-- so sometimes we will reload needlessly. +assert(rpm.execute("{{SYSTEMD_UPDATE_HELPER_PATH}}", "system-reload-restart")) + +%transfiletriggerin -P 900899 -p <lua> -- {{USER_DATA_UNIT_DIR}} /etc/systemd/user +assert(rpm.execute("{{SYSTEMD_UPDATE_HELPER_PATH}}", "user-reload-restart")) + +%transfiletriggerpostun -P 1000100 -p <lua> -- {{SYSTEM_DATA_UNIT_DIR}} /etc/systemd/system +-- On removal, we need to run daemon-reload after any units have been +-- removed. +-- On upgrade, we need to run daemon-reload after any new unit files +-- have been installed, but before %postun scripts in packages get +-- executed.
+assert(rpm.execute("{{SYSTEMD_UPDATE_HELPER_PATH}}", "system-reload")) + +%transfiletriggerpostun -P 1000099 -p <lua> -- {{USER_DATA_UNIT_DIR}} /etc/systemd/user +-- Execute daemon-reload in user managers. +assert(rpm.execute("{{SYSTEMD_UPDATE_HELPER_PATH}}", "user-reload")) + +%transfiletriggerpostun -P 10000 -p <lua> -- {{SYSTEM_DATA_UNIT_DIR}} /etc/systemd/system +-- We restart remaining system services that should be restarted here. +assert(rpm.execute("{{SYSTEMD_UPDATE_HELPER_PATH}}", "system-restart")) + +%transfiletriggerpostun -P 9999 -p <lua> -- {{USER_DATA_UNIT_DIR}} /etc/systemd/user +-- We restart remaining user services that should be restarted here. +assert(rpm.execute("{{SYSTEMD_UPDATE_HELPER_PATH}}", "user-restart")) + +%transfiletriggerin -P 1000700 -p <lua> -- {{SYSUSERS_DIR}} +-- This script will process files installed in {{SYSUSERS_DIR}} to create +-- specified users automatically. The priority is set such that it +-- will run before the tmpfiles file trigger. +assert(rpm.execute("systemd-sysusers")) + +%transfiletriggerin -P 1000700 udev -p <lua> -- {{UDEV_HWDB_DIR}} +-- This script will automatically invoke hwdb update if files have been +-- installed or updated in {{UDEV_HWDB_DIR}}. +assert(rpm.execute("systemd-hwdb", "update")) + +%transfiletriggerin -P 1000700 -p <lua> -- {{SYSTEMD_CATALOG_DIR}} +-- This script will automatically invoke journal catalog update if files +-- have been installed or updated in {{SYSTEMD_CATALOG_DIR}}. +assert(rpm.execute("journalctl", "--update-catalog")) + +%transfiletriggerin -P 1000700 -p <lua> -- {{BINFMT_DIR}} +-- This script will automatically apply binfmt rules if files have been +-- installed or updated in {{BINFMT_DIR}}. +if posix.access("/run/systemd/system") then + assert(rpm.execute("{{LIBEXECDIR}}/systemd-binfmt")) +end + +%transfiletriggerin -P 1000600 -p <lua> -- {{TMPFILES_DIR}} +-- This script will process files installed in {{TMPFILES_DIR}} to create +-- tmpfiles automatically.
The priority is set such that it will run +-- after the sysusers file trigger, but before any other triggers. +assert(rpm.execute("systemd-tmpfiles", "--create")) + +%transfiletriggerin -P 1000600 udev -p <lua> -- {{UDEV_RULES_DIR}} +-- This script will automatically update udev with new rules if files +-- have been installed or updated in {{UDEV_RULES_DIR}}. +if posix.access("/run/udev/control") then + assert(rpm.execute("udevadm", "control", "--reload")) +end + +%transfiletriggerin -P 1000500 -p <lua> -- {{SYSCTL_DIR}} +-- This script will automatically apply sysctl rules if files have been +-- installed or updated in {{SYSCTL_DIR}}. +if posix.access("/run/systemd/system") then + assert(rpm.execute("{{LIBEXECDIR}}/systemd-sysctl")) +end diff --git a/src/rpm/triggers.systemd.sh.in b/src/rpm/triggers.systemd.sh.in new file mode 100644 index 0000000..1b94f7d --- /dev/null +++ b/src/rpm/triggers.systemd.sh.in @@ -0,0 +1,87 @@ +# -*- Mode: rpm-spec; indent-tabs-mode: nil -*- */ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# Copyright 2018 Neal Gompa + +# The contents of this are an example to be copied into systemd.spec. +# +# Minimum rpm version supported: 4.14.0 + +%transfiletriggerin -P 900900 -- {{SYSTEM_DATA_UNIT_DIR}} /etc/systemd/system +# This script will run after any package is initially installed or +# upgraded. We care about the case where a package is initially +# installed, because other cases are covered by the *un scriptlets, +# so sometimes we will reload needlessly. +{{SYSTEMD_UPDATE_HELPER_PATH}} system-reload-restart || : + +%transfiletriggerin -P 900899 -- {{USER_DATA_UNIT_DIR}} /etc/systemd/user +{{SYSTEMD_UPDATE_HELPER_PATH}} user-reload-restart || : + +%transfiletriggerpostun -P 1000100 -- {{SYSTEM_DATA_UNIT_DIR}} /etc/systemd/system +# On removal, we need to run daemon-reload after any units have been +# removed.
+# On upgrade, we need to run daemon-reload after any new unit files +# have been installed, but before %postun scripts in packages get +# executed. +{{SYSTEMD_UPDATE_HELPER_PATH}} system-reload || : + +%transfiletriggerpostun -P 1000099 -- {{USER_DATA_UNIT_DIR}} /etc/systemd/user +# Execute daemon-reload in user managers. +{{SYSTEMD_UPDATE_HELPER_PATH}} user-reload || : + +%transfiletriggerpostun -P 10000 -- {{SYSTEM_DATA_UNIT_DIR}} /etc/systemd/system +# We restart remaining system services that should be restarted here. +{{SYSTEMD_UPDATE_HELPER_PATH}} system-restart || : + +%transfiletriggerpostun -P 9999 -- {{USER_DATA_UNIT_DIR}} /etc/systemd/user +# We restart remaining user services that should be restarted here. +{{SYSTEMD_UPDATE_HELPER_PATH}} user-restart || : + +%transfiletriggerin -P 1000700 -- {{SYSUSERS_DIR}} +# This script will process files installed in {{SYSUSERS_DIR}} to create +# specified users automatically. The priority is set such that it +# will run before the tmpfiles file trigger. +systemd-sysusers || : + +%transfiletriggerin -P 1000700 udev -- {{UDEV_HWDB_DIR}} +# This script will automatically invoke hwdb update if files have been +# installed or updated in {{UDEV_HWDB_DIR}}. +systemd-hwdb update || : + +%transfiletriggerin -P 1000700 -- {{SYSTEMD_CATALOG_DIR}} +# This script will automatically invoke journal catalog update if files +# have been installed or updated in {{SYSTEMD_CATALOG_DIR}}. +journalctl --update-catalog || : + +%transfiletriggerin -P 1000700 -- {{BINFMT_DIR}} +# This script will automatically apply binfmt rules if files have been +# installed or updated in {{BINFMT_DIR}}. +if test -d "/run/systemd/system"; then + # systemd-binfmt might fail if binfmt_misc kernel module is not loaded + # during install + {{LIBEXECDIR}}/systemd-binfmt || : +fi + +%transfiletriggerin -P 1000600 -- {{TMPFILES_DIR}} +# This script will process files installed in {{TMPFILES_DIR}} to create +# tmpfiles automatically. 
The priority is set such that it will run +# after the sysusers file trigger, but before any other triggers. +if test -d "/run/systemd/system"; then + systemd-tmpfiles --create || : +fi + +%transfiletriggerin -P 1000600 udev -- {{UDEV_RULES_DIR}} +# This script will automatically update udev with new rules if files +# have been installed or updated in {{UDEV_RULES_DIR}}. +if test -e /run/udev/control; then + udevadm control --reload || : +fi + +%transfiletriggerin -P 1000500 -- {{SYSCTL_DIR}} +# This script will automatically apply sysctl rules if files have been +# installed or updated in {{SYSCTL_DIR}}. +if test -d "/run/systemd/system"; then + {{LIBEXECDIR}}/systemd-sysctl || : +fi diff --git a/src/run-generator/meson.build b/src/run-generator/meson.build new file mode 100644 index 0000000..9a4e4ad --- /dev/null +++ b/src/run-generator/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + generator_template + { + 'name' : 'systemd-run-generator', + 'sources' : files('run-generator.c'), + }, +] diff --git a/src/run-generator/run-generator.c b/src/run-generator/run-generator.c new file mode 100644 index 0000000..5692b7a --- /dev/null +++ b/src/run-generator/run-generator.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "alloc-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "generator.h" +#include "glyph-util.h" +#include "mkdir.h" +#include "proc-cmdline.h" +#include "special.h" +#include "specifier.h" +#include "strv.h" + +static const char *arg_dest = NULL; +static char **arg_commands = NULL; +static char *arg_success_action = NULL; +static char *arg_failure_action = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_commands, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_success_action, freep); +STATIC_DESTRUCTOR_REGISTER(arg_failure_action, freep); + +static int parse(const char *key, const char *value, void *data) { + int r; + + assert(key); + + if
(proc_cmdline_key_streq(key, "systemd.run")) { /* same key matcher as the two options below */ + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = strv_extend(&arg_commands, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.run_success_action")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + return free_and_strdup_warn(&arg_success_action, value); + + } else if (proc_cmdline_key_streq(key, "systemd.run_failure_action")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + return free_and_strdup_warn(&arg_failure_action, value); + } + + return 0; +} + +static int generate(void) { + _cleanup_fclose_ FILE *f = NULL; + const char *p; + int r; + + if (strv_isempty(arg_commands) && !arg_success_action) + return 0; + + r = generator_open_unit_file(arg_dest, /* source = */ NULL, "kernel-command-line.service", &f); + if (r < 0) + return r; + + fputs("[Unit]\n" + "Description=Command from Kernel Command Line\n" + "Documentation=man:systemd-run-generator(8)\n" + "SourcePath=/proc/cmdline\n", f); + + if (!streq_ptr(arg_success_action, "none")) + fprintf(f, "SuccessAction=%s\n", + arg_success_action ?: "exit"); + + if (!streq_ptr(arg_failure_action, "none")) + fprintf(f, "FailureAction=%s\n", + arg_failure_action ?: "exit"); + + fputs("\n" + "[Service]\n" + "Type=oneshot\n" + "StandardOutput=journal+console\n", f); + + STRV_FOREACH(c, arg_commands) { + _cleanup_free_ char *a = NULL; + + a = specifier_escape(*c); + if (!a) + return log_oom(); + + fprintf(f, "ExecStart=%s\n", a); + } + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write unit kernel-command-line.service: %m"); + + /* Let's create a target we can link "default.target" to */ + p = strjoina(arg_dest, "/kernel-command-line.target"); + r = write_string_file( + p, + "# Automatically generated by systemd-run-generator\n\n" + "[Unit]\n" + "Description=Command from Kernel Command Line\n" + "Documentation=man:systemd-run-generator(8)\n" +
"SourcePath=/proc/cmdline\n" + "Requires=kernel-command-line.service\n" + "After=kernel-command-line.service\n", + WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_NOFOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to create unit file %s: %m", p); + + /* And now redirect default.target to our new target */ + p = strjoina(arg_dest, "/" SPECIAL_DEFAULT_TARGET); + if (symlink("kernel-command-line.target", p) < 0) + return log_error_errno(errno, "Failed to link unit file kernel-command-line.target %s %s: %m", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), p); + + return 0; +} + +static int run(const char *dest, const char *dest_early, const char *dest_late) { + int r; + + assert_se(arg_dest = dest); + + r = proc_cmdline_parse(parse, NULL, PROC_CMDLINE_RD_STRICT|PROC_CMDLINE_STRIP_RD_PREFIX); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + + return generate(); +} + +DEFINE_MAIN_GENERATOR_FUNCTION(run); diff --git a/src/run/meson.build b/src/run/meson.build new file mode 100644 index 0000000..597a25a --- /dev/null +++ b/src/run/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-run', + 'public' : true, + 'sources' : files('run.c'), + }, +] diff --git a/src/run/run.c b/src/run/run.c new file mode 100644 index 0000000..88eca0f --- /dev/null +++ b/src/run/run.c @@ -0,0 +1,1987 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-event.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-unit-util.h" +#include "bus-wait-for-jobs.h" +#include "calendarspec.h" +#include "env-util.h" +#include "escape.h" +#include "exit-status.h" +#include "fd-util.h" +#include "format-util.h" +#include "main-func.h" +#include "parse-argument.h" +#include "parse-util.h" +#include 
"path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "ptyfwd.h" +#include "signal-util.h" +#include "spawn-polkit-agent.h" +#include "strv.h" +#include "terminal-util.h" +#include "unit-def.h" +#include "unit-name.h" +#include "user-util.h" + +static bool arg_ask_password = true; +static bool arg_scope = false; +static bool arg_remain_after_exit = false; +static bool arg_no_block = false; +static bool arg_wait = false; +static const char *arg_unit = NULL; +static char *arg_description = NULL; +static const char *arg_slice = NULL; +static bool arg_slice_inherit = false; +static int arg_expand_environment = -1; +static bool arg_send_sighup = false; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static const char *arg_host = NULL; +static RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; +static const char *arg_service_type = NULL; +static const char *arg_exec_user = NULL; +static const char *arg_exec_group = NULL; +static int arg_nice = 0; +static bool arg_nice_set = false; +static char **arg_environment = NULL; +static char **arg_property = NULL; +static enum { + ARG_STDIO_NONE, /* The default, as it is for normal services, stdin connected to /dev/null, and stdout+stderr to the journal */ + ARG_STDIO_PTY, /* Interactive behaviour, requested by --pty: we allocate a pty and connect it to the TTY we are invoked from */ + ARG_STDIO_DIRECT, /* Directly pass our stdin/stdout/stderr to the activated service, useful for usage in shell pipelines, requested by --pipe */ + ARG_STDIO_AUTO, /* If --pipe and --pty are used together we use --pty when invoked on a TTY, and --pipe otherwise */ +} arg_stdio = ARG_STDIO_NONE; +static char **arg_path_property = NULL; +static char **arg_socket_property = NULL; +static char **arg_timer_property = NULL; +static bool arg_with_timer = false; +static bool arg_quiet = false; +static bool arg_aggressive_gc = false; +static char *arg_working_directory = NULL; +static bool arg_shell = false; +static char 
**arg_cmdline = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_description, freep); +STATIC_DESTRUCTOR_REGISTER(arg_environment, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_path_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_socket_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_timer_property, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_working_directory, freep); +STATIC_DESTRUCTOR_REGISTER(arg_cmdline, strv_freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-run", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND [ARGUMENTS...]\n" + "\n%sRun the specified command in a transient scope or service.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-ask-password Do not prompt for password\n" + " --user Run as user unit\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " --scope Run this as scope rather than service\n" + " -u --unit=UNIT Run under the specified unit name\n" + " -p --property=NAME=VALUE Set service or scope unit property\n" + " --description=TEXT Description for unit\n" + " --slice=SLICE Run in the specified slice\n" + " --slice-inherit Inherit the slice\n" + " --expand-environment=BOOL Control expansion of environment variables\n" + " --no-block Do not wait until operation finished\n" + " -r --remain-after-exit Leave service around until explicitly stopped\n" + " --wait Wait until service stopped again\n" + " --send-sighup Send SIGHUP when terminating\n" + " --service-type=TYPE Service type\n" + " --uid=USER Run as system user\n" + " --gid=GROUP Run as system group\n" + " --nice=NICE Nice level\n" + " --working-directory=PATH Set working directory\n" + " -d --same-dir Inherit working directory from caller\n" + " -E --setenv=NAME[=VALUE] Set environment variable\n" + " -t --pty Run service on 
pseudo TTY as STDIN/STDOUT/\n" + " STDERR\n" + " -P --pipe Pass STDIN/STDOUT/STDERR directly to service\n" + " -q --quiet Suppress information messages during runtime\n" + " -G --collect Unload unit after it ran, even when failed\n" + " -S --shell Invoke a $SHELL interactively\n\n" + "Path options:\n" + " --path-property=NAME=VALUE Set path unit property\n\n" + "Socket options:\n" + " --socket-property=NAME=VALUE Set socket unit property\n\n" + "Timer options:\n" + " --on-active=SECONDS Run after SECONDS delay\n" + " --on-boot=SECONDS Run SECONDS after machine was booted up\n" + " --on-startup=SECONDS Run SECONDS after systemd activation\n" + " --on-unit-active=SECONDS Run SECONDS after the last activation\n" + " --on-unit-inactive=SECONDS Run SECONDS after the last deactivation\n" + " --on-calendar=SPEC Realtime timer\n" + " --on-timezone-change Run when the timezone changes\n" + " --on-clock-change Run when the realtime clock jumps\n" + " --timer-property=NAME=VALUE Set timer unit property\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int add_timer_property(const char *name, const char *val) { + char *p; + + assert(name); + assert(val); + + p = strjoin(name, "=", val); + if (!p) + return log_oom(); + + if (strv_consume(&arg_timer_property, p) < 0) + return log_oom(); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_USER, + ARG_SYSTEM, + ARG_SCOPE, + ARG_DESCRIPTION, + ARG_SLICE, + ARG_SLICE_INHERIT, + ARG_EXPAND_ENVIRONMENT, + ARG_SEND_SIGHUP, + ARG_SERVICE_TYPE, + ARG_EXEC_USER, + ARG_EXEC_GROUP, + ARG_NICE, + ARG_ON_ACTIVE, + ARG_ON_BOOT, + ARG_ON_STARTUP, + ARG_ON_UNIT_ACTIVE, + ARG_ON_UNIT_INACTIVE, + ARG_ON_CALENDAR, + ARG_ON_TIMEZONE_CHANGE, + ARG_ON_CLOCK_CHANGE, + ARG_TIMER_PROPERTY, + ARG_PATH_PROPERTY, + ARG_SOCKET_PROPERTY, + ARG_NO_BLOCK, + ARG_NO_ASK_PASSWORD, + ARG_WAIT, + ARG_WORKING_DIRECTORY, + 
ARG_SHELL, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "user", no_argument, NULL, ARG_USER }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "scope", no_argument, NULL, ARG_SCOPE }, + { "unit", required_argument, NULL, 'u' }, + { "description", required_argument, NULL, ARG_DESCRIPTION }, + { "slice", required_argument, NULL, ARG_SLICE }, + { "slice-inherit", no_argument, NULL, ARG_SLICE_INHERIT }, + { "remain-after-exit", no_argument, NULL, 'r' }, + { "expand-environment", required_argument, NULL, ARG_EXPAND_ENVIRONMENT }, + { "send-sighup", no_argument, NULL, ARG_SEND_SIGHUP }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "service-type", required_argument, NULL, ARG_SERVICE_TYPE }, + { "wait", no_argument, NULL, ARG_WAIT }, + { "uid", required_argument, NULL, ARG_EXEC_USER }, + { "gid", required_argument, NULL, ARG_EXEC_GROUP }, + { "nice", required_argument, NULL, ARG_NICE }, + { "setenv", required_argument, NULL, 'E' }, + { "property", required_argument, NULL, 'p' }, + { "tty", no_argument, NULL, 't' }, /* deprecated alias */ + { "pty", no_argument, NULL, 't' }, + { "pipe", no_argument, NULL, 'P' }, + { "quiet", no_argument, NULL, 'q' }, + { "on-active", required_argument, NULL, ARG_ON_ACTIVE }, + { "on-boot", required_argument, NULL, ARG_ON_BOOT }, + { "on-startup", required_argument, NULL, ARG_ON_STARTUP }, + { "on-unit-active", required_argument, NULL, ARG_ON_UNIT_ACTIVE }, + { "on-unit-inactive", required_argument, NULL, ARG_ON_UNIT_INACTIVE }, + { "on-calendar", required_argument, NULL, ARG_ON_CALENDAR }, + { "on-timezone-change", no_argument, NULL, ARG_ON_TIMEZONE_CHANGE }, + { "on-clock-change", no_argument, NULL, ARG_ON_CLOCK_CHANGE }, + { "timer-property", required_argument, NULL, ARG_TIMER_PROPERTY }, + { "path-property", required_argument, NULL, ARG_PATH_PROPERTY }, + { "socket-property", 
required_argument, NULL, ARG_SOCKET_PROPERTY }, + { "no-block", no_argument, NULL, ARG_NO_BLOCK }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "collect", no_argument, NULL, 'G' }, + { "working-directory", required_argument, NULL, ARG_WORKING_DIRECTORY }, + { "same-dir", no_argument, NULL, 'd' }, + { "shell", no_argument, NULL, 'S' }, + {}, + }; + + bool with_trigger = false; + int r, c; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + while ((c = getopt_long(argc, argv, "+hrH:M:E:p:tPqGdSu:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case ARG_USER: + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + + case ARG_SYSTEM: + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + break; + + case ARG_SCOPE: + arg_scope = true; + break; + + case 'u': + arg_unit = optarg; + break; + + case ARG_DESCRIPTION: + r = free_and_strdup(&arg_description, optarg); + if (r < 0) + return log_oom(); /* do not fail silently */ + break; + + case ARG_SLICE: + arg_slice = optarg; + break; + + case ARG_SLICE_INHERIT: + arg_slice_inherit = true; + break; + + case ARG_EXPAND_ENVIRONMENT: { + bool b; + + r = parse_boolean_argument("--expand-environment=", optarg, &b); + if (r < 0) + return r; + + arg_expand_environment = b; + + break; + } + + case ARG_SEND_SIGHUP: + arg_send_sighup = true; + break; + + case 'r': + arg_remain_after_exit = true; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case ARG_SERVICE_TYPE: + arg_service_type = optarg; + break; + + case ARG_EXEC_USER: + arg_exec_user = optarg; + break; + + case ARG_EXEC_GROUP: + arg_exec_group = optarg; +
break; + + case ARG_NICE: + r = parse_nice(optarg, &arg_nice); + if (r < 0) + return log_error_errno(r, "Failed to parse nice value: %s", optarg); + + arg_nice_set = true; + break; + + case 'E': + r = strv_env_replace_strdup_passthrough(&arg_environment, optarg); + if (r < 0) + return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg); + + break; + + case 'p': + if (strv_extend(&arg_property, optarg) < 0) + return log_oom(); + + break; + + case 't': /* --pty */ + if (IN_SET(arg_stdio, ARG_STDIO_DIRECT, ARG_STDIO_AUTO)) /* if --pipe is already used, upgrade to auto mode */ + arg_stdio = ARG_STDIO_AUTO; + else + arg_stdio = ARG_STDIO_PTY; + break; + + case 'P': /* --pipe */ + if (IN_SET(arg_stdio, ARG_STDIO_PTY, ARG_STDIO_AUTO)) /* If --pty is already used, upgrade to auto mode */ + arg_stdio = ARG_STDIO_AUTO; + else + arg_stdio = ARG_STDIO_DIRECT; + break; + + case 'q': + arg_quiet = true; + break; + + case ARG_ON_ACTIVE: + r = add_timer_property("OnActiveSec", optarg); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_ON_BOOT: + r = add_timer_property("OnBootSec", optarg); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_ON_STARTUP: + r = add_timer_property("OnStartupSec", optarg); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_ON_UNIT_ACTIVE: + r = add_timer_property("OnUnitActiveSec", optarg); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_ON_UNIT_INACTIVE: + r = add_timer_property("OnUnitInactiveSec", optarg); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_ON_CALENDAR: { + _cleanup_(calendar_spec_freep) CalendarSpec *cs = NULL; + + r = calendar_spec_from_string(optarg, &cs); + if (r < 0) + return log_error_errno(r, "Failed to parse calendar event specification: %m"); + + /* Let's make sure the given calendar event is not in the past */ + r = calendar_spec_next_usec(cs, now(CLOCK_REALTIME), NULL); + if (r == 
-ENOENT) + /* The calendar event is in the past — let's warn about this, but install it + * anyway as is. The service manager will trigger the service right away. + * Moreover, the server side might have a different clock or timezone than we + * do, hence it should decide when or whether to run something. */ + log_warning("Specified calendar expression is in the past, proceeding anyway."); + else if (r < 0) + return log_error_errno(r, "Failed to calculate next time calendar expression elapses: %m"); + + r = add_timer_property("OnCalendar", optarg); + if (r < 0) + return r; + + arg_with_timer = true; + break; + } + + case ARG_ON_TIMEZONE_CHANGE: + r = add_timer_property("OnTimezoneChange", "yes"); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_ON_CLOCK_CHANGE: + r = add_timer_property("OnClockChange", "yes"); + if (r < 0) + return r; + + arg_with_timer = true; + break; + + case ARG_TIMER_PROPERTY: + + if (strv_extend(&arg_timer_property, optarg) < 0) + return log_oom(); + + arg_with_timer = arg_with_timer || + STARTSWITH_SET(optarg, + "OnActiveSec=", + "OnBootSec=", + "OnStartupSec=", + "OnUnitActiveSec=", + "OnUnitInactiveSec=", + "OnCalendar="); + break; + + case ARG_PATH_PROPERTY: + + if (strv_extend(&arg_path_property, optarg) < 0) + return log_oom(); + + break; + + case ARG_SOCKET_PROPERTY: + + if (strv_extend(&arg_socket_property, optarg) < 0) + return log_oom(); + + break; + + case ARG_NO_BLOCK: + arg_no_block = true; + break; + + case ARG_WAIT: + arg_wait = true; + break; + + case ARG_WORKING_DIRECTORY: + r = parse_path_argument(optarg, true, &arg_working_directory); + if (r < 0) + return r; + + break; + + case 'd': { + _cleanup_free_ char *p = NULL; + + r = safe_getcwd(&p); + if (r < 0) + return log_error_errno(r, "Failed to get current working directory: %m"); + + if (empty_or_root(p)) + arg_working_directory = mfree(arg_working_directory); + else + free_and_replace(arg_working_directory, p); + break; + } + + case 'G': + 
arg_aggressive_gc = true; + break; + + case 'S': + arg_shell = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + /* If we are talking to the per-user instance PolicyKit isn't going to help */ + if (arg_runtime_scope == RUNTIME_SCOPE_USER) + arg_ask_password = false; + + with_trigger = !!arg_path_property || !!arg_socket_property || arg_with_timer; + + /* currently, only single trigger (path, socket, timer) unit can be created simultaneously */ + if ((int) !!arg_path_property + (int) !!arg_socket_property + (int) arg_with_timer > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Only single trigger (path, socket, timer) unit can be created."); + + if (arg_shell) { + /* --shell implies --pty --pipe --same-dir --service-type=exec --wait --collect, unless otherwise + * specified. */ + + if (!arg_scope) { + if (arg_stdio == ARG_STDIO_NONE) + arg_stdio = ARG_STDIO_AUTO; + + if (!arg_working_directory) { + r = safe_getcwd(&arg_working_directory); + if (r < 0) + return log_error_errno(r, "Failed to get current working directory: %m"); + } + + if (!arg_service_type) { + arg_service_type = strdup("exec"); + if (!arg_service_type) + return log_oom(); + } + + arg_wait = true; + } + + arg_aggressive_gc = true; + } + + if (arg_stdio == ARG_STDIO_AUTO) + /* If both --pty and --pipe are specified we'll automatically pick --pty if we are connected fully + * to a TTY and pick direct fd passing otherwise. This way, we automatically adapt to usage in a shell + * pipeline, but we are neatly interactive with tty-level isolation otherwise. */ + arg_stdio = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) && isatty(STDERR_FILENO) ?
+ ARG_STDIO_PTY : + ARG_STDIO_DIRECT; + + if (argc > optind) { + char **l; + + if (arg_shell) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "If --shell is used, no command line is expected."); + + l = strv_copy(argv + optind); + if (!l) + return log_oom(); + + strv_free_and_replace(arg_cmdline, l); + + } else if (arg_shell) { + _cleanup_free_ char *s = NULL; + char **l; + + r = get_shell(&s); + if (r < 0) + return log_error_errno(r, "Failed to determine shell: %m"); + + l = strv_new(s); + if (!l) + return log_oom(); + + strv_free_and_replace(arg_cmdline, l); + + } else if (!arg_unit || !with_trigger) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Command line to execute required."); + + if (arg_runtime_scope == RUNTIME_SCOPE_USER && arg_transport == BUS_TRANSPORT_REMOTE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Execution in user context is not supported on remote systems."); + + if (arg_scope && arg_transport == BUS_TRANSPORT_REMOTE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Scope execution is not supported on remote systems."); + + if (arg_scope && (arg_remain_after_exit || arg_service_type)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--remain-after-exit and --service-type= are not supported in --scope mode."); + + if (arg_stdio != ARG_STDIO_NONE && (with_trigger || arg_scope)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--pty/--pipe is not compatible in timer or --scope mode."); + + if (arg_stdio != ARG_STDIO_NONE && arg_transport == BUS_TRANSPORT_REMOTE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--pty/--pipe is only supported when connecting to the local system or containers."); + + if (arg_stdio != ARG_STDIO_NONE && arg_no_block) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--pty/--pipe is not compatible with --no-block."); + + if (arg_scope && with_trigger) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Path, socket or timer options are not supported in --scope mode."); + + if 
(arg_timer_property && !arg_with_timer) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--timer-property= has no effect without any other timer options."); + + if (arg_wait) { + if (arg_no_block) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--wait may not be combined with --no-block."); + + if (with_trigger) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--wait may not be combined with path, socket or timer operations."); + + if (arg_scope) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--wait may not be combined with --scope."); + } + + return 1; +} + +/* Appends the properties shared by all transient unit types as "(sv)" entries to 'm': Description=, + * CollectMode= (only if arg_aggressive_gc is set), AddRef= (only if sd_bus_is_bus_client() returns > 0), + * followed by the caller-supplied 'properties' assignments for unit type 't'. Returns a negative + * errno-style value if appending fails, otherwise the result of bus_append_unit_property_assignment_many(). */ +static int transient_unit_set_properties(sd_bus_message *m, UnitType t, char **properties) { + int r; + + assert(m); + + r = sd_bus_message_append(m, "(sv)", "Description", "s", arg_description); + if (r < 0) + return bus_log_create_error(r); + + if (arg_aggressive_gc) { + r = sd_bus_message_append(m, "(sv)", "CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_is_bus_client(sd_bus_message_get_bus(m)); + if (r < 0) + return log_error_errno(r, "Can't determine if bus connection is direct or to broker: %m"); + if (r > 0) { + /* Pin the object at least as long as we are around. Note that AddRef (currently) only works + * if we talk via the bus though. 
*/ + r = sd_bus_message_append(m, "(sv)", "AddRef", "b", 1); + if (r < 0) + return bus_log_create_error(r); + } + + return bus_append_unit_property_assignment_many(m, t, properties); +} + +/* Appends Slice= to 'm'. The slice name starts out as the slice looked up for PID 0 with its ".slice" + * suffix cut off (only if arg_slice_inherit is set), is extended with arg_slice using "-" as separator + * (only if arg_slice is non-empty) and is finally mangled so that it carries a ".slice" suffix. Appends + * nothing and returns 0 if neither source yields a name. Every failure path logs before returning. */ +static int transient_cgroup_set_properties(sd_bus_message *m) { + _cleanup_free_ char *name = NULL; + _cleanup_free_ char *slice = NULL; + int r; + assert(m); + + if (arg_slice_inherit) { + char *end; + + switch (arg_runtime_scope) { + + case RUNTIME_SCOPE_USER: + r = cg_pid_get_user_slice(0, &name); + break; + + case RUNTIME_SCOPE_SYSTEM: + r = cg_pid_get_slice(0, &name); + break; + + default: + assert_not_reached(); + } + + if (r < 0) + return log_error_errno(r, "Failed to get PID slice: %m"); + + end = endswith(name, ".slice"); + if (!end) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), + "Slice name '%s' does not end in '.slice'.", name); + *end = 0; + } + + if (!isempty(arg_slice) && !strextend_with_separator(&name, "-", arg_slice)) + return log_oom(); + + if (!name) + return 0; + + r = unit_name_mangle_with_suffix(name, "as slice", + arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN, + ".slice", &slice); + if (r < 0) + /* Report the string that was actually mangled; arg_slice may be unset when only + * arg_slice_inherit is in effect. */ + return log_error_errno(r, "Failed to mangle name '%s': %m", name); + + r = sd_bus_message_append(m, "(sv)", "Slice", "s", slice); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +/* Appends SendSIGHUP= to 'm' if arg_send_sighup is set; otherwise appends nothing. */ +static int transient_kill_set_properties(sd_bus_message *m) { + int r; + + assert(m); + + if (arg_send_sighup) { + r = sd_bus_message_append(m, "(sv)", "SendSIGHUP", "b", arg_send_sighup); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int transient_service_set_properties(sd_bus_message *m, const char *pty_path) { + bool send_term = false; + int r; + + /* We disable environment expansion on the server side via ExecStartEx=:. + * ExecStartEx was added relatively recently (v243), and some bugs were fixed only later. + * So use that feature only if required. It will fail with older systemds. 
*/ + bool use_ex_prop = arg_expand_environment == 0; + + assert(m); + + r = transient_unit_set_properties(m, UNIT_SERVICE, arg_property); + if (r < 0) + return r; + + r = transient_kill_set_properties(m); + if (r < 0) + return r; + + r = transient_cgroup_set_properties(m); + if (r < 0) + return r; + + if (arg_remain_after_exit) { + r = sd_bus_message_append(m, "(sv)", "RemainAfterExit", "b", arg_remain_after_exit); + if (r < 0) + return bus_log_create_error(r); + } + + if (arg_service_type) { + r = sd_bus_message_append(m, "(sv)", "Type", "s", arg_service_type); + if (r < 0) + return bus_log_create_error(r); + } + + if (arg_exec_user) { + r = sd_bus_message_append(m, "(sv)", "User", "s", arg_exec_user); + if (r < 0) + return bus_log_create_error(r); + } + + if (arg_exec_group) { + r = sd_bus_message_append(m, "(sv)", "Group", "s", arg_exec_group); + if (r < 0) + return bus_log_create_error(r); + } + + if (arg_nice_set) { + r = sd_bus_message_append(m, "(sv)", "Nice", "i", arg_nice); + if (r < 0) + return bus_log_create_error(r); + } + + if (arg_working_directory) { + r = sd_bus_message_append(m, "(sv)", "WorkingDirectory", "s", arg_working_directory); + if (r < 0) + return bus_log_create_error(r); + } + + if (pty_path) { + r = sd_bus_message_append(m, + "(sv)(sv)(sv)(sv)", + "StandardInput", "s", "tty", + "StandardOutput", "s", "tty", + "StandardError", "s", "tty", + "TTYPath", "s", pty_path); + if (r < 0) + return bus_log_create_error(r); + + send_term = true; + + } else if (arg_stdio == ARG_STDIO_DIRECT) { + r = sd_bus_message_append(m, + "(sv)(sv)(sv)", + "StandardInputFileDescriptor", "h", STDIN_FILENO, + "StandardOutputFileDescriptor", "h", STDOUT_FILENO, + "StandardErrorFileDescriptor", "h", STDERR_FILENO); + if (r < 0) + return bus_log_create_error(r); + + send_term = isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) || isatty(STDERR_FILENO); + } + + if (send_term) { + const char *e; + + e = getenv("TERM"); + if (e) { + _cleanup_free_ char *n = NULL; + + n = 
strjoin("TERM=", e); + if (!n) + return log_oom(); + + r = sd_bus_message_append(m, + "(sv)", + "Environment", "as", 1, n); + if (r < 0) + return bus_log_create_error(r); + } + } + + if (!strv_isempty(arg_environment)) { + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", "Environment"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "as"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, arg_environment); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + /* Exec container */ + if (!strv_isempty(arg_cmdline)) { + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", + use_ex_prop ? "ExecStartEx" : "ExecStart"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', + use_ex_prop ? "a(sasas)" : "a(sasb)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', + use_ex_prop ? "(sasas)" : "(sasb)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'r', + use_ex_prop ? "sasas" : "sasb"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", arg_cmdline[0]); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, arg_cmdline); + if (r < 0) + return bus_log_create_error(r); + + if (use_ex_prop) + r = sd_bus_message_append_strv( + m, + STRV_MAKE(arg_expand_environment > 0 ? 
NULL : "no-env-expand")); + else + r = sd_bus_message_append(m, "b", false); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +/* Appends the scope properties to 'm': the common unit, kill and cgroup properties, then the calling + * process itself, via bus_append_scope_pidref() if 'allow_pidfd' is true and as a one-element "PIDs" + * array holding getpid_cached() otherwise. Returns 0 on success, negative on failure. */ +static int transient_scope_set_properties(sd_bus_message *m, bool allow_pidfd) { + int r; + + assert(m); + + r = transient_unit_set_properties(m, UNIT_SCOPE, arg_property); + if (r < 0) + return r; + + r = transient_kill_set_properties(m); + if (r < 0) + return r; + + r = transient_cgroup_set_properties(m); + if (r < 0) + return r; + + if (allow_pidfd) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + + r = pidref_set_self(&pidref); + if (r < 0) + return r; + + r = bus_append_scope_pidref(m, &pidref); + } else + r = sd_bus_message_append( + m, "(sv)", + "PIDs", "au", 1, getpid_cached()); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +/* Appends the timer properties to 'm': the common unit properties built from arg_timer_property, plus + * RemainAfterElapse=false. Returns 0 on success, negative on failure. */ +static int transient_timer_set_properties(sd_bus_message *m) { + int r; + + assert(m); + + r = transient_unit_set_properties(m, UNIT_TIMER, arg_timer_property); + if (r < 0) + return r; + + /* Automatically clean up our transient timers */ + r = sd_bus_message_append(m, "(sv)", "RemainAfterElapse", "b", false); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +/* Stores a newly allocated name for a transient unit of type 't' in *ret: "run-u<id>.<type>" derived + * from the unique name of 'bus', or "run-r<random id>.<type>" if no unique name can be obtained. + * Returns 0 on success, a negative errno-style value (already logged) on failure. */ +static int make_unit_name(sd_bus *bus, UnitType t, char **ret) { + const char *unique, *id; + char *p; + int r; + + assert(bus); + assert(t >= 0); + assert(t < _UNIT_TYPE_MAX); + + r = sd_bus_get_unique_name(bus, &unique); + if (r < 0) { + sd_id128_t rnd; + + /* We couldn't get the unique name, which is a pretty + * common case if we are connected to systemd + * directly. 
In that case, just pick a random uuid as + * name */ + + r = sd_id128_randomize(&rnd); + if (r < 0) + return log_error_errno(r, "Failed to generate random run unit name: %m"); + + if (asprintf(ret, "run-r" SD_ID128_FORMAT_STR ".%s", SD_ID128_FORMAT_VAL(rnd), unit_type_to_string(t)) < 0) + return log_oom(); + + return 0; + } + + /* We managed to get the unique name, then let's use that to name our transient units. */ + + id = startswith(unique, ":1."); /* let's strip the usual prefix */ + if (!id) + id = startswith(unique, ":"); /* the spec only requires things to start with a colon, hence + * let's add a generic fallback for that. */ + if (!id) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unique name %s has unexpected format.", + unique); + + p = strjoin("run-u", id, ".", unit_type_to_string(t)); + if (!p) + return log_oom(); + + *ret = p; + return 0; +} + +/* State handed as userdata to the bus and PTY forwarder callbacks while a started unit is watched. */ +typedef struct RunContext { + sd_bus *bus; + sd_event *event; + PTYForward *forward; + sd_bus_slot *match; + + /* Current state of the unit */ + char *active_state; + bool has_job; + + /* The exit data of the unit */ + uint64_t inactive_exit_usec; + uint64_t inactive_enter_usec; + char *result; + uint64_t cpu_usage_nsec; + uint64_t memory_peak; + uint64_t memory_swap_peak; + uint64_t ip_ingress_bytes; + uint64_t ip_egress_bytes; + uint64_t io_read_bytes; + uint64_t io_write_bytes; + uint32_t exit_code; + uint32_t exit_status; +} RunContext; + +/* Releases everything 'c' references (PTY forwarder, match slot, bus and event references, the two + * strings). The RunContext structure itself is not freed. */ +static void run_context_free(RunContext *c) { + assert(c); + + c->forward = pty_forward_free(c->forward); + c->match = sd_bus_slot_unref(c->match); + c->bus = sd_bus_unref(c->bus); + c->event = sd_event_unref(c->event); + + free(c->active_state); + free(c->result); +} + +/* Requests event loop exit with EXIT_SUCCESS once the unit is "inactive" or "failed" with no job pending + * (or no match slot is installed at all) and, if a PTY forwarder exists, pty_forward_drain() reports + * true for it. */ +static void run_context_check_done(RunContext *c) { + bool done; + + assert(c); + + if (c->match) + done = STRPTR_IN_SET(c->active_state, "inactive", "failed") && !c->has_job; + else + done = true; + + if (c->forward && done) /* If the service is gone, it's time to drain the output */ + done = 
pty_forward_drain(c->forward); + + if (done) + sd_event_exit(c->event, EXIT_SUCCESS); +} + +/* Property mapper for "Job": stores true in the bool 'userdata' points to, unless the (id, object path) + * pair read from 'm' is (0, "/"). Returns 0 on success, the read error otherwise. */ +static int map_job(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + bool *b = userdata; + const char *job; + uint32_t id; + int r; + + r = sd_bus_message_read(m, "(uo)", &id, &job); + if (r < 0) + return r; + + *b = id != 0 || !streq(job, "/"); + return 0; +} + +/* Reads the properties listed in 'map' from the object at 'path' into 'c' and then re-evaluates whether + * the run is finished. On failure, requests event loop exit with EXIT_FAILURE and returns the logged + * error. */ +static int run_context_update(RunContext *c, const char *path) { + + static const struct bus_properties_map map[] = { + { "ActiveState", "s", NULL, offsetof(RunContext, active_state) }, + { "InactiveExitTimestampMonotonic", "t", NULL, offsetof(RunContext, inactive_exit_usec) }, + { "InactiveEnterTimestampMonotonic", "t", NULL, offsetof(RunContext, inactive_enter_usec) }, + { "Result", "s", NULL, offsetof(RunContext, result) }, + { "ExecMainCode", "i", NULL, offsetof(RunContext, exit_code) }, + { "ExecMainStatus", "i", NULL, offsetof(RunContext, exit_status) }, + { "CPUUsageNSec", "t", NULL, offsetof(RunContext, cpu_usage_nsec) }, + { "MemoryPeak", "t", NULL, offsetof(RunContext, memory_peak) }, + { "MemorySwapPeak", "t", NULL, offsetof(RunContext, memory_swap_peak) }, + { "IPIngressBytes", "t", NULL, offsetof(RunContext, ip_ingress_bytes) }, + { "IPEgressBytes", "t", NULL, offsetof(RunContext, ip_egress_bytes) }, + { "IOReadBytes", "t", NULL, offsetof(RunContext, io_read_bytes) }, + { "IOWriteBytes", "t", NULL, offsetof(RunContext, io_write_bytes) }, + { "Job", "(uo)", map_job, offsetof(RunContext, has_job) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + r = bus_map_all_properties(c->bus, + "org.freedesktop.systemd1", + path, + map, + BUS_MAP_STRDUP, + &error, + NULL, + c); + if (r < 0) { + sd_event_exit(c->event, EXIT_FAILURE); + return log_error_errno(r, "Failed to query unit state: %s", bus_error_message(&error, r)); + } + + run_context_check_done(c); + return 0; +} + +static int on_properties_changed(sd_bus_message *m, 
void *userdata, sd_bus_error *error) { + RunContext *c = ASSERT_PTR(userdata); + + assert(m); + + return run_context_update(c, sd_bus_message_get_path(m)); +} + +static int pty_forward_handler(PTYForward *f, int rcode, void *userdata) { + RunContext *c = userdata; + + assert(f); + + if (rcode < 0) { + sd_event_exit(c->event, EXIT_FAILURE); + return log_error_errno(rcode, "Error on PTY forwarding logic: %m"); + } + + run_context_check_done(c); + return 0; +} + +static int make_transient_service_unit( + sd_bus *bus, + sd_bus_message **message, + const char *service, + const char *pty_path) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + assert(message); + assert(service); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_set_allow_interactive_authorization(m, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Name and mode */ + r = sd_bus_message_append(m, "ss", service, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = transient_service_set_properties(m, pty_path); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* Auxiliary units */ + r = sd_bus_message_append(m, "a(sa(sv))", 0); + if (r < 0) + return bus_log_create_error(r); + + *message = TAKE_PTR(m); + return 0; +} + +static int bus_call_with_hint( + sd_bus *bus, + sd_bus_message *message, + const char *name, + sd_bus_message **reply) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + r = sd_bus_call(bus, message, 0, &error, reply); + if (r < 0) { + log_error_errno(r, "Failed to start transient %s unit: %s", name, bus_error_message(&error, r)); + + if (arg_expand_environment == 0 && + 
sd_bus_error_has_names(&error, + SD_BUS_ERROR_UNKNOWN_PROPERTY, + SD_BUS_ERROR_PROPERTY_READ_ONLY)) + log_notice_errno(r, "Hint: --expand-environment=no is not supported by old systemd"); + } + + return r; +} + +static int acquire_invocation_id(sd_bus *bus, const char *unit, sd_id128_t *ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *object = NULL; + const void *p; + size_t l; + int r; + + assert(bus); + assert(ret); + + if (unit) { + object = unit_dbus_path_from_name(unit); + if (!object) + return log_oom(); + } + + r = sd_bus_get_property(bus, + "org.freedesktop.systemd1", + object ?: "/org/freedesktop/systemd1/unit/self", + "org.freedesktop.systemd1.Unit", + "InvocationID", + &error, + &reply, + "ay"); + if (r < 0) + return log_error_errno(r, "Failed to request invocation ID for unit: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read_array(reply, 'y', &p, &l); + if (r < 0) + return bus_log_parse_error(r); + + if (l == 0) { + *ret = SD_ID128_NULL; + return 0; /* no uuid set */ + } + + if (l != sizeof(sd_id128_t)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid UUID size, %zu != %zu.", l, sizeof(sd_id128_t)); + + memcpy(ret, p, l); + return !sd_id128_is_null(*ret); +} + +static int start_transient_service(sd_bus *bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_free_ char *service = NULL, *pty_path = NULL; + _cleanup_close_ int master = -EBADF; + int r; + + assert(bus); + + if (arg_stdio == ARG_STDIO_PTY) { + + if (arg_transport == BUS_TRANSPORT_LOCAL) { + master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (master < 0) + return log_error_errno(errno, "Failed to acquire pseudo tty: %m"); + + r = ptsname_malloc(master, &pty_path); + if 
(r < 0) + return log_error_errno(r, "Failed to determine tty name: %m"); + + if (unlockpt(master) < 0) + return log_error_errno(errno, "Failed to unlock tty: %m"); + + } else if (arg_transport == BUS_TRANSPORT_MACHINE) { + _cleanup_(sd_bus_unrefp) sd_bus *system_bus = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *pty_reply = NULL; + const char *s; + + r = sd_bus_default_system(&system_bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_call_method(system_bus, + bus_machine_mgr, + "OpenMachinePTY", + &error, + &pty_reply, + "s", arg_host); + if (r < 0) + return log_error_errno(r, "Failed to get machine PTY: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(pty_reply, "hs", &master, &s); + if (r < 0) + return bus_log_parse_error(r); + + master = fcntl(master, F_DUPFD_CLOEXEC, 3); + if (master < 0) + return log_error_errno(errno, "Failed to duplicate master fd: %m"); + + pty_path = strdup(s); + if (!pty_path) + return log_oom(); + } else + assert_not_reached(); + } + + /* Optionally, wait for the start job to complete. If we are supposed to read the service's stdin + * lets skip this however, because we should start that already when the start job is running, and + * there's little point in waiting for the start job to complete in that case anyway, as we'll wait + * for EOF anyway, which is going to be much later. */ + if (!arg_no_block && arg_stdio == ARG_STDIO_NONE) { + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + if (arg_unit) { + r = unit_name_mangle_with_suffix(arg_unit, "as unit", + arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, + ".service", &service); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + } else { + r = make_unit_name(bus, UNIT_SERVICE, &service); + if (r < 0) + return r; + } + + r = make_transient_service_unit(bus, &m, service, pty_path); + if (r < 0) + return r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_with_hint(bus, m, "service", &reply); + if (r < 0) + return r; + + if (w) { + const char *object; + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, + object, + arg_quiet, + arg_runtime_scope == RUNTIME_SCOPE_USER ? STRV_MAKE_CONST("--user") : NULL); + if (r < 0) + return r; + } + + if (!arg_quiet) { + sd_id128_t invocation_id; + + r = acquire_invocation_id(bus, service, &invocation_id); + if (r < 0) + return r; + if (r == 0) /* No invocation UUID set */ + log_info("Running as unit: %s", service); + else + log_info("Running as unit: %s; invocation ID: " SD_ID128_FORMAT_STR, service, SD_ID128_FORMAT_VAL(invocation_id)); + } + + if (arg_wait || arg_stdio != ARG_STDIO_NONE) { + _cleanup_(run_context_free) RunContext c = { + .cpu_usage_nsec = NSEC_INFINITY, + .memory_peak = UINT64_MAX, + .memory_swap_peak = UINT64_MAX, + .ip_ingress_bytes = UINT64_MAX, + .ip_egress_bytes = UINT64_MAX, + .io_read_bytes = UINT64_MAX, + .io_write_bytes = UINT64_MAX, + .inactive_exit_usec = USEC_INFINITY, + .inactive_enter_usec = USEC_INFINITY, + }; + _cleanup_free_ char *path = NULL; + + c.bus = sd_bus_ref(bus); + + r = sd_event_default(&c.event); + if (r < 0) + return log_error_errno(r, "Failed to get event loop: %m"); + + if (master >= 0) { + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGWINCH, SIGTERM, SIGINT, -1) >= 0); + (void) sd_event_add_signal(c.event, NULL, SIGINT, NULL, NULL); + (void) sd_event_add_signal(c.event, NULL, SIGTERM, NULL, NULL); + + if (!arg_quiet) + log_info("Press ^] three times within 1s to 
disconnect TTY."); + + r = pty_forward_new(c.event, master, PTY_FORWARD_IGNORE_INITIAL_VHANGUP, &c.forward); + if (r < 0) + return log_error_errno(r, "Failed to create PTY forwarder: %m"); + + pty_forward_set_handler(c.forward, pty_forward_handler, &c); + + /* Make sure to process any TTY events before we process bus events */ + (void) pty_forward_set_priority(c.forward, SD_EVENT_PRIORITY_IMPORTANT); + } + + path = unit_dbus_path_from_name(service); + if (!path) + return log_oom(); + + r = sd_bus_match_signal_async( + bus, + &c.match, + "org.freedesktop.systemd1", + path, + "org.freedesktop.DBus.Properties", + "PropertiesChanged", + on_properties_changed, NULL, &c); + if (r < 0) + return log_error_errno(r, "Failed to request properties changed signal match: %m"); + + r = sd_bus_attach_event(bus, c.event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + r = run_context_update(&c, path); + if (r < 0) + return r; + + r = sd_event_loop(c.event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + if (c.forward) { + char last_char = 0; + + r = pty_forward_get_last_char(c.forward, &last_char); + if (r >= 0 && !arg_quiet && last_char != '\n') + fputc('\n', stdout); + } + + if (arg_wait && !arg_quiet) { + + /* Explicitly destroy the PTY forwarder, so that the PTY device is usable again, with its + * original settings (i.e. proper line breaks), so that we can show the summary in a pretty + * way. 
*/ + c.forward = pty_forward_free(c.forward); + + if (!isempty(c.result)) + log_info("Finished with result: %s", strna(c.result)); + + if (c.exit_code == CLD_EXITED) + log_info("Main processes terminated with: code=%s/status=%u", + sigchld_code_to_string(c.exit_code), c.exit_status); + else if (c.exit_code > 0) + log_info("Main processes terminated with: code=%s/status=%s", + sigchld_code_to_string(c.exit_code), signal_to_string(c.exit_status)); + + if (timestamp_is_set(c.inactive_enter_usec) && + timestamp_is_set(c.inactive_exit_usec) && + c.inactive_enter_usec > c.inactive_exit_usec) + log_info("Service runtime: %s", + FORMAT_TIMESPAN(c.inactive_enter_usec - c.inactive_exit_usec, USEC_PER_MSEC)); + + if (c.cpu_usage_nsec != NSEC_INFINITY) + log_info("CPU time consumed: %s", + FORMAT_TIMESPAN(DIV_ROUND_UP(c.cpu_usage_nsec, NSEC_PER_USEC), USEC_PER_MSEC)); + + if (c.memory_peak != UINT64_MAX) + log_info("Memory peak: %s", FORMAT_BYTES(c.memory_peak)); + + if (c.memory_swap_peak != UINT64_MAX) + log_info("Memory swap peak: %s", FORMAT_BYTES(c.memory_swap_peak)); + + if (c.ip_ingress_bytes != UINT64_MAX) + log_info("IP traffic received: %s", FORMAT_BYTES(c.ip_ingress_bytes)); + + if (c.ip_egress_bytes != UINT64_MAX) + log_info("IP traffic sent: %s", FORMAT_BYTES(c.ip_egress_bytes)); + + if (c.io_read_bytes != UINT64_MAX) + log_info("IO bytes read: %s", FORMAT_BYTES(c.io_read_bytes)); + + if (c.io_write_bytes != UINT64_MAX) + log_info("IO bytes written: %s", FORMAT_BYTES(c.io_write_bytes)); + } + + /* Try to propagate the service's return value. But if the service defines + * e.g. SuccessExitStatus, honour this, and return 0 to mean "success". 
*/ + if (streq_ptr(c.result, "success")) + return EXIT_SUCCESS; + if (streq_ptr(c.result, "exit-code") && c.exit_status > 0) + return c.exit_status; + if (streq_ptr(c.result, "signal")) + return EXIT_EXCEPTION; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} + +static int start_transient_scope(sd_bus *bus) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_strv_free_ char **env = NULL, **user_env = NULL; + _cleanup_free_ char *scope = NULL; + const char *object = NULL; + sd_id128_t invocation_id; + bool allow_pidfd = true; + int r; + + assert(bus); + assert(!strv_isempty(arg_cmdline)); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + + if (arg_unit) { + r = unit_name_mangle_with_suffix(arg_unit, "as unit", + arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN, + ".scope", &scope); + if (r < 0) + return log_error_errno(r, "Failed to mangle scope name: %m"); + } else { + r = make_unit_name(bus, UNIT_SCOPE, &scope); + if (r < 0) + return r; + } + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + for (;;) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_set_allow_interactive_authorization(m, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Name and Mode */ + r = sd_bus_message_append(m, "ss", scope, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = transient_scope_set_properties(m, allow_pidfd); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* 
Auxiliary units */ + r = sd_bus_message_append(m, "a(sa(sv))", 0); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) { + if (sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY) && allow_pidfd) { + log_debug("Retrying with classic PIDs."); + allow_pidfd = false; + continue; + } + + return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r)); + } + + break; + } + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, arg_quiet, arg_runtime_scope == RUNTIME_SCOPE_USER ? STRV_MAKE_CONST("--user") : NULL); + if (r < 0) + return r; + + r = acquire_invocation_id(bus, NULL, &invocation_id); + if (r < 0) + return r; + if (r == 0) + log_debug("No invocation ID set."); + else { + if (strv_extendf(&user_env, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(invocation_id)) < 0) + return log_oom(); + } + + if (arg_nice_set) { + if (setpriority(PRIO_PROCESS, 0, arg_nice) < 0) + return log_error_errno(errno, "Failed to set nice level: %m"); + } + + if (arg_exec_group) { + gid_t gid; + + r = get_group_creds(&arg_exec_group, &gid, 0); + if (r < 0) + return log_error_errno(r, "Failed to resolve group %s: %m", arg_exec_group); + + if (setresgid(gid, gid, gid) < 0) + return log_error_errno(errno, "Failed to change GID to " GID_FMT ": %m", gid); + } + + if (arg_exec_user) { + const char *home, *shell; + uid_t uid; + gid_t gid; + + r = get_user_creds(&arg_exec_user, &uid, &gid, &home, &shell, USER_CREDS_CLEAN|USER_CREDS_PREFER_NSS); + if (r < 0) + return log_error_errno(r, "Failed to resolve user %s: %m", arg_exec_user); + + if (home) { + r = strv_extendf(&user_env, "HOME=%s", home); + if (r < 0) + return log_oom(); + } + + if (shell) { + r = strv_extendf(&user_env, "SHELL=%s", shell); + if (r < 0) + return log_oom(); + } + + r = strv_extendf(&user_env, 
"USER=%s", arg_exec_user); + if (r < 0) + return log_oom(); + + r = strv_extendf(&user_env, "LOGNAME=%s", arg_exec_user); + if (r < 0) + return log_oom(); + + if (!arg_exec_group) { + if (setresgid(gid, gid, gid) < 0) + return log_error_errno(errno, "Failed to change GID to " GID_FMT ": %m", gid); + } + + if (setresuid(uid, uid, uid) < 0) + return log_error_errno(errno, "Failed to change UID to " UID_FMT ": %m", uid); + } + + if (arg_working_directory && chdir(arg_working_directory) < 0) + return log_error_errno(errno, "Failed to change directory to '%s': %m", arg_working_directory); + + env = strv_env_merge(environ, user_env, arg_environment); + if (!env) + return log_oom(); + + if (!arg_quiet) { + if (sd_id128_is_null(invocation_id)) + log_info("Running as unit: %s", scope); + else + log_info("Running as unit: %s; invocation ID: " SD_ID128_FORMAT_STR, scope, SD_ID128_FORMAT_VAL(invocation_id)); + } + + if (arg_expand_environment > 0) { + _cleanup_strv_free_ char **expanded_cmdline = NULL, **unset_variables = NULL, **bad_variables = NULL; + + r = replace_env_argv(arg_cmdline, env, &expanded_cmdline, &unset_variables, &bad_variables); + if (r < 0) + return log_error_errno(r, "Failed to expand environment variables: %m"); + + free_and_replace(arg_cmdline, expanded_cmdline); + + if (!strv_isempty(unset_variables)) { + _cleanup_free_ char *ju = strv_join(unset_variables, ", "); + log_warning("Referenced but unset environment variable evaluates to an empty string: %s", strna(ju)); + } + + if (!strv_isempty(bad_variables)) { + _cleanup_free_ char *jb = strv_join(bad_variables, ", "); + log_warning("Invalid environment variable name evaluates to an empty string: %s", strna(jb)); + } + } + + execvpe(arg_cmdline[0], arg_cmdline, env); + + return log_error_errno(errno, "Failed to execute: %m"); +} + +static int make_transient_trigger_unit( + sd_bus *bus, + sd_bus_message **message, + const char *suffix, + const char *trigger, + const char *service) { + + 
_cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + assert(message); + assert(suffix); + assert(trigger); + assert(service); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_set_allow_interactive_authorization(m, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Name and Mode */ + r = sd_bus_message_append(m, "ss", trigger, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + if (streq(suffix, ".path")) + r = transient_unit_set_properties(m, UNIT_PATH, arg_path_property); + else if (streq(suffix, ".socket")) + r = transient_unit_set_properties(m, UNIT_SOCKET, arg_socket_property); + else if (streq(suffix, ".timer")) + r = transient_timer_set_properties(m); + else + assert_not_reached(); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sa(sv))"); + if (r < 0) + return bus_log_create_error(r); + + if (!strv_isempty(arg_cmdline)) { + r = sd_bus_message_open_container(m, 'r', "sa(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", service); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = transient_service_set_properties(m, NULL); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + *message = TAKE_PTR(m); + return 0; +} + +static int start_transient_trigger(sd_bus *bus, const 
char *suffix) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_free_ char *trigger = NULL, *service = NULL; + const char *object = NULL; + int r; + + assert(bus); + assert(suffix); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + + if (arg_unit) { + switch (unit_name_to_type(arg_unit)) { + + case UNIT_SERVICE: + service = strdup(arg_unit); + if (!service) + return log_oom(); + + r = unit_name_change_suffix(service, suffix, &trigger); + if (r < 0) + return log_error_errno(r, "Failed to change unit suffix: %m"); + break; + + case UNIT_TIMER: + trigger = strdup(arg_unit); + if (!trigger) + return log_oom(); + + r = unit_name_change_suffix(trigger, ".service", &service); + if (r < 0) + return log_error_errno(r, "Failed to change unit suffix: %m"); + break; + + default: + r = unit_name_mangle_with_suffix(arg_unit, "as unit", + arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN, + ".service", &service); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + r = unit_name_mangle_with_suffix(arg_unit, "as trigger", + arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, + suffix, &trigger); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + break; + } + } else { + r = make_unit_name(bus, UNIT_SERVICE, &service); + if (r < 0) + return r; + + r = unit_name_change_suffix(service, suffix, &trigger); + if (r < 0) + return log_error_errno(r, "Failed to change unit suffix: %m"); + } + + r = make_transient_trigger_unit(bus, &m, suffix, trigger, service); + if (r < 0) + return r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_with_hint(bus, m, suffix + 1, &reply); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, arg_quiet, arg_runtime_scope == RUNTIME_SCOPE_USER ? STRV_MAKE_CONST("--user") : NULL); + if (r < 0) + return r; + + if (!arg_quiet) { + log_info("Running %s as unit: %s", suffix + 1, trigger); + if (!strv_isempty(arg_cmdline)) + log_info("Will run service as unit: %s", service); + } + + return EXIT_SUCCESS; +} + +static bool shall_make_executable_absolute(void) { + if (strv_isempty(arg_cmdline)) + return false; + if (arg_transport != BUS_TRANSPORT_LOCAL) + return false; + + FOREACH_STRING(f, "RootDirectory=", "RootImage=", "ExecSearchPath=", "MountImages=", "ExtensionImages=") + if (strv_find_startswith(arg_property, f)) + return false; + + return true; +} + +static int run(int argc, char* argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (shall_make_executable_absolute()) { + /* Patch in an absolute path to fail early for user convenience, but only when we can do it + * (i.e. we will be running from the same file system). This also uses the user's $PATH, + * while we use a fixed search path in the manager. 
*/ + + _cleanup_free_ char *command = NULL; + r = find_executable(arg_cmdline[0], &command); + if (r < 0) + return log_error_errno(r, "Failed to find executable %s: %m", arg_cmdline[0]); + + free_and_replace(arg_cmdline[0], command); + } + + if (!arg_description) { + char *t; + + if (strv_isempty(arg_cmdline)) + t = strdup(arg_unit); + else + t = quote_command_line(arg_cmdline, SHELL_ESCAPE_EMPTY); + if (!t) + return log_oom(); + + free_and_replace(arg_description, t); + } + + /* For backward compatibility reasons env var expansion is disabled by default for scopes, and + * enabled by default for everything else. Try to detect it and print a warning, so that we can + * change it in the future and harmonize it. */ + if (arg_expand_environment < 0) { + arg_expand_environment = !arg_scope; + + if (!arg_quiet && arg_scope && strchr(arg_description, '$')) + log_warning("Scope command line contains environment variable, which is not expanded" + " by default for now, but will be expanded by default in the future." 
+ " Use --expand-environment=yes/no to explicitly control it as needed."); + } + + /* If --wait is used connect via the bus, unconditionally, as ref/unref is not supported via the + * limited direct connection */ + if (arg_wait || + arg_stdio != ARG_STDIO_NONE || + (arg_runtime_scope == RUNTIME_SCOPE_USER && arg_transport != BUS_TRANSPORT_LOCAL)) + r = bus_connect_transport(arg_transport, arg_host, arg_runtime_scope, &bus); + else + r = bus_connect_transport_systemd(arg_transport, arg_host, arg_runtime_scope, &bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + if (arg_scope) + return start_transient_scope(bus); + if (arg_path_property) + return start_transient_trigger(bus, ".path"); + if (arg_socket_property) + return start_transient_trigger(bus, ".socket"); + if (arg_with_timer) + return start_transient_trigger(bus, ".timer"); + return start_transient_service(bus); +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/shared/acl-util.c b/src/shared/acl-util.c new file mode 100644 index 0000000..7bfe025 --- /dev/null +++ b/src/shared/acl-util.c @@ -0,0 +1,652 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "acl-util.h" +#include "alloc-util.h" +#include "errno-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +#if HAVE_ACL + +int acl_find_uid(acl_t acl, uid_t uid, acl_entry_t *ret_entry) { + acl_entry_t i; + int r; + + assert(acl); + assert(uid_is_valid(uid)); + assert(ret_entry); + + for (r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { + + acl_tag_t tag; + uid_t *u; + bool b; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (tag != ACL_USER) + continue; + + u = acl_get_qualifier(i); + if (!u) + return -errno; + + b = *u == uid; + acl_free(u); + + if (b) { + *ret_entry = i; + return 1; + } + } + if (r < 0) + return -errno; + + *ret_entry = NULL; + return 0; +} + +int 
calc_acl_mask_if_needed(acl_t *acl_p) { + acl_entry_t i; + int r; + bool need = false; + + assert(acl_p); + + for (r = acl_get_entry(*acl_p, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(*acl_p, ACL_NEXT_ENTRY, &i)) { + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (tag == ACL_MASK) + return 0; + + if (IN_SET(tag, ACL_USER, ACL_GROUP)) + need = true; + } + if (r < 0) + return -errno; + + if (need && acl_calc_mask(acl_p) < 0) + return -errno; + + return need; +} + +int add_base_acls_if_needed(acl_t *acl_p, const char *path) { + acl_entry_t i; + int r; + bool have_user_obj = false, have_group_obj = false, have_other = false; + struct stat st; + _cleanup_(acl_freep) acl_t basic = NULL; + + assert(acl_p); + assert(path); + + for (r = acl_get_entry(*acl_p, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(*acl_p, ACL_NEXT_ENTRY, &i)) { + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (tag == ACL_USER_OBJ) + have_user_obj = true; + else if (tag == ACL_GROUP_OBJ) + have_group_obj = true; + else if (tag == ACL_OTHER) + have_other = true; + if (have_user_obj && have_group_obj && have_other) + return 0; + } + if (r < 0) + return -errno; + + r = stat(path, &st); + if (r < 0) + return -errno; + + basic = acl_from_mode(st.st_mode); + if (!basic) + return -errno; + + for (r = acl_get_entry(basic, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(basic, ACL_NEXT_ENTRY, &i)) { + acl_tag_t tag; + acl_entry_t dst; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if ((tag == ACL_USER_OBJ && have_user_obj) || + (tag == ACL_GROUP_OBJ && have_group_obj) || + (tag == ACL_OTHER && have_other)) + continue; + + r = acl_create_entry(acl_p, &dst); + if (r < 0) + return -errno; + + r = acl_copy_entry(dst, i); + if (r < 0) + return -errno; + } + if (r < 0) + return -errno; + return 0; +} + +int acl_search_groups(const char *path, char ***ret_groups) { + _cleanup_strv_free_ char **g = NULL; + _cleanup_(acl_freep) 
acl_t acl = NULL; + bool ret = false; + acl_entry_t entry; + int r; + + assert(path); + + acl = acl_get_file(path, ACL_TYPE_DEFAULT); + if (!acl) + return -errno; + + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &entry); + for (;;) { + _cleanup_(acl_free_gid_tpp) gid_t *gid = NULL; + acl_tag_t tag; + + if (r < 0) + return -errno; + if (r == 0) + break; + + if (acl_get_tag_type(entry, &tag) < 0) + return -errno; + + if (tag != ACL_GROUP) + goto next; + + gid = acl_get_qualifier(entry); + if (!gid) + return -errno; + + if (in_gid(*gid) > 0) { + if (!ret_groups) + return true; + + ret = true; + } + + if (ret_groups) { + char *name; + + name = gid_to_name(*gid); + if (!name) + return -ENOMEM; + + r = strv_consume(&g, name); + if (r < 0) + return r; + } + + next: + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &entry); + } + + if (ret_groups) + *ret_groups = TAKE_PTR(g); + + return ret; +} + +int parse_acl( + const char *text, + acl_t *ret_acl_access, + acl_t *ret_acl_access_exec, /* extra rules to apply to inodes subject to uppercase X handling */ + acl_t *ret_acl_default, + bool want_mask) { + + _cleanup_strv_free_ char **a = NULL, **e = NULL, **d = NULL, **split = NULL; + _cleanup_(acl_freep) acl_t a_acl = NULL, e_acl = NULL, d_acl = NULL; + int r; + + assert(text); + assert(ret_acl_access); + assert(ret_acl_access_exec); + assert(ret_acl_default); + + split = strv_split(text, ","); + if (!split) + return -ENOMEM; + + STRV_FOREACH(entry, split) { + _cleanup_strv_free_ char **entry_split = NULL; + _cleanup_free_ char *entry_join = NULL; + int n; + + n = strv_split_full(&entry_split, *entry, ":", EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_RETAIN_ESCAPE); + if (n < 0) + return n; + + if (n < 3 || n > 4) + return -EINVAL; + + string_replace_char(entry_split[n-1], 'X', 'x'); + + if (n == 4) { + if (!STR_IN_SET(entry_split[0], "default", "d")) + return -EINVAL; + + entry_join = strv_join(entry_split + 1, ":"); + if (!entry_join) + return -ENOMEM; + + r = strv_consume(&d, 
TAKE_PTR(entry_join)); + } else { /* n == 3 */ + entry_join = strv_join(entry_split, ":"); + if (!entry_join) + return -ENOMEM; + + if (!streq(*entry, entry_join)) + r = strv_consume(&e, TAKE_PTR(entry_join)); + else + r = strv_consume(&a, TAKE_PTR(entry_join)); + } + if (r < 0) + return r; + } + + if (!strv_isempty(a)) { + _cleanup_free_ char *join = NULL; + + join = strv_join(a, ","); + if (!join) + return -ENOMEM; + + a_acl = acl_from_text(join); + if (!a_acl) + return -errno; + + if (want_mask) { + r = calc_acl_mask_if_needed(&a_acl); + if (r < 0) + return r; + } + } + + if (!strv_isempty(e)) { + _cleanup_free_ char *join = NULL; + + join = strv_join(e, ","); + if (!join) + return -ENOMEM; + + e_acl = acl_from_text(join); + if (!e_acl) + return -errno; + + /* The mask must be calculated after deciding whether the execute bit should be set. */ + } + + if (!strv_isempty(d)) { + _cleanup_free_ char *join = NULL; + + join = strv_join(d, ","); + if (!join) + return -ENOMEM; + + d_acl = acl_from_text(join); + if (!d_acl) + return -errno; + + if (want_mask) { + r = calc_acl_mask_if_needed(&d_acl); + if (r < 0) + return r; + } + } + + *ret_acl_access = TAKE_PTR(a_acl); + *ret_acl_access_exec = TAKE_PTR(e_acl); + *ret_acl_default = TAKE_PTR(d_acl); + + return 0; +} + +static int acl_entry_equal(acl_entry_t a, acl_entry_t b) { + acl_tag_t tag_a, tag_b; + + if (acl_get_tag_type(a, &tag_a) < 0) + return -errno; + + if (acl_get_tag_type(b, &tag_b) < 0) + return -errno; + + if (tag_a != tag_b) + return false; + + switch (tag_a) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + /* can have only one of those */ + return true; + case ACL_USER: { + _cleanup_(acl_free_uid_tpp) uid_t *uid_a = NULL, *uid_b = NULL; + + uid_a = acl_get_qualifier(a); + if (!uid_a) + return -errno; + + uid_b = acl_get_qualifier(b); + if (!uid_b) + return -errno; + + return *uid_a == *uid_b; + } + case ACL_GROUP: { + _cleanup_(acl_free_gid_tpp) gid_t *gid_a = NULL, *gid_b 
= NULL; + + gid_a = acl_get_qualifier(a); + if (!gid_a) + return -errno; + + gid_b = acl_get_qualifier(b); + if (!gid_b) + return -errno; + + return *gid_a == *gid_b; + } + default: + assert_not_reached(); + } +} + +static int find_acl_entry(acl_t acl, acl_entry_t entry, acl_entry_t *ret) { + acl_entry_t i; + int r; + + for (r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { + + r = acl_entry_equal(i, entry); + if (r < 0) + return r; + if (r > 0) { + if (ret) + *ret = i; + return 0; + } + } + if (r < 0) + return -errno; + + return -ENOENT; +} + +int acls_for_file(const char *path, acl_type_t type, acl_t acl, acl_t *ret) { + _cleanup_(acl_freep) acl_t applied = NULL; + acl_entry_t i; + int r; + + assert(path); + + applied = acl_get_file(path, type); + if (!applied) + return -errno; + + for (r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { + + acl_entry_t j; + + r = find_acl_entry(applied, i, &j); + if (r == -ENOENT) { + if (acl_create_entry(&applied, &j) < 0) + return -errno; + } else if (r < 0) + return r; + + if (acl_copy_entry(j, i) < 0) + return -errno; + } + if (r < 0) + return -errno; + + if (ret) + *ret = TAKE_PTR(applied); + + return 0; +} + +/* POSIX says that ACL_{READ,WRITE,EXECUTE} don't have to be bitmasks. But that is a natural thing to do and + * all extant implementations do it. Let's make sure that we fail verbosely in the (imho unlikely) scenario + * that we get a new implementation that does not satisfy this. 
*/ +assert_cc(!(ACL_READ & ACL_WRITE)); +assert_cc(!(ACL_WRITE & ACL_EXECUTE)); +assert_cc(!(ACL_EXECUTE & ACL_READ)); +assert_cc((unsigned) ACL_READ == ACL_READ); +assert_cc((unsigned) ACL_WRITE == ACL_WRITE); +assert_cc((unsigned) ACL_EXECUTE == ACL_EXECUTE); + +int fd_add_uid_acl_permission( + int fd, + uid_t uid, + unsigned mask) { + + _cleanup_(acl_freep) acl_t acl = NULL; + acl_permset_t permset; + acl_entry_t entry; + int r; + + /* Adds an ACL entry for the specified file to allow the indicated access to the specified + * user. Operates purely incrementally. */ + + assert(fd >= 0); + assert(uid_is_valid(uid)); + + acl = acl_get_fd(fd); + if (!acl) + return -errno; + + r = acl_find_uid(acl, uid, &entry); + if (r <= 0) { + if (acl_create_entry(&acl, &entry) < 0 || + acl_set_tag_type(entry, ACL_USER) < 0 || + acl_set_qualifier(entry, &uid) < 0) + return -errno; + } + + if (acl_get_permset(entry, &permset) < 0) + return -errno; + + if ((mask & ACL_READ) && acl_add_perm(permset, ACL_READ) < 0) + return -errno; + if ((mask & ACL_WRITE) && acl_add_perm(permset, ACL_WRITE) < 0) + return -errno; + if ((mask & ACL_EXECUTE) && acl_add_perm(permset, ACL_EXECUTE) < 0) + return -errno; + + r = calc_acl_mask_if_needed(&acl); + if (r < 0) + return r; + + if (acl_set_fd(fd, acl) < 0) + return -errno; + + return 0; +} + +int fd_acl_make_read_only(int fd) { + _cleanup_(acl_freep) acl_t acl = NULL; + bool changed = false; + acl_entry_t i; + int r; + + assert(fd >= 0); + + /* Safely drops all W bits from all relevant ACL entries of the file, without changing entries which + * are masked by the ACL mask */ + + acl = acl_get_fd(fd); + if (!acl) { + + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + + /* No ACLs? 
Then just update the regular mode_t */ + return fd_acl_make_read_only_fallback(fd); + } + + for (r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { + acl_permset_t permset; + acl_tag_t tag; + int b; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + /* These three control the x bits overall (as ACL_MASK affects all remaining tags) */ + if (!IN_SET(tag, ACL_USER_OBJ, ACL_MASK, ACL_OTHER)) + continue; + + if (acl_get_permset(i, &permset) < 0) + return -errno; + + b = acl_get_perm(permset, ACL_WRITE); + if (b < 0) + return -errno; + + if (b) { + if (acl_delete_perm(permset, ACL_WRITE) < 0) + return -errno; + + changed = true; + } + } + if (r < 0) + return -errno; + + if (!changed) + return 0; + + if (acl_set_fd(fd, acl) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + + return fd_acl_make_read_only_fallback(fd); + } + + return 1; +} + +int fd_acl_make_writable(int fd) { + _cleanup_(acl_freep) acl_t acl = NULL; + acl_entry_t i; + int r; + + /* Safely adds the writable bit to the owner's ACL entry of this inode. (And only the owner's! – This + * not the obvious inverse of fd_acl_make_read_only() hence!) */ + + acl = acl_get_fd(fd); + if (!acl) { + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + + /* No ACLs? Then just update the regular mode_t */ + return fd_acl_make_writable_fallback(fd); + } + + for (r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + r > 0; + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { + acl_permset_t permset; + acl_tag_t tag; + int b; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (tag != ACL_USER_OBJ) + continue; + + if (acl_get_permset(i, &permset) < 0) + return -errno; + + b = acl_get_perm(permset, ACL_WRITE); + if (b < 0) + return -errno; + + if (b) + return 0; /* Already set? Then there's nothing to do. 
*/ + + if (acl_add_perm(permset, ACL_WRITE) < 0) + return -errno; + + break; + } + if (r < 0) + return -errno; + + if (acl_set_fd(fd, acl) < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + + return fd_acl_make_writable_fallback(fd); + } + + return 1; +} +#endif + +int fd_acl_make_read_only_fallback(int fd) { + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + if ((st.st_mode & 0222) == 0) + return 0; + + if (fchmod(fd, st.st_mode & 0555) < 0) + return -errno; + + return 1; +} + +int fd_acl_make_writable_fallback(int fd) { + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + if ((st.st_mode & 0200) != 0) /* already set */ + return 0; + + if (fchmod(fd, (st.st_mode & 07777) | 0200) < 0) + return -errno; + + return 1; +} diff --git a/src/shared/acl-util.h b/src/shared/acl-util.h new file mode 100644 index 0000000..ef315c2 --- /dev/null +++ b/src/shared/acl-util.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +int fd_acl_make_read_only_fallback(int fd); +int fd_acl_make_writable_fallback(int fd); + +#if HAVE_ACL +#include +#include +#include + +#include "macro.h" + +int acl_find_uid(acl_t acl, uid_t uid, acl_entry_t *entry); +int calc_acl_mask_if_needed(acl_t *acl_p); +int add_base_acls_if_needed(acl_t *acl_p, const char *path); +int acl_search_groups(const char* path, char ***ret_groups); +int parse_acl( + const char *text, + acl_t *ret_acl_access, + acl_t *ret_acl_access_exec, + acl_t *ret_acl_default, + bool want_mask); +int acls_for_file(const char *path, acl_type_t type, acl_t new, acl_t *ret); +int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask); + +int fd_acl_make_read_only(int fd); +int fd_acl_make_writable(int fd); + +/* acl_free takes multiple argument types. + * Multiple cleanup functions are necessary. 
*/ +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(acl_t, acl_free, NULL); +#define acl_free_charp acl_free +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(char*, acl_free_charp, NULL); +#define acl_free_uid_tp acl_free +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(uid_t*, acl_free_uid_tp, NULL); +#define acl_free_gid_tp acl_free +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(gid_t*, acl_free_gid_tp, NULL); + +#else +#define ACL_READ 0x04 +#define ACL_WRITE 0x02 +#define ACL_EXECUTE 0x01 + +static inline int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask) { + return -EOPNOTSUPP; +} + +static inline int fd_acl_make_read_only(int fd) { + return fd_acl_make_read_only_fallback(fd); +} + +static inline int fd_acl_make_writable(int fd) { + return fd_acl_make_writable_fallback(fd); +} + +#endif diff --git a/src/shared/acpi-fpdt.c b/src/shared/acpi-fpdt.c new file mode 100644 index 0000000..22a36bd --- /dev/null +++ b/src/shared/acpi-fpdt.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "acpi-fpdt.h" +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "time-util.h" + +struct acpi_table_header { + char signature[4]; + uint32_t length; + uint8_t revision; + uint8_t checksum; + char oem_id[6]; + char oem_table_id[8]; + uint32_t oem_revision; + char asl_compiler_id[4]; + uint32_t asl_compiler_revision; +} _packed_; + +enum { + ACPI_FPDT_TYPE_BOOT = 0, + ACPI_FPDT_TYPE_S3PERF = 1, +}; + +struct acpi_fpdt_header { + uint16_t type; + uint8_t length; + uint8_t revision; + uint8_t reserved[4]; + uint64_t ptr; +} _packed_; + +struct acpi_fpdt_boot_header { + char signature[4]; + uint32_t length; +} _packed_; + +enum { + ACPI_FPDT_S3PERF_RESUME_REC = 0, + ACPI_FPDT_S3PERF_SUSPEND_REC = 1, + ACPI_FPDT_BOOT_REC = 2, +}; + +struct acpi_fpdt_boot { + uint16_t type; + uint8_t length; + uint8_t revision; + uint8_t reserved[4]; + uint64_t reset_end; + uint64_t load_start; + uint64_t startup_start; + uint64_t 
exit_services_entry; + uint64_t exit_services_exit; +} _packed_; + +/* /dev/mem is deprecated on many systems, try using /sys/firmware/acpi/fpdt parsing instead. + * This code requires kernel version 5.12 on x86 based machines or 6.2 for arm64 */ +static int acpi_get_boot_usec_kernel_parsed(usec_t *ret_loader_start, usec_t *ret_loader_exit) { + usec_t start, end; + int r; + + r = read_timestamp_file("/sys/firmware/acpi/fpdt/boot/exitbootservice_end_ns", &end); + if (r < 0) + return r; + + if (end == 0) + /* Non-UEFI compatible boot. */ + return -ENODATA; + + r = read_timestamp_file("/sys/firmware/acpi/fpdt/boot/bootloader_launch_ns", &start); + if (r < 0) + return r; + + if (start == 0 || end < start) + return -EINVAL; + if (end > NSEC_PER_HOUR) + return -EINVAL; + + /* The sysfs attributes are in nanoseconds (see the _ns suffix); hand out microseconds. */ + if (ret_loader_start) + *ret_loader_start = start / 1000; + if (ret_loader_exit) + *ret_loader_exit = end / 1000; + + return 0; +} + +int acpi_get_boot_usec(usec_t *ret_loader_start, usec_t *ret_loader_exit) { + _cleanup_free_ char *buf = NULL; + struct acpi_table_header *tbl; + size_t l; + ssize_t ll; + struct acpi_fpdt_header *rec; + int r; + uint64_t ptr = 0; + _cleanup_close_ int fd = -EBADF; + struct acpi_fpdt_boot_header hbrec; + struct acpi_fpdt_boot brec; + + r = acpi_get_boot_usec_kernel_parsed(ret_loader_start, ret_loader_exit); + if (r != -ENOENT) /* fallback to /dev/mem hack only if kernel doesn't support the new sysfs files */ + return r; + + r = read_full_virtual_file("/sys/firmware/acpi/tables/FPDT", &buf, &l); + if (r < 0) + return r; + + if (l < sizeof(struct acpi_table_header) + sizeof(struct acpi_fpdt_header)) + return -EINVAL; + + tbl = (struct acpi_table_header *)buf; + if (l != tbl->length) + return -EINVAL; + + if (memcmp(tbl->signature, "FPDT", 4) != 0) + return -EINVAL; + + /* find Firmware Basic Boot Performance Pointer Record */ + for (rec = (struct acpi_fpdt_header *)(buf + sizeof(struct acpi_table_header)); + (char *)rec + offsetof(struct acpi_fpdt_header, revision) <= buf + 
l; + rec = (struct acpi_fpdt_header *)((char *)rec + rec->length)) { + if (rec->length <= 0) + break; + /* The loop condition only guarantees that type and length are readable. Stop if the + * record claims to extend past the end of the table, so that rec->ptr below is never + * read out of bounds. */ + if ((size_t) rec->length > (size_t) (buf + l - (char *) rec)) + break; + if (rec->type != ACPI_FPDT_TYPE_BOOT) + continue; + if (rec->length != sizeof(struct acpi_fpdt_header)) + continue; + + ptr = rec->ptr; + break; + } + + if (ptr == 0) + return -ENODATA; + + /* read Firmware Basic Boot Performance Data Record */ + fd = open("/dev/mem", O_CLOEXEC|O_RDONLY); + if (fd < 0) + return -errno; + + ll = pread(fd, &hbrec, sizeof(struct acpi_fpdt_boot_header), ptr); + if (ll < 0) + return -errno; + if ((size_t) ll != sizeof(struct acpi_fpdt_boot_header)) + return -EINVAL; + + if (memcmp(hbrec.signature, "FBPT", 4) != 0) + return -EINVAL; + + if (hbrec.length < sizeof(struct acpi_fpdt_boot_header) + sizeof(struct acpi_fpdt_boot)) + return -EINVAL; + + ll = pread(fd, &brec, sizeof(struct acpi_fpdt_boot), ptr + sizeof(struct acpi_fpdt_boot_header)); + if (ll < 0) + return -errno; + if ((size_t) ll != sizeof(struct acpi_fpdt_boot)) + return -EINVAL; + + if (brec.length != sizeof(struct acpi_fpdt_boot)) + return -EINVAL; + + if (brec.type != ACPI_FPDT_BOOT_REC) + return -EINVAL; + + if (brec.exit_services_exit == 0) + /* Non-UEFI compatible boot. 
*/ + return -ENODATA; + + if (brec.startup_start == 0 || brec.exit_services_exit < brec.startup_start) + return -EINVAL; + if (brec.exit_services_exit > NSEC_PER_HOUR) + return -EINVAL; + + if (ret_loader_start) + *ret_loader_start = brec.startup_start / 1000; + if (ret_loader_exit) + *ret_loader_exit = brec.exit_services_exit / 1000; + + return 0; +} diff --git a/src/shared/acpi-fpdt.h b/src/shared/acpi-fpdt.h new file mode 100644 index 0000000..56f8c9e --- /dev/null +++ b/src/shared/acpi-fpdt.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int acpi_get_boot_usec(usec_t *ret_loader_start, usec_t *ret_loader_exit); diff --git a/src/shared/apparmor-util.c b/src/shared/apparmor-util.c new file mode 100644 index 0000000..68e1c55 --- /dev/null +++ b/src/shared/apparmor-util.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "apparmor-util.h" +#include "fileio.h" +#include "parse-util.h" + +bool mac_apparmor_use(void) { + static int cached_use = -1; + + if (cached_use < 0) { + _cleanup_free_ char *p = NULL; + + cached_use = + read_one_line_file("/sys/module/apparmor/parameters/enabled", &p) >= 0 && + parse_boolean(p) > 0; + } + + return cached_use; +} diff --git a/src/shared/apparmor-util.h b/src/shared/apparmor-util.h new file mode 100644 index 0000000..8007aeb --- /dev/null +++ b/src/shared/apparmor-util.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +bool mac_apparmor_use(void); diff --git a/src/shared/ask-password-api.c b/src/shared/ask-password-api.c new file mode 100644 index 0000000..0e323f4 --- /dev/null +++ b/src/shared/ask-password-api.c @@ -0,0 +1,1002 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"alloc-util.h" +#include "ask-password-api.h" +#include "creds-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "io-util.h" +#include "iovec-util.h" +#include "keyring-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "nulstr-util.h" +#include "plymouth-util.h" +#include "process-util.h" +#include "random-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "utf8.h" + +#define KEYRING_TIMEOUT_USEC ((5 * USEC_PER_MINUTE) / 2) + +static int lookup_key(const char *keyname, key_serial_t *ret) { + key_serial_t serial; + + assert(keyname); + assert(ret); + + serial = request_key("user", keyname, NULL, 0); + if (serial == -1) + return negative_errno(); + + *ret = serial; + return 0; +} + +static int retrieve_key(key_serial_t serial, char ***ret) { + _cleanup_(erase_and_freep) void *p = NULL; + char **l; + size_t n; + int r; + + assert(ret); + + r = keyring_read(serial, &p, &n); + if (r < 0) + return r; + + l = strv_parse_nulstr(p, n); + if (!l) + return -ENOMEM; + + *ret = l; + return 0; +} + +static int add_to_keyring(const char *keyname, AskPasswordFlags flags, char **passwords) { + _cleanup_strv_free_erase_ char **l = NULL; + _cleanup_(erase_and_freep) char *p = NULL; + key_serial_t serial; + size_t n; + int r; + + assert(keyname); + + if (!FLAGS_SET(flags, ASK_PASSWORD_PUSH_CACHE)) + return 0; + if (strv_isempty(passwords)) + return 0; + + r = lookup_key(keyname, &serial); + if (r >= 0) { + r = retrieve_key(serial, &l); + if (r < 0) + return r; + } else if (r != -ENOKEY) + return r; + + r = strv_extend_strv(&l, passwords, true); + if (r <= 0) + return r; + + r = strv_make_nulstr(l, &p, &n); + if (r < 0) + return r; + + /* chop off the 
final NUL byte. We do this because we want to use the separator NUL bytes only if we + * have multiple passwords. */ + n = LESS_BY(n, (size_t) 1); + + serial = add_key("user", keyname, p, n, KEY_SPEC_USER_KEYRING); + if (serial == -1) + return -errno; + + if (keyctl(KEYCTL_SET_TIMEOUT, + (unsigned long) serial, + (unsigned long) DIV_ROUND_UP(KEYRING_TIMEOUT_USEC, USEC_PER_SEC), 0, 0) < 0) + log_debug_errno(errno, "Failed to adjust kernel keyring key timeout: %m"); + + /* Tell everyone to check the keyring */ + (void) touch("/run/systemd/ask-password"); + + log_debug("Added key to kernel keyring as %" PRIi32 ".", serial); + + return 1; +} + +static int add_to_keyring_and_log(const char *keyname, AskPasswordFlags flags, char **passwords) { + int r; + + assert(keyname); + + r = add_to_keyring(keyname, flags, passwords); + if (r < 0) + return log_debug_errno(r, "Failed to add password to kernel keyring: %m"); + + return 0; +} + +static int ask_password_keyring(const char *keyname, AskPasswordFlags flags, char ***ret) { + + key_serial_t serial; + int r; + + assert(keyname); + assert(ret); + + if (!FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED)) + return -EUNATCH; + + r = lookup_key(keyname, &serial); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || r == -EPERM) + /* When retrieving, the distinction between "kernel or container manager don't support or + * allow this" and "no matching key known" doesn't matter. Note that we propagate EACCESS + * here (even if EPERM not) since that is used if the keyring is available, but we lack + * access to the key. 
*/ + return -ENOKEY; + if (r < 0) + return r; + + return retrieve_key(serial, ret); +} + +static int backspace_chars(int ttyfd, size_t p) { + if (ttyfd < 0) + return 0; + + _cleanup_free_ char *buf = malloc_multiply(3, p); + if (!buf) + return log_oom(); + + for (size_t i = 0; i < p; i++) + memcpy(buf + 3 * i, "\b \b", 3); + + return loop_write(ttyfd, buf, 3 * p); +} + +static int backspace_string(int ttyfd, const char *str) { + assert(str); + + /* Backspaces through enough characters to entirely undo printing of the specified string. */ + + if (ttyfd < 0) + return 0; + + size_t m = utf8_n_codepoints(str); + if (m == SIZE_MAX) + m = strlen(str); /* Not a valid UTF-8 string? If so, let's backspace the number of bytes + * output. Most likely this happened because we are not in a UTF-8 locale, + * and in that case that is the correct thing to do. And even if it's not, + * terminals tend to stop backspacing at the leftmost column, hence + * backspacing too much should be mostly OK. */ + + return backspace_chars(ttyfd, m); +} + +int ask_password_plymouth( + const char *message, + usec_t until, + AskPasswordFlags flags, + const char *flag_file, + char ***ret) { + + _cleanup_close_ int fd = -EBADF, notify = -EBADF; + _cleanup_free_ char *packet = NULL; + ssize_t k; + int r, n; + struct pollfd pollfd[2] = {}; + char buffer[LINE_MAX]; + size_t p = 0; + enum { + POLL_SOCKET, + POLL_INOTIFY + }; + + assert(ret); + + if (!message) + message = "Password:"; + + if (flag_file) { + notify = inotify_init1(IN_CLOEXEC|IN_NONBLOCK); + if (notify < 0) + return -errno; + + if (inotify_add_watch(notify, flag_file, IN_ATTRIB) < 0) /* for the link count */ + return -errno; + } + + fd = plymouth_connect(SOCK_NONBLOCK); + if (fd < 0) + return fd; + + if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED)) { + packet = strdup("c"); + n = 1; + } else if (asprintf(&packet, "*\002%c%s%n", (int) (strlen(message) + 1), message, &n) < 0) + packet = NULL; + if (!packet) + return -ENOMEM; + + r = 
loop_write_full(fd, packet, n + 1, USEC_INFINITY); + if (r < 0) + return r; + + CLEANUP_ERASE(buffer); + + pollfd[POLL_SOCKET].fd = fd; + pollfd[POLL_SOCKET].events = POLLIN; + pollfd[POLL_INOTIFY].fd = notify; + pollfd[POLL_INOTIFY].events = POLLIN; + + for (;;) { + usec_t timeout; + + if (until > 0) + timeout = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + else + timeout = USEC_INFINITY; + + if (flag_file && access(flag_file, F_OK) < 0) + return -errno; + + r = ppoll_usec(pollfd, notify >= 0 ? 2 : 1, timeout); + if (r == -EINTR) + continue; + if (r < 0) + return r; + if (r == 0) + return -ETIME; + + if (notify >= 0 && pollfd[POLL_INOTIFY].revents != 0) + (void) flush_fd(notify); + + if (pollfd[POLL_SOCKET].revents == 0) + continue; + + k = read(fd, buffer + p, sizeof(buffer) - p); + if (k < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + continue; + + return -errno; + } + if (k == 0) + return -EIO; + + p += k; + + if (buffer[0] == 5) { + + if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED)) { + /* Hmm, first try with cached + * passwords failed, so let's retry + * with a normal password request */ + packet = mfree(packet); + + if (asprintf(&packet, "*\002%c%s%n", (int) (strlen(message) + 1), message, &n) < 0) + return -ENOMEM; + + r = loop_write_full(fd, packet, n + 1, USEC_INFINITY); + if (r < 0) + return r; + + flags &= ~ASK_PASSWORD_ACCEPT_CACHED; + p = 0; + continue; + } + + /* No password, because UI not shown */ + return -ENOENT; + + } else if (IN_SET(buffer[0], 2, 9)) { + uint32_t size; + char **l; + + /* One or more answers */ + if (p < 5) + continue; + + memcpy(&size, buffer+1, sizeof(size)); + size = le32toh(size); + /* Compare without adding to size: it is a 32-bit value taken from the peer, and + * size + 5 could wrap around and slip past the check. */ + if (size > sizeof(buffer) - 5) + return -EIO; + + if (p-5 < size) + continue; + + l = strv_parse_nulstr(buffer + 5, size); + if (!l) + return -ENOMEM; + + *ret = l; + break; + + } else + /* Unknown packet */ + return -EIO; + } + + return 0; +} + +#define NO_ECHO "(no echo) " +#define PRESS_TAB "(press TAB for no echo) " +#define SKIPPED 
"(skipped)" + +int ask_password_tty( + int ttyfd, + const char *message, + const char *keyname, + usec_t until, + AskPasswordFlags flags, + const char *flag_file, + char ***ret) { + + enum { + POLL_TTY, + POLL_INOTIFY, + _POLL_MAX, + }; + + bool reset_tty = false, dirty = false, use_color = false, press_tab_visible = false; + _cleanup_close_ int cttyfd = -EBADF, notify = -EBADF; + struct termios old_termios, new_termios; + char passphrase[LINE_MAX + 1] = {}, *x; + _cleanup_strv_free_erase_ char **l = NULL; + struct pollfd pollfd[_POLL_MAX]; + size_t p = 0, codepoint = 0; + int r; + + assert(ret); + + if (FLAGS_SET(flags, ASK_PASSWORD_NO_TTY)) + return -EUNATCH; + + if (!message) + message = "Password:"; + + if (!FLAGS_SET(flags, ASK_PASSWORD_HIDE_EMOJI) && emoji_enabled()) + message = strjoina(special_glyph(SPECIAL_GLYPH_LOCK_AND_KEY), " ", message); + + if (flag_file || (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && keyname)) { + notify = inotify_init1(IN_CLOEXEC|IN_NONBLOCK); + if (notify < 0) + return -errno; + } + if (flag_file) { + if (inotify_add_watch(notify, flag_file, IN_ATTRIB /* for the link count */) < 0) + return -errno; + } + if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && keyname) { + r = ask_password_keyring(keyname, flags, ret); + if (r >= 0) + return 0; + else if (r != -ENOKEY) + return r; + + if (inotify_add_watch(notify, "/run/systemd/ask-password", IN_ATTRIB /* for mtime */) < 0) + return -errno; + } + + CLEANUP_ERASE(passphrase); + + /* If the caller didn't specify a TTY, then use the controlling tty, if we can. 
*/ + if (ttyfd < 0) + ttyfd = cttyfd = open("/dev/tty", O_RDWR|O_NOCTTY|O_CLOEXEC); + + if (ttyfd >= 0) { + if (tcgetattr(ttyfd, &old_termios) < 0) + return -errno; + + if (FLAGS_SET(flags, ASK_PASSWORD_CONSOLE_COLOR)) + use_color = dev_console_colors_enabled(); + else + use_color = colors_enabled(); + + if (use_color) + (void) loop_write(ttyfd, ANSI_HIGHLIGHT, SIZE_MAX); + + (void) loop_write(ttyfd, message, SIZE_MAX); + (void) loop_write(ttyfd, " ", 1); + + if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT) && !FLAGS_SET(flags, ASK_PASSWORD_ECHO)) { + if (use_color) + (void) loop_write(ttyfd, ansi_grey(), SIZE_MAX); + + (void) loop_write(ttyfd, PRESS_TAB, SIZE_MAX); + press_tab_visible = true; + } + + if (use_color) + (void) loop_write(ttyfd, ANSI_NORMAL, SIZE_MAX); + + new_termios = old_termios; + new_termios.c_lflag &= ~(ICANON|ECHO); + new_termios.c_cc[VMIN] = 1; + new_termios.c_cc[VTIME] = 0; + + r = RET_NERRNO(tcsetattr(ttyfd, TCSADRAIN, &new_termios)); + if (r < 0) + goto finish; + + reset_tty = true; + } + + pollfd[POLL_TTY] = (struct pollfd) { + .fd = ttyfd >= 0 ? ttyfd : STDIN_FILENO, + .events = POLLIN, + }; + pollfd[POLL_INOTIFY] = (struct pollfd) { + .fd = notify, + .events = POLLIN, + }; + + for (;;) { + _cleanup_(erase_char) char c; + usec_t timeout; + ssize_t n; + + if (until > 0) + timeout = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + else + timeout = USEC_INFINITY; + + if (flag_file) { + r = RET_NERRNO(access(flag_file, F_OK)); + if (r < 0) + goto finish; + } + + r = ppoll_usec(pollfd, notify >= 0 ? 2 : 1, timeout); + if (r == -EINTR) + continue; + if (r < 0) + goto finish; + if (r == 0) { + r = -ETIME; + goto finish; + } + + if (notify >= 0 && pollfd[POLL_INOTIFY].revents != 0 && keyname) { + (void) flush_fd(notify); + + r = ask_password_keyring(keyname, flags, ret); + if (r >= 0) { + r = 0; + goto finish; + } else if (r != -ENOKEY) + goto finish; + } + + if (pollfd[POLL_TTY].revents == 0) + continue; + + n = read(ttyfd >= 0 ? 
ttyfd : STDIN_FILENO, &c, 1); + if (n < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + continue; + + r = -errno; + goto finish; + + } + + if (press_tab_visible) { + assert(ttyfd >= 0); + backspace_chars(ttyfd, strlen(PRESS_TAB)); + press_tab_visible = false; + } + + /* We treat EOF, newline and NUL byte all as valid end markers */ + if (n == 0 || c == '\n' || c == 0) + break; + + if (c == 4) { /* C-d also known as EOT */ + if (ttyfd >= 0) + (void) loop_write(ttyfd, SKIPPED, SIZE_MAX); + + goto skipped; + } + + if (c == 21) { /* C-u */ + + if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT)) + (void) backspace_string(ttyfd, passphrase); + + explicit_bzero_safe(passphrase, sizeof(passphrase)); + p = codepoint = 0; + + } else if (IN_SET(c, '\b', 127)) { + + if (p > 0) { + size_t q; + + if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT)) + (void) backspace_chars(ttyfd, 1); + + /* Remove a full UTF-8 codepoint from the end. For that, figure out where the + * last one begins */ + q = 0; + for (;;) { + int z; + + z = utf8_encoded_valid_unichar(passphrase + q, SIZE_MAX); + if (z <= 0) { + q = SIZE_MAX; /* Invalid UTF8! */ + break; + } + + if (q + z >= p) /* This one brings us over the edge */ + break; + + q += z; + } + + p = codepoint = q == SIZE_MAX ? p - 1 : q; + explicit_bzero_safe(passphrase + p, sizeof(passphrase) - p); + + } else if (!dirty && !FLAGS_SET(flags, ASK_PASSWORD_SILENT)) { + + flags |= ASK_PASSWORD_SILENT; + + /* There are two ways to enter silent mode. Either by pressing backspace as + * first key (and only as first key), or ... */ + + if (ttyfd >= 0) + (void) loop_write(ttyfd, NO_ECHO, SIZE_MAX); + + } else if (ttyfd >= 0) + (void) loop_write(ttyfd, "\a", 1); + + } else if (c == '\t' && !FLAGS_SET(flags, ASK_PASSWORD_SILENT)) { + + (void) backspace_string(ttyfd, passphrase); + flags |= ASK_PASSWORD_SILENT; + + /* ... or by pressing TAB at any time. 
*/ + + if (ttyfd >= 0) + (void) loop_write(ttyfd, NO_ECHO, SIZE_MAX); + + } else if (p >= sizeof(passphrase)-1) { + + /* Reached the size limit */ + if (ttyfd >= 0) + (void) loop_write(ttyfd, "\a", 1); + + } else { + passphrase[p++] = c; + + if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT) && ttyfd >= 0) { + /* Check if we got a complete UTF-8 character now. If so, let's output one '*'. */ + n = utf8_encoded_valid_unichar(passphrase + codepoint, SIZE_MAX); + if (n >= 0) { + if (FLAGS_SET(flags, ASK_PASSWORD_ECHO)) + (void) loop_write(ttyfd, passphrase + codepoint, n); + else + (void) loop_write(ttyfd, + special_glyph(SPECIAL_GLYPH_BULLET), + SIZE_MAX); + codepoint = p; + } + } + + dirty = true; + } + } + + x = strndup(passphrase, p); + if (!x) { + r = -ENOMEM; + goto finish; + } + + r = strv_consume(&l, x); + if (r < 0) + goto finish; + +skipped: + if (strv_isempty(l)) + r = log_debug_errno(SYNTHETIC_ERRNO(ECANCELED), "Password query was cancelled."); + else { + if (keyname) + (void) add_to_keyring_and_log(keyname, flags, l); + + *ret = TAKE_PTR(l); + r = 0; + } + +finish: + if (ttyfd >= 0 && reset_tty) { + (void) loop_write(ttyfd, "\n", 1); + (void) tcsetattr(ttyfd, TCSADRAIN, &old_termios); + } + + return r; +} + +static int create_socket(char **ret) { + _cleanup_free_ char *path = NULL; + union sockaddr_union sa; + socklen_t sa_len; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(ret); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + if (asprintf(&path, "/run/systemd/ask-password/sck.%" PRIx64, random_u64()) < 0) + return -ENOMEM; + + r = sockaddr_un_set_path(&sa.un, path); + if (r < 0) + return r; + sa_len = r; + + WITH_UMASK(0177) + if (bind(fd, &sa.sa, sa_len) < 0) + return -errno; + + r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return r; + + *ret = TAKE_PTR(path); + return TAKE_FD(fd); +} + +int ask_password_agent( + const char *message, + const char *icon, + const char *id, + 
const char *keyname, + usec_t until, + AskPasswordFlags flags, + char ***ret) { + + enum { + FD_SOCKET, + FD_SIGNAL, + FD_INOTIFY, + _FD_MAX + }; + + _cleanup_close_ int socket_fd = -EBADF, signal_fd = -EBADF, notify = -EBADF, fd = -EBADF; + char temp[] = "/run/systemd/ask-password/tmp.XXXXXX"; + char final[sizeof(temp)] = ""; + _cleanup_free_ char *socket_name = NULL; + _cleanup_strv_free_erase_ char **l = NULL; + _cleanup_fclose_ FILE *f = NULL; + struct pollfd pollfd[_FD_MAX]; + sigset_t mask, oldmask; + int r; + + assert(ret); + + if (FLAGS_SET(flags, ASK_PASSWORD_NO_AGENT)) + return -EUNATCH; + + assert_se(sigemptyset(&mask) >= 0); + assert_se(sigset_add_many(&mask, SIGINT, SIGTERM, -1) >= 0); + assert_se(sigprocmask(SIG_BLOCK, &mask, &oldmask) >= 0); + + (void) mkdir_p_label("/run/systemd/ask-password", 0755); + + if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && keyname) { + r = ask_password_keyring(keyname, flags, ret); + if (r >= 0) { + r = 0; + goto finish; + } else if (r != -ENOKEY) + goto finish; + + notify = inotify_init1(IN_CLOEXEC | IN_NONBLOCK); + if (notify < 0) { + r = -errno; + goto finish; + } + + r = RET_NERRNO(inotify_add_watch(notify, "/run/systemd/ask-password", IN_ATTRIB /* for mtime */)); + if (r < 0) + goto finish; + } + + fd = mkostemp_safe(temp); + if (fd < 0) { + r = fd; + goto finish; + } + + (void) fchmod(fd, 0644); + + f = take_fdopen(&fd, "w"); + if (!f) { + r = -errno; + goto finish; + } + + signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC); + if (signal_fd < 0) { + r = -errno; + goto finish; + } + + socket_fd = create_socket(&socket_name); + if (socket_fd < 0) { + r = socket_fd; + goto finish; + } + + fprintf(f, + "[Ask]\n" + "PID="PID_FMT"\n" + "Socket=%s\n" + "AcceptCached=%i\n" + "Echo=%i\n" + "NotAfter="USEC_FMT"\n" + "Silent=%i\n", + getpid_cached(), + socket_name, + FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED), + FLAGS_SET(flags, ASK_PASSWORD_ECHO), + until, + FLAGS_SET(flags, ASK_PASSWORD_SILENT)); + + if 
(message) + fprintf(f, "Message=%s\n", message); + + if (icon) + fprintf(f, "Icon=%s\n", icon); + + if (id) + fprintf(f, "Id=%s\n", id); + + r = fflush_and_check(f); + if (r < 0) + goto finish; + + memcpy(final, temp, sizeof(temp)); + + final[sizeof(final)-11] = 'a'; + final[sizeof(final)-10] = 's'; + final[sizeof(final)-9] = 'k'; + + r = RET_NERRNO(rename(temp, final)); + if (r < 0) + goto finish; + + zero(pollfd); + pollfd[FD_SOCKET].fd = socket_fd; + pollfd[FD_SOCKET].events = POLLIN; + pollfd[FD_SIGNAL].fd = signal_fd; + pollfd[FD_SIGNAL].events = POLLIN; + pollfd[FD_INOTIFY].fd = notify; + pollfd[FD_INOTIFY].events = POLLIN; + + for (;;) { + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + char passphrase[LINE_MAX+1]; + struct iovec iovec; + struct ucred *ucred; + usec_t timeout; + ssize_t n; + + if (until > 0) + timeout = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + else + timeout = USEC_INFINITY; + + r = ppoll_usec(pollfd, notify >= 0 ? _FD_MAX : _FD_MAX - 1, timeout); + if (r == -EINTR) + continue; + if (r < 0) + goto finish; + if (r == 0) { + r = -ETIME; + goto finish; + } + + if (pollfd[FD_SIGNAL].revents & POLLIN) { + r = -EINTR; + goto finish; + } + + if (notify >= 0 && pollfd[FD_INOTIFY].revents != 0) { + (void) flush_fd(notify); + + r = ask_password_keyring(keyname, flags, ret); + if (r >= 0) { + r = 0; + goto finish; + } else if (r != -ENOKEY) + goto finish; + } + + if (pollfd[FD_SOCKET].revents == 0) + continue; + + if (pollfd[FD_SOCKET].revents != POLLIN) { + r = -EIO; + goto finish; + } + + iovec = IOVEC_MAKE(passphrase, sizeof(passphrase)); + + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + + n = recvmsg_safe(socket_fd, &msghdr, 0); + if (ERRNO_IS_NEG_TRANSIENT(n)) + continue; + else if (n == -EXFULL) { + log_debug("Got message with truncated control data, ignoring."); + continue; + } else if (n < 0) { + r = (int) n; + goto finish; + } + + 
CLEANUP_ERASE(passphrase); + + cmsg_close_all(&msghdr); + + if (n == 0) { + log_debug("Message too short"); + continue; + } + + ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!ucred) { + log_debug("Received message without credentials. Ignoring."); + continue; + } + + if (ucred->uid != 0) { + log_debug("Got request from unprivileged user. Ignoring."); + continue; + } + + if (passphrase[0] == '+') { + /* An empty message refers to the empty password */ + if (n == 1) + l = strv_new(""); + else + l = strv_parse_nulstr(passphrase+1, n-1); + if (!l) { + r = -ENOMEM; + goto finish; + } + + if (strv_isempty(l)) { + l = strv_free(l); + log_debug("Invalid packet"); + continue; + } + + break; + } + + if (passphrase[0] == '-') { + r = -ECANCELED; + goto finish; + } + + log_debug("Invalid packet"); + } + + if (keyname) + (void) add_to_keyring_and_log(keyname, flags, l); + + *ret = TAKE_PTR(l); + r = 0; + +finish: + if (socket_name) + (void) unlink(socket_name); + + (void) unlink(temp); + + if (final[0]) + (void) unlink(final); + + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0); + return r; +} + +static int ask_password_credential(const char *credential_name, AskPasswordFlags flags, char ***ret) { + _cleanup_(erase_and_freep) char *buffer = NULL; + size_t size; + char **l; + int r; + + assert(credential_name); + assert(ret); + + r = read_credential(credential_name, (void**) &buffer, &size); + if (IN_SET(r, -ENXIO, -ENOENT)) /* No credentials passed or this credential not defined? 
*/ + return -ENOKEY; + + l = strv_parse_nulstr(buffer, size); + if (!l) + return -ENOMEM; + + *ret = l; + return 0; +} + +int ask_password_auto( + const char *message, + const char *icon, + const char *id, /* id in "ask-password" protocol */ + const char *key_name, /* name in kernel keyring */ + const char *credential_name, /* name in $CREDENTIALS_DIRECTORY directory */ + usec_t until, + AskPasswordFlags flags, + char ***ret) { + + int r; + + assert(ret); + + if (!FLAGS_SET(flags, ASK_PASSWORD_NO_CREDENTIAL) && credential_name) { + r = ask_password_credential(credential_name, flags, ret); + if (r != -ENOKEY) + return r; + } + + if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && + key_name && + (FLAGS_SET(flags, ASK_PASSWORD_NO_TTY) || !isatty(STDIN_FILENO)) && + FLAGS_SET(flags, ASK_PASSWORD_NO_AGENT)) { + r = ask_password_keyring(key_name, flags, ret); + if (r != -ENOKEY) + return r; + } + + if (!FLAGS_SET(flags, ASK_PASSWORD_NO_TTY) && isatty(STDIN_FILENO)) + return ask_password_tty(-1, message, key_name, until, flags, NULL, ret); + + if (!FLAGS_SET(flags, ASK_PASSWORD_NO_AGENT)) + return ask_password_agent(message, icon, id, key_name, until, flags, ret); + + return -EUNATCH; +} diff --git a/src/shared/ask-password-api.h b/src/shared/ask-password-api.h new file mode 100644 index 0000000..7464e7f --- /dev/null +++ b/src/shared/ask-password-api.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "time-util.h" + +typedef enum AskPasswordFlags { + ASK_PASSWORD_ACCEPT_CACHED = 1 << 0, /* read from kernel keyring */ + ASK_PASSWORD_PUSH_CACHE = 1 << 1, /* write to kernel keyring after getting password from elsewhere */ + ASK_PASSWORD_ECHO = 1 << 2, /* show the password literally while reading, instead of "*" */ + ASK_PASSWORD_SILENT = 1 << 3, /* do no show any password at all while reading */ + ASK_PASSWORD_NO_TTY = 1 << 4, /* never ask for password on tty */ + ASK_PASSWORD_NO_AGENT = 1 << 5, /* never ask for 
password via agent */ + ASK_PASSWORD_CONSOLE_COLOR = 1 << 6, /* Use color if /dev/console points to a console that supports color */ + ASK_PASSWORD_NO_CREDENTIAL = 1 << 7, /* never use $CREDENTIALS_DIRECTORY data */ + ASK_PASSWORD_HIDE_EMOJI = 1 << 8, /* hide the lock and key emoji */ +} AskPasswordFlags; + +int ask_password_tty(int tty_fd, const char *message, const char *key_name, usec_t until, AskPasswordFlags flags, const char *flag_file, char ***ret); +int ask_password_plymouth(const char *message, usec_t until, AskPasswordFlags flags, const char *flag_file, char ***ret); +int ask_password_agent(const char *message, const char *icon, const char *id, const char *key_name, usec_t until, AskPasswordFlags flag, char ***ret); +int ask_password_auto(const char *message, const char *icon, const char *id, const char *key_name, const char *credential_name, usec_t until, AskPasswordFlags flag, char ***ret); diff --git a/src/shared/async.c b/src/shared/async.c new file mode 100644 index 0000000..41f6b97 --- /dev/null +++ b/src/shared/async.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "async.h" +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "process-util.h" +#include "signal-util.h" + +int asynchronous_sync(pid_t *ret_pid) { + int r; + + /* This forks off an invocation of sync() as a child process, in order to initiate synchronization to + * disk. Note that we implement this as a helper process rather than a thread as we don't want the sync() to hang our + * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking + * syscalls. */ + + r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 
0 : FORK_DETACH), ret_pid); + if (r < 0) + return r; + if (r == 0) { + /* Child process */ + sync(); + _exit(EXIT_SUCCESS); + } + + return 0; +} + +/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether + * we need to fork again */ +#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1)) + +static int close_func(void *p) { + unsigned v = PTR_TO_UINT(p); + + (void) prctl(PR_SET_NAME, (unsigned long*) "(sd-close)"); + + /* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such + * children memory allocation is not allowed, since glibc does not release malloc mutexes in + * clone() 💣 */ + + if (v & NEED_DOUBLE_FORK) { + pid_t pid; + + v &= ~NEED_DOUBLE_FORK; + + /* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so + * that the reaper knows when it's time to reap. */ + pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v)); + if (pid >= 0) + return 0; + } + + close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */ + return 0; +} + +int asynchronous_close(int fd) { + unsigned v; + pid_t pid; + int r; + + /* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so + * that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we + * work around it, and hide this as far away as we can. + * + * It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a + * minimal subprocess instead which shares our fd table via CLONE_FILES. */ + + if (fd < 0) + return -EBADF; /* already invalid */ + + PROTECT_ERRNO; + + v = (unsigned) fd; + + /* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. 
But + * we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager + * process qualifies as this), we can avoid the double forking, since the double forked process would + * be reparented back to us anyway. */ + r = is_reaper_process(); + if (r < 0) + log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m"); + if (r <= 0) + v |= NEED_DOUBLE_FORK; + + pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v)); + if (pid < 0) + assert_se(close_nointr(fd) != -EBADF); /* local fallback */ + else if (v & NEED_DOUBLE_FORK) { + + /* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for + * any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting + * then. + * + * We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while + * waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */ + for (;;) + if (waitpid(pid, NULL, __WCLONE) >= 0 || errno != EINTR) + break; + } + + return -EBADF; /* return an invalidated fd */ +} + +int asynchronous_rm_rf(const char *p, RemoveFlags flags) { + int r; + + assert(p); + + /* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child + * will attempt to do its thing, but we won't wait for it or check its success. */ + + r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL); + if (r != 0) + return r; + + /* Child */ + + r = rm_rf(p, flags); + if (r < 0) { + log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p); + _exit(EXIT_FAILURE); /* This is a detached process, hence no one really cares, but who knows + * maybe it's good for debugging/tracing to return an exit code + * indicative of our failure here. 
*/ + } + + _exit(EXIT_SUCCESS); +} diff --git a/src/shared/async.h b/src/shared/async.h new file mode 100644 index 0000000..96148f9 --- /dev/null +++ b/src/shared/async.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" +#include "rm-rf.h" + +/* These functions implement various potentially slow operations that are executed asynchronously. They are + * carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does + * not share any memory with the parent process, and thus cannot possibly interfere with the malloc() + * synchronization locks. + * + * Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone() + * (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the + * malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other + * thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence + * one has to make the choice: either never use threads in the parent, or never do memory allocation in the + * child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted + * for avoiding threads. 
*/ + +int asynchronous_sync(pid_t *ret_pid); +int asynchronous_close(int fd); +int asynchronous_rm_rf(const char *p, RemoveFlags flags); + +DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close); diff --git a/src/shared/barrier.c b/src/shared/barrier.c new file mode 100644 index 0000000..bd5bdd7 --- /dev/null +++ b/src/shared/barrier.c @@ -0,0 +1,394 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "barrier.h" +#include "errno-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "macro.h" + +/** + * Barriers + * This barrier implementation provides a simple synchronization method based + * on file-descriptors that can safely be used between threads and processes. A + * barrier object contains 2 shared counters based on eventfd. Both processes + * can now place barriers and wait for the other end to reach a random or + * specific barrier. + * Barriers are numbered, so you can either wait for the other end to reach any + * barrier or the last barrier that you placed. This way, you can use barriers + * for one-way *and* full synchronization. Note that even-though barriers are + * numbered, these numbers are internal and recycled once both sides reached the + * same barrier (implemented as a simple signed counter). It is thus not + * possible to address barriers by their ID. + * + * Barrier-API: Both ends can place as many barriers via barrier_place() as + * they want and each pair of barriers on both sides will be implicitly linked. + * Each side can use the barrier_wait/sync_*() family of calls to wait for the + * other side to place a specific barrier. barrier_wait_next() waits until the + * other side calls barrier_place(). No links between the barriers are + * considered and this simply serves as most basic asynchronous barrier. + * barrier_sync_next() is like barrier_wait_next() and waits for the other side + * to place their next barrier via barrier_place(). 
However, it only waits for + * barriers that are linked to a barrier we already placed. If the other side + * already placed more barriers than we did, barrier_sync_next() returns + * immediately. + * barrier_sync() extends barrier_sync_next() and waits until the other end + * placed as many barriers via barrier_place() as we did. If they already placed + * as many as we did (or more), it returns immediately. + * + * In addition to basic barriers, an abortion event is available. + * barrier_abort() places an abortion event that cannot be undone. An abortion + * immediately cancels all placed barriers and replaces them. Any running and + * following wait/sync call besides barrier_wait_abortion() will immediately + * return false on both sides (otherwise, they always return true). + * barrier_abort() can be called multiple times on both ends and will be a + * no-op if already called on this side. + * barrier_wait_abortion() can be used to wait for the other side to call + * barrier_abort() and is the only wait/sync call that does not return + * immediately if we aborted ourselves. It only returns once the other side + * called barrier_abort(). + * + * Barriers can be used for in-process and inter-process synchronization. + * However, for in-process synchronization you could just use mutexes. + * Therefore, the main target is IPC and we require both sides to *not* share the FD + * table. If that's given, barriers provide target tracking: If the remote side + * exit()s, an abortion event is implicitly queued on the other side. This way, + * a sync/wait call will be woken up if the remote side crashed or exited + * unexpectedly. However, note that these abortion events are only queued if the + * barrier-queue has been drained. Therefore, it is safe to place a barrier and + * exit. The other side can safely wait on the barrier even though the exit + * queued an abortion event. 
Usually, the abortion event would overwrite the + * barrier, however, that's not true for exit-abortion events. Those are only + * queued if the barrier-queue is drained (thus, the receiving side has placed + * more barriers than the remote side). + */ + +/** + * barrier_create() - Initialize a barrier object + * @obj: barrier to initialize + * + * This initializes a barrier object. The caller is responsible of allocating + * the memory and keeping it valid. The memory does not have to be zeroed + * beforehand. + * Two eventfd objects are allocated for each barrier. If allocation fails, an + * error is returned. + * + * If this function fails, the barrier is reset to an invalid state so it is + * safe to call barrier_destroy() on the object regardless whether the + * initialization succeeded or not. + * + * The caller is responsible to destroy the object via barrier_destroy() before + * releasing the underlying memory. + * + * Returns: 0 on success, negative error code on failure. + */ +int barrier_create(Barrier *b) { + _unused_ _cleanup_(barrier_destroyp) Barrier *staging = b; + + assert(b); + + b->me = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (b->me < 0) + return -errno; + + b->them = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (b->them < 0) + return -errno; + + if (pipe2(b->pipe, O_CLOEXEC | O_NONBLOCK) < 0) + return -errno; + + staging = NULL; + return 0; +} + +/** + * barrier_destroy() - Destroy a barrier object + * @b: barrier to destroy or NULL + * + * This destroys a barrier object that has previously been passed to + * barrier_create(). The object is released and reset to invalid + * state. Therefore, it is safe to call barrier_destroy() multiple + * times or even if barrier_create() failed. However, barrier must be + * always initialized with BARRIER_NULL. + * + * If @b is NULL, this is a no-op. 
+ */ +Barrier* barrier_destroy(Barrier *b) { + if (!b) + return NULL; + + b->me = safe_close(b->me); + b->them = safe_close(b->them); + safe_close_pair(b->pipe); + b->barriers = 0; + return NULL; +} + +/** + * barrier_set_role() - Set the local role of the barrier + * @b: barrier to operate on + * @role: role to set on the barrier + * + * This sets the roles on a barrier object. This is needed to know + * which side of the barrier you're on. Usually, the parent creates + * the barrier via barrier_create() and then calls fork() or clone(). + * Therefore, the FDs are duplicated and the child retains the same + * barrier object. + * + * Both sides need to call barrier_set_role() after fork() or clone() + * are done. If this is not done, barriers will not work correctly. + * + * Note that barriers could be supported without fork() or clone(). However, + * this is currently not needed so it hasn't been implemented. + */ +void barrier_set_role(Barrier *b, unsigned role) { + assert(b); + assert(IN_SET(role, BARRIER_PARENT, BARRIER_CHILD)); + /* make sure this is only called once */ + assert(b->pipe[0] >= 0 && b->pipe[1] >= 0); + + if (role == BARRIER_PARENT) + b->pipe[1] = safe_close(b->pipe[1]); + else { + b->pipe[0] = safe_close(b->pipe[0]); + + /* swap me/them for children */ + SWAP_TWO(b->me, b->them); + } +} + +/* places barrier; returns false if we aborted, otherwise true */ +static bool barrier_write(Barrier *b, uint64_t buf) { + ssize_t len; + + /* prevent new sync-points if we already aborted */ + if (barrier_i_aborted(b)) + return false; + + assert(b->me >= 0); + do + len = write(b->me, &buf, sizeof(buf)); + while (len < 0 && ERRNO_IS_TRANSIENT(errno)); + + if (len != sizeof(buf)) + goto error; + + /* lock if we aborted */ + if (buf >= (uint64_t)BARRIER_ABORTION) { + if (barrier_they_aborted(b)) + b->barriers = BARRIER_WE_ABORTED; + else + b->barriers = BARRIER_I_ABORTED; + } else if (!barrier_is_aborted(b)) + b->barriers += buf; + + return 
!barrier_i_aborted(b); + +error: + /* If there is an unexpected error, we have to make this fatal. There + * is no way we can recover from sync-errors. Therefore, we close the + * pipe-ends and treat this as abortion. The other end will notice the + * pipe-close and treat it as abortion, too. */ + + safe_close_pair(b->pipe); + b->barriers = BARRIER_WE_ABORTED; + return false; +} + +/* waits for barriers; returns false if they aborted, otherwise true */ +static bool barrier_read(Barrier *b, int64_t comp) { + if (barrier_they_aborted(b)) + return false; + + while (b->barriers > comp) { + struct pollfd pfd[2] = { + { .fd = b->pipe[0] >= 0 ? b->pipe[0] : b->pipe[1], + .events = POLLHUP }, + { .fd = b->them, + .events = POLLIN }}; + uint64_t buf; + int r; + + r = ppoll_usec(pfd, ELEMENTSOF(pfd), USEC_INFINITY); + if (r == -EINTR) + continue; + if (r < 0) + goto error; + + if (pfd[1].revents) { + ssize_t len; + + /* events on @them signal new data for us */ + len = read(b->them, &buf, sizeof(buf)); + if (len < 0 && ERRNO_IS_TRANSIENT(errno)) + continue; + + if (len != sizeof(buf)) + goto error; + } else if (pfd[0].revents & (POLLHUP | POLLERR | POLLNVAL)) + /* POLLHUP on the pipe tells us the other side exited. + * We treat this as implicit abortion. But we only + * handle it if there's no event on the eventfd. This + * guarantees that exit-abortions do not overwrite real + * barriers. */ + buf = BARRIER_ABORTION; + else + continue; + + /* lock if they aborted */ + if (buf >= (uint64_t)BARRIER_ABORTION) { + if (barrier_i_aborted(b)) + b->barriers = BARRIER_WE_ABORTED; + else + b->barriers = BARRIER_THEY_ABORTED; + } else if (!barrier_is_aborted(b)) + b->barriers -= buf; + } + + return !barrier_they_aborted(b); + +error: + /* If there is an unexpected error, we have to make this fatal. There + * is no way we can recover from sync-errors. Therefore, we close the + * pipe-ends and treat this as abortion. 
The other end will notice the + * pipe-close and treat it as abortion, too. */ + + safe_close_pair(b->pipe); + b->barriers = BARRIER_WE_ABORTED; + return false; +} + +/** + * barrier_place() - Place a new barrier + * @b: barrier object + * + * This places a new barrier on the barrier object. If either side already + * aborted, this is a no-op and returns "false". Otherwise, the barrier is + * placed and this returns "true". + * + * Returns: true if barrier was placed, false if either side aborted. + */ +bool barrier_place(Barrier *b) { + assert(b); + + if (barrier_is_aborted(b)) + return false; + + barrier_write(b, BARRIER_SINGLE); + return true; +} + +/** + * barrier_abort() - Abort the synchronization + * @b: barrier object to abort + * + * This aborts the barrier-synchronization. If barrier_abort() was already + * called on this side, this is a no-op. Otherwise, the barrier is put into the + * ABORT-state and will stay there. The other side is notified about the + * abortion. Any following attempt to place normal barriers or to wait on normal + * barriers will return immediately as "false". + * + * You can wait for the other side to call barrier_abort(), too. Use + * barrier_wait_abortion() for that. + * + * Returns: false if the other side already aborted, true otherwise. + */ +bool barrier_abort(Barrier *b) { + assert(b); + + barrier_write(b, BARRIER_ABORTION); + return !barrier_they_aborted(b); +} + +/** + * barrier_wait_next() - Wait for the next barrier of the other side + * @b: barrier to operate on + * + * This waits until the other side places its next barrier. This is independent + * of any barrier-links and just waits for any next barrier of the other side. + * + * If either side aborted, this returns false. + * + * Returns: false if either side aborted, true otherwise. 
+ */ +bool barrier_wait_next(Barrier *b) { + assert(b); + + if (barrier_is_aborted(b)) + return false; + + barrier_read(b, b->barriers - 1); + return !barrier_is_aborted(b); +} + +/** + * barrier_wait_abortion() - Wait for the other side to abort + * @b: barrier to operate on + * + * This waits until the other side called barrier_abort(). This can be called + * regardless whether the local side already called barrier_abort() or not. + * + * If the other side has already aborted, this returns immediately. + * + * Returns: false if the local side aborted, true otherwise. + */ +bool barrier_wait_abortion(Barrier *b) { + assert(b); + + barrier_read(b, BARRIER_THEY_ABORTED); + return !barrier_i_aborted(b); +} + +/** + * barrier_sync_next() - Wait for the other side to place a next linked barrier + * @b: barrier to operate on + * + * This is like barrier_wait_next() and waits for the other side to call + * barrier_place(). However, this only waits for linked barriers. That means, if + * the other side already placed more barriers than (or as much as) we did, this + * returns immediately instead of waiting. + * + * If either side aborted, this returns false. + * + * Returns: false if either side aborted, true otherwise. + */ +bool barrier_sync_next(Barrier *b) { + assert(b); + + if (barrier_is_aborted(b)) + return false; + + barrier_read(b, MAX((int64_t)0, b->barriers - 1)); + return !barrier_is_aborted(b); +} + +/** + * barrier_sync() - Wait for the other side to place as many barriers as we did + * @b: barrier to operate on + * + * This is like barrier_sync_next() but waits for the other side to call + * barrier_place() as often as we did (in total). If they already placed as much + * as we did (or more), this returns immediately instead of waiting. + * + * If either side aborted, this returns false. + * + * Returns: false if either side aborted, true otherwise. 
+ */ +bool barrier_sync(Barrier *b) { + assert(b); + + if (barrier_is_aborted(b)) + return false; + + barrier_read(b, 0); + return !barrier_is_aborted(b); +} diff --git a/src/shared/barrier.h b/src/shared/barrier.h new file mode 100644 index 0000000..4ee2040 --- /dev/null +++ b/src/shared/barrier.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "macro.h" + +/* See source file for an API description. */ + +typedef struct Barrier Barrier; + +enum { + BARRIER_SINGLE = 1LL, + BARRIER_ABORTION = INT64_MAX, + + /* bias values to store state; keep @WE < @THEY < @I */ + BARRIER_BIAS = INT64_MIN, + BARRIER_WE_ABORTED = BARRIER_BIAS + 1LL, + BARRIER_THEY_ABORTED = BARRIER_BIAS + 2LL, + BARRIER_I_ABORTED = BARRIER_BIAS + 3LL, +}; + +enum { + BARRIER_PARENT, + BARRIER_CHILD, +}; + +struct Barrier { + int me; + int them; + int pipe[2]; + int64_t barriers; +}; + +#define BARRIER_NULL {-EBADF, -EBADF, {-EBADF, -EBADF}, 0} + +int barrier_create(Barrier *obj); +Barrier* barrier_destroy(Barrier *b); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Barrier*, barrier_destroy); + +void barrier_set_role(Barrier *b, unsigned role); + +bool barrier_place(Barrier *b); +bool barrier_abort(Barrier *b); + +bool barrier_wait_next(Barrier *b); +bool barrier_wait_abortion(Barrier *b); +bool barrier_sync_next(Barrier *b); +bool barrier_sync(Barrier *b); + +static inline bool barrier_i_aborted(Barrier *b) { + return IN_SET(b->barriers, BARRIER_I_ABORTED, BARRIER_WE_ABORTED); +} + +static inline bool barrier_they_aborted(Barrier *b) { + return IN_SET(b->barriers, BARRIER_THEY_ABORTED, BARRIER_WE_ABORTED); +} + +static inline bool barrier_we_aborted(Barrier *b) { + return b->barriers == BARRIER_WE_ABORTED; +} + +static inline bool barrier_is_aborted(Barrier *b) { + return IN_SET(b->barriers, + BARRIER_I_ABORTED, BARRIER_THEY_ABORTED, BARRIER_WE_ABORTED); +} + +static inline bool barrier_place_and_sync(Barrier *b) { + (void) 
barrier_place(b); + return barrier_sync(b); +} diff --git a/src/shared/base-filesystem.c b/src/shared/base-filesystem.c new file mode 100644 index 0000000..569ef46 --- /dev/null +++ b/src/shared/base-filesystem.c @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "architecture.h" +#include "base-filesystem.h" +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "string-util.h" +#include "umask-util.h" +#include "user-util.h" + +typedef struct BaseFilesystem { + const char *dir; /* directory or symlink to create */ + mode_t mode; + const char *target; /* if non-NULL create as symlink to this target */ + const char *exists; /* conditionalize this entry on existence of this file */ + bool ignore_failure; +} BaseFilesystem; + +static const BaseFilesystem table[] = { + { "bin", 0, "usr/bin\0", NULL }, + { "lib", 0, "usr/lib\0", NULL }, + { "root", 0750, NULL, NULL, true }, + { "sbin", 0, "usr/sbin\0", NULL }, + { "usr", 0755, NULL, NULL }, + { "var", 0755, NULL, NULL }, + { "etc", 0755, NULL, NULL }, + { "proc", 0555, NULL, NULL, true }, + { "sys", 0555, NULL, NULL, true }, + { "dev", 0555, NULL, NULL, true }, + { "run", 0555, NULL, NULL, true }, + /* We don't add /tmp/ here for now (even though it's necessary for regular operation), because we + * want to support both cases where /tmp/ is a mount of its own (in which case we probably should set + * the mode to 1555, to indicate that no one should write to it, not even root) and when it's part of + * the rootfs (in which case we should set mode 1777), and we simply don't know what's right. */ + + /* Various architecture ABIs define the path to the dynamic loader via the /lib64/ subdirectory of + * the root directory. 
When booting from an otherwise empty root file system (where only /usr/ has + * been mounted into) it is thus necessary to create a symlink pointing to the right subdirectory of + * /usr/ first — otherwise we couldn't invoke any dynamic binary. Let's detect this case here, and + * create the symlink as needed should it be missing. We prefer doing this consistently with Debian's + * multiarch logic, but support Fedora-style and Arch-style multilib too. */ +#if defined(__aarch64__) + /* aarch64 ELF ABI actually says dynamic loader is in /lib/, but Fedora puts it in /lib64/ anyway and + * just symlinks /lib/ld-linux-aarch64.so.1 to ../lib64/ld-linux-aarch64.so.1. For this to work + * correctly, /lib64/ must be symlinked to /usr/lib64/. */ + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-linux-aarch64.so.1" }, +# define KNOW_LIB64_DIRS 1 +#elif defined(__alpha__) +#elif defined(__arc__) || defined(__tilegx__) +#elif defined(__arm__) + /* No /lib64 on arm. The linker is /lib/ld-linux-armhf.so.3. */ +# define KNOW_LIB64_DIRS 1 +#elif defined(__i386__) || defined(__x86_64__) + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-linux-x86-64.so.2" }, +# define KNOW_LIB64_DIRS 1 +#elif defined(__ia64__) +#elif defined(__loongarch_lp64) +# define KNOW_LIB64_DIRS 1 +# if defined(__loongarch_double_float) + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-linux-loongarch-lp64d.so.1" }, +# elif defined(__loongarch_single_float) + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-linux-loongarch-lp64f.so.1" }, +# elif defined(__loongarch_soft_float) + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-linux-loongarch-lp64s.so.1" }, +# else +# error "Unknown LoongArch ABI" +# endif +#elif defined(__m68k__) + /* No link needed. 
*/ +# define KNOW_LIB64_DIRS 1 +#elif defined(_MIPS_SIM) +# if _MIPS_SIM == _MIPS_SIM_ABI32 +# elif _MIPS_SIM == _MIPS_SIM_NABI32 +# elif _MIPS_SIM == _MIPS_SIM_ABI64 +# else +# error "Unknown MIPS ABI" +# endif +#elif defined(__powerpc__) +# if defined(__PPC64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld64.so.2" }, +# define KNOW_LIB64_DIRS 1 +# elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* powerpc64-linux-gnu */ +# else + /* powerpc-linux-gnu */ +# endif +#elif defined(__riscv) +# if __riscv_xlen == 32 +# elif __riscv_xlen == 64 + /* Same situation as for aarch64 */ + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-linux-riscv64-lp64d.so.1" }, +# define KNOW_LIB64_DIRS 1 +# else +# error "Unknown RISC-V ABI" +# endif +#elif defined(__s390__) + /* s390-linux-gnu */ +#elif defined(__s390x__) + { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0" + "usr/lib64\0" + "usr/lib\0", "ld-lsb-s390x.so.3" }, +# define KNOW_LIB64_DIRS 1 +#elif defined(__sparc__) +#endif + /* gcc doesn't allow pragma to be used within constructs, hence log about this separately below */ +}; + +#ifndef KNOW_LIB64_DIRS +# pragma message "Please add an entry above specifying whether your architecture uses /lib64/, /lib32/, or no such links." +#endif + +int base_filesystem_create_fd(int fd, const char *root, uid_t uid, gid_t gid) { + int r; + + assert(fd >= 0); + assert(root); + + /* The "root" parameter is decoration only – it's only used as part of log messages */ + + for (size_t i = 0; i < ELEMENTSOF(table); i++) { + if (faccessat(fd, table[i].dir, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) + continue; + + if (table[i].target) { /* Create as symlink? 
*/ + const char *target = NULL; + + /* check if one of the targets exists */ + NULSTR_FOREACH(s, table[i].target) { + if (faccessat(fd, s, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + continue; + + /* check if a specific file exists at the target path */ + if (table[i].exists) { + _cleanup_free_ char *p = NULL; + + p = path_join(s, table[i].exists); + if (!p) + return log_oom(); + + if (faccessat(fd, p, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + continue; + } + + target = s; + break; + } + + if (!target) + continue; + + r = RET_NERRNO(symlinkat(target, fd, table[i].dir)); + } else { + /* Create as directory. */ + WITH_UMASK(0000) + r = RET_NERRNO(mkdirat(fd, table[i].dir, table[i].mode)); + } + if (r < 0) { + bool ignore = IN_SET(r, -EEXIST, -EROFS) || table[i].ignore_failure; + log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r, + "Failed to create %s/%s: %m", root, table[i].dir); + if (ignore) + continue; + + return r; + } + + if (uid_is_valid(uid) || gid_is_valid(gid)) + if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0) + return log_error_errno(errno, "Failed to chown %s/%s: %m", root, table[i].dir); + } + + return 0; +} + +int base_filesystem_create(const char *root, uid_t uid, gid_t gid) { + _cleanup_close_ int fd = -EBADF; + + fd = open(ASSERT_PTR(root), O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open root file system: %m"); + + return base_filesystem_create_fd(fd, root, uid, gid); +} diff --git a/src/shared/base-filesystem.h b/src/shared/base-filesystem.h new file mode 100644 index 0000000..a1ccf45 --- /dev/null +++ b/src/shared/base-filesystem.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int base_filesystem_create_fd(int fd, const char *root, uid_t uid, gid_t gid); +int base_filesystem_create(const char *root, uid_t uid, gid_t gid); diff --git a/src/shared/battery-util.c b/src/shared/battery-util.c new file mode 100644 index 0000000..37b3f6a --- /dev/null +++ 
b/src/shared/battery-util.c
@@ -0,0 +1,283 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-device.h"
+
+#include "device-private.h"
+#include "device-util.h"
+#include "string-util.h"
+#include "battery-util.h"
+
+#define BATTERY_LOW_CAPACITY_LEVEL 5 /* percent; a battery at or below this level counts as low */
+
+static int device_is_power_sink(sd_device *device) { /* > 0: sink (or assumed sink), 0: source only, < 0: error */
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        bool found_source = false, found_sink = false;
+        sd_device *parent;
+        int r;
+
+        assert(device);
+
+        /* A USB-C power supply device takes one of two power roles: source or sink. See:
+         * https://docs.kernel.org/admin-guide/abi-testing.html#abi-file-testing-sysfs-class-typec */
+
+        r = sd_device_enumerator_new(&e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(e, "typec", true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_get_parent(device, &parent);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_parent(e, parent); /* only "typec" devices below our parent */
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE(e, d) {
+                const char *val;
+
+                r = sd_device_get_sysattr_value(d, "power_role", &val);
+                if (r < 0) {
+                        if (r != -ENOENT)
+                                log_device_debug_errno(d, r, "Failed to read 'power_role' sysfs attribute, ignoring: %m");
+                        continue;
+                }
+
+                if (strstr(val, "[source]")) { /* the role in effect is the one shown in brackets */
+                        found_source = true;
+                        log_device_debug(d, "The USB type-C port is in power source mode.");
+                } else if (strstr(val, "[sink]")) {
+                        found_sink = true;
+                        log_device_debug(d, "The USB type-C port is in power sink mode.");
+                }
+        }
+
+        if (found_sink)
+                log_device_debug(device, "The USB type-C device has at least one port in power sink mode.");
+        else if (!found_source)
+                log_device_debug(device, "The USB type-C device has no port in power source mode, assuming the device is in power sink mode.");
+        else
+                log_device_debug(device, "All USB type-C ports are in power source mode.");
+
+        return found_sink || !found_source; /* no port reporting "[source]" is treated as sink, too */
+}
+
+static bool battery_is_discharging(sd_device *d) {
+        const char *val;
+        int r;
+
+        assert(d);
+
+        r = sd_device_get_sysattr_value(d, "scope", &val);
+        if (r < 0) {
+                if (r != -ENOENT)
+                        log_device_debug_errno(d, r, "Failed to read 'scope' sysfs attribute, ignoring: %m");
+        } else if (streq(val, "Device")) {
+                log_device_debug(d, "The power supply is a device battery, ignoring device.");
+                return false;
+        }
+
+        r = device_get_sysattr_bool(d, "present");
+        if (r < 0)
+                log_device_debug_errno(d, r, "Failed to read 'present' sysfs attribute, assuming the battery is present: %m");
+        else if (r == 0) {
+                log_device_debug(d, "The battery is not present, ignoring the power supply.");
+                return false;
+        }
+
+        /* Possible values: "Unknown", "Charging", "Discharging", "Not charging", "Full" */
+        r = sd_device_get_sysattr_value(d, "status", &val);
+        if (r < 0) {
+                log_device_debug_errno(d, r, "Failed to read 'status' sysfs attribute, assuming the battery is discharging: %m");
+                return true;
+        }
+        if (!streq(val, "Discharging")) {
+                log_device_debug(d, "The battery status is '%s', assuming the battery is not used as a power source of this machine.", val);
+                return false;
+        }
+
+        return true; /* status is "Discharging" */
+}
+
+int on_ac_power(void) { /* > 0: on AC (or nothing indicates otherwise), 0: on battery, < 0: enumerator setup failed */
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        bool found_ac_online = false, found_discharging_battery = false;
+        int r;
+
+        r = sd_device_enumerator_new(&e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(e, "power_supply", true);
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE(e, d) {
+                /* See
+                 * https://github.com/torvalds/linux/blob/4eef766b7d4d88f0b984781bc1bcb574a6eafdc7/include/linux/power_supply.h#L176
+                 * for defined power source types. Also see:
+                 * https://docs.kernel.org/admin-guide/abi-testing.html#abi-file-testing-sysfs-class-power */
+
+                const char *val;
+                r = sd_device_get_sysattr_value(d, "type", &val);
+                if (r < 0) {
+                        log_device_debug_errno(d, r, "Failed to read 'type' sysfs attribute, ignoring device: %m");
+                        continue;
+                }
+
+                /* Ignore USB-C power supply in source mode. See issue #21988. */
+                if (streq(val, "USB")) {
+                        r = device_is_power_sink(d);
+                        if (r <= 0) {
+                                if (r < 0)
+                                        log_device_debug_errno(d, r, "Failed to determine the current power role, ignoring device: %m");
+                                else
+                                        log_device_debug(d, "USB power supply is in source mode, ignoring device.");
+                                continue;
+                        }
+                }
+
+                if (streq(val, "Battery")) { /* batteries never count as AC; only note whether one is discharging */
+                        if (battery_is_discharging(d)) {
+                                found_discharging_battery = true;
+                                log_device_debug(d, "The power supply is a battery and currently discharging.");
+                        }
+                        continue;
+                }
+
+                r = device_get_sysattr_unsigned(d, "online", NULL);
+                if (r < 0) {
+                        log_device_debug_errno(d, r, "Failed to query 'online' sysfs attribute, ignoring device: %m");
+                        continue;
+                } else if (r > 0) /* At least 1 and 2 are defined as different types of 'online' */
+                        found_ac_online = true;
+
+                log_device_debug(d, "The power supply is currently %s.", r > 0 ? "online" : "offline");
+        }
+
+        if (found_ac_online) {
+                log_debug("Found at least one online non-battery power supply, system is running on AC.");
+                return true;
+        } else if (found_discharging_battery) {
+                log_debug("Found at least one discharging battery and no online power sources, assuming system is running from battery.");
+                return false;
+        } else {
+                log_debug("No power supply reported online and no discharging battery found, assuming system is running on AC.");
+                return true;
+        }
+}
+
+/* Creates an enumerator for "power_supply" devices that are present batteries and not of scope "Device" */
+int battery_enumerator_new(sd_device_enumerator **ret) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        int r;
+
+        assert(ret);
+
+        r = sd_device_enumerator_new(&e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(e, "power_supply", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(e);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_sysattr(e, "type", "Battery", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_sysattr(e, "present", "1", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_sysattr(e, "scope", "Device", /* match = */ false);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(e); /* hand the enumerator over to the caller */
+        return 0;
+}
+
+/* Reads the 'capacity' sysfs attribute and returns it if it is in the range 0-100, and fails otherwise */
+int battery_read_capacity_percentage(sd_device *dev) {
+        int battery_capacity, r;
+
+        assert(dev);
+
+        r = device_get_sysattr_int(dev, "capacity", &battery_capacity);
+        if (r < 0)
+                return log_device_debug_errno(dev, r, "Failed to read/parse POWER_SUPPLY_CAPACITY: %m");
+
+        if (battery_capacity < 0 || battery_capacity > 100)
+                return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ERANGE), "Invalid battery capacity: %d", battery_capacity);
+
+        return battery_capacity;
+}
+
+/* Returns true only if we're not on AC power, at least one battery is <= 5%, and none is above that or unreadable */
+int battery_is_discharging_and_low(void) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+        bool unsure = false, found_low = false;
+        int r;
+
+        /* We do not use the battery's capacity_level here, since its value is set to full or Normal
+         * when ACPI is not working properly. If there is no battery, 0 is returned, and the system
+         * will be suspended for the first cycle and then hibernated. */
+
+        r = on_ac_power();
+        if (r < 0)
+                log_warning_errno(r, "Failed to check if the system is running on AC, assuming it is not: %m");
+        if (r > 0)
+                return false;
+
+        r = battery_enumerator_new(&e);
+        if (r < 0)
+                return log_error_errno(r, "Failed to initialize battery enumerator: %m");
+
+        FOREACH_DEVICE(e, dev) {
+                int level;
+
+                level = battery_read_capacity_percentage(dev);
+                if (level < 0) {
+                        unsure = true; /* keep scanning: another battery may still turn out to be charged */
+                        continue;
+                }
+
+                if (level > BATTERY_LOW_CAPACITY_LEVEL) { /* Found a charged battery */
+                        log_device_full(dev,
+                                        found_low ? LOG_INFO : LOG_DEBUG,
+                                        "Found battery with capacity above threshold (%d%% > %d%%).",
+                                        level, BATTERY_LOW_CAPACITY_LEVEL);
+                        return false;
+                }
+
+                log_device_info(dev,
+                                "Found battery with capacity below threshold (%d%% <= %d%%).",
+                                level, BATTERY_LOW_CAPACITY_LEVEL);
+                found_low = true;
+        }
+
+        /* If we found a battery whose state we couldn't read, don't assume we are in low battery state */
+        if (unsure) {
+                log_notice("Found battery with unreadable state, assuming not in low battery state.");
+                return false;
+        }
+
+        /* If found neither charged nor low batteries, assume that we aren't in low battery state */
+        return found_low;
+}
diff --git a/src/shared/battery-util.h b/src/shared/battery-util.h
new file mode 100644
index 0000000..c58f30b
--- /dev/null
+++ b/src/shared/battery-util.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-device.h"
+
+int on_ac_power(void);
+
+int battery_is_discharging_and_low(void);
+
+int battery_enumerator_new(sd_device_enumerator **ret);
+int
battery_read_capacity_percentage(sd_device *dev);
diff --git a/src/shared/binfmt-util.c b/src/shared/binfmt-util.c
new file mode 100644
index 0000000..a261754
--- /dev/null
+++ b/src/shared/binfmt-util.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+#include
+#include
+
+#include "binfmt-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "missing_magic.h"
+#include "stat-util.h"
+
+int binfmt_mounted(void) { /* > 0: mounted and writable, 0: absent, another fs, or not writable, < 0: error */
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        fd = RET_NERRNO(open("/proc/sys/fs/binfmt_misc", O_CLOEXEC | O_DIRECTORY | O_PATH));
+        if (fd == -ENOENT) /* the directory does not exist at all */
+                return false;
+        if (fd < 0)
+                return fd;
+
+        r = fd_is_fs_type(fd, BINFMTFS_MAGIC);
+        if (r <= 0) /* error, or some file system other than binfmt_misc */
+                return r;
+
+        return access_fd(fd, W_OK) >= 0; /* only a writable mount counts as mounted */
+}
+
+int disable_binfmt(void) {
+        int r;
+
+        /* Flush out all rules. This is important during shutdown to cover for rules using "F", since those
+         * might pin a file and thus block us from unmounting stuff cleanly.
+         *
+         * We are a bit careful here, since binfmt_misc might still be an autofs which we don't want to
+         * trigger. */
+
+        r = binfmt_mounted();
+        if (r < 0)
+                return log_warning_errno(r, "Failed to determine whether binfmt_misc is mounted: %m");
+        if (r == 0) {
+                log_debug("binfmt_misc is not mounted in read-write mode, not detaching entries.");
+                return 0; /* nothing to flush, which is not an error */
+        }
+
+        r = write_string_file("/proc/sys/fs/binfmt_misc/status", "-1", WRITE_STRING_FILE_DISABLE_BUFFER); /* "-1" unregisters all entries */
+        if (r < 0)
+                return log_warning_errno(r, "Failed to unregister binfmt_misc entries: %m");
+
+        log_debug("Unregistered all remaining binfmt_misc entries.");
+        return 0;
+}
diff --git a/src/shared/binfmt-util.h b/src/shared/binfmt-util.h
new file mode 100644
index 0000000..13f4548
--- /dev/null
+++ b/src/shared/binfmt-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int binfmt_mounted(void); /* is binfmt_misc mounted in read-write mode? */
+int disable_binfmt(void); /* unregisters all binfmt_misc entries, if it is */
diff --git a/src/shared/bitmap.c b/src/shared/bitmap.c
new file mode 100644
index 0000000..6cf08b8
--- /dev/null
+++ b/src/shared/bitmap.c
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "alloc-util.h"
+#include "bitmap.h"
+#include "hashmap.h"
+#include "macro.h"
+#include "memory-util.h"
+
+/* Bitmaps are only meant to store relatively small numbers
+ * (corresponding to, say, an enum), so it is ok to limit
+ * the max entry. 64k should be plenty.
*/
+#define BITMAPS_MAX_ENTRY 0xffff /* largest bit number bitmap_set() accepts */
+
+/* This indicates that we reached the end of the bitmap */
+#define BITMAP_END (UINT_MAX)
+
+#define BITMAP_NUM_TO_OFFSET(n) ((n) / (sizeof(uint64_t) * 8)) /* index of the 64-bit word holding bit n */
+#define BITMAP_NUM_TO_REM(n) ((n) % (sizeof(uint64_t) * 8)) /* position of bit n inside that word */
+#define BITMAP_OFFSET_TO_NUM(offset, rem) ((offset) * sizeof(uint64_t) * 8 + (rem)) /* inverse of the two above */
+
+Bitmap* bitmap_new(void) {
+        return new0(Bitmap, 1);
+}
+
+Bitmap* bitmap_copy(Bitmap *b) { /* NOTE(review): dereferences b without a NULL check, unlike most siblings */
+        Bitmap *ret;
+
+        ret = bitmap_new();
+        if (!ret)
+                return NULL;
+
+        ret->bitmaps = newdup(uint64_t, b->bitmaps, b->n_bitmaps);
+        if (!ret->bitmaps)
+                return mfree(ret);
+
+        ret->n_bitmaps = b->n_bitmaps;
+        return ret;
+}
+
+Bitmap* bitmap_free(Bitmap *b) {
+        if (!b)
+                return NULL;
+
+        free(b->bitmaps);
+        return mfree(b);
+}
+
+int bitmap_ensure_allocated(Bitmap **b) {
+        Bitmap *a;
+
+        assert(b);
+
+        if (*b) /* already allocated, nothing to do */
+                return 0;
+
+        a = bitmap_new();
+        if (!a)
+                return -ENOMEM;
+
+        *b = a;
+
+        return 0;
+}
+
+int bitmap_set(Bitmap *b, unsigned n) {
+        uint64_t bitmask;
+        unsigned offset;
+
+        assert(b);
+
+        /* we refuse to allocate huge bitmaps */
+        if (n > BITMAPS_MAX_ENTRY)
+                return -ERANGE;
+
+        offset = BITMAP_NUM_TO_OFFSET(n);
+
+        if (offset >= b->n_bitmaps) { /* the word for bit n is not part of the array yet */
+                if (!GREEDY_REALLOC0(b->bitmaps, offset + 1))
+                        return -ENOMEM;
+
+                b->n_bitmaps = offset + 1;
+        }
+
+        bitmask = UINT64_C(1) << BITMAP_NUM_TO_REM(n);
+
+        b->bitmaps[offset] |= bitmask;
+
+        return 0;
+}
+
+void bitmap_unset(Bitmap *b, unsigned n) {
+        uint64_t bitmask;
+        unsigned offset;
+
+        if (!b)
+                return;
+
+        offset = BITMAP_NUM_TO_OFFSET(n);
+
+        if (offset >= b->n_bitmaps) /* beyond the stored words, hence not set anyway */
+                return;
+
+        bitmask = UINT64_C(1) << BITMAP_NUM_TO_REM(n);
+
+        b->bitmaps[offset] &= ~bitmask;
+}
+
+bool bitmap_isset(const Bitmap *b, unsigned n) {
+        uint64_t bitmask;
+        unsigned offset;
+
+        if (!b) /* a NULL bitmap has no bits set */
+                return false;
+
+        offset = BITMAP_NUM_TO_OFFSET(n);
+
+        if (offset >= b->n_bitmaps)
+                return false;
+
+        bitmask = UINT64_C(1) << BITMAP_NUM_TO_REM(n);
+
+        return !!(b->bitmaps[offset] & bitmask);
+}
+
+bool bitmap_isclear(const Bitmap *b) {
+        unsigned i;
+
+        if (!b)
+                return true;
+
+        for (i = 0; i < b->n_bitmaps; i++)
+                if (b->bitmaps[i] != 0)
+                        return false;
+
+        return true;
+}
+
+void bitmap_clear(Bitmap *b) {
+        if (!b)
+                return;
+
+        b->bitmaps = mfree(b->bitmaps);
+        b->n_bitmaps = 0;
+}
+
+bool bitmap_iterate(const Bitmap *b, Iterator *i, unsigned *n) { /* stores the next set bit in *n; false once there is none */
+        uint64_t bitmask;
+        unsigned offset, rem;
+
+        assert(i);
+        assert(n);
+
+        if (!b || i->idx == BITMAP_END) /* no bitmap, or an earlier call already hit the end */
+                return false;
+
+        offset = BITMAP_NUM_TO_OFFSET(i->idx);
+        rem = BITMAP_NUM_TO_REM(i->idx);
+        bitmask = UINT64_C(1) << rem;
+
+        for (; offset < b->n_bitmaps; offset ++) {
+                if (b->bitmaps[offset]) {
+                        for (; bitmask; bitmask <<= 1, rem ++) { /* ends when the mask is shifted out of the word */
+                                if (b->bitmaps[offset] & bitmask) {
+                                        *n = BITMAP_OFFSET_TO_NUM(offset, rem);
+                                        i->idx = *n + 1; /* the next call resumes right after this bit */
+
+                                        return true;
+                                }
+                        }
+                }
+
+                rem = 0; /* every following word is scanned from its first bit */
+                bitmask = 1;
+        }
+
+        i->idx = BITMAP_END; /* remember that the iteration is finished */
+
+        return false;
+}
+
+bool bitmap_equal(const Bitmap *a, const Bitmap *b) {
+        size_t common_n_bitmaps;
+        const Bitmap *c;
+        unsigned i;
+
+        if (a == b)
+                return true;
+
+        if (!a != !b) /* exactly one of the two is NULL */
+                return false;
+
+        if (!a)
+                return true;
+
+        common_n_bitmaps = MIN(a->n_bitmaps, b->n_bitmaps);
+        if (memcmp_safe(a->bitmaps, b->bitmaps, sizeof(uint64_t) * common_n_bitmaps) != 0)
+                return false;
+
+        c = a->n_bitmaps > b->n_bitmaps ? a : b; /* the longer one: its extra words must all be zero */
+        for (i = common_n_bitmaps; i < c->n_bitmaps; i++)
+                if (c->bitmaps[i] != 0)
+                        return false;
+
+        return true;
+}
diff --git a/src/shared/bitmap.h b/src/shared/bitmap.h
new file mode 100644
index 0000000..e77e2e1
--- /dev/null
+++ b/src/shared/bitmap.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include
+
+#include "hashmap.h"
+#include "macro.h"
+
+typedef struct Bitmap {
+        uint64_t *bitmaps; /* array of 64-bit words, one bit per entry */
+        size_t n_bitmaps; /* number of words in 'bitmaps' */
+} Bitmap;
+
+Bitmap* bitmap_new(void);
+Bitmap* bitmap_copy(Bitmap *b);
+int bitmap_ensure_allocated(Bitmap **b);
+Bitmap* bitmap_free(Bitmap *b);
+
+int bitmap_set(Bitmap *b, unsigned n);
+void bitmap_unset(Bitmap *b, unsigned n);
+bool bitmap_isset(const Bitmap *b, unsigned n);
+bool bitmap_isclear(const Bitmap *b);
+void bitmap_clear(Bitmap *b);
+
+bool bitmap_iterate(const Bitmap *b, Iterator *i, unsigned *n);
+
+bool bitmap_equal(const Bitmap *a, const Bitmap *b);
+
+#define _BITMAP_FOREACH(n, b, i) \
+        for (Iterator i = {}; bitmap_iterate((b), &i, (unsigned*)&(n)); )
+#define BITMAP_FOREACH(n, b) \
+        _BITMAP_FOREACH(n, b, UNIQ_T(i, UNIQ))
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Bitmap*, bitmap_free);
+
+#define _cleanup_bitmap_free_ _cleanup_(bitmap_freep)
diff --git a/src/shared/blkid-util.h b/src/shared/blkid-util.h
new file mode 100644
index 0000000..abc4b61
--- /dev/null
+++ b/src/shared/blkid-util.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_BLKID
+# include
+
+# include "sd-id128.h"
+
+# include "macro.h"
+# include "string-util.h"
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(blkid_probe, blkid_free_probe, NULL);
+
+static inline int blkid_partition_get_uuid_id128(blkid_partition p, sd_id128_t *ret) {
+        const char *s;
+
+        assert(p);
+
+        s = blkid_partition_get_uuid(p);
+        if (isempty(s))
+                return -ENXIO;
+
+        return sd_id128_from_string(s, ret);
+}
+
+static inline int blkid_partition_get_type_id128(blkid_partition p, sd_id128_t *ret) {
+        const char *s;
+
+
assert(p); + + s = blkid_partition_get_type_string(p); + if (isempty(s)) + return -ENXIO; + + return sd_id128_from_string(s, ret); +} + +/* Define symbolic names for blkid_do_safeprobe() return values, since blkid only uses literal numbers. We + * prefix these symbolic definitions with underscores, to not invade libblkid's namespace needlessly. */ +enum { + _BLKID_SAFEPROBE_FOUND = 0, + _BLKID_SAFEPROBE_NOT_FOUND = 1, + _BLKID_SAFEPROBE_AMBIGUOUS = -2, + _BLKID_SAFEPROBE_ERROR = -1, +}; + +#endif diff --git a/src/shared/blockdev-util.c b/src/shared/blockdev-util.c new file mode 100644 index 0000000..c906aec --- /dev/null +++ b/src/shared/blockdev-util.c @@ -0,0 +1,828 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "missing_magic.h" +#include "parse-util.h" + +static int fd_get_devnum(int fd, BlockDeviceLookupFlag flags, dev_t *ret) { + struct stat st; + dev_t devnum; + int r; + + assert(fd >= 0); + assert(ret); + + if (fstat(fd, &st) < 0) + return -errno; + + if (S_ISBLK(st.st_mode)) + devnum = st.st_rdev; + else if (!FLAGS_SET(flags, BLOCK_DEVICE_LOOKUP_BACKING)) + return -ENOTBLK; + else if (!S_ISREG(st.st_mode) && !S_ISDIR(st.st_mode)) + return -ENOTBLK; + else if (major(st.st_dev) != 0) + devnum = st.st_dev; + else { + /* If major(st.st_dev) is zero, this might mean we are backed by btrfs, which needs special + * handing, to get the backing device node. 
*/ + + r = btrfs_get_block_device_fd(fd, &devnum); + if (r == -ENOTTY) /* not btrfs */ + return -ENOTBLK; + if (r < 0) + return r; + } + + *ret = devnum; + return 0; +} + +int block_device_is_whole_disk(sd_device *dev) { + const char *s; + int r; + + assert(dev); + + r = sd_device_get_subsystem(dev, &s); + if (r < 0) + return r; + + if (!streq(s, "block")) + return -ENOTBLK; + + r = sd_device_get_devtype(dev, &s); + if (r < 0) + return r; + + return streq(s, "disk"); +} + +int block_device_get_whole_disk(sd_device *dev, sd_device **ret) { + int r; + + assert(dev); + assert(ret); + + /* Do not unref returned sd_device object. */ + + r = block_device_is_whole_disk(dev); + if (r < 0) + return r; + if (r == 0) { + r = sd_device_get_parent(dev, &dev); + if (r == -ENOENT) /* Already removed? Let's return a recognizable error. */ + return -ENODEV; + if (r < 0) + return r; + + r = block_device_is_whole_disk(dev); + if (r < 0) + return r; + if (r == 0) + return -ENXIO; + } + + *ret = dev; + return 0; +} + +int block_device_get_originating(sd_device *dev, sd_device **ret) { + _cleanup_(sd_device_unrefp) sd_device *first_found = NULL; + const char *suffix; + dev_t devnum = 0; /* avoid false maybe-uninitialized warning */ + + /* For the specified block device tries to chase it through the layers, in case LUKS-style DM + * stacking is used, trying to find the next underlying layer. */ + + assert(dev); + assert(ret); + + FOREACH_DEVICE_CHILD_WITH_SUFFIX(dev, child, suffix) { + sd_device *child_whole_disk; + dev_t n; + + if (!path_startswith(suffix, "slaves")) + continue; + + if (block_device_get_whole_disk(child, &child_whole_disk) < 0) + continue; + + if (sd_device_get_devnum(child_whole_disk, &n) < 0) + continue; + + if (!first_found) { + first_found = sd_device_ref(child); + devnum = n; + continue; + } + + /* We found a device backed by multiple other devices. We don't really support automatic + * discovery on such setups, with the exception of dm-verity partitions. 
In this case there + * are two backing devices: the data partition and the hash partition. We are fine with such + * setups, however, only if both partitions are on the same physical device. Hence, let's + * verify this by iterating over every node in the 'slaves/' directory and comparing them with + * the first that gets returned by readdir(), to ensure they all point to the same device. */ + if (n != devnum) + return -ENOTUNIQ; + } + + if (!first_found) + return -ENOENT; + + *ret = TAKE_PTR(first_found); + return 1; /* found */ +} + +int block_device_new_from_fd(int fd, BlockDeviceLookupFlag flags, sd_device **ret) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + dev_t devnum; + int r; + + assert(fd >= 0); + assert(ret); + + r = fd_get_devnum(fd, flags, &devnum); + if (r < 0) + return r; + + r = sd_device_new_from_devnum(&dev, 'b', devnum); + if (r < 0) + return r; + + if (FLAGS_SET(flags, BLOCK_DEVICE_LOOKUP_ORIGINATING)) { + _cleanup_(sd_device_unrefp) sd_device *dev_origin = NULL; + sd_device *dev_whole_disk; + + r = block_device_get_whole_disk(dev, &dev_whole_disk); + if (r < 0) + return r; + + r = block_device_get_originating(dev_whole_disk, &dev_origin); + if (r < 0 && r != -ENOENT) + return r; + if (r > 0) + device_unref_and_replace(dev, dev_origin); + } + + if (FLAGS_SET(flags, BLOCK_DEVICE_LOOKUP_WHOLE_DISK)) { + sd_device *dev_whole_disk; + + r = block_device_get_whole_disk(dev, &dev_whole_disk); + if (r < 0) + return r; + + *ret = sd_device_ref(dev_whole_disk); + return 0; + } + + *ret = sd_device_ref(dev); + return 0; +} + +int block_device_new_from_path(const char *path, BlockDeviceLookupFlag flags, sd_device **ret) { + _cleanup_close_ int fd = -EBADF; + + assert(path); + assert(ret); + + fd = open(path, O_CLOEXEC|O_PATH); + if (fd < 0) + return -errno; + + return block_device_new_from_fd(fd, flags, ret); +} + +int block_get_whole_disk(dev_t d, dev_t *ret) { + char p[SYS_BLOCK_PATH_MAX("/partition")]; + _cleanup_free_ char *s = NULL; + dev_t 
devt; + int r; + + assert(ret); + + if (major(d) == 0) + return -ENODEV; + + /* If it has a queue this is good enough for us */ + xsprintf_sys_block_path(p, "/queue", d); + if (access(p, F_OK) >= 0) { + *ret = d; + return 0; + } + if (errno != ENOENT) + return -errno; + + /* If it is a partition find the originating device */ + xsprintf_sys_block_path(p, "/partition", d); + if (access(p, F_OK) < 0) + return -errno; + + /* Get parent dev_t */ + xsprintf_sys_block_path(p, "/../dev", d); + r = read_one_line_file(p, &s); + if (r < 0) + return r; + + r = parse_devnum(s, &devt); + if (r < 0) + return r; + + /* Only return this if it is really good enough for us. */ + xsprintf_sys_block_path(p, "/queue", devt); + if (access(p, F_OK) < 0) + return -errno; + + *ret = devt; + return 1; +} + +int get_block_device_fd(int fd, dev_t *ret) { + struct stat st; + int r; + + assert(fd >= 0); + assert(ret); + + /* Gets the block device directly backing a file system. If the block device is encrypted, returns + * the device mapper block device. 
*/ + + if (fstat(fd, &st)) + return -errno; + + if (major(st.st_dev) != 0) { + *ret = st.st_dev; + return 1; + } + + r = btrfs_get_block_device_fd(fd, ret); + if (r > 0) + return 1; + if (r != -ENOTTY) /* not btrfs */ + return r; + + *ret = 0; + return 0; +} + +int get_block_device(const char *path, dev_t *ret) { + _cleanup_close_ int fd = -EBADF; + + assert(path); + assert(ret); + + fd = open(path, O_RDONLY|O_NOFOLLOW|O_CLOEXEC); + if (fd < 0) + return -errno; + + return get_block_device_fd(fd, ret); +} + +int block_get_originating(dev_t dt, dev_t *ret) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL, *origin = NULL; + int r; + + assert(ret); + + r = sd_device_new_from_devnum(&dev, 'b', dt); + if (r < 0) + return r; + + r = block_device_get_originating(dev, &origin); + if (r < 0) + return r; + + return sd_device_get_devnum(origin, ret); +} + +int get_block_device_harder_fd(int fd, dev_t *ret) { + int r; + + assert(fd >= 0); + assert(ret); + + /* Gets the backing block device for a file system, and handles LUKS encrypted file systems, looking for its + * immediate parent, if there is one. 
*/ + + r = get_block_device_fd(fd, ret); + if (r <= 0) + return r; + + r = block_get_originating(*ret, ret); + if (r < 0) + log_debug_errno(r, "Failed to chase block device, ignoring: %m"); + + return 1; +} + +int get_block_device_harder(const char *path, dev_t *ret) { + _cleanup_close_ int fd = -EBADF; + + assert(path); + assert(ret); + + fd = open(path, O_RDONLY|O_NOFOLLOW|O_CLOEXEC); + if (fd < 0) + return -errno; + + return get_block_device_harder_fd(fd, ret); +} + +int lock_whole_block_device(dev_t devt, int operation) { + _cleanup_close_ int lock_fd = -EBADF; + dev_t whole_devt; + int r; + + /* Let's get a BSD file lock on the whole block device, as per: https://systemd.io/BLOCK_DEVICE_LOCKING */ + + r = block_get_whole_disk(devt, &whole_devt); + if (r < 0) + return r; + + lock_fd = r = device_open_from_devnum(S_IFBLK, whole_devt, O_RDONLY|O_CLOEXEC|O_NONBLOCK, NULL); + if (r < 0) + return r; + + if (flock(lock_fd, operation) < 0) + return -errno; + + return TAKE_FD(lock_fd); +} + +int blockdev_partscan_enabled(int fd) { + _cleanup_free_ char *p = NULL, *buf = NULL; + unsigned long long ull; + struct stat st; + int r; + + /* Checks if partition scanning is correctly enabled on the block device */ + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISBLK(st.st_mode)) + return -ENOTBLK; + + if (asprintf(&p, "/sys/dev/block/%u:%u/capability", major(st.st_rdev), minor(st.st_rdev)) < 0) + return -ENOMEM; + + r = read_one_line_file(p, &buf); + if (r == -ENOENT) /* If the capability file doesn't exist then we are most likely looking at a + * partition block device, not the whole block device. And that means we have no + * partition scanning on for it (we do for its parent, but not for the partition + * itself). 
*/ + return false; + if (r < 0) + return r; + + r = safe_atollu_full(buf, 16, &ull); + if (r < 0) + return r; + +#ifndef GENHD_FL_NO_PART_SCAN +#define GENHD_FL_NO_PART_SCAN (0x0200) +#endif + + return !FLAGS_SET(ull, GENHD_FL_NO_PART_SCAN); +} + +static int blockdev_is_encrypted(const char *sysfs_path, unsigned depth_left) { + _cleanup_free_ char *p = NULL, *uuids = NULL; + _cleanup_closedir_ DIR *d = NULL; + int r, found_encrypted = false; + + assert(sysfs_path); + + if (depth_left == 0) + return -EINVAL; + + p = path_join(sysfs_path, "dm/uuid"); + if (!p) + return -ENOMEM; + + r = read_one_line_file(p, &uuids); + if (r != -ENOENT) { + if (r < 0) + return r; + + /* The DM device's uuid attribute is prefixed with "CRYPT-" if this is a dm-crypt device. */ + if (startswith(uuids, "CRYPT-")) + return true; + } + + /* Not a dm-crypt device itself. But maybe it is on top of one? Follow the links in the "slaves/" + * subdir. */ + + p = mfree(p); + p = path_join(sysfs_path, "slaves"); + if (!p) + return -ENOMEM; + + d = opendir(p); + if (!d) { + if (errno == ENOENT) /* Doesn't have underlying devices */ + return false; + + return -errno; + } + + for (;;) { + _cleanup_free_ char *q = NULL; + struct dirent *de; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + return -errno; + + break; /* No more underlying devices */ + } + + q = path_join(p, de->d_name); + if (!q) + return -ENOMEM; + + r = blockdev_is_encrypted(q, depth_left - 1); + if (r < 0) + return r; + if (r == 0) /* we found one that is not encrypted? 
then propagate that immediately */ + return false; + + found_encrypted = true; + } + + return found_encrypted; +} + +int fd_is_encrypted(int fd) { + char p[SYS_BLOCK_PATH_MAX(NULL)]; + dev_t devt; + int r; + + r = get_block_device_fd(fd, &devt); + if (r < 0) + return r; + if (r == 0) /* doesn't have a block device */ + return false; + + xsprintf_sys_block_path(p, NULL, devt); + + return blockdev_is_encrypted(p, 10 /* safety net: maximum recursion depth */); +} + +int path_is_encrypted(const char *path) { + char p[SYS_BLOCK_PATH_MAX(NULL)]; + dev_t devt; + int r; + + r = get_block_device(path, &devt); + if (r < 0) + return r; + if (r == 0) /* doesn't have a block device */ + return false; + + xsprintf_sys_block_path(p, NULL, devt); + + return blockdev_is_encrypted(p, 10 /* safety net: maximum recursion depth */); +} + +int fd_get_whole_disk(int fd, bool backing, dev_t *ret) { + dev_t devt; + int r; + + assert(fd >= 0); + assert(ret); + + r = fd_get_devnum(fd, backing ? BLOCK_DEVICE_LOOKUP_BACKING : 0, &devt); + if (r < 0) + return r; + + return block_get_whole_disk(devt, ret); +} + +int path_get_whole_disk(const char *path, bool backing, dev_t *ret) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_CLOEXEC|O_PATH); + if (fd < 0) + return -errno; + + return fd_get_whole_disk(fd, backing, ret); +} + +int block_device_add_partition( + int fd, + const char *name, + int nr, + uint64_t start, + uint64_t size) { + + assert(fd >= 0); + assert(name); + assert(nr > 0); + + struct blkpg_partition bp = { + .pno = nr, + .start = start, + .length = size, + }; + + struct blkpg_ioctl_arg ba = { + .op = BLKPG_ADD_PARTITION, + .data = &bp, + .datalen = sizeof(bp), + }; + + if (strlen(name) >= sizeof(bp.devname)) + return -EINVAL; + + strcpy(bp.devname, name); + + return RET_NERRNO(ioctl(fd, BLKPG, &ba)); +} + +int block_device_remove_partition( + int fd, + const char *name, + int nr) { + + assert(fd >= 0); + assert(name); + assert(nr > 0); + + struct blkpg_partition bp = { + 
.pno = nr, + }; + + struct blkpg_ioctl_arg ba = { + .op = BLKPG_DEL_PARTITION, + .data = &bp, + .datalen = sizeof(bp), + }; + + if (strlen(name) >= sizeof(bp.devname)) /* name must fit the fixed-size devname field including its NUL */ + return -EINVAL; + + strcpy(bp.devname, name); + + return RET_NERRNO(ioctl(fd, BLKPG, &ba)); +} + +int block_device_resize_partition( + int fd, + int nr, + uint64_t start, + uint64_t size) { /* Sends a BLKPG ioctl with op BLKPG_RESIZE_PARTITION for partition number nr, carrying start and size in bp.start / bp.length. Unlike the add/remove variants no device name is filled in. */ + + assert(fd >= 0); + assert(nr > 0); + + struct blkpg_partition bp = { + .pno = nr, + .start = start, + .length = size, + }; + + struct blkpg_ioctl_arg ba = { + .op = BLKPG_RESIZE_PARTITION, + .data = &bp, + .datalen = sizeof(bp), + }; + + return RET_NERRNO(ioctl(fd, BLKPG, &ba)); +} + +int partition_enumerator_new(sd_device *dev, sd_device_enumerator **ret) { /* Builds an sd_device_enumerator restricted to devices below dev that also match the sysname, subsystem "block" and DEVTYPE=partition filters added further down. Returns -ENXIO if dev is not a whole-disk device, other negative errnos from the sd-device calls; on success stores the enumerator in *ret and returns 0. */ + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *s; + int r; + + assert(dev); + assert(ret); + + /* Refuse invocation on partition block device, insist on "whole" device */ + r = block_device_is_whole_disk(dev); + if (r < 0) + return r; + if (r == 0) + return -ENXIO; /* return a recognizable error */ + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_parent(e, dev); + if (r < 0) + return r; + + r = sd_device_get_sysname(dev, &s); + if (r < 0) + return r; + + /* Also add sysname check for safety. Hopefully, this also improves performance. 
*/ + s = strjoina(s, "*"); /* sysname of the disk followed by "*", used as the sysname match pattern */ + r = sd_device_enumerator_add_match_sysname(e, s); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "block", /* match = */ true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_property(e, "DEVTYPE", "partition"); + if (r < 0) + return r; + + *ret = TAKE_PTR(e); + return 0; +} + +int block_device_remove_all_partitions(sd_device *dev, int fd) { /* Removes all partitions found by partition_enumerator_new() for the given disk. At least one of dev / fd must be supplied; the missing one is derived from the other. Failing to read a partition's devname or PARTN property aborts immediately. A failing block_device_remove_partition() is logged and the loop continues, with the first such error remembered in k; -ENODEV is treated as "already gone". Returns that first error if any, otherwise whether at least one partition was enumerated. */ + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_(sd_device_unrefp) sd_device *dev_unref = NULL; + _cleanup_close_ int fd_close = -EBADF; + bool has_partitions = false; + int r, k = 0; + + assert(dev || fd >= 0); + + if (!dev) { + r = block_device_new_from_fd(fd, 0, &dev_unref); + if (r < 0) + return r; + + dev = dev_unref; + } + + r = partition_enumerator_new(dev, &e); + if (r < 0) + return r; + + if (fd < 0) { + fd_close = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY); + if (fd_close < 0) + return fd_close; + + fd = fd_close; + } + + FOREACH_DEVICE(e, part) { + const char *v, *devname; + int nr; + + has_partitions = true; + + r = sd_device_get_devname(part, &devname); + if (r < 0) + return r; + + r = sd_device_get_property_value(part, "PARTN", &v); + if (r < 0) + return r; + + r = safe_atoi(v, &nr); + if (r < 0) + return r; + + r = btrfs_forget_device(devname); /* best effort: a failure is only logged at debug level, -ENOENT not even that */ + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to forget btrfs device %s, ignoring: %m", devname); + + r = block_device_remove_partition(fd, devname, nr); + if (r == -ENODEV) { + log_debug("Kernel removed partition %s before us, ignoring", devname); + continue; + } + if (r < 0) { + log_debug_errno(r, "Failed to remove partition %s: %m", devname); + k = k < 0 ? k : r; /* keep the first error, keep going */ + continue; + } + + log_debug("Removed partition %s", devname); + } + + return k < 0 ? 
k : has_partitions; +} + +int block_device_has_partitions(sd_device *dev) { /* Returns a negative errno if the partition enumerator cannot be set up, otherwise 1 if it yields at least one device and 0 if it yields none. */ + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(dev); + + /* Checks if the specified device currently has partitions. */ + + r = partition_enumerator_new(dev, &e); + if (r < 0) + return r; + + return !!sd_device_enumerator_get_device_first(e); +} + +int blockdev_reread_partition_table(sd_device *dev) { /* Opens dev read-only, takes a non-blocking exclusive flock() on it and issues BLKRRPART. A failing open is returned as is, a failing flock() or ioctl() as -errno; 0 on success. */ + _cleanup_close_ int fd = -EBADF; + + assert(dev); + + /* Try to re-read the partition table. This only succeeds if none of the devices is busy. */ + + fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return fd; + + if (flock(fd, LOCK_EX|LOCK_NB) < 0) + return -errno; + + if (ioctl(fd, BLKRRPART, 0) < 0) + return -errno; + + return 0; +} + +int blockdev_get_sector_size(int fd, uint32_t *ret) { /* Queries BLKSSZGET on fd and stores the result in *ret. Returns -errno if the ioctl fails and -EIO if the reported value is not positive. */ + int ssz = 0; + + assert(fd >= 0); + assert(ret); + + if (ioctl(fd, BLKSSZGET, &ssz) < 0) + return -errno; + if (ssz <= 0) /* make sure the field is initialized */ + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Block device reported invalid sector size %i.", ssz); + + *ret = ssz; + return 0; +} + +int blockdev_get_root(int level, dev_t *ret) { /* 'level' is the log level passed to log_full_errno() / btrfs_log_dev_root() for failures; 'ret' may be NULL. */ + _cleanup_free_ char *p = NULL; + dev_t devno; + int r; + + /* Returns the device node backing the root file system. Traces through + * dm-crypt/dm-verity/... Returns > 0 and the devno of the device on success. If there's no block + * device (or multiple) returns 0 and a devno of 0. Failure otherwise. + * + * If the root mount has been replaced by some form of volatile file system (overlayfs), the original + * root block device node is symlinked in /run/systemd/volatile-root. Let's read that here. 
*/ + r = readlink_malloc("/run/systemd/volatile-root", &p); + if (r == -ENOENT) { /* volatile-root not found */ + r = get_block_device_harder("/", &devno); + if (r == -EUCLEAN) + return btrfs_log_dev_root(level, r, "root file system"); + if (r < 0) + return log_full_errno(level, r, "Failed to determine block device of root file system: %m"); + if (r == 0) { /* Not backed by a single block device. (Could be NFS or so, or could be multi-device RAID or so) */ + r = get_block_device_harder("/usr", &devno); /* fall back to the file system holding /usr */ + if (r == -EUCLEAN) + return btrfs_log_dev_root(level, r, "/usr"); + if (r < 0) + return log_full_errno(level, r, "Failed to determine block device of /usr/ file system: %m"); + if (r == 0) { /* /usr/ not backed by single block device, either. */ + log_debug("Neither root nor /usr/ file system are on a (single) block device."); + + if (ret) + *ret = 0; + + return 0; + } + } + } else if (r < 0) + return log_full_errno(level, r, "Failed to read symlink /run/systemd/volatile-root: %m"); + else { /* the symlink exists: its target is parsed as a device node path, and only block devices are accepted */ + mode_t m; + r = device_path_parse_major_minor(p, &m, &devno); + if (r < 0) + return log_full_errno(level, r, "Failed to parse major/minor device node: %m"); + if (!S_ISBLK(m)) + return log_full_errno(level, SYNTHETIC_ERRNO(ENOTBLK), "Volatile root device is of wrong type."); + } + + if (ret) + *ret = devno; + + return 1; +} diff --git a/src/shared/blockdev-util.h b/src/shared/blockdev-util.h new file mode 100644 index 0000000..954a23d --- /dev/null +++ b/src/shared/blockdev-util.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <sys/types.h> /* dev_t */ + +#include "sd-device.h" + +#include "macro.h" +#include "stdio-util.h" +#include "string-util.h" + /* SYS_BLOCK_PATH_MAX(): size expression for a buffer that xsprintf_sys_block_path() formats "/sys/dev/block/MAJOR:MINOR" plus an optional suffix into. */ +#define SYS_BLOCK_PATH_MAX(suffix) \ + (STRLEN("/sys/dev/block/") + DECIMAL_STR_MAX(dev_t) + 1 + DECIMAL_STR_MAX(dev_t) + strlen_ptr(suffix)) +#define xsprintf_sys_block_path(buf, suffix, devno) \ + xsprintf(buf, "/sys/dev/block/%u:%u%s", major(devno), minor(devno), strempty(suffix)) + +typedef enum 
BlockDeviceLookupFlag { /* bit flags, may be OR-ed together */ + BLOCK_DEVICE_LOOKUP_WHOLE_DISK = 1 << 0, /* whole block device, e.g. sda, nvme0n1, or loop0. */ + BLOCK_DEVICE_LOOKUP_BACKING = 1 << 1, /* fd may be regular file or directory on file system, in + * which case backing block device is determined. */ + BLOCK_DEVICE_LOOKUP_ORIGINATING = 1 << 2, /* Try to find the underlying layer device for stacked + * block device, e.g. LUKS-style DM. */ +} BlockDeviceLookupFlag; + +int block_device_new_from_fd(int fd, BlockDeviceLookupFlag flag, sd_device **ret); +int block_device_new_from_path(const char *path, BlockDeviceLookupFlag flag, sd_device **ret); + +int block_device_is_whole_disk(sd_device *dev); +int block_device_get_whole_disk(sd_device *dev, sd_device **ret); +int block_device_get_originating(sd_device *dev, sd_device **ret); + +int block_get_whole_disk(dev_t d, dev_t *ret); +int block_get_originating(dev_t d, dev_t *ret); + +int get_block_device_fd(int fd, dev_t *ret); +int get_block_device(const char *path, dev_t *dev); + +int get_block_device_harder_fd(int fd, dev_t *dev); +int get_block_device_harder(const char *path, dev_t *dev); + +int lock_whole_block_device(dev_t devt, int operation); + +int blockdev_partscan_enabled(int fd); + +int fd_is_encrypted(int fd); /* < 0 on error, otherwise a boolean */ +int path_is_encrypted(const char *path); + +int fd_get_whole_disk(int fd, bool backing, dev_t *ret); +int path_get_whole_disk(const char *path, bool backing, dev_t *ret); + +int block_device_add_partition(int fd, const char *name, int nr, uint64_t start, uint64_t size); /* the three partition calls wrap the BLKPG ioctl; nr must be > 0 */ +int block_device_remove_partition(int fd, const char *name, int nr); +int block_device_resize_partition(int fd, int nr, uint64_t start, uint64_t size); +int partition_enumerator_new(sd_device *dev, sd_device_enumerator **ret); /* -ENXIO if dev is not a whole disk */ +int block_device_remove_all_partitions(sd_device *dev, int fd); /* either dev or fd may be left unset (NULL / negative), but not both */ +int block_device_has_partitions(sd_device *dev); +int blockdev_reread_partition_table(sd_device *dev); + +int blockdev_get_sector_size(int fd, uint32_t *ret); /* value reported by BLKSSZGET */ + +int 
blockdev_get_root(int level, dev_t *ret); diff --git a/src/shared/bond-util.c b/src/shared/bond-util.c new file mode 100644 index 0000000..e04b201 --- /dev/null +++ b/src/shared/bond-util.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bond-util.h" +#include "string-table.h" + /* One name table per enum of bond-util.h, indexed by enum value. Each DEFINE_STRING_TABLE_LOOKUP() presumably generates the matching *_to_string()/*_from_string() pair declared in the header - confirm against string-table.h. */ +static const char* const bond_mode_table[_NETDEV_BOND_MODE_MAX] = { + [NETDEV_BOND_MODE_BALANCE_RR] = "balance-rr", + [NETDEV_BOND_MODE_ACTIVE_BACKUP] = "active-backup", + [NETDEV_BOND_MODE_BALANCE_XOR] = "balance-xor", + [NETDEV_BOND_MODE_BROADCAST] = "broadcast", + [NETDEV_BOND_MODE_802_3AD] = "802.3ad", + [NETDEV_BOND_MODE_BALANCE_TLB] = "balance-tlb", + [NETDEV_BOND_MODE_BALANCE_ALB] = "balance-alb", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_mode, BondMode); + +static const char* const bond_xmit_hash_policy_table[_NETDEV_BOND_XMIT_HASH_POLICY_MAX] = { /* note the initializer order: layer3+4 is listed before layer2+3 */ + [NETDEV_BOND_XMIT_HASH_POLICY_LAYER2] = "layer2", + [NETDEV_BOND_XMIT_HASH_POLICY_LAYER34] = "layer3+4", + [NETDEV_BOND_XMIT_HASH_POLICY_LAYER23] = "layer2+3", + [NETDEV_BOND_XMIT_HASH_POLICY_ENCAP23] = "encap2+3", + [NETDEV_BOND_XMIT_HASH_POLICY_ENCAP34] = "encap3+4", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_xmit_hash_policy, BondXmitHashPolicy); + +static const char* const bond_lacp_rate_table[_NETDEV_BOND_LACP_RATE_MAX] = { + [NETDEV_BOND_LACP_RATE_SLOW] = "slow", + [NETDEV_BOND_LACP_RATE_FAST] = "fast", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_lacp_rate, BondLacpRate); + +static const char* const bond_ad_select_table[_NETDEV_BOND_AD_SELECT_MAX] = { + [NETDEV_BOND_AD_SELECT_STABLE] = "stable", + [NETDEV_BOND_AD_SELECT_BANDWIDTH] = "bandwidth", + [NETDEV_BOND_AD_SELECT_COUNT] = "count", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_ad_select, BondAdSelect); + +static const char* const bond_fail_over_mac_table[_NETDEV_BOND_FAIL_OVER_MAC_MAX] = { + [NETDEV_BOND_FAIL_OVER_MAC_NONE] = "none", + [NETDEV_BOND_FAIL_OVER_MAC_ACTIVE] = "active", + [NETDEV_BOND_FAIL_OVER_MAC_FOLLOW] = "follow", +}; + 
+DEFINE_STRING_TABLE_LOOKUP(bond_fail_over_mac, BondFailOverMac); + +static const char *const bond_arp_validate_table[_NETDEV_BOND_ARP_VALIDATE_MAX] = { /* names for BondArpValidate, indexed by enum value */ + [NETDEV_BOND_ARP_VALIDATE_NONE] = "none", + [NETDEV_BOND_ARP_VALIDATE_ACTIVE]= "active", + [NETDEV_BOND_ARP_VALIDATE_BACKUP]= "backup", + [NETDEV_BOND_ARP_VALIDATE_ALL]= "all", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_arp_validate, BondArpValidate); + +static const char *const bond_arp_all_targets_table[_NETDEV_BOND_ARP_ALL_TARGETS_MAX] = { /* names for BondArpAllTargets */ + [NETDEV_BOND_ARP_ALL_TARGETS_ANY] = "any", + [NETDEV_BOND_ARP_ALL_TARGETS_ALL] = "all", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_arp_all_targets, BondArpAllTargets); + +static const char *const bond_primary_reselect_table[_NETDEV_BOND_PRIMARY_RESELECT_MAX] = { /* names for BondPrimaryReselect */ + [NETDEV_BOND_PRIMARY_RESELECT_ALWAYS] = "always", + [NETDEV_BOND_PRIMARY_RESELECT_BETTER]= "better", + [NETDEV_BOND_PRIMARY_RESELECT_FAILURE]= "failure", +}; + +DEFINE_STRING_TABLE_LOOKUP(bond_primary_reselect, BondPrimaryReselect); diff --git a/src/shared/bond-util.h b/src/shared/bond-util.h new file mode 100644 index 0000000..9e693b1 --- /dev/null +++ b/src/shared/bond-util.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <netinet/in.h> +#include <linux/if_bonding.h> /* BOND_MODE_*, BOND_XMIT_POLICY_* used by the enums below */ + +#include "macro.h" + +/* + * Maximum number of targets supported by the kernel for a single + * bond netdev. 
+ */ +#define NETDEV_BOND_ARP_TARGETS_MAX 16 + +typedef enum BondMode { /* values are taken from the kernel's BOND_MODE_* constants */ + NETDEV_BOND_MODE_BALANCE_RR = BOND_MODE_ROUNDROBIN, + NETDEV_BOND_MODE_ACTIVE_BACKUP = BOND_MODE_ACTIVEBACKUP, + NETDEV_BOND_MODE_BALANCE_XOR = BOND_MODE_XOR, + NETDEV_BOND_MODE_BROADCAST = BOND_MODE_BROADCAST, + NETDEV_BOND_MODE_802_3AD = BOND_MODE_8023AD, + NETDEV_BOND_MODE_BALANCE_TLB = BOND_MODE_TLB, + NETDEV_BOND_MODE_BALANCE_ALB = BOND_MODE_ALB, + _NETDEV_BOND_MODE_MAX, /* one past the last valid value; used as the size of the name table */ + _NETDEV_BOND_MODE_INVALID = -EINVAL, +} BondMode; + +typedef enum BondXmitHashPolicy { /* values are taken from the kernel's BOND_XMIT_POLICY_* constants */ + NETDEV_BOND_XMIT_HASH_POLICY_LAYER2 = BOND_XMIT_POLICY_LAYER2, + NETDEV_BOND_XMIT_HASH_POLICY_LAYER34 = BOND_XMIT_POLICY_LAYER34, + NETDEV_BOND_XMIT_HASH_POLICY_LAYER23 = BOND_XMIT_POLICY_LAYER23, + NETDEV_BOND_XMIT_HASH_POLICY_ENCAP23 = BOND_XMIT_POLICY_ENCAP23, + NETDEV_BOND_XMIT_HASH_POLICY_ENCAP34 = BOND_XMIT_POLICY_ENCAP34, + _NETDEV_BOND_XMIT_HASH_POLICY_MAX, + _NETDEV_BOND_XMIT_HASH_POLICY_INVALID = -EINVAL, +} BondXmitHashPolicy; + +typedef enum BondLacpRate { /* from here on the enums use plain sequential values starting at 0, followed by _MAX and an _INVALID of -EINVAL */ + NETDEV_BOND_LACP_RATE_SLOW, + NETDEV_BOND_LACP_RATE_FAST, + _NETDEV_BOND_LACP_RATE_MAX, + _NETDEV_BOND_LACP_RATE_INVALID = -EINVAL, +} BondLacpRate; + +typedef enum BondAdSelect { + NETDEV_BOND_AD_SELECT_STABLE, + NETDEV_BOND_AD_SELECT_BANDWIDTH, + NETDEV_BOND_AD_SELECT_COUNT, + _NETDEV_BOND_AD_SELECT_MAX, + _NETDEV_BOND_AD_SELECT_INVALID = -EINVAL, +} BondAdSelect; + +typedef enum BondFailOverMac { + NETDEV_BOND_FAIL_OVER_MAC_NONE, + NETDEV_BOND_FAIL_OVER_MAC_ACTIVE, + NETDEV_BOND_FAIL_OVER_MAC_FOLLOW, + _NETDEV_BOND_FAIL_OVER_MAC_MAX, + _NETDEV_BOND_FAIL_OVER_MAC_INVALID = -EINVAL, +} BondFailOverMac; + +typedef enum BondArpValidate { + NETDEV_BOND_ARP_VALIDATE_NONE, + NETDEV_BOND_ARP_VALIDATE_ACTIVE, + NETDEV_BOND_ARP_VALIDATE_BACKUP, + NETDEV_BOND_ARP_VALIDATE_ALL, + _NETDEV_BOND_ARP_VALIDATE_MAX, + _NETDEV_BOND_ARP_VALIDATE_INVALID = -EINVAL, +} BondArpValidate; + +typedef enum BondArpAllTargets { + NETDEV_BOND_ARP_ALL_TARGETS_ANY, + 
NETDEV_BOND_ARP_ALL_TARGETS_ALL, + _NETDEV_BOND_ARP_ALL_TARGETS_MAX, + _NETDEV_BOND_ARP_ALL_TARGETS_INVALID = -EINVAL, +} BondArpAllTargets; + +typedef enum BondPrimaryReselect { /* sequential values; names are in bond_primary_reselect_table in bond-util.c */ + NETDEV_BOND_PRIMARY_RESELECT_ALWAYS, + NETDEV_BOND_PRIMARY_RESELECT_BETTER, + NETDEV_BOND_PRIMARY_RESELECT_FAILURE, + _NETDEV_BOND_PRIMARY_RESELECT_MAX, + _NETDEV_BOND_PRIMARY_RESELECT_INVALID = -EINVAL, +} BondPrimaryReselect; /* Below: one to_string/from_string declaration pair per enum of this header. */ + +const char *bond_mode_to_string(BondMode d) _const_; +BondMode bond_mode_from_string(const char *d) _pure_; + +const char *bond_xmit_hash_policy_to_string(BondXmitHashPolicy d) _const_; +BondXmitHashPolicy bond_xmit_hash_policy_from_string(const char *d) _pure_; + +const char *bond_lacp_rate_to_string(BondLacpRate d) _const_; +BondLacpRate bond_lacp_rate_from_string(const char *d) _pure_; + +const char *bond_fail_over_mac_to_string(BondFailOverMac d) _const_; +BondFailOverMac bond_fail_over_mac_from_string(const char *d) _pure_; + +const char *bond_ad_select_to_string(BondAdSelect d) _const_; +BondAdSelect bond_ad_select_from_string(const char *d) _pure_; + +const char *bond_arp_validate_to_string(BondArpValidate d) _const_; +BondArpValidate bond_arp_validate_from_string(const char *d) _pure_; + +const char *bond_arp_all_targets_to_string(BondArpAllTargets d) _const_; +BondArpAllTargets bond_arp_all_targets_from_string(const char *d) _pure_; + +const char *bond_primary_reselect_to_string(BondPrimaryReselect d) _const_; +BondPrimaryReselect bond_primary_reselect_from_string(const char *d) _pure_; diff --git a/src/shared/boot-entry.c b/src/shared/boot-entry.c new file mode 100644 index 0000000..e726073 --- /dev/null +++ b/src/shared/boot-entry.c @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "boot-entry.h" +#include "chase.h" +#include "fd-util.h" +#include "fileio.h" +#include "id128-util.h" +#include "os-util.h" +#include "path-util.h" +#include "string-table.h" +#include "string-util.h" +#include "utf8.h" + +bool 
boot_entry_token_valid(const char *p) { /* A token is accepted only if it passes all three checks: utf8_is_valid(), string_is_safe() and filename_is_valid(). */ + return utf8_is_valid(p) && string_is_safe(p) && filename_is_valid(p); +} + +static int entry_token_load(int rfd, const char *etc_kernel, BootEntryTokenType *type, char **token) { /* Reads one line from the file "entry-token" inside etc_kernel, resolved relative to rfd. Returns 0 without touching the outputs if etc_kernel is NULL, the file does not exist, the line is empty or it fails boot_entry_token_valid(). Returns 1 after storing the line in *token and setting *type to BOOT_ENTRY_TOKEN_LITERAL. Other failures are logged and returned as negative errno. */ + _cleanup_free_ char *buf = NULL, *p = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(type); + assert(*type == BOOT_ENTRY_TOKEN_AUTO); + assert(token); + + if (!etc_kernel) + return 0; + + p = path_join(etc_kernel, "entry-token"); + if (!p) + return log_oom(); + + r = chase_and_fopenat_unlocked(rfd, p, CHASE_AT_RESOLVE_IN_ROOT, "re", NULL, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to chase and open '%s': %m", p); + + r = read_line(f, NAME_MAX, &buf); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", p); + + if (isempty(buf)) + return 0; + + if (!boot_entry_token_valid(buf)) { /* an invalid token in the file is not fatal, it is skipped with a debug message */ + log_debug("Invalid entry token specified in %s, ignoring.", p); + return 0; + } + + *token = TAKE_PTR(buf); + *type = BOOT_ENTRY_TOKEN_LITERAL; + return 1; +} + +static int entry_token_from_machine_id(sd_id128_t machine_id, BootEntryTokenType *type, char **token) { /* Returns 0 if machine_id is null. Otherwise stores a newly allocated string form of it in *token, sets *type to BOOT_ENTRY_TOKEN_MACHINE_ID and returns 1. */ + char *p; + + assert(type); + assert(IN_SET(*type, BOOT_ENTRY_TOKEN_AUTO, BOOT_ENTRY_TOKEN_MACHINE_ID)); + assert(token); + + if (sd_id128_is_null(machine_id)) + return 0; + + p = strdup(SD_ID128_TO_STRING(machine_id)); + if (!p) + return log_oom(); + + *token = p; + *type = BOOT_ENTRY_TOKEN_MACHINE_ID; + return 1; +} + +static int entry_token_from_os_release(int rfd, BootEntryTokenType *type, char **token) { /* Takes the token from os-release: *type selects which of IMAGE_ID= and ID= are read (both for AUTO). A usable IMAGE_ID is preferred over ID. Returns 1 with *token and *type set, 0 if parse_os_release_at() reports -ENOENT or no usable field was found, negative errno otherwise. */ + _cleanup_free_ char *id = NULL, *image_id = NULL; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(type); + assert(IN_SET(*type, BOOT_ENTRY_TOKEN_AUTO, BOOT_ENTRY_TOKEN_OS_IMAGE_ID, BOOT_ENTRY_TOKEN_OS_ID)); + assert(token); + + switch (*type) { + case BOOT_ENTRY_TOKEN_AUTO: + r = parse_os_release_at(rfd, + "IMAGE_ID", &image_id, + "ID", &id); + break; + + case 
BOOT_ENTRY_TOKEN_OS_IMAGE_ID: + r = parse_os_release_at(rfd, "IMAGE_ID", &image_id); + break; + + case BOOT_ENTRY_TOKEN_OS_ID: + r = parse_os_release_at(rfd, "ID", &id); + break; + + default: + assert_not_reached(); + } + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to load /etc/os-release: %m"); + + if (!isempty(image_id) && boot_entry_token_valid(image_id)) { /* IMAGE_ID is checked first, so it wins when both fields were read */ + *token = TAKE_PTR(image_id); + *type = BOOT_ENTRY_TOKEN_OS_IMAGE_ID; + return 1; + } + + if (!isempty(id) && boot_entry_token_valid(id)) { + *token = TAKE_PTR(id); + *type = BOOT_ENTRY_TOKEN_OS_ID; + return 1; + } + + return 0; +} + +int boot_entry_token_ensure_at( + int rfd, + const char *etc_kernel, + sd_id128_t machine_id, + bool machine_id_is_random, + BootEntryTokenType *type, + char **token) { /* Makes sure *token is set, picking the source according to *type. Returns 0 right away if *token is already set. Each helper's non-zero result (1 on success, negative on error) is returned directly; when the selected source yields nothing, an error is logged and -EINVAL returned. */ + + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(type); + assert(token); + + if (*token) + return 0; /* Already set. */ + + switch (*type) { + + case BOOT_ENTRY_TOKEN_AUTO: /* order: entry-token file, machine ID unless random, os-release, random machine ID as last resort */ + r = entry_token_load(rfd, etc_kernel, type, token); + if (r != 0) + return r; + + if (!machine_id_is_random) { + r = entry_token_from_machine_id(machine_id, type, token); + if (r != 0) + return r; + } + + r = entry_token_from_os_release(rfd, type, token); + if (r != 0) + return r; + + if (machine_id_is_random) { + r = entry_token_from_machine_id(machine_id, type, token); + if (r != 0) + return r; + } + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "No machine ID set, and /etc/os-release carries no ID=/IMAGE_ID= fields."); + + case BOOT_ENTRY_TOKEN_MACHINE_ID: + r = entry_token_from_machine_id(machine_id, type, token); + if (r != 0) + return r; + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No machine ID set."); + + case BOOT_ENTRY_TOKEN_OS_IMAGE_ID: + r = entry_token_from_os_release(rfd, type, token); + if (r != 0) + return r; + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "IMAGE_ID= field not set in /etc/os-release."); + + case BOOT_ENTRY_TOKEN_OS_ID: + r = 
entry_token_from_os_release(rfd, type, token); + if (r != 0) + return r; + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "ID= field not set in /etc/os-release."); + + case BOOT_ENTRY_TOKEN_LITERAL: + /* In this case, the token should be already set by the user input. */ + return -EINVAL; /* unlike the other failure paths this one does not log */ + + default: + assert_not_reached(); + } +} + +int boot_entry_token_ensure( + const char *root, + const char *etc_kernel, + sd_id128_t machine_id, + bool machine_id_is_random, + BootEntryTokenType *type, + char **token) { /* Convenience wrapper around boot_entry_token_ensure_at(): opens empty_to_root(root) as an O_PATH directory fd and passes it on. Returns 0 without opening anything if *token is already set, and -errno if open() fails. */ + + assert(token); + + if (*token) + return 0; /* Already set. */ + + _cleanup_close_ int rfd = -EBADF; + + rfd = open(empty_to_root(root), O_CLOEXEC | O_DIRECTORY | O_PATH); + if (rfd < 0) + return -errno; + + return boot_entry_token_ensure_at(rfd, etc_kernel, machine_id, machine_id_is_random, type, token); +} + +int parse_boot_entry_token_type(const char *s, BootEntryTokenType *type, char **token) { /* Accepts "machine-id", "os-image-id", "os-id" and "literal:TOKEN"; anything else is logged and rejected with -EINVAL. */ + assert(s); + assert(type); + assert(token); + + /* + * This function is intended to be used in command line parsers, to handle token that are passed in. + * + * NOTE THAT THIS WILL FREE THE PREVIOUS ARGUMENT POINTER ON SUCCESS! + * Hence, do not pass in uninitialized pointers. 
+ */ + + if (streq(s, "machine-id")) { /* the three keyword forms set *type and clear any previously stored token */ + *type = BOOT_ENTRY_TOKEN_MACHINE_ID; + *token = mfree(*token); + return 0; + } + + if (streq(s, "os-image-id")) { + *type = BOOT_ENTRY_TOKEN_OS_IMAGE_ID; + *token = mfree(*token); + return 0; + } + + if (streq(s, "os-id")) { + *type = BOOT_ENTRY_TOKEN_OS_ID; + *token = mfree(*token); + return 0; + } + + const char *e = startswith(s, "literal:"); /* e is the text after the "literal:" prefix, or NULL if s lacks the prefix */ + if (e) { + if (!boot_entry_token_valid(e)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid entry token literal is specified for --entry-token=."); + + *type = BOOT_ENTRY_TOKEN_LITERAL; + return free_and_strdup_warn(token, e); + } + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unexpected parameter for --entry-token=: %s", s); +} + +static const char *const boot_entry_token_type_table[] = { /* names for BootEntryTokenType; only a to-string lookup is generated from this table */ + [BOOT_ENTRY_TOKEN_MACHINE_ID] = "machine-id", + [BOOT_ENTRY_TOKEN_OS_IMAGE_ID] = "os-image-id", + [BOOT_ENTRY_TOKEN_OS_ID] = "os-id", + [BOOT_ENTRY_TOKEN_LITERAL] = "literal", + [BOOT_ENTRY_TOKEN_AUTO] = "auto", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(boot_entry_token_type, BootEntryTokenType); diff --git a/src/shared/boot-entry.h b/src/shared/boot-entry.h new file mode 100644 index 0000000..f3a6f28 --- /dev/null +++ b/src/shared/boot-entry.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> /* bool */ + +#include "sd-id128.h" + +typedef enum BootEntryTokenType { /* where the entry token comes from; AUTO tries several sources in turn */ + BOOT_ENTRY_TOKEN_MACHINE_ID, + BOOT_ENTRY_TOKEN_OS_IMAGE_ID, + BOOT_ENTRY_TOKEN_OS_ID, + BOOT_ENTRY_TOKEN_LITERAL, + BOOT_ENTRY_TOKEN_AUTO, +} BootEntryTokenType; + +bool boot_entry_token_valid(const char *p); + +int boot_entry_token_ensure( + const char *root, + const char *etc_kernel, /* will be prefixed with root, typically /etc/kernel. */ + sd_id128_t machine_id, + bool machine_id_is_random, + BootEntryTokenType *type, /* input and output */ + char **token); /* output, but do not pass uninitialized value. 
*/ +int boot_entry_token_ensure_at( /* same as boot_entry_token_ensure(), but takes an already opened root directory fd (or AT_FDCWD) instead of a root path */ + int rfd, + const char *etc_kernel, + sd_id128_t machine_id, + bool machine_id_is_random, + BootEntryTokenType *type, + char **token); + +int parse_boot_entry_token_type(const char *s, BootEntryTokenType *type, char **token); + +const char* boot_entry_token_type_to_string(BootEntryTokenType t); diff --git a/src/shared/boot-timestamps.c b/src/shared/boot-timestamps.c new file mode 100644 index 0000000..e49bd8f --- /dev/null +++ b/src/shared/boot-timestamps.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "acpi-fpdt.h" +#include "boot-timestamps.h" +#include "efi-loader.h" +#include "macro.h" +#include "time-util.h" + +int boot_timestamps(const dual_timestamp *n, dual_timestamp *firmware, dual_timestamp *loader) { /* Fills *firmware and *loader from the two values x and y returned by acpi_get_boot_usec(), falling back to efi_loader_get_boot_usec() if that fails. n is the reference timestamp; if NULL, dual_timestamp_now() is used. Returns 0 on success or the error of the fallback source. */ + usec_t x = 0, y = 0, a; + int r; + dual_timestamp _n; + + assert(firmware); + assert(loader); + + if (!n) { + dual_timestamp_now(&_n); + n = &_n; + } + + r = acpi_get_boot_usec(&x, &y); + if (r < 0) { + r = efi_loader_get_boot_usec(&x, &y); + if (r < 0) + return r; + } + + /* Let's convert this to timestamps where the firmware + * began/loader began working. To make this more confusing: + * since usec_t is unsigned and the kernel's monotonic clock + * begins at kernel initialization we'll actually initialize + * the monotonic timestamps here as negative of the actual + * value. */ + + firmware->monotonic = y; + loader->monotonic = y - x; + + a = n->monotonic + firmware->monotonic; + firmware->realtime = n->realtime > a ? n->realtime - a : 0; /* clamp to 0 instead of wrapping around when a exceeds the realtime value */ + + a = n->monotonic + loader->monotonic; + loader->realtime = n->realtime > a ? 
n->realtime - a : 0; + + return 0; +} diff --git a/src/shared/boot-timestamps.h b/src/shared/boot-timestamps.h new file mode 100644 index 0000000..55b7ad1 --- /dev/null +++ b/src/shared/boot-timestamps.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <time-util.h> /* dual_timestamp */ + +int boot_timestamps(const dual_timestamp *n, dual_timestamp *firmware, dual_timestamp *loader); diff --git a/src/shared/bootspec.c b/src/shared/bootspec.c new file mode 100644 index 0000000..f4b2fdc --- /dev/null +++ b/src/shared/bootspec.c @@ -0,0 +1,1434 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "bootspec-fundamental.h" +#include "bootspec.h" +#include "chase.h" +#include "conf-files.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "efi-loader.h" +#include "env-file.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "find-esp.h" +#include "path-util.h" +#include "pe-binary.h" +#include "pretty-print.h" +#include "recurse-dir.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "strv.h" +#include "terminal-util.h" +#include "unaligned.h" + +static const char* const boot_entry_type_table[_BOOT_ENTRY_TYPE_MAX] = { /* long descriptions per BootEntryType */ + [BOOT_ENTRY_CONF] = "Boot Loader Specification Type #1 (.conf)", + [BOOT_ENTRY_UNIFIED] = "Boot Loader Specification Type #2 (.efi)", + [BOOT_ENTRY_LOADER] = "Reported by Boot Loader", + [BOOT_ENTRY_LOADER_AUTO] = "Automatic", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(boot_entry_type, BootEntryType); + +static const char* const boot_entry_type_json_table[_BOOT_ENTRY_TYPE_MAX] = { /* short identifiers for the same enum; judging by the table name these are meant for JSON output */ + [BOOT_ENTRY_CONF] = "type1", + [BOOT_ENTRY_UNIFIED] = "type2", + [BOOT_ENTRY_LOADER] = "loader", + [BOOT_ENTRY_LOADER_AUTO] = "auto", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(boot_entry_type_json, BootEntryType); + +static void boot_entry_free(BootEntry *entry) { /* Releases the strings and string vectors owned by entry; the BootEntry structure itself is not freed. */ + assert(entry); + + free(entry->id); + free(entry->id_old); + free(entry->path); + free(entry->root); 
+ free(entry->title); + free(entry->show_title); + free(entry->sort_key); + free(entry->version); + free(entry->machine_id); + free(entry->architecture); + strv_free(entry->options); + free(entry->kernel); + free(entry->efi); + strv_free(entry->initrd); + free(entry->device_tree); + strv_free(entry->device_tree_overlay); +} + +static int mangle_path( + const char *fname, + unsigned line, + const char *field, + const char *p, + char **ret) { /* Normalizes the path p taken from 'field' (fname/line are only used for log messages). Returns 1 with a newly allocated path in *ret, 0 with *ret set to NULL if the path is rejected after a warning, and -ENOMEM on allocation failure (in which case *ret is left untouched). */ + + _cleanup_free_ char *c = NULL; + + assert(field); + assert(p); + assert(ret); + + /* Spec leaves open if prefixed with "/" or not, let's normalize that */ + if (path_is_absolute(p)) + c = strdup(p); + else + c = strjoin("/", p); + if (!c) + return -ENOMEM; + + /* We only reference files, never directories */ + if (endswith(c, "/")) { + log_syntax(NULL, LOG_WARNING, fname, line, 0, "Path in field '%s' has trailing slash, ignoring: %s", field, c); + *ret = NULL; + return 0; + } + + /* Remove duplicate "/" */ + path_simplify(c); + + /* No ".." or "." 
or so */ + if (!path_is_normalized(c)) { + log_syntax(NULL, LOG_WARNING, fname, line, 0, "Path in field '%s' is not normalized, ignoring: %s", field, c); + *ret = NULL; + return 0; + } + + *ret = TAKE_PTR(c); + return 1; +} + +static int parse_path_one( + const char *fname, + unsigned line, + const char *field, + char **s, + const char *p) { /* Runs p through mangle_path() and, if it was accepted, replaces *s with the result. A rejected path (0) or an error is returned unchanged and leaves *s alone. */ + + _cleanup_free_ char *c = NULL; + int r; + + assert(field); + assert(s); + assert(p); + + r = mangle_path(fname, line, field, p, &c); + if (r <= 0) + return r; + + return free_and_replace(*s, c); +} + +static int parse_path_strv( + const char *fname, + unsigned line, + const char *field, + char ***s, + const char *p) { /* Like parse_path_one(), but the accepted path is appended to the string vector *s via strv_consume(). */ + + char *c; /* only read after mangle_path() returned > 0, i.e. after it has been set */ + int r; + + assert(field); + assert(s); + assert(p); + + r = mangle_path(fname, line, field, p, &c); + if (r <= 0) + return r; + + return strv_consume(s, c); +} + +static int parse_path_many( + const char *fname, + unsigned line, + const char *field, + char ***s, + const char *p) { /* Splits p with strv_split(p, NULL), runs every item through mangle_path(), silently drops rejected items and appends the remaining ones to *s (duplicates are kept). */ + + _cleanup_strv_free_ char **l = NULL, **f = NULL; + int r; + + l = strv_split(p, NULL); + if (!l) + return -ENOMEM; + + STRV_FOREACH(i, l) { + char *c; + + r = mangle_path(fname, line, field, *i, &c); + if (r < 0) + return r; + if (r == 0) + continue; + + r = strv_consume(&f, c); + if (r < 0) + return r; + } + + return strv_extend_strv(s, f, /* filter_duplicates= */ false); +} + +static int parse_tries(const char *fname, const char **p, unsigned *ret) { /* Parses the run of decimal digits at *p. Without any digit: *ret = UINT_MAX, returns 0, *p unchanged. Otherwise *p is advanced past the digits, the value is stored in *ret and 1 is returned. Parse failures and values above INT_MAX are logged and returned as negative errno; fname is only used for that message. */ + _cleanup_free_ char *d = NULL; + unsigned tries; + size_t n; + int r; + + assert(fname); + assert(p); + assert(*p); + assert(ret); + + n = strspn(*p, DIGITS); + if (n == 0) { + *ret = UINT_MAX; + return 0; + } + + d = strndup(*p, n); + if (!d) + return log_oom(); + + r = safe_atou_full(d, 10, &tries); + if (r >= 0 && tries > INT_MAX) /* sd-boot allows INT_MAX, let's use the same limit */ + r = -ERANGE; + if (r < 0) + return log_error_errno(r, "Failed to parse tries counter of filename '%s': %m", fname); + + *p = *p + n; + *ret = tries; + return 1; +} + +int 
boot_filename_extract_tries( + const char *fname, + char **ret_stripped, + unsigned *ret_tries_left, + unsigned *ret_tries_done) { /* Looks for a tries counter of the form "+LEFT" or "+LEFT-DONE" that ends exactly at the last '.' of fname. If found, *ret_stripped receives fname with the counter removed and the counters are returned (DONE stays UINT_MAX when absent). In every other case *ret_stripped is a plain copy of fname and both counters are UINT_MAX. Returns 0 in both cases, negative errno on allocation or parse failure. */ + + unsigned tries_left = UINT_MAX, tries_done = UINT_MAX; + _cleanup_free_ char *stripped = NULL; + const char *p, *suffix, *m; + int r; + + assert(fname); + assert(ret_stripped); + assert(ret_tries_left); + assert(ret_tries_done); + + /* Be liberal with suffix, only insist on a dot. After all we want to cover any capitalization here + * (vfat is case insensitive after all), and at least .efi and .conf as suffix. */ + suffix = strrchr(fname, '.'); + if (!suffix) + goto nothing; + + p = m = memrchr(fname, '+', suffix - fname); /* last '+' before the suffix; m keeps its position for the strndup() below */ + if (!p) + goto nothing; + p++; + + r = parse_tries(fname, &p, &tries_left); + if (r < 0) + return r; + if (r == 0) + goto nothing; + + if (*p == '-') { + p++; + + r = parse_tries(fname, &p, &tries_done); + if (r < 0) + return r; + if (r == 0) + goto nothing; + } + + if (p != suffix) /* anything between the counter and the suffix means this is not a counter */ + goto nothing; + + stripped = strndup(fname, m - fname); + if (!stripped) + return log_oom(); + + if (!strextend(&stripped, suffix)) + return log_oom(); + + *ret_stripped = TAKE_PTR(stripped); + *ret_tries_left = tries_left; + *ret_tries_done = tries_done; + + return 0; + +nothing: + stripped = strdup(fname); + if (!stripped) + return log_oom(); + + *ret_stripped = TAKE_PTR(stripped); + *ret_tries_left = *ret_tries_done = UINT_MAX; + return 0; +} + +static int boot_entry_load_type1( + FILE *f, + const char *root, + const char *dir, + const char *fname, + BootEntry *entry) { /* The entry is assembled in the local 'tmp' and only moved into *entry at the very end, so *entry is not written on failure. */ + + _cleanup_(boot_entry_free) BootEntry tmp = BOOT_ENTRY_INIT(BOOT_ENTRY_CONF); + unsigned line = 1; + char *c; + int r; + + assert(f); + assert(root); + assert(dir); + assert(fname); + assert(entry); + + /* Loads a Type #1 boot menu entry from the specified FILE* object */ + + r = boot_filename_extract_tries(fname, &tmp.id, &tmp.tries_left, &tmp.tries_done); + if (r < 0) + return r; + + if (!efi_loader_entry_name_valid(tmp.id)) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid loader entry name: %s", fname); + + c = endswith_no_case(tmp.id, ".conf"); + if (!c) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid loader entry file suffix: %s", fname); + + tmp.id_old = strndup(tmp.id, c - tmp.id); /* Without .conf suffix */ + if (!tmp.id_old) + return log_oom(); + + tmp.path = path_join(dir, fname); + if (!tmp.path) + return log_oom(); + + tmp.root = strdup(root); + if (!tmp.root) + return log_oom(); + + for (;;) { /* one iteration per line of the file; ends when read_stripped_line() returns 0 */ + _cleanup_free_ char *buf = NULL, *field = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &buf); + if (r == 0) + break; + if (r == -ENOBUFS) + return log_syntax(NULL, LOG_ERR, tmp.path, line, r, "Line too long."); + if (r < 0) + return log_syntax(NULL, LOG_ERR, tmp.path, line, r, "Error while reading: %m"); + + line++; /* NOTE(review): line starts at 1 and is incremented before the content is parsed, so the messages below carry N+1 for the Nth line read - confirm whether that is intended */ + + if (IN_SET(buf[0], '#', '\0')) /* skip comments and empty lines */ + continue; + + const char *p = buf; + r = extract_first_word(&p, &field, NULL, 0); /* field is the first word, p is left pointing at the rest of the line */ + if (r < 0) { + log_syntax(NULL, LOG_WARNING, tmp.path, line, r, "Failed to parse, ignoring line: %m"); + continue; + } + if (r == 0) { + log_syntax(NULL, LOG_WARNING, tmp.path, line, 0, "Bad syntax, ignoring line."); + continue; + } + + if (isempty(p)) { + /* Some fields can reasonably have an empty value. In other cases warn. 
*/ + if (!STR_IN_SET(field, "options", "devicetree-overlay")) + log_syntax(NULL, LOG_WARNING, tmp.path, line, 0, "Field '%s' without value, ignoring line.", field); + + continue; + } + + if (streq(field, "title")) + r = free_and_strdup(&tmp.title, p); + else if (streq(field, "sort-key")) + r = free_and_strdup(&tmp.sort_key, p); + else if (streq(field, "version")) + r = free_and_strdup(&tmp.version, p); + else if (streq(field, "machine-id")) + r = free_and_strdup(&tmp.machine_id, p); + else if (streq(field, "architecture")) + r = free_and_strdup(&tmp.architecture, p); + else if (streq(field, "options")) + r = strv_extend(&tmp.options, p); + else if (streq(field, "linux")) + r = parse_path_one(tmp.path, line, field, &tmp.kernel, p); + else if (streq(field, "efi")) + r = parse_path_one(tmp.path, line, field, &tmp.efi, p); + else if (streq(field, "initrd")) + r = parse_path_strv(tmp.path, line, field, &tmp.initrd, p); + else if (streq(field, "devicetree")) + r = parse_path_one(tmp.path, line, field, &tmp.device_tree, p); + else if (streq(field, "devicetree-overlay")) + r = parse_path_many(tmp.path, line, field, &tmp.device_tree_overlay, p); + else { + log_syntax(NULL, LOG_WARNING, tmp.path, line, 0, "Unknown line '%s', ignoring.", field); + continue; + } + if (r < 0) + return log_syntax(NULL, LOG_ERR, tmp.path, line, r, "Error while parsing: %m"); + } + + *entry = TAKE_STRUCT(tmp); + return 0; +} + +int boot_config_load_type1( + BootConfig *config, + FILE *f, + const char *root, + const char *dir, + const char *fname) { + int r; + + assert(config); + assert(f); + assert(root); + assert(dir); + assert(fname); + + if (!GREEDY_REALLOC0(config->entries, config->n_entries + 1)) + return log_oom(); + + r = boot_entry_load_type1(f, root, dir, fname, config->entries + config->n_entries); + if (r < 0) + return r; + + config->n_entries++; + return 0; +} + +void boot_config_free(BootConfig *config) { + assert(config); + + free(config->default_pattern); + free(config->timeout); + 
free(config->editor); + free(config->auto_entries); + free(config->auto_firmware); + free(config->console_mode); + free(config->beep); + + free(config->entry_oneshot); + free(config->entry_default); + free(config->entry_selected); + + for (size_t i = 0; i < config->n_entries; i++) + boot_entry_free(config->entries + i); + free(config->entries); + + set_free(config->inodes_seen); +} + /* Parses the loader configuration read from file into config, one key/value line at a time; path is used for log messages only. Lines that are empty, start with #, carry an unknown key or lack a value are skipped (the latter two with a warning). Returns 1 on success, negative on read or allocation failure. */ +int boot_loader_read_conf(BootConfig *config, FILE *file, const char *path) { + unsigned line = 1; + int r; + + assert(config); + assert(file); + assert(path); + + for (;;) { + _cleanup_free_ char *buf = NULL, *field = NULL; + + r = read_stripped_line(file, LONG_LINE_MAX, &buf); + if (r == 0) + break; + if (r == -ENOBUFS) + return log_syntax(NULL, LOG_ERR, path, line, r, "Line too long."); + if (r < 0) + return log_syntax(NULL, LOG_ERR, path, line, r, "Error while reading: %m"); + + line++; + + if (IN_SET(buf[0], '#', '\0')) + continue; + + const char *p = buf; + r = extract_first_word(&p, &field, NULL, 0); + if (r < 0) { + log_syntax(NULL, LOG_WARNING, path, line, r, "Failed to parse, ignoring line: %m"); + continue; + } + if (r == 0) { + log_syntax(NULL, LOG_WARNING, path, line, 0, "Bad syntax, ignoring line."); + continue; + } + if (isempty(p)) { + log_syntax(NULL, LOG_WARNING, path, line, 0, "Field '%s' without value, ignoring line.", field); + continue; + } + + if (streq(field, "default")) + r = free_and_strdup(&config->default_pattern, p); + else if (streq(field, "timeout")) + r = free_and_strdup(&config->timeout, p); + else if (streq(field, "editor")) + r = free_and_strdup(&config->editor, p); + else if (streq(field, "auto-entries")) + r = free_and_strdup(&config->auto_entries, p); + else if (streq(field, "auto-firmware")) + r = free_and_strdup(&config->auto_firmware, p); + else if (streq(field, "console-mode")) + r = free_and_strdup(&config->console_mode, p); + else if (streq(field, "random-seed-mode")) + log_syntax(NULL, LOG_WARNING, path, line, 0, "'random-seed-mode' has 
been deprecated, ignoring."); + else if (streq(field, "beep")) + r = free_and_strdup(&config->beep, p); + else { + log_syntax(NULL, LOG_WARNING, path, line, 0, "Unknown line '%s', ignoring.", field); + continue; + } + if (r < 0) + return log_syntax(NULL, LOG_ERR, path, line, r, "Error while parsing: %m"); + } + + return 1; +} + /* Opens path below root and parses it with boot_loader_read_conf(). Returns 0 if the file does not exist (-ENOENT), negative on other open errors, otherwise the result of boot_loader_read_conf(). */ +static int boot_loader_read_conf_path(BootConfig *config, const char *root, const char *path) { + _cleanup_free_ char *full = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(config); + assert(path); + + r = chase_and_fopen_unlocked(path, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, "re", &full, &f); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to open '%s/%s': %m", root, path); + + return boot_loader_read_conf(config, f, full); +} + /* Sort order used by boot_config_finalize(): entries with a sort key come before entries without one. If both have a sort key they are ordered by sort key, then machine ID, then version (comparison negated). The final tie-breaker is the negated version-style comparison of the entry IDs. */ +static int boot_entry_compare(const BootEntry *a, const BootEntry *b) { + int r; + + assert(a); + assert(b); + + r = CMP(!a->sort_key, !b->sort_key); + if (r != 0) + return r; + + if (a->sort_key && b->sort_key) { + r = strcmp(a->sort_key, b->sort_key); + if (r != 0) + return r; + + r = strcmp_ptr(a->machine_id, b->machine_id); + if (r != 0) + return r; + + r = -strverscmp_improved(a->version, b->version); + if (r != 0) + return r; + } + + return -strverscmp_improved(a->id, b->id); +} + /* Returns true if fd refers to a regular file whose stat data is not yet in config->inodes_seen (and records it there), false if the file should be skipped, negative on error. */ +static int config_check_inode_relevant_and_unseen(BootConfig *config, int fd, const char *fname) { + _cleanup_free_ char *d = NULL; + struct stat st; + + assert(config); + assert(fd >= 0); + assert(fname); + + /* So, here's the thing: because of the mess around /efi/ vs. /boot/ vs. /boot/efi/ it might be that + * people have these dirs, or subdirs of them symlinked or bind mounted, and we might end up + * iterating through some dirs multiple times. Let's thus rather be safe than sorry, and track the + * inodes we already processed: let's ignore inodes we have seen already. 
This should be robust + * against any form of symlinking or bind mounting, and effectively suppress any such duplicates. */ + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat('%s'): %m", fname); + if (!S_ISREG(st.st_mode)) { + log_debug("File '%s' is not a regular file, ignoring.", fname); + return false; + } + + if (set_contains(config->inodes_seen, &st)) { + log_debug("Inode '%s' already seen before, ignoring.", fname); + return false; + } + + d = memdup(&st, sizeof(st)); + if (!d) + return log_oom(); + + if (set_ensure_consume(&config->inodes_seen, &inode_hash_ops, TAKE_PTR(d)) < 0) + return log_oom(); + + return true; +} + +static int boot_entries_find_type1( + BootConfig *config, + const char *root, + const char *dir) { + + _cleanup_free_ DirectoryEntries *dentries = NULL; + _cleanup_free_ char *full = NULL; + _cleanup_close_ int dir_fd = -EBADF; + int r; + + assert(config); + assert(root); + assert(dir); + + dir_fd = chase_and_open(dir, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, O_DIRECTORY|O_CLOEXEC, &full); + if (dir_fd == -ENOENT) + return 0; + if (dir_fd < 0) + return log_error_errno(dir_fd, "Failed to open '%s/%s': %m", root, dir); + + r = readdir_all(dir_fd, RECURSE_DIR_IGNORE_DOT, &dentries); + if (r < 0) + return log_error_errno(r, "Failed to read directory '%s': %m", full); + + for (size_t i = 0; i < dentries->n_entries; i++) { + const struct dirent *de = dentries->entries[i]; + _cleanup_fclose_ FILE *f = NULL; + + if (!dirent_is_file(de)) + continue; + + if (!endswith_no_case(de->d_name, ".conf")) + continue; + + r = xfopenat(dir_fd, de->d_name, "re", O_NOFOLLOW|O_NOCTTY, &f); + if (r < 0) { + log_warning_errno(r, "Failed to open %s/%s, ignoring: %m", full, de->d_name); + continue; + } + + r = config_check_inode_relevant_and_unseen(config, fileno(f), de->d_name); + if (r < 0) + return r; + if (r == 0) /* inode already seen or otherwise not relevant */ + continue; + + r = boot_config_load_type1(config, f, root, full, 
de->d_name); + if (r == -ENOMEM) /* ignore all other errors */ + return r; + } + + return 0; +} + +static int boot_entry_load_unified( + const char *root, + const char *path, + const char *osrelease, + const char *cmdline, + BootEntry *ret) { + + _cleanup_free_ char *fname = NULL, *os_pretty_name = NULL, *os_image_id = NULL, *os_name = NULL, *os_id = NULL, + *os_image_version = NULL, *os_version = NULL, *os_version_id = NULL, *os_build_id = NULL; + _cleanup_(boot_entry_free) BootEntry tmp = BOOT_ENTRY_INIT(BOOT_ENTRY_UNIFIED); + const char *k, *good_name, *good_version, *good_sort_key; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(root); + assert(path); + assert(osrelease); + + k = path_startswith(path, root); + if (!k) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not below root: %s", path); + + f = fmemopen_unlocked((void*) osrelease, strlen(osrelease), "r"); + if (!f) + return log_error_errno(errno, "Failed to open os-release buffer: %m"); + + r = parse_env_file(f, "os-release", + "PRETTY_NAME", &os_pretty_name, + "IMAGE_ID", &os_image_id, + "NAME", &os_name, + "ID", &os_id, + "IMAGE_VERSION", &os_image_version, + "VERSION", &os_version, + "VERSION_ID", &os_version_id, + "BUILD_ID", &os_build_id); + if (r < 0) + return log_error_errno(r, "Failed to parse os-release data from unified kernel image %s: %m", path); + + if (!bootspec_pick_name_version_sort_key( + os_pretty_name, + os_image_id, + os_name, + os_id, + os_image_version, + os_version, + os_version_id, + os_build_id, + &good_name, + &good_version, + &good_sort_key)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Missing fields in os-release data from unified kernel image %s, refusing.", path); + + r = path_extract_filename(path, &fname); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", path); + + r = boot_filename_extract_tries(fname, &tmp.id, &tmp.tries_left, &tmp.tries_done); + if (r < 0) + return r; + + if 
(!efi_loader_entry_name_valid(tmp.id)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid loader entry name: %s", tmp.id); + + if (os_id && os_version_id) { + tmp.id_old = strjoin(os_id, "-", os_version_id); + if (!tmp.id_old) + return log_oom(); + } + + tmp.path = strdup(path); + if (!tmp.path) + return log_oom(); + + tmp.root = strdup(root); + if (!tmp.root) + return log_oom(); + + tmp.kernel = path_make_absolute(k, "/"); + if (!tmp.kernel) + return log_oom(); + + tmp.options = strv_new(skip_leading_chars(cmdline, WHITESPACE)); + if (!tmp.options) + return log_oom(); + + delete_trailing_chars(tmp.options[0], WHITESPACE); + + tmp.title = strdup(good_name); + if (!tmp.title) + return log_oom(); + + if (good_sort_key) { + tmp.sort_key = strdup(good_sort_key); + if (!tmp.sort_key) + return log_oom(); + } + + if (good_version) { + tmp.version = strdup(good_version); + if (!tmp.version) + return log_oom(); + } + + *ret = TAKE_STRUCT(tmp); + return 0; +} + +/* Maximum PE section we are willing to load (Note that sections we are not interested in may be larger, but + * the ones we do care about and we are willing to load into memory have this size limit.) 
*/ +#define PE_SECTION_SIZE_MAX (4U*1024U*1024U) + /* Loads the PE headers and section table of the file open at fd, verifies that it is a UKI and reads its .osrel and (optional) .cmdline sections, each limited to PE_SECTION_SIZE_MAX. path is used for log messages only. On success returns 0 and hands the section buffers to *ret_osrelease and *ret_cmdline (either pointer may be NULL; *ret_cmdline is NULL if there is no .cmdline section). Returns negative on failure, after logging a warning. */ +static int find_sections( + int fd, + const char *path, + char **ret_osrelease, + char **ret_cmdline) { + + _cleanup_free_ IMAGE_SECTION_HEADER *sections = NULL; + _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL; + _cleanup_free_ char *osrel = NULL, *cmdline = NULL; + _cleanup_free_ PeHeader *pe_header = NULL; + int r; + + assert(fd >= 0); + assert(path); + + r = pe_load_headers(fd, &dos_header, &pe_header); + if (r < 0) + return log_warning_errno(r, "Failed to parse PE file '%s': %m", path); + + r = pe_load_sections(fd, dos_header, pe_header, &sections); + if (r < 0) + return log_warning_errno(r, "Failed to parse PE sections of '%s': %m", path); + + if (!pe_is_uki(pe_header, sections)) + return log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Parsed PE file '%s' is not a UKI.", path); + + r = pe_read_section_data(fd, pe_header, sections, ".osrel", PE_SECTION_SIZE_MAX, (void**) &osrel, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to read .osrel section of '%s': %m", path); + + r = pe_read_section_data(fd, pe_header, sections, ".cmdline", PE_SECTION_SIZE_MAX, (void**) &cmdline, NULL); + if (r < 0 && r != -ENXIO) /* cmdline is optional */ + return log_warning_errno(r, "Failed to read .cmdline section of '%s': %m", path); + + if (ret_osrelease) + *ret_osrelease = TAKE_PTR(osrel); + if (ret_cmdline) + *ret_cmdline = TAKE_PTR(cmdline); + + return 0; +} + +static int boot_entries_find_unified( + BootConfig *config, + const char *root, + const char *dir) { + + _cleanup_closedir_ DIR *d = NULL; + _cleanup_free_ char *full = NULL; + int r; + + assert(config); + assert(dir); + + r = chase_and_opendir(dir, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &full, &d); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to open '%s/%s': %m", root, dir); + + FOREACH_DIRENT(de, d, return log_error_errno(errno, "Failed to read %s: %m", full)) { + _cleanup_free_ char *j = NULL, *osrelease = 
NULL, *cmdline = NULL; + _cleanup_close_ int fd = -EBADF; + + if (!dirent_is_file(de)) + continue; + + if (!endswith_no_case(de->d_name, ".efi")) + continue; + + if (!GREEDY_REALLOC0(config->entries, config->n_entries + 1)) + return log_oom(); + + fd = openat(dirfd(d), de->d_name, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOFOLLOW|O_NOCTTY); + if (fd < 0) { + log_warning_errno(errno, "Failed to open %s/%s, ignoring: %m", full, de->d_name); + continue; + } + + r = config_check_inode_relevant_and_unseen(config, fd, de->d_name); + if (r < 0) + return r; + if (r == 0) /* inode already seen or otherwise not relevant */ + continue; + + j = path_join(full, de->d_name); + if (!j) + return log_oom(); + + if (find_sections(fd, j, &osrelease, &cmdline) < 0) + continue; + + r = boot_entry_load_unified(root, j, osrelease, cmdline, config->entries + config->n_entries); + if (r < 0) + continue; + + config->n_entries++; + } + + return 0; +} + +static bool find_nonunique(const BootEntry *entries, size_t n_entries, bool arr[]) { + bool non_unique = false; + + assert(entries || n_entries == 0); + assert(arr || n_entries == 0); + + for (size_t i = 0; i < n_entries; i++) + arr[i] = false; + + for (size_t i = 0; i < n_entries; i++) + for (size_t j = 0; j < n_entries; j++) + if (i != j && streq(boot_entry_title(entries + i), + boot_entry_title(entries + j))) + non_unique = arr[i] = arr[j] = true; + + return non_unique; +} + +static int boot_entries_uniquify(BootEntry *entries, size_t n_entries) { + _cleanup_free_ bool *arr = NULL; + char *s; + + assert(entries || n_entries == 0); + + if (n_entries == 0) + return 0; + + arr = new(bool, n_entries); + if (!arr) + return -ENOMEM; + + /* Find _all_ non-unique titles */ + if (!find_nonunique(entries, n_entries, arr)) + return 0; + + /* Add version to non-unique titles */ + for (size_t i = 0; i < n_entries; i++) + if (arr[i] && entries[i].version) { + if (asprintf(&s, "%s (%s)", boot_entry_title(entries + i), entries[i].version) < 0) + return -ENOMEM; + 
+ free_and_replace(entries[i].show_title, s); + } + + if (!find_nonunique(entries, n_entries, arr)) + return 0; + + /* Add machine-id to non-unique titles */ + for (size_t i = 0; i < n_entries; i++) + if (arr[i] && entries[i].machine_id) { + if (asprintf(&s, "%s (%s)", boot_entry_title(entries + i), entries[i].machine_id) < 0) + return -ENOMEM; + + free_and_replace(entries[i].show_title, s); + } + + if (!find_nonunique(entries, n_entries, arr)) + return 0; + + /* Add file name to non-unique titles */ + for (size_t i = 0; i < n_entries; i++) + if (arr[i]) { + if (asprintf(&s, "%s (%s)", boot_entry_title(entries + i), entries[i].id) < 0) + return -ENOMEM; + + free_and_replace(entries[i].show_title, s); + } + + return 0; +} + +static int boot_config_find(const BootConfig *config, const char *id) { + assert(config); + + if (!id) + return -1; + + if (id[0] == '@') { + if (!strcaseeq(id, "@saved")) + return -1; + if (!config->entry_selected) + return -1; + id = config->entry_selected; + } + + for (size_t i = 0; i < config->n_entries; i++) + if (fnmatch(id, config->entries[i].id, FNM_CASEFOLD) == 0) + return i; + + return -1; +} + +static int boot_entries_select_default(const BootConfig *config) { + int i; + + assert(config); + assert(config->entries || config->n_entries == 0); + + if (config->n_entries == 0) { + log_debug("Found no default boot entry :("); + return -1; /* -1 means "no default" */ + } + + if (config->entry_oneshot) { + i = boot_config_find(config, config->entry_oneshot); + if (i >= 0) { + log_debug("Found default: id \"%s\" is matched by LoaderEntryOneShot", + config->entries[i].id); + return i; + } + } + + if (config->entry_default) { + i = boot_config_find(config, config->entry_default); + if (i >= 0) { + log_debug("Found default: id \"%s\" is matched by LoaderEntryDefault", + config->entries[i].id); + return i; + } + } + + if (config->default_pattern) { + i = boot_config_find(config, config->default_pattern); + if (i >= 0) { + log_debug("Found default: 
id \"%s\" is matched by pattern \"%s\"", + config->entries[i].id, config->default_pattern); + return i; + } + } + + log_debug("Found default: first entry \"%s\"", config->entries[0].id); + return 0; +} + +static int boot_entries_select_selected(const BootConfig *config) { + assert(config); + assert(config->entries || config->n_entries == 0); + + if (!config->entry_selected || config->n_entries == 0) + return -1; + + return boot_config_find(config, config->entry_selected); +} + +static int boot_load_efi_entry_pointers(BootConfig *config, bool skip_efivars) { + int r; + + assert(config); + + if (skip_efivars || !is_efi_boot()) + return 0; + + /* Loads the three "pointers" to boot loader entries from their EFI variables */ + + r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntryOneShot), &config->entry_oneshot); + if (r == -ENOMEM) + return log_oom(); + if (r < 0 && !IN_SET(r, -ENOENT, -ENODATA)) + log_warning_errno(r, "Failed to read EFI variable \"LoaderEntryOneShot\", ignoring: %m"); + + r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntryDefault), &config->entry_default); + if (r == -ENOMEM) + return log_oom(); + if (r < 0 && !IN_SET(r, -ENOENT, -ENODATA)) + log_warning_errno(r, "Failed to read EFI variable \"LoaderEntryDefault\", ignoring: %m"); + + r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntrySelected), &config->entry_selected); + if (r == -ENOMEM) + return log_oom(); + if (r < 0 && !IN_SET(r, -ENOENT, -ENODATA)) + log_warning_errno(r, "Failed to read EFI variable \"LoaderEntrySelected\", ignoring: %m"); + + return 1; +} + +int boot_config_select_special_entries(BootConfig *config, bool skip_efivars) { + int r; + + assert(config); + + r = boot_load_efi_entry_pointers(config, skip_efivars); + if (r < 0) + return r; + + config->default_entry = boot_entries_select_default(config); + config->selected_entry = boot_entries_select_selected(config); + + return 0; +} + +int boot_config_finalize(BootConfig *config) { + int r; + + 
typesafe_qsort(config->entries, config->n_entries, boot_entry_compare); + + r = boot_entries_uniquify(config->entries, config->n_entries); + if (r < 0) + return log_error_errno(r, "Failed to uniquify boot entries: %m"); + + return 0; +} + /* Loads loader.conf plus the Type #1 and unified entries found below esp_path, then the Type #1 and unified entries found below xbootldr_path (either path may be NULL), and finally calls boot_config_finalize(). Returns negative on the first failure. */ +int boot_config_load( + BootConfig *config, + const char *esp_path, + const char *xbootldr_path) { + + int r; + + assert(config); + + if (esp_path) { + r = boot_loader_read_conf_path(config, esp_path, "/loader/loader.conf"); + if (r < 0) + return r; + + r = boot_entries_find_type1(config, esp_path, "/loader/entries"); + if (r < 0) + return r; + + r = boot_entries_find_unified(config, esp_path, "/EFI/Linux/"); + if (r < 0) + return r; + } + + if (xbootldr_path) { + r = boot_entries_find_type1(config, xbootldr_path, "/loader/entries"); + if (r < 0) + return r; + + r = boot_entries_find_unified(config, xbootldr_path, "/EFI/Linux/"); + if (r < 0) + return r; + } + + return boot_config_finalize(config); +} + +int boot_config_load_auto( + BootConfig *config, + const char *override_esp_path, + const char *override_xbootldr_path) { + + _cleanup_free_ char *esp_where = NULL, *xbootldr_where = NULL; + dev_t esp_devid = 0, xbootldr_devid = 0; + int r; + + assert(config); + + /* This function is similar to boot_config_load(), however we automatically search for the + * ESP and the XBOOTLDR partition unless they are explicitly specified. Also, if the user did not pass + * an ESP or XBOOTLDR path directly, let's see if /run/boot-loader-entries/ exists. If so, let's + * read data from there, as if it was an ESP (i.e. loading both entries and loader.conf data from + * it). This allows other boot loaders to pass boot loader entry information to our tools if they + * want to. 
*/ + + if (!override_esp_path && !override_xbootldr_path) { + if (access("/run/boot-loader-entries/", F_OK) >= 0) + return boot_config_load(config, "/run/boot-loader-entries/", NULL); + + if (errno != ENOENT) + return log_error_errno(errno, + "Failed to determine whether /run/boot-loader-entries/ exists: %m"); + } + + r = find_esp_and_warn(NULL, override_esp_path, /* unprivileged_mode= */ false, &esp_where, NULL, NULL, NULL, NULL, &esp_devid); + if (r < 0) /* we don't log about ENOKEY here, but propagate it, leaving it to the caller to log */ + return r; + + r = find_xbootldr_and_warn(NULL, override_xbootldr_path, /* unprivileged_mode= */ false, &xbootldr_where, NULL, &xbootldr_devid); + if (r < 0 && r != -ENOKEY) + return r; /* It's fine if the XBOOTLDR partition doesn't exist, hence we ignore ENOKEY here */ + + /* If both paths actually refer to the same device (equal device numbers), suppress the xbootldr path */ + if (esp_where && xbootldr_where && devnum_set_and_equal(esp_devid, xbootldr_devid)) + xbootldr_where = mfree(xbootldr_where); + + return boot_config_load(config, esp_where, xbootldr_where); +} + +int boot_config_augment_from_loader( + BootConfig *config, + char **found_by_loader, + bool only_auto) { + + static const char *const title_table[] = { + /* Pretty names for a few well-known automatically discovered entries. */ + "auto-osx", "macOS", + "auto-windows", "Windows Boot Manager", + "auto-efi-shell", "EFI Shell", + "auto-efi-default", "EFI Default Loader", + "auto-poweroff", "Power Off The System", + "auto-reboot", "Reboot The System", + "auto-reboot-to-firmware-setup", "Reboot Into Firmware Interface", + NULL, + }; + + assert(config); + + /* Let's add the entries discovered by the boot loader to the end of our list, unless they are + * already included there. 
*/ + + STRV_FOREACH(i, found_by_loader) { + BootEntry *existing; + _cleanup_free_ char *c = NULL, *t = NULL, *p = NULL; + + existing = boot_config_find_entry(config, *i); + if (existing) { + existing->reported_by_loader = true; + continue; + } + + if (only_auto && !startswith(*i, "auto-")) + continue; + + c = strdup(*i); + if (!c) + return log_oom(); + + STRV_FOREACH_PAIR(a, b, title_table) + if (streq(*a, *i)) { + t = strdup(*b); + if (!t) + return log_oom(); + break; + } + + p = strdup(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderEntries))); + if (!p) + return log_oom(); + + if (!GREEDY_REALLOC0(config->entries, config->n_entries + 1)) + return log_oom(); + + config->entries[config->n_entries++] = (BootEntry) { + .type = startswith(*i, "auto-") ? BOOT_ENTRY_LOADER_AUTO : BOOT_ENTRY_LOADER, + .id = TAKE_PTR(c), + .title = TAKE_PTR(t), + .path = TAKE_PTR(p), + .reported_by_loader = true, + .tries_left = UINT_MAX, + .tries_done = UINT_MAX, + }; + } + + return 0; +} + /* Returns the first entry whose id or id_old equals id according to strcaseeq_ptr(), or NULL if no entry matches. */ +BootEntry* boot_config_find_entry(BootConfig *config, const char *id) { + assert(config); + assert(id); + + for (size_t j = 0; j < config->n_entries; j++) + if (strcaseeq_ptr(config->entries[j].id, id) || + strcaseeq_ptr(config->entries[j].id_old, id)) + return config->entries + j; + + return NULL; +} + /* Prints one file line (field label, root and path p) of an entry listing. chase_and_access() checks whether p exists below root; a failure is shown inline, and the first such error is stored in *ret_status. */ +static void boot_entry_file_list( + const char *field, + const char *root, + const char *p, + int *ret_status) { + + assert(p); + assert(ret_status); + + int status = chase_and_access(p, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, F_OK, NULL); + + /* Note that this shows two '/' between the root and the file. This is intentional to highlight (in + * the absence of color support) to the user that the boot loader is only interested in the second + * part of the file. */ + printf("%13s%s %s%s/%s", strempty(field), field ? 
":" : " ", ansi_grey(), root, ansi_normal()); + + if (status < 0) { + errno = -status; + printf("%s%s%s (%m)\n", ansi_highlight_red(), p, ansi_normal()); + } else + printf("%s\n", p); + + if (*ret_status == 0 && status < 0) + *ret_status = status; +} + +int show_boot_entry( + const BootEntry *e, + bool show_as_default, + bool show_as_selected, + bool show_reported) { + + int status = 0; + + /* Returns 0 on success, negative on processing error, and positive if something is wrong with the + boot entry itself. */ + + assert(e); + + printf(" type: %s\n", + boot_entry_type_to_string(e->type)); + + printf(" title: %s%s%s", + ansi_highlight(), boot_entry_title(e), ansi_normal()); + + if (show_as_default) + printf(" %s(default)%s", + ansi_highlight_green(), ansi_normal()); + + if (show_as_selected) + printf(" %s(selected)%s", + ansi_highlight_magenta(), ansi_normal()); + + if (show_reported) { + if (e->type == BOOT_ENTRY_LOADER) + printf(" %s(reported/absent)%s", + ansi_highlight_red(), ansi_normal()); + else if (!e->reported_by_loader && e->type != BOOT_ENTRY_LOADER_AUTO) + printf(" %s(not reported/new)%s", + ansi_highlight_green(), ansi_normal()); + } + + putchar('\n'); + + if (e->id) + printf(" id: %s\n", e->id); + if (e->path) { + _cleanup_free_ char *text = NULL, *link = NULL; + + const char *p = e->root ? path_startswith(e->path, e->root) : NULL; + if (p) { + text = strjoin(ansi_grey(), e->root, "/", ansi_normal(), "/", p); + if (!text) + return log_oom(); + } + + /* Let's urlify the link to make it easy to view in an editor, but only if it is a text + * file. Unified images are PE binaries, and EFI variables are not pure text either. 
*/ + if (e->type == BOOT_ENTRY_CONF) + (void) terminal_urlify_path(e->path, text, &link); + + printf(" source: %s\n", link ?: text ?: e->path); + } + if (e->tries_left != UINT_MAX) { + printf(" tries: %u left", e->tries_left); + + if (e->tries_done != UINT_MAX) + printf("; %u done\n", e->tries_done); + else + printf("\n"); + } + + if (e->sort_key) + printf(" sort-key: %s\n", e->sort_key); + if (e->version) + printf(" version: %s\n", e->version); + if (e->machine_id) + printf(" machine-id: %s\n", e->machine_id); + if (e->architecture) + printf(" architecture: %s\n", e->architecture); + if (e->kernel) + boot_entry_file_list("linux", e->root, e->kernel, &status); + if (e->efi) + boot_entry_file_list("efi", e->root, e->efi, &status); + + STRV_FOREACH(s, e->initrd) + boot_entry_file_list(s == e->initrd ? "initrd" : NULL, + e->root, + *s, + &status); + + if (!strv_isempty(e->options)) { + _cleanup_free_ char *t = NULL, *t2 = NULL; + _cleanup_strv_free_ char **ts = NULL; + + t = strv_join(e->options, " "); + if (!t) + return log_oom(); + + ts = strv_split_newlines(t); + if (!ts) + return log_oom(); + + t2 = strv_join(ts, "\n "); + if (!t2) + return log_oom(); + + printf(" options: %s\n", t2); + } + + if (e->device_tree) + boot_entry_file_list("devicetree", e->root, e->device_tree, &status); + + STRV_FOREACH(s, e->device_tree_overlay) + boot_entry_file_list(s == e->device_tree_overlay ? 
"devicetree-overlay" : NULL, + e->root, + *s, + &status); + + return -status; +} + /* Prints every entry of config: as a single JSON array unless JSON_FORMAT_OFF is set in json_format, otherwise as text blocks produced by show_boot_entry() and separated by empty lines. Returns 0 on success, negative on failure. */ +int show_boot_entries(const BootConfig *config, JsonFormatFlags json_format) { + int r; + + assert(config); + + if (!FLAGS_SET(json_format, JSON_FORMAT_OFF)) { + _cleanup_(json_variant_unrefp) JsonVariant *array = NULL; + + for (size_t i = 0; i < config->n_entries; i++) { + _cleanup_free_ char *opts = NULL; + const BootEntry *e = config->entries + i; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + if (!strv_isempty(e->options)) { + opts = strv_join(e->options, " "); + if (!opts) + return log_oom(); + } + + r = json_variant_merge_objectb( + &v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("type", JSON_BUILD_STRING(boot_entry_type_json_to_string(e->type))), + JSON_BUILD_PAIR_CONDITION(e->id, "id", JSON_BUILD_STRING(e->id)), + JSON_BUILD_PAIR_CONDITION(e->path, "path", JSON_BUILD_STRING(e->path)), + JSON_BUILD_PAIR_CONDITION(e->root, "root", JSON_BUILD_STRING(e->root)), + JSON_BUILD_PAIR_CONDITION(e->title, "title", JSON_BUILD_STRING(e->title)), + JSON_BUILD_PAIR_CONDITION(boot_entry_title(e), "showTitle", JSON_BUILD_STRING(boot_entry_title(e))), + JSON_BUILD_PAIR_CONDITION(e->sort_key, "sortKey", JSON_BUILD_STRING(e->sort_key)), + JSON_BUILD_PAIR_CONDITION(e->version, "version", JSON_BUILD_STRING(e->version)), + JSON_BUILD_PAIR_CONDITION(e->machine_id, "machineId", JSON_BUILD_STRING(e->machine_id)), + JSON_BUILD_PAIR_CONDITION(e->architecture, "architecture", JSON_BUILD_STRING(e->architecture)), + JSON_BUILD_PAIR_CONDITION(opts, "options", JSON_BUILD_STRING(opts)), + JSON_BUILD_PAIR_CONDITION(e->kernel, "linux", JSON_BUILD_STRING(e->kernel)), + JSON_BUILD_PAIR_CONDITION(e->efi, "efi", JSON_BUILD_STRING(e->efi)), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(e->initrd), "initrd", JSON_BUILD_STRV(e->initrd)), + JSON_BUILD_PAIR_CONDITION(e->device_tree, "devicetree", JSON_BUILD_STRING(e->device_tree)), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(e->device_tree_overlay), 
"devicetreeOverlay", JSON_BUILD_STRV(e->device_tree_overlay)))); + if (r < 0) + return log_oom(); + + /* Sanitizers (only memory sanitizer?) do not like function calls with too many + * arguments and trigger false positive warnings. Let's not add too many json objects + * at once. */ + r = json_variant_merge_objectb( + &v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("isReported", JSON_BUILD_BOOLEAN(e->reported_by_loader)), + JSON_BUILD_PAIR_CONDITION(e->tries_left != UINT_MAX, "triesLeft", JSON_BUILD_UNSIGNED(e->tries_left)), + JSON_BUILD_PAIR_CONDITION(e->tries_done != UINT_MAX, "triesDone", JSON_BUILD_UNSIGNED(e->tries_done)), + JSON_BUILD_PAIR_CONDITION(config->default_entry >= 0, "isDefault", JSON_BUILD_BOOLEAN(i == (size_t) config->default_entry)), + JSON_BUILD_PAIR_CONDITION(config->selected_entry >= 0, "isSelected", JSON_BUILD_BOOLEAN(i == (size_t) config->selected_entry)))); + + if (r < 0) + return log_oom(); + + r = json_variant_append_array(&array, v); + if (r < 0) + return log_oom(); + } + + json_variant_dump(array, json_format | JSON_FORMAT_EMPTY_ARRAY, NULL, NULL); + + } else { + for (size_t n = 0; n < config->n_entries; n++) { + r = show_boot_entry( + config->entries + n, + /* show_as_default= */ n == (size_t) config->default_entry, + /* show_as_selected= */ n == (size_t) config->selected_entry, + /* show_reported= */ true); + if (r < 0) + return r; + + if (n+1 < config->n_entries) + putchar('\n'); + } + } + + return 0; +} diff --git a/src/shared/bootspec.h b/src/shared/bootspec.h new file mode 100644 index 0000000..ddd149e --- /dev/null +++ b/src/shared/bootspec.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include +#include +#include +#include + +#include "json.h" +#include "set.h" +#include "string-util.h" + +typedef enum BootEntryType { + BOOT_ENTRY_CONF, /* Boot Loader Specification Type #1 entries: *.conf files */ + BOOT_ENTRY_UNIFIED, /* Boot Loader Specification Type #2 entries: *.efi files */ + 
BOOT_ENTRY_LOADER, /* Additional entries augmented from LoaderEntries EFI variable (regular entries) */ + BOOT_ENTRY_LOADER_AUTO, /* Additional entries augmented from LoaderEntries EFI variable (special "automatic" entries) */ + _BOOT_ENTRY_TYPE_MAX, + _BOOT_ENTRY_TYPE_INVALID = -EINVAL, +} BootEntryType; + +typedef struct BootEntry { + BootEntryType type; + bool reported_by_loader; + char *id; /* This is the file basename (including extension!) */ + char *id_old; /* Old-style ID, for deduplication purposes. */ + char *path; /* This is the full path to the drop-in file */ + char *root; /* The root path in which the drop-in was found, i.e. to which 'kernel', 'efi' and 'initrd' are relative */ + char *title; + char *show_title; + char *sort_key; + char *version; + char *machine_id; + char *architecture; + char **options; + char *kernel; /* linux is #defined to 1, yikes! */ + char *efi; + char **initrd; + char *device_tree; + char **device_tree_overlay; + unsigned tries_left; + unsigned tries_done; +} BootEntry; + +#define BOOT_ENTRY_INIT(t) \ + { \ + .type = (t), \ + .tries_left = UINT_MAX, \ + .tries_done = UINT_MAX, \ + } + +typedef struct BootConfig { + char *default_pattern; + char *timeout; + char *editor; + char *auto_entries; + char *auto_firmware; + char *console_mode; + char *beep; + + char *entry_oneshot; + char *entry_default; + char *entry_selected; + + BootEntry *entries; + size_t n_entries; + + ssize_t default_entry; + ssize_t selected_entry; + + Set *inodes_seen; +} BootConfig; + +#define BOOT_CONFIG_NULL \ + { \ + .default_entry = -1, \ + .selected_entry = -1, \ + } + +const char* boot_entry_type_to_string(BootEntryType); +const char* boot_entry_type_json_to_string(BootEntryType); + +BootEntry* boot_config_find_entry(BootConfig *config, const char *id); + +static inline const BootEntry* boot_config_default_entry(const BootConfig *config) { + assert(config); + + if (config->default_entry < 0) + return NULL; + + assert((size_t) config->default_entry < 
config->n_entries); + return config->entries + config->default_entry; +} + +void boot_config_free(BootConfig *config); + +int boot_loader_read_conf(BootConfig *config, FILE *file, const char *path); + +int boot_config_load_type1( + BootConfig *config, + FILE *f, + const char *root, + const char *dir, + const char *id); + +int boot_config_finalize(BootConfig *config); +int boot_config_load(BootConfig *config, const char *esp_path, const char *xbootldr_path); +int boot_config_load_auto(BootConfig *config, const char *override_esp_path, const char *override_xbootldr_path); +int boot_config_augment_from_loader(BootConfig *config, char **list, bool only_auto); + +int boot_config_select_special_entries(BootConfig *config, bool skip_efivars); + +static inline const char* boot_entry_title(const BootEntry *entry) { + assert(entry); + + return ASSERT_PTR(entry->show_title ?: entry->title ?: entry->id); +} + +int show_boot_entry( + const BootEntry *e, + bool show_as_default, + bool show_as_selected, + bool show_reported); +int show_boot_entries( + const BootConfig *config, + JsonFormatFlags json_format); + +int boot_filename_extract_tries(const char *fname, char **ret_stripped, unsigned *ret_tries_left, unsigned *ret_tries_done); diff --git a/src/shared/bpf-compat.h b/src/shared/bpf-compat.h new file mode 100644 index 0000000..9ccb7d8 --- /dev/null +++ b/src/shared/bpf-compat.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* libbpf has been moving quickly. + * They added new symbols in the 0.x versions and shortly after removed + * deprecated symbols in 1.0. 
+ * We only need bpf_map_create and libbpf_probe_bpf_prog_type so we work + * around the incompatibility here by: + * - declaring both symbols, and looking for either depending on the libbpf + * so version we found + * - having helpers that automatically use the appropriate version behind the + * new API for easy cleanup later + * + * The advantage of doing this instead of only looking for the symbols declared at + * compile time is that we can then load either the old or the new symbols at runtime + * regardless of the version we were compiled with */ + + +/* declare the struct for libbpf <= 0.6.0 -- it causes no harm on newer versions */ +struct bpf_map_create_opts; + +/* new symbols available from 0.7.0. + * We need the symbols here: + * - after bpf_map_create_opts struct has been defined for older libbpf + * - before the compat static inline helpers that use them. + * When removing this file move these back to bpf-dlopen.h */ +extern int (*sym_bpf_map_create)(enum bpf_map_type, const char *, __u32, __u32, __u32, const struct bpf_map_create_opts *); +extern int (*sym_libbpf_probe_bpf_prog_type)(enum bpf_prog_type, const void *); + +/* compat symbols removed in libbpf 1.0 */ +extern int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags); +extern bool (*sym_bpf_probe_prog_type)(enum bpf_prog_type, __u32); + +/* helpers to use the available variant behind new API */ +static inline int compat_bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts) { + if (sym_bpf_map_create) + return sym_bpf_map_create(map_type, map_name, key_size, + value_size, max_entries, opts); + + return sym_bpf_create_map(map_type, key_size, value_size, max_entries, + 0 /* opts->map_flags, but opts is always NULL for us so skip build dependency on the type */); +} + +static inline int compat_libbpf_probe_bpf_prog_type(enum 
bpf_prog_type prog_type, const void *opts) { + if (sym_libbpf_probe_bpf_prog_type) + return sym_libbpf_probe_bpf_prog_type(prog_type, opts); + + return sym_bpf_probe_prog_type(prog_type, 0); +} diff --git a/src/shared/bpf-dlopen.c b/src/shared/bpf-dlopen.c new file mode 100644 index 0000000..15301ae --- /dev/null +++ b/src/shared/bpf-dlopen.c @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dlfcn-util.h" +#include "bpf-dlopen.h" +#include "log.h" +#include "strv.h" + +#if HAVE_LIBBPF + +/* libbpf changed types of function prototypes around, so we need to disable some type checking for older + * libbpf. We consider everything older than 0.7 too old for accurate type checks. */ +#if defined(__LIBBPF_CURRENT_VERSION_GEQ) +#if __LIBBPF_CURRENT_VERSION_GEQ(0, 7) +#define MODERN_LIBBPF 1 +#endif +#endif +#if !defined(MODERN_LIBBPF) +#define MODERN_LIBBPF 0 +#endif + +struct bpf_link* (*sym_bpf_program__attach_cgroup)(const struct bpf_program *, int); +struct bpf_link* (*sym_bpf_program__attach_lsm)(const struct bpf_program *); +int (*sym_bpf_link__fd)(const struct bpf_link *); +int (*sym_bpf_link__destroy)(struct bpf_link *); +int (*sym_bpf_map__fd)(const struct bpf_map *); +const char* (*sym_bpf_map__name)(const struct bpf_map *); +int (*sym_bpf_map_create)(enum bpf_map_type, const char *, __u32, __u32, __u32, const struct bpf_map_create_opts *); +int (*sym_bpf_map__set_max_entries)(struct bpf_map *, __u32); +int (*sym_bpf_map_update_elem)(int, const void *, const void *, __u64); +int (*sym_bpf_map_delete_elem)(int, const void *); +int (*sym_bpf_map__set_inner_map_fd)(struct bpf_map *, int); +int (*sym_bpf_object__open_skeleton)(struct bpf_object_skeleton *, const struct bpf_object_open_opts *); +int (*sym_bpf_object__load_skeleton)(struct bpf_object_skeleton *); +int (*sym_bpf_object__attach_skeleton)(struct bpf_object_skeleton *); +void (*sym_bpf_object__detach_skeleton)(struct bpf_object_skeleton *); +void 
(*sym_bpf_object__destroy_skeleton)(struct bpf_object_skeleton *); +int (*sym_libbpf_probe_bpf_prog_type)(enum bpf_prog_type, const void *); +const char* (*sym_bpf_program__name)(const struct bpf_program *); +libbpf_print_fn_t (*sym_libbpf_set_print)(libbpf_print_fn_t); +long (*sym_libbpf_get_error)(const void *); + +/* compat symbols removed in libbpf 1.0 */ +int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags); +bool (*sym_bpf_probe_prog_type)(enum bpf_prog_type, __u32); + +_printf_(2,0) +static int bpf_print_func(enum libbpf_print_level level, const char *fmt, va_list ap) { +#if !LOG_TRACE + /* libbpf logs a lot of details at its debug level, which we don't need to see. */ + if (level == LIBBPF_DEBUG) + return 0; +#endif + /* All other levels are downgraded to LOG_DEBUG */ + + /* errno is used here, on the assumption that if the log message uses %m, errno will be set to + * something useful. Otherwise, it shouldn't matter, we may pass 0 or some bogus value. */ + return log_internalv(LOG_DEBUG, errno, NULL, 0, NULL, fmt, ap); +} + +int dlopen_bpf(void) { + void *dl; + int r; + + DISABLE_WARNING_DEPRECATED_DECLARATIONS; + + dl = dlopen("libbpf.so.1", RTLD_LAZY); + if (!dl) { + /* libbpf < 1.0.0 (we rely on 0.1.0+) provide most symbols we care about, but + * unfortunately not all until 0.7.0. See bpf-compat.h for more details. 
+ * Once we consider we can assume 0.7+ is present we can just use the same symbol + * list for both files, and when we assume 1.0+ is present we can remove this dlopen */ + dl = dlopen("libbpf.so.0", RTLD_LAZY); + if (!dl) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "neither libbpf.so.1 nor libbpf.so.0 are installed: %s", dlerror()); + + /* symbols deprecated in 1.0 we use as compat */ + r = dlsym_many_or_warn( + dl, LOG_DEBUG, +#if MODERN_LIBBPF + /* Don't exist anymore in new libbpf, hence cannot type check them */ + DLSYM_ARG_FORCE(bpf_create_map), + DLSYM_ARG_FORCE(bpf_probe_prog_type)); +#else + DLSYM_ARG(bpf_create_map), + DLSYM_ARG(bpf_probe_prog_type)); +#endif + } else { + /* symbols available from 0.7.0 */ + r = dlsym_many_or_warn( + dl, LOG_DEBUG, +#if MODERN_LIBBPF + DLSYM_ARG(bpf_map_create), + DLSYM_ARG(libbpf_probe_bpf_prog_type) +#else + /* These symbols did not exist in old libbpf, hence we cannot type check them */ + DLSYM_ARG_FORCE(bpf_map_create), + DLSYM_ARG_FORCE(libbpf_probe_bpf_prog_type) +#endif + ); + } + + r = dlsym_many_or_warn( + dl, LOG_DEBUG, + DLSYM_ARG(bpf_link__destroy), + DLSYM_ARG(bpf_link__fd), + DLSYM_ARG(bpf_map__fd), + DLSYM_ARG(bpf_map__name), + DLSYM_ARG(bpf_map__set_max_entries), + DLSYM_ARG(bpf_map_update_elem), + DLSYM_ARG(bpf_map_delete_elem), + DLSYM_ARG(bpf_map__set_inner_map_fd), + DLSYM_ARG(bpf_object__open_skeleton), + DLSYM_ARG(bpf_object__load_skeleton), + DLSYM_ARG(bpf_object__attach_skeleton), + DLSYM_ARG(bpf_object__detach_skeleton), + DLSYM_ARG(bpf_object__destroy_skeleton), +#if MODERN_LIBBPF + DLSYM_ARG(bpf_program__attach_cgroup), + DLSYM_ARG(bpf_program__attach_lsm), +#else + /* libbpf added a "const" to function parameters where it should not have, ignore this type incompatibility */ + DLSYM_ARG_FORCE(bpf_program__attach_cgroup), + DLSYM_ARG_FORCE(bpf_program__attach_lsm), +#endif + DLSYM_ARG(bpf_program__name), + DLSYM_ARG(libbpf_set_print), + DLSYM_ARG(libbpf_get_error)); + if (r < 0) + 
return r; + + /* We set the print helper unconditionally. Otherwise libbpf will emit not useful log messages. */ + (void) sym_libbpf_set_print(bpf_print_func); + + REENABLE_WARNING; + + return r; +} + +#else + +int dlopen_bpf(void) { + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "libbpf support is not compiled in."); +} +#endif diff --git a/src/shared/bpf-dlopen.h b/src/shared/bpf-dlopen.h new file mode 100644 index 0000000..0750abc --- /dev/null +++ b/src/shared/bpf-dlopen.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_LIBBPF + +#include +#include + +#include "bpf-compat.h" + +extern struct bpf_link* (*sym_bpf_program__attach_cgroup)(const struct bpf_program *, int); +extern struct bpf_link* (*sym_bpf_program__attach_lsm)(const struct bpf_program *); +extern int (*sym_bpf_link__fd)(const struct bpf_link *); +extern int (*sym_bpf_link__destroy)(struct bpf_link *); +extern int (*sym_bpf_map__fd)(const struct bpf_map *); +extern const char* (*sym_bpf_map__name)(const struct bpf_map *); +extern int (*sym_bpf_map__set_max_entries)(struct bpf_map *, __u32); +extern int (*sym_bpf_map_update_elem)(int, const void *, const void *, __u64); +extern int (*sym_bpf_map_delete_elem)(int, const void *); +extern int (*sym_bpf_map__set_inner_map_fd)(struct bpf_map *, int); +/* The *_skeleton APIs are autogenerated by bpftool, the targets can be found + * in ./build/src/core/bpf/socket_bind/socket-bind.skel.h */ +extern int (*sym_bpf_object__open_skeleton)(struct bpf_object_skeleton *, const struct bpf_object_open_opts *); +extern int (*sym_bpf_object__load_skeleton)(struct bpf_object_skeleton *); +extern int (*sym_bpf_object__attach_skeleton)(struct bpf_object_skeleton *); +extern void (*sym_bpf_object__detach_skeleton)(struct bpf_object_skeleton *); +extern void (*sym_bpf_object__destroy_skeleton)(struct bpf_object_skeleton *); +extern const char* (*sym_bpf_program__name)(const struct bpf_program *); +extern 
libbpf_print_fn_t (*sym_libbpf_set_print)(libbpf_print_fn_t); +extern long (*sym_libbpf_get_error)(const void *); + +#endif + +int dlopen_bpf(void); diff --git a/src/shared/bpf-link.c b/src/shared/bpf-link.c new file mode 100644 index 0000000..fea49b2 --- /dev/null +++ b/src/shared/bpf-link.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "serialize.h" + +bool bpf_can_link_program(struct bpf_program *prog) { + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + + assert(prog); + + if (dlopen_bpf() < 0) + return false; + + /* Pass invalid cgroup fd intentionally. */ + link = sym_bpf_program__attach_cgroup(prog, /*cgroup_fd=*/-1); + + /* EBADF indicates that bpf_link is supported by kernel. */ + return sym_libbpf_get_error(link) == -EBADF; +} + +int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link) { + assert(key); + + if (!link) + return -ENOENT; + + if (sym_libbpf_get_error(link) != 0) + return -EINVAL; + + return serialize_fd(f, fds, key, sym_bpf_link__fd(link)); +} + +struct bpf_link *bpf_link_free(struct bpf_link *link) { + /* If libbpf wasn't dlopen()ed, sym_bpf_link__destroy might be unresolved (NULL), so let's not try to + * call it if link is NULL. link might also be a non-null "error pointer", but such a value can only + * originate from a call to libbpf, but that means that libbpf is available, and we can let + * bpf_link__destroy() handle it. 
*/ + if (link) + (void) sym_bpf_link__destroy(link); + + return NULL; +} diff --git a/src/shared/bpf-link.h b/src/shared/bpf-link.h new file mode 100644 index 0000000..38aa080 --- /dev/null +++ b/src/shared/bpf-link.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include +#include + +#include "fdset.h" +#include "macro.h" + +bool bpf_can_link_program(struct bpf_program *prog); + +int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link); + +struct bpf_link *bpf_link_free(struct bpf_link *p); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct bpf_link *, bpf_link_free); diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c new file mode 100644 index 0000000..bbdd4f6 --- /dev/null +++ b/src/shared/bpf-program.c @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-program.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "path-util.h" +#include "serialize.h" +#include "string-table.h" + +static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = { + [BPF_CGROUP_INET_INGRESS] = "ingress", + [BPF_CGROUP_INET_EGRESS] = "egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", + [BPF_CGROUP_SOCK_OPS] = "sock_ops", + [BPF_CGROUP_DEVICE] = "device", + [BPF_CGROUP_INET4_BIND] = "bind4", + [BPF_CGROUP_INET6_BIND] = "bind6", + [BPF_CGROUP_INET4_CONNECT] = "connect4", + [BPF_CGROUP_INET6_CONNECT] = "connect6", + [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", + [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", + [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", + [BPF_CGROUP_SYSCTL] = "sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", + [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", + [BPF_CGROUP_GETSOCKOPT] = "getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "setsockopt", +}; + 
+DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int); + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free); + +BPFProgram *bpf_program_free(BPFProgram *p) { + if (!p) + return NULL; + /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their cgroups when the last + * fd to the BPF program is closed. This has nasty side-effects since this means that abnormally terminated + * programs that attached one of their BPF programs to a cgroup will leave this program pinned for good with + * zero chance of recovery, until the cgroup is removed. This is particularly problematic if the cgroup in + * question is the root cgroup (or any other cgroup belonging to a service that cannot be restarted during + * operation, such as dbus), as the memory for the BPF program can only be reclaimed through a reboot. To + * counter this, we track closely to which cgroup a program was attached to and will detach it on our own + * whenever we close the BPF fd. */ + (void) bpf_program_cgroup_detach(p); + + safe_close(p->kernel_fd); + free(p->prog_name); + free(p->instructions); + free(p->attached_path); + + return mfree(p); +} + + /* struct bpf_prog_info info must be initialized since its value is both input and output + * for BPF_OBJ_GET_INFO_BY_FD syscall. */ +static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) { + union bpf_attr attr; + + /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when + * structured initialization is used. 
+ * Refer to https://github.com/systemd/systemd/issues/18164 + */ + zero(attr); + attr.info.bpf_fd = prog_fd; + attr.info.info_len = info_len; + attr.info.info = PTR_TO_UINT64(info); + + return RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr))); +} + +int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) { + _cleanup_(bpf_program_freep) BPFProgram *p = NULL; + _cleanup_free_ char *name = NULL; + + if (prog_name) { + if (strlen(prog_name) >= BPF_OBJ_NAME_LEN) + return -ENAMETOOLONG; + + name = strdup(prog_name); + if (!name) + return -ENOMEM; + } + + p = new(BPFProgram, 1); + if (!p) + return -ENOMEM; + + *p = (BPFProgram) { + .prog_type = prog_type, + .kernel_fd = -EBADF, + .prog_name = TAKE_PTR(name), + }; + + *ret = TAKE_PTR(p); + + return 0; +} + +int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) { + _cleanup_(bpf_program_freep) BPFProgram *p = NULL; + struct bpf_prog_info info = {}; + int r; + + assert(path); + assert(ret); + + p = new(BPFProgram, 1); + if (!p) + return -ENOMEM; + + *p = (BPFProgram) { + .prog_type = BPF_PROG_TYPE_UNSPEC, + .kernel_fd = -EBADF, + }; + + r = bpf_program_load_from_bpf_fs(p, path); + if (r < 0) + return r; + + r = bpf_program_get_info_by_fd(p->kernel_fd, &info, sizeof(info)); + if (r < 0) + return r; + + p->prog_type = info.type; + *ret = TAKE_PTR(p); + + return 0; +} + + +int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) { + + assert(p); + + if (p->kernel_fd >= 0) /* don't allow modification after we uploaded things to the kernel */ + return -EBUSY; + + if (!GREEDY_REALLOC(p->instructions, p->n_instructions + count)) + return -ENOMEM; + + memcpy(p->instructions + p->n_instructions, instructions, sizeof(struct bpf_insn) * count); + p->n_instructions += count; + + return 0; +} + +int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) { + union bpf_attr attr; + + assert(p); + + if (p->kernel_fd >= 0) { /* 
make this idempotent */ + memzero(log_buf, log_size); + return 0; + } + + // FIXME: Clang doesn't 0-pad with structured initialization, causing + // the kernel to reject the bpf_attr as invalid. See: + // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65 + // Ideally it should behave like GCC, so that we can remove these workarounds. + zero(attr); + attr.prog_type = p->prog_type; + attr.insns = PTR_TO_UINT64(p->instructions); + attr.insn_cnt = p->n_instructions; + attr.license = PTR_TO_UINT64("GPL"); + attr.log_buf = PTR_TO_UINT64(log_buf); + attr.log_level = !!log_buf; + attr.log_size = log_size; + if (p->prog_name) + strncpy(attr.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1); + + p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (p->kernel_fd < 0) + return -errno; + + return 0; +} + +int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) { + union bpf_attr attr; + + assert(p); + + if (p->kernel_fd >= 0) /* don't overwrite an assembled or loaded program */ + return -EBUSY; + + zero(attr); + attr.pathname = PTR_TO_UINT64(path); + + p->kernel_fd = bpf(BPF_OBJ_GET, &attr, sizeof(attr)); + if (p->kernel_fd < 0) + return -errno; + + return 0; +} + +int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) { + _cleanup_free_ char *copy = NULL; + _cleanup_close_ int fd = -EBADF; + union bpf_attr attr; + int r; + + assert(p); + assert(type >= 0); + assert(path); + + if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI)) + return -EINVAL; + + /* We need to track which cgroup the program is attached to, and we can only track one attachment, hence let's + * refuse this early. */ + if (p->attached_path) { + if (!path_equal(p->attached_path, path)) + return -EBUSY; + if (p->attached_type != type) + return -EBUSY; + if (p->attached_flags != flags) + return -EBUSY; + + /* Here's a shortcut: if we previously attached this program already, then we don't have to do so + * again. 
Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have + * replaced our program since the last time, hence let's reattach it again, just to be safe. In flags + * == 0 mode this is not an issue since nobody else can replace our program in that case, and in flags + * == BPF_F_ALLOW_MULTI mode any other's program would be installed in addition to ours hence ours + * would remain in effect. */ + if (flags != BPF_F_ALLOW_OVERRIDE) + return 0; + } + + /* Ensure we have a kernel object for this. */ + r = bpf_program_load_kernel(p, NULL, 0); + if (r < 0) + return r; + + copy = strdup(path); + if (!copy) + return -ENOMEM; + + fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + zero(attr); + attr.attach_type = type; + attr.target_fd = fd; + attr.attach_bpf_fd = p->kernel_fd; + attr.attach_flags = flags; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) + return -errno; + + free_and_replace(p->attached_path, copy); + p->attached_type = type; + p->attached_flags = flags; + + return 0; +} + +int bpf_program_cgroup_detach(BPFProgram *p) { + _cleanup_close_ int fd = -EBADF; + + assert(p); + + if (!p->attached_path) + return -EUNATCH; + + fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) + return -errno; + + /* If the cgroup does not exist anymore, then we don't have to explicitly detach, it got detached + * implicitly by the removal, hence don't complain */ + + } else { + union bpf_attr attr; + + zero(attr); + attr.attach_type = p->attached_type; + attr.target_fd = fd; + attr.attach_bpf_fd = p->kernel_fd; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) + return -errno; + } + + p->attached_path = mfree(p->attached_path); + + return 0; +} + +int bpf_map_new( + const char *name, + enum bpf_map_type type, + size_t key_size, + size_t value_size, + size_t max_entries, + uint32_t flags) { + + union bpf_attr attr; + const char *n = name; + + zero(attr); + 
attr.map_type = type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + attr.map_flags = flags; + + /* The map name is primarily informational for debugging purposes, and typically too short + * to carry the full unit name, hence we employ a trivial lossy escaping to make it fit + * (truncation + only alphanumerical, "." and "_" are allowed as per + * https://www.kernel.org/doc/html/next/bpf/maps.html#usage-notes) */ + for (size_t i = 0; i < sizeof(attr.map_name) - 1 && *n; i++, n++) + attr.map_name[i] = strchr(ALPHANUMERICAL ".", *n) ? *n : '_'; + + return RET_NERRNO(bpf(BPF_MAP_CREATE, &attr, sizeof(attr))); +} + +int bpf_map_update_element(int fd, const void *key, void *value) { + union bpf_attr attr; + + zero(attr); + attr.map_fd = fd; + attr.key = PTR_TO_UINT64(key); + attr.value = PTR_TO_UINT64(value); + + return RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr))); +} + +int bpf_map_lookup_element(int fd, const void *key, void *value) { + union bpf_attr attr; + + zero(attr); + attr.map_fd = fd; + attr.key = PTR_TO_UINT64(key); + attr.value = PTR_TO_UINT64(value); + + return RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr))); +} + +int bpf_program_pin(int prog_fd, const char *bpffs_path) { + union bpf_attr attr; + + zero(attr); + attr.pathname = PTR_TO_UINT64((void *) bpffs_path); + attr.bpf_fd = prog_fd; + + return RET_NERRNO(bpf(BPF_OBJ_PIN, &attr, sizeof(attr))); +} + +int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) { + struct bpf_prog_info info = {}; + int r; + + assert(ret_id); + + r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info)); + if (r < 0) + return r; + + *ret_id = info.id; + + return 0; +}; + +int bpf_program_serialize_attachment( + FILE *f, + FDSet *fds, + const char *key, + BPFProgram *p) { + + _cleanup_free_ char *escaped = NULL; + int copy, r; + + if (!p || !p->attached_path) + return 0; + + assert(p->kernel_fd >= 0); + + escaped = cescape(p->attached_path); + if 
(!escaped) + return -ENOMEM; + + copy = fdset_put_dup(fds, p->kernel_fd); + if (copy < 0) + return log_error_errno(copy, "Failed to add BPF kernel fd to serialize: %m"); + + r = serialize_item_format( + f, + key, + "%i %s %s", + copy, + bpf_cgroup_attach_type_to_string(p->attached_type), + escaped); + if (r < 0) + return r; + + /* After serialization, let's forget the fact that this program is attached. The attachment — if you + * so will — is now 'owned' by the serialization, and not us anymore. Why does that matter? Because + * of BPF's less-than-ideal lifecycle handling: to detach a program from a cgroup we have to + * explicitly do so, it's not done implicitly on close(). Now, since we are serializing here we don't + * want the program to be detached while freeing things, so that the attachment can be retained after + * deserializing again. bpf_program_free() implicitly detaches things, if attached_path is non-NULL, + * hence we set it to NULL here. */ + + p->attached_path = mfree(p->attached_path); + return 0; +} + +int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) { + BPFProgram *p; + int r; + + SET_FOREACH(p, set) { + r = bpf_program_serialize_attachment(f, fds, key, p); + if (r < 0) + return r; + } + + return 0; +} + +int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) { + _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL; + _cleanup_(bpf_program_freep) BPFProgram *p = NULL; + _cleanup_close_ int fd = -EBADF; + ssize_t l; + int ifd, at, r; + + assert(v); + assert(bpfp); + + /* Extract first word: the fd number */ + r = extract_first_word(&v, &sfd, NULL, 0); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + ifd = parse_fd(sfd); + if (ifd < 0) + return r; + + /* Extract second word: the attach type */ + r = extract_first_word(&v, &sat, NULL, 0); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + at = bpf_cgroup_attach_type_from_string(sat); + if (at < 0) 
+ return at; + + /* The rest is the path */ + if (isempty(v)) + return -EINVAL; + + l = cunescape(v, 0, &unescaped); + if (l < 0) + return l; + + fd = fdset_remove(fds, ifd); + if (fd < 0) + return fd; + + p = new(BPFProgram, 1); + if (!p) + return -ENOMEM; + + *p = (BPFProgram) { + .kernel_fd = TAKE_FD(fd), + .prog_type = BPF_PROG_TYPE_UNSPEC, + .attached_path = TAKE_PTR(unescaped), + .attached_type = at, + }; + + if (*bpfp) + bpf_program_free(*bpfp); + + *bpfp = TAKE_PTR(p); + return 0; +} + +int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) { + BPFProgram *p = NULL; + int r; + + assert(v); + assert(bpfsetp); + + r = bpf_program_deserialize_attachment(v, fds, &p); + if (r < 0) + return r; + + r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, p); + if (r < 0) + return r; + + return 0; +} diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h new file mode 100644 index 0000000..0e0b666 --- /dev/null +++ b/src/shared/bpf-program.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "fdset.h" +#include "list.h" +#include "macro.h" + +typedef struct BPFProgram BPFProgram; + +/* This encapsulates three different concepts: the loaded BPF program, the BPF code, and the attachment to a + * cgroup. Typically our BPF programs go through all three stages: we build the code, we load it, and finally + * we attach it, but it might happen that we operate with programs that aren't loaded or aren't attached, or + * where we don't have the code. */ +struct BPFProgram { + /* The loaded BPF program, if loaded */ + int kernel_fd; + uint32_t prog_type; + char *prog_name; + + /* The code of it BPF program, if known */ + size_t n_instructions; + struct bpf_insn *instructions; + + /* The cgroup path the program is attached to, if it is attached. If non-NULL bpf_program_unref() + * will detach on destruction. 
*/ + char *attached_path; + int attached_type; + uint32_t attached_flags; +}; + +int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret); +int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret); +BPFProgram *bpf_program_free(BPFProgram *p); + +int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count); +int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size); +int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path); + +int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags); +int bpf_program_cgroup_detach(BPFProgram *p); + +int bpf_program_pin(int prog_fd, const char *bpffs_path); +int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id); + +int bpf_program_serialize_attachment(FILE *f, FDSet *fds, const char *key, BPFProgram *p); +int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set); +int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp); +int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp); + +extern const struct hash_ops bpf_program_hash_ops; + +int bpf_map_new(const char *name, enum bpf_map_type type, size_t key_size, size_t value_size, + size_t max_entries, uint32_t flags); +int bpf_map_update_element(int fd, const void *key, void *value); +int bpf_map_lookup_element(int fd, const void *key, void *value); + +int bpf_cgroup_attach_type_from_string(const char *str) _pure_; +const char *bpf_cgroup_attach_type_to_string(int attach_type) _const_; + +DEFINE_TRIVIAL_CLEANUP_FUNC(BPFProgram*, bpf_program_free); diff --git a/src/shared/bridge-util.c b/src/shared/bridge-util.c new file mode 100644 index 0000000..e1a8bcb --- /dev/null +++ b/src/shared/bridge-util.c @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bridge-util.h" +#include "string-table.h" + +static const char* const 
bridge_state_table[_NETDEV_BRIDGE_STATE_MAX] = { + [NETDEV_BRIDGE_STATE_DISABLED] = "disabled", + [NETDEV_BRIDGE_STATE_LISTENING] = "listening", + [NETDEV_BRIDGE_STATE_LEARNING] = "learning", + [NETDEV_BRIDGE_STATE_FORWARDING] = "forwarding", +}; + +DEFINE_STRING_TABLE_LOOKUP(bridge_state, BridgeState); diff --git a/src/shared/bridge-util.h b/src/shared/bridge-util.h new file mode 100644 index 0000000..a60891c --- /dev/null +++ b/src/shared/bridge-util.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "conf-parser.h" + +typedef enum BridgeState { + NETDEV_BRIDGE_STATE_DISABLED = BR_STATE_DISABLED, + NETDEV_BRIDGE_STATE_LISTENING = BR_STATE_LISTENING, + NETDEV_BRIDGE_STATE_LEARNING = BR_STATE_LEARNING, + NETDEV_BRIDGE_STATE_FORWARDING = BR_STATE_FORWARDING, + NETDEV_BRIDGE_STATE_BLOCKING = BR_STATE_BLOCKING, + _NETDEV_BRIDGE_STATE_MAX, + _NETDEV_BRIDGE_STATE_INVALID = -EINVAL, +} BridgeState; + +const char *bridge_state_to_string(BridgeState d) _const_; +BridgeState bridge_state_from_string(const char *d) _pure_; diff --git a/src/shared/btrfs-util.c b/src/shared/btrfs-util.c new file mode 100644 index 0000000..b3e4b50 --- /dev/null +++ b/src/shared/btrfs-util.c @@ -0,0 +1,2164 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "chase.h" +#include "chattr-util.h" +#include "copy.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "macro.h" +#include "path-util.h" +#include "rm-rf.h" +#include "smack-util.h" +#include "sparse-endian.h" +#include "stat-util.h" +#include "string-util.h" +#include "time-util.h" + +/* WARNING: Be careful with file system ioctls! 
When we get an fd, we + * need to make sure it either refers to only a regular file or + * directory, or that it is located on btrfs, before invoking any + * btrfs ioctls. The ioctl numbers are reused by some device drivers + * (such as DRM), and hence might have bad effects when invoked on + * device nodes (that reference drivers) rather than fds to normal + * files or directories. */ + +int btrfs_is_subvol_at(int dir_fd, const char *path) { + struct stat st; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + /* On btrfs subvolumes always have the inode 256 */ + + if (fstatat(dir_fd, strempty(path), &st, isempty(path) ? AT_EMPTY_PATH : 0) < 0) + return -errno; + + if (!btrfs_might_be_subvol(&st)) + return 0; + + return is_fs_type_at(dir_fd, path, BTRFS_SUPER_MAGIC); +} + +int btrfs_subvol_set_read_only_at(int dir_fd, const char *path, bool b) { + _cleanup_close_ int fd = -EBADF; + uint64_t flags, nflags; + struct stat st; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + fd = xopenat(dir_fd, path, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0) + return fd; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!btrfs_might_be_subvol(&st)) + return -EINVAL; + + if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags) < 0) + return -errno; + + nflags = UPDATE_FLAG(flags, BTRFS_SUBVOL_RDONLY, b); + if (flags == nflags) + return 0; + + return RET_NERRNO(ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &nflags)); +} + +int btrfs_subvol_get_read_only_fd(int fd) { + uint64_t flags; + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + if (!btrfs_might_be_subvol(&st)) + return -EINVAL; + + if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags) < 0) + return -errno; + + return !!(flags & BTRFS_SUBVOL_RDONLY); +} + +int btrfs_get_block_device_at(int dir_fd, const char *path, dev_t *ret) { + struct btrfs_ioctl_fs_info_args fsi = {}; + _cleanup_close_ int fd = -EBADF; + uint64_t id; + int r; + + assert(dir_fd >= 0 || 
dir_fd == AT_FDCWD); + assert(path); + assert(ret); + + fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0) + return fd; + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + + if (ioctl(fd, BTRFS_IOC_FS_INFO, &fsi) < 0) + return -errno; + + /* We won't do this for btrfs RAID */ + if (fsi.num_devices != 1) { + *ret = 0; + return 0; + } + + for (id = 1; id <= fsi.max_id; id++) { + struct btrfs_ioctl_dev_info_args di = { + .devid = id, + }; + struct stat st; + + if (ioctl(fd, BTRFS_IOC_DEV_INFO, &di) < 0) { + if (errno == ENODEV) + continue; + + return -errno; + } + + /* For the root fs — when no initrd is involved — btrfs returns /dev/root on any kernels from + * the past few years. That sucks, as we have no API to determine the actual root then. Let's + * return a recognizable error for this case, so that the caller can maybe print a nice + * message about this. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=89721 */ + if (path_equal((char*) di.path, "/dev/root")) + return -EUCLEAN; + + if (stat((char*) di.path, &st) < 0) + return -errno; + + if (!S_ISBLK(st.st_mode)) + return -ENOTBLK; + + if (major(st.st_rdev) == 0) + return -ENODEV; + + *ret = st.st_rdev; + return 1; + } + + return -ENODEV; +} + +/* Stores in *ret the tree id that BTRFS_IOC_INO_LOOKUP reports for fd. Returns 0 on success, -ENOTTY if + * fd is not located on btrfs, and a negative errno-style error if the file system check or the ioctl + * fails. */ +int btrfs_subvol_get_id_fd(int fd, uint64_t *ret) { + struct btrfs_ioctl_ino_lookup_args args = { + .objectid = BTRFS_FIRST_FREE_OBJECTID + }; + int r; + + assert(fd >= 0); + assert(ret); + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + + if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) < 0) + return -errno; + + *ret = args.treeid; + return 0; +} + +int btrfs_subvol_get_id(int fd, const char *subvol, uint64_t *ret) { + _cleanup_close_ int subvol_fd = -EBADF; + + assert(fd >= 0); + assert(ret); + + subvol_fd = openat(fd, subvol, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (subvol_fd < 0) + return 
-errno; + + return btrfs_subvol_get_id_fd(subvol_fd, ret); +} + +/* Advances the minimum search key of args by one. Returns true on success, false if the key is already at + * its maximum and cannot be increased any further. */ +static bool btrfs_ioctl_search_args_inc(struct btrfs_ioctl_search_args *args) { + assert(args); + + /* The objectid, type and offset together make up the btrfs key, + * which is considered a single 136bit integer (64 + 8 + 64 bits) + * when comparing. This call increases that integer by one, + * carrying over from offset into type and from type into objectid. */ + + if (args->key.min_offset < UINT64_MAX) { + args->key.min_offset++; + return true; + } + + if (args->key.min_type < UINT8_MAX) { + args->key.min_type++; + args->key.min_offset = 0; + return true; + } + + if (args->key.min_objectid < UINT64_MAX) { + args->key.min_objectid++; + args->key.min_offset = 0; + args->key.min_type = 0; + return true; + } + + return false; +} + +/* Sets the minimum search key of args to the key stored in the search header h. */ +static void btrfs_ioctl_search_args_set(struct btrfs_ioctl_search_args *args, const struct btrfs_ioctl_search_header *h) { + assert(args); + assert(h); + + args->key.min_objectid = h->objectid; + args->key.min_type = h->type; + args->key.min_offset = h->offset; +} + +/* Compares the minimum with the maximum search key of args, in the order objectid, type, offset, and + * returns the CMP() result of the first component that differs. */ +static int btrfs_ioctl_search_args_compare(const struct btrfs_ioctl_search_args *args) { + int r; + + assert(args); + + /* Compare min and max */ + + r = CMP(args->key.min_objectid, args->key.max_objectid); + if (r != 0) + return r; + + r = CMP(args->key.min_type, args->key.max_type); + if (r != 0) + return r; + + return CMP(args->key.min_offset, args->key.max_offset); +} + +typedef struct BtrfsForeachIterator { + const void *p; + size_t i; +} BtrfsForeachIterator; + +/* Iterates through a series of struct btrfs_ioctl_search_header elements. 
They are unfortunately not aligned, + * hence we copy out the header from them */ +#define FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) \ + for (BtrfsForeachIterator iterator = { \ + .p = ({ \ + memcpy(&(sh), (args).buf, sizeof(struct btrfs_ioctl_search_header)); \ + (body) = (const void*) ((const uint8_t*) (args).buf + sizeof(struct btrfs_ioctl_search_header)); \ + (args).buf; \ + }), \ + }; \ + iterator.i < (args).key.nr_items; \ + iterator.i++, \ + memcpy(&(sh), iterator.p = (const uint8_t*) iterator.p + sizeof(struct btrfs_ioctl_search_header) + (sh).len, sizeof(struct btrfs_ioctl_search_header)), \ + (body) = (const void*) ((const uint8_t*) iterator.p + sizeof(struct btrfs_ioctl_search_header))) + +int btrfs_subvol_get_info_fd(int fd, uint64_t subvol_id, BtrfsSubvolInfo *ret) { + struct btrfs_ioctl_search_args args = { + /* Tree of tree roots */ + .key.tree_id = BTRFS_ROOT_TREE_OBJECTID, + + /* Look precisely for the subvolume items */ + .key.min_type = BTRFS_ROOT_ITEM_KEY, + .key.max_type = BTRFS_ROOT_ITEM_KEY, + + .key.min_offset = 0, + .key.max_offset = UINT64_MAX, + + /* No restrictions on the other components */ + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + bool found = false; + int r; + + assert(fd >= 0); + assert(ret); + + if (subvol_id == 0) { + r = btrfs_subvol_get_id_fd(fd, &subvol_id); + if (r < 0) + return r; + } else { + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + } + + args.key.min_objectid = args.key.max_objectid = subvol_id; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + args.key.nr_items = 256; + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) + return -errno; + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + /* Make sure we start the next search at least from this entry */ + btrfs_ioctl_search_args_set(&args, &sh); + + if (sh.objectid != 
subvol_id) + continue; + if (sh.type != BTRFS_ROOT_ITEM_KEY) + continue; + + /* Older versions of the struct lacked the otime setting */ + if (sh.len < offsetof(struct btrfs_root_item, otime) + sizeof(struct btrfs_timespec)) + continue; + + const struct btrfs_root_item *ri = body; + ret->otime = (usec_t) le64toh(ri->otime.sec) * USEC_PER_SEC + + (usec_t) le32toh(ri->otime.nsec) / NSEC_PER_USEC; + + ret->subvol_id = subvol_id; + ret->read_only = le64toh(ri->flags) & BTRFS_ROOT_SUBVOL_RDONLY; + + assert_cc(sizeof(ri->uuid) == sizeof(ret->uuid)); + memcpy(&ret->uuid, ri->uuid, sizeof(ret->uuid)); + memcpy(&ret->parent_uuid, ri->parent_uuid, sizeof(ret->parent_uuid)); + + found = true; + goto finish; + } + + /* Increase search key by one, to read the next item, if we can. */ + if (!btrfs_ioctl_search_args_inc(&args)) + break; + } + +finish: + return found ? 0 : -ENODATA; +} + +int btrfs_qgroup_get_quota_fd(int fd, uint64_t qgroupid, BtrfsQuotaInfo *ret) { + + struct btrfs_ioctl_search_args args = { + /* Tree of quota items */ + .key.tree_id = BTRFS_QUOTA_TREE_OBJECTID, + + /* The object ID is always 0 */ + .key.min_objectid = 0, + .key.max_objectid = 0, + + /* Look precisely for the quota items */ + .key.min_type = BTRFS_QGROUP_STATUS_KEY, + .key.max_type = BTRFS_QGROUP_LIMIT_KEY, + + /* No restrictions on the other components */ + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + bool found_info = false, found_limit = false; + int r; + + assert(fd >= 0); + assert(ret); + + if (qgroupid == 0) { + r = btrfs_subvol_get_id_fd(fd, &qgroupid); + if (r < 0) + return r; + } else { + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + } + + args.key.min_offset = args.key.max_offset = qgroupid; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + args.key.nr_items = 256; + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) { + if (errno == ENOENT) /* 
quota tree is missing: quota disabled */ + break; + + return -errno; + } + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + + /* Make sure we start the next search at least from this entry */ + btrfs_ioctl_search_args_set(&args, &sh); + + if (sh.objectid != 0) + continue; + if (sh.offset != qgroupid) + continue; + + if (sh.type == BTRFS_QGROUP_INFO_KEY) { + const struct btrfs_qgroup_info_item *qii = body; + + ret->referenced = le64toh(qii->rfer); + ret->exclusive = le64toh(qii->excl); + + found_info = true; + + } else if (sh.type == BTRFS_QGROUP_LIMIT_KEY) { + const struct btrfs_qgroup_limit_item *qli = body; + + if (le64toh(qli->flags) & BTRFS_QGROUP_LIMIT_MAX_RFER) + ret->referenced_max = le64toh(qli->max_rfer); + else + ret->referenced_max = UINT64_MAX; + + if (le64toh(qli->flags) & BTRFS_QGROUP_LIMIT_MAX_EXCL) + ret->exclusive_max = le64toh(qli->max_excl); + else + ret->exclusive_max = UINT64_MAX; + + found_limit = true; + } + + if (found_info && found_limit) + goto finish; + } + + /* Increase search key by one, to read the next item, if we can. 
*/ + if (!btrfs_ioctl_search_args_inc(&args)) + break; + } + +finish: + if (!found_limit && !found_info) + return -ENODATA; + + if (!found_info) { + ret->referenced = UINT64_MAX; + ret->exclusive = UINT64_MAX; + } + + if (!found_limit) { + ret->referenced_max = UINT64_MAX; + ret->exclusive_max = UINT64_MAX; + } + + return 0; +} + +int btrfs_qgroup_get_quota(const char *path, uint64_t qgroupid, BtrfsQuotaInfo *ret) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return btrfs_qgroup_get_quota_fd(fd, qgroupid, ret); +} + +int btrfs_subvol_find_subtree_qgroup(int fd, uint64_t subvol_id, uint64_t *ret) { + uint64_t level, lowest = UINT64_MAX, lowest_qgroupid = 0; + _cleanup_free_ uint64_t *qgroups = NULL; + int r, n; + + assert(fd >= 0); + assert(ret); + + /* This finds the "subtree" qgroup for a specific + * subvolume. This only works for subvolumes that have been + * prepared with btrfs_subvol_auto_qgroup_fd() with + * insert_intermediary_qgroup=true (or equivalent). For others + * it will return the leaf qgroup instead. The two cases may + * be distinguished via the return value, which is 1 in case + * an appropriate "subtree" qgroup was found, and 0 + * otherwise. 
*/ + + if (subvol_id == 0) { + r = btrfs_subvol_get_id_fd(fd, &subvol_id); + if (r < 0) + return r; + } + + r = btrfs_qgroupid_split(subvol_id, &level, NULL); + if (r < 0) + return r; + if (level != 0) /* Input must be a leaf qgroup */ + return -EINVAL; + + n = btrfs_qgroup_find_parents(fd, subvol_id, &qgroups); + if (n < 0) + return n; + + for (int i = 0; i < n; i++) { + uint64_t id; + + r = btrfs_qgroupid_split(qgroups[i], &level, &id); + if (r < 0) + return r; + + if (id != subvol_id) + continue; + + if (lowest == UINT64_MAX || level < lowest) { + lowest_qgroupid = qgroups[i]; + lowest = level; + } + } + + if (lowest == UINT64_MAX) { + /* No suitable higher-level qgroup found, let's return + * the leaf qgroup instead, and indicate that with the + * return value. */ + + *ret = subvol_id; + return 0; + } + + *ret = lowest_qgroupid; + return 1; +} + +int btrfs_subvol_get_subtree_quota_fd(int fd, uint64_t subvol_id, BtrfsQuotaInfo *ret) { + uint64_t qgroupid; + int r; + + assert(fd >= 0); + assert(ret); + + /* This determines the quota data of the qgroup with the + * lowest level, that shares the id part with the specified + * subvolume. 
This is useful for determining the quota data + * for entire subvolume subtrees, as long as the subtrees have + * been set up with btrfs_subvol_auto_qgroup_fd() or in a + * compatible way */ + + r = btrfs_subvol_find_subtree_qgroup(fd, subvol_id, &qgroupid); + if (r < 0) + return r; + + return btrfs_qgroup_get_quota_fd(fd, qgroupid, ret); +} + +/* Path based variant: opens path (not following a trailing symlink) and hands the resulting fd to + * btrfs_subvol_get_subtree_quota_fd(). */ +int btrfs_subvol_get_subtree_quota(const char *path, uint64_t subvol_id, BtrfsQuotaInfo *ret) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return btrfs_subvol_get_subtree_quota_fd(fd, subvol_id, ret); +} + +/* Issues BTRFS_IOC_DEFRAG on fd, but only after fd_verify_regular() accepted the fd. Returns a negative + * errno-style error on failure. */ +int btrfs_defrag_fd(int fd) { + int r; + + assert(fd >= 0); + + r = fd_verify_regular(fd); + if (r < 0) + return r; + + return RET_NERRNO(ioctl(fd, BTRFS_IOC_DEFRAG, NULL)); +} + +/* Path based variant: opens p read-write (not following a trailing symlink) and hands the resulting fd to + * btrfs_defrag_fd(). */ +int btrfs_defrag(const char *p) { + _cleanup_close_ int fd = -EBADF; + + fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return btrfs_defrag_fd(fd); +} + +int btrfs_quota_enable_fd(int fd, bool b) { + struct btrfs_ioctl_quota_ctl_args args = { + .cmd = b ? 
BTRFS_QUOTA_CTL_ENABLE : BTRFS_QUOTA_CTL_DISABLE, + }; + int r; + + assert(fd >= 0); + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + + return RET_NERRNO(ioctl(fd, BTRFS_IOC_QUOTA_CTL, &args)); +} + +int btrfs_quota_enable(const char *path, bool b) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return btrfs_quota_enable_fd(fd, b); +} + +int btrfs_qgroup_set_limit_fd(int fd, uint64_t qgroupid, uint64_t referenced_max) { + + struct btrfs_ioctl_qgroup_limit_args args = { + .lim.max_rfer = referenced_max, + .lim.flags = BTRFS_QGROUP_LIMIT_MAX_RFER, + }; + int r; + + assert(fd >= 0); + + if (qgroupid == 0) { + r = btrfs_subvol_get_id_fd(fd, &qgroupid); + if (r < 0) + return r; + } else { + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + } + + args.qgroupid = qgroupid; + + for (unsigned c = 0;; c++) { + if (ioctl(fd, BTRFS_IOC_QGROUP_LIMIT, &args) < 0) { + + if (errno == EBUSY && c < 10) { + (void) btrfs_quota_scan_wait(fd); + continue; + } + + return -errno; + } + + break; + } + + return 0; +} + +int btrfs_qgroup_set_limit(const char *path, uint64_t qgroupid, uint64_t referenced_max) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return btrfs_qgroup_set_limit_fd(fd, qgroupid, referenced_max); +} + +int btrfs_subvol_set_subtree_quota_limit_fd(int fd, uint64_t subvol_id, uint64_t referenced_max) { + uint64_t qgroupid; + int r; + + assert(fd >= 0); + + r = btrfs_subvol_find_subtree_qgroup(fd, subvol_id, &qgroupid); + if (r < 0) + return r; + + return btrfs_qgroup_set_limit_fd(fd, qgroupid, referenced_max); +} + +int btrfs_subvol_set_subtree_quota_limit(const char *path, uint64_t subvol_id, uint64_t referenced_max) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, 
O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return btrfs_subvol_set_subtree_quota_limit_fd(fd, subvol_id, referenced_max); +} + +int btrfs_qgroupid_make(uint64_t level, uint64_t id, uint64_t *ret) { + assert(ret); + + if (level >= (UINT64_C(1) << (64 - BTRFS_QGROUP_LEVEL_SHIFT))) + return -EINVAL; + + if (id >= (UINT64_C(1) << BTRFS_QGROUP_LEVEL_SHIFT)) + return -EINVAL; + + *ret = (level << BTRFS_QGROUP_LEVEL_SHIFT) | id; + return 0; +} + +int btrfs_qgroupid_split(uint64_t qgroupid, uint64_t *level, uint64_t *id) { + assert(level || id); + + if (level) + *level = qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; + + if (id) + *id = qgroupid & ((UINT64_C(1) << BTRFS_QGROUP_LEVEL_SHIFT) - 1); + + return 0; +} + +static int qgroup_create_or_destroy(int fd, bool b, uint64_t qgroupid) { + + struct btrfs_ioctl_qgroup_create_args args = { + .create = b, + .qgroupid = qgroupid, + }; + int r; + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + + for (unsigned c = 0;; c++) { + if (ioctl(fd, BTRFS_IOC_QGROUP_CREATE, &args) < 0) { + + /* On old kernels if quota is not enabled, we get EINVAL. On newer kernels we get + * ENOTCONN. Let's always convert this to ENOTCONN to make this recognizable + * everywhere the same way. */ + + if (IN_SET(errno, EINVAL, ENOTCONN)) + return -ENOTCONN; + + if (errno == EBUSY && c < 10) { + (void) btrfs_quota_scan_wait(fd); + continue; + } + + return -errno; + } + + break; + } + + return 0; +} + +int btrfs_qgroup_create(int fd, uint64_t qgroupid) { + return qgroup_create_or_destroy(fd, true, qgroupid); +} + +int btrfs_qgroup_destroy(int fd, uint64_t qgroupid) { + return qgroup_create_or_destroy(fd, false, qgroupid); +} + +int btrfs_qgroup_destroy_recursive(int fd, uint64_t qgroupid) { + _cleanup_free_ uint64_t *qgroups = NULL; + uint64_t subvol_id; + int n, r; + + /* Destroys the specified qgroup, but unassigns it from all + * its parents first. 
Also, it recursively destroys all + * qgroups it is assigned to that have the same id part of the + * qgroupid as the specified group. */ + + r = btrfs_qgroupid_split(qgroupid, NULL, &subvol_id); + if (r < 0) + return r; + + n = btrfs_qgroup_find_parents(fd, qgroupid, &qgroups); + if (n < 0) + return n; + + for (int i = 0; i < n; i++) { + uint64_t id; + + r = btrfs_qgroupid_split(qgroups[i], NULL, &id); + if (r < 0) + return r; + + r = btrfs_qgroup_unassign(fd, qgroupid, qgroups[i]); + if (r < 0) + return r; + + if (id != subvol_id) + continue; + + /* The parent qgroupid shares the same id part with + * us? If so, destroy it too. */ + + (void) btrfs_qgroup_destroy_recursive(fd, qgroups[i]); + } + + return btrfs_qgroup_destroy(fd, qgroupid); +} + +int btrfs_quota_scan_start(int fd) { + struct btrfs_ioctl_quota_rescan_args args = {}; + + assert(fd >= 0); + + return RET_NERRNO(ioctl(fd, BTRFS_IOC_QUOTA_RESCAN, &args)); +} + +int btrfs_quota_scan_wait(int fd) { + assert(fd >= 0); + + return RET_NERRNO(ioctl(fd, BTRFS_IOC_QUOTA_RESCAN_WAIT)); +} + +int btrfs_quota_scan_ongoing(int fd) { + struct btrfs_ioctl_quota_rescan_args args = {}; + + assert(fd >= 0); + + if (ioctl(fd, BTRFS_IOC_QUOTA_RESCAN_STATUS, &args) < 0) + return -errno; + + return !!args.flags; +} + +static int qgroup_assign_or_unassign(int fd, bool b, uint64_t child, uint64_t parent) { + struct btrfs_ioctl_qgroup_assign_args args = { + .assign = b, + .src = child, + .dst = parent, + }; + int r; + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + + for (unsigned c = 0;; c++) { + r = ioctl(fd, BTRFS_IOC_QGROUP_ASSIGN, &args); + if (r < 0) { + if (errno == EBUSY && c < 10) { + (void) btrfs_quota_scan_wait(fd); + continue; + } + + return -errno; + } + + if (r == 0) + return 0; + + /* If the return value is > 0, we need to request a rescan */ + + (void) btrfs_quota_scan_start(fd); + return 1; + } +} + +int btrfs_qgroup_assign(int fd, uint64_t child, uint64_t 
parent) { + return qgroup_assign_or_unassign(fd, true, child, parent); +} + +int btrfs_qgroup_unassign(int fd, uint64_t child, uint64_t parent) { + return qgroup_assign_or_unassign(fd, false, child, parent); +} + +static int subvol_remove_children(int fd, const char *subvolume, uint64_t subvol_id, BtrfsRemoveFlags flags) { + struct btrfs_ioctl_search_args args = { + .key.tree_id = BTRFS_ROOT_TREE_OBJECTID, + + .key.min_objectid = BTRFS_FIRST_FREE_OBJECTID, + .key.max_objectid = BTRFS_LAST_FREE_OBJECTID, + + .key.min_type = BTRFS_ROOT_BACKREF_KEY, + .key.max_type = BTRFS_ROOT_BACKREF_KEY, + + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + struct btrfs_ioctl_vol_args vol_args = {}; + _cleanup_close_ int subvol_fd = -EBADF; + struct stat st; + bool made_writable = false; + int r; + + assert(fd >= 0); + assert(subvolume); + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISDIR(st.st_mode)) + return -EINVAL; + + subvol_fd = openat(fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); + if (subvol_fd < 0) + return -errno; + + /* Let's check if this is actually a subvolume. Note that this is mostly redundant, as BTRFS_IOC_SNAP_DESTROY + * would fail anyway if it is not. However, it's a good thing to check this ahead of time so that we can return + * ENOTTY unconditionally in this case. This is different from the ioctl() which will return EPERM/EACCES if we + * don't have the privileges to remove subvolumes, regardless if the specified directory is actually a + * subvolume or not. In order to make it easy for callers to cover the "this is not a btrfs subvolume" case + * let's prefer ENOTTY over EPERM/EACCES though. */ + r = btrfs_is_subvol_fd(subvol_fd); + if (r < 0) + return r; + if (r == 0) /* Not a btrfs subvolume */ + return -ENOTTY; + + if (subvol_id == 0) { + r = btrfs_subvol_get_id_fd(subvol_fd, &subvol_id); + if (r < 0) + return r; + } + + /* First, try to remove the subvolume. 
If it happens to be + * already empty, this will just work. */ + strncpy(vol_args.name, subvolume, sizeof(vol_args.name)-1); + if (ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args) >= 0) { + (void) btrfs_qgroup_destroy_recursive(fd, subvol_id); /* for the leaf subvolumes, the qgroup id is identical to the subvol id */ + return 0; + } + if (!(flags & BTRFS_REMOVE_RECURSIVE) || errno != ENOTEMPTY) + return -errno; + + /* OK, the subvolume is not empty, let's look for child + * subvolumes, and remove them, first */ + + args.key.min_offset = args.key.max_offset = subvol_id; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + args.key.nr_items = 256; + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) + return -errno; + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + _cleanup_free_ char *p = NULL; + + btrfs_ioctl_search_args_set(&args, &sh); + + if (sh.type != BTRFS_ROOT_BACKREF_KEY) + continue; + if (sh.offset != subvol_id) + continue; + + const struct btrfs_root_ref *ref = body; + /* name_len is a 16bit little-endian field, dirid a 64bit one: convert each to host + * byte order with the helper of matching width. */ + p = memdup_suffix0((char*) ref + sizeof(struct btrfs_root_ref), le16toh(ref->name_len)); + if (!p) + return -ENOMEM; + + struct btrfs_ioctl_ino_lookup_args ino_args = { + .treeid = subvol_id, + .objectid = le64toh(ref->dirid), + }; + + if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &ino_args) < 0) + return -errno; + + if (!made_writable) { + r = btrfs_subvol_set_read_only_fd(subvol_fd, false); + if (r < 0) + return r; + + made_writable = true; + } + + if (isempty(ino_args.name)) + /* Subvolume is in the top-level + * directory of the subvolume. 
*/ + r = subvol_remove_children(subvol_fd, p, sh.objectid, flags); + else { + _cleanup_close_ int child_fd = -EBADF; + + /* Subvolume is somewhere further down, + * hence we need to open the + * containing directory first */ + + child_fd = openat(subvol_fd, ino_args.name, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + r = subvol_remove_children(child_fd, p, sh.objectid, flags); + } + if (r < 0) + return r; + } + + /* Increase search key by one, to read the next item, if we can. */ + if (!btrfs_ioctl_search_args_inc(&args)) + break; + } + + /* OK, the child subvolumes should all be gone now, let's try + * again to remove the subvolume */ + if (ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args) < 0) + return -errno; + + (void) btrfs_qgroup_destroy_recursive(fd, subvol_id); + return 0; +} + +int btrfs_subvol_remove_at(int dir_fd, const char *path, BtrfsRemoveFlags flags) { + _cleanup_free_ char *subvolume = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(path); + + fd = chase_and_openat(dir_fd, path, CHASE_PARENT|CHASE_EXTRACT_FILENAME, O_CLOEXEC, &subvolume); + if (fd < 0) + return fd; + + r = btrfs_validate_subvolume_name(subvolume); + if (r < 0) + return r; + + return subvol_remove_children(fd, subvolume, 0, flags); +} + +int btrfs_qgroup_copy_limits(int fd, uint64_t old_qgroupid, uint64_t new_qgroupid) { + + struct btrfs_ioctl_search_args args = { + /* Tree of quota items */ + .key.tree_id = BTRFS_QUOTA_TREE_OBJECTID, + + /* The object ID is always 0 */ + .key.min_objectid = 0, + .key.max_objectid = 0, + + /* Look precisely for the quota items */ + .key.min_type = BTRFS_QGROUP_LIMIT_KEY, + .key.max_type = BTRFS_QGROUP_LIMIT_KEY, + + /* For our qgroup */ + .key.min_offset = old_qgroupid, + .key.max_offset = old_qgroupid, + + /* No restrictions on the other components */ + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + int r; + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; 
+ if (r == 0) + return -ENOTTY; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + args.key.nr_items = 256; + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) { + if (errno == ENOENT) /* quota tree missing: quota is not enabled, hence nothing to copy */ + break; + + return -errno; + } + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + struct btrfs_ioctl_qgroup_limit_args qargs; + unsigned c; + + /* Make sure we start the next search at least from this entry */ + btrfs_ioctl_search_args_set(&args, &sh); + + if (sh.objectid != 0) + continue; + if (sh.type != BTRFS_QGROUP_LIMIT_KEY) + continue; + if (sh.offset != old_qgroupid) + continue; + + /* We found the entry, now copy things over. */ + + const struct btrfs_qgroup_limit_item *qli = body; + qargs = (struct btrfs_ioctl_qgroup_limit_args) { + .qgroupid = new_qgroupid, + + .lim.max_rfer = le64toh(qli->max_rfer), + .lim.max_excl = le64toh(qli->max_excl), + .lim.rsv_rfer = le64toh(qli->rsv_rfer), + .lim.rsv_excl = le64toh(qli->rsv_excl), + + .lim.flags = le64toh(qli->flags) & (BTRFS_QGROUP_LIMIT_MAX_RFER| + BTRFS_QGROUP_LIMIT_MAX_EXCL| + BTRFS_QGROUP_LIMIT_RSV_RFER| + BTRFS_QGROUP_LIMIT_RSV_EXCL), + }; + + for (c = 0;; c++) { + if (ioctl(fd, BTRFS_IOC_QGROUP_LIMIT, &qargs) < 0) { + if (errno == EBUSY && c < 10) { + (void) btrfs_quota_scan_wait(fd); + continue; + } + return -errno; + } + + break; + } + + return 1; + } + + /* Increase search key by one, to read the next item, if we can. 
*/ + if (!btrfs_ioctl_search_args_inc(&args)) + break; + } + + return 0; +} + +static int copy_quota_hierarchy(int fd, uint64_t old_subvol_id, uint64_t new_subvol_id) { + _cleanup_free_ uint64_t *old_qgroups = NULL, *old_parent_qgroups = NULL; + bool copy_from_parent = false, insert_intermediary_qgroup = false; + int n_old_qgroups, n_old_parent_qgroups, r; + uint64_t old_parent_id; + + assert(fd >= 0); + + /* Copies a reduced form of quota information from the old to + * the new subvolume. */ + + n_old_qgroups = btrfs_qgroup_find_parents(fd, old_subvol_id, &old_qgroups); + if (n_old_qgroups <= 0) /* Nothing to copy */ + return n_old_qgroups; + + r = btrfs_subvol_get_parent(fd, old_subvol_id, &old_parent_id); + if (r == -ENXIO) + /* We have no parent, hence nothing to copy. */ + n_old_parent_qgroups = 0; + else if (r < 0) + return r; + else { + n_old_parent_qgroups = btrfs_qgroup_find_parents(fd, old_parent_id, &old_parent_qgroups); + if (n_old_parent_qgroups < 0) + return n_old_parent_qgroups; + } + + for (int i = 0; i < n_old_qgroups; i++) { + uint64_t id; + + r = btrfs_qgroupid_split(old_qgroups[i], NULL, &id); + if (r < 0) + return r; + + if (id == old_subvol_id) { + /* The old subvolume was a member of a qgroup + * that had the same id, but a different level + * than itself. Let's set up something similar + * in the destination. */ + insert_intermediary_qgroup = true; + break; + } + + for (int j = 0; j < n_old_parent_qgroups; j++) + if (old_parent_qgroups[j] == old_qgroups[i]) + /* The old subvolume shared a common + * parent qgroup with its parent + * subvolume. Let's set up something + * similar in the destination. 
*/ + copy_from_parent = true; + } + + if (!insert_intermediary_qgroup && !copy_from_parent) + return 0; + + return btrfs_subvol_auto_qgroup_fd(fd, new_subvol_id, insert_intermediary_qgroup); +} + +static int copy_subtree_quota_limits(int fd, uint64_t old_subvol, uint64_t new_subvol) { + uint64_t old_subtree_qgroup, new_subtree_qgroup; + bool changed; + int r; + + /* First copy the leaf limits */ + r = btrfs_qgroup_copy_limits(fd, old_subvol, new_subvol); + if (r < 0) + return r; + changed = r > 0; + + /* Then, try to copy the subtree limits, if there are any. */ + r = btrfs_subvol_find_subtree_qgroup(fd, old_subvol, &old_subtree_qgroup); + if (r < 0) + return r; + if (r == 0) + return changed; + + r = btrfs_subvol_find_subtree_qgroup(fd, new_subvol, &new_subtree_qgroup); + if (r < 0) + return r; + if (r == 0) + return changed; + + r = btrfs_qgroup_copy_limits(fd, old_subtree_qgroup, new_subtree_qgroup); + if (r != 0) + return r; + + return changed; +} + +static int subvol_snapshot_children( + int old_fd, + int new_fd, + const char *subvolume, + uint64_t old_subvol_id, + BtrfsSnapshotFlags flags) { + + struct btrfs_ioctl_search_args args = { + .key.tree_id = BTRFS_ROOT_TREE_OBJECTID, + + .key.min_objectid = BTRFS_FIRST_FREE_OBJECTID, + .key.max_objectid = BTRFS_LAST_FREE_OBJECTID, + + .key.min_type = BTRFS_ROOT_BACKREF_KEY, + .key.max_type = BTRFS_ROOT_BACKREF_KEY, + + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + struct btrfs_ioctl_vol_args_v2 vol_args = { + .flags = flags & BTRFS_SNAPSHOT_READ_ONLY ? 
BTRFS_SUBVOL_RDONLY : 0, + .fd = old_fd, + }; + _cleanup_close_ int subvolume_fd = -EBADF; + uint64_t new_subvol_id; + int r; + + assert(old_fd >= 0); + assert(new_fd >= 0); + assert(subvolume); + + strncpy(vol_args.name, subvolume, sizeof(vol_args.name)-1); + + if (ioctl(new_fd, BTRFS_IOC_SNAP_CREATE_V2, &vol_args) < 0) + return -errno; + + if (FLAGS_SET(flags, BTRFS_SNAPSHOT_LOCK_BSD)) { + subvolume_fd = xopenat_lock(new_fd, subvolume, + O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW, + /* xopen_flags = */ 0, + /* mode = */ 0, + LOCK_BSD, + LOCK_EX); + if (subvolume_fd < 0) + return subvolume_fd; + + r = btrfs_is_subvol_fd(subvolume_fd); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + } + + if (!(flags & BTRFS_SNAPSHOT_RECURSIVE) && + !(flags & BTRFS_SNAPSHOT_QUOTA)) + return flags & BTRFS_SNAPSHOT_LOCK_BSD ? TAKE_FD(subvolume_fd) : 0; + + if (old_subvol_id == 0) { + r = btrfs_subvol_get_id_fd(old_fd, &old_subvol_id); + if (r < 0) + return r; + } + + r = btrfs_subvol_get_id(new_fd, vol_args.name, &new_subvol_id); + if (r < 0) + return r; + + if (flags & BTRFS_SNAPSHOT_QUOTA) + (void) copy_quota_hierarchy(new_fd, old_subvol_id, new_subvol_id); + + if (!(flags & BTRFS_SNAPSHOT_RECURSIVE)) { + + if (flags & BTRFS_SNAPSHOT_QUOTA) + (void) copy_subtree_quota_limits(new_fd, old_subvol_id, new_subvol_id); + + return flags & BTRFS_SNAPSHOT_LOCK_BSD ? 
TAKE_FD(subvolume_fd) : 0; + } + + args.key.min_offset = args.key.max_offset = old_subvol_id; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + args.key.nr_items = 256; + if (ioctl(old_fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) + return -errno; + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + _cleanup_free_ char *p = NULL, *c = NULL, *np = NULL; + _cleanup_close_ int old_child_fd = -EBADF, new_child_fd = -EBADF; + + btrfs_ioctl_search_args_set(&args, &sh); + + if (sh.type != BTRFS_ROOT_BACKREF_KEY) + continue; + + /* Avoid finding the source subvolume a second time */ + if (sh.offset != old_subvol_id) + continue; + + /* Avoid running into loops if the new subvolume is below the old one. */ + if (sh.objectid == new_subvol_id) + continue; + + const struct btrfs_root_ref *ref = body; + /* name_len is a 16bit little-endian field, dirid a 64bit one: convert each to host + * byte order with the helper of matching width. */ + p = memdup_suffix0((char*) ref + sizeof(struct btrfs_root_ref), le16toh(ref->name_len)); + if (!p) + return -ENOMEM; + + struct btrfs_ioctl_ino_lookup_args ino_args = { + .treeid = old_subvol_id, + .objectid = le64toh(ref->dirid), + }; + + if (ioctl(old_fd, BTRFS_IOC_INO_LOOKUP, &ino_args) < 0) + return -errno; + + c = path_join(ino_args.name, p); + if (!c) + return -ENOMEM; + + old_child_fd = openat(old_fd, c, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); + if (old_child_fd < 0) + return -errno; + + np = path_join(subvolume, ino_args.name); + if (!np) + return -ENOMEM; + + new_child_fd = openat(new_fd, np, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); + if (new_child_fd < 0) + return -errno; + + if (flags & BTRFS_SNAPSHOT_READ_ONLY) { + /* If the snapshot is read-only we need to mark it writable temporarily, to + * put the subsnapshot into place. 
*/ + + if (subvolume_fd < 0) { + subvolume_fd = openat(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); + if (subvolume_fd < 0) + return -errno; + } + + r = btrfs_subvol_set_read_only_fd(subvolume_fd, false); + if (r < 0) + return r; + } + + /* When btrfs clones the subvolumes, child subvolumes appear as empty + * directories. Remove them, so that we can create a new snapshot in their place */ + if (unlinkat(new_child_fd, p, AT_REMOVEDIR) < 0) { + int k = -errno; + + if (flags & BTRFS_SNAPSHOT_READ_ONLY) + (void) btrfs_subvol_set_read_only_fd(subvolume_fd, true); + + return k; + } + + r = subvol_snapshot_children(old_child_fd, new_child_fd, p, sh.objectid, + flags & ~(BTRFS_SNAPSHOT_FALLBACK_COPY|BTRFS_SNAPSHOT_LOCK_BSD)); + + /* Restore the readonly flag */ + if (flags & BTRFS_SNAPSHOT_READ_ONLY) { + int k; + + k = btrfs_subvol_set_read_only_fd(subvolume_fd, true); + if (r >= 0 && k < 0) + return k; + } + + if (r < 0) + return r; + } + + /* Increase search key by one, to read the next item, if we can. */ + if (!btrfs_ioctl_search_args_inc(&args)) + break; + } + + if (flags & BTRFS_SNAPSHOT_QUOTA) + (void) copy_subtree_quota_limits(new_fd, old_subvol_id, new_subvol_id); + + return flags & BTRFS_SNAPSHOT_LOCK_BSD ? 
TAKE_FD(subvolume_fd) : 0; +} + +int btrfs_subvol_snapshot_at_full( + int dir_fdf, + const char *from, + int dir_fdt, + const char *to, + BtrfsSnapshotFlags flags, + copy_progress_path_t progress_path, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + _cleanup_free_ char *subvolume = NULL; + _cleanup_close_ int old_fd = -EBADF, new_fd = -EBADF, subvolume_fd = -EBADF; + int r; + + assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD); + assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD); + assert(to); + + old_fd = xopenat(dir_fdf, from, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY, /* xopen_flags = */ 0, /* mode = */ 0); + if (old_fd < 0) + return old_fd; + + new_fd = chase_and_openat(dir_fdt, to, CHASE_PARENT|CHASE_EXTRACT_FILENAME, O_CLOEXEC, &subvolume); + if (new_fd < 0) + return new_fd; + + r = btrfs_validate_subvolume_name(subvolume); + if (r < 0) + return r; + + r = btrfs_is_subvol_at(dir_fdf, from); + if (r < 0) + return r; + if (r == 0) { + bool plain_directory = false; + + /* If the source isn't a proper subvolume, fail unless fallback is requested */ + if (!(flags & BTRFS_SNAPSHOT_FALLBACK_COPY)) + return -EISDIR; + + r = btrfs_subvol_make(new_fd, subvolume); + if (r < 0) { + if (ERRNO_IS_NOT_SUPPORTED(r) && (flags & BTRFS_SNAPSHOT_FALLBACK_DIRECTORY)) { + /* If the destination doesn't support subvolumes, then use a plain directory, if that's requested. 
*/ + if (mkdirat(new_fd, subvolume, 0755) < 0) + return -errno; + + plain_directory = true; + } else + return r; + } + + if (FLAGS_SET(flags, BTRFS_SNAPSHOT_LOCK_BSD)) { + subvolume_fd = xopenat_lock(new_fd, subvolume, + O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW, + /* xopen_flags = */ 0, + /* mode = */ 0, + LOCK_BSD, + LOCK_EX); + if (subvolume_fd < 0) + return subvolume_fd; + + if (!plain_directory) { + r = btrfs_is_subvol_fd(subvolume_fd); + if (r < 0) + return r; + if (r == 0) + return -EEXIST; + } + } + + r = copy_directory_at_full( + dir_fdf, from, + new_fd, subvolume, + COPY_MERGE_EMPTY| + COPY_REFLINK| + COPY_SAME_MOUNT| + COPY_HARDLINKS| + COPY_ALL_XATTRS| + (FLAGS_SET(flags, BTRFS_SNAPSHOT_SIGINT) ? COPY_SIGINT : 0)| + (FLAGS_SET(flags, BTRFS_SNAPSHOT_SIGTERM) ? COPY_SIGTERM : 0), + progress_path, + progress_bytes, + userdata); + if (r < 0) + goto fallback_fail; + + if (flags & BTRFS_SNAPSHOT_READ_ONLY) { + + if (plain_directory) { + /* Plain directories have no recursive read-only flag, but something pretty close to + * it: the IMMUTABLE bit. Let's use this here, if this is requested. */ + + if (flags & BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE) + (void) chattr_at(new_fd, subvolume, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL); + } else { + r = btrfs_subvol_set_read_only_at(new_fd, subvolume, true); + if (r < 0) + goto fallback_fail; + } + } + + return flags & BTRFS_SNAPSHOT_LOCK_BSD ? 
TAKE_FD(subvolume_fd) : 0; + + fallback_fail: + (void) rm_rf_at(new_fd, subvolume, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME); + return r; + } + + return subvol_snapshot_children(old_fd, new_fd, subvolume, 0, flags); +} + +int btrfs_qgroup_find_parents(int fd, uint64_t qgroupid, uint64_t **ret) { + + struct btrfs_ioctl_search_args args = { + /* Tree of quota items */ + .key.tree_id = BTRFS_QUOTA_TREE_OBJECTID, + + /* Look precisely for the quota relation items */ + .key.min_type = BTRFS_QGROUP_RELATION_KEY, + .key.max_type = BTRFS_QGROUP_RELATION_KEY, + + /* No restrictions on the other components */ + .key.min_offset = 0, + .key.max_offset = UINT64_MAX, + + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + _cleanup_free_ uint64_t *items = NULL; + size_t n_items = 0; + int r; + + assert(fd >= 0); + assert(ret); + + if (qgroupid == 0) { + r = btrfs_subvol_get_id_fd(fd, &qgroupid); + if (r < 0) + return r; + } else { + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + } + + args.key.min_objectid = args.key.max_objectid = qgroupid; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + _unused_ const void *body; + + args.key.nr_items = 256; + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) { + if (errno == ENOENT) /* quota tree missing: quota is disabled */ + break; + + return -errno; + } + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + + /* Make sure we start the next search at least from this entry */ + btrfs_ioctl_search_args_set(&args, &sh); + + if (sh.type != BTRFS_QGROUP_RELATION_KEY) + continue; + if (sh.offset < sh.objectid) + continue; + if (sh.objectid != qgroupid) + continue; + + if (!GREEDY_REALLOC(items, n_items+1)) + return -ENOMEM; + + items[n_items++] = sh.offset; + } + + /* Increase search key by one, to read the next item, if we can. 
*/ + if (!btrfs_ioctl_search_args_inc(&args)) + break; + } + + if (n_items <= 0) { + *ret = NULL; + return 0; + } + + *ret = TAKE_PTR(items); + + return (int) n_items; +} + +int btrfs_subvol_auto_qgroup_fd(int fd, uint64_t subvol_id, bool insert_intermediary_qgroup) { + _cleanup_free_ uint64_t *qgroups = NULL; + _cleanup_close_ int real_fd = -EBADF; + uint64_t parent_subvol; + bool changed = false; + int n = 0, r; + + assert(fd >= 0); + + /* + * Sets up the specified subvolume's qgroup automatically in + * one of two ways: + * + * If insert_intermediary_qgroup is false, the subvolume's + * leaf qgroup will be assigned to the same parent qgroups as + * the subvolume's parent subvolume. + * + * If insert_intermediary_qgroup is true a new intermediary + * higher-level qgroup is created, with a higher level number, + * but reusing the id of the subvolume. The level number is + * picked as one smaller than the lowest level qgroup the + * parent subvolume is a member of. If the parent subvolume's + * leaf qgroup is assigned to no higher-level qgroup a new + * qgroup of level 255 is created instead. Either way, the new + * qgroup is then assigned to the parent's higher-level + * qgroup, and the subvolume itself is assigned to it. + * + * If the subvolume is already assigned to a higher level + * qgroup, no operation is executed. + * + * Effectively this means: regardless if + * insert_intermediary_qgroup is true or not, after this + * function is invoked the subvolume will be accounted within + * the same qgroups as the parent. However, if it is true, it + * will also get its own higher-level qgroup, which may in + * turn be used by subvolumes created beneath this subvolume + * later on. + * + * This hence defines a simple default qgroup setup for + * subvolumes, as long as this function is invoked on each + * created subvolume: each subvolume is always accounted + * together with its immediate parents.
Optionally, if + * insert_intermediary_qgroup is true, it will also get a + * qgroup that then includes all its own child subvolumes. + */ + + /* Turn this into a proper fd, if it is currently O_PATH */ + fd = fd_reopen_condition(fd, O_RDONLY|O_CLOEXEC, O_PATH, &real_fd); + if (fd < 0) + return fd; + + if (subvol_id == 0) { + r = btrfs_is_subvol_fd(fd); + if (r < 0) + return r; + if (!r) + return -ENOTTY; + + r = btrfs_subvol_get_id_fd(fd, &subvol_id); + if (r < 0) + return r; + } + + n = btrfs_qgroup_find_parents(fd, subvol_id, &qgroups); + if (n < 0) + return n; + if (n > 0) /* already parent qgroups set up, let's bail */ + return 0; + + qgroups = mfree(qgroups); + + r = btrfs_subvol_get_parent(fd, subvol_id, &parent_subvol); + if (r == -ENXIO) + /* No parent, hence no qgroup memberships */ + n = 0; + else if (r < 0) + return r; + else { + n = btrfs_qgroup_find_parents(fd, parent_subvol, &qgroups); + if (n < 0) + return n; + } + + if (insert_intermediary_qgroup) { + uint64_t lowest = 256, new_qgroupid; + bool created = false; + + /* Determine the lowest qgroup that the parent + * subvolume is assigned to. 
*/ + + for (int i = 0; i < n; i++) { + uint64_t level; + + r = btrfs_qgroupid_split(qgroups[i], &level, NULL); + if (r < 0) + return r; + + if (level < lowest) + lowest = level; + } + + if (lowest <= 1) /* There are no levels left we could use to insert an intermediary qgroup at */ + return -EBUSY; + + r = btrfs_qgroupid_make(lowest - 1, subvol_id, &new_qgroupid); + if (r < 0) + return r; + + /* Create the new intermediary group, unless it already exists */ + r = btrfs_qgroup_create(fd, new_qgroupid); + if (r < 0 && r != -EEXIST) + return r; + if (r >= 0) + changed = created = true; + + for (int i = 0; i < n; i++) { + r = btrfs_qgroup_assign(fd, new_qgroupid, qgroups[i]); + if (r < 0 && r != -EEXIST) { + if (created) + (void) btrfs_qgroup_destroy_recursive(fd, new_qgroupid); + + return r; + } + if (r >= 0) + changed = true; + } + + r = btrfs_qgroup_assign(fd, subvol_id, new_qgroupid); + if (r < 0 && r != -EEXIST) { + if (created) + (void) btrfs_qgroup_destroy_recursive(fd, new_qgroupid); + return r; + } + if (r >= 0) + changed = true; + + } else { + int i; + + /* Assign our subvolume to all the same qgroups as the parent */ + + for (i = 0; i < n; i++) { + r = btrfs_qgroup_assign(fd, subvol_id, qgroups[i]); + if (r < 0 && r != -EEXIST) + return r; + if (r >= 0) + changed = true; + } + } + + return changed; +} + +int btrfs_subvol_auto_qgroup(const char *path, uint64_t subvol_id, bool create_intermediary_qgroup) { + _cleanup_close_ int fd = -EBADF; + + fd = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + if (fd < 0) + return -errno; + + return btrfs_subvol_auto_qgroup_fd(fd, subvol_id, create_intermediary_qgroup); +} + +int btrfs_subvol_get_parent(int fd, uint64_t subvol_id, uint64_t *ret) { + + struct btrfs_ioctl_search_args args = { + /* Tree of tree roots */ + .key.tree_id = BTRFS_ROOT_TREE_OBJECTID, + + /* Look precisely for the subvolume items */ + .key.min_type = BTRFS_ROOT_BACKREF_KEY, + .key.max_type = BTRFS_ROOT_BACKREF_KEY, + + /* No restrictions on the
other components */ + .key.min_offset = 0, + .key.max_offset = UINT64_MAX, + + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + int r; + + assert(fd >= 0); + assert(ret); + + if (subvol_id == 0) { + r = btrfs_subvol_get_id_fd(fd, &subvol_id); + if (r < 0) + return r; + } else { + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + } + + args.key.min_objectid = args.key.max_objectid = subvol_id; + + while (btrfs_ioctl_search_args_compare(&args) <= 0) { + struct btrfs_ioctl_search_header sh; + _unused_ const void *body = NULL; + + args.key.nr_items = 256; + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) + return negative_errno(); + + if (args.key.nr_items <= 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) { + + if (sh.type != BTRFS_ROOT_BACKREF_KEY) + continue; + if (sh.objectid != subvol_id) + continue; + + *ret = sh.offset; + return 0; + } + } + + return -ENXIO; +} + +int btrfs_forget_device(const char *path) { + _cleanup_close_ int control_fd = -EBADF; + struct btrfs_ioctl_vol_args args = {}; + + assert(path); + + if (strlen(path) > BTRFS_PATH_NAME_MAX) + return -E2BIG; + + strcpy(args.name, path); + + control_fd = open("/dev/btrfs-control", O_RDWR|O_CLOEXEC); + if (control_fd < 0) + return -errno; + + return RET_NERRNO(ioctl(control_fd, BTRFS_IOC_FORGET_DEV, &args)); +} + +typedef struct BtrfsStripe { + uint64_t devid; + uint64_t offset; +} BtrfsStripe; + +typedef struct BtrfsChunk { + uint64_t offset; + uint64_t length; + uint64_t type; + + BtrfsStripe *stripes; + uint16_t n_stripes; + uint64_t stripe_len; +} BtrfsChunk; + +typedef struct BtrfsChunkTree { + BtrfsChunk **chunks; + size_t n_chunks; +} BtrfsChunkTree; + +static BtrfsChunk* btrfs_chunk_free(BtrfsChunk *chunk) { + if (!chunk) + return NULL; + + free(chunk->stripes); + + return mfree(chunk); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(BtrfsChunk*, btrfs_chunk_free); + +static void btrfs_chunk_tree_done(BtrfsChunkTree *tree) { + 
assert(tree); + + FOREACH_ARRAY(i, tree->chunks, tree->n_chunks) + btrfs_chunk_free(*i); + + free(tree->chunks); +} + +static int btrfs_read_chunk_tree_fd(int fd, BtrfsChunkTree *ret) { + + struct btrfs_ioctl_search_args search_args = { + .key.tree_id = BTRFS_CHUNK_TREE_OBJECTID, + + .key.min_type = BTRFS_CHUNK_ITEM_KEY, + .key.max_type = BTRFS_CHUNK_ITEM_KEY, + + .key.min_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID, + .key.max_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID, + + .key.min_offset = 0, + .key.max_offset = UINT64_MAX, + + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + _cleanup_(btrfs_chunk_tree_done) BtrfsChunkTree tree = {}; + + assert(fd >= 0); + assert(ret); + + while (btrfs_ioctl_search_args_compare(&search_args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + search_args.key.nr_items = 256; + + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &search_args) < 0) + return -errno; + + if (search_args.key.nr_items == 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, search_args) { + _cleanup_(btrfs_chunk_freep) BtrfsChunk *chunk = NULL; + + btrfs_ioctl_search_args_set(&search_args, &sh); + + if (sh.objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID) + continue; + if (sh.type != BTRFS_CHUNK_ITEM_KEY) + continue; + + chunk = new(BtrfsChunk, 1); + if (!chunk) + return -ENOMEM; + + const struct btrfs_chunk *item = body; + *chunk = (BtrfsChunk) { + .offset = sh.offset, + .length = le64toh(item->length), + .type = le64toh(item->type), + .n_stripes = le16toh(item->num_stripes), + .stripe_len = le64toh(item->stripe_len), + }; + + chunk->stripes = new(BtrfsStripe, chunk->n_stripes); + if (!chunk->stripes) + return -ENOMEM; + + for (size_t j = 0; j < chunk->n_stripes; j++) { + const struct btrfs_stripe *stripe = &item->stripe + j; + + chunk->stripes[j] = (BtrfsStripe) { + .devid = le64toh(stripe->devid), + .offset = le64toh(stripe->offset), + }; + } + + if (!GREEDY_REALLOC(tree.chunks, tree.n_chunks + 1)) + return -ENOMEM; + + 
tree.chunks[tree.n_chunks++] = TAKE_PTR(chunk); + } + + if (!btrfs_ioctl_search_args_inc(&search_args)) + break; + } + + *ret = TAKE_STRUCT(tree); + return 0; +} + +static BtrfsChunk* btrfs_find_chunk_from_logical_address(const BtrfsChunkTree *tree, uint64_t logical) { + size_t min_index, max_index; + + assert(tree); + assert(tree->chunks || tree->n_chunks == 0); + + if (tree->n_chunks == 0) + return NULL; + + /* bisection */ + min_index = 0; + max_index = tree->n_chunks - 1; + + while (min_index <= max_index) { + size_t mid = (min_index + max_index) / 2; + + if (logical < tree->chunks[mid]->offset) { + if (mid < 1) + return NULL; + + max_index = mid - 1; + } else if (logical >= tree->chunks[mid]->offset + tree->chunks[mid]->length) + min_index = mid + 1; + else + return tree->chunks[mid]; + } + + return NULL; +} + +static int btrfs_is_nocow_fd(int fd) { + unsigned flags; + int r; + + assert(fd >= 0); + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -ENOTTY; + + r = read_attr_fd(fd, &flags); + if (r < 0) + return r; + + return FLAGS_SET(flags, FS_NOCOW_FL) && !FLAGS_SET(flags, FS_COMPR_FL); +} + +int btrfs_get_file_physical_offset_fd(int fd, uint64_t *ret) { + + struct btrfs_ioctl_search_args search_args = { + .key.min_type = BTRFS_EXTENT_DATA_KEY, + .key.max_type = BTRFS_EXTENT_DATA_KEY, + + .key.min_offset = 0, + .key.max_offset = UINT64_MAX, + + .key.min_transid = 0, + .key.max_transid = UINT64_MAX, + }; + + _cleanup_(btrfs_chunk_tree_done) BtrfsChunkTree tree = {}; + uint64_t subvol_id; + struct stat st; + int r; + + assert(fd >= 0); + assert(ret); + + if (fstat(fd, &st) < 0) + return -errno; + + r = stat_verify_regular(&st); + if (r < 0) + return r; + + r = btrfs_is_nocow_fd(fd); + if (r < 0) + return r; + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot get physical address for btrfs extent: CoW enabled"); + + r = btrfs_subvol_get_id_fd(fd, &subvol_id); + if (r < 0) + return r; + + r = 
btrfs_read_chunk_tree_fd(fd, &tree); + if (r < 0) + return r; + + search_args.key.tree_id = subvol_id; + search_args.key.min_objectid = search_args.key.max_objectid = st.st_ino; + + while (btrfs_ioctl_search_args_compare(&search_args) <= 0) { + struct btrfs_ioctl_search_header sh; + const void *body; + + search_args.key.nr_items = 256; + + if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &search_args) < 0) + return -errno; + + if (search_args.key.nr_items == 0) + break; + + FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, search_args) { + uint64_t logical_offset; + BtrfsChunk *chunk; + + btrfs_ioctl_search_args_set(&search_args, &sh); + + if (sh.type != BTRFS_EXTENT_DATA_KEY) + continue; + + if (sh.objectid != st.st_ino) + continue; + + const struct btrfs_file_extent_item *item = body; + if (!IN_SET(item->type, BTRFS_FILE_EXTENT_REG, BTRFS_FILE_EXTENT_PREALLOC)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot get physical address for btrfs extent: invalid type %" PRIu8, + item->type); + + if (item->compression != 0 || item->encryption != 0 || item->other_encoding != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot get physical address for btrfs extent: has incompatible property"); + + logical_offset = le64toh(item->disk_bytenr); + if (logical_offset == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot get physical address for btrfs extent: failed to get logical offset"); + + chunk = btrfs_find_chunk_from_logical_address(&tree, logical_offset); + if (!chunk) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot get physical address for btrfs extent: no matching chunk found"); + + if ((chunk->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot get physical address for btrfs extent: unsupported profile"); + + uint64_t relative_chunk, relative_stripe, stripe_nr; + uint16_t stripe_index; + + assert(logical_offset >= chunk->offset); + assert(chunk->n_stripes > 0); + assert(chunk->stripe_len
> 0); + + relative_chunk = logical_offset - chunk->offset; + stripe_nr = relative_chunk / chunk->stripe_len; + relative_stripe = relative_chunk - stripe_nr * chunk->stripe_len; + stripe_index = stripe_nr % chunk->n_stripes; + + *ret = chunk->stripes[stripe_index].offset + + stripe_nr / chunk->n_stripes * chunk->stripe_len + + relative_stripe; + + return 0; + } + + if (!btrfs_ioctl_search_args_inc(&search_args)) + break; + } + + return -ENODATA; +} diff --git a/src/shared/btrfs-util.h b/src/shared/btrfs-util.h new file mode 100644 index 0000000..cd80903 --- /dev/null +++ b/src/shared/btrfs-util.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <stdint.h> +#include <sys/types.h> + +#include "sd-id128.h" + +#include "btrfs.h" +#include "copy.h" +#include "time-util.h" + +typedef struct BtrfsSubvolInfo { + uint64_t subvol_id; + usec_t otime; + + sd_id128_t uuid; + sd_id128_t parent_uuid; + + bool read_only; +} BtrfsSubvolInfo; + +typedef struct BtrfsQuotaInfo { + uint64_t referenced; + uint64_t exclusive; + uint64_t referenced_max; + uint64_t exclusive_max; +} BtrfsQuotaInfo; + +typedef enum BtrfsSnapshotFlags { + BTRFS_SNAPSHOT_FALLBACK_COPY = 1 << 0, /* If the source isn't a subvolume, reflink everything */ + BTRFS_SNAPSHOT_READ_ONLY = 1 << 1, + BTRFS_SNAPSHOT_RECURSIVE = 1 << 2, + BTRFS_SNAPSHOT_QUOTA = 1 << 3, + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY = 1 << 4, /* If the destination doesn't support subvolumes, reflink/copy instead */ + BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE = 1 << 5, /* When we can't create a subvolume, use the FS_IMMUTABLE attribute for indicating read-only */ + BTRFS_SNAPSHOT_SIGINT = 1 << 6, /* Check for SIGINT regularly, and return EINTR if seen */ + BTRFS_SNAPSHOT_SIGTERM = 1 << 7, /* Ditto, but for SIGTERM */ + BTRFS_SNAPSHOT_LOCK_BSD = 1 << 8, /* Return a BSD exclusively locked file descriptor referring to snapshot subvolume/directory.
*/ +} BtrfsSnapshotFlags; + +typedef enum BtrfsRemoveFlags { + BTRFS_REMOVE_RECURSIVE = 1 << 0, + BTRFS_REMOVE_QUOTA = 1 << 1, +} BtrfsRemoveFlags; + +int btrfs_is_subvol_at(int dir_fd, const char *path); +static inline int btrfs_is_subvol_fd(int fd) { + return btrfs_is_subvol_at(fd, NULL); +} +static inline int btrfs_is_subvol(const char *path) { + return btrfs_is_subvol_at(AT_FDCWD, path); +} + +int btrfs_get_block_device_at(int dir_fd, const char *path, dev_t *ret); +static inline int btrfs_get_block_device(const char *path, dev_t *ret) { + return btrfs_get_block_device_at(AT_FDCWD, path, ret); +} +static inline int btrfs_get_block_device_fd(int fd, dev_t *ret) { + return btrfs_get_block_device_at(fd, "", ret); +} + +int btrfs_defrag_fd(int fd); +int btrfs_defrag(const char *p); + +int btrfs_quota_enable_fd(int fd, bool b); +int btrfs_quota_enable(const char *path, bool b); + +int btrfs_quota_scan_start(int fd); +int btrfs_quota_scan_wait(int fd); +int btrfs_quota_scan_ongoing(int fd); + +int btrfs_subvol_snapshot_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, BtrfsSnapshotFlags flags, copy_progress_path_t progress_path, copy_progress_bytes_t progress_bytes, void *userdata); +static inline int btrfs_subvol_snapshot_at(int dir_fdf, const char *from, int dir_fdt, const char *to, BtrfsSnapshotFlags flags) { + return btrfs_subvol_snapshot_at_full(dir_fdf, from, dir_fdt, to, flags, NULL, NULL, NULL); +} + +int btrfs_subvol_remove_at(int dir_fd, const char *path, BtrfsRemoveFlags flags); +static inline int btrfs_subvol_remove(const char *path, BtrfsRemoveFlags flags) { + return btrfs_subvol_remove_at(AT_FDCWD, path, flags); +} + +int btrfs_subvol_set_read_only_at(int dir_fd, const char *path, bool b); +static inline int btrfs_subvol_set_read_only_fd(int fd, bool b) { + return btrfs_subvol_set_read_only_at(fd, NULL, b); +} +static inline int btrfs_subvol_set_read_only(const char *path, bool b) { + return btrfs_subvol_set_read_only_at(AT_FDCWD, 
path, b); +} + +int btrfs_subvol_get_read_only_fd(int fd); + +int btrfs_subvol_get_id(int fd, const char *subvolume, uint64_t *ret); +int btrfs_subvol_get_id_fd(int fd, uint64_t *ret); +int btrfs_subvol_get_parent(int fd, uint64_t subvol_id, uint64_t *ret); + +int btrfs_subvol_get_info_fd(int fd, uint64_t subvol_id, BtrfsSubvolInfo *info); + +int btrfs_subvol_find_subtree_qgroup(int fd, uint64_t subvol_id, uint64_t *ret); + +int btrfs_subvol_get_subtree_quota(const char *path, uint64_t subvol_id, BtrfsQuotaInfo *quota); +int btrfs_subvol_get_subtree_quota_fd(int fd, uint64_t subvol_id, BtrfsQuotaInfo *quota); + +int btrfs_subvol_set_subtree_quota_limit(const char *path, uint64_t subvol_id, uint64_t referenced_max); +int btrfs_subvol_set_subtree_quota_limit_fd(int fd, uint64_t subvol_id, uint64_t referenced_max); + +int btrfs_subvol_auto_qgroup_fd(int fd, uint64_t subvol_id, bool new_qgroup); +int btrfs_subvol_auto_qgroup(const char *path, uint64_t subvol_id, bool create_intermediary_qgroup); + +int btrfs_qgroupid_make(uint64_t level, uint64_t id, uint64_t *ret); +int btrfs_qgroupid_split(uint64_t qgroupid, uint64_t *level, uint64_t *id); + +int btrfs_qgroup_create(int fd, uint64_t qgroupid); +int btrfs_qgroup_destroy(int fd, uint64_t qgroupid); +int btrfs_qgroup_destroy_recursive(int fd, uint64_t qgroupid); + +int btrfs_qgroup_set_limit_fd(int fd, uint64_t qgroupid, uint64_t referenced_max); +int btrfs_qgroup_set_limit(const char *path, uint64_t qgroupid, uint64_t referenced_max); + +int btrfs_qgroup_copy_limits(int fd, uint64_t old_qgroupid, uint64_t new_qgroupid); + +int btrfs_qgroup_assign(int fd, uint64_t child, uint64_t parent); +int btrfs_qgroup_unassign(int fd, uint64_t child, uint64_t parent); + +int btrfs_qgroup_find_parents(int fd, uint64_t qgroupid, uint64_t **ret); + +int btrfs_qgroup_get_quota_fd(int fd, uint64_t qgroupid, BtrfsQuotaInfo *quota); +int btrfs_qgroup_get_quota(const char *path, uint64_t qgroupid, BtrfsQuotaInfo *quota); + +static inline 
int btrfs_log_dev_root(int level, int ret, const char *p) { + return log_full_errno(level, ret, + "File system behind %s is reported by btrfs to be backed by pseudo-device /dev/root, which is not a valid userspace accessible device node. " + "Cannot determine correct backing block device.", p); +} + +static inline bool btrfs_might_be_subvol(const struct stat *st) { + if (!st) + return false; + + /* Returns true if this 'struct stat' looks like it could refer to a btrfs subvolume. To make a final + * decision, needs to be combined with an fstatfs() check to see if this is actually btrfs. */ + + return S_ISDIR(st->st_mode) && st->st_ino == 256; +} + +int btrfs_forget_device(const char *path); + +int btrfs_get_file_physical_offset_fd(int fd, uint64_t *ret); diff --git a/src/shared/bus-get-properties.c b/src/shared/bus-get-properties.c new file mode 100644 index 0000000..53e5d6b --- /dev/null +++ b/src/shared/bus-get-properties.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-get-properties.h" +#include "rlimit-util.h" +#include "stdio-util.h" +#include "string-util.h" + +int bus_property_get_bool( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + int b = *(bool*) userdata; + + return sd_bus_message_append_basic(reply, 'b', &b); +} + +int bus_property_set_bool( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + int b, r; + + r = sd_bus_message_read(value, "b", &b); + if (r < 0) + return r; + + *(bool*) userdata = b; + return 0; +} + +int bus_property_get_tristate( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + /* Defaults to false. 
*/ + + int b = (*(int*) userdata) > 0; + + return sd_bus_message_append_basic(reply, 'b', &b); +} + +int bus_property_get_id128( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + sd_id128_t *id = userdata; + + if (sd_id128_is_null(*id)) /* Add an empty array if the ID is zero */ + return sd_bus_message_append(reply, "ay", 0); + else + return sd_bus_message_append_array(reply, 'y', id->bytes, 16); +} + +#if __SIZEOF_SIZE_T__ != 8 +int bus_property_get_size( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t sz = *(size_t*) userdata; + + return sd_bus_message_append_basic(reply, 't', &sz); +} +#endif + +#if __SIZEOF_LONG__ != 8 +int bus_property_get_long( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + int64_t l = *(long*) userdata; + + return sd_bus_message_append_basic(reply, 'x', &l); +} + +int bus_property_get_ulong( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t ul = *(unsigned long*) userdata; + + return sd_bus_message_append_basic(reply, 't', &ul); +} +#endif + +int bus_property_get_rlimit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + const char *is_soft; + struct rlimit *rl; + uint64_t u; + rlim_t x; + + assert(bus); + assert(reply); + assert(userdata); + + is_soft = endswith(property, "Soft"); + + rl = *(struct rlimit**) userdata; + if (rl) + x = is_soft ? rl->rlim_cur : rl->rlim_max; + else { + struct rlimit buf = {}; + const char *s, *p; + int z; + + /* Chop off "Soft" suffix */ + s = is_soft ? 
strndupa_safe(property, is_soft - property) : property; + + /* Skip over any prefix, such as "Default" */ + assert_se(p = strstrafter(s, "Limit")); + + z = rlimit_from_string(p); + assert(z >= 0); + + (void) getrlimit(z, &buf); + x = is_soft ? buf.rlim_cur : buf.rlim_max; + } + + /* rlim_t might have different sizes, let's map RLIM_INFINITY to UINT64_MAX, so that it is the same on all + * archs */ + u = x == RLIM_INFINITY ? UINT64_MAX : (uint64_t) x; + + return sd_bus_message_append(reply, "t", u); +} diff --git a/src/shared/bus-get-properties.h b/src/shared/bus-get-properties.h new file mode 100644 index 0000000..4c35126 --- /dev/null +++ b/src/shared/bus-get-properties.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "macro.h" + +int bus_property_get_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int bus_property_set_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error); +int bus_property_get_tristate(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int bus_property_get_id128(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + +#define bus_property_get_usec ((sd_bus_property_get_t) NULL) +#define bus_property_set_usec ((sd_bus_property_set_t) NULL) + +assert_cc(sizeof(int) == sizeof(int32_t)); +#define bus_property_get_int ((sd_bus_property_get_t) NULL) + +assert_cc(sizeof(unsigned) == sizeof(uint32_t)); +#define bus_property_get_unsigned ((sd_bus_property_get_t) NULL) + +/* On 64-bit machines we can use the default serializer for size_t and + * friends, otherwise we need to cast this manually */ +#if __SIZEOF_SIZE_T__ == 8 +#define
bus_property_get_size ((sd_bus_property_get_t) NULL) +#else +int bus_property_get_size(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +#endif + +#if __SIZEOF_LONG__ == 8 +#define bus_property_get_long ((sd_bus_property_get_t) NULL) +#define bus_property_get_ulong ((sd_bus_property_get_t) NULL) +#else +int bus_property_get_long(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int bus_property_get_ulong(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +#endif + +/* uid_t and friends are 32 bit on Linux. This means we can just use the + * default serializer for 32-bit unsigned, for serializing it, and map + * it to NULL here */ +assert_cc(sizeof(uid_t) == sizeof(uint32_t)); +#define bus_property_get_uid ((sd_bus_property_get_t) NULL) + +assert_cc(sizeof(gid_t) == sizeof(uint32_t)); +#define bus_property_get_gid ((sd_bus_property_get_t) NULL) + +assert_cc(sizeof(pid_t) == sizeof(uint32_t)); +#define bus_property_get_pid ((sd_bus_property_get_t) NULL) + +assert_cc(sizeof(mode_t) == sizeof(uint32_t)); +#define bus_property_get_mode ((sd_bus_property_get_t) NULL) + +int bus_property_get_rlimit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + +#define BUS_DEFINE_PROPERTY_GET_GLOBAL(function, bus_type, val) \ + int function(sd_bus *bus, \ + const char *path, \ + const char *interface, \ + const char *property, \ + sd_bus_message *reply, \ + void *userdata, \ + sd_bus_error *error) { \ + \ + assert(bus); \ + assert(reply); \ + \ + return sd_bus_message_append(reply, bus_type, val); \ + } + +#define BUS_DEFINE_PROPERTY_GET2(function, bus_type, data_type, get1, get2) \ + int function(sd_bus *bus, \ + const char *path, \ + const
char *interface, \ + const char *property, \ + sd_bus_message *reply, \ + void *userdata, \ + sd_bus_error *error) { \ + \ + data_type *data = ASSERT_PTR(userdata); \ + \ + assert(bus); \ + assert(reply); \ + \ + return sd_bus_message_append(reply, bus_type, \ + get2(get1(data))); \ + } + +#define ident(x) (x) +#define BUS_DEFINE_PROPERTY_GET(function, bus_type, data_type, get1) \ + BUS_DEFINE_PROPERTY_GET2(function, bus_type, data_type, get1, ident) + +#define ref(x) (*(x)) +#define BUS_DEFINE_PROPERTY_GET_REF(function, bus_type, data_type, get) \ + BUS_DEFINE_PROPERTY_GET2(function, bus_type, data_type, ref, get) + +#define BUS_DEFINE_PROPERTY_GET_ENUM(function, name, type) \ + BUS_DEFINE_PROPERTY_GET_REF(function, "s", type, name##_to_string) + +#define BUS_PROPERTY_DUAL_TIMESTAMP(name, offset, flags) \ + SD_BUS_PROPERTY(name, "t", bus_property_get_usec, (offset) + offsetof(struct dual_timestamp, realtime), (flags)), \ + SD_BUS_PROPERTY(name "Monotonic", "t", bus_property_get_usec, (offset) + offsetof(struct dual_timestamp, monotonic), (flags)) diff --git a/src/shared/bus-locator.c b/src/shared/bus-locator.c new file mode 100644 index 0000000..ff7a872 --- /dev/null +++ b/src/shared/bus-locator.c @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-locator.h" +#include "macro.h" + +const BusLocator* const bus_home_mgr = &(BusLocator){ + .destination = "org.freedesktop.home1", + .path = "/org/freedesktop/home1", + .interface = "org.freedesktop.home1.Manager", +}; + +const BusLocator* const bus_import_mgr = &(BusLocator){ + .destination ="org.freedesktop.import1", + .path = "/org/freedesktop/import1", + .interface = "org.freedesktop.import1.Manager" +}; + +const BusLocator* const bus_locale = &(BusLocator){ + .destination = "org.freedesktop.locale1", + .path = "/org/freedesktop/locale1", + .interface = "org.freedesktop.locale1" +}; + +const BusLocator* const bus_login_mgr = &(BusLocator){ + .destination = "org.freedesktop.login1", + 
.path = "/org/freedesktop/login1", + .interface = "org.freedesktop.login1.Manager" +}; + +const BusLocator* const bus_machine_mgr = &(BusLocator){ + .destination ="org.freedesktop.machine1", + .path = "/org/freedesktop/machine1", + .interface = "org.freedesktop.machine1.Manager" +}; + +const BusLocator* const bus_network_mgr = &(BusLocator){ + .destination = "org.freedesktop.network1", + .path = "/org/freedesktop/network1", + .interface = "org.freedesktop.network1.Manager" +}; + +const BusLocator* const bus_oom_mgr = &(BusLocator){ + .destination = "org.freedesktop.oom1", + .path = "/org/freedesktop/oom1", + .interface = "org.freedesktop.oom1.Manager" +}; + +const BusLocator* const bus_portable_mgr = &(BusLocator){ + .destination = "org.freedesktop.portable1", + .path = "/org/freedesktop/portable1", + .interface = "org.freedesktop.portable1.Manager" +}; + +const BusLocator* const bus_resolve_mgr = &(BusLocator){ + .destination = "org.freedesktop.resolve1", + .path = "/org/freedesktop/resolve1", + .interface = "org.freedesktop.resolve1.Manager" +}; + +const BusLocator* const bus_systemd_mgr = &(BusLocator){ + .destination = "org.freedesktop.systemd1", + .path = "/org/freedesktop/systemd1", + .interface = "org.freedesktop.systemd1.Manager" +}; + +const BusLocator* const bus_timedate = &(BusLocator){ + .destination = "org.freedesktop.timedate1", + .path = "/org/freedesktop/timedate1", + .interface = "org.freedesktop.timedate1" +}; + +const BusLocator* const bus_timesync_mgr = &(BusLocator){ + .destination = "org.freedesktop.timesync1", + .path = "/org/freedesktop/timesync1", + .interface = "org.freedesktop.timesync1.Manager" +}; + +const BusLocator* const bus_hostname = &(BusLocator){ + .destination = "org.freedesktop.hostname1", + .path = "/org/freedesktop/hostname1", + .interface = "org.freedesktop.hostname1" +}; + +/* Shorthand flavors of the sd-bus convenience helpers with destination,path,interface strings encapsulated + * within a single struct. 
*/ +int bus_call_method_async( + sd_bus *bus, + sd_bus_slot **slot, + const BusLocator *locator, + const char *member, + sd_bus_message_handler_t callback, + void *userdata, + const char *types, ...) { + + va_list ap; + int r; + + assert(locator); + + va_start(ap, types); + r = sd_bus_call_method_asyncv(bus, slot, locator->destination, locator->path, locator->interface, member, callback, userdata, types, ap); + va_end(ap); + + return r; +} + +int bus_call_method( + sd_bus *bus, + const BusLocator *locator, + const char *member, + sd_bus_error *error, + sd_bus_message **reply, + const char *types, ...) { + + va_list ap; + int r; + + assert(locator); + + va_start(ap, types); + r = sd_bus_call_methodv(bus, locator->destination, locator->path, locator->interface, member, error, reply, types, ap); + va_end(ap); + + return r; +} + +int bus_get_property( + sd_bus *bus, + const BusLocator *locator, + const char *member, + sd_bus_error *error, + sd_bus_message **reply, + const char *type) { + + assert(locator); + + return sd_bus_get_property(bus, locator->destination, locator->path, locator->interface, member, error, reply, type); +} + +int bus_get_property_trivial( + sd_bus *bus, + const BusLocator *locator, + const char *member, + sd_bus_error *error, + char type, void *ptr) { + + assert(locator); + + return sd_bus_get_property_trivial(bus, locator->destination, locator->path, locator->interface, member, error, type, ptr); +} + +int bus_get_property_string( + sd_bus *bus, + const BusLocator *locator, + const char *member, + sd_bus_error *error, + char **ret) { + + assert(locator); + + return sd_bus_get_property_string(bus, locator->destination, locator->path, locator->interface, member, error, ret); +} + +int bus_get_property_strv( + sd_bus *bus, + const BusLocator *locator, + const char *member, + sd_bus_error *error, + char ***ret) { + + assert(locator); + + return sd_bus_get_property_strv(bus, locator->destination, locator->path, locator->interface, member, error, 
ret); +} + +int bus_set_property( + sd_bus *bus, + const BusLocator *locator, + const char *member, + sd_bus_error *error, + const char *type, ...) { + + va_list ap; + int r; + + assert(locator); + + va_start(ap, type); + r = sd_bus_set_propertyv(bus, locator->destination, locator->path, locator->interface, member, error, type, ap); + va_end(ap); + + return r; +} + +int bus_match_signal( + sd_bus *bus, + sd_bus_slot **ret, + const BusLocator *locator, + const char *member, + sd_bus_message_handler_t callback, + void *userdata) { + + assert(locator); + + return sd_bus_match_signal(bus, ret, locator->destination, locator->path, locator->interface, member, callback, userdata); +} + +int bus_match_signal_async( + sd_bus *bus, + sd_bus_slot **ret, + const BusLocator *locator, + const char *member, + sd_bus_message_handler_t callback, + sd_bus_message_handler_t install_callback, + void *userdata) { + + assert(locator); + + return sd_bus_match_signal_async(bus, ret, locator->destination, locator->path, locator->interface, member, callback, install_callback, userdata); +} + +int bus_message_new_method_call( + sd_bus *bus, + sd_bus_message **m, + const BusLocator *locator, + const char *member) { + + assert(locator); + + return sd_bus_message_new_method_call(bus, m, locator->destination, locator->path, locator->interface, member); +} diff --git a/src/shared/bus-locator.h b/src/shared/bus-locator.h new file mode 100644 index 0000000..4f50a97 --- /dev/null +++ b/src/shared/bus-locator.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +typedef struct BusLocator { + const char *destination; + const char *path; + const char *interface; +} BusLocator; + +extern const BusLocator* const bus_home_mgr; +extern const BusLocator* const bus_hostname; +extern const BusLocator* const bus_import_mgr; +extern const BusLocator* const bus_locale; +extern const BusLocator* const bus_login_mgr; +extern const BusLocator* const 
bus_machine_mgr; +extern const BusLocator* const bus_network_mgr; +extern const BusLocator* const bus_oom_mgr; +extern const BusLocator* const bus_portable_mgr; +extern const BusLocator* const bus_resolve_mgr; +extern const BusLocator* const bus_systemd_mgr; +extern const BusLocator* const bus_timedate; +extern const BusLocator* const bus_timesync_mgr; + +/* Shorthand flavors of the sd-bus convenience helpers with destination,path,interface strings encapsulated + * within a single struct. */ +int bus_call_method_async(sd_bus *bus, sd_bus_slot **slot, const BusLocator *locator, const char *member, sd_bus_message_handler_t callback, void *userdata, const char *types, ...); +int bus_call_method(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, sd_bus_message **reply, const char *types, ...); +int bus_get_property(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, sd_bus_message **reply, const char *type); +int bus_get_property_trivial(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, char type, void *ptr); +int bus_get_property_string(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, char **ret); +int bus_get_property_strv(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, char ***ret); +int bus_set_property(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, const char *type, ...); +int bus_match_signal(sd_bus *bus, sd_bus_slot **ret, const BusLocator *locator, const char *member, sd_bus_message_handler_t callback, void *userdata); +int bus_match_signal_async(sd_bus *bus, sd_bus_slot **ret, const BusLocator *locator, const char *member, sd_bus_message_handler_t callback, sd_bus_message_handler_t install_callback, void *userdata); +int bus_message_new_method_call(sd_bus *bus, sd_bus_message **m, const BusLocator *locator, const char *member); diff --git a/src/shared/bus-log-control-api.c 
b/src/shared/bus-log-control-api.c new file mode 100644 index 0000000..40f99ac --- /dev/null +++ b/src/shared/bus-log-control-api.c @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-log-control-api.h" +#include "bus-util.h" +#include "log.h" +#include "sd-bus.h" +#include "syslog-util.h" + +int bus_property_get_log_level( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + _cleanup_free_ char *t = NULL; + int r; + + assert(bus); + assert(reply); + + r = log_level_to_string_alloc(log_get_max_level(), &t); + if (r < 0) + return r; + + return sd_bus_message_append(reply, "s", t); +} + +int bus_property_set_log_level( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + r = log_level_from_string(t); + if (r < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t); + + log_info("Setting log level to %s.", t); + log_set_max_level(r); + + return 0; +} + +BUS_DEFINE_PROPERTY_GET_GLOBAL(bus_property_get_log_target, "s", log_target_to_string(log_get_target())); + +int bus_property_set_log_target( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *value, + void *userdata, + sd_bus_error *error) { + + LogTarget target; + const char *t; + int r; + + assert(bus); + assert(value); + + r = sd_bus_message_read(value, "s", &t); + if (r < 0) + return r; + + target = log_target_from_string(t); + if (target < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t); + + log_info("Setting log target to %s.", log_target_to_string(target)); 
+ log_set_target_and_open(target); + + return 0; +} + +BUS_DEFINE_PROPERTY_GET_GLOBAL(bus_property_get_syslog_identifier, "s", program_invocation_short_name); + +static const sd_bus_vtable log_control_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, bus_property_set_log_level, 0, 0), + SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, bus_property_set_log_target, 0, 0), + SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0), + + /* One of these days we might want to add a similar, second interface to cover common service + * operations such as Reload(), Reexecute(), Exit() … and maybe some properties exposing version + * number and other meta-data of the service. */ + + SD_BUS_VTABLE_END, +}; + +const BusObjectImplementation log_control_object = { + "/org/freedesktop/LogControl1", + "org.freedesktop.LogControl1", + .vtables = BUS_VTABLES(log_control_vtable), +}; diff --git a/src/shared/bus-log-control-api.h b/src/shared/bus-log-control-api.h new file mode 100644 index 0000000..85f60a7 --- /dev/null +++ b/src/shared/bus-log-control-api.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-object.h" + +extern const BusObjectImplementation log_control_object; +static inline int bus_log_control_api_register(sd_bus *bus) { + return bus_add_implementation(bus, &log_control_object, NULL); +} + +int bus_property_get_log_level(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int bus_property_set_log_level(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error); + +int bus_property_get_log_target(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error 
*error); +int bus_property_set_log_target(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + +int bus_property_get_syslog_identifier(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); diff --git a/src/shared/bus-map-properties.c b/src/shared/bus-map-properties.c new file mode 100644 index 0000000..809759d --- /dev/null +++ b/src/shared/bus-map-properties.c @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-map-properties.h" +#include "alloc-util.h" +#include "bus-util.h" +#include "strv.h" +#include "bus-message.h" + +int bus_map_id128(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + sd_id128_t *p = userdata; + const void *v; + size_t n; + int r; + + r = sd_bus_message_read_array(m, SD_BUS_TYPE_BYTE, &v, &n); + if (r < 0) + return bus_log_parse_error_debug(r); + + if (n == 0) + *p = SD_ID128_NULL; + else if (n == 16) + memcpy((*p).bytes, v, n); + else + return -EINVAL; + + return 0; +} + +int bus_map_strv_sort(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + char ***p = userdata; + int r; + + r = sd_bus_message_read_strv_extend(m, &l); + if (r < 0) + return bus_log_parse_error_debug(r); + + r = strv_extend_strv(p, l, false); + if (r < 0) + return bus_log_parse_error_debug(r); + + strv_sort(*p); + return 0; +} + +static int map_basic(sd_bus *bus, const char *member, sd_bus_message *m, unsigned flags, sd_bus_error *error, void *userdata) { + char type; + int r; + + r = sd_bus_message_peek_type(m, &type, NULL); + if (r < 0) + return bus_log_parse_error_debug(r); + + switch (type) { + + case SD_BUS_TYPE_STRING: + case SD_BUS_TYPE_OBJECT_PATH: { + const char **p = userdata; + const char *s; + + r = sd_bus_message_read_basic(m, type, &s); + 
if (r < 0) + return bus_log_parse_error_debug(r); + + if (isempty(s)) + s = NULL; + + if (flags & BUS_MAP_STRDUP) + return free_and_strdup((char **) userdata, s); + + *p = s; + return 0; + } + + case SD_BUS_TYPE_ARRAY: { + _cleanup_strv_free_ char **l = NULL; + char ***p = userdata; + + r = sd_bus_message_read_strv_extend(m, &l); + if (r < 0) + return bus_log_parse_error_debug(r); + + return strv_extend_strv(p, l, false); + } + + case SD_BUS_TYPE_BOOLEAN: { + int b; + + r = sd_bus_message_read_basic(m, type, &b); + if (r < 0) + return bus_log_parse_error_debug(r); + + if (flags & BUS_MAP_BOOLEAN_AS_BOOL) + *(bool*) userdata = b; + else + *(int*) userdata = b; + + return 0; + } + + case SD_BUS_TYPE_INT32: + case SD_BUS_TYPE_UINT32: { + uint32_t u, *p = userdata; + + r = sd_bus_message_read_basic(m, type, &u); + if (r < 0) + return bus_log_parse_error_debug(r); + + *p = u; + return 0; + } + + case SD_BUS_TYPE_INT64: + case SD_BUS_TYPE_UINT64: { + uint64_t t, *p = userdata; + + r = sd_bus_message_read_basic(m, type, &t); + if (r < 0) + return bus_log_parse_error_debug(r); + + *p = t; + return 0; + } + + case SD_BUS_TYPE_DOUBLE: { + double d, *p = userdata; + + r = sd_bus_message_read_basic(m, type, &d); + if (r < 0) + return bus_log_parse_error_debug(r); + + *p = d; + return 0; + }} + + return -EOPNOTSUPP; +} + +int bus_message_map_all_properties( + sd_bus_message *m, + const struct bus_properties_map *map, + unsigned flags, + sd_bus_error *error, + void *userdata) { + + int r; + + assert(m); + assert(map); + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "{sv}"); + if (r < 0) + return bus_log_parse_error_debug(r); + + while ((r = sd_bus_message_enter_container(m, SD_BUS_TYPE_DICT_ENTRY, "sv")) > 0) { + const struct bus_properties_map *prop; + const char *member; + const char *contents; + void *v; + unsigned i; + + r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &member); + if (r < 0) + return bus_log_parse_error_debug(r); + + for (i = 0, prop = NULL; 
map[i].member; i++) + if (streq(map[i].member, member)) { + prop = &map[i]; + break; + } + + if (prop) { + r = sd_bus_message_peek_type(m, NULL, &contents); + if (r < 0) + return bus_log_parse_error_debug(r); + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_VARIANT, contents); + if (r < 0) + return bus_log_parse_error_debug(r); + + v = (uint8_t *)userdata + prop->offset; + if (map[i].set) + r = prop->set(sd_bus_message_get_bus(m), member, m, error, v); + else + r = map_basic(sd_bus_message_get_bus(m), member, m, flags, error, v); + if (r < 0) + return bus_log_parse_error_debug(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error_debug(r); + } else { + r = sd_bus_message_skip(m, "v"); + if (r < 0) + return bus_log_parse_error_debug(r); + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error_debug(r); + } + if (r < 0) + return bus_log_parse_error_debug(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error_debug(r); + + return r; +} + +int bus_map_all_properties( + sd_bus *bus, + const char *destination, + const char *path, + const struct bus_properties_map *map, + unsigned flags, + sd_bus_error *error, + sd_bus_message **reply, + void *userdata) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + assert(destination); + assert(path); + assert(map); + assert(reply || (flags & BUS_MAP_STRDUP)); + + r = sd_bus_call_method( + bus, + destination, + path, + "org.freedesktop.DBus.Properties", + "GetAll", + error, + &m, + "s", ""); + if (r < 0) + return r; + + r = bus_message_map_all_properties(m, map, flags, error, userdata); + if (r < 0) + return r; + + if (reply) + *reply = sd_bus_message_ref(m); + + return r; +} diff --git a/src/shared/bus-map-properties.h b/src/shared/bus-map-properties.h new file mode 100644 index 0000000..e9f4a92 --- /dev/null +++ b/src/shared/bus-map-properties.h @@ -0,0 +1,25 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +typedef int (*bus_property_set_t) (sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata); + +struct bus_properties_map { + const char *member; + const char *signature; + bus_property_set_t set; + size_t offset; +}; + +enum { + BUS_MAP_STRDUP = 1 << 0, /* If set, each "s" message is duplicated. Thus, each pointer needs to be freed. */ + BUS_MAP_BOOLEAN_AS_BOOL = 1 << 1, /* If set, each "b" message is written to a bool pointer. If not set, "b" is written to an int pointer. */ +}; + +int bus_map_id128(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata); +int bus_map_strv_sort(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata); + +int bus_message_map_all_properties(sd_bus_message *m, const struct bus_properties_map *map, unsigned flags, sd_bus_error *error, void *userdata); +int bus_map_all_properties(sd_bus *bus, const char *destination, const char *path, const struct bus_properties_map *map, + unsigned flags, sd_bus_error *error, sd_bus_message **reply, void *userdata); diff --git a/src/shared/bus-message-util.c b/src/shared/bus-message-util.c new file mode 100644 index 0000000..53f6350 --- /dev/null +++ b/src/shared/bus-message-util.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-message-util.h" + +#include "resolve-util.h" + +int bus_message_read_ifindex(sd_bus_message *message, sd_bus_error *error, int *ret) { + int ifindex, r; + + assert(message); + assert(ret); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(message, "i", &ifindex); + if (r < 0) + return r; + + if (ifindex <= 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface index"); + + *ret = ifindex; + + return 0; +} + +int bus_message_read_family(sd_bus_message *message, sd_bus_error *error, int *ret) { + int family, r; 
+ + assert(message); + assert(ret); + + assert_cc(sizeof(int) == sizeof(int32_t)); + + r = sd_bus_message_read(message, "i", &family); + if (r < 0) + return r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown address family %i", family); + + *ret = family; + return 0; +} + +int bus_message_read_in_addr_auto(sd_bus_message *message, sd_bus_error *error, int *ret_family, union in_addr_union *ret_addr) { + int family, r; + const void *d; + size_t sz; + + assert(message); + + r = sd_bus_message_read(message, "i", &family); + if (r < 0) + return r; + + r = sd_bus_message_read_array(message, 'y', &d, &sz); + if (r < 0) + return r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown address family %i", family); + + if (sz != FAMILY_ADDRESS_SIZE(family)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address size"); + + if (ret_family) + *ret_family = family; + if (ret_addr) + memcpy(ret_addr, d, sz); + return 0; +} + +static int bus_message_read_dns_one( + sd_bus_message *message, + sd_bus_error *error, + bool extended, + int *ret_family, + union in_addr_union *ret_address, + uint16_t *ret_port, + const char **ret_server_name) { + const char *server_name = NULL; + union in_addr_union a; + uint16_t port = 0; + int family, r; + + assert(message); + assert(ret_family); + assert(ret_address); + assert(ret_port); + assert(ret_server_name); + + r = sd_bus_message_enter_container(message, 'r', extended ? 
"iayqs" : "iay"); + if (r <= 0) + return r; + + r = bus_message_read_in_addr_auto(message, error, &family, &a); + if (r < 0) + return r; + + if (!dns_server_address_valid(family, &a)) { + r = sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid DNS server address"); + assert(r < 0); + return r; + } + + if (extended) { + r = sd_bus_message_read(message, "q", &port); + if (r < 0) + return r; + + if (IN_SET(port, 53, 853)) + port = 0; + + r = sd_bus_message_read(message, "s", &server_name); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + *ret_family = family; + *ret_address = a; + *ret_port = port; + *ret_server_name = server_name; + + return 1; +} + +int bus_message_read_dns_servers( + sd_bus_message *message, + sd_bus_error *error, + bool extended, + struct in_addr_full ***ret_dns, + size_t *ret_n_dns) { + + struct in_addr_full **dns = NULL; + size_t n = 0; + int r; + + assert(message); + assert(ret_dns); + assert(ret_n_dns); + + r = sd_bus_message_enter_container(message, 'a', extended ? 
"(iayqs)" : "(iay)"); + if (r < 0) + return r; + + for (;;) { + const char *server_name; + union in_addr_union a; + uint16_t port; + int family; + + r = bus_message_read_dns_one(message, error, extended, &family, &a, &port, &server_name); + if (r < 0) + goto clear; + if (r == 0) + break; + + if (!GREEDY_REALLOC(dns, n+1)) { + r = -ENOMEM; + goto clear; + } + + r = in_addr_full_new(family, &a, port, 0, server_name, dns + n); + if (r < 0) + goto clear; + + n++; + } + + *ret_dns = TAKE_PTR(dns); + *ret_n_dns = n; + return 0; + +clear: + for (size_t i = 0; i < n; i++) + in_addr_full_free(dns[i]); + free(dns); + + return r; +} diff --git a/src/shared/bus-message-util.h b/src/shared/bus-message-util.h new file mode 100644 index 0000000..b82c083 --- /dev/null +++ b/src/shared/bus-message-util.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "in-addr-util.h" +#include "socket-netlink.h" + +int bus_message_read_ifindex(sd_bus_message *message, sd_bus_error *error, int *ret); +int bus_message_read_family(sd_bus_message *message, sd_bus_error *error, int *ret); +int bus_message_read_in_addr_auto(sd_bus_message *message, sd_bus_error *error, int *ret_family, union in_addr_union *ret_addr); + +int bus_message_read_dns_servers( + sd_bus_message *message, + sd_bus_error *error, + bool extended, + struct in_addr_full ***ret_dns, + size_t *ret_n_dns); diff --git a/src/shared/bus-object.c b/src/shared/bus-object.c new file mode 100644 index 0000000..4ed5215 --- /dev/null +++ b/src/shared/bus-object.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-introspect.h" +#include "bus-object.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" + +int bus_add_implementation(sd_bus *bus, const BusObjectImplementation *impl, void *userdata) { + int r; + + log_debug("Registering bus object implementation for path=%s iface=%s", impl->path, impl->interface); + + for (const 
sd_bus_vtable **p = impl->vtables; p && *p; p++) { + r = sd_bus_add_object_vtable(bus, NULL, + impl->path, + impl->interface, + *p, + userdata); + if (r < 0) + return log_error_errno(r, "Failed to register bus path %s with interface %s: %m", + impl->path, + impl->interface); + } + + for (const BusObjectVtablePair *p = impl->fallback_vtables; p && p->vtable; p++) { + r = sd_bus_add_fallback_vtable(bus, NULL, + impl->path, + impl->interface, + p->vtable, + p->object_find, + userdata); + if (r < 0) + return log_error_errno(r, "Failed to register bus path %s with interface %s: %m", + impl->path, + impl->interface); + } + + if (impl->node_enumerator) { + r = sd_bus_add_node_enumerator(bus, NULL, + impl->path, + impl->node_enumerator, + userdata); + if (r < 0) + return log_error_errno(r, "Failed to add node enumerator for %s: %m", + impl->path); + } + + if (impl->manager) { + r = sd_bus_add_object_manager(bus, NULL, impl->path); + if (r < 0) + return log_error_errno(r, "Failed to add object manager for %s: %m", impl->path); + } + + for (size_t i = 0; impl->children && impl->children[i]; i++) { + r = bus_add_implementation(bus, impl->children[i], userdata); + if (r < 0) + return r; + } + + return 0; +} + +static const BusObjectImplementation* find_implementation( + const char *pattern, + const BusObjectImplementation* const* bus_objects) { + + for (size_t i = 0; bus_objects && bus_objects[i]; i++) { + const BusObjectImplementation *impl = bus_objects[i]; + + if (STR_IN_SET(pattern, impl->path, impl->interface)) + return impl; + + impl = find_implementation(pattern, impl->children); + if (impl) + return impl; + } + + return NULL; +} + +static int bus_introspect_implementation( + struct introspect *intro, + const BusObjectImplementation *impl) { + int r; + + for (const sd_bus_vtable **p = impl->vtables; p && *p; p++) { + r = introspect_write_interface(intro, impl->interface, *p); + if (r < 0) + return log_error_errno(r, "Failed to write introspection data: %m"); + } + + for 
(const BusObjectVtablePair *p = impl->fallback_vtables; p && p->vtable; p++) { + r = introspect_write_interface(intro, impl->interface, p->vtable); + if (r < 0) + return log_error_errno(r, "Failed to write introspection data: %m"); + } + + return 0; +} + +static void list_paths( + FILE *out, + const BusObjectImplementation* const* bus_objects) { + + for (size_t i = 0; bus_objects && bus_objects[i]; i++) { /* tolerate a NULL list, like find_implementation() does */ + fprintf(out, "%s\t%s\n", bus_objects[i]->path, bus_objects[i]->interface); /* one "path<TAB>interface" line per implementation */ + if (bus_objects[i]->children) + list_paths(out, bus_objects[i]->children); /* children are listed right after their parent */ + } +} + +int bus_introspect_implementations( + FILE *out, + const char *pattern, + const BusObjectImplementation* const* bus_objects) { + + const BusObjectImplementation *impl, *main_impl = NULL; + _cleanup_free_ char *s = NULL; + int r; + + if (streq(pattern, "list")) { + list_paths(out, bus_objects); + return 0; + } + + struct introspect intro = {}; + bool is_interface = sd_bus_interface_name_is_valid(pattern); + + impl = find_implementation(pattern, bus_objects); + if (!impl) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "%s %s not found", + is_interface ? "Interface" : "Object path", + pattern); + + /* We use trusted=false here to get all the @org.freedesktop.systemd1.Privileged annotations. */ + r = introspect_begin(&intro, false); + if (r < 0) + return log_error_errno(r, "Failed to write introspection data: %m"); + + r = introspect_write_default_interfaces(&intro, impl->manager); + if (r < 0) + return log_error_errno(r, "Failed to write introspection data: %m"); + + /* Check if there is a non-fallback path that applies to the given interface, also + * print it. This is useful in the case of units: o.fd.systemd1.Service is declared + * as a fallback vtable for o/fd/systemd1/unit, and we also want to print + * o.fd.systemd1.Unit, which is the non-fallback implementation. 
*/ + if (impl->fallback_vtables && is_interface) + main_impl = find_implementation(impl->path, bus_objects); + + if (main_impl) + bus_introspect_implementation(&intro, main_impl); /* NOTE(review): result ignored; the callee logs failures but they are not propagated, confirm this is intended */ + + if (impl != main_impl) + bus_introspect_implementation(&intro, impl); + + _cleanup_ordered_set_free_ OrderedSet *nodes = NULL; + + for (size_t i = 0; impl->children && impl->children[i]; i++) { + r = ordered_set_put_strdup(&nodes, impl->children[i]->path); + if (r < 0) + return log_oom(); + } + + r = introspect_write_child_nodes(&intro, nodes, impl->path); + if (r < 0) + return log_error_errno(r, "Failed to write introspection data: %m"); /* report it like every other failing step here instead of returning silently */ + + r = introspect_finish(&intro, &s); + if (r < 0) + return log_error_errno(r, "Failed to write introspection data: %m"); + + fputs(s, out); + return 0; +} diff --git a/src/shared/bus-object.h b/src/shared/bus-object.h new file mode 100644 index 0000000..145bbd2 --- /dev/null +++ b/src/shared/bus-object.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <stdio.h> + +#include "sd-bus.h" + +typedef struct BusObjectImplementation BusObjectImplementation; + +typedef struct BusObjectVtablePair { + const sd_bus_vtable *vtable; + sd_bus_object_find_t object_find; +} BusObjectVtablePair; + +struct BusObjectImplementation { + const char *path; + const char *interface; + const sd_bus_vtable **vtables; + const BusObjectVtablePair *fallback_vtables; + sd_bus_node_enumerator_t node_enumerator; + bool manager; + const BusObjectImplementation **children; +}; + +#define BUS_VTABLES(...) ((const sd_bus_vtable* []){ __VA_ARGS__, NULL }) +#define BUS_FALLBACK_VTABLES(...) ((const BusObjectVtablePair[]) { __VA_ARGS__, {} }) +#define BUS_IMPLEMENTATIONS(...) 
((const BusObjectImplementation* []) { __VA_ARGS__, NULL }) + +int bus_add_implementation(sd_bus *bus, const BusObjectImplementation *impl, void *userdata); +int bus_introspect_implementations( + FILE *out, + const char *pattern, + const BusObjectImplementation* const* bus_objects); diff --git a/src/shared/bus-polkit.c b/src/shared/bus-polkit.c new file mode 100644 index 0000000..904b897 --- /dev/null +++ b/src/shared/bus-polkit.c @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-internal.h" +#include "bus-message.h" +#include "bus-polkit.h" +#include "bus-util.h" +#include "strv.h" +#include "user-util.h" + +static int check_good_user(sd_bus_message *m, uid_t good_user) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + uid_t sender_uid; + int r; + + assert(m); + + if (good_user == UID_INVALID) + return 0; + + r = sd_bus_query_sender_creds(m, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + /* Don't trust augmented credentials for authorization */ + assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_EUID) == 0, -EPERM); + + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + return r; + + return sender_uid == good_user; +} + +#if ENABLE_POLKIT +static int bus_message_append_strv_key_value(sd_bus_message *m, const char **l) { + int r; + + assert(m); + + r = sd_bus_message_open_container(m, 'a', "{ss}"); + if (r < 0) + return r; + + STRV_FOREACH_PAIR(k, v, l) { + r = sd_bus_message_append(m, "{ss}", *k, *v); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return r; + + return r; +} + +static int bus_message_new_polkit_auth_call( + sd_bus_message *m, + const char *action, + const char **details, + bool interactive, + sd_bus_message **ret) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *c = NULL; + const char *sender; + int r; + + assert(m); + assert(action); + assert(ret); + + sender = sd_bus_message_get_sender(m); + if (!sender) + return 
-EBADMSG; + + r = sd_bus_message_new_method_call( + ASSERT_PTR(m->bus), + &c, + "org.freedesktop.PolicyKit1", + "/org/freedesktop/PolicyKit1/Authority", + "org.freedesktop.PolicyKit1.Authority", + "CheckAuthorization"); + if (r < 0) + return r; + + r = sd_bus_message_append(c, "(sa{sv})s", "system-bus-name", 1, "name", "s", sender, action); + if (r < 0) + return r; + + r = bus_message_append_strv_key_value(c, details); + if (r < 0) + return r; + + r = sd_bus_message_append(c, "us", interactive, NULL); + if (r < 0) + return r; + + *ret = TAKE_PTR(c); + return 0; +} +#endif + +int bus_test_polkit( + sd_bus_message *call, + int capability, + const char *action, + const char **details, + uid_t good_user, + bool *_challenge, + sd_bus_error *ret_error) { + + int r; + + assert(call); + assert(action); + + /* Tests non-interactively! */ + + r = check_good_user(call, good_user); + if (r != 0) + return r; + + r = sd_bus_query_sender_privilege(call, capability); + if (r < 0) + return r; + if (r > 0) + return 1; + +#if ENABLE_POLKIT + _cleanup_(sd_bus_message_unrefp) sd_bus_message *request = NULL, *reply = NULL; + int authorized = false, challenge = false; + + r = bus_message_new_polkit_auth_call(call, action, details, /* interactive = */ false, &request); + if (r < 0) + return r; + + r = sd_bus_call(call->bus, request, 0, ret_error, &reply); + if (r < 0) { + /* Treat no PK available as access denied */ + if (bus_error_is_unknown_service(ret_error)) { + sd_bus_error_free(ret_error); + return -EACCES; + } + + return r; + } + + r = sd_bus_message_enter_container(reply, 'r', "bba{ss}"); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "bb", &authorized, &challenge); + if (r < 0) + return r; + + if (authorized) + return 1; + + if (_challenge) { + *_challenge = challenge; + return 0; + } +#endif + + return -EACCES; +} + +#if ENABLE_POLKIT + +typedef struct AsyncPolkitQueryAction { + char *action; + char **details; + + LIST_FIELDS(struct AsyncPolkitQueryAction, 
authorized); +} AsyncPolkitQueryAction; + +static AsyncPolkitQueryAction *async_polkit_query_action_free(AsyncPolkitQueryAction *a) { + if (!a) + return NULL; + + free(a->action); + strv_free(a->details); + + return mfree(a); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(AsyncPolkitQueryAction*, async_polkit_query_action_free); + +typedef struct AsyncPolkitQuery { + unsigned n_ref; + + AsyncPolkitQueryAction *action; + + sd_bus_message *request; + sd_bus_slot *slot; + + Hashmap *registry; + sd_event_source *defer_event_source; + + LIST_HEAD(AsyncPolkitQueryAction, authorized_actions); + AsyncPolkitQueryAction *denied_action; + AsyncPolkitQueryAction *error_action; + sd_bus_error error; +} AsyncPolkitQuery; + +static AsyncPolkitQuery *async_polkit_query_free(AsyncPolkitQuery *q) { + if (!q) + return NULL; + + sd_bus_slot_unref(q->slot); + + if (q->registry && q->request) + hashmap_remove(q->registry, q->request); + + sd_bus_message_unref(q->request); + + async_polkit_query_action_free(q->action); + + sd_event_source_disable_unref(q->defer_event_source); + + LIST_CLEAR(authorized, q->authorized_actions, async_polkit_query_action_free); + + async_polkit_query_action_free(q->denied_action); + async_polkit_query_action_free(q->error_action); + + sd_bus_error_free(&q->error); + + return mfree(q); +} + +DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(AsyncPolkitQuery, async_polkit_query, async_polkit_query_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(AsyncPolkitQuery*, async_polkit_query_unref); + +static int async_polkit_defer(sd_event_source *s, void *userdata) { + AsyncPolkitQuery *q = ASSERT_PTR(userdata); + + assert(s); + + /* This is called as idle event source after we processed the async polkit reply, hopefully after the + * method call we re-enqueued has been properly processed. 
*/ + + async_polkit_query_unref(q); + return 0; +} + +static int async_polkit_read_reply(sd_bus_message *reply, AsyncPolkitQuery *q) { + _cleanup_(async_polkit_query_action_freep) AsyncPolkitQueryAction *a = NULL; + int authorized, challenge, r; + + assert(reply); + assert(q); + + /* Processing of a PolicyKit checks is canceled on the first auth. error. */ + assert(!q->denied_action); + assert(!q->error_action); + assert(!sd_bus_error_is_set(&q->error)); + + assert(q->action); + a = TAKE_PTR(q->action); + + if (sd_bus_message_is_method_error(reply, NULL)) { + const sd_bus_error *e; + + e = sd_bus_message_get_error(reply); + + if (bus_error_is_unknown_service(e)) + /* Treat no PK available as access denied */ + q->denied_action = TAKE_PTR(a); + else { + /* Save error from polkit reply, so it can be returned when the same authorization + * is attempted for second time */ + q->error_action = TAKE_PTR(a); + r = sd_bus_error_copy(&q->error, e); + if (r == -ENOMEM) + return r; + } + + return 0; + } + + r = sd_bus_message_enter_container(reply, 'r', "bba{ss}"); + if (r >= 0) + r = sd_bus_message_read(reply, "bb", &authorized, &challenge); + if (r < 0) + return r; + + if (authorized) + LIST_PREPEND(authorized, q->authorized_actions, TAKE_PTR(a)); + else if (challenge) { + q->error_action = TAKE_PTR(a); + sd_bus_error_set_const(&q->error, SD_BUS_ERROR_INTERACTIVE_AUTHORIZATION_REQUIRED, "Interactive authentication required."); + } else + q->denied_action = TAKE_PTR(a); + + return 0; +} + +static int async_polkit_process_reply(sd_bus_message *reply, AsyncPolkitQuery *q) { + int r; + + assert(reply); + assert(q); + + assert(q->slot); + q->slot = sd_bus_slot_unref(q->slot); + + r = async_polkit_read_reply(reply, q); + if (r < 0) + return r; + + /* Now, let's dispatch the original message a second time be re-enqueing. This will then traverse the + * whole message processing again, and thus re-validating and re-retrieving the "userdata" field + * again. 
+ * + * We install an idle event loop event to clean-up the PolicyKit request data when we are idle again, + * i.e. after the second time the message is processed is complete. */ + + if (!q->defer_event_source) { + r = sd_event_add_defer( + sd_bus_get_event(sd_bus_message_get_bus(reply)), + &q->defer_event_source, + async_polkit_defer, + q); + if (r < 0) + return r; + + r = sd_event_source_set_priority(q->defer_event_source, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return r; + } + + r = sd_event_source_set_enabled(q->defer_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return r; + + r = sd_bus_message_rewind(q->request, true); + if (r < 0) + return r; + + r = sd_bus_enqueue_for_read(sd_bus_message_get_bus(q->request), q->request); + if (r < 0) + return r; + + return 1; +} + +static int async_polkit_callback(sd_bus_message *reply, void *userdata, sd_bus_error *error) { + AsyncPolkitQuery *q = ASSERT_PTR(userdata); + int r; + + assert(reply); + + r = async_polkit_process_reply(reply, q); + if (r < 0) { + log_debug_errno(r, "Processing asynchronous PolicyKit reply failed, ignoring: %m"); + (void) sd_bus_reply_method_errno(q->request, r, NULL); + async_polkit_query_unref(q); + } + return r; +} + +static int async_polkit_query_check_action( + AsyncPolkitQuery *q, + const char *action, + const char **details, + sd_bus_error *ret_error) { + + assert(q); + assert(action); + assert(ret_error); + + LIST_FOREACH(authorized, a, q->authorized_actions) + if (streq(a->action, action) && strv_equal(a->details, (char**) details)) + return 1; + + if (q->error_action && streq(q->error_action->action, action)) + return sd_bus_error_copy(ret_error, &q->error); + + if (q->denied_action && streq(q->denied_action->action, action)) + return -EACCES; + + return 0; +} + +#endif + +/* bus_verify_polkit_async() handles verification of D-Bus calls with polkit. Because the polkit API + * is asynchronous, the whole thing is a bit complex and requires some support in the code that uses + * it. 
It relies on sd-bus's support for interrupting the processing of a message. + * + * Requirements: + * + * * bus_verify_polkit_async() must be called before any changes to internal state. + * * If bus_verify_polkit_async() has made a new polkit query (signaled by return value 0), + * processing of the message should be interrupted. This is done by returning 1--which sd-bus + * handles specially--and is usually accompanied by a comment. (The message will be queued for + * processing again later when a reply from polkit is received.) + * * The code needs to keep a hashmap, here called registry, in which bus_verify_polkit_async() + * stores active queries. This hashmap's lifetime must be longer than the method handler's; + * e.g., it can be a member of some "manager" object or a global variable. + * + * Return value: + * + * * 0 - a new polkit call has been made, which means the processing of the message should be + * interrupted; + * * 1 - the action has been allowed; + * * -EACCES - the action has been denied; + * * < 0 - an unspecified error. + * + * A step-by-step description of how it works: + * + * 1. A D-Bus method handler calls bus_verify_polkit_async(), passing it the D-Bus message being + * processed and the polkit action to verify. + * 2. bus_verify_polkit_async() checks the registry for an existing query object associated with the + * message. Let's assume this is the first call, so it finds nothing. + * 3. A new AsyncPolkitQuery object is created and an async. D-Bus call to polkit is made. The + * function then returns 0. The method handler returns 1 to tell sd-bus that the processing of + * the message has been interrupted. + * 4. (Later) A reply from polkit is received and async_polkit_callback() is called. + * 5. async_polkit_callback() reads the reply and stores its result in the passed query. + * 6. async_polkit_callback() enqueues the original message again. + * 7. (Later) The same D-Bus method handler is called for the same message. 
It calls + * bus_verify_polkit_async() again. + * 8. bus_verify_polkit_async() checks the registry for an existing query object associated with the + * message. It finds one and returns the result for the action. + * 9. The method handler continues processing of the message. If there's another action that needs + * to be verified: + * 10. bus_verify_polkit_async() is called again for the new action. The registry already contains a + * query for the message, but the new action hasn't been seen yet, hence a new async. call to + * polkit is made and steps 4-8 are repeated. + * 11. (In the method handler again.) bus_verify_polkit_async() returns query results for both + * actions and the processing continues as in step 9. + * + * Memory handling: + * + * async_polkit_callback() registers a deferred call of async_polkit_defer() for the query, which + * causes the query to be removed from the registry and freed. The deferred event source is given idle + * priority, so this will happen after processing of the D-Bus message, when the query is no longer + * needed. + * + * Schematically: + * + * (m - D-Bus message, a - polkit action, q - polkit query) + * + * -> foo_method(m) + * -> bus_verify_polkit_async(m, a) + * -> async_polkit_query_ref(q) + * -> sd_bus_call_async(q) + * <- bus_verify_polkit_async(m, a) = 0 + * <- foo_method(m) = 1 + * ... + * -> async_polkit_callback(q) + * -> sd_event_add_defer(async_polkit_defer, q) + * -> sd_bus_enqueue_for_read(m) + * <- async_polkit_callback(q) + * ... + * -> foo_method(m) + * -> bus_verify_polkit_async(m, a) + * <- bus_verify_polkit_async(m, a) = 1/-EACCES/error + * ... + * // possibly another call to bus_verify_polkit_async with action a2 + * <- foo_method(m) + * ... 
+ * -> async_polkit_defer(q) + * -> async_polkit_query_unref(q) + * <- async_polkit_defer(q) + */ + +int bus_verify_polkit_async( + sd_bus_message *call, + int capability, + const char *action, + const char **details, + bool interactive, + uid_t good_user, + Hashmap **registry, + sd_bus_error *ret_error) { + + int r; + + assert(call); + assert(action); + assert(registry); + assert(ret_error); + + r = check_good_user(call, good_user); + if (r != 0) + return r; + +#if ENABLE_POLKIT + _cleanup_(async_polkit_query_unrefp) AsyncPolkitQuery *q = NULL; + + q = async_polkit_query_ref(hashmap_get(*registry, call)); + /* This is a repeated invocation of this function, hence let's check if we've already got + * a response from polkit for this action */ + if (q) { + r = async_polkit_query_check_action(q, action, details, ret_error); + if (r != 0) + return r; + } +#endif + + r = sd_bus_query_sender_privilege(call, capability); + if (r < 0) + return r; + if (r > 0) + return 1; + +#if ENABLE_POLKIT + _cleanup_(sd_bus_message_unrefp) sd_bus_message *pk = NULL; + + int c = sd_bus_message_get_allow_interactive_authorization(call); + if (c < 0) + return c; + if (c > 0) + interactive = true; + + r = hashmap_ensure_allocated(registry, NULL); + if (r < 0) + return r; + + r = bus_message_new_polkit_auth_call(call, action, details, interactive, &pk); + if (r < 0) + return r; + + if (!q) { + q = new(AsyncPolkitQuery, 1); + if (!q) + return -ENOMEM; + + *q = (AsyncPolkitQuery) { + .n_ref = 1, + .request = sd_bus_message_ref(call), + }; + } + + assert(!q->action); + q->action = new(AsyncPolkitQueryAction, 1); + if (!q->action) + return -ENOMEM; + + *q->action = (AsyncPolkitQueryAction) { + .action = strdup(action), + .details = strv_copy((char**) details), + }; + if (!q->action->action || !q->action->details) + return -ENOMEM; + + if (!q->registry) { + r = hashmap_put(*registry, call, q); + if (r < 0) + return r; + + q->registry = *registry; + } + + r = sd_bus_call_async(call->bus, &q->slot, 
pk, async_polkit_callback, q, 0); + if (r < 0) + return r; + + TAKE_PTR(q); + + return 0; +#endif + + return -EACCES; +} + +Hashmap *bus_verify_polkit_async_registry_free(Hashmap *registry) { +#if ENABLE_POLKIT + return hashmap_free_with_destructor(registry, async_polkit_query_unref); +#else + assert(hashmap_isempty(registry)); + return hashmap_free(registry); +#endif +} diff --git a/src/shared/bus-polkit.h b/src/shared/bus-polkit.h new file mode 100644 index 0000000..e2a3b7e --- /dev/null +++ b/src/shared/bus-polkit.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "hashmap.h" + +int bus_test_polkit(sd_bus_message *call, int capability, const char *action, const char **details, uid_t good_user, bool *_challenge, sd_bus_error *e); + +int bus_verify_polkit_async(sd_bus_message *call, int capability, const char *action, const char **details, bool interactive, uid_t good_user, Hashmap **registry, sd_bus_error *error); +Hashmap *bus_verify_polkit_async_registry_free(Hashmap *registry); diff --git a/src/shared/bus-print-properties.c b/src/shared/bus-print-properties.c new file mode 100644 index 0000000..6704e1e --- /dev/null +++ b/src/shared/bus-print-properties.c @@ -0,0 +1,440 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-print-properties.h" +#include "cap-list.h" +#include "cgroup-util.h" +#include "escape.h" +#include "mountpoint-util.h" +#include "nsflags.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "user-util.h" + +int bus_print_property_value(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *value) { + assert(name); + + if (expected_value && !streq_ptr(expected_value, value)) + return 0; + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) && isempty(value)) + return 0; + + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + 
puts(strempty(value)); + else + printf("%s=%s\n", name, strempty(value)); + + return 0; +} + +int bus_print_property_valuef(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *fmt, ...) { + _cleanup_free_ char *s = NULL; + va_list ap; + int r; + + assert(name); + assert(fmt); + + va_start(ap, fmt); + r = vasprintf(&s, fmt, ap); + va_end(ap); + if (r < 0) + return -ENOMEM; + + return bus_print_property_value(name, expected_value, flags, s); +} + +static int bus_print_property(const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags) { + char type; + const char *contents; + int r; + + assert(name); + assert(m); + + r = sd_bus_message_peek_type(m, &type, &contents); + if (r < 0) + return r; + + switch (type) { + + case SD_BUS_TYPE_STRING: { + const char *s; + + r = sd_bus_message_read_basic(m, type, &s); + if (r < 0) + return r; + + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || !isempty(s)) { + bool good; + + /* This property has a single value, so we need to take + * care not to print a new line, everything else is OK. */ + good = !strchr(s, '\n'); + bus_print_property_value(name, expected_value, flags, good ? s : "[unprintable]"); + } + + return 1; + } + + case SD_BUS_TYPE_BOOLEAN: { + int b; + + r = sd_bus_message_read_basic(m, type, &b); + if (r < 0) + return r; + + if (expected_value && parse_boolean(expected_value) != b) + return 1; + + bus_print_property_value(name, NULL, flags, yes_no(b)); + return 1; + } + + case SD_BUS_TYPE_UINT64: { + uint64_t u; + + r = sd_bus_message_read_basic(m, type, &u); + if (r < 0) + return r; + + /* Yes, heuristics! 
But we can change this check + * should it turn out to not be sufficient */ + + if (endswith(name, "Timestamp") || + STR_IN_SET(name, "NextElapseUSecRealtime", "LastTriggerUSec", "TimeUSec", "RTCTimeUSec")) + + bus_print_property_value(name, expected_value, flags, FORMAT_TIMESTAMP(u)); + + else if (strstr(name, "USec")) + bus_print_property_value(name, expected_value, flags, FORMAT_TIMESPAN(u, 0)); + + else if (streq(name, "CoredumpFilter")) + bus_print_property_valuef(name, expected_value, flags, "0x%"PRIx64, u); + + else if (streq(name, "RestrictNamespaces")) { + _cleanup_free_ char *s = NULL; + const char *result; + + if ((u & NAMESPACE_FLAGS_ALL) == 0) + result = "yes"; + else if (FLAGS_SET(u, NAMESPACE_FLAGS_ALL)) + result = "no"; + else { + r = namespace_flags_to_string(u, &s); + if (r < 0) + return r; + + result = s; + } + + bus_print_property_value(name, expected_value, flags, result); + + } else if (streq(name, "MountFlags")) { + const char *result; + + result = mount_propagation_flag_to_string(u); + if (!result) + return -EINVAL; + + bus_print_property_value(name, expected_value, flags, result); + + } else if (STR_IN_SET(name, "CapabilityBoundingSet", "AmbientCapabilities")) { + _cleanup_free_ char *s = NULL; + + r = capability_set_to_string(u, &s); + if (r < 0) + return r; + + bus_print_property_value(name, expected_value, flags, s); + + } else if (STR_IN_SET(name, "CPUWeight", "StartupCPUWeight") && u == CGROUP_WEIGHT_IDLE) + bus_print_property_value(name, expected_value, flags, "idle"); + + else if ((STR_IN_SET(name, "CPUWeight", "StartupCPUWeight", "IOWeight", "StartupIOWeight") && u == CGROUP_WEIGHT_INVALID) || + (STR_IN_SET(name, "CPUShares", "StartupCPUShares") && u == CGROUP_CPU_SHARES_INVALID) || + (STR_IN_SET(name, "BlockIOWeight", "StartupBlockIOWeight") && u == CGROUP_BLKIO_WEIGHT_INVALID) || + (STR_IN_SET(name, "MemoryCurrent", "MemoryAvailable", "TasksCurrent") && u == UINT64_MAX) || + (startswith(name, "Memory") && ENDSWITH_SET(name, 
"Current", "Peak") && u == CGROUP_LIMIT_MAX) || + (startswith(name, "IO") && ENDSWITH_SET(name, "Bytes", "Operations") && u == UINT64_MAX) || + (endswith(name, "NSec") && u == UINT64_MAX)) + + bus_print_property_value(name, expected_value, flags, "[not set]"); + + else if ((ENDSWITH_SET(name, "MemoryLow", "MemoryMin", "MemoryHigh", "MemoryMax", "MemorySwapMax", "MemoryZSwapMax", "MemoryLimit") && + u == CGROUP_LIMIT_MAX) || + (STR_IN_SET(name, "TasksMax", "DefaultTasksMax") && u == UINT64_MAX) || + (startswith(name, "Limit") && u == UINT64_MAX) || + (startswith(name, "DefaultLimit") && u == UINT64_MAX)) + + bus_print_property_value(name, expected_value, flags, "infinity"); + else if (STR_IN_SET(name, "IPIngressBytes", "IPIngressPackets", "IPEgressBytes", "IPEgressPackets") && u == UINT64_MAX) + bus_print_property_value(name, expected_value, flags, "[no data]"); + else + bus_print_property_valuef(name, expected_value, flags, "%"PRIu64, u); + + return 1; + } + + case SD_BUS_TYPE_INT64: { + int64_t i; + + r = sd_bus_message_read_basic(m, type, &i); + if (r < 0) + return r; + + bus_print_property_valuef(name, expected_value, flags, "%"PRIi64, i); + return 1; + } + + case SD_BUS_TYPE_UINT32: { + uint32_t u; + + r = sd_bus_message_read_basic(m, type, &u); + if (r < 0) + return r; + + if (strstr(name, "UMask") || strstr(name, "Mode")) + bus_print_property_valuef(name, expected_value, flags, "%04o", u); + + else if (streq(name, "UID")) { + if (u == UID_INVALID) + bus_print_property_value(name, expected_value, flags, "[not set]"); + else + bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u); + } else if (streq(name, "GID")) { + if (u == GID_INVALID) + bus_print_property_value(name, expected_value, flags, "[not set]"); + else + bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u); + } else + bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u); + + return 1; + } + + case SD_BUS_TYPE_INT32: { + int32_t i; + + r = 
sd_bus_message_read_basic(m, type, &i); + if (r < 0) + return r; + + bus_print_property_valuef(name, expected_value, flags, "%"PRIi32, i); + return 1; + } + + case SD_BUS_TYPE_DOUBLE: { + double d; + + r = sd_bus_message_read_basic(m, type, &d); + if (r < 0) + return r; + + bus_print_property_valuef(name, expected_value, flags, "%g", d); + return 1; + } + + case SD_BUS_TYPE_ARRAY: + if (streq(contents, "s")) { + bool first = true; + const char *str; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, contents); + if (r < 0) + return r; + + while ((r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &str)) > 0) { + _cleanup_free_ char *e = NULL; + + e = shell_maybe_quote(str, 0); + if (!e) + return -ENOMEM; + + if (first) { + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + printf("%s=", name); + first = false; + } else + fputs(" ", stdout); + + fputs(e, stdout); + } + if (r < 0) + return r; + + if (first && FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) && !FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + printf("%s=", name); + if (!first || FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY)) + puts(""); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 1; + + } else if (streq(contents, "y")) { + const uint8_t *u; + size_t n; + + r = sd_bus_message_read_array(m, SD_BUS_TYPE_BYTE, (const void**) &u, &n); + if (r < 0) + return r; + + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || n > 0) { + unsigned i; + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + printf("%s=", name); + + for (i = 0; i < n; i++) + printf("%02x", u[i]); + + puts(""); + } + + return 1; + + } else if (streq(contents, "u")) { + uint32_t *u; + size_t n; + + r = sd_bus_message_read_array(m, SD_BUS_TYPE_UINT32, (const void**) &u, &n); + if (r < 0) + return r; + + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || n > 0) { + unsigned i; + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) + printf("%s=", name); + + for (i = 0; i < n; 
i++) + printf("%08x", u[i]); + + puts(""); + } + + return 1; + } + + break; + } + + return 0; +} + +int bus_message_print_all_properties( + sd_bus_message *m, + bus_message_print_t func, + char **filter, + BusPrintPropertyFlags flags, + Set **found_properties) { + + int r; + + assert(m); + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "{sv}"); + if (r < 0) + return r; + + while ((r = sd_bus_message_enter_container(m, SD_BUS_TYPE_DICT_ENTRY, "sv")) > 0) { + _cleanup_free_ char *name_with_equal = NULL; + const char *name, *contents, *expected_value = NULL; + + r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &name); + if (r < 0) + return r; + + if (found_properties) { + r = set_ensure_put(found_properties, &string_hash_ops, name); + if (r < 0) + return log_oom(); + } + + name_with_equal = strjoin(name, "="); + if (!name_with_equal) + return log_oom(); + + if (!filter || strv_contains(filter, name) || + (expected_value = strv_find_startswith(filter, name_with_equal))) { + r = sd_bus_message_peek_type(m, NULL, &contents); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_VARIANT, contents); + if (r < 0) + return r; + + if (func) + r = func(name, expected_value, m, flags); + if (!func || r == 0) + r = bus_print_property(name, expected_value, m, flags); + if (r < 0) + return r; + if (r == 0) { + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) && !expected_value) + printf("%s=[unprintable]\n", name); + /* skip what we didn't read */ + r = sd_bus_message_skip(m, contents); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + } else { + r = sd_bus_message_skip(m, "v"); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +int bus_print_all_properties( + sd_bus *bus, + const char *dest, + const char *path, + 
bus_message_print_t func, + char **filter, + BusPrintPropertyFlags flags, + Set **found_properties) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(path); + + r = sd_bus_call_method(bus, + dest, + path, + "org.freedesktop.DBus.Properties", + "GetAll", + &error, + &reply, + "s", ""); + if (r < 0) + return r; + + return bus_message_print_all_properties(reply, func, filter, flags, found_properties); +} diff --git a/src/shared/bus-print-properties.h b/src/shared/bus-print-properties.h new file mode 100644 index 0000000..a17875c --- /dev/null +++ b/src/shared/bus-print-properties.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +#include "macro.h" +#include "set.h" + +typedef enum BusPrintPropertyFlags { + BUS_PRINT_PROPERTY_ONLY_VALUE = 1 << 0, /* e.g. systemctl --value */ + BUS_PRINT_PROPERTY_SHOW_EMPTY = 1 << 1, /* e.g. systemctl --all */ +} BusPrintPropertyFlags; + +typedef int (*bus_message_print_t) (const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags); + +int bus_print_property_value(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *value); +int bus_print_property_valuef(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *fmt, ...) 
_printf_(4,5); +int bus_message_print_all_properties(sd_bus_message *m, bus_message_print_t func, char **filter, BusPrintPropertyFlags flags, Set **found_properties); +int bus_print_all_properties(sd_bus *bus, const char *dest, const char *path, bus_message_print_t func, char **filter, BusPrintPropertyFlags flags, Set **found_properties); diff --git a/src/shared/bus-unit-procs.c b/src/shared/bus-unit-procs.c new file mode 100644 index 0000000..8b462b5 --- /dev/null +++ b/src/shared/bus-unit-procs.c @@ -0,0 +1,402 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-locator.h" +#include "bus-unit-procs.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "list.h" +#include "macro.h" +#include "path-util.h" +#include "process-util.h" +#include "sort-util.h" +#include "string-util.h" +#include "terminal-util.h" + +struct CGroupInfo { + char *cgroup_path; + bool is_const; /* If false, cgroup_path should be free()'d */ + + Hashmap *pids; /* PID → process name */ + bool done; + + struct CGroupInfo *parent; + LIST_FIELDS(struct CGroupInfo, siblings); + LIST_HEAD(struct CGroupInfo, children); + size_t n_children; +}; + +static int add_cgroup(Hashmap *cgroups, const char *path, bool is_const, struct CGroupInfo **ret) { + struct CGroupInfo *parent = NULL, *cg; + int r; + + assert(cgroups); + assert(ret); + + path = empty_to_root(path); + + cg = hashmap_get(cgroups, path); + if (cg) { + *ret = cg; + return 0; + } + + if (!empty_or_root(path)) { + const char *e, *pp; + + e = strrchr(path, '/'); + if (!e) + return -EINVAL; + + pp = strndupa_safe(path, e - path); + + r = add_cgroup(cgroups, pp, false, &parent); + if (r < 0) + return r; + } + + cg = new0(struct CGroupInfo, 1); + if (!cg) + return -ENOMEM; + + if (is_const) + cg->cgroup_path = (char*) path; + else { + cg->cgroup_path = strdup(path); + if (!cg->cgroup_path) { + free(cg); + return -ENOMEM; + } + } + + cg->is_const = is_const; + cg->parent = parent; + + r = hashmap_put(cgroups, 
cg->cgroup_path, cg); + if (r < 0) { + if (!is_const) + free(cg->cgroup_path); + free(cg); + return r; + } + + if (parent) { + LIST_PREPEND(siblings, parent->children, cg); + parent->n_children++; + } + + *ret = cg; + return 1; +} + +static int add_process( + Hashmap *cgroups, + const char *path, + pid_t pid, + const char *name) { + + struct CGroupInfo *cg; + int r; + + assert(cgroups); + assert(name); + assert(pid > 0); + + r = add_cgroup(cgroups, path, true, &cg); + if (r < 0) + return r; + + return hashmap_ensure_put(&cg->pids, &trivial_hash_ops, PID_TO_PTR(pid), (void*) name); +} + +static void remove_cgroup(Hashmap *cgroups, struct CGroupInfo *cg) { + assert(cgroups); + assert(cg); + + while (cg->children) + remove_cgroup(cgroups, cg->children); + + hashmap_remove(cgroups, cg->cgroup_path); + + if (!cg->is_const) + free(cg->cgroup_path); + + hashmap_free(cg->pids); + + if (cg->parent) + LIST_REMOVE(siblings, cg->parent->children, cg); + + free(cg); +} + +static int cgroup_info_compare_func(struct CGroupInfo * const *a, struct CGroupInfo * const *b) { + return strcmp((*a)->cgroup_path, (*b)->cgroup_path); +} + +static int dump_processes( + Hashmap *cgroups, + const char *cgroup_path, + const char *prefix, + unsigned n_columns, + OutputFlags flags) { + + struct CGroupInfo *cg; + int r; + + assert(prefix); + + cgroup_path = empty_to_root(cgroup_path); + + cg = hashmap_get(cgroups, cgroup_path); + if (!cg) + return 0; + + if (!hashmap_isempty(cg->pids)) { + const char *name; + size_t n = 0, i; + pid_t *pids; + void *pidp; + int width; + + /* Order processes by their PID */ + pids = newa(pid_t, hashmap_size(cg->pids)); + + HASHMAP_FOREACH_KEY(name, pidp, cg->pids) + pids[n++] = PTR_TO_PID(pidp); + + assert(n == hashmap_size(cg->pids)); + typesafe_qsort(pids, n, pid_compare_func); + + width = DECIMAL_STR_WIDTH(pids[n-1]); + + for (i = 0; i < n; i++) { + _cleanup_free_ char *e = NULL; + const char *special; + bool more; + + name = hashmap_get(cg->pids, 
PID_TO_PTR(pids[i])); + assert(name); + + if (n_columns != 0) { + unsigned k; + + k = MAX(LESS_BY(n_columns, 2U + width + 1U), 20U); + + e = ellipsize(name, k, 100); + if (e) + name = e; + } + + more = i+1 < n || cg->children; + special = special_glyph(more ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT); + + fprintf(stdout, "%s%s%s%*"PID_PRI" %s%s\n", + prefix, + special, + ansi_grey(), + width, pids[i], + name, + ansi_normal()); + } + } + + if (cg->children) { + struct CGroupInfo **children; + size_t n = 0, i; + + /* Order subcgroups by their name */ + children = newa(struct CGroupInfo*, cg->n_children); + LIST_FOREACH(siblings, child, cg->children) + children[n++] = child; + assert(n == cg->n_children); + typesafe_qsort(children, n, cgroup_info_compare_func); + + if (n_columns != 0) + n_columns = MAX(LESS_BY(n_columns, 2U), 20U); + + for (i = 0; i < n; i++) { + _cleanup_free_ char *pp = NULL; + const char *name, *special; + bool more; + + name = strrchr(children[i]->cgroup_path, '/'); + if (!name) + return -EINVAL; + name++; + + more = i+1 < n; + special = special_glyph(more ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT); + + fputs(prefix, stdout); + fputs(special, stdout); + fputs(name, stdout); + fputc('\n', stdout); + + special = special_glyph(more ? SPECIAL_GLYPH_TREE_VERTICAL : SPECIAL_GLYPH_TREE_SPACE); + + pp = strjoin(prefix, special); + if (!pp) + return -ENOMEM; + + r = dump_processes(cgroups, children[i]->cgroup_path, pp, n_columns, flags); + if (r < 0) + return r; + } + } + + cg->done = true; + return 0; +} + +static int dump_extra_processes( + Hashmap *cgroups, + const char *prefix, + unsigned n_columns, + OutputFlags flags) { + + _cleanup_free_ pid_t *pids = NULL; + _cleanup_hashmap_free_ Hashmap *names = NULL; + struct CGroupInfo *cg; + size_t n = 0, k; + int width, r; + + /* Prints the extra processes, i.e. those that are in cgroups we haven't displayed yet. We show them as + * combined, sorted, linear list. 
*/ + + HASHMAP_FOREACH(cg, cgroups) { + const char *name; + void *pidp; + + if (cg->done) + continue; + + if (hashmap_isempty(cg->pids)) + continue; + + r = hashmap_ensure_allocated(&names, &trivial_hash_ops); + if (r < 0) + return r; + + if (!GREEDY_REALLOC(pids, n + hashmap_size(cg->pids))) + return -ENOMEM; + + HASHMAP_FOREACH_KEY(name, pidp, cg->pids) { + pids[n++] = PTR_TO_PID(pidp); + + r = hashmap_put(names, pidp, (void*) name); + if (r < 0) + return r; + } + } + + if (n == 0) + return 0; + + typesafe_qsort(pids, n, pid_compare_func); + width = DECIMAL_STR_WIDTH(pids[n-1]); + + for (k = 0; k < n; k++) { + _cleanup_free_ char *e = NULL; + const char *name; + + name = hashmap_get(names, PID_TO_PTR(pids[k])); + assert(name); + + if (n_columns != 0) { + unsigned z; + + z = MAX(LESS_BY(n_columns, 2U + width + 1U), 20U); + + e = ellipsize(name, z, 100); + if (e) + name = e; + } + + fprintf(stdout, "%s%s %*" PID_PRI " %s\n", + prefix, + special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET), + width, pids[k], + name); + } + + return 0; +} + +int unit_show_processes( + sd_bus *bus, + const char *unit, + const char *cgroup_path, + const char *prefix, + unsigned n_columns, + OutputFlags flags, + sd_bus_error *error) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + Hashmap *cgroups = NULL; + struct CGroupInfo *cg; + int r; + + assert(bus); + assert(unit); + + if (flags & OUTPUT_FULL_WIDTH) + n_columns = 0; + else if (n_columns <= 0) + n_columns = columns(); + + prefix = strempty(prefix); + + r = bus_call_method( + bus, + bus_systemd_mgr, + "GetUnitProcesses", + error, + &reply, + "s", + unit); + if (r < 0) + return r; + + cgroups = hashmap_new(&path_hash_ops); + if (!cgroups) + return -ENOMEM; + + r = sd_bus_message_enter_container(reply, 'a', "(sus)"); + if (r < 0) + goto finish; + + for (;;) { + const char *path = NULL, *name = NULL; + uint32_t pid; + + r = sd_bus_message_read(reply, "(sus)", &path, &pid, &name); + if (r < 0) + goto finish; + if (r == 
0) + break; + + r = add_process(cgroups, path, pid, name); + if (r == -ENOMEM) + goto finish; + if (r < 0) + log_warning_errno(r, "Invalid process description in GetUnitProcesses reply: cgroup=\"%s\" pid=%u command=\"%s\", ignoring: %m", + path, pid, name); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + goto finish; + + r = dump_processes(cgroups, cgroup_path, prefix, n_columns, flags); + if (r < 0) + goto finish; + + r = dump_extra_processes(cgroups, prefix, n_columns, flags); + +finish: + while ((cg = hashmap_first(cgroups))) + remove_cgroup(cgroups, cg); + + hashmap_free(cgroups); + + return r; +} diff --git a/src/shared/bus-unit-procs.h b/src/shared/bus-unit-procs.h new file mode 100644 index 0000000..78c5569 --- /dev/null +++ b/src/shared/bus-unit-procs.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "output-mode.h" + +int unit_show_processes(sd_bus *bus, const char *unit, const char *cgroup_path, const char *prefix, unsigned n_columns, OutputFlags flags, sd_bus_error *error); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c new file mode 100644 index 0000000..50de989 --- /dev/null +++ b/src/shared/bus-unit-util.c @@ -0,0 +1,2938 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "af-list.h" +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "cap-list.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "condition.h" +#include "coredump-util.h" +#include "cpu-set-util.h" +#include "dissect-image.h" +#include "escape.h" +#include "exec-util.h" +#include "exit-status.h" +#include "fileio.h" +#include "firewall-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "in-addr-util.h" +#include "ioprio-util.h" +#include "ip-protocol-list.h" +#include "libmount-util.h" +#include "locale-util.h" +#include "log.h" +#include "macro.h" 
+#include "missing_fs.h" +#include "mountpoint-util.h" +#include "nsflags.h" +#include "numa-util.h" +#include "open-file.h" +#include "parse-helpers.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "seccomp-util.h" +#include "securebits-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "sort-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "syslog-util.h" +#include "terminal-util.h" +#include "unit-def.h" +#include "user-util.h" +#include "utf8.h" + +int bus_parse_unit_info(sd_bus_message *message, UnitInfo *u) { + assert(message); + assert(u); + + u->machine = NULL; + + return sd_bus_message_read( + message, + "(ssssssouso)", + &u->id, + &u->description, + &u->load_state, + &u->active_state, + &u->sub_state, + &u->following, + &u->unit_path, + &u->job_id, + &u->job_type, + &u->job_path); +} + +#define DEFINE_BUS_APPEND_PARSE_PTR(bus_type, cast_type, type, parse_func) \ + static int bus_append_##parse_func( \ + sd_bus_message *m, \ + const char *field, \ + const char *eq) { \ + type val; \ + int r; \ + \ + r = parse_func(eq, &val); \ + if (r < 0) \ + return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq); \ + \ + r = sd_bus_message_append(m, "(sv)", field, \ + bus_type, (cast_type) val); \ + if (r < 0) \ + return bus_log_create_error(r); \ + \ + return 1; \ + } + +#define DEFINE_BUS_APPEND_PARSE(bus_type, parse_func) \ + static int bus_append_##parse_func( \ + sd_bus_message *m, \ + const char *field, \ + const char *eq) { \ + int r; \ + \ + r = parse_func(eq); \ + if (r < 0) \ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s: %s", field, eq); \ + \ + r = sd_bus_message_append(m, "(sv)", field, \ + bus_type, (int32_t) r); \ + if (r < 0) \ + return bus_log_create_error(r); \ + \ + return 1; \ + } + +DEFINE_BUS_APPEND_PARSE("b", parse_boolean); +DEFINE_BUS_APPEND_PARSE("i", ioprio_class_from_string); 
+DEFINE_BUS_APPEND_PARSE("i", ip_tos_from_string);
+DEFINE_BUS_APPEND_PARSE("i", log_facility_unshifted_from_string);
+DEFINE_BUS_APPEND_PARSE("i", log_level_from_string);
+DEFINE_BUS_APPEND_PARSE("i", seccomp_parse_errno_or_action);
+DEFINE_BUS_APPEND_PARSE("i", sched_policy_from_string);
+DEFINE_BUS_APPEND_PARSE("i", secure_bits_from_string);
+DEFINE_BUS_APPEND_PARSE("i", signal_from_string);
+DEFINE_BUS_APPEND_PARSE("i", parse_ip_protocol);
+DEFINE_BUS_APPEND_PARSE_PTR("i", int32_t, int, ioprio_parse_priority);
+DEFINE_BUS_APPEND_PARSE_PTR("i", int32_t, int, parse_nice);
+DEFINE_BUS_APPEND_PARSE_PTR("i", int32_t, int, safe_atoi);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, nsec_t, parse_nsec);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_blkio_weight_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_cpu_shares_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_weight_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_cpu_weight_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, unsigned long, mount_propagation_flag_from_string);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, safe_atou64);
+DEFINE_BUS_APPEND_PARSE_PTR("u", uint32_t, mode_t, parse_mode);
+DEFINE_BUS_APPEND_PARSE_PTR("u", uint32_t, unsigned, safe_atou);
+DEFINE_BUS_APPEND_PARSE_PTR("x", int64_t, int64_t, safe_atoi64);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, coredump_filter_mask_from_string);
+
+/* Appends the property 'field' with the string value 'eq' as "(sv)". Returns 1 on success. */
+static int bus_append_string(sd_bus_message *m, const char *field, const char *eq) {
+        int r;
+
+        r = sd_bus_message_append(m, "(sv)", field, "s", eq);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Splits 'eq' into words with extract_first_word() using 'flags' and appends them as a string array
+ * ("as") property named 'field'. Returns 1 on success, negative errno on failure. */
+static int bus_append_strv(sd_bus_message *m, const char *field, const char *eq, ExtractFlags flags) {
+        const char *p;
+        int r;
+
+        r = sd_bus_message_open_container(m, 'r', "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_basic(m, 's', field);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'v', "as");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'a', "s");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        for (p = eq;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&p, &word, NULL, flags);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        return log_error_errno(r, "Invalid syntax: %s", eq);
+
+                r = sd_bus_message_append_basic(m, 's', word);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        /* Close the array, the variant and the struct, in this order */
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Appends the 'n' bytes at 'buf' as a byte array ("ay") property named 'field'. Returns 1 on success. */
+static int bus_append_byte_array(sd_bus_message *m, const char *field, const void *buf, size_t n) {
+        int r;
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'v', "ay");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_array(m, 'y', buf, n);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Parses 'eq' as a time span and appends it in µs under the name of 'field' with its "Sec" suffix
+ * replaced by "USec". 'field' must end in "Sec". Returns 1 on success, negative errno on failure. */
+static int bus_append_parse_sec_rename(sd_bus_message *m, const char *field, const char *eq) {
+        char *n;
+        usec_t t;
+        size_t l;
+        int r;
+
+        assert(field);
+        /* The renaming below strips the last three characters, make sure they are what we expect (and
+         * that "l - 3" cannot wrap around). */
+        assert(endswith(field, "Sec"));
+
+        r = parse_sec(eq, &t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq);
+
+        l = strlen(field);
+        n = newa(char, l + 2);
+        /* Change suffix Sec → USec */
+        strcpy(mempcpy(n, field, l - 3), "USec");
+
+        r = sd_bus_message_append(m, "(sv)", n, "t", t);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Parses 'eq' as a size with the given unit 'base' and appends it as a "t" property named 'field'.
+ * Returns 1 on success, negative errno on failure. */
+static int bus_append_parse_size(sd_bus_message *m, const char *field, const char *eq, uint64_t base) {
+        uint64_t v;
+        int r;
+
+        r = parse_size(eq, base, &v);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq);
+
+        r = sd_bus_message_append(m, "(sv)", field, "t", v);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Parses an ExecXYZ= command line in 'eq' (optional prefix characters "-", "@", ":", "+", "!", "!!"
+ * followed by the command) and appends it either as "a(sasb)" or, if 'field' ends in "Ex" or a prefix
+ * is used that only the extended form can express, as "a(sasas)" under the "Ex" property name.
+ * Returns 1 on success, negative errno on failure. */
+static int bus_append_exec_command(sd_bus_message *m, const char *field, const char *eq) {
+        bool explicit_path = false, done = false;
+        _cleanup_strv_free_ char **l = NULL, **ex_opts = NULL;
+        _cleanup_free_ char *path = NULL, *upgraded_name = NULL;
+        ExecCommandFlags flags = 0;
+        bool is_ex_prop = endswith(field, "Ex");
+        int r;
+
+        /* Consume prefix characters; each may appear once, the first repetition ends the prefix */
+        do {
+                switch (*eq) {
+
+                case '-':
+                        if (FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE))
+                                done = true;
+                        else {
+                                flags |= EXEC_COMMAND_IGNORE_FAILURE;
+                                eq++;
+                        }
+                        break;
+
+                case '@':
+                        if (explicit_path)
+                                done = true;
+                        else {
+                                explicit_path = true;
+                                eq++;
+                        }
+                        break;
+
+                case ':':
+                        if (FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND))
+                                done = true;
+                        else {
+                                flags |= EXEC_COMMAND_NO_ENV_EXPAND;
+                                eq++;
+                        }
+                        break;
+
+                case '+':
+                        if (flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))
+                                done = true;
+                        else {
+                                flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
+                                eq++;
+                        }
+                        break;
+
+                case '!':
+                        if (flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))
+                                done = true;
+                        else if (FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID)) {
+                                /* Second "!" in a row: "!!" */
+                                flags &= ~EXEC_COMMAND_NO_SETUID;
+                                flags |= EXEC_COMMAND_AMBIENT_MAGIC;
+                                eq++;
+                        } else {
+                                flags |= EXEC_COMMAND_NO_SETUID;
+                                eq++;
+                        }
+                        break;
+
+                default:
+                        done = true;
+                        break;
+                }
+        } while (!done);
+
+        if (!is_ex_prop && (flags & (EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) {
+                /* Upgrade the ExecXYZ= property to ExecXYZEx= for convenience */
+                is_ex_prop = true;
+                upgraded_name = strjoin(field, "Ex");
+                if (!upgraded_name)
+                        return log_oom();
+        }
+
+        if (is_ex_prop) {
+                r = exec_command_flags_to_strv(flags, &ex_opts);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to convert ExecCommandFlags to strv: %m");
+        }
+
+        if (explicit_path) {
+                r = extract_first_word(&eq, &path, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse path: %m");
+        }
+
+        r = strv_split_full(&l, eq, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse command line: %m");
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, upgraded_name ?: field);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'v', is_ex_prop ? "a(sasas)" : "a(sasb)");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'a', is_ex_prop ? "(sasas)" : "(sasb)");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        /* An empty command line results in an empty array */
+        if (!strv_isempty(l)) {
+
+                r = sd_bus_message_open_container(m, 'r', is_ex_prop ? "sasas" : "sasb");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append(m, "s", path ?: l[0]);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_strv(m, l);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = is_ex_prop ? sd_bus_message_append_strv(m, ex_opts) : sd_bus_message_append(m, "b", FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE));
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Parses 'eq' with open_file_parse() and appends the result as a one-element "a(sst)" property named
+ * 'field'. Returns 1 on success, negative errno on failure. */
+static int bus_append_open_file(sd_bus_message *m, const char *field, const char *eq) {
+        _cleanup_(open_file_freep) OpenFile *of = NULL;
+        int r;
+
+        assert(m);
+
+        r = open_file_parse(eq, &of);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse OpenFile= setting: %m");
+
+        /* sd_bus_message_append() is variadic: pass the array count as a plain int like everywhere else
+         * in this file, and make sure the "t" item really is 64 bits wide. */
+        r = sd_bus_message_append(m, "(sv)", field, "a(sst)", 1, of->path, of->fdname, (uint64_t) of->flags);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Appends one "(iayu)" struct (address family, address bytes, prefix length) to the currently open
+ * array. Unlike the other helpers this does not log and returns the plain sd-bus result. */
+static int bus_append_ip_address_access(sd_bus_message *m, int family, const union in_addr_union *prefix, unsigned char prefixlen) {
+        int r;
+
+        assert(m);
+        assert(prefix);
+
+        r = sd_bus_message_open_container(m, 'r', "iayu");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(m, "i", family);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_array(m, 'y', prefix, FAMILY_ADDRESS_SIZE(family));
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(m, "u", prefixlen);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_close_container(m);
+}
+
+/* Parses a whitespace separated list of "source:nfproto:table:set" tuples in 'eq' and appends it as an
+ * "a(iiss)" property named 'field'. An empty 'eq' yields an empty array. Returns 1 on success,
+ * negative errno on failure. */
+static int bus_append_nft_set(sd_bus_message *m, const char *field, const char *eq) {
+        int r;
+
+        assert(m);
+        assert(field);
+        assert(eq);
+
+        if (isempty(eq)) {
+                r = sd_bus_message_append(m, "(sv)", field, "a(iiss)", 0);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'v', "a(iiss)");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, 'a', "(iiss)");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        for (const char *p = eq;;) {
+                _cleanup_free_ char *tuple = NULL, *source_str = NULL, *nfproto_str = NULL, *table = NULL, *set = NULL;
+                const char *q = NULL;
+                int source, nfproto;
+
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s: %m", field);
+                if (r == 0)
+                        break;
+                if (isempty(tuple))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+                /* Exactly four colon separated fields, with nothing left over */
+                q = tuple;
+                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE, &source_str, &nfproto_str, &table, &set, NULL);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r != 4 || !isempty(q))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+                assert(source_str);
+                assert(nfproto_str);
+                assert(table);
+                assert(set);
+
+                source = nft_set_source_from_string(source_str);
+                if (!IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+                nfproto = nfproto_from_string(nfproto_str);
+                if (nfproto < 0 || !nft_identifier_valid(table) || !nft_identifier_valid(set))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+                r = sd_bus_message_append(m, "(iiss)", source, nfproto, table, set);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
return 1;
+}
+
+/* Translates a cgroup/resource-control setting "field=eq" into the matching D-Bus property and appends
+ * it to 'm' as "(sv)". Returns 1 if the field was recognized and appended, 0 if 'field' is not handled
+ * here, and a negative errno (after logging) on failure. */
+static int bus_append_cgroup_property(sd_bus_message *m, const char *field, const char *eq) {
+        int r;
+
+        if (STR_IN_SET(field, "DevicePolicy",
+                              "Slice",
+                              "ManagedOOMSwap",
+                              "ManagedOOMMemoryPressure",
+                              "ManagedOOMPreference",
+                              "MemoryPressureWatch",
+                              "DelegateSubgroup"))
+                return bus_append_string(m, field, eq);
+
+        if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {
+                r = parse_permyriad(eq);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+                /* Pass around scaled to 2^32-1 == 100% */
+                r = sd_bus_message_append(m, "(sv)", field, "u", UINT32_SCALE_FROM_PERMYRIAD(r));
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "CPUAccounting",
+                              "MemoryAccounting",
+                              "IOAccounting",
+                              "BlockIOAccounting",
+                              "TasksAccounting",
+                              "IPAccounting",
+                              "CoredumpReceive"))
+                return bus_append_parse_boolean(m, field, eq);
+
+        if (STR_IN_SET(field, "CPUWeight",
+                              "StartupCPUWeight"))
+                return bus_append_cg_cpu_weight_parse(m, field, eq);
+
+        if (STR_IN_SET(field, "IOWeight",
+                              "StartupIOWeight"))
+                return bus_append_cg_weight_parse(m, field, eq);
+
+        if (STR_IN_SET(field, "CPUShares",
+                              "StartupCPUShares"))
+                return bus_append_cg_cpu_shares_parse(m, field, eq);
+
+        if (STR_IN_SET(field, "AllowedCPUs",
+                              "StartupAllowedCPUs",
+                              "AllowedMemoryNodes",
+                              "StartupAllowedMemoryNodes")) {
+                _cleanup_(cpu_set_reset) CPUSet cpuset = {};
+                _cleanup_free_ uint8_t *array = NULL;
+                size_t allocated;
+
+                r = parse_cpu_set(eq, &cpuset);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+                r = cpu_set_to_dbus(&cpuset, &array, &allocated);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to serialize CPUSet: %m");
+
+                return bus_append_byte_array(m, field, array, allocated);
+        }
+
+        if (STR_IN_SET(field, "BlockIOWeight",
+                              "StartupBlockIOWeight"))
+                return bus_append_cg_blkio_weight_parse(m, field, eq);
+
+        if (streq(field, "DisableControllers"))
+                return bus_append_strv(m, "DisableControllers", eq, EXTRACT_UNQUOTE);
+
+        if (streq(field, "Delegate")) {
+                /* Either a boolean, or a list of controllers to delegate */
+                r = parse_boolean(eq);
+                if (r < 0)
+                        return bus_append_strv(m, "DelegateControllers", eq, EXTRACT_UNQUOTE);
+
+                r = sd_bus_message_append(m, "(sv)", "Delegate", "b", r);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "MemoryMin",
+                              "DefaultMemoryLow",
+                              "DefaultMemoryMin",
+                              "MemoryLow",
+                              "MemoryHigh",
+                              "MemoryMax",
+                              "MemorySwapMax",
+                              "MemoryZSwapMax",
+                              "MemoryLimit",
+                              "TasksMax")) {
+
+                if (streq(eq, "infinity")) {
+                        r = sd_bus_message_append(m, "(sv)", field, "t", CGROUP_LIMIT_MAX);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                        return 1;
+                } else if (isempty(eq)) {
+                        /* Resetting: protections go back to the minimum, limits back to the maximum */
+                        uint64_t empty_value = STR_IN_SET(field,
+                                                          "DefaultMemoryLow",
+                                                          "DefaultMemoryMin",
+                                                          "MemoryLow",
+                                                          "MemoryMin") ?
+                                               CGROUP_LIMIT_MIN :
+                                               CGROUP_LIMIT_MAX;
+
+                        r = sd_bus_message_append(m, "(sv)", field, "t", empty_value);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                        return 1;
+                }
+
+                r = parse_permyriad(eq);
+                if (r >= 0) {
+                        char *n;
+
+                        /* When this is a percentage we'll convert this into a relative value in the range 0…UINT32_MAX
+                         * and pass it in the MemoryLowScale property (and related ones). This way the physical memory
+                         * size can be determined server-side. */
+
+                        n = strjoina(field, "Scale");
+                        r = sd_bus_message_append(m, "(sv)", n, "u", UINT32_SCALE_FROM_PERMYRIAD(r));
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        return 1;
+                }
+
+                if (streq(field, "TasksMax"))
+                        return bus_append_safe_atou64(m, field, eq);
+
+                return bus_append_parse_size(m, field, eq, 1024);
+        }
+
+        if (streq(field, "CPUQuota")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", "CPUQuotaPerSecUSec", "t", USEC_INFINITY);
+                else {
+                        r = parse_permyriad_unbounded(eq);
+                        if (r == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(ERANGE),
+                                                       "CPU quota too small.");
+                        if (r < 0)
+                                return log_error_errno(r, "CPU quota '%s' invalid.", eq);
+
+                        r = sd_bus_message_append(m, "(sv)", "CPUQuotaPerSecUSec", "t", (((uint64_t) r * USEC_PER_SEC) / 10000U));
+                }
+
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "CPUQuotaPeriodSec")) {
+                usec_t u = USEC_INFINITY;
+
+                r = parse_sec_def_infinity(eq, &u);
+                if (r < 0)
+                        return log_error_errno(r, "CPU quota period '%s' invalid.", eq);
+
+                r = sd_bus_message_append(m, "(sv)", "CPUQuotaPeriodUSec", "t", u);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "DeviceAllow")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 0);
+                else {
+                        const char *path = eq, *rwm = NULL, *e;
+
+                        /* "PATH [rwm]": everything after the first space is the access string */
+                        e = strchr(eq, ' ');
+                        if (e) {
+                                path = strndupa_safe(eq, e - eq);
+                                rwm = e+1;
+                        }
+
+                        r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 1, path, strempty(rwm));
+                }
+
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (cgroup_io_limit_type_from_string(field) >= 0 || STR_IN_SET(field, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "a(st)", 0);
+                else {
+                        const char *path, *bandwidth, *e;
+                        uint64_t bytes;
+
+                        e = strchr(eq, ' ');
+                        if (!e)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to parse %s value %s.",
+                                                       field, eq);
+
+                        path = strndupa_safe(eq, e - eq);
+                        bandwidth = e+1;
+
+                        if (streq(bandwidth, "infinity"))
+                                bytes = CGROUP_LIMIT_MAX;
+                        else {
+                                r = parse_size(bandwidth, 1000, &bytes);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse byte value %s: %m", bandwidth);
+                        }
+
+                        r = sd_bus_message_append(m, "(sv)", field, "a(st)", 1, path, bytes);
+                }
+
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "IODeviceWeight",
+                              "BlockIODeviceWeight")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "a(st)", 0);
+                else {
+                        const char *path, *weight, *e;
+                        uint64_t u;
+
+                        e = strchr(eq, ' ');
+                        if (!e)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to parse %s value %s.",
+                                                       field, eq);
+
+                        path = strndupa_safe(eq, e - eq);
+                        weight = e+1;
+
+                        r = safe_atou64(weight, &u);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s value %s: %m", field, weight);
+
+                        r = sd_bus_message_append(m, "(sv)", field, "a(st)", 1, path, u);
+                }
+
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "IODeviceLatencyTargetSec")) {
+                const char *field_usec = "IODeviceLatencyTargetUSec";
+
+                if (isempty(eq))
+                        /* The argument following an array signature is the number of elements, hence an
+                         * empty assignment is an array of zero entries (not USEC_INFINITY). */
+                        r = sd_bus_message_append(m, "(sv)", field_usec, "a(st)", 0);
+                else {
+                        const char *path, *target, *e;
+                        usec_t usec;
+
+                        e = strchr(eq, ' ');
+                        if (!e)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to parse %s value %s.",
+                                                       field, eq);
+
+                        path = strndupa_safe(eq, e - eq);
+                        target = e+1;
+
+                        r = parse_sec(target, &usec);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s value %s: %m", field, target);
+
+                        r = sd_bus_message_append(m, "(sv)", field_usec, "a(st)", 1, path, usec);
+                }
+
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "IPAddressAllow",
+                              "IPAddressDeny")) {
+                unsigned char prefixlen;
+                union in_addr_union prefix = {};
+                int family;
+
+                if (isempty(eq)) {
+                        r = sd_bus_message_append(m, "(sv)", field, "a(iayu)", 0);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        return 1;
+                }
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(iayu)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "(iayu)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                if (streq(eq, "any")) {
+                        /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */
+
+                        r = bus_append_ip_address_access(m, AF_INET, &prefix, 0);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = bus_append_ip_address_access(m, AF_INET6, &prefix, 0);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                } else if (is_localhost(eq)) {
+                        /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */
+
+                        prefix.in.s_addr = htobe32(0x7f000000);
+                        r = bus_append_ip_address_access(m, AF_INET, &prefix, 8);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        prefix.in6 = (struct in6_addr) IN6ADDR_LOOPBACK_INIT;
+                        r = bus_append_ip_address_access(m, AF_INET6, &prefix, 128);
+                        if (r < 0)
+                                return bus_log_create_error(r); /* log like all sibling branches do */
+
+                } else if (streq(eq, "link-local")) {
+                        /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */
+
+                        prefix.in.s_addr = htobe32((UINT32_C(169) << 24 | UINT32_C(254) << 16));
+                        r = bus_append_ip_address_access(m, AF_INET, &prefix, 16);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        prefix.in6 = (struct in6_addr) {
+                                .s6_addr32[0] = htobe32(0xfe800000)
+                        };
+                        r = bus_append_ip_address_access(m, AF_INET6, &prefix, 64);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                } else if (streq(eq, "multicast")) {
+                        /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */
+
+                        prefix.in.s_addr = htobe32((UINT32_C(224) << 24));
+                        r = bus_append_ip_address_access(m, AF_INET, &prefix, 4);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        prefix.in6 = (struct in6_addr) {
+                                .s6_addr32[0] = htobe32(0xff000000)
+                        };
+                        r = bus_append_ip_address_access(m, AF_INET6, &prefix, 8);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                } else {
+                        /* Otherwise: a whitespace separated list of address prefixes */
+                        for (;;) {
+                                _cleanup_free_ char *word = NULL;
+
+                                r = extract_first_word(&eq, &word, NULL, 0);
+                                if (r == 0)
+                                        break;
+                                if (r == -ENOMEM)
+                                        return log_oom();
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse %s: %s", field, eq);
+
+                                r = in_addr_prefix_from_string_auto(word, &family, &prefix, &prefixlen);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse IP address prefix: %s", word);
+
+                                r = bus_append_ip_address_access(m, family, &prefix, prefixlen);
+                                if (r < 0)
+                                        return bus_log_create_error(r);
+                        }
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "IPIngressFilterPath",
+                              "IPEgressFilterPath")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "as", 0);
+                else
+                        r = sd_bus_message_append(m, "(sv)", field, "as", 1, eq);
+
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "BPFProgram")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 0);
+                else {
+                        _cleanup_free_ char *word = NULL;
+
+                        /* The first ":"-separated word goes into the first string, the rest of eq into the second */
+                        r = extract_first_word(&eq, &word, ":", 0);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s: %m", field);
+
+                        r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 1, word, eq);
+                }
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "SocketBindAllow",
+                              "SocketBindDeny")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "a(iiqq)", 0);
+                else {
+                        int32_t family, ip_protocol;
+                        uint16_t nr_ports, port_min;
+
+                        r = parse_socket_bind_item(eq, &family, &ip_protocol, &nr_ports, &port_min);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s", field);
+
+                        r = sd_bus_message_append(
+                                        m, "(sv)", field, "a(iiqq)", 1, family, ip_protocol, nr_ports, port_min);
+                }
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "MemoryPressureThresholdSec"))
+                return bus_append_parse_sec_rename(m, field, eq);
+
+        if (streq(field, "NFTSet"))
+                return bus_append_nft_set(m, field, eq);
+
+        return 0;
+}
+
+/* Same contract as bus_append_cgroup_property(), for the automount unit settings Where, ExtraOptions,
+ * DirectoryMode and TimeoutIdleSec. */
+static int bus_append_automount_property(sd_bus_message *m, const char *field, const char *eq) {
+        if (STR_IN_SET(field, "Where",
+                              "ExtraOptions"))
+                return bus_append_string(m, field, eq);
+
+        if (streq(field, "DirectoryMode"))
+                return bus_append_parse_mode(m, field, eq);
+
+        if (streq(field, "TimeoutIdleSec"))
+                return bus_append_parse_sec_rename(m, field, eq);
+
+        return 0;
+}
+
+static int bus_append_execute_property(sd_bus_message *m, const char *field, const char *eq) {
+        const char *suffix;
+        int r;
+
+        if (STR_IN_SET(field, "User",
+                              "Group",
+                              "UtmpIdentifier",
+                              "UtmpMode",
+                              "PAMName",
+                              "TTYPath",
+                              "WorkingDirectory",
+                              "RootDirectory",
+                              "SyslogIdentifier",
+                              "ProtectSystem",
+                              "ProtectHome",
+                              "SELinuxContext",
+                              "RootImage",
+                              "RootVerity",
+                              "RuntimeDirectoryPreserve",
+                              "Personality",
+                              "KeyringMode",
+                              "ProtectProc",
+                              "ProcSubset",
+                              "NetworkNamespacePath",
+                              "IPCNamespacePath",
+                              "LogNamespace",
+                              "RootImagePolicy",
+                              "MountImagePolicy",
+                              "ExtensionImagePolicy"))
+                return bus_append_string(m, field, eq);
+
+        if (STR_IN_SET(field, "IgnoreSIGPIPE",
+                              "TTYVHangup",
+                              "TTYReset",
+                              "TTYVTDisallocate",
+                              "PrivateTmp",
+                              "PrivateDevices",
+                              "PrivateNetwork",
+                              "PrivateUsers",
+                              "PrivateMounts",
+                              "PrivateIPC",
+                              "NoNewPrivileges",
+                              "SyslogLevelPrefix",
+                              "MemoryDenyWriteExecute",
+                              "RestrictRealtime",
+                              "DynamicUser",
+                              "RemoveIPC",
+                              "ProtectKernelTunables",
+                              "ProtectKernelModules",
+                              "ProtectKernelLogs",
+                              "ProtectClock",
+
"ProtectControlGroups", + "MountAPIVFS", + "CPUSchedulingResetOnFork", + "LockPersonality", + "ProtectHostname", + "MemoryKSM", + "RestrictSUIDSGID", + "RootEphemeral", + "SetLoginEnvironment")) + return bus_append_parse_boolean(m, field, eq); + + if (STR_IN_SET(field, "ReadWriteDirectories", + "ReadOnlyDirectories", + "InaccessibleDirectories", + "ReadWritePaths", + "ReadOnlyPaths", + "InaccessiblePaths", + "ExecPaths", + "NoExecPaths", + "ExecSearchPath", + "ExtensionDirectories", + "ConfigurationDirectory", + "SupplementaryGroups", + "SystemCallArchitectures")) + return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE); + + if (STR_IN_SET(field, "SyslogLevel", + "LogLevelMax")) + return bus_append_log_level_from_string(m, field, eq); + + if (streq(field, "SyslogFacility")) + return bus_append_log_facility_unshifted_from_string(m, field, eq); + + if (streq(field, "SecureBits")) + return bus_append_secure_bits_from_string(m, field, eq); + + if (streq(field, "CPUSchedulingPolicy")) + return bus_append_sched_policy_from_string(m, field, eq); + + if (STR_IN_SET(field, "CPUSchedulingPriority", + "OOMScoreAdjust")) + return bus_append_safe_atoi(m, field, eq); + + if (streq(field, "CoredumpFilter")) + return bus_append_coredump_filter_mask_from_string(m, field, eq); + + if (streq(field, "Nice")) + return bus_append_parse_nice(m, field, eq); + + if (streq(field, "SystemCallErrorNumber")) + return bus_append_seccomp_parse_errno_or_action(m, field, eq); + + if (streq(field, "IOSchedulingClass")) + return bus_append_ioprio_class_from_string(m, field, eq); + + if (streq(field, "IOSchedulingPriority")) + return bus_append_ioprio_parse_priority(m, field, eq); + + if (STR_IN_SET(field, "RuntimeDirectoryMode", + "StateDirectoryMode", + "CacheDirectoryMode", + "LogsDirectoryMode", + "ConfigurationDirectoryMode", + "UMask")) + return bus_append_parse_mode(m, field, eq); + + if (streq(field, "TimerSlackNSec")) + return bus_append_parse_nsec(m, field, eq); + + if (streq(field, 
"LogRateLimitIntervalSec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (STR_IN_SET(field, "LogRateLimitBurst", + "TTYRows", + "TTYColumns")) + return bus_append_safe_atou(m, field, eq); + + if (streq(field, "MountFlags")) + return bus_append_mount_propagation_flag_from_string(m, field, eq); + + if (STR_IN_SET(field, "Environment", + "UnsetEnvironment", + "PassEnvironment")) + return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE); + + if (streq(field, "EnvironmentFile")) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", "EnvironmentFiles", "a(sb)", 0); + else + r = sd_bus_message_append(m, "(sv)", "EnvironmentFiles", "a(sb)", 1, + eq[0] == '-' ? eq + 1 : eq, + eq[0] == '-'); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (STR_IN_SET(field, "SetCredential", "SetCredentialEncrypted")) { + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, 's', field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(say)"); + if (r < 0) + return bus_log_create_error(r); + + if (isempty(eq)) + r = sd_bus_message_append(m, "a(say)", 0); + else { + _cleanup_free_ char *word = NULL; + const char *p = eq; + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to parse %s= parameter: %s", field, eq); + if (r == 0 || !p) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing argument to %s=.", field); + + r = sd_bus_message_open_container(m, 'a', "(say)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'r', "say"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", word); + if (r < 0) + return bus_log_create_error(r); + + if (streq(field, "SetCredentialEncrypted")) { + _cleanup_free_ void *decoded = NULL; + 
size_t decoded_size; + + r = unbase64mem(p, SIZE_MAX, &decoded, &decoded_size); + if (r < 0) + return log_error_errno(r, "Failed to base64 decode encrypted credential: %m"); + + r = sd_bus_message_append_array(m, 'y', decoded, decoded_size); + } else { + _cleanup_free_ char *unescaped = NULL; + ssize_t l; + + l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped); + if (l < 0) + return log_error_errno(l, "Failed to unescape %s= value: %s", field, p); + + r = sd_bus_message_append_array(m, 'y', unescaped, l); + } + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + } + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (STR_IN_SET(field, "LoadCredential", "LoadCredentialEncrypted")) { + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, 's', field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(ss)"); + if (r < 0) + return bus_log_create_error(r); + + if (isempty(eq)) + r = sd_bus_message_append(m, "a(ss)", 0); + else { + _cleanup_free_ char *word = NULL; + const char *p = eq; + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to parse %s= parameter: %s", field, eq); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing argument to %s=.", field); + + if (isempty(p)) /* If only one field is specified, then this means "inherit from above" */ + p = eq; + + r = sd_bus_message_append(m, "a(ss)", 1, word, p); + } + if (r < 0) + return bus_log_create_error(r); + + r = 
sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "ImportCredential")) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", field, "as", 0); + else + r = sd_bus_message_append(m, "(sv)", field, "as", 1, eq); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "LogExtraFields")) { + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, 's', "LogExtraFields"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "aay"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "ay"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_array(m, 'y', eq, strlen(eq)); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "LogFilterPatterns")) { + r = sd_bus_message_append(m, "(sv)", "LogFilterPatterns", "a(bs)", 1, + eq[0] != '~', + eq[0] != '~' ? 
eq : eq + 1); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (STR_IN_SET(field, "StandardInput", + "StandardOutput", + "StandardError")) { + const char *n, *appended; + + if ((n = startswith(eq, "fd:"))) { + appended = strjoina(field, "FileDescriptorName"); + r = sd_bus_message_append(m, "(sv)", appended, "s", n); + } else if ((n = startswith(eq, "file:"))) { + appended = strjoina(field, "File"); + r = sd_bus_message_append(m, "(sv)", appended, "s", n); + } else if ((n = startswith(eq, "append:"))) { + appended = strjoina(field, "FileToAppend"); + r = sd_bus_message_append(m, "(sv)", appended, "s", n); + } else if ((n = startswith(eq, "truncate:"))) { + appended = strjoina(field, "FileToTruncate"); + r = sd_bus_message_append(m, "(sv)", appended, "s", n); + } else + r = sd_bus_message_append(m, "(sv)", field, "s", eq); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "StandardInputText")) { + _cleanup_free_ char *unescaped = NULL; + ssize_t l; + + l = cunescape(eq, 0, &unescaped); + if (l < 0) + return log_error_errno(l, "Failed to unescape text '%s': %m", eq); + + if (!strextend(&unescaped, "\n")) + return log_oom(); + + /* Note that we don't expand specifiers here, but that should be OK, as this is a + * programmatic interface anyway */ + + return bus_append_byte_array(m, field, unescaped, l + 1); + } + + if (streq(field, "StandardInputData")) { + _cleanup_free_ void *decoded = NULL; + size_t sz; + + r = unbase64mem(eq, SIZE_MAX, &decoded, &sz); + if (r < 0) + return log_error_errno(r, "Failed to decode base64 data '%s': %m", eq); + + return bus_append_byte_array(m, field, decoded, sz); + } + + if ((suffix = startswith(field, "Limit"))) { + int rl; + + rl = rlimit_from_string(suffix); + if (rl >= 0) { + const char *sn; + struct rlimit l; + + r = rlimit_parse(rl, eq, &l); + if (r < 0) + return log_error_errno(r, "Failed to parse resource limit: %s", eq); + + r = sd_bus_message_append(m, "(sv)", field, 
"t", (uint64_t) l.rlim_max); + if (r < 0) + return bus_log_create_error(r); + + sn = strjoina(field, "Soft"); + r = sd_bus_message_append(m, "(sv)", sn, "t", (uint64_t) l.rlim_cur); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + } + + if (STR_IN_SET(field, "AppArmorProfile", + "SmackProcessLabel")) { + int ignore = 0; + const char *s = eq; + + if (eq[0] == '-') { + ignore = 1; + s = eq + 1; + } + + r = sd_bus_message_append(m, "(sv)", field, "(bs)", ignore, s); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (STR_IN_SET(field, "CapabilityBoundingSet", + "AmbientCapabilities")) { + uint64_t sum = 0; + bool invert = false; + const char *p = eq; + + if (*p == '~') { + invert = true; + p++; + } + + r = capability_set_from_string(p, &sum); + if (r < 0) + return log_error_errno(r, "Failed to parse %s value %s: %m", field, eq); + + sum = invert ? ~sum : sum; + + r = sd_bus_message_append(m, "(sv)", field, "t", sum); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "CPUAffinity")) { + _cleanup_(cpu_set_reset) CPUSet cpuset = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + if (eq && streq(eq, "numa")) { + r = sd_bus_message_append(m, "(sv)", "CPUAffinityFromNUMA", "b", true); + if (r < 0) + return bus_log_create_error(r); + return r; + } + + r = parse_cpu_set(eq, &cpuset); + if (r < 0) + return log_error_errno(r, "Failed to parse %s value: %s", field, eq); + + r = cpu_set_to_dbus(&cpuset, &array, &allocated); + if (r < 0) + return log_error_errno(r, "Failed to serialize CPUAffinity: %m"); + + return bus_append_byte_array(m, field, array, allocated); + } + + if (streq(field, "NUMAPolicy")) { + r = mpol_from_string(eq); + if (r < 0) + return log_error_errno(r, "Failed to parse %s value: %s", field, eq); + + r = sd_bus_message_append(m, "(sv)", field, "i", (int32_t) r); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "NUMAMask")) { + 
_cleanup_(cpu_set_reset) CPUSet nodes = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + if (eq && streq(eq, "all")) { + r = numa_mask_add_all(&nodes); + if (r < 0) + return log_error_errno(r, "Failed to create NUMA mask representing \"all\" NUMA nodes: %m"); + } else { + r = parse_cpu_set(eq, &nodes); + if (r < 0) + return log_error_errno(r, "Failed to parse %s value: %s", field, eq); + } + + r = cpu_set_to_dbus(&nodes, &array, &allocated); + if (r < 0) + return log_error_errno(r, "Failed to serialize NUMAMask: %m"); + + return bus_append_byte_array(m, field, array, allocated); + } + + if (STR_IN_SET(field, "RestrictAddressFamilies", + "RestrictFileSystems", + "SystemCallFilter", + "SystemCallLog", + "RestrictNetworkInterfaces")) { + int allow_list = 1; + const char *p = eq; + + if (*p == '~') { + allow_list = 0; + p++; + } + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "(bas)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'r', "bas"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, 'b', &allow_list); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Invalid syntax: %s", eq); + + r = sd_bus_message_append_basic(m, 's', word); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return 
bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "RestrictNamespaces")) { + bool invert = false; + unsigned long flags; + + r = parse_boolean(eq); + if (r > 0) + flags = 0; + else if (r == 0) + flags = NAMESPACE_FLAGS_ALL; + else { + if (eq[0] == '~') { + invert = true; + eq++; + } + + r = namespace_flags_from_string(eq, &flags); + if (r < 0) + return log_error_errno(r, "Failed to parse %s value %s.", field, eq); + } + + if (invert) + flags = (~flags) & NAMESPACE_FLAGS_ALL; + + r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (STR_IN_SET(field, "BindPaths", + "BindReadOnlyPaths")) { + const char *p = eq; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(ssbt)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(ssbt)"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *source = NULL, *destination = NULL; + char *s = NULL, *d = NULL; + bool ignore_enoent = false; + uint64_t flags = MS_REC; + + r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + if (r == 0) + break; + + s = source; + if (s[0] == '-') { + ignore_enoent = true; + s++; + } + + if (p && p[-1] == ':') { + r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + if (r == 0) 
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Missing argument after ':': %s", + eq); + + d = destination; + + if (p && p[-1] == ':') { + _cleanup_free_ char *options = NULL; + + r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + + if (isempty(options) || streq(options, "rbind")) + flags = MS_REC; + else if (streq(options, "norbind")) + flags = 0; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown options: %s", + eq); + } + } else + d = s; + + r = sd_bus_message_append(m, "(ssbt)", s, d, ignore_enoent, flags); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "TemporaryFileSystem")) { + const char *p = eq; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *word = NULL, *path = NULL; + const char *w; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + if (r == 0) + break; + + w = word; + r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse argument: %s", + p); + + r = 
sd_bus_message_append(m, "(ss)", path, w); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "RootHash")) { + _cleanup_free_ void *roothash_decoded = NULL; + size_t roothash_decoded_size = 0; + + /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */ + if (path_is_absolute(eq)) + return bus_append_string(m, "RootHashPath", eq); + + /* We have a roothash to decode, eg: RootHash=012345789abcdef */ + r = unhexmem(eq, strlen(eq), &roothash_decoded, &roothash_decoded_size); + if (r < 0) + return log_error_errno(r, "Failed to decode RootHash= '%s': %m", eq); + if (roothash_decoded_size < sizeof(sd_id128_t)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "RootHash= '%s' is too short: %m", eq); + + return bus_append_byte_array(m, field, roothash_decoded, roothash_decoded_size); + } + + if (streq(field, "RootHashSignature")) { + _cleanup_free_ void *roothash_sig_decoded = NULL; + char *value; + size_t roothash_sig_decoded_size = 0; + + /* We have the path to a roothash signature to load and decode, eg: RootHash=/foo/bar.roothash.p7s */ + if (path_is_absolute(eq)) + return bus_append_string(m, "RootHashSignaturePath", eq); + + if (!(value = startswith(eq, "base64:"))) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to decode RootHashSignature= '%s', not a path but doesn't start with 'base64:': %m", eq); + + /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */ + r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size); + if (r < 0) + return log_error_errno(r, "Failed to decode RootHashSignature= '%s': %m", eq); + + return bus_append_byte_array(m, field, 
roothash_sig_decoded, roothash_sig_decoded_size); + } + + if (streq(field, "RootImageOptions")) { + _cleanup_strv_free_ char **l = NULL; + const char *p = eq; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = strv_split_colon_pairs(&l, p); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + + STRV_FOREACH_PAIR(first, second, l) { + r = sd_bus_message_append(m, "(ss)", + !isempty(*second) ? *first : "root", + !isempty(*second) ? *second : *first); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "MountImages")) { + const char *p = eq; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(ssba(ss))"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(ssba(ss))"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL; + const char *q = NULL, *source = NULL; + bool permissive = false; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_error_errno(r, "Failed to 
parse MountImages= property: %s", eq); + if (r == 0) + break; + + q = tuple; + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse MountImages= property: %s", eq); + if (r == 0) + continue; + + source = first; + if (source[0] == '-') { + permissive = true; + source++; + } + + if (isempty(second)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Missing argument after ':': %s", + eq); + + r = sd_bus_message_open_container(m, 'r', "ssba(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "ssb", source, second, permissive); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *partition = NULL, *mount_options = NULL; + + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse MountImages= property: %s", eq); + if (r == 0) + break; + /* Single set of options, applying to the root partition/single filesystem */ + if (r == 1) { + r = sd_bus_message_append(m, "(ss)", "root", partition); + if (r < 0) + return bus_log_create_error(r); + + break; + } + + r = sd_bus_message_append(m, "(ss)", partition, mount_options); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "ExtensionImages")) { + const char 
*p = eq; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(sba(ss))"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sba(ss))"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *source = NULL, *tuple = NULL; + const char *q = NULL, *s = NULL; + bool permissive = false; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_error_errno(r, "Failed to parse ExtensionImages= property: %s", eq); + if (r == 0) + break; + + q = tuple; + r = extract_first_word(&q, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse ExtensionImages= property: %s", eq); + if (r == 0) + continue; + + s = source; + if (s[0] == '-') { + permissive = true; + s++; + } + + r = sd_bus_message_open_container(m, 'r', "sba(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "sb", s, permissive); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + for (;;) { + _cleanup_free_ char *partition = NULL, *mount_options = NULL; + + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse ExtensionImages= property: %s", eq); + if (r == 0) + break; + /* Single set of options, applying to the root partition/single filesystem */ + if (r == 1) { + r = sd_bus_message_append(m, "(ss)", "root", partition); + if (r < 0) + return bus_log_create_error(r); + + break; + } + + r = sd_bus_message_append(m, "(ss)", partition, mount_options); + if (r 
< 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (STR_IN_SET(field, "StateDirectory", "RuntimeDirectory", "CacheDirectory", "LogsDirectory")) { + _cleanup_strv_free_ char **symlinks = NULL, **sources = NULL; + const char *p = eq; + + /* Adding new directories is supported from both *DirectorySymlink methods and the + * older ones, so first parse the input, and if we are given a new-style src:dst + * tuple use the new method, else use the old one. */ + + for (;;) { + _cleanup_free_ char *tuple = NULL, *source = NULL, *destination = NULL; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_error_errno(r, "Failed to parse argument: %m"); + if (r == 0) + break; + + const char *t = tuple; + r = extract_many_words(&t, ":", EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL); + if (r <= 0) + return log_error_errno(r ?: SYNTHETIC_ERRNO(EINVAL), "Failed to parse argument: %m"); + + path_simplify(source); + + if (isempty(destination)) { + r = strv_consume(&sources, TAKE_PTR(source)); + if (r < 0) + return bus_log_create_error(r); + } else { + path_simplify(destination); + + r = strv_consume_pair(&symlinks, TAKE_PTR(source), TAKE_PTR(destination)); + if (r < 0) + return log_oom(); + } + } + + if (!strv_isempty(sources)) { + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + 
r = sd_bus_message_open_container(m, 'v', "as"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, sources); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + /* For State and Runtime directories we support an optional destination parameter, which + * will be used to create a symlink to the source. But it is new so we cannot change the + * old DBUS signatures, so append a new message type. */ + if (!strv_isempty(symlinks)) { + const char *symlink_field; + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + if (streq(field, "StateDirectory")) + symlink_field = "StateDirectorySymlink"; + else if (streq(field, "RuntimeDirectory")) + symlink_field = "RuntimeDirectorySymlink"; + else if (streq(field, "CacheDirectory")) + symlink_field = "CacheDirectorySymlink"; + else if (streq(field, "LogsDirectory")) + symlink_field = "LogsDirectorySymlink"; + else + assert_not_reached(); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, symlink_field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(sst)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sst)"); + if (r < 0) + return bus_log_create_error(r); + + STRV_FOREACH_PAIR(source, destination, symlinks) { + r = sd_bus_message_append(m, "(sst)", *source, *destination, UINT64_C(0)); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + return 1; + } + + return 0; +} + +static 
int bus_append_kill_property(sd_bus_message *m, const char *field, const char *eq) {
+        /* Kill-related settings. Returns 0 when 'field' is not handled here, otherwise
+         * whatever the matching bus_append_*() helper returns. */
+
+        if (STR_IN_SET(field, "KillSignal",
+                              "RestartKillSignal",
+                              "FinalKillSignal",
+                              "WatchdogSignal",
+                              "ReloadSignal"))
+                return bus_append_signal_from_string(m, field, eq);
+
+        if (STR_IN_SET(field, "SendSIGHUP",
+                              "SendSIGKILL"))
+                return bus_append_parse_boolean(m, field, eq);
+
+        if (streq(field, "KillMode"))
+                return bus_append_string(m, field, eq);
+
+        return 0;
+}
+
+static int bus_append_mount_property(sd_bus_message *m, const char *field, const char *eq) {
+        /* Mount unit settings. Returns 0 when 'field' is not handled here, otherwise
+         * whatever the matching bus_append_*() helper returns. */
+
+        if (STR_IN_SET(field, "SloppyOptions",
+                              "LazyUnmount",
+                              "ForceUnmount",
+                              "ReadwriteOnly"))
+                return bus_append_parse_boolean(m, field, eq);
+
+        if (streq(field, "DirectoryMode"))
+                return bus_append_parse_mode(m, field, eq);
+
+        if (streq(field, "TimeoutSec"))
+                return bus_append_parse_sec_rename(m, field, eq);
+
+        if (STR_IN_SET(field, "What",
+                              "Where",
+                              "Options",
+                              "Type"))
+                return bus_append_string(m, field, eq);
+
+        return 0;
+}
+
+static int bus_append_path_property(sd_bus_message *m, const char *field, const char *eq) {
+        /* Path unit settings. Returns 0 when 'field' is not handled here. */
+
+        if (streq(field, "DirectoryMode"))
+                return bus_append_parse_mode(m, field, eq);
+
+        if (streq(field, "MakeDirectory"))
+                return bus_append_parse_boolean(m, field, eq);
+
+        if (STR_IN_SET(field, "TriggerLimitIntervalSec", "PollLimitIntervalSec"))
+                return bus_append_parse_sec_rename(m, field, eq);
+
+        if (STR_IN_SET(field, "TriggerLimitBurst", "PollLimitBurst"))
+                return bus_append_safe_atou(m, field, eq);
+
+        if (STR_IN_SET(field, "PathExists",
+                              "PathExistsGlob",
+                              "PathChanged",
+                              "PathModified",
+                              "DirectoryNotEmpty")) {
+                int r;
+
+                /* An empty value is sent as an empty "Paths" array, anything else as a
+                 * single (field, value) entry. */
+                r = isempty(eq)
+                        ? sd_bus_message_append(m, "(sv)", "Paths", "a(ss)", 0)
+                        : sd_bus_message_append(m, "(sv)", "Paths", "a(ss)", 1, field, eq);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        return 0;
+}
+
+static int bus_append_scope_property(sd_bus_message *m, const char *field, const char *eq) {
+        if (streq(field,
"RuntimeMaxSec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (streq(field, "RuntimeRandomizedExtraSec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (streq(field, "TimeoutStopSec")) + return bus_append_parse_sec_rename(m, field, eq); + + /* Scope units don't have execution context but we still want to allow setting these two, + * so let's handle them separately. */ + if (STR_IN_SET(field, "User", "Group")) + return bus_append_string(m, field, eq); + + if (streq(field, "OOMPolicy")) + return bus_append_string(m, field, eq); + + return 0; +} + +static int bus_append_service_property(sd_bus_message *m, const char *field, const char *eq) { + int r; + + if (STR_IN_SET(field, "PIDFile", + "Type", + "ExitType", + "Restart", + "RestartMode", + "BusName", + "NotifyAccess", + "USBFunctionDescriptors", + "USBFunctionStrings", + "OOMPolicy", + "TimeoutStartFailureMode", + "TimeoutStopFailureMode", + "FileDescriptorStorePreserve")) + return bus_append_string(m, field, eq); + + if (STR_IN_SET(field, "PermissionsStartOnly", + "RootDirectoryStartOnly", + "RemainAfterExit", + "GuessMainPID")) + return bus_append_parse_boolean(m, field, eq); + + if (STR_IN_SET(field, "RestartSec", + "RestartMaxDelaySec", + "TimeoutStartSec", + "TimeoutStopSec", + "TimeoutAbortSec", + "RuntimeMaxSec", + "RuntimeRandomizedExtraSec", + "WatchdogSec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (streq(field, "TimeoutSec")) { + r = bus_append_parse_sec_rename(m, "TimeoutStartSec", eq); + if (r < 0) + return r; + + return bus_append_parse_sec_rename(m, "TimeoutStopSec", eq); + } + + if (STR_IN_SET(field, "FileDescriptorStoreMax", + "RestartSteps")) + return bus_append_safe_atou(m, field, eq); + + if (STR_IN_SET(field, "ExecCondition", + "ExecStartPre", + "ExecStart", + "ExecStartPost", + "ExecConditionEx", + "ExecStartPreEx", + "ExecStartEx", + "ExecStartPostEx", + "ExecReload", + "ExecStop", + "ExecStopPost", + "ExecReloadEx", + "ExecStopEx", + 
"ExecStopPostEx")) + return bus_append_exec_command(m, field, eq); + + if (STR_IN_SET(field, "RestartPreventExitStatus", + "RestartForceExitStatus", + "SuccessExitStatus")) { + _cleanup_free_ int *status = NULL, *signal = NULL; + size_t n_status = 0, n_signal = 0; + const char *p; + + for (p = eq;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Invalid syntax in %s: %s", field, eq); + + /* We need to call exit_status_from_string() first, because we want + * to parse numbers as exit statuses, not signals. */ + + r = exit_status_from_string(word); + if (r >= 0) { + assert(r >= 0 && r < 256); + + status = reallocarray(status, n_status + 1, sizeof(int)); + if (!status) + return log_oom(); + + status[n_status++] = r; + + } else if ((r = signal_from_string(word)) >= 0) { + signal = reallocarray(signal, n_signal + 1, sizeof(int)); + if (!signal) + return log_oom(); + + signal[n_signal++] = r; + + } else + /* original r from exit_status_to_string() */ + return log_error_errno(r, "Invalid status or signal %s in %s: %m", + word, field); + } + + r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "(aiai)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'r', "aiai"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_array(m, 'i', status, n_status * sizeof(int)); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_array(m, 'i', signal, n_signal * sizeof(int)); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = 
sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "OpenFile")) + return bus_append_open_file(m, field, eq); + + return 0; +} + +static int bus_append_socket_property(sd_bus_message *m, const char *field, const char *eq) { + int r; + + if (STR_IN_SET(field, "Accept", + "FlushPending", + "Writable", + "KeepAlive", + "NoDelay", + "FreeBind", + "Transparent", + "Broadcast", + "PassCredentials", + "PassSecurity", + "PassPacketInfo", + "ReusePort", + "RemoveOnStop", + "SELinuxContextFromNet")) + return bus_append_parse_boolean(m, field, eq); + + if (STR_IN_SET(field, "Priority", + "IPTTL", + "Mark")) + return bus_append_safe_atoi(m, field, eq); + + if (streq(field, "IPTOS")) + return bus_append_ip_tos_from_string(m, field, eq); + + if (STR_IN_SET(field, "Backlog", + "MaxConnections", + "MaxConnectionsPerSource", + "KeepAliveProbes", + "TriggerLimitBurst", + "PollLimitBurst")) + return bus_append_safe_atou(m, field, eq); + + if (STR_IN_SET(field, "SocketMode", + "DirectoryMode")) + return bus_append_parse_mode(m, field, eq); + + if (STR_IN_SET(field, "MessageQueueMaxMessages", + "MessageQueueMessageSize")) + return bus_append_safe_atoi64(m, field, eq); + + if (STR_IN_SET(field, "TimeoutSec", + "KeepAliveTimeSec", + "KeepAliveIntervalSec", + "DeferAcceptSec", + "TriggerLimitIntervalSec", + "PollLimitIntervalSec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (STR_IN_SET(field, "ReceiveBuffer", + "SendBuffer", + "PipeSize")) + return bus_append_parse_size(m, field, eq, 1024); + + if (STR_IN_SET(field, "ExecStartPre", + "ExecStartPost", + "ExecReload", + "ExecStopPost")) + return bus_append_exec_command(m, field, eq); + + if (STR_IN_SET(field, "SmackLabel", + "SmackLabelIPIn", + "SmackLabelIPOut", + "TCPCongestion", + "BindToDevice", + "BindIPv6Only", + "FileDescriptorName", + "SocketUser", + 
"SocketGroup", + "Timestamping")) + return bus_append_string(m, field, eq); + + if (streq(field, "Symlinks")) + return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE); + + if (streq(field, "SocketProtocol")) + return bus_append_parse_ip_protocol(m, field, eq); + + if (STR_IN_SET(field, "ListenStream", + "ListenDatagram", + "ListenSequentialPacket", + "ListenNetlink", + "ListenSpecial", + "ListenMessageQueue", + "ListenFIFO", + "ListenUSBFunction")) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", "Listen", "a(ss)", 0); + else + r = sd_bus_message_append(m, "(sv)", "Listen", "a(ss)", 1, field + STRLEN("Listen"), eq); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + return 0; +} +static int bus_append_timer_property(sd_bus_message *m, const char *field, const char *eq) { + int r; + + if (STR_IN_SET(field, "WakeSystem", + "RemainAfterElapse", + "Persistent", + "OnTimezoneChange", + "OnClockChange", + "FixedRandomDelay")) + return bus_append_parse_boolean(m, field, eq); + + if (STR_IN_SET(field, "AccuracySec", + "RandomizedDelaySec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (STR_IN_SET(field, "OnActiveSec", + "OnBootSec", + "OnStartupSec", + "OnUnitActiveSec", + "OnUnitInactiveSec")) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", "TimersMonotonic", "a(st)", 0); + else { + usec_t t; + r = parse_sec(eq, &t); + if (r < 0) + return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq); + + r = sd_bus_message_append(m, "(sv)", "TimersMonotonic", "a(st)", 1, field, t); + } + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (streq(field, "OnCalendar")) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", "TimersCalendar", "a(ss)", 0); + else + r = sd_bus_message_append(m, "(sv)", "TimersCalendar", "a(ss)", 1, field, eq); + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + return 0; +} + +static int bus_append_unit_property(sd_bus_message *m, const char *field, const 
char *eq) { + ConditionType t = _CONDITION_TYPE_INVALID; + bool is_condition = false; + int r; + + if (STR_IN_SET(field, "Description", + "SourcePath", + "OnFailureJobMode", + "JobTimeoutAction", + "JobTimeoutRebootArgument", + "StartLimitAction", + "FailureAction", + "SuccessAction", + "RebootArgument", + "CollectMode")) + return bus_append_string(m, field, eq); + + if (STR_IN_SET(field, "StopWhenUnneeded", + "RefuseManualStart", + "RefuseManualStop", + "AllowIsolate", + "IgnoreOnIsolate", + "SurviveFinalKillSignal", + "DefaultDependencies")) + return bus_append_parse_boolean(m, field, eq); + + if (STR_IN_SET(field, "JobTimeoutSec", + "JobRunningTimeoutSec", + "StartLimitIntervalSec")) + return bus_append_parse_sec_rename(m, field, eq); + + if (streq(field, "StartLimitBurst")) + return bus_append_safe_atou(m, field, eq); + + if (STR_IN_SET(field, "SuccessActionExitStatus", + "FailureActionExitStatus")) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", field, "i", -1); + else { + uint8_t u; + + r = safe_atou8(eq, &u); + if (r < 0) + return log_error_errno(r, "Failed to parse %s=%s", field, eq); + + r = sd_bus_message_append(m, "(sv)", field, "i", (int) u); + } + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + if (unit_dependency_from_string(field) >= 0 || + STR_IN_SET(field, "Documentation", + "RequiresMountsFor", + "Markers")) + return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE); + + t = condition_type_from_string(field); + if (t >= 0) + is_condition = true; + else + t = assert_type_from_string(field); + if (t >= 0) { + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", is_condition ? "Conditions" : "Asserts", "a(sbbs)", 0); + else { + const char *p = eq; + int trigger, negate; + + trigger = *p == '|'; + if (trigger) + p++; + + negate = *p == '!'; + if (negate) + p++; + + r = sd_bus_message_append(m, "(sv)", is_condition ? 
"Conditions" : "Asserts", "a(sbbs)", 1, + field, trigger, negate, p); + } + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + + return 0; +} + +int bus_append_unit_property_assignment(sd_bus_message *m, UnitType t, const char *assignment) { + const char *eq, *field; + int r; + + assert(m); + assert(assignment); + + eq = strchr(assignment, '='); + if (!eq) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Not an assignment: %s", assignment); + + field = strndupa_safe(assignment, eq - assignment); + eq++; + + switch (t) { + case UNIT_SERVICE: + r = bus_append_cgroup_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_execute_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_kill_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_service_property(m, field, eq); + if (r != 0) + return r; + break; + + case UNIT_SOCKET: + r = bus_append_cgroup_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_execute_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_kill_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_socket_property(m, field, eq); + if (r != 0) + return r; + break; + + case UNIT_TIMER: + r = bus_append_timer_property(m, field, eq); + if (r != 0) + return r; + break; + + case UNIT_PATH: + r = bus_append_path_property(m, field, eq); + if (r != 0) + return r; + break; + + case UNIT_SLICE: + r = bus_append_cgroup_property(m, field, eq); + if (r != 0) + return r; + break; + + case UNIT_SCOPE: + r = bus_append_cgroup_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_kill_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_scope_property(m, field, eq); + if (r != 0) + return r; + break; + + case UNIT_MOUNT: + r = bus_append_cgroup_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_execute_property(m, field, eq); + if (r != 0) + return r; + + r = bus_append_kill_property(m, field, eq); + if (r != 
0)
+                        return r;
+
+                r = bus_append_mount_property(m, field, eq);
+                if (r != 0)
+                        return r;
+
+                break;
+
+        case UNIT_AUTOMOUNT:
+                r = bus_append_automount_property(m, field, eq);
+                if (r != 0)
+                        return r;
+
+                break;
+
+        /* These unit types have no type-specific handler in this switch. */
+        case UNIT_TARGET:
+        case UNIT_DEVICE:
+        case UNIT_SWAP:
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Last resort: the generic unit properties. */
+        r = bus_append_unit_property(m, field, eq);
+        if (r != 0)
+                return r;
+
+        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                               "Unknown assignment: %s", assignment);
+}
+
+/* Appends every assignment of the string list 'l' in order, stopping at the first one that
+ * fails. Returns 0 on success, the negative error of the failing assignment otherwise. */
+int bus_append_unit_property_assignment_many(sd_bus_message *m, UnitType t, char **l) {
+        assert(m);
+
+        STRV_FOREACH(assignment, l) {
+                int r = bus_append_unit_property_assignment(m, t, *assignment);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Appends 'pidref' as a one-element "PIDFDs" or "PIDs" property. Returns -ESRCH if the
+ * reference is not set, otherwise the result of sd_bus_message_append(). */
+int bus_append_scope_pidref(sd_bus_message *m, const PidRef *pidref) {
+        assert(m);
+
+        if (!pidref_is_set(pidref))
+                return -ESRCH;
+
+        /* Use the file descriptor when the reference carries one, the numeric PID otherwise. */
+        return pidref->fd >= 0
+                ? sd_bus_message_append(m, "(sv)", "PIDFDs", "ah", 1, pidref->fd)
+                : sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, pidref->pid);
+}
+
+int bus_deserialize_and_dump_unit_file_changes(sd_bus_message *m, bool quiet) {
+        const char *type, *path, *source;
+        InstallChange *changes = NULL;
+        size_t n_changes = 0;
+        int r;
+
+        CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+        r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sss)");
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        while ((r = sd_bus_message_read(m, "(sss)", &type, &path, &source)) > 0) {
+                InstallChangeType t;
+
+                /* We expect only "success" changes to be sent over the bus. Hence, reject anything
+                 * negative.
*/
+                t = install_change_type_from_string(type);
+                if (t < 0) {
+                        log_notice_errno(t, "Manager reported unknown change type \"%s\" for path \"%s\", ignoring.",
+                                         type, path);
+                        continue;
+                }
+
+                r = install_changes_add(&changes, &n_changes, t, path, source);
+                if (r < 0)
+                        return r;
+        }
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        r = sd_bus_message_exit_container(m);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        install_changes_dump(0, NULL, changes, n_changes, quiet);
+
+        return 0;
+}
+
+/* Reads the "LoadState" property of unit 'name' into *load_state. Logs on failure itself and
+ * returns 0 on success, a negative value otherwise. */
+int unit_load_state(sd_bus *bus, const char *name, char **load_state) {
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *object_path = NULL;
+        int r;
+
+        object_path = unit_dbus_path_from_name(name);
+        if (!object_path)
+                return log_oom();
+
+        /* The warning is emitted right here, since handing the dbus error message back to
+         * the caller would be awkward. */
+
+        r = sd_bus_get_property_string(bus,
+                                       "org.freedesktop.systemd1",
+                                       object_path,
+                                       "org.freedesktop.systemd1.Unit",
+                                       "LoadState",
+                                       &bus_error,
+                                       load_state);
+        if (r >= 0)
+                return 0;
+
+        return log_error_errno(r, "Failed to get load state of %s: %s", name, bus_error_message(&bus_error, r));
+}
+
+/* Three-level, case-insensitive ordering of UnitInfo entries. */
+int unit_info_compare(const UnitInfo *a, const UnitInfo *b) {
+        int c;
+
+        /* Key 1: the machine */
+        c = strcasecmp_ptr(a->machine, b->machine);
+        if (c != 0)
+                return c;
+
+        /* Key 2: the unit type, i.e. everything from the last '.' of the id on */
+        c = strcasecmp_ptr(strrchr(a->id, '.'), strrchr(b->id, '.'));
+        if (c != 0)
+                return c;
+
+        /* Key 3: the whole id */
+        return strcasecmp(a->id, b->id);
+}
+
+int bus_service_manager_reload(sd_bus *bus) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        int r;
+
+        assert(bus);
+
+        r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "Reload");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        /* Reloading the daemon may take long, hence set a longer timeout here */
+        r = sd_bus_call(bus, m,
DAEMON_RELOAD_TIMEOUT_SEC, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to reload service manager: %s", bus_error_message(&error, r)); + + return 0; +} diff --git a/src/shared/bus-unit-util.h b/src/shared/bus-unit-util.h new file mode 100644 index 0000000..d52c847 --- /dev/null +++ b/src/shared/bus-unit-util.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "install.h" +#include "pidref.h" +#include "unit-def.h" + +typedef struct UnitInfo { + const char *machine; + const char *id; + const char *description; + const char *load_state; + const char *active_state; + const char *sub_state; + const char *following; + const char *unit_path; + uint32_t job_id; + const char *job_type; + const char *job_path; +} UnitInfo; + +int bus_parse_unit_info(sd_bus_message *message, UnitInfo *u); + +int bus_append_unit_property_assignment(sd_bus_message *m, UnitType t, const char *assignment); +int bus_append_unit_property_assignment_many(sd_bus_message *m, UnitType t, char **l); + +int bus_append_scope_pidref(sd_bus_message *m, const PidRef *pidref); + +int bus_deserialize_and_dump_unit_file_changes(sd_bus_message *m, bool quiet); + +int unit_load_state(sd_bus *bus, const char *name, char **load_state); + +int unit_info_compare(const UnitInfo *a, const UnitInfo *b); + +int bus_service_manager_reload(sd_bus *bus); diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c new file mode 100644 index 0000000..4123152 --- /dev/null +++ b/src/shared/bus-util.c @@ -0,0 +1,711 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-daemon.h" +#include "sd-event.h" +#include "sd-id128.h" + +#include "bus-common-errors.h" +#include "bus-internal.h" +#include "bus-label.h" +#include "bus-util.h" +#include "data-fd-util.h" +#include "fd-util.h" +#include "memstream-util.h" +#include 
"path-util.h" +#include "socket-util.h" +#include "stdio-util.h" + +static int name_owner_change_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + sd_event *e = ASSERT_PTR(userdata); + + assert(m); + + sd_bus_close(sd_bus_message_get_bus(m)); + sd_event_exit(e, 0); + + return 1; +} + +int bus_log_address_error(int r, BusTransport transport) { + bool hint = transport == BUS_TRANSPORT_LOCAL && r == -ENOMEDIUM; + + return log_error_errno(r, + hint ? "Failed to set bus address: $DBUS_SESSION_BUS_ADDRESS and $XDG_RUNTIME_DIR not defined (consider using --machine=@.host --user to connect to bus of other user)" : + "Failed to set bus address: %m"); +} + +int bus_log_connect_error(int r, BusTransport transport) { + bool hint_vars = transport == BUS_TRANSPORT_LOCAL && r == -ENOMEDIUM, + hint_addr = transport == BUS_TRANSPORT_LOCAL && ERRNO_IS_PRIVILEGE(r); + + return log_error_errno(r, + r == hint_vars ? "Failed to connect to bus: $DBUS_SESSION_BUS_ADDRESS and $XDG_RUNTIME_DIR not defined (consider using --machine=@.host --user to connect to bus of other user)" : + r == hint_addr ? "Failed to connect to bus: Operation not permitted (consider using --machine=@.host --user to connect to bus of other user)" : + "Failed to connect to bus: %m"); +} + +int bus_async_unregister_and_exit(sd_event *e, sd_bus *bus, const char *name) { + const char *match; + const char *unique; + int r; + + assert(e); + assert(bus); + assert(name); + + /* We unregister the name here and then wait for the + * NameOwnerChanged signal for this event to arrive before we + * quit. We do this in order to make sure that any queued + * requests are still processed before we really exit. 
*/
+
+        r = sd_bus_get_unique_name(bus, &unique);
+        if (r < 0)
+                return r;
+
+        /* Match the NameOwnerChanged signal for 'name' whose old owner (arg1) is our own
+         * unique name and whose new owner (arg2) is empty. */
+        match = strjoina(
+                        "sender='org.freedesktop.DBus',"
+                        "type='signal',"
+                        "interface='org.freedesktop.DBus',"
+                        "member='NameOwnerChanged',"
+                        "path='/org/freedesktop/DBus',"
+                        "arg0='", name, "',",
+                        "arg1='", unique, "',",
+                        "arg2=''");
+
+        /* The event loop is handed to the callback as userdata. */
+        r = sd_bus_add_match_async(bus, NULL, match, name_owner_change_callback, NULL, e);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_release_name_async(bus, NULL, name, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Runs event loop 'e' until it is finished and returns its exit code, or a negative value on
+ * error. Once sd_event_run() returns 0 while we are idle, the bus name is released and the
+ * loop keeps running without a timeout until the release has gone through. If 'check_idle' is
+ * NULL we always count as idle. */
+int bus_event_loop_with_idle(
+                sd_event *e,
+                sd_bus *bus,
+                const char *name,
+                usec_t timeout,
+                check_idle_t check_idle,
+                void *userdata) {
+
+        bool exiting = false;
+        int r, code;
+
+        assert(e);
+        assert(bus);
+        assert(name);
+
+        for (;;) {
+                bool idle;
+
+                r = sd_event_get_state(e);
+                if (r < 0)
+                        return r;
+                if (r == SD_EVENT_FINISHED)
+                        break;
+
+                if (check_idle)
+                        idle = check_idle(userdata);
+                else
+                        idle = true;
+
+                /* Only apply the timeout while idle and not yet shutting down. */
+                r = sd_event_run(e, exiting || !idle ? UINT64_MAX : timeout);
+                if (r < 0)
+                        return r;
+
+                if (r == 0 && !exiting && idle) {
+                        /* Inform the service manager that we are going down, so that it will queue all
+                         * further start requests, instead of assuming we are already running.
*/
+                        sd_notify(false, "STOPPING=1");
+
+                        r = bus_async_unregister_and_exit(e, bus, name);
+                        if (r < 0)
+                                return r;
+
+                        exiting = true;
+                }
+        }
+
+        r = sd_event_get_exit_code(e, &code);
+        if (r < 0)
+                return r;
+
+        return code;
+}
+
+/* Asks the bus driver whether 'name' currently has an owner. Returns the boolean from the
+ * reply, or a negative value on failure ('error' is filled in by the failing call). */
+int bus_name_has_owner(sd_bus *c, const char *name, sd_bus_error *error) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *rep = NULL;
+        int r, has_owner = 0;
+
+        assert(c);
+        assert(name);
+
+        /* Address the bus driver at its canonical object path, spelled the same way as in
+         * the NameOwnerChanged match rule used elsewhere in this file. */
+        r = sd_bus_call_method(c,
+                               "org.freedesktop.DBus",
+                               "/org/freedesktop/DBus",
+                               "org.freedesktop.DBus",
+                               "NameHasOwner",
+                               error,
+                               &rep,
+                               "s",
+                               name);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_basic(rep, 'b', &has_owner);
+        if (r < 0)
+                return sd_bus_error_set_errno(error, r);
+
+        return has_owner;
+}
+
+/* True if 'error' carries one of the three error names listed below. */
+bool bus_error_is_unknown_service(const sd_bus_error *error) {
+        return sd_bus_error_has_names(error,
+                                      SD_BUS_ERROR_SERVICE_UNKNOWN,
+                                      SD_BUS_ERROR_NAME_HAS_NO_OWNER,
+                                      BUS_ERROR_NO_SUCH_UNIT);
+}
+
+/* Checks the peer credentials of the connection's file descriptor. Returns 1 if the peer is
+ * UID 0 or our own effective UID, -EPERM if it is neither, another negative value on error. */
+int bus_check_peercred(sd_bus *c) {
+        struct ucred ucred;
+        int fd, r;
+
+        assert(c);
+
+        fd = sd_bus_get_fd(c);
+        if (fd < 0)
+                return fd;
+
+        r = getpeercred(fd, &ucred);
+        if (r < 0)
+                return r;
+
+        if (ucred.uid != 0 && ucred.uid != geteuid())
+                return -EPERM;
+
+        return 1;
+}
+
+/* Connects to the system service manager. Unless we are root this is simply
+ * sd_bus_default_system(); as root the private socket is tried first and the default system
+ * bus is used if that connection cannot be started. Returns 0 with *ret_bus set when the
+ * private socket was used; otherwise the result of sd_bus_default_system() or a negative
+ * error. */
+int bus_connect_system_systemd(sd_bus **ret_bus) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        assert(ret_bus);
+
+        if (geteuid() != 0)
+                return sd_bus_default_system(ret_bus);
+
+        /* If we are root then let's talk directly to the system
+         * instance, instead of going via the bus */
+
+        r = sd_bus_new(&bus);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_set_address(bus, "unix:path=/run/systemd/private");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_start(bus);
+        if (r < 0)
+                return sd_bus_default_system(ret_bus);
+
+        r = bus_check_peercred(bus);
+        if (r < 0)
+                return r;
+
+        *ret_bus = TAKE_PTR(bus);
+        return 0;
+}
+
+int bus_connect_user_systemd(sd_bus **ret_bus) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_free_ char
*ee = NULL; + const char *e; + int r; + + assert(ret_bus); + + e = secure_getenv("XDG_RUNTIME_DIR"); + if (!e) + return sd_bus_default_user(ret_bus); + + ee = bus_address_escape(e); + if (!ee) + return -ENOMEM; + + r = sd_bus_new(&bus); + if (r < 0) + return r; + + bus->address = strjoin("unix:path=", ee, "/systemd/private"); + if (!bus->address) + return -ENOMEM; + + r = sd_bus_start(bus); + if (r < 0) + return sd_bus_default_user(ret_bus); + + r = bus_check_peercred(bus); + if (r < 0) + return r; + + *ret_bus = TAKE_PTR(bus); + return 0; +} + +int bus_connect_transport( + BusTransport transport, + const char *host, + RuntimeScope runtime_scope, + sd_bus **ret) { + + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + assert(transport >= 0); + assert(transport < _BUS_TRANSPORT_MAX); + assert(ret); + + assert_return((transport == BUS_TRANSPORT_LOCAL) == !host, -EINVAL); + assert_return(transport != BUS_TRANSPORT_REMOTE || runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); + + switch (transport) { + + case BUS_TRANSPORT_LOCAL: + + switch (runtime_scope) { + + case RUNTIME_SCOPE_USER: + r = sd_bus_default_user(&bus); + break; + + case RUNTIME_SCOPE_SYSTEM: + if (sd_booted() <= 0) + /* Print a friendly message when the local system is actually not running systemd as PID 1. */ + return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "System has not been booted with systemd as init system (PID 1). 
Can't operate."); + r = sd_bus_default_system(&bus); + break; + + default: + assert_not_reached(); + } + break; + + case BUS_TRANSPORT_REMOTE: + r = sd_bus_open_system_remote(&bus, host); + break; + + case BUS_TRANSPORT_MACHINE: + + switch (runtime_scope) { + + case RUNTIME_SCOPE_USER: + r = sd_bus_open_user_machine(&bus, host); + break; + + case RUNTIME_SCOPE_SYSTEM: + r = sd_bus_open_system_machine(&bus, host); + break; + + default: + assert_not_reached(); + } + + break; + + default: + assert_not_reached(); + } + if (r < 0) + return r; + + r = sd_bus_set_exit_on_disconnect(bus, true); + if (r < 0) + return r; + + *ret = TAKE_PTR(bus); + return 0; +} + +int bus_connect_transport_systemd(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus) { + assert(transport >= 0); + assert(transport < _BUS_TRANSPORT_MAX); + assert(bus); + + assert_return((transport == BUS_TRANSPORT_LOCAL) == !host, -EINVAL); + assert_return(transport == BUS_TRANSPORT_LOCAL || runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP); + + switch (transport) { + + case BUS_TRANSPORT_LOCAL: + switch (runtime_scope) { + + case RUNTIME_SCOPE_USER: + return bus_connect_user_systemd(bus); + + case RUNTIME_SCOPE_SYSTEM: + if (sd_booted() <= 0) + /* Print a friendly message when the local system is actually not running systemd as PID 1. */ + return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "System has not been booted with systemd as init system (PID 1). 
Can't operate."); + return bus_connect_system_systemd(bus); + + default: + assert_not_reached(); + } + + break; + + case BUS_TRANSPORT_REMOTE: + return sd_bus_open_system_remote(bus, host); + + case BUS_TRANSPORT_MACHINE: + return sd_bus_open_system_machine(bus, host); + + default: + assert_not_reached(); + } +} + +/** + * bus_path_encode_unique() - encode unique object path + * @b: bus connection or NULL + * @prefix: object path prefix + * @sender_id: unique-name of client, or NULL + * @external_id: external ID to be chosen by client, or NULL + * @ret_path: storage for encoded object path pointer + * + * Whenever we provide a bus API that allows clients to create and manage + * server-side objects, we need to provide a unique name for these objects. If + * we let the server choose the name, we suffer from a race condition: If a + * client creates an object asynchronously, it cannot destroy that object until + * it received the method reply. It cannot know the name of the new object, + * thus, it cannot destroy it. Furthermore, it enforces a round-trip. + * + * Therefore, many APIs allow the client to choose the unique name for newly + * created objects. There're two problems to solve, though: + * 1) Object names are usually defined via dbus object paths, which are + * usually globally namespaced. Therefore, multiple clients must be able + * to choose unique object names without interference. + * 2) If multiple libraries share the same bus connection, they must be + * able to choose unique object names without interference. + * The first problem is solved easily by prefixing a name with the + * unique-bus-name of a connection. The server side must enforce this and + * reject any other name. The second problem is solved by providing unique + * suffixes from within sd-bus. + * + * This helper allows clients to create unique object-paths. It uses the + * template '/prefix/sender_id/external_id' and returns the new path in + * @ret_path (must be freed by the caller). 
+ * If @sender_id is NULL, the unique-name of @b is used. If @external_id is
+ * NULL, this function allocates a unique suffix via @b (by requesting a new
+ * cookie). If both @sender_id and @external_id are given, @b can be passed as
+ * NULL.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int bus_path_encode_unique(sd_bus *b, const char *prefix, const char *sender_id, const char *external_id, char **ret_path) {
+        _cleanup_free_ char *escaped_sender = NULL, *escaped_external = NULL;
+        char cookie_str[DECIMAL_STR_MAX(uint64_t)];
+        char *joined;
+        int r;
+
+        assert_return(b || (sender_id && external_id), -EINVAL);
+        assert_return(sd_bus_object_path_is_valid(prefix), -EINVAL);
+        assert_return(ret_path, -EINVAL);
+
+        /* No sender given: fall back to the unique name of the connection. */
+        if (!sender_id) {
+                r = sd_bus_get_unique_name(b, &sender_id);
+                if (r < 0)
+                        return r;
+        }
+
+        /* No external ID given: bump the connection's cookie and use its decimal form. */
+        if (!external_id) {
+                xsprintf(cookie_str, "%"PRIu64, ++b->cookie);
+                external_id = cookie_str;
+        }
+
+        /* Both components are escaped before they go into the path. */
+        escaped_sender = bus_label_escape(sender_id);
+        if (!escaped_sender)
+                return -ENOMEM;
+
+        escaped_external = bus_label_escape(external_id);
+        if (!escaped_external)
+                return -ENOMEM;
+
+        joined = path_join(prefix, escaped_sender, escaped_external);
+        if (!joined)
+                return -ENOMEM;
+
+        *ret_path = joined;
+        return 0;
+}
+
+/**
+ * bus_path_decode_unique() - decode unique object path
+ * @path: object path to decode
+ * @prefix: object path prefix
+ * @ret_sender: output parameter for sender-id label
+ * @ret_external: output parameter for external-id label
+ *
+ * This does the reverse of bus_path_encode_unique() (see its description for
+ * details). Both trailing labels, sender-id and external-id, are unescaped and
+ * returned in the given output parameters (the caller must free them).
+ *
+ * Note that this function returns 0 if the path does not match the template
+ * (see bus_path_encode_unique()), 1 if it matched.
+ *
+ * Returns: Negative error code on failure, 0 if the given object path does not
+ *          match the template (return parameters are set to NULL), 1 if it was
+ *          parsed successfully (return parameters contain allocated labels).
+ */
+int bus_path_decode_unique(const char *path, const char *prefix, char **ret_sender, char **ret_external) {
+        const char *labels, *slash = NULL;
+        char *sender, *external;
+
+        assert(sd_bus_object_path_is_valid(path));
+        assert(sd_bus_object_path_is_valid(prefix));
+        assert(ret_sender);
+        assert(ret_external);
+
+        /* The path has to start with the prefix, and what follows has to contain a '/'
+         * separating the two labels. Anything else does not match the template. */
+        labels = object_path_startswith(path, prefix);
+        if (labels)
+                slash = strchr(labels, '/');
+
+        if (!slash) {
+                *ret_sender = NULL;
+                *ret_external = NULL;
+                return 0;
+        }
+
+        sender = bus_label_unescape_n(labels, slash - labels);
+        external = bus_label_unescape(slash + 1);
+        if (!sender || !external) {
+                free(sender);
+                free(external);
+                return -ENOMEM;
+        }
+
+        *ret_sender = sender;
+        *ret_external = external;
+        return 1;
+}
+
+/* Adds every name of the string list 'l' to track object 't'. A failure does not stop the
+ * loop; the first failure seen is what gets returned. */
+int bus_track_add_name_many(sd_bus_track *t, char **l) {
+        int ret = 0;
+
+        assert(t);
+
+        STRV_FOREACH(name, l)
+                RET_GATHER(ret, sd_bus_track_add_name(t, *name));
+
+        return ret;
+}
+
+int bus_open_system_watch_bind_with_description(sd_bus **ret, const char *description) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+        const char *e;
+        int r;
+
+        assert(ret);
+
+        /* Match like sd_bus_open_system(), but with the "watch_bind" feature and the Connected() signal
+         * turned on.
*/ + + r = sd_bus_new(&bus); + if (r < 0) + return r; + + if (description) { + r = sd_bus_set_description(bus, description); + if (r < 0) + return r; + } + + e = secure_getenv("DBUS_SYSTEM_BUS_ADDRESS"); + if (!e) + e = DEFAULT_SYSTEM_BUS_ADDRESS; + + r = sd_bus_set_address(bus, e); + if (r < 0) + return r; + + r = sd_bus_set_bus_client(bus, true); + if (r < 0) + return r; + + r = sd_bus_negotiate_creds(bus, true, SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS); + if (r < 0) + return r; + + r = sd_bus_set_watch_bind(bus, true); + if (r < 0) + return r; + + r = sd_bus_set_connected_signal(bus, true); + if (r < 0) + return r; + + r = sd_bus_start(bus); + if (r < 0) + return r; + + *ret = TAKE_PTR(bus); + + return 0; +} + +int bus_reply_pair_array(sd_bus_message *m, char **l) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + + assert(m); + + /* Reply to the specified message with a message containing a dictionary put together from the + * specified strv */ + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_open_container(reply, 'a', "{ss}"); + if (r < 0) + return r; + + STRV_FOREACH_PAIR(k, v, l) { + r = sd_bus_message_append(reply, "{ss}", *k, *v); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static int method_dump_memory_state_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *dump = NULL; + _cleanup_close_ int fd = -EBADF; + size_t dump_size; + FILE *f; + int r; + + assert(message); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + r = RET_NERRNO(malloc_info(/* options= */ 0, f)); + if (r < 0) + return r; + + r = memstream_finalize(&m, &dump, &dump_size); + if (r < 0) + return r; + + fd = acquire_data_fd(dump, dump_size, 0); + if (fd < 0) + return fd; + + r = 
sd_bus_reply_method_return(message, "h", fd); + if (r < 0) + return r; + + return 1; /* Stop further processing */ +} + +/* The default install callback will fail and disconnect the bus if it cannot register the match, but this + * is only a debug method, we definitely don't want to fail in case there's some permission issue. */ +static int dummy_install_callback(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) { + return 1; +} + +int bus_register_malloc_status(sd_bus *bus, const char *destination) { + const char *match; + int r; + + assert(bus); + assert(!isempty(destination)); + + match = strjoina("type='method_call'," + "interface='org.freedesktop.MemoryAllocation1'," + "path='/org/freedesktop/MemoryAllocation1'," + "destination='", destination, "',", + "member='GetMallocInfo'"); + + r = sd_bus_add_match_async(bus, NULL, match, method_dump_memory_state_by_fd, dummy_install_callback, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to subscribe to GetMallocInfo() calls on MemoryAllocation1 interface: %m"); + + return 0; +} + +static void bus_message_unref_wrapper(void *m) { + sd_bus_message_unref(m); +} + +const struct hash_ops bus_message_hash_ops = { + .hash = trivial_hash_func, + .compare = trivial_compare_func, + .free_value = bus_message_unref_wrapper, +}; + +int bus_message_append_string_set(sd_bus_message *m, Set *set) { + const char *s; + int r; + + assert(m); + + r = sd_bus_message_open_container(m, 'a', "s"); + if (r < 0) + return r; + + SET_FOREACH(s, set) { + r = sd_bus_message_append(m, "s", s); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(m); +} + +int bus_property_get_string_set( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Set **s = ASSERT_PTR(userdata); + + assert(bus); + assert(property); + assert(reply); + + return bus_message_append_string_set(reply, *s); +} diff --git 
a/src/shared/bus-util.h b/src/shared/bus-util.h new file mode 100644 index 0000000..869c639 --- /dev/null +++ b/src/shared/bus-util.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <sys/types.h> + +#include "sd-bus.h" +#include "sd-event.h" + +#include "errno-util.h" +#include "macro.h" +#include "runtime-scope.h" +#include "set.h" +#include "string-util.h" +#include "time-util.h" + +typedef enum BusTransport { + BUS_TRANSPORT_LOCAL, + BUS_TRANSPORT_REMOTE, + BUS_TRANSPORT_MACHINE, + _BUS_TRANSPORT_MAX, + _BUS_TRANSPORT_INVALID = -EINVAL, +} BusTransport; + +int bus_async_unregister_and_exit(sd_event *e, sd_bus *bus, const char *name); + +typedef bool (*check_idle_t)(void *userdata); + +int bus_event_loop_with_idle(sd_event *e, sd_bus *bus, const char *name, usec_t timeout, check_idle_t check_idle, void *userdata); + +int bus_name_has_owner(sd_bus *c, const char *name, sd_bus_error *error); +bool bus_error_is_unknown_service(const sd_bus_error *error); + +int bus_check_peercred(sd_bus *c); + +int bus_connect_system_systemd(sd_bus **ret_bus); +int bus_connect_user_systemd(sd_bus **ret_bus); + +int bus_connect_transport(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus); +int bus_connect_transport_systemd(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus); + +int bus_log_address_error(int r, BusTransport transport); +int bus_log_connect_error(int r, BusTransport transport); + +#define bus_log_parse_error(r) \ + log_error_errno(r, "Failed to parse bus message: %m") + +#define bus_log_parse_error_debug(r) \ + log_debug_errno(r, "Failed to parse bus message: %m") + +#define bus_log_create_error(r) \ + log_error_errno(r, "Failed to create bus message: %m") + +int bus_path_encode_unique(sd_bus *b, const char *prefix, const char *sender_id, const char *external_id, char **ret_path); +int bus_path_decode_unique(const char *path, const char *prefix,
char **ret_sender, char **ret_external); + +int bus_track_add_name_many(sd_bus_track *t, char **l); + +int bus_open_system_watch_bind_with_description(sd_bus **ret, const char *description); +static inline int bus_open_system_watch_bind(sd_bus **ret) { + return bus_open_system_watch_bind_with_description(ret, NULL); +} + +int bus_reply_pair_array(sd_bus_message *m, char **l); + +/* Listen to GetMallocInfo() calls to 'destination' and return malloc_info() via FD */ +int bus_register_malloc_status(sd_bus *bus, const char *destination); + +extern const struct hash_ops bus_message_hash_ops; + +int bus_message_append_string_set(sd_bus_message *m, Set *s); + +int bus_property_get_string_set(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); diff --git a/src/shared/bus-wait-for-jobs.c b/src/shared/bus-wait-for-jobs.c new file mode 100644 index 0000000..969c629 --- /dev/null +++ b/src/shared/bus-wait-for-jobs.c @@ -0,0 +1,333 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-wait-for-jobs.h" +#include "set.h" +#include "bus-util.h" +#include "bus-internal.h" +#include "unit-def.h" +#include "escape.h" +#include "strv.h" + +typedef struct BusWaitForJobs { + sd_bus *bus; + + /* The set of jobs to wait for, as bus object paths */ + Set *jobs; + + /* The unit name and job result of the last Job message */ + char *name; + char *result; + + sd_bus_slot *slot_job_removed; + sd_bus_slot *slot_disconnected; +} BusWaitForJobs; + +static int match_disconnected(sd_bus_message *m, void *userdata, sd_bus_error *error) { + assert(m); + + log_error("Warning! 
D-Bus connection terminated."); + sd_bus_close(sd_bus_message_get_bus(m)); + + return 0; +} + +static int match_job_removed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + const char *path, *unit, *result; + BusWaitForJobs *d = ASSERT_PTR(userdata); + uint32_t id; + char *found; + int r; + + assert(m); + + r = sd_bus_message_read(m, "uoss", &id, &path, &unit, &result); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + found = set_remove(d->jobs, (char*) path); + if (!found) + return 0; + + free(found); + + (void) free_and_strdup(&d->result, empty_to_null(result)); + + (void) free_and_strdup(&d->name, empty_to_null(unit)); + + return 0; +} + +BusWaitForJobs* bus_wait_for_jobs_free(BusWaitForJobs *d) { + if (!d) + return NULL; + + set_free(d->jobs); + + sd_bus_slot_unref(d->slot_disconnected); + sd_bus_slot_unref(d->slot_job_removed); + + sd_bus_unref(d->bus); + + free(d->name); + free(d->result); + + return mfree(d); +} + +int bus_wait_for_jobs_new(sd_bus *bus, BusWaitForJobs **ret) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *d = NULL; + int r; + + assert(bus); + assert(ret); + + d = new(BusWaitForJobs, 1); + if (!d) + return -ENOMEM; + + *d = (BusWaitForJobs) { + .bus = sd_bus_ref(bus), + }; + + /* When we are a bus client we match by sender. Direct + * connections OTOH have no initialized sender field, and + * hence we ignore the sender then */ + r = sd_bus_match_signal_async( + bus, + &d->slot_job_removed, + bus->bus_client ? 
"org.freedesktop.systemd1" : NULL, + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "JobRemoved", + match_job_removed, NULL, d); + if (r < 0) + return r; + + r = sd_bus_match_signal_async( + bus, + &d->slot_disconnected, + "org.freedesktop.DBus.Local", + NULL, + "org.freedesktop.DBus.Local", + "Disconnected", + match_disconnected, NULL, d); + if (r < 0) + return r; + + *ret = TAKE_PTR(d); + + return 0; +} + +static int bus_process_wait(sd_bus *bus) { + int r; + + for (;;) { + r = sd_bus_process(bus, NULL); + if (r < 0) + return r; + if (r > 0) + return 0; + + r = sd_bus_wait(bus, UINT64_MAX); + if (r < 0) + return r; + } +} + +static int bus_job_get_service_result(BusWaitForJobs *d, char **result) { + _cleanup_free_ char *dbus_path = NULL; + + assert(d); + assert(d->name); + assert(result); + + if (!endswith(d->name, ".service")) + return -EINVAL; + + dbus_path = unit_dbus_path_from_name(d->name); + if (!dbus_path) + return -ENOMEM; + + return sd_bus_get_property_string(d->bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Service", + "Result", + NULL, + result); +} + +static void log_job_error_with_service_result(const char* service, const char *result, const char* const* extra_args) { + _cleanup_free_ char *service_shell_quoted = NULL; + const char *systemctl = "systemctl", *journalctl = "journalctl"; + + static const struct { + const char *result, *explanation; + } explanations[] = { + { "resources", "of unavailable resources or another system error" }, + { "protocol", "the service did not take the steps required by its unit configuration" }, + { "timeout", "a timeout was exceeded" }, + { "exit-code", "the control process exited with error code" }, + { "signal", "a fatal signal was delivered to the control process" }, + { "core-dump", "a fatal signal was delivered causing the control process to dump core" }, + { "watchdog", "the service failed to send watchdog ping" }, + { "start-limit", "start of the service was 
attempted too often" } + }; + + assert(service); + + service_shell_quoted = shell_maybe_quote(service, 0); + + if (!strv_isempty((char**) extra_args)) { + _cleanup_free_ char *t = NULL; + + t = strv_join((char**) extra_args, " "); + systemctl = strjoina("systemctl ", t ?: ""); + journalctl = strjoina("journalctl ", t ?: ""); + } + + if (!isempty(result)) { + size_t i; + + for (i = 0; i < ELEMENTSOF(explanations); ++i) + if (streq(result, explanations[i].result)) + break; + + if (i < ELEMENTSOF(explanations)) { + log_error("Job for %s failed because %s.\n" + "See \"%s status %s\" and \"%s -xeu %s\" for details.\n", + service, + explanations[i].explanation, + systemctl, + service_shell_quoted ?: "", + journalctl, + service_shell_quoted ?: ""); + goto finish; + } + } + + log_error("Job for %s failed.\n" + "See \"%s status %s\" and \"%s -xeu %s\" for details.\n", + service, + systemctl, + service_shell_quoted ?: "", + journalctl, + service_shell_quoted ?: ""); + +finish: + /* For some results maybe additional explanation is required */ + if (streq_ptr(result, "start-limit")) + log_info("To force a start use \"%1$s reset-failed %2$s\"\n" + "followed by \"%1$s start %2$s\" again.", + systemctl, + service_shell_quoted ?: ""); +} + +static int check_wait_response(BusWaitForJobs *d, bool quiet, const char* const* extra_args) { + assert(d); + assert(d->name); + assert(d->result); + + if (!quiet) { + if (streq(d->result, "canceled")) + log_error("Job for %s canceled.", strna(d->name)); + else if (streq(d->result, "timeout")) + log_error("Job for %s timed out.", strna(d->name)); + else if (streq(d->result, "dependency")) + log_error("A dependency job for %s failed. 
See 'journalctl -xe' for details.", strna(d->name)); + else if (streq(d->result, "invalid")) + log_error("%s is not active, cannot reload.", strna(d->name)); + else if (streq(d->result, "assert")) + log_error("Assertion failed on job for %s.", strna(d->name)); + else if (streq(d->result, "unsupported")) + log_error("Operation on or unit type of %s not supported on this system.", strna(d->name)); + else if (streq(d->result, "collected")) + log_error("Queued job for %s was garbage collected.", strna(d->name)); + else if (streq(d->result, "once")) + log_error("Unit %s was started already once and can't be started again.", strna(d->name)); + else if (!STR_IN_SET(d->result, "done", "skipped")) { + + if (d->name && endswith(d->name, ".service")) { + _cleanup_free_ char *result = NULL; + int q; + + q = bus_job_get_service_result(d, &result); + if (q < 0) + log_debug_errno(q, "Failed to get Result property of unit %s: %m", d->name); + + log_job_error_with_service_result(d->name, result, extra_args); + } else + log_error("Job failed. 
See \"journalctl -xe\" for details."); + } + } + + if (STR_IN_SET(d->result, "canceled", "collected")) + return -ECANCELED; + else if (streq(d->result, "timeout")) + return -ETIME; + else if (streq(d->result, "dependency")) + return -EIO; + else if (streq(d->result, "invalid")) + return -ENOEXEC; + else if (streq(d->result, "assert")) + return -EPROTO; + else if (streq(d->result, "unsupported")) + return -EOPNOTSUPP; + else if (streq(d->result, "once")) + return -ESTALE; + else if (STR_IN_SET(d->result, "done", "skipped")) + return 0; + + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Unexpected job result, assuming server side newer than us: %s", d->result); +} + +int bus_wait_for_jobs(BusWaitForJobs *d, bool quiet, const char* const* extra_args) { + int r = 0; + + assert(d); + + while (!set_isempty(d->jobs)) { + int q; + + q = bus_process_wait(d->bus); + if (q < 0) + return log_error_errno(q, "Failed to wait for response: %m"); + + if (d->name && d->result) { + q = check_wait_response(d, quiet, extra_args); + /* Return the first error as it is most likely to be + * meaningful. 
*/ + if (q < 0 && r == 0) + r = q; + + log_full_errno_zerook(LOG_DEBUG, q, + "Got result %s/%m for job %s", d->result, d->name); + } + + d->name = mfree(d->name); + d->result = mfree(d->result); + } + + return r; +} + +int bus_wait_for_jobs_add(BusWaitForJobs *d, const char *path) { + assert(d); + + return set_put_strdup(&d->jobs, path); +} + +int bus_wait_for_jobs_one(BusWaitForJobs *d, const char *path, bool quiet, const char* const* extra_args) { + int r; + + r = bus_wait_for_jobs_add(d, path); + if (r < 0) + return log_oom(); + + return bus_wait_for_jobs(d, quiet, extra_args); +} diff --git a/src/shared/bus-wait-for-jobs.h b/src/shared/bus-wait-for-jobs.h new file mode 100644 index 0000000..5acf8b9 --- /dev/null +++ b/src/shared/bus-wait-for-jobs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "macro.h" + +typedef struct BusWaitForJobs BusWaitForJobs; + +int bus_wait_for_jobs_new(sd_bus *bus, BusWaitForJobs **ret); +BusWaitForJobs* bus_wait_for_jobs_free(BusWaitForJobs *d); +int bus_wait_for_jobs_add(BusWaitForJobs *d, const char *path); +int bus_wait_for_jobs(BusWaitForJobs *d, bool quiet, const char* const* extra_args); +int bus_wait_for_jobs_one(BusWaitForJobs *d, const char *path, bool quiet, const char* const* extra_args); + +DEFINE_TRIVIAL_CLEANUP_FUNC(BusWaitForJobs*, bus_wait_for_jobs_free); diff --git a/src/shared/bus-wait-for-units.c b/src/shared/bus-wait-for-units.c new file mode 100644 index 0000000..0dd2a29 --- /dev/null +++ b/src/shared/bus-wait-for-units.c @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-map-properties.h" +#include "bus-wait-for-units.h" +#include "hashmap.h" +#include "string-util.h" +#include "strv.h" +#include "unit-def.h" + +typedef struct WaitForItem { + BusWaitForUnits *parent; + + BusWaitForUnitsFlags flags; + + char *bus_path; + + sd_bus_slot *slot_get_all; + sd_bus_slot 
*slot_properties_changed; + + bus_wait_for_units_unit_callback unit_callback; + void *userdata; + + char *active_state; + uint32_t job_id; + char *clean_result; +} WaitForItem; + +typedef struct BusWaitForUnits { + sd_bus *bus; + sd_bus_slot *slot_disconnected; + + Hashmap *items; + + bus_wait_for_units_ready_callback ready_callback; + void *userdata; + + WaitForItem *current; + + BusWaitForUnitsState state; + bool has_failed:1; +} BusWaitForUnits; + +static WaitForItem *wait_for_item_free(WaitForItem *item) { + int r; + + if (!item) + return NULL; + + if (item->parent) { + if (FLAGS_SET(item->flags, BUS_WAIT_REFFED) && item->bus_path && item->parent->bus) { + r = sd_bus_call_method_async( + item->parent->bus, + NULL, + "org.freedesktop.systemd1", + item->bus_path, + "org.freedesktop.systemd1.Unit", + "Unref", + NULL, + NULL, + NULL); + if (r < 0) + log_debug_errno(r, "Failed to drop reference to unit %s, ignoring: %m", item->bus_path); + } + + assert_se(hashmap_remove(item->parent->items, item->bus_path) == item); + + if (item->parent->current == item) + item->parent->current = NULL; + } + + sd_bus_slot_unref(item->slot_properties_changed); + sd_bus_slot_unref(item->slot_get_all); + + free(item->bus_path); + free(item->active_state); + free(item->clean_result); + + return mfree(item); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(WaitForItem*, wait_for_item_free); + +static void call_unit_callback_and_wait(BusWaitForUnits *d, WaitForItem *item, bool good) { + d->current = item; + + if (item->unit_callback) + item->unit_callback(d, item->bus_path, good, item->userdata); + + wait_for_item_free(item); +} + +static void bus_wait_for_units_clear(BusWaitForUnits *d) { + WaitForItem *item; + + assert(d); + + d->slot_disconnected = sd_bus_slot_unref(d->slot_disconnected); + d->bus = sd_bus_unref(d->bus); + + while ((item = hashmap_first(d->items))) + call_unit_callback_and_wait(d, item, false); + + d->items = hashmap_free(d->items); +} + +static int match_disconnected(sd_bus_message 
*m, void *userdata, sd_bus_error *error) { + BusWaitForUnits *d = ASSERT_PTR(userdata); + + assert(m); + + log_error("Warning! D-Bus connection terminated."); + + bus_wait_for_units_clear(d); + + if (d->ready_callback) + d->ready_callback(d, false, d->userdata); + else /* If no ready callback is specified close the connection so that the event loop exits */ + sd_bus_close(sd_bus_message_get_bus(m)); + + return 0; +} + +int bus_wait_for_units_new(sd_bus *bus, BusWaitForUnits **ret) { + _cleanup_(bus_wait_for_units_freep) BusWaitForUnits *d = NULL; + int r; + + assert(bus); + assert(ret); + + d = new(BusWaitForUnits, 1); + if (!d) + return -ENOMEM; + + *d = (BusWaitForUnits) { + .state = BUS_WAIT_SUCCESS, + .bus = sd_bus_ref(bus), + }; + + r = sd_bus_match_signal_async( + bus, + &d->slot_disconnected, + "org.freedesktop.DBus.Local", + NULL, + "org.freedesktop.DBus.Local", + "Disconnected", + match_disconnected, NULL, d); + if (r < 0) + return r; + + *ret = TAKE_PTR(d); + return 0; +} + +BusWaitForUnits* bus_wait_for_units_free(BusWaitForUnits *d) { + if (!d) + return NULL; + + bus_wait_for_units_clear(d); + sd_bus_slot_unref(d->slot_disconnected); + sd_bus_unref(d->bus); + + return mfree(d); +} + +static bool bus_wait_for_units_is_ready(BusWaitForUnits *d) { + assert(d); + + if (!d->bus) /* Disconnected? */ + return true; + + return hashmap_isempty(d->items); +} + +void bus_wait_for_units_set_ready_callback(BusWaitForUnits *d, bus_wait_for_units_ready_callback callback, void *userdata) { + assert(d); + + d->ready_callback = callback; + d->userdata = userdata; +} + +static void bus_wait_for_units_check_ready(BusWaitForUnits *d) { + assert(d); + + if (!bus_wait_for_units_is_ready(d)) + return; + + d->state = d->has_failed ? 
BUS_WAIT_FAILURE : BUS_WAIT_SUCCESS; + + if (d->ready_callback) + d->ready_callback(d, d->state, d->userdata); +} + +static void wait_for_item_check_ready(WaitForItem *item) { + BusWaitForUnits *d; + + assert(item); + assert_se(d = item->parent); + + if (FLAGS_SET(item->flags, BUS_WAIT_FOR_MAINTENANCE_END)) { + + if (item->clean_result && !streq(item->clean_result, "success")) + d->has_failed = true; + + if (!item->active_state || streq(item->active_state, "maintenance")) + return; + } + + if (FLAGS_SET(item->flags, BUS_WAIT_NO_JOB) && item->job_id != 0) + return; + + if (FLAGS_SET(item->flags, BUS_WAIT_FOR_INACTIVE)) { + + if (streq_ptr(item->active_state, "failed")) + d->has_failed = true; + else if (!streq_ptr(item->active_state, "inactive")) + return; + } + + call_unit_callback_and_wait(d, item, true); + bus_wait_for_units_check_ready(d); +} + +static int property_map_job( + sd_bus *bus, + const char *member, + sd_bus_message *m, + sd_bus_error *error, + void *userdata) { + + WaitForItem *item = ASSERT_PTR(userdata); + const char *path; + uint32_t id; + int r; + + r = sd_bus_message_read(m, "(uo)", &id, &path); + if (r < 0) + return r; + + item->job_id = id; + return 0; +} + +static int wait_for_item_parse_properties(WaitForItem *item, sd_bus_message *m) { + + static const struct bus_properties_map map[] = { + { "ActiveState", "s", NULL, offsetof(WaitForItem, active_state) }, + { "Job", "(uo)", property_map_job, 0 }, + { "CleanResult", "s", NULL, offsetof(WaitForItem, clean_result) }, + {} + }; + + int r; + + assert(item); + assert(m); + + r = bus_message_map_all_properties(m, map, BUS_MAP_STRDUP, NULL, item); + if (r < 0) + return r; + + wait_for_item_check_ready(item); + return 0; +} + +static int on_properties_changed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + WaitForItem *item = ASSERT_PTR(userdata); + const char *interface; + int r; + + r = sd_bus_message_read(m, "s", &interface); + if (r < 0) { + log_debug_errno(r, "Failed to parse 
PropertiesChanged signal: %m"); + return 0; + } + + if (!streq(interface, "org.freedesktop.systemd1.Unit")) + return 0; + + r = wait_for_item_parse_properties(item, m); + if (r < 0) + log_debug_errno(r, "Failed to process PropertiesChanged signal: %m"); + + return 0; +} + +static int on_get_all_properties(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + WaitForItem *item = ASSERT_PTR(userdata); + const sd_bus_error *e; + int r; + + e = sd_bus_message_get_error(m); + if (e) { + BusWaitForUnits *d = item->parent; + + d->has_failed = true; + + r = sd_bus_error_get_errno(e); + log_debug_errno(r, "GetAll() failed for %s: %s", + item->bus_path, bus_error_message(e, r)); + + call_unit_callback_and_wait(d, item, false); + bus_wait_for_units_check_ready(d); + return 0; + } + + r = wait_for_item_parse_properties(item, m); + if (r < 0) + log_debug_errno(r, "Failed to process GetAll method reply: %m"); + + return 0; +} + +int bus_wait_for_units_add_unit( + BusWaitForUnits *d, + const char *unit, + BusWaitForUnitsFlags flags, + bus_wait_for_units_unit_callback callback, + void *userdata) { + + _cleanup_(wait_for_item_freep) WaitForItem *item = NULL; + int r; + + assert(d); + assert(unit); + + assert(flags != 0); + + r = hashmap_ensure_allocated(&d->items, &string_hash_ops); + if (r < 0) + return r; + + item = new(WaitForItem, 1); + if (!item) + return -ENOMEM; + + *item = (WaitForItem) { + .flags = flags, + .bus_path = unit_dbus_path_from_name(unit), + .unit_callback = callback, + .userdata = userdata, + .job_id = UINT32_MAX, + }; + + if (!item->bus_path) + return -ENOMEM; + + if (!FLAGS_SET(item->flags, BUS_WAIT_REFFED)) { + r = sd_bus_call_method_async( + d->bus, + NULL, + "org.freedesktop.systemd1", + item->bus_path, + "org.freedesktop.systemd1.Unit", + "Ref", + NULL, + NULL, + NULL); + if (r < 0) + return log_debug_errno(r, "Failed to add reference to unit %s: %m", unit); + + item->flags |= BUS_WAIT_REFFED; + } + + r = sd_bus_match_signal_async( + d->bus, + 
&item->slot_properties_changed, + "org.freedesktop.systemd1", + item->bus_path, + "org.freedesktop.DBus.Properties", + "PropertiesChanged", + on_properties_changed, + NULL, + item); + if (r < 0) + return log_debug_errno(r, "Failed to request match for PropertiesChanged signal: %m"); + + r = sd_bus_call_method_async( + d->bus, + &item->slot_get_all, + "org.freedesktop.systemd1", + item->bus_path, + "org.freedesktop.DBus.Properties", + "GetAll", + on_get_all_properties, + item, + "s", FLAGS_SET(item->flags, BUS_WAIT_FOR_MAINTENANCE_END) ? NULL : "org.freedesktop.systemd1.Unit"); + if (r < 0) + return log_debug_errno(r, "Failed to request properties of unit %s: %m", unit); + + r = hashmap_put(d->items, item->bus_path, item); + if (r < 0) + return r; + + d->state = BUS_WAIT_RUNNING; + item->parent = d; + TAKE_PTR(item); + return 0; +} + +int bus_wait_for_units_run(BusWaitForUnits *d) { + int r; + + assert(d); + + while (d->state == BUS_WAIT_RUNNING) { + + r = sd_bus_process(d->bus, NULL); + if (r < 0) + return r; + if (r > 0) + continue; + + r = sd_bus_wait(d->bus, UINT64_MAX); + if (r < 0) + return r; + } + + return d->state; +} + +BusWaitForUnitsState bus_wait_for_units_state(BusWaitForUnits *d) { + assert(d); + + return d->state; +} diff --git a/src/shared/bus-wait-for-units.h b/src/shared/bus-wait-for-units.h new file mode 100644 index 0000000..2623e72 --- /dev/null +++ b/src/shared/bus-wait-for-units.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" +#include "sd-bus.h" + +typedef struct BusWaitForUnits BusWaitForUnits; + +typedef enum BusWaitForUnitsState { + BUS_WAIT_SUCCESS, /* Nothing to wait for anymore and nothing failed */ + BUS_WAIT_FAILURE, /* dito, but something failed */ + BUS_WAIT_RUNNING, /* Still something to wait for */ + _BUS_WAIT_FOR_UNITS_STATE_MAX, + _BUS_WAIT_FOR_UNITS_STATE_INVALID = -EINVAL, +} BusWaitForUnitsState; + +typedef enum BusWaitForUnitsFlags { + BUS_WAIT_FOR_MAINTENANCE_END = 1 
<< 0, /* Wait until the unit is no longer in maintenance state */ + BUS_WAIT_FOR_INACTIVE = 1 << 1, /* Wait until the unit is back in inactive or dead state */ + BUS_WAIT_NO_JOB = 1 << 2, /* Wait until there's no more job pending */ + BUS_WAIT_REFFED = 1 << 3, /* The unit is already reffed with RefUnit() */ +} BusWaitForUnitsFlags; + +typedef void (*bus_wait_for_units_ready_callback)(BusWaitForUnits *d, BusWaitForUnitsState state, void *userdata); +typedef void (*bus_wait_for_units_unit_callback)(BusWaitForUnits *d, const char *unit_path, bool good, void *userdata); + +int bus_wait_for_units_new(sd_bus *bus, BusWaitForUnits **ret); +BusWaitForUnits* bus_wait_for_units_free(BusWaitForUnits *d); + +BusWaitForUnitsState bus_wait_for_units_state(BusWaitForUnits *d); +void bus_wait_for_units_set_ready_callback(BusWaitForUnits *d, bus_wait_for_units_ready_callback callback, void *userdata); +int bus_wait_for_units_add_unit(BusWaitForUnits *d, const char *unit, BusWaitForUnitsFlags flags, bus_wait_for_units_unit_callback callback, void *userdata); +int bus_wait_for_units_run(BusWaitForUnits *d); + +DEFINE_TRIVIAL_CLEANUP_FUNC(BusWaitForUnits*, bus_wait_for_units_free); diff --git a/src/shared/calendarspec.c b/src/shared/calendarspec.c new file mode 100644 index 0000000..039080f --- /dev/null +++ b/src/shared/calendarspec.c @@ -0,0 +1,1435 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "calendarspec.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "macro.h" +#include "memstream-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +#define BITS_WEEKDAYS 127 +#define MIN_YEAR 1970 +#define MAX_YEAR 2199 + +/* An arbitrary limit on the length of the chains of components. 
We don't want to + * build a very long linked list, which would be slow to iterate over and might cause + * our stack to overflow. It's unlikely that legitimate uses require more than a few + * linked components anyway. */ +#define CALENDARSPEC_COMPONENTS_MAX 240 + +/* Let's make sure that the microsecond component is safe to be stored in an 'int' */ +assert_cc(INT_MAX >= USEC_PER_SEC); + +static CalendarComponent* chain_free(CalendarComponent *c) { + while (c) { + CalendarComponent *n = c->next; + free_and_replace(c, n); + } + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(CalendarComponent*, chain_free); + +CalendarSpec* calendar_spec_free(CalendarSpec *c) { + + if (!c) + return NULL; + + chain_free(c->year); + chain_free(c->month); + chain_free(c->day); + chain_free(c->hour); + chain_free(c->minute); + chain_free(c->microsecond); + free(c->timezone); + + return mfree(c); +} + +static int component_compare(CalendarComponent * const *a, CalendarComponent * const *b) { + int r; + + r = CMP((*a)->start, (*b)->start); + if (r != 0) + return r; + + r = CMP((*a)->stop, (*b)->stop); + if (r != 0) + return r; + + return CMP((*a)->repeat, (*b)->repeat); +} + +static void normalize_chain(CalendarComponent **c) { + assert(c); + + size_t n = 0; + for (CalendarComponent *i = *c; i; i = i->next) { + n++; + + /* While we're counting the chain, also normalize 'stop' + * so the length of the range is a multiple of 'repeat'. */ + if (i->stop > i->start && i->repeat > 0) + i->stop -= (i->stop - i->start) % i->repeat; + + /* If a repeat value is specified, but it cannot even be triggered once, let's suppress it. + * + * Similarly, if the stop value is the same as the start value, then let's just make this a + * non-repeating chain element. 
*/ + if ((i->stop > i->start && i->repeat > 0 && i->start + i->repeat > i->stop) || + i->start == i->stop) { + i->repeat = 0; + i->stop = -1; + } + } + + if (n <= 1) + return; + + CalendarComponent **b, **j; + b = j = newa(CalendarComponent*, n); + for (CalendarComponent *i = *c; i; i = i->next) + *(j++) = i; + + typesafe_qsort(b, n, component_compare); + + b[n-1]->next = NULL; + CalendarComponent *next = b[n-1]; + + /* Drop non-unique entries */ + for (size_t k = n-1; k > 0; k--) { + if (component_compare(&b[k-1], &next) == 0) { + free(b[k-1]); + continue; + } + + b[k-1]->next = next; + next = b[k-1]; + } + + *c = next; +} + +static void fix_year(CalendarComponent *c) { + /* Turns 12 → 2012, 89 → 1989 */ + + while (c) { + if (c->start >= 0 && c->start < 70) + c->start += 2000; + + if (c->stop >= 0 && c->stop < 70) + c->stop += 2000; + + if (c->start >= 70 && c->start < 100) + c->start += 1900; + + if (c->stop >= 70 && c->stop < 100) + c->stop += 1900; + + c = c->next; + } +} + +static void calendar_spec_normalize(CalendarSpec *c) { + assert(c); + + if (streq_ptr(c->timezone, "UTC")) { + c->utc = true; + c->timezone = mfree(c->timezone); + } + + if (c->weekdays_bits <= 0 || c->weekdays_bits >= BITS_WEEKDAYS) + c->weekdays_bits = -1; + + if (c->end_of_month && !c->day) + c->end_of_month = false; + + fix_year(c->year); + + normalize_chain(&c->year); + normalize_chain(&c->month); + normalize_chain(&c->day); + normalize_chain(&c->hour); + normalize_chain(&c->minute); + normalize_chain(&c->microsecond); +} + +static bool chain_valid(CalendarComponent *c, int from, int to, bool end_of_month) { + assert(to >= from); + + if (!c) + return true; + + /* Forbid dates more than 28 days from the end of the month */ + if (end_of_month) + to -= 3; + + if (c->start < from || c->start > to) + return false; + + /* Avoid overly large values that could cause overflow */ + if (c->repeat > to - from) + return false; + + /* + * c->repeat must be short enough so at least one repetition may 
+ * occur before the end of the interval. For dates scheduled + * relative to the end of the month, c->start and c->stop + * correspond to the Nth last day of the month. + */ + if (c->stop >= 0) { + if (c->stop < from || c ->stop > to) + return false; + + if (c->start + c->repeat > c->stop) + return false; + } else { + if (end_of_month && c->start - c->repeat < from) + return false; + + if (!end_of_month && c->start + c->repeat > to) + return false; + } + + if (c->next) + return chain_valid(c->next, from, to, end_of_month); + + return true; +} + +_pure_ bool calendar_spec_valid(CalendarSpec *c) { + assert(c); + + if (c->weekdays_bits > BITS_WEEKDAYS) + return false; + + if (!chain_valid(c->year, MIN_YEAR, MAX_YEAR, false)) + return false; + + if (!chain_valid(c->month, 1, 12, false)) + return false; + + if (!chain_valid(c->day, 1, 31, c->end_of_month)) + return false; + + if (!chain_valid(c->hour, 0, 23, false)) + return false; + + if (!chain_valid(c->minute, 0, 59, false)) + return false; + + if (!chain_valid(c->microsecond, 0, 60*USEC_PER_SEC-1, false)) + return false; + + return true; +} + +static void format_weekdays(FILE *f, const CalendarSpec *c) { + static const char *const days[] = { + "Mon", + "Tue", + "Wed", + "Thu", + "Fri", + "Sat", + "Sun", + }; + + int l, x; + bool need_comma = false; + + assert(f); + assert(c); + assert(c->weekdays_bits > 0 && c->weekdays_bits <= BITS_WEEKDAYS); + + for (x = 0, l = -1; x < (int) ELEMENTSOF(days); x++) { + + if (c->weekdays_bits & (1 << x)) { + + if (l < 0) { + if (need_comma) + fputc(',', f); + else + need_comma = true; + + fputs(days[x], f); + l = x; + } + + } else if (l >= 0) { + + if (x > l + 1) { + fputs(x > l + 2 ? ".." : ",", f); + fputs(days[x-1], f); + } + + l = -1; + } + } + + if (l >= 0 && x > l + 1) { + fputs(x > l + 2 ? ".." : ",", f); + fputs(days[x-1], f); + } +} + +static bool chain_is_star(const CalendarComponent *c, bool usec) { + /* Return true if the whole chain can be replaced by '*'. 
+ * This happens when the chain is empty or one of the components covers all. */ + if (!c) + return true; + if (usec) + for (; c; c = c->next) + if (c->start == 0 && c->stop < 0 && c->repeat == USEC_PER_SEC) + return true; + return false; +} + +static void _format_chain(FILE *f, int space, const CalendarComponent *c, bool start, bool usec) { + int d = usec ? (int) USEC_PER_SEC : 1; + + assert(f); + + if (start && chain_is_star(c, usec)) { + fputc('*', f); + return; + } + + assert(c->start >= 0); + + fprintf(f, "%0*i", space, c->start / d); + if (c->start % d > 0) + fprintf(f, ".%06i", c->start % d); + + if (c->stop > 0) + fprintf(f, "..%0*i", space, c->stop / d); + if (c->stop % d > 0) + fprintf(f, ".%06i", c->stop % d); + + if (c->repeat > 0 && !(c->stop > 0 && c->repeat == d)) + fprintf(f, "/%i", c->repeat / d); + if (c->repeat % d > 0) + fprintf(f, ".%06i", c->repeat % d); + + if (c->next) { + fputc(',', f); + _format_chain(f, space, c->next, false, usec); + } +} + +static void format_chain(FILE *f, int space, const CalendarComponent *c, bool usec) { + _format_chain(f, space, c, /* start = */ true, usec); +} + +int calendar_spec_to_string(const CalendarSpec *c, char **ret) { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + assert(c); + assert(ret); + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + if (c->weekdays_bits > 0 && c->weekdays_bits <= BITS_WEEKDAYS) { + format_weekdays(f, c); + fputc(' ', f); + } + + format_chain(f, 4, c->year, false); + fputc('-', f); + format_chain(f, 2, c->month, false); + fputc(c->end_of_month ? 
'~' : '-', f); + format_chain(f, 2, c->day, false); + fputc(' ', f); + format_chain(f, 2, c->hour, false); + fputc(':', f); + format_chain(f, 2, c->minute, false); + fputc(':', f); + format_chain(f, 2, c->microsecond, true); + + if (c->utc) + fputs(" UTC", f); + else if (c->timezone) { + fputc(' ', f); + fputs(c->timezone, f); + } else if (IN_SET(c->dst, 0, 1)) { + + /* If daylight saving is explicitly on or off, let's show the used timezone. */ + + tzset(); + + if (!isempty(tzname[c->dst])) { + fputc(' ', f); + fputs(tzname[c->dst], f); + } + } + + return memstream_finalize(&m, ret, NULL); +} + +static int parse_weekdays(const char **p, CalendarSpec *c) { + static const struct { + const char *name; + const int nr; + } day_nr[] = { + { "Monday", 0 }, + { "Mon", 0 }, + { "Tuesday", 1 }, + { "Tue", 1 }, + { "Wednesday", 2 }, + { "Wed", 2 }, + { "Thursday", 3 }, + { "Thu", 3 }, + { "Friday", 4 }, + { "Fri", 4 }, + { "Saturday", 5 }, + { "Sat", 5 }, + { "Sunday", 6 }, + { "Sun", 6 }, + }; + + int l = -1; + bool first = true; + + assert(p); + assert(*p); + assert(c); + + for (;;) { + size_t i; + + for (i = 0; i < ELEMENTSOF(day_nr); i++) { + size_t skip; + + if (!startswith_no_case(*p, day_nr[i].name)) + continue; + + skip = strlen(day_nr[i].name); + + if (!IN_SET((*p)[skip], 0, '-', '.', ',', ' ')) + return -EINVAL; + + c->weekdays_bits |= 1 << day_nr[i].nr; + + if (l >= 0) { + if (l > day_nr[i].nr) + return -EINVAL; + + for (int j = l + 1; j < day_nr[i].nr; j++) + c->weekdays_bits |= 1 << j; + } + + *p += skip; + break; + } + + /* Couldn't find this prefix, so let's assume the + weekday was not specified and let's continue with + the date */ + if (i >= ELEMENTSOF(day_nr)) + return first ? 
0 : -EINVAL; + + /* We reached the end of the string */ + if (**p == 0) + return 0; + + /* We reached the end of the weekday spec part */ + if (**p == ' ') { + *p += strspn(*p, " "); + return 0; + } + + if (**p == '.') { + if (l >= 0) + return -EINVAL; + + if ((*p)[1] != '.') + return -EINVAL; + + l = day_nr[i].nr; + *p += 2; + + /* Support ranges with "-" for backwards compatibility */ + } else if (**p == '-') { + if (l >= 0) + return -EINVAL; + + l = day_nr[i].nr; + *p += 1; + + } else if (**p == ',') { + l = -1; + *p += 1; + } + + /* Allow a trailing comma but not an open range */ + if (IN_SET(**p, 0, ' ')) { + *p += strspn(*p, " "); + return l < 0 ? 0 : -EINVAL; + } + + first = false; + } +} + +static int parse_one_number(const char *p, const char **e, unsigned long *ret) { + char *ee = NULL; + unsigned long value; + + errno = 0; + value = strtoul(p, &ee, 10); + if (errno > 0) + return -errno; + if (ee == p) + return -EINVAL; + + *ret = value; + *e = ee; + return 0; +} + +static int parse_component_decimal(const char **p, bool usec, int *res) { + unsigned long value; + const char *e = NULL; + int r; + + if (!ascii_isdigit(**p)) + return -EINVAL; + + r = parse_one_number(*p, &e, &value); + if (r < 0) + return r; + + if (usec) { + if (value * USEC_PER_SEC / USEC_PER_SEC != value) + return -ERANGE; + + value *= USEC_PER_SEC; + + /* One "." is a decimal point, but ".." is a range separator */ + if (e[0] == '.' 
&& e[1] != '.') { + unsigned add; + + e++; + r = parse_fractional_part_u(&e, 6, &add); + if (r < 0) + return r; + + if (add + value < value) + return -ERANGE; + value += add; + } + } + + if (value > INT_MAX) + return -ERANGE; + + *p = e; + *res = value; + + return 0; +} + +static int const_chain(int value, CalendarComponent **c) { + CalendarComponent *cc = NULL; + + assert(c); + + cc = new(CalendarComponent, 1); + if (!cc) + return -ENOMEM; + + *cc = (CalendarComponent) { + .start = value, + .stop = -1, + .repeat = 0, + .next = *c, + }; + + *c = cc; + + return 0; +} + +static int calendarspec_from_time_t(CalendarSpec *c, time_t time) { + _cleanup_(chain_freep) CalendarComponent + *year = NULL, *month = NULL, *day = NULL, + *hour = NULL, *minute = NULL, *us = NULL; + struct tm tm; + int r; + + if (!gmtime_r(&time, &tm)) + return -ERANGE; + + if (tm.tm_year > INT_MAX - 1900) + return -ERANGE; + + r = const_chain(tm.tm_year + 1900, &year); + if (r < 0) + return r; + + r = const_chain(tm.tm_mon + 1, &month); + if (r < 0) + return r; + + r = const_chain(tm.tm_mday, &day); + if (r < 0) + return r; + + r = const_chain(tm.tm_hour, &hour); + if (r < 0) + return r; + + r = const_chain(tm.tm_min, &minute); + if (r < 0) + return r; + + r = const_chain(tm.tm_sec * USEC_PER_SEC, &us); + if (r < 0) + return r; + + c->utc = true; + c->year = TAKE_PTR(year); + c->month = TAKE_PTR(month); + c->day = TAKE_PTR(day); + c->hour = TAKE_PTR(hour); + c->minute = TAKE_PTR(minute); + c->microsecond = TAKE_PTR(us); + return 0; +} + +static int prepend_component(const char **p, bool usec, unsigned nesting, CalendarComponent **c) { + int r, start, stop = -1, repeat = 0; + CalendarComponent *cc; + const char *e = *p; + + assert(p); + assert(c); + + if (nesting > CALENDARSPEC_COMPONENTS_MAX) + return -ENOBUFS; + + r = parse_component_decimal(&e, usec, &start); + if (r < 0) + return r; + + if (e[0] == '.' 
&& e[1] == '.') { + e += 2; + r = parse_component_decimal(&e, usec, &stop); + if (r < 0) + return r; + + repeat = usec ? USEC_PER_SEC : 1; + } + + if (*e == '/') { + e++; + r = parse_component_decimal(&e, usec, &repeat); + if (r < 0) + return r; + + if (repeat == 0) + return -ERANGE; + } else { + /* If no repeat value is specified for the μs component, then let's explicitly refuse ranges + * below 1s because our default repeat granularity is beyond that. */ + + /* Overflow check */ + if (start > INT_MAX - repeat) + return -ERANGE; + + if (usec && stop >= 0 && start + repeat > stop) + return -EINVAL; + } + + if (!IN_SET(*e, 0, ' ', ',', '-', '~', ':')) + return -EINVAL; + + cc = new(CalendarComponent, 1); + if (!cc) + return -ENOMEM; + + *cc = (CalendarComponent) { + .start = start, + .stop = stop, + .repeat = repeat, + .next = *c, + }; + + *p = e; + *c = cc; + + if (*e ==',') { + *p += 1; + return prepend_component(p, usec, nesting + 1, c); + } + + return 0; +} + +static int parse_chain(const char **p, bool usec, CalendarComponent **c) { + _cleanup_(chain_freep) CalendarComponent *cc = NULL; + const char *t; + int r; + + assert(p); + assert(c); + + t = *p; + + if (t[0] == '*') { + if (usec) { + r = const_chain(0, c); + if (r < 0) + return r; + (*c)->repeat = USEC_PER_SEC; + } else + *c = NULL; + + *p = t + 1; + return 0; + } + + r = prepend_component(&t, usec, 0, &cc); + if (r < 0) + return r; + + *p = t; + *c = TAKE_PTR(cc); + return 0; +} + +static int parse_date(const char **p, CalendarSpec *c) { + _cleanup_(chain_freep) CalendarComponent *first = NULL, *second = NULL, *third = NULL; + const char *t; + int r; + + assert(p); + assert(*p); + assert(c); + + t = *p; + + if (*t == 0) + return 0; + + /* @TIMESTAMP — UNIX time in seconds since the epoch */ + if (*t == '@') { + unsigned long value; + time_t time; + + r = parse_one_number(t + 1, &t, &value); + if (r < 0) + return r; + + time = value; + if ((unsigned long) time != value) + return -ERANGE; + + r = 
calendarspec_from_time_t(c, time); + if (r < 0) + return r; + + *p = t; + return 1; /* finito, don't parse H:M:S after that */ + } + + r = parse_chain(&t, false, &first); + if (r < 0) + return r; + + /* Already the end? A ':' as separator? In that case this was a time, not a date */ + if (IN_SET(*t, 0, ':')) + return 0; + + if (*t == '~') + c->end_of_month = true; + else if (*t != '-') + return -EINVAL; + + t++; + r = parse_chain(&t, false, &second); + if (r < 0) + return r; + + /* Got two parts, hence it's month and day */ + if (IN_SET(*t, 0, ' ')) { + *p = t + strspn(t, " "); + c->month = TAKE_PTR(first); + c->day = TAKE_PTR(second); + return 0; + } else if (c->end_of_month) + return -EINVAL; + + if (*t == '~') + c->end_of_month = true; + else if (*t != '-') + return -EINVAL; + + t++; + r = parse_chain(&t, false, &third); + if (r < 0) + return r; + + if (!IN_SET(*t, 0, ' ')) + return -EINVAL; + + /* Got three parts, hence it is year, month and day */ + *p = t + strspn(t, " "); + c->year = TAKE_PTR(first); + c->month = TAKE_PTR(second); + c->day = TAKE_PTR(third); + return 0; +} + +static int parse_calendar_time(const char **p, CalendarSpec *c) { + _cleanup_(chain_freep) CalendarComponent *h = NULL, *m = NULL, *s = NULL; + const char *t; + int r; + + assert(p); + assert(*p); + assert(c); + + t = *p; + + /* If no time is specified at all, then this means 00:00:00 */ + if (*t == 0) + goto null_hour; + + r = parse_chain(&t, false, &h); + if (r < 0) + return r; + + if (*t != ':') + return -EINVAL; + + t++; + r = parse_chain(&t, false, &m); + if (r < 0) + return r; + + /* Already at the end? Then it's hours and minutes, and seconds are 0 */ + if (*t == 0) + goto null_second; + + if (*t != ':') + return -EINVAL; + + t++; + r = parse_chain(&t, true, &s); + if (r < 0) + return r; + + /* At the end? 
Then it's hours, minutes and seconds */ + if (*t == 0) + goto finish; + + return -EINVAL; + +null_hour: + r = const_chain(0, &h); + if (r < 0) + return r; + + r = const_chain(0, &m); + if (r < 0) + return r; + +null_second: + r = const_chain(0, &s); + if (r < 0) + return r; + +finish: + *p = t; + c->hour = TAKE_PTR(h); + c->minute = TAKE_PTR(m); + c->microsecond = TAKE_PTR(s); + + return 0; +} + +int calendar_spec_from_string(const char *p, CalendarSpec **ret) { + const char *utc; + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + _cleanup_free_ char *p_tmp = NULL; + int r; + + assert(p); + + c = new(CalendarSpec, 1); + if (!c) + return -ENOMEM; + + *c = (CalendarSpec) { + .dst = -1, + .timezone = NULL, + }; + + utc = endswith_no_case(p, " UTC"); + if (utc) { + c->utc = true; + p = p_tmp = strndup(p, utc - p); + if (!p) + return -ENOMEM; + } else { + const char *e = NULL; + int j; + + tzset(); + + /* Check if the local timezone was specified? */ + for (j = 0; j <= 1; j++) { + if (isempty(tzname[j])) + continue; + + e = endswith_no_case(p, tzname[j]); + if (!e) + continue; + if (e == p) + continue; + if (e[-1] != ' ') + continue; + + break; + } + + /* Found one of the two timezones specified? 
*/ + if (IN_SET(j, 0, 1)) { + p = p_tmp = strndup(p, e - p - 1); + if (!p) + return -ENOMEM; + + c->dst = j; + } else { + const char *last_space; + + last_space = strrchr(p, ' '); + if (last_space != NULL && timezone_is_valid(last_space + 1, LOG_DEBUG)) { + c->timezone = strdup(last_space + 1); + if (!c->timezone) + return -ENOMEM; + + p = p_tmp = strndup(p, last_space - p); + if (!p) + return -ENOMEM; + } + } + } + + if (isempty(p)) + return -EINVAL; + + if (strcaseeq(p, "minutely")) { + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (strcaseeq(p, "hourly")) { + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (strcaseeq(p, "daily")) { + r = const_chain(0, &c->hour); + if (r < 0) + return r; + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (strcaseeq(p, "monthly")) { + r = const_chain(1, &c->day); + if (r < 0) + return r; + r = const_chain(0, &c->hour); + if (r < 0) + return r; + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (STRCASE_IN_SET(p, + "annually", + "yearly", + "anually") /* backwards compatibility */ ) { + + r = const_chain(1, &c->month); + if (r < 0) + return r; + r = const_chain(1, &c->day); + if (r < 0) + return r; + r = const_chain(0, &c->hour); + if (r < 0) + return r; + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (strcaseeq(p, "weekly")) { + + c->weekdays_bits = 1; + + r = const_chain(0, &c->hour); + if (r < 0) + return r; + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (strcaseeq(p, "quarterly")) { + + r = const_chain(1, &c->month); + if (r < 0) + return r; + r = const_chain(4, 
&c->month); + if (r < 0) + return r; + r = const_chain(7, &c->month); + if (r < 0) + return r; + r = const_chain(10, &c->month); + if (r < 0) + return r; + r = const_chain(1, &c->day); + if (r < 0) + return r; + r = const_chain(0, &c->hour); + if (r < 0) + return r; + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else if (STRCASE_IN_SET(p, + "biannually", + "bi-annually", + "semiannually", + "semi-annually")) { + + r = const_chain(1, &c->month); + if (r < 0) + return r; + r = const_chain(7, &c->month); + if (r < 0) + return r; + r = const_chain(1, &c->day); + if (r < 0) + return r; + r = const_chain(0, &c->hour); + if (r < 0) + return r; + r = const_chain(0, &c->minute); + if (r < 0) + return r; + r = const_chain(0, &c->microsecond); + if (r < 0) + return r; + + } else { + r = parse_weekdays(&p, c); + if (r < 0) + return r; + + r = parse_date(&p, c); + if (r < 0) + return r; + + if (r == 0) { + r = parse_calendar_time(&p, c); + if (r < 0) + return r; + } + + if (*p != 0) + return -EINVAL; + } + + calendar_spec_normalize(c); + + if (!calendar_spec_valid(c)) + return -EINVAL; + + if (ret) + *ret = TAKE_PTR(c); + return 0; +} + +static int find_end_of_month(const struct tm *tm, bool utc, int day) { + struct tm t = *tm; + + t.tm_mon++; + t.tm_mday = 1 - day; + + if (mktime_or_timegm(&t, utc) < 0 || + t.tm_mon != tm->tm_mon) + return -1; + + return t.tm_mday; +} + +static int find_matching_component( + const CalendarSpec *spec, + const CalendarComponent *c, + const struct tm *tm, /* tm is only used for end-of-month calculations */ + int *val) { + + int d = -1, r; + bool d_set = false; + + assert(val); + + /* Finds the *earliest* matching time specified by one of the CalendarCompoment items in chain c. + * If no matches can be found, returns -ENOENT. + * Otherwise, updates *val to the matching time. 1 is returned if *val was changed, 0 otherwise. 
+ */ + + if (!c) + return 0; + + bool end_of_month = spec->end_of_month && c == spec->day; + + while (c) { + int start, stop; + + if (end_of_month) { + start = find_end_of_month(tm, spec->utc, c->start); + stop = find_end_of_month(tm, spec->utc, c->stop); + + if (stop > 0) + SWAP_TWO(start, stop); + } else { + start = c->start; + stop = c->stop; + } + + if (start >= *val) { + + if (!d_set || start < d) { + d = start; + d_set = true; + } + + } else if (c->repeat > 0) { + int k; + + k = start + ROUND_UP(*val - start, c->repeat); + + if ((!d_set || k < d) && (stop < 0 || k <= stop)) { + d = k; + d_set = true; + } + } + + c = c->next; + } + + if (!d_set) + return -ENOENT; + + r = *val != d; + *val = d; + return r; +} + +static int tm_within_bounds(struct tm *tm, bool utc) { + struct tm t; + int cmp; + assert(tm); + + /* + * Set an upper bound on the year so impossible dates like "*-02-31" + * don't cause find_next() to loop forever. tm_year contains years + * since 1900, so adjust it accordingly. + */ + if (tm->tm_year + 1900 > MAX_YEAR) + return -ERANGE; + + t = *tm; + if (mktime_or_timegm(&t, utc) < 0) + return negative_errno(); + + /* + * Did any normalization take place? If so, it was out of bounds before. + * Normalization could skip next elapse, e.g. result of normalizing 3-33 + * is 4-2. This skips 4-1. So reset the sub time unit if upper unit was + * out of bounds. Normalization has occurred implies find_matching_component() > 0, + * other sub time units are already reset in find_next(). 
+ */ + if ((cmp = CMP(t.tm_year, tm->tm_year)) != 0) + t.tm_mon = 0; + else if ((cmp = CMP(t.tm_mon, tm->tm_mon)) != 0) + t.tm_mday = 1; + else if ((cmp = CMP(t.tm_mday, tm->tm_mday)) != 0) + t.tm_hour = 0; + else if ((cmp = CMP(t.tm_hour, tm->tm_hour)) != 0) + t.tm_min = 0; + else if ((cmp = CMP(t.tm_min, tm->tm_min)) != 0) + t.tm_sec = 0; + else + cmp = CMP(t.tm_sec, tm->tm_sec); + + if (cmp < 0) + return -EDEADLK; /* Refuse to go backward */ + if (cmp > 0) + *tm = t; + return cmp == 0; +} + +static bool matches_weekday(int weekdays_bits, const struct tm *tm, bool utc) { + struct tm t; + int k; + + if (weekdays_bits < 0 || weekdays_bits >= BITS_WEEKDAYS) + return true; + + t = *tm; + if (mktime_or_timegm(&t, utc) < 0) + return false; + + k = t.tm_wday == 0 ? 6 : t.tm_wday - 1; + return (weekdays_bits & (1 << k)); +} + +/* A safety valve: if we get stuck in the calculation, return an error. + * C.f. https://bugzilla.redhat.com/show_bug.cgi?id=1941335. */ +#define MAX_CALENDAR_ITERATIONS 1000 + +static int find_next(const CalendarSpec *spec, struct tm *tm, usec_t *usec) { + struct tm c; + int tm_usec; + int r; + + /* Returns -ENOENT if the expression is not going to elapse anymore */ + + assert(spec); + assert(tm); + + c = *tm; + tm_usec = *usec; + + for (unsigned iteration = 0; iteration < MAX_CALENDAR_ITERATIONS; iteration++) { + /* Normalize the current date */ + (void) mktime_or_timegm(&c, spec->utc); + c.tm_isdst = spec->dst; + + c.tm_year += 1900; + r = find_matching_component(spec, spec->year, &c, &c.tm_year); + c.tm_year -= 1900; + + if (r > 0) { + c.tm_mon = 0; + c.tm_mday = 1; + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + } + if (r < 0) + return r; + if (tm_within_bounds(&c, spec->utc) <= 0) + return -ENOENT; + + c.tm_mon += 1; + r = find_matching_component(spec, spec->month, &c, &c.tm_mon); + c.tm_mon -= 1; + + if (r > 0) { + c.tm_mday = 1; + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + } + if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) 
{ + c.tm_year++; + c.tm_mon = 0; + c.tm_mday = 1; + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + continue; + } + if (r == 0) + continue; + + r = find_matching_component(spec, spec->day, &c, &c.tm_mday); + if (r > 0) + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) { + c.tm_mon++; + c.tm_mday = 1; + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + continue; + } + if (r == 0) + continue; + + if (!matches_weekday(spec->weekdays_bits, &c, spec->utc)) { + c.tm_mday++; + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + continue; + } + + r = find_matching_component(spec, spec->hour, &c, &c.tm_hour); + if (r > 0) + c.tm_min = c.tm_sec = tm_usec = 0; + if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) { + c.tm_mday++; + c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0; + continue; + } + if (r == 0) + /* The next hour we set might be missing if there + * are time zone changes. Let's try again starting at + * normalized time. */ + continue; + + r = find_matching_component(spec, spec->minute, &c, &c.tm_min); + if (r > 0) + c.tm_sec = tm_usec = 0; + if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) { + c.tm_hour++; + c.tm_min = c.tm_sec = tm_usec = 0; + continue; + } + if (r == 0) + continue; + + c.tm_sec = c.tm_sec * USEC_PER_SEC + tm_usec; + r = find_matching_component(spec, spec->microsecond, &c, &c.tm_sec); + tm_usec = c.tm_sec % USEC_PER_SEC; + c.tm_sec /= USEC_PER_SEC; + + if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) { + c.tm_min++; + c.tm_sec = tm_usec = 0; + continue; + } + if (r == 0) + continue; + + *tm = c; + *usec = tm_usec; + return 0; + } + + /* It seems we entered an infinite loop. Let's gracefully return an error instead of hanging or + * aborting. This code is also exercised when timers.target is brought up during early boot, so + * aborting here is problematic and hard to diagnose for users. 
*/ + _cleanup_free_ char *s = NULL; + (void) calendar_spec_to_string(spec, &s); + return log_warning_errno(SYNTHETIC_ERRNO(EDEADLK), + "Infinite loop in calendar calculation: %s", strna(s)); +} + +static int calendar_spec_next_usec_impl(const CalendarSpec *spec, usec_t usec, usec_t *ret_next) { + struct tm tm; + time_t t; + int r; + usec_t tm_usec; + + assert(spec); + + if (usec > USEC_TIMESTAMP_FORMATTABLE_MAX) + return -EINVAL; + + usec++; + t = (time_t) (usec / USEC_PER_SEC); + assert_se(localtime_or_gmtime_r(&t, &tm, spec->utc)); + tm_usec = usec % USEC_PER_SEC; + + r = find_next(spec, &tm, &tm_usec); + if (r < 0) + return r; + + t = mktime_or_timegm(&tm, spec->utc); + if (t < 0) + return -EINVAL; + + if (ret_next) + *ret_next = (usec_t) t * USEC_PER_SEC + tm_usec; + + return 0; +} + +typedef struct SpecNextResult { + usec_t next; + int return_value; +} SpecNextResult; + +int calendar_spec_next_usec(const CalendarSpec *spec, usec_t usec, usec_t *ret_next) { + SpecNextResult *shared, tmp; + int r; + + assert(spec); + + if (isempty(spec->timezone)) + return calendar_spec_next_usec_impl(spec, usec, ret_next); + + shared = mmap(NULL, sizeof *shared, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) + return negative_errno(); + + r = safe_fork("(sd-calendar)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_WAIT, NULL); + if (r < 0) { + (void) munmap(shared, sizeof *shared); + return r; + } + if (r == 0) { + char *colon_tz; + + /* tzset(3) says $TZ should be prefixed with ":" if we reference timezone files */ + colon_tz = strjoina(":", spec->timezone); + + if (setenv("TZ", colon_tz, 1) != 0) { + shared->return_value = negative_errno(); + _exit(EXIT_FAILURE); + } + + tzset(); + + shared->return_value = calendar_spec_next_usec_impl(spec, usec, &shared->next); + + _exit(EXIT_SUCCESS); + } + + tmp = *shared; + if (munmap(shared, sizeof *shared) < 0) + return negative_errno(); + + if (tmp.return_value == 0 && ret_next) + 
*ret_next = tmp.next; + + return tmp.return_value; +} diff --git a/src/shared/calendarspec.h b/src/shared/calendarspec.h new file mode 100644 index 0000000..60c1c79 --- /dev/null +++ b/src/shared/calendarspec.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* A structure for specifying (possibly repetitive) points in calendar + * time, a la cron */ + +#include + +#include "time-util.h" + +typedef struct CalendarComponent { + int start; + int stop; + int repeat; + + struct CalendarComponent *next; +} CalendarComponent; + +typedef struct CalendarSpec { + int weekdays_bits; + bool end_of_month:1; + bool utc:1; + signed int dst:2; + char *timezone; + + CalendarComponent *year; + CalendarComponent *month; + CalendarComponent *day; + + CalendarComponent *hour; + CalendarComponent *minute; + CalendarComponent *microsecond; +} CalendarSpec; + +CalendarSpec* calendar_spec_free(CalendarSpec *c); + +bool calendar_spec_valid(CalendarSpec *spec); + +int calendar_spec_to_string(const CalendarSpec *spec, char **ret); +int calendar_spec_from_string(const char *p, CalendarSpec **ret); + +int calendar_spec_next_usec(const CalendarSpec *spec, usec_t usec, usec_t *next); + +DEFINE_TRIVIAL_CLEANUP_FUNC(CalendarSpec*, calendar_spec_free); diff --git a/src/shared/cgroup-setup.c b/src/shared/cgroup-setup.c new file mode 100644 index 0000000..934a16e --- /dev/null +++ b/src/shared/cgroup-setup.c @@ -0,0 +1,1008 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "missing_threads.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "recurse-dir.h" +#include "stdio-util.h" +#include "string-util.h" +#include "user-util.h" +#include "virt.h" + +static int cg_any_controller_used_for_v1(void) { + _cleanup_free_ 
char *buf = NULL; + _cleanup_strv_free_ char **lines = NULL; + int r; + + r = read_full_virtual_file("/proc/cgroups", &buf, NULL); + if (r < 0) + return log_debug_errno(r, "Could not read /proc/cgroups, ignoring: %m"); + + r = strv_split_newlines_full(&lines, buf, 0); + if (r < 0) + return r; + + /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all + * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference: + * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups- + * + * Note that this is typically only useful to check inside a container where we don't know what + * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use + * unified since some or all controllers would be missing. This is not the best way to detect this, + * as whatever container manager created our container should have mounted /sys/fs/cgroup + * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use + * unified cgroups. */ + STRV_FOREACH(line, lines) { + _cleanup_free_ char *name = NULL, *hierarchy_id = NULL, *num = NULL, *enabled = NULL; + + /* Skip header line */ + if (startswith(*line, "#")) + continue; + + const char *p = *line; + r = extract_many_words(&p, NULL, 0, &name, &hierarchy_id, &num, &enabled, NULL); + if (r < 0) + return log_debug_errno(r, "Error parsing /proc/cgroups line, ignoring: %m"); + else if (r < 4) { + log_debug("Invalid /proc/cgroups line, ignoring."); + continue; + } + + /* Ignore disabled controllers. */ + if (streq(enabled, "0")) + continue; + + /* Ignore controllers we don't care about. */ + if (cgroup_controller_from_string(name) < 0) + continue; + + /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a + * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1 + * hierarchy, and can't be used in a unified cgroup. 
*/ + if (!streq(hierarchy_id, "0")) { + log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name); + return 1; + } + } + + return 0; +} + +bool cg_is_unified_wanted(void) { + static thread_local int wanted = -1; + bool b; + const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL; + _cleanup_free_ char *c = NULL; + int r; + + /* If we have a cached value, return that. */ + if (wanted >= 0) + return wanted; + + /* If the hierarchy is already mounted, then follow whatever was chosen for it. */ + r = cg_unified_cached(true); + if (r >= 0) + return (wanted = r >= CGROUP_UNIFIED_ALL); + + /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */ + r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", /* flags = */ 0, &b); + if (r > 0) + return (wanted = b); + + /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to + * use hybrid or legacy hierarchy. */ + r = proc_cmdline_get_key("cgroup_no_v1", 0, &c); + if (r > 0 && streq_ptr(c, "all")) + return (wanted = true); + + /* If any controller is in use as v1, don't use unified. */ + if (cg_any_controller_used_for_v1() > 0) + return (wanted = false); + + return (wanted = is_default); +} + +bool cg_is_legacy_wanted(void) { + static thread_local int wanted = -1; + + /* If we have a cached value, return that. */ + if (wanted >= 0) + return wanted; + + /* Check if we have cgroup v2 already mounted. */ + if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL) + return (wanted = false); + + /* Otherwise, assume that at least partial legacy is wanted, + * since cgroup v2 should already be mounted at this point. 
*/ + return (wanted = true); +} + +bool cg_is_hybrid_wanted(void) { + static thread_local int wanted = -1; + int r; + bool b; + const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD; + /* We default to true if the default is "hybrid", obviously, but also when the default is "unified", + * because if we get called, it means that unified hierarchy was not mounted. */ + + /* If we have a cached value, return that. */ + if (wanted >= 0) + return wanted; + + /* If the hierarchy is already mounted, then follow whatever was chosen for it. */ + if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL) + return (wanted = false); + + /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache + * a non-error result. */ + r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", /* flags = */ 0, &b); + + /* The meaning of the kernel option is reversed wrt. to the return value of this function, hence the + * negation. */ + return (wanted = r > 0 ? 
!b : is_default); +} + +int cg_weight_parse(const char *s, uint64_t *ret) { + uint64_t u; + int r; + + if (isempty(s)) { + *ret = CGROUP_WEIGHT_INVALID; + return 0; + } + + r = safe_atou64(s, &u); + if (r < 0) + return r; + + if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX) + return -ERANGE; + + *ret = u; + return 0; +} + +int cg_cpu_weight_parse(const char *s, uint64_t *ret) { + if (streq_ptr(s, "idle")) + return *ret = CGROUP_WEIGHT_IDLE; + return cg_weight_parse(s, ret); +} + +int cg_cpu_shares_parse(const char *s, uint64_t *ret) { + uint64_t u; + int r; + + if (isempty(s)) { + *ret = CGROUP_CPU_SHARES_INVALID; + return 0; + } + + r = safe_atou64(s, &u); + if (r < 0) + return r; + + if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX) + return -ERANGE; + + *ret = u; + return 0; +} + +int cg_blkio_weight_parse(const char *s, uint64_t *ret) { + uint64_t u; + int r; + + if (isempty(s)) { + *ret = CGROUP_BLKIO_WEIGHT_INVALID; + return 0; + } + + r = safe_atou64(s, &u); + if (r < 0) + return r; + + if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX) + return -ERANGE; + + *ret = u; + return 0; +} + +static int trim_cb( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */ + if (event == RECURSE_DIR_LEAVE && + de->d_type == DT_DIR && + unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 && + !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY)) + log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path); + + return RECURSE_DIR_CONTINUE; +} + +int cg_trim(const char *controller, const char *path, bool delete_root) { + _cleanup_free_ char *fs = NULL; + int r, q; + + assert(path); + assert(controller); + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + r = recurse_dir_at( + AT_FDCWD, + fs, + /* statx_mask= */ 0, + /* n_depth_max= 
*/ UINT_MAX, + RECURSE_DIR_ENSURE_TYPE, + trim_cb, + NULL); + if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */ + r = 0; + else if (r < 0) + log_debug_errno(r, "Failed to iterate through cgroup %s: %m", path); + + /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is + * already gone anyway). Also, let's debug log about this failure, except if the error code is an + * expected one. */ + if (delete_root && !empty_or_root(path) && + rmdir(fs) < 0 && errno != ENOENT) { + if (!IN_SET(errno, ENOTEMPTY, EBUSY)) + log_debug_errno(errno, "Failed to trim cgroup %s: %m", path); + if (r >= 0) + r = -errno; + } + + q = cg_hybrid_unified(); + if (q < 0) + return q; + if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root); + + return r; +} + +/* Create a cgroup in the hierarchy of controller. + * Returns 0 if the group already existed, 1 on success, negative otherwise. 
+ */ +int cg_create(const char *controller, const char *path) { + _cleanup_free_ char *fs = NULL; + int r; + + r = cg_get_path_and_check(controller, path, NULL, &fs); + if (r < 0) + return r; + + r = mkdir_parents(fs, 0755); + if (r < 0) + return r; + + r = RET_NERRNO(mkdir(fs, 0755)); + if (r == -EEXIST) + return 0; + if (r < 0) + return r; + + r = cg_hybrid_unified(); + if (r < 0) + return r; + + if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path); + if (r < 0) + log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path); + } + + return 1; +} + +int cg_create_and_attach(const char *controller, const char *path, pid_t pid) { + int r, q; + + assert(pid >= 0); + + r = cg_create(controller, path); + if (r < 0) + return r; + + q = cg_attach(controller, path, pid); + if (q < 0) + return q; + + /* This does not remove the cgroup on failure */ + return r; +} + +int cg_attach(const char *controller, const char *path, pid_t pid) { + _cleanup_free_ char *fs = NULL; + char c[DECIMAL_STR_MAX(pid_t) + 2]; + int r; + + assert(path); + assert(pid >= 0); + + r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs); + if (r < 0) + return r; + + if (pid == 0) + pid = getpid_cached(); + + xsprintf(c, PID_FMT "\n", pid); + + r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r == -EOPNOTSUPP && cg_is_threaded(path) > 0) + /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. 
*/ + return -EUCLEAN; + if (r < 0) + return r; + + r = cg_hybrid_unified(); + if (r < 0) + return r; + + if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid); + if (r < 0) + log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path); + } + + return 0; +} + +int cg_attach_fallback(const char *controller, const char *path, pid_t pid) { + int r; + + assert(controller); + assert(path); + assert(pid >= 0); + + r = cg_attach(controller, path, pid); + if (r < 0) { + char prefix[strlen(path) + 1]; + + /* This didn't work? Then let's try all prefixes of + * the destination */ + + PATH_FOREACH_PREFIX(prefix, path) { + int q; + + q = cg_attach(controller, prefix, pid); + if (q >= 0) + return q; + } + } + + return r; +} + +int cg_set_access( + const char *controller, + const char *path, + uid_t uid, + gid_t gid) { + + struct Attribute { + const char *name; + bool fatal; + }; + + /* cgroup v1, aka legacy/non-unified */ + static const struct Attribute legacy_attributes[] = { + { "cgroup.procs", true }, + { "tasks", false }, + { "cgroup.clone_children", false }, + {}, + }; + + /* cgroup v2, aka unified */ + static const struct Attribute unified_attributes[] = { + { "cgroup.procs", true }, + { "cgroup.subtree_control", true }, + { "cgroup.threads", false }, + { "memory.oom.group", false }, + { "memory.reclaim", false }, + {}, + }; + + static const struct Attribute* const attributes[] = { + [false] = legacy_attributes, + [true] = unified_attributes, + }; + + _cleanup_free_ char *fs = NULL; + const struct Attribute *i; + int r, unified; + + assert(path); + + if (uid == UID_INVALID && gid == GID_INVALID) + return 0; + + unified = cg_unified_controller(controller); + if (unified < 0) + return unified; + + /* Configure access to the cgroup itself */ + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + r = chmod_and_chown(fs, 0755, uid, gid); + if (r < 0) + 
return r; + + /* Configure access to the cgroup's attributes */ + for (i = attributes[unified]; i->name; i++) { + fs = mfree(fs); + + r = cg_get_path(controller, path, i->name, &fs); + if (r < 0) + return r; + + r = chmod_and_chown(fs, 0644, uid, gid); + if (r < 0) { + if (i->fatal) + return r; + + log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs); + } + } + + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_hybrid_unified(); + if (r < 0) + return r; + if (r > 0) { + /* Always propagate access mode from unified to legacy controller */ + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid); + if (r < 0) + log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path); + } + } + + return 0; +} + +struct access_callback_data { + uid_t uid; + gid_t gid; + int error; +}; + +static int access_callback( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + struct access_callback_data *d = ASSERT_PTR(userdata); + + if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY)) + return RECURSE_DIR_CONTINUE; + + assert(inode_fd >= 0); + + /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */ + if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) { + log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path)); + + if (d->error == 0) /* Return last error to caller */ + d->error = errno; + } + + return RECURSE_DIR_CONTINUE; +} + +int cg_set_access_recursive( + const char *controller, + const char *path, + uid_t uid, + gid_t gid) { + + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *fs = NULL; + int r; + + /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files, + * not just the allowlist that cg_set_access() uses. 
Use cg_set_access() on the cgroup you want to + * delegate, and cg_set_access_recursive() for any subcgroups you might want to create below it. */ + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY); + if (fd < 0) + return -errno; + + struct access_callback_data d = { + .uid = uid, + .gid = gid, + }; + + r = recurse_dir(fd, + fs, + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL, + access_callback, + &d); + if (r < 0) + return r; + + return -d.error; +} + +int cg_migrate( + const char *cfrom, + const char *pfrom, + const char *cto, + const char *pto, + CGroupFlags flags) { + + bool done = false; + _cleanup_set_free_ Set *s = NULL; + int r, ret = 0; + pid_t my_pid; + + assert(cfrom); + assert(pfrom); + assert(cto); + assert(pto); + + s = set_new(NULL); + if (!s) + return -ENOMEM; + + my_pid = getpid_cached(); + + do { + _cleanup_fclose_ FILE *f = NULL; + pid_t pid = 0; + done = true; + + r = cg_enumerate_processes(cfrom, pfrom, &f); + if (r < 0) { + if (ret >= 0 && r != -ENOENT) + return r; + + return ret; + } + + while ((r = cg_read_pid(f, &pid)) > 0) { + + /* This might do weird stuff if we aren't a + * single-threaded program. However, we + * luckily know we are not */ + if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid) + continue; + + if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid)) + continue; + + /* Ignore kernel threads. Since they can only + * exist in the root cgroup, we only check for + * them there. 
*/ + if (cfrom && + empty_or_root(pfrom) && + pid_is_kernel_thread(pid) > 0) + continue; + + r = cg_attach(cto, pto, pid); + if (r < 0) { + if (ret >= 0 && r != -ESRCH) + ret = r; + } else if (ret == 0) + ret = 1; + + done = false; + + r = set_put(s, PID_TO_PTR(pid)); + if (r < 0) { + if (ret >= 0) + return r; + + return ret; + } + } + + if (r < 0) { + if (ret >= 0) + return r; + + return ret; + } + } while (!done); + + return ret; +} + +int cg_migrate_recursive( + const char *cfrom, + const char *pfrom, + const char *cto, + const char *pto, + CGroupFlags flags) { + + _cleanup_closedir_ DIR *d = NULL; + int r, ret = 0; + char *fn; + + assert(cfrom); + assert(pfrom); + assert(cto); + assert(pto); + + ret = cg_migrate(cfrom, pfrom, cto, pto, flags); + + r = cg_enumerate_subgroups(cfrom, pfrom, &d); + if (r < 0) { + if (ret >= 0 && r != -ENOENT) + return r; + + return ret; + } + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = path_join(empty_to_root(pfrom), fn); + free(fn); + if (!p) + return -ENOMEM; + + r = cg_migrate_recursive(cfrom, p, cto, pto, flags); + if (r != 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + + if (flags & CGROUP_REMOVE) { + r = cg_rmdir(cfrom, pfrom); + if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY)) + return r; + } + + return ret; +} + +int cg_migrate_recursive_fallback( + const char *cfrom, + const char *pfrom, + const char *cto, + const char *pto, + CGroupFlags flags) { + + int r; + + assert(cfrom); + assert(pfrom); + assert(cto); + assert(pto); + + r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags); + if (r < 0) { + char prefix[strlen(pto) + 1]; + + /* This didn't work? 
Then let's try all prefixes of the destination */ + + PATH_FOREACH_PREFIX(prefix, pto) { + int q; + + q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags); + if (q >= 0) + return q; + } + } + + return r; +} + +int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) { + CGroupController c; + CGroupMask done; + bool created; + int r; + + /* This one will create a cgroup in our private tree, but also + * duplicate it in the trees specified in mask, and remove it + * in all others. + * + * Returns 0 if the group already existed in the systemd hierarchy, + * 1 on success, negative otherwise. + */ + + /* First create the cgroup in our own hierarchy. */ + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path); + if (r < 0) + return r; + created = r; + + /* If we are in the unified hierarchy, we are done now */ + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + return created; + + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + + /* Otherwise, do the same in the other hierarchies */ + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *n; + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + n = cgroup_controller_to_string(c); + if (FLAGS_SET(mask, bit)) + (void) cg_create(n, path); + + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return created; +} + +int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) { + int r; + + r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid); + if (r < 0) + return r; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + return 0; + + supported &= CGROUP_MASK_V1; + CGroupMask done = 0; + + for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *p = NULL; + + if (!FLAGS_SET(supported, bit)) + continue; + + if 
(FLAGS_SET(done, bit)) + continue; + + if (path_callback) + p = path_callback(bit, userdata); + if (!p) + p = path; + + (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid); + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return 0; +} + +int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata) { + CGroupController c; + CGroupMask done; + int r = 0, q; + + assert(to_callback); + + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *to = NULL; + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + if (!FLAGS_SET(mask, bit)) + continue; + + to = to_callback(bit, userdata); + + /* Remember first error and try continuing */ + q = cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, from, cgroup_controller_to_string(c), to, 0); + r = (r < 0) ? 
r : q; + + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return r; +} + +int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) { + int r, q; + + r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root); + if (r < 0) + return r; + + q = cg_all_unified(); + if (q < 0) + return q; + if (q > 0) + return r; + + return cg_trim_v1_controllers(supported, _CGROUP_MASK_ALL, path, delete_root); +} + +int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root) { + CGroupController c; + CGroupMask done; + int r = 0, q; + + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + if (FLAGS_SET(mask, bit)) { + /* Remember first error and try continuing */ + q = cg_trim(cgroup_controller_to_string(c), path, delete_root); + r = (r < 0) ? r : q; + } + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return r; +} + +int cg_enable_everywhere( + CGroupMask supported, + CGroupMask mask, + const char *p, + CGroupMask *ret_result_mask) { + + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *fs = NULL; + CGroupController c; + CGroupMask ret = 0; + int r; + + assert(p); + + if (supported == 0) { + if (ret_result_mask) + *ret_result_mask = 0; + return 0; + } + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) { + /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim + * complete success right away. (If you wonder why we return the full mask here, rather than zero: the + * caller tends to use the returned mask later on to compare if all controllers were properly joined, + * and if not requeues realization. 
This use is the primary purpose of the return value, hence let's + * minimize surprises here and reduce triggers for re-realization by always saying we fully + * succeeded.) */ + if (ret_result_mask) + *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with + * CGROUP_MASK_V2: The 'supported' mask + * might contain pure-V1 or BPF + * controllers, and we never want to + * claim that we could enable those with + * cgroup.subtree_control */ + return 0; + } + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs); + if (r < 0) + return r; + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *n; + + if (!FLAGS_SET(CGROUP_MASK_V2, bit)) + continue; + + if (!FLAGS_SET(supported, bit)) + continue; + + n = cgroup_controller_to_string(c); + { + char s[1 + strlen(n) + 1]; + + s[0] = FLAGS_SET(mask, bit) ? '+' : '-'; + strcpy(s + 1, n); + + if (!f) { + f = fopen(fs, "we"); + if (!f) + return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); + } + + r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) { + log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m", + FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs); + clearerr(f); + + /* If we can't turn off a controller, leave it on in the reported resulting mask. This + * happens for example when we attempt to turn off a controller up in the tree that is + * used down in the tree. */ + if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY + * only here, and not follow the same logic + * for other errors such as EINVAL or + * EOPNOTSUPP or anything else. That's + * because EBUSY indicates that the + * controller is currently enabled and + * cannot be disabled because something down + * the hierarchy is still using it. 
Any other + * error most likely means something like "I + * never heard of this controller" or + * similar. In the former case it's hence + * safe to assume the controller is still on + * after the failed operation, while in the + * latter case it's safer to assume the + * controller is unknown and hence certainly + * not enabled. */ + ret |= bit; + } else { + /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */ + if (FLAGS_SET(mask, bit)) + ret |= bit; + } + } + } + + /* Let's return the precise set of controllers now enabled for the cgroup. */ + if (ret_result_mask) + *ret_result_mask = ret; + + return 0; +} diff --git a/src/shared/cgroup-setup.h b/src/shared/cgroup-setup.h new file mode 100644 index 0000000..1b6f071 --- /dev/null +++ b/src/shared/cgroup-setup.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "cgroup-util.h" + +bool cg_is_unified_wanted(void); +bool cg_is_legacy_wanted(void); +bool cg_is_hybrid_wanted(void); + +int cg_weight_parse(const char *s, uint64_t *ret); +int cg_cpu_weight_parse(const char *s, uint64_t *ret); +int cg_cpu_shares_parse(const char *s, uint64_t *ret); +int cg_blkio_weight_parse(const char *s, uint64_t *ret); + +int cg_trim(const char *controller, const char *path, bool delete_root); + +int cg_create(const char *controller, const char *path); +int cg_attach(const char *controller, const char *path, pid_t pid); +int cg_attach_fallback(const char *controller, const char *path, pid_t pid); +int cg_create_and_attach(const char *controller, const char *path, pid_t pid); + +int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid); +int cg_set_access_recursive(const char *controller, const char *path, uid_t uid, gid_t gid); + +int cg_migrate(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags); +int cg_migrate_recursive(const char *cfrom, const char *pfrom, const 
char *cto, const char *pto, CGroupFlags flags); +int cg_migrate_recursive_fallback(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags); + +int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path); +int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t callback, void *userdata); +int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata); +int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root); +int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root); +int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p, CGroupMask *ret_result_mask); diff --git a/src/shared/cgroup-show.c b/src/shared/cgroup-show.c new file mode 100644 index 0000000..c2ee1c5 --- /dev/null +++ b/src/shared/cgroup-show.c @@ -0,0 +1,471 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "cgroup-show.h" +#include "cgroup-util.h" +#include "env-file.h" +#include "escape.h" +#include "fd-util.h" +#include "format-util.h" +#include "hostname-util.h" +#include "locale-util.h" +#include "macro.h" +#include "nulstr-util.h" +#include "output-mode.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "sort-util.h" +#include "string-util.h" +#include "terminal-util.h" +#include "unit-name.h" +#include "xattr-util.h" + +static void show_pid_array( + pid_t pids[], + size_t n_pids, + const char *prefix, + size_t n_columns, + bool extra, + bool more, + OutputFlags flags) { + + size_t i, j, pid_width; + + if (n_pids == 0) + return; + + typesafe_qsort(pids, n_pids, pid_compare_func); + + /* Filter duplicates */ + for (j = 0, i = 1; i < n_pids; i++) { + if (pids[i] == 
pids[j]) + continue; + pids[++j] = pids[i]; + } + n_pids = j + 1; + pid_width = DECIMAL_STR_WIDTH(pids[j]); + + if (flags & OUTPUT_FULL_WIDTH) + n_columns = SIZE_MAX; + else { + if (n_columns > pid_width + 3) /* something like "├─1114784 " */ + n_columns -= pid_width + 3; + else + n_columns = 20; + } + for (i = 0; i < n_pids; i++) { + _cleanup_free_ char *t = NULL; + + (void) pid_get_cmdline(pids[i], n_columns, + PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_USE_LOCALE, + &t); + + if (extra) + printf("%s%s ", prefix, special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET)); + else + printf("%s%s", prefix, special_glyph(((more || i < n_pids-1) ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT))); + + printf("%s%*"PID_PRI" %s%s\n", ansi_grey(), (int) pid_width, pids[i], strna(t), ansi_normal()); + } +} + +static int show_cgroup_one_by_path( + const char *path, + const char *prefix, + size_t n_columns, + bool more, + OutputFlags flags) { + + _cleanup_free_ pid_t *pids = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + size_t n = 0; + char *fn; + int r; + + r = cg_mangle_path(path, &p); + if (r < 0) + return r; + + fn = strjoina(p, "/cgroup.procs"); + f = fopen(fn, "re"); + if (!f) + return -errno; + + for (;;) { + pid_t pid; + + /* libvirt / qemu uses threaded mode and cgroup.procs cannot be read at the lower levels. + * From https://docs.kernel.org/admin-guide/cgroup-v2.html#threads, + * “cgroup.procs” in a threaded domain cgroup contains the PIDs of all processes in + * the subtree and is not readable in the subtree proper. 
*/ + r = cg_read_pid(f, &pid); + if (IN_SET(r, 0, -EOPNOTSUPP)) + break; + if (r < 0) + return r; + + if (!(flags & OUTPUT_KERNEL_THREADS) && pid_is_kernel_thread(pid) > 0) + continue; + + if (!GREEDY_REALLOC(pids, n + 1)) + return -ENOMEM; + + pids[n++] = pid; + } + + show_pid_array(pids, n, prefix, n_columns, false, more, flags); + + return 0; +} + +static int show_cgroup_name( + const char *path, + const char *prefix, + SpecialGlyph glyph, + OutputFlags flags) { + + uint64_t cgroupid = UINT64_MAX; + _cleanup_free_ char *b = NULL; + _cleanup_close_ int fd = -EBADF; + bool delegate; + int r; + + fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW|O_DIRECTORY, 0); + if (fd < 0) + return log_debug_errno(errno, "Failed to open cgroup '%s', ignoring: %m", path); + + r = cg_is_delegated_fd(fd); + if (r < 0) + log_debug_errno(r, "Failed to check if cgroup is delegated, ignoring: %m"); + delegate = r > 0; + + if (FLAGS_SET(flags, OUTPUT_CGROUP_ID)) { + cg_file_handle fh = CG_FILE_HANDLE_INIT; + int mnt_id = -1; + + if (name_to_handle_at( + fd, + "", + &fh.file_handle, + &mnt_id, + AT_EMPTY_PATH) < 0) + log_debug_errno(errno, "Failed to determine cgroup ID of %s, ignoring: %m", path); + else + cgroupid = CG_FILE_HANDLE_CGROUPID(fh); + } + + r = path_extract_filename(path, &b); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from cgroup path: %m"); + + printf("%s%s%s%s%s", + prefix, special_glyph(glyph), + delegate ? ansi_underline() : "", + cg_unescape(b), + delegate ? 
ansi_normal() : ""); + + if (delegate) + printf(" %s%s%s", + ansi_highlight(), + special_glyph(SPECIAL_GLYPH_ELLIPSIS), + ansi_normal()); + + if (cgroupid != UINT64_MAX) + printf(" %s(#%" PRIu64 ")%s", ansi_grey(), cgroupid, ansi_normal()); + + printf("\n"); + + if (FLAGS_SET(flags, OUTPUT_CGROUP_XATTRS)) { + _cleanup_free_ char *nl = NULL; + + r = flistxattr_malloc(fd, &nl); + if (r < 0) + log_debug_errno(r, "Failed to enumerate xattrs on '%s', ignoring: %m", path); + + NULSTR_FOREACH(xa, nl) { + _cleanup_free_ char *x = NULL, *y = NULL, *buf = NULL; + int n; + + if (!STARTSWITH_SET(xa, "user.", "trusted.")) + continue; + + n = fgetxattr_malloc(fd, xa, &buf); + if (n < 0) { + /* The failure code of fgetxattr_malloc() is in n; r still holds the earlier flistxattr_malloc() result, so log n. */ + log_debug_errno(n, "Failed to read xattr '%s' off '%s', ignoring: %m", xa, path); + continue; + } + + x = cescape(xa); + if (!x) + return -ENOMEM; + + y = cescape_length(buf, n); + if (!y) + return -ENOMEM; + + printf("%s%s%s %s%s%s: %s\n", + prefix, + glyph == SPECIAL_GLYPH_TREE_BRANCH ? special_glyph(SPECIAL_GLYPH_TREE_VERTICAL) : " ", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + ansi_blue(), x, ansi_normal(), + y); + } + } + + return 0; +} + +int show_cgroup_by_path( + const char *path, + const char *prefix, + size_t n_columns, + OutputFlags flags) { + + _cleanup_free_ char *fn = NULL, *p1 = NULL, *last = NULL, *p2 = NULL; + _cleanup_closedir_ DIR *d = NULL; + bool shown_pids = false; + char *gn = NULL; + int r; + + assert(path); + + if (n_columns <= 0) + n_columns = columns(); + + prefix = strempty(prefix); + + r = cg_mangle_path(path, &fn); + if (r < 0) + return r; + + d = opendir(fn); + if (!d) + return -errno; + + while ((r = cg_read_subgroup(d, &gn)) > 0) { + _cleanup_free_ char *k = NULL; + + k = path_join(fn, gn); + free(gn); + if (!k) + return -ENOMEM; + + if (!(flags & OUTPUT_SHOW_ALL) && cg_is_empty_recursive(NULL, k) > 0) + continue; + + if (!shown_pids) { + (void) show_cgroup_one_by_path(path, prefix, n_columns, true, flags); + shown_pids = true; + } + + if (last) { + r = 
show_cgroup_name(last, prefix, SPECIAL_GLYPH_TREE_BRANCH, flags); + if (r < 0) + return r; + + if (!p1) { + p1 = strjoin(prefix, special_glyph(SPECIAL_GLYPH_TREE_VERTICAL)); + if (!p1) + return -ENOMEM; + } + + show_cgroup_by_path(last, p1, n_columns-2, flags); + free(last); + } + + last = TAKE_PTR(k); + } + + if (r < 0) + return r; + + if (!shown_pids) + (void) show_cgroup_one_by_path(path, prefix, n_columns, !!last, flags); + + if (last) { + r = show_cgroup_name(last, prefix, SPECIAL_GLYPH_TREE_RIGHT, flags); + if (r < 0) + return r; + + if (!p2) { + p2 = strjoin(prefix, " "); + if (!p2) + return -ENOMEM; + } + + show_cgroup_by_path(last, p2, n_columns-2, flags); + } + + return 0; +} + +int show_cgroup(const char *controller, + const char *path, + const char *prefix, + size_t n_columns, + OutputFlags flags) { + _cleanup_free_ char *p = NULL; + int r; + + assert(path); + + r = cg_get_path(controller, path, NULL, &p); + if (r < 0) + return r; + + return show_cgroup_by_path(p, prefix, n_columns, flags); +} + +static int show_extra_pids( + const char *controller, + const char *path, + const char *prefix, + size_t n_columns, + const pid_t pids[], + size_t n_pids, + OutputFlags flags) { + + _cleanup_free_ pid_t *copy = NULL; + size_t i, j; + int r; + + assert(path); + + if (n_pids <= 0) + return 0; + + if (n_columns <= 0) + n_columns = columns(); + + prefix = strempty(prefix); + + copy = new(pid_t, n_pids); + if (!copy) + return -ENOMEM; + + for (i = 0, j = 0; i < n_pids; i++) { + _cleanup_free_ char *k = NULL; + + r = cg_pid_get_path(controller, pids[i], &k); + if (r < 0) + return r; + + if (path_startswith(k, path)) + continue; + + copy[j++] = pids[i]; + } + + show_pid_array(copy, j, prefix, n_columns, true, false, flags); + + return 0; +} + +int show_cgroup_and_extra( + const char *controller, + const char *path, + const char *prefix, + size_t n_columns, + const pid_t extra_pids[], + size_t n_extra_pids, + OutputFlags flags) { + + int r; + + assert(path); + + r = 
show_cgroup(controller, path, prefix, n_columns, flags); + if (r < 0) + return r; + + return show_extra_pids(controller, path, prefix, n_columns, extra_pids, n_extra_pids, flags); +} + +int show_cgroup_get_unit_path_and_warn( + sd_bus *bus, + const char *unit, + char **ret) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *path = NULL; + int r; + + path = unit_dbus_path_from_name(unit); + if (!path) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + path, + unit_dbus_interface_from_name(unit), + "ControlGroup", + &error, + ret); + if (r < 0) + return log_error_errno(r, "Failed to query unit control group path: %s", + bus_error_message(&error, r)); + + return 0; +} + +int show_cgroup_get_path_and_warn( + const char *machine, + const char *prefix, + char **ret) { + + _cleanup_free_ char *root = NULL; + int r; + + if (machine) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *unit = NULL; + const char *m; + + if (!hostname_is_valid(machine, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Machine name is not valid: %s", machine); + + m = strjoina("/run/systemd/machines/", machine); + r = parse_env_file(NULL, m, "SCOPE", &unit); + if (r < 0) + return log_error_errno(r, "Failed to load machine data: %m"); + + r = bus_connect_transport_systemd(BUS_TRANSPORT_LOCAL, NULL, RUNTIME_SCOPE_SYSTEM, &bus); + if (r < 0) + return bus_log_connect_error(r, BUS_TRANSPORT_LOCAL); + + r = show_cgroup_get_unit_path_and_warn(bus, unit, &root); + if (r < 0) + return r; + } else { + r = cg_get_root_path(&root); + if (r == -ENOMEDIUM) + return log_error_errno(r, "Failed to get root control group path.\n" + "No cgroup filesystem mounted on /sys/fs/cgroup"); + if (r < 0) + return log_error_errno(r, "Failed to get root control group path: %m"); + } + + if (prefix) { + char *t; + + t = path_join(root, prefix); + if (!t) + return log_oom(); + + *ret = t; + } else + 
*ret = TAKE_PTR(root); + + return 0; +} diff --git a/src/shared/cgroup-show.h b/src/shared/cgroup-show.h new file mode 100644 index 0000000..db3c9c9 --- /dev/null +++ b/src/shared/cgroup-show.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-bus.h" + +#include "logs-show.h" +#include "output-mode.h" + +int show_cgroup_by_path(const char *path, const char *prefix, size_t n_columns, OutputFlags flags); +int show_cgroup(const char *controller, const char *path, const char *prefix, size_t n_columns, OutputFlags flags); + +int show_cgroup_and_extra(const char *controller, const char *path, const char *prefix, size_t n_columns, const pid_t extra_pids[], size_t n_extra_pids, OutputFlags flags); + +int show_cgroup_get_unit_path_and_warn( + sd_bus *bus, + const char *unit, + char **ret); +int show_cgroup_get_path_and_warn( + const char *machine, + const char *prefix, + char **ret); diff --git a/src/shared/chown-recursive.c b/src/shared/chown-recursive.c new file mode 100644 index 0000000..6aa5f67 --- /dev/null +++ b/src/shared/chown-recursive.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "chown-recursive.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "stdio-util.h" +#include "strv.h" +#include "user-util.h" + +static int chown_one( + int fd, + const struct stat *st, + uid_t uid, + gid_t gid, + mode_t mask) { + + int r; + + assert(fd >= 0); + assert(st); + + /* We change ACLs through the /proc/self/fd/%i path, so that we have a stable reference that works + * with O_PATH. 
*/ + + /* Drop any ACL if there is one */ + FOREACH_STRING(n, "system.posix_acl_access", "system.posix_acl_default") + if (removexattr(FORMAT_PROC_FD_PATH(fd), n) < 0) + if (!ERRNO_IS_XATTR_ABSENT(errno)) + return -errno; + + r = fchmod_and_chown(fd, st->st_mode & mask, uid, gid); + if (r < 0) + return r; + + return 1; +} + +static int chown_recursive_internal( + int fd, + const struct stat *st, + uid_t uid, + gid_t gid, + mode_t mask) { + + _cleanup_closedir_ DIR *d = NULL; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + d = fdopendir(fd); + if (!d) { + safe_close(fd); + return -errno; + } + + FOREACH_DIRENT_ALL(de, d, return -errno) { + _cleanup_close_ int path_fd = -EBADF; + struct stat fst; + + if (dot_or_dot_dot(de->d_name)) + continue; + + /* Let's pin the child inode we want to fix now with an O_PATH fd, so that it cannot be swapped out + * while we manipulate it. */ + path_fd = openat(dirfd(d), de->d_name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (path_fd < 0) + return -errno; + + if (fstat(path_fd, &fst) < 0) + return -errno; + + if (S_ISDIR(fst.st_mode)) { + int subdir_fd; + + /* Convert it to a "real" (i.e. non-O_PATH) fd now */ + subdir_fd = fd_reopen(path_fd, O_RDONLY|O_CLOEXEC|O_NOATIME); + if (subdir_fd < 0) + return subdir_fd; + + r = chown_recursive_internal(subdir_fd, &fst, uid, gid, mask); /* takes possession of subdir_fd even on failure */ + if (r < 0) + return r; + if (r > 0) + changed = true; + } else { + r = chown_one(path_fd, &fst, uid, gid, mask); + if (r < 0) + return r; + if (r > 0) + changed = true; + } + } + + r = chown_one(dirfd(d), st, uid, gid, mask); + if (r < 0) + return r; + + return r > 0 || changed; +} + +int path_chown_recursive( + const char *path, + uid_t uid, + gid_t gid, + mode_t mask, + int flags) { + + _cleanup_close_ int fd = -EBADF; + struct stat st; + + assert((flags & ~AT_SYMLINK_FOLLOW) == 0); + + fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOATIME|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 
0 : O_NOFOLLOW)); + if (fd < 0) + return -errno; + + if (!uid_is_valid(uid) && !gid_is_valid(gid) && FLAGS_SET(mask, 07777)) + return 0; /* nothing to do */ + + if (fstat(fd, &st) < 0) + return -errno; + + /* Let's take a shortcut: if the top-level directory is properly owned, we don't descend into the + * whole tree, under the assumption that all is OK anyway. */ + if ((!uid_is_valid(uid) || st.st_uid == uid) && + (!gid_is_valid(gid) || st.st_gid == gid) && + ((st.st_mode & ~mask & 07777) == 0)) + return 0; + + return chown_recursive_internal(TAKE_FD(fd), &st, uid, gid, mask); /* we donate the fd to the call, regardless if it succeeded or failed */ +} + +int fd_chown_recursive( + int fd, + uid_t uid, + gid_t gid, + mode_t mask) { + + int duplicated_fd = -EBADF; + struct stat st; + + /* Note that the slightly different order of fstat() and the checks here and in + * path_chown_recursive(). That's because when we open the directory ourselves we can specify + * O_DIRECTORY and we always want to ensure we are operating on a directory before deciding whether + * the operation is otherwise redundant. 
*/ + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISDIR(st.st_mode)) + return -ENOTDIR; + + if (!uid_is_valid(uid) && !gid_is_valid(gid) && FLAGS_SET(mask, 07777)) + return 0; /* nothing to do */ + + /* Shortcut, as above */ + if ((!uid_is_valid(uid) || st.st_uid == uid) && + (!gid_is_valid(gid) || st.st_gid == gid) && + ((st.st_mode & ~mask & 07777) == 0)) + return 0; + + /* Let's duplicate the fd here, as opendir() wants to take possession of it and close it afterwards */ + duplicated_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (duplicated_fd < 0) + return -errno; + + return chown_recursive_internal(duplicated_fd, &st, uid, gid, mask); /* fd donated even on failure */ +} diff --git a/src/shared/chown-recursive.h b/src/shared/chown-recursive.h new file mode 100644 index 0000000..2aab8e7 --- /dev/null +++ b/src/shared/chown-recursive.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int path_chown_recursive(const char *path, uid_t uid, gid_t gid, mode_t mask, int flags); + +int fd_chown_recursive(int fd, uid_t uid, gid_t gid, mode_t mask); diff --git a/src/shared/clean-ipc.c b/src/shared/clean-ipc.c new file mode 100644 index 0000000..bbb343f --- /dev/null +++ b/src/shared/clean-ipc.c @@ -0,0 +1,452 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clean-ipc.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +static bool match_uid_gid(uid_t subject_uid, gid_t subject_gid, uid_t delete_uid, gid_t delete_gid) { + + if (uid_is_valid(delete_uid) && subject_uid == delete_uid) + return true; + + if (gid_is_valid(delete_gid) && subject_gid == delete_gid) + return true; + + return false; +} + +static int clean_sysvipc_shm(uid_t 
delete_uid, gid_t delete_gid, bool rm) { + _cleanup_fclose_ FILE *f = NULL; + bool first = true; + int ret = 0, r; + + f = fopen("/proc/sysvipc/shm", "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open /proc/sysvipc/shm: %m"); + } + + for (;;) { + _cleanup_free_ char *line = NULL; + unsigned n_attached; + pid_t cpid, lpid; + uid_t uid, cuid; + gid_t gid, cgid; + int shmid; + + r = read_line(f, LONG_LINE_MAX, &line); + /* read_line() reports failure through its negative return value, so log r (as the sem/msg variants do) rather than errno. */ + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/sysvipc/shm: %m"); + if (r == 0) + break; + + if (first) { + first = false; + continue; + } + + if (sscanf(line, "%*i %i %*o %*u " PID_FMT " " PID_FMT " %u " UID_FMT " " GID_FMT " " UID_FMT " " GID_FMT, + &shmid, &cpid, &lpid, &n_attached, &uid, &gid, &cuid, &cgid) != 8) + continue; + + if (n_attached > 0) + continue; + + if (!match_uid_gid(uid, gid, delete_uid, delete_gid)) + continue; + + if (!rm) + return 1; + + if (shmctl(shmid, IPC_RMID, NULL) < 0) { + + /* Ignore entries that are already deleted */ + if (IN_SET(errno, EIDRM, EINVAL)) + continue; + + ret = log_warning_errno(errno, + "Failed to remove SysV shared memory segment %i: %m", + shmid); + } else { + log_debug("Removed SysV shared memory segment %i.", shmid); + if (ret == 0) + ret = 1; + } + } + + return ret; +} + +static int clean_sysvipc_sem(uid_t delete_uid, gid_t delete_gid, bool rm) { + _cleanup_fclose_ FILE *f = NULL; + bool first = true; + int ret = 0, r; + + f = fopen("/proc/sysvipc/sem", "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open /proc/sysvipc/sem: %m"); + } + + for (;;) { + _cleanup_free_ char *line = NULL; + uid_t uid, cuid; + gid_t gid, cgid; + int semid; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/sysvipc/sem: %m"); + if (r == 0) + break; + + if (first) { + first = false; + continue; + } + + if (sscanf(line, "%*i %i %*o %*u " 
UID_FMT " " GID_FMT " " UID_FMT " " GID_FMT, + &semid, &uid, &gid, &cuid, &cgid) != 5) + continue; + + if (!match_uid_gid(uid, gid, delete_uid, delete_gid)) + continue; + + if (!rm) + return 1; + + if (semctl(semid, 0, IPC_RMID) < 0) { + + /* Ignore entries that are already deleted */ + if (IN_SET(errno, EIDRM, EINVAL)) + continue; + + ret = log_warning_errno(errno, + "Failed to remove SysV semaphores object %i: %m", + semid); + } else { + log_debug("Removed SysV semaphore %i.", semid); + if (ret == 0) + ret = 1; + } + } + + return ret; +} + +static int clean_sysvipc_msg(uid_t delete_uid, gid_t delete_gid, bool rm) { + _cleanup_fclose_ FILE *f = NULL; + bool first = true; + int ret = 0, r; + + f = fopen("/proc/sysvipc/msg", "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open /proc/sysvipc/msg: %m"); + } + + for (;;) { + _cleanup_free_ char *line = NULL; + uid_t uid, cuid; + gid_t gid, cgid; + pid_t cpid, lpid; + int msgid; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/sysvipc/msg: %m"); + if (r == 0) + break; + + if (first) { + first = false; + continue; + } + + if (sscanf(line, "%*i %i %*o %*u %*u " PID_FMT " " PID_FMT " " UID_FMT " " GID_FMT " " UID_FMT " " GID_FMT, + &msgid, &cpid, &lpid, &uid, &gid, &cuid, &cgid) != 7) + continue; + + if (!match_uid_gid(uid, gid, delete_uid, delete_gid)) + continue; + + if (!rm) + return 1; + + if (msgctl(msgid, IPC_RMID, NULL) < 0) { + + /* Ignore entries that are already deleted */ + if (IN_SET(errno, EIDRM, EINVAL)) + continue; + + ret = log_warning_errno(errno, + "Failed to remove SysV message queue %i: %m", + msgid); + } else { + log_debug("Removed SysV message queue %i.", msgid); + if (ret == 0) + ret = 1; + } + } + + return ret; +} + +static int clean_posix_shm_internal(const char *dirname, DIR *dir, uid_t uid, gid_t gid, bool rm) { + int ret = 0, r; + + assert(dir); + + FOREACH_DIRENT_ALL(de, dir, goto 
fail) { + struct stat st; + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (fstatat(dirfd(dir), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) { + if (errno == ENOENT) + continue; + + ret = log_warning_errno(errno, "Failed to stat() POSIX shared memory segment %s/%s: %m", + dirname, de->d_name); + continue; + } + + if (S_ISDIR(st.st_mode)) { + _cleanup_closedir_ DIR *kid = NULL; + + kid = xopendirat(dirfd(dir), de->d_name, O_NOFOLLOW|O_NOATIME); + if (!kid) { + if (errno != ENOENT) + ret = log_warning_errno(errno, "Failed to enter shared memory directory %s/%s: %m", + dirname, de->d_name); + } else { + r = clean_posix_shm_internal(de->d_name, kid, uid, gid, rm); + if (r < 0) + ret = r; + } + + if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid)) + continue; + + if (!rm) + return 1; + + if (unlinkat(dirfd(dir), de->d_name, AT_REMOVEDIR) < 0) { + + if (errno == ENOENT) + continue; + + ret = log_warning_errno(errno, "Failed to remove POSIX shared memory directory %s/%s: %m", + dirname, de->d_name); + } else { + log_debug("Removed POSIX shared memory directory %s", de->d_name); + if (ret == 0) + ret = 1; + } + } else { + + if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid)) + continue; + + if (!rm) + return 1; + + if (unlinkat(dirfd(dir), de->d_name, 0) < 0) { + + if (errno == ENOENT) + continue; + + ret = log_warning_errno(errno, "Failed to remove POSIX shared memory segment %s: %m", de->d_name); + } else { + log_debug("Removed POSIX shared memory segment %s", de->d_name); + if (ret == 0) + ret = 1; + } + } + } + + return ret; + +fail: + return log_warning_errno(errno, "Failed to read /dev/shm: %m"); +} + +static int clean_posix_shm(uid_t uid, gid_t gid, bool rm) { + _cleanup_closedir_ DIR *dir = NULL; + + dir = opendir("/dev/shm"); + if (!dir) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open /dev/shm: %m"); + } + + return clean_posix_shm_internal("/dev/shm", dir, uid, gid, rm); +} + +static int clean_posix_mq(uid_t uid, gid_t 
gid, bool rm) { + _cleanup_closedir_ DIR *dir = NULL; + int ret = 0; + + dir = opendir("/dev/mqueue"); + if (!dir) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to open /dev/mqueue: %m"); + } + + FOREACH_DIRENT_ALL(de, dir, goto fail) { + struct stat st; + char fn[1+strlen(de->d_name)+1]; + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (fstatat(dirfd(dir), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) { + if (errno == ENOENT) + continue; + + ret = log_warning_errno(errno, + "Failed to stat() MQ segment %s: %m", + de->d_name); + continue; + } + + if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid)) + continue; + + if (!rm) + return 1; + + fn[0] = '/'; + strcpy(fn+1, de->d_name); + + if (mq_unlink(fn) < 0) { + if (errno == ENOENT) + continue; + + ret = log_warning_errno(errno, + "Failed to unlink POSIX message queue %s: %m", + fn); + } else { + log_debug("Removed POSIX message queue %s", fn); + if (ret == 0) + ret = 1; + } + } + + return ret; + +fail: + return log_warning_errno(errno, "Failed to read /dev/mqueue: %m"); +} + +int clean_ipc_internal(uid_t uid, gid_t gid, bool rm) { + int ret = 0, r; + + /* If 'rm' is true, clean all IPC objects owned by either the specified UID or the specified GID. Return the + * last error encountered or == 0 if no matching IPC objects have been found or > 0 if matching IPC objects + * have been found and have been removed. + * + * If 'rm' is false, just search for IPC objects owned by either the specified UID or the specified GID. In + * this case we return < 0 on error, > 0 if we found a matching object, == 0 if we didn't. + * + * As special rule: if UID/GID is specified as root we'll silently not clean up things, and always claim that + * there are IPC objects for it. */ + + if (uid == 0) { + if (!rm) + return 1; + + uid = UID_INVALID; + } + if (gid == 0) { + if (!rm) + return 1; + + gid = GID_INVALID; + } + + /* Anything to do? 
*/ + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + r = clean_sysvipc_shm(uid, gid, rm); + if (r != 0) { + if (!rm) + return r; + if (ret == 0) + ret = r; + } + + r = clean_sysvipc_sem(uid, gid, rm); + if (r != 0) { + if (!rm) + return r; + if (ret == 0) + ret = r; + } + + r = clean_sysvipc_msg(uid, gid, rm); + if (r != 0) { + if (!rm) + return r; + if (ret == 0) + ret = r; + } + + r = clean_posix_shm(uid, gid, rm); + if (r != 0) { + if (!rm) + return r; + if (ret == 0) + ret = r; + } + + r = clean_posix_mq(uid, gid, rm); + if (r != 0) { + if (!rm) + return r; + if (ret == 0) + ret = r; + } + + return ret; +} + +int clean_ipc_by_uid(uid_t uid) { + return clean_ipc_internal(uid, GID_INVALID, true); +} + +int clean_ipc_by_gid(gid_t gid) { + return clean_ipc_internal(UID_INVALID, gid, true); +} diff --git a/src/shared/clean-ipc.h b/src/shared/clean-ipc.h new file mode 100644 index 0000000..ed348fb --- /dev/null +++ b/src/shared/clean-ipc.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "user-util.h" + +int clean_ipc_internal(uid_t uid, gid_t gid, bool rm); + +/* Remove all IPC objects owned by the specified UID or GID */ +int clean_ipc_by_uid(uid_t uid); +int clean_ipc_by_gid(gid_t gid); + +/* Check if any IPC object owned by the specified UID or GID exists, returns > 0 if so, == 0 if not */ +static inline int search_ipc(uid_t uid, gid_t gid) { + return clean_ipc_internal(uid, gid, false); +} diff --git a/src/shared/clock-util.c b/src/shared/clock-util.c new file mode 100644 index 0000000..b0cbe30 --- /dev/null +++ b/src/shared/clock-util.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "clock-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "macro.h" +#include "string-util.h" + +int clock_get_hwclock(struct tm 
*tm) { + _cleanup_close_ int fd = -EBADF; + + assert(tm); + + fd = open("/dev/rtc", O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + /* This leaves the timezone fields of struct tm + * uninitialized! */ + if (ioctl(fd, RTC_RD_TIME, tm) < 0) + return -errno; + + /* We don't know daylight saving, so we reset this in order not + * to confuse mktime(). */ + tm->tm_isdst = -1; + + return 0; +} + +int clock_set_hwclock(const struct tm *tm) { + _cleanup_close_ int fd = -EBADF; + + assert(tm); + + fd = open("/dev/rtc", O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + return RET_NERRNO(ioctl(fd, RTC_SET_TIME, tm)); +} + +int clock_is_localtime(const char* adjtime_path) { + _cleanup_fclose_ FILE *f = NULL; + int r; + + if (!adjtime_path) + adjtime_path = "/etc/adjtime"; + + /* + * The third line of adjtime is "UTC" or "LOCAL" or nothing. + * # /etc/adjtime + * 0.0 0 0 + * 0 + * UTC + */ + f = fopen(adjtime_path, "re"); + if (f) { + _cleanup_free_ char *line = NULL; + unsigned i; + + for (i = 0; i < 2; i++) { /* skip the first two lines */ + r = read_line(f, LONG_LINE_MAX, NULL); + if (r < 0) + return r; + if (r == 0) + return false; /* less than three lines → default to UTC */ + } + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + return false; /* less than three lines → default to UTC */ + + return streq(line, "LOCAL"); + + } else if (errno != ENOENT) + return -errno; + + /* adjtime not present → default to UTC */ + return false; +} + +int clock_set_timezone(int *ret_minutesdelta) { + struct timespec ts; + struct tm tm; + int minutesdelta; + struct timezone tz; + + assert_se(clock_gettime(CLOCK_REALTIME, &ts) == 0); + assert_se(localtime_r(&ts.tv_sec, &tm)); + minutesdelta = tm.tm_gmtoff / 60; + + tz = (struct timezone) { + .tz_minuteswest = -minutesdelta, + .tz_dsttime = 0, /* DST_NONE */ + }; + + /* If the RTC does not run in UTC but in local time, the very first call to settimeofday() will set + * the kernel's timezone and 
will warp the system clock, so that it runs in UTC instead of the local + * time we have read from the RTC. */ + if (settimeofday(NULL, &tz) < 0) + return -errno; + + if (ret_minutesdelta) + *ret_minutesdelta = minutesdelta; + + return 0; +} + +int clock_reset_timewarp(void) { + static const struct timezone tz = { + .tz_minuteswest = 0, + .tz_dsttime = 0, /* DST_NONE */ + }; + + /* The very first call to settimeofday() does time warp magic. Do a dummy call here, so the time + * warping is sealed and all later calls behave as expected. */ + return RET_NERRNO(settimeofday(NULL, &tz)); +} + +#define EPOCH_FILE "/usr/lib/clock-epoch" + +int clock_apply_epoch(ClockChangeDirection *ret_attempted_change) { + usec_t epoch_usec, now_usec; + struct stat st; + + /* NB: we update *ret_attempted_change in *all* cases, both + * on success and failure, to indicate what we intended to do! */ + + assert(ret_attempted_change); + + if (stat(EPOCH_FILE, &st) < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Cannot stat " EPOCH_FILE ": %m"); + + epoch_usec = (usec_t) TIME_EPOCH * USEC_PER_SEC; + } else + epoch_usec = timespec_load(&st.st_mtim); + + now_usec = now(CLOCK_REALTIME); + if (now_usec < epoch_usec) + *ret_attempted_change = CLOCK_CHANGE_FORWARD; + else if (CLOCK_VALID_RANGE_USEC_MAX > 0 && now_usec > usec_add(epoch_usec, CLOCK_VALID_RANGE_USEC_MAX)) + *ret_attempted_change = CLOCK_CHANGE_BACKWARD; + else { + *ret_attempted_change = CLOCK_CHANGE_NOOP; + return 0; + } + + if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(epoch_usec)) < 0) + return -errno; + + return 1; +} diff --git a/src/shared/clock-util.h b/src/shared/clock-util.h new file mode 100644 index 0000000..c8f6d1b --- /dev/null +++ b/src/shared/clock-util.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +typedef enum ClockChangeDirection { + CLOCK_CHANGE_NOOP, + CLOCK_CHANGE_FORWARD, + CLOCK_CHANGE_BACKWARD, + _CLOCK_CHANGE_MAX, + 
_CLOCK_CHANGE_INVALID = -EINVAL, +} ClockChangeDirection; + +int clock_is_localtime(const char* adjtime_path); +int clock_set_timezone(int *ret_minutesdelta); +int clock_reset_timewarp(void); +int clock_get_hwclock(struct tm *tm); +int clock_set_hwclock(const struct tm *tm); +int clock_apply_epoch(ClockChangeDirection *ret_attempted_change); diff --git a/src/shared/common-signal.c b/src/shared/common-signal.c new file mode 100644 index 0000000..8e70e36 --- /dev/null +++ b/src/shared/common-signal.c @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "common-signal.h" +#include "fd-util.h" +#include "fileio.h" +#include "memstream-util.h" +#include "process-util.h" +#include "signal-util.h" + +int sigrtmin18_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + struct sigrtmin18_info *info = userdata; + _cleanup_free_ char *comm = NULL; + + assert(s); + assert(si); + + (void) pid_get_comm(si->ssi_pid, &comm); + + if (si->ssi_code != SI_QUEUE) { + log_notice("Received control signal %s from process " PID_FMT " (%s) without command value, ignoring.", + signal_to_string(si->ssi_signo), + (pid_t) si->ssi_pid, + strna(comm)); + return 0; + } + + log_debug("Received control signal %s from process " PID_FMT " (%s) with command 0x%08x.", + signal_to_string(si->ssi_signo), + (pid_t) si->ssi_pid, + strna(comm), + (unsigned) si->ssi_int); + + switch (si->ssi_int) { + + case _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE..._COMMON_SIGNAL_COMMAND_LOG_LEVEL_END: + log_set_max_level(si->ssi_int - _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE); + break; + + case COMMON_SIGNAL_COMMAND_CONSOLE: + log_set_target_and_open(LOG_TARGET_CONSOLE); + break; + case COMMON_SIGNAL_COMMAND_JOURNAL: + log_set_target_and_open(LOG_TARGET_JOURNAL); + break; + case COMMON_SIGNAL_COMMAND_KMSG: + log_set_target_and_open(LOG_TARGET_KMSG); + break; + case COMMON_SIGNAL_COMMAND_NULL: + log_set_target_and_open(LOG_TARGET_NULL); + break; + + case 
COMMON_SIGNAL_COMMAND_MEMORY_PRESSURE: + if (info && info->memory_pressure_handler) + return info->memory_pressure_handler(s, info->memory_pressure_userdata); + + sd_event_trim_memory(); + break; + + case COMMON_SIGNAL_COMMAND_MALLOC_INFO: { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + f = memstream_init(&m); + if (!f) { + log_oom(); + break; + } + + if (malloc_info(0, f) < 0) { + log_error_errno(errno, "Failed to invoke malloc_info(): %m"); + break; + } + + (void) memstream_dump(LOG_INFO, &m); + break; + } + + default: + log_notice("Received control signal %s with unknown command 0x%08x, ignoring.", + signal_to_string(si->ssi_signo), (unsigned) si->ssi_int); + break; + } + + return 0; +} diff --git a/src/shared/common-signal.h b/src/shared/common-signal.h new file mode 100644 index 0000000..1fe7b76 --- /dev/null +++ b/src/shared/common-signal.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include + +/* All our long-running services should implement a SIGRTMIN+18 handler that can be used to trigger certain + * actions that affect service runtime. The specific action is indicated via the "value integer" you can pass + * along realtime signals. This is mostly intended for debugging purposes and is entirely asynchronous in + * nature. Specifically, these are the commands: + * + * Currently available operations: + * + * • Change maximum log level + * • Change log target + * • Invoke memory trimming, like under memory pressure + * • Write glibc malloc() allocation info to logs + * + * How to use this? Via a command like the following: + * + * /usr/bin/kill -s RTMIN+18 -q 768 1 + * + * (This will tell PID 1 to trim its memory use.) + * + * or: + * + * systemctl kill --kill-value=0x300 -s RTMIN+18 systemd-journald + * + * (This will tell journald to trim its memory use.) 
+ */ + +enum { + _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE = 0x100, + COMMON_SIGNAL_COMMAND_LOG_EMERG = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_EMERG, + COMMON_SIGNAL_COMMAND_LOG_ALERT = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_ALERT, + COMMON_SIGNAL_COMMAND_LOG_CRIT = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_CRIT, + COMMON_SIGNAL_COMMAND_LOG_ERR = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_ERR, + COMMON_SIGNAL_COMMAND_LOG_WARNING = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_WARNING, + COMMON_SIGNAL_COMMAND_LOG_NOTICE = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_NOTICE, + COMMON_SIGNAL_COMMAND_LOG_INFO = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_INFO, + COMMON_SIGNAL_COMMAND_LOG_DEBUG = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_DEBUG, + _COMMON_SIGNAL_COMMAND_LOG_LEVEL_END = COMMON_SIGNAL_COMMAND_LOG_DEBUG, + + COMMON_SIGNAL_COMMAND_CONSOLE = 0x200, + COMMON_SIGNAL_COMMAND_JOURNAL, + COMMON_SIGNAL_COMMAND_KMSG, + COMMON_SIGNAL_COMMAND_NULL, + + COMMON_SIGNAL_COMMAND_MEMORY_PRESSURE = 0x300, + COMMON_SIGNAL_COMMAND_MALLOC_INFO, + + /* Private signals start at 0x500 */ + _COMMON_SIGNAL_COMMAND_PRIVATE_BASE = 0x500, + _COMMON_SIGNAL_COMMAND_PRIVATE_END = 0xfff, +}; + +struct sigrtmin18_info { + sd_event_handler_t memory_pressure_handler; + void *memory_pressure_userdata; +}; + +int sigrtmin18_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata); diff --git a/src/shared/compare-operator.c b/src/shared/compare-operator.c new file mode 100644 index 0000000..0da28fc --- /dev/null +++ b/src/shared/compare-operator.c @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "compare-operator.h" +#include "string-util.h" + +CompareOperator parse_compare_operator(const char **s, CompareOperatorParseFlags flags) { + static const struct { + CompareOperator op; + const char *str; + CompareOperatorParseFlags valid_mask; /* If this operator appears when flags in mask not set, fail */ + 
CompareOperatorParseFlags need_mask; /* Skip over this operator when flags in mask not set */ + } table[] = { + { COMPARE_FNMATCH_EQUAL, "$=", .valid_mask = COMPARE_ALLOW_FNMATCH }, + { COMPARE_FNMATCH_UNEQUAL, "!$=", .valid_mask = COMPARE_ALLOW_FNMATCH }, + + { COMPARE_UNEQUAL, "<>" }, + { COMPARE_LOWER_OR_EQUAL, "<=" }, + { COMPARE_GREATER_OR_EQUAL, ">=" }, + { COMPARE_LOWER, "<" }, + { COMPARE_GREATER, ">" }, + { COMPARE_EQUAL, "==" }, + { COMPARE_STRING_EQUAL, "=", .need_mask = COMPARE_EQUAL_BY_STRING }, + { COMPARE_EQUAL, "=" }, + { COMPARE_STRING_UNEQUAL, "!=", .need_mask = COMPARE_EQUAL_BY_STRING }, + { COMPARE_UNEQUAL, "!=" }, + + { COMPARE_LOWER, "lt", .valid_mask = COMPARE_ALLOW_TEXTUAL }, + { COMPARE_LOWER_OR_EQUAL, "le", .valid_mask = COMPARE_ALLOW_TEXTUAL }, + { COMPARE_EQUAL, "eq", .valid_mask = COMPARE_ALLOW_TEXTUAL }, + { COMPARE_UNEQUAL, "ne", .valid_mask = COMPARE_ALLOW_TEXTUAL }, + { COMPARE_GREATER_OR_EQUAL, "ge", .valid_mask = COMPARE_ALLOW_TEXTUAL }, + { COMPARE_GREATER, "gt", .valid_mask = COMPARE_ALLOW_TEXTUAL }, + }; + + assert(s); + + if (!*s) /* Hmm, we already reached the end, for example because extract_first_word() and + * parse_compare_operator() are used on the same string? 
*/ + return _COMPARE_OPERATOR_INVALID; + + for (size_t i = 0; i < ELEMENTSOF(table); i ++) { + const char *e; + + if (table[i].need_mask != 0 && !FLAGS_SET(flags, table[i].need_mask)) + continue; + + e = startswith(*s, table[i].str); + if (e) { + if (table[i].valid_mask != 0 && !FLAGS_SET(flags, table[i].valid_mask)) + return _COMPARE_OPERATOR_INVALID; + + *s = e; + return table[i].op; + } + } + + return _COMPARE_OPERATOR_INVALID; +} + +int test_order(int k, CompareOperator op) { + + switch (op) { + + case COMPARE_LOWER: + return k < 0; + + case COMPARE_LOWER_OR_EQUAL: + return k <= 0; + + case COMPARE_EQUAL: + return k == 0; + + case COMPARE_UNEQUAL: + return k != 0; + + case COMPARE_GREATER_OR_EQUAL: + return k >= 0; + + case COMPARE_GREATER: + return k > 0; + + default: + return -EINVAL; + } +} + +int version_or_fnmatch_compare( + CompareOperator op, + const char *a, + const char *b) { + int r; + + switch (op) { + + case COMPARE_STRING_EQUAL: + return streq_ptr(a, b); + + case COMPARE_STRING_UNEQUAL: + return !streq_ptr(a, b); + + case COMPARE_FNMATCH_EQUAL: + r = fnmatch(b, a, 0); + return r == 0 ? true : + r == FNM_NOMATCH ? false : -EINVAL; + + case COMPARE_FNMATCH_UNEQUAL: + r = fnmatch(b, a, 0); + return r == FNM_NOMATCH ? true: + r == 0 ? false : -EINVAL; + + case _COMPARE_OPERATOR_ORDER_FIRST..._COMPARE_OPERATOR_ORDER_LAST: + return test_order(strverscmp_improved(a, b), op); + + default: + return -EINVAL; + } +} diff --git a/src/shared/compare-operator.h b/src/shared/compare-operator.h new file mode 100644 index 0000000..900f3e5 --- /dev/null +++ b/src/shared/compare-operator.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#define COMPARE_OPERATOR_CHARS "!<=>" +#define COMPARE_OPERATOR_WITH_FNMATCH_CHARS COMPARE_OPERATOR_CHARS "$" + +typedef enum CompareOperator { + /* Listed in order of checking. Note that some comparators are prefixes of others, hence the longest + * should be listed first. 
*/ + + /* Simple string compare operators */ + _COMPARE_OPERATOR_STRING_FIRST, + COMPARE_STRING_EQUAL = _COMPARE_OPERATOR_STRING_FIRST, + COMPARE_STRING_UNEQUAL, + _COMPARE_OPERATOR_STRING_LAST = COMPARE_STRING_UNEQUAL, + + /* fnmatch() compare operators */ + _COMPARE_OPERATOR_FNMATCH_FIRST, + COMPARE_FNMATCH_EQUAL = _COMPARE_OPERATOR_FNMATCH_FIRST, + COMPARE_FNMATCH_UNEQUAL, + _COMPARE_OPERATOR_FNMATCH_LAST = COMPARE_FNMATCH_UNEQUAL, + + /* Order compare operators */ + _COMPARE_OPERATOR_ORDER_FIRST, + COMPARE_LOWER_OR_EQUAL = _COMPARE_OPERATOR_ORDER_FIRST, + COMPARE_GREATER_OR_EQUAL, + COMPARE_LOWER, + COMPARE_GREATER, + COMPARE_EQUAL, + COMPARE_UNEQUAL, + _COMPARE_OPERATOR_ORDER_LAST = COMPARE_UNEQUAL, + + _COMPARE_OPERATOR_MAX, + _COMPARE_OPERATOR_INVALID = -EINVAL, +} CompareOperator; + +static inline bool COMPARE_OPERATOR_IS_STRING(CompareOperator c) { + return c >= _COMPARE_OPERATOR_STRING_FIRST && c <= _COMPARE_OPERATOR_STRING_LAST; +} + +static inline bool COMPARE_OPERATOR_IS_FNMATCH(CompareOperator c) { + return c >= _COMPARE_OPERATOR_FNMATCH_FIRST && c <= _COMPARE_OPERATOR_FNMATCH_LAST; +} + +static inline bool COMPARE_OPERATOR_IS_ORDER(CompareOperator c) { + return c >= _COMPARE_OPERATOR_ORDER_FIRST && c <= _COMPARE_OPERATOR_ORDER_LAST; +} + +typedef enum CompareOperatorParseFlags { + COMPARE_ALLOW_FNMATCH = 1 << 0, + COMPARE_EQUAL_BY_STRING = 1 << 1, + COMPARE_ALLOW_TEXTUAL = 1 << 2, +} CompareOperatorParseFlags; + +CompareOperator parse_compare_operator(const char **s, CompareOperatorParseFlags flags); + +int test_order(int k, CompareOperator op); + +int version_or_fnmatch_compare(CompareOperator op, const char *a, const char *b); diff --git a/src/shared/condition.c b/src/shared/condition.c new file mode 100644 index 0000000..d3446e8 --- /dev/null +++ b/src/shared/condition.c @@ -0,0 +1,1360 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "apparmor-util.h" +#include "architecture.h" +#include "audit-util.h" +#include "battery-util.h" +#include "blockdev-util.h" +#include "cap-list.h" +#include "cgroup-util.h" +#include "compare-operator.h" +#include "condition.h" +#include "confidential-virt.h" +#include "cpu-set-util.h" +#include "creds-util.h" +#include "efi-api.h" +#include "efi-loader.h" +#include "env-file.h" +#include "env-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "glob-util.h" +#include "hostname-util.h" +#include "ima-util.h" +#include "initrd-util.h" +#include "limits-util.h" +#include "list.h" +#include "macro.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "psi-util.h" +#include "selinux-util.h" +#include "smack-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "tomoyo-util.h" +#include "tpm2-util.h" +#include "uid-alloc-range.h" +#include "user-util.h" +#include "virt.h" + +Condition* condition_new(ConditionType type, const char *parameter, bool trigger, bool negate) { + Condition *c; + + assert(type >= 0); + assert(type < _CONDITION_TYPE_MAX); + assert(parameter); + + c = new(Condition, 1); + if (!c) + return NULL; + + *c = (Condition) { + .type = type, + .trigger = trigger, + .negate = negate, + }; + + if (parameter) { + c->parameter = strdup(parameter); + if (!c->parameter) + return mfree(c); + } + + return c; +} + +Condition* condition_free(Condition *c) { + assert(c); + + free(c->parameter); + return mfree(c); +} + +Condition* condition_free_list_type(Condition *head, ConditionType type) { + LIST_FOREACH(conditions, c, head) + if (type < 0 || c->type == type) { + LIST_REMOVE(conditions, head, c); + 
condition_free(c); + } + + assert(type >= 0 || !head); + return head; +} + +static int condition_test_kernel_command_line(Condition *c, char **env) { + _cleanup_strv_free_ char **args = NULL; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_KERNEL_COMMAND_LINE); + + r = proc_cmdline_strv(&args); + if (r < 0) + return r; + + bool equal = strchr(c->parameter, '='); + + STRV_FOREACH(word, args) { + bool found; + + if (equal) + found = streq(*word, c->parameter); + else { + const char *f; + + f = startswith(*word, c->parameter); + found = f && IN_SET(*f, 0, '='); + } + + if (found) + return true; + } + + return false; +} + +static int condition_test_credential(Condition *c, char **env) { + int (*gd)(const char **ret); + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_CREDENTIAL); + + /* For now we'll do a very simple existence check and are happy with either a regular or an encrypted + * credential. Given that we check the syntax of the argument we have the option to later maybe allow + * contents checks too without breaking compatibility, but for now let's be minimalistic. */ + + if (!credential_name_valid(c->parameter)) /* credentials with invalid names do not exist */ + return false; + + FOREACH_POINTER(gd, get_credentials_dir, get_encrypted_credentials_dir) { + _cleanup_free_ char *j = NULL; + const char *cd; + + r = gd(&cd); + if (r == -ENXIO) /* no env var set */ + continue; + if (r < 0) + return r; + + j = path_join(cd, c->parameter); + if (!j) + return -ENOMEM; + + if (laccess(j, F_OK) >= 0) + return true; /* yay! 
*/ + if (errno != ENOENT) + return -errno; + + /* not found in this dir */ + } + + return false; +} + +static int condition_test_kernel_version(Condition *c, char **env) { + CompareOperator operator; + struct utsname u; + bool first = true; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_KERNEL_VERSION); + + assert_se(uname(&u) >= 0); + + for (const char *p = c->parameter;;) { + _cleanup_free_ char *word = NULL; + const char *s; + int r; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_debug_errno(r, "Failed to parse condition string \"%s\": %m", p); + if (r == 0) + break; + + s = strstrip(word); + operator = parse_compare_operator(&s, COMPARE_ALLOW_FNMATCH|COMPARE_EQUAL_BY_STRING); + if (operator < 0) /* No prefix? Then treat as glob string */ + operator = COMPARE_FNMATCH_EQUAL; + + s += strspn(s, WHITESPACE); + if (isempty(s)) { + if (first) { + /* For backwards compatibility, allow whitespace between the operator and + * value, without quoting, but only in the first expression. 
*/ + word = mfree(word); + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + return log_debug_errno(r, "Failed to parse condition string \"%s\": %m", p); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected end of expression: %s", p); + s = word; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected end of expression: %s", p); + } + + r = version_or_fnmatch_compare(operator, u.release, s); + if (r < 0) + return r; + if (!r) + return false; + + first = false; + } + + return true; +} + +static int condition_test_osrelease(Condition *c, char **env) { + int r; + + assert(c); + assert(c->type == CONDITION_OS_RELEASE); + + for (const char *parameter = ASSERT_PTR(c->parameter);;) { + _cleanup_free_ char *key = NULL, *condition = NULL, *actual_value = NULL; + CompareOperator operator; + const char *word; + + r = extract_first_word(&parameter, &condition, NULL, EXTRACT_UNQUOTE); + if (r < 0) + return log_debug_errno(r, "Failed to parse parameter: %m"); + if (r == 0) + break; + + /* parse_compare_operator() needs the string to start with the comparators */ + word = condition; + r = extract_first_word(&word, &key, COMPARE_OPERATOR_WITH_FNMATCH_CHARS, EXTRACT_RETAIN_SEPARATORS); + if (r < 0) + return log_debug_errno(r, "Failed to parse parameter: %m"); + /* The os-release spec mandates env-var-like key names */ + if (r == 0 || isempty(word) || !env_name_is_valid(key)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse parameter, key/value format expected: %m"); + + /* Do not allow whitespace after the separator, as that's not a valid os-release format */ + operator = parse_compare_operator(&word, COMPARE_ALLOW_FNMATCH|COMPARE_EQUAL_BY_STRING); + if (operator < 0 || isempty(word) || strchr(WHITESPACE, *word) != NULL) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse parameter, key/value format expected: %m"); + + r = parse_os_release(NULL, key, &actual_value); + if (r < 0) + return 
log_debug_errno(r, "Failed to parse os-release: %m"); + + r = version_or_fnmatch_compare(operator, actual_value, word); + if (r < 0) + return r; + if (!r) + return false; + } + + return true; +} + +static int condition_test_memory(Condition *c, char **env) { + CompareOperator operator; + uint64_t m, k; + const char *p; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_MEMORY); + + m = physical_memory(); + + p = c->parameter; + operator = parse_compare_operator(&p, 0); + if (operator < 0) + operator = COMPARE_GREATER_OR_EQUAL; /* default to >= check, if nothing is specified. */ + + r = parse_size(p, 1024, &k); + if (r < 0) + return log_debug_errno(r, "Failed to parse size '%s': %m", p); + + return test_order(CMP(m, k), operator); +} + +static int condition_test_cpus(Condition *c, char **env) { + CompareOperator operator; + const char *p; + unsigned k; + int r, n; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_CPUS); + + n = cpus_in_affinity_mask(); + if (n < 0) + return log_debug_errno(n, "Failed to determine CPUs in affinity mask: %m"); + + p = c->parameter; + operator = parse_compare_operator(&p, 0); + if (operator < 0) + operator = COMPARE_GREATER_OR_EQUAL; /* default to >= check, if nothing is specified. */ + + r = safe_atou(p, &k); + if (r < 0) + return log_debug_errno(r, "Failed to parse number of CPUs: %m"); + + return test_order(CMP((unsigned) n, k), operator); +} + +static int condition_test_user(Condition *c, char **env) { + uid_t id; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_USER); + + /* Do the quick&easy comparisons first, and only parse the UID later. 
*/ + if (streq(c->parameter, "root")) + return getuid() == 0 || geteuid() == 0; + if (streq(c->parameter, NOBODY_USER_NAME)) + return getuid() == UID_NOBODY || geteuid() == UID_NOBODY; + if (streq(c->parameter, "@system")) + return uid_is_system(getuid()) || uid_is_system(geteuid()); + + r = parse_uid(c->parameter, &id); + if (r >= 0) + return id == getuid() || id == geteuid(); + + if (getpid_cached() == 1) /* We already checked for "root" above, and we know that + * PID 1 is running as root, hence we know it cannot match. */ + return false; + + /* getusername_malloc() may do an nss lookup, which is not allowed in PID 1. */ + _cleanup_free_ char *username = getusername_malloc(); + if (!username) + return -ENOMEM; + + if (streq(username, c->parameter)) + return 1; + + const char *u = c->parameter; + r = get_user_creds(&u, &id, NULL, NULL, NULL, USER_CREDS_ALLOW_MISSING); + if (r < 0) + return 0; + + return id == getuid() || id == geteuid(); +} + +static int condition_test_control_group_controller(Condition *c, char **env) { + int r; + CGroupMask system_mask, wanted_mask = 0; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_CONTROL_GROUP_CONTROLLER); + + if (streq(c->parameter, "v2")) + return cg_all_unified(); + if (streq(c->parameter, "v1")) { + r = cg_all_unified(); + if (r < 0) + return r; + return !r; + } + + r = cg_mask_supported(&system_mask); + if (r < 0) + return log_debug_errno(r, "Failed to determine supported controllers: %m"); + + r = cg_mask_from_string(c->parameter, &wanted_mask); + if (r < 0 || wanted_mask <= 0) { + /* This won't catch the case that we have an unknown controller + * mixed in with valid ones -- these are only assessed on the + * validity of the valid controllers found. 
*/ + log_debug("Failed to parse cgroup string: %s", c->parameter); + return 1; + } + + return FLAGS_SET(system_mask, wanted_mask); +} + +static int condition_test_group(Condition *c, char **env) { + gid_t id; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_GROUP); + + r = parse_gid(c->parameter, &id); + if (r >= 0) + return in_gid(id); + + /* Avoid any NSS lookups if we are PID1 */ + if (getpid_cached() == 1) + return streq(c->parameter, "root"); + + return in_group(c->parameter) > 0; +} + +static int condition_test_virtualization(Condition *c, char **env) { + Virtualization v; + int b; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_VIRTUALIZATION); + + if (streq(c->parameter, "private-users")) + return running_in_userns(); + + v = detect_virtualization(); + if (v < 0) + return v; + + /* First, compare with yes/no */ + b = parse_boolean(c->parameter); + if (b >= 0) + return b == (v != VIRTUALIZATION_NONE); + + /* Then, compare categorization */ + if (streq(c->parameter, "vm")) + return VIRTUALIZATION_IS_VM(v); + + if (streq(c->parameter, "container")) + return VIRTUALIZATION_IS_CONTAINER(v); + + /* Finally compare id */ + return v != VIRTUALIZATION_NONE && streq(c->parameter, virtualization_to_string(v)); +} + +static int condition_test_architecture(Condition *c, char **env) { + Architecture a, b; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_ARCHITECTURE); + + a = uname_architecture(); + if (a < 0) + return a; + + if (streq(c->parameter, "native")) + b = native_architecture(); + else { + b = architecture_from_string(c->parameter); + if (b < 0) /* unknown architecture? 
Then it's definitely not ours */ + return false; + } + + return a == b; +} + +#define DTCOMPAT_FILE "/proc/device-tree/compatible" +static int condition_test_firmware_devicetree_compatible(const char *dtcarg) { + int r; + _cleanup_free_ char *dtcompat = NULL; + _cleanup_strv_free_ char **dtcompatlist = NULL; + size_t size; + + r = read_full_virtual_file(DTCOMPAT_FILE, &dtcompat, &size); + if (r < 0) { + /* if the path doesn't exist it is incompatible */ + if (r != -ENOENT) + log_debug_errno(r, "Failed to open() '%s', assuming machine is incompatible: %m", DTCOMPAT_FILE); + return false; + } + + /* Not sure this can happen, but play safe. */ + if (size == 0) { + log_debug("%s has zero length, assuming machine is incompatible", DTCOMPAT_FILE); + return false; + } + + /* /proc/device-tree/compatible consists of one or more strings, each ending in '\0'. + * So the last character in dtcompat must be a '\0'. */ + if (dtcompat[size - 1] != '\0') { + log_debug("%s is in an unknown format, assuming machine is incompatible", DTCOMPAT_FILE); + return false; + } + + dtcompatlist = strv_parse_nulstr(dtcompat, size); + if (!dtcompatlist) + return -ENOMEM; + + return strv_contains(dtcompatlist, dtcarg); +} + +static int condition_test_firmware_smbios_field(const char *expression) { + _cleanup_free_ char *field = NULL, *expected_value = NULL, *actual_value = NULL; + CompareOperator operator; + int r; + + assert(expression); + + /* Parse SMBIOS field */ + r = extract_first_word(&expression, &field, COMPARE_OPERATOR_WITH_FNMATCH_CHARS, EXTRACT_RETAIN_SEPARATORS); + if (r < 0) + return r; + if (r == 0 || isempty(expression)) + return -EINVAL; + + /* Remove trailing spaces from SMBIOS field */ + delete_trailing_chars(field, WHITESPACE); + + /* Parse operator */ + operator = parse_compare_operator(&expression, COMPARE_ALLOW_FNMATCH|COMPARE_EQUAL_BY_STRING); + if (operator < 0) + return operator; + + /* Parse expected value */ + r = extract_first_word(&expression, &expected_value, NULL, 
EXTRACT_UNQUOTE); + if (r < 0) + return r; + if (r == 0 || !isempty(expression)) + return -EINVAL; + + /* Read actual value from sysfs */ + if (!filename_is_valid(field)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid SMBIOS field name"); + + const char *p = strjoina("/sys/class/dmi/id/", field); + r = read_virtual_file(p, SIZE_MAX, &actual_value, NULL); + if (r < 0) { + log_debug_errno(r, "Failed to read %s: %m", p); + if (r == -ENOENT) + return false; + return r; + } + + /* Remove trailing newline */ + delete_trailing_chars(actual_value, WHITESPACE); + + /* Finally compare actual and expected value */ + return version_or_fnmatch_compare(operator, actual_value, expected_value); +} + +static int condition_test_firmware(Condition *c, char **env) { + sd_char *arg; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_FIRMWARE); + + if (streq(c->parameter, "device-tree")) { + if (access("/sys/firmware/devicetree/", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Unexpected error when checking for /sys/firmware/devicetree/: %m"); + return false; + } else + return true; + } else if ((arg = startswith(c->parameter, "device-tree-compatible("))) { + _cleanup_free_ char *dtc_arg = NULL; + char *end; + + end = strrchr(arg, ')'); + if (!end || *(end + 1) != '\0') { + log_debug("Malformed ConditionFirmware=%s", c->parameter); + return false; + } + + dtc_arg = strndup(arg, end - arg); + if (!dtc_arg) + return -ENOMEM; + + return condition_test_firmware_devicetree_compatible(dtc_arg); + } else if (streq(c->parameter, "uefi")) + return is_efi_boot(); + else if ((arg = startswith(c->parameter, "smbios-field("))) { + _cleanup_free_ char *smbios_arg = NULL; + char *end; + + end = strrchr(arg, ')'); + if (!end || *(end + 1) != '\0') + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Malformed ConditionFirmware=%s: %m", c->parameter); + + smbios_arg = strndup(arg, end - arg); + if (!smbios_arg) + return log_oom_debug(); + + r = 
condition_test_firmware_smbios_field(smbios_arg); + if (r < 0) + return log_debug_errno(r, "Malformed ConditionFirmware=%s: %m", c->parameter); + return r; + } else { + log_debug("Unsupported Firmware condition \"%s\"", c->parameter); + return false; + } +} + +static int condition_test_host(Condition *c, char **env) { + _cleanup_free_ char *h = NULL; + sd_id128_t x, y; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_HOST); + + if (sd_id128_from_string(c->parameter, &x) >= 0) { + + r = sd_id128_get_machine(&y); + if (r < 0) + return r; + + return sd_id128_equal(x, y); + } + + h = gethostname_malloc(); + if (!h) + return -ENOMEM; + + r = fnmatch(c->parameter, h, FNM_CASEFOLD); + if (r == FNM_NOMATCH) + return false; + if (r != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "fnmatch() failed."); + + return true; +} + +static int condition_test_ac_power(Condition *c, char **env) { + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_AC_POWER); + + r = parse_boolean(c->parameter); + if (r < 0) + return r; + + return (on_ac_power() != 0) == !!r; +} + +static int has_tpm2(void) { + /* Checks whether the kernel has the TPM subsystem enabled and the firmware reports support. Note + * we don't check for actual TPM devices, since we might not have loaded the driver for it yet, i.e. + * during early boot where we very likely want to use this condition check). + * + * Note that we don't check if we ourselves are built with TPM2 support here! 
*/ + + return FLAGS_SET(tpm2_support(), TPM2_SUPPORT_SUBSYSTEM|TPM2_SUPPORT_FIRMWARE); +} + +static int condition_test_security(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_SECURITY); + + if (streq(c->parameter, "selinux")) + return mac_selinux_use(); + if (streq(c->parameter, "smack")) + return mac_smack_use(); + if (streq(c->parameter, "apparmor")) + return mac_apparmor_use(); + if (streq(c->parameter, "audit")) + return use_audit(); + if (streq(c->parameter, "ima")) + return use_ima(); + if (streq(c->parameter, "tomoyo")) + return mac_tomoyo_use(); + if (streq(c->parameter, "uefi-secureboot")) + return is_efi_secure_boot(); + if (streq(c->parameter, "tpm2")) + return has_tpm2(); + if (streq(c->parameter, "cvm")) + return detect_confidential_virtualization() > 0; + if (streq(c->parameter, "measured-uki")) + return efi_measured_uki(LOG_DEBUG); + + return false; +} + +static int condition_test_capability(Condition *c, char **env) { + unsigned long long capabilities = (unsigned long long) -1; + _cleanup_fclose_ FILE *f = NULL; + int value, r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_CAPABILITY); + + /* If it's an invalid capability, we don't have it */ + value = capability_from_name(c->parameter); + if (value < 0) + return -EINVAL; + + /* If it's a valid capability we default to assume + * that we have it */ + + f = fopen("/proc/self/status", "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + const char *p = startswith(line, "CapBnd:"); + if (p) { + if (sscanf(p, "%llx", &capabilities) != 1) + return -EIO; + + break; + } + } + + return !!(capabilities & (1ULL << value)); +} + +static int condition_test_needs_update(Condition *c, char **env) { + struct stat usr, other; + const char *p; + bool b; + int r; + + assert(c); + assert(c->parameter); + assert(c->type == 
CONDITION_NEEDS_UPDATE); + + r = proc_cmdline_get_bool("systemd.condition-needs-update", /* flags = */ 0, &b); + if (r < 0) + log_debug_errno(r, "Failed to parse systemd.condition-needs-update= kernel command line argument, ignoring: %m"); + if (r > 0) + return b; + + if (in_initrd()) { + log_debug("We are in an initrd, not doing any updates."); + return false; + } + + if (!path_is_absolute(c->parameter)) { + log_debug("Specified condition parameter '%s' is not absolute, assuming an update is needed.", c->parameter); + return true; + } + + /* If the file system is read-only we shouldn't suggest an update */ + r = path_is_read_only_fs(c->parameter); + if (r < 0) + log_debug_errno(r, "Failed to determine if '%s' is read-only, ignoring: %m", c->parameter); + if (r > 0) + return false; + + /* Any other failure means we should allow the condition to be true, so that we rather invoke too + * many update tools than too few. */ + + p = strjoina(c->parameter, "/.updated"); + if (lstat(p, &other) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to stat() '%s', assuming an update is needed: %m", p); + return true; + } + + if (lstat("/usr/", &usr) < 0) { + log_debug_errno(errno, "Failed to stat() /usr/, assuming an update is needed: %m"); + return true; + } + + /* + * First, compare seconds as they are always accurate... + */ + if (usr.st_mtim.tv_sec != other.st_mtim.tv_sec) + return usr.st_mtim.tv_sec > other.st_mtim.tv_sec; + + /* + * ...then compare nanoseconds. + * + * A false positive is only possible when /usr's nanoseconds > 0 + * (otherwise /usr cannot be strictly newer than the target file) + * AND the target file's nanoseconds == 0 + * (otherwise the filesystem supports nsec timestamps, see stat(2)). 
+ */ + if (usr.st_mtim.tv_nsec == 0 || other.st_mtim.tv_nsec > 0) + return usr.st_mtim.tv_nsec > other.st_mtim.tv_nsec; + + _cleanup_free_ char *timestamp_str = NULL; + r = parse_env_file(NULL, p, "TIMESTAMP_NSEC", &timestamp_str); + if (r < 0) { + log_debug_errno(r, "Failed to parse timestamp file '%s', using mtime: %m", p); + return true; + } + if (isempty(timestamp_str)) { + log_debug("No data in timestamp file '%s', using mtime.", p); + return true; + } + + uint64_t timestamp; + r = safe_atou64(timestamp_str, &timestamp); + if (r < 0) { + log_debug_errno(r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m", timestamp_str, p); + return true; + } + + return timespec_load_nsec(&usr.st_mtim) > timestamp; +} + +static bool in_first_boot(void) { + static int first_boot = -1; + int r; + + if (first_boot >= 0) + return first_boot; + + const char *e = secure_getenv("SYSTEMD_FIRST_BOOT"); + if (e) { + r = parse_boolean(e); + if (r < 0) + log_debug_errno(r, "Failed to parse $SYSTEMD_FIRST_BOOT, ignoring: %m"); + else + return (first_boot = r); + } + + r = RET_NERRNO(access("/run/systemd/first-boot", F_OK)); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to check if /run/systemd/first-boot exists, assuming no: %m"); + return r >= 0; +} + +static int condition_test_first_boot(Condition *c, char **env) { + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_FIRST_BOOT); + + // TODO: Parse c->parameter immediately when reading the config. + // Apply negation when parsing too. 
+ + r = parse_boolean(c->parameter); + if (r < 0) + return r; + + return in_first_boot() == r; +} + +static int condition_test_environment(Condition *c, char **env) { + bool equal; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_ENVIRONMENT); + + equal = strchr(c->parameter, '='); + + STRV_FOREACH(i, env) { + bool found; + + if (equal) + found = streq(c->parameter, *i); + else { + const char *f; + + f = startswith(*i, c->parameter); + found = f && IN_SET(*f, 0, '='); + } + + if (found) + return true; + } + + return false; +} + +static int condition_test_path_exists(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_EXISTS); + + return access(c->parameter, F_OK) >= 0; +} + +static int condition_test_path_exists_glob(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_EXISTS_GLOB); + + return glob_exists(c->parameter) > 0; +} + +static int condition_test_path_is_directory(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_IS_DIRECTORY); + + return is_dir(c->parameter, true) > 0; +} + +static int condition_test_path_is_symbolic_link(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_IS_SYMBOLIC_LINK); + + return is_symlink(c->parameter) > 0; +} + +static int condition_test_path_is_mount_point(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_IS_MOUNT_POINT); + + return path_is_mount_point(c->parameter, NULL, AT_SYMLINK_FOLLOW) > 0; +} + +static int condition_test_path_is_read_write(Condition *c, char **env) { + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_IS_READ_WRITE); + + r = path_is_read_only_fs(c->parameter); + + return r <= 0 && r != -ENOENT; +} + +static int condition_test_cpufeature(Condition *c, char **env) { + assert(c); + assert(c->parameter); + assert(c->type 
== CONDITION_CPU_FEATURE); + + return has_cpu_with_flag(ascii_strlower(c->parameter)); +} + +static int condition_test_path_is_encrypted(Condition *c, char **env) { + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_PATH_IS_ENCRYPTED); + + r = path_is_encrypted(c->parameter); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to determine if '%s' is encrypted: %m", c->parameter); + + return r > 0; +} + +static int condition_test_directory_not_empty(Condition *c, char **env) { + int r; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_DIRECTORY_NOT_EMPTY); + + r = dir_is_empty(c->parameter, /* ignore_hidden_or_backup= */ true); + return r <= 0 && !IN_SET(r, -ENOENT, -ENOTDIR); +} + +static int condition_test_file_not_empty(Condition *c, char **env) { + struct stat st; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_FILE_NOT_EMPTY); + + return (stat(c->parameter, &st) >= 0 && + S_ISREG(st.st_mode) && + st.st_size > 0); +} + +static int condition_test_file_is_executable(Condition *c, char **env) { + struct stat st; + + assert(c); + assert(c->parameter); + assert(c->type == CONDITION_FILE_IS_EXECUTABLE); + + return (stat(c->parameter, &st) >= 0 && + S_ISREG(st.st_mode) && + (st.st_mode & 0111)); +} + +static int condition_test_psi(Condition *c, char **env) { + _cleanup_free_ char *first = NULL, *second = NULL, *third = NULL, *fourth = NULL, *pressure_path = NULL; + const char *p, *value, *pressure_type; + loadavg_t *current, limit; + ResourcePressure pressure; + int r; + + assert(c); + assert(c->parameter); + assert(IN_SET(c->type, CONDITION_MEMORY_PRESSURE, CONDITION_CPU_PRESSURE, CONDITION_IO_PRESSURE)); + + if (!is_pressure_supported()) { + log_debug("Pressure Stall Information (PSI) is not supported, skipping."); + return 1; + } + + pressure_type = c->type == CONDITION_MEMORY_PRESSURE ? "memory" : + c->type == CONDITION_CPU_PRESSURE ? 
"cpu" : + "io"; + + p = c->parameter; + r = extract_many_words(&p, ":", 0, &first, &second, NULL); + if (r <= 0) + return log_debug_errno(r < 0 ? r : SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter); + /* If only one parameter is passed, then we look at the global system pressure rather than a specific cgroup. */ + if (r == 1) { + pressure_path = path_join("/proc/pressure", pressure_type); + if (!pressure_path) + return log_oom_debug(); + + value = first; + } else { + const char *controller = strjoina(pressure_type, ".pressure"); + _cleanup_free_ char *slice_path = NULL, *root_scope = NULL; + CGroupMask mask, required_mask; + char *slice, *e; + + required_mask = c->type == CONDITION_MEMORY_PRESSURE ? CGROUP_MASK_MEMORY : + c->type == CONDITION_CPU_PRESSURE ? CGROUP_MASK_CPU : + CGROUP_MASK_IO; + + slice = strstrip(first); + if (!slice) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter); + + r = cg_all_unified(); + if (r < 0) + return log_debug_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + if (r == 0) { + log_debug("PSI condition check requires the unified cgroups hierarchy, skipping."); + return 1; + } + + r = cg_mask_supported(&mask); + if (r < 0) + return log_debug_errno(r, "Failed to get supported cgroup controllers: %m"); + + if (!FLAGS_SET(mask, required_mask)) { + log_debug("Cgroup %s controller not available, skipping PSI condition check.", pressure_type); + return 1; + } + + r = cg_slice_to_path(slice, &slice_path); + if (r < 0) + return log_debug_errno(r, "Cannot determine slice \"%s\" cgroup path: %m", slice); + + /* We might be running under the user manager, so get the root path and prefix it accordingly. */ + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, getpid_cached(), &root_scope); + if (r < 0) + return log_debug_errno(r, "Failed to get root cgroup path: %m"); + + /* Drop init.scope, we want the parent. 
We could get an empty or / path, but that's fine, + * just skip it in that case. */ + e = endswith(root_scope, "/" SPECIAL_INIT_SCOPE); + if (e) + *e = 0; + if (!empty_or_root(root_scope)) { + _cleanup_free_ char *slice_joined = NULL; + + slice_joined = path_join(root_scope, slice_path); + if (!slice_joined) + return log_oom_debug(); + + free_and_replace(slice_path, slice_joined); + } + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, slice_path, controller, &pressure_path); + if (r < 0) + return log_debug_errno(r, "Error getting cgroup pressure path from %s: %m", slice_path); + + value = second; + } + + /* If a value including a specific timespan (in the intervals allowed by the kernel), + * parse it, otherwise we assume just a plain percentage that will be checked if it is + * smaller or equal to the current pressure average over 5 minutes. */ + r = extract_many_words(&value, "/", 0, &third, &fourth, NULL); + if (r <= 0) + return log_debug_errno(r < 0 ? r : SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter); + if (r == 1) + current = &pressure.avg300; + else { + const char *timespan; + + timespan = skip_leading_chars(fourth, NULL); + if (!timespan) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter); + + if (startswith(timespan, "10sec")) + current = &pressure.avg10; + else if (startswith(timespan, "1min")) + current = &pressure.avg60; + else if (startswith(timespan, "5min")) + current = &pressure.avg300; + else + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter); + } + + value = strstrip(third); + if (!value) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter); + + r = parse_permyriad(value); + if (r < 0) + return log_debug_errno(r, "Failed to parse permyriad: %s", c->parameter); + + r = store_loadavg_fixed_point(r / 100LU, r % 100LU, &limit); + if (r < 0) + 
return log_debug_errno(r, "Failed to parse loadavg: %s", c->parameter); + + r = read_resource_pressure(pressure_path, PRESSURE_TYPE_FULL, &pressure); + if (r == -ENODATA) /* cpu.pressure 'full' was added recently, fall back to 'some'. */ + r = read_resource_pressure(pressure_path, PRESSURE_TYPE_SOME, &pressure); + if (r == -ENOENT) { + /* We already checked that /proc/pressure exists, so this means we were given a cgroup + * that doesn't exist or doesn't exist any longer. */ + log_debug("\"%s\" not found, skipping PSI check.", pressure_path); + return 1; + } + if (r < 0) + return log_debug_errno(r, "Error parsing pressure from %s: %m", pressure_path); + + return *current <= limit; +} + +int condition_test(Condition *c, char **env) { + + static int (*const condition_tests[_CONDITION_TYPE_MAX])(Condition *c, char **env) = { + [CONDITION_PATH_EXISTS] = condition_test_path_exists, + [CONDITION_PATH_EXISTS_GLOB] = condition_test_path_exists_glob, + [CONDITION_PATH_IS_DIRECTORY] = condition_test_path_is_directory, + [CONDITION_PATH_IS_SYMBOLIC_LINK] = condition_test_path_is_symbolic_link, + [CONDITION_PATH_IS_MOUNT_POINT] = condition_test_path_is_mount_point, + [CONDITION_PATH_IS_READ_WRITE] = condition_test_path_is_read_write, + [CONDITION_PATH_IS_ENCRYPTED] = condition_test_path_is_encrypted, + [CONDITION_DIRECTORY_NOT_EMPTY] = condition_test_directory_not_empty, + [CONDITION_FILE_NOT_EMPTY] = condition_test_file_not_empty, + [CONDITION_FILE_IS_EXECUTABLE] = condition_test_file_is_executable, + [CONDITION_KERNEL_COMMAND_LINE] = condition_test_kernel_command_line, + [CONDITION_KERNEL_VERSION] = condition_test_kernel_version, + [CONDITION_CREDENTIAL] = condition_test_credential, + [CONDITION_VIRTUALIZATION] = condition_test_virtualization, + [CONDITION_SECURITY] = condition_test_security, + [CONDITION_CAPABILITY] = condition_test_capability, + [CONDITION_HOST] = condition_test_host, + [CONDITION_AC_POWER] = condition_test_ac_power, + [CONDITION_ARCHITECTURE] = 
condition_test_architecture, + [CONDITION_FIRMWARE] = condition_test_firmware, + [CONDITION_NEEDS_UPDATE] = condition_test_needs_update, + [CONDITION_FIRST_BOOT] = condition_test_first_boot, + [CONDITION_USER] = condition_test_user, + [CONDITION_GROUP] = condition_test_group, + [CONDITION_CONTROL_GROUP_CONTROLLER] = condition_test_control_group_controller, + [CONDITION_CPUS] = condition_test_cpus, + [CONDITION_MEMORY] = condition_test_memory, + [CONDITION_ENVIRONMENT] = condition_test_environment, + [CONDITION_CPU_FEATURE] = condition_test_cpufeature, + [CONDITION_OS_RELEASE] = condition_test_osrelease, + [CONDITION_MEMORY_PRESSURE] = condition_test_psi, + [CONDITION_CPU_PRESSURE] = condition_test_psi, + [CONDITION_IO_PRESSURE] = condition_test_psi, + }; + + int r, b; + + assert(c); + assert(c->type >= 0); + assert(c->type < _CONDITION_TYPE_MAX); + + r = condition_tests[c->type](c, env); + if (r < 0) { + c->result = CONDITION_ERROR; + return r; + } + + b = (r > 0) == !c->negate; + c->result = b ? CONDITION_SUCCEEDED : CONDITION_FAILED; + return b; +} + +bool condition_test_list( + Condition *first, + char **env, + condition_to_string_t to_string, + condition_test_logger_t logger, + void *userdata) { + + int triggered = -1; + + /* If the condition list is empty, then it is true */ + if (!first) + return true; + + /* Otherwise, if all of the non-trigger conditions apply and + * if any of the trigger conditions apply (unless there are + * none) we return true */ + LIST_FOREACH(conditions, c, first) { + int r; + + r = condition_test(c, env); + + if (logger) { + if (r < 0) + logger(userdata, LOG_WARNING, r, PROJECT_FILE, __LINE__, __func__, + "Couldn't determine result for %s=%s%s%s, assuming failed: %m", + to_string(c->type), + c->trigger ? "|" : "", + c->negate ? "!" : "", + c->parameter); + else + logger(userdata, LOG_DEBUG, 0, PROJECT_FILE, __LINE__, __func__, + "%s=%s%s%s %s.", + to_string(c->type), + c->trigger ? "|" : "", + c->negate ? "!" 
: "", + c->parameter, + condition_result_to_string(c->result)); + } + + if (!c->trigger && r <= 0) + return false; + + if (c->trigger && triggered <= 0) + triggered = r > 0; + } + + return triggered != 0; +} + +void condition_dump(Condition *c, FILE *f, const char *prefix, condition_to_string_t to_string) { + assert(c); + assert(f); + assert(to_string); + + prefix = strempty(prefix); + + fprintf(f, + "%s\t%s: %s%s%s %s\n", + prefix, + to_string(c->type), + c->trigger ? "|" : "", + c->negate ? "!" : "", + c->parameter, + condition_result_to_string(c->result)); +} + +void condition_dump_list(Condition *first, FILE *f, const char *prefix, condition_to_string_t to_string) { + LIST_FOREACH(conditions, c, first) + condition_dump(c, f, prefix, to_string); +} + +static const char* const condition_type_table[_CONDITION_TYPE_MAX] = { + [CONDITION_ARCHITECTURE] = "ConditionArchitecture", + [CONDITION_FIRMWARE] = "ConditionFirmware", + [CONDITION_VIRTUALIZATION] = "ConditionVirtualization", + [CONDITION_HOST] = "ConditionHost", + [CONDITION_KERNEL_COMMAND_LINE] = "ConditionKernelCommandLine", + [CONDITION_KERNEL_VERSION] = "ConditionKernelVersion", + [CONDITION_CREDENTIAL] = "ConditionCredential", + [CONDITION_SECURITY] = "ConditionSecurity", + [CONDITION_CAPABILITY] = "ConditionCapability", + [CONDITION_AC_POWER] = "ConditionACPower", + [CONDITION_NEEDS_UPDATE] = "ConditionNeedsUpdate", + [CONDITION_FIRST_BOOT] = "ConditionFirstBoot", + [CONDITION_PATH_EXISTS] = "ConditionPathExists", + [CONDITION_PATH_EXISTS_GLOB] = "ConditionPathExistsGlob", + [CONDITION_PATH_IS_DIRECTORY] = "ConditionPathIsDirectory", + [CONDITION_PATH_IS_SYMBOLIC_LINK] = "ConditionPathIsSymbolicLink", + [CONDITION_PATH_IS_MOUNT_POINT] = "ConditionPathIsMountPoint", + [CONDITION_PATH_IS_READ_WRITE] = "ConditionPathIsReadWrite", + [CONDITION_PATH_IS_ENCRYPTED] = "ConditionPathIsEncrypted", + [CONDITION_DIRECTORY_NOT_EMPTY] = "ConditionDirectoryNotEmpty", + [CONDITION_FILE_NOT_EMPTY] = 
"ConditionFileNotEmpty", + [CONDITION_FILE_IS_EXECUTABLE] = "ConditionFileIsExecutable", + [CONDITION_USER] = "ConditionUser", + [CONDITION_GROUP] = "ConditionGroup", + [CONDITION_CONTROL_GROUP_CONTROLLER] = "ConditionControlGroupController", + [CONDITION_CPUS] = "ConditionCPUs", + [CONDITION_MEMORY] = "ConditionMemory", + [CONDITION_ENVIRONMENT] = "ConditionEnvironment", + [CONDITION_CPU_FEATURE] = "ConditionCPUFeature", + [CONDITION_OS_RELEASE] = "ConditionOSRelease", + [CONDITION_MEMORY_PRESSURE] = "ConditionMemoryPressure", + [CONDITION_CPU_PRESSURE] = "ConditionCPUPressure", + [CONDITION_IO_PRESSURE] = "ConditionIOPressure", +}; + +DEFINE_STRING_TABLE_LOOKUP(condition_type, ConditionType); + +static const char* const assert_type_table[_CONDITION_TYPE_MAX] = { + [CONDITION_ARCHITECTURE] = "AssertArchitecture", + [CONDITION_FIRMWARE] = "AssertFirmware", + [CONDITION_VIRTUALIZATION] = "AssertVirtualization", + [CONDITION_HOST] = "AssertHost", + [CONDITION_KERNEL_COMMAND_LINE] = "AssertKernelCommandLine", + [CONDITION_KERNEL_VERSION] = "AssertKernelVersion", + [CONDITION_CREDENTIAL] = "AssertCredential", + [CONDITION_SECURITY] = "AssertSecurity", + [CONDITION_CAPABILITY] = "AssertCapability", + [CONDITION_AC_POWER] = "AssertACPower", + [CONDITION_NEEDS_UPDATE] = "AssertNeedsUpdate", + [CONDITION_FIRST_BOOT] = "AssertFirstBoot", + [CONDITION_PATH_EXISTS] = "AssertPathExists", + [CONDITION_PATH_EXISTS_GLOB] = "AssertPathExistsGlob", + [CONDITION_PATH_IS_DIRECTORY] = "AssertPathIsDirectory", + [CONDITION_PATH_IS_SYMBOLIC_LINK] = "AssertPathIsSymbolicLink", + [CONDITION_PATH_IS_MOUNT_POINT] = "AssertPathIsMountPoint", + [CONDITION_PATH_IS_READ_WRITE] = "AssertPathIsReadWrite", + [CONDITION_PATH_IS_ENCRYPTED] = "AssertPathIsEncrypted", + [CONDITION_DIRECTORY_NOT_EMPTY] = "AssertDirectoryNotEmpty", + [CONDITION_FILE_NOT_EMPTY] = "AssertFileNotEmpty", + [CONDITION_FILE_IS_EXECUTABLE] = "AssertFileIsExecutable", + [CONDITION_USER] = "AssertUser", + [CONDITION_GROUP] = 
"AssertGroup", + [CONDITION_CONTROL_GROUP_CONTROLLER] = "AssertControlGroupController", + [CONDITION_CPUS] = "AssertCPUs", + [CONDITION_MEMORY] = "AssertMemory", + [CONDITION_ENVIRONMENT] = "AssertEnvironment", + [CONDITION_CPU_FEATURE] = "AssertCPUFeature", + [CONDITION_OS_RELEASE] = "AssertOSRelease", + [CONDITION_MEMORY_PRESSURE] = "AssertMemoryPressure", + [CONDITION_CPU_PRESSURE] = "AssertCPUPressure", + [CONDITION_IO_PRESSURE] = "AssertIOPressure", +}; + +DEFINE_STRING_TABLE_LOOKUP(assert_type, ConditionType); + +static const char* const condition_result_table[_CONDITION_RESULT_MAX] = { + [CONDITION_UNTESTED] = "untested", + [CONDITION_SUCCEEDED] = "succeeded", + [CONDITION_FAILED] = "failed", + [CONDITION_ERROR] = "error", +}; + +DEFINE_STRING_TABLE_LOOKUP(condition_result, ConditionResult); diff --git a/src/shared/condition.h b/src/shared/condition.h new file mode 100644 index 0000000..54cc904 --- /dev/null +++ b/src/shared/condition.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "list.h" +#include "macro.h" + +typedef enum ConditionType { + CONDITION_ARCHITECTURE, + CONDITION_FIRMWARE, + CONDITION_VIRTUALIZATION, + CONDITION_HOST, + CONDITION_KERNEL_COMMAND_LINE, + CONDITION_KERNEL_VERSION, + CONDITION_CREDENTIAL, + CONDITION_SECURITY, + CONDITION_CAPABILITY, + CONDITION_AC_POWER, + CONDITION_MEMORY, + CONDITION_CPUS, + CONDITION_ENVIRONMENT, + CONDITION_CPU_FEATURE, + CONDITION_OS_RELEASE, + CONDITION_MEMORY_PRESSURE, + CONDITION_CPU_PRESSURE, + CONDITION_IO_PRESSURE, + + CONDITION_NEEDS_UPDATE, + CONDITION_FIRST_BOOT, + + CONDITION_PATH_EXISTS, + CONDITION_PATH_EXISTS_GLOB, + CONDITION_PATH_IS_DIRECTORY, + CONDITION_PATH_IS_SYMBOLIC_LINK, + CONDITION_PATH_IS_MOUNT_POINT, + CONDITION_PATH_IS_READ_WRITE, + CONDITION_PATH_IS_ENCRYPTED, + CONDITION_DIRECTORY_NOT_EMPTY, + CONDITION_FILE_NOT_EMPTY, + CONDITION_FILE_IS_EXECUTABLE, + + CONDITION_USER, + CONDITION_GROUP, + + 
CONDITION_CONTROL_GROUP_CONTROLLER, + + _CONDITION_TYPE_MAX, + _CONDITION_TYPE_INVALID = -EINVAL, +} ConditionType; + +typedef enum ConditionResult { + CONDITION_UNTESTED, + CONDITION_SUCCEEDED, + CONDITION_FAILED, + CONDITION_ERROR, + _CONDITION_RESULT_MAX, + _CONDITION_RESULT_INVALID = -EINVAL, +} ConditionResult; + +typedef struct Condition { + ConditionType type:8; + + bool trigger:1; + bool negate:1; + + ConditionResult result:6; + + char *parameter; + + LIST_FIELDS(struct Condition, conditions); +} Condition; + +Condition* condition_new(ConditionType type, const char *parameter, bool trigger, bool negate); +Condition* condition_free(Condition *c); +Condition* condition_free_list_type(Condition *first, ConditionType type); +static inline Condition* condition_free_list(Condition *first) { + return condition_free_list_type(first, _CONDITION_TYPE_INVALID); +} + +int condition_test(Condition *c, char **env); + +typedef int (*condition_test_logger_t)(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) 
_printf_(7, 8); +typedef const char* (*condition_to_string_t)(ConditionType t) _const_; +bool condition_test_list(Condition *first, char **env, condition_to_string_t to_string, condition_test_logger_t logger, void *userdata); + +void condition_dump(Condition *c, FILE *f, const char *prefix, condition_to_string_t to_string); +void condition_dump_list(Condition *c, FILE *f, const char *prefix, condition_to_string_t to_string); + +const char* condition_type_to_string(ConditionType t) _const_; +ConditionType condition_type_from_string(const char *s) _pure_; + +const char* assert_type_to_string(ConditionType t) _const_; +ConditionType assert_type_from_string(const char *s) _pure_; + +const char* condition_result_to_string(ConditionResult r) _const_; +ConditionResult condition_result_from_string(const char *s) _pure_; + +static inline bool condition_takes_path(ConditionType t) { + return IN_SET(t, + CONDITION_PATH_EXISTS, + CONDITION_PATH_EXISTS_GLOB, + CONDITION_PATH_IS_DIRECTORY, + CONDITION_PATH_IS_SYMBOLIC_LINK, + CONDITION_PATH_IS_MOUNT_POINT, + CONDITION_PATH_IS_READ_WRITE, + CONDITION_PATH_IS_ENCRYPTED, + CONDITION_DIRECTORY_NOT_EMPTY, + CONDITION_FILE_NOT_EMPTY, + CONDITION_FILE_IS_EXECUTABLE, + CONDITION_NEEDS_UPDATE); +} diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c new file mode 100644 index 0000000..e8ecd9b --- /dev/null +++ b/src/shared/conf-parser.c @@ -0,0 +1,1984 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "constants.h" +#include "dns-domain.h" +#include "escape.h" +#include "ether-addr-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hash-funcs.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "in-addr-util.h" +#include "log.h" +#include "macro.h" +#include "missing_network.h" +#include "nulstr-util.h" 
+#include "parse-helpers.h" +#include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "sd-id128.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "syslog-util.h" +#include "time-util.h" +#include "utf8.h" + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(config_file_hash_ops_fclose, + char, path_hash_func, path_compare, + FILE, safe_fclose); + +int config_item_table_lookup( + const void *table, + const char *section, + const char *lvalue, + ConfigParserCallback *ret_func, + int *ret_ltype, + void **ret_data, + void *userdata) { + + assert(table); + assert(lvalue); + assert(ret_func); + assert(ret_ltype); + assert(ret_data); + + for (const ConfigTableItem *t = table; t->lvalue; t++) { + + if (!streq(lvalue, t->lvalue)) + continue; + + if (!streq_ptr(section, t->section)) + continue; + + *ret_func = t->parse; + *ret_ltype = t->ltype; + *ret_data = t->data; + return 1; + } + + *ret_func = NULL; + *ret_ltype = 0; + *ret_data = NULL; + return 0; +} + +int config_item_perf_lookup( + const void *table, + const char *section, + const char *lvalue, + ConfigParserCallback *ret_func, + int *ret_ltype, + void **ret_data, + void *userdata) { + + ConfigPerfItemLookup lookup = (ConfigPerfItemLookup) table; + const ConfigPerfItem *p; + + assert(table); + assert(lvalue); + assert(ret_func); + assert(ret_ltype); + assert(ret_data); + + if (section) { + const char *key; + + key = strjoina(section, ".", lvalue); + p = lookup(key, strlen(key)); + } else + p = lookup(lvalue, strlen(lvalue)); + if (!p) { + *ret_func = NULL; + *ret_ltype = 0; + *ret_data = NULL; + return 0; + } + + *ret_func = p->parse; + *ret_ltype = p->ltype; + *ret_data = (uint8_t*) userdata + p->offset; + return 1; +} + +/* Run the user supplied parser for an assignment */ +static int next_assignment( + const char *unit, + const char 
*filename, + unsigned line, + ConfigItemLookup lookup, + const void *table, + const char *section, + unsigned section_line, + const char *lvalue, + const char *rvalue, + ConfigParseFlags flags, + void *userdata) { + + ConfigParserCallback func = NULL; + int ltype = 0; + void *data = NULL; + int r; + + assert(filename); + assert(line > 0); + assert(lookup); + assert(lvalue); + assert(rvalue); + + r = lookup(table, section, lvalue, &func, &ltype, &data, userdata); + if (r < 0) + return r; + if (r > 0) { + if (!func) + return 0; + + return func(unit, filename, line, section, section_line, + lvalue, ltype, rvalue, data, userdata); + } + + /* Warn about unknown non-extension fields. */ + if (!(flags & CONFIG_PARSE_RELAXED) && !startswith(lvalue, "X-")) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unknown key name '%s' in section '%s', ignoring.", lvalue, section); + + return 0; +} + +/* Parse a single logical line */ +static int parse_line( + const char* unit, + const char *filename, + unsigned line, + const char *sections, + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + char **section, + unsigned *section_line, + bool *section_ignored, + char *l, /* is modified */ + void *userdata) { + + char *e; + + assert(filename); + assert(line > 0); + assert(lookup); + assert(l); + + l = strstrip(l); + if (isempty(l)) + return 0; + + if (l[0] == '\n') + return 0; + + if (!utf8_is_valid(l)) + return log_syntax_invalid_utf8(unit, LOG_WARNING, filename, line, l); + + if (l[0] == '[') { + _cleanup_free_ char *n = NULL; + size_t k; + + k = strlen(l); + assert(k > 0); + + if (l[k-1] != ']') + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EBADMSG), "Invalid section header '%s'", l); + + n = strndup(l+1, k-2); + if (!n) + return log_oom(); + + if (!string_is_safe(n)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EBADMSG), "Bad characters in section header '%s'", l); + + if (sections && !nulstr_contains(sections,
n)) { + bool ignore; + + ignore = (flags & CONFIG_PARSE_RELAXED) || startswith(n, "X-"); + + if (!ignore) + NULSTR_FOREACH(t, sections) + if (streq_ptr(n, startswith(t, "-"))) { /* Ignore sections prefixed with "-" in valid section list */ + ignore = true; + break; + } + + if (!ignore) + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown section '%s'. Ignoring.", n); + + *section = mfree(*section); + *section_line = 0; + *section_ignored = true; + } else { + free_and_replace(*section, n); + *section_line = line; + *section_ignored = false; + } + + return 0; + } + + if (sections && !*section) { + if (!(flags & CONFIG_PARSE_RELAXED) && !*section_ignored) + log_syntax(unit, LOG_WARNING, filename, line, 0, "Assignment outside of section. Ignoring."); + + return 0; + } + + e = strchr(l, '='); + if (!e) + return log_syntax(unit, LOG_WARNING, filename, line, 0, + "Missing '=', ignoring line."); + if (e == l) + return log_syntax(unit, LOG_WARNING, filename, line, 0, + "Missing key name before '=', ignoring line."); + + *e = 0; + e++; + + return next_assignment(unit, + filename, + line, + lookup, + table, + *section, + *section_line, + strstrip(l), + strstrip(e), + flags, + userdata); +} + +/* Go through the file and parse each line */ +int config_parse( + const char *unit, + const char *filename, + FILE *f, + const char *sections, + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata, + struct stat *ret_stat) { + + _cleanup_free_ char *section = NULL, *continuation = NULL; + _cleanup_fclose_ FILE *ours = NULL; + unsigned line = 0, section_line = 0; + bool section_ignored = false, bom_seen = false; + struct stat st; + int r, fd; + + assert(filename); + assert(lookup); + + if (!f) { + f = ours = fopen(filename, "re"); + if (!f) { + /* Only log on request, except for ENOENT, + * since we return 0 to the caller. */ + if ((flags & CONFIG_PARSE_WARN) || errno == ENOENT) + log_full_errno(errno == ENOENT ? 
LOG_DEBUG : LOG_ERR, errno, + "Failed to open configuration file '%s': %m", filename); + + if (errno == ENOENT) { + if (ret_stat) + *ret_stat = (struct stat) {}; + + return 0; + } + + return -errno; + } + } + + fd = fileno(f); + if (fd >= 0) { /* stream might not have an fd, let's be careful hence */ + + if (fstat(fd, &st) < 0) + return log_full_errno(FLAGS_SET(flags, CONFIG_PARSE_WARN) ? LOG_ERR : LOG_DEBUG, errno, + "Failed to fstat(%s): %m", filename); + + (void) stat_warn_permissions(filename, &st); + } else + st = (struct stat) {}; + + for (;;) { + _cleanup_free_ char *buf = NULL; + bool escaped = false; + char *l, *p, *e; + + r = read_line(f, LONG_LINE_MAX, &buf); + if (r == 0) + break; + if (r == -ENOBUFS) { + if (flags & CONFIG_PARSE_WARN) + log_error_errno(r, "%s:%u: Line too long", filename, line); + + return r; + } + if (r < 0) { + if (FLAGS_SET(flags, CONFIG_PARSE_WARN)) + log_error_errno(r, "%s:%u: Error while reading configuration file: %m", filename, line); + + return r; + } + + line++; + + l = skip_leading_chars(buf, WHITESPACE); + if (*l != '\0' && strchr(COMMENTS, *l)) + continue; + + l = buf; + if (!bom_seen) { + char *q; + + q = startswith(buf, UTF8_BYTE_ORDER_MARK); + if (q) { + l = q; + bom_seen = true; + } + } + + if (continuation) { + if (strlen(continuation) + strlen(l) > LONG_LINE_MAX) { + if (flags & CONFIG_PARSE_WARN) + log_error("%s:%u: Continuation line too long", filename, line); + return -ENOBUFS; + } + + if (!strextend(&continuation, l)) { + if (flags & CONFIG_PARSE_WARN) + log_oom(); + return -ENOMEM; + } + + p = continuation; + } else + p = l; + + for (e = p; *e; e++) { + if (escaped) + escaped = false; + else if (*e == '\\') + escaped = true; + } + + if (escaped) { + *(e-1) = ' '; + + if (!continuation) { + continuation = strdup(l); + if (!continuation) { + if (flags & CONFIG_PARSE_WARN) + log_oom(); + return -ENOMEM; + } + } + + continue; + } + + r = parse_line(unit, + filename, + line, + sections, + lookup, + table, + flags, + 
&section, + &section_line, + &section_ignored, + p, + userdata); + if (r < 0) { + if (flags & CONFIG_PARSE_WARN) + log_warning_errno(r, "%s:%u: Failed to parse file: %m", filename, line); + return r; + } + + continuation = mfree(continuation); + } + + if (continuation) { + r = parse_line(unit, + filename, + ++line, + sections, + lookup, + table, + flags, + &section, + &section_line, + &section_ignored, + continuation, + userdata); + if (r < 0) { + if (flags & CONFIG_PARSE_WARN) + log_warning_errno(r, "%s:%u: Failed to parse file: %m", filename, line); + return r; + } + } + + if (ret_stat) + *ret_stat = st; + + return 1; +} + +int hashmap_put_stats_by_path(Hashmap **stats_by_path, const char *path, const struct stat *st) { + _cleanup_free_ struct stat *st_copy = NULL; + _cleanup_free_ char *path_copy = NULL; + int r; + + assert(stats_by_path); + assert(path); + assert(st); + + r = hashmap_ensure_allocated(stats_by_path, &path_hash_ops_free_free); + if (r < 0) + return r; + + st_copy = newdup(struct stat, st, 1); + if (!st_copy) + return -ENOMEM; + + path_copy = strdup(path); + if (!path_copy) + return -ENOMEM; + + r = hashmap_put(*stats_by_path, path_copy, st_copy); + if (r < 0) + return r; + + assert(r > 0); + TAKE_PTR(path_copy); + TAKE_PTR(st_copy); + return 0; +} + +static int config_parse_many_files( + const char* const* conf_files, + char **files, + const char *sections, + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata, + Hashmap **ret_stats_by_path) { + + _cleanup_hashmap_free_ Hashmap *stats_by_path = NULL; + _cleanup_ordered_hashmap_free_ OrderedHashmap *dropins = NULL; + _cleanup_set_free_ Set *inodes = NULL; + struct stat st; + int r; + + if (ret_stats_by_path) { + stats_by_path = hashmap_new(&path_hash_ops_free_free); + if (!stats_by_path) + return -ENOMEM; + } + + STRV_FOREACH(fn, files) { + _cleanup_free_ struct stat *st_dropin = NULL; + _cleanup_fclose_ FILE *f = NULL; + int fd; + + f = fopen(*fn, "re"); + if (!f) { + if (errno ==
ENOENT) + continue; + + return -errno; + } + + fd = fileno(f); + + r = ordered_hashmap_ensure_put(&dropins, &config_file_hash_ops_fclose, *fn, f); + if (r < 0) { + assert(r != -EEXIST); + return r; + } + assert(r > 0); + TAKE_PTR(f); + + /* Get inodes for all drop-ins. Later we'll verify if main config is a symlink to or is + * symlinked as one of them. If so, we skip reading main config file directly. */ + + st_dropin = new(struct stat, 1); + if (!st_dropin) + return -ENOMEM; + + if (fstat(fd, st_dropin) < 0) + return -errno; + + r = set_ensure_consume(&inodes, &inode_hash_ops, TAKE_PTR(st_dropin)); + if (r < 0) + return r; + } + + /* First read the first found main config file. */ + STRV_FOREACH(fn, conf_files) { + _cleanup_fclose_ FILE *f = NULL; + + f = fopen(*fn, "re"); + if (!f) { + if (errno == ENOENT) + continue; + + return -errno; + } + + if (inodes) { + if (fstat(fileno(f), &st) < 0) + return -errno; + + if (set_contains(inodes, &st)) { + log_debug("%s: symlink to/symlinked as drop-in, will be read later.", *fn); + break; + } + } + + r = config_parse(NULL, *fn, f, sections, lookup, table, flags, userdata, &st); + if (r < 0) + return r; + assert(r > 0); + + if (ret_stats_by_path) { + r = hashmap_put_stats_by_path(&stats_by_path, *fn, &st); + if (r < 0) + return r; + } + + break; + } + + /* Then read all the drop-ins. */ + + const char *path_dropin; + FILE *f_dropin; + ORDERED_HASHMAP_FOREACH_KEY(f_dropin, path_dropin, dropins) { + r = config_parse(NULL, path_dropin, f_dropin, sections, lookup, table, flags, userdata, &st); + if (r < 0) + return r; + assert(r > 0); + + if (ret_stats_by_path) { + r = hashmap_put_stats_by_path(&stats_by_path, path_dropin, &st); + if (r < 0) + return r; + } + } + + if (ret_stats_by_path) + *ret_stats_by_path = TAKE_PTR(stats_by_path); + + return 0; +} + +/* Parse one main config file located in /etc/systemd and its drop-ins, which is what all systemd daemons + * do. 
*/ +int config_parse_config_file( + const char *conf_file, + const char *sections, + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata) { + + _cleanup_strv_free_ char **dropins = NULL, **dropin_dirs = NULL; + char **conf_paths = CONF_PATHS_STRV(""); + int r; + + assert(conf_file); + + /* build the dropin dir list */ + dropin_dirs = new0(char*, strv_length(conf_paths) + 1); + if (!dropin_dirs) { + if (flags & CONFIG_PARSE_WARN) + return log_oom(); + return -ENOMEM; + } + + size_t i = 0; + STRV_FOREACH(p, conf_paths) { + char *d; + + d = strjoin(*p, "systemd/", conf_file, ".d"); + if (!d) { + if (flags & CONFIG_PARSE_WARN) + return log_oom(); + return -ENOMEM; + } + + dropin_dirs[i++] = d; + } + + r = conf_files_list_strv(&dropins, ".conf", NULL, 0, (const char**) dropin_dirs); + if (r < 0) + return r; + + const char *sysconf_file = strjoina(PKGSYSCONFDIR, "/", conf_file); + + return config_parse_many_files(STRV_MAKE_CONST(sysconf_file), dropins, + sections, lookup, table, flags, userdata, NULL); +} + +/* Parse each config file in the directories specified as strv. 
*/ +int config_parse_many( + const char* const* conf_files, + const char* const* conf_file_dirs, + const char *dropin_dirname, + const char *root, + const char *sections, + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata, + Hashmap **ret_stats_by_path, + char ***ret_dropin_files) { + + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(conf_file_dirs); + assert(dropin_dirname); + assert(sections); + assert(table); + + r = conf_files_list_dropins(&files, dropin_dirname, root, conf_file_dirs); + if (r < 0) + return r; + + r = config_parse_many_files(conf_files, files, sections, lookup, table, flags, userdata, ret_stats_by_path); + if (r < 0) + return r; + + if (ret_dropin_files) + *ret_dropin_files = TAKE_PTR(files); + + return 0; +} + +static int dropins_get_stats_by_path( + const char* conf_file, + const char* const* conf_file_dirs, + Hashmap **stats_by_path) { + + _cleanup_strv_free_ char **files = NULL; + _cleanup_free_ char *dropin_dirname = NULL; + int r; + + assert(conf_file); + assert(conf_file_dirs); + assert(stats_by_path); + + r = path_extract_filename(conf_file, &dropin_dirname); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return -EINVAL; + + if (!strextend(&dropin_dirname, ".d")) + return -ENOMEM; + + r = conf_files_list_dropins(&files, dropin_dirname, /* root = */ NULL, conf_file_dirs); + if (r < 0) + return r; + + STRV_FOREACH(fn, files) { + struct stat st; + + if (stat(*fn, &st) < 0) { + if (errno == ENOENT) + continue; + + return -errno; + } + + r = hashmap_put_stats_by_path(stats_by_path, *fn, &st); + if (r < 0) + return r; + } + + return 0; +} + +int config_get_stats_by_path( + const char *suffix, + const char *root, + unsigned flags, + const char* const* dirs, + bool check_dropins, + Hashmap **ret) { + + _cleanup_hashmap_free_ Hashmap *stats_by_path = NULL; + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(suffix); + assert(dirs); + assert(ret); + + /* Unlike config_parse(), 
this does not support stream. */ + + r = conf_files_list_strv(&files, suffix, root, flags, dirs); + if (r < 0) + return r; + + STRV_FOREACH(f, files) { + struct stat st; + + /* First read the main config file. */ + if (stat(*f, &st) < 0) { + if (errno == ENOENT) + continue; + + return -errno; + } + + r = hashmap_put_stats_by_path(&stats_by_path, *f, &st); + if (r < 0) + return r; + + if (!check_dropins) + continue; + + /* Then read all the drop-ins if requested. */ + r = dropins_get_stats_by_path(*f, dirs, &stats_by_path); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(stats_by_path); + return 0; +} + +bool stats_by_path_equal(Hashmap *a, Hashmap *b) { + struct stat *st_a, *st_b; + const char *path; + + if (hashmap_size(a) != hashmap_size(b)) + return false; + + HASHMAP_FOREACH_KEY(st_a, path, a) { + st_b = hashmap_get(b, path); + if (!st_b) + return false; + + if (!stat_inode_unmodified(st_a, st_b)) + return false; + } + + return true; +} + +static void config_section_hash_func(const ConfigSection *c, struct siphash *state) { + siphash24_compress_string(c->filename, state); + siphash24_compress(&c->line, sizeof(c->line), state); +} + +static int config_section_compare_func(const ConfigSection *x, const ConfigSection *y) { + int r; + + r = strcmp(x->filename, y->filename); + if (r != 0) + return r; + + return CMP(x->line, y->line); +} + +DEFINE_HASH_OPS(config_section_hash_ops, ConfigSection, config_section_hash_func, config_section_compare_func); + +int config_section_new(const char *filename, unsigned line, ConfigSection **ret) { + ConfigSection *cs; + + assert(filename); + assert(line > 0); + assert(ret); + + cs = malloc0(offsetof(ConfigSection, filename) + strlen(filename) + 1); + if (!cs) + return -ENOMEM; + + strcpy(cs->filename, filename); + cs->line = line; + + *ret = TAKE_PTR(cs); + return 0; +} + +int _hashmap_by_section_find_unused_line( + HashmapBase *entries_by_section, + const char *filename, + unsigned *ret) { + + ConfigSection *cs; + unsigned n = 
0; + void *entry; + + HASHMAP_BASE_FOREACH_KEY(entry, cs, entries_by_section) { + if (filename && !streq(cs->filename, filename)) + continue; + n = MAX(n, cs->line); + } + + /* overflow? */ + if (n >= UINT_MAX) + return -EFBIG; + + *ret = n + 1; + return 0; +} + +#define DEFINE_PARSER(type, vartype, conv_func) \ + DEFINE_CONFIG_PARSE_PTR(config_parse_##type, conv_func, vartype, "Failed to parse " #type " value") + +DEFINE_PARSER(int, int, safe_atoi); +DEFINE_PARSER(long, long, safe_atoli); +DEFINE_PARSER(uint8, uint8_t, safe_atou8); +DEFINE_PARSER(uint16, uint16_t, safe_atou16); +DEFINE_PARSER(uint32, uint32_t, safe_atou32); +DEFINE_PARSER(int32, int32_t, safe_atoi32); +DEFINE_PARSER(uint64, uint64_t, safe_atou64); +DEFINE_PARSER(unsigned, unsigned, safe_atou); +DEFINE_PARSER(double, double, safe_atod); +DEFINE_PARSER(nsec, nsec_t, parse_nsec); +DEFINE_PARSER(sec, usec_t, parse_sec); +DEFINE_PARSER(sec_def_infinity, usec_t, parse_sec_def_infinity); +DEFINE_PARSER(mode, mode_t, parse_mode); +DEFINE_PARSER(pid, pid_t, parse_pid); + +int config_parse_iec_size( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + size_t *sz = ASSERT_PTR(data); + uint64_t v; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_size(rvalue, 1024, &v); + if (r >= 0 && (uint64_t) (size_t) v != v) + r = -ERANGE; + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value '%s', ignoring: %m", rvalue); + return 0; + } + + *sz = (size_t) v; + return 0; +} + +int config_parse_si_uint64( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *sz = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); 
+ + r = parse_size(rvalue, 1000, sz); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value '%s', ignoring: %m", rvalue); + + return 0; +} + +int config_parse_iec_uint64( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *bytes = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_size(rvalue, 1024, bytes); + if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value, ignoring: %s", rvalue); + + return 0; +} + +int config_parse_iec_uint64_infinity( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *bytes = ASSERT_PTR(data); + + assert(rvalue); + + if (streq(rvalue, "infinity")) { + *bytes = UINT64_MAX; + return 0; + } + + return config_parse_iec_uint64(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_bool( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int k; + bool *b = ASSERT_PTR(data); + bool fatal = ltype; + + assert(filename); + assert(lvalue); + assert(rvalue); + + k = parse_boolean(rvalue); + if (k < 0) { + log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, k, + "Failed to parse boolean value%s: %s", + fatal ? "" : ", ignoring", rvalue); + return fatal ? 
-ENOEXEC : 0; + } + + *b = k; + return 0; +} + +int config_parse_id128( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + sd_id128_t *result = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = id128_from_string_nonzero(rvalue, result); + if (r == -ENXIO) + log_syntax(unit, LOG_WARNING, filename, line, r, "128-bit ID/UUID is all 0, ignoring: %s", rvalue); + else if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse 128-bit ID/UUID, ignoring: %s", rvalue); + + return 0; +} + +int config_parse_tristate( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r, *t = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* A tristate is pretty much a boolean, except that it can also take an empty string, + * indicating "uninitialized", much like NULL is for a pointer type. 
*/ + + if (isempty(rvalue)) { + *t = -1; + return 0; + } + + r = parse_tristate(rvalue, t); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse boolean value for %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + return 0; +} + +int config_parse_string( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **s = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *s = mfree(*s); + return 0; + } + + if (FLAGS_SET(ltype, CONFIG_PARSE_STRING_SAFE) && !string_is_safe(rvalue)) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(rvalue); + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified string contains unsafe characters, ignoring: %s", strna(escaped)); + return 0; + } + + if (FLAGS_SET(ltype, CONFIG_PARSE_STRING_ASCII) && !ascii_is_valid(rvalue)) { + _cleanup_free_ char *escaped = NULL; + + escaped = cescape(rvalue); + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified string contains invalid ASCII characters, ignoring: %s", strna(escaped)); + return 0; + } + + return free_and_strdup_warn(s, empty_to_null(rvalue)); +} + +int config_parse_dns_name( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **hostname = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *hostname = mfree(*hostname); + return 0; + } + + r = dns_name_is_valid(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to check validity of DNS domain name '%s', ignoring assignment: %m", rvalue); + return 0; + } + if (r == 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + 
"Specified invalid DNS domain name, ignoring assignment: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(hostname, rvalue); +} + +int config_parse_hostname( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **hostname = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *hostname = mfree(*hostname); + return 0; + } + + if (!hostname_is_valid(rvalue, 0)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Specified invalid hostname, ignoring assignment: %s", rvalue); + return 0; + } + + return config_parse_dns_name(unit, filename, line, section, section_line, + lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_path( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *n = NULL; + bool fatal = ltype; + char **s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + goto finalize; + + n = strdup(rvalue); + if (!n) + return log_oom(); + + r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE | (fatal ? PATH_CHECK_FATAL : 0), unit, filename, line, lvalue); + if (r < 0) + return fatal ? 
-ENOEXEC : 0; + +finalize: + return free_and_replace(*s, n); +} + +int config_parse_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***sv = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *sv = strv_free(*sv); + return 0; + } + + for (const char *p = rvalue;;) { + char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + r = strv_consume(sv, word); + if (r < 0) + return log_oom(); + } +} + +int config_parse_warn_compat( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Disabled reason = ltype; + + switch (reason) { + + case DISABLED_CONFIGURATION: + log_syntax(unit, LOG_DEBUG, filename, line, 0, + "Support for option %s= has been disabled at compile time and it is ignored", lvalue); + break; + + case DISABLED_LEGACY: + log_syntax(unit, LOG_INFO, filename, line, 0, + "Support for option %s= has been removed and it is ignored", lvalue); + break; + + case DISABLED_EXPERIMENTAL: + log_syntax(unit, LOG_INFO, filename, line, 0, + "Support for option %s= has not yet been enabled and it is ignored", lvalue); + break; + } + + return 0; +} + +int config_parse_log_facility( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *o = data, x; + + assert(filename); + assert(lvalue); + assert(rvalue); + 
assert(data); + + x = log_facility_unshifted_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse log facility, ignoring: %s", rvalue); + return 0; + } + + *o = (x << 3) | LOG_PRI(*o); + + return 0; +} + +int config_parse_log_level( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *o = data, x; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + x = log_level_from_string(rvalue); + if (x < 0) { + log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse log level, ignoring: %s", rvalue); + return 0; + } + + if (*o < 0) /* if it wasn't initialized so far, assume zero facility */ + *o = x; + else + *o = (*o & LOG_FACMASK) | x; + + return 0; +} + +int config_parse_signal( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *sig = data, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(sig); + + r = signal_from_string(rvalue); + if (r <= 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse signal name, ignoring: %s", rvalue); + return 0; + } + + *sig = r; + return 0; +} + +int config_parse_personality( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + unsigned long *personality = data, p; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(personality); + + if (isempty(rvalue)) + p = PERSONALITY_INVALID; + else { + p = personality_from_string(rvalue); + if (p == PERSONALITY_INVALID) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse personality, 
ignoring: %s", rvalue); + return 0; + } + } + + *personality = p; + return 0; +} + +int config_parse_ifname( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *s = mfree(*s); + return 0; + } + + if (!ifname_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Interface name is not valid or too long, ignoring assignment: %s", rvalue); + return 0; + } + + r = free_and_strdup(s, rvalue); + if (r < 0) + return log_oom(); + + return 0; +} + +int config_parse_ifnames( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **names = NULL; + char ***s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *s = strv_free(*s); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract interface name, ignoring assignment: %s", + rvalue); + return 0; + } + if (r == 0) + break; + + if (!ifname_valid_full(word, ltype)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Interface name is not valid or too long, ignoring assignment: %s", + word); + continue; + } + + r = strv_consume(&names, TAKE_PTR(word)); + if (r < 0) + return log_oom(); + } + + r = strv_extend_strv(s, names, true); + if (r < 0) + return log_oom(); + + return 0; +} + +int config_parse_ip_port( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned 
section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint16_t *s = ASSERT_PTR(data); + uint16_t port; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *s = 0; + return 0; + } + + r = parse_ip_port(rvalue, &port); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse port '%s'.", rvalue); + return 0; + } + + *s = port; + + return 0; +} + +int config_parse_mtu( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *mtu = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = parse_mtu(ltype, rvalue, mtu); + if (r == -ERANGE) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Maximum transfer unit (MTU) value out of range. Permitted range is %" PRIu32 "…%" PRIu32 ", ignoring: %s", + (uint32_t) (ltype == AF_INET6 ? 
IPV6_MIN_MTU : IPV4_MIN_MTU), (uint32_t) UINT32_MAX, + rvalue); + return 0; + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse MTU value '%s', ignoring: %m", rvalue); + return 0; + } + + return 0; +} + +int config_parse_rlimit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + struct rlimit **rl = data, d = {}; + int r; + + assert(rvalue); + assert(rl); + + r = rlimit_parse(ltype, rvalue, &d); + if (r == -EILSEQ) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Soft resource limit chosen higher than hard limit, ignoring: %s", rvalue); + return 0; + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse resource value, ignoring: %s", rvalue); + return 0; + } + + if (rl[ltype]) + *rl[ltype] = d; + else { + rl[ltype] = newdup(struct rlimit, &d, 1); + if (!rl[ltype]) + return log_oom(); + } + + return 0; +} + +int config_parse_permille( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + unsigned *permille = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_permille(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse permille value, ignoring: %s", rvalue); + return 0; + } + + *permille = (unsigned) r; + + return 0; +} + +int config_parse_vlanprotocol( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *vlan_protocol = data; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + *vlan_protocol = -1; + return 0; + } + + if (STR_IN_SET(rvalue, 
"802.1ad", "802.1AD")) + *vlan_protocol = ETH_P_8021AD; + else if (STR_IN_SET(rvalue, "802.1q", "802.1Q")) + *vlan_protocol = ETH_P_8021Q; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse VLAN protocol value, ignoring: %s", rvalue); + return 0; + } + + return 0; +} + +int config_parse_hw_addr( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + struct hw_addr_data a, *hwaddr = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *hwaddr = HW_ADDR_NULL; + return 0; + } + + r = parse_hw_addr_full(rvalue, ltype, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Not a valid hardware address, ignoring assignment: %s", rvalue); + return 0; + } + + *hwaddr = a; + return 0; +} + +int config_parse_hw_addrs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **hwaddrs = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *hwaddrs = set_free(*hwaddrs); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + _cleanup_free_ struct hw_addr_data *n = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + n = new(struct hw_addr_data, 1); + if (!n) + return log_oom(); + + r = parse_hw_addr_full(word, ltype, n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Not a valid hardware address, ignoring: %s", word); + 
continue; + } + + r = set_ensure_consume(hwaddrs, &hw_addr_hash_ops_free, TAKE_PTR(n)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_ether_addr( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ struct ether_addr *n = NULL; + struct ether_addr **hwaddr = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *hwaddr = mfree(*hwaddr); + return 0; + } + + n = new0(struct ether_addr, 1); + if (!n) + return log_oom(); + + r = parse_ether_addr(rvalue, n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Not a valid MAC address, ignoring assignment: %s", rvalue); + return 0; + } + + free_and_replace(*hwaddr, n); + + return 0; +} + +/* Splits rvalue into words with extract_first_word() and adds every word that parse_ether_addr() accepts to the Set that data points to. An empty rvalue frees the set; invalid words are logged and skipped. Always returns 0 except on allocation failure, where log_oom() is returned. */ +int config_parse_ether_addrs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **hwaddrs = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty assignment resets the list */ + *hwaddrs = set_free(*hwaddrs); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + _cleanup_free_ struct ether_addr *n = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + n = new(struct ether_addr, 1); + if (!n) + return log_oom(); + + r = parse_ether_addr(word, n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Not a valid MAC address, ignoring: %s", word); + continue; + } + + r = set_ensure_consume(hwaddrs, &ether_addr_hash_ops_free, 
TAKE_PTR(n)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_in_addr_non_null( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + /* data must be a pointer to struct in_addr or in6_addr, and the type is determined by ltype. */ + struct in_addr *ipv4 = ASSERT_PTR(data); + struct in6_addr *ipv6 = ASSERT_PTR(data); + union in_addr_union a; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(IN_SET(ltype, AF_INET, AF_INET6)); + + if (isempty(rvalue)) { + if (ltype == AF_INET) + *ipv4 = (struct in_addr) {}; + else + *ipv6 = (struct in6_addr) {}; + return 0; + } + + r = in_addr_from_string(ltype, rvalue, &a); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + if (!in_addr_is_set(ltype, &a)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "%s= cannot be the ANY address, ignoring: %s", lvalue, rvalue); + return 0; + } + + if (ltype == AF_INET) + *ipv4 = a.in; + else + *ipv6 = a.in6; + return 0; +} + +int config_parse_unsigned_bounded( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *name, + const char *value, + unsigned min, + unsigned max, + bool ignoring, + unsigned *ret) { + + int r; + + assert(filename); + assert(name); + assert(value); + assert(ret); + + r = safe_atou_bounded(value, min, max, ret); + if (r == -ERANGE) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid '%s=%s', allowed range is %u..%u%s.", + name, value, min, max, ignoring ? ", ignoring" : ""); + else if (r < 0) + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse '%s=%s'%s: %m", + name, value, ignoring ? 
", ignoring" : ""); + + if (r >= 0) + return 1; /* Return 1 if something was set */ + else if (ignoring) + return 0; + else + return r; +} + +DEFINE_CONFIG_PARSE(config_parse_percent, parse_percent, "Failed to parse percent value"); +DEFINE_CONFIG_PARSE(config_parse_permyriad, parse_permyriad, "Failed to parse permyriad value"); +DEFINE_CONFIG_PARSE_PTR(config_parse_sec_fix_0, parse_sec_fix_0, usec_t, "Failed to parse time value"); diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h new file mode 100644 index 0000000..a1768cd --- /dev/null +++ b/src/shared/conf-parser.h @@ -0,0 +1,481 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <errno.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <syslog.h> +#include <sys/stat.h> + +#include "alloc-util.h" +#include "hashmap.h" +#include "log.h" +#include "macro.h" +#include "time-util.h" + +/* An abstract parser for simple, line based, shallow configuration files consisting of variable assignments only. */ + +typedef enum ConfigParseFlags { + CONFIG_PARSE_RELAXED = 1 << 0, /* Do not warn about unknown non-extension fields */ + CONFIG_PARSE_WARN = 1 << 1, /* Emit non-debug messages */ +} ConfigParseFlags; + +/* Argument list for parsers of specific configuration settings. */ +#define CONFIG_PARSER_ARGUMENTS \ + const char *unit, \ + const char *filename, \ + unsigned line, \ + const char *section, \ + unsigned section_line, \ + const char *lvalue, \ + int ltype, \ + const char *rvalue, \ + void *data, \ + void *userdata + +/* Prototype for a parser for a specific configuration setting */ +typedef int (*ConfigParserCallback)(CONFIG_PARSER_ARGUMENTS); + +/* A macro declaring a function prototype, following the typedef above, simply because it's so cumbersomely long + * otherwise. 
(And current emacs gets irritatingly slow when editing files that contain lots of very long function + * prototypes on the same screen…) */ +#define CONFIG_PARSER_PROTOTYPE(name) int name(CONFIG_PARSER_ARGUMENTS) + +/* Wraps information for parsing a specific configuration variable, to + * be stored in a simple array */ +typedef struct ConfigTableItem { + const char *section; /* Section */ + const char *lvalue; /* Name of the variable */ + ConfigParserCallback parse; /* Function that is called to parse the variable's value */ + int ltype; /* Distinguish different variables passed to the same callback */ + void *data; /* Where to store the variable's data */ +} ConfigTableItem; + +/* Wraps information for parsing a specific configuration variable, to + * be stored in a gperf perfect hashtable */ +typedef struct ConfigPerfItem { + const char *section_and_lvalue; /* Section + "." + name of the variable */ + ConfigParserCallback parse; /* Function that is called to parse the variable's value */ + int ltype; /* Distinguish different variables passed to the same callback */ + size_t offset; /* Offset where to store data, from the beginning of userdata */ +} ConfigPerfItem; + +/* Prototype for a low-level gperf lookup function */ +typedef const ConfigPerfItem* (*ConfigPerfItemLookup)(const char *section_and_lvalue, GPERF_LEN_TYPE length); + +/* Prototype for a generic high-level lookup function */ +typedef int (*ConfigItemLookup)( + const void *table, + const char *section, + const char *lvalue, + ConfigParserCallback *ret_func, + int *ret_ltype, + void **ret_data, + void *userdata); + +/* Linear table search implementation of ConfigItemLookup, based on + * ConfigTableItem arrays */ +int config_item_table_lookup(const void *table, const char *section, const char *lvalue, ConfigParserCallback *ret_func, int *ret_ltype, void **ret_data, void *userdata); + +/* gperf implementation of ConfigItemLookup, based on gperf + * ConfigPerfItem tables */ +int 
config_item_perf_lookup(const void *table, const char *section, const char *lvalue, ConfigParserCallback *ret_func, int *ret_ltype, void **ret_data, void *userdata); + +int config_parse( + const char *unit, + const char *filename, + FILE *f, + const char *sections, /* nulstr */ + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata, + struct stat *ret_stat); /* possibly NULL */ + +int config_parse_config_file( + const char *conf_file, + const char *sections, /* nulstr */ + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata); + +int config_parse_many( + const char* const* conf_files, /* possibly empty */ + const char* const* conf_file_dirs, + const char *dropin_dirname, + const char *root, + const char *sections, /* nulstr */ + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata, + Hashmap **ret_stats_by_path, /* possibly NULL */ + char ***ret_drop_in_files); /* possibly NULL */ + +int config_get_stats_by_path( + const char *suffix, + const char *root, + unsigned flags, + const char* const* dirs, + bool check_dropins, + Hashmap **ret); + +int hashmap_put_stats_by_path(Hashmap **stats_by_path, const char *path, const struct stat *st); +bool stats_by_path_equal(Hashmap *a, Hashmap *b); + +typedef struct ConfigSection { + unsigned line; + bool invalid; + char filename[]; +} ConfigSection; + +static inline ConfigSection* config_section_free(ConfigSection *cs) { + return mfree(cs); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(ConfigSection*, config_section_free); + +int config_section_new(const char *filename, unsigned line, ConfigSection **ret); +extern const struct hash_ops config_section_hash_ops; +int _hashmap_by_section_find_unused_line( + HashmapBase *entries_by_section, + const char *filename, + unsigned *ret); +static inline int hashmap_by_section_find_unused_line( + Hashmap *entries_by_section, + const char *filename, + unsigned *ret) { + return 
_hashmap_by_section_find_unused_line(HASHMAP_BASE(entries_by_section), filename, ret); +} +static inline int ordered_hashmap_by_section_find_unused_line( + OrderedHashmap *entries_by_section, + const char *filename, + unsigned *ret) { + return _hashmap_by_section_find_unused_line(HASHMAP_BASE(entries_by_section), filename, ret); +} + +static inline bool section_is_invalid(ConfigSection *section) { + /* If this returns false, then it does _not_ mean the section is valid. */ + + if (!section) + return false; + + return section->invalid; +} + +#define DEFINE_SECTION_CLEANUP_FUNCTIONS(type, free_func) \ + static inline type* free_func##_or_set_invalid(type *p) { \ + assert(p); \ + \ + if (p->section) \ + p->section->invalid = true; \ + else \ + free_func(p); \ + return NULL; \ + } \ + DEFINE_TRIVIAL_CLEANUP_FUNC(type*, free_func); \ + DEFINE_TRIVIAL_CLEANUP_FUNC(type*, free_func##_or_set_invalid); + +CONFIG_PARSER_PROTOTYPE(config_parse_int); +CONFIG_PARSER_PROTOTYPE(config_parse_unsigned); +CONFIG_PARSER_PROTOTYPE(config_parse_long); +CONFIG_PARSER_PROTOTYPE(config_parse_uint8); +CONFIG_PARSER_PROTOTYPE(config_parse_uint16); +CONFIG_PARSER_PROTOTYPE(config_parse_uint32); +CONFIG_PARSER_PROTOTYPE(config_parse_int32); +CONFIG_PARSER_PROTOTYPE(config_parse_uint64); +CONFIG_PARSER_PROTOTYPE(config_parse_double); +CONFIG_PARSER_PROTOTYPE(config_parse_iec_size); +CONFIG_PARSER_PROTOTYPE(config_parse_si_uint64); +CONFIG_PARSER_PROTOTYPE(config_parse_iec_uint64); +CONFIG_PARSER_PROTOTYPE(config_parse_iec_uint64_infinity); +CONFIG_PARSER_PROTOTYPE(config_parse_bool); +CONFIG_PARSER_PROTOTYPE(config_parse_id128); +CONFIG_PARSER_PROTOTYPE(config_parse_tristate); +CONFIG_PARSER_PROTOTYPE(config_parse_string); +CONFIG_PARSER_PROTOTYPE(config_parse_dns_name); +CONFIG_PARSER_PROTOTYPE(config_parse_hostname); +CONFIG_PARSER_PROTOTYPE(config_parse_path); +CONFIG_PARSER_PROTOTYPE(config_parse_strv); +CONFIG_PARSER_PROTOTYPE(config_parse_sec); 
+CONFIG_PARSER_PROTOTYPE(config_parse_sec_def_infinity); +CONFIG_PARSER_PROTOTYPE(config_parse_sec_def_unset); +CONFIG_PARSER_PROTOTYPE(config_parse_nsec); +CONFIG_PARSER_PROTOTYPE(config_parse_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_warn_compat); +CONFIG_PARSER_PROTOTYPE(config_parse_log_facility); +CONFIG_PARSER_PROTOTYPE(config_parse_log_level); +CONFIG_PARSER_PROTOTYPE(config_parse_signal); +CONFIG_PARSER_PROTOTYPE(config_parse_personality); +CONFIG_PARSER_PROTOTYPE(config_parse_permille); +CONFIG_PARSER_PROTOTYPE(config_parse_ifname); +CONFIG_PARSER_PROTOTYPE(config_parse_ifnames); +CONFIG_PARSER_PROTOTYPE(config_parse_ip_port); +CONFIG_PARSER_PROTOTYPE(config_parse_mtu); +CONFIG_PARSER_PROTOTYPE(config_parse_rlimit); +CONFIG_PARSER_PROTOTYPE(config_parse_vlanprotocol); +CONFIG_PARSER_PROTOTYPE(config_parse_hw_addr); +CONFIG_PARSER_PROTOTYPE(config_parse_hw_addrs); +CONFIG_PARSER_PROTOTYPE(config_parse_ether_addr); +CONFIG_PARSER_PROTOTYPE(config_parse_ether_addrs); +CONFIG_PARSER_PROTOTYPE(config_parse_in_addr_non_null); +CONFIG_PARSER_PROTOTYPE(config_parse_percent); +CONFIG_PARSER_PROTOTYPE(config_parse_permyriad); +CONFIG_PARSER_PROTOTYPE(config_parse_pid); +CONFIG_PARSER_PROTOTYPE(config_parse_sec_fix_0); + +typedef enum Disabled { + DISABLED_CONFIGURATION, + DISABLED_LEGACY, + DISABLED_EXPERIMENTAL, +} Disabled; + +typedef enum ConfigParseStringFlags { + CONFIG_PARSE_STRING_SAFE = 1 << 0, + CONFIG_PARSE_STRING_ASCII = 1 << 1, + + CONFIG_PARSE_STRING_SAFE_AND_ASCII = CONFIG_PARSE_STRING_SAFE | CONFIG_PARSE_STRING_ASCII, +} ConfigParseStringFlags; + +#define DEFINE_CONFIG_PARSE(function, parser, msg) \ + CONFIG_PARSER_PROTOTYPE(function) { \ + int *i = data, r; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + assert(data); \ + \ + r = parser(rvalue); \ + if (r < 0) { \ + log_syntax(unit, LOG_WARNING, filename, line, r, \ + msg ", ignoring: %s", rvalue); \ + return 0; \ + } \ + \ + *i = r; \ + return 0; \ + } + +#define 
DEFINE_CONFIG_PARSE_PTR(function, parser, type, msg) \ + CONFIG_PARSER_PROTOTYPE(function) { \ + type *i = ASSERT_PTR(data); \ + int r; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + \ + r = parser(rvalue, i); \ + if (r < 0) \ + log_syntax(unit, LOG_WARNING, filename, line, r, \ + msg ", ignoring: %s", rvalue); \ + \ + return 0; \ + } + +#define DEFINE_CONFIG_PARSE_ENUM_FULL(function, from_string, type, msg) \ + CONFIG_PARSER_PROTOTYPE(function) { \ + type *i = data, x; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + assert(data); \ + \ + x = from_string(rvalue); \ + if (x < 0) { \ + log_syntax(unit, LOG_WARNING, filename, line, x, \ + msg ", ignoring: %s", rvalue); \ + return 0; \ + } \ + \ + *i = x; \ + return 0; \ + } + +#define DEFINE_CONFIG_PARSE_ENUM(function, name, type, msg) \ + DEFINE_CONFIG_PARSE_ENUM_FULL(function, name##_from_string, type, msg) + +#define DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(function, name, type, default_value, msg) \ + CONFIG_PARSER_PROTOTYPE(function) { \ + type *i = data, x; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + assert(data); \ + \ + if (isempty(rvalue)) { \ + *i = default_value; \ + return 0; \ + } \ + \ + x = name##_from_string(rvalue); \ + if (x < 0) { \ + log_syntax(unit, LOG_WARNING, filename, line, x, \ + msg ", ignoring: %s", rvalue); \ + return 0; \ + } \ + \ + *i = x; \ + return 0; \ + } + +#define DEFINE_CONFIG_PARSE_ENUMV(function, name, type, invalid, msg) \ + CONFIG_PARSER_PROTOTYPE(function) { \ + type **enums = ASSERT_PTR(data); \ + _cleanup_free_ type *xs = NULL; \ + size_t i = 0; \ + int r; \ + \ + assert(filename); \ + assert(lvalue); \ + assert(rvalue); \ + \ + xs = new0(type, 1); \ + if (!xs) \ + return -ENOMEM; \ + \ + *xs = invalid; \ + \ + for (const char *p = rvalue;;) { \ + _cleanup_free_ char *en = NULL; \ + type x, *new_xs; \ + \ + r = extract_first_word(&p, &en, NULL, 0); \ + if (r == -ENOMEM) \ + return log_oom(); \ + if (r < 0) 
{ \ + log_syntax(unit, LOG_WARNING, filename, line, r, \ + msg ", ignoring: %s", en); \ + return 0; \ + } \ + if (r == 0) \ + break; \ + \ + x = name##_from_string(en); \ + if (x < 0) { \ + log_syntax(unit, LOG_WARNING, filename, line, x, \ + msg ", ignoring: %s", en); \ + continue; \ + } \ + \ + for (type *ys = xs; x != invalid && *ys != invalid; ys++) \ + if (*ys == x) { \ + log_syntax(unit, LOG_NOTICE, filename, line, 0, \ + "Duplicate entry, ignoring: %s", \ + en); \ + x = invalid; \ + } \ + \ + if (x == invalid) \ + continue; \ + \ + *(xs + i) = x; \ + new_xs = realloc(xs, (++i + 1) * sizeof(type)); \ + if (new_xs) \ + xs = new_xs; \ + else \ + return log_oom(); \ + \ + *(xs + i) = invalid; \ + } \ + \ + return free_and_replace(*enums, xs); \ + } + +int config_parse_unsigned_bounded( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *name, + const char *value, + unsigned min, + unsigned max, + bool ignoring, + unsigned *ret); + +static inline int config_parse_uint32_bounded( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *name, + const char *value, + uint32_t min, + uint32_t max, + bool ignoring, + uint32_t *ret) { + + unsigned t; + int r; + + r = config_parse_unsigned_bounded( + unit, filename, line, section, section_line, name, value, + min, max, ignoring, + &t); + if (r <= 0) + return r; + assert(t <= UINT32_MAX); + *ret = t; + return 1; +} + +static inline int config_parse_uint16_bounded( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *name, + const char *value, + uint16_t min, + uint16_t max, + bool ignoring, + uint16_t *ret) { + + unsigned t; + int r; + + r = config_parse_unsigned_bounded( + unit, filename, line, section, section_line, name, value, + min, max, ignoring, + &t); + if (r <= 0) + return r; + assert(t <= UINT16_MAX); + *ret 
= t; + return 1; +} + +static inline int config_parse_uint8_bounded( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *name, + const char *value, + uint8_t min, + uint8_t max, + bool ignoring, + uint8_t *ret) { + + unsigned t; + int r; + + r = config_parse_unsigned_bounded( + unit, filename, line, section, section_line, name, value, + min, max, ignoring, + &t); + if (r <= 0) + return r; + assert(t <= UINT8_MAX); + *ret = t; + return 1; +} diff --git a/src/shared/copy.c b/src/shared/copy.c new file mode 100644 index 0000000..c0e30cd --- /dev/null +++ b/src/shared/copy.c @@ -0,0 +1,1635 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "chattr-util.h" +#include "copy.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "macro.h" +#include "missing_fs.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "sync-util.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "xattr-util.h" + +#define COPY_BUFFER_SIZE (16U*1024U) + +/* A safety net for descending recursively into file system trees to copy. On Linux PATH_MAX is 4096, which means the + * deepest valid path one can build is around 2048, which we hence use as a safety net here, to not spin endlessly in + * case of bind mount cycles and suchlike. 
*/ +#define COPY_DEPTH_MAX 2048U + +static ssize_t try_copy_file_range( + int fd_in, loff_t *off_in, + int fd_out, loff_t *off_out, + size_t len, + unsigned flags) { + + static int have = -1; + ssize_t r; + + if (have == 0) + return -ENOSYS; + + r = copy_file_range(fd_in, off_in, fd_out, off_out, len, flags); + if (have < 0) + have = r >= 0 || errno != ENOSYS; + if (r < 0) + return -errno; + + return r; +} + +enum { + FD_IS_NO_PIPE, + FD_IS_BLOCKING_PIPE, + FD_IS_NONBLOCKING_PIPE, +}; + +static int fd_is_nonblock_pipe(int fd) { + struct stat st; + int flags; + + /* Checks whether the specified file descriptor refers to a pipe, and if so if O_NONBLOCK is set. */ + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISFIFO(st.st_mode)) + return FD_IS_NO_PIPE; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) + return -errno; + + return FLAGS_SET(flags, O_NONBLOCK) ? FD_IS_NONBLOCKING_PIPE : FD_IS_BLOCKING_PIPE; +} + +static int look_for_signals(CopyFlags copy_flags) { + int r; + + if ((copy_flags & (COPY_SIGINT|COPY_SIGTERM)) == 0) + return 0; + + r = pop_pending_signal(copy_flags & COPY_SIGINT ? SIGINT : 0, + copy_flags & COPY_SIGTERM ? SIGTERM : 0); + if (r < 0) + return r; + if (r != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINTR), + "Got %s, cancelling copy operation.", signal_to_string(r)); + + return 0; +} + +static int create_hole(int fd, off_t size) { + off_t offset; + off_t end; + + offset = lseek(fd, 0, SEEK_CUR); + if (offset < 0) + return -errno; + + end = lseek(fd, 0, SEEK_END); + if (end < 0) + return -errno; + + /* If we're not at the end of the target file, try to punch a hole in the existing space using fallocate(). */ + + if (offset < end && + fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, MIN(size, end - offset)) < 0 && + !ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + + if (end - offset >= size) { + /* If we've created the full hole, set the file pointer to the end of the hole we created and exit. 
*/ + if (lseek(fd, offset + size, SEEK_SET) < 0) + return -errno; + + return 0; + } + + /* If we haven't created the full hole, use ftruncate() to grow the file (and the hole) to the + * required size and move the file pointer to the end of the file. */ + + size -= end - offset; + + if (ftruncate(fd, end + size) < 0) + return -errno; + + if (lseek(fd, 0, SEEK_END) < 0) + return -errno; + + return 0; +} + +int copy_bytes_full( + int fdf, int fdt, + uint64_t max_bytes, + CopyFlags copy_flags, + void **ret_remains, + size_t *ret_remains_size, + copy_progress_bytes_t progress, + void *userdata) { + + _cleanup_close_ int fdf_opened = -EBADF, fdt_opened = -EBADF; + bool try_cfr = true, try_sendfile = true, try_splice = true, copied_something = false; + int r, nonblock_pipe = -1; + size_t m = SSIZE_MAX; /* that is the maximum that sendfile and c_f_r accept */ + + assert(fdf >= 0); + assert(fdt >= 0); + assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); + + /* Tries to copy bytes from the file descriptor 'fdf' to 'fdt' in the smartest possible way. Copies a maximum + * of 'max_bytes', which may be specified as UINT64_MAX, in which case no maximum is applied. Returns negative on + * error, zero if EOF is hit before the bytes limit is hit and positive otherwise. If the copy fails for some + * reason but we read but didn't yet write some data and ret_remains/ret_remains_size is not NULL, then it will + * be initialized with an allocated buffer containing this "remaining" data. Note that these two parameters are + * initialized with a valid buffer only on failure and only if there's actually data already read. Otherwise + * these parameters if non-NULL are set to NULL.
*/ + + if (ret_remains) + *ret_remains = NULL; + if (ret_remains_size) + *ret_remains_size = 0; + + fdf = fd_reopen_condition(fdf, O_CLOEXEC | O_NOCTTY | O_RDONLY, O_PATH, &fdf_opened); + if (fdf < 0) + return fdf; + fdt = fd_reopen_condition(fdt, O_CLOEXEC | O_NOCTTY | O_RDWR, O_PATH, &fdt_opened); + if (fdt < 0) + return fdt; + + /* Try btrfs reflinks first. This only works on regular, seekable files, hence let's check the file offsets of + * source and destination first. */ + if ((copy_flags & COPY_REFLINK)) { + off_t foffset; + + foffset = lseek(fdf, 0, SEEK_CUR); + if (foffset >= 0) { + off_t toffset; + + toffset = lseek(fdt, 0, SEEK_CUR); + if (toffset >= 0) { + + if (foffset == 0 && toffset == 0 && max_bytes == UINT64_MAX) + r = reflink(fdf, fdt); /* full file reflink */ + else + r = reflink_range(fdf, foffset, fdt, toffset, max_bytes == UINT64_MAX ? 0 : max_bytes); /* partial reflink */ + if (r >= 0) { + off_t t; + + /* This worked, yay! Now — to be fully correct — let's adjust the file pointers */ + if (max_bytes == UINT64_MAX) { + + /* We cloned to the end of the source file, let's position the read + * pointer there, and query it at the same time. */ + t = lseek(fdf, 0, SEEK_END); + if (t < 0) + return -errno; + if (t < foffset) + return -ESPIPE; + + /* Let's adjust the destination file write pointer by the same number + * of bytes. 
*/ + t = lseek(fdt, toffset + (t - foffset), SEEK_SET); + if (t < 0) + return -errno; + + return 0; /* we copied the whole thing, hence hit EOF, return 0 */ + } else { + t = lseek(fdf, foffset + max_bytes, SEEK_SET); + if (t < 0) + return -errno; + + t = lseek(fdt, toffset + max_bytes, SEEK_SET); + if (t < 0) + return -errno; + + return 1; /* we copied only some number of bytes, which worked, but this means we didn't hit EOF, return 1 */ + } + } + } + } + } + + for (;;) { + ssize_t n; + + if (max_bytes <= 0) + break; + + r = look_for_signals(copy_flags); + if (r < 0) + return r; + + if (max_bytes != UINT64_MAX && m > max_bytes) + m = max_bytes; + + if (copy_flags & COPY_HOLES) { + off_t c, e; + + c = lseek(fdf, 0, SEEK_CUR); + if (c < 0) + return -errno; + + /* To see if we're in a hole, we search for the next data offset. */ + e = lseek(fdf, c, SEEK_DATA); + if (e < 0 && errno == ENXIO) + /* If errno == ENXIO, that means we've reached the final hole of the file and + * that hole isn't followed by more data. */ + e = lseek(fdf, 0, SEEK_END); + if (e < 0) + return -errno; + + /* If we're in a hole (current offset is not a data offset), create a hole of the + * same size in the target file. */ + if (e > c) { + /* Make sure our new hole doesn't go over the maximum size we're allowed to copy. */ + n = MIN(max_bytes, (uint64_t) e - c); + r = create_hole(fdt, n); + if (r < 0) + return r; + + /* Make sure holes are taken into account in the maximum size we're supposed to copy. */ + if (max_bytes != UINT64_MAX) { + max_bytes -= n; + if (max_bytes <= 0) + break; + } + + /* Update the size we're supposed to copy in this iteration if needed. */ + if (m > max_bytes) + m = max_bytes; + } + + c = e; /* Set c to the start of the data segment. */ + + /* After copying a potential hole, find the end of the data segment by looking for + * the next hole. If we get ENXIO, we're at EOF. 
*/ + e = lseek(fdf, c, SEEK_HOLE); + if (e < 0) { + if (errno == ENXIO) + break; + return -errno; + } + + /* SEEK_HOLE modifies the file offset so we need to move back to the initial offset. */ + if (lseek(fdf, c, SEEK_SET) < 0) + return -errno; + + /* Make sure we're not copying more than the current data segment. */ + m = MIN(m, (size_t) e - c); + } + + /* First try copy_file_range(), unless we already tried */ + if (try_cfr) { + n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u); + if (n < 0) { + if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF)) + return n; + + try_cfr = false; + /* use fallback below */ + } else if (n == 0) { /* likely EOF */ + + if (copied_something) + break; + + /* So, we hit EOF immediately, without having copied a single byte. This + * could indicate two things: the file is actually empty, or we are on some + * virtual file system such as procfs/sysfs where the syscall actually + * doesn't work but doesn't return an error. Try to handle that, by falling + * back to simple read()s in case we encounter empty files. + * + * See: https://lwn.net/Articles/846403/ */ + try_cfr = try_sendfile = try_splice = false; + } else + /* Success! */ + goto next; + } + + /* Then try sendfile(), unless we already tried */ + if (try_sendfile) { + n = sendfile(fdt, fdf, NULL, m); + if (n < 0) { + if (!IN_SET(errno, EINVAL, ENOSYS)) + return -errno; + + try_sendfile = false; + /* use fallback below */ + } else if (n == 0) { /* likely EOF */ + + if (copied_something) + break; + + try_sendfile = try_splice = false; /* same logic as above for copy_file_range() */ + } else + /* Success! */ + goto next; + } + + /* Then try splice, unless we already tried. */ + if (try_splice) { + + /* splice()'s asynchronous I/O support is a bit weird. When it encounters a pipe file + * descriptor, then it will ignore its O_NONBLOCK flag and instead only honour the + * SPLICE_F_NONBLOCK flag specified in its flag parameter.
Let's hide this behaviour + * here, and check if either of the specified fds are a pipe, and if so, let's pass + * the flag automatically, depending on O_NONBLOCK being set. + * + * Here's a twist though: when we use it to move data between two pipes of which one + * has O_NONBLOCK set and the other has not, then we have no individual control over + * O_NONBLOCK behaviour. Hence in that case we can't use splice() and still guarantee + * systematic O_NONBLOCK behaviour, hence don't. */ + + if (nonblock_pipe < 0) { + int a, b; + + /* Check if either of these fds is a pipe, and if so non-blocking or not */ + a = fd_is_nonblock_pipe(fdf); + if (a < 0) + return a; + + b = fd_is_nonblock_pipe(fdt); + if (b < 0) + return b; + + if ((a == FD_IS_NO_PIPE && b == FD_IS_NO_PIPE) || + (a == FD_IS_BLOCKING_PIPE && b == FD_IS_NONBLOCKING_PIPE) || + (a == FD_IS_NONBLOCKING_PIPE && b == FD_IS_BLOCKING_PIPE)) + + /* splice() only works if one of the fds is a pipe. If neither is, + * let's skip this step right-away. As mentioned above, if one of the + * two fds refers to a blocking pipe and the other to a non-blocking + * pipe, we can't use splice() either, hence don't try either. This + * hence means we can only use splice() if either only one of the two + * fds is a pipe, or if both are pipes with the same nonblocking flag + * setting. */ + + try_splice = false; + else + nonblock_pipe = a == FD_IS_NONBLOCKING_PIPE || b == FD_IS_NONBLOCKING_PIPE; + } + } + + if (try_splice) { + n = splice(fdf, NULL, fdt, NULL, m, nonblock_pipe ? SPLICE_F_NONBLOCK : 0); + if (n < 0) { + if (!IN_SET(errno, EINVAL, ENOSYS)) + return -errno; + + try_splice = false; + /* use fallback below */ + } else if (n == 0) { /* likely EOF */ + + if (copied_something) + break; + + try_splice = false; /* same logic as above for copy_file_range() + sendfile() */ + } else + /* Success! 
*/ + goto next; + } + + /* As a fallback just copy bits by hand */ + { + uint8_t buf[MIN(m, COPY_BUFFER_SIZE)], *p = buf; + ssize_t z; + + n = read(fdf, buf, sizeof buf); + if (n < 0) + return -errno; + if (n == 0) /* EOF */ + break; + + z = (size_t) n; + do { + ssize_t k; + + k = write(fdt, p, z); + if (k < 0) { + r = -errno; + + if (ret_remains) { + void *copy; + + copy = memdup(p, z); + if (!copy) + return -ENOMEM; + + *ret_remains = copy; + } + + if (ret_remains_size) + *ret_remains_size = z; + + return r; + } + + assert(k <= z); + z -= k; + p += k; + } while (z > 0); + } + + next: + if (progress) { + r = progress(n, userdata); + if (r < 0) + return r; + } + + if (max_bytes != UINT64_MAX) { + assert(max_bytes >= (uint64_t) n); + max_bytes -= n; + } + + /* sendfile accepts at most SSIZE_MAX-offset bytes to copy, so reduce our maximum by the + * amount we already copied, but don't go below our copy buffer size, unless we are close to the + * limit of bytes we are allowed to copy. */ + m = MAX(MIN(COPY_BUFFER_SIZE, max_bytes), m - n); + + copied_something = true; + } + + if (copy_flags & COPY_TRUNCATE) { + off_t off = lseek(fdt, 0, SEEK_CUR); + if (off < 0) + return -errno; + + if (ftruncate(fdt, off) < 0) + return -errno; + } + + return max_bytes <= 0; /* return 0 if we hit EOF earlier than the size limit */ +} + +static int fd_copy_symlink( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags) { + + _cleanup_free_ char *target = NULL; + int r; + + assert(from); + assert(st); + assert(to); + + r = readlinkat_malloc(df, from, &target); + if (r < 0) + return r; + + if (copy_flags & COPY_MAC_CREATE) { + r = mac_selinux_create_file_prepare_at(dt, to, S_IFLNK); + if (r < 0) + return r; + } + r = RET_NERRNO(symlinkat(target, dt, to)); + if (copy_flags & COPY_MAC_CREATE) + mac_selinux_create_file_clear(); + if (r < 0) { + if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) &&
(ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))) { + log_notice_errno(r, "Failed to copy symlink '%s', ignoring: %m", from); + return 0; + } + + return r; + } + + if (fchownat(dt, to, + uid_is_valid(override_uid) ? override_uid : st->st_uid, + gid_is_valid(override_gid) ? override_gid : st->st_gid, + AT_SYMLINK_NOFOLLOW) < 0) + r = -errno; + + (void) copy_xattr(df, from, dt, to, copy_flags); + (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW); + return r; +} + +/* Encapsulates the database we store potential hardlink targets in */ +typedef struct HardlinkContext { + int dir_fd; /* An fd to the directory we use as lookup table. Never AT_FDCWD. Lazily created, when + * we add the first entry. */ + + /* These two fields are used to create the hardlink repository directory above — via + * mkdirat(parent_fd, subdir) — and are kept so that we can automatically remove the directory again + * when we are done. */ + int parent_fd; /* Possibly AT_FDCWD */ + char *subdir; +} HardlinkContext; + +static int hardlink_context_setup( + HardlinkContext *c, + int dt, + const char *to, + CopyFlags copy_flags) { + + _cleanup_close_ int dt_copy = -EBADF; + int r; + + assert(c); + assert(c->dir_fd < 0 && c->dir_fd != AT_FDCWD); + assert(c->parent_fd < 0); + assert(!c->subdir); + + /* If hardlink recreation is requested we have to maintain a database of inodes that are potential + * hardlink sources. Given that generally disk sizes have to be assumed to be larger than what fits + * into physical RAM we cannot maintain that database in dynamic memory alone. Here we opt to + * maintain it on disk, to simplify things: inside the destination directory we'll maintain a + * temporary directory consisting of hardlinks of every inode we copied that might be subject of + * hardlinks. We can then use that as hardlink source later on. Yes, this means additional disk IO + * but thankfully Linux is optimized for this kind of thing. 
If this ever becomes a performance + * bottleneck we can certainly place an in-memory hash table in front of this, but for the beginning, + * let's keep things simple, and just use the disk as lookup table for inodes. + * + * Note that this should have zero performance impact as long as .st_nlink of all files copied remains + * <= 1, because in that case we will not actually allocate the hardlink inode lookup table directory + * on disk (we do so lazily, when the first candidate with .st_nlink > 1 is seen). This means, in the + * common case where hardlinks are not used at all or only for few files the fact that we store the + * table on disk shouldn't matter performance-wise. */ + + if (!FLAGS_SET(copy_flags, COPY_HARDLINKS)) + return 0; + + if (dt == AT_FDCWD) + dt_copy = AT_FDCWD; + else if (dt < 0) + return -EBADF; + else { + dt_copy = fcntl(dt, F_DUPFD_CLOEXEC, 3); + if (dt_copy < 0) + return -errno; + } + + r = tempfn_random_child(to, "hardlink", &c->subdir); + if (r < 0) + return r; + + c->parent_fd = TAKE_FD(dt_copy); + + /* We don't actually create the directory we keep the table in here, that's done on-demand when the + * first entry is added, using hardlink_context_realize() below. */ + return 1; +} + +static int hardlink_context_realize(HardlinkContext *c) { + if (!c) + return 0; + + if (c->dir_fd >= 0) /* Already realized */ + return 1; + + if (c->parent_fd < 0 && c->parent_fd != AT_FDCWD) /* Not configured */ + return 0; + + assert(c->subdir); + + c->dir_fd = open_mkdir_at(c->parent_fd, c->subdir, O_EXCL|O_CLOEXEC, 0700); + if (c->dir_fd < 0) + return c->dir_fd; + + return 1; +} + +static void hardlink_context_destroy(HardlinkContext *c) { + int r; + + assert(c); + + /* Automatically remove the hardlink lookup table directory again after we are done. This is used via + * _cleanup_() so that we really delete this, even on failure. */ + + if (c->dir_fd >= 0) { + /* might have already been used for reading, so we need to rewind it.
*/ + if (lseek(c->dir_fd, 0, SEEK_SET) < 0) + log_debug_errno(errno, "Failed to lseek on file descriptor, ignoring: %m"); + + r = rm_rf_children(TAKE_FD(c->dir_fd), REMOVE_PHYSICAL, NULL); /* consumes dir_fd in all cases, even on failure */ + if (r < 0) + log_debug_errno(r, "Failed to remove hardlink store (%s) contents, ignoring: %m", c->subdir); + + assert(c->parent_fd >= 0 || c->parent_fd == AT_FDCWD); + assert(c->subdir); + + if (unlinkat(c->parent_fd, c->subdir, AT_REMOVEDIR) < 0) + log_debug_errno(errno, "Failed to remove hardlink store (%s) directory, ignoring: %m", c->subdir); + } + + assert_cc(AT_FDCWD < 0); + c->parent_fd = safe_close(c->parent_fd); + + c->subdir = mfree(c->subdir); +} + +static int try_hardlink( + HardlinkContext *c, + const struct stat *st, + int dt, + const char *to) { + + char dev_ino[DECIMAL_STR_MAX(dev_t)*2 + DECIMAL_STR_MAX(uint64_t) + 4]; + + assert(st); + assert(dt >= 0 || dt == AT_FDCWD); + assert(to); + + if (!c) /* No temporary hardlink directory, don't bother */ + return 0; + + if (st->st_nlink <= 1) /* Source not hardlinked, don't bother */ + return 0; + + if (c->dir_fd < 0) /* not yet realized, hence empty */ + return 0; + + xsprintf(dev_ino, "%u:%u:%" PRIu64, major(st->st_dev), minor(st->st_dev), (uint64_t) st->st_ino); + if (linkat(c->dir_fd, dev_ino, dt, to, 0) < 0) { + if (errno != ENOENT) /* doesn't exist in store yet */ + log_debug_errno(errno, "Failed to hardlink %s to %s, ignoring: %m", dev_ino, to); + return 0; + } + + return 1; +} + +static int memorize_hardlink( + HardlinkContext *c, + const struct stat *st, + int dt, + const char *to) { + + char dev_ino[DECIMAL_STR_MAX(dev_t)*2 + DECIMAL_STR_MAX(uint64_t) + 4]; + int r; + + assert(st); + assert(dt >= 0 || dt == AT_FDCWD); + assert(to); + + if (!c) /* No temporary hardlink directory, don't bother */ + return 0; + + if (st->st_nlink <= 1) /* Source not hardlinked, don't bother */ + return 0; + + r = hardlink_context_realize(c); /* Create the hardlink store lazily 
*/ + if (r < 0) + return r; + + xsprintf(dev_ino, "%u:%u:%" PRIu64, major(st->st_dev), minor(st->st_dev), (uint64_t) st->st_ino); + if (linkat(dt, to, c->dir_fd, dev_ino, 0) < 0) { + log_debug_errno(errno, "Failed to hardlink %s to %s, ignoring: %m", to, dev_ino); + return 0; + } + + return 1; +} + +static int fd_copy_tree_generic( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + dev_t original_device, + unsigned depth_left, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + Hashmap *denylist, + Set *subvolumes, + HardlinkContext *hardlink_context, + const char *display_path, + copy_progress_path_t progress_path, + copy_progress_bytes_t progress_bytes, + void *userdata); + +static int fd_copy_regular( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + HardlinkContext *hardlink_context, + copy_progress_bytes_t progress, + void *userdata) { + + _cleanup_close_ int fdf = -EBADF, fdt = -EBADF; + int r, q; + + assert(from); + assert(st); + assert(to); + + r = try_hardlink(hardlink_context, st, dt, to); + if (r < 0) + return r; + if (r > 0) /* worked! */ + return 0; + + fdf = openat(df, from, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fdf < 0) + return -errno; + + if (copy_flags & COPY_MAC_CREATE) { + r = mac_selinux_create_file_prepare_at(dt, to, S_IFREG); + if (r < 0) + return r; + } + fdt = openat(dt, to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, st->st_mode & 07777); + if (copy_flags & COPY_MAC_CREATE) + mac_selinux_create_file_clear(); + if (fdt < 0) + return -errno; + + r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags, NULL, NULL, progress, userdata); + if (r < 0) + goto fail; + + if (fchown(fdt, + uid_is_valid(override_uid) ? override_uid : st->st_uid, + gid_is_valid(override_gid) ? 
override_gid : st->st_gid) < 0) + r = -errno; + + if (fchmod(fdt, st->st_mode & 07777) < 0) + r = -errno; + + (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim }); + (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags); + + if (copy_flags & COPY_FSYNC) { + if (fsync(fdt) < 0) { + r = -errno; + goto fail; + } + } + + q = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */ + if (q < 0) { + r = q; + goto fail; + } + + (void) memorize_hardlink(hardlink_context, st, dt, to); + return r; + +fail: + (void) unlinkat(dt, to, 0); + return r; +} + +static int fd_copy_fifo( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + HardlinkContext *hardlink_context) { + int r; + + assert(from); + assert(st); + assert(to); + + r = try_hardlink(hardlink_context, st, dt, to); + if (r < 0) + return r; + if (r > 0) /* worked! */ + return 0; + + if (copy_flags & COPY_MAC_CREATE) { + r = mac_selinux_create_file_prepare_at(dt, to, S_IFIFO); + if (r < 0) + return r; + } + r = RET_NERRNO(mkfifoat(dt, to, st->st_mode & 07777)); + if (copy_flags & COPY_MAC_CREATE) + mac_selinux_create_file_clear(); + if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) { + log_notice_errno(r, "Failed to copy fifo '%s', ignoring: %m", from); + return 0; + } else if (r < 0) + return r; + + if (fchownat(dt, to, + uid_is_valid(override_uid) ? override_uid : st->st_uid, + gid_is_valid(override_gid) ? 
override_gid : st->st_gid, + AT_SYMLINK_NOFOLLOW) < 0) + r = -errno; + + if (fchmodat(dt, to, st->st_mode & 07777, 0) < 0) + r = -errno; + + (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW); + + (void) memorize_hardlink(hardlink_context, st, dt, to); + return r; +} + +static int fd_copy_node( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + HardlinkContext *hardlink_context) { + int r; + + assert(from); + assert(st); + assert(to); + + r = try_hardlink(hardlink_context, st, dt, to); + if (r < 0) + return r; + if (r > 0) /* worked! */ + return 0; + + if (copy_flags & COPY_MAC_CREATE) { + r = mac_selinux_create_file_prepare_at(dt, to, st->st_mode & S_IFMT); + if (r < 0) + return r; + } + r = RET_NERRNO(mknodat(dt, to, st->st_mode, st->st_rdev)); + if (copy_flags & COPY_MAC_CREATE) + mac_selinux_create_file_clear(); + if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) { + log_notice_errno(r, "Failed to copy node '%s', ignoring: %m", from); + return 0; + } else if (r < 0) + return r; + + if (fchownat(dt, to, + uid_is_valid(override_uid) ? override_uid : st->st_uid, + gid_is_valid(override_gid) ? 
override_gid : st->st_gid, + AT_SYMLINK_NOFOLLOW) < 0) + r = -errno; + + if (fchmodat(dt, to, st->st_mode & 07777, 0) < 0) + r = -errno; + + (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW); + + (void) memorize_hardlink(hardlink_context, st, dt, to); + return r; +} + +static int fd_copy_directory( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + dev_t original_device, + unsigned depth_left, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + Hashmap *denylist, + Set *subvolumes, + HardlinkContext *hardlink_context, + const char *display_path, + copy_progress_path_t progress_path, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + _cleanup_(hardlink_context_destroy) HardlinkContext our_hardlink_context = { + .dir_fd = -EBADF, + .parent_fd = -EBADF, + }; + + _cleanup_close_ int fdf = -EBADF, fdt = -EBADF; + _cleanup_closedir_ DIR *d = NULL; + bool exists; + int r; + + assert(st); + assert(to); + + if (depth_left == 0) + return -ENAMETOOLONG; + + if (from) + fdf = openat(df, from, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + else + fdf = fcntl(df, F_DUPFD_CLOEXEC, 3); + if (fdf < 0) + return -errno; + + if (!hardlink_context) { + /* If recreating hardlinks is requested let's set up a context for that now. 
*/ + r = hardlink_context_setup(&our_hardlink_context, dt, to, copy_flags); + if (r < 0) + return r; + if (r > 0) /* It's enabled and allocated, let's now use the same context for all recursive + * invocations from here down */ + hardlink_context = &our_hardlink_context; + } + + d = take_fdopendir(&fdf); + if (!d) + return -errno; + + r = dir_is_empty_at(dt, to, /* ignore_hidden_or_backup= */ false); + if (r < 0 && r != -ENOENT) + return r; + if ((r > 0 && !(copy_flags & (COPY_MERGE|COPY_MERGE_EMPTY))) || (r == 0 && !FLAGS_SET(copy_flags, COPY_MERGE))) + return -EEXIST; + + exists = r >= 0; + + fdt = xopenat_lock(dt, to, + O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|(exists ? 0 : O_CREAT|O_EXCL), + (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0)|(set_contains(subvolumes, st) ? XO_SUBVOLUME : 0), + st->st_mode & 07777, + copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE, + LOCK_EX); + if (fdt < 0) + return fdt; + + r = 0; + + if (PTR_TO_INT(hashmap_get(denylist, st)) == DENY_CONTENTS) { + log_debug("%s is in the denylist, not recursing", from); + goto finish; + } + + FOREACH_DIRENT_ALL(de, d, return -errno) { + const char *child_display_path = NULL; + _cleanup_free_ char *dp = NULL; + struct stat buf; + int q; + + if (dot_or_dot_dot(de->d_name)) + continue; + + r = look_for_signals(copy_flags); + if (r < 0) + return r; + + if (fstatat(dirfd(d), de->d_name, &buf, AT_SYMLINK_NOFOLLOW) < 0) { + r = -errno; + continue; + } + + if (progress_path) { + if (display_path) + child_display_path = dp = path_join(display_path, de->d_name); + else + child_display_path = de->d_name; + + r = progress_path(child_display_path, &buf, userdata); + if (r < 0) + return r; + } + + if (PTR_TO_INT(hashmap_get(denylist, &buf)) == DENY_INODE) { + log_debug("%s/%s is in the denylist, ignoring", from, de->d_name); + continue; + } + + if (S_ISDIR(buf.st_mode)) { + /* + * Don't descend into directories on other file systems, if this is requested. 
We do a simple + * .st_dev check here, which basically comes for free. Note that we do this check only on + * directories, not other kind of file system objects, for two reason: + * + * • The kernel's overlayfs pseudo file system that overlays multiple real file systems + * propagates the .st_dev field of the file system a file originates from all the way up + * through the stack to stat(). It doesn't do that for directories however. This means that + * comparing .st_dev on non-directories suggests that they all are mount points. To avoid + * confusion we hence avoid relying on this check for regular files. + * + * • The main reason we do this check at all is to protect ourselves from bind mount cycles, + * where we really want to avoid descending down in all eternity. However the .st_dev check + * is usually not sufficient for this protection anyway, as bind mount cycles from the same + * file system onto itself can't be detected that way. (Note we also do a recursion depth + * check, which is probably the better protection in this regard, which is why + * COPY_SAME_MOUNT is optional). + */ + + if (FLAGS_SET(copy_flags, COPY_SAME_MOUNT)) { + if (buf.st_dev != original_device) + continue; + + r = fd_is_mount_point(dirfd(d), de->d_name, 0); + if (r < 0) + return r; + if (r > 0) + continue; + } + } + + q = fd_copy_tree_generic(dirfd(d), de->d_name, &buf, fdt, de->d_name, original_device, + depth_left-1, override_uid, override_gid, copy_flags & ~COPY_LOCK_BSD, + denylist, subvolumes, hardlink_context, child_display_path, progress_path, + progress_bytes, userdata); + + if (q == -EINTR) /* Propagate SIGINT/SIGTERM up instantly */ + return q; + if (q == -EEXIST && (copy_flags & COPY_MERGE)) + q = 0; + if (q < 0) + r = q; + } + +finish: + if (!exists) { + if (fchown(fdt, + uid_is_valid(override_uid) ? override_uid : st->st_uid, + gid_is_valid(override_gid) ? 
override_gid : st->st_gid) < 0) + r = -errno; + + if (fchmod(fdt, st->st_mode & 07777) < 0) + r = -errno; + + (void) copy_xattr(dirfd(d), NULL, fdt, NULL, copy_flags); + (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim }); + } + + if (copy_flags & COPY_FSYNC_FULL) { + if (fsync(fdt) < 0) + return -errno; + } + + if (r < 0) + return r; + + return copy_flags & COPY_LOCK_BSD ? TAKE_FD(fdt) : 0; +} + +static int fd_copy_leaf( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + HardlinkContext *hardlink_context, + const char *display_path, + copy_progress_bytes_t progress_bytes, + void *userdata) { + int r; + + if (S_ISREG(st->st_mode)) + r = fd_copy_regular(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, progress_bytes, userdata); + else if (S_ISLNK(st->st_mode)) + r = fd_copy_symlink(df, from, st, dt, to, override_uid, override_gid, copy_flags); + else if (S_ISFIFO(st->st_mode)) + r = fd_copy_fifo(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context); + else if (S_ISBLK(st->st_mode) || S_ISCHR(st->st_mode) || S_ISSOCK(st->st_mode)) + r = fd_copy_node(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context); + else + r = -EOPNOTSUPP; + + return r; +} + +static int fd_copy_tree_generic( + int df, + const char *from, + const struct stat *st, + int dt, + const char *to, + dev_t original_device, + unsigned depth_left, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + Hashmap *denylist, + Set *subvolumes, + HardlinkContext *hardlink_context, + const char *display_path, + copy_progress_path_t progress_path, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + int r; + + assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); + + if (S_ISDIR(st->st_mode)) + return fd_copy_directory(df, from, st, dt, to, original_device, depth_left-1, override_uid, + 
override_gid, copy_flags, denylist, subvolumes, hardlink_context, + display_path, progress_path, progress_bytes, userdata); + + DenyType t = PTR_TO_INT(hashmap_get(denylist, st)); + if (t == DENY_INODE) { + log_debug("%s is in the denylist, ignoring", from); + return 0; + } else if (t == DENY_CONTENTS) + log_debug("%s is configured to have its contents excluded, but is not a directory", from); + + r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata); + /* We just tried to copy a leaf node of the tree. If it failed because the node already exists *and* the COPY_REPLACE flag has been provided, we should unlink the node and re-copy. */ + if (r == -EEXIST && (copy_flags & COPY_REPLACE)) { + /* This codepath is us trying to address an error to copy, if the unlink fails, lets just return the original error. */ + if (unlinkat(dt, to, 0) < 0) + return r; + + r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata); + } + + return r; +} + +int copy_tree_at_full( + int fdf, + const char *from, + int fdt, + const char *to, + uid_t override_uid, + gid_t override_gid, + CopyFlags copy_flags, + Hashmap *denylist, + Set *subvolumes, + copy_progress_path_t progress_path, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + struct stat st; + int r; + + assert(from); + assert(to); + assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); + + if (fstatat(fdf, from, &st, AT_SYMLINK_NOFOLLOW) < 0) + return -errno; + + r = fd_copy_tree_generic(fdf, from, &st, fdt, to, st.st_dev, COPY_DEPTH_MAX, override_uid, + override_gid, copy_flags, denylist, subvolumes, NULL, NULL, progress_path, + progress_bytes, userdata); + if (r < 0) + return r; + + if (S_ISDIR(st.st_mode) && (copy_flags & COPY_SYNCFS)) { + /* If the top-level inode is a directory run syncfs() now. 
*/ + r = syncfs_path(fdt, to); + if (r < 0) + return r; + } else if ((copy_flags & (COPY_FSYNC_FULL|COPY_SYNCFS)) != 0) { + /* fsync() the parent dir of what we just copied if COPY_FSYNC_FULL is set. Also do this in + * case COPY_SYNCFS is set but the top-level inode wasn't actually a directory. We do this so that + * COPY_SYNCFS provides reasonable synchronization semantics on any kind of inode: when the + * copy operation is done the whole inode — regardless of its type — and all its children + * will be synchronized to disk. */ + r = fsync_parent_at(fdt, to); + if (r < 0) + return r; + } + + return 0; +} + +static int sync_dir_by_flags(int dir_fd, const char *path, CopyFlags copy_flags) { + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + if (copy_flags & COPY_SYNCFS) + return syncfs_path(dir_fd, path); + if (copy_flags & COPY_FSYNC_FULL) + return fsync_parent_at(dir_fd, path); + + return 0; +} + +int copy_directory_at_full( + int dir_fdf, + const char *from, + int dir_fdt, + const char *to, + CopyFlags copy_flags, + copy_progress_path_t progress_path, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + _cleanup_close_ int fdt = -EBADF; + struct stat st; + int r; + + assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD); + assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD); + assert(to); + + if (fstatat(dir_fdf, strempty(from), &st, AT_SYMLINK_NOFOLLOW|(isempty(from) ? AT_EMPTY_PATH : 0)) < 0) + return -errno; + + r = stat_verify_directory(&st); + if (r < 0) + return r; + + r = fd_copy_directory( + dir_fdf, from, + &st, + dir_fdt, to, + st.st_dev, + COPY_DEPTH_MAX, + UID_INVALID, GID_INVALID, + copy_flags, + NULL, NULL, NULL, NULL, + progress_path, + progress_bytes, + userdata); + if (r < 0) + return r; + + if (FLAGS_SET(copy_flags, COPY_LOCK_BSD)) + fdt = r; + + r = sync_dir_by_flags(dir_fdt, to, copy_flags); + if (r < 0) + return r; + + return FLAGS_SET(copy_flags, COPY_LOCK_BSD) ? 
TAKE_FD(fdt) : 0; +} + +int copy_file_fd_at_full( + int dir_fdf, + const char *from, + int fdt, + CopyFlags copy_flags, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + _cleanup_close_ int fdf = -EBADF; + struct stat st; + int r; + + assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD); + assert(from); + assert(fdt >= 0); + assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); + + fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fdf < 0) + return -errno; + + r = fd_verify_regular(fdf); + if (r < 0) + return r; + + if (fstat(fdt, &st) < 0) + return -errno; + + r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags, NULL, NULL, progress_bytes, userdata); + if (r < 0) + return r; + + /* Make sure to copy file attributes only over if target is a regular + * file (so that copying a file to /dev/null won't alter the access + * mode/ownership of that device node...) */ + if (S_ISREG(st.st_mode)) { + (void) copy_times(fdf, fdt, copy_flags); + (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags); + } + + if (copy_flags & COPY_FSYNC_FULL) { + r = fsync_full(fdt); + if (r < 0) + return r; + } else if (copy_flags & COPY_FSYNC) { + if (fsync(fdt) < 0) + return -errno; + } + + return 0; +} + +int copy_file_at_full( + int dir_fdf, + const char *from, + int dir_fdt, + const char *to, + int flags, + mode_t mode, + unsigned chattr_flags, + unsigned chattr_mask, + CopyFlags copy_flags, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + _cleanup_close_ int fdf = -EBADF, fdt = -EBADF; + struct stat st; + int r; + + assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD); + assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD); + assert(from); + assert(to); + + fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fdf < 0) + return -errno; + + if (fstat(fdf, &st) < 0) + return -errno; + + r = stat_verify_regular(&st); + if (r < 0) + return r; + + WITH_UMASK(0000) { + fdt = xopenat_lock(dir_fdt, to, + flags|O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, + (copy_flags & COPY_MAC_CREATE 
? XO_LABEL : 0), + mode != MODE_INVALID ? mode : st.st_mode, + copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE, LOCK_EX); + if (fdt < 0) + return fdt; + } + + if (!FLAGS_SET(flags, O_EXCL)) { /* if O_EXCL was used we created the thing as regular file, no need to check again */ + r = fd_verify_regular(fdt); + if (r < 0) + goto fail; + } + + if (chattr_mask != 0) + (void) chattr_fd(fdt, chattr_flags, chattr_mask & CHATTR_EARLY_FL, NULL); + + r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags & ~COPY_LOCK_BSD, NULL, NULL, progress_bytes, userdata); + if (r < 0) + goto fail; + + (void) copy_times(fdf, fdt, copy_flags); + (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags); + + if (chattr_mask != 0) + (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL); + + if (copy_flags & (COPY_FSYNC|COPY_FSYNC_FULL)) { + if (fsync(fdt) < 0) { + r = -errno; + goto fail; + } + } + + if (!FLAGS_SET(copy_flags, COPY_LOCK_BSD)) { + r = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */ + if (r < 0) + goto fail; + } + + if (copy_flags & COPY_FSYNC_FULL) { + r = fsync_parent_at(dir_fdt, to); + if (r < 0) + goto fail; + } + + return copy_flags & COPY_LOCK_BSD ? 
TAKE_FD(fdt) : 0; + +fail: + /* Only unlink if we definitely are the ones who created the file */ + if (FLAGS_SET(flags, O_EXCL)) + (void) unlinkat(dir_fdt, to, 0); + + return r; +} + +int copy_file_atomic_at_full( + int dir_fdf, + const char *from, + int dir_fdt, + const char *to, + mode_t mode, + unsigned chattr_flags, + unsigned chattr_mask, + CopyFlags copy_flags, + copy_progress_bytes_t progress_bytes, + void *userdata) { + + _cleanup_(unlink_and_freep) char *t = NULL; + _cleanup_close_ int fdt = -EBADF; + int r; + + assert(from); + assert(to); + assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD)); + + if (copy_flags & COPY_MAC_CREATE) { + r = mac_selinux_create_file_prepare_at(dir_fdt, to, S_IFREG); + if (r < 0) + return r; + } + fdt = open_tmpfile_linkable_at(dir_fdt, to, O_WRONLY|O_CLOEXEC, &t); + if (copy_flags & COPY_MAC_CREATE) + mac_selinux_create_file_clear(); + if (fdt < 0) + return fdt; + + if (chattr_mask != 0) + (void) chattr_fd(fdt, chattr_flags, chattr_mask & CHATTR_EARLY_FL, NULL); + + r = copy_file_fd_at_full(dir_fdf, from, fdt, copy_flags, progress_bytes, userdata); + if (r < 0) + return r; + + if (fchmod(fdt, mode) < 0) + return -errno; + + if ((copy_flags & (COPY_FSYNC|COPY_FSYNC_FULL))) { + /* Sync the file */ + if (fsync(fdt) < 0) + return -errno; + } + + r = link_tmpfile_at(fdt, dir_fdt, t, to, (copy_flags & COPY_REPLACE) ? 
LINK_TMPFILE_REPLACE : 0); + if (r < 0) + return r; + + t = mfree(t); + + if (chattr_mask != 0) + (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL); + + r = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */ + if (r < 0) + goto fail; + + if (copy_flags & COPY_FSYNC_FULL) { + /* Sync the parent directory */ + r = fsync_parent_at(dir_fdt, to); + if (r < 0) + goto fail; + } + + return 0; + +fail: + (void) unlinkat(dir_fdt, to, 0); + return r; +} + +int copy_times(int fdf, int fdt, CopyFlags flags) { + struct stat st; + + assert(fdf >= 0); + assert(fdt >= 0); + + if (fstat(fdf, &st) < 0) + return -errno; + + if (futimens(fdt, (struct timespec[2]) { st.st_atim, st.st_mtim }) < 0) + return -errno; + + if (FLAGS_SET(flags, COPY_CRTIME)) { + usec_t crtime; + + if (fd_getcrtime(fdf, &crtime) >= 0) + (void) fd_setcrtime(fdt, crtime); + } + + return 0; +} + +int copy_access(int fdf, int fdt) { + struct stat st; + + assert(fdf >= 0); + assert(fdt >= 0); + + /* Copies just the access mode (and not the ownership) from fdf to fdt */ + + if (fstat(fdf, &st) < 0) + return -errno; + + return RET_NERRNO(fchmod(fdt, st.st_mode & 07777)); +} + +int copy_rights_with_fallback(int fdf, int fdt, const char *patht) { + struct stat st; + + assert(fdf >= 0); + assert(fdt >= 0); + + /* Copies both access mode and ownership from fdf to fdt */ + + if (fstat(fdf, &st) < 0) + return -errno; + + return fchmod_and_chown_with_fallback(fdt, patht, st.st_mode & 07777, st.st_uid, st.st_gid); +} + +int copy_xattr(int df, const char *from, int dt, const char *to, CopyFlags copy_flags) { + _cleanup_free_ char *names = NULL; + int ret = 0, r; + + r = listxattr_at_malloc(df, from, 0, &names); + if (r < 0) + return r; + + NULSTR_FOREACH(p, names) { + _cleanup_free_ char *value = NULL; + + if (!FLAGS_SET(copy_flags, COPY_ALL_XATTRS) && !startswith(p, "user.")) + continue; + + r = getxattr_at_malloc(df, from, p, 0, &value); + if (r == -ENODATA) + continue; /* 
gone by now */ + if (r < 0) + return r; + + if (xsetxattr(dt, to, p, value, r, 0) < 0) + ret = -errno; + } + + return ret; +} + +int reflink(int infd, int outfd) { + int r; + + assert(infd >= 0); + assert(outfd >= 0); + + /* Make sure we invoke the ioctl on a regular file, so that no device driver accidentally gets it. */ + + r = fd_verify_regular(outfd); + if (r < 0) + return r; + + /* FICLONE was introduced in Linux 4.5 but it uses the same number as BTRFS_IOC_CLONE introduced earlier */ + + assert_cc(FICLONE == BTRFS_IOC_CLONE); + + return RET_NERRNO(ioctl(outfd, FICLONE, infd)); +} + +assert_cc(sizeof(struct file_clone_range) == sizeof(struct btrfs_ioctl_clone_range_args)); + +int reflink_range(int infd, uint64_t in_offset, int outfd, uint64_t out_offset, uint64_t sz) { + struct file_clone_range args = { + .src_fd = infd, + .src_offset = in_offset, + .src_length = sz, + .dest_offset = out_offset, + }; + int r; + + assert(infd >= 0); + assert(outfd >= 0); + + /* Inside the kernel, FICLONE is identical to FICLONERANGE with offsets and size set to zero, let's + * simplify things and use the simple ioctl in that case. Also, do the same if the size is + * UINT64_MAX, which is how we usually encode "everything". 
*/ + if (in_offset == 0 && out_offset == 0 && IN_SET(sz, 0, UINT64_MAX)) + return reflink(infd, outfd); + + r = fd_verify_regular(outfd); + if (r < 0) + return r; + + assert_cc(FICLONERANGE == BTRFS_IOC_CLONE_RANGE); + + return RET_NERRNO(ioctl(outfd, FICLONERANGE, &args)); +} diff --git a/src/shared/copy.h b/src/shared/copy.h new file mode 100644 index 0000000..d842edd --- /dev/null +++ b/src/shared/copy.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "set.h" + +typedef enum CopyFlags { + COPY_REFLINK = 1 << 0, /* Try to reflink */ + COPY_MERGE = 1 << 1, /* Merge existing trees with our new one to copy */ + COPY_REPLACE = 1 << 2, /* Replace an existing file if there's one */ + COPY_SAME_MOUNT = 1 << 3, /* Don't descend recursively into other file systems, across mount point boundaries */ + COPY_MERGE_EMPTY = 1 << 4, /* Merge an existing, empty directory with our new tree to copy */ + COPY_CRTIME = 1 << 5, /* Generate a user.crtime_usec xattr off the source crtime if there is one, on copying */ + COPY_SIGINT = 1 << 6, /* Check for SIGINT regularly and return EINTR if seen (caller needs to block SIGINT) */ + COPY_SIGTERM = 1 << 7, /* ditto, but for SIGTERM */ + COPY_MAC_CREATE = 1 << 8, /* Create files with the correct MAC label (currently SELinux only) */ + COPY_HARDLINKS = 1 << 9, /* Try to reproduce hard links */ + COPY_FSYNC = 1 << 10, /* fsync() after we are done */ + COPY_FSYNC_FULL = 1 << 11, /* fsync_full() after we are done */ + COPY_SYNCFS = 1 << 12, /* syncfs() the *top-level* dir after we are done */ + COPY_ALL_XATTRS = 1 << 13, /* Preserve all xattrs when copying, not just those in the user namespace */ + COPY_HOLES = 1 << 14, /* Copy holes */ + COPY_GRACEFUL_WARN = 1 << 15, /* Skip copying file types that aren't supported by the target filesystem */ + COPY_TRUNCATE = 1 << 16, /* Truncate to current file offset after copying */ + 
COPY_LOCK_BSD = 1 << 17, /* Return a BSD exclusively locked file descriptor referring to the copied image/directory. */ +} CopyFlags; + +typedef enum DenyType { + DENY_DONT = 0, /* we want INT_TO_PTR(DENY_DONT) to map to NULL */ + DENY_INODE, + DENY_CONTENTS, + _DENY_TYPE_MAX, + _DENY_TYPE_INVALID = -EINVAL, +} DenyType; + +typedef int (*copy_progress_bytes_t)(uint64_t n_bytes, void *userdata); +typedef int (*copy_progress_path_t)(const char *path, const struct stat *st, void *userdata); + +int copy_file_fd_at_full(int dir_fdf, const char *from, int to, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata); +static inline int copy_file_fd_at(int dir_fdf, const char *from, int to, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata) { + return copy_file_fd_at_full(dir_fdf, from, to, copy_flags, progress, userdata); +} +static inline int copy_file_fd_full(const char *from, int to, CopyFlags copy_flags) { + return copy_file_fd_at_full(AT_FDCWD, from, to, copy_flags, NULL, NULL); +} +static inline int copy_file_fd(const char *from, int to, CopyFlags copy_flags) { + return copy_file_fd_at(AT_FDCWD, from, to, copy_flags, NULL, NULL); +} + +int copy_file_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, int open_flags, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata); +static inline int copy_file_at(int dir_fdf, const char *from, int dir_fdt, const char *to, int open_flags, mode_t mode, CopyFlags copy_flags) { + return copy_file_at_full(dir_fdf, from, dir_fdt, to, open_flags, mode, 0, 0, copy_flags, NULL, NULL); +} +static inline int copy_file_full(const char *from, const char *to, int open_flags, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata) { + return copy_file_at_full(AT_FDCWD, from, AT_FDCWD, to, open_flags, mode, chattr_flags, chattr_mask, copy_flags, 
progress, userdata); +} +static inline int copy_file(const char *from, const char *to, int open_flags, mode_t mode, CopyFlags copy_flags) { + return copy_file_at(AT_FDCWD, from, AT_FDCWD, to, open_flags, mode, copy_flags); +} + +int copy_file_atomic_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata); +static inline int copy_file_atomic_at(int dir_fdf, const char *from, int dir_fdt, const char *to, mode_t mode, CopyFlags copy_flags) { + return copy_file_atomic_at_full(dir_fdf, from, dir_fdt, to, mode, 0, 0, copy_flags, NULL, NULL); +} +static inline int copy_file_atomic_full(const char *from, const char *to, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata) { + return copy_file_atomic_at_full(AT_FDCWD, from, AT_FDCWD, to, mode, chattr_flags, chattr_mask, copy_flags, progress, userdata); +} +static inline int copy_file_atomic(const char *from, const char *to, mode_t mode, CopyFlags copy_flags) { + return copy_file_atomic_full(from, to, mode, 0, 0, copy_flags, NULL, NULL); +} + +int copy_tree_at_full(int fdf, const char *from, int fdt, const char *to, uid_t override_uid, gid_t override_gid, CopyFlags copy_flags, Hashmap *denylist, Set *subvolumes, copy_progress_path_t progress_path, copy_progress_bytes_t progress_bytes, void *userdata); +static inline int copy_tree_at(int fdf, const char *from, int fdt, const char *to, uid_t override_uid, gid_t override_gid, CopyFlags copy_flags, Hashmap *denylist, Set *subvolumes) { + return copy_tree_at_full(fdf, from, fdt, to, override_uid, override_gid, copy_flags, denylist, subvolumes, NULL, NULL, NULL); +} +static inline int copy_tree(const char *from, const char *to, uid_t override_uid, gid_t override_gid, CopyFlags copy_flags, Hashmap *denylist, Set *subvolumes) { + return copy_tree_at_full(AT_FDCWD, from, 
AT_FDCWD, to, override_uid, override_gid, copy_flags, denylist, subvolumes, NULL, NULL, NULL); +} + +int copy_directory_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, CopyFlags copy_flags, copy_progress_path_t progress_path, copy_progress_bytes_t progress_bytes, void *userdata); +static inline int copy_directory_at(int dir_fdf, const char *from, int dir_fdt, const char *to, CopyFlags copy_flags) { + return copy_directory_at_full(dir_fdf, from, dir_fdt, to, copy_flags, NULL, NULL, NULL); +} + +int copy_bytes_full(int fdf, int fdt, uint64_t max_bytes, CopyFlags copy_flags, void **ret_remains, size_t *ret_remains_size, copy_progress_bytes_t progress, void *userdata); +static inline int copy_bytes(int fdf, int fdt, uint64_t max_bytes, CopyFlags copy_flags) { + return copy_bytes_full(fdf, fdt, max_bytes, copy_flags, NULL, NULL, NULL, NULL); +} + +int copy_times(int fdf, int fdt, CopyFlags flags); +int copy_access(int fdf, int fdt); +int copy_rights_with_fallback(int fdf, int fdt, const char *patht); +static inline int copy_rights(int fdf, int fdt) { + return copy_rights_with_fallback(fdf, fdt, NULL); /* no fallback */ +} +int copy_xattr(int df, const char *from, int dt, const char *to, CopyFlags copy_flags); + +int reflink(int infd, int outfd); +int reflink_range(int infd, uint64_t in_offset, int outfd, uint64_t out_offset, uint64_t sz); diff --git a/src/shared/coredump-util.c b/src/shared/coredump-util.c new file mode 100644 index 0000000..805503f --- /dev/null +++ b/src/shared/coredump-util.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "coredump-util.h" +#include "extract-word.h" +#include "fileio.h" +#include "string-table.h" +#include "unaligned.h" +#include "virt.h" + +static const char *const coredump_filter_table[_COREDUMP_FILTER_MAX] = { + [COREDUMP_FILTER_PRIVATE_ANONYMOUS] = "private-anonymous", + [COREDUMP_FILTER_SHARED_ANONYMOUS] = "shared-anonymous", + [COREDUMP_FILTER_PRIVATE_FILE_BACKED] 
= "private-file-backed", + [COREDUMP_FILTER_SHARED_FILE_BACKED] = "shared-file-backed", + [COREDUMP_FILTER_ELF_HEADERS] = "elf-headers", + [COREDUMP_FILTER_PRIVATE_HUGE] = "private-huge", + [COREDUMP_FILTER_SHARED_HUGE] = "shared-huge", + [COREDUMP_FILTER_PRIVATE_DAX] = "private-dax", + [COREDUMP_FILTER_SHARED_DAX] = "shared-dax", +}; + +DEFINE_STRING_TABLE_LOOKUP(coredump_filter, CoredumpFilter); + +int coredump_filter_mask_from_string(const char *s, uint64_t *ret) { + uint64_t m = 0; + + assert(s); + assert(ret); + + for (;;) { + _cleanup_free_ char *n = NULL; + CoredumpFilter v; + int r; + + r = extract_first_word(&s, &n, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + if (streq(n, "default")) { + m |= COREDUMP_FILTER_MASK_DEFAULT; + continue; + } + + if (streq(n, "all")) { + m = COREDUMP_FILTER_MASK_ALL; + continue; + } + + v = coredump_filter_from_string(n); + if (v >= 0) { + m |= 1u << v; + continue; + } + + uint64_t x; + r = safe_atoux64(n, &x); + if (r < 0) + return r; + + m |= x; + } + + *ret = m; + return 0; +} + +#define _DEFINE_PARSE_AUXV(size, type, unaligned_read) \ + static int parse_auxv##size( \ + int log_level, \ + const void *auxv, \ + size_t size_bytes, \ + int *at_secure, \ + uid_t *uid, \ + uid_t *euid, \ + gid_t *gid, \ + gid_t *egid) { \ + \ + assert(auxv || size_bytes == 0); \ + assert(at_secure); \ + assert(uid); \ + assert(euid); \ + assert(gid); \ + assert(egid); \ + \ + if (size_bytes % (2 * sizeof(type)) != 0) \ + return log_full_errno(log_level, \ + SYNTHETIC_ERRNO(EIO), \ + "Incomplete auxv structure (%zu bytes).", \ + size_bytes); \ + \ + size_t words = size_bytes / sizeof(type); \ + \ + /* Note that we set output variables even on error. 
/* Dispatches to the 64-bit or 32-bit auxv parser according to 'elf_class' (ELFCLASS64 or ELFCLASS32),
 * forwarding all other arguments unchanged. Any other class is logged at 'log_level' and reported via a
 * synthetic EPROTONOSUPPORT error. */
int parse_auxv(int log_level,
               uint8_t elf_class,
               const void *auxv,
               size_t size_bytes,
               int *at_secure,
               uid_t *uid,
               uid_t *euid,
               gid_t *gid,
               gid_t *egid) {

        if (elf_class == ELFCLASS64)
                return parse_auxv64(log_level, auxv, size_bytes, at_secure, uid, euid, gid, egid);

        if (elf_class == ELFCLASS32)
                return parse_auxv32(log_level, auxv, size_bytes, at_secure, uid, euid, gid, egid);

        return log_full_errno(log_level, SYNTHETIC_ERRNO(EPROTONOSUPPORT),
                              "Unknown ELF class %d.", elf_class);
}
*/ +void disable_coredumps(void) { + int r; + + if (detect_container() > 0) + return; + + r = write_string_file("/proc/sys/kernel/core_pattern", "|/bin/false", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_debug_errno(r, "Failed to turn off coredumps, ignoring: %m"); +} diff --git a/src/shared/coredump-util.h b/src/shared/coredump-util.h new file mode 100644 index 0000000..4f54bb9 --- /dev/null +++ b/src/shared/coredump-util.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +typedef enum CoredumpFilter { + COREDUMP_FILTER_PRIVATE_ANONYMOUS = 0, + COREDUMP_FILTER_SHARED_ANONYMOUS, + COREDUMP_FILTER_PRIVATE_FILE_BACKED, + COREDUMP_FILTER_SHARED_FILE_BACKED, + COREDUMP_FILTER_ELF_HEADERS, + COREDUMP_FILTER_PRIVATE_HUGE, + COREDUMP_FILTER_SHARED_HUGE, + COREDUMP_FILTER_PRIVATE_DAX, + COREDUMP_FILTER_SHARED_DAX, + _COREDUMP_FILTER_MAX, + _COREDUMP_FILTER_INVALID = -EINVAL, +} CoredumpFilter; + +#define COREDUMP_FILTER_MASK_DEFAULT (1u << COREDUMP_FILTER_PRIVATE_ANONYMOUS | \ + 1u << COREDUMP_FILTER_SHARED_ANONYMOUS | \ + 1u << COREDUMP_FILTER_ELF_HEADERS | \ + 1u << COREDUMP_FILTER_PRIVATE_HUGE) + +/* The kernel doesn't like UINT64_MAX and returns ERANGE, use UINT32_MAX to support future new flags */ +#define COREDUMP_FILTER_MASK_ALL UINT32_MAX + +const char* coredump_filter_to_string(CoredumpFilter i) _const_; +CoredumpFilter coredump_filter_from_string(const char *s) _pure_; +int coredump_filter_mask_from_string(const char *s, uint64_t *ret); + +int parse_auxv(int log_level, + uint8_t elf_class, + const void *auxv, + size_t size_bytes, + int *at_secure, + uid_t *uid, + uid_t *euid, + gid_t *gid, + gid_t *egid); + +int set_coredump_filter(uint64_t value); +void disable_coredumps(void); diff --git a/src/shared/cpu-set-util.c b/src/shared/cpu-set-util.c new file mode 100644 index 0000000..d096576 --- /dev/null +++ b/src/shared/cpu-set-util.c @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later 
*/ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "cpu-set-util.h" +#include "dirent-util.h" +#include "errno-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "parse-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" + +/* Formats every CPU that is set in a as a space-separated list of decimal indexes. Returns a newly + * allocated string (empty if no CPU is set), or NULL if allocation fails. */ +char* cpu_set_to_string(const CPUSet *a) { + _cleanup_free_ char *str = NULL; + size_t len = 0; + int i, r; + + for (i = 0; (size_t) i < a->allocated * 8; i++) { /* allocated counts bytes, hence * 8 bits to test */ + if (!CPU_ISSET_S(i, a->allocated, a->set)) + continue; + + if (!GREEDY_REALLOC(str, len + 1 + DECIMAL_STR_MAX(int))) + return NULL; + + r = sprintf(str + len, len > 0 ? " %d" : "%d", i); + assert_se(r > 0); + len += r; + } + + return TAKE_PTR(str) ?: strdup(""); +} + +/* Like cpu_set_to_string(), but a run of consecutive CPUs is written as "first-last". Returns a newly + * allocated string (empty if no CPU is set), or NULL if allocation fails. */ +char *cpu_set_to_range_string(const CPUSet *set) { + unsigned range_start = 0, range_end; + _cleanup_free_ char *str = NULL; + bool in_range = false; + size_t len = 0; + int r; + + for (unsigned i = 0; i < set->allocated * 8; i++) + if (CPU_ISSET_S(i, set->allocated, set->set)) { + if (in_range) + range_end++; + else { + range_start = range_end = i; + in_range = true; + } + } else if (in_range) { /* the run ended just before i: emit it */ + in_range = false; + + if (!GREEDY_REALLOC(str, len + 2 + 2 * DECIMAL_STR_MAX(unsigned))) + return NULL; + + if (range_end > range_start) + r = sprintf(str + len, len > 0 ? " %u-%u" : "%u-%u", range_start, range_end); + else + r = sprintf(str + len, len > 0 ? " %u" : "%u", range_start); + assert_se(r > 0); + len += r; + } + + if (in_range) { /* the mask ended while a run was still open: emit it too */ + if (!GREEDY_REALLOC(str, len + 2 + 2 * DECIMAL_STR_MAX(unsigned))) /* same sizing as in the loop, the values are printed with %u */ + return NULL; + + if (range_end > range_start) + r = sprintf(str + len, len > 0 ? " %u-%u" : "%u-%u", range_start, range_end); + else + r = sprintf(str + len, len > 0 ? 
" %u" : "%u", range_start); + assert_se(r > 0); + } + + return TAKE_PTR(str) ?: strdup(""); +} + +/* Makes sure cpu_set can hold at least ncpus CPUs. Never shrinks; bytes gained by growing are zeroed. + * Returns 0 on success, -ENOMEM if realloc() fails (cpu_set is left untouched then). */ +int cpu_set_realloc(CPUSet *cpu_set, unsigned ncpus) { + size_t need; + + assert(cpu_set); + + need = CPU_ALLOC_SIZE(ncpus); + if (need > cpu_set->allocated) { + cpu_set_t *t; + + t = realloc(cpu_set->set, need); + if (!t) + return -ENOMEM; + + memzero((uint8_t*) t + cpu_set->allocated, need - cpu_set->allocated); + + cpu_set->set = t; + cpu_set->allocated = need; + } + + return 0; +} + +/* Sets CPU index cpu in cpu_set, growing the mask as needed. Returns 0, -ERANGE for cpu >= 8192, or the + * error of cpu_set_realloc(). */ +int cpu_set_add(CPUSet *cpu_set, unsigned cpu) { + int r; + + if (cpu >= 8192) + /* As of kernel 5.1, CONFIG_NR_CPUS can be set to 8192 on PowerPC */ + return -ERANGE; + + r = cpu_set_realloc(cpu_set, cpu + 1); + if (r < 0) + return r; + + CPU_SET_S(cpu, cpu_set->allocated, cpu_set->set); + return 0; +} + +/* Adds every CPU that is set in b to a. Returns 1 on success, a negative errno from cpu_set_add() otherwise. */ +int cpu_set_add_all(CPUSet *a, const CPUSet *b) { + int r; + + /* Do this backwards, so if we fail, we fail before changing anything. */ + for (unsigned cpu_p1 = b->allocated * 8; cpu_p1 > 0; cpu_p1--) + if (CPU_ISSET_S(cpu_p1 - 1, b->allocated, b->set)) { + r = cpu_set_add(a, cpu_p1 - 1); + if (r < 0) + return r; + } + + return 1; +} + +/* Parses rvalue, a list of CPU indexes and ranges separated by whitespace or commas, into *cpu_set. + * With warn set, failures are logged via log_syntax()/log_oom() using unit/filename/line/lvalue. */ +int parse_cpu_set_full( + const char *rvalue, + CPUSet *cpu_set, + bool warn, + const char *unit, + const char *filename, + unsigned line, + const char *lvalue) { + + _cleanup_(cpu_set_reset) CPUSet c = {}; + const char *p = ASSERT_PTR(rvalue); + + assert(cpu_set); + + for (;;) { + _cleanup_free_ char *word = NULL; + unsigned cpu_lower, cpu_upper; + int r; + + r = extract_first_word(&p, &word, WHITESPACE ",", EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return warn ? log_oom() : -ENOMEM; + if (r < 0) + return warn ? log_syntax(unit, LOG_ERR, filename, line, r, "Invalid value for %s: %s", lvalue, rvalue) : r; + if (r == 0) + break; + + r = parse_range(word, &cpu_lower, &cpu_upper); + if (r < 0) + return warn ? 
log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse CPU affinity '%s'", word) : r; + + if (cpu_lower > cpu_upper) { + if (warn) + log_syntax(unit, LOG_WARNING, filename, line, 0, "Range '%s' is invalid, %u > %u, ignoring.", + word, cpu_lower, cpu_upper); + + /* Make sure something is allocated, to distinguish this from the empty case */ + r = cpu_set_realloc(&c, 1); + if (r < 0) /* cpu_set_realloc() only fails with -ENOMEM: log it like the other OOM path of this function */ + return warn ? log_oom() : r; + } + + for (unsigned cpu_p1 = MIN(cpu_upper, UINT_MAX-1) + 1; cpu_p1 > cpu_lower; cpu_p1--) { /* highest CPU first; cpu_p1 is the CPU index plus one */ + r = cpu_set_add(&c, cpu_p1 - 1); + if (r < 0) + return warn ? log_syntax(unit, LOG_ERR, filename, line, r, + "Cannot add CPU %u to set: %m", cpu_p1 - 1) : r; + } + } + + *cpu_set = TAKE_STRUCT(c); + + return 0; +} + +/* Parses rvalue like parse_cpu_set_full() and merges the result into *old. An empty assignment resets + * *old and returns 0. Otherwise the parsed CPUs replace an unset *old or are added to an existing one, + * and 1 is returned. Returns a negative errno on failure. */ +int parse_cpu_set_extend( + const char *rvalue, + CPUSet *old, + bool warn, + const char *unit, + const char *filename, + unsigned line, + const char *lvalue) { + + _cleanup_(cpu_set_reset) CPUSet cpuset = {}; + int r; + + assert(old); + + r = parse_cpu_set_full(rvalue, &cpuset, warn, unit, filename, line, lvalue); /* honour the caller's warn flag instead of always logging */ + if (r < 0) + return r; + + if (!cpuset.set) { + /* An empty assignment resets the CPU list */ + cpu_set_reset(old); + return 0; + } + + if (!old->set) { + *old = TAKE_STRUCT(cpuset); + return 1; + } + + return cpu_set_add_all(old, &cpuset); +} + +/* Counts the CPUs reported by sched_getaffinity(0, ...). Starts with room for 16 CPUs and doubles the + * buffer for as long as the call fails with EINVAL. Returns the count (> 0) or a negative errno. */ +int cpus_in_affinity_mask(void) { + size_t n = 16; + int r; + + for (;;) { + cpu_set_t *c; + + c = CPU_ALLOC(n); + if (!c) + return -ENOMEM; + + if (sched_getaffinity(0, CPU_ALLOC_SIZE(n), c) >= 0) { + int k; + + k = CPU_COUNT_S(CPU_ALLOC_SIZE(n), c); + CPU_FREE(c); + + if (k <= 0) + return -EINVAL; + + return k; + } + + r = -errno; /* save errno before CPU_FREE() can clobber it */ + CPU_FREE(c); + + if (r != -EINVAL) + return r; + if (n > SIZE_MAX/2) /* doubling would overflow */ + return -ENOMEM; + n *= 2; + } +} + +int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated) { + uint8_t *out; + + assert(set); + assert(ret); + + out = new0(uint8_t, set->allocated); + if (!out) + return -ENOMEM; + + for (unsigned cpu = 0; cpu < set->allocated * 8; cpu++) + if 
(CPU_ISSET_S(cpu, set->allocated, set->set)) + out[cpu / 8] |= 1u << (cpu % 8); /* CPU n becomes bit n % 8 of byte n / 8 */ + + *ret = out; + *allocated = set->allocated; /* the array has exactly as many bytes as the mask */ + return 0; +} + +/* Inverse of cpu_set_to_dbus(): builds a CPUSet from size bytes in bits, where bit n % 8 of byte n / 8 + * stands for CPU n. Returns 0, or a negative errno from cpu_set_add() (e.g. -ERANGE, -ENOMEM). */ +int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set) { + _cleanup_(cpu_set_reset) CPUSet s = {}; + int r; + + assert(bits); + assert(set); + + for (unsigned cpu = size * 8; cpu > 0; cpu--) /* highest bit first; cpu is the CPU index plus one */ + if (bits[(cpu - 1) / 8] & (1u << ((cpu - 1) % 8))) { + r = cpu_set_add(&s, cpu - 1); + if (r < 0) + return r; + } + + *set = TAKE_STRUCT(s); + return 0; +} diff --git a/src/shared/cpu-set-util.h b/src/shared/cpu-set-util.h new file mode 100644 index 0000000..3c63a58 --- /dev/null +++ b/src/shared/cpu-set-util.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" +#include "missing_syscall.h" + +/* This wraps the libc interface with a variable to keep the allocated size. */ +typedef struct CPUSet { + cpu_set_t *set; + size_t allocated; /* in bytes */ +} CPUSet; + +/* Frees the mask, if there is one, and returns *a to the all-zero state. */ +static inline void cpu_set_reset(CPUSet *a) { + assert((a->allocated > 0) == !!a->set); /* set and allocated must agree: both unset or both set */ + if (a->set) + CPU_FREE(a->set); + *a = (CPUSet) {}; +} + +int cpu_set_add_all(CPUSet *a, const CPUSet *b); +int cpu_set_add(CPUSet *a, unsigned cpu); + +char* cpu_set_to_string(const CPUSet *a); +char *cpu_set_to_range_string(const CPUSet *a); +int cpu_set_realloc(CPUSet *cpu_set, unsigned ncpus); + +int parse_cpu_set_full( + const char *rvalue, + CPUSet *cpu_set, + bool warn, + const char *unit, + const char *filename, unsigned line, + const char *lvalue); +int parse_cpu_set_extend( + const char *rvalue, + CPUSet *old, + bool warn, + const char *unit, + const char *filename, + unsigned line, + const char *lvalue); + +/* Quiet variant of parse_cpu_set_full(): warn is false and no unit/file context is passed. */ +static inline int parse_cpu_set(const char *rvalue, CPUSet *cpu_set){ + return parse_cpu_set_full(rvalue, cpu_set, false, NULL, NULL, 0, NULL); +} + +int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated); +int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set); + 
+int cpus_in_affinity_mask(void); diff --git a/src/shared/creds-util.c b/src/shared/creds-util.c new file mode 100644 index 0000000..7cc8889 --- /dev/null +++ b/src/shared/creds-util.c @@ -0,0 +1,1395 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#if HAVE_OPENSSL +#include +#endif + +#include "sd-id128.h" + +#include "blockdev-util.h" +#include "capability-util.h" +#include "chattr-util.h" +#include "constants.h" +#include "creds-util.h" +#include "efi-api.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "io-util.h" +#include "memory-util.h" +#include "mkdir.h" +#include "openssl-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "random-util.h" +#include "sparse-endian.h" +#include "stat-util.h" +#include "tpm2-util.h" +#include "virt.h" + +#define PUBLIC_KEY_MAX (UINT32_C(1024) * UINT32_C(1024)) + +bool credential_name_valid(const char *s) { + /* We want that credential names are both valid in filenames (since that's our primary way to pass + * them around) and as fdnames (which is how we might want to pass them around eventually) */ + return filename_is_valid(s) && fdname_is_valid(s); +} + +bool credential_glob_valid(const char *s) { + const char *e, *a; + size_t n; + + /* Checks if a credential glob expression is valid. Note that this is more restrictive than + * fnmatch()! We only allow trailing asterisk matches for now (simply because we want some freedom + * with automatically extending the pattern in a systematic way to cover for unit instances getting + * per-instance credentials or similar). Moreover, credential globbing expressions are also more + * restrictive than credential names: we don't allow *, ?, [, ] in them (except for the asterisk + * match at the end of the string), simply to not allow ambiguity. After all, we want the flexibility + * to one day add full globbing should the need arise. 
*/ + + if (isempty(s)) + return false; + + /* Find first glob (or NUL byte) */ + n = strcspn(s, "*?[]"); + e = s + n; + + /* For now, only allow asterisk wildcards, and only at the end of the string. If it's anything else, refuse. */ + if (isempty(e)) /* no glob character at all: validate as a plain credential name */ + return credential_name_valid(s); + + if (!streq(e, "*")) /* only allow trailing "*", no other globs */ + return false; + + if (n == 0) /* Explicitly allow the complete wildcard. */ + return true; + + if (n > NAME_MAX + strlen(e)) /* before we make a copy on the stack, let's check this is not overly large */ + return false; + + /* Make a copy of the string without the '*' suffix */ + a = strndupa_safe(s, n); + + return credential_name_valid(a); +} + +/* Reads a directory path from environment variable envvar via secure_getenv(). Returns -ENXIO if it is + * unset and -EINVAL if it is not an absolute, normalized path. On success *ret points at the + * environment string itself, so the caller must not free it. */ +static int get_credentials_dir_internal(const char *envvar, const char **ret) { + const char *e; + + assert(ret); + + e = secure_getenv(envvar); + if (!e) + return -ENXIO; + + if (!path_is_absolute(e) || !path_is_normalized(e)) + return -EINVAL; + + *ret = e; + return 0; +} + +int get_credentials_dir(const char **ret) { + return get_credentials_dir_internal("CREDENTIALS_DIRECTORY", ret); +} + +int get_encrypted_credentials_dir(const char **ret) { + return get_credentials_dir_internal("ENCRYPTED_CREDENTIALS_DIRECTORY", ret); +} + +/* Reads the file called name from the directory returned by get_credentials_dir() into *ret (and its + * size into *ret_size). Returns -EINVAL for an invalid name, the get_credentials_dir() error if no + * directory is configured, -ENOMEM, or whatever read_full_file_full() returns. Does not log. */ +int read_credential(const char *name, void **ret, size_t *ret_size) { + _cleanup_free_ char *fn = NULL; + const char *d; + int r; + + assert(ret); + + if (!credential_name_valid(name)) + return -EINVAL; + + r = get_credentials_dir(&d); + if (r < 0) + return r; + + fn = path_join(d, name); + if (!fn) + return -ENOMEM; + + return read_full_file_full( + AT_FDCWD, fn, + UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE, + NULL, + (char**) ret, ret_size); +} + +int read_credential_with_decryption(const char *name, void **ret, size_t *ret_size) { + _cleanup_(erase_and_freep) void *data = NULL; + _cleanup_free_ char *fn = NULL; + size_t sz = 0; + const char *d; + int r; + + assert(ret); + + /* Just like read_credential() but will also look for encrypted 
credentials. Note that services only + * receive decrypted credentials, hence use read_credential() for those. This helper here is for + * generators, i.e. code that runs outside of service context, and thus has no decrypted credentials + * yet. + * + * Note that read_credential_with_decryption() logs on its own, while read_credential() does not! + * (It's a lot more complex and error prone given its TPM2 connectivity, and is generally called from + * generators only where logging is OK). + * + * Error handling is also a bit different: if we can't find a credential we'll return 0 and NULL + * pointers/zero size, rather than -ENXIO/-ENOENT. Returns 1 if the credential was found. */ + + if (!credential_name_valid(name)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid credential name: %s", name); + + r = read_credential(name, ret, ret_size); /* first try the plain (already decrypted) credential */ + if (r >= 0) + return 1; /* found */ + if (!IN_SET(r, -ENXIO, -ENOENT)) + return log_error_errno(r, "Failed to read unencrypted credential '%s': %m", name); + + r = get_encrypted_credentials_dir(&d); + if (r == -ENXIO) /* no encrypted credentials directory configured */ + goto not_found; + if (r < 0) + return log_error_errno(r, "Failed to determine encrypted credentials directory: %m"); + + fn = path_join(d, name); + if (!fn) + return log_oom(); + + r = read_full_file_full( + AT_FDCWD, fn, + UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE, + NULL, + (char**) &data, &sz); + if (r == -ENOENT) + goto not_found; + if (r < 0) + return log_error_errno(r, "Failed to read encrypted credential data: %m"); + + r = decrypt_credential_and_warn( + name, + now(CLOCK_REALTIME), + /* tpm2_device = */ NULL, + /* tpm2_signature_path = */ NULL, + data, + sz, + ret, + ret_size); + if (r < 0) /* already logged by decrypt_credential_and_warn() */ + return r; + + return 1; /* found */ + +not_found: + *ret = NULL; + + if (ret_size) + *ret_size = 0; + + return 0; /* not found */ +} + +int read_credential_strings_many_internal( + const char *first_name, char **first_value, + ...) 
{ + + _cleanup_free_ void *b = NULL; + int r, ret = 0; + + /* Reads a bunch of credentials into the specified buffers. If the specified buffers are already + * non-NULL frees them if a credential is found. Only supports string-based credentials + * (i.e. refuses embedded NUL bytes). + * + * 0 is returned when some or all credentials are missing. + * + * The argument list is (name, char **value) pairs terminated by a NULL name. The first error other + * than -ENOENT is remembered and returned after all names were tried. + */ + + if (!first_name) + return 0; + + r = read_credential(first_name, &b, NULL); + if (r == -ENXIO) /* No creds passed at all? Bail immediately. */ + return 0; + if (r < 0) { + if (r != -ENOENT) + ret = r; + } else + free_and_replace(*first_value, b); + + va_list ap; + va_start(ap, first_value); + + for (;;) { + _cleanup_free_ void *bb = NULL; + const char *name; + char **value; + + name = va_arg(ap, const char *); + if (!name) + break; + + value = va_arg(ap, char **); + if (*value) /* NOTE(review): an already-set buffer is skipped here, while *first_value above is always replaced and the comment at the top says buffers get replaced; confirm which is intended */ + continue; + + r = read_credential(name, &bb, NULL); + if (r < 0) { + if (ret >= 0 && r != -ENOENT) /* keep only the first real error */ + ret = r; + } else + free_and_replace(*value, bb); + } + + va_end(ap); + return ret; +} + +/* Reads credential name and parses it with parse_boolean(). A missing credential (-ENXIO or -ENOENT + * from read_credential()) yields 0; other errors are returned as is. */ +int read_credential_bool(const char *name) { + _cleanup_free_ void *data = NULL; + int r; + + r = read_credential(name, &data, NULL); + if (r < 0) + return IN_SET(r, -ENXIO, -ENOENT) ? 
0 : r; + + return parse_boolean(data); +} + +/* Looks up the password for username in the credentials "passwd.hashed-password.<username>" and, only + * if that one does not exist (-ENOENT), "passwd.plaintext-password.<username>". *ret_is_hashed is + * written only when a credential was read. Note that the result of the last read_credential() call is + * returned even though failures are logged as "ignoring". */ +int get_credential_user_password(const char *username, char **ret_password, bool *ret_is_hashed) { + _cleanup_(erase_and_freep) char *creds_password = NULL; + _cleanup_free_ char *cn = NULL; + int r; + + /* Try to pick up the password for this account via the credentials logic */ + cn = strjoin("passwd.hashed-password.", username); + if (!cn) + return -ENOMEM; + + r = read_credential(cn, (void**) &creds_password, NULL); + if (r == -ENOENT) { + free(cn); /* cn is reassigned right below, so the cleanup handler never sees the freed pointer */ + cn = strjoin("passwd.plaintext-password.", username); + if (!cn) + return -ENOMEM; + + r = read_credential(cn, (void**) &creds_password, NULL); + if (r < 0) + log_debug_errno(r, "Couldn't read credential '%s', ignoring: %m", cn); + else + *ret_is_hashed = false; + } else if (r < 0) + log_debug_errno(r, "Couldn't read credential '%s', ignoring: %m", cn); + else + *ret_is_hashed = true; + + *ret_password = TAKE_PTR(creds_password); /* written on every path that reaches here, success or failure */ + + return r; +} + +#if HAVE_OPENSSL + +#define CREDENTIAL_HOST_SECRET_SIZE 4096 + +static const sd_id128_t credential_app_id = + SD_ID128_MAKE(d3,ac,ec,ba,0d,ad,4c,df,b8,c9,38,15,28,93,6c,58); + +struct credential_host_secret_format { + /* The hashed machine ID of the machine this belongs to. Why? We want to ensure that each machine + * gets its own secret, even if people forget to flush out this secret file. Hence we bind it to the + * machine ID, for which there's hopefully a better chance it will be flushed out. We use a hashed + * machine ID instead of the literal one, because it's trivial to, and it might be a good idea not + * being able to directly associate a secret key file with a host. 
*/ + sd_id128_t machine_id; + + /* The actual secret key */ + uint8_t data[CREDENTIAL_HOST_SECRET_SIZE]; +} _packed_; + +/* Warns if the secret file behind fd is not on encrypted media, but only when + * CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED is set in flags. dirname/filename are used for the messages only. */ +static void warn_not_encrypted(int fd, CredentialSecretFlags flags, const char *dirname, const char *filename) { + int r; + + assert(fd >= 0); + assert(dirname); + assert(filename); + + if (!FLAGS_SET(flags, CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED)) + return; + + r = fd_is_encrypted(fd); + if (r < 0) + log_debug_errno(r, "Failed to determine if credential secret file '%s/%s' is encrypted.", + dirname, filename); + else if (r == 0) + log_warning("Credential secret file '%s/%s' is not located on encrypted media, using anyway.", + dirname, filename); +} + +/* Creates the secret file fn in directory dfd: a struct credential_host_secret_format holding machine_id + * and freshly generated random key bytes. The file is written under a temporary identity first and only + * then given its final name. On success optionally returns a copy of the key in *ret_data/*ret_size. + * The caller treats -EEXIST as "the file appeared meanwhile". */ +static int make_credential_host_secret( + int dfd, + const sd_id128_t machine_id, + CredentialSecretFlags flags, + const char *dirname, + const char *fn, + void **ret_data, + size_t *ret_size) { + + _cleanup_free_ char *t = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(dfd >= 0); + assert(fn); + + /* For non-root users creating a temporary file using the openat(2) over "." will fail later, in the + * linkat(2) step at the end. The reason is that linkat(2) requires the CAP_DAC_READ_SEARCH + * capability when it uses the AT_EMPTY_PATH flag. 
*/ + if (have_effective_cap(CAP_DAC_READ_SEARCH) > 0) { + fd = openat(dfd, ".", O_CLOEXEC|O_WRONLY|O_TMPFILE, 0400); + if (fd < 0) + log_debug_errno(errno, "Failed to create temporary credential file with O_TMPFILE, proceeding without: %m"); + } + if (fd < 0) { /* no O_TMPFILE file: fall back to a randomly named temporary file, t remembers its name */ + if (asprintf(&t, "credential.secret.%016" PRIx64, random_u64()) < 0) + return -ENOMEM; + + fd = openat(dfd, t, O_CLOEXEC|O_WRONLY|O_CREAT|O_EXCL|O_NOFOLLOW, 0400); + if (fd < 0) + return -errno; + } + + r = chattr_secret(fd, 0); + if (r < 0) + log_debug_errno(r, "Failed to set file attributes for secrets file, ignoring: %m"); + + struct credential_host_secret_format buf = { + .machine_id = machine_id, + }; + + CLEANUP_ERASE(buf); /* presumably wipes buf when it goes out of scope; see the macro's definition */ + + r = crypto_random_bytes(buf.data, sizeof(buf.data)); + if (r < 0) + goto fail; + + r = loop_write(fd, &buf, sizeof(buf)); + if (r < 0) + goto fail; + + if (fsync(fd) < 0) { + r = -errno; + goto fail; + } + + warn_not_encrypted(fd, flags, dirname, fn); + + if (t) { /* named temporary file: move it to its final name without replacing an existing file */ + r = rename_noreplace(dfd, t, dfd, fn); + if (r < 0) + goto fail; + + t = mfree(t); /* renamed into place, nothing is left for the fail path to unlink */ + } else if (linkat(fd, "", dfd, fn, AT_EMPTY_PATH) < 0) { /* O_TMPFILE file: link it into the directory */ + r = -errno; + goto fail; + } + + if (fsync(dfd) < 0) { + r = -errno; + goto fail; + } + + if (ret_data) { + void *copy; + + copy = memdup(buf.data, sizeof(buf.data)); + if (!copy) { + r = -ENOMEM; + goto fail; + } + + *ret_data = copy; + } + + if (ret_size) + *ret_size = sizeof(buf.data); + + return 0; + +fail: + if (t && unlinkat(dfd, t, 0) < 0) /* only a named temporary file needs removing */ + log_debug_errno(errno, "Failed to remove temporary credential key: %m"); + + return r; +} + +int get_credential_host_secret(CredentialSecretFlags flags, void **ret, size_t *ret_size) { + _cleanup_free_ char *_dirname = NULL, *_filename = NULL; + _cleanup_close_ int dfd = -EBADF; + sd_id128_t machine_id; + const char *dirname, *filename; + int r; + + r = sd_id128_get_machine_app_specific(credential_app_id, &machine_id); + if (r < 0) + return r; + + const char *e = secure_getenv("SYSTEMD_CREDENTIAL_SECRET"); + if (e) { + if 
(!path_is_normalized(e)) + return -EINVAL; + if (!path_is_absolute(e)) + return -EINVAL; + + r = path_extract_directory(e, &_dirname); + if (r < 0) + return r; + + r = path_extract_filename(e, &_filename); + if (r < 0) + return r; + + dirname = _dirname; + filename = _filename; + } else { + dirname = "/var/lib/systemd"; + filename = "credential.secret"; + } + + assert(dirname); + assert(filename); + + mkdir_parents(dirname, 0755); /* result not checked here; a failure surfaces via open_mkdir_at() below */ + dfd = open_mkdir_at(AT_FDCWD, dirname, O_CLOEXEC, 0755); + if (dfd < 0) + return log_debug_errno(dfd, "Failed to create or open directory '%s': %m", dirname); + + if (FLAGS_SET(flags, CREDENTIAL_SECRET_FAIL_ON_TEMPORARY_FS)) { + r = fd_is_temporary_fs(dfd); + if (r < 0) + return log_debug_errno(r, "Failed to check directory '%s': %m", dirname); + if (r > 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "Directory '%s' is on a temporary file system, refusing.", dirname); + } + + /* Each iteration opens the file, validates it and either returns its key, creates it if missing, or + * deletes a file that belongs to another machine ID and retries. */ + for (unsigned attempt = 0;; attempt++) { + _cleanup_(erase_and_freep) struct credential_host_secret_format *f = NULL; + _cleanup_close_ int fd = -EBADF; + size_t l = 0; + ssize_t n = 0; + struct stat st; + + if (attempt >= 3) /* Somebody is playing games with us */ + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "All attempts to create secret store in %s failed.", dirname); + + fd = openat(dfd, filename, O_CLOEXEC|O_RDONLY|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) { + if (errno != ENOENT || !FLAGS_SET(flags, CREDENTIAL_SECRET_GENERATE)) + return log_debug_errno(errno, + "Failed to open %s/%s: %m", dirname, filename); + + + r = make_credential_host_secret(dfd, machine_id, flags, dirname, filename, ret, ret_size); + if (r == -EEXIST) { + log_debug_errno(r, "Credential secret %s/%s appeared while we were creating it, rereading.", + dirname, filename); + continue; + } + if (r < 0) + return log_debug_errno(r, "Failed to create credential secret %s/%s: %m", + dirname, filename); + return 0; + } + + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "Failed to 
stat %s/%s: %m", dirname, filename); + + r = stat_verify_regular(&st); + if (r < 0) + return log_debug_errno(r, "%s/%s is not a regular file: %m", dirname, filename); + if (st.st_nlink == 0) /* Deleted by now, try again */ + continue; + if (st.st_nlink > 1) + /* Our deletion check won't work if hardlinked somewhere else */ + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), + "%s/%s has too many links, refusing.", + dirname, filename); + if ((st.st_mode & 07777) != 0400) + /* Don't use file if not 0400 access mode */ + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), + "%s/%s has permissive access mode, refusing.", + dirname, filename); + l = st.st_size; + if (l < offsetof(struct credential_host_secret_format, data) + 1) /* need the machine ID plus at least one key byte */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s/%s is too small, refusing.", dirname, filename); + if (l > 16*1024*1024) + return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), + "%s/%s is too big, refusing.", dirname, filename); + + f = malloc(l+1); /* one byte more than expected, so that a file that grew is noticed by the read below */ + if (!f) + return log_oom_debug(); + + n = read(fd, f, l+1); + if (n < 0) + return log_debug_errno(errno, + "Failed to read %s/%s: %m", dirname, filename); + if ((size_t) n != l) /* What? The size changed? */ + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Failed to read %s/%s: %m", dirname, filename); + + if (sd_id128_equal(machine_id, f->machine_id)) { + size_t sz; + + warn_not_encrypted(fd, flags, dirname, filename); + + sz = l - offsetof(struct credential_host_secret_format, data); + assert(sz > 0); /* guaranteed by the "too small" check above */ + + if (ret) { + void *copy; + + /* The file size comes from disk and may be anything up to the limit above, so refuse + * an oversized key gracefully rather than asserting: we must not copy past f->data bounds */ + if (sz > sizeof(f->data)) + return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), + "%s/%s is too big, refusing.", dirname, filename); + + copy = memdup(f->data, sz); + if (!copy) + return log_oom_debug(); + + *ret = copy; + } + + if (ret_size) + *ret_size = sz; + + return 0; + } + + /* Hmm, this secret is from somewhere else. Let's delete the file. Let's first acquire a lock + * to ensure we are the only ones accessing the file while we delete it. 
*/ + + if (flock(fd, LOCK_EX) < 0) + return log_debug_errno(errno, + "Failed to flock %s/%s: %m", dirname, filename); + + /* Before we delete it check that the file is still linked into the file system */ + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "Failed to stat %s/%s: %m", dirname, filename); + if (st.st_nlink == 0) /* Already deleted by now? */ + continue; + if (st.st_nlink != 1) /* Safety check, someone is playing games with us */ + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), + "%s/%s unexpectedly has too many links.", + dirname, filename); + if (unlinkat(dfd, filename, 0) < 0) + return log_debug_errno(errno, "Failed to unlink %s/%s: %m", dirname, filename); + + /* And now try again */ + } +} + +/* Construction is like this: + * + * A symmetric encryption key is derived from: + * + * 1. Either the "host" key (a key stored in /var/lib/systemd/credential.secret) + * + * 2. A key generated by letting the TPM2 calculate an HMAC hash of some nonce we pass to it, keyed + * by a key derived from its internal seed key. + * + * 3. The concatenation of the above. + * + * 4. Or a fixed "empty" key. This will not provide confidentiality or authenticity, of course, but is + * useful to encode credentials for the initrd on TPM-less systems, where we simply have no better + * concept to bind things to. Note that decryption of a key set up like this will be refused on + * systems that have a TPM and have SecureBoot enabled. + * + * The above is hashed with SHA256 which is then used as encryption key for AES256-GCM. The encrypted + * credential is a short (unencrypted) header describing which of the above keys to use, the IV to use for + * AES256-GCM and some more meta information (sizes of certain objects) that is strictly speaking redundant, + * but kinda nice to have since we can have a more generic parser. 
If the TPM2 key is used this is followed + * by another (unencrypted) header, with information about the TPM2 policy used (specifically: the PCR mask + * to bind against, and a hash of the resulting policy — the latter being redundant, but speeding up things a + * bit, since we can more quickly refuse PCR state), followed by a sealed/exported TPM2 HMAC key. This is + * then followed by the encrypted data, which begins with a metadata header (which contains validity + * timestamps as well as the credential name), followed by the actual credential payload. The file ends in + * the AES256-GCM tag. To make things simple, the AES256-GCM AAD covers the main and the TPM2 header in + * full. This means the whole file is either protected by AAD, or is ciphertext, or is the tag. No + * unprotected data is included. + */ + +/* The main (unencrypted) header described above. All integers are little-endian (le32_t). */ +struct _packed_ encrypted_credential_header { + sd_id128_t id; /* tells which of the key constructions listed above was used */ + le32_t key_size; + le32_t block_size; + le32_t iv_size; + le32_t tag_size; + uint8_t iv[]; /* the IV; encrypt_credential_and_warn() reserves the cipher's IV length for it */ + /* Followed by NUL bytes until next 8 byte boundary */ +}; + +/* The TPM2 header described above; only present if a TPM2 key is used. */ +struct _packed_ tpm2_credential_header { + le64_t pcr_mask; /* Note that the spec for PC Clients only mandates 24 PCRs, and that's what systems + * generally have. But keep the door open for more. */ + le16_t pcr_bank; /* For now, either TPM2_ALG_SHA256 or TPM2_ALG_SHA1 */ + le16_t primary_alg; /* Primary key algorithm (either TPM2_ALG_RSA or TPM2_ALG_ECC for now) */ + le32_t blob_size; + le32_t policy_hash_size; + uint8_t policy_hash_and_blob[]; /* presumably the policy hash and the sealed blob, sized by the two fields above; the order is not visible here (TODO confirm) */ + /* Followed by NUL bytes until next 8 byte boundary */ +}; + +struct _packed_ tpm2_public_key_credential_header { + le64_t pcr_mask; /* PCRs used for the public key PCR policy (usually just PCR 11, i.e. 
the unified kernel) */ + le32_t size; /* Size of DER public key */ + uint8_t data[]; /* DER public key */ + /* Followed by NUL bytes until next 8 byte boundary */ +}; + +struct _packed_ metadata_credential_header { + le64_t timestamp; + le64_t not_after; + le32_t name_size; + char name[]; + /* Followed by NUL bytes until next 8 byte boundary */ +}; + +/* Some generic limit for parts of the encrypted credential for which we don't know the right size ahead of + * time, but where we are really sure it won't be larger than this. Should be larger than any possible IV, + * padding, tag size and so on. This is purely used for early filtering out of invalid sizes. */ +#define CREDENTIAL_FIELD_SIZE_MAX (16U*1024U) + +/* Hashes host_key followed by tpm2_key with SHA256 into ret. Either key may be absent (NULL with size 0), + * in which case it contributes nothing. Returns 0 on success, or a negative errno after logging. */ +static int sha256_hash_host_and_tpm2_key( + const void *host_key, + size_t host_key_size, + const void *tpm2_key, + size_t tpm2_key_size, + uint8_t ret[static SHA256_DIGEST_LENGTH]) { + + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *md = NULL; + unsigned l; + + assert(host_key_size == 0 || host_key); + assert(tpm2_key_size == 0 || tpm2_key); + assert(ret); + + /* Combines the host key and the TPM2 HMAC hash into a SHA256 hash value we'll use as symmetric encryption key. 
*/ + + md = EVP_MD_CTX_new(); + if (!md) + return log_oom(); + + if (EVP_DigestInit_ex(md, EVP_sha256(), NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize SHA256 context."); + + if (host_key && EVP_DigestUpdate(md, host_key, host_key_size) != 1) /* the host key goes in first... */ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to hash host key."); + + if (tpm2_key && EVP_DigestUpdate(md, tpm2_key, tpm2_key_size) != 1) /* ...then the TPM2 key */ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to hash TPM2 key."); + + assert(EVP_MD_CTX_size(md) == SHA256_DIGEST_LENGTH); /* ret has room for exactly this many bytes */ + + if (EVP_DigestFinal_ex(md, ret, &l) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize SHA256 hash."); + + assert(l == SHA256_DIGEST_LENGTH); + return 0; +} + +int encrypt_credential_and_warn( + sd_id128_t with_key, + const char *name, + usec_t timestamp, + usec_t not_after, + const char *tpm2_device, + uint32_t tpm2_hash_pcr_mask, + const char *tpm2_pubkey_path, + uint32_t tpm2_pubkey_pcr_mask, + const void *input, + size_t input_size, + void **ret, + size_t *ret_size) { + + _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL; + _cleanup_(erase_and_freep) void *host_key = NULL, *tpm2_key = NULL; + size_t host_key_size = 0, tpm2_key_size = 0, tpm2_blob_size = 0, tpm2_policy_hash_size = 0, output_size, p, ml; + _cleanup_free_ void *tpm2_blob = NULL, *tpm2_policy_hash = NULL, *iv = NULL, *output = NULL; + _cleanup_free_ struct metadata_credential_header *m = NULL; + uint16_t tpm2_pcr_bank = 0, tpm2_primary_alg = 0; + struct encrypted_credential_header *h; + int ksz, bsz, ivsz, tsz, added, r; + _cleanup_free_ void *pubkey = NULL; + size_t pubkey_size = 0; + uint8_t md[SHA256_DIGEST_LENGTH]; + const EVP_CIPHER *cc; + sd_id128_t id; + + assert(input || input_size == 0); + assert(ret); + assert(ret_size); + + if (!sd_id128_in_set(with_key, + _CRED_AUTO, + _CRED_AUTO_INITRD, + CRED_AES256_GCM_BY_HOST, + CRED_AES256_GCM_BY_TPM2_HMAC, + CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK, + 
CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC, + CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK, + CRED_AES256_GCM_BY_TPM2_ABSENT)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid key type: " SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(with_key)); + + if (name && !credential_name_valid(name)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid credential name: %s", name); + + if (not_after != USEC_INFINITY && timestamp != USEC_INFINITY && not_after < timestamp) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential is invalidated before it is valid (" USEC_FMT " < " USEC_FMT ").", not_after, timestamp); + + if (DEBUG_LOGGING) { + char buf[FORMAT_TIMESTAMP_MAX]; + + if (name) + log_debug("Including credential name '%s' in encrypted credential.", name); + if (timestamp != USEC_INFINITY) + log_debug("Including timestamp '%s' in encrypted credential.", format_timestamp(buf, sizeof(buf), timestamp)); + if (not_after != USEC_INFINITY) + log_debug("Including not-after timestamp '%s' in encrypted credential.", format_timestamp(buf, sizeof(buf), not_after)); + } + + if (sd_id128_in_set(with_key, + _CRED_AUTO, + CRED_AES256_GCM_BY_HOST, + CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC, + CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK)) { + + r = get_credential_host_secret( + CREDENTIAL_SECRET_GENERATE| + CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED| + (sd_id128_equal(with_key, _CRED_AUTO) ? CREDENTIAL_SECRET_FAIL_ON_TEMPORARY_FS : 0), + &host_key, + &host_key_size); + if (r == -ENOMEDIUM && sd_id128_equal(with_key, _CRED_AUTO)) + log_debug_errno(r, "Credential host secret location on temporary file system, not using."); + else if (r < 0) + return log_error_errno(r, "Failed to determine local credential host secret: %m"); + } + +#if HAVE_TPM2 + bool try_tpm2; + if (sd_id128_in_set(with_key, _CRED_AUTO, _CRED_AUTO_INITRD)) { + /* If automatic mode is selected lets see if a TPM2 it is present. 
If we are running in a + * container tpm2_support will detect this, and will return a different flag combination of + * TPM2_SUPPORT_FULL, effectively skipping the use of TPM2 when inside one. */ + + try_tpm2 = tpm2_support() == TPM2_SUPPORT_FULL; + if (!try_tpm2) + log_debug("System lacks TPM2 support or running in a container, not attempting to use TPM2."); + } else + try_tpm2 = sd_id128_in_set(with_key, + CRED_AES256_GCM_BY_TPM2_HMAC, + CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK, + CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC, + CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK); + + if (try_tpm2) { + if (sd_id128_in_set(with_key, + _CRED_AUTO, + _CRED_AUTO_INITRD, + CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK, + CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK)) { + + /* Load public key for PCR policies, if one is specified, or explicitly requested */ + + r = tpm2_load_pcr_public_key(tpm2_pubkey_path, &pubkey, &pubkey_size); + if (r < 0) { + if (tpm2_pubkey_path || r != -ENOENT || !sd_id128_in_set(with_key, _CRED_AUTO, _CRED_AUTO_INITRD)) + return log_error_errno(r, "Failed read TPM PCR public key: %m"); + + log_debug_errno(r, "Failed to read TPM2 PCR public key, proceeding without: %m"); + } + } + + if (!pubkey) + tpm2_pubkey_pcr_mask = 0; + + _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL; + r = tpm2_context_new(tpm2_device, &tpm2_context); + if (r < 0) + return log_error_errno(r, "Failed to create TPM2 context: %m"); + + r = tpm2_get_best_pcr_bank(tpm2_context, tpm2_hash_pcr_mask | tpm2_pubkey_pcr_mask, &tpm2_pcr_bank); + if (r < 0) + return log_error_errno(r, "Could not find best pcr bank: %m"); + + TPML_PCR_SELECTION tpm2_hash_pcr_selection; + tpm2_tpml_pcr_selection_from_mask(tpm2_hash_pcr_mask, tpm2_pcr_bank, &tpm2_hash_pcr_selection); + + _cleanup_free_ Tpm2PCRValue *tpm2_hash_pcr_values = NULL; + size_t tpm2_n_hash_pcr_values; + r = tpm2_pcr_read(tpm2_context, &tpm2_hash_pcr_selection, &tpm2_hash_pcr_values, &tpm2_n_hash_pcr_values); + if (r < 0) + return 
log_error_errno(r, "Could not read PCR values: %m"); + + TPM2B_PUBLIC public; + if (pubkey) { + r = tpm2_tpm2b_public_from_pem(pubkey, pubkey_size, &public); + if (r < 0) + return log_error_errno(r, "Could not convert public key to TPM2B_PUBLIC: %m"); + } + + TPM2B_DIGEST tpm2_policy = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + r = tpm2_calculate_sealing_policy( + tpm2_hash_pcr_values, + tpm2_n_hash_pcr_values, + pubkey ? &public : NULL, + /* use_pin= */ false, + /* pcrlock_policy= */ NULL, + &tpm2_policy); + if (r < 0) + return log_error_errno(r, "Could not calculate sealing policy digest: %m"); + + r = tpm2_seal(tpm2_context, + /* seal_key_handle= */ 0, + &tpm2_policy, + /* pin= */ NULL, + &tpm2_key, &tpm2_key_size, + &tpm2_blob, &tpm2_blob_size, + &tpm2_primary_alg, + /* ret_srk_buf= */ NULL, + /* ret_srk_buf_size= */ NULL); + if (r < 0) { + if (sd_id128_equal(with_key, _CRED_AUTO_INITRD)) + log_warning("TPM2 present and used, but we didn't manage to talk to it. Credential will be refused if SecureBoot is enabled."); + else if (!sd_id128_equal(with_key, _CRED_AUTO)) + return log_error_errno(r, "Failed to seal to TPM2: %m"); + + log_notice_errno(r, "TPM2 sealing didn't work, continuing without TPM2: %m"); + } + + tpm2_policy_hash_size = tpm2_policy.size; + tpm2_policy_hash = malloc(tpm2_policy_hash_size); + if (!tpm2_policy_hash) + return log_oom(); + memcpy(tpm2_policy_hash, tpm2_policy.buffer, tpm2_policy_hash_size); + + assert(tpm2_blob_size <= CREDENTIAL_FIELD_SIZE_MAX); + assert(tpm2_policy_hash_size <= CREDENTIAL_FIELD_SIZE_MAX); + } +#endif + + if (sd_id128_in_set(with_key, _CRED_AUTO, _CRED_AUTO_INITRD)) { + /* Let's settle the key type in auto mode now. */ + + if (host_key && tpm2_key) + id = pubkey ? CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK : CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC; + else if (tpm2_key) + id = pubkey ? 
CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK : CRED_AES256_GCM_BY_TPM2_HMAC; + else if (host_key) + id = CRED_AES256_GCM_BY_HOST; + else if (sd_id128_equal(with_key, _CRED_AUTO_INITRD)) + id = CRED_AES256_GCM_BY_TPM2_ABSENT; + else + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "TPM2 not available and host key located on temporary file system, no encryption key available."); + } else + id = with_key; + + if (sd_id128_equal(id, CRED_AES256_GCM_BY_TPM2_ABSENT)) + log_warning("Using a null key for encryption and signing. Confidentiality or authenticity will not be provided."); + + /* Let's now take the host key and the TPM2 key and hash it together, to use as encryption key for the data */ + r = sha256_hash_host_and_tpm2_key(host_key, host_key_size, tpm2_key, tpm2_key_size, md); + if (r < 0) + return r; + + assert_se(cc = EVP_aes_256_gcm()); + + ksz = EVP_CIPHER_key_length(cc); + assert(ksz == sizeof(md)); + + bsz = EVP_CIPHER_block_size(cc); + assert(bsz > 0); + assert((size_t) bsz <= CREDENTIAL_FIELD_SIZE_MAX); + + ivsz = EVP_CIPHER_iv_length(cc); + if (ivsz > 0) { + assert((size_t) ivsz <= CREDENTIAL_FIELD_SIZE_MAX); + + iv = malloc(ivsz); + if (!iv) + return log_oom(); + + r = crypto_random_bytes(iv, ivsz); + if (r < 0) + return log_error_errno(r, "Failed to acquired randomized IV: %m"); + } + + tsz = 16; /* FIXME: On OpenSSL 3 there is EVP_CIPHER_CTX_get_tag_length(), until then let's hardcode this */ + + context = EVP_CIPHER_CTX_new(); + if (!context) + return log_error_errno(SYNTHETIC_ERRNO(ENOMEM), "Failed to allocate encryption object: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (EVP_EncryptInit_ex(context, cc, NULL, md, iv) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize encryption context: %s", + ERR_error_string(ERR_get_error(), NULL)); + + /* Just an upper estimate */ + output_size = + ALIGN8(offsetof(struct encrypted_credential_header, iv) + ivsz) + + ALIGN8(tpm2_key ? 
offsetof(struct tpm2_credential_header, policy_hash_and_blob) + tpm2_blob_size + tpm2_policy_hash_size : 0) + + ALIGN8(pubkey ? offsetof(struct tpm2_public_key_credential_header, data) + pubkey_size : 0) + + ALIGN8(offsetof(struct metadata_credential_header, name) + strlen_ptr(name)) + + input_size + 2U * (size_t) bsz + + tsz; + + output = malloc0(output_size); + if (!output) + return log_oom(); + + h = (struct encrypted_credential_header*) output; + h->id = id; + h->block_size = htole32(bsz); + h->key_size = htole32(ksz); + h->tag_size = htole32(tsz); + h->iv_size = htole32(ivsz); + memcpy(h->iv, iv, ivsz); + + p = ALIGN8(offsetof(struct encrypted_credential_header, iv) + ivsz); + + if (tpm2_key) { + struct tpm2_credential_header *t; + + t = (struct tpm2_credential_header*) ((uint8_t*) output + p); + t->pcr_mask = htole64(tpm2_hash_pcr_mask); + t->pcr_bank = htole16(tpm2_pcr_bank); + t->primary_alg = htole16(tpm2_primary_alg); + t->blob_size = htole32(tpm2_blob_size); + t->policy_hash_size = htole32(tpm2_policy_hash_size); + memcpy(t->policy_hash_and_blob, tpm2_blob, tpm2_blob_size); + memcpy(t->policy_hash_and_blob + tpm2_blob_size, tpm2_policy_hash, tpm2_policy_hash_size); + + p += ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + tpm2_blob_size + tpm2_policy_hash_size); + } + + if (pubkey) { + struct tpm2_public_key_credential_header *z; + + z = (struct tpm2_public_key_credential_header*) ((uint8_t*) output + p); + z->pcr_mask = htole64(tpm2_pubkey_pcr_mask); + z->size = htole32(pubkey_size); + memcpy(z->data, pubkey, pubkey_size); + + p += ALIGN8(offsetof(struct tpm2_public_key_credential_header, data) + pubkey_size); + } + + /* Pass the encrypted + TPM2 header as AAD */ + if (EVP_EncryptUpdate(context, NULL, &added, output, p) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to write AAD data: %s", + ERR_error_string(ERR_get_error(), NULL)); + + /* Now construct the metadata header */ + ml = strlen_ptr(name); + m = 
malloc0(ALIGN8(offsetof(struct metadata_credential_header, name) + ml)); + if (!m) + return log_oom(); + + m->timestamp = htole64(timestamp); + m->not_after = htole64(not_after); + m->name_size = htole32(ml); + memcpy_safe(m->name, name, ml); + + /* And encrypt the metadata header */ + if (EVP_EncryptUpdate(context, (uint8_t*) output + p, &added, (const unsigned char*) m, ALIGN8(offsetof(struct metadata_credential_header, name) + ml)) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to encrypt metadata header: %s", + ERR_error_string(ERR_get_error(), NULL)); + + assert(added >= 0); + assert((size_t) added <= output_size - p); + p += added; + + /* Then encrypt the plaintext */ + if (EVP_EncryptUpdate(context, (uint8_t*) output + p, &added, input, input_size) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to encrypt data: %s", + ERR_error_string(ERR_get_error(), NULL)); + + assert(added >= 0); + assert((size_t) added <= output_size - p); + p += added; + + /* Finalize */ + if (EVP_EncryptFinal_ex(context, (uint8_t*) output + p, &added) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize data encryption: %s", + ERR_error_string(ERR_get_error(), NULL)); + + assert(added >= 0); + assert((size_t) added <= output_size - p); + p += added; + + assert(p <= output_size - tsz); + + /* Append tag */ + if (EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_GET_TAG, tsz, (uint8_t*) output + p) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to get tag: %s", + ERR_error_string(ERR_get_error(), NULL)); + + p += tsz; + assert(p <= output_size); + + if (DEBUG_LOGGING && input_size > 0) { + size_t base64_size; + + base64_size = DIV_ROUND_UP(p * 4, 3); /* Include base64 size increase in debug output */ + assert(base64_size >= input_size); + log_debug("Input of %zu bytes grew to output of %zu bytes (+%2zu%%).", input_size, base64_size, base64_size * 100 / input_size - 100); + } + + *ret = TAKE_PTR(output); + *ret_size = p; + 
+ return 0; +} + +int decrypt_credential_and_warn( + const char *validate_name, + usec_t validate_timestamp, + const char *tpm2_device, + const char *tpm2_signature_path, + const void *input, + size_t input_size, + void **ret, + size_t *ret_size) { + + _cleanup_(erase_and_freep) void *host_key = NULL, *tpm2_key = NULL, *plaintext = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *signature_json = NULL; + _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL; + size_t host_key_size = 0, tpm2_key_size = 0, plaintext_size, p, hs; + struct encrypted_credential_header *h; + struct metadata_credential_header *m; + uint8_t md[SHA256_DIGEST_LENGTH]; + bool with_tpm2, with_host_key, is_tpm2_absent, with_tpm2_pk; + const EVP_CIPHER *cc; + int r, added; + + assert(input || input_size == 0); + assert(ret); + assert(ret_size); + + h = (struct encrypted_credential_header*) input; + + /* The ID must fit in, for the current and all future formats */ + if (input_size < sizeof(h->id)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short."); + + with_host_key = sd_id128_in_set(h->id, CRED_AES256_GCM_BY_HOST, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK); + with_tpm2_pk = sd_id128_in_set(h->id, CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK); + with_tpm2 = sd_id128_in_set(h->id, CRED_AES256_GCM_BY_TPM2_HMAC, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC) || with_tpm2_pk; + is_tpm2_absent = sd_id128_equal(h->id, CRED_AES256_GCM_BY_TPM2_ABSENT); + + if (!with_host_key && !with_tpm2 && !is_tpm2_absent) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unknown encryption format, or corrupted data: %m"); + + if (with_tpm2_pk) { + r = tpm2_load_pcr_signature(tpm2_signature_path, &signature_json); + if (r < 0) + return log_error_errno(r, "Failed to load pcr signature: %m"); + } + + if (is_tpm2_absent) { + /* So this is a credential encrypted with a zero length key. 
We support this to cover for the + * case where neither a host key not a TPM2 are available (specifically: initrd environments + * where the host key is not yet accessible and no TPM2 chip exists at all), to minimize + * different codeflow for TPM2 and non-TPM2 codepaths. Of course, credentials encoded this + * way offer no confidentiality nor authenticity. Because of that it's important we refuse to + * use them on systems that actually *do* have a TPM2 chip – if we are in SecureBoot + * mode. Otherwise an attacker could hand us credentials like this and we'd use them thinking + * they are trusted, even though they are not. */ + + if (efi_has_tpm2()) { + if (is_efi_secure_boot()) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "Credential uses fixed key for fallback use when TPM2 is absent — but TPM2 is present, and SecureBoot is enabled, refusing."); + + log_warning("Credential uses fixed key for use when TPM2 is absent, but TPM2 is present! Accepting anyway, since SecureBoot is disabled."); + } else + log_debug("Credential uses fixed key for use when TPM2 is absent, and TPM2 indeed is absent. 
Accepting."); + } + + /* Now we know the minimum header size */ + if (input_size < offsetof(struct encrypted_credential_header, iv)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short."); + + /* Verify some basic header values */ + if (le32toh(h->key_size) != sizeof(md)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected key size in header."); + if (le32toh(h->block_size) <= 0 || le32toh(h->block_size) > CREDENTIAL_FIELD_SIZE_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected block size in header."); + if (le32toh(h->iv_size) > CREDENTIAL_FIELD_SIZE_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "IV size too large."); + if (le32toh(h->tag_size) != 16) /* FIXME: On OpenSSL 3, let's verify via EVP_CIPHER_CTX_get_tag_length() */ + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected tag size in header."); + + /* Ensure we have space for the full header now (we don't know the size of the name hence this is a + * lower limit only) */ + if (input_size < + ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)) + + ALIGN8(with_tpm2 ? offsetof(struct tpm2_credential_header, policy_hash_and_blob) : 0) + + ALIGN8(with_tpm2_pk ? 
offsetof(struct tpm2_public_key_credential_header, data) : 0) + + ALIGN8(offsetof(struct metadata_credential_header, name)) + + le32toh(h->tag_size)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short."); + + p = ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)); + + if (with_tpm2) { +#if HAVE_TPM2 + struct tpm2_credential_header* t = (struct tpm2_credential_header*) ((uint8_t*) input + p); + struct tpm2_public_key_credential_header *z = NULL; + + if (!TPM2_PCR_MASK_VALID(t->pcr_mask)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 PCR mask out of range."); + if (!tpm2_hash_alg_to_string(le16toh(t->pcr_bank))) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 PCR bank invalid or not supported"); + if (!tpm2_asym_alg_to_string(le16toh(t->primary_alg))) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 primary key algorithm invalid or not supported."); + if (le32toh(t->blob_size) > CREDENTIAL_FIELD_SIZE_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected TPM2 blob size."); + if (le32toh(t->policy_hash_size) > CREDENTIAL_FIELD_SIZE_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected TPM2 policy hash size."); + + /* Ensure we have space for the full TPM2 header now (still don't know the name, and its size + * though, hence still just a lower limit test only) */ + if (input_size < + ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)) + + ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + le32toh(t->blob_size) + le32toh(t->policy_hash_size)) + + ALIGN8(with_tpm2_pk ? 
offsetof(struct tpm2_public_key_credential_header, data) : 0) + + ALIGN8(offsetof(struct metadata_credential_header, name)) + + le32toh(h->tag_size)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short."); + + p += ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + + le32toh(t->blob_size) + + le32toh(t->policy_hash_size)); + + if (with_tpm2_pk) { + z = (struct tpm2_public_key_credential_header*) ((uint8_t*) input + p); + + if (!TPM2_PCR_MASK_VALID(le64toh(z->pcr_mask)) || le64toh(z->pcr_mask) == 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 PCR mask out of range."); + if (le32toh(z->size) > PUBLIC_KEY_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected public key size."); + + if (input_size < + ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)) + + ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + le32toh(t->blob_size) + le32toh(t->policy_hash_size)) + + ALIGN8(offsetof(struct tpm2_public_key_credential_header, data) + le32toh(z->size)) + + ALIGN8(offsetof(struct metadata_credential_header, name)) + + le32toh(h->tag_size)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short."); + + p += ALIGN8(offsetof(struct tpm2_public_key_credential_header, data) + + le32toh(z->size)); + } + + _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL; + r = tpm2_context_new(tpm2_device, &tpm2_context); + if (r < 0) + return r; + + // TODO: Add the SRK data to the credential structure so it can be plumbed + // through and used to verify the TPM session. + r = tpm2_unseal(tpm2_context, + le64toh(t->pcr_mask), + le16toh(t->pcr_bank), + z ? z->data : NULL, + z ? le32toh(z->size) : 0, + z ? 
le64toh(z->pcr_mask) : 0, + signature_json, + /* pin= */ NULL, + /* pcrlock_policy= */ NULL, + le16toh(t->primary_alg), + t->policy_hash_and_blob, + le32toh(t->blob_size), + t->policy_hash_and_blob + le32toh(t->blob_size), + le32toh(t->policy_hash_size), + /* srk_buf= */ NULL, + /* srk_buf_size= */ 0, + &tpm2_key, + &tpm2_key_size); + if (r < 0) + return log_error_errno(r, "Failed to unseal secret using TPM2: %m"); +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Credential requires TPM2 support, but TPM2 support not available."); +#endif + } + + if (with_host_key) { + r = get_credential_host_secret( + 0, + &host_key, + &host_key_size); + if (r < 0) + return log_error_errno(r, "Failed to determine local credential key: %m"); + } + + if (is_tpm2_absent) + log_warning("Warning: using a null key for decryption and authentication. Confidentiality or authenticity are not provided."); + + sha256_hash_host_and_tpm2_key(host_key, host_key_size, tpm2_key, tpm2_key_size, md); + + assert_se(cc = EVP_aes_256_gcm()); + + /* Make sure cipher expectations match the header */ + if (EVP_CIPHER_key_length(cc) != (int) le32toh(h->key_size)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected key size in header."); + if (EVP_CIPHER_block_size(cc) != (int) le32toh(h->block_size)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected block size in header."); + + context = EVP_CIPHER_CTX_new(); + if (!context) + return log_error_errno(SYNTHETIC_ERRNO(ENOMEM), "Failed to allocate decryption object: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (EVP_DecryptInit_ex(context, cc, NULL, NULL, NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize decryption context: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_IVLEN, le32toh(h->iv_size), NULL) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set IV size on decryption context: %s", + 
ERR_error_string(ERR_get_error(), NULL)); + + if (EVP_DecryptInit_ex(context, NULL, NULL, md, h->iv) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set IV and key: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (EVP_DecryptUpdate(context, NULL, &added, input, p) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to write AAD data: %s", + ERR_error_string(ERR_get_error(), NULL)); + + plaintext = malloc(input_size - p - le32toh(h->tag_size)); + if (!plaintext) + return -ENOMEM; + + if (EVP_DecryptUpdate( + context, + plaintext, + &added, + (uint8_t*) input + p, + input_size - p - le32toh(h->tag_size)) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to decrypt data: %s", + ERR_error_string(ERR_get_error(), NULL)); + + assert(added >= 0); + assert((size_t) added <= input_size - p - le32toh(h->tag_size)); + plaintext_size = added; + + if (EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_TAG, le32toh(h->tag_size), (uint8_t*) input + input_size - le32toh(h->tag_size)) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set tag: %s", + ERR_error_string(ERR_get_error(), NULL)); + + if (EVP_DecryptFinal_ex(context, (uint8_t*) plaintext + plaintext_size, &added) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Decryption failed (incorrect key?): %s", + ERR_error_string(ERR_get_error(), NULL)); + + plaintext_size += added; + + if (plaintext_size < ALIGN8(offsetof(struct metadata_credential_header, name))) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Metadata header incomplete."); + + m = plaintext; + + if (le64toh(m->timestamp) != USEC_INFINITY && + le64toh(m->not_after) != USEC_INFINITY && + le64toh(m->timestamp) >= le64toh(m->not_after)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Timestamps of credential are not in order, refusing."); + + if (le32toh(m->name_size) > CREDENTIAL_NAME_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Embedded credential name too long, 
refusing."); + + hs = ALIGN8(offsetof(struct metadata_credential_header, name) + le32toh(m->name_size)); + if (plaintext_size < hs) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Metadata header incomplete."); + + if (le32toh(m->name_size) > 0) { + _cleanup_free_ char *embedded_name = NULL; + + r = make_cstring(m->name, le32toh(m->name_size), MAKE_CSTRING_REFUSE_TRAILING_NUL, &embedded_name); + if (r < 0) + return log_error_errno(r, "Unable to convert embedded credential name to C string: %m"); + + if (!credential_name_valid(embedded_name)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Embedded credential name is not valid, refusing."); + + if (validate_name && !streq(embedded_name, validate_name)) { + + r = getenv_bool_secure("SYSTEMD_CREDENTIAL_VALIDATE_NAME"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_CREDENTIAL_VALIDATE_NAME: %m"); + if (r != 0) + return log_error_errno(SYNTHETIC_ERRNO(EREMOTE), "Embedded credential name '%s' does not match filename '%s', refusing.", embedded_name, validate_name); + + log_debug("Embedded credential name '%s' does not match expected name '%s', but configured to use credential anyway.", embedded_name, validate_name); + } + } + + if (validate_timestamp != USEC_INFINITY) { + if (le64toh(m->timestamp) != USEC_INFINITY && le64toh(m->timestamp) > validate_timestamp) + log_debug("Credential timestamp is from the future, assuming clock skew."); + + if (le64toh(m->not_after) != USEC_INFINITY && le64toh(m->not_after) < validate_timestamp) { + + r = getenv_bool_secure("SYSTEMD_CREDENTIAL_VALIDATE_NOT_AFTER"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_CREDENTIAL_VALIDATE_NOT_AFTER: %m"); + if (r != 0) + return log_error_errno(SYNTHETIC_ERRNO(ESTALE), "Credential's time passed, refusing to use."); + + log_debug("Credential not-after timestamp has passed, but configured to use credential anyway."); + } + } + + if (ret) { + char *without_metadata; + + without_metadata 
= memdup((uint8_t*) plaintext + hs, plaintext_size - hs); + if (!without_metadata) + return log_oom(); + + *ret = without_metadata; + } + + if (ret_size) + *ret_size = plaintext_size - hs; + + return 0; +} + +#else + +int get_credential_host_secret(CredentialSecretFlags flags, void **ret, size_t *ret_size) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Support for encrypted credentials not available."); +} + +int encrypt_credential_and_warn(sd_id128_t with_key, const char *name, usec_t timestamp, usec_t not_after, const char *tpm2_device, uint32_t tpm2_hash_pcr_mask, const char *tpm2_pubkey_path, uint32_t tpm2_pubkey_pcr_mask, const void *input, size_t input_size, void **ret, size_t *ret_size) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Support for encrypted credentials not available."); +} + +int decrypt_credential_and_warn(const char *validate_name, usec_t validate_timestamp, const char *tpm2_device, const char *tpm2_signature_path, const void *input, size_t input_size, void **ret, size_t *ret_size) { + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Support for encrypted credentials not available."); +} + +#endif diff --git a/src/shared/creds-util.h b/src/shared/creds-util.h new file mode 100644 index 0000000..5e39a6a --- /dev/null +++ b/src/shared/creds-util.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-id128.h" + +#include "fd-util.h" +#include "time-util.h" + +#define CREDENTIAL_NAME_MAX FDNAME_MAX + +/* Put a size limit on the individual credential */ +#define CREDENTIAL_SIZE_MAX (1024U*1024U) + +/* Refuse to store more than 1M per service, after all this is unswappable memory. Note that for now we put + * this to the same limit as the per-credential limit, i.e. if the user has n > 1 credentials instead of 1 it + * won't get them more space. 
*/ +#define CREDENTIALS_TOTAL_SIZE_MAX CREDENTIAL_SIZE_MAX + +/* Put a size limit on encrypted credentials (which is the same as the unencrypted size plus a spacious 128K of extra + * space for headers, IVs, exported TPM2 key material and so on). */ +#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U*1024U) + +bool credential_name_valid(const char *s); +bool credential_glob_valid(const char *s); + +/* Where creds have been passed to the local execution context */ +int get_credentials_dir(const char **ret); +int get_encrypted_credentials_dir(const char **ret); + +/* Where creds have been passed to the system */ +#define SYSTEM_CREDENTIALS_DIRECTORY "/run/credentials/@system" +#define ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY "/run/credentials/@encrypted" + +int read_credential(const char *name, void **ret, size_t *ret_size); /* use in services! */ +int read_credential_with_decryption(const char *name, void **ret, size_t *ret_size); /* use in generators + pid1! */ + +int read_credential_strings_many_internal(const char *first_name, char **first_value, ...); + +#define read_credential_strings_many(first_name, first_value, ...) 
\ + read_credential_strings_many_internal(first_name, first_value, __VA_ARGS__, NULL) + +int read_credential_bool(const char *name); + +typedef enum CredentialSecretFlags { + CREDENTIAL_SECRET_GENERATE = 1 << 0, + CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED = 1 << 1, + CREDENTIAL_SECRET_FAIL_ON_TEMPORARY_FS = 1 << 2, +} CredentialSecretFlags; + +int get_credential_host_secret(CredentialSecretFlags flags, void **ret, size_t *ret_size); + +int get_credential_user_password(const char *username, char **ret_password, bool *ret_is_hashed); + +/* The four modes we support: keyed only by on-disk key, only by TPM2 HMAC key, and by the combination of + * both, as well as one with a fixed zero length key if TPM2 is missing (the latter of course provides no + * authenticity or confidentiality, but is still useful for integrity protection, and makes things simpler + * for us to handle). */ +#define CRED_AES256_GCM_BY_HOST SD_ID128_MAKE(5a,1c,6a,86,df,9d,40,96,b1,d5,a6,5e,08,62,f1,9a) +#define CRED_AES256_GCM_BY_TPM2_HMAC SD_ID128_MAKE(0c,7c,c0,7b,11,76,45,91,9c,4b,0b,ea,08,bc,20,fe) +#define CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK SD_ID128_MAKE(fa,f7,eb,93,41,e3,41,2c,a1,a4,36,f9,5a,29,36,2f) +#define CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC SD_ID128_MAKE(93,a8,94,09,48,74,44,90,90,ca,f2,fc,93,ca,b5,53) +#define CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK \ + SD_ID128_MAKE(af,49,50,a8,49,13,4e,b1,a7,38,46,30,4f,f3,0c,05) +#define CRED_AES256_GCM_BY_TPM2_ABSENT SD_ID128_MAKE(05,84,69,da,f6,f5,43,24,80,05,49,da,0f,8e,a2,fb) + +/* Two special IDs to pick a general automatic mode (i.e. tpm2+host if TPM2 exists, only host otherwise) or + * an initrd-specific automatic mode (i.e. tpm2 if firmware can do it, otherwise fixed zero-length key, and + * never involve host keys). These IDs will never be stored on disk, but are useful only internally while + * figuring out what precisely to write to disk. To mark that these aren't a "real" type, we'll prefix them + * with an underscore. 
*/ +#define _CRED_AUTO SD_ID128_MAKE(a2,19,cb,07,85,b2,4c,04,b1,6d,18,ca,b9,d2,ee,01) +#define _CRED_AUTO_INITRD SD_ID128_MAKE(02,dc,8e,de,3a,02,43,ab,a9,ec,54,9c,05,e6,a0,71) + +int encrypt_credential_and_warn(sd_id128_t with_key, const char *name, usec_t timestamp, usec_t not_after, const char *tpm2_device, uint32_t tpm2_hash_pcr_mask, const char *tpm2_pubkey_path, uint32_t tpm2_pubkey_pcr_mask, const void *input, size_t input_size, void **ret, size_t *ret_size); +int decrypt_credential_and_warn(const char *validate_name, usec_t validate_timestamp, const char *tpm2_device, const char *tpm2_signature_path, const void *input, size_t input_size, void **ret, size_t *ret_size); diff --git a/src/shared/cryptsetup-fido2.c b/src/shared/cryptsetup-fido2.c new file mode 100644 index 0000000..285b82a --- /dev/null +++ b/src/shared/cryptsetup-fido2.c @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "ask-password-api.h" +#include "cryptsetup-fido2.h" +#include "env-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "json.h" +#include "libfido2-util.h" +#include "parse-util.h" +#include "random-util.h" +#include "strv.h" + +int acquire_fido2_key( + const char *volume_name, + const char *friendly_name, + const char *device, + const char *rp_id, + const void *cid, + size_t cid_size, + const char *key_file, + size_t key_file_size, + uint64_t key_file_offset, + const void *key_data, + size_t key_data_size, + usec_t until, + bool headless, + Fido2EnrollFlags required, + void **ret_decrypted_key, + size_t *ret_decrypted_key_size, + AskPasswordFlags ask_password_flags) { + + _cleanup_(erase_and_freep) char *envpw = NULL; + _cleanup_strv_free_erase_ char **pins = NULL; + _cleanup_free_ void *loaded_salt = NULL; + bool device_exists = false; + const char *salt; + size_t salt_size; + int r; + + if ((required & (FIDO2ENROLL_PIN | FIDO2ENROLL_UP | FIDO2ENROLL_UV)) && headless) + return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), + "Local 
verification is required to unlock this volume, but the 'headless' parameter was set."); + + ask_password_flags |= ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_ACCEPT_CACHED; + + assert(cid); + assert(key_file || key_data); + + if (key_data) { + salt = key_data; + salt_size = key_data_size; + } else { + _cleanup_free_ char *bindname = NULL; + + /* If we read the salt via AF_UNIX, make this client recognizable */ + if (asprintf(&bindname, "@%" PRIx64"/cryptsetup-fido2/%s", random_u64(), volume_name) < 0) + return log_oom(); + + r = read_full_file_full( + AT_FDCWD, key_file, + key_file_offset == 0 ? UINT64_MAX : key_file_offset, + key_file_size == 0 ? SIZE_MAX : key_file_size, + READ_FULL_FILE_CONNECT_SOCKET, + bindname, + (char**) &loaded_salt, &salt_size); + if (r < 0) + return r; + + salt = loaded_salt; + } + + r = getenv_steal_erase("PIN", &envpw); + if (r < 0) + return log_error_errno(r, "Failed to acquire password from environment: %m"); + if (r > 0) { + pins = strv_new(envpw); + if (!pins) + return log_oom(); + } + + for (;;) { + if (!device_exists) { + /* Before we inquire for the PIN we'll need, if we never talked to the device, check + * if the device actually is plugged in. Otherwise we'll ask for the PIN already when + * the device is not plugged in, which is confusing. */ + + r = fido2_have_device(device); + if (r < 0) + return r; + if (r == 0) /* no device found, return EAGAIN so that caller will wait/watch udev */ + return -EAGAIN; + + device_exists = true; /* now we know for sure, a device exists, no need to ask again */ + } + + /* Always make an attempt before asking for PIN. + * fido2_use_hmac_hash() will perform a pre-flight check for whether the credential for + * can be found on one of the connected devices. This way, we can avoid prompting the user + * for a PIN when we are sure that no device can be used. 
*/ + r = fido2_use_hmac_hash( + device, + rp_id ?: "io.systemd.cryptsetup", + salt, salt_size, + cid, cid_size, + pins, + required, + ret_decrypted_key, + ret_decrypted_key_size); + if (!IN_SET(r, + -ENOANO, /* needs pin */ + -ENOLCK)) /* pin incorrect */ + return r; + + device_exists = true; /* that a PIN is needed/wasn't correct means that we managed to + * talk to a device */ + + if (headless) + return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "PIN querying disabled via 'headless' option. Use the '$PIN' environment variable."); + + pins = strv_free_erase(pins); + r = ask_password_auto("Please enter security token PIN:", "drive-harddisk", NULL, "fido2-pin", "cryptsetup.fido2-pin", until, ask_password_flags, &pins); + if (r < 0) + return log_error_errno(r, "Failed to ask for user password: %m"); + + ask_password_flags &= ~ASK_PASSWORD_ACCEPT_CACHED; + } +} + /* Walks all LUKS2 token slots of 'cd' (index 0 up to sym_crypt_token_max(CRYPT_LUKS2)), skips slots that cannot be read as "systemd-fido2" tokens or whose keyslot field does not parse, extracts credential ID, salt, optional relying party ID and the PIN/UP/UV requirement flags from each remaining token, and hands them to acquire_fido2_key(). The scan ends at the first token for which that call returns 0. Malformed token fields and allocation failures abort the whole scan with an error. Returns 0 on success; fails with ENXIO if no credential ID was decoded from any token; otherwise the failure of the last acquire_fido2_key() call is logged (EAGAIN only at debug level) and returned. */ +int acquire_fido2_key_auto( + struct crypt_device *cd, + const char *name, + const char *friendly_name, + const char *fido2_device, + usec_t until, + bool headless, + void **ret_decrypted_key, + size_t *ret_decrypted_key_size, + AskPasswordFlags ask_password_flags) { + + _cleanup_free_ void *cid = NULL; + size_t cid_size = 0; + int r, ret = -ENOENT; + Fido2EnrollFlags required = 0; + /* NOTE(review): 'cid' and 'required' live outside the per-token loop below, unlike 'salt' and 'rp'. unbase64mem() is handed &cid on every iteration; if it overwrites the pointer without freeing the old buffer (verify in its implementation), the credential of an earlier, unsuccessful token leaks. Likewise the *_IF_NEEDED and UV_OMIT bits OR'ed into 'required' for one token are still set when the next token is evaluated. Confirm whether both should be per-iteration state. */ + assert(cd); + assert(name); + assert(ret_decrypted_key); + assert(ret_decrypted_key_size); + + /* Loads FIDO2 metadata from LUKS2 JSON token headers.
*/ + + for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token ++) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + JsonVariant *w; + _cleanup_free_ void *salt = NULL; + _cleanup_free_ char *rp = NULL; + size_t salt_size = 0; + int ks; + + r = cryptsetup_get_token_as_json(cd, token, "systemd-fido2", &v); + if (IN_SET(r, -ENOENT, -EINVAL, -EMEDIUMTYPE)) + continue; + if (r < 0) + return log_error_errno(r, "Failed to read JSON token data off disk: %m"); + + ks = cryptsetup_get_keyslot_from_token(v); + if (ks < 0) { + /* Handle parsing errors of the keyslots field gracefully, since it's not 'owned' by + * us, but by the LUKS2 spec */ + log_warning_errno(ks, "Failed to extract keyslot index from FIDO2 JSON data token %i, skipping: %m", token); + continue; + } + + w = json_variant_by_key(v, "fido2-credential"); + if (!w || !json_variant_is_string(w)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "FIDO2 token data lacks 'fido2-credential' field."); + + r = unbase64mem(json_variant_string(w), SIZE_MAX, &cid, &cid_size); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid base64 data in 'fido2-credential' field."); + + w = json_variant_by_key(v, "fido2-salt"); + if (!w || !json_variant_is_string(w)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "FIDO2 token data lacks 'fido2-salt' field."); + + assert(!salt); + assert(salt_size == 0); + r = unbase64mem(json_variant_string(w), SIZE_MAX, &salt, &salt_size); + if (r < 0) + return log_error_errno(r, "Failed to decode base64 encoded salt."); + + w = json_variant_by_key(v, "fido2-rp"); + if (w) { + /* The "rp" field is optional.
*/ + + if (!json_variant_is_string(w)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "FIDO2 token data's 'fido2-rp' field is not a string."); + + assert(!rp); + rp = strdup(json_variant_string(w)); + if (!rp) + return log_oom(); + } + + w = json_variant_by_key(v, "fido2-clientPin-required"); + if (w) { + /* The "fido2-clientPin-required" field is optional. */ + + if (!json_variant_is_boolean(w)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "FIDO2 token data's 'fido2-clientPin-required' field is not a boolean."); + + SET_FLAG(required, FIDO2ENROLL_PIN, json_variant_boolean(w)); + } else + required |= FIDO2ENROLL_PIN_IF_NEEDED; /* compat with 248, where the field was unset */ + + w = json_variant_by_key(v, "fido2-up-required"); + if (w) { + /* The "fido2-up-required" field is optional. */ + + if (!json_variant_is_boolean(w)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "FIDO2 token data's 'fido2-up-required' field is not a boolean."); + + SET_FLAG(required, FIDO2ENROLL_UP, json_variant_boolean(w)); + } else + required |= FIDO2ENROLL_UP_IF_NEEDED; /* compat with 248 */ + + w = json_variant_by_key(v, "fido2-uv-required"); + if (w) { + /* The "fido2-uv-required" field is optional.
*/ + + if (!json_variant_is_boolean(w)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "FIDO2 token data's 'fido2-uv-required' field is not a boolean."); + + SET_FLAG(required, FIDO2ENROLL_UV, json_variant_boolean(w)); + } else + required |= FIDO2ENROLL_UV_OMIT; /* compat with 248 */ + + ret = acquire_fido2_key( + name, + friendly_name, + fido2_device, + rp, + cid, cid_size, + /* key_file= */ NULL, /* salt is read from LUKS header instead of key_file */ + /* key_file_size= */ 0, + /* key_file_offset= */ 0, + salt, salt_size, + until, + headless, + required, + ret_decrypted_key, ret_decrypted_key_size, + ask_password_flags); + if (ret == 0) + break; + } + + if (!cid) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), + "No valid FIDO2 token data found."); + + if (ret == -EAGAIN) /* fido2 device does not exist, or UV is blocked; caller will prompt for retry */ + return log_debug_errno(ret, "FIDO2 token does not exist, or UV is blocked."); + if (ret < 0) + return log_error_errno(ret, "Failed to unlock LUKS volume with FIDO2 token: %m"); + + log_info("Unlocked volume via automatically discovered security FIDO2 token."); + return ret; +} diff --git a/src/shared/cryptsetup-fido2.h b/src/shared/cryptsetup-fido2.h new file mode 100644 index 0000000..d96bb40 --- /dev/null +++ b/src/shared/cryptsetup-fido2.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "cryptsetup-util.h" +#include "libfido2-util.h" +#include "log.h" +#include "time-util.h" + +#if HAVE_LIBFIDO2 + +int acquire_fido2_key( + const char *volume_name, + const char *friendly_name, + const char *device, + const char *rp_id, + const void *cid, + size_t cid_size, + const char *key_file, + size_t key_file_size, + uint64_t key_file_offset, + const void *key_data, + size_t key_data_size, + usec_t until, + bool headless, + Fido2EnrollFlags required, + void **ret_decrypted_key, + size_t *ret_decrypted_key_size, + AskPasswordFlags ask_password_flags); + +int
acquire_fido2_key_auto( + struct crypt_device *cd, + const char *name, + const char *friendly_name, + const char *fido2_device, + usec_t until, + bool headless, + void **ret_decrypted_key, + size_t *ret_decrypted_key_size, + AskPasswordFlags ask_password_flags); + +#else + +static inline int acquire_fido2_key( + const char *volume_name, + const char *friendly_name, + const char *device, + const char *rp_id, + const void *cid, + size_t cid_size, + const char *key_file, + size_t key_file_size, + uint64_t key_file_offset, + const void *key_data, + size_t key_data_size, + usec_t until, + bool headless, + Fido2EnrollFlags required, + void **ret_decrypted_key, + size_t *ret_decrypted_key_size, + AskPasswordFlags ask_password_flags) { + + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "FIDO2 token support not available."); +} + +static inline int acquire_fido2_key_auto( + struct crypt_device *cd, + const char *name, + const char *friendly_name, + const char *fido2_device, + usec_t until, + bool headless, + void **ret_decrypted_key, + size_t *ret_decrypted_key_size, + AskPasswordFlags ask_password_flags) { + + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "FIDO2 token support not available."); +} +#endif diff --git a/src/shared/cryptsetup-util.c b/src/shared/cryptsetup-util.c new file mode 100644 index 0000000..ab5764d --- /dev/null +++ b/src/shared/cryptsetup-util.c @@ -0,0 +1,349 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "cryptsetup-util.h" +#include "dlfcn-util.h" +#include "log.h" +#include "parse-util.h" + +#if HAVE_LIBCRYPTSETUP +static void *cryptsetup_dl = NULL; + +int (*sym_crypt_activate_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size, uint32_t flags); +#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY +int (*sym_crypt_activate_by_signed_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, const char 
*signature, size_t signature_size, uint32_t flags); +#endif +int (*sym_crypt_activate_by_volume_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, uint32_t flags); +int (*sym_crypt_deactivate_by_name)(struct crypt_device *cd, const char *name, uint32_t flags); +int (*sym_crypt_format)(struct crypt_device *cd, const char *type, const char *cipher, const char *cipher_mode, const char *uuid, const char *volume_key, size_t volume_key_size, void *params); +void (*sym_crypt_free)(struct crypt_device *cd); +const char *(*sym_crypt_get_cipher)(struct crypt_device *cd); +const char *(*sym_crypt_get_cipher_mode)(struct crypt_device *cd); +uint64_t (*sym_crypt_get_data_offset)(struct crypt_device *cd); +const char *(*sym_crypt_get_device_name)(struct crypt_device *cd); +const char *(*sym_crypt_get_dir)(void); +const char *(*sym_crypt_get_type)(struct crypt_device *cd); +const char *(*sym_crypt_get_uuid)(struct crypt_device *cd); +int (*sym_crypt_get_verity_info)(struct crypt_device *cd, struct crypt_params_verity *vp); +int (*sym_crypt_get_volume_key_size)(struct crypt_device *cd); +int (*sym_crypt_init)(struct crypt_device **cd, const char *device); +int (*sym_crypt_init_by_name)(struct crypt_device **cd, const char *name); +int (*sym_crypt_keyslot_add_by_volume_key)(struct crypt_device *cd, int keyslot, const char *volume_key, size_t volume_key_size, const char *passphrase, size_t passphrase_size); +int (*sym_crypt_keyslot_destroy)(struct crypt_device *cd, int keyslot); +int (*sym_crypt_keyslot_max)(const char *type); +int (*sym_crypt_load)(struct crypt_device *cd, const char *requested_type, void *params); +int (*sym_crypt_resize)(struct crypt_device *cd, const char *name, uint64_t new_size); +int (*sym_crypt_resume_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size); +int (*sym_crypt_set_data_device)(struct crypt_device *cd, const char *device); +void 
(*sym_crypt_set_debug_level)(int level); +void (*sym_crypt_set_log_callback)(struct crypt_device *cd, void (*log)(int level, const char *msg, void *usrptr), void *usrptr); +#if HAVE_CRYPT_SET_METADATA_SIZE +int (*sym_crypt_set_metadata_size)(struct crypt_device *cd, uint64_t metadata_size, uint64_t keyslots_size); +#endif +int (*sym_crypt_set_pbkdf_type)(struct crypt_device *cd, const struct crypt_pbkdf_type *pbkdf); +int (*sym_crypt_suspend)(struct crypt_device *cd, const char *name); +int (*sym_crypt_token_json_get)(struct crypt_device *cd, int token, const char **json); +int (*sym_crypt_token_json_set)(struct crypt_device *cd, int token, const char *json); +#if HAVE_CRYPT_TOKEN_MAX +int (*sym_crypt_token_max)(const char *type); +#endif +crypt_token_info (*sym_crypt_token_status)(struct crypt_device *cd, int token, const char **type); +int (*sym_crypt_volume_key_get)(struct crypt_device *cd, int keyslot, char *volume_key, size_t *volume_key_size, const char *passphrase, size_t passphrase_size); +#if HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE +int (*sym_crypt_reencrypt_init_by_passphrase)(struct crypt_device *cd, const char *name, const char *passphrase, size_t passphrase_size, int keyslot_old, int keyslot_new, const char *cipher, const char *cipher_mode, const struct crypt_params_reencrypt *params); +#endif +#if HAVE_CRYPT_REENCRYPT +int (*sym_crypt_reencrypt)(struct crypt_device *cd, int (*progress)(uint64_t size, uint64_t offset, void *usrptr)); +#endif +int (*sym_crypt_metadata_locking)(struct crypt_device *cd, int enable); +#if HAVE_CRYPT_SET_DATA_OFFSET +int (*sym_crypt_set_data_offset)(struct crypt_device *cd, uint64_t data_offset); +#endif +int (*sym_crypt_header_restore)(struct crypt_device *cd, const char *requested_type, const char *backup_file); +int (*sym_crypt_volume_key_keyring)(struct crypt_device *cd, int enable); + +/* Unfortunately libcryptsetup provides neither an environment variable to redirect where to look for token + * modules, nor does it 
have an API to change the token lookup path at runtime. The maintainers suggest using + * ELF interposition instead (see https://gitlab.com/cryptsetup/cryptsetup/-/issues/846). Hence let's do + * that: let's interpose libcryptsetup's crypt_token_external_path() function with our own, that *does* + * honour an environment variable where to look for tokens. This is tremendously useful for debugging + * libcryptsetup tokens: set the environment variable to your build dir and you can easily test token modules + * without jumping through various hoops. */ + +/* Do this only on new enough compilers that actually support the "symver" attribute. Given this is a debug + * feature, let's simply not bother on older compilers */ +#if BUILD_MODE_DEVELOPER && defined(__has_attribute) && __has_attribute(symver) +const char *my_crypt_token_external_path(void); /* prototype for our own implementation */ + +/* We use the "symver" attribute to mark this implementation as the default implementation, and drop the + * SD_SHARED namespace we by default attach to our symbols via a version script. */ +__attribute__((symver("crypt_token_external_path@@"))) +_public_ const char *my_crypt_token_external_path(void) { + const char *e; + + e = secure_getenv("SYSTEMD_CRYPTSETUP_TOKEN_PATH"); + if (e) + return e; + + /* Now chain invoke the original implementation. 
*/ + if (cryptsetup_dl) { + typeof(crypt_token_external_path) *func; + func = (typeof(crypt_token_external_path)*) dlsym(cryptsetup_dl, "crypt_token_external_path"); + if (func) + return func(); + } + + return NULL; +} +#endif + +static void cryptsetup_log_glue(int level, const char *msg, void *usrptr) { + + switch (level) { + case CRYPT_LOG_NORMAL: + level = LOG_NOTICE; + break; + case CRYPT_LOG_ERROR: + level = LOG_ERR; + break; + case CRYPT_LOG_VERBOSE: + level = LOG_INFO; + break; + case CRYPT_LOG_DEBUG: + level = LOG_DEBUG; + break; + default: + log_error("Unknown libcryptsetup log level: %d", level); + level = LOG_ERR; + } + + log_full(level, "%s", msg); +} + +void cryptsetup_enable_logging(struct crypt_device *cd) { + /* It's OK to call this with a NULL parameter, in which case libcryptsetup will set the default log + * function. + * + * Note that this is also called from dlopen_cryptsetup(), which we call here too. Sounds like an + * endless loop, but isn't because we break it via the check for 'cryptsetup_dl' early in + * dlopen_cryptsetup(). */ + + if (dlopen_cryptsetup() < 0) + return; /* If this fails, let's gracefully ignore the issue, this is just debug logging after + * all, and if this failed we already generated a debug log message that should help + * to track things down. */ + + sym_crypt_set_log_callback(cd, cryptsetup_log_glue, NULL); + sym_crypt_set_debug_level(DEBUG_LOGGING ? CRYPT_DEBUG_ALL : CRYPT_DEBUG_NONE); +} + +int cryptsetup_set_minimal_pbkdf(struct crypt_device *cd) { + + /* With CRYPT_PBKDF_NO_BENCHMARK flag set .time_ms member is ignored + * while .iterations must be set at least to recommended minimum value. */ + + static const struct crypt_pbkdf_type minimal_pbkdf = { + .hash = "sha512", + .type = CRYPT_KDF_PBKDF2, + .iterations = 1000, /* recommended minimum count for pbkdf2 + * according to NIST SP 800-132, ch. 
5.2 */ + .flags = CRYPT_PBKDF_NO_BENCHMARK + }; + + int r; + + /* Sets a minimal PBKDF in case we already have a high entropy key. */ + + r = dlopen_cryptsetup(); + if (r < 0) + return r; + + r = sym_crypt_set_pbkdf_type(cd, &minimal_pbkdf); + if (r < 0) + return r; + + return 0; +} + /* Fetches token 'idx' of 'cd' as JSON. 'verify_type' and 'ret' are both optional: with ret == NULL the call only checks that the token can be read and parsed and, if 'verify_type' is given, that its "type" field matches. Returns 0 on success, negative errno otherwise (see the list in the body). */ +int cryptsetup_get_token_as_json( + struct crypt_device *cd, + int idx, + const char *verify_type, + JsonVariant **ret) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + const char *text; + int r; + + assert(cd); + + /* Extracts and parses the LUKS2 JSON token data from a LUKS2 device. Optionally verifies the type of + * the token. Returns: + * + * -EINVAL → token index out of range or "type" field missing + * -ENOENT → token doesn't exist + * -EMEDIUMTYPE → "verify_type" specified and doesn't match token's type + */ + + r = dlopen_cryptsetup(); + if (r < 0) + return r; + + r = sym_crypt_token_json_get(cd, idx, &text); + if (r < 0) + return r; + + r = json_parse(text, 0, &v, NULL, NULL); + if (r < 0) + return r; + + if (verify_type) { + JsonVariant *w; + + w = json_variant_by_key(v, "type"); + if (!w) + return -EINVAL; + + if (!streq_ptr(json_variant_string(w), verify_type)) + return -EMEDIUMTYPE; + } + + if (ret) + *ret = TAKE_PTR(v); + + return 0; +} + /* Formats the JSON variant 'v' as text and passes it to crypt_token_json_set() with CRYPT_ANY_TOKEN as the token index. Returns 0 on success, negative errno on failure; formatting and write failures are logged at debug level only. */ +int cryptsetup_add_token_json(struct crypt_device *cd, JsonVariant *v) { + _cleanup_free_ char *text = NULL; + int r; + + r = dlopen_cryptsetup(); + if (r < 0) + return r; + + r = json_variant_format(v, 0, &text); + if (r < 0) + return log_debug_errno(r, "Failed to format token data for LUKS: %m"); + + log_debug("Adding token text <%s>", text); + + r = sym_crypt_token_json_set(cd, CRYPT_ANY_TOKEN, text); + if (r < 0) + return log_debug_errno(r, "Failed to write token data to LUKS: %m"); + + return 0; +} +#endif + /* Resolves the sym_crypt_*() function pointers from libcryptsetup.so.12 via dlopen_many_sym_or_warn(). A result <= 0 from that call is returned as-is (0 presumably meaning the library was already loaded; confirm in dlfcn-util); otherwise libcryptsetup's global logging is redirected through cryptsetup_enable_logging(NULL) and 1 is returned. When built without HAVE_LIBCRYPTSETUP this fails with EOPNOTSUPP. */ +int dlopen_cryptsetup(void) { +#if HAVE_LIBCRYPTSETUP + int r; + + /* libcryptsetup added crypt_reencrypt() in 2.2.0, and marked it obsolete in 2.4.0, replacing it with + * crypt_reencrypt_run(), which takes
one extra argument but is otherwise identical. The old call is + * still available though, and given we want to support 2.2.0 for a while longer, we'll stick to the + * old symbol. However, the old symbol now has a GCC deprecation decorator, hence let's turn off + * warnings about this for now. */ + + DISABLE_WARNING_DEPRECATED_DECLARATIONS; + + r = dlopen_many_sym_or_warn( + &cryptsetup_dl, "libcryptsetup.so.12", LOG_DEBUG, + DLSYM_ARG(crypt_activate_by_passphrase), +#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY + DLSYM_ARG(crypt_activate_by_signed_key), +#endif + DLSYM_ARG(crypt_activate_by_volume_key), + DLSYM_ARG(crypt_deactivate_by_name), + DLSYM_ARG(crypt_format), + DLSYM_ARG(crypt_free), + DLSYM_ARG(crypt_get_cipher), + DLSYM_ARG(crypt_get_cipher_mode), + DLSYM_ARG(crypt_get_data_offset), + DLSYM_ARG(crypt_get_device_name), + DLSYM_ARG(crypt_get_dir), + DLSYM_ARG(crypt_get_type), + DLSYM_ARG(crypt_get_uuid), + DLSYM_ARG(crypt_get_verity_info), + DLSYM_ARG(crypt_get_volume_key_size), + DLSYM_ARG(crypt_init), + DLSYM_ARG(crypt_init_by_name), + DLSYM_ARG(crypt_keyslot_add_by_volume_key), + DLSYM_ARG(crypt_keyslot_destroy), + DLSYM_ARG(crypt_keyslot_max), + DLSYM_ARG(crypt_load), + DLSYM_ARG(crypt_resize), + DLSYM_ARG(crypt_resume_by_passphrase), + DLSYM_ARG(crypt_set_data_device), + DLSYM_ARG(crypt_set_debug_level), + DLSYM_ARG(crypt_set_log_callback), +#if HAVE_CRYPT_SET_METADATA_SIZE + DLSYM_ARG(crypt_set_metadata_size), +#endif + DLSYM_ARG(crypt_set_pbkdf_type), + DLSYM_ARG(crypt_suspend), + DLSYM_ARG(crypt_token_json_get), + DLSYM_ARG(crypt_token_json_set), +#if HAVE_CRYPT_TOKEN_MAX + DLSYM_ARG(crypt_token_max), +#endif + DLSYM_ARG(crypt_token_status), + DLSYM_ARG(crypt_volume_key_get), +#if HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE + DLSYM_ARG(crypt_reencrypt_init_by_passphrase), +#endif +#if HAVE_CRYPT_REENCRYPT + DLSYM_ARG(crypt_reencrypt), +#endif + DLSYM_ARG(crypt_metadata_locking), +#if HAVE_CRYPT_SET_DATA_OFFSET + DLSYM_ARG(crypt_set_data_offset), +#endif +
DLSYM_ARG(crypt_header_restore), + DLSYM_ARG(crypt_volume_key_keyring)); + if (r <= 0) + return r; + + REENABLE_WARNING; + + /* Redirect the default logging calls of libcryptsetup to our own logging infra. (Note that + * libcryptsetup also maintains per-"struct crypt_device" log functions, which we'll also set + * whenever allocating a "struct crypt_device" context. Why set both? To be defensive: maybe some + * other code loaded into this process also changes the global log functions of libcryptsetup, who + * knows? And if so, we still want our own objects to log via our own infra, at the very least.) */ + cryptsetup_enable_logging(NULL); + return 1; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "cryptsetup support is not compiled in."); +#endif +} + +int cryptsetup_get_keyslot_from_token(JsonVariant *v) { + int keyslot, r; + JsonVariant *w; + + /* Parses the "keyslots" field of a LUKS2 token object. The field can be an array, but here we assume + * that it contains a single element only, since that's the only way we ever generate it + * ourselves. */ + + w = json_variant_by_key(v, "keyslots"); + if (!w) + return -ENOENT; + if (!json_variant_is_array(w) || json_variant_elements(w) != 1) + return -EMEDIUMTYPE; + + w = json_variant_by_index(w, 0); + if (!w) + return -ENOENT; + if (!json_variant_is_string(w)) + return -EMEDIUMTYPE; + + r = safe_atoi(json_variant_string(w), &keyslot); + if (r < 0) + return r; + if (keyslot < 0) + return -EINVAL; + + return keyslot; +} diff --git a/src/shared/cryptsetup-util.h b/src/shared/cryptsetup-util.h new file mode 100644 index 0000000..5ff439d --- /dev/null +++ b/src/shared/cryptsetup-util.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "alloc-util.h" +#include "json.h" +#include "macro.h" + +#if HAVE_LIBCRYPTSETUP +#include + +/* These next two are defined in libcryptsetup.h from cryptsetup version 2.3.4 forwards. 
*/ +#ifndef CRYPT_ACTIVATE_NO_READ_WORKQUEUE +#define CRYPT_ACTIVATE_NO_READ_WORKQUEUE (1 << 24) +#endif +#ifndef CRYPT_ACTIVATE_NO_WRITE_WORKQUEUE +#define CRYPT_ACTIVATE_NO_WRITE_WORKQUEUE (1 << 25) +#endif + +extern int (*sym_crypt_activate_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size, uint32_t flags); +#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY +extern int (*sym_crypt_activate_by_signed_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, const char *signature, size_t signature_size, uint32_t flags); +#endif +extern int (*sym_crypt_activate_by_volume_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, uint32_t flags); +extern int (*sym_crypt_deactivate_by_name)(struct crypt_device *cd, const char *name, uint32_t flags); +extern int (*sym_crypt_format)(struct crypt_device *cd, const char *type, const char *cipher, const char *cipher_mode, const char *uuid, const char *volume_key, size_t volume_key_size, void *params); +extern void (*sym_crypt_free)(struct crypt_device *cd); +extern const char *(*sym_crypt_get_cipher)(struct crypt_device *cd); +extern const char *(*sym_crypt_get_cipher_mode)(struct crypt_device *cd); +extern uint64_t (*sym_crypt_get_data_offset)(struct crypt_device *cd); +extern const char *(*sym_crypt_get_device_name)(struct crypt_device *cd); +extern const char *(*sym_crypt_get_dir)(void); +extern const char *(*sym_crypt_get_type)(struct crypt_device *cd); +extern const char *(*sym_crypt_get_uuid)(struct crypt_device *cd); +extern int (*sym_crypt_get_verity_info)(struct crypt_device *cd, struct crypt_params_verity *vp); +extern int (*sym_crypt_get_volume_key_size)(struct crypt_device *cd); +extern int (*sym_crypt_init)(struct crypt_device **cd, const char *device); +extern int (*sym_crypt_init_by_name)(struct crypt_device **cd, const char *name); +extern int 
(*sym_crypt_keyslot_add_by_volume_key)(struct crypt_device *cd, int keyslot, const char *volume_key, size_t volume_key_size, const char *passphrase, size_t passphrase_size); +extern int (*sym_crypt_keyslot_destroy)(struct crypt_device *cd, int keyslot); +extern int (*sym_crypt_keyslot_max)(const char *type); +extern int (*sym_crypt_load)(struct crypt_device *cd, const char *requested_type, void *params); +extern int (*sym_crypt_resize)(struct crypt_device *cd, const char *name, uint64_t new_size); +extern int (*sym_crypt_resume_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size); +extern int (*sym_crypt_set_data_device)(struct crypt_device *cd, const char *device); +extern void (*sym_crypt_set_debug_level)(int level); +extern void (*sym_crypt_set_log_callback)(struct crypt_device *cd, void (*log)(int level, const char *msg, void *usrptr), void *usrptr); +#if HAVE_CRYPT_SET_METADATA_SIZE +extern int (*sym_crypt_set_metadata_size)(struct crypt_device *cd, uint64_t metadata_size, uint64_t keyslots_size); +#endif +extern int (*sym_crypt_set_pbkdf_type)(struct crypt_device *cd, const struct crypt_pbkdf_type *pbkdf); +extern int (*sym_crypt_suspend)(struct crypt_device *cd, const char *name); +extern int (*sym_crypt_token_json_get)(struct crypt_device *cd, int token, const char **json); +extern int (*sym_crypt_token_json_set)(struct crypt_device *cd, int token, const char *json); +#if HAVE_CRYPT_TOKEN_MAX +extern int (*sym_crypt_token_max)(const char *type); +#else +/* As a fallback, use the same hard-coded value libcryptsetup uses internally. 
*/ +static inline int crypt_token_max(_unused_ const char *type) { + assert(streq(type, CRYPT_LUKS2)); + + return 32; +} +#define sym_crypt_token_max(type) crypt_token_max(type) +#endif +extern crypt_token_info (*sym_crypt_token_status)(struct crypt_device *cd, int token, const char **type); +extern int (*sym_crypt_volume_key_get)(struct crypt_device *cd, int keyslot, char *volume_key, size_t *volume_key_size, const char *passphrase, size_t passphrase_size); +#if HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE +extern int (*sym_crypt_reencrypt_init_by_passphrase)(struct crypt_device *cd, const char *name, const char *passphrase, size_t passphrase_size, int keyslot_old, int keyslot_new, const char *cipher, const char *cipher_mode, const struct crypt_params_reencrypt *params); +#endif +#if HAVE_CRYPT_REENCRYPT +extern int (*sym_crypt_reencrypt)(struct crypt_device *cd, int (*progress)(uint64_t size, uint64_t offset, void *usrptr)); +#endif +extern int (*sym_crypt_metadata_locking)(struct crypt_device *cd, int enable); +#if HAVE_CRYPT_SET_DATA_OFFSET +extern int (*sym_crypt_set_data_offset)(struct crypt_device *cd, uint64_t data_offset); +#endif +extern int (*sym_crypt_header_restore)(struct crypt_device *cd, const char *requested_type, const char *backup_file); +extern int (*sym_crypt_volume_key_keyring)(struct crypt_device *cd, int enable); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct crypt_device *, crypt_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct crypt_device *, sym_crypt_free, NULL); + +/* Be careful, this works with dlopen_cryptsetup(), that is, it calls sym_crypt_free() instead of crypt_free(). 
*/ +#define crypt_free_and_replace(a, b) \ + free_and_replace_full(a, b, sym_crypt_free) + +void cryptsetup_enable_logging(struct crypt_device *cd); + +int cryptsetup_set_minimal_pbkdf(struct crypt_device *cd); + +int cryptsetup_get_token_as_json(struct crypt_device *cd, int idx, const char *verify_type, JsonVariant **ret); +int cryptsetup_add_token_json(struct crypt_device *cd, JsonVariant *v); + +#else + +/* If libcryptsetup is not available, let's at least define the basic type and NOP destructors for it, to + * make a little bit less #ifdeferry necessary in main programs. */ +struct crypt_device; +static inline void sym_crypt_free(struct crypt_device* cd) {} +static inline void sym_crypt_freep(struct crypt_device** cd) {} + +#endif + +int dlopen_cryptsetup(void); + +int cryptsetup_get_keyslot_from_token(JsonVariant *v); + +static inline const char *mangle_none(const char *s) { + /* A helper that turns cryptsetup/integritysetup/veritysetup "options" strings into NULL if they are effectively empty */ + return isempty(s) || STR_IN_SET(s, "-", "none") ? NULL : s; +} diff --git a/src/shared/daemon-util.c b/src/shared/daemon-util.c new file mode 100644 index 0000000..32180a1 --- /dev/null +++ b/src/shared/daemon-util.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "daemon-util.h" +#include "fd-util.h" +#include "log.h" +#include "string-util.h" + +static int notify_remove_fd_warn(const char *name) { + int r; + + assert(name); + + r = sd_notifyf(/* unset_environment = */ false, + "FDSTOREREMOVE=1\n" + "FDNAME=%s", name); + if (r < 0) + return log_warning_errno(r, + "Failed to remove file descriptor \"%s\" from the store, ignoring: %m", + name); + + return 0; +} + +int notify_remove_fd_warnf(const char *format, ...) 
{ + _cleanup_free_ char *p = NULL; + va_list ap; + int r; + + assert(format); + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + if (r < 0) + return log_oom(); + + return notify_remove_fd_warn(p); +} + /* If 'name' is non-NULL, first requests removal of that fd name via notify_remove_fd_warn() (its result is deliberately ignored), then closes 'fd' with safe_close() and returns whatever safe_close() returns. */ +int close_and_notify_warn(int fd, const char *name) { + if (name) + (void) notify_remove_fd_warn(name); + + return safe_close(fd); +} + /* Builds the message "FDSTORE=1", newline, "FDNAME=" followed by 'name', and sends it together with 'fd' through sd_pid_notify_with_fds(). Returns -ENOMEM if the message cannot be allocated, otherwise the result of sd_pid_notify_with_fds(). Nothing is logged here. */ +static int notify_push_fd(int fd, const char *name) { + _cleanup_free_ char *state = NULL; + + assert(fd >= 0); + assert(name); + + state = strjoin("FDSTORE=1\n" + "FDNAME=", name); + if (!state) + return -ENOMEM; + + return sd_pid_notify_with_fds(0, /* unset_environment = */ false, state, &fd, 1); +} + /* printf-style front-end for notify_push_fd(): formats the fd name from 'format' and the variadic arguments, then pushes 'fd' under that name. Returns -ENOMEM if formatting fails. */ +int notify_push_fdf(int fd, const char *format, ...) { + _cleanup_free_ char *name = NULL; + va_list ap; + int r; + + assert(fd >= 0); + assert(format); + + va_start(ap, format); + r = vasprintf(&name, format, ap); + va_end(ap); + if (r < 0) + return -ENOMEM; + + return notify_push_fd(fd, name); +} diff --git a/src/shared/daemon-util.h b/src/shared/daemon-util.h new file mode 100644 index 0000000..711885b --- /dev/null +++ b/src/shared/daemon-util.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-daemon.h" + +#include "macro.h" + +#define NOTIFY_READY "READY=1\n" "STATUS=Processing requests..." +#define NOTIFY_STOPPING "STOPPING=1\n" "STATUS=Shutting down..." + /* Sends 'start' via sd_notify() if it is non-NULL (errors ignored) and returns 'stop' unchanged. Presumably meant to initialize a variable that notify_on_cleanup() later sends; confirm at call sites. */ +static inline const char *notify_start(const char *start, const char *stop) { + if (start) + (void) sd_notify(false, start); + + return stop; +} + +/* This is intended to be used with _cleanup_ attribute. */ +static inline void notify_on_cleanup(const char **p) { + if (*p) + (void) sd_notify(false, *p); +} + +int notify_remove_fd_warnf(const char *format, ...) _printf_(1, 2); +int close_and_notify_warn(int fd, const char *name); +int notify_push_fdf(int fd, const char *format, ...)
_printf_(2, 3); diff --git a/src/shared/data-fd-util.c b/src/shared/data-fd-util.c new file mode 100644 index 0000000..b939206 --- /dev/null +++ b/src/shared/data-fd-util.c @@ -0,0 +1,391 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#if HAVE_LINUX_MEMFD_H +#include +#endif + +#include "alloc-util.h" +#include "copy.h" +#include "data-fd-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "io-util.h" +#include "memfd-util.h" +#include "missing_mman.h" +#include "missing_syscall.h" +#include "tmpfile-util.h" + +/* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */ +#define DATA_FD_MEMORY_LIMIT (64U*1024U) + +/* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */ +#define DATA_FD_TMP_LIMIT (1024U*1024U) + +int acquire_data_fd(const void *data, size_t size, unsigned flags) { + _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR; + _cleanup_close_ int fd = -EBADF; + int isz = 0, r; + ssize_t n; + + assert(data || size == 0); + + /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more + * complex than I wish it was. But here's why: + * + * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them + * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14. + * + * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining + * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged + * clients can only bump their size to a system-wide limit, which might be quite low. + * + * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from + * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via + * /proc/self/.
Unfortunately O_TMPFILE is not available on older kernels on tmpfs. + * + * d) Finally, we try creating a regular file in /dev/shm, which we then delete. + * + * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I + * figure. */ + + if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) + /* As a special case, return /dev/null if we have been called for an empty data block */ + return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY)); + + if ((flags & ACQUIRE_NO_MEMFD) == 0) { + fd = memfd_new_and_seal("data-fd", data, size); + if (fd < 0) { + if (ERRNO_IS_NOT_SUPPORTED(fd)) + goto try_pipe; + + return fd; + } + + return TAKE_FD(fd); + } + +try_pipe: + if ((flags & ACQUIRE_NO_PIPE) == 0) { + if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) + return -errno; + + isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); + if (isz < 0) + return -errno; + + if ((size_t) isz < size) { + isz = (int) size; + if (isz < 0 || (size_t) isz != size) + return -E2BIG; + + /* Try to bump the pipe size */ + (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz); + + /* See if that worked */ + isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); + if (isz < 0) + return -errno; + + if ((size_t) isz < size) + goto try_dev_shm; + } + + n = write(pipefds[1], data, size); + if (n < 0) + return -errno; + if ((size_t) n != size) + return -EIO; + + (void) fd_nonblock(pipefds[0], false); + + return TAKE_FD(pipefds[0]); + } + +try_dev_shm: + if ((flags & ACQUIRE_NO_TMPFILE) == 0) { + fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500); + if (fd < 0) + goto try_dev_shm_without_o_tmpfile; + + n = write(fd, data, size); + if (n < 0) + return -errno; + if ((size_t) n != size) + return -EIO; + + /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ + return fd_reopen(fd, O_RDONLY|O_CLOEXEC); + } + +try_dev_shm_without_o_tmpfile: + if ((flags & ACQUIRE_NO_REGULAR) == 0) { + char pattern[] = "/dev/shm/data-fd-XXXXXX"; + + fd = 
mkostemp_safe(pattern); + if (fd < 0) + return fd; + + n = write(fd, data, size); + if (n < 0) { + r = -errno; + goto unlink_and_return; + } + if ((size_t) n != size) { + r = -EIO; + goto unlink_and_return; + } + + /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ + r = fd_reopen(fd, O_RDONLY|O_CLOEXEC); + + unlink_and_return: + (void) unlink(pattern); + return r; + } + + return -EOPNOTSUPP; +} + +int copy_data_fd(int fd) { + _cleanup_close_ int copy_fd = -EBADF, tmp_fd = -EBADF; + _cleanup_free_ void *remains = NULL; + size_t remains_size = 0; + const char *td; + struct stat st; + int r; + + /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but + * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be + * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported + * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in + * /var/tmp. */ + + if (fstat(fd, &st) < 0) + return -errno; + + /* For now, let's only accept regular files, sockets, pipes and char devices */ + if (S_ISDIR(st.st_mode)) + return -EISDIR; + if (S_ISLNK(st.st_mode)) + return -ELOOP; + if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode)) + return -EBADFD; + + /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note + * that we use the reported regular file size only as a hint, given that there are plenty special files in + * /proc and /sys which report a zero file size but can be read from. 
*/ + + if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) { + + /* Try a memfd first */ + copy_fd = memfd_new("data-fd"); + if (copy_fd >= 0) { + off_t f; + + r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0); + if (r < 0) + return r; + + f = lseek(copy_fd, 0, SEEK_SET); + if (f != 0) + return -errno; + + if (r == 0) { + /* Did it fit into the limit? If so, we are done. */ + r = memfd_set_sealed(copy_fd); + if (r < 0) + return r; + + return TAKE_FD(copy_fd); + } + + /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */ + + } else { + _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR; + int isz; + + /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather + * than block indefinitely when we hit the pipe size limit */ + + if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) + return -errno; + + isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); + if (isz < 0) + return -errno; + + /* Try to enlarge the pipe size if necessary */ + if ((size_t) isz < DATA_FD_MEMORY_LIMIT) { + + (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT); + + isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); + if (isz < 0) + return -errno; + } + + if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) { + + r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL); + if (r < 0 && r != -EAGAIN) + return r; /* If we get EAGAIN it could be because of the source or because of + * the destination fd, we can't know, as sendfile() and friends won't + * tell us. Hence, treat this as reason to fall back, just to be + * sure. */ + if (r == 0) { + /* Everything fit in, yay! */ + (void) fd_nonblock(pipefds[0], false); + + return TAKE_FD(pipefds[0]); + } + + /* Things didn't fit in. But we read data into the pipe, let's remember that, so that + * when writing the new file we incorporate this first.
*/ + copy_fd = TAKE_FD(pipefds[0]); + } + } + } + + /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */ + if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) && + (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) { + off_t f; + + tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC); + if (tmp_fd < 0) + return tmp_fd; + + if (copy_fd >= 0) { + /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the + * temporary file first. */ + + r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0); + if (r < 0) + return r; + + assert(r == 0); + } + + if (remains_size > 0) { + /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the + * failed copy operation, let's flush them out next. */ + + r = loop_write(tmp_fd, remains, remains_size); + if (r < 0) + return r; + } + + r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK); + if (r < 0) + return r; + if (r == 0) + goto finish; /* Yay, it fit in */ + + /* It didn't fit in. Let's not forget to use what we already used */ + f = lseek(tmp_fd, 0, SEEK_SET); + if (f != 0) + return -errno; + + close_and_replace(copy_fd, tmp_fd); + + remains = mfree(remains); + remains_size = 0; + } + + /* As last fallback use /var/tmp */ + r = var_tmp_dir(&td); + if (r < 0) + return r; + + tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC); + if (tmp_fd < 0) + return tmp_fd; + + if (copy_fd >= 0) { + /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, then copy this + * into the temporary file first. */ + r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK); + if (r < 0) + return r; + + assert(r == 0); + } + + if (remains_size > 0) { + /* Then, copy in any read but not yet written bytes.
*/ + r = loop_write(tmp_fd, remains, remains_size); + if (r < 0) + return r; + } + + /* Copy in the rest */ + r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK); + if (r < 0) + return r; + + assert(r == 0); + +finish: + /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as a side effect seek to the beginning of the + * file again) */ + + return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC); +} + +int memfd_clone_fd(int fd, const char *name, int mode) { + _cleanup_close_ int mfd = -EBADF; + struct stat st; + bool ro, exec; + int r; + + /* Creates a clone of a regular file in a memfd. Unlike copy_data_fd() this returns strictly a memfd + * (and if it can't it will fail). Thus the resulting fd is seekable, and definitely reports as + * S_ISREG. */ + + assert(fd >= 0); + assert(name); + assert(IN_SET(mode & O_ACCMODE, O_RDONLY, O_RDWR)); + assert((mode & ~(O_RDONLY|O_RDWR|O_CLOEXEC)) == 0); + + if (fstat(fd, &st) < 0) + return -errno; + + ro = (mode & O_ACCMODE) == O_RDONLY; + exec = st.st_mode & 0111; + + mfd = memfd_create_wrapper(name, + ((FLAGS_SET(mode, O_CLOEXEC) || ro) ? MFD_CLOEXEC : 0) | + (ro ? MFD_ALLOW_SEALING : 0) | + (exec ?
MFD_EXEC : MFD_NOEXEC_SEAL)); + if (mfd < 0) + return mfd; + + r = copy_bytes(fd, mfd, UINT64_MAX, COPY_REFLINK); + if (r < 0) + return r; + + if (ro) { + _cleanup_close_ int rfd = -EBADF; + + r = memfd_set_sealed(mfd); + if (r < 0) + return r; + + rfd = fd_reopen(mfd, mode); + if (rfd < 0) + return rfd; + + return TAKE_FD(rfd); + } + + off_t f = lseek(mfd, 0, SEEK_SET); + if (f < 0) + return -errno; + + return TAKE_FD(mfd); +} diff --git a/src/shared/data-fd-util.h b/src/shared/data-fd-util.h new file mode 100644 index 0000000..4f3d8b8 --- /dev/null +++ b/src/shared/data-fd-util.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +enum { + ACQUIRE_NO_DEV_NULL = 1 << 0, + ACQUIRE_NO_MEMFD = 1 << 1, + ACQUIRE_NO_PIPE = 1 << 2, + ACQUIRE_NO_TMPFILE = 1 << 3, + ACQUIRE_NO_REGULAR = 1 << 4, +}; + +int acquire_data_fd(const void *data, size_t size, unsigned flags); +int copy_data_fd(int fd); +int memfd_clone_fd(int fd, const char *name, int mode); diff --git a/src/shared/dev-setup.c b/src/shared/dev-setup.c new file mode 100644 index 0000000..f7ed161 --- /dev/null +++ b/src/shared/dev-setup.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "dev-setup.h" +#include "fd-util.h" +#include "label-util.h" +#include "lock-util.h" +#include "log.h" +#include "mkdir-label.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "terminal-util.h" +#include "umask-util.h" +#include "user-util.h" + +int lock_dev_console(void) { + _cleanup_close_ int fd = -EBADF; + int r; + + fd = open_terminal("/dev/console", O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return fd; + + r = lock_generic(fd, LOCK_BSD, LOCK_EX); + if (r < 0) + return log_error_errno(r, "Failed to lock /dev/console: %m"); + + return TAKE_FD(fd); +} + +int dev_setup(const char *prefix, uid_t uid, gid_t gid) { + static const char symlinks[] = + "-/proc/kcore\0" 
"/dev/core\0" + "/proc/self/fd\0" "/dev/fd\0" + "/proc/self/fd/0\0" "/dev/stdin\0" + "/proc/self/fd/1\0" "/dev/stdout\0" + "/proc/self/fd/2\0" "/dev/stderr\0"; + + int r; + + NULSTR_FOREACH_PAIR(j, k, symlinks) { + _cleanup_free_ char *link_name = NULL; + const char *n; + + if (j[0] == '-') { + j++; + + if (access(j, F_OK) < 0) + continue; + } + + if (prefix) { + link_name = path_join(prefix, k); + if (!link_name) + return -ENOMEM; + + n = link_name; + } else + n = k; + + r = symlink_label(j, n); + if (r < 0) + log_debug_errno(r, "Failed to symlink %s to %s: %m", j, n); + + if (uid != UID_INVALID || gid != GID_INVALID) + if (lchown(n, uid, gid) < 0) + log_debug_errno(errno, "Failed to chown %s: %m", n); + } + + return 0; +} + +int make_inaccessible_nodes( + const char *parent_dir, + uid_t uid, + gid_t gid) { + + static const struct { + const char *name; + mode_t mode; + } table[] = { + { "inaccessible", S_IFDIR | 0755 }, + { "inaccessible/reg", S_IFREG | 0000 }, + { "inaccessible/dir", S_IFDIR | 0000 }, + { "inaccessible/fifo", S_IFIFO | 0000 }, + { "inaccessible/sock", S_IFSOCK | 0000 }, + + /* The following two are likely to fail if we lack the privs for it (for example in an userns + * environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibits creation of + * device nodes with a major/minor of 0). But that's entirely fine. Consumers of these files + * should implement falling back to use a different node then, for example + * /inaccessible/sock, which is close enough in behaviour and semantics for most uses. + */ + { "inaccessible/chr", S_IFCHR | 0000 }, + { "inaccessible/blk", S_IFBLK | 0000 }, + }; + + int r; + + if (!parent_dir) + parent_dir = "/run/systemd"; + + BLOCK_WITH_UMASK(0000); + + /* Set up inaccessible (and empty) file nodes of all types. This are used to as mount sources for over-mounting + * ("masking") file nodes that shall become inaccessible and empty for specific containers or services. 
We try + * to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the + * underlying file, i.e. in the best case we offer the same node type as the underlying node. */ + + for (size_t i = 0; i < ELEMENTSOF(table); i++) { + _cleanup_free_ char *path = NULL; + + path = path_join(parent_dir, table[i].name); + if (!path) + return log_oom(); + + if (S_ISDIR(table[i].mode)) + r = mkdir_label(path, table[i].mode & 07777); + else + r = mknod_label(path, table[i].mode, makedev(0, 0)); + if (r < 0) { + log_debug_errno(r, "Failed to create '%s', ignoring: %m", path); + continue; + } + + if (uid != UID_INVALID || gid != GID_INVALID) { + if (lchown(path, uid, gid) < 0) + log_debug_errno(errno, "Failed to chown '%s': %m", path); + } + } + + return 0; +} diff --git a/src/shared/dev-setup.h b/src/shared/dev-setup.h new file mode 100644 index 0000000..5339bc4 --- /dev/null +++ b/src/shared/dev-setup.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int lock_dev_console(void); + +int dev_setup(const char *prefix, uid_t uid, gid_t gid); + +int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid); diff --git a/src/shared/device-nodes.c b/src/shared/device-nodes.c new file mode 100644 index 0000000..d08c40f --- /dev/null +++ b/src/shared/device-nodes.c @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "device-nodes.h" +#include "path-util.h" +#include "string-util.h" +#include "utf8.h" + +int allow_listed_char_for_devnode(char c, const char *additional) { + return + ascii_isdigit(c) || + ascii_isalpha(c) || + strchr("#+-.:=@_", c) || + (additional && strchr(additional, c)); +} + +int encode_devnode_name(const char *str, char *str_enc, size_t len) { + size_t i, j; + + if (!str || !str_enc) + return -EINVAL; + + for (i = 0, j = 0; str[i] != '\0'; i++) { + int seqlen; + + seqlen = 
utf8_encoded_valid_unichar(str + i, SIZE_MAX); + if (seqlen > 1) { + + if (len-j < (size_t) seqlen) + return -EINVAL; + + memcpy(&str_enc[j], &str[i], seqlen); + j += seqlen; + i += (seqlen-1); + + } else if (str[i] == '\\' || !allow_listed_char_for_devnode(str[i], NULL)) { + + if (len-j < 4) + return -EINVAL; + + sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]); + j += 4; + + } else { + if (len-j < 1) + return -EINVAL; + + str_enc[j] = str[i]; + j++; + } + } + + if (len-j < 1) + return -EINVAL; + + str_enc[j] = '\0'; + return 0; +} + +int devnode_same(const char *a, const char *b) { + struct stat sa, sb; + + assert(a); + assert(b); + + if (!valid_device_node_path(a) || !valid_device_node_path(b)) + return -EINVAL; + + if (stat(a, &sa) < 0) + return -errno; + if (stat(b, &sb) < 0) + return -errno; + + if (!S_ISBLK(sa.st_mode) && !S_ISCHR(sa.st_mode)) + return -ENODEV; + if (!S_ISBLK(sb.st_mode) && !S_ISCHR(sb.st_mode)) + return -ENODEV; + + if (((sa.st_mode ^ sb.st_mode) & S_IFMT) != 0) /* both inode same device node type? 
*/ + return false; + + return sa.st_rdev == sb.st_rdev; +} diff --git a/src/shared/device-nodes.h b/src/shared/device-nodes.h new file mode 100644 index 0000000..8b17a8e --- /dev/null +++ b/src/shared/device-nodes.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int encode_devnode_name(const char *str, char *str_enc, size_t len); +int allow_listed_char_for_devnode(char c, const char *additional); + +int devnode_same(const char *a, const char *b); diff --git a/src/shared/devnode-acl.c b/src/shared/devnode-acl.c new file mode 100644 index 0000000..b239699 --- /dev/null +++ b/src/shared/devnode-acl.c @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-device.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "device-util.h" +#include "devnode-acl.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "set.h" +#include "string-util.h" + +static int flush_acl(acl_t acl) { + acl_entry_t i; + int found; + bool changed = false; + + assert(acl); + + for (found = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + found > 0; + found = acl_get_entry(acl, ACL_NEXT_ENTRY, &i)) { + + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (tag != ACL_USER) + continue; + + if (acl_delete_entry(acl, i) < 0) + return -errno; + + changed = true; + } + + if (found < 0) + return -errno; + + return changed; +} + +int devnode_acl(const char *path, + bool flush, + bool del, uid_t old_uid, + bool add, uid_t new_uid) { + + _cleanup_(acl_freep) acl_t acl = NULL; + int r; + bool changed = false; + + assert(path); + + acl = acl_get_file(path, ACL_TYPE_ACCESS); + if (!acl) + return -errno; + + if (flush) { + + r = flush_acl(acl); + if (r < 0) + return r; + if (r > 0) + changed = true; + + } else if (del && old_uid > 0) { + acl_entry_t entry; + + r = acl_find_uid(acl, old_uid, &entry); + if (r < 0) + 
return r; + + if (r > 0) { + if (acl_delete_entry(acl, entry) < 0) + return -errno; + + changed = true; + } + } + + if (add && new_uid > 0) { + acl_entry_t entry; + acl_permset_t permset; + int rd, wt; + + r = acl_find_uid(acl, new_uid, &entry); + if (r < 0) + return r; + + if (r == 0) { + if (acl_create_entry(&acl, &entry) < 0) + return -errno; + + if (acl_set_tag_type(entry, ACL_USER) < 0 || + acl_set_qualifier(entry, &new_uid) < 0) + return -errno; + } + + if (acl_get_permset(entry, &permset) < 0) + return -errno; + + rd = acl_get_perm(permset, ACL_READ); + if (rd < 0) + return -errno; + + wt = acl_get_perm(permset, ACL_WRITE); + if (wt < 0) + return -errno; + + if (!rd || !wt) { + + if (acl_add_perm(permset, ACL_READ|ACL_WRITE) < 0) + return -errno; + + changed = true; + } + } + + if (!changed) + return 0; + + if (acl_calc_mask(&acl) < 0) + return -errno; + + if (acl_set_file(path, ACL_TYPE_ACCESS, acl) < 0) + return -errno; + + return 0; +} + +int devnode_acl_all(const char *seat, + bool flush, + bool del, uid_t old_uid, + bool add, uid_t new_uid) { + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_set_free_ Set *nodes = NULL; + _cleanup_closedir_ DIR *dir = NULL; + char *n; + int r; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + if (isempty(seat)) + seat = "seat0"; + + /* We can only match by one tag in libudev. We choose + * "uaccess" for that. If we could match for two tags here we + * could add the seat name as second match tag, but this would + * be hardly optimizable in libudev, and hence checking the + * second tag manually in our loop is a good solution. 
*/ + r = sd_device_enumerator_add_match_tag(e, "uaccess"); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + const char *node, *sn; + + /* Make sure the tag is still in place */ + if (sd_device_has_current_tag(d, "uaccess") <= 0) + continue; + + if (sd_device_get_property_value(d, "ID_SEAT", &sn) < 0 || isempty(sn)) + sn = "seat0"; + + if (!streq(seat, sn)) + continue; + + /* In case people mistag devices with nodes, we need to ignore this */ + if (sd_device_get_devname(d, &node) < 0) + continue; + + log_device_debug(d, "Found udev node %s for seat %s", node, seat); + r = set_put_strdup_full(&nodes, &path_hash_ops_free, node); + if (r < 0) + return r; + } + + /* udev exports "dead" device nodes to allow module on-demand loading, + * these devices are not known to the kernel at this moment */ + dir = opendir("/run/udev/static_node-tags/uaccess"); + if (dir) { + FOREACH_DIRENT(de, dir, return -errno) { + r = readlinkat_malloc(dirfd(dir), de->d_name, &n); + if (r == -ENOENT) + continue; + if (r < 0) { + log_debug_errno(r, + "Unable to read symlink '/run/udev/static_node-tags/uaccess/%s', ignoring: %m", + de->d_name); + continue; + } + + log_debug("Found static node %s for seat %s", n, seat); + r = set_ensure_consume(&nodes, &path_hash_ops_free, n); + if (r < 0) + return r; + } + } + + r = 0; + SET_FOREACH(n, nodes) { + int k; + + log_debug("Changing ACLs at %s for seat %s (uid "UID_FMT"%s"UID_FMT"%s%s)", + n, seat, old_uid, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), new_uid, + del ? " del" : "", add ? 
" add" : ""); + + k = devnode_acl(n, flush, del, old_uid, add, new_uid); + if (k == -ENOENT) + log_debug("Device %s disappeared while setting ACLs", n); + else + RET_GATHER(r, k); + } + + return r; +} diff --git a/src/shared/devnode-acl.h b/src/shared/devnode-acl.h new file mode 100644 index 0000000..c88f3c0 --- /dev/null +++ b/src/shared/devnode-acl.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#if HAVE_ACL + +int devnode_acl(const char *path, + bool flush, + bool del, uid_t old_uid, + bool add, uid_t new_uid); + +int devnode_acl_all(const char *seat, + bool flush, + bool del, uid_t old_uid, + bool add, uid_t new_uid); +#else + +static inline int devnode_acl(const char *path, + bool flush, + bool del, uid_t old_uid, + bool add, uid_t new_uid) { + return 0; +} + +static inline int devnode_acl_all(const char *seat, + bool flush, + bool del, uid_t old_uid, + bool add, uid_t new_uid) { + return 0; +} + +#endif diff --git a/src/shared/discover-image.c b/src/shared/discover-image.c new file mode 100644 index 0000000..e8f4dfb --- /dev/null +++ b/src/shared/discover-image.c @@ -0,0 +1,1385 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "chase.h" +#include "chattr-util.h" +#include "copy.h" +#include "dirent-util.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "env-file.h" +#include "env-util.h" +#include "extension-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "hostname-setup.h" +#include "id128-util.h" +#include "initrd-util.h" +#include "lock-util.h" +#include "log.h" +#include "loop-util.h" +#include "macro.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "string-table.h" 
+#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "utf8.h" +#include "xattr-util.h" + +static const char* const image_search_path[_IMAGE_CLASS_MAX] = { + [IMAGE_MACHINE] = "/etc/machines\0" /* only place symlinks here */ + "/run/machines\0" /* and here too */ + "/var/lib/machines\0" /* the main place for images */ + "/var/lib/container\0" /* legacy */ + "/usr/local/lib/machines\0" + "/usr/lib/machines\0", + + [IMAGE_PORTABLE] = "/etc/portables\0" /* only place symlinks here */ + "/run/portables\0" /* and here too */ + "/var/lib/portables\0" /* the main place for images */ + "/usr/local/lib/portables\0" + "/usr/lib/portables\0", + + /* Note that we don't allow storing extensions under /usr/, unlike with other image types. That's + * because extension images are supposed to extend /usr/, so you get into recursive races, especially + * with directory-based extensions, as the kernel's OverlayFS explicitly checks for this and errors + * out with -ELOOP if it finds that a lowerdir= is a child of another lowerdir=. */ + [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */ + "/run/extensions\0" /* and here too */ + "/var/lib/extensions\0", /* the main place for images */ + + [IMAGE_CONFEXT] = "/run/confexts\0" /* only place symlinks here */ + "/var/lib/confexts\0" /* the main place for images */ + "/usr/local/lib/confexts\0" + "/usr/lib/confexts\0", +}; + +/* Inside the initrd, use a slightly different set of search path (i.e. 
include .extra/sysext in extension + * search dir) */ +static const char* const image_search_path_initrd[_IMAGE_CLASS_MAX] = { + /* (entries that aren't listed here will get the same search path as for the non initrd-case) */ + + [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */ + "/run/extensions\0" /* and here too */ + "/var/lib/extensions\0" /* the main place for images */ + "/.extra/sysext\0" /* put sysext picked up by systemd-stub last, since not trusted */ +}; + +static const char* image_class_suffix_table[_IMAGE_CLASS_MAX] = { + [IMAGE_SYSEXT] = ".sysext", + [IMAGE_CONFEXT] = ".confext", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(image_class_suffix, ImageClass); + +static Image *image_free(Image *i) { + assert(i); + + free(i->name); + free(i->path); + + free(i->hostname); + strv_free(i->machine_info); + strv_free(i->os_release); + strv_free(i->sysext_release); + strv_free(i->confext_release); + + return mfree(i); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(Image, image, image_free); +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(image_hash_ops, char, string_hash_func, string_compare_func, + Image, image_unref); + +static char **image_settings_path(Image *image) { + _cleanup_strv_free_ char **l = NULL; + _cleanup_free_ char *fn = NULL; + size_t i = 0; + int r; + + assert(image); + + l = new0(char*, 4); + if (!l) + return NULL; + + fn = strjoin(image->name, ".nspawn"); + if (!fn) + return NULL; + + FOREACH_STRING(s, "/etc/systemd/nspawn", "/run/systemd/nspawn") { + l[i] = path_join(s, fn); + if (!l[i]) + return NULL; + + i++; + } + + r = file_in_same_dir(image->path, fn, l + i); + if (r == -ENOMEM) + return NULL; + if (r < 0) + log_debug_errno(r, "Failed to generate .nspawn settings path from image path, ignoring: %m"); + + strv_uniq(l); + + return TAKE_PTR(l); +} + +static int image_roothash_path(Image *image, char **ret) { + _cleanup_free_ char *fn = NULL; + + assert(image); + + fn = strjoin(image->name, ".roothash"); + if (!fn) + return -ENOMEM; 
+ + return file_in_same_dir(image->path, fn, ret); +} + +static int image_new( + ImageType t, + ImageClass c, + const char *pretty, + const char *path, + const char *filename, + bool read_only, + usec_t crtime, + usec_t mtime, + Image **ret) { + + _cleanup_(image_unrefp) Image *i = NULL; + + assert(t >= 0); + assert(t < _IMAGE_TYPE_MAX); + assert(pretty); + assert(filename); + assert(ret); + + i = new(Image, 1); + if (!i) + return -ENOMEM; + + *i = (Image) { + .n_ref = 1, + .type = t, + .class = c, + .read_only = read_only, + .crtime = crtime, + .mtime = mtime, + .usage = UINT64_MAX, + .usage_exclusive = UINT64_MAX, + .limit = UINT64_MAX, + .limit_exclusive = UINT64_MAX, + }; + + i->name = strdup(pretty); + if (!i->name) + return -ENOMEM; + + i->path = path_join(path, filename); + if (!i->path) + return -ENOMEM; + + path_simplify(i->path); + + *ret = TAKE_PTR(i); + + return 0; +} + +static int extract_pretty( + const char *path, + const char *class_suffix, + const char *format_suffix, + char **ret) { + + _cleanup_free_ char *name = NULL; + int r; + + assert(path); + assert(ret); + + r = path_extract_filename(path, &name); + if (r < 0) + return r; + + if (format_suffix) { + char *e = endswith(name, format_suffix); + if (!e) /* Format suffix is required */ + return -EINVAL; + + *e = 0; + } + + if (class_suffix) { + char *e = endswith(name, class_suffix); + if (e) /* Class suffix is optional */ + *e = 0; + } + + if (!image_name_is_valid(name)) + return -EINVAL; + + *ret = TAKE_PTR(name); + return 0; +} + +static int image_make( + ImageClass c, + const char *pretty, + int dfd, + const char *path, + const char *filename, + const struct stat *st, + Image **ret) { + + _cleanup_free_ char *pretty_buffer = NULL, *parent = NULL; + struct stat stbuf; + bool read_only; + int r; + + assert(dfd >= 0 || dfd == AT_FDCWD); + assert(path || dfd == AT_FDCWD); + assert(filename); + + /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and 
block + * devices into /var/lib/machines/, and treat them normally. + * + * This function returns -ENOENT if we can't find the image after all, and -EMEDIUMTYPE if it's not a file we + * recognize. */ + + if (!st) { + if (fstatat(dfd, filename, &stbuf, 0) < 0) + return -errno; + + st = &stbuf; + } + + if (!path) { + if (dfd == AT_FDCWD) + (void) safe_getcwd(&parent); + else + (void) fd_get_path(dfd, &parent); + } + + read_only = + (path && path_startswith(path, "/usr")) || + (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS); + + if (S_ISDIR(st->st_mode)) { + _cleanup_close_ int fd = -EBADF; + unsigned file_attr = 0; + usec_t crtime = 0; + + if (!ret) + return 0; + + if (!pretty) { + r = extract_pretty(filename, image_class_suffix_to_string(c), NULL, &pretty_buffer); + if (r < 0) + return r; + + pretty = pretty_buffer; + } + + fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY); + if (fd < 0) + return -errno; + + if (btrfs_might_be_subvol(st)) { + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return r; + if (r > 0) { + BtrfsSubvolInfo info; + + /* It's a btrfs subvolume */ + + r = btrfs_subvol_get_info_fd(fd, 0, &info); + if (r < 0) + return r; + + r = image_new(IMAGE_SUBVOLUME, + c, + pretty, + path, + filename, + info.read_only || read_only, + info.otime, + 0, + ret); + if (r < 0) + return r; + + if (btrfs_quota_scan_ongoing(fd) == 0) { + BtrfsQuotaInfo quota; + + r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota); + if (r >= 0) { + (*ret)->usage = quota.referenced; + (*ret)->usage_exclusive = quota.exclusive; + + (*ret)->limit = quota.referenced_max; + (*ret)->limit_exclusive = quota.exclusive_max; + } + } + + return 0; + } + } + + /* Get directory creation time (not available everywhere, but that's OK) */ + (void) fd_getcrtime(fd, &crtime); + + /* If the IMMUTABLE bit is set, we consider the directory read-only. Since the ioctl is not + * supported everywhere we ignore failures.
*/ + (void) read_attr_fd(fd, &file_attr); + + /* It's just a normal directory. */ + r = image_new(IMAGE_DIRECTORY, + c, + pretty, + path, + filename, + read_only || (file_attr & FS_IMMUTABLE_FL), + crtime, + 0, /* we don't use mtime of stat() here, since it's not the time of last change of the tree, but only of the top-level dir */ + ret); + if (r < 0) + return r; + + return 0; + + } else if (S_ISREG(st->st_mode) && endswith(filename, ".raw")) { + usec_t crtime = 0; + + /* It's a RAW disk image */ + + if (!ret) + return 0; + + (void) fd_getcrtime_at(dfd, filename, AT_SYMLINK_FOLLOW, &crtime); + + if (!pretty) { + r = extract_pretty(filename, image_class_suffix_to_string(c), ".raw", &pretty_buffer); + if (r < 0) + return r; + + pretty = pretty_buffer; + } + + r = image_new(IMAGE_RAW, + c, + pretty, + path, + filename, + !(st->st_mode & 0222) || read_only, + crtime, + timespec_load(&st->st_mtim), + ret); + if (r < 0) + return r; + + (*ret)->usage = (*ret)->usage_exclusive = st->st_blocks * 512; + (*ret)->limit = (*ret)->limit_exclusive = st->st_size; + + return 0; + + } else if (S_ISBLK(st->st_mode)) { + _cleanup_close_ int block_fd = -EBADF; + uint64_t size = UINT64_MAX; + + /* A block device */ + + if (!ret) + return 0; + + if (!pretty) { + r = extract_pretty(filename, NULL, NULL, &pretty_buffer); + if (r < 0) + return r; + + pretty = pretty_buffer; + } + + block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (block_fd < 0) + log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path ?: strnull(parent), filename); + else { + /* Refresh stat data after opening the node */ + if (fstat(block_fd, &stbuf) < 0) + return -errno; + st = &stbuf; + + if (!S_ISBLK(st->st_mode)) /* Verify that what we opened is actually what we think it is */ + return -ENOTTY; + + if (!read_only) { + int state = 0; + + if (ioctl(block_fd, BLKROGET, &state) < 0) + log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path 
?: strnull(parent), filename); + else if (state) + read_only = true; + } + + if (ioctl(block_fd, BLKGETSIZE64, &size) < 0) + log_debug_errno(errno, "Failed to issue BLKGETSIZE64 on device %s/%s, ignoring: %m", path ?: strnull(parent), filename); + + block_fd = safe_close(block_fd); + } + + r = image_new(IMAGE_BLOCK, + c, + pretty, + path, + filename, + !(st->st_mode & 0222) || read_only, + 0, + 0, + ret); + if (r < 0) + return r; + + if (!IN_SET(size, 0, UINT64_MAX)) + (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size; + + return 0; + } + + return -EMEDIUMTYPE; +} + +static const char *pick_image_search_path(ImageClass class) { + if (class < 0 || class >= _IMAGE_CLASS_MAX) + return NULL; + + /* Use the initrd search path if there is one, otherwise use the common one */ + return in_initrd() && image_search_path_initrd[class] ? image_search_path_initrd[class] : image_search_path[class]; +} + +int image_find(ImageClass class, + const char *name, + const char *root, + Image **ret) { + + int r; + + assert(class >= 0); + assert(class < _IMAGE_CLASS_MAX); + assert(name); + + /* There are no images with invalid names */ + if (!image_name_is_valid(name)) + return -ENOENT; + + NULSTR_FOREACH(path, pick_image_search_path(class)) { + _cleanup_free_ char *resolved = NULL; + _cleanup_closedir_ DIR *d = NULL; + struct stat st; + int flags; + + r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + + /* As mentioned above, we follow symlinks on this fstatat(), because we want to permit people + * to symlink block devices into the search path. (For now, we disable that when operating + * relative to some root directory.) */ + flags = root ? 
AT_SYMLINK_NOFOLLOW : 0; + if (fstatat(dirfd(d), name, &st, flags) < 0) { + _cleanup_free_ char *raw = NULL; + + if (errno != ENOENT) + return -errno; + + raw = strjoin(name, ".raw"); + if (!raw) + return -ENOMEM; + + if (fstatat(dirfd(d), raw, &st, flags) < 0) { + if (errno == ENOENT) + continue; + + return -errno; + } + + if (!S_ISREG(st.st_mode)) + continue; + + r = image_make(class, name, dirfd(d), resolved, raw, &st, ret); + + } else { + if (!S_ISDIR(st.st_mode) && !S_ISBLK(st.st_mode)) + continue; + + r = image_make(class, name, dirfd(d), resolved, name, &st, ret); + } + if (IN_SET(r, -ENOENT, -EMEDIUMTYPE)) + continue; + if (r < 0) + return r; + + if (ret) + (*ret)->discoverable = true; + + return 1; + } + + if (class == IMAGE_MACHINE && streq(name, ".host")) { + r = image_make(class, ".host", AT_FDCWD, NULL, empty_to_root(root), NULL, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->discoverable = true; + + return r; + } + + return -ENOENT; +}; + +int image_from_path(const char *path, Image **ret) { + + /* Note that we don't set the 'discoverable' field of the returned object, because we don't check here whether + * the image is in the image search path. 
And if it is we don't know if the path we used is actually not + * overridden by another, different image earlier in the search path */ + + if (path_equal(path, "/")) + return image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, "/", NULL, ret); + + return image_make(_IMAGE_CLASS_INVALID, NULL, AT_FDCWD, NULL, path, NULL, ret); +} + +int image_find_harder(ImageClass class, const char *name_or_path, const char *root, Image **ret) { + if (image_name_is_valid(name_or_path)) + return image_find(class, name_or_path, root, ret); + + return image_from_path(name_or_path, ret); +} + +int image_discover( + ImageClass class, + const char *root, + Hashmap *h) { + + int r; + + assert(class >= 0); + assert(class < _IMAGE_CLASS_MAX); + assert(h); + + NULSTR_FOREACH(path, pick_image_search_path(class)) { + _cleanup_free_ char *resolved = NULL; + _cleanup_closedir_ DIR *d = NULL; + + r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + + FOREACH_DIRENT_ALL(de, d, return -errno) { + _cleanup_(image_unrefp) Image *image = NULL; + _cleanup_free_ char *pretty = NULL; + struct stat st; + int flags; + + if (dot_or_dot_dot(de->d_name)) + continue; + + /* As mentioned above, we follow symlinks on this fstatat(), because we want to + * permit people to symlink block devices into the search path. */ + flags = root ? 
AT_SYMLINK_NOFOLLOW : 0; + if (fstatat(dirfd(d), de->d_name, &st, flags) < 0) { + if (errno == ENOENT) + continue; + + return -errno; + } + + if (S_ISREG(st.st_mode)) + r = extract_pretty(de->d_name, image_class_suffix_to_string(class), ".raw", &pretty); + else if (S_ISDIR(st.st_mode)) + r = extract_pretty(de->d_name, image_class_suffix_to_string(class), NULL, &pretty); + else if (S_ISBLK(st.st_mode)) + r = extract_pretty(de->d_name, NULL, NULL, &pretty); + else { + log_debug("Skipping directory entry '%s', which is neither regular file, directory nor block device.", de->d_name); + continue; + } + if (r < 0) { + log_debug_errno(r, "Skipping directory entry '%s', which doesn't look like an image.", de->d_name); + continue; + } + + if (hashmap_contains(h, pretty)) + continue; + + r = image_make(class, pretty, dirfd(d), resolved, de->d_name, &st, &image); + if (IN_SET(r, -ENOENT, -EMEDIUMTYPE)) + continue; + if (r < 0) + return r; + + image->discoverable = true; + + r = hashmap_put(h, image->name, image); + if (r < 0) + return r; + + TAKE_PTR(image); + } + } + + if (class == IMAGE_MACHINE && !hashmap_contains(h, ".host")) { + _cleanup_(image_unrefp) Image *image = NULL; + + r = image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, empty_to_root("/"), NULL, &image); + if (r < 0) + return r; + + image->discoverable = true; + + r = hashmap_put(h, image->name, image); + if (r < 0) + return r; + + image = NULL; + } + + return 0; +} + +int image_remove(Image *i) { + _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT; + _cleanup_strv_free_ char **settings = NULL; + _cleanup_free_ char *roothash = NULL; + int r; + + assert(i); + + if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i)) + return -EROFS; + + settings = image_settings_path(i); + if (!settings) + return -ENOMEM; + + r = image_roothash_path(i, &roothash); + if (r < 0) + return r; + + /* Make sure we don't interfere with a running nspawn */ + r = image_path_lock(i->path, 
LOCK_EX|LOCK_NB, &global_lock, &local_lock); + if (r < 0) + return r; + + switch (i->type) { + + case IMAGE_SUBVOLUME: + + /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the + * big guns */ + if (unlink(i->path) < 0) { + r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); + if (r < 0) + return r; + } + + break; + + case IMAGE_DIRECTORY: + /* Allow deletion of read-only directories */ + (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL); + r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME); + if (r < 0) + return r; + + break; + + case IMAGE_BLOCK: + + /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node + * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink + * the thing (it's most likely a symlink after all). */ + + if (path_startswith(i->path, "/dev")) + break; + + _fallthrough_; + case IMAGE_RAW: + if (unlink(i->path) < 0) + return -errno; + break; + + default: + return -EOPNOTSUPP; + } + + STRV_FOREACH(j, settings) + if (unlink(*j) < 0 && errno != ENOENT) + log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j); + + if (unlink(roothash) < 0 && errno != ENOENT) + log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", roothash); + + return 0; +} + +static int rename_auxiliary_file(const char *path, const char *new_name, const char *suffix) { + _cleanup_free_ char *fn = NULL, *rs = NULL; + int r; + + fn = strjoin(new_name, suffix); + if (!fn) + return -ENOMEM; + + r = file_in_same_dir(path, fn, &rs); + if (r < 0) + return r; + + return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs); +} + +int image_rename(Image *i, const char *new_name) { + _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT; + _cleanup_free_ char *new_path = NULL, *nn = NULL, *roothash = NULL; + _cleanup_strv_free_ char 
**settings = NULL; + unsigned file_attr = 0; + int r; + + assert(i); + + if (!image_name_is_valid(new_name)) + return -EINVAL; + + if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i)) + return -EROFS; + + settings = image_settings_path(i); + if (!settings) + return -ENOMEM; + + r = image_roothash_path(i, &roothash); + if (r < 0) + return r; + + /* Make sure we don't interfere with a running nspawn */ + r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock); + if (r < 0) + return r; + + /* Make sure nobody takes the new name, between the time we + * checked it is currently unused in all search paths, and the + * time we take possession of it */ + r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock); + if (r < 0) + return r; + + r = image_find(IMAGE_MACHINE, new_name, NULL, NULL); + if (r >= 0) + return -EEXIST; + if (r != -ENOENT) + return r; + + switch (i->type) { + + case IMAGE_DIRECTORY: + /* Turn off the immutable bit while we rename the image, so that we can rename it */ + (void) read_attr_path(i->path, &file_attr); + + if (file_attr & FS_IMMUTABLE_FL) + (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL); + + _fallthrough_; + case IMAGE_SUBVOLUME: + r = file_in_same_dir(i->path, new_name, &new_path); + break; + + case IMAGE_BLOCK: + + /* Refuse renaming raw block devices in /dev, the names are picked by udev after all.
*/ + if (path_startswith(i->path, "/dev")) + return -EROFS; + + r = file_in_same_dir(i->path, new_name, &new_path); + break; + + case IMAGE_RAW: { + const char *fn; + + fn = strjoina(new_name, ".raw"); + + r = file_in_same_dir(i->path, fn, &new_path); + break; + } + + default: + return -EOPNOTSUPP; + } + if (r < 0) + return r; + + nn = strdup(new_name); + if (!nn) + return -ENOMEM; + + r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path); + if (r < 0) + return r; + + /* Restore the immutable bit, if it was set before */ + if (file_attr & FS_IMMUTABLE_FL) + (void) chattr_path(new_path, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL); + + free_and_replace(i->path, new_path); + free_and_replace(i->name, nn); + + STRV_FOREACH(j, settings) { + r = rename_auxiliary_file(*j, new_name, ".nspawn"); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j); + } + + r = rename_auxiliary_file(roothash, new_name, ".roothash"); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to rename roothash file %s, ignoring: %m", roothash); + + return 0; +} + +static int clone_auxiliary_file(const char *path, const char *new_name, const char *suffix) { + _cleanup_free_ char *fn = NULL, *rs = NULL; + int r; + + fn = strjoin(new_name, suffix); + if (!fn) + return -ENOMEM; + + r = file_in_same_dir(path, fn, &rs); + if (r < 0) + return r; + + return copy_file_atomic(path, rs, 0664, COPY_REFLINK); +} + +int image_clone(Image *i, const char *new_name, bool read_only) { + _cleanup_(release_lock_file) LockFile name_lock = LOCK_FILE_INIT; + _cleanup_strv_free_ char **settings = NULL; + _cleanup_free_ char *roothash = NULL; + const char *new_path; + int r; + + assert(i); + + if (!image_name_is_valid(new_name)) + return -EINVAL; + + settings = image_settings_path(i); + if (!settings) + return -ENOMEM; + + r = image_roothash_path(i, &roothash); + if (r < 0) + return r; + + /* Make sure nobody takes the new name, between the time we + * checked it 
is currently unused in all search paths, and the + * time we take possession of it */ + r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock); + if (r < 0) + return r; + + r = image_find(IMAGE_MACHINE, new_name, NULL, NULL); + if (r >= 0) + return -EEXIST; + if (r != -ENOENT) + return r; + + switch (i->type) { + + case IMAGE_SUBVOLUME: + case IMAGE_DIRECTORY: + /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain + * directory. */ + + new_path = strjoina("/var/lib/machines/", new_name); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, i->path, AT_FDCWD, new_path, + (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_QUOTA); + if (r >= 0) + /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */ + (void) btrfs_subvol_auto_qgroup(new_path, 0, true); + + break; + + case IMAGE_RAW: + new_path = strjoina("/var/lib/machines/", new_name, ".raw"); + + r = copy_file_atomic_full(i->path, new_path, read_only ? 
0444 : 0644, FS_NOCOW_FL, FS_NOCOW_FL, + COPY_REFLINK|COPY_CRTIME, NULL, NULL); + break; + + case IMAGE_BLOCK: + default: + return -EOPNOTSUPP; + } + + if (r < 0) + return r; + + STRV_FOREACH(j, settings) { + r = clone_auxiliary_file(*j, new_name, ".nspawn"); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j); + } + + r = clone_auxiliary_file(roothash, new_name, ".roothash"); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to clone root hash file %s, ignoring: %m", roothash); + + return 0; +} + +int image_read_only(Image *i, bool b) { + _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT; + int r; + + assert(i); + + if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i)) + return -EROFS; + + /* Make sure we don't interfere with a running nspawn */ + r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock); + if (r < 0) + return r; + + switch (i->type) { + + case IMAGE_SUBVOLUME: + + /* Note that we set the flag only on the top-level + * subvolume of the image. */ + + r = btrfs_subvol_set_read_only(i->path, b); + if (r < 0) + return r; + + break; + + case IMAGE_DIRECTORY: + /* For simple directory trees we cannot use the access + mode of the top-level directory, since it has an + effect on the container itself. However, we can + use the "immutable" flag, to at least make the + top-level directory read-only. It's not as good as + a read-only subvolume, but at least something, and + we can read the value back. */ + + r = chattr_path(i->path, b ? FS_IMMUTABLE_FL : 0, FS_IMMUTABLE_FL, NULL); + if (r < 0) + return r; + + break; + + case IMAGE_RAW: { + struct stat st; + + if (stat(i->path, &st) < 0) + return -errno; + + if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0) + return -errno; + + /* If the image is now read-only, it's a good time to + * defrag it, given that no write patterns will + * fragment it again.
*/ + if (b) + (void) btrfs_defrag(i->path); + break; + } + + case IMAGE_BLOCK: { + _cleanup_close_ int fd = -EBADF; + struct stat st; + int state = b; + + fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + if (!S_ISBLK(st.st_mode)) + return -ENOTTY; + + if (ioctl(fd, BLKROSET, &state) < 0) + return -errno; + + break; + } + + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void make_lock_dir(void) { + (void) mkdir_p("/run/systemd/nspawn", 0755); + (void) mkdir("/run/systemd/nspawn/locks", 0700); +} + +int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) { + _cleanup_free_ char *p = NULL; + LockFile t = LOCK_FILE_INIT; + struct stat st; + bool exclusive; + int r; + + assert(path); + assert(global); + assert(local); + + /* Locks an image path. This actually creates two locks: one "local" one, next to the image path + * itself, which might be shared via NFS. And another "global" one, in /run, that uses the + * device/inode number. This has the benefit that we can even lock a tree that is a mount point, + * correctly. */ + + if (!path_is_absolute(path)) + return -EINVAL; + + switch (operation & (LOCK_SH|LOCK_EX)) { + case LOCK_SH: + exclusive = false; + break; + case LOCK_EX: + exclusive = true; + break; + default: + return -EINVAL; + } + + if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) { + *local = *global = (LockFile) LOCK_FILE_INIT; + return 0; + } + + /* Prohibit taking exclusive locks on the host image. We can't allow this, since we ourselves are + * running off it after all, and we don't want any images to manipulate the host image. We make an + * exception for shared locks however: we allow those (and make them NOPs since there's no point in + * taking them if there can't be exclusive locks). 
Strictly speaking these are questionable as well, + * since it means changes made to the host might propagate to the container as they happen (and a + * shared lock kinda suggests that no changes happen at all while it is in place), but it's too + * useful not to allow read-only containers off the host root, hence let's support this, and trust + * the user to do the right thing with this. */ + if (path_equal(path, "/")) { + if (exclusive) + return -EBUSY; + + *local = *global = (LockFile) LOCK_FILE_INIT; + return 0; + } + + if (stat(path, &st) >= 0) { + if (S_ISBLK(st.st_mode)) + r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev)); + else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode)) + r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino); + else + return -ENOTTY; + if (r < 0) + return -ENOMEM; + } + + /* For block devices we don't need the "local" lock, as the major/minor lock above should be + * sufficient, since block devices are host local anyway. */ + if (!path_startswith(path, "/dev/")) { + r = make_lock_file_for(path, operation, &t); + if (r < 0) { + if (!exclusive && r == -EROFS) + log_debug_errno(r, "Failed to create shared lock for '%s', ignoring: %m", path); + else + return r; + } + } + + if (p) { + make_lock_dir(); + + r = make_lock_file(p, operation, global); + if (r < 0) { + release_lock_file(&t); + return r; + } + } else + *global = (LockFile) LOCK_FILE_INIT; + + *local = t; + return 0; +} + +int image_set_limit(Image *i, uint64_t referenced_max) { + assert(i); + + if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i)) + return -EROFS; + + if (i->type != IMAGE_SUBVOLUME) + return -EOPNOTSUPP; + + /* We set the quota both for the subvolume as well as for the + * subtree. The latter is mostly for historical reasons, since + * we didn't use to have a concept of subtree quota, and hence + * only modified the subvolume quota. 
*/ + + (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max); + (void) btrfs_subvol_auto_qgroup(i->path, 0, true); + return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max); +} + +int image_read_metadata(Image *i, const ImagePolicy *image_policy) { + _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT; + int r; + + assert(i); + + r = image_path_lock(i->path, LOCK_SH|LOCK_NB, &global_lock, &local_lock); + if (r < 0) + return r; + + switch (i->type) { + + case IMAGE_SUBVOLUME: + case IMAGE_DIRECTORY: { + _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **sysext_release = NULL, **confext_release = NULL; + _cleanup_free_ char *hostname = NULL, *path = NULL; + sd_id128_t machine_id = SD_ID128_NULL; + + if (i->class == IMAGE_SYSEXT) { + r = extension_has_forbidden_content(i->path); + if (r < 0) + return r; + if (r > 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "Conflicting content found in image %s, refusing.", + i->name); + } + + r = chase("/etc/hostname", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to chase /etc/hostname in image %s: %m", i->name); + else if (r >= 0) { + r = read_etc_hostname(path, &hostname); + if (r < 0) + log_debug_errno(r, "Failed to read /etc/hostname of image %s: %m", i->name); /* log the code returned by read_etc_hostname(), not a possibly stale errno */ + } + + path = mfree(path); + + r = id128_get_machine(i->path, &machine_id); + if (r < 0) + log_debug_errno(r, "Failed to read machine ID in image %s, ignoring: %m", i->name); + + r = chase("/etc/machine-info", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to chase /etc/machine-info in image %s: %m", i->name); + else if (r >= 0) { + r = load_env_file_pairs(NULL, path, &machine_info); + if (r < 0) + log_debug_errno(r, "Failed to parse machine-info data of %s: %m", i->name); + } + + r = load_os_release_pairs(i->path, &os_release); +
if (r < 0) + log_debug_errno(r, "Failed to read os-release in image, ignoring: %m"); + + r = load_extension_release_pairs(i->path, IMAGE_SYSEXT, i->name, /* relax_extension_release_check= */ false, &sysext_release); + if (r < 0) + log_debug_errno(r, "Failed to read sysext-release in image, ignoring: %m"); + + r = load_extension_release_pairs(i->path, IMAGE_CONFEXT, i->name, /* relax_extension_release_check= */ false, &confext_release); + if (r < 0) + log_debug_errno(r, "Failed to read confext-release in image, ignoring: %m"); + + free_and_replace(i->hostname, hostname); + i->machine_id = machine_id; + strv_free_and_replace(i->machine_info, machine_info); + strv_free_and_replace(i->os_release, os_release); + strv_free_and_replace(i->sysext_release, sysext_release); + strv_free_and_replace(i->confext_release, confext_release); + break; + } + + case IMAGE_RAW: + case IMAGE_BLOCK: { + _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; + + r = loop_device_make_by_path(i->path, O_RDONLY, /* sector_size= */ UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &d); + if (r < 0) + return r; + + r = dissect_loop_device( + d, + /* verity= */ NULL, + /* mount_options= */ NULL, + image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_READ_ONLY | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES, + &m); + if (r < 0) + return r; + + r = dissected_image_acquire_metadata(m, + DISSECT_IMAGE_VALIDATE_OS | + DISSECT_IMAGE_VALIDATE_OS_EXT); + if (r < 0) + return r; + + free_and_replace(i->hostname, m->hostname); + i->machine_id = m->machine_id; + strv_free_and_replace(i->machine_info, m->machine_info); + strv_free_and_replace(i->os_release, m->os_release); + strv_free_and_replace(i->sysext_release, m->sysext_release); + strv_free_and_replace(i->confext_release, m->confext_release); + + break; + } + + default: 
+ return -EOPNOTSUPP; + } + + i->metadata_valid = true; + + return 0; +} + +int image_name_lock(const char *name, int operation, LockFile *ret) { + const char *p; + + assert(name); + assert(ret); + + /* Locks an image name, regardless of the precise path used. */ + + if (streq(name, ".host")) + return -EBUSY; + + if (!image_name_is_valid(name)) + return -EINVAL; + + if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) { + *ret = (LockFile) LOCK_FILE_INIT; + return 0; + } + + make_lock_dir(); + + p = strjoina("/run/systemd/nspawn/locks/name-", name); + return make_lock_file(p, operation, ret); +} + +bool image_in_search_path( + ImageClass class, + const char *root, + const char *image) { + + assert(image); + + NULSTR_FOREACH(path, pick_image_search_path(class)) { + const char *p, *q; + size_t k; + + if (!empty_or_root(root)) { + q = path_startswith(path, root); + if (!q) + continue; + } else + q = path; + + p = path_startswith(q, path); + if (!p) + continue; + + /* Make sure there's a filename following */ + k = strcspn(p, "/"); + if (k == 0) + continue; + + p += k; + + /* Accept trailing slashes */ + if (p[strspn(p, "/")] == 0) + return true; + } + + return false; +} + +int image_to_json(const struct Image *img, JsonVariant **ret) { + assert(img); + + return json_build(ret, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("Type", image_type_to_string(img->type)), + JSON_BUILD_PAIR_STRING("Class", image_class_to_string(img->class)), + JSON_BUILD_PAIR_STRING("Name", img->name), + JSON_BUILD_PAIR_CONDITION(img->path, "Path", JSON_BUILD_STRING(img->path)), + JSON_BUILD_PAIR_BOOLEAN("ReadOnly", img->read_only), + JSON_BUILD_PAIR_CONDITION(img->crtime != 0, "CreationTimestamp", JSON_BUILD_UNSIGNED(img->crtime)), + JSON_BUILD_PAIR_CONDITION(img->mtime != 0, "ModificationTimestamp", JSON_BUILD_UNSIGNED(img->mtime)), + JSON_BUILD_PAIR_CONDITION(img->usage != UINT64_MAX, "Usage", JSON_BUILD_UNSIGNED(img->usage)), + JSON_BUILD_PAIR_CONDITION(img->usage_exclusive != UINT64_MAX, 
"UsageExclusive", JSON_BUILD_UNSIGNED(img->usage_exclusive)), + JSON_BUILD_PAIR_CONDITION(img->limit != UINT64_MAX, "Limit", JSON_BUILD_UNSIGNED(img->limit)), + JSON_BUILD_PAIR_CONDITION(img->limit_exclusive != UINT64_MAX, "LimitExclusive", JSON_BUILD_UNSIGNED(img->limit_exclusive)))); +} + +static const char* const image_type_table[_IMAGE_TYPE_MAX] = { + [IMAGE_DIRECTORY] = "directory", + [IMAGE_SUBVOLUME] = "subvolume", + [IMAGE_RAW] = "raw", + [IMAGE_BLOCK] = "block", +}; + +DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType); diff --git a/src/shared/discover-image.h b/src/shared/discover-image.h new file mode 100644 index 0000000..a30a3d9 --- /dev/null +++ b/src/shared/discover-image.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-id128.h" + +#include "hashmap.h" +#include "image-policy.h" +#include "json.h" +#include "lock-util.h" +#include "macro.h" +#include "os-util.h" +#include "path-util.h" +#include "string-util.h" +#include "time-util.h" + +typedef enum ImageType { + IMAGE_DIRECTORY, + IMAGE_SUBVOLUME, + IMAGE_RAW, + IMAGE_BLOCK, + _IMAGE_TYPE_MAX, + _IMAGE_TYPE_INVALID = -EINVAL, +} ImageType; + +typedef struct Image { + unsigned n_ref; + + ImageType type; + ImageClass class; + char *name; + char *path; + bool read_only; + + usec_t crtime; + usec_t mtime; + + uint64_t usage; + uint64_t usage_exclusive; + uint64_t limit; + uint64_t limit_exclusive; + + char *hostname; + sd_id128_t machine_id; + char **machine_info; + char **os_release; + char **sysext_release; + char **confext_release; + + bool metadata_valid:1; + bool discoverable:1; /* true if we know for sure that image_find() would find the image given just the short name */ + + void *userdata; +} Image; + +Image *image_unref(Image *i); +Image *image_ref(Image *i); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Image*, image_unref); + +int image_find(ImageClass class, const char *root, const char *name, Image **ret); +int image_from_path(const 
char *path, Image **ret); +int image_find_harder(ImageClass class, const char *root, const char *name_or_path, Image **ret); +int image_discover(ImageClass class, const char *root, Hashmap *map); + +int image_remove(Image *i); +int image_rename(Image *i, const char *new_name); +int image_clone(Image *i, const char *new_name, bool read_only); +int image_read_only(Image *i, bool b); + +const char* image_type_to_string(ImageType t) _const_; +ImageType image_type_from_string(const char *s) _pure_; + +int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local); +int image_name_lock(const char *name, int operation, LockFile *ret); + +int image_set_limit(Image *i, uint64_t referenced_max); + +int image_read_metadata(Image *i, const ImagePolicy *image_policy); + +bool image_in_search_path(ImageClass class, const char *root, const char *image); + +/* Returns the extension release field matching class: sysext_release for IMAGE_SYSEXT, confext_release for IMAGE_CONFEXT, NULL for every other class. The returned pointer is the one stored in the image, no copy is made. */ +static inline char **image_extension_release(Image *image, ImageClass class) { + assert(image); + + if (class == IMAGE_SYSEXT) + return image->sysext_release; + if (class == IMAGE_CONFEXT) + return image->confext_release; + + return NULL; +} + +/* True if the image has a name and that name begins with a dot. */ +static inline bool IMAGE_IS_HIDDEN(const struct Image *i) { + assert(i); + + return i->name && i->name[0] == '.'; +} + +/* True if the image has a path and path_startswith() matches it against the /usr prefix. */ +static inline bool IMAGE_IS_VENDOR(const struct Image *i) { + assert(i); + + return i->path && path_startswith(i->path, "/usr"); +} + +/* True if the image is named .host or if path_equal() considers its path equal to the root directory. */ +static inline bool IMAGE_IS_HOST(const struct Image *i) { + assert(i); + + if (i->name && streq(i->name, ".host")) + return true; + + if (i->path && path_equal(i->path, "/")) + return true; + + return false; +} + +int image_to_json(const struct Image *i, JsonVariant **ret); + +extern const struct hash_ops image_hash_ops; diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c new file mode 100644 index 0000000..84cfbcd --- /dev/null +++ b/src/shared/dissect-image.c @@ -0,0 +1,4069 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_VALGRIND_MEMCHECK_H +#include +#endif + +#include 
+#include +#include +#include +#include +#include +#include + +#if HAVE_OPENSSL +#include +#include +#include +#endif + +#include "sd-device.h" +#include "sd-id128.h" + +#include "architecture.h" +#include "ask-password-api.h" +#include "blkid-util.h" +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "chase.h" +#include "conf-files.h" +#include "constants.h" +#include "copy.h" +#include "cryptsetup-util.h" +#include "device-nodes.h" +#include "device-util.h" +#include "devnum-util.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "dm-util.h" +#include "env-file.h" +#include "env-util.h" +#include "extension-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "fsck-util.h" +#include "gpt.h" +#include "hexdecoct.h" +#include "hostname-setup.h" +#include "id128-util.h" +#include "import-util.h" +#include "io-util.h" +#include "missing_mount.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "nulstr-util.h" +#include "openssl-util.h" +#include "os-util.h" +#include "path-util.h" +#include "process-util.h" +#include "raw-clone.h" +#include "resize-fs.h" +#include "signal-util.h" +#include "sparse-endian.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "udev-util.h" +#include "user-util.h" +#include "xattr-util.h" + +/* how many times to wait for the device nodes to appear */ +#define N_DEVICE_NODE_LIST_ATTEMPTS 10 + +/* Checks whether fstype is on the allowlist of file systems that automatic dissection may mount. Returns true or false, or -ENOMEM if splitting the environment variable fails. */ +int dissect_fstype_ok(const char *fstype) { + const char *e; + bool b; + + /* When we automatically mount file systems, be a bit conservative by default what we are willing to + * mount, just as an extra safety net to not mount with badly maintained legacy file system + * drivers. 
*/ + + e = secure_getenv("SYSTEMD_DISSECT_FILE_SYSTEMS"); + if (e) { /* if set, the colon-separated list from the environment replaces the built-in list entirely */ + _cleanup_strv_free_ char **l = NULL; + + l = strv_split(e, ":"); + if (!l) + return -ENOMEM; + + b = strv_contains(l, fstype); + } else + b = STR_IN_SET(fstype, + "btrfs", + "erofs", + "ext4", + "f2fs", + "squashfs", + "vfat", + "xfs"); + if (b) + return true; + + log_debug("File system type '%s' is not allowed to be mounted as result of automatic dissection.", fstype); + return false; +} + +/* Guesses the sector size of a disk image from the position of its GPT header. Returns 1 and stores the size in *ret if exactly one candidate matches, 0 and stores 512 if none matches or the read came up short, -ENOTUNIQ if several candidates match, or -errno if pread() fails. */ +int probe_sector_size(int fd, uint32_t *ret) { + + /* Disk images might be for 512B or for 4096 sector sizes, let's try to auto-detect that by searching + * for the GPT headers at the relevant byte offsets */ + + assert_cc(sizeof(GptHeader) == 92); + + /* We expect a sector size in the range 512…4096. The GPT header is located in the second + * sector. Hence it could be at byte 512 at the earliest, and at byte 4096 at the latest. And we must + * read with granularity of the largest sector size we care about. Which means 8K. */ + uint8_t sectors[2 * 4096]; + uint32_t found = 0; + ssize_t n; + + assert(fd >= 0); + assert(ret); + + n = pread(fd, sectors, sizeof(sectors), 0); + if (n < 0) + return -errno; + if (n != sizeof(sectors)) /* too short? 
*/ + goto not_found; + + /* Let's see if we find the GPT partition header with various expected sector sizes */ + for (uint32_t sz = 512; sz <= 4096; sz <<= 1) { /* candidates are the powers of two 512, 1024, 2048, 4096 */ + const GptHeader *p; + + assert(sizeof(sectors) >= sz * 2); + p = (const GptHeader*) (sectors + sz); /* start of the second sector for this candidate size */ + + if (!gpt_header_has_signature(p)) + continue; + + if (found != 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), + "Detected valid partition table at offsets matching multiple sector sizes, refusing."); + + found = sz; + } + + if (found != 0) { + log_debug("Determined sector size %" PRIu32 " based on discovered partition table.", found); + *ret = found; + return 1; /* indicate we *did* find it */ + } + +not_found: + log_debug("Couldn't find any partition table to derive sector size of."); + *ret = 512; /* pick the traditional default */ + return 0; /* indicate we didn't find it */ +} + +/* Returns -errno if fstat() fails, otherwise whatever blockdev_get_sector_size() (block devices) or probe_sector_size() (everything else) returns. */ +int probe_sector_size_prefer_ioctl(int fd, uint32_t *ret) { + struct stat st; + + assert(fd >= 0); + assert(ret); + + /* Just like probe_sector_size(), but if we are looking at a block device, will use the already + * configured sector size rather than probing by contents */ + + if (fstat(fd, &st) < 0) + return -errno; + + if (S_ISBLK(st.st_mode)) + return blockdev_get_sector_size(fd, ret); + + return probe_sector_size(fd, ret); +} + +/* At least one of fd and path has to be provided (asserted in the body); the other one is derived from it. offset and size select the range to probe: size 0 is reported as nothing found, size UINT64_MAX is mapped to the libblkid value for everything. On success returns 1 and a newly allocated string in *ret_fstype, or 0 and NULL if no type was detected. Without HAVE_BLKID it always returns -EOPNOTSUPP. */ +int probe_filesystem_full( + int fd, + const char *path, + uint64_t offset, + uint64_t size, + char **ret_fstype) { + + /* Try to find device content type and return it in *ret_fstype. If nothing is found, + * 0/NULL will be returned. -EUCLEAN will be returned for ambiguous results, and a + * different error otherwise. 
*/ + +#if HAVE_BLKID + _cleanup_(blkid_free_probep) blkid_probe b = NULL; + _cleanup_free_ char *path_by_fd = NULL; + _cleanup_close_ int fd_close = -EBADF; + const char *fstype; /* NOTE(review): left uninitialized; the if (fstype) check after blkid_probe_lookup_value() relies on that call always storing a value once the probe reported FOUND, confirm */ + int r; + + assert(fd >= 0 || path); + assert(ret_fstype); + + if (fd < 0) { /* no fd given, open path ourselves; fd_close keeps track of the descriptor we opened */ + fd_close = open(path, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd_close < 0) + return -errno; + + fd = fd_close; + } + + if (!path) { /* no path given, derive one from the fd; from here on path only appears in log messages */ + r = fd_get_path(fd, &path_by_fd); + if (r < 0) + return r; + + path = path_by_fd; + } + + if (size == 0) /* empty size? nothing found! */ + goto not_found; + + b = blkid_new_probe(); + if (!b) + return -ENOMEM; + + /* The Linux kernel maintains separate block device caches for main ("whole") and partition block + * devices, which means making a change to one might not be reflected immediately when reading via + * the other. That's massively confusing when mixing accesses to such devices. Let's address this in + * a limited way: when probing a file system that is not at the beginning of the block device we + * apparently probe a partition via the main block device, and in that case let's first flush the + * main block device cache, so that we get the data that the per-partition block device last + * sync'ed on. + * + * This only works under the assumption that any tools that write to the partition block devices + * issue a syncfs()/fsync() on the device after making changes. Typically file system formatting + * tools that write a superblock onto a partition block device do that, however. */ + if (offset != 0) + if (ioctl(fd, BLKFLSBUF, 0) < 0) + log_debug_errno(errno, "Failed to flush block device cache, ignoring: %m"); + + errno = 0; + r = blkid_probe_set_device( + b, + fd, + offset, + size == UINT64_MAX ? 0 : size); /* when blkid sees size=0 it understands "everything". 
We prefer using UINT64_MAX for that */ + if (r != 0) + return errno_or_else(ENOMEM); + + blkid_probe_enable_superblocks(b, 1); + blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE); + + errno = 0; + r = blkid_do_safeprobe(b); + if (r == _BLKID_SAFEPROBE_NOT_FOUND) + goto not_found; + if (r == _BLKID_SAFEPROBE_AMBIGUOUS) + return log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), + "Results ambiguous for partition %s", path); + if (r == _BLKID_SAFEPROBE_ERROR) + return log_debug_errno(errno_or_else(EIO), "Failed to probe partition %s: %m", path); + + assert(r == _BLKID_SAFEPROBE_FOUND); + + (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL); + + if (fstype) { + char *t; + + log_debug("Probed fstype '%s' on partition %s.", fstype, path); + + t = strdup(fstype); /* hand a copy to the caller */ + if (!t) + return -ENOMEM; + + *ret_fstype = t; + return 1; + } + +not_found: + log_debug("No type detected on partition %s", path); + *ret_fstype = NULL; + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +#if HAVE_BLKID +/* Returns true if a partition of this designator may be used, false if the policy asks to ignore it, -ERFKILL if the policy only permits it to be absent, or the negative value returned by image_policy_get_exhaustively(). */ +static int image_policy_may_use( + const ImagePolicy *policy, + PartitionDesignator designator) { + + PartitionPolicyFlags f; + + /* For each partition we find in the partition table do a first check if it may exist at all given + * the policy, or if it shall be ignored. */ + + f = image_policy_get_exhaustively(policy, designator); + if (f < 0) + return f; + + if ((f & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_ABSENT) + /* only flag set in policy is "absent"? then this partition may not exist at all */ + return log_debug_errno( + SYNTHETIC_ERRNO(ERFKILL), + "Partition of designator '%s' exists, but not allowed by policy, refusing.", + partition_designator_to_string(designator)); + if ((f & _PARTITION_POLICY_USE_MASK & ~PARTITION_POLICY_ABSENT) == PARTITION_POLICY_UNUSED) { + /* only "unused" or "unused" + "absent" are set? 
then don't use it */ + log_debug("Partition of designator '%s' exists, and policy dictates to ignore it, doing so.", + partition_designator_to_string(designator)); + return false; /* ignore! */ + } + + return true; /* use! */ +} + +/* Returns 0 if found_flags shares at least one bit with the policy flags of the designator and -ERFKILL if not. A negative found_flags or a negative policy lookup result is returned unchanged. */ +static int image_policy_check_protection( + const ImagePolicy *policy, + PartitionDesignator designator, + PartitionPolicyFlags found_flags) { + + PartitionPolicyFlags policy_flags; + + /* Checks if the flags in the policy for the designated partition overlap the flags of what we found */ + + if (found_flags < 0) + return found_flags; + + policy_flags = image_policy_get_exhaustively(policy, designator); + if (policy_flags < 0) + return policy_flags; + + if ((found_flags & policy_flags) == 0) { + _cleanup_free_ char *found_flags_string = NULL, *policy_flags_string = NULL; + + /* best effort: the strings are only used for the log message, strnull() covers a failed conversion */ + (void) partition_policy_flags_to_string(found_flags, /* simplify= */ true, &found_flags_string); + (void) partition_policy_flags_to_string(policy_flags, /* simplify= */ true, &policy_flags_string); + + return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s discovered with policy '%s' but '%s' was required, refusing.", + partition_designator_to_string(designator), + strnull(found_flags_string), strnull(policy_flags_string)); + } + + return 0; +} + +/* Returns 0 if the read-only and growfs bits of gpt_flags are acceptable under the policy, -ERFKILL if the policy demands the opposite state for one of them, or the negative policy lookup result. */ +static int image_policy_check_partition_flags( + const ImagePolicy *policy, + PartitionDesignator designator, + uint64_t gpt_flags) { + + PartitionPolicyFlags policy_flags; + bool b; + + /* Checks if the partition flags in the policy match reality */ + + policy_flags = image_policy_get_exhaustively(policy, designator); + if (policy_flags < 0) + return policy_flags; + + b = FLAGS_SET(gpt_flags, SD_GPT_FLAG_READ_ONLY); + if ((policy_flags & _PARTITION_POLICY_READ_ONLY_MASK) == (b ? 
PARTITION_POLICY_READ_ONLY_OFF : PARTITION_POLICY_READ_ONLY_ON)) /* matches only if the policy allows exclusively the opposite of the actual flag state */ + return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s has 'read-only' flag incorrectly set (must be %s, is %s), refusing.", + partition_designator_to_string(designator), + one_zero(!b), one_zero(b)); + + b = FLAGS_SET(gpt_flags, SD_GPT_FLAG_GROWFS); + if ((policy_flags & _PARTITION_POLICY_GROWFS_MASK) == (b ? PARTITION_POLICY_GROWFS_OFF : PARTITION_POLICY_GROWFS_ON)) + return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s has 'growfs' flag incorrectly set (must be %s, is %s), refusing.", + partition_designator_to_string(designator), + one_zero(!b), one_zero(b)); + + return 0; +} + +/* Walks all found partitions of m, probes the file system type where it is still unknown, records encryption, forces rw and growfs off where appropriate and re-checks each partition against the policy. Returns 0 on success or the first negative error encountered. */ +static int dissected_image_probe_filesystems( + DissectedImage *m, + int fd, + const ImagePolicy *policy) { + + int r; + + assert(m); + + /* Fill in file system types if we don't know them yet. */ + + for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) { + DissectedPartition *p = m->partitions + i; + PartitionPolicyFlags found_flags; + + if (!p->found) + continue; + + if (!p->fstype) { + /* If we have an fd referring to the partition block device, use that. Otherwise go + * via the whole block device or backing regular file, and read via offset. */ + if (p->mount_node_fd >= 0) + r = probe_filesystem_full(p->mount_node_fd, p->node, 0, UINT64_MAX, &p->fstype); + else + r = probe_filesystem_full(fd, p->node, p->offset, p->size, &p->fstype); + if (r < 0) + return r; + } + + if (streq_ptr(p->fstype, "crypto_LUKS")) { + m->encrypted = true; + found_flags = PARTITION_POLICY_ENCRYPTED; /* found this one, and it's definitely encrypted */ + } else + /* found it, but it's definitely not encrypted, hence mask the encrypted flag, but + * set all other ways that indicate "present". 
*/ + found_flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED; + + if (p->fstype && fstype_is_ro(p->fstype)) + p->rw = false; + + if (!p->rw) + p->growfs = false; + + /* We might have learnt more about the file system now (i.e. whether it is encrypted or not), + * hence we need to validate this against policy again, to see if the policy still matches + * with this new information. Note that image_policy_check_protection() will check for + * overlap between what's allowed in the policy and what we pass as 'found_flags' here. In + * the unencrypted case we thus might pass an overly unspecific mask here (i.e. unprotected + * OR verity OR signed), but that's fine since the earlier policy check already checked more + * specifically which of those three cases were OK. Keep in mind that this function here only + * looks at specific partitions (and thus can only deduce encryption or not) but not the + * overall partition table (and thus cannot deduce verity or not). The earlier dissection + * checks already did the relevant checks that look at the whole partition table, and + * enforced policy there as needed. 
*/ + r = image_policy_check_protection(policy, i, found_flags); + if (r < 0) + return r; + } + + return 0; +} + +/* Emits a debug log line for every bit set in pflags that is neither listed in supported nor one of the three generic flags masked below. Nothing is rejected here. */ +static void check_partition_flags( + const char *node, + unsigned long long pflags, + unsigned long long supported) { + + assert(node); + + /* Mask away all flags supported by this partition's type and the three flags the UEFI spec defines generically */ + pflags &= ~(supported | + SD_GPT_FLAG_REQUIRED_PARTITION | + SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL | + SD_GPT_FLAG_LEGACY_BIOS_BOOTABLE); + + if (pflags == 0) + return; + + /* If there are other bits set, then log about it, to make things discoverable */ + for (unsigned i = 0; i < sizeof(pflags) * 8; i++) { + unsigned long long bit = 1ULL << i; + if (!FLAGS_SET(pflags, bit)) + continue; + + log_debug("Unexpected partition flag %llu set on %s!", bit, node); + } +} + +/* Allocates a DissectedImage whose partitions are all set to DISSECTED_PARTITION_NULL. If path is given, image_name is derived from its file name via raw_strip_suffixes(); a name that fails image_name_is_valid() is dropped instead of failing the call. Returns 0 and the object in *ret, or a negative error. */ +static int dissected_image_new(const char *path, DissectedImage **ret) { + _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; + _cleanup_free_ char *name = NULL; + int r; + + assert(ret); + + if (path) { + _cleanup_free_ char *filename = NULL; + + r = path_extract_filename(path, &filename); + if (r < 0) + return r; + + r = raw_strip_suffixes(filename, &name); + if (r < 0) + return r; + + if (!image_name_is_valid(name)) { + log_debug("Image name %s is not valid, ignoring.", strna(name)); + name = mfree(name); + } + } + + m = new(DissectedImage, 1); + if (!m) + return -ENOMEM; + + *m = (DissectedImage) { + .has_init_system = -1, + .image_name = TAKE_PTR(name), + }; + + for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) + m->partitions[i] = DISSECTED_PARTITION_NULL; + + *ret = TAKE_PTR(m); + return 0; +} +#endif + +/* Releases the strings and file descriptors referenced by p and resets it to DISSECTED_PARTITION_NULL. */ +static void dissected_partition_done(DissectedPartition *p) { + assert(p); + + free(p->fstype); + free(p->node); + free(p->label); + free(p->decrypted_fstype); + free(p->decrypted_node); + free(p->mount_options); + safe_close(p->mount_node_fd); + safe_close(p->fsmount_fd); + + *p = DISSECTED_PARTITION_NULL; +} + 
+#if HAVE_BLKID +/* Generates the device node path for partition nr of whole_devname; a negative nr designates the whole disk. The by-diskseq form is used only if DISSECT_IMAGE_DISKSEQ_DEVNODE is set and diskseq is non-zero. Returns 0 and a newly allocated string in *ret, -EINVAL for an empty whole_devname, or -ENOMEM. */ +static int make_partition_devname( + const char *whole_devname, + uint64_t diskseq, + int nr, + DissectImageFlags flags, + char **ret) { + + _cleanup_free_ char *s = NULL; + int r; + + assert(whole_devname); + assert(nr != 0); /* zero is not a valid partition nr */ + assert(ret); + + if (!FLAGS_SET(flags, DISSECT_IMAGE_DISKSEQ_DEVNODE) || diskseq == 0) { + + /* Given a whole block device node name (e.g. /dev/sda or /dev/loop7) generate a partition + * device name (e.g. /dev/sda7 or /dev/loop7p5). The rule the kernel uses is simple: if whole + * block device node name ends in a digit, then suffix a 'p', followed by the partition + * number. Otherwise, just suffix the partition number without any 'p'. */ + + if (nr < 0) { /* whole disk? */ + s = strdup(whole_devname); + if (!s) + return -ENOMEM; + } else { + size_t l = strlen(whole_devname); + if (l < 1) /* underflow check for the subtraction below */ + return -EINVAL; + + bool need_p = ascii_isdigit(whole_devname[l-1]); /* Last char a digit? */ + + if (asprintf(&s, "%s%s%i", whole_devname, need_p ? "p" : "", nr) < 0) + return -ENOMEM; + } + } else { + if (nr < 0) /* whole disk? */ + r = asprintf(&s, "/dev/disk/by-diskseq/%" PRIu64, diskseq); + else + r = asprintf(&s, "/dev/disk/by-diskseq/%" PRIu64 "-part%i", diskseq, nr); + if (r < 0) + return -ENOMEM; + } + + *ret = TAKE_PTR(s); + return 0; +} + +/* Opens node and verifies that it belongs to loop: the device number has to match and, if loop carries a non-zero diskseq, that has to match as well. Returns the open file descriptor, -ENXIO on a mismatch, or another negative error. */ +static int open_partition( + const char *node, + bool is_partition, + const LoopDevice *loop) { + + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_close_ int fd = -EBADF; + dev_t devnum; + int r; + + assert(node); + assert(loop); + + fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return -errno; + + /* Check if the block device is a child of (or equivalent to) the originally provided one. */ + r = block_device_new_from_fd(fd, is_partition ? 
BLOCK_DEVICE_LOOKUP_WHOLE_DISK : 0, &dev); + if (r < 0) + return r; + + r = sd_device_get_devnum(dev, &devnum); + if (r < 0) + return r; + + if (loop->devno != devnum) /* not the device we were asked to dissect */ + return -ENXIO; + + /* Also check diskseq. */ + if (loop->diskseq != 0) { + uint64_t diskseq; + + r = fd_get_diskseq(fd, &diskseq); + if (r < 0) + return r; + + if (loop->diskseq != diskseq) + return -ENXIO; + } + + log_debug("Opened %s (fd=%i, whole_block_devnum=" DEVNUM_FORMAT_STR ", diskseq=%" PRIu64 ").", + node, fd, DEVNUM_FORMAT_VAL(loop->devno), loop->diskseq); + return TAKE_FD(fd); +} + +/* Ranks two architectures: 0 if they are equal or neither is preferred, 1 if a is the native architecture, -1 if b is; if ARCHITECTURE_SECONDARY is defined it is ranked the same way right after the native one. */ +static int compare_arch(Architecture a, Architecture b) { + if (a == b) + return 0; + + if (a == native_architecture()) + return 1; + + if (b == native_architecture()) + return -1; + +#ifdef ARCHITECTURE_SECONDARY + if (a == ARCHITECTURE_SECONDARY) + return 1; + + if (b == ARCHITECTURE_SECONDARY) + return -1; +#endif + + return 0; +} + +static int dissect_image( + DissectedImage *m, + int fd, + const char *devname, + const VeritySettings *verity, + const MountOptions *mount_options, + const ImagePolicy *policy, + DissectImageFlags flags) { + + sd_id128_t root_uuid = SD_ID128_NULL, root_verity_uuid = SD_ID128_NULL; + sd_id128_t usr_uuid = SD_ID128_NULL, usr_verity_uuid = SD_ID128_NULL; + bool is_gpt, is_mbr, multiple_generic = false, + generic_rw = false, /* initialize to appease gcc */ + generic_growfs = false; + _cleanup_(blkid_free_probep) blkid_probe b = NULL; + _cleanup_free_ char *generic_node = NULL; + sd_id128_t generic_uuid = SD_ID128_NULL; + const char *pttype = NULL, *sptuuid = NULL; + blkid_partlist pl; + int r, generic_nr = -1, n_partitions; + + assert(m); + assert(fd >= 0); + assert(devname); + assert(!verity || verity->designator < 0 || IN_SET(verity->designator, PARTITION_ROOT, PARTITION_USR)); + assert(!verity || verity->root_hash || verity->root_hash_size == 0); + assert(!verity || verity->root_hash_sig || verity->root_hash_sig_size == 0); + assert(!verity || (verity->root_hash || 
!verity->root_hash_sig)); + assert(!((flags & DISSECT_IMAGE_GPT_ONLY) && (flags & DISSECT_IMAGE_NO_PARTITION_TABLE))); + assert(m->sector_size > 0); + + /* Probes a disk image, and returns information about what it found in *ret. + * + * Returns -ENOPKG if no suitable partition table or file system could be found. + * Returns -EADDRNOTAVAIL if a root hash was specified but no matching root/verity partitions found. + * Returns -ENXIO if we couldn't find any partition suitable as root or /usr partition + * Returns -ENOTUNIQ if we only found multiple generic partitions and thus don't know what to do with that + * Returns -ERFKILL if image doesn't match image policy + * Returns -EBADR if verity data was provided externally for an image that has a GPT partition table (i.e. is not just a naked fs) + * Returns -EPROTONOSUPPORT if DISSECT_IMAGE_ADD_PARTITION_DEVICES is set but the block device does not have partition logic enabled + * Returns -ENOMSG if we didn't find a single usable partition (and DISSECT_IMAGE_REFUSE_EMPTY is set) */ + + uint64_t diskseq = m->loop ? m->loop->diskseq : 0; + + if (verity && verity->root_hash) { + sd_id128_t fsuuid, vuuid; + + /* If a root hash is supplied, then we use the root partition that has a UUID that match the + * first 128-bit of the root hash. And we use the verity partition that has a UUID that match + * the final 128-bit. */ + + if (verity->root_hash_size < sizeof(sd_id128_t)) + return -EINVAL; + + memcpy(&fsuuid, verity->root_hash, sizeof(sd_id128_t)); + memcpy(&vuuid, (const uint8_t*) verity->root_hash + verity->root_hash_size - sizeof(sd_id128_t), sizeof(sd_id128_t)); + + if (sd_id128_is_null(fsuuid)) + return -EINVAL; + if (sd_id128_is_null(vuuid)) + return -EINVAL; + + /* If the verity data declares it's for the /usr partition, then search for that, in all + * other cases assume it's for the root partition. 
*/ + if (verity->designator == PARTITION_USR) { + usr_uuid = fsuuid; + usr_verity_uuid = vuuid; + } else { + root_uuid = fsuuid; + root_verity_uuid = vuuid; + } + } + + b = blkid_new_probe(); + if (!b) + return -ENOMEM; + + errno = 0; + r = blkid_probe_set_device(b, fd, 0, 0); + if (r != 0) + return errno_or_else(ENOMEM); + + errno = 0; + r = blkid_probe_set_sectorsize(b, m->sector_size); + if (r != 0) + return errno_or_else(EIO); + + if ((flags & DISSECT_IMAGE_GPT_ONLY) == 0) { + /* Look for file system superblocks, unless we only shall look for GPT partition tables */ + blkid_probe_enable_superblocks(b, 1); + blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE|BLKID_SUBLKS_USAGE|BLKID_SUBLKS_UUID); + } + + blkid_probe_enable_partitions(b, 1); + blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS); + + errno = 0; + r = blkid_do_safeprobe(b); + if (r == _BLKID_SAFEPROBE_ERROR) + return errno_or_else(EIO); + if (IN_SET(r, _BLKID_SAFEPROBE_AMBIGUOUS, _BLKID_SAFEPROBE_NOT_FOUND)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOPKG), "Failed to identify any partition table."); + + assert(r == _BLKID_SAFEPROBE_FOUND); + + if ((!(flags & DISSECT_IMAGE_GPT_ONLY) && + (flags & DISSECT_IMAGE_GENERIC_ROOT)) || + (flags & DISSECT_IMAGE_NO_PARTITION_TABLE)) { + const char *usage = NULL; + + /* If flags permit this, also allow using non-partitioned single-filesystem images */ + + (void) blkid_probe_lookup_value(b, "USAGE", &usage, NULL); + if (STRPTR_IN_SET(usage, "filesystem", "crypto")) { + _cleanup_free_ char *t = NULL, *n = NULL, *o = NULL; + const char *fstype = NULL, *options = NULL, *suuid = NULL; + _cleanup_close_ int mount_node_fd = -EBADF; + sd_id128_t uuid = SD_ID128_NULL; + PartitionPolicyFlags found_flags; + bool encrypted; + + /* OK, we have found a file system, that's our root partition then. 
*/ + + r = image_policy_may_use(policy, PARTITION_ROOT); + if (r < 0) + return r; + if (r == 0) /* policy says ignore this, so we ignore it */ + return -ENOPKG; + + (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL); + (void) blkid_probe_lookup_value(b, "UUID", &suuid, NULL); + + encrypted = streq_ptr(fstype, "crypto_LUKS"); + + if (verity_settings_data_covers(verity, PARTITION_ROOT)) + found_flags = verity->root_hash_sig ? PARTITION_POLICY_SIGNED : PARTITION_POLICY_VERITY; + else + found_flags = encrypted ? PARTITION_POLICY_ENCRYPTED : PARTITION_POLICY_UNPROTECTED; + + r = image_policy_check_protection(policy, PARTITION_ROOT, found_flags); + if (r < 0) + return r; + + r = image_policy_check_partition_flags(policy, PARTITION_ROOT, 0); /* we have no gpt partition flags, hence check against all bits off */ + if (r < 0) + return r; + + if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) { + mount_node_fd = open_partition(devname, /* is_partition = */ false, m->loop); + if (mount_node_fd < 0) + return mount_node_fd; + } + + if (fstype) { + t = strdup(fstype); + if (!t) + return -ENOMEM; + } + + if (suuid) { + /* blkid will return FAT's serial number as UUID, hence it is quite possible + * that parsing this will fail. We'll ignore the ID, since it's just too + * short to be useful as tru identifier. 
*/ + r = sd_id128_from_string(suuid, &uuid); + if (r < 0) + log_debug_errno(r, "Failed to parse file system UUID '%s', ignoring: %m", suuid); + } + + r = make_partition_devname(devname, diskseq, -1, flags, &n); + if (r < 0) + return r; + + m->single_file_system = true; + m->encrypted = encrypted; + + m->has_verity = verity && verity->data_path; + m->verity_ready = verity_settings_data_covers(verity, PARTITION_ROOT); + + m->has_verity_sig = false; /* signature not embedded, must be specified */ + m->verity_sig_ready = m->verity_ready && verity->root_hash_sig; + + m->image_uuid = uuid; + + options = mount_options_from_designator(mount_options, PARTITION_ROOT); + if (options) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + m->partitions[PARTITION_ROOT] = (DissectedPartition) { + .found = true, + .rw = !m->verity_ready && !fstype_is_ro(fstype), + .partno = -1, + .architecture = _ARCHITECTURE_INVALID, + .fstype = TAKE_PTR(t), + .node = TAKE_PTR(n), + .mount_options = TAKE_PTR(o), + .mount_node_fd = TAKE_FD(mount_node_fd), + .offset = 0, + .size = UINT64_MAX, + .fsmount_fd = -EBADF, + }; + + return 0; + } + } + + (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL); + if (!pttype) + return -ENOPKG; + + is_gpt = streq_ptr(pttype, "gpt"); + is_mbr = streq_ptr(pttype, "dos"); + + if (!is_gpt && ((flags & DISSECT_IMAGE_GPT_ONLY) || !is_mbr)) + return -ENOPKG; + + /* We support external verity data partitions only if the image has no partition table */ + if (verity && verity->data_path) + return -EBADR; + + if (FLAGS_SET(flags, DISSECT_IMAGE_ADD_PARTITION_DEVICES)) { + /* Safety check: refuse block devices that carry a partition table but for which the kernel doesn't + * do partition scanning. 
*/ + r = blockdev_partscan_enabled(fd); + if (r < 0) + return r; + if (r == 0) + return -EPROTONOSUPPORT; + } + + (void) blkid_probe_lookup_value(b, "PTUUID", &sptuuid, NULL); + if (sptuuid) { + r = sd_id128_from_string(sptuuid, &m->image_uuid); + if (r < 0) + log_debug_errno(r, "Failed to parse partition table UUID '%s', ignoring: %m", sptuuid); + } + + errno = 0; + pl = blkid_probe_get_partitions(b); + if (!pl) + return errno_or_else(ENOMEM); + + errno = 0; + n_partitions = blkid_partlist_numof_partitions(pl); + if (n_partitions < 0) + return errno_or_else(EIO); + + for (int i = 0; i < n_partitions; i++) { + _cleanup_free_ char *node = NULL; + unsigned long long pflags; + blkid_loff_t start, size; + blkid_partition pp; + int nr; + + errno = 0; + pp = blkid_partlist_get_partition(pl, i); + if (!pp) + return errno_or_else(EIO); + + pflags = blkid_partition_get_flags(pp); + + errno = 0; + nr = blkid_partition_get_partno(pp); + if (nr < 0) + return errno_or_else(EIO); + + errno = 0; + start = blkid_partition_get_start(pp); + if (start < 0) + return errno_or_else(EIO); + + assert((uint64_t) start < UINT64_MAX/512); + + errno = 0; + size = blkid_partition_get_size(pp); + if (size < 0) + return errno_or_else(EIO); + + assert((uint64_t) size < UINT64_MAX/512); + + /* While probing we need the non-diskseq device node name to access the thing, hence mask off + * DISSECT_IMAGE_DISKSEQ_DEVNODE. */ + r = make_partition_devname(devname, diskseq, nr, flags & ~DISSECT_IMAGE_DISKSEQ_DEVNODE, &node); + if (r < 0) + return r; + + /* So here's the thing: after the main ("whole") block device popped up it might take a while + * before the kernel fully probed the partition table. Waiting for that to finish is icky in + * userspace. So here's what we do instead. We issue the BLKPG_ADD_PARTITION ioctl to add the + * partition ourselves, racing against the kernel. 
Good thing is: if this call fails with + * EBUSY then the kernel was quicker than us, and that's totally OK, the outcome is good for + * us: the device node will exist. If OTOH our call was successful we won the race. Which is + * also good as the outcome is the same: the partition block device exists, and we can use + * it. + * + * Kernel returns EBUSY if there's already a partition by that number or an overlapping + * partition already existent. */ + + if (FLAGS_SET(flags, DISSECT_IMAGE_ADD_PARTITION_DEVICES)) { + r = block_device_add_partition(fd, node, nr, (uint64_t) start * 512, (uint64_t) size * 512); + if (r < 0) { + if (r != -EBUSY) + return log_debug_errno(r, "BLKPG_ADD_PARTITION failed: %m"); + + log_debug_errno(r, "Kernel was quicker than us in adding partition %i.", nr); + } else + log_debug("We were quicker than kernel in adding partition %i.", nr); + } + + if (is_gpt) { + const char *fstype = NULL, *label; + sd_id128_t type_id, id; + GptPartitionType type; + bool rw = true, growfs = false; + + r = blkid_partition_get_uuid_id128(pp, &id); + if (r < 0) { + log_debug_errno(r, "Failed to read partition UUID, ignoring: %m"); + continue; + } + + r = blkid_partition_get_type_id128(pp, &type_id); + if (r < 0) { + log_debug_errno(r, "Failed to read partition type UUID, ignoring: %m"); + continue; + } + + type = gpt_partition_type_from_uuid(type_id); + + label = blkid_partition_get_name(pp); /* libblkid returns NULL here if empty */ + + if (IN_SET(type.designator, + PARTITION_HOME, + PARTITION_SRV, + PARTITION_XBOOTLDR, + PARTITION_TMP)) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + rw = !(pflags & SD_GPT_FLAG_READ_ONLY); + growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS); + + } else if (type.designator == PARTITION_ESP) { + + /* Note that we don't check the SD_GPT_FLAG_NO_AUTO flag for the ESP, as it is + * not defined there. 
We instead check the SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL, as + * recommended by the UEFI spec (See "12.3.3 Number and Location of System + * Partitions"). */ + + if (pflags & SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL) + continue; + + fstype = "vfat"; + + } else if (type.designator == PARTITION_ROOT) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + /* If a root ID is specified, ignore everything but the root id */ + if (!sd_id128_is_null(root_uuid) && !sd_id128_equal(root_uuid, id)) + continue; + + rw = !(pflags & SD_GPT_FLAG_READ_ONLY); + growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS); + + } else if (type.designator == PARTITION_ROOT_VERITY) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + m->has_verity = true; + + /* If no verity configuration is specified, then don't do verity */ + if (!verity) + continue; + if (verity->designator >= 0 && verity->designator != PARTITION_ROOT) + continue; + + /* If root hash is specified, then ignore everything but the root id */ + if (!sd_id128_is_null(root_verity_uuid) && !sd_id128_equal(root_verity_uuid, id)) + continue; + + fstype = "DM_verity_hash"; + rw = false; + + } else if (type.designator == PARTITION_ROOT_VERITY_SIG) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + m->has_verity_sig = true; + + if (!verity) + continue; + if (verity->designator >= 0 && verity->designator != PARTITION_ROOT) + continue; + + fstype = "verity_hash_signature"; + rw = false; + + } else if (type.designator == PARTITION_USR) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + /* If a usr ID is specified, ignore everything but the usr id */ + if 
(!sd_id128_is_null(usr_uuid) && !sd_id128_equal(usr_uuid, id)) + continue; + + rw = !(pflags & SD_GPT_FLAG_READ_ONLY); + growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS); + + } else if (type.designator == PARTITION_USR_VERITY) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + m->has_verity = true; + + if (!verity) + continue; + if (verity->designator >= 0 && verity->designator != PARTITION_USR) + continue; + + /* If usr hash is specified, then ignore everything but the usr id */ + if (!sd_id128_is_null(usr_verity_uuid) && !sd_id128_equal(usr_verity_uuid, id)) + continue; + + fstype = "DM_verity_hash"; + rw = false; + + } else if (type.designator == PARTITION_USR_VERITY_SIG) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + m->has_verity_sig = true; + + if (!verity) + continue; + if (verity->designator >= 0 && verity->designator != PARTITION_USR) + continue; + + fstype = "verity_hash_signature"; + rw = false; + + } else if (type.designator == PARTITION_SWAP) { + + check_partition_flags(node, pflags, SD_GPT_FLAG_NO_AUTO); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + /* Note: we don't set fstype = "swap" here, because we still need to probe if + * it might be encrypted (i.e. fstype "crypto_LUKS") or unencrypted + * (i.e. fstype "swap"), and the only way to figure that out is via fstype + * probing. */ + + /* We don't have a designator for SD_GPT_LINUX_GENERIC so check the UUID instead.
*/ + } else if (sd_id128_equal(type.uuid, SD_GPT_LINUX_GENERIC)) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + if (generic_node) + multiple_generic = true; + else { + generic_nr = nr; + generic_rw = !(pflags & SD_GPT_FLAG_READ_ONLY); + generic_growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS); + generic_uuid = id; + generic_node = TAKE_PTR(node); + } + + } else if (type.designator == PARTITION_VAR) { + + check_partition_flags(node, pflags, + SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS); + + if (pflags & SD_GPT_FLAG_NO_AUTO) + continue; + + if (!FLAGS_SET(flags, DISSECT_IMAGE_RELAX_VAR_CHECK)) { + sd_id128_t var_uuid; + + /* For /var we insist that the uuid of the partition matches the + * HMAC-SHA256 of the /var GPT partition type uuid, keyed by machine + * ID. Why? Unlike the other partitions /var is inherently + * installation specific, hence we need to be careful not to mount it + * in the wrong installation. By hashing the partition UUID from + * /etc/machine-id we can securely bind the partition to the + * installation. 
*/ + + r = sd_id128_get_machine_app_specific(SD_GPT_VAR, &var_uuid); + if (r < 0) + return r; + + if (!sd_id128_equal(var_uuid, id)) { + log_debug("Found a /var/ partition, but its UUID didn't match our expectations " + "(found: " SD_ID128_UUID_FORMAT_STR ", expected: " SD_ID128_UUID_FORMAT_STR "), ignoring.", + SD_ID128_FORMAT_VAL(id), SD_ID128_FORMAT_VAL(var_uuid)); + continue; + } + } + + rw = !(pflags & SD_GPT_FLAG_READ_ONLY); + growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS); + } + + if (type.designator != _PARTITION_DESIGNATOR_INVALID) { + _cleanup_free_ char *t = NULL, *o = NULL, *l = NULL, *n = NULL; + _cleanup_close_ int mount_node_fd = -EBADF; + const char *options = NULL; + + r = image_policy_may_use(policy, type.designator); + if (r < 0) + return r; + if (r == 0) { + /* Policy says: ignore; Remember this fact, so that we later can distinguish between "found but ignored" and "not found at all" */ + + if (!m->partitions[type.designator].found) + m->partitions[type.designator].ignored = true; + + continue; + } + + if (m->partitions[type.designator].found) { + int c; + + /* For most partition types the first one we see wins. Except for the + * rootfs and /usr, where we do a version compare of the label, and + * let the newest version win. This permits a simple A/B versioning + * scheme in OS images. */ + + c = compare_arch(type.arch, m->partitions[type.designator].architecture); + if (c < 0) /* the arch we already found is better than the one we found now */ + continue; + if (c == 0 && /* same arch? 
then go by version in label */ + (!partition_designator_is_versioned(type.designator) || + strverscmp_improved(label, m->partitions[type.designator].label) <= 0)) + continue; + + dissected_partition_done(m->partitions + type.designator); + } + + if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES) && + type.designator != PARTITION_SWAP) { + mount_node_fd = open_partition(node, /* is_partition = */ true, m->loop); + if (mount_node_fd < 0) + return mount_node_fd; + } + + r = make_partition_devname(devname, diskseq, nr, flags, &n); + if (r < 0) + return r; + + if (fstype) { + t = strdup(fstype); + if (!t) + return -ENOMEM; + } + + if (label) { + l = strdup(label); + if (!l) + return -ENOMEM; + } + + options = mount_options_from_designator(mount_options, type.designator); + if (options) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + m->partitions[type.designator] = (DissectedPartition) { + .found = true, + .partno = nr, + .rw = rw, + .growfs = growfs, + .architecture = type.arch, + .node = TAKE_PTR(n), + .fstype = TAKE_PTR(t), + .label = TAKE_PTR(l), + .uuid = id, + .mount_options = TAKE_PTR(o), + .mount_node_fd = TAKE_FD(mount_node_fd), + .offset = (uint64_t) start * 512, + .size = (uint64_t) size * 512, + .gpt_flags = pflags, + .fsmount_fd = -EBADF, + }; + } + + } else if (is_mbr) { + + switch (blkid_partition_get_type(pp)) { + + case 0x83: /* Linux partition */ + + if (pflags != 0x80) /* Bootable flag */ + continue; + + if (generic_node) + multiple_generic = true; + else { + generic_nr = nr; + generic_rw = true; + generic_growfs = false; + generic_node = TAKE_PTR(node); + } + + break; + + case 0xEA: { /* Boot Loader Spec extended $BOOT partition */ + _cleanup_close_ int mount_node_fd = -EBADF; + _cleanup_free_ char *o = NULL, *n = NULL; + sd_id128_t id = SD_ID128_NULL; + const char *options = NULL; + + r = image_policy_may_use(policy, PARTITION_XBOOTLDR); + if (r < 0) + return r; + if (r == 0) { /* policy says: ignore */ + if 
(!m->partitions[PARTITION_XBOOTLDR].found) + m->partitions[PARTITION_XBOOTLDR].ignored = true; + + continue; + } + + /* First one wins */ + if (m->partitions[PARTITION_XBOOTLDR].found) + continue; + + if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) { + mount_node_fd = open_partition(node, /* is_partition = */ true, m->loop); + if (mount_node_fd < 0) + return mount_node_fd; + } + + (void) blkid_partition_get_uuid_id128(pp, &id); + + r = make_partition_devname(devname, diskseq, nr, flags, &n); + if (r < 0) + return r; + + options = mount_options_from_designator(mount_options, PARTITION_XBOOTLDR); + if (options) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + m->partitions[PARTITION_XBOOTLDR] = (DissectedPartition) { + .found = true, + .partno = nr, + .rw = true, + .growfs = false, + .architecture = _ARCHITECTURE_INVALID, + .node = TAKE_PTR(n), + .uuid = id, + .mount_options = TAKE_PTR(o), + .mount_node_fd = TAKE_FD(mount_node_fd), + .offset = (uint64_t) start * 512, + .size = (uint64_t) size * 512, + .fsmount_fd = -EBADF, + }; + + break; + }} + } + } + + if (!m->partitions[PARTITION_ROOT].found && + (m->partitions[PARTITION_ROOT_VERITY].found || + m->partitions[PARTITION_ROOT_VERITY_SIG].found)) + return -EADDRNOTAVAIL; /* Verity found but no matching rootfs? Something is off, refuse. */ + + /* Hmm, we found a signature partition but no Verity data? Something is off. 
*/ + if (m->partitions[PARTITION_ROOT_VERITY_SIG].found && !m->partitions[PARTITION_ROOT_VERITY].found) + return -EADDRNOTAVAIL; + + if (!m->partitions[PARTITION_USR].found && + (m->partitions[PARTITION_USR_VERITY].found || + m->partitions[PARTITION_USR_VERITY_SIG].found)) + return -EADDRNOTAVAIL; /* as above */ + + /* as above */ + if (m->partitions[PARTITION_USR_VERITY_SIG].found && !m->partitions[PARTITION_USR_VERITY].found) + return -EADDRNOTAVAIL; + + /* If root and /usr are combined then insist that the architecture matches */ + if (m->partitions[PARTITION_ROOT].found && + m->partitions[PARTITION_USR].found && + (m->partitions[PARTITION_ROOT].architecture >= 0 && + m->partitions[PARTITION_USR].architecture >= 0 && + m->partitions[PARTITION_ROOT].architecture != m->partitions[PARTITION_USR].architecture)) + return -EADDRNOTAVAIL; + + if (!m->partitions[PARTITION_ROOT].found && + !m->partitions[PARTITION_USR].found && + (flags & DISSECT_IMAGE_GENERIC_ROOT) && + (!verity || !verity->root_hash || verity->designator != PARTITION_USR)) { + + /* OK, we found nothing usable, then check if there's a single generic partition, and use + * that. If the root hash was set however, then we won't fall back to a generic node, because + * the root hash decides. */ + + /* If we didn't find a properly marked root partition, but we did find a single suitable + * generic Linux partition, then use this as root partition, if the caller asked for it. 
*/ + if (multiple_generic) + return -ENOTUNIQ; + + /* If we didn't find a generic node, then we can't fix this up either */ + if (generic_node) { + r = image_policy_may_use(policy, PARTITION_ROOT); + if (r < 0) + return r; + if (r == 0) + /* Policy says: ignore; remember that we did */ + m->partitions[PARTITION_ROOT].ignored = true; + else { + _cleanup_close_ int mount_node_fd = -EBADF; + _cleanup_free_ char *o = NULL, *n = NULL; + const char *options; + + if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) { + mount_node_fd = open_partition(generic_node, /* is_partition = */ true, m->loop); + if (mount_node_fd < 0) + return mount_node_fd; + } + + r = make_partition_devname(devname, diskseq, generic_nr, flags, &n); + if (r < 0) + return r; + + options = mount_options_from_designator(mount_options, PARTITION_ROOT); + if (options) { + o = strdup(options); + if (!o) + return -ENOMEM; + } + + assert(generic_nr >= 0); + m->partitions[PARTITION_ROOT] = (DissectedPartition) { + .found = true, + .rw = generic_rw, + .growfs = generic_growfs, + .partno = generic_nr, + .architecture = _ARCHITECTURE_INVALID, + .node = TAKE_PTR(n), + .uuid = generic_uuid, + .mount_options = TAKE_PTR(o), + .mount_node_fd = TAKE_FD(mount_node_fd), + .offset = UINT64_MAX, + .size = UINT64_MAX, + .fsmount_fd = -EBADF, + }; + } + } + } + + /* Check if we have a root fs if we are told to do check. /usr alone is fine too, but only if appropriate flag for that is set too */ + if (FLAGS_SET(flags, DISSECT_IMAGE_REQUIRE_ROOT) && + !(m->partitions[PARTITION_ROOT].found || (m->partitions[PARTITION_USR].found && FLAGS_SET(flags, DISSECT_IMAGE_USR_NO_ROOT)))) + return -ENXIO; + + if (m->partitions[PARTITION_ROOT_VERITY].found) { + /* We only support one verity partition per image, i.e. can't do for both /usr and root fs */ + if (m->partitions[PARTITION_USR_VERITY].found) + return -ENOTUNIQ; + + /* We don't support verity enabled root with a split out /usr. Neither with nor without + * verity there. 
(Note that we do support verity-less root with verity-full /usr, though.) */ + if (m->partitions[PARTITION_USR].found) + return -EADDRNOTAVAIL; + } + + if (verity) { + /* If a verity designator is specified, then insist that the matching partition exists */ + if (verity->designator >= 0 && !m->partitions[verity->designator].found) + return -EADDRNOTAVAIL; + + bool have_verity_sig_partition; + if (verity->designator >= 0) + have_verity_sig_partition = m->partitions[verity->designator == PARTITION_USR ? PARTITION_USR_VERITY_SIG : PARTITION_ROOT_VERITY_SIG].found; + else + have_verity_sig_partition = m->partitions[PARTITION_USR_VERITY_SIG].found || m->partitions[PARTITION_ROOT_VERITY_SIG].found; + + if (verity->root_hash) { + /* If we have an explicit root hash and found the partitions for it, then we are ready to use + * Verity, set things up for it */ + + if (verity->designator < 0 || verity->designator == PARTITION_ROOT) { + if (!m->partitions[PARTITION_ROOT_VERITY].found || !m->partitions[PARTITION_ROOT].found) + return -EADDRNOTAVAIL; + + /* If we found a verity setup, then the root partition is necessarily read-only. */ + m->partitions[PARTITION_ROOT].rw = false; + m->verity_ready = true; + + } else { + assert(verity->designator == PARTITION_USR); + + if (!m->partitions[PARTITION_USR_VERITY].found || !m->partitions[PARTITION_USR].found) + return -EADDRNOTAVAIL; + + m->partitions[PARTITION_USR].rw = false; + m->verity_ready = true; + } + + if (m->verity_ready) + m->verity_sig_ready = verity->root_hash_sig || have_verity_sig_partition; + + } else if (have_verity_sig_partition) { + + /* If we found an embedded signature partition, we are ready, too. */ + + m->verity_ready = m->verity_sig_ready = true; + if (verity->designator >= 0) + m->partitions[verity->designator == PARTITION_USR ? 
PARTITION_USR : PARTITION_ROOT].rw = false; + else if (m->partitions[PARTITION_USR_VERITY_SIG].found) + m->partitions[PARTITION_USR].rw = false; + else if (m->partitions[PARTITION_ROOT_VERITY_SIG].found) + m->partitions[PARTITION_ROOT].rw = false; + } + } + + bool any = false; + + /* After we discovered all partitions let's see if the verity requirements match the policy. (Note: + * we don't check encryption requirements here, because we haven't probed the file system yet, hence + * don't know if this is encrypted or not) */ + for (PartitionDesignator di = 0; di < _PARTITION_DESIGNATOR_MAX; di++) { + PartitionDesignator vi, si; + PartitionPolicyFlags found_flags; + + any = any || m->partitions[di].found; + + vi = partition_verity_of(di); + si = partition_verity_sig_of(di); + + /* Determine the verity protection level for this partition. */ + found_flags = m->partitions[di].found ? + (vi >= 0 && m->partitions[vi].found ? + (si >= 0 && m->partitions[si].found ? PARTITION_POLICY_SIGNED : PARTITION_POLICY_VERITY) : + PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED) : + (m->partitions[di].ignored ? 
PARTITION_POLICY_UNUSED : PARTITION_POLICY_ABSENT); + + r = image_policy_check_protection(policy, di, found_flags); + if (r < 0) + return r; + + if (m->partitions[di].found) { + r = image_policy_check_partition_flags(policy, di, m->partitions[di].gpt_flags); + if (r < 0) + return r; + } + } + + if (!any && !FLAGS_SET(flags, DISSECT_IMAGE_ALLOW_EMPTY)) + return -ENOMSG; + + r = dissected_image_probe_filesystems(m, fd, policy); + if (r < 0) + return r; + + return 0; +} +#endif + +int dissect_image_file( + const char *path, + const VeritySettings *verity, + const MountOptions *mount_options, + const ImagePolicy *image_policy, + DissectImageFlags flags, + DissectedImage **ret) { + +#if HAVE_BLKID + _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(path); + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return -errno; + + r = fd_verify_regular(fd); + if (r < 0) + return r; + + r = dissected_image_new(path, &m); + if (r < 0) + return r; + + r = probe_sector_size(fd, &m->sector_size); + if (r < 0) + return r; + + r = dissect_image(m, fd, path, verity, mount_options, image_policy, flags); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(m); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int dissect_log_error(int log_level, int r, const char *name, const VeritySettings *verity) { + assert(log_level >= 0 && log_level <= LOG_DEBUG); + assert(name); + + switch (r) { + + case 0 ... INT_MAX: /* success! 
*/ + return r; + + case -EOPNOTSUPP: + return log_full_errno(log_level, r, "Dissecting images is not supported, compiled without blkid support."); + + case -ENOPKG: + return log_full_errno(log_level, r, "%s: Couldn't identify a suitable partition table or file system.", name); + + case -ENOMEDIUM: + return log_full_errno(log_level, r, "%s: The image does not pass os-release/extension-release validation.", name); + + case -EADDRNOTAVAIL: + return log_full_errno(log_level, r, "%s: No root partition for specified root hash found.", name); + + case -ENOTUNIQ: + return log_full_errno(log_level, r, "%s: Multiple suitable root partitions found in image.", name); + + case -ENXIO: + return log_full_errno(log_level, r, "%s: No suitable root partition found in image.", name); + + case -EPROTONOSUPPORT: + return log_full_errno(log_level, r, "Device '%s' is a loopback block device with partition scanning turned off, please turn it on.", name); + + case -ENOTBLK: + return log_full_errno(log_level, r, "%s: Image is not a block device.", name); + + case -EBADR: + return log_full_errno(log_level, r, + "Combining partitioned images (such as '%s') with external Verity data (such as '%s') not supported. " + "(Consider setting $SYSTEMD_DISSECT_VERITY_SIDECAR=0 to disable automatic discovery of external Verity data.)", + name, strna(verity ? 
verity->data_path : NULL)); + + case -ERFKILL: + return log_full_errno(log_level, r, "%s: image does not match image policy.", name); + + case -ENOMSG: + return log_full_errno(log_level, r, "%s: no suitable partitions found.", name); + + default: + return log_full_errno(log_level, r, "%s: cannot dissect image: %m", name); + } +} + +int dissect_image_file_and_warn( + const char *path, + const VeritySettings *verity, + const MountOptions *mount_options, + const ImagePolicy *image_policy, + DissectImageFlags flags, + DissectedImage **ret) { + + return dissect_log_error( + LOG_ERR, + dissect_image_file(path, verity, mount_options, image_policy, flags, ret), + path, + verity); +} + +DissectedImage* dissected_image_unref(DissectedImage *m) { + if (!m) + return NULL; + + /* First, clear dissected partitions. */ + for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) + dissected_partition_done(m->partitions + i); + + /* Second, free decrypted images. This must be after dissected_partition_done(), as freeing + * DecryptedImage may try to deactivate partitions. */ + decrypted_image_unref(m->decrypted_image); + + /* Third, unref LoopDevice. This must be called after the above two, as freeing LoopDevice may try to + * remove existing partitions on the loopback block device. */ + loop_device_unref(m->loop); + + free(m->image_name); + free(m->hostname); + strv_free(m->machine_info); + strv_free(m->os_release); + strv_free(m->initrd_release); + strv_free(m->confext_release); + strv_free(m->sysext_release); + + return mfree(m); +} + +static int is_loop_device(const char *path) { /* Returns true if 'path' is a loop block device (or a partition whose parent is one), false if not, -ENOTBLK if it is no block device, or another negative errno on failure. */ + char s[SYS_BLOCK_PATH_MAX("/../loop/")]; + struct stat st; + + assert(path); + + if (stat(path, &st) < 0) + return -errno; + + if (!S_ISBLK(st.st_mode)) + return -ENOTBLK; + + xsprintf_sys_block_path(s, "/loop/", st.st_rdev); /* st_rdev is the block device the node refers to; st_dev would be the device holding the node's inode */ + if (access(s, F_OK) < 0) { + if (errno != ENOENT) + return -errno; + + /* The device itself isn't a loop device, but maybe it's a partition and its parent is?
*/ + xsprintf_sys_block_path(s, "/../loop/", st.st_rdev); + if (access(s, F_OK) < 0) + return errno == ENOENT ? false : -errno; + } + + return true; +} + +static int run_fsck(int node_fd, const char *fstype) { + int r, exit_status; + pid_t pid; + + assert(node_fd >= 0); + assert(fstype); + + r = fsck_exists_for_fstype(fstype); + if (r < 0) { + log_debug_errno(r, "Couldn't determine whether fsck for %s exists, proceeding anyway.", fstype); + return 0; + } + if (r == 0) { + log_debug("Not checking partition %s, as fsck for %s does not exist.", FORMAT_PROC_FD_PATH(node_fd), fstype); + return 0; + } + + r = safe_fork_full( + "(fsck)", + NULL, + &node_fd, 1, /* Leave the node fd open */ + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_CLOEXEC_OFF, + &pid); + if (r < 0) + return log_debug_errno(r, "Failed to fork off fsck: %m"); + if (r == 0) { + /* Child */ + execlp("fsck", "fsck", "-aT", FORMAT_PROC_FD_PATH(node_fd), NULL); + log_open(); + log_debug_errno(errno, "Failed to execl() fsck: %m"); + _exit(FSCK_OPERATIONAL_ERROR); + } + + exit_status = wait_for_terminate_and_check("fsck", pid, 0); + if (exit_status < 0) /* the fork succeeded above; what failed here is waiting for the child */ + return log_debug_errno(exit_status, "Failed to wait for fsck: %m"); + + if ((exit_status & ~FSCK_ERROR_CORRECTED) != FSCK_SUCCESS) { + log_debug("fsck failed with exit status %i.", exit_status); + + if ((exit_status & (FSCK_SYSTEM_SHOULD_REBOOT|FSCK_ERRORS_LEFT_UNCORRECTED)) != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), "File system is corrupted, refusing."); + + log_debug("Ignoring fsck error."); + } + + return 0; +} + +static int fs_grow(const char *node_path, int mount_fd, const char *mount_path) { + _cleanup_close_ int _mount_fd = -EBADF, node_fd = -EBADF; + uint64_t size, newsize; + const char *id; + int r; + + assert(node_path); + assert(mount_fd >= 0 || mount_path); + + node_fd = open(node_path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (node_fd < 0) + return
log_debug_errno(errno, "Failed to open node device %s: %m", node_path); + + if (ioctl(node_fd, BLKGETSIZE64, &size) != 0) + return log_debug_errno(errno, "Failed to get block device size of %s: %m", node_path); + + if (mount_fd < 0) { + assert(mount_path); + + _mount_fd = open(mount_path, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (_mount_fd < 0) + return log_debug_errno(errno, "Failed to open mounted file system %s: %m", mount_path); + + mount_fd = _mount_fd; + } else { + mount_fd = fd_reopen_condition(mount_fd, O_RDONLY|O_DIRECTORY|O_CLOEXEC, O_RDONLY|O_DIRECTORY|O_CLOEXEC, &_mount_fd); + if (mount_fd < 0) /* failure is reported through the negative return value, errno is not reliable here */ + return log_debug_errno(mount_fd, "Failed to reopen mount node: %m"); + } + + id = mount_path ?: node_path; + + log_debug("Resizing \"%s\" to %"PRIu64" bytes...", id, size); + r = resize_fs(mount_fd, size, &newsize); + if (r < 0) + return log_debug_errno(r, "Failed to resize \"%s\" to %"PRIu64" bytes: %m", id, size); + + if (newsize == size) + log_debug("Successfully resized \"%s\" to %s bytes.", + id, FORMAT_BYTES(newsize)); + else { + assert(newsize < size); + log_debug("Successfully resized \"%s\" to %s bytes (%"PRIu64" bytes lost due to blocksize).", + id, FORMAT_BYTES(newsize), size - newsize); + } + + return 0; +} + +int partition_pick_mount_options( + PartitionDesignator d, + const char *fstype, + bool rw, + bool discard, + char **ret_options, + unsigned long *ret_ms_flags) { + + _cleanup_free_ char *options = NULL; + + assert(ret_options); + + /* Selects a baseline of bind mount flags, that should always apply. + * + * Firstly, we set MS_NODEV universally on all mounts, since we don't want to allow device nodes outside of /dev/. + * + * On /var/tmp/ we'll also set MS_NOSUID, same as we set for /tmp/ on the host. + * + * On the ESP and XBOOTLDR partitions we'll also disable symlinks, and execution. These file systems + * are generally untrusted (i.e.
not encrypted or authenticated), and typically VFAT hence we should + * be as restrictive as possible, and this shouldn't hurt, since the functionality is not available + * there anyway. */ + + unsigned long flags = MS_NODEV; + + if (!rw) + flags |= MS_RDONLY; + + switch (d) { + + case PARTITION_ESP: + case PARTITION_XBOOTLDR: + flags |= MS_NOSUID|MS_NOEXEC|ms_nosymfollow_supported(); + + /* The ESP might contain a pre-boot random seed. Let's make this unaccessible to regular + * userspace. ESP/XBOOTLDR is almost certainly VFAT, hence if we don't know assume it is. */ + if (!fstype || fstype_can_umask(fstype)) + if (!strextend_with_separator(&options, ",", "umask=0077")) + return -ENOMEM; + break; + + case PARTITION_TMP: + flags |= MS_NOSUID; + break; + + default: + break; + } + + /* So, when you request MS_RDONLY from ext4, then this means nothing. It happily still writes to the + * backing storage. What's worse, the BLKRO[GS]ET flag and (in case of loopback devices) + * LO_FLAGS_READ_ONLY don't mean anything, they affect userspace accesses only, and write accesses + * from the upper file system still get propagated through to the underlying file system, + * unrestricted. To actually get ext4/xfs/btrfs to stop writing to the device we need to specify + * "norecovery" as mount option, in addition to MS_RDONLY. Yes, this sucks, since it means we need to + * carry a per file system table here. + * + * Note that this means that we might not be able to mount corrupted file systems as read-only + * anymore (since in some cases the kernel implementations will refuse mounting when corrupted, + * read-only and "norecovery" is specified). But I think for the case of automatically determined + * mount options for loopback devices this is the right choice, since otherwise using the same + * loopback file twice even in read-only mode, is going to fail badly sooner or later. 
The use case of + * making reuse of the immutable images "just work" is more relevant to us than having read-only + * access that actually modifies stuff work on such image files. Or to say this differently: if + * people want their file systems to be fixed up they should just open them in writable mode, where + * all these problems don't exist. */ + if (!rw && fstype && fstype_can_norecovery(fstype)) + if (!strextend_with_separator(&options, ",", "norecovery")) + return -ENOMEM; + + if (discard && fstype && fstype_can_discard(fstype)) + if (!strextend_with_separator(&options, ",", "discard")) + return -ENOMEM; + + if (!ret_ms_flags) /* Fold flags into option string if ret_flags specified as NULL */ + if (!strextend_with_separator(&options, ",", + FLAGS_SET(flags, MS_RDONLY) ? "ro" : "rw", + FLAGS_SET(flags, MS_NODEV) ? "nodev" : "dev", + FLAGS_SET(flags, MS_NOSUID) ? "nosuid" : "suid", + FLAGS_SET(flags, MS_NOEXEC) ? "noexec" : "exec", + FLAGS_SET(flags, MS_NOSYMFOLLOW) ? "nosymfollow" : NULL)) + /* NB: we suppress 'symfollow' here, since it's the default, and old /bin/mount might not know it */ + return -ENOMEM; + + if (ret_ms_flags) + *ret_ms_flags = flags; + + *ret_options = TAKE_PTR(options); + return 0; +} + +static bool need_user_mapping(uid_t uid_shift, uid_t uid_range) { + + if (!uid_is_valid(uid_shift)) + return false; + + return uid_shift != 0 || uid_range != UINT32_MAX; +} + +static int mount_partition( + PartitionDesignator d, + DissectedPartition *m, + const char *where, + const char *directory, + uid_t uid_shift, + uid_t uid_range, + int userns_fd, + DissectImageFlags flags) { + + _cleanup_free_ char *chased = NULL, *options = NULL; + const char *p = NULL, *node, *fstype = NULL; + bool rw, discard, grow; + unsigned long ms_flags; + int r; + + assert(m); + + if (!m->found) + return 0; + + /* Check the various combinations when we can't do anything anymore */ + if (m->fsmount_fd < 0 && m->mount_node_fd < 0) + return 0; + if (m->fsmount_fd >= 0 && 
!where) + return 0; + if (!where && m->mount_node_fd < 0) + return 0; + + if (m->fsmount_fd < 0) { + fstype = dissected_partition_fstype(m); + if (!fstype) + return -EAFNOSUPPORT; + + /* We are looking at an encrypted partition? This either means stacked encryption, or the + * caller didn't call dissected_image_decrypt() beforehand. Let's return a recognizable error + * for this case. */ + if (streq(fstype, "crypto_LUKS")) + return -EUNATCH; + + r = dissect_fstype_ok(fstype); + if (r < 0) + return r; + if (!r) + return -EIDRM; /* Recognizable error */ + } + + node = m->mount_node_fd < 0 ? NULL : FORMAT_PROC_FD_PATH(m->mount_node_fd); + rw = m->rw && !(flags & DISSECT_IMAGE_MOUNT_READ_ONLY); + + discard = ((flags & DISSECT_IMAGE_DISCARD) || + ((flags & DISSECT_IMAGE_DISCARD_ON_LOOP) && (m->node && is_loop_device(m->node) > 0))); + + grow = rw && m->growfs && FLAGS_SET(flags, DISSECT_IMAGE_GROWFS); + + if (FLAGS_SET(flags, DISSECT_IMAGE_FSCK) && rw && m->mount_node_fd >= 0 && m->fsmount_fd < 0) { + r = run_fsck(m->mount_node_fd, fstype); + if (r < 0) + return r; + } + + if (where) { + if (directory) { + /* Automatically create missing mount points inside the image, if necessary. */ + r = mkdir_p_root(where, directory, uid_shift, (gid_t) uid_shift, 0755, NULL); + if (r < 0 && r != -EROFS) + return r; + + r = chase(directory, where, CHASE_PREFIX_ROOT, &chased, NULL); + if (r < 0) + return r; + + p = chased; + } else { + /* Create top-level mount if missing – but only if this is asked for. This won't modify the + * image (as the branch above does) but the host hierarchy, and the created directory might + * survive our mount in the host hierarchy hence. 
*/ + if (FLAGS_SET(flags, DISSECT_IMAGE_MKDIR)) { + r = mkdir_p(where, 0755); + if (r < 0) + return r; + } + + p = where; + } + } + + if (m->fsmount_fd < 0) { + r = partition_pick_mount_options(d, fstype, rw, discard, &options, &ms_flags); + if (r < 0) + return r; + + if (need_user_mapping(uid_shift, uid_range) && fstype_can_uid_gid(fstype)) { + _cleanup_free_ char *uid_option = NULL; + + if (asprintf(&uid_option, "uid=" UID_FMT ",gid=" GID_FMT, uid_shift, (gid_t) uid_shift) < 0) + return -ENOMEM; + + if (!strextend_with_separator(&options, ",", uid_option)) + return -ENOMEM; + + userns_fd = -EBADF; /* Not needed */ + } + + if (!isempty(m->mount_options)) + if (!strextend_with_separator(&options, ",", m->mount_options)) + return -ENOMEM; + } + + if (p) { + if (m->fsmount_fd >= 0) { + /* Case #1: Attach existing fsmount fd to the file system */ + + r = mount_exchange_graceful( + m->fsmount_fd, + p, + FLAGS_SET(flags, DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE)); + if (r < 0) + return log_debug_errno(r, "Failed to mount image on '%s': %m", p); + + } else { + assert(node); + + /* Case #2: Mount directly into place */ + r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, ms_flags, options); + if (r < 0) + return r; + + if (grow) + (void) fs_grow(node, -EBADF, p); + + if (userns_fd >= 0) { + r = remount_idmap_fd(STRV_MAKE(p), userns_fd); + if (r < 0) + return r; + } + } + } else { + assert(node); + + /* Case #3: Create fsmount fd */ + + m->fsmount_fd = make_fsmount(LOG_DEBUG, node, fstype, ms_flags, options, userns_fd); + if (m->fsmount_fd < 0) + return m->fsmount_fd; + + if (grow) + (void) fs_grow(node, m->fsmount_fd, NULL); + } + + return 1; +} + +static int mount_root_tmpfs(const char *where, uid_t uid_shift, uid_t uid_range, DissectImageFlags flags) { + _cleanup_free_ char *options = NULL; + int r; + + assert(where); + + /* For images that contain /usr/ but no rootfs, let's mount rootfs as tmpfs */ + + if (FLAGS_SET(flags, DISSECT_IMAGE_MKDIR)) { + r = 
mkdir_p(where, 0755); + if (r < 0) + return r; + } + + if (need_user_mapping(uid_shift, uid_range)) { + if (asprintf(&options, "uid=" UID_FMT ",gid=" GID_FMT, uid_shift, (gid_t) uid_shift) < 0) + return -ENOMEM; + } + + r = mount_nofollow_verbose(LOG_DEBUG, "rootfs", where, "tmpfs", MS_NODEV, options); + if (r < 0) + return r; + + return 1; +} + +static int mount_point_is_available(const char *where, const char *path, bool missing_ok) { + _cleanup_free_ char *p = NULL; + int r; + + /* Check whether 'path' is suitable as a mountpoint, i.e. is an empty directory + * or does not exist at all (when missing_ok). */ + + r = chase(path, where, CHASE_PREFIX_ROOT, &p, NULL); + if (r == -ENOENT) + return missing_ok; + if (r < 0) + return log_debug_errno(r, "Failed to chase \"%s\": %m", path); + + r = dir_is_empty(p, /* ignore_hidden_or_backup= */ false); + if (r == -ENOTDIR) + return false; + if (r < 0) + return log_debug_errno(r, "Failed to check directory \"%s\": %m", p); + return r > 0; +} + +int dissected_image_mount( + DissectedImage *m, + const char *where, + uid_t uid_shift, + uid_t uid_range, + int userns_fd, + DissectImageFlags flags) { + + _cleanup_close_ int my_userns_fd = -EBADF; + int r; + + assert(m); + + /* If 'where' is NULL then we'll use the new mount API to create fsmount() fds for the mounts and + * store them in DissectedPartition.fsmount_fd. + * + * If 'where' is not NULL then we'll either mount the partitions to the right places ourselves, + * or use DissectedPartition.fsmount_fd and bind it to the right places. + * + * This allows splitting the setting up of the superblocks and the binding to file system paths into + * two distinct and differently privileged components: one that gets the fsmount fds, and the other + * that then applies them.
+ * + * Returns: + * + * -ENXIO → No root partition found + * -EMEDIUMTYPE → DISSECT_IMAGE_VALIDATE_OS set but no os-release/extension-release file found + * -EUNATCH → Encrypted partition found for which no dm-crypt was set up yet + * -EUCLEAN → fsck for file system failed + * -EBUSY → File system already mounted/used elsewhere (kernel) + * -EAFNOSUPPORT → File system type not supported or not known + * -EIDRM → File system is not among allowlisted "common" file systems + */ + + if (!where && (flags & (DISSECT_IMAGE_VALIDATE_OS|DISSECT_IMAGE_VALIDATE_OS_EXT)) != 0) + return -EOPNOTSUPP; /* for now, not supported */ + + if (!(m->partitions[PARTITION_ROOT].found || + (m->partitions[PARTITION_USR].found && FLAGS_SET(flags, DISSECT_IMAGE_USR_NO_ROOT)))) + return -ENXIO; /* Require a root fs or at least a /usr/ fs (the latter is subject to a flag of its own) */ + + if (userns_fd < 0 && need_user_mapping(uid_shift, uid_range) && FLAGS_SET(flags, DISSECT_IMAGE_MOUNT_IDMAPPED)) { + + my_userns_fd = make_userns(uid_shift, uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT); + if (my_userns_fd < 0) + return my_userns_fd; + + userns_fd = my_userns_fd; + } + + if ((flags & DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY) == 0) { + + /* First mount the root fs. If there's none we use a tmpfs. 
*/ + if (m->partitions[PARTITION_ROOT].found) { + r = mount_partition(PARTITION_ROOT, m->partitions + PARTITION_ROOT, where, NULL, uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + + } else if (where) { + r = mount_root_tmpfs(where, uid_shift, uid_range, flags); + if (r < 0) + return r; + } + + /* For us mounting root always means mounting /usr as well */ + r = mount_partition(PARTITION_USR, m->partitions + PARTITION_USR, where, "/usr", uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + } + + if ((flags & DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY) == 0 && + (flags & (DISSECT_IMAGE_VALIDATE_OS|DISSECT_IMAGE_VALIDATE_OS_EXT)) != 0) { + /* If either one of the validation flags are set, ensure that the image qualifies as + * one or the other (or both). */ + bool ok = false; + + assert(where); + + if (FLAGS_SET(flags, DISSECT_IMAGE_VALIDATE_OS)) { + r = path_is_os_tree(where); + if (r < 0) + return r; + if (r > 0) + ok = true; + } + if (!ok && FLAGS_SET(flags, DISSECT_IMAGE_VALIDATE_OS_EXT) && m->image_name) { + r = extension_has_forbidden_content(where); + if (r < 0) + return r; + if (r == 0) { + r = path_is_extension_tree(IMAGE_SYSEXT, where, m->image_name, FLAGS_SET(flags, DISSECT_IMAGE_RELAX_EXTENSION_CHECK)); + if (r == 0) + r = path_is_extension_tree(IMAGE_CONFEXT, where, m->image_name, FLAGS_SET(flags, DISSECT_IMAGE_RELAX_EXTENSION_CHECK)); + if (r < 0) + return r; + if (r > 0) + ok = true; + } + } + + if (!ok) + return -ENOMEDIUM; + } + + if (flags & DISSECT_IMAGE_MOUNT_ROOT_ONLY) + return 0; + + r = mount_partition(PARTITION_HOME, m->partitions + PARTITION_HOME, where, "/home", uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + + r = mount_partition(PARTITION_SRV, m->partitions + PARTITION_SRV, where, "/srv", uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + + r = mount_partition(PARTITION_VAR, m->partitions + PARTITION_VAR, where, "/var", uid_shift, uid_range, userns_fd, flags); + if (r < 0) + 
return r; + + r = mount_partition(PARTITION_TMP, m->partitions + PARTITION_TMP, where, "/var/tmp", uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + + int slash_boot_is_available = 0; + if (where) { + r = slash_boot_is_available = mount_point_is_available(where, "/boot", /* missing_ok = */ true); + if (r < 0) + return r; + } + if (!where || slash_boot_is_available) { + r = mount_partition(PARTITION_XBOOTLDR, m->partitions + PARTITION_XBOOTLDR, where, "/boot", uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + slash_boot_is_available = !r; + } + + if (m->partitions[PARTITION_ESP].found) { + const char *esp_path = NULL; + + if (where) { + /* Mount the ESP to /boot/ if it exists and is empty and we didn't already mount the + * XBOOTLDR partition into it. Otherwise, use /efi instead, but only if it exists + * and is empty. */ + + if (slash_boot_is_available) { + r = mount_point_is_available(where, "/boot", /* missing_ok = */ false); + if (r < 0) + return r; + if (r > 0) + esp_path = "/boot"; + } + + if (!esp_path) { + r = mount_point_is_available(where, "/efi", /* missing_ok = */ true); + if (r < 0) + return r; + if (r > 0) + esp_path = "/efi"; + } + } + + /* OK, let's mount the ESP now (possibly creating the dir if missing) */ + r = mount_partition(PARTITION_ESP, m->partitions + PARTITION_ESP, where, esp_path, uid_shift, uid_range, userns_fd, flags); + if (r < 0) + return r; + } + + return 0; +} + +int dissected_image_mount_and_warn( + DissectedImage *m, + const char *where, + uid_t uid_shift, + uid_t uid_range, + int userns_fd, + DissectImageFlags flags) { + + int r; + + assert(m); + + r = dissected_image_mount(m, where, uid_shift, uid_range, userns_fd, flags); + if (r == -ENXIO) + return log_error_errno(r, "Not root file system found in image."); + if (r == -EMEDIUMTYPE) + return log_error_errno(r, "No suitable os-release/extension-release file in image found."); + if (r == -EUNATCH) + return log_error_errno(r, "Encrypted file 
system discovered, but decryption not requested."); + if (r == -EUCLEAN) + return log_error_errno(r, "File system check on image failed."); + if (r == -EBUSY) + return log_error_errno(r, "File system already mounted elsewhere."); + if (r == -EAFNOSUPPORT) + return log_error_errno(r, "File system type not supported or not known."); + if (r == -EIDRM) + return log_error_errno(r, "File system is too uncommon, refused."); + if (r < 0) + return log_error_errno(r, "Failed to mount image: %m"); + + return r; +} + +#if HAVE_LIBCRYPTSETUP +struct DecryptedPartition { + struct crypt_device *device; + char *name; + bool relinquished; +}; +#endif + +typedef struct DecryptedPartition DecryptedPartition; + +struct DecryptedImage { + unsigned n_ref; + DecryptedPartition *decrypted; + size_t n_decrypted; +}; + +static DecryptedImage* decrypted_image_free(DecryptedImage *d) { +#if HAVE_LIBCRYPTSETUP + int r; + + if (!d) + return NULL; + + for (size_t i = 0; i < d->n_decrypted; i++) { + DecryptedPartition *p = d->decrypted + i; + + if (p->device && p->name && !p->relinquished) { + _cleanup_free_ char *node = NULL; + + node = path_join("/dev/mapper", p->name); + if (node) { + r = btrfs_forget_device(node); + if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to forget btrfs device %s, ignoring: %m", node); + } else + log_oom_debug(); + + /* Let's deactivate lazily, as the dm volume may be already/still used by other processes. 
*/ + r = sym_crypt_deactivate_by_name(p->device, p->name, CRYPT_DEACTIVATE_DEFERRED); + if (r < 0) + log_debug_errno(r, "Failed to deactivate encrypted partition %s", p->name); + } + + if (p->device) + sym_crypt_free(p->device); + free(p->name); + } + + free(d->decrypted); + free(d); +#endif + return NULL; +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(DecryptedImage, decrypted_image, decrypted_image_free); + +#if HAVE_LIBCRYPTSETUP +static int decrypted_image_new(DecryptedImage **ret) { + _cleanup_(decrypted_image_unrefp) DecryptedImage *d = NULL; + + assert(ret); + + d = new(DecryptedImage, 1); + if (!d) + return -ENOMEM; + + *d = (DecryptedImage) { + .n_ref = 1, + }; + + *ret = TAKE_PTR(d); + return 0; +} + +static int make_dm_name_and_node(const void *original_node, const char *suffix, char **ret_name, char **ret_node) { + _cleanup_free_ char *name = NULL, *node = NULL; + const char *base; + + assert(original_node); + assert(suffix); + assert(ret_name); + assert(ret_node); + + base = strrchr(original_node, '/'); + if (!base) + base = original_node; + else + base++; + if (isempty(base)) + return -EINVAL; + + name = strjoin(base, suffix); + if (!name) + return -ENOMEM; + if (!filename_is_valid(name)) + return -EINVAL; + + node = path_join(sym_crypt_get_dir(), name); + if (!node) + return -ENOMEM; + + *ret_name = TAKE_PTR(name); + *ret_node = TAKE_PTR(node); + + return 0; +} + +static int decrypt_partition( + DissectedPartition *m, + const char *passphrase, + DissectImageFlags flags, + DecryptedImage *d) { + + _cleanup_free_ char *node = NULL, *name = NULL; + _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(m); + assert(d); + + if (!m->found || !m->node || !m->fstype) + return 0; + + if (!streq(m->fstype, "crypto_LUKS")) + return 0; + + if (!passphrase) + return -ENOKEY; + + r = dlopen_cryptsetup(); + if (r < 0) + return r; + + r = make_dm_name_and_node(m->node, "-decrypted", &name, &node); + if (r < 0) + 
return r; + + if (!GREEDY_REALLOC0(d->decrypted, d->n_decrypted + 1)) + return -ENOMEM; + + r = sym_crypt_init(&cd, m->node); + if (r < 0) + return log_debug_errno(r, "Failed to initialize dm-crypt: %m"); + + cryptsetup_enable_logging(cd); + + r = sym_crypt_load(cd, CRYPT_LUKS, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to load LUKS metadata: %m"); + + r = sym_crypt_activate_by_passphrase(cd, name, CRYPT_ANY_SLOT, passphrase, strlen(passphrase), + ((flags & DISSECT_IMAGE_DEVICE_READ_ONLY) ? CRYPT_ACTIVATE_READONLY : 0) | + ((flags & DISSECT_IMAGE_DISCARD_ON_CRYPTO) ? CRYPT_ACTIVATE_ALLOW_DISCARDS : 0)); + if (r < 0) { + log_debug_errno(r, "Failed to activate LUKS device: %m"); + return r == -EPERM ? -EKEYREJECTED : r; + } + + fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_debug_errno(errno, "Failed to open %s: %m", node); + + d->decrypted[d->n_decrypted++] = (DecryptedPartition) { + .name = TAKE_PTR(name), + .device = TAKE_PTR(cd), + }; + + m->decrypted_node = TAKE_PTR(node); + close_and_replace(m->mount_node_fd, fd); + + return 0; +} + +static int verity_can_reuse( + const VeritySettings *verity, + const char *name, + struct crypt_device **ret_cd) { + + /* If the same volume was already open, check that the root hashes match, and reuse it if they do */ + _cleanup_free_ char *root_hash_existing = NULL; + _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL; + struct crypt_params_verity crypt_params = {}; + size_t root_hash_existing_size; + int r; + + assert(verity); + assert(name); + assert(ret_cd); + + r = sym_crypt_init_by_name(&cd, name); + if (r < 0) + return log_debug_errno(r, "Error opening verity device, crypt_init_by_name failed: %m"); + + cryptsetup_enable_logging(cd); + + r = sym_crypt_get_verity_info(cd, &crypt_params); + if (r < 0) + return log_debug_errno(r, "Error opening verity device, crypt_get_verity_info failed: %m"); + + root_hash_existing_size = verity->root_hash_size; + root_hash_existing = 
malloc0(root_hash_existing_size); + if (!root_hash_existing) + return -ENOMEM; + + r = sym_crypt_volume_key_get(cd, CRYPT_ANY_SLOT, root_hash_existing, &root_hash_existing_size, NULL, 0); + if (r < 0) + return log_debug_errno(r, "Error opening verity device, crypt_volume_key_get failed: %m"); + if (verity->root_hash_size != root_hash_existing_size || + memcmp(root_hash_existing, verity->root_hash, verity->root_hash_size) != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Error opening verity device, it already exists but root hashes are different."); + +#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY + /* Ensure that, if signatures are supported, we only reuse the device if the previous mount used the + * same settings, so that a previous unsigned mount will not be reused if the user asks to use + * signing for the new one, and vice versa. */ + if (!!verity->root_hash_sig != !!(crypt_params.flags & CRYPT_VERITY_ROOT_HASH_SIGNATURE)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Error opening verity device, it already exists but signature settings are not the same."); +#endif + + *ret_cd = TAKE_PTR(cd); + return 0; +} + +static char* dm_deferred_remove_clean(char *name) { + if (!name) + return NULL; + + (void) sym_crypt_deactivate_by_name(NULL, name, CRYPT_DEACTIVATE_DEFERRED); + return mfree(name); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char *, dm_deferred_remove_clean); + +static int validate_signature_userspace(const VeritySettings *verity) { +#if HAVE_OPENSSL + _cleanup_(sk_X509_free_allp) STACK_OF(X509) *sk = NULL; + _cleanup_strv_free_ char **certs = NULL; + _cleanup_(PKCS7_freep) PKCS7 *p7 = NULL; + _cleanup_free_ char *s = NULL; + _cleanup_(BIO_freep) BIO *bio = NULL; /* 'bio' must be freed first, 's' second, hence keep this order + * of declaration in place, please */ + const unsigned char *d; + int r; + + assert(verity); + assert(verity->root_hash); + assert(verity->root_hash_sig); + + /* Because installing a signature certificate into the kernel chain is so 
messy, let's optionally do + * userspace validation. */ + + r = conf_files_list_nulstr(&certs, ".crt", NULL, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, CONF_PATHS_NULSTR("verity.d")); + if (r < 0) + return log_debug_errno(r, "Failed to enumerate certificates: %m"); + if (strv_isempty(certs)) { + log_debug("No userspace dm-verity certificates found."); + return 0; + } + + d = verity->root_hash_sig; + p7 = d2i_PKCS7(NULL, &d, (long) verity->root_hash_sig_size); + if (!p7) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse PKCS7 DER signature data."); + + s = hexmem(verity->root_hash, verity->root_hash_size); + if (!s) + return log_oom_debug(); + + bio = BIO_new_mem_buf(s, strlen(s)); + if (!bio) + return log_oom_debug(); + + sk = sk_X509_new_null(); + if (!sk) + return log_oom_debug(); + + STRV_FOREACH(i, certs) { + _cleanup_(X509_freep) X509 *c = NULL; + _cleanup_fclose_ FILE *f = NULL; + + f = fopen(*i, "re"); + if (!f) { + log_debug_errno(errno, "Failed to open '%s', ignoring: %m", *i); + continue; + } + + c = PEM_read_X509(f, NULL, NULL, NULL); + if (!c) { + log_debug("Failed to load X509 certificate '%s', ignoring.", *i); + continue; + } + + if (sk_X509_push(sk, c) == 0) + return log_oom_debug(); + + TAKE_PTR(c); + } + + r = PKCS7_verify(p7, sk, NULL, bio, NULL, PKCS7_NOINTERN|PKCS7_NOVERIFY); + if (r) + log_debug("Userspace PKCS#7 validation succeeded."); + else + log_debug("Userspace PKCS#7 validation failed: %s", ERR_error_string(ERR_get_error(), NULL)); + + return r; +#else + log_debug("Not doing client-side validation of dm-verity root hash signatures, OpenSSL support disabled."); + return 0; +#endif +} + +static int do_crypt_activate_verity( + struct crypt_device *cd, + const char *name, + const VeritySettings *verity) { + + bool check_signature; + int r, k; + + assert(cd); + assert(name); + assert(verity); + + if (verity->root_hash_sig) { + r = getenv_bool_secure("SYSTEMD_DISSECT_VERITY_SIGNATURE"); + if (r < 0 && r != -ENXIO) + 
log_debug_errno(r, "Failed to parse $SYSTEMD_DISSECT_VERITY_SIGNATURE"); + + check_signature = r != 0; + } else + check_signature = false; + + if (check_signature) { + +#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY + /* First, if we have support for signed keys in the kernel, then try that first. */ + r = sym_crypt_activate_by_signed_key( + cd, + name, + verity->root_hash, + verity->root_hash_size, + verity->root_hash_sig, + verity->root_hash_sig_size, + CRYPT_ACTIVATE_READONLY); + if (r >= 0) + return r; + + log_debug_errno(r, "Validation of dm-verity signature failed via the kernel, trying userspace validation instead: %m"); +#else + log_debug("Activation of verity device with signature requested, but not supported via the kernel by %s due to missing crypt_activate_by_signed_key(), trying userspace validation instead.", + program_invocation_short_name); + r = 0; /* Set for the propagation below */ +#endif + + /* So this didn't work via the kernel, then let's try userspace validation instead. If that + * works we'll try to activate without telling the kernel the signature. */ + + /* Preferably propagate the original kernel error, so that the fallback logic can work, + * as the device-mapper is finicky around concurrent activations of the same volume */ + k = validate_signature_userspace(verity); + if (k < 0) + return r < 0 ? r : k; + if (k == 0) + return log_debug_errno(r < 0 ? r : SYNTHETIC_ERRNO(ENOKEY), + "Activation of signed Verity volume worked neither via the kernel nor in userspace, can't activate."); + } + + return sym_crypt_activate_by_volume_key( + cd, + name, + verity->root_hash, + verity->root_hash_size, + CRYPT_ACTIVATE_READONLY); +} + +static usec_t verity_timeout(void) { + usec_t t = 100 * USEC_PER_MSEC; + const char *e; + int r; + + /* On slower machines, like non-KVM vm, setting up device may take a long time. + * Let's make the timeout configurable. 
*/ + + e = getenv("SYSTEMD_DISSECT_VERITY_TIMEOUT_SEC"); + if (!e) + return t; + + r = parse_sec(e, &t); + if (r < 0) + log_debug_errno(r, + "Failed to parse timeout specified in $SYSTEMD_DISSECT_VERITY_TIMEOUT_SEC, " + "using the default timeout (%s).", + FORMAT_TIMESPAN(t, USEC_PER_MSEC)); + + return t; +} + +static int verity_partition( + PartitionDesignator designator, + DissectedPartition *m, + DissectedPartition *v, + const VeritySettings *verity, + DissectImageFlags flags, + DecryptedImage *d) { + + _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL; + _cleanup_free_ char *node = NULL, *name = NULL; + _cleanup_close_ int mount_node_fd = -EBADF; + int r; + + assert(m); + assert(v || (verity && verity->data_path)); + + if (!verity || !verity->root_hash) + return 0; + if (!((verity->designator < 0 && designator == PARTITION_ROOT) || + (verity->designator == designator))) + return 0; + + if (!m->found || !m->node || !m->fstype) + return 0; + if (!verity->data_path) { + if (!v->found || !v->node || !v->fstype) + return 0; + + if (!streq(v->fstype, "DM_verity_hash")) + return 0; + } + + r = dlopen_cryptsetup(); + if (r < 0) + return r; + + if (FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) { + /* Use the roothash, which is unique per volume, as the device node name, so that it can be reused */ + _cleanup_free_ char *root_hash_encoded = NULL; + + root_hash_encoded = hexmem(verity->root_hash, verity->root_hash_size); + if (!root_hash_encoded) + return -ENOMEM; + + r = make_dm_name_and_node(root_hash_encoded, "-verity", &name, &node); + } else + r = make_dm_name_and_node(m->node, "-verity", &name, &node); + if (r < 0) + return r; + + r = sym_crypt_init(&cd, verity->data_path ?: v->node); + if (r < 0) + return r; + + cryptsetup_enable_logging(cd); + + r = sym_crypt_load(cd, CRYPT_VERITY, NULL); + if (r < 0) + return r; + + r = sym_crypt_set_data_device(cd, m->node); + if (r < 0) + return r; + + if (!GREEDY_REALLOC0(d->decrypted, d->n_decrypted + 1)) + return 
-ENOMEM; + + /* If activating fails because the device already exists, check the metadata and reuse it if it matches. + * In case of ENODEV/ENOENT, which can happen if another process is activating at the exact same time, + * retry a few times before giving up. */ + for (unsigned i = 0; i < N_DEVICE_NODE_LIST_ATTEMPTS; i++) { + _cleanup_(dm_deferred_remove_cleanp) char *restore_deferred_remove = NULL; + _cleanup_(sym_crypt_freep) struct crypt_device *existing_cd = NULL; + _cleanup_close_ int fd = -EBADF; + + /* First, check if the device already exists. */ + fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd < 0 && !ERRNO_IS_DEVICE_ABSENT(errno)) + return log_debug_errno(errno, "Failed to open verity device %s: %m", node); + if (fd >= 0) + goto check; /* The device already exists. Let's check it. */ + + /* The symlink to the device node does not exist yet. Assume not activated, and let's activate it. */ + r = do_crypt_activate_verity(cd, name, verity); + if (r >= 0) + goto try_open; /* The device is activated. Let's open it. */ + /* libdevmapper can return EINVAL when the device is already in the activation stage. + * There's no way to distinguish this situation from a genuine error due to invalid + * parameters, so immediately fall back to activating the device with a unique name. + * Improvements in libcrypsetup can ensure this never happens: + * https://gitlab.com/cryptsetup/cryptsetup/-/merge_requests/96 */ + if (r == -EINVAL && FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) + break; + if (r == -ENODEV) /* Volume is being opened but not ready, crypt_init_by_name would fail, try to open again */ + goto try_again; + if (!IN_SET(r, + -EEXIST, /* Volume has already been opened and ready to be used. */ + -EBUSY /* Volume is being opened but not ready, crypt_init_by_name() can fetch details. 
*/)) + return log_debug_errno(r, "Failed to activate verity device %s: %m", node); + + check: + /* To avoid races, disable automatic removal on umount while setting up the new device. Restore it on failure. */ + r = dm_deferred_remove_cancel(name); + /* -EBUSY and -ENXIO: the device has already been removed or being removed. We cannot + * use the device, try to open again. See target_message() in drivers/md/dm-ioctl.c + * and dm_cancel_deferred_remove() in drivers/md/dm.c */ + if (IN_SET(r, -EBUSY, -ENXIO)) + goto try_again; + if (r < 0) + return log_debug_errno(r, "Failed to disable automated deferred removal for verity device %s: %m", node); + + restore_deferred_remove = strdup(name); + if (!restore_deferred_remove) + return log_oom_debug(); + + r = verity_can_reuse(verity, name, &existing_cd); + /* Same as above, -EINVAL can randomly happen when it actually means -EEXIST */ + if (r == -EINVAL && FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) + break; + if (IN_SET(r, + -ENOENT, /* Removed?? */ + -EBUSY, /* Volume is being opened but not ready, crypt_init_by_name() can fetch details. */ + -ENODEV /* Volume is being opened but not ready, crypt_init_by_name() would fail, try to open again. */ )) + goto try_again; + if (r < 0) + return log_debug_errno(r, "Failed to check if existing verity device %s can be reused: %m", node); + + if (fd < 0) { + /* devmapper might say that the device exists, but the devlink might not yet have been + * created. Check and wait for the udev event in that case. */ + r = device_wait_for_devlink(node, "block", verity_timeout(), NULL); + /* Fallback to activation with a unique device if it's taking too long */ + if (r == -ETIMEDOUT && FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) + break; + if (r < 0) + return log_debug_errno(r, "Failed to wait device node symlink %s: %m", node); + } + + try_open: + if (fd < 0) { + /* Now, the device is activated and devlink is created. Let's open it. 
*/ + fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + if (!ERRNO_IS_DEVICE_ABSENT(errno)) + return log_debug_errno(errno, "Failed to open verity device %s: %m", node); + + /* The device has already been removed?? */ + goto try_again; + } + } + + /* Everything looks good and we'll be able to mount the device, so deferred remove will be re-enabled at that point. */ + restore_deferred_remove = mfree(restore_deferred_remove); + + mount_node_fd = TAKE_FD(fd); + if (existing_cd) + crypt_free_and_replace(cd, existing_cd); + + goto success; + + try_again: + /* Device is being removed by another process. Let's wait for a while. */ + (void) usleep_safe(2 * USEC_PER_MSEC); + } + + /* All trials failed or a conflicting verity device exists. Let's try to activate with a unique name. */ + if (FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) { + /* Before trying to activate with unique name, we need to free crypt_device object. + * Otherwise, we get error from libcryptsetup like the following: + * ------ + * systemd[1234]: Cannot use device /dev/loop5 which is in use (already mapped or mounted). 
+ * ------ + */ + sym_crypt_free(cd); + cd = NULL; + return verity_partition(designator, m, v, verity, flags & ~DISSECT_IMAGE_VERITY_SHARE, d); + } + + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "All attempts to activate verity device %s failed.", name); + +success: + d->decrypted[d->n_decrypted++] = (DecryptedPartition) { + .name = TAKE_PTR(name), + .device = TAKE_PTR(cd), + }; + + m->decrypted_node = TAKE_PTR(node); + close_and_replace(m->mount_node_fd, mount_node_fd); + + return 0; +} +#endif + +int dissected_image_decrypt( + DissectedImage *m, + const char *passphrase, + const VeritySettings *verity, + DissectImageFlags flags) { + +#if HAVE_LIBCRYPTSETUP + _cleanup_(decrypted_image_unrefp) DecryptedImage *d = NULL; + int r; +#endif + + assert(m); + assert(!verity || verity->root_hash || verity->root_hash_size == 0); + + /* Returns: + * + * = 0 → There was nothing to decrypt + * > 0 → Decrypted successfully + * -ENOKEY → There's something to decrypt but no key was supplied + * -EKEYREJECTED → Passed key was not correct + */ + + if (verity && verity->root_hash && verity->root_hash_size < sizeof(sd_id128_t)) + return -EINVAL; + + if (!m->encrypted && !m->verity_ready) + return 0; + +#if HAVE_LIBCRYPTSETUP + r = decrypted_image_new(&d); + if (r < 0) + return r; + + for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) { + DissectedPartition *p = m->partitions + i; + PartitionDesignator k; + + if (!p->found) + continue; + + r = decrypt_partition(p, passphrase, flags, d); + if (r < 0) + return r; + + k = partition_verity_of(i); + if (k >= 0) { + r = verity_partition(i, p, m->partitions + k, verity, flags | DISSECT_IMAGE_VERITY_SHARE, d); + if (r < 0) + return r; + } + + if (!p->decrypted_fstype && p->mount_node_fd >= 0 && p->decrypted_node) { + r = probe_filesystem_full(p->mount_node_fd, p->decrypted_node, 0, UINT64_MAX, &p->decrypted_fstype); + if (r < 0 && r != -EUCLEAN) + return r; + } + } + + m->decrypted_image = TAKE_PTR(d); + + return 1; 
+#else + return -EOPNOTSUPP; +#endif +} + +int dissected_image_decrypt_interactively( + DissectedImage *m, + const char *passphrase, + const VeritySettings *verity, + DissectImageFlags flags) { + + _cleanup_strv_free_erase_ char **z = NULL; + int n = 3, r; + + if (passphrase) + n--; + + for (;;) { + r = dissected_image_decrypt(m, passphrase, verity, flags); + if (r >= 0) + return r; + if (r == -EKEYREJECTED) + log_error_errno(r, "Incorrect passphrase, try again!"); + else if (r != -ENOKEY) + return log_error_errno(r, "Failed to decrypt image: %m"); + + if (--n < 0) + return log_error_errno(SYNTHETIC_ERRNO(EKEYREJECTED), + "Too many retries."); + + z = strv_free(z); + + r = ask_password_auto("Please enter image passphrase:", NULL, "dissect", "dissect", "dissect.passphrase", USEC_INFINITY, 0, &z); + if (r < 0) + return log_error_errno(r, "Failed to query for passphrase: %m"); + + passphrase = z[0]; + } +} + +static int decrypted_image_relinquish(DecryptedImage *d) { + assert(d); + + /* Turns on automatic removal after the last use ended for all DM devices of this image, and sets a + * boolean so that we don't clean it up ourselves either anymore */ + +#if HAVE_LIBCRYPTSETUP + int r; + + for (size_t i = 0; i < d->n_decrypted; i++) { + DecryptedPartition *p = d->decrypted + i; + + if (p->relinquished) + continue; + + r = sym_crypt_deactivate_by_name(NULL, p->name, CRYPT_DEACTIVATE_DEFERRED); + if (r < 0) + return log_debug_errno(r, "Failed to mark %s for auto-removal: %m", p->name); + + p->relinquished = true; + } +#endif + + return 0; +} + +int dissected_image_relinquish(DissectedImage *m) { + int r; + + assert(m); + + if (m->decrypted_image) { + r = decrypted_image_relinquish(m->decrypted_image); + if (r < 0) + return r; + } + + if (m->loop) + loop_device_relinquish(m->loop); + + return 0; +} + +static char *build_auxiliary_path(const char *image, const char *suffix) { + const char *e; + char *n; + + assert(image); + assert(suffix); + + e = endswith(image, ".raw"); + 
if (!e) + return strjoin(e, suffix); + + n = new(char, e - image + strlen(suffix) + 1); + if (!n) + return NULL; + + strcpy(mempcpy(n, image, e - image), suffix); + return n; +} + +void verity_settings_done(VeritySettings *v) { + assert(v); + + v->root_hash = mfree(v->root_hash); + v->root_hash_size = 0; + + v->root_hash_sig = mfree(v->root_hash_sig); + v->root_hash_sig_size = 0; + + v->data_path = mfree(v->data_path); +} + +int verity_settings_load( + VeritySettings *verity, + const char *image, + const char *root_hash_path, + const char *root_hash_sig_path) { + + _cleanup_free_ void *root_hash = NULL, *root_hash_sig = NULL; + size_t root_hash_size = 0, root_hash_sig_size = 0; + _cleanup_free_ char *verity_data_path = NULL; + PartitionDesignator designator; + int r; + + assert(verity); + assert(image); + assert(verity->designator < 0 || IN_SET(verity->designator, PARTITION_ROOT, PARTITION_USR)); + + /* If we are asked to load the root hash for a device node, exit early */ + if (is_device_path(image)) + return 0; + + r = getenv_bool_secure("SYSTEMD_DISSECT_VERITY_SIDECAR"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_DISSECT_VERITY_SIDECAR, ignoring: %m"); + if (r == 0) + return 0; + + designator = verity->designator; + + /* We only fill in what isn't already filled in */ + + if (!verity->root_hash) { + _cleanup_free_ char *text = NULL; + + if (root_hash_path) { + /* If explicitly specified it takes precedence */ + r = read_one_line_file(root_hash_path, &text); + if (r < 0) + return r; + + if (designator < 0) + designator = PARTITION_ROOT; + } else { + /* Otherwise look for xattr and separate file, and first for the data for root and if + * that doesn't exist for /usr */ + + if (designator < 0 || designator == PARTITION_ROOT) { + r = getxattr_malloc(image, "user.verity.roothash", &text); + if (r < 0) { + _cleanup_free_ char *p = NULL; + + if (r != -ENOENT && !ERRNO_IS_XATTR_ABSENT(r)) + return r; + + p = build_auxiliary_path(image, 
".roothash"); + if (!p) + return -ENOMEM; + + r = read_one_line_file(p, &text); + if (r < 0 && r != -ENOENT) + return r; + } + + if (text) + designator = PARTITION_ROOT; + } + + if (!text && (designator < 0 || designator == PARTITION_USR)) { + /* So in the "roothash" xattr/file name above the "root" of course primarily + * refers to the root of the Verity Merkle tree. But coincidentally it also + * is the hash for the *root* file system, i.e. the "root" neatly refers to + * two distinct concepts called "root". Taking benefit of this happy + * coincidence we call the file with the root hash for the /usr/ file system + * `usrhash`, because `usrroothash` or `rootusrhash` would just be too + * confusing. We thus drop the reference to the root of the Merkle tree, and + * just indicate which file system it's about. */ + r = getxattr_malloc(image, "user.verity.usrhash", &text); + if (r < 0) { + _cleanup_free_ char *p = NULL; + + if (r != -ENOENT && !ERRNO_IS_XATTR_ABSENT(r)) + return r; + + p = build_auxiliary_path(image, ".usrhash"); + if (!p) + return -ENOMEM; + + r = read_one_line_file(p, &text); + if (r < 0 && r != -ENOENT) + return r; + } + + if (text) + designator = PARTITION_USR; + } + } + + if (text) { + r = unhexmem(text, strlen(text), &root_hash, &root_hash_size); + if (r < 0) + return r; + if (root_hash_size < sizeof(sd_id128_t)) + return -EINVAL; + } + } + + if ((root_hash || verity->root_hash) && !verity->root_hash_sig) { + if (root_hash_sig_path) { + r = read_full_file(root_hash_sig_path, (char**) &root_hash_sig, &root_hash_sig_size); + if (r < 0 && r != -ENOENT) + return r; + + if (designator < 0) + designator = PARTITION_ROOT; + } else { + if (designator < 0 || designator == PARTITION_ROOT) { + _cleanup_free_ char *p = NULL; + + /* Follow naming convention recommended by the relevant RFC: + * https://tools.ietf.org/html/rfc5751#section-3.2.1 */ + p = build_auxiliary_path(image, ".roothash.p7s"); + if (!p) + return -ENOMEM; + + r = read_full_file(p, 
(char**) &root_hash_sig, &root_hash_sig_size); + if (r < 0 && r != -ENOENT) + return r; + if (r >= 0) + designator = PARTITION_ROOT; + } + + if (!root_hash_sig && (designator < 0 || designator == PARTITION_USR)) { + _cleanup_free_ char *p = NULL; + + p = build_auxiliary_path(image, ".usrhash.p7s"); + if (!p) + return -ENOMEM; + + r = read_full_file(p, (char**) &root_hash_sig, &root_hash_sig_size); + if (r < 0 && r != -ENOENT) + return r; + if (r >= 0) + designator = PARTITION_USR; + } + } + + if (root_hash_sig && root_hash_sig_size == 0) /* refuse empty size signatures */ + return -EINVAL; + } + + if (!verity->data_path) { + _cleanup_free_ char *p = NULL; + + p = build_auxiliary_path(image, ".verity"); + if (!p) + return -ENOMEM; + + if (access(p, F_OK) < 0) { + if (errno != ENOENT) + return -errno; + } else + verity_data_path = TAKE_PTR(p); + } + + if (root_hash) { + verity->root_hash = TAKE_PTR(root_hash); + verity->root_hash_size = root_hash_size; + } + + if (root_hash_sig) { + verity->root_hash_sig = TAKE_PTR(root_hash_sig); + verity->root_hash_sig_size = root_hash_sig_size; + } + + if (verity_data_path) + verity->data_path = TAKE_PTR(verity_data_path); + + if (verity->designator < 0) + verity->designator = designator; + + return 1; +} + +int dissected_image_load_verity_sig_partition( + DissectedImage *m, + int fd, + VeritySettings *verity) { + + _cleanup_free_ void *root_hash = NULL, *root_hash_sig = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + size_t root_hash_size, root_hash_sig_size; + _cleanup_free_ char *buf = NULL; + PartitionDesignator d; + DissectedPartition *p; + JsonVariant *rh, *sig; + ssize_t n; + char *e; + int r; + + assert(m); + assert(fd >= 0); + assert(verity); + + if (verity->root_hash && verity->root_hash_sig) /* Already loaded? 
*/ + return 0; + + r = getenv_bool_secure("SYSTEMD_DISSECT_VERITY_EMBEDDED"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_DISSECT_VERITY_EMBEDDED, ignoring: %m"); + if (r == 0) + return 0; + + d = partition_verity_sig_of(verity->designator < 0 ? PARTITION_ROOT : verity->designator); + assert(d >= 0); + + p = m->partitions + d; + if (!p->found) + return 0; + if (p->offset == UINT64_MAX || p->size == UINT64_MAX) + return -EINVAL; + + if (p->size > 4*1024*1024) /* Signature data cannot possible be larger than 4M, refuse that */ + return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "Verity signature partition is larger than 4M, refusing."); + + buf = new(char, p->size+1); + if (!buf) + return -ENOMEM; + + n = pread(fd, buf, p->size, p->offset); + if (n < 0) + return -ENOMEM; + if ((uint64_t) n != p->size) + return -EIO; + + e = memchr(buf, 0, p->size); + if (e) { + /* If we found a NUL byte then the rest of the data must be NUL too */ + if (!memeqzero(e, p->size - (e - buf))) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature data contains embedded NUL byte."); + } else + buf[p->size] = 0; + + r = json_parse(buf, 0, &v, NULL, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to parse signature JSON data: %m"); + + rh = json_variant_by_key(v, "rootHash"); + if (!rh) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature JSON object lacks 'rootHash' field."); + if (!json_variant_is_string(rh)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'rootHash' field of signature JSON object is not a string."); + + r = unhexmem(json_variant_string(rh), SIZE_MAX, &root_hash, &root_hash_size); + if (r < 0) + return log_debug_errno(r, "Failed to parse root hash field: %m"); + + /* Check if specified root hash matches if it is specified */ + if (verity->root_hash && + memcmp_nn(verity->root_hash, verity->root_hash_size, root_hash, root_hash_size) != 0) { + _cleanup_free_ char *a = NULL, *b = NULL; + + a = hexmem(root_hash, 
root_hash_size); + b = hexmem(verity->root_hash, verity->root_hash_size); + + return log_debug_errno(r, "Root hash in signature JSON data (%s) doesn't match configured hash (%s).", strna(a), strna(b)); + } + + sig = json_variant_by_key(v, "signature"); + if (!sig) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature JSON object lacks 'signature' field."); + if (!json_variant_is_string(sig)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'signature' field of signature JSON object is not a string."); + + r = unbase64mem(json_variant_string(sig), SIZE_MAX, &root_hash_sig, &root_hash_sig_size); + if (r < 0) + return log_debug_errno(r, "Failed to parse signature field: %m"); + + free_and_replace(verity->root_hash, root_hash); + verity->root_hash_size = root_hash_size; + + free_and_replace(verity->root_hash_sig, root_hash_sig); + verity->root_hash_sig_size = root_hash_sig_size; + + return 1; +} + +int dissected_image_acquire_metadata(DissectedImage *m, DissectImageFlags extra_flags) { + + enum { + META_HOSTNAME, + META_MACHINE_ID, + META_MACHINE_INFO, + META_OS_RELEASE, + META_INITRD_RELEASE, + META_SYSEXT_RELEASE, + META_CONFEXT_RELEASE, + META_HAS_INIT_SYSTEM, + _META_MAX, + }; + + static const char *const paths[_META_MAX] = { + [META_HOSTNAME] = "/etc/hostname\0", + [META_MACHINE_ID] = "/etc/machine-id\0", + [META_MACHINE_INFO] = "/etc/machine-info\0", + [META_OS_RELEASE] = "/etc/os-release\0" + "/usr/lib/os-release\0", + [META_INITRD_RELEASE] = "/etc/initrd-release\0" + "/usr/lib/initrd-release\0", + [META_SYSEXT_RELEASE] = "sysext-release\0", /* String used only for logging. 
*/ + [META_CONFEXT_RELEASE] = "confext-release\0", /* ditto */ + [META_HAS_INIT_SYSTEM] = "has-init-system\0", /* ditto */ + }; + + _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **initrd_release = NULL, **sysext_release = NULL, **confext_release = NULL; + _cleanup_close_pair_ int error_pipe[2] = EBADF_PAIR; + _cleanup_(rmdir_and_freep) char *t = NULL; + _cleanup_(sigkill_waitp) pid_t child = 0; + sd_id128_t machine_id = SD_ID128_NULL; + _cleanup_free_ char *hostname = NULL; + unsigned n_meta_initialized = 0; + int fds[2 * _META_MAX], r, v; + int has_init_system = -1; + ssize_t n; + + BLOCK_SIGNALS(SIGCHLD); + + assert(m); + + for (; n_meta_initialized < _META_MAX; n_meta_initialized ++) { + if (!paths[n_meta_initialized]) { + fds[2*n_meta_initialized] = fds[2*n_meta_initialized+1] = -EBADF; + continue; + } + + if (pipe2(fds + 2*n_meta_initialized, O_CLOEXEC) < 0) { + r = -errno; + goto finish; + } + } + + r = mkdtemp_malloc("/tmp/dissect-XXXXXX", &t); + if (r < 0) + goto finish; + + if (pipe2(error_pipe, O_CLOEXEC) < 0) { + r = -errno; + goto finish; + } + + r = safe_fork("(sd-dissect)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, &child); + if (r < 0) + goto finish; + if (r == 0) { + /* Child in a new mount namespace */ + error_pipe[0] = safe_close(error_pipe[0]); + + r = dissected_image_mount( + m, + t, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + extra_flags | + DISSECT_IMAGE_READ_ONLY | + DISSECT_IMAGE_MOUNT_ROOT_ONLY | + DISSECT_IMAGE_USR_NO_ROOT); + if (r < 0) { + log_debug_errno(r, "Failed to mount dissected image: %m"); + goto inner_fail; + } + + for (unsigned k = 0; k < _META_MAX; k++) { + _cleanup_close_ int fd = -ENOENT; + + if (!paths[k]) + continue; + + fds[2*k] = safe_close(fds[2*k]); + + switch (k) { + + case META_SYSEXT_RELEASE: + if (!m->image_name) + goto next; + + /* As per the os-release spec, if the image is an extension it will have a + * 
file named after the image name in extension-release.d/ - we use the image + * name and try to resolve it with the extension-release helpers, as + * sometimes the image names are mangled on deployment and do not match + * anymore. Unlike other paths this is not fixed, and the image name can be + * mangled on deployment, so by calling into the helper we allow a fallback + * that matches on the first extension-release file found in the directory, + * if one named after the image cannot be found first. */ + r = open_extension_release( + t, + IMAGE_SYSEXT, + m->image_name, + /* relax_extension_release_check= */ false, + /* ret_path= */ NULL, + &fd); + if (r < 0) + fd = r; + break; + + case META_CONFEXT_RELEASE: + if (!m->image_name) + goto next; + + /* As above */ + r = open_extension_release( + t, + IMAGE_CONFEXT, + m->image_name, + /* relax_extension_release_check= */ false, + /* ret_path= */ NULL, + &fd); + if (r < 0) + fd = r; + + break; + + case META_HAS_INIT_SYSTEM: { + bool found = false; + + FOREACH_STRING(init, + "/usr/lib/systemd/systemd", /* systemd on /usr/ merged system */ + "/lib/systemd/systemd", /* systemd on /usr/ non-merged systems */ + "/sbin/init") { /* traditional path the Linux kernel invokes */ + + r = chase(init, t, CHASE_PREFIX_ROOT, NULL, NULL); + if (r < 0) { + if (r != -ENOENT) + log_debug_errno(r, "Failed to resolve %s, ignoring: %m", init); + } else { + found = true; + break; + } + } + + r = loop_write(fds[2*k+1], &found, sizeof(found)); + if (r < 0) + goto inner_fail; + + goto next; + } + + default: + NULSTR_FOREACH(p, paths[k]) { + fd = chase_and_open(p, t, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL); + if (fd >= 0) + break; + } + } + + if (fd < 0) { + log_debug_errno(fd, "Failed to read %s file of image, ignoring: %m", paths[k]); + goto next; + } + + r = copy_bytes(fd, fds[2*k+1], UINT64_MAX, 0); + if (r < 0) + goto inner_fail; + + next: + fds[2*k+1] = safe_close(fds[2*k+1]); + } + + _exit(EXIT_SUCCESS); + + inner_fail: + /* 
Let parent know the error */ + (void) write(error_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + error_pipe[1] = safe_close(error_pipe[1]); + + for (unsigned k = 0; k < _META_MAX; k++) { + _cleanup_fclose_ FILE *f = NULL; + + if (!paths[k]) + continue; + + fds[2*k+1] = safe_close(fds[2*k+1]); + + f = take_fdopen(&fds[2*k], "r"); + if (!f) { + r = -errno; + goto finish; + } + + switch (k) { + + case META_HOSTNAME: + r = read_etc_hostname_stream(f, &hostname); + if (r < 0) + log_debug_errno(r, "Failed to read /etc/hostname of image: %m"); + + break; + + case META_MACHINE_ID: { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + log_debug_errno(r, "Failed to read /etc/machine-id of image: %m"); + else if (r == 33) { + r = sd_id128_from_string(line, &machine_id); + if (r < 0) + log_debug_errno(r, "Image contains invalid /etc/machine-id: %s", line); + } else if (r == 0) + log_debug("/etc/machine-id file of image is empty."); + else if (streq(line, "uninitialized")) + log_debug("/etc/machine-id file of image is uninitialized (likely aborted first boot)."); + else + log_debug("/etc/machine-id file of image has unexpected length %i.", r); + + break; + } + + case META_MACHINE_INFO: + r = load_env_file_pairs(f, "machine-info", &machine_info); + if (r < 0) + log_debug_errno(r, "Failed to read /etc/machine-info of image: %m"); + + break; + + case META_OS_RELEASE: + r = load_env_file_pairs(f, "os-release", &os_release); + if (r < 0) + log_debug_errno(r, "Failed to read OS release file of image: %m"); + + break; + + case META_INITRD_RELEASE: + r = load_env_file_pairs(f, "initrd-release", &initrd_release); + if (r < 0) + log_debug_errno(r, "Failed to read initrd release file of image: %m"); + + break; + + case META_SYSEXT_RELEASE: + r = load_env_file_pairs(f, "sysext-release", &sysext_release); + if (r < 0) + log_debug_errno(r, "Failed to read sysext release file of image: %m"); + + break; + + case META_CONFEXT_RELEASE: + r = 
load_env_file_pairs(f, "confext-release", &confext_release); + if (r < 0) + log_debug_errno(r, "Failed to read confext release file of image: %m"); + + break; + + case META_HAS_INIT_SYSTEM: { + bool b = false; + size_t nr; + + errno = 0; + nr = fread(&b, 1, sizeof(b), f); + if (nr != sizeof(b)) + log_debug_errno(errno_or_else(EIO), "Failed to read has-init-system boolean: %m"); + else + has_init_system = b; + + break; + }} + } + + r = wait_for_terminate_and_check("(sd-dissect)", child, 0); + child = 0; + if (r < 0) + goto finish; + + n = read(error_pipe[0], &v, sizeof(v)); + if (n < 0) { + r = -errno; + goto finish; + } + if (n == sizeof(v)) { + r = v; /* propagate error sent to us from child */ + goto finish; + } + if (n != 0) { + r = -EIO; + goto finish; + } + if (r != EXIT_SUCCESS) { + r = -EPROTO; + goto finish; + } + + free_and_replace(m->hostname, hostname); + m->machine_id = machine_id; + strv_free_and_replace(m->machine_info, machine_info); + strv_free_and_replace(m->os_release, os_release); + strv_free_and_replace(m->initrd_release, initrd_release); + strv_free_and_replace(m->sysext_release, sysext_release); + strv_free_and_replace(m->confext_release, confext_release); + m->has_init_system = has_init_system; + +finish: + for (unsigned k = 0; k < n_meta_initialized; k++) + safe_close_pair(fds + 2*k); + + return r; +} + +Architecture dissected_image_architecture(DissectedImage *img) { + assert(img); + + if (img->partitions[PARTITION_ROOT].found && + img->partitions[PARTITION_ROOT].architecture >= 0) + return img->partitions[PARTITION_ROOT].architecture; + + if (img->partitions[PARTITION_USR].found && + img->partitions[PARTITION_USR].architecture >= 0) + return img->partitions[PARTITION_USR].architecture; + + return _ARCHITECTURE_INVALID; +} + +int dissect_loop_device( + LoopDevice *loop, + const VeritySettings *verity, + const MountOptions *mount_options, + const ImagePolicy *image_policy, + DissectImageFlags flags, + DissectedImage **ret) { + +#if 
HAVE_BLKID + _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; + int r; + + assert(loop); + + r = dissected_image_new(loop->backing_file ?: loop->node, &m); + if (r < 0) + return r; + + m->loop = loop_device_ref(loop); + m->sector_size = m->loop->sector_size; + + r = dissect_image(m, loop->fd, loop->node, verity, mount_options, image_policy, flags); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(m); + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int dissect_loop_device_and_warn( + LoopDevice *loop, + const VeritySettings *verity, + const MountOptions *mount_options, + const ImagePolicy *image_policy, + DissectImageFlags flags, + DissectedImage **ret) { + + assert(loop); + + return dissect_log_error( + LOG_ERR, + dissect_loop_device(loop, verity, mount_options, image_policy, flags, ret), + loop->backing_file ?: loop->node, + verity); + +} + +bool dissected_image_verity_candidate(const DissectedImage *image, PartitionDesignator partition_designator) { + assert(image); + + /* Checks if this partition could theoretically do Verity. For non-partitioned images this only works + * if there's an external verity file supplied, for which we can consult .has_verity. For partitioned + * images we only check the partition type. + * + * This call is used to decide whether to suppress or show a verity column in tabular output of the + * image. */ + + if (image->single_file_system) + return partition_designator == PARTITION_ROOT && image->has_verity; + + return partition_verity_of(partition_designator) >= 0; +} + +bool dissected_image_verity_ready(const DissectedImage *image, PartitionDesignator partition_designator) { + PartitionDesignator k; + + assert(image); + + /* Checks if this partition has verity data available that we can activate. For non-partitioned this + * works for the root partition, for others only if the associated verity partition was found. 
*/ + + if (!image->verity_ready) + return false; + + if (image->single_file_system) + return partition_designator == PARTITION_ROOT; + + k = partition_verity_of(partition_designator); + return k >= 0 && image->partitions[k].found; +} + +bool dissected_image_verity_sig_ready(const DissectedImage *image, PartitionDesignator partition_designator) { + PartitionDesignator k; + + assert(image); + + /* Checks if this partition has verity signature data available that we can use. */ + + if (!image->verity_sig_ready) + return false; + + if (image->single_file_system) + return partition_designator == PARTITION_ROOT; + + k = partition_verity_sig_of(partition_designator); + return k >= 0 && image->partitions[k].found; +} + +MountOptions* mount_options_free_all(MountOptions *options) { + MountOptions *m; + + while ((m = LIST_POP(mount_options, options))) { + free(m->options); + free(m); + } + + return NULL; +} + +const char* mount_options_from_designator(const MountOptions *options, PartitionDesignator designator) { + LIST_FOREACH(mount_options, m, options) + if (designator == m->partition_designator && !isempty(m->options)) + return m->options; + + return NULL; +} + +int mount_image_privately_interactively( + const char *image, + const ImagePolicy *image_policy, + DissectImageFlags flags, + char **ret_directory, + int *ret_dir_fd, + LoopDevice **ret_loop_device) { + + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; + _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_free_ char *dir = NULL; + int r; + + /* Mounts an OS image at a temporary place, inside a newly created mount namespace of our own. This + * is used by tools such as systemd-tmpfiles or systemd-firstboot to operate on some disk image + * easily. */ + + assert(image); + assert(ret_loop_device); + + /* We intend to mount this right-away, hence add the partitions if needed and pin them. 
*/ + flags |= DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES; + + r = verity_settings_load(&verity, image, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to load root hash data: %m"); + + r = loop_device_make_by_path( + image, + FLAGS_SET(flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR, + /* sector_size= */ UINT32_MAX, + FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &d); + if (r < 0) + return log_error_errno(r, "Failed to set up loopback device for %s: %m", image); + + r = dissect_loop_device_and_warn( + d, + &verity, + /* mount_options= */ NULL, + image_policy, + flags, + &dissected_image); + if (r < 0) + return r; + + r = dissected_image_load_verity_sig_partition(dissected_image, d->fd, &verity); + if (r < 0) + return r; + + r = dissected_image_decrypt_interactively(dissected_image, NULL, &verity, flags); + if (r < 0) + return r; + + r = detach_mount_namespace(); + if (r < 0) + return log_error_errno(r, "Failed to detach mount namespace: %m"); + + r = mkdir_p("/run/systemd/mount-rootfs", 0555); + if (r < 0) + return log_error_errno(r, "Failed to create mount point: %m"); + + r = dissected_image_mount_and_warn( + dissected_image, + "/run/systemd/mount-rootfs", + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + flags); + if (r < 0) + return r; + + r = loop_device_flock(d, LOCK_UN); + if (r < 0) + return r; + + r = dissected_image_relinquish(dissected_image); + if (r < 0) + return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m"); + + if (ret_directory) { + dir = strdup("/run/systemd/mount-rootfs"); + if (!dir) + return log_oom(); + } + + if (ret_dir_fd) { + _cleanup_close_ int dir_fd = -EBADF; + + dir_fd = open("/run/systemd/mount-rootfs", O_CLOEXEC|O_DIRECTORY); + if (dir_fd < 0) + return log_error_errno(errno, "Failed to open mount point directory: %m"); + + *ret_dir_fd = TAKE_FD(dir_fd); + } + + if 
(ret_directory) + *ret_directory = TAKE_PTR(dir); + + *ret_loop_device = TAKE_PTR(d); + return 0; +} + +static bool mount_options_relax_extension_release_checks(const MountOptions *options) { + if (!options) + return false; + + return string_contains_word(mount_options_from_designator(options, PARTITION_ROOT), ",", "x-systemd.relax-extension-release-check") || + string_contains_word(mount_options_from_designator(options, PARTITION_USR), ",", "x-systemd.relax-extension-release-check") || + string_contains_word(options->options, ",", "x-systemd.relax-extension-release-check"); +} + +int verity_dissect_and_mount( + int src_fd, + const char *src, + const char *dest, + const MountOptions *options, + const ImagePolicy *image_policy, + const char *required_host_os_release_id, + const char *required_host_os_release_version_id, + const char *required_host_os_release_sysext_level, + const char *required_host_os_release_confext_level, + const char *required_sysext_scope, + DissectedImage **ret_image) { + + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; + DissectImageFlags dissect_image_flags; + bool relax_extension_release_check; + int r; + + assert(src); + /* Verifying release metadata requires mounted image for now, so ensure the check is skipped when + * opening an image without mounting it immediately (i.e.: 'dest' is NULL). */ + assert(!required_host_os_release_id || dest); + + relax_extension_release_check = mount_options_relax_extension_release_checks(options); + + /* We might get an FD for the image, but we use the original path to look for the dm-verity files */ + r = verity_settings_load(&verity, src, NULL, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to load root hash: %m"); + + dissect_image_flags = (verity.data_path ? 
DISSECT_IMAGE_NO_PARTITION_TABLE : 0) | + (relax_extension_release_check ? DISSECT_IMAGE_RELAX_EXTENSION_CHECK : 0) | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES; + + /* Note that we don't use loop_device_make here, as the FD is most likely O_PATH which would not be + * accepted by LOOP_CONFIGURE, so just let loop_device_make_by_path reopen it as a regular FD. */ + r = loop_device_make_by_path( + src_fd >= 0 ? FORMAT_PROC_FD_PATH(src_fd) : src, + /* open_flags= */ -1, + /* sector_size= */ UINT32_MAX, + verity.data_path ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &loop_device); + if (r < 0) + return log_debug_errno(r, "Failed to create loop device for image: %m"); + + r = dissect_loop_device( + loop_device, + &verity, + options, + image_policy, + dissect_image_flags, + &dissected_image); + /* No partition table? Might be a single-filesystem image, try again */ + if (!verity.data_path && r == -ENOPKG) + r = dissect_loop_device( + loop_device, + &verity, + options, + image_policy, + dissect_image_flags | DISSECT_IMAGE_NO_PARTITION_TABLE, + &dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to dissect image: %m"); + + r = dissected_image_load_verity_sig_partition(dissected_image, loop_device->fd, &verity); + if (r < 0) + return r; + + r = dissected_image_decrypt( + dissected_image, + NULL, + &verity, + dissect_image_flags); + if (r < 0) + return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + + if (dest) { + r = mkdir_p_label(dest, 0755); + if (r < 0) + return log_debug_errno(r, "Failed to create destination directory %s: %m", dest); + r = umount_recursive(dest, 0); + if (r < 0) + return log_debug_errno(r, "Failed to umount under destination directory %s: %m", dest); + } + + r = dissected_image_mount( + dissected_image, + dest, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + dissect_image_flags); + if (r < 0) + return log_debug_errno(r, "Failed to mount image: %m"); 
+ + r = loop_device_flock(loop_device, LOCK_UN); + if (r < 0) + return log_debug_errno(r, "Failed to unlock loopback device: %m"); + + /* If we got os-release values from the caller, then we need to match them with the image's + * extension-release.d/ content. Return -EINVAL if there's any mismatch. + * First, check the distro ID. If that matches, then check the new SYSEXT_LEVEL value if + * available, or else fallback to VERSION_ID. If neither is present (eg: rolling release), + * then a simple match on the ID will be performed. */ + if (required_host_os_release_id) { + _cleanup_strv_free_ char **extension_release = NULL; + ImageClass class = IMAGE_SYSEXT; + + assert(!isempty(required_host_os_release_id)); + + r = load_extension_release_pairs(dest, IMAGE_SYSEXT, dissected_image->image_name, relax_extension_release_check, &extension_release); + if (r == -ENOENT) { + r = load_extension_release_pairs(dest, IMAGE_CONFEXT, dissected_image->image_name, relax_extension_release_check, &extension_release); + if (r >= 0) + class = IMAGE_CONFEXT; + } + if (r < 0) + return log_debug_errno(r, "Failed to parse image %s extension-release metadata: %m", dissected_image->image_name); + + r = extension_release_validate( + dissected_image->image_name, + required_host_os_release_id, + required_host_os_release_version_id, + class == IMAGE_SYSEXT ? 
required_host_os_release_sysext_level : required_host_os_release_confext_level, + required_sysext_scope, + extension_release, + class); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Image %s extension-release metadata does not match the root's", dissected_image->image_name); + if (r < 0) + return log_debug_errno(r, "Failed to compare image %s extension-release metadata with the root's os-release: %m", dissected_image->image_name); + } + + r = dissected_image_relinquish(dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to relinquish dissected image: %m"); + + if (ret_image) + *ret_image = TAKE_PTR(dissected_image); + + return 0; +} diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h new file mode 100644 index 0000000..15c0bf7 --- /dev/null +++ b/src/shared/dissect-image.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-id128.h" + +#include "architecture.h" +#include "env-util.h" +#include "gpt.h" +#include "list.h" +#include "loop-util.h" +#include "macro.h" +#include "os-util.h" +#include "strv.h" + +typedef struct DissectedImage DissectedImage; +typedef struct DissectedPartition DissectedPartition; +typedef struct DecryptedImage DecryptedImage; +typedef struct MountOptions MountOptions; +typedef struct VeritySettings VeritySettings; + +struct DissectedPartition { + bool found:1; + bool ignored:1; + bool rw:1; + bool growfs:1; + int partno; /* -1 if there was no partition and the images contains a file system directly */ + Architecture architecture; /* Intended architecture: either native, secondary or unset ARCHITECTURE_INVALID. 
*/ + sd_id128_t uuid; /* Partition entry UUID as reported by the GPT */ + char *fstype; + char *node; + char *label; + char *decrypted_node; + char *decrypted_fstype; + char *mount_options; + int mount_node_fd; + uint64_t size; + uint64_t offset; + uint64_t gpt_flags; + int fsmount_fd; +}; + +#define DISSECTED_PARTITION_NULL \ + ((DissectedPartition) { \ + .partno = -1, \ + .architecture = _ARCHITECTURE_INVALID, \ + .mount_node_fd = -EBADF, \ + .fsmount_fd = -EBADF, \ + }) +#define TAKE_PARTITION(p) \ + ({ \ + DissectedPartition *_pp = &(p), _p = *_pp; \ + *_pp = DISSECTED_PARTITION_NULL; \ + _p; \ + }) + +typedef enum DissectImageFlags { + DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */ + DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */ + DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */ + DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */ + DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_DISCARD | + DISSECT_IMAGE_DISCARD_ON_CRYPTO, + DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */ + DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */ + DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */ + DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */ + DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting images that aren't identifiable as OS images */ + DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */ + DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */ + DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when 
combined with DISSECT_IMAGE_READ_ONLY) */ + DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */ + DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */ + DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */ + DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */ + DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */ + DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */ + DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY | + DISSECT_IMAGE_MOUNT_READ_ONLY, + DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */ + DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */ + DISSECT_IMAGE_ADD_PARTITION_DEVICES = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */ + DISSECT_IMAGE_PIN_PARTITION_DEVICES = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */ + DISSECT_IMAGE_RELAX_EXTENSION_CHECK = 1 << 22, /* Don't insist that the extension-release file name matches the image name */ + DISSECT_IMAGE_DISKSEQ_DEVNODE = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */ + DISSECT_IMAGE_ALLOW_EMPTY = 1 << 24, /* Allow that no usable partitions is present */ + DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE = 1 << 25, /* Try to mount the image beneath the specified mountpoint, rather than on top of it, and then umount the top */ +} DissectImageFlags; + +struct DissectedImage { + bool encrypted:1; + bool has_verity:1; /* verity available in image, but not necessarily used */ + bool has_verity_sig:1; /* pkcs#7 
signature embedded in image */ + bool verity_ready:1; /* verity available, fully specified and usable */ + bool verity_sig_ready:1; /* verity signature logic, fully specified and usable */ + bool single_file_system:1; /* MBR/GPT or single file system */ + + LoopDevice *loop; + DissectedPartition partitions[_PARTITION_DESIGNATOR_MAX]; + DecryptedImage *decrypted_image; + + uint32_t sector_size; + + char *image_name; + sd_id128_t image_uuid; + + /* Meta information extracted from /etc/os-release and similar */ + char *hostname; + sd_id128_t machine_id; + char **machine_info; + char **os_release; + char **initrd_release; + char **confext_release; + char **sysext_release; + int has_init_system; +}; + +struct MountOptions { + PartitionDesignator partition_designator; + char *options; + LIST_FIELDS(MountOptions, mount_options); +}; + +struct VeritySettings { + /* Binary root hash for the Verity Merkle tree */ + void *root_hash; + size_t root_hash_size; + + /* PKCS#7 signature of the above */ + void *root_hash_sig; + size_t root_hash_sig_size; + + /* Path to the verity data file, if stored externally */ + char *data_path; + + /* PARTITION_ROOT or PARTITION_USR, depending on what these Verity settings are for */ + PartitionDesignator designator; +}; + +#define VERITY_SETTINGS_DEFAULT { \ + .designator = _PARTITION_DESIGNATOR_INVALID \ + } + +/* We include image-policy.h down here, since ImagePolicy wants a complete definition of PartitionDesignator first. 
*/ +#include "image-policy.h" + +MountOptions* mount_options_free_all(MountOptions *options); +DEFINE_TRIVIAL_CLEANUP_FUNC(MountOptions*, mount_options_free_all); +const char* mount_options_from_designator(const MountOptions *options, PartitionDesignator designator); + +int probe_filesystem_full(int fd, const char *path, uint64_t offset, uint64_t size, char **ret_fstype); +static inline int probe_filesystem(const char *path, char **ret_fstype) { + return probe_filesystem_full(-1, path, 0, UINT64_MAX, ret_fstype); +} + +int dissect_log_error(int log_level, int r, const char *name, const VeritySettings *verity); +int dissect_image_file(const char *path, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret); +int dissect_image_file_and_warn(const char *path, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret); +int dissect_loop_device(LoopDevice *loop, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret); +int dissect_loop_device_and_warn(LoopDevice *loop, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret); + +DissectedImage* dissected_image_unref(DissectedImage *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(DissectedImage*, dissected_image_unref); + +int dissected_image_decrypt(DissectedImage *m, const char *passphrase, const VeritySettings *verity, DissectImageFlags flags); +int dissected_image_decrypt_interactively(DissectedImage *m, const char *passphrase, const VeritySettings *verity, DissectImageFlags flags); +int dissected_image_mount(DissectedImage *m, const char *dest, uid_t uid_shift, uid_t uid_range, int userns_fd, DissectImageFlags flags); +int dissected_image_mount_and_warn(DissectedImage *m, const char 
*where, uid_t uid_shift, uid_t uid_range, int userns_fd, DissectImageFlags flags); + +int dissected_image_acquire_metadata(DissectedImage *m, DissectImageFlags extra_flags); + +Architecture dissected_image_architecture(DissectedImage *m); + +static inline bool dissected_image_is_bootable_os(DissectedImage *m) { + return m && m->has_init_system > 0; +} + +static inline bool dissected_image_is_bootable_uefi(DissectedImage *m) { + return m && m->partitions[PARTITION_ESP].found && dissected_image_is_bootable_os(m); +} + +static inline bool dissected_image_is_portable(DissectedImage *m) { + return m && strv_env_pairs_get(m->os_release, "PORTABLE_PREFIXES"); +} + +static inline bool dissected_image_is_initrd(DissectedImage *m) { + return m && !strv_isempty(m->initrd_release); +} + +DecryptedImage* decrypted_image_ref(DecryptedImage *p); +DecryptedImage* decrypted_image_unref(DecryptedImage *p); +DEFINE_TRIVIAL_CLEANUP_FUNC(DecryptedImage*, decrypted_image_unref); + +int dissected_image_relinquish(DissectedImage *m); + +int verity_settings_load(VeritySettings *verity, const char *image, const char *root_hash_path, const char *root_hash_sig_path); +void verity_settings_done(VeritySettings *verity); + +static inline bool verity_settings_data_covers(const VeritySettings *verity, PartitionDesignator d) { + /* Returns true if the verity settings contain sufficient information to cover the specified partition */ + return verity && + ((d >= 0 && verity->designator == d) || (d == PARTITION_ROOT && verity->designator < 0)) && + verity->root_hash && + verity->data_path; +} + +int dissected_image_load_verity_sig_partition(DissectedImage *m, int fd, VeritySettings *verity); + +bool dissected_image_verity_candidate(const DissectedImage *image, PartitionDesignator d); +bool dissected_image_verity_ready(const DissectedImage *image, PartitionDesignator d); +bool dissected_image_verity_sig_ready(const DissectedImage *image, PartitionDesignator d); + +int 
mount_image_privately_interactively(const char *path, const ImagePolicy *image_policy, DissectImageFlags flags, char **ret_directory, int *ret_dir_fd, LoopDevice **ret_loop_device); + +int verity_dissect_and_mount(int src_fd, const char *src, const char *dest, const MountOptions *options, const ImagePolicy *image_policy, const char *required_host_os_release_id, const char *required_host_os_release_version_id, const char *required_host_os_release_sysext_level, const char *required_host_os_release_confext_level, const char *required_sysext_scope, DissectedImage **ret_image); + +int dissect_fstype_ok(const char *fstype); + +int probe_sector_size(int fd, uint32_t *ret); +int probe_sector_size_prefer_ioctl(int fd, uint32_t *ret); + +int partition_pick_mount_options(PartitionDesignator d, const char *fstype, bool rw, bool discard, char **ret_options, unsigned long *ret_ms_flags); + +static inline const char *dissected_partition_fstype(const DissectedPartition *m) { + assert(m); + + return m->decrypted_node ? m->decrypted_fstype : m->fstype; +} diff --git a/src/shared/dlfcn-util.c b/src/shared/dlfcn-util.c new file mode 100644 index 0000000..a321df3 --- /dev/null +++ b/src/shared/dlfcn-util.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dlfcn-util.h" + +static int dlsym_many_or_warnv(void *dl, int log_level, va_list ap) { + void (**fn)(void); + + /* Tries to resolve a bunch of function symbols, and logs an error about if it cannot resolve one of + * them. Note that this function possibly modifies the supplied function pointers if the whole + * operation fails. 
*/ + + while ((fn = va_arg(ap, typeof(fn)))) { + void (*tfn)(void); + const char *symbol; + + symbol = va_arg(ap, typeof(symbol)); + + tfn = (typeof(tfn)) dlsym(dl, symbol); + if (!tfn) + return log_full_errno(log_level, + SYNTHETIC_ERRNO(ELIBBAD), + "Can't find symbol %s: %s", symbol, dlerror()); + *fn = tfn; + } + + return 0; +} + +int dlsym_many_or_warn_sentinel(void *dl, int log_level, ...) { + va_list ap; + int r; + + va_start(ap, log_level); + r = dlsym_many_or_warnv(dl, log_level, ap); + va_end(ap); + + return r; +} + +int dlopen_many_sym_or_warn_sentinel(void **dlp, const char *filename, int log_level, ...) { + _cleanup_(dlclosep) void *dl = NULL; + int r; + + if (*dlp) + return 0; /* Already loaded */ + + dl = dlopen(filename, RTLD_LAZY); + if (!dl) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "%s is not installed: %s", filename, dlerror()); + + va_list ap; + va_start(ap, log_level); + r = dlsym_many_or_warnv(dl, log_level, ap); + va_end(ap); + + if (r < 0) + return r; + + /* Note that we never release the reference here, because there's no real reason to. After all this + * was traditionally a regular shared library dependency which lives forever too. */ + *dlp = TAKE_PTR(dl); + return 1; +} diff --git a/src/shared/dlfcn-util.h b/src/shared/dlfcn-util.h new file mode 100644 index 0000000..7d8cb4c --- /dev/null +++ b/src/shared/dlfcn-util.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(void*, dlclose, NULL); + +int dlsym_many_or_warn_sentinel(void *dl, int log_level, ...) _sentinel_; +int dlopen_many_sym_or_warn_sentinel(void **dlp, const char *filename, int log_level, ...) _sentinel_; + +#define dlsym_many_or_warn(dl, log_level, ...) \ + dlsym_many_or_warn_sentinel(dl, log_level, __VA_ARGS__, NULL) +#define dlopen_many_sym_or_warn(dlp, filename, log_level, ...) 
\ + dlopen_many_sym_or_warn_sentinel(dlp, filename, log_level, __VA_ARGS__, NULL) + +#define DLSYM_PROTOTYPE(symbol) \ + extern typeof(symbol)* sym_##symbol +#define DLSYM_FUNCTION(symbol) \ + typeof(symbol)* sym_##symbol = NULL + +/* Macro useful for putting together variable/symbol name pairs when calling dlsym_many_or_warn(). Assumes + * that each library symbol to resolve will be placed in a variable with the "sym_" prefix, i.e. a symbol + * "foobar" is loaded into a variable "sym_foobar". */ +#define DLSYM_ARG(arg) \ + ({ assert_cc(__builtin_types_compatible_p(typeof(sym_##arg), typeof(&arg))); &sym_##arg; }), STRINGIFY(arg) + +/* libbpf is a bit confused about type-safety and API compatibility. Provide a macro that can tape over that mess. Sad. */ +#define DLSYM_ARG_FORCE(arg) \ + &sym_##arg, STRINGIFY(arg) + +static inline void *safe_dlclose(void *p) { + if (!p) + return NULL; + + assert_se(dlclose(p) == 0); + return NULL; +} diff --git a/src/shared/dm-util.c b/src/shared/dm-util.c new file mode 100644 index 0000000..66c1e13 --- /dev/null +++ b/src/shared/dm-util.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "dm-util.h" +#include "fd-util.h" +#include "string-util.h" + +int dm_deferred_remove_cancel(const char *name) { + _cleanup_close_ int fd = -EBADF; + struct message { + struct dm_ioctl dm_ioctl; + struct dm_target_msg dm_target_msg; + char msg_text[STRLEN("@cancel_deferred_remove") + 1]; + } _packed_ message = { + .dm_ioctl = { + .version = { + DM_VERSION_MAJOR, + DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL + }, + .data_size = sizeof(struct message), + .data_start = sizeof(struct dm_ioctl), + }, + .msg_text = "@cancel_deferred_remove", + }; + + assert(name); + + if (strlen(name) >= sizeof(message.dm_ioctl.name)) + return -ENODEV; /* A device with a name longer than this cannot possibly exist */ + + strncpy_exact(message.dm_ioctl.name, name, sizeof(message.dm_ioctl.name)); + + fd = 
open("/dev/mapper/control", O_RDWR|O_CLOEXEC); + if (fd < 0) + return -errno; + + if (ioctl(fd, DM_TARGET_MSG, &message)) + return -errno; + + return 0; +} diff --git a/src/shared/dm-util.h b/src/shared/dm-util.h new file mode 100644 index 0000000..e6e3d7d --- /dev/null +++ b/src/shared/dm-util.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int dm_deferred_remove_cancel(const char *name); diff --git a/src/shared/dns-domain.c b/src/shared/dns-domain.c new file mode 100644 index 0000000..b41c9b0 --- /dev/null +++ b/src/shared/dns-domain.c @@ -0,0 +1,1421 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dns-domain.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "idn-util.h" +#include "in-addr-util.h" +#include "macro.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +int dns_label_unescape(const char **name, char *dest, size_t sz, DNSLabelFlags flags) { + const char *n; + char *d, last_char = 0; + int r = 0; + + assert(name); + assert(*name); + + n = *name; + d = dest; + + for (;;) { + if (IN_SET(*n, 0, '.')) { + if (FLAGS_SET(flags, DNS_LABEL_LDH) && last_char == '-') + /* Trailing dash */ + return -EINVAL; + + if (n[0] == '.' 
&& (n[1] != 0 || !FLAGS_SET(flags, DNS_LABEL_LEAVE_TRAILING_DOT))) + n++; + + break; + } + + if (r >= DNS_LABEL_MAX) + return -EINVAL; + + if (sz <= 0) + return -ENOBUFS; + + if (*n == '\\') { + /* Escaped character */ + if (FLAGS_SET(flags, DNS_LABEL_NO_ESCAPES)) + return -EINVAL; + + n++; + + if (*n == 0) + /* Ending NUL */ + return -EINVAL; + + else if (IN_SET(*n, '\\', '.')) { + /* Escaped backslash or dot */ + + if (FLAGS_SET(flags, DNS_LABEL_LDH)) + return -EINVAL; + + last_char = *n; + if (d) + *(d++) = *n; + sz--; + r++; + n++; + + } else if (n[0] >= '0' && n[0] <= '9') { + unsigned k; + + /* Escaped literal ASCII character */ + + if (!(n[1] >= '0' && n[1] <= '9') || + !(n[2] >= '0' && n[2] <= '9')) + return -EINVAL; + + k = ((unsigned) (n[0] - '0') * 100) + + ((unsigned) (n[1] - '0') * 10) + + ((unsigned) (n[2] - '0')); + + /* Don't allow anything that doesn't fit in 8 bits. Note that we do allow + * control characters, as some servers (e.g. cloudflare) are happy to + * generate labels with them inside. */ + if (k > 255) + return -EINVAL; + + if (FLAGS_SET(flags, DNS_LABEL_LDH) && + !valid_ldh_char((char) k)) + return -EINVAL; + + last_char = (char) k; + if (d) + *(d++) = (char) k; + sz--; + r++; + + n += 3; + } else + return -EINVAL; + + } else if ((uint8_t) *n >= (uint8_t) ' ' && *n != 127) { + + /* Normal character */ + + if (FLAGS_SET(flags, DNS_LABEL_LDH)) { + if (!valid_ldh_char(*n)) + return -EINVAL; + if (r == 0 && *n == '-') + /* Leading dash */ + return -EINVAL; + } + + last_char = *n; + if (d) + *(d++) = *n; + sz--; + r++; + n++; + } else + return -EINVAL; + } + + /* Empty label that is not at the end? */ + if (r == 0 && *n) + return -EINVAL; + + /* More than one trailing dot? */ + if (n[0] == '.' 
/* Unescapes the last not-yet-consumed label of 'name' into 'dest' (of size 'sz'), walking the name from
 * its end towards its start.
 *
 * @label_terminal: on input, points to the terminal character (NUL or '.') of the label to extract. On
 *                  success it is updated to the terminal character of the preceding label (always
 *                  skipping one dot), or to NULL if there are no more labels.
 *
 * Returns what dns_label_unescape() returns for the extracted label, or 0 (with 'dest' set to the empty
 * string if there is room) when *label_terminal is NULL already. */
int dns_label_unescape_suffix(const char *name, const char **label_terminal, char *dest, size_t sz) {
        const char *t;
        int n;

        assert(name);
        assert(label_terminal);
        assert(dest);

        t = *label_terminal;
        if (!t) {
                /* No more labels */
                if (sz >= 1)
                        *dest = 0;

                return 0;
        }

        assert(IN_SET(*t, 0, '.'));

        /* Step over the current terminal character (and accept domain names ending in "."). PTR_SUB1()
         * is relied upon to yield NULL rather than step before 'name'. */
        if (*t == 0)
                t = PTR_SUB1(t, name);
        if (t >= name && *t == '.')
                t = PTR_SUB1(t, name);

        /* Walk backwards until an unescaped dot is found: afterwards 'name' points to the start of the
         * last label and 't' to the terminal character preceding it (or is NULL if we ran off the
         * start of the string). */
        while (t) {
                const char *b;
                unsigned backslashes = 0;

                if (*t != '.') {
                        t = PTR_SUB1(t, name);
                        continue;
                }

                /* Count the backslashes immediately in front of the dot */
                b = PTR_SUB1(t, name);
                while (b && *b == '\\') {
                        backslashes++;
                        b = PTR_SUB1(b, name);
                }

                if (backslashes % 2 == 0) {
                        /* The '.' was not escaped, the label starts right behind it */
                        name = t + 1;
                        break;
                }

                /* The '.' was escaped, continue in front of the backslashes */
                t = b;
        }

        n = dns_label_unescape(&name, dest, sz, 0);
        if (n < 0)
                return n;

        *label_terminal = t;
        return n;
}
*/ + + if (l <= 0 || l > DNS_LABEL_MAX) + return -EINVAL; + if (sz < 1) + return -ENOBUFS; + + assert(p); + assert(dest); + + q = dest; + while (l > 0) { + + if (IN_SET(*p, '.', '\\')) { + + /* Dot or backslash */ + + if (sz < 3) + return -ENOBUFS; + + *(q++) = '\\'; + *(q++) = *p; + + sz -= 2; + + } else if (IN_SET(*p, '_', '-') || + ascii_isdigit(*p) || + ascii_isalpha(*p)) { + + /* Proper character */ + + if (sz < 2) + return -ENOBUFS; + + *(q++) = *p; + sz -= 1; + + } else { + + /* Everything else */ + + if (sz < 5) + return -ENOBUFS; + + *(q++) = '\\'; + *(q++) = '0' + (char) ((uint8_t) *p / 100); + *(q++) = '0' + (char) (((uint8_t) *p / 10) % 10); + *(q++) = '0' + (char) ((uint8_t) *p % 10); + + sz -= 4; + } + + p++; + l--; + } + + *q = 0; + return (int) (q - dest); +} + +int dns_label_escape_new(const char *p, size_t l, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + assert(p); + assert(ret); + + if (l <= 0 || l > DNS_LABEL_MAX) + return -EINVAL; + + s = new(char, DNS_LABEL_ESCAPED_MAX); + if (!s) + return -ENOMEM; + + r = dns_label_escape(p, l, s, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + *ret = TAKE_PTR(s); + + return r; +} + +#if HAVE_LIBIDN +int dns_label_apply_idna(const char *encoded, size_t encoded_size, char *decoded, size_t decoded_max) { + _cleanup_free_ uint32_t *input = NULL; + size_t input_size, l; + bool contains_8_bit = false; + char buffer[DNS_LABEL_MAX+1]; + int r; + + assert(encoded); + assert(decoded); + + /* Converts a U-label into an A-label */ + + r = dlopen_idn(); + if (r < 0) + return r; + + if (encoded_size <= 0) + return -EINVAL; + + for (const char *p = encoded; p < encoded + encoded_size; p++) + if ((uint8_t) *p > 127) + contains_8_bit = true; + + if (!contains_8_bit) { + if (encoded_size > DNS_LABEL_MAX) + return -EINVAL; + + return 0; + } + + input = sym_stringprep_utf8_to_ucs4(encoded, encoded_size, &input_size); + if (!input) + return -ENOMEM; + + if (sym_idna_to_ascii_4i(input, input_size, buffer, 0) != 
0) + return -EINVAL; + + l = strlen(buffer); + + /* Verify that the result is not longer than one DNS label. */ + if (l <= 0 || l > DNS_LABEL_MAX) + return -EINVAL; + if (l > decoded_max) + return -ENOBUFS; + + memcpy(decoded, buffer, l); + + /* If there's room, append a trailing NUL byte, but only then */ + if (decoded_max > l) + decoded[l] = 0; + + return (int) l; +} + +int dns_label_undo_idna(const char *encoded, size_t encoded_size, char *decoded, size_t decoded_max) { + size_t input_size, output_size; + _cleanup_free_ uint32_t *input = NULL; + _cleanup_free_ char *result = NULL; + uint32_t *output = NULL; + size_t w; + int r; + + /* To be invoked after unescaping. Converts an A-label into a U-label. */ + + assert(encoded); + assert(decoded); + + r = dlopen_idn(); + if (r < 0) + return r; + + if (encoded_size <= 0 || encoded_size > DNS_LABEL_MAX) + return -EINVAL; + + if (!memory_startswith(encoded, encoded_size, IDNA_ACE_PREFIX)) + return 0; + + input = sym_stringprep_utf8_to_ucs4(encoded, encoded_size, &input_size); + if (!input) + return -ENOMEM; + + output_size = input_size; + output = newa(uint32_t, output_size); + + sym_idna_to_unicode_44i(input, input_size, output, &output_size, 0); + + result = sym_stringprep_ucs4_to_utf8(output, output_size, NULL, &w); + if (!result) + return -ENOMEM; + if (w <= 0) + return -EINVAL; + if (w > decoded_max) + return -ENOBUFS; + + memcpy(decoded, result, w); + + /* Append trailing NUL byte if there's space, but only then. 
*/ + if (decoded_max > w) + decoded[w] = 0; + + return w; +} +#endif + +int dns_name_concat(const char *a, const char *b, DNSLabelFlags flags, char **_ret) { + _cleanup_free_ char *ret = NULL; + size_t n = 0; + const char *p; + bool first = true; + int r; + + if (a) + p = a; + else if (b) + p = TAKE_PTR(b); + else + goto finish; + + for (;;) { + char label[DNS_LABEL_MAX]; + + r = dns_label_unescape(&p, label, sizeof label, flags); + if (r < 0) + return r; + if (r == 0) { + if (*p != 0) + return -EINVAL; + + if (b) { + /* Now continue with the second string, if there is one */ + p = TAKE_PTR(b); + continue; + } + + break; + } + + if (_ret) { + if (!GREEDY_REALLOC(ret, n + !first + DNS_LABEL_ESCAPED_MAX)) + return -ENOMEM; + + r = dns_label_escape(label, r, ret + n + !first, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + if (!first) + ret[n] = '.'; + } else { + char escaped[DNS_LABEL_ESCAPED_MAX]; + + r = dns_label_escape(label, r, escaped, sizeof(escaped)); + if (r < 0) + return r; + } + + n += r + !first; + first = false; + } + +finish: + if (n > DNS_HOSTNAME_MAX) + return -EINVAL; + + if (_ret) { + if (n == 0) { + /* Nothing appended? 
If so, generate at least a single dot, to indicate the DNS root domain */ + if (!GREEDY_REALLOC(ret, 2)) + return -ENOMEM; + + ret[n++] = '.'; + } else { + if (!GREEDY_REALLOC(ret, n + 1)) + return -ENOMEM; + } + + ret[n] = 0; + *_ret = TAKE_PTR(ret); + } + + return 0; +} + +void dns_name_hash_func(const char *p, struct siphash *state) { + int r; + + assert(p); + + for (;;) { + char label[DNS_LABEL_MAX+1]; + + r = dns_label_unescape(&p, label, sizeof label, 0); + if (r < 0) + break; + if (r == 0) + break; + + ascii_strlower_n(label, r); + siphash24_compress(label, r, state); + siphash24_compress_byte(0, state); /* make sure foobar and foo.bar result in different hashes */ + } + + /* enforce that all names are terminated by the empty label */ + string_hash_func("", state); +} + +int dns_name_compare_func(const char *a, const char *b) { + const char *x, *y; + int r, q; + + assert(a); + assert(b); + + x = a + strlen(a); + y = b + strlen(b); + + for (;;) { + char la[DNS_LABEL_MAX], lb[DNS_LABEL_MAX]; + + if (x == NULL && y == NULL) + return 0; + + r = dns_label_unescape_suffix(a, &x, la, sizeof(la)); + q = dns_label_unescape_suffix(b, &y, lb, sizeof(lb)); + if (r < 0 || q < 0) + return CMP(r, q); + + r = ascii_strcasecmp_nn(la, r, lb, q); + if (r != 0) + return r; + } +} + +DEFINE_HASH_OPS( + dns_name_hash_ops, + char, + dns_name_hash_func, + dns_name_compare_func); + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + dns_name_hash_ops_free, + char, + dns_name_hash_func, + dns_name_compare_func, + free); + +int dns_name_equal(const char *x, const char *y) { + int r, q; + + assert(x); + assert(y); + + for (;;) { + char la[DNS_LABEL_MAX], lb[DNS_LABEL_MAX]; + + r = dns_label_unescape(&x, la, sizeof la, 0); + if (r < 0) + return r; + + q = dns_label_unescape(&y, lb, sizeof lb, 0); + if (q < 0) + return q; + + if (r != q) + return false; + if (r == 0) + return true; + + if (ascii_strcasecmp_n(la, lb, r) != 0) + return false; + } +} + +int dns_name_endswith(const char *name, const 
char *suffix) { + const char *n, *s, *saved_n = NULL; + int r, q; + + assert(name); + assert(suffix); + + n = name; + s = suffix; + + for (;;) { + char ln[DNS_LABEL_MAX], ls[DNS_LABEL_MAX]; + + r = dns_label_unescape(&n, ln, sizeof ln, 0); + if (r < 0) + return r; + + if (!saved_n) + saved_n = n; + + q = dns_label_unescape(&s, ls, sizeof ls, 0); + if (q < 0) + return q; + + if (r == 0 && q == 0) + return true; + if (r == 0 && saved_n == n) + return false; + + if (r != q || ascii_strcasecmp_n(ln, ls, r) != 0) { + + /* Not the same, let's jump back, and try with the next label again */ + s = suffix; + n = TAKE_PTR(saved_n); + } + } +} + +int dns_name_startswith(const char *name, const char *prefix) { + const char *n, *p; + int r, q; + + assert(name); + assert(prefix); + + n = name; + p = prefix; + + for (;;) { + char ln[DNS_LABEL_MAX], lp[DNS_LABEL_MAX]; + + r = dns_label_unescape(&p, lp, sizeof lp, 0); + if (r < 0) + return r; + if (r == 0) + return true; + + q = dns_label_unescape(&n, ln, sizeof ln, 0); + if (q < 0) + return q; + + if (r != q) + return false; + if (ascii_strcasecmp_n(ln, lp, r) != 0) + return false; + } +} + +int dns_name_change_suffix(const char *name, const char *old_suffix, const char *new_suffix, char **ret) { + const char *n, *s, *saved_before = NULL, *saved_after = NULL, *prefix; + int r, q; + + assert(name); + assert(old_suffix); + assert(new_suffix); + assert(ret); + + n = name; + s = old_suffix; + + for (;;) { + char ln[DNS_LABEL_MAX], ls[DNS_LABEL_MAX]; + + if (!saved_before) + saved_before = n; + + r = dns_label_unescape(&n, ln, sizeof ln, 0); + if (r < 0) + return r; + + if (!saved_after) + saved_after = n; + + q = dns_label_unescape(&s, ls, sizeof ls, 0); + if (q < 0) + return q; + + if (r == 0 && q == 0) + break; + if (r == 0 && saved_after == n) { + *ret = NULL; /* doesn't match */ + return 0; + } + + if (r != q || ascii_strcasecmp_n(ln, ls, r) != 0) { + + /* Not the same, let's jump back, and try with the next label again */ + s = 
/* Tells whether b lies strictly between a and c in the ordering defined by dns_name_compare_func().
 * The ordering is treated as circular: when a does not sort before c, the interval wraps around and
 * b qualifies if it sorts before c or after a. */
int dns_name_between(const char *a, const char *b, const char *c) {
        bool in_order = dns_name_compare_func(a, c) < 0;

        if (in_order) {
                /* Plain interval:  a <--- b ---> c */
                if (dns_name_compare_func(a, b) >= 0)
                        return false;

                return dns_name_compare_func(b, c) < 0;
        }

        /* a and c are equal or reversed, the interval wraps around:
         *      <-- b -- c     a ----->     or     <------ c     a -- b --> */
        if (dns_name_compare_func(b, c) < 0)
                return true;

        return dns_name_compare_func(a, b) < 0;
}
hexchar(p[ 3] >> 4), hexchar(p[ 2] & 0xF), hexchar(p[ 2] >> 4), + hexchar(p[ 1] & 0xF), hexchar(p[ 1] >> 4), hexchar(p[ 0] & 0xF), hexchar(p[ 0] >> 4)); + else + return -EAFNOSUPPORT; + if (r < 0) + return -ENOMEM; + + return 0; +} + +int dns_name_address(const char *p, int *ret_family, union in_addr_union *ret_address) { + int r; + + assert(p); + assert(ret_family); + assert(ret_address); + + r = dns_name_endswith(p, "in-addr.arpa"); + if (r < 0) + return r; + if (r > 0) { + uint8_t a[4]; + + for (size_t i = 0; i < ELEMENTSOF(a); i++) { + char label[DNS_LABEL_MAX+1]; + + r = dns_label_unescape(&p, label, sizeof label, 0); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + if (r > 3) + return -EINVAL; + + r = safe_atou8(label, &a[i]); + if (r < 0) + return r; + } + + r = dns_name_equal(p, "in-addr.arpa"); + if (r <= 0) + return r; + + *ret_family = AF_INET; + ret_address->in.s_addr = htobe32(((uint32_t) a[3] << 24) | + ((uint32_t) a[2] << 16) | + ((uint32_t) a[1] << 8) | + (uint32_t) a[0]); + + return 1; + } + + r = dns_name_endswith(p, "ip6.arpa"); + if (r < 0) + return r; + if (r > 0) { + struct in6_addr a; + + for (size_t i = 0; i < ELEMENTSOF(a.s6_addr); i++) { + char label[DNS_LABEL_MAX+1]; + int x, y; + + r = dns_label_unescape(&p, label, sizeof label, 0); + if (r <= 0) + return r; + if (r != 1) + return -EINVAL; + x = unhexchar(label[0]); + if (x < 0) + return -EINVAL; + + r = dns_label_unescape(&p, label, sizeof label, 0); + if (r <= 0) + return r; + if (r != 1) + return -EINVAL; + y = unhexchar(label[0]); + if (y < 0) + return -EINVAL; + + a.s6_addr[ELEMENTSOF(a.s6_addr) - i - 1] = (uint8_t) y << 4 | (uint8_t) x; + } + + r = dns_name_equal(p, "ip6.arpa"); + if (r <= 0) + return r; + + *ret_family = AF_INET6; + ret_address->in6 = a; + return 1; + } + + *ret_family = AF_UNSPEC; + *ret_address = IN_ADDR_NULL; + + return 0; +} + +bool dns_name_is_root(const char *name) { + assert(name); + + /* There are exactly two ways to encode the root domain name: 
/* Returns true if 'name' consists of exactly one label, i.e. stripping one label with
 * dns_name_parent() succeeds and leaves only the root domain. Parse errors yield false. */
bool dns_name_is_single_label(const char *name) {
        const char *rest;

        assert(name);

        rest = name;

        return dns_name_parent(&rest) > 0 && dns_name_is_root(rest);
}
/* Validates one label of an SRV service type: an underscore, then a letter, then any number of
 * letters, digits or hyphens. 'n' is the label length; the label need not be NUL-terminated. */
static bool srv_type_label_is_valid(const char *label, size_t n) {
        assert(label);

        /* The shortest acceptable label is the underscore plus a single letter */
        if (n < 2 || label[0] != '_' || !ascii_isalpha(label[1]))
                return false;

        /* From the third character on: alphanumeric or a hyphen */
        for (const char *c = label + 2; c < label + n; c++) {
                if (ascii_isalpha(*c) || ascii_isdigit(*c) || *c == '-')
                        continue;

                return false;
        }

        return true;
}
RFC 6763, Section 7 */ +} + +bool dns_service_name_is_valid(const char *name) { + size_t l; + + /* This more or less implements RFC 6763, Section 4.1.1 */ + + if (!name) + return false; + + if (!utf8_is_valid(name)) + return false; + + if (string_has_cc(name, NULL)) + return false; + + l = strlen(name); + if (l <= 0) + return false; + if (l > DNS_LABEL_MAX) + return false; + + return true; +} + +int dns_service_join(const char *name, const char *type, const char *domain, char **ret) { + char escaped[DNS_LABEL_ESCAPED_MAX]; + _cleanup_free_ char *n = NULL; + int r; + + assert(type); + assert(domain); + assert(ret); + + if (!dns_srv_type_is_valid(type)) + return -EINVAL; + + if (!name) + return dns_name_concat(type, domain, 0, ret); + + if (!dns_service_name_is_valid(name)) + return -EINVAL; + + r = dns_label_escape(name, strlen(name), escaped, sizeof(escaped)); + if (r < 0) + return r; + + r = dns_name_concat(type, domain, 0, &n); + if (r < 0) + return r; + + return dns_name_concat(escaped, n, 0, ret); +} + +static bool dns_service_name_label_is_valid(const char *label, size_t n) { + char *s; + + assert(label); + + if (memchr(label, 0, n)) + return false; + + s = strndupa_safe(label, n); + return dns_service_name_is_valid(s); +} + +int dns_service_split(const char *joined, char **ret_name, char **ret_type, char **ret_domain) { + _cleanup_free_ char *name = NULL, *type = NULL, *domain = NULL; + const char *p = joined, *q = NULL, *d = joined; + char a[DNS_LABEL_MAX+1], b[DNS_LABEL_MAX+1], c[DNS_LABEL_MAX+1]; + int an, bn, cn, r; + unsigned x = 0; + + assert(joined); + + /* Get first label from the full name */ + an = dns_label_unescape(&p, a, sizeof(a), 0); + if (an < 0) + return an; + + if (an > 0) { + x++; + + /* If there was a first label, try to get the second one */ + bn = dns_label_unescape(&p, b, sizeof(b), 0); + if (bn < 0) + return bn; + + if (bn > 0) { + if (!srv_type_label_is_valid(b, bn)) + goto finish; + + x++; + + /* If there was a second label, try to 
/* Skips the first 'n_labels' labels of 'a'. Returns 1 with *ret pointing at the remainder inside 'a'
 * if that many labels could be removed, 0 with *ret set to "" if the name ran out of labels first,
 * and a negative error if a label fails to parse. */
int dns_name_skip(const char *a, unsigned n_labels, const char **ret) {
        const char *rest;
        unsigned remaining;

        assert(a);
        assert(ret);

        rest = a;
        remaining = n_labels;

        while (remaining > 0) {
                int r;

                r = dns_name_parent(&rest);
                if (r < 0)
                        return r;
                if (r == 0) {
                        /* Fewer labels than requested */
                        *ret = "";
                        return 0;
                }

                remaining--;
        }

        *ret = rest;
        return 1;
}
if (r < 0) + return r; + if (r == 0) + break; + + if (n >= DNS_N_LABELS_MAX) + return -EINVAL; + + n++; + } + + return n; +} + +int dns_name_equal_skip(const char *a, unsigned n_labels, const char *b) { + int r; + + assert(a); + assert(b); + + r = dns_name_skip(a, n_labels, &a); + if (r <= 0) + return r; + + return dns_name_equal(a, b); +} + +int dns_name_common_suffix(const char *a, const char *b, const char **ret) { + const char *a_labels[DNS_N_LABELS_MAX+1], *b_labels[DNS_N_LABELS_MAX+1]; + int n = 0, m = 0, k = 0, r, q; + + assert(a); + assert(b); + assert(ret); + + /* Determines the common suffix of domain names a and b */ + + n = dns_name_build_suffix_table(a, a_labels); + if (n < 0) + return n; + + m = dns_name_build_suffix_table(b, b_labels); + if (m < 0) + return m; + + for (;;) { + char la[DNS_LABEL_MAX], lb[DNS_LABEL_MAX]; + const char *x, *y; + + if (k >= n || k >= m) { + *ret = a_labels[n - k]; + return 0; + } + + x = a_labels[n - 1 - k]; + r = dns_label_unescape(&x, la, sizeof la, 0); + if (r < 0) + return r; + + y = b_labels[m - 1 - k]; + q = dns_label_unescape(&y, lb, sizeof lb, 0); + if (q < 0) + return q; + + if (r != q || ascii_strcasecmp_n(la, lb, r) != 0) { + *ret = a_labels[n - k]; + return 0; + } + + k++; + } +} + +int dns_name_apply_idna(const char *name, char **ret) { + + /* Return negative on error, 0 if not implemented, positive on success. */ + +#if HAVE_LIBIDN2 || HAVE_LIBIDN2 + int r; + + r = dlopen_idn(); + if (r == -EOPNOTSUPP) { + *ret = NULL; + return 0; + } + if (r < 0) + return r; +#endif + +#if HAVE_LIBIDN2 + _cleanup_free_ char *t = NULL; + + assert(name); + assert(ret); + + /* First, try non-transitional mode (i.e. IDN2008 rules) */ + r = sym_idn2_lookup_u8((uint8_t*) name, (uint8_t**) &t, + IDN2_NFC_INPUT | IDN2_NONTRANSITIONAL); + if (r == IDN2_DISALLOWED) /* If that failed, because of disallowed characters, try transitional mode. + * (i.e. IDN2003 rules which supports some unicode chars IDN2008 doesn't allow). 
*/ + r = sym_idn2_lookup_u8((uint8_t*) name, (uint8_t**) &t, + IDN2_NFC_INPUT | IDN2_TRANSITIONAL); + + log_debug("idn2_lookup_u8: %s %s %s", name, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), t); + if (r == IDN2_OK) { + if (!startswith(name, "xn--")) { + _cleanup_free_ char *s = NULL; + + r = sym_idn2_to_unicode_8z8z(t, &s, 0); + if (r != IDN2_OK) { + log_debug("idn2_to_unicode_8z8z(\"%s\") failed: %d/%s", + t, r, sym_idn2_strerror(r)); + *ret = NULL; + return 0; + } + + if (!streq_ptr(name, s)) { + log_debug("idn2 roundtrip failed: \"%s\" %s \"%s\" %s \"%s\", ignoring.", + name, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), t, + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), s); + *ret = NULL; + return 0; + } + } + + *ret = TAKE_PTR(t); + return 1; /* *ret has been written */ + } + + log_debug("idn2_lookup_u8(\"%s\") failed: %d/%s", name, r, sym_idn2_strerror(r)); + if (r == IDN2_2HYPHEN) + /* The name has two hyphens — forbidden by IDNA2008 in some cases */ + return 0; + if (IN_SET(r, IDN2_TOO_BIG_DOMAIN, IDN2_TOO_BIG_LABEL)) + return -ENOSPC; + + return -EINVAL; +#elif HAVE_LIBIDN + _cleanup_free_ char *buf = NULL; + size_t n = 0; + bool first = true; + int r, q; + + assert(name); + assert(ret); + + for (;;) { + char label[DNS_LABEL_MAX]; + + r = dns_label_unescape(&name, label, sizeof label, 0); + if (r < 0) + return r; + if (r == 0) + break; + + q = dns_label_apply_idna(label, r, label, sizeof label); + if (q < 0) + return q; + if (q > 0) + r = q; + + if (!GREEDY_REALLOC(buf, n + !first + DNS_LABEL_ESCAPED_MAX)) + return -ENOMEM; + + r = dns_label_escape(label, r, buf + n + !first, DNS_LABEL_ESCAPED_MAX); + if (r < 0) + return r; + + if (first) + first = false; + else + buf[n++] = '.'; + + n += r; + } + + if (n > DNS_HOSTNAME_MAX) + return -EINVAL; + + if (!GREEDY_REALLOC(buf, n + 1)) + return -ENOMEM; + + buf[n] = 0; + *ret = TAKE_PTR(buf); + + return 1; +#else + *ret = NULL; + return 0; +#endif +} + +int dns_name_is_valid_or_address(const char *name) { + /* Returns > 0 
/* Returns true for names that must never be resolved: a selection of the special-use domains from
 * RFC 6303 (reverse zones), RFC 6761 ("invalid") and RFC 9476 ("alt"). Errors from the comparison
 * helpers count as "no match". */
bool dns_name_dont_resolve(const char *name) {
        return
                /* RFC 6303 */
                dns_name_endswith(name, "0.in-addr.arpa") > 0 ||
                dns_name_equal(name, "255.255.255.255.in-addr.arpa") > 0 ||
                dns_name_equal(name, "0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa") > 0 ||
                /* RFC 6761 */
                dns_name_endswith(name, "invalid") > 0 ||
                /* RFC 9476 */
                dns_name_endswith(name, "alt") > 0;
}
/* Returns 1 if 's' is a valid DNS name, 0 if it is not, and a negative error for any failure other
 * than -EINVAL. Implemented via dns_name_concat() without an output buffer, which validates the
 * name as a side effect. */
static inline int dns_name_is_valid(const char *s) {
        int rc = dns_name_concat(s, NULL, 0, NULL);

        if (rc >= 0)
                return 1;

        /* -EINVAL means "not a valid name"; everything else is a genuine error */
        return rc == -EINVAL ? 0 : rc;
}
dns_name_startswith(const char *name, const char *prefix); + +int dns_name_change_suffix(const char *name, const char *old_suffix, const char *new_suffix, char **ret); + +int dns_name_reverse(int family, const union in_addr_union *a, char **ret); +int dns_name_address(const char *p, int *family, union in_addr_union *a); + +bool dns_name_is_root(const char *name); +bool dns_name_is_single_label(const char *name); + +int dns_name_to_wire_format(const char *domain, uint8_t *buffer, size_t len, bool canonical); + +bool dns_srv_type_is_valid(const char *name); +bool dnssd_srv_type_is_valid(const char *name); +bool dns_service_name_is_valid(const char *name); + +int dns_service_join(const char *name, const char *type, const char *domain, char **ret); +int dns_service_split(const char *joined, char **ret_name, char **ret_type, char **ret_domain); + +int dns_name_suffix(const char *name, unsigned n_labels, const char **ret); +int dns_name_count_labels(const char *name); + +int dns_name_skip(const char *a, unsigned n_labels, const char **ret); +int dns_name_equal_skip(const char *a, unsigned n_labels, const char *b); + +int dns_name_common_suffix(const char *a, const char *b, const char **ret); + +int dns_name_apply_idna(const char *name, char **ret); + +int dns_name_is_valid_or_address(const char *name); + +int dns_name_dot_suffixed(const char *name); + +bool dns_name_dont_resolve(const char *name); diff --git a/src/shared/dropin.c b/src/shared/dropin.c new file mode 100644 index 0000000..d46e838 --- /dev/null +++ b/src/shared/dropin.c @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "conf-files.h" +#include "dirent-util.h" +#include "dropin.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "hashmap.h" +#include "log.h" +#include "macro.h" +#include "mkdir.h" +#include "path-util.h" +#include "set.h" +#include 
"string-util.h" +#include "strv.h" +#include "unit-name.h" + +int drop_in_file(const char *dir, const char *unit, unsigned level, + const char *name, char **ret_p, char **ret_q) { + + char prefix[DECIMAL_STR_MAX(unsigned)]; + _cleanup_free_ char *b = NULL, *p = NULL, *q = NULL; + + assert(unit); + assert(name); + assert(ret_p); + assert(ret_q); + + sprintf(prefix, "%u", level); + + b = xescape(name, "/."); + if (!b) + return -ENOMEM; + + if (!filename_is_valid(b)) + return -EINVAL; + + p = strjoin(dir, "/", unit, ".d"); + q = strjoin(p, "/", prefix, "-", b, ".conf"); + if (!p || !q) + return -ENOMEM; + + *ret_p = TAKE_PTR(p); + *ret_q = TAKE_PTR(q); + return 0; +} + +int write_drop_in(const char *dir, const char *unit, unsigned level, + const char *name, const char *data) { + + _cleanup_free_ char *p = NULL, *q = NULL; + int r; + + assert(dir); + assert(unit); + assert(name); + assert(data); + + r = drop_in_file(dir, unit, level, name, &p, &q); + if (r < 0) + return r; + + (void) mkdir_p(p, 0755); + return write_string_file_atomic_label(q, data); +} + +int write_drop_in_format(const char *dir, const char *unit, unsigned level, + const char *name, const char *format, ...) { + _cleanup_free_ char *p = NULL; + va_list ap; + int r; + + assert(dir); + assert(unit); + assert(name); + assert(format); + + va_start(ap, format); + r = vasprintf(&p, format, ap); + va_end(ap); + + if (r < 0) + return -ENOMEM; + + return write_drop_in(dir, unit, level, name, p); +} + +static int unit_file_add_dir( + const char *original_root, + const char *path, + char ***dirs) { + + _cleanup_free_ char *chased = NULL; + int r; + + assert(path); + + /* This adds [original_root]/path to dirs, if it exists. */ + + r = chase(path, original_root, 0, &chased, NULL); + if (r == -ENOENT) /* Ignore -ENOENT, after all most units won't have a drop-in dir. */ + return 0; + if (r == -ENAMETOOLONG) { + /* Also, ignore -ENAMETOOLONG but log about it. 
/* Collects the drop-in directories that apply to unit 'name' below the single lookup directory
 * 'unit_path', appending them to 'dirs' via unit_file_add_dir().
 *
 * The candidate for the name itself is "<unit_path>/<name><suffix>". It is only handed to
 * unit_file_add_dir() if no path cache was supplied or the cache contains that path. After that the
 * function calls itself for the template of an instance name, and finally for the next shorter
 * dash-truncated prefix of the name (see the comment further down).
 *
 * Returns 0 on success, including when there is nothing further to look at, and a negative error
 * otherwise. */
static int unit_file_find_dirs(
                const char *original_root,
                Set *unit_path_cache,
                const char *unit_path,
                const char *name,
                const char *suffix,
                char ***dirs) {

        _cleanup_free_ char *prefix = NULL, *instance = NULL, *built = NULL;
        bool is_instance, chopped;
        const char *dash;
        UnitType type;
        char *path;
        size_t n;
        int r;

        assert(unit_path);
        assert(name);
        assert(suffix);

        /* NOTE(review): 'path' is never freed here; presumably strjoina() allocates on the stack —
         * confirm against its definition. */
        path = strjoina(unit_path, "/", name, suffix);
        if (!unit_path_cache || set_get(unit_path_cache, path)) {
                r = unit_file_add_dir(original_root, path, dirs);
                if (r < 0)
                        return r;
        }

        is_instance = unit_name_is_valid(name, UNIT_NAME_INSTANCE);
        if (is_instance) { /* Also try the template dir */
                _cleanup_free_ char *template = NULL;

                r = unit_name_template(name, &template);
                if (r < 0)
                        return log_error_errno(r, "Failed to generate template from unit name: %m");

                r = unit_file_find_dirs(original_root, unit_path_cache, unit_path, template, suffix, dirs);
                if (r < 0)
                        return r;
        }

        /* Return early for top level drop-ins. */
        /* (That is, when 'name' is a bare unit type string rather than a unit name; there is no
         * prefix to derive from it.) */
        if (unit_type_from_string(name) >= 0)
                return 0;

        /* Let's see if there's a "-" prefix for this unit name. If so, let's invoke ourselves for it. This will then
         * recursively do the same for all our prefixes. i.e. this means given "foo-bar-waldo.service" we'll also
         * search "foo-bar-.service" and "foo-.service".
         *
         * Note the order in which we do it: we traverse up adding drop-ins on each step. This means the more specific
         * drop-ins may override the more generic drop-ins, which is the intended behaviour. */

        r = unit_name_to_prefix(name, &prefix);
        if (r < 0)
                return log_error_errno(r, "Failed to derive unit name prefix from unit name: %m");

        /* Truncate 'prefix' in place so that it ends in its last dash. If the prefix already ends in a
         * dash, that dash is removed first (once) so that the next shorter prefix is found. */
        chopped = false;
        for (;;) {
                dash = strrchr(prefix, '-');
                if (!dash) /* No dash? if so we are done */
                        return 0;

                n = (size_t) (dash - prefix);
                if (n == 0) /* Leading dash? If so, we are done */
                        return 0;

                /* Something follows the dash (or we already chopped once): cut right after the dash,
                 * keeping the dash itself, and use the result. */
                if (prefix[n+1] != 0 || chopped) {
                        prefix[n+1] = 0;
                        break;
                }

                /* Trailing dash? If so, chop it off and try again, but not more than once. */
                prefix[n] = 0;
                chopped = true;
        }

        if (!unit_prefix_is_valid(prefix))
                return 0;

        type = unit_name_to_type(name);
        if (type < 0)
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                       "Failed to derive unit type from unit name: %s",
                                       name);

        /* Carry the instance over to the prefix unit name; 'instance' stays NULL for non-instances */
        if (is_instance) {
                r = unit_name_to_instance(name, &instance);
                if (r < 0)
                        return log_error_errno(r, "Failed to derive unit name instance from unit name: %m");
        }

        r = unit_name_build_from_type(prefix, instance, type, &built);
        if (r < 0)
                return log_error_errno(r, "Failed to build prefix unit name: %m");

        /* Recurse with the shortened name; this handles all remaining, even shorter prefixes */
        return unit_file_find_dirs(original_root, unit_path_cache, unit_path, built, suffix, dirs);
}
*/ + n = name ?: (const char*) set_first(aliases); + if (n) { + UnitType type = _UNIT_TYPE_INVALID; + + type = unit_name_to_type(n); + if (type < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to derive unit type from unit name: %s", n); + + /* Special top level drop in for ".". Add this last as it's the most generic + * and should be able to be overridden by more specific drop-ins. */ + STRV_FOREACH(p, lookup_path) + (void) unit_file_find_dirs(original_root, + unit_path_cache, + *p, + unit_type_to_string(type), + dir_suffix, + &dirs); + } + + if (strv_isempty(dirs)) { + *ret = NULL; + return 0; + } + + r = conf_files_list_strv(ret, file_suffix, NULL, 0, (const char**) dirs); + if (r < 0) + return log_warning_errno(r, "Failed to create the list of configuration files: %m"); + + return 1; +} diff --git a/src/shared/dropin.h b/src/shared/dropin.h new file mode 100644 index 0000000..54cceaf --- /dev/null +++ b/src/shared/dropin.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" +#include "macro.h" +#include "set.h" +#include "unit-name.h" + +int drop_in_file(const char *dir, const char *unit, unsigned level, + const char *name, char **_p, char **_q); + +int write_drop_in(const char *dir, const char *unit, unsigned level, + const char *name, const char *data); + +int write_drop_in_format(const char *dir, const char *unit, unsigned level, + const char *name, const char *format, ...) 
_printf_(5, 6); + +int unit_file_find_dropin_paths( + const char *original_root, + char **lookup_path, + Set *unit_path_cache, + const char *dir_suffix, + const char *file_suffix, + const char *name, + const Set *aliases, + char ***paths); diff --git a/src/shared/edit-util.c b/src/shared/edit-util.c new file mode 100644 index 0000000..045839b --- /dev/null +++ b/src/shared/edit-util.c @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "copy.h" +#include "edit-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "mkdir-label.h" +#include "path-util.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util-label.h" + +void edit_file_context_done(EditFileContext *context) { + int r; + + assert(context); + + FOREACH_ARRAY(i, context->files, context->n_files) { + unlink_and_free(i->temp); + + if (context->remove_parent) { + _cleanup_free_ char *parent = NULL; + + r = path_extract_directory(i->path, &parent); + if (r < 0) + log_debug_errno(r, "Failed to extract directory from '%s', ignoring: %m", i->path); + else if (rmdir(parent) < 0 && !IN_SET(errno, ENOENT, ENOTEMPTY)) + log_debug_errno(errno, "Failed to remove parent directory '%s', ignoring: %m", parent); + } + + free(i->path); + free(i->original_path); + strv_free(i->comment_paths); + } + + context->files = mfree(context->files); + context->n_files = 0; +} + +bool edit_files_contains(const EditFileContext *context, const char *path) { + assert(context); + assert(path); + + FOREACH_ARRAY(i, context->files, context->n_files) + if (path_equal(i->path, path)) + return true; + + return false; +} + +int edit_files_add( + EditFileContext *context, + const char *path, + const char *original_path, + char * const *comment_paths) { + + _cleanup_free_ char *new_path = NULL, *new_original_path = NULL; + _cleanup_strv_free_ char **new_comment_paths = NULL; + + assert(context); + 
assert(path); + + if (edit_files_contains(context, path)) + return 0; + + if (!GREEDY_REALLOC(context->files, context->n_files + 1)) + return log_oom(); + + new_path = strdup(path); + if (!new_path) + return log_oom(); + + if (original_path) { + new_original_path = strdup(original_path); + if (!new_original_path) + return log_oom(); + } + + if (comment_paths) { + new_comment_paths = strv_copy(comment_paths); + if (!new_comment_paths) + return log_oom(); + } + + context->files[context->n_files] = (EditFile) { + .context = context, + .path = TAKE_PTR(new_path), + .original_path = TAKE_PTR(new_original_path), + .comment_paths = TAKE_PTR(new_comment_paths), + }; + context->n_files++; + + return 1; +} + +static int create_edit_temp_file(EditFile *e) { + _cleanup_(unlink_and_freep) char *temp = NULL; + _cleanup_fclose_ FILE *f = NULL; + const char *source; + bool has_original, has_target; + unsigned line = 1; + int r; + + assert(e); + assert(e->context); + assert(e->path); + assert(!e->comment_paths || (e->context->marker_start && e->context->marker_end)); + + if (e->temp) + return 0; + + r = mkdir_parents_label(e->path, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create parent directories for '%s': %m", e->path); + + r = fopen_temporary_label(e->path, e->path, &f, &temp); + if (r < 0) + return log_error_errno(r, "Failed to create temporary file for '%s': %m", e->path); + + if (fchmod(fileno(f), 0644) < 0) + return log_error_errno(errno, "Failed to change mode of temporary file '%s': %m", temp); + + has_original = e->original_path && access(e->original_path, F_OK) >= 0; + has_target = access(e->path, F_OK) >= 0; + + if (has_original && (!has_target || e->context->overwrite_with_origin)) + /* We are asked to overwrite target with original_path or target doesn't exist. */ + source = e->original_path; + else if (has_target) + /* Target exists and shouldn't be overwritten. 
*/ + source = e->path; + else + source = NULL; + + if (e->comment_paths) { + _cleanup_free_ char *source_contents = NULL; + + if (source) { + r = read_full_file(source, &source_contents, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read source file '%s': %m", source); + } + + fprintf(f, + "### Editing %s\n" + "%s\n" + "\n" + "%s%s" + "\n" + "%s\n", + e->path, + e->context->marker_start, + strempty(source_contents), + source_contents && endswith(source_contents, "\n") ? "" : "\n", + e->context->marker_end); + + line = 4; /* Start editing at the contents area */ + + STRV_FOREACH(path, e->comment_paths) { + _cleanup_free_ char *comment = NULL; + + /* Skip the file which is being edited and the source file (can be the same) */ + if (PATH_IN_SET(*path, e->path, source)) + continue; + + r = read_full_file(*path, &comment, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read comment file '%s': %m", *path); + + fprintf(f, "\n\n### %s", *path); + + if (!isempty(comment)) { + _cleanup_free_ char *c = NULL; + + c = strreplace(strstrip(comment), "\n", "\n# "); + if (!c) + return log_oom(); + + fprintf(f, "\n# %s", c); + } + } + } else if (source) { + r = copy_file_fd(source, fileno(f), COPY_REFLINK); + if (r < 0) { + assert(r != -ENOENT); + return log_error_errno(r, "Failed to copy file '%s' to temporary file '%s': %m", source, temp); + } + } + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write to temporary file '%s': %m", temp); + + e->temp = TAKE_PTR(temp); + e->line = line; + + return 0; +} + +static int run_editor_child(const EditFileContext *context) { + _cleanup_strv_free_ char **args = NULL; + const char *editor; + int r; + + /* SYSTEMD_EDITOR takes precedence over EDITOR which takes precedence over VISUAL. + * If neither SYSTEMD_EDITOR nor EDITOR nor VISUAL are present, we try to execute + * well known editors. 
*/ + editor = getenv("SYSTEMD_EDITOR"); + if (!editor) + editor = getenv("EDITOR"); + if (!editor) + editor = getenv("VISUAL"); + + if (!isempty(editor)) { + _cleanup_strv_free_ char **editor_args = NULL; + + editor_args = strv_split(editor, WHITESPACE); + if (!editor_args) + return log_oom(); + + args = TAKE_PTR(editor_args); + } + + if (context->n_files == 1 && context->files[0].line > 1) { + /* If editing a single file only, use the +LINE syntax to put cursor on the right line */ + r = strv_extendf(&args, "+%u", context->files[0].line); + if (r < 0) + return log_oom(); + } + + FOREACH_ARRAY(i, context->files, context->n_files) { + r = strv_extend(&args, i->temp); + if (r < 0) + return log_oom(); + } + + if (!isempty(editor)) + execvp(args[0], (char* const*) args); + + bool prepended = false; + FOREACH_STRING(name, "editor", "nano", "vim", "vi") { + if (!prepended) { + r = strv_prepend(&args, name); + prepended = true; + } else + r = free_and_strdup(&args[0], name); + if (r < 0) + return log_oom(); + + execvp(args[0], (char* const*) args); + + /* We do not fail if the editor doesn't exist because we want to try each one of them + * before failing. */ + if (errno != ENOENT) + return log_error_errno(errno, "Failed to execute '%s': %m", name); + } + + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Cannot edit files, no editor available. Please set either $SYSTEMD_EDITOR, $EDITOR or $VISUAL."); +} + +static int run_editor(const EditFileContext *context) { + int r; + + assert(context); + + r = safe_fork("(editor)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG|FORK_WAIT, NULL); + if (r < 0) + return r; + if (r == 0) { /* Child */ + r = run_editor_child(context); + _exit(r < 0 ? 
EXIT_FAILURE : EXIT_SUCCESS); + } + + return 0; +} + +static int strip_edit_temp_file(EditFile *e) { + _cleanup_free_ char *old_contents = NULL, *new_contents = NULL; + const char *stripped; + int r; + + assert(e); + assert(e->context); + assert(e->temp); + + r = read_full_file(e->temp, &old_contents, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read temporary file '%s': %m", e->temp); + + if (e->context->marker_start) { + /* Trim out the lines between the two markers */ + char *contents_start, *contents_end; + + assert(e->context->marker_end); + + contents_start = strstrafter(old_contents, e->context->marker_start); + if (!contents_start) + contents_start = old_contents; + + contents_end = strstr(contents_start, e->context->marker_end); + if (contents_end) + *contents_end = '\0'; + + stripped = strstrip(contents_start); + } else + stripped = strstrip(old_contents); + if (isempty(stripped)) + return 0; /* File is empty (has no real changes) */ + + /* Trim prefix and suffix, but ensure suffixed by single newline */ + new_contents = strjoin(stripped, "\n"); + if (!new_contents) + return log_oom(); + + if (streq(old_contents, new_contents)) /* Don't touch the file if the above didn't change a thing */ + return 1; /* Contents unchanged after stripping but has changes */ + + r = write_string_file(e->temp, new_contents, WRITE_STRING_FILE_CREATE | WRITE_STRING_FILE_TRUNCATE | WRITE_STRING_FILE_AVOID_NEWLINE); + if (r < 0) + return log_error_errno(r, "Failed to strip temporary file '%s': %m", e->temp); + + return 1; /* Contents have real changes and are changed after stripping */ +} + +int do_edit_files_and_install(EditFileContext *context) { + int r; + + assert(context); + + if (context->n_files == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Got no files to edit."); + + FOREACH_ARRAY(i, context->files, context->n_files) { + r = create_edit_temp_file(i); + if (r < 0) + return r; + } + + r = run_editor(context); + if (r < 0) + return r; + + 
/* State shared by all files that are edited in one go, see do_edit_files_and_install(). Release with
 * edit_file_context_done(). */
struct EditFileContext {
        EditFile *files;           /* array of files to edit, grown by edit_files_add() */
        size_t n_files;            /* number of valid entries in 'files' */
        const char *marker_start;  /* if set, written into the temporary file in front of the editable
                                    * contents; after editing only the text between the two markers
                                    * is kept. Not freed by edit_file_context_done(). */
        const char *marker_end;    /* counterpart of 'marker_start'; must be set whenever
                                    * 'marker_start' is set */
        bool remove_parent;        /* on cleanup, try to rmdir() the parent directory of each file
                                    * (ENOENT and ENOTEMPTY are ignored silently) */
        bool overwrite_with_origin; /* whether to always overwrite target with original file */
};
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "dirent-util.h" +#include "efi-api.h" +#include "efivars.h" +#include "fd-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "utf8.h" + +#if ENABLE_EFI + +#define LOAD_OPTION_ACTIVE 0x00000001 +#define MEDIA_DEVICE_PATH 0x04 +#define MEDIA_HARDDRIVE_DP 0x01 +#define MEDIA_FILEPATH_DP 0x04 +#define SIGNATURE_TYPE_GUID 0x02 +#define MBR_TYPE_EFI_PARTITION_TABLE_HEADER 0x02 +#define END_DEVICE_PATH_TYPE 0x7f +#define END_ENTIRE_DEVICE_PATH_SUBTYPE 0xff + +#define EFI_OS_INDICATIONS_BOOT_TO_FW_UI UINT64_C(0x0000000000000001) + +#define boot_option__contents \ + { \ + uint32_t attr; \ + uint16_t path_len; \ + uint16_t title[]; \ + } + +struct boot_option boot_option__contents; +struct boot_option__packed boot_option__contents _packed_; +assert_cc(offsetof(struct boot_option, title) == offsetof(struct boot_option__packed, title)); +/* sizeof(struct boot_option) != sizeof(struct boot_option__packed), so + * the *size* of the structure should not be used anywhere below. 
*/ + +struct drive_path { + uint32_t part_nr; + uint64_t part_start; + uint64_t part_size; + char signature[16]; + uint8_t mbr_type; + uint8_t signature_type; +} _packed_; + +#define device_path__contents \ + { \ + uint8_t type; \ + uint8_t sub_type; \ + uint16_t length; \ + union { \ + uint16_t path[0]; \ + struct drive_path drive; \ + }; \ + } + +struct device_path device_path__contents; +struct device_path__packed device_path__contents _packed_; +assert_cc(sizeof(struct device_path) == sizeof(struct device_path__packed)); + +int efi_reboot_to_firmware_supported(void) { + _cleanup_free_ void *v = NULL; + static int cache = -1; + uint64_t b; + size_t s; + int r; + + if (cache > 0) + return 0; + if (cache == 0) + return -EOPNOTSUPP; + + if (!is_efi_boot()) + goto not_supported; + + r = efi_get_variable(EFI_GLOBAL_VARIABLE(OsIndicationsSupported), NULL, &v, &s); + if (r == -ENOENT) + goto not_supported; /* variable doesn't exist? it's not supported then */ + if (r < 0) + return r; + if (s != sizeof(uint64_t)) + return -EINVAL; + + b = *(uint64_t*) v; + if (!(b & EFI_OS_INDICATIONS_BOOT_TO_FW_UI)) + goto not_supported; /* bit unset? it's not supported then */ + + cache = 1; + return 0; + +not_supported: + cache = 0; + return -EOPNOTSUPP; +} + +static int get_os_indications(uint64_t *ret) { + static struct stat cache_stat = {}; + _cleanup_free_ void *v = NULL; + static uint64_t cache; + struct stat new_stat; + size_t s; + int r; + + assert(ret); + + /* Let's verify general support first */ + r = efi_reboot_to_firmware_supported(); + if (r < 0) + return r; + + /* stat() the EFI variable, to see if the mtime changed. If it did we need to cache again. */ + if (stat(EFIVAR_PATH(EFI_GLOBAL_VARIABLE(OsIndications)), &new_stat) < 0) { + if (errno != ENOENT) + return -errno; + + /* Doesn't exist? 
Then we can exit early (also see below) */ + *ret = 0; + return 0; + + } else if (stat_inode_unmodified(&new_stat, &cache_stat)) { + /* inode didn't change, we can return the cached value */ + *ret = cache; + return 0; + } + + r = efi_get_variable(EFI_GLOBAL_VARIABLE(OsIndications), NULL, &v, &s); + if (r == -ENOENT) { + /* Some firmware implementations that do support OsIndications and report that with + * OsIndicationsSupported will remove the OsIndications variable when it is unset. Let's + * pretend it's 0 then, to hide this implementation detail. Note that this call will return + * -ENOENT then only if the support for OsIndications is missing entirely, as determined by + * efi_reboot_to_firmware_supported() above. */ + *ret = 0; + return 0; + } + if (r < 0) + return r; + if (s != sizeof(uint64_t)) + return -EINVAL; + + cache_stat = new_stat; + *ret = cache = *(uint64_t *)v; + return 0; +} + +int efi_get_reboot_to_firmware(void) { + int r; + uint64_t b; + + r = get_os_indications(&b); + if (r < 0) + return r; + + return !!(b & EFI_OS_INDICATIONS_BOOT_TO_FW_UI); +} + +int efi_set_reboot_to_firmware(bool value) { + int r; + uint64_t b, b_new; + + r = get_os_indications(&b); + if (r < 0) + return r; + + b_new = UPDATE_FLAG(b, EFI_OS_INDICATIONS_BOOT_TO_FW_UI, value); + + /* Avoid writing to efi vars store if we can due to firmware bugs. 
/* Determines how many bytes the NUL-terminated UTF-16 string at 's' occupies, *including* its
 * terminating 16-bit NUL, looking at no more than 'buf_len_bytes' bytes (a trailing odd byte is never
 * examined).
 *
 * Returns the size in bytes, or -EINVAL if no terminator is found within the buffer. */
static ssize_t utf16_size(const uint16_t *s, size_t buf_len_bytes) {
        const size_t n_units = buf_len_bytes / sizeof(uint16_t);

        for (size_t i = 0; i < n_units; i++)
                if (s[i] == 0)
                        return (i + 1) * sizeof(uint16_t);

        return -EINVAL; /* The terminator was not found */
}
/* Widens the NUL-terminated byte string 'src' into 16-bit units at 'dest', one unit per input byte,
 * and terminates the result with a 16-bit NUL. 'dest' must have room for strlen(src) + 1 units.
 *
 * No UTF-8 decoding takes place: every byte is copied with its unsigned value (0x00-0xFF). Going
 * through 'unsigned char' matters on platforms where plain 'char' is signed, as bytes >= 0x80 would
 * otherwise be sign-extended to 0xFFxx. */
static void to_utf16(uint16_t *dest, const char *src) {
        size_t i;

        for (i = 0; src[i] != '\0'; i++)
                dest[i] = (unsigned char) src[i];

        dest[i] = 0;
}
drive) + sizeof(struct drive_path) + + offsetof(struct device_path, path) + path_len + + offsetof(struct device_path, path); + to_utf16(option->title, title); + size = offsetof(struct boot_option, title) + title_len; + + /* partition info */ + devicep = (struct device_path *)(buf + size); + devicep->type = MEDIA_DEVICE_PATH; + devicep->sub_type = MEDIA_HARDDRIVE_DP; + devicep->length = offsetof(struct device_path, drive) + sizeof(struct drive_path); + memcpy(&devicep->drive.part_nr, &part, sizeof(uint32_t)); + memcpy(&devicep->drive.part_start, &pstart, sizeof(uint64_t)); + memcpy(&devicep->drive.part_size, &psize, sizeof(uint64_t)); + efi_id128_to_guid(part_uuid, devicep->drive.signature); + devicep->drive.mbr_type = MBR_TYPE_EFI_PARTITION_TABLE_HEADER; + devicep->drive.signature_type = SIGNATURE_TYPE_GUID; + size += devicep->length; + + /* path to loader */ + devicep = (struct device_path *)(buf + size); + devicep->type = MEDIA_DEVICE_PATH; + devicep->sub_type = MEDIA_FILEPATH_DP; + devicep->length = offsetof(struct device_path, path) + path_len; + to_utf16(devicep->path, path); + tilt_slashes(devicep->path); + size += devicep->length; + + /* end of path */ + devicep = (struct device_path *)(buf + size); + devicep->type = END_DEVICE_PATH_TYPE; + devicep->sub_type = END_ENTIRE_DEVICE_PATH_SUBTYPE; + devicep->length = offsetof(struct device_path, path); + size += devicep->length; + + xsprintf(variable, EFI_GLOBAL_VARIABLE_STR("Boot%04X"), id); + return efi_set_variable(variable, buf, size); +} + +int efi_remove_boot_option(uint16_t id) { + char variable[STRLEN(EFI_GLOBAL_VARIABLE_STR("Boot")) + 4 + 1]; + + if (!is_efi_boot()) + return -EOPNOTSUPP; + + xsprintf(variable, EFI_GLOBAL_VARIABLE_STR("Boot%04X"), id); + return efi_set_variable(variable, NULL, 0); +} + +int efi_get_boot_order(uint16_t **ret_order) { + _cleanup_free_ void *buf = NULL; + size_t l; + int r; + + assert(ret_order); + + if (!is_efi_boot()) + return -EOPNOTSUPP; + + r = 
/* Parses exactly four upper-case hexadecimal digits at 's' (most significant digit first).
 *
 * Returns the value (0..0xFFFF), or -EINVAL if any of the four characters is not in [0-9A-F].
 * Lower-case digits are rejected. Characters after the fourth are not looked at. */
static int boot_id_hex(const char s[static 4]) {
        int value = 0;

        assert(s);

        for (int pos = 0; pos < 4; pos++) {
                const char c = s[pos];
                int nibble;

                if (c >= '0' && c <= '9')
                        nibble = c - '0';
                else if (c >= 'A' && c <= 'F')
                        nibble = c - 'A' + 10;
                else
                        return -EINVAL;

                /* Shift in the next digit below the ones gathered so far */
                value = (value << 4) | nibble;
        }

        return value;
}
*/ + if (!is_efi_boot()) { + cache = 0; + return cache; + } + + /* Then, check if the ACPI table "TPM2" exists, which is the TPM2 event log table, see: + * https://trustedcomputinggroup.org/wp-content/uploads/TCG_ACPIGeneralSpecification_v1.20_r8.pdf + * This table exists whenever the firmware is hooked up to TPM2. */ + cache = access("/sys/firmware/acpi/tables/TPM2", F_OK) >= 0; + if (cache) + return cache; + + if (errno != ENOENT) + log_debug_errno(errno, "Unable to test whether /sys/firmware/acpi/tables/TPM2 exists, assuming it doesn't: %m"); + + /* As the last try, check if the EFI firmware provides the EFI_TCG2_FINAL_EVENTS_TABLE + * stored in EFI configuration table, see: + * https://trustedcomputinggroup.org/wp-content/uploads/EFI-Protocol-Specification-rev13-160330final.pdf + */ + cache = access("/sys/kernel/security/tpm0/binary_bios_measurements", F_OK) >= 0; + if (!cache && errno != ENOENT) + log_debug_errno(errno, "Unable to test whether /sys/kernel/security/tpm0/binary_bios_measurements exists, assuming it doesn't: %m"); + + return cache; +} + +#endif + +struct efi_guid { + uint32_t u1; + uint16_t u2; + uint16_t u3; + uint8_t u4[8]; +} _packed_; + +sd_id128_t efi_guid_to_id128(const void *guid) { + const struct efi_guid *uuid = ASSERT_PTR(guid); /* cast is safe, because struct efi_guid is packed */ + sd_id128_t id128; + + id128.bytes[0] = (uuid->u1 >> 24) & 0xff; + id128.bytes[1] = (uuid->u1 >> 16) & 0xff; + id128.bytes[2] = (uuid->u1 >> 8) & 0xff; + id128.bytes[3] = uuid->u1 & 0xff; + + id128.bytes[4] = (uuid->u2 >> 8) & 0xff; + id128.bytes[5] = uuid->u2 & 0xff; + + id128.bytes[6] = (uuid->u3 >> 8) & 0xff; + id128.bytes[7] = uuid->u3 & 0xff; + + memcpy(&id128.bytes[8], uuid->u4, sizeof(uuid->u4)); + + return id128; +} + +void efi_id128_to_guid(sd_id128_t id, void *ret_guid) { + assert(ret_guid); + + struct efi_guid uuid = { + .u1 = id.bytes[0] << 24 | id.bytes[1] << 16 | id.bytes[2] << 8 | id.bytes[3], + .u2 = id.bytes[4] << 8 | id.bytes[5], + .u3 = 
id.bytes[6] << 8 | id.bytes[7], + }; + memcpy(uuid.u4, id.bytes+8, sizeof(uuid.u4)); + memcpy(ret_guid, &uuid, sizeof(uuid)); +} diff --git a/src/shared/efi-api.h b/src/shared/efi-api.h new file mode 100644 index 0000000..09071b2 --- /dev/null +++ b/src/shared/efi-api.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "efivars-fundamental.h" +#include "efivars.h" +#include "string-util.h" + +/* Various calls for interfacing with EFI variables from the official UEFI specs. */ + +#if ENABLE_EFI + +int efi_reboot_to_firmware_supported(void); +int efi_get_reboot_to_firmware(void); +int efi_set_reboot_to_firmware(bool value); + +int efi_get_boot_option(uint16_t nr, char **ret_title, sd_id128_t *ret_part_uuid, char **ret_path, bool *ret_active); +int efi_add_boot_option(uint16_t id, const char *title, uint32_t part, uint64_t pstart, uint64_t psize, sd_id128_t part_uuid, const char *path); +int efi_remove_boot_option(uint16_t id); +int efi_get_boot_order(uint16_t **ret_order); +int efi_set_boot_order(const uint16_t *order, size_t n); +int efi_get_boot_options(uint16_t **ret_options); + +bool efi_has_tpm2(void); + +#else + +static inline int efi_reboot_to_firmware_supported(void) { + return -EOPNOTSUPP; +} + +static inline int efi_get_reboot_to_firmware(void) { + return -EOPNOTSUPP; +} + +static inline int efi_set_reboot_to_firmware(bool value) { + return -EOPNOTSUPP; +} + +static inline int efi_get_boot_option(uint16_t nr, char **ret_title, sd_id128_t *ret_part_uuid, char **ret_path, bool *ret_active) { + return -EOPNOTSUPP; +} + +static inline int efi_add_boot_option(uint16_t id, const char *title, uint32_t part, uint64_t pstart, uint64_t psize, sd_id128_t part_uuid, const char *path) { + return -EOPNOTSUPP; +} + +static inline int efi_remove_boot_option(uint16_t id) { + return -EOPNOTSUPP; +} + +static inline int efi_get_boot_order(uint16_t **ret_order) { + return -EOPNOTSUPP; +} + +static inline int efi_set_boot_order(const 
uint16_t *order, size_t n) { + return -EOPNOTSUPP; +} + +static inline int efi_get_boot_options(uint16_t **ret_options) { + return -EOPNOTSUPP; +} + +static inline bool efi_has_tpm2(void) { + return false; +} + +#endif + +static inline char *efi_tilt_backslashes(char *s) { + return string_replace_char(s, '\\', '/'); +} + +sd_id128_t efi_guid_to_id128(const void *guid); +void efi_id128_to_guid(sd_id128_t id, void *ret_guid); diff --git a/src/shared/efi-loader.c b/src/shared/efi-loader.c new file mode 100644 index 0000000..7d6bda9 --- /dev/null +++ b/src/shared/efi-loader.c @@ -0,0 +1,363 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "efi-api.h" +#include "efi-loader.h" +#include "env-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "strv.h" +#include "tpm2-pcr.h" +#include "utf8.h" + +#if ENABLE_EFI + +static int read_usec(const char *variable, usec_t *ret) { + _cleanup_free_ char *j = NULL; + uint64_t x = 0; + int r; + + assert(variable); + assert(ret); + + r = efi_get_variable_string(variable, &j); + if (r < 0) + return r; + + r = safe_atou64(j, &x); + if (r < 0) + return r; + + *ret = x; + return 0; +} + +int efi_loader_get_boot_usec(usec_t *ret_firmware, usec_t *ret_loader) { + uint64_t x, y; + int r; + + assert(ret_firmware); + assert(ret_loader); + + if (!is_efi_boot()) + return -EOPNOTSUPP; + + r = read_usec(EFI_LOADER_VARIABLE(LoaderTimeInitUSec), &x); + if (r < 0) + return log_debug_errno(r, "Failed to read LoaderTimeInitUSec: %m"); + + r = read_usec(EFI_LOADER_VARIABLE(LoaderTimeExecUSec), &y); + if (r < 0) + return log_debug_errno(r, "Failed to read LoaderTimeExecUSec: %m"); + + if (y == 0 || y < x || y - x > USEC_PER_HOUR) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), + "Bad LoaderTimeInitUSec=%"PRIu64", LoaderTimeExecUSec=%" PRIu64"; refusing.", + x, y); + + *ret_firmware = x; + *ret_loader = y; + return 0; +} + +int efi_loader_get_device_part_uuid(sd_id128_t *ret) { 
/* Reads the LoaderEntries EFI variable and returns the boot loader entry names it lists as a
 * newly allocated string vector in *ret.
 *
 * Names that fail efi_loader_entry_name_valid() are skipped (with a debug log message). If no valid
 * name is found *ret is set to NULL, since the vector is only allocated when the first entry is added.
 *
 * Returns 0 on success, -EOPNOTSUPP when not booted via EFI, -ENOMEM if decoding a name fails, or the
 * negative errno of efi_get_variable()/strv_consume(). */
int efi_loader_get_entries(char ***ret) {
        _cleanup_free_ char16_t *entries = NULL;
        _cleanup_strv_free_ char **l = NULL;
        size_t size;
        int r;

        assert(ret);

        if (!is_efi_boot())
                return -EOPNOTSUPP;

        r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderEntries), NULL, (void**) &entries, &size);
        if (r < 0)
                return r;

        /* The variable contains a series of individually NUL terminated UTF-16 strings. We gracefully
         * consider the final NUL byte optional (i.e. the last string may or may not end in a NUL byte).*/

        /* 'i' is the index of the 16-bit unit currently looked at, 'start' the index of the first unit
         * of the string we are currently in. */
        for (size_t i = 0, start = 0;; i++) {
                _cleanup_free_ char *decoded = NULL;
                bool end;

                /* Is this the end of the variable's data? */
                end = i * sizeof(char16_t) >= size;

                /* Are we in the middle of a string? (i.e. not at the end of the variable, nor at a NUL terminator?) If
                 * so, let's go to the next entry. */
                if (!end && entries[i] != 0)
                        continue;

                /* Empty string at the end of variable? That's the trailer, we are done (i.e. we have a final
                 * NUL terminator). */
                if (end && start == i)
                        break;

                /* We reached the end of a string, let's decode it into UTF-8 */
                decoded = utf16_to_utf8(entries + start, (i - start) * sizeof(char16_t));
                if (!decoded)
                        return -ENOMEM;

                if (efi_loader_entry_name_valid(decoded)) {
                        /* Ownership of the string moves into the vector */
                        r = strv_consume(&l, TAKE_PTR(decoded));
                        if (r < 0)
                                return r;
                } else
                        log_debug("Ignoring invalid loader entry '%s'.", decoded);

                /* Exit the loop if we reached the end of the variable (i.e. we do not have a final NUL
                 * terminator) */
                if (end)
                        break;

                /* Continue after the NUL byte */
                start = i + 1;
        }

        *ret = TAKE_PTR(l);
        return 0;
}
*/ + + *ret = EFI_LOADER_FEATURE_CONFIG_TIMEOUT | + EFI_LOADER_FEATURE_ENTRY_DEFAULT | + EFI_LOADER_FEATURE_ENTRY_ONESHOT; + + return 0; + } + + /* No features supported */ + *ret = 0; + return 0; + } + if (r < 0) + return r; + + if (s != sizeof(uint64_t)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "LoaderFeatures EFI variable doesn't have the right size."); + + memcpy(ret, v, sizeof(uint64_t)); + return 0; +} + +int efi_stub_get_features(uint64_t *ret) { + _cleanup_free_ void *v = NULL; + size_t s; + int r; + + assert(ret); + + if (!is_efi_boot()) { + *ret = 0; + return 0; + } + + r = efi_get_variable(EFI_LOADER_VARIABLE(StubFeatures), NULL, &v, &s); + if (r == -ENOENT) { + _cleanup_free_ char *info = NULL; + + /* The new (v252+) StubFeatures variable is not supported, let's see if it's systemd-stub at all */ + r = efi_get_variable_string(EFI_LOADER_VARIABLE(StubInfo), &info); + if (r < 0) { + if (r != -ENOENT) + return r; + + /* Variable not set, definitely means not systemd-stub */ + + } else if (first_word(info, "systemd-stub")) { + + /* An older systemd-stub version. Let's hardcode the feature set, since it was pretty + * static in all its versions. */ + + *ret = EFI_STUB_FEATURE_REPORT_BOOT_PARTITION; + return 0; + } + + /* No features supported */ + *ret = 0; + return 0; + } + if (r < 0) + return r; + + if (s != sizeof(uint64_t)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "StubFeatures EFI variable doesn't have the right size."); + + memcpy(ret, v, sizeof(uint64_t)); + return 0; +} + +int efi_measured_uki(int log_level) { + _cleanup_free_ char *pcr_string = NULL; + static int cached = -1; + unsigned pcr_nr; + int r; + + if (cached >= 0) + return cached; + + /* Checks if we are booted on a kernel with sd-stub which measured the kernel into PCR 11 on a TPM2 + * chip. Or in other words, if we are running on a TPM enabled UKI. (TPM 1.2 situations are ignored.) + * + * Returns == 0 and > 0 depending on the result of the test. 
Returns -EREMOTE if we detected a stub + * being used, but it measured things into a different PCR than we are configured for in + * userspace. (i.e. we expect PCR 11 being used for this by both sd-stub and us) */ + + r = getenv_bool_secure("SYSTEMD_FORCE_MEASURE"); /* Give user a chance to override the variable test, + * for debugging purposes */ + if (r >= 0) + return (cached = r); + if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_FORCE_MEASURE, ignoring: %m"); + + if (!efi_has_tpm2()) + return (cached = 0); + + r = efi_get_variable_string(EFI_LOADER_VARIABLE(StubPcrKernelImage), &pcr_string); + if (r == -ENOENT) + return (cached = 0); + if (r < 0) + return log_full_errno(log_level, r, + "Failed to get StubPcrKernelImage EFI variable: %m"); + + r = safe_atou(pcr_string, &pcr_nr); + if (r < 0) + return log_full_errno(log_level, r, + "Failed to parse StubPcrKernelImage EFI variable: %s", pcr_string); + if (pcr_nr != TPM2_PCR_KERNEL_BOOT) + return log_full_errno(log_level, SYNTHETIC_ERRNO(EREMOTE), + "Kernel stub measured kernel image into PCR %u, which is different than expected %i.", + pcr_nr, TPM2_PCR_KERNEL_BOOT); + + return (cached = 1); +} + +int efi_loader_get_config_timeout_one_shot(usec_t *ret) { + _cleanup_free_ char *v = NULL; + static struct stat cache_stat = {}; + struct stat new_stat; + static usec_t cache; + uint64_t sec; + int r; + + assert(ret); + + /* stat() the EFI variable, to see if the mtime changed. If it did, we need to cache again. 
*/ + if (stat(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot)), &new_stat) < 0) + return -errno; + + if (stat_inode_unmodified(&new_stat, &cache_stat)) { + *ret = cache; + return 0; + } + + r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot), &v); + if (r < 0) + return r; + + r = safe_atou64(v, &sec); + if (r < 0) + return r; + if (sec > USEC_INFINITY / USEC_PER_SEC) + return -ERANGE; + + cache_stat = new_stat; + *ret = cache = sec * USEC_PER_SEC; /* return in μs */ + return 0; +} + +int efi_loader_update_entry_one_shot_cache(char **cache, struct stat *cache_stat) { + _cleanup_free_ char *v = NULL; + struct stat new_stat; + int r; + + assert(cache); + assert(cache_stat); + + /* stat() the EFI variable, to see if the mtime changed. If it did we need to cache again. */ + if (stat(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderEntryOneShot)), &new_stat) < 0) + return -errno; + + if (stat_inode_unmodified(&new_stat, cache_stat)) + return 0; + + r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntryOneShot), &v); + if (r < 0) + return r; + + if (!efi_loader_entry_name_valid(v)) + return -EINVAL; + + *cache_stat = new_stat; + free_and_replace(*cache, v); + + return 0; +} + +#endif + +bool efi_loader_entry_name_valid(const char *s) { + if (!filename_is_valid(s)) /* Make sure entry names fit in filenames */ + return false; + + return in_charset(s, ALPHANUMERICAL "+-_."); +} diff --git a/src/shared/efi-loader.h b/src/shared/efi-loader.h new file mode 100644 index 0000000..c878eea --- /dev/null +++ b/src/shared/efi-loader.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "efivars-fundamental.h" +#include "efivars.h" + +/* Various calls that interface with EFI variables implementing https://systemd.io/BOOT_LOADER_INTERFACE */ + +#if ENABLE_EFI + +int efi_loader_get_device_part_uuid(sd_id128_t *ret); +int efi_loader_get_boot_usec(usec_t *ret_firmware, usec_t *ret_loader); + +int 
efi_loader_get_entries(char ***ret); + +int efi_loader_get_features(uint64_t *ret); +int efi_stub_get_features(uint64_t *ret); + +int efi_measured_uki(int log_level); + +int efi_loader_get_config_timeout_one_shot(usec_t *ret); +int efi_loader_update_entry_one_shot_cache(char **cache, struct stat *cache_stat); + +#else + +static inline int efi_loader_get_device_part_uuid(sd_id128_t *u) { + return -EOPNOTSUPP; +} + +static inline int efi_loader_get_boot_usec(usec_t *firmware, usec_t *loader) { + return -EOPNOTSUPP; +} + +static inline int efi_loader_get_entries(char ***ret) { + return -EOPNOTSUPP; +} + +static inline int efi_loader_get_features(uint64_t *ret) { + return -EOPNOTSUPP; +} + +static inline int efi_stub_get_features(uint64_t *ret) { + return -EOPNOTSUPP; +} + +static inline int efi_measured_uki(int log_level) { + return log_full_errno(log_level, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Compiled without support for EFI"); +} + +static inline int efi_loader_get_config_timeout_one_shot(usec_t *ret) { + return -EOPNOTSUPP; +} + +static inline int efi_loader_update_entry_one_shot_cache(char **cache, struct stat *cache_stat) { + return -EOPNOTSUPP; +} + +#endif + +bool efi_loader_entry_name_valid(const char *s); diff --git a/src/shared/elf-util.c b/src/shared/elf-util.c new file mode 100644 index 0000000..24ed16e --- /dev/null +++ b/src/shared/elf-util.c @@ -0,0 +1,899 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_ELFUTILS + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dlfcn-util.h" +#include "elf-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fileio.h" +#include "fd-util.h" +#include "format-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "macro.h" +#include "memstream-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "string-util.h" + +#define FRAMES_MAX 64 +#define THREADS_MAX 64 +#define ELF_PACKAGE_METADATA_ID 0xcafe1a7e 
+ +/* The amount of data we're willing to write to each of the output pipes. */ +#define COREDUMP_PIPE_MAX (1024*1024U) + +static void *dw_dl = NULL; +static void *elf_dl = NULL; + +/* libdw symbols */ +Dwarf_Attribute *(*sym_dwarf_attr_integrate)(Dwarf_Die *, unsigned int, Dwarf_Attribute *); +const char *(*sym_dwarf_diename)(Dwarf_Die *); +const char *(*sym_dwarf_formstring)(Dwarf_Attribute *); +int (*sym_dwarf_getscopes)(Dwarf_Die *, Dwarf_Addr, Dwarf_Die **); +int (*sym_dwarf_getscopes_die)(Dwarf_Die *, Dwarf_Die **); +Elf *(*sym_dwelf_elf_begin)(int); +#if HAVE_DWELF_ELF_E_MACHINE_STRING +const char *(*sym_dwelf_elf_e_machine_string)(int); +#endif +ssize_t (*sym_dwelf_elf_gnu_build_id)(Elf *, const void **); +int (*sym_dwarf_tag)(Dwarf_Die *); +Dwfl_Module *(*sym_dwfl_addrmodule)(Dwfl *, Dwarf_Addr); +Dwfl *(*sym_dwfl_begin)(const Dwfl_Callbacks *); +int (*sym_dwfl_build_id_find_elf)(Dwfl_Module *, void **, const char *, Dwarf_Addr, char **, Elf **); +int (*sym_dwfl_core_file_attach)(Dwfl *, Elf *); +int (*sym_dwfl_core_file_report)(Dwfl *, Elf *, const char *); +void (*sym_dwfl_end)(Dwfl *); +const char *(*sym_dwfl_errmsg)(int); +int (*sym_dwfl_errno)(void); +bool (*sym_dwfl_frame_pc)(Dwfl_Frame *, Dwarf_Addr *, bool *); +ptrdiff_t (*sym_dwfl_getmodules)(Dwfl *, int (*)(Dwfl_Module *, void **, const char *, Dwarf_Addr, void *), void *, ptrdiff_t); +int (*sym_dwfl_getthreads)(Dwfl *, int (*)(Dwfl_Thread *, void *), void *); +Dwarf_Die *(*sym_dwfl_module_addrdie)(Dwfl_Module *, Dwarf_Addr, Dwarf_Addr *); +const char *(*sym_dwfl_module_addrname)(Dwfl_Module *, GElf_Addr); +int (*sym_dwfl_module_build_id)(Dwfl_Module *, const unsigned char **, GElf_Addr *); +Elf *(*sym_dwfl_module_getelf)(Dwfl_Module *, GElf_Addr *); +const char *(*sym_dwfl_module_info)(Dwfl_Module *, void ***, Dwarf_Addr *, Dwarf_Addr *, Dwarf_Addr *, Dwarf_Addr *, const char **, const char **); +int (*sym_dwfl_offline_section_address)(Dwfl_Module *, void **, const char *, Dwarf_Addr, const char 
*, GElf_Word, const GElf_Shdr *, Dwarf_Addr *); +int (*sym_dwfl_report_end)(Dwfl *, int (*)(Dwfl_Module *, void *, const char *, Dwarf_Addr, void *), void *); +int (*sym_dwfl_standard_find_debuginfo)(Dwfl_Module *, void **, const char *, Dwarf_Addr, const char *, const char *, GElf_Word, char **); +int (*sym_dwfl_thread_getframes)(Dwfl_Thread *, int (*)(Dwfl_Frame *, void *), void *); +pid_t (*sym_dwfl_thread_tid)(Dwfl_Thread *); + +/* libelf symbols */ +Elf *(*sym_elf_begin)(int, Elf_Cmd, Elf *); +int (*sym_elf_end)(Elf *); +Elf_Data *(*sym_elf_getdata_rawchunk)(Elf *, int64_t, size_t, Elf_Type); +GElf_Ehdr *(*sym_gelf_getehdr)(Elf *, GElf_Ehdr *); +int (*sym_elf_getphdrnum)(Elf *, size_t *); +const char *(*sym_elf_errmsg)(int); +int (*sym_elf_errno)(void); +Elf *(*sym_elf_memory)(char *, size_t); +unsigned int (*sym_elf_version)(unsigned int); +GElf_Phdr *(*sym_gelf_getphdr)(Elf *, int, GElf_Phdr *); +size_t (*sym_gelf_getnote)(Elf_Data *, size_t, GElf_Nhdr *, size_t *, size_t *); + +int dlopen_dw(void) { + int r; + + r = dlopen_many_sym_or_warn( + &dw_dl, "libdw.so.1", LOG_DEBUG, + DLSYM_ARG(dwarf_getscopes), + DLSYM_ARG(dwarf_getscopes_die), + DLSYM_ARG(dwarf_tag), + DLSYM_ARG(dwarf_attr_integrate), + DLSYM_ARG(dwarf_formstring), + DLSYM_ARG(dwarf_diename), + DLSYM_ARG(dwelf_elf_gnu_build_id), + DLSYM_ARG(dwelf_elf_begin), +#if HAVE_DWELF_ELF_E_MACHINE_STRING + DLSYM_ARG(dwelf_elf_e_machine_string), +#endif + DLSYM_ARG(dwfl_addrmodule), + DLSYM_ARG(dwfl_frame_pc), + DLSYM_ARG(dwfl_module_addrdie), + DLSYM_ARG(dwfl_module_addrname), + DLSYM_ARG(dwfl_module_info), + DLSYM_ARG(dwfl_module_build_id), + DLSYM_ARG(dwfl_module_getelf), + DLSYM_ARG(dwfl_begin), + DLSYM_ARG(dwfl_core_file_report), + DLSYM_ARG(dwfl_report_end), + DLSYM_ARG(dwfl_getmodules), + DLSYM_ARG(dwfl_core_file_attach), + DLSYM_ARG(dwfl_end), + DLSYM_ARG(dwfl_errmsg), + DLSYM_ARG(dwfl_errno), + DLSYM_ARG(dwfl_build_id_find_elf), + DLSYM_ARG(dwfl_standard_find_debuginfo), + 
DLSYM_ARG(dwfl_thread_tid), + DLSYM_ARG(dwfl_thread_getframes), + DLSYM_ARG(dwfl_getthreads), + DLSYM_ARG(dwfl_offline_section_address)); + if (r <= 0) + return r; + + return 1; +} + +int dlopen_elf(void) { + int r; + + r = dlopen_many_sym_or_warn( + &elf_dl, "libelf.so.1", LOG_DEBUG, + DLSYM_ARG(elf_begin), + DLSYM_ARG(elf_end), + DLSYM_ARG(elf_getphdrnum), + DLSYM_ARG(elf_getdata_rawchunk), + DLSYM_ARG(elf_errmsg), + DLSYM_ARG(elf_errno), + DLSYM_ARG(elf_memory), + DLSYM_ARG(elf_version), + DLSYM_ARG(gelf_getehdr), + DLSYM_ARG(gelf_getphdr), + DLSYM_ARG(gelf_getnote)); + if (r <= 0) + return r; + + return 1; +} + +typedef struct StackContext { + MemStream m; + Dwfl *dwfl; + Elf *elf; + unsigned n_thread; + unsigned n_frame; + JsonVariant **package_metadata; + Set **modules; +} StackContext; + +static void stack_context_done(StackContext *c) { + assert(c); + + memstream_done(&c->m); + + if (c->dwfl) { + sym_dwfl_end(c->dwfl); + c->dwfl = NULL; + } + + if (c->elf) { + sym_elf_end(c->elf); + c->elf = NULL; + } +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(Elf *, sym_elf_end, NULL); + +static int frame_callback(Dwfl_Frame *frame, void *userdata) { + StackContext *c = ASSERT_PTR(userdata); + Dwarf_Addr pc, pc_adjusted; + const char *fname = NULL, *symbol = NULL; + Dwfl_Module *module; + bool is_activation; + uint64_t module_offset = 0; + + assert(frame); + + if (c->n_frame >= FRAMES_MAX) + return DWARF_CB_ABORT; + + if (!sym_dwfl_frame_pc(frame, &pc, &is_activation)) + return DWARF_CB_ABORT; + + pc_adjusted = pc - (is_activation ? 
0 : 1); + + module = sym_dwfl_addrmodule(c->dwfl, pc_adjusted); + if (module) { + Dwarf_Addr start, bias = 0; + Dwarf_Die *cudie; + + cudie = sym_dwfl_module_addrdie(module, pc_adjusted, &bias); + if (cudie) { + _cleanup_free_ Dwarf_Die *scopes = NULL; + int n; + + n = sym_dwarf_getscopes(cudie, pc_adjusted - bias, &scopes); + if (n > 0) + for (Dwarf_Die *s = scopes; s && s < scopes + n; s++) { + Dwarf_Attribute *a, space; + + if (!IN_SET(sym_dwarf_tag(s), DW_TAG_subprogram, DW_TAG_inlined_subroutine, DW_TAG_entry_point)) + continue; + + a = sym_dwarf_attr_integrate(s, DW_AT_MIPS_linkage_name, &space); + if (!a) + a = sym_dwarf_attr_integrate(s, DW_AT_linkage_name, &space); + if (a) + symbol = sym_dwarf_formstring(a); + if (!symbol) + symbol = sym_dwarf_diename(s); + + if (symbol) + break; + } + } + + if (!symbol) + symbol = sym_dwfl_module_addrname(module, pc_adjusted); + + fname = sym_dwfl_module_info(module, NULL, &start, NULL, NULL, NULL, NULL, NULL); + module_offset = pc - start; + } + + if (c->m.f) + fprintf(c->m.f, "#%-2u 0x%016" PRIx64 " %s (%s + 0x%" PRIx64 ")\n", c->n_frame, (uint64_t) pc, strna(symbol), strna(fname), module_offset); + c->n_frame++; + + return DWARF_CB_OK; +} + +static int thread_callback(Dwfl_Thread *thread, void *userdata) { + StackContext *c = ASSERT_PTR(userdata); + pid_t tid; + + assert(thread); + + if (c->n_thread >= THREADS_MAX) + return DWARF_CB_ABORT; + + if (c->n_thread != 0 && c->m.f) + fputc('\n', c->m.f); + + c->n_frame = 0; + + if (c->m.f) { + tid = sym_dwfl_thread_tid(thread); + fprintf(c->m.f, "Stack trace of thread " PID_FMT ":\n", tid); + } + + if (sym_dwfl_thread_getframes(thread, frame_callback, c) < 0) + return DWARF_CB_ABORT; + + c->n_thread++; + + return DWARF_CB_OK; +} + +static char* build_package_reference( + const char *type, + const char *name, + const char *version, + const char *arch) { + + /* Construct an identifier for a specific version of the package. 
The syntax is most suitable for + * rpm: the resulting string can be used directly in queries and rpm/dnf/yum commands. For dpkg and + * other systems, it might not be usable directly, but users should still be able to figure out the + * meaning. + */ + + return strjoin(type ?: "package", + " ", + name, + + version ? "-" : "", + strempty(version), + + /* arch is meaningful even without version, so always print it */ + arch ? "." : "", + strempty(arch)); +} + +static void report_module_metadata(StackContext *c, const char *name, JsonVariant *metadata) { + assert(c); + assert(name); + + if (!c->m.f) + return; + + fprintf(c->m.f, "Module %s", name); + + if (metadata) { + const char + *build_id = json_variant_string(json_variant_by_key(metadata, "buildId")), + *type = json_variant_string(json_variant_by_key(metadata, "type")), + *package = json_variant_string(json_variant_by_key(metadata, "name")), + *version = json_variant_string(json_variant_by_key(metadata, "version")), + *arch = json_variant_string(json_variant_by_key(metadata, "architecture")); + + if (package) { + /* Version/architecture is only meaningful with a package name. + * Skip the detailed fields if package is unknown. 
*/ + _cleanup_free_ char *id = build_package_reference(type, package, version, arch); + fprintf(c->m.f, " from %s", strnull(id)); + } + + if (build_id && !(package && version)) + fprintf(c->m.f, ", build-id=%s", build_id); + } + + fputs("\n", c->m.f); +} + +static int parse_package_metadata(const char *name, JsonVariant *id_json, Elf *elf, bool *ret_interpreter_found, StackContext *c) { + bool interpreter_found = false; + size_t n_program_headers; + int r; + + assert(name); + assert(elf); + assert(c); + + /* When iterating over PT_LOAD we will visit modules more than once */ + if (set_contains(*c->modules, name)) + return 0; + + r = sym_elf_getphdrnum(elf, &n_program_headers); + if (r < 0) /* Not the handle we are looking for - that's ok, skip it */ + return 0; + + /* Iterate over all program headers in that ELF object. These will have been copied by + * the kernel verbatim when the core file is generated. */ + for (size_t i = 0; i < n_program_headers; ++i) { + GElf_Phdr mem, *program_header; + GElf_Nhdr note_header; + Elf_Data *data; + + /* Package metadata is in PT_NOTE headers. */ + program_header = sym_gelf_getphdr(elf, i, &mem); + if (!program_header || (program_header->p_type != PT_NOTE && program_header->p_type != PT_INTERP)) + continue; + + if (program_header->p_type == PT_INTERP) { + interpreter_found = true; + continue; + } + + /* Fortunately there is an iterator we can use to walk over the + * elements of a PT_NOTE program header. We are interested in the + * note with type. 
*/ + data = sym_elf_getdata_rawchunk(elf, + program_header->p_offset, + program_header->p_filesz, + ELF_T_NHDR); + if (!data) + continue; + + for (size_t note_offset = 0, name_offset, desc_offset; + note_offset < data->d_size && + (note_offset = sym_gelf_getnote(data, note_offset, &note_header, &name_offset, &desc_offset)) > 0;) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL; + const char *payload = (const char *)data->d_buf + desc_offset; + + if (note_header.n_namesz == 0 || note_header.n_descsz == 0) + continue; + + /* Package metadata might have different owners, but the + * magic ID is always the same. */ + if (note_header.n_type != ELF_PACKAGE_METADATA_ID) + continue; + + _cleanup_free_ char *payload_0suffixed = NULL; + assert(note_offset > desc_offset); + size_t payload_len = note_offset - desc_offset; + + /* If we are lucky and the payload is NUL-padded, we don't need to copy the string. + * But if happens to go all the way until the end of the buffer, make a copy. */ + if (payload[payload_len-1] != '\0') { + payload_0suffixed = memdup_suffix0(payload, payload_len); + if (!payload_0suffixed) + return log_oom(); + payload = payload_0suffixed; + } + + r = json_parse(payload, 0, &v, NULL, NULL); + if (r < 0) { + _cleanup_free_ char *esc = cescape(payload); + return log_error_errno(r, "json_parse on \"%s\" failed: %m", strnull(esc)); + } + + /* If we have a build-id, merge it in the same JSON object so that it appears all + * nicely together in the logs/metadata. */ + if (id_json) { + r = json_variant_merge_object(&v, id_json); + if (r < 0) + return log_error_errno(r, "json_variant_merge of package meta with buildId failed: %m"); + } + + /* Pretty-print to the buffer, so that the metadata goes as plaintext in the + * journal. */ + report_module_metadata(c, name, v); + + /* Then we build a new object using the module name as the key, and merge it + * with the previous parses, so that in the end it all fits together in a single + * JSON blob. 
*/ + r = json_build(&w, JSON_BUILD_OBJECT(JSON_BUILD_PAIR(name, JSON_BUILD_VARIANT(v)))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON object: %m"); + + r = json_variant_merge_object(c->package_metadata, w); + if (r < 0) + return log_error_errno(r, "json_variant_merge of package meta with buildId failed: %m"); + + /* Finally stash the name, so we avoid double visits. */ + r = set_put_strdup(c->modules, name); + if (r < 0) + return log_error_errno(r, "set_put_strdup failed: %m"); + + if (ret_interpreter_found) + *ret_interpreter_found = interpreter_found; + + return 1; + } + } + + if (ret_interpreter_found) + *ret_interpreter_found = interpreter_found; + + /* Didn't find package metadata for this module - that's ok, just go to the next. */ + return 0; +} + +/* Get the build-id out of an ELF object or a dwarf core module. */ +static int parse_buildid(Dwfl_Module *mod, Elf *elf, const char *name, StackContext *c, JsonVariant **ret_id_json) { + _cleanup_(json_variant_unrefp) JsonVariant *id_json = NULL; + const unsigned char *id; + GElf_Addr id_vaddr; + ssize_t id_len; + int r; + + assert(mod || elf); + assert(name); + assert(c); + + if (mod) + id_len = sym_dwfl_module_build_id(mod, &id, &id_vaddr); + else + id_len = sym_dwelf_elf_gnu_build_id(elf, (const void **)&id); + if (id_len <= 0) { + /* If we don't find a build-id, note it in the journal message, and try + * anyway to find the package metadata. It's unlikely to have the latter + * without the former, but there's no hard rule. */ + if (c->m.f) + fprintf(c->m.f, "Module %s without build-id.\n", name); + } else { + /* We will later parse package metadata json and pass it to our caller. Prepare the + * build-id in json format too, so that it can be appended and parsed cleanly. It + * will then be added as metadata to the journal message with the stack trace. 
*/ + r = json_build(&id_json, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("buildId", JSON_BUILD_HEX(id, id_len)))); + if (r < 0) + return log_error_errno(r, "json_build on buildId failed: %m"); + } + + if (ret_id_json) + *ret_id_json = TAKE_PTR(id_json); + + return 0; +} + +static int module_callback(Dwfl_Module *mod, void **userdata, const char *name, Dwarf_Addr start, void *arg) { + _cleanup_(json_variant_unrefp) JsonVariant *id_json = NULL; + StackContext *c = ASSERT_PTR(arg); + size_t n_program_headers; + GElf_Addr bias; + int r; + Elf *elf; + + assert(mod); + + if (!name) + name = "(unnamed)"; /* For logging purposes */ + + /* We are iterating on each "module", which is what dwfl calls ELF objects contained in the + * core file, and extracting the build-id first and then the package metadata. + * We proceed in a best-effort fashion - not all ELF objects might contain both or either. + * The build-id is easy, as libdwfl parses it during the sym_dwfl_core_file_report() call and + * stores it separately in an internal library struct. */ + r = parse_buildid(mod, NULL, name, c, &id_json); + if (r < 0) + return DWARF_CB_ABORT; + + /* The .note.package metadata is more difficult. From the module, we need to get a reference + * to the ELF object first. We might be lucky and just get it from elfutils. */ + elf = sym_dwfl_module_getelf(mod, &bias); + if (elf) { + r = parse_package_metadata(name, id_json, elf, NULL, c); + if (r < 0) + return DWARF_CB_ABORT; + if (r > 0) + return DWARF_CB_OK; + } else + elf = c->elf; + + /* We did not get the ELF object, or it's just a reference to the core. That is likely + * because we didn't get direct access to the executable, and the version of elfutils does + * not yet support parsing it out of the core file directly. + * So fallback to manual extraction - get the PT_LOAD section from the core, + * and if it's the right one we can interpret it as an Elf object, and parse + * its notes manually. 
*/ + + r = sym_elf_getphdrnum(elf, &n_program_headers); + if (r < 0) { + log_warning("Could not parse number of program headers from core file: %s", + sym_elf_errmsg(-1)); /* -1 retrieves the most recent error */ + report_module_metadata(c, name, id_json); + + return DWARF_CB_OK; + } + + for (size_t i = 0; i < n_program_headers; ++i) { + GElf_Phdr mem, *program_header; + Elf_Data *data; + GElf_Addr end_of_segment; + + /* The core file stores the ELF files in the PT_LOAD segment. */ + program_header = sym_gelf_getphdr(elf, i, &mem); + if (!program_header || program_header->p_type != PT_LOAD) + continue; + + /* Check that the end of segment is a valid address. */ + if (__builtin_add_overflow(program_header->p_vaddr, program_header->p_memsz, &end_of_segment)) { + log_error("Abort due to corrupted core dump, end of segment address %#zx + %#zx overflows", (size_t)program_header->p_vaddr, (size_t)program_header->p_memsz); + return DWARF_CB_ABORT; + } + + /* This PT_LOAD segment doesn't contain the start address, so it can't be the module we are looking for. */ + if (start < program_header->p_vaddr || start >= end_of_segment) + continue; + + /* Now get a usable Elf reference, and parse the notes from it. 
*/ + data = sym_elf_getdata_rawchunk(elf, + program_header->p_offset, + program_header->p_filesz, + ELF_T_NHDR); + if (!data) + continue; + + _cleanup_(sym_elf_endp) Elf *memelf = sym_elf_memory(data->d_buf, data->d_size); + if (!memelf) + continue; + r = parse_package_metadata(name, id_json, memelf, NULL, c); + if (r < 0) + return DWARF_CB_ABORT; + if (r > 0) + break; + } + + return DWARF_CB_OK; +} + +static int parse_core(int fd, const char *executable, char **ret, JsonVariant **ret_package_metadata) { + + const Dwfl_Callbacks callbacks = { + .find_elf = sym_dwfl_build_id_find_elf, + .section_address = sym_dwfl_offline_section_address, + .find_debuginfo = sym_dwfl_standard_find_debuginfo, + }; + + _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL; + _cleanup_set_free_ Set *modules = NULL; + _cleanup_(stack_context_done) StackContext c = { + .package_metadata = &package_metadata, + .modules = &modules, + }; + int r; + + assert(fd >= 0); + + if (lseek(fd, 0, SEEK_SET) < 0) + return log_warning_errno(errno, "Failed to seek to beginning of the core file: %m"); + + if (ret && !memstream_init(&c.m)) + return log_oom(); + + sym_elf_version(EV_CURRENT); + + c.elf = sym_elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!c.elf) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, elf_begin() failed: %s", sym_elf_errmsg(sym_elf_errno())); + + c.dwfl = sym_dwfl_begin(&callbacks); + if (!c.dwfl) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_begin() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno())); + + if (sym_dwfl_core_file_report(c.dwfl, c.elf, executable) < 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_core_file_report() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno())); + + if (sym_dwfl_report_end(c.dwfl, NULL, NULL) != 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_report_end() failed: %s", 
sym_dwfl_errmsg(sym_dwfl_errno())); + + if (sym_dwfl_getmodules(c.dwfl, &module_callback, &c, 0) < 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_getmodules() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno())); + + if (sym_dwfl_core_file_attach(c.dwfl, c.elf) < 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_core_file_attach() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno())); + + if (sym_dwfl_getthreads(c.dwfl, thread_callback, &c) < 0) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_getthreads() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno())); + + if (ret) { + r = memstream_finalize(&c.m, ret, NULL); + if (r < 0) + return log_warning_errno(r, "Could not parse core file, flushing file buffer failed: %m"); + } + + if (ret_package_metadata) + *ret_package_metadata = TAKE_PTR(package_metadata); + + return 0; +} + +static int parse_elf(int fd, const char *executable, char **ret, JsonVariant **ret_package_metadata) { + _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL, *elf_metadata = NULL; + _cleanup_set_free_ Set *modules = NULL; + _cleanup_(stack_context_done) StackContext c = { + .package_metadata = &package_metadata, + .modules = &modules, + }; + const char *elf_type; + GElf_Ehdr elf_header; + int r; + + assert(fd >= 0); + + if (lseek(fd, 0, SEEK_SET) < 0) + return log_warning_errno(errno, "Failed to seek to beginning of the ELF file: %m"); + + if (ret && !memstream_init(&c.m)) + return log_oom(); + + sym_elf_version(EV_CURRENT); + + c.elf = sym_elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!c.elf) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse ELF file, elf_begin() failed: %s", sym_elf_errmsg(sym_elf_errno())); + + if (!sym_gelf_getehdr(c.elf, &elf_header)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse ELF file, gelf_getehdr() failed: %s", sym_elf_errmsg(sym_elf_errno())); + + if 
(elf_header.e_type == ET_CORE) { + _cleanup_free_ char *out = NULL; + + r = parse_core(fd, executable, ret ? &out : NULL, &package_metadata); + if (r < 0) + return log_warning_errno(r, "Failed to inspect core file: %m"); + + if (out) + fprintf(c.m.f, "%s", out); + + elf_type = "coredump"; + } else { + _cleanup_(json_variant_unrefp) JsonVariant *id_json = NULL; + const char *e = executable ?: "(unnamed)"; + bool interpreter_found = false; + + r = parse_buildid(NULL, c.elf, e, &c, &id_json); + if (r < 0) + return log_warning_errno(r, "Failed to parse build-id of ELF file: %m"); + + r = parse_package_metadata(e, id_json, c.elf, &interpreter_found, &c); + if (r < 0) + return log_warning_errno(r, "Failed to parse package metadata of ELF file: %m"); + + /* If we found a build-id and nothing else, return at least that. */ + if (!package_metadata && id_json) { + r = json_build(&package_metadata, JSON_BUILD_OBJECT(JSON_BUILD_PAIR(e, JSON_BUILD_VARIANT(id_json)))); + if (r < 0) + return log_warning_errno(r, "Failed to build JSON object: %m"); + } + + if (interpreter_found) + elf_type = "executable"; + else + elf_type = "library"; + } + + /* Note that e_type is always DYN for both executables and libraries, so we can't tell them apart from the header, + * but we will search for the PT_INTERP section when parsing the metadata. 
*/ + r = json_build(&elf_metadata, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("elfType", JSON_BUILD_STRING(elf_type)))); + if (r < 0) + return log_warning_errno(r, "Failed to build JSON object: %m"); + +#if HAVE_DWELF_ELF_E_MACHINE_STRING + const char *elf_architecture = sym_dwelf_elf_e_machine_string(elf_header.e_machine); + if (elf_architecture) { + _cleanup_(json_variant_unrefp) JsonVariant *json_architecture = NULL; + + r = json_build(&json_architecture, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("elfArchitecture", JSON_BUILD_STRING(elf_architecture)))); + if (r < 0) + return log_warning_errno(r, "Failed to build JSON object: %m"); + + r = json_variant_merge_object(&elf_metadata, json_architecture); + if (r < 0) + return log_warning_errno(r, "Failed to merge JSON objects: %m"); + + if (ret) + fprintf(c.m.f, "ELF object binary architecture: %s\n", elf_architecture); + } +#endif + + /* We always at least have the ELF type, so merge that (and possibly the arch). */ + r = json_variant_merge_object(&elf_metadata, package_metadata); + if (r < 0) + return log_warning_errno(r, "Failed to merge JSON objects: %m"); + + if (ret) { + r = memstream_finalize(&c.m, ret, NULL); + if (r < 0) + return log_warning_errno(r, "Could not parse ELF file, flushing file buffer failed: %m"); + } + + if (ret_package_metadata) + *ret_package_metadata = TAKE_PTR(elf_metadata); + + return 0; +} + +int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) { + _cleanup_close_pair_ int error_pipe[2] = EBADF_PAIR, + return_pipe[2] = EBADF_PAIR, + json_pipe[2] = EBADF_PAIR; + _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL; + _cleanup_free_ char *buf = NULL; + int r; + + assert(fd >= 0); + + r = dlopen_dw(); + if (r < 0) + return r; + + r = dlopen_elf(); + if (r < 0) + return r; + + r = RET_NERRNO(pipe2(error_pipe, O_CLOEXEC|O_NONBLOCK)); + if (r < 0) + return r; + + if (ret) { + r = RET_NERRNO(pipe2(return_pipe, 
O_CLOEXEC|O_NONBLOCK)); + if (r < 0) + return r; + } + + if (ret_package_metadata) { + r = RET_NERRNO(pipe2(json_pipe, O_CLOEXEC|O_NONBLOCK)); + if (r < 0) + return r; + } + + /* Parsing possibly malformed data is crash-happy, so fork. In case we crash, + * the core file will not be lost, and the messages will still be attached to + * the journal. Reading the elf object might be slow, but it still has an upper + * bound since the core files have an upper size limit. It's also not doing any + * system call or interacting with the system in any way, besides reading from + * the file descriptor and writing into these four pipes. */ + r = safe_fork_full("(sd-parse-elf)", + NULL, + (int[]){ fd, error_pipe[1], return_pipe[1], json_pipe[1] }, + 4, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_WAIT|FORK_REOPEN_LOG, + NULL); + if (r < 0) { + if (r == -EPROTO) { /* We should have the errno from the child, but don't clobber original error */ + int e, k; + + k = read(error_pipe[0], &e, sizeof(e)); + if (k < 0 && errno != EAGAIN) /* Pipe is non-blocking, EAGAIN means there's nothing */ + return -errno; + if (k == sizeof(e)) + return e; /* propagate error sent to us from child */ + if (k != 0) + return -EIO; + } + + return r; + } + if (r == 0) { + /* We want to avoid loops, given this can be called from systemd-coredump */ + if (fork_disable_dump) { + r = RET_NERRNO(prctl(PR_SET_DUMPABLE, 0)); + if (r < 0) + goto child_fail; + } + + r = parse_elf(fd, executable, ret ? &buf : NULL, ret_package_metadata ? &package_metadata : NULL); + if (r < 0) + goto child_fail; + + if (buf) { + size_t len = strlen(buf); + + if (len > COREDUMP_PIPE_MAX) { + /* This is iffy. A backtrace can be a few hundred kilobytes, but too much is + * too much. Let's log a warning and ignore the rest. 
*/ + log_warning("Generated backtrace is %zu bytes (more than the limit of %u bytes), backtrace will be truncated.", + len, COREDUMP_PIPE_MAX); + len = COREDUMP_PIPE_MAX; + } + + /* Bump the space for the returned string. + * Failure is ignored, because partial output is still useful. */ + (void) fcntl(return_pipe[1], F_SETPIPE_SZ, len); + + r = loop_write(return_pipe[1], buf, len); + if (r == -EAGAIN) + log_warning("Write failed, backtrace will be truncated."); + else if (r < 0) + goto child_fail; + + return_pipe[1] = safe_close(return_pipe[1]); + } + + if (package_metadata) { + _cleanup_fclose_ FILE *json_out = NULL; + + /* Bump the space for the returned string. We don't know how much space we'll need in + * advance, so we'll just try to write as much as possible and maybe fail later. */ + (void) fcntl(json_pipe[1], F_SETPIPE_SZ, COREDUMP_PIPE_MAX); + + json_out = take_fdopen(&json_pipe[1], "w"); + if (!json_out) { + r = -errno; + goto child_fail; + } + + r = json_variant_dump(package_metadata, JSON_FORMAT_FLUSH, json_out, NULL); + if (r < 0) + log_warning_errno(r, "Failed to write JSON package metadata, ignoring: %m"); + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(error_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + error_pipe[1] = safe_close(error_pipe[1]); + return_pipe[1] = safe_close(return_pipe[1]); + json_pipe[1] = safe_close(json_pipe[1]); + + if (ret) { + _cleanup_fclose_ FILE *in = NULL; + + in = take_fdopen(&return_pipe[0], "r"); + if (!in) + return -errno; + + r = read_full_stream(in, &buf, NULL); + if (r < 0) + return r; + } + + if (ret_package_metadata) { + _cleanup_fclose_ FILE *json_in = NULL; + + json_in = take_fdopen(&json_pipe[0], "r"); + if (!json_in) + return -errno; + + r = json_parse_file(json_in, NULL, 0, &package_metadata, NULL, NULL); + if (r < 0 && r != -ENODATA) /* ENODATA: json was empty, so we got nothing, but that's ok */ + log_warning_errno(r, "Failed to read or parse json metadata, ignoring: %m"); + } + + 
if (ret)
+                *ret = TAKE_PTR(buf);
+        if (ret_package_metadata)
+                *ret_package_metadata = TAKE_PTR(package_metadata);
+
+        return 0;
+}
+
+#endif
diff --git a/src/shared/elf-util.h b/src/shared/elf-util.h
new file mode 100644
index 0000000..b28e64c
--- /dev/null
+++ b/src/shared/elf-util.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+
+#if HAVE_ELFUTILS
+int dlopen_dw(void);
+int dlopen_elf(void);
+
+/* Parse an ELF object in a forked process, so that errors while iterating over
+ * untrusted and potentially malicious data do not propagate to the main caller's process.
+ * If fork_disable_dump, the child process will not dump core if it crashes. */
+int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata);
+#else /* built without elfutils: the stub below only reports EOPNOTSUPP through log_error_errno() */
+static inline int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "elfutils disabled, parsing ELF objects not supported");
+}
+#endif
diff --git a/src/shared/enable-mempool.c b/src/shared/enable-mempool.c
new file mode 100644
index 0000000..fd582c0
--- /dev/null
+++ b/src/shared/enable-mempool.c
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include /* NOTE(review): the header name is missing here - looks like an angle-bracket include was stripped; restore it from upstream */
+
+#include "env-util.h"
+#include "mempool.h"
+#include "process-util.h"
+/* Tells whether memory pools may be used: always false off the main thread, otherwise the cached outcome of checking SYSTEMD_MEMPOOL. */
+bool mempool_enabled(void) {
+        static int cache = -1; /* negative means "not evaluated yet"; afterwards holds 0 or 1 */
+
+        if (!is_main_thread())
+                return false;
+
+        if (cache < 0)
+                cache = getenv_bool("SYSTEMD_MEMPOOL") != 0; /* only a result of exactly 0 disables; any other result, negative included, enables */
+
+        return cache;
+}
diff --git a/src/shared/env-file-label.c b/src/shared/env-file-label.c
new file mode 100644
index 0000000..5917b63
--- /dev/null
+++ b/src/shared/env-file-label.c
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include /* NOTE(review): the header name is missing here - looks like an angle-bracket include was stripped; restore it from upstream */
+
+#include "env-file-label.h"
+#include "env-file.h"
+#include "selinux-util.h"
+/* Runs write_env_file() between mac_selinux_create_file_prepare(fname, S_IFREG) and mac_selinux_create_file_clear(). */
+int write_env_file_label(int dir_fd, const char *fname, char
**headers, char **l) {
+        int r;
+
+        r = mac_selinux_create_file_prepare(fname, S_IFREG);
+        if (r < 0)
+                return r; /* prepare failed: nothing is written and clear is not called */
+
+        r = write_env_file(dir_fd, fname, headers, l);
+
+        mac_selinux_create_file_clear(); /* runs whether or not the write succeeded; r keeps the write result */
+
+        return r;
+}
+/* Same prepare/write/clear sequence as write_env_file_label(), but for the fixed path "/etc/vconsole.conf" via write_vconsole_conf(). */
+int write_vconsole_conf_label(char **l) {
+        int r;
+
+        r = mac_selinux_create_file_prepare("/etc/vconsole.conf", S_IFREG);
+        if (r < 0)
+                return r;
+
+        r = write_vconsole_conf(AT_FDCWD, "/etc/vconsole.conf", l);
+
+        mac_selinux_create_file_clear(); /* runs whether or not the write succeeded; r keeps the write result */
+
+        return r;
+}
diff --git a/src/shared/env-file-label.h b/src/shared/env-file-label.h
new file mode 100644
index 0000000..5ba45e4
--- /dev/null
+++ b/src/shared/env-file-label.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* These functions are split out of fileio.h (and not for example just flags to the functions they wrap) in order to
+ * optimize linking: This way, -lselinux is needed only for the callers of these functions that need selinux, but not
+ * for all */
+
+int write_env_file_label(int dir_fd, const char *fname, char **headers, char **l);
+
+int write_vconsole_conf_label(char **l);
diff --git a/src/shared/ethtool-link-mode.py b/src/shared/ethtool-link-mode.py
new file mode 100644
index 0000000..aac1576
--- /dev/null
+++ b/src/shared/ethtool-link-mode.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+import re
+import shlex
+import subprocess
+import sys
+
+OVERRIDES = {
+    'autoneg' : 'autonegotiation',
+}
+
+mode, cpp, header = sys.argv[1:]
+xml = mode == '--xml'
+
+command = [*shlex.split(cpp), '-include', header, '-']
+out = subprocess.check_output(command, stdin=subprocess.DEVNULL, universal_newlines=True)
+
+lines = iter(out.splitlines())
+for line in lines:
+    if line.startswith('enum ethtool_link_mode_bit_indices {'):
+        break
+
+entries = []
+for line in lines:
+    if line.startswith('}'):
+        break
+    # ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0,
+    m =
re.match(r'^\s*(ETHTOOL_LINK_MODE_((\d*).*)_BIT)\s*=\s*(\d+),', line)
+    if not m:
+        continue
+    enum, name, speed, value = m.groups()
+
+    name = name.lower().replace('_', '-')
+    name = OVERRIDES.get(name, name)
+
+    duplex = name.split('-')[-1].lower()
+    if duplex not in {'half', 'full'}:
+        duplex = ''
+
+    entries += [(enum, name, speed, value, duplex)]
+
+if xml:
+    print(' ')
+
+    entries.sort(key=lambda entry: (int(entry[2]) if entry[2] else 1e20, entry[4], entry[1], entry[3]))
+
+for enum, name, speed, value, duplex in entries:
+    if xml:
+        print(f'''\
+
+ {speed}{duplex}
+ ''')
+    else:
+        enum = f'[{enum}]'
+        print(f' {enum:50} = "{name}",')
+
+if xml:
+    print(' ')
+
+assert len(entries) >= 99
diff --git a/src/shared/ethtool-util.c b/src/shared/ethtool-util.c
new file mode 100644
index 0000000..dce9e00
--- /dev/null
+++ b/src/shared/ethtool-util.c
@@ -0,0 +1,1423 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include /* NOTE(review): this and the next four includes have no header name - looks like angle-bracket includes were stripped; restore from upstream */
+#include
+#include
+#include
+#include
+
+#include "conf-parser.h"
+#include "ethtool-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "log.h"
+#include "memory-util.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "strxcpyx.h"
+/* String names for the Duplex values; consumed by the two macro invocations that follow the table. */
+static const char* const duplex_table[_DUP_MAX] = {
+        [DUP_FULL] = "full",
+        [DUP_HALF] = "half"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(duplex, Duplex);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_duplex, duplex, Duplex, "Failed to parse duplex setting");
+/* Pairs each WAKE_* bit with the name wol_options_to_string_alloc() emits for it. */
+static const struct {
+        uint32_t opt;
+        const char *name;
+} wol_option_map[] = {
+        { WAKE_PHY,         "phy"        },
+        { WAKE_UCAST,       "unicast",   },
+        { WAKE_MCAST,       "multicast", },
+        { WAKE_BCAST,       "broadcast", },
+        { WAKE_ARP,         "arp",       },
+        { WAKE_MAGIC,       "magic",     },
+        { WAKE_MAGICSECURE, "secureon",  },
+};
+/* Formats the bits of opts as a comma-separated, newly allocated string: 0 with *ret = NULL for UINT32_MAX, 1 on success, -ENOMEM on allocation failure. */
+int wol_options_to_string_alloc(uint32_t opts, char **ret) {
+        _cleanup_free_ char *str = NULL;
+
+        assert(ret);
+
+        if (opts == UINT32_MAX) { /* sentinel value: no string is produced */
+                *ret = NULL;
+                return 0;
+        }
+
+        for (size_t i = 0; i < ELEMENTSOF(wol_option_map);
i++)
+                if (opts & wol_option_map[i].opt &&
+                    !strextend_with_separator(&str, ",", wol_option_map[i].name))
+                        return -ENOMEM;
+        /* None of the known bits was set: report the literal "off" instead of an empty string. */
+        if (!str) {
+                str = strdup("off");
+                if (!str)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(str);
+        return 1;
+}
+/* String names for the NetDevPort values; consumed by the two macro invocations that follow the table. */
+static const char* const port_table[] = {
+        [NET_DEV_PORT_TP]     = "tp",
+        [NET_DEV_PORT_AUI]    = "aui",
+        [NET_DEV_PORT_MII]    = "mii",
+        [NET_DEV_PORT_FIBRE]  = "fibre",
+        [NET_DEV_PORT_BNC]    = "bnc",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(port, NetDevPort);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_port, port, NetDevPort, "Failed to parse Port setting");
+/* String names for the ETH_TP_MDI* values; only a to-string helper is generated for this table. */
+static const char* const mdi_table[] = {
+        [ETH_TP_MDI_INVALID]  = "unknown",
+        [ETH_TP_MDI]          = "mdi",
+        [ETH_TP_MDI_X]        = "mdi-x",
+        [ETH_TP_MDI_AUTO]     = "auto",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(mdi, int);
+/* Feature names, indexed by NET_DEV_FEAT_*: ethtool_set_features() matches entries below _NET_DEV_FEAT_SIMPLE_MAX exactly and the remaining ones as prefixes. */
+static const char* const netdev_feature_table[_NET_DEV_FEAT_MAX] = {
+        [NET_DEV_FEAT_SG]                  = "tx-scatter-gather",
+        [NET_DEV_FEAT_IP_CSUM]             = "tx-checksum-ipv4",
+        [NET_DEV_FEAT_HW_CSUM]             = "tx-checksum-ip-generic",
+        [NET_DEV_FEAT_IPV6_CSUM]           = "tx-checksum-ipv6",
+        [NET_DEV_FEAT_HIGHDMA]             = "highdma",
+        [NET_DEV_FEAT_FRAGLIST]            = "tx-scatter-gather-fraglist",
+        [NET_DEV_FEAT_HW_VLAN_CTAG_TX]     = "tx-vlan-hw-insert",
+        [NET_DEV_FEAT_HW_VLAN_CTAG_RX]     = "rx-vlan-hw-parse",
+        [NET_DEV_FEAT_HW_VLAN_CTAG_FILTER] = "rx-vlan-filter",
+        [NET_DEV_FEAT_HW_VLAN_STAG_TX]     = "tx-vlan-stag-hw-insert",
+        [NET_DEV_FEAT_HW_VLAN_STAG_RX]     = "rx-vlan-stag-hw-parse",
+        [NET_DEV_FEAT_HW_VLAN_STAG_FILTER] = "rx-vlan-stag-filter",
+        [NET_DEV_FEAT_VLAN_CHALLENGED]     = "vlan-challenged",
+        [NET_DEV_FEAT_GSO]                 = "tx-generic-segmentation",
+        [NET_DEV_FEAT_LLTX]                = "tx-lockless",
+        [NET_DEV_FEAT_NETNS_LOCAL]         = "netns-local",
+        [NET_DEV_FEAT_GRO]                 = "rx-gro",
+        [NET_DEV_FEAT_GRO_HW]              = "rx-gro-hw",
+        [NET_DEV_FEAT_LRO]                 = "rx-lro",
+        [NET_DEV_FEAT_TSO]                 = "tx-tcp-segmentation",
+        [NET_DEV_FEAT_GSO_ROBUST]          = "tx-gso-robust",
+        [NET_DEV_FEAT_TSO_ECN]             = "tx-tcp-ecn-segmentation",
+        [NET_DEV_FEAT_TSO_MANGLEID]        = "tx-tcp-mangleid-segmentation",
+        [NET_DEV_FEAT_TSO6]                = "tx-tcp6-segmentation",
+        [NET_DEV_FEAT_FSO]                 = "tx-fcoe-segmentation",
+        [NET_DEV_FEAT_GSO_GRE]             = "tx-gre-segmentation",
+        [NET_DEV_FEAT_GSO_GRE_CSUM]        = "tx-gre-csum-segmentation",
+        [NET_DEV_FEAT_GSO_IPXIP4]          = "tx-ipxip4-segmentation",
+        [NET_DEV_FEAT_GSO_IPXIP6]          = "tx-ipxip6-segmentation",
+        [NET_DEV_FEAT_GSO_UDP_TUNNEL]      = "tx-udp_tnl-segmentation",
+        [NET_DEV_FEAT_GSO_UDP_TUNNEL_CSUM] = "tx-udp_tnl-csum-segmentation",
+        [NET_DEV_FEAT_GSO_PARTIAL]         = "tx-gso-partial",
+        [NET_DEV_FEAT_GSO_TUNNEL_REMCSUM]  = "tx-tunnel-remcsum-segmentation",
+        [NET_DEV_FEAT_GSO_SCTP]            = "tx-sctp-segmentation",
+        [NET_DEV_FEAT_GSO_ESP]             = "tx-esp-segmentation",
+        [NET_DEV_FEAT_GSO_UDP_L4]          = "tx-udp-segmentation",
+        [NET_DEV_FEAT_GSO_FRAGLIST]        = "tx-gso-list",
+        [NET_DEV_FEAT_FCOE_CRC]            = "tx-checksum-fcoe-crc",
+        [NET_DEV_FEAT_SCTP_CRC]            = "tx-checksum-sctp",
+        [NET_DEV_FEAT_FCOE_MTU]            = "fcoe-mtu",
+        [NET_DEV_FEAT_NTUPLE]              = "rx-ntuple-filter",
+        [NET_DEV_FEAT_RXHASH]              = "rx-hashing",
+        [NET_DEV_FEAT_RXCSUM]              = "rx-checksum",
+        [NET_DEV_FEAT_NOCACHE_COPY]        = "tx-nocache-copy",
+        [NET_DEV_FEAT_LOOPBACK]            = "loopback",
+        [NET_DEV_FEAT_RXFCS]               = "rx-fcs",
+        [NET_DEV_FEAT_RXALL]               = "rx-all",
+        [NET_DEV_FEAT_HW_L2FW_DOFFLOAD]    = "l2-fwd-offload",
+        [NET_DEV_FEAT_HW_TC]               = "hw-tc-offload",
+        [NET_DEV_FEAT_HW_ESP]              = "esp-hw-offload",
+        [NET_DEV_FEAT_HW_ESP_TX_CSUM]      = "esp-tx-csum-hw-offload",
+        [NET_DEV_FEAT_RX_UDP_TUNNEL_PORT]  = "rx-udp_tunnel-port-offload",
+        [NET_DEV_FEAT_HW_TLS_RECORD]       = "tls-hw-record",
+        [NET_DEV_FEAT_HW_TLS_TX]           = "tls-hw-tx-offload",
+        [NET_DEV_FEAT_HW_TLS_RX]           = "tls-hw-rx-offload",
+        [NET_DEV_FEAT_GRO_FRAGLIST]        = "rx-gro-list",
+        [NET_DEV_FEAT_HW_MACSEC]           = "macsec-hw-offload",
+        [NET_DEV_FEAT_GRO_UDP_FWD]         = "rx-udp-gro-forwarding",
+        [NET_DEV_FEAT_HW_HSR_TAG_INS]      = "hsr-tag-ins-offload",
+        [NET_DEV_FEAT_HW_HSR_TAG_RM]       = "hsr-tag-rm-offload",
+        [NET_DEV_FEAT_HW_HSR_FWD]          = "hsr-fwd-offload",
+        [NET_DEV_FEAT_HW_HSR_DUP]          = "hsr-dup-offload",
+
+        [NET_DEV_FEAT_TXCSUM]              = "tx-checksum-", /* The suffix "-" means any feature beginning with "tx-checksum-" */
+};
+/* Link-mode bit names; the initializers come from the included ethtool-link-mode.h. */
+static const char* const ethtool_link_mode_bit_table[] = {
+#  include "ethtool-link-mode.h"
+};
+/* Make sure the array is large enough to fit all bits */
+assert_cc((ELEMENTSOF(ethtool_link_mode_bit_table)-1) / 32 < N_ADVERTISE);
+
+DEFINE_STRING_TABLE_LOOKUP(ethtool_link_mode_bit, enum ethtool_link_mode_bit_indices);
+/* Makes sure *ethtool_fd holds a control socket: keeps an existing fd (>= 0), otherwise stores a new one from socket_ioctl_fd(). */
+static int ethtool_connect(int *ethtool_fd) {
+        int fd;
+
+        assert(ethtool_fd);
+
+        /* This does nothing if already connected. */
+        if (*ethtool_fd >= 0)
+                return 0;
+
+        fd = socket_ioctl_fd();
+        if (fd < 0)
+                return log_debug_errno(fd, "ethtool: could not create control socket: %m");
+
+        *ethtool_fd = fd;
+        return 0;
+}
+/* Queries ETHTOOL_GDRVINFO for ifname and returns a strdup()ed copy of the driver name in *ret; -ENODATA if the name is empty, -ENOMEM, or -errno from the ioctl. */
+int ethtool_get_driver(int *ethtool_fd, const char *ifname, char **ret) {
+        struct ethtool_drvinfo ecmd = {
+                .cmd = ETHTOOL_GDRVINFO,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &ecmd,
+        };
+        char *d;
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+        assert(ret);
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        if (isempty(ecmd.driver))
+                return -ENODATA;
+
+        d = strdup(ecmd.driver);
+        if (!d)
+                return -ENOMEM;
+
+        *ret = d; /* ownership of the copy passes to the caller */
+        return 0;
+}
+/* Queries ETHTOOL_GSET for ifname and fills whichever of the ret_* outputs are non-NULL; returns 0 or a negative errno. */
+int ethtool_get_link_info(
+                int *ethtool_fd,
+                const char *ifname,
+                int *ret_autonegotiation,
+                uint64_t *ret_speed,
+                Duplex *ret_duplex,
+                NetDevPort *ret_port) {
+
+        struct ethtool_cmd ecmd = {
+                .cmd = ETHTOOL_GSET,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &ecmd,
+        };
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        if (ret_autonegotiation)
+                *ret_autonegotiation = ecmd.autoneg;
+
+        if (ret_speed) {
+                uint32_t speed;
+
+                speed = ethtool_cmd_speed(&ecmd);
+                *ret_speed = speed == (uint32_t) SPEED_UNKNOWN ?
+                        UINT64_MAX : (uint64_t) speed * 1000 * 1000; /* unknown maps to UINT64_MAX; otherwise scaled by 10^6 (presumably Mbit/s to bit/s - confirm) */
+        }
+
+        if (ret_duplex)
+                *ret_duplex = ecmd.duplex;
+
+        if (ret_port)
+                *ret_port = ecmd.port;
+
+        return 0;
+}
+/* Reads the permanent hardware address (ETHTOOL_GPERMADDR) of ifname into *ret; ethtool_fd may be NULL, in which case a function-local fd is used. */
+int ethtool_get_permanent_hw_addr(int *ethtool_fd, const char *ifname, struct hw_addr_data *ret) {
+        _cleanup_close_ int fd = -EBADF;
+        struct {
+                struct ethtool_perm_addr addr;
+                uint8_t space[HW_ADDR_MAX_SIZE]; /* room for the address bytes that follow the header struct */
+        } epaddr = {
+                .addr.cmd = ETHTOOL_GPERMADDR,
+                .addr.size = HW_ADDR_MAX_SIZE,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (caddr_t) &epaddr,
+        };
+        int r;
+
+        assert(ifname);
+        assert(ret);
+
+        if (!ethtool_fd)
+                ethtool_fd = &fd;
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        if (epaddr.addr.size == 0)
+                return -ENODATA;
+
+        if (epaddr.addr.size > HW_ADDR_MAX_SIZE) /* refuse a reported size larger than the buffer we provided */
+                return -EINVAL;
+
+        ret->length = epaddr.addr.size;
+        memcpy(ret->bytes, epaddr.addr.data, epaddr.addr.size);
+        return 0;
+}
+/* UPDATE(): assigns val to dest and sets updated to true when the stored value differed. */
+#define UPDATE(dest, val, updated)                     \
+        do {                                           \
+                typeof(val) _v = (val);                \
+                if (dest != _v)                        \
+                        updated = true;                \
+                dest = _v;                             \
+        } while (false)
+/* UPDATE_WITH_MAX(): like UPDATE(), but a val of 0 or one above max is replaced by max first. */
+#define UPDATE_WITH_MAX(dest, max, val, updated)       \
+        do {                                           \
+                typeof(dest) _v = (val);               \
+                typeof(dest) _max = (max);             \
+                if (_v == 0 || _v > _max)              \
+                        _v = _max;                     \
+                if (dest != _v)                        \
+                        updated = true;                \
+                dest = _v;                             \
+        } while (false)
+/* Reads the current Wake-on-LAN state (ETHTOOL_GWOL), applies wolopts/password and writes it back (ETHTOOL_SWOL) only if something changed. */
+int ethtool_set_wol(
+                int *ethtool_fd,
+                const char *ifname,
+                uint32_t wolopts,
+                const uint8_t password[SOPASS_MAX]) {
+
+        struct ethtool_wolinfo ecmd = {
+                .cmd = ETHTOOL_GWOL,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &ecmd,
+        };
+        bool need_update = false;
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+
+        if (wolopts == UINT32_MAX && !password)
+                /* Nothing requested. Return earlier.
*/
+                return 0;
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        CLEANUP_ERASE(ecmd); /* NOTE(review): presumably wipes ecmd, which may hold the password, when the scope ends - confirm against the macro */
+
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        if (wolopts == UINT32_MAX) {
+                /* When password is specified without valid WoL options specified, then enable
+                 * WAKE_MAGICSECURE flag if supported. */
+                wolopts = ecmd.wolopts;
+                if (password && FLAGS_SET(ecmd.supported, WAKE_MAGICSECURE))
+                        wolopts |= WAKE_MAGICSECURE;
+        }
+        /* Drop, with a debug message, any requested bits the interface does not report as supported. */
+        if ((wolopts & ~ecmd.supported) != 0) {
+                _cleanup_free_ char *str = NULL;
+
+                (void) wol_options_to_string_alloc(wolopts & ~ecmd.supported, &str);
+                log_debug("Network interface %s does not support requested Wake on LAN options \"%s\", ignoring.",
+                          ifname, strna(str));
+
+                wolopts &= ecmd.supported;
+        }
+
+        if (!FLAGS_SET(wolopts, WAKE_MAGICSECURE))
+                /* When WAKE_MAGICSECURE flag is not set, then ignore password. */
+                password = NULL;
+
+        UPDATE(ecmd.wolopts, wolopts, need_update);
+        if (password &&
+            memcmp(ecmd.sopass, password, sizeof(ecmd.sopass)) != 0) {
+                memcpy(ecmd.sopass, password, sizeof(ecmd.sopass));
+                need_update = true;
+        }
+
+        if (!need_update)
+                return 0;
+
+        ecmd.cmd = ETHTOOL_SWOL; /* the struct just read and modified is reused for the set request */
+        return RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &ifr));
+}
+/* Reads the ring sizes (ETHTOOL_GRINGPARAM), applies each field of ring marked .set, and writes them back (ETHTOOL_SRINGPARAM) only on change. */
+int ethtool_set_nic_buffer_size(int *ethtool_fd, const char *ifname, const netdev_ring_param *ring) {
+        struct ethtool_ringparam ecmd = {
+                .cmd = ETHTOOL_GRINGPARAM,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &ecmd,
+        };
+        bool need_update = false;
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+        assert(ring);
+
+        if (!ring->rx.set &&
+            !ring->rx_mini.set &&
+            !ring->rx_jumbo.set &&
+            !ring->tx.set)
+                return 0; /* nothing requested: no socket is opened */
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        if (ring->rx.set)
+                UPDATE_WITH_MAX(ecmd.rx_pending, ecmd.rx_max_pending, ring->rx.value,
need_update); + + if (ring->rx_mini.set) + UPDATE_WITH_MAX(ecmd.rx_mini_pending, ecmd.rx_mini_max_pending, ring->rx_mini.value, need_update); + + if (ring->rx_jumbo.set) + UPDATE_WITH_MAX(ecmd.rx_jumbo_pending, ecmd.rx_jumbo_max_pending, ring->rx_jumbo.value, need_update); + + if (ring->tx.set) + UPDATE_WITH_MAX(ecmd.tx_pending, ecmd.tx_max_pending, ring->tx.value, need_update); + + if (!need_update) + return 0; + + ecmd.cmd = ETHTOOL_SRINGPARAM; + return RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &ifr)); +} + +static int get_stringset(int ethtool_fd, const char *ifname, enum ethtool_stringset stringset_id, struct ethtool_gstrings **ret) { + _cleanup_free_ struct ethtool_gstrings *strings = NULL; + struct { + struct ethtool_sset_info info; + uint32_t space; + } buffer = { + .info.cmd = ETHTOOL_GSSET_INFO, + .info.sset_mask = UINT64_C(1) << stringset_id, + }; + struct ifreq ifr = { + .ifr_data = (void*) &buffer, + }; + uint32_t len; + + assert(ethtool_fd >= 0); + assert(ifname); + assert(ret); + + strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname); + + if (ioctl(ethtool_fd, SIOCETHTOOL, &ifr) < 0) + return -errno; + + if (buffer.info.sset_mask == 0) + return -EOPNOTSUPP; + +#pragma GCC diagnostic push +#if HAVE_ZERO_LENGTH_BOUNDS +# pragma GCC diagnostic ignored "-Wzero-length-bounds" +#endif + len = buffer.info.data[0]; +#pragma GCC diagnostic pop + if (len == 0) + return -EOPNOTSUPP; + + strings = malloc0(offsetof(struct ethtool_gstrings, data) + len * ETH_GSTRING_LEN); + if (!strings) + return -ENOMEM; + + strings->cmd = ETHTOOL_GSTRINGS; + strings->string_set = stringset_id; + strings->len = len; + + ifr.ifr_data = (void*) strings; + + if (ioctl(ethtool_fd, SIOCETHTOOL, &ifr) < 0) + return -errno; + + *ret = TAKE_PTR(strings); + return 0; +} + +static int get_features(int ethtool_fd, const char *ifname, uint32_t n_features, struct ethtool_gfeatures **ret) { + _cleanup_free_ struct ethtool_gfeatures *gfeatures = NULL; + struct ifreq ifr; + + assert(ethtool_fd >= 
0);
+        assert(ifname);
+        assert(ret);
+        assert(n_features > 0);
+
+        gfeatures = malloc0(offsetof(struct ethtool_gfeatures, features) +
+                            DIV_ROUND_UP(n_features, 32U) * sizeof(gfeatures->features[0])); /* one features[] element per 32 feature bits */
+        if (!gfeatures)
+                return -ENOMEM;
+
+        gfeatures->cmd = ETHTOOL_GFEATURES;
+        gfeatures->size = DIV_ROUND_UP(n_features, 32U);
+
+        ifr = (struct ifreq) {
+                .ifr_data = (void*) gfeatures,
+        };
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        *ret = TAKE_PTR(gfeatures);
+        return 0;
+}
+/* Records the requested state for the feature whose name equals 'feature'. Returns 0 if set or if flag < 0, -EOPNOTSUPP if the bit is unavailable or never changeable, -ENODATA if no such name exists. */
+static int set_features_bit(
+                const struct ethtool_gstrings *strings,
+                const struct ethtool_gfeatures *gfeatures,
+                struct ethtool_sfeatures *sfeatures,
+                const char *feature,
+                int flag) {
+
+        assert(strings);
+        assert(gfeatures);
+        assert(sfeatures);
+        assert(feature);
+
+        if (flag < 0) /* negative means: leave this feature untouched */
+                return 0;
+
+        for (uint32_t i = 0; i < strings->len; i++) {
+                uint32_t block, mask;
+
+                if (!strneq((const char*) &strings->data[i * ETH_GSTRING_LEN], feature, ETH_GSTRING_LEN))
+                        continue;
+
+                block = i / 32; /* string index i corresponds to bit (i % 32) of features[i / 32] */
+                mask = UINT32_C(1) << (i % 32);
+
+                if (!FLAGS_SET(gfeatures->features[block].available, mask) ||
+                    FLAGS_SET(gfeatures->features[block].never_changed, mask))
+                        return -EOPNOTSUPP;
+
+                sfeatures->features[block].valid |= mask;
+                SET_FLAG(sfeatures->features[block].requested, mask, flag);
+
+                return 0; /* only the first exact match is used */
+        }
+
+        return -ENODATA;
+}
+/* Prefix variant of set_features_bit(): applies flag to every changeable feature whose name starts with 'feature' and is not already marked valid. */
+static int set_features_multiple_bit(
+                const struct ethtool_gstrings *strings,
+                const struct ethtool_gfeatures *gfeatures,
+                struct ethtool_sfeatures *sfeatures,
+                const char *feature,
+                int flag) {
+
+        bool found = false;
+        int r = -ENODATA; /* result if nothing matches; becomes -EOPNOTSUPP once an unchangeable match is seen */
+
+        assert(strings);
+        assert(gfeatures);
+        assert(sfeatures);
+        assert(feature);
+
+        if (flag < 0)
+                return 0;
+
+        for (uint32_t i = 0; i < strings->len; i++) {
+                uint32_t block, mask;
+
+                if (!startswith((const char*) &strings->data[i * ETH_GSTRING_LEN], feature))
+                        continue;
+
+                block = i / 32;
+                mask = UINT32_C(1) << (i % 32);
+
+                if
(!FLAGS_SET(gfeatures->features[block].available, mask) ||
+                    FLAGS_SET(gfeatures->features[block].never_changed, mask)) {
+                        r = -EOPNOTSUPP;
+                        continue;
+                }
+
+                /* This bit was already set explicitly by set_features_bit(); do not override it. */
+                if (FLAGS_SET(sfeatures->features[block].valid, mask))
+                        continue;
+
+                sfeatures->features[block].valid |= mask;
+                SET_FLAG(sfeatures->features[block].requested, mask, flag);
+
+                found = true;
+        }
+
+        return found ? 0 : r; /* success as soon as one bit was applied, whatever happened to the others */
+}
+/* Applies the requested feature states to ifname; features[] is indexed like netdev_feature_table and entries < 0 are skipped. Features that cannot be set are logged and skipped. */
+int ethtool_set_features(int *ethtool_fd, const char *ifname, const int features[static _NET_DEV_FEAT_MAX]) {
+        _cleanup_free_ struct ethtool_gstrings *strings = NULL;
+        _cleanup_free_ struct ethtool_gfeatures *gfeatures = NULL;
+        _cleanup_free_ struct ethtool_sfeatures *sfeatures = NULL;
+        struct ifreq ifr;
+        bool have = false;
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+        assert(features);
+        /* Return early, without opening a socket, unless at least one feature was requested. */
+        for (size_t i = 0; i < _NET_DEV_FEAT_MAX; i++)
+                if (features[i] >= 0) {
+                        have = true;
+                        break;
+                }
+
+        if (!have)
+                return 0;
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        r = get_stringset(*ethtool_fd, ifname, ETH_SS_FEATURES, &strings);
+        if (r < 0)
+                return log_debug_errno(r, "ethtool: could not get ethtool feature strings: %m");
+
+        r = get_features(*ethtool_fd, ifname, strings->len, &gfeatures);
+        if (r < 0)
+                return log_debug_errno(r, "ethtool: could not get ethtool features for %s: %m", ifname);
+
+        sfeatures = malloc0(offsetof(struct ethtool_sfeatures, features) +
+                            DIV_ROUND_UP(strings->len, 32U) * sizeof(sfeatures->features[0]));
+        if (!sfeatures)
+                return log_oom_debug();
+
+        sfeatures->cmd = ETHTOOL_SFEATURES;
+        sfeatures->size = DIV_ROUND_UP(strings->len, 32U);
+        /* Exact-name features first, so that the prefix pass below leaves their bits alone. */
+        for (size_t i = 0; i < _NET_DEV_FEAT_SIMPLE_MAX; i++) {
+                r = set_features_bit(strings, gfeatures, sfeatures, netdev_feature_table[i], features[i]);
+                if (r < 0)
+                        log_debug_errno(r, "ethtool: could not set feature %s for %s, ignoring: %m", netdev_feature_table[i], ifname);
+        }
+
+        for (size_t i = _NET_DEV_FEAT_SIMPLE_MAX; i <
_NET_DEV_FEAT_MAX; i++) {
+                r = set_features_multiple_bit(strings, gfeatures, sfeatures, netdev_feature_table[i], features[i]);
+                if (r < 0)
+                        log_debug_errno(r, "ethtool: could not set feature %s for %s, ignoring: %m", netdev_feature_table[i], ifname);
+        }
+
+        ifr = (struct ifreq) {
+                .ifr_data = (void*) sfeatures,
+        };
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0) /* errno is passed to the logger, so the message needs %m to show it */
+                return log_debug_errno(errno, "ethtool: could not set ethtool features for %s: %m", ifname);
+
+        return 0;
+}
+/* Reads the link settings of the interface named in ifr through ETHTOOL_GLINKSETTINGS. On success *ret is a newly allocated struct owned by the caller. Returns -EOPNOTSUPP if the handshake does not complete as expected, -ENOMEM, or -errno from an ioctl. */
+static int get_glinksettings(int fd, struct ifreq *ifr, struct ethtool_link_usettings **ret) {
+        struct ecmd {
+                struct ethtool_link_settings req;
+                uint32_t link_mode_data[3 * ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; /* three masks back to back: supported, advertising, lp_advertising */
+        } ecmd = {
+                .req.cmd = ETHTOOL_GLINKSETTINGS,
+        };
+        struct ethtool_link_usettings *u;
+        unsigned offset;
+
+        assert(fd >= 0);
+        assert(ifr);
+        assert(ret);
+
+        /* The interaction user/kernel via the new API requires a small ETHTOOL_GLINKSETTINGS
+           handshake first to agree on the length of the link mode bitmaps. If kernel doesn't
+           agree with user, it returns the bitmap length it is expecting from user as a negative
+           length (and cmd field is 0). When kernel and user agree, kernel returns valid info in
+           all fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS). Based on
+           https://github.com/torvalds/linux/commit/3f1ac7a700d039c61d8d8b99f28d605d489a60cf
+        */
+
+        ifr->ifr_data = (void *) &ecmd;
+
+        if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+                return -errno;
+
+        if (ecmd.req.link_mode_masks_nwords >= 0 || ecmd.req.cmd != ETHTOOL_GLINKSETTINGS)
+                return -EOPNOTSUPP;
+
+        ecmd.req.link_mode_masks_nwords = -ecmd.req.link_mode_masks_nwords; /* second round: repeat the request with the length the kernel asked for */
+
+        ifr->ifr_data = (void *) &ecmd;
+
+        if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+                return -errno;
+
+        if (ecmd.req.link_mode_masks_nwords <= 0 || ecmd.req.cmd != ETHTOOL_GLINKSETTINGS)
+                return -EOPNOTSUPP;
+
+        u = new(struct ethtool_link_usettings, 1);
+        if (!u)
+                return -ENOMEM;
+
+        *u = (struct ethtool_link_usettings) {
+                .base = ecmd.req,
+        };
+        /* Each mask is nwords 32-bit words long, hence the byte count 4 * nwords. */
+        offset = 0;
+        memcpy(u->link_modes.supported, &ecmd.link_mode_data[offset], 4 * ecmd.req.link_mode_masks_nwords);
+
+        offset += ecmd.req.link_mode_masks_nwords;
+        memcpy(u->link_modes.advertising, &ecmd.link_mode_data[offset], 4 * ecmd.req.link_mode_masks_nwords);
+
+        offset += ecmd.req.link_mode_masks_nwords;
+        memcpy(u->link_modes.lp_advertising, &ecmd.link_mode_data[offset], 4 * ecmd.req.link_mode_masks_nwords);
+
+        *ret = u;
+
+        return 0;
+}
+/* Reads the link settings through ETHTOOL_GSET and converts them to the ethtool_link_usettings form, with one 32-bit word per mask. */
+static int get_gset(int fd, struct ifreq *ifr, struct ethtool_link_usettings **ret) {
+        struct ethtool_link_usettings *e;
+        struct ethtool_cmd ecmd = {
+                .cmd = ETHTOOL_GSET,
+        };
+
+        assert(fd >= 0);
+        assert(ifr);
+        assert(ret);
+
+        ifr->ifr_data = (void *) &ecmd;
+
+        if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+                return -errno;
+
+        e = new(struct ethtool_link_usettings, 1);
+        if (!e)
+                return -ENOMEM;
+
+        *e = (struct ethtool_link_usettings) {
+                .base.cmd = ETHTOOL_GSET,
+                .base.link_mode_masks_nwords = 1,
+                .base.speed = ethtool_cmd_speed(&ecmd),
+                .base.duplex = ecmd.duplex,
+                .base.port = ecmd.port,
+                .base.phy_address = ecmd.phy_address,
+                .base.autoneg = ecmd.autoneg,
+                .base.mdio_support = ecmd.mdio_support,
+                .base.eth_tp_mdix = ecmd.eth_tp_mdix,
+                .base.eth_tp_mdix_ctrl = ecmd.eth_tp_mdix_ctrl,
+
+
.link_modes.supported[0] = ecmd.supported,
+                .link_modes.advertising[0] = ecmd.advertising,
+                .link_modes.lp_advertising[0] = ecmd.lp_advertising,
+        };
+
+        *ret = e;
+
+        return 0;
+}
+/* Writes settings obtained through ETHTOOL_GLINKSETTINGS back with ETHTOOL_SLINKSETTINGS; -EINVAL if u did not come from that call or has no mask words. */
+static int set_slinksettings(int fd, struct ifreq *ifr, const struct ethtool_link_usettings *u) {
+        struct {
+                struct ethtool_link_settings req;
+                uint32_t link_mode_data[3 * ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; /* three masks back to back: supported, advertising, lp_advertising */
+        } ecmd = {};
+        unsigned offset;
+
+        assert(fd >= 0);
+        assert(ifr);
+        assert(u);
+
+        if (u->base.cmd != ETHTOOL_GLINKSETTINGS || u->base.link_mode_masks_nwords <= 0)
+                return -EINVAL;
+
+        ecmd.req = u->base;
+        ecmd.req.cmd = ETHTOOL_SLINKSETTINGS;
+        offset = 0; /* each mask occupies nwords 32-bit words, hence the byte count 4 * nwords below */
+        memcpy(&ecmd.link_mode_data[offset], u->link_modes.supported, 4 * ecmd.req.link_mode_masks_nwords);
+
+        offset += ecmd.req.link_mode_masks_nwords;
+        memcpy(&ecmd.link_mode_data[offset], u->link_modes.advertising, 4 * ecmd.req.link_mode_masks_nwords);
+
+        offset += ecmd.req.link_mode_masks_nwords;
+        memcpy(&ecmd.link_mode_data[offset], u->link_modes.lp_advertising, 4 * ecmd.req.link_mode_masks_nwords);
+
+        ifr->ifr_data = (void *) &ecmd;
+
+        return RET_NERRNO(ioctl(fd, SIOCETHTOOL, ifr));
+}
+/* Writes settings obtained through ETHTOOL_GSET back with ETHTOOL_SSET, using only word 0 of each mask; -EINVAL if u did not come from that call. */
+static int set_sset(int fd, struct ifreq *ifr, const struct ethtool_link_usettings *u) {
+        struct ethtool_cmd ecmd = {
+                .cmd = ETHTOOL_SSET,
+        };
+
+        assert(fd >= 0);
+        assert(ifr);
+        assert(u);
+
+        if (u->base.cmd != ETHTOOL_GSET || u->base.link_mode_masks_nwords <= 0)
+                return -EINVAL;
+
+        ecmd.supported = u->link_modes.supported[0];
+        ecmd.advertising = u->link_modes.advertising[0];
+        ecmd.lp_advertising = u->link_modes.lp_advertising[0];
+
+        ethtool_cmd_speed_set(&ecmd, u->base.speed);
+
+        ecmd.duplex = u->base.duplex;
+        ecmd.port = u->base.port;
+        ecmd.phy_address = u->base.phy_address;
+        ecmd.autoneg = u->base.autoneg;
+        ecmd.mdio_support = u->base.mdio_support;
+        ecmd.eth_tp_mdix = u->base.eth_tp_mdix;
+        ecmd.eth_tp_mdix_ctrl = u->base.eth_tp_mdix_ctrl;
+
+        ifr->ifr_data = (void *) &ecmd;
+
+        return
RET_NERRNO(ioctl(fd, SIOCETHTOOL, ifr));
+}
+/* Applies autonegotiation, advertised modes, speed, duplex, port and MDI to ifname; negative/zero/ETH_TP_MDI_INVALID arguments mean "leave as is", and nothing is written unless a value changes. */
+int ethtool_set_glinksettings(
+                int *fd,
+                const char *ifname,
+                int autonegotiation,
+                const uint32_t advertise[static N_ADVERTISE],
+                uint64_t speed,
+                Duplex duplex,
+                NetDevPort port,
+                uint8_t mdi) {
+
+        _cleanup_free_ struct ethtool_link_usettings *u = NULL;
+        struct ifreq ifr = {};
+        bool changed = false;
+        int r;
+
+        assert(fd);
+        assert(ifname);
+        assert(advertise);
+
+        if (autonegotiation < 0 && memeqzero(advertise, sizeof(uint32_t) * N_ADVERTISE) &&
+            speed == 0 && duplex < 0 && port < 0 && mdi == ETH_TP_MDI_INVALID)
+                return 0; /* nothing requested: no socket is opened */
+
+        /* If autonegotiation is disabled, the speed and duplex represent the fixed link mode and are
+         * writable if the driver supports multiple link modes. If it is enabled then they are
+         * read-only. If the link is up they represent the negotiated link mode; if the link is down,
+         * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and @duplex is %DUPLEX_UNKNOWN
+         * or the best enabled duplex mode. */
+
+        if (speed > 0 || duplex >= 0 || port >= 0) {
+                if (autonegotiation == AUTONEG_ENABLE || !memeqzero(advertise, sizeof(uint32_t) * N_ADVERTISE)) {
+                        log_debug("ethtool: autonegotiation is enabled, ignoring speed, duplex, or port settings.");
+                        speed = 0;
+                        duplex = _DUP_INVALID;
+                        port = _NET_DEV_PORT_INVALID;
+                } else {
+                        log_debug("ethtool: setting speed, duplex, or port, disabling autonegotiation.");
+                        autonegotiation = AUTONEG_DISABLE;
+                }
+        }
+
+        r = ethtool_connect(fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        r = get_glinksettings(*fd, &ifr, &u);
+        if (r < 0) { /* fall back to the ETHTOOL_GSET based query */
+                r = get_gset(*fd, &ifr, &u);
+                if (r < 0)
+                        return log_debug_errno(r, "ethtool: Cannot get device settings for %s: %m", ifname);
+        }
+
+        if (speed > 0)
+                UPDATE(u->base.speed, DIV_ROUND_UP(speed, 1000000), changed); /* scaled down by 10^6, rounding up, before being stored */
+
+        if (duplex >= 0)
+                UPDATE(u->base.duplex, duplex, changed);
+
+        if (port >= 0)
+                UPDATE(u->base.port, port, changed);
+
+        if (autonegotiation >= 0)
+                UPDATE(u->base.autoneg, autonegotiation, changed);
+
+        if (!memeqzero(advertise, sizeof(uint32_t) * N_ADVERTISE)) {
+                UPDATE(u->base.autoneg, AUTONEG_ENABLE, changed);
+                /* Changed if the first N_ADVERTISE words differ or any byte beyond them is set; those bytes are then cleared. */
+                changed = changed ||
+                        memcmp(&u->link_modes.advertising, advertise, sizeof(uint32_t) * N_ADVERTISE) != 0 ||
+                        !memeqzero((uint8_t*) &u->link_modes.advertising + sizeof(uint32_t) * N_ADVERTISE,
+                                   ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES - sizeof(uint32_t) * N_ADVERTISE);
+                memcpy(&u->link_modes.advertising, advertise, sizeof(uint32_t) * N_ADVERTISE);
+                memzero((uint8_t*) &u->link_modes.advertising + sizeof(uint32_t) * N_ADVERTISE,
+                        ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES - sizeof(uint32_t) * N_ADVERTISE);
+        }
+
+        if (mdi != ETH_TP_MDI_INVALID) {
+                if (u->base.eth_tp_mdix_ctrl == ETH_TP_MDI_INVALID)
+                        log_debug("ethtool: setting MDI not supported for %s, ignoring.", ifname);
+                else
+                        UPDATE(u->base.eth_tp_mdix_ctrl, mdi, changed);
+        }
+
+        if (!changed)
+                return 0;
+        /* Write back through the same API generation that produced u. */
+        if (u->base.cmd == ETHTOOL_GLINKSETTINGS)
+                r =
set_slinksettings(*fd, &ifr, u);
+        else
+                r = set_sset(*fd, &ifr, u);
+        if (r < 0)
+                return log_debug_errno(r, "ethtool: Cannot set device settings for %s: %m", ifname);
+
+        return r;
+}
+/* Reads the channel counts (ETHTOOL_GCHANNELS), applies each field of channels marked .set, and writes them back (ETHTOOL_SCHANNELS) only on change. */
+int ethtool_set_channels(int *fd, const char *ifname, const netdev_channels *channels) {
+        struct ethtool_channels ecmd = {
+                .cmd = ETHTOOL_GCHANNELS,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &ecmd,
+        };
+        bool need_update = false;
+        int r;
+
+        assert(fd);
+        assert(ifname);
+        assert(channels);
+
+        if (!channels->rx.set &&
+            !channels->tx.set &&
+            !channels->other.set &&
+            !channels->combined.set)
+                return 0; /* nothing requested: no socket is opened */
+
+        r = ethtool_connect(fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+        /* A requested value of 0, or one above the reported maximum, is replaced by that maximum (see UPDATE_WITH_MAX). */
+        if (channels->rx.set)
+                UPDATE_WITH_MAX(ecmd.rx_count, ecmd.max_rx, channels->rx.value, need_update);
+
+        if (channels->tx.set)
+                UPDATE_WITH_MAX(ecmd.tx_count, ecmd.max_tx, channels->tx.value, need_update);
+
+        if (channels->other.set)
+                UPDATE_WITH_MAX(ecmd.other_count, ecmd.max_other, channels->other.value, need_update);
+
+        if (channels->combined.set)
+                UPDATE_WITH_MAX(ecmd.combined_count, ecmd.max_combined, channels->combined.value, need_update);
+
+        if (!need_update)
+                return 0;
+
+        ecmd.cmd = ETHTOOL_SCHANNELS;
+        return RET_NERRNO(ioctl(*fd, SIOCETHTOOL, &ifr));
+}
+/* Reads the pause parameters (ETHTOOL_GPAUSEPARAM), applies each of rx/tx/autoneg that is >= 0, and writes them back (ETHTOOL_SPAUSEPARAM) only on change. */
+int ethtool_set_flow_control(int *fd, const char *ifname, int rx, int tx, int autoneg) {
+        struct ethtool_pauseparam ecmd = {
+                .cmd = ETHTOOL_GPAUSEPARAM,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &ecmd,
+        };
+        bool need_update = false;
+        int r;
+
+        assert(fd);
+        assert(ifname);
+
+        if (rx < 0 && tx < 0 && autoneg < 0)
+                return 0; /* nothing requested: no socket is opened */
+
+        r = ethtool_connect(fd);
+        if (r < 0)
+                return r;
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(*fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        if (rx >= 0)
+                UPDATE(ecmd.rx_pause, (uint32_t) rx, need_update);
+
+        if (tx >= 0)
+                UPDATE(ecmd.tx_pause,
(uint32_t) tx, need_update); + + if (autoneg >= 0) + UPDATE(ecmd.autoneg, (uint32_t) autoneg, need_update); + + if (!need_update) + return 0; + + ecmd.cmd = ETHTOOL_SPAUSEPARAM; + return RET_NERRNO(ioctl(*fd, SIOCETHTOOL, &ifr)); +} + +int ethtool_set_nic_coalesce_settings(int *ethtool_fd, const char *ifname, const netdev_coalesce_param *coalesce) { + struct ethtool_coalesce ecmd = { + .cmd = ETHTOOL_GCOALESCE, + }; + struct ifreq ifr = { + .ifr_data = (void*) &ecmd, + }; + bool need_update = false; + int r; + + assert(ethtool_fd); + assert(ifname); + assert(coalesce); + + if (coalesce->use_adaptive_rx_coalesce < 0 && + coalesce->use_adaptive_tx_coalesce < 0 && + !coalesce->rx_coalesce_usecs.set && + !coalesce->rx_max_coalesced_frames.set && + !coalesce->rx_coalesce_usecs_irq.set && + !coalesce->rx_max_coalesced_frames_irq.set && + !coalesce->tx_coalesce_usecs.set && + !coalesce->tx_max_coalesced_frames.set && + !coalesce->tx_coalesce_usecs_irq.set && + !coalesce->tx_max_coalesced_frames_irq.set && + !coalesce->stats_block_coalesce_usecs.set && + !coalesce->pkt_rate_low.set && + !coalesce->rx_coalesce_usecs_low.set && + !coalesce->rx_max_coalesced_frames_low.set && + !coalesce->tx_coalesce_usecs_low.set && + !coalesce->tx_max_coalesced_frames_low.set && + !coalesce->pkt_rate_high.set && + !coalesce->rx_coalesce_usecs_high.set && + !coalesce->rx_max_coalesced_frames_high.set && + !coalesce->tx_coalesce_usecs_high.set && + !coalesce->tx_max_coalesced_frames_high.set && + !coalesce->rate_sample_interval.set) + return 0; + + r = ethtool_connect(ethtool_fd); + if (r < 0) + return r; + + strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname); + + if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0) + return -errno; + + if (coalesce->use_adaptive_rx_coalesce >= 0) + UPDATE(ecmd.use_adaptive_rx_coalesce, (uint32_t) coalesce->use_adaptive_rx_coalesce, need_update); + + if (coalesce->use_adaptive_tx_coalesce >= 0) + UPDATE(ecmd.use_adaptive_tx_coalesce, (uint32_t) 
coalesce->use_adaptive_tx_coalesce, need_update); + + if (coalesce->rx_coalesce_usecs.set) + UPDATE(ecmd.rx_coalesce_usecs, coalesce->rx_coalesce_usecs.value, need_update); + + if (coalesce->rx_max_coalesced_frames.set) + UPDATE(ecmd.rx_max_coalesced_frames, coalesce->rx_max_coalesced_frames.value, need_update); + + if (coalesce->rx_coalesce_usecs_irq.set) + UPDATE(ecmd.rx_coalesce_usecs_irq, coalesce->rx_coalesce_usecs_irq.value, need_update); + + if (coalesce->rx_max_coalesced_frames_irq.set) + UPDATE(ecmd.rx_max_coalesced_frames_irq, coalesce->rx_max_coalesced_frames_irq.value, need_update); + + if (coalesce->tx_coalesce_usecs.set) + UPDATE(ecmd.tx_coalesce_usecs, coalesce->tx_coalesce_usecs.value, need_update); + + if (coalesce->tx_max_coalesced_frames.set) + UPDATE(ecmd.tx_max_coalesced_frames, coalesce->tx_max_coalesced_frames.value, need_update); + + if (coalesce->tx_coalesce_usecs_irq.set) + UPDATE(ecmd.tx_coalesce_usecs_irq, coalesce->tx_coalesce_usecs_irq.value, need_update); + + if (coalesce->tx_max_coalesced_frames_irq.set) + UPDATE(ecmd.tx_max_coalesced_frames_irq, coalesce->tx_max_coalesced_frames_irq.value, need_update); + + if (coalesce->stats_block_coalesce_usecs.set) + UPDATE(ecmd.stats_block_coalesce_usecs, coalesce->stats_block_coalesce_usecs.value, need_update); + + if (coalesce->pkt_rate_low.set) + UPDATE(ecmd.pkt_rate_low, coalesce->pkt_rate_low.value, need_update); + + if (coalesce->rx_coalesce_usecs_low.set) + UPDATE(ecmd.rx_coalesce_usecs_low, coalesce->rx_coalesce_usecs_low.value, need_update); + + if (coalesce->rx_max_coalesced_frames_low.set) + UPDATE(ecmd.rx_max_coalesced_frames_low, coalesce->rx_max_coalesced_frames_low.value, need_update); + + if (coalesce->tx_coalesce_usecs_low.set) + UPDATE(ecmd.tx_coalesce_usecs_low, coalesce->tx_coalesce_usecs_low.value, need_update); + + if (coalesce->tx_max_coalesced_frames_low.set) + UPDATE(ecmd.tx_max_coalesced_frames_low, coalesce->tx_max_coalesced_frames_low.value, need_update); + + if 
(coalesce->pkt_rate_high.set) + UPDATE(ecmd.pkt_rate_high, coalesce->pkt_rate_high.value, need_update); + + if (coalesce->rx_coalesce_usecs_high.set) + UPDATE(ecmd.rx_coalesce_usecs_high, coalesce->rx_coalesce_usecs_high.value, need_update); + + if (coalesce->rx_max_coalesced_frames_high.set) + UPDATE(ecmd.rx_max_coalesced_frames_high, coalesce->rx_max_coalesced_frames_high.value, need_update); + + if (coalesce->tx_coalesce_usecs_high.set) + UPDATE(ecmd.tx_coalesce_usecs_high, coalesce->tx_coalesce_usecs_high.value, need_update); + + if (coalesce->tx_max_coalesced_frames_high.set) + UPDATE(ecmd.tx_max_coalesced_frames_high, coalesce->tx_max_coalesced_frames_high.value, need_update); + + if (coalesce->rate_sample_interval.set) + UPDATE(ecmd.rate_sample_interval, DIV_ROUND_UP(coalesce->rate_sample_interval.value, USEC_PER_SEC), need_update); + + if (!need_update) + return 0; + + ecmd.cmd = ETHTOOL_SCOALESCE; + return RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &ifr)); +} + +int config_parse_advertise( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t *advertise = ASSERT_PTR(data); + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + /* Empty string resets the value. */ + memzero(advertise, sizeof(uint32_t) * N_ADVERTISE); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + enum ethtool_link_mode_bit_indices mode; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to split advertise modes '%s', ignoring assignment: %m", rvalue); + return 0; + } + if (r == 0) + return 0; + + mode = ethtool_link_mode_bit_from_string(w); + /* We reuse the kernel provided enum which does not contain negative value. 
So, the cast + * below is mandatory. Otherwise, the check below always passes and access an invalid address. */ + if ((int) mode < 0) { + log_syntax(unit, LOG_WARNING, filename, line, mode, + "Failed to parse advertise mode, ignoring: %s", w); + continue; + } + + advertise[mode / 32] |= 1UL << (mode % 32); + } +} + +int config_parse_mdi( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint8_t *mdi = ASSERT_PTR(data); + + assert(filename); + assert(rvalue); + + if (isempty(rvalue)) { + *mdi = ETH_TP_MDI_INVALID; + return 0; + } + + if (STR_IN_SET(rvalue, "mdi", "straight")) { + *mdi = ETH_TP_MDI; + return 0; + } + + if (STR_IN_SET(rvalue, "mdi-x", "mdix", "crossover")) { + *mdi = ETH_TP_MDI_X; + return 0; + } + + if (streq(rvalue, "auto")) { + *mdi = ETH_TP_MDI_AUTO; + return 0; + } + + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse %s= setting, ignoring assignment: %s", lvalue, rvalue); + return 0; +} + +int config_parse_ring_buffer_or_channel( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + u32_opt *dst = ASSERT_PTR(data); + uint32_t k; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + dst->value = 0; + dst->set = false; + return 0; + } + + if (streq(rvalue, "max")) { + dst->value = 0; + dst->set = true; + return 0; + } + + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + if (k < 1) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid %s= value, ignoring: %s", lvalue, rvalue); + return 0; + } + + dst->value = k; + dst->set = true; + return 0; +} 
+ +int config_parse_wol( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t new_opts = 0, *opts = data; + int r; + + assert(filename); + assert(section); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + *opts = UINT32_MAX; /* Do not update WOL option. */ + return 0; + } + + if (streq(rvalue, "off")) { + *opts = 0; /* Disable WOL. */ + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *w = NULL; + bool found = false; + + r = extract_first_word(&p, &w, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to split wake-on-lan modes '%s', ignoring assignment: %m", rvalue); + return 0; + } + if (r == 0) + break; + + for (size_t i = 0; i < ELEMENTSOF(wol_option_map); i++) + if (streq(w, wol_option_map[i].name)) { + new_opts |= wol_option_map[i].opt; + found = true; + break; + } + + if (!found) + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Unknown wake-on-lan mode '%s', ignoring.", w); + } + + if (*opts == UINT32_MAX) + *opts = new_opts; + else + *opts |= new_opts; + + return 0; +} + +int config_parse_coalesce_u32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + u32_opt *dst = data; + uint32_t k; + int r; + + if (isempty(rvalue)) { + dst->value = 0; + dst->set = false; + return 0; + } + + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring: %s", lvalue, rvalue); + return 0; + } + + dst->value = k; + dst->set = true; + return 0; +} + +int config_parse_coalesce_sec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + 
unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + u32_opt *dst = data; + usec_t usec; + int r; + + if (isempty(rvalue)) { + dst->value = 0; + dst->set = false; + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse coalesce setting value, ignoring: %s", rvalue); + return 0; + } + + if (usec > UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Too large %s= value, ignoring: %s", lvalue, rvalue); + return 0; + } + + if (STR_IN_SET(lvalue, "StatisticsBlockCoalesceSec", "CoalescePacketRateSampleIntervalSec") && usec < 1) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid %s= value, ignoring: %s", lvalue, rvalue); + return 0; + } + + dst->value = (uint32_t) usec; + dst->set = true; + + return 0; +} diff --git a/src/shared/ethtool-util.h b/src/shared/ethtool-util.h new file mode 100644 index 0000000..5303cd7 --- /dev/null +++ b/src/shared/ethtool-util.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "conf-parser.h" +#include "ether-addr-util.h" + +#define N_ADVERTISE 4 + +/* we can't use DUPLEX_ prefix, as it + * clashes with */ +typedef enum Duplex { + DUP_HALF = DUPLEX_HALF, + DUP_FULL = DUPLEX_FULL, + _DUP_MAX, + _DUP_INVALID = -EINVAL, +} Duplex; + +typedef enum NetDevFeature { + NET_DEV_FEAT_SG, + NET_DEV_FEAT_IP_CSUM, + NET_DEV_FEAT_HW_CSUM, + NET_DEV_FEAT_IPV6_CSUM, + NET_DEV_FEAT_HIGHDMA, + NET_DEV_FEAT_FRAGLIST, + NET_DEV_FEAT_HW_VLAN_CTAG_TX, + NET_DEV_FEAT_HW_VLAN_CTAG_RX, + NET_DEV_FEAT_HW_VLAN_CTAG_FILTER, + NET_DEV_FEAT_HW_VLAN_STAG_TX, + NET_DEV_FEAT_HW_VLAN_STAG_RX, + NET_DEV_FEAT_HW_VLAN_STAG_FILTER, + NET_DEV_FEAT_VLAN_CHALLENGED, + NET_DEV_FEAT_GSO, + NET_DEV_FEAT_LLTX, + NET_DEV_FEAT_NETNS_LOCAL, + NET_DEV_FEAT_GRO, + NET_DEV_FEAT_GRO_HW, + NET_DEV_FEAT_LRO, + NET_DEV_FEAT_TSO, + 
NET_DEV_FEAT_GSO_ROBUST, + NET_DEV_FEAT_TSO_ECN, + NET_DEV_FEAT_TSO_MANGLEID, + NET_DEV_FEAT_TSO6, + NET_DEV_FEAT_FSO, + NET_DEV_FEAT_GSO_GRE, + NET_DEV_FEAT_GSO_GRE_CSUM, + NET_DEV_FEAT_GSO_IPXIP4, + NET_DEV_FEAT_GSO_IPXIP6, + NET_DEV_FEAT_GSO_UDP_TUNNEL, + NET_DEV_FEAT_GSO_UDP_TUNNEL_CSUM, + NET_DEV_FEAT_GSO_PARTIAL, + NET_DEV_FEAT_GSO_TUNNEL_REMCSUM, + NET_DEV_FEAT_GSO_SCTP, + NET_DEV_FEAT_GSO_ESP, + NET_DEV_FEAT_GSO_UDP_L4, + NET_DEV_FEAT_GSO_FRAGLIST, + NET_DEV_FEAT_FCOE_CRC, + NET_DEV_FEAT_SCTP_CRC, + NET_DEV_FEAT_FCOE_MTU, + NET_DEV_FEAT_NTUPLE, + NET_DEV_FEAT_RXHASH, + NET_DEV_FEAT_RXCSUM, + NET_DEV_FEAT_NOCACHE_COPY, + NET_DEV_FEAT_LOOPBACK, + NET_DEV_FEAT_RXFCS, + NET_DEV_FEAT_RXALL, + NET_DEV_FEAT_HW_L2FW_DOFFLOAD, + NET_DEV_FEAT_HW_TC, + NET_DEV_FEAT_HW_ESP, + NET_DEV_FEAT_HW_ESP_TX_CSUM, + NET_DEV_FEAT_RX_UDP_TUNNEL_PORT, + NET_DEV_FEAT_HW_TLS_RECORD, + NET_DEV_FEAT_HW_TLS_TX, + NET_DEV_FEAT_HW_TLS_RX, + NET_DEV_FEAT_GRO_FRAGLIST, + NET_DEV_FEAT_HW_MACSEC, + NET_DEV_FEAT_GRO_UDP_FWD, + NET_DEV_FEAT_HW_HSR_TAG_INS, + NET_DEV_FEAT_HW_HSR_TAG_RM, + NET_DEV_FEAT_HW_HSR_FWD, + NET_DEV_FEAT_HW_HSR_DUP, + _NET_DEV_FEAT_SIMPLE_MAX, + + NET_DEV_FEAT_TXCSUM = _NET_DEV_FEAT_SIMPLE_MAX, + _NET_DEV_FEAT_MAX, + _NET_DEV_FEAT_INVALID = -EINVAL, +} NetDevFeature; + +typedef enum NetDevPort { + NET_DEV_PORT_TP = PORT_TP, + NET_DEV_PORT_AUI = PORT_AUI, + NET_DEV_PORT_MII = PORT_MII, + NET_DEV_PORT_FIBRE = PORT_FIBRE, + NET_DEV_PORT_BNC = PORT_BNC, + NET_DEV_PORT_DA = PORT_DA, + NET_DEV_PORT_NONE = PORT_NONE, + NET_DEV_PORT_OTHER = PORT_OTHER, + _NET_DEV_PORT_MAX, + _NET_DEV_PORT_INVALID = -EINVAL, +} NetDevPort; + +#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 (SCHAR_MAX) +#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES (4 * ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32) + +/* layout of the struct passed from/to userland */ +struct ethtool_link_usettings { + struct ethtool_link_settings base; + + struct { + uint32_t supported[ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; + 
uint32_t advertising[ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; + uint32_t lp_advertising[ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; + } link_modes; +}; + +typedef struct u32_opt { + uint32_t value; /* a value of 0 indicates the hardware advertised maximum should be used. */ + bool set; +} u32_opt; + +typedef struct netdev_channels { + u32_opt rx; + u32_opt tx; + u32_opt other; + u32_opt combined; +} netdev_channels; + +typedef struct netdev_ring_param { + u32_opt rx; + u32_opt rx_mini; + u32_opt rx_jumbo; + u32_opt tx; +} netdev_ring_param; + +typedef struct netdev_coalesce_param { + u32_opt rx_coalesce_usecs; + u32_opt rx_max_coalesced_frames; + u32_opt rx_coalesce_usecs_irq; + u32_opt rx_max_coalesced_frames_irq; + u32_opt tx_coalesce_usecs; + u32_opt tx_max_coalesced_frames; + u32_opt tx_coalesce_usecs_irq; + u32_opt tx_max_coalesced_frames_irq; + u32_opt stats_block_coalesce_usecs; + int use_adaptive_rx_coalesce; + int use_adaptive_tx_coalesce; + u32_opt pkt_rate_low; + u32_opt rx_coalesce_usecs_low; + u32_opt rx_max_coalesced_frames_low; + u32_opt tx_coalesce_usecs_low; + u32_opt tx_max_coalesced_frames_low; + u32_opt pkt_rate_high; + u32_opt rx_coalesce_usecs_high; + u32_opt rx_max_coalesced_frames_high; + u32_opt tx_coalesce_usecs_high; + u32_opt tx_max_coalesced_frames_high; + u32_opt rate_sample_interval; +} netdev_coalesce_param; + +int ethtool_get_driver(int *ethtool_fd, const char *ifname, char **ret); +int ethtool_get_link_info(int *ethtool_fd, const char *ifname, + int *ret_autonegotiation, uint64_t *ret_speed, + Duplex *ret_duplex, NetDevPort *ret_port); +int ethtool_get_permanent_hw_addr(int *ethtool_fd, const char *ifname, struct hw_addr_data *ret); +int ethtool_set_wol(int *ethtool_fd, const char *ifname, uint32_t wolopts, const uint8_t password[SOPASS_MAX]); +int ethtool_set_nic_buffer_size(int *ethtool_fd, const char *ifname, const netdev_ring_param *ring); +int ethtool_set_features(int *ethtool_fd, const char *ifname, const int features[static 
_NET_DEV_FEAT_MAX]); +int ethtool_set_glinksettings( + int *fd, + const char *ifname, + int autonegotiation, + const uint32_t advertise[static N_ADVERTISE], + uint64_t speed, + Duplex duplex, + NetDevPort port, + uint8_t mdi); +int ethtool_set_channels(int *ethtool_fd, const char *ifname, const netdev_channels *channels); +int ethtool_set_flow_control(int *fd, const char *ifname, int rx, int tx, int autoneg); +int ethtool_set_nic_coalesce_settings(int *ethtool_fd, const char *ifname, const netdev_coalesce_param *coalesce); + +const char *duplex_to_string(Duplex d) _const_; +Duplex duplex_from_string(const char *d) _pure_; + +int wol_options_to_string_alloc(uint32_t opts, char **ret); + +const char *port_to_string(NetDevPort port) _const_; +NetDevPort port_from_string(const char *port) _pure_; + +const char *mdi_to_string(int mdi) _const_; + +const char *ethtool_link_mode_bit_to_string(enum ethtool_link_mode_bit_indices val) _const_; +enum ethtool_link_mode_bit_indices ethtool_link_mode_bit_from_string(const char *str) _pure_; + +CONFIG_PARSER_PROTOTYPE(config_parse_duplex); +CONFIG_PARSER_PROTOTYPE(config_parse_wol); +CONFIG_PARSER_PROTOTYPE(config_parse_port); +CONFIG_PARSER_PROTOTYPE(config_parse_mdi); +CONFIG_PARSER_PROTOTYPE(config_parse_advertise); +CONFIG_PARSER_PROTOTYPE(config_parse_ring_buffer_or_channel); +CONFIG_PARSER_PROTOTYPE(config_parse_coalesce_u32); +CONFIG_PARSER_PROTOTYPE(config_parse_coalesce_sec); +CONFIG_PARSER_PROTOTYPE(config_parse_nic_coalesce_setting); diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c new file mode 100644 index 0000000..c27f3a5 --- /dev/null +++ b/src/shared/exec-util.c @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "conf-files.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-util.h" +#include "escape.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fileio.h" 
+#include "hashmap.h" +#include "macro.h" +#include "missing_syscall.h" +#include "path-util.h" +#include "process-util.h" +#include "serialize.h" +#include "set.h" +#include "signal-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tmpfile-util.h" + +#define EXIT_SKIP_REMAINING 77 + +/* Put this test here for a lack of better place */ +assert_cc(EAGAIN == EWOULDBLOCK); + +static int do_spawn(const char *path, char *argv[], int stdout_fd, pid_t *pid, bool set_systemd_exec_pid) { + pid_t _pid; + int r; + + if (null_or_empty_path(path) > 0) { + log_debug("%s is empty (a mask).", path); + return 0; + } + + r = safe_fork("(direxec)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, &_pid); + if (r < 0) + return r; + if (r == 0) { + char *_argv[2]; + + if (stdout_fd >= 0) { + r = rearrange_stdio(STDIN_FILENO, TAKE_FD(stdout_fd), STDERR_FILENO); + if (r < 0) + _exit(EXIT_FAILURE); + } + + if (set_systemd_exec_pid) { + r = setenv_systemd_exec_pid(false); + if (r < 0) + log_warning_errno(r, "Failed to set $SYSTEMD_EXEC_PID, ignoring: %m"); + } + + if (!argv) { + _argv[0] = (char*) path; + _argv[1] = NULL; + argv = _argv; + } else + argv[0] = (char*) path; + + execv(path, argv); + log_error_errno(errno, "Failed to execute %s: %m", path); + _exit(EXIT_FAILURE); + } + + *pid = _pid; + return 1; +} + +static int do_execute( + char* const* paths, + const char *root, + usec_t timeout, + gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX], + void* const callback_args[_STDOUT_CONSUME_MAX], + int output_fd, + char *argv[], + char *envp[], + ExecDirFlags flags) { + + _cleanup_hashmap_free_free_ Hashmap *pids = NULL; + bool parallel_execution; + int r; + + /* We fork this all off from a child process so that we can somewhat cleanly make + * use of SIGALRM to set a time limit. 
+ * + * We attempt to perform parallel execution if configured by the user, however + * if `callbacks` is nonnull, execution must be serial. + */ + parallel_execution = FLAGS_SET(flags, EXEC_DIR_PARALLEL) && !callbacks; + + if (parallel_execution) { + pids = hashmap_new(NULL); + if (!pids) + return log_oom(); + } + + /* Abort execution of this process after the timeout. We simply rely on SIGALRM as + * default action terminating the process, and turn on alarm(). */ + + if (timeout != USEC_INFINITY) + alarm(DIV_ROUND_UP(timeout, USEC_PER_SEC)); + + STRV_FOREACH(e, envp) + if (putenv(*e) != 0) + return log_error_errno(errno, "Failed to set environment variable: %m"); + + STRV_FOREACH(path, paths) { + _cleanup_free_ char *t = NULL; + _cleanup_close_ int fd = -EBADF; + pid_t pid; + + t = path_join(root, *path); + if (!t) + return log_oom(); + + if (callbacks) { + _cleanup_free_ char *bn = NULL; + + r = path_extract_filename(*path, &bn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", *path); + + fd = open_serialization_fd(bn); + if (fd < 0) + return log_error_errno(fd, "Failed to open serialization file: %m"); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *args = NULL; + if (argv) + args = quote_command_line(strv_skip(argv, 1), SHELL_ESCAPE_EMPTY); + + log_debug("About to execute %s%s%s", t, argv ? " " : "", argv ? 
strnull(args) : ""); + } + + r = do_spawn(t, argv, fd, &pid, FLAGS_SET(flags, EXEC_DIR_SET_SYSTEMD_EXEC_PID)); + if (r <= 0) + continue; + + if (parallel_execution) { + r = hashmap_put(pids, PID_TO_PTR(pid), t); + if (r < 0) + return log_oom(); + t = NULL; + } else { + bool skip_remaining = false; + + r = wait_for_terminate_and_check(t, pid, WAIT_LOG_ABNORMAL); + if (r < 0) + return r; + if (r > 0) { + if (FLAGS_SET(flags, EXEC_DIR_SKIP_REMAINING) && r == EXIT_SKIP_REMAINING) { + log_info("%s succeeded with exit status %i, not executing remaining executables.", *path, r); + skip_remaining = true; + } else if (FLAGS_SET(flags, EXEC_DIR_IGNORE_ERRORS)) + log_warning("%s failed with exit status %i, ignoring.", *path, r); + else { + log_error("%s failed with exit status %i.", *path, r); + return r; + } + } + + if (callbacks) { + if (lseek(fd, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek on serialization fd: %m"); + + r = callbacks[STDOUT_GENERATE](TAKE_FD(fd), callback_args[STDOUT_GENERATE]); + if (r < 0) + return log_error_errno(r, "Failed to process output from %s: %m", *path); + } + + if (skip_remaining) + break; + } + } + + if (callbacks) { + r = callbacks[STDOUT_COLLECT](output_fd, callback_args[STDOUT_COLLECT]); + if (r < 0) + return log_error_errno(r, "Callback two failed: %m"); + } + + while (!hashmap_isempty(pids)) { + _cleanup_free_ char *t = NULL; + pid_t pid; + + pid = PTR_TO_PID(hashmap_first_key(pids)); + assert(pid > 0); + + t = hashmap_remove(pids, PID_TO_PTR(pid)); + assert(t); + + r = wait_for_terminate_and_check(t, pid, WAIT_LOG); + if (r < 0) + return r; + if (!FLAGS_SET(flags, EXEC_DIR_IGNORE_ERRORS) && r > 0) + return r; + } + + return 0; +} + +int execute_strv( + const char *name, + char* const* paths, + const char *root, + usec_t timeout, + gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX], + void* const callback_args[_STDOUT_CONSUME_MAX], + char *argv[], + char *envp[], + ExecDirFlags flags) { + + 
_cleanup_close_ int fd = -EBADF; + pid_t executor_pid; + int r; + + assert(!FLAGS_SET(flags, EXEC_DIR_PARALLEL | EXEC_DIR_SKIP_REMAINING)); + + if (strv_isempty(paths)) + return 0; + + if (callbacks) { + assert(name); + assert(callback_args); + assert(callbacks[STDOUT_GENERATE]); + assert(callbacks[STDOUT_COLLECT]); + assert(callbacks[STDOUT_CONSUME]); + + fd = open_serialization_fd(name); + if (fd < 0) + return log_error_errno(fd, "Failed to open serialization file: %m"); + } + + /* Executes all binaries in the directories serially or in parallel and waits for + * them to finish. Optionally a timeout is applied. If a file with the same name + * exists in more than one directory, the earliest one wins. */ + + r = safe_fork("(sd-exec-strv)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_LOG, &executor_pid); + if (r < 0) + return r; + if (r == 0) { + r = do_execute(paths, root, timeout, callbacks, callback_args, fd, argv, envp, flags); + _exit(r < 0 ? EXIT_FAILURE : r); + } + + r = wait_for_terminate_and_check("(sd-exec-strv)", executor_pid, 0); + if (r < 0) + return r; + if (!FLAGS_SET(flags, EXEC_DIR_IGNORE_ERRORS) && r > 0) + return r; + + if (!callbacks) + return 0; + + if (lseek(fd, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to rewind serialization fd: %m"); + + r = callbacks[STDOUT_CONSUME](TAKE_FD(fd), callback_args[STDOUT_CONSUME]); + if (r < 0) + return log_error_errno(r, "Failed to parse returned data: %m"); + return 0; +} + +int execute_directories( + const char* const* directories, + usec_t timeout, + gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX], + void* const callback_args[_STDOUT_CONSUME_MAX], + char *argv[], + char *envp[], + ExecDirFlags flags) { + + _cleanup_strv_free_ char **paths = NULL; + _cleanup_free_ char *name = NULL; + int r; + + assert(!strv_isempty((char**) directories)); + + r = conf_files_list_strv(&paths, NULL, NULL, CONF_FILES_EXECUTABLE|CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, directories); + if (r < 
0) + return log_error_errno(r, "Failed to enumerate executables: %m"); + + if (strv_isempty(paths)) { + log_debug("No executables found."); + return 0; + } + + if (callbacks) { + r = path_extract_filename(directories[0], &name); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", directories[0]); + } + + return execute_strv(name, paths, NULL, timeout, callbacks, callback_args, argv, envp, flags); +} + +static int gather_environment_generate(int fd, void *arg) { + char ***env = ASSERT_PTR(arg); + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **new = NULL; + int r; + + /* Read a series of VAR=value assignments from fd, use them to update the list of + * variables in env. Also update the exported environment. + * + * fd is always consumed, even on error. + */ + + f = fdopen(fd, "r"); + if (!f) { + safe_close(fd); + return -errno; + } + + r = load_env_file_pairs(f, NULL, &new); + if (r < 0) + return r; + + STRV_FOREACH_PAIR(x, y, new) { + if (!env_name_is_valid(*x)) { + log_warning("Invalid variable assignment \"%s=...\", ignoring.", *x); + continue; + } + + r = strv_env_assign(env, *x, *y); + if (r < 0) + return r; + + if (setenv(*x, *y, true) < 0) + return -errno; + } + + return 0; +} + +static int gather_environment_collect(int fd, void *arg) { + _cleanup_fclose_ FILE *f = NULL; + char ***env = ASSERT_PTR(arg); + int r; + + /* Write out a series of env=cescape(VAR=value) assignments to fd. */ + + f = fdopen(fd, "w"); + if (!f) { + safe_close(fd); + return -errno; + } + + r = serialize_strv(f, "env", *env); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return r; + + return 0; +} + +static int gather_environment_consume(int fd, void *arg) { + _cleanup_fclose_ FILE *f = NULL; + char ***env = ASSERT_PTR(arg); + int r = 0; + + /* Read a series of env=cescape(VAR=value) assignments from fd into env. 
*/ + + f = fdopen(fd, "r"); + if (!f) { + safe_close(fd); + return -errno; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *v; + int k; + + k = read_line(f, LONG_LINE_MAX, &line); + if (k < 0) + return k; + if (k == 0) + break; + + v = startswith(line, "env="); + if (!v) { + log_debug("Serialization line \"%s\" unexpectedly didn't start with \"env=\".", line); + if (r == 0) + r = -EINVAL; + + continue; + } + + k = deserialize_environment(v, env); + if (k < 0) { + log_debug_errno(k, "Invalid serialization line \"%s\": %m", line); + + if (r == 0) + r = k; + } + } + + return r; +} + +int exec_command_flags_from_strv(char **ex_opts, ExecCommandFlags *flags) { + ExecCommandFlags ex_flag, ret_flags = 0; + + assert(flags); + + STRV_FOREACH(opt, ex_opts) { + ex_flag = exec_command_flags_from_string(*opt); + if (ex_flag < 0) + return ex_flag; + ret_flags |= ex_flag; + } + + *flags = ret_flags; + + return 0; +} + +int exec_command_flags_to_strv(ExecCommandFlags flags, char ***ex_opts) { + _cleanup_strv_free_ char **ret_opts = NULL; + ExecCommandFlags it = flags; + const char *str; + int r; + + assert(ex_opts); + + if (flags < 0) + return flags; + + for (unsigned i = 0; it != 0; it &= ~(1 << i), i++) + if (FLAGS_SET(flags, (1 << i))) { + str = exec_command_flags_to_string(1 << i); + if (!str) + return -EINVAL; + + r = strv_extend(&ret_opts, str); + if (r < 0) + return r; + } + + *ex_opts = TAKE_PTR(ret_opts); + + return 0; +} + +const gather_stdout_callback_t gather_environment[] = { + gather_environment_generate, + gather_environment_collect, + gather_environment_consume, +}; + +static const char* const exec_command_strings[] = { + "ignore-failure", /* EXEC_COMMAND_IGNORE_FAILURE */ + "privileged", /* EXEC_COMMAND_FULLY_PRIVILEGED */ + "no-setuid", /* EXEC_COMMAND_NO_SETUID */ + "ambient", /* EXEC_COMMAND_AMBIENT_MAGIC */ + "no-env-expand", /* EXEC_COMMAND_NO_ENV_EXPAND */ +}; + +const char* exec_command_flags_to_string(ExecCommandFlags i) { + for (size_t 
idx = 0; idx < ELEMENTSOF(exec_command_strings); idx++) + if (i == (1 << idx)) + return exec_command_strings[idx]; + + return NULL; +} + +/* Maps a flag name from exec_command_strings[] to its single-bit value (1 << table index); returns _EXEC_COMMAND_FLAGS_INVALID if the lookup fails. */ ExecCommandFlags exec_command_flags_from_string(const char *s) { + ssize_t idx; + + idx = string_table_lookup(exec_command_strings, ELEMENTSOF(exec_command_strings), s); + + if (idx < 0) + return _EXEC_COMMAND_FLAGS_INVALID; + else + return 1 << idx; +} + +/* Executes the program, via execveat() on executable_fd when ENABLE_FEXECVE is set and otherwise (or as fallback) via execve() on the path 'executable'. Returns -EBADF for a negative fd, -EINVAL for an empty path or empty argv, and -errno of the failed exec call otherwise; a value is only ever returned on failure. */ int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) { + /* Refuse invalid fds, regardless if fexecve() use is enabled or not */ + if (executable_fd < 0) + return -EBADF; + + /* Block any attempts on exploiting Linux' liberal argv[] handling, i.e. CVE-2021-4034 and suchlike */ + if (isempty(executable) || strv_isempty(argv)) + return -EINVAL; + +#if ENABLE_FEXECVE + + execveat(executable_fd, "", argv, envp, AT_EMPTY_PATH); + + if (IN_SET(errno, ENOSYS, ENOENT) || ERRNO_IS_PRIVILEGE(errno)) + /* Old kernel or a script or an overzealous seccomp filter? Let's fall back to execve(). + * + * fexecve(3): "If fd refers to a script (i.e., it is an executable text file that names a + * script interpreter with a first line that begins with the characters #!) and the + * close-on-exec flag has been set for fd, then fexecve() fails with the error ENOENT. This + * error occurs because, by the time the script interpreter is executed, fd has already been + * closed because of the close-on-exec flag. Thus, the close-on-exec flag can't be set on fd + * if it refers to a script." + * + * Unfortunately, if we unset close-on-exec, the script will be executed just fine, but (at + * least in case of bash) the script name, $0, will be shown as /dev/fd/nnn, which breaks + * scripts which make use of $0. Thus, let's fall back to execve() in this case. + */ +#endif + /* NOTE: with ENABLE_FEXECVE this call is the body of the brace-less if above, i.e. it only runs for the listed errno values; without ENABLE_FEXECVE it runs unconditionally. */ execve(executable, argv, envp); + return -errno; +} + +int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) 
{ + bool stdout_is_tty, stderr_is_tty; + size_t n, i; + va_list ap; + char **l; + int r; + + assert(path); + + /* Spawns a temporary TTY agent, making sure it goes away when we go away */ + + r = safe_fork_full(name, + NULL, + except, + n_except, + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG|FORK_RLIMIT_NOFILE_SAFE, + ret_pid); + if (r < 0) + return r; + if (r > 0) + return 0; + + /* In the child: */ + + stdout_is_tty = isatty(STDOUT_FILENO); + stderr_is_tty = isatty(STDERR_FILENO); + + if (!stdout_is_tty || !stderr_is_tty) { + int fd; + + /* Detach from stdout/stderr and reopen /dev/tty for them. This is important to ensure that + * when systemctl is started via popen() or a similar call that expects to read EOF we + * actually do generate EOF and not delay this indefinitely by keeping an unused copy of + * stdin around. */ + fd = open("/dev/tty", O_WRONLY); + if (fd < 0) { + if (errno != ENXIO) { + log_error_errno(errno, "Failed to open /dev/tty: %m"); + _exit(EXIT_FAILURE); + } + + /* If we get ENXIO here we have no controlling TTY even though stdout/stderr are + * connected to a TTY. That's a weird setup, but let's handle it gracefully: let's + * skip the forking of the agents, given the TTY setup is not in order. 
*/ + } else { + if (!stdout_is_tty && dup2(fd, STDOUT_FILENO) < 0) { + log_error_errno(errno, "Failed to dup2 /dev/tty: %m"); + _exit(EXIT_FAILURE); + } + + if (!stderr_is_tty && dup2(fd, STDERR_FILENO) < 0) { + log_error_errno(errno, "Failed to dup2 /dev/tty: %m"); + _exit(EXIT_FAILURE); + } + + fd = safe_close_above_stdio(fd); + } + } + + /* Count arguments */ + va_start(ap, path); + for (n = 0; va_arg(ap, char*); n++) + ; + va_end(ap); + + /* Allocate strv */ + l = newa(char*, n + 1); + + /* Fill in arguments */ + va_start(ap, path); + for (i = 0; i <= n; i++) + l[i] = va_arg(ap, char*); + va_end(ap); + + execv(path, l); + _exit(EXIT_FAILURE); +} diff --git a/src/shared/exec-util.h b/src/shared/exec-util.h new file mode 100644 index 0000000..b99336e --- /dev/null +++ b/src/shared/exec-util.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "time-util.h" + +typedef int (*gather_stdout_callback_t) (int fd, void *arg); + +enum { + STDOUT_GENERATE, /* from generators to helper process */ + STDOUT_COLLECT, /* from helper process to main process */ + STDOUT_CONSUME, /* process data in main process */ + _STDOUT_CONSUME_MAX, +}; + +typedef enum { + EXEC_DIR_NONE = 0, /* No execdir flags */ + EXEC_DIR_PARALLEL = 1 << 0, /* Execute scripts in parallel, if possible */ + EXEC_DIR_IGNORE_ERRORS = 1 << 1, /* Ignore non-zero exit status of scripts */ + EXEC_DIR_SET_SYSTEMD_EXEC_PID = 1 << 2, /* Set $SYSTEMD_EXEC_PID environment variable */ + EXEC_DIR_SKIP_REMAINING = 1 << 3, /* Ignore remaining executions when one exit with 77. 
*/ +} ExecDirFlags; + +typedef enum ExecCommandFlags { + EXEC_COMMAND_IGNORE_FAILURE = 1 << 0, + EXEC_COMMAND_FULLY_PRIVILEGED = 1 << 1, + EXEC_COMMAND_NO_SETUID = 1 << 2, + EXEC_COMMAND_AMBIENT_MAGIC = 1 << 3, + EXEC_COMMAND_NO_ENV_EXPAND = 1 << 4, + _EXEC_COMMAND_FLAGS_INVALID = -EINVAL, +} ExecCommandFlags; + +int execute_strv( + const char *name, + char* const* paths, + const char *root, + usec_t timeout, + gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX], + void* const callback_args[_STDOUT_CONSUME_MAX], + char *argv[], + char *envp[], + ExecDirFlags flags); + +int execute_directories( + const char* const* directories, + usec_t timeout, + gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX], + void* const callback_args[_STDOUT_CONSUME_MAX], + char *argv[], + char *envp[], + ExecDirFlags flags); + +int exec_command_flags_from_strv(char **ex_opts, ExecCommandFlags *flags); +int exec_command_flags_to_strv(ExecCommandFlags flags, char ***ex_opts); + +extern const gather_stdout_callback_t gather_environment[_STDOUT_CONSUME_MAX]; + +const char* exec_command_flags_to_string(ExecCommandFlags i); +ExecCommandFlags exec_command_flags_from_string(const char *s); + +int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]); + +int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) 
_sentinel_; diff --git a/src/shared/exit-status.c b/src/shared/exit-status.c new file mode 100644 index 0000000..0ac688b --- /dev/null +++ b/src/shared/exit-status.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "exit-status.h" +#include "macro.h" +#include "parse-util.h" +#include "set.h" +#include "string-util.h" + +const ExitStatusMapping exit_status_mappings[256] = { + /* Exit status ranges: + * + * 0…1 │ ISO C, EXIT_SUCCESS + EXIT_FAILURE + * 2…7 │ LSB exit codes for init scripts + * 8…63 │ (Currently unmapped) + * 64…78 │ BSD defined exit codes + * 79…199 │ (Currently unmapped) + * 200…244 │ systemd's private error codes (might be extended to 254 in future development) + * 245…254 │ (Currently unmapped, but see above) + * + * 255 │ EXIT_EXCEPTION (We use this to propagate exit-by-signal events. It's frequently used by others apps (like bash) + * │ to indicate exit reason that cannot really be expressed in a single exit status value — such as a propagated + * │ signal or such, and we follow that logic here.) 
+ */ + + [EXIT_SUCCESS] = { "SUCCESS", EXIT_STATUS_LIBC }, + [EXIT_FAILURE] = { "FAILURE", EXIT_STATUS_LIBC }, + + [EXIT_CHDIR] = { "CHDIR", EXIT_STATUS_SYSTEMD }, + [EXIT_NICE] = { "NICE", EXIT_STATUS_SYSTEMD }, + [EXIT_FDS] = { "FDS", EXIT_STATUS_SYSTEMD }, + [EXIT_EXEC] = { "EXEC", EXIT_STATUS_SYSTEMD }, + [EXIT_MEMORY] = { "MEMORY", EXIT_STATUS_SYSTEMD }, + [EXIT_LIMITS] = { "LIMITS", EXIT_STATUS_SYSTEMD }, + [EXIT_OOM_ADJUST] = { "OOM_ADJUST", EXIT_STATUS_SYSTEMD }, + [EXIT_SIGNAL_MASK] = { "SIGNAL_MASK", EXIT_STATUS_SYSTEMD }, + [EXIT_STDIN] = { "STDIN", EXIT_STATUS_SYSTEMD }, + [EXIT_STDOUT] = { "STDOUT", EXIT_STATUS_SYSTEMD }, + [EXIT_CHROOT] = { "CHROOT", EXIT_STATUS_SYSTEMD }, + [EXIT_IOPRIO] = { "IOPRIO", EXIT_STATUS_SYSTEMD }, + [EXIT_TIMERSLACK] = { "TIMERSLACK", EXIT_STATUS_SYSTEMD }, + [EXIT_SECUREBITS] = { "SECUREBITS", EXIT_STATUS_SYSTEMD }, + [EXIT_SETSCHEDULER] = { "SETSCHEDULER", EXIT_STATUS_SYSTEMD }, + [EXIT_CPUAFFINITY] = { "CPUAFFINITY", EXIT_STATUS_SYSTEMD }, + [EXIT_GROUP] = { "GROUP", EXIT_STATUS_SYSTEMD }, + [EXIT_USER] = { "USER", EXIT_STATUS_SYSTEMD }, + [EXIT_CAPABILITIES] = { "CAPABILITIES", EXIT_STATUS_SYSTEMD }, + [EXIT_CGROUP] = { "CGROUP", EXIT_STATUS_SYSTEMD }, + [EXIT_SETSID] = { "SETSID", EXIT_STATUS_SYSTEMD }, + [EXIT_CONFIRM] = { "CONFIRM", EXIT_STATUS_SYSTEMD }, + [EXIT_STDERR] = { "STDERR", EXIT_STATUS_SYSTEMD }, + [EXIT_PAM] = { "PAM", EXIT_STATUS_SYSTEMD }, + [EXIT_NETWORK] = { "NETWORK", EXIT_STATUS_SYSTEMD }, + [EXIT_NAMESPACE] = { "NAMESPACE", EXIT_STATUS_SYSTEMD }, + [EXIT_NO_NEW_PRIVILEGES] = { "NO_NEW_PRIVILEGES", EXIT_STATUS_SYSTEMD }, + [EXIT_SECCOMP] = { "SECCOMP", EXIT_STATUS_SYSTEMD }, + [EXIT_SELINUX_CONTEXT] = { "SELINUX_CONTEXT", EXIT_STATUS_SYSTEMD }, + [EXIT_PERSONALITY] = { "PERSONALITY", EXIT_STATUS_SYSTEMD }, + [EXIT_APPARMOR_PROFILE] = { "APPARMOR", EXIT_STATUS_SYSTEMD }, + [EXIT_ADDRESS_FAMILIES] = { "ADDRESS_FAMILIES", EXIT_STATUS_SYSTEMD }, + [EXIT_RUNTIME_DIRECTORY] = { "RUNTIME_DIRECTORY", 
EXIT_STATUS_SYSTEMD }, + [EXIT_CHOWN] = { "CHOWN", EXIT_STATUS_SYSTEMD }, + [EXIT_SMACK_PROCESS_LABEL] = { "SMACK_PROCESS_LABEL", EXIT_STATUS_SYSTEMD }, + [EXIT_KEYRING] = { "KEYRING", EXIT_STATUS_SYSTEMD }, + [EXIT_STATE_DIRECTORY] = { "STATE_DIRECTORY", EXIT_STATUS_SYSTEMD }, + [EXIT_CACHE_DIRECTORY] = { "CACHE_DIRECTORY", EXIT_STATUS_SYSTEMD }, + [EXIT_LOGS_DIRECTORY] = { "LOGS_DIRECTORY", EXIT_STATUS_SYSTEMD }, + [EXIT_CONFIGURATION_DIRECTORY] = { "CONFIGURATION_DIRECTORY", EXIT_STATUS_SYSTEMD }, + [EXIT_NUMA_POLICY] = { "NUMA_POLICY", EXIT_STATUS_SYSTEMD }, + [EXIT_CREDENTIALS] = { "CREDENTIALS", EXIT_STATUS_SYSTEMD }, + [EXIT_BPF] = { "BPF", EXIT_STATUS_SYSTEMD }, + [EXIT_KSM] = { "KSM", EXIT_STATUS_SYSTEMD }, + + [EXIT_EXCEPTION] = { "EXCEPTION", EXIT_STATUS_SYSTEMD }, + + [EXIT_INVALIDARGUMENT] = { "INVALIDARGUMENT", EXIT_STATUS_LSB }, + [EXIT_NOTIMPLEMENTED] = { "NOTIMPLEMENTED", EXIT_STATUS_LSB }, + [EXIT_NOPERMISSION] = { "NOPERMISSION", EXIT_STATUS_LSB }, + [EXIT_NOTINSTALLED] = { "NOTINSTALLED", EXIT_STATUS_LSB }, + [EXIT_NOTCONFIGURED] = { "NOTCONFIGURED", EXIT_STATUS_LSB }, + [EXIT_NOTRUNNING] = { "NOTRUNNING", EXIT_STATUS_LSB }, + + [EX_USAGE] = { "USAGE", EXIT_STATUS_BSD }, + [EX_DATAERR] = { "DATAERR", EXIT_STATUS_BSD }, + [EX_NOINPUT] = { "NOINPUT", EXIT_STATUS_BSD }, + [EX_NOUSER] = { "NOUSER", EXIT_STATUS_BSD }, + [EX_NOHOST] = { "NOHOST", EXIT_STATUS_BSD }, + [EX_UNAVAILABLE] = { "UNAVAILABLE", EXIT_STATUS_BSD }, + [EX_SOFTWARE] = { "SOFTWARE", EXIT_STATUS_BSD }, + [EX_OSERR] = { "OSERR", EXIT_STATUS_BSD }, + [EX_OSFILE] = { "OSFILE", EXIT_STATUS_BSD }, + [EX_CANTCREAT] = { "CANTCREAT", EXIT_STATUS_BSD }, + [EX_IOERR] = { "IOERR", EXIT_STATUS_BSD }, + [EX_TEMPFAIL] = { "TEMPFAIL", EXIT_STATUS_BSD }, + [EX_PROTOCOL] = { "PROTOCOL", EXIT_STATUS_BSD }, + [EX_NOPERM] = { "NOPERM", EXIT_STATUS_BSD }, + [EX_CONFIG] = { "CONFIG", EXIT_STATUS_BSD }, +}; + +const char* exit_status_to_string(int code, ExitStatusClass class) { + if (code < 0 || (size_t) 
code >= ELEMENTSOF(exit_status_mappings)) + return NULL; + return class & exit_status_mappings[code].class ? exit_status_mappings[code].name : NULL; +} + +const char* exit_status_class(int code) { + if (code < 0 || (size_t) code >= ELEMENTSOF(exit_status_mappings)) + return NULL; + + switch (exit_status_mappings[code].class) { + case EXIT_STATUS_LIBC: + return "libc"; + case EXIT_STATUS_SYSTEMD: + return "systemd"; + case EXIT_STATUS_LSB: + return "LSB"; + case EXIT_STATUS_BSD: + return "BSD"; + default: return NULL; + } +} + +int exit_status_from_string(const char *s) { + uint8_t val; + int r; + + for (size_t i = 0; i < ELEMENTSOF(exit_status_mappings); i++) + if (streq_ptr(s, exit_status_mappings[i].name)) + return i; + + r = safe_atou8(s, &val); + if (r < 0) + return r; + + return val; +} + +bool is_clean_exit(int code, int status, ExitClean clean, const ExitStatusSet *success_status) { + if (code == CLD_EXITED) + return status == 0 || + (success_status && + bitmap_isset(&success_status->status, status)); + + /* If a daemon does not implement handlers for some of the signals, we do not consider this an + * unclean shutdown */ + if (code == CLD_KILLED) + return (clean == EXIT_CLEAN_DAEMON && IN_SET(status, SIGHUP, SIGINT, SIGTERM, SIGPIPE)) || + (success_status && + bitmap_isset(&success_status->signal, status)); + + return false; +} + +void exit_status_set_free(ExitStatusSet *x) { + assert(x); + + bitmap_clear(&x->status); + bitmap_clear(&x->signal); +} + +bool exit_status_set_is_empty(const ExitStatusSet *x) { + if (!x) + return true; + + return bitmap_isclear(&x->status) && bitmap_isclear(&x->signal); +} + +bool exit_status_set_test(const ExitStatusSet *x, int code, int status) { + if (code == CLD_EXITED && bitmap_isset(&x->status, status)) + return true; + + if (IN_SET(code, CLD_KILLED, CLD_DUMPED) && bitmap_isset(&x->signal, status)) + return true; + + return false; +} diff --git a/src/shared/exit-status.h b/src/shared/exit-status.h new file mode 100644 
index 0000000..c22cba0 --- /dev/null +++ b/src/shared/exit-status.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bitmap.h" +#include "hashmap.h" +#include "macro.h" + +/* This defines pretty names for the LSB 'start' verb exit codes. Note that they shouldn't be confused with + * the LSB 'status' verb exit codes which are defined very differently. For details see: + * + * https://refspecs.linuxbase.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html + */ + +enum { + /* EXIT_SUCCESS defined by libc */ + /* EXIT_FAILURE defined by libc */ + EXIT_INVALIDARGUMENT = 2, + EXIT_NOTIMPLEMENTED = 3, + EXIT_NOPERMISSION = 4, + EXIT_NOTINSTALLED = 5, + EXIT_NOTCONFIGURED = 6, + EXIT_NOTRUNNING = 7, + + /* BSD's sysexits.h defines a couple EX_xyz exit codes in the range 64 … 78 */ + + /* The LSB suggests that error codes >= 200 are "reserved". We use them here under the assumption + * that they hence are unused by init scripts. */ + EXIT_CHDIR = 200, + EXIT_NICE, + EXIT_FDS, + EXIT_EXEC, + EXIT_MEMORY, + EXIT_LIMITS, + EXIT_OOM_ADJUST, + EXIT_SIGNAL_MASK, + EXIT_STDIN, + EXIT_STDOUT, + EXIT_CHROOT, /* 210 */ + EXIT_IOPRIO, + EXIT_TIMERSLACK, + EXIT_SECUREBITS, + EXIT_SETSCHEDULER, + EXIT_CPUAFFINITY, + EXIT_GROUP, + EXIT_USER, + EXIT_CAPABILITIES, + EXIT_CGROUP, + EXIT_SETSID, /* 220 */ + EXIT_CONFIRM, + EXIT_STDERR, + _EXIT_RESERVED, /* used to be tcpwrap, don't reuse! 
*/ + EXIT_PAM, + EXIT_NETWORK, + EXIT_NAMESPACE, + EXIT_NO_NEW_PRIVILEGES, + EXIT_SECCOMP, + EXIT_SELINUX_CONTEXT, + EXIT_PERSONALITY, /* 230 */ + EXIT_APPARMOR_PROFILE, + EXIT_ADDRESS_FAMILIES, + EXIT_RUNTIME_DIRECTORY, + _EXIT_RESERVED2, /* used to be used by kdbus, don't reuse */ + EXIT_CHOWN, + EXIT_SMACK_PROCESS_LABEL, + EXIT_KEYRING, + EXIT_STATE_DIRECTORY, + EXIT_CACHE_DIRECTORY, + EXIT_LOGS_DIRECTORY, /* 240 */ + EXIT_CONFIGURATION_DIRECTORY, + EXIT_NUMA_POLICY, + EXIT_CREDENTIALS, + EXIT_BPF, + EXIT_KSM, + + EXIT_EXCEPTION = 255, /* Whenever we want to propagate an abnormal/signal exit, in line with bash */ +}; + +typedef enum ExitStatusClass { + EXIT_STATUS_LIBC = 1 << 0, /* libc EXIT_SUCCESS/EXIT_FAILURE */ + EXIT_STATUS_SYSTEMD = 1 << 1, /* systemd's own exit codes */ + EXIT_STATUS_LSB = 1 << 2, /* LSB exit codes */ + EXIT_STATUS_BSD = 1 << 3, /* BSD (EX_xyz) exit codes */ + EXIT_STATUS_FULL = EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD | EXIT_STATUS_LSB | EXIT_STATUS_BSD, /* union of all classes above */ +} ExitStatusClass; + +typedef struct ExitStatusSet { + Bitmap status; /* exit statuses; tested for CLD_EXITED by exit_status_set_test() */ + Bitmap signal; /* signal numbers; tested for CLD_KILLED/CLD_DUMPED by exit_status_set_test() */ +} ExitStatusSet; + +const char* exit_status_to_string(int code, ExitStatusClass class) _const_; +const char* exit_status_class(int code) _const_; +int exit_status_from_string(const char *s) _pure_; + +typedef struct ExitStatusMapping { + const char *name; + ExitStatusClass class; +} ExitStatusMapping; + +extern const ExitStatusMapping exit_status_mappings[256]; + +typedef enum ExitClean { + EXIT_CLEAN_DAEMON, /* is_clean_exit() additionally accepts termination by SIGHUP, SIGINT, SIGTERM or SIGPIPE */ + EXIT_CLEAN_COMMAND, +} ExitClean; + +bool is_clean_exit(int code, int status, ExitClean clean, const ExitStatusSet *success_status); + +void exit_status_set_free(ExitStatusSet *x); +bool exit_status_set_is_empty(const ExitStatusSet *x); +bool exit_status_set_test(const ExitStatusSet *x, int code, int status); diff --git a/src/shared/extension-util.c b/src/shared/extension-util.c new file mode 100644 index 0000000..d8b16b9 --- /dev/null +++ b/src/shared/extension-util.c @@ -0,0 +1,166 @@ +/*
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "architecture.h" +#include "chase.h" +#include "env-util.h" +#include "extension-util.h" +#include "log.h" +#include "os-util.h" +#include "strv.h" + +int extension_release_validate( + const char *name, + const char *host_os_release_id, + const char *host_os_release_version_id, + const char *host_os_extension_release_level, + const char *host_extension_scope, + char **extension_release, + ImageClass image_class) { + + const char *extension_release_id = NULL, *extension_release_level = NULL, *extension_architecture = NULL; + const char *extension_level = image_class == IMAGE_CONFEXT ? "CONFEXT_LEVEL" : "SYSEXT_LEVEL"; + const char *extension_scope = image_class == IMAGE_CONFEXT ? "CONFEXT_SCOPE" : "SYSEXT_SCOPE"; + + assert(name); + assert(!isempty(host_os_release_id)); + + /* Now that we can look into the extension/confext image, let's see if the OS version is compatible */ + if (strv_isempty(extension_release)) { + log_debug("Extension '%s' carries no release data, ignoring.", name); + return 0; + } + + if (host_extension_scope) { + _cleanup_strv_free_ char **scope_list = NULL; + const char *scope; + bool valid; + + scope = strv_env_pairs_get(extension_release, extension_scope); + if (scope) { + scope_list = strv_split(scope, WHITESPACE); + if (!scope_list) + return -ENOMEM; + } + + /* By default extension are good for attachment in portable service and on the system */ + valid = strv_contains( + scope_list ?: STRV_MAKE("system", "portable"), + host_extension_scope); + if (!valid) { + log_debug("Extension '%s' is not suitable for scope %s, ignoring.", name, host_extension_scope); + return 0; + } + } + + /* When the architecture field is present and not '_any' it must match the host - for now just look at uname but in + * the future we could check if the kernel also supports 32 bit or binfmt has a translator set up for the architecture */ + extension_architecture = 
strv_env_pairs_get(extension_release, "ARCHITECTURE"); + if (!isempty(extension_architecture) && !streq(extension_architecture, "_any") && + !streq(architecture_to_string(uname_architecture()), extension_architecture)) { + log_debug("Extension '%s' is for architecture '%s', but deployed on top of '%s'.", + name, extension_architecture, architecture_to_string(uname_architecture())); + return 0; + } + + extension_release_id = strv_env_pairs_get(extension_release, "ID"); + if (isempty(extension_release_id)) { + log_debug("Extension '%s' does not contain ID in release file but requested to match '%s' or be '_any'", + name, host_os_release_id); + return 0; + } + + /* A sysext(or confext) with no host OS dependency (static binaries or scripts) can match + * '_any' host OS, and VERSION_ID or SYSEXT_LEVEL(or CONFEXT_LEVEL) are not required anywhere */ + if (streq(extension_release_id, "_any")) { + log_debug("Extension '%s' matches '_any' OS.", name); + return 1; + } + + if (!streq(host_os_release_id, extension_release_id)) { + log_debug("Extension '%s' is for OS '%s', but deployed on top of '%s'.", + name, extension_release_id, host_os_release_id); + return 0; + } + + /* Rolling releases do not typically set VERSION_ID (eg: ArchLinux) */ + if (isempty(host_os_release_version_id) && isempty(host_os_extension_release_level)) { + log_debug("No version info on the host (rolling release?), but ID in %s matched.", name); + return 1; + } + + /* If the extension has a sysext API level declared, then it must match the host API + * level. 
Otherwise, compare OS version as a whole */ + extension_release_level = strv_env_pairs_get(extension_release, extension_level); + if (!isempty(host_os_extension_release_level) && !isempty(extension_release_level)) { + if (!streq_ptr(host_os_extension_release_level, extension_release_level)) { + log_debug("Extension '%s' is for API level '%s', but running on API level '%s'", + name, strna(extension_release_level), strna(host_os_extension_release_level)); + return 0; + } + } else if (!isempty(host_os_release_version_id)) { + const char *extension_release_version_id; + + extension_release_version_id = strv_env_pairs_get(extension_release, "VERSION_ID"); + if (isempty(extension_release_version_id)) { + log_debug("Extension '%s' does not contain VERSION_ID in release file but requested to match '%s'", + name, strna(host_os_release_version_id)); + return 0; + } + + if (!streq_ptr(host_os_release_version_id, extension_release_version_id)) { + log_debug("Extension '%s' is for OS '%s', but deployed on top of '%s'.", + name, strna(extension_release_version_id), strna(host_os_release_version_id)); + return 0; + } + } else if (isempty(host_os_release_version_id) && isempty(host_os_extension_release_level)) { + /* Rolling releases do not typically set VERSION_ID (eg: ArchLinux) */ + log_debug("No version info on the host (rolling release?), but ID in %s matched.", name); + return 1; + } + + log_debug("Version info of extension '%s' matches host.", name); + return 1; +} + +int parse_env_extension_hierarchies(char ***ret_hierarchies, const char *hierarchy_env) { + _cleanup_free_ char **l = NULL; + int r; + + assert(hierarchy_env); + r = getenv_path_list(hierarchy_env, &l); + if (r == -ENXIO) { + if (streq(hierarchy_env, "SYSTEMD_CONFEXT_HIERARCHIES")) + /* Default for confext when unset */ + l = strv_new("/etc"); + else if (streq(hierarchy_env, "SYSTEMD_SYSEXT_HIERARCHIES")) + /* Default for sysext when unset */ + l = strv_new("/usr", "/opt"); + else if (streq(hierarchy_env, 
"SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES")) + /* Combined sysext and confext directories */ + l = strv_new("/usr", "/opt", "/etc"); + else + return -ENXIO; + } else if (r < 0) + return r; + + *ret_hierarchies = TAKE_PTR(l); + return 0; +} + +int extension_has_forbidden_content(const char *root) { + int r; + + /* Insist that extension images do not overwrite the underlying OS release file (it's fine if + * they place one in /etc/os-release, i.e. where things don't matter, as they aren't + * merged.) */ + r = chase("/usr/lib/os-release", root, CHASE_PREFIX_ROOT, NULL, NULL); + if (r > 0) { + log_debug("Extension contains '/usr/lib/os-release', which is not allowed, refusing."); + return 1; + } + if (r < 0 && r != -ENOENT) + return log_debug_errno(r, "Failed to determine whether '/usr/lib/os-release' exists in the extension: %m"); + + return 0; +} diff --git a/src/shared/extension-util.h b/src/shared/extension-util.h new file mode 100644 index 0000000..3cad219 --- /dev/null +++ b/src/shared/extension-util.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "os-util.h" + +/* Given an image name (for logging purposes), a set of os-release values from the host and a key-value pair + * vector of extension-release variables, check that the distro and (system extension level or distro + * version) match and return 1, and 0 otherwise. */ +int extension_release_validate( + const char *name, + const char *host_os_release_id, + const char *host_os_release_version_id, + const char *host_os_extension_release_level, + const char *host_extension_scope, + char **extension_release, + ImageClass image_class); + +/* Parse hierarchy variables and if not set, return "/usr /opt" for sysext and "/etc" for confext */ +int parse_env_extension_hierarchies(char ***ret_hierarchies, const char *hierarchy_env); + +/* Insist that extension images do not overwrite the underlying OS release file (it's fine if they place one + * in /etc/os-release, i.e. 
where things don't matter, as they aren't merged.) */ +int extension_has_forbidden_content(const char *root); diff --git a/src/shared/fdisk-util.c b/src/shared/fdisk-util.c new file mode 100644 index 0000000..20f32d1 --- /dev/null +++ b/src/shared/fdisk-util.c @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dissect-image.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fdisk-util.h" +#include "parse-util.h" + +#if HAVE_LIBFDISK + +int fdisk_new_context_at( + int dir_fd, + const char *path, + bool read_only, + uint32_t sector_size, + struct fdisk_context **ret) { + + _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(ret); + + if (!isempty(path)) { /* a non-empty path is opened relative to dir_fd; otherwise dir_fd itself is used as the device fd */ + fd = openat(dir_fd, path, (read_only ? O_RDONLY : O_RDWR)|O_CLOEXEC); + if (fd < 0) + return -errno; + + dir_fd = fd; + } + + c = fdisk_new_context(); + if (!c) + return -ENOMEM; + + if (sector_size == UINT32_MAX) { /* UINT32_MAX requests probing the sector size from the device */ + r = probe_sector_size_prefer_ioctl(dir_fd, &sector_size); + if (r < 0) + return r; + } + + if (sector_size != 0) { /* 0 means: do not override the sector size */ + r = fdisk_save_user_sector_size(c, /* phy= */ 0, sector_size); + if (r < 0) + return r; + } + + r = fdisk_assign_device(c, FORMAT_PROC_FD_PATH(dir_fd), read_only); + if (r < 0) + return r; + + *ret = TAKE_PTR(c); + return 0; +} + +int fdisk_partition_get_uuid_as_id128(struct fdisk_partition *p, sd_id128_t *ret) { + const char *ids; + + assert(p); + assert(ret); + + ids = fdisk_partition_get_uuid(p); + if (!ids) + return -ENXIO; + + return sd_id128_from_string(ids, ret); +} + +int fdisk_partition_get_type_as_id128(struct fdisk_partition *p, sd_id128_t *ret) { + struct fdisk_parttype *pt; + const char *pts; + + assert(p); + assert(ret); + + pt = fdisk_partition_get_type(p); + if (!pt) + return -ENXIO; + + pts = fdisk_parttype_get_string(pt); + if (!pts) + return -ENXIO; + + return sd_id128_from_string(pts, ret); +} + +int
fdisk_partition_get_attrs_as_uint64(struct fdisk_partition *pa, uint64_t *ret) { + uint64_t flags = 0; + const char *a; + int r; + + assert(pa); + assert(ret); + + /* Retrieve current flags as uint64_t mask */ + + a = fdisk_partition_get_attrs(pa); + if (!a) { + *ret = 0; + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&a, &word, ",", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + if (streq(word, "RequiredPartition")) + flags |= SD_GPT_FLAG_REQUIRED_PARTITION; + else if (streq(word, "NoBlockIOProtocol")) + flags |= SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL; + else if (streq(word, "LegacyBIOSBootable")) + flags |= SD_GPT_FLAG_LEGACY_BIOS_BOOTABLE; + else { + const char *e; + unsigned u; + + /* Drop "GUID" prefix if specified */ + e = startswith(word, "GUID:") ?: word; + + if (safe_atou(e, &u) < 0) { + log_debug("Unknown partition flag '%s', ignoring.", word); + continue; + } + + if (u >= sizeof(flags)*8) { /* partition flags on GPT are 64-bit. 
Let's ignore any further + bits should libfdisk report them */ + log_debug("Partition flag above bit 63 (%s), ignoring.", word); + continue; + } + + flags |= UINT64_C(1) << u; + } + } + + *ret = flags; + return 0; +} + +int fdisk_partition_set_attrs_as_uint64(struct fdisk_partition *pa, uint64_t flags) { + _cleanup_free_ char *attrs = NULL; + int r; + + assert(pa); + + for (unsigned i = 0; i < sizeof(flags) * 8; i++) { + if (!FLAGS_SET(flags, UINT64_C(1) << i)) + continue; + + r = strextendf_with_separator(&attrs, ",", "%u", i); + if (r < 0) + return r; + } + + return fdisk_partition_set_attrs(pa, strempty(attrs)); +} + +#endif diff --git a/src/shared/fdisk-util.h b/src/shared/fdisk-util.h new file mode 100644 index 0000000..a72a596 --- /dev/null +++ b/src/shared/fdisk-util.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_LIBFDISK + +#include + +#include "sd-id128.h" + +#include "macro.h" + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_context*, fdisk_unref_context, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_partition*, fdisk_unref_partition, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_parttype*, fdisk_unref_parttype, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_table*, fdisk_unref_table, NULL); + +int fdisk_new_context_at(int dir_fd, const char *path, bool read_only, uint32_t sector_size, struct fdisk_context **ret); + +int fdisk_partition_get_uuid_as_id128(struct fdisk_partition *p, sd_id128_t *ret); +int fdisk_partition_get_type_as_id128(struct fdisk_partition *p, sd_id128_t *ret); + +int fdisk_partition_get_attrs_as_uint64(struct fdisk_partition *pa, uint64_t *ret); +int fdisk_partition_set_attrs_as_uint64(struct fdisk_partition *pa, uint64_t flags); + +#endif diff --git a/src/shared/fdset.c b/src/shared/fdset.c new file mode 100644 index 0000000..e5b8e92 --- /dev/null +++ b/src/shared/fdset.c @@ -0,0 +1,323 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include 
+#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fdset.h" +#include "log.h" +#include "macro.h" +#include "parse-util.h" +#include "path-util.h" +#include "set.h" +#include "stat-util.h" + +#define MAKE_SET(s) ((Set*) s) +#define MAKE_FDSET(s) ((FDSet*) s) + +FDSet *fdset_new(void) { + return MAKE_FDSET(set_new(NULL)); +} + +static void fdset_shallow_freep(FDSet **s) { + /* Destroys the set, but does not free the fds inside, like fdset_free()! */ + set_free(MAKE_SET(*ASSERT_PTR(s))); +} + +int fdset_new_array(FDSet **ret, const int fds[], size_t n_fds) { + _cleanup_(fdset_shallow_freep) FDSet *s = NULL; + int r; + + assert(ret); + assert(fds || n_fds == 0); + + s = fdset_new(); + if (!s) + return -ENOMEM; + + for (size_t i = 0; i < n_fds; i++) { + r = fdset_put(s, fds[i]); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(s); + return 0; +} + +void fdset_close(FDSet *s) { + void *p; + + while ((p = set_steal_first(MAKE_SET(s)))) { + int fd = PTR_TO_FD(p); + + /* Valgrind's fd might have ended up in this set here, due to fdset_new_fill(). We'll ignore + * all failures here, so that the EBADFD that valgrind will return us on close() doesn't + * influence us */ + + /* When reloading duplicates of the private bus connection fds and suchlike are closed here, + * which has no effect at all, since they are only duplicates. So don't be surprised about + * these log messages. 
*/ + + if (DEBUG_LOGGING) { + _cleanup_free_ char *path = NULL; + + (void) fd_get_path(fd, &path); + log_debug("Closing set fd %i (%s)", fd, strna(path)); + } + + (void) close_nointr(fd); + } +} + +FDSet* fdset_free(FDSet *s) { + fdset_close(s); + set_free(MAKE_SET(s)); + return NULL; +} + +int fdset_put(FDSet *s, int fd) { + assert(s); + assert(fd >= 0); + + /* Avoid integer overflow in FD_TO_PTR() */ + if (fd == INT_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Refusing invalid fd: %d", fd); + + return set_put(MAKE_SET(s), FD_TO_PTR(fd)); +} + +int fdset_consume(FDSet *s, int fd) { + int r; + + assert(s); + assert(fd >= 0); + + r = fdset_put(s, fd); + if (r < 0) + safe_close(fd); + + return r; +} + +int fdset_put_dup(FDSet *s, int fd) { + _cleanup_close_ int copy = -EBADF; + int r; + + assert(s); + assert(fd >= 0); + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + r = fdset_put(s, copy); + if (r < 0) + return r; + + return TAKE_FD(copy); +} + +bool fdset_contains(FDSet *s, int fd) { + assert(s); + assert(fd >= 0); + + /* Avoid integer overflow in FD_TO_PTR() */ + if (fd == INT_MAX) { + log_debug("Refusing invalid fd: %d", fd); + return false; + } + + return !!set_get(MAKE_SET(s), FD_TO_PTR(fd)); +} + +int fdset_remove(FDSet *s, int fd) { + assert(s); + assert(fd >= 0); + + /* Avoid integer overflow in FD_TO_PTR() */ + if (fd == INT_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Refusing invalid fd: %d", fd); + + return set_remove(MAKE_SET(s), FD_TO_PTR(fd)) ? fd : -ENOENT; +} + +int fdset_new_fill( + int filter_cloexec, /* if < 0 takes all fds, otherwise only those with O_CLOEXEC set (1) or unset (0) */ + FDSet **ret) { + + _cleanup_(fdset_shallow_freep) FDSet *s = NULL; + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(ret); + + /* Creates an fdset and fills in all currently open file descriptors. Also set all collected fds + * to CLOEXEC. 
*/ + + d = opendir("/proc/self/fd"); + if (!d) { + if (errno == ENOENT && proc_mounted() == 0) + return -ENOSYS; + + return -errno; + } + + s = fdset_new(); + if (!s) + return -ENOMEM; + + FOREACH_DIRENT(de, d, return -errno) { + int fd; + + if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN)) + continue; + + fd = parse_fd(de->d_name); + if (fd < 0) + return fd; + + if (fd < 3) + continue; + if (fd == dirfd(d)) + continue; + + if (filter_cloexec >= 0) { + int fl; + + /* If user asked for that filter by O_CLOEXEC. This is useful so that fds that have + * been passed in can be collected and fds which have been created locally can be + * ignored, under the assumption that only the latter have O_CLOEXEC set. */ + + fl = fcntl(fd, F_GETFD); + if (fl < 0) + return -errno; + + if (FLAGS_SET(fl, FD_CLOEXEC) != !!filter_cloexec) + continue; + } + + /* We need to set CLOEXEC manually only if we're collecting non-CLOEXEC fds. */ + if (filter_cloexec <= 0) { + r = fd_cloexec(fd, true); + if (r < 0) + return r; + } + + r = fdset_put(s, fd); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(s); + return 0; +} + +int fdset_cloexec(FDSet *fds, bool b) { + void *p; + int r; + + assert(fds); + + SET_FOREACH(p, MAKE_SET(fds)) { + r = fd_cloexec(PTR_TO_FD(p), b); + if (r < 0) + return r; + } + + return 0; +} + +int fdset_new_listen_fds(FDSet **ret, bool unset) { + _cleanup_(fdset_shallow_freep) FDSet *s = NULL; + int n, fd, r; + + assert(ret); + + /* Creates an fdset and fills in all passed file descriptors */ + + s = fdset_new(); + if (!s) + return -ENOMEM; + + n = sd_listen_fds(unset); + for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd ++) { + r = fdset_put(s, fd); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(s); + return 0; +} + +int fdset_to_array(FDSet *fds, int **ret) { + unsigned j = 0, m; + void *e; + int *a; + + assert(ret); + + m = fdset_size(fds); + if (m > INT_MAX) /* We want to be able to return an "int" */ + return -ENOMEM; + if (m == 0) { + *ret = NULL; /* 
suppress array allocation if empty */ + return 0; + } + + a = new(int, m); + if (!a) + return -ENOMEM; + + SET_FOREACH(e, MAKE_SET(fds)) + a[j++] = PTR_TO_FD(e); + + assert(j == m); + + *ret = TAKE_PTR(a); + return (int) m; +} + +int fdset_close_others(FDSet *fds) { + _cleanup_free_ int *a = NULL; + int n; + + n = fdset_to_array(fds, &a); + if (n < 0) + return n; + + return close_all_fds(a, n); +} + +unsigned fdset_size(FDSet *fds) { + return set_size(MAKE_SET(fds)); +} + +bool fdset_isempty(FDSet *fds) { + return set_isempty(MAKE_SET(fds)); +} + +int fdset_iterate(FDSet *s, Iterator *i) { + void *p; + + if (!set_iterate(MAKE_SET(s), i, &p)) + return -ENOENT; + + return PTR_TO_FD(p); +} + +int fdset_steal_first(FDSet *fds) { + void *p; + + p = set_steal_first(MAKE_SET(fds)); + if (!p) + return -ENOENT; + + return PTR_TO_FD(p); +} diff --git a/src/shared/fdset.h b/src/shared/fdset.h new file mode 100644 index 0000000..70a764f --- /dev/null +++ b/src/shared/fdset.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "hashmap.h" +#include "macro.h" +#include "set.h" + +typedef struct FDSet FDSet; + +FDSet* fdset_new(void); +FDSet* fdset_free(FDSet *s); + +int fdset_put(FDSet *s, int fd); +int fdset_consume(FDSet *s, int fd); +int fdset_put_dup(FDSet *s, int fd); + +bool fdset_contains(FDSet *s, int fd); +int fdset_remove(FDSet *s, int fd); + +int fdset_new_array(FDSet **ret, const int *fds, size_t n_fds); +int fdset_new_fill(int filter_cloexec, FDSet **ret); +int fdset_new_listen_fds(FDSet **ret, bool unset); + +int fdset_cloexec(FDSet *fds, bool b); + +int fdset_to_array(FDSet *fds, int **ret); + +int fdset_close_others(FDSet *fds); + +unsigned fdset_size(FDSet *fds); +bool fdset_isempty(FDSet *fds); + +int fdset_iterate(FDSet *s, Iterator *i); + +int fdset_steal_first(FDSet *fds); + +void fdset_close(FDSet *fds); + +#define _FDSET_FOREACH(fd, fds, i) \ + for (Iterator i = ITERATOR_FIRST; ((fd) = 
fdset_iterate((fds), &i)) >= 0; ) +#define FDSET_FOREACH(fd, fds) \ + _FDSET_FOREACH(fd, fds, UNIQ_T(i, UNIQ)) + +DEFINE_TRIVIAL_CLEANUP_FUNC(FDSet*, fdset_free); +#define _cleanup_fdset_free_ _cleanup_(fdset_freep) diff --git a/src/shared/fileio-label.c b/src/shared/fileio-label.c new file mode 100644 index 0000000..572b8f6 --- /dev/null +++ b/src/shared/fileio-label.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "fileio-label.h" +#include "fileio.h" +#include "selinux-util.h" + +int write_string_file_atomic_label_ts(const char *fn, const char *line, struct timespec *ts) { + int r; + + r = mac_selinux_create_file_prepare(fn, S_IFREG); + if (r < 0) + return r; + + r = write_string_file_ts(fn, line, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC, ts); + + mac_selinux_create_file_clear(); + + return r; +} + +int create_shutdown_run_nologin_or_warn(void) { + int r; + + /* This is used twice: once in systemd-user-sessions.service, in order to block logins when we + * actually go down, and once in systemd-logind.service when shutdowns are scheduled, and logins are + * to be turned off a bit in advance. We use the same wording of the message in both cases. + * + * Traditionally, there was only /etc/nologin, and we managed that. Then, in PAM 1.1 + * support for /run/nologin was added as alternative + * (https://github.com/linux-pam/linux-pam/commit/e9e593f6ddeaf975b7fe8446d184e6bc387d450b). + * 13 years later we stopped managing /etc/nologin, leaving it for the administrator to manage. + */ + + r = write_string_file_atomic_label("/run/nologin", + "System is going down. Unprivileged users are not permitted to log in anymore. 
" + "For technical details, see pam_nologin(8)."); + if (r < 0) + return log_error_errno(r, "Failed to create /run/nologin: %m"); + + return 0; +} diff --git a/src/shared/fileio-label.h b/src/shared/fileio-label.h new file mode 100644 index 0000000..03b4a16 --- /dev/null +++ b/src/shared/fileio-label.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +/* These functions are split out of fileio.h (and not for example just flags to the functions they wrap) in order to + * optimize linking: This way, -lselinux is needed only for the callers of these functions that need selinux, but not + * for all */ + +int write_string_file_atomic_label_ts(const char *fn, const char *line, struct timespec *ts); +static inline int write_string_file_atomic_label(const char *fn, const char *line) { + return write_string_file_atomic_label_ts(fn, line, NULL); +} + +int create_shutdown_run_nologin_or_warn(void); diff --git a/src/shared/find-esp.c b/src/shared/find-esp.c new file mode 100644 index 0000000..db87084 --- /dev/null +++ b/src/shared/find-esp.c @@ -0,0 +1,909 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-device.h" +#include "sd-id128.h" + +#include "alloc-util.h" +#include "blkid-util.h" +#include "btrfs-util.h" +#include "chase.h" +#include "device-util.h" +#include "devnum-util.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "find-esp.h" +#include "gpt.h" +#include "mount-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "virt.h" + +typedef enum VerifyESPFlags { + VERIFY_ESP_SEARCHING = 1 << 0, /* Downgrade various "not found" logs to debug level */ + VERIFY_ESP_UNPRIVILEGED_MODE = 1 << 1, /* Call into udev rather than blkid */ + VERIFY_ESP_SKIP_FSTYPE_CHECK = 1 << 2, /* Skip filesystem check */ + VERIFY_ESP_SKIP_DEVICE_CHECK = 1 << 3, /* Skip device node check */ +} VerifyESPFlags; + 
+static VerifyESPFlags verify_esp_flags_init(int unprivileged_mode, const char *env_name_for_relaxing) { + VerifyESPFlags flags = 0; + int r; + + assert(env_name_for_relaxing); + + if (unprivileged_mode < 0) + unprivileged_mode = geteuid() != 0; + if (unprivileged_mode) + flags |= VERIFY_ESP_UNPRIVILEGED_MODE; + + r = getenv_bool(env_name_for_relaxing); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $%s environment variable, assuming false.", env_name_for_relaxing); + else if (r > 0) + flags |= VERIFY_ESP_SKIP_FSTYPE_CHECK | VERIFY_ESP_SKIP_DEVICE_CHECK; + + if (detect_container() > 0) + flags |= VERIFY_ESP_SKIP_DEVICE_CHECK; + + return flags; +} + +static int verify_esp_blkid( + dev_t devid, + VerifyESPFlags flags, + uint32_t *ret_part, + uint64_t *ret_pstart, + uint64_t *ret_psize, + sd_id128_t *ret_uuid) { + + sd_id128_t uuid = SD_ID128_NULL; + uint64_t pstart = 0, psize = 0; + uint32_t part = 0; + +#if HAVE_BLKID + _cleanup_(blkid_free_probep) blkid_probe b = NULL; + _cleanup_free_ char *node = NULL; + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING); + const char *v; + int r; + + r = devname_from_devnum(S_IFBLK, devid, &node); + if (r < 0) + return log_error_errno(r, "Failed to get device path for " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(devid)); + + errno = 0; + b = blkid_new_probe_from_filename(node); + if (!b) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(ENOMEM), "Failed to open file system \"%s\": %m", node); + + blkid_probe_enable_superblocks(b, 1); + blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE); + blkid_probe_enable_partitions(b, 1); + blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS); + + errno = 0; + r = blkid_do_safeprobe(b); + if (r == -2) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system \"%s\" is ambiguous.", node); + if (r == 1) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system \"%s\" does not contain a label.", node); + if (r != 0) + return log_error_errno(errno 
?: SYNTHETIC_ERRNO(EIO), "Failed to probe file system \"%s\": %m", node); + + r = blkid_probe_lookup_value(b, "TYPE", &v, NULL); + if (r != 0) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "No filesystem found on \"%s\": %m", node); + if (!streq(v, "vfat")) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "File system \"%s\" is not FAT.", node); + + r = blkid_probe_lookup_value(b, "PART_ENTRY_SCHEME", &v, NULL); + if (r != 0) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "File system \"%s\" is not located on a partitioned block device.", node); + if (!streq(v, "gpt")) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "File system \"%s\" is not on a GPT partition table.", node); + + errno = 0; /* cleared so the "errno ?:" fallback below can tell whether the lookup set it */ + r = blkid_probe_lookup_value(b, "PART_ENTRY_TYPE", &v, NULL); + if (r != 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition type UUID of \"%s\": %m", node); + if (sd_id128_string_equal(v, SD_GPT_ESP) <= 0) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ?
EADDRNOTAVAIL : ENODEV), + "File system \"%s\" has wrong type for an EFI System Partition (ESP).", node); + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_UUID", &v, NULL); + if (r != 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition entry UUID of \"%s\": %m", node); + r = sd_id128_from_string(v, &uuid); + if (r < 0) + return log_error_errno(r, "Partition \"%s\" has invalid UUID \"%s\".", node, v); + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_NUMBER", &v, NULL); + if (r != 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition number of \"%s\": %m", node); + r = safe_atou32(v, &part); + if (r < 0) + return log_error_errno(r, "Failed to parse PART_ENTRY_NUMBER field."); + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_OFFSET", &v, NULL); + if (r != 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition offset of \"%s\": %m", node); + r = safe_atou64(v, &pstart); + if (r < 0) + return log_error_errno(r, "Failed to parse PART_ENTRY_OFFSET field."); + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_SIZE", &v, NULL); + if (r != 0) + return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition size of \"%s\": %m", node); + r = safe_atou64(v, &psize); + if (r < 0) + return log_error_errno(r, "Failed to parse PART_ENTRY_SIZE field."); +#endif + + if (ret_part) + *ret_part = part; + if (ret_pstart) + *ret_pstart = pstart; + if (ret_psize) + *ret_psize = psize; + if (ret_uuid) + *ret_uuid = uuid; + + return 0; +} + +static int verify_esp_udev( + dev_t devid, + VerifyESPFlags flags, + uint32_t *ret_part, + uint64_t *ret_pstart, + uint64_t *ret_psize, + sd_id128_t *ret_uuid) { + + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING); + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + sd_id128_t uuid = SD_ID128_NULL; + uint64_t pstart = 0, psize = 0; + uint32_t part = 0; + const char *node, *v; + int r; + + 
r = sd_device_new_from_devnum(&d, 'b', devid); + if (r < 0) + return log_error_errno(r, "Failed to get device from device number: %m"); + + r = sd_device_get_devname(d, &node); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device node: %m"); + + r = sd_device_get_property_value(d, "ID_FS_TYPE", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device property: %m"); + if (!streq(v, "vfat")) + return log_device_full_errno(d, + searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "File system \"%s\" is not FAT.", node ); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_SCHEME", &v); + if (r < 0) + return log_device_full_errno(d, + searching && r == -ENOENT ? LOG_DEBUG : LOG_ERR, + searching && r == -ENOENT ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : r, + "Failed to get device property: %m"); + if (!streq(v, "gpt")) + return log_device_full_errno(d, + searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "File system \"%s\" is not on a GPT partition table.", node); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device property: %m"); + if (sd_id128_string_equal(v, SD_GPT_ESP) <= 0) + return log_device_full_errno(d, + searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? 
EADDRNOTAVAIL : ENODEV), + "File system \"%s\" has wrong type for an EFI System Partition (ESP).", node); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_UUID", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device property: %m"); + r = sd_id128_from_string(v, &uuid); + if (r < 0) + return log_device_error_errno(d, r, "Partition \"%s\" has invalid UUID \"%s\".", node, v); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_NUMBER", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device property: %m"); + r = safe_atou32(v, &part); + if (r < 0) + return log_device_error_errno(d, r, "Failed to parse PART_ENTRY_NUMBER field."); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_OFFSET", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device property: %m"); + r = safe_atou64(v, &pstart); + if (r < 0) + return log_device_error_errno(d, r, "Failed to parse PART_ENTRY_OFFSET field."); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_SIZE", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device property: %m"); + r = safe_atou64(v, &psize); + if (r < 0) + return log_device_error_errno(d, r, "Failed to parse PART_ENTRY_SIZE field."); + + if (ret_part) + *ret_part = part; + if (ret_pstart) + *ret_pstart = pstart; + if (ret_psize) + *ret_psize = psize; + if (ret_uuid) + *ret_uuid = uuid; + + return 0; +} + +static int verify_fsroot_dir( + int dir_fd, + const char *path, + VerifyESPFlags flags, + dev_t *ret_dev) { + + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING), + unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE); + _cleanup_free_ char *f = NULL; + STRUCT_NEW_STATX_DEFINE(sxa); + STRUCT_NEW_STATX_DEFINE(sxb); + int r; + + /* Checks if the specified directory is at the root of its file system, and returns device + * major/minor of the device, if it is. 
*/ + + assert(dir_fd >= 0); + assert(path); + + /* We pass the full path from the root directory file descriptor so we can use it for logging, but + * dir_fd points to the parent directory of the final component of the given path, so we extract the + * filename and operate on that. */ + + r = path_extract_filename(path, &f); + if (r < 0 && r != -EADDRNOTAVAIL) + return log_error_errno(r, "Failed to extract filename of %s: %m", path); + + r = statx_fallback(dir_fd, strempty(f), AT_SYMLINK_NOFOLLOW|(isempty(f) ? AT_EMPTY_PATH : 0), + STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxa.sx); + if (r < 0) + return log_full_errno((searching && r == -ENOENT) || + (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_ERR, r, + "Failed to determine block device node of \"%s\": %m", path); + + assert(S_ISDIR(sxa.sx.stx_mode)); /* We used O_DIRECTORY above, when opening, so this must hold */ + + if (FLAGS_SET(sxa.sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) { + + /* If we have STATX_ATTR_MOUNT_ROOT, we are happy, that's all we need. We operate under the + * assumption that a top of a mount point is also the top of the file system. (Which of + * course is strictly speaking not always true...) */ + + if (!FLAGS_SET(sxa.sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "Directory \"%s\" is not the root of the file system.", path); + + goto success; + } + + /* Now let's look at the parent */ + r = statx_fallback(dir_fd, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxb.sx); + if (r < 0) + return log_full_errno(unprivileged_mode && ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_ERR, r, + "Failed to determine block device node of parent of \"%s\": %m", path); + + if (statx_inode_same(&sxa.sx, &sxb.sx)) /* for the root dir inode nr for both inodes will be the same */ + goto success; + + if (statx_mount_same(&sxa.nsx, &sxb.nsx)) + return log_full_errno(searching ? 
LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "Directory \"%s\" is not the root of the file system.", path); + +success: + if (!ret_dev) + return 0; + + if (sxa.sx.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */ + return btrfs_get_block_device_at(dir_fd, strempty(f), ret_dev); + + *ret_dev = makedev(sxa.sx.stx_dev_major, sxa.sx.stx_dev_minor); + return 0; +} + +static int verify_esp( + int rfd, + const char *path, + char **ret_path, + uint32_t *ret_part, + uint64_t *ret_pstart, + uint64_t *ret_psize, + sd_id128_t *ret_uuid, + dev_t *ret_devid, + VerifyESPFlags flags) { + + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING), + unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE); + _cleanup_free_ char *p = NULL; + _cleanup_close_ int pfd = -EBADF; + dev_t devid = 0; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(path); + + /* This logs about all errors, except: + * + * -ENOENT → if 'searching' is set, and the dir doesn't exist + * -EADDRNOTAVAIL → if 'searching' is set, and the dir doesn't look like an ESP + * -EACCES → if 'unprivileged_mode' is set, and we have trouble accessing the thing + */ + + /* Non-root user can only check the status, so if an error occurred in the following, it does not cause any + * issues. Let's also silence the error messages. */ + + r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT|CHASE_PARENT, &p, &pfd); + if (r < 0) + return log_full_errno((searching && r == -ENOENT) || + (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? 
LOG_DEBUG : LOG_ERR, + r, "Failed to open parent directory of \"%s\": %m", path); + + if (!FLAGS_SET(flags, VERIFY_ESP_SKIP_FSTYPE_CHECK)) { + _cleanup_free_ char *f = NULL; + struct statfs sfs; + + r = path_extract_filename(p, &f); + if (r < 0 && r != -EADDRNOTAVAIL) + return log_error_errno(r, "Failed to extract filename of %s: %m", p); + + /* Trigger any automounts so that xstatfsat() operates on the mount instead of the mountpoint + * directory. */ + r = trigger_automount_at(pfd, f); + if (r < 0) + return log_error_errno(r, "Failed to trigger automount at %s: %m", p); + + r = xstatfsat(pfd, strempty(f), &sfs); + if (r < 0) + /* If we are searching for the mount point, don't generate a log message if we can't find the path */ + return log_full_errno((searching && r == -ENOENT) || + (unprivileged_mode && r == -EACCES) ? LOG_DEBUG : LOG_ERR, r, + "Failed to check file system type of \"%s\": %m", p); + + if (!F_TYPE_EQUAL(sfs.f_type, MSDOS_SUPER_MAGIC)) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "File system \"%s\" is not a FAT EFI System Partition (ESP) file system.", p); + } + + r = verify_fsroot_dir(pfd, p, flags, FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK) ? NULL : &devid); + if (r < 0) + return r; + + /* In a container we don't have access to block devices, skip this part of the verification, we trust + * the container manager set everything up correctly on its own. */ + if (FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK)) + goto finish; + + if (devnum_is_zero(devid)) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "Could not determine backing block device of directory \"%s\" (btrfs RAID?).", p); + + /* If we are unprivileged we ask udev for the metadata about the partition. If we are privileged we + * use blkid instead. Why? 
Because this code is called from 'bootctl' which is pretty much an + * emergency recovery tool that should also work when udev isn't up (i.e. from the emergency shell), + * however blkid can't work if we have no privileges to access block devices directly, which is why + * we use udev in that case. */ + if (unprivileged_mode) + r = verify_esp_udev(devid, flags, ret_part, ret_pstart, ret_psize, ret_uuid); + else + r = verify_esp_blkid(devid, flags, ret_part, ret_pstart, ret_psize, ret_uuid); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(p); + if (ret_devid) + *ret_devid = devid; + + return 0; + +finish: + if (ret_path) + *ret_path = TAKE_PTR(p); + if (ret_part) + *ret_part = 0; + if (ret_pstart) + *ret_pstart = 0; + if (ret_psize) + *ret_psize = 0; + if (ret_uuid) + *ret_uuid = SD_ID128_NULL; + if (ret_devid) + *ret_devid = 0; + + return 0; +} + +int find_esp_and_warn_at( + int rfd, + const char *path, + int unprivileged_mode, + char **ret_path, + uint32_t *ret_part, + uint64_t *ret_pstart, + uint64_t *ret_psize, + sd_id128_t *ret_uuid, + dev_t *ret_devid) { + + VerifyESPFlags flags; + int r; + + /* This logs about all errors except: + * + * -ENOKEY → when we can't find the partition + * -EACCES → when unprivileged_mode is true, and we can't access something + */ + + assert(rfd >= 0 || rfd == AT_FDCWD); + + flags = verify_esp_flags_init(unprivileged_mode, "SYSTEMD_RELAX_ESP_CHECKS"); + + if (path) + return verify_esp(rfd, path, ret_path, ret_part, ret_pstart, ret_psize, ret_uuid, ret_devid, flags); + + path = getenv("SYSTEMD_ESP_PATH"); + if (path) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + + if (!path_is_valid(path) || !path_is_absolute(path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "$SYSTEMD_ESP_PATH does not refer to an absolute path, refusing to use it: %s", + path); + + r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT, &p, &fd); + if (r < 0) + return log_error_errno(r, "Failed to 
resolve path %s: %m", path); + + /* Note: when the user explicitly configured things with an env var we won't validate the + * path beyond checking it refers to a directory. After all we want this to be useful for + * testing. */ + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", p); + if (!S_ISDIR(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "ESP path '%s' is not a directory.", p); + + if (ret_path) + *ret_path = TAKE_PTR(p); + if (ret_part) + *ret_part = 0; + if (ret_pstart) + *ret_pstart = 0; + if (ret_psize) + *ret_psize = 0; + if (ret_uuid) + *ret_uuid = SD_ID128_NULL; + if (ret_devid) + *ret_devid = st.st_dev; + + return 0; + } + + FOREACH_STRING(dir, "/efi", "/boot", "/boot/efi") { + r = verify_esp(rfd, dir, ret_path, ret_part, ret_pstart, ret_psize, ret_uuid, ret_devid, + flags | VERIFY_ESP_SEARCHING); + if (r >= 0) + return 0; + if (!IN_SET(r, -ENOENT, -EADDRNOTAVAIL, -ENOTDIR, -ENOTTY)) /* This one is not it */ + return r; + } + + /* No logging here */ + return -ENOKEY; +} + +int find_esp_and_warn( + const char *root, + const char *path, + int unprivileged_mode, + char **ret_path, + uint32_t *ret_part, + uint64_t *ret_pstart, + uint64_t *ret_psize, + sd_id128_t *ret_uuid, + dev_t *ret_devid) { + + _cleanup_close_ int rfd = -EBADF; + _cleanup_free_ char *p = NULL; + uint32_t part; + uint64_t pstart, psize; + sd_id128_t uuid; + dev_t devid; + int r; + + rfd = open(empty_to_root(root), O_PATH|O_DIRECTORY|O_CLOEXEC); + if (rfd < 0) + return -errno; + + r = find_esp_and_warn_at(rfd, path, unprivileged_mode, + ret_path ? &p : NULL, + ret_part ? &part : NULL, + ret_pstart ? &pstart : NULL, + ret_psize ? &psize : NULL, + ret_uuid ? &uuid : NULL, + ret_devid ? 
&devid : NULL); + if (r < 0) + return r; + + if (ret_path) { + r = chaseat_prefix_root(p, root, ret_path); + if (r < 0) + return r; + } + if (ret_part) + *ret_part = part; + if (ret_pstart) + *ret_pstart = pstart; + if (ret_psize) + *ret_psize = psize; + if (ret_uuid) + *ret_uuid = uuid; + if (ret_devid) + *ret_devid = devid; + + return 0; +} + +static int verify_xbootldr_blkid( + dev_t devid, + VerifyESPFlags flags, + sd_id128_t *ret_uuid) { + + sd_id128_t uuid = SD_ID128_NULL; + +#if HAVE_BLKID + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING); + _cleanup_(blkid_free_probep) blkid_probe b = NULL; + _cleanup_free_ char *node = NULL; + const char *type, *v; + int r; + + r = devname_from_devnum(S_IFBLK, devid, &node); + if (r < 0) + return log_error_errno(r, "Failed to get block device path for " DEVNUM_FORMAT_STR ": %m", + DEVNUM_FORMAT_VAL(devid)); + + errno = 0; + b = blkid_new_probe_from_filename(node); + if (!b) + return log_error_errno(errno_or_else(ENOMEM), "%s: Failed to create blkid probe: %m", node); + + blkid_probe_enable_partitions(b, 1); + blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS); + + errno = 0; + r = blkid_do_safeprobe(b); + if (r == _BLKID_SAFEPROBE_AMBIGUOUS) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "%s: File system is ambiguous.", node); + if (r == _BLKID_SAFEPROBE_NOT_FOUND) + return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "%s: File system does not contain a label.", node); + if (r == _BLKID_SAFEPROBE_ERROR) + return log_error_errno(errno_or_else(EIO), "%s: Failed to probe file system: %m", node); + + assert(r == _BLKID_SAFEPROBE_FOUND); + + r = blkid_probe_lookup_value(b, "PART_ENTRY_SCHEME", &type, NULL); + if (r != 0) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + searching ? 
SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(EIO), + "%s: Failed to probe PART_ENTRY_SCHEME: %m", node); + if (streq(type, "gpt")) { + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_TYPE", &v, NULL); + if (r != 0) + return log_error_errno(errno_or_else(EIO), "%s: Failed to probe PART_ENTRY_TYPE: %m", node); + if (sd_id128_string_equal(v, SD_GPT_XBOOTLDR) <= 0) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV), + "%s: Partition has wrong PART_ENTRY_TYPE=%s for XBOOTLDR partition.", node, v); + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_UUID", &v, NULL); + if (r != 0) + return log_error_errno(errno_or_else(EIO), "%s: Failed to probe PART_ENTRY_UUID: %m", node); + r = sd_id128_from_string(v, &uuid); + if (r < 0) + return log_error_errno(r, "%s: Partition has invalid UUID PART_ENTRY_UUID=%s: %m", node, v); + + } else if (streq(type, "dos")) { + + errno = 0; + r = blkid_probe_lookup_value(b, "PART_ENTRY_TYPE", &v, NULL); + if (r != 0) + return log_error_errno(errno_or_else(EIO), "%s: Failed to probe PART_ENTRY_TYPE: %m", node); + if (!streq(v, "0xea")) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV), + "%s: Wrong PART_ENTRY_TYPE=%s for XBOOTLDR partition.", node, v); + + } else + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + searching ? 
SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV), + "%s: Not on a GPT or DOS partition table (PART_ENTRY_SCHEME=%s).", node, type); +#endif + + if (ret_uuid) + *ret_uuid = uuid; + + return 0; +} + +static int verify_xbootldr_udev( + dev_t devid, + VerifyESPFlags flags, + sd_id128_t *ret_uuid) { + + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING); + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + sd_id128_t uuid = SD_ID128_NULL; + const char *node, *type, *v; + int r; + + r = sd_device_new_from_devnum(&d, 'b', devid); + if (r < 0) + return log_error_errno(r, "Failed to get block device for " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(devid)); + + r = sd_device_get_devname(d, &node); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device node: %m"); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_SCHEME", &type); + if (r < 0) + return log_device_full_errno(d, + searching && r == -ENOENT ? LOG_DEBUG : LOG_ERR, + searching && r == -ENOENT ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : r, + "Failed to query ID_PART_ENTRY_SCHEME: %m"); + + if (streq(type, "gpt")) { + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to query ID_PART_ENTRY_TYPE: %m"); + + r = sd_id128_string_equal(v, SD_GPT_XBOOTLDR); + if (r < 0) + return log_device_error_errno(d, r, "Failed to parse ID_PART_ENTRY_TYPE=%s: %m", v); + if (r == 0) + return log_device_full_errno( + d, + searching ? LOG_DEBUG : LOG_ERR, + searching ? 
SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV), + "Partition has wrong ID_PART_ENTRY_TYPE=%s for XBOOTLDR partition.", v); + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_UUID", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to query ID_PART_ENTRY_UUID: %m"); + r = sd_id128_from_string(v, &uuid); + if (r < 0) + return log_device_error_errno(d, r, "Partition has invalid UUID ID_PART_ENTRY_UUID=%s: %m", v); + + } else if (streq(type, "dos")) { + + r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &v); + if (r < 0) + return log_device_error_errno(d, r, "Failed to query ID_PART_ENTRY_TYPE: %m"); + if (!streq(v, "0xea")) + return log_device_full_errno( + d, + searching ? LOG_DEBUG : LOG_ERR, + searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV), + "Wrong ID_PART_ENTRY_TYPE=%s for XBOOTLDR partition.", v); + + } else + return log_device_full_errno( + d, + searching ? LOG_DEBUG : LOG_ERR, + searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV), + "Not on a GPT or DOS partition table (ID_PART_ENTRY_SCHEME=%s).", type); + + if (ret_uuid) + *ret_uuid = uuid; + + return 0; +} + +static int verify_xbootldr( + int rfd, + const char *path, + VerifyESPFlags flags, + char **ret_path, + sd_id128_t *ret_uuid, + dev_t *ret_devid) { + + _cleanup_free_ char *p = NULL; + _cleanup_close_ int pfd = -EBADF; + bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING), + unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE); + dev_t devid = 0; + int r; + + assert(rfd >= 0 || rfd == AT_FDCWD); + assert(path); + + r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT|CHASE_PARENT, &p, &pfd); + if (r < 0) + return log_full_errno((searching && r == -ENOENT) || + (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_ERR, + r, "Failed to open parent directory of \"%s\": %m", path); + + r = verify_fsroot_dir(pfd, p, flags, FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK) ? 
NULL : &devid); + if (r < 0) + return r; + + if (FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK)) + goto finish; + + if (devnum_is_zero(devid)) + return log_full_errno(searching ? LOG_DEBUG : LOG_ERR, + SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV), + "Could not determine backing block device of directory \"%s\" (btrfs RAID?).%s", + p, + searching ? "" : + "\nHint: set $SYSTEMD_RELAX_XBOOTLDR_CHECKS=yes environment variable " + "to bypass this and further verifications for the directory."); + + if (unprivileged_mode) + r = verify_xbootldr_udev(devid, flags, ret_uuid); + else + r = verify_xbootldr_blkid(devid, flags, ret_uuid); + if (r < 0) + return r; + + if (ret_path) + *ret_path = TAKE_PTR(p); + if (ret_devid) + *ret_devid = devid; + + return 0; + +finish: + if (ret_path) + *ret_path = TAKE_PTR(p); + if (ret_uuid) + *ret_uuid = SD_ID128_NULL; + if (ret_devid) + *ret_devid = 0; + + return 0; +} + +int find_xbootldr_and_warn_at( + int rfd, + const char *path, + int unprivileged_mode, + char **ret_path, + sd_id128_t *ret_uuid, + dev_t *ret_devid) { + + VerifyESPFlags flags; + int r; + + /* Similar to find_esp_and_warn(), but finds the XBOOTLDR partition. Returns the same errors. 
*/ + + assert(rfd >= 0 || rfd == AT_FDCWD); + + flags = verify_esp_flags_init(unprivileged_mode, "SYSTEMD_RELAX_XBOOTLDR_CHECKS"); + + if (path) + return verify_xbootldr(rfd, path, flags, ret_path, ret_uuid, ret_devid); + + path = getenv("SYSTEMD_XBOOTLDR_PATH"); + if (path) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + + if (!path_is_valid(path) || !path_is_absolute(path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "$SYSTEMD_XBOOTLDR_PATH does not refer to an absolute path, refusing to use it: %s", + path); + + r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT, &p, &fd); + if (r < 0) + return log_error_errno(r, "Failed to resolve path %s: %m", path); /* 'p' may still be NULL here, log the input path (as find_esp_and_warn_at() does) */ + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", p); + if (!S_ISDIR(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "XBOOTLDR path '%s' is not a directory.", p); + + if (ret_path) + *ret_path = TAKE_PTR(p); + if (ret_uuid) + *ret_uuid = SD_ID128_NULL; + if (ret_devid) + *ret_devid = st.st_dev; + + return 0; + } + + r = verify_xbootldr(rfd, "/boot", flags | VERIFY_ESP_SEARCHING, ret_path, ret_uuid, ret_devid); + if (r < 0) { + if (!IN_SET(r, -ENOENT, -EADDRNOTAVAIL, -ENOTDIR, -ENOTTY)) /* This one is not it */ + return r; + + return -ENOKEY; + } + + return 0; +} + +int find_xbootldr_and_warn( + const char *root, + const char *path, + int unprivileged_mode, + char **ret_path, + sd_id128_t *ret_uuid, + dev_t *ret_devid) { + + _cleanup_close_ int rfd = -EBADF; + _cleanup_free_ char *p = NULL; + sd_id128_t uuid; + dev_t devid; + int r; + + rfd = open(empty_to_root(root), O_PATH|O_DIRECTORY|O_CLOEXEC); + if (rfd < 0) + return -errno; + + r = find_xbootldr_and_warn_at(rfd, path, unprivileged_mode, + ret_path ? &p : NULL, + ret_uuid ? &uuid : NULL, + ret_devid ? 
&devid : NULL); + if (r < 0) + return r; + + if (ret_path) { + r = chaseat_prefix_root(p, root, ret_path); + if (r < 0) + return r; + } + if (ret_uuid) + *ret_uuid = uuid; + if (ret_devid) + *ret_devid = devid; + + return 0; +} diff --git a/src/shared/find-esp.h b/src/shared/find-esp.h new file mode 100644 index 0000000..2e132a7 --- /dev/null +++ b/src/shared/find-esp.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include +#include +#include + +#include "sd-id128.h" + +int find_esp_and_warn_at(int rfd, const char *path, int unprivileged_mode, char **ret_path, uint32_t *ret_part, uint64_t *ret_pstart, uint64_t *ret_psize, sd_id128_t *ret_uuid, dev_t *ret_devid); +int find_esp_and_warn(const char *root, const char *path, int unprivileged_mode, char **ret_path, uint32_t *ret_part, uint64_t *ret_pstart, uint64_t *ret_psize, sd_id128_t *ret_uuid, dev_t *ret_devid); + +int find_xbootldr_and_warn_at(int rfd, const char *path, int unprivileged_mode, char **ret_path, sd_id128_t *ret_uuid, dev_t *ret_devid); +int find_xbootldr_and_warn(const char *root, const char *path, int unprivileged_mode, char **ret_path, sd_id128_t *ret_uuid, dev_t *ret_devid); diff --git a/src/shared/firewall-util-iptables.c b/src/shared/firewall-util-iptables.c new file mode 100644 index 0000000..b70b740 --- /dev/null +++ b/src/shared/firewall-util-iptables.c @@ -0,0 +1,392 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* Temporary work-around for broken glibc vs. linux kernel header definitions + * This is already fixed upstream, remove this when distributions have updated. 
+ */ +#define _NET_IF_H 1 + +#include +#include +#include +#include +#include +#include +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dlfcn-util.h" +#include "firewall-util.h" +#include "firewall-util-private.h" +#include "in-addr-util.h" +#include "macro.h" +#include "socket-util.h" + +static DLSYM_FUNCTION(iptc_check_entry); +static DLSYM_FUNCTION(iptc_commit); +static DLSYM_FUNCTION(iptc_delete_entry); +static DLSYM_FUNCTION(iptc_free); +static DLSYM_FUNCTION(iptc_init); +static DLSYM_FUNCTION(iptc_insert_entry); +static DLSYM_FUNCTION(iptc_strerror); + +static void *iptc_dl = NULL; + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct xtc_handle*, sym_iptc_free, NULL); + +static int entry_fill_basics( + struct ipt_entry *entry, + int protocol, + const char *in_interface, + const union in_addr_union *source, + unsigned source_prefixlen, + const char *out_interface, + const union in_addr_union *destination, + unsigned destination_prefixlen) { + + assert(entry); + + if (out_interface && !ifname_valid(out_interface)) + return -EINVAL; + if (in_interface && !ifname_valid(in_interface)) + return -EINVAL; + + entry->ip.proto = protocol; + + if (in_interface) { + size_t l; + + l = strlen(in_interface); + assert(l < sizeof entry->ip.iniface); + assert(l < sizeof entry->ip.iniface_mask); + + strcpy(entry->ip.iniface, in_interface); + memset(entry->ip.iniface_mask, 0xFF, l + 1); + } + if (source) { + entry->ip.src = source->in; + in4_addr_prefixlen_to_netmask(&entry->ip.smsk, source_prefixlen); + } + + if (out_interface) { + size_t l = strlen(out_interface); + assert(l < sizeof entry->ip.outiface); + assert(l < sizeof entry->ip.outiface_mask); + + strcpy(entry->ip.outiface, out_interface); + memset(entry->ip.outiface_mask, 0xFF, l + 1); + } + if (destination) { + entry->ip.dst = destination->in; + in4_addr_prefixlen_to_netmask(&entry->ip.dmsk, destination_prefixlen); + } + + return 0; +} + +int 
fw_iptables_add_masquerade( + bool add, + int af, + const union in_addr_union *source, + unsigned source_prefixlen) { + + static const xt_chainlabel chain = "POSTROUTING"; + _cleanup_(sym_iptc_freep) struct xtc_handle *h = NULL; + struct ipt_entry *entry, *mask; + struct ipt_entry_target *t; + size_t sz; + struct nf_nat_ipv4_multi_range_compat *mr; + int r, protocol = 0; + const char *out_interface = NULL; + const union in_addr_union *destination = NULL; + unsigned destination_prefixlen = 0; + + if (af != AF_INET) + return -EOPNOTSUPP; + + if (!source || source_prefixlen == 0) + return -EINVAL; + + r = fw_iptables_init_nat(&h); + if (r < 0) + return r; + + sz = XT_ALIGN(sizeof(struct ipt_entry)) + + XT_ALIGN(sizeof(struct ipt_entry_target)) + + XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat)); + + /* Put together the entry we want to add or remove */ + entry = alloca0(sz); + entry->next_offset = sz; + entry->target_offset = XT_ALIGN(sizeof(struct ipt_entry)); + r = entry_fill_basics(entry, protocol, NULL, source, source_prefixlen, out_interface, destination, destination_prefixlen); + if (r < 0) + return r; + + /* Fill in target part */ + t = ipt_get_target(entry); + t->u.target_size = + XT_ALIGN(sizeof(struct ipt_entry_target)) + + XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat)); + strncpy(t->u.user.name, "MASQUERADE", sizeof(t->u.user.name)); + mr = (struct nf_nat_ipv4_multi_range_compat*) t->data; + mr->rangesize = 1; + + /* Create a search mask entry */ + mask = alloca_safe(sz); + memset(mask, 0xFF, sz); + + if (add) { + if (sym_iptc_check_entry(chain, entry, (unsigned char*) mask, h)) + return 0; + if (errno != ENOENT) /* if other error than not existing yet, fail */ + return -errno; + + if (!sym_iptc_insert_entry(chain, entry, 0, h)) + return -errno; + } else { + if (!sym_iptc_delete_entry(chain, entry, (unsigned char*) mask, h)) { + if (errno == ENOENT) /* if it's already gone, all is good! 
*/ + return 0; + + return -errno; + } + } + + if (!sym_iptc_commit(h)) + return -errno; + + return 0; +} + +int fw_iptables_add_local_dnat( + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote) { + + static const xt_chainlabel chain_pre = "PREROUTING", chain_output = "OUTPUT"; + _cleanup_(sym_iptc_freep) struct xtc_handle *h = NULL; + struct ipt_entry *entry, *mask; + struct ipt_entry_target *t; + struct ipt_entry_match *m; + struct xt_addrtype_info_v1 *at; + struct nf_nat_ipv4_multi_range_compat *mr; + size_t sz, msz; + int r; + const char *in_interface = NULL; + const union in_addr_union *source = NULL; + unsigned source_prefixlen = 0; + const union in_addr_union *destination = NULL; + unsigned destination_prefixlen = 0; + + assert(add || !previous_remote); + + if (af != AF_INET) + return -EOPNOTSUPP; + + if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP)) + return -EOPNOTSUPP; + + if (local_port <= 0) + return -EINVAL; + + if (remote_port <= 0) + return -EINVAL; + + r = fw_iptables_init_nat(&h); + if (r < 0) + return r; + + sz = XT_ALIGN(sizeof(struct ipt_entry)) + + XT_ALIGN(sizeof(struct ipt_entry_match)) + + XT_ALIGN(sizeof(struct xt_addrtype_info_v1)) + + XT_ALIGN(sizeof(struct ipt_entry_target)) + + XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat)); + + if (protocol == IPPROTO_TCP) + msz = XT_ALIGN(sizeof(struct ipt_entry_match)) + + XT_ALIGN(sizeof(struct xt_tcp)); + else + msz = XT_ALIGN(sizeof(struct ipt_entry_match)) + + XT_ALIGN(sizeof(struct xt_udp)); + + sz += msz; + + /* Fill in basic part */ + entry = alloca0(sz); + entry->next_offset = sz; + entry->target_offset = + XT_ALIGN(sizeof(struct ipt_entry)) + + XT_ALIGN(sizeof(struct ipt_entry_match)) + + XT_ALIGN(sizeof(struct xt_addrtype_info_v1)) + + msz; + r = entry_fill_basics(entry, protocol, in_interface, source, source_prefixlen, NULL, destination, destination_prefixlen); + if (r < 
0) + return r; + + /* Fill in first match */ + m = (struct ipt_entry_match*) ((uint8_t*) entry + XT_ALIGN(sizeof(struct ipt_entry))); + m->u.match_size = msz; + if (protocol == IPPROTO_TCP) { + struct xt_tcp *tcp; + + strncpy(m->u.user.name, "tcp", sizeof(m->u.user.name)); + tcp = (struct xt_tcp*) m->data; + tcp->dpts[0] = tcp->dpts[1] = local_port; + tcp->spts[0] = 0; + tcp->spts[1] = 0xFFFF; + + } else { + struct xt_udp *udp; + + strncpy(m->u.user.name, "udp", sizeof(m->u.user.name)); + udp = (struct xt_udp*) m->data; + udp->dpts[0] = udp->dpts[1] = local_port; + udp->spts[0] = 0; + udp->spts[1] = 0xFFFF; + } + + /* Fill in second match */ + m = (struct ipt_entry_match*) ((uint8_t*) entry + XT_ALIGN(sizeof(struct ipt_entry)) + msz); + m->u.match_size = + XT_ALIGN(sizeof(struct ipt_entry_match)) + + XT_ALIGN(sizeof(struct xt_addrtype_info_v1)); + strncpy(m->u.user.name, "addrtype", sizeof(m->u.user.name)); + m->u.user.revision = 1; + at = (struct xt_addrtype_info_v1*) m->data; + at->dest = XT_ADDRTYPE_LOCAL; + + /* Fill in target part */ + t = ipt_get_target(entry); + t->u.target_size = + XT_ALIGN(sizeof(struct ipt_entry_target)) + + XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat)); + strncpy(t->u.user.name, "DNAT", sizeof(t->u.user.name)); + mr = (struct nf_nat_ipv4_multi_range_compat*) t->data; + mr->rangesize = 1; + mr->range[0].flags = NF_NAT_RANGE_PROTO_SPECIFIED|NF_NAT_RANGE_MAP_IPS; + mr->range[0].min_ip = mr->range[0].max_ip = remote->in.s_addr; + if (protocol == IPPROTO_TCP) + mr->range[0].min.tcp.port = mr->range[0].max.tcp.port = htobe16(remote_port); + else + mr->range[0].min.udp.port = mr->range[0].max.udp.port = htobe16(remote_port); + + mask = alloca0(sz); + memset(mask, 0xFF, sz); + + if (add) { + /* Add the PREROUTING rule, if it is missing so far */ + if (!sym_iptc_check_entry(chain_pre, entry, (unsigned char*) mask, h)) { + if (errno != ENOENT) + return -EINVAL; + + if (!sym_iptc_insert_entry(chain_pre, entry, 0, h)) + return -errno; + } 
+ + /* If a previous remote is set, remove its entry */ + if (previous_remote && previous_remote->in.s_addr != remote->in.s_addr) { + mr->range[0].min_ip = mr->range[0].max_ip = previous_remote->in.s_addr; + + if (!sym_iptc_delete_entry(chain_pre, entry, (unsigned char*) mask, h)) { + if (errno != ENOENT) + return -errno; + } + + mr->range[0].min_ip = mr->range[0].max_ip = remote->in.s_addr; + } + + /* Add the OUTPUT rule, if it is missing so far */ + if (!in_interface) { + + /* Don't apply onto loopback addresses */ + if (!destination) { + entry->ip.dst.s_addr = htobe32(0x7F000000); + entry->ip.dmsk.s_addr = htobe32(0xFF000000); + entry->ip.invflags = IPT_INV_DSTIP; + } + + if (!sym_iptc_check_entry(chain_output, entry, (unsigned char*) mask, h)) { + if (errno != ENOENT) + return -errno; + + if (!sym_iptc_insert_entry(chain_output, entry, 0, h)) + return -errno; + } + + /* If a previous remote is set, remove its entry */ + if (previous_remote && previous_remote->in.s_addr != remote->in.s_addr) { + mr->range[0].min_ip = mr->range[0].max_ip = previous_remote->in.s_addr; + + if (!sym_iptc_delete_entry(chain_output, entry, (unsigned char*) mask, h)) { + if (errno != ENOENT) + return -errno; + } + } + } + } else { + if (!sym_iptc_delete_entry(chain_pre, entry, (unsigned char*) mask, h)) { + if (errno != ENOENT) + return -errno; + } + + if (!in_interface) { + if (!destination) { + entry->ip.dst.s_addr = htobe32(0x7F000000); + entry->ip.dmsk.s_addr = htobe32(0xFF000000); + entry->ip.invflags = IPT_INV_DSTIP; + } + + if (!sym_iptc_delete_entry(chain_output, entry, (unsigned char*) mask, h)) { + if (errno != ENOENT) + return -errno; + } + } + } + + if (!sym_iptc_commit(h)) + return -errno; + + return 0; +} + +static int dlopen_iptc(void) { + return dlopen_many_sym_or_warn( + &iptc_dl, + "libip4tc.so.2", LOG_DEBUG, + DLSYM_ARG(iptc_check_entry), + DLSYM_ARG(iptc_commit), + DLSYM_ARG(iptc_delete_entry), + DLSYM_ARG(iptc_free), + DLSYM_ARG(iptc_init), + 
DLSYM_ARG(iptc_insert_entry), + DLSYM_ARG(iptc_strerror)); +} + /* Loads libip4tc via dlopen_iptc() and opens a handle on the nat table. The handle is handed to the caller only when ret is non-NULL; otherwise the _cleanup_ attribute on h disposes of it. Returns 0 on success, the dlopen_iptc() error, or the value of log_debug_errno() when sym_iptc_init() fails. */ +int fw_iptables_init_nat(struct xtc_handle **ret) { + _cleanup_(sym_iptc_freep) struct xtc_handle *h = NULL; + int r; + + r = dlopen_iptc(); + if (r < 0) + return r; + + h = sym_iptc_init("nat"); + if (!h) + return log_debug_errno(errno, "Failed to init \"nat\" table: %s", sym_iptc_strerror(errno)); + + if (ret) + *ret = TAKE_PTR(h); + + return 0; +} diff --git a/src/shared/firewall-util-nft.c b/src/shared/firewall-util-nft.c new file mode 100644 index 0000000..fe986ed --- /dev/null +++ b/src/shared/firewall-util-nft.c @@ -0,0 +1,1372 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "escape.h" +#include "extract-word.h" +#include "firewall-util.h" +#include "firewall-util-private.h" +#include "in-addr-util.h" +#include "macro.h" +#include "netlink-internal.h" +#include "netlink-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "time-util.h" + +#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport" +#define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat" +#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr" + +#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC) + +#define UDP_DPORT_OFFSET 2 /* offset passed to the transport-header payload loads that fetch the destination port */ + /* Replaces every entry of the NULL-terminated array m with the return value of sd_netlink_message_unref() and returns m itself; a NULL m yields NULL. */ +static sd_netlink_message **netlink_message_unref_many(sd_netlink_message **m) { + if (!m) + return NULL; + + /* This does not free array. The end of the array must be NULL. 
*/ + + for (sd_netlink_message **p = m; *p; p++) + *p = sd_netlink_message_unref(*p); + + return m; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(sd_netlink_message**, netlink_message_unref_many); + /* Opens an NFTA_LIST_ELEM array entry and, inside it, the NFTA_EXPR_DATA union container selected by name. Both stay open until nfnl_close_expr_container() is called. Returns the first negative result, otherwise the result of the second open. */ +static int nfnl_open_expr_container(sd_netlink_message *m, const char *name) { + int r; + + assert(m); + assert(name); + + r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM); + if (r < 0) + return r; + + return sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name); +} + /* Closes the two containers opened by nfnl_open_expr_container(), innermost first. */ +static int nfnl_close_expr_container(sd_netlink_message *m) { + int r; + + assert(m); + + r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */ + if (r < 0) + return r; + + return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */ +} + /* Appends a fib expression to m carrying NFTA_FIB_FLAGS, NFTA_FIB_RESULT and NFTA_FIB_DREG, each value converted with htobe32(). Returns a negative value as soon as one step fails. */ +static int nfnl_add_expr_fib( + sd_netlink_message *m, + uint32_t nft_fib_flags, + enum nft_fib_result result, + enum nft_registers dreg) { + + int r; + + assert(m); + + r = nfnl_open_expr_container(m, "fib"); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg)); + if (r < 0) + return r; + + return nfnl_close_expr_container(m); +} + /* Appends a meta expression to m with NFTA_META_KEY set to key and NFTA_META_DREG set to dreg (both via htobe32()). */ +static int nfnl_add_expr_meta( + sd_netlink_message *m, + enum nft_meta_keys key, + enum nft_registers dreg) { + + int r; + + assert(m); + + r = nfnl_open_expr_container(m, "meta"); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg)); + if (r < 0) + return r; + + return nfnl_close_expr_container(m); +} + /* Appends a payload expression to m described by header base pb, offset, length len and destination register dreg; callers in this file pass offsetof()/sizeof() values for offset and len. */ +static int nfnl_add_expr_payload( + sd_netlink_message *m, + enum nft_payload_bases pb, + uint32_t offset, + uint32_t len, + enum nft_registers dreg) { + + int r; + + assert(m); + + r = nfnl_open_expr_container(m, "payload"); + if (r < 
0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len)); + if (r < 0) + return r; + + return nfnl_close_expr_container(m); +} + /* Appends a lookup expression to m that references set_name with source register sreg. NFTA_LOOKUP_DREG is only emitted when dreg is non-zero, so callers pass 0 to omit it. */ +static int nfnl_add_expr_lookup( + sd_netlink_message *m, + const char *set_name, + enum nft_registers sreg, + enum nft_registers dreg) { + + int r; + + assert(m); + assert(set_name); + + r = nfnl_open_expr_container(m, "lookup"); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg)); + if (r < 0) + return r; + + if (dreg != 0) { + r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg)); + if (r < 0) + return r; + } + + return nfnl_close_expr_container(m); +} + /* Appends a cmp expression to m with operator cmp_op, source register sreg and dlen bytes of comparison data taken from data (which must not be NULL). */ +static int nfnl_add_expr_cmp( + sd_netlink_message *m, + enum nft_cmp_ops cmp_op, + enum nft_registers sreg, + const void *data, + size_t dlen) { + + int r; + + assert(m); + assert(data); + + r = nfnl_open_expr_container(m, "cmp"); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg)); + if (r < 0) + return r; + + r = sd_netlink_message_append_container_data(m, NFTA_CMP_DATA, NFTA_DATA_VALUE, data, dlen); + if (r < 0) + return r; + + return nfnl_close_expr_container(m); +} + /* Appends a bitwise expression to m with source and destination registers, operand length len, and the len-byte buffers and (sent as NFTA_BITWISE_MASK) and xor (sent as NFTA_BITWISE_XOR). */ +static int nfnl_add_expr_bitwise( + sd_netlink_message *m, + enum nft_registers sreg, + enum nft_registers dreg, + const void *and, + const void *xor, + uint32_t len) { + + int r; + + assert(m); + assert(and); + assert(xor); + + r = nfnl_open_expr_container(m, "bitwise"); + if (r < 0) +
return r; + + r = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len)); + if (r < 0) + return r; + + r = sd_netlink_message_append_container_data(m, NFTA_BITWISE_MASK, NFTA_DATA_VALUE, and, len); + if (r < 0) + return r; + + r = sd_netlink_message_append_container_data(m, NFTA_BITWISE_XOR, NFTA_DATA_VALUE, xor, len); + if (r < 0) + return r; + + return nfnl_close_expr_container(m); +} + /* Appends a nat expression of type NFT_NAT_DNAT for the given family, with NFTA_NAT_REG_ADDR_MIN set to areg and NFTA_NAT_REG_PROTO_MIN set to preg. */ +static int nfnl_add_expr_dnat( + sd_netlink_message *m, + int family, + enum nft_registers areg, + enum nft_registers preg) { + + int r; + + assert(m); + + r = nfnl_open_expr_container(m, "nat"); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg)); + if (r < 0) + return r; + + return nfnl_close_expr_container(m); +} + /* Appends a masq expression to m: an NFTA_LIST_ELEM entry that holds only NFTA_EXPR_NAME and no NFTA_EXPR_DATA container. NOTE(review): unlike the sibling helpers, m is not asserted here. */ +static int nfnl_add_expr_masq(sd_netlink_message *m) { + int r; + + r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq"); + if (r < 0) + return r; + + return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */ +} + /* Builds a new rule message for chain in the NFT_SYSTEMD_TABLE_NAME table: load the source address, look it up in NFT_SYSTEMD_MASQ_SET_NAME, then masq. On success the message is stored in *ret and 0 is returned; on failure a negative value is returned and the partially built message is released by the _cleanup_ attribute. */ +static int sd_nfnl_message_new_masq_rule( + sd_netlink *nfnl, + sd_netlink_message **ret, + int family, + const char *chain) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + /* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */ + + assert(nfnl); + assert(ret); + assert(IN_SET(family, AF_INET, 
AF_INET6)); + assert(chain); + + r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS); + if (r < 0) + return r; + + /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */ + if (family == AF_INET) + r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, saddr), + sizeof(uint32_t), NFT_REG32_01); + else + r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_src.s6_addr), + sizeof(struct in6_addr), NFT_REG32_01); + if (r < 0) + return r; + + /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */ + r = nfnl_add_expr_lookup(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01, 0); + if (r < 0) + return r; + + /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */ + r = nfnl_add_expr_masq(m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */ + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + /* Builds the DNAT rule message for chain in the NFT_SYSTEMD_TABLE_NAME table: a fib daddr type check against RTN_LOCAL, then an l4proto/dport lookup in NFT_SYSTEMD_DNAT_MAP_NAME whose result feeds the nat expression. On success the message is stored in *ret and 0 is returned; on failure a negative value is returned. */ +static int sd_nfnl_message_new_dnat_rule_pre( + sd_netlink *nfnl, + sd_netlink_message **ret, + int family, + const char *chain) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + enum nft_registers proto_reg; + uint32_t local = RTN_LOCAL; + int r; + + /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen + * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */ + + assert(nfnl); + assert(ret); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(chain); + + r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS); + if (r < 0) + return r; + + /* 1st statement: fib daddr type local */ + r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01); 
+ if (r < 0) + return r; + + /* 1st statement (cont.): compare RTN_LOCAL */ + r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local)); + if (r < 0) + return r; + + /* 2nd statement: lookup local port in map, fetch address:dport to map to */ + r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01); + if (r < 0) + return r; + + r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET, + sizeof(uint16_t), NFT_REG32_02); + if (r < 0) + return r; + + /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and + * store address and port for the dnat mapping in REG1/REG2. */ + r = nfnl_add_expr_lookup(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01); + if (r < 0) + return r; + + proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05; /* NOTE(review): reg5 is presumably chosen for AF_INET6 because the looked-up address takes reg1..reg4 - confirm */ + r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */ + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + /* Builds the DNAT rule message for chain that first excludes loopback destinations (127/8 for AF_INET, the IN6ADDR_LOOPBACK_INIT address otherwise), then requires the output interface index to equal 1, and finally performs the l4proto/dport lookup in NFT_SYSTEMD_DNAT_MAP_NAME followed by the nat expression. On success the message is stored in *ret and 0 is returned; on failure a negative value is returned. */ +static int sd_nfnl_message_new_dnat_rule_out( + sd_netlink *nfnl, + sd_netlink_message **ret, + int family, + const char *chain) { + + static const uint32_t zero = 0, one = 1; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + enum nft_registers proto_reg; + int r; + + assert(nfnl); + assert(ret); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(chain); + + r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS); + if (r < 0) + return r; + + /* 1st statement: exclude 127.0.0.1/8: ip daddr != 127.0.0.1/8, resp. 
avoid ::1 */ + if (family == AF_INET) { + uint32_t lonet = htobe32(UINT32_C(0x7F000000)), lomask = htobe32(UINT32_C(0xff000000)); + + r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr), + sizeof(lonet), NFT_REG32_01); + if (r < 0) + return r; + /* 1st statement (cont.): bitops/prefix */ + r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask)); + if (r < 0) + return r; + + /* 1st statement (cont.): compare reg1 with 127/8 */ + r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet)); + } else { + struct in6_addr loaddr = IN6ADDR_LOOPBACK_INIT; + + r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_dst.s6_addr), + sizeof(loaddr), NFT_REG32_01); + if (r < 0) + return r; + + r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &loaddr, sizeof(loaddr)); + } + if (r < 0) + return r; + + /* 2nd statement: meta oif lo */ + r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01); + if (r < 0) + return r; + + /* 2nd statement (cont.): compare to lo ifindex (1) */ + r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one)); + if (r < 0) + return r; + + /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */ + r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01); + if (r < 0) + return r; + + /* 3rd statement (cont): store the port number in reg2 */ + r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET, + sizeof(uint16_t), NFT_REG32_02); + if (r < 0) + return r; + + /* 3rd statement (cont): use reg1 and reg2 and retrieve + * the new destination ip and port number. + * + * reg1 and reg2 are clobbered and will then contain the new + * address/port number. */ + r = nfnl_add_expr_lookup(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01); + if (r < 0) + return r; + + /* 4th statement: dnat connection to address/port retrieved by the + * preceding expression. 
*/ + proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05; /* NOTE(review): reg5 is presumably chosen for AF_INET6 because the looked-up address takes reg1..reg4 - confirm */ + r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */ + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + /* Creates a new-set message for set_name in the NFT_SYSTEMD_TABLE_NAME table via sd_nfnl_nft_message_new_set(), adds NFTA_SET_FLAGS only when flags is non-zero and always adds NFTA_SET_KEY_TYPE. On success the message is stored in *ret. Note that the final return hands back r (non-negative at that point) rather than a literal 0 as the sibling builders do. */ +static int nft_new_set( + struct sd_netlink *nfnl, + sd_netlink_message **ret, + int family, + const char *set_name, + uint32_t set_id, + uint32_t flags, + uint32_t type, + uint32_t klen) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(nfnl); + assert(ret); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(set_name); + + r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen); + if (r < 0) + return r; + + if (flags != 0) { + r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags)); + if (r < 0) + return r; + } + + r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type)); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return r; +} + /* Same as nft_new_set() with NFT_SET_MAP or-ed into flags, plus the NFTA_SET_DATA_TYPE and NFTA_SET_DATA_LEN attributes taken from dtype and dlen. On success the message is stored in *ret and 0 is returned. */ +static int nft_new_map( + struct sd_netlink *nfnl, + sd_netlink_message **ret, + int family, + const char *set_name, + uint32_t set_id, + uint32_t flags, + uint32_t type, + uint32_t klen, + uint32_t dtype, + uint32_t dlen) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(nfnl); + assert(ret); + assert(IN_SET(family, AF_INET, AF_INET6)); + assert(set_name); + + r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype)); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen)); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + /* Builds a message that adds a single element (key of klen bytes, plus dlen bytes of data that may be absent when dlen is 0) to set_name in table_name. On success the message is stored in *ret and 0 is returned. */ +static int nft_add_element( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + const char *table_name, + const char *set_name, + const void *key, + uint32_t klen, + 
const void *data, + uint32_t dlen) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(nfnl); + assert(ret); + assert(nfproto_is_valid(nfproto)); + assert(table_name); + assert(set_name); + assert(key); + assert(data || dlen == 0); + + + /* + * Ideally there would be an API that provides: + * + * 1) an init function to add the main ruleset skeleton + * 2) a function that populates the sets with all known address/port pairs to s/dnat for + * 3) a function that can remove address/port pairs again. + * + * At this time, the existing API is used which is built on an + * 'add/delete a rule' paradigm. + * + * This is replicated here and each element gets added to the set + * one-by-one. + */ + r = sd_nfnl_nft_message_new_setelems(nfnl, &m, /* add = */ true, nfproto, table_name, set_name); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_append_setelem(m, 0, key, klen, data, dlen, 0); + if (r < 0) + return r; + + /* could theoretically append more set elements to add here */ + + r = sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */ + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +static int nft_del_element( + sd_netlink *nfnl, + sd_netlink_message **ret, + int nfproto, + const char *table_name, + const char *set_name, + const void *key, + uint32_t klen, + const void *data, + uint32_t dlen) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(nfnl); + assert(ret); + assert(nfproto_is_valid(nfproto)); + assert(table_name); + assert(set_name); + assert(key); + assert(data || dlen == 0); + + r = sd_nfnl_nft_message_new_setelems(nfnl, &m, /* add = */ false, nfproto, table_name, set_name); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_append_setelem(m, 0, 
key, klen, data, dlen, 0); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */ + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +/* This is needed so 'nft' userspace tool can properly format the contents + * of the set/map when someone uses 'nft' to inspect their content. + * + * The values cannot be changed, they are part of the nft tool type identifier ABI. */ +#define TYPE_BITS 6 + +enum nft_key_types { + TYPE_IPADDR = 7, + TYPE_IP6ADDR = 8, + TYPE_INET_PROTOCOL = 12, + TYPE_INET_SERVICE = 13, +}; + +static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) { + uint32_t type = (uint32_t)a; + + type <<= TYPE_BITS; + type |= (uint32_t)b; + + return type; +} + +static int fw_nftables_init_family(sd_netlink *nfnl, int family) { + sd_netlink_message *messages[10] = {}; + _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages; + size_t msgcnt = 0, ip_type_size; + uint32_t set_id = 0; + int ip_type, r; + + assert(nfnl); + assert(IN_SET(family, AF_INET, AF_INET6)); + + /* Set F_EXCL so table add fails if the table already exists. 
*/ + r = sd_nfnl_nft_message_new_table(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME, + "prerouting", "nat", + NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME, + "output", "nat", + NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME, + "postrouting", "nat", + NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1); + if (r < 0) + return r; + + if (family == AF_INET) { + ip_type_size = sizeof(uint32_t); + ip_type = TYPE_IPADDR; + } else { + assert(family == AF_INET6); + ip_type_size = sizeof(struct in6_addr); + ip_type = TYPE_IP6ADDR; + } + /* set to store ip address ranges we should masquerade for */ + r = nft_new_set(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size); + if (r < 0) + return r; + + /* + * map to store ip address:port pair to dnat to. elements in concatenation + * are rounded up to 4 bytes. + * + * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not + * sizeof(uint8_t) + sizeof(uint16_t). 
+ */ + r = nft_new_map(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0, + concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2, + concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t)); + if (r < 0) + return r; + + r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &messages[msgcnt++], family, "prerouting"); + if (r < 0) + return r; + + r = sd_nfnl_message_new_dnat_rule_out(nfnl, &messages[msgcnt++], family, "output"); + if (r < 0) + return r; + + r = sd_nfnl_message_new_masq_rule(nfnl, &messages[msgcnt++], family, "postrouting"); + if (r < 0) + return r; + + assert(msgcnt < ELEMENTSOF(messages)); + r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + if (r < 0 && r != -EEXIST) + return r; + + return 0; +} + +int fw_nftables_init_full(FirewallContext *ctx, bool init_tables) { + _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL; + int r; + + assert(ctx); + assert(!ctx->nfnl); + + r = sd_nfnl_socket_open(&nfnl); + if (r < 0) + return r; + + if (init_tables) { + r = fw_nftables_init_family(nfnl, AF_INET); + if (r < 0) + return r; + + if (socket_ipv6_is_supported()) { + r = fw_nftables_init_family(nfnl, AF_INET6); + if (r < 0) + return log_error_errno(r, "Failed to init ipv6 NAT: %m"); + } + } + + ctx->nfnl = TAKE_PTR(nfnl); + return 0; +} + +int fw_nftables_init(FirewallContext *ctx) { + return fw_nftables_init_full(ctx, /* init_tables= */ true); +} + +void fw_nftables_exit(FirewallContext *ctx) { + assert(ctx); + + ctx->nfnl = sd_netlink_unref(ctx->nfnl); +} + +static int nft_message_append_setelem_iprange( + sd_netlink_message *m, + const union in_addr_union *source, + unsigned int prefixlen) { + + uint32_t mask, start, end; + unsigned int nplen; + int r; + + assert(m); + assert(source); + assert(prefixlen <= 32); + + nplen = 32 - prefixlen; + + mask = (1U << nplen) - 1U; + mask = htobe32(~mask); + start = source->in.s_addr & mask; + + r = 
sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_append_setelem(m, 0, &start, sizeof(start), NULL, 0, 0); + if (r < 0) + return r; + + end = be32toh(start) + (1U << nplen); + if (end < be32toh(start)) + end = 0U; + end = htobe32(end); + + r = sd_nfnl_nft_message_append_setelem(m, 1, &end, sizeof(end), NULL, 0, NFT_SET_ELEM_INTERVAL_END); + if (r < 0) + return r; + + return sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */ +} + +static int nft_message_append_setelem_ip6range( + sd_netlink_message *m, + const union in_addr_union *source, + unsigned int prefixlen) { + + union in_addr_union start, end; + int r; + + assert(m); + assert(source); + + r = in_addr_prefix_range(AF_INET6, source, prefixlen, &start, &end); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_append_setelem(m, 0, &start.in6, sizeof(start.in6), NULL, 0, 0); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_append_setelem(m, 1, &end.in6, sizeof(end.in6), NULL, 0, NFT_SET_ELEM_INTERVAL_END); + if (r < 0) + return r; + + return sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */ +} + +int nft_set_element_modify_iprange( + FirewallContext *ctx, + bool add, + int nfproto, + int af, + const char *table, + const char *set, + const union in_addr_union *source, + unsigned int source_prefixlen) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(ctx->nfnl); + assert(IN_SET(af, AF_INET, AF_INET6)); + assert(nfproto_is_valid(nfproto)); + assert(table); + assert(set); + + if (!source || source_prefixlen == 0) + return -EINVAL; + + if (af == AF_INET6 && source_prefixlen < 8) + return -EINVAL; + + r = sd_nfnl_nft_message_new_setelems(ctx->nfnl, &m, add, nfproto, table, set); + if (r < 0) + return r; + + if (af == AF_INET) + r = 
nft_message_append_setelem_iprange(m, source, source_prefixlen); + else + r = nft_message_append_setelem_ip6range(m, source, source_prefixlen); + if (r < 0) + return r; + + return sd_nfnl_call_batch(ctx->nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); +} + +int nft_set_element_modify_ip( + FirewallContext *ctx, + bool add, + int nfproto, + int af, + const char *table, + const char *set, + const union in_addr_union *source) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(ctx->nfnl); + assert(IN_SET(af, AF_INET, AF_INET6)); + assert(nfproto_is_valid(nfproto)); + assert(table); + assert(set); + + if (!source) + return -EINVAL; + + r = sd_nfnl_nft_message_new_setelems(ctx->nfnl, &m, add, nfproto, table, set); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_append_setelem(m, 0, source, FAMILY_ADDRESS_SIZE(af), NULL, 0, 0); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */ + if (r < 0) + return r; + + return sd_nfnl_call_batch(ctx->nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); +} + +int nft_set_element_modify_any(FirewallContext *ctx, bool add, int nfproto, const char *table, const char *set, const void *element, size_t element_size) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(ctx); + assert(ctx->nfnl); + assert(nfproto_is_valid(nfproto)); + assert(table); + assert(set); + assert(element); + + if (add) + r = nft_add_element(ctx->nfnl, &m, nfproto, table, set, element, element_size, NULL, 0); + else + r = nft_del_element(ctx->nfnl, &m, nfproto, table, set, element, element_size, NULL, 0); + if (r < 0) + return r; + + return sd_nfnl_call_batch(ctx->nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); +} + +static int af_to_nfproto(int af) { + assert(IN_SET(af, AF_INET, AF_INET6)); + + switch (af) { + case AF_INET: + return 
NFPROTO_IPV4; + case AF_INET6: + return NFPROTO_IPV6; + default: + assert_not_reached(); + } +} + +int fw_nftables_add_masquerade( + FirewallContext *ctx, + bool add, + int af, + const union in_addr_union *source, + unsigned int source_prefixlen) { + + int r; + + assert(ctx); + assert(ctx->nfnl); + assert(IN_SET(af, AF_INET, AF_INET6)); + + if (!socket_ipv6_is_supported() && af == AF_INET6) + return -EOPNOTSUPP; + + r = nft_set_element_modify_iprange(ctx, add, af_to_nfproto(af), af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME, + source, source_prefixlen); + if (r != -ENOENT) + return r; + + /* When someone runs 'nft flush ruleset' in the same net namespace this will also tear down the + * systemd nat table. + * + * This is unlike iptables -t nat -F (which will remove all rules added by the systemd iptables + * backend): iptables has builtin chains that cannot be deleted -- the next add operation will + * 'just work'. + * + * In the nftables case, everything gets removed. The next add operation will yield -ENOENT. + * + * If we see -ENOENT on add, replay the initial table setup. If that works, re-do the add + * operation. + * + * Note that this doesn't protect against external sabotage such as a + * 'while true; nft flush ruleset; done'. There is nothing that could be done about that short + * of extending the kernel to allow tables to be owned by systemd-networkd and making them + * non-deletable except by the 'owning process'. 
*/ + + r = fw_nftables_init_family(ctx->nfnl, af); + if (r < 0) + return r; + + return nft_set_element_modify_iprange(ctx, add, af_to_nfproto(af), af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME, + source, source_prefixlen); +} + +static int fw_nftables_add_local_dnat_internal( + sd_netlink *nfnl, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote) { + + sd_netlink_message *messages[3] = {}; + _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages; + static bool ipv6_supported = true; + uint32_t data[5], key[2], dlen; + size_t msgcnt = 0; + int r; + + assert(nfnl); + assert(add || !previous_remote); + assert(IN_SET(af, AF_INET, AF_INET6)); + + if (!ipv6_supported && af == AF_INET6) + return -EOPNOTSUPP; + + if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP)) + return -EPROTONOSUPPORT; + + if (local_port <= 0) + return -EINVAL; + + key[0] = protocol; + key[1] = htobe16(local_port); + + if (!remote) + return -EOPNOTSUPP; + + if (remote_port <= 0) + return -EINVAL; + + if (af == AF_INET) { + dlen = 8; + data[1] = htobe16(remote_port); + } else { + assert(af == AF_INET6); + dlen = sizeof(data); + data[4] = htobe16(remote_port); + } + + /* If a previous remote is set, remove its entry. NOTE(review): this call passes the raw af as the nfproto argument, while the add/delete calls further down go through af_to_nfproto(af) -- confirm the AF_* and NFPROTO_* values coincide, or convert here too. */ + if (add && previous_remote && !in_addr_equal(af, previous_remote, remote)) { + if (af == AF_INET) + data[0] = previous_remote->in.s_addr; + else + memcpy(data, &previous_remote->in6, sizeof(previous_remote->in6)); + + r = nft_del_element(nfnl, &messages[msgcnt++], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_DNAT_MAP_NAME, + key, sizeof(key), data, dlen); + if (r < 0) + return r; + } + + if (af == AF_INET) + data[0] = remote->in.s_addr; + else + memcpy(data, &remote->in6, sizeof(remote->in6)); + + if (add) + r = nft_add_element(nfnl, &messages[msgcnt++], af_to_nfproto(af), NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_DNAT_MAP_NAME, + key, sizeof(key), 
data, dlen); + else + r = nft_del_element(nfnl, &messages[msgcnt++], af_to_nfproto(af), NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_DNAT_MAP_NAME, + key, sizeof(key), data, dlen); + if (r < 0) + return r; + + assert(msgcnt < ELEMENTSOF(messages)); + r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + if (r == -EOVERFLOW && af == AF_INET6) { + /* The current implementation of DNAT in systemd requires kernel's + * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns + * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */ + log_debug_errno(r, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m"); + ipv6_supported = false; + return -EOPNOTSUPP; + } + if (r < 0) + return r; + + return 0; +} + +int fw_nftables_add_local_dnat( + FirewallContext *ctx, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote) { + + int r; + + assert(ctx); + assert(ctx->nfnl); + assert(IN_SET(af, AF_INET, AF_INET6)); + + if (!socket_ipv6_is_supported() && af == AF_INET6) + return -EOPNOTSUPP; + + r = fw_nftables_add_local_dnat_internal(ctx->nfnl, add, af, protocol, local_port, remote, remote_port, previous_remote); + if (r != -ENOENT) + return r; + + /* See comment in fw_nftables_add_masquerade(). 
*/ + r = fw_nftables_init_family(ctx->nfnl, af); + if (r < 0) + return r; + + /* table created anew; previous address already gone */ + return fw_nftables_add_local_dnat_internal(ctx->nfnl, add, af, protocol, local_port, remote, remote_port, NULL); +} + +static const char *const nfproto_table[] = { + [NFPROTO_ARP] = "arp", + [NFPROTO_BRIDGE] = "bridge", + [NFPROTO_INET] = "inet", + [NFPROTO_IPV4] = "ip", + [NFPROTO_IPV6] = "ip6", + [NFPROTO_NETDEV] = "netdev", +}; + +DEFINE_STRING_TABLE_LOOKUP(nfproto, int); + +static const char *const nft_set_source_table[] = { + [NFT_SET_SOURCE_ADDRESS] = "address", + [NFT_SET_SOURCE_PREFIX] = "prefix", + [NFT_SET_SOURCE_IFINDEX] = "ifindex", + [NFT_SET_SOURCE_CGROUP] = "cgroup", + [NFT_SET_SOURCE_USER] = "user", + [NFT_SET_SOURCE_GROUP] = "group", +}; + +DEFINE_STRING_TABLE_LOOKUP(nft_set_source, int); + +void nft_set_context_clear(NFTSetContext *s) { + assert(s); + + FOREACH_ARRAY(nft_set, s->sets, s->n_sets) { + free(nft_set->table); + free(nft_set->set); + } + + s->n_sets = 0; + s->sets = mfree(s->sets); +} + +int nft_set_add(NFTSetContext *s, NFTSetSource source, int nfproto, const char *table, const char *set) { + _cleanup_free_ char *table_dup = NULL, *set_dup = NULL; + + assert(s); + assert(IN_SET(source, NFT_SET_SOURCE_ADDRESS, NFT_SET_SOURCE_PREFIX, NFT_SET_SOURCE_IFINDEX, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP)); + assert(nfproto_is_valid(nfproto)); + assert(table); + assert(set); + + table_dup = strdup(table); + if (!table_dup) + return -ENOMEM; + + set_dup = strdup(set); + if (!set_dup) + return -ENOMEM; + + if (!GREEDY_REALLOC(s->sets, s->n_sets + 1)) + return -ENOMEM; + + s->sets[s->n_sets++] = (NFTSet) { + .source = source, + .nfproto = nfproto, + .table = TAKE_PTR(table_dup), + .set = TAKE_PTR(set_dup), + }; + + return 0; +} + +int nft_set_context_dup(const NFTSetContext *src, NFTSetContext *dst) { + int r; + _cleanup_(nft_set_context_clear) NFTSetContext d = (NFTSetContext) {}; + + 
assert(src); + assert(dst); + + FOREACH_ARRAY(nft_set, src->sets, src->n_sets) { + r = nft_set_add(&d, nft_set->source, nft_set->nfproto, nft_set->table, nft_set->set); + if (r < 0) + return r; + } + + *dst = TAKE_STRUCT(d); + + return 0; +} + +int config_parse_nft_set( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + NFTSetContext *nft_set_context = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(nft_set_context); + assert(IN_SET(ltype, NFT_SET_PARSE_NETWORK, NFT_SET_PARSE_CGROUP)); + + if (isempty(rvalue)) { + nft_set_context_clear(nft_set_context); + + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *tuple = NULL, *source_str = NULL, *family_str = NULL, *table = NULL, *set = NULL; + const char *q = NULL; + int nfproto; + NFTSetSource source; + + r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(rvalue); + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax %s=%s, ignoring: %m", lvalue, strna(esc)); + return 0; + } + if (r == 0) + return 0; + + q = tuple; + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE, &source_str, &family_str, &table, &set, NULL); + if (r == -ENOMEM) + return log_oom(); + if (r != 4 || !isempty(q)) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(tuple); + return log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse NFT set %s, ignoring", strna(esc)); + } + + assert(source_str); + assert(family_str); + assert(table); + assert(set); + + source = nft_set_source_from_string(source_str); + if (source < 0 || + (ltype == NFT_SET_PARSE_NETWORK && !IN_SET(source, NFT_SET_SOURCE_ADDRESS, NFT_SET_SOURCE_PREFIX, NFT_SET_SOURCE_IFINDEX)) || + (ltype == 
NFT_SET_PARSE_CGROUP && !IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP))) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(source_str); + return log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown NFT source %s, ignoring", strna(esc)); + } + + nfproto = nfproto_from_string(family_str); + if (nfproto < 0) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(family_str); + return log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown NFT protocol family %s, ignoring", strna(esc)); + } + + if (!nft_identifier_valid(table)) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(table); + return log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid table name %s, ignoring", strna(esc)); + } + + if (!nft_identifier_valid(set)) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(set); + return log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid set name %s, ignoring", strna(esc)); + } + + r = nft_set_add(nft_set_context, source, nfproto, table, set); + if (r < 0) + return r; + } + + assert_not_reached(); +} diff --git a/src/shared/firewall-util-private.h b/src/shared/firewall-util-private.h new file mode 100644 index 0000000..38c8dfc --- /dev/null +++ b/src/shared/firewall-util-private.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-netlink.h" + +#include "firewall-util.h" +#include "in-addr-util.h" + +typedef enum FirewallBackend { + FW_BACKEND_NONE, +#if HAVE_LIBIPTC + FW_BACKEND_IPTABLES, +#endif + FW_BACKEND_NFTABLES, + _FW_BACKEND_MAX, + _FW_BACKEND_INVALID = -EINVAL, +} FirewallBackend; + +struct FirewallContext { + FirewallBackend backend; + sd_netlink *nfnl; +}; + +const char *firewall_backend_to_string(FirewallBackend b) _const_; + +int fw_nftables_init(FirewallContext *ctx); +int fw_nftables_init_full(FirewallContext *ctx, bool init_tables); +void fw_nftables_exit(FirewallContext *ctx); + +int fw_nftables_add_masquerade( + 
FirewallContext *ctx, + bool add, + int af, + const union in_addr_union *source, + unsigned source_prefixlen); + +int fw_nftables_add_local_dnat( + FirewallContext *ctx, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote); + +#if HAVE_LIBIPTC +struct xtc_handle; + +int fw_iptables_add_masquerade( + bool add, + int af, + const union in_addr_union *source, + unsigned source_prefixlen); + +int fw_iptables_add_local_dnat( + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote); + +int fw_iptables_init_nat(struct xtc_handle **ret); +#endif diff --git a/src/shared/firewall-util.c b/src/shared/firewall-util.c new file mode 100644 index 0000000..e96b24a --- /dev/null +++ b/src/shared/firewall-util.c @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "firewall-util.h" +#include "firewall-util-private.h" +#include "log.h" +#include "netlink-util.h" +#include "string-table.h" + +static const char * const firewall_backend_table[_FW_BACKEND_MAX] = { + [FW_BACKEND_NONE] = "none", +#if HAVE_LIBIPTC + [FW_BACKEND_IPTABLES] = "iptables", +#endif + [FW_BACKEND_NFTABLES] = "nftables", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(firewall_backend, FirewallBackend); + +static void firewall_backend_probe(FirewallContext *ctx, bool init_tables) { + const char *e; + + assert(ctx); + + if (ctx->backend != _FW_BACKEND_INVALID) + return; + + e = secure_getenv("SYSTEMD_FIREWALL_BACKEND"); + if (e) { + if (streq(e, "nftables")) + ctx->backend = FW_BACKEND_NFTABLES; + else if (streq(e, "iptables")) +#if HAVE_LIBIPTC + ctx->backend = FW_BACKEND_IPTABLES; +#else + log_debug("Unsupported firewall backend requested, ignoring: %s", e); +#endif + else + log_debug("Unrecognized 
$SYSTEMD_FIREWALL_BACKEND value, ignoring: %s", e); + } + + if (ctx->backend == _FW_BACKEND_INVALID) { + + if (fw_nftables_init_full(ctx, init_tables) >= 0) + ctx->backend = FW_BACKEND_NFTABLES; + else +#if HAVE_LIBIPTC + ctx->backend = FW_BACKEND_IPTABLES; +#else + ctx->backend = FW_BACKEND_NONE; +#endif + } + + if (ctx->backend != FW_BACKEND_NONE) + log_debug("Using %s as firewall backend.", firewall_backend_to_string(ctx->backend)); + else + log_debug("No firewall backend found."); +} + +int fw_ctx_new_full(FirewallContext **ret, bool init_tables) { + _cleanup_free_ FirewallContext *ctx = NULL; + + ctx = new(FirewallContext, 1); + if (!ctx) + return -ENOMEM; + + *ctx = (FirewallContext) { + .backend = _FW_BACKEND_INVALID, + }; + + firewall_backend_probe(ctx, init_tables); + + *ret = TAKE_PTR(ctx); + return 0; +} + +int fw_ctx_new(FirewallContext **ret) { + return fw_ctx_new_full(ret, /* init_tables= */ true); +} + +FirewallContext *fw_ctx_free(FirewallContext *ctx) { + if (!ctx) + return NULL; + + fw_nftables_exit(ctx); + + return mfree(ctx); +} + +size_t fw_ctx_get_reply_callback_count(FirewallContext *ctx) { + if (!ctx || !ctx->nfnl) + return 0; + + return netlink_get_reply_callback_count(ctx->nfnl); +} + +int fw_add_masquerade( + FirewallContext **ctx, + bool add, + int af, + const union in_addr_union *source, + unsigned source_prefixlen) { + + int r; + + assert(ctx); + + if (!*ctx) { + r = fw_ctx_new(ctx); + if (r < 0) + return r; + } + + switch ((*ctx)->backend) { +#if HAVE_LIBIPTC + case FW_BACKEND_IPTABLES: + return fw_iptables_add_masquerade(add, af, source, source_prefixlen); +#endif + case FW_BACKEND_NFTABLES: + return fw_nftables_add_masquerade(*ctx, add, af, source, source_prefixlen); + default: + return -EOPNOTSUPP; + } +} + +int fw_add_local_dnat( + FirewallContext **ctx, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote) { + + int 
r; + + assert(ctx); + + if (!*ctx) { + r = fw_ctx_new(ctx); + if (r < 0) + return r; + } + + switch ((*ctx)->backend) { +#if HAVE_LIBIPTC + case FW_BACKEND_IPTABLES: + return fw_iptables_add_local_dnat(add, af, protocol, local_port, remote, remote_port, previous_remote); +#endif + case FW_BACKEND_NFTABLES: + return fw_nftables_add_local_dnat(*ctx, add, af, protocol, local_port, remote, remote_port, previous_remote); + default: + return -EOPNOTSUPP; + } +} diff --git a/src/shared/firewall-util.h b/src/shared/firewall-util.h new file mode 100644 index 0000000..14e35be --- /dev/null +++ b/src/shared/firewall-util.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "conf-parser.h" +#include "in-addr-util.h" + +typedef struct FirewallContext FirewallContext; + +int fw_ctx_new(FirewallContext **ret); +int fw_ctx_new_full(FirewallContext **ret, bool init_tables); +FirewallContext *fw_ctx_free(FirewallContext *ctx); + +DEFINE_TRIVIAL_CLEANUP_FUNC(FirewallContext *, fw_ctx_free); + +size_t fw_ctx_get_reply_callback_count(FirewallContext *ctx); + +int fw_add_masquerade( + FirewallContext **ctx, + bool add, + int af, + const union in_addr_union *source, + unsigned source_prefixlen); + +int fw_add_local_dnat( + FirewallContext **ctx, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote); + +typedef enum NFTSetSource { + NFT_SET_SOURCE_ADDRESS, + NFT_SET_SOURCE_PREFIX, + NFT_SET_SOURCE_IFINDEX, + NFT_SET_SOURCE_CGROUP, + NFT_SET_SOURCE_USER, + NFT_SET_SOURCE_GROUP, + _NFT_SET_SOURCE_MAX, + _NFT_SET_SOURCE_INVALID = -EINVAL, +} NFTSetSource; + +typedef struct NFTSet { + NFTSetSource source; + int nfproto; + char *table; + char *set; +} NFTSet; + +typedef struct NFTSetContext { + NFTSet *sets; + size_t n_sets; +} NFTSetContext; + +void nft_set_context_clear(NFTSetContext *s); +int 
nft_set_context_dup(const NFTSetContext *src, NFTSetContext *dst); + +const char *nfproto_to_string(int i) _const_; +int nfproto_from_string(const char *s) _pure_; + +const char *nft_set_source_to_string(int i) _const_; +int nft_set_source_from_string(const char *s) _pure_; + +int nft_set_element_modify_iprange( + FirewallContext *ctx, + bool add, + int nfproto, + int af, + const char *table, + const char *set, + const union in_addr_union *source, + unsigned int source_prefixlen); + +int nft_set_element_modify_ip( + FirewallContext *ctx, + bool add, + int nfproto, + int af, + const char *table, + const char *set, + const union in_addr_union *source); + +int nft_set_element_modify_any( + FirewallContext *ctx, + bool add, + int nfproto, + const char *table, + const char *set, + const void *element, + size_t element_size); + +int nft_set_add(NFTSetContext *s, NFTSetSource source, int nfproto, const char *table, const char *set); + +typedef enum NFTSetParseFlags { + NFT_SET_PARSE_NETWORK, + NFT_SET_PARSE_CGROUP, +} NFTSetParseFlags; + +CONFIG_PARSER_PROTOTYPE(config_parse_nft_set); diff --git a/src/shared/format-table.c b/src/shared/format-table.c new file mode 100644 index 0000000..9a19177 --- /dev/null +++ b/src/shared/format-table.c @@ -0,0 +1,3061 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "devnum-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "format-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "gunicode.h" +#include "id128-util.h" +#include "in-addr-util.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "pager.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "signal-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strxcpyx.h" +#include "terminal-util.h" +#include 
"time-util.h" +#include "user-util.h" +#include "utf8.h" + +#define DEFAULT_WEIGHT 100 + +/* + A few notes on implementation details: + + - TableCell is a 'fake' structure, it's just used as data type to pass references to specific cell positions in the + table. It can be easily converted to an index number and back. + + - TableData is where the actual data is stored: it encapsulates the data and formatting for a specific cell. It's + 'pseudo-immutable' and ref-counted. When a cell's data's formatting is to be changed, we duplicate the object if the + reference count is larger than 1. Note that TableData and its ref-counting is mostly not visible to the outside. The + outside only sees Table and TableCell. + + - The Table object stores a simple one-dimensional array of references to TableData objects, one row after the + previous one. + + - There's no special concept of a "row" or "column" in the table, and no special concept of the "header" row. It's all + derived from the cell index: we know how many cells are to be stored in a row, and can determine the rest from + that. The first row is always the header row. If header display is turned off we simply skip outputting the first + row. Also, when sorting rows we always leave the first row where it is, as the header shouldn't move. + + - Note that because there's no row and no column object some properties that might be appropriate as row/column properties + are exposed as cell properties instead. For example, the "weight" of a column (which is used to determine where to + add/remove space preferably when expanding/compressing tables horizontally) is actually made the "weight" of a + cell. Given that we usually need it per-column though we will calculate the average across every cell of the column + instead. + + - To make things easy, when cells are added without any explicitly configured formatting, then we'll copy the formatting + from the same cell in the previous row. 
This is particularly useful for the "weight" of the cell (see above), as
+   this means setting the weight of the cells of the header row will nicely propagate to all cells in the other rows.
+*/
+
+/* Payload plus formatting of a single cell. Ref-counted via n_ref, so that one object can back several
+ * cells. */
+typedef struct TableData {
+        unsigned n_ref;
+        TableDataType type;
+
+        size_t minimum_width;       /* smallest width the column may get */
+        size_t maximum_width;       /* largest width the column may get */
+        size_t formatted_for_width; /* column width that 'formatted' was generated for */
+        unsigned weight;            /* horizontal weight of this column, used when the table is expanded/compressed */
+        unsigned ellipsize_percent; /* 0..100: where the ellipsis is placed when the cell has to be compressed */
+        unsigned align_percent;     /* 0..100: where spaces are padded when the cell has to be expanded.
+                                     * 0: left-aligned, 100: right-aligned */
+
+        bool uppercase;             /* show the string in uppercase */
+
+        const char *color;          /* ANSI color string for this cell. Must not move the cursor when written to a
+                                     * terminal. Reset automatically after the cell. Not freed by table_data_free(). */
+        const char *rgap_color;     /* ANSI color for the gap to the right of this cell. Usually used to underline
+                                     * entire rows in a gapless fashion. Not freed by table_data_free(). */
+        char *url;                  /* URL for a clickable hyperlink; released by table_data_free() */
+        char *formatted;            /* cached textual representation of the cell data, before
+                                     * ellipsation/alignment; released by table_data_free() */
+
+        union {
+                uint8_t data[0];    /* generic byte view of the payload */
+                bool boolean;
+                usec_t timestamp;
+                usec_t timespan;
+                uint64_t size;
+                char string[0];
+                char **strv;
+                int int_val;
+                int8_t int8;
+                int16_t int16;
+                int32_t int32;
+                int64_t int64;
+                unsigned uint_val;
+                uint8_t uint8;
+                uint16_t uint16;
+                uint32_t uint32;
+                uint64_t uint64;
+                int percent;        /* 'int' is used for percent values to match the result of parse_percent() */
+                int ifindex;
+                union in_addr_union address;
+                sd_id128_t id128;
+                uid_t uid;
+                gid_t gid;
+                pid_t pid;
+                mode_t mode;
+                dev_t devnum;
+                /* ... further members go here as more cell data types get supported ... */
+        };
+} TableData;
+
+/* Decodes a cell reference into the zero-based cell index it stands for. References carry "index + 1"
+ * (see TABLE_INDEX_TO_CELL() below), hence the decoded value must be positive before the offset is
+ * removed again. */
+static size_t TABLE_CELL_TO_INDEX(TableCell *cell) {
+        assert(cell);
+
+        size_t encoded = PTR_TO_SIZE(cell);
+        assert(encoded > 0);
+
+        return encoded - 1;
+}
+
+/* Encodes a zero-based cell index as a cell reference, by storing "index + 1" in the pointer value.
+ * SIZE_MAX is refused, since adding one to it would wrap around. */
+static TableCell* TABLE_INDEX_TO_CELL(size_t index) {
+        assert(index != SIZE_MAX);
+
+        size_t encoded = index + 1;
+        return SIZE_TO_PTR(encoded);
+}
+
+struct Table {
+        size_t n_columns;        /* number of cells that make up one row */
+        size_t n_cells;          /* number of cells currently stored in data[] */
+
+        bool header;             /* whether the header row is shown */
+        bool vertical;           /* whether the field names are on the left rather than in the first line */
+
+        TableErsatz ersatz;      /* what to show for an empty cell or an invalid value that cannot be rendered */
+
+        size_t width;            /* 0: format as wide as necessary. SIZE_MAX: format to console width or less
+                                  * wide, but not wider. Anything else: the width to format the table in. */
+        size_t cell_height_max;  /* maximum number of lines per cell; if there are more, an ellipsis is shown.
+                                  * SIZE_MAX (the default) means no limit. 0 is not allowed. */
+
+        TableData **data;        /* the cells, one row after the other */
+
+        size_t *display_map;     /* columns to show (by their index). It's fine if columns are listed multiple
+                                  * times or not at all */
+        size_t n_display_map;
+
+        size_t *sort_map;        /* columns to order rows by, in order of preference */
+        size_t n_sort_map;
+
+        char **json_fields;
+        size_t n_json_fields;
+
+        bool *reverse_map;       /* indexed by column; consulted by table_data_compare() to invert the
+                                  * comparison result for that column */
+};
+
+/* Allocates a table with n_columns columns and no cells at all (not even a header row). Header display
+ * is on, width and per-cell height limit are SIZE_MAX, the ersatz string is the empty one. Returns NULL
+ * if the allocation fails. */
+Table *table_new_raw(size_t n_columns) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+
+        assert(n_columns > 0);
+
+        table = new(Table, 1);
+        if (!table)
+                return NULL;
+
+        /* Every member not named here is zeroed by the compound literal */
+        *table = (struct Table) {
+                .n_columns = n_columns,
+                .header = true,
+                .ersatz = TABLE_ERSATZ_EMPTY,
+                .width = SIZE_MAX,
+                .cell_height_max = SIZE_MAX,
+        };
+
+        return TAKE_PTR(table);
+}
+
+/* Creates a table with one TABLE_HEADER cell per string argument. The argument list has to be terminated
+ * by a NULL pointer. Returns NULL if the table or one of the header cells cannot be allocated. */
+Table *table_new_internal(const char *first_header, ...) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        size_t n_headers = 1; /* first_header itself */
+        const char *title;
+        va_list ap;
+
+        assert(first_header);
+
+        /* First pass over the arguments: count them, as the column count must be known up front */
+        va_start(ap, first_header);
+        while (va_arg(ap, const char*))
+                n_headers++;
+        va_end(ap);
+
+        table = table_new_raw(n_headers);
+        if (!table)
+                return NULL;
+
+        /* Second pass: turn every argument into a header cell */
+        va_start(ap, first_header);
+        title = first_header;
+        while (title) {
+                if (table_add_cell(table, NULL, TABLE_HEADER, title) < 0) {
+                        va_end(ap);
+                        return NULL;
+                }
+
+                title = va_arg(ap, const char*);
+        }
+        va_end(ap);
+
+        /* The header row must now be exactly complete */
+        assert(table->n_columns == table->n_cells);
+        return TAKE_PTR(table);
+}
+
+/* Creates a two-column table in vertical mode with header display turned off. The two header cells are
+ * "key" (align percent 100, i.e. right-aligned) and "value" (align percent 0, i.e. left-aligned).
+ * Returns NULL on failure. */
+Table *table_new_vertical(void) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        TableCell *cell;
+
+        table = table_new_raw(2);
+        if (!table)
+                return NULL;
+
+        table->vertical = true;
+        table->header = false;
+
+        /* Evaluated strictly left to right; the first failing step ends the chain */
+        if (table_add_cell(table, &cell, TABLE_HEADER, "key") < 0 ||
+            table_set_align_percent(table, cell, 100) < 0 ||
+            table_add_cell(table, &cell, TABLE_HEADER, "value") < 0 ||
+            table_set_align_percent(table, cell, 0) < 0)
+                return NULL;
+
+        return TAKE_PTR(table);
+}
+
+/* Releases a TableData object together with what it owns: the string vector of the two strv types, the
+ * URL and the cached formatted string. The color strings are left alone. Returns the result of
+ * mfree(d). */
+static TableData *table_data_free(TableData *d) {
+        assert(d);
+
+        /* Only the string vector types carry a separately allocated payload that is released here */
+        if (d->type == TABLE_STRV || d->type == TABLE_STRV_WRAPPED)
+                strv_free(d->strv);
+
+        free(d->url);
+        free(d->formatted);
+
+        return mfree(d);
+}
+
+DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(TableData, table_data, table_data_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(TableData*, table_data_unref); + +Table *table_unref(Table *t) { + if (!t) + return NULL; + + for (size_t i = 0; i < t->n_cells; i++) + table_data_unref(t->data[i]); + + free(t->data); + free(t->display_map); + free(t->sort_map); + free(t->reverse_map); + + for (size_t i = 0; i < t->n_json_fields; i++) + free(t->json_fields[i]); + + free(t->json_fields); + + return mfree(t); +} + +static size_t table_data_size(TableDataType type, const void *data) { + + switch (type) { + + case TABLE_EMPTY: + return 0; + + case TABLE_STRING: + case TABLE_PATH: + case TABLE_PATH_BASENAME: + case TABLE_FIELD: + case TABLE_HEADER: + return strlen(data) + 1; + + case TABLE_STRV: + case TABLE_STRV_WRAPPED: + return sizeof(char **); + + case TABLE_BOOLEAN_CHECKMARK: + case TABLE_BOOLEAN: + return sizeof(bool); + + case TABLE_TIMESTAMP: + case TABLE_TIMESTAMP_UTC: + case TABLE_TIMESTAMP_RELATIVE: + case TABLE_TIMESTAMP_RELATIVE_MONOTONIC: + case TABLE_TIMESTAMP_LEFT: + case TABLE_TIMESTAMP_DATE: + case TABLE_TIMESPAN: + case TABLE_TIMESPAN_MSEC: + case TABLE_TIMESPAN_DAY: + return sizeof(usec_t); + + case TABLE_SIZE: + case TABLE_INT64: + case TABLE_UINT64: + case TABLE_UINT64_HEX: + case TABLE_BPS: + return sizeof(uint64_t); + + case TABLE_INT32: + case TABLE_UINT32: + case TABLE_UINT32_HEX: + return sizeof(uint32_t); + + case TABLE_INT16: + case TABLE_UINT16: + return sizeof(uint16_t); + + case TABLE_INT8: + case TABLE_UINT8: + return sizeof(uint8_t); + + case TABLE_INT: + case TABLE_UINT: + case TABLE_PERCENT: + case TABLE_IFINDEX: + case TABLE_SIGNAL: + return sizeof(int); + + case TABLE_IN_ADDR: + return sizeof(struct in_addr); + + case TABLE_IN6_ADDR: + return sizeof(struct in6_addr); + + case TABLE_UUID: + case TABLE_ID128: + return sizeof(sd_id128_t); + + case TABLE_UID: + return sizeof(uid_t); + case TABLE_GID: + return sizeof(gid_t); + case TABLE_PID: + return 
sizeof(pid_t); + + case TABLE_MODE: + case TABLE_MODE_INODE_TYPE: + return sizeof(mode_t); + + case TABLE_DEVNUM: + return sizeof(dev_t); + + default: + assert_not_reached(); + } +} + +static bool table_data_matches( + TableData *d, + TableDataType type, + const void *data, + size_t minimum_width, + size_t maximum_width, + unsigned weight, + unsigned align_percent, + unsigned ellipsize_percent, + bool uppercase) { + + size_t k, l; + assert(d); + + if (d->type != type) + return false; + + if (d->minimum_width != minimum_width) + return false; + + if (d->maximum_width != maximum_width) + return false; + + if (d->weight != weight) + return false; + + if (d->align_percent != align_percent) + return false; + + if (d->ellipsize_percent != ellipsize_percent) + return false; + + if (d->uppercase != uppercase) + return false; + + /* If a color/url is set, refuse to merge */ + if (d->color || d->rgap_color) + return false; + if (d->url) + return false; + + k = table_data_size(type, data); + l = table_data_size(d->type, d->data); + if (k != l) + return false; + + return memcmp_safe(data, d->data, l) == 0; +} + +static TableData *table_data_new( + TableDataType type, + const void *data, + size_t minimum_width, + size_t maximum_width, + unsigned weight, + unsigned align_percent, + unsigned ellipsize_percent, + bool uppercase) { + + _cleanup_free_ TableData *d = NULL; + size_t data_size; + + data_size = table_data_size(type, data); + + d = malloc0(offsetof(TableData, data) + data_size); + if (!d) + return NULL; + + d->n_ref = 1; + d->type = type; + d->minimum_width = minimum_width; + d->maximum_width = maximum_width; + d->weight = weight; + d->align_percent = align_percent; + d->ellipsize_percent = ellipsize_percent; + d->uppercase = uppercase; + + if (IN_SET(type, TABLE_STRV, TABLE_STRV_WRAPPED)) { + d->strv = strv_copy(data); + if (!d->strv) + return NULL; + } else + memcpy_safe(d->data, data, data_size); + + return TAKE_PTR(d); +} + +int table_add_cell_full( + Table *t, + 
TableCell **ret_cell, + TableDataType type, + const void *data, + size_t minimum_width, + size_t maximum_width, + unsigned weight, + unsigned align_percent, + unsigned ellipsize_percent) { + + _cleanup_(table_data_unrefp) TableData *d = NULL; + bool uppercase; + TableData *p; + + assert(t); + assert(type >= 0); + assert(type < _TABLE_DATA_TYPE_MAX); + + /* Special rule: patch NULL data fields to the empty field */ + if (!data) + type = TABLE_EMPTY; + + /* Determine the cell adjacent to the current one, but one row up */ + if (t->n_cells >= t->n_columns) + assert_se(p = t->data[t->n_cells - t->n_columns]); + else + p = NULL; + + /* If formatting parameters are left unspecified, copy from the previous row */ + if (minimum_width == SIZE_MAX) + minimum_width = p ? p->minimum_width : 1; + + if (weight == UINT_MAX) + weight = p ? p->weight : DEFAULT_WEIGHT; + + if (align_percent == UINT_MAX) + align_percent = p ? p->align_percent : 0; + + if (ellipsize_percent == UINT_MAX) + ellipsize_percent = p ? p->ellipsize_percent : 100; + + assert(align_percent <= 100); + assert(ellipsize_percent <= 100); + + uppercase = type == TABLE_HEADER; + + /* Small optimization: Pretty often adjacent cells in two subsequent lines have the same data and + * formatting. Let's see if we can reuse the cell data and ref it once more. */ + + if (p && table_data_matches(p, type, data, minimum_width, maximum_width, weight, align_percent, ellipsize_percent, uppercase)) + d = table_data_ref(p); + else { + d = table_data_new(type, data, minimum_width, maximum_width, weight, align_percent, ellipsize_percent, uppercase); + if (!d) + return -ENOMEM; + } + + if (!GREEDY_REALLOC(t->data, MAX(t->n_cells + 1, t->n_columns))) + return -ENOMEM; + + if (ret_cell) + *ret_cell = TABLE_INDEX_TO_CELL(t->n_cells); + + t->data[t->n_cells++] = TAKE_PTR(d); + + return 0; +} + +int table_add_cell_stringf_full(Table *t, TableCell **ret_cell, TableDataType dt, const char *format, ...) 
{ + _cleanup_free_ char *buffer = NULL; + va_list ap; + int r; + + assert(t); + assert(IN_SET(dt, TABLE_STRING, TABLE_PATH, TABLE_PATH_BASENAME, TABLE_FIELD, TABLE_HEADER)); + + va_start(ap, format); + r = vasprintf(&buffer, format, ap); + va_end(ap); + if (r < 0) + return -ENOMEM; + + return table_add_cell(t, ret_cell, dt, buffer); +} + +int table_fill_empty(Table *t, size_t until_column) { + int r; + + assert(t); + + /* Fill the rest of the current line with empty cells until we reach the specified column. Will add + * at least one cell. Pass 0 in order to fill a line to the end or insert an empty line. */ + + if (until_column >= t->n_columns) + return -EINVAL; + + do { + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return r; + + } while ((t->n_cells % t->n_columns) != until_column); + + return 0; +} + +int table_dup_cell(Table *t, TableCell *cell) { + size_t i; + + assert(t); + + /* Add the data of the specified cell a second time as a new cell to the end. */ + + i = TABLE_CELL_TO_INDEX(cell); + if (i >= t->n_cells) + return -ENXIO; + + if (!GREEDY_REALLOC(t->data, MAX(t->n_cells + 1, t->n_columns))) + return -ENOMEM; + + t->data[t->n_cells++] = table_data_ref(t->data[i]); + return 0; +} + +static int table_dedup_cell(Table *t, TableCell *cell) { + _cleanup_free_ char *curl = NULL; + TableData *nd, *od; + size_t i; + + assert(t); + + /* Helper call that ensures the specified cell's data object has a ref count of 1, which we can use before + * changing a cell's formatting without effecting every other cell's formatting that shares the same data */ + + i = TABLE_CELL_TO_INDEX(cell); + if (i >= t->n_cells) + return -ENXIO; + + assert_se(od = t->data[i]); + if (od->n_ref == 1) + return 0; + + assert(od->n_ref > 1); + + if (od->url) { + curl = strdup(od->url); + if (!curl) + return -ENOMEM; + } + + nd = table_data_new( + od->type, + od->data, + od->minimum_width, + od->maximum_width, + od->weight, + od->align_percent, + od->ellipsize_percent, + 
od->uppercase); + if (!nd) + return -ENOMEM; + + nd->color = od->color; + nd->rgap_color = od->rgap_color; + nd->url = TAKE_PTR(curl); + + table_data_unref(od); + t->data[i] = nd; + + assert(nd->n_ref == 1); + + return 1; +} + +static TableData *table_get_data(Table *t, TableCell *cell) { + size_t i; + + assert(t); + assert(cell); + + /* Get the data object of the specified cell, or NULL if it doesn't exist */ + + i = TABLE_CELL_TO_INDEX(cell); + if (i >= t->n_cells) + return NULL; + + assert(t->data[i]); + assert(t->data[i]->n_ref > 0); + + return t->data[i]; +} + +int table_set_minimum_width(Table *t, TableCell *cell, size_t minimum_width) { + int r; + + assert(t); + assert(cell); + + if (minimum_width == SIZE_MAX) + minimum_width = 1; + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + table_get_data(t, cell)->minimum_width = minimum_width; + return 0; +} + +int table_set_maximum_width(Table *t, TableCell *cell, size_t maximum_width) { + int r; + + assert(t); + assert(cell); + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + table_get_data(t, cell)->maximum_width = maximum_width; + return 0; +} + +int table_set_weight(Table *t, TableCell *cell, unsigned weight) { + int r; + + assert(t); + assert(cell); + + if (weight == UINT_MAX) + weight = DEFAULT_WEIGHT; + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + table_get_data(t, cell)->weight = weight; + return 0; +} + +int table_set_align_percent(Table *t, TableCell *cell, unsigned percent) { + int r; + + assert(t); + assert(cell); + + if (percent == UINT_MAX) + percent = 0; + + assert(percent <= 100); + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + table_get_data(t, cell)->align_percent = percent; + return 0; +} + +int table_set_ellipsize_percent(Table *t, TableCell *cell, unsigned percent) { + int r; + + assert(t); + assert(cell); + + if (percent == UINT_MAX) + percent = 100; + + assert(percent <= 100); + + r = table_dedup_cell(t, cell); + if (r < 0) + return 
r; + + table_get_data(t, cell)->ellipsize_percent = percent; + return 0; +} + +int table_set_color(Table *t, TableCell *cell, const char *color) { + int r; + + assert(t); + assert(cell); + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + table_get_data(t, cell)->color = empty_to_null(color); + return 0; +} + +int table_set_rgap_color(Table *t, TableCell *cell, const char *color) { + int r; + + assert(t); + assert(cell); + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + table_get_data(t, cell)->rgap_color = empty_to_null(color); + return 0; +} + +int table_set_url(Table *t, TableCell *cell, const char *url) { + _cleanup_free_ char *copy = NULL; + int r; + + assert(t); + assert(cell); + + if (url) { + copy = strdup(url); + if (!copy) + return -ENOMEM; + } + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + return free_and_replace(table_get_data(t, cell)->url, copy); +} + +int table_set_uppercase(Table *t, TableCell *cell, bool b) { + TableData *d; + int r; + + assert(t); + assert(cell); + + r = table_dedup_cell(t, cell); + if (r < 0) + return r; + + assert_se(d = table_get_data(t, cell)); + + if (d->uppercase == b) + return 0; + + d->formatted = mfree(d->formatted); + d->uppercase = b; + return 1; +} + +int table_update(Table *t, TableCell *cell, TableDataType type, const void *data) { + _cleanup_free_ char *curl = NULL; + TableData *nd, *od; + size_t i; + + assert(t); + assert(cell); + + i = TABLE_CELL_TO_INDEX(cell); + if (i >= t->n_cells) + return -ENXIO; + + assert_se(od = t->data[i]); + + if (od->url) { + curl = strdup(od->url); + if (!curl) + return -ENOMEM; + } + + nd = table_data_new( + type, + data, + od->minimum_width, + od->maximum_width, + od->weight, + od->align_percent, + od->ellipsize_percent, + od->uppercase); + if (!nd) + return -ENOMEM; + + nd->color = od->color; + nd->rgap_color = od->rgap_color; + nd->url = TAKE_PTR(curl); + + table_data_unref(od); + t->data[i] = nd; + + return 0; +} + +int 
table_add_many_internal(Table *t, TableDataType first_type, ...) { + TableCell *last_cell = NULL; + va_list ap; + int r; + + assert(t); + assert(first_type >= 0); + assert(first_type < _TABLE_DATA_TYPE_MAX); + + va_start(ap, first_type); + + for (TableDataType type = first_type;; type = va_arg(ap, TableDataType)) { + const void *data; + union { + uint64_t size; + usec_t usec; + int int_val; + int8_t int8; + int16_t int16; + int32_t int32; + int64_t int64; + unsigned uint_val; + uint8_t uint8; + uint16_t uint16; + uint32_t uint32; + uint64_t uint64; + int percent; + int ifindex; + bool b; + union in_addr_union address; + sd_id128_t id128; + uid_t uid; + gid_t gid; + pid_t pid; + mode_t mode; + dev_t devnum; + } buffer; + + switch (type) { + + case TABLE_EMPTY: + data = NULL; + break; + + case TABLE_STRING: + case TABLE_PATH: + case TABLE_PATH_BASENAME: + case TABLE_FIELD: + case TABLE_HEADER: + data = va_arg(ap, const char *); + break; + + case TABLE_STRV: + case TABLE_STRV_WRAPPED: + data = va_arg(ap, char * const *); + break; + + case TABLE_BOOLEAN_CHECKMARK: + case TABLE_BOOLEAN: + buffer.b = va_arg(ap, int); + data = &buffer.b; + break; + + case TABLE_TIMESTAMP: + case TABLE_TIMESTAMP_UTC: + case TABLE_TIMESTAMP_RELATIVE: + case TABLE_TIMESTAMP_RELATIVE_MONOTONIC: + case TABLE_TIMESTAMP_LEFT: + case TABLE_TIMESTAMP_DATE: + case TABLE_TIMESPAN: + case TABLE_TIMESPAN_MSEC: + case TABLE_TIMESPAN_DAY: + buffer.usec = va_arg(ap, usec_t); + data = &buffer.usec; + break; + + case TABLE_SIZE: + case TABLE_BPS: + buffer.size = va_arg(ap, uint64_t); + data = &buffer.size; + break; + + case TABLE_INT: + case TABLE_SIGNAL: + buffer.int_val = va_arg(ap, int); + data = &buffer.int_val; + break; + + case TABLE_INT8: { + int x = va_arg(ap, int); + assert(x >= INT8_MIN && x <= INT8_MAX); + + buffer.int8 = x; + data = &buffer.int8; + break; + } + + case TABLE_INT16: { + int x = va_arg(ap, int); + assert(x >= INT16_MIN && x <= INT16_MAX); + + buffer.int16 = x; + data = 
&buffer.int16; + break; + } + + case TABLE_INT32: + buffer.int32 = va_arg(ap, int32_t); + data = &buffer.int32; + break; + + case TABLE_INT64: + buffer.int64 = va_arg(ap, int64_t); + data = &buffer.int64; + break; + + case TABLE_UINT: + buffer.uint_val = va_arg(ap, unsigned); + data = &buffer.uint_val; + break; + + case TABLE_UINT8: { + unsigned x = va_arg(ap, unsigned); + assert(x <= UINT8_MAX); + + buffer.uint8 = x; + data = &buffer.uint8; + break; + } + + case TABLE_UINT16: { + unsigned x = va_arg(ap, unsigned); + assert(x <= UINT16_MAX); + + buffer.uint16 = x; + data = &buffer.uint16; + break; + } + + case TABLE_UINT32: + case TABLE_UINT32_HEX: + buffer.uint32 = va_arg(ap, uint32_t); + data = &buffer.uint32; + break; + + case TABLE_UINT64: + case TABLE_UINT64_HEX: + buffer.uint64 = va_arg(ap, uint64_t); + data = &buffer.uint64; + break; + + case TABLE_PERCENT: + buffer.percent = va_arg(ap, int); + data = &buffer.percent; + break; + + case TABLE_IFINDEX: + buffer.ifindex = va_arg(ap, int); + data = &buffer.ifindex; + break; + + case TABLE_IN_ADDR: + buffer.address = *va_arg(ap, union in_addr_union *); + data = &buffer.address.in; + break; + + case TABLE_IN6_ADDR: + buffer.address = *va_arg(ap, union in_addr_union *); + data = &buffer.address.in6; + break; + + case TABLE_UUID: + case TABLE_ID128: + buffer.id128 = va_arg(ap, sd_id128_t); + data = &buffer.id128; + break; + + case TABLE_UID: + buffer.uid = va_arg(ap, uid_t); + data = &buffer.uid; + break; + + case TABLE_GID: + buffer.gid = va_arg(ap, gid_t); + data = &buffer.gid; + break; + + case TABLE_PID: + buffer.pid = va_arg(ap, pid_t); + data = &buffer.pid; + break; + + case TABLE_MODE: + case TABLE_MODE_INODE_TYPE: + buffer.mode = va_arg(ap, mode_t); + data = &buffer.mode; + break; + + case TABLE_DEVNUM: + buffer.devnum = va_arg(ap, dev_t); + data = &buffer.devnum; + break; + + case TABLE_SET_MINIMUM_WIDTH: { + size_t w = va_arg(ap, size_t); + + r = table_set_minimum_width(t, last_cell, w); + goto check; + } 
+ + case TABLE_SET_MAXIMUM_WIDTH: { + size_t w = va_arg(ap, size_t); + r = table_set_maximum_width(t, last_cell, w); + goto check; + } + + case TABLE_SET_WEIGHT: { + unsigned w = va_arg(ap, unsigned); + r = table_set_weight(t, last_cell, w); + goto check; + } + + case TABLE_SET_ALIGN_PERCENT: { + unsigned p = va_arg(ap, unsigned); + r = table_set_align_percent(t, last_cell, p); + goto check; + } + + case TABLE_SET_ELLIPSIZE_PERCENT: { + unsigned p = va_arg(ap, unsigned); + r = table_set_ellipsize_percent(t, last_cell, p); + goto check; + } + + case TABLE_SET_COLOR: { + const char *c = va_arg(ap, const char*); + r = table_set_color(t, last_cell, c); + goto check; + } + + case TABLE_SET_RGAP_COLOR: { + const char *c = va_arg(ap, const char*); + r = table_set_rgap_color(t, last_cell, c); + goto check; + } + + case TABLE_SET_BOTH_COLORS: { + const char *c = va_arg(ap, const char*); + + r = table_set_color(t, last_cell, c); + if (r < 0) { + va_end(ap); + return r; + } + + r = table_set_rgap_color(t, last_cell, c); + goto check; + } + + case TABLE_SET_URL: { + const char *u = va_arg(ap, const char*); + r = table_set_url(t, last_cell, u); + goto check; + } + + case TABLE_SET_UPPERCASE: { + int u = va_arg(ap, int); + r = table_set_uppercase(t, last_cell, u); + goto check; + } + + case _TABLE_DATA_TYPE_MAX: + /* Used as end marker */ + va_end(ap); + return 0; + + default: + assert_not_reached(); + } + + r = table_add_cell(t, &last_cell, type, data); + check: + if (r < 0) { + va_end(ap); + return r; + } + } +} + +void table_set_header(Table *t, bool b) { + assert(t); + + t->header = b; +} + +void table_set_width(Table *t, size_t width) { + assert(t); + + t->width = width; +} + +void table_set_cell_height_max(Table *t, size_t height) { + assert(t); + assert(height >= 1 || height == SIZE_MAX); + + t->cell_height_max = height; +} + +void table_set_ersatz_string(Table *t, TableErsatz ersatz) { + assert(t); + assert(ersatz >= 0 && ersatz < _TABLE_ERSATZ_MAX); + + t->ersatz = 
ersatz; +} + +static const char* table_ersatz_string(const Table *t) { + switch (t->ersatz) { + case TABLE_ERSATZ_EMPTY: + return ""; + case TABLE_ERSATZ_DASH: + return "-"; + case TABLE_ERSATZ_UNSET: + return "(unset)"; + case TABLE_ERSATZ_NA: + return "n/a"; + default: + assert_not_reached(); + } +} + +static int table_set_display_all(Table *t) { + size_t *d; + + assert(t); + + /* Initialize the display map to the identity */ + + d = reallocarray(t->display_map, t->n_columns, sizeof(size_t)); + if (!d) + return -ENOMEM; + + for (size_t i = 0; i < t->n_columns; i++) + d[i] = i; + + t->display_map = d; + t->n_display_map = t->n_columns; + + return 0; +} + +int table_set_display_internal(Table *t, size_t first_column, ...) { + size_t column; + va_list ap; + + assert(t); + + column = first_column; + + va_start(ap, first_column); + for (;;) { + assert(column < t->n_columns); + + if (!GREEDY_REALLOC(t->display_map, MAX(t->n_columns, t->n_display_map+1))) { + va_end(ap); + return -ENOMEM; + } + + t->display_map[t->n_display_map++] = column; + + column = va_arg(ap, size_t); + if (column == SIZE_MAX) + break; + + } + va_end(ap); + + return 0; +} + +int table_set_sort_internal(Table *t, size_t first_column, ...) { + size_t column; + va_list ap; + + assert(t); + + column = first_column; + + va_start(ap, first_column); + for (;;) { + assert(column < t->n_columns); + + if (!GREEDY_REALLOC(t->sort_map, MAX(t->n_columns, t->n_sort_map+1))) { + va_end(ap); + return -ENOMEM; + } + + t->sort_map[t->n_sort_map++] = column; + + column = va_arg(ap, size_t); + if (column == SIZE_MAX) + break; + } + va_end(ap); + + return 0; +} + +int table_hide_column_from_display_internal(Table *t, ...) 
{ + size_t cur = 0; + int r; + + assert(t); + + /* If the display map is empty, initialize it with all available columns */ + if (!t->display_map) { + r = table_set_display_all(t); + if (r < 0) + return r; + } + + for (size_t i = 0; i < t->n_display_map; i++) { + bool listed = false; + va_list ap; + + va_start(ap, t); + for (;;) { + size_t column; + + column = va_arg(ap, size_t); + if (column == SIZE_MAX) + break; + if (column == t->display_map[i]) { + listed = true; + break; + } + } + va_end(ap); + + if (listed) + continue; + + t->display_map[cur++] = t->display_map[i]; + } + + t->n_display_map = cur; + + return 0; +} + +static int cell_data_compare(TableData *a, size_t index_a, TableData *b, size_t index_b) { + int r; + + assert(a); + assert(b); + + if (a->type == b->type) { + + /* We only define ordering for cells of the same data type. If cells with different data types are + * compared we follow the order the cells were originally added in */ + + switch (a->type) { + + case TABLE_STRING: + case TABLE_FIELD: + case TABLE_HEADER: + return strcmp(a->string, b->string); + + case TABLE_PATH: + case TABLE_PATH_BASENAME: + return path_compare(a->string, b->string); + + case TABLE_STRV: + case TABLE_STRV_WRAPPED: + return strv_compare(a->strv, b->strv); + + case TABLE_BOOLEAN: + if (!a->boolean && b->boolean) + return -1; + if (a->boolean && !b->boolean) + return 1; + return 0; + + case TABLE_TIMESTAMP: + case TABLE_TIMESTAMP_UTC: + case TABLE_TIMESTAMP_RELATIVE: + case TABLE_TIMESTAMP_RELATIVE_MONOTONIC: + case TABLE_TIMESTAMP_LEFT: + case TABLE_TIMESTAMP_DATE: + return CMP(a->timestamp, b->timestamp); + + case TABLE_TIMESPAN: + case TABLE_TIMESPAN_MSEC: + case TABLE_TIMESPAN_DAY: + return CMP(a->timespan, b->timespan); + + case TABLE_SIZE: + case TABLE_BPS: + return CMP(a->size, b->size); + + case TABLE_INT: + case TABLE_SIGNAL: + return CMP(a->int_val, b->int_val); + + case TABLE_INT8: + return CMP(a->int8, b->int8); + + case TABLE_INT16: + return CMP(a->int16, 
b->int16); + + case TABLE_INT32: + return CMP(a->int32, b->int32); + + case TABLE_INT64: + return CMP(a->int64, b->int64); + + case TABLE_UINT: + return CMP(a->uint_val, b->uint_val); + + case TABLE_UINT8: + return CMP(a->uint8, b->uint8); + + case TABLE_UINT16: + return CMP(a->uint16, b->uint16); + + case TABLE_UINT32: + case TABLE_UINT32_HEX: + return CMP(a->uint32, b->uint32); + + case TABLE_UINT64: + case TABLE_UINT64_HEX: + return CMP(a->uint64, b->uint64); + + case TABLE_PERCENT: + return CMP(a->percent, b->percent); + + case TABLE_IFINDEX: + return CMP(a->ifindex, b->ifindex); + + case TABLE_IN_ADDR: + return CMP(a->address.in.s_addr, b->address.in.s_addr); + + case TABLE_IN6_ADDR: + return memcmp(&a->address.in6, &b->address.in6, FAMILY_ADDRESS_SIZE(AF_INET6)); + + case TABLE_UUID: + case TABLE_ID128: + return memcmp(&a->id128, &b->id128, sizeof(sd_id128_t)); + + case TABLE_UID: + return CMP(a->uid, b->uid); + + case TABLE_GID: + return CMP(a->gid, b->gid); + + case TABLE_PID: + return CMP(a->pid, b->pid); + + case TABLE_MODE: + case TABLE_MODE_INODE_TYPE: + return CMP(a->mode, b->mode); + + case TABLE_DEVNUM: + r = CMP(major(a->devnum), major(b->devnum)); + if (r != 0) + return r; + + return CMP(minor(a->devnum), minor(b->devnum)); + + default: + ; + } + } + + /* Generic fallback using the original order in which the cells where added. 
*/ + return CMP(index_a, index_b); +} + +static int table_data_compare(const size_t *a, const size_t *b, Table *t) { + int r; + + assert(t); + assert(t->sort_map); + + /* Make sure the header stays at the beginning */ + if (*a < t->n_columns && *b < t->n_columns) + return 0; + if (*a < t->n_columns) + return -1; + if (*b < t->n_columns) + return 1; + + /* Order other lines by the sorting map */ + for (size_t i = 0; i < t->n_sort_map; i++) { + TableData *d, *dd; + + d = t->data[*a + t->sort_map[i]]; + dd = t->data[*b + t->sort_map[i]]; + + r = cell_data_compare(d, *a, dd, *b); + if (r != 0) + return t->reverse_map && t->reverse_map[t->sort_map[i]] ? -r : r; + } + + /* Order identical lines by the order there were originally added in */ + return CMP(*a, *b); +} + +static char* format_strv_width(char **strv, size_t column_width) { + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + f = memstream_init(&m); + if (!f) + return NULL; + + size_t position = 0; + STRV_FOREACH(p, strv) { + size_t our_len = utf8_console_width(*p); /* This returns -1 on invalid utf-8 (which shouldn't happen). + * If that happens, we'll just print one item per line. 
*/ + + if (position == 0) { + fputs(*p, f); + position = our_len; + } else if (size_add(size_add(position, 1), our_len) <= column_width) { + fprintf(f, " %s", *p); + position = size_add(size_add(position, 1), our_len); + } else { + fprintf(f, "\n%s", *p); + position = our_len; + } + } + + char *buf; + if (memstream_finalize(&m, &buf, NULL) < 0) + return NULL; + + return buf; +} + +static const char *table_data_format(Table *t, TableData *d, bool avoid_uppercasing, size_t column_width, bool *have_soft) { + assert(d); + + if (d->formatted && + /* Only TABLE_STRV_WRAPPED adjust based on column_width so far… */ + (d->type != TABLE_STRV_WRAPPED || d->formatted_for_width == column_width)) + return d->formatted; + + switch (d->type) { + case TABLE_EMPTY: + return table_ersatz_string(t); + + case TABLE_STRING: + case TABLE_PATH: + case TABLE_PATH_BASENAME: + case TABLE_FIELD: + case TABLE_HEADER: { + _cleanup_free_ char *bn = NULL; + const char *s; + + if (d->type == TABLE_PATH_BASENAME) + s = path_extract_filename(d->string, &bn) < 0 ? 
d->string : bn; + else + s = d->string; + + if (d->uppercase && !avoid_uppercasing) { + d->formatted = new(char, strlen(s) + (d->type == TABLE_FIELD) + 1); + if (!d->formatted) + return NULL; + + char *q = d->formatted; + for (const char *p = s; *p; p++) + *(q++) = (char) toupper((unsigned char) *p); + + if (d->type == TABLE_FIELD) + *(q++) = ':'; + + *q = 0; + return d->formatted; + } else if (d->type == TABLE_FIELD) { + d->formatted = strjoin(s, ":"); + if (!d->formatted) + return NULL; + + return d->formatted; + } + + if (bn) { + d->formatted = TAKE_PTR(bn); + return d->formatted; + } + + return d->string; + } + + case TABLE_STRV: + if (strv_isempty(d->strv)) + return table_ersatz_string(t); + + d->formatted = strv_join(d->strv, "\n"); + if (!d->formatted) + return NULL; + break; + + case TABLE_STRV_WRAPPED: { + if (strv_isempty(d->strv)) + return table_ersatz_string(t); + + char *buf = format_strv_width(d->strv, column_width); + if (!buf) + return NULL; + + free_and_replace(d->formatted, buf); + d->formatted_for_width = column_width; + if (have_soft) + *have_soft = true; + + break; + } + + case TABLE_BOOLEAN: + return yes_no(d->boolean); + + case TABLE_BOOLEAN_CHECKMARK: + return special_glyph(d->boolean ? SPECIAL_GLYPH_CHECK_MARK : SPECIAL_GLYPH_CROSS_MARK); + + case TABLE_TIMESTAMP: + case TABLE_TIMESTAMP_UTC: + case TABLE_TIMESTAMP_RELATIVE: + case TABLE_TIMESTAMP_RELATIVE_MONOTONIC: + case TABLE_TIMESTAMP_LEFT: + case TABLE_TIMESTAMP_DATE: { + _cleanup_free_ char *p = NULL; + char *ret; + + p = new(char, + IN_SET(d->type, TABLE_TIMESTAMP_RELATIVE, TABLE_TIMESTAMP_RELATIVE_MONOTONIC, TABLE_TIMESTAMP_LEFT) ? 
+ FORMAT_TIMESTAMP_RELATIVE_MAX : FORMAT_TIMESTAMP_MAX); + if (!p) + return NULL; + + if (d->type == TABLE_TIMESTAMP) + ret = format_timestamp(p, FORMAT_TIMESTAMP_MAX, d->timestamp); + else if (d->type == TABLE_TIMESTAMP_UTC) + ret = format_timestamp_style(p, FORMAT_TIMESTAMP_MAX, d->timestamp, TIMESTAMP_UTC); + else if (d->type == TABLE_TIMESTAMP_DATE) + ret = format_timestamp_style(p, FORMAT_TIMESTAMP_MAX, d->timestamp, TIMESTAMP_DATE); + else if (d->type == TABLE_TIMESTAMP_RELATIVE_MONOTONIC) + ret = format_timestamp_relative_monotonic(p, FORMAT_TIMESTAMP_RELATIVE_MAX, d->timestamp); + else + ret = format_timestamp_relative_full(p, FORMAT_TIMESTAMP_RELATIVE_MAX, + d->timestamp, CLOCK_REALTIME, + /* implicit_left = */ d->type == TABLE_TIMESTAMP_LEFT); + if (!ret) + return "-"; + + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_TIMESPAN: + case TABLE_TIMESPAN_MSEC: + case TABLE_TIMESPAN_DAY: { + _cleanup_free_ char *p = NULL; + + p = new(char, FORMAT_TIMESPAN_MAX); + if (!p) + return NULL; + + if (!format_timespan(p, FORMAT_TIMESPAN_MAX, d->timespan, + d->type == TABLE_TIMESPAN ? 0 : + d->type == TABLE_TIMESPAN_MSEC ? 
USEC_PER_MSEC : USEC_PER_DAY)) + return "-"; + + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_SIZE: { + _cleanup_free_ char *p = NULL; + + p = new(char, FORMAT_BYTES_MAX); + if (!p) + return NULL; + + if (!format_bytes(p, FORMAT_BYTES_MAX, d->size)) + return table_ersatz_string(t); + + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_BPS: { + _cleanup_free_ char *p = NULL; + size_t n; + + p = new(char, FORMAT_BYTES_MAX+2); + if (!p) + return NULL; + + if (!format_bytes_full(p, FORMAT_BYTES_MAX, d->size, 0)) + return table_ersatz_string(t); + + n = strlen(p); + strscpy(p + n, FORMAT_BYTES_MAX + 2 - n, "bps"); + + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_INT: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->int_val) + 1); + if (!p) + return NULL; + + sprintf(p, "%i", d->int_val); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_INT8: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->int8) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIi8, d->int8); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_INT16: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->int16) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIi16, d->int16); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_INT32: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->int32) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIi32, d->int32); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_INT64: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->int64) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIi64, d->int64); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->uint_val) + 1); + if (!p) + return NULL; + + sprintf(p, "%u", d->uint_val); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT8: { + 
_cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->uint8) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIu8, d->uint8); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT16: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->uint16) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIu16, d->uint16); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT32: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->uint32) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIu32, d->uint32); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT32_HEX: { + _cleanup_free_ char *p = NULL; + + p = new(char, 8 + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIx32, d->uint32); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT64: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->uint64) + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIu64, d->uint64); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_UINT64_HEX: { + _cleanup_free_ char *p = NULL; + + p = new(char, 16 + 1); + if (!p) + return NULL; + + sprintf(p, "%" PRIx64, d->uint64); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_PERCENT: { + _cleanup_free_ char *p = NULL; + + p = new(char, DECIMAL_STR_WIDTH(d->percent) + 2); + if (!p) + return NULL; + + sprintf(p, "%i%%" , d->percent); + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_IFINDEX: { + _cleanup_free_ char *p = NULL; + + if (format_ifname_full_alloc(d->ifindex, FORMAT_IFNAME_IFINDEX, &p) < 0) + return NULL; + + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_IN_ADDR: + case TABLE_IN6_ADDR: { + _cleanup_free_ char *p = NULL; + + if (in_addr_to_string(d->type == TABLE_IN_ADDR ? 
AF_INET : AF_INET6, + &d->address, &p) < 0) + return NULL; + + d->formatted = TAKE_PTR(p); + break; + } + + case TABLE_ID128: { + char *p; + + p = new(char, SD_ID128_STRING_MAX); + if (!p) + return NULL; + + d->formatted = sd_id128_to_string(d->id128, p); + break; + } + + case TABLE_UUID: { + char *p; + + p = new(char, SD_ID128_UUID_STRING_MAX); + if (!p) + return NULL; + + d->formatted = sd_id128_to_uuid_string(d->id128, p); + break; + } + + case TABLE_UID: { + char *p; + + if (!uid_is_valid(d->uid)) + return table_ersatz_string(t); + + p = new(char, DECIMAL_STR_WIDTH(d->uid) + 1); + if (!p) + return NULL; + sprintf(p, UID_FMT, d->uid); + + d->formatted = p; + break; + } + + case TABLE_GID: { + char *p; + + if (!gid_is_valid(d->gid)) + return table_ersatz_string(t); + + p = new(char, DECIMAL_STR_WIDTH(d->gid) + 1); + if (!p) + return NULL; + sprintf(p, GID_FMT, d->gid); + + d->formatted = p; + break; + } + + case TABLE_PID: { + char *p; + + if (!pid_is_valid(d->pid)) + return table_ersatz_string(t); + + p = new(char, DECIMAL_STR_WIDTH(d->pid) + 1); + if (!p) + return NULL; + sprintf(p, PID_FMT, d->pid); + + d->formatted = p; + break; + } + + case TABLE_SIGNAL: { + const char *suffix; + char *p; + + suffix = signal_to_string(d->int_val); + if (!suffix) + return table_ersatz_string(t); + + p = strjoin("SIG", suffix); + if (!p) + return NULL; + + d->formatted = p; + break; + } + + case TABLE_MODE: { + char *p; + + if (d->mode == MODE_INVALID) + return table_ersatz_string(t); + + p = new(char, 4 + 1); + if (!p) + return NULL; + + sprintf(p, "%04o", d->mode & 07777); + d->formatted = p; + break; + } + + case TABLE_MODE_INODE_TYPE: + + if (d->mode == MODE_INVALID) + return table_ersatz_string(t); + + return inode_type_to_string(d->mode); + + case TABLE_DEVNUM: + if (devnum_is_zero(d->devnum)) + return table_ersatz_string(t); + + if (asprintf(&d->formatted, DEVNUM_FORMAT_STR, DEVNUM_FORMAT_VAL(d->devnum)) < 0) + return NULL; + + break; + + default: + 
assert_not_reached();
+        }
+
+        return d->formatted;
+}
+/* Measures the string s in console character cells. The text is split on '\n'; *ret_width (if non-NULL)
+ * receives the widest line as reported by utf8_console_width(), *ret_height (if non-NULL) the number of
+ * lines. The loop body always runs once, so even a string without any newline counts as one line, and
+ * the loop stops as soon as isempty() reports that nothing is left after the last newline.
+ * Returns 0 on success, -ENOMEM if copying a line fails and -EINVAL if utf8_console_width() reports
+ * SIZE_MAX for a line. */
+static int console_width_height(
+                const char *s,
+                size_t *ret_width,
+                size_t *ret_height) {
+
+        size_t max_width = 0, height = 0;
+        const char *p;
+
+        assert(s);
+
+        /* Determine the width and height in console character cells the specified string needs. */
+
+        do {
+                size_t k;
+
+                p = strchr(s, '\n');
+                if (p) {
+                        _cleanup_free_ char *c = NULL;
+
+                        c = strndup(s, p - s); /* NUL-terminated copy of just this line, for measuring */
+                        if (!c)
+                                return -ENOMEM;
+
+                        k = utf8_console_width(c);
+                        s = p + 1;
+                } else {
+                        k = utf8_console_width(s);
+                        s = NULL; /* last line, terminates the loop */
+                }
+                if (k == SIZE_MAX)
+                        return -EINVAL;
+                if (k > max_width)
+                        max_width = k;
+
+                height++;
+        } while (!isempty(s));
+
+        if (ret_width)
+                *ret_width = max_width;
+
+        if (ret_height)
+                *ret_height = height;
+
+        return 0;
+}
+/* Determines how much room the cell d asks for. The cell is formatted with table_data_format() for
+ * available_width, cut down with string_truncate_lines() if table->cell_height_max is set (i.e. is not
+ * SIZE_MAX) and then measured with console_width_height(). The measured width is capped at
+ * d->maximum_width (unless that is SIZE_MAX) and afterwards raised to at least d->minimum_width.
+ * *have_soft is only ever switched to true here (when table_data_format() reported "soft"), never reset.
+ * Returns a negative errno-style error on failure, > 0 if string_truncate_lines() returned > 0 (which
+ * this function records as "truncation applied"), and 0 otherwise. */
+static int table_data_requested_width_height(
+                Table *table,
+                TableData *d,
+                size_t available_width,
+                size_t *ret_width,
+                size_t *ret_height,
+                bool *have_soft) {
+
+        _cleanup_free_ char *truncated = NULL;
+        bool truncation_applied = false;
+        size_t width, height;
+        const char *t;
+        int r;
+        bool soft = false;
+
+        t = table_data_format(table, d, false, available_width, &soft);
+        if (!t)
+                return -ENOMEM;
+
+        if (table->cell_height_max != SIZE_MAX) {
+                r = string_truncate_lines(t, table->cell_height_max, &truncated);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        truncation_applied = true;
+
+                t = truncated; /* measure the shortened copy from here on */
+        }
+
+        r = console_width_height(t, &width, &height);
+        if (r < 0)
+                return r;
+
+        if (d->maximum_width != SIZE_MAX && width > d->maximum_width)
+                width = d->maximum_width;
+
+        if (width < d->minimum_width) /* applied last, hence the minimum wins over the maximum */
+                width = d->minimum_width;
+
+        if (ret_width)
+                *ret_width = width;
+        if (ret_height)
+                *ret_height = height;
+        if (have_soft && soft)
+                *have_soft = true;
+
+        return truncation_applied;
+}
+/* Returns a newly allocated copy of str padded with spaces to new_length character cells. 'percent'
+ * (0…100) selects which share of the padding goes to the left, the remainder goes to the right. If url
+ * is non-NULL the copied text is the terminal_urlify() result for str instead of str itself. If str is
+ * already at least new_length cells wide, an unpadded copy is returned. Returns NULL if
+ * terminal_urlify() or an allocation fails. */
+static char *align_string_mem(const char *str, const char *url, size_t new_length, unsigned percent) {
+        size_t w = 0, space, lspace, old_length, clickable_length;
+        _cleanup_free_ char *clickable
= NULL;
+        const char *p;
+        char *ret;
+        int r;
+
+        /* As with ellipsize_mem(), 'old_length' is a byte size while 'new_length' is a width in character cells */
+
+        assert(str);
+        assert(percent <= 100);
+
+        old_length = strlen(str);
+
+        if (url) {
+                r = terminal_urlify(url, str, &clickable);
+                if (r < 0)
+                        return NULL;
+
+                clickable_length = strlen(clickable);
+        } else
+                clickable_length = old_length;
+
+        /* Determine current width on screen. This is measured on 'str', not on 'clickable', hence
+         * whatever terminal_urlify() wrapped around the text does not count towards the width. */
+        p = str;
+        while (p < str + old_length) {
+                char32_t c;
+
+                if (utf8_encoded_to_unichar(p, &c) < 0) {
+                        p++, w++; /* count invalid chars as 1 */
+                        continue;
+                }
+
+                p = utf8_next_char(p);
+                w += unichar_iswide(c) ? 2 : 1;
+        }
+
+        /* Already at least as wide as the target? If so, don't pad, just hand back a copy */
+        if (w >= new_length)
+                return clickable ? TAKE_PTR(clickable) : strdup(str);
+
+        /* How many spaces shall we add? And how many of them on the left side? */
+        space = new_length - w;
+        lspace = space * percent / 100U;
+
+        ret = new(char, space + clickable_length + 1);
+        if (!ret)
+                return NULL;
+
+        /* Layout of the result: lspace blanks, the text, then the remaining (space - lspace) blanks */
+        for (size_t i = 0; i < lspace; i++)
+                ret[i] = ' ';
+        memcpy(ret + lspace, clickable ?: str, clickable_length);
+        for (size_t i = lspace + clickable_length; i < space + clickable_length; i++)
+                ret[i] = ' ';
+
+        ret[space + clickable_length] = 0;
+        return ret;
+}
+/* Tells whether a cell carries no data at all: true for TABLE_EMPTY cells and for the two strv types
+ * when strv_isempty() says so, false for everything else. */
+static bool table_data_isempty(TableData *d) {
+        assert(d);
+
+        if (d->type == TABLE_EMPTY)
+                return true;
+
+        /* Let's also consider an empty strv as truly empty. */
+        if (IN_SET(d->type, TABLE_STRV, TABLE_STRV_WRAPPED))
+                return strv_isempty(d->strv);
+
+        /* Note that an empty string we do not consider empty here!
*/
+        return false;
+}
+
+/* Picks the color sequence for the cell text: an explicitly configured d->color wins; otherwise empty
+ * cells (see table_data_isempty()) are shown in grey, TABLE_FIELD cells in bright blue and TABLE_HEADER
+ * cells underlined. Returns NULL if the cell shall not be colored at all. */
+static const char* table_data_color(TableData *d) {
+        assert(d);
+
+        if (d->color)
+                return d->color;
+
+        /* Let's implicitly color all "empty" cells in grey, in case an "empty_string" is set that is not empty */
+        if (table_data_isempty(d))
+                return ansi_grey();
+
+        if (d->type == TABLE_FIELD)
+                return ansi_bright_blue();
+        if (d->type == TABLE_HEADER)
+                return ansi_underline();
+
+        return NULL;
+}
+
+/* Picks the color sequence for the gap to the right of the cell (table_print() emits it in front of the
+ * separator of the following column): an explicitly configured d->rgap_color wins, header cells default
+ * to underline, everything else gets NULL. */
+static const char* table_data_rgap_color(TableData *d) {
+        assert(d);
+
+        if (d->rgap_color)
+                return d->rgap_color;
+
+        if (d->type == TABLE_HEADER)
+                return ansi_underline();
+
+        return NULL;
+}
+
+/* Renders the table t as text to f (stdout if f is NULL).
+ *
+ * A sizing loop (at most two rounds) collects the requested/minimum/maximum width and the weight of
+ * every displayed column, derives the effective table width and hands out column widths: surplus room
+ * is distributed by weight, a shortage is resolved by starting from the minimum widths. The second
+ * round only happens if the table had to be compressed and at least one cell reported itself as "soft",
+ * so that such cells get reformatted for the widths just computed. Afterwards all rows are written,
+ * honoring t->header, the sort order and the display map; cells spanning multiple lines are printed as
+ * consecutive sub-lines.
+ *
+ * Returns a negative errno-style error on failure, otherwise whatever fflush_and_check() returns. */
+int table_print(Table *t, FILE *f) {
+        size_t n_rows, *minimum_width, *maximum_width, display_columns, *requested_width,
+                table_minimum_width, table_maximum_width, table_requested_width, table_effective_width,
+                *width = NULL;
+        _cleanup_free_ size_t *sorted = NULL;
+        uint64_t *column_weight, weight_sum;
+        int r;
+
+        assert(t);
+
+        if (!f)
+                f = stdout;
+
+        /* Ensure we have no incomplete rows */
+        assert(t->n_cells % t->n_columns == 0);
+
+        n_rows = t->n_cells / t->n_columns;
+        assert(n_rows > 0); /* at least the header row must be complete */
+
+        if (t->sort_map) {
+                /* If sorting is requested, let's calculate an index table we use to lookup the actual index to display with. */
+
+                sorted = new(size_t, n_rows);
+                if (!sorted)
+                        return -ENOMEM;
+
+                for (size_t i = 0; i < n_rows; i++)
+                        sorted[i] = i * t->n_columns;
+
+                typesafe_qsort_r(sorted, n_rows, table_data_compare, t);
+        }
+
+        if (t->display_map)
+                display_columns = t->n_display_map;
+        else
+                display_columns = t->n_columns;
+
+        assert(display_columns > 0);
+
+        minimum_width = newa(size_t, display_columns);
+        maximum_width = newa(size_t, display_columns);
+        requested_width = newa(size_t, display_columns);
+        column_weight = newa0(uint64_t, display_columns);
+
+        for (size_t j = 0; j < display_columns; j++) {
+                minimum_width[j] = 1;
+                maximum_width[j] = SIZE_MAX;
+        }
+
+        for (unsigned pass = 0; pass < 2; pass++) {
+                /* First pass: determine column sizes */
+
+                for (size_t j = 0; j < display_columns; j++)
+                        requested_width[j] = SIZE_MAX;
+
+                bool any_soft = false;
+
+                for (size_t i = t->header ? 0 : 1; i < n_rows; i++) {
+                        TableData **row;
+
+                        /* Note that we don't care about ordering at this time, as we just want to determine column sizes,
+                         * hence we don't care for sorted[] during the first pass. */
+                        row = t->data + i * t->n_columns;
+
+                        for (size_t j = 0; j < display_columns; j++) {
+                                TableData *d;
+                                size_t req_width, req_height;
+
+                                assert_se(d = row[t->display_map ? t->display_map[j] : j]);
+
+                                r = table_data_requested_width_height(t, d,
+                                                                      width ? width[j] : SIZE_MAX,
+                                                                      &req_width, &req_height, &any_soft);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0) { /* Truncated because too many lines? */
+                                        _cleanup_free_ char *last = NULL;
+                                        const char *field;
+
+                                        /* If we are going to show only the first few lines of a cell that has
+                                         * multiple lines, make sure that we have enough space horizontally to show an
+                                         * ellipsis. Hence, let's figure out the last line, and account for its
+                                         * length plus ellipsis. */
+
+                                        field = table_data_format(t, d, false,
+                                                                  width ? width[j] : SIZE_MAX,
+                                                                  &any_soft);
+                                        if (!field)
+                                                return -ENOMEM;
+
+                                        assert_se(t->cell_height_max > 0);
+                                        r = string_extract_line(field, t->cell_height_max-1, &last);
+                                        if (r < 0)
+                                                return r;
+
+                                        req_width = MAX(req_width,
+                                                        utf8_console_width(last) +
+                                                        utf8_console_width(special_glyph(SPECIAL_GLYPH_ELLIPSIS)));
+                                }
+
+                                /* Determine the biggest width that any cell in this column would like to have */
+                                if (requested_width[j] == SIZE_MAX ||
+                                    requested_width[j] < req_width)
+                                        requested_width[j] = req_width;
+
+                                /* Determine the minimum width any cell in this column needs */
+                                if (minimum_width[j] < d->minimum_width)
+                                        minimum_width[j] = d->minimum_width;
+
+                                /* Determine the maximum width any cell in this column needs */
+                                if (d->maximum_width != SIZE_MAX &&
+                                    (maximum_width[j] == SIZE_MAX ||
+                                     maximum_width[j] > d->maximum_width))
+                                        maximum_width[j] = d->maximum_width;
+
+                                /* Determine the full columns weight */
+                                column_weight[j] += d->weight;
+                        }
+                }
+
+                /* One space between each column */
+                table_requested_width = table_minimum_width = table_maximum_width = display_columns - 1;
+
+                /* Calculate the total weight for all columns, plus the minimum, maximum and requested width for the table. */
+                weight_sum = 0;
+                for (size_t j = 0; j < display_columns; j++) {
+                        weight_sum += column_weight[j];
+
+                        table_minimum_width += minimum_width[j];
+
+                        /* SIZE_MAX means "unlimited". Once a single column is unlimited the whole table is,
+                         * hence stop adding at that point: adding a later column's finite maximum on top of
+                         * SIZE_MAX would wrap around and leave us with a tiny bogus table maximum. */
+                        if (maximum_width[j] == SIZE_MAX)
+                                table_maximum_width = SIZE_MAX;
+                        else if (table_maximum_width != SIZE_MAX)
+                                table_maximum_width += maximum_width[j];
+
+                        table_requested_width += requested_width[j];
+                }
+
+                /* Calculate effective table width */
+                /* NOTE(review): the check below consults STDOUT_FILENO even if f is some other stream (for
+                 * example the memory stream table_format() passes in) — confirm that this is intended. */
+                if (t->width != 0 && t->width != SIZE_MAX)
+                        table_effective_width = t->width;
+                else if (t->width == 0 ||
+                         ((pass > 0 || !any_soft) && (pager_have() || !isatty(STDOUT_FILENO))))
+                        table_effective_width = table_requested_width;
+                else
+                        table_effective_width = MIN(table_requested_width, columns());
+
+                if (table_maximum_width != SIZE_MAX && table_effective_width > table_maximum_width)
+                        table_effective_width = table_maximum_width;
+
+                if (table_effective_width < table_minimum_width)
+                        table_effective_width = table_minimum_width;
+
+                if (!width)
+                        width = newa(size_t, display_columns);
+
+                if (table_effective_width >= table_requested_width) {
+                        size_t extra;
+
+                        /* We have extra room, let's distribute it among columns according to their weights. We first provide
+                         * each column with what it asked for and then distribute the rest. */
+
+                        extra = table_effective_width - table_requested_width;
+
+                        for (size_t j = 0; j < display_columns; j++) {
+                                size_t delta;
+
+                                if (weight_sum == 0)
+                                        width[j] = requested_width[j] + extra / (display_columns - j); /* Avoid division by zero */
+                                else
+                                        width[j] = requested_width[j] + (extra * column_weight[j]) / weight_sum;
+
+                                if (maximum_width[j] != SIZE_MAX && width[j] > maximum_width[j])
+                                        width[j] = maximum_width[j];
+
+                                if (width[j] < minimum_width[j])
+                                        width[j] = minimum_width[j];
+
+                                delta = LESS_BY(width[j], requested_width[j]);
+
+                                /* Subtract what we just added from the rest */
+                                if (extra > delta)
+                                        extra -= delta;
+                                else
+                                        extra = 0;
+
+                                assert(weight_sum >= column_weight[j]);
+                                weight_sum -= column_weight[j];
+                        }
+
+                        break; /* Every column should be happy, no need to repeat calculations. */
+                } else {
+                        /* We need to compress the table, columns can't get what they asked for. We first provide each column
+                         * with the minimum they need, and then distribute anything left. */
+                        bool finalize = false;
+                        size_t extra;
+
+                        extra = table_effective_width - table_minimum_width;
+
+                        /* SIZE_MAX marks columns that have no width assigned yet */
+                        for (size_t j = 0; j < display_columns; j++)
+                                width[j] = SIZE_MAX;
+
+                        for (;;) {
+                                bool restart = false;
+
+                                for (size_t j = 0; j < display_columns; j++) {
+                                        size_t delta, w;
+
+                                        /* Did this column already get something assigned? If so, let's skip to the next */
+                                        if (width[j] != SIZE_MAX)
+                                                continue;
+
+                                        if (weight_sum == 0)
+                                                w = minimum_width[j] + extra / (display_columns - j); /* avoid division by zero */
+                                        else
+                                                w = minimum_width[j] + (extra * column_weight[j]) / weight_sum;
+
+                                        if (w >= requested_width[j]) {
+                                                /* Never give more than requested. If we hit a column like this, there's more
+                                                 * space to allocate to other columns which means we need to restart the
+                                                 * iteration. However, if we hit a column like this, let's assign it the space
+                                                 * it wanted for good early. */
+
+                                                w = requested_width[j];
+                                                restart = true;
+
+                                        } else if (!finalize)
+                                                continue;
+
+                                        width[j] = w;
+
+                                        assert(w >= minimum_width[j]);
+                                        delta = w - minimum_width[j];
+
+                                        assert(delta <= extra);
+                                        extra -= delta;
+
+                                        assert(weight_sum >= column_weight[j]);
+                                        weight_sum -= column_weight[j];
+
+                                        if (restart && !finalize)
+                                                break;
+                                }
+
+                                if (finalize)
+                                        break;
+
+                                if (!restart)
+                                        finalize = true;
+                        }
+
+                        if (!any_soft) /* Some columns got less than requested. If some cells were "soft",
+                                        * let's try to reformat them with the new widths. Otherwise, let's
+                                        * move on. */
+                                break;
+                }
+        }
+
+        /* Second pass: show output */
+        for (size_t i = t->header ? 0 : 1; i < n_rows; i++) {
+                size_t n_subline = 0;
+                bool more_sublines;
+                TableData **row;
+
+                if (sorted)
+                        row = t->data + sorted[i];
+                else
+                        row = t->data + i * t->n_columns;
+
+                /* One iteration per output line of this row: n_subline selects which line of each cell is shown */
+                do {
+                        const char *gap_color = NULL;
+                        more_sublines = false;
+
+                        for (size_t j = 0; j < display_columns; j++) {
+                                _cleanup_free_ char *buffer = NULL, *extracted = NULL;
+                                bool lines_truncated = false;
+                                const char *field, *color = NULL;
+                                TableData *d;
+                                size_t l;
+
+                                assert_se(d = row[t->display_map ? t->display_map[j] : j]);
+
+                                field = table_data_format(t, d, false, width[j], NULL);
+                                if (!field)
+                                        return -ENOMEM;
+
+                                r = string_extract_line(field, n_subline, &extracted);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0) {
+                                        /* There are more lines to come */
+                                        if ((t->cell_height_max == SIZE_MAX || n_subline + 1 < t->cell_height_max))
+                                                more_sublines = true; /* There are more lines to come */
+                                        else
+                                                lines_truncated = true;
+                                }
+                                if (extracted)
+                                        field = extracted;
+
+                                l = utf8_console_width(field);
+                                if (l > width[j]) {
+                                        /* Field is wider than allocated space. Let's ellipsize */
+
+                                        buffer = ellipsize(field, width[j], /* ellipsize at the end if we truncated coming lines, otherwise honour configuration */
+                                                           lines_truncated ? 100 : d->ellipsize_percent);
+                                        if (!buffer)
+                                                return -ENOMEM;
+
+                                        field = buffer;
+                                } else {
+                                        if (lines_truncated) {
+                                                _cleanup_free_ char *padded = NULL;
+
+                                                /* We truncated more lines of this cell, let's add an
+                                                 * ellipsis. We first append it, but that might make our
+                                                 * string grow above what we have space for, hence ellipsize
+                                                 * right after. This will truncate the ellipsis and add a new
+                                                 * one. */
+
+                                                padded = strjoin(field, special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+                                                if (!padded)
+                                                        return -ENOMEM;
+
+                                                buffer = ellipsize(padded, width[j], 100);
+                                                if (!buffer)
+                                                        return -ENOMEM;
+
+                                                field = buffer;
+                                                l = utf8_console_width(field);
+                                        }
+
+                                        if (l < width[j]) {
+                                                _cleanup_free_ char *aligned = NULL;
+                                                /* Field is shorter than allocated space. Let's align with spaces */
+
+                                                aligned = align_string_mem(field, d->url, width[j], d->align_percent);
+                                                if (!aligned)
+                                                        return -ENOMEM;
+
+                                                /* Drop trailing white spaces of last column when no cosmetics is set. */
+                                                if (j == display_columns - 1 &&
+                                                    (!colors_enabled() || !table_data_color(d)) &&
+                                                    (!urlify_enabled() || !d->url))
+                                                        delete_trailing_chars(aligned, NULL);
+
+                                                free_and_replace(buffer, aligned);
+                                                field = buffer;
+                                        }
+                                }
+
+                                /* align_string_mem() already applied the URL in the padded case above, hence
+                                 * this only covers cells that fill their column completely. */
+                                if (l >= width[j] && d->url) {
+                                        _cleanup_free_ char *clickable = NULL;
+
+                                        r = terminal_urlify(d->url, field, &clickable);
+                                        if (r < 0)
+                                                return r;
+
+                                        free_and_replace(buffer, clickable);
+                                        field = buffer;
+                                }
+
+                                if (colors_enabled() && gap_color)
+                                        fputs(gap_color, f);
+
+                                if (j > 0)
+                                        fputc(' ', f); /* column separator left of cell */
+
+                                if (colors_enabled()) {
+                                        color = table_data_color(d);
+
+                                        /* Undo gap color */
+                                        if (gap_color)
+                                                fputs(ANSI_NORMAL, f);
+
+                                        if (color)
+                                                fputs(color, f);
+                                }
+
+                                fputs(field, f);
+
+                                if (colors_enabled() && color)
+                                        fputs(ANSI_NORMAL, f);
+
+                                gap_color = table_data_rgap_color(d);
+                        }
+
+                        fputc('\n', f);
+                        n_subline ++;
+                } while (more_sublines);
+        }
+
+        return fflush_and_check(f);
+}
+
+/* Like table_print(), but renders into a memory stream and hands the outcome of memstream_finalize()
+ * back through *ret. Returns -ENOMEM if the stream cannot be set up and otherwise passes on the result
+ * of table_print() (on failure) or of memstream_finalize(). */
+int table_format(Table *t, char **ret) {
+
_cleanup_(memstream_done) MemStream m = {};
+        FILE *f;
+        int r;
+
+        assert(t);
+        assert(ret);
+
+        f = memstream_init(&m);
+        if (!f)
+                return -ENOMEM;
+
+        r = table_print(t, f); /* render into the memory stream rather than a real file */
+        if (r < 0)
+                return r;
+
+        return memstream_finalize(&m, ret, NULL);
+}
+/* Returns the number of complete rows stored in t (cells / columns, which includes the header row that
+ * table_print() expects in row 0), or 0 if t is NULL. */
+size_t table_get_rows(Table *t) {
+        if (!t)
+                return 0;
+
+        assert(t->n_columns > 0);
+        return t->n_cells / t->n_columns;
+}
+/* Returns the number of columns of t, or 0 if t is NULL. */
+size_t table_get_columns(Table *t) {
+        if (!t)
+                return 0;
+
+        assert(t->n_columns > 0);
+        return t->n_columns;
+}
+/* Returns the column index the next added cell will land in, i.e. how many cells the last, still
+ * incomplete row already has (0 if the last row is complete or t is NULL). */
+size_t table_get_current_column(Table *t) {
+        if (!t)
+                return 0;
+
+        assert(t->n_columns > 0);
+        return t->n_cells % t->n_columns;
+}
+/* Sets or clears the "reverse" flag of the given column. The per-column flag array is only allocated
+ * once a flag is actually switched on. Returns 0 on success, -ENOMEM if that allocation fails. */
+int table_set_reverse(Table *t, size_t column, bool b) {
+        assert(t);
+        assert(column < t->n_columns);
+
+        if (!t->reverse_map) {
+                if (!b)
+                        return 0; /* nothing allocated yet means nothing is reversed: no-op */
+
+                t->reverse_map = new0(bool, t->n_columns);
+                if (!t->reverse_map)
+                        return -ENOMEM;
+        }
+
+        t->reverse_map[column] = b;
+        return 0;
+}
+/* Returns the cell handle for the given row/column pair, or NULL if the column does not exist or the
+ * resulting cell index lies beyond the cells added so far. */
+TableCell *table_get_cell(Table *t, size_t row, size_t column) {
+        size_t i;
+
+        assert(t);
+
+        if (column >= t->n_columns)
+                return NULL;
+
+        i = row * t->n_columns + column; /* cells are stored row after row */
+        if (i >= t->n_cells)
+                return NULL;
+
+        return TABLE_INDEX_TO_CELL(i);
+}
+/* Returns a pointer to the data stored in the cell (d->data), or NULL if table_get_data() finds no
+ * data for this cell handle. */
+const void *table_get(Table *t, TableCell *cell) {
+        TableData *d;
+
+        assert(t);
+
+        d = table_get_data(t, cell);
+        if (!d)
+                return NULL;
+
+        return d->data;
+}
+/* Convenience combination of table_get_cell() and table_get(); NULL if there is no such cell. */
+const void* table_get_at(Table *t, size_t row, size_t column) {
+        TableCell *cell;
+
+        cell = table_get_cell(t, row, column);
+        if (!cell)
+                return NULL;
+
+        return table_get(t, cell);
+}
+/* Converts the payload of a single cell into a newly created JSON variant stored in *ret. Returns what
+ * the respective json_variant_new_*()/json_build() call returns, or -EINVAL for a type that has no
+ * mapping here. */
+static int table_data_to_json(TableData *d, JsonVariant **ret) {
+
+        switch (d->type) {
+
+        case TABLE_EMPTY:
+                return json_variant_new_null(ret);
+
+        case TABLE_STRING:
+        case TABLE_PATH:
+        case TABLE_PATH_BASENAME:
+        case TABLE_FIELD:
+        case TABLE_HEADER:
+                return json_variant_new_string(ret, d->string);
+
+        case TABLE_STRV:
+        case TABLE_STRV_WRAPPED:
+                return json_variant_new_array_strv(ret, d->strv);
+
+        case TABLE_BOOLEAN_CHECKMARK:
+        case
TABLE_BOOLEAN:
+                return json_variant_new_boolean(ret, d->boolean);
+                /* All timestamp flavours: the raw d->timestamp value, or null for USEC_INFINITY */
+        case TABLE_TIMESTAMP:
+        case TABLE_TIMESTAMP_UTC:
+        case TABLE_TIMESTAMP_RELATIVE:
+        case TABLE_TIMESTAMP_RELATIVE_MONOTONIC:
+        case TABLE_TIMESTAMP_LEFT:
+        case TABLE_TIMESTAMP_DATE:
+                if (d->timestamp == USEC_INFINITY)
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_unsigned(ret, d->timestamp);
+                /* All timespan flavours: the raw d->timespan value, or null for USEC_INFINITY */
+        case TABLE_TIMESPAN:
+        case TABLE_TIMESPAN_MSEC:
+        case TABLE_TIMESPAN_DAY:
+                if (d->timespan == USEC_INFINITY)
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_unsigned(ret, d->timespan);
+                /* Sizes and rates: the raw number without any unit, or null for UINT64_MAX */
+        case TABLE_SIZE:
+        case TABLE_BPS:
+                if (d->size == UINT64_MAX)
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_unsigned(ret, d->size);
+                /* Plain integers map 1:1; the *_HEX types are emitted as ordinary numbers, too */
+        case TABLE_INT:
+                return json_variant_new_integer(ret, d->int_val);
+
+        case TABLE_INT8:
+                return json_variant_new_integer(ret, d->int8);
+
+        case TABLE_INT16:
+                return json_variant_new_integer(ret, d->int16);
+
+        case TABLE_INT32:
+                return json_variant_new_integer(ret, d->int32);
+
+        case TABLE_INT64:
+                return json_variant_new_integer(ret, d->int64);
+
+        case TABLE_UINT:
+                return json_variant_new_unsigned(ret, d->uint_val);
+
+        case TABLE_UINT8:
+                return json_variant_new_unsigned(ret, d->uint8);
+
+        case TABLE_UINT16:
+                return json_variant_new_unsigned(ret, d->uint16);
+
+        case TABLE_UINT32:
+        case TABLE_UINT32_HEX:
+                return json_variant_new_unsigned(ret, d->uint32);
+
+        case TABLE_UINT64:
+        case TABLE_UINT64_HEX:
+                return json_variant_new_unsigned(ret, d->uint64);
+
+        case TABLE_PERCENT:
+                return json_variant_new_integer(ret, d->percent);
+                /* Interfaces are emitted by index (not by name); indexes <= 0 become null */
+        case TABLE_IFINDEX:
+                if (d->ifindex <= 0)
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_integer(ret, d->ifindex);
+                /* Addresses are emitted as arrays of their raw bytes */
+        case TABLE_IN_ADDR:
+                return json_variant_new_array_bytes(ret, &d->address, FAMILY_ADDRESS_SIZE(AF_INET));
+
+        case TABLE_IN6_ADDR:
+                return json_variant_new_array_bytes(ret, &d->address, FAMILY_ADDRESS_SIZE(AF_INET6));
+
+        case TABLE_ID128:
+                return json_variant_new_id128(ret, d->id128);
+
+        case TABLE_UUID:
+                return json_variant_new_uuid(ret, d->id128);
+                /* Invalid UIDs/GIDs/PIDs/signals/modes become null, valid ones plain numbers */
+        case TABLE_UID:
+                if (!uid_is_valid(d->uid))
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_integer(ret, d->uid);
+
+        case TABLE_GID:
+                if (!gid_is_valid(d->gid))
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_integer(ret, d->gid);
+
+        case TABLE_PID:
+                if (!pid_is_valid(d->pid))
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_integer(ret, d->pid);
+
+        case TABLE_SIGNAL:
+                if (!SIGNAL_VALID(d->int_val))
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_integer(ret, d->int_val);
+
+        case TABLE_MODE:
+        case TABLE_MODE_INODE_TYPE:
+                if (d->mode == MODE_INVALID)
+                        return json_variant_new_null(ret);
+
+                return json_variant_new_unsigned(ret, d->mode);
+                /* Device numbers become a two element array [major, minor]; null if devnum_is_zero() */
+        case TABLE_DEVNUM:
+                if (devnum_is_zero(d->devnum))
+                        return json_variant_new_null(ret);
+
+                return json_build(ret, JSON_BUILD_ARRAY(
+                                                  JSON_BUILD_UNSIGNED(major(d->devnum)),
+                                                  JSON_BUILD_UNSIGNED(minor(d->devnum))));
+
+        default:
+                return -EINVAL;
+        }
+}
+
+static char* string_to_json_field_name(const char *f) {
+        /* Tries to make a string more suitable as JSON field name. There are no strict rules defined what a
+         * field name can be hence this is a bit vague and black magic. Right now we only convert spaces to
+         * underscores and leave everything as is.
*/ + + char *c = strdup(f); + if (!c) + return NULL; + + for (char *x = c; *x; x++) + if (isspace(*x)) + *x = '_'; + + return c; +} + +static int table_make_json_field_name(Table *t, TableData *d, char **ret) { + _cleanup_free_ char *mangled = NULL; + const char *n; + + assert(t); + assert(d); + assert(ret); + + if (IN_SET(d->type, TABLE_HEADER, TABLE_FIELD)) + n = d->string; + else { + n = table_data_format(t, d, /* avoid_uppercasing= */ true, SIZE_MAX, NULL); + if (!n) + return -ENOMEM; + } + + mangled = string_to_json_field_name(n); + if (!mangled) + return -ENOMEM; + + *ret = TAKE_PTR(mangled); + return 0; +} + +static const char *table_get_json_field_name(Table *t, size_t idx) { + assert(t); + + return idx < t->n_json_fields ? t->json_fields[idx] : NULL; +} + +static int table_to_json_regular(Table *t, JsonVariant **ret) { + JsonVariant **rows = NULL, **elements = NULL; + _cleanup_free_ size_t *sorted = NULL; + size_t n_rows, display_columns; + int r; + + assert(t); + assert(!t->vertical); + + /* Ensure we have no incomplete rows */ + assert(t->n_columns > 0); + assert(t->n_cells % t->n_columns == 0); + + n_rows = t->n_cells / t->n_columns; + assert(n_rows > 0); /* at least the header row must be complete */ + + if (t->sort_map) { + /* If sorting is requested, let's calculate an index table we use to lookup the actual index to display with. 
*/
+
+                sorted = new(size_t, n_rows);
+                if (!sorted)
+                        return -ENOMEM;
+
+                for (size_t i = 0; i < n_rows; i++)
+                        sorted[i] = i * t->n_columns;
+
+                typesafe_qsort_r(sorted, n_rows, table_data_compare, t);
+        }
+
+        if (t->display_map)
+                display_columns = t->n_display_map;
+        else
+                display_columns = t->n_columns;
+        assert(display_columns > 0);
+        /* elements[] holds key/value pairs: slot j*2 is the field name of column j, slot j*2+1 its value */
+        elements = new0(JsonVariant*, display_columns * 2);
+        if (!elements)
+                return -ENOMEM;
+
+        CLEANUP_ARRAY(elements, (size_t) { display_columns * 2 }, json_variant_unref_many);
+
+        for (size_t j = 0; j < display_columns; j++) {
+                _cleanup_free_ char *mangled = NULL;
+                const char *n;
+                size_t c;
+
+                c = t->display_map ? t->display_map[j] : j;
+
+                /* Use explicitly set JSON field name, if we have one. Otherwise mangle the column field value. */
+                n = table_get_json_field_name(t, c);
+                if (!n) {
+                        r = table_make_json_field_name(t, ASSERT_PTR(t->data[c]), &mangled);
+                        if (r < 0)
+                                return r;
+
+                        n = mangled;
+                }
+
+                r = json_variant_new_string(elements + j*2, n);
+                if (r < 0)
+                        return r;
+        }
+        /* One JSON object per row, except for row 0 which only supplied the field names above */
+        rows = new0(JsonVariant*, n_rows-1);
+        if (!rows)
+                return -ENOMEM;
+
+        CLEANUP_ARRAY(rows, (size_t) { n_rows - 1 }, json_variant_unref_many);
+
+        for (size_t i = 1; i < n_rows; i++) {
+                TableData **row;
+
+                if (sorted)
+                        row = t->data + sorted[i];
+                else
+                        row = t->data + i * t->n_columns;
+
+                for (size_t j = 0; j < display_columns; j++) {
+                        TableData *d;
+                        size_t k;
+
+                        assert_se(d = row[t->display_map ? t->display_map[j] : j]);
+
+                        k = j*2+1;
+                        elements[k] = json_variant_unref(elements[k]); /* drop the value left over from the previous row */
+
+                        r = table_data_to_json(d, elements + k);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = json_variant_new_object(rows + i - 1, elements, display_columns * 2);
+                if (r < 0)
+                        return r;
+        }
+
+        return json_variant_new_array(ret, rows, n_rows - 1);
+}
+/* JSON conversion for vertical tables: produces one single object. The table must have exactly two
+ * columns (-EINVAL otherwise). The first row is skipped; of every following row the first cell supplies
+ * the key (explicitly configured JSON field name, looked up by row index minus one, or else derived from
+ * the cell) and the second cell the value. */
+static int table_to_json_vertical(Table *t, JsonVariant **ret) {
+        JsonVariant **elements = NULL;
+        size_t n_elements = 0;
+        int r;
+
+        assert(t);
+        assert(t->vertical);
+
+        if (t->n_columns != 2)
+                return -EINVAL;
+
+        /* Ensure we have no incomplete rows */
+        assert(t->n_cells % t->n_columns == 0);
+
+        elements = new0(JsonVariant *, t->n_cells);
+        if (!elements)
+                return -ENOMEM;
+
+        CLEANUP_ARRAY(elements, n_elements, json_variant_unref_many);
+
+        for (size_t i = t->n_columns; i < t->n_cells; i++) {
+                /* Cells in the first column are keys, all others are values */
+                if (i % t->n_columns == 0) {
+                        _cleanup_free_ char *mangled = NULL;
+                        const char *n;
+
+                        n = table_get_json_field_name(t, i / t->n_columns - 1);
+                        if (!n) {
+                                r = table_make_json_field_name(t, ASSERT_PTR(t->data[i]), &mangled);
+                                if (r < 0)
+                                        return r;
+
+                                n = mangled;
+                        }
+
+                        r = json_variant_new_string(elements + n_elements, n);
+                } else
+                        r = table_data_to_json(t->data[i], elements + n_elements);
+                if (r < 0)
+                        return r;
+
+                n_elements++;
+        }
+
+        return json_variant_new_object(ret, elements, n_elements);
+}
+/* Converts the table into a JSON variant, dispatching on t->vertical to one of the two helpers. */
+int table_to_json(Table *t, JsonVariant **ret) {
+        assert(t);
+
+        if (t->vertical)
+                return table_to_json_vertical(t, ret);
+
+        return table_to_json_regular(t, ret);
+}
+/* Writes the table as JSON to f (stdout if f is NULL), formatted according to flags. If flags contains
+ * JSON_FORMAT_OFF this falls back to the regular text output of table_print(). Returns a negative
+ * errno-style error if the conversion fails, otherwise the result of fflush_and_check(). */
+int table_print_json(Table *t, FILE *f, JsonFormatFlags flags) {
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        int r;
+
+        assert(t);
+
+        if (flags & JSON_FORMAT_OFF) /* If JSON output is turned off, use regular output */
+                return table_print(t, f);
+
+        if (!f)
+                f = stdout;
+
+        r = table_to_json(t, &v);
+        if (r < 0)
+                return r;
+
+        json_variant_dump(v, flags, f, NULL);
+
+        return fflush_and_check(f);
+}
+
+int table_print_with_pager(
+                Table *t,
+
JsonFormatFlags json_format_flags, + PagerFlags pager_flags, + bool show_header) { + + bool saved_header; + int r; + + assert(t); + + /* An all-in-one solution for showing tables, and turning on a pager first. Also optionally suppresses + * the table header and logs about any error. */ + + if (json_format_flags & (JSON_FORMAT_OFF|JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO)) + pager_open(pager_flags); + + saved_header = t->header; + t->header = show_header; + r = table_print_json(t, stdout, json_format_flags); + t->header = saved_header; + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +int table_set_json_field_name(Table *t, size_t idx, const char *name) { + int r; + + assert(t); + + if (name) { + size_t m; + + m = MAX(idx + 1, t->n_json_fields); + if (!GREEDY_REALLOC0(t->json_fields, m)) + return -ENOMEM; + + r = free_and_strdup(t->json_fields + idx, name); + if (r < 0) + return r; + + t->n_json_fields = m; + return r; + } else { + if (idx >= t->n_json_fields) + return 0; + + t->json_fields[idx] = mfree(t->json_fields[idx]); + return 1; + } +} diff --git a/src/shared/format-table.h b/src/shared/format-table.h new file mode 100644 index 0000000..37bfbca --- /dev/null +++ b/src/shared/format-table.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "json.h" +#include "macro.h" +#include "pager.h" + +typedef enum TableDataType { + TABLE_EMPTY, + TABLE_STRING, + TABLE_HEADER, /* in regular mode: the cells in the first row, that carry the column names */ + TABLE_FIELD, /* in vertical mode: the cells in the first column, that carry the field names */ + TABLE_STRV, + TABLE_STRV_WRAPPED, + TABLE_PATH, + TABLE_PATH_BASENAME, /* like TABLE_PATH, but display only last path element (i.e. 
the "basename") in regular output */
+        TABLE_BOOLEAN,
+        TABLE_BOOLEAN_CHECKMARK,
+        TABLE_TIMESTAMP,
+        TABLE_TIMESTAMP_UTC,
+        TABLE_TIMESTAMP_RELATIVE,
+        TABLE_TIMESTAMP_RELATIVE_MONOTONIC,
+        TABLE_TIMESTAMP_LEFT,
+        TABLE_TIMESTAMP_DATE,
+        TABLE_TIMESPAN,
+        TABLE_TIMESPAN_MSEC,
+        TABLE_TIMESPAN_DAY,
+        TABLE_SIZE,
+        TABLE_BPS,
+        TABLE_INT,
+        TABLE_INT8,
+        TABLE_INT16,
+        TABLE_INT32,
+        TABLE_INT64,
+        TABLE_UINT,
+        TABLE_UINT8,
+        TABLE_UINT16,
+        TABLE_UINT32,
+        TABLE_UINT32_HEX,
+        TABLE_UINT64,
+        TABLE_UINT64_HEX,
+        TABLE_PERCENT,
+        TABLE_IFINDEX,
+        TABLE_IN_ADDR,             /* Takes a union in_addr_union (or a struct in_addr) */
+        TABLE_IN6_ADDR,            /* Takes a union in_addr_union (or a struct in6_addr) */
+        TABLE_ID128,
+        TABLE_UUID,
+        TABLE_UID,
+        TABLE_GID,
+        TABLE_PID,
+        TABLE_SIGNAL,
+        TABLE_MODE,                /* as in UNIX file mode (mode_t), in typical octal output */
+        TABLE_MODE_INODE_TYPE,     /* also mode_t, but displays only the inode type as string */
+        TABLE_DEVNUM,              /* a dev_t, displayed in the usual major:minor way */
+        _TABLE_DATA_TYPE_MAX,      /* also what the table_add_many() macro appends to end its argument list */
+
+        /* The following are not really data types, but commands for table_add_many() to make changes to
+         * a cell just added. */
+        TABLE_SET_MINIMUM_WIDTH,
+        TABLE_SET_MAXIMUM_WIDTH,
+        TABLE_SET_WEIGHT,
+        TABLE_SET_ALIGN_PERCENT,
+        TABLE_SET_ELLIPSIZE_PERCENT,
+        TABLE_SET_COLOR,
+        TABLE_SET_RGAP_COLOR,
+        TABLE_SET_BOTH_COLORS,
+        TABLE_SET_URL,
+        TABLE_SET_UPPERCASE,
+
+        _TABLE_DATA_TYPE_INVALID = -EINVAL,
+} TableDataType;
+/* Selector passed to table_set_ersatz_string(). NOTE(review): presumably this chooses the placeholder
+ * text shown for cells without a value; the mapping to strings is not part of this header. */
+typedef enum TableErsatz {
+        TABLE_ERSATZ_EMPTY,
+        TABLE_ERSATZ_DASH,
+        TABLE_ERSATZ_UNSET,
+        TABLE_ERSATZ_NA,
+        _TABLE_ERSATZ_MAX,
+} TableErsatz;
+
+typedef struct Table Table;
+typedef struct TableCell TableCell;
+/* table_new() takes the list of header strings; the macro appends the terminating NULL itself */
+Table *table_new_internal(const char *first_header, ...) _sentinel_;
+#define table_new(...) table_new_internal(__VA_ARGS__, NULL)
+Table *table_new_raw(size_t n_columns);
+Table *table_new_vertical(void);
+Table *table_unref(Table *t);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Table*, table_unref);
+
+int table_add_cell_full(Table *t, TableCell **ret_cell, TableDataType type, const void *data, size_t minimum_width, size_t maximum_width, unsigned weight, unsigned align_percent, unsigned ellipsize_percent);
+static inline int table_add_cell(Table *t, TableCell **ret_cell, TableDataType type, const void *data) { /* table_add_cell_full() with SIZE_MAX/UINT_MAX for all layout parameters */
+        return table_add_cell_full(t, ret_cell, type, data, SIZE_MAX, SIZE_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
+}
+int table_add_cell_stringf_full(Table *t, TableCell **ret_cell, TableDataType type, const char *format, ...) _printf_(4, 5);
+#define table_add_cell_stringf(t, ret_cell, format, ...) table_add_cell_stringf_full(t, ret_cell, TABLE_STRING, format, __VA_ARGS__)
+
+int table_fill_empty(Table *t, size_t until_column);
+
+int table_dup_cell(Table *t, TableCell *cell);
+
+int table_set_minimum_width(Table *t, TableCell *cell, size_t minimum_width);
+int table_set_maximum_width(Table *t, TableCell *cell, size_t maximum_width);
+int table_set_weight(Table *t, TableCell *cell, unsigned weight);
+int table_set_align_percent(Table *t, TableCell *cell, unsigned percent);
+int table_set_ellipsize_percent(Table *t, TableCell *cell, unsigned percent);
+int table_set_color(Table *t, TableCell *cell, const char *color);
+int table_set_rgap_color(Table *t, TableCell *cell, const char *color);
+int table_set_url(Table *t, TableCell *cell, const char *url);
+int table_set_uppercase(Table *t, TableCell *cell, bool b);
+
+int table_update(Table *t, TableCell *cell, TableDataType type, const void *data);
+/* table_add_many() takes a list of type/data arguments; the macro appends _TABLE_DATA_TYPE_MAX as terminator */
+int table_add_many_internal(Table *t, TableDataType first_type, ...);
+#define table_add_many(t, ...) table_add_many_internal(t, __VA_ARGS__, _TABLE_DATA_TYPE_MAX)
+
+void table_set_header(Table *table, bool b);
+void table_set_width(Table *t, size_t width);
+void table_set_cell_height_max(Table *t, size_t height);
+void table_set_ersatz_string(Table *t, TableErsatz ersatz);
+int table_set_display_internal(Table *t, size_t first_column, ...);
+#define table_set_display(...) table_set_display_internal(__VA_ARGS__, SIZE_MAX) /* column list, SIZE_MAX appended as terminator */
+int table_set_sort_internal(Table *t, size_t first_column, ...);
+#define table_set_sort(...) table_set_sort_internal(__VA_ARGS__, SIZE_MAX) /* column list, SIZE_MAX appended as terminator */
+int table_set_reverse(Table *t, size_t column, bool b);
+int table_hide_column_from_display_internal(Table *t, ...);
+#define table_hide_column_from_display(t, ...) table_hide_column_from_display_internal(t, __VA_ARGS__, (size_t) -1)
+
+int table_print(Table *t, FILE *f); /* f == NULL means stdout */
+int table_format(Table *t, char **ret); /* like table_print(), but into a string */
+/* Returns the handle of the i-th cell of the first row: cell handles are the cell index plus one,
+ * stored in a pointer (NOTE(review): presumably so that NULL never denotes a valid cell). */
+static inline TableCell* TABLE_HEADER_CELL(size_t i) {
+        return SIZE_TO_PTR(i + 1);
+}
+
+size_t table_get_rows(Table *t); /* complete rows, 0 if t is NULL */
+size_t table_get_columns(Table *t); /* 0 if t is NULL */
+
+size_t table_get_current_column(Table *t);
+
+TableCell *table_get_cell(Table *t, size_t row, size_t column); /* NULL if there is no such cell */
+
+const void *table_get(Table *t, TableCell *cell);
+const void *table_get_at(Table *t, size_t row, size_t column);
+
+int table_to_json(Table *t, JsonVariant **ret);
+int table_print_json(Table *t, FILE *f, JsonFormatFlags json_flags); /* text output if JSON_FORMAT_OFF is set */
+
+int table_print_with_pager(Table *t, JsonFormatFlags json_format_flags, PagerFlags pager_flags, bool show_header);
+
+int table_set_json_field_name(Table *t, size_t idx, const char *name); /* name == NULL clears the entry */
+/* Shorthands for logging the three typical table failures; r is the negative errno-style error */
+#define table_log_add_error(r) \
+        log_error_errno(r, "Failed to add cells to table: %m")
+
+#define table_log_print_error(r) \
+        log_error_errno(r, "Failed to print table: %m")
+
+#define table_log_sort_error(r) \
+        log_error_errno(r, "Failed to sort table: %m")
diff --git a/src/shared/fsck-util.h b/src/shared/fsck-util.h
new file mode 100644
index 0000000..855137c
--- /dev/null
+++
b/src/shared/fsck-util.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* exit codes as defined in fsck(8) */ +enum { + FSCK_SUCCESS = 0, + FSCK_ERROR_CORRECTED = 1 << 0, + FSCK_SYSTEM_SHOULD_REBOOT = 1 << 1, + FSCK_ERRORS_LEFT_UNCORRECTED = 1 << 2, + FSCK_OPERATIONAL_ERROR = 1 << 3, + FSCK_USAGE_OR_SYNTAX_ERROR = 1 << 4, + FSCK_USER_CANCELLED = 1 << 5, + FSCK_SHARED_LIB_ERROR = 1 << 7, +}; diff --git a/src/shared/fstab-util.c b/src/shared/fstab-util.c new file mode 100644 index 0000000..55e76b6 --- /dev/null +++ b/src/shared/fstab-util.c @@ -0,0 +1,366 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "device-nodes.h" +#include "fstab-util.h" +#include "initrd-util.h" +#include "macro.h" +#include "mount-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "string-util.h" +#include "strv.h" + +bool fstab_enabled_full(int enabled) { + static int cached = -1; + bool val = true; /* If nothing specified or the check fails, then defaults to true. */ + int r; + + /* If 'enabled' is non-negative, then update the cache with it. */ + if (enabled >= 0) + cached = enabled; + + if (cached >= 0) + return cached; + + r = proc_cmdline_get_bool("fstab", PROC_CMDLINE_STRIP_RD_PREFIX|PROC_CMDLINE_TRUE_WHEN_MISSING, &val); + if (r < 0) + log_debug_errno(r, "Failed to parse fstab= kernel command line option, ignoring: %m"); + + return (cached = val); +} + +int fstab_has_fstype(const char *fstype) { + _cleanup_endmntent_ FILE *f = NULL; + struct mntent *m; + + assert(fstype); + + if (!fstab_enabled()) + return false; + + f = setmntent(fstab_path(), "re"); + if (!f) + return errno == ENOENT ? false : -errno; + + for (;;) { + errno = 0; + m = getmntent(f); + if (!m) + return errno != 0 ? 
-errno : false; + + if (streq(m->mnt_type, fstype)) + return true; + } + return false; +} + +bool fstab_is_extrinsic(const char *mount, const char *opts) { + + /* Don't bother with the OS data itself */ + if (PATH_IN_SET(mount, + "/", + "/usr", + "/etc")) + return true; + + if (PATH_STARTSWITH_SET(mount, + "/run/initramfs", /* This should stay around from before we boot until after we shut down */ + "/run/nextroot", /* Similar (though might be updated from the host) */ + "/proc", /* All of this is API VFS */ + "/sys", /* … ditto … */ + "/dev")) /* … ditto … */ + return true; + + /* If this is an initrd mount, and we are not in the initrd, then leave + * this around forever, too. */ + if (fstab_test_option(opts, "x-initrd.mount\0") && !in_initrd()) + return true; + + return false; +} + +static int fstab_is_same_node(const char *what_fstab, const char *path) { + _cleanup_free_ char *node = NULL; + + assert(what_fstab); + assert(path); + + node = fstab_node_to_udev_node(what_fstab); + if (!node) + return -ENOMEM; + + if (path_equal(node, path)) + return true; + + if (is_device_path(path) && is_device_path(node)) + return devnode_same(node, path); + + return false; +} + +int fstab_is_mount_point_full(const char *where, const char *path) { + _cleanup_endmntent_ FILE *f = NULL; + int r; + + assert(where || path); + + if (!fstab_enabled()) + return false; + + f = setmntent(fstab_path(), "re"); + if (!f) + return errno == ENOENT ? false : -errno; + + for (;;) { + struct mntent *me; + + errno = 0; + me = getmntent(f); + if (!me) + return errno != 0 ?
-errno : false; + + if (where && !path_equal(where, me->mnt_dir)) + continue; + + if (!path) + return true; + + r = fstab_is_same_node(me->mnt_fsname, path); + if (r > 0 || (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r))) + return r; + } + + return false; +} + +int fstab_filter_options( + const char *opts, + const char *names, + const char **ret_namefound, + char **ret_value, + char ***ret_values, + char **ret_filtered) { + + const char *namefound = NULL, *x; + _cleanup_strv_free_ char **stor = NULL, **values = NULL; + _cleanup_free_ char *value = NULL, **filtered = NULL; + int r; + + assert(names && *names); + assert(!(ret_value && ret_values)); + + if (!opts) + goto answer; + + /* Finds any options matching 'names', and returns: + * - the last matching option name in ret_namefound, + * - the last matching value in ret_value, + * - any matching values in ret_values, + * - the rest of the option string in ret_filtered. + * + * If !ret_value and !ret_values and !ret_filtered, this function is not allowed to fail. + * + * Returns negative on error, true if any matching options were found, false otherwise. */ + + if (ret_filtered || ret_value || ret_values) { + /* For backwards compatibility, we need to pass-through escape characters. + * The only ones we "consume" are the ones used as "\," or "\\". */ + r = strv_split_full(&stor, opts, ",", EXTRACT_UNESCAPE_SEPARATORS | EXTRACT_UNESCAPE_RELAX); + if (r < 0) + return r; + + filtered = memdup(stor, sizeof(char*) * (strv_length(stor) + 1)); + if (!filtered) + return -ENOMEM; + + char **t = filtered; + for (char **s = t; *s; s++) { + NULSTR_FOREACH(name, names) { + x = startswith(*s, name); + if (!x) + continue; + /* Match name, but when ret_values, only when followed by assignment. 
*/ + if (*x == '=' || (!ret_values && *x == '\0')) { + /* Keep the last occurrence found */ + namefound = name; + goto found; + } + } + + *t = *s; + t++; + continue; + found: + if (ret_value || ret_values) { + assert(IN_SET(*x, '=', '\0')); + + if (ret_value) { + r = free_and_strdup(&value, *x == '=' ? x + 1 : NULL); + if (r < 0) + return r; + } else if (*x) { + r = strv_extend(&values, x + 1); + if (r < 0) + return r; + } + } + } + *t = NULL; + } else + for (const char *word = opts;;) { + const char *end = word; + + /* Look for a *non-escaped* comma separator. Only commas and backslashes can be + * escaped, so "\," and "\\" are the only valid escape sequences, and we can do a + * very simple test here. */ + for (;;) { + end += strcspn(end, ",\\"); + + if (IN_SET(*end, ',', '\0')) + break; + assert(*end == '\\'); + end ++; /* Skip the backslash */ + if (*end != '\0') + end ++; /* Skip the escaped char, but watch out for a trailing comma */ + } + + NULSTR_FOREACH(name, names) { + if (end < word + strlen(name)) + continue; + if (!strneq(word, name, strlen(name))) + continue; + + /* We know that the string is NUL terminated, so *x is valid */ + x = word + strlen(name); + if (IN_SET(*x, '\0', '=', ',')) { + namefound = name; + break; + } + } + + if (*end) + word = end + 1; + else + break; + } + +answer: + if (ret_namefound) + *ret_namefound = namefound; + if (ret_filtered) { + char *f; + + f = strv_join_full(filtered, ",", NULL, true); + if (!f) + return -ENOMEM; + + *ret_filtered = f; + } + if (ret_value) + *ret_value = TAKE_PTR(value); + if (ret_values) + *ret_values = TAKE_PTR(values); + + return !!namefound; +} + +int fstab_find_pri(const char *options, int *ret) { + _cleanup_free_ char *opt = NULL; + int r, pri; + + assert(ret); + + r = fstab_filter_options(options, "pri\0", NULL, &opt, NULL, NULL); + if (r < 0) + return r; + if (r == 0 || !opt) + return 0; + + r = safe_atoi(opt, &pri); + if (r < 0) + return r; + + *ret = pri; + return 1; +} + +static char 
*unquote(const char *s, const char* quotes) { + size_t l; + assert(s); + + /* This is rather stupid, simply removes the leading and + * trailing quote if there is one (both must be the same character from 'quotes'). Doesn't care about + * escaping or anything. + * + * DON'T USE THIS FOR NEW CODE ANYMORE! */ + + l = strlen(s); + if (l < 2) + return strdup(s); + + if (strchr(quotes, s[0]) && s[l-1] == s[0]) + return strndup(s+1, l-2); + + return strdup(s); +} + +static char *tag_to_udev_node(const char *tagvalue, const char *by) { + _cleanup_free_ char *t = NULL, *u = NULL; + size_t enc_len; + + u = unquote(tagvalue, QUOTES); + if (!u) + return NULL; + + enc_len = strlen(u) * 4 + 1; + t = new(char, enc_len); + if (!t) + return NULL; + + if (encode_devnode_name(u, t, enc_len) < 0) + return NULL; + + return strjoin("/dev/disk/by-", by, "/", t); +} + +char *fstab_node_to_udev_node(const char *p) { + const char *q; + + assert(p); + + q = startswith(p, "LABEL="); + if (q) + return tag_to_udev_node(q, "label"); + + q = startswith(p, "UUID="); + if (q) + return tag_to_udev_node(q, "uuid"); + + q = startswith(p, "PARTUUID="); + if (q) + return tag_to_udev_node(q, "partuuid"); + + q = startswith(p, "PARTLABEL="); + if (q) + return tag_to_udev_node(q, "partlabel"); + + return strdup(p); +} + +bool fstab_is_bind(const char *options, const char *fstype) { + + if (fstab_test_option(options, "bind\0" "rbind\0")) + return true; + + if (fstype && STR_IN_SET(fstype, "bind", "rbind")) + return true; + + return false; +} diff --git a/src/shared/fstab-util.h b/src/shared/fstab-util.h new file mode 100644 index 0000000..9cf34f0 --- /dev/null +++ b/src/shared/fstab-util.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" + +bool fstab_enabled_full(int enabled); +static inline bool fstab_enabled(void) { + return fstab_enabled_full(-1); +} +static inline bool fstab_set_enabled(bool enabled) { + return fstab_enabled_full(enabled); +} + +bool
fstab_is_extrinsic(const char *mount, const char *opts); +int fstab_has_fstype(const char *fstype); + +int fstab_is_mount_point_full(const char *where, const char *path); +static inline int fstab_is_mount_point(const char *where) { + return fstab_is_mount_point_full(where, NULL); +} +static inline int fstab_has_node(const char *path) { + return fstab_is_mount_point_full(NULL, path); +} + +int fstab_filter_options( + const char *opts, + const char *names, + const char **ret_namefound, + char **ret_value, + char ***ret_values, + char **ret_filtered); + +static inline bool fstab_test_option(const char *opts, const char *names) { + return !!fstab_filter_options(opts, names, NULL, NULL, NULL, NULL); +} + +int fstab_find_pri(const char *options, int *ret); + +static inline bool fstab_test_yes_no_option(const char *opts, const char *yes_no) { + const char *opt; + + /* If first name given is last, return 1. + * If second name given is last or neither is found, return 0. */ + + assert_se(fstab_filter_options(opts, yes_no, &opt, NULL, NULL, NULL) >= 0); + + return opt == yes_no; +} + +char *fstab_node_to_udev_node(const char *p); + +static inline const char* fstab_path(void) { + return secure_getenv("SYSTEMD_FSTAB") ?: "/etc/fstab"; +} + +bool fstab_is_bind(const char *options, const char *fstype); diff --git a/src/shared/generate-ip-protocol-list.sh b/src/shared/generate-ip-protocol-list.sh new file mode 100755 index 0000000..ff898a9 --- /dev/null +++ b/src/shared/generate-ip-protocol-list.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later + +set -eu +set -o pipefail + +${1:?} -dM -include netinet/in.h - +#include + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "dropin.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fstab-util.h" +#include "generator.h" +#include "initrd-util.h" +#include "log.h" +#include "macro.h" +#include "mkdir-label.h" +#include "path-util.h" +#include "process-util.h" 
+#include "special.h" +#include "specifier.h" +#include "string-util.h" +#include "time-util.h" +#include "tmpfile-util.h" +#include "unit-name.h" + +int generator_open_unit_file_full( + const char *dir, + const char *source, + const char *fn, + FILE **ret_file, + char **ret_temp_path) { + + _cleanup_free_ char *p = NULL; + FILE *f; + int r; + + assert(dir); + assert(ret_file); + + /* If 'ret_temp_path' is specified, it creates a temporary unit file in 'dir' and also returns its + * temporary path. Otherwise the file 'fn' is created below 'dir', failing if it already exists. */ + + if (ret_temp_path) { + r = fopen_temporary(dir, &f, &p); + if (r < 0) + return log_error_errno(r, "Failed to create temporary unit file in '%s': %m", dir); + + (void) fchmod(fileno(f), 0644); + + *ret_temp_path = TAKE_PTR(p); + } else { + assert(fn); + + p = path_join(dir, fn); + if (!p) + return log_oom(); + + r = fopen_unlocked(p, "wxe", &f); + if (r < 0) { + if (source && r == -EEXIST) + return log_error_errno(r, + "Failed to create unit file '%s', as it already exists. Duplicate entry in '%s'?", + p, source); + + return log_error_errno(r, "Failed to create unit file '%s': %m", p); + } + } + + fprintf(f, + "# Automatically generated by %s\n\n", + program_invocation_short_name); + + *ret_file = f; + return 0; +} + + +int generator_add_symlink_full( + const char *dir, + const char *dst, + const char *dep_type, + const char *src, + const char *instance) { + + _cleanup_free_ char *dn = NULL, *fn = NULL, *instantiated = NULL, *to = NULL, *from = NULL; + int r; + + assert(dir); + assert(dst); + assert(dep_type); + assert(src); + + /* Adds a symlink from 'dst'.'dep_type'/ to 'src' (if src is absolute) or ../'src' (otherwise). If + * 'instance' is specified, then 'src' must be a template unit name, and we'll instantiate it.
*/ + + r = path_extract_directory(src, &dn); + if (r < 0 && r != -EDESTADDRREQ) /* EDESTADDRREQ → just a file name was passed */ + return log_error_errno(r, "Failed to extract directory name from '%s': %m", src); + + r = path_extract_filename(src, &fn); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", src); + if (r == O_DIRECTORY) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Expected path to regular file name, but got '%s', refusing.", src); + + if (instance) { + r = unit_name_replace_instance(fn, instance, &instantiated); + if (r < 0) + return log_error_errno(r, "Failed to instantiate '%s' for '%s': %m", fn, instance); + } + + from = path_join(dn ?: "..", fn); + if (!from) + return log_oom(); + + to = strjoin(dir, "/", dst, ".", dep_type, "/", instantiated ?: fn); + if (!to) + return log_oom(); + + (void) mkdir_parents_label(to, 0755); + + if (symlink(from, to) < 0 && errno != EEXIST) + return log_error_errno(errno, "Failed to create symlink \"%s\": %m", to); + + return 0; +} + +static int generator_add_ordering( + const char *dir, + const char *src, + const char *order, + const char *dst, + const char *instance) { + + _cleanup_free_ char *instantiated = NULL, *p = NULL, *fn = NULL; + _cleanup_fclose_ FILE *f = NULL; + const char *to; + int r; + + assert(dir); + assert(src); + assert(order); + assert(dst); + + /* Adds in an explicit ordering dependency of type 'order' from 'src' to 'dst'. If 'instance' is + * specified, it is inserted into 'dst'.
*/ + + if (instance) { + r = unit_name_replace_instance(dst, instance, &instantiated); + if (r < 0) + return log_error_errno(r, "Failed to instantiate '%s' for '%s': %m", dst, instance); + + to = instantiated; + } else + to = dst; + + fn = strjoin(src, ".d/50-order-", to, ".conf"); + if (!fn) + return log_oom(); + + p = path_join(dir, fn); + if (!p) + return log_oom(); + + (void) mkdir_parents_label(p, 0755); + + r = fopen_unlocked(p, "wxe", &f); + if (r < 0) + return log_error_errno(r, "Failed to create '%s': %m", p); + + fprintf(f, + "# Automatically generated by %s\n\n" + "[Unit]\n" + "%s=%s\n", + program_invocation_short_name, + order, + to); + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write drop-in '%s': %m", p); + + return 0; +} + +static int write_fsck_sysroot_service( + const char *unit, /* Either SPECIAL_FSCK_ROOT_SERVICE or SPECIAL_FSCK_USR_SERVICE */ + const char *dir, + const char *what, + const char *extra_after) { + + _cleanup_free_ char *device = NULL, *escaped = NULL, *escaped2 = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(unit); + assert(dir); + assert(what); + + /* Writes out special versions of systemd-fsck-root.service and systemd-fsck-usr.service for use in + * the initrd. The regular statically shipped versions of these unit files use / and /usr for as + * paths, which doesn't match what we need for the initrd (where the dirs are /sysroot + + * /sysusr/usr), hence we overwrite those versions here.
*/ + + escaped = specifier_escape(what); + if (!escaped) + return log_oom(); + + escaped2 = cescape(escaped); + if (!escaped2) + return log_oom(); + + r = unit_name_from_path(what, ".device", &device); + if (r < 0) + return log_error_errno(r, "Failed to convert device \"%s\" to unit name: %m", what); + + r = generator_open_unit_file(dir, /* source = */ NULL, unit, &f); + if (r < 0) + return r; + + fprintf(f, + "[Unit]\n" + "Description=File System Check on %1$s\n" + "Documentation=man:%2$s(8)\n" + "\n" + "DefaultDependencies=no\n" + "BindsTo=%3$s\n" + "Conflicts=shutdown.target\n" + "After=%4$s%5$slocal-fs-pre.target %3$s\n" + "Before=shutdown.target\n" + "\n" + "[Service]\n" + "Type=oneshot\n" + "RemainAfterExit=yes\n" + "ExecStart=" SYSTEMD_FSCK_PATH " %6$s\n" + "TimeoutSec=infinity\n", + escaped, + unit, + device, + strempty(extra_after), + isempty(extra_after) ? "" : " ", + escaped2); + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write unit %s: %m", unit); + + return 0; +} + +int generator_write_fsck_deps( + FILE *f, + const char *dir, + const char *what, + const char *where, + const char *fstype) { + + int r; + + assert(f); + assert(dir); + assert(what); + assert(where); + + /* Let's do an early exit if we are invoked for the root and /usr/ trees in the initrd, to avoid + * generating confusing log messages */ + if (in_initrd() && PATH_IN_SET(where, "/", "/usr")) { + log_debug("Skipping fsck for %s in initrd.", where); + return 0; + } + + if (!is_device_path(what)) { + log_warning("Checking was requested for \"%s\", but it is not a device.", what); + return 0; + } + + if (!isempty(fstype) && !streq(fstype, "auto")) { + r = fsck_exists_for_fstype(fstype); + if (r < 0) + log_warning_errno(r, "Checking was requested for %s, but couldn't detect if fsck.%s may be used, proceeding: %m", what, fstype); + else if (r == 0) { + /* treat missing check as essentially OK */ + log_debug("Checking was requested for %s, but fsck.%s does not 
exist.", what, fstype); + return 0; + } + } else { + r = fsck_exists(); + if (r < 0) + log_warning_errno(r, "Checking was requested for %s, but couldn't detect if the fsck command may be used, proceeding: %m", what); + else if (r == 0) { + /* treat missing fsck as essentially OK */ + log_debug("Checking was requested for %s, but the fsck command does not exist.", what); + return 0; + } + } + + if (path_equal(where, "/")) { + const char *lnk; + + /* We support running the fsck instance for the root fs while it is already mounted, for + * compatibility with non-initrd boots. It's ugly, but it is how it is. Since – unlike for + * regular file systems – this means the ordering is reversed (i.e. mount *before* fsck) we + * have a separate fsck unit for this, independent of systemd-fsck@.service. */ + + lnk = strjoina(dir, "/" SPECIAL_LOCAL_FS_TARGET ".wants/" SPECIAL_FSCK_ROOT_SERVICE); + + (void) mkdir_parents(lnk, 0755); + if (symlink(SYSTEM_DATA_UNIT_DIR "/" SPECIAL_FSCK_ROOT_SERVICE, lnk) < 0) + return log_error_errno(errno, "Failed to create symlink %s: %m", lnk); + + } else { + _cleanup_free_ char *_fsck = NULL; + const char *fsck, *dep; + + if (in_initrd() && path_equal(where, "/sysroot")) { + r = write_fsck_sysroot_service(SPECIAL_FSCK_ROOT_SERVICE, dir, what, SPECIAL_INITRD_ROOT_DEVICE_TARGET); + if (r < 0) + return r; + + fsck = SPECIAL_FSCK_ROOT_SERVICE; + dep = "Requires"; + + } else if (in_initrd() && path_equal(where, "/sysusr/usr")) { + r = write_fsck_sysroot_service(SPECIAL_FSCK_USR_SERVICE, dir, what, NULL); + if (r < 0) + return r; + + fsck = SPECIAL_FSCK_USR_SERVICE; + dep = "Requires"; + } else { + /* When this is /usr, then let's add a Wants= dependency, otherwise a Requires= + * dependency. Why? We can't possibly unmount /usr during shutdown, but if we have a + * Requires= from /usr onto a fsck@.service unit and that unit is shut down, then + * we'd have to unmount /usr too. */ + + dep = path_equal(where, "/usr") ? 
"Wants" : "Requires"; + + r = unit_name_from_path_instance("systemd-fsck", what, ".service", &_fsck); + if (r < 0) + return log_error_errno(r, "Failed to create fsck service name: %m"); + + fsck = _fsck; + } + + fprintf(f, + "%1$s=%2$s\n" + "After=%2$s\n", + dep, fsck); + } + + return 0; +} + +int generator_write_timeouts( + const char *dir, + const char *what, + const char *where, + const char *opts, + char **filtered) { + + /* Configure how long we wait for a device that backs a mount point or a + * swap partition to show up. This is useful to support endless device timeouts + * for devices that show up only after user input, like crypto devices. */ + + _cleanup_free_ char *node = NULL, *unit = NULL, *timeout = NULL; + usec_t u; + int r; + + r = fstab_filter_options(opts, "comment=systemd.device-timeout\0" + "x-systemd.device-timeout\0", + NULL, &timeout, NULL, filtered); + if (r < 0) { + log_warning_errno(r, "Failed to parse fstab options, ignoring: %m"); + return 0; + } + if (r == 0) + return 0; + + r = parse_sec_fix_0(timeout, &u); + if (r < 0) { + log_warning("Failed to parse timeout for %s, ignoring: %s", where, timeout); + return 0; + } + + node = fstab_node_to_udev_node(what); + if (!node) + return log_oom(); + if (!is_device_path(node)) { + log_warning("x-systemd.device-timeout ignored for %s", what); + return 0; + } + + r = unit_name_from_path(node, ".device", &unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path: %m"); + + return write_drop_in_format(dir, unit, 50, "device-timeout", + "# Automatically generated by %s\n" + "# from supplied options \"%s\"\n\n" + "[Unit]\n" + "JobRunningTimeoutSec=%s", + program_invocation_short_name, + opts, + timeout); +} + +int generator_write_device_deps( + const char *dir, + const char *what, + const char *where, + const char *opts) { + + /* fstab records that specify _netdev option should apply the network + * ordering on the actual device depending on network connection. 
If we + * are not mounting real device (NFS, CIFS), we rely on _netdev effect + * on the mount unit itself. */ + + _cleanup_free_ char *node = NULL, *unit = NULL; + int r; + + if (fstab_is_extrinsic(where, opts)) + return 0; + + if (!fstab_test_option(opts, "_netdev\0")) + return 0; + + node = fstab_node_to_udev_node(what); + if (!node) + return log_oom(); + + /* Nothing to apply dependencies to. */ + if (!is_device_path(node)) + return 0; + + r = unit_name_from_path(node, ".device", &unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path \"%s\": %m", + node); + + /* See mount_add_default_dependencies for explanation why we create such + * dependencies. */ + return write_drop_in_format(dir, unit, 50, "netdev-dependencies", + "# Automatically generated by %s\n\n" + "[Unit]\n" + "After=" SPECIAL_NETWORK_ONLINE_TARGET " " SPECIAL_NETWORK_TARGET "\n" + "Wants=" SPECIAL_NETWORK_ONLINE_TARGET "\n", + program_invocation_short_name); +} + +int generator_write_initrd_root_device_deps(const char *dir, const char *what) { + _cleanup_free_ char *unit = NULL; + int r; + + r = unit_name_from_path(what, ".device", &unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path \"%s\": %m", + what); + + return write_drop_in_format(dir, SPECIAL_INITRD_ROOT_DEVICE_TARGET, 50, "root-device", + "# Automatically generated by %s\n\n" + "[Unit]\n" + "Requires=%s\n" + "After=%s", + program_invocation_short_name, + unit, + unit); +} + +int generator_hook_up_mkswap( + const char *dir, + const char *what) { + + _cleanup_free_ char *node = NULL, *unit = NULL, *escaped = NULL, *where_unit = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(dir); + assert(what); + + node = fstab_node_to_udev_node(what); + if (!node) + return log_oom(); + + /* Nothing to work on. 
*/ + if (!is_device_path(node)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot format something that is not a device node: %s", + node); + + r = unit_name_from_path_instance("systemd-mkswap", node, ".service", &unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit instance name from path \"%s\": %m", + node); + + escaped = cescape(node); + if (!escaped) + return log_oom(); + + r = unit_name_from_path(what, ".swap", &where_unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path \"%s\": %m", + what); + + r = generator_open_unit_file(dir, /* source = */ NULL, unit, &f); + if (r < 0) + return r; + + fprintf(f, + "[Unit]\n" + "Description=Make Swap on %%f\n" + "Documentation=man:systemd-mkswap@.service(8)\n" + "\n" + "DefaultDependencies=no\n" + "BindsTo=%%i.device\n" + "After=%%i.device\n" + "Before=%s\n" + "Conflicts=shutdown.target\n" + "Before=shutdown.target\n" + "\n" + "[Service]\n" + "Type=oneshot\n" + "RemainAfterExit=yes\n" + "ExecStart="SYSTEMD_MAKEFS_PATH " swap %s\n" + "TimeoutSec=infinity\n", + where_unit, + escaped); + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write unit %s: %m", unit); + + return generator_add_symlink(dir, where_unit, "requires", unit); +} + +int generator_hook_up_mkfs( + const char *dir, + const char *what, + const char *where, + const char *type) { + + _cleanup_free_ char *node = NULL, *unit = NULL, *escaped = NULL, *where_unit = NULL; + _cleanup_fclose_ FILE *f = NULL; + const char *fsck_unit; + int r; + + assert(dir); + assert(what); + assert(where); + + node = fstab_node_to_udev_node(what); + if (!node) + return log_oom(); + + /* Nothing to work on. 
*/ + if (!is_device_path(node)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot format something that is not a device node: %s", + node); + + if (!type || streq(type, "auto")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot format partition %s, filesystem type is not specified", + node); + + r = unit_name_from_path_instance("systemd-makefs", node, ".service", &unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit instance name from path \"%s\": %m", + node); + + if (in_initrd() && path_equal(where, "/sysroot")) + fsck_unit = SPECIAL_FSCK_ROOT_SERVICE; + else if (in_initrd() && path_equal(where, "/sysusr/usr")) + fsck_unit = SPECIAL_FSCK_USR_SERVICE; + else + fsck_unit = "systemd-fsck@%i.service"; + + escaped = cescape(node); + if (!escaped) + return log_oom(); + + r = unit_name_from_path(where, ".mount", &where_unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path \"%s\": %m", + where); + + r = generator_open_unit_file(dir, /* source = */ NULL, unit, &f); + if (r < 0) + return r; + + fprintf(f, + "[Unit]\n" + "Description=Make File System on %%f\n" + "Documentation=man:systemd-makefs@.service(8)\n" + "\n" + "DefaultDependencies=no\n" + "BindsTo=%%i.device\n" + "After=%%i.device\n" + /* fsck might or might not be used, so let's be safe and order + * ourselves before both systemd-fsck@.service and the mount unit. */ + "Before=%s %s\n" + "Conflicts=shutdown.target\n" + "Before=shutdown.target\n" + "\n" + "[Service]\n" + "Type=oneshot\n" + "RemainAfterExit=yes\n" + "ExecStart="SYSTEMD_MAKEFS_PATH " %s %s\n" + "TimeoutSec=infinity\n", + fsck_unit, + where_unit, + type, + escaped); + // XXX: what about local-fs-pre.target? 
+ + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write unit %s: %m", unit); + + return generator_add_symlink(dir, where_unit, "requires", unit); +} + +int generator_hook_up_growfs( + const char *dir, + const char *where, + const char *target) { + + const char *growfs_unit, *growfs_unit_path; + _cleanup_free_ char *where_unit = NULL, *instance = NULL; + int r; + + assert(dir); + assert(where); + + r = unit_name_from_path(where, ".mount", &where_unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path '%s': %m", where); + + if (empty_or_root(where)) { + growfs_unit = SPECIAL_GROWFS_ROOT_SERVICE; + growfs_unit_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_GROWFS_ROOT_SERVICE; + } else { + growfs_unit = SPECIAL_GROWFS_SERVICE; + growfs_unit_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_GROWFS_SERVICE; + + r = unit_name_path_escape(where, &instance); + if (r < 0) + return log_error_errno(r, "Failed to escape path '%s': %m", where); + } + + if (target) { + r = generator_add_ordering(dir, target, "After", growfs_unit, instance); + if (r < 0) + return r; + } + + return generator_add_symlink_full(dir, where_unit, "wants", growfs_unit_path, instance); +} + +int generator_hook_up_pcrfs( + const char *dir, + const char *where, + const char *target) { + + const char *pcrfs_unit, *pcrfs_unit_path; + _cleanup_free_ char *where_unit = NULL, *instance = NULL; + int r; + + assert(dir); + assert(where); + + r = unit_name_from_path(where, ".mount", &where_unit); + if (r < 0) + return log_error_errno(r, "Failed to make unit name from path '%s': %m", where); + + if (empty_or_root(where)) { + pcrfs_unit = SPECIAL_PCRFS_ROOT_SERVICE; + pcrfs_unit_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_PCRFS_ROOT_SERVICE; + } else { + pcrfs_unit = SPECIAL_PCRFS_SERVICE; + pcrfs_unit_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_PCRFS_SERVICE; + + r = unit_name_path_escape(where, &instance); + if (r < 0) + return log_error_errno(r, "Failed to escape path '%s': %m", 
where); + } + + if (target) { + r = generator_add_ordering(dir, target, "After", pcrfs_unit, instance); + if (r < 0) + return r; + } + + return generator_add_symlink_full(dir, where_unit, "wants", pcrfs_unit_path, instance); +} + +int generator_enable_remount_fs_service(const char *dir) { + /* Pull in systemd-remount-fs.service */ + return generator_add_symlink(dir, SPECIAL_LOCAL_FS_TARGET, "wants", + SYSTEM_DATA_UNIT_DIR "/" SPECIAL_REMOUNT_FS_SERVICE); +} + +int generator_write_blockdev_dependency( + FILE *f, + const char *what) { + + _cleanup_free_ char *escaped = NULL; + int r; + + assert(f); + assert(what); + + if (!path_startswith(what, "/dev/")) + return 0; + + r = unit_name_path_escape(what, &escaped); + if (r < 0) + return log_error_errno(r, "Failed to escape device node path %s: %m", what); + + fprintf(f, + "After=blockdev@%s.target\n", + escaped); + + return 0; +} + +int generator_write_cryptsetup_unit_section( + FILE *f, + const char *source) { + + assert(f); + + fprintf(f, + "[Unit]\n" + "Description=Cryptography Setup for %%I\n" + "Documentation=man:crypttab(5) man:systemd-cryptsetup-generator(8) man:systemd-cryptsetup@.service(8)\n"); + + if (source) + fprintf(f, "SourcePath=%s\n", source); + + fprintf(f, + "\n" + "DefaultDependencies=no\n" + "After=cryptsetup-pre.target systemd-udevd-kernel.socket systemd-tpm2-setup-early.service\n" + "Before=blockdev@dev-mapper-%%i.target\n" + "Wants=blockdev@dev-mapper-%%i.target\n" + "IgnoreOnIsolate=true\n"); + + return 0; +} + +int generator_write_cryptsetup_service_section( + FILE *f, + const char *name, + const char *what, + const char *key_file, + const char *options) { + + _cleanup_free_ char *name_escaped = NULL, *what_escaped = NULL, *key_file_escaped = NULL, *options_escaped = NULL; + + assert(f); + assert(name); + assert(what); + + name_escaped = specifier_escape(name); + if (!name_escaped) + return log_oom(); + + what_escaped = specifier_escape(what); + if (!what_escaped) + return log_oom(); + + if 
(key_file) { + key_file_escaped = specifier_escape(key_file); + if (!key_file_escaped) + return log_oom(); + } + + if (options) { + options_escaped = specifier_escape(options); + if (!options_escaped) + return log_oom(); + } + + fprintf(f, + "\n" + "[Service]\n" + "Type=oneshot\n" + "RemainAfterExit=yes\n" + "TimeoutSec=infinity\n" /* The binary handles timeouts on its own */ + "KeyringMode=shared\n" /* Make sure we can share cached keys among instances */ + "OOMScoreAdjust=500\n" /* Unlocking can allocate a lot of memory if Argon2 is used */ + "ExecStart=" SYSTEMD_CRYPTSETUP_PATH " attach '%s' '%s' '%s' '%s'\n" + "ExecStop=" SYSTEMD_CRYPTSETUP_PATH " detach '%s'\n", + name_escaped, what_escaped, strempty(key_file_escaped), strempty(options_escaped), + name_escaped); + + return 0; +} + +int generator_write_veritysetup_unit_section( + FILE *f, + const char *source) { + + assert(f); + + fprintf(f, + "[Unit]\n" + "Description=Integrity Protection Setup for %%I\n" + "Documentation=man:veritytab(5) man:systemd-veritysetup-generator(8) man:systemd-veritysetup@.service(8)\n"); + + if (source) + fprintf(f, "SourcePath=%s\n", source); + + fprintf(f, + "DefaultDependencies=no\n" + "IgnoreOnIsolate=true\n" + "After=veritysetup-pre.target systemd-udevd-kernel.socket\n" + "Before=blockdev@dev-mapper-%%i.target\n" + "Wants=blockdev@dev-mapper-%%i.target\n"); + + return 0; +} + +int generator_write_veritysetup_service_section( + FILE *f, + const char *name, + const char *data_what, + const char *hash_what, + const char *roothash, + const char *options) { + + _cleanup_free_ char *name_escaped = NULL, *data_what_escaped = NULL, *hash_what_escaped = NULL, + *roothash_escaped = NULL, *options_escaped = NULL; + + assert(f); + assert(name); + assert(data_what); + assert(hash_what); + + name_escaped = specifier_escape(name); + if (!name_escaped) + return log_oom(); + + data_what_escaped = specifier_escape(data_what); + if (!data_what_escaped) + return log_oom(); + + hash_what_escaped = 
specifier_escape(hash_what); + if (!hash_what_escaped) + return log_oom(); + + roothash_escaped = specifier_escape(roothash); + if (!roothash_escaped) + return log_oom(); + + if (options) { + options_escaped = specifier_escape(options); + if (!options_escaped) + return log_oom(); + } + + fprintf(f, + "\n" + "[Service]\n" + "Type=oneshot\n" + "RemainAfterExit=yes\n" + "ExecStart=" SYSTEMD_VERITYSETUP_PATH " attach '%s' '%s' '%s' '%s' '%s'\n" + "ExecStop=" SYSTEMD_VERITYSETUP_PATH " detach '%s'\n", + name_escaped, data_what_escaped, hash_what_escaped, roothash_escaped, strempty(options_escaped), + name_escaped); + + return 0; +} + +void log_setup_generator(void) { + if (invoked_by_systemd()) { + /* Disable talking to syslog/journal (i.e. the two IPC-based loggers) if we run in system context. */ + if (cg_pid_get_owner_uid(0, NULL) == -ENXIO /* not running in a per-user slice */) + log_set_prohibit_ipc(true); + + /* This effectively means: journal for per-user generators, kmsg otherwise */ + log_set_target(LOG_TARGET_JOURNAL_OR_KMSG); + } + + log_parse_environment(); + (void) log_open(); +} diff --git a/src/shared/generator.h b/src/shared/generator.h new file mode 100644 index 0000000..d97d6ed --- /dev/null +++ b/src/shared/generator.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" +#include "main-func.h" + +int generator_open_unit_file_full(const char *dest, const char *source, const char *name, FILE **ret_file, char **ret_temp_path); + +static inline int generator_open_unit_file(const char *dest, const char *source, const char *name, FILE **ret_file) { + return generator_open_unit_file_full(dest, source, name, ret_file, NULL); +} + +int generator_add_symlink_full(const char *dir, const char *dst, const char *dep_type, const char *src, const char *instance); + +static inline int generator_add_symlink(const char *dir, const char *dst, const char *dep_type, const char *src) { + return 
generator_add_symlink_full(dir, dst, dep_type, src, NULL); +} + +int generator_write_fsck_deps( + FILE *f, + const char *dir, + const char *what, + const char *where, + const char *type); + +int generator_write_timeouts( + const char *dir, + const char *what, + const char *where, + const char *opts, + char **filtered); + +int generator_write_blockdev_dependency( + FILE *f, + const char *what); + +int generator_write_cryptsetup_unit_section( + FILE *f, + const char *source); + +int generator_write_cryptsetup_service_section( + FILE *f, + const char *name, + const char *what, + const char *password, + const char *options); + +int generator_write_veritysetup_unit_section( + FILE *f, + const char *source); + +int generator_write_veritysetup_service_section( + FILE *f, + const char *name, + const char *data_what, + const char *hash_what, + const char *roothash, + const char *options); + +int generator_write_device_deps( + const char *dir, + const char *what, + const char *where, + const char *opts); + +int generator_write_initrd_root_device_deps( + const char *dir, + const char *what); + +int generator_hook_up_mkswap( + const char *dir, + const char *what); +int generator_hook_up_mkfs( + const char *dir, + const char *what, + const char *where, + const char *type); +int generator_hook_up_growfs( + const char *dir, + const char *where, + const char *target); +int generator_hook_up_pcrfs( + const char *dir, + const char *where, + const char *target); + +int generator_enable_remount_fs_service(const char *dir); + +void log_setup_generator(void); + +/* Similar to DEFINE_MAIN_FUNCTION, but initializes logging and assigns positional arguments. */ +#define DEFINE_MAIN_GENERATOR_FUNCTION(impl) \ + _DEFINE_MAIN_FUNCTION( \ + ({ \ + log_setup_generator(); \ + if (!IN_SET(argc, 2, 4)) \ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), \ + "This program takes one or three arguments."); \ + }), \ + impl(argv[1], \ + argv[argc == 4 ? 2 : 1], \ + argv[argc == 4 ? 3 : 1]), \ + r < 0 ? 
EXIT_FAILURE : EXIT_SUCCESS) diff --git a/src/shared/geneve-util.c b/src/shared/geneve-util.c new file mode 100644 index 0000000..36ef9c8 --- /dev/null +++ b/src/shared/geneve-util.c @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "geneve-util.h" +#include "string-table.h" + +static const char* const geneve_df_table[_NETDEV_GENEVE_DF_MAX] = { + [NETDEV_GENEVE_DF_UNSET] = "unset", + [NETDEV_GENEVE_DF_SET] = "set", + [NETDEV_GENEVE_DF_INHERIT] = "inherit", +}; + +DEFINE_STRING_TABLE_LOOKUP(geneve_df, GeneveDF); diff --git a/src/shared/geneve-util.h b/src/shared/geneve-util.h new file mode 100644 index 0000000..acd0e1a --- /dev/null +++ b/src/shared/geneve-util.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" + +typedef enum GeneveDF { + NETDEV_GENEVE_DF_UNSET = GENEVE_DF_UNSET, + NETDEV_GENEVE_DF_SET = GENEVE_DF_SET, + NETDEV_GENEVE_DF_INHERIT = GENEVE_DF_INHERIT, + _NETDEV_GENEVE_DF_MAX, + _NETDEV_GENEVE_DF_INVALID = -EINVAL, +} GeneveDF; + +const char *geneve_df_to_string(GeneveDF d) _const_; +GeneveDF geneve_df_from_string(const char *d) _pure_; diff --git a/src/shared/gpt.c b/src/shared/gpt.c new file mode 100644 index 0000000..d639463 --- /dev/null +++ b/src/shared/gpt.c @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "gpt.h" +#include "string-table.h" +#include "string-util.h" +#include "utf8.h" + +/* Gently push people towards defining GPT type UUIDs for all architectures we know */ +#if !defined(SD_GPT_ROOT_NATIVE) || \ + !defined(SD_GPT_ROOT_NATIVE_VERITY) || \ + !defined(SD_GPT_ROOT_NATIVE_VERITY_SIG) || \ + !defined(SD_GPT_USR_NATIVE) || \ + !defined(SD_GPT_USR_NATIVE_VERITY) || \ + !defined(SD_GPT_USR_NATIVE_VERITY_SIG) +#pragma message "Please define GPT partition types for your architecture." 
+#endif + +bool partition_designator_is_versioned(PartitionDesignator d) { + /* Returns true for all designators where we want to support a concept of "versioning", i.e. which + * likely contain software binaries (or hashes thereof) that make sense to be versioned as a + * whole. We use this check to automatically pick the newest version of these partitions, by version + * comparing the partition labels. */ + + return IN_SET(d, + PARTITION_ROOT, + PARTITION_USR, + PARTITION_ROOT_VERITY, + PARTITION_USR_VERITY, + PARTITION_ROOT_VERITY_SIG, + PARTITION_USR_VERITY_SIG); +} + +PartitionDesignator partition_verity_of(PartitionDesignator p) { + switch (p) { + + case PARTITION_ROOT: + return PARTITION_ROOT_VERITY; + + case PARTITION_USR: + return PARTITION_USR_VERITY; + + default: + return _PARTITION_DESIGNATOR_INVALID; + } +} + +PartitionDesignator partition_verity_sig_of(PartitionDesignator p) { + switch (p) { + + case PARTITION_ROOT: + return PARTITION_ROOT_VERITY_SIG; + + case PARTITION_USR: + return PARTITION_USR_VERITY_SIG; + + default: + return _PARTITION_DESIGNATOR_INVALID; + } +} + +PartitionDesignator partition_verity_to_data(PartitionDesignator d) { + switch (d) { + + case PARTITION_ROOT_VERITY: + return PARTITION_ROOT; + + case PARTITION_USR_VERITY: + return PARTITION_USR; + + default: + return _PARTITION_DESIGNATOR_INVALID; + } +} + +PartitionDesignator partition_verity_sig_to_data(PartitionDesignator d) { + switch (d) { + + case PARTITION_ROOT_VERITY_SIG: + return PARTITION_ROOT; + + case PARTITION_USR_VERITY_SIG: + return PARTITION_USR; + + default: + return _PARTITION_DESIGNATOR_INVALID; + } +} + +static const char *const partition_designator_table[_PARTITION_DESIGNATOR_MAX] = { + [PARTITION_ROOT] = "root", + [PARTITION_USR] = "usr", + [PARTITION_HOME] = "home", + [PARTITION_SRV] = "srv", + [PARTITION_ESP] = "esp", + [PARTITION_XBOOTLDR] = "xbootldr", + [PARTITION_SWAP] = "swap", + [PARTITION_ROOT_VERITY] = "root-verity", + [PARTITION_USR_VERITY] = 
"usr-verity", + [PARTITION_ROOT_VERITY_SIG] = "root-verity-sig", + [PARTITION_USR_VERITY_SIG] = "usr-verity-sig", + [PARTITION_TMP] = "tmp", + [PARTITION_VAR] = "var", +}; + +DEFINE_STRING_TABLE_LOOKUP(partition_designator, PartitionDesignator); + +static const char *const partition_mountpoint_table[_PARTITION_DESIGNATOR_MAX] = { + [PARTITION_ROOT] = "/\0", + [PARTITION_USR] = "/usr\0", + [PARTITION_HOME] = "/home\0", + [PARTITION_SRV] = "/srv\0", + [PARTITION_ESP] = "/efi\0/boot\0", + [PARTITION_XBOOTLDR] = "/boot\0", + [PARTITION_TMP] = "/var/tmp\0", + [PARTITION_VAR] = "/var\0", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(partition_mountpoint, PartitionDesignator); + +#define _GPT_ARCH_SEXTET(arch, name) \ + { SD_GPT_ROOT_##arch, "root-" name, ARCHITECTURE_##arch, .designator = PARTITION_ROOT }, \ + { SD_GPT_ROOT_##arch##_VERITY, "root-" name "-verity", ARCHITECTURE_##arch, .designator = PARTITION_ROOT_VERITY }, \ + { SD_GPT_ROOT_##arch##_VERITY_SIG, "root-" name "-verity-sig", ARCHITECTURE_##arch, .designator = PARTITION_ROOT_VERITY_SIG }, \ + { SD_GPT_USR_##arch, "usr-" name, ARCHITECTURE_##arch, .designator = PARTITION_USR }, \ + { SD_GPT_USR_##arch##_VERITY, "usr-" name "-verity", ARCHITECTURE_##arch, .designator = PARTITION_USR_VERITY }, \ + { SD_GPT_USR_##arch##_VERITY_SIG, "usr-" name "-verity-sig", ARCHITECTURE_##arch, .designator = PARTITION_USR_VERITY_SIG } + +/* Two special cases: alias aarch64 to arm64, and amd64 to x86-64. The DSP mixes debianisms and CPUisms: for + * x86, it uses x86 and x86_64, but for aarch64 it uses arm64. This is confusing, and leads to issues for + * callers that have to know which -ism to use for which architecture. But we also don't really want to + * change the spec and add new partition labels, so add a user-friendly aliasing here, so that both are + * accepted but the end result on disk (ie: the partition label). 
+ * So always list the canonical name FIRST, and then any aliases later, so that we can match on aliases, + * but always return the canonical name. And never return directly a match on the name, always re-resolve + * by UUID so that the canonical entry is always found. */ + +const GptPartitionType gpt_partition_type_table[] = { + _GPT_ARCH_SEXTET(ALPHA, "alpha"), + _GPT_ARCH_SEXTET(ARC, "arc"), + _GPT_ARCH_SEXTET(ARM, "arm"), + _GPT_ARCH_SEXTET(ARM, "armv7l"), /* Alias: must be listed after arm */ + _GPT_ARCH_SEXTET(ARM64, "arm64"), + _GPT_ARCH_SEXTET(ARM64, "aarch64"), /* Alias: must be listed after arm64 */ + _GPT_ARCH_SEXTET(IA64, "ia64"), + _GPT_ARCH_SEXTET(LOONGARCH64, "loongarch64"), + _GPT_ARCH_SEXTET(MIPS, "mips"), + _GPT_ARCH_SEXTET(MIPS64, "mips64"), + _GPT_ARCH_SEXTET(MIPS_LE, "mips-le"), + _GPT_ARCH_SEXTET(MIPS64_LE, "mips64-le"), + _GPT_ARCH_SEXTET(PARISC, "parisc"), + _GPT_ARCH_SEXTET(PPC, "ppc"), + _GPT_ARCH_SEXTET(PPC64, "ppc64"), + _GPT_ARCH_SEXTET(PPC64_LE, "ppc64-le"), + _GPT_ARCH_SEXTET(PPC64_LE, "ppc64le"), /* Alias: must be listed after ppc64-le */ + _GPT_ARCH_SEXTET(RISCV32, "riscv32"), + _GPT_ARCH_SEXTET(RISCV64, "riscv64"), + _GPT_ARCH_SEXTET(S390, "s390"), + _GPT_ARCH_SEXTET(S390X, "s390x"), + _GPT_ARCH_SEXTET(TILEGX, "tilegx"), + _GPT_ARCH_SEXTET(X86, "x86"), + _GPT_ARCH_SEXTET(X86_64, "x86-64"), + _GPT_ARCH_SEXTET(X86_64, "x86_64"), /* Alias: must be listed after x86-64 */ + _GPT_ARCH_SEXTET(X86_64, "amd64"), /* Alias: must be listed after x86-64 */ +#ifdef SD_GPT_ROOT_NATIVE + { SD_GPT_ROOT_NATIVE, "root", native_architecture(), .designator = PARTITION_ROOT }, + { SD_GPT_ROOT_NATIVE_VERITY, "root-verity", native_architecture(), .designator = PARTITION_ROOT_VERITY }, + { SD_GPT_ROOT_NATIVE_VERITY_SIG, "root-verity-sig", native_architecture(), .designator = PARTITION_ROOT_VERITY_SIG }, + { SD_GPT_USR_NATIVE, "usr", native_architecture(), .designator = PARTITION_USR }, + { SD_GPT_USR_NATIVE_VERITY, "usr-verity", native_architecture(), 
.designator = PARTITION_USR_VERITY }, + { SD_GPT_USR_NATIVE_VERITY_SIG, "usr-verity-sig", native_architecture(), .designator = PARTITION_USR_VERITY_SIG }, +#endif +#ifdef SD_GPT_ROOT_SECONDARY + { SD_GPT_ROOT_SECONDARY, "root-secondary", ARCHITECTURE_SECONDARY, .designator = PARTITION_ROOT }, + { SD_GPT_ROOT_SECONDARY_VERITY, "root-secondary-verity", ARCHITECTURE_SECONDARY, .designator = PARTITION_ROOT_VERITY }, + { SD_GPT_ROOT_SECONDARY_VERITY_SIG, "root-secondary-verity-sig", ARCHITECTURE_SECONDARY, .designator = PARTITION_ROOT_VERITY_SIG }, + { SD_GPT_USR_SECONDARY, "usr-secondary", ARCHITECTURE_SECONDARY, .designator = PARTITION_USR }, + { SD_GPT_USR_SECONDARY_VERITY, "usr-secondary-verity", ARCHITECTURE_SECONDARY, .designator = PARTITION_USR_VERITY }, + { SD_GPT_USR_SECONDARY_VERITY_SIG, "usr-secondary-verity-sig", ARCHITECTURE_SECONDARY, .designator = PARTITION_USR_VERITY_SIG }, +#endif + + { SD_GPT_ESP, "esp", _ARCHITECTURE_INVALID, .designator = PARTITION_ESP }, + { SD_GPT_XBOOTLDR, "xbootldr", _ARCHITECTURE_INVALID, .designator = PARTITION_XBOOTLDR }, + { SD_GPT_SWAP, "swap", _ARCHITECTURE_INVALID, .designator = PARTITION_SWAP }, + { SD_GPT_HOME, "home", _ARCHITECTURE_INVALID, .designator = PARTITION_HOME }, + { SD_GPT_SRV, "srv", _ARCHITECTURE_INVALID, .designator = PARTITION_SRV }, + { SD_GPT_VAR, "var", _ARCHITECTURE_INVALID, .designator = PARTITION_VAR }, + { SD_GPT_TMP, "tmp", _ARCHITECTURE_INVALID, .designator = PARTITION_TMP }, + { SD_GPT_USER_HOME, "user-home", _ARCHITECTURE_INVALID, .designator = _PARTITION_DESIGNATOR_INVALID }, + { SD_GPT_LINUX_GENERIC, "linux-generic", _ARCHITECTURE_INVALID, .designator = _PARTITION_DESIGNATOR_INVALID }, + {} +}; + +static const GptPartitionType *gpt_partition_type_find_by_uuid(sd_id128_t id) { + + FOREACH_ARRAY(t, gpt_partition_type_table, ELEMENTSOF(gpt_partition_type_table) - 1) + if (sd_id128_equal(id, t->uuid)) + return t; + + return NULL; +} + +const char *gpt_partition_type_uuid_to_string(sd_id128_t id) { 
+ const GptPartitionType *pt; + + pt = gpt_partition_type_find_by_uuid(id); + if (!pt) + return NULL; + + return pt->name; +} + +const char *gpt_partition_type_uuid_to_string_harder( + sd_id128_t id, + char buffer[static SD_ID128_UUID_STRING_MAX]) { + + const char *s; + + assert(buffer); + + s = gpt_partition_type_uuid_to_string(id); + if (s) + return s; + + return sd_id128_to_uuid_string(id, buffer); +} + +int gpt_partition_type_from_string(const char *s, GptPartitionType *ret) { + sd_id128_t id = SD_ID128_NULL; + int r; + + assert(s); + + FOREACH_ARRAY(t, gpt_partition_type_table, ELEMENTSOF(gpt_partition_type_table) - 1) + if (streq(s, t->name)) { + /* Don't return immediately, instead re-resolve by UUID so that we can support + * aliases like aarch64 -> arm64 transparently. */ + id = t->uuid; + break; + } + + if (sd_id128_is_null(id)) { + r = sd_id128_from_string(s, &id); + if (r < 0) + return r; + } + + if (ret) + *ret = gpt_partition_type_from_uuid(id); + + return 0; +} + +GptPartitionType gpt_partition_type_override_architecture(GptPartitionType type, Architecture arch) { + assert(arch >= 0); + + FOREACH_ARRAY(t, gpt_partition_type_table, ELEMENTSOF(gpt_partition_type_table) - 1) + if (t->designator == type.designator && t->arch == arch) + return *t; + + /* If we can't find an entry with the same designator and the requested architecture, just return the + * original partition type. 
*/ + return type; +} + +Architecture gpt_partition_type_uuid_to_arch(sd_id128_t id) { + const GptPartitionType *pt; + + pt = gpt_partition_type_find_by_uuid(id); + if (!pt) + return _ARCHITECTURE_INVALID; + + return pt->arch; +} + +int gpt_partition_label_valid(const char *s) { + _cleanup_free_ char16_t *recoded = NULL; + + recoded = utf8_to_utf16(s, SIZE_MAX); + if (!recoded) + return -ENOMEM; + + return char16_strlen(recoded) <= GPT_LABEL_MAX; +} + +GptPartitionType gpt_partition_type_from_uuid(sd_id128_t id) { + const GptPartitionType *pt; + + pt = gpt_partition_type_find_by_uuid(id); + if (pt) + return *pt; + + return (GptPartitionType) { + .uuid = id, + .arch = _ARCHITECTURE_INVALID, + .designator = _PARTITION_DESIGNATOR_INVALID, + }; +} + +const char *gpt_partition_type_mountpoint_nulstr(GptPartitionType type) { + return partition_mountpoint_to_string(type.designator); +} + +bool gpt_partition_type_knows_read_only(GptPartitionType type) { + return IN_SET(type.designator, + PARTITION_ROOT, + PARTITION_USR, + /* pretty much implied, but let's set the bit to make things really clear */ + PARTITION_ROOT_VERITY, + PARTITION_USR_VERITY, + PARTITION_HOME, + PARTITION_SRV, + PARTITION_VAR, + PARTITION_TMP, + PARTITION_XBOOTLDR); +} + +bool gpt_partition_type_knows_growfs(GptPartitionType type) { + return IN_SET(type.designator, + PARTITION_ROOT, + PARTITION_USR, + PARTITION_HOME, + PARTITION_SRV, + PARTITION_VAR, + PARTITION_TMP, + PARTITION_XBOOTLDR); +} + +bool gpt_partition_type_knows_no_auto(GptPartitionType type) { + return IN_SET(type.designator, + PARTITION_ROOT, + PARTITION_ROOT_VERITY, + PARTITION_USR, + PARTITION_USR_VERITY, + PARTITION_HOME, + PARTITION_SRV, + PARTITION_VAR, + PARTITION_TMP, + PARTITION_XBOOTLDR, + PARTITION_SWAP); +} + +bool gpt_header_has_signature(const GptHeader *p) { + assert(p); + + if (memcmp(p->signature, (const char[8]) { 'E', 'F', 'I', ' ', 'P', 'A', 'R', 'T' }, 8) != 0) + return false; + + if (le32toh(p->revision) != 
UINT32_C(0x00010000)) /* the only known revision of the spec: 1.0 */ + return false; + + if (le32toh(p->header_size) < sizeof(GptHeader)) + return false; + + if (le32toh(p->header_size) > 4096) /* larger than a sector? something is off… */ + return false; + + if (le64toh(p->my_lba) != 1) /* this sector must claim to be at sector offset 1 */ + return false; + + return true; +} diff --git a/src/shared/gpt.h b/src/shared/gpt.h new file mode 100644 index 0000000..21976e5 --- /dev/null +++ b/src/shared/gpt.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-gpt.h" +#include "sd-id128.h" + +#include "architecture.h" +#include "id128-util.h" +#include "sparse-endian.h" + +/* maximum length of gpt label */ +#define GPT_LABEL_MAX 36 + +typedef enum PartitionDesignator { + PARTITION_ROOT, /* Primary architecture */ + PARTITION_USR, + PARTITION_HOME, + PARTITION_SRV, + PARTITION_ESP, + PARTITION_XBOOTLDR, + PARTITION_SWAP, + PARTITION_ROOT_VERITY, /* verity data for the PARTITION_ROOT partition */ + PARTITION_USR_VERITY, + PARTITION_ROOT_VERITY_SIG, /* PKCS#7 signature for root hash for the PARTITION_ROOT partition */ + PARTITION_USR_VERITY_SIG, + PARTITION_TMP, + PARTITION_VAR, + _PARTITION_DESIGNATOR_MAX, + _PARTITION_DESIGNATOR_INVALID = -EINVAL, +} PartitionDesignator; + +bool partition_designator_is_versioned(PartitionDesignator d); + +PartitionDesignator partition_verity_of(PartitionDesignator p); +PartitionDesignator partition_verity_sig_of(PartitionDesignator p); +PartitionDesignator partition_verity_to_data(PartitionDesignator d); +PartitionDesignator partition_verity_sig_to_data(PartitionDesignator d); + +const char* partition_designator_to_string(PartitionDesignator d) _const_; +PartitionDesignator partition_designator_from_string(const char *name) _pure_; + +const char *gpt_partition_type_uuid_to_string(sd_id128_t id); +const char *gpt_partition_type_uuid_to_string_harder( + sd_id128_t id, + char 
buffer[static SD_ID128_UUID_STRING_MAX]); + +#define GPT_PARTITION_TYPE_UUID_TO_STRING_HARDER(id) \ + gpt_partition_type_uuid_to_string_harder((id), (char[SD_ID128_UUID_STRING_MAX]) {}) + +Architecture gpt_partition_type_uuid_to_arch(sd_id128_t id); + +typedef struct GptPartitionType { + sd_id128_t uuid; + const char *name; + Architecture arch; + PartitionDesignator designator; +} GptPartitionType; + +extern const GptPartitionType gpt_partition_type_table[]; + +int gpt_partition_label_valid(const char *s); + +GptPartitionType gpt_partition_type_from_uuid(sd_id128_t id); +int gpt_partition_type_from_string(const char *s, GptPartitionType *ret); + +GptPartitionType gpt_partition_type_override_architecture(GptPartitionType type, Architecture arch); + +const char *gpt_partition_type_mountpoint_nulstr(GptPartitionType type); + +bool gpt_partition_type_knows_read_only(GptPartitionType type); +bool gpt_partition_type_knows_growfs(GptPartitionType type); +bool gpt_partition_type_knows_no_auto(GptPartitionType type); + +typedef struct { + uint8_t partition_type_guid[16]; + uint8_t unique_partition_guid[16]; + le64_t starting_lba; + le64_t ending_lba; + le64_t attributes; + char16_t partition_name[36]; +} _packed_ GptPartitionEntry; + +typedef struct { + char signature[8]; + le32_t revision; + le32_t header_size; + le32_t crc32; + le32_t reserved; + le64_t my_lba; + le64_t alternate_lba; + le64_t first_usable_lba; + le64_t last_usable_lba; + uint8_t disk_guid[16]; + le64_t partition_entry_lba; + le32_t number_of_partition_entries; + le32_t size_of_partition_entry; + le32_t partition_entry_array_crc32; +} _packed_ GptHeader; + +bool gpt_header_has_signature(const GptHeader *p); diff --git a/src/shared/group-record.c b/src/shared/group-record.c new file mode 100644 index 0000000..1e33bdf --- /dev/null +++ b/src/shared/group-record.c @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "group-record.h" +#include "strv.h" +#include "uid-alloc-range.h" 
+#include "user-util.h" + +GroupRecord* group_record_new(void) { + GroupRecord *h; + + h = new(GroupRecord, 1); + if (!h) + return NULL; + + *h = (GroupRecord) { + .n_ref = 1, + .disposition = _USER_DISPOSITION_INVALID, + .last_change_usec = UINT64_MAX, + .gid = GID_INVALID, + }; + + return h; +} + +static GroupRecord *group_record_free(GroupRecord *g) { + if (!g) + return NULL; + + free(g->group_name); + free(g->realm); + free(g->group_name_and_realm_auto); + free(g->description); + + strv_free(g->members); + free(g->service); + strv_free(g->administrators); + strv_free_erase(g->hashed_password); + + json_variant_unref(g->json); + + return mfree(g); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(GroupRecord, group_record, group_record_free); + +static int dispatch_privileged(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch privileged_dispatch_table[] = { + { "hashedPassword", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(GroupRecord, hashed_password), JSON_SAFE }, + {}, + }; + + return json_dispatch(variant, privileged_dispatch_table, flags, userdata); +} + +static int dispatch_binding(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch binding_dispatch_table[] = { + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(GroupRecord, gid), 0 }, + {}, + }; + + JsonVariant *m; + sd_id128_t mid; + int r; + + if (!variant) + return 0; + + if (!json_variant_is_object(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name)); + + r = sd_id128_get_machine(&mid); + if (r < 0) + return json_log(variant, flags, r, "Failed to determine machine ID: %m"); + + m = json_variant_by_key(variant, SD_ID128_TO_STRING(mid)); + if (!m) + return 0; + + return json_dispatch(m, binding_dispatch_table, flags, userdata); +} + +static int dispatch_per_machine(const char *name, JsonVariant *variant, 
JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch per_machine_dispatch_table[] = { + { "matchMachineId", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 }, + { "matchHostname", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 }, + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(GroupRecord, gid), 0 }, + { "members", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, members), JSON_RELAX}, + { "administrators", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, administrators), JSON_RELAX}, + {}, + }; + + JsonVariant *e; + int r; + + if (!variant) + return 0; + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + bool matching = false; + JsonVariant *m; + + if (!json_variant_is_object(e)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name)); + + m = json_variant_by_key(e, "matchMachineId"); + if (m) { + r = per_machine_id_match(m, flags); + if (r < 0) + return r; + + matching = r > 0; + } + + if (!matching) { + m = json_variant_by_key(e, "matchHostname"); + if (m) { + r = per_machine_hostname_match(m, flags); + if (r < 0) + return r; + + matching = r > 0; + } + } + + if (!matching) + continue; + + r = json_dispatch(e, per_machine_dispatch_table, flags, userdata); + if (r < 0) + return r; + } + + return 0; +} + +static int dispatch_status(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch status_dispatch_table[] = { + { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(GroupRecord, service), JSON_SAFE }, + {}, + }; + + JsonVariant *m; + sd_id128_t mid; + int r; + + if (!variant) + return 0; + + if (!json_variant_is_object(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", 
strna(name)); + + r = sd_id128_get_machine(&mid); + if (r < 0) + return json_log(variant, flags, r, "Failed to determine machine ID: %m"); + + m = json_variant_by_key(variant, SD_ID128_TO_STRING(mid)); + if (!m) + return 0; + + return json_dispatch(m, status_dispatch_table, flags, userdata); +} + +static int group_record_augment(GroupRecord *h, JsonDispatchFlags json_flags) { + assert(h); + + if (!FLAGS_SET(h->mask, USER_RECORD_REGULAR)) + return 0; + + assert(h->group_name); + + if (!h->group_name_and_realm_auto && h->realm) { + h->group_name_and_realm_auto = strjoin(h->group_name, "@", h->realm); + if (!h->group_name_and_realm_auto) + return json_log_oom(h->json, json_flags); + } + + return 0; +} + +int group_record_load( + GroupRecord *h, + JsonVariant *v, + UserRecordLoadFlags load_flags) { + + static const JsonDispatch group_dispatch_table[] = { + { "groupName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(GroupRecord, group_name), JSON_RELAX}, + { "realm", JSON_VARIANT_STRING, json_dispatch_realm, offsetof(GroupRecord, realm), 0 }, + { "description", JSON_VARIANT_STRING, json_dispatch_gecos, offsetof(GroupRecord, description), 0 }, + { "disposition", JSON_VARIANT_STRING, json_dispatch_user_disposition, offsetof(GroupRecord, disposition), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(GroupRecord, service), JSON_SAFE }, + { "lastChangeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(GroupRecord, last_change_usec), 0 }, + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(GroupRecord, gid), 0 }, + { "members", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, members), JSON_RELAX}, + { "administrators", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, administrators), JSON_RELAX}, + + { "privileged", JSON_VARIANT_OBJECT, dispatch_privileged, 0, 0 }, + + /* Not defined for now, for groups, but let's at least generate sensible errors about it */ + 
{ "secret", JSON_VARIANT_OBJECT, json_dispatch_unsupported, 0, 0 }, + + /* Ignore the perMachine, binding and status stuff here, and process it later, so that it overrides whatever is set above */ + { "perMachine", JSON_VARIANT_ARRAY, NULL, 0, 0 }, + { "binding", JSON_VARIANT_OBJECT, NULL, 0, 0 }, + { "status", JSON_VARIANT_OBJECT, NULL, 0, 0 }, + + /* Ignore 'signature', we check it with explicit accessors instead */ + { "signature", JSON_VARIANT_ARRAY, NULL, 0, 0 }, + {}, + }; + + JsonDispatchFlags json_flags = USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(load_flags); + int r; + + assert(h); + assert(!h->json); + + /* Note that this call will leave a half-initialized record around on failure! */ + + if ((USER_RECORD_REQUIRE_MASK(load_flags) & (USER_RECORD_SECRET|USER_RECORD_PRIVILEGED))) + return json_log(v, json_flags, SYNTHETIC_ERRNO(EINVAL), "Secret and privileged section currently not available for groups, refusing."); + + r = user_group_record_mangle(v, load_flags, &h->json, &h->mask); + if (r < 0) + return r; + + r = json_dispatch(h->json, group_dispatch_table, json_flags, h); + if (r < 0) + return r; + + /* During the parsing operation above we ignored the 'perMachine', 'binding' and 'status' fields, since we want + * them to override the global options. Let's process them now. */ + + r = dispatch_per_machine("perMachine", json_variant_by_key(h->json, "perMachine"), json_flags, h); + if (r < 0) + return r; + + r = dispatch_binding("binding", json_variant_by_key(h->json, "binding"), json_flags, h); + if (r < 0) + return r; + + r = dispatch_status("status", json_variant_by_key(h->json, "status"), json_flags, h); + if (r < 0) + return r; + + if (FLAGS_SET(h->mask, USER_RECORD_REGULAR) && !h->group_name) + return json_log(h->json, json_flags, SYNTHETIC_ERRNO(EINVAL), "Group name field missing, refusing."); + + r = group_record_augment(h, json_flags); + if (r < 0) + return r; + + return 0; +} + +int group_record_build(GroupRecord **ret, ...) 
{ + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + va_list ap; + int r; + + assert(ret); + + va_start(ap, ret); + r = json_buildv(&v, ap); + va_end(ap); + + if (r < 0) + return r; + + g = group_record_new(); + if (!g) + return -ENOMEM; + + r = group_record_load(g, v, USER_RECORD_LOAD_FULL); + if (r < 0) + return r; + + *ret = TAKE_PTR(g); + return 0; +} + +const char *group_record_group_name_and_realm(GroupRecord *h) { + assert(h); + + /* Return the pre-initialized joined string if it is defined */ + if (h->group_name_and_realm_auto) + return h->group_name_and_realm_auto; + + /* If it's not defined then we cannot have a realm */ + assert(!h->realm); + return h->group_name; +} + +UserDisposition group_record_disposition(GroupRecord *h) { + assert(h); + + if (h->disposition >= 0) + return h->disposition; + + /* If not declared, derive from GID */ + + if (!gid_is_valid(h->gid)) + return _USER_DISPOSITION_INVALID; + + if (h->gid == 0 || h->gid == GID_NOBODY) + return USER_INTRINSIC; + + if (gid_is_system(h->gid)) + return USER_SYSTEM; + + if (gid_is_dynamic(h->gid)) + return USER_DYNAMIC; + + if (gid_is_container(h->gid)) + return USER_CONTAINER; + + if (h->gid > INT32_MAX) + return USER_RESERVED; + + return USER_REGULAR; +} + +int group_record_clone(GroupRecord *h, UserRecordLoadFlags flags, GroupRecord **ret) { + _cleanup_(group_record_unrefp) GroupRecord *c = NULL; + int r; + + assert(h); + assert(ret); + + c = group_record_new(); + if (!c) + return -ENOMEM; + + r = group_record_load(c, h->json, flags); + if (r < 0) + return r; + + *ret = TAKE_PTR(c); + return 0; +} diff --git a/src/shared/group-record.h b/src/shared/group-record.h new file mode 100644 index 0000000..f810204 --- /dev/null +++ b/src/shared/group-record.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "json.h" +#include "user-record.h" + +typedef struct GroupRecord { + unsigned n_ref; + 
UserRecordMask mask; + bool incomplete; + + char *group_name; + char *realm; + char *group_name_and_realm_auto; + + char *description; + + UserDisposition disposition; + uint64_t last_change_usec; + + gid_t gid; + + char **members; + + char *service; + + /* The following exist mostly so that we can cover the full /etc/gshadow set of fields, we currently + * do not actually make use of these */ + char **administrators; /* maps to 'struct sgrp' .sg_adm field */ + char **hashed_password; /* maps to 'struct sgrp' .sg_passwd field */ + + JsonVariant *json; +} GroupRecord; + +GroupRecord* group_record_new(void); +GroupRecord* group_record_ref(GroupRecord *g); +GroupRecord* group_record_unref(GroupRecord *g); + +DEFINE_TRIVIAL_CLEANUP_FUNC(GroupRecord*, group_record_unref); + +int group_record_load(GroupRecord *h, JsonVariant *v, UserRecordLoadFlags flags); +int group_record_build(GroupRecord **ret, ...); +int group_record_clone(GroupRecord *g, UserRecordLoadFlags flags, GroupRecord **ret); + +const char *group_record_group_name_and_realm(GroupRecord *h); +UserDisposition group_record_disposition(GroupRecord *h); diff --git a/src/shared/hibernate-util.c b/src/shared/hibernate-util.c new file mode 100644 index 0000000..0d215e8 --- /dev/null +++ b/src/shared/hibernate-util.c @@ -0,0 +1,520 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2018 Dell Inc. 
+***/ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "device-util.h" +#include "devnum-util.h" +#include "efivars.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "hibernate-util.h" +#include "log.h" +#include "parse-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" + +#define HIBERNATION_SWAP_THRESHOLD 0.98 + +void hibernation_device_done(HibernationDevice *device) { + assert(device); + + free(device->path); +} + +int read_fiemap(int fd, struct fiemap **ret) { + _cleanup_free_ struct fiemap *fiemap = NULL, *result_fiemap = NULL; + struct stat statinfo; + uint32_t result_extents = 0; + uint64_t fiemap_start = 0, fiemap_length; + const size_t n_extra = DIV_ROUND_UP(sizeof(struct fiemap), sizeof(struct fiemap_extent)); + + assert(fd >= 0); + assert(ret); + + if (fstat(fd, &statinfo) < 0) + return log_debug_errno(errno, "Cannot determine file size: %m"); + if (!S_ISREG(statinfo.st_mode)) + return -ENOTTY; + fiemap_length = statinfo.st_size; + + /* Zero this out in case we run on a file with no extents */ + fiemap = calloc(n_extra, sizeof(struct fiemap_extent)); + if (!fiemap) + return -ENOMEM; + + result_fiemap = malloc_multiply(n_extra, sizeof(struct fiemap_extent)); + if (!result_fiemap) + return -ENOMEM; + + /* XFS filesystem has incorrect implementation of fiemap ioctl and + * returns extents for only one block-group at a time, so we need + * to handle it manually, starting the next fiemap call from the end + * of the last extent + */ + while (fiemap_start < fiemap_length) { + *fiemap = (struct fiemap) { + .fm_start = fiemap_start, + .fm_length = fiemap_length, + .fm_flags = FIEMAP_FLAG_SYNC, + }; + + /* Find out how many extents there are */ + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) + return log_debug_errno(errno, "Failed to read extents: %m"); + + /* Nothing to 
process */ + if (fiemap->fm_mapped_extents == 0) + break; + + /* Resize fiemap to allow us to read in the extents, result fiemap has to hold all + * the extents for the whole file. Add space for the initial struct fiemap. */ + if (!greedy_realloc0((void**) &fiemap, n_extra + fiemap->fm_mapped_extents, sizeof(struct fiemap_extent))) + return -ENOMEM; + + fiemap->fm_extent_count = fiemap->fm_mapped_extents; + fiemap->fm_mapped_extents = 0; + + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) + return log_debug_errno(errno, "Failed to read extents: %m"); + + /* Resize result_fiemap to allow us to copy in the extents */ + if (!greedy_realloc((void**) &result_fiemap, + n_extra + result_extents + fiemap->fm_mapped_extents, sizeof(struct fiemap_extent))) + return -ENOMEM; + + memcpy(result_fiemap->fm_extents + result_extents, + fiemap->fm_extents, + sizeof(struct fiemap_extent) * fiemap->fm_mapped_extents); + + result_extents += fiemap->fm_mapped_extents; + + /* Highly unlikely that it is zero */ + if (_likely_(fiemap->fm_mapped_extents > 0)) { + uint32_t i = fiemap->fm_mapped_extents - 1; + + fiemap_start = fiemap->fm_extents[i].fe_logical + + fiemap->fm_extents[i].fe_length; + + if (fiemap->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) + break; + } + } + + memcpy(result_fiemap, fiemap, sizeof(struct fiemap)); + result_fiemap->fm_mapped_extents = result_extents; + *ret = TAKE_PTR(result_fiemap); + return 0; +} + +static int read_resume_config(dev_t *ret_devno, uint64_t *ret_offset) { + _cleanup_free_ char *devno_str = NULL, *offset_str = NULL; + uint64_t offset; + dev_t devno; + int r; + + assert(ret_devno); + assert(ret_offset); + + r = read_one_line_file("/sys/power/resume", &devno_str); + if (r < 0) + return log_debug_errno(r, "Failed to read /sys/power/resume: %m"); + + r = parse_devnum(devno_str, &devno); + if (r < 0) + return log_debug_errno(r, "Failed to parse /sys/power/resume devno '%s': %m", devno_str); + + r = read_one_line_file("/sys/power/resume_offset", 
&offset_str); + if (r == -ENOENT) { + log_debug_errno(r, "Kernel does not expose resume_offset, skipping."); + offset = UINT64_MAX; + } else if (r < 0) + return log_debug_errno(r, "Failed to read /sys/power/resume_offset: %m"); + else { + r = safe_atou64(offset_str, &offset); + if (r < 0) + return log_debug_errno(r, + "Failed to parse /sys/power/resume_offset '%s': %m", offset_str); + } + + if (devno == 0 && offset > 0 && offset != UINT64_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Found resume_offset=%" PRIu64 " but resume= is unset, refusing.", offset); + + *ret_devno = devno; + *ret_offset = offset; + + return 0; +} + +/* entry in /proc/swaps */ +typedef struct SwapEntry { + char *path; + bool swapfile; + + uint64_t size; + uint64_t used; + int priority; + + /* Not present in original entry */ + dev_t devno; + uint64_t offset; +} SwapEntry; + +typedef struct SwapEntries { + SwapEntry *swaps; + size_t n_swaps; +} SwapEntries; + +static void swap_entry_done(SwapEntry *entry) { + assert(entry); + + free(entry->path); +} + +static void swap_entries_done(SwapEntries *entries) { + assert(entries); + + FOREACH_ARRAY(i, entries->swaps, entries->n_swaps) + swap_entry_done(i); + + free(entries->swaps); +} + +static int swap_entry_get_resume_config(SwapEntry *swap) { + _cleanup_close_ int fd = -EBADF; + uint64_t offset_raw; + struct stat st; + int r; + + assert(swap); + assert(swap->path); + + fd = open(swap->path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!swap->swapfile) { + if (!S_ISBLK(st.st_mode)) + return -ENOTBLK; + + swap->devno = st.st_rdev; + swap->offset = 0; + return 0; + } + + r = stat_verify_regular(&st); + if (r < 0) + return r; + + r = get_block_device_fd(fd, &swap->devno); + if (r < 0) + return r; + + r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC); + if (r < 0) + return log_debug_errno(r, "Failed to check if swap file '%s' is on Btrfs: %m", swap->path); + if (r > 0) 
{ + r = btrfs_get_file_physical_offset_fd(fd, &offset_raw); + if (r < 0) + return r; + } else { + _cleanup_free_ struct fiemap *fiemap = NULL; + + r = read_fiemap(fd, &fiemap); + if (r < 0) + return log_debug_errno(r, "Failed to read extent map for swap file '%s': %m", swap->path); + + offset_raw = fiemap->fm_extents[0].fe_physical; + } + + swap->offset = offset_raw / page_size(); + return 0; +} + +static int read_swap_entries(SwapEntries *ret) { + _cleanup_(swap_entries_done) SwapEntries entries = {}; + _cleanup_fclose_ FILE *f = NULL; + + assert(ret); + + f = fopen("/proc/swaps", "re"); + if (!f) + return log_debug_errno(errno, "Failed to open /proc/swaps: %m"); + + /* Remove header */ + (void) fscanf(f, "%*s %*s %*s %*s %*s\n"); + + for (unsigned i = 1;; i++) { + _cleanup_(swap_entry_done) SwapEntry swap = {}; + _cleanup_free_ char *type = NULL; + int k; + + k = fscanf(f, + "%ms " /* device/file path */ + "%ms " /* type of swap */ + "%" PRIu64 /* swap size */ + "%" PRIu64 /* used */ + "%i" /* priority */ + "\n", + &swap.path, &type, &swap.size, &swap.used, &swap.priority); + if (k == EOF) + break; + if (k != 5) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse /proc/swaps line %u.", i); + + if (streq(type, "file")) { + if (endswith(swap.path, "\\040(deleted)")) { + log_debug("Swap file '%s' has been deleted, ignoring.", swap.path); + continue; + } + + swap.swapfile = true; + + } else if (streq(type, "partition")) { + const char *node; + + node = path_startswith(swap.path, "/dev/"); + if (node && startswith(node, "zram")) { + log_debug("Swap partition '%s' is a zram device, ignoring.", swap.path); + continue; + } + + swap.swapfile = false; + + } else { + log_debug("Swap type %s is not supported for hibernation, ignoring device: %s", + type, swap.path); + continue; + } + + if (!GREEDY_REALLOC(entries.swaps, entries.n_swaps + 1)) + return log_oom_debug(); + + entries.swaps[entries.n_swaps++] = TAKE_STRUCT(swap); + } + + *ret = TAKE_STRUCT(entries); + 
return 0; +} + +/* Attempt to find a suitable device for hibernation by parsing /proc/swaps, /sys/power/resume, and + * /sys/power/resume_offset. + * + * Beware: + * Never use a device or file that hasn't been somehow specified by a user who would also be entrusted + * with full system memory access (for example via /sys/power/resume) or that isn't an already active + * swap area! Otherwise various security attacks might become possible, for example an attacker could + * silently attach such a device and circumvent full disk encryption when it would be automatically used + * for hibernation. Also, having a swap area on top of encryption is not per se enough to protect from all + * such attacks. + * + * Returns: + * 1 - Values are set in /sys/power/resume and /sys/power/resume_offset. + * + * 0 - No values are set in /sys/power/resume and /sys/power/resume_offset. + * ret will represent the highest priority swap with most remaining space discovered in /proc/swaps. + * + * Negative value in the case of error */ +int find_suitable_hibernation_device_full(HibernationDevice *ret_device, uint64_t *ret_size, uint64_t *ret_used) { + _cleanup_(swap_entries_done) SwapEntries entries = {}; + SwapEntry *entry = NULL; + uint64_t resume_config_offset; + dev_t resume_config_devno; + int r; + + assert(!ret_size == !ret_used); + + r = read_resume_config(&resume_config_devno, &resume_config_offset); + if (r < 0) + return r; + + r = read_swap_entries(&entries); + if (r < 0) + return r; + if (entries.n_swaps == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOSPC), "No swap space available for hibernation."); + + FOREACH_ARRAY(swap, entries.swaps, entries.n_swaps) { + r = swap_entry_get_resume_config(swap); + if (r < 0) + return log_debug_errno(r, "Failed to get devno and offset for swap '%s': %m", swap->path); + if (swap->devno == 0) { + assert(swap->swapfile); + + log_debug("Swap file '%s' is not backed by block device, ignoring: %m", swap->path); + continue; + } + + if 
(resume_config_devno > 0) { + if (swap->devno == resume_config_devno && + (!swap->swapfile || resume_config_offset == UINT64_MAX || swap->offset == resume_config_offset)) { + /* /sys/power/resume (resume=) is set, and the calculated swap file offset + * matches with /sys/power/resume_offset. If /sys/power/resume_offset is not + * exposed, we can't do proper check anyway, so use the found swap file too. */ + entry = swap; + break; + } + + /* If resume= is set, don't try to use other swap spaces. */ + continue; + } + + if (!entry || + swap->priority > entry->priority || + swap->size - swap->used > entry->size - entry->used) + entry = swap; + } + + if (!entry) { + /* No need to check n_swaps == 0, since it's rejected early */ + assert(resume_config_devno > 0); + return log_debug_errno(SYNTHETIC_ERRNO(ENOSPC), "Cannot find swap entry corresponding to /sys/power/resume."); + } + + if (ret_device) { + char *path; + + if (entry->swapfile) { + r = device_path_make_canonical(S_IFBLK, entry->devno, &path); + if (r < 0) + return log_debug_errno(r, + "Failed to format canonical device path for devno '" DEVNUM_FORMAT_STR "': %m", + DEVNUM_FORMAT_VAL(entry->devno)); + } else + path = TAKE_PTR(entry->path); + + *ret_device = (HibernationDevice) { + .devno = entry->devno, + .offset = entry->offset, + .path = path, + }; + } + + if (ret_size) { + *ret_size = entry->size; + *ret_used = entry->used; + } + + return resume_config_devno > 0; +} + +static int get_proc_meminfo_active(unsigned long long *ret) { + _cleanup_free_ char *active_str = NULL; + unsigned long long active; + int r; + + assert(ret); + + r = get_proc_field("/proc/meminfo", "Active(anon)", WHITESPACE, &active_str); + if (r < 0) + return log_debug_errno(r, "Failed to retrieve Active(anon) from /proc/meminfo: %m"); + + r = safe_atollu(active_str, &active); + if (r < 0) + return log_debug_errno(r, "Failed to parse Active(anon) '%s' from /proc/meminfo: %m", active_str); + + *ret = active; + return 0; +} + +int 
hibernation_is_safe(void) { + unsigned long long active; + uint64_t size, used; + bool resume_set, bypass_space_check; + int r; + + bypass_space_check = getenv_bool("SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK") > 0; + + r = find_suitable_hibernation_device_full(NULL, &size, &used); + if (r == -ENOSPC && bypass_space_check) + /* If we don't have any available swap space at all, and SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK + * is set, skip all remaining checks since we can't do that properly anyway. It is quite + * possible that the user is using a setup similar to #30083. When we actually perform + * hibernation in sleep.c we'll check everything again. */ + return 0; + if (r < 0) + return r; + resume_set = r > 0; + + if (!resume_set && !is_efi_boot()) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Not running on EFI and resume= is not set. Hibernation is not safe."); + + if (bypass_space_check) + return true; + + r = get_proc_meminfo_active(&active); + if (r < 0) + return r; + + r = active <= (size - used) * HIBERNATION_SWAP_THRESHOLD; + log_debug("Detected %s swap for hibernation: Active(anon)=%llu kB, size=%" PRIu64 " kB, used=%" PRIu64 " kB, threshold=%.2g%%", + r ? "enough" : "not enough", active, size, used, 100 * HIBERNATION_SWAP_THRESHOLD); + if (!r) + return -ENOSPC; + + return resume_set; +} + +int write_resume_config(dev_t devno, uint64_t offset, const char *device) { + char offset_str[DECIMAL_STR_MAX(uint64_t)]; + _cleanup_free_ char *path = NULL; + const char *devno_str; + int r; + + devno_str = FORMAT_DEVNUM(devno); + xsprintf(offset_str, "%" PRIu64, offset); + + if (!device) { + r = device_path_make_canonical(S_IFBLK, devno, &path); + if (r < 0) + return log_error_errno(r, + "Failed to format canonical device path for devno '" DEVNUM_FORMAT_STR "': %m", + DEVNUM_FORMAT_VAL(devno)); + device = path; + } + + /* We write the offset first since it's safer. 
Note that this file is only available in 4.17+, so + * fail gracefully if it doesn't exist and we're only overwriting it with 0. */ + r = write_string_file("/sys/power/resume_offset", offset_str, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r == -ENOENT) { + if (offset != 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Can't configure hibernation offset %" PRIu64 ", kernel does not support /sys/power/resume_offset. Refusing.", + offset); + + log_warning_errno(r, "/sys/power/resume_offset is unavailable, skipping writing swap file offset."); + } else if (r < 0) + return log_error_errno(r, + "Failed to write swap file offset %s to /sys/power/resume_offset for device '%s': %m", + offset_str, device); + else + log_debug("Wrote resume_offset=%s for device '%s' to /sys/power/resume_offset.", + offset_str, device); + + r = write_string_file("/sys/power/resume", devno_str, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, + "Failed to write device '%s' (%s) to /sys/power/resume: %m", + device, devno_str); + log_debug("Wrote resume=%s for device '%s' to /sys/power/resume.", devno_str, device); + + return 0; +} diff --git a/src/shared/hibernate-util.h b/src/shared/hibernate-util.h new file mode 100644 index 0000000..2ae10fb --- /dev/null +++ b/src/shared/hibernate-util.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +/* represents values for /sys/power/resume & /sys/power/resume_offset and the corresponding path */ +typedef struct HibernationDevice { + dev_t devno; + uint64_t offset; /* in memory pages */ + char *path; +} HibernationDevice; + +void hibernation_device_done(HibernationDevice *hibernation_device); + +int find_suitable_hibernation_device_full(HibernationDevice *ret_device, uint64_t *ret_size, uint64_t *ret_used); +static inline int find_suitable_hibernation_device(HibernationDevice *ret) { + return find_suitable_hibernation_device_full(ASSERT_PTR(ret), NULL, NULL); +} + 
+int hibernation_is_safe(void); + +int write_resume_config(dev_t devno, uint64_t offset, const char *device); + +/* Only for test-fiemap */ +int read_fiemap(int fd, struct fiemap **ret); diff --git a/src/shared/hostname-setup.c b/src/shared/hostname-setup.c new file mode 100644 index 0000000..137c29a --- /dev/null +++ b/src/shared/hostname-setup.c @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hostname-setup.h" +#include "hostname-util.h" +#include "log.h" +#include "macro.h" +#include "proc-cmdline.h" +#include "string-table.h" +#include "string-util.h" + +static int sethostname_idempotent_full(const char *s, bool really) { + struct utsname u; + + assert(s); + + assert_se(uname(&u) >= 0); + + if (streq_ptr(s, u.nodename)) + return 0; + + if (really && + sethostname(s, strlen(s)) < 0) + return -errno; + + return 1; +} + +int sethostname_idempotent(const char *s) { + return sethostname_idempotent_full(s, true); +} + +int shorten_overlong(const char *s, char **ret) { + char *h, *p; + + /* Shorten an overlong name to HOST_NAME_MAX or to the first dot, + * whatever comes earlier. */ + + assert(s); + + h = strdup(s); + if (!h) + return -ENOMEM; + + if (hostname_is_valid(h, 0)) { + *ret = h; + return 0; + } + + p = strchr(h, '.'); + if (p) + *p = 0; + + strshorten(h, HOST_NAME_MAX); + + if (!hostname_is_valid(h, 0)) { + free(h); + return -EDOM; + } + + *ret = h; + return 1; +} + +int read_etc_hostname_stream(FILE *f, char **ret) { + int r; + + assert(f); + assert(ret); + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) /* EOF without any hostname? 
the file is empty, let's treat that exactly like no file at all: ENOENT */ + return -ENOENT; + + /* File may have empty lines or comments, ignore them */ + if (IN_SET(line[0], '\0', '#')) + continue; + + hostname_cleanup(line); /* normalize the hostname */ + + if (!hostname_is_valid(line, VALID_HOSTNAME_TRAILING_DOT)) /* check that the hostname we return is valid */ + return -EBADMSG; + + *ret = TAKE_PTR(line); + return 0; + } +} + +int read_etc_hostname(const char *path, char **ret) { + _cleanup_fclose_ FILE *f = NULL; + + assert(ret); + + if (!path) + path = "/etc/hostname"; + + f = fopen(path, "re"); + if (!f) + return -errno; + + return read_etc_hostname_stream(f, ret); +} + +void hostname_update_source_hint(const char *hostname, HostnameSource source) { + int r; + + /* Why save the value and not just create a flag file? This way we will + * notice if somebody sets the hostname directly (not going through hostnamed). + */ + + if (source == HOSTNAME_DEFAULT) { + r = write_string_file("/run/systemd/default-hostname", hostname, + WRITE_STRING_FILE_CREATE | WRITE_STRING_FILE_ATOMIC); + if (r < 0) + log_warning_errno(r, "Failed to create \"/run/systemd/default-hostname\": %m"); + } else + unlink_or_warn("/run/systemd/default-hostname"); +} + +int hostname_setup(bool really) { + _cleanup_free_ char *b = NULL; + const char *hn = NULL; + HostnameSource source; + bool enoent = false; + int r; + + r = proc_cmdline_get_key("systemd.hostname", 0, &b); + if (r < 0) + log_warning_errno(r, "Failed to retrieve system hostname from kernel command line, ignoring: %m"); + else if (r > 0) { + if (hostname_is_valid(b, VALID_HOSTNAME_TRAILING_DOT)) { + hn = b; + source = HOSTNAME_TRANSIENT; + } else { + log_warning("Hostname specified on kernel command line is invalid, ignoring: %s", b); + b = mfree(b); + } + } + + if (!hn) { + r = read_etc_hostname(NULL, &b); + if (r < 0) { + if (r == -ENOENT) + enoent = true; + else + log_warning_errno(r, "Failed to read configured hostname: %m"); 
+ } else { + hn = b; + source = HOSTNAME_STATIC; + } + } + + if (!hn) { + _cleanup_free_ char *buf = NULL; + + /* Don't override the hostname if it is already set and not explicitly configured */ + + r = gethostname_full(GET_HOSTNAME_ALLOW_LOCALHOST, &buf); + if (r == -ENOMEM) + return log_oom(); + if (r >= 0) { + log_debug("No hostname configured, leaving existing hostname <%s> in place.", buf); + return 0; + } + + if (enoent) + log_info("No hostname configured, using default hostname."); + + hn = b = get_default_hostname(); + if (!hn) + return log_oom(); + + source = HOSTNAME_DEFAULT; + + } + + r = sethostname_idempotent_full(hn, really); + if (r < 0) + return log_warning_errno(r, "Failed to set hostname to <%s>: %m", hn); + if (r == 0) + log_debug("Hostname was already set to <%s>.", hn); + else + log_info("Hostname %s to <%s>.", + really ? "set" : "would have been set", + hn); + + if (really) + hostname_update_source_hint(hn, source); + + return r; +} + +static const char* const hostname_source_table[] = { + [HOSTNAME_STATIC] = "static", + [HOSTNAME_TRANSIENT] = "transient", + [HOSTNAME_DEFAULT] = "default", +}; + +DEFINE_STRING_TABLE_LOOKUP(hostname_source, HostnameSource); diff --git a/src/shared/hostname-setup.h b/src/shared/hostname-setup.h new file mode 100644 index 0000000..6def36c --- /dev/null +++ b/src/shared/hostname-setup.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +typedef enum HostnameSource { + HOSTNAME_STATIC, /* from /etc/hostname */ + HOSTNAME_TRANSIENT, /* a transient hostname set through systemd, hostnamed, the container manager, or otherwise */ + HOSTNAME_DEFAULT, /* the os-release default or the compiled-in fallback were used */ + _HOSTNAME_INVALID = -EINVAL, +} HostnameSource; + +const char* hostname_source_to_string(HostnameSource source) _const_; +HostnameSource hostname_source_from_string(const char *str) _pure_; + +int sethostname_idempotent(const char *s); + +int 
shorten_overlong(const char *s, char **ret); + +int read_etc_hostname_stream(FILE *f, char **ret); +int read_etc_hostname(const char *path, char **ret); + +void hostname_update_source_hint(const char *hostname, HostnameSource source); +int hostname_setup(bool really); diff --git a/src/shared/hwdb-util.c b/src/shared/hwdb-util.c new file mode 100644 index 0000000..f67e917 --- /dev/null +++ b/src/shared/hwdb-util.c @@ -0,0 +1,712 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "conf-files.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hwdb-internal.h" +#include "hwdb-util.h" +#include "label-util.h" +#include "mkdir-label.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "sort-util.h" +#include "strbuf.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" + +static const char* const conf_file_dirs[] = { + "/etc/udev/hwdb.d", + UDEVLIBEXECDIR "/hwdb.d", + NULL +}; + +/* + * Generic udev properties, key-value database based on modalias strings. + * Uses a Patricia/radix trie to index all matches for efficient lookup. 
+ */ + +/* in-memory trie objects */ +struct trie { + struct trie_node *root; + struct strbuf *strings; + + size_t nodes_count; + size_t children_count; + size_t values_count; +}; + +struct trie_node { + /* prefix, common part for all children of this node */ + size_t prefix_off; + + /* sorted array of pointers to children nodes */ + struct trie_child_entry *children; + uint8_t children_count; + + /* sorted array of key-value pairs */ + struct trie_value_entry *values; + size_t values_count; +}; + +/* children array item with char (0-255) index */ +struct trie_child_entry { + uint8_t c; + struct trie_node *child; +}; + +/* value array item with key-value pairs */ +struct trie_value_entry { + size_t key_off; + size_t value_off; + size_t filename_off; + uint32_t line_number; + uint16_t file_priority; +}; + +static int trie_children_cmp(const struct trie_child_entry *a, const struct trie_child_entry *b) { + return CMP(a->c, b->c); +} + +static int node_add_child(struct trie *trie, struct trie_node *node, struct trie_node *node_child, uint8_t c) { + struct trie_child_entry *child; + + /* extend array, add new entry, sort for bisection */ + child = reallocarray(node->children, node->children_count + 1, sizeof(struct trie_child_entry)); + if (!child) + return -ENOMEM; + + node->children = child; + trie->children_count++; + node->children[node->children_count].c = c; + node->children[node->children_count].child = node_child; + node->children_count++; + typesafe_qsort(node->children, node->children_count, trie_children_cmp); + trie->nodes_count++; + + return 0; +} + +static struct trie_node *node_lookup(const struct trie_node *node, uint8_t c) { + struct trie_child_entry *child; + struct trie_child_entry search; + + search.c = c; + child = typesafe_bsearch(&search, node->children, node->children_count, trie_children_cmp); + if (child) + return child->child; + return NULL; +} + +static void trie_node_cleanup(struct trie_node *node) { + if (!node) + return; + + for (size_t i 
= 0; i < node->children_count; i++) + trie_node_cleanup(node->children[i].child); + free(node->children); + free(node->values); + free(node); +} + +static struct trie* trie_free(struct trie *trie) { + if (!trie) + return NULL; + + trie_node_cleanup(trie->root); + strbuf_free(trie->strings); + return mfree(trie); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct trie*, trie_free); + +static int trie_values_cmp(const struct trie_value_entry *a, const struct trie_value_entry *b, struct trie *trie) { + return strcmp(trie->strings->buf + a->key_off, + trie->strings->buf + b->key_off); +} + +static int trie_node_add_value(struct trie *trie, struct trie_node *node, + const char *key, const char *value, + const char *filename, uint16_t file_priority, uint32_t line_number, bool compat) { + ssize_t k, v, fn = 0; + struct trie_value_entry *val; + + k = strbuf_add_string(trie->strings, key, strlen(key)); + if (k < 0) + return k; + v = strbuf_add_string(trie->strings, value, strlen(value)); + if (v < 0) + return v; + + if (!compat) { + fn = strbuf_add_string(trie->strings, filename, strlen(filename)); + if (fn < 0) + return fn; + } + + if (node->values_count) { + struct trie_value_entry search = { + .key_off = k, + .value_off = v, + }; + + val = typesafe_bsearch_r(&search, node->values, node->values_count, trie_values_cmp, trie); + if (val) { + /* At this point we have 2 identical properties on the same match-string. + * Since we process files in order, we just replace the previous value. 
*/ + val->value_off = v; + val->filename_off = fn; + val->file_priority = file_priority; + val->line_number = line_number; + return 0; + } + } + + /* extend array, add new entry, sort for bisection */ + val = reallocarray(node->values, node->values_count + 1, sizeof(struct trie_value_entry)); + if (!val) + return -ENOMEM; + trie->values_count++; + node->values = val; + node->values[node->values_count] = (struct trie_value_entry) { + .key_off = k, + .value_off = v, + .filename_off = fn, + .file_priority = file_priority, + .line_number = line_number, + }; + node->values_count++; + typesafe_qsort_r(node->values, node->values_count, trie_values_cmp, trie); + return 0; +} + +static int trie_insert(struct trie *trie, struct trie_node *node, const char *search, + const char *key, const char *value, + const char *filename, uint16_t file_priority, uint32_t line_number, bool compat) { + int r = 0; + + for (size_t i = 0;; i++) { + size_t p; + char c; + struct trie_node *child; + + for (p = 0; (c = trie->strings->buf[node->prefix_off + p]); p++) { + _cleanup_free_ struct trie_node *new_child = NULL; + _cleanup_free_ char *s = NULL; + ssize_t off; + + if (c == search[i + p]) + continue; + + /* split node */ + new_child = new(struct trie_node, 1); + if (!new_child) + return -ENOMEM; + + /* move values from parent to child */ + *new_child = (struct trie_node) { + .prefix_off = node->prefix_off + p+1, + .children = node->children, + .children_count = node->children_count, + .values = node->values, + .values_count = node->values_count, + }; + + /* update parent; use strdup() because the source gets realloc()d */ + s = strndup(trie->strings->buf + node->prefix_off, p); + if (!s) + return -ENOMEM; + + off = strbuf_add_string(trie->strings, s, p); + if (off < 0) + return off; + + *node = (struct trie_node) { + .prefix_off = off, + }; + r = node_add_child(trie, node, new_child, c); + if (r < 0) + return r; + + new_child = NULL; /* avoid cleanup */ + break; + } + i += p; + + c = 
search[i]; + if (c == '\0') + return trie_node_add_value(trie, node, key, value, filename, file_priority, line_number, compat); + + child = node_lookup(node, c); + if (!child) { + _cleanup_free_ struct trie_node *new_child = NULL; + ssize_t off; + + /* new child */ + new_child = new(struct trie_node, 1); + if (!new_child) + return -ENOMEM; + + off = strbuf_add_string(trie->strings, search + i+1, strlen(search + i+1)); + if (off < 0) + return off; + + *new_child = (struct trie_node) { + .prefix_off = off, + }; + + r = node_add_child(trie, node, new_child, c); + if (r < 0) + return r; + + child = TAKE_PTR(new_child); + return trie_node_add_value(trie, child, key, value, filename, file_priority, line_number, compat); + } + + node = child; + } +} + +struct trie_f { + struct trie *trie; + uint64_t strings_off; + + uint64_t nodes_count; + uint64_t children_count; + uint64_t values_count; +}; + +/* calculate the storage space for the nodes, children arrays, value arrays */ +static void trie_store_nodes_size(struct trie_f *trie, struct trie_node *node, bool compat) { + for (uint64_t i = 0; i < node->children_count; i++) + trie_store_nodes_size(trie, node->children[i].child, compat); + + trie->strings_off += sizeof(struct trie_node_f); + for (uint64_t i = 0; i < node->children_count; i++) + trie->strings_off += sizeof(struct trie_child_entry_f); + for (uint64_t i = 0; i < node->values_count; i++) + trie->strings_off += compat ? 
sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f); +} + +static int64_t trie_store_nodes(struct trie_f *trie, FILE *f, struct trie_node *node, bool compat) { + _cleanup_free_ struct trie_child_entry_f *children = NULL; + int64_t node_off; + + assert(trie); + assert(f); + assert(node); + + if (node->children_count) { + children = new(struct trie_child_entry_f, node->children_count); + if (!children) + return -ENOMEM; + } + + /* post-order recursion */ + for (uint64_t i = 0; i < node->children_count; i++) { + int64_t child_off; + + child_off = trie_store_nodes(trie, f, node->children[i].child, compat); + if (child_off < 0) + return child_off; + + children[i] = (struct trie_child_entry_f) { + .c = node->children[i].c, + .child_off = htole64(child_off), + }; + } + + struct trie_node_f n = { + .prefix_off = htole64(trie->strings_off + node->prefix_off), + .children_count = node->children_count, + .values_count = htole64(node->values_count), + }; + + /* write node */ + node_off = ftello(f); + fwrite(&n, sizeof(struct trie_node_f), 1, f); + trie->nodes_count++; + + /* append children array */ + if (node->children_count) { + fwrite(children, sizeof(struct trie_child_entry_f), node->children_count, f); + trie->children_count += node->children_count; + } + + /* append values array */ + for (uint64_t i = 0; i < node->values_count; i++) { + struct trie_value_entry2_f v = { + .key_off = htole64(trie->strings_off + node->values[i].key_off), + .value_off = htole64(trie->strings_off + node->values[i].value_off), + .filename_off = htole64(trie->strings_off + node->values[i].filename_off), + .line_number = htole32(node->values[i].line_number), + .file_priority = htole16(node->values[i].file_priority), + }; + + fwrite(&v, compat ? 
sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f), 1, f); + } + trie->values_count += node->values_count; + + return node_off; +} + +static int trie_store(struct trie *trie, const char *filename, bool compat) { + struct trie_f t = { + .trie = trie, + .strings_off = sizeof(struct trie_header_f), + }; + _cleanup_(unlink_and_freep) char *filename_tmp = NULL; + _cleanup_fclose_ FILE *f = NULL; + int64_t pos, root_off, size; + int r; + + assert(trie); + assert(filename); + + /* calculate size of header, nodes, children entries, value entries */ + trie_store_nodes_size(&t, trie->root, compat); + + r = fopen_tmpfile_linkable(filename, O_WRONLY|O_CLOEXEC, &filename_tmp, &f); + if (r < 0) + return r; + + if (fchmod(fileno(f), 0444) < 0) + return -errno; + + struct trie_header_f h = { + .signature = HWDB_SIG, + .tool_version = htole64(PROJECT_VERSION), + .header_size = htole64(sizeof(struct trie_header_f)), + .node_size = htole64(sizeof(struct trie_node_f)), + .child_entry_size = htole64(sizeof(struct trie_child_entry_f)), + .value_entry_size = htole64(compat ? 
sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f)), + }; + + /* write nodes */ + if (fseeko(f, sizeof(struct trie_header_f), SEEK_SET) < 0) + return -errno; + + root_off = trie_store_nodes(&t, f, trie->root, compat); + h.nodes_root_off = htole64(root_off); + pos = ftello(f); + h.nodes_len = htole64(pos - sizeof(struct trie_header_f)); + + /* write string buffer */ + fwrite(trie->strings->buf, trie->strings->len, 1, f); + h.strings_len = htole64(trie->strings->len); + + /* write header */ + size = ftello(f); + h.file_size = htole64(size); + if (fseeko(f, 0, SEEK_SET) < 0) + return -errno; + fwrite(&h, sizeof(struct trie_header_f), 1, f); + + r = flink_tmpfile(f, filename_tmp, filename, LINK_TMPFILE_REPLACE|LINK_TMPFILE_SYNC); + if (r < 0) + return r; + + /* write succeeded */ + + log_debug("=== trie on-disk ==="); + log_debug("size: %8"PRIi64" bytes", size); + log_debug("header: %8zu bytes", sizeof(struct trie_header_f)); + log_debug("nodes: %8"PRIu64" bytes (%8"PRIu64")", + t.nodes_count * sizeof(struct trie_node_f), t.nodes_count); + log_debug("child pointers: %8"PRIu64" bytes (%8"PRIu64")", + t.children_count * sizeof(struct trie_child_entry_f), t.children_count); + log_debug("value pointers: %8"PRIu64" bytes (%8"PRIu64")", + t.values_count * (compat ? 
sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f)), t.values_count); + log_debug("string store: %8zu bytes", trie->strings->len); + log_debug("strings start: %8"PRIu64, t.strings_off); + return 0; +} + +static int insert_data(struct trie *trie, char **match_list, char *line, const char *filename, + uint16_t file_priority, uint32_t line_number, bool compat) { + char *value; + + assert(line[0] == ' '); + + value = strchr(line, '='); + if (!value) + return log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL), + "Key-value pair expected but got \"%s\", ignoring.", line); + + value[0] = '\0'; + value++; + + /* Replace multiple leading spaces by a single space */ + while (isblank(line[0]) && isblank(line[1])) + line++; + + if (isempty(line + 1)) + return log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL), + "Empty key in \"%s=%s\", ignoring.", + line, value); + + STRV_FOREACH(entry, match_list) + trie_insert(trie, trie->root, *entry, line, value, filename, file_priority, line_number, compat); + + return 0; +} + +static int import_file(struct trie *trie, const char *filename, uint16_t file_priority, bool compat) { + enum { + HW_NONE, + HW_MATCH, + HW_DATA, + } state = HW_NONE; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **match_list = NULL; + uint32_t line_number = 0; + int r, err; + + f = fopen(filename, "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + size_t len; + char *pos; + + r = read_line_full(f, LONG_LINE_MAX, READ_LINE_NOT_A_TTY, &line); + if (r < 0) + return r; + if (r == 0) + break; + + line_number ++; + + /* comment line */ + if (line[0] == '#') + continue; + + /* strip trailing comment */ + pos = strchr(line, '#'); + if (pos) + pos[0] = '\0'; + + /* strip trailing whitespace */ + len = strlen(line); + while (len > 0 && isspace(line[len-1])) + len--; + line[len] = '\0'; + + switch (state) { + case HW_NONE: + if (len == 0) + break; + + 
if (line[0] == ' ') { + r = log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL), + "Match expected but got indented property \"%s\", ignoring line.", line); + break; + } + + /* start of record, first match */ + state = HW_MATCH; + + err = strv_extend(&match_list, line); + if (err < 0) + return err; + + break; + + case HW_MATCH: + if (len == 0) { + r = log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL), + "Property expected, ignoring record with no properties."); + state = HW_NONE; + match_list = strv_free(match_list); + break; + } + + if (line[0] != ' ') { + /* another match */ + err = strv_extend(&match_list, line); + if (err < 0) + return err; + + break; + } + + /* first data */ + state = HW_DATA; + err = insert_data(trie, match_list, line, filename, file_priority, line_number, compat); + if (err < 0) + r = err; + break; + + case HW_DATA: + if (len == 0) { + /* end of record */ + state = HW_NONE; + match_list = strv_free(match_list); + break; + } + + if (line[0] != ' ') { + r = log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL), + "Property or empty line expected, got \"%s\", ignoring record.", line); + state = HW_NONE; + match_list = strv_free(match_list); + break; + } + + err = insert_data(trie, match_list, line, filename, file_priority, line_number, compat); + if (err < 0) + r = err; + break; + }; + } + + if (state == HW_MATCH) + log_syntax(NULL, LOG_WARNING, filename, line_number, 0, + "Property expected, ignoring record with no properties."); + + return r; +} + +int hwdb_update(const char *root, const char *hwdb_bin_dir, bool strict, bool compat) { + _cleanup_free_ char *hwdb_bin = NULL; + _cleanup_(trie_freep) struct trie *trie = NULL; + _cleanup_strv_free_ char **files = NULL; + uint16_t file_priority = 1; + int r = 0, err; + + /* The argument 'compat' controls the format version of database. 
If false, then hwdb.bin will be + * created with additional information such that priority, line number, and filename of database + * source. If true, then hwdb.bin will be created without the information. systemd-hwdb command + * should set the argument false, and 'udevadm hwdb' command should set it true. */ + + hwdb_bin = path_join(root, hwdb_bin_dir ?: "/etc/udev", "hwdb.bin"); + if (!hwdb_bin) + return -ENOMEM; + + trie = new0(struct trie, 1); + if (!trie) + return -ENOMEM; + + /* string store */ + trie->strings = strbuf_new(); + if (!trie->strings) + return -ENOMEM; + + /* index */ + trie->root = new0(struct trie_node, 1); + if (!trie->root) + return -ENOMEM; + + trie->nodes_count++; + + err = conf_files_list_strv(&files, ".hwdb", root, 0, conf_file_dirs); + if (err < 0) + return log_error_errno(err, "Failed to enumerate hwdb files: %m"); + + if (strv_isempty(files)) { + if (unlink(hwdb_bin) < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to remove compiled hwdb database %s: %m", hwdb_bin); + + log_info("No hwdb files found, skipping."); + } else + log_info("No hwdb files found, compiled hwdb database %s removed.", hwdb_bin); + + return 0; + } + + STRV_FOREACH(f, files) { + log_debug("Reading file \"%s\"", *f); + err = import_file(trie, *f, file_priority++, compat); + if (err < 0 && strict) + r = err; + } + + strbuf_complete(trie->strings); + + log_debug("=== trie in-memory ==="); + log_debug("nodes: %8zu bytes (%8zu)", + trie->nodes_count * sizeof(struct trie_node), trie->nodes_count); + log_debug("children arrays: %8zu bytes (%8zu)", + trie->children_count * sizeof(struct trie_child_entry), trie->children_count); + log_debug("values arrays: %8zu bytes (%8zu)", + trie->values_count * sizeof(struct trie_value_entry), trie->values_count); + log_debug("strings: %8zu bytes", + trie->strings->len); + log_debug("strings incoming: %8zu bytes (%8zu)", + trie->strings->in_len, trie->strings->in_count); + log_debug("strings dedup'ed: %8zu bytes 
(%8zu)", + trie->strings->dedup_len, trie->strings->dedup_count); + + (void) mkdir_parents_label(hwdb_bin, 0755); + err = trie_store(trie, hwdb_bin, compat); + if (err < 0) + return log_error_errno(err, "Failed to write database %s: %m", hwdb_bin); + + err = label_fix(hwdb_bin, 0); + if (err < 0) + return err; + + return r; +} + +int hwdb_query(const char *modalias, const char *root) { + _cleanup_(sd_hwdb_unrefp) sd_hwdb *hwdb = NULL; + const char *key, *value; + int r; + + assert(modalias); + + if (!isempty(root)) + NULSTR_FOREACH(p, hwdb_bin_paths) { + _cleanup_free_ char *hwdb_bin = NULL; + + hwdb_bin = path_join(root, p); + if (!hwdb_bin) + return -ENOMEM; + + r = sd_hwdb_new_from_path(hwdb_bin, &hwdb); + if (r >= 0) + break; + } + else + r = sd_hwdb_new(&hwdb); + if (r < 0) + return r; + + SD_HWDB_FOREACH_PROPERTY(hwdb, modalias, key, value) + printf("%s=%s\n", key, value); + + return 0; +} + +bool hwdb_should_reload(sd_hwdb *hwdb) { + bool found = false; + struct stat st; + + if (!hwdb) + return false; + if (!hwdb->f) + return false; + + /* if hwdb.bin doesn't exist anywhere, we need to update */ + NULSTR_FOREACH(p, hwdb_bin_paths) + if (stat(p, &st) >= 0) { + found = true; + break; + } + if (!found) + return true; + + if (timespec_load(&hwdb->st.st_mtim) != timespec_load(&st.st_mtim)) + return true; + return false; +} diff --git a/src/shared/hwdb-util.h b/src/shared/hwdb-util.h new file mode 100644 index 0000000..cb93690 --- /dev/null +++ b/src/shared/hwdb-util.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-hwdb.h" + +bool hwdb_should_reload(sd_hwdb *hwdb); +int hwdb_update(const char *root, const char *hwdb_bin_dir, bool strict, bool compat); +int hwdb_query(const char *modalias, const char *root); diff --git a/src/shared/id128-print.c b/src/shared/id128-print.c new file mode 100644 index 0000000..c9509b2 --- /dev/null +++ b/src/shared/id128-print.c @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "id128-print.h" +#include "log.h" +#include "pretty-print.h" +#include "terminal-util.h" + +int id128_pretty_print_sample(const char *name, sd_id128_t id) { + _cleanup_free_ char *man_link = NULL, *mod_link = NULL; + + const char *on = ansi_highlight(), + *off = ansi_normal(); + + if (terminal_urlify("man:systemd-id128(1)", "systemd-id128(1)", &man_link) < 0) + return log_oom(); + + if (terminal_urlify("https://docs.python.org/3/library/uuid.html", "uuid", &mod_link) < 0) + return log_oom(); + + printf("As string:\n" + "%s" SD_ID128_FORMAT_STR "%s\n\n" + "As UUID:\n" + "%s" SD_ID128_UUID_FORMAT_STR "%s\n\n" + "As %s macro:\n" + "%s#define %s SD_ID128_MAKE(", + on, SD_ID128_FORMAT_VAL(id), off, + on, SD_ID128_FORMAT_VAL(id), off, + man_link, + on, name); + for (size_t i = 0; i < 16; i++) + printf("%02x%s", id.bytes[i], i < 15 ? "," : ""); + printf(")%s\n\n", off); + + printf("As Python constant:\n" + ">>> import %s\n" + ">>> %s%s = uuid.UUID('" SD_ID128_FORMAT_STR "')%s\n", + mod_link, + on, name, SD_ID128_FORMAT_VAL(id), off); + + return 0; +} + + +int id128_pretty_print(sd_id128_t id, Id128PrettyPrintMode mode) { + assert(mode >= 0); + assert(mode < _ID128_PRETTY_PRINT_MODE_MAX); + + if (mode == ID128_PRINT_ID128) { + printf(SD_ID128_FORMAT_STR "\n", + SD_ID128_FORMAT_VAL(id)); + return 0; + } else if (mode == ID128_PRINT_UUID) { + printf(SD_ID128_UUID_FORMAT_STR "\n", + SD_ID128_FORMAT_VAL(id)); + return 0; + } else + return id128_pretty_print_sample("XYZ", id); +} + +int id128_print_new(Id128PrettyPrintMode mode) { + sd_id128_t id; + int r; + + r = sd_id128_randomize(&id); + if (r < 0) + return log_error_errno(r, "Failed to generate ID: %m"); + + return id128_pretty_print(id, mode); +} diff --git a/src/shared/id128-print.h b/src/shared/id128-print.h new file mode 100644 index 0000000..7b2e593 --- /dev/null +++ b/src/shared/id128-print.h @@ -0,0 +1,19 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include + +#include "sd-id128.h" + +typedef enum Id128PrettyPrintMode { + ID128_PRINT_ID128, + ID128_PRINT_UUID, + ID128_PRINT_PRETTY, + _ID128_PRETTY_PRINT_MODE_MAX, + _ID128_PRETTY_PRINT_MODE_INVALID = -EINVAL, +} Id128PrettyPrintMode; + +int id128_pretty_print_sample(const char *name, sd_id128_t id); +int id128_pretty_print(sd_id128_t id, Id128PrettyPrintMode mode); +int id128_print_new(Id128PrettyPrintMode mode); diff --git a/src/shared/idn-util.c b/src/shared/idn-util.c new file mode 100644 index 0000000..6f36688 --- /dev/null +++ b/src/shared/idn-util.c @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_LIBIDN2 +# include +#elif HAVE_LIBIDN +# include +# include +#endif + +#include "alloc-util.h" +#include "dlfcn-util.h" +#include "idn-util.h" + +#if HAVE_LIBIDN || HAVE_LIBIDN2 +static void* idn_dl = NULL; +#endif + +#if HAVE_LIBIDN2 +int (*sym_idn2_lookup_u8)(const uint8_t* src, uint8_t** lookupname, int flags) = NULL; +const char *(*sym_idn2_strerror)(int rc) _const_ = NULL; +int (*sym_idn2_to_unicode_8z8z)(const char * input, char ** output, int flags) = NULL; + +int dlopen_idn(void) { + return dlopen_many_sym_or_warn( + &idn_dl, "libidn2.so.0", LOG_DEBUG, + DLSYM_ARG(idn2_lookup_u8), + DLSYM_ARG(idn2_strerror), + DLSYM_ARG(idn2_to_unicode_8z8z)); +} +#endif + +#if HAVE_LIBIDN +int (*sym_idna_to_ascii_4i)(const uint32_t * in, size_t inlen, char *out, int flags); +int (*sym_idna_to_unicode_44i)(const uint32_t * in, size_t inlen, uint32_t * out, size_t * outlen, int flags); +char* (*sym_stringprep_ucs4_to_utf8)(const uint32_t * str, ssize_t len, size_t * items_read, size_t * items_written); +uint32_t* (*sym_stringprep_utf8_to_ucs4)(const char *str, ssize_t len, size_t *items_written); + +int dlopen_idn(void) { + _cleanup_(dlclosep) void *dl = NULL; + int r; + + if (idn_dl) + return 0; /* Already loaded */ + + dl = dlopen("libidn.so.12", RTLD_LAZY); + if (!dl) { + /* 
libidn broke ABI in 1.34, but not in a way we care about (a new field got added to an + * open-coded struct we do not use), hence support both versions. */ + dl = dlopen("libidn.so.11", RTLD_LAZY); + if (!dl) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "libidn support is not installed: %s", dlerror()); + } + + r = dlsym_many_or_warn( + dl, + LOG_DEBUG, + DLSYM_ARG(idna_to_ascii_4i), + DLSYM_ARG(idna_to_unicode_44i), + DLSYM_ARG(stringprep_ucs4_to_utf8), + DLSYM_ARG(stringprep_utf8_to_ucs4)); + if (r < 0) + return r; + + idn_dl = TAKE_PTR(dl); + + return 1; +} +#endif diff --git a/src/shared/idn-util.h b/src/shared/idn-util.h new file mode 100644 index 0000000..e64bd99 --- /dev/null +++ b/src/shared/idn-util.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_LIBIDN2 +# include +#elif HAVE_LIBIDN +# include +# include +#endif + +#include + +#if HAVE_LIBIDN2 || HAVE_LIBIDN +int dlopen_idn(void); +#else +static inline int dlopen_idn(void) { + return -EOPNOTSUPP; +} +#endif + +#if HAVE_LIBIDN2 +extern int (*sym_idn2_lookup_u8)(const uint8_t* src, uint8_t** lookupname, int flags); +extern const char *(*sym_idn2_strerror)(int rc) _const_; +extern int (*sym_idn2_to_unicode_8z8z)(const char * input, char ** output, int flags); +#endif + +#if HAVE_LIBIDN +extern int (*sym_idna_to_ascii_4i)(const uint32_t * in, size_t inlen, char *out, int flags); +extern int (*sym_idna_to_unicode_44i)(const uint32_t * in, size_t inlen,uint32_t * out, size_t * outlen, int flags); +extern char* (*sym_stringprep_ucs4_to_utf8)(const uint32_t * str, ssize_t len, size_t * items_read, size_t * items_written); +extern uint32_t* (*sym_stringprep_utf8_to_ucs4)(const char *str, ssize_t len, size_t *items_written); +#endif diff --git a/src/shared/ima-util.c b/src/shared/ima-util.c new file mode 100644 index 0000000..e37c9ad --- /dev/null +++ b/src/shared/ima-util.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + 
+#include "ima-util.h" + +static int use_ima_cached = -1; + +bool use_ima(void) { + + if (use_ima_cached < 0) + use_ima_cached = access("/sys/kernel/security/ima/", F_OK) >= 0; + + return use_ima_cached; +} diff --git a/src/shared/ima-util.h b/src/shared/ima-util.h new file mode 100644 index 0000000..922db78 --- /dev/null +++ b/src/shared/ima-util.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +bool use_ima(void); diff --git a/src/shared/image-policy.c b/src/shared/image-policy.c new file mode 100644 index 0000000..3c3de50 --- /dev/null +++ b/src/shared/image-policy.c @@ -0,0 +1,774 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "extract-word.h" +#include "image-policy.h" +#include "logarithm.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" + +/* Rationale for the chosen syntax: + * + * → one line, so that it can be reasonably added to a shell command line, for example via `systemd-dissect + * --image-policy=…` or to the kernel command line via `systemd.image_policy=`. + * + * → no use of "," or ";" as separators, so that it can be included in mount/fstab-style option strings and + * doesn't require escaping. Instead, separators are ":", "=", "+" which should be fine both in shell + * command lines and in mount/fstab style option strings. 
+ */ + +static int partition_policy_compare(const PartitionPolicy *a, const PartitionPolicy *b) { + return CMP(ASSERT_PTR(a)->designator, ASSERT_PTR(b)->designator); +} + +static const PartitionPolicy* image_policy_bsearch(const ImagePolicy *policy, PartitionDesignator designator) { + if (!policy) + return NULL; + + return typesafe_bsearch( + &(const PartitionPolicy) { .designator = designator }, + ASSERT_PTR(policy)->policies, + ASSERT_PTR(policy)->n_policies, + partition_policy_compare); +} + +PartitionPolicyFlags partition_policy_flags_extend(PartitionPolicyFlags flags) { + /* If some parts of a flags field are left unspecified, let's fill in all options. */ + + /* If no protection flag is set, then this means all are set */ + if ((flags & _PARTITION_POLICY_USE_MASK) == 0) + flags |= PARTITION_POLICY_OPEN; + + /* If the gpt flags bits are not specified, set both options for each */ + if ((flags & _PARTITION_POLICY_READ_ONLY_MASK) == 0) + flags |= PARTITION_POLICY_READ_ONLY_ON|PARTITION_POLICY_READ_ONLY_OFF; + + if ((flags & _PARTITION_POLICY_GROWFS_MASK) == 0) + flags |= PARTITION_POLICY_GROWFS_ON|PARTITION_POLICY_GROWFS_OFF; + + return flags; +} + +static PartitionPolicyFlags partition_policy_normalized_flags(const PartitionPolicy *policy) { + PartitionPolicyFlags flags = ASSERT_PTR(policy)->flags; + + /* This normalizes the per-partition policy flags. This means if the user left some things + * unspecified, we'll fill in the appropriate "dontcare" policy instead. We'll also mask out bits + * that do not make any sense for specific partition types. 
*/ + + flags = partition_policy_flags_extend(flags); + + /* If this is a verity or verity signature designator, then mask off all protection bits, this after + * all needs no protection, because it *is* the protection */ + if (partition_verity_to_data(policy->designator) >= 0 || + partition_verity_sig_to_data(policy->designator) >= 0) + flags &= ~(PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED); + + /* if this designator has no verity concept, then mask off verity protection flags */ + if (partition_verity_of(policy->designator) < 0) + flags &= ~(PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED); + + /* If the partition must be absent, then the gpt flags don't matter */ + if ((flags & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_ABSENT) + flags &= ~(_PARTITION_POLICY_READ_ONLY_MASK|_PARTITION_POLICY_GROWFS_MASK); + + return flags; +} + +PartitionPolicyFlags image_policy_get(const ImagePolicy *policy, PartitionDesignator designator) { + PartitionDesignator data_designator = _PARTITION_DESIGNATOR_INVALID; + const PartitionPolicy *pp; + + /* No policy means: everything may be used in any mode */ + if (!policy) + return partition_policy_normalized_flags( + &(const PartitionPolicy) { + .flags = PARTITION_POLICY_OPEN, + .designator = designator, + }); + + pp = image_policy_bsearch(policy, designator); + if (pp) + return partition_policy_normalized_flags(pp); + + /* Hmm, so this didn't work, then let's see if we can derive some policy from the underlying data + * partition in case of verity/signature partitions */ + + data_designator = partition_verity_to_data(designator); + if (data_designator >= 0) { + PartitionPolicyFlags data_flags; + + /* So we are asked for the policy for a verity partition, and there's no explicit policy for + * that case. Let's synthesize a policy from the protection setting for the underlying data + * partition. 
*/ + + data_flags = image_policy_get(policy, data_designator); + if (data_flags < 0) + return data_flags; + + /* We need verity if verity or verity with sig is requested */ + if (!(data_flags & (PARTITION_POLICY_SIGNED|PARTITION_POLICY_VERITY))) + return _PARTITION_POLICY_FLAGS_INVALID; + + /* If the data partition may be unused or absent, then the verity partition may too. Also, inherit the partition flags policy */ + return partition_policy_normalized_flags( + &(const PartitionPolicy) { + .flags = PARTITION_POLICY_UNPROTECTED | (data_flags & (PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT)) | + (data_flags & _PARTITION_POLICY_PFLAGS_MASK), + .designator = designator, + }); + } + + data_designator = partition_verity_sig_to_data(designator); + if (data_designator >= 0) { + PartitionPolicyFlags data_flags; + + /* Similar case as for verity partitions, but slightly more strict rules */ + + data_flags = image_policy_get(policy, data_designator); + if (data_flags < 0) + return data_flags; + + if (!(data_flags & PARTITION_POLICY_SIGNED)) + return _PARTITION_POLICY_FLAGS_INVALID; + + return partition_policy_normalized_flags( + &(const PartitionPolicy) { + .flags = PARTITION_POLICY_UNPROTECTED | (data_flags & (PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT)) | + (data_flags & _PARTITION_POLICY_PFLAGS_MASK), + .designator = designator, + }); + } + + return _PARTITION_POLICY_FLAGS_INVALID; /* got nothing */ +} + +PartitionPolicyFlags image_policy_get_exhaustively(const ImagePolicy *policy, PartitionDesignator designator) { + PartitionPolicyFlags flags; + + /* This is just like image_policy_get() but whenever there is no policy for a specific designator, we + * return the default policy. 
*/ + + flags = image_policy_get(policy, designator); + if (flags < 0) + return partition_policy_normalized_flags( + &(const PartitionPolicy) { + .flags = image_policy_default(policy), + .designator = designator, + }); + + return flags; +} + +static PartitionPolicyFlags policy_flag_from_string_one(const char *s) { + assert(s); + + /* This is a bitmask (i.e. not dense), hence we don't use the "string-table.h" stuff here. */ + + if (streq(s, "verity")) + return PARTITION_POLICY_VERITY; + if (streq(s, "signed")) + return PARTITION_POLICY_SIGNED; + if (streq(s, "encrypted")) + return PARTITION_POLICY_ENCRYPTED; + if (streq(s, "unprotected")) + return PARTITION_POLICY_UNPROTECTED; + if (streq(s, "unused")) + return PARTITION_POLICY_UNUSED; + if (streq(s, "absent")) + return PARTITION_POLICY_ABSENT; + if (streq(s, "open")) /* shortcut alias */ + return PARTITION_POLICY_OPEN; + if (streq(s, "ignore")) /* ditto */ + return PARTITION_POLICY_IGNORE; + if (streq(s, "read-only-on")) + return PARTITION_POLICY_READ_ONLY_ON; + if (streq(s, "read-only-off")) + return PARTITION_POLICY_READ_ONLY_OFF; + if (streq(s, "growfs-on")) + return PARTITION_POLICY_GROWFS_ON; + if (streq(s, "growfs-off")) + return PARTITION_POLICY_GROWFS_OFF; + + return _PARTITION_POLICY_FLAGS_INVALID; +} + +PartitionPolicyFlags partition_policy_flags_from_string(const char *s) { + PartitionPolicyFlags flags = 0; + int r; + + assert(s); + + if (empty_or_dash(s)) + return 0; + + for (;;) { + _cleanup_free_ char *f = NULL; + PartitionPolicyFlags ff; + + r = extract_first_word(&s, &f, "+", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + ff = policy_flag_from_string_one(strstrip(f)); + if (ff < 0) + return -EBADRQC; /* recognizable error */ + + flags |= ff; + } + + return flags; +} + +static ImagePolicy* image_policy_new(size_t n_policies) { + ImagePolicy *p; + + if (n_policies > (SIZE_MAX - offsetof(ImagePolicy, policies)) / sizeof(PartitionPolicy)) /* overflow check */ + 
return NULL; + + p = malloc(offsetof(ImagePolicy, policies) + sizeof(PartitionPolicy) * n_policies); + if (!p) + return NULL; + + *p = (ImagePolicy) { + .default_flags = PARTITION_POLICY_IGNORE, + }; + return p; +} + +int image_policy_from_string(const char *s, ImagePolicy **ret) { + _cleanup_free_ ImagePolicy *p = NULL; + uint64_t dmask = 0; + ImagePolicy *t; + PartitionPolicyFlags symbolic_policy; + int r; + + assert(s); + assert_cc(sizeof(dmask) * 8 >= _PARTITION_DESIGNATOR_MAX); + + /* Recognizable errors: + * + * ENOTUNIQ → Two or more rules for the same partition + * EBADSLT → Unknown partition designator + * EBADRQC → Unknown policy flags + */ + + /* First, let's handle "symbolic" policies, i.e. "-", "*", "~" */ + if (empty_or_dash(s)) + /* ignore policy: everything may exist, but nothing used */ + symbolic_policy = PARTITION_POLICY_IGNORE; + else if (streq(s, "*")) + /* allow policy: everything is allowed */ + symbolic_policy = PARTITION_POLICY_OPEN; + else if (streq(s, "~")) + /* deny policy: nothing may exist */ + symbolic_policy = PARTITION_POLICY_ABSENT; + else + symbolic_policy = _PARTITION_POLICY_FLAGS_INVALID; + + if (symbolic_policy >= 0) { + if (!ret) + return 0; + + p = image_policy_new(0); + if (!p) + return -ENOMEM; + + p->default_flags = symbolic_policy; + *ret = TAKE_PTR(p); + return 0; + } + + /* Allocate the policy at maximum size, i.e. for all designators. 
We might overshoot a bit, but the + * items are cheap, and we can return unused space to libc once we know we don't need it */ + p = image_policy_new(_PARTITION_DESIGNATOR_MAX); + if (!p) + return -ENOMEM; + + const char *q = s; + bool default_specified = false; + for (;;) { + _cleanup_free_ char *e = NULL, *d = NULL; + PartitionDesignator designator; + PartitionPolicyFlags flags; + char *f, *ds, *fs; + + r = extract_first_word(&q, &e, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + break; + + f = e; + r = extract_first_word((const char**) &f, &d, "=", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Expected designator name followed by '='; got instead: %s", e); + if (!f) /* no separator? */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Missing '=' in policy expression: %s", e); + + ds = strstrip(d); + if (isempty(ds)) { + /* Not partition name? then it's the default policy */ + if (default_specified) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Default partition policy flags specified more than once."); + + designator = _PARTITION_DESIGNATOR_INVALID; + default_specified = true; + } else { + designator = partition_designator_from_string(ds); + if (designator < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT), "Unknown partition designator: %s", ds); /* recognizable error */ + if (dmask & (UINT64_C(1) << designator)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Partition designator specified more than once: %s", ds); + dmask |= UINT64_C(1) << designator; + } + + fs = strstrip(f); + flags = partition_policy_flags_from_string(fs); + if (flags == -EBADRQC) + return log_debug_errno(flags, "Unknown partition policy flag: %s", fs); + if (flags < 0) + return log_debug_errno(flags, "Failed to parse partition policy flags '%s': %m", fs); + + if (designator < 0) + p->default_flags = flags; + else { + p->policies[p->n_policies++] = 
(PartitionPolicy) { + .designator = designator, + .flags = flags, + }; + } + }; + + assert(p->n_policies <= _PARTITION_DESIGNATOR_MAX); + + /* Return unused space to libc */ + t = realloc(p, offsetof(ImagePolicy, policies) + sizeof(PartitionPolicy) * p->n_policies); + if (t) + p = t; + + typesafe_qsort(p->policies, p->n_policies, partition_policy_compare); + + if (ret) + *ret = TAKE_PTR(p); + + return 0; +} + +int partition_policy_flags_to_string(PartitionPolicyFlags flags, bool simplify, char **ret) { + _cleanup_free_ char *buf = NULL; + const char *l[CONST_LOG2U(_PARTITION_POLICY_MASK) + 1]; /* one string per known flag at most */ + size_t m = 0; + + assert(ret); + + if (flags < 0) + return -EINVAL; + + /* If 'simplify' is false we'll output the precise value of every single flag. + * + * If 'simplify' is true we'll try to make the output shorter, by doing the following: + * + * → we'll spell the long form "verity+signed+encrypted+unprotected+unused+absent" via its + * equivalent shortcut form "open" (which we happily parse btw, see above) + * + * → we'll spell the long form "unused+absent" via its shortcut "ignore" (which we are also happy + * to parse) + * + * → if the read-only/growfs policy flags are both set, we suppress them. this thus removes the + * distinction between "user explicitly declared don't care" and "we implied don't care because + * user didn't say anything". + * + * net result: the resulting string is shorter, but the effective policy declared that way will have + * the same results as the long form. 
*/ + + if (simplify && (flags & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_OPEN) + l[m++] = "open"; + else if (simplify && (flags & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_IGNORE) + l[m++] = "ignore"; + else { + if (flags & PARTITION_POLICY_VERITY) + l[m++] = "verity"; + if (flags & PARTITION_POLICY_SIGNED) + l[m++] = "signed"; + if (flags & PARTITION_POLICY_ENCRYPTED) + l[m++] = "encrypted"; + if (flags & PARTITION_POLICY_UNPROTECTED) + l[m++] = "unprotected"; + if (flags & PARTITION_POLICY_UNUSED) + l[m++] = "unused"; + if (flags & PARTITION_POLICY_ABSENT) + l[m++] = "absent"; + } + + if (!simplify || (!(flags & PARTITION_POLICY_READ_ONLY_ON) != !(flags & PARTITION_POLICY_READ_ONLY_OFF))) { + if (flags & PARTITION_POLICY_READ_ONLY_ON) + l[m++] = "read-only-on"; + if (flags & PARTITION_POLICY_READ_ONLY_OFF) + l[m++] = "read-only-off"; + } + + if (!simplify || (!(flags & PARTITION_POLICY_GROWFS_ON) != !(flags & PARTITION_POLICY_GROWFS_OFF))) { + if (flags & PARTITION_POLICY_GROWFS_OFF) + l[m++] = "growfs-off"; + if (flags & PARTITION_POLICY_GROWFS_ON) + l[m++] = "growfs-on"; + } + + if (m == 0) + buf = strdup("-"); + else { + assert(m+1 < ELEMENTSOF(l)); + l[m] = NULL; + + buf = strv_join((char**) l, "+"); + } + if (!buf) + return -ENOMEM; + + *ret = TAKE_PTR(buf); + return 0; +} + +static bool partition_policy_flags_extended_equal(PartitionPolicyFlags a, PartitionPolicyFlags b) { + return partition_policy_flags_extend(a) == partition_policy_flags_extend(b); +} + +static int image_policy_flags_all_match(const ImagePolicy *policy, PartitionPolicyFlags expected) { + + if (expected < 0) + return -EINVAL; + + if (!partition_policy_flags_extended_equal(image_policy_default(policy), expected)) + return false; + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + PartitionPolicyFlags f, w; + + f = image_policy_get_exhaustively(policy, d); + if (f < 0) + return f; + + w = partition_policy_normalized_flags( + &(const PartitionPolicy) { + 
.flags = expected, + .designator = d, + }); + if (w < 0) + return w; + if (f != w) + return false; + } + + return true; +} + +bool image_policy_equiv_ignore(const ImagePolicy *policy) { + /* Checks if this is the ignore policy (or equivalent to it), i.e. everything is ignored, aka '-', aka ''. The matcher returns a negative errno on failure; that is reported as "not equivalent" rather than being converted to true. */ + return image_policy_flags_all_match(policy, PARTITION_POLICY_IGNORE) > 0; +} + +bool image_policy_equiv_allow(const ImagePolicy *policy) { + /* Checks if this is the allow policy (or equivalent to it), i.e. everything is allowed, aka '*'. The matcher returns a negative errno on failure; that is reported as "not equivalent" rather than being converted to true. */ + return image_policy_flags_all_match(policy, PARTITION_POLICY_OPEN) > 0; +} + +bool image_policy_equiv_deny(const ImagePolicy *policy) { + /* Checks if this is the deny policy (or equivalent to it), i.e. everything must be absent, aka '~'. The matcher returns a negative errno on failure; that is reported as "not equivalent" rather than being converted to true. */ + return image_policy_flags_all_match(policy, PARTITION_POLICY_ABSENT) > 0; +} + +int image_policy_to_string(const ImagePolicy *policy, bool simplify, char **ret) { + _cleanup_free_ char *s = NULL; + int r; + + assert(ret); + + if (simplify) { + const char *fixed; + + if (image_policy_equiv_allow(policy)) + fixed = "*"; + else if (image_policy_equiv_ignore(policy)) + fixed = "-"; + else if (image_policy_equiv_deny(policy)) + fixed = "~"; + else + fixed = NULL; + + if (fixed) { + s = strdup(fixed); + if (!s) + return -ENOMEM; + + *ret = TAKE_PTR(s); + return 0; + } + } + + for (size_t i = 0; i < image_policy_n_entries(policy); i++) { + const PartitionPolicy *p = policy->policies + i; + _cleanup_free_ char *f = NULL; + const char *t; + + assert(i == 0 || p->designator > policy->policies[i-1].designator); /* Validate perfect ordering */ + + assert_se(t = partition_designator_to_string(p->designator)); + + if (simplify) { + /* Skip policy entries that match the default anyway */ + PartitionPolicyFlags df; + + df = partition_policy_normalized_flags( + &(const PartitionPolicy) { + .flags = image_policy_default(policy), + .designator = p->designator, + }); + if (df < 0) + return df; + + if (df == p->flags) +
continue; + } + + r = partition_policy_flags_to_string(p->flags, simplify, &f); + if (r < 0) + return r; + + if (!strextend(&s, isempty(s) ? "" : ":", t, "=", f)) + return -ENOMEM; + } + + if (!simplify || !partition_policy_flags_extended_equal(image_policy_default(policy), PARTITION_POLICY_IGNORE)) { + _cleanup_free_ char *df = NULL; + + r = partition_policy_flags_to_string(image_policy_default(policy), simplify, &df); + if (r < 0) + return r; + + if (!strextend(&s, isempty(s) ? "" : ":", "=", df)) + return -ENOMEM; + } + + if (isempty(s)) { /* no rule and default policy? then let's return "-" */ + s = strdup("-"); + if (!s) + return -ENOMEM; + } + + *ret = TAKE_PTR(s); + return 0; +} + +bool image_policy_equal(const ImagePolicy *a, const ImagePolicy *b) { + if (a == b) + return true; + if (image_policy_n_entries(a) != image_policy_n_entries(b)) + return false; + if (image_policy_default(a) != image_policy_default(b)) + return false; + for (size_t i = 0; i < image_policy_n_entries(a); i++) { + if (a->policies[i].designator != b->policies[i].designator) + return false; + if (a->policies[i].flags != b->policies[i].flags) + return false; + } + + return true; +} + +int image_policy_equivalent(const ImagePolicy *a, const ImagePolicy *b) { + + /* The image_policy_equal() function checks if the policy is defined the exact same way. This + * function here instead looks at the outcome of the two policies instead. Where does this come to + * different results you ask? We imply some logic regarding Verity/Encryption: when no rule is + * defined for a verity partition we can synthesize it from the protection level of the data + * partition it protects. 
Or: any per-partition rule that is identical to the default rule is + * redundant, and will be recognized as such by image_policy_equivalent() but not by + * image_policy_equal()- */ + + if (!partition_policy_flags_extended_equal(image_policy_default(a), image_policy_default(b))) + return false; + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + PartitionPolicyFlags f, w; + + f = image_policy_get_exhaustively(a, d); + if (f < 0) + return f; + + w = image_policy_get_exhaustively(b, d); + if (w < 0) + return w; + + if (f != w) + return false; + } + + return true; +} + +int config_parse_image_policy( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(image_policy_freep) ImagePolicy *np = NULL; + ImagePolicy **p = ASSERT_PTR(data); + int r; + + assert(rvalue); + + if (isempty(rvalue)) { + *p = image_policy_free(*p); + return 0; + } + + r = image_policy_from_string(rvalue, &np); + if (r == -ENOTUNIQ) + return log_syntax(unit, LOG_ERR, filename, line, r, "Duplicate rule in image policy, refusing: %s", rvalue); + if (r == -EBADSLT) + return log_syntax(unit, LOG_ERR, filename, line, r, "Unknown partition type in image policy, refusing: %s", rvalue); + if (r == -EBADRQC) + return log_syntax(unit, LOG_ERR, filename, line, r, "Unknown partition policy flag in image policy, refusing: %s", rvalue); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse image policy, refusing: %s", rvalue); + + return free_and_replace_full(*p, np, image_policy_free); +} + +int parse_image_policy_argument(const char *s, ImagePolicy **policy) { + _cleanup_(image_policy_freep) ImagePolicy *np = NULL; + int r; + + assert(s); + assert(policy); + + /* + * This function is intended to be used in command line parsers. 
+ * + * NOTE THAT THIS WILL FREE THE PREVIOUS ARGUMENT POINTER ON SUCCESS! + * Hence, do not pass in uninitialized pointers. + */ + + r = image_policy_from_string(s, &np); + if (r == -ENOTUNIQ) + return log_error_errno(r, "Duplicate rule in image policy: %s", s); + if (r == -EBADSLT) + return log_error_errno(r, "Unknown partition type in image policy: %s", s); + if (r == -EBADRQC) + return log_error_errno(r, "Unknown partition policy flag in image policy: %s", s); + if (r < 0) + return log_error_errno(r, "Failed to parse image policy: %s", s); + + return free_and_replace_full(*policy, np, image_policy_free); +} + +const ImagePolicy image_policy_allow = { + /* Allow policy */ + .n_policies = 0, + .default_flags = PARTITION_POLICY_OPEN, +}; + +const ImagePolicy image_policy_deny = { + /* Deny policy */ + .n_policies = 0, + .default_flags = PARTITION_POLICY_ABSENT, +}; + +const ImagePolicy image_policy_ignore = { + /* Ignore policy */ + .n_policies = 0, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +const ImagePolicy image_policy_sysext = { + /* For system extensions, honour root file system, and /usr/ and ignore everything else. After all, + * we are only interested in /usr/ + /opt/ trees anyway, and that's really the only place they can + * be. 
*/ + .n_policies = 2, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_USR, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +const ImagePolicy image_policy_sysext_strict = { + /* For system extensions, requiring signing */ + .n_policies = 2, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT }, + { PARTITION_USR, PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +const ImagePolicy image_policy_confext = { + /* For configuration extensions, honour root file system, and ignore everything else. After all, we + * are only interested in the /etc/ tree anyway, and that's really the only place it can be. */ + .n_policies = 1, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +const ImagePolicy image_policy_container = { + /* For systemd-nspawn containers we use all partitions, with the exception of swap */ + .n_policies = 8, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_USR, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_HOME, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_SRV, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_ESP, PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_XBOOTLDR, 
PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_TMP, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_VAR, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +const ImagePolicy image_policy_host = { + /* For the host policy we basically use everything */ + .n_policies = 9, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_USR, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_HOME, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_SRV, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_ESP, PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_XBOOTLDR, PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_SWAP, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_TMP, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_VAR, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; + +const ImagePolicy image_policy_service = { + /* For RootImage= in services we skip ESP/XBOOTLDR and swap */ + .n_policies = 6, + .policies = { + { PARTITION_ROOT, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_USR, PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_HOME, 
PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_SRV, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_TMP, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + { PARTITION_VAR, PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT }, + }, + .default_flags = PARTITION_POLICY_IGNORE, +}; diff --git a/src/shared/image-policy.h b/src/shared/image-policy.h new file mode 100644 index 0000000..f59c16e --- /dev/null +++ b/src/shared/image-policy.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef struct ImagePolicy ImagePolicy; + +#include "conf-parser.h" +#include "dissect-image.h" +#include "errno-list.h" + +typedef enum PartitionPolicyFlags { + /* Not all policy flags really make sense on all partition types, see comments. But even if they + * don't make sense we'll parse them anyway, because maybe one day we'll add them for more partition + * types, too. Moreover, we allow configuring a "default" policy for all partition types for which no + * explicit policy is specified. It's useful if we can use policy flags in there and apply this + * default policy gracefully even to partition types where they don't really make too much sense + * on. Example: a default policy of "verity+encrypted" certainly makes sense, but for /home/ + * partitions this gracefully degrades to "encrypted" (as we do not have a concept of verity for + * /home/), and so on. 
*/ + PARTITION_POLICY_VERITY = 1 << 0, /* must exist, activate with verity (only applies to root/usr partitions) */ + PARTITION_POLICY_SIGNED = 1 << 1, /* must exist, activate with signed verity (only applies to root/usr partitions) */ + PARTITION_POLICY_ENCRYPTED = 1 << 2, /* must exist, activate with LUKS encryption (applies to any data partition, but not to verity/signature partitions) */ + PARTITION_POLICY_UNPROTECTED = 1 << 3, /* must exist, activate without encryption/verity */ + PARTITION_POLICY_UNUSED = 1 << 4, /* must exist, don't use */ + PARTITION_POLICY_ABSENT = 1 << 5, /* must not exist */ + PARTITION_POLICY_OPEN = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED| + PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT, + PARTITION_POLICY_IGNORE = PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT, + _PARTITION_POLICY_USE_MASK = PARTITION_POLICY_OPEN, + + PARTITION_POLICY_READ_ONLY_OFF = 1 << 6, /* State of GPT partition flag "read-only" may be off */ + PARTITION_POLICY_READ_ONLY_ON = 1 << 7, /* State of GPT partition flag "read-only" may be on; both bits set is treated as "don't care" by partition_policy_flags_to_string() */ + _PARTITION_POLICY_READ_ONLY_MASK = PARTITION_POLICY_READ_ONLY_OFF|PARTITION_POLICY_READ_ONLY_ON, + PARTITION_POLICY_GROWFS_OFF = 1 << 8, /* State of GPT partition flag "growfs" may be off */ + PARTITION_POLICY_GROWFS_ON = 1 << 9, /* State of GPT partition flag "growfs" may be on; both bits set is treated as "don't care" by partition_policy_flags_to_string() */ + _PARTITION_POLICY_GROWFS_MASK = PARTITION_POLICY_GROWFS_OFF|PARTITION_POLICY_GROWFS_ON, + _PARTITION_POLICY_PFLAGS_MASK = _PARTITION_POLICY_READ_ONLY_MASK|_PARTITION_POLICY_GROWFS_MASK, + + _PARTITION_POLICY_MASK = _PARTITION_POLICY_USE_MASK|_PARTITION_POLICY_READ_ONLY_MASK|_PARTITION_POLICY_GROWFS_MASK, + + _PARTITION_POLICY_FLAGS_INVALID = -EINVAL, + _PARTITION_POLICY_FLAGS_ERRNO_MAX = -ERRNO_MAX, /* Ensure the whole errno range fits into this enum */ +} PartitionPolicyFlags; + +assert_cc((_PARTITION_POLICY_USE_MASK | _PARTITION_POLICY_PFLAGS_MASK) >= 0); /* ensure flags don't collide with errno range */ + +typedef struct PartitionPolicy { + PartitionDesignator designator; +
PartitionPolicyFlags flags; +} PartitionPolicy; + +struct ImagePolicy { + PartitionPolicyFlags default_flags; /* for any designator not listed in the list below */ + size_t n_policies; + PartitionPolicy policies[]; /* sorted by designator, hence suitable for binary search */ +}; + +/* Default policies for various use cases */ +extern const ImagePolicy image_policy_allow; +extern const ImagePolicy image_policy_deny; +extern const ImagePolicy image_policy_ignore; +extern const ImagePolicy image_policy_sysext; /* No verity required */ +extern const ImagePolicy image_policy_sysext_strict; /* Signed verity required */ +extern const ImagePolicy image_policy_confext; /* No verity required */ +extern const ImagePolicy image_policy_container; +extern const ImagePolicy image_policy_service; +extern const ImagePolicy image_policy_host; + +PartitionPolicyFlags image_policy_get(const ImagePolicy *policy, PartitionDesignator designator); +PartitionPolicyFlags image_policy_get_exhaustively(const ImagePolicy *policy, PartitionDesignator designator); + +/* We want that the NULL image policy means "everything" allowed, hence use these simple accessors to make + * NULL policies work reasonably */ +static inline PartitionPolicyFlags image_policy_default(const ImagePolicy *policy) { + return policy ? policy->default_flags : PARTITION_POLICY_OPEN; +} + +static inline size_t image_policy_n_entries(const ImagePolicy *policy) { + return policy ? 
policy->n_policies : 0; +} + +PartitionPolicyFlags partition_policy_flags_extend(PartitionPolicyFlags flags); + +PartitionPolicyFlags partition_policy_flags_from_string(const char *s); +int partition_policy_flags_to_string(PartitionPolicyFlags flags, bool simplify, char **ret); + +int image_policy_from_string(const char *s, ImagePolicy **ret); +int image_policy_to_string(const ImagePolicy *policy, bool simplify, char **ret); + +/* Recognizes three special policies by equivalence */ +bool image_policy_equiv_ignore(const ImagePolicy *policy); +bool image_policy_equiv_allow(const ImagePolicy *policy); +bool image_policy_equiv_deny(const ImagePolicy *policy); + +bool image_policy_equal(const ImagePolicy *a, const ImagePolicy *b); /* checks if defined the same way, i.e. has literally the same ruleset */ +int image_policy_equivalent(const ImagePolicy *a, const ImagePolicy *b); /* checks if the outcome is the same, i.e. for all partitions results in the same decisions. */ + +static inline ImagePolicy* image_policy_free(ImagePolicy *p) { + return mfree(p); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(ImagePolicy*, image_policy_free); + +CONFIG_PARSER_PROTOTYPE(config_parse_image_policy); +int parse_image_policy_argument(const char *s, ImagePolicy **policy); diff --git a/src/shared/import-util.c b/src/shared/import-util.c new file mode 100644 index 0000000..9057b78 --- /dev/null +++ b/src/shared/import-util.c @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "chattr-util.h" +#include "errno-util.h" +#include "import-util.h" +#include "log.h" +#include "macro.h" +#include "nulstr-util.h" +#include "path-util.h" +#include "string-table.h" +#include "string-util.h" + +static const char *skip_protocol_and_hostname(const char *url) { + const char *d; + size_t n; + + /* A very very lenient implementation of RFC3986 Section 3.2 */ + + /* Find colon separating protocol and hostname */ + d = strchr(url, 
':'); + if (!d || url == d) + return NULL; + d++; + + /* Skip slashes after colon */ + d += strspn(d, "/"); + + /* Skip everything till next slash or end */ + n = strcspn(d, "/?#"); + if (n == 0) + return NULL; + + return d + n; +} + +int import_url_last_component( + const char *url, + char **ret) { + + const char *e, *p, *h; + + /* This extracts the last path component of the specified URI, i.e. the last non-empty substrings + * between two "/" characters. This ignores "Query" and "Fragment" suffixes (as per RFC3986). */ + + h = skip_protocol_and_hostname(url); + if (!h) + return -EINVAL; + + e = h + strcspn(h, "?#"); /* Cut off "Query" and "Fragment" */ + + while (e > h && e[-1] == '/') /* Eat trailing slashes */ + e--; + + p = e; + while (p > h && p[-1] != '/') /* Find component before that */ + p--; + + if (e <= p) /* Empty component? */ + return -EADDRNOTAVAIL; + + if (ret) { + char *s; + + s = strndup(p, e - p); + if (!s) + return -ENOMEM; + + *ret = s; + } + + return 0; +} + +int import_url_change_suffix( + const char *url, + size_t n_drop_components, + const char *suffix, + char **ret) { + + const char *e, *h; + char *s; + + assert(url); + assert(ret); + + /* This drops the specified number of path components of the specified URI, i.e. the specified number + * of non-empty substring between two "/" characters from the end of the string, and then append the + * specified suffix instead. Before doing all this it chops off the "Query" and "Fragment" suffixes + * (they are *not* re-added to the final URL). Note that n_drop_components may be 0 (in which case the + * component are simply added to the end). The suffix may be specified as NULL or empty string in + * which case nothing is appended, only the specified number of components chopped off. Note that the + * function may be called with n_drop_components == 0 and suffix == NULL, in which case the "Query" + * and "Fragment" is chopped off, and ensured the URL ends in a single "/", and that's it. 
*/ + + h = skip_protocol_and_hostname(url); + if (!h) + return -EINVAL; + + e = h + strcspn(h, "?#"); /* Cut off "Query" and "Fragment" */ + + while (e > h && e[-1] == '/') /* Eat trailing slashes */ + e--; + + /* Drop the specified number of components from the end. Note that this is pretty lenient: if there + * are less component we silently drop those and then append the suffix to the top. */ + while (n_drop_components > 0) { + while (e > h && e[-1] != '/') /* Eat last word (we don't mind if empty) */ + e--; + + while (e > h && e[-1] == '/') /* Eat slashes before the last word */ + e--; + + n_drop_components--; + } + + s = new(char, (e - url) + 1 + strlen_ptr(suffix) + 1); + if (!s) + return -ENOMEM; + + strcpy(stpcpy(mempcpy(s, url, e - url), "/"), strempty(suffix)); + *ret = s; + return 0; +} + +static const char* const import_verify_table[_IMPORT_VERIFY_MAX] = { + [IMPORT_VERIFY_NO] = "no", + [IMPORT_VERIFY_CHECKSUM] = "checksum", + [IMPORT_VERIFY_SIGNATURE] = "signature", +}; + +DEFINE_STRING_TABLE_LOOKUP(import_verify, ImportVerify); + +int tar_strip_suffixes(const char *name, char **ret) { + const char *e; + char *s; + + e = endswith(name, ".tar"); + if (!e) + e = endswith(name, ".tar.xz"); + if (!e) + e = endswith(name, ".tar.gz"); + if (!e) + e = endswith(name, ".tar.bz2"); + if (!e) + e = endswith(name, ".tgz"); + if (!e) + e = strchr(name, 0); + + if (e <= name) + return -EINVAL; + + s = strndup(name, e - name); + if (!s) + return -ENOMEM; + + *ret = s; + return 0; +} + +int raw_strip_suffixes(const char *p, char **ret) { + + static const char suffixes[] = + ".xz\0" + ".gz\0" + ".bz2\0" + ".sysext.raw\0" + ".confext.raw\0" + ".raw\0" + ".qcow2\0" + ".img\0" + ".bin\0"; + + _cleanup_free_ char *q = NULL; + + q = strdup(p); + if (!q) + return -ENOMEM; + + for (;;) { + bool changed = false; + + NULSTR_FOREACH(sfx, suffixes) { + char *e; + + e = endswith(q, sfx); + if (e) { + *e = 0; + changed = true; + } + } + + if (!changed) + break; + } + + *ret = 
TAKE_PTR(q); + + return 0; +} + +int import_assign_pool_quota_and_warn(const char *path) { + int r; + + assert(path); + + r = btrfs_subvol_auto_qgroup(path, 0, true); + if (r == -ENOTTY) { + log_debug_errno(r, "Failed to set up quota hierarchy for %s, as directory is not on btrfs or not a subvolume. Ignoring.", path); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to set up default quota hierarchy for %s: %m", path); + if (r > 0) + log_debug("Set up default quota hierarchy for %s.", path); + + return 0; +} + +int import_set_nocow_and_log(int fd, const char *path) { + int r; + + r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + if (r < 0) + return log_full_errno( + ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to set file attributes on %s: %m", path); + + return 0; +} diff --git a/src/shared/import-util.h b/src/shared/import-util.h new file mode 100644 index 0000000..3b2425b --- /dev/null +++ b/src/shared/import-util.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +typedef enum ImportVerify { + IMPORT_VERIFY_NO, + IMPORT_VERIFY_CHECKSUM, + IMPORT_VERIFY_SIGNATURE, + _IMPORT_VERIFY_MAX, + _IMPORT_VERIFY_INVALID = -EINVAL, +} ImportVerify; + +int import_url_last_component(const char *url, char **ret); + +int import_url_change_suffix(const char *url, size_t n_drop_components, const char *suffix, char **ret); + +static inline int import_url_change_last_component(const char *url, const char *suffix, char **ret) { + return import_url_change_suffix(url, 1, suffix, ret); +} + +static inline int import_url_append_component(const char *url, const char *suffix, char **ret) { + return import_url_change_suffix(url, 0, suffix, ret); +} + +const char* import_verify_to_string(ImportVerify v) _const_; +ImportVerify import_verify_from_string(const char *s) _pure_; + +int tar_strip_suffixes(const char *name, char **ret); +int raw_strip_suffixes(const char *name, char **ret); 
+ +int import_assign_pool_quota_and_warn(const char *path); + +int import_set_nocow_and_log(int fd, const char *path); diff --git a/src/shared/in-addr-prefix-util.c b/src/shared/in-addr-prefix-util.c new file mode 100644 index 0000000..7c0033d --- /dev/null +++ b/src/shared/in-addr-prefix-util.c @@ -0,0 +1,325 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "extract-word.h" +#include "hostname-util.h" +#include "in-addr-prefix-util.h" +#include "string-util.h" + +/* 0.0.0.0/0 */ +#define IN_ADDR_PREFIX_IPV4_ANY ((struct in_addr_prefix) { .family = AF_INET }) +/* ::/0 */ +#define IN_ADDR_PREFIX_IPV6_ANY ((struct in_addr_prefix) { .family = AF_INET6 }) +/* 127.0.0.0/8 */ +#define IN_ADDR_PREFIX_IPV4_LOCALHOST \ + ((struct in_addr_prefix) { \ + .family = AF_INET, \ + .address.in.s_addr = htobe32(UINT32_C(127) << 24), \ + .prefixlen = 8, \ + }) +/* ::1/128 */ +#define IN_ADDR_PREFIX_IPV6_LOCALHOST \ + ((struct in_addr_prefix) { \ + .family = AF_INET6, \ + .address.in6 = IN6ADDR_LOOPBACK_INIT, \ + .prefixlen = 128, \ + }) +/* 169.254.0.0/16 */ +#define IN_ADDR_PREFIX_IPV4_LINKLOCAL \ + ((struct in_addr_prefix) { \ + .family = AF_INET, \ + .address.in.s_addr = htobe32((UINT32_C(169) << 24) | \ + (UINT32_C(254) << 16)), \ + .prefixlen = 16, \ + }) +/* fe80::/64 */ +#define IN_ADDR_PREFIX_IPV6_LINKLOCAL \ + ((struct in_addr_prefix) { \ + .family = AF_INET6, \ + .address.in6.s6_addr[0] = 0xfe, \ + .address.in6.s6_addr[1] = 0x80, \ + .prefixlen = 64, \ + }) +/* 224.0.0.0/4 */ +#define IN_ADDR_PREFIX_IPV4_MULTICAST \ + ((struct in_addr_prefix) { \ + .family = AF_INET, \ + .address.in.s_addr = htobe32((UINT32_C(224) << 24)), \ + .prefixlen = 4, \ + }) +/* ff00::/8 */ +#define IN_ADDR_PREFIX_IPV6_MULTICAST \ + ((struct in_addr_prefix) { \ + .family = AF_INET6, \ + .address.in6.s6_addr[0] = 0xff, \ + .prefixlen = 8, \ + }) + +static void in_addr_prefix_hash_func(const struct in_addr_prefix *a, struct siphash *state) { + assert(a); + 
assert(state); + + siphash24_compress(&a->family, sizeof(a->family), state); + siphash24_compress(&a->prefixlen, sizeof(a->prefixlen), state); + siphash24_compress(&a->address, FAMILY_ADDRESS_SIZE(a->family), state); +} + +static int in_addr_prefix_compare_func(const struct in_addr_prefix *x, const struct in_addr_prefix *y) { + int r; + + assert(x); + assert(y); + + r = CMP(x->family, y->family); + if (r != 0) + return r; + + r = CMP(x->prefixlen, y->prefixlen); + if (r != 0) + return r; + + return memcmp(&x->address, &y->address, FAMILY_ADDRESS_SIZE(x->family)); +} + +DEFINE_HASH_OPS(in_addr_prefix_hash_ops, struct in_addr_prefix, in_addr_prefix_hash_func, in_addr_prefix_compare_func); +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(in_addr_prefix_hash_ops_free, struct in_addr_prefix, in_addr_prefix_hash_func, in_addr_prefix_compare_func, free); + +int in_addr_prefix_add(Set **prefixes, const struct in_addr_prefix *prefix) { + struct in_addr_prefix *copy; + + assert(prefixes); + assert(prefix); + assert(IN_SET(prefix->family, AF_INET, AF_INET6)); + + copy = newdup(struct in_addr_prefix, prefix, 1); /* the set stores its own heap copy; the caller keeps ownership of 'prefix' */ + if (!copy) + return -ENOMEM; + + (void) in_addr_mask(copy->family, &copy->address, copy->prefixlen); /* NOTE(review): presumably clears the bits beyond prefixlen so equal prefixes hash and compare equal; confirm against in_addr_mask() */ + return set_ensure_consume(prefixes, &in_addr_prefix_hash_ops_free, copy); +} + +int in_addr_prefixes_reduce(Set *prefixes) { + uint32_t ipv4_prefixlen_bits = 0; + uint64_t ipv6_prefixlen_bits[128 / sizeof(uint64_t)] = {}; + uint8_t ipv4_prefixlens[32] = {}, ipv6_prefixlens[128] = {}; + bool ipv4_has_any = false, ipv6_has_any = false; + size_t ipv4_n_prefixlens = 0, ipv6_n_prefixlens = 0; + struct in_addr_prefix *p; + + SET_FOREACH(p, prefixes) + switch (p->family) { + case AF_INET: + assert(p->prefixlen <= 32); + if (p->prefixlen == 0) + ipv4_has_any = true; + else + ipv4_prefixlen_bits |= UINT32_C(1) << (p->prefixlen - 1); + break; + case AF_INET6: + assert(p->prefixlen <= 128); + if (p->prefixlen == 0) + ipv6_has_any = true; + else + ipv6_prefixlen_bits[(p->prefixlen - 1) /
sizeof(uint64_t)] |= + UINT64_C(1) << ((p->prefixlen - 1) % sizeof(uint64_t)); + break; + default: + assert_not_reached(); + } + + if (!ipv4_has_any) + for (size_t i = 0; i < 32; i++) + if (ipv4_prefixlen_bits & (UINT32_C(1) << i)) + ipv4_prefixlens[ipv4_n_prefixlens++] = i + 1; + + if (!ipv6_has_any) + for (size_t i = 0; i < 128; i++) + if (ipv6_prefixlen_bits[i / sizeof(uint64_t)] & + (UINT64_C(1) << (i % sizeof(uint64_t)))) + ipv6_prefixlens[ipv6_n_prefixlens++] = i + 1; + + SET_FOREACH(p, prefixes) { + uint8_t *prefixlens; + bool covered; + size_t *n; + + if (p->prefixlen == 0) + continue; + + switch (p->family) { + case AF_INET: + prefixlens = ipv4_prefixlens; + n = &ipv4_n_prefixlens; + covered = ipv4_has_any; + break; + case AF_INET6: + prefixlens = ipv6_prefixlens; + n = &ipv6_n_prefixlens; + covered = ipv6_has_any; + break; + default: + assert_not_reached(); + } + + for (size_t i = 0; i < *n; i++) { + struct in_addr_prefix tmp; + + if (covered) + break; + + if (prefixlens[i] >= p->prefixlen) + break; + + tmp = *p; + tmp.prefixlen = prefixlens[i]; + (void) in_addr_mask(tmp.family, &tmp.address, tmp.prefixlen); + + covered = set_contains(prefixes, &tmp); + } + + if (covered) + free(set_remove(prefixes, p)); + } + + return 0; +} + +int in_addr_prefixes_merge(Set **dest, Set *src) { + struct in_addr_prefix *p; + int r; + + assert(dest); + + SET_FOREACH(p, src) { + r = in_addr_prefix_add(dest, p); + if (r < 0) + return r; + } + + return 0; +} + +bool in_addr_prefixes_is_any(Set *prefixes) { + return + set_contains(prefixes, &IN_ADDR_PREFIX_IPV4_ANY) && + set_contains(prefixes, &IN_ADDR_PREFIX_IPV6_ANY); +} + +int config_parse_in_addr_prefixes( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Set **prefixes = ASSERT_PTR(data); + int r; + + assert(IN_SET(ltype, AF_UNSPEC, AF_INET, AF_INET6)); + + if 
(isempty(rvalue)) { + *prefixes = set_free(*prefixes); + return 0; + } + + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + return 0; + + if (streq(word, "any")) { + /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */ + + if (ltype != AF_INET6) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV4_ANY); + if (r < 0) + return log_oom(); + } + + if (ltype != AF_INET) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV6_ANY); + if (r < 0) + return log_oom(); + } + + } else if (is_localhost(word)) { + /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */ + + if (ltype != AF_INET6) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV4_LOCALHOST); + if (r < 0) + return log_oom(); + } + + if (ltype != AF_INET) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV6_LOCALHOST); + if (r < 0) + return log_oom(); + } + + } else if (streq(word, "link-local")) { + /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */ + + if (ltype != AF_INET6) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV4_LINKLOCAL); + if (r < 0) + return log_oom(); + } + + if (ltype != AF_INET) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV6_LINKLOCAL); + if (r < 0) + return log_oom(); + } + + } else if (streq(word, "multicast")) { + /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */ + + if (ltype != AF_INET6) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV4_MULTICAST); + if (r < 0) + return log_oom(); + } + + if (ltype != AF_INET) { + r = in_addr_prefix_add(prefixes, &IN_ADDR_PREFIX_IPV6_MULTICAST); + if (r < 0) + return log_oom(); + } + + } else { + struct in_addr_prefix a; + + if (ltype == AF_UNSPEC) + r = in_addr_prefix_from_string_auto(word, &a.family, &a.address, &a.prefixlen); + else { + 
a.family = ltype; + r = in_addr_prefix_from_string(word, a.family, &a.address, &a.prefixlen); + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid address prefix is specified in [%s] %s=, ignoring assignment: %s", + section, lvalue, word); + continue; + } + + r = in_addr_prefix_add(prefixes, &a); + if (r < 0) + return log_oom(); + } + } +} diff --git a/src/shared/in-addr-prefix-util.h b/src/shared/in-addr-prefix-util.h new file mode 100644 index 0000000..53aaad3 --- /dev/null +++ b/src/shared/in-addr-prefix-util.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "set.h" + +struct in_addr_prefix { + int family; + uint8_t prefixlen; + union in_addr_union address; +}; + +int in_addr_prefix_add(Set **prefixes, const struct in_addr_prefix *prefix); +int in_addr_prefixes_reduce(Set *prefixes); +int in_addr_prefixes_merge(Set **dest, Set *src); +/* Returns true if a set contains the two items necessary for "any" (0.0.0.0/0 and ::/0). */ +bool in_addr_prefixes_is_any(Set *prefixes); + +extern const struct hash_ops in_addr_prefix_hash_ops; +extern const struct hash_ops in_addr_prefix_hash_ops_free; + +CONFIG_PARSER_PROTOTYPE(config_parse_in_addr_prefixes); diff --git a/src/shared/initreq.h b/src/shared/initreq.h new file mode 100644 index 0000000..da9783c --- /dev/null +++ b/src/shared/initreq.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.0-or-later */ +/* + * initreq.h Interface to talk to init through /dev/initctl. + * + * Copyright (C) 1995-2004 Miquel van Smoorenburg + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * Version: @(#)initreq.h 1.28 31-Mar-2004 MvS + */ + +#pragma once + +#include + +#if defined(__FreeBSD_kernel__) +# define INIT_FIFO "/etc/.initctl" +#else +# define INIT_FIFO "/dev/initctl" +#endif + +#define INIT_MAGIC 0x03091969 +#define INIT_CMD_START 0 +#define INIT_CMD_RUNLVL 1 +#define INIT_CMD_POWERFAIL 2 +#define INIT_CMD_POWERFAILNOW 3 +#define INIT_CMD_POWEROK 4 +#define INIT_CMD_BSD 5 +#define INIT_CMD_SETENV 6 +#define INIT_CMD_UNSETENV 7 + +#define INIT_CMD_CHANGECONS 12345 + +#ifdef MAXHOSTNAMELEN +# define INITRQ_HLEN MAXHOSTNAMELEN +#else +# define INITRQ_HLEN 64 +#endif + +/* + * This is what BSD 4.4 uses when talking to init. + * Linux doesn't use this right now. + */ +struct init_request_bsd { + char gen_id[8]; /* Beats me.. telnetd uses "fe" */ + char tty_id[16]; /* Tty name minus /dev/tty */ + char host[INITRQ_HLEN]; /* Hostname */ + char term_type[16]; /* Terminal type */ + int signal; /* Signal to send */ + int pid; /* Process to send to */ + char exec_name[128]; /* Program to execute */ + char reserved[128]; /* For future expansion. */ +}; + +/* + * Because of legacy interfaces, "runlevel" and "sleeptime" + * aren't in a separate struct in the union. + * + * The weird sizes are because init expects the whole + * struct to be 384 bytes. 
+ */ +struct init_request { + int magic; /* Magic number */ + int cmd; /* What kind of request */ + int runlevel; /* Runlevel to change to */ + int sleeptime; /* Time between TERM and KILL */ + union { + struct init_request_bsd bsd; + char data[368]; + } i; +}; diff --git a/src/shared/install-file.c b/src/shared/install-file.c new file mode 100644 index 0000000..3b4d651 --- /dev/null +++ b/src/shared/install-file.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "btrfs-util.h" +#include "chattr-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "install-file.h" +#include "missing_syscall.h" +#include "rm-rf.h" +#include "sync-util.h" + +int fs_make_very_read_only(int fd) { + struct stat st; + int r; + + assert(fd >= 0); + + /* Tries to make the specified fd "comprehensively" read-only. Primary use case for this is OS images, + * i.e. either loopback files or larger directory hierarchies. Depending on the inode type and + * backing file system this means something different: + * + * 1. If the fd refers to a btrfs subvolume we'll mark it read-only as a whole + * 2. If the fd refers to any other directory we'll set the FS_IMMUTABLE_FL flag on it + * 3. If the fd refers to a regular file we'll drop the w bits. + * 4. If the fd refers to a block device, use BLKROSET to set read-only state + * + * You might wonder why not drop the x bits for directories. That's because we want to guarantee that + * everything "inside" the image remains largely the way it is, in case you mount it. And since the + * mode of the root dir of the image is pretty visible we don't want to modify it. btrfs subvol flags + * and the FS_IMMUTABLE_FL otoh are much less visible. Changing the mode of regular files should be + * OK though, since after all this is supposed to be used for disk images, i.e. the fs in the disk + * image doesn't make the mode of the loopback file it is stored in visible. 
*/ + + if (fstat(fd, &st) < 0) + return -errno; + + switch (st.st_mode & S_IFMT) { + + case S_IFDIR: + if (btrfs_might_be_subvol(&st)) { + r = btrfs_subvol_set_read_only_fd(fd, true); + if (r >= 0) + return 0; + + if (!ERRNO_IS_NOT_SUPPORTED(r) && r != -EINVAL) + return r; + } + + r = chattr_fd(fd, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL); + if (r < 0) + return r; + + break; + + case S_IFREG: + if ((st.st_mode & 0222) != 0) + if (fchmod(fd, st.st_mode & 07555) < 0) + return -errno; + + break; + + case S_IFBLK: { + int ro = 1; + + if (ioctl(fd, BLKROSET, &ro) < 0) + return -errno; + + break; + } + + default: + return -EBADFD; + } + + return 0; +} + +static int unlinkat_maybe_dir(int dirfd, const char *pathname) { + + /* Invokes unlinkat() for regular files first, and if this fails with EISDIR tries again with + * AT_REMOVEDIR */ + + if (unlinkat(dirfd, pathname, 0) < 0) { + if (errno != EISDIR) + return -errno; + + if (unlinkat(dirfd, pathname, AT_REMOVEDIR) < 0) + return -errno; + } + + return 0; +} + +int install_file(int source_atfd, const char *source_name, + int target_atfd, const char *target_name, + InstallFileFlags flags) { + + _cleanup_close_ int rofd = -EBADF; + int r; + + /* Moves a file or directory tree into place, with some bells and whistles: + * + * 1. Optionally syncs before/after to ensure file installation can be used as barrier + * 2. Optionally marks the file/directory read-only using fs_make_very_read_only() + * 3. Optionally operates in replacing or in non-replacing mode. + * 4. If it replaces will remove the old tree if needed. + */ + + assert(source_atfd >= 0 || source_atfd == AT_FDCWD); + assert(source_name); + assert(target_atfd >= 0 || target_atfd == AT_FDCWD); + + /* If target_name is specified as NULL no renaming takes place. Instead it is assumed the file is + * already in place, and only the syncing/read-only marking shall be applied. 
Note that with + * target_name=NULL and flags=0 this call is a NOP */ + + if ((flags & (INSTALL_FSYNC|INSTALL_FSYNC_FULL|INSTALL_SYNCFS|INSTALL_READ_ONLY)) != 0) { + _cleanup_close_ int pfd = -EBADF; + struct stat st; + + /* Open an O_PATH fd for the source if we need to sync things or mark things read only. */ + + pfd = openat(source_atfd, source_name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (pfd < 0) + return -errno; + + if (fstat(pfd, &st) < 0) + return -errno; + + switch (st.st_mode & S_IFMT) { + + case S_IFREG: { + _cleanup_close_ int regfd = -EBADF; + + regfd = fd_reopen(pfd, O_RDONLY|O_CLOEXEC); + if (regfd < 0) + return regfd; + + if ((flags & (INSTALL_FSYNC_FULL|INSTALL_SYNCFS)) != 0) { + /* If this is just a regular file (as oppose to a fully populated directory) + * let's downgrade INSTALL_SYNCFS to INSTALL_FSYNC_FULL, after all this is + * going to be a single inode we install */ + r = fsync_full(regfd); + if (r < 0) + return r; + } else if (flags & INSTALL_FSYNC) { + if (fsync(regfd) < 0) + return -errno; + } + + if (flags & INSTALL_READ_ONLY) + rofd = TAKE_FD(regfd); + + break; + } + + case S_IFDIR: { + _cleanup_close_ int dfd = -EBADF; + + dfd = fd_reopen(pfd, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + if (dfd < 0) + return dfd; + + if (flags & INSTALL_SYNCFS) { + if (syncfs(dfd) < 0) + return -errno; + } else if (flags & INSTALL_FSYNC_FULL) { + r = fsync_full(dfd); + if (r < 0) + return r; + } else if (flags & INSTALL_FSYNC) { + if (fsync(dfd) < 0) + return -errno; + } + + if (flags & INSTALL_READ_ONLY) + rofd = TAKE_FD(dfd); + + break; + } + + default: + /* Other inodes: char/block device inodes, fifos, symlinks, sockets don't need + * syncing themselves, as they only exist in the directory, and have no contents on + * disk */ + + if (target_name && (flags & (INSTALL_FSYNC_FULL|INSTALL_SYNCFS)) != 0) { + r = fsync_directory_of_file(pfd); + if (r < 0) + return r; + } + + break; + } + } + + if (target_name) { + /* Rename the file */ + + if (flags & 
INSTALL_REPLACE) { + /* First, try a simple renamat(), maybe that's enough */ + if (renameat(source_atfd, source_name, target_atfd, target_name) < 0) { + _cleanup_close_ int dfd = -EBADF; + + if (!IN_SET(errno, EEXIST, ENOTDIR, ENOTEMPTY, EISDIR, EBUSY)) + return -errno; + + /* Hmm, the target apparently existed already. Let's try to use + * RENAME_EXCHANGE. But let's first open the inode if it's a directory, so + * that we can later remove its contents if it's a directory. Why do this + * before the rename()? Mostly because if we have trouble opening the thing + * we want to know before we start actually modifying the file system. */ + + dfd = openat(target_atfd, target_name, O_RDONLY|O_DIRECTORY|O_CLOEXEC, 0); + if (dfd < 0 && errno != ENOTDIR) + return -errno; + + if (renameat2(source_atfd, source_name, target_atfd, target_name, RENAME_EXCHANGE) < 0) { + + if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) + return -errno; + + /* The exchange didn't work, let's remove the target first, and try again */ + + if (dfd >= 0) + (void) rm_rf_children(TAKE_FD(dfd), REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_CHMOD, NULL); + + r = unlinkat_maybe_dir(target_atfd, target_name); + if (r < 0) + return log_debug_errno(r, "Failed to remove target directory: %m"); + + if (renameat(source_atfd, source_name, target_atfd, target_name) < 0) + return -errno; + } else { + /* The exchange worked, hence let's remove the source (i.e. 
the old target) */ + if (dfd >= 0) + (void) rm_rf_children(TAKE_FD(dfd), REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_CHMOD, NULL); + + r = unlinkat_maybe_dir(source_atfd, source_name); + if (r < 0) + return log_debug_errno(r, "Failed to remove replaced target directory: %m"); + } + } + } else { + r = rename_noreplace(source_atfd, source_name, target_atfd, target_name); + if (r < 0) + return r; + } + } + + if (rofd >= 0) { + r = fs_make_very_read_only(rofd); + if (r < 0) + return r; + } + + if ((flags & (INSTALL_FSYNC_FULL|INSTALL_SYNCFS)) != 0) { + if (target_name) + r = fsync_parent_at(target_atfd, target_name); + else + r = fsync_parent_at(source_atfd, source_name); + if (r < 0) + return r; + } + + return 0; +} diff --git a/src/shared/install-file.h b/src/shared/install-file.h new file mode 100644 index 0000000..c37254f --- /dev/null +++ b/src/shared/install-file.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int fs_make_very_read_only(int fd); + +typedef enum InstallFileFlags { + INSTALL_REPLACE = 1 << 0, /* Replace an existing inode */ + INSTALL_READ_ONLY = 1 << 1, /* Call fs_make_very_read_only() to make the inode comprehensively read-only */ + INSTALL_FSYNC = 1 << 2, /* fsync() file contents before moving file in */ + INSTALL_FSYNC_FULL = 1 << 3, /* like INSTALL_FSYNC, but also fsync() parent dir before+after moving file in */ + INSTALL_SYNCFS = 1 << 4, /* syncfs() before moving file in, fsync() parent dir after moving file in */ +} InstallFileFlags; + +int install_file(int source_atfd, const char *source_name, int target_atfd, const char *target_name, InstallFileFlags flags); diff --git a/src/shared/install-printf.c b/src/shared/install-printf.c new file mode 100644 index 0000000..3cc7093 --- /dev/null +++ b/src/shared/install-printf.c @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "format-util.h" +#include "install-printf.h" +#include "install.h" +#include 
"macro.h" +#include "specifier.h" +#include "string-util.h" +#include "unit-name.h" +#include "user-util.h" + +static int specifier_prefix_and_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const InstallInfo *i = ASSERT_PTR(userdata); + _cleanup_free_ char *prefix = NULL; + int r; + + r = unit_name_to_prefix_and_instance(i->name, &prefix); + if (r < 0) + return r; + + if (endswith(prefix, "@") && i->default_instance) { + char *ans; + + ans = strjoin(prefix, i->default_instance); + if (!ans) + return -ENOMEM; + *ret = ans; + } else + *ret = TAKE_PTR(prefix); + + return 0; +} + +static int specifier_name(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const InstallInfo *i = ASSERT_PTR(userdata); + char *ans; + + if (unit_name_is_valid(i->name, UNIT_NAME_TEMPLATE) && i->default_instance) + return unit_name_replace_instance(i->name, i->default_instance, ret); + + ans = strdup(i->name); + if (!ans) + return -ENOMEM; + *ret = ans; + return 0; +} + +static int specifier_prefix(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const InstallInfo *i = ASSERT_PTR(userdata); + + return unit_name_to_prefix(i->name, ret); +} + +static int specifier_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const InstallInfo *i = ASSERT_PTR(userdata); + char *instance; + int r; + + r = unit_name_to_instance(i->name, &instance); + if (r < 0) + return r; + + if (isempty(instance)) { + r = free_and_strdup(&instance, strempty(i->default_instance)); + if (r < 0) + return r; + } + + *ret = instance; + return 0; +} + +static int specifier_last_component(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + _cleanup_free_ char *prefix = NULL; + char *dash; + int r; + + assert(ret); + + r = specifier_prefix(specifier, data, root, userdata, &prefix); + if (r < 0) + return r; + + dash 
= strrchr(prefix, '-'); + if (dash) { + dash = strdup(dash + 1); + if (!dash) + return -ENOMEM; + *ret = dash; + } else + *ret = TAKE_PTR(prefix); + + return 0; +} + +int install_name_printf( + RuntimeScope scope, + const InstallInfo *info, + const char *format, + char **ret) { + /* This is similar to unit_name_printf() */ + + const Specifier table[] = { + { 'i', specifier_instance, NULL }, + { 'j', specifier_last_component, NULL }, + { 'n', specifier_name, NULL }, + { 'N', specifier_prefix_and_instance, NULL }, + { 'p', specifier_prefix, NULL }, + + COMMON_SYSTEM_SPECIFIERS, + + COMMON_CREDS_SPECIFIERS(scope), + {} + }; + + assert(info); + assert(format); + assert(ret); + + return specifier_printf(format, UNIT_NAME_MAX, table, info->root, info, ret); +} diff --git a/src/shared/install-printf.h b/src/shared/install-printf.h new file mode 100644 index 0000000..8c7842b --- /dev/null +++ b/src/shared/install-printf.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "install.h" +#include "unit-name.h" + +int install_name_printf( + RuntimeScope scope, + const InstallInfo *info, + const char *format, + char **ret); diff --git a/src/shared/install.c b/src/shared/install.c new file mode 100644 index 0000000..0f4dab4 --- /dev/null +++ b/src/shared/install.c @@ -0,0 +1,3760 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "constants.h" +#include "dirent-util.h" +#include "errno-list.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hashmap.h" +#include "install-printf.h" +#include "install.h" +#include "locale-util.h" +#include "log.h" +#include "macro.h" +#include "mkdir-label.h" +#include "path-lookup.h" +#include "path-util.h" +#include "rm-rf.h" +#include "set.h" +#include 
"special.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "unit-file.h" + +#define UNIT_FILE_FOLLOW_SYMLINK_MAX 64 + +typedef enum SearchFlags { + SEARCH_LOAD = 1 << 0, + SEARCH_FOLLOW_CONFIG_SYMLINKS = 1 << 1, + SEARCH_DROPIN = 1 << 2, +} SearchFlags; + +typedef struct { + RuntimeScope scope; + OrderedHashmap *will_process; + OrderedHashmap *have_processed; +} InstallContext; + +struct UnitFilePresetRule { + char *pattern; + PresetAction action; + char **instances; +}; + +/* NB! strings use past tense. */ +static const char *const preset_action_past_tense_table[_PRESET_ACTION_MAX] = { + [PRESET_UNKNOWN] = "unknown", + [PRESET_ENABLE] = "enabled", + [PRESET_DISABLE] = "disabled", + [PRESET_IGNORE] = "ignored", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(preset_action_past_tense, PresetAction); + +static bool install_info_has_rules(const InstallInfo *i) { + assert(i); + + return !strv_isempty(i->aliases) || + !strv_isempty(i->wanted_by) || + !strv_isempty(i->required_by) || + !strv_isempty(i->upheld_by); +} + +static bool install_info_has_also(const InstallInfo *i) { + assert(i); + + return !strv_isempty(i->also); +} + +static void unit_file_preset_rule_done(UnitFilePresetRule *rule) { + assert(rule); + + free(rule->pattern); + strv_free(rule->instances); +} + +void unit_file_presets_done(UnitFilePresets *p) { + if (!p) + return; + + FOREACH_ARRAY(rule, p->rules, p->n_rules) + unit_file_preset_rule_done(rule); + + free(p->rules); + p->n_rules = 0; +} + +static const char *const install_mode_table[_INSTALL_MODE_MAX] = { + [INSTALL_MODE_REGULAR] = "regular", + [INSTALL_MODE_LINKED] = "linked", + [INSTALL_MODE_ALIAS] = "alias", + [INSTALL_MODE_MASKED] = "masked", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(install_mode, InstallMode); + +static int in_search_path(const LookupPaths *lp, const char *path) { + _cleanup_free_ char *parent = NULL; + int r; + + /* Check if 'path' is in lp->search_path. 
/* Strips the 'root_dir' prefix from 'path'. Returns a pointer into 'path' that begins with a slash,
 * 'path' itself if no root directory is given, or NULL if 'path' is not located below 'root_dir'. */
static const char* skip_root(const char *root_dir, const char *path) {
        const char *rest;

        assert(path);

        if (!root_dir)
                return path;

        rest = path_startswith(path, root_dir);
        if (!rest)
                return NULL;

        if (rest[0] == '/')
                return rest;

        /* The remainder does not start with a slash. Step back onto the slash right in front of it, so
         * that the result does; if there is no such slash, give up. */
        if (rest == path || rest[-1] != '/')
                return NULL;

        return rest - 1;
}
couldn't discern configuration from transient or generated units */ + + if (check_parent) { + r = path_extract_directory(path, &parent); + if (r < 0) + return r; + + path = parent; + } + + return path_equal_ptr(path, lp->persistent_config) || + path_equal_ptr(path, lp->runtime_config); +} + +static int path_is_runtime(const LookupPaths *lp, const char *path, bool check_parent) { + _cleanup_free_ char *parent = NULL; + const char *rpath; + int r; + + assert(lp); + assert(path); + + /* Everything in /run is considered runtime. On top of that we also add + * explicit checks for the various runtime directories, as safety net. */ + + rpath = skip_root(lp->root_dir, path); + if (rpath && path_startswith(rpath, "/run")) + return true; + + if (check_parent) { + r = path_extract_directory(path, &parent); + if (r < 0) + return r; + + path = parent; + } + + return path_equal_ptr(path, lp->runtime_config) || + path_equal_ptr(path, lp->generator) || + path_equal_ptr(path, lp->generator_early) || + path_equal_ptr(path, lp->generator_late) || + path_equal_ptr(path, lp->transient) || + path_equal_ptr(path, lp->runtime_control); +} + +static int path_is_vendor_or_generator(const LookupPaths *lp, const char *path) { + const char *rpath; + + assert(lp); + assert(path); + + rpath = skip_root(lp->root_dir, path); + if (!rpath) + return 0; + + if (path_startswith(rpath, "/usr")) + return true; + + if (path_is_generator(lp, rpath)) + return true; + + return path_equal(rpath, SYSTEM_DATA_UNIT_DIR); +} + +static const char* config_path_from_flags(const LookupPaths *lp, UnitFileFlags flags) { + assert(lp); + + if (FLAGS_SET(flags, UNIT_FILE_PORTABLE)) + return FLAGS_SET(flags, UNIT_FILE_RUNTIME) ? lp->runtime_attached : lp->persistent_attached; + else + return FLAGS_SET(flags, UNIT_FILE_RUNTIME) ? 
lp->runtime_config : lp->persistent_config; +} + +InstallChangeType install_changes_add( + InstallChange **changes, + size_t *n_changes, + InstallChangeType type, /* INSTALL_CHANGE_SYMLINK, _UNLINK, _IS_MASKED, _IS_DANGLING, … if positive or errno if negative */ + const char *path, + const char *source) { + + _cleanup_free_ char *p = NULL, *s = NULL; + InstallChange *c; + int r; + + assert(!changes == !n_changes); + assert(INSTALL_CHANGE_TYPE_VALID(type)); + + /* Message formatting requires to be set. */ + assert(path); + + /* Register a change or error. Note that the return value may be the error + * that was passed in, or -ENOMEM generated internally. */ + + if (!changes) + return type; + + c = reallocarray(*changes, *n_changes + 1, sizeof(InstallChange)); + if (!c) + return -ENOMEM; + *changes = c; + + r = path_simplify_alloc(path, &p); + if (r < 0) + return r; + + r = path_simplify_alloc(source, &s); + if (r < 0) + return r; + + c[(*n_changes)++] = (InstallChange) { + .type = type, + .path = TAKE_PTR(p), + .source = TAKE_PTR(s), + }; + + return type; +} + +void install_changes_free(InstallChange *changes, size_t n_changes) { + assert(changes || n_changes == 0); + + for (size_t i = 0; i < n_changes; i++) { + free(changes[i].path); + free(changes[i].source); + } + + free(changes); +} + +void install_changes_dump(int r, const char *verb, const InstallChange *changes, size_t n_changes, bool quiet) { + int err = 0; + + assert(changes || n_changes == 0); + /* If verb is not specified, errors are not allowed! */ + assert(verb || r >= 0); + + for (size_t i = 0; i < n_changes; i++) { + if (changes[i].type < 0) + assert(verb); + assert(changes[i].path); + + /* When making changes here, make sure to also change install_error() in dbus-manager.c. 
*/ + + switch (changes[i].type) { + case INSTALL_CHANGE_SYMLINK: + if (!quiet) + log_info("Created symlink %s %s %s.", + changes[i].path, + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + changes[i].source); + break; + case INSTALL_CHANGE_UNLINK: + if (!quiet) + log_info("Removed \"%s\".", changes[i].path); + break; + case INSTALL_CHANGE_IS_MASKED: + if (!quiet) + log_info("Unit %s is masked, ignoring.", changes[i].path); + break; + case INSTALL_CHANGE_IS_MASKED_GENERATOR: + if (!quiet) + log_info("Unit %s is masked via a generator and cannot be unmasked.", + changes[i].path); + break; + case INSTALL_CHANGE_IS_DANGLING: + if (!quiet) + log_info("Unit %s is an alias to a unit that is not present, ignoring.", + changes[i].path); + break; + case INSTALL_CHANGE_DESTINATION_NOT_PRESENT: + if (!quiet) + log_warning("Unit %s is added as a dependency to a non-existent unit %s.", + changes[i].source, changes[i].path); + break; + case INSTALL_CHANGE_AUXILIARY_FAILED: + if (!quiet) + log_warning("Failed to enable auxiliary unit %s, ignoring.", changes[i].path); + break; + case -EEXIST: + if (changes[i].source) + err = log_error_errno(changes[i].type, + "Failed to %s unit, file \"%s\" already exists and is a symlink to \"%s\".", + verb, changes[i].path, changes[i].source); + else + err = log_error_errno(changes[i].type, + "Failed to %s unit, file \"%s\" already exists.", + verb, changes[i].path); + break; + case -ERFKILL: + err = log_error_errno(changes[i].type, "Failed to %s unit, unit %s is masked.", + verb, changes[i].path); + break; + case -EADDRNOTAVAIL: + err = log_error_errno(changes[i].type, "Failed to %s unit, unit %s is transient or generated.", + verb, changes[i].path); + break; + case -ETXTBSY: + err = log_error_errno(changes[i].type, "Failed to %s unit, file %s is under the systemd unit hierarchy already.", + verb, changes[i].path); + break; + case -EBADSLT: + err = log_error_errno(changes[i].type, "Failed to %s unit, invalid specifier in \"%s\".", + verb, 
changes[i].path); + break; + case -EIDRM: + err = log_error_errno(changes[i].type, "Failed to %s %s, destination unit %s is a non-template unit.", + verb, changes[i].source, changes[i].path); + break; + case -EUCLEAN: + err = log_error_errno(changes[i].type, + "Failed to %s unit, \"%s\" is not a valid unit name.", + verb, changes[i].path); + break; + case -ELOOP: + err = log_error_errno(changes[i].type, "Failed to %s unit, refusing to operate on linked unit file %s.", + verb, changes[i].path); + break; + case -EXDEV: + if (changes[i].source) + err = log_error_errno(changes[i].type, "Failed to %s unit, cannot alias %s as %s.", + verb, changes[i].source, changes[i].path); + else + err = log_error_errno(changes[i].type, "Failed to %s unit, invalid unit reference \"%s\".", + verb, changes[i].path); + break; + case -ENOENT: + err = log_error_errno(changes[i].type, "Failed to %s unit, unit %s does not exist.", + verb, changes[i].path); + break; + case -EUNATCH: + err = log_error_errno(changes[i].type, "Failed to %s unit, cannot resolve specifiers in \"%s\".", + verb, changes[i].path); + break; + default: + assert(changes[i].type < 0); + err = log_error_errno(changes[i].type, "Failed to %s unit, file \"%s\": %m", + verb, changes[i].path); + } + } + + if (r < 0 && err >= 0) + log_error_errno(r, "Failed to %s: %m.", verb); +} + +/** + * Checks if two symlink targets (starting from src) are equivalent as far as the unit enablement logic is + * concerned. If the target is in the unit search path, then anything with the same name is equivalent. + * If outside the unit search path, paths must be identical. + */ +static int chroot_unit_symlinks_equivalent( + const LookupPaths *lp, + const char *src, + const char *target_a, + const char *target_b) { + + assert(lp); + assert(src); + assert(target_a); + assert(target_b); + + /* This will give incorrect results if the paths are relative and go outside + * of the chroot. False negatives are possible. 
*/ + + const char *root = lp->root_dir ?: "/"; + _cleanup_free_ char *dirname = NULL; + int r; + + if (!path_is_absolute(target_a) || !path_is_absolute(target_b)) { + r = path_extract_directory(src, &dirname); + if (r < 0) + return r; + } + + _cleanup_free_ char *a = path_join(path_is_absolute(target_a) ? root : dirname, target_a); + _cleanup_free_ char *b = path_join(path_is_absolute(target_b) ? root : dirname, target_b); + if (!a || !b) + return log_oom(); + + r = path_equal_or_inode_same(a, b, 0); + if (r != 0) + return r; + + _cleanup_free_ char *a_name = NULL, *b_name = NULL; + r = path_extract_filename(a, &a_name); + if (r < 0) + return r; + r = path_extract_filename(b, &b_name); + if (r < 0) + return r; + + return streq(a_name, b_name) && + path_startswith_strv(a, lp->search_path) && + path_startswith_strv(b, lp->search_path); +} + +static int create_symlink( + const LookupPaths *lp, + const char *old_path, + const char *new_path, + bool force, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_free_ char *dest = NULL; + const char *rp; + int r; + + assert(old_path); + assert(new_path); + + rp = skip_root(lp->root_dir, old_path); + if (rp) + old_path = rp; + + /* Actually create a symlink, and remember that we did. This function is + * smart enough to check if there's already a valid symlink in place. + * + * Returns 1 if a symlink was created or already exists and points to the + * right place, or negative on error. 
+ */ + + (void) mkdir_parents_label(new_path, 0755); + + if (symlink(old_path, new_path) >= 0) { + r = install_changes_add(changes, n_changes, INSTALL_CHANGE_SYMLINK, new_path, old_path); + if (r < 0) + return r; + return 1; + } + + if (errno != EEXIST) + return install_changes_add(changes, n_changes, -errno, new_path, NULL); + + r = readlink_malloc(new_path, &dest); + if (r < 0) { + /* translate EINVAL (non-symlink exists) to EEXIST */ + if (r == -EINVAL) + r = -EEXIST; + + return install_changes_add(changes, n_changes, r, new_path, NULL); + } + + if (chroot_unit_symlinks_equivalent(lp, new_path, dest, old_path)) { + log_debug("Symlink %s %s %s already exists", + new_path, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), dest); + return 1; + } + + if (!force) + return install_changes_add(changes, n_changes, -EEXIST, new_path, dest); + + r = symlink_atomic(old_path, new_path); + if (r < 0) + return install_changes_add(changes, n_changes, r, new_path, NULL); + + r = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, new_path, NULL); + if (r < 0) + return r; + r = install_changes_add(changes, n_changes, INSTALL_CHANGE_SYMLINK, new_path, old_path); + if (r < 0) + return r; + + return 1; +} + +static int mark_symlink_for_removal( + Set **remove_symlinks_to, + const char *p) { + + char *n; + int r; + + assert(p); + + r = set_ensure_allocated(remove_symlinks_to, &path_hash_ops); + if (r < 0) + return r; + + r = path_simplify_alloc(p, &n); + if (r < 0) + return r; + + r = set_consume(*remove_symlinks_to, n); + if (r == -EEXIST) + return 0; + if (r < 0) + return r; + + return 1; +} + +static int remove_marked_symlinks_fd( + Set *remove_symlinks_to, + int fd, + const char *path, + const char *config_path, + const LookupPaths *lp, + bool dry_run, + bool *restart, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + + assert(remove_symlinks_to); + assert(fd >= 0); + assert(path); + assert(config_path); + assert(lp); + 
assert(restart); + + d = fdopendir(fd); + if (!d) { + safe_close(fd); + return -errno; + } + + rewinddir(d); + + FOREACH_DIRENT(de, d, return -errno) + + if (de->d_type == DT_DIR) { + _cleanup_free_ char *p = NULL; + int nfd, q; + + nfd = openat(fd, de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + if (nfd < 0) { + if (errno == ENOENT) + continue; + + if (r == 0) + r = -errno; + continue; + } + + p = path_make_absolute(de->d_name, path); + if (!p) { + safe_close(nfd); + return -ENOMEM; + } + + /* This will close nfd, regardless whether it succeeds or not */ + q = remove_marked_symlinks_fd(remove_symlinks_to, nfd, p, config_path, lp, dry_run, restart, changes, n_changes); + if (q < 0 && r == 0) + r = q; + + } else if (de->d_type == DT_LNK) { + _cleanup_free_ char *p = NULL; + bool found; + int q; + + if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY)) + continue; + + p = path_make_absolute(de->d_name, path); + if (!p) + return -ENOMEM; + path_simplify(p); + + /* We remove all links pointing to a file or path that is marked, as well as all + * files sharing the same name as a file that is marked, and files sharing the same + * name after the instance has been removed. Do path chasing only if we don't already + * know that we want to remove the symlink. 
*/ + found = set_contains(remove_symlinks_to, de->d_name); + + if (!found) { + _cleanup_free_ char *template = NULL; + + q = unit_name_template(de->d_name, &template); + if (q < 0 && q != -EINVAL) + return q; + if (q >= 0) + found = set_contains(remove_symlinks_to, template); + } + + if (!found) { + _cleanup_free_ char *dest = NULL; + + q = chase(p, lp->root_dir, CHASE_NONEXISTENT, &dest, NULL); + if (q == -ENOENT) + continue; + if (q < 0) { + log_debug_errno(q, "Failed to resolve symlink \"%s\": %m", p); + install_changes_add(changes, n_changes, q, p, NULL); + + if (r == 0) + r = q; + continue; + } + + found = set_contains(remove_symlinks_to, dest) || + set_contains(remove_symlinks_to, basename(dest)); + + } + + + if (!found) + continue; + + if (!dry_run) { + if (unlinkat(fd, de->d_name, 0) < 0 && errno != ENOENT) { + if (r == 0) + r = -errno; + install_changes_add(changes, n_changes, -errno, p, NULL); + continue; + } + + (void) rmdir_parents(p, config_path); + } + + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, p, NULL); + if (q < 0) + return q; + + /* Now, remember the full path (but with the root prefix removed) of + * the symlink we just removed, and remove any symlinks to it, too. */ + + const char *rp = skip_root(lp->root_dir, p); + q = mark_symlink_for_removal(&remove_symlinks_to, rp ?: p); + if (q < 0) + return q; + if (q > 0 && !dry_run) + *restart = true; + } + + return r; +} + +static int remove_marked_symlinks( + Set *remove_symlinks_to, + const char *config_path, + const LookupPaths *lp, + bool dry_run, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_close_ int fd = -EBADF; + bool restart; + int r = 0; + + assert(config_path); + assert(lp); + + if (set_size(remove_symlinks_to) <= 0) + return 0; + + fd = open(config_path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return errno == ENOENT ? 
0 : -errno; + + do { + int q, cfd; + restart = false; + + cfd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (cfd < 0) + return -errno; + + /* This takes possession of cfd and closes it */ + q = remove_marked_symlinks_fd(remove_symlinks_to, cfd, config_path, config_path, lp, dry_run, &restart, changes, n_changes); + if (r == 0) + r = q; + } while (restart); + + return r; +} +/* Returns true if name is the name of unit i, one of i->aliases or, for a template unit with a default instance, the template instantiated with that instance. Returns false if nothing matches and a negative errno if instantiating the name fails with anything other than -EINVAL. */ +static int is_symlink_with_known_name(const InstallInfo *i, const char *name) { + int r; + + if (streq(name, i->name)) + return true; + + if (strv_contains(i->aliases, name)) + return true; + + /* Look for template symlink matching DefaultInstance */ + if (i->default_instance && unit_name_is_valid(i->name, UNIT_NAME_TEMPLATE)) { + _cleanup_free_ char *s = NULL; + + r = unit_name_replace_instance(i->name, i->default_instance, &s); + if (r < 0) { + if (r != -EINVAL) + return r; + + } else if (streq(name, s)) + return true; + } + + return false; +} +/* Scans the symlinks in the open directory dir (located at dir_path) for links that refer to info->name. Unless ignore_destination is set the link target is read and its basename compared; otherwise only the link name (or its template name) is compared. Returns 1 on the first hit (with match_name only if is_symlink_with_known_name() accepts the link name), a negative errno on failure, otherwise 0 or the first readlink error that was skipped over. A link at config_path/<info->name> whose target basename also equals info->name is not a hit but is reported through *same_name_link. NOTE(review): root_dir is not referenced in the body. */ +static int find_symlinks_in_directory( + DIR *dir, + const char *dir_path, + const char *root_dir, + const InstallInfo *info, + bool ignore_destination, + bool match_name, + bool ignore_same_name, + const char *config_path, + bool *same_name_link) { + + int r = 0; + + FOREACH_DIRENT(de, dir, return -errno) { + bool found_path = false, found_dest = false, b = false; + int q; + + if (de->d_type != DT_LNK) + continue; + + if (!ignore_destination) { + _cleanup_free_ char *dest = NULL; + + /* Acquire symlink destination */ + q = readlinkat_malloc(dirfd(dir), de->d_name, &dest); + if (q == -ENOENT) + continue; + if (q < 0) { + if (r == 0) + r = q; + continue; + } + + /* Make absolute */ + if (!path_is_absolute(dest)) { + char *x; + + x = path_join(dir_path, dest); + if (!x) + return -ENOMEM; + + free_and_replace(dest, x); + } + + /* Check if what the symlink points to matches what we are looking for */ + found_dest = streq(basename(dest), info->name); + } + + assert(unit_name_is_valid(info->name, UNIT_NAME_ANY)); + + /* Check if the symlink itself matches what we 
are looking for. + * + * If ignore_destination is specified, we only look at the source name. + * + * If ignore_same_name is specified, we are in one of the directories which + * have lower priority than the unit file, and even if a file or symlink with + * this name was found, we should ignore it. */ + + if (ignore_destination || !ignore_same_name) + found_path = streq(de->d_name, info->name); + + if (!found_path && ignore_destination) { + _cleanup_free_ char *template = NULL; + + q = unit_name_template(de->d_name, &template); + if (q < 0 && q != -EINVAL) + return q; + if (q >= 0) + found_dest = streq(template, info->name); + } + + if (found_path && found_dest) { + _cleanup_free_ char *p = NULL, *t = NULL; + + /* Filter out same name links in the main config path */ + p = path_make_absolute(de->d_name, dir_path); + t = path_make_absolute(info->name, config_path); + + if (!p || !t) + return -ENOMEM; + + b = path_equal(p, t); + } + /* The entry of the unit itself in config_path only sets *same_name_link; every other match is a candidate hit. */ + if (b) + *same_name_link = true; + else if (found_path || found_dest) { + if (!match_name) + return 1; + + /* Check if symlink name is in the set of names used by [Install] */ + q = is_symlink_with_known_name(info, de->d_name); + if (q < 0) + return q; + if (q > 0) + return 1; + } + } + + return r; +} +/* Looks in config_path for symlinks that refer to unit i: first in each .wants, .requires and .upholds subdirectory, comparing link names only, then in config_path itself, comparing link targets as well. Returns 1 if a link was found, 0 if not (also if config_path cannot be opened due to ENOENT, ENOTDIR or EACCES), negative errno otherwise. Failures inside a subdirectory are logged and skipped. */ +static int find_symlinks( + const char *root_dir, + const InstallInfo *i, + bool match_name, + bool ignore_same_name, + const char *config_path, + bool *same_name_link) { + + _cleanup_closedir_ DIR *config_dir = NULL; + int r = 0; + + assert(i); + assert(config_path); + assert(same_name_link); + + config_dir = opendir(config_path); + if (!config_dir) { + if (IN_SET(errno, ENOENT, ENOTDIR, EACCES)) + return 0; + return -errno; + } + + FOREACH_DIRENT(de, config_dir, return -errno) { + const char *suffix; + _cleanup_free_ const char *path = NULL; + _cleanup_closedir_ DIR *d = NULL; + + if (de->d_type != DT_DIR) + continue; + + suffix = strrchr(de->d_name, '.'); + if (!STRPTR_IN_SET(suffix, ".wants", ".requires", ".upholds")) + continue; + 
+ path = path_join(config_path, de->d_name); + if (!path) + return -ENOMEM; + + d = opendir(path); + if (!d) { + log_error_errno(errno, "Failed to open directory \"%s\" while scanning for symlinks, ignoring: %m", path); + continue; + } + + r = find_symlinks_in_directory(d, path, root_dir, i, + /* ignore_destination= */ true, + /* match_name= */ match_name, + /* ignore_same_name= */ ignore_same_name, + config_path, + same_name_link); + if (r > 0) + return 1; + else if (r < 0) + log_debug_errno(r, "Failed to look up symlinks in \"%s\": %m", path); + } + + /* We didn't find any suitable symlinks in .wants, .requires or .upholds directories, + * let's look for linked unit files in this directory. */ + rewinddir(config_dir); + return find_symlinks_in_directory(config_dir, config_path, root_dir, i, + /* ignore_destination= */ false, + /* match_name= */ match_name, + /* ignore_same_name= */ ignore_same_name, + config_path, + same_name_link); +} +/* Walks lp->search_path in order and derives from the symlinks found how the unit is enabled. On a hit *state is set to UNIT_FILE_ENABLED, UNIT_FILE_ENABLED_RUNTIME, UNIT_FILE_STATIC, UNIT_FILE_LINKED or UNIT_FILE_LINKED_RUNTIME and 1 is returned. Returns 0 with *state untouched if nothing was found, negative errno on failure. */ +static int find_symlinks_in_scope( + RuntimeScope scope, + const LookupPaths *lp, + const InstallInfo *info, + bool match_name, + UnitFileState *state) { + + bool same_name_link_runtime = false, same_name_link_config = false; + bool enabled_in_runtime = false, enabled_at_all = false; + bool ignore_same_name = false; + int r; + + assert(lp); + assert(info); + + /* As we iterate over the list of search paths in lp->search_path, we may encounter "same name" + * symlinks. The ones which are "below" (i.e. have lower priority) than the unit file itself are + * effectively masked, so we should ignore them. */ + + STRV_FOREACH(p, lp->search_path) { + bool same_name_link = false; + + r = find_symlinks(lp->root_dir, info, match_name, ignore_same_name, *p, &same_name_link); + if (r < 0) + return r; + if (r > 0) { + /* We found symlinks in this dir? Yay! Let's see where precisely it is enabled. */ + + if (path_equal_ptr(*p, lp->persistent_config)) { + /* This is the best outcome, let's return it immediately. 
*/ + *state = UNIT_FILE_ENABLED; + return 1; + } + + /* look for global enablement of user units */ + if (scope == RUNTIME_SCOPE_USER && path_is_user_config_dir(*p)) { + *state = UNIT_FILE_ENABLED; + return 1; + } + + r = path_is_runtime(lp, *p, false); + if (r < 0) + return r; + if (r > 0) + enabled_in_runtime = true; + else + enabled_at_all = true; + + } else if (same_name_link) { + if (path_equal_ptr(*p, lp->persistent_config)) + same_name_link_config = true; + else { + r = path_is_runtime(lp, *p, false); + if (r < 0) + return r; + if (r > 0) + same_name_link_runtime = true; + } + } + + /* Check if next iteration will be "below" the unit file (either a regular file + * or a symlink), and hence should be ignored */ + if (!ignore_same_name && path_startswith(info->path, *p)) + ignore_same_name = true; + } + + if (enabled_in_runtime) { + *state = UNIT_FILE_ENABLED_RUNTIME; + return 1; + } + + /* Here's a special rule: if the unit we are looking for is an instance, and it is symlinked in the search path + * outside of runtime and configuration directory, then we consider it statically enabled. Note we do that only + * for instances, not for regular names, as those are merely aliases, while instances explicitly instantiate + * something, and hence are a much stronger concept. */ + if (enabled_at_all && unit_name_is_valid(info->name, UNIT_NAME_INSTANCE)) { + *state = UNIT_FILE_STATIC; + return 1; + } + + /* Hmm, we didn't find it, but maybe we found the same name + * link? 
*/ + if (same_name_link_config) { + *state = UNIT_FILE_LINKED; + return 1; + } + if (same_name_link_runtime) { + *state = UNIT_FILE_LINKED_RUNTIME; + return 1; + } + + return 0; +} +/* Releases every string and string list owned by *i and stores the return value of mfree()/strv_free() back into the field. The InstallInfo object itself is not freed. A NULL i is a no-op. */ +static void install_info_clear(InstallInfo *i) { + if (!i) + return; + + i->name = mfree(i->name); + i->path = mfree(i->path); + i->root = mfree(i->root); + i->aliases = strv_free(i->aliases); + i->wanted_by = strv_free(i->wanted_by); + i->required_by = strv_free(i->required_by); + i->upheld_by = strv_free(i->upheld_by); + i->also = strv_free(i->also); + i->default_instance = mfree(i->default_instance); + i->symlink_target = mfree(i->symlink_target); +} +/* Clears the contents of i and then frees the object itself; returns whatever mfree() returns so that callers can assign it back. */ +static InstallInfo* install_info_free(InstallInfo *i) { + install_info_clear(i); + return mfree(i); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(InstallInfo*, install_info_free); +/* Releases both hashmaps of ctx, passing install_info_free() as the destructor for the stored entries. */ +static void install_context_done(InstallContext *ctx) { + assert(ctx); + + ctx->will_process = ordered_hashmap_free_with_destructor(ctx->will_process, install_info_free); + ctx->have_processed = ordered_hashmap_free_with_destructor(ctx->have_processed, install_info_free); +} +/* Looks up name among the already processed units first and among the pending ones second; returns the result of the second lookup if the first one finds nothing. */ +static InstallInfo *install_info_find(InstallContext *ctx, const char *name) { + InstallInfo *i; + + i = ordered_hashmap_get(ctx->have_processed, name); + if (i) + return i; + + return ordered_hashmap_get(ctx->will_process, name); +} + +static int install_info_may_process( + const InstallInfo *i, + const LookupPaths *lp, + InstallChange **changes, + size_t *n_changes) { + assert(i); + assert(lp); + + /* Checks whether the loaded unit file is one we should process, or is masked, + * transient or generated and thus not subject to enable/disable operations. 
*/ + /* Each rejection is recorded through install_changes_add(), whose return value is passed on: -ERFKILL is used for masked units, -EADDRNOTAVAIL for generated or transient ones. */ + if (i->install_mode == INSTALL_MODE_MASKED) + return install_changes_add(changes, n_changes, -ERFKILL, i->path, NULL); + if (path_is_generator(lp, i->path) || + path_is_transient(lp, i->path)) + return install_changes_add(changes, n_changes, -EADDRNOTAVAIL, i->path, NULL); + + return 0; +} + +/** + * Adds a new InstallInfo entry under name in the InstallContext.will_process + * hashmap, or retrieves the existing one if already present. + * + * If name is NULL it is taken from the basename of path. An entry that already exists stays + * auxiliary only if this request is auxiliary as well. If ret is non-NULL it receives the entry. + * + * Returns negative on error (-EINVAL for an invalid unit name), 0 if the unit was already known, + * 1 otherwise. + */ +static int install_info_add( + InstallContext *ctx, + const char *name, + const char *path, + const char *root, + bool auxiliary, + InstallInfo **ret) { + + int r; + + assert(ctx); + + if (!name) { + /* 'name' and 'path' must not both be null. Check here 'path' using assert_se() to + * work around a bug in gcc that generates a -Wnonnull warning when calling basename(), + * but this cannot be possible in any code path (See #6119). */ + assert_se(path); + name = basename(path); + } + + if (!unit_name_is_valid(name, UNIT_NAME_ANY)) + return -EINVAL; + + InstallInfo *i = install_info_find(ctx, name); + if (i) { + i->auxiliary = i->auxiliary && auxiliary; + + if (ret) + *ret = i; + return 0; + } + + _cleanup_(install_info_freep) InstallInfo *alloc = new(InstallInfo, 1); + if (!alloc) + return -ENOMEM; + /* Initialize all fields right away: the cleanup handler walks them on every error return below. */ + *alloc = (InstallInfo) { + .install_mode = _INSTALL_MODE_INVALID, + .auxiliary = auxiliary, + }; + + alloc->name = strdup(name); + if (!alloc->name) + return -ENOMEM; + + if (root) { + alloc->root = strdup(root); + if (!alloc->root) + return -ENOMEM; + } + + if (path) { + alloc->path = strdup(path); + if (!alloc->path) + return -ENOMEM; + } + + r = ordered_hashmap_ensure_put(&ctx->will_process, &string_hash_ops, alloc->name, alloc); + if (r < 0) + return r; + i = TAKE_PTR(alloc); + + if (ret) + *ret = i; + return 1; +} + +static int config_parse_alias( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned 
section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + UnitType type; + + assert(unit); + assert(filename); + assert(lvalue); + assert(rvalue); + + type = unit_name_to_type(unit); + if (!unit_type_may_alias(type)) + return log_syntax(unit, LOG_WARNING, filename, line, 0, + "Alias= is not allowed for %s units, ignoring.", + unit_type_to_string(type)); + + return config_parse_strv(unit, filename, line, section, section_line, + lvalue, ltype, rvalue, data, userdata); +} +/* Config parser callback for Also=. data is the InstallContext, userdata the InstallInfo being loaded. Every word of rvalue is expanded with install_name_printf(), registered in ctx as an auxiliary unit and appended to info->also. Returns 0 on success, the log_syntax() result if a name cannot be expanded, negative errno otherwise. */ +static int config_parse_also( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + InstallInfo *info = ASSERT_PTR(userdata); + InstallContext *ctx = ASSERT_PTR(data); + int r; + + assert(unit); + assert(filename); + assert(lvalue); + assert(rvalue); + + for (;;) { + _cleanup_free_ char *word = NULL, *printed = NULL; + + r = extract_first_word(&rvalue, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + r = install_name_printf(ctx->scope, info, word, &printed); + if (r < 0) + return log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve unit name in Also=\"%s\": %m", word); + + r = install_info_add(ctx, printed, NULL, info->root, /* auxiliary= */ true, NULL); + if (r < 0) + return r; + + r = strv_push(&info->also, printed); + if (r < 0) + return r; + /* The string was pushed into info->also; reset the pointer so the cleanup handler does not free it. */ + printed = NULL; + } + + return 0; +} +/* Config parser callback for DefaultInstance=: expands rvalue, validates it as an instance name and stores it in info->default_instance (an empty value resets it). NOTE(review): data is dereferenced as an InstallContext (ctx->scope) and userdata as the InstallInfo, so the table entry that registers this callback has to supply the context as data. */ +static int config_parse_default_instance( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + InstallContext *ctx = ASSERT_PTR(data); + InstallInfo *info = ASSERT_PTR(userdata); + _cleanup_free_ char *printed = NULL; + int r; + + assert(unit); + assert(filename); + assert(lvalue); + assert(rvalue); + + if (unit_name_is_valid(unit, 
UNIT_NAME_INSTANCE)) + /* When enabling an instance, we might be using a template unit file, + * but we should ignore DefaultInstance silently. */ + return 0; + if (!unit_name_is_valid(unit, UNIT_NAME_TEMPLATE)) + return log_syntax(unit, LOG_WARNING, filename, line, 0, + "DefaultInstance= only makes sense for template units, ignoring."); + + r = install_name_printf(ctx->scope, info, rvalue, &printed); + if (r < 0) + return log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to resolve instance name in DefaultInstance=\"%s\": %m", rvalue); + + if (isempty(printed)) + printed = mfree(printed); + + if (printed && !unit_instance_is_valid(printed)) + return log_syntax(unit, LOG_WARNING, filename, line, SYNTHETIC_ERRNO(EINVAL), + "Invalid DefaultInstance= value \"%s\".", printed); + + return free_and_replace(info->default_instance, printed); +} +/* Loads or merely classifies the unit file (or, with SEARCH_DROPIN, the drop-in) at path on behalf of info. Without SEARCH_LOAD the main unit file is only lstat()ed: install_mode becomes MASKED or REGULAR, and -ELOOP, -EISDIR or -ENOTTY is returned for symlinks, directories and other file types; drop-ins are skipped with 0. With SEARCH_LOAD the [Install] section is parsed into info and the number of Alias=, WantedBy=, RequiredBy= and UpheldBy= entries collected so far is returned. ctx may be NULL unless the file is actually parsed. Returns negative errno on failure. */ +static int unit_file_load( + InstallContext *ctx, + InstallInfo *info, + const char *path, + const char *root_dir, + SearchFlags flags) { + /* The last column is handed to the parser callback as data, while info reaches every callback as userdata (see the config_parse() call). config_parse_default_instance() and config_parse_also() both dereference data as the InstallContext, hence both entries must carry ctx. */ + const ConfigTableItem items[] = { + { "Install", "Alias", config_parse_alias, 0, &info->aliases }, + { "Install", "WantedBy", config_parse_strv, 0, &info->wanted_by }, + { "Install", "RequiredBy", config_parse_strv, 0, &info->required_by }, + { "Install", "UpheldBy", config_parse_strv, 0, &info->upheld_by }, + { "Install", "DefaultInstance", config_parse_default_instance, 0, ctx }, + { "Install", "Also", config_parse_also, 0, ctx }, + {} + }; + + UnitType type; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + assert(info); + assert(path); + + if (!(flags & SEARCH_DROPIN)) { + /* Loading or checking for the main unit file… */ + + type = unit_name_to_type(info->name); + if (type < 0) + return -EINVAL; + if (unit_name_is_valid(info->name, UNIT_NAME_TEMPLATE|UNIT_NAME_INSTANCE) && !unit_type_may_template(type)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: unit type %s cannot be templated, ignoring.", path, 
unit_type_to_string(type)); + /* Without SEARCH_LOAD the file is only classified by its lstat() type, never opened or parsed. -ELOOP tells unit_file_load_or_readlink() that the path is a symlink it has to resolve. */ + if (!(flags & SEARCH_LOAD)) { + if (lstat(path, &st) < 0) + return -errno; + + if (null_or_empty(&st)) + info->install_mode = INSTALL_MODE_MASKED; + else if (S_ISREG(st.st_mode)) + info->install_mode = INSTALL_MODE_REGULAR; + else if (S_ISLNK(st.st_mode)) + return -ELOOP; + else if (S_ISDIR(st.st_mode)) + return -EISDIR; + else + return -ENOTTY; + + return 0; + } + /* O_NOFOLLOW makes open() fail with ELOOP on a symlink, which yields the same -ELOOP as the lstat() path above. */ + fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd < 0) + return -errno; + } else { + /* Operating on a drop-in file. If we aren't supposed to load the unit file drop-ins don't matter, let's hence shortcut this. */ + + if (!(flags & SEARCH_LOAD)) + return 0; + + fd = chase_and_open(path, root_dir, 0, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL); + if (fd < 0) + return fd; + } + + if (fstat(fd, &st) < 0) + return -errno; + /* A file accepted by null_or_empty() marks the main unit as masked; for a drop-in it is simply ignored. */ + if (null_or_empty(&st)) { + if ((flags & SEARCH_DROPIN) == 0) + info->install_mode = INSTALL_MODE_MASKED; + + return 0; + } + + r = stat_verify_regular(&st); + if (r < 0) + return r; + + f = take_fdopen(&fd, "r"); + if (!f) + return -errno; + + /* ctx is only needed if we actually load the file (it's referenced from items[] btw, in case you wonder.) 
*/ + assert(ctx); + /* Only the Install section is looked up in items[]; info is handed to the callbacks as userdata. */ + r = config_parse(info->name, path, f, + "Install\0" + "-Unit\0" + "-Automount\0" + "-Device\0" + "-Mount\0" + "-Path\0" + "-Scope\0" + "-Service\0" + "-Slice\0" + "-Socket\0" + "-Swap\0" + "-Target\0" + "-Timer\0", + config_item_table_lookup, items, + 0, info, + NULL); + if (r < 0) + return log_debug_errno(r, "Failed to parse \"%s\": %m", info->name); + + if ((flags & SEARCH_DROPIN) == 0) + info->install_mode = INSTALL_MODE_REGULAR; + + return + (int) strv_length(info->aliases) + + (int) strv_length(info->wanted_by) + + (int) strv_length(info->required_by) + + (int) strv_length(info->upheld_by); +} +/* Wraps unit_file_load(): if the main unit file turns out to be a symlink (-ELOOP), the link is resolved into info->symlink_target and install_mode is set to MASKED (null_or_empty_path_with_root() reports the target as empty), LINKED (unit_file_resolve_symlink() returned > 0, see outside_search_path) or ALIAS, and 0 is returned. In all other cases, and always for drop-ins, the result of unit_file_load() is passed through. */ +static int unit_file_load_or_readlink( + InstallContext *ctx, + InstallInfo *info, + const char *path, + const LookupPaths *lp, + SearchFlags flags) { + int r; + + r = unit_file_load(ctx, info, path, lp->root_dir, flags); + if (r != -ELOOP || (flags & SEARCH_DROPIN)) + return r; + + /* This is a symlink, let's read and verify it. */ + r = unit_file_resolve_symlink(lp->root_dir, lp->search_path, + NULL, AT_FDCWD, path, + true, &info->symlink_target); + if (r < 0) + return r; + bool outside_search_path = r > 0; + + r = null_or_empty_path_with_root(info->symlink_target, lp->root_dir); + if (r < 0 && r != -ENOENT) + return log_debug_errno(r, "Failed to stat %s: %m", info->symlink_target); + if (r > 0) + info->install_mode = INSTALL_MODE_MASKED; + else if (outside_search_path) + info->install_mode = INSTALL_MODE_LINKED; + else + info->install_mode = INSTALL_MODE_ALIAS; + + return 0; +} +/* Locates and loads the unit file for info. If info->path is set only that file is used; otherwise lp->search_path is scanned for info->name and, for instances, for the template name as well. Afterwards the .conf files from the matching .d directories are loaded, unless the unit is masked. Returns the result of loading the main unit file, 0 if the unit was already loaded, -ENOENT if it was not found, or another negative errno. */ +static int unit_file_search( + InstallContext *ctx, + InstallInfo *info, + const LookupPaths *lp, + SearchFlags flags) { + + const char *dropin_dir_name = NULL, *dropin_template_dir_name = NULL; + _cleanup_strv_free_ char **dirs = NULL, **files = NULL; + _cleanup_free_ char *template = NULL; + bool found_unit = false; + int r, result; + + assert(info); + assert(lp); + + /* Was this unit already loaded? 
*/ + if (info->install_mode != _INSTALL_MODE_INVALID) + return 0; + /* An explicit path bypasses the search path lookup (and the drop-in handling further down). */ + if (info->path) + return unit_file_load_or_readlink(ctx, info, info->path, lp, flags); + + assert(info->name); + + if (unit_name_is_valid(info->name, UNIT_NAME_INSTANCE)) { + r = unit_name_template(info->name, &template); + if (r < 0) + return r; + } + /* -ENOENT, -ENOTDIR and -EACCES only mean that this directory does not provide the unit; any other error aborts the search. */ + STRV_FOREACH(p, lp->search_path) { + _cleanup_free_ char *path = NULL; + + path = path_join(*p, info->name); + if (!path) + return -ENOMEM; + + r = unit_file_load_or_readlink(ctx, info, path, lp, flags); + if (r >= 0) { + info->path = TAKE_PTR(path); + result = r; + found_unit = true; + break; + } else if (!IN_SET(r, -ENOENT, -ENOTDIR, -EACCES)) + return r; + } + + if (!found_unit && template) { + + /* Unit file doesn't exist, however instance + * enablement was requested. We will check if it is + * possible to load template unit file. */ + + STRV_FOREACH(p, lp->search_path) { + _cleanup_free_ char *path = NULL; + + path = path_join(*p, template); + if (!path) + return -ENOMEM; + + r = unit_file_load_or_readlink(ctx, info, path, lp, flags); + if (r >= 0) { + info->path = TAKE_PTR(path); + result = r; + found_unit = true; + break; + } else if (!IN_SET(r, -ENOENT, -ENOTDIR, -EACCES)) + return r; + } + } + + if (!found_unit) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "Cannot find unit %s%s%s.", + info->name, template ? 
" or " : "", strempty(template)); + /* result is always set at this point: found_unit is only true after one of the loops above assigned it. */ + if (info->install_mode == INSTALL_MODE_MASKED) + return result; + + /* Search for drop-in directories */ + + dropin_dir_name = strjoina(info->name, ".d"); + STRV_FOREACH(p, lp->search_path) { + char *path; + + path = path_join(*p, dropin_dir_name); + if (!path) + return -ENOMEM; + + r = strv_consume(&dirs, path); + if (r < 0) + return r; + } + + if (template) { + dropin_template_dir_name = strjoina(template, ".d"); + STRV_FOREACH(p, lp->search_path) { + char *path; + + path = path_join(*p, dropin_template_dir_name); + if (!path) + return -ENOMEM; + + r = strv_consume(&dirs, path); + if (r < 0) + return r; + } + } + + /* Load drop-in conf files */ + + r = conf_files_list_strv(&files, ".conf", NULL, 0, (const char**) dirs); + if (r < 0) + return log_debug_errno(r, "Failed to get list of conf files: %m"); + + STRV_FOREACH(p, files) { + r = unit_file_load_or_readlink(ctx, info, *p, lp, flags | SEARCH_DROPIN); + if (r < 0) + return log_debug_errno(r, "Failed to load conf file \"%s\": %m", *p); + } + + return result; +} +/* Follows the symlink recorded in info->symlink_target: the target replaces info->path and is loaded in place of the link. Returns -EINVAL if info is not in ALIAS or LINKED mode or has no symlink target, -EXDEV if the basename of the target differs from info->name and ignore_different_name is false, otherwise the result of unit_file_load_or_readlink(). */ +static int install_info_follow( + InstallContext *ctx, + InstallInfo *info, + const LookupPaths *lp, + SearchFlags flags, + bool ignore_different_name) { + + assert(ctx); + assert(info); + + if (!IN_SET(info->install_mode, INSTALL_MODE_ALIAS, INSTALL_MODE_LINKED)) + return -EINVAL; + if (!info->symlink_target) + return -EINVAL; + + /* If the basename doesn't match, the caller should add a complete new entry for this. */ + + if (!ignore_different_name && !streq(basename(info->symlink_target), info->name)) + return -EXDEV; + + free_and_replace(info->path, info->symlink_target); + info->install_mode = _INSTALL_MODE_INVALID; + + return unit_file_load_or_readlink(ctx, info, info->path, lp, flags); +} + +/** + * Search for the unit file. If the unit name is a symlink, follow the symlink to the + * target, maybe more than once. Propagate the instance name if present. 
+ */ +static int install_info_traverse( + InstallContext *ctx, + const LookupPaths *lp, + InstallInfo *start, + SearchFlags flags, + InstallInfo **ret) { + + InstallInfo *i; + unsigned k = 0; + int r; + + assert(lp); + assert(start); + assert(ctx); + /* Returns 0 and stores the InstallInfo finally reached in *ret (if non-NULL). Returns -ELOOP if more than UNIT_FILE_FOLLOW_SYMLINK_MAX links would be followed, or if path_is_config() reports a link while SEARCH_FOLLOW_CONFIG_SYMLINKS is not set; -ENOLINK if the unit an alias points to cannot be found; other negative errno on failure. */ + r = unit_file_search(ctx, start, lp, flags); + if (r < 0) + return r; + + i = start; + while (IN_SET(i->install_mode, INSTALL_MODE_ALIAS, INSTALL_MODE_LINKED)) { + /* Follow the symlink */ + + if (++k > UNIT_FILE_FOLLOW_SYMLINK_MAX) + return -ELOOP; + + if (!(flags & SEARCH_FOLLOW_CONFIG_SYMLINKS)) { + r = path_is_config(lp, i->path, true); + if (r < 0) + return r; + if (r > 0) + return -ELOOP; + } + + r = install_info_follow(ctx, i, lp, flags, + /* If linked, don't look at the target name */ + /* ignore_different_name= */ i->install_mode == INSTALL_MODE_LINKED); + if (r == -EXDEV && i->symlink_target) { + _cleanup_free_ char *buffer = NULL; + const char *bn; + + /* Target is an alias, create a new install info object and continue with that. */ + + bn = basename(i->symlink_target); + /* An instance that points to a template: carry the instance part over into the name of the target. */ + if (unit_name_is_valid(i->name, UNIT_NAME_INSTANCE) && + unit_name_is_valid(bn, UNIT_NAME_TEMPLATE)) { + + _cleanup_free_ char *instance = NULL; + + r = unit_name_to_instance(i->name, &instance); + if (r < 0) + return r; + + r = unit_name_replace_instance(bn, instance, &buffer); + if (r < 0) + return r; + + if (streq(buffer, i->name)) { + + /* We filled in the instance, and the target stayed the same? If so, + * then let's honour the link as it is. */ + + r = install_info_follow(ctx, i, lp, flags, true); + if (r < 0) + return r; + + continue; + } + + bn = buffer; + } + /* From here on i refers to the entry of the target unit, which is a regular (non-auxiliary) entry in ctx. */ + r = install_info_add(ctx, bn, NULL, lp->root_dir, /* auxiliary= */ false, &i); + if (r < 0) + return r; + + /* Try again, with the new target we found. 
*/ + r = unit_file_search(ctx, i, lp, flags); + if (r == -ENOENT) + /* Translate error code to highlight this specific case */ + return -ENOLINK; + } + + if (r < 0) + return r; + } + + if (ret) + *ret = i; + + return 0; +} + +/** + * Call install_info_add() with name_or_path as the path (if name_or_path starts with "/") + * or the name (otherwise). root_dir is prepended to the path. + */ +static int install_info_add_auto( + InstallContext *ctx, + const LookupPaths *lp, + const char *name_or_path, + InstallInfo **ret) { + + assert(ctx); + assert(name_or_path); + + if (path_is_absolute(name_or_path)) { + const char *pp; + + pp = prefix_roota(lp->root_dir, name_or_path); + + return install_info_add(ctx, NULL, pp, lp->root_dir, /* auxiliary= */ false, ret); + } else + return install_info_add(ctx, name_or_path, NULL, lp->root_dir, /* auxiliary= */ false, ret); +} + +static int install_info_discover( + InstallContext *ctx, + const LookupPaths *lp, + const char *name_or_path, + SearchFlags flags, + InstallInfo **ret, + InstallChange **changes, + size_t *n_changes) { + + InstallInfo *info; + int r; + + assert(ctx); + assert(lp); + assert(name_or_path); + + r = install_info_add_auto(ctx, lp, name_or_path, &info); + if (r >= 0) + r = install_info_traverse(ctx, lp, info, flags, ret); + + if (r < 0) + install_changes_add(changes, n_changes, r, name_or_path, NULL); + return r; +} + +static int install_info_discover_and_check( + InstallContext *ctx, + const LookupPaths *lp, + const char *name_or_path, + SearchFlags flags, + InstallInfo **ret, + InstallChange **changes, + size_t *n_changes) { + + int r; + + r = install_info_discover(ctx, lp, name_or_path, flags, ret, changes, n_changes); + if (r < 0) + return r; + + return install_info_may_process(ret ? 
*ret : NULL, lp, changes, n_changes); +} + +int unit_file_verify_alias( + const InstallInfo *info, + const char *dst, + char **ret_dst, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_free_ char *dst_updated = NULL; + int r; + + /* Verify that dst is a valid either a valid alias or a valid .wants/.requires symlink for the target + * unit *i. Return negative on error or if not compatible, zero on success. + * + * ret_dst is set in cases where "instance propagation" happens, i.e. when the instance part is + * inserted into dst. It is not normally set, even on success, so that the caller can easily + * distinguish the case where instance propagation occurred. + * + * Returns: + * -EXDEV when the alias doesn't match the unit, + * -EUCLEAN when the name is invalid, + * -ELOOP when the alias it to the unit itself. + */ + + const char *path_alias = strrchr(dst, '/'); + if (path_alias) { + /* This branch covers legacy Alias= function of creating .wants and .requires symlinks. */ + _cleanup_free_ char *dir = NULL; + char *p; + + path_alias ++; /* skip over slash */ + + r = path_extract_directory(dst, &dir); + if (r < 0) + return log_error_errno(r, "Failed to extract parent directory from '%s': %m", dst); + + p = endswith(dir, ".wants"); + if (!p) + p = endswith(dir, ".requires"); + if (!p) { + install_changes_add(changes, n_changes, -EXDEV, dst, NULL); + return log_debug_errno(SYNTHETIC_ERRNO(EXDEV), "Invalid path \"%s\" in alias.", dir); + } + + *p = '\0'; /* dir should now be a unit name */ + + UnitNameFlags type = unit_name_classify(dir); + if (type < 0) { + install_changes_add(changes, n_changes, -EXDEV, dst, NULL); + return log_debug_errno(SYNTHETIC_ERRNO(EXDEV), + "Invalid unit name component \"%s\" in alias.", dir); + } + + const bool instance_propagation = type == UNIT_NAME_TEMPLATE; + + /* That's the name we want to use for verification. 
*/ + r = unit_symlink_name_compatible(path_alias, info->name, instance_propagation); + if (r < 0) + return log_error_errno(r, "Failed to verify alias validity: %m"); + if (r == 0) { + install_changes_add(changes, n_changes, -EXDEV, dst, info->name); + return log_debug_errno(SYNTHETIC_ERRNO(EXDEV), + "Invalid unit \"%s\" symlink \"%s\".", + info->name, dst); + } + + } else { + /* dst has no directory part, so it is handled as a plain alias name. + * + * If the symlink target has an instance set and the symlink source doesn't, we "propagate + * the instance", i.e. instantiate the symlink source with the target instance. */ + if (unit_name_is_valid(dst, UNIT_NAME_TEMPLATE)) { + _cleanup_free_ char *inst = NULL; + + UnitNameFlags type = unit_name_to_instance(info->name, &inst); + if (type < 0) { + install_changes_add(changes, n_changes, -EUCLEAN, info->name, NULL); + return log_debug_errno(type, "Failed to extract instance name from \"%s\": %m", info->name); + } + + if (type == UNIT_NAME_INSTANCE) { + r = unit_name_replace_instance(dst, inst, &dst_updated); + if (r < 0) + return log_error_errno(r, "Failed to build unit name from %s+%s: %m", + dst, inst); + } + } + /* Validate the alias, using the instantiated name if propagation happened above. -EINVAL from the validation is recorded as -EXDEV. */ + r = unit_validate_alias_symlink_or_warn(LOG_DEBUG, dst_updated ?: dst, info->name); + if (r == -ELOOP) /* -ELOOP means self-alias, which we (quietly) ignore */ + return r; + if (r < 0) + return install_changes_add(changes, n_changes, + r == -EINVAL ? 
-EXDEV : r, + dst_updated ?: dst, + info->name); + } + /* dst_updated is NULL unless instance propagation took place. */ + *ret_dst = TAKE_PTR(dst_updated); + return 0; +} +/* Creates a symlink below config_path for every entry of info->aliases, pointing to info->path. Aliases that cannot be expanded or verified are skipped (self-aliases without recording an error). A link for which chase() returns 0 is replaced even without force. Returns the first error encountered, otherwise the result of the last create_symlink() call (0 if nothing was linked). */ +static int install_info_symlink_alias( + RuntimeScope scope, + InstallInfo *info, + const LookupPaths *lp, + const char *config_path, + bool force, + InstallChange **changes, + size_t *n_changes) { + + int r = 0, q; + + assert(info); + assert(lp); + assert(config_path); + + STRV_FOREACH(s, info->aliases) { + _cleanup_free_ char *alias_path = NULL, *dst = NULL, *dst_updated = NULL; + bool broken; + + q = install_name_printf(scope, info, *s, &dst); + if (q < 0) { + install_changes_add(changes, n_changes, q, *s, NULL); + r = r < 0 ? r : q; + continue; + } + + q = unit_file_verify_alias(info, dst, &dst_updated, changes, n_changes); + if (q == -ELOOP) + continue; + if (q < 0) { + r = r < 0 ? r : q; + continue; + } + + alias_path = path_make_absolute(dst_updated ?: dst, config_path); + if (!alias_path) + return -ENOMEM; + + q = chase(alias_path, lp->root_dir, CHASE_NONEXISTENT, NULL, NULL); + if (q < 0 && q != -ENOENT) { + r = r < 0 ? r : q; + continue; + } + broken = q == 0; /* symlink target does not exist? */ + + q = create_symlink(lp, info->path, alias_path, force || broken, changes, n_changes); + r = r < 0 ? r : q; + } + + return r; +} +/* Creates, for every unit name in list, the symlink config_path/<name><suffix><n> pointing to info->path, where <n> is the name of the unit or, for a template with a default instance, the instantiated name. Names that cannot be expanded or that do not fit the template status of the unit are recorded as errors and skipped. Returns 0 if list is empty; otherwise the first error if any, else the first non-zero create_symlink() result, else 0. */ +static int install_info_symlink_wants( + RuntimeScope scope, + UnitFileFlags file_flags, + InstallInfo *info, + const LookupPaths *lp, + const char *config_path, + char **list, + const char *suffix, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(install_info_clear) InstallInfo instance = { + .install_mode = _INSTALL_MODE_INVALID, + }; + + UnitNameFlags valid_dst_type = UNIT_NAME_ANY; + const char *n; + int r = 0, q; + + assert(info); + assert(lp); + assert(config_path); + + if (strv_isempty(list)) + return 0; + + if (unit_name_is_valid(info->name, UNIT_NAME_PLAIN | UNIT_NAME_INSTANCE)) + /* Not a template unit. Use the name directly. 
*/ + n = info->name; + + else if (info->default_instance) { + /* If this is a template, and we have a default instance, use it. */ + + r = unit_name_replace_instance(info->name, info->default_instance, &instance.name); + if (r < 0) + return r; + /* Passing a NULL context is fine here: without SEARCH_LOAD the file is only classified, never parsed. */ + r = unit_file_search(NULL, &instance, lp, SEARCH_FOLLOW_CONFIG_SYMLINKS); + if (r < 0) + return r; + + if (instance.install_mode == INSTALL_MODE_MASKED) + return install_changes_add(changes, n_changes, -ERFKILL, instance.path, NULL); + + n = instance.name; + + } else { + /* We have a template, but no instance yet. When used with an instantiated unit, we will get + * the instance from that unit. Cannot be used with non-instance units. */ + + valid_dst_type = UNIT_NAME_INSTANCE | UNIT_NAME_TEMPLATE; + n = info->name; + } + + r = 0; + STRV_FOREACH(s, list) { + _cleanup_free_ char *path = NULL, *dst = NULL; + + q = install_name_printf(scope, info, *s, &dst); + if (q < 0) { + install_changes_add(changes, n_changes, q, *s, NULL); + if (r >= 0) + r = q; + + continue; + } + + if (!unit_name_is_valid(dst, valid_dst_type)) { + /* Generate a proper error here: EUCLEAN if the name is generally bad, EIDRM if the + * template status doesn't match. If we are doing presets don't bother reporting the + * error. This also covers cases like 'systemctl preset serial-getty@.service', which + * has no DefaultInstance, so there is nothing we can do. At the same time, + * 'systemctl enable serial-getty@.service' should fail, the user should specify an + * instance like in 'systemctl enable serial-getty@ttyS0.service'. 
+ */ + if (file_flags & UNIT_FILE_IGNORE_AUXILIARY_FAILURE) + continue; + + if (unit_name_is_valid(dst, UNIT_NAME_ANY)) + q = install_changes_add(changes, n_changes, -EIDRM, dst, n); + else + q = install_changes_add(changes, n_changes, -EUCLEAN, dst, NULL); + if (r >= 0) + r = q; + + continue; + } + + path = strjoin(config_path, "/", dst, suffix, n); + if (!path) + return -ENOMEM; + /* These links are always created with force enabled. */ + q = create_symlink(lp, info->path, path, true, changes, n_changes); + if ((q < 0 && r >= 0) || r == 0) + r = q; + /* The link is created even if the destination unit does not exist; that case is only reported. */ + if (unit_file_exists(scope, lp, dst) == 0) { + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_DESTINATION_NOT_PRESENT, dst, info->path); + if (q < 0) + return q; + } + } + + return r; +} +/* Links the unit file into config_path under info->name. Returns 0 without doing anything if in_search_path() returns > 0 for info->path; otherwise the result of create_symlink() or a negative errno. */ +static int install_info_symlink_link( + InstallInfo *info, + const LookupPaths *lp, + const char *config_path, + bool force, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_free_ char *path = NULL; + int r; + + assert(info); + assert(lp); + assert(config_path); + assert(info->path); + + r = in_search_path(lp, info->path); + if (r < 0) + return r; + if (r > 0) + return 0; + + path = path_join(config_path, info->name); + if (!path) + return -ENOMEM; + + return create_symlink(lp, info->path, path, force, changes, n_changes); +} +/* Creates all symlinks for a regular unit file: the link to the file itself (if needed), then the aliases and the .wants/, .requires/ and .upholds/ links. Any other install mode is a no-op returning 0. If linking the file fails nothing else is attempted; otherwise the first non-zero result of the remaining steps is returned. */ +static int install_info_apply( + RuntimeScope scope, + UnitFileFlags file_flags, + InstallInfo *info, + const LookupPaths *lp, + const char *config_path, + InstallChange **changes, + size_t *n_changes) { + + int r, q; + + assert(info); + assert(lp); + assert(config_path); + + if (info->install_mode != INSTALL_MODE_REGULAR) + return 0; + + bool force = file_flags & UNIT_FILE_FORCE; + + r = install_info_symlink_link(info, lp, config_path, force, changes, n_changes); + /* Do not count links to the unit file towards the "carries_install_info" count */ + if (r < 0) + /* If linking of the file failed, do not try to create other symlinks, + * because they would be pointing to a non-existent or wrong unit. 
*/ + return r; + + r = install_info_symlink_alias(scope, info, lp, config_path, force, changes, n_changes); + + q = install_info_symlink_wants(scope, file_flags, info, lp, config_path, info->wanted_by, ".wants/", changes, n_changes); + if (r == 0) + r = q; + + q = install_info_symlink_wants(scope, file_flags, info, lp, config_path, info->required_by, ".requires/", changes, n_changes); + if (r == 0) + r = q; + + q = install_info_symlink_wants(scope, file_flags, info, lp, config_path, info->upheld_by, ".upholds/", changes, n_changes); + if (r == 0) + r = q; + + return r; +} + +static int install_context_apply( + InstallContext *ctx, + const LookupPaths *lp, + UnitFileFlags file_flags, + const char *config_path, + SearchFlags flags, + InstallChange **changes, + size_t *n_changes) { + + InstallInfo *i; + int r; + + assert(ctx); + assert(lp); + assert(config_path); + + if (ordered_hashmap_isempty(ctx->will_process)) + return 0; + + r = ordered_hashmap_ensure_allocated(&ctx->have_processed, &string_hash_ops); + if (r < 0) + return r; + + r = 0; + while ((i = ordered_hashmap_first(ctx->will_process))) { + int q; + + q = ordered_hashmap_move_one(ctx->have_processed, ctx->will_process, i->name); + if (q < 0) + return q; + + q = install_info_traverse(ctx, lp, i, flags, NULL); + if (q < 0) { + if (i->auxiliary) { + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_AUXILIARY_FAILED, i->name, NULL); + if (q < 0) + return q; + continue; + } + + return install_changes_add(changes, n_changes, q, i->name, NULL); + } + + /* We can attempt to process a masked unit when a different unit + * that we were processing specifies it in Also=. */ + if (i->install_mode == INSTALL_MODE_MASKED) { + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_IS_MASKED, i->path, NULL); + if (q < 0) + return q; + if (r >= 0) + /* Assume that something *could* have been enabled here, + * avoid "empty [Install] section" warning. 
*/ + r += 1; + continue; + } + + if (i->install_mode != INSTALL_MODE_REGULAR) + continue; + + q = install_info_apply(ctx->scope, file_flags, i, lp, config_path, changes, n_changes); + if (r >= 0) { + if (q < 0) + r = q; + else + r += q; + } + } + + return r; +} + +static int install_context_mark_for_removal( + InstallContext *ctx, + const LookupPaths *lp, + Set **remove_symlinks_to, + const char *config_path, + InstallChange **changes, + size_t *n_changes) { + + InstallInfo *i; + int r; + + assert(ctx); + assert(lp); + assert(config_path); + + /* Marks all items for removal */ + + if (ordered_hashmap_isempty(ctx->will_process)) + return 0; + + r = ordered_hashmap_ensure_allocated(&ctx->have_processed, &string_hash_ops); + if (r < 0) + return r; + + while ((i = ordered_hashmap_first(ctx->will_process))) { + + r = ordered_hashmap_move_one(ctx->have_processed, ctx->will_process, i->name); + if (r < 0) + return r; + + r = install_info_traverse(ctx, lp, i, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, NULL); + if (r == -ENOLINK) { + log_debug_errno(r, "Name %s leads to a dangling symlink, removing name.", i->name); + r = install_changes_add(changes, n_changes, INSTALL_CHANGE_IS_DANGLING, i->path ?: i->name, NULL); + if (r < 0) + return r; + } else if (r == -ENOENT) { + if (i->auxiliary) /* some unit specified in Also= or similar is missing */ + log_debug_errno(r, "Auxiliary unit of %s not found, removing name.", i->name); + else { + log_debug_errno(r, "Unit %s not found, removing name.", i->name); + r = install_changes_add(changes, n_changes, r, i->path ?: i->name, NULL); + if (r < 0) + return r; + } + } else if (r < 0) { + log_debug_errno(r, "Failed to find unit %s, removing name: %m", i->name); + install_changes_add(changes, n_changes, r, i->path ?: i->name, NULL); + } else if (i->install_mode == INSTALL_MODE_MASKED) { + log_debug("Unit file %s is masked, ignoring.", i->name); + install_changes_add(changes, n_changes, INSTALL_CHANGE_IS_MASKED, i->path ?: i->name, NULL); 
+ continue; + } else if (i->install_mode != INSTALL_MODE_REGULAR) { + log_debug("Unit %s has install mode %s, ignoring.", + i->name, install_mode_to_string(i->install_mode) ?: "invalid"); + continue; + } + + r = mark_symlink_for_removal(remove_symlinks_to, i->name); + if (r < 0) + return r; + } + + return 0; +} + +int unit_file_mask( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + const char *config_path; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + config_path = (flags & UNIT_FILE_RUNTIME) ? lp.runtime_config : lp.persistent_config; + if (!config_path) + return -ENXIO; + + STRV_FOREACH(name, names) { + _cleanup_free_ char *path = NULL; + int q; + + if (!unit_name_is_valid(*name, UNIT_NAME_ANY)) { + if (r == 0) + r = -EINVAL; + continue; + } + + path = path_make_absolute(*name, config_path); + if (!path) + return -ENOMEM; + + q = create_symlink(&lp, "/dev/null", path, flags & UNIT_FILE_FORCE, changes, n_changes); + if (q < 0 && r >= 0) + r = q; + } + + return r; +} + +int unit_file_unmask( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_set_free_free_ Set *remove_symlinks_to = NULL; + _cleanup_strv_free_ char **todo = NULL; + const char *config_path; + size_t n_todo = 0; + int r, q; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + config_path = (flags & UNIT_FILE_RUNTIME) ? 
lp.runtime_config : lp.persistent_config; + if (!config_path) + return -ENXIO; + + bool dry_run = flags & UNIT_FILE_DRY_RUN; + + STRV_FOREACH(name, names) { + if (!unit_name_is_valid(*name, UNIT_NAME_ANY)) + return -EINVAL; + + /* If root_dir is set, we don't care about kernel command line or generators. + * But if it is not set, we need to check for interference. */ + if (!root_dir) { + _cleanup_(install_info_clear) InstallInfo info = { + .name = *name, /* We borrow *name temporarily… */ + .install_mode = _INSTALL_MODE_INVALID, + }; + + r = unit_file_search(NULL, &info, &lp, 0); + if (r < 0) { + if (r != -ENOENT) + log_debug_errno(r, "Failed to look up unit %s, ignoring: %m", info.name); + } else if (info.install_mode == INSTALL_MODE_MASKED && + path_is_generator(&lp, info.path)) { + r = install_changes_add(changes, n_changes, + INSTALL_CHANGE_IS_MASKED_GENERATOR, info.name, info.path); + if (r < 0) + return r; + } + + TAKE_PTR(info.name); /* … and give it back here */ + } + + _cleanup_free_ char *path = path_make_absolute(*name, config_path); + if (!path) + return -ENOMEM; + + r = null_or_empty_path(path); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + if (r == 0) + continue; + + if (!GREEDY_REALLOC0(todo, n_todo + 2)) + return -ENOMEM; + + todo[n_todo] = strdup(*name); + if (!todo[n_todo]) + return -ENOMEM; + + n_todo++; + } + + strv_uniq(todo); + + r = 0; + STRV_FOREACH(i, todo) { + _cleanup_free_ char *path = NULL; + const char *rp; + + path = path_make_absolute(*i, config_path); + if (!path) + return -ENOMEM; + + if (!dry_run && unlink(path) < 0) { + if (errno != ENOENT) { + if (r >= 0) + r = -errno; + install_changes_add(changes, n_changes, -errno, path, NULL); + } + + continue; + } + + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, path, NULL); + if (q < 0) + return q; + + rp = skip_root(lp.root_dir, path); + q = mark_symlink_for_removal(&remove_symlinks_to, rp ?: path); + if (q < 0) + return q; + } + + q = 
remove_marked_symlinks(remove_symlinks_to, config_path, &lp, dry_run, changes, n_changes); + if (r >= 0) + r = q; + + return r; +} + +int unit_file_link( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **files, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_strv_free_ char **todo = NULL; + const char *config_path; + size_t n_todo = 0; + int r, q; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + config_path = (flags & UNIT_FILE_RUNTIME) ? lp.runtime_config : lp.persistent_config; + if (!config_path) + return -ENXIO; + + STRV_FOREACH(file, files) { + _cleanup_free_ char *full = NULL; + struct stat st; + char *fn; + + if (!path_is_absolute(*file)) + return install_changes_add(changes, n_changes, -EINVAL, *file, NULL); + + fn = basename(*file); + if (!unit_name_is_valid(fn, UNIT_NAME_ANY)) + return install_changes_add(changes, n_changes, -EUCLEAN, *file, NULL); + + full = path_join(lp.root_dir, *file); + if (!full) + return -ENOMEM; + + if (lstat(full, &st) < 0) + return install_changes_add(changes, n_changes, -errno, *file, NULL); + + r = stat_verify_regular(&st); + if (r < 0) + return install_changes_add(changes, n_changes, r, *file, NULL); + + r = in_search_path(&lp, *file); + if (r < 0) + return install_changes_add(changes, n_changes, r, *file, NULL); + if (r > 0) + /* A silent noop if the file is already in the search path. 
*/ + continue; + + r = underneath_search_path(&lp, *file); + if (r > 0) + r = -ETXTBSY; + if (r < 0) + return install_changes_add(changes, n_changes, r, *file, NULL); + + if (!GREEDY_REALLOC0(todo, n_todo + 2)) + return -ENOMEM; + + todo[n_todo] = strdup(*file); + if (!todo[n_todo]) + return -ENOMEM; + + n_todo++; + } + + strv_uniq(todo); + + r = 0; + STRV_FOREACH(i, todo) { + _cleanup_free_ char *new_path = NULL; + + new_path = path_make_absolute(basename(*i), config_path); + if (!new_path) + return -ENOMEM; + + q = create_symlink(&lp, *i, new_path, flags & UNIT_FILE_FORCE, changes, n_changes); + if (q < 0 && r >= 0) + r = q; + } + + return r; +} + +static int path_shall_revert(const LookupPaths *lp, const char *path) { + int r; + + assert(lp); + assert(path); + + /* Checks whether the path is one where the drop-in directories shall be removed. */ + + r = path_is_config(lp, path, true); + if (r != 0) + return r; + + r = path_is_control(lp, path); + if (r != 0) + return r; + + return path_is_transient(lp, path); +} + +int unit_file_revert( + RuntimeScope scope, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_set_free_free_ Set *remove_symlinks_to = NULL; + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_strv_free_ char **todo = NULL; + size_t n_todo = 0; + int r, q; + + /* Puts a unit file back into vendor state. This means: + * + * a) we remove all drop-in snippets added by the user ("config"), add to transient units + * ("transient"), and added via "systemctl set-property" ("control"), but not if the drop-in is + * generated ("generated"). + * + * c) if there's a vendor unit file (i.e. one in /usr) we remove any configured overriding unit files + * (i.e. in "config", but not in "transient" or "control" or even "generated"). + * + * We remove all that in both the runtime and the persistent directories, if that applies. 
+ */ + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + STRV_FOREACH(name, names) { + bool has_vendor = false; + + if (!unit_name_is_valid(*name, UNIT_NAME_ANY)) + return -EINVAL; + + STRV_FOREACH(p, lp.search_path) { + _cleanup_free_ char *path = NULL, *dropin = NULL; + struct stat st; + + path = path_make_absolute(*name, *p); + if (!path) + return -ENOMEM; + + r = RET_NERRNO(lstat(path, &st)); + if (r < 0) { + if (r != -ENOENT) + return install_changes_add(changes, n_changes, r, path, NULL); + } else if (S_ISREG(st.st_mode)) { + /* Check if there's a vendor version */ + r = path_is_vendor_or_generator(&lp, path); + if (r < 0) + return install_changes_add(changes, n_changes, r, path, NULL); + if (r > 0) + has_vendor = true; + } + + dropin = strjoin(path, ".d"); + if (!dropin) + return -ENOMEM; + + r = RET_NERRNO(lstat(dropin, &st)); + if (r < 0) { + if (r != -ENOENT) + return install_changes_add(changes, n_changes, r, dropin, NULL); + } else if (S_ISDIR(st.st_mode)) { + /* Remove the drop-ins */ + r = path_shall_revert(&lp, dropin); + if (r < 0) + return install_changes_add(changes, n_changes, r, dropin, NULL); + if (r > 0) { + if (!GREEDY_REALLOC0(todo, n_todo + 2)) + return -ENOMEM; + + todo[n_todo++] = TAKE_PTR(dropin); + } + } + } + + if (!has_vendor) + continue; + + /* OK, there's a vendor version, hence drop all configuration versions */ + STRV_FOREACH(p, lp.search_path) { + _cleanup_free_ char *path = NULL; + struct stat st; + + path = path_make_absolute(*name, *p); + if (!path) + return -ENOMEM; + + r = RET_NERRNO(lstat(path, &st)); + if (r < 0) { + if (r != -ENOENT) + return install_changes_add(changes, n_changes, r, path, NULL); + } else if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) { + r = path_is_config(&lp, path, true); + if (r < 0) + return install_changes_add(changes, n_changes, r, path, NULL); + if (r > 0) { + if (!GREEDY_REALLOC0(todo, n_todo + 2)) + return -ENOMEM; + + todo[n_todo++] = TAKE_PTR(path); + } + } + } + 
} + + strv_uniq(todo); + + r = 0; + STRV_FOREACH(i, todo) { + _cleanup_strv_free_ char **fs = NULL; + const char *rp; + + (void) get_files_in_directory(*i, &fs); + + q = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL); + if (q < 0 && q != -ENOENT && r >= 0) { + r = q; + continue; + } + + STRV_FOREACH(j, fs) { + _cleanup_free_ char *t = NULL; + + t = path_join(*i, *j); + if (!t) + return -ENOMEM; + + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, t, NULL); + if (q < 0) + return q; + } + + q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, *i, NULL); + if (q < 0) + return q; + + rp = skip_root(lp.root_dir, *i); + q = mark_symlink_for_removal(&remove_symlinks_to, rp ?: *i); + if (q < 0) + return q; + } + + q = remove_marked_symlinks(remove_symlinks_to, lp.runtime_config, &lp, false, changes, n_changes); + if (r >= 0) + r = q; + + q = remove_marked_symlinks(remove_symlinks_to, lp.persistent_config, &lp, false, changes, n_changes); + if (r >= 0) + r = q; + + return r; +} + +int unit_file_add_dependency( + RuntimeScope scope, + UnitFileFlags file_flags, + const char *root_dir, + char **names, + const char *target, + UnitDependency dep, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + InstallInfo *info, *target_info; + const char *config_path; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(target); + assert(IN_SET(dep, UNIT_WANTS, UNIT_REQUIRES)); + + if (!unit_name_is_valid(target, UNIT_NAME_ANY)) + return install_changes_add(changes, n_changes, -EUCLEAN, target, NULL); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + config_path = (file_flags & UNIT_FILE_RUNTIME) ? 
lp.runtime_config : lp.persistent_config; + if (!config_path) + return -ENXIO; + + r = install_info_discover_and_check(&ctx, &lp, target, SEARCH_FOLLOW_CONFIG_SYMLINKS, + &target_info, changes, n_changes); + if (r < 0) + return r; + + assert(target_info->install_mode == INSTALL_MODE_REGULAR); + + STRV_FOREACH(name, names) { + char ***l; + + r = install_info_discover_and_check(&ctx, &lp, *name, + SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, changes, n_changes); + if (r < 0) + return r; + + assert(info->install_mode == INSTALL_MODE_REGULAR); + + /* We didn't actually load anything from the unit + * file, but instead just add in our new symlink to + * create. */ + + if (dep == UNIT_WANTS) + l = &info->wanted_by; + else if (dep == UNIT_REQUIRES) + l = &info->required_by; + else + l = &info->upheld_by; + + strv_free(*l); + *l = strv_new(target_info->name); + if (!*l) + return -ENOMEM; + } + + return install_context_apply(&ctx, &lp, file_flags, config_path, + SEARCH_FOLLOW_CONFIG_SYMLINKS, changes, n_changes); +} + +static int do_unit_file_enable( + const LookupPaths *lp, + RuntimeScope scope, + UnitFileFlags flags, + const char *config_path, + char **names_or_paths, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + InstallInfo *info; + int r; + + STRV_FOREACH(name, names_or_paths) { + r = install_info_discover_and_check(&ctx, lp, *name, + SEARCH_LOAD | SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, changes, n_changes); + if (r < 0) + return r; + + assert(info->install_mode == INSTALL_MODE_REGULAR); + } + + /* This will return the number of symlink rules that were + supposed to be created, not the ones actually created. This + is useful to determine whether the passed units had any + installation data at all. 
*/ + + return install_context_apply(&ctx, lp, flags, config_path, + SEARCH_LOAD, changes, n_changes); +} + +int unit_file_enable( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names_or_paths, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + const char *config_path = config_path_from_flags(&lp, flags); + if (!config_path) + return -ENXIO; + + return do_unit_file_enable(&lp, scope, flags, config_path, names_or_paths, changes, n_changes); +} + +static int do_unit_file_disable( + const LookupPaths *lp, + RuntimeScope scope, + UnitFileFlags flags, + const char *config_path, + char **names, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + _cleanup_set_free_free_ Set *remove_symlinks_to = NULL; + InstallInfo *info; + bool has_install_info = false; + int r; + + STRV_FOREACH(name, names) { + if (!unit_name_is_valid(*name, UNIT_NAME_ANY)) + return install_changes_add(changes, n_changes, -EUCLEAN, *name, NULL); + + r = install_info_add(&ctx, *name, NULL, lp->root_dir, /* auxiliary= */ false, &info); + if (r >= 0) + r = install_info_traverse(&ctx, lp, info, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, NULL); + + if (r < 0) + return install_changes_add(changes, n_changes, r, *name, NULL); + + /* If we enable multiple units, some with install info and others without, + * the "empty [Install] section" warning is not shown. Let's make the behavior + * of disable align with that. 
*/ + has_install_info = has_install_info || install_info_has_rules(info) || install_info_has_also(info); + } + + r = install_context_mark_for_removal(&ctx, lp, &remove_symlinks_to, config_path, changes, n_changes); + if (r >= 0) + r = remove_marked_symlinks(remove_symlinks_to, config_path, lp, flags & UNIT_FILE_DRY_RUN, changes, n_changes); + + if (r < 0) + return r; + + /* The warning is shown only if it's a no-op */ + return install_changes_have_modification(*changes, *n_changes) || has_install_info; +} + +int unit_file_disable( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **files, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + const char *config_path = config_path_from_flags(&lp, flags); + if (!config_path) + return -ENXIO; + + return do_unit_file_disable(&lp, scope, flags, config_path, files, changes, n_changes); +} + +static int normalize_linked_files( + RuntimeScope scope, + const LookupPaths *lp, + char **names_or_paths, + char ***ret_names, + char ***ret_files) { + + /* This is similar to normalize_filenames()/normalize_names() in src/systemctl/, + * but operates on real unit names. For each argument we look up the actual path + * where the unit is found. This way linked units can be re-enabled successfully. 
*/ + + _cleanup_strv_free_ char **files = NULL, **names = NULL; + int r; + + STRV_FOREACH(a, names_or_paths) { + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + InstallInfo *i = NULL; + _cleanup_free_ char *n = NULL; + + r = path_extract_filename(*a, &n); + if (r < 0) + return r; + if (r == O_DIRECTORY) + return log_debug_errno(SYNTHETIC_ERRNO(EISDIR), + "Unexpected path to a directory \"%s\", refusing.", *a); + + if (!is_path(*a)) { + r = install_info_discover(&ctx, lp, n, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, &i, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed to discover unit \"%s\", operating on name: %m", n); + } + + r = strv_consume(&names, TAKE_PTR(n)); + if (r < 0) + return r; + + const char *p = NULL; + if (i && i->path && i->root) + /* Use startswith here, because we know that paths are normalized, and + * path_startswith() would give us a relative path, but we need an absolute path + * relative to i->root. + * + * In other words: /var/tmp/instroot.1234/etc/systemd/system/frobnicator.service + * is replaced by /etc/systemd/system/frobnicator.service, which is "absolute" + * in a sense, but only makes sense "relative" to /var/tmp/instroot.1234/. 
+ */ + p = startswith(i->path, i->root); + + r = strv_extend(&files, p ?: *a); + if (r < 0) + return r; + } + + *ret_names = TAKE_PTR(names); + *ret_files = TAKE_PTR(files); + return 0; +} + +int unit_file_reenable( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names_or_paths, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_strv_free_ char **names = NULL, **files = NULL; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + const char *config_path = config_path_from_flags(&lp, flags); + if (!config_path) + return -ENXIO; + + r = normalize_linked_files(scope, &lp, names_or_paths, &names, &files); + if (r < 0) + return r; + + /* First, we invoke the disable command with only the basename... */ + r = do_unit_file_disable(&lp, scope, flags, config_path, names, changes, n_changes); + if (r < 0) + return r; + + /* But the enable command with the full name */ + return do_unit_file_enable(&lp, scope, flags, config_path, files, changes, n_changes); +} + +int unit_file_set_default( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + const char *name, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + InstallInfo *info; + const char *new_path; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(name); + + if (unit_name_to_type(name) != UNIT_TARGET) /* this also validates the name */ + return -EINVAL; + if (streq(name, SPECIAL_DEFAULT_TARGET)) + return -EINVAL; + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + r = install_info_discover_and_check(&ctx, &lp, name, 0, &info, changes, n_changes); + if (r < 0) + return r; + + new_path = strjoina(lp.persistent_config, "/" 
SPECIAL_DEFAULT_TARGET); + return create_symlink(&lp, info->path, new_path, flags & UNIT_FILE_FORCE, changes, n_changes); +} + +int unit_file_get_default( + RuntimeScope scope, + const char *root_dir, + char **name) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + InstallInfo *info; + char *n; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(name); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + r = install_info_discover(&ctx, &lp, SPECIAL_DEFAULT_TARGET, SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, NULL, NULL); + if (r < 0) + return r; + + n = strdup(info->name); + if (!n) + return -ENOMEM; + + *name = n; + return 0; +} + +int unit_file_lookup_state( + RuntimeScope scope, + const LookupPaths *lp, + const char *name, + UnitFileState *ret) { + + _cleanup_(install_context_done) InstallContext ctx = { .scope = scope }; + InstallInfo *info; + UnitFileState state; + int r; + + assert(lp); + assert(name); + + if (!unit_name_is_valid(name, UNIT_NAME_ANY)) + return -EINVAL; + + r = install_info_discover(&ctx, lp, name, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, NULL, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to discover unit %s: %m", name); + + assert(IN_SET(info->install_mode, INSTALL_MODE_REGULAR, INSTALL_MODE_MASKED)); + log_debug("Found unit %s at %s (%s)", name, strna(info->path), + info->install_mode == INSTALL_MODE_REGULAR ? "regular file" : "mask"); + + /* Shortcut things, if the caller just wants to know if this unit exists. */ + if (!ret) + return 0; + + switch (info->install_mode) { + + case INSTALL_MODE_MASKED: + r = path_is_runtime(lp, info->path, true); + if (r < 0) + return r; + + state = r > 0 ? 
UNIT_FILE_MASKED_RUNTIME : UNIT_FILE_MASKED; + break; + + case INSTALL_MODE_REGULAR: + /* Check if the name we were querying is actually an alias */ + if (!streq(name, basename(info->path)) && !unit_name_is_valid(info->name, UNIT_NAME_INSTANCE)) { + state = UNIT_FILE_ALIAS; + break; + } + + r = path_is_generator(lp, info->path); + if (r < 0) + return r; + if (r > 0) { + state = UNIT_FILE_GENERATED; + break; + } + + r = path_is_transient(lp, info->path); + if (r < 0) + return r; + if (r > 0) { + state = UNIT_FILE_TRANSIENT; + break; + } + + /* Check if any of the Alias= symlinks have been created. + * We ignore other aliases, and only check those that would + * be created by systemctl enable for this unit. */ + r = find_symlinks_in_scope(scope, lp, info, true, &state); + if (r < 0) + return r; + if (r > 0) + break; + + /* Check if the file is known under other names. If it is, + * it might be in use. Report that as UNIT_FILE_INDIRECT. */ + r = find_symlinks_in_scope(scope, lp, info, false, &state); + if (r < 0) + return r; + if (r > 0) + state = UNIT_FILE_INDIRECT; + else { + if (install_info_has_rules(info)) + state = UNIT_FILE_DISABLED; + else if (install_info_has_also(info)) + state = UNIT_FILE_INDIRECT; + else + state = UNIT_FILE_STATIC; + } + + break; + + default: + assert_not_reached(); + } + + *ret = state; + return 0; +} + +int unit_file_get_state( + RuntimeScope scope, + const char *root_dir, + const char *name, + UnitFileState *ret) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(name); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + return unit_file_lookup_state(scope, &lp, name, ret); +} + +int unit_file_exists(RuntimeScope scope, const LookupPaths *lp, const char *name) { + _cleanup_(install_context_done) InstallContext c = { .scope = scope }; + int r; + + assert(lp); + assert(name); + + if (!unit_name_is_valid(name, UNIT_NAME_ANY)) + 
return -EINVAL; + + r = install_info_discover(&c, lp, name, 0, NULL, NULL, NULL); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + return 1; +} + +static int split_pattern_into_name_and_instances(const char *pattern, char **out_unit_name, char ***out_instances) { + _cleanup_strv_free_ char **instances = NULL; + _cleanup_free_ char *unit_name = NULL; + int r; + + assert(pattern); + assert(out_instances); + assert(out_unit_name); + + r = extract_first_word(&pattern, &unit_name, NULL, EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + + /* We handle the instances logic when unit name is extracted */ + if (pattern) { + /* We only create instances when a rule of templated unit + * is seen. A rule like enable foo@.service a b c will + * result in an array of (a, b, c) as instance names */ + if (!unit_name_is_valid(unit_name, UNIT_NAME_TEMPLATE)) + return -EINVAL; + + instances = strv_split(pattern, WHITESPACE); + if (!instances) + return -ENOMEM; + + *out_instances = TAKE_PTR(instances); + } + + *out_unit_name = TAKE_PTR(unit_name); + + return 0; +} + +static int presets_find_config(RuntimeScope scope, const char *root_dir, char ***files) { + static const char* const system_dirs[] = {CONF_PATHS("systemd/system-preset"), NULL}; + static const char* const user_dirs[] = {CONF_PATHS_USR("systemd/user-preset"), NULL}; + const char* const* dirs; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + + if (scope == RUNTIME_SCOPE_SYSTEM) + dirs = system_dirs; + else if (IN_SET(scope, RUNTIME_SCOPE_GLOBAL, RUNTIME_SCOPE_USER)) + dirs = user_dirs; + else + assert_not_reached(); + + return conf_files_list_strv(files, ".preset", root_dir, 0, dirs); +} + +static int read_presets(RuntimeScope scope, const char *root_dir, UnitFilePresets *presets) { + _cleanup_(unit_file_presets_done) UnitFilePresets ps = {}; + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(presets); + + r = 
presets_find_config(scope, root_dir, &files); + if (r < 0) + return r; + + STRV_FOREACH(p, files) { + _cleanup_fclose_ FILE *f = NULL; + int n = 0; + + f = fopen(*p, "re"); + if (!f) { + if (errno == ENOENT) + continue; + + return -errno; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + _cleanup_(unit_file_preset_rule_done) UnitFilePresetRule rule = {}; + const char *parameter; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + n++; + + if (isempty(line)) + continue; + if (strchr(COMMENTS, line[0])) + continue; + + parameter = first_word(line, "enable"); + if (parameter) { + char *unit_name; + char **instances = NULL; + + /* Unit_name will remain the same as parameter when no instances are specified */ + r = split_pattern_into_name_and_instances(parameter, &unit_name, &instances); + if (r < 0) { + log_syntax(NULL, LOG_WARNING, *p, n, r, "Couldn't parse line '%s'. Ignoring.", line); + continue; + } + + rule = (UnitFilePresetRule) { + .pattern = unit_name, + .action = PRESET_ENABLE, + .instances = instances, + }; + } + + parameter = first_word(line, "disable"); + if (parameter) { + char *pattern; + + pattern = strdup(parameter); + if (!pattern) + return -ENOMEM; + + rule = (UnitFilePresetRule) { + .pattern = pattern, + .action = PRESET_DISABLE, + }; + } + + parameter = first_word(line, "ignore"); + if (parameter) { + char *pattern; + + pattern = strdup(parameter); + if (!pattern) + return -ENOMEM; + + rule = (UnitFilePresetRule) { + .pattern = pattern, + .action = PRESET_IGNORE, + }; + } + + if (rule.action) { + if (!GREEDY_REALLOC(ps.rules, ps.n_rules + 1)) + return -ENOMEM; + + ps.rules[ps.n_rules++] = TAKE_STRUCT(rule); + continue; + } + + log_syntax(NULL, LOG_WARNING, *p, n, 0, "Couldn't parse line '%s'. 
Ignoring.", line); + } + } + + ps.initialized = true; + *presets = TAKE_STRUCT(ps); + + return 0; +} +/* Matches unit_name against a preset rule that carries an explicit instance list. Returns 0 (no match) when ret is NULL, when the rule has no instances, or when the template form of unit_name differs from rule.pattern. If unit_name is itself a template, one concrete name per listed instance is stored in ret (ownership passes to the caller) and 1 is returned; if it is an instance name, 1 is returned only when that instance is listed. Negative results of the unit name helpers and of strv_consume() are returned unchanged. */ +static int pattern_match_multiple_instances( + const UnitFilePresetRule rule, + const char *unit_name, + char ***ret) { + + _cleanup_free_ char *templated_name = NULL; + int r; + + /* If no ret is needed or the rule itself does not have instances + * initialized, we return not matching */ + if (!ret || !rule.instances) + return 0; + + r = unit_name_template(unit_name, &templated_name); + if (r < 0) + return r; + if (!streq(rule.pattern, templated_name)) + return 0; + + /* Compose a list of specified instances when unit name is a template */ + if (unit_name_is_valid(unit_name, UNIT_NAME_TEMPLATE)) { + _cleanup_strv_free_ char **out_strv = NULL; + + STRV_FOREACH(iter, rule.instances) { + _cleanup_free_ char *name = NULL; + + r = unit_name_replace_instance(unit_name, *iter, &name); + if (r < 0) + return r; + + r = strv_consume(&out_strv, TAKE_PTR(name)); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(out_strv); + return 1; + } else { + /* We now know the input unit name is an instance name */ + _cleanup_free_ char *instance_name = NULL; + + r = unit_name_to_instance(unit_name, &instance_name); + if (r < 0) + return r; + + if (strv_find(rule.instances, instance_name)) + return 1; + } + return 0; +} +/* Determines the preset action for name. 
Rules are checked in order and the first one that matches, either through its instance list or through fnmatch() on its pattern, decides. Returns -EINVAL if name is not a valid unit name, otherwise PRESET_ENABLE, PRESET_DISABLE or PRESET_IGNORE; PRESET_ENABLE is also the result when no rule matches. Only a positive result of pattern_match_multiple_instances() counts as a match, so its errors fall through to the fnmatch() check. instance_name_list may be NULL. */ +static int query_presets(const char *name, const UnitFilePresets *presets, char ***instance_name_list) { + PresetAction action = PRESET_UNKNOWN; + + if (!unit_name_is_valid(name, UNIT_NAME_ANY)) + return -EINVAL; + + for (size_t i = 0; i < presets->n_rules; i++) + if (pattern_match_multiple_instances(presets->rules[i], name, instance_name_list) > 0 || + fnmatch(presets->rules[i].pattern, name, FNM_NOESCAPE) == 0) { + action = presets->rules[i].action; + break; + } + + switch (action) { + case PRESET_UNKNOWN: + log_debug("Preset files don't specify rule for %s. 
Enabling.", name); + return PRESET_ENABLE; + case PRESET_ENABLE: + if (instance_name_list && *instance_name_list) + STRV_FOREACH(s, *instance_name_list) + log_debug("Preset files say enable %s.", *s); + else + log_debug("Preset files say enable %s.", name); + return PRESET_ENABLE; + case PRESET_DISABLE: + log_debug("Preset files say disable %s.", name); + return PRESET_DISABLE; + case PRESET_IGNORE: + log_debug("Preset files say ignore %s.", name); + return PRESET_IGNORE; + default: + assert_not_reached(); + } +} +/* Returns the preset action for name. If cached is NULL a local preset set (declared with the unit_file_presets_done cleanup handler) is used instead. A set that is not yet marked initialized is loaded with read_presets() first, so a caller-supplied cache is filled on first use. Negative results of read_presets() and query_presets() are returned as is. */ +PresetAction unit_file_query_preset(RuntimeScope scope, const char *root_dir, const char *name, UnitFilePresets *cached) { + _cleanup_(unit_file_presets_done) UnitFilePresets tmp = {}; + int r; + + if (!cached) + cached = &tmp; + if (!cached->initialized) { + r = read_presets(scope, root_dir, cached); + if (r < 0) + return r; + } + + return query_presets(name, cached, NULL); +} +/* Applies the two prepared contexts. Unless mode is enable-only, symlinks belonging to the units in minus are marked and removed. Unless mode is disable-only, the units in plus are installed, even if the removal step reported an error. Returns the removal error if there was one, else the installation error, else the sum of both non-negative results. The files parameter is not referenced in this body. 
*/ +static int execute_preset( + UnitFileFlags file_flags, + InstallContext *plus, + InstallContext *minus, + const LookupPaths *lp, + const char *config_path, + char **files, + UnitFilePresetMode mode, + InstallChange **changes, + size_t *n_changes) { + + int r; + + assert(plus); + assert(minus); + assert(lp); + assert(config_path); + + if (mode != UNIT_FILE_PRESET_ENABLE_ONLY) { + _cleanup_set_free_free_ Set *remove_symlinks_to = NULL; + + r = install_context_mark_for_removal(minus, lp, &remove_symlinks_to, config_path, changes, n_changes); + if (r < 0) + return r; + + r = remove_marked_symlinks(remove_symlinks_to, config_path, lp, false, changes, n_changes); + } else + r = 0; + + if (mode != UNIT_FILE_PRESET_DISABLE_ONLY) { + int q; + + /* Returns number of symlinks that were supposed to be installed. 
*/ + q = install_context_apply(plus, lp, + file_flags | UNIT_FILE_IGNORE_AUXILIARY_FAILURE, + config_path, + SEARCH_LOAD, changes, n_changes); + if (r >= 0) { + if (q < 0) + r = q; + else + r += q; + } + } + + return r; +} +/* Sorts one unit into plus or minus according to the preset rules. Units already present in either context, and names that resolve to a differently named unit (aliases), are skipped with 0. On PRESET_ENABLE every name of the instance list, or name itself when there is no list, is added to plus; on PRESET_DISABLE name is added to minus. For any other action neither context is touched and the action value is returned. */ +static int preset_prepare_one( + RuntimeScope scope, + InstallContext *plus, + InstallContext *minus, + LookupPaths *lp, + const char *name, + const UnitFilePresets *presets, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(install_context_done) InstallContext tmp = { .scope = scope }; + _cleanup_strv_free_ char **instance_name_list = NULL; + InstallInfo *info; + int r; + + if (install_info_find(plus, name) || install_info_find(minus, name)) + return 0; +/* Resolve the name in a scratch context first, so that aliases are detected without touching plus or minus. 
*/ + r = install_info_discover(&tmp, lp, name, SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, changes, n_changes); + if (r < 0) + return r; + if (!streq(name, info->name)) { + log_debug("Skipping %s because it is an alias for %s.", name, info->name); + return 0; + } + + r = query_presets(name, presets, &instance_name_list); + if (r < 0) + return r; + + if (r == PRESET_ENABLE) { + if (instance_name_list) + STRV_FOREACH(s, instance_name_list) { + r = install_info_discover_and_check(plus, lp, *s, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, changes, n_changes); + if (r < 0) + return r; + } + else { + r = install_info_discover_and_check(plus, lp, name, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, changes, n_changes); + if (r < 0) + return r; + } + + } else if (r == PRESET_DISABLE) + r = install_info_discover(minus, lp, name, SEARCH_FOLLOW_CONFIG_SYMLINKS, + &info, changes, n_changes); + + return r; +} +/* Applies the preset rules to each unit listed in names and then executes the resulting enable and disable sets via execute_preset(). */ +int unit_file_preset( + RuntimeScope scope, + UnitFileFlags file_flags, + const char *root_dir, + char **names, + UnitFilePresetMode mode, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(install_context_done) InstallContext plus = {}, minus = {}; + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_(unit_file_presets_done) UnitFilePresets presets = {}; 
+ const char *config_path; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(mode < _UNIT_FILE_PRESET_MODE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; +/* Symlinks go to the runtime config directory when UNIT_FILE_RUNTIME is set, otherwise to the persistent one; -ENXIO is returned if the selected directory is unset. */ + config_path = (file_flags & UNIT_FILE_RUNTIME) ? lp.runtime_config : lp.persistent_config; + if (!config_path) + return -ENXIO; + + r = read_presets(scope, root_dir, &presets); + if (r < 0) + return r; +/* Any failure while preparing a single unit aborts the whole operation before anything is executed. */ + STRV_FOREACH(name, names) { + r = preset_prepare_one(scope, &plus, &minus, &lp, *name, &presets, changes, n_changes); + if (r < 0) + return r; + } + + return execute_preset(file_flags, &plus, &minus, &lp, config_path, names, mode, changes, n_changes); +} +/* Same as unit_file_preset(), except that the units are not passed in: every unit file found in the directories of the lookup search path is considered. */ +int unit_file_preset_all( + RuntimeScope scope, + UnitFileFlags file_flags, + const char *root_dir, + UnitFilePresetMode mode, + InstallChange **changes, + size_t *n_changes) { + + _cleanup_(install_context_done) InstallContext plus = {}, minus = {}; + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_(unit_file_presets_done) UnitFilePresets presets = {}; + const char *config_path = NULL; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(mode < _UNIT_FILE_PRESET_MODE_MAX); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + config_path = (file_flags & UNIT_FILE_RUNTIME) ? 
lp.runtime_config : lp.persistent_config; + if (!config_path) + return -ENXIO; + + r = read_presets(scope, root_dir, &presets); + if (r < 0) + return r; +/* Walk every directory of the search path; directories that do not exist are skipped, any other opendir() failure is fatal. */ + STRV_FOREACH(i, lp.search_path) { + _cleanup_closedir_ DIR *d = NULL; + + d = opendir(*i); + if (!d) { + if (errno == ENOENT) + continue; + + return -errno; + } + + FOREACH_DIRENT(de, d, return -errno) { +/* Only entries with a valid unit name that are regular files or symlinks are considered. */ + if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY)) + continue; + + if (!IN_SET(de->d_type, DT_LNK, DT_REG)) + continue; + + r = preset_prepare_one(scope, &plus, &minus, &lp, de->d_name, &presets, changes, n_changes); + if (r < 0 && + !IN_SET(r, -EEXIST, -ERFKILL, -EADDRNOTAVAIL, -EBADSLT, -EIDRM, -EUCLEAN, -ELOOP, -ENOENT, -EUNATCH, -EXDEV)) + /* Ignore generated/transient/missing/invalid units when applying preset, propagate other errors. + * Coordinate with install_changes_dump() above. */ + return r; + } + } + + return execute_preset(file_flags, &plus, &minus, &lp, config_path, NULL, mode, changes, n_changes); +} +/* Frees a UnitFileList entry together with its path string. NULL is accepted and returned; otherwise the result of mfree() is returned. */ +static UnitFileList* unit_file_list_free(UnitFileList *f) { + if (!f) + return NULL; + + free(f->path); + return mfree(f); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(UnitFileList*, unit_file_list_free); +/* Hash ops for string keys (string_hash_func, string_compare_func) with unit_file_list_free as destructor for the UnitFileList values. 
*/ +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + unit_file_list_hash_ops_free, + char, + string_hash_func, + string_compare_func, + UnitFileList, + unit_file_list_free); +/* Adds one UnitFileList entry to h for each unit file found in the directories of the search path, keyed by file name. Entries can be restricted by shell-style patterns on the name and by a list of state names; an empty list means no restriction. Returns 0 on success or a negative error. */ +int unit_file_get_list( + RuntimeScope scope, + const char *root_dir, + Hashmap *h, + char **states, + char **patterns) { + + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + int r; + + assert(scope >= 0); + assert(scope < _RUNTIME_SCOPE_MAX); + assert(h); + + r = lookup_paths_init(&lp, scope, 0, root_dir); + if (r < 0) + return r; + + STRV_FOREACH(dirname, lp.search_path) { + _cleanup_closedir_ DIR *d = NULL; +/* Missing directories are skipped silently, ENOTDIR and EACCES are logged at debug level and skipped, anything else is fatal. */ + d = opendir(*dirname); + if (!d) { + if (errno == ENOENT) + continue; + if (IN_SET(errno, ENOTDIR, EACCES)) { + log_debug_errno(errno, "Failed to open \"%s\": %m", *dirname); + continue; + } + + return -errno; + } + + 
FOREACH_DIRENT(de, d, return -errno) { + _cleanup_(unit_file_list_freep) UnitFileList *f = NULL; + + if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY)) + continue; + + if (!strv_fnmatch_or_empty(patterns, de->d_name, FNM_NOESCAPE)) + continue; +/* A name that is already present in h is kept as is; this entry is skipped. */ + if (hashmap_get(h, de->d_name)) + continue; + + if (!IN_SET(de->d_type, DT_LNK, DT_REG)) + continue; + + f = new0(UnitFileList, 1); + if (!f) + return -ENOMEM; + + f->path = path_make_absolute(de->d_name, *dirname); + if (!f->path) + return -ENOMEM; +/* A failed state lookup does not abort the listing; the entry is reported as UNIT_FILE_BAD. */ + r = unit_file_lookup_state(scope, &lp, de->d_name, &f->state); + if (r < 0) + f->state = UNIT_FILE_BAD; + + if (!strv_isempty(states) && + !strv_contains(states, unit_file_state_to_string(f->state))) + continue; +/* The hashmap key is derived from f->path via basename(). */ + r = hashmap_put(h, basename(f->path), f); + if (r < 0) + return r; + + f = NULL; /* prevent cleanup */ + } + } + + return 0; +} +/* String names for the UnitFileState values, consumed by the lookup functions defined via DEFINE_STRING_TABLE_LOOKUP() below. */ +static const char* const unit_file_state_table[_UNIT_FILE_STATE_MAX] = { + [UNIT_FILE_ENABLED] = "enabled", + [UNIT_FILE_ENABLED_RUNTIME] = "enabled-runtime", + [UNIT_FILE_LINKED] = "linked", + [UNIT_FILE_LINKED_RUNTIME] = "linked-runtime", + [UNIT_FILE_ALIAS] = "alias", + [UNIT_FILE_MASKED] = "masked", + [UNIT_FILE_MASKED_RUNTIME] = "masked-runtime", + [UNIT_FILE_STATIC] = "static", + [UNIT_FILE_DISABLED] = "disabled", + [UNIT_FILE_INDIRECT] = "indirect", + [UNIT_FILE_GENERATED] = "generated", + [UNIT_FILE_TRANSIENT] = "transient", + [UNIT_FILE_BAD] = "bad", +}; + +DEFINE_STRING_TABLE_LOOKUP(unit_file_state, UnitFileState); +/* String names for the non-negative InstallChangeType values. 
*/ +static const char* const install_change_type_table[_INSTALL_CHANGE_TYPE_MAX] = { + [INSTALL_CHANGE_SYMLINK] = "symlink", + [INSTALL_CHANGE_UNLINK] = "unlink", + [INSTALL_CHANGE_IS_MASKED] = "masked", + [INSTALL_CHANGE_IS_MASKED_GENERATOR] = "masked by generator", + [INSTALL_CHANGE_IS_DANGLING] = "dangling", + [INSTALL_CHANGE_DESTINATION_NOT_PRESENT] = "destination not present", + [INSTALL_CHANGE_AUXILIARY_FAILED] = "auxiliary unit failed", +}; + +DEFINE_STRING_TABLE_LOOKUP(install_change_type, 
InstallChangeType); +/* String names for the UnitFilePresetMode values. */ +static const char* const unit_file_preset_mode_table[_UNIT_FILE_PRESET_MODE_MAX] = { + [UNIT_FILE_PRESET_FULL] = "full", + [UNIT_FILE_PRESET_ENABLE_ONLY] = "enable-only", + [UNIT_FILE_PRESET_DISABLE_ONLY] = "disable-only", +}; + +DEFINE_STRING_TABLE_LOOKUP(unit_file_preset_mode, UnitFilePresetMode); diff --git a/src/shared/install.h b/src/shared/install.h new file mode 100644 index 0000000..bc0c6db --- /dev/null +++ b/src/shared/install.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef enum UnitFilePresetMode UnitFilePresetMode; +typedef enum InstallChangeType InstallChangeType; +typedef enum UnitFileFlags UnitFileFlags; +typedef enum InstallMode InstallMode; +typedef struct InstallChange InstallChange; +typedef struct UnitFileList UnitFileList; +typedef struct InstallInfo InstallInfo; + +#include "hashmap.h" +#include "macro.h" +#include "path-lookup.h" +#include "strv.h" +#include "unit-file.h" +#include "unit-name.h" +/* Selects which part of a preset run is carried out: enabling and disabling, enabling only, or disabling only. */ +enum UnitFilePresetMode { + UNIT_FILE_PRESET_FULL, + UNIT_FILE_PRESET_ENABLE_ONLY, + UNIT_FILE_PRESET_DISABLE_ONLY, + _UNIT_FILE_PRESET_MODE_MAX, + _UNIT_FILE_PRESET_MODE_INVALID = -EINVAL, +}; +/* Kinds of entries recorded in an InstallChange list. The enum is sized so that negative errno values down to -ERRNO_MAX fit as well. 
*/ +enum InstallChangeType { + INSTALL_CHANGE_SYMLINK, + INSTALL_CHANGE_UNLINK, + INSTALL_CHANGE_IS_MASKED, + INSTALL_CHANGE_IS_MASKED_GENERATOR, + INSTALL_CHANGE_IS_DANGLING, + INSTALL_CHANGE_DESTINATION_NOT_PRESENT, + INSTALL_CHANGE_AUXILIARY_FAILED, + _INSTALL_CHANGE_TYPE_MAX, + _INSTALL_CHANGE_INVALID = -EINVAL, + _INSTALL_CHANGE_ERRNO_MAX = -ERRNO_MAX, /* Ensure this type covers the whole negative errno range */ +}; +/* True for every value from -ERRNO_MAX up to, but not including, _INSTALL_CHANGE_TYPE_MAX. */ +static inline bool INSTALL_CHANGE_TYPE_VALID(InstallChangeType t) { + return t >= _INSTALL_CHANGE_ERRNO_MAX && t < _INSTALL_CHANGE_TYPE_MAX; +} +/* Bit flags modifying unit file operations. _UNIT_FILE_FLAGS_MASK_PUBLIC collects the three bits that are marked as public D-Bus API. */ +enum UnitFileFlags { + UNIT_FILE_RUNTIME = 1 << 0, /* Public API via DBUS, do not change */ + UNIT_FILE_FORCE = 1 << 1, /* Public API via DBUS, do not change */ + UNIT_FILE_PORTABLE = 1 << 2, /* Public API via 
DBUS, do not change */ + UNIT_FILE_DRY_RUN = 1 << 3, + UNIT_FILE_IGNORE_AUXILIARY_FAILURE = 1 << 4, + _UNIT_FILE_FLAGS_MASK_PUBLIC = UNIT_FILE_RUNTIME|UNIT_FILE_PORTABLE|UNIT_FILE_FORCE, +}; + +/* type can be either one of the INSTALL_CHANGE_SYMLINK, INSTALL_CHANGE_UNLINK, … listed above, or a negative + * errno value. + * + * If source is specified, it should be the contents of the path symlink. In case of an error, source should + * be the existing symlink contents or NULL. */ +struct InstallChange { + int type; /* INSTALL_CHANGE_SYMLINK, … if non-negative (INSTALL_CHANGE_SYMLINK is 0), negative errno otherwise */ + char *path; + char *source; +}; +/* True if at least one of the n_changes entries is of type INSTALL_CHANGE_SYMLINK or INSTALL_CHANGE_UNLINK. */ +static inline bool install_changes_have_modification(const InstallChange* changes, size_t n_changes) { + for (size_t i = 0; i < n_changes; i++) + if (IN_SET(changes[i].type, INSTALL_CHANGE_SYMLINK, INSTALL_CHANGE_UNLINK)) + return true; + return false; +} +/* One unit file as reported by unit_file_get_list(): its path and its state. 
*/ +struct UnitFileList { + char *path; + UnitFileState state; +}; + +enum InstallMode { + INSTALL_MODE_REGULAR, + INSTALL_MODE_LINKED, + INSTALL_MODE_ALIAS, + INSTALL_MODE_MASKED, + _INSTALL_MODE_MAX, + _INSTALL_MODE_INVALID = -EINVAL, +}; + +struct InstallInfo { + char *name; + char *path; + char *root; + + char **aliases; + char **wanted_by; + char **required_by; + char **upheld_by; + char **also; + + char *default_instance; + char *symlink_target; + + InstallMode install_mode; + bool auxiliary; +}; + +int unit_file_enable( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names_or_paths, + InstallChange **changes, + size_t *n_changes); +int unit_file_disable( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes); +int unit_file_reenable( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names_or_paths, + InstallChange **changes, + size_t *n_changes); +int unit_file_preset( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + 
UnitFilePresetMode mode, + InstallChange **changes, + size_t *n_changes); +int unit_file_preset_all( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + UnitFilePresetMode mode, + InstallChange **changes, + size_t *n_changes); +int unit_file_mask( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes); +int unit_file_unmask( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes); +int unit_file_link( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **files, + InstallChange **changes, + size_t *n_changes); +int unit_file_revert( + RuntimeScope scope, + const char *root_dir, + char **names, + InstallChange **changes, + size_t *n_changes); +int unit_file_set_default( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + const char *name, + InstallChange **changes, + size_t *n_changes); +int unit_file_get_default( + RuntimeScope scope, + const char *root_dir, + char **name); +int unit_file_add_dependency( + RuntimeScope scope, + UnitFileFlags flags, + const char *root_dir, + char **names, + const char *target, + UnitDependency dep, + InstallChange **changes, + size_t *n_changes); + +int unit_file_lookup_state( + RuntimeScope scope, + const LookupPaths *paths, + const char *name, + UnitFileState *ret); + +int unit_file_get_state(RuntimeScope scope, const char *root_dir, const char *filename, UnitFileState *ret); +int unit_file_exists(RuntimeScope scope, const LookupPaths *paths, const char *name); + +int unit_file_get_list(RuntimeScope scope, const char *root_dir, Hashmap *h, char **states, char **patterns); + +extern const struct hash_ops unit_file_list_hash_ops_free; + +InstallChangeType install_changes_add(InstallChange **changes, size_t *n_changes, InstallChangeType type, const char *path, const char *source); +void install_changes_free(InstallChange 
*changes, size_t n_changes); +void install_changes_dump(int r, const char *verb, const InstallChange *changes, size_t n_changes, bool quiet); + +int unit_file_verify_alias( + const InstallInfo *info, + const char *dst, + char **ret_dst, + InstallChange **changes, + size_t *n_changes); + +typedef struct UnitFilePresetRule UnitFilePresetRule; +/* A parsed set of preset rules. initialized tells unit_file_query_preset() whether the rules have been loaded already. */ +typedef struct { + UnitFilePresetRule *rules; + size_t n_rules; + bool initialized; +} UnitFilePresets; +/* Result of a preset lookup. PRESET_UNKNOWN is the zero value. The enum is sized so that negative errno values down to -ERRNO_MAX fit as well. */ +typedef enum PresetAction { + PRESET_UNKNOWN, + PRESET_ENABLE, + PRESET_DISABLE, + PRESET_IGNORE, + _PRESET_ACTION_MAX, + _PRESET_ACTION_INVALID = -EINVAL, + _PRESET_ACTION_ERRNO_MAX = -ERRNO_MAX, /* Ensure this type covers the whole negative errno range */ +} PresetAction; + +const char *preset_action_past_tense_to_string(PresetAction action); + +void unit_file_presets_done(UnitFilePresets *p); +PresetAction unit_file_query_preset(RuntimeScope scope, const char *root_dir, const char *name, UnitFilePresets *cached); + +const char *unit_file_state_to_string(UnitFileState s) _const_; +UnitFileState unit_file_state_from_string(const char *s) _pure_; +/* from_string conversion is unreliable because of the overlap between -EPERM and -1 for error. 
*/ + +const char *install_change_type_to_string(InstallChangeType t) _const_; +InstallChangeType install_change_type_from_string(const char *s) _pure_; + +const char *unit_file_preset_mode_to_string(UnitFilePresetMode m) _const_; +UnitFilePresetMode unit_file_preset_mode_from_string(const char *s) _pure_; diff --git a/src/shared/ip-protocol-list.c b/src/shared/ip-protocol-list.c new file mode 100644 index 0000000..14155b6 --- /dev/null +++ b/src/shared/ip-protocol-list.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "ip-protocol-list.h" +#include "macro.h" +#include "parse-util.h" +#include "string-util.h" + +static const struct ip_protocol_name* lookup_ip_protocol(register const char *str, register GPERF_LEN_TYPE len); + +#include "ip-protocol-from-name.h" +#include "ip-protocol-to-name.h" +/* Returns the ip_protocol_names entry for protocol number id, or NULL if id is negative or not smaller than the number of elements of that table. */ +const char *ip_protocol_to_name(int id) { + + if (id < 0) + return NULL; + + if ((size_t) id >= ELEMENTSOF(ip_protocol_names)) + return NULL; + + return ip_protocol_names[id]; +} +/* Looks name up with lookup_ip_protocol() and returns the id of the entry found, or -EINVAL if there is none. name must not be NULL. */ +int ip_protocol_from_name(const char *name) { + const struct ip_protocol_name *sc; + + assert(name); + + sc = lookup_ip_protocol(name, strlen(name)); + if (!sc) + return -EINVAL; + + return sc->id; +} +/* Parses a protocol given either by name or as a number. An empty string yields IPPROTO_IP. The name is tried as written and then ASCII-lowercased; after that the string is parsed with safe_atoi(), whose errors are returned, and negative numbers give -ERANGE. Unless relaxed is set, a number without an entry in the name table is rejected with -EPROTONOSUPPORT. 
*/ +int parse_ip_protocol_full(const char *s, bool relaxed) { + int r, p; + + assert(s); + + if (isempty(s)) + return IPPROTO_IP; + + /* People commonly use lowercase protocol names, which we can look up very quickly, so let's try that + * first. */ + r = ip_protocol_from_name(s); + if (r >= 0) + return r; + + /* Do not use strdupa() here, as the input string may come from command line or config files. */ + _cleanup_free_ char *t = strdup(s); + if (!t) + return -ENOMEM; + + r = ip_protocol_from_name(ascii_strlower(t)); + if (r >= 0) + return r; + + r = safe_atoi(t, &p); + if (r < 0) + return r; + if (p < 0) + return -ERANGE; + + /* If @relaxed, we don't check that we have a name for the protocol. 
*/ + if (!relaxed && !ip_protocol_to_name(p)) + return -EPROTONOSUPPORT; + + return p; +} +/* Name of protocol id if it is IPPROTO_TCP or IPPROTO_UDP, NULL for everything else. */ +const char *ip_protocol_to_tcp_udp(int id) { + return IN_SET(id, IPPROTO_TCP, IPPROTO_UDP) ? + ip_protocol_to_name(id) : NULL; +} +/* Protocol number for ip_protocol if it resolves to IPPROTO_TCP or IPPROTO_UDP, -EINVAL for everything else including unknown names. */ +int ip_protocol_from_tcp_udp(const char *ip_protocol) { + int id = ip_protocol_from_name(ip_protocol); + return IN_SET(id, IPPROTO_TCP, IPPROTO_UDP) ? id : -EINVAL; +} diff --git a/src/shared/ip-protocol-list.h b/src/shared/ip-protocol-list.h new file mode 100644 index 0000000..a0875ef --- /dev/null +++ b/src/shared/ip-protocol-list.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +const char *ip_protocol_to_name(int id); +int ip_protocol_from_name(const char *name); +int parse_ip_protocol_full(const char *s, bool relaxed); +static inline int parse_ip_protocol(const char *s) { + return parse_ip_protocol_full(s, false); +} + +const char *ip_protocol_to_tcp_udp(int id); +int ip_protocol_from_tcp_udp(const char *ip_protocol); diff --git a/src/shared/ip-protocol-to-name.awk b/src/shared/ip-protocol-to-name.awk new file mode 100644 index 0000000..a0671e7 --- /dev/null +++ b/src/shared/ip-protocol-to-name.awk @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +BEGIN{ + print "static const char* const ip_protocol_names[] = { " +} +!/HOPOPTS/ { + printf " [IPPROTO_%s] = \"%s\",\n", $1, tolower($1) +} +END{ + print "};" +} diff --git a/src/shared/ipvlan-util.c b/src/shared/ipvlan-util.c new file mode 100644 index 0000000..1f2e2ff --- /dev/null +++ b/src/shared/ipvlan-util.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "ipvlan-util.h" +#include "string-table.h" +/* String names for the IPVlanMode values. 
*/ +static const char* const ipvlan_mode_table[_NETDEV_IPVLAN_MODE_MAX] = { + [NETDEV_IPVLAN_MODE_L2] = "L2", + [NETDEV_IPVLAN_MODE_L3] = "L3", + [NETDEV_IPVLAN_MODE_L3S] = "L3S", +}; + +DEFINE_STRING_TABLE_LOOKUP(ipvlan_mode, IPVlanMode); + +static const char* const 
ipvlan_flags_table[_NETDEV_IPVLAN_FLAGS_MAX] = { + [NETDEV_IPVLAN_FLAGS_BRIGDE] = "bridge", + [NETDEV_IPVLAN_FLAGS_PRIVATE] = "private", + [NETDEV_IPVLAN_FLAGS_VEPA] = "vepa", +}; + +DEFINE_STRING_TABLE_LOOKUP(ipvlan_flags, IPVlanFlags); diff --git a/src/shared/ipvlan-util.h b/src/shared/ipvlan-util.h new file mode 100644 index 0000000..a475b37 --- /dev/null +++ b/src/shared/ipvlan-util.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "macro.h" +/* IPVLAN modes; the enumerators take their values from the IPVLAN_MODE_L2, IPVLAN_MODE_L3 and IPVLAN_MODE_L3S constants. */ +typedef enum IPVlanMode { + NETDEV_IPVLAN_MODE_L2 = IPVLAN_MODE_L2, + NETDEV_IPVLAN_MODE_L3 = IPVLAN_MODE_L3, + NETDEV_IPVLAN_MODE_L3S = IPVLAN_MODE_L3S, + _NETDEV_IPVLAN_MODE_MAX, + _NETDEV_IPVLAN_MODE_INVALID = -EINVAL, +} IPVlanMode; +/* IPVLAN flags. The bridge enumerator has no explicit value and is therefore 0. NOTE(review): the identifier is spelled BRIGDE; the string table in ipvlan-util.c uses the same spelling, so it must only be renamed in both places at once. 
*/ +typedef enum IPVlanFlags { + NETDEV_IPVLAN_FLAGS_BRIGDE, + NETDEV_IPVLAN_FLAGS_PRIVATE = IPVLAN_F_PRIVATE, + NETDEV_IPVLAN_FLAGS_VEPA = IPVLAN_F_VEPA, + _NETDEV_IPVLAN_FLAGS_MAX, + _NETDEV_IPVLAN_FLAGS_INVALID = -EINVAL, +} IPVlanFlags; + +const char *ipvlan_mode_to_string(IPVlanMode d) _const_; +IPVlanMode ipvlan_mode_from_string(const char *d) _pure_; + +const char *ipvlan_flags_to_string(IPVlanFlags d) _const_; +IPVlanFlags ipvlan_flags_from_string(const char *d) _pure_; diff --git a/src/shared/journal-file-util.c b/src/shared/journal-file-util.c new file mode 100644 index 0000000..e444a2b --- /dev/null +++ b/src/shared/journal-file-util.c @@ -0,0 +1,534 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "chattr-util.h" +#include "copy.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "journal-authenticate.h" +#include "journal-file-util.h" +#include "path-util.h" +#include "random-util.h" +#include "set.h" +#include "stat-util.h" +#include "sync-util.h" +/* PAYLOAD_BUFFER_SIZE (16 KiB) sizes the on-stack HashItem buffer used while scanning the data hash table. MINIMUM_HOLE_SIZE (512 KiB) is the smallest range for which a hole is punched. */ +#define PAYLOAD_BUFFER_SIZE (16U * 1024U) +#define MINIMUM_HOLE_SIZE (1U * 1024U * 1024U / 2U) +/* Punches a hole over the space between the end of the tail object and the file size recorded in f->last_stat, provided that range is at least MINIMUM_HOLE_SIZE. Returns 0 on success or when nothing was done, -EOPNOTSUPP if hole punching is not supported, or another negative errno. */ +static int journal_file_end_punch_hole(JournalFile *f) { + uint64_t p, sz; + int r; + + r = 
journal_file_tail_end_by_pread(f, &p); + if (r < 0) + return log_debug_errno(r, "Failed to determine end of tail object: %m"); + + assert(p <= (uint64_t) f->last_stat.st_size); + + sz = ((uint64_t) f->last_stat.st_size) - p; + if (sz < MINIMUM_HOLE_SIZE) + return 0; + + if (fallocate(f->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, p, sz) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), /* Make recognizable */ + "Hole punching not supported by backing file system, skipping."); + + return log_debug_errno(errno, "Failed to punch hole at end of journal file %s: %m", f->path); + } + + return 0; +} +/* Walks the chain of entry arrays that starts at offset p and punches a hole over the unused item slots at the end of the last array of the chain, given that n_entries slots of the whole chain are in use. Returns 0 when there is nothing to do or the unused range is smaller than MINIMUM_HOLE_SIZE, -EBADMSG if the chain has fewer slots than n_entries, -EOPNOTSUPP if hole punching is not supported, or another negative error. */ +static int journal_file_entry_array_punch_hole(JournalFile *f, uint64_t p, uint64_t n_entries) { + Object o; + uint64_t offset, sz, n_items = 0, n_unused; + int r; + + if (n_entries == 0) + return 0; + + for (uint64_t q = p; q != 0; q = le64toh(o.entry_array.next_entry_array_offset)) { + r = journal_file_read_object_header(f, OBJECT_ENTRY_ARRAY, q, &o); + if (r < 0) + return r; +/* Add up the capacity of the chain; after the loop p is the offset of the last array and o holds its header. */ + n_items += journal_file_entry_array_n_items(f, &o); + p = q; + } + + if (p == 0) + return 0; + + if (n_entries > n_items) + return -EBADMSG; + + /* Amount of unused items in the final entry array. 
*/ + n_unused = n_items - n_entries; + + if (n_unused == 0) + return 0; + + offset = p + offsetof(Object, entry_array.items) + + (journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f); + sz = p + le64toh(o.object.size) - offset; + + if (sz < MINIMUM_HOLE_SIZE) + return 0; + + if (fallocate(f->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, sz) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), /* Make recognizable */ + "Hole punching not supported by backing file system, skipping."); + + return log_debug_errno(errno, "Failed to punch hole in entry array of %s: %m", f->path); + } + + return 0; +} +/* Punches holes into the unused tail of the global entry array chain and of the entry array chain of every data object reachable from the data hash table. Errors on the global chain and failures to read the hash table are returned. For the per-data chains only -EOPNOTSUPP is returned, other errors are ignored, and an unreadable data object ends the walk of its hash chain. */ +static int journal_file_punch_holes(JournalFile *f) { + HashItem items[PAYLOAD_BUFFER_SIZE / sizeof(HashItem)]; + uint64_t p, sz; + ssize_t n = SSIZE_MAX; + int r; + + r = journal_file_entry_array_punch_hole( + f, le64toh(f->header->entry_array_offset), le64toh(f->header->n_entries)); + if (r < 0) + return r; + + p = le64toh(f->header->data_hash_table_offset); + sz = le64toh(f->header->data_hash_table_size); + + for (uint64_t i = p; i < p + sz && n > 0; i += n) { + size_t m = MIN(sizeof(items), p + sz - i); + n = pread(f->fd, items, m, i); + if (n < 0) + return log_debug_errno(errno, "Failed to read hash table items: %m"); + + /* Let's ignore any partial hash items by rounding down to the nearest multiple of HashItem. 
*/ + n -= n % sizeof(HashItem); + + for (size_t j = 0; j < (size_t) n / sizeof(HashItem); j++) { + Object o; + + for (uint64_t q = le64toh(items[j].head_hash_offset); q != 0; + q = le64toh(o.data.next_hash_offset)) { + + r = journal_file_read_object_header(f, OBJECT_DATA, q, &o); + if (r < 0) { + log_debug_errno(r, "Invalid data object: %m, ignoring"); + break; + } + + if (le64toh(o.data.n_entries) == 0) + continue; +/* The chain is expected to hold n_entries - 1 slots; presumably one entry is referenced directly from the data object rather than from the chain. TODO confirm against the data object layout. */ + r = journal_file_entry_array_punch_hole( + f, le64toh(o.data.entry_array_offset), le64toh(o.data.n_entries) - 1); + if (r == -EOPNOTSUPP) + return -EOPNOTSUPP; + + /* Ignore other errors */ + } + } + } + + return 0; +} + +/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync(). + * As a result we use atomic operations on f->offline_state for inter-thread communications with + * journal_file_set_offline() and journal_file_set_online(). 
*/ +static void journal_file_set_offline_internal(JournalFile *f) { + int r; + + assert(f); + assert(f->fd >= 0); + assert(f->header); + + for (;;) { + switch (f->offline_state) { + case OFFLINE_CANCEL: { + OfflineState tmp_state = OFFLINE_CANCEL; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_DONE, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + return; + + case OFFLINE_AGAIN_FROM_SYNCING: { + OfflineState tmp_state = OFFLINE_AGAIN_FROM_SYNCING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_SYNCING, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + break; + + case OFFLINE_AGAIN_FROM_OFFLINING: { + OfflineState tmp_state = OFFLINE_AGAIN_FROM_OFFLINING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_SYNCING, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + break; +/* For files being archived, unused space is released on a best-effort basis before the first fsync(). */ + case OFFLINE_SYNCING: + if (f->archive) { + (void) journal_file_end_punch_hole(f); + (void) journal_file_punch_holes(f); + } + + (void) fsync(f->fd); + + { + OfflineState tmp_state = 
OFFLINE_SYNCING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_OFFLINING, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } +/* The switch from SYNCING to OFFLINING succeeded: record the final state in the header and flush once more. */ + f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE; + (void) fsync(f->fd); + + /* If we've archived the journal file, first try to re-enable COW on the file. If the + * FS_NOCOW_FL flag was never set or we successfully removed it, continue. If we fail + * to remove the flag on the archived file, rewrite the file without the NOCOW flag. + * We need this fallback because on some filesystems (BTRFS), the NOCOW flag cannot + * be removed after data has been written to a file. The only way to remove it is to + * copy all data to a new file without the NOCOW flag set. */ + + if (f->archive) { + r = chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL); + if (r >= 0) + continue; + + log_debug_errno(r, "Failed to re-enable copy-on-write for %s: %m, rewriting file", f->path); + + r = copy_file_atomic_full(FORMAT_PROC_FD_PATH(f->fd), f->path, f->mode, + 0, + FS_NOCOW_FL, + COPY_REPLACE | COPY_FSYNC | COPY_HOLES | COPY_ALL_XATTRS, + NULL, NULL); + if (r < 0) { + log_debug_errno(r, "Failed to rewrite %s: %m", f->path); + continue; + } + } + + break; + + case OFFLINE_OFFLINING: { + OfflineState tmp_state = OFFLINE_OFFLINING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_DONE, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + _fallthrough_; + case OFFLINE_DONE: + return; + + case OFFLINE_JOINED: + log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()"); + return; + } + } +} +/* Thread start routine used with pthread_create(): names the thread journal-offline and runs journal_file_set_offline_internal() on the JournalFile passed as arg. Always returns NULL. 
*/ +static void * journal_file_set_offline_thread(void *arg) { + JournalFile *f = arg; + + (void) pthread_setname_np(pthread_self(), "journal-offline"); + + journal_file_set_offline_internal(f); + + return NULL; +} + +/* Trigger a restart if the offline thread is mid-flight in a restartable state. Returns true if a restart is already pending or was requested successfully, false if the current state does not allow one. 
*/ +static bool journal_file_set_offline_try_restart(JournalFile *f) { + for (;;) { + switch (f->offline_state) { + case OFFLINE_AGAIN_FROM_SYNCING: + case OFFLINE_AGAIN_FROM_OFFLINING: + return true; + + case OFFLINE_CANCEL: { + OfflineState tmp_state = OFFLINE_CANCEL; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_AGAIN_FROM_SYNCING, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + return true; + + case OFFLINE_SYNCING: { + OfflineState tmp_state = OFFLINE_SYNCING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_AGAIN_FROM_SYNCING, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + return true; + + case OFFLINE_OFFLINING: { + OfflineState tmp_state = OFFLINE_OFFLINING; + if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_AGAIN_FROM_OFFLINING, + false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + continue; + } + return true; + + default: + return false; + } + } +} + +/* Sets a journal offline. + * + * If wait is false then an offline is dispatched in a separate thread for a + * subsequent journal_file_set_offline() or journal_file_set_online() of the + * same journal to synchronize with. + * + * If wait is true, then either an existing offline thread will be restarted + * and joined, or if none exists the offline is simply performed in this + * context without involving another thread. + * + * Returns 0 on success, -EPERM if the file is not writable, -EINVAL if it has + * no file descriptor or header, or a negative error from joining or starting + * the offline thread. + */ +int journal_file_set_offline(JournalFile *f, bool wait) { + int target_state; + bool restarted; + int r; + + assert(f); + + if (!journal_file_writable(f)) + return -EPERM; + + if (f->fd < 0 || !f->header) + return -EINVAL; + + target_state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE; + + /* An offlining journal is implicitly online and may modify f->header->state, + * we must also join any potentially lingering offline thread when already in + * the desired offline state. 
+ */ + if (!journal_file_is_offlining(f) && f->header->state == target_state) + return journal_file_set_offline_thread_join(f); + + /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */ + restarted = journal_file_set_offline_try_restart(f); + if ((restarted && wait) || !restarted) { + r = journal_file_set_offline_thread_join(f); + if (r < 0) + return r; + } + + if (restarted) + return 0; + + /* Initiate a new offline. */ + f->offline_state = OFFLINE_SYNCING; + + if (wait) { + /* Without using a thread if waiting. */ + journal_file_set_offline_internal(f); + + assert(f->offline_state == OFFLINE_DONE); + f->offline_state = OFFLINE_JOINED; + + } else { + sigset_t ss, saved_ss; + int k; + + assert_se(sigfillset(&ss) >= 0); + /* Don't block SIGBUS since the offlining thread accesses a memory mapped file. + * Asynchronous SIGBUS signals can safely be handled by either thread. */ + assert_se(sigdelset(&ss, SIGBUS) >= 0); + + r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss); + if (r > 0) + return -r; +/* The thread is created while the signals are blocked; the previous mask of the calling thread is restored right afterwards, whether or not creation succeeded. */ + r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f); + + k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL); + if (r > 0) { + f->offline_state = OFFLINE_JOINED; + return -r; + } + if (k > 0) + return -k; + } + + return 0; +} +/* True unless the offline state is OFFLINE_DONE or OFFLINE_JOINED. */ +bool journal_file_is_offlining(JournalFile *f) { + assert(f); + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED)) + return false; + + return true; +} +/* When built with HAVE_GCRYPT, appends a tag to journals that are sealed and writable; a failure is only logged. Without HAVE_GCRYPT the function only asserts f. 
*/ +void journal_file_write_final_tag(JournalFile *f) { + assert(f); +#if HAVE_GCRYPT + if (!JOURNAL_HEADER_SEALED(f->header) || !journal_file_writable(f)) + return; + + int r = journal_file_append_tag(f); + if (r < 0) + log_debug_errno(r, "Failed to append tag when closing journal: %m"); +#endif +} +/* Writes the final tag, runs journal_file_post_change() if the post-change timer is enabled, drops that timer, sets the file offline synchronously and closes it. NULL is accepted and returned. The result of journal_file_set_offline() is not checked. */ +JournalFile* journal_file_offline_close(JournalFile *f) { + if (!f) + return NULL; + + journal_file_write_final_tag(f); + + if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 
0) + journal_file_post_change(f); + sd_event_source_disable_unref(f->post_change_timer); + + journal_file_set_offline(f, true); + + return journal_file_close(f); +} + +JournalFile* journal_file_initiate_close(JournalFile *f, Set *deferred_closes) { + int r; + + assert(f); + + if (deferred_closes) { + r = set_put(deferred_closes, f); + if (r < 0) + log_debug_errno(r, "Failed to add file to deferred close set, closing immediately."); + else { + (void) journal_file_set_offline(f, false); + return NULL; + } + } + + return journal_file_offline_close(f); +} + +int journal_file_rotate( + JournalFile **f, + MMapCache *mmap_cache, + JournalFileFlags file_flags, + uint64_t compress_threshold_bytes, + Set *deferred_closes) { + + _cleanup_free_ char *path = NULL; + JournalFile *new_file = NULL; + int r; + + assert(f); + assert(*f); + + journal_file_write_final_tag(*f); + r = journal_file_archive(*f, &path); + if (r < 0) + return r; + + set_clear_with_destructor(deferred_closes, journal_file_offline_close); + + r = journal_file_open( + /* fd= */ -1, + path, + (*f)->open_flags, + file_flags, + (*f)->mode, + compress_threshold_bytes, + /* metrics= */ NULL, + mmap_cache, + /* template= */ *f, + &new_file); + + journal_file_initiate_close(*f, deferred_closes); + *f = new_file; + + return r; +} + +int journal_file_open_reliably( + const char *fname, + int open_flags, + JournalFileFlags file_flags, + mode_t mode, + uint64_t compress_threshold_bytes, + JournalMetrics *metrics, + MMapCache *mmap_cache, + JournalFile *template, + JournalFile **ret) { + + _cleanup_(journal_file_offline_closep) JournalFile *old_file = NULL; + int r; + + r = journal_file_open( + /* fd= */ -1, + fname, + open_flags, + file_flags, + mode, + compress_threshold_bytes, + metrics, + mmap_cache, + template, + ret); + if (!IN_SET(r, + -EBADMSG, /* Corrupted */ + -EADDRNOTAVAIL, /* Referenced object offset out of bounds */ + -ENODATA, /* Truncated */ + -EHOSTDOWN, /* Other machine */ + -EPROTONOSUPPORT, /* 
Incompatible feature */ + -EBUSY, /* Unclean shutdown */ + -ESHUTDOWN, /* Already archived */ + -EIO, /* IO error, including SIGBUS on mmap */ + -EIDRM)) /* File has been deleted */ + return r; + + if ((open_flags & O_ACCMODE) == O_RDONLY) + return r; + + if (!(open_flags & O_CREAT)) + return r; + + if (!endswith(fname, ".journal")) + return r; + + /* The file is corrupted. Rotate it away and try it again (but only once) */ + log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname); + + if (!template) { + /* The file is corrupted and no template is specified. Try opening it read-only as the + * template before rotating to inherit its sequence number and ID. */ + r = journal_file_open(-1, fname, + (open_flags & ~(O_ACCMODE|O_CREAT|O_EXCL)) | O_RDONLY, + file_flags, 0, compress_threshold_bytes, NULL, + mmap_cache, NULL, &old_file); + if (r < 0) + log_debug_errno(r, "Failed to continue sequence from file %s, ignoring: %m", fname); + else + template = old_file; + } + + r = journal_file_dispose(AT_FDCWD, fname); + if (r < 0) + return r; + + return journal_file_open(-1, fname, open_flags, file_flags, mode, compress_threshold_bytes, metrics, + mmap_cache, template, ret); +} diff --git a/src/shared/journal-file-util.h b/src/shared/journal-file-util.h new file mode 100644 index 0000000..f9426c4 --- /dev/null +++ b/src/shared/journal-file-util.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "journal-file.h" + +int journal_file_set_offline(JournalFile *f, bool wait); +bool journal_file_is_offlining(JournalFile *f); +void journal_file_write_final_tag(JournalFile *f); +JournalFile* journal_file_offline_close(JournalFile *f); +DEFINE_TRIVIAL_CLEANUP_FUNC(JournalFile*, journal_file_offline_close); + +int journal_file_open_reliably( + const char *fname, + int open_flags, + JournalFileFlags file_flags, + mode_t mode, + uint64_t compress_threshold_bytes, + JournalMetrics *metrics, + MMapCache 
*mmap_cache, + JournalFile *template, + JournalFile **ret); + +JournalFile* journal_file_initiate_close(JournalFile *f, Set *deferred_closes); +int journal_file_rotate( + JournalFile **f, + MMapCache *mmap_cache, + JournalFileFlags file_flags, + uint64_t compress_threshold_bytes, + Set *deferred_closes); diff --git a/src/shared/journal-importer.c b/src/shared/journal-importer.c new file mode 100644 index 0000000..83e9834 --- /dev/null +++ b/src/shared/journal-importer.c @@ -0,0 +1,482 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "io-util.h" +#include "journal-file.h" +#include "journal-importer.h" +#include "journal-util.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" +#include "unaligned.h" + +enum { + IMPORTER_STATE_LINE = 0, /* waiting to read, or reading line */ + IMPORTER_STATE_DATA_START, /* reading binary data header */ + IMPORTER_STATE_DATA, /* reading binary data */ + IMPORTER_STATE_DATA_FINISH, /* expecting newline */ + IMPORTER_STATE_EOF, /* done */ +}; + +void journal_importer_cleanup(JournalImporter *imp) { + if (imp->fd >= 0 && !imp->passive_fd) { + log_debug("Closing %s (fd=%d)", imp->name ?: "importer", imp->fd); + safe_close(imp->fd); + } + + free(imp->name); + free(imp->buf); + iovw_free_contents(&imp->iovw, false); +} + +static char* realloc_buffer(JournalImporter *imp, size_t size) { + char *b, *old = ASSERT_PTR(imp)->buf; + + b = GREEDY_REALLOC(imp->buf, size); + if (!b) + return NULL; + + iovw_rebase(&imp->iovw, old, imp->buf); + + return b; +} + +static int get_line(JournalImporter *imp, char **line, size_t *size) { + ssize_t n; + char *c = NULL; + + assert(imp); + assert(imp->state == IMPORTER_STATE_LINE); + assert(imp->offset <= imp->filled); + assert(imp->filled <= MALLOC_SIZEOF_SAFE(imp->buf)); + assert(imp->fd >= 0); + + for (;;) { + if (imp->buf) { + size_t start = 
MAX(imp->scanned, imp->offset); + + c = memchr(imp->buf + start, '\n', + imp->filled - start); + if (c) + break; + } + + imp->scanned = imp->filled; + if (imp->scanned >= DATA_SIZE_MAX) + return log_warning_errno(SYNTHETIC_ERRNO(ENOBUFS), + "Entry is bigger than %u bytes.", + DATA_SIZE_MAX); + + if (imp->passive_fd) + /* we have to wait for some data to come to us */ + return -EAGAIN; + + /* We know that imp->filled is at most DATA_SIZE_MAX, so if + we reallocate it, we'll increase the size at least a bit. */ + assert_cc(DATA_SIZE_MAX < ENTRY_SIZE_MAX); + if (MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled < LINE_CHUNK && + !realloc_buffer(imp, MIN(imp->filled + LINE_CHUNK, ENTRY_SIZE_MAX))) + return log_oom(); + + assert(imp->buf); + assert(MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled >= LINE_CHUNK || + MALLOC_SIZEOF_SAFE(imp->buf) >= ENTRY_SIZE_MAX); + + n = read(imp->fd, + imp->buf + imp->filled, + MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled); + if (n < 0) { + if (errno != EAGAIN) + log_error_errno(errno, "read(%d, ..., %zu): %m", + imp->fd, + MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled); + return -errno; + } else if (n == 0) + return 0; + + imp->filled += n; + } + + *line = imp->buf + imp->offset; + *size = c + 1 - imp->buf - imp->offset; + imp->offset += *size; + + return 1; +} + +static int fill_fixed_size(JournalImporter *imp, void **data, size_t size) { + + assert(imp); + assert(IN_SET(imp->state, IMPORTER_STATE_DATA_START, IMPORTER_STATE_DATA, IMPORTER_STATE_DATA_FINISH)); + assert(size <= DATA_SIZE_MAX); + assert(imp->offset <= imp->filled); + assert(imp->filled <= MALLOC_SIZEOF_SAFE(imp->buf)); + assert(imp->fd >= 0); + assert(data); + + while (imp->filled - imp->offset < size) { + int n; + + if (imp->passive_fd) + /* we have to wait for some data to come to us */ + return -EAGAIN; + + if (!realloc_buffer(imp, imp->offset + size)) + return log_oom(); + + n = read(imp->fd, imp->buf + imp->filled, + MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled); + if (n < 0) { + if 
(errno != EAGAIN) + log_error_errno(errno, "read(%d, ..., %zu): %m", imp->fd, + MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled); + return -errno; + } else if (n == 0) + return 0; + + imp->filled += n; + } + + *data = imp->buf + imp->offset; + imp->offset += size; + + return 1; +} + +static int get_data_size(JournalImporter *imp) { + int r; + void *data; + + assert(imp); + assert(imp->state == IMPORTER_STATE_DATA_START); + assert(imp->data_size == 0); + + r = fill_fixed_size(imp, &data, sizeof(uint64_t)); + if (r <= 0) + return r; + + imp->data_size = unaligned_read_le64(data); + if (imp->data_size > DATA_SIZE_MAX) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "Stream declares field with size %zu > DATA_SIZE_MAX = %u", + imp->data_size, DATA_SIZE_MAX); + if (imp->data_size == 0) + log_warning("Binary field with zero length"); + + return 1; +} + +static int get_data_data(JournalImporter *imp, void **data) { + int r; + + assert(imp); + assert(data); + assert(imp->state == IMPORTER_STATE_DATA); + + r = fill_fixed_size(imp, data, imp->data_size); + if (r <= 0) + return r; + + return 1; +} + +static int get_data_newline(JournalImporter *imp) { + int r; + char *data; + + assert(imp); + assert(imp->state == IMPORTER_STATE_DATA_FINISH); + + r = fill_fixed_size(imp, (void**) &data, 1); + if (r <= 0) + return r; + + assert(data); + if (*data != '\n') { + char buf[4]; + int l; + + l = cescape_char(*data, buf); + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "Expected newline, got '%.*s'", l, buf); + } + + return 1; +} + +static int process_special_field(JournalImporter *imp, char *line) { + const char *value; + char buf[CELLESCAPE_DEFAULT_LENGTH]; + int r; + + assert(line); + + if (STARTSWITH_SET(line, "__CURSOR=", "__SEQNUM=", "__SEQNUM_ID=")) + /* ignore __CURSOR=, __SEQNUM=, __SEQNUM_ID= which we cannot replicate */ + return 1; + + value = startswith(line, "__REALTIME_TIMESTAMP="); + if (value) { + uint64_t x; + + r = safe_atou64(value, &x); + if (r < 0) + return 
log_warning_errno(r, "Failed to parse __REALTIME_TIMESTAMP '%s': %m", + cellescape(buf, sizeof buf, value)); + else if (!VALID_REALTIME(x)) { + log_warning("__REALTIME_TIMESTAMP out of range, ignoring: %"PRIu64, x); + return -ERANGE; + } + + imp->ts.realtime = x; + return 1; + } + + value = startswith(line, "__MONOTONIC_TIMESTAMP="); + if (value) { + uint64_t x; + + r = safe_atou64(value, &x); + if (r < 0) + return log_warning_errno(r, "Failed to parse __MONOTONIC_TIMESTAMP '%s': %m", + cellescape(buf, sizeof buf, value)); + else if (!VALID_MONOTONIC(x)) { + log_warning("__MONOTONIC_TIMESTAMP out of range, ignoring: %"PRIu64, x); + return -ERANGE; + } + + imp->ts.monotonic = x; + return 1; + } + + /* Just a single underline, but it needs special treatment too. */ + value = startswith(line, "_BOOT_ID="); + if (value) { + r = sd_id128_from_string(value, &imp->boot_id); + if (r < 0) + return log_warning_errno(r, "Failed to parse _BOOT_ID '%s': %m", + cellescape(buf, sizeof buf, value)); + + /* store the field in the usual fashion too */ + return 0; + } + + value = startswith(line, "__"); + if (value) { + log_notice("Unknown dunder line __%s, ignoring.", cellescape(buf, sizeof buf, value)); + return 1; + } + + /* no dunder */ + return 0; +} + +int journal_importer_process_data(JournalImporter *imp) { + int r; + + switch (imp->state) { + case IMPORTER_STATE_LINE: { + char *line, *sep; + size_t n = 0; + + assert(imp->data_size == 0); + + r = get_line(imp, &line, &n); + if (r < 0) + return r; + if (r == 0) { + imp->state = IMPORTER_STATE_EOF; + return 0; + } + assert(n > 0); + assert(line[n-1] == '\n'); + + if (n == 1) { + log_trace("Received empty line, event is ready"); + return 1; + } + + /* MESSAGE=xxx\n + or + COREDUMP\n + LLLLLLLL0011223344...\n + */ + sep = memchr(line, '=', n); + if (sep) { + /* chomp newline */ + n--; + + if (!journal_field_valid(line, sep - line, true)) { + char buf[64], *t; + + t = strndupa_safe(line, sep - line); + log_debug("Ignoring invalid 
field: \"%s\"", + cellescape(buf, sizeof buf, t)); + + return 0; + } + + line[n] = '\0'; + r = process_special_field(imp, line); + if (r != 0) + return r < 0 ? r : 0; + + r = iovw_put(&imp->iovw, line, n); + if (r < 0) + return r; + } else { + if (!journal_field_valid(line, n - 1, true)) { + char buf[64], *t; + + t = strndupa_safe(line, n - 1); + log_debug("Ignoring invalid field: \"%s\"", + cellescape(buf, sizeof buf, t)); + + return 0; + } + + /* replace \n with = */ + line[n-1] = '='; + + imp->field_len = n; + imp->state = IMPORTER_STATE_DATA_START; + + /* we cannot put the field in iovec until we have all data */ + } + + log_trace("Received: %.*s (%s)", (int) n, line, sep ? "text" : "binary"); + + return 0; /* continue */ + } + + case IMPORTER_STATE_DATA_START: + assert(imp->data_size == 0); + + r = get_data_size(imp); + // log_debug("get_data_size() -> %d", r); + if (r < 0) + return r; + if (r == 0) { + imp->state = IMPORTER_STATE_EOF; + return 0; + } + + imp->state = imp->data_size > 0 ? 
+ IMPORTER_STATE_DATA : IMPORTER_STATE_DATA_FINISH; + + return 0; /* continue */ + + case IMPORTER_STATE_DATA: { + void *data; + char *field; + + assert(imp->data_size > 0); + + r = get_data_data(imp, &data); + // log_debug("get_data_data() -> %d", r); + if (r < 0) + return r; + if (r == 0) { + imp->state = IMPORTER_STATE_EOF; + return 0; + } + + assert(data); + + /* The buffer holds the field name (imp->field_len bytes, ending in '='), then the 64-bit size, then + * the payload. Slide the name forward over the size so that name and payload become contiguous. */ + field = (char*) data - sizeof(uint64_t) - imp->field_len; + memmove(field + sizeof(uint64_t), field, imp->field_len); + + r = iovw_put(&imp->iovw, field + sizeof(uint64_t), imp->field_len + imp->data_size); + if (r < 0) + return r; + + imp->state = IMPORTER_STATE_DATA_FINISH; + + return 0; /* continue */ + } + + case IMPORTER_STATE_DATA_FINISH: + r = get_data_newline(imp); + // log_debug("get_data_newline() -> %d", r); + if (r < 0) + return r; + if (r == 0) { + imp->state = IMPORTER_STATE_EOF; + return 0; + } + + imp->data_size = 0; + imp->state = IMPORTER_STATE_LINE; + + return 0; /* continue */ + default: + assert_not_reached(); + } +} + +/* Appends 'size' bytes from 'data' to the importer's buffer, growing it first. Returns 0 on success and a negative + * error if the buffer cannot be enlarged. Must not be called once the importer has reached IMPORTER_STATE_EOF. */ +int journal_importer_push_data(JournalImporter *imp, const char *data, size_t size) { + assert(imp); + assert(imp->state != IMPORTER_STATE_EOF); + + if (!realloc_buffer(imp, imp->filled + size)) + return log_error_errno(ENOMEM, + "Failed to store received data of size %zu " + "(in addition to existing %zu bytes with %zu filled): %m", + size, MALLOC_SIZEOF_SAFE(imp->buf), imp->filled); + + memcpy(imp->buf + imp->filled, data, size); + imp->filled += size; + + return 0; +} + +void journal_importer_drop_iovw(JournalImporter *imp) { + size_t remain, target; + + /* This function drops the processed data along with the iovw that points at it */ + + iovw_free_contents(&imp->iovw, false); + + /* possibly reset buffer position */ + remain = imp->filled - imp->offset; + + if (remain == 0) /* no brainer */ + imp->offset = imp->scanned = imp->filled = 0; + else if (imp->offset > MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled && + imp->offset > remain) { + memcpy(imp->buf, imp->buf + 
imp->offset, remain); + imp->offset = imp->scanned = 0; + imp->filled = remain; + } + + target = MALLOC_SIZEOF_SAFE(imp->buf); + while (target > 16 * LINE_CHUNK && imp->filled < target / 2) + target /= 2; + if (target < MALLOC_SIZEOF_SAFE(imp->buf)) { + char *tmp; + size_t old_size; + + old_size = MALLOC_SIZEOF_SAFE(imp->buf); + + tmp = realloc(imp->buf, target); + if (!tmp) + log_warning("Failed to reallocate buffer to (smaller) size %zu", + target); + else { + log_debug("Reallocated buffer from %zu to %zu bytes", + old_size, target); + imp->buf = tmp; + } + } +} + +bool journal_importer_eof(const JournalImporter *imp) { + return imp->state == IMPORTER_STATE_EOF; +} diff --git a/src/shared/journal-importer.h b/src/shared/journal-importer.h new file mode 100644 index 0000000..d84dcc4 --- /dev/null +++ b/src/shared/journal-importer.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include +#include +#include + +#include "sd-id128.h" + +#include "io-util.h" +#include "iovec-wrapper.h" +#include "time-util.h" + +/* Make sure not to make this smaller than the maximum coredump size. 
+ * See JOURNAL_SIZE_MAX in coredump.c */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#define ENTRY_SIZE_MAX (1024*1024*770u) +#define DATA_SIZE_MAX (1024*1024*768u) +#else +#define ENTRY_SIZE_MAX (1024*1024*13u) +#define DATA_SIZE_MAX (1024*1024*11u) +#endif +#define LINE_CHUNK 8*1024u + +/* The maximum number of fields in an entry */ +#define ENTRY_FIELD_COUNT_MAX 1024u + +typedef struct JournalImporter { + int fd; + bool passive_fd; + char *name; + + char *buf; + size_t offset; /* offset to the beginning of live data in the buffer */ + size_t scanned; /* number of bytes since the beginning of data without a newline */ + size_t filled; /* total number of bytes in the buffer */ + + size_t field_len; /* used for binary fields: the field name length */ + size_t data_size; /* and the size of the binary data chunk being processed */ + + struct iovec_wrapper iovw; + + int state; + dual_timestamp ts; + sd_id128_t boot_id; +} JournalImporter; + +#define JOURNAL_IMPORTER_INIT(_fd) { .fd = (_fd), .iovw = {} } +#define JOURNAL_IMPORTER_MAKE(_fd) (JournalImporter) JOURNAL_IMPORTER_INIT(_fd) + +void journal_importer_cleanup(JournalImporter *); +int journal_importer_process_data(JournalImporter *); +int journal_importer_push_data(JournalImporter *, const char *data, size_t size); +void journal_importer_drop_iovw(JournalImporter *); +bool journal_importer_eof(const JournalImporter *); + +static inline size_t journal_importer_bytes_remaining(const JournalImporter *imp) { + return imp->filled; +} diff --git a/src/shared/journal-util.c b/src/shared/journal-util.c new file mode 100644 index 0000000..d73d7c4 --- /dev/null +++ b/src/shared/journal-util.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "acl-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "log.h" +#include "strv.h" 
+#include "user-util.h" + +static int access_check_var_log_journal(sd_journal *j, bool want_other_users) { + int r; + + assert(j); + + /* If we are root, we should have access, don't warn. */ + if (getuid() == 0) + return 0; + + /* If we are in the 'systemd-journal' group, we should have + * access too. */ + r = in_group("systemd-journal"); + if (r < 0) + return log_error_errno(r, "Failed to check if we are in the 'systemd-journal' group: %m"); + if (r > 0) + return 0; + +#if HAVE_ACL + _cleanup_strv_free_ char **g = NULL; + const char* dir; + + if (laccess("/run/log/journal", F_OK) >= 0) + dir = "/run/log/journal"; + else + dir = "/var/log/journal"; + + /* If we are in any of the groups listed in the journal ACLs, + * then all is good, too. Let's enumerate all groups from the + * default ACL of the directory, which generally should allow + * access to most journal files too. */ + r = acl_search_groups(dir, &g); + if (r < 0) + return log_error_errno(r, "Failed to search journal ACL: %m"); + if (r > 0) + return 0; + + /* Print a pretty list, if there were ACLs set. */ + if (!strv_isempty(g)) { + _cleanup_free_ char *s = NULL; + + /* There are groups in the ACL, let's list them */ + r = strv_extend(&g, "systemd-journal"); + if (r < 0) + return log_oom(); + + strv_sort(g); + strv_uniq(g); + + s = strv_join(g, "', '"); + if (!s) + return log_oom(); + + log_notice("Hint: You are currently not seeing messages from %s.\n" + " Users in groups '%s' can see all messages.\n" + " Pass -q to turn off this notice.", + want_other_users ? "other users and the system" : "the system", + s); + return 1; + } +#endif + + /* If no ACLs were found, print a short version of the message. */ + log_notice("Hint: You are currently not seeing messages from %s.\n" + " Users in the 'systemd-journal' group can see all messages. Pass -q to\n" + " turn off this notice.", + want_other_users ? 
"other users and the system" : "the system"); + + return 1; +} + +int journal_access_blocked(sd_journal *j) { + return hashmap_contains(j->errors, INT_TO_PTR(-EACCES)); +} + +int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_users) { + void *code; + char *path; + int r = 0; + + assert(j); + + if (hashmap_isempty(j->errors)) { + if (ordered_hashmap_isempty(j->files) && !quiet) + log_notice("No journal files were found."); + + return 0; + } + + if (journal_access_blocked(j)) { + if (!quiet) + (void) access_check_var_log_journal(j, want_other_users); + + if (ordered_hashmap_isempty(j->files)) + r = log_error_errno(EACCES, "No journal files were opened due to insufficient permissions."); + } + + HASHMAP_FOREACH_KEY(path, code, j->errors) { + int err; + + err = abs(PTR_TO_INT(code)); + + switch (err) { + case EACCES: + continue; + + case ENODATA: + log_warning_errno(err, "Journal file %s is truncated, ignoring file.", path); + break; + + case EPROTONOSUPPORT: + log_warning_errno(err, "Journal file %1$s uses an unsupported feature, ignoring file.\n" + "Use SYSTEMD_LOG_LEVEL=debug journalctl --file=%1$s to see the details.", + path); + break; + + case EBADMSG: + log_warning_errno(err, "Journal file %s corrupted, ignoring file.", path); + break; + + case ETOOMANYREFS: + log_warning_errno(err, "Too many journal files (limit is at %u) in scope, ignoring file '%s'.", JOURNAL_FILES_MAX, path); + break; + + default: + log_warning_errno(err, "An error was encountered while opening journal file or directory %s, ignoring file: %m", path); + break; + } + } + + return r; +} + +int journal_open_machine(sd_journal **ret, const char *machine) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + _cleanup_close_ int machine_fd = -EBADF; + int fd, r; + + assert(ret); 
+ assert(machine); + + if (geteuid() != 0) + /* The file descriptor returned by OpenMachineRootDirectory() will be owned by users/groups of + * the container, thus we need root privileges to override them. */ + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Using the --machine= switch requires root privileges."); + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); + + r = bus_call_method(bus, bus_machine_mgr, "OpenMachineRootDirectory", &error, &reply, "s", machine); + if (r < 0) + return log_error_errno(r, "Failed to open root directory of machine '%s': %s", + machine, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "h", &fd); + if (r < 0) + return bus_log_parse_error(r); + + machine_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (machine_fd < 0) + return log_error_errno(errno, "Failed to duplicate file descriptor: %m"); + + r = sd_journal_open_directory_fd(&j, machine_fd, SD_JOURNAL_OS_ROOT | SD_JOURNAL_TAKE_DIRECTORY_FD); + if (r < 0) + return log_error_errno(r, "Failed to open journal in machine '%s': %m", machine); + + TAKE_FD(machine_fd); + *ret = TAKE_PTR(j); + return 0; +} diff --git a/src/shared/journal-util.h b/src/shared/journal-util.h new file mode 100644 index 0000000..afad249 --- /dev/null +++ b/src/shared/journal-util.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-journal.h" + +int journal_access_blocked(sd_journal *j); +int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_users); +int journal_open_machine(sd_journal **ret, const char *machine); diff --git a/src/shared/json-internal.h b/src/shared/json-internal.h new file mode 100644 index 0000000..a94befa --- /dev/null +++ b/src/shared/json-internal.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include "json.h" + +/* This header should include all prototypes only the JSON parser itself and + 
* its tests need access to. Normal code consuming the JSON parser should not + * interface with this. */ + +typedef union JsonValue { + /* Encodes a simple value. This structure is generally 8 bytes wide (as double is 64-bit). */ + bool boolean; + double real; + int64_t integer; + uint64_t unsig; +} JsonValue; + +/* Let's protect us against accidental structure size changes on our most relevant arch */ +#ifdef __x86_64__ +assert_cc(sizeof(JsonValue) == 8U); +#endif + +#define JSON_VALUE_NULL ((JsonValue) {}) + +/* We use fake JsonVariant objects for some special values, in order to avoid memory allocations for them. Note that + * effectively this means that there are multiple ways to encode the same objects: via these magic values or as + * properly allocated JsonVariant. We convert between both on-the-fly as necessary. */ +enum +{ + _JSON_VARIANT_MAGIC_TRUE = 1, +#define JSON_VARIANT_MAGIC_TRUE ((JsonVariant*) _JSON_VARIANT_MAGIC_TRUE) + _JSON_VARIANT_MAGIC_FALSE, +#define JSON_VARIANT_MAGIC_FALSE ((JsonVariant*) _JSON_VARIANT_MAGIC_FALSE) + _JSON_VARIANT_MAGIC_NULL, +#define JSON_VARIANT_MAGIC_NULL ((JsonVariant*) _JSON_VARIANT_MAGIC_NULL) + _JSON_VARIANT_MAGIC_ZERO_INTEGER, +#define JSON_VARIANT_MAGIC_ZERO_INTEGER ((JsonVariant*) _JSON_VARIANT_MAGIC_ZERO_INTEGER) + _JSON_VARIANT_MAGIC_ZERO_UNSIGNED, +#define JSON_VARIANT_MAGIC_ZERO_UNSIGNED ((JsonVariant*) _JSON_VARIANT_MAGIC_ZERO_UNSIGNED) + _JSON_VARIANT_MAGIC_ZERO_REAL, +#define JSON_VARIANT_MAGIC_ZERO_REAL ((JsonVariant*) _JSON_VARIANT_MAGIC_ZERO_REAL) + _JSON_VARIANT_MAGIC_EMPTY_STRING, +#define JSON_VARIANT_MAGIC_EMPTY_STRING ((JsonVariant*) _JSON_VARIANT_MAGIC_EMPTY_STRING) + _JSON_VARIANT_MAGIC_EMPTY_ARRAY, +#define JSON_VARIANT_MAGIC_EMPTY_ARRAY ((JsonVariant*) _JSON_VARIANT_MAGIC_EMPTY_ARRAY) + _JSON_VARIANT_MAGIC_EMPTY_OBJECT, +#define JSON_VARIANT_MAGIC_EMPTY_OBJECT ((JsonVariant*) _JSON_VARIANT_MAGIC_EMPTY_OBJECT) + __JSON_VARIANT_MAGIC_MAX +#define _JSON_VARIANT_MAGIC_MAX ((JsonVariant*) 
__JSON_VARIANT_MAGIC_MAX) +}; + +/* This is only safe as long as we don't define more than 4K magic pointers, i.e. the page size of the simplest + * architectures we support. That's because we rely on the fact that malloc() will never allocate from the first memory + * page, as it is a faulting page for catching NULL pointer dereferences. */ +assert_cc((unsigned) __JSON_VARIANT_MAGIC_MAX < 4096U); + +enum { /* JSON tokens */ + JSON_TOKEN_END, + JSON_TOKEN_COLON, + JSON_TOKEN_COMMA, + JSON_TOKEN_OBJECT_OPEN, + JSON_TOKEN_OBJECT_CLOSE, + JSON_TOKEN_ARRAY_OPEN, + JSON_TOKEN_ARRAY_CLOSE, + JSON_TOKEN_STRING, + JSON_TOKEN_REAL, + JSON_TOKEN_INTEGER, + JSON_TOKEN_UNSIGNED, + JSON_TOKEN_BOOLEAN, + JSON_TOKEN_NULL, + _JSON_TOKEN_MAX, + _JSON_TOKEN_INVALID = -EINVAL, +}; + +int json_tokenize(const char **p, char **ret_string, JsonValue *ret_value, unsigned *ret_line, unsigned *ret_column, void **state, unsigned *line, unsigned *column); diff --git a/src/shared/json.c b/src/shared/json.c new file mode 100644 index 0000000..06c9e85 --- /dev/null +++ b/src/shared/json.c @@ -0,0 +1,5132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "float.h" +#include "hexdecoct.h" +#include "json-internal.h" +#include "json.h" +#include "macro.h" +#include "math-util.h" +#include "memory-util.h" +#include "memstream-util.h" +#include "set.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "user-util.h" +#include "utf8.h" + +/* Refuse putting together variants with a larger depth than 2K by default (as a protection against overflowing stacks + * if code processes JSON objects recursively). Note that we store the depth in a uint16_t, hence make sure this + * remains under 2^16. 
+ * + * The value first was 16k, but it was discovered to be too high on llvm/x86-64. See also: + * https://github.com/systemd/systemd/issues/10738 + * + * The value then was 4k, but it was discovered to be too high on s390x/aarch64. See also: + * https://github.com/systemd/systemd/issues/14396 */ + +#define DEPTH_MAX (2U*1024U) +assert_cc(DEPTH_MAX <= UINT16_MAX); + +typedef struct JsonSource { + /* When we parse from a file or similar, encodes the filename, to indicate the source of a json variant */ + unsigned n_ref; + unsigned max_line; + unsigned max_column; + char name[]; +} JsonSource; + +/* On x86-64 this whole structure should have a size of 5 * 64 bit = 40 bytes (enforced by an assert_cc() below) */ +struct JsonVariant { + union { + /* We either maintain a reference counter for this variant itself, or we are embedded into an + * array/object, in which case only that surrounding object is ref-counted. (If 'is_embedded' is + * false, see below.) */ + unsigned n_ref; + + /* If this JsonVariant is part of an array/object, then this field points to the surrounding + * JSON_VARIANT_ARRAY/JSON_VARIANT_OBJECT object. (If 'is_embedded' is true, see below.) */ + JsonVariant *parent; + }; + + /* If this was parsed from some file or buffer, this stores where from, as well as the source line/column */ + JsonSource *source; + unsigned line, column; + + /* The current 'depth' of the JsonVariant, i.e. how many levels of member variants this has */ + uint16_t depth; + + JsonVariantType type:8; + + /* A marker whether this variant is embedded into an array/object or not. If true, the 'parent' pointer above + * is valid. If false, the 'n_ref' field above is valid instead. */ + bool is_embedded:1; + + /* In some conditions (for example, if this object is part of an array of strings or objects), we don't store + * any data inline, but instead simply reference an external object and act as surrogate of it. In that case + * this bool is set, and the external object is referenced through the .reference field below. 
*/ + bool is_reference:1; + + /* While comparing two arrays, we use this for marking what we already have seen */ + bool is_marked:1; + + /* Erase from memory when freeing */ + bool sensitive:1; + + /* If this is an object the fields are strictly ordered by name */ + bool sorted:1; + + /* If in addition to this object all objects referenced by it are also ordered strictly by name */ + bool normalized:1; + + union { + /* For simple types we store the value in-line. */ + JsonValue value; + + /* For objects and arrays we store the number of elements immediately following */ + size_t n_elements; + + /* If is_reference as indicated above is set, this is where the reference object is actually stored. */ + JsonVariant *reference; + + /* Strings are placed immediately after the structure. Note that when this is a JsonVariant + * embedded into an array we might encode strings up to INLINE_STRING_MAX characters + * directly inside the element, while longer strings are stored as references. When this + * object is not embedded into an array, but stand-alone, we allocate the right size for the + * whole structure, i.e. the array might be much larger than INLINE_STRING_MAX. */ + DECLARE_FLEX_ARRAY(char, string); + }; +}; + +/* Inside string arrays we have a series of JsonVariant structures one after the other. In this case, strings longer + * than INLINE_STRING_MAX are stored as references, and all shorter ones inline. (This means — on x86-64 — strings up + * to 7 chars are stored within the array elements, and all others in separate allocations) */ +#define INLINE_STRING_MAX (sizeof(JsonVariant) - offsetof(JsonVariant, string) - 1U) + +/* Let's make sure this structure isn't increased in size accidentally. This check is only for our most relevant arch + * (x86-64). 
*/ +#if defined(__x86_64__) && __SIZEOF_POINTER__ == 8 +assert_cc(sizeof(JsonVariant) == 40U); +assert_cc(INLINE_STRING_MAX == 7U); +#endif + +/* Allocates a JsonSource with one reference that carries a private copy of 'name'. Returns NULL if the + * allocation fails. */ +static JsonSource* json_source_new(const char *name) { + JsonSource *s; + + assert(name); + + /* Header plus the flexible 'name' array, including room for the trailing NUL */ + s = malloc(offsetof(JsonSource, name) + strlen(name) + 1); + if (!s) + return NULL; + + *s = (JsonSource) { + .n_ref = 1, + }; + strcpy(s->name, name); + + return s; +} + +DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(JsonSource, json_source, mfree); + +/* Two sources are equal if they are the same object (or both NULL), or if their names match. */ +static bool json_source_equal(JsonSource *a, JsonSource *b) { + if (a == b) + return true; + + if (!a || !b) + return false; + + return streq(a->name, b->name); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(JsonSource*, json_source_unref); + +/* There are four kinds of JsonVariant* pointers: + * + * 1. NULL + * 2. A 'regular' one, i.e. pointing to malloc() memory + * 3. A 'magic' one, i.e. one of the special JSON_VARIANT_MAGIC_XYZ values, that encode a few very basic values directly in the pointer. + * 4. A 'const string' one, i.e. a pointer to a const string. + * + * The four kinds of pointers can be discerned like this: + * + * Detecting #1 is easy, just compare with NULL. Detecting #3 is similarly easy: all magic pointers are below + * _JSON_VARIANT_MAGIC_MAX (which is pretty low, within the first memory page, which is special on Linux and other + * OSes, as it is a faulting page). In order to discern #2 and #4 we check the lowest bit. If it's off it's #2, + * otherwise #4. This makes use of the fact that malloc() will return "maximum aligned" memory, which definitely + * means the pointer is even. This means we can use the uneven pointers to reference static strings, as long as we + * make sure that all static strings used like this are aligned to 2 (or higher), and that we mask the bit on + * access. The JSON_VARIANT_STRING_CONST() macro encodes strings as JsonVariant* pointers, with the bit set. 
*/ + +static bool json_variant_is_magic(const JsonVariant *v) { + if (!v) + return false; + + return v < _JSON_VARIANT_MAGIC_MAX; +} + +static bool json_variant_is_const_string(const JsonVariant *v) { + + if (v < _JSON_VARIANT_MAGIC_MAX) + return false; + + /* A proper JsonVariant is aligned to whatever malloc() aligns things too, which is definitely not uneven. We + * hence use all uneven pointers as indicators for const strings. */ + + return (((uintptr_t) v) & 1) != 0; +} + +static bool json_variant_is_regular(const JsonVariant *v) { + + if (v < _JSON_VARIANT_MAGIC_MAX) + return false; + + return (((uintptr_t) v) & 1) == 0; +} + +static JsonVariant *json_variant_dereference(JsonVariant *v) { + + /* Recursively dereference variants that are references to other variants */ + + if (!v) + return NULL; + + if (!json_variant_is_regular(v)) + return v; + + if (!v->is_reference) + return v; + + return json_variant_dereference(v->reference); +} + +static uint16_t json_variant_depth(JsonVariant *v) { + + v = json_variant_dereference(v); + if (!v) + return 0; + + if (!json_variant_is_regular(v)) + return 0; + + return v->depth; +} + +static JsonVariant *json_variant_formalize(JsonVariant *v) { + + /* Converts json variant pointers to their normalized form, i.e. fully dereferenced and wherever + * possible converted to the "magic" version if there is one */ + + if (!v) + return NULL; + + v = json_variant_dereference(v); + + switch (json_variant_type(v)) { + + case JSON_VARIANT_BOOLEAN: + return json_variant_boolean(v) ? JSON_VARIANT_MAGIC_TRUE : JSON_VARIANT_MAGIC_FALSE; + + case JSON_VARIANT_NULL: + return JSON_VARIANT_MAGIC_NULL; + + case JSON_VARIANT_INTEGER: + return json_variant_integer(v) == 0 ? JSON_VARIANT_MAGIC_ZERO_INTEGER : v; + + case JSON_VARIANT_UNSIGNED: + return json_variant_unsigned(v) == 0 ? JSON_VARIANT_MAGIC_ZERO_UNSIGNED : v; + + case JSON_VARIANT_REAL: + return iszero_safe(json_variant_real(v)) ? 
JSON_VARIANT_MAGIC_ZERO_REAL : v; + + case JSON_VARIANT_STRING: + return isempty(json_variant_string(v)) ? JSON_VARIANT_MAGIC_EMPTY_STRING : v; + + case JSON_VARIANT_ARRAY: + return json_variant_elements(v) == 0 ? JSON_VARIANT_MAGIC_EMPTY_ARRAY : v; + + case JSON_VARIANT_OBJECT: + return json_variant_elements(v) == 0 ? JSON_VARIANT_MAGIC_EMPTY_OBJECT : v; + + default: + return v; + } +} + +static JsonVariant *json_variant_conservative_formalize(JsonVariant *v) { + + /* Much like json_variant_formalize(), but won't simplify if the variant has a source/line location + * attached to it, in order not to lose context */ + + if (!v) + return NULL; + + if (!json_variant_is_regular(v)) + return v; + + if (v->source || v->line > 0 || v->column > 0) + return v; + + return json_variant_formalize(v); +} + +static int json_variant_new(JsonVariant **ret, JsonVariantType type, size_t space) { + JsonVariant *v; + + assert_return(ret, -EINVAL); + + v = malloc0(MAX(sizeof(JsonVariant), + offsetof(JsonVariant, value) + space)); + if (!v) + return -ENOMEM; + + v->n_ref = 1; + v->type = type; + + *ret = v; + return 0; +} + +int json_variant_new_integer(JsonVariant **ret, int64_t i) { + JsonVariant *v; + int r; + + assert_return(ret, -EINVAL); + + if (i == 0) { + *ret = JSON_VARIANT_MAGIC_ZERO_INTEGER; + return 0; + } + + r = json_variant_new(&v, JSON_VARIANT_INTEGER, sizeof(i)); + if (r < 0) + return r; + + v->value.integer = i; + *ret = v; + + return 0; +} + +int json_variant_new_unsigned(JsonVariant **ret, uint64_t u) { + JsonVariant *v; + int r; + + assert_return(ret, -EINVAL); + if (u == 0) { + *ret = JSON_VARIANT_MAGIC_ZERO_UNSIGNED; + return 0; + } + + r = json_variant_new(&v, JSON_VARIANT_UNSIGNED, sizeof(u)); + if (r < 0) + return r; + + v->value.unsig = u; + *ret = v; + + return 0; +} + +int json_variant_new_real(JsonVariant **ret, double d) { + JsonVariant *v; + int r; + + assert_return(ret, -EINVAL); + + r = fpclassify(d); + switch (r) { + case FP_NAN: + case FP_INFINITE: + 
/* JSON doesn't know NaN, +Infinity or -Infinity. Let's silently convert to 'null'. */ + *ret = JSON_VARIANT_MAGIC_NULL; + return 0; + + case FP_ZERO: + *ret = JSON_VARIANT_MAGIC_ZERO_REAL; + return 0; + } + + r = json_variant_new(&v, JSON_VARIANT_REAL, sizeof(d)); + if (r < 0) + return r; + + v->value.real = d; + *ret = v; + + return 0; +} + +int json_variant_new_boolean(JsonVariant **ret, bool b) { + assert_return(ret, -EINVAL); + + if (b) + *ret = JSON_VARIANT_MAGIC_TRUE; + else + *ret = JSON_VARIANT_MAGIC_FALSE; + + return 0; +} + +int json_variant_new_null(JsonVariant **ret) { + assert_return(ret, -EINVAL); + + *ret = JSON_VARIANT_MAGIC_NULL; + return 0; +} + +int json_variant_new_stringn(JsonVariant **ret, const char *s, size_t n) { + JsonVariant *v; + int r; + + assert_return(ret, -EINVAL); + if (!s) { + assert_return(IN_SET(n, 0, SIZE_MAX), -EINVAL); + return json_variant_new_null(ret); + } + if (n == SIZE_MAX) /* determine length automatically */ + n = strlen(s); + else if (memchr(s, 0, n)) /* don't allow embedded NUL, as we can't express that in JSON */ + return -EINVAL; + if (n == 0) { + *ret = JSON_VARIANT_MAGIC_EMPTY_STRING; + return 0; + } + + if (!utf8_is_valid_n(s, n)) /* JSON strings must be valid UTF-8 */ + return -EUCLEAN; + + r = json_variant_new(&v, JSON_VARIANT_STRING, n + 1); + if (r < 0) + return r; + + memcpy(v->string, s, n); + v->string[n] = 0; + + *ret = v; + return 0; +} + +int json_variant_new_base64(JsonVariant **ret, const void *p, size_t n) { + _cleanup_free_ char *s = NULL; + ssize_t k; + + assert_return(ret, -EINVAL); + assert_return(n == 0 || p, -EINVAL); + + k = base64mem(p, n, &s); + if (k < 0) + return k; + + return json_variant_new_stringn(ret, s, k); +} + +int json_variant_new_base32hex(JsonVariant **ret, const void *p, size_t n) { + _cleanup_free_ char *s = NULL; + + assert_return(ret, -EINVAL); + assert_return(n == 0 || p, -EINVAL); + + s = base32hexmem(p, n, false); + if (!s) + return -ENOMEM; + + return 
json_variant_new_string(ret, s); +} + +int json_variant_new_hex(JsonVariant **ret, const void *p, size_t n) { + _cleanup_free_ char *s = NULL; + + assert_return(ret, -EINVAL); + assert_return(n == 0 || p, -EINVAL); + + s = hexmem(p, n); + if (!s) + return -ENOMEM; + + return json_variant_new_stringn(ret, s, n*2); +} + +int json_variant_new_octescape(JsonVariant **ret, const void *p, size_t n) { + _cleanup_free_ char *s = NULL; + + assert_return(ret, -EINVAL); + assert_return(n == 0 || p, -EINVAL); + + s = octescape(p, n); + if (!s) + return -ENOMEM; + + return json_variant_new_string(ret, s); +} + +int json_variant_new_id128(JsonVariant **ret, sd_id128_t id) { + return json_variant_new_string(ret, SD_ID128_TO_STRING(id)); +} + +int json_variant_new_uuid(JsonVariant **ret, sd_id128_t id) { + return json_variant_new_string(ret, SD_ID128_TO_UUID_STRING(id)); +} + +static void json_variant_set(JsonVariant *a, JsonVariant *b) { + assert(a); + + b = json_variant_dereference(b); + if (!b) { + a->type = JSON_VARIANT_NULL; + return; + } + + a->type = json_variant_type(b); + switch (a->type) { + + case JSON_VARIANT_INTEGER: + a->value.integer = json_variant_integer(b); + break; + + case JSON_VARIANT_UNSIGNED: + a->value.unsig = json_variant_unsigned(b); + break; + + case JSON_VARIANT_REAL: + a->value.real = json_variant_real(b); + break; + + case JSON_VARIANT_BOOLEAN: + a->value.boolean = json_variant_boolean(b); + break; + + case JSON_VARIANT_STRING: { + const char *s; + + assert_se(s = json_variant_string(b)); + + /* Short strings we can store inline */ + if (strnlen(s, INLINE_STRING_MAX+1) <= INLINE_STRING_MAX) { + strcpy(a->string, s); + break; + } + + /* For longer strings, use a reference… */ + _fallthrough_; + } + + case JSON_VARIANT_ARRAY: + case JSON_VARIANT_OBJECT: + a->is_reference = true; + a->reference = json_variant_ref(json_variant_conservative_formalize(b)); + break; + + case JSON_VARIANT_NULL: + break; + + default: + assert_not_reached(); + } +} + +static 
void json_variant_copy_source(JsonVariant *v, JsonVariant *from) { + assert(v); + + if (!json_variant_is_regular(from)) + return; + + v->line = from->line; + v->column = from->column; + v->source = json_source_ref(from->source); +} + +static int _json_variant_array_put_element(JsonVariant *array, JsonVariant *element) { + assert(array); + JsonVariant *w = array + 1 + array->n_elements; + + uint16_t d = json_variant_depth(element); + if (d >= DEPTH_MAX) /* Refuse too deep nesting */ + return -ELNRNG; + if (d >= array->depth) + array->depth = d + 1; + array->n_elements ++; + + *w = (JsonVariant) { + .is_embedded = true, + .parent = array, + }; + + json_variant_set(w, element); + json_variant_copy_source(w, element); + + if (!json_variant_is_normalized(element)) + array->normalized = false; + + return 0; +} + +int json_variant_new_array(JsonVariant **ret, JsonVariant **array, size_t n) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + assert_return(ret, -EINVAL); + if (n == 0) { + *ret = JSON_VARIANT_MAGIC_EMPTY_ARRAY; + return 0; + } + assert_return(array, -EINVAL); + + v = new(JsonVariant, n + 1); + if (!v) + return -ENOMEM; + + *v = (JsonVariant) { + .n_ref = 1, + .type = JSON_VARIANT_ARRAY, + .normalized = true, + }; + + while (v->n_elements < n) { + r = _json_variant_array_put_element(v, array[v->n_elements]); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(v); + return 0; +} + +int json_variant_new_array_bytes(JsonVariant **ret, const void *p, size_t n) { + assert_return(ret, -EINVAL); + if (n == 0) { + *ret = JSON_VARIANT_MAGIC_EMPTY_ARRAY; + return 0; + } + assert_return(p, -EINVAL); + + JsonVariant *v = new(JsonVariant, n + 1); + if (!v) + return -ENOMEM; + + *v = (JsonVariant) { + .n_ref = 1, + .type = JSON_VARIANT_ARRAY, + .n_elements = n, + .depth = 1, + }; + + for (size_t i = 0; i < n; i++) { + JsonVariant *w = v + 1 + i; + + *w = (JsonVariant) { + .is_embedded = true, + .parent = v, + .type = JSON_VARIANT_UNSIGNED, + .value.unsig = 
((const uint8_t*) p)[i], + }; + } + + v->normalized = true; + + *ret = v; + return 0; +} + +int json_variant_new_array_strv(JsonVariant **ret, char **l) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + size_t n; + int r; + + assert(ret); + + n = strv_length(l); + if (n == 0) { + *ret = JSON_VARIANT_MAGIC_EMPTY_ARRAY; + return 0; + } + + v = new(JsonVariant, n + 1); + if (!v) + return -ENOMEM; + + *v = (JsonVariant) { + .n_ref = 1, + .type = JSON_VARIANT_ARRAY, + .depth = 1, + }; + + for (v->n_elements = 0; v->n_elements < n; v->n_elements++) { + JsonVariant *w = v + 1 + v->n_elements; + size_t k; + + *w = (JsonVariant) { + .is_embedded = true, + .parent = v, + .type = JSON_VARIANT_STRING, + }; + + k = strlen(l[v->n_elements]); + + if (k > INLINE_STRING_MAX) { + /* If string is too long, store it as reference. */ + + r = json_variant_new_string(&w->reference, l[v->n_elements]); + if (r < 0) + return r; + + w->is_reference = true; + } else { + if (!utf8_is_valid_n(l[v->n_elements], k)) /* JSON strings must be valid UTF-8 */ + return -EUCLEAN; + + memcpy(w->string, l[v->n_elements], k+1); + } + } + + v->normalized = true; + + *ret = TAKE_PTR(v); + return 0; +} + +int json_variant_new_object(JsonVariant **ret, JsonVariant **array, size_t n) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + const char *prev = NULL; + bool sorted = true, normalized = true; + + assert_return(ret, -EINVAL); + if (n == 0) { + *ret = JSON_VARIANT_MAGIC_EMPTY_OBJECT; + return 0; + } + assert_return(array, -EINVAL); + assert_return(n % 2 == 0, -EINVAL); + + v = new(JsonVariant, n + 1); + if (!v) + return -ENOMEM; + + *v = (JsonVariant) { + .n_ref = 1, + .type = JSON_VARIANT_OBJECT, + }; + + for (v->n_elements = 0; v->n_elements < n; v->n_elements++) { + JsonVariant *w = v + 1 + v->n_elements, + *c = array[v->n_elements]; + uint16_t d; + + if ((v->n_elements & 1) == 0) { + const char *k; + + if (!json_variant_is_string(c)) + return -EINVAL; /* Every second one needs to be a 
string, as it is the key name */ + + assert_se(k = json_variant_string(c)); + + if (prev && strcmp(k, prev) <= 0) + sorted = normalized = false; + + prev = k; + } else if (!json_variant_is_normalized(c)) + normalized = false; + + d = json_variant_depth(c); + if (d >= DEPTH_MAX) /* Refuse too deep nesting */ + return -ELNRNG; + if (d >= v->depth) + v->depth = d + 1; + + *w = (JsonVariant) { + .is_embedded = true, + .parent = v, + }; + + json_variant_set(w, c); + json_variant_copy_source(w, c); + } + + v->normalized = normalized; + v->sorted = sorted; + + *ret = TAKE_PTR(v); + return 0; +} + +static size_t json_variant_size(JsonVariant* v) { + if (!json_variant_is_regular(v)) + return 0; + + if (v->is_reference) + return offsetof(JsonVariant, reference) + sizeof(JsonVariant*); + + switch (v->type) { + + case JSON_VARIANT_STRING: + return offsetof(JsonVariant, string) + strlen(v->string) + 1; + + case JSON_VARIANT_REAL: + return offsetof(JsonVariant, value) + sizeof(double); + + case JSON_VARIANT_UNSIGNED: + return offsetof(JsonVariant, value) + sizeof(uint64_t); + + case JSON_VARIANT_INTEGER: + return offsetof(JsonVariant, value) + sizeof(int64_t); + + case JSON_VARIANT_BOOLEAN: + return offsetof(JsonVariant, value) + sizeof(bool); + + case JSON_VARIANT_ARRAY: + case JSON_VARIANT_OBJECT: + return offsetof(JsonVariant, n_elements) + sizeof(size_t); + + case JSON_VARIANT_NULL: + return offsetof(JsonVariant, value); + + default: + assert_not_reached(); + } +} + +static void json_variant_free_inner(JsonVariant *v, bool force_sensitive) { + bool sensitive; + + assert(v); + + if (!json_variant_is_regular(v)) + return; + + json_source_unref(v->source); + + sensitive = v->sensitive || force_sensitive; + + if (v->is_reference) { + if (sensitive) + json_variant_sensitive(v->reference); + + json_variant_unref(v->reference); + return; + } + + if (IN_SET(v->type, JSON_VARIANT_ARRAY, JSON_VARIANT_OBJECT)) + for (size_t i = 0; i < v->n_elements; i++) + json_variant_free_inner(v + 1 
+ i, sensitive); + + if (sensitive) + explicit_bzero_safe(v, json_variant_size(v)); +} + +static unsigned json_variant_n_ref(const JsonVariant *v) { + /* Return the number of references to v. + * 0 => NULL or not a regular object or embedded. + * >0 => number of references + */ + + if (!v || !json_variant_is_regular(v) || v->is_embedded) + return 0; + + assert(v->n_ref > 0); + return v->n_ref; +} + +JsonVariant *json_variant_ref(JsonVariant *v) { + if (!v) + return NULL; + if (!json_variant_is_regular(v)) + return v; + + if (v->is_embedded) + json_variant_ref(v->parent); /* ref the compounding variant instead */ + else { + assert(v->n_ref > 0); + v->n_ref++; + } + + return v; +} + +JsonVariant *json_variant_unref(JsonVariant *v) { + if (!v) + return NULL; + if (!json_variant_is_regular(v)) + return NULL; + + if (v->is_embedded) + json_variant_unref(v->parent); + else { + assert(v->n_ref > 0); + v->n_ref--; + + if (v->n_ref == 0) { + json_variant_free_inner(v, false); + free(v); + } + } + + return NULL; +} + +void json_variant_unref_many(JsonVariant **array, size_t n) { + assert(array || n == 0); + + for (size_t i = 0; i < n; i++) + json_variant_unref(array[i]); + + free(array); +} + +const char *json_variant_string(JsonVariant *v) { + if (!v) + return NULL; + if (v == JSON_VARIANT_MAGIC_EMPTY_STRING) + return ""; + if (json_variant_is_magic(v)) + goto mismatch; + if (json_variant_is_const_string(v)) { + uintptr_t p = (uintptr_t) v; + + assert((p & 1) != 0); + return (const char*) (p ^ 1U); + } + + if (v->is_reference) + return json_variant_string(v->reference); + if (v->type != JSON_VARIANT_STRING) + goto mismatch; + + return v->string; + +mismatch: + log_debug("Non-string JSON variant requested as string, returning NULL."); + return NULL; +} + +bool json_variant_boolean(JsonVariant *v) { + if (!v) + goto mismatch; + if (v == JSON_VARIANT_MAGIC_TRUE) + return true; + if (v == JSON_VARIANT_MAGIC_FALSE) + return false; + if (!json_variant_is_regular(v)) + goto 
mismatch; + if (v->type != JSON_VARIANT_BOOLEAN) + goto mismatch; + if (v->is_reference) + return json_variant_boolean(v->reference); + + return v->value.boolean; + +mismatch: + log_debug("Non-boolean JSON variant requested as boolean, returning false."); + return false; +} + +int64_t json_variant_integer(JsonVariant *v) { + if (!v) + goto mismatch; + if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER || + v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED || + v == JSON_VARIANT_MAGIC_ZERO_REAL) + return 0; + if (!json_variant_is_regular(v)) + goto mismatch; + if (v->is_reference) + return json_variant_integer(v->reference); + + switch (v->type) { + + case JSON_VARIANT_INTEGER: + return v->value.integer; + + case JSON_VARIANT_UNSIGNED: + if (v->value.unsig <= INT64_MAX) + return (int64_t) v->value.unsig; + + log_debug("Unsigned integer %" PRIu64 " requested as signed integer and out of range, returning 0.", v->value.unsig); + return 0; + + case JSON_VARIANT_REAL: { + int64_t converted; + + converted = (int64_t) v->value.real; + + if (fp_equal((double) converted, v->value.real)) + return converted; + + log_debug("Real %g requested as integer, and cannot be converted losslessly, returning 0.", v->value.real); + return 0; + } + + default: + break; + } + +mismatch: + log_debug("Non-integer JSON variant requested as integer, returning 0."); + return 0; +} + +uint64_t json_variant_unsigned(JsonVariant *v) { + if (!v) + goto mismatch; + if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER || + v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED || + v == JSON_VARIANT_MAGIC_ZERO_REAL) + return 0; + if (!json_variant_is_regular(v)) + goto mismatch; + if (v->is_reference) + return json_variant_integer(v->reference); + + switch (v->type) { + + case JSON_VARIANT_INTEGER: + if (v->value.integer >= 0) + return (uint64_t) v->value.integer; + + log_debug("Signed integer %" PRIi64 " requested as unsigned integer and out of range, returning 0.", v->value.integer); + return 0; + + case JSON_VARIANT_UNSIGNED: + return 
v->value.unsig; + + case JSON_VARIANT_REAL: { + uint64_t converted; + + converted = (uint64_t) v->value.real; + + if (fp_equal((double) converted, v->value.real)) + return converted; + + log_debug("Real %g requested as unsigned integer, and cannot be converted losslessly, returning 0.", v->value.real); + return 0; + } + + default: + break; + } + +mismatch: + log_debug("Non-integer JSON variant requested as unsigned, returning 0."); + return 0; +} + +double json_variant_real(JsonVariant *v) { + if (!v) + return 0.0; + if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER || + v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED || + v == JSON_VARIANT_MAGIC_ZERO_REAL) + return 0.0; + if (!json_variant_is_regular(v)) + goto mismatch; + if (v->is_reference) + return json_variant_real(v->reference); + + switch (v->type) { + + case JSON_VARIANT_REAL: + return v->value.real; + + case JSON_VARIANT_INTEGER: { + double converted = (double) v->value.integer; + + if ((int64_t) converted == v->value.integer) + return converted; + + log_debug("Signed integer %" PRIi64 " requested as real, and cannot be converted losslessly, returning 0.", v->value.integer); + return 0.0; + } + + case JSON_VARIANT_UNSIGNED: { + double converted = (double) v->value.unsig; + + if ((uint64_t) converted == v->value.unsig) + return converted; + + log_debug("Unsigned integer %" PRIu64 " requested as real, and cannot be converted losslessly, returning 0.", v->value.unsig); + return 0.0; + } + + default: + break; + } + +mismatch: + log_debug("Non-integer JSON variant requested as integer, returning 0."); + return 0.0; +} + +bool json_variant_is_negative(JsonVariant *v) { + if (!v) + goto mismatch; + if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER || + v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED || + v == JSON_VARIANT_MAGIC_ZERO_REAL) + return false; + if (!json_variant_is_regular(v)) + goto mismatch; + if (v->is_reference) + return json_variant_is_negative(v->reference); + + /* This function is useful as checking whether numbers are negative is 
pretty complex since we have three types
+         * of numbers. And some JSON code (OCI for example) uses negative numbers to mark "not defined" numeric
+         * values. */
+
+        switch (v->type) {
+
+        case JSON_VARIANT_REAL:
+                return v->value.real < 0;
+
+        case JSON_VARIANT_INTEGER:
+                return v->value.integer < 0;
+
+        case JSON_VARIANT_UNSIGNED:
+                return false;
+
+        default:
+                break;
+        }
+
+mismatch:
+        log_debug("Non-integer JSON variant tested for negativity, returning false.");
+        return false;
+}
+
+/* Tells whether a variant carries no object content: true for a NULL pointer, a JSON null, or an object
+ * with zero elements; false for everything else (including non-empty objects and other types). */
+bool json_variant_is_blank_object(JsonVariant *v) {
+        /* Returns true if the specified object is null or empty */
+        return !v ||
+                json_variant_is_null(v) ||
+                (json_variant_is_object(v) && json_variant_elements(v) == 0);
+}
+
+/* Array counterpart of json_variant_is_blank_object(): true for a NULL pointer, a JSON null, or an array
+ * with zero elements. */
+bool json_variant_is_blank_array(JsonVariant *v) {
+        return !v ||
+                json_variant_is_null(v) ||
+                (json_variant_is_array(v) && json_variant_elements(v) == 0);
+}
+
+/* Returns the type of a variant. NULL yields _JSON_VARIANT_TYPE_INVALID. Const strings and the magic
+ * constants are not backed by a JsonVariant structure, hence they are mapped to their type explicitly
+ * here; only for all remaining variants the 'type' field is read. */
+JsonVariantType json_variant_type(JsonVariant *v) {
+
+        if (!v)
+                return _JSON_VARIANT_TYPE_INVALID;
+
+        if (json_variant_is_const_string(v))
+                return JSON_VARIANT_STRING;
+
+        if (v == JSON_VARIANT_MAGIC_TRUE || v == JSON_VARIANT_MAGIC_FALSE)
+                return JSON_VARIANT_BOOLEAN;
+
+        if (v == JSON_VARIANT_MAGIC_NULL)
+                return JSON_VARIANT_NULL;
+
+        if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER)
+                return JSON_VARIANT_INTEGER;
+
+        if (v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED)
+                return JSON_VARIANT_UNSIGNED;
+
+        if (v == JSON_VARIANT_MAGIC_ZERO_REAL)
+                return JSON_VARIANT_REAL;
+
+        if (v == JSON_VARIANT_MAGIC_EMPTY_STRING)
+                return JSON_VARIANT_STRING;
+
+        if (v == JSON_VARIANT_MAGIC_EMPTY_ARRAY)
+                return JSON_VARIANT_ARRAY;
+
+        if (v == JSON_VARIANT_MAGIC_EMPTY_OBJECT)
+                return JSON_VARIANT_OBJECT;
+
+        return v->type;
+}
+
+/* Checks whether a variant can be accessed as the specified type. This is more lenient than comparing
+ * json_variant_type() for equality: numeric variants also match the other numeric types whenever their
+ * value can be converted without loss, and JSON_VARIANT_NUMBER matches any of the three numeric types.
+ * Returns false for NULL. */
+_function_no_sanitize_float_cast_overflow_
+bool json_variant_has_type(JsonVariant *v, JsonVariantType type) {
+        JsonVariantType rt;
+
+        /* Note: we turn off ubsan float cast overflow detection for this function, since it would complain
+         * about our float casts but we do them explicitly to detect conversion errors. */
+
+        v = json_variant_dereference(v);
+        if (!v)
+                return false;
+
+        rt = json_variant_type(v);
+        if (rt == type)
+                return true;
+
+        /* If it's a const string, then it only can be a string, and if it is not, it's not */
+        if (json_variant_is_const_string(v))
+                return false;
+
+        /* All three magic zeroes qualify as integer, unsigned and as real */
+        if ((v == JSON_VARIANT_MAGIC_ZERO_INTEGER || v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED || v == JSON_VARIANT_MAGIC_ZERO_REAL) &&
+            IN_SET(type, JSON_VARIANT_INTEGER, JSON_VARIANT_UNSIGNED, JSON_VARIANT_REAL, JSON_VARIANT_NUMBER))
+                return true;
+
+        /* All other magic variant types are only equal to themselves */
+        if (json_variant_is_magic(v))
+                return false;
+
+        /* Handle the "number" pseudo type */
+        if (type == JSON_VARIANT_NUMBER)
+                return IN_SET(rt, JSON_VARIANT_INTEGER, JSON_VARIANT_UNSIGNED, JSON_VARIANT_REAL);
+
+        /* Integer conversions are OK in many cases */
+        if (rt == JSON_VARIANT_INTEGER && type == JSON_VARIANT_UNSIGNED)
+                return v->value.integer >= 0;
+        if (rt == JSON_VARIANT_UNSIGNED && type == JSON_VARIANT_INTEGER)
+                return v->value.unsig <= INT64_MAX;
+
+        /* Any integer that can be converted losslessly to a real and back may also be considered a real */
+        if (rt == JSON_VARIANT_INTEGER && type == JSON_VARIANT_REAL)
+                return (int64_t) (double) v->value.integer == v->value.integer;
+        if (rt == JSON_VARIANT_UNSIGNED && type == JSON_VARIANT_REAL)
+                return (uint64_t) (double) v->value.unsig == v->value.unsig;
+
+        /* Any real that can be converted losslessly to an integer and back may also be considered an integer */
+        if (rt == JSON_VARIANT_REAL && type == JSON_VARIANT_INTEGER)
+                return fp_equal((double) (int64_t) v->value.real, v->value.real);
+        if (rt == JSON_VARIANT_REAL && type == JSON_VARIANT_UNSIGNED)
+                return fp_equal((double) (uint64_t) v->value.real, v->value.real);
+
+        return false;
+}
+
+/* Returns the number of elements of an array or object variant, following references. For objects keys
+ * and values are counted individually. NULL and the magic empty array/object yield 0; any other type is
+ * logged at debug level and yields 0 too. */
+size_t json_variant_elements(JsonVariant *v) {
+        if (!v)
+                return 0;
+        if (v ==
JSON_VARIANT_MAGIC_EMPTY_ARRAY || + v == JSON_VARIANT_MAGIC_EMPTY_OBJECT) + return 0; + if (!json_variant_is_regular(v)) + goto mismatch; + if (!IN_SET(v->type, JSON_VARIANT_ARRAY, JSON_VARIANT_OBJECT)) + goto mismatch; + if (v->is_reference) + return json_variant_elements(v->reference); + + return v->n_elements; + +mismatch: + log_debug("Number of elements in non-array/non-object JSON variant requested, returning 0."); + return 0; +} + +JsonVariant *json_variant_by_index(JsonVariant *v, size_t idx) { + if (!v) + return NULL; + if (v == JSON_VARIANT_MAGIC_EMPTY_ARRAY || + v == JSON_VARIANT_MAGIC_EMPTY_OBJECT) + return NULL; + if (!json_variant_is_regular(v)) + goto mismatch; + if (!IN_SET(v->type, JSON_VARIANT_ARRAY, JSON_VARIANT_OBJECT)) + goto mismatch; + if (v->is_reference) + return json_variant_by_index(v->reference, idx); + if (idx >= v->n_elements) + return NULL; + + return json_variant_conservative_formalize(v + 1 + idx); + +mismatch: + log_debug("Element in non-array/non-object JSON variant requested by index, returning NULL."); + return NULL; +} + +JsonVariant *json_variant_by_key_full(JsonVariant *v, const char *key, JsonVariant **ret_key) { + if (!v) + goto not_found; + if (!key) + goto not_found; + if (v == JSON_VARIANT_MAGIC_EMPTY_OBJECT) + goto not_found; + if (!json_variant_is_regular(v)) + goto mismatch; + if (v->type != JSON_VARIANT_OBJECT) + goto mismatch; + if (v->is_reference) + return json_variant_by_key(v->reference, key); + + if (v->sorted) { + size_t a = 0, b = v->n_elements/2; + + /* If the variant is sorted we can use bisection to find the entry we need in O(log(n)) time */ + + while (b > a) { + JsonVariant *p; + const char *f; + size_t i; + int c; + + i = (a + b) / 2; + p = json_variant_dereference(v + 1 + i*2); + + assert_se(f = json_variant_string(p)); + + c = strcmp(key, f); + if (c == 0) { + if (ret_key) + *ret_key = json_variant_conservative_formalize(v + 1 + i*2); + + return json_variant_conservative_formalize(v + 1 + i*2 + 1); + } 
else if (c < 0) + b = i; + else + a = i + 1; + } + + goto not_found; + } + + /* The variant is not sorted, hence search for the field linearly */ + for (size_t i = 0; i < v->n_elements; i += 2) { + JsonVariant *p; + + p = json_variant_dereference(v + 1 + i); + + if (!json_variant_has_type(p, JSON_VARIANT_STRING)) + continue; + + if (streq(json_variant_string(p), key)) { + + if (ret_key) + *ret_key = json_variant_conservative_formalize(v + 1 + i); + + return json_variant_conservative_formalize(v + 1 + i + 1); + } + } + +not_found: + if (ret_key) + *ret_key = NULL; + + return NULL; + +mismatch: + log_debug("Element in non-object JSON variant requested by key, returning NULL."); + if (ret_key) + *ret_key = NULL; + + return NULL; +} + +JsonVariant *json_variant_by_key(JsonVariant *v, const char *key) { + return json_variant_by_key_full(v, key, NULL); +} + +bool json_variant_equal(JsonVariant *a, JsonVariant *b) { + JsonVariantType t; + + a = json_variant_formalize(a); + b = json_variant_formalize(b); + + if (a == b) + return true; + + t = json_variant_type(a); + if (!json_variant_has_type(b, t)) + return false; + + switch (t) { + + case JSON_VARIANT_STRING: + return streq(json_variant_string(a), json_variant_string(b)); + + case JSON_VARIANT_INTEGER: + return json_variant_integer(a) == json_variant_integer(b); + + case JSON_VARIANT_UNSIGNED: + return json_variant_unsigned(a) == json_variant_unsigned(b); + + case JSON_VARIANT_REAL: + return fp_equal(json_variant_real(a), json_variant_real(b)); + + case JSON_VARIANT_BOOLEAN: + return json_variant_boolean(a) == json_variant_boolean(b); + + case JSON_VARIANT_NULL: + return true; + + case JSON_VARIANT_ARRAY: { + size_t n = json_variant_elements(a); + if (n != json_variant_elements(b)) + return false; + + for (size_t i = 0; i < n; i++) + if (!json_variant_equal(json_variant_by_index(a, i), json_variant_by_index(b, i))) + return false; + + return true; + } + + case JSON_VARIANT_OBJECT: { + size_t n = json_variant_elements(a); 
+ if (n != json_variant_elements(b)) + return false; + + /* Iterate through all keys in 'a' */ + for (size_t i = 0; i < n; i += 2) { + bool found = false; + + /* Match them against all keys in 'b' */ + for (size_t j = 0; j < n; j += 2) { + JsonVariant *key_b; + + key_b = json_variant_by_index(b, j); + + /* During the first iteration unmark everything */ + if (i == 0) + key_b->is_marked = false; + else if (key_b->is_marked) /* In later iterations if we already marked something, don't bother with it again */ + continue; + + if (found) + continue; + + if (json_variant_equal(json_variant_by_index(a, i), key_b) && + json_variant_equal(json_variant_by_index(a, i+1), json_variant_by_index(b, j+1))) { + /* Key and values match! */ + key_b->is_marked = found = true; + + /* In the first iteration we continue the inner loop since we want to mark + * everything, otherwise exit the loop quickly after we found what we were + * looking for. */ + if (i != 0) + break; + } + } + + if (!found) + return false; + } + + return true; + } + + default: + assert_not_reached(); + } +} + +void json_variant_sensitive(JsonVariant *v) { + assert(v); + + /* Marks a variant as "sensitive", so that it is erased from memory when it is destroyed. This is a + * one-way operation: as soon as it is marked this way it remains marked this way until it's + * destroyed. A magic variant is never sensitive though, even when asked, since it's too + * basic. Similar, const string variant are never sensitive either, after all they are included in + * the source code as they are, which is not suitable for inclusion of secrets. + * + * Note that this flag has a recursive effect: when we destroy an object or array we'll propagate the + * flag to all contained variants. And if those are then destroyed this is propagated further down, + * and so on. 
*/
+
+        v = json_variant_formalize(v);
+        if (!json_variant_is_regular(v))
+                return;
+
+        v->sensitive = true;
+}
+
+/* Returns true if the variant has been marked sensitive. Variants that are not regular (magic
+ * constants, const strings) are never sensitive. */
+bool json_variant_is_sensitive(JsonVariant *v) {
+        v = json_variant_formalize(v);
+        if (!json_variant_is_regular(v))
+                return false;
+
+        return v->sensitive;
+}
+
+/* Marks 'to' as sensitive if 'from' is. Never clears the flag on 'to'. */
+static void json_variant_propagate_sensitive(JsonVariant *from, JsonVariant *to) {
+        if (json_variant_is_sensitive(from))
+                json_variant_sensitive(to);
+}
+
+/* Reports the source location recorded for a variant. Each of the three return parameters is optional.
+ * For variants that are not regular, and for variants without source information, NULL/0 is reported.
+ * Returns 0, or -EINVAL if 'v' is NULL. */
+int json_variant_get_source(JsonVariant *v, const char **ret_source, unsigned *ret_line, unsigned *ret_column) {
+        assert_return(v, -EINVAL);
+
+        if (ret_source)
+                *ret_source = json_variant_is_regular(v) && v->source ? v->source->name : NULL;
+
+        if (ret_line)
+                *ret_line = json_variant_is_regular(v) ? v->line : 0;
+
+        if (ret_column)
+                *ret_column = json_variant_is_regular(v) ? v->column : 0;
+
+        return 0;
+}
+
+/* Writes the "[source:line:column] " prefix of a variant to 'f', or, if 'whitespace' is true, exactly
+ * as many spaces as that prefix would occupy, so that continuation lines stay aligned.
+ *
+ * Does nothing unless both JSON_FORMAT_SOURCE and JSON_FORMAT_PRETTY are set, the variant is regular and
+ * some source information is recorded for it. Always returns 0. */
+static int print_source(FILE *f, JsonVariant *v, JsonFormatFlags flags, bool whitespace) {
+        size_t w, k;
+
+        if (!FLAGS_SET(flags, JSON_FORMAT_SOURCE|JSON_FORMAT_PRETTY))
+                return 0;
+
+        if (!json_variant_is_regular(v))
+                return 0;
+
+        if (!v->source && v->line == 0 && v->column == 0)
+                return 0;
+
+        /* The max width we need to format the line numbers for this source file */
+        w = (v->source && v->source->max_line > 0) ?
+                DECIMAL_STR_WIDTH(v->source->max_line) :
+                DECIMAL_STR_MAX(unsigned)-1;
+        k = (v->source && v->source->max_column > 0) ?
+                DECIMAL_STR_WIDTH(v->source->max_column) :
+                DECIMAL_STR_MAX(unsigned) -1;
+
+        if (whitespace) {
+                /* Each summand corresponds to one output step of the branch below, in the same order.
+                 * The conditions of the two branches must be kept in sync. */
+                size_t n = 1 + (v->source ? strlen(v->source->name) : 0) +
+                        ((v->source && (v->line > 0 || v->column > 0)) ? 1 : 0) +
+                        (v->line > 0 ? w : 0) +
+                        (((v->source || v->line > 0) && v->column > 0) ? 1 : 0) +
+                        (v->column > 0 ? k : 0) +
+                        2;
+
+                for (size_t i = 0; i < n; i++)
+                        fputc(' ', f);
+        } else {
+                fputc('[', f);
+
+                if (v->source)
+                        fputs(v->source->name, f);
+                if (v->source && (v->line > 0 || v->column > 0))
+                        fputc(':', f);
+                if (v->line > 0)
+                        fprintf(f, "%*u", (int) w, v->line);
+                /* Separator in front of the column: only needed if a column follows and something
+                 * precedes it. This matches the width reserved in the whitespace branch above. */
+                if ((v->source || v->line > 0) && v->column > 0)
+                        fputc(':', f);
+                if (v->column > 0)
+                        fprintf(f, "%*u", (int) k, v->column);
+
+                fputc(']', f);
+                fputc(' ', f);
+        }
+
+        return 0;
+}
+
+/* Writes 'q' to 'f' as a double-quoted JSON string literal. Quote, backslash and the control characters
+ * that have a short escape are written in their two-character form, the remaining characters below
+ * 0x20 as \uXXXX, everything else verbatim. If JSON_FORMAT_COLOR is set the text between the quotes is
+ * colored. */
+static void json_format_string(FILE *f, const char *q, JsonFormatFlags flags) {
+        assert(q);
+
+        fputc('"', f);
+
+        if (flags & JSON_FORMAT_COLOR)
+                fputs(ansi_green(), f);
+
+        for (; *q; q++)
+                switch (*q) {
+                case '"':
+                        fputs("\\\"", f);
+                        break;
+
+                case '\\':
+                        fputs("\\\\", f);
+                        break;
+
+                case '\b':
+                        fputs("\\b", f);
+                        break;
+
+                case '\f':
+                        fputs("\\f", f);
+                        break;
+
+                case '\n':
+                        fputs("\\n", f);
+                        break;
+
+                case '\r':
+                        fputs("\\r", f);
+                        break;
+
+                case '\t':
+                        fputs("\\t", f);
+                        break;
+
+                default:
+                        /* The signed cast keeps bytes with the high bit set out of the escaping path */
+                        if ((signed char) *q >= 0 && *q < ' ')
+                                fprintf(f, "\\u%04x", (unsigned) *q);
+                        else
+                                fputc(*q, f);
+                        break;
+                }
+
+        if (flags & JSON_FORMAT_COLOR)
+                fputs(ANSI_NORMAL, f);
+
+        fputc('"', f);
+}
+
+/* Recursively serializes 'v' to 'f'. 'prefix' is the indentation to put in front of continuation lines
+ * in pretty mode and may be NULL. Returns 0 on success, negative errno-style error otherwise. */
+static int json_format(FILE *f, JsonVariant *v, JsonFormatFlags flags, const char *prefix) {
+        int r;
+
+        assert(f);
+        assert(v);
+
+        switch (json_variant_type(v)) {
+
+        case JSON_VARIANT_REAL: {
+                locale_t loc, old_loc;
+
+                /* Format the number under the "C" numeric locale, regardless of the caller's locale */
+                loc = newlocale(LC_NUMERIC_MASK, "C", (locale_t) 0);
+                if (loc == (locale_t) 0)
+                        return -errno;
+
+                if (flags & JSON_FORMAT_COLOR)
+                        fputs(ansi_highlight_blue(), f);
+
+                old_loc = uselocale(loc);
+                fprintf(f, "%.*e", DECIMAL_DIG, json_variant_real(v));
+                uselocale(old_loc);
+
+                if (flags & JSON_FORMAT_COLOR)
+                        fputs(ANSI_NORMAL, f);
+
+                freelocale(loc);
+                break;
+        }
+
+        case JSON_VARIANT_INTEGER:
+                if (flags & JSON_FORMAT_COLOR)
+                        fputs(ansi_highlight_blue(), f);
+
+                fprintf(f, "%" PRIdMAX, json_variant_integer(v));
+
+                if (flags & JSON_FORMAT_COLOR)
+                        fputs(ANSI_NORMAL, f);
+                break;
+ + case JSON_VARIANT_UNSIGNED: + if (flags & JSON_FORMAT_COLOR) + fputs(ansi_highlight_blue(), f); + + fprintf(f, "%" PRIuMAX, json_variant_unsigned(v)); + + if (flags & JSON_FORMAT_COLOR) + fputs(ANSI_NORMAL, f); + break; + + case JSON_VARIANT_BOOLEAN: + + if (flags & JSON_FORMAT_COLOR) + fputs(ANSI_HIGHLIGHT, f); + + if (json_variant_boolean(v)) + fputs("true", f); + else + fputs("false", f); + + if (flags & JSON_FORMAT_COLOR) + fputs(ANSI_NORMAL, f); + + break; + + case JSON_VARIANT_NULL: + if (flags & JSON_FORMAT_COLOR) + fputs(ANSI_HIGHLIGHT, f); + + fputs("null", f); + + if (flags & JSON_FORMAT_COLOR) + fputs(ANSI_NORMAL, f); + break; + + case JSON_VARIANT_STRING: + json_format_string(f, json_variant_string(v), flags); + break; + + case JSON_VARIANT_ARRAY: { + size_t n = json_variant_elements(v); + if (n == 0) + fputs("[]", f); + else { + _cleanup_free_ char *joined = NULL; + const char *prefix2; + + if (flags & JSON_FORMAT_PRETTY) { + joined = strjoin(strempty(prefix), "\t"); + if (!joined) + return -ENOMEM; + + prefix2 = joined; + fputs("[\n", f); + } else { + prefix2 = strempty(prefix); + fputc('[', f); + } + + for (size_t i = 0; i < n; i++) { + JsonVariant *e; + + assert_se(e = json_variant_by_index(v, i)); + + if (i > 0) { + if (flags & JSON_FORMAT_PRETTY) + fputs(",\n", f); + else + fputc(',', f); + } + + if (flags & JSON_FORMAT_PRETTY) { + print_source(f, e, flags, false); + fputs(prefix2, f); + } + + r = json_format(f, e, flags, prefix2); + if (r < 0) + return r; + } + + if (flags & JSON_FORMAT_PRETTY) { + fputc('\n', f); + print_source(f, v, flags, true); + fputs(strempty(prefix), f); + } + + fputc(']', f); + } + break; + } + + case JSON_VARIANT_OBJECT: { + size_t n = json_variant_elements(v); + if (n == 0) + fputs("{}", f); + else { + _cleanup_free_ char *joined = NULL; + const char *prefix2; + + if (flags & JSON_FORMAT_PRETTY) { + joined = strjoin(strempty(prefix), "\t"); + if (!joined) + return -ENOMEM; + + prefix2 = joined; + fputs("{\n", f); + 
} else { + prefix2 = strempty(prefix); + fputc('{', f); + } + + for (size_t i = 0; i < n; i += 2) { + JsonVariant *e; + + e = json_variant_by_index(v, i); + + if (i > 0) { + if (flags & JSON_FORMAT_PRETTY) + fputs(",\n", f); + else + fputc(',', f); + } + + if (flags & JSON_FORMAT_PRETTY) { + print_source(f, e, flags, false); + fputs(prefix2, f); + } + + r = json_format(f, e, flags, prefix2); + if (r < 0) + return r; + + fputs(flags & JSON_FORMAT_PRETTY ? " : " : ":", f); + + r = json_format(f, json_variant_by_index(v, i+1), flags, prefix2); + if (r < 0) + return r; + } + + if (flags & JSON_FORMAT_PRETTY) { + fputc('\n', f); + print_source(f, v, flags, true); + fputs(strempty(prefix), f); + } + + fputc('}', f); + } + break; + } + + default: + assert_not_reached(); + } + + return 0; +} + +int json_variant_format(JsonVariant *v, JsonFormatFlags flags, char **ret) { + _cleanup_(memstream_done) MemStream m = {}; + size_t sz; + FILE *f; + int r; + + /* Returns the length of the generated string (without the terminating NUL), + * or negative on error. */ + + assert_return(v, -EINVAL); + assert_return(ret, -EINVAL); + + if (flags & JSON_FORMAT_OFF) + return -ENOEXEC; + + f = memstream_init(&m); + if (!f) + return -ENOMEM; + + r = json_variant_dump(v, flags, f, NULL); + if (r < 0) + return r; + + r = memstream_finalize(&m, ret, &sz); + if (r < 0) + return r; + + return sz; +} + +int json_variant_dump(JsonVariant *v, JsonFormatFlags flags, FILE *f, const char *prefix) { + if (!v) { + if (flags & JSON_FORMAT_EMPTY_ARRAY) + v = JSON_VARIANT_MAGIC_EMPTY_ARRAY; + else + return 0; + } + + if (!f) + f = stdout; + + print_source(f, v, flags, false); + + if (((flags & (JSON_FORMAT_COLOR_AUTO|JSON_FORMAT_COLOR)) == JSON_FORMAT_COLOR_AUTO) && colors_enabled()) + flags |= JSON_FORMAT_COLOR; + + if (((flags & (JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_PRETTY)) == JSON_FORMAT_PRETTY_AUTO)) + flags |= on_tty() ? 
JSON_FORMAT_PRETTY : JSON_FORMAT_NEWLINE; + + if (flags & JSON_FORMAT_SSE) + fputs("data: ", f); + if (flags & JSON_FORMAT_SEQ) + fputc('\x1e', f); /* ASCII Record Separator */ + + json_format(f, v, flags, prefix); + + if (flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_SEQ|JSON_FORMAT_SSE|JSON_FORMAT_NEWLINE)) + fputc('\n', f); + if (flags & JSON_FORMAT_SSE) + fputc('\n', f); /* In case of SSE add a second newline */ + + if (flags & JSON_FORMAT_FLUSH) + return fflush_and_check(f); + return 0; +} + +int json_variant_filter(JsonVariant **v, char **to_remove) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + _cleanup_free_ JsonVariant **array = NULL; + size_t n = 0, k = 0; + int r; + + assert(v); + + if (json_variant_is_blank_object(*v)) + return 0; + if (!json_variant_is_object(*v)) + return -EINVAL; + + if (strv_isempty(to_remove)) + return 0; + + for (size_t i = 0; i < json_variant_elements(*v); i += 2) { + JsonVariant *p; + + p = json_variant_by_index(*v, i); + if (!json_variant_has_type(p, JSON_VARIANT_STRING)) + return -EINVAL; + + if (strv_contains(to_remove, json_variant_string(p))) { + if (!array) { + array = new(JsonVariant*, json_variant_elements(*v) - 2); + if (!array) + return -ENOMEM; + + for (k = 0; k < i; k++) + array[k] = json_variant_by_index(*v, k); + } + + n++; + } else if (array) { + array[k++] = p; + array[k++] = json_variant_by_index(*v, i + 1); + } + } + + if (n == 0) + return 0; + + r = json_variant_new_object(&w, array, k); + if (r < 0) + return r; + + json_variant_propagate_sensitive(*v, w); + JSON_VARIANT_REPLACE(*v, TAKE_PTR(w)); + + return (int) n; +} + +int json_variant_set_field(JsonVariant **v, const char *field, JsonVariant *value) { + _cleanup_(json_variant_unrefp) JsonVariant *field_variant = NULL, *w = NULL; + _cleanup_free_ JsonVariant **array = NULL; + size_t k = 0; + int r; + + assert(v); + assert(field); + + if (json_variant_is_blank_object(*v)) { + array = new(JsonVariant*, 2); + if (!array) + return -ENOMEM; + + } else { + 
if (!json_variant_is_object(*v)) + return -EINVAL; + + for (size_t i = 0; i < json_variant_elements(*v); i += 2) { + JsonVariant *p; + + p = json_variant_by_index(*v, i); + if (!json_variant_is_string(p)) + return -EINVAL; + + if (streq(json_variant_string(p), field)) { + + if (!array) { + array = new(JsonVariant*, json_variant_elements(*v)); + if (!array) + return -ENOMEM; + + for (k = 0; k < i; k++) + array[k] = json_variant_by_index(*v, k); + } + + } else if (array) { + array[k++] = p; + array[k++] = json_variant_by_index(*v, i + 1); + } + } + + if (!array) { + array = new(JsonVariant*, json_variant_elements(*v) + 2); + if (!array) + return -ENOMEM; + + for (k = 0; k < json_variant_elements(*v); k++) + array[k] = json_variant_by_index(*v, k); + } + } + + r = json_variant_new_string(&field_variant, field); + if (r < 0) + return r; + + array[k++] = field_variant; + array[k++] = value; + + r = json_variant_new_object(&w, array, k); + if (r < 0) + return r; + + json_variant_propagate_sensitive(*v, w); + JSON_VARIANT_REPLACE(*v, TAKE_PTR(w)); + + return 1; +} + +int json_variant_set_fieldb(JsonVariant **v, const char *field, ...) 
{
+        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+        va_list args;
+        int r;
+
+        /* Turn the variadic build instructions into a variant, then store it under field */
+        va_start(args, field);
+        r = json_buildv(&built, args);
+        va_end(args);
+
+        return r < 0 ? r : json_variant_set_field(v, field, built);
+}
+
+/* Stores the C string value as a string variant under field of *v. Returns the error of
+ * json_variant_new_string() if the variant cannot be created, otherwise whatever
+ * json_variant_set_field() returns. */
+int json_variant_set_field_string(JsonVariant **v, const char *field, const char *value) {
+        _cleanup_(json_variant_unrefp) JsonVariant *boxed = NULL;
+        int r;
+
+        r = json_variant_new_string(&boxed, value);
+        return r < 0 ? r : json_variant_set_field(v, field, boxed);
+}
+
+/* Same as json_variant_set_field_string(), for a signed 64-bit integer. */
+int json_variant_set_field_integer(JsonVariant **v, const char *field, int64_t i) {
+        _cleanup_(json_variant_unrefp) JsonVariant *boxed = NULL;
+        int r;
+
+        r = json_variant_new_integer(&boxed, i);
+        return r < 0 ? r : json_variant_set_field(v, field, boxed);
+}
+
+/* Same as json_variant_set_field_string(), for an unsigned 64-bit integer. */
+int json_variant_set_field_unsigned(JsonVariant **v, const char *field, uint64_t u) {
+        _cleanup_(json_variant_unrefp) JsonVariant *boxed = NULL;
+        int r;
+
+        r = json_variant_new_unsigned(&boxed, u);
+        return r < 0 ? r : json_variant_set_field(v, field, boxed);
+}
+
+/* Same as json_variant_set_field_string(), for a boolean. */
+int json_variant_set_field_boolean(JsonVariant **v, const char *field, bool b) {
+        _cleanup_(json_variant_unrefp) JsonVariant *boxed = NULL;
+        int r;
+
+        r = json_variant_new_boolean(&boxed, b);
+        return r < 0 ? r : json_variant_set_field(v, field, boxed);
+}
+
+/* Same as json_variant_set_field_string(), for a string vector that is turned into an array
+ * variant by json_variant_new_array_strv(). */
+int json_variant_set_field_strv(JsonVariant **v, const char *field, char **l) {
+        _cleanup_(json_variant_unrefp) JsonVariant *boxed = NULL;
+        int r;
+
+        r = json_variant_new_array_strv(&boxed, l);
+        return r < 0 ? r : json_variant_set_field(v, field, boxed);
+}
+
+int json_variant_merge_object(JsonVariant **v, JsonVariant *m) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+        _cleanup_free_ JsonVariant **array = NULL;
+        size_t v_elements, m_elements, k;
+        bool v_blank, m_blank;
+        int r;
+
+        m = json_variant_dereference(m);
+
+        v_blank = json_variant_is_blank_object(*v);
+        m_blank = json_variant_is_blank_object(m);
+
+        if (!v_blank && !json_variant_is_object(*v)) +
return -EINVAL;
+        if (!m_blank && !json_variant_is_object(m))
+                return -EINVAL;
+
+        if (m_blank)
+                return 0; /* nothing to do */
+
+        if (v_blank) { /* nothing to merge into: *v simply becomes a new reference to m */
+                JSON_VARIANT_REPLACE(*v, json_variant_ref(m));
+                return 1;
+        }
+
+        v_elements = json_variant_elements(*v);
+        m_elements = json_variant_elements(m);
+        if (v_elements > SIZE_MAX - m_elements) /* overflow check */
+                return -ENOMEM;
+
+        array = new(JsonVariant*, v_elements + m_elements); /* upper bound, reached if no key is shared */
+        if (!array)
+                return -ENOMEM;
+
+        k = 0;
+        for (size_t i = 0; i < v_elements; i += 2) { /* even index: key, the index after it: its value */
+                JsonVariant *u;
+
+                u = json_variant_by_index(*v, i);
+                if (!json_variant_is_string(u))
+                        return -EINVAL;
+
+                if (json_variant_by_key(m, json_variant_string(u)))
+                        continue; /* skip if exists in second variant */
+
+                array[k++] = u;
+                array[k++] = json_variant_by_index(*v, i + 1);
+        }
+
+        for (size_t i = 0; i < m_elements; i++) /* then all of m, keys and values alike */
+                array[k++] = json_variant_by_index(m, i);
+
+        r = json_variant_new_object(&w, array, k);
+        if (r < 0)
+                return r;
+
+        json_variant_propagate_sensitive(*v, w);
+        json_variant_propagate_sensitive(m, w);
+        JSON_VARIANT_REPLACE(*v, TAKE_PTR(w));
+
+        return 1;
+}
+/* Like json_variant_merge_object(), but the object to merge in is first built from the variadic
+ * arguments with json_buildv(). Returns the error of json_buildv() if building fails. */
+int json_variant_merge_objectb(JsonVariant **v, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+        va_list ap;
+        int r;
+
+        va_start(ap, v);
+        r = json_buildv(&w, ap);
+        va_end(ap);
+        if (r < 0)
+                return r;
+
+        return json_variant_merge_object(v, w);
+}
+/* Appends element to the array *v.
+ *
+ * A *v that is NULL, a JSON null or an empty array is replaced by a new one-element array. A
+ * non-empty array is grown in place when json_variant_n_ref() reports a single reference both before
+ * and after taking a reference on element; otherwise a new array with one more slot is built and
+ * installed in *v.
+ *
+ * Returns -EINVAL if *v is something other than the above, -ENOMEM on allocation failure, the result
+ * of _json_variant_array_put_element() on the in-place path, the error of json_variant_new_array()
+ * if that fails, and 0 otherwise. */
+int json_variant_append_array(JsonVariant **v, JsonVariant *element) {
+        _cleanup_(json_variant_unrefp) JsonVariant *nv = NULL;
+        bool blank;
+        int r;
+
+        assert(v);
+        assert(element);
+
+        if (!*v || json_variant_is_null(*v))
+                blank = true;
+        else if (json_variant_is_array(*v))
+                blank = json_variant_elements(*v) == 0;
+        else
+                return -EINVAL;
+
+        if (blank) {
+                r = json_variant_new_array(&nv, (JsonVariant*[]) { element }, 1);
+                if (r < 0)
+                        return r;
+        } else if (json_variant_n_ref(*v) == 1) {
+                /* Let's bump the reference count on element. We can't do the realloc if we're appending *v
+                 * to itself, or one of the objects embedded in *v to *v. If the reference count grows, we
+                 * need to fall back to the other method below. */
+
+                _unused_ _cleanup_(json_variant_unrefp) JsonVariant *dummy = json_variant_ref(element);
+                if (json_variant_n_ref(*v) == 1) {
+                        /* We hold the only reference. Let's mutate the object. */
+                        size_t size = json_variant_elements(*v);
+                        void *old = *v;
+
+                        if (!GREEDY_REALLOC(*v, size + 1 + 1)) /* NOTE(review): presumably header slot, size elements and the new one */
+                                return -ENOMEM;
+
+                        if (old != *v)
+                                /* Readjust the parent pointers to the new address.
+                                 * NOTE(review): the loop starts at i = 1, so the first slot written is
+                                 * (*v)[2] and (*v)[1] keeps whatever parent it had before the realloc.
+                                 * Confirm that this is intended. */
+                                for (size_t i = 1; i < size; i++)
+                                        (*v)[1 + i].parent = *v;
+
+                        return _json_variant_array_put_element(*v, element);
+                }
+        }
+
+        if (!blank) { /* shared array: build a copy with one extra slot */
+                size_t size = json_variant_elements(*v);
+
+                _cleanup_free_ JsonVariant **array = new(JsonVariant*, size + 1);
+                if (!array)
+                        return -ENOMEM;
+
+                for (size_t i = 0; i < size; i++)
+                        array[i] = json_variant_by_index(*v, i);
+
+                array[size] = element;
+
+                r = json_variant_new_array(&nv, array, size + 1);
+                if (r < 0)
+                        return r;
+        }
+
+        json_variant_propagate_sensitive(*v, nv);
+        JSON_VARIANT_REPLACE(*v, TAKE_PTR(nv));
+
+        return 0;
+}
+/* Like json_variant_append_array(), but the element is first built from the variadic arguments with
+ * json_buildv(). Returns the error of json_buildv() if building fails. */
+int json_variant_append_arrayb(JsonVariant **v, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+        va_list ap;
+        int r;
+
+        va_start(ap, v);
+        r = json_buildv(&w, ap);
+        va_end(ap);
+        if (r < 0)
+                return r;
+
+        return json_variant_append_array(v, w);
+}
+
+JsonVariant *json_variant_find(JsonVariant *haystack, JsonVariant *needle) {
+        JsonVariant *i;
+
+        /* Find a json object in an array. Returns NULL if not found, or if the array is not actually an array.
*/
+
+        JSON_VARIANT_ARRAY_FOREACH(i, haystack)
+                if (json_variant_equal(i, needle))
+                        return i;
+
+        return NULL;
+}
+
+/* Appends element to the array *v unless json_variant_find() already locates an equal element, in
+ * which case nothing is changed and 0 is returned. */
+int json_variant_append_array_nodup(JsonVariant **v, JsonVariant *element) {
+        assert(v);
+
+        return json_variant_find(*v, element) ? 0 : json_variant_append_array(v, element);
+}
+
+/* Converts an array of string variants into a newly allocated, NULL-terminated string vector.
+ *
+ * A NULL or JSON null v yields an empty vector. Returns 0 and stores the vector in *ret on success,
+ * -EINVAL if v is not an array or contains a non-string element, -ENOMEM on allocation failure. On
+ * failure the partially filled vector is released, with erasure if v or any element inspected so far
+ * is marked sensitive. */
+int json_variant_strv(JsonVariant *v, char ***ret) {
+        char **list = NULL;
+        bool erase;
+        size_t count;
+        int r;
+
+        assert(ret);
+
+        if (!v || json_variant_is_null(v)) {
+                list = new0(char*, 1);
+                if (!list)
+                        return -ENOMEM;
+
+                *ret = list;
+                return 0;
+        }
+
+        if (!json_variant_is_array(v))
+                return -EINVAL;
+
+        erase = json_variant_is_sensitive(v);
+        count = json_variant_elements(v);
+
+        list = new(char*, count+1);
+        if (!list)
+                return -ENOMEM;
+
+        for (size_t idx = 0; idx < count; idx++) {
+                JsonVariant *item;
+
+                assert_se(item = json_variant_by_index(v, idx));
+                if (!erase)
+                        erase = json_variant_is_sensitive(item);
+
+                /* Terminate the vector at the slot about to be filled, so that the failure path
+                 * below always sees a properly terminated vector. */
+                list[idx] = NULL;
+
+                if (!json_variant_is_string(item)) {
+                        r = -EINVAL;
+                        goto fail;
+                }
+
+                list[idx] = strdup(json_variant_string(item));
+                if (!list[idx]) {
+                        r = -ENOMEM;
+                        goto fail;
+                }
+        }
+
+        list[count] = NULL;
+        *ret = TAKE_PTR(list);
+
+        return 0;
+
+fail:
+        if (erase)
+                strv_free_erase(list);
+        else
+                strv_free(list);
+
+        return r;
+}
+
+static int json_variant_copy(JsonVariant **nv, JsonVariant *v) {
+        JsonVariantType t;
+        JsonVariant *c;
+        JsonValue value;
+        const void *source;
+        size_t k;
+
+        assert(nv);
+        assert(v);
+
+        /* Let's copy the simple types literally, and the larger types by references */
+        t = json_variant_type(v);
+        switch (t) {
+        case JSON_VARIANT_INTEGER:
+                k = sizeof(int64_t);
+                value.integer = json_variant_integer(v);
+                source = &value;
+                break;
+
+        case JSON_VARIANT_UNSIGNED:
+                k = sizeof(uint64_t);
+                value.unsig = json_variant_unsigned(v);
+                source = &value;
+                break;
+
+        case JSON_VARIANT_REAL:
+                k = sizeof(double);
+                value.real = json_variant_real(v);
+                source = &value;
+                break;
+
+        case JSON_VARIANT_BOOLEAN:
+                k = sizeof(bool);
+                value.boolean = json_variant_boolean(v);
+                source = &value;
+                break;
+
+        case JSON_VARIANT_NULL:
+                k = 0;
+                source = NULL;
+                break;
+
+        case JSON_VARIANT_STRING:
+                source = json_variant_string(v);
+                k = strnlen(source, INLINE_STRING_MAX + 1);
+                if (k <= INLINE_STRING_MAX) {
+                        k ++;
+                        break;
+                }
+
+                _fallthrough_;
+
+        default:
+                /* Everything else copy by reference */
+
+                c = malloc0(MAX(sizeof(JsonVariant),
+                                offsetof(JsonVariant, reference) + sizeof(JsonVariant*)));
+                if (!c)
+                        return -ENOMEM;
+
+                c->n_ref = 1;
+                c->type = t;
+                c->is_reference = true;
+                c->reference = json_variant_ref(json_variant_formalize(v));
+
+                *nv = c;
+                return 0;
+        }
+
+        c = malloc0(MAX(sizeof(JsonVariant),
+                        offsetof(JsonVariant, value) + k));
+        if (!c)
+                return -ENOMEM;
+
+        c->n_ref = 1;
+        c->type = t;
+
+        memcpy_safe(&c->value, source, k);
+
+        json_variant_propagate_sensitive(v, c);
+
+        *nv = c;
+        return 0;
+}
+
+static bool json_single_ref(JsonVariant *v) {
+
+        /* Checks whether the caller is the single owner of the object, i.e. can get away with changing it */
+
+        if (!json_variant_is_regular(v))
+                return false;
+
+        /* Embedded variants are judged by the object they are embedded in */
+        if (v->is_embedded)
+                return json_single_ref(v->parent);
+
+        assert(v->n_ref > 0);
+        return v->n_ref == 1;
+}
+
+/* Returns 0 if *v is NULL or already carries exactly this source/line/column (or, for a non-regular
+ * variant, if there is nothing to record), 1 if the location was stored, either in place or in a
+ * freshly made copy that replaces *v, and a negative error if that copy could not be made. */
+static int json_variant_set_source(JsonVariant **v, JsonSource *source, unsigned line, unsigned column) {
+        JsonVariant *w;
+        int r;
+
+        assert(v);
+
+        /* Patch in source and line/column number. Tries to do this in-place if the caller is the sole
+         * referencer of the object. If not, allocates a new object, possibly a surrogate for the original
+         * one */
+
+        if (!*v)
+                return 0;
+
+        /* Keep track of the largest line and column seen for this source */
+        if (source && line > source->max_line)
+                source->max_line = line;
+        if (source && column > source->max_column)
+                source->max_column = column;
+
+        if (!json_variant_is_regular(*v)) {
+
+                if (!source && line == 0 && column == 0)
+                        return 0;
+
+        } else {
+                if (json_source_equal((*v)->source, source) &&
+                    (*v)->line == line &&
+                    (*v)->column == column)
+                        return 0;
+
+                if (json_single_ref(*v)) { /* Sole reference? */
+                        json_source_unref((*v)->source);
+                        (*v)->source = json_source_ref(source);
+                        (*v)->line = line;
+                        (*v)->column = column;
+                        return 1;
+                }
+        }
+
+        r = json_variant_copy(&w, *v);
+        if (r < 0)
+                return r;
+
+        assert(json_variant_is_regular(w));
+        assert(!w->is_embedded);
+        assert(w->n_ref == 1);
+        assert(!w->source);
+
+        w->source = json_source_ref(source);
+        w->line = line;
+        w->column = column;
+
+        JSON_VARIANT_REPLACE(*v, w);
+
+        return 1;
+}
+
+/* Advances the *line / *column position over the first n bytes of s. A newline starts a new line and
+ * resets the column to 1; every other character advances the column by one, where a character is
+ * either a single ASCII byte or one multi-byte sequence as measured by
+ * utf8_encoded_valid_unichar(). */
+static void inc_lines_columns(unsigned *line, unsigned *column, const char *s, size_t n) {
+        assert(line);
+        assert(column);
+        assert(s || n == 0);
+
+        while (n > 0) {
+                if (*s == '\n') {
+                        (*line)++;
+                        *column = 1;
+                } else if ((signed char) *s >= 0 && *s < 127) /* Process ASCII chars quickly */
+                        (*column)++;
+                else {
+                        int w;
+
+                        w = utf8_encoded_valid_unichar(s, n);
+                        if (w < 0) /* count invalid unichars as normal characters */
+                                w = 1;
+                        else if ((size_t) w > n) /* never read more than the specified number of characters */
+                                w = (int) n;
+
+                        (*column)++;
+
+                        s += w;
+                        n -= w;
+                        continue;
+                }
+
+                s++;
+                n--;
+        }
+}
+
+/* Decodes the four hexadecimal digits at c into one 16-bit code unit, most significant digit first.
+ * Returns 0 and stores the value in *ret on success. Returns -EINVAL if unhexchar() rejects one of
+ * the four characters (decoding stops at the first rejected one) or if the decoded value is zero. */
+static int unhex_ucs2(const char *c, uint16_t *ret) {
+        uint16_t x = 0;
+
+        assert(c);
+        assert(ret);
+
+        for (size_t i = 0; i < 4; i++) {
+                int digit;
+
+                digit = unhexchar(c[i]);
+                if (digit < 0)
+                        return -EINVAL;
+
+                x = (uint16_t) ((x << 4) | (uint16_t) digit);
+        }
+
+        if (x == 0) /* the zero code unit is not accepted */
+                return -EINVAL;
+
+        *ret = x;
+
+        return 0;
+}
+
+static int json_parse_string(const char **p, char **ret) {
+        _cleanup_free_ char *s = NULL;
+        size_t n = 0;
+        const char *c;
+
+        assert(p);
+        assert(*p);
+        assert(ret);
+
+        c = *p;
+
+        if (*c != '"')
+                return -EINVAL;
+
+        c++;
+
+        for (;;) {
+                int len;
+
+                /* Check for EOF */
+                if (*c == 0)
+                        return -EINVAL;
+
+                /* Check for control characters 0x00..0x1f */
+                if (*c > 0 && *c < ' ')
+                        return
-EINVAL;
+
+                /* Check for control character 0x7f */
+                if (*c == 0x7f)
+                        return -EINVAL;
+
+                if (*c == '"') { /* closing quote: terminate and hand out the collected string */
+                        if (!s) {
+                                s = strdup("");
+                                if (!s)
+                                        return -ENOMEM;
+                        } else
+                                s[n] = 0;
+
+                        *p = c + 1;
+
+                        *ret = TAKE_PTR(s);
+                        return JSON_TOKEN_STRING;
+                }
+
+                if (*c == '\\') {
+                        char ch = 0;
+                        c++;
+
+                        if (*c == 0)
+                                return -EINVAL;
+
+                        if (IN_SET(*c, '"', '\\', '/'))
+                                ch = *c;
+                        else if (*c == 'b')
+                                ch = '\b';
+                        else if (*c == 'f')
+                                ch = '\f';
+                        else if (*c == 'n')
+                                ch = '\n';
+                        else if (*c == 'r')
+                                ch = '\r';
+                        else if (*c == 't')
+                                ch = '\t';
+                        else if (*c == 'u') {
+                                char16_t x;
+                                int r;
+
+                                r = unhex_ucs2(c + 1, &x);
+                                if (r < 0)
+                                        return r;
+
+                                c += 5; /* the u and its four hex digits */
+
+                                if (!GREEDY_REALLOC(s, n + 5)) /* NOTE(review): presumably longest UTF-8 sequence plus NUL, confirm */
+                                        return -ENOMEM;
+
+                                if (!utf16_is_surrogate(x))
+                                        n += utf8_encode_unichar(s + n, (char32_t) x);
+                                else if (utf16_is_trailing_surrogate(x)) /* a trailing surrogate may not come first */
+                                        return -EINVAL;
+                                else {
+                                        char16_t y;
+
+                                        /* The other half of the pair has to follow as a second escape */
+                                        if (c[0] != '\\' || c[1] != 'u')
+                                                return -EINVAL;
+
+                                        r = unhex_ucs2(c + 2, &y);
+                                        if (r < 0)
+                                                return r;
+
+                                        c += 6; /* backslash, u and four hex digits */
+
+                                        if (!utf16_is_trailing_surrogate(y))
+                                                return -EINVAL;
+
+                                        n += utf8_encode_unichar(s + n, utf16_surrogate_pair_to_unichar(x, y));
+                                }
+
+                                continue;
+                        } else
+                                return -EINVAL;
+
+                        if (!GREEDY_REALLOC(s, n + 2)) /* one byte plus the terminator */
+                                return -ENOMEM;
+
+                        s[n++] = ch;
+                        c ++;
+                        continue;
+                }
+
+                len = utf8_encoded_valid_unichar(c, SIZE_MAX);
+                if (len < 0)
+                        return len;
+
+                if (!GREEDY_REALLOC(s, n + len + 1))
+                        return -ENOMEM;
+
+                memcpy(s + n, c, len);
+                n += len;
+                c += len;
+        }
+}
+/* Parses a JSON number at *p: an optional minus sign, then either a single 0 or a digit run that
+ * starts with 1..9, an optional fraction and an optional exponent.
+ *
+ * The integer part is accumulated as int64_t (when a minus sign was seen) or uint64_t (otherwise)
+ * and, in parallel, as double. As soon as the integer accumulator would overflow, or a fraction or
+ * exponent is present, the result is reported as a real.
+ *
+ * On success *p is advanced past the number and JSON_TOKEN_REAL, JSON_TOKEN_INTEGER or
+ * JSON_TOKEN_UNSIGNED is returned with ret->real, ret->integer or ret->unsig set accordingly.
+ * Returns -EINVAL, leaving *p alone, if a digit is missing where one is required. */
+static int json_parse_number(const char **p, JsonValue *ret) {
+        bool negative = false, exponent_negative = false, is_real = false;
+        double x = 0.0, y = 0.0, exponent = 0.0, shift = 1.0; /* x: integer part, y / shift: fraction */
+        int64_t i = 0;
+        uint64_t u = 0;
+        const char *c;
+
+        assert(p);
+        assert(*p);
+        assert(ret);
+
+        c = *p;
+
+        if (*c == '-') {
+                negative = true;
+                c++;
+        }
+
+        if (*c == '0') /* after a leading zero no further integer digits are consumed */
+                c++;
+        else {
+                if (!strchr("123456789", *c) || *c == 0) /* strchr() also matches the terminating NUL */
+                        return -EINVAL;
+
+                do {
+                        if (!is_real) {
+                                if (negative) {
+
+                                        if (i < INT64_MIN / 10) /* overflow */
+                                                is_real = true;
+                                        else {
+                                                int64_t t = 10 * i;
+
+                                                if (t < INT64_MIN + (*c - '0')) /* overflow */
+                                                        is_real = true;
+                                                else
+                                                        i = t - (*c - '0'); /* negative values are built by subtracting */
+                                        }
+                                } else {
+                                        if (u > UINT64_MAX / 10) /* overflow */
+                                                is_real = true;
+                                        else {
+                                                uint64_t t = 10 * u;
+
+                                                if (t > UINT64_MAX - (*c - '0')) /* overflow */
+                                                        is_real = true;
+                                                else
+                                                        u = t + (*c - '0');
+                                        }
+                                }
+                        }
+
+                        x = 10.0 * x + (*c - '0');
+
+                        c++;
+                } while (strchr("0123456789", *c) && *c != 0);
+        }
+
+        if (*c == '.') {
+                is_real = true;
+                c++;
+
+                if (!strchr("0123456789", *c) || *c == 0)
+                        return -EINVAL;
+
+                do {
+                        y = 10.0 * y + (*c - '0');
+                        shift = 10.0 * shift;
+                        c++;
+                } while (strchr("0123456789", *c) && *c != 0);
+        }
+
+        if (IN_SET(*c, 'e', 'E')) {
+                is_real = true;
+                c++;
+
+                if (*c == '-') {
+                        exponent_negative = true;
+                        c++;
+                } else if (*c == '+')
+                        c++;
+
+                if (!strchr("0123456789", *c) || *c == 0)
+                        return -EINVAL;
+
+                do {
+                        exponent = 10.0 * exponent + (*c - '0');
+                        c++;
+                } while (strchr("0123456789", *c) && *c != 0);
+        }
+
+        *p = c;
+
+        if (is_real) {
+                ret->real = ((negative ? -1.0 : 1.0) * (x + (y / shift))) * exp10((exponent_negative ? -1.0 : 1.0) * exponent);
+                return JSON_TOKEN_REAL;
+        } else if (negative) {
+                ret->integer = i;
+                return JSON_TOKEN_INTEGER;
+        } else {
+                ret->unsig = u;
+                return JSON_TOKEN_UNSIGNED;
+        }
+}
+/* Reads the next token from *p.
+ *
+ * Leading whitespace is skipped. On success *p is advanced past the token, *ret_line and
+ * *ret_column receive the position at which the token started and *line / *column the position
+ * after it. *state carries the tokenizer state from one call to the next; a state that decodes to
+ * STATE_NULL (presumably a NULL pointer) resets *line and *column to 1.
+ *
+ * For string tokens *ret_string receives the string produced by json_parse_string(); for numbers
+ * and booleans *ret_value is filled in. In all other cases *ret_string is set to NULL and
+ * *ret_value to JSON_VALUE_NULL.
+ *
+ * Returns one of the JSON_TOKEN_* values, -EINVAL if the input does not fit the current state, or
+ * the error returned by json_parse_string() or json_parse_number(). On error neither *p nor the
+ * return parameters are updated by this function itself. */
+int json_tokenize(
+                const char **p,
+                char **ret_string,
+                JsonValue *ret_value,
+                unsigned *ret_line,   /* 'ret_line' returns the line at the beginning of this token */
+                unsigned *ret_column,
+                void **state,
+                unsigned *line,       /* 'line' is used as a line state, it always reflect the line we are at after the token was read */
+                unsigned *column) {
+
+        unsigned start_line, start_column;
+        const char *start, *c;
+        size_t n;
+        int t, r;
+
+        enum {
+                STATE_NULL,
+                STATE_VALUE,
+                STATE_VALUE_POST,
+        };
+
+        assert(p);
+        assert(*p);
+        assert(ret_string);
+        assert(ret_value);
+        assert(ret_line);
+        assert(ret_column);
+        assert(line);
+        assert(column);
+        assert(state);
+
+        t = PTR_TO_INT(*state);
+        if (t == STATE_NULL) {
+                *line = 1;
+                *column = 1;
+                t = STATE_VALUE;
+        }
+
+        /* Skip over the whitespace */
+        n = strspn(*p, WHITESPACE);
+        inc_lines_columns(line, column, *p, n);
+        c = *p + n;
+
+        /* Remember where we started processing this token */
+        start = c;
+        start_line = *line;
+        start_column = *column;
+
+        if (*c == 0) {
+                *ret_string = NULL;
+                *ret_value = JSON_VALUE_NULL;
+                r = JSON_TOKEN_END;
+                goto finish;
+        }
+
+        switch (t) {
+
+        case STATE_VALUE: /* a value, or an opening or closing bracket, may start here */
+
+                if (*c == '{') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE);
+                        r = JSON_TOKEN_OBJECT_OPEN;
+                        goto null_return;
+
+                } else if (*c == '}') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_OBJECT_CLOSE;
+                        goto null_return;
+
+                } else if (*c == '[') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE);
+                        r = JSON_TOKEN_ARRAY_OPEN;
+                        goto null_return;
+
+                } else if (*c == ']') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_ARRAY_CLOSE;
+                        goto null_return;
+
+                } else if (*c == '"') {
+
+                        r = json_parse_string(&c, ret_string);
+                        if (r < 0)
+                                return r;
+
+                        *ret_value = JSON_VALUE_NULL;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        goto finish;
+
+                } else if (strchr("-0123456789", *c)) {
+
+                        r = json_parse_number(&c, ret_value);
+                        if (r < 0)
+                                return r;
+
+                        *ret_string = NULL;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        goto finish;
+
+                } else if (startswith(c, "true")) { /* only the prefix is compared here */
+                        *ret_string = NULL;
+                        ret_value->boolean = true;
+                        c += 4;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_BOOLEAN;
+                        goto finish;
+
+                } else if (startswith(c, "false")) {
+                        *ret_string = NULL;
+                        ret_value->boolean = false;
+                        c += 5;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_BOOLEAN;
+                        goto finish;
+
+                } else if (startswith(c, "null")) {
+                        *ret_string = NULL;
+                        *ret_value = JSON_VALUE_NULL;
+                        c += 4;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_NULL;
+                        goto finish;
+
+                }
+
+                return -EINVAL;
+
+        case STATE_VALUE_POST: /* after a value only a separator or a closing bracket is accepted */
+
+                if (*c == ':') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE);
+                        r = JSON_TOKEN_COLON;
+                        goto null_return;
+
+                } else if (*c == ',') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE);
+                        r = JSON_TOKEN_COMMA;
+                        goto null_return;
+
+                } else if (*c == '}') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_OBJECT_CLOSE;
+                        goto null_return;
+
+                } else if (*c == ']') {
+                        c++;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_ARRAY_CLOSE;
+                        goto null_return;
+                }
+
+                return -EINVAL;
+
+        default:
+                assert_not_reached();
+        }
+
+null_return:
+        *ret_string = NULL;
+        *ret_value = JSON_VALUE_NULL;
+
+finish:
+        inc_lines_columns(line, column, start, c - start); /* account for the token text itself */
+        *p = c;
+
+        *ret_line = start_line;
+        *ret_column = start_column;
+
+        return r;
+}
+
+typedef enum JsonExpect {
+        /* The following values are used by json_parse() */
+        EXPECT_TOPLEVEL,
+        EXPECT_END,
+        EXPECT_OBJECT_FIRST_KEY,
+        EXPECT_OBJECT_NEXT_KEY,
+        EXPECT_OBJECT_COLON,
+        EXPECT_OBJECT_VALUE,
+        EXPECT_OBJECT_COMMA,
+        EXPECT_ARRAY_FIRST_ELEMENT,
+        EXPECT_ARRAY_NEXT_ELEMENT,
+        EXPECT_ARRAY_COMMA,
+
+        /* And these are used by json_build() */
+        EXPECT_ARRAY_ELEMENT, +
EXPECT_OBJECT_KEY,
+} JsonExpect;
+/* One nesting level of the parser in json_parse_internal() or of the builder in json_buildv(): what
+ * is expected next on this level and the variants collected for it so far. */
+typedef struct JsonStack {
+        JsonExpect expect;          /* what may come next on this level */
+        JsonVariant **elements;     /* variants collected on this level so far */
+        size_t n_elements;          /* number of entries in elements */
+        unsigned line_before;       /* the parser stores here where the token opening this level started */
+        unsigned column_before;
+        size_t n_suppress; /* When building: if > 0, suppress this many subsequent elements. If == SIZE_MAX, suppress all subsequent elements */
+} JsonStack;
+/* Hands the collected variants of one stack level to CLEANUP_ARRAY() together with
+ * json_variant_unref_many. NOTE(review): presumably this drops all references, frees the array and
+ * resets both fields; confirm against the CLEANUP_ARRAY() definition. */
+static void json_stack_release(JsonStack *s) {
+        assert(s);
+
+        CLEANUP_ARRAY(s->elements, s->n_elements, json_variant_unref_many);
+}
+/* Core of the parser: pulls tokens from json_tokenize() starting at *input and keeps a stack with
+ * one JsonStack level for the top level plus one per object or array that is currently open. Each
+ * value token is checked against what the current level expects, turned into a variant and appended
+ * to that level.
+ *
+ * source (may be NULL) is recorded on the variants together with their line and column. If flags
+ * contains JSON_PARSE_SENSITIVE every created variant is marked sensitive. line and column are
+ * optional; local buffers are used when they are NULL. With continue_end set, parsing stops as soon
+ * as the top-level value is complete, without reading another token; otherwise the end of input has
+ * to follow.
+ *
+ * On success stores a reference to the top-level variant in *ret, sets *input to where parsing
+ * stopped and returns 0. Returns -EINVAL for input that does not fit the expected state (or if
+ * input or ret is NULL), -ENOMEM on allocation failure, or the error of the failing helper. */
+static int json_parse_internal(
+                const char **input,
+                JsonSource *source,
+                JsonParseFlags flags,
+                JsonVariant **ret,
+                unsigned *line,
+                unsigned *column,
+                bool continue_end) {
+
+        size_t n_stack = 1;
+        unsigned line_buffer = 0, column_buffer = 0;
+        void *tokenizer_state = NULL;
+        JsonStack *stack = NULL;
+        const char *p;
+        int r;
+
+        assert_return(input, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        p = *input;
+
+        if (!GREEDY_REALLOC(stack, n_stack))
+                return -ENOMEM;
+
+        stack[0] = (JsonStack) {
+                .expect = EXPECT_TOPLEVEL,
+        };
+
+        if (!line)
+                line = &line_buffer;
+        if (!column)
+                column = &column_buffer;
+
+        for (;;) {
+                _cleanup_(json_variant_unrefp) JsonVariant *add = NULL;
+                _cleanup_free_ char *string = NULL;
+                unsigned line_token, column_token;
+                JsonStack *current;
+                JsonValue value;
+                int token;
+
+                assert(n_stack > 0);
+                current = stack + n_stack - 1; /* innermost open level */
+
+                if (continue_end && current->expect == EXPECT_END) /* top-level value complete, leave the rest unread */
+                        goto done;
+
+                token = json_tokenize(&p, &string, &value, &line_token, &column_token, &tokenizer_state, line, column);
+                if (token < 0) {
+                        r = token;
+                        goto finish;
+                }
+
+                switch (token) {
+
+                case JSON_TOKEN_END:
+                        if (current->expect != EXPECT_END) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        assert(current->n_elements == 1);
+                        assert(n_stack == 1);
+                        goto done;
+
+                case JSON_TOKEN_COLON:
+
+                        if (current->expect != EXPECT_OBJECT_COLON) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        current->expect = EXPECT_OBJECT_VALUE;
+                        break;
+
+                case JSON_TOKEN_COMMA:
+
+                        if (current->expect ==
EXPECT_OBJECT_COMMA) + current->expect = EXPECT_OBJECT_NEXT_KEY; + else if (current->expect == EXPECT_ARRAY_COMMA) + current->expect = EXPECT_ARRAY_NEXT_ELEMENT; + else { + r = -EINVAL; + goto finish; + } + + break; + + case JSON_TOKEN_OBJECT_OPEN: + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + if (!GREEDY_REALLOC(stack, n_stack+1)) { + r = -ENOMEM; + goto finish; + } + current = stack + n_stack - 1; + + /* Prepare the expect for when we return from the child */ + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + stack[n_stack++] = (JsonStack) { + .expect = EXPECT_OBJECT_FIRST_KEY, + .line_before = line_token, + .column_before = column_token, + }; + + current = stack + n_stack - 1; + break; + + case JSON_TOKEN_OBJECT_CLOSE: + if (!IN_SET(current->expect, EXPECT_OBJECT_FIRST_KEY, EXPECT_OBJECT_COMMA)) { + r = -EINVAL; + goto finish; + } + + assert(n_stack > 1); + + r = json_variant_new_object(&add, current->elements, current->n_elements); + if (r < 0) + goto finish; + + line_token = current->line_before; + column_token = current->column_before; + + json_stack_release(current); + n_stack--, current--; + + break; + + case JSON_TOKEN_ARRAY_OPEN: + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + if (!GREEDY_REALLOC(stack, n_stack+1)) { + r = -ENOMEM; + goto finish; + } + current = stack + n_stack - 1; + + /* Prepare the expect for when we return from the child */ + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == 
EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + stack[n_stack++] = (JsonStack) { + .expect = EXPECT_ARRAY_FIRST_ELEMENT, + .line_before = line_token, + .column_before = column_token, + }; + + break; + + case JSON_TOKEN_ARRAY_CLOSE: + if (!IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_COMMA)) { + r = -EINVAL; + goto finish; + } + + assert(n_stack > 1); + + r = json_variant_new_array(&add, current->elements, current->n_elements); + if (r < 0) + goto finish; + + line_token = current->line_before; + column_token = current->column_before; + + json_stack_release(current); + n_stack--, current--; + break; + + case JSON_TOKEN_STRING: + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_FIRST_KEY, EXPECT_OBJECT_NEXT_KEY, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + r = json_variant_new_string(&add, string); + if (r < 0) + goto finish; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (IN_SET(current->expect, EXPECT_OBJECT_FIRST_KEY, EXPECT_OBJECT_NEXT_KEY)) + current->expect = EXPECT_OBJECT_COLON; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + break; + + case JSON_TOKEN_REAL: + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + r = json_variant_new_real(&add, value.real); + if (r < 0) + goto finish; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + 
assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + break; + + case JSON_TOKEN_INTEGER: + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + r = json_variant_new_integer(&add, value.integer); + if (r < 0) + goto finish; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + break; + + case JSON_TOKEN_UNSIGNED: + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + r = json_variant_new_unsigned(&add, value.unsig); + if (r < 0) + goto finish; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + break; + + case JSON_TOKEN_BOOLEAN: + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + r = json_variant_new_boolean(&add, value.boolean); + if (r < 0) + goto finish; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + break; + + case JSON_TOKEN_NULL: + if (!IN_SET(current->expect, 
EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + r = json_variant_new_null(&add); + if (r < 0) + goto finish; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_COMMA; + else { + assert(IN_SET(current->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT)); + current->expect = EXPECT_ARRAY_COMMA; + } + + break; + + default: + assert_not_reached(); + } + + if (add) { + /* If we are asked to make this parsed object sensitive, then let's apply this + * immediately after allocating each variant, so that when we abort half-way + * everything we already allocated that is then freed is correctly marked. */ + if (FLAGS_SET(flags, JSON_PARSE_SENSITIVE)) + json_variant_sensitive(add); + + (void) json_variant_set_source(&add, source, line_token, column_token); + + if (!GREEDY_REALLOC(current->elements, current->n_elements + 1)) { + r = -ENOMEM; + goto finish; + } + + current->elements[current->n_elements++] = TAKE_PTR(add); + } + } + +done: + assert(n_stack == 1); + assert(stack[0].n_elements == 1); + + *ret = json_variant_ref(stack[0].elements[0]); + *input = p; + r = 0; + +finish: + for (size_t i = 0; i < n_stack; i++) + json_stack_release(stack + i); + + free(stack); + + return r; +} + +int json_parse_with_source( + const char *input, + const char *source, + JsonParseFlags flags, + JsonVariant **ret, + unsigned *ret_line, + unsigned *ret_column) { + + _cleanup_(json_source_unrefp) JsonSource *s = NULL; + + if (source) { + s = json_source_new(source); + if (!s) + return -ENOMEM; + } + + return json_parse_internal(&input, s, flags, ret, ret_line, ret_column, false); +} + +int json_parse_with_source_continue( + const char **p, + const char *source, + JsonParseFlags flags, + JsonVariant **ret, + unsigned *ret_line, + unsigned *ret_column) { + + _cleanup_(json_source_unrefp) 
JsonSource *s = NULL; + + if (source) { + s = json_source_new(source); + if (!s) + return -ENOMEM; + } + + return json_parse_internal(p, s, flags, ret, ret_line, ret_column, true); +} + +int json_parse_file_at( + FILE *f, + int dir_fd, + const char *path, + JsonParseFlags flags, + JsonVariant **ret, + unsigned *ret_line, + unsigned *ret_column) { + + _cleanup_free_ char *text = NULL; + int r; + + if (f) + r = read_full_stream(f, &text, NULL); + else if (path) + r = read_full_file_full(dir_fd, path, UINT64_MAX, SIZE_MAX, 0, NULL, &text, NULL); + else + return -EINVAL; + if (r < 0) + return r; + + if (isempty(text)) + return -ENODATA; + + return json_parse_with_source(text, path, flags, ret, ret_line, ret_column); +} + +int json_buildv(JsonVariant **ret, va_list ap) { + JsonStack *stack = NULL; + size_t n_stack = 1; + const char *name = NULL; + int r; + + assert_return(ret, -EINVAL); + + if (!GREEDY_REALLOC(stack, n_stack)) + return -ENOMEM; + + stack[0] = (JsonStack) { + .expect = EXPECT_TOPLEVEL, + }; + + for (;;) { + _cleanup_(json_variant_unrefp) JsonVariant *add = NULL, *add_more = NULL; + size_t n_subtract = 0; /* how much to subtract from current->n_suppress, i.e. 
how many elements would + * have been added to the current variant */ + JsonStack *current; + int command; + + assert(n_stack > 0); + current = stack + n_stack - 1; + + if (current->expect == EXPECT_END) + goto done; + + command = va_arg(ap, int); + + switch (command) { + + case _JSON_BUILD_STRING: { + const char *p; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + p = va_arg(ap, const char *); + + if (current->n_suppress == 0) { + r = json_variant_new_string(&add, p); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_INTEGER: { + int64_t j; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + j = va_arg(ap, int64_t); + + if (current->n_suppress == 0) { + r = json_variant_new_integer(&add, j); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_UNSIGNED: { + uint64_t j; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + j = va_arg(ap, uint64_t); + + if (current->n_suppress == 0) { + r = json_variant_new_unsigned(&add, j); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + 
+ case _JSON_BUILD_REAL: { + double d; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + d = va_arg(ap, double); + + if (current->n_suppress == 0) { + r = json_variant_new_real(&add, d); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_BOOLEAN: { + bool b; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + b = va_arg(ap, int); + + if (current->n_suppress == 0) { + r = json_variant_new_boolean(&add, b); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_NULL: + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + if (current->n_suppress == 0) { + r = json_variant_new_null(&add); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + + case _JSON_BUILD_VARIANT: + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + /* Note that we don't care for current->n_suppress here, after all the variant is already + * allocated anyway... 
*/ + add = va_arg(ap, JsonVariant*); + if (!add) + add = JSON_VARIANT_MAGIC_NULL; + else + json_variant_ref(add); + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + + case _JSON_BUILD_VARIANT_ARRAY: { + JsonVariant **array; + size_t n; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + array = va_arg(ap, JsonVariant**); + n = va_arg(ap, size_t); + + if (current->n_suppress == 0) { + r = json_variant_new_array(&add, array, n); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_LITERAL: { + const char *l; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + l = va_arg(ap, const char *); + + if (l) { + /* Note that we don't care for current->n_suppress here, we should generate parsing + * errors even in suppressed object properties */ + + r = json_parse(l, 0, &add, NULL, NULL); + if (r < 0) + goto finish; + } else + add = JSON_VARIANT_MAGIC_NULL; + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_ARRAY_BEGIN: + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + if (!GREEDY_REALLOC(stack, n_stack+1)) { + r = -ENOMEM; + goto finish; + } + current 
= stack + n_stack - 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + stack[n_stack++] = (JsonStack) { + .expect = EXPECT_ARRAY_ELEMENT, + .n_suppress = current->n_suppress != 0 ? SIZE_MAX : 0, /* if we shall suppress the + * new array, then we should + * also suppress all array + * members */ + }; + + break; + + case _JSON_BUILD_ARRAY_END: + if (current->expect != EXPECT_ARRAY_ELEMENT) { + r = -EINVAL; + goto finish; + } + + assert(n_stack > 1); + + if (current->n_suppress == 0) { + r = json_variant_new_array(&add, current->elements, current->n_elements); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + json_stack_release(current); + n_stack--, current--; + + break; + + case _JSON_BUILD_STRV: { + char **l; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + l = va_arg(ap, char **); + + if (current->n_suppress == 0) { + r = json_variant_new_array_strv(&add, l); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_STRV_ENV_PAIR: { + char **l; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + l = va_arg(ap, char **); + + _cleanup_strv_free_ char **el = NULL; + STRV_FOREACH_PAIR(x, y, l) { + char *n = NULL; + + n = strjoin(*x, "=", *y); + if (!n) { + r = -ENOMEM; + goto finish; + } + + r = strv_consume(&el, n); + if (r < 0) + goto finish; + } + + if (current->n_suppress == 0) { + r = json_variant_new_array_strv(&add, el); + if (r < 0) + goto finish; + } + + 
n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_BASE64: + case _JSON_BUILD_BASE32HEX: + case _JSON_BUILD_HEX: + case _JSON_BUILD_OCTESCAPE: { + const void *p; + size_t n; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + p = va_arg(ap, const void *); + n = va_arg(ap, size_t); + + if (current->n_suppress == 0) { + r = command == _JSON_BUILD_BASE64 ? json_variant_new_base64(&add, p, n) : + command == _JSON_BUILD_BASE32HEX ? json_variant_new_base32hex(&add, p, n) : + command == _JSON_BUILD_HEX ? json_variant_new_hex(&add, p, n) : + json_variant_new_octescape(&add, p, n); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_IOVEC_BASE64: { + const struct iovec *iov; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + iov = ASSERT_PTR(va_arg(ap, const struct iovec*)); + + if (current->n_suppress == 0) { + r = json_variant_new_base64(&add, iov->iov_base, iov->iov_len); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_ID128: + case _JSON_BUILD_UUID: { + const sd_id128_t *id; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r 
= -EINVAL; + goto finish; + } + + assert_se(id = va_arg(ap, sd_id128_t*)); + + if (current->n_suppress == 0) { + r = command == _JSON_BUILD_ID128 ? + json_variant_new_id128(&add, *id) : + json_variant_new_uuid(&add, *id); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_BYTE_ARRAY: { + const void *array; + size_t n; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + array = va_arg(ap, const void*); + n = va_arg(ap, size_t); + + if (current->n_suppress == 0) { + r = json_variant_new_array_bytes(&add, array, n); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_HW_ADDR: { + const struct hw_addr_data *hw_addr; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + assert_se(hw_addr = va_arg(ap, struct hw_addr_data*)); + + if (current->n_suppress == 0) { + r = json_variant_new_array_bytes(&add, hw_addr->bytes, hw_addr->length); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_STRING_SET: { + Set *set; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + set 
= va_arg(ap, Set*); + + if (current->n_suppress == 0) { + _cleanup_free_ char **sorted = NULL; + + r = set_dump_sorted(set, (void ***) &sorted, NULL); + if (r < 0) + goto finish; + + r = json_variant_new_array_strv(&add, sorted); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_CALLBACK: { + JsonBuildCallback cb; + void *userdata; + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + cb = va_arg(ap, JsonBuildCallback); + userdata = va_arg(ap, void *); + + if (current->n_suppress == 0) { + if (cb) { + r = cb(&add, name, userdata); + if (r < 0) + goto finish; + } + + if (!add) + add = JSON_VARIANT_MAGIC_NULL; + + name = NULL; + } + + n_subtract = 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + break; + } + + case _JSON_BUILD_OBJECT_BEGIN: + + if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) { + r = -EINVAL; + goto finish; + } + + if (!GREEDY_REALLOC(stack, n_stack+1)) { + r = -ENOMEM; + goto finish; + } + current = stack + n_stack - 1; + + if (current->expect == EXPECT_TOPLEVEL) + current->expect = EXPECT_END; + else if (current->expect == EXPECT_OBJECT_VALUE) + current->expect = EXPECT_OBJECT_KEY; + else + assert(current->expect == EXPECT_ARRAY_ELEMENT); + + stack[n_stack++] = (JsonStack) { + .expect = EXPECT_OBJECT_KEY, + .n_suppress = current->n_suppress != 0 ? SIZE_MAX : 0, /* If we shall suppress the + * new object, then we should + * also suppress all object + * members. 
*/ + }; + + break; + + case _JSON_BUILD_OBJECT_END: + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + assert(n_stack > 1); + + if (current->n_suppress == 0) { + r = json_variant_new_object(&add, current->elements, current->n_elements); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + json_stack_release(current); + n_stack--, current--; + + break; + + case _JSON_BUILD_PAIR: { + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + name = va_arg(ap, const char *); + + if (current->n_suppress == 0) { + r = json_variant_new_string(&add, name); + if (r < 0) + goto finish; + } + + n_subtract = 1; + + current->expect = EXPECT_OBJECT_VALUE; + break; + } + + case _JSON_BUILD_PAIR_CONDITION: { + bool b; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + b = va_arg(ap, int); + name = va_arg(ap, const char *); + + if (b && current->n_suppress == 0) { + r = json_variant_new_string(&add, name); + if (r < 0) + goto finish; + } + + n_subtract = 1; /* we generated one item */ + + if (!b && current->n_suppress != SIZE_MAX) + current->n_suppress += 2; /* Suppress this one and the next item */ + + current->expect = EXPECT_OBJECT_VALUE; + break; + } + + case _JSON_BUILD_PAIR_UNSIGNED_NON_ZERO: { + const char *n; + uint64_t u; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + u = va_arg(ap, uint64_t); + + if (u != 0 && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_unsigned(&add_more, u); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_FINITE_USEC: { + const char *n; + usec_t u; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + u = va_arg(ap, usec_t); + + if (u != 
USEC_INFINITY && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_unsigned(&add_more, u); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_STRING_NON_EMPTY: { + const char *n, *s; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + s = va_arg(ap, const char *); + + if (!isempty(s) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_string(&add_more, s); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_STRV_NON_EMPTY: { + const char *n; + char **l; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + l = va_arg(ap, char **); + + if (!strv_isempty(l) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_array_strv(&add_more, l); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_VARIANT_NON_NULL: { + JsonVariant *v; + const char *n; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + v = va_arg(ap, JsonVariant *); + + if (v && !json_variant_is_null(v) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + add_more = json_variant_ref(v); + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_IN4_ADDR_NON_NULL: { + const struct in_addr *a; + const char *n; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; 
+ goto finish; + } + + n = va_arg(ap, const char *); + a = va_arg(ap, const struct in_addr *); + + if (a && in4_addr_is_set(a) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_array_bytes(&add_more, a, sizeof(struct in_addr)); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_IN6_ADDR_NON_NULL: { + const struct in6_addr *a; + const char *n; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + a = va_arg(ap, const struct in6_addr *); + + if (a && in6_addr_is_set(a) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_array_bytes(&add_more, a, sizeof(struct in6_addr)); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_IN_ADDR_NON_NULL: { + const union in_addr_union *a; + const char *n; + int f; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + a = va_arg(ap, const union in_addr_union *); + f = va_arg(ap, int); + + if (a && in_addr_is_set(f, a) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_array_bytes(&add_more, a->bytes, FAMILY_ADDRESS_SIZE(f)); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL: { + const struct ether_addr *a; + const char *n; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + a = va_arg(ap, const struct ether_addr *); + + if (a && !ether_addr_is_null(a) && current->n_suppress == 0) { + r = 
json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_array_bytes(&add_more, a->ether_addr_octet, sizeof(struct ether_addr)); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + + case _JSON_BUILD_PAIR_HW_ADDR_NON_NULL: { + const struct hw_addr_data *a; + const char *n; + + if (current->expect != EXPECT_OBJECT_KEY) { + r = -EINVAL; + goto finish; + } + + n = va_arg(ap, const char *); + a = va_arg(ap, const struct hw_addr_data *); + + if (a && !hw_addr_is_null(a) && current->n_suppress == 0) { + r = json_variant_new_string(&add, n); + if (r < 0) + goto finish; + + r = json_variant_new_array_bytes(&add_more, a->bytes, a->length); + if (r < 0) + goto finish; + } + + n_subtract = 2; /* we generated two item */ + + current->expect = EXPECT_OBJECT_KEY; + break; + } + } + + /* If variants were generated, add them to our current variant, but only if we are not supposed to suppress additions */ + if (add && current->n_suppress == 0) { + if (!GREEDY_REALLOC(current->elements, current->n_elements + 1 + !!add_more)) { + r = -ENOMEM; + goto finish; + } + + current->elements[current->n_elements++] = TAKE_PTR(add); + if (add_more) + current->elements[current->n_elements++] = TAKE_PTR(add_more); + } + + /* If we are supposed to suppress items, let's subtract how many items where generated from + * that counter. Except if the counter is SIZE_MAX, i.e. 
we shall suppress an infinite number
+                 * of elements on this stack level */
+                if (current->n_suppress != SIZE_MAX) {
+                        if (current->n_suppress <= n_subtract) /* Saturated */
+                                current->n_suppress = 0;
+                        else
+                                current->n_suppress -= n_subtract;
+                }
+        }
+
+done:
+        assert(n_stack == 1);
+        assert(stack[0].n_elements == 1);
+
+        *ret = json_variant_ref(stack[0].elements[0]);
+        r = 0;
+
+finish:
+        for (size_t i = 0; i < n_stack; i++)
+                json_stack_release(stack + i);
+
+        free(stack);
+
+        return r;
+}
+/* Variadic front-end of json_buildv(): collects the arguments following 'ret' into a va_list, hands
+ * them to json_buildv() and returns its result unchanged. */
+int json_build(JsonVariant **ret, ...) {
+        va_list ap;
+        int r;
+
+        va_start(ap, ret);
+        r = json_buildv(ret, ap);
+        va_end(ap);
+
+        return r;
+}
+/* Logging back-end for JSON related messages; the caller passes file/line/func explicitly. The message is
+ * formatted into a LINE_MAX sized stack buffer. If 'variant' is non-NULL its source location is looked up
+ * with json_variant_get_source(); should that fail, the error is returned and nothing is logged. Depending
+ * on what is known, the message is prefixed with "SOURCE:LINE:COLUMN: ", with "(string):LINE:COLUMN: ",
+ * or not at all. Returns whatever log_struct_internal() returns. */
+int json_log_internal(
+                JsonVariant *variant,
+                int level,
+                int error,
+                const char *file,
+                int line,
+                const char *func,
+                const char *format, ...) {
+
+        PROTECT_ERRNO; /* NOTE(review): presumably saves errno and restores it on return; macro is defined elsewhere */
+
+        unsigned source_line, source_column;
+        char buffer[LINE_MAX];
+        const char *source;
+        va_list ap;
+        int r;
+
+        errno = ERRNO_VALUE(error); /* NOTE(review): presumably so that "%m" in 'format' expands to 'error'; confirm */
+
+        va_start(ap, format);
+        (void) vsnprintf(buffer, sizeof buffer, format, ap); /* result ignored: an over-long message is truncated */
+        va_end(ap);
+
+        if (variant) {
+                r = json_variant_get_source(variant, &source, &source_line, &source_column);
+                if (r < 0)
+                        return r;
+        } else {
+                source = NULL;
+                source_line = 0;
+                source_column = 0;
+        }
+
+        if (source && source_line > 0 && source_column > 0) /* file name and position known */
+                return log_struct_internal(
+                                level,
+                                error,
+                                file, line, func,
+                                "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR,
+                                "CONFIG_FILE=%s", source,
+                                "CONFIG_LINE=%u", source_line,
+                                "CONFIG_COLUMN=%u", source_column,
+                                LOG_MESSAGE("%s:%u:%u: %s", source, source_line, source_column, buffer),
+                                NULL);
+        else if (source_line > 0 && source_column > 0) /* position known, but no source name */
+                return log_struct_internal(
+                                level,
+                                error,
+                                file, line, func,
+                                "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR,
+                                "CONFIG_LINE=%u", source_line,
+                                "CONFIG_COLUMN=%u", source_column,
+                                LOG_MESSAGE("(string):%u:%u: %s", source_line, source_column, buffer),
+                                NULL);
+        else
+                return log_struct_internal(
+                                level,
+                                error,
+
file, line, func,
+                                "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR,
+                                LOG_MESSAGE("%s", buffer),
+                                NULL);
+}
+/* Computes the address a dispatch callback shall write to for table entry 'p': the entry's .offset
+ * applied to 'userdata', or the offset itself turned into a pointer when 'userdata' is NULL. */
+static void *dispatch_userdata(const JsonDispatch *p, void *userdata) {
+
+        /* When the userdata pointer is passed in as NULL, then we'll just use the offset as a literal
+         * address, and convert it to a pointer. Note that we might as well just add the offset to the NULL
+         * pointer, but UndefinedBehaviourSanitizer doesn't like pointer arithmetic based on NULL pointers,
+         * hence we code this explicitly here. */
+
+        if (userdata)
+                return (uint8_t*) userdata + p->offset;
+
+        return SIZE_TO_PTR(p->offset);
+}
+/* Walks the key/value pairs of the JSON object 'v'. For each key the first entry of 'table' (which is
+ * terminated by an entry whose .name is NULL) is picked whose .name equals the key or is POINTER_MAX,
+ * the latter matching any key. A matching entry gets its .type checked (unless that is
+ * _JSON_VARIANT_TYPE_INVALID), is refused if it was seen before, and then has its .callback (if any)
+ * invoked with the destination computed by dispatch_userdata(). Keys without a matching entry are handed
+ * to 'bad' if that is set and are refused otherwise. Finally, every entry carrying JSON_MANDATORY must
+ * have been seen.
+ *
+ * Returns the number of fields dispatched, or a negative error: -EINVAL ('v' is not an object, or a field
+ * has the wrong type), -ENOTUNIQ (duplicate field), -EADDRNOTAVAIL (unexpected field and no 'bad'),
+ * -ENXIO (mandatory field missing), or the error returned by a callback. Where JSON_PERMISSIVE is in
+ * effect the offending field is skipped instead of failing. On failure *reterr_bad_field (if non-NULL) is
+ * set to the offending field name, or to NULL if 'v' is not an object. */
+int json_dispatch_full(
+                JsonVariant *v,
+                const JsonDispatch table[],
+                JsonDispatchCallback bad,
+                JsonDispatchFlags flags,
+                void *userdata,
+                const char **reterr_bad_field) {
+        size_t m;
+        int r, done = 0;
+        bool *found;
+
+        if (!json_variant_is_object(v)) {
+                json_log(v, flags, 0, "JSON variant is not an object.");
+
+                if (flags & JSON_PERMISSIVE)
+                        return 0;
+
+                if (reterr_bad_field)
+                        *reterr_bad_field = NULL;
+
+                return -EINVAL;
+        }
+
+        m = 0;
+        for (const JsonDispatch *p = table; p->name; p++) /* count the table entries */
+                m++;
+
+        found = newa0(bool, m); /* one "seen" flag per table entry; NOTE(review): newa0() presumably yields zeroed stack memory */
+
+        size_t n = json_variant_elements(v);
+        for (size_t i = 0; i < n; i += 2) { /* elements are visited in pairs: key at i, value at i+1 */
+                JsonVariant *key, *value;
+                const JsonDispatch *p;
+
+                assert_se(key = json_variant_by_index(v, i));
+                assert_se(value = json_variant_by_index(v, i+1));
+
+                for (p = table; p->name; p++)
+                        if (p->name == POINTER_MAX || /* POINTER_MAX as name matches any key */
+                            streq_ptr(json_variant_string(key), p->name))
+                                break;
+
+                if (p->name) { /* Found a matching entry!
🙂 */
+                        JsonDispatchFlags merged_flags;
+
+                        merged_flags = flags | p->flags; /* the entry's own flags add to the caller-supplied ones */
+
+                        if (p->type != _JSON_VARIANT_TYPE_INVALID && /* _JSON_VARIANT_TYPE_INVALID in the table disables the type check */
+                            !json_variant_has_type(value, p->type)) {
+
+                                json_log(value, merged_flags, 0,
+                                         "Object field '%s' has wrong type %s, expected %s.", json_variant_string(key),
+                                         json_variant_type_to_string(json_variant_type(value)), json_variant_type_to_string(p->type));
+
+                                if (merged_flags & JSON_PERMISSIVE)
+                                        continue; /* field is skipped and not marked as found */
+
+                                if (reterr_bad_field)
+                                        *reterr_bad_field = p->name;
+
+                                return -EINVAL;
+                        }
+
+                        if (found[p-table]) {
+                                json_log(value, merged_flags, 0, "Duplicate object field '%s'.", json_variant_string(key));
+
+                                if (merged_flags & JSON_PERMISSIVE)
+                                        continue; /* the repeated occurrence is ignored */
+
+                                if (reterr_bad_field)
+                                        *reterr_bad_field = p->name;
+
+                                return -ENOTUNIQ;
+                        }
+
+                        found[p-table] = true; /* set before the callback runs, hence stays set even if the callback fails */
+
+                        if (p->callback) {
+                                r = p->callback(json_variant_string(key), value, merged_flags, dispatch_userdata(p, userdata));
+                                if (r < 0) {
+                                        if (merged_flags & JSON_PERMISSIVE)
+                                                continue; /* failure is tolerated, but the field is not counted in 'done' */
+
+                                        if (reterr_bad_field)
+                                                *reterr_bad_field = json_variant_string(key);
+
+                                        return r;
+                                }
+                        }
+
+                        done ++; /* also counted when the entry has no callback */
+
+                } else { /* Didn't find a matching entry!
☹️ */ + + if (bad) { + r = bad(json_variant_string(key), value, flags, userdata); + if (r < 0) { + if (flags & JSON_PERMISSIVE) + continue; + + if (reterr_bad_field) + *reterr_bad_field = json_variant_string(key); + + return r; + } else + done ++; + + } else { + json_log(value, flags, 0, "Unexpected object field '%s'.", json_variant_string(key)); + + if (flags & JSON_PERMISSIVE) + continue; + + if (reterr_bad_field) + *reterr_bad_field = json_variant_string(key); + + return -EADDRNOTAVAIL; + } + } + } + + for (const JsonDispatch *p = table; p->name; p++) { + JsonDispatchFlags merged_flags = p->flags | flags; + + if ((merged_flags & JSON_MANDATORY) && !found[p-table]) { + json_log(v, merged_flags, 0, "Missing object field '%s'.", p->name); + + if ((merged_flags & JSON_PERMISSIVE)) + continue; + + if (reterr_bad_field) + *reterr_bad_field = p->name; + + return -ENXIO; + } + } + + return done; +} + +int json_dispatch_boolean(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + bool *b = ASSERT_PTR(userdata); + + assert(variant); + + if (!json_variant_is_boolean(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a boolean.", strna(name)); + + *b = json_variant_boolean(variant); + return 0; +} + +int json_dispatch_tristate(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + int *b = ASSERT_PTR(userdata); + + assert(variant); + + if (json_variant_is_null(variant)) { + *b = -1; + return 0; + } + + if (!json_variant_is_boolean(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a boolean.", strna(name)); + + *b = json_variant_boolean(variant); + return 0; +} + +int json_dispatch_int64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + int64_t *i = ASSERT_PTR(userdata); + + assert(variant); + + /* Also accept numbers formatted as string, to increase compatibility with less capable JSON + * 
implementations that cannot do 64bit integers. */
+        if (json_variant_is_string(variant) && safe_atoi64(json_variant_string(variant), i) >= 0)
+                return 0;
+
+        if (!json_variant_is_integer(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer, nor one formatted as decimal string.", strna(name));
+
+        *i = json_variant_integer(variant);
+        return 0;
+}
+
+/* Dispatch callback: stores an unsigned 64bit number in the uint64_t that 'userdata' points to. */
+int json_dispatch_uint64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint64_t *out = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* 64bit values (unsigned ones in particular) are troublesome in JSON, hence a string that parses
+         * as an unsigned number is accepted, too. Callers that do not want this shall request the unsigned
+         * type via the .type field of their JsonDispatch entry instead of _JSON_VARIANT_TYPE_INVALID, so
+         * that json_dispatch() refuses the string before we are even called. */
+        if (json_variant_is_string(variant)) {
+                if (safe_atou64(json_variant_string(variant), out) >= 0)
+                        return 0;
+        }
+
+        if (json_variant_is_unsigned(variant)) {
+                *out = json_variant_unsigned(variant);
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an unsigned integer, nor one formatted as decimal string.", strna(name));
+}
+
+/* Dispatch callback: parses via json_dispatch_uint64() and range-checks the result against UINT32_MAX
+ * before storing it in the uint32_t that 'userdata' points to. */
+int json_dispatch_uint32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint32_t *out = ASSERT_PTR(userdata);
+        uint64_t wide;
+        int r;
+
+        assert(variant);
+
+        r = json_dispatch_uint64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide <= UINT32_MAX) {
+                *out = (uint32_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+int json_dispatch_int32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int32_t *i = ASSERT_PTR(userdata);
+        int64_t i64;
+        int r;
+
+        assert(variant);
+
+        r = json_dispatch_int64(name, variant, flags, &i64);
+        if (r < 0)
+                return r;
+
+
if (i64 < INT32_MIN || i64 > INT32_MAX)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+
+        *i = (int32_t) i64;
+        return 0;
+}
+
+/* Dispatch callback: parses via json_dispatch_int64() and stores the value in the int16_t that
+ * 'userdata' points to, provided it lies within INT16_MIN..INT16_MAX. */
+int json_dispatch_int16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int16_t *out = ASSERT_PTR(userdata);
+        int64_t wide;
+        int r;
+
+        assert(variant);
+
+        r = json_dispatch_int64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide >= INT16_MIN && wide <= INT16_MAX) {
+                *out = (int16_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+/* Dispatch callback: parses via json_dispatch_uint64() and stores the value in the uint16_t that
+ * 'userdata' points to, provided it does not exceed UINT16_MAX. */
+int json_dispatch_uint16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint16_t *out = ASSERT_PTR(userdata);
+        uint64_t wide;
+        int r;
+
+        assert(variant);
+
+        r = json_dispatch_uint64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide <= UINT16_MAX) {
+                *out = (uint16_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+/* Dispatch callback: duplicates a JSON string into the 'char*' that 'userdata' points to, releasing what
+ * was stored there before. A JSON null frees the old value and leaves NULL behind. If JSON_SAFE is set
+ * in 'flags', strings failing string_is_safe() are refused. */
+int json_dispatch_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **dest = ASSERT_PTR(userdata);
+        const char *text;
+        int r;
+
+        assert(variant);
+
+        if (json_variant_is_null(variant)) {
+                *dest = mfree(*dest);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        text = json_variant_string(variant);
+
+        if ((flags & JSON_SAFE) != 0 && !string_is_safe(text))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name));
+
+        r = free_and_strdup(dest, text);
+        if (r >= 0)
+                return 0;
+
+        return json_log(variant, flags, r, "Failed to allocate string: %m");
+}
+
+int json_dispatch_const_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        const char
**s = ASSERT_PTR(userdata); + + assert(variant); + + if (json_variant_is_null(variant)) { + *s = NULL; + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + if ((flags & JSON_SAFE) && !string_is_safe(json_variant_string(variant))) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name)); + + *s = json_variant_string(variant); + return 0; +} + +int json_dispatch_strv(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + char ***s = ASSERT_PTR(userdata); + JsonVariant *e; + int r; + + assert(variant); + + if (json_variant_is_null(variant)) { + *s = strv_free(*s); + return 0; + } + + /* Let's be flexible here: accept a single string in place of a single-item array */ + if (json_variant_is_string(variant)) { + if ((flags & JSON_SAFE) && !string_is_safe(json_variant_string(variant))) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name)); + + l = strv_new(json_variant_string(variant)); + if (!l) + return log_oom(); + + strv_free_and_replace(*s, l); + return 0; + } + + if (!json_variant_is_array(variant)) + return json_log(variant, SYNTHETIC_ERRNO(EINVAL), flags, "JSON field '%s' is not an array.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string."); + + if ((flags & JSON_SAFE) && !string_is_safe(json_variant_string(e))) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name)); + + r = strv_extend(&l, json_variant_string(e)); + if (r < 0) + return json_log(e, flags, r, "Failed to append array element: %m"); + } + + strv_free_and_replace(*s, l); + 
return 0;
+}
+
+/* Dispatch callback: stores 'variant' in the 'JsonVariant*' that 'userdata' points to, acquiring a
+ * reference of its own. */
+int json_dispatch_variant(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        JsonVariant **slot = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* A new reference is taken for the destination */
+        JSON_VARIANT_REPLACE(*slot, json_variant_ref(variant));
+
+        return 0;
+}
+
+/* Dispatch callback: stores the bare 'variant' pointer in the 'JsonVariant*' that 'userdata' points to. */
+int json_dispatch_variant_noref(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        JsonVariant **slot = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* No reference is taken, the pointer is merely borrowed */
+        *slot = variant;
+
+        return 0;
+}
+
+/* Dispatch callback: stores a numeric UID or GID in the uid_t that 'userdata' points to. A JSON null
+ * yields UID_INVALID. Values above UINT32_MAX or refused by uid_is_valid() are rejected. */
+int json_dispatch_uid_gid(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uid_t *out = userdata;
+        uint64_t raw;
+
+        /* One implementation serves both UIDs and GIDs, hence make sure the two types really match */
+        assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+        assert_cc(sizeof(gid_t) == sizeof(uint32_t));
+
+        DISABLE_WARNING_TYPE_LIMITS;
+        assert_cc((UID_INVALID < (uid_t) 0) == (GID_INVALID < (gid_t) 0));
+        REENABLE_WARNING;
+
+        if (json_variant_is_null(variant)) {
+                *out = UID_INVALID;
+                return 0;
+        }
+
+        if (!json_variant_is_unsigned(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name));
+
+        raw = json_variant_unsigned(variant);
+        if (raw <= UINT32_MAX && uid_is_valid(raw)) {
+                *out = raw;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid UID/GID.", strna(name));
+}
+
+int json_dispatch_user_group_name(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **s = userdata;
+        const char *n;
+        int r;
+
+        if (json_variant_is_null(variant)) {
+                *s = mfree(*s);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        n = json_variant_string(variant);
+        if (!valid_user_group_name(n, FLAGS_SET(flags, JSON_RELAX) ?
VALID_USER_RELAX : 0)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid user/group name.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +int json_dispatch_id128(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + sd_id128_t *uuid = ASSERT_PTR(userdata); /* output slot; same NULL check as the other dispatchers that dereference userdata */ + int r; + + if (json_variant_is_null(variant)) { + *uuid = SD_ID128_NULL; /* JSON null resets the output rather than failing */ + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + r = sd_id128_from_string(json_variant_string(variant), uuid); /* parses straight into the caller's storage */ + if (r < 0) + return json_log(variant, flags, r, "JSON field '%s' is not a valid UUID.", strna(name)); + + return 0; +} + +int json_dispatch_unsupported(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not allowed in this object.", strna(name)); +} + +int json_dispatch_unbase64_iovec(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + _cleanup_free_ void *buffer = NULL; + struct iovec *iov = ASSERT_PTR(userdata); + size_t sz; + int r; + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + r = json_variant_unbase64(variant, &buffer, &sz); + if (r < 0) + return json_log(variant, flags, r, "JSON field '%s' is not valid Base64 data.", strna(name)); + + free_and_replace(iov->iov_base, buffer); + iov->iov_len = sz; + return 0; +} + +static int json_cmp_strings(const void *x, const void *y) { + JsonVariant *const *a = x, *const *b = y; + + if (!json_variant_is_string(*a) || !json_variant_is_string(*b)) + return CMP(*a, *b); + + return strcmp(json_variant_string(*a), json_variant_string(*b)); +} + +int
json_variant_sort(JsonVariant **v) { + _cleanup_free_ JsonVariant **a = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *n = NULL; + size_t m; + int r; + + assert(v); + + if (json_variant_is_sorted(*v)) + return 0; + + if (!json_variant_is_object(*v)) + return -EMEDIUMTYPE; + + /* Sorts the key/value pairs in an object variant. Returns 0 if *v was already sorted, 1 if it was replaced by a sorted copy, and a negative errno-style value on failure. */ + + m = json_variant_elements(*v); + a = new(JsonVariant*, m); + if (!a) + return -ENOMEM; + + for (size_t i = 0; i < m; i++) + a[i] = json_variant_by_index(*v, i); + + qsort(a, m/2, sizeof(JsonVariant*)*2, json_cmp_strings); /* each sorted element is a key/value pointer pair; the comparator only looks at the first pointer, i.e. the key */ + + r = json_variant_new_object(&n, a, m); + if (r < 0) + return r; + + json_variant_propagate_sensitive(*v, n); + + if (!n->sorted) /* Check if this worked. This will fail if there are multiple identical keys used. */ + return -ENOTUNIQ; + + JSON_VARIANT_REPLACE(*v, TAKE_PTR(n)); + + return 1; +} + +int json_variant_normalize(JsonVariant **v) { + _cleanup_free_ JsonVariant **a = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *n = NULL; + size_t i, m; + int r; + + assert(v); + + if (json_variant_is_normalized(*v)) + return 0; + + if (!json_variant_is_object(*v) && !json_variant_is_array(*v)) + return -EMEDIUMTYPE; + + /* Sorts the key/value pairs in an object variant anywhere down the tree in the specified variant */ + + m = json_variant_elements(*v); + a = new(JsonVariant*, m); + if (!a) + return -ENOMEM; + + for (i = 0; i < m; ) { + a[i] = json_variant_ref(json_variant_by_index(*v, i)); + i++; /* count the new reference before recursing, so that the cleanup at 'finish' releases it on failure too */ + + r = json_variant_normalize(&a[i-1]); + if (r < 0) + goto finish; + } + + qsort(a, m/2, sizeof(JsonVariant*)*2, json_cmp_strings); /* NOTE(review): this runs for arrays as well as objects, so array elements may get reordered in pairs too; confirm this is intended */ + + if (json_variant_is_object(*v)) + r = json_variant_new_object(&n, a, m); + else { + assert(json_variant_is_array(*v)); + r = json_variant_new_array(&n, a, m); + } + if (r < 0) + goto finish; + + json_variant_propagate_sensitive(*v, n); + + if (!n->normalized) { /* Let's see if normalization worked.
It will fail if there are multiple + * identical keys used in the same object anywhere, or if there are floating + * point numbers used (see below) */ + r = -ENOTUNIQ; + goto finish; + } + + JSON_VARIANT_REPLACE(*v, TAKE_PTR(n)); + + r = 1; + +finish: + for (size_t j = 0; j < i; j++) + json_variant_unref(a[j]); + + return r; +} + +bool json_variant_is_normalized(JsonVariant *v) { + /* For now, let's consider anything containing numbers not expressible as integers as non-normalized. + * That's because we cannot sensibly compare them due to accuracy issues, nor even store them if they + * are too large. */ + if (json_variant_is_real(v) && !json_variant_is_integer(v) && !json_variant_is_unsigned(v)) + return false; + + /* The concept only applies to variants that include other variants, i.e. objects and arrays. All + * others are normalized anyway. */ + if (!json_variant_is_object(v) && !json_variant_is_array(v)) + return true; + + /* Empty objects/arrays don't include any other variant, hence are always normalized too */ + if (json_variant_elements(v) == 0) + return true; + + return v->normalized; /* For everything else there's an explicit boolean we maintain */ +} + +bool json_variant_is_sorted(JsonVariant *v) { + + /* Returns true if all key/value pairs of an object are properly sorted. Note that this only applies + * to objects, not arrays. 
*/ + + if (!json_variant_is_object(v)) + return true; + if (json_variant_elements(v) <= 1) + return true; + + return v->sorted; +} + +int json_variant_unbase64(JsonVariant *v, void **ret, size_t *ret_size) { + if (!json_variant_is_string(v)) + return -EINVAL; + + return unbase64mem(json_variant_string(v), SIZE_MAX, ret, ret_size); +} + +int json_variant_unhex(JsonVariant *v, void **ret, size_t *ret_size) { + if (!json_variant_is_string(v)) + return -EINVAL; + + return unhexmem(json_variant_string(v), SIZE_MAX, ret, ret_size); +} + +static const char* const json_variant_type_table[_JSON_VARIANT_TYPE_MAX] = { + [JSON_VARIANT_STRING] = "string", + [JSON_VARIANT_INTEGER] = "integer", + [JSON_VARIANT_UNSIGNED] = "unsigned", + [JSON_VARIANT_REAL] = "real", + [JSON_VARIANT_NUMBER] = "number", + [JSON_VARIANT_BOOLEAN] = "boolean", + [JSON_VARIANT_ARRAY] = "array", + [JSON_VARIANT_OBJECT] = "object", + [JSON_VARIANT_NULL] = "null", +}; + +DEFINE_STRING_TABLE_LOOKUP(json_variant_type, JsonVariantType); diff --git a/src/shared/json.h b/src/shared/json.h new file mode 100644 index 0000000..c40c234 --- /dev/null +++ b/src/shared/json.h @@ -0,0 +1,474 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "ether-addr-util.h" +#include "in-addr-util.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +/* + In case you wonder why we have our own JSON implementation, here are a couple of reasons why this implementation has + benefits over various other implementations: + + - We need support for 64-bit signed and unsigned integers, i.e. 
the full 64.5-bit range of -9223372036854775808…18446744073709551615 + - All our variants are immutable after creation + - Special values such as true, false, zero, null, empty strings, empty arrays, empty objects require zero dynamic memory + - Progressive parsing + - Our integer/real type implicitly converts, but only if that's safe and losslessly possible + - There's a "builder" for putting together objects easily in varargs function calls + - There's a "dispatcher" for mapping objects to C data structures + - Every variant optionally carries parsing location information, which simplifies debugging and parse log error generation + - Formatter has color, line, column support + + Limitations: + - Doesn't allow embedded NUL in strings + - Can't store integers outside of the -9223372036854775808…18446744073709551615 range (it will use 'double' for + values outside this range, which is lossy) + - Can't store negative zero (will be treated identically to positive zero, and not retained across serialization) + - Can't store non-integer numbers that can't be stored in "double" losslessly + - Allows creation and parsing of objects with duplicate keys. The "dispatcher" will refuse them however. This means + we can parse and pass around such objects, but will carefully refuse them when we convert them into our own data. + + (These limitations should be pretty much in line with those of other JSON implementations, in fact might be less + limiting in most cases even.)
+*/ + +typedef struct JsonVariant JsonVariant; + +typedef enum JsonVariantType { + JSON_VARIANT_STRING, + JSON_VARIANT_INTEGER, + JSON_VARIANT_UNSIGNED, + JSON_VARIANT_REAL, + JSON_VARIANT_NUMBER, /* This a pseudo-type: we can never create variants of this type, but we use it as wildcard check for the above three types */ + JSON_VARIANT_BOOLEAN, + JSON_VARIANT_ARRAY, + JSON_VARIANT_OBJECT, + JSON_VARIANT_NULL, + _JSON_VARIANT_TYPE_MAX, + _JSON_VARIANT_TYPE_INVALID = -EINVAL, +} JsonVariantType; + +int json_variant_new_stringn(JsonVariant **ret, const char *s, size_t n); +int json_variant_new_base64(JsonVariant **ret, const void *p, size_t n); +int json_variant_new_base32hex(JsonVariant **ret, const void *p, size_t n); +int json_variant_new_hex(JsonVariant **ret, const void *p, size_t n); +int json_variant_new_octescape(JsonVariant **ret, const void *p, size_t n); +int json_variant_new_integer(JsonVariant **ret, int64_t i); +int json_variant_new_unsigned(JsonVariant **ret, uint64_t u); +int json_variant_new_real(JsonVariant **ret, double d); +int json_variant_new_boolean(JsonVariant **ret, bool b); +int json_variant_new_array(JsonVariant **ret, JsonVariant **array, size_t n); +int json_variant_new_array_bytes(JsonVariant **ret, const void *p, size_t n); +int json_variant_new_array_strv(JsonVariant **ret, char **l); +int json_variant_new_object(JsonVariant **ret, JsonVariant **array, size_t n); +int json_variant_new_null(JsonVariant **ret); +int json_variant_new_id128(JsonVariant **ret, sd_id128_t id); +int json_variant_new_uuid(JsonVariant **ret, sd_id128_t id); + +static inline int json_variant_new_string(JsonVariant **ret, const char *s) { + return json_variant_new_stringn(ret, s, SIZE_MAX); +} + +JsonVariant *json_variant_ref(JsonVariant *v); +JsonVariant *json_variant_unref(JsonVariant *v); +void json_variant_unref_many(JsonVariant **array, size_t n); + +#define JSON_VARIANT_REPLACE(v, q) \ + do { \ + typeof(v)* _v = &(v); \ + typeof(q) _q = (q); \ + 
json_variant_unref(*_v); \ + *_v = _q; \ + } while(0) + +DEFINE_TRIVIAL_CLEANUP_FUNC(JsonVariant *, json_variant_unref); + +const char *json_variant_string(JsonVariant *v); +int64_t json_variant_integer(JsonVariant *v); +uint64_t json_variant_unsigned(JsonVariant *v); +double json_variant_real(JsonVariant *v); +bool json_variant_boolean(JsonVariant *v); + +JsonVariantType json_variant_type(JsonVariant *v); +bool json_variant_has_type(JsonVariant *v, JsonVariantType type); + +static inline bool json_variant_is_string(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_STRING); +} + +static inline bool json_variant_is_integer(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_INTEGER); +} + +static inline bool json_variant_is_unsigned(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_UNSIGNED); +} + +static inline bool json_variant_is_real(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_REAL); +} + +static inline bool json_variant_is_number(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_NUMBER); +} + +static inline bool json_variant_is_boolean(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_BOOLEAN); +} + +static inline bool json_variant_is_array(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_ARRAY); +} + +static inline bool json_variant_is_object(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_OBJECT); +} + +static inline bool json_variant_is_null(JsonVariant *v) { + return json_variant_has_type(v, JSON_VARIANT_NULL); +} + +bool json_variant_is_negative(JsonVariant *v); +bool json_variant_is_blank_object(JsonVariant *v); +bool json_variant_is_blank_array(JsonVariant *v); +bool json_variant_is_normalized(JsonVariant *v); +bool json_variant_is_sorted(JsonVariant *v); + +size_t json_variant_elements(JsonVariant *v); +JsonVariant *json_variant_by_index(JsonVariant *v, size_t index); +JsonVariant *json_variant_by_key(JsonVariant *v, const char 
*key); +JsonVariant *json_variant_by_key_full(JsonVariant *v, const char *key, JsonVariant **ret_key); + +bool json_variant_equal(JsonVariant *a, JsonVariant *b); + +void json_variant_sensitive(JsonVariant *v); +bool json_variant_is_sensitive(JsonVariant *v); + +struct json_variant_foreach_state { + JsonVariant *variant; + size_t idx; +}; + +#define _JSON_VARIANT_ARRAY_FOREACH(i, v, state) \ + for (struct json_variant_foreach_state state = { (v), 0 }; \ + json_variant_is_array(state.variant) && \ + state.idx < json_variant_elements(state.variant) && \ + ({ i = json_variant_by_index(state.variant, state.idx); \ + true; }); \ + state.idx++) +#define JSON_VARIANT_ARRAY_FOREACH(i, v) \ + _JSON_VARIANT_ARRAY_FOREACH(i, v, UNIQ_T(state, UNIQ)) + +#define _JSON_VARIANT_OBJECT_FOREACH(k, e, v, state) \ + for (struct json_variant_foreach_state state = { (v), 0 }; \ + json_variant_is_object(state.variant) && \ + state.idx < json_variant_elements(state.variant) && \ + ({ k = json_variant_string(json_variant_by_index(state.variant, state.idx)); \ + e = json_variant_by_index(state.variant, state.idx + 1); \ + true; }); \ + state.idx += 2) +#define JSON_VARIANT_OBJECT_FOREACH(k, e, v) \ + _JSON_VARIANT_OBJECT_FOREACH(k, e, v, UNIQ_T(state, UNIQ)) + +int json_variant_get_source(JsonVariant *v, const char **ret_source, unsigned *ret_line, unsigned *ret_column); + +typedef enum JsonFormatFlags { + JSON_FORMAT_NEWLINE = 1 << 0, /* suffix with newline */ + JSON_FORMAT_PRETTY = 1 << 1, /* add internal whitespace to appeal to human readers */ + JSON_FORMAT_PRETTY_AUTO = 1 << 2, /* same, but only if connected to a tty (and JSON_FORMAT_NEWLINE otherwise) */ + JSON_FORMAT_COLOR = 1 << 3, /* insert ANSI color sequences */ + JSON_FORMAT_COLOR_AUTO = 1 << 4, /* insert ANSI color sequences if colors_enabled() says so */ + JSON_FORMAT_SOURCE = 1 << 5, /* prefix with source filename/line/column */ + JSON_FORMAT_SSE = 1 << 6, /* prefix/suffix with W3C server-sent events */ + JSON_FORMAT_SEQ = 1 
<< 7, /* prefix/suffix with RFC 7464 application/json-seq */ + JSON_FORMAT_FLUSH = 1 << 8, /* call fflush() after dumping JSON */ + JSON_FORMAT_EMPTY_ARRAY = 1 << 9, /* output "[]" for empty input */ + JSON_FORMAT_OFF = 1 << 10, /* make json_variant_format() fail with -ENOEXEC */ +} JsonFormatFlags; + +int json_variant_format(JsonVariant *v, JsonFormatFlags flags, char **ret); +int json_variant_dump(JsonVariant *v, JsonFormatFlags flags, FILE *f, const char *prefix); + +int json_variant_filter(JsonVariant **v, char **to_remove); + +int json_variant_set_field(JsonVariant **v, const char *field, JsonVariant *value); +int json_variant_set_fieldb(JsonVariant **v, const char *field, ...); +int json_variant_set_field_string(JsonVariant **v, const char *field, const char *value); +int json_variant_set_field_integer(JsonVariant **v, const char *field, int64_t value); +int json_variant_set_field_unsigned(JsonVariant **v, const char *field, uint64_t value); +int json_variant_set_field_boolean(JsonVariant **v, const char *field, bool b); +int json_variant_set_field_strv(JsonVariant **v, const char *field, char **l); + +static inline int json_variant_set_field_non_null(JsonVariant **v, const char *field, JsonVariant *value) { + return value && !json_variant_is_null(value) ? json_variant_set_field(v, field, value) : 0; +} + +JsonVariant *json_variant_find(JsonVariant *haystack, JsonVariant *needle); + +int json_variant_append_array(JsonVariant **v, JsonVariant *element); +int json_variant_append_arrayb(JsonVariant **v, ...); +int json_variant_append_array_nodup(JsonVariant **v, JsonVariant *element); + +int json_variant_merge_object(JsonVariant **v, JsonVariant *m); +int json_variant_merge_objectb(JsonVariant **v, ...); + +int json_variant_strv(JsonVariant *v, char ***ret); + +int json_variant_sort(JsonVariant **v); +int json_variant_normalize(JsonVariant **v); + +typedef enum JsonParseFlags { + JSON_PARSE_SENSITIVE = 1 << 0, /* mark variant as "sensitive", i.e. 
something containing secret key material or such */ +} JsonParseFlags; + +int json_parse_with_source(const char *string, const char *source, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column); +int json_parse_with_source_continue(const char **p, const char *source, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column); + +static inline int json_parse(const char *string, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column) { + return json_parse_with_source(string, NULL, flags, ret, ret_line, ret_column); +} +static inline int json_parse_continue(const char **p, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column) { + return json_parse_with_source_continue(p, NULL, flags, ret, ret_line, ret_column); +} + +int json_parse_file_at(FILE *f, int dir_fd, const char *path, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column); + +static inline int json_parse_file(FILE *f, const char *path, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column) { + return json_parse_file_at(f, AT_FDCWD, path, flags, ret, ret_line, ret_column); +} + +enum { + _JSON_BUILD_STRING, + _JSON_BUILD_INTEGER, + _JSON_BUILD_UNSIGNED, + _JSON_BUILD_REAL, + _JSON_BUILD_BOOLEAN, + _JSON_BUILD_ARRAY_BEGIN, + _JSON_BUILD_ARRAY_END, + _JSON_BUILD_OBJECT_BEGIN, + _JSON_BUILD_OBJECT_END, + _JSON_BUILD_PAIR, + _JSON_BUILD_PAIR_CONDITION, + _JSON_BUILD_NULL, + _JSON_BUILD_VARIANT, + _JSON_BUILD_VARIANT_ARRAY, + _JSON_BUILD_LITERAL, + _JSON_BUILD_STRV, + _JSON_BUILD_STRV_ENV_PAIR, + _JSON_BUILD_BASE64, + _JSON_BUILD_IOVEC_BASE64, + _JSON_BUILD_BASE32HEX, + _JSON_BUILD_HEX, + _JSON_BUILD_OCTESCAPE, + _JSON_BUILD_ID128, + _JSON_BUILD_UUID, + _JSON_BUILD_BYTE_ARRAY, + _JSON_BUILD_HW_ADDR, + _JSON_BUILD_STRING_SET, + _JSON_BUILD_CALLBACK, + _JSON_BUILD_PAIR_UNSIGNED_NON_ZERO, + _JSON_BUILD_PAIR_FINITE_USEC, + 
_JSON_BUILD_PAIR_STRING_NON_EMPTY, + _JSON_BUILD_PAIR_STRV_NON_EMPTY, + _JSON_BUILD_PAIR_VARIANT_NON_NULL, + _JSON_BUILD_PAIR_VARIANT_ARRAY_NON_EMPTY, + _JSON_BUILD_PAIR_IN4_ADDR_NON_NULL, + _JSON_BUILD_PAIR_IN6_ADDR_NON_NULL, + _JSON_BUILD_PAIR_IN_ADDR_NON_NULL, + _JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL, + _JSON_BUILD_PAIR_HW_ADDR_NON_NULL, + _JSON_BUILD_MAX, +}; + +typedef int (*JsonBuildCallback)(JsonVariant **ret, const char *name, void *userdata); + +#define JSON_BUILD_STRING(s) _JSON_BUILD_STRING, (const char*) { s } +#define JSON_BUILD_INTEGER(i) _JSON_BUILD_INTEGER, (int64_t) { i } +#define JSON_BUILD_UNSIGNED(u) _JSON_BUILD_UNSIGNED, (uint64_t) { u } +#define JSON_BUILD_REAL(d) _JSON_BUILD_REAL, (double) { d } +#define JSON_BUILD_BOOLEAN(b) _JSON_BUILD_BOOLEAN, (bool) { b } +#define JSON_BUILD_ARRAY(...) _JSON_BUILD_ARRAY_BEGIN, __VA_ARGS__, _JSON_BUILD_ARRAY_END +#define JSON_BUILD_EMPTY_ARRAY _JSON_BUILD_ARRAY_BEGIN, _JSON_BUILD_ARRAY_END +#define JSON_BUILD_OBJECT(...) _JSON_BUILD_OBJECT_BEGIN, __VA_ARGS__, _JSON_BUILD_OBJECT_END +#define JSON_BUILD_EMPTY_OBJECT _JSON_BUILD_OBJECT_BEGIN, _JSON_BUILD_OBJECT_END +#define JSON_BUILD_PAIR(n, ...) _JSON_BUILD_PAIR, (const char*) { n }, __VA_ARGS__ +#define JSON_BUILD_PAIR_CONDITION(c, n, ...) 
_JSON_BUILD_PAIR_CONDITION, (bool) { c }, (const char*) { n }, __VA_ARGS__ +#define JSON_BUILD_NULL _JSON_BUILD_NULL +#define JSON_BUILD_VARIANT(v) _JSON_BUILD_VARIANT, (JsonVariant*) { v } +#define JSON_BUILD_VARIANT_ARRAY(v, n) _JSON_BUILD_VARIANT_ARRAY, (JsonVariant **) { v }, (size_t) { n } +#define JSON_BUILD_LITERAL(l) _JSON_BUILD_LITERAL, (const char*) { l } +#define JSON_BUILD_STRV(l) _JSON_BUILD_STRV, (char**) { l } +#define JSON_BUILD_STRV_ENV_PAIR(l) _JSON_BUILD_STRV_ENV_PAIR, (char**) { l } +#define JSON_BUILD_BASE64(p, n) _JSON_BUILD_BASE64, (const void*) { p }, (size_t) { n } +#define JSON_BUILD_IOVEC_BASE64(iov) _JSON_BUILD_IOVEC_BASE64, (const struct iovec*) { iov } +#define JSON_BUILD_BASE32HEX(p, n) _JSON_BUILD_BASE32HEX, (const void*) { p }, (size_t) { n } +#define JSON_BUILD_HEX(p, n) _JSON_BUILD_HEX, (const void*) { p }, (size_t) { n } +#define JSON_BUILD_OCTESCAPE(p, n) _JSON_BUILD_OCTESCAPE, (const void*) { p }, (size_t) { n } +#define JSON_BUILD_ID128(id) _JSON_BUILD_ID128, (const sd_id128_t*) { &(id) } +#define JSON_BUILD_UUID(id) _JSON_BUILD_UUID, (const sd_id128_t*) { &(id) } +#define JSON_BUILD_BYTE_ARRAY(v, n) _JSON_BUILD_BYTE_ARRAY, (const void*) { v }, (size_t) { n } +#define JSON_BUILD_CONST_STRING(s) _JSON_BUILD_VARIANT, JSON_VARIANT_STRING_CONST(s) +#define JSON_BUILD_IN4_ADDR(v) JSON_BUILD_BYTE_ARRAY((const struct in_addr*) { v }, sizeof(struct in_addr)) +#define JSON_BUILD_IN6_ADDR(v) JSON_BUILD_BYTE_ARRAY((const struct in6_addr*) { v }, sizeof(struct in6_addr)) +#define JSON_BUILD_IN_ADDR(v, f) JSON_BUILD_BYTE_ARRAY(((const union in_addr_union*) { v })->bytes, FAMILY_ADDRESS_SIZE_SAFE(f)) +#define JSON_BUILD_ETHER_ADDR(v) JSON_BUILD_BYTE_ARRAY(((const struct ether_addr*) { v })->ether_addr_octet, sizeof(struct ether_addr)) +#define JSON_BUILD_HW_ADDR(v) _JSON_BUILD_HW_ADDR, (const struct hw_addr_data*) { v } +#define JSON_BUILD_STRING_SET(s) _JSON_BUILD_STRING_SET, (Set *) { s } +#define JSON_BUILD_CALLBACK(c, u) 
_JSON_BUILD_CALLBACK, (JsonBuildCallback) { c }, (void*) { u } + +#define JSON_BUILD_PAIR_STRING(name, s) JSON_BUILD_PAIR(name, JSON_BUILD_STRING(s)) +#define JSON_BUILD_PAIR_INTEGER(name, i) JSON_BUILD_PAIR(name, JSON_BUILD_INTEGER(i)) +#define JSON_BUILD_PAIR_UNSIGNED(name, u) JSON_BUILD_PAIR(name, JSON_BUILD_UNSIGNED(u)) +#define JSON_BUILD_PAIR_REAL(name, d) JSON_BUILD_PAIR(name, JSON_BUILD_REAL(d)) +#define JSON_BUILD_PAIR_BOOLEAN(name, b) JSON_BUILD_PAIR(name, JSON_BUILD_BOOLEAN(b)) +#define JSON_BUILD_PAIR_ARRAY(name, ...) JSON_BUILD_PAIR(name, JSON_BUILD_ARRAY(__VA_ARGS__)) +#define JSON_BUILD_PAIR_EMPTY_ARRAY(name) JSON_BUILD_PAIR(name, JSON_BUILD_EMPTY_ARRAY) +#define JSON_BUILD_PAIR_OBJECT(name, ...) JSON_BUILD_PAIR(name, JSON_BUILD_OBJECT(__VA_ARGS__)) +#define JSON_BUILD_PAIR_EMPTY_OBJECT(name) JSON_BUILD_PAIR(name, JSON_BUILD_EMPTY_OBJECT) +#define JSON_BUILD_PAIR_NULL(name) JSON_BUILD_PAIR(name, JSON_BUILD_NULL) +#define JSON_BUILD_PAIR_VARIANT(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_VARIANT(v)) +#define JSON_BUILD_PAIR_VARIANT_ARRAY(name, v, n) JSON_BUILD_PAIR(name, JSON_BUILD_VARIANT_ARRAY(v, n)) +#define JSON_BUILD_PAIR_LITERAL(name, l) JSON_BUILD_PAIR(name, JSON_BUILD_LITERAL(l)) +#define JSON_BUILD_PAIR_STRV(name, l) JSON_BUILD_PAIR(name, JSON_BUILD_STRV(l)) +#define JSON_BUILD_PAIR_BASE64(name, p, n) JSON_BUILD_PAIR(name, JSON_BUILD_BASE64(p, n)) +#define JSON_BUILD_PAIR_IOVEC_BASE64(name, iov) JSON_BUILD_PAIR(name, JSON_BUILD_IOVEC_BASE64(iov)) +#define JSON_BUILD_PAIR_HEX(name, p, n) JSON_BUILD_PAIR(name, JSON_BUILD_HEX(p, n)) +#define JSON_BUILD_PAIR_ID128(name, id) JSON_BUILD_PAIR(name, JSON_BUILD_ID128(id)) +#define JSON_BUILD_PAIR_UUID(name, id) JSON_BUILD_PAIR(name, JSON_BUILD_UUID(id)) +#define JSON_BUILD_PAIR_BYTE_ARRAY(name, v, n) JSON_BUILD_PAIR(name, JSON_BUILD_BYTE_ARRAY(v, n)) +#define JSON_BUILD_PAIR_IN4_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_IN4_ADDR(v)) +#define JSON_BUILD_PAIR_IN6_ADDR(name, v) JSON_BUILD_PAIR(name, 
JSON_BUILD_IN6_ADDR(v)) +#define JSON_BUILD_PAIR_IN_ADDR(name, v, f) JSON_BUILD_PAIR(name, JSON_BUILD_IN_ADDR(v, f)) +#define JSON_BUILD_PAIR_ETHER_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_ETHER_ADDR(v)) +#define JSON_BUILD_PAIR_HW_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_HW_ADDR(v)) +#define JSON_BUILD_PAIR_STRING_SET(name, s) JSON_BUILD_PAIR(name, JSON_BUILD_STRING_SET(s)) +#define JSON_BUILD_PAIR_CALLBACK(name, c, u) JSON_BUILD_PAIR(name, JSON_BUILD_CALLBACK(c, u)) + +#define JSON_BUILD_PAIR_UNSIGNED_NON_ZERO(name, u) _JSON_BUILD_PAIR_UNSIGNED_NON_ZERO, (const char*) { name }, (uint64_t) { u } +#define JSON_BUILD_PAIR_FINITE_USEC(name, u) _JSON_BUILD_PAIR_FINITE_USEC, (const char*) { name }, (usec_t) { u } +#define JSON_BUILD_PAIR_STRING_NON_EMPTY(name, s) _JSON_BUILD_PAIR_STRING_NON_EMPTY, (const char*) { name }, (const char*) { s } +#define JSON_BUILD_PAIR_STRV_NON_EMPTY(name, l) _JSON_BUILD_PAIR_STRV_NON_EMPTY, (const char*) { name }, (char**) { l } +#define JSON_BUILD_PAIR_VARIANT_NON_NULL(name, v) _JSON_BUILD_PAIR_VARIANT_NON_NULL, (const char*) { name }, (JsonVariant*) { v } +#define JSON_BUILD_PAIR_IN4_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_IN4_ADDR_NON_NULL, (const char*) { name }, (const struct in_addr*) { v } +#define JSON_BUILD_PAIR_IN6_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_IN6_ADDR_NON_NULL, (const char*) { name }, (const struct in6_addr*) { v } +#define JSON_BUILD_PAIR_IN_ADDR_NON_NULL(name, v, f) _JSON_BUILD_PAIR_IN_ADDR_NON_NULL, (const char*) { name }, (const union in_addr_union*) { v }, (int) { f } +#define JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL, (const char*) { name }, (const struct ether_addr*) { v } +#define JSON_BUILD_PAIR_HW_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_HW_ADDR_NON_NULL, (const char*) { name }, (const struct hw_addr_data*) { v } + +int json_build(JsonVariant **ret, ...); +int json_buildv(JsonVariant **ret, va_list ap); + +/* A bitmask of flags used by the dispatch logic. 
Note that this is a combined bit mask, that is generated from the bit + * mask originally passed into json_dispatch(), the individual bitmask associated with the static JsonDispatch callout + * entry, as well as the bitmask specified for json_log() calls */ +typedef enum JsonDispatchFlags { + /* The following flags may be set in JsonDispatch's .flags field or the json_dispatch() flags parameter */ + JSON_PERMISSIVE = 1 << 0, /* Shall parsing errors be considered fatal for this property? */ + JSON_MANDATORY = 1 << 1, /* Should existence of this property be mandatory? */ + JSON_LOG = 1 << 2, /* Should the parser log about errors? */ + JSON_SAFE = 1 << 3, /* Don't accept "unsafe" strings in json_dispatch_string() + json_dispatch_strv() */ + JSON_RELAX = 1 << 4, /* Use relaxed user name checking in json_dispatch_user_group_name */ + + /* The following two may be passed into json_log() in addition to those above */ + JSON_DEBUG = 1 << 5, /* Indicates that this log message is a debug message */ + JSON_WARNING = 1 << 6, /* Indicates that this log message is a warning message */ +} JsonDispatchFlags; + +typedef int (*JsonDispatchCallback)(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); + +typedef struct JsonDispatch { + const char *name; + JsonVariantType type; + JsonDispatchCallback callback; + size_t offset; + JsonDispatchFlags flags; +} JsonDispatch; + +int json_dispatch_full(JsonVariant *v, const JsonDispatch table[], JsonDispatchCallback bad, JsonDispatchFlags flags, void *userdata, const char **reterr_bad_field); + +static inline int json_dispatch(JsonVariant *v, const JsonDispatch table[], JsonDispatchFlags flags, void *userdata) { + return json_dispatch_full(v, table, NULL, flags, userdata, NULL); +} + +int json_dispatch_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_const_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int
json_dispatch_strv(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_boolean(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_tristate(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_variant(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_variant_noref(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_int64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_uint64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_uint32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_int32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_uint16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_int16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_uid_gid(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_user_group_name(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_id128(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_unsupported(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_unbase64_iovec(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); + +assert_cc(sizeof(uint32_t) == sizeof(unsigned)); +#define json_dispatch_uint json_dispatch_uint32 + +assert_cc(sizeof(int32_t) == sizeof(int)); +#define json_dispatch_int json_dispatch_int32 + +static inline int 
json_dispatch_level(JsonDispatchFlags flags) { + + /* Did the user request no logging? If so, then never log higher than LOG_DEBUG. Also, if this is marked as + * debug message, then also log at debug level. */ + + if (!(flags & JSON_LOG) || + (flags & JSON_DEBUG)) + return LOG_DEBUG; + + /* Are we invoked in permissive mode, or is this explicitly marked as warning message? Then this should be + * printed at LOG_WARNING */ + if (flags & (JSON_PERMISSIVE|JSON_WARNING)) + return LOG_WARNING; + + /* Otherwise it's an error. */ + return LOG_ERR; +} + +int json_log_internal(JsonVariant *variant, int level, int error, const char *file, int line, const char *func, const char *format, ...) _printf_(7, 8); + +#define json_log(variant, flags, error, ...) \ + ({ \ + int _level = json_dispatch_level(flags), _e = (error); \ + (log_get_max_level() >= LOG_PRI(_level)) \ + ? json_log_internal(variant, _level, _e, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__) \ + : -ERRNO_VALUE(_e); \ + }) + +#define json_log_oom(variant, flags) \ + json_log(variant, flags, SYNTHETIC_ERRNO(ENOMEM), "Out of memory.") + +#define JSON_VARIANT_STRING_CONST(x) _JSON_VARIANT_STRING_CONST(UNIQ, (x)) + +#define _JSON_VARIANT_STRING_CONST(xq, x) \ + ({ \ + _align_(2) static const char UNIQ_T(json_string_const, xq)[] = (x); \ + assert((((uintptr_t) UNIQ_T(json_string_const, xq)) & 1) == 0); \ + (JsonVariant*) ((uintptr_t) UNIQ_T(json_string_const, xq) + 1); \ + }) + +int json_variant_unbase64(JsonVariant *v, void **ret, size_t *ret_size); +int json_variant_unhex(JsonVariant *v, void **ret, size_t *ret_size); + +const char *json_variant_type_to_string(JsonVariantType t); +JsonVariantType json_variant_type_from_string(const char *s); diff --git a/src/shared/kbd-util.c b/src/shared/kbd-util.c new file mode 100644 index 0000000..2f2d161 --- /dev/null +++ b/src/shared/kbd-util.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "errno-util.h" +#include "kbd-util.h" +#include "log.h" 
+#include "nulstr-util.h" +#include "path-util.h" +#include "recurse-dir.h" +#include "set.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +struct recurse_dir_userdata { + const char *keymap_name; + Set *keymaps; +}; + +static int keymap_recurse_dir_callback( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + struct recurse_dir_userdata *data = userdata; + _cleanup_free_ char *p = NULL; + int r; + + assert(de); + + /* If 'keymap_name' is non-NULL, return true if keymap 'keymap_name' is found. Otherwise, add all + * keymaps to 'keymaps'. */ + + if (event != RECURSE_DIR_ENTRY) + return RECURSE_DIR_CONTINUE; + + if (!IN_SET(de->d_type, DT_REG, DT_LNK)) + return RECURSE_DIR_CONTINUE; + + const char *e = endswith(de->d_name, ".map") ?: endswith(de->d_name, ".map.gz"); + if (!e) + return RECURSE_DIR_CONTINUE; + + p = strndup(de->d_name, e - de->d_name); + if (!p) + return -ENOMEM; + + if (data->keymap_name) + return streq(p, data->keymap_name) ? 
1 : RECURSE_DIR_CONTINUE; + + assert(data->keymaps); + + if (!keymap_is_valid(p)) + return 0; + + r = set_consume(data->keymaps, TAKE_PTR(p)); + if (r < 0) + return r; + + return RECURSE_DIR_CONTINUE; +} + +int get_keymaps(char ***ret) { + _cleanup_set_free_free_ Set *keymaps = NULL; + int r; + + keymaps = set_new(&string_hash_ops); + if (!keymaps) + return -ENOMEM; + + NULSTR_FOREACH(dir, KBD_KEYMAP_DIRS) { + r = recurse_dir_at( + AT_FDCWD, + dir, + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE, + keymap_recurse_dir_callback, + &(struct recurse_dir_userdata) { + .keymaps = keymaps, + }); + if (r == -ENOENT) + continue; + if (ERRNO_IS_NEG_RESOURCE(r)) + return log_warning_errno(r, "Failed to read keymap list from %s: %m", dir); + if (r < 0) + log_debug_errno(r, "Failed to read keymap list from %s, ignoring: %m", dir); + } + + _cleanup_strv_free_ char **l = set_get_strv(keymaps); + if (!l) + return -ENOMEM; + + keymaps = set_free(keymaps); /* If we got the strv above, then do a set_free() rather than + * set_free_free() since the entries of the set are now owned by the + * strv */ + + if (strv_isempty(l)) + return -ENOENT; + + strv_sort(l); + + *ret = TAKE_PTR(l); + return 0; +} + +bool keymap_is_valid(const char *name) { + if (isempty(name)) + return false; + + if (strlen(name) >= 128) + return false; + + if (!utf8_is_valid(name)) + return false; + + if (!filename_is_valid(name)) + return false; + + if (!string_is_safe(name)) + return false; + + return true; +} + +int keymap_exists(const char *name) { + int r; + + if (!keymap_is_valid(name)) + return -EINVAL; + + NULSTR_FOREACH(dir, KBD_KEYMAP_DIRS) { + r = recurse_dir_at( + AT_FDCWD, + dir, + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE, + keymap_recurse_dir_callback, + &(struct recurse_dir_userdata) { + .keymap_name = name, + }); + if (r > 0) + return true; + if (ERRNO_IS_NEG_RESOURCE(r)) + return r; + 
if (r < 0 && r != -ENOENT) + log_debug_errno(r, "Failed to read keymap list from %s, ignoring: %m", dir); + } + + return false; +} diff --git a/src/shared/kbd-util.h b/src/shared/kbd-util.h new file mode 100644 index 0000000..aca0dee --- /dev/null +++ b/src/shared/kbd-util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#define KBD_KEYMAP_DIRS \ + "/usr/share/keymaps/\0" \ + "/usr/share/kbd/keymaps/\0" \ + "/usr/lib/kbd/keymaps/\0" + +int get_keymaps(char ***l); +bool keymap_is_valid(const char *name); +int keymap_exists(const char *name); diff --git a/src/shared/kernel-image.c b/src/shared/kernel-image.c new file mode 100644 index 0000000..7dc9e01 --- /dev/null +++ b/src/shared/kernel-image.c @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fileio.h" +#include "env-file.h" +#include "kernel-image.h" +#include "os-util.h" +#include "parse-util.h" +#include "pe-binary.h" +#include "string-table.h" + +#define PE_SECTION_READ_MAX (16U*1024U) + +static const char * const kernel_image_type_table[_KERNEL_IMAGE_TYPE_MAX] = { + [KERNEL_IMAGE_TYPE_UNKNOWN] = "unknown", + [KERNEL_IMAGE_TYPE_UKI] = "uki", + [KERNEL_IMAGE_TYPE_PE] = "pe", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(kernel_image_type, KernelImageType); + +static int uki_read_pretty_name( + int fd, + const PeHeader *pe_header, + const IMAGE_SECTION_HEADER *sections, + char **ret) { + + _cleanup_free_ char *pname = NULL, *name = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ void *osrel = NULL; + size_t osrel_size; + int r; + + assert(fd >= 0); + assert(pe_header); + assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0); + assert(ret); + + r = pe_read_section_data( + fd, + pe_header, + sections, + ".osrel", + /* max_size=*/ PE_SECTION_READ_MAX, + &osrel, + &osrel_size); + if (r == -ENXIO) { /* Section not found */ + *ret = NULL; + return 0; + } + + f = fmemopen(osrel, osrel_size, "r"); + 
if (!f) + return log_error_errno(errno, "Failed to open embedded os-release file: %m"); + + r = parse_env_file( + f, NULL, + "PRETTY_NAME", &pname, + "NAME", &name); + if (r < 0) + return log_error_errno(r, "Failed to parse embedded os-release file: %m"); + + /* follow the same logic as os_release_pretty_name() */ + if (!isempty(pname)) + *ret = TAKE_PTR(pname); + else if (!isempty(name)) + *ret = TAKE_PTR(name); + else { + char *n = strdup("Linux"); + if (!n) + return log_oom(); + + *ret = n; + } + + return 0; +} + +static int inspect_uki( + int fd, + const PeHeader *pe_header, + const IMAGE_SECTION_HEADER *sections, + char **ret_cmdline, + char **ret_uname, + char **ret_pretty_name) { + + _cleanup_free_ char *cmdline = NULL, *uname = NULL, *pname = NULL; + int r; + + assert(fd >= 0); + assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0); + + if (ret_cmdline) { + r = pe_read_section_data(fd, pe_header, sections, ".cmdline", PE_SECTION_READ_MAX, (void**) &cmdline, NULL); + if (r < 0 && r != -ENXIO) /* If the section doesn't exist, that's fine */ + return r; + } + + if (ret_uname) { + r = pe_read_section_data(fd, pe_header, sections, ".uname", PE_SECTION_READ_MAX, (void**) &uname, NULL); + if (r < 0 && r != -ENXIO) /* If the section doesn't exist, that's fine */ + return r; + } + + if (ret_pretty_name) { + r = uki_read_pretty_name(fd, pe_header, sections, &pname); + if (r < 0) + return r; + } + + if (ret_cmdline) + *ret_cmdline = TAKE_PTR(cmdline); + if (ret_uname) + *ret_uname = TAKE_PTR(uname); + if (ret_pretty_name) + *ret_pretty_name = TAKE_PTR(pname); + + return 0; +} + +int inspect_kernel( + int dir_fd, + const char *filename, + KernelImageType *ret_type, + char **ret_cmdline, + char **ret_uname, + char **ret_pretty_name) { + + _cleanup_free_ IMAGE_SECTION_HEADER *sections = NULL; + _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL; + KernelImageType t = KERNEL_IMAGE_TYPE_UNKNOWN; + _cleanup_free_ PeHeader *pe_header = NULL; + _cleanup_close_ int 
fd = -EBADF; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(filename); + + fd = openat(dir_fd, filename, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open kernel image file '%s': %m", filename); + + r = pe_load_headers(fd, &dos_header, &pe_header); + if (r == -EBADMSG) /* not a valid PE file */ + goto not_uki; + if (r < 0) + return log_error_errno(r, "Failed to parse kernel image file '%s': %m", filename); + + r = pe_load_sections(fd, dos_header, pe_header, &sections); + if (r == -EBADMSG) /* not a valid PE file */ + goto not_uki; + if (r < 0) + return log_error_errno(r, "Failed to load PE sections from kernel image file '%s': %m", filename); + + if (pe_is_uki(pe_header, sections)) { + r = inspect_uki(fd, pe_header, sections, ret_cmdline, ret_uname, ret_pretty_name); + if (r < 0) + return r; + + t = KERNEL_IMAGE_TYPE_UKI; + goto done; + } else + t = KERNEL_IMAGE_TYPE_PE; + +not_uki: + if (ret_cmdline) + *ret_cmdline = NULL; + if (ret_uname) + *ret_uname = NULL; + if (ret_pretty_name) + *ret_pretty_name = NULL; + +done: + if (ret_type) + *ret_type = t; + + return 0; +} diff --git a/src/shared/kernel-image.h b/src/shared/kernel-image.h new file mode 100644 index 0000000..41b2c08 --- /dev/null +++ b/src/shared/kernel-image.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <errno.h> + +#include "macro.h" + +typedef enum KernelImageType { + KERNEL_IMAGE_TYPE_UNKNOWN, + KERNEL_IMAGE_TYPE_UKI, + KERNEL_IMAGE_TYPE_PE, + _KERNEL_IMAGE_TYPE_MAX, + _KERNEL_IMAGE_TYPE_INVALID = -EINVAL, +} KernelImageType; + +const char* kernel_image_type_to_string(KernelImageType t) _const_; + +int inspect_kernel( + int dir_fd, + const char *filename, + KernelImageType *ret_type, + char **ret_cmdline, + char **ret_uname, + char **ret_pretty_name); diff --git a/src/shared/keyring-util.c b/src/shared/keyring-util.c new file mode 100644 index 0000000..fadd90e --- /dev/null +++ b/src/shared/keyring-util.c @@ -0,0 
+1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "keyring-util.h" +#include "memory-util.h" +#include "missing_syscall.h" + +int keyring_read(key_serial_t serial, void **ret, size_t *ret_size) { + size_t bufsize = 100; + + for (;;) { + _cleanup_(erase_and_freep) uint8_t *buf = NULL; + long n; + + buf = new(uint8_t, bufsize + 1); + if (!buf) + return -ENOMEM; + + n = keyctl(KEYCTL_READ, (unsigned long) serial, (unsigned long) buf, (unsigned long) bufsize, 0); + if (n < 0) + return -errno; + + if ((size_t) n <= bufsize) { + buf[n] = 0; /* NUL terminate, just in case */ + + if (ret) + *ret = TAKE_PTR(buf); + if (ret_size) + *ret_size = n; + + return 0; + } + + bufsize = (size_t) n; + } +} diff --git a/src/shared/keyring-util.h b/src/shared/keyring-util.h new file mode 100644 index 0000000..c8c53f1 --- /dev/null +++ b/src/shared/keyring-util.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "missing_keyctl.h" + +/* Like TAKE_PTR() but for key_serial_t, resetting them to -1 */ +#define TAKE_KEY_SERIAL(key_serial) TAKE_GENERIC(key_serial, key_serial_t, -1) + +int keyring_read(key_serial_t serial, void **ret, size_t *ret_size); diff --git a/src/shared/killall.c b/src/shared/killall.c new file mode 100644 index 0000000..917b773 --- /dev/null +++ b/src/shared/killall.c @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "constants.h" +#include "dirent-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "initrd-util.h" +#include "killall.h" +#include "parse-util.h" +#include "process-util.h" +#include "set.h" +#include "stdio-util.h" +#include "string-util.h" +#include "terminal-util.h" + +static bool argv_has_at(pid_t pid) { + _cleanup_fclose_ FILE *f = NULL; + const char *p; + char c = 0; + + p = 
procfs_file_alloca(pid, "cmdline"); + f = fopen(p, "re"); + if (!f) { + log_debug_errno(errno, "Failed to open %s, ignoring: %m", p); + return true; /* not really, but has the desired effect */ + } + + /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for + * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as + * actual kernel threads are already filtered out above. */ + (void) fread(&c, 1, 1, f); + + /* Processes with argv[0][0] = '@' we ignore from the killing spree. + * + * https://systemd.io/ROOT_STORAGE_DAEMONS */ + return c == '@'; +} + +static bool is_survivor_cgroup(const PidRef *pid) { + _cleanup_free_ char *cgroup_path = NULL; + int r; + + assert(pidref_is_set(pid)); + + r = cg_pidref_get_path(/* root= */ NULL, pid, &cgroup_path); + if (r < 0) { + log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid->pid); + return false; + } + + r = cg_get_xattr_bool(cgroup_path, "user.survive_final_kill_signal"); + /* user xattr support was added to kernel v5.7, try with the trusted namespace as a fallback */ + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) + r = cg_get_xattr_bool(cgroup_path, "trusted.survive_final_kill_signal"); + if (r < 0 && !ERRNO_IS_NEG_XATTR_ABSENT(r)) + log_debug_errno(r, + "Failed to get survive_final_kill_signal xattr of %s, ignoring: %m", + cgroup_path); + + return r > 0; +} + +static bool ignore_proc(const PidRef *pid, bool warn_rootfs) { + uid_t uid; + int r; + + assert(pidref_is_set(pid)); + + /* We are PID 1, let's not commit suicide */ + if (pid->pid == 1) + return true; + + /* Ignore kernel threads */ + r = pidref_is_kernel_thread(pid); + if (r != 0) + return true; /* also ignore processes where we can't determine this */ + + /* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr */ + if (is_survivor_cgroup(pid)) + return true; + + r = pidref_get_uid(pid, 
&uid); + if (r < 0) + return true; /* not really, but better safe than sorry */ + + /* Non-root processes otherwise are always subject to be killed */ + if (uid != 0) + return false; + + if (!argv_has_at(pid->pid)) + return false; + + if (warn_rootfs && + pid_from_same_root_fs(pid->pid) > 0) { + + _cleanup_free_ char *comm = NULL; + + (void) pidref_get_comm(pid, &comm); + + log_notice("Process " PID_FMT " (%s) has been marked to be excluded from killing. It is " + "running from the root file system, and thus likely to block re-mounting of the " + "root file system to read-only. Please consider moving it into an initrd file " + "system instead.", pid->pid, strna(comm)); + } + + return true; +} + +static void log_children_no_yet_killed(Set *pids) { + _cleanup_free_ char *lst_child = NULL; + void *p; + int r; + + SET_FOREACH(p, pids) { + _cleanup_free_ char *s = NULL; + + if (pid_get_comm(PTR_TO_PID(p), &s) >= 0) + r = strextendf(&lst_child, ", " PID_FMT " (%s)", PTR_TO_PID(p), s); + else + r = strextendf(&lst_child, ", " PID_FMT, PTR_TO_PID(p)); + if (r < 0) + return (void) log_oom_warning(); + } + + if (isempty(lst_child)) + return; + + log_warning("Waiting for process: %s", lst_child + 2); +} + +static int wait_for_children(Set *pids, sigset_t *mask, usec_t timeout) { + usec_t until, date_log_child, n; + + assert(mask); + + /* Return the number of children remaining in the pids set: That correspond to the number + * of processes still "alive" after the timeout */ + + if (set_isempty(pids)) + return 0; + + n = now(CLOCK_MONOTONIC); + until = usec_add(n, timeout); + date_log_child = usec_add(n, 10u * USEC_PER_SEC); + if (date_log_child > until) + date_log_child = usec_add(n, timeout / 2u); + + for (;;) { + struct timespec ts; + int k; + void *p; + + /* First, let the kernel inform us about killed + * children. Most processes will probably be our + * children, but some are not (might be our + * grandchildren instead...). 
*/ + for (;;) { + pid_t pid; + + pid = waitpid(-1, NULL, WNOHANG); + if (pid == 0) + break; + if (pid < 0) { + if (errno == ECHILD) + break; + + return log_error_errno(errno, "waitpid() failed: %m"); + } + + (void) set_remove(pids, PID_TO_PTR(pid)); + } + + /* Now explicitly check who might be remaining, who + * might not be our child. */ + SET_FOREACH(p, pids) { + + /* kill(pid, 0) sends no signal, but it tells + * us whether the process still exists. */ + if (kill(PTR_TO_PID(p), 0) == 0) + continue; + + if (errno != ESRCH) + continue; + + set_remove(pids, p); + } + + if (set_isempty(pids)) + return 0; + + n = now(CLOCK_MONOTONIC); + if (date_log_child > 0 && n >= date_log_child) { + log_children_no_yet_killed(pids); + /* Log the children not yet killed only once */ + date_log_child = 0; + } + + if (n >= until) + return set_size(pids); + + if (date_log_child > 0) + timespec_store(&ts, MIN(until - n, date_log_child - n)); + else + timespec_store(&ts, until - n); + + k = sigtimedwait(mask, NULL, &ts); + if (k != SIGCHLD) { + + if (k < 0 && errno != EAGAIN) + return log_error_errno(errno, "sigtimedwait() failed: %m"); + + if (k >= 0) + log_warning("sigtimedwait() returned unexpected signal."); + } + } +} + +static int killall(int sig, Set *pids, bool send_sighup) { + _cleanup_closedir_ DIR *dir = NULL; + int n_killed = 0, r; + + /* Send the specified signal to all remaining processes, if not excluded by ignore_proc(). 
+ * Returns the number of processes to which the specified signal was sent */ + + r = proc_dir_open(&dir); + if (r < 0) + return log_warning_errno(r, "opendir(/proc) failed: %m"); + + for (;;) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + + r = proc_dir_read_pidref(dir, &pidref); + if (r < 0) + return log_warning_errno(r, "Failed to enumerate /proc: %m"); + if (r == 0) + break; + + if (ignore_proc(&pidref, sig == SIGKILL && !in_initrd())) + continue; + + if (sig == SIGKILL) { + _cleanup_free_ char *s = NULL; + + (void) pidref_get_comm(&pidref, &s); + log_notice("Sending SIGKILL to PID "PID_FMT" (%s).", pidref.pid, strna(s)); + } + + r = pidref_kill(&pidref, sig); + if (r < 0) { + if (r != -ESRCH) /* a process that is already gone is not worth a warning */ + log_warning_errno(r, "Could not kill " PID_FMT ", ignoring: %m", pidref.pid); /* the failure is reported in r (see the -ESRCH check above), so log r rather than a possibly stale errno */ + } else { + n_killed++; + if (pids) { + r = set_put(pids, PID_TO_PTR(pidref.pid)); + if (r < 0) + (void) log_oom_warning(); + } + } + + if (send_sighup) { + /* Optionally, also send a SIGHUP signal, but only if the process has a controlling + * tty. This is useful to allow handling of shells which ignore SIGTERM but react to + * SIGHUP. We do not send this to processes that have no controlling TTY since we + * don't want to trigger reloads of daemon processes. Also we make sure to only send + * this after SIGTERM so that SIGTERM is always first in the queue. */ + + if (get_ctty_devnr(pidref.pid, NULL) >= 0) + /* it's OK if the process is gone, just ignore the result */ + (void) pidref_kill(&pidref, SIGHUP); + } + } + + return n_killed; +} + +int broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout) { + int n_children_left; + sigset_t mask, oldmask; + _cleanup_set_free_ Set *pids = NULL; + + /* Send the specified signal to all remaining processes, if not excluded by ignore_proc(). 
+ * Return: + * - The number of processes still "alive" after the timeout (that should have been killed) + * if the function needs to wait for the end of the processes (wait_for_exit). + * - Otherwise, the number of processes to which the specified signal was sent */ + + if (wait_for_exit) + pids = set_new(NULL); + + assert_se(sigemptyset(&mask) == 0); + assert_se(sigaddset(&mask, SIGCHLD) == 0); + assert_se(sigprocmask(SIG_BLOCK, &mask, &oldmask) == 0); + + if (kill(-1, SIGSTOP) < 0 && errno != ESRCH) + log_warning_errno(errno, "kill(-1, SIGSTOP) failed: %m"); + + n_children_left = killall(sig, pids, send_sighup); + + if (kill(-1, SIGCONT) < 0 && errno != ESRCH) + log_warning_errno(errno, "kill(-1, SIGCONT) failed: %m"); + + if (wait_for_exit && n_children_left > 0) + n_children_left = wait_for_children(pids, &mask, timeout); + + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0); + + return n_children_left; +} diff --git a/src/shared/killall.h b/src/shared/killall.h new file mode 100644 index 0000000..d8ef96f --- /dev/null +++ b/src/shared/killall.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "time-util.h" + +int broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout); diff --git a/src/shared/label-util.c b/src/shared/label-util.c new file mode 100644 index 0000000..308fbff --- /dev/null +++ b/src/shared/label-util.c @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "btrfs-util.h" +#include "fs-util.h" +#include "label.h" +#include "label-util.h" +#include "macro.h" +#include "selinux-util.h" +#include "smack-util.h" + +int label_fix_full( + int atfd, + const char *inode_path, /* path of inode to apply label to */ + const char *label_path, /* path to use as database lookup key in label database (typically same as inode_path, but not always) */ + LabelFixFlags flags) { + + int r, q; + + if (atfd < 0 && atfd != AT_FDCWD) + 
return -EBADF; + + if (!inode_path && atfd < 0) /* We need at least one of atfd and an inode path */ + return -EINVAL; + + /* If both atfd and inode_path are specified, we take the specified path relative to atfd which must be an fd to a dir. + * + * If only atfd is specified (and inode_path is NULL), we'll operated on the inode the atfd refers to. + * + * If atfd is AT_FDCWD then we'll operate on the inode the path refers to. + */ + + r = mac_selinux_fix_full(atfd, inode_path, label_path, flags); + q = mac_smack_fix_full(atfd, inode_path, label_path, flags); + if (r < 0) + return r; + if (q < 0) + return q; + + return 0; +} + +int symlink_label(const char *old_path, const char *new_path) { + int r; + + assert(old_path); + assert(new_path); + + r = mac_selinux_create_file_prepare(new_path, S_IFLNK); + if (r < 0) + return r; + + r = RET_NERRNO(symlink(old_path, new_path)); + mac_selinux_create_file_clear(); + + if (r < 0) + return r; + + return mac_smack_fix(new_path, 0); +} + +int symlink_atomic_full_label(const char *from, const char *to, bool make_relative) { + int r; + + assert(from); + assert(to); + + r = mac_selinux_create_file_prepare(to, S_IFLNK); + if (r < 0) + return r; + + r = symlinkat_atomic_full(from, AT_FDCWD, to, make_relative); + mac_selinux_create_file_clear(); + + if (r < 0) + return r; + + return mac_smack_fix(to, 0); +} + +int mknod_label(const char *pathname, mode_t mode, dev_t dev) { + int r; + + assert(pathname); + + r = mac_selinux_create_file_prepare(pathname, mode); + if (r < 0) + return r; + + r = RET_NERRNO(mknod(pathname, mode, dev)); + mac_selinux_create_file_clear(); + + if (r < 0) + return r; + + return mac_smack_fix(pathname, 0); +} + +int btrfs_subvol_make_label(const char *path) { + int r; + + assert(path); + + r = mac_selinux_create_file_prepare(path, S_IFDIR); + if (r < 0) + return r; + + r = btrfs_subvol_make(AT_FDCWD, path); + mac_selinux_create_file_clear(); + + if (r < 0) + return r; + + return mac_smack_fix(path, 0); +} + 
+static int init_internal(bool lazy) { + int r; + + assert(!(mac_selinux_use() && mac_smack_use())); + + if (lazy) + r = mac_selinux_init_lazy(); + else + r = mac_selinux_init(); + if (r < 0) + return r; + + return mac_smack_init(); +} + +int mac_init_lazy(void) { + return init_internal(/* lazy=*/ true); +} + +int mac_init(void) { + return init_internal(/* lazy=*/ false); +} diff --git a/src/shared/label-util.h b/src/shared/label-util.h new file mode 100644 index 0000000..7fb98c7 --- /dev/null +++ b/src/shared/label-util.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +typedef enum LabelFixFlags { + LABEL_IGNORE_ENOENT = 1 << 0, + LABEL_IGNORE_EROFS = 1 << 1, +} LabelFixFlags; + +int label_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags); + +static inline int label_fix(const char *path, LabelFixFlags flags) { + return label_fix_full(AT_FDCWD, path, path, flags); +} + +int symlink_label(const char *old_path, const char *new_path); +int symlink_atomic_full_label(const char *from, const char *to, bool make_relative); +static inline int symlink_atomic_label(const char *from, const char *to) { + return symlink_atomic_full_label(from, to, false); +} +int mknod_label(const char *pathname, mode_t mode, dev_t dev); + +int btrfs_subvol_make_label(const char *path); + +int mac_init(void); +int mac_init_lazy(void); diff --git a/src/shared/libcrypt-util.c b/src/shared/libcrypt-util.c new file mode 100644 index 0000000..81e6f17 --- /dev/null +++ b/src/shared/libcrypt-util.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_CRYPT_H +/* libxcrypt is a replacement for glibc's libcrypt, and libcrypt might be + * removed from glibc at some point. As part of the removal, defines for + * crypt(3) are dropped from unistd.h, and we must include crypt.h instead. 
+ * + * Newer versions of glibc (v2.0+) already ship crypt.h with a definition + * of crypt(3) as well, so we simply include it if it is present. MariaDB, + * MySQL, PostgreSQL, Perl and some other wide-spread packages do it the + * same way since ages without any problems. + */ +# include +#else +# include +#endif + +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "libcrypt-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "missing_stdlib.h" +#include "random-util.h" +#include "string-util.h" +#include "strv.h" + +int make_salt(char **ret) { + +#if HAVE_CRYPT_GENSALT_RA + const char *e; + char *salt; + + /* If we have crypt_gensalt_ra() we default to the "preferred method" (i.e. usually yescrypt). + * crypt_gensalt_ra() is usually provided by libxcrypt. */ + + e = secure_getenv("SYSTEMD_CRYPT_PREFIX"); + if (!e) +#if HAVE_CRYPT_PREFERRED_METHOD + e = crypt_preferred_method(); +#else + e = "$6$"; +#endif + + log_debug("Generating salt for hash prefix: %s", e); + + salt = crypt_gensalt_ra(e, 0, NULL, 0); + if (!salt) + return -errno; + + *ret = salt; + return 0; +#else + /* If crypt_gensalt_ra() is not available, we use SHA512 and generate the salt on our own. */ + + static const char table[] = + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789" + "./"; + + uint8_t raw[16]; + char *salt, *j; + size_t i; + int r; + + /* This is a bit like crypt_gensalt_ra(), but doesn't require libcrypt, and doesn't do anything but + * SHA512, i.e. is legacy-free and minimizes our deps. */ + + assert_cc(sizeof(table) == 64U + 1U); + + log_debug("Generating fallback salt for hash prefix: $6$"); + + /* Insist on the best randomness by setting RANDOM_BLOCK, this is about keeping passwords secret after all. 
*/ + r = crypto_random_bytes(raw, sizeof(raw)); + if (r < 0) + return r; + + salt = new(char, 3+sizeof(raw)+1+1); + if (!salt) + return -ENOMEM; + + /* We only bother with SHA512 hashed passwords, the rest is legacy, and we don't do legacy. */ + j = stpcpy(salt, "$6$"); + for (i = 0; i < sizeof(raw); i++) + j[i] = table[raw[i] & 63]; + j[i++] = '$'; + j[i] = 0; + + *ret = salt; + return 0; +#endif +} + +#if HAVE_CRYPT_RA +# define CRYPT_RA_NAME "crypt_ra" +#else +# define CRYPT_RA_NAME "crypt_r" + +/* Provide a poor man's fallback that uses a fixed size buffer. */ + +static char* systemd_crypt_ra(const char *phrase, const char *setting, void **data, int *size) { + assert(data); + assert(size); + + /* We allocate the buffer because crypt(3) says: struct crypt_data may be quite large (32kB in this + * implementation of libcrypt; over 128kB in some other implementations). This is large enough that + * it may be unwise to allocate it on the stack. */ + + if (!*data) { + *data = new0(struct crypt_data, 1); + if (!*data) { + errno = -ENOMEM; + return NULL; + } + + *size = (int) (sizeof(struct crypt_data)); + } + + char *t = crypt_r(phrase, setting, *data); + if (!t) + return NULL; + + /* crypt_r may return a pointer to an invalid hashed password on error. Our callers expect NULL on + * error, so let's just return that. 
*/ + if (t[0] == '*') + return NULL; + + return t; +} + +#define crypt_ra systemd_crypt_ra + +#endif + +int hash_password_full(const char *password, void **cd_data, int *cd_size, char **ret) { + _cleanup_free_ char *salt = NULL; + _cleanup_(erase_and_freep) void *_cd_data = NULL; + char *p; + int r, _cd_size = 0; + + assert(!!cd_data == !!cd_size); + + r = make_salt(&salt); + if (r < 0) + return log_debug_errno(r, "Failed to generate salt: %m"); + + errno = 0; + p = crypt_ra(password, salt, cd_data ?: &_cd_data, cd_size ?: &_cd_size); + if (!p) + return log_debug_errno(errno_or_else(SYNTHETIC_ERRNO(EINVAL)), + CRYPT_RA_NAME "() failed: %m"); + + p = strdup(p); + if (!p) + return -ENOMEM; + + *ret = p; + return 0; +} + +bool looks_like_hashed_password(const char *s) { + /* Returns false if the specified string is certainly not a hashed UNIX password. crypt(5) lists + * various hashing methods. We only reject (return false) strings which are documented to have + * different meanings. + * + * In particular, we allow locked passwords, i.e. strings starting with "!", including just "!", + * i.e. the locked empty password. See also fc58c0c7bf7e4f525b916e3e5be0de2307fef04e. 
+ */ + if (!s) + return false; + + s += strspn(s, "!"); /* Skip (possibly duplicated) locking prefix */ + + return !STR_IN_SET(s, "x", "*"); +} + +int test_password_one(const char *hashed_password, const char *password) { + _cleanup_(erase_and_freep) void *cd_data = NULL; + int cd_size = 0; + const char *k; + + errno = 0; + k = crypt_ra(password, hashed_password, &cd_data, &cd_size); + if (!k) { + if (errno == ENOMEM) + return -ENOMEM; + /* Unknown or unavailable hashing method or string too short */ + return 0; + } + + return streq(k, hashed_password); +} + +int test_password_many(char **hashed_password, const char *password) { + int r; + + STRV_FOREACH(hpw, hashed_password) { + r = test_password_one(*hpw, password); + if (r < 0) + return r; + if (r > 0) + return true; + } + + return false; +} diff --git a/src/shared/libcrypt-util.h b/src/shared/libcrypt-util.h new file mode 100644 index 0000000..5b9b945 --- /dev/null +++ b/src/shared/libcrypt-util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int make_salt(char **ret); +int hash_password_full(const char *password, void **cd_data, int *cd_size, char **ret); +static inline int hash_password(const char *password, char **ret) { + return hash_password_full(password, NULL, NULL, ret); +} +bool looks_like_hashed_password(const char *s); +int test_password_one(const char *hashed_password, const char *password); +int test_password_many(char **hashed_password, const char *password); diff --git a/src/shared/libfido2-util.c b/src/shared/libfido2-util.c new file mode 100644 index 0000000..1cc3afe --- /dev/null +++ b/src/shared/libfido2-util.c @@ -0,0 +1,1296 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "libfido2-util.h" + +#if HAVE_LIBFIDO2 +#include "alloc-util.h" +#include "ask-password-api.h" +#include "dlfcn-util.h" +#include "format-table.h" +#include "glyph-util.h" +#include "log.h" +#include "memory-util.h" +#include "random-util.h" +#include 
"strv.h" +#include "unistd.h" + +static void *libfido2_dl = NULL; + +int (*sym_fido_assert_allow_cred)(fido_assert_t *, const unsigned char *, size_t) = NULL; +void (*sym_fido_assert_free)(fido_assert_t **) = NULL; +size_t (*sym_fido_assert_hmac_secret_len)(const fido_assert_t *, size_t) = NULL; +const unsigned char* (*sym_fido_assert_hmac_secret_ptr)(const fido_assert_t *, size_t) = NULL; +fido_assert_t* (*sym_fido_assert_new)(void) = NULL; +int (*sym_fido_assert_set_clientdata_hash)(fido_assert_t *, const unsigned char *, size_t) = NULL; +int (*sym_fido_assert_set_extensions)(fido_assert_t *, int) = NULL; +int (*sym_fido_assert_set_hmac_salt)(fido_assert_t *, const unsigned char *, size_t) = NULL; +int (*sym_fido_assert_set_rp)(fido_assert_t *, const char *) = NULL; +int (*sym_fido_assert_set_up)(fido_assert_t *, fido_opt_t) = NULL; +int (*sym_fido_assert_set_uv)(fido_assert_t *, fido_opt_t) = NULL; +size_t (*sym_fido_cbor_info_extensions_len)(const fido_cbor_info_t *) = NULL; +char **(*sym_fido_cbor_info_extensions_ptr)(const fido_cbor_info_t *) = NULL; +void (*sym_fido_cbor_info_free)(fido_cbor_info_t **) = NULL; +fido_cbor_info_t* (*sym_fido_cbor_info_new)(void) = NULL; +size_t (*sym_fido_cbor_info_options_len)(const fido_cbor_info_t *) = NULL; +char** (*sym_fido_cbor_info_options_name_ptr)(const fido_cbor_info_t *) = NULL; +const bool* (*sym_fido_cbor_info_options_value_ptr)(const fido_cbor_info_t *) = NULL; +void (*sym_fido_cred_free)(fido_cred_t **) = NULL; +size_t (*sym_fido_cred_id_len)(const fido_cred_t *) = NULL; +const unsigned char* (*sym_fido_cred_id_ptr)(const fido_cred_t *) = NULL; +fido_cred_t* (*sym_fido_cred_new)(void) = NULL; +int (*sym_fido_cred_set_clientdata_hash)(fido_cred_t *, const unsigned char *, size_t) = NULL; +int (*sym_fido_cred_set_extensions)(fido_cred_t *, int) = NULL; +int (*sym_fido_cred_set_rk)(fido_cred_t *, fido_opt_t) = NULL; +int (*sym_fido_cred_set_rp)(fido_cred_t *, const char *, const char *) = NULL; +int 
(*sym_fido_cred_set_type)(fido_cred_t *, int) = NULL; +int (*sym_fido_cred_set_user)(fido_cred_t *, const unsigned char *, size_t, const char *, const char *, const char *) = NULL; +int (*sym_fido_cred_set_uv)(fido_cred_t *, fido_opt_t) = NULL; +void (*sym_fido_dev_free)(fido_dev_t **) = NULL; +int (*sym_fido_dev_get_assert)(fido_dev_t *, fido_assert_t *, const char *) = NULL; +int (*sym_fido_dev_get_cbor_info)(fido_dev_t *, fido_cbor_info_t *) = NULL; +void (*sym_fido_dev_info_free)(fido_dev_info_t **, size_t) = NULL; +int (*sym_fido_dev_info_manifest)(fido_dev_info_t *, size_t, size_t *) = NULL; +const char* (*sym_fido_dev_info_manufacturer_string)(const fido_dev_info_t *) = NULL; +const char* (*sym_fido_dev_info_product_string)(const fido_dev_info_t *) = NULL; +fido_dev_info_t* (*sym_fido_dev_info_new)(size_t) = NULL; +const char* (*sym_fido_dev_info_path)(const fido_dev_info_t *) = NULL; +const fido_dev_info_t* (*sym_fido_dev_info_ptr)(const fido_dev_info_t *, size_t) = NULL; +bool (*sym_fido_dev_is_fido2)(const fido_dev_t *) = NULL; +int (*sym_fido_dev_make_cred)(fido_dev_t *, fido_cred_t *, const char *) = NULL; +fido_dev_t* (*sym_fido_dev_new)(void) = NULL; +int (*sym_fido_dev_open)(fido_dev_t *, const char *) = NULL; +int (*sym_fido_dev_close)(fido_dev_t *) = NULL; +void (*sym_fido_init)(int) = NULL; +void (*sym_fido_set_log_handler)(fido_log_handler_t *) = NULL; +const char* (*sym_fido_strerr)(int) = NULL; + +static void fido_log_propagate_handler(const char *s) { + log_debug("libfido2: %s", strempty(s)); +} + +int dlopen_libfido2(void) { + int r; + + r = dlopen_many_sym_or_warn( + &libfido2_dl, "libfido2.so.1", LOG_DEBUG, + DLSYM_ARG(fido_assert_allow_cred), + DLSYM_ARG(fido_assert_free), + DLSYM_ARG(fido_assert_hmac_secret_len), + DLSYM_ARG(fido_assert_hmac_secret_ptr), + DLSYM_ARG(fido_assert_new), + DLSYM_ARG(fido_assert_set_clientdata_hash), + DLSYM_ARG(fido_assert_set_extensions), + DLSYM_ARG(fido_assert_set_hmac_salt), + 
DLSYM_ARG(fido_assert_set_rp), + DLSYM_ARG(fido_assert_set_up), + DLSYM_ARG(fido_assert_set_uv), + DLSYM_ARG(fido_cbor_info_extensions_len), + DLSYM_ARG(fido_cbor_info_extensions_ptr), + DLSYM_ARG(fido_cbor_info_free), + DLSYM_ARG(fido_cbor_info_new), + DLSYM_ARG(fido_cbor_info_options_len), + DLSYM_ARG(fido_cbor_info_options_name_ptr), + DLSYM_ARG(fido_cbor_info_options_value_ptr), + DLSYM_ARG(fido_cred_free), + DLSYM_ARG(fido_cred_id_len), + DLSYM_ARG(fido_cred_id_ptr), + DLSYM_ARG(fido_cred_new), + DLSYM_ARG(fido_cred_set_clientdata_hash), + DLSYM_ARG(fido_cred_set_extensions), + DLSYM_ARG(fido_cred_set_rk), + DLSYM_ARG(fido_cred_set_rp), + DLSYM_ARG(fido_cred_set_type), + DLSYM_ARG(fido_cred_set_user), + DLSYM_ARG(fido_cred_set_uv), + DLSYM_ARG(fido_dev_free), + DLSYM_ARG(fido_dev_get_assert), + DLSYM_ARG(fido_dev_get_cbor_info), + DLSYM_ARG(fido_dev_info_free), + DLSYM_ARG(fido_dev_info_manifest), + DLSYM_ARG(fido_dev_info_manufacturer_string), + DLSYM_ARG(fido_dev_info_new), + DLSYM_ARG(fido_dev_info_path), + DLSYM_ARG(fido_dev_info_product_string), + DLSYM_ARG(fido_dev_info_ptr), + DLSYM_ARG(fido_dev_is_fido2), + DLSYM_ARG(fido_dev_make_cred), + DLSYM_ARG(fido_dev_new), + DLSYM_ARG(fido_dev_open), + DLSYM_ARG(fido_dev_close), + DLSYM_ARG(fido_init), + DLSYM_ARG(fido_set_log_handler), + DLSYM_ARG(fido_strerr)); + if (r < 0) + return r; + + sym_fido_init(FIDO_DEBUG); + sym_fido_set_log_handler(fido_log_propagate_handler); + + return 0; +} + +static int verify_features( + fido_dev_t *d, + const char *path, + int log_level, /* the log level to use when device is not FIDO2 with hmac-secret */ + bool *ret_has_rk, + bool *ret_has_client_pin, + bool *ret_has_up, + bool *ret_has_uv) { + + _cleanup_(fido_cbor_info_free_wrapper) fido_cbor_info_t *di = NULL; + bool found_extension = false; + char **e, **o; + const bool *b; + bool has_rk = false, has_client_pin = false, has_up = true, has_uv = false; /* Defaults are per table in 5.4 in FIDO2 spec */ + size_t n; + int r; 
+ + assert(d); + assert(path); + + if (!sym_fido_dev_is_fido2(d)) + return log_full_errno(log_level, SYNTHETIC_ERRNO(ENODEV), + "Specified device %s is not a FIDO2 device.", path); + + di = sym_fido_cbor_info_new(); + if (!di) + return log_oom(); + + r = sym_fido_dev_get_cbor_info(d, di); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to get CBOR device info for %s: %s", path, sym_fido_strerr(r)); + + e = sym_fido_cbor_info_extensions_ptr(di); + n = sym_fido_cbor_info_extensions_len(di); + for (size_t i = 0; i < n; i++) { + log_debug("FIDO2 device implements extension: %s", e[i]); + if (streq(e[i], "hmac-secret")) + found_extension = true; + } + + o = sym_fido_cbor_info_options_name_ptr(di); + b = sym_fido_cbor_info_options_value_ptr(di); + n = sym_fido_cbor_info_options_len(di); + for (size_t i = 0; i < n; i++) { + log_debug("FIDO2 device implements option %s: %s", o[i], yes_no(b[i])); + if (streq(o[i], "rk")) + has_rk = b[i]; + if (streq(o[i], "clientPin")) + has_client_pin = b[i]; + if (streq(o[i], "up")) + has_up = b[i]; + if (streq(o[i], "uv")) + has_uv = b[i]; + } + + if (!found_extension) + return log_full_errno(log_level, + SYNTHETIC_ERRNO(ENODEV), + "Specified device %s is a FIDO2 device, but does not support the required HMAC-SECRET extension.", path); + + log_debug("Has rk ('Resident Key') support: %s\n" + "Has clientPin support: %s\n" + "Has up ('User Presence') support: %s\n" + "Has uv ('User Verification') support: %s\n", + yes_no(has_rk), + yes_no(has_client_pin), + yes_no(has_up), + yes_no(has_uv)); + + if (ret_has_rk) + *ret_has_rk = has_rk; + if (ret_has_client_pin) + *ret_has_client_pin = has_client_pin; + if (ret_has_up) + *ret_has_up = has_up; + if (ret_has_uv) + *ret_has_uv = has_uv; + + return 0; +} + +static int fido2_assert_set_basic_properties( + fido_assert_t *a, + const char *rp_id, + const void *cid, + size_t cid_size) { + int r; + + assert(a); + assert(rp_id); + assert(cid); + assert(cid_size > 0); + + r = 
sym_fido_assert_set_rp(a, rp_id); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set FIDO2 assertion ID: %s", sym_fido_strerr(r)); + + r = sym_fido_assert_set_clientdata_hash(a, (const unsigned char[32]) {}, 32); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set FIDO2 assertion client data hash: %s", sym_fido_strerr(r)); + + r = sym_fido_assert_allow_cred(a, cid, cid_size); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to add FIDO2 assertion credential ID: %s", sym_fido_strerr(r)); + + return 0; +} + +static int fido2_common_assert_error_handle(int r) { + switch (r) { + case FIDO_OK: + return 0; + case FIDO_ERR_NO_CREDENTIALS: + return log_error_errno(SYNTHETIC_ERRNO(EBADSLT), + "Wrong security token; needed credentials not present on token."); + case FIDO_ERR_PIN_REQUIRED: + return log_error_errno(SYNTHETIC_ERRNO(ENOANO), + "Security token requires PIN."); + case FIDO_ERR_PIN_AUTH_BLOCKED: + return log_error_errno(SYNTHETIC_ERRNO(EOWNERDEAD), + "PIN of security token is blocked, please remove/reinsert token."); +#ifdef FIDO_ERR_UV_BLOCKED + case FIDO_ERR_UV_BLOCKED: + return log_error_errno(SYNTHETIC_ERRNO(EOWNERDEAD), + "Verification of security token is blocked, please remove/reinsert token."); +#endif + case FIDO_ERR_PIN_INVALID: + return log_error_errno(SYNTHETIC_ERRNO(ENOLCK), + "PIN of security token incorrect."); + case FIDO_ERR_UP_REQUIRED: + return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), + "User presence required."); + case FIDO_ERR_ACTION_TIMEOUT: + return log_error_errno(SYNTHETIC_ERRNO(ENOSTR), + "Token action timeout. 
(User didn't interact with token quickly enough.)"); + default: + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to ask token for assertion: %s", sym_fido_strerr(r)); + } +} + +static int fido2_is_cred_in_specific_token( + const char *path, + const char *rp_id, + const void *cid, + size_t cid_size, + Fido2EnrollFlags flags) { + + assert(path); + assert(rp_id); + assert(cid); + assert(cid_size); + + _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL; + _cleanup_(fido_assert_free_wrapper) fido_assert_t *a = NULL; + bool has_up = false, has_uv = false; + int r; + + d = sym_fido_dev_new(); + if (!d) + return log_oom(); + + r = sym_fido_dev_open(d, path); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to open FIDO2 device %s: %s", path, sym_fido_strerr(r)); + + r = verify_features(d, path, LOG_ERR, NULL, NULL, &has_up, &has_uv); + if (r == -ENODEV) { /* Not a FIDO2 device or lacking HMAC-SECRET extension */ + log_debug_errno(r, "%s is not a FIDO2 device, or it lacks the hmac-secret extension", path); + return false; + } + if (r < 0) + return r; + + a = sym_fido_assert_new(); + if (!a) + return log_oom(); + + r = fido2_assert_set_basic_properties(a, rp_id, cid, cid_size); + if (r < 0) + return r; + + /* FIDO2 devices may not support pre-flight requests with UV, at least not + * without user interaction [1]. As a result, let's just return true + * here and go ahead with trying the unlock directly. + * Reference: + * 1: https://fidoalliance.org/specs/fido-v2.1-ps-20210615/fido-client-to-authenticator-protocol-v2.1-ps-20210615.html#sctn-getAssert-authnr-alg + * See section 7.4 */ + if (has_uv && FLAGS_SET(flags, FIDO2ENROLL_UV)) { + log_debug("Pre-flight requests with UV are unsupported, device: %s", path); + return true; + } + + /* According to CTAP 2.1 specification, to do pre-flight we need to set up option to false + * with optionally pinUvAuthParam in assertion[1]. 
But for an authenticator that doesn't support + * user presence, once the up option is present, the authenticator may return CTAP2_ERR_UNSUPPORTED_OPTION[2]. + * So we simply omit the option in that case. + * Reference: + * 1: https://fidoalliance.org/specs/fido-v2.1-ps-20210615/fido-client-to-authenticator-protocol-v2.1-ps-20210615.html#pre-flight + * 2: https://fidoalliance.org/specs/fido-v2.0-ps-20190130/fido-client-to-authenticator-protocol-v2.0-ps-20190130.html#authenticatorGetAssertion (in step 5) + */ + if (has_up) + r = sym_fido_assert_set_up(a, FIDO_OPT_FALSE); + else + r = sym_fido_assert_set_up(a, FIDO_OPT_OMIT); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set assertion user presence: %s", sym_fido_strerr(r)); + + r = sym_fido_dev_get_assert(d, a, NULL); + + switch (r) { + case FIDO_OK: + return true; + case FIDO_ERR_NO_CREDENTIALS: + return false; + default: + return fido2_common_assert_error_handle(r); + } +} + +static int fido2_use_hmac_hash_specific_token( + const char *path, + const char *rp_id, + const void *salt, + size_t salt_size, + const void *cid, + size_t cid_size, + char **pins, + Fido2EnrollFlags required, /* client pin/user presence required */ + void **ret_hmac, + size_t *ret_hmac_size) { + + _cleanup_(fido_assert_free_wrapper) fido_assert_t *a = NULL; + _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL; + _cleanup_(erase_and_freep) void *hmac_copy = NULL; + bool has_up, has_client_pin, has_uv; + size_t hmac_size; + const void *hmac; + int r; + + assert(path); + assert(rp_id); + assert(salt); + assert(cid); + assert(ret_hmac); + assert(ret_hmac_size); + + d = sym_fido_dev_new(); + if (!d) + return log_oom(); + + r = sym_fido_dev_open(d, path); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to open FIDO2 device %s: %s", path, sym_fido_strerr(r)); + + r = verify_features(d, path, LOG_ERR, NULL, &has_client_pin, &has_up, &has_uv); + if (r < 0) + return r; + + if 
(!has_client_pin && FLAGS_SET(required, FIDO2ENROLL_PIN)) + return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), + "PIN required to unlock, but FIDO2 device %s does not support it.", + path); + + if (!has_up && FLAGS_SET(required, FIDO2ENROLL_UP)) + return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), + "User presence test required to unlock, but FIDO2 device %s does not support it.", + path); + + if (!has_uv && FLAGS_SET(required, FIDO2ENROLL_UV)) + return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), + "User verification required to unlock, but FIDO2 device %s does not support it.", + path); + + a = sym_fido_assert_new(); + if (!a) + return log_oom(); + + r = sym_fido_assert_set_extensions(a, FIDO_EXT_HMAC_SECRET); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to enable HMAC-SECRET extension on FIDO2 assertion: %s", sym_fido_strerr(r)); + + r = sym_fido_assert_set_hmac_salt(a, salt, salt_size); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set salt on FIDO2 assertion: %s", sym_fido_strerr(r)); + + r = fido2_assert_set_basic_properties(a, rp_id, cid, cid_size); + if (r < 0) + return r; + + log_info("Asking FIDO2 token for authentication."); + + if (has_up) { + r = sym_fido_assert_set_up(a, FLAGS_SET(required, FIDO2ENROLL_UP) ? FIDO_OPT_TRUE : FIDO_OPT_FALSE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to %s FIDO2 user presence test: %s", + enable_disable(FLAGS_SET(required, FIDO2ENROLL_UP)), + sym_fido_strerr(r)); + + if (FLAGS_SET(required, FIDO2ENROLL_UP)) + log_notice("%s%sPlease confirm presence on security token to unlock.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + } + + if (has_uv && !FLAGS_SET(required, FIDO2ENROLL_UV_OMIT)) { + r = sym_fido_assert_set_uv(a, FLAGS_SET(required, FIDO2ENROLL_UV) ? 
FIDO_OPT_TRUE : FIDO_OPT_FALSE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to %s FIDO2 user verification: %s", + enable_disable(FLAGS_SET(required, FIDO2ENROLL_UV)), + sym_fido_strerr(r)); + + if (FLAGS_SET(required, FIDO2ENROLL_UV)) + log_notice("%s%sPlease verify user on security token to unlock.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + } + + for (;;) { + bool retry_with_up = false, retry_with_pin = false; + + if (FLAGS_SET(required, FIDO2ENROLL_PIN)) { + /* OK, we need a pin, try with all pins in turn */ + if (strv_isempty(pins)) + r = FIDO_ERR_PIN_REQUIRED; + else + STRV_FOREACH(i, pins) { + r = sym_fido_dev_get_assert(d, a, *i); + if (r != FIDO_ERR_PIN_INVALID) + break; + } + + } else + r = sym_fido_dev_get_assert(d, a, NULL); + + /* In some conditions, where a PIN or UP is required we might accept that. Let's check the + * conditions and if so try immediately again. */ + + switch (r) { + + case FIDO_ERR_UP_REQUIRED: + /* So the token asked for "up". Try to turn it on, for compat with systemd 248 and try again. */ + + if (!has_up) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for user presence test but doesn't advertise 'up' feature."); + + if (FLAGS_SET(required, FIDO2ENROLL_UP)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for user presence test but was already enabled."); + + if (FLAGS_SET(required, FIDO2ENROLL_UP_IF_NEEDED)) { + log_notice("%s%sPlease confirm presence on security token to unlock.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + retry_with_up = true; + } + + break; + + case FIDO_ERR_UNSUPPORTED_OPTION: + /* AuthenTrend ATKey.Pro returns this instead of FIDO_ERR_UP_REQUIRED, let's handle + * it gracefully (also see below.) 
*/ + + if (has_up && (required & (FIDO2ENROLL_UP|FIDO2ENROLL_UP_IF_NEEDED)) == FIDO2ENROLL_UP_IF_NEEDED) { + log_notice("%s%sGot unsupported option error when user presence test is turned off. Trying with user presence test turned on.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + retry_with_up = true; + } + + break; + + case FIDO_ERR_PIN_REQUIRED: + /* A pin was requested. Maybe supply one, if we are configured to do so on request */ + + if (!has_client_pin) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for PIN but doesn't advertise 'clientPin' feature."); + + if (FLAGS_SET(required, FIDO2ENROLL_PIN) && !strv_isempty(pins)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for PIN but one was already supplied."); + + if ((required & (FIDO2ENROLL_PIN|FIDO2ENROLL_PIN_IF_NEEDED)) == FIDO2ENROLL_PIN_IF_NEEDED) { + /* If a PIN so far wasn't specified but is requested by the device, and + * FIDO2ENROLL_PIN_IF_NEEDED is set, then provide it */ + log_debug("Retrying assertion with PIN."); + retry_with_pin = true; + } + + break; + + default: + break; + } + + if (!retry_with_up && !retry_with_pin) + break; + + if (retry_with_up) { + r = sym_fido_assert_set_up(a, FIDO_OPT_TRUE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to enable FIDO2 user presence test: %s", sym_fido_strerr(r)); + + required |= FIDO2ENROLL_UP; + } + + if (retry_with_pin) + required |= FIDO2ENROLL_PIN; + } + + r = fido2_common_assert_error_handle(r); + if (r < 0) + return r; + + hmac = sym_fido_assert_hmac_secret_ptr(a, 0); + if (!hmac) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve HMAC secret."); + + hmac_size = sym_fido_assert_hmac_secret_len(a, 0); + + hmac_copy = memdup(hmac, hmac_size); + if (!hmac_copy) + return log_oom(); + + *ret_hmac = TAKE_PTR(hmac_copy); + *ret_hmac_size = hmac_size; + return 0; +} + +/* COSE_ECDH_ES256 is not usable with 
fido_cred_set_type() thus it's not listed here. */ +static const char *fido2_algorithm_to_string(int alg) { + switch(alg) { + case COSE_ES256: + return "es256"; + case COSE_RS256: + return "rs256"; + case COSE_EDDSA: + return "eddsa"; + default: + return NULL; + } +} + +int fido2_use_hmac_hash( + const char *device, + const char *rp_id, + const void *salt, + size_t salt_size, + const void *cid, + size_t cid_size, + char **pins, + Fido2EnrollFlags required, /* client pin/user presence required */ + void **ret_hmac, + size_t *ret_hmac_size) { + + size_t allocated = 64, found = 0; + fido_dev_info_t *di = NULL; + int r; + + r = dlopen_libfido2(); + if (r < 0) + return log_error_errno(r, "FIDO2 support is not installed."); + + if (device) { + r = fido2_is_cred_in_specific_token(device, rp_id, cid, cid_size, required); + if (r == 0) + /* The caller is expected to attempt other key slots in this case, + * therefore, do not spam the console with error logs here. */ + return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT), + "The credential is not in the token %s.", device); + if (r < 0) + return log_error_errno(r, "Token returned error during pre-flight: %m"); + + return fido2_use_hmac_hash_specific_token(device, rp_id, salt, salt_size, cid, cid_size, pins, required, ret_hmac, ret_hmac_size); + } + + di = sym_fido_dev_info_new(allocated); + if (!di) + return log_oom(); + + r = sym_fido_dev_info_manifest(di, allocated, &found); + if (r == FIDO_ERR_INTERNAL) { + /* The library returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. 
*/ + r = log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), "Got FIDO_ERR_INTERNAL, assuming no devices."); + goto finish; + } + if (r != FIDO_OK) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO2 devices: %s", sym_fido_strerr(r)); + goto finish; + } + + for (size_t i = 0; i < found; i++) { + const fido_dev_info_t *entry; + const char *path; + + entry = sym_fido_dev_info_ptr(di, i); + if (!entry) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to get device information for FIDO device %zu.", i); + goto finish; + } + + path = sym_fido_dev_info_path(entry); + if (!path) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to query FIDO device path."); + goto finish; + } + + r = fido2_is_cred_in_specific_token(path, rp_id, cid, cid_size, required); + if (r < 0) { + log_error_errno(r, "Token returned error during pre-flight: %m"); + goto finish; + } + if (r == 0) { + log_debug("The credential is not in the token %s, skipping.", path); + continue; + } + + r = fido2_use_hmac_hash_specific_token(path, rp_id, salt, salt_size, cid, cid_size, pins, required, ret_hmac, ret_hmac_size); + if (!IN_SET(r, + -EBADSLT, /* device doesn't understand our credential hash */ + -ENODEV /* device is not a FIDO2 device with HMAC-SECRET */)) + goto finish; + } + + r = -EAGAIN; + +finish: + sym_fido_dev_info_free(&di, allocated); + return r; +} + +#define FIDO2_SALT_SIZE 32 + +int fido2_generate_hmac_hash( + const char *device, + const char *rp_id, + const char *rp_name, + const void *user_id, size_t user_id_len, + const char *user_name, + const char *user_display_name, + const char *user_icon, + const char *askpw_icon_name, + Fido2EnrollFlags lock_with, + int cred_alg, + void **ret_cid, size_t *ret_cid_size, + void **ret_salt, size_t *ret_salt_size, + void **ret_secret, size_t *ret_secret_size, + char **ret_usedpin, + Fido2EnrollFlags *ret_locked_with) { + + _cleanup_(erase_and_freep) void *salt = NULL, *secret_copy = NULL; + _cleanup_(fido_assert_free_wrapper) 
fido_assert_t *a = NULL; + _cleanup_(fido_cred_free_wrapper) fido_cred_t *c = NULL; + _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL; + _cleanup_(erase_and_freep) char *used_pin = NULL; + bool has_rk, has_client_pin, has_up, has_uv; + _cleanup_free_ char *cid_copy = NULL; + size_t cid_size, secret_size; + const void *cid, *secret; + int r; + + assert(device); + assert(ret_cid); + assert(ret_cid_size); + assert(ret_salt); + assert(ret_salt_size); + assert(ret_secret); + assert(ret_secret_size); + + /* Construction is like this: we generate a salt of 32 bytes. We then ask the FIDO2 device to + * HMAC-SHA256 it for us with its internal key. The result is the key used by LUKS and account + * authentication. LUKS and UNIX password auth all do their own salting before hashing, so that FIDO2 + * device never sees the volume key. + * + * S = HMAC-SHA256(I, D) + * + * with: S → LUKS/account authentication key (never stored) + * I → internal key on FIDO2 device (stored in the FIDO2 device) + * D → salt we generate here (stored in the privileged part of the JSON record) + * + */ + + assert(device); + assert((lock_with & ~(FIDO2ENROLL_PIN|FIDO2ENROLL_UP|FIDO2ENROLL_UV)) == 0); + + r = dlopen_libfido2(); + if (r < 0) + return log_error_errno(r, "FIDO2 token support is not installed."); + + salt = malloc(FIDO2_SALT_SIZE); + if (!salt) + return log_oom(); + + r = crypto_random_bytes(salt, FIDO2_SALT_SIZE); + if (r < 0) + return log_error_errno(r, "Failed to generate salt: %m"); + + d = sym_fido_dev_new(); + if (!d) + return log_oom(); + + r = sym_fido_dev_open(d, device); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to open FIDO2 device %s: %s", device, sym_fido_strerr(r)); + + r = verify_features(d, device, LOG_ERR, &has_rk, &has_client_pin, &has_up, &has_uv); + if (r < 0) + return r; + + /* While enrolling degrade gracefully if the requested feature set isn't available, but let the user know */ + if (!has_client_pin && FLAGS_SET(lock_with, 
FIDO2ENROLL_PIN)) { + log_notice("Requested to lock with PIN, but FIDO2 device %s does not support it, disabling.", device); + lock_with &= ~FIDO2ENROLL_PIN; + } + + if (!has_up && FLAGS_SET(lock_with, FIDO2ENROLL_UP)) { + log_notice("Locking with user presence test requested, but FIDO2 device %s does not support it, disabling.", device); + lock_with &= ~FIDO2ENROLL_UP; + } + + if (!has_uv && FLAGS_SET(lock_with, FIDO2ENROLL_UV)) { + log_notice("Locking with user verification test requested, but FIDO2 device %s does not support it, disabling.", device); + lock_with &= ~FIDO2ENROLL_UV; + } + + c = sym_fido_cred_new(); + if (!c) + return log_oom(); + + r = sym_fido_cred_set_extensions(c, FIDO_EXT_HMAC_SECRET); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to enable HMAC-SECRET extension on FIDO2 credential: %s", sym_fido_strerr(r)); + + r = sym_fido_cred_set_rp(c, rp_id, rp_name); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set FIDO2 credential relying party ID/name: %s", sym_fido_strerr(r)); + + r = sym_fido_cred_set_type(c, cred_alg); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set FIDO2 credential type to %s: %s", fido2_algorithm_to_string(cred_alg), sym_fido_strerr(r)); + + r = sym_fido_cred_set_user( + c, + user_id, user_id_len, + user_name, + user_display_name, + user_icon); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set FIDO2 credential user data: %s", sym_fido_strerr(r)); + + r = sym_fido_cred_set_clientdata_hash(c, (const unsigned char[32]) {}, 32); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set FIDO2 client data hash: %s", sym_fido_strerr(r)); + + if (has_rk) { + r = sym_fido_cred_set_rk(c, FIDO_OPT_FALSE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to turn off FIDO2 resident key option of credential: %s", sym_fido_strerr(r)); + } + + if (has_uv) { + 
r = sym_fido_cred_set_uv(c, FIDO_OPT_FALSE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to turn off FIDO2 user verification option of credential: %s", sym_fido_strerr(r)); + } + + /* As per specification "up" is assumed to be implicit when making credentials, hence we don't + * explicitly enable/disable it here */ + + log_info("Initializing FIDO2 credential on security token."); + + if (has_uv || has_up) + log_notice("%s%s(Hint: This might require confirmation of user presence on security token.)", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + + r = sym_fido_dev_make_cred(d, c, NULL); + if (r == FIDO_ERR_PIN_REQUIRED) { + + if (!has_client_pin) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for PIN but doesn't advertise 'clientPin' feature."); + + for (;;) { + _cleanup_strv_free_erase_ char **pin = NULL; + + r = ask_password_auto("Please enter security token PIN:", askpw_icon_name, NULL, "fido2-pin", "fido2-pin", USEC_INFINITY, 0, &pin); + if (r < 0) + return log_error_errno(r, "Failed to acquire user PIN: %m"); + + r = FIDO_ERR_PIN_INVALID; + STRV_FOREACH(i, pin) { + if (isempty(*i)) { + log_notice("PIN may not be empty."); + continue; + } + + r = sym_fido_dev_make_cred(d, c, *i); + if (r == FIDO_OK) { + used_pin = strdup(*i); + if (!used_pin) + return log_oom(); + break; + } + if (r != FIDO_ERR_PIN_INVALID) + break; + } + + if (r != FIDO_ERR_PIN_INVALID) + break; + + log_notice("PIN incorrect, please try again."); + } + } + if (r == FIDO_ERR_PIN_AUTH_BLOCKED) + return log_notice_errno(SYNTHETIC_ERRNO(EPERM), + "Token PIN is currently blocked, please remove and reinsert token."); +#ifdef FIDO_ERR_UV_BLOCKED + if (r == FIDO_ERR_UV_BLOCKED) + return log_notice_errno(SYNTHETIC_ERRNO(EPERM), + "Token verification is currently blocked, please remove and reinsert token."); +#endif + if (r == FIDO_ERR_ACTION_TIMEOUT) + return log_error_errno(SYNTHETIC_ERRNO(ENOSTR), + 
"Token action timeout. (User didn't interact with token quickly enough.)"); + if (r == FIDO_ERR_UNSUPPORTED_ALGORITHM) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Token doesn't support credential algorithm %s.", fido2_algorithm_to_string(cred_alg)); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to generate FIDO2 credential: %s", sym_fido_strerr(r)); + + cid = sym_fido_cred_id_ptr(c); + if (!cid) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to get FIDO2 credential ID."); + + cid_size = sym_fido_cred_id_len(c); + + a = sym_fido_assert_new(); + if (!a) + return log_oom(); + + r = sym_fido_assert_set_extensions(a, FIDO_EXT_HMAC_SECRET); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to enable HMAC-SECRET extension on FIDO2 assertion: %s", sym_fido_strerr(r)); + + r = sym_fido_assert_set_hmac_salt(a, salt, FIDO2_SALT_SIZE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to set salt on FIDO2 assertion: %s", sym_fido_strerr(r)); + + r = fido2_assert_set_basic_properties(a, rp_id, cid, cid_size); + if (r < 0) + return r; + + log_info("Generating secret key on FIDO2 security token."); + + if (has_up) { + r = sym_fido_assert_set_up(a, FLAGS_SET(lock_with, FIDO2ENROLL_UP) ? FIDO_OPT_TRUE : FIDO_OPT_FALSE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to %s FIDO2 user presence test: %s", + enable_disable(FLAGS_SET(lock_with, FIDO2ENROLL_UP)), + sym_fido_strerr(r)); + + if (FLAGS_SET(lock_with, FIDO2ENROLL_UP)) + log_notice("%s%sIn order to allow secret key generation, please confirm presence on security token.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + } + + if (has_uv) { + r = sym_fido_assert_set_uv(a, FLAGS_SET(lock_with, FIDO2ENROLL_UV) ? 
FIDO_OPT_TRUE : FIDO_OPT_FALSE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to %s FIDO user verification: %s", + enable_disable(FLAGS_SET(lock_with, FIDO2ENROLL_UV)), + sym_fido_strerr(r)); + + if (FLAGS_SET(lock_with, FIDO2ENROLL_UV)) + log_notice("%s%sIn order to allow secret key generation, please verify user on security token.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + } + + for (;;) { + bool retry_with_up = false, retry_with_pin = false; + + r = sym_fido_dev_get_assert(d, a, FLAGS_SET(lock_with, FIDO2ENROLL_PIN) ? used_pin : NULL); + + switch (r) { + + case FIDO_ERR_UP_REQUIRED: + /* If the token asks for "up" when we turn off, then this might be a feature that + * isn't optional. Let's enable it */ + + if (!has_up) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for user presence test but doesn't advertise 'up' feature."); + + if (FLAGS_SET(lock_with, FIDO2ENROLL_UP)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for user presence test but was already enabled."); + + log_notice("%s%sLocking without user presence test requested, but FIDO2 device %s requires it, enabling.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : "", + device); + + retry_with_up = true; + break; + + case FIDO_ERR_UNSUPPORTED_OPTION: + /* AuthenTrend ATKey.Pro says it supports "up", but if we disable it it will fail + * with FIDO_ERR_UNSUPPORTED_OPTION, probably because it isn't actually + * optional. Let's see if turning it on works. This is very similar to the + * FIDO_ERR_UP_REQUIRED case, but since the error is so vague we implement it + * slightly more defensively. */ + + if (has_up && !FLAGS_SET(lock_with, FIDO2ENROLL_UP)) { + log_notice("%s%sGot unsupported option error when user presence test is turned off. Trying with user presence test turned on.", + emoji_enabled() ? 
special_glyph(SPECIAL_GLYPH_TOUCH) : "", + emoji_enabled() ? " " : ""); + retry_with_up = true; + } + + break; + + case FIDO_ERR_PIN_REQUIRED: + if (!has_client_pin) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for client PIN check but doesn't advertise 'clientPin' feature."); + + if (FLAGS_SET(lock_with, FIDO2ENROLL_PIN)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Token asks for user client PIN check but was already enabled."); + + log_debug("Token requires PIN for assertion, enabling."); + retry_with_pin = true; + break; + + default: + break; + } + + if (!retry_with_up && !retry_with_pin) + break; + + if (retry_with_up) { + r = sym_fido_assert_set_up(a, FIDO_OPT_TRUE); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enable FIDO2 user presence test: %s", sym_fido_strerr(r)); + + lock_with |= FIDO2ENROLL_UP; + } + + if (retry_with_pin) + lock_with |= FIDO2ENROLL_PIN; + } + + if (r == FIDO_ERR_ACTION_TIMEOUT) + return log_error_errno(SYNTHETIC_ERRNO(ENOSTR), + "Token action timeout. 
(User didn't interact with token quickly enough.)"); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to ask token for assertion: %s", sym_fido_strerr(r)); + + secret = sym_fido_assert_hmac_secret_ptr(a, 0); + if (!secret) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve HMAC secret."); + + secret_size = sym_fido_assert_hmac_secret_len(a, 0); + + secret_copy = memdup(secret, secret_size); + if (!secret_copy) + return log_oom(); + + cid_copy = memdup(cid, cid_size); + if (!cid_copy) + return log_oom(); + + *ret_cid = TAKE_PTR(cid_copy); + *ret_cid_size = cid_size; + *ret_salt = TAKE_PTR(salt); + *ret_salt_size = FIDO2_SALT_SIZE; + *ret_secret = TAKE_PTR(secret_copy); + *ret_secret_size = secret_size; + + if (ret_usedpin) + *ret_usedpin = TAKE_PTR(used_pin); + + if (ret_locked_with) + *ret_locked_with = lock_with; + + return 0; +} +#endif + +#if HAVE_LIBFIDO2 +static int check_device_is_fido2_with_hmac_secret(const char *path) { + _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL; + int r; + + d = sym_fido_dev_new(); + if (!d) + return log_oom(); + + r = sym_fido_dev_open(d, path); + if (r != FIDO_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to open FIDO2 device %s: %s", path, sym_fido_strerr(r)); + + r = verify_features(d, path, LOG_DEBUG, NULL, NULL, NULL, NULL); + if (r == -ENODEV) /* Not a FIDO2 device, or not implementing 'hmac-secret' */ + return false; + if (r < 0) + return r; + + return true; +} +#endif + +int fido2_list_devices(void) { +#if HAVE_LIBFIDO2 + _cleanup_(table_unrefp) Table *t = NULL; + size_t allocated = 64, found = 0; + fido_dev_info_t *di = NULL; + int r; + + r = dlopen_libfido2(); + if (r < 0) + return log_error_errno(r, "FIDO2 token support is not installed."); + + di = sym_fido_dev_info_new(allocated); + if (!di) + return log_oom(); + + r = sym_fido_dev_info_manifest(di, allocated, &found); + if (r == FIDO_ERR_INTERNAL || (r == FIDO_OK && found == 0)) { + /* The library 
returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. */ + log_info("No FIDO2 devices found."); + r = 0; + goto finish; + } + if (r != FIDO_OK) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO2 devices: %s", sym_fido_strerr(r)); + goto finish; + } + + t = table_new("path", "manufacturer", "product"); + if (!t) { + r = log_oom(); + goto finish; + } + + for (size_t i = 0; i < found; i++) { + const fido_dev_info_t *entry; + + entry = sym_fido_dev_info_ptr(di, i); + if (!entry) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to get device information for FIDO device %zu.", i); + goto finish; + } + + r = check_device_is_fido2_with_hmac_secret(sym_fido_dev_info_path(entry)); + if (r < 0) + goto finish; + if (!r) + continue; + + r = table_add_many( + t, + TABLE_PATH, sym_fido_dev_info_path(entry), + TABLE_STRING, sym_fido_dev_info_manufacturer_string(entry), + TABLE_STRING, sym_fido_dev_info_product_string(entry)); + if (r < 0) { + table_log_add_error(r); + goto finish; + } + } + + r = table_print(t, stdout); + if (r < 0) { + log_error_errno(r, "Failed to show device table: %m"); + goto finish; + } + + r = 0; + +finish: + sym_fido_dev_info_free(&di, allocated); + return r; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "FIDO2 tokens not supported on this build."); +#endif +} + +int fido2_find_device_auto(char **ret) { +#if HAVE_LIBFIDO2 + _cleanup_free_ char *copy = NULL; + size_t di_size = 64, found = 0; + const fido_dev_info_t *entry; + fido_dev_info_t *di = NULL; + const char *path; + int r; + + r = dlopen_libfido2(); + if (r < 0) + return log_error_errno(r, "FIDO2 token support is not installed."); + + di = sym_fido_dev_info_new(di_size); + if (!di) + return log_oom(); + + r = sym_fido_dev_info_manifest(di, di_size, &found); + if (r == FIDO_ERR_INTERNAL || (r == FIDO_OK && found == 0)) { + /* The library returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. 
*/ + r = log_error_errno(SYNTHETIC_ERRNO(ENODEV), "No FIDO devices found."); + goto finish; + } + if (r != FIDO_OK) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO devices: %s", sym_fido_strerr(r)); + goto finish; + } + if (found > 1) { + r = log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "More than one FIDO device found."); + goto finish; + } + + entry = sym_fido_dev_info_ptr(di, 0); + if (!entry) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to get device information for FIDO device 0."); + goto finish; + } + + r = check_device_is_fido2_with_hmac_secret(sym_fido_dev_info_path(entry)); + if (r < 0) + goto finish; + if (!r) { + r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "FIDO device discovered does not implement FIDO2 with 'hmac-secret' extension."); + goto finish; + } + + path = sym_fido_dev_info_path(entry); + if (!path) { /* NOTE(review): the same path was already handed to check_device_is_fido2_with_hmac_secret() above, so this NULL check comes after the first use */ + r = log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to query FIDO device path."); + goto finish; + } + + copy = strdup(path); + if (!copy) { + r = log_oom(); + goto finish; + } + + *ret = TAKE_PTR(copy); + r = 0; + +finish: + sym_fido_dev_info_free(&di, di_size); + return r; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "FIDO2 tokens not supported on this build."); +#endif +} + +int fido2_have_device(const char *device) { +#if HAVE_LIBFIDO2 + size_t allocated = 64, found = 0; + fido_dev_info_t *di = NULL; + int r; + + /* Return == 0 if no devices are found, > 0 if at least one is found */ + + r = dlopen_libfido2(); + if (r < 0) + return log_error_errno(r, "FIDO2 support is not installed."); + + if (device) { /* an explicit device node was named: only test that it exists (access F_OK), no enumeration and no FIDO2 capability check */ + if (access(device, F_OK) < 0) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to determine whether device '%s' exists: %m", device); + } + + return 1; + } + + di = sym_fido_dev_info_new(allocated); + if (!di) + return log_oom(); + + r = sym_fido_dev_info_manifest(di, allocated, &found); + if (r == FIDO_ERR_INTERNAL) { + /* The library returns
FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. */ + r = 0; + goto finish; + } + if (r != FIDO_OK) { + r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO2 devices: %s", sym_fido_strerr(r)); + goto finish; + } + + r = found; /* number of enumerated devices (at most the 64 slots allocated above) */ + +finish: + sym_fido_dev_info_free(&di, allocated); + return r; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "FIDO2 tokens not supported on this build."); +#endif +} + +#if HAVE_LIBFIDO2 +int parse_fido2_algorithm(const char *s, int *ret) { /* Maps the algorithm name s (es256, rs256 or eddsa, compared with streq) to the matching COSE_* constant. The value is stored in *ret only when ret is non-NULL. Returns 0 on success and -EINVAL for any other name. */ + int a; + + assert(s); + + if (streq(s, "es256")) + a = COSE_ES256; + else if (streq(s, "rs256")) + a = COSE_RS256; + else if (streq(s, "eddsa")) + a = COSE_EDDSA; + else + return -EINVAL; + + if (ret) + *ret = a; + return 0; +} +#endif diff --git a/src/shared/libfido2-util.h b/src/shared/libfido2-util.h new file mode 100644 index 0000000..4cfc95f --- /dev/null +++ b/src/shared/libfido2-util.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +typedef enum Fido2EnrollFlags { + FIDO2ENROLL_PIN = 1 << 0, + FIDO2ENROLL_UP = 1 << 1, /* User presence (i.e. touching the token) */ + FIDO2ENROLL_UV = 1 << 2, /* User verification (i.e. fingerprint) */ + FIDO2ENROLL_PIN_IF_NEEDED = 1 << 3, /* If auth doesn't work without PIN ask for one, as in systemd 248 */ + FIDO2ENROLL_UP_IF_NEEDED = 1 << 4, /* If auth doesn't work without UP, enable it, as in systemd 248 */ + FIDO2ENROLL_UV_OMIT = 1 << 5, /* Leave "uv" untouched, as in systemd 248 */ + _FIDO2ENROLL_TYPE_MAX, + _FIDO2ENROLL_TYPE_INVALID = -EINVAL, +} Fido2EnrollFlags; + +#if HAVE_LIBFIDO2 +#include /* NOTE(review): the header name after this include is missing in this copy - presumably fido.h in angle brackets, lost during text extraction; confirm against upstream */ + +extern int (*sym_fido_assert_allow_cred)(fido_assert_t *, const unsigned char *, size_t); +extern void (*sym_fido_assert_free)(fido_assert_t **); +extern size_t (*sym_fido_assert_hmac_secret_len)(const fido_assert_t *, size_t); +extern const unsigned char* (*sym_fido_assert_hmac_secret_ptr)(const fido_assert_t *, size_t); +extern fido_assert_t*
(*sym_fido_assert_new)(void); +extern int (*sym_fido_assert_set_clientdata_hash)(fido_assert_t *, const unsigned char *, size_t); +extern int (*sym_fido_assert_set_extensions)(fido_assert_t *, int); +extern int (*sym_fido_assert_set_hmac_salt)(fido_assert_t *, const unsigned char *, size_t); +extern int (*sym_fido_assert_set_rp)(fido_assert_t *, const char *); +extern int (*sym_fido_assert_set_up)(fido_assert_t *, fido_opt_t); +extern int (*sym_fido_assert_set_uv)(fido_assert_t *, fido_opt_t); +extern size_t (*sym_fido_cbor_info_extensions_len)(const fido_cbor_info_t *); +extern char **(*sym_fido_cbor_info_extensions_ptr)(const fido_cbor_info_t *); +extern void (*sym_fido_cbor_info_free)(fido_cbor_info_t **); +extern fido_cbor_info_t* (*sym_fido_cbor_info_new)(void); +extern size_t (*sym_fido_cbor_info_options_len)(const fido_cbor_info_t *); +extern char** (*sym_fido_cbor_info_options_name_ptr)(const fido_cbor_info_t *); +extern const bool* (*sym_fido_cbor_info_options_value_ptr)(const fido_cbor_info_t *); +extern void (*sym_fido_cred_free)(fido_cred_t **); +extern size_t (*sym_fido_cred_id_len)(const fido_cred_t *); +extern const unsigned char* (*sym_fido_cred_id_ptr)(const fido_cred_t *); +extern fido_cred_t* (*sym_fido_cred_new)(void); +extern int (*sym_fido_cred_set_clientdata_hash)(fido_cred_t *, const unsigned char *, size_t); +extern int (*sym_fido_cred_set_extensions)(fido_cred_t *, int); +extern int (*sym_fido_cred_set_rk)(fido_cred_t *, fido_opt_t); +extern int (*sym_fido_cred_set_rp)(fido_cred_t *, const char *, const char *); +extern int (*sym_fido_cred_set_type)(fido_cred_t *, int); +extern int (*sym_fido_cred_set_user)(fido_cred_t *, const unsigned char *, size_t, const char *, const char *, const char *); +extern int (*sym_fido_cred_set_uv)(fido_cred_t *, fido_opt_t); +extern void (*sym_fido_dev_free)(fido_dev_t **); +extern int (*sym_fido_dev_get_assert)(fido_dev_t *, fido_assert_t *, const char *); +extern int 
(*sym_fido_dev_get_cbor_info)(fido_dev_t *, fido_cbor_info_t *); +extern void (*sym_fido_dev_info_free)(fido_dev_info_t **, size_t); +extern int (*sym_fido_dev_info_manifest)(fido_dev_info_t *, size_t, size_t *); +extern const char* (*sym_fido_dev_info_manufacturer_string)(const fido_dev_info_t *); +extern const char* (*sym_fido_dev_info_product_string)(const fido_dev_info_t *); +extern fido_dev_info_t* (*sym_fido_dev_info_new)(size_t); +extern const char* (*sym_fido_dev_info_path)(const fido_dev_info_t *); +extern const fido_dev_info_t* (*sym_fido_dev_info_ptr)(const fido_dev_info_t *, size_t); +extern bool (*sym_fido_dev_is_fido2)(const fido_dev_t *); +extern int (*sym_fido_dev_make_cred)(fido_dev_t *, fido_cred_t *, const char *); +extern fido_dev_t* (*sym_fido_dev_new)(void); +extern int (*sym_fido_dev_open)(fido_dev_t *, const char *); +extern int (*sym_fido_dev_close)(fido_dev_t *); +extern void (*sym_fido_init)(int); +extern void (*sym_fido_set_log_handler)(fido_log_handler_t *); +extern const char* (*sym_fido_strerr)(int); + +int dlopen_libfido2(void); /* NOTE(review): presumably loads libfido2 and fills in the sym_* pointers declared above; the functions in libfido2-util.c call it before touching any sym_* pointer - confirm against its definition */ + +static inline void fido_cbor_info_free_wrapper(fido_cbor_info_t **p) { /* cleanup helper: calls sym_fido_cbor_info_free() only when *p is non-NULL */ + if (*p) + sym_fido_cbor_info_free(p); +} + +static inline void fido_assert_free_wrapper(fido_assert_t **p) { /* cleanup helper: calls sym_fido_assert_free() only when *p is non-NULL */ + if (*p) + sym_fido_assert_free(p); +} + +static inline void fido_dev_free_wrapper(fido_dev_t **p) { /* cleanup helper for fido_dev_t, used with _cleanup_() in libfido2-util.c; does nothing when *p is NULL */ + if (*p) { /* close the device first, then free the handle */ + sym_fido_dev_close(*p); + sym_fido_dev_free(p); + } +} + +static inline void fido_cred_free_wrapper(fido_cred_t **p) { /* cleanup helper: calls sym_fido_cred_free() only when *p is non-NULL */ + if (*p) + sym_fido_cred_free(p); +} + +int fido2_use_hmac_hash( + const char *device, + const char *rp_id, + const void *salt, + size_t salt_size, + const void *cid, + size_t cid_size, + char **pins, + Fido2EnrollFlags required, + void **ret_hmac, + size_t *ret_hmac_size); + +int fido2_generate_hmac_hash( + const char *device, + const char *rp_id, + const char *rp_name, + const void *user_id, size_t user_id_len, + const char *user_name, + const char *user_display_name, + const char *user_icon, +
const char *askpw_icon_name, + Fido2EnrollFlags lock_with, + int cred_alg, + void **ret_cid, size_t *ret_cid_size, + void **ret_salt, size_t *ret_salt_size, + void **ret_secret, size_t *ret_secret_size, + char **ret_usedpin, + Fido2EnrollFlags *ret_locked_with); + +int parse_fido2_algorithm(const char *s, int *ret); +#else +static inline int parse_fido2_algorithm(const char *s, int *ret) { + return -EOPNOTSUPP; +} +#endif + +int fido2_list_devices(void); +int fido2_find_device_auto(char **ret); + +int fido2_have_device(const char *device); diff --git a/src/shared/libmount-util.c b/src/shared/libmount-util.c new file mode 100644 index 0000000..3818904 --- /dev/null +++ b/src/shared/libmount-util.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "libmount-util.h" + +int libmount_parse( + const char *path, + FILE *source, + struct libmnt_table **ret_table, + struct libmnt_iter **ret_iter) { + + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + int r; + + /* Older libmount seems to require this. */ + assert(!source || path); + + table = mnt_new_table(); + iter = mnt_new_iter(MNT_ITER_FORWARD); + if (!table || !iter) + return -ENOMEM; + + /* If source or path are specified, we use one of the functions which ignore utab. + * Only if both are empty, we use mnt_table_parse_mtab(). 
*/ + + if (source) + r = mnt_table_parse_stream(table, source, path); + else if (path) + r = mnt_table_parse_file(table, path); + else + r = mnt_table_parse_mtab(table, NULL); + if (r < 0) + return r; + + *ret_table = TAKE_PTR(table); + *ret_iter = TAKE_PTR(iter); + return 0; +} + +int libmount_is_leaf( + struct libmnt_table *table, + struct libmnt_fs *fs) { + int r; + + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter_children = NULL; + iter_children = mnt_new_iter(MNT_ITER_FORWARD); + if (!iter_children) + return log_oom(); + + /* We only care whether a child exists; the returned entry itself is unused. */ + _unused_ struct libmnt_fs *child; + r = mnt_table_next_child_fs(table, iter_children, fs, &child); + if (r < 0) + return r; + + return r == 1; +} diff --git a/src/shared/libmount-util.h b/src/shared/libmount-util.h new file mode 100644 index 0000000..2f789e7 --- /dev/null +++ b/src/shared/libmount-util.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* This needs to be after sys/mount.h */ +#include + +#include "macro.h" + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct libmnt_table*, mnt_free_table, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct libmnt_iter*, mnt_free_iter, NULL); + +int libmount_parse( + const char *path, + FILE *source, + struct libmnt_table **ret_table, + struct libmnt_iter **ret_iter); + +int libmount_is_leaf( + struct libmnt_table *table, + struct libmnt_fs *fs); diff --git a/src/shared/libshared.sym b/src/shared/libshared.sym new file mode 100644 index 0000000..6a7495a --- /dev/null +++ b/src/shared/libshared.sym @@ -0,0 +1,3 @@ +SD_SHARED { + global: *; +}; diff --git a/src/shared/linux/README b/src/shared/linux/README new file mode 100644 index 0000000..34fc09b --- /dev/null +++ b/src/shared/linux/README @@ -0,0 +1,9 @@ +The files in this directory are copied from kernel-6.2, and the following modifications are applied: +- auto_dev-ioctl.h: set AUTOFS_DEV_IOCTL_VERSION_MINOR to 0 +- auto_dev-ioctl.h: define AUTOFS_IOCTL if not 
defined +- auto_dev-ioctl.h: use of fake flexible array is fixed +- bpf_insn.h: This is imported from samples/bpf/bpf_insn.h +- bpf_insn.h: BPF_JMP_A() macro is also imported from include/linux/filter.h +- dm-ioctl.h: set DM_VERSION_MINOR to 27 +- ethtool.h: define __KERNEL_DIV_ROUND_UP if not defined +- ethtool.h: add casts in ethtool_cmd_speed() diff --git a/src/shared/linux/auto_dev-ioctl.h b/src/shared/linux/auto_dev-ioctl.h new file mode 100644 index 0000000..c6b7e11 --- /dev/null +++ b/src/shared/linux/auto_dev-ioctl.h @@ -0,0 +1,220 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#ifndef _LINUX_AUTO_DEV_IOCTL_H +#define _LINUX_AUTO_DEV_IOCTL_H + +#include +#include + +#define AUTOFS_DEVICE_NAME "autofs" + +#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +/* + * An ioctl interface for autofs mount point control. + */ + +struct args_protover { + __u32 version; +}; + +struct args_protosubver { + __u32 sub_version; +}; + +struct args_openmount { + __u32 devid; +}; + +struct args_ready { + __u32 token; +}; + +struct args_fail { + __u32 token; + __s32 status; +}; + +struct args_setpipefd { + __s32 pipefd; +}; + +struct args_timeout { + __u64 timeout; +}; + +struct args_requester { + __u32 uid; + __u32 gid; +}; + +struct args_expire { + __u32 how; +}; + +struct args_askumount { + __u32 may_umount; +}; + +struct args_ismountpoint { + union { + struct args_in { + __u32 type; + } in; + struct args_out { + __u32 devid; + __u32 magic; + } out; + }; +}; + +/* + * All the ioctls use this structure. 
+ * When sending a path size must account for the total length + * of the chunk of memory otherwise it is the size of the + * structure. + */ + +struct autofs_dev_ioctl { + __u32 ver_major; + __u32 ver_minor; + __u32 size; /* total size of data passed in + * including this struct */ + __s32 ioctlfd; /* automount command fd */ + + /* Command parameters */ + + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; + + char path[]; +}; + +static __inline__ void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) +{ + memset(in, 0, AUTOFS_DEV_IOCTL_SIZE); + in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + in->size = AUTOFS_DEV_IOCTL_SIZE; + in->ioctlfd = -1; +} + +enum { + /* Get various version info */ + AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, + AUTOFS_DEV_IOCTL_PROTOVER_CMD, + AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, + + /* Open mount ioctl fd */ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, + + /* Close mount ioctl fd */ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, + + /* Mount/expire status returns */ + AUTOFS_DEV_IOCTL_READY_CMD, + AUTOFS_DEV_IOCTL_FAIL_CMD, + + /* Activate/deactivate autofs mount */ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, + AUTOFS_DEV_IOCTL_CATATONIC_CMD, + + /* Expiry timeout */ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, + + /* Get mount last requesting uid and gid */ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, + + /* Check for eligible expire candidates */ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, + + /* Request busy status */ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, + + /* Check if path is a mountpoint */ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, +}; + +#ifndef AUTOFS_IOCTL +#define AUTOFS_IOCTL 0x93 +#endif + +#define AUTOFS_DEV_IOCTL_VERSION \ + 
_IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOSUBVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_OPENMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_READY \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_FAIL \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_SETPIPEFD \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CATATONIC \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_TIMEOUT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_REQUESTER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_EXPIRE \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ASKUMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) + +#endif /* _LINUX_AUTO_DEV_IOCTL_H */ diff --git a/src/shared/linux/bpf.h b/src/shared/linux/bpf.h new file mode 100644 index 0000000..9f8af5e --- /dev/null +++ b/src/shared/linux/bpf.h @@ -0,0 +1,7053 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * 
This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __LINUX_BPF_H__ +#define __LINUX_BPF_H__ + +#include +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word (64-bit) */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +/* jmp encodings */ +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + 
+/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ +}; + +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; +}; + +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. 
+ */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. 
+ * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). 
+ * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allows the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. 
Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. 
+ * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. 
+ * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). 
+ * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map + * types, the *flags* argument needs to be set to 0, but for other + * map types, it may be specified as: + * + * **BPF_F_LOCK** + * Look up and delete the value of a spin-locked map + * without returning the lock. This must be specified if + * the elements contain a spinlock. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. 
+ * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * * **BPF_MAP_TYPE_HASH** + * * **BPF_MAP_TYPE_PERCPU_HASH** + * * **BPF_MAP_TYPE_LRU_HASH** + * * **BPF_MAP_TYPE_LRU_PERCPU_HASH** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. 
The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. 
+ * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed.
+ * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are deleted. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may have been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering.
+ * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. 
This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). 
+ */ /* Command selector for bpf(2); the individual commands are described in the comment that ends here. */ +enum bpf_cmd { + BPF_MAP_CREATE, + BPF_MAP_LOOKUP_ELEM, + BPF_MAP_UPDATE_ELEM, + BPF_MAP_DELETE_ELEM, + BPF_MAP_GET_NEXT_KEY, + BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, + BPF_PROG_RUN = BPF_PROG_TEST_RUN, /* alias: same value as BPF_PROG_TEST_RUN */ + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, + BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, + BPF_ITER_CREATE, + BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, +}; + /* Map kinds; bpf_attr.map_type for BPF_MAP_CREATE holds one of these values. */ +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated.
+ * The old name is kept below as an alias with the same value. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, +}; + +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + * + * bpf_attr.prog_type for BPF_PROG_LOAD holds one of these values. + */ +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ +}; + +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, + BPF_SK_SKB_STREAM_PARSER, +
BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, + BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, + BPF_LSM_MAC, + BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, + BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, + __MAX_BPF_ATTACH_TYPE /* not an attach type: last enumerator, equal to the number of entries above */ +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, + /* not a link type: one past the highest value defined above */ + MAX_BPF_LINK_TYPE, +}; + +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags have to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When a child program makes a decision (like picking TCP CA or sock bind) + * the parent program has a chance to override it. + * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) + +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2.
+ */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads and stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purposes. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence haven't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions which have been identified as safe to ignore. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. + */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + +/* The verifier internal test flag.
Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully supports xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX] + * insn[0].imm: map fd or fd_idx + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP + */ +#define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_IDX 5 + +/* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE + * insn[0].imm: map fd or fd_idx + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ +#define BPF_PSEUDO_MAP_VALUE 2 +#define BPF_PSEUDO_MAP_IDX_VALUE 6 + +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btf id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC.
+ */ +#define BPF_PSEUDO_FUNC 4 + +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 + +/* flags for BPF_MAP_UPDATE_ELEM command: BPF_ANY, BPF_NOEXIST and BPF_EXIST are plain values (0, 1, 2), BPF_F_LOCK is a separate bit (4) */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; + +/* flags for BPF_MAP_CREATE command (bpf_attr.map_flags); every flag below occupies its own bit */ +enum { + BPF_F_NO_PREALLOC = (1U << 0), +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ + BPF_F_NO_COMMON_LRU = (1U << 1), +/* Specify numa node during map creation (see bpf_attr.numa_node) */ + BPF_F_NUMA_NODE = (1U << 2), + +/* Flags for accessing BPF object from syscall side. */ + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), + +/* Flag for stack_map, store build_id+offset instead of pointer */ + BPF_F_STACK_BUILD_ID = (1U << 5), + +/* Zero-initialize hash function seed. This should only be used for testing. */ + BPF_F_ZERO_SEED = (1U << 6), + +/* Flags for accessing BPF object from program side. */ + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), + +/* Clone map from listener for newly accepted socket */ + BPF_F_CLONE = (1U << 9), + +/* Enable memory-mapping BPF map */ + BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), +}; + +/* Flags for BPF_PROG_QUERY.
*/ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are always returned as 0. + */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) + +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + +enum bpf_stack_build_id_status { + /* user space needs an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + +#define BPF_OBJ_NAME_LEN 16U + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set).
+ */ + char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). + */ + __u64 map_extra; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* not used */ + __u32 prog_flags; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc).
+ */ + __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; + __u32 core_relo_cnt; /* number of bpf_core_relo */ + __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ + }; + + struct { /* struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small.
+ */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; + __u32 batch_size; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + __u32 map_id; + __u32 btf_id; + __u32 link_id; + }; + __u32 next_id; + __u32 open_flags; + }; + + struct { /* struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; + + struct { /* struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; + } query; + + struct { /* struct used by BPF_RAW_TRACEPOINT_OPEN command */ + __u64 name; + __u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; + + struct { /* struct used by BPF_TASK_FD_QUERY command */ + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prog_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32
iter_info_len; /* iter_info length */ + }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; /* see BPF_F_KPROBE_MULTI_RETURN */ + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + __aligned_u64 cookies; + } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; + }; + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + /* new program fd to update link with */ + __u32 new_prog_fd; + __u32 flags; /* extra flags */ + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags */ + __u32 old_prog_fd; + } link_update; + + struct { /* struct used by BPF_LINK_DETACH command */ + __u32 link_fd; + } link_detach; + + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + +} __attribute__((aligned(8))); + +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page.
The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. 
+ * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * Return + * Current *ktime*. + * + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helper, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is + * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: <formatted msg> + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register.
+ * * ``<formatted msg>`` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program.
+ * + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. 
Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. 
+ * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. 
The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + * + * u64 bpf_get_current_uid_gid(void) + * Description + * Get the current uid and gid. + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * + * long bpf_get_current_comm(void *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * Return + * The classid, or 0 for the default unconfigured classid. + * + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure.
+ * + * long bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. 
+ * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. 
This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. 
+ * + * long bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm of the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed.
+ * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. 
Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. 
This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure.
+ * + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **bpf_skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program.
+ * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. 
+ * * A negative error code, if an error occurred. + * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * Return + * The 32-bit hash. + * + * u64 bpf_get_current_task(void) + * Description + * Get the current task. + * Return + * A pointer to the current task struct. + * + * long bpf_probe_write_user(void *dst, const void *src, u32 len) + * Description + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * Description + * Check whether the probe is being run in the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * Return + * The return value depends on the result of the test, and can be: + * + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + * + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access.
+ * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. 
+ * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. + * + * long bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. + * + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. 
This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * Return + * An 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** context. + * Return + * An 8-byte long unique number.
+ * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** context. + * Return + * An 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * An 8-byte long unique number or 0 if *sk* is NULL. + * + * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Description + * Get the owner UID of the socket associated to *skb*. + * Return + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + * + * long bpf_set_hash(struct sk_buff *skb, u32 hash) + * Description + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. + * Return + * 0 + * + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**.
+ * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). + * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. 
+ * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. 
+ * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * Return + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + * + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative).
Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. 
Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *optval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_override_return(struct pt_regs *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works.
+ * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g.
to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. 
If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program from being called again until *bytes* have + * been accumulated.
+ * Return + * 0 + * + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. 
+ * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. 
+ * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficiently large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*.
The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (i.e., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs.
+ * Return + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. 
+ * + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. 
+ * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be either + * positive (growing) or negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access.
+ * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for the previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**".
+ * Return + * 0 + * + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running.
+ * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + * + * void *bpf_get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage are defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * Users must take care of the synchronization themselves, + * for example by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * Return + * A pointer to the local storage area. + * + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved.
+ * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). 
+ * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_sk_release(void *sock) + * Description + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * long bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails); in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being a valid part of *msg* + * payload and/or *pop* value being too large. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program.
+ * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock makes it possible to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause deadlocks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value.
+ * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map value). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * long bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * Return + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + * + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+ * **bpf_sk_release**\ () is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. + * + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. 
"net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + * + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. 
+ * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. 
+ * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) + * Description + * Delete a bpf-local-storage from a *sk*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + * + * long bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. 
+ * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie, + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. 
See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + * + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. 
+ * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64-bit jiffies + * Return + * The 64 bit jiffies + * + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * Return + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exist for the current task. + * + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**.
This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * Return + * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ ().
+ * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. 
+ * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can be a combination of the following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * Return + * Current *ktime*. + * + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself.
The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(void *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). 
+ * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. + * Return + * 0 on success, or a negative error in case of failure. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. 
+ * *flags* must be 0. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queried is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation.
+ * Return + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. 
+ * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). 
To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. 
On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. + * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. 
+ * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. 
+ * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. 
+ * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. 
+ * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. 
The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. 
+ * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst*. + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows checking the MTU prior to changing packet + * ctx. Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure.
Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is used. When value *mtu_len* is + * provided as input this specifies the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlates to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often known as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2.
+ * The helper takes this into account for you, but remember when using + * MTU value in your BPF-code. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handling: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following is a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** returns 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**.
+ * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. + * + * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags) + * Description + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * Return + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + * + * long bpf_sys_close(u32 fd) + * Description + * Execute close syscall for given FD. + * Return + * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. 
+ * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. 
+ * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * Return + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. + * + * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) + * Description + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. 
Therefore, the user needs to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * Return + * On success, number of bytes written to *entries*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * struct unix_sock *bpf_skc_to_unix_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) + * Description + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * Return + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + * + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*.
+ * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * Return + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. 
+ * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * Return + * The number of argument registers of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * Return + * The BPF program's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. 
This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + * + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) + * Description + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). 
A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst*. + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer.
+ * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *ptr*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused.
+ * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + * + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not 0. + * + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * are out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie, + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid.
+ * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie, + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds.
This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. 
+ * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +#define ___BPF_FUNC_MAPPER(FN, ctx...) 
\ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, 
##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, ##ctx) \ + FN(sysctl_get_current_value, 102, ##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + 
FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, 
##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, ##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ + /* */ + +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, +enum 
bpf_func_id { + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) + __BPF_FUNC_MAX_ID, +}; +#undef __BPF_ENUM_FN + +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; + +/* BPF_FUNC_l4_csum_replace flags. */ +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +enum { + BPF_F_INGRESS = (1ULL << 0), +}; + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; + +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), +/* flags used by BPF_FUNC_get_stackid only. */ + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), +/* flags used by BPF_FUNC_get_stack only. */ + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; + +/* BPF_FUNC_skb_set_tunnel_key flags. */ +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; + +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, +/* BPF_FUNC_perf_event_output for sk_buff input context. */ + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; + +/* Current network namespace */ +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; + +/* BPF_FUNC_csum_level level values. 
*/ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + +/* BPF_FUNC_skb_adjust_room flags. */ +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), +}; + +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; + +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + +/* BPF_FUNC_sysctl_get_name flags. */ +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; + +/* BPF_FUNC_*_storage_get flags */ +enum { + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, +}; + +/* BPF_FUNC_read_branch_records flags. */ +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; + +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + +/* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ +enum bpf_adj_room_mode { + BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, +}; + +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, +}; + +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + +enum { + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ +}; + +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; + __u32 vlan_proto; + __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; + __u32 hash; + __u32 tc_classid; + __u32 data; + __u32 data_end; + __u32 napi_id; + + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + /* ... here. 
*/ + + __u32 data_meta; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; + __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; + __u8 tstamp_type; + __u32 :24; /* Padding, future use. */ + __u64 hwtstamp; +}; + +struct bpf_tunnel_key { + __u32 tunnel_id; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; + __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; +}; + +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + __u16 ext; /* Padding, future use. */ + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled separately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. 
+ */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, +}; + +struct bpf_sock { + __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; + __u32 mark; + __u32 priority; + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; + __s32 rx_queue_mapping; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. 
+ */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ +}; + +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + +struct bpf_xdp_sock { + __u32 queue_id; +}; + +#define XDP_PACKET_HEADROOM 256 + +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, + XDP_REDIRECT, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; + __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ +}; + +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
+ */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +enum sk_action { + SK_DROP = 0, + SK_PASS, +}; + +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ +}; + +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. 
The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); +}; + +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ + __u64 netns_dev; + __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; + __u64 recursion_misses; + __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 btf_vmlinux_value_type_id; + __u64 netns_dev; + __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 :32; /* alignment pad */ + __u64 map_extra; +} __attribute__((aligned(8))); + +struct bpf_btf_info { + 
__aligned_u64 btf; + __u32 btf_size; + __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; +} __attribute__((aligned(8))); + +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ + union { + struct { + __u32 map_id; + } map; + }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; + }; + } iter; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; + struct { + __u32 ifindex; + } xdp; + }; +} __attribute__((aligned(8))); + +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 
+ * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __bpf_md_ptr(struct bpf_sock *, sk); +}; + +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of these fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. 
+ */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ + __u64 skb_hwtstamp; +}; + +/* Definitions for bpf_sock_ops_cb_flags */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. 
The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. 
+ */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. 
+ * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! 
*/ +}; + +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min RTO in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for later use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent.
+ */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ +}; + +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; + +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; + +struct bpf_cgroup_dev_ctx { + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; + __u32 major; + __u32 minor; +}; + +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; + +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + +struct bpf_fib_lookup { + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + + /* output: MTU value */ + __u16 mtu_result; + }; + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 
ifindex; + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ + + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; + }; + + union { + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route + */ + union { + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; + +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { 
+ struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; + __u32 flags; + __be32 flow_label; +}; + +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + +struct bpf_spin_lock { + __u32 val; +}; + +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read and 4-byte write. + */ +}; + +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end.
*/ +struct bpf_sk_lookup { + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ + __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */ +}; + +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. 
+ */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ +}; + +/* + * "struct bpf_core_relo" is used to pass relocation data from LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field.
+ * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. 
+ * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +#endif /* __LINUX_BPF_H__ */ diff --git a/src/shared/linux/bpf_common.h b/src/shared/linux/bpf_common.h new file mode 100644 index 0000000..f0fe139 --- /dev/null +++ b/src/shared/linux/bpf_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_BPF_COMMON_H__ +#define __LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* __LINUX_BPF_COMMON_H__ */ diff --git a/src/shared/linux/bpf_insn.h b/src/shared/linux/bpf_insn.h new file mode 100644 index 0000000..92ec06b --- /dev/null +++ b/src/shared/linux/bpf_insn.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* eBPF 
instruction mini library */ +#ifndef __BPF_INSN_H +#define __BPF_INSN_H + +struct bpf_insn; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 
0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, 
\ + .off = OFF, \ + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. 
*/ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#endif diff --git a/src/shared/linux/dm-ioctl.h b/src/shared/linux/dm-ioctl.h new file mode 100644 index 0000000..19a64fc --- /dev/null +++ b/src/shared/linux/dm-ioctl.h @@ -0,0 +1,385 @@ +/* SPDX-License-Identifier: LGPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DM_IOCTL_V4_H +#define _LINUX_DM_IOCTL_V4_H + +#include <linux/types.h> + +#define DM_DIR "mapper" /* Slashes not supported */ +#define DM_CONTROL_NODE "control" +#define DM_MAX_TYPE_NAME 16 +#define DM_NAME_LEN 128 +#define DM_UUID_LEN 129 + +/* + * A traditional ioctl interface for the device mapper. + * + * Each device can have two tables associated with it, an + * 'active' table which is the one currently used by io passing + * through the device, and an 'inactive' one which is a table + * that is being prepared as a replacement for the 'active' one. + * + * DM_VERSION: + * Just get the version information for the ioctl interface. + * + * DM_REMOVE_ALL: + * Remove all dm devices, destroy all tables. Only really used + * for debug. + * + * DM_LIST_DEVICES: + * Get a list of all the dm device names.
+ * + * DM_DEV_CREATE: + * Create a new device, neither the 'active' or 'inactive' table + * slots will be filled. The device will be in suspended state + * after creation, however any io to the device will get errored + * since it will be out-of-bounds. + * + * DM_DEV_REMOVE: + * Remove a device, destroy any tables. + * + * DM_DEV_RENAME: + * Rename a device or set its uuid if none was previously supplied. + * + * DM_SUSPEND: + * This performs both suspend and resume, depending which flag is + * passed in. + * Suspend: This command will not return until all pending io to + * the device has completed. Further io will be deferred until + * the device is resumed. + * Resume: It is no longer an error to issue this command on an + * unsuspended device. If a table is present in the 'inactive' + * slot, it will be moved to the active slot, then the old table + * from the active slot will be _destroyed_. Finally the device + * is resumed. + * + * DM_DEV_STATUS: + * Retrieves the status for the table in the 'active' slot. + * + * DM_DEV_WAIT: + * Wait for a significant event to occur to the device. This + * could either be caused by an event triggered by one of the + * targets of the table in the 'active' slot, or a table change. + * + * DM_TABLE_LOAD: + * Load a table into the 'inactive' slot for the device. The + * device does _not_ need to be suspended prior to this command. + * + * DM_TABLE_CLEAR: + * Destroy any table in the 'inactive' slot (ie. abort). + * + * DM_TABLE_DEPS: + * Return a set of device dependencies for the 'active' table. + * + * DM_TABLE_STATUS: + * Return the targets status for the 'active' table. + * + * DM_TARGET_MSG: + * Pass a message string to the target at a specific offset of a device. 
+ * + * DM_DEV_SET_GEOMETRY: + * Set the geometry of a device by passing in a string in this format: + * + * "cylinders heads sectors_per_track start_sector" + * + * Beware that CHS geometry is nearly obsolete and only provided + * for compatibility with dm devices that can be booted by a PC + * BIOS. See struct hd_geometry for range limits. Also note that + * the geometry is erased if the device size changes. + */ + +/* + * All ioctl arguments consist of a single chunk of memory, with + * this structure at the start. If a uuid is specified any + * lookup (eg. for a DM_INFO) will be done on that, *not* the + * name. + */ +struct dm_ioctl { + /* + * The version number is made up of three parts: + * major - no backward or forward compatibility, + * minor - only backwards compatible, + * patch - both backwards and forwards compatible. + * + * All clients of the ioctl interface should fill in the + * version number of the interface that they were + * compiled with. + * + * All recognised ioctl commands (ie. those that don't + * return -ENOTTY) fill out this field, even if the + * command failed. + */ + __u32 version[3]; /* in/out */ + __u32 data_size; /* total size of data passed in + * including this struct */ + + __u32 data_start; /* offset to start of data + * relative to start of this struct */ + + __u32 target_count; /* in/out */ + __s32 open_count; /* out */ + __u32 flags; /* in/out */ + + /* + * event_nr holds either the event number (input and output) or the + * udev cookie value (input only). + * The DM_DEV_WAIT ioctl takes an event number as input. + * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls + * use the field as a cookie to return in the DM_COOKIE + * variable with the uevents they issue. + * For output, the ioctls return the event number, not the cookie. 
+ */ + __u32 event_nr; /* in/out */ + __u32 padding; + + __u64 dev; /* in/out */ + + char name[DM_NAME_LEN]; /* device name */ + char uuid[DM_UUID_LEN]; /* unique identifier for + * the block device */ + char data[7]; /* padding or data */ +}; + +/* + * Used to specify tables. These structures appear after the + * dm_ioctl. + */ +struct dm_target_spec { + __u64 sector_start; + __u64 length; + __s32 status; /* used when reading from kernel only */ + + /* + * Location of the next dm_target_spec. + * - When specifying targets on a DM_TABLE_LOAD command, this value is + * the number of bytes from the start of the "current" dm_target_spec + * to the start of the "next" dm_target_spec. + * - When retrieving targets on a DM_TABLE_STATUS command, this value + * is the number of bytes from the start of the first dm_target_spec + * (that follows the dm_ioctl struct) to the start of the "next" + * dm_target_spec. + */ + __u32 next; + + char target_type[DM_MAX_TYPE_NAME]; + + /* + * Parameter string starts immediately after this object. + * Be careful to add padding after string to ensure correct + * alignment of subsequent dm_target_spec. + */ +}; + +/* + * Used to retrieve the target dependencies. + */ +struct dm_target_deps { + __u32 count; /* Array size */ + __u32 padding; /* unused */ + __u64 dev[]; /* out */ +}; + +/* + * Used to get a list of all dm devices. + */ +struct dm_name_list { + __u64 dev; + __u32 next; /* offset to the next record from + the _start_ of this */ + char name[]; + + /* + * The following members can be accessed by taking a pointer that + * points immediately after the terminating zero character in "name" + * and aligning this pointer to next 8-byte boundary. + * Uuid is present if the flag DM_NAME_LIST_FLAG_HAS_UUID is set. 
+ * + * __u32 event_nr; + * __u32 flags; + * char uuid[0]; + */ +}; + +#define DM_NAME_LIST_FLAG_HAS_UUID 1 +#define DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID 2 + +/* + * Used to retrieve the target versions + */ +struct dm_target_versions { + __u32 next; + __u32 version[3]; + + char name[]; +}; + +/* + * Used to pass message to a target + */ +struct dm_target_msg { + __u64 sector; /* Device sector */ + + char message[]; +}; + +/* + * If you change this make sure you make the corresponding change + * to dm-ioctl.c:lookup_ioctl() + */ +enum { + /* Top level cmds */ + DM_VERSION_CMD = 0, + DM_REMOVE_ALL_CMD, + DM_LIST_DEVICES_CMD, + + /* device level cmds */ + DM_DEV_CREATE_CMD, + DM_DEV_REMOVE_CMD, + DM_DEV_RENAME_CMD, + DM_DEV_SUSPEND_CMD, + DM_DEV_STATUS_CMD, + DM_DEV_WAIT_CMD, + + /* Table level cmds */ + DM_TABLE_LOAD_CMD, + DM_TABLE_CLEAR_CMD, + DM_TABLE_DEPS_CMD, + DM_TABLE_STATUS_CMD, + + /* Added later */ + DM_LIST_VERSIONS_CMD, + DM_TARGET_MSG_CMD, + DM_DEV_SET_GEOMETRY_CMD, + DM_DEV_ARM_POLL_CMD, + DM_GET_TARGET_VERSION_CMD, +}; + +#define DM_IOCTL 0xfd + +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) +#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) + +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl) +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl) +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl) +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) +#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) +#define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_ARM_POLL_CMD, struct dm_ioctl) + +#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) +#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) +#define 
DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl) +#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) + +#define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) +#define DM_GET_TARGET_VERSION _IOWR(DM_IOCTL, DM_GET_TARGET_VERSION_CMD, struct dm_ioctl) + +#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) +#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) + +#define DM_VERSION_MAJOR 4 +#define DM_VERSION_MINOR 27 +#define DM_VERSION_PATCHLEVEL 0 +#define DM_VERSION_EXTRA "-ioctl (2022-02-22)" + +/* Status bits */ +#define DM_READONLY_FLAG (1 << 0) /* In/Out */ +#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */ +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */ + +/* + * Flag passed into ioctl STATUS command to get table information + * rather than current status. + */ +#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */ + +/* + * Flags that indicate whether a table is present in either of + * the two table slots that a device has. + */ +#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */ +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */ + +/* + * Indicates that the buffer passed in wasn't big enough for the + * results. + */ +#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ + +/* + * This flag is now ignored. + */ +#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ + +/* + * Set this to avoid attempting to freeze any filesystem when suspending. + */ +#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */ + +/* + * Set this to suspend without flushing queued ios. + * Also disables flushing uncommitted changes in the thin target before + * generating statistics for DM_TABLE_STATUS and DM_DEV_WAIT. + */ +#define DM_NOFLUSH_FLAG (1 << 11) /* In */ + +/* + * If set, any table information returned will relate to the inactive + * table instead of the live one. Always check DM_INACTIVE_PRESENT_FLAG + * is set before using the data returned. 
+ */ +#define DM_QUERY_INACTIVE_TABLE_FLAG (1 << 12) /* In */ + +/* + * If set, a uevent was generated for which the caller may need to wait. + */ +#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ + +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + +/* + * If set, all buffers are wiped after use. Use when sending + * or requesting sensitive data such as an encryption key. + */ +#define DM_SECURE_DATA_FLAG (1 << 15) /* In */ + +/* + * If set, a message generated output data. + */ +#define DM_DATA_OUT_FLAG (1 << 16) /* Out */ + +/* + * If set with DM_DEV_REMOVE or DM_REMOVE_ALL this indicates that if + * the device cannot be removed immediately because it is still in use + * it should instead be scheduled for removal when it gets closed. + * + * On return from DM_DEV_REMOVE, DM_DEV_STATUS or other ioctls, this + * flag indicates that the device is scheduled to be removed when it + * gets closed. + */ +#define DM_DEFERRED_REMOVE (1 << 17) /* In/Out */ + +/* + * If set, the device is suspended internally. + */ +#define DM_INTERNAL_SUSPEND_FLAG (1 << 18) /* Out */ + +/* + * If set, returns in the in buffer passed by UM, the raw table information + * that would be measured by IMA subsystem on device state change. + */ +#define DM_IMA_MEASUREMENT_FLAG (1 << 19) /* In */ + +#endif /* _LINUX_DM_IOCTL_V4_H */ diff --git a/src/shared/linux/ethtool.h b/src/shared/linux/ethtool.h new file mode 100644 index 0000000..3d1da51 --- /dev/null +++ b/src/shared/linux/ethtool.h @@ -0,0 +1,2164 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ethtool.h: Defines for Linux ethtool. + * + * Copyright (C) 1998 David S.
Miller (davem@redhat.com) + * Copyright 2001 Jeff Garzik + * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) + * Portions Copyright 2002 Intel (eli.kupermann@intel.com, + * christopher.leech@intel.com, + * scott.feldman@intel.com) + * Portions Copyright (C) Sun Microsystems 2008 + */ + +#ifndef _LINUX_ETHTOOL_H +#define _LINUX_ETHTOOL_H + +#include <linux/const.h> +#include <linux/types.h> +#include <linux/if_ether.h> + +#include <limits.h> /* for INT_MAX */ + +#ifndef __KERNEL_DIV_ROUND_UP +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +/* All structures exposed to userland should be defined such that they + * have the same layout for 32-bit and 64-bit userland. + */ + +/* Note on reserved space. + * Reserved fields must not be accessed directly by user space because + * they may be replaced by a different field in the future. They must + * be initialized to zero before making the request, e.g. via memset + * of the entire structure or implicitly by not being set in a structure + * initializer. + */ + +/** + * struct ethtool_cmd - DEPRECATED, link control and status + * This structure is DEPRECATED, please use struct ethtool_link_settings. + * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET + * @supported: Bitmask of %SUPPORTED_* flags for the link modes, + * physical connectors and other link features for which the + * interface supports autonegotiation or auto-detection. + * Read-only. + * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, + * physical connectors and other link features that are + * advertised through autonegotiation or enabled for + * auto-detection. + * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @transceiver: Historically used to distinguish different possible + * PHY types, but not in a consistent way.
Deprecated. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @maxtxpkt: Historically used to report TX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @maxrxpkt: Historically used to report RX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes + * and other link features that the link partner advertised + * through autonegotiation; 0 if unknown or not applicable. + * Read-only. + * @reserved: Reserved for future use; see the note on reserved space. + * + * The link speed in Mbps is split between @speed and @speed_hi. Use + * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to + * access it. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. 
+ * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GSET to get the current values before making specific + * changes and then applying them with %ETHTOOL_SSET. + * + * Deprecated fields should be ignored by both users and drivers. + */ +struct ethtool_cmd { + __u32 cmd; + __u32 supported; + __u32 advertising; + __u16 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 transceiver; + __u8 autoneg; + __u8 mdio_support; + __u32 maxtxpkt; + __u32 maxrxpkt; + __u16 speed_hi; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __u32 lp_advertising; + __u32 reserved[2]; +}; + +static __inline__ void ethtool_cmd_speed_set(struct ethtool_cmd *ep, + __u32 speed) +{ + ep->speed = (__u16)(speed & 0xFFFF); /* low 16 bits of the 32-bit speed go into @speed */ + ep->speed_hi = (__u16)(speed >> 16); /* high 16 bits go into @speed_hi */ +} + +static __inline__ __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) +{ + return ((__u32) ep->speed_hi << 16) | (__u32) ep->speed; /* recombine the two 16-bit halves into one 32-bit speed */ +} + +/* Device supports clause 22 register access to PHY or peripherals + * using the interface defined in <linux/mii.h>. This should not be + * set if there are known to be no such peripherals present or if + * the driver only emulates clause 22 registers for compatibility. + */ +#define ETH_MDIO_SUPPORTS_C22 1 + +/* Device supports clause 45 register access to PHY or peripherals + * using the interface defined in <linux/mii.h> and <linux/mdio.h>. + * This should not be set if there are known to be no such peripherals + * present.
+ */ +#define ETH_MDIO_SUPPORTS_C45 2 + +#define ETHTOOL_FWVERS_LEN 32 +#define ETHTOOL_BUSINFO_LEN 32 +#define ETHTOOL_EROMVERS_LEN 32 + +/** + * struct ethtool_drvinfo - general driver and device information + * @cmd: Command number = %ETHTOOL_GDRVINFO + * @driver: Driver short name. This should normally match the name + * in its bus driver structure (e.g. pci_driver::name). Must + * not be an empty string. + * @version: Driver version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string + * @bus_info: Device bus address. This should match the dev_name() + * string for the underlying bus device, if there is one. May be + * an empty string. + * @reserved2: Reserved for future use; see the note on reserved space. + * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and + * %ETHTOOL_SPFLAGS commands; also the number of strings in the + * %ETH_SS_PRIV_FLAGS set + * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS + * command; also the number of strings in the %ETH_SS_STATS set + * @testinfo_len: Number of results returned by the %ETHTOOL_TEST + * command; also the number of strings in the %ETH_SS_TEST set + * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM + * and %ETHTOOL_SEEPROM commands, in bytes + * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS + * command, in bytes + * + * Users can use the %ETHTOOL_GSSET_INFO command to get the number of + * strings in any string set (from Linux 2.6.34). 
+ */ +struct ethtool_drvinfo { + __u32 cmd; + char driver[32]; + char version[32]; + char fw_version[ETHTOOL_FWVERS_LEN]; + char bus_info[ETHTOOL_BUSINFO_LEN]; + char erom_version[ETHTOOL_EROMVERS_LEN]; + char reserved2[12]; + __u32 n_priv_flags; + __u32 n_stats; + __u32 testinfo_len; + __u32 eedump_len; + __u32 regdump_len; +}; + +#define SOPASS_MAX 6 + +/** + * struct ethtool_wolinfo - Wake-On-Lan configuration + * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL + * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes. + * Read-only. + * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. + * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE + * is set in @wolopts. + */ +struct ethtool_wolinfo { + __u32 cmd; + __u32 supported; + __u32 wolopts; + __u8 sopass[SOPASS_MAX]; +}; + +/* for passing single values */ +struct ethtool_value { + __u32 cmd; + __u32 data; +}; + +#define PFC_STORM_PREVENTION_AUTO 0xffff +#define PFC_STORM_PREVENTION_DISABLE 0 + +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ + ETHTOOL_TX_COPYBREAK_BUF_SIZE, + /* + * Add your fresh new tunable attribute above and remember to update + * tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_TUNABLE_COUNT, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[]; +}; + +#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff +#define DOWNSHIFT_DEV_DISABLE 0 + +/* Time in msecs after which link is reported as down + * 0 = lowest time supported by the PHY + * 0xff = off, link down detection according to standard + */ +#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 
+#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff + +/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where + * the PHY's RX & TX blocks are put into a low-power mode when there is no + * link detected (typically cable is un-plugged). For RX, only a minimal + * link-detection is available, and for TX the PHY wakes up to send link pulses + * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. + * + * Some PHYs may support configuration of the wake-up interval for TX pulses, + * and some PHYs may support only disabling TX pulses entirely. For the latter + * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be + * configured from userspace (should the user want it). + * + * The interval units for TX wake-up are in milliseconds, since this should + * cover a reasonable range of intervals: + * - from 1 millisecond, which does not sound like much of a power-saver + * - to ~65 seconds which is quite a lot to wait for a link to come up when + * plugging a cable + */ +#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff +#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe +#define ETHTOOL_PHY_EDPD_DISABLE 0 + +enum phy_tunable_id { + ETHTOOL_PHY_ID_UNSPEC, + ETHTOOL_PHY_DOWNSHIFT, + ETHTOOL_PHY_FAST_LINK_DOWN, + ETHTOOL_PHY_EDPD, + /* + * Add your fresh new phy tunable attribute above and remember to update + * phy_tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_PHY_TUNABLE_COUNT, +}; + +/** + * struct ethtool_regs - hardware register dump + * @cmd: Command number = %ETHTOOL_GREGS + * @version: Dump format version. This is driver-specific and may + * distinguish different chips/revisions. Drivers must use new + * version numbers whenever the dump format changes in an + * incompatible way. + * @len: On entry, the real length of @data. On return, the number of + * bytes used. 
+ * @data: Buffer for the register dump + * + * Users should use %ETHTOOL_GDRVINFO to find the maximum length of + * a register dump for the interface. They must allocate the buffer + * immediately following this structure. + */ +struct ethtool_regs { + __u32 cmd; + __u32 version; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eeprom - EEPROM dump + * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or + * %ETHTOOL_SEEPROM + * @magic: A 'magic cookie' value to guard against accidental changes. + * The value passed in to %ETHTOOL_SEEPROM must match the value + * returned by %ETHTOOL_GEEPROM for the same device. This is + * unused when @cmd is %ETHTOOL_GMODULEEEPROM. + * @offset: Offset within the EEPROM to begin reading/writing, in bytes + * @len: On entry, number of bytes to read/write. On successful + * return, number of bytes actually read/written. In case of + * error, this may indicate at what point the error occurred. + * @data: Buffer to read/write from + * + * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find + * the length of an on-board or module EEPROM, respectively. They + * must allocate the buffer immediately following this structure. + */ +struct ethtool_eeprom { + __u32 cmd; + __u32 magic; + __u32 offset; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eee - Energy Efficient Ethernet information + * @cmd: ETHTOOL_{G,S}EEE + * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations + * for which there is EEE support. + * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations + * advertised as eee capable. + * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex + * combinations advertised by the link partner as eee capable. + * @eee_active: Result of the eee auto negotiation. + * @eee_enabled: EEE configured mode (enabled/disabled). + * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given + * that eee was negotiated. 
+ * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting + * its tx lpi (after reaching 'idle' state). Effective only when eee + * was negotiated and tx_lpi_enabled was set. + * @reserved: Reserved for future use; see the note on reserved space. + */ +struct ethtool_eee { + __u32 cmd; + __u32 supported; + __u32 advertised; + __u32 lp_advertised; + __u32 eee_active; + __u32 eee_enabled; + __u32 tx_lpi_enabled; + __u32 tx_lpi_timer; + __u32 reserved[2]; +}; + +/** + * struct ethtool_modinfo - plugin module eeprom information + * @cmd: %ETHTOOL_GMODULEINFO + * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx + * @eeprom_len: Length of the eeprom + * @reserved: Reserved for future use; see the note on reserved space. + * + * This structure is used to return the information to + * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. + * The type code indicates the eeprom data format + */ +struct ethtool_modinfo { + __u32 cmd; + __u32 type; + __u32 eeprom_len; + __u32 reserved[8]; +}; + +/** + * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates + * @cmd: ETHTOOL_{G,S}COALESCE + * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after + * a packet arrives. + * @rx_max_coalesced_frames: Maximum number of packets to receive + * before an RX interrupt. + * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after + * a packet is sent. + * @tx_max_coalesced_frames: Maximum number of packets to be sent + * before a TX interrupt. + * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. 
+ * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @stats_block_coalesce_usecs: How many usecs to delay in-memory + * statistics block updates. Some drivers do not have an + * in-memory statistic block, and in such cases this value is + * ignored. This value must not be zero. + * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing. + * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing. + * @pkt_rate_low: Threshold for low packet rate (packets per second). + * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is below @pkt_rate_low. + * @rx_max_coalesced_frames_low: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is below @pkt_rate_low. + * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is below @pkt_rate_low. + * @tx_max_coalesced_frames_low: Maximum number of packets to be sent before + * a TX interrupt, when the packet rate is below @pkt_rate_low. + * @pkt_rate_high: Threshold for high packet rate (packets per second). + * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is above @pkt_rate_high. + * @rx_max_coalesced_frames_high: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is above @pkt_rate_high. + * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is above @pkt_rate_high. + * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before + * a TX interrupt, when the packet rate is above @pkt_rate_high. + * @rate_sample_interval: How often to do adaptive coalescing packet rate + * sampling, measured in seconds. Must not be zero.
+ * + * Each pair of (usecs, max_frames) fields specifies that interrupts + * should be coalesced until + * (usecs > 0 && time_since_first_completion >= usecs) || + * (max_frames > 0 && completed_frames >= max_frames) + * + * It is illegal to set both usecs and max_frames to zero as this + * would cause interrupts to never be generated. To disable + * coalescing, set usecs = 0 and max_frames = 1. + * + * Some implementations ignore the value of max_frames and use the + * condition time_since_first_completion >= usecs + * + * This is deprecated. Drivers for hardware that does not support + * counting completions should validate that max_frames == !rx_usecs. + * + * Adaptive RX/TX coalescing is an algorithm implemented by some + * drivers to improve latency under low packet rates and improve + * throughput under high packet rates. Some drivers only implement + * one of RX or TX adaptive coalescing. Anything not implemented by + * the driver causes these values to be silently ignored. + * + * When the packet rate is below @pkt_rate_high but above + * @pkt_rate_low (both measured in packets per second) the + * normal {rx,tx}_* coalescing parameters are used. 
+ */ +struct ethtool_coalesce { + __u32 cmd; + __u32 rx_coalesce_usecs; + __u32 rx_max_coalesced_frames; + __u32 rx_coalesce_usecs_irq; + __u32 rx_max_coalesced_frames_irq; + __u32 tx_coalesce_usecs; + __u32 tx_max_coalesced_frames; + __u32 tx_coalesce_usecs_irq; + __u32 tx_max_coalesced_frames_irq; + __u32 stats_block_coalesce_usecs; + __u32 use_adaptive_rx_coalesce; + __u32 use_adaptive_tx_coalesce; + __u32 pkt_rate_low; + __u32 rx_coalesce_usecs_low; + __u32 rx_max_coalesced_frames_low; + __u32 tx_coalesce_usecs_low; + __u32 tx_max_coalesced_frames_low; + __u32 pkt_rate_high; + __u32 rx_coalesce_usecs_high; + __u32 rx_max_coalesced_frames_high; + __u32 tx_coalesce_usecs_high; + __u32 tx_max_coalesced_frames_high; + __u32 rate_sample_interval; +}; + +/** + * struct ethtool_ringparam - RX/TX ring parameters + * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM + * @rx_max_pending: Maximum supported number of pending entries per + * RX ring. Read-only. + * @rx_mini_max_pending: Maximum supported number of pending entries + * per RX mini ring. Read-only. + * @rx_jumbo_max_pending: Maximum supported number of pending entries + * per RX jumbo ring. Read-only. + * @tx_max_pending: Maximum supported number of pending entries per + * TX ring. Read-only. + * @rx_pending: Current maximum number of pending entries per RX ring + * @rx_mini_pending: Current maximum number of pending entries per RX + * mini ring + * @rx_jumbo_pending: Current maximum number of pending entries per RX + * jumbo ring + * @tx_pending: Current maximum supported number of pending entries + * per TX ring + * + * If the interface does not have separate RX mini and/or jumbo rings, + * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0. + * + * There may also be driver-dependent minimum values for the number + * of entries per ring. 
+ */ +struct ethtool_ringparam { + __u32 cmd; + __u32 rx_max_pending; + __u32 rx_mini_max_pending; + __u32 rx_jumbo_max_pending; + __u32 tx_max_pending; + __u32 rx_pending; + __u32 rx_mini_pending; + __u32 rx_jumbo_pending; + __u32 tx_pending; +}; + +/** + * struct ethtool_channels - configuring number of network channel + * @cmd: ETHTOOL_{G,S}CHANNELS + * @max_rx: Read only. Maximum number of receive channel the driver support. + * @max_tx: Read only. Maximum number of transmit channel the driver support. + * @max_other: Read only. Maximum number of other channel the driver support. + * @max_combined: Read only. Maximum number of combined channel the driver + * support. Set of queues RX, TX or other. + * @rx_count: Valid values are in the range 1 to the max_rx. + * @tx_count: Valid values are in the range 1 to the max_tx. + * @other_count: Valid values are in the range 1 to the max_other. + * @combined_count: Valid values are in the range 1 to the max_combined. + * + * This can be used to configure RX, TX and other channels. + */ + +struct ethtool_channels { + __u32 cmd; + __u32 max_rx; + __u32 max_tx; + __u32 max_other; + __u32 max_combined; + __u32 rx_count; + __u32 tx_count; + __u32 other_count; + __u32 combined_count; +}; + +/** + * struct ethtool_pauseparam - Ethernet pause (flow control) parameters + * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM + * @autoneg: Flag to enable autonegotiation of pause frame use + * @rx_pause: Flag to enable reception of pause frames + * @tx_pause: Flag to enable transmission of pause frames + * + * Drivers should reject a non-zero setting of @autoneg when + * autonegotiation is disabled (or not supported) for the link. + * + * If the link is autonegotiated, drivers should use + * mii_advertise_flowctrl() or similar code to set the advertised + * pause frame capabilities based on the @rx_pause and @tx_pause flags, + * even if @autoneg is zero.
They should also allow the advertised + * pause frame capabilities to be controlled directly through the + * advertising field of &struct ethtool_cmd. + * + * If @autoneg is non-zero, the MAC is configured to send and/or + * receive pause frames according to the result of autonegotiation. + * Otherwise, it is configured directly based on the @rx_pause and + * @tx_pause flags. + */ +struct ethtool_pauseparam { + __u32 cmd; + __u32 autoneg; + __u32 rx_pause; + __u32 tx_pause; +}; + +/* Link extended state */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, + ETHTOOL_LINK_EXT_STATE_MODULE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. + */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. 
+ */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. + */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. 
*/ +enum ethtool_link_ext_substate_module { + ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, +}; + +#define ETH_GSTRING_LEN 32 + +/** + * enum ethtool_stringset - string set ID + * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST + * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS + * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with + * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS + * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; + * now deprecated + * @ETH_SS_FEATURES: Device feature names + * @ETH_SS_RSS_HASH_FUNCS: RSS hash function names + * @ETH_SS_TUNABLES: tunable names + * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS + * @ETH_SS_PHY_TUNABLES: PHY tunable names + * @ETH_SS_LINK_MODES: link mode names + * @ETH_SS_MSG_CLASSES: debug message class names + * @ETH_SS_WOL_MODES: wake-on-lan modes + * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags + * @ETH_SS_TS_TX_TYPES: timestamping Tx types + * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types + * @ETH_SS_STATS_STD: standardized stats + * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics + * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics + * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics + * @ETH_SS_STATS_RMON: names of RMON statistics + * + * @ETH_SS_COUNT: number of defined string sets + */ +enum ethtool_stringset { + ETH_SS_TEST = 0, + ETH_SS_STATS, + ETH_SS_PRIV_FLAGS, + ETH_SS_NTUPLE_FILTERS, + ETH_SS_FEATURES, + ETH_SS_RSS_HASH_FUNCS, + ETH_SS_TUNABLES, + ETH_SS_PHY_STATS, + ETH_SS_PHY_TUNABLES, + ETH_SS_LINK_MODES, + ETH_SS_MSG_CLASSES, + ETH_SS_WOL_MODES, + ETH_SS_SOF_TIMESTAMPING, + ETH_SS_TS_TX_TYPES, + ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, + ETH_SS_STATS_STD, + ETH_SS_STATS_ETH_PHY, + ETH_SS_STATS_ETH_MAC, + ETH_SS_STATS_ETH_CTRL, + ETH_SS_STATS_RMON, + + /* add new constants above here */ + ETH_SS_COUNT +}; + +/** + * enum
ethtool_module_power_mode_policy - plug-in module power mode policy + * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. + * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host + * to high power mode when the first port using it is put administratively + * up and to low power mode when the last port using it is put + * administratively down. + */ +enum ethtool_module_power_mode_policy { + ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, +}; + +/** + * enum ethtool_module_power_mode - plug-in module power mode + * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. + * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. + */ +enum ethtool_module_power_mode { + ETHTOOL_MODULE_POWER_MODE_LOW = 1, + ETHTOOL_MODULE_POWER_MODE_HIGH, +}; + +/** + * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are + * unknown + * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled + * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled + */ +enum ethtool_podl_pse_admin_state { + ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, +}; + +/** + * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is + * asserted true when the PoDL PSE state diagram variable mr_pse_enable is + * false" + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is + * asserted true when either of the PSE state diagram variables + * pi_detecting or pi_classifying is true." 
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” + * is asserted true when the PoDL PSE state diagram variable pi_powered is + * true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted + * true when the PoDL PSE state diagram variable pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true + * when the logical combination of the PoDL PSE state diagram variables + * pi_prebiased*!pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted + * true when the PoDL PSE state diagram variable overload_held is true." + */ +enum ethtool_podl_pse_pw_d_status { + ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, + ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, + ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, + ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, + ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, + ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, + ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, +}; + +/** + * struct ethtool_gstrings - string set for data tagging + * @cmd: Command number = %ETHTOOL_GSTRINGS + * @string_set: String set ID; one of &enum ethtool_stringset + * @len: On return, the number of strings in the string set + * @data: Buffer for strings. Each string is null-padded to a size of + * %ETH_GSTRING_LEN. + * + * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in + * the string set. They must allocate a buffer of the appropriate + * size immediately following this structure. + */ +struct ethtool_gstrings { + __u32 cmd; + __u32 string_set; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_sset_info - string set information + * @cmd: Command number = %ETHTOOL_GSSET_INFO + * @reserved: Reserved for future use; see the note on reserved space. + * @sset_mask: On entry, a bitmask of string sets to query, with bits + * numbered according to &enum ethtool_stringset. On return, a + * bitmask of those string sets queried that are supported. 
+ * @data: Buffer for string set sizes. On return, this contains the + * size of each string set that was queried and supported, in + * order of ID. + * + * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on + * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the + * size of set 1 and @data[1] contains the size of set 2. + * + * Users must allocate a buffer of the appropriate size (4 * number of + * sets queried) immediately following this structure. + */ +struct ethtool_sset_info { + __u32 cmd; + __u32 reserved; + __u64 sset_mask; + __u32 data[]; +}; + +/** + * enum ethtool_test_flags - flags definition of ethtool_test + * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise + * only online tests. + * @ETH_TEST_FL_FAILED: Driver set this flag if test fails. + * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback + * test. + * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test + */ + +enum ethtool_test_flags { + ETH_TEST_FL_OFFLINE = (1 << 0), + ETH_TEST_FL_FAILED = (1 << 1), + ETH_TEST_FL_EXTERNAL_LB = (1 << 2), + ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3), +}; + +/** + * struct ethtool_test - device self-test invocation + * @cmd: Command number = %ETHTOOL_TEST + * @flags: A bitmask of flags from &enum ethtool_test_flags. Some + * flags may be set by the user on entry; others may be set by + * the driver on return. + * @reserved: Reserved for future use; see the note on reserved space. + * @len: On return, the number of test results + * @data: Array of test results + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of test results that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of results) immediately + * following this structure. 
+ */ +struct ethtool_test { + __u32 cmd; + __u32 flags; + __u32 reserved; + __u32 len; + __u64 data[]; +}; + +/** + * struct ethtool_stats - device-specific statistics + * @cmd: Command number = %ETHTOOL_GSTATS + * @n_stats: On return, the number of statistics + * @data: Array of statistics + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of statistics that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of statistics) + * immediately following this structure. + */ +struct ethtool_stats { + __u32 cmd; + __u32 n_stats; + __u64 data[]; +}; + +/** + * struct ethtool_perm_addr - permanent hardware address + * @cmd: Command number = %ETHTOOL_GPERMADDR + * @size: On entry, the size of the buffer. On return, the size of the + * address. The command fails if the buffer is too small. + * @data: Buffer for the address + * + * Users must allocate the buffer immediately following this structure. + * A buffer size of %MAX_ADDR_LEN should be sufficient for any address + * type. + */ +struct ethtool_perm_addr { + __u32 cmd; + __u32 size; + __u8 data[]; +}; + +/* boolean flags controlling per-interface behavior characteristics. + * When reading, the flag indicates whether or not a certain behavior + * is enabled/present. When writing, the flag indicates whether + * or not the driver should turn on (set) or off (clear) a behavior. + * + * Some behaviors may read-only (unconditionally absent or present). + * If such is the case, return EINVAL in the set-flags operation if the + * flag differs from the read-only value. 
+ */ +enum ethtool_flags { + ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */ + ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */ + ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ + ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */ + ETH_FLAG_RXHASH = (1 << 28), +}; + +/* The following structures are for supporting RX network flow + * classification and RX n-tuple configuration. Note, all multibyte + * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to + * be in network byte order. + */ + +/** + * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. + * @ip4src: Source host + * @ip4dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tos: Type-of-service + * + * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow. + */ +struct ethtool_tcpip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be16 psrc; + __be16 pdst; + __u8 tos; +}; + +/** + * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @spi: Security parameters index + * @tos: Type-of-service + * + * This can be used to specify an IPsec transport or tunnel over IPv4. + */ +struct ethtool_ah_espip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 spi; + __u8 tos; +}; + +#define ETH_RX_NFC_IP4 1 + +/** + * struct ethtool_usrip4_spec - general flow specification for IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tos: Type-of-service + * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 + * @proto: Transport protocol number; mask must be 0 + */ +struct ethtool_usrip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 l4_4_bytes; + __u8 tos; + __u8 ip_ver; + __u8 proto; +}; + +/** + * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. 
+ * @ip6src: Source host + * @ip6dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tclass: Traffic Class + * + * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. + */ +struct ethtool_tcpip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be16 psrc; + __be16 pdst; + __u8 tclass; +}; + +/** + * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @spi: Security parameters index + * @tclass: Traffic Class + * + * This can be used to specify an IPsec transport or tunnel over IPv6. + */ +struct ethtool_ah_espip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 spi; + __u8 tclass; +}; + +/** + * struct ethtool_usrip6_spec - general flow specification for IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tclass: Traffic Class + * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) + */ +struct ethtool_usrip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 l4_4_bytes; + __u8 tclass; + __u8 l4_proto; +}; + +union ethtool_flow_union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethtool_tcpip6_spec tcp_ip6_spec; + struct ethtool_tcpip6_spec udp_ip6_spec; + struct ethtool_tcpip6_spec sctp_ip6_spec; + struct ethtool_ah_espip6_spec ah_ip6_spec; + struct ethtool_ah_espip6_spec esp_ip6_spec; + struct ethtool_usrip6_spec usr_ip6_spec; + struct ethhdr ether_spec; + __u8 hdata[52]; +}; + +/** + * struct ethtool_flow_ext - additional RX flow fields + * @h_dest: destination MAC address + * @vlan_etype: VLAN EtherType + * @vlan_tci: VLAN tag control information + * @data: user defined data + * @padding: Reserved for future use; 
see the note on reserved space. + * + * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT + * is set in &struct ethtool_rx_flow_spec @flow_type. + * @h_dest is valid if %FLOW_MAC_EXT is set. + */ +struct ethtool_flow_ext { + __u8 padding[2]; + unsigned char h_dest[ETH_ALEN]; + __be16 vlan_etype; + __be16 vlan_tci; + __be32 data[2]; +}; + +/** + * struct ethtool_rx_flow_spec - classification rule for RX flows + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow fields to match (dependent on @flow_type) + * @h_ext: Additional fields to match + * @m_u: Masks for flow field bits to be matched + * @m_ext: Masks for additional field bits to be matched + * Note, all additional fields must be ignored unless @flow_type + * includes the %FLOW_EXT or %FLOW_MAC_EXT flag + * (see &struct ethtool_flow_ext description). + * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC + * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the + * packets should be used for Wake-on-LAN with %WAKE_FILTER + * @location: Location of rule in the table. Locations must be + * numbered such that a flow matching multiple rules will be + * classified according to the first (lowest numbered) rule. + */ +struct ethtool_rx_flow_spec { + __u32 flow_type; + union ethtool_flow_union h_u; + struct ethtool_flow_ext h_ext; + union ethtool_flow_union m_u; + struct ethtool_flow_ext m_ext; + __u64 ring_cookie; + __u32 location; +}; + +/* How rings are laid out when accessing virtual functions or + * offloaded queues is device specific. To allow users to do flow + * steering and specify these queues the ring cookie is partitioned + * into a 32-bit queue index with an 8 bit virtual function id. + * This also leaves the 3bytes for further specifiers. It is possible + * future devices may support more than 256 virtual functions if + * devices start supporting PCIe w/ARI. 
However at the moment I + * do not know of any devices that support this so I do not reserve + * space for this at this time. If a future patch consumes the next + * byte it should be aware of this possibility. + */ +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static __inline__ __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +} + +static __inline__ __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +} + +/** + * struct ethtool_rxnfc - command to get or set RX flow classification rules + * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, + * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, + * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS + * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW + * @data: Command-dependent value + * @fs: Flow classification rule + * @rss_context: RSS context to be affected + * @rule_cnt: Number of rules to be affected + * @rule_locs: Array of used rule locations + * + * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating + * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following + * structure fields must not be used, except that if @flow_type includes + * the %FLOW_RSS flag, then @rss_context determines which RSS context to + * act on. + * + * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues + * on return. + * + * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined + * rules on return. If @data is non-zero on return then it is the + * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the + * driver supports any special location values. If that flag is not + * set in @data then special location values should not be used. 
+ * + * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an + * existing rule on entry and @fs contains the rule on return; if + * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is + * filled with the RSS context ID associated with the rule. + * + * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the + * user buffer for @rule_locs on entry. On return, @data is the size + * of the rule table, @rule_cnt is the number of defined rules, and + * @rule_locs contains the locations of the defined rules. Drivers + * must use the second parameter to get_rxnfc() instead of @rule_locs. + * + * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. + * @fs.@location either specifies the location to use or is a special + * location value with %RX_CLS_LOC_SPECIAL flag set. On return, + * @fs.@location is the actual rule location. If @fs.@flow_type + * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to + * use for flow spreading traffic which matches this rule. The value + * from the rxfh indirection table will be added to @fs.@ring_cookie + * to choose which ring to deliver to. + * + * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an + * existing rule on entry. + * + * A driver supporting the special location values for + * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused + * location, and may remove a rule at a later location (lower + * priority) that matches exactly the same set of flows. The special + * values are %RX_CLS_LOC_ANY, selecting any location; + * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum + * priority); and %RX_CLS_LOC_LAST, selecting the last suitable + * location (minimum priority). Additional special values may be + * defined in future and drivers must return -%EINVAL for any + * unrecognised value. 
+ */ +struct ethtool_rxnfc { + __u32 cmd; + __u32 flow_type; + __u64 data; + struct ethtool_rx_flow_spec fs; + union { + __u32 rule_cnt; + __u32 rss_context; + }; + __u32 rule_locs[0]; +}; + + +/** + * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection + * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR + * @size: On entry, the array size of the user buffer, which may be zero. + * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware + * indirection table. + * @ring_index: RX ring/queue index for each hash value + * + * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size + * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means + * the table should be reset to default values. This last feature + * is not supported by the original implementations. + */ +struct ethtool_rxfh_indir { + __u32 cmd; + __u32 size; + __u32 ring_index[]; +}; + +/** + * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. + * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command + * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @indir_size: On entry, the array size of the user buffer for the + * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), + * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, + * the array size of the hardware indirection table. + * @key_size: On entry, the array size of the user buffer for the hash key, + * which may be zero. On return from %ETHTOOL_GRSSH, the size of the + * hardware hash key. + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. 
+ * @rsvd8: Reserved for future use; see the note on reserved space. + * @rsvd32: Reserved for future use; see the note on reserved space. + * @rss_config: RX ring/queue index for each hash value i.e., indirection table + * of @indir_size __u32 elements, followed by hash key of @key_size + * bytes. + * + * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the + * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of + * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested + * and a @indir_size of zero means the indir table should be reset to default + * values (if @rss_context == 0) or that the RSS context should be deleted. + * An hfunc of zero means that hash function setting is not requested. + */ +struct ethtool_rxfh { + __u32 cmd; + __u32 rss_context; + __u32 indir_size; + __u32 key_size; + __u8 hfunc; + __u8 rsvd8[3]; + __u32 rsvd32; + __u32 rss_config[]; +}; +#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff +#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff + +/** + * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow field values to match (dependent on @flow_type) + * @m_u: Masks for flow field value bits to be ignored + * @vlan_tag: VLAN tag to match + * @vlan_tag_mask: Mask for VLAN tag bits to be ignored + * @data: Driver-dependent data to match + * @data_mask: Mask for driver-dependent data bits to be ignored + * @action: RX ring/queue index to deliver to (non-negative) or other action + * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) + * + * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where + * a field value and mask are both zero this is treated as if all mask + * bits are set i.e. the field is ignored. 
+ */ +struct ethtool_rx_ntuple_flow_spec { + __u32 flow_type; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethhdr ether_spec; + __u8 hdata[72]; + } h_u, m_u; + + __u16 vlan_tag; + __u16 vlan_tag_mask; + __u64 data; + __u64 data_mask; + + __s32 action; +#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ +#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ +}; + +/** + * struct ethtool_rx_ntuple - command to set or clear RX flow filter + * @cmd: Command number - %ETHTOOL_SRXNTUPLE + * @fs: Flow filter specification + */ +struct ethtool_rx_ntuple { + __u32 cmd; + struct ethtool_rx_ntuple_flow_spec fs; +}; + +#define ETHTOOL_FLASH_MAX_FILENAME 128 +enum ethtool_flash_op_type { + ETHTOOL_FLASH_ALL_REGIONS = 0, +}; + +/* for passing firmware flashing related parameters */ +struct ethtool_flash { + __u32 cmd; + __u32 region; + char data[ETHTOOL_FLASH_MAX_FILENAME]; +}; + +/** + * struct ethtool_dump - used for retrieving, setting device dump + * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or + * %ETHTOOL_SET_DUMP + * @version: FW version of the dump, filled in by driver + * @flag: driver dependent flag for dump setting, filled in by driver during + * get and filled in by ethtool for set operation. + * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when + * firmware dump is disabled. 
+ * @len: length of dump data, used as the length of the user buffer on entry to + * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver + * for %ETHTOOL_GET_DUMP_FLAG command + * @data: data collected for get dump data operation + */ +struct ethtool_dump { + __u32 cmd; + __u32 version; + __u32 flag; + __u32 len; + __u8 data[]; +}; + +#define ETH_FW_DUMP_DISABLE 0 + +/* for returning and changing feature sets */ + +/** + * struct ethtool_get_features_block - block with state of 32 features + * @available: mask of changeable features + * @requested: mask of features requested to be enabled if possible + * @active: mask of currently enabled features + * @never_changed: mask of features not changeable for any device + */ +struct ethtool_get_features_block { + __u32 available; + __u32 requested; + __u32 active; + __u32 never_changed; +}; + +/** + * struct ethtool_gfeatures - command to get state of device's features + * @cmd: command number = %ETHTOOL_GFEATURES + * @size: On entry, the number of elements in the features[] array; + * on return, the number of elements in features[] needed to hold + * all features + * @features: state of features + */ +struct ethtool_gfeatures { + __u32 cmd; + __u32 size; + struct ethtool_get_features_block features[]; +}; + +/** + * struct ethtool_set_features_block - block with request for 32 features + * @valid: mask of features to be changed + * @requested: values of features to be changed + */ +struct ethtool_set_features_block { + __u32 valid; + __u32 requested; +}; + +/** + * struct ethtool_sfeatures - command to request change in device's features + * @cmd: command number = %ETHTOOL_SFEATURES + * @size: array size of the features[] array + * @features: feature change masks + */ +struct ethtool_sfeatures { + __u32 cmd; + __u32 size; + struct ethtool_set_features_block features[]; +}; + +/** + * struct ethtool_ts_info - holds a device's timestamping and PHC association + * @cmd: command number = %ETHTOOL_GET_TS_INFO 
+ * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags + * @phc_index: device index of the associated PHC, or -1 if there is none + * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values + * @tx_reserved: Reserved for future use; see the note on reserved space. + * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values + * @rx_reserved: Reserved for future use; see the note on reserved space. + * + * The bits in the 'tx_types' and 'rx_filters' fields correspond to + * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, + * respectively. For example, if the device supports HWTSTAMP_TX_ON, + * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. + * + * Drivers should only report the filters they actually support without + * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWSTAMP request for + * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the + * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. + */ +struct ethtool_ts_info { + __u32 cmd; + __u32 so_timestamping; + __s32 phc_index; + __u32 tx_types; + __u32 tx_reserved[3]; + __u32 rx_filters; + __u32 rx_reserved[3]; +}; + +/* + * %ETHTOOL_SFEATURES changes features present in features[].valid to the + * values of corresponding bits in features[].requested. Bits in .requested + * not set in .valid or not changeable are ignored. + * + * Returns %EINVAL when .valid contains undefined or never-changeable bits + * or size is not equal to required number of features words (32-bit blocks). + * Returns >= 0 if request was completed; bits set in the value mean: + * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not + * changeable (not present in %ETHTOOL_GFEATURES' features[].available) + * those bits were ignored. + * %ETHTOOL_F_WISH - some or all changes requested were recorded but the + * resulting state of bits masked by .valid is not equal to .requested. 
+ * Probably there are other device-specific constraints on some features + * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered + * here as though ignored bits were cleared. + * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling + * compatibility functions. Requested offload state cannot be properly + * managed by kernel. + * + * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of + * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands + * for ETH_SS_FEATURES string set. First entry in the table corresponds to least + * significant bit in features[0] fields. Empty strings mark undefined features. + */ +enum ethtool_sfeatures_retval_bits { + ETHTOOL_F_UNSUPPORTED__BIT, + ETHTOOL_F_WISH__BIT, + ETHTOOL_F_COMPAT__BIT, +}; + +#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) +#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) +#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) + +#define MAX_NUM_QUEUE 4096 + +/** + * struct ethtool_per_queue_op - apply sub command to the queues in mask. + * @cmd: ETHTOOL_PERQUEUE + * @sub_command: the sub command which apply to each queues + * @queue_mask: Bitmap of the queues which sub command apply to + * @data: A complete command structure following for each of the queues addressed + */ +struct ethtool_per_queue_op { + __u32 cmd; + __u32 sub_command; + __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; + char data[]; +}; + +/** + * struct ethtool_fecparam - Ethernet Forward Error Correction parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on the port, single bit set, GET only. + * @fec: Bitmask of configured FEC modes. + * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. + * + * Note that @reserved was never validated on input and ethtool user space + * left it uninitialized when calling SET. 
Hence going forward it can only be + * used to return a value to userspace with GET. + * + * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. + * FEC settings are configured by link autonegotiation whenever it's enabled. + * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. + * + * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. + * It is recommended that drivers only accept a single bit set in @fec. + * When multiple bits are set in @fec drivers may pick mode in an implementation + * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other + * FEC modes, because it's unclear whether in this case other modes constrain + * AUTO or are independent choices. + * Drivers must reject SET requests if they support none of the requested modes. + * + * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead + * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. + * + * See enum ethtool_fec_config_bits for definition of valid bits for both + * @fec and @active_fec. + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually + * based link mode and SFP parameters read from module's + * EEPROM. This bit does _not_ mean autonegotiation. 
+ * @ETHTOOL_FEC_OFF_BIT: No FEC Mode + * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, + ETHTOOL_FEC_LLRS_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) +#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) + +/* CMDs currently supported */ +#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. + * Please use ETHTOOL_GLINKSETTINGS + */ +#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. + * Please use ETHTOOL_SLINKSETTINGS + */ +#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */ +#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ +#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ +#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ +#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ +/* Get link status for host, i.e. whether the interface *and* the + * physical port (if there is one) are up (ethtool_value). */ +#define ETHTOOL_GLINK 0x0000000a +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ +#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ +#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ +#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ +#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. 
*/ +#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ +#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ +#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ +#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ +#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ +#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ +#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ +#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ + +#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ +#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ +#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ +#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ +#define ETHTOOL_GRXRINGS 0x0000002d /* 
Get RX rings available for LB */ +#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ +#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ +#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ +#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ +#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ +#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ +#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ +#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ +#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */ +#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ +#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ +#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ + +#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ +#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ +#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ +#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ +#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ +#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ +#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ +#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ +#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ +#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ + +#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ +#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a 
/* get PHY-specific statistics */ + +#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ + +#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ +#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ +#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ +#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ + +/* compatibility with older code */ +#define SPARC_ETH_GSET ETHTOOL_GSET +#define SPARC_ETH_SSET ETHTOOL_SSET + +/* Link mode bit indices */ +enum ethtool_link_mode_bit_indices { + ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, + ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, + ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, + ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, + ETHTOOL_LINK_MODE_Autoneg_BIT = 6, + ETHTOOL_LINK_MODE_TP_BIT = 7, + ETHTOOL_LINK_MODE_AUI_BIT = 8, + ETHTOOL_LINK_MODE_MII_BIT = 9, + ETHTOOL_LINK_MODE_FIBRE_BIT = 10, + ETHTOOL_LINK_MODE_BNC_BIT = 11, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, + ETHTOOL_LINK_MODE_Pause_BIT = 13, + ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, + ETHTOOL_LINK_MODE_Backplane_BIT = 16, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, 
+ ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. The only way to use indices > 31 is to + * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. + */ + + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, + ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, + ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, + ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, + 
ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, + ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, + ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, + ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, + ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + + /* must be last entry */ + __ETHTOOL_LINK_MODE_MASK_NBITS +}; + +#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ + (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) + +/* DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new SUPPORTED_* macro for bits > 31. 
+ */ +#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define SUPPORTED_40000baseSR4_Full 
__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new SUPPORTED_* macro for bits > 31, see + * notice above. + */ + +/* + * DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new ADVERTISED_* macro for bits > 31. + */ +#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new ADVERTISED_* macro for bits > 31, see + * notice above. + */ + +/* The following are all involved in forcing a particular link + * mode for the device for setting things. When getting the + * device's settings, these indicate the current mode and whether + * it was forced up into this mode or autonegotiated. + */ + +/* The forced speed, in units of 1Mb/s. All values 0 to INT_MAX are legal. + * Update drivers/net/phy/phy.c:phy_speed_to_str() and + * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values.
+ */ +#define SPEED_10 10 +#define SPEED_100 100 +#define SPEED_1000 1000 +#define SPEED_2500 2500 +#define SPEED_5000 5000 +#define SPEED_10000 10000 +#define SPEED_14000 14000 +#define SPEED_20000 20000 +#define SPEED_25000 25000 +#define SPEED_40000 40000 +#define SPEED_50000 50000 +#define SPEED_56000 56000 +#define SPEED_100000 100000 +#define SPEED_200000 200000 +#define SPEED_400000 400000 +#define SPEED_800000 800000 + +#define SPEED_UNKNOWN -1 + +static __inline__ int ethtool_validate_speed(__u32 speed) +{ + return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; /* accepts 0..INT_MAX, or SPEED_UNKNOWN (-1) converted to __u32 */ +} + +/* Duplex, half or full. */ +#define DUPLEX_HALF 0x00 +#define DUPLEX_FULL 0x01 +#define DUPLEX_UNKNOWN 0xff + +static __inline__ int ethtool_validate_duplex(__u8 duplex) +{ + switch (duplex) { + case DUPLEX_HALF: + case DUPLEX_FULL: + case DUPLEX_UNKNOWN: + return 1; /* one of the three DUPLEX_* values defined above */ + } + + return 0; /* any other value is rejected */ +} + +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + +/* These are used to throttle the rate of data on the phy interface when the + * native speed of the interface is higher than the link speed. These should + * not be used for phy interfaces which natively support multiple speeds (e.g. + * MII or SGMII). + */ +/* No rate matching performed. */ +#define RATE_MATCH_NONE 0 +/* The phy sends pause frames to throttle the MAC. */ +#define RATE_MATCH_PAUSE 1 +/* The phy asserts CRS to prevent the MAC from transmitting. */ +#define RATE_MATCH_CRS 2 +/* The MAC is programmed with a sufficiently-large IPG. */ +#define RATE_MATCH_OPEN_LOOP 3 + +/* Which connector port.
*/ +#define PORT_TP 0x00 +#define PORT_AUI 0x01 +#define PORT_MII 0x02 +#define PORT_FIBRE 0x03 +#define PORT_BNC 0x04 +#define PORT_DA 0x05 +#define PORT_NONE 0xef +#define PORT_OTHER 0xff + +/* Which transceiver to use. */ +#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ +#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ +#define XCVR_DUMMY1 0x02 +#define XCVR_DUMMY2 0x03 +#define XCVR_DUMMY3 0x04 + +/* Enable or disable autonegotiation. */ +#define AUTONEG_DISABLE 0x00 +#define AUTONEG_ENABLE 0x01 + +/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then + * the driver is required to renegotiate link + */ +#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ +#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ +#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ +#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ + +/* Wake-On-Lan options. */ +#define WAKE_PHY (1 << 0) +#define WAKE_UCAST (1 << 1) +#define WAKE_MCAST (1 << 2) +#define WAKE_BCAST (1 << 3) +#define WAKE_ARP (1 << 4) +#define WAKE_MAGIC (1 << 5) +#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ +#define WAKE_FILTER (1 << 7) + +#define WOL_MODE_COUNT 8 + +/* L2-L4 network traffic flow types */ +#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ +#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ +#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ +#define AH_ESP_V4_FLOW 0x04 /* hash only */ +#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ +#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ +#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ +#define AH_ESP_V6_FLOW 0x08 /* hash only */ +#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ +#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ +#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ +#define ESP_V6_FLOW 0x0c 
/* hash or spec (esp_ip6_spec; nfc only) */ +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define IP_USER_FLOW IPV4_USER_FLOW +#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ +#define IPV4_FLOW 0x10 /* hash only */ +#define IPV6_FLOW 0x11 /* hash only */ +#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ +/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ +#define FLOW_EXT 0x80000000 +#define FLOW_MAC_EXT 0x40000000 +/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ +#define FLOW_RSS 0x20000000 + +/* L3-L4 network traffic flow hash options */ +#define RXH_L2DA (1 << 1) +#define RXH_VLAN (1 << 2) +#define RXH_L3_PROTO (1 << 3) +#define RXH_IP_SRC (1 << 4) +#define RXH_IP_DST (1 << 5) +#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ +#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ +#define RXH_DISCARD (1 << 31) + +#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL +#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL + +/* Special RX classification rule insert location values */ +#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ +#define RX_CLS_LOC_ANY 0xffffffff +#define RX_CLS_LOC_FIRST 0xfffffffe +#define RX_CLS_LOC_LAST 0xfffffffd + +/* EEPROM Standards for plug in modules */ +#define ETH_MODULE_SFF_8079 0x1 +#define ETH_MODULE_SFF_8079_LEN 256 +#define ETH_MODULE_SFF_8472 0x2 +#define ETH_MODULE_SFF_8472_LEN 512 +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 + +#define ETH_MODULE_SFF_8636_MAX_LEN 640 +#define ETH_MODULE_SFF_8436_MAX_LEN 640 + +/* Reset flags */ +/* The reset() operation must clear the flags for the components which + * were actually reset. On successful return, the flags indicate the + * components which were not reset, either because they do not exist + * in the hardware or because they cannot be reset independently. 
The + * driver must never reset any components that were not requested. + */ +enum ethtool_reset_flags { + /* These flags represent components dedicated to the interface + * the command is addressed to. Shift any flag left by + * ETH_RESET_SHARED_SHIFT to reset a shared component of the + * same type. + */ + ETH_RESET_MGMT = 1 << 0, /* Management processor */ + ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ + ETH_RESET_DMA = 1 << 2, /* DMA engine */ + ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ + ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ + ETH_RESET_MAC = 1 << 5, /* Media access controller */ + ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ + ETH_RESET_RAM = 1 << 7, /* RAM shared between + * multiple components */ + ETH_RESET_AP = 1 << 8, /* Application processor */ + + ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to + * this interface */ + ETH_RESET_ALL = 0xffffffff, /* All components used by this + * interface, even if shared */ +}; +#define ETH_RESET_SHARED_SHIFT 16 + + +/** + * struct ethtool_link_settings - link control and status + * + * IMPORTANT, Backward compatibility notice: When implementing new + * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and + * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link + * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS + * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS in + * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use + * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link + * settings; do not use %ETHTOOL_SLINKSETTINGS if + * %ETHTOOL_GLINKSETTINGS failed: stick to + * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. + * + * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS + * @speed: Link speed (Mbps) + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable.
For clause 45 PHYs this is the PRTAD. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted-pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @link_mode_masks_nwords: Number of 32-bit words for each of the + * supported, advertising, lp_advertising link mode bitmaps. For + * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user + * (>= 0); on return, if handshake in progress, negative if + * request size unsupported by kernel: absolute value indicates + * kernel expected size and all the other fields but cmd + * are 0; otherwise (handshake completed), strictly positive + * to indicate size used by kernel and cmd field stays + * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For + * %ETHTOOL_SLINKSETTINGS: must be valid on entry, i.e. a positive + * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise + * refused. For drivers: ignore this field (use kernel's + * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will + * be overwritten by kernel. + * @supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only.
+ * @advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * @lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. + * @master_slave_cfg: Master/slave port mode. + * @master_slave_state: Master/slave port state. + * @rate_matching: Rate adaptation performed by the PHY + * @reserved: Reserved for future use; see the note on reserved space. + * @link_mode_masks: Variable length bitmaps. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt + * are not available in %ethtool_link_settings. These fields will be + * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will + * fail if any of them is set to non-zero value. 
+ * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GLINKSETTINGS to get the current values before making specific + * changes and then applying them with %ETHTOOL_SLINKSETTINGS. + * + * Drivers that implement %get_link_ksettings and/or + * %set_link_ksettings should ignore the @cmd + * and @link_mode_masks_nwords fields (any change to them overwritten + * by kernel), and rely only on kernel's internal + * %__ETHTOOL_LINK_MODE_MASK_NBITS and + * %ethtool_link_mode_mask_t. Drivers that implement + * %set_link_ksettings() should validate all fields other than @cmd + * and @link_mode_masks_nwords that are not described as read-only or + * deprecated, and must ignore all fields described as read-only. + */ +struct ethtool_link_settings { + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u8 transceiver; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 rate_matching; + __u32 reserved[7]; + __u32 link_mode_masks[]; + /* layout of link_mode_masks fields: + * __u32 map_supported[link_mode_masks_nwords]; + * __u32 map_advertising[link_mode_masks_nwords]; + * __u32 map_lp_advertising[link_mode_masks_nwords]; + */ +}; +#endif /* _LINUX_ETHTOOL_H */ diff --git a/src/shared/local-addresses.c b/src/shared/local-addresses.c new file mode 100644 index 0000000..a1577de --- /dev/null +++ b/src/shared/local-addresses.c @@ -0,0 +1,506 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "local-addresses.h" +#include "macro.h" +#include "netlink-util.h" +#include "sort-util.h" + +static int address_compare(const struct local_address *a, const struct local_address *b) { + int r; + + /* Order IPv4 before IPv6, then lowest scope, lowest metric, lowest
interface index first */ + + if (a->family == AF_INET && b->family == AF_INET6) + return -1; + if (a->family == AF_INET6 && b->family == AF_INET) + return 1; + + r = CMP(a->scope, b->scope); + if (r != 0) + return r; + + r = CMP(a->metric, b->metric); + if (r != 0) + return r; + + r = CMP(a->ifindex, b->ifindex); + if (r != 0) + return r; + + return memcmp(&a->address, &b->address, FAMILY_ADDRESS_SIZE(a->family)); +} + +static void suppress_duplicates(struct local_address *list, size_t *n_list) { + size_t old_size, new_size; + + /* Removes duplicate entries, assumes the list of addresses is already sorted. Updates in-place. */ + + if (*n_list < 2) /* list with less than two entries can't have duplicates */ + return; + + old_size = *n_list; + new_size = 1; + + for (size_t i = 1; i < old_size; i++) { + + if (address_compare(list + i, list + new_size - 1) == 0) + continue; + + list[new_size++] = list[i]; + } + + *n_list = new_size; +} + +int local_addresses( + sd_netlink *context, + int ifindex, + int af, + struct local_address **ret) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_free_ struct local_address *list = NULL; + size_t n_list = 0; + int r; + + if (context) + rtnl = sd_netlink_ref(context); + else { + r = sd_netlink_open(&rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_addr(rtnl, &req, RTM_GETADDR, ifindex, af); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, req, 0, &reply); + if (r < 0) + return r; + + for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) { + struct local_address *a; + unsigned char flags; + uint16_t type; + int ifi, family; + + r = sd_netlink_message_get_errno(m); + if (r < 0) + return r; + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) + return r; + if (type != RTM_NEWADDR) + continue; + + r = 
sd_rtnl_message_addr_get_ifindex(m, &ifi); + if (r < 0) + return r; + if (ifindex > 0 && ifi != ifindex) + continue; + + r = sd_rtnl_message_addr_get_family(m, &family); + if (r < 0) + return r; + if (af != AF_UNSPEC && af != family) + continue; + + r = sd_rtnl_message_addr_get_flags(m, &flags); + if (r < 0) + return r; + if (flags & IFA_F_DEPRECATED) + continue; + + if (!GREEDY_REALLOC0(list, n_list+1)) + return -ENOMEM; + + a = list + n_list; + + r = sd_rtnl_message_addr_get_scope(m, &a->scope); + if (r < 0) + return r; + + if (ifindex == 0 && IN_SET(a->scope, RT_SCOPE_HOST, RT_SCOPE_NOWHERE)) + continue; + + switch (family) { + + case AF_INET: + r = sd_netlink_message_read_in_addr(m, IFA_LOCAL, &a->address.in); + if (r < 0) { + r = sd_netlink_message_read_in_addr(m, IFA_ADDRESS, &a->address.in); + if (r < 0) + continue; + } + break; + + case AF_INET6: + r = sd_netlink_message_read_in6_addr(m, IFA_LOCAL, &a->address.in6); + if (r < 0) { + r = sd_netlink_message_read_in6_addr(m, IFA_ADDRESS, &a->address.in6); + if (r < 0) + continue; + } + break; + + default: + continue; + } + + a->ifindex = ifi; + a->family = family; + + n_list++; + }; + + if (ret) { + typesafe_qsort(list, n_list, address_compare); + suppress_duplicates(list, &n_list); + *ret = TAKE_PTR(list); + } + + return (int) n_list; +} + +static int add_local_gateway( + struct local_address **list, + size_t *n_list, + int af, + int ifindex, + uint32_t metric, + const RouteVia *via) { + + assert(list); + assert(n_list); + assert(via); + + if (af != AF_UNSPEC && af != via->family) + return 0; + + if (!GREEDY_REALLOC(*list, *n_list + 1)) + return -ENOMEM; + + (*list)[(*n_list)++] = (struct local_address) { + .ifindex = ifindex, + .metric = metric, + .family = via->family, + .address = via->address, + }; + + return 0; +} + +int local_gateways( + sd_netlink *context, + int ifindex, + int af, + struct local_address **ret) { + + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + 
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_free_ struct local_address *list = NULL; + size_t n_list = 0; + int r; + + if (context) + rtnl = sd_netlink_ref(context); + else { + r = sd_netlink_open(&rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_route(rtnl, &req, RTM_GETROUTE, af, RTPROT_UNSPEC); + if (r < 0) + return r; + + r = sd_rtnl_message_route_set_type(req, RTN_UNICAST); + if (r < 0) + return r; + + r = sd_rtnl_message_route_set_table(req, RT_TABLE_MAIN); + if (r < 0) + return r; + + r = sd_netlink_message_set_request_dump(req, true); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, req, 0, &reply); + if (r < 0) + return r; + + for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) { + _cleanup_ordered_set_free_free_ OrderedSet *multipath_routes = NULL; + _cleanup_free_ void *rta_multipath = NULL; + union in_addr_union gateway; + uint16_t type; + unsigned char dst_len, src_len, table; + uint32_t ifi = 0, metric = 0; + size_t rta_len; + int family; + RouteVia via; + + r = sd_netlink_message_get_errno(m); + if (r < 0) + return r; + + r = sd_netlink_message_get_type(m, &type); + if (r < 0) + return r; + if (type != RTM_NEWROUTE) + continue; + + /* We only care for default routes */ + r = sd_rtnl_message_route_get_dst_prefixlen(m, &dst_len); + if (r < 0) + return r; + if (dst_len != 0) + continue; + + r = sd_rtnl_message_route_get_src_prefixlen(m, &src_len); + if (r < 0) + return r; + if (src_len != 0) + continue; + + r = sd_rtnl_message_route_get_table(m, &table); + if (r < 0) + return r; + if (table != RT_TABLE_MAIN) + continue; + + r = sd_netlink_message_read_u32(m, RTA_PRIORITY, &metric); + if (r < 0 && r != -ENODATA) + return r; + + r = sd_rtnl_message_route_get_family(m, &family); + if (r < 0) + return r; + if (!IN_SET(family, AF_INET, AF_INET6)) + continue; + + r = sd_netlink_message_read_u32(m, RTA_OIF, &ifi); + if (r < 0 && r != -ENODATA) + return r; + if (r >= 0) { + if (ifi <= 0) + return 
-EINVAL; + if (ifindex > 0 && (int) ifi != ifindex) + continue; + + r = netlink_message_read_in_addr_union(m, RTA_GATEWAY, family, &gateway); + if (r < 0 && r != -ENODATA) + return r; + if (r >= 0) { + via.family = family; + via.address = gateway; + r = add_local_gateway(&list, &n_list, af, ifi, metric, &via); + if (r < 0) + return r; + + continue; + } + + if (family != AF_INET) + continue; + + r = sd_netlink_message_read(m, RTA_VIA, sizeof(via), &via); + if (r < 0 && r != -ENODATA) + return r; + if (r >= 0) { + r = add_local_gateway(&list, &n_list, af, ifi, metric, &via); + if (r < 0) + return r; + + continue; + } + } + + r = sd_netlink_message_read_data(m, RTA_MULTIPATH, &rta_len, &rta_multipath); + if (r < 0 && r != -ENODATA) + return r; + if (r >= 0) { + MultipathRoute *mr; + + r = rtattr_read_nexthop(rta_multipath, rta_len, family, &multipath_routes); + if (r < 0) + return r; + + ORDERED_SET_FOREACH(mr, multipath_routes) { + if (ifindex > 0 && mr->ifindex != ifindex) + continue; + + r = add_local_gateway(&list, &n_list, af, mr->ifindex, metric, &mr->gateway); /* use the nexthop's own interface index; the route-level ifi stays 0 when RTA_OIF is absent */ + if (r < 0) + return r; + } + } + } + + if (ret) { + typesafe_qsort(list, n_list, address_compare); + suppress_duplicates(list, &n_list); + *ret = TAKE_PTR(list); + } + + return (int) n_list; +} + +int local_outbounds( + sd_netlink *context, + int ifindex, + int af, + struct local_address **ret) { + + _cleanup_free_ struct local_address *list = NULL, *gateways = NULL; + size_t n_list = 0; + int r, n_gateways; + + /* Determines our default outbound addresses, i.e. the "primary" local addresses we use to talk to IP + * addresses behind the default routes. This is still an address of the local host (i.e. this doesn't + * resolve NAT or so), but it's the set of addresses the local IP stack most likely uses to talk to + * other hosts. + * + * This works by connect()ing a SOCK_DGRAM socket to the local gateways, and then reading the IP + * address off the socket that was chosen for the routing decision.
*/ + + n_gateways = local_gateways(context, ifindex, af, &gateways); + if (n_gateways < 0) + return n_gateways; + if (n_gateways == 0) { + /* No gateways? Then we have no outbound addresses either. */ + if (ret) + *ret = NULL; + + return 0; + } + + for (int i = 0; i < n_gateways; i++) { + _cleanup_close_ int fd = -EBADF; + union sockaddr_union sa; + socklen_t salen; + + fd = socket(gateways[i].family, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + switch (gateways[i].family) { + + case AF_INET: + sa.in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr = gateways[i].address.in, + .sin_port = htobe16(53), /* doesn't really matter which port we pick — + * we just care about the routing decision */ + }; + + break; + + case AF_INET6: + sa.in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = gateways[i].address.in6, + .sin6_port = htobe16(53), + .sin6_scope_id = gateways[i].ifindex, + }; + + break; + + default: + assert_not_reached(); + } + + /* So ideally we'd just use IP_UNICAST_IF here to pass the ifindex info to the kernel before + * connect()ing, so that it influences the routing decision. However, on current kernels + * IP_UNICAST_IF doesn't actually influence the routing decision for UDP — which I think + * should probably just be considered a bug. Once that bug is fixed this is the best API to + * use, since it is the most lightweight. */ + r = socket_set_unicast_if(fd, gateways[i].family, gateways[i].ifindex); + if (r < 0) + log_debug_errno(r, "Failed to set unicast interface index %i, ignoring: %m", gateways[i].ifindex); + + /* We'll also use SO_BINDTOINDEX. This requires CAP_NET_RAW on old kernels, hence there's a + * good chance this fails. Since 5.7 this restriction was dropped and the first + * SO_BINDTOINDEX on a socket may be done without privileges. This one has the benefit of + * really influencing the routing decision, i.e.
this one definitely works for us — as long + * as we have the privileges for it. */ + r = socket_bind_to_ifindex(fd, gateways[i].ifindex); + if (r < 0) + log_debug_errno(r, "Failed to bind socket to interface %i, ignoring: %m", gateways[i].ifindex); + + /* Let's now connect() to the UDP socket, forcing the kernel to make a routing decision and + * auto-bind the socket. We ignore failures on this, since that failure might happen for a + * multitude of reasons (policy/firewall issues, who knows?) and some of them might be + * *after* the routing decision and the auto-binding already took place. If so we can still + * make use of the binding and return it. Hence, let's not unnecessarily fail early here: we + * can still easily detect if the auto-binding worked or not, by comparing the bound IP + * address with zero — which we do below. */ + if (connect(fd, &sa.sa, SOCKADDR_LEN(sa)) < 0) + log_debug_errno(errno, "Failed to connect SOCK_DGRAM socket to gateway, ignoring: %m"); + + /* Let's now read the socket address of the socket. A routing decision should have been + * made. Let's verify that and use the data. */ + salen = SOCKADDR_LEN(sa); + if (getsockname(fd, &sa.sa, &salen) < 0) + return -errno; + assert(sa.sa.sa_family == gateways[i].family); + assert(salen == SOCKADDR_LEN(sa)); + + switch (gateways[i].family) { + + case AF_INET: + if (in4_addr_is_null(&sa.in.sin_addr)) /* Auto-binding didn't work. 
:-( */ + continue; + + if (!GREEDY_REALLOC(list, n_list+1)) + return -ENOMEM; + + list[n_list++] = (struct local_address) { + .family = gateways[i].family, + .ifindex = gateways[i].ifindex, + .address.in = sa.in.sin_addr, + }; + + break; + + case AF_INET6: + if (in6_addr_is_null(&sa.in6.sin6_addr)) + continue; + + if (!GREEDY_REALLOC(list, n_list+1)) + return -ENOMEM; + + list[n_list++] = (struct local_address) { + .family = gateways[i].family, + .ifindex = gateways[i].ifindex, + .address.in6 = sa.in6.sin6_addr, + }; + break; + + default: + assert_not_reached(); + } + } + + if (ret) { + typesafe_qsort(list, n_list, address_compare); + suppress_duplicates(list, &n_list); + *ret = TAKE_PTR(list); + } + + return (int) n_list; +} diff --git a/src/shared/local-addresses.h b/src/shared/local-addresses.h new file mode 100644 index 0000000..38a17d2 --- /dev/null +++ b/src/shared/local-addresses.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-netlink.h" + +#include "in-addr-util.h" + +struct local_address { + int family, ifindex; + unsigned char scope; + uint32_t metric; + union in_addr_union address; +}; + +int local_addresses(sd_netlink *rtnl, int ifindex, int af, struct local_address **ret); + +int local_gateways(sd_netlink *rtnl, int ifindex, int af, struct local_address **ret); + +int local_outbounds(sd_netlink *rtnl, int ifindex, int af, struct local_address **ret); diff --git a/src/shared/locale-setup.c b/src/shared/locale-setup.c new file mode 100644 index 0000000..4e7f486 --- /dev/null +++ b/src/shared/locale-setup.c @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "env-file-label.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "locale-setup.h" +#include "proc-cmdline.h" +#include "stat-util.h" +#include "strv.h" + +void locale_context_clear(LocaleContext *c) { + assert(c); + + c->st = (struct stat) {}; + 
+ for (LocaleVariable i = 0; i < _VARIABLE_LC_MAX; i++) + c->locale[i] = mfree(c->locale[i]); +} + +static int locale_context_load_proc(LocaleContext *c, LocaleLoadFlag flag) { + int r; + + assert(c); + + if (!FLAGS_SET(flag, LOCALE_LOAD_PROC_CMDLINE)) + return 0; + + locale_context_clear(c); + + r = proc_cmdline_get_key_many(PROC_CMDLINE_STRIP_RD_PREFIX, + "locale.LANG", &c->locale[VARIABLE_LANG], + "locale.LANGUAGE", &c->locale[VARIABLE_LANGUAGE], + "locale.LC_CTYPE", &c->locale[VARIABLE_LC_CTYPE], + "locale.LC_NUMERIC", &c->locale[VARIABLE_LC_NUMERIC], + "locale.LC_TIME", &c->locale[VARIABLE_LC_TIME], + "locale.LC_COLLATE", &c->locale[VARIABLE_LC_COLLATE], + "locale.LC_MONETARY", &c->locale[VARIABLE_LC_MONETARY], + "locale.LC_MESSAGES", &c->locale[VARIABLE_LC_MESSAGES], + "locale.LC_PAPER", &c->locale[VARIABLE_LC_PAPER], + "locale.LC_NAME", &c->locale[VARIABLE_LC_NAME], + "locale.LC_ADDRESS", &c->locale[VARIABLE_LC_ADDRESS], + "locale.LC_TELEPHONE", &c->locale[VARIABLE_LC_TELEPHONE], + "locale.LC_MEASUREMENT", &c->locale[VARIABLE_LC_MEASUREMENT], + "locale.LC_IDENTIFICATION", &c->locale[VARIABLE_LC_IDENTIFICATION]); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_debug_errno(r, "Failed to read /proc/cmdline: %m"); + return r; +} + +static int locale_context_load_conf(LocaleContext *c, LocaleLoadFlag flag) { + _cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + assert(c); + + if (!FLAGS_SET(flag, LOCALE_LOAD_LOCALE_CONF)) + return 0; + + fd = RET_NERRNO(open("/etc/locale.conf", O_CLOEXEC | O_PATH)); + if (fd == -ENOENT) + return 0; + if (fd < 0) + return log_debug_errno(errno, "Failed to open /etc/locale.conf: %m"); + + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "Failed to stat /etc/locale.conf: %m"); + + /* If the file is not changed, then we do not need to re-read the file. 
*/ + if (stat_inode_unmodified(&c->st, &st)) + return 0; + + c->st = st; + locale_context_clear(c); + + r = parse_env_file_fd(fd, "/etc/locale.conf", + "LANG", &c->locale[VARIABLE_LANG], + "LANGUAGE", &c->locale[VARIABLE_LANGUAGE], + "LC_CTYPE", &c->locale[VARIABLE_LC_CTYPE], + "LC_NUMERIC", &c->locale[VARIABLE_LC_NUMERIC], + "LC_TIME", &c->locale[VARIABLE_LC_TIME], + "LC_COLLATE", &c->locale[VARIABLE_LC_COLLATE], + "LC_MONETARY", &c->locale[VARIABLE_LC_MONETARY], + "LC_MESSAGES", &c->locale[VARIABLE_LC_MESSAGES], + "LC_PAPER", &c->locale[VARIABLE_LC_PAPER], + "LC_NAME", &c->locale[VARIABLE_LC_NAME], + "LC_ADDRESS", &c->locale[VARIABLE_LC_ADDRESS], + "LC_TELEPHONE", &c->locale[VARIABLE_LC_TELEPHONE], + "LC_MEASUREMENT", &c->locale[VARIABLE_LC_MEASUREMENT], + "LC_IDENTIFICATION", &c->locale[VARIABLE_LC_IDENTIFICATION]); + if (r < 0) + return log_debug_errno(r, "Failed to read /etc/locale.conf: %m"); + + return 1; /* loaded */ +} + +static int locale_context_load_env(LocaleContext *c, LocaleLoadFlag flag) { + int r; + + assert(c); + + if (!FLAGS_SET(flag, LOCALE_LOAD_ENVIRONMENT)) + return 0; + + locale_context_clear(c); + + /* Fill in what we got passed from systemd. */ + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) { + const char *name = ASSERT_PTR(locale_variable_to_string(p)); + + r = free_and_strdup(&c->locale[p], empty_to_null(getenv(name))); + if (r < 0) + return log_oom_debug(); + } + + return 1; /* loaded */ +} + +int locale_context_load(LocaleContext *c, LocaleLoadFlag flag) { + int r; + + assert(c); + + r = locale_context_load_proc(c, flag); + if (r > 0) + goto finalize; + + r = locale_context_load_conf(c, flag); + if (r != 0) + goto finalize; + + r = locale_context_load_env(c, flag); + +finalize: + if (r <= 0) { + /* Nothing loaded, or error. 
*/ + locale_context_clear(c); + return r; + } + + if (FLAGS_SET(flag, LOCALE_LOAD_SIMPLIFY)) + locale_variables_simplify(c->locale); + + return 0; +} + +int locale_context_build_env(const LocaleContext *c, char ***ret_set, char ***ret_unset) { + _cleanup_strv_free_ char **set = NULL, **unset = NULL; + int r; + + assert(c); + + if (!ret_set && !ret_unset) + return 0; + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) { + const char *name = ASSERT_PTR(locale_variable_to_string(p)); + + if (isempty(c->locale[p])) { + if (!ret_unset) + continue; + r = strv_extend(&unset, name); + } else { + if (!ret_set) + continue; + r = strv_env_assign(&set, name, c->locale[p]); + } + if (r < 0) + return r; + } + + if (ret_set) + *ret_set = TAKE_PTR(set); + if (ret_unset) + *ret_unset = TAKE_PTR(unset); + return 0; +} + +int locale_context_save(LocaleContext *c, char ***ret_set, char ***ret_unset) { + _cleanup_strv_free_ char **set = NULL, **unset = NULL; + int r; + + assert(c); + + /* Set values will be returned as strv in *ret_set on success. */ + + r = locale_context_build_env(c, &set, ret_unset ? &unset : NULL); + if (r < 0) + return r; + + if (strv_isempty(set)) { + if (unlink("/etc/locale.conf") < 0 && errno != ENOENT) /* an already missing file is fine; still reset the cached stat data and the return parameters below */
+ return -errno; + + c->st = (struct stat) {}; + + if (ret_set) + *ret_set = NULL; + if (ret_unset) + *ret_unset = NULL; + return 0; + } + + r = write_env_file_label(AT_FDCWD, "/etc/locale.conf", NULL, set); + if (r < 0) + return r; + + if (stat("/etc/locale.conf", &c->st) < 0) + return -errno; + + if (ret_set) + *ret_set = TAKE_PTR(set); + if (ret_unset) + *ret_unset = TAKE_PTR(unset); + return 0; +} + +int locale_context_merge(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]) { + assert(c); + assert(l); + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) + if (!isempty(c->locale[p]) && isempty(l[p])) { + /* l[p] may be an allocated empty string: release it instead of overwriting the pointer. */ + if (free_and_strdup(&l[p], c->locale[p]) < 0) + return -ENOMEM; + } + + return 0; +} + +void locale_context_take(LocaleContext *c, char *l[_VARIABLE_LC_MAX]) { + assert(c); + assert(l); + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) + free_and_replace(c->locale[p], l[p]); +} + +bool locale_context_equal(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]) { + assert(c); + assert(l); + + for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) + if (!streq_ptr(c->locale[p], l[p])) + return false; + + return true; +} + +int locale_setup(char ***environment) { + _cleanup_(locale_context_clear) LocaleContext c = {}; + _cleanup_strv_free_ char **add = NULL; + int r; + + assert(environment); + + r = locale_context_load(&c, LOCALE_LOAD_PROC_CMDLINE | LOCALE_LOAD_LOCALE_CONF); + if (r < 0) + return r; + + r = locale_context_build_env(&c, &add, NULL); + if (r < 0) + return r; + + if (strv_isempty(add)) { + /* If no locale is configured then default to compile-time default.
*/ + + add = strv_new("LANG=" SYSTEMD_DEFAULT_LOCALE); + if (!add) + return -ENOMEM; + } + + if (strv_isempty(*environment)) + strv_free_and_replace(*environment, add); + else { + char **merged; + + merged = strv_env_merge(*environment, add); + if (!merged) + return -ENOMEM; + + strv_free_and_replace(*environment, merged); + } + + return 0; +} diff --git a/src/shared/locale-setup.h b/src/shared/locale-setup.h new file mode 100644 index 0000000..537acc7 --- /dev/null +++ b/src/shared/locale-setup.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "locale-util.h" + +typedef struct LocaleContext { + struct stat st; + char *locale[_VARIABLE_LC_MAX]; +} LocaleContext; + +typedef enum LocaleLoadFlag { + LOCALE_LOAD_PROC_CMDLINE = 1 << 0, + LOCALE_LOAD_LOCALE_CONF = 1 << 1, + LOCALE_LOAD_ENVIRONMENT = 1 << 2, + LOCALE_LOAD_SIMPLIFY = 1 << 3, +} LocaleLoadFlag; + +void locale_context_clear(LocaleContext *c); +int locale_context_load(LocaleContext *c, LocaleLoadFlag flag); +int locale_context_build_env(const LocaleContext *c, char ***ret_set, char ***ret_unset); +int locale_context_save(LocaleContext *c, char ***ret_set, char ***ret_unset); + +int locale_context_merge(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]); +void locale_context_take(LocaleContext *c, char *l[_VARIABLE_LC_MAX]); +bool locale_context_equal(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]); + +int locale_setup(char ***environment); diff --git a/src/shared/log-link.h b/src/shared/log-link.h new file mode 100644 index 0000000..5f2b176 --- /dev/null +++ b/src/shared/log-link.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "log.h" + +#define log_interface_full_errno_zerook(ifname, level, error, ...) \ + ({ \ + const char *_ifname = (ifname); \ + _ifname ? 
log_object_internal(level, error, PROJECT_FILE, __LINE__, __func__, "INTERFACE=", _ifname, NULL, NULL, ##__VA_ARGS__) : \ + log_internal(level, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \ + }) + +#define log_interface_full_errno(ifname, level, error, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_interface_full_errno_zerook(ifname, level, _error, __VA_ARGS__); \ + }) + +/* + * The following macros append INTERFACE= to the message. + * The macros require a struct named 'Link' which contains 'char *ifname': + * + * typedef struct Link { + * char *ifname; + * } Link; + * + * See, network/networkd-link.h for example. + */ + +#define log_link_full_errno_zerook(link, level, error, ...) \ + ({ \ + const Link *_l = (link); \ + log_interface_full_errno_zerook(_l ? _l->ifname : NULL, level, error, __VA_ARGS__); \ + }) + +#define log_link_full_errno(link, level, error, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_link_full_errno_zerook(link, level, _error, __VA_ARGS__); \ + }) + +#define log_link_full(link, level, ...) (void) log_link_full_errno_zerook(link, level, 0, __VA_ARGS__) + +#define log_link_debug(link, ...) log_link_full(link, LOG_DEBUG, __VA_ARGS__) +#define log_link_info(link, ...) log_link_full(link, LOG_INFO, __VA_ARGS__) +#define log_link_notice(link, ...) log_link_full(link, LOG_NOTICE, __VA_ARGS__) +#define log_link_warning(link, ...) log_link_full(link, LOG_WARNING, __VA_ARGS__) +#define log_link_error(link, ...) log_link_full(link, LOG_ERR, __VA_ARGS__) + +#define log_link_debug_errno(link, error, ...) log_link_full_errno(link, LOG_DEBUG, error, __VA_ARGS__) +#define log_link_info_errno(link, error, ...) log_link_full_errno(link, LOG_INFO, error, __VA_ARGS__) +#define log_link_notice_errno(link, error, ...) log_link_full_errno(link, LOG_NOTICE, error, __VA_ARGS__) +#define log_link_warning_errno(link, error, ...) 
log_link_full_errno(link, LOG_WARNING, error, __VA_ARGS__) +#define log_link_error_errno(link, error, ...) log_link_full_errno(link, LOG_ERR, error, __VA_ARGS__) + +#define LOG_LINK_MESSAGE(link, fmt, ...) "MESSAGE=%s: " fmt, (link)->ifname, ##__VA_ARGS__ +#define LOG_LINK_INTERFACE(link) "INTERFACE=%s", (link)->ifname diff --git a/src/shared/logs-show.c b/src/shared/logs-show.c new file mode 100644 index 0000000..a5d0400 --- /dev/null +++ b/src/shared/logs-show.c @@ -0,0 +1,2102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-id128.h" +#include "sd-journal.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "journal-internal.h" +#include "journal-util.h" +#include "json.h" +#include "locale-util.h" +#include "log.h" +#include "logs-show.h" +#include "macro.h" +#include "namespace-util.h" +#include "output-mode.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "sparse-endian.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "time-util.h" +#include "utf8.h" +#include "web-util.h" + +/* up to three lines (each up to 100 characters) or 300 characters, whichever is less */ +#define PRINT_LINE_THRESHOLD 3 +#define PRINT_CHAR_THRESHOLD 300 + +#define JSON_THRESHOLD 4096U + +static int print_catalog(FILE *f, sd_journal *j) { + _cleanup_free_ char *t = NULL, *z = NULL; + const char *newline, *prefix; + int r; + + assert(j); + + r = sd_journal_get_catalog(j, &t); + if (r == -ENOENT) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to find catalog entry: %m"); + + if (is_locale_utf8()) + prefix = strjoina(special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), 
special_glyph(SPECIAL_GLYPH_LIGHT_SHADE)); + else + prefix = "--"; + + newline = strjoina(ansi_normal(), "\n", ansi_grey(), prefix, ansi_normal(), " ", ansi_green()); + + z = strreplace(strstrip(t), "\n", newline); + if (!z) + return log_oom(); + + fprintf(f, "%s%s %s%s", ansi_grey(), prefix, ansi_normal(), ansi_green()); + fputs(z, f); + fprintf(f, "%s\n", ansi_normal()); + + return 1; +} + +static int url_from_catalog(sd_journal *j, char **ret) { + _cleanup_free_ char *t = NULL, *url = NULL; + const char *weblink; + int r; + + assert(j); + assert(ret); + + r = sd_journal_get_catalog(j, &t); + if (r == -ENOENT) + goto notfound; + if (r < 0) + return log_error_errno(r, "Failed to find catalog entry: %m"); + + weblink = find_line_startswith(t, "Documentation:"); + if (!weblink) + goto notfound; + + /* Skip whitespace to value */ + weblink += strspn(weblink, " \t"); + + /* Cut out till next whitespace/newline */ + url = strdupcspn(weblink, WHITESPACE); + if (!url) + return log_oom(); + + if (!documentation_url_is_valid(url)) + goto notfound; + + *ret = TAKE_PTR(url); + return 1; + +notfound: + *ret = NULL; + return 0; +} + +static int parse_field( + const void *data, + size_t length, + const char *field, + size_t field_len, + char **target, + size_t *target_len) { + + size_t nl; + char *buf; + + assert(data); + assert(field); + assert(target); + + if (length < field_len) + return 0; + + if (memcmp(data, field, field_len)) + return 0; + + nl = length - field_len; + + buf = newdup_suffix0(char, (const char*) data + field_len, nl); + if (!buf) + return log_oom(); + + free_and_replace(*target, buf); + + if (target_len) + *target_len = nl; + + return 1; +} + +typedef struct ParseFieldVec { + const char *field; + size_t field_len; + char **target; + size_t *target_len; +} ParseFieldVec; + +#define PARSE_FIELD_VEC_ENTRY(_field, _target, _target_len) { \ + .field = _field, \ + .field_len = strlen(_field), \ + .target = _target, \ + .target_len = _target_len \ + } + +static 
int parse_fieldv( + const void *data, + size_t length, + const ParseFieldVec *fields, + size_t n_fields) { + + int r; + + for (size_t i = 0; i < n_fields; i++) { + const ParseFieldVec *f = &fields[i]; + + r = parse_field(data, length, f->field, f->field_len, f->target, f->target_len); + if (r < 0) + return r; + if (r > 0) + break; + } + + return 0; +} + +static int field_set_test(const Set *fields, const char *name, size_t n) { + char *s; + + if (!fields) + return 1; + + s = strndupa_safe(name, n); + return set_contains(fields, s); +} + +static bool shall_print(const char *p, size_t l, OutputFlags flags) { + assert(p); + + if (flags & OUTPUT_SHOW_ALL) + return true; + + if (l >= PRINT_CHAR_THRESHOLD) + return false; + + if (!utf8_is_printable(p, l)) + return false; + + return true; +} + +static bool print_multiline( + FILE *f, + unsigned prefix, + unsigned n_columns, + OutputFlags flags, + int priority, + bool audit, + const char* message, + size_t message_len, + size_t highlight[2]) { + + const char *color_on = "", *color_off = "", *highlight_on = ""; + const char *pos, *end; + bool ellipsized = false; + int line = 0; + + if (flags & OUTPUT_COLOR) { + get_log_colors(priority, &color_on, &color_off, &highlight_on); + + if (audit && strempty(color_on)) { + color_on = ANSI_BLUE; + color_off = ANSI_NORMAL; + } + } + + /* A special case: make sure that we print a newline when + the message is empty. */ + if (message_len == 0) + fputs("\n", f); + + for (pos = message; + pos < message + message_len; + pos = end + 1, line++) { + bool tail_line; + int len, indent = (line > 0) * prefix; + for (end = pos; end < message + message_len && *end != '\n'; end++) + ; + len = end - pos; + assert(len >= 0); + + /* We need to figure out when we are showing not-last line, *and* + * will skip subsequent lines. In that case, we will put the dots + * at the end of the line, instead of putting dots in the middle + * or not at all. 
+ */ + tail_line = + line + 1 == PRINT_LINE_THRESHOLD || + end + 1 >= message + PRINT_CHAR_THRESHOLD; + + if (flags & (OUTPUT_FULL_WIDTH | OUTPUT_SHOW_ALL) || + (prefix + len + 1 < n_columns && !tail_line)) { + if (highlight && + (size_t) (pos - message) <= highlight[0] && + highlight[0] < (size_t) len) { + + fprintf(f, "%*s%s%.*s", + indent, "", + color_on, (int) highlight[0], pos); + fprintf(f, "%s%.*s", + highlight_on, + (int) (MIN((size_t) len, highlight[1]) - highlight[0]), + pos + highlight[0]); + if ((size_t) len > highlight[1]) + fprintf(f, "%s%.*s", + color_on, + (int) (len - highlight[1]), + pos + highlight[1]); + fprintf(f, "%s\n", color_off); + + } else + fprintf(f, "%*s%s%.*s%s\n", + indent, "", + color_on, len, pos, color_off); + continue; + } + + /* Beyond this point, ellipsization will happen. */ + ellipsized = true; + + if (prefix < n_columns && n_columns - prefix >= 3) { + if (n_columns - prefix > (unsigned) len + 3) + fprintf(f, "%*s%s%.*s...%s\n", + indent, "", + color_on, len, pos, color_off); + else { + _cleanup_free_ char *e = NULL; + + e = ellipsize_mem(pos, len, n_columns - prefix, + tail_line ? 
100 : 90); + if (!e) + fprintf(f, "%*s%s%.*s%s\n", + indent, "", + color_on, len, pos, color_off); + else + fprintf(f, "%*s%s%s%s\n", + indent, "", + color_on, e, color_off); + } + } else + fputs("...\n", f); + + if (tail_line) + break; + } + + return ellipsized; +} + +/* Writes "[<secs>.<usecs>]" for the monotonic display timestamp to f; in OUTPUT_SHORT_DELTA mode also + * appends the delta to the previous entry (marked with "*" when it had to be derived from realtime + * timestamps). Returns the number of characters written, or -EINVAL if no valid monotonic timestamp is + * available. */ +static int output_timestamp_monotonic( + FILE *f, + OutputMode mode, + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const dual_timestamp *previous_display_ts, + const sd_id128_t *previous_boot_id) { + + int written_chars = 0; + + assert(f); + assert(display_ts); + assert(boot_id); + assert(previous_display_ts); + assert(previous_boot_id); + + if (!VALID_MONOTONIC(display_ts->monotonic)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No valid monotonic timestamp available"); + + written_chars += fprintf(f, "[%5"PRI_USEC".%06"PRI_USEC, display_ts->monotonic / USEC_PER_SEC, display_ts->monotonic % USEC_PER_SEC); + + if (mode == OUTPUT_SHORT_DELTA) { + uint64_t delta; + bool reliable_ts = true; + + if (VALID_MONOTONIC(previous_display_ts->monotonic) && sd_id128_equal(*boot_id, *previous_boot_id)) + delta = usec_sub_unsigned(display_ts->monotonic, previous_display_ts->monotonic); + else if (VALID_REALTIME(display_ts->realtime) && VALID_REALTIME(previous_display_ts->realtime)) { + delta = usec_sub_unsigned(display_ts->realtime, previous_display_ts->realtime); + reliable_ts = false; + } else { + written_chars += fprintf(f, "%16s", ""); + goto finish; + } + + written_chars += fprintf(f, " <%5"PRI_USEC".%06"PRI_USEC"%s>", delta / USEC_PER_SEC, delta % USEC_PER_SEC, reliable_ts ?
" " : "*"); + } + +finish: + written_chars += fprintf(f, "%s", "]"); + return written_chars; +} + +/* Formats the realtime display timestamp according to mode (and OUTPUT_UTC in flags) and writes it to f. + * Returns the number of characters written, or -EINVAL if the timestamp is invalid or cannot be + * formatted. */ +static int output_timestamp_realtime( + FILE *f, + sd_journal *j, + OutputMode mode, + OutputFlags flags, + const dual_timestamp *display_ts) { + + char buf[CONST_MAX(FORMAT_TIMESTAMP_MAX, 64U)]; + int r; + + assert(f); + assert(j); + assert(display_ts); + + if (!VALID_REALTIME(display_ts->realtime)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No valid realtime timestamp available"); + + if (IN_SET(mode, OUTPUT_SHORT_FULL, OUTPUT_WITH_UNIT)) { + const char *k; + + if (flags & OUTPUT_UTC) + k = format_timestamp_style(buf, sizeof(buf), display_ts->realtime, TIMESTAMP_UTC); + else + k = format_timestamp(buf, sizeof(buf), display_ts->realtime); + if (!k) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to format timestamp: %" PRIu64, display_ts->realtime); + + } else { + struct tm tm; + time_t t; + + t = (time_t) (display_ts->realtime / USEC_PER_SEC); + + switch (mode) { + + case OUTPUT_SHORT_UNIX: + xsprintf(buf, "%10"PRI_TIME".%06"PRIu64, t, display_ts->realtime % USEC_PER_SEC); + break; + + case OUTPUT_SHORT_ISO: + case OUTPUT_SHORT_ISO_PRECISE: { + size_t tail = strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", + localtime_or_gmtime_r(&t, &tm, flags & OUTPUT_UTC)); + if (tail == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to format ISO time"); + + /* No usec in strftime, need to append */ + if (mode == OUTPUT_SHORT_ISO_PRECISE) { + assert(ELEMENTSOF(buf) - tail >= 7); + snprintf(buf + tail, ELEMENTSOF(buf) - tail, ".%06"PRI_USEC, display_ts->realtime % USEC_PER_SEC); + tail += 7; + } + + /* Take the sign from the whole offset rather than from the hour part: for offsets between -00:59 and -00:01 the hour part is 0 and would otherwise be printed as "+00". */ + long off = labs(tm.tm_gmtoff) / 60; + int h = (int) (off / 60); + int m = (int) (off % 60); + snprintf(buf + tail, ELEMENTSOF(buf) - tail, "%c%02d:%02d", tm.tm_gmtoff < 0 ? '-' : '+', h, m); + break; + } + + case OUTPUT_SHORT: + case OUTPUT_SHORT_PRECISE: + + if (strftime(buf, sizeof(buf), "%b %d %H:%M:%S", + localtime_or_gmtime_r(&t, &tm, flags & OUTPUT_UTC)) <= 0) + return
log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to format syslog time"); + + if (mode == OUTPUT_SHORT_PRECISE) { + size_t k; + + assert(sizeof(buf) > strlen(buf)); + k = sizeof(buf) - strlen(buf); + + r = snprintf(buf + strlen(buf), k, ".%06"PRIu64, display_ts->realtime % USEC_PER_SEC); + if (r <= 0 || (size_t) r >= k) /* too long? */ + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to format precise time"); + } + break; + + default: + assert_not_reached(); + } + } + + fputs(buf, f); + return (int) strlen(buf); +} + +static int output_short( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + const Set *output_fields, + const size_t highlight[2], + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const dual_timestamp *previous_display_ts, + const sd_id128_t *previous_boot_id) { + + int r; + const void *data; + size_t length, n = 0; + _cleanup_free_ char *hostname = NULL, *identifier = NULL, *comm = NULL, *pid = NULL, *fake_pid = NULL, + *message = NULL, *priority = NULL, *transport = NULL, + *config_file = NULL, *unit = NULL, *user_unit = NULL, *documentation_url = NULL; + size_t hostname_len = 0, identifier_len = 0, comm_len = 0, pid_len = 0, fake_pid_len = 0, message_len = 0, + priority_len = 0, transport_len = 0, config_file_len = 0, + unit_len = 0, user_unit_len = 0, documentation_url_len = 0; + int p = LOG_INFO; + bool ellipsized = false, audit; + const ParseFieldVec fields[] = { + PARSE_FIELD_VEC_ENTRY("_PID=", &pid, &pid_len), + PARSE_FIELD_VEC_ENTRY("_COMM=", &comm, &comm_len), + PARSE_FIELD_VEC_ENTRY("MESSAGE=", &message, &message_len), + PARSE_FIELD_VEC_ENTRY("PRIORITY=", &priority, &priority_len), + PARSE_FIELD_VEC_ENTRY("_TRANSPORT=", &transport, &transport_len), + PARSE_FIELD_VEC_ENTRY("_HOSTNAME=", &hostname, &hostname_len), + PARSE_FIELD_VEC_ENTRY("SYSLOG_PID=", &fake_pid, &fake_pid_len), + PARSE_FIELD_VEC_ENTRY("SYSLOG_IDENTIFIER=", &identifier, &identifier_len), + 
PARSE_FIELD_VEC_ENTRY("CONFIG_FILE=", &config_file, &config_file_len), + PARSE_FIELD_VEC_ENTRY("_SYSTEMD_UNIT=", &unit, &unit_len), + PARSE_FIELD_VEC_ENTRY("_SYSTEMD_USER_UNIT=", &user_unit, &user_unit_len), + PARSE_FIELD_VEC_ENTRY("DOCUMENTATION=", &documentation_url, &documentation_url_len), + }; + size_t highlight_shifted[] = {highlight ? highlight[0] : 0, highlight ? highlight[1] : 0}; + + assert(f); + assert(j); + assert(display_ts); + assert(boot_id); + assert(previous_display_ts); + assert(previous_boot_id); + + /* Set the threshold to one bigger than the actual print threshold, so that if the line is actually + * longer than what we're willing to print, ellipsization will occur. This way we won't output a + * misleading line without any indication of truncation. + */ + (void) sd_journal_set_data_threshold(j, flags & (OUTPUT_SHOW_ALL|OUTPUT_FULL_WIDTH) ? 0 : PRINT_CHAR_THRESHOLD + 1); + + JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) { + r = parse_fieldv(data, length, fields, ELEMENTSOF(fields)); + if (r < 0) + return r; + } + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Skipping message we can't read: %m"); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to get journal fields: %m"); + + if (!message) { + log_debug("Skipping message without MESSAGE= field."); + return 0; + } + + if (!(flags & OUTPUT_SHOW_ALL)) + strip_tab_ansi(&message, &message_len, highlight_shifted); + + if (flags & OUTPUT_TRUNCATE_NEWLINE) + truncate_nl_full(message, &message_len); + + if (priority_len == 1 && *priority >= '0' && *priority <= '7') + p = *priority - '0'; + + audit = streq_ptr(transport, "audit"); + + if (IN_SET(mode, OUTPUT_SHORT_MONOTONIC, OUTPUT_SHORT_DELTA)) + r = output_timestamp_monotonic(f, mode, display_ts, boot_id, previous_display_ts, previous_boot_id); + else + r = output_timestamp_realtime(f, j, mode, flags, display_ts); + if (r < 0) + return r; + n += r; + + if (flags & OUTPUT_NO_HOSTNAME) { + /* Suppress display of the 
hostname if this is requested. */ + hostname = mfree(hostname); + hostname_len = 0; + } + + /* n tracks how many characters of prefix have been written so far; it is later used as the indentation/width of the message body. */ + if (hostname && shall_print(hostname, hostname_len, flags)) { + fprintf(f, " %.*s", (int) hostname_len, hostname); + n += hostname_len + 1; + } + + if (mode == OUTPUT_WITH_UNIT && ((unit && shall_print(unit, unit_len, flags)) || + (user_unit && shall_print(user_unit, user_unit_len, flags)))) { + if (unit) { + fprintf(f, " %.*s", (int) unit_len, unit); + n += unit_len + 1; + } + if (user_unit) { + if (unit) + fprintf(f, "/%.*s", (int) user_unit_len, user_unit); + else + fprintf(f, " %.*s", (int) user_unit_len, user_unit); + /* Account for the user unit just printed (plus its one-character separator), not for the system unit. */ + n += user_unit_len + 1; + } + } else if (identifier && shall_print(identifier, identifier_len, flags)) { + fprintf(f, " %.*s", (int) identifier_len, identifier); + n += identifier_len + 1; + } else if (comm && shall_print(comm, comm_len, flags)) { + fprintf(f, " %.*s", (int) comm_len, comm); + n += comm_len + 1; + } else + fputs(" unknown", f); + + if (pid && shall_print(pid, pid_len, flags)) { + fprintf(f, "[%.*s]", (int) pid_len, pid); + n += pid_len + 2; + } else if (fake_pid && shall_print(fake_pid, fake_pid_len, flags)) { + fprintf(f, "[%.*s]", (int) fake_pid_len, fake_pid); + n += fake_pid_len + 2; + } + + fputs(": ", f); + + if (urlify_enabled()) { + _cleanup_free_ char *c = NULL; + + /* Insert a hyperlink to a documentation URL before the message. Note that we don't make the + * whole message a hyperlink, since otherwise the whole screen might end up being just + * hyperlinks. Moreover, we want to be able to highlight parts of the message (such as the + * config file, see below) hence let's keep the documentation URL link separate.
*/ + + if (documentation_url && shall_print(documentation_url, documentation_url_len, flags)) { + c = strndup(documentation_url, documentation_url_len); + if (!c) + return log_oom(); + + if (!documentation_url_is_valid(c)) /* Eat up invalid links */ + c = mfree(c); + } + + if (!c) + (void) url_from_catalog(j, &c); /* Acquire from catalog if not embedded in log message itself */ + + if (c) { + _cleanup_free_ char *urlified = NULL; + + if (terminal_urlify(c, special_glyph(SPECIAL_GLYPH_EXTERNAL_LINK), &urlified) >= 0) { + fputs(urlified, f); + fputc(' ', f); + } + } + } + + if (!(flags & OUTPUT_SHOW_ALL) && !utf8_is_printable(message, message_len)) + fprintf(f, "[%s blob data]\n", FORMAT_BYTES(message_len)); + else { + + /* URLify config_file string in message, if the message starts with it. + * Skip URLification if the highlighted pattern overlaps. */ + if (config_file && + message_len >= config_file_len && + memcmp(message, config_file, config_file_len) == 0 && + (message_len == config_file_len || IN_SET(message[config_file_len], ':', ' ')) && + (!highlight || highlight_shifted[0] == 0 || highlight_shifted[0] > config_file_len)) { + + _cleanup_free_ char *t = NULL, *urlified = NULL; + + t = strndup(config_file, config_file_len); + if (t && terminal_urlify_path(t, NULL, &urlified) >= 0) { + size_t urlified_len = strlen(urlified); + size_t shift = urlified_len - config_file_len; + char *joined; + + joined = realloc(urlified, message_len + shift); + if (joined) { + memcpy(joined + urlified_len, message + config_file_len, message_len - config_file_len); + free_and_replace(message, joined); + TAKE_PTR(urlified); + message_len += shift; + if (highlight) { + highlight_shifted[0] += shift; + highlight_shifted[1] += shift; + } + } + } + } + + ellipsized |= + print_multiline(f, n + 2, n_columns, flags, p, audit, + message, message_len, + highlight_shifted); + } + + if (flags & OUTPUT_CATALOG) + (void) print_catalog(f, j); + + return ellipsized; +} + +static int 
output_verbose( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + const Set *output_fields, + const size_t highlight[2], + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const dual_timestamp *previous_display_ts, + const sd_id128_t *previous_boot_id) { + + const void *data; + size_t length; + _cleanup_free_ char *cursor = NULL; + char buf[FORMAT_TIMESTAMP_MAX + 7]; + const char *timestamp; + int r; + + assert(f); + assert(j); + assert(display_ts); + assert(boot_id); + assert(previous_display_ts); + assert(previous_boot_id); + + (void) sd_journal_set_data_threshold(j, 0); + + if (!VALID_REALTIME(display_ts->realtime)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No valid realtime timestamp available"); + + r = sd_journal_get_cursor(j, &cursor); + if (r < 0) + return log_error_errno(r, "Failed to get cursor: %m"); + + timestamp = format_timestamp_style(buf, sizeof buf, display_ts->realtime, + flags & OUTPUT_UTC ? TIMESTAMP_US_UTC : TIMESTAMP_US); + fprintf(f, "%s%s%s %s[%s]%s\n", + timestamp && (flags & OUTPUT_COLOR) ? ANSI_UNDERLINE : "", + timestamp ?: "(no timestamp)", + timestamp && (flags & OUTPUT_COLOR) ? ANSI_NORMAL : "", + (flags & OUTPUT_COLOR) ? ANSI_GREY : "", + cursor, + (flags & OUTPUT_COLOR) ? 
ANSI_NORMAL : ""); + + JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) { + _cleanup_free_ char *urlified = NULL; + const char *on = "", *off = ""; + const char *c, *p = NULL; + size_t fieldlen, valuelen; + + c = memchr(data, '=', length); + if (!c) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field."); + + fieldlen = c - (const char*) data; + if (!journal_field_valid(data, fieldlen, true)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field."); + + r = field_set_test(output_fields, data, fieldlen); + if (r < 0) + return r; + if (r == 0) + continue; + + valuelen = length - 1 - fieldlen; + p = c + 1; + + if (flags & OUTPUT_COLOR) { + if (startswith(data, "MESSAGE=")) { + on = ANSI_HIGHLIGHT; + off = ANSI_NORMAL; + } else if (startswith(data, "CONFIG_FILE=")) { + _cleanup_free_ char *u = NULL; + + u = memdup_suffix0(p, valuelen); + if (!u) + return log_oom(); + + if (terminal_urlify_path(u, NULL, &urlified) >= 0) { + p = urlified; + valuelen = strlen(urlified); + } + + } else if (startswith(data, "_")) { + /* Highlight trusted data as such */ + on = ANSI_GREEN; + off = ANSI_NORMAL; + } + } + + if ((flags & OUTPUT_SHOW_ALL) || + (((length < PRINT_CHAR_THRESHOLD) || flags & OUTPUT_FULL_WIDTH) + && utf8_is_printable(data, length))) { + fprintf(f, " %s%.*s=", on, (int) fieldlen, (const char*)data); + print_multiline(f, 4 + fieldlen + 1, 0, OUTPUT_FULL_WIDTH, 0, false, + p, valuelen, + NULL); + fputs(off, f); + } else + fprintf(f, " %s%.*s=[%s blob data]%s\n", + on, + (int) (c - (const char*) data), + (const char*) data, + FORMAT_BYTES(length - (c - (const char *) data) - 1), + off); + } + if (r < 0) + return r; + + if (flags & OUTPUT_CATALOG) + (void) print_catalog(f, j); + + return 0; +} + +static int output_export( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + const Set *output_fields, + const size_t highlight[2], + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const 
dual_timestamp *previous_display_ts, + const sd_id128_t *previous_boot_id) { + + sd_id128_t journal_boot_id, seqnum_id; + _cleanup_free_ char *cursor = NULL; + usec_t monotonic, realtime; + const void *data; + uint64_t seqnum; + size_t length; + int r; + + assert(j); + assert(display_ts); + assert(boot_id); + assert(previous_display_ts); + assert(previous_boot_id); + + (void) sd_journal_set_data_threshold(j, 0); + + r = sd_journal_get_cursor(j, &cursor); + if (r < 0) + return log_error_errno(r, "Failed to get cursor: %m"); + + r = sd_journal_get_realtime_usec(j, &realtime); + if (r < 0) + return log_error_errno(r, "Failed to get realtime timestamp: %m"); + + r = sd_journal_get_monotonic_usec(j, &monotonic, &journal_boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get monotonic timestamp: %m"); + + r = sd_journal_get_seqnum(j, &seqnum, &seqnum_id); + if (r < 0) + return log_error_errno(r, "Failed to get seqnum: %m"); + + fprintf(f, + "__CURSOR=%s\n" + "__REALTIME_TIMESTAMP=" USEC_FMT "\n" + "__MONOTONIC_TIMESTAMP=" USEC_FMT "\n" + "__SEQNUM=%" PRIu64 "\n" + "__SEQNUM_ID=%s\n" + "_BOOT_ID=%s\n", + cursor, + realtime, + monotonic, + seqnum, + SD_ID128_TO_STRING(seqnum_id), + SD_ID128_TO_STRING(journal_boot_id)); + + JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) { + size_t fieldlen; + const char *c; + + /* We already printed the boot id from the data in the header, hence let's suppress it here */ + if (memory_startswith(data, length, "_BOOT_ID=")) + continue; + + c = memchr(data, '=', length); + if (!c) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field."); + + fieldlen = c - (const char*) data; + if (!journal_field_valid(data, fieldlen, true)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field."); + + r = field_set_test(output_fields, data, fieldlen); + if (r < 0) + return r; + if (!r) + continue; + + if (utf8_is_printable_newline(data, length, false)) + fwrite(data, length, 1, f); + else { + uint64_t le64; + + fwrite(data, 
fieldlen, 1, f); + fputc('\n', f); + le64 = htole64(length - fieldlen - 1); + fwrite(&le64, sizeof(le64), 1, f); + fwrite(c + 1, length - fieldlen - 1, 1, f); + } + + fputc('\n', f); + } + if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) { + log_debug_errno(r, "Skipping message we can't read: %m"); + return 0; + } + + if (r < 0) + return r; + + fputc('\n', f); + + return 0; +} + +void json_escape( + FILE *f, + const char* p, + size_t l, + OutputFlags flags) { + + assert(f); + assert(p); + + if (!(flags & OUTPUT_SHOW_ALL) && l >= JSON_THRESHOLD) + fputs("null", f); + + else if (!(flags & OUTPUT_SHOW_ALL) && !utf8_is_printable(p, l)) { + bool not_first = false; + + fputs("[ ", f); + + while (l > 0) { + if (not_first) + fprintf(f, ", %u", (uint8_t) *p); + else { + not_first = true; + fprintf(f, "%u", (uint8_t) *p); + } + + p++; + l--; + } + + fputs(" ]", f); + } else { + fputc('"', f); + + while (l > 0) { + if (IN_SET(*p, '"', '\\')) { + fputc('\\', f); + fputc(*p, f); + } else if (*p == '\n') + fputs("\\n", f); + else if ((uint8_t) *p < ' ') + fprintf(f, "\\u%04x", (uint8_t) *p); + else + fputc(*p, f); + + p++; + l--; + } + + fputc('"', f); + } +} + +typedef struct JsonData { + JsonVariant* name; + JsonVariant* values; +} JsonData; + +static JsonData* json_data_free(JsonData *d) { + if (!d) + return NULL; + + json_variant_unref(d->name); + json_variant_unref(d->values); + + return mfree(d); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(JsonData*, json_data_free); + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(json_data_hash_ops_free, + char, string_hash_func, string_compare_func, + JsonData, json_data_free); + +static int update_json_data( + Hashmap *h, + OutputFlags flags, + const char *name, + const void *value, + size_t size) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + JsonData *d; + int r; + + assert(name); + assert(value); + + if (size == SIZE_MAX) + size = strlen(value); + + if (!(flags & OUTPUT_SHOW_ALL) && strlen(name) + 1 + size >= JSON_THRESHOLD) + r = 
json_variant_new_null(&v); + else if (utf8_is_printable(value, size)) + r = json_variant_new_stringn(&v, value, size); + else + r = json_variant_new_array_bytes(&v, value, size); + if (r < 0) + return log_error_errno(r, "Failed to allocate JSON data: %m"); + + d = hashmap_get(h, name); + if (d) { + r = json_variant_append_array(&d->values, v); + if (r < 0) + return log_error_errno(r, "Failed to append JSON value into array: %m"); + } else { + _cleanup_(json_data_freep) JsonData *e = NULL; + + e = new0(JsonData, 1); + if (!e) + return log_oom(); + + r = json_variant_new_string(&e->name, name); + if (r < 0) + return log_error_errno(r, "Failed to allocate JSON name variant: %m"); + + r = json_variant_append_array(&e->values, v); + if (r < 0) + return log_error_errno(r, "Failed to create JSON value array: %m"); + + r = hashmap_put(h, json_variant_string(e->name), e); + if (r < 0) + return log_error_errno(r, "Failed to insert JSON data into hashmap: %m"); + + TAKE_PTR(e); + } + + return 0; +} + +static int update_json_data_split( + Hashmap *h, + OutputFlags flags, + const Set *output_fields, + const void *data, + size_t size) { + + size_t fieldlen; + const char *eq; + char *name; + + assert(h); + assert(data || size == 0); + + if (memory_startswith(data, size, "_BOOT_ID=")) + return 0; + + eq = memchr(data, '=', MIN(size, JSON_THRESHOLD)); + if (!eq) + return 0; + + fieldlen = eq - (const char*) data; + if (!journal_field_valid(data, fieldlen, true)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field."); + + name = strndupa_safe(data, fieldlen); + if (output_fields && !set_contains(output_fields, name)) + return 0; + + return update_json_data(h, flags, name, eq + 1, size - fieldlen - 1); +} + +static int output_json( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + const Set *output_fields, + const size_t highlight[2], + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const dual_timestamp 
*previous_display_ts, + const sd_id128_t *previous_boot_id) { + + char usecbuf[CONST_MAX(DECIMAL_STR_MAX(usec_t), DECIMAL_STR_MAX(uint64_t))]; + _cleanup_(json_variant_unrefp) JsonVariant *object = NULL; + _cleanup_hashmap_free_ Hashmap *h = NULL; + sd_id128_t journal_boot_id, seqnum_id; + _cleanup_free_ char *cursor = NULL; + usec_t realtime, monotonic; + JsonVariant **array = NULL; + JsonData *d; + uint64_t seqnum; + size_t n = 0; + int r; + + assert(j); + assert(display_ts); + assert(boot_id); + assert(previous_display_ts); + assert(previous_boot_id); + + (void) sd_journal_set_data_threshold(j, flags & OUTPUT_SHOW_ALL ? 0 : JSON_THRESHOLD); + + r = sd_journal_get_cursor(j, &cursor); + if (r < 0) + return log_error_errno(r, "Failed to get cursor: %m"); + + r = sd_journal_get_realtime_usec(j, &realtime); + if (r < 0) + return log_error_errno(r, "Failed to get realtime timestamp: %m"); + + r = sd_journal_get_monotonic_usec(j, &monotonic, &journal_boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get monotonic timestamp: %m"); + + r = sd_journal_get_seqnum(j, &seqnum, &seqnum_id); + if (r < 0) + return log_error_errno(r, "Failed to get seqnum: %m"); + + h = hashmap_new(&json_data_hash_ops_free); + if (!h) + return log_oom(); + + r = update_json_data(h, flags, "__CURSOR", cursor, SIZE_MAX); + if (r < 0) + return r; + + xsprintf(usecbuf, USEC_FMT, realtime); + r = update_json_data(h, flags, "__REALTIME_TIMESTAMP", usecbuf, SIZE_MAX); + if (r < 0) + return r; + + xsprintf(usecbuf, USEC_FMT, monotonic); + r = update_json_data(h, flags, "__MONOTONIC_TIMESTAMP", usecbuf, SIZE_MAX); + if (r < 0) + return r; + + r = update_json_data(h, flags, "_BOOT_ID", SD_ID128_TO_STRING(journal_boot_id), SIZE_MAX); + if (r < 0) + return r; + + xsprintf(usecbuf, USEC_FMT, seqnum); + r = update_json_data(h, flags, "__SEQNUM", usecbuf, SIZE_MAX); + if (r < 0) + return r; + + r = update_json_data(h, flags, "__SEQNUM_ID", SD_ID128_TO_STRING(seqnum_id), SIZE_MAX); + if (r < 0) + 
return r; + + for (;;) { + const void *data; + size_t size; + + r = sd_journal_enumerate_data(j, &data, &size); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Skipping message we can't read: %m"); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to read journal: %m"); + if (r == 0) + break; + + r = update_json_data_split(h, flags, output_fields, data, size); + if (r < 0) + return r; + } + + array = new(JsonVariant*, hashmap_size(h)*2); + if (!array) + return log_oom(); + + CLEANUP_ARRAY(array, n, json_variant_unref_many); + + HASHMAP_FOREACH(d, h) { + assert(json_variant_elements(d->values) > 0); + + array[n++] = json_variant_ref(d->name); + + if (json_variant_elements(d->values) == 1) + array[n++] = json_variant_ref(json_variant_by_index(d->values, 0)); + else + array[n++] = json_variant_ref(d->values); + } + + r = json_variant_new_object(&object, array, n); + if (r < 0) + return log_error_errno(r, "Failed to allocate JSON object: %m"); + + return json_variant_dump(object, + output_mode_to_json_format_flags(mode) | + (FLAGS_SET(flags, OUTPUT_COLOR) ? 
JSON_FORMAT_COLOR : 0), + f, NULL); +} + +static int output_cat_field( + FILE *f, + sd_journal *j, + OutputFlags flags, + int prio, + const char *field, + const size_t highlight[2]) { + + const char *color_on = "", *color_off = "", *highlight_on = ""; + const void *data; + size_t l, fl; + int r; + + if (FLAGS_SET(flags, OUTPUT_COLOR)) + get_log_colors(prio, &color_on, &color_off, &highlight_on); + + r = sd_journal_get_data(j, field, &data, &l); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Skipping message we can't read: %m"); + return 0; + } + if (r == -ENOENT) /* An entry without the requested field */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to get data: %m"); + + fl = strlen(field); + assert(l >= fl + 1); + assert(((char*) data)[fl] == '='); + + data = (const uint8_t*) data + fl + 1; + l -= fl + 1; + + if (FLAGS_SET(flags, OUTPUT_COLOR)) { + if (highlight) { + assert(highlight[0] <= highlight[1]); + assert(highlight[1] <= l); + + fputs(color_on, f); + fwrite((const char*) data, 1, highlight[0], f); + fputs(highlight_on, f); + fwrite((const char*) data + highlight[0], 1, highlight[1] - highlight[0], f); + fputs(color_on, f); + fwrite((const char*) data + highlight[1], 1, l - highlight[1], f); + fputs(color_off, f); + } else { + fputs(color_on, f); + fwrite((const char*) data, 1, l, f); + fputs(color_off, f); + } + } else + fwrite((const char*) data, 1, l, f); + + fputc('\n', f); + return 0; +} + +static int output_cat( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + const Set *output_fields, + const size_t highlight[2], + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const dual_timestamp *previous_display_ts, + const sd_id128_t *previous_boot_id) { + + int r, prio = LOG_INFO; + const char *field; + + assert(j); + assert(f); + assert(display_ts); + assert(boot_id); + assert(previous_display_ts); + assert(previous_boot_id); + + (void) 
sd_journal_set_data_threshold(j, 0); + + if (FLAGS_SET(flags, OUTPUT_COLOR)) { + const void *data; + size_t l; + + /* Determine priority of this entry, so that we can color it nicely */ + + r = sd_journal_get_data(j, "PRIORITY", &data, &l); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Skipping message we can't read: %m"); + return 0; + } + if (r < 0) { + if (r != -ENOENT) + return log_error_errno(r, "Failed to get data: %m"); + + /* An entry without PRIORITY */ + } else if (l == 10 && memcmp(data, "PRIORITY=", 9) == 0) { + char c = ((char*) data)[9]; + + if (c >= '0' && c <= '7') + prio = c - '0'; + } + } + + if (set_isempty(output_fields)) + return output_cat_field(f, j, flags, prio, "MESSAGE", highlight); + + SET_FOREACH(field, output_fields) { + r = output_cat_field(f, j, flags, prio, field, streq(field, "MESSAGE") ? highlight : NULL); + if (r < 0) + return r; + } + + return 0; +} + +static int get_display_timestamp( + sd_journal *j, + dual_timestamp *ret_display_ts, + sd_id128_t *ret_boot_id) { + + const void *data; + _cleanup_free_ char *realtime = NULL, *monotonic = NULL; + size_t length = 0, realtime_len = 0, monotonic_len = 0; + const ParseFieldVec message_fields[] = { + PARSE_FIELD_VEC_ENTRY("_SOURCE_REALTIME_TIMESTAMP=", &realtime, &realtime_len), + PARSE_FIELD_VEC_ENTRY("_SOURCE_MONOTONIC_TIMESTAMP=", &monotonic, &monotonic_len), + }; + int r; + bool realtime_good = false, monotonic_good = false, boot_id_good = false; + + assert(j); + assert(ret_display_ts); + assert(ret_boot_id); + + JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) { + r = parse_fieldv(data, length, message_fields, ELEMENTSOF(message_fields)); + if (r < 0) + return r; + + if (realtime && monotonic) + break; + } + if (r < 0) + return r; + + if (realtime) + realtime_good = safe_atou64(realtime, &ret_display_ts->realtime) >= 0; + if (!realtime_good || !VALID_REALTIME(ret_display_ts->realtime)) + realtime_good = sd_journal_get_realtime_usec(j, &ret_display_ts->realtime) 
>= 0; + if (!realtime_good) + ret_display_ts->realtime = USEC_INFINITY; + + if (monotonic) + monotonic_good = safe_atou64(monotonic, &ret_display_ts->monotonic) >= 0; + if (!monotonic_good || !VALID_MONOTONIC(ret_display_ts->monotonic)) + monotonic_good = boot_id_good = sd_journal_get_monotonic_usec(j, &ret_display_ts->monotonic, ret_boot_id) >= 0; + if (!monotonic_good) + ret_display_ts->monotonic = USEC_INFINITY; + + if (!boot_id_good) + boot_id_good = sd_journal_get_monotonic_usec(j, NULL, ret_boot_id) >= 0; + if (!boot_id_good) + *ret_boot_id = SD_ID128_NULL; + + /* Restart all data before */ + sd_journal_restart_data(j); + sd_journal_restart_unique(j); + sd_journal_restart_fields(j); + + return 0; +} + +typedef int (*output_func_t)( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + const Set *output_fields, + const size_t highlight[2], + const dual_timestamp *display_ts, + const sd_id128_t *boot_id, + const dual_timestamp *previous_display_ts, + const sd_id128_t *previous_boot_id); + + +static output_func_t output_funcs[_OUTPUT_MODE_MAX] = { + [OUTPUT_SHORT] = output_short, + [OUTPUT_SHORT_ISO] = output_short, + [OUTPUT_SHORT_ISO_PRECISE] = output_short, + [OUTPUT_SHORT_PRECISE] = output_short, + [OUTPUT_SHORT_MONOTONIC] = output_short, + [OUTPUT_SHORT_DELTA] = output_short, + [OUTPUT_SHORT_UNIX] = output_short, + [OUTPUT_SHORT_FULL] = output_short, + [OUTPUT_VERBOSE] = output_verbose, + [OUTPUT_EXPORT] = output_export, + [OUTPUT_JSON] = output_json, + [OUTPUT_JSON_PRETTY] = output_json, + [OUTPUT_JSON_SSE] = output_json, + [OUTPUT_JSON_SEQ] = output_json, + [OUTPUT_CAT] = output_cat, + [OUTPUT_WITH_UNIT] = output_short, +}; + +int show_journal_entry( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + Set *output_fields, + const size_t highlight[2], + bool *ellipsized, + dual_timestamp *previous_display_ts, + sd_id128_t *previous_boot_id) { + + dual_timestamp display_ts = 
DUAL_TIMESTAMP_NULL; + sd_id128_t boot_id = SD_ID128_NULL; + int r; + + assert(mode >= 0); + assert(mode < _OUTPUT_MODE_MAX); + assert(previous_display_ts); + assert(previous_boot_id); + + if (n_columns <= 0) + n_columns = columns(); + + r = get_display_timestamp(j, &display_ts, &boot_id); + if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Skipping message we can't read: %m"); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to get journal fields: %m"); + + r = output_funcs[mode]( + f, + j, + mode, + n_columns, + flags, + output_fields, + highlight, + &display_ts, + &boot_id, + previous_display_ts, + previous_boot_id); + + /* Store timestamp and boot ID for next iteration */ + *previous_display_ts = display_ts; + *previous_boot_id = boot_id; + + if (ellipsized && r > 0) + *ellipsized = true; + + return r; +} + +static int maybe_print_begin_newline(FILE *f, OutputFlags *flags) { + assert(f); + assert(flags); + + if (!(*flags & OUTPUT_BEGIN_NEWLINE)) + return 0; + + /* Print a beginning new line if that's request, but only once + * on the first line we print. 
*/ + + fputc('\n', f); + *flags &= ~OUTPUT_BEGIN_NEWLINE; + return 0; +} + +int show_journal( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + usec_t not_before, + unsigned how_many, + OutputFlags flags, + bool *ellipsized) { + + int r; + unsigned line = 0; + bool need_seek = false; + int warn_cutoff = flags & OUTPUT_WARN_CUTOFF; + dual_timestamp previous_display_ts = DUAL_TIMESTAMP_NULL; + sd_id128_t previous_boot_id = SD_ID128_NULL; + + assert(j); + assert(mode >= 0); + assert(mode < _OUTPUT_MODE_MAX); + + if (how_many == UINT_MAX) + need_seek = true; + else { + /* Seek to end */ + r = sd_journal_seek_tail(j); + if (r < 0) + return log_error_errno(r, "Failed to seek to tail: %m"); + + r = sd_journal_previous_skip(j, how_many); + if (r < 0) + return log_error_errno(r, "Failed to skip previous: %m"); + } + + for (;;) { + usec_t usec; + + if (need_seek) { + r = sd_journal_next(j); + if (r < 0) + return log_error_errno(r, "Failed to iterate through journal: %m"); + } + + if (r == 0) + break; + + need_seek = true; + + if (not_before > 0) { + r = sd_journal_get_monotonic_usec(j, &usec, NULL); + + /* -ESTALE is returned if the timestamp is not from this boot */ + if (r == -ESTALE) + continue; + if (r < 0) + return log_error_errno(r, "Failed to get journal time: %m"); + + if (usec < not_before) + continue; + } + + line++; + maybe_print_begin_newline(f, &flags); + + r = show_journal_entry( + f, + j, + mode, + n_columns, + flags, + /* output_fields= */ NULL, + /* highlight= */ NULL, + ellipsized, + &previous_display_ts, + &previous_boot_id); + if (r < 0) + return r; + } + + if (warn_cutoff && line < how_many && not_before > 0) { + sd_id128_t boot_id; + usec_t cutoff = 0; + + /* Check whether the cutoff line is too early */ + + r = sd_id128_get_boot(&boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get boot id: %m"); + + r = sd_journal_get_cutoff_monotonic_usec(j, boot_id, &cutoff, NULL); + if (r < 0) + return log_error_errno(r, "Failed to 
get journal cutoff time: %m"); + + if (r > 0 && not_before < cutoff) { + maybe_print_begin_newline(f, &flags); + + /* If we logged *something* and no permission error happened, than we can reliably + * emit the warning about rotation. If we didn't log anything and access errors + * happened, emit hint about permissions. Otherwise, give a generic message, since we + * can't diagnose the issue. */ + + bool noaccess = journal_access_blocked(j); + + if (line == 0 && noaccess) + fprintf(f, "Warning: some journal files were not opened due to insufficient permissions.\n"); + else if (!noaccess) + fprintf(f, "Notice: journal has been rotated since unit was started, output may be incomplete.\n"); + else + fprintf(f, "Warning: journal has been rotated since unit was started and some journal " + "files were not opened due to insufficient permissions, output may be incomplete.\n"); + } + + warn_cutoff = false; + } + + return 0; +} + +int add_matches_for_unit(sd_journal *j, const char *unit) { + const char *m1, *m2, *m3, *m4; + int r; + + assert(j); + assert(unit); + + m1 = strjoina("_SYSTEMD_UNIT=", unit); + m2 = strjoina("COREDUMP_UNIT=", unit); + m3 = strjoina("UNIT=", unit); + m4 = strjoina("OBJECT_SYSTEMD_UNIT=", unit); + + (void)( + /* Look for messages from the service itself */ + (r = sd_journal_add_match(j, m1, 0)) || + + /* Look for coredumps of the service */ + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, "MESSAGE_ID=fc2e22bc6ee647b6b90729ab34a250b1", 0)) || + (r = sd_journal_add_match(j, "_UID=0", 0)) || + (r = sd_journal_add_match(j, m2, 0)) || + + /* Look for messages from PID 1 about this service */ + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, "_PID=1", 0)) || + (r = sd_journal_add_match(j, m3, 0)) || + + /* Look for messages from authorized daemons about this service */ + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, "_UID=0", 0)) || + (r = sd_journal_add_match(j, m4, 0)) + ); + + if (r 
== 0 && endswith(unit, ".slice")) { + const char *m5; + + m5 = strjoina("_SYSTEMD_SLICE=", unit); + + /* Show all messages belonging to a slice */ + (void)( + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m5, 0)) + ); + } + + return r; +} + +int add_matches_for_user_unit(sd_journal *j, const char *unit, uid_t uid) { + int r; + char *m1, *m2, *m3, *m4; + char muid[sizeof("_UID=") + DECIMAL_STR_MAX(uid_t)]; + + assert(j); + assert(unit); + + m1 = strjoina("_SYSTEMD_USER_UNIT=", unit); + m2 = strjoina("USER_UNIT=", unit); + m3 = strjoina("COREDUMP_USER_UNIT=", unit); + m4 = strjoina("OBJECT_SYSTEMD_USER_UNIT=", unit); + sprintf(muid, "_UID="UID_FMT, uid); + + (void) ( + /* Look for messages from the user service itself */ + (r = sd_journal_add_match(j, m1, 0)) || + (r = sd_journal_add_match(j, muid, 0)) || + + /* Look for messages from systemd about this service */ + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m2, 0)) || + (r = sd_journal_add_match(j, muid, 0)) || + + /* Look for coredumps of the service */ + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m3, 0)) || + (r = sd_journal_add_match(j, muid, 0)) || + (r = sd_journal_add_match(j, "_UID=0", 0)) || + + /* Look for messages from authorized daemons about this service */ + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m4, 0)) || + (r = sd_journal_add_match(j, muid, 0)) || + (r = sd_journal_add_match(j, "_UID=0", 0)) + ); + + if (r == 0 && endswith(unit, ".slice")) { + const char *m5; + + m5 = strjoina("_SYSTEMD_USER_SLICE=", unit); + + /* Show all messages belonging to a slice */ + (void)( + (r = sd_journal_add_disjunction(j)) || + (r = sd_journal_add_match(j, m5, 0)) || + (r = sd_journal_add_match(j, muid, 0)) + ); + } + + return r; +} + +static int get_boot_id_for_machine(const char *machine, sd_id128_t *boot_id) { + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = 
-EBADF, rootfd = -EBADF; + char buf[SD_ID128_UUID_STRING_MAX]; + pid_t pid, child; + ssize_t k; + int r; + + assert(machine); + assert(boot_id); + + r = container_get_leader(machine, &pid); + if (r < 0) + return r; + + r = namespace_open(pid, &pidnsfd, &mntnsfd, NULL, NULL, &rootfd); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0) + return -errno; + + r = namespace_fork("(sd-bootidns)", "(sd-bootid)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, + pidnsfd, mntnsfd, -1, -1, rootfd, &child); + if (r < 0) + return r; + if (r == 0) { + int fd; + + pair[0] = safe_close(pair[0]); + + fd = open("/proc/sys/kernel/random/boot_id", O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + _exit(EXIT_FAILURE); + + r = loop_read_exact(fd, buf, 36, false); + safe_close(fd); + if (r < 0) + _exit(EXIT_FAILURE); + + k = send(pair[1], buf, 36, MSG_NOSIGNAL); + if (k != 36) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + r = wait_for_terminate_and_check("(sd-bootidns)", child, 0); + if (r < 0) + return r; + if (r != EXIT_SUCCESS) + return -EIO; + + k = recv(pair[0], buf, 36, 0); + if (k != 36) + return -EIO; + + buf[36] = 0; + r = sd_id128_from_string(buf, boot_id); + if (r < 0) + return r; + + return 0; +} + +int add_match_boot_id(sd_journal *j, sd_id128_t id) { + char match[STRLEN("_BOOT_ID=") + SD_ID128_STRING_MAX]; + + assert(j); + assert(!sd_id128_is_null(id)); + + sd_id128_to_string(id, stpcpy(match, "_BOOT_ID=")); + return sd_journal_add_match(j, match, strlen(match)); +} + +int add_match_this_boot(sd_journal *j, const char *machine) { + sd_id128_t boot_id; + int r; + + assert(j); + + if (machine) { + r = get_boot_id_for_machine(machine, &boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get boot id of container %s: %m", machine); + } else { + r = sd_id128_get_boot(&boot_id); + if (r < 0) + return log_error_errno(r, "Failed to get boot id: %m"); + } + + r = add_match_boot_id(j, boot_id); + if (r < 
0) + return log_error_errno(r, "Failed to add match: %m"); + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + return 0; +} + +int show_journal_by_unit( + FILE *f, + const char *unit, + const char *log_namespace, + OutputMode mode, + unsigned n_columns, + usec_t not_before, + unsigned how_many, + uid_t uid, + OutputFlags flags, + int journal_open_flags, + bool system_unit, + bool *ellipsized) { + + _cleanup_(sd_journal_closep) sd_journal *j = NULL; + int r; + + assert(mode >= 0); + assert(mode < _OUTPUT_MODE_MAX); + assert(unit); + + if (how_many <= 0) + return 0; + + r = sd_journal_open_namespace(&j, log_namespace, journal_open_flags | SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE); + if (r < 0) + return log_error_errno(r, "Failed to open journal: %m"); + + if (system_unit) + r = add_matches_for_unit(j, unit); + else + r = add_matches_for_user_unit(j, unit, uid); + if (r < 0) + return log_error_errno(r, "Failed to add unit matches: %m"); + + r = sd_journal_add_conjunction(j); + if (r < 0) + return log_error_errno(r, "Failed to add conjunction: %m"); + + r = add_match_this_boot(j, NULL); + if (r < 0) + return r; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *filter = NULL; + + filter = journal_make_match_string(j); + if (!filter) + return log_oom(); + + log_debug("Journal filter: %s", filter); + } + + return show_journal(f, j, mode, n_columns, not_before, how_many, flags, ellipsized); +} + +static int discover_next_boot( + sd_journal *j, + sd_id128_t previous_boot_id, + bool advance_older, + BootId *ret) { + + _cleanup_set_free_ Set *broken_ids = NULL; + int r; + + assert(j); + assert(ret); + + /* We expect the journal to be on the last position of a boot + * (in relation to the direction we are going), so that the next + * invocation of sd_journal_next/previous will be from a different + * boot. 
We then collect any information we desire and then jump + * to the last location of the new boot by using a _BOOT_ID match + * coming from the other journal direction. */ + + /* Make sure we aren't restricted by any _BOOT_ID matches, so that + * we can actually advance to a *different* boot. */ + sd_journal_flush_matches(j); + + for (;;) { + sd_id128_t *id_dup; + BootId boot; + + r = sd_journal_step_one(j, !advance_older); + if (r < 0) + return r; + if (r == 0) { + *ret = (BootId) {}; + return 0; /* End of journal, yay. */ + } + + r = sd_journal_get_monotonic_usec(j, NULL, &boot.id); + if (r < 0) + return r; + + /* We iterate through this in a loop, until the boot ID differs from the previous one. Note that + * normally, this will only require a single iteration, as we moved to the last entry of the previous + * boot entry already. However, it might happen that the per-journal-field entry arrays are less + * complete than the main entry array, and hence might reference an entry that's not actually the last + * one of the boot ID as last one. Let's hence use the per-field array is initial seek position to + * speed things up, but let's not trust that it is complete, and hence, manually advance as + * necessary. */ + + if (!sd_id128_is_null(previous_boot_id) && sd_id128_equal(boot.id, previous_boot_id)) + continue; + + if (set_contains(broken_ids, &boot.id)) + continue; + + /* Yay, we found a new boot ID from the entry object. Let's check there exist corresponding + * entries matching with the _BOOT_ID= data. */ + + r = add_match_boot_id(j, boot.id); + if (r < 0) + return r; + + /* First, seek to the first (or the last when we are going upwards) occurrence of this boot ID. + * You may think this is redundant. Yes, that's redundant unless the journal is corrupted. + * But when the journal is corrupted, especially, badly 'truncated', then the below may fail. + * See https://github.com/systemd/systemd/pull/29334#issuecomment-1736567951. 
*/ + if (advance_older) + r = sd_journal_seek_tail(j); + else + r = sd_journal_seek_head(j); + if (r < 0) + return r; + + r = sd_journal_step_one(j, 0); + if (r < 0) + return r; + if (r == 0) { + log_debug("Whoopsie! We found a boot ID %s but can't read its first entry. " + "The journal seems to be corrupted. Ignoring the boot ID.", + SD_ID128_TO_STRING(boot.id)); + goto try_again; + } + + r = sd_journal_get_realtime_usec(j, &boot.first_usec); + if (r < 0) + return r; + + /* Next, seek to the last occurrence of this boot ID. */ + if (advance_older) + r = sd_journal_seek_head(j); + else + r = sd_journal_seek_tail(j); + if (r < 0) + return r; + + r = sd_journal_step_one(j, 0); + if (r < 0) + return r; + if (r == 0) { + log_debug("Whoopsie! We found a boot ID %s but can't read its last entry. " + "The journal seems to be corrupted. Ignoring the boot ID.", + SD_ID128_TO_STRING(boot.id)); + goto try_again; + } + + r = sd_journal_get_realtime_usec(j, &boot.last_usec); + if (r < 0) + return r; + + sd_journal_flush_matches(j); + *ret = boot; + return 1; + + try_again: + /* Save the bad boot ID. */ + id_dup = newdup(sd_id128_t, &boot.id, 1); + if (!id_dup) + return -ENOMEM; + + r = set_ensure_consume(&broken_ids, &id128_hash_ops_free, id_dup); + if (r < 0) + return r; + + /* Move to the previous position again. */ + sd_journal_flush_matches(j); + + if (!sd_id128_is_null(previous_boot_id)) { + r = add_match_boot_id(j, previous_boot_id); + if (r < 0) + return r; + } + + if (advance_older) + r = sd_journal_seek_head(j); + else + r = sd_journal_seek_tail(j); + if (r < 0) + return r; + + r = sd_journal_step_one(j, 0); + if (r < 0) + return r; + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENODATA), + "Whoopsie! 
Cannot seek to the last entry of boot %s.", + SD_ID128_TO_STRING(previous_boot_id)); + + sd_journal_flush_matches(j); + } +} + +int journal_find_boot_by_id(sd_journal *j, sd_id128_t boot_id) { + int r; + + assert(j); + assert(!sd_id128_is_null(boot_id)); + + sd_journal_flush_matches(j); + + r = add_match_boot_id(j, boot_id); + if (r < 0) + return r; + + r = sd_journal_seek_head(j); /* seek to oldest */ + if (r < 0) + return r; + + r = sd_journal_next(j); /* read the oldest entry */ + if (r < 0) + return r; + + /* At this point the read pointer is positioned at the oldest occurrence of the reference boot ID. + * After flushing the matches, one more invocation of _previous() will hence place us at the + * following entry, which must then have an older boot ID */ + + sd_journal_flush_matches(j); + return r > 0; +} + +int journal_find_boot_by_offset(sd_journal *j, int offset, sd_id128_t *ret) { + bool advance_older; + int r; + + assert(j); + assert(ret); + + /* Adjust for the asymmetry that offset 0 is the last (and current) boot, while 1 is considered the + * (chronological) first boot in the journal. */ + advance_older = offset <= 0; + + if (advance_older) + r = sd_journal_seek_tail(j); /* seek to newest */ + else + r = sd_journal_seek_head(j); /* seek to oldest */ + if (r < 0) + return r; + + /* No sd_journal_next()/_previous() here. + * + * At this point the read pointer is positioned after the newest/before the oldest entry in the whole + * journal. The next invocation of _previous()/_next() will hence position us at the newest/oldest + * entry we have. */ + + sd_id128_t boot_id = SD_ID128_NULL; + for (int off = !advance_older; ; off += advance_older ? 
-1 : 1) { + BootId boot; + + r = discover_next_boot(j, boot_id, advance_older, &boot); + if (r < 0) + return r; + if (r == 0) { + *ret = SD_ID128_NULL; + return false; + } + + boot_id = boot.id; + log_debug("Found boot ID %s by offset %i", SD_ID128_TO_STRING(boot_id), off); + + if (off == offset) + break; + } + + *ret = boot_id; + return true; +} + +int journal_get_boots(sd_journal *j, BootId **ret_boots, size_t *ret_n_boots) { + _cleanup_free_ BootId *boots = NULL; + size_t n_boots = 0; + int r; + + assert(j); + assert(ret_boots); + assert(ret_n_boots); + + r = sd_journal_seek_head(j); /* seek to oldest */ + if (r < 0) + return r; + + /* No sd_journal_next()/_previous() here. + * + * At this point the read pointer is positioned before the oldest entry in the whole journal. The + * next invocation of _next() will hence position us at the oldest entry we have. */ + + sd_id128_t previous_boot_id = SD_ID128_NULL; + for (;;) { + BootId boot; + + r = discover_next_boot(j, previous_boot_id, /* advance_older = */ false, &boot); + if (r < 0) + return r; + if (r == 0) + break; + + previous_boot_id = boot.id; + + FOREACH_ARRAY(i, boots, n_boots) + if (sd_id128_equal(i->id, boot.id)) + /* The boot id is already stored, something wrong with the journal files. + * Exiting as otherwise this problem would cause an infinite loop. 
*/ + break; + + if (!GREEDY_REALLOC(boots, n_boots + 1)) + return -ENOMEM; + + boots[n_boots++] = boot; + } + + *ret_boots = TAKE_PTR(boots); + *ret_n_boots = n_boots; + return n_boots > 0; +} diff --git a/src/shared/logs-show.h b/src/shared/logs-show.h new file mode 100644 index 0000000..3a8ce8b --- /dev/null +++ b/src/shared/logs-show.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "sd-id128.h" +#include "sd-journal.h" + +#include "macro.h" +#include "output-mode.h" +#include "time-util.h" + +typedef struct BootId { + sd_id128_t id; + usec_t first_usec; + usec_t last_usec; +} BootId; + +int show_journal_entry( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + OutputFlags flags, + Set *output_fields, + const size_t highlight[2], + bool *ellipsized, + dual_timestamp *previous_display_ts, + sd_id128_t *previous_boot_id); +int show_journal( + FILE *f, + sd_journal *j, + OutputMode mode, + unsigned n_columns, + usec_t not_before, + unsigned how_many, + OutputFlags flags, + bool *ellipsized); + +int add_match_boot_id(sd_journal *j, sd_id128_t id); +int add_match_this_boot(sd_journal *j, const char *machine); + +int add_matches_for_unit( + sd_journal *j, + const char *unit); + +int add_matches_for_user_unit( + sd_journal *j, + const char *unit, + uid_t uid); + +int show_journal_by_unit( + FILE *f, + const char *unit, + const char *namespace, + OutputMode mode, + unsigned n_columns, + usec_t not_before, + unsigned how_many, + uid_t uid, + OutputFlags flags, + int journal_open_flags, + bool system_unit, + bool *ellipsized); + +void json_escape( + FILE *f, + const char* p, + size_t l, + OutputFlags flags); + +int journal_find_boot_by_id(sd_journal *j, sd_id128_t boot_id); +int journal_find_boot_by_offset(sd_journal *j, int offset, sd_id128_t *ret); +int journal_get_boots(sd_journal *j, BootId **ret_boots, size_t *ret_n_boots); diff --git a/src/shared/loop-util.c 
b/src/shared/loop-util.c new file mode 100644 index 0000000..5860303 --- /dev/null +++ b/src/shared/loop-util.c @@ -0,0 +1,1209 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_VALGRIND_MEMCHECK_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "data-fd-util.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dissect-image.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "fileio.h" +#include "loop-util.h" +#include "missing_loop.h" +#include "parse-util.h" +#include "path-util.h" +#include "random-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tmpfile-util.h" + +static void cleanup_clear_loop_close(int *fd) { + if (*fd < 0) + return; + + (void) ioctl(*fd, LOOP_CLR_FD); + (void) safe_close(*fd); +} + +static int loop_is_bound(int fd) { + struct loop_info64 info; + + if (ioctl(ASSERT_FD(fd), LOOP_GET_STATUS64, &info) < 0) { + if (errno == ENXIO) + return false; /* not bound! */ + + return -errno; + } + + return true; /* bound! 
*/ +} + +static int get_current_uevent_seqnum(uint64_t *ret) { + _cleanup_free_ char *p = NULL; + int r; + + r = read_full_virtual_file("/sys/kernel/uevent_seqnum", &p, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to read current uevent sequence number: %m"); + + r = safe_atou64(strstrip(p), ret); + if (r < 0) + return log_debug_errno(r, "Failed to parse current uevent sequence number: %s", p); + + return 0; +} + +static int open_lock_fd(int primary_fd, int operation) { + _cleanup_close_ int lock_fd = -EBADF; + + assert(IN_SET(operation & ~LOCK_NB, LOCK_SH, LOCK_EX)); + + lock_fd = fd_reopen(ASSERT_FD(primary_fd), O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (lock_fd < 0) + return lock_fd; + + if (flock(lock_fd, operation) < 0) + return -errno; + + return TAKE_FD(lock_fd); +} + +static int loop_configure_verify_direct_io(int fd, const struct loop_config *c) { + assert(fd >= 0); + assert(c); + + if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO)) { + struct loop_info64 info; + + if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) + return log_debug_errno(errno, "Failed to issue LOOP_GET_STATUS64: %m"); + +#if HAVE_VALGRIND_MEMCHECK_H + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + + /* On older kernels (<= 5.3) it was necessary to set the block size of the loopback block + * device to the logical block size of the underlying file system. Since there was no nice + * way to query the value, we are not bothering to do this however. On newer kernels the + * block size is propagated automatically and does not require intervention from us. We'll + * check here if enabling direct IO worked, to make this easily debuggable however. + * + * (Should anyone really care and actually wants direct IO on old kernels: it might be worth + * enabling direct IO with iteratively larger block sizes until it eventually works.) 
+ * + * On older kernels (e.g.: 5.10) when this is attempted on a file stored on a dm-crypt + * backed partition the kernel will start returning I/O errors when accessing the mounted + * loop device, so return a recognizable error that causes the operation to be started + * from scratch without the LO_FLAGS_DIRECT_IO flag. */ + if (!FLAGS_SET(info.lo_flags, LO_FLAGS_DIRECT_IO)) + return log_debug_errno( + SYNTHETIC_ERRNO(ENOANO), + "Could not enable direct IO mode, retrying in buffered IO mode."); + } + + return 0; +} + +static int loop_configure_verify(int fd, const struct loop_config *c) { + bool broken = false; + int r; + + assert(fd >= 0); + assert(c); + + if (c->block_size != 0) { + uint32_t ssz; + + r = blockdev_get_sector_size(fd, &ssz); + if (r < 0) + return r; + + if (ssz != c->block_size) { + log_debug("LOOP_CONFIGURE didn't honour requested block size %" PRIu32 ", got %" PRIu32 " instead. Ignoring.", c->block_size, ssz); + broken = true; + } + } + + if (c->info.lo_sizelimit != 0) { + /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the + * block device. If it's used, let's immediately check if it had the desired + * effect hence. And if not use classic LOOP_SET_STATUS64. */ + uint64_t z; + + if (ioctl(fd, BLKGETSIZE64, &z) < 0) + return -errno; + + if (z != c->info.lo_sizelimit) { + log_debug("LOOP_CONFIGURE is broken, doesn't honour .info.lo_sizelimit. Falling back to LOOP_SET_STATUS64."); + broken = true; + } + } + + if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) { + /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag + * into the block device. Let's hence verify if things work correctly here + * before returning. */ + + r = blockdev_partscan_enabled(fd); + if (r < 0) + return r; + if (r == 0) { + log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. 
Falling back to LOOP_SET_STATUS64."); + broken = true; + } + } + + r = loop_configure_verify_direct_io(fd, c); + if (r < 0) + return r; + + return !broken; +} + +static int loop_configure_fallback(int fd, const struct loop_config *c) { + struct loop_info64 info_copy; + int r; + + assert(fd >= 0); + assert(c); + + /* Only some of the flags LOOP_CONFIGURE can set are also settable via LOOP_SET_STATUS64, hence mask + * them out. */ + info_copy = c->info; + info_copy.lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; + + /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64 + * ioctl can return EAGAIN in case we change the info.lo_offset field, if someone else is accessing the + * block device while we try to reconfigure it. This is a pretty common case, since udev might + * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways: + * first, let's take the BSD lock to ensure that udev will not step in between the point in + * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on + * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms + * needlessly if we are just racing against udev. The latter is protection against all other cases, + * i.e. peers that do not take the BSD lock. */ + + for (unsigned n_attempts = 0;;) { + if (ioctl(fd, LOOP_SET_STATUS64, &info_copy) >= 0) + break; + + if (errno != EAGAIN || ++n_attempts >= 64) + return log_debug_errno(errno, "Failed to configure loopback block device: %m"); + + /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more + * failed attempts we see */ + (void) usleep_safe(UINT64_C(10) * USEC_PER_MSEC + + random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); + } + + /* Work around a kernel bug, where changing offset/size of the loopback device doesn't correctly + * invalidate the buffer cache. 
For details see: + * + * https://android.googlesource.com/platform/system/apex/+/bef74542fbbb4cd629793f4efee8e0053b360570 + * + * This was fixed in kernel 5.0, see: + * + * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5db470e229e22b7eda6e23b5566e532c96fb5bc3 + * + * We'll run the work-around here in the legacy LOOP_SET_STATUS64 codepath. In the LOOP_CONFIGURE + * codepath above it should not be necessary. */ + if (c->info.lo_offset != 0 || c->info.lo_sizelimit != 0) + if (ioctl(fd, BLKFLSBUF, 0) < 0) + log_debug_errno(errno, "Failed to issue BLKFLSBUF ioctl, ignoring: %m"); + + /* If a block size is requested then try to configure it. If that doesn't work, ignore errors, but + * afterwards, let's validate what is in effect, and if it doesn't match what we want, fail */ + if (c->block_size != 0) { + uint32_t ssz; + + if (ioctl(fd, LOOP_SET_BLOCK_SIZE, (unsigned long) c->block_size) < 0) + log_debug_errno(errno, "Failed to set sector size, ignoring: %m"); + + r = blockdev_get_sector_size(fd, &ssz); + if (r < 0) + return log_debug_errno(r, "Failed to read sector size: %m"); + if (ssz != c->block_size) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Sector size of loopback device doesn't match what we requested, refusing."); + } + + /* LO_FLAGS_DIRECT_IO is a flags we need to configure via explicit ioctls. */ + if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO)) + if (ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0) + log_debug_errno(errno, "Failed to enable direct IO mode, ignoring: %m"); + + return loop_configure_verify_direct_io(fd, c); +} + +static int loop_configure( + int nr, + int open_flags, + int lock_op, + const struct loop_config *c, + LoopDevice **ret) { + + static bool loop_configure_broken = false; + + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -EBADF; /* This must be declared before lock_fd. 
*/ + _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF; + _cleanup_free_ char *node = NULL; + uint64_t diskseq = 0, seqnum = UINT64_MAX; + usec_t timestamp = USEC_INFINITY; + dev_t devno; + int r; + + assert(nr >= 0); + assert(c); + assert(ret); + + if (asprintf(&node, "/dev/loop%i", nr) < 0) + return log_oom_debug(); + + r = sd_device_new_from_devname(&dev, node); + if (r < 0) + return log_debug_errno(r, "Failed to create sd_device object for \"%s\": %m", node); + + r = sd_device_get_devnum(dev, &devno); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get devnum: %m"); + + fd = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); + if (fd < 0) + return log_device_debug_errno(dev, fd, "Failed to open device: %m"); + + /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened + * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on + * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure + * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a + * long time udev would possibly never run on it again, even though the fd is unlocked, simply + * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to + * automatically release the lock, after we are done. */ + lock_fd = open_lock_fd(fd, LOCK_EX); + if (lock_fd < 0) + return log_device_debug_errno(dev, lock_fd, "Failed to acquire lock: %m"); + + log_device_debug(dev, "Acquired exclusive lock."); + + /* Let's see if backing file is really unattached. Someone may already attach a backing file without + * taking BSD lock. 
*/ + r = loop_is_bound(fd); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to check if the loopback block device is bound: %m"); + if (r > 0) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EBUSY), + "The loopback block device is already bound, ignoring."); + + /* Let's see if the device is really detached, i.e. currently has no associated partition block + * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that + * superficially is detached but still has partition block devices associated for it. Let's then + * manually remove the partitions via BLKPG, and tell the caller we did that via EUCLEAN, so they try + * again. */ + r = block_device_remove_all_partitions(dev, fd); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to remove partitions on the loopback block device: %m"); + if (r > 0) + /* Removed all partitions. Let's report this to the caller, to try again, and count this as + * an attempt. */ + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EUCLEAN), + "Removed partitions on the loopback block device."); + + if (!loop_configure_broken) { + /* Acquire uevent seqnum immediately before attaching the loopback device. This allows + * callers to ignore all uevents with a seqnum before this one, if they need to associate + * uevent with this attachment. Doing so isn't race-free though, as uevents that happen in + * the window between this reading of the seqnum, and the LOOP_CONFIGURE call might still be + * mistaken as originating from our attachment, even though might be caused by an earlier + * use. But doing this at least shortens the race window a bit. */ + r = get_current_uevent_seqnum(&seqnum); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get the current uevent seqnum: %m"); + + timestamp = now(CLOCK_MONOTONIC); + + if (ioctl(fd, LOOP_CONFIGURE, c) < 0) { + /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other + * errors. 
Note that the kernel is weird: non-existing ioctls currently return EINVAL + * rather than ENOTTY on loopback block devices. They should fix that in the kernel, + * but in the meantime we accept both here. */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) + return log_device_debug_errno(dev, errno, "ioctl(LOOP_CONFIGURE) failed: %m"); + + loop_configure_broken = true; + } else { + loop_with_fd = TAKE_FD(fd); + + r = loop_configure_verify(loop_with_fd, c); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to verify if loopback block device is correctly configured: %m"); + if (r == 0) { + /* LOOP_CONFIGURE doesn't work. Remember that. */ + loop_configure_broken = true; + + /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD, + * because LOOP_CLR_FD is async: if the operation cannot be executed right + * away it just sets the autoclear flag on the device. This means there's a + * good chance we cannot actually reuse the loopback device right-away. Hence + * let's assume it's busy, avoid the trouble and let the calling loop call us + * again with a new, likely unused device. */ + return -EBUSY; + } + } + } + + if (loop_configure_broken) { + /* Let's read the seqnum again, to shorten the window. 
*/ + r = get_current_uevent_seqnum(&seqnum); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get the current uevent seqnum: %m"); + + timestamp = now(CLOCK_MONOTONIC); + + if (ioctl(fd, LOOP_SET_FD, c->fd) < 0) + return log_device_debug_errno(dev, errno, "ioctl(LOOP_SET_FD) failed: %m"); + + loop_with_fd = TAKE_FD(fd); + + r = loop_configure_fallback(loop_with_fd, c); + if (r < 0) + return r; + } + + r = fd_get_diskseq(loop_with_fd, &diskseq); + if (r < 0 && r != -EOPNOTSUPP) + return log_device_debug_errno(dev, r, "Failed to get diskseq: %m"); + + switch (lock_op & ~LOCK_NB) { + case LOCK_EX: /* Already in effect */ + break; + case LOCK_SH: /* Downgrade */ + if (flock(lock_fd, lock_op) < 0) + return log_device_debug_errno(dev, errno, "Failed to downgrade lock level: %m"); + break; + case LOCK_UN: /* Release */ + lock_fd = safe_close(lock_fd); + break; + default: + assert_not_reached(); + } + + LoopDevice *d = new(LoopDevice, 1); + if (!d) + return log_oom_debug(); + + *d = (LoopDevice) { + .n_ref = 1, + .fd = TAKE_FD(loop_with_fd), + .lock_fd = TAKE_FD(lock_fd), + .node = TAKE_PTR(node), + .nr = nr, + .devno = devno, + .dev = TAKE_PTR(dev), + .diskseq = diskseq, + .uevent_seqnum_not_before = seqnum, + .timestamp_not_before = timestamp, + .sector_size = c->block_size, + }; + + *ret = TAKE_PTR(d); + return 0; +} + +static int loop_device_make_internal( + const char *path, + int fd, + int open_flags, + uint64_t offset, + uint64_t size, + uint32_t sector_size, + uint32_t loop_flags, + int lock_op, + LoopDevice **ret) { + + _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; + _cleanup_close_ int reopened_fd = -EBADF, control = -EBADF; + _cleanup_free_ char *backing_file = NULL; + struct loop_config config; + int r, f_flags; + struct stat st; + + assert(ret); + assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); + + if (fstat(ASSERT_FD(fd), &st) < 0) + return -errno; + + if (S_ISBLK(st.st_mode)) { + if (offset == 0 && IN_SET(size, 0, UINT64_MAX)) + /* If this 
is already a block device and we are supposed to cover the whole of it + * then store an fd to the original open device node — and do not actually create an + * unnecessary loopback device for it. */ + return loop_device_open_from_fd(fd, open_flags, lock_op, ret); + } else { + r = stat_verify_regular(&st); + if (r < 0) + return r; + } + + if (path) { + r = path_make_absolute_cwd(path, &backing_file); + if (r < 0) + return r; + + path_simplify(backing_file); + } else { + r = fd_get_path(fd, &backing_file); + if (r < 0) + return r; + } + + f_flags = fcntl(fd, F_GETFL); + if (f_flags < 0) + return -errno; + + if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) != FLAGS_SET(f_flags, O_DIRECT)) { + /* If LO_FLAGS_DIRECT_IO is requested, then make sure we have the fd open with O_DIRECT, as + * that's required. Conversely, if it's off require that O_DIRECT is off too (that's because + * new kernels will implicitly enable LO_FLAGS_DIRECT_IO if O_DIRECT is set). + * + * Our intention here is that LO_FLAGS_DIRECT_IO is the primary knob, and O_DIRECT derived + * from that automatically. */ + + reopened_fd = fd_reopen(fd, (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0)|O_CLOEXEC|O_NONBLOCK|open_flags); + if (reopened_fd < 0) { + if (!FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) + return log_debug_errno(reopened_fd, "Failed to reopen file descriptor without O_DIRECT: %m"); + + /* Some file systems might not support O_DIRECT, let's gracefully continue without it then. */ + log_debug_errno(reopened_fd, "Failed to enable O_DIRECT for backing file descriptor for loopback device. 
Continuing without."); + loop_flags &= ~LO_FLAGS_DIRECT_IO; + } else + fd = reopened_fd; /* From now on, operate on our new O_DIRECT fd */ + } + + control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (control < 0) + return -errno; + + if (sector_size == 0) + /* If no sector size is specified, default to the classic default */ + sector_size = 512; + else if (sector_size == UINT32_MAX) { + + if (S_ISBLK(st.st_mode)) + /* If the sector size is specified as UINT32_MAX we'll propagate the sector size of + * the underlying block device. */ + r = blockdev_get_sector_size(fd, &sector_size); + else { + _cleanup_close_ int non_direct_io_fd = -EBADF; + int probe_fd; + + assert(S_ISREG(st.st_mode)); + + /* If sector size is specified as UINT32_MAX, we'll try to probe the right sector + * size of the image in question by looking for the GPT partition header at various + * offsets. This of course only works if the image already has a disk label. + * + * So here we actually want to read the file contents ourselves. This is quite likely + * not going to work if we managed to enable O_DIRECT, because in such a case there + * are some pretty strict alignment requirements to offset, size and target, but + * there's no way to query what alignment specifically is actually required. Hence, + * let's avoid the mess, and temporarily open an fd without O_DIRECT for the probing + * logic. 
*/ + + if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) { + non_direct_io_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (non_direct_io_fd < 0) + return non_direct_io_fd; + + probe_fd = non_direct_io_fd; + } else + probe_fd = fd; + + r = probe_sector_size(probe_fd, &sector_size); + } + if (r < 0) + return r; + } + + config = (struct loop_config) { + .fd = fd, + .block_size = sector_size, + .info = { + /* Use the specified flags, but configure the read-only flag from the open flags, and force autoclear */ + .lo_flags = (loop_flags & ~LO_FLAGS_READ_ONLY) | ((open_flags & O_ACCMODE) == O_RDONLY ? LO_FLAGS_READ_ONLY : 0) | LO_FLAGS_AUTOCLEAR, + .lo_offset = offset, + .lo_sizelimit = size == UINT64_MAX ? 0 : size, + }, + }; + + /* Loop around LOOP_CTL_GET_FREE, since at the moment we attempt to open the returned device it might + * be gone already, taken by somebody else racing against us. */ + for (unsigned n_attempts = 0;;) { + usec_t usec; + int nr; + + /* Let's take a lock on the control device first. On a busy system, where many programs + * attempt to allocate a loopback device at the same time, we might otherwise keep looping + * around relatively heavy operations: asking for a free loopback device, then opening it, + * validating it, attaching something to it. Let's serialize this whole operation, to make + * unnecessary busywork less likely. Note that this is just something we do to optimize our + * own code (and whoever else decides to use LOCK_EX locks for this), taking this lock is not + * necessary, it just means it's less likely we have to iterate through this loop again and + * again if our own code races against our own code. + * + * Note: our lock protocol is to take the /dev/loop-control lock first, and the block device + * lock second, if both are taken, and always in this order, to avoid ABBA locking issues. 
*/ + if (flock(control, LOCK_EX) < 0) + return -errno; + + nr = ioctl(control, LOOP_CTL_GET_FREE); + if (nr < 0) + return -errno; + + r = loop_configure(nr, open_flags, lock_op, &config, &d); + if (r >= 0) + break; + + /* -ENODEV or friends: Somebody might've gotten the same number from the kernel, used the + * device, and called LOOP_CTL_REMOVE on it. Let's retry with a new number. + * -EBUSY: a file descriptor is already bound to the loopback block device. + * -EUCLEAN: some left-over partition devices that were cleaned up. + * -ENOANO: we tried to use LO_FLAGS_DIRECT_IO but the kernel rejected it. */ + if (!ERRNO_IS_DEVICE_ABSENT(r) && !IN_SET(r, -EBUSY, -EUCLEAN, -ENOANO)) + return r; + + /* OK, this didn't work, let's try again a bit later, but first release the lock on the + * control device */ + if (flock(control, LOCK_UN) < 0) + return -errno; + + if (++n_attempts >= 64) /* Give up eventually */ + return -EBUSY; + + /* If we failed to enable direct IO mode, let's retry without it. We restart the process as + * on some combination of kernel version and storage filesystem, the kernel is very unhappy + * about a failed DIRECT_IO enablement and throws I/O errors. */ + if (r == -ENOANO && FLAGS_SET(config.info.lo_flags, LO_FLAGS_DIRECT_IO)) { + config.info.lo_flags &= ~LO_FLAGS_DIRECT_IO; + open_flags &= ~O_DIRECT; + + int non_direct_io_fd = fd_reopen(config.fd, O_CLOEXEC|O_NONBLOCK|open_flags); + if (non_direct_io_fd < 0) + return log_debug_errno( + non_direct_io_fd, + "Failed to reopen file descriptor without O_DIRECT: %m"); + + safe_close(reopened_fd); + fd = config.fd = /* For cleanups */ reopened_fd = non_direct_io_fd; + } + + /* Wait some random time, to make collision less likely. Let's pick a random time in the + * range 0ms…250ms, linearly scaled by the number of failed attempts. 
*/ + usec = random_u64_range(UINT64_C(10) * USEC_PER_MSEC + + UINT64_C(240) * USEC_PER_MSEC * n_attempts/64); + log_debug("Trying again after %s.", FORMAT_TIMESPAN(usec, USEC_PER_MSEC)); + (void) usleep_safe(usec); + } + + d->backing_file = TAKE_PTR(backing_file); + d->backing_inode = st.st_ino; + d->backing_devno = st.st_dev; + + log_debug("Successfully acquired %s, devno=%u:%u, nr=%i, diskseq=%" PRIu64, + d->node, + major(d->devno), minor(d->devno), + d->nr, + d->diskseq); + + *ret = TAKE_PTR(d); + return 0; +} + +static uint32_t loop_flags_mangle(uint32_t loop_flags) { + int r; + + r = getenv_bool("SYSTEMD_LOOP_DIRECT_IO"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_LOOP_DIRECT_IO, ignoring: %m"); + + return UPDATE_FLAG(loop_flags, LO_FLAGS_DIRECT_IO, r != 0); /* Turn on LO_FLAGS_DIRECT_IO by default, unless explicitly configured to off. */ +} + +int loop_device_make( + int fd, + int open_flags, + uint64_t offset, + uint64_t size, + uint32_t sector_size, + uint32_t loop_flags, + int lock_op, + LoopDevice **ret) { + + assert(fd >= 0); + assert(ret); + + return loop_device_make_internal( + NULL, + fd, + open_flags, + offset, + size, + sector_size, + loop_flags_mangle(loop_flags), + lock_op, + ret); +} + +int loop_device_make_by_path_at( + int dir_fd, + const char *path, + int open_flags, + uint32_t sector_size, + uint32_t loop_flags, + int lock_op, + LoopDevice **ret) { + + int r, basic_flags, direct_flags, rdwr_flags; + _cleanup_close_ int fd = -EBADF; + bool direct = false; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + assert(ret); + assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY)); + + /* Passing < 0 as open_flags here means we'll try to open the device writable if we can, retrying + * read-only if we cannot. */ + + loop_flags = loop_flags_mangle(loop_flags); + + /* Let's open with O_DIRECT if we can. 
But not all file systems support that, hence fall back to + * non-O_DIRECT mode automatically, if it fails. */ + + basic_flags = O_CLOEXEC|O_NONBLOCK|O_NOCTTY; + direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0; + rdwr_flags = open_flags >= 0 ? open_flags : O_RDWR; + + fd = xopenat(dir_fd, path, basic_flags|direct_flags|rdwr_flags, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0 && direct_flags != 0) /* If we had O_DIRECT on, and things failed with that, let's immediately try again without */ + fd = xopenat(dir_fd, path, basic_flags|rdwr_flags, /* xopen_flags = */ 0, /* mode = */ 0); + else + direct = direct_flags != 0; + if (fd < 0) { + r = fd; + + /* Retry read-only? */ + if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS)) + return r; + + fd = xopenat(dir_fd, path, basic_flags|direct_flags|O_RDONLY, /* xopen_flags = */ 0, /* mode = */ 0); + if (fd < 0 && direct_flags != 0) /* as above */ + fd = xopenat(dir_fd, path, basic_flags|O_RDONLY, /* xopen_flags = */ 0, /* mode = */ 0); + else + direct = direct_flags != 0; + if (fd < 0) + return r; /* Propagate original error */ + + open_flags = O_RDONLY; + } else if (open_flags < 0) + open_flags = O_RDWR; + + log_debug("Opened '%s' in %s access mode%s, with O_DIRECT %s%s.", + path, + open_flags == O_RDWR ? "O_RDWR" : "O_RDONLY", + open_flags != rdwr_flags ? " (O_RDWR was requested but not allowed)" : "", + direct ? "enabled" : "disabled", + direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : ""); + + return loop_device_make_internal( + dir_fd == AT_FDCWD ? 
path : NULL, + fd, + open_flags, + /* offset = */ 0, + /* size = */ 0, + sector_size, + loop_flags, + lock_op, + ret); +} + +int loop_device_make_by_path_memory( + const char *path, + int open_flags, + uint32_t sector_size, + uint32_t loop_flags, + int lock_op, + LoopDevice **ret) { + + _cleanup_close_ int fd = -EBADF, mfd = -EBADF; + _cleanup_free_ char *fn = NULL; + struct stat st; + int r; + + assert(path); + assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); + assert(ret); + + loop_flags &= ~LO_FLAGS_DIRECT_IO; /* memfds don't support O_DIRECT, hence LO_FLAGS_DIRECT_IO can't be used either */ + + fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISREG(st.st_mode) && !S_ISBLK(st.st_mode)) + return -EBADF; + + r = path_extract_filename(path, &fn); + if (r < 0) + return r; + + mfd = memfd_clone_fd(fd, fn, open_flags|O_CLOEXEC); + if (mfd < 0) + return mfd; + + fd = safe_close(fd); /* Let's close the original early */ + + return loop_device_make_internal(NULL, mfd, open_flags, 0, 0, sector_size, loop_flags, lock_op, ret); +} + +static LoopDevice* loop_device_free(LoopDevice *d) { + _cleanup_close_ int control = -EBADF; + int r; + + if (!d) + return NULL; + + /* Release any lock we might have on the device first. We want to open+lock the /dev/loop-control + * device below, but our lock protocol says that if both control and block device locks are taken, + * the control lock needs to be taken first, the block device lock second — in order to avoid ABBA + * locking issues. Moreover, we want to issue LOOP_CLR_FD on the block device further down, and that + * would fail if we had another fd open to the device. 
*/ + d->lock_fd = safe_close(d->lock_fd); + + /* Let's open the control device early, and lock it, so that we can release our block device and + * delete it in a synchronized fashion, and allocators won't needlessly see the block device as free + * while we are about to delete it. */ + if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) { + control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (control < 0) + log_debug_errno(errno, "Failed to open loop control device, cannot remove loop device '%s', ignoring: %m", strna(d->node)); + else if (flock(control, LOCK_EX) < 0) + log_debug_errno(errno, "Failed to lock loop control device, ignoring: %m"); + } + + /* Then let's release the loopback block device */ + if (d->fd >= 0) { + /* Implicitly sync the device, since otherwise in-flight blocks might not get written */ + if (fsync(d->fd) < 0) + log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m"); + + if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) { + /* We are supposed to clear the loopback device. Let's do this synchronously: lock + * the device, manually remove all partitions and then clear it. This should ensure + * udev doesn't concurrently access the devices, and we can be reasonably sure that + * once we are done here the device is cleared and all its partition children + * removed. Note that we lock our primary device fd here (and not a separate locking + * fd, as we do during allocation, since we want to keep the lock all the way through + * the LOOP_CLR_FD, but that call would fail if we had more than one fd open.) 
*/ + + if (flock(d->fd, LOCK_EX) < 0) + log_debug_errno(errno, "Failed to lock loop block device, ignoring: %m"); + + r = block_device_remove_all_partitions(d->dev, d->fd); + if (r < 0) + log_debug_errno(r, "Failed to remove partitions of loopback block device, ignoring: %m"); + + if (ioctl(d->fd, LOOP_CLR_FD) < 0) + log_debug_errno(errno, "Failed to clear loop device, ignoring: %m"); + } + + safe_close(d->fd); + } + + /* Now that the block device is released, let's also try to remove it */ + if (control >= 0) { + useconds_t delay = 5 * USEC_PER_MSEC; /* A total delay of 5090 ms between 39 attempts, + * (4*5 + 5*10 + 5*20 + … + 3*640) = 5090. */ + + for (unsigned attempt = 1;; attempt++) { + if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0) + break; + if (errno != EBUSY || attempt > 38) { + log_debug_errno(errno, "Failed to remove device %s: %m", strna(d->node)); + break; + } + if (attempt % 5 == 0) { + log_debug("Device is still busy after %u attempts…", attempt); + delay *= 2; + } + + (void) usleep_safe(delay); + } + } + + free(d->node); + sd_device_unref(d->dev); + free(d->backing_file); + return mfree(d); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(LoopDevice, loop_device, loop_device_free); + +void loop_device_relinquish(LoopDevice *d) { + assert(d); + + /* Don't attempt to clean up the loop device anymore from this point on. Leave the clean-ing up to the kernel + * itself, using the loop device "auto-clear" logic we already turned on when creating the device. 
*/ + + d->relinquished = true; +} + +void loop_device_unrelinquish(LoopDevice *d) { + assert(d); + d->relinquished = false; +} + +int loop_device_open( + sd_device *dev, + int open_flags, + int lock_op, + LoopDevice **ret) { + + _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF; + _cleanup_free_ char *node = NULL, *backing_file = NULL; + dev_t devnum, backing_devno = 0; + struct loop_info64 info; + ino_t backing_inode = 0; + uint64_t diskseq = 0; + LoopDevice *d; + const char *s; + int r, nr = -1; + + assert(dev); + assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); + assert(ret); + + /* Even if fd is provided through the argument in loop_device_open_from_fd(), we reopen the inode + * here, instead of keeping just a dup() clone of it around, since we want to ensure that the + * O_DIRECT flag of the handle we keep is off, we have our own file index, and have the right + * read/write mode in effect. */ + fd = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); + if (fd < 0) + return fd; + + if ((lock_op & ~LOCK_NB) != LOCK_UN) { + lock_fd = open_lock_fd(fd, lock_op); + if (lock_fd < 0) + return lock_fd; + } + + if (ioctl(fd, LOOP_GET_STATUS64, &info) >= 0) { +#if HAVE_VALGRIND_MEMCHECK_H + /* Valgrind currently doesn't know LOOP_GET_STATUS64. 
Remove this once it does */ + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + nr = info.lo_number; + + if (sd_device_get_sysattr_value(dev, "loop/backing_file", &s) >= 0) { + backing_file = strdup(s); + if (!backing_file) + return -ENOMEM; + } + + backing_devno = info.lo_device; + backing_inode = info.lo_inode; + } + + r = fd_get_diskseq(fd, &diskseq); + if (r < 0 && r != -EOPNOTSUPP) + return r; + + uint32_t sector_size; + r = blockdev_get_sector_size(fd, &sector_size); + if (r < 0) + return r; + + r = sd_device_get_devnum(dev, &devnum); + if (r < 0) + return r; + + r = sd_device_get_devname(dev, &s); + if (r < 0) + return r; + + node = strdup(s); + if (!node) + return -ENOMEM; + + d = new(LoopDevice, 1); + if (!d) + return -ENOMEM; + + *d = (LoopDevice) { + .n_ref = 1, + .fd = TAKE_FD(fd), + .lock_fd = TAKE_FD(lock_fd), + .nr = nr, + .node = TAKE_PTR(node), + .dev = sd_device_ref(dev), + .backing_file = TAKE_PTR(backing_file), + .backing_inode = backing_inode, + .backing_devno = backing_devno, + .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */ + .devno = devnum, + .diskseq = diskseq, + .uevent_seqnum_not_before = UINT64_MAX, + .timestamp_not_before = USEC_INFINITY, + .sector_size = sector_size, + }; + + *ret = d; + return 0; +} + +int loop_device_open_from_fd( + int fd, + int open_flags, + int lock_op, + LoopDevice **ret) { + + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + int r; + + r = block_device_new_from_fd(ASSERT_FD(fd), 0, &dev); + if (r < 0) + return r; + + return loop_device_open(dev, open_flags, lock_op, ret); +} + +int loop_device_open_from_path( + const char *path, + int open_flags, + int lock_op, + LoopDevice **ret) { + + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + int r; + + assert(path); + + r = block_device_new_from_path(path, 0, &dev); + if (r < 0) + return r; + + return loop_device_open(dev, open_flags, lock_op, ret); +} + +static int resize_partition(int partition_fd, uint64_t 
offset, uint64_t size) { + char sysfs[STRLEN("/sys/dev/block/:/partition") + 2*DECIMAL_STR_MAX(dev_t) + 1]; + _cleanup_free_ char *buffer = NULL; + uint64_t current_offset, current_size, partno; + _cleanup_close_ int whole_fd = -EBADF; + struct stat st; + dev_t devno; + int r; + + /* Resizes the partition the loopback device refers to (assuming it refers to one instead of an actual + * loopback device), and changes the offset, if needed. This is a fancy wrapper around + * BLKPG_RESIZE_PARTITION. */ + + if (fstat(ASSERT_FD(partition_fd), &st) < 0) + return -errno; + + assert(S_ISBLK(st.st_mode)); + + xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev)); + r = read_one_line_file(sysfs, &buffer); + if (r == -ENOENT) /* not a partition, cannot resize */ + return -ENOTTY; + if (r < 0) + return r; + r = safe_atou64(buffer, &partno); + if (r < 0) + return r; + + xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/start", DEVNUM_FORMAT_VAL(st.st_rdev)); + + buffer = mfree(buffer); + r = read_one_line_file(sysfs, &buffer); + if (r < 0) + return r; + r = safe_atou64(buffer, &current_offset); + if (r < 0) + return r; + if (current_offset > UINT64_MAX/512U) + return -EINVAL; + current_offset *= 512U; + + if (ioctl(partition_fd, BLKGETSIZE64, &current_size) < 0) + return -EINVAL; + + if (size == UINT64_MAX && offset == UINT64_MAX) + return 0; + if (current_size == size && current_offset == offset) + return 0; + + xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/../dev", DEVNUM_FORMAT_VAL(st.st_rdev)); + + buffer = mfree(buffer); + r = read_one_line_file(sysfs, &buffer); + if (r < 0) + return r; + r = parse_devnum(buffer, &devno); + if (r < 0) + return r; + + whole_fd = r = device_open_from_devnum(S_IFBLK, devno, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL); + if (r < 0) + return r; + + return block_device_resize_partition( + whole_fd, + partno, + offset == UINT64_MAX ? current_offset : offset, + size == UINT64_MAX ? 
current_size : size); +} + +int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) { + struct loop_info64 info; + + assert(d); + assert(d->fd >= 0); + + /* Changes the offset/start of the loop device relative to the beginning of the underlying file or + * block device. If this loop device actually refers to a partition and not a loopback device, we'll + * try to adjust the partition offsets instead. + * + * If either offset or size is UINT64_MAX we won't change that parameter. */ + + if (d->nr < 0) /* not a loopback device */ + return resize_partition(d->fd, offset, size); + + if (ioctl(d->fd, LOOP_GET_STATUS64, &info) < 0) + return -errno; + +#if HAVE_VALGRIND_MEMCHECK_H + /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */ + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + + if (size == UINT64_MAX && offset == UINT64_MAX) + return 0; + if (info.lo_sizelimit == size && info.lo_offset == offset) + return 0; + + if (size != UINT64_MAX) + info.lo_sizelimit = size; + if (offset != UINT64_MAX) + info.lo_offset = offset; + + return RET_NERRNO(ioctl(d->fd, LOOP_SET_STATUS64, &info)); +} + +int loop_device_flock(LoopDevice *d, int operation) { + assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX)); + assert(d); + + /* When unlocking just close the lock fd */ + if ((operation & ~LOCK_NB) == LOCK_UN) { + d->lock_fd = safe_close(d->lock_fd); + return 0; + } + + /* If we had no lock fd so far, create one and lock it right-away */ + if (d->lock_fd < 0) { + d->lock_fd = open_lock_fd(ASSERT_FD(d->fd), operation); + if (d->lock_fd < 0) + return d->lock_fd; + + return 0; + } + + /* Otherwise change the current lock mode on the existing fd */ + return RET_NERRNO(flock(d->lock_fd, operation)); +} + +int loop_device_sync(LoopDevice *d) { + assert(d); + + /* We also do this implicitly in loop_device_unref(). Doing this explicitly here has the benefit that + * we can check the return value though. 
*/ + + return RET_NERRNO(fsync(ASSERT_FD(d->fd))); +} + +int loop_device_set_autoclear(LoopDevice *d, bool autoclear) { + struct loop_info64 info; + + assert(d); + + if (ioctl(ASSERT_FD(d->fd), LOOP_GET_STATUS64, &info) < 0) + return -errno; + + if (autoclear == FLAGS_SET(info.lo_flags, LO_FLAGS_AUTOCLEAR)) + return 0; + + SET_FLAG(info.lo_flags, LO_FLAGS_AUTOCLEAR, autoclear); + + if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0) + return -errno; + + return 1; +} + +int loop_device_set_filename(LoopDevice *d, const char *name) { + struct loop_info64 info; + + assert(d); + + /* Sets the .lo_file_name of the loopback device. This is supposed to contain the path to the file + * backing the block device, but is actually just a free-form string you can pass to the kernel. Most + * tools that actually care for the backing file path use the sysfs attribute file loop/backing_file + * which is a kernel generated string, subject to file system namespaces and such. + * + * .lo_file_name is useful since userspace can select it freely when creating a loopback block + * device, and we can use it for /dev/disk/by-loop-ref/ symlinks, and similar, so that apps can + * recognize their own loopback files. 
*/ + + if (name && strlen(name) >= sizeof(info.lo_file_name)) + return -ENOBUFS; + + if (ioctl(ASSERT_FD(d->fd), LOOP_GET_STATUS64, &info) < 0) + return -errno; + + if (strneq((char*) info.lo_file_name, strempty(name), sizeof(info.lo_file_name))) + return 0; + + if (name) { + strncpy((char*) info.lo_file_name, name, sizeof(info.lo_file_name)-1); + info.lo_file_name[sizeof(info.lo_file_name)-1] = 0; + } else + memzero(info.lo_file_name, sizeof(info.lo_file_name)); + + if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0) + return -errno; + + return 1; +} diff --git a/src/shared/loop-util.h b/src/shared/loop-util.h new file mode 100644 index 0000000..d77c314 --- /dev/null +++ b/src/shared/loop-util.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-device.h" + +#include "macro.h" +#include "time-util.h" + +typedef struct LoopDevice LoopDevice; + +/* Some helpers for setting up loopback block devices */ + +struct LoopDevice { + unsigned n_ref; + int fd; + int lock_fd; + int nr; /* The loopback device index (i.e. 
4 for /dev/loop4); if this object encapsulates a non-loopback block device, set to -1 */ + dev_t devno; /* The loopback device's own dev_t */ + char *node; /* Device node path of the block device, e.g. /dev/loop4 */ + sd_device *dev; /* sd_device object for the block device; we hold a reference that is dropped when the object is freed */ + char *backing_file; /* Path of the backing file, or NULL if we don't know */ + bool relinquished; /* If true, the device is neither cleared nor removed when this object is freed */ + dev_t backing_devno; /* The backing file's dev_t */ + ino_t backing_inode; /* The backing file's ino_t */ + uint64_t diskseq; /* Block device sequence number, monotonically incremented by the kernel on create/attach, or 0 if we don't know */ + uint64_t uevent_seqnum_not_before; /* uevent seqnum right before we attached the loopback device, or UINT64_MAX if we don't know */ + usec_t timestamp_not_before; /* CLOCK_MONOTONIC timestamp taken immediately before attaching the loopback device, or USEC_INFINITY if we don't know */ + uint32_t sector_size; /* Sector size of the block device: the block size configured on attach, or the value queried from an already existing device */ +}; + +/* Returns true if LoopDevice object is not actually a loopback device but some other block device we just wrap */ +#define LOOP_DEVICE_IS_FOREIGN(d) ((d)->nr < 0) + +int loop_device_make(int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret); +int loop_device_make_by_path_at(int dir_fd, const char *path, int open_flags, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret); +static inline int loop_device_make_by_path(const char *path, int open_flags, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret) { + return loop_device_make_by_path_at(AT_FDCWD, path, open_flags, sector_size, loop_flags, lock_op, ret); +} +int loop_device_make_by_path_memory(const char *path, int open_flags, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret); +int loop_device_open(sd_device *dev, int open_flags, int lock_op, LoopDevice **ret); +int loop_device_open_from_fd(int fd, int open_flags, int lock_op, LoopDevice **ret); +int loop_device_open_from_path(const char *path, int open_flags, int lock_op, LoopDevice **ret); + +LoopDevice* loop_device_ref(LoopDevice *d); +LoopDevice* 
loop_device_unref(LoopDevice *d); +DEFINE_TRIVIAL_CLEANUP_FUNC(LoopDevice*, loop_device_unref); + +void loop_device_relinquish(LoopDevice *d); +void loop_device_unrelinquish(LoopDevice *d); + +int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size); + +int loop_device_flock(LoopDevice *d, int operation); +int loop_device_sync(LoopDevice *d); + +int loop_device_set_autoclear(LoopDevice *d, bool autoclear); +int loop_device_set_filename(LoopDevice *d, const char *name); diff --git a/src/shared/loopback-setup.c b/src/shared/loopback-setup.c new file mode 100644 index 0000000..a02baf8 --- /dev/null +++ b/src/shared/loopback-setup.c @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-netlink.h" + +#include "loopback-setup.h" +#include "missing_network.h" +#include "netlink-util.h" +#include "time-util.h" + +#define LOOPBACK_SETUP_TIMEOUT_USEC (5 * USEC_PER_SEC) + +struct state { + unsigned n_messages; + int rcode; + const char *error_message; + const char *success_message; + const char *eexist_message; +}; + +static int generic_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { + struct state *s = ASSERT_PTR(userdata); + int r; + + assert(s->n_messages > 0); + s->n_messages--; + + errno = 0; + + r = sd_netlink_message_get_errno(m); + if (r == -EEXIST && s->eexist_message) + log_debug_errno(r, "%s", s->eexist_message); + else if (r < 0) + log_debug_errno(r, "%s: %m", s->error_message); + else + log_debug("%s", s->success_message); + + s->rcode = r; + return 0; +} + +static int start_loopback(sd_netlink *rtnl, struct state *s) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(s); + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_SETLINK, LOOPBACK_IFINDEX); + if (r < 0) + return r; + + r = sd_rtnl_message_link_set_flags(req, IFF_UP, IFF_UP); + if (r < 0) + return r; + + r = sd_netlink_call_async(rtnl, NULL, req, 
generic_handler, NULL, s, LOOPBACK_SETUP_TIMEOUT_USEC, "systemd-start-loopback"); + if (r < 0) + return r; + + s->n_messages ++; + return 0; +} + +static int add_ipv4_address(sd_netlink *rtnl, struct state *s) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(s); + + r = sd_rtnl_message_new_addr(rtnl, &req, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_prefixlen(req, 8); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_flags(req, IFA_F_PERMANENT); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_scope(req, RT_SCOPE_HOST); + if (r < 0) + return r; + + r = sd_netlink_message_append_in_addr(req, IFA_LOCAL, &(struct in_addr) { .s_addr = htobe32(INADDR_LOOPBACK) } ); + if (r < 0) + return r; + + r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv4"); + if (r < 0) + return r; + + s->n_messages ++; + return 0; +} + +static int add_ipv6_address(sd_netlink *rtnl, struct state *s) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(rtnl); + assert(s); + + r = sd_rtnl_message_new_addr(rtnl, &req, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET6); + if (r < 0) + return r; + + r = sd_rtnl_message_addr_set_prefixlen(req, 128); + if (r < 0) + return r; + + uint32_t flags = IFA_F_PERMANENT|IFA_F_NOPREFIXROUTE; + r = sd_rtnl_message_addr_set_flags(req, flags & 0xffu); /* rtnetlink wants low 8 bit of flags via regular flags field… */ + if (r < 0) + return r; + if ((flags & ~0xffu) != 0) { + r = sd_netlink_message_append_u32(req, IFA_FLAGS, flags); /* …and the rest of the flags via IFA_FLAGS */ + if (r < 0) + return r; + } + + r = sd_rtnl_message_addr_set_scope(req, RT_SCOPE_HOST); + if (r < 0) + return r; + + r = sd_netlink_message_append_in6_addr(req, IFA_LOCAL, &in6addr_loopback); + if (r < 0) + return r; + + r = sd_netlink_call_async(rtnl, NULL, req, generic_handler, 
NULL, s, USEC_INFINITY, "systemd-loopback-ipv6"); + if (r < 0) + return r; + + s->n_messages ++; + return 0; +} + +static int check_loopback(sd_netlink *rtnl) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL; + unsigned flags; + int r; + + r = sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, LOOPBACK_IFINDEX); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, req, USEC_INFINITY, &reply); + if (r < 0) + return r; + + r = sd_rtnl_message_link_get_flags(reply, &flags); + if (r < 0) + return r; + + return flags & IFF_UP; +} + +int loopback_setup(void) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + struct state state_4 = { + .error_message = "Failed to add address 127.0.0.1 to loopback interface", + .success_message = "Successfully added address 127.0.0.1 to loopback interface", + .eexist_message = "127.0.0.1 has already been added to loopback interface", + }, state_6 = { + .error_message = "Failed to add address ::1 to loopback interface", + .success_message = "Successfully added address ::1 to loopback interface", + .eexist_message = "::1 has already been added to loopback interface", + }, state_up = { + .error_message = "Failed to bring loopback interface up", + .success_message = "Successfully brought loopback interface up", + }; + int r; + + /* Note, we generally assume callers ignore the return code here (except test cases), hence only log at LOG_WARNING level. */ + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_warning_errno(r, "Failed to open netlink, ignoring: %m"); + + /* Note that we add the IP addresses here explicitly even though the kernel does that too implicitly when + * setting up the loopback device.
The reason we do this here a second time (and possibly race against the + * kernel) is that we want to synchronously wait until the IP addresses are set up correctly, see + * + * https://github.com/systemd/systemd/issues/5641 */ + + r = add_ipv4_address(rtnl, &state_4); + if (r < 0) + return log_warning_errno(r, "Failed to enqueue IPv4 loopback address add request, ignoring: %m"); + + r = add_ipv6_address(rtnl, &state_6); + if (r < 0) + return log_warning_errno(r, "Failed to enqueue IPv6 loopback address add request, ignoring: %m"); + + r = start_loopback(rtnl, &state_up); + if (r < 0) + return log_warning_errno(r, "Failed to enqueue loopback interface start request, ignoring: %m"); + + while (state_4.n_messages + state_6.n_messages + state_up.n_messages > 0) { + r = sd_netlink_wait(rtnl, LOOPBACK_SETUP_TIMEOUT_USEC); + if (r < 0) + return log_warning_errno(r, "Failed to wait for netlink event, ignoring: %m"); + + r = sd_netlink_process(rtnl, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to process netlink event, ignoring: %m"); + } + + /* Note that we don't really care whether the addresses could be added or not */ + if (state_up.rcode != 0) { + + /* If we lack the permissions to configure the loopback device, but we find it to be already + * configured, let's exit cleanly, in order to support unprivileged containers.
*/ + if (ERRNO_IS_PRIVILEGE(state_up.rcode)) { + r = check_loopback(rtnl); + if (r < 0) + log_debug_errno(r, "Failed to check if loopback device might already be up, ignoring: %m"); + else if (r > 0) { + log_debug("Configuring loopback failed, but device is already up, suppressing failure."); + return 0; + } + } + + return log_warning_errno(state_up.rcode, "Failed to configure loopback network device, ignoring: %m"); + } + + return 0; +} diff --git a/src/shared/loopback-setup.h b/src/shared/loopback-setup.h new file mode 100644 index 0000000..a7ee2da --- /dev/null +++ b/src/shared/loopback-setup.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int loopback_setup(void); diff --git a/src/shared/lsm-util.c b/src/shared/lsm-util.c new file mode 100644 index 0000000..7b6d419 --- /dev/null +++ b/src/shared/lsm-util.c @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "extract-word.h" +#include "fileio.h" +#include "lsm-util.h" +#include "string-util.h" + +int lsm_supported(const char *name) { + _cleanup_free_ char *lsm_list = NULL; + int r; + + assert(name); + + r = read_one_line_file("/sys/kernel/security/lsm", &lsm_list); + if (r == -ENOENT) /* LSM support not available at all? 
*/ + return false; + if (r < 0) + return log_debug_errno(r, "Failed to read /sys/kernel/security/lsm: %m"); + + for (const char *p = lsm_list;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ",", 0); + if (r == 0) + return false; + if (r < 0) + return log_debug_errno(r, "Failed to parse /sys/kernel/security/lsm: %m"); + + if (streq(word, name)) + return true; + } +} diff --git a/src/shared/lsm-util.h b/src/shared/lsm-util.h new file mode 100644 index 0000000..c4d9027 --- /dev/null +++ b/src/shared/lsm-util.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int lsm_supported(const char *name); diff --git a/src/shared/machine-credential.c b/src/shared/machine-credential.c new file mode 100644 index 0000000..17f7afc --- /dev/null +++ b/src/shared/machine-credential.c @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "creds-util.h" +#include "escape.h" +#include "extract-word.h" +#include "fileio.h" +#include "macro.h" +#include "memory-util.h" +#include "machine-credential.h" +#include "path-util.h" +#include "string-util-fundamental.h" + +static void machine_credential_done(MachineCredential *cred) { + assert(cred); + + cred->id = mfree(cred->id); + cred->data = erase_and_free(cred->data); + cred->size = 0; +} + +void machine_credential_free_all(MachineCredential *creds, size_t n) { + assert(creds || n == 0); + + FOREACH_ARRAY(cred, creds, n) + machine_credential_done(cred); + + free(creds); +} + +int machine_credential_set(MachineCredential **credentials, size_t *n_credentials, const char *cred_string) { + _cleanup_free_ char *word = NULL, *data = NULL; + ssize_t l; + int r; + const char *p = ASSERT_PTR(cred_string); + + assert(credentials && n_credentials); + assert(*credentials || *n_credentials == 0); + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse 
--set-credential= parameter: %m"); + if (r == 0 || !p) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", cred_string); + + if (!credential_name_valid(word)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word); + + FOREACH_ARRAY(cred, *credentials, *n_credentials) + if (streq(cred->id, word)) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word); + + l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data); + if (l < 0) + return log_error_errno(l, "Failed to unescape credential data: %s", p); + + if (!GREEDY_REALLOC(*credentials, *n_credentials + 1)) + return log_oom(); + + (*credentials)[(*n_credentials)++] = (MachineCredential) { + .id = TAKE_PTR(word), + .data = TAKE_PTR(data), + .size = l, + }; + + return 0; +} + +int machine_credential_load(MachineCredential **credentials, size_t *n_credentials, const char *cred_path) { + ReadFullFileFlags flags = READ_FULL_FILE_SECURE; + _cleanup_(erase_and_freep) char *data = NULL; + _cleanup_free_ char *word = NULL, *j = NULL; + const char *p = ASSERT_PTR(cred_path); + size_t size; + int r; + + assert(credentials && n_credentials); + assert(*credentials || *n_credentials == 0); + + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse --load-credential= parameter: %m"); + if (r == 0 || !p) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", cred_path); + + if (!credential_name_valid(word)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word); + + FOREACH_ARRAY(cred, *credentials, *n_credentials) + if (streq(cred->id, word)) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word); + + if (is_path(p) && path_is_valid(p)) + flags |= READ_FULL_FILE_CONNECT_SOCKET; + else if (credential_name_valid(p)) { + const char *e; + + 
r = get_credentials_dir(&e); + if (r < 0) + return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word); + + j = path_join(e, p); + if (!j) + return log_oom(); + + p = j; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential source appears to be neither a valid path nor a credential name: %s", p); + + r = read_full_file_full(AT_FDCWD, p, UINT64_MAX, SIZE_MAX, + flags, + NULL, + &data, &size); + if (r < 0) + return log_error_errno(r, "Failed to read credential '%s': %m", p); + + if (!GREEDY_REALLOC(*credentials, *n_credentials + 1)) + return log_oom(); + + (*credentials)[(*n_credentials)++] = (MachineCredential) { + .id = TAKE_PTR(word), + .data = TAKE_PTR(data), + .size = size, + }; + + return 0; +} diff --git a/src/shared/machine-credential.h b/src/shared/machine-credential.h new file mode 100644 index 0000000..c9044a2 --- /dev/null +++ b/src/shared/machine-credential.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef struct MachineCredential { + char *id; + void *data; + size_t size; +} MachineCredential; + +void machine_credential_free_all(MachineCredential *creds, size_t n); +int machine_credential_set(MachineCredential **credentials, size_t *n_credentials, const char *cred_string); +int machine_credential_load(MachineCredential **credentials, size_t *n_credentials, const char *cred_path); diff --git a/src/shared/machine-id-setup.c b/src/shared/machine-id-setup.c new file mode 100644 index 0000000..3efba03 --- /dev/null +++ b/src/shared/machine-id-setup.c @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "chase.h" +#include "creds-util.h" +#include "fd-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "log.h" +#include "machine-id-setup.h" +#include "macro.h" +#include "mkdir.h" +#include "mount-util.h" +#include 
"mountpoint-util.h" +#include "namespace-util.h" +#include "path-util.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "sync-util.h" +#include "umask-util.h" +#include "virt.h" + +static int acquire_machine_id_from_credential(sd_id128_t *ret) { + _cleanup_free_ char *buf = NULL; + int r; + + r = read_credential_with_decryption("system.machine_id", (void**) &buf, /* ret_size= */ NULL); + if (r < 0) + return log_warning_errno(r, "Failed to read system.machine_id credential, ignoring: %m"); + if (r == 0) /* not found */ + return -ENXIO; + + r = sd_id128_from_string(buf, ret); + if (r < 0) + return log_warning_errno(r, "Failed to parse system.machine_id credential, ignoring: %m"); + + log_info("Initializing machine ID from credential."); + return 0; +} + +static int generate_machine_id(const char *root, sd_id128_t *ret) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(ret); + + /* First, try reading the D-Bus machine id, unless it is a symlink */ + fd = chase_and_open("/var/lib/dbus/machine-id", root, CHASE_PREFIX_ROOT | CHASE_NOFOLLOW, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL); + if (fd >= 0 && id128_read_fd(fd, ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, ret) >= 0) { + log_info("Initializing machine ID from D-Bus machine ID."); + return 0; + } + + if (isempty(root) && running_in_chroot() <= 0) { + /* Let's use a system credential for the machine ID if we can */ + r = acquire_machine_id_from_credential(ret); + if (r >= 0) + return r; + + /* If that didn't work, see if we are running in a container, + * and a machine ID was passed in via $container_uuid the way + * libvirt/LXC does it */ + + if (detect_container() > 0) { + _cleanup_free_ char *e = NULL; + + if (getenv_for_pid(1, "container_uuid", &e) > 0 && + sd_id128_from_string(e, ret) >= 0) { + log_info("Initializing machine ID from container UUID."); + return 0; + } + + } else if (IN_SET(detect_vm(), VIRTUALIZATION_KVM, VIRTUALIZATION_AMAZON, VIRTUALIZATION_QEMU, 
VIRTUALIZATION_XEN)) { + + /* If we are not running in a container, see if we are running in a VM that provides + * a system UUID via the SMBIOS/DMI interfaces. Such environments include QEMU/KVM + * with the -uuid on the qemu command line or the Amazon EC2 Nitro hypervisor. */ + + if (id128_get_product(ret) >= 0) { + log_info("Initializing machine ID from VM UUID."); + return 0; + } + } + } + + /* If that didn't work, generate a random machine id */ + r = sd_id128_randomize(ret); + if (r < 0) + return log_error_errno(r, "Failed to generate randomized machine ID: %m"); + + log_info("Initializing machine ID from random generator."); + return 0; +} + +int machine_id_setup(const char *root, bool force_transient, sd_id128_t machine_id, sd_id128_t *ret) { + const char *etc_machine_id, *run_machine_id; + _cleanup_close_ int fd = -EBADF; + bool writable; + int r; + + etc_machine_id = prefix_roota(root, "/etc/machine-id"); + + WITH_UMASK(0000) { + /* We create this 0444, to indicate that this isn't really + * something you should ever modify. Of course, since the file + * will be owned by root it doesn't matter much, but maybe + * people look. 
*/ + + (void) mkdir_parents(etc_machine_id, 0755); + fd = open(etc_machine_id, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY, 0444); + if (fd < 0) { + int old_errno = errno; + + fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + if (old_errno == EROFS && errno == ENOENT) + return log_error_errno(errno, + "System cannot boot: Missing /etc/machine-id and /etc is mounted read-only.\n" + "Booting up is supported only when:\n" + "1) /etc/machine-id exists and is populated.\n" + "2) /etc/machine-id exists and is empty.\n" + "3) /etc/machine-id is missing and /etc is writable.\n"); + else + return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id); + } + + writable = false; + } else + writable = true; + } + + /* If we got a valid machine ID argument, that's what counts; otherwise read the stored one or generate one */ + if (sd_id128_is_null(machine_id)) { + + /* Try to read any existing machine ID */ + if (id128_read_fd(fd, ID128_FORMAT_PLAIN, ret) >= 0) + return 0; + + /* Hmm, so, the id currently stored is not useful, then let's generate one */ + r = generate_machine_id(root, &machine_id); + if (r < 0) + return r; + } + + if (writable) { + if (lseek(fd, 0, SEEK_SET) < 0) + return log_error_errno(errno, "Failed to seek %s: %m", etc_machine_id); + + if (ftruncate(fd, 0) < 0) + return log_error_errno(errno, "Failed to truncate %s: %m", etc_machine_id); + + /* If the caller requested a transient machine-id, write the string "uninitialized\n" to + * disk and overmount it with a transient file. + * + * Otherwise write the machine-id directly to disk.
*/ + if (force_transient) { + r = loop_write(fd, "uninitialized\n", SIZE_MAX); + if (r < 0) + return log_error_errno(r, "Failed to write uninitialized %s: %m", etc_machine_id); + + r = fsync_full(fd); + if (r < 0) + return log_error_errno(r, "Failed to sync %s: %m", etc_machine_id); + } else { + r = id128_write_fd(fd, ID128_FORMAT_PLAIN | ID128_SYNC_ON_WRITE, machine_id); + if (r < 0) + return log_error_errno(r, "Failed to write %s: %m", etc_machine_id); + else + goto finish; + } + } + + fd = safe_close(fd); + + /* Hmm, we couldn't or shouldn't write the machine-id to /etc? + * So let's write it to /run/machine-id as a replacement */ + + run_machine_id = prefix_roota(root, "/run/machine-id"); + + WITH_UMASK(0022) + r = id128_write(run_machine_id, ID128_FORMAT_PLAIN, machine_id); + if (r < 0) { + (void) unlink(run_machine_id); + return log_error_errno(r, "Cannot write %s: %m", run_machine_id); + } + + /* And now, let's mount it over */ + r = mount_follow_verbose(LOG_ERR, run_machine_id, etc_machine_id, NULL, MS_BIND, NULL); + if (r < 0) { + (void) unlink(run_machine_id); + return r; + } + + log_full(force_transient ? LOG_DEBUG : LOG_INFO, "Installed transient %s file.", etc_machine_id); + + /* Mark the mount read-only */ + r = mount_follow_verbose(LOG_WARNING, NULL, etc_machine_id, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL); + if (r < 0) + return r; + +finish: + if (ret) + *ret = machine_id; + + return 0; +} + +int machine_id_commit(const char *root) { + _cleanup_close_ int fd = -EBADF, initial_mntns_fd = -EBADF; + const char *etc_machine_id; + sd_id128_t id; + int r; + + /* Before doing anything, sync everything to ensure any changes by first-boot units are persisted. + * + * First, explicitly sync the file systems we care about and check if it worked. 
*/ + FOREACH_STRING(sync_path, "/etc/", "/var/") { + r = syncfs_path(AT_FDCWD, sync_path); + if (r < 0) + return log_error_errno(r, "Cannot sync %s: %m", sync_path); + } + + /* Afterwards, sync() the rest too, but we can't check the return value for these. */ + sync(); + + /* Replaces a tmpfs bind mount of /etc/machine-id by a proper file, atomically. For this, the umount is removed + * in a mount namespace, a new file is created at the right place. Afterwards the mount is also removed in the + * original mount namespace, thus revealing the file that was just created. */ + + etc_machine_id = prefix_roota(root, "/etc/machine-id"); + + r = path_is_mount_point(etc_machine_id, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", etc_machine_id); + if (r == 0) { + log_debug("%s is not a mount point. Nothing to do.", etc_machine_id); + return 0; + } + + /* Read existing machine-id */ + fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id); + + r = fd_is_temporary_fs(fd); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is on a temporary file system: %m", etc_machine_id); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EROFS), + "%s is not on a temporary file system.", + etc_machine_id); + + r = id128_read_fd(fd, ID128_FORMAT_PLAIN, &id); + if (r < 0) + return log_error_errno(r, "We didn't find a valid machine ID in %s: %m", etc_machine_id); + + fd = safe_close(fd); + + /* Store current mount namespace */ + r = namespace_open(0, NULL, &initial_mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Can't fetch current mount namespace: %m"); + + /* Switch to a new mount namespace, isolate ourself and unmount etc_machine_id in our new namespace */ + r = detach_mount_namespace(); + if (r < 0) + return log_error_errno(r, "Failed to set up new mount namespace: %m"); + + r = 
umount_verbose(LOG_ERR, etc_machine_id, 0); + if (r < 0) + return r; + + /* Update a persistent version of etc_machine_id */ + r = id128_write(etc_machine_id, ID128_FORMAT_PLAIN | ID128_SYNC_ON_WRITE, id); + if (r < 0) + return log_error_errno(r, "Cannot write %s. This is mandatory to get a persistent machine ID: %m", etc_machine_id); + + /* Return to initial namespace and proceed a lazy tmpfs unmount */ + r = namespace_enter(-1, initial_mntns_fd, -1, -1, -1); + if (r < 0) + return log_warning_errno(r, "Failed to switch back to initial mount namespace: %m.\nWe'll keep transient %s file until next reboot.", etc_machine_id); + + if (umount2(etc_machine_id, MNT_DETACH) < 0) + return log_warning_errno(errno, "Failed to unmount transient %s file: %m.\nWe keep that mount until next reboot.", etc_machine_id); + + return 0; +} diff --git a/src/shared/machine-id-setup.h b/src/shared/machine-id-setup.h new file mode 100644 index 0000000..cce5819 --- /dev/null +++ b/src/shared/machine-id-setup.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int machine_id_commit(const char *root); +int machine_id_setup(const char *root, bool force_transient, sd_id128_t requested, sd_id128_t *ret); diff --git a/src/shared/machine-pool.c b/src/shared/machine-pool.c new file mode 100644 index 0000000..b372de4 --- /dev/null +++ b/src/shared/machine-pool.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "btrfs-util.h" +#include "label-util.h" +#include "machine-pool.h" +#include "missing_magic.h" +#include "stat-util.h" + +static int check_btrfs(void) { + struct statfs sfs; + + if (statfs("/var/lib/machines", &sfs) < 0) { + if (errno != ENOENT) + return -errno; + + if (statfs("/var/lib", &sfs) < 0) + return -errno; + } + + return F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC); +} + +int setup_machine_directory(sd_bus_error *error, bool use_btrfs_subvol, bool use_btrfs_quota) { + int r; + + r = check_btrfs(); + 
if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether /var/lib/machines is located on btrfs: %m"); + if (r == 0) + return 0; + + if (!use_btrfs_subvol) + return 0; + + (void) btrfs_subvol_make_label("/var/lib/machines"); + + if (!use_btrfs_quota) + return 0; + + r = btrfs_quota_enable("/var/lib/machines", true); + if (r < 0) + log_warning_errno(r, "Failed to enable quota for /var/lib/machines, ignoring: %m"); + + r = btrfs_subvol_auto_qgroup("/var/lib/machines", 0, true); + if (r < 0) + log_warning_errno(r, "Failed to set up default quota hierarchy for /var/lib/machines, ignoring: %m"); + + return 0; +} diff --git a/src/shared/machine-pool.h b/src/shared/machine-pool.h new file mode 100644 index 0000000..c57e478 --- /dev/null +++ b/src/shared/machine-pool.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +int setup_machine_directory(sd_bus_error *error, bool use_btrfs_subvol, bool use_btrfs_quota); diff --git a/src/shared/macvlan-util.c b/src/shared/macvlan-util.c new file mode 100644 index 0000000..11dffe9 --- /dev/null +++ b/src/shared/macvlan-util.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "macvlan-util.h" +#include "string-table.h" + +static const char* const macvlan_mode_table[_NETDEV_MACVLAN_MODE_MAX] = { + [NETDEV_MACVLAN_MODE_PRIVATE] = "private", + [NETDEV_MACVLAN_MODE_VEPA] = "vepa", + [NETDEV_MACVLAN_MODE_BRIDGE] = "bridge", + [NETDEV_MACVLAN_MODE_PASSTHRU] = "passthru", + [NETDEV_MACVLAN_MODE_SOURCE] = "source", +}; + +DEFINE_STRING_TABLE_LOOKUP(macvlan_mode, MacVlanMode); diff --git a/src/shared/macvlan-util.h b/src/shared/macvlan-util.h new file mode 100644 index 0000000..0705ecb --- /dev/null +++ b/src/shared/macvlan-util.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef enum MacVlanMode { + NETDEV_MACVLAN_MODE_PRIVATE = 
MACVLAN_MODE_PRIVATE, + NETDEV_MACVLAN_MODE_VEPA = MACVLAN_MODE_VEPA, + NETDEV_MACVLAN_MODE_BRIDGE = MACVLAN_MODE_BRIDGE, + NETDEV_MACVLAN_MODE_PASSTHRU = MACVLAN_MODE_PASSTHRU, + NETDEV_MACVLAN_MODE_SOURCE = MACVLAN_MODE_SOURCE, + _NETDEV_MACVLAN_MODE_MAX, + _NETDEV_MACVLAN_MODE_INVALID = -EINVAL, +} MacVlanMode; + +const char *macvlan_mode_to_string(MacVlanMode d) _const_; +MacVlanMode macvlan_mode_from_string(const char *d) _pure_; diff --git a/src/shared/main-func.h b/src/shared/main-func.h new file mode 100644 index 0000000..3f6b6a8 --- /dev/null +++ b/src/shared/main-func.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-daemon.h" + +#include "argv-util.h" +#include "pager.h" +#include "selinux-util.h" +#include "spawn-ask-password-agent.h" +#include "spawn-polkit-agent.h" +#include "static-destruct.h" + +#define _DEFINE_MAIN_FUNCTION(intro, impl, ret) \ + int main(int argc, char *argv[]) { \ + int r; \ + assert_se(argc > 0 && !isempty(argv[0])); \ + save_argc_argv(argc, argv); \ + intro; \ + r = impl; \ + if (r < 0) \ + (void) sd_notifyf(0, "ERRNO=%i", -r); \ + (void) sd_notifyf(0, "EXIT_STATUS=%i", ret); \ + ask_password_agent_close(); \ + polkit_agent_close(); \ + pager_close(); \ + mac_selinux_finish(); \ + static_destruct(); \ + return ret; \ + } + +/* Negative return values from impl are mapped to EXIT_FAILURE, and + * everything else means success! */ +#define DEFINE_MAIN_FUNCTION(impl) \ + _DEFINE_MAIN_FUNCTION(,impl(argc, argv), r < 0 ? EXIT_FAILURE : EXIT_SUCCESS) + +/* Zero is mapped to EXIT_SUCCESS, negative values are mapped to EXIT_FAILURE, + * and positive values are propagated. + * Note: "true" means failure! */ +#define DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(impl) \ + _DEFINE_MAIN_FUNCTION(,impl(argc, argv), r < 0 ? 
EXIT_FAILURE : r) diff --git a/src/shared/meson.build b/src/shared/meson.build new file mode 100644 index 0000000..b24a541 --- /dev/null +++ b/src/shared/meson.build @@ -0,0 +1,375 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +shared_sources = files( + 'acl-util.c', + 'acpi-fpdt.c', + 'apparmor-util.c', + 'ask-password-api.c', + 'async.c', + 'barrier.c', + 'base-filesystem.c', + 'battery-util.c', + 'binfmt-util.c', + 'bitmap.c', + 'blockdev-util.c', + 'bond-util.c', + 'boot-entry.c', + 'boot-timestamps.c', + 'bootspec.c', + 'bpf-dlopen.c', + 'bpf-program.c', + 'bridge-util.c', + 'btrfs-util.c', + 'bus-get-properties.c', + 'bus-locator.c', + 'bus-log-control-api.c', + 'bus-map-properties.c', + 'bus-message-util.c', + 'bus-object.c', + 'bus-polkit.c', + 'bus-print-properties.c', + 'bus-unit-procs.c', + 'bus-unit-util.c', + 'bus-util.c', + 'bus-wait-for-jobs.c', + 'bus-wait-for-units.c', + 'calendarspec.c', + 'cgroup-setup.c', + 'cgroup-show.c', + 'chown-recursive.c', + 'clean-ipc.c', + 'clock-util.c', + 'common-signal.c', + 'compare-operator.c', + 'condition.c', + 'conf-parser.c', + 'copy.c', + 'coredump-util.c', + 'cpu-set-util.c', + 'creds-util.c', + 'cryptsetup-util.c', + 'daemon-util.c', + 'data-fd-util.c', + 'dev-setup.c', + 'device-nodes.c', + 'discover-image.c', + 'dissect-image.c', + 'dlfcn-util.c', + 'dm-util.c', + 'dns-domain.c', + 'dropin.c', + 'edit-util.c', + 'efi-api.c', + 'efi-loader.c', + 'elf-util.c', + 'enable-mempool.c', + 'env-file-label.c', + 'ethtool-util.c', + 'exec-util.c', + 'exit-status.c', + 'extension-util.c', + 'fdset.c', + 'fileio-label.c', + 'find-esp.c', + 'firewall-util-nft.c', + 'firewall-util.c', + 'format-table.c', + 'fstab-util.c', + 'generator.c', + 'geneve-util.c', + 'gpt.c', + 'group-record.c', + 'hibernate-util.c', + 'hostname-setup.c', + 'hwdb-util.c', + 'id128-print.c', + 'idn-util.c', + 'ima-util.c', + 'image-policy.c', + 'import-util.c', + 'in-addr-prefix-util.c', + 'install-file.c', + 'install-printf.c', + 
'install.c', + 'ip-protocol-list.c', + 'ipvlan-util.c', + 'journal-file-util.c', + 'journal-importer.c', + 'journal-util.c', + 'json.c', + 'kbd-util.c', + 'kernel-image.c', + 'keyring-util.c', + 'killall.c', + 'label-util.c', + 'libcrypt-util.c', + 'libfido2-util.c', + 'libmount-util.c', + 'local-addresses.c', + 'locale-setup.c', + 'logs-show.c', + 'loop-util.c', + 'loopback-setup.c', + 'lsm-util.c', + 'machine-credential.c', + 'machine-id-setup.c', + 'machine-pool.c', + 'macvlan-util.c', + 'mkdir-label.c', + 'mkfs-util.c', + 'mount-setup.c', + 'mount-util.c', + 'net-condition.c', + 'netif-naming-scheme.c', + 'netif-sriov.c', + 'netif-util.c', + 'nsflags.c', + 'numa-util.c', + 'open-file.c', + 'openssl-util.c', + 'output-mode.c', + 'pager.c', + 'parse-argument.c', + 'parse-helpers.c', + 'password-quality-util-passwdqc.c', + 'password-quality-util-pwquality.c', + 'pcre2-util.c', + 'pcrextend-util.c', + 'pe-binary.c', + 'pkcs11-util.c', + 'plymouth-util.c', + 'pretty-print.c', + 'ptyfwd.c', + 'qrcode-util.c', + 'quota-util.c', + 'reboot-util.c', + 'recovery-key.c', + 'resize-fs.c', + 'resolve-util.c', + 'rm-rf.c', + 'securebits-util.c', + 'selinux-util.c', + 'serialize.c', + 'service-util.c', + 'sleep-config.c', + 'smack-util.c', + 'socket-label.c', + 'socket-netlink.c', + 'spawn-ask-password-agent.c', + 'spawn-polkit-agent.c', + 'specifier.c', + 'switch-root.c', + 'tmpfile-util-label.c', + 'tomoyo-util.c', + 'tpm2-util.c', + 'tpm2-event-log.c', + 'udev-util.c', + 'user-record-nss.c', + 'user-record-show.c', + 'user-record.c', + 'userdb-dropin.c', + 'userdb.c', + 'varlink.c', + 'varlink-idl.c', + 'varlink-io.systemd.c', + 'varlink-io.systemd.Journal.c', + 'varlink-io.systemd.ManagedOOM.c', + 'varlink-io.systemd.PCRExtend.c', + 'varlink-io.systemd.Resolve.Monitor.c', + 'varlink-io.systemd.Resolve.c', + 'varlink-io.systemd.UserDatabase.c', + 'varlink-io.systemd.oom.c', + 'varlink-io.systemd.service.c', + 'varlink-io.systemd.sysext.c', + 'varlink-org.varlink.service.c', 
+ 'verb-log-control.c', + 'verbs.c', + 'vlan-util.c', + 'volatile-util.c', + 'wall.c', + 'watchdog.c', + 'web-util.c', + 'wifi-util.c', + 'xml.c', +) + +if get_option('tests') != 'false' + shared_sources += files( + 'tests.c', + ) +endif + +generate_syscall_list = find_program('generate-syscall-list.py') +fname = 'syscall-list.h' +syscall_list_h = custom_target( + fname, + input : syscall_list_txt, + output : fname, + command : [generate_syscall_list, + '@INPUT@'], + capture : true) + +if conf.get('HAVE_ACL') == 1 + shared_sources += files( + 'devnode-acl.c', + ) +endif + +if conf.get('ENABLE_UTMP') == 1 + shared_sources += files('utmp-wtmp.c') +endif + +if conf.get('HAVE_SECCOMP') == 1 + shared_sources += files('seccomp-util.c') + shared_sources += syscall_list_h +endif + +if conf.get('HAVE_LIBIPTC') == 1 + shared_sources += files('firewall-util-iptables.c') +endif + +if conf.get('HAVE_LIBBPF') == 1 + shared_sources += files( + 'bpf-link.c', + ) +endif + +if conf.get('HAVE_KMOD') == 1 + shared_sources += files('module-util.c') +endif + +if conf.get('HAVE_PAM') == 1 + shared_sources += files( + 'pam-util.c', + ) +endif + +if conf.get('ENABLE_NSCD') == 1 + shared_sources += files('nscd-flush.c') +endif + +if conf.get('HAVE_LIBFIDO2') == 1 and conf.get('HAVE_LIBCRYPTSETUP') == 1 + shared_sources += files('cryptsetup-fido2.c') +endif + +generate_ip_protocol_list = find_program('generate-ip-protocol-list.sh') +ip_protocol_list_txt = custom_target( + 'ip-protocol-list.txt', + output : 'ip-protocol-list.txt', + command : [generate_ip_protocol_list, cpp], + capture : true) + +fname = 'ip-protocol-from-name.gperf' +gperf_file = custom_target( + fname, + input : ip_protocol_list_txt, + output : fname, + command : [generate_gperfs, 'ip_protocol', 'IPPROTO_', '@INPUT@'], + capture : true) + +fname = 'ip-protocol-from-name.h' +target1 = custom_target( + fname, + input : gperf_file, + output : fname, + command : [gperf, + '-L', 'ANSI-C', '-t', '--ignore-case', + '-N', 
'lookup_ip_protocol', + '-H', 'hash_ip_protocol_name', + '-p', '-C', + '@INPUT@'], + capture : true) + +fname = 'ip-protocol-to-name.h' +awkscript = 'ip-protocol-to-name.awk' +target2 = custom_target( + fname, + input : [awkscript, ip_protocol_list_txt], + output : fname, + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true) + +shared_generated_gperf_headers = [target1, target2] +shared_sources += shared_generated_gperf_headers + +fname = 'ethtool-link-mode.h' +ethtool_link_mode_h = custom_target( + fname, + input : ['ethtool-link-mode.py', 'linux/ethtool.h'], + output : fname, + command : [python, '@INPUT0@', '--header', cpp, '@INPUT1@'], + capture : true) +shared_sources += ethtool_link_mode_h + +fname = 'ethtool-link-mode.xml' +ethtool_link_mode_xml = custom_target( + fname, + input : ['ethtool-link-mode.py', 'linux/ethtool.h'], + output : fname, + command : [python, '@INPUT0@', '--xml', cpp, '@INPUT1@'], + capture : true) +man_page_depends += ethtool_link_mode_xml + +libshared_name = 'systemd-shared-@0@'.format(shared_lib_tag) + +libshared_deps = [threads, + libacl, + libblkid, + libcap, + libcrypt, + libdl, + libgcrypt, + libiptc_cflags, + libkmod, + liblz4, + libmount, + libopenssl, + libp11kit_cflags, + libpam, + librt, + libseccomp, + libselinux, + libxenctrl_cflags, + libxz, + libzstd] + +libshared_sym_path = meson.current_source_dir() / 'libshared.sym' +libshared_build_dir = meson.current_build_dir() + +libshared_static = static_library( + libshared_name, + shared_sources, + include_directories : includes, + dependencies : [libshared_deps, + userspace], + c_args : ['-fvisibility=default'], + build_by_default : false) + +libshared = shared_library( + libshared_name, + include_directories : includes, + c_args : ['-fvisibility=default'], + link_args : ['-shared', + '-Wl,--version-script=' + libshared_sym_path], + link_depends : libshared_sym_path, + link_whole : [libshared_static, + libbasic, + libbasic_gcrypt, + libsystemd_static], + 
dependencies : [libshared_deps,
+                        userspace],
+        install : true,
+        install_dir : pkglibdir)
+
+shared_fdisk_sources = files(
+        'fdisk-util.c',
+)
+
+libshared_fdisk = static_library(
+        'shared-fdisk',
+        shared_fdisk_sources,
+        include_directories : includes,
+        dependencies : [libfdisk,
+                        userspace],
+        c_args : ['-fvisibility=default'],
+        build_by_default : false)
diff --git a/src/shared/mkdir-label.c b/src/shared/mkdir-label.c
new file mode 100644
index 0000000..e3afc2b
--- /dev/null
+++ b/src/shared/mkdir-label.c
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+
+#include "errno-util.h"
+#include "mkdir-label.h"
+#include "selinux-util.h"
+#include "smack-util.h"
+#include "user-util.h"
+
+int mkdirat_label(int dirfd, const char *path, mode_t mode) { /* mkdirat() wrapped in SELinux/SMACK labelling calls */
+        int r;
+
+        assert(path);
+
+        r = mac_selinux_create_file_prepare_at(dirfd, path, S_IFDIR);
+        if (r < 0)
+                return r;
+
+        r = RET_NERRNO(mkdirat(dirfd, path, mode));
+        mac_selinux_create_file_clear(); /* called before r is checked, i.e. also when mkdirat() failed */
+        if (r < 0)
+                return r;
+
+        return mac_smack_fix_full(dirfd, path, NULL, 0);
+}
+
+int mkdirat_safe_label(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) {
+        return mkdirat_safe_internal(dir_fd, path, mode, uid, gid, flags, mkdirat_label);
+}
+
+int mkdirat_parents_label(int dir_fd, const char *path, mode_t mode) {
+        return mkdirat_parents_internal(dir_fd, path, mode, UID_INVALID, GID_INVALID, 0, mkdirat_label);
+}
+
+int mkdir_parents_safe_label(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) {
+        return mkdir_parents_internal(prefix, path, mode, uid, gid, flags, mkdirat_label);
+}
+
+int mkdir_p_label(const char *path, mode_t mode) {
+        return mkdir_p_internal(NULL, path, mode, UID_INVALID, GID_INVALID, 0, mkdirat_label);
+}
diff --git a/src/shared/mkdir-label.h b/src/shared/mkdir-label.h
new file mode 100644
index 0000000..a9a8ce3
--- /dev/null
+++ b/src/shared/mkdir-label.h
@@ -0,0 +1,26 @@
+/*
SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include
+#include
+
+#include "mkdir.h"
+
+int mkdirat_label(int dirfd, const char *path, mode_t mode);
+
+static inline int mkdir_label(const char *path, mode_t mode) {
+        return mkdirat_label(AT_FDCWD, path, mode);
+}
+
+int mkdirat_safe_label(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags);
+static inline int mkdir_safe_label(const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) {
+        return mkdirat_safe_label(AT_FDCWD, path, mode, uid, gid, flags);
+}
+int mkdirat_parents_label(int dir_fd, const char *path, mode_t mode);
+static inline int mkdir_parents_label(const char *path, mode_t mode) {
+        return mkdirat_parents_label(AT_FDCWD, path, mode);
+}
+
+int mkdir_parents_safe_label(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags);
+
+int mkdir_p_label(const char *path, mode_t mode);
diff --git a/src/shared/mkfs-util.c b/src/shared/mkfs-util.c
new file mode 100644
index 0000000..4e58b6e
--- /dev/null
+++ b/src/shared/mkfs-util.c
@@ -0,0 +1,684 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+#include
+
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "id128-util.h"
+#include "mkfs-util.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+#include "utf8.h"
+
+int mkfs_exists(const char *fstype) {
+        const char *mkfs;
+        int r;
+
+        assert(fstype);
+
+        if (STR_IN_SET(fstype, "auto", "swap")) /* these aren't real file system types, refuse early */
+                return -EINVAL;
+
+        mkfs = strjoina("mkfs.", fstype);
+        if (!filename_is_valid(mkfs)) /* refuse file system types with slashes and similar */
+                return -EINVAL;
+
+        r = find_executable(mkfs, NULL);
+        if (r == -ENOENT)
+                return false;
+        if (r < 0)
+                return r;
+
+        return true;
+}
+
+int mkfs_supports_root_option(const char *fstype) {
+        return fstype_is_ro(fstype) || STR_IN_SET(fstype, "ext2", "ext3", "ext4", "btrfs", "vfat", "xfs");
+}
+
+static int mangle_linux_fs_label(const char *s, size_t max_len, char **ret) {
+        /* Truncate to at most max_len bytes (12, 15 or 16), cutting only at UTF-8 character boundaries */
+
+        assert(s);
+        assert(max_len > 0);
+        assert(ret);
+
+        const char *q;
+        char *ans;
+
+        for (q = s; *q;) {
+                int l;
+
+                l = utf8_encoded_valid_unichar(q, SIZE_MAX);
+                if (l < 0)
+                        return l;
+
+                if ((size_t) (q - s + l) > max_len)
+                        break;
+                q += l;
+        }
+
+        ans = memdup_suffix0(s, q - s);
+        if (!ans)
+                return -ENOMEM;
+
+        *ret = ans;
+        return 0;
+}
+
+static int mangle_fat_label(const char *s, char **ret) {
+        assert(s);
+
+        _cleanup_free_ char *q = NULL;
+        int r;
+
+        r = utf8_to_ascii(s, '_', &q);
+        if (r < 0)
+                return r;
+
+        /* Classic FAT only allows 11 character uppercase labels */
+        strshorten(q, 11);
+        ascii_strupper(q);
+
+        /* mkfs.vfat: Labels with characters *?.,;:/\|+=<>[]" are not allowed.
+         * Let's also replace any control chars. */
+        for (char *p = q; *p; p++)
+                if (strchr("*?.,;:/\\|+=<>[]\"", *p) || char_is_cc(*p))
+                        *p = '_';
+
+        *ret = TAKE_PTR(q);
+        return 0;
+}
+
+static int do_mcopy(const char *node, const char *root) {
+        _cleanup_free_ char *mcopy = NULL;
+        _cleanup_strv_free_ char **argv = NULL;
+        _cleanup_close_ int rfd = -EBADF;
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        int r;
+
+        assert(node);
+        assert(root);
+
+        /* Return early if there's nothing to copy. On error fall through, so that open() below reports it. */
+        if (dir_is_empty(root, /*ignore_hidden_or_backup=*/ false) > 0)
+                return 0;
+
+        r = find_executable("mcopy", &mcopy);
+        if (r == -ENOENT)
+                return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "Could not find mcopy binary.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether mcopy binary exists: %m");
+
+        argv = strv_new(mcopy, "-s", "-p", "-Q", "-m", "-i", node);
+        if (!argv)
+                return log_oom();
+
+        /* mcopy copies the top level directory instead of everything in it so we have to pass all
+         * the subdirectories to mcopy instead to end up with the correct directory structure. */
+
+        rfd = open(root, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        if (rfd < 0)
+                return log_error_errno(errno, "Failed to open directory '%s': %m", root);
+
+        r = readdir_all(rfd, RECURSE_DIR_SORT|RECURSE_DIR_ENSURE_TYPE, &de);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read '%s' contents: %m", root);
+
+        for (size_t i = 0; i < de->n_entries; i++) {
+                _cleanup_free_ char *p = NULL;
+
+                p = path_join(root, de->entries[i]->d_name);
+                if (!p)
+                        return log_oom();
+
+                if (!IN_SET(de->entries[i]->d_type, DT_REG, DT_DIR)) {
+                        log_debug("%s is not a file/directory which are the only file types supported by vfat, ignoring", p);
+                        continue;
+                }
+
+                if (strv_consume(&argv, TAKE_PTR(p)) < 0)
+                        return log_oom();
+        }
+
+        if (strv_extend(&argv, "::") < 0)
+                return log_oom();
+
+        r = safe_fork("(mcopy)", FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_STDOUT_TO_STDERR|FORK_CLOSE_ALL_FDS, NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Avoid failures caused by mismatch in expectations between mkfs.vfat and mcopy by disabling
+                 * the stricter mcopy checks using MTOOLS_SKIP_CHECK.
*/
+                execve(mcopy, argv, STRV_MAKE("MTOOLS_SKIP_CHECK=1", "TZ=UTC", strv_find_prefix(environ, "SOURCE_DATE_EPOCH=")));
+
+                log_error_errno(errno, "Failed to execute mcopy: %m");
+
+                _exit(EXIT_FAILURE);
+        }
+
+        return 0;
+}
+
+typedef struct ProtofileData {
+        FILE *file;
+        bool has_filename_with_spaces;
+        const char *tmpdir;
+} ProtofileData;
+
+static int protofile_print_item(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        ProtofileData *data = ASSERT_PTR(userdata);
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        if (event == RECURSE_DIR_LEAVE) {
+                fputs("$\n", data->file);
+                return 0;
+        }
+
+        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
+                return RECURSE_DIR_CONTINUE;
+
+        char type = S_ISDIR(sx->stx_mode)  ? 'd' :
+                    S_ISREG(sx->stx_mode)  ? '-' :
+                    S_ISLNK(sx->stx_mode)  ? 'l' :
+                    S_ISFIFO(sx->stx_mode) ? 'p' :
+                    S_ISBLK(sx->stx_mode)  ? 'b' :
+                    S_ISCHR(sx->stx_mode)  ? 'c' : 0;
+        if (type == 0)
+                return RECURSE_DIR_CONTINUE;
+
+        /* The protofile format does not support spaces in filenames as whitespace is used as a token
+         * delimiter. To work around this limitation, mkfs.xfs allows escaping whitespace by using the /
+         * character (which isn't allowed in filenames and as such can be used to escape whitespace). See
+         * https://lore.kernel.org/linux-xfs/20230222090303.h6tujm7y32gjhgal@andromeda/T/#m8066b3e7d62a080ee7434faac4861d944e64493b
+         * for more information. */
+
+        if (strchr(de->d_name, ' ')) {
+                copy = strdup(de->d_name);
+                if (!copy)
+                        return log_oom();
+
+                string_replace_char(copy, ' ', '/');
+                data->has_filename_with_spaces = true;
+        }
+
+        fprintf(data->file, "%s %c%c%c%03o "UID_FMT" "GID_FMT" ",
+                copy ?: de->d_name,
+                type,
+                sx->stx_mode & S_ISUID ? 'u' : '-',
+                sx->stx_mode & S_ISGID ? 'g' : '-',
+                (unsigned) (sx->stx_mode & 0777),
+                sx->stx_uid, sx->stx_gid);
+
+        if (S_ISREG(sx->stx_mode)) {
+                _cleanup_free_ char *p = NULL;
+
+                /* While we can escape whitespace in the filename, we cannot escape whitespace in the source
+                 * path, so hack around that by creating a symlink to the path in a temporary directory and
+                 * using the symlink as the source path instead. */
+
+                if (strchr(path, ' ')) {
+                        r = tempfn_random_child(data->tmpdir, "mkfs-xfs", &p);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to generate random child name in %s: %m", data->tmpdir);
+
+                        if (symlink(path, p) < 0)
+                                return log_error_errno(errno, "Failed to symlink %s to %s: %m", p, path);
+                }
+
+                fputs(p ?: path, data->file);
+        } else if (S_ISLNK(sx->stx_mode)) {
+                _cleanup_free_ char *p = NULL;
+
+                r = readlinkat_malloc(dir_fd, de->d_name, &p);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read symlink %s: %m", path);
+
+                /* If we have a symlink to a path with whitespace in it, we're out of luck, as there's no way
+                 * to encode that in the mkfs.xfs protofile format. r is >= 0 here, hence use a synthetic errno. */
+
+                if (strchr(p, ' '))
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Symlinks to paths containing whitespace are not supported by mkfs.xfs.");
+
+                fputs(p, data->file);
+        } else if (S_ISBLK(sx->stx_mode) || S_ISCHR(sx->stx_mode))
+                fprintf(data->file, "%" PRIu32 " %" PRIu32, sx->stx_rdev_major, sx->stx_rdev_minor);
+
+        fputc('\n', data->file);
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+static int make_protofile(const char *root, char **ret_path, bool *ret_has_filename_with_spaces, char **ret_tmpdir) {
+        _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_(unlink_and_freep) char *p = NULL;
+        struct ProtofileData data = {};
+        const char *vt;
+        int r;
+
+        assert(ret_path);
+        assert(ret_has_filename_with_spaces);
+        assert(ret_tmpdir);
+
+        r = var_tmp_dir(&vt);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get persistent temporary directory: %m");
+
+        r = fopen_temporary_child(vt, &f, &p);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open temporary file: %m");
+
+        /* Explicitly use /tmp here because this directory cannot have spaces in its path.
*/
+        r = mkdtemp_malloc("/tmp/systemd-mkfs-XXXXXX", &tmpdir);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create temporary directory: %m");
+
+        data.file = f;
+        data.tmpdir = tmpdir;
+
+        fputs("/\n"
+              "0 0\n"
+              "d--755 0 0\n", f);
+
+        r = recurse_dir_at(AT_FDCWD, root, STATX_TYPE|STATX_MODE|STATX_UID|STATX_GID, UINT_MAX,
+                           RECURSE_DIR_SORT, protofile_print_item, &data);
+        if (r < 0)
+                return log_error_errno(r, "Failed to recurse through %s: %m", root);
+
+        fputs("$\n", f);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to flush %s: %m", p);
+
+        *ret_path = TAKE_PTR(p);
+        *ret_has_filename_with_spaces = data.has_filename_with_spaces;
+        *ret_tmpdir = TAKE_PTR(tmpdir);
+
+        return 0;
+}
+
+int make_filesystem(
+                const char *node,
+                const char *fstype,
+                const char *label,
+                const char *root,
+                sd_id128_t uuid,
+                bool discard,
+                bool quiet,
+                uint64_t sector_size,
+                char * const *extra_mkfs_args) {
+
+        _cleanup_free_ char *mkfs = NULL, *mangled_label = NULL;
+        _cleanup_strv_free_ char **argv = NULL, **env = NULL;
+        _cleanup_(rm_rf_physical_and_freep) char *protofile_tmpdir = NULL;
+        _cleanup_(unlink_and_freep) char *protofile = NULL;
+        char vol_id[CONST_MAX(SD_ID128_UUID_STRING_MAX, 8U + 1U)] = {};
+        int stdio_fds[3] = { -EBADF, STDERR_FILENO, STDERR_FILENO};
+        ForkFlags flags = FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|
+                        FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO|FORK_REOPEN_LOG;
+        int r;
+
+        assert(node);
+        assert(fstype);
+        assert(label);
+
+        if (fstype_is_ro(fstype) && !root)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot generate read-only filesystem %s without a source tree.",
+                                       fstype);
+
+        if (streq(fstype, "swap")) {
+                if (root)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "A swap filesystem can't be populated, refusing");
+                r = find_executable("mkswap", &mkfs);
+                if (r == -ENOENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkswap binary not available.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mkswap binary exists: %m");
+        } else if (streq(fstype, "squashfs")) {
+                r = find_executable("mksquashfs", &mkfs);
+                if (r == -ENOENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mksquashfs binary not available.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mksquashfs binary exists: %m");
+
+        } else if (streq(fstype, "erofs")) {
+                r = find_executable("mkfs.erofs", &mkfs);
+                if (r == -ENOENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkfs.erofs binary not available.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mkfs.erofs binary exists: %m");
+
+        } else if (fstype_is_ro(fstype)) {
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Don't know how to create read-only file system '%s', refusing.",
+                                       fstype);
+        } else {
+                if (root && !mkfs_supports_root_option(fstype))
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Populating with source tree is not supported for %s", fstype);
+                r = mkfs_exists(fstype);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mkfs binary for %s exists: %m", fstype);
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkfs binary for %s is not available.", fstype);
+
+                mkfs = strjoin("mkfs.", fstype);
+                if (!mkfs)
+                        return log_oom();
+        }
+
+        if (STR_IN_SET(fstype, "ext2", "ext3", "ext4", "xfs", "swap")) {
+                size_t max_len =
+                        streq(fstype, "xfs") ? 12 :
+                        streq(fstype, "swap") ? 15 :
+                        16;
+
+                r = mangle_linux_fs_label(label, max_len, &mangled_label);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine volume label from string \"%s\": %m", label);
+                label = mangled_label;
+
+        } else if (streq(fstype, "vfat")) {
+                r = mangle_fat_label(label, &mangled_label);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine FAT label from string \"%s\": %m", label);
+                label = mangled_label;
+
+                xsprintf(vol_id, "%08" PRIx32,
+                         ((uint32_t) uuid.bytes[0] << 24) |
+                         ((uint32_t) uuid.bytes[1] << 16) |
+                         ((uint32_t) uuid.bytes[2] << 8) |
+                         ((uint32_t) uuid.bytes[3])); /* Take first 32 bits of UUID */
+        }
+
+        if (isempty(vol_id))
+                assert_se(sd_id128_to_uuid_string(uuid, vol_id));
+
+        /* When changing this conditional, also adjust the log statement below. */
+        if (STR_IN_SET(fstype, "ext2", "ext3", "ext4")) {
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-U", vol_id,
+                                "-I", "256",
+                                "-m", "0",
+                                "-E", discard ? "discard,lazy_itable_init=1" : "nodiscard,lazy_itable_init=1",
+                                "-b", "4096",
+                                "-T", "default",
+                                node);
+
+                if (!argv || (root && strv_extend_strv(&argv, STRV_MAKE("-d", root), false) < 0))
+                        return log_oom();
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+                if (sector_size > 0) {
+                        if (strv_extend(&env, "MKE2FS_DEVICE_SECTSIZE") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&env, "%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+        } else if (streq(fstype, "btrfs")) {
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-U", vol_id,
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (!discard && strv_extend(&argv, "--nodiscard") < 0)
+                        return log_oom();
+
+                if (root && strv_extend_strv(&argv, STRV_MAKE("-r", root), false) < 0)
+                        return log_oom();
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+                /* mkfs.btrfs unconditionally warns about several settings changing from v5.15 onwards which
+                 * isn't silenced by "-q", so let's redirect stdout to /dev/null as well. */
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "f2fs")) {
+                argv = strv_new(mkfs,
+                                "-g",  /* "default options" */
+                                "-f",  /* force override, without this it doesn't seem to want to write to an empty partition */
+                                "-l", label,
+                                "-U", vol_id,
+                                "-t", one_zero(discard),
+                                node);
+
+                if (!argv || (quiet && strv_extend(&argv, "-q") < 0))
+                        return log_oom();
+
+                if (sector_size > 0) {
+                        if (strv_extend(&argv, "-w") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&argv, "%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+        } else if (streq(fstype, "xfs")) {
+                const char *j;
+
+                j = strjoina("uuid=", vol_id);
+
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-m", j,
+                                "-m", "reflink=1",
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (!discard && strv_extend(&argv, "-K") < 0)
+                        return log_oom();
+
+                if (root) {
+                        bool has_filename_with_spaces = false;
+                        _cleanup_free_ char *protofile_with_opt = NULL;
+
+                        r = make_protofile(root, &protofile, &has_filename_with_spaces, &protofile_tmpdir);
+                        if (r < 0)
+                                return r;
+
+                        /* Gross hack to make mkfs.xfs interpret slashes as spaces so we can encode filenames
+                         * with spaces in the protofile format. */
+                        if (has_filename_with_spaces)
+                                protofile_with_opt = strjoin("slashes_are_spaces=1,", protofile);
+                        else
+                                protofile_with_opt = strdup(protofile);
+                        if (!protofile_with_opt)
+                                return log_oom();
+
+                        if (strv_extend_strv(&argv, STRV_MAKE("-p", protofile_with_opt), false) < 0)
+                                return log_oom();
+                }
+
+                if (sector_size > 0) {
+                        if (strv_extend(&argv, "-s") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&argv, "size=%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+        } else if (streq(fstype, "vfat")) {
+
+                argv = strv_new(mkfs,
+                                "-i", vol_id,
+                                "-n", label,
+                                "-F", "32",  /* yes, we force FAT32 here */
+                                node);
+
+                if (sector_size > 0) {
+                        if (!argv || strv_extend(&argv, "-S") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&argv, "%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+                /* mkfs.vfat does not have a --quiet option so let's redirect stdout to /dev/null instead. */
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "swap")) {
+                /* TODO: add --quiet once util-linux v2.38 is available everywhere. */
+
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-U", vol_id,
+                                node);
+
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "squashfs")) {
+
+                argv = strv_new(mkfs,
+                                root, node,
+                                "-noappend");
+
+                /* mksquashfs -quiet option is pretty new so let's redirect stdout to /dev/null instead. */
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "erofs")) {
+
+                argv = strv_new(mkfs,
+                                "-U", vol_id,
+                                node, root);
+
+                if (!argv || (quiet && strv_extend(&argv, "--quiet") < 0))
+                        return log_oom();
+
+        } else
+                /* Generic fallback for all other file systems */
+                argv = strv_new(mkfs, node);
+
+        if (!argv)
+                return log_oom();
+
+        if (extra_mkfs_args && strv_extend_strv(&argv, extra_mkfs_args, false) < 0)
+                return log_oom();
+
+        if (streq(fstype, "btrfs")) {
+                struct stat st;
+
+                if (stat(node, &st) < 0)
+                        return log_error_errno(errno, "Failed to stat '%s': %m", node);
+
+                if (S_ISBLK(st.st_mode))
+                        flags |= FORK_NEW_MOUNTNS;
+        }
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *j = NULL;
+
+                j = strv_join(argv, " ");
+                log_debug("Executing mkfs command: %s", strna(j));
+        }
+
+        r = safe_fork_full(
+                        "(mkfs)",
+                        stdio_fds,
+                        /*except_fds=*/ NULL,
+                        /*n_except_fds=*/ 0,
+                        flags,
+                        /*ret_pid=*/ NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child */
+
+                STRV_FOREACH_PAIR(k, v, env)
+                        if (setenv(*k, *v, /* replace = */ true) < 0) {
+                                log_error_errno(errno, "Failed to set %s=%s environment variable: %m", *k, *v);
+                                _exit(EXIT_FAILURE);
+                        }
+
+                /* mkfs.btrfs refuses to operate on block devices with mounted partitions, even if operating
+                 * on unformatted free space, so let's trick it and other mkfs tools into thinking no
+                 * partitions are mounted. See https://github.com/kdave/btrfs-progs/issues/640 for more
+                 * information. */
+                if (flags & FORK_NEW_MOUNTNS)
+                        (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/null", "/proc/self/mounts", NULL, MS_BIND, NULL);
+
+                execvp(mkfs, argv);
+
+                log_error_errno(errno, "Failed to execute %s: %m", mkfs);
+
+                _exit(EXIT_FAILURE);
+        }
+
+        if (root && streq(fstype, "vfat")) {
+                r = do_mcopy(node, root);
+                if (r < 0)
+                        return r;
+        }
+
+        if (STR_IN_SET(fstype, "ext2", "ext3", "ext4", "btrfs", "f2fs", "xfs", "vfat", "swap"))
+                log_info("%s successfully formatted as %s (label \"%s\", uuid %s)",
+                         node, fstype, label, vol_id);
+        else if (streq(fstype, "erofs"))
+                log_info("%s successfully formatted as %s (uuid %s, no label)",
+                         node, fstype, vol_id);
+        else
+                log_info("%s successfully formatted as %s (no label or uuid specified)",
+                         node, fstype);
+
+        return 0;
+}
+
+int mkfs_options_from_env(const char *component, const char *fstype, char ***ret) {
+        _cleanup_strv_free_ char **l = NULL;
+        const char *e;
+        char *n;
+
+        assert(component);
+        assert(fstype);
+        assert(ret);
+
+        n = strjoina("SYSTEMD_", component, "_MKFS_OPTIONS_", fstype);
+        e = getenv(ascii_strupper(n));
+        if (e) {
+                l = strv_split(e, NULL);
+                if (!l)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(l);
+        return 0;
+}
diff --git a/src/shared/mkfs-util.h b/src/shared/mkfs-util.h
new file mode 100644
index 0000000..9a1cb58
--- /dev/null
+++ b/src/shared/mkfs-util.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include
+
+#include "sd-id128.h"
+
+#include "strv.h"
+
+int mkfs_exists(const char *fstype);
+
+int mkfs_supports_root_option(const char *fstype);
+
+int make_filesystem(
+                const char *node,
+                const char *fstype,
+                const char *label,
+                const char *root,
+                sd_id128_t uuid,
+                bool discard,
+                bool quiet,
+                uint64_t sector_size,
+                char * const *extra_mkfs_args);
+
+int mkfs_options_from_env(const char *component, const char *fstype, char ***ret);
diff --git a/src/shared/module-util.c b/src/shared/module-util.c
new file mode 100644
index 0000000..951701d
--- 
/dev/null
+++ b/src/shared/module-util.c
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+
+#include "module-util.h"
+#include "proc-cmdline.h"
+#include "strv.h"
+
+static int denylist_modules(const char *p, char ***denylist) {
+        _cleanup_strv_free_ char **k = NULL;
+
+        assert(p);
+        assert(denylist);
+
+        k = strv_split(p, ",");
+        if (!k)
+                return -ENOMEM;
+
+        if (strv_extend_strv(denylist, k, true) < 0)
+                return -ENOMEM;
+
+        return 0;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        int r;
+
+        if (proc_cmdline_key_streq(key, "module_blacklist")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = denylist_modules(value, data);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int module_load_and_warn(struct kmod_ctx *ctx, const char *module, bool verbose) {
+        const int probe_flags = KMOD_PROBE_APPLY_BLACKLIST;
+        struct kmod_list *itr;
+        _cleanup_(kmod_module_unref_listp) struct kmod_list *modlist = NULL;
+        _cleanup_strv_free_ char **denylist = NULL;
+        bool denylist_parsed = false;
+        int r;
+
+        /* verbose==true means we should log at non-debug level if we
+         * fail to find or load the module. */
+
+        log_debug("Loading module: %s", module);
+
+        r = kmod_module_new_from_lookup(ctx, module, &modlist);
+        if (r < 0)
+                return log_full_errno(verbose ? LOG_ERR : LOG_DEBUG, r,
+                                      "Failed to look up module alias '%s': %m", module);
+
+        if (!modlist)
+                return log_full_errno(verbose ? LOG_ERR : LOG_DEBUG,
+                                      SYNTHETIC_ERRNO(ENOENT),
+                                      "Failed to find module '%s'", module);
+
+        kmod_list_foreach(itr, modlist) {
+                _cleanup_(kmod_module_unrefp) struct kmod_module *mod = NULL;
+                int state, err;
+
+                mod = kmod_module_get_module(itr);
+                state = kmod_module_get_initstate(mod);
+
+                switch (state) {
+                case KMOD_MODULE_BUILTIN:
+                        log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                 "Module '%s' is built in", kmod_module_get_name(mod));
+                        break;
+
+                case KMOD_MODULE_LIVE:
+                        log_debug("Module '%s' is already loaded", kmod_module_get_name(mod));
+                        break;
+
+                default:
+                        err = kmod_module_probe_insert_module(mod, probe_flags,
+                                                              NULL, NULL, NULL, NULL);
+                        if (err == 0)
+                                log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                         "Inserted module '%s'", kmod_module_get_name(mod));
+                        else if (err == KMOD_PROBE_APPLY_BLACKLIST)
+                                log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                         "Module '%s' is deny-listed (by kmod)", kmod_module_get_name(mod));
+                        else {
+                                assert(err < 0);
+
+                                if (err == -EPERM) {
+                                        if (!denylist_parsed) {
+                                                int q = proc_cmdline_parse(parse_proc_cmdline_item, &denylist, 0);
+                                                if (q < 0) /* kept out of r, so an ignored failure cannot leak into the return value */
+                                                        log_full_errno(!verbose ? LOG_DEBUG : LOG_WARNING,
+                                                                       q,
+                                                                       "Failed to parse kernel command line, ignoring: %m");
+
+                                                denylist_parsed = true;
+                                        }
+                                        if (strv_contains(denylist, kmod_module_get_name(mod))) {
+                                                log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                                         "Module '%s' is deny-listed (by kernel)", kmod_module_get_name(mod));
+                                                continue;
+                                        }
+                                }
+
+                                log_full_errno(!verbose ? LOG_DEBUG :
+                                               err == -ENODEV ? LOG_NOTICE :
+                                               err == -ENOENT ? LOG_WARNING :
+                                                                LOG_ERR,
+                                               err,
+                                               "Failed to insert module '%s': %m",
+                                               kmod_module_get_name(mod));
+                                if (!IN_SET(err, -ENODEV, -ENOENT))
+                                        r = err;
+                        }
+                }
+        }
+
+        return r;
+}
diff --git a/src/shared/module-util.h b/src/shared/module-util.h
new file mode 100644
index 0000000..8ca6a06
--- /dev/null
+++ b/src/shared/module-util.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include
+
+#include "macro.h"
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct kmod_ctx*, kmod_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct kmod_module*, kmod_module_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct kmod_list*, kmod_module_unref_list, NULL);
+
+int module_load_and_warn(struct kmod_ctx *ctx, const char *module, bool verbose);
diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c
new file mode 100644
index 0000000..1226ca1
--- /dev/null
+++ b/src/shared/mount-setup.c
@@ -0,0 +1,591 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "alloc-util.h"
+#include "bus-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "conf-files.h"
+#include "dev-setup.h"
+#include "dirent-util.h"
+#include "efi-loader.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "label-util.h"
+#include "log.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "recurse-dir.h"
+#include "set.h"
+#include "smack-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "virt.h"
+
+typedef enum MountMode {
+        MNT_NONE           = 0,
+        MNT_FATAL          = 1 << 0,
+        MNT_IN_CONTAINER   = 1 << 1,
+        MNT_CHECK_WRITABLE = 1 << 2,
+        MNT_FOLLOW_SYMLINK = 1 << 3,
+} MountMode;
+
+typedef struct MountPoint {
+        const char *what;
+        const char *where;
+        const char *type;
+        const char *options;
+        unsigned long flags;
+        bool (*condition_fn)(void);
+        MountMode mode;
+} MountPoint;
+
+/* The first three entries we might need before SELinux is up. The
+ * fourth (securityfs) is needed by IMA to load a custom policy. The
+ * other ones we can delay until SELinux and IMA are loaded. When
+ * SMACK is enabled we need smackfs, too, so it's a fifth one. */
+#if ENABLE_SMACK
+#define N_EARLY_MOUNT 5
+#else
+#define N_EARLY_MOUNT 4
+#endif
+
+static bool check_recursiveprot_supported(void) {
+        int r;
+
+        if (!cg_is_unified_wanted())
+                return false;
+
+        r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to determiner whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
+        else if (r == 0)
+                log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
+
+        return r > 0; /* errors (r < 0) are treated like "not supported" */
+}
+
+static const MountPoint mount_table[] = {
+        { "proc",        "/proc",                     "proc",       NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          NULL,          MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK },
+        { "sysfs",       "/sys",                      "sysfs",      NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+        { "devtmpfs",    "/dev",                      "devtmpfs",   "mode=0755" TMPFS_LIMITS_DEV,               MS_NOSUID|MS_STRICTATIME,
+          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+        { "securityfs",  "/sys/kernel/security",      "securityfs", NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          NULL,          MNT_NONE                   },
+#if ENABLE_SMACK
+        { "smackfs",     "/sys/fs/smackfs",           "smackfs",    "smackfsdef=*",                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          mac_smack_use, MNT_FATAL                  },
+        { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=01777,smackfsroot=*",                 MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+          mac_smack_use, MNT_FATAL                  },
+#endif
+        { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=01777",                               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+        { "devpts",      "/dev/pts",                  "devpts",     "mode=0620,gid=" STRINGIFY(TTY_GID),        MS_NOSUID|MS_NOEXEC,
+          NULL,          MNT_IN_CONTAINER           },
+#if ENABLE_SMACK
+        { "tmpfs",       "/run",                      "tmpfs",      "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+          mac_smack_use, MNT_FATAL                  },
+#endif
+        { "tmpfs",       "/run",                      "tmpfs",      "mode=0755" TMPFS_LIMITS_RUN,               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
+        { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate,memory_recursiveprot",          MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          check_recursiveprot_supported, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+        { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate",                               MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+        { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+        { "tmpfs",       "/sys/fs/cgroup",            "tmpfs",      "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP,     MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
+          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
+        { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    "nsdelegate",                               MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+        { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+        { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd,xattr",                  MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_legacy_wanted, MNT_IN_CONTAINER     },
+        { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd",                        MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
+#if ENABLE_PSTORE
+        { "pstore",      "/sys/fs/pstore",            "pstore",     NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          NULL,          MNT_NONE                   },
+#endif
+#if ENABLE_EFI
+        { "efivarfs",    "/sys/firmware/efi/efivars", "efivarfs",   NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          is_efi_boot,   MNT_NONE                   },
+#endif
+        { "bpf",         "/sys/fs/bpf",               "bpf",        "mode=0700",                                MS_NOSUID|MS_NOEXEC|MS_NODEV,
+          NULL,          MNT_NONE,                  },
+};
+
+assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
+
+bool mount_point_is_api(const char *path) {
+        /* Checks if this mount point is considered "API", and hence
+         * should be ignored */
+
+        for (size_t i = 0; i < ELEMENTSOF(mount_table); i ++)
+                if (path_equal(path, mount_table[i].where))
+                        return true;
+
+        return path_startswith(path, "/sys/fs/cgroup/");
+}
+
+bool mount_point_ignore(const char *path) {
+        /* These are API file systems that might be mounted by other software, we just list them here so that
+         * we know that we should ignore them. */
+        FOREACH_STRING(i,
+                       /* SELinux file systems */
+                       "/sys/fs/selinux",
+                       /* Container bind mounts */
+                       "/dev/console",
+                       "/proc/kmsg",
+                       "/proc/sys",
+                       "/proc/sys/kernel/random/boot_id")
+                if (path_equal(path, i))
+                        return true;
+
+        if (path_startswith(path, "/run/host")) /* All mounts passed in from the container manager are
+                                                 * something we better ignore. */
+                return true;
+
+        return false;
+}
+
+static int mount_one(const MountPoint *p, bool relabel) { /* returns 1 if mounted, 0 if skipped or failed non-fatally, < 0 only for MNT_FATAL entries */
+        int r, priority;
+
+        assert(p);
+
+        priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
+
+        if (p->condition_fn && !p->condition_fn())
+                return 0;
+
+        /* Relabel first, just in case */
+        if (relabel)
+                (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS);
+
+        r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
+        if (r < 0 && r != -ENOENT) {
+                log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
+                return (p->mode & MNT_FATAL) ? r : 0;
+        }
+        if (r > 0)
+                return 0;
+
+        /* Skip entries not flagged MNT_IN_CONTAINER (e.g. securityfs) when running in a container */
+        if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
+                return 0;
+
+        /* The access mode here doesn't really matter too much, since
+         * the mounted file system will take precedence anyway. */
+        if (relabel)
+                (void) mkdir_p_label(p->where, 0755);
+        else
+                (void) mkdir_p(p->where, 0755);
+
+        log_debug("Mounting %s to %s of type %s with options %s.",
+                  p->what,
+                  p->where,
+                  p->type,
+                  strna(p->options));
+
+        if (FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK))
+                r = mount_follow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
+        else
+                r = mount_nofollow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
+        if (r < 0)
+                return (p->mode & MNT_FATAL) ? r : 0;
+
+        /* Relabel again, since we now mounted something fresh here */
+        if (relabel)
+                (void) label_fix(p->where, 0);
+
+        if (p->mode & MNT_CHECK_WRITABLE) {
+                if (access(p->where, W_OK) < 0) {
+                        r = -errno;
+
+                        (void) umount2(p->where, UMOUNT_NOFOLLOW);
+                        (void) rmdir(p->where);
+
+                        log_full_errno(priority, r, "Mount point %s not writable after mounting, undoing: %m", p->where);
+                        return (p->mode & MNT_FATAL) ? r : 0;
+                }
+        }
+
+        return 1;
+}
+
+static int mount_points_setup(size_t n, bool loaded_policy) {
+        int ret = 0, r;
+
+        assert(n <= ELEMENTSOF(mount_table));
+
+        FOREACH_ARRAY(mp, mount_table, n) {
+                r = mount_one(mp, loaded_policy);
+                if (r != 0 && ret >= 0) /* once ret is negative it is never overwritten, so the first error wins */
+                        ret = r;
+        }
+
+        return ret;
+}
+
+int mount_setup_early(void) {
+        /* Do a minimal mount of /proc and friends to enable the most basic stuff, such as SELinux */
+        return mount_points_setup(N_EARLY_MOUNT, /* loaded_policy= */ false);
+}
+
+static const char *join_with(const char *controller) {
+
+        static const char* const pairs[] = {
+                "cpu",     "cpuacct",
+                "net_cls", "net_prio",
+                NULL
+        };
+
+        assert(controller);
+
+        /* This will lookup which controller to mount another controller with. Input is a controller name, and output
+         * is the other controller name. The function works both ways: you can input one and get the other, and input
+         * the other to get the one. */
+
+        STRV_FOREACH_PAIR(x, y, pairs) {
+                if (streq(controller, *x))
+                        return *y;
+                if (streq(controller, *y))
+                        return *x;
+        }
+
+        return NULL;
+}
+
+static int symlink_controller(const char *target, const char *alias) {
+        const char *a;
+        int r;
+
+        assert(target);
+        assert(alias);
+
+        a = strjoina("/sys/fs/cgroup/", alias);
+
+        r = symlink_idempotent(target, a, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create symlink %s: %m", a);
+
+#if HAVE_SMACK_RUN_LABEL
+        const char *p;
+
+        p = strjoina("/sys/fs/cgroup/", target);
+
+        r = mac_smack_copy(a, p);
+        if (r < 0 && !ERRNO_IS_NOT_SUPPORTED(r))
+                return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", p, a);
+#endif
+
+        return 0;
+}
+
+int mount_cgroup_controllers(void) {
+        _cleanup_set_free_ Set *controllers = NULL;
+        int r;
+
+        if (!cg_is_legacy_wanted())
+                return 0;
+
+        /* Mount all available cgroup controllers that are built into the kernel. */
+        r = cg_kernel_controllers(&controllers);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
+
+        for (;;) {
+                _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
+                const char *other_controller;
+                MountPoint p = {
+                        .what = "cgroup",
+                        .type = "cgroup",
+                        .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                        .mode = MNT_IN_CONTAINER,
+                };
+
+                controller = set_steal_first(controllers);
+                if (!controller)
+                        break;
+
+                /* Check if we shall mount this together with another controller */
+                other_controller = join_with(controller);
+                if (other_controller) {
+                        _cleanup_free_ char *c = NULL;
+
+                        /* Check if the other controller is actually available in the kernel too */
+                        c = set_remove(controllers, other_controller);
+                        if (c) {
+
+                                /* Join the two controllers into one string, and maintain a stable ordering */
+                                if (strcmp(controller, other_controller) < 0)
+                                        options = strjoin(controller, ",", other_controller);
+                                else
+                                        options = strjoin(other_controller, ",", controller);
+                                if (!options)
+                                        return 
log_oom(); + } + } + + /* The simple case, where there's only one controller to mount together */ + if (!options) + options = TAKE_PTR(controller); + + where = path_join("/sys/fs/cgroup", options); + if (!where) + return log_oom(); + + p.where = where; + p.options = options; + + r = mount_one(&p, true); + if (r < 0) + return r; + + /* Create symlinks from the individual controller names, in case we have a joined mount */ + if (controller) + (void) symlink_controller(options, controller); + if (other_controller) + (void) symlink_controller(options, other_controller); + } + + /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */ + (void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs", + MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, + "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP); + + return 0; +} + +#if HAVE_SELINUX || ENABLE_SMACK +/* Callback handed to recurse_dir_at() by relabel_tree(): calls label_fix() on each visited path, except + * for RECURSE_DIR_LEAVE and RECURSE_DIR_SKIP_MOUNT events, and returns RECURSE_DIR_SKIP_ENTRY when + * entering a path that starts with /run/initramfs or /run/nextroot. */ +static int relabel_cb( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + switch (event) { + + case RECURSE_DIR_LEAVE: + case RECURSE_DIR_SKIP_MOUNT: + /* If we already saw this dirent when entering it, or this is a dirent that is on a different + * mount, don't relabel it. */ + return RECURSE_DIR_CONTINUE; + + case RECURSE_DIR_ENTER: + /* /run/initramfs/ + /run/nextroot/ are static data and big, no need to dynamically relabel + * their contents at boot... */ + if (PATH_STARTSWITH_SET(path, "/run/initramfs", "/run/nextroot")) + return RECURSE_DIR_SKIP_ENTRY; + + _fallthrough_; + + default: + /* Otherwise, label it, even if we had trouble stat()ing it and similar.
SELinux can figure this out */ + (void) label_fix(path, 0); + return RECURSE_DIR_CONTINUE; + } +} + +static int relabel_tree(const char *path) { + int r; + + r = recurse_dir_at(AT_FDCWD, path, 0, UINT_MAX, RECURSE_DIR_ENSURE_TYPE|RECURSE_DIR_SAME_MOUNT, relabel_cb, NULL); + if (r < 0) + log_debug_errno(r, "Failed to recursively relabel '%s': %m", path); + + return r; +} + +static int relabel_cgroup_filesystems(void) { + int r; + struct statfs st; + + r = cg_all_unified(); + if (r == 0) { + /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this + only when the filesystem has been already populated by a previous instance of systemd + running from initrd. Otherwise don't remount anything and leave the filesystem read-write + for the cgroup filesystems to be mounted inside. */ + if (statfs("/sys/fs/cgroup", &st) < 0) + return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m"); + + if (st.f_flags & ST_RDONLY) + (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL); + + (void) label_fix("/sys/fs/cgroup", 0); + (void) relabel_tree("/sys/fs/cgroup"); + + if (st.f_flags & ST_RDONLY) + (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL); + + } else if (r < 0) + return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m"); + + return 0; +} + +static int relabel_extra(void) { + _cleanup_strv_free_ char **files = NULL; + int r, c = 0; + + /* Support for relabelling additional files or directories after loading the policy. For this, code in the + * initrd simply has to drop in *.relabel files into /run/systemd/relabel-extra.d/. We'll read all such files + * expecting one absolute path by line and will relabel each (and everyone below that in case the path refers + * to a directory). These drop-in files are supposed to be absolutely minimal, and do not understand comments + * and such. 
After the operation succeeded the files are removed, and the drop-in directory as well, if + * possible. + */ + + r = conf_files_list(&files, ".relabel", NULL, + CONF_FILES_FILTER_MASKED | CONF_FILES_REGULAR, + "/run/systemd/relabel-extra.d/"); + if (r < 0) + return log_error_errno(r, "Failed to enumerate /run/systemd/relabel-extra.d/, ignoring: %m"); + + STRV_FOREACH(file, files) { + _cleanup_fclose_ FILE *f = NULL; + + f = fopen(*file, "re"); + if (!f) { + log_warning_errno(errno, "Failed to open %s, ignoring: %m", *file); + continue; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) { + log_warning_errno(r, "Failed to read %s, ignoring: %m", *file); + break; + } + if (r == 0) /* EOF */ + break; + + path_simplify(line); + + if (!path_is_normalized(line)) { + log_warning("Path to relabel is not normalized, ignoring: %s", line); + continue; + } + + if (!path_is_absolute(line)) { + log_warning("Path to relabel is not absolute, ignoring: %s", line); + continue; + } + + log_debug("Relabelling additional file/directory '%s'.", line); + (void) label_fix(line, 0); + (void) relabel_tree(line); + c++; + } + + if (unlink(*file) < 0) + log_warning_errno(errno, "Failed to remove %s, ignoring: %m", *file); + } + + /* Remove when we complete things. */ + if (rmdir("/run/systemd/relabel-extra.d") < 0 && + errno != ENOENT) + log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/ directory: %m"); + + return c; +} +#endif + +int mount_setup(bool loaded_policy, bool leave_propagation) { + int r; + + r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy); + if (r < 0) + return r; + +#if HAVE_SELINUX || ENABLE_SMACK + /* Nodes in devtmpfs and /run need to be manually updated for + * the appropriate labels, after mounting. The other virtual + * API file systems like /sys and /proc do not need that, they + * use the same label for all their files. 
*/ + if (loaded_policy) { + usec_t before_relabel, after_relabel; + int n_extra; + + before_relabel = now(CLOCK_MONOTONIC); + + FOREACH_STRING(i, "/dev", "/dev/shm", "/run") + (void) relabel_tree(i); + + (void) relabel_cgroup_filesystems(); + + n_extra = relabel_extra(); + + after_relabel = now(CLOCK_MONOTONIC); + + log_info("Relabeled /dev, /dev/shm, /run, /sys/fs/cgroup%s in %s.", + n_extra > 0 ? ", additional files" : "", + FORMAT_TIMESPAN(after_relabel - before_relabel, 0)); + } +#endif + + /* Create a few default symlinks, which are normally created + * by udevd, but some scripts might need them before we start + * udevd. */ + dev_setup(NULL, UID_INVALID, GID_INVALID); + + /* Mark the root directory as shared with regard to mount propagation. The kernel defaults to "private", but we + * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of + * the box. If specific setups need other settings they can reset the propagation mode to private if + * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a + * container manager we assume the container manager knows what it is doing (for example, because it set up + * some directories with different propagation modes). */ + if (detect_container() <= 0 && !leave_propagation) + if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0) + log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m"); + + /* Create a few directories we always want around. Note that sd_booted() checks for /run/systemd/system, so + * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will + * misdetect systemd.
*/ + (void) mkdir_label("/run/systemd", 0755); + (void) mkdir_label("/run/systemd/system", 0755); + + /* Make sure there's always a place where sandboxed environments can mount root file systems they are + * about to move into, even when unprivileged, without having to create a temporary one in /tmp/ + * (which they then have to keep track of and clean) */ + (void) mkdir_label("/run/systemd/mount-rootfs", 0555); + + /* Make sure we have a mount point to hide in sandboxes */ + (void) mkdir_label("/run/credentials", 0755); + + /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount + * inaccessible nodes from. If we run in a container the host might have created these for us already + * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char + * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a + * userns outside the container and thus nicely read-only and not remountable. 
*/ + if (access("/run/host/inaccessible/", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m"); + + (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID); + } else + (void) symlink("../host/inaccessible", "/run/systemd/inaccessible"); + + return 0; +} diff --git a/src/shared/mount-setup.h b/src/shared/mount-setup.h new file mode 100644 index 0000000..29bd62f --- /dev/null +++ b/src/shared/mount-setup.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +int mount_setup_early(void); +int mount_setup(bool loaded_policy, bool leave_propagation); + +int mount_cgroup_controllers(void); + +bool mount_point_is_api(const char *path); +bool mount_point_ignore(const char *path); diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c new file mode 100644 index 0000000..4f2acce --- /dev/null +++ b/src/shared/mount-util.c @@ -0,0 +1,1785 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#if WANT_LINUX_FS_H +#include +#endif + +#include "alloc-util.h" +#include "chase.h" +#include "dissect-image.h" +#include "exec-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "initrd-util.h" +#include "label-util.h" +#include "libmount-util.h" +#include "missing_mount.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "set.h" +#include "sort-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "user-util.h" + +int umount_recursive_full(const char *prefix, int flags, char **keep) { + 
_cleanup_fclose_ FILE *f = NULL; + int n = 0, r; + + /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and + * keep unmounting them until they are gone. */ + + f = fopen("/proc/self/mountinfo", "re"); /* Pin the file, in case we unmount /proc/ as part of the logic here */ + if (!f) + return log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m"); + + for (;;) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + bool again = false; + + r = libmount_parse("/proc/self/mountinfo", f, &table, &iter); + if (r < 0) + return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + bool shall_keep = false; + struct libmnt_fs *fs; + const char *path; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) + break; + if (r < 0) + return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + + path = mnt_fs_get_target(fs); + if (!path) + continue; + + if (prefix && !path_startswith(path, prefix)) { + log_trace("Not unmounting %s, outside of prefix: %s", path, prefix); + continue; + } + + STRV_FOREACH(k, keep) + /* Match against anything in the path to the dirs to keep, or below the dirs to keep */ + if (path_startswith(path, *k) || path_startswith(*k, path)) { + shall_keep = true; + break; + } + if (shall_keep) { + log_debug("Not unmounting %s, referenced by keep list.", path); + continue; + } + + if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) { + log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path); + continue; + } + + log_trace("Successfully unmounted %s", path); + + again = true; + n++; + + break; + } + + if (!again) + break; + + rewind(f); + } + + return n; +} + +#define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW) + +static uint64_t ms_flags_to_mount_attr(unsigned long a) { + uint64_t f = 0; + + if (FLAGS_SET(a, MS_RDONLY)) + f |= 
MOUNT_ATTR_RDONLY; + + if (FLAGS_SET(a, MS_NOSUID)) + f |= MOUNT_ATTR_NOSUID; + + if (FLAGS_SET(a, MS_NODEV)) + f |= MOUNT_ATTR_NODEV; + + if (FLAGS_SET(a, MS_NOEXEC)) + f |= MOUNT_ATTR_NOEXEC; + + if (FLAGS_SET(a, MS_NOSYMFOLLOW)) + f |= MOUNT_ATTR_NOSYMFOLLOW; + + return f; +} + +/* Set to true once mount_setattr() has failed with a "not supported" error, so that later calls skip the + * mount_setattr() shortcut and go straight to the classic remount path. */ +static bool skip_mount_set_attr = false; + +/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it + * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */ +int bind_remount_recursive_with_mountinfo( + const char *prefix, + unsigned long new_flags, + unsigned long flags_mask, + char **deny_list, + FILE *proc_self_mountinfo) { + + _cleanup_fclose_ FILE *proc_self_mountinfo_opened = NULL; + _cleanup_set_free_ Set *done = NULL; + unsigned n_tries = 0; + int r; + + assert(prefix); + + if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && strv_isempty(deny_list) && !skip_mount_set_attr) { + /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */ + + if (mount_setattr(AT_FDCWD, prefix, AT_SYMLINK_NOFOLLOW|AT_RECURSIVE, + &(struct mount_attr) { + .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask), + .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask), + }, MOUNT_ATTR_SIZE_VER0) < 0) { + + log_debug_errno(errno, "mount_setattr() failed, falling back to classic remounting: %m"); + + /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We + * also do this for all other kinds of errors since there are so many different ones, and + * mount_setattr() has no graceful mode where it continues despite seeing errors on + * some mounts, but we want that. Moreover mount_setattr() only works on the mount + * point inode itself, not a non-mount point inode, and we want to support arbitrary + * prefixes here.
*/ + + if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */ + skip_mount_set_attr = true; + } else + return 0; /* Nice, this worked! */ + } + + if (!proc_self_mountinfo) { + r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened); + if (r < 0) + return r; + + proc_self_mountinfo = proc_self_mountinfo_opened; + } + + /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY, + * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it + * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it + * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can + * access, too. When mounts are stacked on the same mount point we only care for each individual + * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do + * not have any effect on future submounts that might get propagated, they might be writable + * etc. This includes future submounts that have been triggered via autofs. Also note that we can't + * operate atomically here. Mounts established while we process the tree might or might not get + * noticed and thus might or might not be covered. + * + * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the + * remount operation. Note that we'll ignore the deny list for the top-level path. 
*/ + + for (;;) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + _cleanup_hashmap_free_ Hashmap *todo = NULL; + bool top_autofs = false; + + if (n_tries++ >= 32) /* Let's not retry this loop forever */ + return -EBUSY; + + rewind(proc_self_mountinfo); + + r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter); + if (r < 0) + return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + _cleanup_free_ char *d = NULL; + const char *path, *type, *opts; + unsigned long flags = 0; + struct libmnt_fs *fs; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) /* EOF */ + break; + if (r < 0) + return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + + path = mnt_fs_get_target(fs); + if (!path) + continue; + + if (!path_startswith(path, prefix)) + continue; + + type = mnt_fs_get_fstype(fs); + if (!type) + continue; + + /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid + * triggering them, as we don't make any guarantees for future submounts anyway. If + * they are already triggered, then we will find another entry for this. */ + if (streq(type, "autofs")) { + top_autofs = top_autofs || path_equal(path, prefix); + continue; + } + + if (set_contains(done, path)) + continue; + + /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount + * we shall operate on. 
*/ + if (!path_equal(path, prefix)) { + bool deny_listed = false; + + STRV_FOREACH(i, deny_list) { + if (path_equal(*i, prefix)) + continue; + + if (!path_startswith(*i, prefix)) + continue; + + if (path_startswith(path, *i)) { + deny_listed = true; + log_trace("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix); + break; + } + } + + if (deny_listed) + continue; + } + + opts = mnt_fs_get_vfs_options(fs); + if (opts) { + r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP)); + if (r < 0) + log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path); + } + + d = strdup(path); + if (!d) + return -ENOMEM; + + r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags)); + if (r == -EEXIST) + /* If the same path was recorded, but with different mount flags, update it: + * it means a mount point is overmounted, and libmount returns the "bottom" (or + * older one) first, but we want to reapply the flags from the "top" (or newer + * one). See: https://github.com/systemd/systemd/issues/20032 + * Note that this shouldn't really fail, as we were just told that the key + * exists, and it's an update so we want 'd' to be freed immediately. */ + r = hashmap_update(todo, d, ULONG_TO_PTR(flags)); + if (r < 0) + return r; + if (r > 0) + TAKE_PTR(d); + } + + /* Check if the top-level directory was among what we have seen so far. For that check both + * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll + * not include it in either set but will set this bool. */ + if (!set_contains(done, prefix) && + !(top_autofs || hashmap_contains(todo, prefix))) { + + /* The prefix directory itself is not yet a mount, make it one. 
*/ + r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + /* Immediately rescan, so that we pick up the new mount's flags */ + continue; + } + + /* If we have no submounts to process anymore, we are done */ + if (hashmap_isempty(todo)) + return 0; + + for (;;) { + unsigned long flags; + char *x = NULL; + + /* Take the first mount from our list of mounts to still process */ + flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x)); + if (!x) + break; + + r = set_ensure_consume(&done, &path_hash_ops_free, x); + if (IN_SET(r, 0, -EEXIST)) + continue; /* Already done */ + if (r < 0) + return r; + + /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's + * the default anyway, thus redundant, and in userns we'll get an error if we try to + * explicitly enable it) */ + r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL); + if (r < 0) { + int q; + + /* OK, so the remount of this entry failed. We'll ultimately ignore this in + * almost all cases (there are simply so many reasons why this can fail, + * think autofs, NFS, FUSE, …), but let's generate useful debug messages at + * the very least. */ + + q = path_is_mount_point(x, NULL, 0); + if (IN_SET(q, 0, -ENOENT)) { + /* Hmm, whaaaa? The mount point is not actually a mount point? Then + * it is either obstructed by a later mount or somebody has been + * racing against us and removed it. Either way the mount point + * doesn't matter to us, let's ignore it hence. */ + log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x); + continue; + } + if (q < 0) /* Any other error on this? 
Just log and continue */ + log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x); + + if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */ + log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x); + continue; + } + + /* Make this fatal if this is the top-level mount */ + if (path_equal(x, prefix)) + return r; + + /* If this is not the top-level mount, then handle this gracefully: log but + * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons + * this might fail without a chance for us to do anything about it, let's + * hence be strict on the top-level mount and lenient on the inner ones. */ + log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x); + continue; + } + + log_trace("Remounted %s.", x); + } + } +} + +int bind_remount_one_with_mountinfo( + const char *path, + unsigned long new_flags, + unsigned long flags_mask, + FILE *proc_self_mountinfo) { + + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + unsigned long flags = 0; + struct libmnt_fs *fs; + const char *opts; + int r; + + assert(path); + assert(proc_self_mountinfo); + + if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && !skip_mount_set_attr) { + /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */ + + if (mount_setattr(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW, + &(struct mount_attr) { + .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask), + .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask), + }, MOUNT_ATTR_SIZE_VER0) < 0) { + + log_debug_errno(errno, "mount_setattr() didn't work, falling back to classic remounting: %m"); + + if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */ + skip_mount_set_attr = true; + } else + return 0; /* Nice, this worked! 
*/ + } + + rewind(proc_self_mountinfo); + + table = mnt_new_table(); + if (!table) + return -ENOMEM; + + r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo"); + if (r < 0) + return r; + + fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD); + if (!fs) { + if (laccess(path, F_OK) < 0) /* Hmm, it's not in the mount table, but does it exist at all? */ + return -errno; + + return -EINVAL; /* Not a mount point we recognize */ + } + + opts = mnt_fs_get_vfs_options(fs); + if (opts) { + r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP)); + if (r < 0) + log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path); + } + + r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL); + if (r < 0) { + if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again, + * since kernel adds it in + * everywhere, because it's the + * default. */ + return r; + + /* Let's handle redundant remounts gracefully */ + log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path); + } + + return 0; +} + +static int mount_switch_root_pivot(int fd_newroot, const char *path) { + assert(fd_newroot >= 0); + assert(path); + + /* Change into the new rootfs. */ + if (fchdir(fd_newroot) < 0) + return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path); + + /* Let the kernel tuck the new root under the old one. */ + if (pivot_root(".", ".") < 0) + return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path); + + /* Get rid of the old root and reveal our brand new root. (This will always operate on the top-most + * mount on our cwd, regardless what our current directory actually points to.) 
*/ + if (umount2(".", MNT_DETACH) < 0) + return log_debug_errno(errno, "Failed to unmount old rootfs: %m"); + + return 0; +} + +static int mount_switch_root_move(int fd_newroot, const char *path) { + assert(fd_newroot >= 0); + assert(path); + + /* Change into the new rootfs. */ + if (fchdir(fd_newroot) < 0) + return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path); + + /* Move the new root fs */ + if (mount(".", "/", NULL, MS_MOVE, NULL) < 0) + return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path); + + /* Also change root dir */ + if (chroot(".") < 0) + return log_debug_errno(errno, "Failed to chroot to new rootfs '%s': %m", path); + + return 0; +} + +int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move) { + _cleanup_close_ int fd_newroot = -EBADF; + int r; + + assert(path); + assert(mount_propagation_flag_is_valid(mount_propagation_flag)); + + fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + if (fd_newroot < 0) + return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path); + + if (!force_ms_move) { + r = mount_switch_root_pivot(fd_newroot, path); + if (r < 0) { + log_debug_errno(r, "Failed to pivot into new rootfs '%s', will try to use MS_MOVE instead: %m", path); + force_ms_move = true; + } + } + if (force_ms_move) { + /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the rootfs is + * an initramfs in which case pivot_root() isn't supported. */ + r = mount_switch_root_move(fd_newroot, path); + if (r < 0) + return log_debug_errno(r, "Failed to switch to new rootfs '%s' with MS_MOVE: %m", path); + } + + /* Finally, let's establish the requested propagation flags. 
*/ + if (mount_propagation_flag == 0) + return 0; + + /* MS_REC makes the propagation change apply to every mount below the new root, not only the top one. + * The log arguments follow the format string: first the rootfs path, then the propagation mode name. */ + if (mount(NULL, ".", NULL, mount_propagation_flag | MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m", + path, mount_propagation_flag_to_string(mount_propagation_flag)); + + return 0; +} + +int repeat_unmount(const char *path, int flags) { + bool done = false; + + assert(path); + + /* If there are multiple mounts on a mount point, this + * removes them all */ + + for (;;) { + if (umount2(path, flags) < 0) { + + if (errno == EINVAL) + return done; + + return -errno; + } + + done = true; + } +} + +int mode_to_inaccessible_node( + const char *runtime_dir, + mode_t mode, + char **ret) { + + /* This function maps a node type to a corresponding inaccessible file node. These nodes are created + * during early boot by PID 1. In some cases we lacked the privs to create the character and block + * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a + * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that + * case we use an AF_UNIX file node instead, which is not the same, but close enough for most + * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory + * file nodes, and that's the most important thing that matters. + * + * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if + * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context.
*/ + + _cleanup_free_ char *d = NULL; + const char *node; + + assert(ret); + + if (!runtime_dir) + runtime_dir = "/run"; + + if (S_ISLNK(mode)) + return -EINVAL; + + node = inode_type_to_string(mode); + if (!node) + return -EINVAL; + + d = path_join(runtime_dir, "systemd/inaccessible", node); + if (!d) + return -ENOMEM; + + /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also + * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no + * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the + * inaccessible block device node let's see if the block device node actually exists, and if not, + * fall back to the character device node. From there fall back to the socket device node. This means + * in the best case we'll get the right device node type — but if not we'll hopefully at least get a + * device node at all. */ + + if (S_ISBLK(mode) && + access(d, F_OK) < 0 && errno == ENOENT) { + free(d); + d = path_join(runtime_dir, "/systemd/inaccessible/chr"); + if (!d) + return -ENOMEM; + } + + if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) && + access(d, F_OK) < 0 && errno == ENOENT) { + free(d); + d = path_join(runtime_dir, "/systemd/inaccessible/sock"); + if (!d) + return -ENOMEM; + } + + *ret = TAKE_PTR(d); + return 0; +} + +int mount_flags_to_string(unsigned long flags, char **ret) { + static const struct { + unsigned long flag; + const char *name; + } map[] = { + { .flag = MS_RDONLY, .name = "MS_RDONLY", }, + { .flag = MS_NOSUID, .name = "MS_NOSUID", }, + { .flag = MS_NODEV, .name = "MS_NODEV", }, + { .flag = MS_NOEXEC, .name = "MS_NOEXEC", }, + { .flag = MS_SYNCHRONOUS, .name = "MS_SYNCHRONOUS", }, + { .flag = MS_REMOUNT, .name = "MS_REMOUNT", }, + { .flag = MS_MANDLOCK, .name = "MS_MANDLOCK", }, + { .flag = MS_DIRSYNC, .name = "MS_DIRSYNC", }, + { .flag = MS_NOSYMFOLLOW, .name = "MS_NOSYMFOLLOW", }, + { .flag = MS_NOATIME, .name = 
"MS_NOATIME", }, + { .flag = MS_NODIRATIME, .name = "MS_NODIRATIME", }, + { .flag = MS_BIND, .name = "MS_BIND", }, + { .flag = MS_MOVE, .name = "MS_MOVE", }, + { .flag = MS_REC, .name = "MS_REC", }, + { .flag = MS_SILENT, .name = "MS_SILENT", }, + { .flag = MS_POSIXACL, .name = "MS_POSIXACL", }, + { .flag = MS_UNBINDABLE, .name = "MS_UNBINDABLE", }, + { .flag = MS_PRIVATE, .name = "MS_PRIVATE", }, + { .flag = MS_SLAVE, .name = "MS_SLAVE", }, + { .flag = MS_SHARED, .name = "MS_SHARED", }, + { .flag = MS_RELATIME, .name = "MS_RELATIME", }, + { .flag = MS_KERNMOUNT, .name = "MS_KERNMOUNT", }, + { .flag = MS_I_VERSION, .name = "MS_I_VERSION", }, + { .flag = MS_STRICTATIME, .name = "MS_STRICTATIME", }, + { .flag = MS_LAZYTIME, .name = "MS_LAZYTIME", }, + }; + _cleanup_free_ char *str = NULL; + + assert(ret); + + for (size_t i = 0; i < ELEMENTSOF(map); i++) + if (flags & map[i].flag) { + if (!strextend_with_separator(&str, "|", map[i].name)) + return -ENOMEM; + flags &= ~map[i].flag; + } + + if (!str || flags != 0) + if (strextendf_with_separator(&str, "|", "%lx", flags) < 0) + return -ENOMEM; + + *ret = TAKE_PTR(str); + return 0; +} + +int mount_verbose_full( + int error_log_level, + const char *what, + const char *where, + const char *type, + unsigned long flags, + const char *options, + bool follow_symlink) { + + _cleanup_free_ char *fl = NULL, *o = NULL; + unsigned long f; + int r; + + r = mount_option_mangle(options, flags, &f, &o); + if (r < 0) + return log_full_errno(error_log_level, r, + "Failed to mangle mount options %s: %m", + strempty(options)); + + (void) mount_flags_to_string(f, &fl); + + if (FLAGS_SET(f, MS_REMOUNT|MS_BIND)) + log_debug("Changing mount flags %s (%s \"%s\")...", + where, strnull(fl), strempty(o)); + else if (f & MS_REMOUNT) + log_debug("Remounting superblock %s (%s \"%s\")...", + where, strnull(fl), strempty(o)); + else if (f & (MS_SHARED|MS_PRIVATE|MS_SLAVE|MS_UNBINDABLE)) + log_debug("Changing mount propagation %s (%s \"%s\")", + where, 
strnull(fl), strempty(o)); + else if (f & MS_BIND) + log_debug("Bind-mounting %s on %s (%s \"%s\")...", + what, where, strnull(fl), strempty(o)); + else if (f & MS_MOVE) + log_debug("Moving mount %s %s %s (%s \"%s\")...", + what, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), where, strnull(fl), strempty(o)); + else + log_debug("Mounting %s (%s) on %s (%s \"%s\")...", + strna(what), strna(type), where, strnull(fl), strempty(o)); + + if (follow_symlink) + r = RET_NERRNO(mount(what, where, type, f, o)); + else + r = mount_nofollow(what, where, type, f, o); + if (r < 0) + return log_full_errno(error_log_level, r, + "Failed to mount %s (type %s) on %s (%s \"%s\"): %m", + strna(what), strna(type), where, strnull(fl), strempty(o)); + return 0; +} + +int umount_verbose( + int error_log_level, + const char *what, + int flags) { + + assert(what); + + log_debug("Umounting %s...", what); + + if (umount2(what, flags) < 0) + return log_full_errno(error_log_level, errno, + "Failed to unmount %s: %m", what); + + return 0; +} + +int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) { + int r; + + assert(fsmount_fd >= 0); + assert(dest); + + /* First, try to mount beneath an existing mount point, and if that works, umount the old mount, + * which is now at the top. This will ensure we can atomically replace a mount. Note that this works + * also in the case where there are submounts down the tree. Mount propagation is allowed but + * restricted to layouts that don't end up propagation the new mount on top of the mount stack. If + * this is not supported (minimum kernel v6.5), or if there is no mount on the mountpoint, we get + * -EINVAL and then we fallback to normal mounting. */ + + r = RET_NERRNO(move_mount( + fsmount_fd, + /* from_path= */ "", + /* to_fd= */ -EBADF, + dest, + MOVE_MOUNT_F_EMPTY_PATH | (mount_beneath ? 
MOVE_MOUNT_BENEATH : 0))); + if (mount_beneath) { + if (r == -EINVAL) { /* Fallback if mount_beneath is not supported */ + log_debug_errno(r, + "Failed to mount beneath '%s', falling back to overmount", + dest); + return RET_NERRNO(move_mount( + fsmount_fd, + /* from_path= */ "", + /* to_fd= */ -EBADF, + dest, + MOVE_MOUNT_F_EMPTY_PATH)); + } + + if (r >= 0) /* If it is, now remove the old mount */ + return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH); + } + + return r; +} + +int mount_option_mangle( + const char *options, + unsigned long mount_flags, + unsigned long *ret_mount_flags, + char **ret_remaining_options) { + + const struct libmnt_optmap *map; + _cleanup_free_ char *ret = NULL; + int r; + + /* This extracts mount flags from the mount options, and stores + * non-mount-flag options to '*ret_remaining_options'. + * E.g., + * "rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000" + * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and + * "size=1630748k,mode=0700,uid=1000,gid=1000". + * See more examples in test-mount-util.c. + * + * If 'options' does not contain any non-mount-flag options, + * then '*ret_remaining_options' is set to NULL instead of empty string. + * The validity of options stored in '*ret_remaining_options' is not checked. + * If 'options' is NULL, this just copies 'mount_flags' to *ret_mount_flags. */ + + assert(ret_mount_flags); + assert(ret_remaining_options); + + map = mnt_get_builtin_optmap(MNT_LINUX_MAP); + if (!map) + return -EINVAL; + + for (const char *p = options;;) { + _cleanup_free_ char *word = NULL; + const struct libmnt_optmap *ent; + + r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE); + if (r < 0) + return r; + if (r == 0) + break; + + for (ent = map; ent->name; ent++) { + /* All entries in MNT_LINUX_MAP do not take any argument. + * Thus, ent->name does not contain "=" or "[=]". 
*/ + if (!streq(word, ent->name)) + continue; + + if (!(ent->mask & MNT_INVERT)) + mount_flags |= ent->id; + else if (mount_flags & ent->id) + mount_flags ^= ent->id; + + break; + } + + /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */ + if (!ent->name && + !startswith_no_case(word, "x-") && + !strextend_with_separator(&ret, ",", word)) + return -ENOMEM; + } + + *ret_mount_flags = mount_flags; + *ret_remaining_options = TAKE_PTR(ret); + + return 0; +} + +static int mount_in_namespace_legacy( + const char *chased_src_path, + int chased_src_fd, + struct stat *chased_src_st, + const char *propagate_path, + const char *incoming_path, + const char *dest, + int pidns_fd, + int mntns_fd, + int root_fd, + bool read_only, + bool make_file_or_directory, + const MountOptions *options, + const ImagePolicy *image_policy, + bool is_image) { + + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p; + bool mount_slave_created = false, mount_slave_mounted = false, + mount_tmp_created = false, mount_tmp_mounted = false, + mount_outside_created = false, mount_outside_mounted = false; + pid_t child; + int r; + + assert(chased_src_path); + assert(chased_src_fd >= 0); + assert(chased_src_st); + assert(propagate_path); + assert(incoming_path); + assert(dest); + assert(pidns_fd >= 0); + assert(mntns_fd >= 0); + assert(root_fd >= 0); + assert(!options || is_image); + + p = strjoina(propagate_path, "/"); + r = laccess(p, F_OK); + if (r < 0) + return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points"); + + /* Our goal is to install a new bind mount into the container, + possibly read-only. This is irritatingly complex + unfortunately, currently. + + First, we start by creating a private playground in /tmp, + that we can mount MS_SLAVE. 
(Which is necessary, since + MS_MOVE cannot be applied to mounts with MS_SHARED parent + mounts.) */ + + if (!mkdtemp(mount_slave)) + return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave); + + mount_slave_created = true; + + r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL); + if (r < 0) + goto finish; + + mount_slave_mounted = true; + + r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL); + if (r < 0) + goto finish; + + /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */ + mount_tmp = strjoina(mount_slave, "/mount"); + if (is_image) + r = mkdir_p(mount_tmp, 0700); + else + r = make_mount_point_inode_from_stat(chased_src_st, mount_tmp, 0700); + if (r < 0) { + log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp); + goto finish; + } + + mount_tmp_created = true; + + if (is_image) + r = verity_dissect_and_mount( + chased_src_fd, + chased_src_path, + mount_tmp, + options, + image_policy, + /* required_host_os_release_id= */ NULL, + /* required_host_os_release_version_id= */ NULL, + /* required_host_os_release_sysext_level= */ NULL, + /* required_host_os_release_confext_level= */ NULL, + /* required_sysext_scope= */ NULL, + /* ret_image= */ NULL); + else + r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(chased_src_fd), mount_tmp, NULL, MS_BIND, NULL); + if (r < 0) + goto finish; + + mount_tmp_mounted = true; + + /* Third, we remount the new bind mount read-only if requested. */ + if (read_only) { + r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL); + if (r < 0) + goto finish; + } + + /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only + * right-away. */ + + mount_outside = strjoina(propagate_path, "/XXXXXX"); + if (is_image || S_ISDIR(chased_src_st->st_mode)) + r = mkdtemp(mount_outside) ? 
0 : -errno; + else { + r = mkostemp_safe(mount_outside); + safe_close(r); + } + if (r < 0) { + log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside); + goto finish; + } + + mount_outside_created = true; + + r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL); + if (r < 0) + goto finish; + + mount_outside_mounted = true; + mount_tmp_mounted = false; + + if (is_image || S_ISDIR(chased_src_st->st_mode)) + (void) rmdir(mount_tmp); + else + (void) unlink(mount_tmp); + mount_tmp_created = false; + + (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW); + mount_slave_mounted = false; + + (void) rmdir(mount_slave); + mount_slave_created = false; + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) { + log_debug_errno(errno, "Failed to create pipe: %m"); + goto finish; + } + + r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, + pidns_fd, mntns_fd, -1, -1, root_fd, &child); + if (r < 0) + goto finish; + if (r == 0) { + _cleanup_free_ char *mount_outside_fn = NULL, *mount_inside = NULL; + + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + if (make_file_or_directory) { + if (!is_image) { + (void) mkdir_parents(dest, 0755); + (void) make_mount_point_inode_from_stat(chased_src_st, dest, 0700); + } else + (void) mkdir_p(dest, 0755); + } + + /* Fifth, move the mount to the right place inside */ + r = path_extract_filename(mount_outside, &mount_outside_fn); + if (r < 0) { + log_debug_errno(r, "Failed to extract filename from propagation file or directory '%s': %m", mount_outside); + goto child_fail; + } + + mount_inside = path_join(incoming_path, mount_outside_fn); + if (!mount_inside) { + r = log_oom_debug(); + goto child_fail; + } + + r = mount_nofollow_verbose(LOG_DEBUG, mount_inside, dest, NULL, MS_MOVE, NULL); + if (r < 0) + goto child_fail; + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + 
errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + _exit(EXIT_FAILURE); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0); + if (r < 0) { + log_debug_errno(r, "Failed to wait for child: %m"); + goto finish; + } + if (r != EXIT_SUCCESS) { + if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r)) + log_debug_errno(r, "Failed to mount: %m"); + else + log_debug("Child failed."); + goto finish; + } + +finish: + if (mount_outside_mounted) + (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW); + if (mount_outside_created) { + if (is_image || S_ISDIR(chased_src_st->st_mode)) + (void) rmdir(mount_outside); + else + (void) unlink(mount_outside); + } + + if (mount_tmp_mounted) + (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW); + if (mount_tmp_created) { + if (is_image || S_ISDIR(chased_src_st->st_mode)) + (void) rmdir(mount_tmp); + else + (void) unlink(mount_tmp); + } + + if (mount_slave_mounted) + (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW); + if (mount_slave_created) + (void) rmdir(mount_slave); + + return r; +} + +static int mount_in_namespace( + const PidRef *target, + const char *propagate_path, + const char *incoming_path, + const char *src, + const char *dest, + bool read_only, + bool make_file_or_directory, + const MountOptions *options, + const ImagePolicy *image_policy, + bool is_image) { + + _cleanup_(dissected_image_unrefp) DissectedImage *img = NULL; + _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR; + _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF, chased_src_fd = -EBADF, + new_mount_fd = -EBADF; + _cleanup_free_ char *chased_src_path = NULL; + struct stat st; + pid_t child; + int r; + + assert(propagate_path); + assert(incoming_path); + assert(src); + assert(dest); + assert(!options || is_image); + + if (!pidref_is_set(target)) + return -ESRCH; + + r = namespace_open(target->pid, &pidns_fd, &mntns_fd, NULL, 
NULL, &root_fd); + if (r < 0) + return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m"); + + r = in_same_namespace(target->pid, 0, NAMESPACE_MOUNT); + if (r < 0) + return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m"); + /* We can't add new mounts at runtime if the process wasn't started in a namespace */ + if (r > 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace"); + + r = pidref_verify(target); + if (r < 0) + return log_debug_errno(r, "Failed to verify target process '" PID_FMT "': %m", target->pid); + + r = chase(src, NULL, 0, &chased_src_path, &chased_src_fd); + if (r < 0) + return log_debug_errno(r, "Failed to resolve source path of %s: %m", src); + log_debug("Chased source path of %s to %s", src, chased_src_path); + + if (fstat(chased_src_fd, &st) < 0) + return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src); + if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */ + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src); + + if (!mount_new_api_supported()) /* Fallback if we can't use the new mount API */ + return mount_in_namespace_legacy( + chased_src_path, + chased_src_fd, + &st, + propagate_path, + incoming_path, + dest, + pidns_fd, + mntns_fd, + root_fd, + read_only, + make_file_or_directory, + options, + image_policy, + is_image); + + if (is_image) { + r = verity_dissect_and_mount( + chased_src_fd, + chased_src_path, + /* dest= */ NULL, + options, + image_policy, + /* required_host_os_release_id= */ NULL, + /* required_host_os_release_version_id= */ NULL, + /* required_host_os_release_sysext_level= */ NULL, + /* required_host_os_release_confext_level= */ NULL, + /* required_sysext_scope= */ NULL, + &img); + if (r < 0) + return log_debug_errno( + r, + "Failed to 
dissect and mount image %s: %m", + chased_src_path); + } else { + new_mount_fd = open_tree( + chased_src_fd, + "", + OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH); + if (new_mount_fd < 0) + return log_debug_errno( + errno, + "Failed to open mount point \"%s\": %m", + chased_src_path); + + if (read_only && mount_setattr(new_mount_fd, "", AT_EMPTY_PATH, + &(struct mount_attr) { + .attr_set = MOUNT_ATTR_RDONLY, + }, MOUNT_ATTR_SIZE_VER0) < 0) + return log_debug_errno( + errno, + "Failed to set mount flags for \"%s\": %m", + chased_src_path); + } + + if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) + return log_debug_errno(errno, "Failed to create pipe: %m"); + + r = namespace_fork("(sd-bindmnt)", + "(sd-bindmnt-inner)", + /* except_fds= */ NULL, + /* n_except_fds= */ 0, + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, + pidns_fd, + mntns_fd, + /* netns_fd= */ -1, + /* userns_fd= */ -1, + root_fd, + &child); + if (r < 0) + return log_debug_errno(r, "Failed to fork off: %m"); + if (r == 0) { + errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); + + if (make_file_or_directory) + (void) mkdir_parents(dest, 0755); + + if (img) { + DissectImageFlags f = DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE; + + if (make_file_or_directory) + f |= DISSECT_IMAGE_MKDIR; + + if (read_only) + f |= DISSECT_IMAGE_READ_ONLY; + + r = dissected_image_mount( + img, + dest, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + f); + } else { + if (make_file_or_directory) + (void) make_mount_point_inode_from_stat(&st, dest, 0700); + + r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true); + } + if (r < 0) { + (void) write(errno_pipe_fd[1], &r, sizeof(r)); + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]); + + r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0); + if (r < 0) + return log_debug_errno(r, "Failed 
to wait for child: %m"); + if (r != EXIT_SUCCESS) { + if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r)) + return log_debug_errno(r, "Failed to mount: %m"); + + return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Child failed."); + } + + return 0; +} + +int bind_mount_in_namespace( + PidRef * target, + const char *propagate_path, + const char *incoming_path, + const char *src, + const char *dest, + bool read_only, + bool make_file_or_directory) { + + return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, /* options= */ NULL, /* image_policy= */ NULL, /* is_image= */ false); +} + +int mount_image_in_namespace( + PidRef * target, + const char *propagate_path, + const char *incoming_path, + const char *src, + const char *dest, + bool read_only, + bool make_file_or_directory, + const MountOptions *options, + const ImagePolicy *image_policy) { + + return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, options, image_policy, /* is_image=*/ true); +} + +int make_mount_point(const char *path) { + int r; + + assert(path); + + /* If 'path' is already a mount point, does nothing and returns 0. If it is not it makes it one, and returns 1. 
*/

        r = path_is_mount_point(path, NULL, 0);
        if (r < 0)
                return log_debug_errno(r, "Failed to determine whether '%s' is a mount point: %m", path);
        if (r > 0)
                return 0;

        /* Not a mount point yet: recursively bind mount the path onto itself */
        r = mount_nofollow_verbose(LOG_DEBUG, path, path, NULL, MS_BIND|MS_REC, NULL);
        if (r < 0)
                return r;

        return 1;
}

/* If 'fd' already refers to a mount point, does nothing and returns 0. Otherwise recursively bind
 * mounts the inode onto itself, addressing it via FORMAT_PROC_FD_PATH(fd), and returns 1. Returns a
 * negative errno-style error on failure. */
int fd_make_mount_point(int fd) {
        int r;

        assert(fd >= 0);

        r = fd_is_mount_point(fd, NULL, 0);
        if (r < 0)
                return log_debug_errno(r, "Failed to determine whether file descriptor is a mount point: %m");
        if (r > 0)
                return 0;

        r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(fd), FORMAT_PROC_FD_PATH(fd), NULL, MS_BIND|MS_REC, NULL);
        if (r < 0)
                return r;

        return 1;
}

/* Builds the ID map text selected by 'idmapping' from uid_shift/uid_range/owner and acquires a user
 * namespace for it. Returns the user namespace fd on success (ownership passes to the caller),
 * -EINVAL if userns_shift_range_valid() rejects the shift/range, or another negative errno-style
 * error on failure. */
int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
        _cleanup_close_ int userns_fd = -EBADF;
        _cleanup_free_ char *line = NULL;

        /* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
         * process whose only purpose is to give us a new user namespace. It's killed when we got it. */

        if (!userns_shift_range_valid(uid_shift, uid_range))
                return -EINVAL;

        if (IN_SET(idmapping, REMOUNT_IDMAPPING_NONE, REMOUNT_IDMAPPING_HOST_ROOT)) {
                /* Base entry: "0 <uid_shift> <uid_range>" */
                if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
                        return log_oom_debug();

                /* If requested we'll include an entry in the mapping so that the host root user can make
                 * changes to the uidmapped mount like it normally would. Specifically, we'll map the user
                 * with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
                 * to create various missing inodes in the OS tree before booting into it, and this becomes
                 * very easy and straightforward to do if it can just do it under its own regular UID. Note
                 * that in that case the container's runtime uidmap (i.e. the one the container payload
                 * processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
                 * by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
                 * which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
                 * to the container's own UID range, but it's good to have a safety net, in case we
                 * forget it.) */
                if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
                        if (strextendf(&line,
                                       UID_FMT " " UID_FMT " " UID_FMT "\n",
                                       UID_MAPPED_ROOT, 0u, 1u) < 0)
                                return log_oom_debug();
        }

        if (idmapping == REMOUNT_IDMAPPING_HOST_OWNER) {
                /* Remap the owner of the bind mounted directory to the root user within the container. This
                 * way every file written by root within the container to the bind-mounted directory will
                 * be owned by the original user. All other users will remain unmapped. */
                if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", owner, uid_shift, 1u) < 0)
                        return log_oom_debug();
        }

        /* NOTE(review): for any other 'idmapping' value 'line' is still NULL here — confirm that
         * userns_acquire() copes with that or that callers never pass such a value. */

        /* We always assign the same UID and GID ranges */
        userns_fd = userns_acquire(line, line);
        if (userns_fd < 0)
                return log_debug_errno(userns_fd, "Failed to acquire new userns: %m");

        return TAKE_FD(userns_fd);
}

/* Replaces each mount listed in 'paths' by an ID-mapped clone of itself, using 'userns_fd' as the
 * mapping. Returns 0 on success (also if 'paths' is empty), a negative errno-style error otherwise.
 * Note that there is no rollback: if unmounting or re-attaching fails half-way, the paths handled so
 * far are left in their current state. */
int remount_idmap_fd(
                char **paths,
                int userns_fd) {

        int r;

        assert(userns_fd >= 0);

        /* This remounts all specified paths with the specified userns as idmap. It will do so in the
         * order specified in the strv: the expectation is that the top-level directories are at the
         * beginning, and nested directories in the right, so that the tree can be built correctly from left
         * to right. */

        size_t n = strv_length(paths);
        if (n == 0) /* Nothing to do? */
                return 0;

        int *mount_fds = NULL;
        size_t n_mounts_fds = 0;

        mount_fds = new(int, n);
        if (!mount_fds)
                return log_oom_debug();

        /* From here on every fd counted in n_mounts_fds is handed to close_many_and_free() together
         * with the array when we leave the function */
        CLEANUP_ARRAY(mount_fds, n_mounts_fds, close_many_and_free);

        for (size_t i = 0; i < n; i++) {
                int mntfd;

                /* Clone the mount point */
                mntfd = mount_fds[n_mounts_fds] = open_tree(-EBADF, paths[i], OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
                if (mount_fds[n_mounts_fds] < 0)
                        return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", paths[i]);

                /* Only count the slot once it holds a valid fd */
                n_mounts_fds++;

                /* Set the user namespace mapping attribute on the cloned mount point */
                if (mount_setattr(mntfd, "", AT_EMPTY_PATH,
                                  &(struct mount_attr) {
                                          .attr_set = MOUNT_ATTR_IDMAP,
                                          .userns_fd = userns_fd,
                                  }, sizeof(struct mount_attr)) < 0)
                        return log_debug_errno(errno, "Failed to change bind mount attributes for clone of '%s': %m", paths[i]);
        }

        for (size_t i = n; i > 0; i--) { /* Unmount the paths right-to-left */
                /* Remove the old mount points now that we have idmapped mounts as replacement for all of them */
                r = umount_verbose(LOG_DEBUG, paths[i-1], UMOUNT_NOFOLLOW);
                if (r < 0)
                        return r;
        }

        for (size_t i = 0; i < n; i++) { /* Mount the replacement mounts left-to-right */
                /* And place the cloned version in its place */
                log_debug("Mounting idmapped fs to '%s'", paths[i]);
                if (move_mount(mount_fds[i], "", -EBADF, paths[i], MOVE_MOUNT_F_EMPTY_PATH) < 0)
                        return log_debug_errno(errno, "Failed to attach UID mapped mount to '%s': %m", paths[i]);
        }

        return 0;
}

/* Convenience wrapper: creates the user namespace via make_userns() and applies it to all paths in
 * 'p' via remount_idmap_fd(). The user namespace fd is closed again before returning. Returns 0 on
 * success, a negative errno-style error otherwise. */
int remount_idmap(char **p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
        _cleanup_close_ int userns_fd = -EBADF;

        userns_fd = make_userns(uid_shift, uid_range, owner, idmapping);
        if (userns_fd < 0)
                return userns_fd;

        return remount_idmap_fd(p, userns_fd);
}

/* One mount below a prefix: the mount's path plus an fd referring to it */
typedef struct SubMount {
        char *path;
        int mount_fd;
} SubMount;

static void sub_mount_clear(SubMount *s) {
        assert(s);

s->path = mfree(s->path); + s->mount_fd = safe_close(s->mount_fd); +} + +static void sub_mount_array_free(SubMount *s, size_t n) { + assert(s || n == 0); + + for (size_t i = 0; i < n; i++) + sub_mount_clear(s + i); + + free(s); +} + +static int sub_mount_compare(const SubMount *a, const SubMount *b) { + assert(a); + assert(b); + assert(a->path); + assert(b->path); + + return path_compare(a->path, b->path); +} + +static void sub_mount_drop(SubMount *s, size_t n) { + assert(s || n == 0); + + for (size_t m = 0, i = 1; i < n; i++) { + if (path_startswith(s[i].path, s[m].path)) + sub_mount_clear(s + i); + else + m = i; + } +} + +static int get_sub_mounts( + const char *prefix, + SubMount **ret_mounts, + size_t *ret_n_mounts) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + SubMount *mounts = NULL; + size_t n = 0; + int r; + + CLEANUP_ARRAY(mounts, n, sub_mount_array_free); + + assert(prefix); + assert(ret_mounts); + assert(ret_n_mounts); + + r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter); + if (r < 0) + return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + + for (;;) { + _cleanup_close_ int mount_fd = -EBADF; + _cleanup_free_ char *p = NULL; + struct libmnt_fs *fs; + const char *path; + int id1, id2; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) + break; /* EOF */ + if (r < 0) + return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + + path = mnt_fs_get_target(fs); + if (!path) + continue; + + if (isempty(path_startswith(path, prefix))) + continue; + + id1 = mnt_fs_get_id(fs); + r = path_get_mnt_id(path, &id2); + if (r < 0) { + log_debug_errno(r, "Failed to get mount ID of '%s', ignoring: %m", path); + continue; + } + if (id1 != id2) { + /* The path may be hidden by another over-mount or already remounted. 
*/ + log_debug("The mount IDs of '%s' obtained by libmount and path_get_mnt_id() are different (%i vs %i), ignoring.", + path, id1, id2); + continue; + } + + mount_fd = open(path, O_CLOEXEC|O_PATH); + if (mount_fd < 0) { + if (errno == ENOENT) /* The path may be hidden by another over-mount or already unmounted. */ + continue; + + return log_debug_errno(errno, "Failed to open subtree of mounted filesystem '%s': %m", path); + } + + p = strdup(path); + if (!p) + return log_oom_debug(); + + if (!GREEDY_REALLOC(mounts, n + 1)) + return log_oom_debug(); + + mounts[n++] = (SubMount) { + .path = TAKE_PTR(p), + .mount_fd = TAKE_FD(mount_fd), + }; + } + + typesafe_qsort(mounts, n, sub_mount_compare); + sub_mount_drop(mounts, n); + + *ret_mounts = TAKE_PTR(mounts); + *ret_n_mounts = n; + return 0; +} + +int bind_mount_submounts( + const char *source, + const char *target) { + + SubMount *mounts = NULL; + size_t n = 0; + int ret = 0, r; + + /* Bind mounts all child mounts of 'source' to 'target'. Useful when setting up a new procfs instance + * with new mount options to copy the original submounts over. 
*/ + + assert(source); + assert(target); + + CLEANUP_ARRAY(mounts, n, sub_mount_array_free); + + r = get_sub_mounts(source, &mounts, &n); + if (r < 0) + return r; + + FOREACH_ARRAY(m, mounts, n) { + _cleanup_free_ char *t = NULL; + const char *suffix; + + if (isempty(m->path)) + continue; + + assert_se(suffix = path_startswith(m->path, source)); + + t = path_join(target, suffix); + if (!t) + return -ENOMEM; + + r = path_is_mount_point(t, NULL, 0); + if (r < 0) { + log_debug_errno(r, "Failed to detect if '%s' already is a mount point, ignoring: %m", t); + continue; + } + if (r > 0) { + log_debug("Not bind mounting '%s' from '%s' to '%s', since there's already a mountpoint.", suffix, source, target); + continue; + } + + r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(m->mount_fd), t, NULL, MS_BIND|MS_REC, NULL); + if (r < 0 && ret == 0) + ret = r; + } + + return ret; +} + +int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode) { + assert(st); + assert(dest); + + if (S_ISDIR(st->st_mode)) + return mkdir_label(dest, mode); + else + return RET_NERRNO(mknod(dest, S_IFREG|(mode & ~0111), 0)); +} + +int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t mode) { + struct stat st; + + assert(source); + assert(dest); + + if (stat(source, &st) < 0) + return -errno; + + return make_mount_point_inode_from_stat(&st, dest, mode); +} + +int trigger_automount_at(int dir_fd, const char *path) { + _cleanup_free_ char *nested = NULL; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + nested = path_join(path, "a"); + if (!nested) + return -ENOMEM; + + (void) faccessat(dir_fd, nested, F_OK, 0); + + return 0; +} + +unsigned long credentials_fs_mount_flags(bool ro) { + /* A tight set of mount flags for credentials mounts */ + return MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported()|(ro ? 
MS_RDONLY : 0); +} + +int mount_credentials_fs(const char *path, size_t size, bool ro) { + _cleanup_free_ char *opts = NULL; + int r, noswap_supported; + + /* Mounts a file system we can place credentials in, i.e. with tight access modes right from the + * beginning, and ideally swapping turned off. In order of preference: + * + * 1. tmpfs if it supports "noswap" + * 2. ramfs + * 3. tmpfs if it doesn't support "noswap" + */ + + noswap_supported = mount_option_supported("tmpfs", "noswap", NULL); /* Check explicitly to avoid kmsg noise */ + if (noswap_supported > 0) { + _cleanup_free_ char *noswap_opts = NULL; + + if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0) + return -ENOMEM; + + /* Best case: tmpfs with noswap (needs kernel >= 6.3) */ + + r = mount_nofollow_verbose( + LOG_DEBUG, + "tmpfs", + path, + "tmpfs", + credentials_fs_mount_flags(ro), + noswap_opts); + if (r >= 0) + return r; + } + + r = mount_nofollow_verbose( + LOG_DEBUG, + "ramfs", + path, + "ramfs", + credentials_fs_mount_flags(ro), + "mode=0700"); + if (r >= 0) + return r; + + if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0) + return -ENOMEM; + + return mount_nofollow_verbose( + LOG_DEBUG, + "tmpfs", + path, + "tmpfs", + credentials_fs_mount_flags(ro), + opts); +} + +int make_fsmount( + int error_log_level, + const char *what, + const char *type, + unsigned long flags, + const char *options, + int userns_fd) { + + _cleanup_close_ int fs_fd = -EBADF, mnt_fd = -EBADF; + _cleanup_free_ char *o = NULL; + unsigned long f; + int r; + + assert(type); + assert(what); + + r = mount_option_mangle(options, flags, &f, &o); + if (r < 0) + return log_full_errno( + error_log_level, r, "Failed to mangle mount options %s: %m", + strempty(options)); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *fl = NULL; + (void) mount_flags_to_string(f, &fl); + + log_debug("Creating mount fd for %s (%s) (%s \"%s\")...", + strna(what), strna(type), strnull(fl), strempty(o)); + } + 
+ fs_fd = fsopen(type, FSOPEN_CLOEXEC); + if (fs_fd < 0) + return log_full_errno(error_log_level, errno, "Failed to open superblock for \"%s\": %m", type); + + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "source", what, 0) < 0) + return log_full_errno(error_log_level, errno, "Failed to set mount source for \"%s\" to \"%s\": %m", type, what); + + if (FLAGS_SET(f, MS_RDONLY)) + if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0) + return log_full_errno(error_log_level, errno, "Failed to set read only mount flag for \"%s\": %m", type); + + for (const char *p = o;;) { + _cleanup_free_ char *word = NULL; + char *eq; + + r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE); + if (r < 0) + return log_full_errno(error_log_level, r, "Failed to parse mount option string \"%s\": %m", o); + if (r == 0) + break; + + eq = strchr(word, '='); + if (eq) { + *eq = 0; + eq++; + + if (fsconfig(fs_fd, FSCONFIG_SET_STRING, word, eq, 0) < 0) + return log_full_errno(error_log_level, errno, "Failed to set mount option \"%s=%s\" for \"%s\": %m", word, eq, type); + } else { + if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, word, NULL, 0) < 0) + return log_full_errno(error_log_level, errno, "Failed to set mount flag \"%s\" for \"%s\": %m", word, type); + } + } + + if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return log_full_errno(error_log_level, errno, "Failed to realize fs fd for \"%s\" (\"%s\"): %m", what, type); + + mnt_fd = fsmount(fs_fd, FSMOUNT_CLOEXEC, 0); + if (mnt_fd < 0) + return log_full_errno(error_log_level, errno, "Failed to create mount fd for \"%s\" (\"%s\"): %m", what, type); + + if (mount_setattr(mnt_fd, "", AT_EMPTY_PATH|AT_RECURSIVE, + &(struct mount_attr) { + .attr_set = ms_flags_to_mount_attr(f) | (userns_fd >= 0 ? 
MOUNT_ATTR_IDMAP : 0), + .userns_fd = userns_fd, + }, MOUNT_ATTR_SIZE_VER0) < 0) + return log_full_errno(error_log_level, + errno, + "Failed to set mount flags for \"%s\" (\"%s\"): %m", + what, + type); + + return TAKE_FD(mnt_fd); +} diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h new file mode 100644 index 0000000..ef31104 --- /dev/null +++ b/src/shared/mount-util.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dissect-image.h" +#include "errno-util.h" +#include "macro.h" +#include "pidref.h" + +int repeat_unmount(const char *path, int flags); + +int umount_recursive_full(const char *target, int flags, char **keep); + +static inline int umount_recursive(const char *target, int flags) { + return umount_recursive_full(target, flags, NULL); +} + +int bind_remount_recursive_with_mountinfo(const char *prefix, unsigned long new_flags, unsigned long flags_mask, char **deny_list, FILE *proc_self_mountinfo); +static inline int bind_remount_recursive(const char *prefix, unsigned long new_flags, unsigned long flags_mask, char **deny_list) { + return bind_remount_recursive_with_mountinfo(prefix, new_flags, flags_mask, deny_list, NULL); +} + +int bind_remount_one_with_mountinfo(const char *path, unsigned long new_flags, unsigned long flags_mask, FILE *proc_self_mountinfo); + +int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move); +static inline int mount_switch_root(const char *path, unsigned long mount_propagation_flag) { + return mount_switch_root_full(path, mount_propagation_flag, false); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(FILE*, endmntent, NULL); +#define _cleanup_endmntent_ _cleanup_(endmntentp) + +int mount_verbose_full( + int error_log_level, + const char *what, + const char *where, + const char *type, + unsigned long flags, + const char *options, + bool follow_symlink); + +static inline 
int mount_follow_verbose( + int error_log_level, + const char *what, + const char *where, + const char *type, + unsigned long flags, + const char *options) { + /* Forwards every argument to mount_verbose_full() with follow_symlink set to true. */ return mount_verbose_full(error_log_level, what, where, type, flags, options, true); +} + +static inline int mount_nofollow_verbose( + int error_log_level, + const char *what, + const char *where, + const char *type, + unsigned long flags, + const char *options) { + /* Forwards every argument to mount_verbose_full() with follow_symlink set to false. */ return mount_verbose_full(error_log_level, what, where, type, flags, options, false); +} + +int umount_verbose( + int error_log_level, + const char *where, + int flags); + +int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath); + +int mount_option_mangle( + const char *options, + unsigned long mount_flags, + unsigned long *ret_mount_flags, + char **ret_remaining_options); + +int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest); +int mount_flags_to_string(unsigned long flags, char **ret); + +/* Useful for usage with _cleanup_(): if p is non-NULL, unmounts it with umount_recursive(p, 0) and then rmdir()s it, ignoring the result of both calls, and finally frees the pointer by returning mfree(p). NOTE(review): PROTECT_ERRNO presumably keeps the caller's errno intact across these calls - confirm. */ +static inline char* umount_and_rmdir_and_free(char *p) { + PROTECT_ERRNO; + if (p) { + (void) umount_recursive(p, 0); + (void) rmdir(p); + } + return mfree(p); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_rmdir_and_free); + +/* Same as umount_and_rmdir_and_free() minus the rmdir(): a non-NULL p is unmounted with umount_recursive(p, 0), the result is ignored, and mfree(p) is returned. */ static inline char *umount_and_free(char *p) { + PROTECT_ERRNO; + if (p) + (void) umount_recursive(p, 0); + return mfree(p); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_free); + +int bind_mount_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory); +int mount_image_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory, const MountOptions *options, const ImagePolicy *image_policy); + +int make_mount_point(const char *path); +int fd_make_mount_point(int fd); + +typedef 
enum RemountIdmapping { + REMOUNT_IDMAPPING_NONE, + /* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the + * uidmapped fs. This is useful to ensure that the host root user can safely add inodes to the + * uidmapped fs (which otherwise wouldn't work as the host root user is not defined on the uidmapped + * mount and any attempts to create inodes will then be refused with EOVERFLOW). The idea is that + * these inodes are quickly re-chown()ed to more suitable UIDs/GIDs. Any code that intends to be able + * to add inodes to file systems mapped this way should set this flag, but given it comes with + * certain security implications defaults to off, and requires explicit opt-in. */ + REMOUNT_IDMAPPING_HOST_ROOT, + /* Define a mapping from root user within the container to the owner of the bind mounted directory. + * This ensures no root-owned files will be written in a bind-mounted directory owned by a different + * user. No other users are mapped. */ + REMOUNT_IDMAPPING_HOST_OWNER, + _REMOUNT_IDMAPPING_MAX, + _REMOUNT_IDMAPPING_INVALID = -EINVAL, +} RemountIdmapping; + +int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping); +int remount_idmap_fd(char **p, int userns_fd); +int remount_idmap(char **p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping); + +int bind_mount_submounts( + const char *source, + const char *target); + +/* Creates a mount point (not parents) based on the source path or stat - i.e., a file or a directory */ +int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode); +int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t mode); + +int trigger_automount_at(int dir_fd, const char *path); + +unsigned long credentials_fs_mount_flags(bool ro); +int mount_credentials_fs(const char *path, size_t size, bool ro); + +int make_fsmount(int error_log_level, const char *what, const char *type, unsigned
long flags, const char *options, int userns_fd); diff --git a/src/shared/net-condition.c b/src/shared/net-condition.c new file mode 100644 index 0000000..d8b0fef --- /dev/null +++ b/src/shared/net-condition.c @@ -0,0 +1,399 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <netinet/ether.h> + +#include "condition.h" +#include "env-util.h" +#include "log.h" +#include "net-condition.h" +#include "netif-util.h" +#include "network-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "strv.h" +#include "wifi-util.h" + +void net_match_clear(NetMatch *match) { + if (!match) + return; + + match->hw_addr = set_free(match->hw_addr); + match->permanent_hw_addr = set_free(match->permanent_hw_addr); + match->path = strv_free(match->path); + match->driver = strv_free(match->driver); + match->iftype = strv_free(match->iftype); + match->kind = strv_free(match->kind); + match->ifname = strv_free(match->ifname); + match->property = strv_free(match->property); + match->wlan_iftype = strv_free(match->wlan_iftype); + match->ssid = strv_free(match->ssid); + match->bssid = set_free(match->bssid); +} + +bool net_match_is_empty(const NetMatch *match) { + assert(match); + + return + set_isempty(match->hw_addr) && + set_isempty(match->permanent_hw_addr) && + strv_isempty(match->path) && + strv_isempty(match->driver) && + strv_isempty(match->iftype) && + strv_isempty(match->kind) && + strv_isempty(match->ifname) && + strv_isempty(match->property) && + strv_isempty(match->wlan_iftype) && + strv_isempty(match->ssid) && + set_isempty(match->bssid); +} + +static bool net_condition_test_strv(char * const *patterns, const char *string) { + bool match = false, has_positive_rule = false; + + if (strv_isempty(patterns)) + return true; + + STRV_FOREACH(p, patterns) { + const char *q = *p; + bool invert; + + invert = *q == '!'; + q += invert; + + if (!invert) + has_positive_rule = true; + + if (string && fnmatch(q, string, 0) == 0) { + if (invert) + return false; + else + match = true; + 
} + } + + return has_positive_rule ? match : true; +} + +static bool net_condition_test_ifname(char * const *patterns, const char *ifname, char * const *alternative_names) { + if (net_condition_test_strv(patterns, ifname)) + return true; + + STRV_FOREACH(p, alternative_names) + if (net_condition_test_strv(patterns, *p)) + return true; + + return false; +} + +static int net_condition_test_property(char * const *match_property, sd_device *device) { + if (strv_isempty(match_property)) + return true; + + STRV_FOREACH(p, match_property) { + _cleanup_free_ char *key = NULL; + const char *val, *dev_val; + bool invert, v; + + invert = **p == '!'; + + val = strchr(*p + invert, '='); + if (!val) + return -EINVAL; + + key = strndup(*p + invert, val - *p - invert); + if (!key) + return -ENOMEM; + + val++; + + v = device && + sd_device_get_property_value(device, key, &dev_val) >= 0 && + fnmatch(val, dev_val, 0) == 0; + + if (invert ? v : !v) + return false; + } + + return true; +} + +int net_match_config( + const NetMatch *match, + sd_device *device, + const struct hw_addr_data *hw_addr, + const struct hw_addr_data *permanent_hw_addr, + const char *driver, + unsigned short iftype, + const char *kind, + const char *ifname, + char * const *alternative_names, + enum nl80211_iftype wlan_iftype, + const char *ssid, + const struct ether_addr *bssid) { + + _cleanup_free_ char *iftype_str = NULL; + const char *path = NULL; + + assert(match); + + if (net_get_type_string(device, iftype, &iftype_str) == -ENOMEM) + return -ENOMEM; + + if (device) + (void) sd_device_get_property_value(device, "ID_PATH", &path); + + if (match->hw_addr && (!hw_addr || !set_contains(match->hw_addr, hw_addr))) + return false; + + if (match->permanent_hw_addr && + (!permanent_hw_addr || + !set_contains(match->permanent_hw_addr, permanent_hw_addr))) + return false; + + if (!net_condition_test_strv(match->path, path)) + return false; + + if (!net_condition_test_strv(match->driver, driver)) + return false; + + if 
(!net_condition_test_strv(match->iftype, iftype_str)) + return false; + + if (!net_condition_test_strv(match->kind, kind)) + return false; + + if (!net_condition_test_ifname(match->ifname, ifname, alternative_names)) + return false; + + if (!net_condition_test_property(match->property, device)) + return false; + + if (!net_condition_test_strv(match->wlan_iftype, nl80211_iftype_to_string(wlan_iftype))) + return false; + + if (!net_condition_test_strv(match->ssid, ssid)) + return false; + + if (match->bssid && (!bssid || !set_contains(match->bssid, bssid))) + return false; + + return true; +} + +int config_parse_net_condition( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ConditionType cond = ltype; + Condition **list = data, *c; + bool negate; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + *list = condition_free_list_type(*list, cond); + return 0; + } + + negate = rvalue[0] == '!'; + if (negate) + rvalue++; + + c = condition_new(cond, rvalue, false, negate); + if (!c) + return log_oom(); + + /* Drop previous assignment. 
*/ + *list = condition_free_list_type(*list, cond); + + LIST_PREPEND(conditions, *list, c); + return 0; +} + +int config_parse_match_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const char *p = ASSERT_PTR(rvalue); + char ***sv = ASSERT_PTR(data); + bool invert; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + *sv = strv_free(*sv); + return 0; + } + + invert = *p == '!'; + p += invert; + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + if (invert) { + k = strjoin("!", word); + if (!k) + return log_oom(); + } else + k = TAKE_PTR(word); + + r = strv_consume(sv, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_match_ifnames( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const char *p = ASSERT_PTR(rvalue); + char ***sv = ASSERT_PTR(data); + bool invert; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + *sv = strv_free(*sv); + return 0; + } + + invert = *p == '!'; + p += invert; + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Failed to parse interface name list, ignoring: %s", rvalue); + return 0; + } + + if (!ifname_valid_full(word, ltype)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + 
"Interface name is not valid or too long, ignoring assignment: %s", word); + continue; + } + + if (invert) { + k = strjoin("!", word); + if (!k) + return log_oom(); + } else + k = TAKE_PTR(word); + + r = strv_consume(sv, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } +} + +int config_parse_match_property( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const char *p = ASSERT_PTR(rvalue); + char ***sv = ASSERT_PTR(data); + bool invert; + int r; + + assert(filename); + assert(lvalue); + + if (isempty(rvalue)) { + *sv = strv_free(*sv); + return 0; + } + + invert = *p == '!'; + p += invert; + + for (;;) { + _cleanup_free_ char *word = NULL, *k = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + + if (!env_assignment_is_valid(word)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid property or value, ignoring assignment: %s", word); + continue; + } + + if (invert) { + k = strjoin("!", word); + if (!k) + return log_oom(); + } else + k = TAKE_PTR(word); + + r = strv_consume(sv, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } +} diff --git a/src/shared/net-condition.h b/src/shared/net-condition.h new file mode 100644 index 0000000..0884d43 --- /dev/null +++ b/src/shared/net-condition.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <linux/nl80211.h> +#include <stdbool.h> + +#include "sd-device.h" + +#include "conf-parser.h" +#include "ether-addr-util.h" +#include "set.h" + +typedef struct NetMatch { + Set *hw_addr; + Set *permanent_hw_addr; + char **path; + char **driver; + char **iftype; /* udev's DEVTYPE field or ARPHRD_XXX, e.g. ether, wlan.
*/ + char **kind; /* IFLA_INFO_KIND attribute, e.g. gre, gretap, erspan. */ + char **ifname; + char **property; + char **wlan_iftype; + char **ssid; + Set *bssid; +} NetMatch; + +void net_match_clear(NetMatch *match); +bool net_match_is_empty(const NetMatch *match); + +int net_match_config( + const NetMatch *match, + sd_device *device, + const struct hw_addr_data *hw_addr, + const struct hw_addr_data *permanent_hw_addr, + const char *driver, + unsigned short iftype, + const char *kind, + const char *ifname, + char * const *alternative_names, + enum nl80211_iftype wlan_iftype, + const char *ssid, + const struct ether_addr *bssid); + +CONFIG_PARSER_PROTOTYPE(config_parse_net_condition); +CONFIG_PARSER_PROTOTYPE(config_parse_match_strv); +CONFIG_PARSER_PROTOTYPE(config_parse_match_ifnames); +CONFIG_PARSER_PROTOTYPE(config_parse_match_property); diff --git a/src/shared/netif-naming-scheme.c b/src/shared/netif-naming-scheme.c new file mode 100644 index 0000000..fbaf5c5 --- /dev/null +++ b/src/shared/netif-naming-scheme.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "netif-naming-scheme.h" +#include "proc-cmdline.h" +#include "string-util.h" +#include "string-table.h" + +static const NamingScheme naming_schemes[] = { + { "v238", NAMING_V238 }, + { "v239", NAMING_V239 }, + { "v240", NAMING_V240 }, + { "v241", NAMING_V241 }, + { "v243", NAMING_V243 }, + { "v245", NAMING_V245 }, + { "v247", NAMING_V247 }, + { "v249", NAMING_V249 }, + { "v250", NAMING_V250 }, + { "v251", NAMING_V251 }, + { "v252", NAMING_V252 }, + { "v253", NAMING_V253 }, + { "v254", NAMING_V254 }, + { "v255", NAMING_V255 }, + /* … add more schemes here, as the logic to name devices is updated … */ + + EXTRA_NET_NAMING_MAP +}; + +const NamingScheme* naming_scheme_from_name(const char *name) { + /* "latest" may either be defined explicitly by the extra map, in which case we will find it in + * the table like any other name. 
After iterating through the table, we check for "latest" again, + * which means that if not mapped explicitly, it maps to the last defined entry, whatever that is. */ + + for (size_t i = 0; i < ELEMENTSOF(naming_schemes); i++) + if (streq(naming_schemes[i].name, name)) + return naming_schemes + i; + + if (streq(name, "latest")) + return naming_schemes + ELEMENTSOF(naming_schemes) - 1; + + return NULL; +} + +const NamingScheme* naming_scheme(void) { + static const NamingScheme *cache = NULL; + _cleanup_free_ char *buffer = NULL; + const char *e, *k; + + if (cache) + return cache; + + /* Acquire setting from the kernel command line */ + (void) proc_cmdline_get_key("net.naming-scheme", 0, &buffer); + + /* Also acquire it from an env var */ + e = getenv("NET_NAMING_SCHEME"); + if (e) { + if (*e == ':') { + /* If prefixed with ':' the kernel cmdline takes precedence */ + k = buffer ?: e + 1; + } else + k = e; /* Otherwise the env var takes precedence */ + } else + k = buffer; + + if (k) { + cache = naming_scheme_from_name(k); + if (cache) { + log_info("Using interface naming scheme '%s'.", cache->name); + return cache; + } + + log_warning("Unknown interface naming scheme '%s' requested, ignoring.", k); + } + + cache = naming_scheme_from_name(DEFAULT_NET_NAMING_SCHEME); + assert(cache); + log_info("Using default interface naming scheme '%s'.", cache->name); + + return cache; +} + +static const char* const name_policy_table[_NAMEPOLICY_MAX] = { + [NAMEPOLICY_KERNEL] = "kernel", + [NAMEPOLICY_KEEP] = "keep", + [NAMEPOLICY_DATABASE] = "database", + [NAMEPOLICY_ONBOARD] = "onboard", + [NAMEPOLICY_SLOT] = "slot", + [NAMEPOLICY_PATH] = "path", + [NAMEPOLICY_MAC] = "mac", +}; + +DEFINE_STRING_TABLE_LOOKUP(name_policy, NamePolicy); + +static const char* const alternative_names_policy_table[_NAMEPOLICY_MAX] = { + [NAMEPOLICY_DATABASE] = "database", + [NAMEPOLICY_ONBOARD] = "onboard", + [NAMEPOLICY_SLOT] = "slot", + [NAMEPOLICY_PATH] = "path", + [NAMEPOLICY_MAC] = "mac", +}; + 
+DEFINE_STRING_TABLE_LOOKUP(alternative_names_policy, NamePolicy); diff --git a/src/shared/netif-naming-scheme.h b/src/shared/netif-naming-scheme.h new file mode 100644 index 0000000..3f7be08 --- /dev/null +++ b/src/shared/netif-naming-scheme.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "macro.h" + +/* So here's the deal: net_id is supposed to be an exercise in providing stable names for network devices. However, we + * also want to keep updating the naming scheme used in future versions of net_id. These two goals of course are + * contradictory: on one hand we want things to not change and on the other hand we want them to improve. Our way out + * of this dilemma is to introduce the "naming scheme" concept: each time we improve the naming logic we define a new + * flag for it. Then, we keep a list of schemes, each identified by a name associated with the flags it implements. Via + * a kernel command line and environment variable we then allow the user to pick the scheme they want us to follow: + * installers could "freeze" the used scheme at the moment of installation this way. + * + * Developers: each time you tweak the naming logic here, define a new flag below, and condition the tweak with + * it. Each time we do a release we'll then add a new scheme entry and include all newly defined flags. + * + * Note that this is only half a solution to the problem though: not only udev/net_id gets updated all the time, the + * kernel gets too. And thus a kernel that previously didn't expose some sysfs attribute we look for might eventually + * do, and thus affect our naming scheme too. Thus, enforcing a naming scheme will make interfacing more stable across + * OS versions, but not fully stabilize them.
*/ +typedef enum NamingSchemeFlags { + /* First, the individual features */ + NAMING_SR_IOV_V = 1 << 0, /* Use "v" suffix for SR-IOV, see 609948c7043a */ + NAMING_NPAR_ARI = 1 << 1, /* Use NPAR "ARI", see 6bc04997b6ea */ + NAMING_INFINIBAND = 1 << 2, /* Use "ib" prefix for infiniband, see 938d30aa98df */ + NAMING_ZERO_ACPI_INDEX = 1 << 3, /* Use zero acpi_index field, see d81186ef4f6a */ + NAMING_ALLOW_RERENAMES = 1 << 4, /* Allow re-renaming of devices, see #9006 */ + NAMING_STABLE_VIRTUAL_MACS = 1 << 5, /* Use device name to generate MAC, see 6d3646406560 */ + NAMING_NETDEVSIM = 1 << 6, /* Generate names for netdevsim devices, see eaa9d507d855 */ + NAMING_LABEL_NOPREFIX = 1 << 7, /* Don't prepend ID_NET_LABEL_ONBOARD with interface type prefix */ + NAMING_NSPAWN_LONG_HASH = 1 << 8, /* Shorten nspawn interfaces by including 24bit hash, instead of simple truncation */ + NAMING_BRIDGE_NO_SLOT = 1 << 9, /* Don't use PCI hotplug slot information if the corresponding device is a PCI bridge */ + NAMING_SLOT_FUNCTION_ID = 1 << 10, /* Use function_id if present to identify PCI hotplug slots */ + NAMING_16BIT_INDEX = 1 << 11, /* Allow full 16-bit for the onboard index */ + NAMING_REPLACE_STRICTLY = 1 << 12, /* Use udev_replace_ifname() for NAME= rule */ + NAMING_XEN_VIF = 1 << 13, /* Generate names for Xen netfront devices */ + NAMING_BRIDGE_MULTIFUNCTION_SLOT = 1 << 14, /* Use PCI hotplug slot information associated with bridge, but only if PCI device is multifunction. + * This is disabled since v255, as it seems not to work at least for some setups. See issue #28929. 
*/ + NAMING_DEVICETREE_ALIASES = 1 << 15, /* Generate names from devicetree aliases */ + NAMING_USB_HOST = 1 << 16, /* Generate names for usb host */ + NAMING_SR_IOV_R = 1 << 17, /* Use "r" suffix for SR-IOV VF representors */ + + /* And now the masks that combine the features above */ + NAMING_V238 = 0, + NAMING_V239 = NAMING_V238 | NAMING_SR_IOV_V | NAMING_NPAR_ARI, + NAMING_V240 = NAMING_V239 | NAMING_INFINIBAND | NAMING_ZERO_ACPI_INDEX | NAMING_ALLOW_RERENAMES, + NAMING_V241 = NAMING_V240 | NAMING_STABLE_VIRTUAL_MACS, + NAMING_V243 = NAMING_V241 | NAMING_NETDEVSIM | NAMING_LABEL_NOPREFIX, + NAMING_V245 = NAMING_V243 | NAMING_NSPAWN_LONG_HASH, + NAMING_V247 = NAMING_V245 | NAMING_BRIDGE_NO_SLOT, + NAMING_V249 = NAMING_V247 | NAMING_SLOT_FUNCTION_ID | NAMING_16BIT_INDEX | NAMING_REPLACE_STRICTLY, + NAMING_V250 = NAMING_V249 | NAMING_XEN_VIF, + NAMING_V251 = NAMING_V250 | NAMING_BRIDGE_MULTIFUNCTION_SLOT, + NAMING_V252 = NAMING_V251 | NAMING_DEVICETREE_ALIASES, + NAMING_V253 = NAMING_V252 | NAMING_USB_HOST, + NAMING_V254 = NAMING_V253 | NAMING_SR_IOV_R, /* Despite the name, "v254" is NOT the default scheme + * for systemd version 254. It was added in a follow-up + * patch later. NAMING_SR_IOV_R is enabled by default in + * systemd version 255, naming scheme "v255". 
*/ + NAMING_V255 = NAMING_V254 & ~NAMING_BRIDGE_MULTIFUNCTION_SLOT, + + EXTRA_NET_NAMING_SCHEMES + + _NAMING_SCHEME_FLAGS_INVALID = -EINVAL, +} NamingSchemeFlags; + +typedef struct NamingScheme { + const char *name; + NamingSchemeFlags flags; +} NamingScheme; + +const NamingScheme* naming_scheme_from_name(const char *name); +const NamingScheme* naming_scheme(void); + +static inline bool naming_scheme_has(NamingSchemeFlags flags) { + return FLAGS_SET(naming_scheme()->flags, flags); +} + +typedef enum NamePolicy { + NAMEPOLICY_KERNEL, + NAMEPOLICY_KEEP, + NAMEPOLICY_DATABASE, + NAMEPOLICY_ONBOARD, + NAMEPOLICY_SLOT, + NAMEPOLICY_PATH, + NAMEPOLICY_MAC, + _NAMEPOLICY_MAX, + _NAMEPOLICY_INVALID = -EINVAL, +} NamePolicy; + +const char *name_policy_to_string(NamePolicy p) _const_; +NamePolicy name_policy_from_string(const char *p) _pure_; + +const char *alternative_names_policy_to_string(NamePolicy p) _const_; +NamePolicy alternative_names_policy_from_string(const char *p) _pure_; diff --git a/src/shared/netif-sriov.c b/src/shared/netif-sriov.c new file mode 100644 index 0000000..7559b0d --- /dev/null +++ b/src/shared/netif-sriov.c @@ -0,0 +1,643 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "device-util.h" +#include "netlink-util.h" +#include "netif-sriov.h" +#include "parse-util.h" +#include "set.h" +#include "stdio-util.h" +#include "string-util.h" + +static int sr_iov_new(SRIOV **ret) { + SRIOV *sr_iov; + + assert(ret); + + sr_iov = new(SRIOV, 1); + if (!sr_iov) + return -ENOMEM; + + *sr_iov = (SRIOV) { + .vf = UINT32_MAX, + .vlan_proto = ETH_P_8021Q, + .vf_spoof_check_setting = -1, + .trust = -1, + .query_rss = -1, + .link_state = _SR_IOV_LINK_STATE_INVALID, + }; + + *ret = TAKE_PTR(sr_iov); + + return 0; +} + +static int sr_iov_new_static(OrderedHashmap **sr_iov_by_section, const char *filename, unsigned section_line, SRIOV **ret) { + _cleanup_(config_section_freep) ConfigSection *n = NULL; + _cleanup_(sr_iov_freep) SRIOV 
*sr_iov = NULL; + SRIOV *existing = NULL; + int r; + + assert(sr_iov_by_section); + assert(filename); + assert(section_line > 0); + assert(ret); + + r = config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + existing = ordered_hashmap_get(*sr_iov_by_section, n); + if (existing) { + *ret = existing; + return 0; + } + + r = sr_iov_new(&sr_iov); + if (r < 0) + return r; + + r = ordered_hashmap_ensure_put(sr_iov_by_section, &config_section_hash_ops, n, sr_iov); + if (r < 0) + return r; + + sr_iov->section = TAKE_PTR(n); + sr_iov->sr_iov_by_section = *sr_iov_by_section; + + *ret = TAKE_PTR(sr_iov); + return 0; +} + +SRIOV *sr_iov_free(SRIOV *sr_iov) { + if (!sr_iov) + return NULL; + + if (sr_iov->sr_iov_by_section && sr_iov->section) + ordered_hashmap_remove(sr_iov->sr_iov_by_section, sr_iov->section); + + config_section_free(sr_iov->section); + + return mfree(sr_iov); +} + +void sr_iov_hash_func(const SRIOV *sr_iov, struct siphash *state) { + assert(sr_iov); + assert(state); + + siphash24_compress(&sr_iov->vf, sizeof(sr_iov->vf), state); +} + +int sr_iov_compare_func(const SRIOV *s1, const SRIOV *s2) { + assert(s1); + assert(s2); + + return CMP(s1->vf, s2->vf); +} + +DEFINE_PRIVATE_HASH_OPS( + sr_iov_hash_ops, + SRIOV, + sr_iov_hash_func, + sr_iov_compare_func); + +int sr_iov_set_netlink_message(SRIOV *sr_iov, sd_netlink_message *req) { + int r; + + assert(sr_iov); + assert(req); + + r = sd_netlink_message_open_container(req, IFLA_VFINFO_LIST); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(req, IFLA_VF_INFO); + if (r < 0) + return r; + + if (!ether_addr_is_null(&sr_iov->mac)) { + struct ifla_vf_mac ivm = { + .vf = sr_iov->vf, + }; + + memcpy(ivm.mac, &sr_iov->mac, ETH_ALEN); + r = sd_netlink_message_append_data(req, IFLA_VF_MAC, &ivm, sizeof(struct ifla_vf_mac)); + if (r < 0) + return r; + } + + if (sr_iov->vf_spoof_check_setting >= 0) { + struct ifla_vf_spoofchk ivs = { + .vf = sr_iov->vf, + .setting = 
sr_iov->vf_spoof_check_setting, + }; + + r = sd_netlink_message_append_data(req, IFLA_VF_SPOOFCHK, &ivs, sizeof(struct ifla_vf_spoofchk)); + if (r < 0) + return r; + } + + if (sr_iov->query_rss >= 0) { + struct ifla_vf_rss_query_en ivs = { + .vf = sr_iov->vf, + .setting = sr_iov->query_rss, + }; + + r = sd_netlink_message_append_data(req, IFLA_VF_RSS_QUERY_EN, &ivs, sizeof(struct ifla_vf_rss_query_en)); + if (r < 0) + return r; + } + + if (sr_iov->trust >= 0) { + struct ifla_vf_trust ivt = { + .vf = sr_iov->vf, + .setting = sr_iov->trust, + }; + + r = sd_netlink_message_append_data(req, IFLA_VF_TRUST, &ivt, sizeof(struct ifla_vf_trust)); + if (r < 0) + return r; + } + + if (sr_iov->link_state >= 0) { + struct ifla_vf_link_state ivl = { + .vf = sr_iov->vf, + .link_state = sr_iov->link_state, + }; + + r = sd_netlink_message_append_data(req, IFLA_VF_LINK_STATE, &ivl, sizeof(struct ifla_vf_link_state)); + if (r < 0) + return r; + } + + if (sr_iov->vlan > 0) { + /* Because of padding, first the buffer must be initialized with 0. 
*/ + struct ifla_vf_vlan_info ivvi = {}; + ivvi.vf = sr_iov->vf; + ivvi.vlan = sr_iov->vlan; + ivvi.qos = sr_iov->qos; + ivvi.vlan_proto = htobe16(sr_iov->vlan_proto); + + r = sd_netlink_message_open_container(req, IFLA_VF_VLAN_LIST); + if (r < 0) + return r; + + r = sd_netlink_message_append_data(req, IFLA_VF_VLAN_INFO, &ivvi, sizeof(struct ifla_vf_vlan_info)); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + } + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(req); + if (r < 0) + return r; + + return 0; +} + +int sr_iov_get_num_vfs(sd_device *device, uint32_t *ret) { + const char *str; + uint32_t n; + int r; + + assert(device); + assert(ret); + + r = sd_device_get_sysattr_value(device, "device/sriov_numvfs", &str); + if (r < 0) + return r; + + r = safe_atou32(str, &n); + if (r < 0) + return r; + + *ret = n; + return 0; +} + +int sr_iov_set_num_vfs(sd_device *device, uint32_t num_vfs, OrderedHashmap *sr_iov_by_section) { + char val[DECIMAL_STR_MAX(uint32_t)]; + const char *str; + int r; + + assert(device); + + if (num_vfs == UINT32_MAX) { + uint32_t current_num_vfs; + SRIOV *sr_iov; + + /* If the number of virtual functions is not specified, then use the maximum number of VF + 1. */ + + num_vfs = 0; + ORDERED_HASHMAP_FOREACH(sr_iov, sr_iov_by_section) + num_vfs = MAX(num_vfs, sr_iov->vf + 1); + + if (num_vfs == 0) /* No VF is configured. */ + return 0; + + r = sr_iov_get_num_vfs(device, &current_num_vfs); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to get the current number of SR-IOV virtual functions: %m"); + + /* Enough VFs already exist.
*/ + if (num_vfs <= current_num_vfs) + return 0; + + } else if (num_vfs == 0) { + r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", "0"); + if (r < 0) + log_device_debug_errno(device, r, "Failed to write device/sriov_numvfs sysfs attribute, ignoring: %m"); + + /* Gracefully handle the error in disabling VFs when the interface does not support SR-IOV. */ + return r == -ENOENT ? 0 : r; + } + + /* So, the interface does not have enough VFs. Before increasing the number of VFs, check the + * maximum allowed number of VFs from the sriov_totalvfs sysattr. Note that the sysattr + * currently exists only for PCI drivers. Hence, ignore -ENOENT. + * TODO: netdevsim provides the information in debugfs. */ + r = sd_device_get_sysattr_value(device, "device/sriov_totalvfs", &str); + if (r >= 0) { + uint32_t max_num_vfs; + + r = safe_atou32(str, &max_num_vfs); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to parse device/sriov_totalvfs sysfs attribute '%s': %m", str); + + if (num_vfs > max_num_vfs) + return log_device_debug_errno(device, SYNTHETIC_ERRNO(ERANGE), + "Specified number of virtual functions is out of range. " + "The maximum allowed value is %"PRIu32".", + max_num_vfs); + + } else if (r != -ENOENT) /* Currently, only PCI driver has the attribute. */ + return log_device_debug_errno(device, r, "Failed to read device/sriov_totalvfs sysfs attribute: %m"); + + xsprintf(val, "%"PRIu32, num_vfs); + r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", val); + if (r == -EBUSY) { + /* Some devices e.g. netdevsim refuse to set sriov_numvfs if it has non-zero value. 
*/ + r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", "0"); + if (r >= 0) + r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", val); + } + if (r < 0) + return log_device_debug_errno(device, r, "Failed to write device/sriov_numvfs sysfs attribute: %m"); + + log_device_debug(device, "device/sriov_numvfs sysfs attribute set to '%s'.", val); + return 0; +} + +static int sr_iov_section_verify(uint32_t num_vfs, SRIOV *sr_iov) { + assert(sr_iov); + + if (section_is_invalid(sr_iov->section)) + return -EINVAL; + + if (sr_iov->vf == UINT32_MAX) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: [SR-IOV] section without VirtualFunction= field configured. " + "Ignoring [SR-IOV] section from line %u.", + sr_iov->section->filename, sr_iov->section->line); + + if (sr_iov->vf >= num_vfs) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: VirtualFunction= must be smaller than the value specified in SR-IOVVirtualFunctions=. " + "Ignoring [SR-IOV] section from line %u.", + sr_iov->section->filename, sr_iov->section->line); + + return 0; +} + +int sr_iov_drop_invalid_sections(uint32_t num_vfs, OrderedHashmap *sr_iov_by_section) { + _cleanup_set_free_ Set *set = NULL; + SRIOV *sr_iov; + int r; + + ORDERED_HASHMAP_FOREACH(sr_iov, sr_iov_by_section) { + SRIOV *dup; + + if (sr_iov_section_verify(num_vfs, sr_iov) < 0) { + sr_iov_free(sr_iov); + continue; + } + + dup = set_remove(set, sr_iov); + if (dup) { + log_warning("%s: Conflicting [SR-IOV] section is specified at line %u and %u, " + "dropping the [SR-IOV] section specified at line %u.", + dup->section->filename, sr_iov->section->line, + dup->section->line, dup->section->line); + sr_iov_free(dup); + } + + r = set_ensure_put(&set, &sr_iov_hash_ops, sr_iov); + if (r < 0) + return log_oom(); + assert(r > 0); + } + + return 0; +} + +int config_parse_sr_iov_uint32( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int 
ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL; + OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data); + uint32_t k; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov); + if (r < 0) + return r; + + if (isempty(rvalue)) { + if (streq(lvalue, "VirtualFunction")) + sr_iov->vf = UINT32_MAX; + else if (streq(lvalue, "VLANId")) + sr_iov->vlan = 0; + else if (streq(lvalue, "QualityOfService")) + sr_iov->qos = 0; + else + assert_not_reached(); + + TAKE_PTR(sr_iov); + return 0; + } + + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + if (streq(lvalue, "VLANId")) { + if (k == 0 || k > 4095) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid SR-IOV VLANId: %u", k); + return 0; + } + sr_iov->vlan = k; + } else if (streq(lvalue, "VirtualFunction")) { + if (k >= INT_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid SR-IOV virtual function: %u", k); + return 0; + } + sr_iov->vf = k; + } else if (streq(lvalue, "QualityOfService")) + sr_iov->qos = k; + else + assert_not_reached(); + + TAKE_PTR(sr_iov); + return 0; +} + +int config_parse_sr_iov_vlan_proto( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL; + OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov); + if (r < 0) + return r; + + if (isempty(rvalue) || streq(rvalue, "802.1Q")) + sr_iov->vlan_proto = ETH_P_8021Q; + else if 
(streq(rvalue, "802.1ad")) + sr_iov->vlan_proto = ETH_P_8021AD; + else { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Invalid SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + TAKE_PTR(sr_iov); + return 0; +} + +int config_parse_sr_iov_link_state( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL; + OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov); + if (r < 0) + return r; + + /* Unfortunately, SR_IOV_LINK_STATE_DISABLE is 2, not 0. So, we cannot use + * DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN() macro. */ + + if (isempty(rvalue)) { + sr_iov->link_state = _SR_IOV_LINK_STATE_INVALID; + TAKE_PTR(sr_iov); + return 0; + } + + if (streq(rvalue, "auto")) { + sr_iov->link_state = SR_IOV_LINK_STATE_AUTO; + TAKE_PTR(sr_iov); + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + sr_iov->link_state = r ? 
SR_IOV_LINK_STATE_ENABLE : SR_IOV_LINK_STATE_DISABLE; + TAKE_PTR(sr_iov); + return 0; +} + +int config_parse_sr_iov_boolean( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL; + OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov); + if (r < 0) + return r; + + if (isempty(rvalue)) { + if (streq(lvalue, "MACSpoofCheck")) + sr_iov->vf_spoof_check_setting = -1; + else if (streq(lvalue, "QueryReceiveSideScaling")) + sr_iov->query_rss = -1; + else if (streq(lvalue, "Trust")) + sr_iov->trust = -1; + else + assert_not_reached(); + + TAKE_PTR(sr_iov); + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse '%s=', ignoring: %s", lvalue, rvalue); + return 0; + } + + if (streq(lvalue, "MACSpoofCheck")) + sr_iov->vf_spoof_check_setting = r; + else if (streq(lvalue, "QueryReceiveSideScaling")) + sr_iov->query_rss = r; + else if (streq(lvalue, "Trust")) + sr_iov->trust = r; + else + assert_not_reached(); + + TAKE_PTR(sr_iov); + return 0; +} + +int config_parse_sr_iov_mac( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL; + OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov); + if (r < 0) + return r; + + if (isempty(rvalue)) { + sr_iov->mac = ETHER_ADDR_NULL; + TAKE_PTR(sr_iov); + return 0; + } + 
+ r = parse_ether_addr(rvalue, &sr_iov->mac); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + TAKE_PTR(sr_iov); + return 0; +} + +int config_parse_sr_iov_num_vfs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t n, *num_vfs = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *num_vfs = UINT32_MAX; + return 0; + } + + r = safe_atou32(rvalue, &n); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue); + return 0; + } + + if (n > INT_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "The number of SR-IOV virtual functions is too large. It must be equal to " + "or smaller than 2147483647. 
Ignoring assignment: %"PRIu32, n); + return 0; + } + + *num_vfs = n; + return 0; +} diff --git a/src/shared/netif-sriov.h b/src/shared/netif-sriov.h new file mode 100644 index 0000000..ee76957 --- /dev/null +++ b/src/shared/netif-sriov.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-device.h" + +#include "conf-parser.h" +#include "ether-addr-util.h" +#include "hashmap.h" + +typedef enum SRIOVLinkState { + SR_IOV_LINK_STATE_AUTO = IFLA_VF_LINK_STATE_AUTO, + SR_IOV_LINK_STATE_ENABLE = IFLA_VF_LINK_STATE_ENABLE, + SR_IOV_LINK_STATE_DISABLE = IFLA_VF_LINK_STATE_DISABLE, + _SR_IOV_LINK_STATE_MAX, + _SR_IOV_LINK_STATE_INVALID = -EINVAL, +} SRIOVLinkState; + +typedef struct SRIOV { + ConfigSection *section; + OrderedHashmap *sr_iov_by_section; + + uint32_t vf; /* 0 - 2147483646 */ + uint32_t vlan; /* 0 - 4095, 0 disables VLAN filter */ + uint32_t qos; + uint16_t vlan_proto; /* ETH_P_8021Q or ETH_P_8021AD */ + int vf_spoof_check_setting; + int query_rss; + int trust; + SRIOVLinkState link_state; + struct ether_addr mac; +} SRIOV; + +SRIOV *sr_iov_free(SRIOV *sr_iov); +void sr_iov_hash_func(const SRIOV *sr_iov, struct siphash *state); +int sr_iov_compare_func(const SRIOV *s1, const SRIOV *s2); +int sr_iov_set_netlink_message(SRIOV *sr_iov, sd_netlink_message *req); +int sr_iov_get_num_vfs(sd_device *device, uint32_t *ret); +int sr_iov_set_num_vfs(sd_device *device, uint32_t num_vfs, OrderedHashmap *sr_iov_by_section); +int sr_iov_drop_invalid_sections(uint32_t num_vfs, OrderedHashmap *sr_iov_by_section); + +DEFINE_SECTION_CLEANUP_FUNCTIONS(SRIOV, sr_iov_free); + +CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_uint32); +CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_boolean); +CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_link_state); +CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_vlan_proto); +CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_mac); +CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_num_vfs); diff --git 
a/src/shared/netif-util.c b/src/shared/netif-util.c new file mode 100644 index 0000000..f56c564 --- /dev/null +++ b/src/shared/netif-util.c @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "arphrd-util.h" +#include "device-util.h" +#include "log-link.h" +#include "memory-util.h" +#include "netif-util.h" +#include "siphash24.h" +#include "sparse-endian.h" +#include "strv.h" + +/* Returns true if the interface should be considered to have a carrier: either operstate is IF_OPER_UP, + * or operstate is IF_OPER_UNKNOWN and the flags check below (IFF_LOWER_UP | IFF_RUNNING set, IFF_DORMANT + * not set) succeeds. Any other operstate yields false. */ +bool netif_has_carrier(uint8_t operstate, unsigned flags) { + /* see Documentation/networking/operstates.txt in the kernel sources */ + + if (operstate == IF_OPER_UP) + return true; + + if (operstate != IF_OPER_UNKNOWN) + return false; + + /* operstate may not be implemented, so fall back to flags */ + return FLAGS_SET(flags, IFF_LOWER_UP | IFF_RUNNING) && + !FLAGS_SET(flags, IFF_DORMANT); +} + +/* Stores a newly allocated type string in *ret: a copy of the device's devtype when a device is given and + * its devtype is non-empty, otherwise the arphrd_to_name() name of iftype passed through ascii_strlower(). + * Returns 0 on success, -ENOENT if iftype has no name, -ENOMEM if the copy cannot be allocated. */ +int net_get_type_string(sd_device *device, uint16_t iftype, char **ret) { + const char *t; + char *p; + + if (device && + sd_device_get_devtype(device, &t) >= 0 && + !isempty(t)) { + p = strdup(t); + if (!p) + return -ENOMEM; + + *ret = p; + return 0; + } + + t = arphrd_to_name(iftype); + if (!t) + return -ENOENT; + + p = strdup(t); + if (!p) + return -ENOMEM; + + *ret = ascii_strlower(p); + return 0; +} + +/* Returns the value of the first of the listed ID_NET_NAME_* properties that can be read from the device, + * or NULL if none can. The string is returned as obtained from sd_device_get_property_value(); no copy is + * made here. */ +const char *net_get_persistent_name(sd_device *device) { + assert(device); + + /* fetch some persistent data unique (on this machine) to this device */ + FOREACH_STRING(field, "ID_NET_NAME_ONBOARD", "ID_NET_NAME_SLOT", "ID_NET_NAME_PATH", "ID_NET_NAME_MAC") { + const char *name; + + if (sd_device_get_property_value(device, field, &name) >= 0) + return name; + } + + return NULL; +} + +/* Used when generating hardware address by udev, and IPv4LL seed by networkd. 
*/ +#define HASH_KEY SD_ID128_MAKE(d3,1e,48,fa,90,fe,4b,4c,9d,af,d5,d7,a1,b1,2e,8a) + +int net_get_unique_predictable_data(sd_device *device, bool use_sysname, uint64_t *ret) { + const char *name; + + assert(device); + assert(ret); + + /* net_get_persistent_name() will return one of the device names based on stable information about + * the device. If this is not available, we fall back to using the actual device name. */ + name = net_get_persistent_name(device); + if (!name && use_sysname) + (void) sd_device_get_sysname(device, &name); + if (!name) + return log_device_debug_errno(device, SYNTHETIC_ERRNO(ENODATA), + "No stable identifying information found"); + + log_device_debug(device, "Using \"%s\" as stable identifying information", name); + + return net_get_unique_predictable_data_from_name(name, &HASH_KEY, ret); +} + +int net_get_unique_predictable_data_from_name( + const char *name, + const sd_id128_t *key, + uint64_t *ret) { + + size_t l, sz; + uint8_t *v; + int r; + + assert(name); + assert(key); + assert(ret); + + l = strlen(name); + sz = sizeof(sd_id128_t) + l; + v = newa(uint8_t, sz); + + /* Fetch some persistent data unique to this machine */ + r = sd_id128_get_machine((sd_id128_t*) v); + if (r < 0) + return r; + + memcpy(v + sizeof(sd_id128_t), name, l); + + /* Let's hash the machine ID plus the device name. We use + * a fixed, but originally randomly created hash key here. 
*/ + *ret = htole64(siphash24(v, sz, key->bytes)); + return 0; +} + +typedef struct Link { + const char *ifname; +} Link; + +int net_verify_hardware_address( + const char *ifname, + bool is_static, + uint16_t iftype, + const struct hw_addr_data *ib_hw_addr, /* current or parent HW address */ + struct hw_addr_data *new_hw_addr) { + + Link link = { .ifname = ifname }; + + assert(new_hw_addr); + + if (new_hw_addr->length == 0) + return 0; + + if (new_hw_addr->length != arphrd_to_hw_addr_len(iftype)) { + if (is_static) + log_link_warning(&link, + "Specified MAC address with invalid length (%zu, expected %zu), refusing.", + new_hw_addr->length, arphrd_to_hw_addr_len(iftype)); + return -EINVAL; + } + + switch (iftype) { + case ARPHRD_ETHER: + /* see eth_random_addr() in the kernel */ + + if (ether_addr_is_null(&new_hw_addr->ether)) { + if (is_static) + log_link_warning(&link, "Specified MAC address is null, refusing."); + return -EINVAL; + } + + if (ether_addr_is_broadcast(&new_hw_addr->ether)) { + if (is_static) + log_link_warning(&link, "Specified MAC address is broadcast, refusing."); + return -EINVAL; + } + + if (ether_addr_is_multicast(&new_hw_addr->ether)) { + if (is_static) + log_link_warning(&link, "Specified MAC address has the multicast bit set, clearing the bit."); + + new_hw_addr->bytes[0] &= 0xfe; + } + + if (!is_static && !ether_addr_is_local(&new_hw_addr->ether)) + /* Adjust local assignment bit when the MAC address is generated randomly. 
*/ + new_hw_addr->bytes[0] |= 0x02; + + break; + + case ARPHRD_INFINIBAND: + /* see ipoib_check_lladdr() in the kernel */ + + assert(ib_hw_addr); + assert(ib_hw_addr->length == INFINIBAND_ALEN); + + if (is_static && + (!memeqzero(new_hw_addr->bytes, INFINIBAND_ALEN - 8) || + memcmp(new_hw_addr->bytes, ib_hw_addr->bytes, INFINIBAND_ALEN - 8) != 0)) + log_link_warning(&link, "Only the last 8 bytes of the InifniBand MAC address can be changed, ignoring the first 12 bytes."); + + if (memeqzero(new_hw_addr->bytes + INFINIBAND_ALEN - 8, 8)) { + if (is_static) + log_link_warning(&link, "The last 8 bytes of the InfiniBand MAC address cannot be null, refusing."); + return -EINVAL; + } + + memcpy(new_hw_addr->bytes, ib_hw_addr->bytes, INFINIBAND_ALEN - 8); + break; + + default: + if (is_static) + log_link_warning(&link, "Unsupported interface type %s%u to set MAC address, refusing.", + strna(arphrd_to_name(iftype)), iftype); + return -EINVAL; + } + + return 0; +} diff --git a/src/shared/netif-util.h b/src/shared/netif-util.h new file mode 100644 index 0000000..fb6a27c --- /dev/null +++ b/src/shared/netif-util.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-device.h" +#include "sd-id128.h" + +#include "ether-addr-util.h" + +bool netif_has_carrier(uint8_t operstate, unsigned flags); +int net_get_type_string(sd_device *device, uint16_t iftype, char **ret); +const char *net_get_persistent_name(sd_device *device); +int net_get_unique_predictable_data(sd_device *device, bool use_sysname, uint64_t *ret); +int net_get_unique_predictable_data_from_name(const char *name, const sd_id128_t *key, uint64_t *ret); +int net_verify_hardware_address( + const char *ifname, + bool is_static, + uint16_t iftype, + const struct hw_addr_data *ib_hw_addr, + struct hw_addr_data *new_hw_addr); diff --git a/src/shared/nscd-flush.c b/src/shared/nscd-flush.c new file mode 100644 index 0000000..6df18d7 --- /dev/null +++ 
b/src/shared/nscd-flush.c @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "io-util.h" +#include "nscd-flush.h" +#include "socket-util.h" +#include "strv.h" +#include "time-util.h" + +#define NSCD_FLUSH_CACHE_TIMEOUT_USEC (5*USEC_PER_SEC) + +struct nscdInvalidateRequest { + int32_t version; + int32_t type; /* in glibc this is an enum. We don't replicate this here 1:1. Also, wtf, how unportable is that + * even? */ + int32_t key_len; + char dbname[]; +}; + +static int nscd_flush_cache_one(const char *database, usec_t end) { + size_t req_size, has_written = 0, has_read = 0, l; + struct nscdInvalidateRequest *req; + _cleanup_close_ int fd = -EBADF; + int32_t resp; + int events, r; + + assert(database); + + l = strlen(database); + req_size = offsetof(struct nscdInvalidateRequest, dbname) + l + 1; + + req = alloca_safe(req_size); + *req = (struct nscdInvalidateRequest) { + .version = 2, + .type = 10, + .key_len = l + 1, + }; + + strcpy(req->dbname, database); + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_debug_errno(errno, "Failed to allocate nscd socket: %m"); + + /* Note: connect() returns EINPROGRESS if O_NONBLOCK is set and establishing a connection takes time. The + * kernel lets us know this way that the connection is now being established, and we should watch with poll() + * to learn when it is fully established. That said, AF_UNIX on Linux never triggers this IRL (connect() is + * always instant on AF_UNIX), hence handling this is mostly just an exercise in defensive, protocol-agnostic + * programming. + * + * connect() returns EAGAIN if the socket's backlog limit has been reached. When we see this we give up right + * away, after all this entire function here is written in a defensive style so that a non-responding nscd + * doesn't stall us for good. 
(Even if we wanted to handle this better: the Linux kernel doesn't really have a + * nice way to connect() to a server synchronously with a time limit that would also cover dealing with the + * backlog limit. After all SO_RCVTIMEO and SO_SNDTIMEO don't apply to connect(), and alarm() is frickin' ugly + * and not really reasonably usable from threads-aware code.) */ + r = connect_unix_path(fd, AT_FDCWD, "/run/nscd/socket"); + if (r < 0) { + if (r == -EAGAIN) + return log_debug_errno(r, "nscd is overloaded (backlog limit reached) and refuses to take further connections: %m"); + if (r != -EINPROGRESS) + return log_debug_errno(r, "Failed to connect to nscd socket: %m"); + + /* Continue in case of EINPROGRESS, but don't bother with send() or recv() until being notified that + * establishing the connection is complete. */ + events = 0; + } else + events = POLLIN|POLLOUT; /* Let's assume initially that we can write and read to the fd, to suppress + * one poll() invocation */ + for (;;) { + usec_t p; + + if (events & POLLOUT) { + ssize_t m; + + assert(has_written < req_size); + + m = send(fd, (uint8_t*) req + has_written, req_size - has_written, MSG_NOSIGNAL); + if (m < 0) { + if (errno != EAGAIN) /* Note that EAGAIN is returned by the kernel whenever it can't + * take the data right now, and that includes if the connect() is + * asynchronous and we saw EINPROGRESS on it, and it hasn't + * completed yet. 
*/ + return log_debug_errno(errno, "Failed to write to nscd socket: %m"); + } else + has_written += m; + } + + if (events & (POLLIN|POLLERR|POLLHUP)) { + ssize_t m; + + if (has_read >= sizeof(resp)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Response from nscd longer than expected: %m"); + + m = recv(fd, (uint8_t*) &resp + has_read, sizeof(resp) - has_read, 0); + if (m < 0) { + if (errno != EAGAIN) + return log_debug_errno(errno, "Failed to read from nscd socket: %m"); + } else if (m == 0) { /* EOF */ + if (has_read == 0 && has_written >= req_size) /* Older nscd immediately terminated the + * connection, accept that as OK */ + return 1; + + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "nscd prematurely ended connection."); + } else + has_read += m; + } + + if (has_written >= req_size && has_read >= sizeof(resp)) { /* done? */ + if (resp < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "nscd sent us a negative error number: %i", resp); + if (resp > 0) + return log_debug_errno(resp, "nscd return failure code on invalidating '%s'.", database); + return 1; + } + + p = now(CLOCK_MONOTONIC); + if (p >= end) + return -ETIMEDOUT; + + events = fd_wait_for_event(fd, POLLIN | (has_written < req_size ? POLLOUT : 0), end - p); + if (events < 0) + return events; + } +} + +int nscd_flush_cache(char **databases) { + int r = 0; + + /* Tries to invalidate the specified database in nscd. We do this carefully, with a 5s timeout, + * so that we don't block indefinitely on another service. 
*/ + + usec_t end = usec_add(now(CLOCK_MONOTONIC), NSCD_FLUSH_CACHE_TIMEOUT_USEC); + + STRV_FOREACH(i, databases) + RET_GATHER(r, nscd_flush_cache_one(*i, end)); + + return r; +} diff --git a/src/shared/nscd-flush.h b/src/shared/nscd-flush.h new file mode 100644 index 0000000..dac223e --- /dev/null +++ b/src/shared/nscd-flush.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if ENABLE_NSCD +int nscd_flush_cache(char **databases); +#else +static inline void nscd_flush_cache(char **databases) {} +#endif diff --git a/src/shared/nsflags.c b/src/shared/nsflags.c new file mode 100644 index 0000000..d4cee06 --- /dev/null +++ b/src/shared/nsflags.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "extract-word.h" +#include "namespace-util.h" +#include "nsflags.h" +#include "string-util.h" + +int namespace_flags_from_string(const char *name, unsigned long *ret) { + unsigned long flags = 0; + int r; + + assert_se(ret); + + for (;;) { + _cleanup_free_ char *word = NULL; + unsigned long f = 0; + unsigned i; + + r = extract_first_word(&name, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + break; + + for (i = 0; namespace_info[i].proc_name; i++) + if (streq(word, namespace_info[i].proc_name)) { + f = namespace_info[i].clone_flag; + break; + } + + if (f == 0) + return -EINVAL; + + flags |= f; + } + + *ret = flags; + return 0; +} + +int namespace_flags_to_string(unsigned long flags, char **ret) { + _cleanup_free_ char *s = NULL; + unsigned i; + + for (i = 0; namespace_info[i].proc_name; i++) { + if ((flags & namespace_info[i].clone_flag) != namespace_info[i].clone_flag) + continue; + + if (!strextend_with_separator(&s, " ", namespace_info[i].proc_name)) + return -ENOMEM; + } + + *ret = TAKE_PTR(s); + + return 0; +} + +const char *namespace_single_flag_to_string(unsigned long flag) { + for (unsigned i = 0; namespace_info[i].proc_name; i++) + if 
(namespace_info[i].clone_flag == flag) + return namespace_info[i].proc_name; + + return NULL; +} diff --git a/src/shared/nsflags.h b/src/shared/nsflags.h new file mode 100644 index 0000000..b59740c --- /dev/null +++ b/src/shared/nsflags.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "missing_sched.h" + +/* The combination of all namespace flags defined by the kernel. The right type for this isn't clear. setns() and + * unshare() expect these flags to be passed as (signed) "int", while clone() wants them as "unsigned long". The latter + * is definitely more appropriate for a flags parameter, and also the larger type of the two, hence let's stick to that + * here. */ +#define NAMESPACE_FLAGS_ALL \ + ((unsigned long) (CLONE_NEWCGROUP| \ + CLONE_NEWIPC| \ + CLONE_NEWNET| \ + CLONE_NEWNS| \ + CLONE_NEWPID| \ + CLONE_NEWUSER| \ + CLONE_NEWUTS)) + +#define NAMESPACE_FLAGS_INITIAL ULONG_MAX + +int namespace_flags_from_string(const char *name, unsigned long *ret); +int namespace_flags_to_string(unsigned long flags, char **ret); +const char *namespace_single_flag_to_string(unsigned long flag); diff --git a/src/shared/numa-util.c b/src/shared/numa-util.c new file mode 100644 index 0000000..a954ea3 --- /dev/null +++ b/src/shared/numa-util.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "cpu-set-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "macro.h" +#include "missing_syscall.h" +#include "numa-util.h" +#include "stdio-util.h" +#include "string-table.h" + +bool numa_policy_is_valid(const NUMAPolicy *policy) { + assert(policy); + + if (!mpol_is_valid(numa_policy_get_type(policy))) + return false; + + if (!policy->nodes.set && + !IN_SET(numa_policy_get_type(policy), MPOL_DEFAULT, MPOL_LOCAL, MPOL_PREFERRED)) + return false; + + if (policy->nodes.set && + numa_policy_get_type(policy) == MPOL_PREFERRED && + 
CPU_COUNT_S(policy->nodes.allocated, policy->nodes.set) != 1) + return false; + + return true; +} + +static int numa_policy_to_mempolicy(const NUMAPolicy *policy, unsigned long *ret_maxnode, unsigned long **ret_nodes) { + unsigned node, bits = 0, ulong_bits; + _cleanup_free_ unsigned long *out = NULL; + + assert(policy); + assert(ret_maxnode); + assert(ret_nodes); + + if (IN_SET(numa_policy_get_type(policy), MPOL_DEFAULT, MPOL_LOCAL) || + (numa_policy_get_type(policy) == MPOL_PREFERRED && !policy->nodes.set)) { + *ret_nodes = NULL; + *ret_maxnode = 0; + return 0; + } + + bits = policy->nodes.allocated * 8; + ulong_bits = sizeof(unsigned long) * 8; + + out = new0(unsigned long, DIV_ROUND_UP(policy->nodes.allocated, sizeof(unsigned long))); + if (!out) + return -ENOMEM; + + /* We don't make any assumptions about internal type libc is using to store NUMA node mask. + Hence we need to convert the node mask to the representation expected by set_mempolicy() */ + for (node = 0; node < bits; node++) + if (CPU_ISSET_S(node, policy->nodes.allocated, policy->nodes.set)) + out[node / ulong_bits] |= 1ul << (node % ulong_bits); + + *ret_nodes = TAKE_PTR(out); + *ret_maxnode = bits + 1; + return 0; +} + +int apply_numa_policy(const NUMAPolicy *policy) { + int r; + _cleanup_free_ unsigned long *nodes = NULL; + unsigned long maxnode; + + assert(policy); + + if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS) + return -EOPNOTSUPP; + + if (!numa_policy_is_valid(policy)) + return -EINVAL; + + r = numa_policy_to_mempolicy(policy, &maxnode, &nodes); + if (r < 0) + return r; + + r = set_mempolicy(numa_policy_get_type(policy), nodes, maxnode); + if (r < 0) + return -errno; + + return 0; +} + +int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) { + int r; + size_t i; + _cleanup_(cpu_set_reset) CPUSet s = {}; + + assert(policy); + assert(ret); + + for (i = 0; i < policy->nodes.allocated * 8; i++) { + _cleanup_free_ char *l = NULL; + char 
p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1]; + _cleanup_(cpu_set_reset) CPUSet part = {}; + + if (!CPU_ISSET_S(i, policy->nodes.allocated, policy->nodes.set)) + continue; + + xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", i); + + r = read_one_line_file(p, &l); + if (r < 0) + return r; + + r = parse_cpu_set(l, &part); + if (r < 0) + return r; + + r = cpu_set_add_all(&s, &part); + if (r < 0) + return r; + } + + *ret = TAKE_STRUCT(s); + + return 0; +} + +static int numa_max_node(void) { + _cleanup_closedir_ DIR *d = NULL; + int r, max_node = 0; + + d = opendir("/sys/devices/system/node"); + if (!d) + return -errno; + + FOREACH_DIRENT(de, d, break) { + int node; + const char *n; + + if (de->d_type != DT_DIR) + continue; + + n = startswith(de->d_name, "node"); + if (!n) + continue; + + r = safe_atoi(n, &node); + if (r < 0) + continue; + + if (node > max_node) + max_node = node; + } + + return max_node; +} + +int numa_mask_add_all(CPUSet *mask) { + int m; + + assert(mask); + + m = numa_max_node(); + if (m < 0) { + log_debug_errno(m, "Failed to determine maximum NUMA node index, assuming 1023: %m"); + m = 1023; /* CONFIG_NODES_SHIFT is set to 10 on x86_64, i.e. 
1024 NUMA nodes in total */ + } + + for (int i = 0; i <= m; i++) { + int r; + + r = cpu_set_add(mask, i); + if (r < 0) + return r; + } + + return 0; +} + +static const char* const mpol_table[] = { + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "preferred", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local", +}; + +DEFINE_STRING_TABLE_LOOKUP(mpol, int); diff --git a/src/shared/numa-util.h b/src/shared/numa-util.h new file mode 100644 index 0000000..2f736c9 --- /dev/null +++ b/src/shared/numa-util.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "cpu-set-util.h" +#include "missing_syscall.h" + +static inline bool mpol_is_valid(int t) { + return t >= MPOL_DEFAULT && t <= MPOL_LOCAL; +} + +typedef struct NUMAPolicy { + /* Always use numa_policy_get_type() to read the value */ + int type; + CPUSet nodes; +} NUMAPolicy; + +bool numa_policy_is_valid(const NUMAPolicy *p); + +static inline int numa_policy_get_type(const NUMAPolicy *p) { + return p->type < 0 ? (p->nodes.set ? 
MPOL_PREFERRED : -1) : p->type; +} + +static inline void numa_policy_reset(NUMAPolicy *p) { + assert(p); + cpu_set_reset(&p->nodes); + p->type = -1; +} + +int apply_numa_policy(const NUMAPolicy *policy); +int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *set); + +int numa_mask_add_all(CPUSet *mask); + +const char* mpol_to_string(int i) _const_; +int mpol_from_string(const char *s) _pure_; diff --git a/src/shared/open-file.c b/src/shared/open-file.c new file mode 100644 index 0000000..42772bd --- /dev/null +++ b/src/shared/open-file.c @@ -0,0 +1,147 @@ + +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "escape.h" +#include "extract-word.h" +#include "fd-util.h" +#include "open-file.h" +#include "path-util.h" +#include "string-table.h" +#include "string-util.h" + +int open_file_parse(const char *v, OpenFile **ret) { + _cleanup_free_ char *options = NULL; + _cleanup_(open_file_freep) OpenFile *of = NULL; + int r; + + assert(v); + assert(ret); + + of = new0(OpenFile, 1); + if (!of) + return -ENOMEM; + + r = extract_many_words(&v, ":", EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_CUNESCAPE, &of->path, &of->fdname, &options, NULL); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + /* Enforce that at most 3 colon-separated words are present */ + if (!isempty(v)) + return -EINVAL; + + for (const char *p = options;;) { + OpenFileFlag flag; + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ",", 0); + if (r < 0) + return r; + if (r == 0) + break; + + flag = open_file_flags_from_string(word); + if (flag < 0) + return flag; + + if ((flag & of->flags) != 0) + return -EINVAL; + + of->flags |= flag; + } + + if (isempty(of->fdname)) { + of->fdname = mfree(of->fdname); + r = path_extract_filename(of->path, &of->fdname); + if (r < 0) + return r; + } + + r = open_file_validate(of); + if (r < 0) + return r; + + *ret = TAKE_PTR(of); + + return 0; +} + +int open_file_validate(const OpenFile *of) { + assert(of); + + if 
(!path_is_valid(of->path) || !path_is_absolute(of->path)) + return -EINVAL; + + if (!fdname_is_valid(of->fdname)) + return -EINVAL; + + if ((FLAGS_SET(of->flags, OPENFILE_READ_ONLY) + FLAGS_SET(of->flags, OPENFILE_APPEND) + + FLAGS_SET(of->flags, OPENFILE_TRUNCATE)) > 1) + return -EINVAL; + + if ((of->flags & ~_OPENFILE_MASK_PUBLIC) != 0) + return -EINVAL; + + return 0; +} + +int open_file_to_string(const OpenFile *of, char **ret) { + _cleanup_free_ char *options = NULL, *fname = NULL, *s = NULL; + bool has_fdname = false; + int r; + + assert(of); + assert(ret); + + s = shell_escape(of->path, ":"); + if (!s) + return -ENOMEM; + + r = path_extract_filename(of->path, &fname); + if (r < 0) + return r; + + has_fdname = !streq(fname, of->fdname); + if (has_fdname) + if (!strextend(&s, ":", of->fdname)) + return -ENOMEM; + + for (OpenFileFlag flag = OPENFILE_READ_ONLY; flag < _OPENFILE_MAX; flag <<= 1) + if (FLAGS_SET(of->flags, flag) && !strextend_with_separator(&options, ",", open_file_flags_to_string(flag))) + return -ENOMEM; + + if (options) + if (!(has_fdname ? 
strextend(&s, ":", options) : strextend(&s, "::", options))) + return -ENOMEM; + + *ret = TAKE_PTR(s); + + return 0; +} + +OpenFile *open_file_free(OpenFile *of) { + if (!of) + return NULL; + + free(of->path); + free(of->fdname); + return mfree(of); +} + +void open_file_free_many(OpenFile **head) { + assert(head); + + LIST_CLEAR(open_files, *head, open_file_free); +} + +static const char * const open_file_flags_table[_OPENFILE_MAX] = { + [OPENFILE_READ_ONLY] = "read-only", + [OPENFILE_APPEND] = "append", + [OPENFILE_TRUNCATE] = "truncate", + [OPENFILE_GRACEFUL] = "graceful", +}; + +DEFINE_STRING_TABLE_LOOKUP(open_file_flags, OpenFileFlag); diff --git a/src/shared/open-file.h b/src/shared/open-file.h new file mode 100644 index 0000000..bb63ec8 --- /dev/null +++ b/src/shared/open-file.h @@ -0,0 +1,36 @@ + +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "list.h" + +typedef enum OpenFileFlag { + OPENFILE_READ_ONLY = 1 << 0, + OPENFILE_APPEND = 1 << 1, + OPENFILE_TRUNCATE = 1 << 2, + OPENFILE_GRACEFUL = 1 << 3, + _OPENFILE_MAX, + _OPENFILE_INVALID = -EINVAL, + _OPENFILE_MASK_PUBLIC = OPENFILE_READ_ONLY | OPENFILE_APPEND | OPENFILE_TRUNCATE | OPENFILE_GRACEFUL, +} OpenFileFlag; + +typedef struct OpenFile { + char *path; + char *fdname; + OpenFileFlag flags; + LIST_FIELDS(struct OpenFile, open_files); +} OpenFile; + +int open_file_parse(const char *v, OpenFile **ret); + +int open_file_validate(const OpenFile *of); + +int open_file_to_string(const OpenFile *of, char **ret); + +OpenFile *open_file_free(OpenFile *of); +DEFINE_TRIVIAL_CLEANUP_FUNC(OpenFile*, open_file_free); + +void open_file_free_many(OpenFile **head); + +const char *open_file_flags_to_string(OpenFileFlag t) _const_; +OpenFileFlag open_file_flags_from_string(const char *t) _pure_; diff --git a/src/shared/openssl-util.c b/src/shared/openssl-util.c new file mode 100644 index 0000000..b0a5563 --- /dev/null +++ b/src/shared/openssl-util.c @@ -0,0 +1,1149 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "hexdecoct.h" +#include "openssl-util.h" +#include "string-util.h" + +#if HAVE_OPENSSL +/* For each error in the OpenSSL thread error queue, log the provided message and the OpenSSL error + * string. If there are no errors in the OpenSSL thread queue, this logs the message with "No OpenSSL + * errors." This logs at level debug. Returns -EIO (or -ENOMEM). */ +#define log_openssl_errors(fmt, ...) _log_openssl_errors(UNIQ, fmt, ##__VA_ARGS__) +#define _log_openssl_errors(u, fmt, ...) \ + ({ \ + size_t UNIQ_T(MAX, u) = 512 /* arbitrary, but openssl doc states it must be >= 256 */; \ + _cleanup_free_ char *UNIQ_T(BUF, u) = malloc(UNIQ_T(MAX, u)); \ + !UNIQ_T(BUF, u) \ + ? log_oom_debug() \ + : __log_openssl_errors(u, UNIQ_T(BUF, u), UNIQ_T(MAX, u), fmt, ##__VA_ARGS__) \ + ?: log_debug_errno(SYNTHETIC_ERRNO(EIO), fmt ": No OpenSSL errors.", ##__VA_ARGS__); \ + }) +#define __log_openssl_errors(u, buf, max, fmt, ...) \ + ({ \ + int UNIQ_T(R, u) = 0; \ + for (;;) { \ + unsigned long UNIQ_T(E, u) = ERR_get_error(); \ + if (UNIQ_T(E, u) == 0) \ + break; \ + ERR_error_string_n(UNIQ_T(E, u), buf, max); \ + UNIQ_T(R, u) = log_debug_errno(SYNTHETIC_ERRNO(EIO), fmt ": %s", ##__VA_ARGS__, buf); \ + } \ + UNIQ_T(R, u); \ + }) + +/* Parses a public key with PEM_read_PUBKEY() from the pem_size bytes at pem, which are read through an + * fmemopen() stream, and hands the resulting EVP_PKEY over to the caller in *ret. Returns 0 on success, or + * the negative value produced by log_oom_debug() / log_openssl_errors() on failure. */ +int openssl_pkey_from_pem(const void *pem, size_t pem_size, EVP_PKEY **ret) { + assert(pem); + assert(ret); + + _cleanup_fclose_ FILE *f = NULL; + f = fmemopen((void*) pem, pem_size, "r"); + if (!f) + return log_oom_debug(); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = PEM_read_PUBKEY(f, NULL, NULL, NULL); + if (!pkey) + return log_openssl_errors("Failed to parse PEM"); + + *ret = TAKE_PTR(pkey); + + return 0; +} + +/* Returns the number of bytes generated by the specified digest algorithm. This can be used only for + * fixed-size algorithms, e.g. md5, sha1, sha256, etc. Do not use this for variable-sized digest algorithms, + * e.g. shake128. 
Returns 0 on success, -EOPNOTSUPP if the algorithm is not supported, or < 0 for any other + * error. */ +int openssl_digest_size(const char *digest_alg, size_t *ret_digest_size) { + assert(digest_alg); + assert(ret_digest_size); + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL); +#else + const EVP_MD *md = EVP_get_digestbyname(digest_alg); +#endif + if (!md) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Digest algorithm '%s' not supported.", digest_alg); + + size_t digest_size; +#if OPENSSL_VERSION_MAJOR >= 3 + digest_size = EVP_MD_get_size(md); +#else + digest_size = EVP_MD_size(md); +#endif + if (digest_size == 0) + return log_openssl_errors("Failed to get Digest size"); + + *ret_digest_size = digest_size; + + return 0; +} + +/* Calculate the digest hash value for the provided data, using the specified digest algorithm. Returns 0 on + * success, -EOPNOTSUPP if the digest algorithm is not supported, or < 0 for any other error. 
*/ +int openssl_digest_many( + const char *digest_alg, + const struct iovec data[], + size_t n_data, + void **ret_digest, + size_t *ret_digest_size) { + + int r; + + assert(digest_alg); + assert(data || n_data == 0); + assert(ret_digest); + /* ret_digest_size is optional, as caller may already know the digest size */ + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL); +#else + const EVP_MD *md = EVP_get_digestbyname(digest_alg); +#endif + if (!md) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Digest algorithm '%s' not supported.", digest_alg); + + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = EVP_MD_CTX_new(); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_MD_CTX"); + + if (!EVP_DigestInit_ex(ctx, md, NULL)) + return log_openssl_errors("Failed to initialize EVP_MD_CTX"); + + for (size_t i = 0; i < n_data; i++) + if (!EVP_DigestUpdate(ctx, data[i].iov_base, data[i].iov_len)) + return log_openssl_errors("Failed to update Digest"); + + size_t digest_size; + r = openssl_digest_size(digest_alg, &digest_size); + if (r < 0) + return r; + + _cleanup_free_ void *buf = malloc(digest_size); + if (!buf) + return log_oom_debug(); + + unsigned int size; + if (!EVP_DigestFinal_ex(ctx, buf, &size)) + return log_openssl_errors("Failed to finalize Digest"); + + assert(size == digest_size); + + *ret_digest = TAKE_PTR(buf); + if (ret_digest_size) + *ret_digest_size = size; + + return 0; +} + +/* Calculate the HMAC digest hash value for the provided data, using the provided key and specified digest + * algorithm. Returns 0 on success, -EOPNOTSUPP if the digest algorithm is not supported, or < 0 for any + * other error. 
*/ +int openssl_hmac_many( + const char *digest_alg, + const void *key, + size_t key_size, + const struct iovec data[], + size_t n_data, + void **ret_digest, + size_t *ret_digest_size) { + + assert(digest_alg); + assert(key); + assert(data || n_data == 0); + assert(ret_digest); + /* ret_digest_size is optional, as caller may already know the digest size */ + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL); +#else + const EVP_MD *md = EVP_get_digestbyname(digest_alg); +#endif + if (!md) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Digest algorithm '%s' not supported.", digest_alg); + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(EVP_MAC_freep) EVP_MAC *mac = EVP_MAC_fetch(NULL, "HMAC", NULL); + if (!mac) + return log_openssl_errors("Failed to create new EVP_MAC"); + + _cleanup_(EVP_MAC_CTX_freep) EVP_MAC_CTX *ctx = EVP_MAC_CTX_new(mac); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_MAC_CTX"); + + _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new(); + if (!bld) + return log_openssl_errors("Failed to create new OSSL_PARAM_BLD"); + + if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_MAC_PARAM_DIGEST, (char*) digest_alg, 0)) + return log_openssl_errors("Failed to set HMAC OSSL_MAC_PARAM_DIGEST"); + + _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld); + if (!params) + return log_openssl_errors("Failed to build HMAC OSSL_PARAM"); + + if (!EVP_MAC_init(ctx, key, key_size, params)) + return log_openssl_errors("Failed to initialize EVP_MAC_CTX"); +#else + _cleanup_(HMAC_CTX_freep) HMAC_CTX *ctx = HMAC_CTX_new(); + if (!ctx) + return log_openssl_errors("Failed to create new HMAC_CTX"); + + if (!HMAC_Init_ex(ctx, key, key_size, md, NULL)) + return log_openssl_errors("Failed to initialize HMAC_CTX"); +#endif + + for (size_t i = 0; i < n_data; i++) +#if OPENSSL_VERSION_MAJOR >= 3 + if (!EVP_MAC_update(ctx, data[i].iov_base, data[i].iov_len)) 
+#else + if (!HMAC_Update(ctx, data[i].iov_base, data[i].iov_len)) +#endif + return log_openssl_errors("Failed to update HMAC"); + + size_t digest_size; +#if OPENSSL_VERSION_MAJOR >= 3 + digest_size = EVP_MAC_CTX_get_mac_size(ctx); +#else + digest_size = HMAC_size(ctx); +#endif + if (digest_size == 0) + return log_openssl_errors("Failed to get HMAC digest size"); + + _cleanup_free_ void *buf = malloc(digest_size); + if (!buf) + return log_oom_debug(); + +#if OPENSSL_VERSION_MAJOR >= 3 + size_t size; + if (!EVP_MAC_final(ctx, buf, &size, digest_size)) +#else + unsigned int size; + if (!HMAC_Final(ctx, buf, &size)) +#endif + return log_openssl_errors("Failed to finalize HMAC"); + + assert(size == digest_size); + + *ret_digest = TAKE_PTR(buf); + if (ret_digest_size) + *ret_digest_size = size; + + return 0; +} + +/* Symmetric Cipher encryption using the alg-bits-mode cipher, e.g. AES-128-CFB. The key is required and must + * be at least the minimum required key length for the cipher. The IV is optional but, if provided, it must + * be at least the minimum iv length for the cipher. If no IV is provided and the cipher requires one, a + * buffer of zeroes is used. Returns 0 on success, -EOPNOTSUPP if the cipher algorithm is not supported, or < + * 0 on any other error. 
*/ +int openssl_cipher_many( + const char *alg, + size_t bits, + const char *mode, + const void *key, + size_t key_size, + const void *iv, + size_t iv_size, + const struct iovec data[], + size_t n_data, + void **ret, + size_t *ret_size) { + + assert(alg); + assert(bits > 0); + assert(mode); + assert(key); + assert(iv || iv_size == 0); + assert(data || n_data == 0); + assert(ret); + assert(ret_size); + + _cleanup_free_ char *cipher_alg = NULL; + if (asprintf(&cipher_alg, "%s-%zu-%s", alg, bits, mode) < 0) + return log_oom_debug(); + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(EVP_CIPHER_freep) EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, cipher_alg, NULL); +#else + const EVP_CIPHER *cipher = EVP_get_cipherbyname(cipher_alg); +#endif + if (!cipher) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Cipher algorithm '%s' not supported.", cipher_alg); + + _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new(); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_CIPHER_CTX"); + + /* Verify enough key data was provided. */ + int cipher_key_length = EVP_CIPHER_key_length(cipher); + assert(cipher_key_length >= 0); + if ((size_t) cipher_key_length > key_size) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Not enough key bytes provided, require %d", cipher_key_length); + + /* Verify enough IV data was provided or, if no IV was provided, use a zeroed buffer for IV data. 
*/ + int cipher_iv_length = EVP_CIPHER_iv_length(cipher); + assert(cipher_iv_length >= 0); + _cleanup_free_ void *zero_iv = NULL; + if (iv_size == 0) { + zero_iv = malloc0(cipher_iv_length); + if (!zero_iv) + return log_oom_debug(); + + iv = zero_iv; + iv_size = (size_t) cipher_iv_length; + } + if ((size_t) cipher_iv_length > iv_size) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Not enough IV bytes provided, require %d", cipher_iv_length); + + if (!EVP_EncryptInit(ctx, cipher, key, iv)) + return log_openssl_errors("Failed to initialize EVP_CIPHER_CTX."); + + int cipher_block_size = EVP_CIPHER_CTX_block_size(ctx); + assert(cipher_block_size > 0); + + _cleanup_free_ uint8_t *buf = NULL; + size_t size = 0; + + for (size_t i = 0; i < n_data; i++) { + /* Cipher may produce (up to) input length + cipher block size of output. */ + if (!GREEDY_REALLOC(buf, size + data[i].iov_len + cipher_block_size)) + return log_oom_debug(); + + int update_size; + if (!EVP_EncryptUpdate(ctx, &buf[size], &update_size, data[i].iov_base, data[i].iov_len)) + return log_openssl_errors("Failed to update Cipher."); + + size += update_size; + } + + if (!GREEDY_REALLOC(buf, size + cipher_block_size)) + return log_oom_debug(); + + int final_size; + if (!EVP_EncryptFinal_ex(ctx, &buf[size], &final_size)) + return log_openssl_errors("Failed to finalize Cipher."); + + *ret = TAKE_PTR(buf); + *ret_size = size + final_size; + + return 0; +} + +/* Perform Single-Step (aka "Concat") KDF. Currently, this only supports using the digest for the auxiliary + * function. The derive_size parameter specifies how many bytes are derived. 
+ * + * For more details see: https://www.openssl.org/docs/manmaster/man7/EVP_KDF-SS.html */ +int kdf_ss_derive( + const char *digest, + const void *key, + size_t key_size, + const void *salt, + size_t salt_size, + const void *info, + size_t info_size, + size_t derive_size, + void **ret) { + +#if OPENSSL_VERSION_MAJOR >= 3 + assert(digest); + assert(key); + assert(derive_size > 0); + assert(ret); + + _cleanup_(EVP_KDF_freep) EVP_KDF *kdf = EVP_KDF_fetch(NULL, "SSKDF", NULL); + if (!kdf) + return log_openssl_errors("Failed to create new EVP_KDF"); + + _cleanup_(EVP_KDF_CTX_freep) EVP_KDF_CTX *ctx = EVP_KDF_CTX_new(kdf); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_KDF_CTX"); + + _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new(); + if (!bld) + return log_openssl_errors("Failed to create new OSSL_PARAM_BLD"); + + _cleanup_free_ void *buf = malloc(derive_size); + if (!buf) + return log_oom_debug(); + + if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_KDF_PARAM_DIGEST, (char*) digest, 0)) + return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_DIGEST"); + + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_KEY, (char*) key, key_size)) + return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_KEY"); + + if (salt) + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_SALT, (char*) salt, salt_size)) + return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_SALT"); + + if (info) + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_INFO, (char*) info, info_size)) + return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_INFO"); + + _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld); + if (!params) + return log_openssl_errors("Failed to build KDF-SS OSSL_PARAM"); + + if (EVP_KDF_derive(ctx, buf, derive_size, params) <= 0) + return log_openssl_errors("OpenSSL KDF-SS derive failed"); + + *ret = TAKE_PTR(buf); + + return 0; +#else + return 
log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "KDF-SS requires OpenSSL >= 3."); +#endif +} + +/* Perform Key-Based HMAC KDF. The mode must be "COUNTER" or "FEEDBACK". The parameter naming is from the + * OpenSSL api, and maps to SP800-108 naming as "...key, salt, info, and seed correspond to KI, Label, + * Context, and IV (respectively)...". The derive_size parameter specifies how many bytes are derived. + * + * For more details see: https://www.openssl.org/docs/manmaster/man7/EVP_KDF-KB.html */ +int kdf_kb_hmac_derive( + const char *mode, + const char *digest, + const void *key, + size_t key_size, + const void *salt, + size_t salt_size, + const void *info, + size_t info_size, + const void *seed, + size_t seed_size, + size_t derive_size, + void **ret) { + +#if OPENSSL_VERSION_MAJOR >= 3 + assert(mode); + assert(strcaseeq(mode, "COUNTER") || strcaseeq(mode, "FEEDBACK")); + assert(digest); + assert(key || key_size == 0); + assert(salt || salt_size == 0); + assert(info || info_size == 0); + assert(seed || seed_size == 0); + assert(derive_size > 0); + assert(ret); + + _cleanup_(EVP_KDF_freep) EVP_KDF *kdf = EVP_KDF_fetch(NULL, "KBKDF", NULL); + if (!kdf) + return log_openssl_errors("Failed to create new EVP_KDF"); + + _cleanup_(EVP_KDF_CTX_freep) EVP_KDF_CTX *ctx = EVP_KDF_CTX_new(kdf); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_KDF_CTX"); + + _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new(); + if (!bld) + return log_openssl_errors("Failed to create new OSSL_PARAM_BLD"); + + if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_KDF_PARAM_MAC, (char*) "HMAC", 0)) + return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_MAC"); + + if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_KDF_PARAM_MODE, (char*) mode, 0)) + return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_MODE"); + + if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_KDF_PARAM_DIGEST, (char*) digest, 0)) + return log_openssl_errors("Failed to add KDF-KB 
OSSL_KDF_PARAM_DIGEST"); + + if (key) + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_KEY, (char*) key, key_size)) + return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_KEY"); + + if (salt) + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_SALT, (char*) salt, salt_size)) + return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_SALT"); + + if (info) + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_INFO, (char*) info, info_size)) + return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_INFO"); + + if (seed) + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_SEED, (char*) seed, seed_size)) + return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_SEED"); + + _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld); + if (!params) + return log_openssl_errors("Failed to build KDF-KB OSSL_PARAM"); + + _cleanup_free_ void *buf = malloc(derive_size); + if (!buf) + return log_oom_debug(); + + if (EVP_KDF_derive(ctx, buf, derive_size, params) <= 0) + return log_openssl_errors("OpenSSL KDF-KB derive failed"); + + *ret = TAKE_PTR(buf); + + return 0; +#else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "KDF-KB requires OpenSSL >= 3."); +#endif +} + +int rsa_encrypt_bytes( + EVP_PKEY *pkey, + const void *decrypted_key, + size_t decrypted_key_size, + void **ret_encrypt_key, + size_t *ret_encrypt_key_size) { + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = NULL; + _cleanup_free_ void *b = NULL; + size_t l; + + ctx = EVP_PKEY_CTX_new(pkey, NULL); + if (!ctx) + return log_openssl_errors("Failed to allocate public key context"); + + if (EVP_PKEY_encrypt_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize public key context"); + + if (EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_PADDING) <= 0) + return log_openssl_errors("Failed to configure PKCS#1 padding"); + + if (EVP_PKEY_encrypt(ctx, NULL, &l, decrypted_key, decrypted_key_size) <= 0) + return 
log_openssl_errors("Failed to determine encrypted key size"); + + b = malloc(l); + if (!b) + return -ENOMEM; + + if (EVP_PKEY_encrypt(ctx, b, &l, decrypted_key, decrypted_key_size) <= 0) + return log_openssl_errors("Failed to determine encrypted key size"); + + *ret_encrypt_key = TAKE_PTR(b); + *ret_encrypt_key_size = l; + + return 0; +} + +/* Encrypt the key data using RSA-OAEP with the provided label and specified digest algorithm. Returns 0 on + * success, -EOPNOTSUPP if the digest algorithm is not supported, or < 0 for any other error. */ +int rsa_oaep_encrypt_bytes( + const EVP_PKEY *pkey, + const char *digest_alg, + const char *label, + const void *decrypted_key, + size_t decrypted_key_size, + void **ret_encrypt_key, + size_t *ret_encrypt_key_size) { + + assert(pkey); + assert(digest_alg); + assert(label); + assert(decrypted_key); + assert(decrypted_key_size > 0); + assert(ret_encrypt_key); + assert(ret_encrypt_key_size); + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL); +#else + const EVP_MD *md = EVP_get_digestbyname(digest_alg); +#endif + if (!md) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Digest algorithm '%s' not supported.", digest_alg); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new((EVP_PKEY*) pkey, NULL); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_PKEY_CTX"); + + if (EVP_PKEY_encrypt_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize EVP_PKEY_CTX"); + + if (EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_OAEP_PADDING) <= 0) + return log_openssl_errors("Failed to configure RSA-OAEP padding"); + + if (EVP_PKEY_CTX_set_rsa_oaep_md(ctx, md) <= 0) + return log_openssl_errors("Failed to configure RSA-OAEP MD"); + + _cleanup_free_ char *duplabel = strdup(label); + if (!duplabel) + return log_oom_debug(); + + if (EVP_PKEY_CTX_set0_rsa_oaep_label(ctx, duplabel, strlen(duplabel) + 1) <= 0) + return log_openssl_errors("Failed to 
configure RSA-OAEP label"); + /* ctx owns this now, don't free */ + TAKE_PTR(duplabel); + + size_t size = 0; + if (EVP_PKEY_encrypt(ctx, NULL, &size, decrypted_key, decrypted_key_size) <= 0) + return log_openssl_errors("Failed to determine RSA-OAEP encrypted key size"); + + _cleanup_free_ void *buf = malloc(size); + if (!buf) + return log_oom_debug(); + + if (EVP_PKEY_encrypt(ctx, buf, &size, decrypted_key, decrypted_key_size) <= 0) + return log_openssl_errors("Failed to RSA-OAEP encrypt"); + + *ret_encrypt_key = TAKE_PTR(buf); + *ret_encrypt_key_size = size; + + return 0; +} + +int rsa_pkey_to_suitable_key_size( + EVP_PKEY *pkey, + size_t *ret_suitable_key_size) { + + size_t suitable_key_size; + int bits; + + assert(pkey); + assert(ret_suitable_key_size); + + /* Analyzes the specified public key and that it is RSA. If so, will return a suitable size for a + * disk encryption key to encrypt with RSA for use in PKCS#11 security token schemes. */ + + if (EVP_PKEY_base_id(pkey) != EVP_PKEY_RSA) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "X.509 certificate does not refer to RSA key."); + + bits = EVP_PKEY_bits(pkey); + log_debug("Bits in RSA key: %i", bits); + + /* We use PKCS#1 padding for the RSA cleartext, hence let's leave some extra space for it, hence only + * generate a random key half the size of the RSA length */ + suitable_key_size = bits / 8 / 2; + + if (suitable_key_size < 1) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Uh, RSA key size too short?"); + + *ret_suitable_key_size = suitable_key_size; + return 0; +} + +/* Generate RSA public key from provided "n" and "e" values. Note that if "e" is a number (e.g. uint32_t), it + * must be provided here big-endian, e.g. wrap it with htobe32(). 
*/ +int rsa_pkey_from_n_e(const void *n, size_t n_size, const void *e, size_t e_size, EVP_PKEY **ret) { + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + + assert(n); + assert(e); + assert(ret); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_RSA, NULL); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_PKEY_CTX"); + + _cleanup_(BN_freep) BIGNUM *bn_n = BN_bin2bn(n, n_size, NULL); + if (!bn_n) + return log_openssl_errors("Failed to create BIGNUM for RSA n"); + + _cleanup_(BN_freep) BIGNUM *bn_e = BN_bin2bn(e, e_size, NULL); + if (!bn_e) + return log_openssl_errors("Failed to create BIGNUM for RSA e"); + +#if OPENSSL_VERSION_MAJOR >= 3 + if (EVP_PKEY_fromdata_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize EVP_PKEY_CTX"); + + _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new(); + if (!bld) + return log_openssl_errors("Failed to create new OSSL_PARAM_BLD"); + + if (!OSSL_PARAM_BLD_push_BN(bld, OSSL_PKEY_PARAM_RSA_N, bn_n)) + return log_openssl_errors("Failed to set RSA OSSL_PKEY_PARAM_RSA_N"); + + if (!OSSL_PARAM_BLD_push_BN(bld, OSSL_PKEY_PARAM_RSA_E, bn_e)) + return log_openssl_errors("Failed to set RSA OSSL_PKEY_PARAM_RSA_E"); + + _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld); + if (!params) + return log_openssl_errors("Failed to build RSA OSSL_PARAM"); + + if (EVP_PKEY_fromdata(ctx, &pkey, EVP_PKEY_PUBLIC_KEY, params) <= 0) + return log_openssl_errors("Failed to create RSA EVP_PKEY"); +#else + _cleanup_(RSA_freep) RSA *rsa_key = RSA_new(); + if (!rsa_key) + return log_openssl_errors("Failed to create new RSA"); + + if (!RSA_set0_key(rsa_key, bn_n, bn_e, NULL)) + return log_openssl_errors("Failed to set RSA n/e"); + /* rsa_key owns these now, don't free */ + TAKE_PTR(bn_n); + TAKE_PTR(bn_e); + + pkey = EVP_PKEY_new(); + if (!pkey) + return log_openssl_errors("Failed to create new EVP_PKEY"); + + if (!EVP_PKEY_assign_RSA(pkey, rsa_key)) + 
return log_openssl_errors("Failed to assign RSA key"); + /* pkey owns this now, don't free */ + TAKE_PTR(rsa_key); +#endif + + *ret = TAKE_PTR(pkey); + + return 0; +} + +/* Get the "n" and "e" values from the pkey. The values are returned in "bin" format, i.e. BN_bn2bin(). */ +int rsa_pkey_to_n_e( + const EVP_PKEY *pkey, + void **ret_n, + size_t *ret_n_size, + void **ret_e, + size_t *ret_e_size) { + + assert(pkey); + assert(ret_n); + assert(ret_n_size); + assert(ret_e); + assert(ret_e_size); + +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_(BN_freep) BIGNUM *bn_n = NULL; + if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_RSA_N, &bn_n)) + return log_openssl_errors("Failed to get RSA n"); + + _cleanup_(BN_freep) BIGNUM *bn_e = NULL; + if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_RSA_E, &bn_e)) + return log_openssl_errors("Failed to get RSA e"); +#else + const RSA *rsa = EVP_PKEY_get0_RSA((EVP_PKEY*) pkey); + if (!rsa) + return log_openssl_errors("Failed to get RSA key from public key"); + + const BIGNUM *bn_n = RSA_get0_n(rsa); + if (!bn_n) + return log_openssl_errors("Failed to get RSA n"); + + const BIGNUM *bn_e = RSA_get0_e(rsa); + if (!bn_e) + return log_openssl_errors("Failed to get RSA e"); +#endif + + size_t n_size = BN_num_bytes(bn_n), e_size = BN_num_bytes(bn_e); + _cleanup_free_ void *n = malloc(n_size), *e = malloc(e_size); + if (!n || !e) + return log_oom_debug(); + + assert(BN_bn2bin(bn_n, n) == (int) n_size); + assert(BN_bn2bin(bn_e, e) == (int) e_size); + + *ret_n = TAKE_PTR(n); + *ret_n_size = n_size; + *ret_e = TAKE_PTR(e); + *ret_e_size = e_size; + + return 0; +} + +/* Generate a new RSA key with the specified number of bits. 
*/ +int rsa_pkey_new(size_t bits, EVP_PKEY **ret) { + assert(ret); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_RSA, NULL); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_PKEY_CTX"); + + if (EVP_PKEY_keygen_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize EVP_PKEY_CTX"); + + if (EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, (int) bits) <= 0) + return log_openssl_errors("Failed to set RSA bits to %zu", bits); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + if (EVP_PKEY_keygen(ctx, &pkey) <= 0) + return log_openssl_errors("Failed to generate ECC key"); + + *ret = TAKE_PTR(pkey); + + return 0; +} + +/* Generate ECC public key from provided curve ID and x/y points. */ +int ecc_pkey_from_curve_x_y( + int curve_id, + const void *x, + size_t x_size, + const void *y, + size_t y_size, + EVP_PKEY **ret) { + + assert(x); + assert(y); + assert(ret); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_EC, NULL); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_PKEY_CTX"); + + _cleanup_(BN_freep) BIGNUM *bn_x = BN_bin2bn(x, x_size, NULL); + if (!bn_x) + return log_openssl_errors("Failed to create BIGNUM x"); + + _cleanup_(BN_freep) BIGNUM *bn_y = BN_bin2bn(y, y_size, NULL); + if (!bn_y) + return log_openssl_errors("Failed to create BIGNUM y"); + + _cleanup_(EC_GROUP_freep) EC_GROUP *group = EC_GROUP_new_by_curve_name(curve_id); + if (!group) + return log_openssl_errors("ECC curve id %d not supported", curve_id); + + _cleanup_(EC_POINT_freep) EC_POINT *point = EC_POINT_new(group); + if (!point) + return log_openssl_errors("Failed to create new EC_POINT"); + + if (!EC_POINT_set_affine_coordinates(group, point, bn_x, bn_y, NULL)) + return log_openssl_errors("Failed to set ECC coordinates"); + +#if OPENSSL_VERSION_MAJOR >= 3 + if (EVP_PKEY_fromdata_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize EVP_PKEY_CTX"); + + _cleanup_(OSSL_PARAM_BLD_freep) 
OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new(); + if (!bld) + return log_openssl_errors("Failed to create new OSSL_PARAM_BLD"); + + if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_PKEY_PARAM_GROUP_NAME, (char*) OSSL_EC_curve_nid2name(curve_id), 0)) + return log_openssl_errors("Failed to add ECC OSSL_PKEY_PARAM_GROUP_NAME"); + + _cleanup_(OPENSSL_freep) void *pbuf = NULL; + size_t pbuf_len = 0; + pbuf_len = EC_POINT_point2buf(group, point, POINT_CONVERSION_UNCOMPRESSED, (unsigned char**) &pbuf, NULL); + if (pbuf_len == 0) + return log_openssl_errors("Failed to convert ECC point to buffer"); + + if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_PKEY_PARAM_PUB_KEY, pbuf, pbuf_len)) + return log_openssl_errors("Failed to add ECC OSSL_PKEY_PARAM_PUB_KEY"); + + _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld); + if (!params) + return log_openssl_errors("Failed to build ECC OSSL_PARAM"); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + if (EVP_PKEY_fromdata(ctx, &pkey, EVP_PKEY_PUBLIC_KEY, params) <= 0) + return log_openssl_errors("Failed to create ECC EVP_PKEY"); +#else + _cleanup_(EC_KEY_freep) EC_KEY *eckey = EC_KEY_new(); + if (!eckey) + return log_openssl_errors("Failed to create new EC_KEY"); + + if (!EC_KEY_set_group(eckey, group)) + return log_openssl_errors("Failed to set ECC group"); + + if (!EC_KEY_set_public_key(eckey, point)) + return log_openssl_errors("Failed to set ECC point"); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = EVP_PKEY_new(); + if (!pkey) + return log_openssl_errors("Failed to create new EVP_PKEY"); + + if (!EVP_PKEY_assign_EC_KEY(pkey, eckey)) + return log_openssl_errors("Failed to assign ECC key"); + /* pkey owns this now, don't free */ + TAKE_PTR(eckey); +#endif + + *ret = TAKE_PTR(pkey); + + return 0; +} + +int ecc_pkey_to_curve_x_y( + const EVP_PKEY *pkey, + int *ret_curve_id, + void **ret_x, + size_t *ret_x_size, + void **ret_y, + size_t *ret_y_size) { + + _cleanup_(BN_freep) BIGNUM *bn_x = NULL, *bn_y = 
NULL; + int curve_id; + + assert(pkey); + +#if OPENSSL_VERSION_MAJOR >= 3 + size_t name_size; + if (!EVP_PKEY_get_utf8_string_param(pkey, OSSL_PKEY_PARAM_GROUP_NAME, NULL, 0, &name_size)) + return log_openssl_errors("Failed to get ECC group name size"); + + _cleanup_free_ char *name = new(char, name_size + 1); + if (!name) + return log_oom_debug(); + + if (!EVP_PKEY_get_utf8_string_param(pkey, OSSL_PKEY_PARAM_GROUP_NAME, name, name_size + 1, NULL)) + return log_openssl_errors("Failed to get ECC group name"); + + curve_id = OBJ_sn2nid(name); + if (curve_id == NID_undef) + return log_openssl_errors("Failed to get ECC curve id"); + + if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_EC_PUB_X, &bn_x)) + return log_openssl_errors("Failed to get ECC point x"); + + if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_EC_PUB_Y, &bn_y)) + return log_openssl_errors("Failed to get ECC point y"); +#else + const EC_KEY *eckey = EVP_PKEY_get0_EC_KEY((EVP_PKEY*) pkey); + if (!eckey) + return log_openssl_errors("Failed to get EC_KEY"); + + const EC_GROUP *group = EC_KEY_get0_group(eckey); + if (!group) + return log_openssl_errors("Failed to get EC_GROUP"); + + curve_id = EC_GROUP_get_curve_name(group); + if (curve_id == NID_undef) + return log_openssl_errors("Failed to get ECC curve id"); + + const EC_POINT *point = EC_KEY_get0_public_key(eckey); + if (!point) + return log_openssl_errors("Failed to get EC_POINT"); + + bn_x = BN_new(); + bn_y = BN_new(); + if (!bn_x || !bn_y) + return log_openssl_errors("Failed to create new BIGNUM"); + + if (!EC_POINT_get_affine_coordinates(group, point, bn_x, bn_y, NULL)) + return log_openssl_errors("Failed to get ECC x/y."); +#endif + + size_t x_size = BN_num_bytes(bn_x), y_size = BN_num_bytes(bn_y); + _cleanup_free_ void *x = malloc(x_size), *y = malloc(y_size); + if (!x || !y) + return log_oom_debug(); + + assert(BN_bn2bin(bn_x, x) == (int) x_size); + assert(BN_bn2bin(bn_y, y) == (int) y_size); + + if (ret_curve_id) + *ret_curve_id = curve_id; + if 
(ret_x) + *ret_x = TAKE_PTR(x); + if (ret_x_size) + *ret_x_size = x_size; + if (ret_y) + *ret_y = TAKE_PTR(y); + if (ret_y_size) + *ret_y_size = y_size; + + return 0; +} + +/* Generate a new ECC key for the specified ECC curve id. */ +int ecc_pkey_new(int curve_id, EVP_PKEY **ret) { + assert(ret); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_EC, NULL); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_PKEY_CTX"); + + if (EVP_PKEY_keygen_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize EVP_PKEY_CTX"); + + if (EVP_PKEY_CTX_set_ec_paramgen_curve_nid(ctx, curve_id) <= 0) + return log_openssl_errors("Failed to set ECC curve %d", curve_id); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + if (EVP_PKEY_keygen(ctx, &pkey) <= 0) + return log_openssl_errors("Failed to generate ECC key"); + + *ret = TAKE_PTR(pkey); + + return 0; +} + +/* Perform ECDH to derive an ECC shared secret between the provided private key and public peer key. For two + * keys, this will result in the same shared secret in either direction; ECDH using Alice's private key and + * Bob's public (peer) key will result in the same shared secret as ECDH using Bob's private key and Alice's + * public (peer) key. On success, this returns 0 and provides the shared secret; otherwise this returns an + * error. 
*/ +int ecc_ecdh(const EVP_PKEY *private_pkey, + const EVP_PKEY *peer_pkey, + void **ret_shared_secret, + size_t *ret_shared_secret_size) { + + assert(private_pkey); + assert(peer_pkey); + assert(ret_shared_secret); + assert(ret_shared_secret_size); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new((EVP_PKEY*) private_pkey, NULL); + if (!ctx) + return log_openssl_errors("Failed to create new EVP_PKEY_CTX"); + + if (EVP_PKEY_derive_init(ctx) <= 0) + return log_openssl_errors("Failed to initialize EVP_PKEY_CTX"); + + if (EVP_PKEY_derive_set_peer(ctx, (EVP_PKEY*) peer_pkey) <= 0) + return log_openssl_errors("Failed to set ECC derive peer"); + + size_t shared_secret_size; + if (EVP_PKEY_derive(ctx, NULL, &shared_secret_size) <= 0) + return log_openssl_errors("Failed to get ECC shared secret size"); + + _cleanup_free_ void *shared_secret = malloc(shared_secret_size); + if (!shared_secret) + return log_oom_debug(); + + if (EVP_PKEY_derive(ctx, (unsigned char*) shared_secret, &shared_secret_size) <= 0) + return log_openssl_errors("Failed to derive ECC shared secret"); + + *ret_shared_secret = TAKE_PTR(shared_secret); + *ret_shared_secret_size = shared_secret_size; + + return 0; +} + +int pubkey_fingerprint(EVP_PKEY *pk, const EVP_MD *md, void **ret, size_t *ret_size) { + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX* m = NULL; + _cleanup_free_ void *d = NULL, *h = NULL; + int sz, lsz, msz; + unsigned umsz; + unsigned char *dd; + + /* Calculates a message digest of the DER encoded public key */ + + assert(pk); + assert(md); + assert(ret); + assert(ret_size); + + sz = i2d_PublicKey(pk, NULL); + if (sz < 0) + return log_openssl_errors("Unable to convert public key to DER format"); + + dd = d = malloc(sz); + if (!d) + return log_oom_debug(); + + lsz = i2d_PublicKey(pk, &dd); + if (lsz < 0) + return log_openssl_errors("Unable to convert public key to DER format"); + + m = EVP_MD_CTX_new(); + if (!m) + return log_openssl_errors("Failed to create new EVP_MD_CTX"); 
+ + if (EVP_DigestInit_ex(m, md, NULL) != 1) + return log_openssl_errors("Failed to initialize %s context", EVP_MD_name(md)); + + if (EVP_DigestUpdate(m, d, lsz) != 1) + return log_openssl_errors("Failed to run %s context", EVP_MD_name(md)); + + msz = EVP_MD_size(md); + assert(msz > 0); + + h = malloc(msz); + if (!h) + return log_oom_debug(); + + umsz = msz; + if (EVP_DigestFinal_ex(m, h, &umsz) != 1) + return log_openssl_errors("Failed to finalize hash context"); + + assert(umsz == (unsigned) msz); + + *ret = TAKE_PTR(h); + *ret_size = msz; + + return 0; +} + +int digest_and_sign( + const EVP_MD *md, + EVP_PKEY *privkey, + const void *data, size_t size, + void **ret, size_t *ret_size) { + + assert(privkey); + assert(ret); + assert(ret_size); + + if (size == 0) + data = ""; /* make sure to pass a valid pointer to OpenSSL */ + else { + assert(data); + + if (size == SIZE_MAX) /* If SIZE_MAX input is a string whose size we determine automatically */ + size = strlen(data); + } + + _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX* mdctx = EVP_MD_CTX_new(); + if (!mdctx) + return log_openssl_errors("Failed to create new EVP_MD_CTX"); + + if (EVP_DigestSignInit(mdctx, NULL, md, NULL, privkey) != 1) + return log_openssl_errors("Failed to initialize signature context"); + + /* Determine signature size */ + size_t ss; + if (EVP_DigestSign(mdctx, NULL, &ss, data, size) != 1) + return log_openssl_errors("Failed to determine size of signature"); + + _cleanup_free_ void *sig = malloc(ss); + if (!sig) + return log_oom_debug(); + + if (EVP_DigestSign(mdctx, sig, &ss, data, size) != 1) + return log_openssl_errors("Failed to sign data"); + + *ret = TAKE_PTR(sig); + *ret_size = ss; + return 0; +} + +# if PREFER_OPENSSL +int string_hashsum( + const char *s, + size_t len, + const char *md_algorithm, + char **ret) { + + _cleanup_free_ void *hash = NULL; + size_t hash_size; + _cleanup_free_ char *enc = NULL; + int r; + + assert(s || len == 0); + assert(md_algorithm); + assert(ret); + + r = 
openssl_digest(md_algorithm, s, len, &hash, &hash_size); + if (r < 0) + return r; + + enc = hexmem(hash, hash_size); + if (!enc) + return -ENOMEM; + + *ret = TAKE_PTR(enc); + return 0; +} +# endif +#endif + +int x509_fingerprint(X509 *cert, uint8_t buffer[static SHA256_DIGEST_SIZE]) { +#if HAVE_OPENSSL + _cleanup_free_ uint8_t *der = NULL; + int dersz; + + assert(cert); + + dersz = i2d_X509(cert, &der); + if (dersz < 0) + return log_openssl_errors("Unable to convert PEM certificate to DER format"); + + sha256_direct(der, dersz, buffer); + return 0; +#else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL is not supported, cannot calculate X509 fingerprint: %m"); +#endif +} diff --git a/src/shared/openssl-util.h b/src/shared/openssl-util.h new file mode 100644 index 0000000..e3f34a8 --- /dev/null +++ b/src/shared/openssl-util.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "iovec-util.h" +#include "macro.h" +#include "sha256.h" + +#define X509_FINGERPRINT_SIZE SHA256_DIGEST_SIZE + +#if HAVE_OPENSSL +# include +# include +# include +# include +# include +# include +# include +# include +# include +# ifndef OPENSSL_VERSION_MAJOR +/* OPENSSL_VERSION_MAJOR macro was added in OpenSSL 3. Thus, if it doesn't exist, we must be before OpenSSL 3. 
*/ +# define OPENSSL_VERSION_MAJOR 1 +# endif +# if OPENSSL_VERSION_MAJOR >= 3 +# include +# include +# include +# endif + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL_MACRO(void*, OPENSSL_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(X509_NAME*, X509_NAME_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_PKEY_CTX*, EVP_PKEY_CTX_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_CIPHER_CTX*, EVP_CIPHER_CTX_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_POINT*, EC_POINT_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_GROUP*, EC_GROUP_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(BIGNUM*, BN_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(BN_CTX*, BN_CTX_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(ECDSA_SIG*, ECDSA_SIG_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(PKCS7*, PKCS7_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(SSL*, SSL_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(BIO*, BIO_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MD_CTX*, EVP_MD_CTX_free, NULL); +#if OPENSSL_VERSION_MAJOR >= 3 +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_CIPHER*, EVP_CIPHER_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_KDF*, EVP_KDF_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_KDF_CTX*, EVP_KDF_CTX_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MAC*, EVP_MAC_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MAC_CTX*, EVP_MAC_CTX_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MD*, EVP_MD_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OSSL_PARAM*, OSSL_PARAM_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OSSL_PARAM_BLD*, OSSL_PARAM_BLD_free, NULL); +#else +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_KEY*, EC_KEY_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(HMAC_CTX*, HMAC_CTX_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(RSA*, RSA_free, NULL); +#endif + +static inline void sk_X509_free_allp(STACK_OF(X509) **sk) { + if (!sk || !*sk) + return; + + sk_X509_pop_free(*sk, X509_free); +} + +int openssl_pkey_from_pem(const void *pem, size_t pem_size, EVP_PKEY **ret); + +int 
openssl_digest_size(const char *digest_alg, size_t *ret_digest_size); + +int openssl_digest_many(const char *digest_alg, const struct iovec data[], size_t n_data, void **ret_digest, size_t *ret_digest_size); + +static inline int openssl_digest(const char *digest_alg, const void *buf, size_t len, void **ret_digest, size_t *ret_digest_size) { + return openssl_digest_many(digest_alg, &IOVEC_MAKE((void*) buf, len), 1, ret_digest, ret_digest_size); +} + +int openssl_hmac_many(const char *digest_alg, const void *key, size_t key_size, const struct iovec data[], size_t n_data, void **ret_digest, size_t *ret_digest_size); + +static inline int openssl_hmac(const char *digest_alg, const void *key, size_t key_size, const void *buf, size_t len, void **ret_digest, size_t *ret_digest_size) { + return openssl_hmac_many(digest_alg, key, key_size, &IOVEC_MAKE((void*) buf, len), 1, ret_digest, ret_digest_size); +} + +int openssl_cipher_many(const char *alg, size_t bits, const char *mode, const void *key, size_t key_size, const void *iv, size_t iv_size, const struct iovec data[], size_t n_data, void **ret, size_t *ret_size); + +static inline int openssl_cipher(const char *alg, size_t bits, const char *mode, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *buf, size_t len, void **ret, size_t *ret_size) { + return openssl_cipher_many(alg, bits, mode, key, key_size, iv, iv_size, &IOVEC_MAKE((void*) buf, len), 1, ret, ret_size); +} + +int kdf_ss_derive(const char *digest, const void *key, size_t key_size, const void *salt, size_t salt_size, const void *info, size_t info_size, size_t derive_size, void **ret); + +int kdf_kb_hmac_derive(const char *mode, const char *digest, const void *key, size_t key_size, const void *salt, size_t salt_size, const void *info, size_t info_size, const void *seed, size_t seed_size, size_t derive_size, void **ret); + +int rsa_encrypt_bytes(EVP_PKEY *pkey, const void *decrypted_key, size_t decrypted_key_size, void **ret_encrypt_key, 
size_t *ret_encrypt_key_size); + +int rsa_oaep_encrypt_bytes(const EVP_PKEY *pkey, const char *digest_alg, const char *label, const void *decrypted_key, size_t decrypted_key_size, void **ret_encrypt_key, size_t *ret_encrypt_key_size); + +int rsa_pkey_to_suitable_key_size(EVP_PKEY *pkey, size_t *ret_suitable_key_size); + +int rsa_pkey_new(size_t bits, EVP_PKEY **ret); + +int rsa_pkey_from_n_e(const void *n, size_t n_size, const void *e, size_t e_size, EVP_PKEY **ret); + +int rsa_pkey_to_n_e(const EVP_PKEY *pkey, void **ret_n, size_t *ret_n_size, void **ret_e, size_t *ret_e_size); + +int ecc_pkey_from_curve_x_y(int curve_id, const void *x, size_t x_size, const void *y, size_t y_size, EVP_PKEY **ret); + +int ecc_pkey_to_curve_x_y(const EVP_PKEY *pkey, int *ret_curve_id, void **ret_x, size_t *ret_x_size, void **ret_y, size_t *ret_y_size); + +int ecc_pkey_new(int curve_id, EVP_PKEY **ret); + +int ecc_ecdh(const EVP_PKEY *private_pkey, const EVP_PKEY *peer_pkey, void **ret_shared_secret, size_t *ret_shared_secret_size); + +int pubkey_fingerprint(EVP_PKEY *pk, const EVP_MD *md, void **ret, size_t *ret_size); + +int digest_and_sign(const EVP_MD *md, EVP_PKEY *privkey, const void *data, size_t size, void **ret, size_t *ret_size); + +#else + +typedef struct X509 X509; +typedef struct EVP_PKEY EVP_PKEY; + +static inline void *X509_free(X509 *p) { + assert(p == NULL); + return NULL; +} + +static inline void *EVP_PKEY_free(EVP_PKEY *p) { + assert(p == NULL); + return NULL; +} + +#endif + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(X509*, X509_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_PKEY*, EVP_PKEY_free, NULL); + +int x509_fingerprint(X509 *cert, uint8_t buffer[static X509_FINGERPRINT_SIZE]); + +#if PREFER_OPENSSL +/* The openssl definition */ +typedef const EVP_MD* hash_md_t; +typedef const EVP_MD* hash_algorithm_t; +typedef int elliptic_curve_t; +typedef EVP_MD_CTX* hash_context_t; +# define OPENSSL_OR_GCRYPT(a, b) (a) + +#elif HAVE_GCRYPT + +# include + +/* The gcrypt 
definition */ +typedef int hash_md_t; +typedef const char* hash_algorithm_t; +typedef const char* elliptic_curve_t; +typedef gcry_md_hd_t hash_context_t; +# define OPENSSL_OR_GCRYPT(a, b) (b) +#endif + +#if PREFER_OPENSSL +int string_hashsum(const char *s, size_t len, const char *md_algorithm, char **ret); + +static inline int string_hashsum_sha224(const char *s, size_t len, char **ret) { + return string_hashsum(s, len, "SHA224", ret); +} + +static inline int string_hashsum_sha256(const char *s, size_t len, char **ret) { + return string_hashsum(s, len, "SHA256", ret); +} +#endif diff --git a/src/shared/output-mode.c b/src/shared/output-mode.c new file mode 100644 index 0000000..026bf19 --- /dev/null +++ b/src/shared/output-mode.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "output-mode.h" +#include "string-table.h" + +JsonFormatFlags output_mode_to_json_format_flags(OutputMode m) { + + switch (m) { + + case OUTPUT_JSON_SSE: + return JSON_FORMAT_SSE; + + case OUTPUT_JSON_SEQ: + return JSON_FORMAT_SEQ; + + case OUTPUT_JSON_PRETTY: + return JSON_FORMAT_PRETTY; + + default: + return JSON_FORMAT_NEWLINE; + } +} + +static const char *const output_mode_table[_OUTPUT_MODE_MAX] = { + [OUTPUT_SHORT] = "short", + [OUTPUT_SHORT_FULL] = "short-full", + [OUTPUT_SHORT_ISO] = "short-iso", + [OUTPUT_SHORT_ISO_PRECISE] = "short-iso-precise", + [OUTPUT_SHORT_PRECISE] = "short-precise", + [OUTPUT_SHORT_MONOTONIC] = "short-monotonic", + [OUTPUT_SHORT_DELTA] = "short-delta", + [OUTPUT_SHORT_UNIX] = "short-unix", + [OUTPUT_VERBOSE] = "verbose", + [OUTPUT_EXPORT] = "export", + [OUTPUT_JSON] = "json", + [OUTPUT_JSON_PRETTY] = "json-pretty", + [OUTPUT_JSON_SSE] = "json-sse", + [OUTPUT_JSON_SEQ] = "json-seq", + [OUTPUT_CAT] = "cat", + [OUTPUT_WITH_UNIT] = "with-unit", +}; + +DEFINE_STRING_TABLE_LOOKUP(output_mode, OutputMode); diff --git a/src/shared/output-mode.h b/src/shared/output-mode.h new file mode 100644 index 0000000..8683f57 --- /dev/null +++ 
b/src/shared/output-mode.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "json.h" +#include "macro.h" + +typedef enum OutputMode { + OUTPUT_SHORT, + OUTPUT_SHORT_FULL, + OUTPUT_SHORT_ISO, + OUTPUT_SHORT_ISO_PRECISE, + OUTPUT_SHORT_PRECISE, + OUTPUT_SHORT_MONOTONIC, + OUTPUT_SHORT_DELTA, + OUTPUT_SHORT_UNIX, + OUTPUT_VERBOSE, + OUTPUT_EXPORT, + OUTPUT_JSON, + OUTPUT_JSON_PRETTY, + OUTPUT_JSON_SSE, + OUTPUT_JSON_SEQ, + OUTPUT_CAT, + OUTPUT_WITH_UNIT, + _OUTPUT_MODE_MAX, + _OUTPUT_MODE_INVALID = -EINVAL, +} OutputMode; + +static inline bool OUTPUT_MODE_IS_JSON(OutputMode m) { + return IN_SET(m, OUTPUT_JSON, OUTPUT_JSON_PRETTY, OUTPUT_JSON_SSE, OUTPUT_JSON_SEQ); +} + +/* The output flags definitions are shared by the logs and process tree output. Some apply to both, some only to the + * logs output, others only to the process tree output. */ + +typedef enum OutputFlags { + OUTPUT_SHOW_ALL = 1 << 0, + OUTPUT_FULL_WIDTH = 1 << 1, + OUTPUT_COLOR = 1 << 2, + + /* Specific to log output */ + OUTPUT_WARN_CUTOFF = 1 << 3, + OUTPUT_CATALOG = 1 << 4, + OUTPUT_BEGIN_NEWLINE = 1 << 5, + OUTPUT_UTC = 1 << 6, + OUTPUT_NO_HOSTNAME = 1 << 7, + OUTPUT_TRUNCATE_NEWLINE = 1 << 8, + + /* Specific to process tree output */ + OUTPUT_KERNEL_THREADS = 1 << 9, + OUTPUT_CGROUP_XATTRS = 1 << 10, + OUTPUT_CGROUP_ID = 1 << 11, +} OutputFlags; + +JsonFormatFlags output_mode_to_json_format_flags(OutputMode m); + +const char* output_mode_to_string(OutputMode m) _const_; +OutputMode output_mode_from_string(const char *s) _pure_; diff --git a/src/shared/pager.c b/src/shared/pager.c new file mode 100644 index 0000000..19deefa --- /dev/null +++ b/src/shared/pager.c @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-login.h" + +#include "copy.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include 
"locale-util.h" +#include "log.h" +#include "macro.h" +#include "pager.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +static pid_t pager_pid = 0; + +static int stored_stdout = -1; +static int stored_stderr = -1; +static bool stdout_redirected = false; +static bool stderr_redirected = false; + +_noreturn_ static void pager_fallback(void) { + int r; + + r = copy_bytes(STDIN_FILENO, STDOUT_FILENO, UINT64_MAX, 0); + if (r < 0) { + log_error_errno(r, "Internal pager failed: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); +} + +static int no_quit_on_interrupt(int exe_name_fd, const char *less_opts) { + _cleanup_fclose_ FILE *file = NULL; + _cleanup_free_ char *line = NULL; + int r; + + assert(exe_name_fd >= 0); + assert(less_opts); + + /* This takes ownership of exe_name_fd */ + file = fdopen(exe_name_fd, "r"); + if (!file) { + safe_close(exe_name_fd); + return log_error_errno(errno, "Failed to create FILE object: %m"); + } + + /* Find the last line */ + for (;;) { + _cleanup_free_ char *t = NULL; + + r = read_line(file, LONG_LINE_MAX, &t); + if (r < 0) + return log_error_errno(r, "Failed to read from socket: %m"); + if (r == 0) + break; + + free_and_replace(line, t); + } + + /* We only treat "less" specially. + * Return true whenever option K is *not* set. 
*/ + r = streq_ptr(line, "less") && !strchr(less_opts, 'K'); + + log_debug("Pager executable is \"%s\", options \"%s\", quit_on_interrupt: %s", + strnull(line), less_opts, yes_no(!r)); + return r; +} + +void pager_open(PagerFlags flags) { + _cleanup_close_pair_ int fd[2] = EBADF_PAIR, exe_name_pipe[2] = EBADF_PAIR; + _cleanup_strv_free_ char **pager_args = NULL; + _cleanup_free_ char *l = NULL; + const char *pager, *less_opts; + int r; + + if (flags & PAGER_DISABLE) + return; + + if (pager_pid > 0) + return; + + if (terminal_is_dumb()) + return; + + if (!is_main_thread()) + return (void) log_error_errno(SYNTHETIC_ERRNO(EPERM), "Pager invoked from wrong thread."); + + pager = getenv("SYSTEMD_PAGER"); + if (!pager) + pager = getenv("PAGER"); + + if (pager) { + pager_args = strv_split(pager, WHITESPACE); + if (!pager_args) + return (void) log_oom(); + + /* If the pager is explicitly turned off, honour it */ + if (strv_isempty(pager_args) || strv_equal(pager_args, STRV_MAKE("cat"))) + return; + } + + /* Determine and cache number of columns/lines before we spawn the pager so that we get the value from the + * actual tty */ + (void) columns(); + (void) lines(); + + if (pipe2(fd, O_CLOEXEC) < 0) + return (void) log_error_errno(errno, "Failed to create pager pipe: %m"); + + /* This is a pipe to feed the name of the executed pager binary into the parent */ + if (pipe2(exe_name_pipe, O_CLOEXEC) < 0) + return (void) log_error_errno(errno, "Failed to create exe_name pipe: %m"); + + /* Initialize a good set of less options */ + less_opts = getenv("SYSTEMD_LESS"); + if (!less_opts) + less_opts = "FRSXMK"; + if (flags & PAGER_JUMP_TO_END) { + l = strjoin(less_opts, " +G"); + if (!l) + return (void) log_oom(); + less_opts = l; + } + + /* We set SIGINT as PR_DEATHSIG signal here, to match the "K" parameter we set in $LESS, which enables SIGINT behaviour. 
*/ + r = safe_fork("(pager)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGINT|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pager_pid); + if (r < 0) + return; + if (r == 0) { + const char *less_charset; + + /* In the child start the pager */ + + if (dup2(fd[0], STDIN_FILENO) < 0) { + log_error_errno(errno, "Failed to duplicate file descriptor to STDIN: %m"); + _exit(EXIT_FAILURE); + } + + safe_close_pair(fd); + + if (setenv("LESS", less_opts, 1) < 0) { + log_error_errno(errno, "Failed to set environment variable LESS: %m"); + _exit(EXIT_FAILURE); + } + + /* Initialize a good charset for less. This is particularly important if we output UTF-8 + * characters. */ + less_charset = getenv("SYSTEMD_LESSCHARSET"); + if (!less_charset && is_locale_utf8()) + less_charset = "utf-8"; + if (less_charset && + setenv("LESSCHARSET", less_charset, 1) < 0) { + log_error_errno(errno, "Failed to set environment variable LESSCHARSET: %m"); + _exit(EXIT_FAILURE); + } + + /* People might invoke us from sudo, don't needlessly allow less to be a way to shell out + * privileged stuff. If the user set $SYSTEMD_PAGERSECURE, trust their configuration of the + * pager. If they didn't, use secure mode when under euid is changed. If $SYSTEMD_PAGERSECURE + * wasn't explicitly set, and we autodetect the need for secure mode, only use the pager we + * know to be good. */ + int use_secure_mode = getenv_bool_secure("SYSTEMD_PAGERSECURE"); + bool trust_pager = use_secure_mode >= 0; + if (use_secure_mode == -ENXIO) { + uid_t uid; + + r = sd_pid_get_owner_uid(0, &uid); + if (r < 0) + log_debug_errno(r, "sd_pid_get_owner_uid() failed, enabling pager secure mode: %m"); + + use_secure_mode = r < 0 || uid != geteuid(); + + } else if (use_secure_mode < 0) { + log_warning_errno(use_secure_mode, "Unable to parse $SYSTEMD_PAGERSECURE, assuming true: %m"); + use_secure_mode = true; + } + + /* We generally always set variables used by less, even if we end up using a different pager. 
+ * They shouldn't hurt in any case, and ideally other pagers would look at them too. */ + r = set_unset_env("LESSSECURE", use_secure_mode ? "1" : NULL, true); + if (r < 0) { + log_error_errno(r, "Failed to adjust environment variable LESSSECURE: %m"); + _exit(EXIT_FAILURE); + } + + if (trust_pager && pager_args) { /* The pager config might be set globally, and we cannot + * know if the user adjusted it to be appropriate for the + * secure mode. Thus, start the pager specified through + * envvars only when $SYSTEMD_PAGERSECURE was explicitly set + * as well. */ + r = loop_write(exe_name_pipe[1], pager_args[0], strlen(pager_args[0]) + 1); + if (r < 0) { + log_error_errno(r, "Failed to write pager name to socket: %m"); + _exit(EXIT_FAILURE); + } + + execvp(pager_args[0], pager_args); + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to execute '%s', using fallback pagers: %m", pager_args[0]); + } + + /* Debian's alternatives command for pagers is called 'pager'. Note that we do not call + * sensible-pagers here, since that is just a shell script that implements a logic that is + * similar to this one anyway, but is Debian-specific. */ + static const char* pagers[] = { "pager", "less", "more", "(built-in)" }; + + for (unsigned i = 0; i < ELEMENTSOF(pagers); i++) { + /* Only less (and our trivial fallback) implement secure mode right now. */ + if (use_secure_mode && !STR_IN_SET(pagers[i], "less", "(built-in)")) + continue; + + r = loop_write(exe_name_pipe[1], pagers[i], strlen(pagers[i]) + 1); + if (r < 0) { + log_error_errno(r, "Failed to write pager name to socket: %m"); + _exit(EXIT_FAILURE); + } + + if (i < ELEMENTSOF(pagers) - 1) { + execlp(pagers[i], pagers[i], NULL); + log_full_errno(errno == ENOENT ? 
LOG_DEBUG : LOG_WARNING, errno, + "Failed to execute '%s', will try '%s' next: %m", pagers[i], pagers[i+1]); + } else { + /* Close pipe to signal the parent to start sending data */ + safe_close_pair(exe_name_pipe); + pager_fallback(); + assert_not_reached(); + } + } + } + + /* Return in the parent */ + stored_stdout = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 3); + if (dup2(fd[1], STDOUT_FILENO) < 0) { + stored_stdout = safe_close(stored_stdout); + return (void) log_error_errno(errno, "Failed to duplicate pager pipe: %m"); + } + stdout_redirected = true; + + stored_stderr = fcntl(STDERR_FILENO, F_DUPFD_CLOEXEC, 3); + if (dup2(fd[1], STDERR_FILENO) < 0) { + stored_stderr = safe_close(stored_stderr); + return (void) log_error_errno(errno, "Failed to duplicate pager pipe: %m"); + } + stderr_redirected = true; + + exe_name_pipe[1] = safe_close(exe_name_pipe[1]); + + r = no_quit_on_interrupt(TAKE_FD(exe_name_pipe[0]), less_opts); + if (r > 0) + (void) ignore_signals(SIGINT); +} + +void pager_close(void) { + + if (pager_pid <= 0) + return; + + /* Inform pager that we are done */ + (void) fflush(stdout); + if (stdout_redirected) + if (stored_stdout < 0 || dup2(stored_stdout, STDOUT_FILENO) < 0) + (void) close(STDOUT_FILENO); + stored_stdout = safe_close(stored_stdout); + (void) fflush(stderr); + if (stderr_redirected) + if (stored_stderr < 0 || dup2(stored_stderr, STDERR_FILENO) < 0) + (void) close(STDERR_FILENO); + stored_stderr = safe_close(stored_stderr); + stdout_redirected = stderr_redirected = false; + + (void) kill(pager_pid, SIGCONT); + (void) wait_for_terminate(TAKE_PID(pager_pid), NULL); + pager_pid = 0; +} + +bool pager_have(void) { + return pager_pid > 0; +} + +int show_man_page(const char *desc, bool null_stdio) { /* Runs "man" for 'desc' in a forked child and returns the result of waiting for it. A trailing "(section)" in 'desc' is split off and passed to man as a separate section argument ahead of the page name; null_stdio adds FORK_REARRANGE_STDIO to the fork flags. */ + const char *args[4] = { "man", NULL, NULL, NULL }; + char *e = NULL; + pid_t pid; + size_t k; + int r; + + k = strlen(desc); + + if (k > 0 && desc[k-1] == ')') /* k > 0: an empty 'desc' must not make us read desc[-1] */ + e = strrchr(desc, '('); + + if (e) { + char *page = NULL, *section = NULL; + + page =
strndupa_safe(desc, e - desc); + section = strndupa_safe(e + 1, desc + k - e - 2); + + args[1] = section; + args[2] = page; + } else + args[1] = desc; + + r = safe_fork("(man)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|(null_stdio ? FORK_REARRANGE_STDIO : 0)|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + execvp(args[0], (char**) args); + log_error_errno(errno, "Failed to execute man: %m"); + _exit(EXIT_FAILURE); + } + + return wait_for_terminate_and_check(NULL, pid, 0); +} diff --git a/src/shared/pager.h b/src/shared/pager.h new file mode 100644 index 0000000..9a9d4c5 --- /dev/null +++ b/src/shared/pager.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +typedef enum PagerFlags { + PAGER_DISABLE = 1 << 0, + PAGER_JUMP_TO_END = 1 << 1, +} PagerFlags; + +void pager_open(PagerFlags flags); +void pager_close(void); +bool pager_have(void) _pure_; + +int show_man_page(const char *page, bool null_stdio); diff --git a/src/shared/pam-util.c b/src/shared/pam-util.c new file mode 100644 index 0000000..f5814ef --- /dev/null +++ b/src/shared/pam-util.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "bus-internal.h" +#include "errno-util.h" +#include "format-util.h" +#include "macro.h" +#include "pam-util.h" +#include "process-util.h" +#include "stdio-util.h" +#include "string-util.h" + +int pam_syslog_errno(pam_handle_t *handle, int level, int error, const char *format, ...) { /* Logs 'format' through pam_vsyslog() at priority 'level' and maps 'error' to a PAM return code: PAM_BUF_ERR for ENOMEM, PAM_SERVICE_ERR for everything else. */ + va_list ap; + + LOCAL_ERRNO(error); /* NOTE(review): presumably sets errno from 'error' so that %m in 'format' resolves - confirm against errno-util.h */ + + va_start(ap, format); + pam_vsyslog(handle, level, format, ap); /* honour the caller's level instead of a hard-coded LOG_ERR */ + va_end(ap); + + return IN_SET(error, ENOMEM, -ENOMEM) ? PAM_BUF_ERR : PAM_SERVICE_ERR; /* callers pass both ENOMEM (pam_log_oom()) and negative errno values, accept either sign */ +} + +int pam_syslog_pam_error(pam_handle_t *handle, int level, int error, const char *format, ...) { + /* This wraps pam_syslog() but will replace @PAMERR@ with a string from pam_strerror().
+ * @PAMERR@ must be at the very end. */ + + va_list ap; + va_start(ap, format); + + const char *p = endswith(format, "@PAMERR@"); + if (p) { + const char *pamerr = pam_strerror(handle, error); + if (strchr(pamerr, '%')) + pamerr = "n/a"; /* We cannot have any formatting chars */ + + char buf[p - format + strlen(pamerr) + 1]; + xsprintf(buf, "%.*s%s", (int)(p - format), format, pamerr); + + DISABLE_WARNING_FORMAT_NONLITERAL; + pam_vsyslog(handle, level, buf, ap); + REENABLE_WARNING; + } else + pam_vsyslog(handle, level, format, ap); + + va_end(ap); + + return error; +} + +/* A small structure we store inside the PAM session object, that allows us to reuse bus connections but pins + * it to the process thoroughly. */ +struct PamBusData { + sd_bus *bus; + pam_handle_t *pam_handle; + char *cache_id; +}; + +static PamBusData *pam_bus_data_free(PamBusData *d) { + /* The actual destructor */ + if (!d) + return NULL; + + /* NB: PAM sessions usually involve forking off a child process, and thus the PAM context might be + * duplicated in the child. This destructor might be called twice: both in the parent and in the + * child. sd_bus_flush_close_unref() however is smart enough to be a NOP when invoked in any other + * process than the one the connection was opened in, hence we don't need to add any extra protection here to + * ensure that destruction of the bus connection in the child does not affect the parent's connection + * somehow. */ + sd_bus_flush_close_unref(d->bus); + free(d->cache_id); + + /* Note: we don't destroy pam_handle here, because this object is pinned by the handle, and not vice versa! */ + + return mfree(d); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(PamBusData*, pam_bus_data_free); + +static void pam_bus_data_destroy(pam_handle_t *handle, void *data, int error_status) { + /* Destructor when called from PAM. Note that error_status is supposed to tell us via PAM_DATA_SILENT + * whether we are called in a forked off child of the PAM session or in the original parent.
We don't + * bother with that however, and instead rely on the PID checks that sd_bus_flush_close_unref() does + * internally anyway. That said, we still generate a warning message, since this really shouldn't + * happen. */ + + if (!data) + return; + + PamBusData *d = data; + if (FLAGS_SET(error_status, PAM_DATA_SILENT) && + d->bus && bus_origin_changed(d->bus)) + /* Please adjust test/units/end.sh when updating the log message. */ + pam_syslog(handle, LOG_DEBUG, "Attempted to close sd-bus after fork whose connection is opened before the fork, this should not happen."); + + pam_bus_data_free(data); +} + +static char* pam_make_bus_cache_id(const char *module_name) { + char *id; + + /* We want to cache bus connections between hooks. But we don't want to allow them to be reused in + * child processes (because sd-bus doesn't support that). We also don't want them to be reused + * between our own PAM modules, because they might be linked against different versions of our + * utility functions and share different state. Hence include both a module ID and a PID in the data + * field ID. 
*/ + + if (asprintf(&id, "system-bus-%s-" PID_FMT, ASSERT_PTR(module_name), getpid_cached()) < 0) + return NULL; + + return id; +} + +void pam_bus_data_disconnectp(PamBusData **_d) { + PamBusData *d = *ASSERT_PTR(_d); + pam_handle_t *handle; + int r; + + /* Disconnects the connection explicitly (for use via _cleanup_()) when called */ + + if (!d) + return; + + handle = ASSERT_PTR(d->pam_handle); /* Keep a reference to the session even after 'd' might be invalidated */ + + r = pam_set_data(handle, ASSERT_PTR(d->cache_id), NULL, NULL); + if (r != PAM_SUCCESS) + pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to release PAM user record data, ignoring: @PAMERR@"); + + /* Note, the pam_set_data() call will invalidate 'd', don't access here anymore */ +} + +int pam_acquire_bus_connection( + pam_handle_t *handle, + const char *module_name, + sd_bus **ret_bus, + PamBusData **ret_pam_bus_data) { + + _cleanup_(pam_bus_data_freep) PamBusData *d = NULL; + _cleanup_free_ char *cache_id = NULL; + int r; + + assert(handle); + assert(module_name); + assert(ret_bus); + + cache_id = pam_make_bus_cache_id(module_name); + if (!cache_id) + return pam_log_oom(handle); + + /* We cache the bus connection so that we can share it between the session and the authentication hooks */ + r = pam_get_data(handle, cache_id, (const void**) &d); + if (r == PAM_SUCCESS && d) + goto success; + if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA)) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get bus connection: @PAMERR@"); + + d = new(PamBusData, 1); + if (!d) + return pam_log_oom(handle); + + *d = (PamBusData) { + .cache_id = TAKE_PTR(cache_id), + .pam_handle = handle, + }; + + r = sd_bus_open_system(&d->bus); + if (r < 0) + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to connect to system bus: %m"); + + r = pam_set_data(handle, d->cache_id, d, pam_bus_data_destroy); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set PAM bus data: @PAMERR@"); + 
+success: + *ret_bus = sd_bus_ref(d->bus); + + if (ret_pam_bus_data) + *ret_pam_bus_data = d; + + TAKE_PTR(d); /* don't auto-destroy anymore, it's installed now */ + + return PAM_SUCCESS; +} + +int pam_release_bus_connection(pam_handle_t *handle, const char *module_name) { + _cleanup_free_ char *cache_id = NULL; + int r; + + assert(module_name); + + cache_id = pam_make_bus_cache_id(module_name); + if (!cache_id) + return pam_log_oom(handle); + + r = pam_set_data(handle, cache_id, NULL, NULL); + if (r != PAM_SUCCESS) + return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to release PAM user record data: @PAMERR@"); + + return PAM_SUCCESS; +} + +void pam_cleanup_free(pam_handle_t *handle, void *data, int error_status) { + /* A generic destructor for pam_set_data() that just frees the specified data */ + free(data); +} diff --git a/src/shared/pam-util.h b/src/shared/pam-util.h new file mode 100644 index 0000000..5a05fb7 --- /dev/null +++ b/src/shared/pam-util.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" + +int pam_syslog_errno(pam_handle_t *handle, int level, int error, const char *format, ...) _printf_(4,5); + +int pam_syslog_pam_error(pam_handle_t *handle, int level, int error, const char *format, ...) _printf_(4,5); + +/* Call pam_vsyslog if debug is enabled */ +#define pam_debug_syslog(handle, debug, fmt, ...) 
({ \ + if (debug) \ + pam_syslog(handle, LOG_DEBUG, fmt, ## __VA_ARGS__); \ + }) + +static inline int pam_log_oom(pam_handle_t *handle) { + /* This is like log_oom(), but uses PAM logging. The error is passed as negative errno-style value, like all other callers of pam_syslog_errno() do, so that it is mapped to PAM_BUF_ERR. */ + return pam_syslog_errno(handle, LOG_ERR, -ENOMEM, "Out of memory."); +} + +static inline int pam_bus_log_create_error(pam_handle_t *handle, int r) { + /* This is like bus_log_create_error(), but uses PAM logging */ + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to create bus message: %m"); +} + +static inline int pam_bus_log_parse_error(pam_handle_t *handle, int r) { + /* This is like bus_log_parse_error(), but uses PAM logging */ + return pam_syslog_errno(handle, LOG_ERR, r, "Failed to parse bus message: %m"); +} + +typedef struct PamBusData PamBusData; +void pam_bus_data_disconnectp(PamBusData **d); + +/* Use a different module name per different PAM module. They are all loaded in the same namespace, and this + * helps avoid a clash in the internal data structures of sd-bus. It will be used as key for cache items. */ +int pam_acquire_bus_connection(pam_handle_t *handle, const char *module_name, sd_bus **ret_bus, PamBusData **ret_bus_data); +int pam_release_bus_connection(pam_handle_t *handle, const char *module_name); + +void pam_cleanup_free(pam_handle_t *handle, void *data, int error_status); diff --git a/src/shared/parse-argument.c b/src/shared/parse-argument.c new file mode 100644 index 0000000..145bd11 --- /dev/null +++ b/src/shared/parse-argument.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "format-table.h" +#include "parse-argument.h" +#include "path-util.h" +#include "signal-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" + +/* All functions in this file emit warnings. */ + +int parse_boolean_argument(const char *optname, const char *s, bool *ret) { + int r; + + /* Returns the result through *ret and the return value.
*/ + + if (s) { + r = parse_boolean(s); + if (r < 0) + return log_error_errno(r, "Failed to parse boolean argument to %s: %s.", optname, s); + + if (ret) + *ret = r; + return r; + } else { + /* s may be NULL. This is controlled by getopt_long() parameters. */ + if (ret) + *ret = true; + return true; + } +} + +int parse_json_argument(const char *s, JsonFormatFlags *ret) { + assert(s); + assert(ret); + + if (streq(s, "pretty")) + *ret = JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR_AUTO; + else if (streq(s, "short")) + *ret = JSON_FORMAT_NEWLINE; + else if (streq(s, "off")) + *ret = JSON_FORMAT_OFF; + else if (streq(s, "help")) { + puts("pretty\n" + "short\n" + "off"); + return 0; /* 0 means → we showed a brief help, exit now */ + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown argument to --json= switch: %s", s); + + return 1; /* 1 means → properly parsed */ +} + +int parse_path_argument(const char *path, bool suppress_root, char **arg) { + char *p; + int r; + + /* + * This function is intended to be used in command line parsers, to handle paths that are passed + * in. It makes the path absolute, and reduces it to NULL if omitted or root (the latter optionally). + * + * NOTE THAT THIS WILL FREE THE PREVIOUS ARGUMENT POINTER ON SUCCESS! + * Hence, do not pass in uninitialized pointers. 
+ */ + + if (isempty(path)) { + *arg = mfree(*arg); + return 0; + } + + r = path_make_absolute_cwd(path, &p); + if (r < 0) + return log_error_errno(r, "Failed to parse path \"%s\" and make it absolute: %m", path); + + path_simplify(p); + if (suppress_root && empty_or_root(p)) + p = mfree(p); + + return free_and_replace(*arg, p); +} + +int parse_signal_argument(const char *s, int *ret) { + int r; + + assert(s); + assert(ret); + + if (streq(s, "help")) { + DUMP_STRING_TABLE(signal, int, _NSIG); + return 0; + } + + if (streq(s, "list")) { + _cleanup_(table_unrefp) Table *table = NULL; + + table = table_new("signal", "name"); + if (!table) + return log_oom(); + + for (int i = 1; i < _NSIG; i++) { + r = table_add_many( + table, + TABLE_INT, i, + TABLE_SIGNAL, i); + if (r < 0) + return table_log_add_error(r); + } + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; + } + + r = signal_from_string(s); + if (r < 0) + return log_error_errno(r, "Failed to parse signal string \"%s\".", s); + + *ret = r; + return 1; /* work to do */ +} diff --git a/src/shared/parse-argument.h b/src/shared/parse-argument.h new file mode 100644 index 0000000..adad65e --- /dev/null +++ b/src/shared/parse-argument.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "json.h" + +int parse_boolean_argument(const char *optname, const char *s, bool *ret); +int parse_json_argument(const char *s, JsonFormatFlags *ret); +int parse_path_argument(const char *path, bool suppress_root, char **arg); +int parse_signal_argument(const char *s, int *ret); diff --git a/src/shared/parse-helpers.c b/src/shared/parse-helpers.c new file mode 100644 index 0000000..9664b9c --- /dev/null +++ b/src/shared/parse-helpers.c @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "af-list.h" +#include "extract-word.h" +#include "ip-protocol-list.h" +#include "log.h" +#include "parse-helpers.h" +#include "parse-util.h" 
+#include "path-util.h" +#include "utf8.h" + +int path_simplify_and_warn( + char *path, + unsigned flag, + const char *unit, + const char *filename, + unsigned line, + const char *lvalue) { + + bool fatal = flag & PATH_CHECK_FATAL; + + assert(!FLAGS_SET(flag, PATH_CHECK_ABSOLUTE | PATH_CHECK_RELATIVE)); + + if (!utf8_is_valid(path)) + return log_syntax_invalid_utf8(unit, LOG_ERR, filename, line, path); + + if (flag & (PATH_CHECK_ABSOLUTE | PATH_CHECK_RELATIVE)) { + bool absolute; + + absolute = path_is_absolute(path); + + if (!absolute && (flag & PATH_CHECK_ABSOLUTE)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "%s= path is not absolute%s: %s", + lvalue, fatal ? "" : ", ignoring", path); + + if (absolute && (flag & PATH_CHECK_RELATIVE)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "%s= path is absolute%s: %s", + lvalue, fatal ? "" : ", ignoring", path); + } + + path_simplify_full(path, flag & PATH_KEEP_TRAILING_SLASH ? PATH_SIMPLIFY_KEEP_TRAILING_SLASH : 0); + + if (!path_is_valid(path)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "%s= path has invalid length (%zu bytes)%s.", + lvalue, strlen(path), fatal ? "" : ", ignoring"); + + if (!path_is_normalized(path)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "%s= path is not normalized%s: %s", + lvalue, fatal ? 
"" : ", ignoring", path); + + return 0; +} + +static int parse_af_token( + const char *token, + int *family, + int *ip_protocol, + uint16_t *nr_ports, + uint16_t *port_min) { + + int af; + + assert(token); + assert(family); + + af = af_from_ipv4_ipv6(token); + if (af == AF_UNSPEC) + return -EINVAL; + + *family = af; + return 0; +} + +static int parse_ip_protocol_token( + const char *token, + int *family, + int *ip_protocol, + uint16_t *nr_ports, + uint16_t *port_min) { + + int proto; + + assert(token); + assert(ip_protocol); + + proto = ip_protocol_from_tcp_udp(token); + if (proto < 0) + return -EINVAL; + + *ip_protocol = proto; + return 0; +} + +static int parse_ip_ports_token( + const char *token, + int *family, + int *ip_protocol, + uint16_t *nr_ports, + uint16_t *port_min) { + + assert(token); + assert(nr_ports); + assert(port_min); + + if (streq(token, "any")) + *nr_ports = *port_min = 0; + else { + uint16_t mn = 0, mx = 0; + int r = parse_ip_port_range(token, &mn, &mx); + if (r < 0) + return r; + + *nr_ports = mx - mn + 1; + *port_min = mn; + } + + return 0; +} + +typedef int (*parse_token_f)( + const char *, + int *, + int *, + uint16_t *, + uint16_t *); + +int parse_socket_bind_item( + const char *str, + int *address_family, + int *ip_protocol, + uint16_t *nr_ports, + uint16_t *port_min) { + + /* Order of token parsers is important. 
*/ + const parse_token_f parsers[] = { + &parse_af_token, + &parse_ip_protocol_token, + &parse_ip_ports_token, + }; + parse_token_f const *parser_ptr = parsers; + int af = AF_UNSPEC, proto = 0, r; + uint16_t nr = 0, mn = 0; + const char *p = ASSERT_PTR(str); + + assert(address_family); + assert(ip_protocol); + assert(nr_ports); + assert(port_min); + + if (isempty(p)) + return -EINVAL; + + for (;;) { + _cleanup_free_ char *token = NULL; + + r = extract_first_word(&p, &token, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r == 0) + break; + if (r < 0) + return r; + + if (isempty(token)) + return -EINVAL; + + while (parser_ptr != parsers + ELEMENTSOF(parsers)) { + r = (*parser_ptr)(token, &af, &proto, &nr, &mn); + if (r == -ENOMEM) + return r; + + ++parser_ptr; + /* Continue to next token if parsing succeeded, + * otherwise apply next parser to the same token. + */ + if (r >= 0) + break; + } + if (parser_ptr == parsers + ELEMENTSOF(parsers)) + break; + } + + /* Failed to parse a token. */ + if (r < 0) + return r; + + /* Parsers applied successfully, but end of the string not reached. */ + if (p) + return -EINVAL; + + *address_family = af; + *ip_protocol = proto; + *nr_ports = nr; + *port_min = mn; + return 0; +} + +int config_parse_path_or_ignore( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *n = NULL; + bool fatal = ltype; + char **s = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + goto finalize; + + n = strdup(rvalue); + if (!n) + return log_oom(); + + if (streq(n, "-")) + goto finalize; + + r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE | (fatal ? PATH_CHECK_FATAL : 0), unit, filename, line, lvalue); + if (r < 0) + return fatal ? 
-ENOEXEC : 0; + +finalize: + return free_and_replace(*s, n); +} diff --git a/src/shared/parse-helpers.h b/src/shared/parse-helpers.h new file mode 100644 index 0000000..3e4ad3c --- /dev/null +++ b/src/shared/parse-helpers.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdint.h> + +enum { + PATH_CHECK_FATAL = 1 << 0, /* If not set, then error message is appended with 'ignoring'. */ + PATH_CHECK_ABSOLUTE = 1 << 1, + PATH_CHECK_RELATIVE = 1 << 2, + PATH_KEEP_TRAILING_SLASH = 1 << 3, +}; + +int path_simplify_and_warn( + char *path, + unsigned flag, + const char *unit, + const char *filename, + unsigned line, + const char *lvalue); + +int parse_socket_bind_item( + const char *str, + int *address_family, + int *ip_protocol, + uint16_t *nr_ports, + uint16_t *port_min); + +int config_parse_path_or_ignore( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata); diff --git a/src/shared/password-quality-util-passwdqc.c b/src/shared/password-quality-util-passwdqc.c new file mode 100644 index 0000000..adfc14d --- /dev/null +++ b/src/shared/password-quality-util-passwdqc.c @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dlfcn-util.h" +#include "errno-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "password-quality-util.h" +#include "strv.h" + +#if HAVE_PASSWDQC + +static void *passwdqc_dl = NULL; + +void (*sym_passwdqc_params_reset)(passwdqc_params_t *params); +int (*sym_passwdqc_params_load)(passwdqc_params_t *params, char **reason, const char *pathname); +int (*sym_passwdqc_params_parse)(passwdqc_params_t *params, char **reason, int argc, const char *const *argv); +void (*sym_passwdqc_params_free)(passwdqc_params_t *params); +const char *(*sym_passwdqc_check)(const passwdqc_params_qc_t *params, const char *newpass, const char
*oldpass, const struct passwd *pw); +char *(*sym_passwdqc_random)(const passwdqc_params_qc_t *params); + +int dlopen_passwdqc(void) { + return dlopen_many_sym_or_warn( + &passwdqc_dl, "libpasswdqc.so.1", LOG_DEBUG, + DLSYM_ARG(passwdqc_params_reset), + DLSYM_ARG(passwdqc_params_load), + DLSYM_ARG(passwdqc_params_parse), + DLSYM_ARG(passwdqc_params_free), + DLSYM_ARG(passwdqc_check), + DLSYM_ARG(passwdqc_random)); +} + +static int pwqc_allocate_context(passwdqc_params_t **ret) { + + _cleanup_(sym_passwdqc_params_freep) passwdqc_params_t *params = NULL; + _cleanup_free_ char *load_reason = NULL; + int r; + + assert(ret); + + r = dlopen_passwdqc(); + if (r < 0) + return r; + + params = new0(passwdqc_params_t, 1); + if (!params) + return log_oom(); + + sym_passwdqc_params_reset(params); + + r = sym_passwdqc_params_load(params, &load_reason, "/etc/passwdqc.conf"); + if (r < 0) { + if (!load_reason) + return log_oom(); + log_debug("Failed to load passwdqc configuration file, ignoring: %s", load_reason); + } + + *ret = TAKE_PTR(params); + return 0; +} + +int suggest_passwords(void) { /* Prints N_SUGGESTIONS generated passwords: returns 1 after printing, 0 if pwqc_allocate_context() fails with an ERRNO_IS_NOT_SUPPORTED() code. */ + + _cleanup_(sym_passwdqc_params_freep) passwdqc_params_t *params = NULL; + _cleanup_strv_free_erase_ char **suggestions = NULL; + _cleanup_(erase_and_freep) char *joined = NULL; + int r; + + r = pwqc_allocate_context(&params); + if (r < 0) { + if (ERRNO_IS_NOT_SUPPORTED(r)) + return 0; + return log_error_errno(r, "Failed to allocate libpasswdqc context: %m"); + } + + suggestions = new0(char*, N_SUGGESTIONS+1); + if (!suggestions) + return log_oom(); + + for (size_t i = 0; i < N_SUGGESTIONS; i++) { + suggestions[i] = sym_passwdqc_random(&params->qc); + if (!suggestions[i]) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to generate password, ignoring"); + } + + joined = strv_join(suggestions, " "); + if (!joined) + return log_oom(); + + printf("Password suggestions: %s\n", joined); + return 1; +} + +int check_password_quality( + const char *password, + const char *old, + const char *username,
+ char **ret_error) { + + _cleanup_(sym_passwdqc_params_freep) passwdqc_params_t *params = NULL; + const char *check_reason; + int r; + + assert(password); + + r = pwqc_allocate_context(¶ms); + if (r < 0) + return log_debug_errno(r, "Failed to allocate libpasswdqc context: %m"); + + if (username) { + const struct passwd pw = { + .pw_name = (char *) username, + /* + * passwdqc_check() could use this information to check + * whether the password is based on the personal login information, + * but we cannot provide it. + */ + .pw_passwd = (char *) "", + .pw_gecos = (char *) "", + .pw_dir = (char *) "", + .pw_shell = (char *) "" + }; + + check_reason = sym_passwdqc_check(¶ms->qc, password, old, &pw); + } else + check_reason = sym_passwdqc_check(¶ms->qc, password, old, /* pw */ NULL); + + if (check_reason) { + if (ret_error) { + char *e = strdup(check_reason); + if (!e) + return log_oom(); + *ret_error = e; + } + + return 0; /* all bad */ + } + + return 1; /* all good */ +} + +#endif diff --git a/src/shared/password-quality-util-passwdqc.h b/src/shared/password-quality-util-passwdqc.h new file mode 100644 index 0000000..0d528d2 --- /dev/null +++ b/src/shared/password-quality-util-passwdqc.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +#if HAVE_PASSWDQC +#include + +extern void (*sym_passwdqc_params_reset)(passwdqc_params_t *params); +extern int (*sym_passwdqc_params_load)(passwdqc_params_t *params, char **reason, const char *pathname); +extern int (*sym_passwdqc_params_parse)(passwdqc_params_t *params, char **reason, int argc, const char *const *argv); +extern void (*sym_passwdqc_params_free)(passwdqc_params_t *params); +extern const char *(*sym_passwdqc_check)(const passwdqc_params_qc_t *params, const char *newpass, const char *oldpass, const struct passwd *pw); +extern char *(*sym_passwdqc_random)(const passwdqc_params_qc_t *params); + +int dlopen_passwdqc(void); + 
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(passwdqc_params_t*, sym_passwdqc_params_free, NULL); + +int suggest_passwords(void); +int check_password_quality(const char *password, const char *old, const char *username, char **ret_error); + +#endif diff --git a/src/shared/password-quality-util-pwquality.c b/src/shared/password-quality-util-pwquality.c new file mode 100644 index 0000000..80f7d58 --- /dev/null +++ b/src/shared/password-quality-util-pwquality.c @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "dlfcn-util.h" +#include "errno-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "password-quality-util.h" +#include "strv.h" + +#if HAVE_PWQUALITY + +static void *pwquality_dl = NULL; + +int (*sym_pwquality_check)(pwquality_settings_t *pwq, const char *password, const char *oldpassword, const char *user, void **auxerror); +pwquality_settings_t *(*sym_pwquality_default_settings)(void); +void (*sym_pwquality_free_settings)(pwquality_settings_t *pwq); +int (*sym_pwquality_generate)(pwquality_settings_t *pwq, int entropy_bits, char **password); +int (*sym_pwquality_get_str_value)(pwquality_settings_t *pwq, int setting, const char **value); +int (*sym_pwquality_read_config)(pwquality_settings_t *pwq, const char *cfgfile, void **auxerror); +int (*sym_pwquality_set_int_value)(pwquality_settings_t *pwq, int setting, int value); +const char* (*sym_pwquality_strerror)(char *buf, size_t len, int errcode, void *auxerror); + +int dlopen_pwquality(void) { + return dlopen_many_sym_or_warn( + &pwquality_dl, "libpwquality.so.1", LOG_DEBUG, + DLSYM_ARG(pwquality_check), + DLSYM_ARG(pwquality_default_settings), + DLSYM_ARG(pwquality_free_settings), + DLSYM_ARG(pwquality_generate), + DLSYM_ARG(pwquality_get_str_value), + DLSYM_ARG(pwquality_read_config), + DLSYM_ARG(pwquality_set_int_value), + DLSYM_ARG(pwquality_strerror)); +} + +static void pwq_maybe_disable_dictionary(pwquality_settings_t *pwq) { + char
buf[PWQ_MAX_ERROR_MESSAGE_LEN]; + const char *path; + int r; + + assert(pwq); + + r = sym_pwquality_get_str_value(pwq, PWQ_SETTING_DICT_PATH, &path); + if (r < 0) { + log_debug("Failed to read libpwquality dictionary path, ignoring: %s", + sym_pwquality_strerror(buf, sizeof(buf), r, NULL)); + return; + } + + if (isempty(path)) { + log_debug("Weird, no dictionary file configured, ignoring."); + return; + } + + if (access(path, F_OK) >= 0) + return; + + if (errno != ENOENT) { + log_debug_errno(errno, "Failed to check if dictionary file %s exists, ignoring: %m", path); + return; + } + + r = sym_pwquality_set_int_value(pwq, PWQ_SETTING_DICT_CHECK, 0); + if (r < 0) + log_debug("Failed to disable libpwquality dictionary check, ignoring: %s", + sym_pwquality_strerror(buf, sizeof(buf), r, NULL)); +} + +static int pwq_allocate_context(pwquality_settings_t **ret) { + _cleanup_(sym_pwquality_free_settingsp) pwquality_settings_t *pwq = NULL; + char buf[PWQ_MAX_ERROR_MESSAGE_LEN]; + void *auxerror; + int r; + + assert(ret); + + r = dlopen_pwquality(); + if (r < 0) + return r; + + pwq = sym_pwquality_default_settings(); + if (!pwq) + return -ENOMEM; + + r = sym_pwquality_read_config(pwq, NULL, &auxerror); + if (r < 0) + log_debug("Failed to read libpwquality configuration, ignoring: %s", + sym_pwquality_strerror(buf, sizeof(buf), r, auxerror)); + + pwq_maybe_disable_dictionary(pwq); + + *ret = TAKE_PTR(pwq); + return 0; +} + +int suggest_passwords(void) { + _cleanup_(sym_pwquality_free_settingsp) pwquality_settings_t *pwq = NULL; + _cleanup_strv_free_erase_ char **suggestions = NULL; + _cleanup_(erase_and_freep) char *joined = NULL; + char buf[PWQ_MAX_ERROR_MESSAGE_LEN]; + size_t i; + int r; + + r = pwq_allocate_context(&pwq); + if (r < 0) { + if (ERRNO_IS_NOT_SUPPORTED(r)) + return 0; + return log_error_errno(r, "Failed to allocate libpwquality context: %m"); + } + + suggestions = new0(char*, N_SUGGESTIONS+1); + if (!suggestions) + return log_oom(); + + for (i = 0; i < 
N_SUGGESTIONS; i++) { + r = sym_pwquality_generate(pwq, 64, suggestions + i); + if (r < 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to generate password, ignoring: %s", + sym_pwquality_strerror(buf, sizeof(buf), r, NULL)); + } + + joined = strv_join(suggestions, " "); + if (!joined) + return log_oom(); + + printf("Password suggestions: %s\n", joined); + return 1; +} + +int check_password_quality(const char *password, const char *old, const char *username, char **ret_error) { + _cleanup_(sym_pwquality_free_settingsp) pwquality_settings_t *pwq = NULL; + char buf[PWQ_MAX_ERROR_MESSAGE_LEN]; + void *auxerror; + int r; + + assert(password); + + r = pwq_allocate_context(&pwq); + if (r < 0) + return log_debug_errno(r, "Failed to allocate libpwquality context: %m"); + + r = sym_pwquality_check(pwq, password, old, username, &auxerror); + if (r < 0) { + if (ret_error) { + _cleanup_free_ char *e = NULL; + + e = strdup(sym_pwquality_strerror(buf, sizeof(buf), r, auxerror)); + if (!e) + return -ENOMEM; + + *ret_error = TAKE_PTR(e); + } + + return 0; /* all bad */ + } + + return 1; /* all good */ +} + +#endif diff --git a/src/shared/password-quality-util-pwquality.h b/src/shared/password-quality-util-pwquality.h new file mode 100644 index 0000000..a420b0d --- /dev/null +++ b/src/shared/password-quality-util-pwquality.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +#if HAVE_PWQUALITY +/* pwquality.h uses size_t but doesn't include sys/types.h on its own */ +#include <sys/types.h> +#include <pwquality.h> + +extern int (*sym_pwquality_check)(pwquality_settings_t *pwq, const char *password, const char *oldpassword, const char *user, void **auxerror); +extern pwquality_settings_t *(*sym_pwquality_default_settings)(void); +extern void (*sym_pwquality_free_settings)(pwquality_settings_t *pwq); +extern int (*sym_pwquality_generate)(pwquality_settings_t *pwq, int entropy_bits, char **password); +extern int
(*sym_pwquality_get_str_value)(pwquality_settings_t *pwq, int setting, const char **value); +extern int (*sym_pwquality_read_config)(pwquality_settings_t *pwq, const char *cfgfile, void **auxerror); +extern int (*sym_pwquality_set_int_value)(pwquality_settings_t *pwq, int setting, int value); +extern const char* (*sym_pwquality_strerror)(char *buf, size_t len, int errcode, void *auxerror); + +int dlopen_pwquality(void); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(pwquality_settings_t*, sym_pwquality_free_settings, NULL); + +int suggest_passwords(void); +int check_password_quality(const char *password, const char *old, const char *username, char **ret_error); + +#endif diff --git a/src/shared/password-quality-util.h b/src/shared/password-quality-util.h new file mode 100644 index 0000000..f838ba7 --- /dev/null +++ b/src/shared/password-quality-util.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#define N_SUGGESTIONS 6 + +#if HAVE_PASSWDQC + +#include "password-quality-util-passwdqc.h" + +#elif HAVE_PWQUALITY + +#include "password-quality-util-pwquality.h" + +#else + +static inline int suggest_passwords(void) { + return 0; +} + +static inline int check_password_quality( + const char *password, + const char *old, + const char *username, + char **ret_error) { + if (ret_error) + *ret_error = NULL; + return 1; /* all good */ +} + +#endif diff --git a/src/shared/pcre2-util.c b/src/shared/pcre2-util.c new file mode 100644 index 0000000..578b02d --- /dev/null +++ b/src/shared/pcre2-util.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "dlfcn-util.h" +#include "log.h" +#include "pcre2-util.h" + +#if HAVE_PCRE2 +static void *pcre2_dl = NULL; + +pcre2_match_data* (*sym_pcre2_match_data_create)(uint32_t, pcre2_general_context *); +void (*sym_pcre2_match_data_free)(pcre2_match_data *); +void (*sym_pcre2_code_free)(pcre2_code *); +pcre2_code* (*sym_pcre2_compile)(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, 
pcre2_compile_context *); +int (*sym_pcre2_get_error_message)(int, PCRE2_UCHAR *, PCRE2_SIZE); +int (*sym_pcre2_match)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *); +PCRE2_SIZE* (*sym_pcre2_get_ovector_pointer)(pcre2_match_data *); + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + pcre2_code_hash_ops_free, + pcre2_code, + (void (*)(const pcre2_code *, struct siphash*))trivial_hash_func, + (int (*)(const pcre2_code *, const pcre2_code*))trivial_compare_func, + sym_pcre2_code_free); +#else +const struct hash_ops pcre2_code_hash_ops_free = {}; +#endif + +int dlopen_pcre2(void) { +#if HAVE_PCRE2 + /* So here's something weird: PCRE2 actually renames the symbols exported by the library via C + * macros, so that the exported symbols carry a suffix "_8" but when used from C the suffix is + * gone. In the argument list below we ignore this mangling. Surprisingly (at least to me), we + * actually get away with that. That's because DLSYM_ARG() uses STRINGIFY() to generate a string + * version of the symbol name, and that resolves the macro mapping implicitly already, so that the + * string actually contains the "_8" suffix already due to that and we don't have to append it + * manually anymore. C is weird.
🤯 */ + + return dlopen_many_sym_or_warn( + &pcre2_dl, "libpcre2-8.so.0", LOG_ERR, + DLSYM_ARG(pcre2_match_data_create), + DLSYM_ARG(pcre2_match_data_free), + DLSYM_ARG(pcre2_code_free), + DLSYM_ARG(pcre2_compile), + DLSYM_ARG(pcre2_get_error_message), + DLSYM_ARG(pcre2_match), + DLSYM_ARG(pcre2_get_ovector_pointer)); +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "PCRE2 support is not compiled in."); +#endif +} + +int pattern_compile_and_log(const char *pattern, PatternCompileCase case_, pcre2_code **ret) { +#if HAVE_PCRE2 + PCRE2_SIZE erroroffset; + _cleanup_(sym_pcre2_code_freep) pcre2_code *p = NULL; + unsigned flags = 0; + int errorcode, r; + + assert(pattern); + + r = dlopen_pcre2(); + if (r < 0) + return r; + + if (case_ == PATTERN_COMPILE_CASE_INSENSITIVE) + flags = PCRE2_CASELESS; + else if (case_ == PATTERN_COMPILE_CASE_AUTO) { + _cleanup_(sym_pcre2_match_data_freep) pcre2_match_data *md = NULL; + bool has_case; + _cleanup_(sym_pcre2_code_freep) pcre2_code *cs = NULL; + + md = sym_pcre2_match_data_create(1, NULL); + if (!md) + return log_oom(); + + r = pattern_compile_and_log("[[:upper:]]", PATTERN_COMPILE_CASE_SENSITIVE, &cs); + if (r < 0) + return r; + + r = sym_pcre2_match(cs, (PCRE2_SPTR8) pattern, PCRE2_ZERO_TERMINATED, 0, 0, md, NULL); + has_case = r >= 0; + + flags = !has_case * PCRE2_CASELESS; + } + + log_debug("Doing case %s matching based on %s", + flags & PCRE2_CASELESS ? "insensitive" : "sensitive", + case_ != PATTERN_COMPILE_CASE_AUTO ? "request" : "pattern casing"); + + p = sym_pcre2_compile((PCRE2_SPTR8) pattern, + PCRE2_ZERO_TERMINATED, flags, &errorcode, &erroroffset, NULL); + if (!p) { + unsigned char buf[LINE_MAX]; + + r = sym_pcre2_get_error_message(errorcode, buf, sizeof buf); + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Bad pattern \"%s\": %s", pattern, + r < 0 ? 
"unknown error" : (char *)buf); + } + + if (ret) + *ret = TAKE_PTR(p); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "PCRE2 support is not compiled in."); +#endif +} + +int pattern_matches_and_log(pcre2_code *compiled_pattern, const char *message, size_t size, size_t *ret_ovec) { +#if HAVE_PCRE2 + _cleanup_(sym_pcre2_match_data_freep) pcre2_match_data *md = NULL; + int r; + + assert(compiled_pattern); + assert(message); + /* pattern_compile_and_log() must be called before this function is called and that function already + * dlopens pcre2 so we can assert on it being available here. */ + assert(pcre2_dl); + + md = sym_pcre2_match_data_create(1, NULL); + if (!md) + return log_oom(); + + r = sym_pcre2_match(compiled_pattern, + (const unsigned char *)message, + size, + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, + NULL); + if (r == PCRE2_ERROR_NOMATCH) + return false; + if (r < 0) { + unsigned char buf[LINE_MAX]; + + r = sym_pcre2_get_error_message(r, buf, sizeof(buf)); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Pattern matching failed: %s", + r < 0 ? 
"unknown error" : (char*) buf); + } + + if (ret_ovec) { + ret_ovec[0] = sym_pcre2_get_ovector_pointer(md)[0]; + ret_ovec[1] = sym_pcre2_get_ovector_pointer(md)[1]; + } + + return true; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "PCRE2 support is not compiled in."); +#endif +} + +void *pattern_free(pcre2_code *p) { +#if HAVE_PCRE2 + if (!p) + return NULL; + + assert(pcre2_dl); + sym_pcre2_code_free(p); + return NULL; +#else + assert(p == NULL); + return NULL; +#endif +} diff --git a/src/shared/pcre2-util.h b/src/shared/pcre2-util.h new file mode 100644 index 0000000..f1e744d --- /dev/null +++ b/src/shared/pcre2-util.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hash-funcs.h" +#include "macro.h" + +#if HAVE_PCRE2 + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> + +extern pcre2_match_data* (*sym_pcre2_match_data_create)(uint32_t, pcre2_general_context *); +extern void (*sym_pcre2_match_data_free)(pcre2_match_data *); +extern void (*sym_pcre2_code_free)(pcre2_code *); +extern pcre2_code* (*sym_pcre2_compile)(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, pcre2_compile_context *); +extern int (*sym_pcre2_get_error_message)(int, PCRE2_UCHAR *, PCRE2_SIZE); +extern int (*sym_pcre2_match)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *); +extern PCRE2_SIZE* (*sym_pcre2_get_ovector_pointer)(pcre2_match_data *); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(pcre2_match_data*, sym_pcre2_match_data_free, NULL); +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(pcre2_code*, sym_pcre2_code_free, NULL); +#else + +typedef struct {} pcre2_code; + +#endif + +extern const struct hash_ops pcre2_code_hash_ops_free; + +typedef enum { + PATTERN_COMPILE_CASE_AUTO, + PATTERN_COMPILE_CASE_SENSITIVE, + PATTERN_COMPILE_CASE_INSENSITIVE, + _PATTERN_COMPILE_CASE_MAX, + _PATTERN_COMPILE_CASE_INVALID = -EINVAL, +} PatternCompileCase; + +int pattern_compile_and_log(const char *pattern,
PatternCompileCase case_, pcre2_code **ret); +int pattern_matches_and_log(pcre2_code *compiled_pattern, const char *message, size_t size, size_t *ret_ovec); +void *pattern_free(pcre2_code *p); + +DEFINE_TRIVIAL_CLEANUP_FUNC(pcre2_code*, pattern_free); + +int dlopen_pcre2(void); diff --git a/src/shared/pcrextend-util.c b/src/shared/pcrextend-util.c new file mode 100644 index 0000000..fa066a4 --- /dev/null +++ b/src/shared/pcrextend-util.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-device.h" + +#include "blkid-util.h" +#include "blockdev-util.h" +#include "chase.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "mountpoint-util.h" +#include "pcrextend-util.h" +#include "strv.h" + +static int device_get_file_system_word( + sd_device *d, + const char *prefix, + char **ret) { + +#if HAVE_BLKID + int r; +#endif + + assert(d); + assert(prefix); + assert(ret); + +#if HAVE_BLKID + _cleanup_close_ int block_fd = sd_device_open(d, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (block_fd < 0) + return block_fd; + + _cleanup_(blkid_free_probep) blkid_probe b = blkid_new_probe(); + if (!b) + return -ENOMEM; + + errno = 0; + r = blkid_probe_set_device(b, block_fd, 0, 0); + if (r != 0) + return errno_or_else(ENOMEM); + + (void) blkid_probe_enable_superblocks(b, 1); + (void) blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE|BLKID_SUBLKS_UUID|BLKID_SUBLKS_LABEL); + (void) blkid_probe_enable_partitions(b, 1); + (void) blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS); + + errno = 0; + r = blkid_do_safeprobe(b); + if (r == _BLKID_SAFEPROBE_ERROR) + return errno_or_else(EIO); + if (IN_SET(r, _BLKID_SAFEPROBE_AMBIGUOUS, _BLKID_SAFEPROBE_NOT_FOUND)) + return -ENOPKG; + + assert(r == _BLKID_SAFEPROBE_FOUND); + + _cleanup_strv_free_ char **l = strv_new(prefix); + if (!l) + return -ENOMEM; + + FOREACH_STRING(field, "TYPE", "UUID", "LABEL", "PART_ENTRY_UUID", "PART_ENTRY_TYPE", "PART_ENTRY_NAME") { + const char 
*v = NULL; + + (void) blkid_probe_lookup_value(b, field, &v, NULL); + + _cleanup_free_ char *escaped = xescape(strempty(v), ":"); /* Avoid ambiguity around ":" */ + if (!escaped) + return -ENOMEM; + + r = strv_consume(&l, TAKE_PTR(escaped)); + if (r < 0) + return r; + } + + assert(strv_length(l) == 7); /* We always want 7 components, to avoid ambiguous strings */ + + _cleanup_free_ char *word = strv_join(l, ":"); + if (!word) + return -ENOMEM; + + *ret = TAKE_PTR(word); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int pcrextend_file_system_word(const char *path, char **ret_word, char **ret_normalized_path) { + _cleanup_free_ char *normalized_path = NULL, *normalized_escaped = NULL, *prefix = NULL, *word = NULL; + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + _cleanup_close_ int dfd = -EBADF; + int r; + + assert(path); + assert(ret_word); + + dfd = chase_and_open(path, NULL, 0, O_DIRECTORY|O_CLOEXEC, &normalized_path); + if (dfd < 0) + return log_error_errno(dfd, "Failed to open path '%s': %m", path); + + r = fd_is_mount_point(dfd, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to determine if path '%s' is mount point: %m", normalized_path); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "Specified path '%s' is not a mount point, refusing: %m", normalized_path); + + normalized_escaped = xescape(normalized_path, ":"); /* Avoid ambiguity around ":" */ + if (!normalized_escaped) + return log_oom(); + + prefix = strjoin("file-system:", normalized_escaped); + if (!prefix) + return log_oom(); + + r = block_device_new_from_fd(dfd, BLOCK_DEVICE_LOOKUP_BACKING, &d); + if (r < 0) { + log_notice_errno(r, "Unable to determine backing block device of '%s', using generic fallback file system identity string: %m", path); + + word = strjoin(prefix, "::::::"); + if (!word) + return log_oom(); + } else { + r = device_get_file_system_word(d, prefix, &word); + if (r < 0) + return log_error_errno(r, "Failed to get file system identifier 
string for '%s': %m", path); + } + + *ret_word = TAKE_PTR(word); + + if (ret_normalized_path) + *ret_normalized_path = TAKE_PTR(normalized_path); + + return 0; +} + +int pcrextend_machine_id_word(char **ret) { + _cleanup_free_ char *word = NULL; + sd_id128_t mid; + int r; + + assert(ret); + + r = sd_id128_get_machine(&mid); + if (r < 0) + return log_error_errno(r, "Failed to acquire machine ID: %m"); + + word = strjoin("machine-id:", SD_ID128_TO_STRING(mid)); + if (!word) + return log_oom(); + + *ret = TAKE_PTR(word); + return 0; +} diff --git a/src/shared/pcrextend-util.h b/src/shared/pcrextend-util.h new file mode 100644 index 0000000..7dd612b --- /dev/null +++ b/src/shared/pcrextend-util.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int pcrextend_file_system_word(const char *path, char **ret, char **ret_normalized_path); +int pcrextend_machine_id_word(char **ret); diff --git a/src/shared/pe-binary.c b/src/shared/pe-binary.c new file mode 100644 index 0000000..4c05323 --- /dev/null +++ b/src/shared/pe-binary.c @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "alloc-util.h" +#include "log.h" +#include "pe-binary.h" +#include "string-util.h" + +bool pe_header_is_64bit(const PeHeader *h) { + assert(h); + + if (le16toh(h->optional.Magic) == UINT16_C(0x010B)) /* PE32 */ + return false; + + if (le16toh(h->optional.Magic) == UINT16_C(0x020B)) /* PE32+ */ + return true; + + assert_not_reached(); +} + +static size_t pe_header_size(const PeHeader *pe_header) { + assert(pe_header); + + return offsetof(PeHeader, optional) + le16toh(pe_header->pe.SizeOfOptionalHeader); +} + +const IMAGE_DATA_DIRECTORY *pe_header_get_data_directory( + const PeHeader *h, + size_t i) { + + assert(h); + + if (i >= le32toh(PE_HEADER_OPTIONAL_FIELD(h, NumberOfRvaAndSizes))) + return NULL; + + return PE_HEADER_OPTIONAL_FIELD(h, DataDirectory) + i; +} + +const IMAGE_SECTION_HEADER *pe_header_find_section( + const
PeHeader *pe_header, + const IMAGE_SECTION_HEADER *sections, + const char *name) { + + size_t n; + + assert(pe_header); + assert(name); + assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0); + + n = strlen(name); + if (n > sizeof(sections[0].Name)) /* Too long? */ + return NULL; + + FOREACH_ARRAY(section, sections, le16toh(pe_header->pe.NumberOfSections)) + if (memcmp(section->Name, name, n) == 0 && + memeqzero(section->Name + n, sizeof(section->Name) - n)) + return section; + + return NULL; +} + +int pe_load_headers( + int fd, + IMAGE_DOS_HEADER **ret_dos_header, + PeHeader **ret_pe_header) { + + _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL; + _cleanup_free_ PeHeader *pe_header = NULL; + ssize_t n; + + assert(fd >= 0); + + dos_header = new(IMAGE_DOS_HEADER, 1); + if (!dos_header) + return log_oom_debug(); + + n = pread(fd, + dos_header, + sizeof(IMAGE_DOS_HEADER), + 0); + if (n < 0) + return log_debug_errno(errno, "Failed to read DOS header: %m"); + if ((size_t) n != sizeof(IMAGE_DOS_HEADER)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading MZ executable header."); + + if (le16toh(dos_header->e_magic) != UINT16_C(0x5A4D)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "File lacks MZ executable header."); + + pe_header = new(PeHeader, 1); + if (!pe_header) + return log_oom_debug(); + + n = pread(fd, + pe_header, + offsetof(PeHeader, optional), + le32toh(dos_header->e_lfanew)); + if (n < 0) + return log_debug_errno(errno, "Failed to read PE executable header: %m"); + if ((size_t) n != offsetof(PeHeader, optional)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading PE executable header."); + + if (le32toh(pe_header->signature) != UINT32_C(0x00004550)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "File lacks PE executable header."); + + if (le16toh(pe_header->pe.SizeOfOptionalHeader) < sizeof_field(PeHeader, optional.Magic)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Optional 
header size too short for magic."); + + PeHeader *pe_header_tmp = realloc(pe_header, MAX(sizeof(PeHeader), pe_header_size(pe_header))); + if (!pe_header_tmp) + return log_oom_debug(); + pe_header = pe_header_tmp; + + n = pread(fd, + &pe_header->optional, + le16toh(pe_header->pe.SizeOfOptionalHeader), + le32toh(dos_header->e_lfanew) + offsetof(PeHeader, optional)); + if (n < 0) + return log_debug_errno(errno, "Failed to read PE executable optional header: %m"); + if ((size_t) n != le16toh(pe_header->pe.SizeOfOptionalHeader)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading PE executable optional header."); + + if (!IN_SET(le16toh(pe_header->optional.Magic), UINT16_C(0x010B), UINT16_C(0x020B))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Optional header magic invalid."); + + if (pe_header_size(pe_header) != + PE_HEADER_OPTIONAL_FIELD_OFFSET(pe_header, DataDirectory) + + sizeof(IMAGE_DATA_DIRECTORY) * (uint64_t) le32toh(PE_HEADER_OPTIONAL_FIELD(pe_header, NumberOfRvaAndSizes))) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Optional header size mismatch."); + + if (ret_dos_header) + *ret_dos_header = TAKE_PTR(dos_header); + if (ret_pe_header) + *ret_pe_header = TAKE_PTR(pe_header); + + return 0; +} + +int pe_load_sections( + int fd, + const IMAGE_DOS_HEADER *dos_header, + const PeHeader *pe_header, + IMAGE_SECTION_HEADER **ret_sections) { + + _cleanup_free_ IMAGE_SECTION_HEADER *sections = NULL; + size_t nos; + ssize_t n; + + assert(fd >= 0); + assert(dos_header); + assert(pe_header); + + nos = le16toh(pe_header->pe.NumberOfSections); + + sections = new(IMAGE_SECTION_HEADER, nos); + if (!sections) + return log_oom_debug(); + + n = pread(fd, + sections, + sizeof(IMAGE_SECTION_HEADER) * nos, + le32toh(dos_header->e_lfanew) + pe_header_size(pe_header)); + if (n < 0) + return log_debug_errno(errno, "Failed to read section table: %m"); + if ((size_t) n != sizeof(IMAGE_SECTION_HEADER) * nos) + return 
log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading section table."); + + if (ret_sections) + *ret_sections = TAKE_PTR(sections); + + return 0; +} + +int pe_read_section_data( + int fd, + const PeHeader *pe_header, + const IMAGE_SECTION_HEADER *sections, + const char *name, + size_t max_size, + void **ret, + size_t *ret_size) { + + const IMAGE_SECTION_HEADER *section; + _cleanup_free_ void *data = NULL; + size_t n; + ssize_t ss; + + assert(fd >= 0); + assert(pe_header); + assert(sections || pe_header->pe.NumberOfSections == 0); + assert(name); + + section = pe_header_find_section(pe_header, sections, name); + if (!section) + return -ENXIO; + + n = le32toh(section->VirtualSize); + if (n > MIN(max_size, (size_t) SSIZE_MAX)) + return -E2BIG; + + data = malloc(n+1); + if (!data) + return -ENOMEM; + + ss = pread(fd, data, n, le32toh(section->PointerToRawData)); + if (ss < 0) + return -errno; + if ((size_t) ss != n) + return -EIO; + + ((uint8_t*) data)[n] = 0; /* NUL terminate, no matter what */ + + if (ret_size) + *ret_size = n; + else { + /* Check that there are no embedded NUL bytes if the caller doesn't want to know the size + * (i.e. 
treats the blob as a string) */ + const char *nul; + + nul = memchr(data, 0, n); + if (nul && !memeqzero(nul, n - (nul - (const char*) data))) /* If there's a NUL it must only be NULs from there on */ + return -EBADMSG; + } + if (ret) + *ret = TAKE_PTR(data); + + return 0; +} + +bool pe_is_uki(const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections) { + assert(pe_header); + assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0); + + if (le16toh(pe_header->optional.Subsystem) != IMAGE_SUBSYSTEM_EFI_APPLICATION) + return false; + + return + pe_header_find_section(pe_header, sections, ".osrel") && + pe_header_find_section(pe_header, sections, ".linux") && + pe_header_find_section(pe_header, sections, ".initrd"); +} diff --git a/src/shared/pe-binary.h b/src/shared/pe-binary.h new file mode 100644 index 0000000..2ef44d7 --- /dev/null +++ b/src/shared/pe-binary.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sparse-endian.h" + +/* When naming things we try to stay close to the official Windows APIs as per: + * → https://learn.microsoft.com/en-us/windows/win32/debug/pe-format */ + +typedef struct _packed_ _IMAGE_DOS_HEADER { + le16_t e_magic; + le16_t e_cblp; + le16_t e_cp; + le16_t e_crlc; + le16_t e_cparhdr; + le16_t e_minalloc; + le16_t e_maxalloc; + le16_t e_ss; + le16_t e_sp; + le16_t e_csum; + le16_t e_ip; + le16_t e_cs; + le16_t e_lfarlc; + le16_t e_ovno; + le16_t e_res[4]; + le16_t e_oemid; + le16_t e_oeminfo; + le16_t e_res2[10]; + le32_t e_lfanew; +} IMAGE_DOS_HEADER; + +typedef struct _packed_ _IMAGE_FILE_HEADER { + le16_t Machine; + le16_t NumberOfSections; + le32_t TimeDateStamp; + le32_t PointerToSymbolTable; + le32_t NumberOfSymbols; + le16_t SizeOfOptionalHeader; + le16_t Characteristics; +} IMAGE_FILE_HEADER; + +typedef struct _packed_ _IMAGE_DATA_DIRECTORY { + le32_t VirtualAddress; + le32_t Size; +} IMAGE_DATA_DIRECTORY; + +typedef struct _packed_ _IMAGE_OPTIONAL_HEADER { + 
/* Standard fields */ + le16_t Magic; + uint8_t MajorLinkerVersion; + uint8_t MinorLinkerVersion; + le32_t SizeOfCode; + le32_t SizeOfInitializedData; + le32_t SizeOfUninitializedData; + le32_t AddressOfEntryPoint; + le32_t BaseOfCode; + + /* Here the PE32 and PE32+ headers differ: PE32+ has one 64bit field (ImageBase), PE32 has two 32bit fields (BaseOfData and ImageBase) */ + union { + struct { + le32_t BaseOfData; + le32_t pe32_ImageBase; + }; + le64_t pe32plus_ImageBase; + }; + + /* Additional fields */ + le32_t SectionAlignment; + le32_t FileAlignment; + le16_t MajorOperatingSystemVersion; + le16_t MinorOperatingSystemVersion; + le16_t MajorImageVersion; + le16_t MinorImageVersion; + le16_t MajorSubsystemVersion; + le16_t MinorSubsystemVersion; + le32_t Win32VersionValue; + le32_t SizeOfImage; + le32_t SizeOfHeaders; + le32_t CheckSum; + le16_t Subsystem; + le16_t DllCharacteristics; + + /* Here similar: on PE32+ some fields are 64bit that are 32bit on PE32. As a consequence the NumberOfRvaAndSizes and DataDirectory[] members sit at different offsets in the two variants. */ + union { + struct { + le32_t pe32_SizeOfStackReserve; + le32_t pe32_SizeOfStackCommit; + le32_t pe32_SizeOfHeapReserve; + le32_t pe32_SizeOfHeapCommit; + le32_t pe32_LoaderFlags; + le32_t pe32_NumberOfRvaAndSizes; + IMAGE_DATA_DIRECTORY pe32_DataDirectory[]; + }; + struct { + le64_t pe32plus_SizeOfStackReserve; + le64_t pe32plus_SizeOfStackCommit; + le64_t pe32plus_SizeOfHeapReserve; + le64_t pe32plus_SizeOfHeapCommit; + le32_t pe32plus_LoaderFlags; + le32_t pe32plus_NumberOfRvaAndSizes; + IMAGE_DATA_DIRECTORY pe32plus_DataDirectory[]; + }; + }; +} IMAGE_OPTIONAL_HEADER; + +typedef struct _packed_ PeHeader { + le32_t signature; + IMAGE_FILE_HEADER pe; + IMAGE_OPTIONAL_HEADER optional; +} PeHeader; + +typedef struct _packed_ _IMAGE_SECTION_HEADER { + uint8_t Name[8]; + le32_t VirtualSize; + le32_t VirtualAddress; + le32_t SizeOfRawData; + le32_t PointerToRawData; + le32_t PointerToRelocations; + le32_t PointerToLinenumbers; + le16_t NumberOfRelocations; + le16_t NumberOfLinenumbers; + le32_t Characteristics; +} IMAGE_SECTION_HEADER; + 
+#define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 + +bool pe_header_is_64bit(const PeHeader *h); + +#define PE_HEADER_OPTIONAL_FIELD(h, field) \ + (pe_header_is_64bit(h) ? (h)->optional.pe32plus_##field : (h)->optional.pe32_##field) + +#define PE_HEADER_OPTIONAL_FIELD_OFFSET(h, field) \ + (pe_header_is_64bit(h) ? offsetof(PeHeader, optional.pe32plus_##field) : offsetof(PeHeader, optional.pe32_##field)) + +const IMAGE_DATA_DIRECTORY *pe_header_get_data_directory(const PeHeader *h, size_t i); +const IMAGE_SECTION_HEADER *pe_header_find_section(const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections, const char *name); + +int pe_load_headers(int fd, IMAGE_DOS_HEADER **ret_dos_header, PeHeader **ret_pe_header); + +int pe_load_sections(int fd, const IMAGE_DOS_HEADER *dos_header, const PeHeader *pe_header, IMAGE_SECTION_HEADER **ret_sections); +int pe_read_section_data(int fd, const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections, const char *name, size_t max_size, void **ret, size_t *ret_size); + +bool pe_is_uki(const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections); diff --git a/src/shared/pkcs11-util.c b/src/shared/pkcs11-util.c new file mode 100644 index 0000000..6e88dc3 --- /dev/null +++ b/src/shared/pkcs11-util.c @@ -0,0 +1,1371 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "ask-password-api.h" +#include "dlfcn-util.h" +#include "env-util.h" +#include "escape.h" +#include "fd-util.h" +#include "format-table.h" +#include "io-util.h" +#include "memory-util.h" +#if HAVE_OPENSSL +#include "openssl-util.h" +#endif +#include "pkcs11-util.h" +#include "random-util.h" +#include "string-util.h" +#include "strv.h" + +bool pkcs11_uri_valid(const char *uri) { + const char *p; + + /* A very superficial checker for RFC7512 PKCS#11 URI syntax */ + + if (isempty(uri)) + return false; + + p = startswith(uri, "pkcs11:"); + if (!p) + return false; + + if (isempty(p)) + return false; + + if (!in_charset(p, ALPHANUMERICAL 
".~/-_?;&%=")) + return false; + + return true; +} + +#if HAVE_P11KIT + +static void *p11kit_dl = NULL; + +char *(*sym_p11_kit_module_get_name)(CK_FUNCTION_LIST *module); +void (*sym_p11_kit_modules_finalize_and_release)(CK_FUNCTION_LIST **modules); +CK_FUNCTION_LIST **(*sym_p11_kit_modules_load_and_initialize)(int flags); +const char *(*sym_p11_kit_strerror)(CK_RV rv); +int (*sym_p11_kit_uri_format)(P11KitUri *uri, P11KitUriType uri_type, char **string); +void (*sym_p11_kit_uri_free)(P11KitUri *uri); +CK_ATTRIBUTE_PTR (*sym_p11_kit_uri_get_attributes)(P11KitUri *uri, CK_ULONG *n_attrs); +CK_INFO_PTR (*sym_p11_kit_uri_get_module_info)(P11KitUri *uri); +CK_SLOT_INFO_PTR (*sym_p11_kit_uri_get_slot_info)(P11KitUri *uri); +CK_TOKEN_INFO_PTR (*sym_p11_kit_uri_get_token_info)(P11KitUri *uri); +int (*sym_p11_kit_uri_match_token_info)(const P11KitUri *uri, const CK_TOKEN_INFO *token_info); +const char *(*sym_p11_kit_uri_message)(int code); +P11KitUri *(*sym_p11_kit_uri_new)(void); +int (*sym_p11_kit_uri_parse)(const char *string, P11KitUriType uri_type, P11KitUri *uri); + +int dlopen_p11kit(void) { + return dlopen_many_sym_or_warn( + &p11kit_dl, + "libp11-kit.so.0", LOG_DEBUG, + DLSYM_ARG(p11_kit_module_get_name), + DLSYM_ARG(p11_kit_modules_finalize_and_release), + DLSYM_ARG(p11_kit_modules_load_and_initialize), + DLSYM_ARG(p11_kit_strerror), + DLSYM_ARG(p11_kit_uri_format), + DLSYM_ARG(p11_kit_uri_free), + DLSYM_ARG(p11_kit_uri_get_attributes), + DLSYM_ARG(p11_kit_uri_get_module_info), + DLSYM_ARG(p11_kit_uri_get_slot_info), + DLSYM_ARG(p11_kit_uri_get_token_info), + DLSYM_ARG(p11_kit_uri_match_token_info), + DLSYM_ARG(p11_kit_uri_message), + DLSYM_ARG(p11_kit_uri_new), + DLSYM_ARG(p11_kit_uri_parse)); +} + +int uri_from_string(const char *p, P11KitUri **ret) { + _cleanup_(sym_p11_kit_uri_freep) P11KitUri *uri = NULL; + int r; + + assert(p); + assert(ret); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + uri = sym_p11_kit_uri_new(); + if (!uri) + return -ENOMEM; + + 
if (sym_p11_kit_uri_parse(p, P11_KIT_URI_FOR_ANY, uri) != P11_KIT_URI_OK) + return -EINVAL; + + *ret = TAKE_PTR(uri); + return 0; +} + +P11KitUri *uri_from_module_info(const CK_INFO *info) { + P11KitUri *uri; + + assert(info); + + if (dlopen_p11kit() < 0) + return NULL; + + uri = sym_p11_kit_uri_new(); + if (!uri) + return NULL; + + *sym_p11_kit_uri_get_module_info(uri) = *info; + return uri; +} + +P11KitUri *uri_from_slot_info(const CK_SLOT_INFO *slot_info) { + P11KitUri *uri; + + assert(slot_info); + + if (dlopen_p11kit() < 0) + return NULL; + + uri = sym_p11_kit_uri_new(); + if (!uri) + return NULL; + + *sym_p11_kit_uri_get_slot_info(uri) = *slot_info; + return uri; +} + +P11KitUri *uri_from_token_info(const CK_TOKEN_INFO *token_info) { + P11KitUri *uri; + + assert(token_info); + + if (dlopen_p11kit() < 0) + return NULL; + + uri = sym_p11_kit_uri_new(); + if (!uri) + return NULL; + + *sym_p11_kit_uri_get_token_info(uri) = *token_info; + return uri; +} + +CK_RV pkcs11_get_slot_list_malloc( + CK_FUNCTION_LIST *m, + CK_SLOT_ID **ret_slotids, + CK_ULONG *ret_n_slotids) { + + CK_RV rv; + + assert(m); + assert(ret_slotids); + assert(ret_n_slotids); + + for (unsigned tries = 0; tries < 16; tries++) { + _cleanup_free_ CK_SLOT_ID *slotids = NULL; + CK_ULONG n_slotids = 0; + + rv = m->C_GetSlotList(0, NULL, &n_slotids); + if (rv != CKR_OK) + return rv; + if (n_slotids == 0) { + *ret_slotids = NULL; + *ret_n_slotids = 0; + return CKR_OK; + } + + slotids = new(CK_SLOT_ID, n_slotids); + if (!slotids) + return CKR_HOST_MEMORY; + + rv = m->C_GetSlotList(0, slotids, &n_slotids); + if (rv == CKR_OK) { + *ret_slotids = TAKE_PTR(slotids); + *ret_n_slotids = n_slotids; + return CKR_OK; + } + + if (rv != CKR_BUFFER_TOO_SMALL) + return rv; + + /* Hu? Maybe somebody plugged something in and things changed? 
Let's try again */ + } + + return CKR_BUFFER_TOO_SMALL; +} + +char *pkcs11_token_label(const CK_TOKEN_INFO *token_info) { + char *t; + + /* The label is not NUL terminated and likely padded with spaces, let's make a copy here, so that we + * can strip that. */ + t = strndup((char*) token_info->label, sizeof(token_info->label)); + if (!t) + return NULL; + + strstrip(t); + return t; +} + +char *pkcs11_token_manufacturer_id(const CK_TOKEN_INFO *token_info) { + char *t; + + t = strndup((char*) token_info->manufacturerID, sizeof(token_info->manufacturerID)); + if (!t) + return NULL; + + strstrip(t); + return t; +} + +char *pkcs11_token_model(const CK_TOKEN_INFO *token_info) { + char *t; + + t = strndup((char*) token_info->model, sizeof(token_info->model)); + if (!t) + return NULL; + + strstrip(t); + return t; +} + +int pkcs11_token_login_by_pin( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session, + const CK_TOKEN_INFO *token_info, + const char *token_label, + const void *pin, + size_t pin_size) { + + CK_RV rv; + int r; + + assert(m); + assert(token_info); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + if (FLAGS_SET(token_info->flags, CKF_PROTECTED_AUTHENTICATION_PATH)) { + rv = m->C_Login(session, CKU_USER, NULL, 0); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to log into security token '%s': %s", token_label, sym_p11_kit_strerror(rv)); + + log_info("Successfully logged into security token '%s' via protected authentication path.", token_label); + return 0; + } + + if (!FLAGS_SET(token_info->flags, CKF_LOGIN_REQUIRED)) { + log_info("No login into security token '%s' required.", token_label); + return 0; + } + + if (!pin) + return -ENOANO; + + rv = m->C_Login(session, CKU_USER, (CK_UTF8CHAR*) pin, pin_size); + if (rv == CKR_OK) { + log_info("Successfully logged into security token '%s'.", token_label); + return 0; + } + + if (rv == CKR_PIN_LOCKED) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "PIN has been locked, please reset 
PIN of security token '%s'.", token_label); + if (!IN_SET(rv, CKR_PIN_INCORRECT, CKR_PIN_LEN_RANGE)) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to log into security token '%s': %s", token_label, sym_p11_kit_strerror(rv)); + + return log_notice_errno(SYNTHETIC_ERRNO(ENOLCK), + "PIN for token '%s' is incorrect, please try again.", + token_label); +} + +int pkcs11_token_login( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session, + CK_SLOT_ID slotid, + const CK_TOKEN_INFO *token_info, + const char *friendly_name, + const char *icon_name, + const char *key_name, + const char *credential_name, + usec_t until, + AskPasswordFlags ask_password_flags, + bool headless, + char **ret_used_pin) { + + _cleanup_free_ char *token_uri_string = NULL, *token_uri_escaped = NULL, *id = NULL, *token_label = NULL; + _cleanup_(sym_p11_kit_uri_freep) P11KitUri *token_uri = NULL; + CK_TOKEN_INFO updated_token_info; + int uri_result, r; + CK_RV rv; + + assert(m); + assert(token_info); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + token_label = pkcs11_token_label(token_info); + if (!token_label) + return log_oom(); + + token_uri = uri_from_token_info(token_info); + if (!token_uri) + return log_oom(); + + uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, &token_uri_string); + if (uri_result != P11_KIT_URI_OK) + return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN), "Failed to format slot URI: %s", sym_p11_kit_uri_message(uri_result)); + + r = pkcs11_token_login_by_pin(m, session, token_info, token_label, /* pin= */ NULL, 0); + if (r == 0 && ret_used_pin) + *ret_used_pin = NULL; + + if (r != -ENOANO) /* pin required */ + return r; + + token_uri_escaped = cescape(token_uri_string); + if (!token_uri_escaped) + return log_oom(); + + id = strjoin("pkcs11:", token_uri_escaped); + if (!id) + return log_oom(); + + for (unsigned tries = 0; tries < 3; tries++) { + _cleanup_strv_free_erase_ char **passwords = NULL; + _cleanup_(erase_and_freep) char *envpin = NULL; + + r = 
getenv_steal_erase("PIN", &envpin); + if (r < 0) + return log_error_errno(r, "Failed to acquire PIN from environment: %m"); + if (r > 0) { + passwords = strv_new(envpin); + if (!passwords) + return log_oom(); + + } else if (headless) + return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "PIN querying disabled via 'headless' option. Use the 'PIN' environment variable."); + else { + _cleanup_free_ char *text = NULL; + + if (FLAGS_SET(token_info->flags, CKF_USER_PIN_FINAL_TRY)) + r = asprintf(&text, + "Please enter correct PIN for security token '%s' in order to unlock %s (final try):", + token_label, friendly_name); + else if (FLAGS_SET(token_info->flags, CKF_USER_PIN_COUNT_LOW)) + r = asprintf(&text, + "PIN has been entered incorrectly previously, please enter correct PIN for security token '%s' in order to unlock %s:", + token_label, friendly_name); + else if (tries == 0) + r = asprintf(&text, + "Please enter PIN for security token '%s' in order to unlock %s:", + token_label, friendly_name); + else + r = asprintf(&text, + "Please enter PIN for security token '%s' in order to unlock %s (try #%u):", + token_label, friendly_name, tries+1); + if (r < 0) + return log_oom(); + + /* We never cache PINs, simply because it's fatal if we use wrong PINs, since usually there are only 3 tries */ + r = ask_password_auto(text, icon_name, id, key_name, credential_name, until, ask_password_flags, &passwords); + if (r < 0) + return log_error_errno(r, "Failed to query PIN for security token '%s': %m", token_label); + } + + STRV_FOREACH(i, passwords) { + r = pkcs11_token_login_by_pin(m, session, token_info, token_label, *i, strlen(*i)); + if (r == 0 && ret_used_pin) { + char *c; + + c = strdup(*i); + if (!c) + return log_oom(); + + *ret_used_pin = c; + } + + if (r != -ENOLCK) + return r; + + /* Refresh the token info, so that we can prompt knowing the new flags if they changed. 
*/ + rv = m->C_GetTokenInfo(slotid, &updated_token_info); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to acquire updated security token information for slot %lu: %s", + slotid, sym_p11_kit_strerror(rv)); + + token_info = &updated_token_info; + } + } + + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Too many attempts to log into token '%s'.", token_label); +} + +int pkcs11_token_find_x509_certificate( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session, + P11KitUri *search_uri, + CK_OBJECT_HANDLE *ret_object) { + + bool found_class = false, found_certificate_type = false; + _cleanup_free_ CK_ATTRIBUTE *attributes_buffer = NULL; + CK_ULONG n_attributes, a, n_objects; + CK_ATTRIBUTE *attributes = NULL; + CK_OBJECT_HANDLE objects[2]; + CK_RV rv, rv2; + int r; + + assert(m); + assert(search_uri); + assert(ret_object); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + attributes = sym_p11_kit_uri_get_attributes(search_uri, &n_attributes); + for (a = 0; a < n_attributes; a++) { + + /* We use the URI's included match attributes, but make them more strict. This allows users + * to specify a token URL instead of an object URL and the right thing should happen if + * there's only one suitable key on the token. 
*/ + + switch (attributes[a].type) { + + case CKA_CLASS: { + CK_OBJECT_CLASS c; + + if (attributes[a].ulValueLen != sizeof(c)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_CLASS attribute size."); + + memcpy(&c, attributes[a].pValue, sizeof(c)); + if (c != CKO_CERTIFICATE) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected PKCS#11 object is not an X.509 certificate, refusing."); + + found_class = true; + break; + } + + case CKA_CERTIFICATE_TYPE: { + CK_CERTIFICATE_TYPE t; + + if (attributes[a].ulValueLen != sizeof(t)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_CERTIFICATE_TYPE attribute size."); + + memcpy(&t, attributes[a].pValue, sizeof(t)); + if (t != CKC_X_509) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected PKCS#11 object is not an X.509 certificate, refusing."); + + found_certificate_type = true; + break; + }} + } + + if (!found_class || !found_certificate_type) { + /* Hmm, let's slightly extend the attribute list we search for */ + + attributes_buffer = new(CK_ATTRIBUTE, n_attributes + !found_class + !found_certificate_type); + if (!attributes_buffer) + return log_oom(); + + memcpy(attributes_buffer, attributes, sizeof(CK_ATTRIBUTE) * n_attributes); + + if (!found_class) { + static const CK_OBJECT_CLASS class = CKO_CERTIFICATE; + + attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) { + .type = CKA_CLASS, + .pValue = (CK_OBJECT_CLASS*) &class, + .ulValueLen = sizeof(class), + }; + } + + if (!found_certificate_type) { + static const CK_CERTIFICATE_TYPE type = CKC_X_509; + + attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) { + .type = CKA_CERTIFICATE_TYPE, + .pValue = (CK_CERTIFICATE_TYPE*) &type, + .ulValueLen = sizeof(type), + }; + } + + attributes = attributes_buffer; + } + + rv = m->C_FindObjectsInit(session, attributes, n_attributes); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to initialize object find call: %s", sym_p11_kit_strerror(rv)); + + rv 
= m->C_FindObjects(session, objects, ELEMENTSOF(objects), &n_objects); + rv2 = m->C_FindObjectsFinal(session); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to find objects: %s", sym_p11_kit_strerror(rv)); + /* C_FindObjectsFinal() failed: report its own return value (rv2), since rv is CKR_OK at this point */ + if (rv2 != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to finalize object find call: %s", sym_p11_kit_strerror(rv2)); + if (n_objects == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Failed to find selected X509 certificate on token."); + if (n_objects > 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), + "Configured URI matches multiple certificates, refusing."); + + *ret_object = objects[0]; + return 0; +} + +#if HAVE_OPENSSL +int pkcs11_token_read_x509_certificate( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session, + CK_OBJECT_HANDLE object, + X509 **ret_cert) { + + _cleanup_free_ void *buffer = NULL; + _cleanup_free_ char *t = NULL; + CK_ATTRIBUTE attribute = { + .type = CKA_VALUE + }; + CK_RV rv; + _cleanup_(X509_freep) X509 *x509 = NULL; + X509_NAME *name = NULL; + const unsigned char *p; + int r; + + r = dlopen_p11kit(); + if (r < 0) + return r; + + rv = m->C_GetAttributeValue(session, object, &attribute, 1); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to read X.509 certificate size off token: %s", sym_p11_kit_strerror(rv)); + + buffer = malloc(attribute.ulValueLen); + if (!buffer) + return log_oom(); + + attribute.pValue = buffer; + + rv = m->C_GetAttributeValue(session, object, &attribute, 1); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to read X.509 certificate data off token: %s", sym_p11_kit_strerror(rv)); + + p = attribute.pValue; + x509 = d2i_X509(NULL, &p, attribute.ulValueLen); + if (!x509) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed parse X.509 certificate."); + + name = X509_get_subject_name(x509); + if (!name) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to acquire X.509 subject 
name."); + + t = X509_NAME_oneline(name, NULL, 0); + if (!t) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to format X.509 subject name as string."); + + log_debug("Using X.509 certificate issued for '%s'.", t); + + *ret_cert = TAKE_PTR(x509); + return 0; +} +#endif + +int pkcs11_token_find_private_key( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session, + P11KitUri *search_uri, + CK_OBJECT_HANDLE *ret_object) { + + bool found_decrypt = false, found_class = false, found_key_type = false; + _cleanup_free_ CK_ATTRIBUTE *attributes_buffer = NULL; + CK_ULONG n_attributes, a, n_objects; + CK_ATTRIBUTE *attributes = NULL; + CK_OBJECT_HANDLE objects[2]; + CK_RV rv, rv2; + int r; + + assert(m); + assert(search_uri); + assert(ret_object); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + attributes = sym_p11_kit_uri_get_attributes(search_uri, &n_attributes); + for (a = 0; a < n_attributes; a++) { + + /* We use the URI's included match attributes, but make them more strict. This allows users + * to specify a token URL instead of an object URL and the right thing should happen if + * there's only one suitable key on the token. 
*/ + + switch (attributes[a].type) { + + case CKA_CLASS: { + CK_OBJECT_CLASS c; + + if (attributes[a].ulValueLen != sizeof(c)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_CLASS attribute size."); + + memcpy(&c, attributes[a].pValue, sizeof(c)); + if (c != CKO_PRIVATE_KEY) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Selected PKCS#11 object is not a private key, refusing."); + + found_class = true; + break; + } + + case CKA_DECRYPT: { + CK_BBOOL b; + + if (attributes[a].ulValueLen != sizeof(b)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_DECRYPT attribute size."); + + memcpy(&b, attributes[a].pValue, sizeof(b)); + if (!b) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Selected PKCS#11 object is not suitable for decryption, refusing."); + + found_decrypt = true; + break; + } + + case CKA_KEY_TYPE: { + CK_KEY_TYPE t; + + if (attributes[a].ulValueLen != sizeof(t)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_KEY_TYPE attribute size."); + + memcpy(&t, attributes[a].pValue, sizeof(t)); + if (t != CKK_RSA) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected PKCS#11 object is not an RSA key, refusing."); + + found_key_type = true; + break; + }} + } + + if (!found_decrypt || !found_class || !found_key_type) { + /* Hmm, let's slightly extend the attribute list we search for */ + + attributes_buffer = new(CK_ATTRIBUTE, n_attributes + !found_decrypt + !found_class + !found_key_type); + if (!attributes_buffer) + return log_oom(); + + memcpy(attributes_buffer, attributes, sizeof(CK_ATTRIBUTE) * n_attributes); + + if (!found_decrypt) { + static const CK_BBOOL yes = true; + + attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) { + .type = CKA_DECRYPT, + .pValue = (CK_BBOOL*) &yes, + .ulValueLen = sizeof(yes), + }; + } + + if (!found_class) { + static const CK_OBJECT_CLASS class = CKO_PRIVATE_KEY; + + attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) { + .type = CKA_CLASS, + 
.pValue = (CK_OBJECT_CLASS*) &class, + .ulValueLen = sizeof(class), + }; + } + + if (!found_key_type) { + static const CK_KEY_TYPE type = CKK_RSA; + + attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) { + .type = CKA_KEY_TYPE, + .pValue = (CK_KEY_TYPE*) &type, + .ulValueLen = sizeof(type), + }; + } + + attributes = attributes_buffer; + } + + rv = m->C_FindObjectsInit(session, attributes, n_attributes); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to initialize object find call: %s", sym_p11_kit_strerror(rv)); + + rv = m->C_FindObjects(session, objects, ELEMENTSOF(objects), &n_objects); + rv2 = m->C_FindObjectsFinal(session); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to find objects: %s", sym_p11_kit_strerror(rv)); + /* C_FindObjectsFinal() failed: report its own return value (rv2), since rv is CKR_OK at this point */ + if (rv2 != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to finalize object find call: %s", sym_p11_kit_strerror(rv2)); + if (n_objects == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "Failed to find selected private key suitable for decryption on token."); + if (n_objects > 1) + return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), + "Configured private key URI matches multiple keys, refusing."); + + *ret_object = objects[0]; + return 0; +} + +int pkcs11_token_decrypt_data( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session, + CK_OBJECT_HANDLE object, + const void *encrypted_data, + size_t encrypted_data_size, + void **ret_decrypted_data, + size_t *ret_decrypted_data_size) { + + static const CK_MECHANISM mechanism = { + .mechanism = CKM_RSA_PKCS + }; + _cleanup_(erase_and_freep) CK_BYTE *dbuffer = NULL; + CK_ULONG dbuffer_size = 0; + CK_RV rv; + int r; + + assert(m); + assert(encrypted_data); + assert(encrypted_data_size > 0); + assert(ret_decrypted_data); + assert(ret_decrypted_data_size); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + rv = m->C_DecryptInit(session, (CK_MECHANISM*) &mechanism, object); + if (rv != CKR_OK) + return 
log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to initialize decryption on security token: %s", sym_p11_kit_strerror(rv)); + + dbuffer_size = encrypted_data_size; /* Start with something reasonable */ + dbuffer = malloc(dbuffer_size); + if (!dbuffer) + return log_oom(); + + rv = m->C_Decrypt(session, (CK_BYTE*) encrypted_data, encrypted_data_size, dbuffer, &dbuffer_size); + if (rv == CKR_BUFFER_TOO_SMALL) { + erase_and_free(dbuffer); + + dbuffer = malloc(dbuffer_size); + if (!dbuffer) + return log_oom(); + + rv = m->C_Decrypt(session, (CK_BYTE*) encrypted_data, encrypted_data_size, dbuffer, &dbuffer_size); + } + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to decrypt key on security token: %s", sym_p11_kit_strerror(rv)); + + log_info("Successfully decrypted key with security token."); + + *ret_decrypted_data = TAKE_PTR(dbuffer); + *ret_decrypted_data_size = dbuffer_size; + return 0; +} + +int pkcs11_token_acquire_rng( + CK_FUNCTION_LIST *m, + CK_SESSION_HANDLE session) { + + _cleanup_free_ void *buffer = NULL; + size_t rps; + CK_RV rv; + int r; + + assert(m); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + /* While we are at it, let's read some RNG data from the PKCS#11 token and pass it to the kernel + * random pool. This should be cheap if we are talking to the device already. Note that we don't + * credit any entropy, since we don't know about the quality of the pkcs#11 token's RNG. Why bother + * at all? There are two sides to the argument whether to generate private keys on tokens or on the + * host. By crediting some data from the token RNG to the host's pool we at least can say that any + * key generated from it is at least as good as both sources individually. 
*/ + + rps = random_pool_size(); + + buffer = malloc(rps); + if (!buffer) + return log_oom(); + + rv = m->C_GenerateRandom(session, buffer, rps); + if (rv != CKR_OK) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Failed to generate RNG data on security token: %s", sym_p11_kit_strerror(rv)); + + r = random_write_entropy(-1, buffer, rps, false); + if (r < 0) + return log_debug_errno(r, "Failed to write PKCS#11 acquired random data to /dev/urandom: %m"); + + log_debug("Successfully written %zu bytes random data acquired via PKCS#11 to kernel random pool.", rps); + + return 0; +} + +static int token_process( + CK_FUNCTION_LIST *m, + CK_SLOT_ID slotid, + const CK_SLOT_INFO *slot_info, + const CK_TOKEN_INFO *token_info, + P11KitUri *search_uri, + pkcs11_find_token_callback_t callback, + void *userdata) { + + _cleanup_free_ char *token_label = NULL; + CK_SESSION_HANDLE session; + CK_RV rv; + int r; + + assert(m); + assert(slot_info); + assert(token_info); + + token_label = pkcs11_token_label(token_info); + if (!token_label) + return log_oom(); + + rv = m->C_OpenSession(slotid, CKF_SERIAL_SESSION, NULL, NULL, &session); + if (rv != CKR_OK) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to create session for security token '%s': %s", token_label, sym_p11_kit_strerror(rv)); + + if (callback) + r = callback(m, session, slotid, slot_info, token_info, search_uri, userdata); + else + r = 1; /* if not callback was specified, just say we found what we were looking for */ + + rv = m->C_CloseSession(session); + if (rv != CKR_OK) + log_warning("Failed to close session on PKCS#11 token, ignoring: %s", sym_p11_kit_strerror(rv)); + + return r; +} + +static int slot_process( + CK_FUNCTION_LIST *m, + CK_SLOT_ID slotid, + P11KitUri *search_uri, + pkcs11_find_token_callback_t callback, + void *userdata) { + + _cleanup_(sym_p11_kit_uri_freep) P11KitUri* slot_uri = NULL, *token_uri = NULL; + _cleanup_free_ char *token_uri_string = NULL; + CK_TOKEN_INFO token_info; + 
CK_SLOT_INFO slot_info; + int uri_result, r; + CK_RV rv; + + assert(m); + + r = dlopen_p11kit(); + if (r < 0) + return r; + + /* We return -EAGAIN for all failures we can attribute to a specific slot in some way, so that the + * caller might try other slots before giving up. */ + + rv = m->C_GetSlotInfo(slotid, &slot_info); + if (rv != CKR_OK) { + log_warning("Failed to acquire slot info for slot %lu, ignoring slot: %s", slotid, sym_p11_kit_strerror(rv)); + return -EAGAIN; + } + + slot_uri = uri_from_slot_info(&slot_info); + if (!slot_uri) + return log_oom(); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *slot_uri_string = NULL; + + uri_result = sym_p11_kit_uri_format(slot_uri, P11_KIT_URI_FOR_ANY, &slot_uri_string); + if (uri_result != P11_KIT_URI_OK) { + log_warning("Failed to format slot URI, ignoring slot: %s", sym_p11_kit_uri_message(uri_result)); + return -EAGAIN; + } + + log_debug("Found slot with URI %s", slot_uri_string); + } + + rv = m->C_GetTokenInfo(slotid, &token_info); + if (rv == CKR_TOKEN_NOT_PRESENT) { + return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), + "Token not present in slot, ignoring."); + } else if (rv != CKR_OK) { + log_warning("Failed to acquire token info for slot %lu, ignoring slot: %s", slotid, sym_p11_kit_strerror(rv)); + return -EAGAIN; + } + + token_uri = uri_from_token_info(&token_info); + if (!token_uri) + return log_oom(); + + uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, &token_uri_string); + if (uri_result != P11_KIT_URI_OK) { + log_warning("Failed to format slot URI: %s", sym_p11_kit_uri_message(uri_result)); + return -EAGAIN; + } + + if (search_uri && !sym_p11_kit_uri_match_token_info(search_uri, &token_info)) + return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), + "Found non-matching token with URI %s.", + token_uri_string); + + log_debug("Found matching token with URI %s.", token_uri_string); + + return token_process( + m, + slotid, + &slot_info, + &token_info, + search_uri, + callback, + userdata); +} + 
+/* Processes a single PKCS#11 module: logs its name and URI, enumerates its slots and hands each slot to
+ * slot_process(). Returns the first slot_process() result that is not -EAGAIN. Returns -EAGAIN if the
+ * module has to be skipped (info/URI/slot list unavailable, no slots) or if every slot returned -EAGAIN. */
+static int module_process(
+                CK_FUNCTION_LIST *m,
+                P11KitUri *search_uri,
+                pkcs11_find_token_callback_t callback,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri* module_uri = NULL;
+        _cleanup_free_ char *name = NULL, *module_uri_string = NULL;
+        _cleanup_free_ CK_SLOT_ID *slotids = NULL;
+        CK_ULONG n_slotids = 0;
+        int uri_result;
+        CK_INFO info;
+        size_t k;
+        CK_RV rv;
+        int r;
+
+        assert(m);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* We ignore most errors from modules here, in order to skip over faulty modules: one faulty module
+         * should not have the effect that we don't try the others anymore. We indicate such per-module
+         * failures with -EAGAIN, which lets the caller try the next module. */
+
+        name = sym_p11_kit_module_get_name(m);
+        if (!name)
+                return log_oom();
+
+        log_debug("Trying PKCS#11 module %s.", name);
+
+        rv = m->C_GetInfo(&info);
+        if (rv != CKR_OK) {
+                log_warning("Failed to get info on PKCS#11 module, ignoring module: %s", sym_p11_kit_strerror(rv));
+                return -EAGAIN;
+        }
+
+        module_uri = uri_from_module_info(&info);
+        if (!module_uri)
+                return log_oom();
+
+        uri_result = sym_p11_kit_uri_format(module_uri, P11_KIT_URI_FOR_ANY, &module_uri_string);
+        if (uri_result != P11_KIT_URI_OK) {
+                log_warning("Failed to format module URI, ignoring module: %s", sym_p11_kit_uri_message(uri_result));
+                return -EAGAIN;
+        }
+
+        log_debug("Found module with URI %s", module_uri_string);
+
+        rv = pkcs11_get_slot_list_malloc(m, &slotids, &n_slotids);
+        if (rv != CKR_OK) {
+                log_warning("Failed to get slot list, ignoring module: %s", sym_p11_kit_strerror(rv));
+                return -EAGAIN;
+        }
+        if (n_slotids == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                       "This module has no slots? Ignoring module.");
+
+        /* Anything other than -EAGAIN (success or hard error) ends the enumeration right away. */
+        for (k = 0; k < n_slotids; k++) {
+                r = slot_process(
+                                m,
+                                slotids[k],
+                                search_uri,
+                                callback,
+                                userdata);
+                if (r != -EAGAIN)
+                        return r;
+        }
+
+        return -EAGAIN;
+}
+
+/* Loads and initializes the PKCS#11 modules via p11-kit and runs module_process() on each of them in turn.
+ * If pkcs11_uri is non-NULL it is parsed first and passed down as search URI; a parse failure is returned
+ * as error. Returns the first module_process() result that is not -EAGAIN, or -EAGAIN if there is none. */
+int pkcs11_find_token(
+                const char *pkcs11_uri,
+                pkcs11_find_token_callback_t callback,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_modules_finalize_and_releasep) CK_FUNCTION_LIST **modules = NULL;
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *search_uri = NULL;
+        int r;
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* Execute the specified callback for each matching token found. If nothing is found returns
+         * -EAGAIN. Logs about all errors, except for EAGAIN, which the caller has to log about. */
+
+        if (pkcs11_uri) {
+                r = uri_from_string(pkcs11_uri, &search_uri);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse PKCS#11 URI '%s': %m", pkcs11_uri);
+        }
+
+        modules = sym_p11_kit_modules_load_and_initialize(0);
+        if (!modules)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to initialize pkcs11 modules");
+
+        /* The module list is NULL-terminated */
+        for (CK_FUNCTION_LIST **i = modules; *i; i++) {
+                r = module_process(
+                                *i,
+                                search_uri,
+                                callback,
+                                userdata);
+                if (r != -EAGAIN)
+                        return r;
+        }
+
+        return -EAGAIN;
+}
+
+#if HAVE_OPENSSL
+/* State shared between pkcs11_acquire_certificate() and pkcs11_acquire_certificate_callback(): the askpw_*
+ * and headless fields are inputs for the login, pin_used and cert are filled in by the callback. */
+struct pkcs11_acquire_certificate_callback_data {
+        char *pin_used;
+        X509 *cert;
+        const char *askpw_friendly_name, *askpw_icon_name;
+        AskPasswordFlags askpw_flags;
+        bool headless;
+};
+
+/* Cleanup handler for the structure above: erases and frees the PIN and frees the certificate. */
+static void pkcs11_acquire_certificate_callback_data_release(struct pkcs11_acquire_certificate_callback_data *data) {
+        erase_and_free(data->pin_used);
+        X509_free(data->cert);
+}
+
+/* pkcs11_find_token() callback: logs into the token, looks up the X.509 certificate matching the URI and
+ * reads it into data->cert. Returns 1 on success (which stops the enumeration), negative errno on failure. */
+static int pkcs11_acquire_certificate_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        _cleanup_(erase_and_freep) char *pin_used = NULL;
+        struct pkcs11_acquire_certificate_callback_data *data = ASSERT_PTR(userdata);
+        CK_OBJECT_HANDLE object;
+        int r;
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+        assert(uri);
+
+        /* Called for every token matching our URI */
+
+        r = pkcs11_token_login(
+                        m,
+                        session,
+                        slot_id,
+                        token_info,
+                        data->askpw_friendly_name,
+                        data->askpw_icon_name,
+                        "pkcs11-pin",
+                        "pkcs11-pin",
+                        UINT64_MAX,
+                        data->askpw_flags,
+                        data->headless,
+                        &pin_used);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_find_x509_certificate(m, session, uri, &object);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_read_x509_certificate(m, session, object, &data->cert);
+        if (r < 0)
+                return r;
+
+        /* Let's read some random data off the token and write it to the kernel pool before we generate our
+         * random key from it. This way we can claim the quality of the RNG is at least as good as the
+         * kernel's and the token's pool */
+        (void) pkcs11_token_acquire_rng(m, session);
+
+        /* Hand the PIN over only now, so that it is erased by the cleanup handler on all error paths above */
+        data->pin_used = TAKE_PTR(pin_used);
+        return 1;
+}
+
+/* Finds the token matching 'uri', logs into it and returns its X.509 certificate in *ret_cert. If
+ * ret_pin_used is non-NULL the PIN used for the login is returned there as well. Ownership of both passes
+ * to the caller. Returns 0 on success, -ENXIO if no matching token was found, other negative errno on
+ * failure. */
+int pkcs11_acquire_certificate(
+                const char *uri,
+                const char *askpw_friendly_name,
+                const char *askpw_icon_name,
+                X509 **ret_cert,
+                char **ret_pin_used) {
+
+        _cleanup_(pkcs11_acquire_certificate_callback_data_release) struct pkcs11_acquire_certificate_callback_data data = {
+                .askpw_friendly_name = askpw_friendly_name,
+                .askpw_icon_name = askpw_icon_name,
+        };
+        int r;
+
+        assert(uri);
+        assert(ret_cert);
+
+        r = pkcs11_find_token(uri, pkcs11_acquire_certificate_callback, &data);
+        if (r == -EAGAIN) /* pkcs11_find_token() doesn't log about this error, but all others */
+                return log_error_errno(SYNTHETIC_ERRNO(ENXIO),
+                                       "Specified PKCS#11 token with URI '%s' not found.",
+                                       uri);
+        if (r < 0)
+                return r;
+
+        *ret_cert = TAKE_PTR(data.cert);
+
+        if (ret_pin_used)
+                *ret_pin_used = TAKE_PTR(data.pin_used);
+
+        return 0;
+}
+#endif
+
+/* pkcs11_find_token() callback used by pkcs11_list_tokens(): adds one row (URI, label, manufacturer, model)
+ * to the Table passed as userdata for every hardware slot that has a token present. Returns -EAGAIN even
+ * on success, so that the enumeration continues with the next token. */
+static int list_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        _cleanup_free_ char *token_uri_string = NULL, *token_label = NULL, *token_manufacturer_id = NULL, *token_model = NULL;
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *token_uri = NULL;
+        Table *t = userdata;
+        int uri_result, r;
+
+        assert(slot_info);
+        assert(token_info);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* We only care about hardware devices here with a token inserted. Let's filter everything else
+         * out. (Note that the user can explicitly specify non-hardware tokens if they like, but during
+         * enumeration we'll filter those, since software tokens are typically the system certificate store
+         * and such, and it's typically not what people want to bind their home directories to.) */
+        if (!FLAGS_SET(slot_info->flags, CKF_HW_SLOT|CKF_TOKEN_PRESENT))
+                return -EAGAIN;
+
+        token_label = pkcs11_token_label(token_info);
+        if (!token_label)
+                return log_oom();
+
+        token_manufacturer_id = pkcs11_token_manufacturer_id(token_info);
+        if (!token_manufacturer_id)
+                return log_oom();
+
+        token_model = pkcs11_token_model(token_info);
+        if (!token_model)
+                return log_oom();
+
+        token_uri = uri_from_token_info(token_info);
+        if (!token_uri)
+                return log_oom();
+
+        uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, &token_uri_string);
+        if (uri_result != P11_KIT_URI_OK)
+                return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN), "Failed to format token URI: %s", sym_p11_kit_uri_message(uri_result));
+
+        r = table_add_many(
+                        t,
+                        TABLE_STRING, token_uri_string,
+                        TABLE_STRING, token_label,
+                        TABLE_STRING, token_manufacturer_id,
+                        TABLE_STRING, token_model);
+        if (r < 0)
+                return table_log_add_error(r);
+
+        return -EAGAIN; /* keep scanning */
+}
+#endif
+
+/* Prints a table of the PKCS#11 tokens accepted by list_callback() to stdout, or logs a message if there
+ * are none. Returns 0 in both cases, negative errno on failure, and -EOPNOTSUPP if built without p11-kit. */
+int pkcs11_list_tokens(void) {
+#if HAVE_P11KIT
+        _cleanup_(table_unrefp) Table *t = NULL;
+        int r;
+
+        t = table_new("uri", "label", "manufacturer", "model");
+        if (!t)
+                return log_oom();
+
+        /* list_callback() always asks for more, hence -EAGAIN is the regular result here */
+        r = pkcs11_find_token(NULL, list_callback, t);
+        if (r < 0 && r != -EAGAIN)
+                return r;
+
+        if (table_get_rows(t) <= 1) {
+                log_info("No suitable PKCS#11 tokens found.");
+                return 0;
+        }
+
+        r = table_print(t, stdout);
+        if (r < 0)
+                return log_error_errno(r, "Failed to show device table: %m");
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 tokens not supported on this build.");
+#endif
+}
+
+#if HAVE_P11KIT
+/* pkcs11_find_token() callback used by pkcs11_find_token_auto(): formats the URI of the first hardware
+ * token found into the string pointer passed as userdata. Returns 0 once a URI was stored, -EAGAIN to skip
+ * the slot, -ENOTUNIQ if a URI is already stored. */
+static int auto_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *token_uri = NULL;
+        char **t = userdata;
+        int uri_result, r;
+
+        assert(slot_info);
+        assert(token_info);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* CKF_HW_SLOT and CKF_TOKEN_PRESENT are flags of the slot, not of the token, hence check them on
+         * slot_info, the same way list_callback() does. */
+        if (!FLAGS_SET(slot_info->flags, CKF_HW_SLOT|CKF_TOKEN_PRESENT))
+                return -EAGAIN;
+
+        /* NOTE(review): returning 0 below ends the enumeration in module_process()/pkcs11_find_token(), so
+         * this check presumably only triggers if the caller passed in an already set string -- confirm. */
+        if (*t)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+                                       "More than one suitable PKCS#11 token found.");
+
+        token_uri = uri_from_token_info(token_info);
+        if (!token_uri)
+                return log_oom();
+
+        uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, t);
+        if (uri_result != P11_KIT_URI_OK)
+                return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN), "Failed to format token URI: %s", sym_p11_kit_uri_message(uri_result));
+
+        return 0;
+}
+#endif
+
+/* Picks a PKCS#11 hardware token automatically and returns its URI as newly formatted string in *ret.
+ * Returns 0 on success, -ENODEV if no suitable token was found, other negative errno on failure, and
+ * -EOPNOTSUPP if built without p11-kit. */
+int pkcs11_find_token_auto(char **ret) {
+#if HAVE_P11KIT
+        int r;
+
+        r = pkcs11_find_token(NULL, auto_callback, ret);
+        if (r == -EAGAIN)
+                return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "No suitable PKCS#11 tokens found.");
+        if (r < 0)
+                return r;
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 tokens not supported on this build.");
+#endif
+}
+
+#if HAVE_P11KIT
+/* Releases the key material referenced by the callback data: the decrypted key is erased before it is
+ * freed, the encrypted key is freed only if free_encrypted_key says we own it. */
+void pkcs11_crypt_device_callback_data_release(pkcs11_crypt_device_callback_data *data) {
+        erase_and_free(data->decrypted_key);
+
+        if (data->free_encrypted_key)
+                free(data->encrypted_key);
+}
+
+/* pkcs11_find_token() callback: logs into the token, looks up the private key matching the URI and uses it
+ * to decrypt data->encrypted_key into data->decrypted_key. Returns 0 on success, negative errno on
+ * failure. */
+int pkcs11_crypt_device_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        pkcs11_crypt_device_callback_data *data = ASSERT_PTR(userdata);
+        CK_OBJECT_HANDLE object;
+        int r;
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+        assert(uri);
+
+        /* Called for every token matching our URI */
+
+        r = pkcs11_token_login(
+                        m,
+                        session,
+                        slot_id,
+                        token_info,
+                        data->friendly_name,
+                        "drive-harddisk",
+                        "pkcs11-pin",
+                        "cryptsetup.pkcs11-pin",
+                        data->until,
+                        data->askpw_flags,
+                        data->headless,
+                        NULL);
+        if (r < 0)
+                return r;
+
+        /* We are likely called during early boot, where entropy is scarce. Mix some data from the PKCS#11
+         * token, if it supports that. It should be cheap, given that we already are talking to it anyway and
+         * shouldn't hurt. */
+        (void) pkcs11_token_acquire_rng(m, session);
+
+        r = pkcs11_token_find_private_key(m, session, uri, &object);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_decrypt_data(
+                        m,
+                        session,
+                        object,
+                        data->encrypted_key,
+                        data->encrypted_key_size,
+                        &data->decrypted_key,
+                        &data->decrypted_key_size);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+#endif
diff --git a/src/shared/pkcs11-util.h b/src/shared/pkcs11-util.h
new file mode 100644
index 0000000..5bc23c1
--- /dev/null
+++ b/src/shared/pkcs11-util.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#if HAVE_P11KIT
+#  include <p11-kit/p11-kit.h>
+#  include <p11-kit/uri.h>
+#endif
+
+#include "ask-password-api.h"
+#include "macro.h"
+#include "openssl-util.h"
+#include "time-util.h"
+
+bool pkcs11_uri_valid(const char *uri);
+
+#if HAVE_P11KIT
+
+extern char *(*sym_p11_kit_module_get_name)(CK_FUNCTION_LIST *module);
+extern void (*sym_p11_kit_modules_finalize_and_release)(CK_FUNCTION_LIST **modules);
+extern CK_FUNCTION_LIST **(*sym_p11_kit_modules_load_and_initialize)(int flags);
+extern const char *(*sym_p11_kit_strerror)(CK_RV rv);
+extern int (*sym_p11_kit_uri_format)(P11KitUri *uri, P11KitUriType uri_type, char **string);
+extern void (*sym_p11_kit_uri_free)(P11KitUri *uri);
+extern CK_ATTRIBUTE_PTR (*sym_p11_kit_uri_get_attributes)(P11KitUri *uri, CK_ULONG *n_attrs);
+extern CK_INFO_PTR (*sym_p11_kit_uri_get_module_info)(P11KitUri *uri);
+extern CK_SLOT_INFO_PTR (*sym_p11_kit_uri_get_slot_info)(P11KitUri *uri);
+extern CK_TOKEN_INFO_PTR (*sym_p11_kit_uri_get_token_info)(P11KitUri *uri);
+extern int (*sym_p11_kit_uri_match_token_info)(const P11KitUri *uri, const CK_TOKEN_INFO *token_info);
+extern const char *(*sym_p11_kit_uri_message)(int code);
+extern P11KitUri *(*sym_p11_kit_uri_new)(void);
+extern int (*sym_p11_kit_uri_parse)(const char *string, P11KitUriType uri_type, P11KitUri *uri);
+
+int uri_from_string(const char *p, P11KitUri **ret);
+
+P11KitUri *uri_from_module_info(const CK_INFO *info);
+P11KitUri *uri_from_slot_info(const CK_SLOT_INFO *slot_info);
+P11KitUri *uri_from_token_info(const CK_TOKEN_INFO *token_info);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(P11KitUri*, sym_p11_kit_uri_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(CK_FUNCTION_LIST**, sym_p11_kit_modules_finalize_and_release, NULL);
+
+CK_RV pkcs11_get_slot_list_malloc(CK_FUNCTION_LIST *m, CK_SLOT_ID **ret_slotids, CK_ULONG *ret_n_slotids);
+
+char *pkcs11_token_label(const CK_TOKEN_INFO *token_info);
+char *pkcs11_token_manufacturer_id(const CK_TOKEN_INFO *token_info);
+char *pkcs11_token_model(const CK_TOKEN_INFO *token_info);
+
+int pkcs11_token_login_by_pin(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, const CK_TOKEN_INFO *token_info, const char *token_label, const void *pin, size_t pin_size);
+int pkcs11_token_login(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_SLOT_ID slotid, const CK_TOKEN_INFO *token_info, const char *friendly_name, const char *icon_name, const char *key_name, const char *credential_name, usec_t until, AskPasswordFlags ask_password_flags, bool headless, char **ret_used_pin);
+
+int pkcs11_token_find_x509_certificate(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, P11KitUri *search_uri, CK_OBJECT_HANDLE *ret_object);
+#if HAVE_OPENSSL
+int pkcs11_token_read_x509_certificate(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_OBJECT_HANDLE object, X509 **ret_cert);
+#endif
+
+int pkcs11_token_find_private_key(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, P11KitUri *search_uri, CK_OBJECT_HANDLE *ret_object);
+int pkcs11_token_decrypt_data(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_OBJECT_HANDLE object, const void *encrypted_data, size_t encrypted_data_size, void **ret_decrypted_data, size_t *ret_decrypted_data_size);
+
+int pkcs11_token_acquire_rng(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session);
+
+typedef int (*pkcs11_find_token_callback_t)(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_SLOT_ID slotid, const CK_SLOT_INFO *slot_info, const CK_TOKEN_INFO *token_info, P11KitUri *uri, void *userdata);
+int pkcs11_find_token(const char *pkcs11_uri, pkcs11_find_token_callback_t callback, void *userdata);
+
+#if HAVE_OPENSSL
+int pkcs11_acquire_certificate(const char *uri, const char *askpw_friendly_name, const char *askpw_icon_name, X509 **ret_cert, char **ret_pin_used);
+#endif
+
+typedef struct {
+        const char *friendly_name;
+        usec_t until;
+        void *encrypted_key;
+        size_t encrypted_key_size;
+        void *decrypted_key;
+        size_t decrypted_key_size;
+        bool free_encrypted_key;
+        bool headless;
+        AskPasswordFlags askpw_flags;
+} pkcs11_crypt_device_callback_data;
+
+void pkcs11_crypt_device_callback_data_release(pkcs11_crypt_device_callback_data *data);
+
+int pkcs11_crypt_device_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata);
+
+int dlopen_p11kit(void);
+
+#else
+
+static inline int dlopen_p11kit(void) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "p11kit support is not compiled in.");
+}
+
+#endif + +typedef struct { + const char *friendly_name; + usec_t until; + bool headless; + AskPasswordFlags askpw_flags; +} systemd_pkcs11_plugin_params; + +int pkcs11_list_tokens(void); +int pkcs11_find_token_auto(char **ret); diff --git a/src/shared/plymouth-util.c b/src/shared/plymouth-util.c new file mode 100644 index 0000000..31ab340 --- /dev/null +++ b/src/shared/plymouth-util.c @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "io-util.h" +#include "plymouth-util.h" +#include "socket-util.h" + +int plymouth_connect(int flags) { + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "\0/org/freedesktop/plymouthd", + }; + _cleanup_close_ int fd = -EBADF; + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|flags, 0); + if (fd < 0) + return -errno; + + if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) + return -errno; + + return TAKE_FD(fd); +} + +int plymouth_send_raw(const void *raw, size_t size, int flags) { + _cleanup_close_ int fd = -EBADF; + + fd = plymouth_connect(flags); + if (fd < 0) + return fd; + + return loop_write(fd, raw, size); +} diff --git a/src/shared/plymouth-util.h b/src/shared/plymouth-util.h new file mode 100644 index 0000000..04aec70 --- /dev/null +++ b/src/shared/plymouth-util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "errno-util.h" + +int plymouth_connect(int flags); +int plymouth_send_raw(const void *raw, size_t size, int flags); + +static inline bool ERRNO_IS_NO_PLYMOUTH(int r) { + return IN_SET(abs(r), EAGAIN, ENOENT) || ERRNO_IS_DISCONNECT(r); +} diff --git a/src/shared/pretty-print.c b/src/shared/pretty-print.c new file mode 100644 index 0000000..2833063 --- /dev/null +++ b/src/shared/pretty-print.c @@ -0,0 +1,421 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "conf-files.h" +#include "constants.h" +#include 
"env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "pager.h" +#include "path-util.h" +#include "pretty-print.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +void draw_cylon(char buffer[], size_t buflen, unsigned width, unsigned pos) { + char *p = buffer; + + assert(buflen >= CYLON_BUFFER_EXTRA + width + 1); + assert(pos <= width+1); /* 0 or width+1 mean that the center light is behind the corner */ + + if (pos > 1) { + if (pos > 2) + p = mempset(p, ' ', pos-2); + if (log_get_show_color()) + p = stpcpy(p, ANSI_RED); + *p++ = '*'; + } + + if (pos > 0 && pos <= width) { + if (log_get_show_color()) + p = stpcpy(p, ANSI_HIGHLIGHT_RED); + *p++ = '*'; + } + + if (log_get_show_color()) + p = stpcpy(p, ANSI_NORMAL); + + if (pos < width) { + if (log_get_show_color()) + p = stpcpy(p, ANSI_RED); + *p++ = '*'; + if (pos < width-1) + p = mempset(p, ' ', width-1-pos); + if (log_get_show_color()) + p = stpcpy(p, ANSI_NORMAL); + } + + *p = '\0'; +} + +bool urlify_enabled(void) { +#if ENABLE_URLIFY + static int cached_urlify_enabled = -1; + + if (cached_urlify_enabled < 0) { + int val; + + val = getenv_bool("SYSTEMD_URLIFY"); + if (val >= 0) + cached_urlify_enabled = val; + else + cached_urlify_enabled = colors_enabled(); + } + + return cached_urlify_enabled; +#else + return 0; +#endif +} + +int terminal_urlify(const char *url, const char *text, char **ret) { + char *n; + + assert(url); + + /* Takes a URL and a pretty string and formats it as clickable link for the terminal. See + * https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda for details. 
*/ + + if (isempty(text)) + text = url; + + if (urlify_enabled()) + n = strjoin("\x1B]8;;", url, "\a", text, "\x1B]8;;\a"); + else + n = strdup(text); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int file_url_from_path(const char *path, char **ret) { + _cleanup_free_ char *absolute = NULL; + struct utsname u; + char *url = NULL; + int r; + + if (uname(&u) < 0) + return -errno; + + if (!path_is_absolute(path)) { + r = path_make_absolute_cwd(path, &absolute); + if (r < 0) + return r; + + path = absolute; + } + + /* As suggested by https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda, let's include the local + * hostname here. Note that we don't use gethostname_malloc() or gethostname_strict() since we are interested + * in the raw string the kernel has set, whatever it may be, under the assumption that terminals are not overly + * careful with validating the strings either. */ + + url = strjoin("file://", u.nodename, path); + if (!url) + return -ENOMEM; + + *ret = url; + return 0; +} + +int terminal_urlify_path(const char *path, const char *text, char **ret) { + _cleanup_free_ char *url = NULL; + int r; + + assert(path); + + /* Much like terminal_urlify() above, but takes a file system path as input + * and turns it into a proper file:// URL first. 
*/ + + if (isempty(path)) + return -EINVAL; + + if (isempty(text)) + text = path; + + if (!urlify_enabled()) { + char *n; + + n = strdup(text); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; + } + + r = file_url_from_path(path, &url); + if (r < 0) + return r; + + return terminal_urlify(url, text, ret); +} + +int terminal_urlify_man(const char *page, const char *section, char **ret) { + const char *url, *text; + + url = strjoina("man:", page, "(", section, ")"); + text = strjoina(page, "(", section, ") man page"); + + return terminal_urlify(url, text, ret); +} + +typedef enum { + LINE_SECTION, + LINE_COMMENT, + LINE_NORMAL, +} LineType; + +static LineType classify_line_type(const char *line, CatFlags flags) { + const char *t = skip_leading_chars(line, WHITESPACE); + + if ((flags & CAT_FORMAT_HAS_SECTIONS) && *t == '[') + return LINE_SECTION; + if (IN_SET(*t, '#', ';', '\0')) + return LINE_COMMENT; + return LINE_NORMAL; +} + +static int cat_file(const char *filename, bool newline, CatFlags flags) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *urlified = NULL, *section = NULL, *old_section = NULL; + int r; + + f = fopen(filename, "re"); + if (!f) + return -errno; + + r = terminal_urlify_path(filename, NULL, &urlified); + if (r < 0) + return r; + + printf("%s%s# %s%s\n", + newline ? "\n" : "", + ansi_highlight_blue(), + urlified, + ansi_normal()); + fflush(stdout); + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read \"%s\": %m", filename); + if (r == 0) + break; + + LineType line_type = classify_line_type(line, flags); + if (flags & CAT_TLDR) { + if (line_type == LINE_SECTION) { + /* The start of a section, let's not print it yet. 
*/ + free_and_replace(section, line); + continue; + } + + if (line_type == LINE_COMMENT) + continue; + + /* Before we print the actual line, print the last section header */ + if (section) { + /* Do not print redundant section headers */ + if (!streq_ptr(section, old_section)) + printf("%s%s%s\n", + ansi_highlight_cyan(), + section, + ansi_normal()); + + free_and_replace(old_section, section); + } + } + + printf("%s%s%s\n", + line_type == LINE_SECTION ? ansi_highlight_cyan() : + line_type == LINE_COMMENT ? ansi_highlight_grey() : + "", + line, + line_type != LINE_NORMAL ? ansi_normal() : ""); + } + + return 0; +} + +int cat_files(const char *file, char **dropins, CatFlags flags) { + int r; + + if (file) { + r = cat_file(file, /* newline= */ false, flags); + if (r < 0) + return log_warning_errno(r, "Failed to cat %s: %m", file); + } + + STRV_FOREACH(path, dropins) { + r = cat_file(*path, /* newline= */ file || path != dropins, flags); + if (r < 0) + return log_warning_errno(r, "Failed to cat %s: %m", *path); + } + + return 0; +} + +void print_separator(void) { + + /* Outputs a separator line that resolves to whitespace when copied from the terminal. We do that by outputting + * one line filled with spaces with ANSI underline set, followed by a second (empty) line. */ + + if (underline_enabled()) { + size_t i, c; + + c = columns(); + + flockfile(stdout); + fputs_unlocked(ANSI_UNDERLINE, stdout); + + for (i = 0; i < c; i++) + fputc_unlocked(' ', stdout); + + fputs_unlocked(ANSI_NORMAL "\n\n", stdout); + funlockfile(stdout); + } else + fputs("\n\n", stdout); +} + +static int guess_type(const char **name, char ***prefixes, bool *is_collection, const char **extension) { + /* Try to figure out if name is like tmpfiles.d/ or systemd/system-presets/, + * i.e. a collection of directories without a main config file. + * Incidentally, all those formats don't use sections. So we return a single + * is_collection boolean, which also means that the format doesn't use sections. 
+ */ + + _cleanup_free_ char *n = NULL; + bool usr = false, run = false, coll = false; + const char *ext = ".conf"; + /* This is static so that the array doesn't get deallocated when we exit the function */ + static const char* const std_prefixes[] = { CONF_PATHS(""), NULL }; + static const char* const usr_prefixes[] = { CONF_PATHS_USR(""), NULL }; + static const char* const run_prefixes[] = { "/run/", NULL }; + + if (path_equal(*name, "environment.d")) + /* Special case: we need to include /etc/environment in the search path, even + * though the whole concept is called environment.d. */ + *name = "environment"; + + n = strdup(*name); + if (!n) + return log_oom(); + + /* All systemd-style config files should support the /usr-/etc-/run split and + * dropins. Let's add a blanket rule that allows us to support them without keeping + * an explicit list. */ + if (path_startswith(n, "systemd") && endswith(n, ".conf")) + usr = true; + + delete_trailing_chars(n, "/"); + + if (endswith(n, ".d")) + coll = true; + + if (path_equal(n, "environment")) + usr = true; + + if (path_equal(n, "udev/hwdb.d")) + ext = ".hwdb"; + + if (path_equal(n, "udev/rules.d")) + ext = ".rules"; + + if (path_equal(n, "kernel/install.d")) + ext = ".install"; + + if (path_equal(n, "systemd/ntp-units.d")) { + coll = true; + ext = ".list"; + } + + if (path_equal(n, "systemd/relabel-extra.d")) { + coll = run = true; + ext = ".relabel"; + } + + if (PATH_IN_SET(n, "systemd/system-preset", "systemd/user-preset")) { + coll = true; + ext = ".preset"; + } + + if (path_equal(n, "systemd/user-preset")) + usr = true; + + *prefixes = (char**) (usr ? usr_prefixes : run ? 
run_prefixes : std_prefixes); + *is_collection = coll; + *extension = ext; + return 0; +} + +int conf_files_cat(const char *root, const char *name, CatFlags flags) { + _cleanup_strv_free_ char **dirs = NULL, **files = NULL; + _cleanup_free_ char *path = NULL; + char **prefixes = NULL; /* explicit initialization to appease gcc */ + bool is_collection; + const char *extension; + int r; + + r = guess_type(&name, &prefixes, &is_collection, &extension); + if (r < 0) + return r; + assert(prefixes); + assert(extension); + + STRV_FOREACH(prefix, prefixes) { + assert(endswith(*prefix, "/")); + r = strv_extendf(&dirs, "%s%s%s", *prefix, name, + is_collection ? "" : ".d"); + if (r < 0) + return log_error_errno(r, "Failed to build directory list: %m"); + } + + if (DEBUG_LOGGING) { + log_debug("Looking for configuration in:"); + if (!is_collection) + STRV_FOREACH(prefix, prefixes) + log_debug(" %s%s%s", strempty(root), *prefix, name); + + STRV_FOREACH(t, dirs) + log_debug(" %s%s/*%s", strempty(root), *t, extension); + } + + /* First locate the main config file, if any */ + if (!is_collection) { + STRV_FOREACH(prefix, prefixes) { + path = path_join(root, *prefix, name); + if (!path) + return log_oom(); + if (access(path, F_OK) == 0) + break; + path = mfree(path); + } + + if (!path) + printf("%s# Main configuration file %s not found%s\n", + ansi_highlight_magenta(), + name, + ansi_normal()); + } + + /* Then locate the drop-ins, if any */ + r = conf_files_list_strv(&files, extension, root, 0, (const char* const*) dirs); + if (r < 0) + return log_error_errno(r, "Failed to query file list: %m"); + + /* Show */ + if (is_collection) + flags |= CAT_FORMAT_HAS_SECTIONS; + + return cat_files(path, files, flags); +} diff --git a/src/shared/pretty-print.h b/src/shared/pretty-print.h new file mode 100644 index 0000000..c17e976 --- /dev/null +++ b/src/shared/pretty-print.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "glyph-util.h" +#include 
"terminal-util.h" + +#define CYLON_BUFFER_EXTRA (2*STRLEN(ANSI_RED) + STRLEN(ANSI_HIGHLIGHT_RED) + 2*STRLEN(ANSI_NORMAL)) + +void draw_cylon(char buffer[], size_t buflen, unsigned width, unsigned pos); + +void print_separator(void); + +int file_url_from_path(const char *path, char **ret); + +bool urlify_enabled(void); + +int terminal_urlify(const char *url, const char *text, char **ret); +int terminal_urlify_path(const char *path, const char *text, char **ret); +int terminal_urlify_man(const char *page, const char *section, char **ret); + +typedef enum CatFlags { + CAT_CONFIG_OFF = 0, + CAT_CONFIG_ON = 1 << 0, + CAT_FORMAT_HAS_SECTIONS = 1 << 1, /* Sections are meaningful for this file format */ + CAT_TLDR = 1 << 2, /* Only print comments and relevant section headers */ +} CatFlags; + +int cat_files(const char *file, char **dropins, CatFlags flags); +int conf_files_cat(const char *root, const char *name, CatFlags flags); + +#define RED_CROSS_MARK_MAX (STRLEN(ANSI_HIGHLIGHT_RED) + STRLEN("✗") + STRLEN(ANSI_NORMAL) + 1) +#define GREEN_CHECK_MARK_MAX (STRLEN(ANSI_HIGHLIGHT_GREEN) + STRLEN("✓") + STRLEN(ANSI_NORMAL) + 1) + +static inline const char *red_cross_mark_internal(char buffer[static RED_CROSS_MARK_MAX]) { + assert(buffer); + assert_se(stpcpy(stpcpy(stpcpy(buffer, ansi_highlight_red()), special_glyph(SPECIAL_GLYPH_CROSS_MARK)), ansi_normal()) < buffer + RED_CROSS_MARK_MAX); + return buffer; +} + +static inline const char *green_check_mark_internal(char buffer[static GREEN_CHECK_MARK_MAX]) { + assert(buffer); + assert_se(stpcpy(stpcpy(stpcpy(buffer, ansi_highlight_green()), special_glyph(SPECIAL_GLYPH_CHECK_MARK)), ansi_normal()) < buffer + GREEN_CHECK_MARK_MAX); + return buffer; +} + +#define RED_CROSS_MARK() red_cross_mark_internal((char[RED_CROSS_MARK_MAX]) {}) +#define GREEN_CHECK_MARK() green_check_mark_internal((char[GREEN_CHECK_MARK_MAX]) {}) + +#define COLOR_MARK_BOOL(b) ((b) ? 
GREEN_CHECK_MARK() : RED_CROSS_MARK()) diff --git a/src/shared/ptyfwd.c b/src/shared/ptyfwd.c new file mode 100644 index 0000000..195e603 --- /dev/null +++ b/src/shared/ptyfwd.c @@ -0,0 +1,677 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-event.h" + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "ptyfwd.h" +#include "terminal-util.h" +#include "time-util.h" + +struct PTYForward { + sd_event *event; + + int input_fd; + int output_fd; + int master; + + PTYForwardFlags flags; + + sd_event_source *stdin_event_source; + sd_event_source *stdout_event_source; + sd_event_source *master_event_source; + + sd_event_source *sigwinch_event_source; + + struct termios saved_stdin_attr; + struct termios saved_stdout_attr; + + bool close_input_fd:1; + bool close_output_fd:1; + + bool saved_stdin:1; + bool saved_stdout:1; + + bool stdin_readable:1; + bool stdin_hangup:1; + bool stdout_writable:1; + bool stdout_hangup:1; + bool master_readable:1; + bool master_writable:1; + bool master_hangup:1; + + bool read_from_master:1; + + bool done:1; + bool drain:1; + + bool last_char_set:1; + char last_char; + + char in_buffer[LINE_MAX], out_buffer[LINE_MAX]; + size_t in_buffer_full, out_buffer_full; + + usec_t escape_timestamp; + unsigned escape_counter; + + PTYForwardHandler handler; + void *userdata; +}; + +#define ESCAPE_USEC (1*USEC_PER_SEC) + +static void pty_forward_disconnect(PTYForward *f) { + + if (!f) + return; + + f->stdin_event_source = sd_event_source_unref(f->stdin_event_source); + f->stdout_event_source = sd_event_source_unref(f->stdout_event_source); + + f->master_event_source = sd_event_source_unref(f->master_event_source); + f->sigwinch_event_source = sd_event_source_unref(f->sigwinch_event_source); + f->event = sd_event_unref(f->event); + + if 
(f->output_fd >= 0) { + if (f->saved_stdout) + (void) tcsetattr(f->output_fd, TCSANOW, &f->saved_stdout_attr); + + /* STDIN/STDOUT should not be non-blocking normally, so let's reset it */ + (void) fd_nonblock(f->output_fd, false); + if (f->close_output_fd) + f->output_fd = safe_close(f->output_fd); + } + + if (f->input_fd >= 0) { + if (f->saved_stdin) + (void) tcsetattr(f->input_fd, TCSANOW, &f->saved_stdin_attr); + + (void) fd_nonblock(f->input_fd, false); + if (f->close_input_fd) + f->input_fd = safe_close(f->input_fd); + } + + f->saved_stdout = f->saved_stdin = false; +} + +static int pty_forward_done(PTYForward *f, int rcode) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + assert(f); + + if (f->done) + return 0; + + e = sd_event_ref(f->event); + + f->done = true; + pty_forward_disconnect(f); + + if (f->handler) + return f->handler(f, rcode, f->userdata); + else + return sd_event_exit(e, rcode < 0 ? EXIT_FAILURE : rcode); +} + +static bool look_for_escape(PTYForward *f, const char *buffer, size_t n) { + const char *p; + + assert(f); + assert(buffer); + assert(n > 0); + + for (p = buffer; p < buffer + n; p++) { + + /* Check for ^] */ + if (*p == 0x1D) { + usec_t nw = now(CLOCK_MONOTONIC); + + if (f->escape_counter == 0 || nw > f->escape_timestamp + ESCAPE_USEC) { + f->escape_timestamp = nw; + f->escape_counter = 1; + } else { + (f->escape_counter)++; + + if (f->escape_counter >= 3) + return true; + } + } else { + f->escape_timestamp = 0; + f->escape_counter = 0; + } + } + + return false; +} + +static bool ignore_vhangup(PTYForward *f) { + assert(f); + + if (f->flags & PTY_FORWARD_IGNORE_VHANGUP) + return true; + + if ((f->flags & PTY_FORWARD_IGNORE_INITIAL_VHANGUP) && !f->read_from_master) + return true; + + return false; +} + +static bool drained(PTYForward *f) { + int q = 0; + + assert(f); + + if (f->out_buffer_full > 0) + return false; + + if (f->master_readable) + return false; + + if (ioctl(f->master, TIOCINQ, &q) < 0) + log_debug_errno(errno, "TIOCINQ 
failed on master: %m"); + else if (q > 0) + return false; + + if (ioctl(f->master, TIOCOUTQ, &q) < 0) + log_debug_errno(errno, "TIOCOUTQ failed on master: %m"); + else if (q > 0) + return false; + + return true; +} + +static int shovel(PTYForward *f) { + ssize_t k; + + assert(f); + + while ((f->stdin_readable && f->in_buffer_full <= 0) || + (f->master_writable && f->in_buffer_full > 0) || + (f->master_readable && f->out_buffer_full <= 0) || + (f->stdout_writable && f->out_buffer_full > 0)) { + + if (f->stdin_readable && f->in_buffer_full < LINE_MAX) { + + k = read(f->input_fd, f->in_buffer + f->in_buffer_full, LINE_MAX - f->in_buffer_full); + if (k < 0) { + + if (errno == EAGAIN) + f->stdin_readable = false; + else if (errno == EIO || ERRNO_IS_DISCONNECT(errno)) { + f->stdin_readable = false; + f->stdin_hangup = true; + + f->stdin_event_source = sd_event_source_unref(f->stdin_event_source); + } else { + log_error_errno(errno, "read(): %m"); + return pty_forward_done(f, -errno); + } + } else if (k == 0) { + /* EOF on stdin */ + f->stdin_readable = false; + f->stdin_hangup = true; + + f->stdin_event_source = sd_event_source_unref(f->stdin_event_source); + } else { + /* Check if ^] has been pressed three times within one second. If we get this we quit + * immediately. 
*/ + if (look_for_escape(f, f->in_buffer + f->in_buffer_full, k)) + return pty_forward_done(f, -ECANCELED); + + f->in_buffer_full += (size_t) k; + } + } + + if (f->master_writable && f->in_buffer_full > 0) { + + k = write(f->master, f->in_buffer, f->in_buffer_full); + if (k < 0) { + + if (IN_SET(errno, EAGAIN, EIO)) + f->master_writable = false; + else if (IN_SET(errno, EPIPE, ECONNRESET)) { + f->master_writable = f->master_readable = false; + f->master_hangup = true; + + f->master_event_source = sd_event_source_unref(f->master_event_source); + } else { + log_error_errno(errno, "write(): %m"); + return pty_forward_done(f, -errno); + } + } else { + assert(f->in_buffer_full >= (size_t) k); + memmove(f->in_buffer, f->in_buffer + k, f->in_buffer_full - k); + f->in_buffer_full -= k; + } + } + + if (f->master_readable && f->out_buffer_full < LINE_MAX) { + + k = read(f->master, f->out_buffer + f->out_buffer_full, LINE_MAX - f->out_buffer_full); + if (k < 0) { + + /* Note that EIO on the master device + * might be caused by vhangup() or + * temporary closing of everything on + * the other side, we treat it like + * EAGAIN here and try again, unless + * ignore_vhangup is off. 
*/ + + if (errno == EAGAIN || (errno == EIO && ignore_vhangup(f))) + f->master_readable = false; + else if (IN_SET(errno, EPIPE, ECONNRESET, EIO)) { + f->master_readable = f->master_writable = false; + f->master_hangup = true; + + f->master_event_source = sd_event_source_unref(f->master_event_source); + } else { + log_error_errno(errno, "read(): %m"); + return pty_forward_done(f, -errno); + } + } else { + f->read_from_master = true; + f->out_buffer_full += (size_t) k; + } + } + + if (f->stdout_writable && f->out_buffer_full > 0) { + + k = write(f->output_fd, f->out_buffer, f->out_buffer_full); + if (k < 0) { + + if (errno == EAGAIN) + f->stdout_writable = false; + else if (errno == EIO || ERRNO_IS_DISCONNECT(errno)) { + f->stdout_writable = false; + f->stdout_hangup = true; + f->stdout_event_source = sd_event_source_unref(f->stdout_event_source); + } else { + log_error_errno(errno, "write(): %m"); + return pty_forward_done(f, -errno); + } + + } else { + + if (k > 0) { + f->last_char = f->out_buffer[k-1]; + f->last_char_set = true; + } + + assert(f->out_buffer_full >= (size_t) k); + memmove(f->out_buffer, f->out_buffer + k, f->out_buffer_full - k); + f->out_buffer_full -= k; + } + } + } + + if (f->stdin_hangup || f->stdout_hangup || f->master_hangup) { + /* Exit the loop if any side hung up and if there's + * nothing more to write or nothing we could write. */ + + if ((f->out_buffer_full <= 0 || f->stdout_hangup) && + (f->in_buffer_full <= 0 || f->master_hangup)) + return pty_forward_done(f, 0); + } + + /* If we were asked to drain, and there's nothing more to handle from the master, then call the callback + * too. 
*/ + if (f->drain && drained(f)) + return pty_forward_done(f, 0); + + return 0; +} + +static int on_master_event(sd_event_source *e, int fd, uint32_t revents, void *userdata) { + PTYForward *f = ASSERT_PTR(userdata); + + assert(e); + assert(e == f->master_event_source); + assert(fd >= 0); + assert(fd == f->master); + + if (revents & (EPOLLIN|EPOLLHUP)) + f->master_readable = true; + + if (revents & (EPOLLOUT|EPOLLHUP)) + f->master_writable = true; + + return shovel(f); +} + +static int on_stdin_event(sd_event_source *e, int fd, uint32_t revents, void *userdata) { + PTYForward *f = ASSERT_PTR(userdata); + + assert(e); + assert(e == f->stdin_event_source); + assert(fd >= 0); + assert(fd == f->input_fd); + + if (revents & (EPOLLIN|EPOLLHUP)) + f->stdin_readable = true; + + return shovel(f); +} + +static int on_stdout_event(sd_event_source *e, int fd, uint32_t revents, void *userdata) { + PTYForward *f = ASSERT_PTR(userdata); + + assert(e); + assert(e == f->stdout_event_source); + assert(fd >= 0); + assert(fd == f->output_fd); + + if (revents & (EPOLLOUT|EPOLLHUP)) + f->stdout_writable = true; + + return shovel(f); +} + +static int on_sigwinch_event(sd_event_source *e, const struct signalfd_siginfo *si, void *userdata) { + PTYForward *f = ASSERT_PTR(userdata); + struct winsize ws; + + assert(e); + assert(e == f->sigwinch_event_source); + + /* The window size changed, let's forward that. 
*/ + if (ioctl(f->output_fd, TIOCGWINSZ, &ws) >= 0) + (void) ioctl(f->master, TIOCSWINSZ, &ws); + + return 0; +} + +int pty_forward_new( + sd_event *event, + int master, + PTYForwardFlags flags, + PTYForward **ret) { + + _cleanup_(pty_forward_freep) PTYForward *f = NULL; + struct winsize ws; + int r; + + f = new(PTYForward, 1); + if (!f) + return -ENOMEM; + + *f = (struct PTYForward) { + .flags = flags, + .master = -EBADF, + .input_fd = -EBADF, + .output_fd = -EBADF, + }; + + if (event) + f->event = sd_event_ref(event); + else { + r = sd_event_default(&f->event); + if (r < 0) + return r; + } + + if (FLAGS_SET(flags, PTY_FORWARD_READ_ONLY)) + f->output_fd = STDOUT_FILENO; + else { + /* If we shall be invoked in interactive mode, let's switch on non-blocking mode, so that we + * never end up starving one direction while we block on the other. However, let's be careful + * here and not turn on O_NONBLOCK for stdin/stdout directly, but on re-opened copies of + * them. This has two advantages: when we are killed abruptly the stdin/stdout fds won't be + * left in O_NONBLOCK state for the next process using them. In addition, if some process + * running in the background wants to continue writing to our stdout it can do so without + * being confused by O_NONBLOCK. 
*/ + + f->input_fd = fd_reopen(STDIN_FILENO, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (f->input_fd < 0) { + /* Handle failures gracefully, after all certain fd types cannot be reopened + * (sockets, …) */ + log_debug_errno(f->input_fd, "Failed to reopen stdin, using original fd: %m"); + + r = fd_nonblock(STDIN_FILENO, true); + if (r < 0) + return r; + + f->input_fd = STDIN_FILENO; + } else + f->close_input_fd = true; + + f->output_fd = fd_reopen(STDOUT_FILENO, O_WRONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); + if (f->output_fd < 0) { + log_debug_errno(f->output_fd, "Failed to reopen stdout, using original fd: %m"); + + r = fd_nonblock(STDOUT_FILENO, true); + if (r < 0) + return r; + + f->output_fd = STDOUT_FILENO; + } else + f->close_output_fd = true; + } + + r = fd_nonblock(master, true); + if (r < 0) + return r; + + f->master = master; + + if (ioctl(f->output_fd, TIOCGWINSZ, &ws) < 0) + /* If we can't get the resolution from the output fd, then use our internal, regular width/height, + * i.e. something derived from $COLUMNS and $LINES if set. 
*/ + ws = (struct winsize) { + .ws_row = lines(), + .ws_col = columns(), + }; + + (void) ioctl(master, TIOCSWINSZ, &ws); + + if (!(flags & PTY_FORWARD_READ_ONLY)) { + assert(f->input_fd >= 0); + + if (tcgetattr(f->input_fd, &f->saved_stdin_attr) >= 0) { + struct termios raw_stdin_attr; + + f->saved_stdin = true; + + raw_stdin_attr = f->saved_stdin_attr; + cfmakeraw(&raw_stdin_attr); + raw_stdin_attr.c_oflag = f->saved_stdin_attr.c_oflag; + tcsetattr(f->input_fd, TCSANOW, &raw_stdin_attr); + } + + if (tcgetattr(f->output_fd, &f->saved_stdout_attr) >= 0) { + struct termios raw_stdout_attr; + + f->saved_stdout = true; + + raw_stdout_attr = f->saved_stdout_attr; + cfmakeraw(&raw_stdout_attr); + raw_stdout_attr.c_iflag = f->saved_stdout_attr.c_iflag; + raw_stdout_attr.c_lflag = f->saved_stdout_attr.c_lflag; + tcsetattr(f->output_fd, TCSANOW, &raw_stdout_attr); + } + + r = sd_event_add_io(f->event, &f->stdin_event_source, f->input_fd, EPOLLIN|EPOLLET, on_stdin_event, f); + if (r < 0 && r != -EPERM) + return r; + + if (r >= 0) + (void) sd_event_source_set_description(f->stdin_event_source, "ptyfwd-stdin"); + } + + r = sd_event_add_io(f->event, &f->stdout_event_source, f->output_fd, EPOLLOUT|EPOLLET, on_stdout_event, f); + if (r == -EPERM) + /* stdout without epoll support. Likely redirected to regular file. 
*/ + f->stdout_writable = true; + else if (r < 0) + return r; + else + (void) sd_event_source_set_description(f->stdout_event_source, "ptyfwd-stdout"); + + r = sd_event_add_io(f->event, &f->master_event_source, master, EPOLLIN|EPOLLOUT|EPOLLET, on_master_event, f); + if (r < 0) + return r; + + (void) sd_event_source_set_description(f->master_event_source, "ptyfwd-master"); + + r = sd_event_add_signal(f->event, &f->sigwinch_event_source, SIGWINCH, on_sigwinch_event, f); + if (r < 0) + return r; + + (void) sd_event_source_set_description(f->sigwinch_event_source, "ptyfwd-sigwinch"); + + *ret = TAKE_PTR(f); + + return 0; +} + +PTYForward *pty_forward_free(PTYForward *f) { + pty_forward_disconnect(f); + return mfree(f); +} + +int pty_forward_get_last_char(PTYForward *f, char *ch) { + assert(f); + assert(ch); + + if (!f->last_char_set) + return -ENXIO; + + *ch = f->last_char; + return 0; +} + +int pty_forward_set_ignore_vhangup(PTYForward *f, bool b) { + int r; + + assert(f); + + if (!!(f->flags & PTY_FORWARD_IGNORE_VHANGUP) == b) + return 0; + + SET_FLAG(f->flags, PTY_FORWARD_IGNORE_VHANGUP, b); + + if (!ignore_vhangup(f)) { + + /* We shall now react to vhangup()s? Let's check + * immediately if we might be in one */ + + f->master_readable = true; + r = shovel(f); + if (r < 0) + return r; + } + + return 0; +} + +bool pty_forward_get_ignore_vhangup(PTYForward *f) { + assert(f); + + return !!(f->flags & PTY_FORWARD_IGNORE_VHANGUP); +} + +bool pty_forward_is_done(PTYForward *f) { + assert(f); + + return f->done; +} + +void pty_forward_set_handler(PTYForward *f, PTYForwardHandler cb, void *userdata) { + assert(f); + + f->handler = cb; + f->userdata = userdata; +} + +bool pty_forward_drain(PTYForward *f) { + assert(f); + + /* Starts draining the forwarder. 
Specifically: + * + * - Returns true if there are no unprocessed bytes from the pty, false otherwise + * + * - Makes sure the handler function is called the next time the number of unprocessed bytes hits zero + */ + + f->drain = true; + return drained(f); +} + +int pty_forward_set_priority(PTYForward *f, int64_t priority) { + int r; + assert(f); + + if (f->stdin_event_source) { + r = sd_event_source_set_priority(f->stdin_event_source, priority); + if (r < 0) + return r; + } + + r = sd_event_source_set_priority(f->stdout_event_source, priority); + if (r < 0) + return r; + + r = sd_event_source_set_priority(f->master_event_source, priority); + if (r < 0) + return r; + + r = sd_event_source_set_priority(f->sigwinch_event_source, priority); + if (r < 0) + return r; + + return 0; +} + +int pty_forward_set_width_height(PTYForward *f, unsigned width, unsigned height) { + struct winsize ws; + + assert(f); + + if (width == UINT_MAX && height == UINT_MAX) + return 0; /* noop */ + + if (width != UINT_MAX && + (width == 0 || width > USHRT_MAX)) + return -ERANGE; + + if (height != UINT_MAX && + (height == 0 || height > USHRT_MAX)) + return -ERANGE; + + if (width == UINT_MAX || height == UINT_MAX) { + if (ioctl(f->master, TIOCGWINSZ, &ws) < 0) + return -errno; + + if (width != UINT_MAX) + ws.ws_col = width; + if (height != UINT_MAX) + ws.ws_row = height; + } else + ws = (struct winsize) { + .ws_row = height, + .ws_col = width, + }; + + if (ioctl(f->master, TIOCSWINSZ, &ws) < 0) + return -errno; + + /* Make sure we ignore SIGWINCH window size events from now on */ + f->sigwinch_event_source = sd_event_source_unref(f->sigwinch_event_source); + + return 0; +} diff --git a/src/shared/ptyfwd.h b/src/shared/ptyfwd.h new file mode 100644 index 0000000..f0ae6e9 --- /dev/null +++ b/src/shared/ptyfwd.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-event.h" + +#include "macro.h" + +typedef struct PTYForward PTYForward; + 
+typedef enum PTYForwardFlags { + PTY_FORWARD_READ_ONLY = 1, + + /* Continue reading after hangup? */ + PTY_FORWARD_IGNORE_VHANGUP = 2, + + /* Continue reading after hangup but only if we never read anything else? */ + PTY_FORWARD_IGNORE_INITIAL_VHANGUP = 4, +} PTYForwardFlags; + +typedef int (*PTYForwardHandler)(PTYForward *f, int rcode, void *userdata); + +int pty_forward_new(sd_event *event, int master, PTYForwardFlags flags, PTYForward **f); +PTYForward *pty_forward_free(PTYForward *f); + +int pty_forward_get_last_char(PTYForward *f, char *ch); + +int pty_forward_set_ignore_vhangup(PTYForward *f, bool ignore_vhangup); +bool pty_forward_get_ignore_vhangup(PTYForward *f); + +bool pty_forward_is_done(PTYForward *f); + +void pty_forward_set_handler(PTYForward *f, PTYForwardHandler handler, void *userdata); + +bool pty_forward_drain(PTYForward *f); + +int pty_forward_set_priority(PTYForward *f, int64_t priority); + +int pty_forward_set_width_height(PTYForward *f, unsigned width, unsigned height); + +DEFINE_TRIVIAL_CLEANUP_FUNC(PTYForward*, pty_forward_free); diff --git a/src/shared/qrcode-util.c b/src/shared/qrcode-util.c new file mode 100644 index 0000000..b0dd90a --- /dev/null +++ b/src/shared/qrcode-util.c @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "qrcode-util.h" + +#if HAVE_QRENCODE +#include + +#include "dlfcn-util.h" +#include "locale-util.h" +#include "log.h" +#include "strv.h" +#include "terminal-util.h" + +#define ANSI_WHITE_ON_BLACK "\033[40;37;1m" +#define UNICODE_FULL_BLOCK u8"█" +#define UNICODE_LOWER_HALF_BLOCK u8"▄" +#define UNICODE_UPPER_HALF_BLOCK u8"▀" + +static void *qrcode_dl = NULL; + +static QRcode* (*sym_QRcode_encodeString)(const char *string, int version, QRecLevel level, QRencodeMode hint, int casesensitive) = NULL; +static void (*sym_QRcode_free)(QRcode *qrcode) = NULL; + +int dlopen_qrencode(void) { + int r; + + FOREACH_STRING(s, "libqrencode.so.4", "libqrencode.so.3") { + r = dlopen_many_sym_or_warn( 
+ &qrcode_dl, s, LOG_DEBUG, + DLSYM_ARG(QRcode_encodeString), + DLSYM_ARG(QRcode_free)); + if (r >= 0) + break; + } + + return r; +} + +static void print_border(FILE *output, unsigned width, unsigned row, unsigned column) { + assert(output); + assert(width); + + if (row != UINT_MAX && column != UINT_MAX) { + int r, fd; + + fd = fileno(output); + if (fd < 0) + return (void)log_debug_errno(errno, "Failed to get file descriptor from the file stream: %m"); + + r = set_terminal_cursor_position(fd, row, column); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + + /* Four rows of border */ + for (unsigned y = 0; y < 4; y += 2) { + fputs(ANSI_WHITE_ON_BLACK, output); + + for (unsigned x = 0; x < 4 + width + 4; x++) + fputs(UNICODE_FULL_BLOCK, output); + + fputs(ANSI_NORMAL "\n", output); + r = set_terminal_cursor_position(fd, row + 1, column); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + } + } else { + /* Four rows of border */ + for (unsigned y = 0; y < 4; y += 2) { + fputs(ANSI_WHITE_ON_BLACK, output); + + for (unsigned x = 0; x < 4 + width + 4; x++) + fputs(UNICODE_FULL_BLOCK, output); + + fputs(ANSI_NORMAL "\n", output); + } + } +} + +static void write_qrcode(FILE *output, QRcode *qr, unsigned int row, unsigned int column) { + assert(qr); + + if (!output) + output = stdout; + + print_border(output, qr->width, row, column); + + if (row != UINT_MAX && column != UINT_MAX) { + /* After printing two rows of top border, we need to move the cursor down two rows before starting to print the actual QR code */ + int r, fd, move_down = 2; + fd = fileno(output); + if (fd < 0) + return (void)log_debug_errno(errno, "Failed to get file descriptor from the file stream: %m"); + + r = set_terminal_cursor_position(fd, row + move_down, column); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + + for (unsigned y = 0; y < (unsigned) qr->width; y += 
2) { + const uint8_t *row1 = qr->data + qr->width * y; + const uint8_t *row2 = row1 + qr->width; + + fputs(ANSI_WHITE_ON_BLACK, output); + + for (unsigned x = 0; x < 4; x++) + fputs(UNICODE_FULL_BLOCK, output); + + for (unsigned x = 0; x < (unsigned) qr->width; x++) { + bool a, b; + + a = row1[x] & 1; + b = (y+1) < (unsigned) qr->width ? (row2[x] & 1) : false; + + if (a && b) + fputc(' ', output); + else if (a) + fputs(UNICODE_LOWER_HALF_BLOCK, output); + else if (b) + fputs(UNICODE_UPPER_HALF_BLOCK, output); + else + fputs(UNICODE_FULL_BLOCK, output); + } + + for (unsigned x = 0; x < 4; x++) + fputs(UNICODE_FULL_BLOCK, output); + r = set_terminal_cursor_position(fd, row + move_down, column); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + move_down += 1; + fputs(ANSI_NORMAL "\n", output); + } + + print_border(output, qr->width, row + move_down, column); + } else { + + for (unsigned y = 0; y < (unsigned) qr->width; y += 2) { + const uint8_t *row1 = qr->data + qr->width * y; + const uint8_t *row2 = row1 + qr->width; + + fputs(ANSI_WHITE_ON_BLACK, output); + for (unsigned x = 0; x < 4; x++) + fputs(UNICODE_FULL_BLOCK, output); + + for (unsigned x = 0; x < (unsigned) qr->width; x++) { + bool a, b; + + a = row1[x] & 1; + b = (y+1) < (unsigned) qr->width ? 
(row2[x] & 1) : false; + + if (a && b) + fputc(' ', output); + else if (a) + fputs(UNICODE_LOWER_HALF_BLOCK, output); + else if (b) + fputs(UNICODE_UPPER_HALF_BLOCK, output); + else + fputs(UNICODE_FULL_BLOCK, output); + } + + for (unsigned x = 0; x < 4; x++) + fputs(UNICODE_FULL_BLOCK, output); + fputs(ANSI_NORMAL "\n", output); + } + + print_border(output, qr->width, row, column); + } + + fflush(output); +} + +int print_qrcode_full(FILE *out, const char *header, const char *string, unsigned row, unsigned column, unsigned tty_width, unsigned tty_height) { + QRcode* qr; + int r; + + /* If this is not a UTF-8 system or ANSI colors aren't supported/disabled don't print any QR + * codes */ + if (!is_locale_utf8() || !colors_enabled()) + return -EOPNOTSUPP; + + r = dlopen_qrencode(); + if (r < 0) + return r; + + qr = sym_QRcode_encodeString(string, 0, QR_ECLEVEL_L, QR_MODE_8, 1); + if (!qr) + return -ENOMEM; + + if (row != UINT_MAX && column != UINT_MAX) { + int fd; + unsigned qr_code_width, qr_code_height; + fd = fileno(out); + if (fd < 0) + return log_debug_errno(errno, "Failed to get file descriptor from the file stream: %m"); + qr_code_width = qr_code_height = qr->width + 8; + + if (column + qr_code_width > tty_width) + column = tty_width - qr_code_width; + + /* Terminal characters are twice as high as they are wide so it's qr_code_height / 2, + * our QR code prints an extra new line, so we have -1 as well */ + if (row + qr_code_height > tty_height) + row = tty_height - (qr_code_height / 2 ) - 1; + + if (header) { + r = set_terminal_cursor_position(fd, row - 2, tty_width - qr_code_width - 2); + if (r < 0) + log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m"); + + fprintf(out, "%s:\n\n", header); + } + } else + if (header) + fprintf(out, "\n%s:\n\n", header); + + write_qrcode(out, qr, row, column); + + fputc('\n', out); + + sym_QRcode_free(qr); + return 0; +} +#endif diff --git a/src/shared/qrcode-util.h b/src/shared/qrcode-util.h new file 
mode 100644 index 0000000..ee58294 --- /dev/null +++ b/src/shared/qrcode-util.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once +#include +#include +#include + +#if HAVE_QRENCODE +int dlopen_qrencode(void); + +int print_qrcode_full(FILE *out, const char *header, const char *string, unsigned row, unsigned column, unsigned tty_width, unsigned tty_height); +/* Convenience wrapper around print_qrcode_full() that passes UINT_MAX for row, column, tty_width and tty_height */ static inline int print_qrcode(FILE *out, const char *header, const char *string) { + return print_qrcode_full(out, header, string, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); +} +#else +/* Built without HAVE_QRENCODE: both entry points are stubs that always return -EOPNOTSUPP */ static inline int print_qrcode_full(FILE *out, const char *header, const char *string, unsigned row, unsigned column, unsigned tty_width, unsigned tty_height) { + return -EOPNOTSUPP; +} +static inline int print_qrcode(FILE *out, const char *header, const char *string) { + return -EOPNOTSUPP; +} +#endif diff --git a/src/shared/quota-util.c b/src/shared/quota-util.c new file mode 100644 index 0000000..4d014f8 --- /dev/null +++ b/src/shared/quota-util.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "device-util.h" +#include "quota-util.h" +/* Looks up the device node for 'devnum' (as S_IFBLK) via devname_from_devnum() and invokes quotactl() on it. Returns 0 on success, the negative error of devname_from_devnum(), or -errno if quotactl() fails. */ +int quotactl_devnum(int cmd, dev_t devnum, int id, void *addr) { + _cleanup_free_ char *devnode = NULL; + int r; + + /* Like quotactl() but takes a dev_t instead of a path to a device node, and fixes caddr_t → void*, + * like we should, today */ + + r = devname_from_devnum(S_IFBLK, devnum, &devnode); + if (r < 0) + return r; + + if (quotactl(cmd, devnode, id, addr) < 0) + return -errno; + + return 0; +} +/* Determines the device number for 'path' via get_block_device() and forwards to quotactl_devnum(). Returns the negative error of get_block_device(), -ENODEV if the reported device number is 0, or otherwise the result of quotactl_devnum(). */ +int quotactl_path(int cmd, const char *path, int id, void *addr) { + dev_t devno; + int r; + + /* Like quotactl() but takes a path to some fs object, and changes the backing file system. I.e. 
the + * argument shouldn't be a block device but a regular file system object */ + + r = get_block_device(path, &devno); + if (r < 0) + return r; + if (devno == 0) /* Doesn't have a block device */ + return -ENODEV; + + return quotactl_devnum(cmd, devno, id, addr); +} diff --git a/src/shared/quota-util.h b/src/shared/quota-util.h new file mode 100644 index 0000000..14a390e --- /dev/null +++ b/src/shared/quota-util.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +/* Wrapper around the QCMD() macro of linux/quota.h that removes some undefined behaviour. A typical quota + * command such as QCMD(Q_GETQUOTA, USRQUOTA) cannot be resolved on platforms where "int" is 32-bit, as it is + * larger than INT_MAX. Yikes, because that are basically all platforms Linux supports. Let's add a wrapper + * that explicitly takes its arguments as unsigned 32-bit, and then converts the shift result explicitly to + * int, acknowledging the undefined behaviour of the kernel headers. This doesn't remove the undefined + * behaviour, but it stops ubsan from complaining about it. 
*/ +static inline int QCMD_FIXED(uint32_t cmd, uint32_t type) { + return (int) QCMD(cmd, type); +} + +int quotactl_devnum(int cmd, dev_t devnum, int id, void *addr); +int quotactl_path(int cmd, const char *path, int id, void *addr); diff --git a/src/shared/reboot-util.c b/src/shared/reboot-util.c new file mode 100644 index 0000000..62ff697 --- /dev/null +++ b/src/shared/reboot-util.c @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#if HAVE_XENCTRL +#define __XEN_INTERFACE_VERSION__ 0x00040900 +#include +#include +#include +#endif + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "proc-cmdline.h" +#include "raw-reboot.h" +#include "reboot-util.h" +#include "string-util.h" +#include "umask-util.h" +#include "virt.h" + +int update_reboot_parameter_and_warn(const char *parameter, bool keep) { + int r; + + if (isempty(parameter)) { + if (keep) + return 0; + + if (unlink("/run/systemd/reboot-param") < 0) { + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, "Failed to unlink reboot parameter file: %m"); + } + + return 0; + } + + WITH_UMASK(0022) { + r = write_string_file("/run/systemd/reboot-param", parameter, + WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC); + if (r < 0) + return log_warning_errno(r, "Failed to write reboot parameter file: %m"); + } + + return 0; +} + +int read_reboot_parameter(char **parameter) { + int r; + + assert(parameter); + + r = read_one_line_file("/run/systemd/reboot-param", parameter); + if (r < 0 && r != -ENOENT) + return log_debug_errno(r, "Failed to read /run/systemd/reboot-param: %m"); + + return 0; +} + +int reboot_with_parameter(RebootFlags flags) { + int r; + + /* Reboots the system with a parameter that is read from /run/systemd/reboot-param. Returns 0 if + * REBOOT_DRY_RUN was set and the actual reboot operation was hence skipped. 
If REBOOT_FALLBACK is + * set and the reboot with parameter doesn't work out a fallback to classic reboot() is attempted. If + * REBOOT_FALLBACK is not set, 0 is returned instead, which should be considered indication for the + * caller to fall back to reboot() on its own, or somehow else deal with this. If REBOOT_LOG is + * specified will log about what it is going to do, as well as all errors. */ + + if (detect_container() == 0) { + _cleanup_free_ char *parameter = NULL; + + r = read_one_line_file("/run/systemd/reboot-param", &parameter); + if (r < 0 && r != -ENOENT) + log_full_errno(flags & REBOOT_LOG ? LOG_WARNING : LOG_DEBUG, r, + "Failed to read reboot parameter file, ignoring: %m"); + + if (!isempty(parameter)) { + log_full(flags & REBOOT_LOG ? LOG_INFO : LOG_DEBUG, + "Rebooting with argument '%s'.", parameter); + + if (flags & REBOOT_DRY_RUN) + return 0; + + (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, parameter); + + log_full_errno(flags & REBOOT_LOG ? LOG_WARNING : LOG_DEBUG, errno, + "Failed to reboot with parameter, retrying without: %m"); + } + } + + if (!(flags & REBOOT_FALLBACK)) + return 0; + + log_full(flags & REBOOT_LOG ? LOG_INFO : LOG_DEBUG, "Rebooting."); + + if (flags & REBOOT_DRY_RUN) + return 0; + + (void) reboot(RB_AUTOBOOT); + + return log_full_errno(flags & REBOOT_LOG ? LOG_ERR : LOG_DEBUG, errno, "Failed to reboot: %m"); +} + +bool shall_restore_state(void) { + static int cached = -1; + bool b = true; /* If nothing specified or the check fails, then defaults to true. 
*/ + int r; + + if (cached >= 0) + return cached; + + r = proc_cmdline_get_bool("systemd.restore_state", PROC_CMDLINE_TRUE_WHEN_MISSING, &b); + if (r < 0) + log_debug_errno(r, "Failed to parse systemd.restore_state= kernel command line option, ignoring: %m"); + + return (cached = b); +} + +static int xen_kexec_loaded(void) { +#if HAVE_XENCTRL + _cleanup_close_ int privcmd_fd = -EBADF, buf_fd = -EBADF; + xen_kexec_status_t *buffer; + size_t size; + int r; + + if (access("/proc/xen", F_OK) < 0) { + if (errno == ENOENT) + return -EOPNOTSUPP; + return log_debug_errno(errno, "Unable to test whether /proc/xen exists: %m"); + } + + size = page_size(); + if (sizeof(xen_kexec_status_t) > size) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "page_size is too small for hypercall"); + + privcmd_fd = open("/dev/xen/privcmd", O_RDWR|O_CLOEXEC); + if (privcmd_fd < 0) + return log_debug_errno(errno, "Cannot access /dev/xen/privcmd: %m"); + + buf_fd = open("/dev/xen/hypercall", O_RDWR|O_CLOEXEC); + if (buf_fd < 0) + return log_debug_errno(errno, "Cannot access /dev/xen/hypercall: %m"); + + buffer = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, buf_fd, 0); + if (buffer == MAP_FAILED) + return log_debug_errno(errno, "Cannot allocate buffer for hypercall: %m"); + + *buffer = (xen_kexec_status_t) { + .type = KEXEC_TYPE_DEFAULT, + }; + + privcmd_hypercall_t call = { + .op = __HYPERVISOR_kexec_op, + .arg = { + KEXEC_CMD_kexec_status, + PTR_TO_UINT64(buffer), + }, + }; + + r = RET_NERRNO(ioctl(privcmd_fd, IOCTL_PRIVCMD_HYPERCALL, &call)); + if (r < 0) + log_debug_errno(r, "kexec_status failed: %m"); + + munmap(buffer, size); + + return r; +#else + return -EOPNOTSUPP; +#endif +} + +bool kexec_loaded(void) { + _cleanup_free_ char *s = NULL; + int r; + + r = xen_kexec_loaded(); + if (r >= 0) + return r; + + r = read_one_line_file("/sys/kernel/kexec_loaded", &s); + if (r < 0) { + if (r != -ENOENT) + log_debug_errno(r, "Unable to read /sys/kernel/kexec_loaded, ignoring: %m"); + return 
false; + } + + return s[0] == '1'; +} diff --git a/src/shared/reboot-util.h b/src/shared/reboot-util.h new file mode 100644 index 0000000..ccd15c7 --- /dev/null +++ b/src/shared/reboot-util.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int update_reboot_parameter_and_warn(const char *parameter, bool keep); + +typedef enum RebootFlags { + REBOOT_LOG = 1 << 0, /* log about what we are going to do and all errors */ + REBOOT_DRY_RUN = 1 << 1, /* return 0 right before actually doing the reboot */ + REBOOT_FALLBACK = 1 << 2, /* fall back to plain reboot() if argument-based reboot doesn't work, isn't configured or doesn't apply otherwise */ +} RebootFlags; + +int read_reboot_parameter(char **parameter); +int reboot_with_parameter(RebootFlags flags); + +bool shall_restore_state(void); + +bool kexec_loaded(void); diff --git a/src/shared/recovery-key.c b/src/shared/recovery-key.c new file mode 100644 index 0000000..6a2f4d0 --- /dev/null +++ b/src/shared/recovery-key.c @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "memory-util.h" +#include "random-util.h" +#include "recovery-key.h" + +const char modhex_alphabet[16] = { + 'c', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'r', 't', 'u', 'v' +}; + +int decode_modhex_char(char x) { + + for (size_t i = 0; i < ELEMENTSOF(modhex_alphabet); i++) + /* Check both upper and lowercase */ + if (modhex_alphabet[i] == x || (modhex_alphabet[i] - 32) == x) + return i; + + return -EINVAL; +} + +int normalize_recovery_key(const char *password, char **ret) { + _cleanup_(erase_and_freep) char *mangled = NULL; + size_t l; + + assert(password); + assert(ret); + + l = strlen(password); + if (!IN_SET(l, + RECOVERY_KEY_MODHEX_RAW_LENGTH*2, /* syntax without dashes */ + RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1)) /* syntax with dashes */ + return -EINVAL; + + mangled = new(char, RECOVERY_KEY_MODHEX_FORMATTED_LENGTH); + if (!mangled) + return -ENOMEM; + + for (size_t i = 0, 
j = 0; i < RECOVERY_KEY_MODHEX_RAW_LENGTH; i++) { + size_t k; + int a, b; + + if (l == RECOVERY_KEY_MODHEX_RAW_LENGTH*2) + /* Syntax without dashes */ + k = i * 2; + else { + /* Syntax with dashes */ + assert(l == RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1); + k = i * 2 + i / 4; + + if (i > 0 && i % 4 == 0 && password[k-1] != '-') + return -EINVAL; + } + + a = decode_modhex_char(password[k]); + if (a < 0) + return -EINVAL; + b = decode_modhex_char(password[k+1]); + if (b < 0) + return -EINVAL; + + mangled[j++] = modhex_alphabet[a]; + mangled[j++] = modhex_alphabet[b]; + + if (i % 4 == 3) + mangled[j++] = '-'; + } + + mangled[RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1] = 0; + + *ret = TAKE_PTR(mangled); + return 0; +} + +int make_recovery_key(char **ret) { + _cleanup_(erase_and_freep) char *formatted = NULL; + _cleanup_(erase_and_freep) uint8_t *key = NULL; + size_t j = 0; + int r; + + assert(ret); + + key = new(uint8_t, RECOVERY_KEY_MODHEX_RAW_LENGTH); + if (!key) + return -ENOMEM; + + r = crypto_random_bytes(key, RECOVERY_KEY_MODHEX_RAW_LENGTH); + if (r < 0) + return r; + + /* Let's now format it as 64 modhex chars, and after each 8 chars insert a dash */ + formatted = new(char, RECOVERY_KEY_MODHEX_FORMATTED_LENGTH); + if (!formatted) + return -ENOMEM; + + for (size_t i = 0; i < RECOVERY_KEY_MODHEX_RAW_LENGTH; i++) { + formatted[j++] = modhex_alphabet[key[i] >> 4]; + formatted[j++] = modhex_alphabet[key[i] & 0xF]; + + if (i % 4 == 3) + formatted[j++] = '-'; + } + + assert(j == RECOVERY_KEY_MODHEX_FORMATTED_LENGTH); + assert(formatted[RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1] == '-'); + formatted[RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1] = 0; /* replace final dash with a NUL */ + + *ret = TAKE_PTR(formatted); + return 0; +} diff --git a/src/shared/recovery-key.h b/src/shared/recovery-key.h new file mode 100644 index 0000000..68e8051 --- /dev/null +++ b/src/shared/recovery-key.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/* 256 bit keys 
= 32 bytes */ +#define RECOVERY_KEY_MODHEX_RAW_LENGTH 32 + +/* Formatted as sequences of 64 modhex characters, with dashes inserted after multiples of 8 chars (incl. trailing NUL) */ +#define RECOVERY_KEY_MODHEX_FORMATTED_LENGTH (RECOVERY_KEY_MODHEX_RAW_LENGTH*2/8*9) + +int make_recovery_key(char **ret); + +extern const char modhex_alphabet[16]; + +int decode_modhex_char(char x); + +int normalize_recovery_key(const char *password, char **ret); diff --git a/src/shared/resize-fs.c b/src/shared/resize-fs.c new file mode 100644 index 0000000..178aefa --- /dev/null +++ b/src/shared/resize-fs.c @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "blockdev-util.h" +#include "fs-util.h" +#include "missing_fs.h" +#include "missing_magic.h" +#include "missing_xfs.h" +#include "resize-fs.h" +#include "stat-util.h" + +int resize_fs(int fd, uint64_t sz, uint64_t *ret_size) { + struct statfs sfs; + + assert(fd >= 0); + + /* Rounds down to next block size */ + + if (sz <= 0 || sz == UINT64_MAX) + return -ERANGE; + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + if (is_fs_type(&sfs, EXT4_SUPER_MAGIC)) { + uint64_t u; + + if (sz < EXT4_MINIMAL_SIZE) + return -ERANGE; + + u = sz / sfs.f_bsize; + + if (ioctl(fd, EXT4_IOC_RESIZE_FS, &u) < 0) + return -errno; + + if (ret_size) + *ret_size = u * sfs.f_bsize; + + } else if (is_fs_type(&sfs, BTRFS_SUPER_MAGIC)) { + struct btrfs_ioctl_vol_args args = {}; + + /* 256M is the minimize size enforced by the btrfs kernel code when resizing (which is + * strange btw, as mkfs.btrfs is fine creating file systems > 109M). It will return EINVAL in + * that case, let's catch this error beforehand though, and report a more explanatory + * error. 
*/ + + if (sz < BTRFS_MINIMAL_SIZE) + return -ERANGE; + + sz -= sz % sfs.f_bsize; + + xsprintf(args.name, "%" PRIu64, sz); + + if (ioctl(fd, BTRFS_IOC_RESIZE, &args) < 0) + return -errno; + + if (ret_size) + *ret_size = sz; + + } else if (is_fs_type(&sfs, XFS_SB_MAGIC)) { + xfs_fsop_geom_t geo; + xfs_growfs_data_t d; + + if (sz < XFS_MINIMAL_SIZE) + return -ERANGE; + + if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) + return -errno; + + d = (xfs_growfs_data_t) { + .imaxpct = geo.imaxpct, + .newblocks = sz / geo.blocksize, + }; + + if (ioctl(fd, XFS_IOC_FSGROWFSDATA, &d) < 0) + return -errno; + + if (ret_size) + *ret_size = d.newblocks * geo.blocksize; + + } else + return -EOPNOTSUPP; + + return 0; +} + +uint64_t minimal_size_by_fs_magic(statfs_f_type_t magic) { + + switch (magic) { + + case (statfs_f_type_t) EXT4_SUPER_MAGIC: + return EXT4_MINIMAL_SIZE; + + case (statfs_f_type_t) XFS_SB_MAGIC: + return XFS_MINIMAL_SIZE; + + case (statfs_f_type_t) BTRFS_SUPER_MAGIC: + return BTRFS_MINIMAL_SIZE; + + default: + return UINT64_MAX; + } +} + +uint64_t minimal_size_by_fs_name(const char *name) { + + if (streq_ptr(name, "ext4")) + return EXT4_MINIMAL_SIZE; + + if (streq_ptr(name, "xfs")) + return XFS_MINIMAL_SIZE; + + if (streq_ptr(name, "btrfs")) + return BTRFS_MINIMAL_SIZE; + + return UINT64_MAX; +} + +/* Returns true for the only fs that can online shrink *and* grow */ +bool fs_can_online_shrink_and_grow(statfs_f_type_t magic) { + return magic == (statfs_f_type_t) BTRFS_SUPER_MAGIC; +} diff --git a/src/shared/resize-fs.h b/src/shared/resize-fs.h new file mode 100644 index 0000000..b40943c --- /dev/null +++ b/src/shared/resize-fs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "stat-util.h" + +int resize_fs(int fd, uint64_t sz, uint64_t *ret_size); + +#define BTRFS_MINIMAL_SIZE (256U*1024U*1024U) +#define XFS_MINIMAL_SIZE (300U*1024U*1024U) +#define EXT4_MINIMAL_SIZE (1024U*1024U) + +uint64_t 
minimal_size_by_fs_magic(statfs_f_type_t magic); +uint64_t minimal_size_by_fs_name(const char *str); + +bool fs_can_online_shrink_and_grow(statfs_f_type_t magic); diff --git a/src/shared/resolve-util.c b/src/shared/resolve-util.c new file mode 100644 index 0000000..820f9bb --- /dev/null +++ b/src/shared/resolve-util.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "resolve-util.h" +#include "string-table.h" + +DEFINE_CONFIG_PARSE_ENUM(config_parse_resolve_support, resolve_support, ResolveSupport, "Failed to parse resolve support setting"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_dnssec_mode, dnssec_mode, DnssecMode, "Failed to parse DNSSEC mode setting"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_dns_over_tls_mode, dns_over_tls_mode, DnsOverTlsMode, "Failed to parse DNS-over-TLS mode setting"); + +static const char* const resolve_support_table[_RESOLVE_SUPPORT_MAX] = { + [RESOLVE_SUPPORT_NO] = "no", + [RESOLVE_SUPPORT_YES] = "yes", + [RESOLVE_SUPPORT_RESOLVE] = "resolve", +}; +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(resolve_support, ResolveSupport, RESOLVE_SUPPORT_YES); + +static const char* const dnssec_mode_table[_DNSSEC_MODE_MAX] = { + [DNSSEC_NO] = "no", + [DNSSEC_ALLOW_DOWNGRADE] = "allow-downgrade", + [DNSSEC_YES] = "yes", +}; +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dnssec_mode, DnssecMode, DNSSEC_YES); + +static const char* const dns_over_tls_mode_table[_DNS_OVER_TLS_MODE_MAX] = { + [DNS_OVER_TLS_NO] = "no", + [DNS_OVER_TLS_OPPORTUNISTIC] = "opportunistic", + [DNS_OVER_TLS_YES] = "yes", +}; +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dns_over_tls_mode, DnsOverTlsMode, DNS_OVER_TLS_YES); + +bool dns_server_address_valid(int family, const union in_addr_union *sa) { + + /* Refuses the 0 IP addresses as well as 127.0.0.53/127.0.0.54 (which is our own DNS stub) */ + + if (!in_addr_is_set(family, sa)) + return false; + + if (family == AF_INET && IN_SET(be32toh(sa->in.s_addr), INADDR_DNS_STUB, 
INADDR_DNS_PROXY_STUB)) + return false; + + return true; +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_dns_cache_mode, dns_cache_mode, DnsCacheMode, "Failed to parse DNS cache mode setting") + +static const char* const dns_cache_mode_table[_DNS_CACHE_MODE_MAX] = { + [DNS_CACHE_MODE_YES] = "yes", + [DNS_CACHE_MODE_NO] = "no", + [DNS_CACHE_MODE_NO_NEGATIVE] = "no-negative", +}; +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dns_cache_mode, DnsCacheMode, DNS_CACHE_MODE_YES); diff --git a/src/shared/resolve-util.h b/src/shared/resolve-util.h new file mode 100644 index 0000000..2d210f9 --- /dev/null +++ b/src/shared/resolve-util.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "in-addr-util.h" +#include "macro.h" + +/* 127.0.0.53 in native endian (The IP address we listen on with the full DNS stub, i.e. that does LLMNR/mDNS, and stuff) */ +#define INADDR_DNS_STUB ((in_addr_t) 0x7f000035U) + +/* 127.0.0.54 in native endian (The IP address we listen on we only implement "proxy" mode) */ +#define INADDR_DNS_PROXY_STUB ((in_addr_t) 0x7f000036U) + +/* 127.0.0.2 is an address we always map to the local hostname. This is different from 127.0.0.1 which maps to "localhost" */ +#define INADDR_LOCALADDRESS ((in_addr_t) 0x7f000002U) + +typedef enum DnsCacheMode DnsCacheMode; + +enum DnsCacheMode { + DNS_CACHE_MODE_NO, + DNS_CACHE_MODE_YES, + DNS_CACHE_MODE_NO_NEGATIVE, + _DNS_CACHE_MODE_MAX, + _DNS_CACHE_MODE_INVALID = -EINVAL, +}; + +typedef enum ResolveSupport ResolveSupport; +typedef enum DnssecMode DnssecMode; +typedef enum DnsOverTlsMode DnsOverTlsMode; + +/* Do not change the order, see link_get_llmnr_support() or link_get_mdns_support(). 
/* Do not change the order, see link_get_llmnr_support() or link_get_mdns_support(). The numeric values
 * are spelled out so that the ordering those functions depend on is visible right here. */
enum ResolveSupport {
        RESOLVE_SUPPORT_NO      = 0,
        RESOLVE_SUPPORT_RESOLVE = 1,
        RESOLVE_SUPPORT_YES     = 2,
        _RESOLVE_SUPPORT_MAX,                   /* number of valid values */
        _RESOLVE_SUPPORT_INVALID = -EINVAL,
};

enum DnssecMode {
        /* No DNSSEC validation is done */
        DNSSEC_NO = 0,

        /* Validate locally, if the server knows DO, but if not,
         * don't. Don't trust the AD bit. If the server doesn't do
         * DNSSEC properly, downgrade to non-DNSSEC operation. Of
         * course, we then are vulnerable to a downgrade attack, but
         * that's life and what is configured. */
        DNSSEC_ALLOW_DOWNGRADE = 1,

        /* Insist on DNSSEC server support, and rather fail than downgrading. */
        DNSSEC_YES = 2,

        _DNSSEC_MODE_MAX,                       /* number of valid values */
        _DNSSEC_MODE_INVALID = -EINVAL,
};

enum DnsOverTlsMode {
        /* No connection is made for DNS-over-TLS */
        DNS_OVER_TLS_NO = 0,

        /* Try to connect using DNS-over-TLS, but if connection fails,
         * fall back to using an unencrypted connection */
        DNS_OVER_TLS_OPPORTUNISTIC = 1,

        /* Enforce DNS-over-TLS and require valid server certificates */
        DNS_OVER_TLS_YES = 2,

        _DNS_OVER_TLS_MODE_MAX,                 /* number of valid values */
        _DNS_OVER_TLS_MODE_INVALID = -EINVAL,
};
from uplink, i.e. the full uplink data */ +#define PRIVATE_UPLINK_RESOLV_CONF "/run/systemd/resolve/resolv.conf" + +/* A resolv.conf file containing the domain data we learnt from uplink, but our own DNS server address. */ +#define PRIVATE_STUB_RESOLV_CONF "/run/systemd/resolve/stub-resolv.conf" + +/* A static resolv.conf file containing no domains, but only our own DNS server address */ +#define PRIVATE_STATIC_RESOLV_CONF LIBEXECDIR "/resolv.conf" diff --git a/src/shared/rm-rf.c b/src/shared/rm-rf.c new file mode 100644 index 0000000..4664215 --- /dev/null +++ b/src/shared/rm-rf.c @@ -0,0 +1,519 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "btrfs-util.h" +#include "cgroup-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "string-util.h" + +/* We treat tmpfs/ramfs + cgroupfs as non-physical file systems. cgroupfs is similar to tmpfs in a way + * after all: we can create arbitrary directory hierarchies in it, and hence can also use rm_rf() on it + * to remove those again. */ +static bool is_physical_fs(const struct statfs *sfs) { + return !is_temporary_fs(sfs) && !is_cgroup_fs(sfs); +} + +static int patch_dirfd_mode( + int dfd, + bool refuse_already_set, + mode_t *ret_old_mode) { + + struct stat st; + int r; + + assert(dfd >= 0); + assert(ret_old_mode); + + if (fstat(dfd, &st) < 0) + return -errno; + if (!S_ISDIR(st.st_mode)) + return -ENOTDIR; + + if (FLAGS_SET(st.st_mode, 0700)) { /* Already set? 
*/ + if (refuse_already_set) + return -EACCES; /* original error: the owner bits are already set, so chmod() cannot help */ + + *ret_old_mode = st.st_mode; + return 0; + } + + if (st.st_uid != geteuid()) /* this only works if the UID matches ours */ + return -EACCES; + + r = fchmod_opath(dfd, (st.st_mode | 0700) & 07777); + if (r < 0) + return r; + + *ret_old_mode = st.st_mode; + return 1; +} + +int unlinkat_harder(int dfd, const char *filename, int unlink_flags, RemoveFlags remove_flags) { + mode_t old_mode; + int r; + + /* Like unlinkat(), but tries harder: if we get EACCES we'll try to set the r/w/x bits on the + * directory. This is useful if we run unprivileged and have some files where the w bit is + * missing. */ + + if (unlinkat(dfd, filename, unlink_flags) >= 0) + return 0; + if (errno != EACCES || !FLAGS_SET(remove_flags, REMOVE_CHMOD)) + return -errno; + + r = patch_dirfd_mode(dfd, /* refuse_already_set = */ true, &old_mode); + if (r < 0) + return r; + + if (unlinkat(dfd, filename, unlink_flags) < 0) { + r = -errno; + /* Try to restore the original access mode if this didn't work */ + (void) fchmod(dfd, old_mode & 07777); + return r; + } + + if (FLAGS_SET(remove_flags, REMOVE_CHMOD_RESTORE) && fchmod(dfd, old_mode & 07777) < 0) + return -errno; + + /* If this worked, we won't reset the old mode by default, since we'll need it for other entries too, + * and we should destroy the whole thing */ + return 0; +} + +int fstatat_harder(int dfd, + const char *filename, + struct stat *ret, + int fstatat_flags, + RemoveFlags remove_flags) { + + mode_t old_mode; + int r; + + /* Like unlinkat_harder() but does the same for fstatat() */ + + if (fstatat(dfd, filename, ret, fstatat_flags) >= 0) + return 0; + if (errno != EACCES || !FLAGS_SET(remove_flags, REMOVE_CHMOD)) + return -errno; + + r = patch_dirfd_mode(dfd, /* refuse_already_set = */ true, &old_mode); + if (r < 0) + return r; + + if (fstatat(dfd, filename, ret, fstatat_flags) < 0) { + r = -errno; + (void) fchmod(dfd, old_mode & 07777); + return r; + } + + if 
(FLAGS_SET(remove_flags, REMOVE_CHMOD_RESTORE) && fchmod(dfd, old_mode & 07777) < 0) + return -errno; + + return 0; +} + +static int openat_harder(int dfd, const char *path, int open_flags, RemoveFlags remove_flags, mode_t *ret_old_mode) { + _cleanup_close_ int pfd = -EBADF, fd = -EBADF; + bool chmod_done = false; + mode_t old_mode; + int r; + + assert(dfd >= 0 || dfd == AT_FDCWD); + assert(path); + + /* Unlike unlinkat_harder() and fstatat_harder(), this chmods the specified path itself, not the directory it is looked up in. */ + + if (FLAGS_SET(open_flags, O_PATH) || + !FLAGS_SET(open_flags, O_DIRECTORY) || + !FLAGS_SET(remove_flags, REMOVE_CHMOD)) { + + fd = RET_NERRNO(openat(dfd, path, open_flags)); + if (fd < 0) + return fd; + + if (ret_old_mode) { + struct stat st; + + if (fstat(fd, &st) < 0) + return -errno; + + *ret_old_mode = st.st_mode; + } + + return TAKE_FD(fd); + } + + pfd = RET_NERRNO(openat(dfd, path, (open_flags & (O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW)) | O_PATH)); + if (pfd < 0) + return pfd; + + if (FLAGS_SET(remove_flags, REMOVE_CHMOD)) { + r = patch_dirfd_mode(pfd, /* refuse_already_set = */ false, &old_mode); + if (r < 0) + return r; + + chmod_done = r; + } + + fd = fd_reopen(pfd, open_flags & ~O_NOFOLLOW); + if (fd < 0) { + if (chmod_done) + (void) fchmod_opath(pfd, old_mode & 07777); + return fd; + } + + if (ret_old_mode) + *ret_old_mode = old_mode; + + return TAKE_FD(fd); +} + +static int rm_rf_children_impl( + int fd, + RemoveFlags flags, + const struct stat *root_dev, + mode_t old_mode); + +static int rm_rf_inner_child( + int fd, + const char *fname, + int is_dir, + RemoveFlags flags, + const struct stat *root_dev, + bool allow_recursion) { + + struct stat st; + int r, q = 0; + + assert(fd >= 0); + assert(fname); + + if (is_dir < 0 || + root_dev || + (is_dir > 0 && (root_dev || (flags & REMOVE_SUBVOLUME)))) { + + r = fstatat_harder(fd, fname, &st, AT_SYMLINK_NOFOLLOW, flags); + if (r < 0) + return r; + + is_dir = S_ISDIR(st.st_mode); + } + + if (is_dir) { + /* If root_dev is set, remove 
subdirectories only if device is same */ + if (root_dev && st.st_dev != root_dev->st_dev) + return 0; + + /* Stop at mount points */ + r = fd_is_mount_point(fd, fname, 0); + if (r < 0) + return r; + if (r > 0) + return 0; + + if ((flags & REMOVE_SUBVOLUME) && btrfs_might_be_subvol(&st)) { + /* This could be a subvolume, try to remove it */ + + r = btrfs_subvol_remove_at(fd, fname, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); + if (r < 0) { + if (!IN_SET(r, -ENOTTY, -EINVAL)) + return r; + + /* ENOTTY, then it wasn't a btrfs subvolume, continue below. */ + } else + /* It was a subvolume, done. */ + return 1; + } + + if (!allow_recursion) + return -EISDIR; + + mode_t old_mode; + int subdir_fd = openat_harder(fd, fname, + O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME, + flags, &old_mode); + if (subdir_fd < 0) + return subdir_fd; + + /* We pass REMOVE_PHYSICAL here, to avoid doing the fstatfs() to check the file system type + * again for each directory */ + q = rm_rf_children_impl(subdir_fd, flags | REMOVE_PHYSICAL, root_dev, old_mode); + + } else if (flags & REMOVE_ONLY_DIRECTORIES) + return 0; + + r = unlinkat_harder(fd, fname, is_dir ? AT_REMOVEDIR : 0, flags); + if (r < 0) + return r; + if (q < 0) + return q; + return 1; +} + +typedef struct TodoEntry { + DIR *dir; /* A directory that we were operating on. */ + char *dirname; /* The filename of that directory itself. */ + mode_t old_mode; /* The original file mode. 
*/ +} TodoEntry; + +static void free_todo_entries(TodoEntry **todos) { + for (TodoEntry *x = *todos; x && x->dir; x++) { + closedir(x->dir); + free(x->dirname); + } + + freep(todos); +} + +int rm_rf_children( + int fd, + RemoveFlags flags, + const struct stat *root_dev) { + + struct stat st; + + assert(fd >= 0); + + if (fstat(fd, &st) < 0) + return -errno; + + return rm_rf_children_impl(fd, flags, root_dev, st.st_mode); +} + +static int rm_rf_children_impl( + int fd, + RemoveFlags flags, + const struct stat *root_dev, + mode_t old_mode) { + + _cleanup_(free_todo_entries) TodoEntry *todos = NULL; + size_t n_todo = 0; + _cleanup_free_ char *dirname = NULL; /* Set when we are recursing and want to delete ourselves */ + int ret = 0, r; + + /* Return the first error we run into, but nevertheless try to go on. + * The passed fd is closed in all cases, including on failure. */ + + for (;;) { /* This loop corresponds to the directory nesting level. */ + _cleanup_closedir_ DIR *d = NULL; + + if (n_todo > 0) { + /* We know that we are in recursion here, because n_todo is set. + * We need to remove the inner directory we were operating on. */ + assert(dirname); + r = unlinkat_harder(dirfd(todos[n_todo-1].dir), dirname, AT_REMOVEDIR, flags); + if (r < 0 && r != -ENOENT) { + if (ret == 0) + ret = r; + + if (FLAGS_SET(flags, REMOVE_CHMOD_RESTORE)) + (void) fchmodat(dirfd(todos[n_todo-1].dir), dirname, old_mode & 07777, 0); + } + dirname = mfree(dirname); + + /* And now let's back out one level up */ + n_todo --; + d = TAKE_PTR(todos[n_todo].dir); + dirname = TAKE_PTR(todos[n_todo].dirname); + old_mode = todos[n_todo].old_mode; + + assert(d); + fd = dirfd(d); /* Retrieve the file descriptor from the DIR object */ + assert(fd >= 0); + } else { + next_fd: + assert(fd >= 0); + d = fdopendir(fd); + if (!d) { + safe_close(fd); + return -errno; + } + fd = dirfd(d); /* We donated the fd to fdopendir(). 
Let's make sure we have + * the right descriptor even if it were to internally invalidate the + * one we passed. */ + + if (!(flags & REMOVE_PHYSICAL)) { + struct statfs sfs; + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + if (is_physical_fs(&sfs)) { + /* We refuse to clean physical file systems with this call, unless + * explicitly requested. This is extra paranoia just to be sure we + * never ever remove non-state data. */ + + _cleanup_free_ char *path = NULL; + + (void) fd_get_path(fd, &path); + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Attempted to remove disk file system under \"%s\", and we can't allow that.", + strna(path)); + } + } + } + + FOREACH_DIRENT_ALL(de, d, return -errno) { + int is_dir; + + if (dot_or_dot_dot(de->d_name)) + continue; + + is_dir = de->d_type == DT_UNKNOWN ? -1 : de->d_type == DT_DIR; + + r = rm_rf_inner_child(fd, de->d_name, is_dir, flags, root_dev, false); + if (r == -EISDIR) { + /* Push the current working state onto the todo list */ + + if (!GREEDY_REALLOC0(todos, n_todo + 2)) + return log_oom(); + + _cleanup_free_ char *newdirname = strdup(de->d_name); + if (!newdirname) + return log_oom(); + + mode_t mode; + int newfd = openat_harder(fd, de->d_name, + O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME, + flags, &mode); + if (newfd >= 0) { + todos[n_todo++] = (TodoEntry) { + .dir = TAKE_PTR(d), + .dirname = TAKE_PTR(dirname), + .old_mode = old_mode + }; + + fd = newfd; + dirname = TAKE_PTR(newdirname); + old_mode = mode; + + goto next_fd; + + } else if (newfd != -ENOENT && ret == 0) + ret = newfd; + + } else if (r < 0 && r != -ENOENT && ret == 0) + ret = r; + } + + if (FLAGS_SET(flags, REMOVE_SYNCFS) && syncfs(fd) < 0 && ret >= 0) + ret = -errno; + + if (n_todo == 0) { + if (FLAGS_SET(flags, REMOVE_CHMOD_RESTORE) && + fchmod(fd, old_mode & 07777) < 0 && ret >= 0) + ret = -errno; + + break; + } + } + + return ret; +} + +int rm_rf_at(int dir_fd, const char *path, RemoveFlags flags) { + mode_t 
old_mode; + int fd, r, q = 0; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + /* For now, don't support dropping subvols when also only dropping directories, since we can't do + * this race-freely. */ + if (FLAGS_SET(flags, REMOVE_ONLY_DIRECTORIES|REMOVE_SUBVOLUME)) + return -EINVAL; + + /* We refuse to clean the root file system with this call. This is extra paranoia to never cause a + * really seriously broken system. */ + if (path_is_root_at(dir_fd, path) > 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Attempted to remove entire root file system, and we can't allow that."); + + if (FLAGS_SET(flags, REMOVE_SUBVOLUME | REMOVE_ROOT | REMOVE_PHYSICAL)) { + /* Try to remove as subvolume first */ + r = btrfs_subvol_remove_at(dir_fd, path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); + if (r >= 0) + return r; + + if (FLAGS_SET(flags, REMOVE_MISSING_OK) && r == -ENOENT) + return 0; + + if (!IN_SET(r, -ENOTTY, -EINVAL, -ENOTDIR)) + return r; + + /* Not btrfs or not a subvolume */ + } + + fd = openat_harder(dir_fd, path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME, flags, &old_mode); + if (fd >= 0) { + /* We have a dir */ + r = rm_rf_children_impl(fd, flags, NULL, old_mode); + + if (FLAGS_SET(flags, REMOVE_ROOT)) + q = RET_NERRNO(unlinkat(dir_fd, path, AT_REMOVEDIR)); + } else { + r = fd; + if (FLAGS_SET(flags, REMOVE_MISSING_OK) && r == -ENOENT) + return 0; + + if (!IN_SET(r, -ENOTDIR, -ELOOP)) + return r; + + if (FLAGS_SET(flags, REMOVE_ONLY_DIRECTORIES) || !FLAGS_SET(flags, REMOVE_ROOT)) + return 0; + + if (!FLAGS_SET(flags, REMOVE_PHYSICAL)) { + struct statfs s; + + r = xstatfsat(dir_fd, path, &s); + if (r < 0) + return r; + if (is_physical_fs(&s)) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Attempted to remove files from a disk file system under \"%s\", refusing.", + path); + } + + r = 0; + q = RET_NERRNO(unlinkat(dir_fd, path, 0)); + } + + if (r < 0) + return r; + if (q < 0 && (q != -ENOENT || !FLAGS_SET(flags, 
REMOVE_MISSING_OK))) + return q; + return 0; +} + +int rm_rf_child(int fd, const char *name, RemoveFlags flags) { + + /* Removes one specific child of the specified directory */ + + if (fd < 0) + return -EBADF; + + if (!filename_is_valid(name)) + return -EINVAL; + + if ((flags & (REMOVE_ROOT|REMOVE_MISSING_OK)) != 0) /* Doesn't really make sense here, we are not supposed to remove 'fd' anyway */ + return -EINVAL; + + if (FLAGS_SET(flags, REMOVE_ONLY_DIRECTORIES|REMOVE_SUBVOLUME)) + return -EINVAL; + + return rm_rf_inner_child(fd, name, -1, flags, NULL, true); +} diff --git a/src/shared/rm-rf.h b/src/shared/rm-rf.h new file mode 100644 index 0000000..6e52bbb --- /dev/null +++ b/src/shared/rm-rf.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" + +typedef enum RemoveFlags { + REMOVE_ONLY_DIRECTORIES = 1 << 0, /* Only remove empty directories, no files */ + REMOVE_ROOT = 1 << 1, /* Remove the specified directory itself too, not just the contents of it */ + REMOVE_PHYSICAL = 1 << 2, /* If not set, only removes files on tmpfs, never physical file systems */ + REMOVE_SUBVOLUME = 1 << 3, /* Drop btrfs subvolumes in the tree too */ + REMOVE_MISSING_OK = 1 << 4, /* If the top-level directory is missing, ignore the ENOENT for it */ + REMOVE_CHMOD = 1 << 5, /* chmod() for write access if we cannot delete or access something */ + REMOVE_CHMOD_RESTORE = 1 << 6, /* Restore the old mode before returning */ + REMOVE_SYNCFS = 1 << 7, /* syncfs() the root of the specified directory after removing everything in it */ +} RemoveFlags; + +int unlinkat_harder(int dfd, const char *filename, int unlink_flags, RemoveFlags remove_flags); +int fstatat_harder(int dfd, + const char *filename, + struct stat *ret, + int fstatat_flags, + RemoveFlags remove_flags); + +/* Note: directory file descriptors passed to the functions below must be + * positioned at the beginning. 
If the fd was already used for reading, rewind it. */ +int rm_rf_children(int fd, RemoveFlags flags, const struct stat *root_dev); +int rm_rf_child(int fd, const char *name, RemoveFlags flags); +int rm_rf_at(int dir_fd, const char *path, RemoveFlags flags); +static inline int rm_rf(const char *path, RemoveFlags flags) { + return rm_rf_at(AT_FDCWD, path, flags); +} + +/* Useful for usage with _cleanup_(), destroys a directory and frees the pointer */ +static inline char *rm_rf_physical_and_free(char *p) { + PROTECT_ERRNO; + + if (!p) + return NULL; + + (void) rm_rf(p, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK|REMOVE_CHMOD); + return mfree(p); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, rm_rf_physical_and_free); + +/* Similar as above, but also has magic btrfs subvolume powers */ +static inline char *rm_rf_subvolume_and_free(char *p) { + PROTECT_ERRNO; + + if (!p) + return NULL; + + (void) rm_rf(p, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_MISSING_OK|REMOVE_CHMOD); + return mfree(p); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, rm_rf_subvolume_and_free); diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c new file mode 100644 index 0000000..00a8ced --- /dev/null +++ b/src/shared/seccomp-util.c @@ -0,0 +1,2499 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */ +#include "missing_syscall_def.h" +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "env-util.h" +#include "errno-list.h" +#include "macro.h" +#include "namespace-util.h" +#include "nsflags.h" +#include "nulstr-util.h" +#include "process-util.h" +#include "seccomp-util.h" +#include "set.h" +#include "string-util.h" +#include "strv.h" + +/* This array will be modified at runtime as seccomp_restrict_archs is called. 
*/ +uint32_t seccomp_local_archs[] = { + + /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */ + +#if defined(__x86_64__) && defined(__ILP32__) + SCMP_ARCH_X86, + SCMP_ARCH_X86_64, + SCMP_ARCH_X32, /* native */ +#elif defined(__x86_64__) && !defined(__ILP32__) + SCMP_ARCH_X86, + SCMP_ARCH_X32, + SCMP_ARCH_X86_64, /* native */ +#elif defined(__i386__) + SCMP_ARCH_X86, +#elif defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, /* native */ +#elif defined(__arm__) + SCMP_ARCH_ARM, +#elif defined(__loongarch_lp64) + SCMP_ARCH_LOONGARCH64, +#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32 + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32 + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64 + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL64N32, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPS64, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64 + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL64N32, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPSEL64, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32 + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPSEL64N32, + SCMP_ARCH_MIPS64N32, /* native */ +#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32 + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL64N32, /* native */ +#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64) + SCMP_ARCH_PARISC, + SCMP_ARCH_PARISC64, /* native */ +#elif 
defined(__hppa__) && defined(SCMP_ARCH_PARISC) + SCMP_ARCH_PARISC, +#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64LE, + SCMP_ARCH_PPC64, /* native */ +#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64, + SCMP_ARCH_PPC64LE, /* native */ +#elif defined(__powerpc__) + SCMP_ARCH_PPC, +#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64) + SCMP_ARCH_RISCV64, +#elif defined(__s390x__) + SCMP_ARCH_S390, + SCMP_ARCH_S390X, /* native */ +#elif defined(__s390__) + SCMP_ARCH_S390, +#endif + SECCOMP_LOCAL_ARCH_END + }; + +const char* seccomp_arch_to_string(uint32_t c) { + /* Maintain order used in seccomp.h. + * + * Names used here should be the same as those used for ConditionArchitecture=, + * except for "subarchitectures" like x32. */ + + switch (c) { + case SCMP_ARCH_NATIVE: + return "native"; + case SCMP_ARCH_X86: + return "x86"; + case SCMP_ARCH_X86_64: + return "x86-64"; + case SCMP_ARCH_X32: + return "x32"; + case SCMP_ARCH_ARM: + return "arm"; + case SCMP_ARCH_AARCH64: + return "arm64"; +#ifdef SCMP_ARCH_LOONGARCH64 + case SCMP_ARCH_LOONGARCH64: + return "loongarch64"; +#endif + case SCMP_ARCH_MIPS: + return "mips"; + case SCMP_ARCH_MIPS64: + return "mips64"; + case SCMP_ARCH_MIPS64N32: + return "mips64-n32"; + case SCMP_ARCH_MIPSEL: + return "mips-le"; + case SCMP_ARCH_MIPSEL64: + return "mips64-le"; + case SCMP_ARCH_MIPSEL64N32: + return "mips64-le-n32"; +#ifdef SCMP_ARCH_PARISC + case SCMP_ARCH_PARISC: + return "parisc"; +#endif +#ifdef SCMP_ARCH_PARISC64 + case SCMP_ARCH_PARISC64: + return "parisc64"; +#endif + case SCMP_ARCH_PPC: + return "ppc"; + case SCMP_ARCH_PPC64: + return "ppc64"; + case SCMP_ARCH_PPC64LE: + return "ppc64-le"; +#ifdef SCMP_ARCH_RISCV64 + case SCMP_ARCH_RISCV64: + return "riscv64"; +#endif + case SCMP_ARCH_S390: + return "s390"; + case SCMP_ARCH_S390X: + return "s390x"; + default: + return NULL; + } +} + +int 
seccomp_arch_from_string(const char *n, uint32_t *ret) { + if (!n) + return -EINVAL; + + assert(ret); + + if (streq(n, "native")) + *ret = SCMP_ARCH_NATIVE; + else if (streq(n, "x86")) + *ret = SCMP_ARCH_X86; + else if (streq(n, "x86-64")) + *ret = SCMP_ARCH_X86_64; + else if (streq(n, "x32")) + *ret = SCMP_ARCH_X32; + else if (streq(n, "arm")) + *ret = SCMP_ARCH_ARM; + else if (streq(n, "arm64")) + *ret = SCMP_ARCH_AARCH64; +#ifdef SCMP_ARCH_LOONGARCH64 + else if (streq(n, "loongarch64")) + *ret = SCMP_ARCH_LOONGARCH64; +#endif + else if (streq(n, "mips")) + *ret = SCMP_ARCH_MIPS; + else if (streq(n, "mips64")) + *ret = SCMP_ARCH_MIPS64; + else if (streq(n, "mips64-n32")) + *ret = SCMP_ARCH_MIPS64N32; + else if (streq(n, "mips-le")) + *ret = SCMP_ARCH_MIPSEL; + else if (streq(n, "mips64-le")) + *ret = SCMP_ARCH_MIPSEL64; + else if (streq(n, "mips64-le-n32")) + *ret = SCMP_ARCH_MIPSEL64N32; +#ifdef SCMP_ARCH_PARISC + else if (streq(n, "parisc")) + *ret = SCMP_ARCH_PARISC; +#endif +#ifdef SCMP_ARCH_PARISC64 + else if (streq(n, "parisc64")) + *ret = SCMP_ARCH_PARISC64; +#endif + else if (streq(n, "ppc")) + *ret = SCMP_ARCH_PPC; + else if (streq(n, "ppc64")) + *ret = SCMP_ARCH_PPC64; + else if (streq(n, "ppc64-le")) + *ret = SCMP_ARCH_PPC64LE; +#ifdef SCMP_ARCH_RISCV64 + else if (streq(n, "riscv64")) + *ret = SCMP_ARCH_RISCV64; +#endif + else if (streq(n, "s390")) + *ret = SCMP_ARCH_S390; + else if (streq(n, "s390x")) + *ret = SCMP_ARCH_S390X; + else + return -EINVAL; + + return 0; +} + +int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int r; + + /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting + * any others. Also, turns off the NNP fiddling. 
*/ + + seccomp = seccomp_init(default_action); + if (!seccomp) + return -ENOMEM; + + if (arch != SCMP_ARCH_NATIVE && + arch != seccomp_arch_native()) { + + r = seccomp_arch_remove(seccomp, seccomp_arch_native()); + if (r < 0) + return r; + + r = seccomp_arch_add(seccomp, arch); + if (r < 0) + return r; + + assert(seccomp_arch_exist(seccomp, arch) >= 0); + assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST); + assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST); + } else { + assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0); + assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0); + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + return r; + +#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4) + if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) { + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1); + if (r < 0) + log_debug_errno(r, "Failed to enable seccomp event logging: %m"); + } +#endif + + *ret = TAKE_PTR(seccomp); + return 0; +} + +static bool is_basic_seccomp_available(void) { + return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0; +} + +static bool is_seccomp_filter_available(void) { + return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 && + errno == EFAULT; +} + +bool is_seccomp_available(void) { + static int cached_enabled = -1; + + if (cached_enabled < 0) { + int b; + + b = getenv_bool_secure("SYSTEMD_SECCOMP"); + if (b != 0) { + if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */ + log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring."); + + cached_enabled = + is_basic_seccomp_available() && + is_seccomp_filter_available(); + } else + cached_enabled = false; + } + + return cached_enabled; +} + +const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_DEFAULT] = { + .name = "@default", + .help = "System 
calls that are always permitted", + .value = + "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */ + "brk\0" + "cacheflush\0" + "clock_getres\0" + "clock_getres_time64\0" + "clock_gettime\0" + "clock_gettime64\0" + "clock_nanosleep\0" + "clock_nanosleep_time64\0" + "execve\0" + "exit\0" + "exit_group\0" + "futex\0" + "futex_time64\0" + "futex_waitv\0" + "get_robust_list\0" + "get_thread_area\0" + "getegid\0" + "getegid32\0" + "geteuid\0" + "geteuid32\0" + "getgid\0" + "getgid32\0" + "getgroups\0" + "getgroups32\0" + "getpgid\0" + "getpgrp\0" + "getpid\0" + "getppid\0" + "getrandom\0" + "getresgid\0" + "getresgid32\0" + "getresuid\0" + "getresuid32\0" + "getrlimit\0" /* make sure processes can query stack size and such */ + "getsid\0" + "gettid\0" + "gettimeofday\0" + "getuid\0" + "getuid32\0" + "membarrier\0" + "mmap\0" + "mmap2\0" + "mprotect\0" + "munmap\0" + "nanosleep\0" + "pause\0" + "prlimit64\0" + "restart_syscall\0" + "riscv_flush_icache\0" + "riscv_hwprobe\0" + "rseq\0" + "rt_sigreturn\0" + "sched_getaffinity\0" + "sched_yield\0" + "set_robust_list\0" + "set_thread_area\0" + "set_tid_address\0" + "set_tls\0" + "sigreturn\0" + "time\0" + "ugetrlimit\0" + }, + [SYSCALL_FILTER_SET_AIO] = { + .name = "@aio", + .help = "Asynchronous IO", + .value = + "io_cancel\0" + "io_destroy\0" + "io_getevents\0" + "io_pgetevents\0" + "io_pgetevents_time64\0" + "io_setup\0" + "io_submit\0" + "io_uring_enter\0" + "io_uring_register\0" + "io_uring_setup\0" + }, + [SYSCALL_FILTER_SET_BASIC_IO] = { + .name = "@basic-io", + .help = "Basic IO", + .value = + "_llseek\0" + "close\0" + "close_range\0" + "dup\0" + "dup2\0" + "dup3\0" + "lseek\0" + "pread64\0" + "preadv\0" + "preadv2\0" + "pwrite64\0" + "pwritev\0" + "pwritev2\0" + "read\0" + "readv\0" + "write\0" + "writev\0" + }, + [SYSCALL_FILTER_SET_CHOWN] = { + .name = "@chown", + .help = "Change ownership of files and directories", + .value = + "chown\0" + "chown32\0" + "fchown\0" + "fchown32\0" + 
"fchownat\0" + "lchown\0" + "lchown32\0" + }, + [SYSCALL_FILTER_SET_CLOCK] = { + .name = "@clock", + .help = "Change the system time", + .value = + "adjtimex\0" + "clock_adjtime\0" + "clock_adjtime64\0" + "clock_settime\0" + "clock_settime64\0" + "settimeofday\0" + }, + [SYSCALL_FILTER_SET_CPU_EMULATION] = { + .name = "@cpu-emulation", + .help = "System calls for CPU emulation functionality", + .value = + "modify_ldt\0" + "subpage_prot\0" + "switch_endian\0" + "vm86\0" + "vm86old\0" + }, + [SYSCALL_FILTER_SET_DEBUG] = { + .name = "@debug", + .help = "Debugging, performance monitoring and tracing functionality", + .value = + "lookup_dcookie\0" + "perf_event_open\0" + "pidfd_getfd\0" + "ptrace\0" + "rtas\0" + "s390_runtime_instr\0" + "sys_debug_setcontext\0" + }, + [SYSCALL_FILTER_SET_FILE_SYSTEM] = { + .name = "@file-system", + .help = "File system operations", + .value = + "access\0" + "chdir\0" + "chmod\0" + "close\0" + "creat\0" + "faccessat\0" + "faccessat2\0" + "fallocate\0" + "fchdir\0" + "fchmod\0" + "fchmodat\0" + "fchmodat2\0" + "fcntl\0" + "fcntl64\0" + "fgetxattr\0" + "flistxattr\0" + "fremovexattr\0" + "fsetxattr\0" + "fstat\0" + "fstat64\0" + "fstatat64\0" + "fstatfs\0" + "fstatfs64\0" + "ftruncate\0" + "ftruncate64\0" + "futimesat\0" + "getcwd\0" + "getdents\0" + "getdents64\0" + "getxattr\0" + "inotify_add_watch\0" + "inotify_init\0" + "inotify_init1\0" + "inotify_rm_watch\0" + "lgetxattr\0" + "link\0" + "linkat\0" + "listxattr\0" + "llistxattr\0" + "lremovexattr\0" + "lsetxattr\0" + "lstat\0" + "lstat64\0" + "mkdir\0" + "mkdirat\0" + "mknod\0" + "mknodat\0" + "newfstatat\0" + "oldfstat\0" + "oldlstat\0" + "oldstat\0" + "open\0" + "openat\0" + "openat2\0" + "readlink\0" + "readlinkat\0" + "removexattr\0" + "rename\0" + "renameat\0" + "renameat2\0" + "rmdir\0" + "setxattr\0" + "stat\0" + "stat64\0" + "statfs\0" + "statfs64\0" + "statx\0" + "symlink\0" + "symlinkat\0" + "truncate\0" + "truncate64\0" + "unlink\0" + "unlinkat\0" + "utime\0" + 
"utimensat\0" + "utimensat_time64\0" + "utimes\0" + }, + [SYSCALL_FILTER_SET_IO_EVENT] = { + .name = "@io-event", + .help = "Event loop system calls", + .value = + "_newselect\0" + "epoll_create\0" + "epoll_create1\0" + "epoll_ctl\0" + "epoll_ctl_old\0" + "epoll_pwait\0" + "epoll_pwait2\0" + "epoll_wait\0" + "epoll_wait_old\0" + "eventfd\0" + "eventfd2\0" + "poll\0" + "ppoll\0" + "ppoll_time64\0" + "pselect6\0" + "pselect6_time64\0" + "select\0" + }, + [SYSCALL_FILTER_SET_IPC] = { + .name = "@ipc", + .help = "SysV IPC, POSIX Message Queues or other IPC", + .value = + "ipc\0" + "memfd_create\0" + "mq_getsetattr\0" + "mq_notify\0" + "mq_open\0" + "mq_timedreceive\0" + "mq_timedreceive_time64\0" + "mq_timedsend\0" + "mq_timedsend_time64\0" + "mq_unlink\0" + "msgctl\0" + "msgget\0" + "msgrcv\0" + "msgsnd\0" + "pipe\0" + "pipe2\0" + "process_madvise\0" + "process_vm_readv\0" + "process_vm_writev\0" + "semctl\0" + "semget\0" + "semop\0" + "semtimedop\0" + "semtimedop_time64\0" + "shmat\0" + "shmctl\0" + "shmdt\0" + "shmget\0" + }, + [SYSCALL_FILTER_SET_KEYRING] = { + .name = "@keyring", + .help = "Kernel keyring access", + .value = + "add_key\0" + "keyctl\0" + "request_key\0" + }, + [SYSCALL_FILTER_SET_MEMLOCK] = { + .name = "@memlock", + .help = "Memory locking control", + .value = + "mlock\0" + "mlock2\0" + "mlockall\0" + "munlock\0" + "munlockall\0" + }, + [SYSCALL_FILTER_SET_MODULE] = { + .name = "@module", + .help = "Loading and unloading of kernel modules", + .value = + "delete_module\0" + "finit_module\0" + "init_module\0" + }, + [SYSCALL_FILTER_SET_MOUNT] = { + .name = "@mount", + .help = "Mounting and unmounting of file systems", + .value = + "chroot\0" + "fsconfig\0" + "fsmount\0" + "fsopen\0" + "fspick\0" + "mount\0" + "mount_setattr\0" + "move_mount\0" + "open_tree\0" + "pivot_root\0" + "umount\0" + "umount2\0" + }, + [SYSCALL_FILTER_SET_NETWORK_IO] = { + .name = "@network-io", + .help = "Network or Unix socket IO, should not be needed if not network facing", 
+ .value = + "accept\0" + "accept4\0" + "bind\0" + "connect\0" + "getpeername\0" + "getsockname\0" + "getsockopt\0" + "listen\0" + "recv\0" + "recvfrom\0" + "recvmmsg\0" + "recvmmsg_time64\0" + "recvmsg\0" + "send\0" + "sendmmsg\0" + "sendmsg\0" + "sendto\0" + "setsockopt\0" + "shutdown\0" + "socket\0" + "socketcall\0" + "socketpair\0" + }, + [SYSCALL_FILTER_SET_OBSOLETE] = { + /* some unknown even to libseccomp */ + .name = "@obsolete", + .help = "Unusual, obsolete or unimplemented system calls", + .value = + "_sysctl\0" + "afs_syscall\0" + "bdflush\0" + "break\0" + "create_module\0" + "ftime\0" + "get_kernel_syms\0" + "getpmsg\0" + "gtty\0" + "idle\0" + "lock\0" + "mpx\0" + "prof\0" + "profil\0" + "putpmsg\0" + "query_module\0" + "security\0" + "sgetmask\0" + "ssetmask\0" + "stime\0" + "stty\0" + "sysfs\0" + "tuxcall\0" + "ulimit\0" + "uselib\0" + "ustat\0" + "vserver\0" + }, + [SYSCALL_FILTER_SET_PKEY] = { + .name = "@pkey", + .help = "System calls used for memory protection keys", + .value = + "pkey_alloc\0" + "pkey_free\0" + "pkey_mprotect\0" + }, + [SYSCALL_FILTER_SET_PRIVILEGED] = { + .name = "@privileged", + .help = "All system calls which need super-user capabilities", + .value = + "@chown\0" + "@clock\0" + "@module\0" + "@raw-io\0" + "@reboot\0" + "@swap\0" + "_sysctl\0" + "acct\0" + "bpf\0" + "capset\0" + "chroot\0" + "fanotify_init\0" + "fanotify_mark\0" + "nfsservctl\0" + "open_by_handle_at\0" + "pivot_root\0" + "quotactl\0" + "quotactl_fd\0" + "setdomainname\0" + "setfsuid\0" + "setfsuid32\0" + "setgroups\0" + "setgroups32\0" + "sethostname\0" + "setresuid\0" + "setresuid32\0" + "setreuid\0" + "setreuid32\0" + "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */ + "setuid32\0" + "vhangup\0" + }, + [SYSCALL_FILTER_SET_PROCESS] = { + .name = "@process", + .help = "Process control, execution, namespacing operations", + .value = + "capget\0" /* Able to query arbitrary processes */ + 
"clone\0" + /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't + * implement seccomp, so we don't need to list it at all. C.f. + * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */ + "clone3\0" + "execveat\0" + "fork\0" + "getrusage\0" + "kill\0" + "pidfd_open\0" + "pidfd_send_signal\0" + "prctl\0" + "rt_sigqueueinfo\0" + "rt_tgsigqueueinfo\0" + "setns\0" + "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */ + "tgkill\0" + "times\0" + "tkill\0" + "unshare\0" + "vfork\0" + "wait4\0" + "waitid\0" + "waitpid\0" + }, + [SYSCALL_FILTER_SET_RAW_IO] = { + .name = "@raw-io", + .help = "Raw I/O port access", + .value = + "ioperm\0" + "iopl\0" + "pciconfig_iobase\0" + "pciconfig_read\0" + "pciconfig_write\0" + "s390_pci_mmio_read\0" + "s390_pci_mmio_write\0" + }, + [SYSCALL_FILTER_SET_REBOOT] = { + .name = "@reboot", + .help = "Reboot and reboot preparation/kexec", + .value = + "kexec_file_load\0" + "kexec_load\0" + "reboot\0" + }, + [SYSCALL_FILTER_SET_RESOURCES] = { + .name = "@resources", + .help = "Alter resource settings", + .value = + "ioprio_set\0" + "mbind\0" + "migrate_pages\0" + "move_pages\0" + "nice\0" + "sched_setaffinity\0" + "sched_setattr\0" + "sched_setparam\0" + "sched_setscheduler\0" + "set_mempolicy\0" + "set_mempolicy_home_node\0" + "setpriority\0" + "setrlimit\0" + }, + [SYSCALL_FILTER_SET_SANDBOX] = { + .name = "@sandbox", + .help = "Sandbox functionality", + .value = + "landlock_add_rule\0" + "landlock_create_ruleset\0" + "landlock_restrict_self\0" + "seccomp\0" + }, + [SYSCALL_FILTER_SET_SETUID] = { + .name = "@setuid", + .help = "Operations for changing user/group credentials", + .value = + "setgid\0" + "setgid32\0" + "setgroups\0" + "setgroups32\0" + "setregid\0" + "setregid32\0" + "setresgid\0" + "setresgid32\0" + "setresuid\0" + "setresuid32\0" + "setreuid\0" + "setreuid32\0" + "setuid\0" + "setuid32\0" + }, + [SYSCALL_FILTER_SET_SIGNAL] = { + .name = 
"@signal", + .help = "Process signal handling", + .value = + "rt_sigaction\0" + "rt_sigpending\0" + "rt_sigprocmask\0" + "rt_sigsuspend\0" + "rt_sigtimedwait\0" + "rt_sigtimedwait_time64\0" + "sigaction\0" + "sigaltstack\0" + "signal\0" + "signalfd\0" + "signalfd4\0" + "sigpending\0" + "sigprocmask\0" + "sigsuspend\0" + }, + [SYSCALL_FILTER_SET_SWAP] = { + .name = "@swap", + .help = "Enable/disable swap devices", + .value = + "swapoff\0" + "swapon\0" + }, + [SYSCALL_FILTER_SET_SYNC] = { + .name = "@sync", + .help = "Synchronize files and memory to storage", + .value = + "fdatasync\0" + "fsync\0" + "msync\0" + "sync\0" + "sync_file_range\0" + "sync_file_range2\0" + "syncfs\0" + }, + [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = { + .name = "@system-service", + .help = "General system service operations", + .value = + "@aio\0" + "@basic-io\0" + "@chown\0" + "@default\0" + "@file-system\0" + "@io-event\0" + "@ipc\0" + "@keyring\0" + "@memlock\0" + "@network-io\0" + "@process\0" + "@resources\0" + "@setuid\0" + "@signal\0" + "@sync\0" + "@timer\0" + "arm_fadvise64_64\0" + "capget\0" + "capset\0" + "copy_file_range\0" + "fadvise64\0" + "fadvise64_64\0" + "flock\0" + "get_mempolicy\0" + "getcpu\0" + "getpriority\0" + "ioctl\0" + "ioprio_get\0" + "kcmp\0" + "madvise\0" + "mremap\0" + "name_to_handle_at\0" + "oldolduname\0" + "olduname\0" + "personality\0" + "readahead\0" + "readdir\0" + "remap_file_pages\0" + "sched_get_priority_max\0" + "sched_get_priority_min\0" + "sched_getattr\0" + "sched_getparam\0" + "sched_getscheduler\0" + "sched_rr_get_interval\0" + "sched_rr_get_interval_time64\0" + "sched_yield\0" + "sendfile\0" + "sendfile64\0" + "setfsgid\0" + "setfsgid32\0" + "setfsuid\0" + "setfsuid32\0" + "setpgid\0" + "setsid\0" + "splice\0" + "sysinfo\0" + "tee\0" + "umask\0" + "uname\0" + "userfaultfd\0" + "vmsplice\0" + }, + [SYSCALL_FILTER_SET_TIMER] = { + .name = "@timer", + .help = "Schedule operations by time", + .value = + "alarm\0" + "getitimer\0" + "setitimer\0" + 
"timer_create\0" + "timer_delete\0" + "timer_getoverrun\0" + "timer_gettime\0" + "timer_gettime64\0" + "timer_settime\0" + "timer_settime64\0" + "timerfd_create\0" + "timerfd_gettime\0" + "timerfd_gettime64\0" + "timerfd_settime\0" + "timerfd_settime64\0" + "times\0" + }, + [SYSCALL_FILTER_SET_KNOWN] = { + .name = "@known", + .help = "All known syscalls declared in the kernel", + .value = + "@obsolete\0" +#include "syscall-list.h" + }, +}; +/* Looks up a filter set by its "@"-prefixed name with a linear scan over syscall_filter_sets[]. Returns NULL if isempty(name) holds, if the name does not start with "@", or if no entry matches. */ +const SyscallFilterSet *syscall_filter_set_find(const char *name) { + if (isempty(name) || name[0] != '@') + return NULL; + + for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) + if (streq(syscall_filter_sets[i].name, name)) + return syscall_filter_sets + i; + + return NULL; +} +/* Forward declaration: add_syscall_filter_set() and seccomp_add_syscall_filter_item() call each other. */ +static int add_syscall_filter_set( + scmp_filter_ctx seccomp, + const SyscallFilterSet *set, + uint32_t action, + char **exclude, + bool log_missing, + char ***added); +/* Adds a rule with the given action for one entry. A name starting with "@" is expanded through add_syscall_filter_set(); anything else is resolved as a single system call name. Entries found in the exclude strv are skipped and 0 is returned for them. */ +int seccomp_add_syscall_filter_item( + scmp_filter_ctx *seccomp, + const char *name, + uint32_t action, + char **exclude, + bool log_missing, + char ***added) { + + assert(seccomp); + assert(name); + + if (strv_contains(exclude, name)) + return 0; + + /* Any syscalls that are handled are added to the *added strv. The pointer + * must be either NULL or point to a valid pre-initialized possibly-empty strv. 
*/ + + if (name[0] == '@') { + const SyscallFilterSet *other; + + other = syscall_filter_set_find(name); + if (!other) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Filter set %s is not known!", + name); + + return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added); + + } else { + int id, r; + /* A name that cannot be resolved to a syscall number is not an error: it is skipped, with a debug message only if log_missing is set. */ + id = seccomp_syscall_resolve_name(name); + if (id == __NR_SCMP_ERROR) { + if (log_missing) + log_debug("System call %s is not known, ignoring.", name); + return 0; + } + + r = seccomp_rule_add_exact(seccomp, action, id, 0); + if (r < 0) { + /* If the system call is not known on this architecture, then that's fine, let's ignore it */ + bool ignore = r == -EDOM; + + if (!ignore || log_missing) + log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m", + name, id, ignore ? ", ignoring" : ""); + if (!ignore) + return r; + } + + if (added) { + r = strv_extend(added, name); + if (r < 0) + return r; + } + + return 0; + } +} +/* Adds every entry of the NUL-separated list set->value through seccomp_add_syscall_filter_item(), returning the first negative result, or 0 once all entries were processed. */ +static int add_syscall_filter_set( + scmp_filter_ctx seccomp, + const SyscallFilterSet *set, + uint32_t action, + char **exclude, + bool log_missing, + char ***added) { + + int r; + + /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. 
*/ + + assert(seccomp); + assert(set); + + NULSTR_FOREACH(sys, set->value) { + r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added); + if (r < 0) + return r; + } + + return 0; +} +/* Maps the requested default action to the one actually installed as the filter default: SCMP_ACT_ALLOW (and SCMP_ACT_LOG where it is defined) are returned unchanged, any other value becomes SCMP_ACT_ERRNO(ENOSYS). */ +static uint32_t override_default_action(uint32_t default_action) { + /* When the requested filter is an allow-list, and the default action is something critical, we + * install ENOSYS as the default action, but it will only apply to syscalls which are not in the + * @known set. 
*/ + + if (default_action == SCMP_ACT_ALLOW) + return default_action; + +#ifdef SCMP_ACT_LOG + if (default_action == SCMP_ACT_LOG) + return default_action; +#endif + + return SCMP_ACT_ERRNO(ENOSYS); +} +/* Builds and loads one filter per local architecture. The entries of set get the given action. If the default action had to be overridden, every other resolvable name of the @known set gets an explicit rule with the originally requested default_action. Non-fatal seccomp_load() failures are only logged and the architecture is skipped. */ +int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) { + uint32_t arch, default_action_override; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for + * each local arch. */ + + default_action_override = override_default_action(default_action); + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + _cleanup_strv_free_ char **added = NULL; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, default_action_override); + if (r < 0) + return r; + /* The names of all syscalls handled here are collected in added, so the @known pass below can skip them. */ + r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, &added); + if (r < 0) + return log_debug_errno(r, "Failed to add filter set: %m"); + + if (default_action != default_action_override) + NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) { + int id; + + id = seccomp_syscall_resolve_name(name); + if (id < 0) + continue; + + /* Ignore the syscall if it was already handled above */ + if (strv_contains(added, name)) + continue; + + r = seccomp_rule_add_exact(seccomp, default_action, id, 0); + if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */ + return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m", + name, id); + } + +#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2 + /* We have a large filter here, so let's turn on the binary tree mode if possible. 
*/ + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2); + if (r < 0) + log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m"); +#endif + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Hashmap variant of the loader: keys are syscall id + 1, values select the per-syscall action. SECCOMP_ERROR_NUMBER_KILL kills the process; otherwise SCMP_ACT_LOG is used if that is the requested action; otherwise a non-negative value is returned as errno; any other value falls back to the action argument. */ +int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) { + uint32_t arch, default_action_override; + int r; + + /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead + * of a SyscallFilterSet* table. */ + + if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW) + return 0; + + default_action_override = override_default_action(default_action); + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + void *syscall_id, *val; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, default_action_override); + if (r < 0) + return r; + + HASHMAP_FOREACH_KEY(val, syscall_id, filter) { + uint32_t a = action; + int id = PTR_TO_INT(syscall_id) - 1; + int error = PTR_TO_INT(val); + + if (error == SECCOMP_ERROR_NUMBER_KILL) + a = scmp_act_kill_process(); +#ifdef SCMP_ACT_LOG + else if (action == SCMP_ACT_LOG) + a = SCMP_ACT_LOG; +#endif + else if (error >= 0) + a = SCMP_ACT_ERRNO(error); + + r = seccomp_rule_add_exact(seccomp, a, id, 0); + if (r < 0) { + /* If the system call is not known on this architecture, then that's + * fine, let's ignore it */ + _cleanup_free_ char *n = NULL; + bool ignore; + /* NOTE(review): the name is looked up for SCMP_ARCH_NATIVE rather than for arch; it is only used in the log message below. */ + n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id); + ignore = r == -EDOM; + if (!ignore || log_missing) + log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m", + strna(n), id, ignore ? 
", ignoring" : ""); + if (!ignore) + return r; + } + } + /* Same @known pass as in seccomp_load_syscall_filter_set(), except that already-handled syscalls are recognized by their key (id + 1) in the hashmap. */ + if (default_action != default_action_override) + NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) { + int id; + + id = seccomp_syscall_resolve_name(name); + if (id < 0) + continue; + + /* Ignore the syscall if it was already handled above */ + if (hashmap_contains(filter, INT_TO_PTR(id + 1))) + continue; + + r = seccomp_rule_add_exact(seccomp, default_action, id, 0); + if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */ + return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m", + name, id); + } + +#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2 + /* We have a large filter here, so let's turn on the binary tree mode if possible. */ + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2); + if (r < 0) + log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m"); +#endif + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Parses one syscall name or "@" group into filter (key: syscall id + 1, value: errno_num). Returns -EINVAL if errno_num is non-negative without SECCOMP_PARSE_INVERT, and for unknown names unless SECCOMP_PARSE_PERMISSIVE is set, in which case they are logged and ignored. */ +int seccomp_parse_syscall_filter( + const char *name, + int errno_num, + Hashmap *filter, + SeccompParseFlags flags, + const char *unit, + const char *filename, + unsigned line) { + + int r; + + assert(name); + assert(filter); + + if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0) + return -EINVAL; + + if (name[0] == '@') { + const SyscallFilterSet *set; + + set = syscall_filter_set_find(name); + if (!set) { + if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE)) + return -EINVAL; + + log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "Unknown system call group, ignoring: %s", name); + return 0; + } + + NULSTR_FOREACH(i, set->value) { + /* Call ourselves again, for the group to parse. 
Note that we downgrade logging here + * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table + * are our own problem, not a problem in user configuration data and we shouldn't + * pretend otherwise by complaining about them. */ + r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line); + if (r < 0) + return r; + } + } else { + int id; + + id = seccomp_syscall_resolve_name(name); + if (id == __NR_SCMP_ERROR) { + if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE)) + return -EINVAL; + + log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "System call %s is not known, ignoring.", name); + return 0; + } + + /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it + * from the list. The entries in allow-list with non-negative error value will be handled + * with SCMP_ACT_ERRNO() instead of the default action. */ + if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) || + (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) { + r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)); + if (r < 0) + switch (r) { + case -ENOMEM: + return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM; + case -EEXIST: /* key already present: overwrite the stored value with the new errno_num */ + assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0); + break; + default: + return r; + } + } else + (void) hashmap_remove(filter, INT_TO_PTR(id + 1)); + } + + return 0; +} +/* Blocks namespace operations for every namespace type whose clone flag is not set in retain: clone3() is made to fail with ENOSYS, and unshare(), clone() and setns() calls carrying such a flag fail with EPERM. Returns 0 without installing anything if all of NAMESPACE_FLAGS_ALL are retained. */ +int seccomp_restrict_namespaces(unsigned long retain) { + uint32_t arch; + int r; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *s = NULL; + + (void) namespace_flags_to_string(retain, &s); + log_debug("Restricting namespace to: %s.", strna(s)); + } + + /* NOOP? 
*/ + if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL)) + return 0; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + /* We cannot filter on individual flags to clone3(), and we need to disable the + * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other + * users shall fall back to clone(), as if on an older kernel. + * + * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330, + * https://github.com/moby/moby/issues/42680. */ + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(clone3), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", + seccomp_arch_to_string(arch)); + /* A failure to add the clone3() rule is only logged: r is overwritten by whichever setns() rule is added next. */ + if ((retain & NAMESPACE_FLAGS_ALL) == 0) + /* If every single kind of namespace shall be prohibited, then let's block the whole + * setns() syscall altogether. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 0); + else + /* Otherwise, block only the invocations with the appropriate flags in the loop + * below, but also the special invocation with a zero flags argument, right here. 
*/ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_EQ, 0)); + if (r < 0) { + log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + /* One unshare() rule and one clone() rule per namespace type that is not retained, plus a flag-specific setns() rule unless setns() is blocked entirely. Any failure breaks out of the loop with r negative. */ + for (unsigned i = 0; namespace_info[i].proc_name; i++) { + unsigned long f; + + f = namespace_info[i].clone_flag; + if (FLAGS_SET(retain, f)) { + log_debug("Permitting %s.", namespace_info[i].proc_name); + continue; + } + + log_trace("Blocking %s.", namespace_info[i].proc_name); + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(unshare), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + break; + } + + /* On s390/s390x the first two parameters to clone are switched */ + if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X)) + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(clone), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + else + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(clone), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + break; + } + + if ((retain & NAMESPACE_FLAGS_ALL) != 0) { + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + break; + } + } + } + if (r < 0) /* the loop above stopped on an error: do not load a partial filter for this architecture */ + continue; + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Makes the _sysctl() system call fail with EPERM on every local architecture that is not listed as lacking it. */ +int 
seccomp_protect_sysctl(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + if (IN_SET(arch, + SCMP_ARCH_AARCH64, +#ifdef SCMP_ARCH_LOONGARCH64 + SCMP_ARCH_LOONGARCH64, +#endif +#ifdef SCMP_ARCH_RISCV64 + SCMP_ARCH_RISCV64, +#endif + SCMP_ARCH_X32 + )) + /* No _sysctl syscall */ + continue; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(_sysctl), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Makes the syslog() system call fail with EPERM on every local architecture. Failing to add the rule or a non-fatal load failure only skips that architecture. */ +int seccomp_protect_syslog(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(syslog), + 0); + + if (r < 0) { + log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Restricts the address family (first argument) accepted by socket(). With allow_list set, only families contained in address_families pass; otherwise the contained families are refused. Refused calls fail with EAFNOSUPPORT. Architectures not known to avoid socketcall() are skipped. */ +int seccomp_restrict_address_families(Set *address_families, bool allow_list) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + bool 
supported; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + switch (arch) { + + case SCMP_ARCH_X86_64: + case SCMP_ARCH_X32: + case SCMP_ARCH_ARM: + case SCMP_ARCH_AARCH64: +#ifdef SCMP_ARCH_LOONGARCH64 + case SCMP_ARCH_LOONGARCH64: +#endif + case SCMP_ARCH_MIPSEL64N32: + case SCMP_ARCH_MIPS64N32: + case SCMP_ARCH_MIPSEL64: + case SCMP_ARCH_MIPS64: +#ifdef SCMP_ARCH_RISCV64 + case SCMP_ARCH_RISCV64: +#endif + /* These we know we support (i.e. are the ones that do not use socketcall()) */ + supported = true; + break; + + case SCMP_ARCH_S390: + case SCMP_ARCH_S390X: + case SCMP_ARCH_X86: + case SCMP_ARCH_MIPSEL: + case SCMP_ARCH_MIPS: +#ifdef SCMP_ARCH_PARISC + case SCMP_ARCH_PARISC: +#endif +#ifdef SCMP_ARCH_PARISC64 + case SCMP_ARCH_PARISC64: +#endif + case SCMP_ARCH_PPC: + case SCMP_ARCH_PPC64: + case SCMP_ARCH_PPC64LE: + default: + /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we + * don't know */ + supported = false; + break; + } + /* No filter at all is installed for unsupported architectures; socket() stays unrestricted there. */ + if (!supported) + continue; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + if (allow_list) { + int first = 0, last = 0; + void *afp; + + /* If this is an allow list, we first block the address families that are out of + * range and then everything that is not in the set. First, we find the lowest and + * highest address family in the set. 
*/ + + SET_FOREACH(afp, address_families) { + int af = PTR_TO_INT(afp); + + if (af <= 0 || af >= af_max()) + continue; + + if (first == 0 || af < first) + first = af; + + if (last == 0 || af > last) + last = af; + } + /* first and last are either both still 0 (no valid entry) or both set. */ + assert((first == 0) == (last == 0)); + + if (first == 0) { + + /* No entries in the valid range, block everything */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + } else { + + /* Block everything below the first entry */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_LT, first)); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + /* Block everything above the last entry */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_GT, last)); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + /* Block every family that is not in the set. Note that the loop runs over the whole range from 1 to af_max() - 1, so it also covers the gaps between the first and last entry. */ + for (int af = 1; af < af_max(); af++) { + + if (set_contains(address_families, INT_TO_PTR(af))) + continue; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_EQ, af)); + if (r < 0) + break; + } + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + } + + } else { + void *af; + + /* If this is a deny list, then generate one rule for each address family that are + * then combined in OR checks. 
*/ + + SET_FOREACH(af, address_families) { + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af))); + if (r < 0) + break; + } + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Restricts sched_setscheduler() to the policies listed in permitted_policies[]: a call whose policy argument (second argument) has any other value fails with error_code, which must be positive. */ +int seccomp_restrict_realtime_full(int error_code) { + static const int permitted_policies[] = { + SCHED_OTHER, + SCHED_BATCH, + SCHED_IDLE, + }; + + int r, max_policy = 0; + uint32_t arch; + unsigned i; + + assert(error_code > 0); + + /* Determine the highest policy constant we want to allow */ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] > max_policy) + max_policy = permitted_policies[i]; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int p; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + /* Go through all policies with lower values than that, and block them -- unless they appear in the + * allow list. */ + for (p = 0; p < max_policy; p++) { + bool good = false; + + /* Check if this is in the allow list. 
*/ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] == p) { + good = true; + break; + } + + if (good) + continue; + + /* Deny this policy */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(error_code), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_EQ, p)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; /* NOTE(review): this continues the policy loop, not the architecture loop, so the remaining rules are still added and the filter is still loaded */ + } + } + + /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons + * are unsigned here, hence no need to check for < 0 values. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(error_code), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_GT, max_policy)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Adds a rule that makes system call nr fail with EPERM, passing arg_cnt and arg on to seccomp_rule_add_exact(). On failure a debug message naming the syscall and architecture is logged. Returns the result of seccomp_rule_add_exact(). */ +static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp, + uint32_t arch, + int nr, + unsigned arg_cnt, + const struct scmp_arg_cmp arg) { + int r; + + r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg); + if (r < 0) { + _cleanup_free_ char *n = NULL; + + n = seccomp_syscall_resolve_num_arch(arch, nr); + log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m", + strna(n), + seccomp_arch_to_string(arch)); + } + + return r; +} + +/* For known architectures, check that syscalls are indeed defined or not. 
*/ +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64) +assert_cc(SCMP_SYS(shmget) > 0); +assert_cc(SCMP_SYS(shmat) > 0); +assert_cc(SCMP_SYS(shmdt) > 0); +#endif +/* Installs, per local architecture, rules that deny mappings which are both writable and executable: the filtered mmap variant with PROT_EXEC|PROT_WRITE, mprotect() and pkey_mprotect() with PROT_EXEC and, where a direct shmat() syscall is used, shmat() with SHM_EXEC. Returns the loaded counter or a negative error. */ +int seccomp_memory_deny_write_execute(void) { + uint32_t arch; + unsigned loaded = 0; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r; + + log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch)); + /* filter_syscall is filtered on its protection argument, block_syscall (if set) is refused unconditionally, shmat_syscall (if set) is filtered on SHM_EXEC. */ + switch (arch) { + + /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc(). + * We ignore that here, which means there's still a way to get writable/executable + * memory, if an IPC key is mapped like this. That's a pity, but no total loss. + * + * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress + * on that front (kernel work done in 5.18). + */ + + case SCMP_ARCH_X86: + case SCMP_ARCH_S390: + filter_syscall = SCMP_SYS(mmap2); + block_syscall = SCMP_SYS(mmap); + /* shmat multiplexed, see above */ + break; + + case SCMP_ARCH_PPC: + case SCMP_ARCH_PPC64: + case SCMP_ARCH_PPC64LE: + case SCMP_ARCH_S390X: + filter_syscall = SCMP_SYS(mmap); + /* shmat multiplexed, see above */ + break; + + case SCMP_ARCH_ARM: + filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */ + shmat_syscall = SCMP_SYS(shmat); + break; + + case SCMP_ARCH_X86_64: + case SCMP_ARCH_X32: + case SCMP_ARCH_AARCH64: +#ifdef SCMP_ARCH_LOONGARCH64 + case SCMP_ARCH_LOONGARCH64: +#endif +#ifdef SCMP_ARCH_RISCV64 + case SCMP_ARCH_RISCV64: +#endif + filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */ + shmat_syscall = SCMP_SYS(shmat); + break; + + /* Please add more definitions here, if you port systemd to other architectures! 
*/ + +#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64) +#warning "Consider adding the right mmap() syscall definitions here!" +#endif + } + + /* Can't filter mmap() on this arch, then skip it */ + if (filter_syscall == 0) + continue; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall, + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE)); + if (r < 0) + continue; + + if (block_syscall != 0) { + r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} ); + if (r < 0) + continue; + } + + r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); + if (r < 0) + continue; + + r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); + if (r < 0) + continue; + + if (shmat_syscall > 0) { + r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall, + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC)); + if (r < 0) + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + loaded++; /* NOTE(review): also incremented when seccomp_load() failed non-fatally just above */ + } + + if (loaded == 0) + log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=."); + + return loaded; +} +/* Restricts the system call architectures a process may use: every local architecture that is not contained in archs (stored as arch + 1) is marked as blocked in seccomp_local_archs[] and left out of the loaded filter. The native architecture is never blocked. */ +int seccomp_restrict_archs(Set *archs) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int r; + bool blocked_new = false; + + /* This installs a filter with no rules, but that restricts the system call 
architectures to the specified + * list. + * + * There are some qualifications. However the most important use is to stop processes from bypassing + * system call restrictions, in case they used a broader (multiplexing) syscall which is only available + * in a non-native architecture. There are no holes in this use case, at least so far. */ + + /* Note libseccomp includes our "native" (current) architecture in the filter by default. + * We do not remove it. For example, our callers expect to be able to call execve() afterwards + * to run a program with the restrictions applied. */ + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) { + uint32_t arch = seccomp_local_archs[i]; + + /* See above comment, our "native" architecture is never blocked. */ + if (arch == seccomp_arch_native()) + continue; + + /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */ + if (arch == SECCOMP_LOCAL_ARCH_BLOCKED) + continue; + /* Set members are stored as arch + 1, hence the offset in the lookup. */ + bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1)); + + /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since + * x32 syscalls should basically match x86-64 for everything except the pointer type. + * The important thing is that you can block the old 32-bit x86 syscalls. + * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */ + if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32) + block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1)); + + if (block) { + seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED; + blocked_new = true; + } else { + r = seccomp_arch_add(seccomp, arch); + if (r < 0 && r != -EEXIST) + return r; + } + } + + /* All architectures that will be blocked by the seccomp program were + * already blocked. 
*/ + if (!blocked_new) + return 0; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + return r; + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m"); + + return 0; +} + +int parse_syscall_archs(char **l, Set **ret_archs) { + _cleanup_set_free_ Set *archs = NULL; + int r; + + assert(l); + assert(ret_archs); + + STRV_FOREACH(s, l) { + uint32_t a; + + r = seccomp_arch_from_string(*s, &a); + if (r < 0) + return -EINVAL; + + r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1)); + if (r < 0) + return -ENOMEM; + } + + *ret_archs = TAKE_PTR(archs); + return 0; +} + +int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) { + int r; + + assert(set); + + NULSTR_FOREACH(i, set->value) { + + if (i[0] == '@') { + const SyscallFilterSet *more; + + more = syscall_filter_set_find(i); + if (!more) + return -ENXIO; + + r = seccomp_filter_set_add(filter, add, more); + if (r < 0) + return r; + } else { + int id; + + id = seccomp_syscall_resolve_name(i); + if (id == __NR_SCMP_ERROR) { + log_debug("System call %s is not known, ignoring.", i); + continue; + } + + if (add) { + r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1)); + if (r < 0) + return r; + } else + (void) hashmap_remove(filter, INT_TO_PTR(id + 1)); + } + } + + return 0; +} + +int seccomp_lock_personality(unsigned long personality) { + uint32_t arch; + int r; + + if (personality >= PERSONALITY_INVALID) + return -EINVAL; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(personality), + 1, + SCMP_A0(SCMP_CMP_NE, personality)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", + 
seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} +/* Installs, for every local architecture, a filter that makes sethostname() and setdomainname() fail with EPERM. Returns a negative errno if setting up or loading a filter fails fatally, and 0 otherwise; other failures are only logged and that architecture is skipped. */ +int seccomp_protect_hostname(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(sethostname), + 0); /* no argument conditions: every sethostname() call is refused */ + if (r < 0) { + log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setdomainname), + 0); /* likewise for setdomainname() */ + if (r < 0) { + log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} + +static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) { + /* Checks the mode_t parameter of the following system calls: + * + * → chmod() + fchmod() + fchmodat() + fchmodat2() + * → open() + creat() + openat() + * → mkdir() + mkdirat() + * → mknod() + mknodat() + * + * Returns error if *everything* failed, and 0 otherwise.
+ */ + int r; + bool any = false; /* becomes true once at least one rule was added */ + /* chmod(): the mode is argument 1. SCMP_CMP_MASKED_EQ with (m, m) matches when all bits of m are set in it. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(chmod), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for chmod: %m"); + else + any = true; + /* fchmod(): the mode is argument 1 as well. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(fchmod), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for fchmod: %m"); + else + any = true; + /* fchmodat(): the mode is argument 2. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(fchmodat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for fchmodat: %m"); + else + any = true; + /* fchmodat2(): filtered on argument 2 like fchmodat() if this libseccomp defines __SNR_fchmodat2. */ +#if defined(__SNR_fchmodat2) + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(fchmodat2), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); +#else + /* It looks like this libseccomp does not know about fchmodat2(). + * Pretend the fchmodat2() system call is not supported at all, + * regardless of the kernel version.
*/ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + __NR_fchmodat2, + 0); +#endif + if (r < 0) + log_debug_errno(r, "Failed to add filter for fchmodat2: %m"); + else + any = true; + /* mkdir(): the mode is argument 1. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mkdir), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mkdir: %m"); + else + any = true; + /* mkdirat(): the mode is argument 2. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mkdirat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mkdirat: %m"); + else + any = true; + /* mknod(): the mode is argument 1. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mknod), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mknod: %m"); + else + any = true; + /* mknodat(): the mode is argument 2. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mknodat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for mknodat: %m"); + else + any = true; + /* open(): two conditions that must both match - O_CREAT set in the flags (argument 1) and m set in the mode (argument 2). */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(open), + 2, + SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT), + SCMP_A2(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for open: %m"); + else + any = true; + /* openat(): same as open(), with flags and mode shifted to arguments 2 and 3. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(openat), + 2, + SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT), + SCMP_A3(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat: %m"); + else + any = true; + +#if defined(__SNR_openat2) + /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into + * an indirect structure. Let's block it entirely for now.
That should be a reasonably OK thing to do + * for now, since openat2() is very new and code generally needs fallback logic anyway to be + * compatible with kernels that are not absolutely recent. We would normally return EPERM for a + * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs + * to call open() or openat() instead. We can properly enforce policy for those functions. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(openat2), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat2: %m"); + else + any = true; +#endif + /* creat(): the mode is argument 1. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(creat), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, m, m)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for creat: %m"); + else + any = true; + /* If no rule at all could be added, r still holds the error of the last attempt (creat). */ + return any ? 0 : r; +} +/* For every local architecture, builds a filter with seccomp_restrict_sxid() for S_ISUID and for S_ISGID and loads it. Returns a negative errno on fatal init/load errors and 0 otherwise; other failures are only logged. */ +int seccomp_restrict_suid_sgid(void) { + uint32_t arch; + int r, k; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_restrict_sxid(seccomp, S_ISUID); + if (r < 0) + log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", + seccomp_arch_to_string(arch)); + + k = seccomp_restrict_sxid(seccomp, S_ISGID); + if (k < 0) + log_debug_errno(k, "Failed to add sgid rule for architecture %s, ignoring: %m", + seccomp_arch_to_string(arch)); + /* Neither bit got any rule, so there is nothing worth loading for this architecture. */ + if (r < 0 && k < 0) + continue; + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} + +uint32_t scmp_act_kill_process(void) { + + /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise.
We never + * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of + * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least + * for single-threaded apps does the right thing. */ + +#ifdef SCMP_ACT_KILL_PROCESS + if (seccomp_api_get() >= 3) /* NOTE(review): API level 3 is presumably the first with KILL_PROCESS support - confirm in seccomp_api_get(3) */ + return SCMP_ACT_KILL_PROCESS; +#endif + + return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */ +} +/* Splits in at the first colon into a syscall name and an errno/action part. On success returns 0, stores a newly allocated copy of the name in *name and the parsed value (or -1 if there is no colon) in *error; on failure neither output is written. */ +int parse_syscall_and_errno(const char *in, char **name, int *error) { + _cleanup_free_ char *n = NULL; + char *p; + int e = -1; + + assert(in); + assert(name); + assert(error); + + /* + * This parses "syscall:errno" strings such as "uname:EILSEQ" or "@sync:255". + * If the errno part is omitted, *error is set to -1. + * An empty syscall name is not allowed (-EINVAL). + * Whether the syscall name is valid is not checked here. + */ + + p = strchr(in, ':'); + if (p) { + e = seccomp_parse_errno_or_action(p + 1); + if (e < 0) + return e; + + n = strndup(in, p - in); + } else + n = strdup(in); + + if (!n) + return -ENOMEM; + + if (isempty(n)) + return -EINVAL; + + *error = e; + *name = TAKE_PTR(n); + + return 0; +} + +static int block_open_flag(scmp_filter_ctx seccomp, int flag) { + bool any = false; + int r; + + /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return + * EINVAL, in the hope the client code will retry without O_SYNC then. */ + /* open(): the flags are argument 1; the masked compare matches when all bits of flag are set. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EINVAL), + SCMP_SYS(open), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for open: %m"); + else + any = true; + /* openat(): the flags are argument 2. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EINVAL), + SCMP_SYS(openat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat: %m"); + else + any = true; + +#if defined(__SNR_openat2) + /* The new openat2() system call can't be filtered sensibly, see above.
*/ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(openat2), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat2: %m"); + else + any = true; +#endif + + return any ? 0 : r; +} + +int seccomp_suppress_sync(void) { + uint32_t arch; + int r; + + /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately + * manageable, and also masks O_SYNC/O_DSYNC */ + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) { + int id; + + id = seccomp_syscall_resolve_name(c); + if (id == __NR_SCMP_ERROR) { + log_debug("System call %s is not known, ignoring.", c); + continue; + } + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */ + id, + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c); + } + + (void) block_open_flag(seccomp, O_SYNC); +#if O_DSYNC != O_SYNC + (void) block_open_flag(seccomp, O_DSYNC); +#endif + + r = seccomp_load(seccomp); + if (ERRNO_IS_NEG_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", + seccomp_arch_to_string(arch)); + } + + return 0; +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h new file mode 100644 index 0000000..7583357 --- /dev/null +++ b/src/shared/seccomp-util.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_SECCOMP +#include +#endif +#include +#include + +#include "errno-list.h" +#include "errno-util.h" +#include "parse-util.h" +#include "set.h" +#include "string-util.h" + +#if HAVE_SECCOMP + +const char* seccomp_arch_to_string(uint32_t c); +int seccomp_arch_from_string(const char *n, uint32_t *ret); + 
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action); + +bool is_seccomp_available(void); +/* A named group of system calls. value is a NUL-separated list (walked with NULSTR_FOREACH()) holding syscall names and, prefixed with @, names of further sets. */ +typedef struct SyscallFilterSet { + const char *name; + const char *help; + const char *value; +} SyscallFilterSet; +/* Indexes into syscall_filter_sets[]. */ +enum { + /* Please leave DEFAULT first and KNOWN last, but sort the rest alphabetically */ + SYSCALL_FILTER_SET_DEFAULT, + SYSCALL_FILTER_SET_AIO, + SYSCALL_FILTER_SET_BASIC_IO, + SYSCALL_FILTER_SET_CHOWN, + SYSCALL_FILTER_SET_CLOCK, + SYSCALL_FILTER_SET_CPU_EMULATION, + SYSCALL_FILTER_SET_DEBUG, + SYSCALL_FILTER_SET_FILE_SYSTEM, + SYSCALL_FILTER_SET_IO_EVENT, + SYSCALL_FILTER_SET_IPC, + SYSCALL_FILTER_SET_KEYRING, + SYSCALL_FILTER_SET_MEMLOCK, + SYSCALL_FILTER_SET_MODULE, + SYSCALL_FILTER_SET_MOUNT, + SYSCALL_FILTER_SET_NETWORK_IO, + SYSCALL_FILTER_SET_OBSOLETE, + SYSCALL_FILTER_SET_PKEY, + SYSCALL_FILTER_SET_PRIVILEGED, + SYSCALL_FILTER_SET_PROCESS, + SYSCALL_FILTER_SET_RAW_IO, + SYSCALL_FILTER_SET_REBOOT, + SYSCALL_FILTER_SET_RESOURCES, + SYSCALL_FILTER_SET_SANDBOX, + SYSCALL_FILTER_SET_SETUID, + SYSCALL_FILTER_SET_SIGNAL, + SYSCALL_FILTER_SET_SWAP, + SYSCALL_FILTER_SET_SYNC, + SYSCALL_FILTER_SET_SYSTEM_SERVICE, + SYSCALL_FILTER_SET_TIMER, + SYSCALL_FILTER_SET_KNOWN, + _SYSCALL_FILTER_SET_MAX, +}; +/* Compile-time checks for the DEFAULT-first/KNOWN-last rule stated in the enum. */ +assert_cc(SYSCALL_FILTER_SET_DEFAULT == 0); +assert_cc(SYSCALL_FILTER_SET_KNOWN == _SYSCALL_FILTER_SET_MAX-1); + +extern const SyscallFilterSet syscall_filter_sets[]; + +const SyscallFilterSet *syscall_filter_set_find(const char *name); +/* Adds (b is true) or removes (b is false) the system calls of set, including nested sets, to/from the hashmap s. */ +int seccomp_filter_set_add(Hashmap *s, bool b, const SyscallFilterSet *set); + +int seccomp_add_syscall_filter_item( + scmp_filter_ctx *ctx, + const char *name, + uint32_t action, + char **exclude, + bool log_missing, + char ***added); + +int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing); +int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing); + +typedef
enum SeccompParseFlags { + SECCOMP_PARSE_INVERT = 1 << 0, + SECCOMP_PARSE_ALLOW_LIST = 1 << 1, + SECCOMP_PARSE_LOG = 1 << 2, + SECCOMP_PARSE_PERMISSIVE = 1 << 3, +} SeccompParseFlags; + +int seccomp_parse_syscall_filter( + const char *name, + int errno_num, + Hashmap *filter, + SeccompParseFlags flags, + const char *unit, + const char *filename, unsigned line); + +int seccomp_restrict_archs(Set *archs); +int seccomp_restrict_namespaces(unsigned long retain); +int seccomp_protect_sysctl(void); +int seccomp_protect_syslog(void); +int seccomp_restrict_address_families(Set *address_families, bool allow_list); +int seccomp_restrict_realtime_full(int error_code); /* This is mostly for testing code. */ +static inline int seccomp_restrict_realtime(void) { + return seccomp_restrict_realtime_full(EPERM); +} +int seccomp_memory_deny_write_execute(void); +int seccomp_lock_personality(unsigned long personality); +int seccomp_protect_hostname(void); +int seccomp_restrict_suid_sgid(void); + +extern uint32_t seccomp_local_archs[]; + +#define SECCOMP_LOCAL_ARCH_END UINT32_MAX + +/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would + * never be in the seccomp_local_archs array anyway so we can use it as a + * marker. 
*/ +#define SECCOMP_LOCAL_ARCH_BLOCKED 0 + +#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \ + for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \ + (arch) != SECCOMP_LOCAL_ARCH_END; \ + (arch) = seccomp_local_archs[++_i]) \ + if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED) + +/* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1 + * ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant + * EFAULT: addresses passed as args (by libseccomp) are invalid */ +static inline bool ERRNO_IS_NEG_SECCOMP_FATAL(intmax_t r) { + return IN_SET(r, + -EPERM, + -EACCES, + -ENOMEM, + -EFAULT); +} +_DEFINE_ABS_WRAPPER(SECCOMP_FATAL); + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(scmp_filter_ctx, seccomp_release, NULL); + +int parse_syscall_archs(char **l, Set **ret_archs); + +uint32_t scmp_act_kill_process(void); + +int parse_syscall_and_errno(const char *in, char **name, int *error); + +int seccomp_suppress_sync(void); + +#else + +static inline bool is_seccomp_available(void) { + return false; +} + +#endif + +/* This is a special value to be used where syscall filters otherwise expect errno numbers, will be + replaced with real seccomp action. 
*/ +enum { + SECCOMP_ERROR_NUMBER_KILL = INT_MAX - 1, +}; + +static inline bool seccomp_errno_or_action_is_valid(int n) { + return n == SECCOMP_ERROR_NUMBER_KILL || errno_is_valid(n); +} + +static inline int seccomp_parse_errno_or_action(const char *p) { + if (streq_ptr(p, "kill")) + return SECCOMP_ERROR_NUMBER_KILL; + return parse_errno(p); +} + +static inline const char *seccomp_errno_or_action_to_string(int num) { + if (num == SECCOMP_ERROR_NUMBER_KILL) + return "kill"; + return errno_to_name(num); +} diff --git a/src/shared/securebits-util.c b/src/shared/securebits-util.c new file mode 100644 index 0000000..c867807 --- /dev/null +++ b/src/shared/securebits-util.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "extract-word.h" +#include "securebits-util.h" +#include "string-util.h" + +int secure_bits_to_string_alloc(int i, char **s) { + _cleanup_free_ char *str = NULL; + size_t len; + int r; + + assert(s); + + r = asprintf(&str, "%s%s%s%s%s%s", + (i & (1 << SECURE_KEEP_CAPS)) ? "keep-caps " : "", + (i & (1 << SECURE_KEEP_CAPS_LOCKED)) ? "keep-caps-locked " : "", + (i & (1 << SECURE_NO_SETUID_FIXUP)) ? "no-setuid-fixup " : "", + (i & (1 << SECURE_NO_SETUID_FIXUP_LOCKED)) ? "no-setuid-fixup-locked " : "", + (i & (1 << SECURE_NOROOT)) ? "noroot " : "", + (i & (1 << SECURE_NOROOT_LOCKED)) ? 
"noroot-locked " : ""); + if (r < 0) + return -ENOMEM; + + len = strlen(str); + if (len != 0) + str[len - 1] = '\0'; + + *s = TAKE_PTR(str); + + return 0; +} + +int secure_bits_from_string(const char *s) { + int secure_bits = 0; + const char *p; + int r; + + for (p = s;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE); + if (r == -ENOMEM) + return r; + if (r <= 0) + break; + + if (streq(word, "keep-caps")) + secure_bits |= 1 << SECURE_KEEP_CAPS; + else if (streq(word, "keep-caps-locked")) + secure_bits |= 1 << SECURE_KEEP_CAPS_LOCKED; + else if (streq(word, "no-setuid-fixup")) + secure_bits |= 1 << SECURE_NO_SETUID_FIXUP; + else if (streq(word, "no-setuid-fixup-locked")) + secure_bits |= 1 << SECURE_NO_SETUID_FIXUP_LOCKED; + else if (streq(word, "noroot")) + secure_bits |= 1 << SECURE_NOROOT; + else if (streq(word, "noroot-locked")) + secure_bits |= 1 << SECURE_NOROOT_LOCKED; + } + + return secure_bits; +} diff --git a/src/shared/securebits-util.h b/src/shared/securebits-util.h new file mode 100644 index 0000000..caf8e6d --- /dev/null +++ b/src/shared/securebits-util.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "missing_securebits.h" + +int secure_bits_to_string_alloc(int i, char **s); +int secure_bits_from_string(const char *s); + +static inline bool secure_bits_is_valid(int i) { + return ((SECURE_ALL_BITS | SECURE_ALL_LOCKS) & i) == i; +} + +static inline int secure_bits_to_string_alloc_with_check(int n, char **s) { + if (!secure_bits_is_valid(n)) + return -EINVAL; + + return secure_bits_to_string_alloc(n, s); +} diff --git a/src/shared/selinux-util.c b/src/shared/selinux-util.c new file mode 100644 index 0000000..2fef29c --- /dev/null +++ b/src/shared/selinux-util.c @@ -0,0 +1,762 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_SELINUX 
+#include +#include +#include +#include +#endif + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "label.h" +#include "log.h" +#include "macro.h" +#include "mallinfo-util.h" +#include "path-util.h" +#include "selinux-util.h" +#include "stdio-util.h" +#include "time-util.h" + +#if HAVE_SELINUX +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(context_t, context_free, NULL); +#define _cleanup_context_free_ _cleanup_(context_freep) + +typedef enum Initialized { + UNINITIALIZED, + INITIALIZED, + LAZY_INITIALIZED, +} Initialized; + +static int mac_selinux_reload(int seqno); + +static int cached_use = -1; +static Initialized initialized = UNINITIALIZED; +static int last_policyload = 0; +static struct selabel_handle *label_hnd = NULL; +static bool have_status_page = false; + +#define log_enforcing(...) \ + log_full(mac_selinux_enforcing() ? LOG_ERR : LOG_WARNING, __VA_ARGS__) + +#define log_enforcing_errno(error, ...) \ + ({ \ + bool _enforcing = mac_selinux_enforcing(); \ + int _level = _enforcing ? LOG_ERR : LOG_WARNING; \ + int _e = (error); \ + \ + int _r = (log_get_max_level() >= LOG_PRI(_level)) \ + ? log_internal(_level, _e, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__) \ + : -ERRNO_VALUE(_e); \ + _enforcing ? 
_r : 0; \ + }) + +static int mac_selinux_label_pre(int dir_fd, const char *path, mode_t mode) { + return mac_selinux_create_file_prepare_at(dir_fd, path, mode); +} + +static int mac_selinux_label_post(int dir_fd, const char *path) { + mac_selinux_create_file_clear(); + return 0; +} +#endif + +bool mac_selinux_use(void) { +#if HAVE_SELINUX + if (_unlikely_(cached_use < 0)) { + cached_use = is_selinux_enabled() > 0; + log_trace("SELinux enabled state cached to: %s", enabled_disabled(cached_use)); + } + + return cached_use; +#else + return false; +#endif +} + +bool mac_selinux_enforcing(void) { + int r = 0; +#if HAVE_SELINUX + + /* If the SELinux status page has been successfully opened, retrieve the enforcing + * status over it to avoid system calls in security_getenforce(). */ + + if (have_status_page) + r = selinux_status_getenforce(); + else + r = security_getenforce(); + +#endif + return r != 0; +} + +void mac_selinux_retest(void) { +#if HAVE_SELINUX + cached_use = -1; +#endif +} + +#if HAVE_SELINUX +static int open_label_db(void) { + struct selabel_handle *hnd; + /* Avoid maybe-uninitialized false positives */ + usec_t before_timestamp = USEC_INFINITY, after_timestamp = USEC_INFINITY; +# if HAVE_GENERIC_MALLINFO + generic_mallinfo before_mallinfo = {}; +# endif + + if (DEBUG_LOGGING) { +# if HAVE_GENERIC_MALLINFO + before_mallinfo = generic_mallinfo_get(); +# endif + before_timestamp = now(CLOCK_MONOTONIC); + } + + hnd = selabel_open(SELABEL_CTX_FILE, NULL, 0); + if (!hnd) + return log_enforcing_errno(errno, "Failed to initialize SELinux labeling handle: %m"); + + if (DEBUG_LOGGING) { + after_timestamp = now(CLOCK_MONOTONIC); +# if HAVE_GENERIC_MALLINFO + generic_mallinfo after_mallinfo = generic_mallinfo_get(); + size_t l = LESS_BY((size_t) after_mallinfo.uordblks, (size_t) before_mallinfo.uordblks); + log_debug("Successfully loaded SELinux database in %s, size on heap is %zuK.", + FORMAT_TIMESPAN(after_timestamp - before_timestamp, 0), + DIV_ROUND_UP(l, 
1024)); +# else + log_debug("Successfully loaded SELinux database in %s.", + FORMAT_TIMESPAN(after_timestamp - before_timestamp, 0)); +# endif + } + + /* release memory after measurement */ + if (label_hnd) + selabel_close(label_hnd); + label_hnd = TAKE_PTR(hnd); + + return 0; +} +#endif + +static int selinux_init(bool force) { +#if HAVE_SELINUX + static const LabelOps label_ops = { + .pre = mac_selinux_label_pre, + .post = mac_selinux_label_post, + }; + int r; + + if (!mac_selinux_use()) + return 0; + + if (initialized == INITIALIZED) + return 1; + + /* Internal call from this module? Unless we were explicitly configured to allow lazy initialization + * bail out immediately. Pretend all is good, we do not want callers to abort here, for example at + * early boot when the policy is being initialised. */ + if (!force && initialized != LAZY_INITIALIZED) + return 1; + + r = selinux_status_open(/* netlink fallback */ 1); + if (r < 0) { + if (!ERRNO_IS_PRIVILEGE(errno)) + return log_enforcing_errno(errno, "Failed to open SELinux status page: %m"); + log_warning_errno(errno, "selinux_status_open() with netlink fallback failed, not checking for policy reloads: %m"); + } else if (r == 1) + log_warning("selinux_status_open() failed to open the status page, using the netlink fallback."); + else + have_status_page = true; + + r = open_label_db(); + if (r < 0) { + selinux_status_close(); + return r; + } + + r = label_ops_set(&label_ops); + if (r < 0) + return r; + + /* Save the current policyload sequence number, so mac_selinux_maybe_reload() does not trigger on + * first call without any actual change. 
*/ + last_policyload = selinux_status_policyload(); + + initialized = INITIALIZED; + return 1; +#else + return 0; +#endif +} + +int mac_selinux_init(void) { + return selinux_init(/* force= */ true); +} + +int mac_selinux_init_lazy(void) { +#if HAVE_SELINUX + if (initialized == UNINITIALIZED) + initialized = LAZY_INITIALIZED; /* We'll be back later */ +#endif + + return 0; +} + +void mac_selinux_maybe_reload(void) { +#if HAVE_SELINUX + int policyload; + + if (!initialized) + return; + + /* Do not use selinux_status_updated(3), cause since libselinux 3.2 selinux_check_access(3), + * called in core and user instances, does also use it under the hood. + * That can cause changes to be consumed by selinux_check_access(3) and not being visible here. + * Also do not use selinux callbacks, selinux_set_callback(3), cause they are only automatically + * invoked since libselinux 3.2 by selinux_status_updated(3). + * Relevant libselinux commit: https://github.com/SELinuxProject/selinux/commit/05bdc03130d741e53e1fb45a958d0a2c184be503 + * Debian Bullseye is going to ship libselinux 3.1, so stay compatible for backports. 
*/ + policyload = selinux_status_policyload(); + if (policyload < 0) { + log_debug_errno(errno, "Failed to get SELinux policyload from status page: %m"); + return; + } + + if (policyload != last_policyload) { + mac_selinux_reload(policyload); + last_policyload = policyload; + } +#endif +} + +void mac_selinux_finish(void) { + +#if HAVE_SELINUX + if (label_hnd) { + selabel_close(label_hnd); + label_hnd = NULL; + } + + selinux_status_close(); + have_status_page = false; + + initialized = false; +#endif +} + +#if HAVE_SELINUX +static int mac_selinux_reload(int seqno) { + log_debug("SELinux reload %d", seqno); + + (void) open_label_db(); + + return 0; +} +#endif + +#if HAVE_SELINUX +static int selinux_fix_fd( + int fd, + const char *label_path, + LabelFixFlags flags) { + + _cleanup_freecon_ char* fcon = NULL; + struct stat st; + int r; + + assert(fd >= 0); + assert(label_path); + assert(path_is_absolute(label_path)); + + if (fstat(fd, &st) < 0) + return -errno; + + /* Check for policy reload so 'label_hnd' is kept up-to-date by callbacks */ + mac_selinux_maybe_reload(); + if (!label_hnd) + return 0; + + if (selabel_lookup_raw(label_hnd, &fcon, label_path, st.st_mode) < 0) { + /* If there's no label to set, then exit without warning */ + if (errno == ENOENT) + return 0; + + return log_enforcing_errno(errno, "Unable to lookup intended SELinux security context of %s: %m", label_path); + } + + if (setfilecon_raw(FORMAT_PROC_FD_PATH(fd), fcon) < 0) { + _cleanup_freecon_ char *oldcon = NULL; + + r = -errno; + + /* If the FS doesn't support labels, then exit without warning */ + if (ERRNO_IS_NOT_SUPPORTED(r)) + return 0; + + /* It the FS is read-only and we were told to ignore failures caused by that, suppress error */ + if (r == -EROFS && (flags & LABEL_IGNORE_EROFS)) + return 0; + + /* If the old label is identical to the new one, suppress any kind of error */ + if (getfilecon_raw(FORMAT_PROC_FD_PATH(fd), &oldcon) >= 0 && streq_ptr(fcon, oldcon)) + return 0; + + return 
log_enforcing_errno(r, "Unable to fix SELinux security context of %s: %m", label_path); + } + + return 0; +} +#endif + +int mac_selinux_fix_full( + int atfd, + const char *inode_path, + const char *label_path, + LabelFixFlags flags) { + + assert(atfd >= 0 || atfd == AT_FDCWD); + assert(atfd >= 0 || inode_path); + +#if HAVE_SELINUX + _cleanup_close_ int opened_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int inode_fd, r; + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + if (!label_hnd) + return 0; + + if (inode_path) { + opened_fd = openat(atfd, inode_path, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (opened_fd < 0) { + if ((flags & LABEL_IGNORE_ENOENT) && errno == ENOENT) + return 0; + + return -errno; + } + + inode_fd = opened_fd; + } else + inode_fd = atfd; + + if (!label_path) { + if (path_is_absolute(inode_path)) + label_path = inode_path; + else { + r = fd_get_path(inode_fd, &p); + if (r < 0) + return r; + + label_path = p; + } + } + + return selinux_fix_fd(inode_fd, label_path, flags); +#else + return 0; +#endif +} + +int mac_selinux_apply(const char *path, const char *label) { + + assert(path); + +#if HAVE_SELINUX + int r; + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + assert(label); + + if (setfilecon(path, label) < 0) + return log_enforcing_errno(errno, "Failed to set SELinux security context %s on path %s: %m", label, path); +#endif + return 0; +} + +int mac_selinux_apply_fd(int fd, const char *path, const char *label) { + + assert(fd >= 0); + +#if HAVE_SELINUX + int r; + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + assert(label); + + if (setfilecon(FORMAT_PROC_FD_PATH(fd), label) < 0) + return log_enforcing_errno(errno, "Failed to set SELinux security context %s on path %s: %m", label, strna(path)); +#endif + return 0; +} + +int mac_selinux_get_create_label_from_exe(const char *exe, char **label) { +#if HAVE_SELINUX + _cleanup_freecon_ char *mycon = NULL, *fcon = NULL; + security_class_t 
sclass; + int r; + + assert(exe); + assert(label); + + r = selinux_init(/* force= */ false); + if (r < 0) + return r; + if (r == 0) + return -EOPNOTSUPP; + + if (getcon_raw(&mycon) < 0) + return -errno; + if (!mycon) + return -EOPNOTSUPP; + + if (getfilecon_raw(exe, &fcon) < 0) + return -errno; + if (!fcon) + return -EOPNOTSUPP; + + sclass = string_to_security_class("process"); + if (sclass == 0) + return -ENOSYS; + + return RET_NERRNO(security_compute_create_raw(mycon, fcon, sclass, label)); +#else + return -EOPNOTSUPP; +#endif +} + +int mac_selinux_get_our_label(char **ret) { + assert(ret); + +#if HAVE_SELINUX + int r; + + r = selinux_init(/* force= */ false); + if (r < 0) + return r; + if (r == 0) + return -EOPNOTSUPP; + + _cleanup_freecon_ char *con = NULL; + if (getcon_raw(&con) < 0) + return -errno; + if (!con) + return -EOPNOTSUPP; + + *ret = TAKE_PTR(con); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int mac_selinux_get_child_mls_label(int socket_fd, const char *exe, const char *exec_label, char **ret_label) { +#if HAVE_SELINUX + _cleanup_freecon_ char *mycon = NULL, *peercon = NULL, *fcon = NULL; + _cleanup_context_free_ context_t pcon = NULL, bcon = NULL; + const char *range = NULL, *bcon_str = NULL; + security_class_t sclass; + int r; + + assert(socket_fd >= 0); + assert(exe); + assert(ret_label); + + r = selinux_init(/* force= */ false); + if (r < 0) + return r; + if (r == 0) + return -EOPNOTSUPP; + + if (getcon_raw(&mycon) < 0) + return -errno; + if (!mycon) + return -EOPNOTSUPP; + + if (getpeercon_raw(socket_fd, &peercon) < 0) + return -errno; + if (!peercon) + return -EOPNOTSUPP; + + if (!exec_label) { /* If there is no context set for next exec let's use context of target executable */ + if (getfilecon_raw(exe, &fcon) < 0) + return -errno; + if (!fcon) + return -EOPNOTSUPP; + } + + bcon = context_new(mycon); + if (!bcon) + return -ENOMEM; + + pcon = context_new(peercon); + if (!pcon) + return -ENOMEM; + + range = context_range_get(pcon); + 
if (!range) + return -errno; + + if (context_range_set(bcon, range) != 0) + return -errno; + + bcon_str = context_str(bcon); + if (!bcon_str) + return -ENOMEM; + + sclass = string_to_security_class("process"); + if (sclass == 0) + return -ENOSYS; + + return RET_NERRNO(security_compute_create_raw(bcon_str, fcon, sclass, ret_label)); +#else + return -EOPNOTSUPP; +#endif +} + +char* mac_selinux_free(char *label) { + +#if HAVE_SELINUX + freecon(label); +#else + assert(!label); +#endif + + return NULL; +} + +#if HAVE_SELINUX +static int selinux_create_file_prepare_abspath(const char *abspath, mode_t mode) { + _cleanup_freecon_ char *filecon = NULL; + int r; + + assert(abspath); + assert(path_is_absolute(abspath)); + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + /* Check for policy reload so 'label_hnd' is kept up-to-date by callbacks */ + mac_selinux_maybe_reload(); + if (!label_hnd) + return 0; + + r = selabel_lookup_raw(label_hnd, &filecon, abspath, mode); + if (r < 0) { + /* No context specified by the policy? Proceed without setting it. 
*/ + if (errno == ENOENT) + return 0; + + return log_enforcing_errno(errno, "Failed to determine SELinux security context for %s: %m", abspath); + } + + if (setfscreatecon_raw(filecon) < 0) + return log_enforcing_errno(errno, "Failed to set SELinux security context %s for %s: %m", filecon, abspath); + + return 0; +} +#endif + +int mac_selinux_create_file_prepare_at( + int dir_fd, + const char *path, + mode_t mode) { + +#if HAVE_SELINUX + _cleanup_free_ char *abspath = NULL; + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + if (!label_hnd) + return 0; + + if (isempty(path) || !path_is_absolute(path)) { + if (dir_fd == AT_FDCWD) + r = safe_getcwd(&abspath); + else + r = fd_get_path(dir_fd, &abspath); + if (r < 0) + return r; + + if (!isempty(path) && !path_extend(&abspath, path)) + return -ENOMEM; + + path = abspath; + } + + return selinux_create_file_prepare_abspath(path, mode); +#else + return 0; +#endif +} + +int mac_selinux_create_file_prepare_label(const char *path, const char *label) { +#if HAVE_SELINUX + int r; + + if (!label) + return 0; + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + if (setfscreatecon_raw(label) < 0) + return log_enforcing_errno(errno, "Failed to set specified SELinux security context '%s' for '%s': %m", label, strna(path)); +#endif + return 0; +} + +void mac_selinux_create_file_clear(void) { + +#if HAVE_SELINUX + PROTECT_ERRNO; + + if (selinux_init(/* force= */ false) <= 0) + return; + + setfscreatecon_raw(NULL); +#endif +} + +int mac_selinux_create_socket_prepare(const char *label) { + +#if HAVE_SELINUX + int r; + + assert(label); + + r = selinux_init(/* force= */ false); + if (r <= 0) + return r; + + if (setsockcreatecon(label) < 0) + return log_enforcing_errno(errno, "Failed to set SELinux security context %s for sockets: %m", label); +#endif + + return 0; +} + +void mac_selinux_create_socket_clear(void) { + +#if HAVE_SELINUX + 
PROTECT_ERRNO; + + if (selinux_init(/* force= */ false) <= 0) + return; + + setsockcreatecon_raw(NULL); +#endif +} + +int mac_selinux_bind(int fd, const struct sockaddr *addr, socklen_t addrlen) { + + /* Binds a socket and label its file system object according to the SELinux policy */ + +#if HAVE_SELINUX + _cleanup_freecon_ char *fcon = NULL; + const struct sockaddr_un *un; + bool context_changed = false; + size_t sz; + char *path; + int r; + + assert(fd >= 0); + assert(addr); + assert(addrlen >= sizeof(sa_family_t)); + + if (selinux_init(/* force= */ false) <= 0) + goto skipped; + + if (!label_hnd) + goto skipped; + + /* Filter out non-local sockets */ + if (addr->sa_family != AF_UNIX) + goto skipped; + + /* Filter out anonymous sockets */ + if (addrlen < offsetof(struct sockaddr_un, sun_path) + 1) + goto skipped; + + /* Filter out abstract namespace sockets */ + un = (const struct sockaddr_un*) addr; + if (un->sun_path[0] == 0) + goto skipped; + + sz = addrlen - offsetof(struct sockaddr_un, sun_path); + if (sz > PATH_MAX) + goto skipped; + path = strndupa_safe(un->sun_path, sz); + + /* Check for policy reload so 'label_hnd' is kept up-to-date by callbacks */ + mac_selinux_maybe_reload(); + if (!label_hnd) + goto skipped; + + if (path_is_absolute(path)) + r = selabel_lookup_raw(label_hnd, &fcon, path, S_IFSOCK); + else { + _cleanup_free_ char *newpath = NULL; + + r = path_make_absolute_cwd(path, &newpath); + if (r < 0) + return r; + + r = selabel_lookup_raw(label_hnd, &fcon, newpath, S_IFSOCK); + } + + if (r < 0) { + /* No context specified by the policy? 
Proceed without setting it */ + if (errno == ENOENT) + goto skipped; + + r = log_enforcing_errno(errno, "Failed to determine SELinux security context for %s: %m", path); + if (r < 0) + return r; + } else { + if (setfscreatecon_raw(fcon) < 0) { + r = log_enforcing_errno(errno, "Failed to set SELinux security context %s for %s: %m", fcon, path); + if (r < 0) + return r; + } else + context_changed = true; + } + + r = RET_NERRNO(bind(fd, addr, addrlen)); + + if (context_changed) + (void) setfscreatecon_raw(NULL); + + return r; + +skipped: +#endif + return RET_NERRNO(bind(fd, addr, addrlen)); +} diff --git a/src/shared/selinux-util.h b/src/shared/selinux-util.h new file mode 100644 index 0000000..97ab5eb --- /dev/null +++ b/src/shared/selinux-util.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "macro.h" +#include "label-util.h" + +#if HAVE_SELINUX +#include + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(char*, freecon, NULL); +#define _cleanup_freecon_ _cleanup_(freeconp) +#endif + +bool mac_selinux_use(void); +void mac_selinux_retest(void); +bool mac_selinux_enforcing(void); + +int mac_selinux_init(void); +int mac_selinux_init_lazy(void); +void mac_selinux_maybe_reload(void); +void mac_selinux_finish(void); + +int mac_selinux_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags); + +int mac_selinux_apply(const char *path, const char *label); +int mac_selinux_apply_fd(int fd, const char *path, const char *label); + +int mac_selinux_get_create_label_from_exe(const char *exe, char **label); +int mac_selinux_get_our_label(char **label); +int mac_selinux_get_child_mls_label(int socket_fd, const char *exe, const char *exec_label, char **label); +char* mac_selinux_free(char *label); + +int mac_selinux_create_file_prepare_at(int dirfd, const char *path, mode_t mode); +static inline int mac_selinux_create_file_prepare(const char *path, mode_t mode) { + return 
mac_selinux_create_file_prepare_at(AT_FDCWD, path, mode); +} +int mac_selinux_create_file_prepare_label(const char *path, const char *label); +void mac_selinux_create_file_clear(void); + +int mac_selinux_create_socket_prepare(const char *label); +void mac_selinux_create_socket_clear(void); + +int mac_selinux_bind(int fd, const struct sockaddr *addr, socklen_t addrlen); + +DEFINE_TRIVIAL_CLEANUP_FUNC(char*, mac_selinux_free); diff --git a/src/shared/serialize.c b/src/shared/serialize.c new file mode 100644 index 0000000..483cbc7 --- /dev/null +++ b/src/shared/serialize.c @@ -0,0 +1,552 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "env-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "memfd-util.h" +#include "missing_mman.h" +#include "missing_syscall.h" +#include "parse-util.h" +#include "process-util.h" +#include "serialize.h" +#include "strv.h" +#include "tmpfile-util.h" + +int serialize_item(FILE *f, const char *key, const char *value) { + assert(f); + assert(key); + + if (!value) + return 0; + + /* Make sure that anything we serialize we can also read back again with read_line() with a maximum line size + * of LONG_LINE_MAX. This is a safety net only. All code calling us should filter this out earlier anyway. */ + if (strlen(key) + 1 + strlen(value) + 1 > LONG_LINE_MAX) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Attempted to serialize overly long item '%s', refusing.", key); + + fputs(key, f); + fputc('=', f); + fputs(value, f); + fputc('\n', f); + + return 1; +} + +int serialize_item_escaped(FILE *f, const char *key, const char *value) { + _cleanup_free_ char *c = NULL; + + assert(f); + assert(key); + + if (!value) + return 0; + + c = xescape(value, " "); + if (!c) + return log_oom(); + + return serialize_item(f, key, c); +} + +int serialize_item_format(FILE *f, const char *key, const char *format, ...) 
{ + _cleanup_free_ char *allocated = NULL; + char buf[256]; /* Something reasonably short that fits nicely on any stack (i.e. is considerably less + * than LONG_LINE_MAX (1MiB!) */ + const char *b; + va_list ap; + int k; + + assert(f); + assert(key); + assert(format); + + /* First, let's try to format this into a stack buffer */ + va_start(ap, format); + k = vsnprintf(buf, sizeof(buf), format, ap); + va_end(ap); + + if (k < 0) + return log_warning_errno(errno, "Failed to serialize item '%s', ignoring: %m", key); + if (strlen(key) + 1 + k + 1 > LONG_LINE_MAX) /* See above */ + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Attempted to serialize overly long item '%s', refusing.", key); + + if ((size_t) k < sizeof(buf)) + b = buf; /* Yay, it fit! */ + else { + /* So the string didn't fit in the short buffer above, but was not above our total limit, + * hence let's format it via dynamic memory */ + + va_start(ap, format); + k = vasprintf(&allocated, format, ap); + va_end(ap); + + if (k < 0) + return log_warning_errno(errno, "Failed to serialize item '%s', ignoring: %m", key); + + b = allocated; + } + + fputs(key, f); + fputc('=', f); + fputs(b, f); + fputc('\n', f); + + return 1; +} + +int serialize_fd(FILE *f, FDSet *fds, const char *key, int fd) { + int copy; + + assert(f); + assert(fds); + assert(key); + + if (fd < 0) + return 0; + + copy = fdset_put_dup(fds, fd); + if (copy < 0) + return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m"); + + return serialize_item_format(f, key, "%i", copy); +} + +int serialize_fd_many(FILE *f, FDSet *fds, const char *key, const int fd_array[], size_t n_fd_array) { + _cleanup_free_ char *t = NULL; + + assert(f); + + if (n_fd_array == 0) + return 0; + + assert(fd_array); + + for (size_t i = 0; i < n_fd_array; i++) { + int copy; + + if (fd_array[i] < 0) + return -EBADF; + + copy = fdset_put_dup(fds, fd_array[i]); + if (copy < 0) + return log_error_errno(copy, "Failed to add file descriptor to 
serialization set: %m"); + + if (strextendf_with_separator(&t, " ", "%i", copy) < 0) + return log_oom(); + } + + return serialize_item(f, key, t); +} + +int serialize_usec(FILE *f, const char *key, usec_t usec) { + assert(f); + assert(key); + + if (usec == USEC_INFINITY) + return 0; + + return serialize_item_format(f, key, USEC_FMT, usec); +} + +int serialize_dual_timestamp(FILE *f, const char *name, const dual_timestamp *t) { + assert(f); + assert(name); + assert(t); + + if (!dual_timestamp_is_set(t)) + return 0; + + return serialize_item_format(f, name, USEC_FMT " " USEC_FMT, t->realtime, t->monotonic); +} + +int serialize_strv(FILE *f, const char *key, char **l) { + int ret = 0, r; + + /* Returns the first error, or positive if anything was serialized, 0 otherwise. */ + + STRV_FOREACH(i, l) { + r = serialize_item_escaped(f, key, *i); + if ((ret >= 0 && r < 0) || + (ret == 0 && r > 0)) + ret = r; + } + + return ret; +} + +int serialize_pidref(FILE *f, FDSet *fds, const char *key, PidRef *pidref) { + int copy; + + assert(f); + assert(fds); + + if (!pidref_is_set(pidref)) + return 0; + + /* If we have a pidfd we serialize the fd and encode the fd number prefixed by "@" in the + * serialization. Otherwise we serialize the numeric PID as it is. 
*/ + + if (pidref->fd < 0) + return serialize_item_format(f, key, PID_FMT, pidref->pid); + + copy = fdset_put_dup(fds, pidref->fd); + if (copy < 0) + return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m"); + + return serialize_item_format(f, key, "@%i", copy); +} + +int serialize_ratelimit(FILE *f, const char *key, const RateLimit *rl) { + assert(rl); + + return serialize_item_format(f, key, + USEC_FMT " " USEC_FMT " %u %u", + rl->begin, + rl->interval, + rl->num, + rl->burst); +} + +int serialize_item_hexmem(FILE *f, const char *key, const void *p, size_t l) { + _cleanup_free_ char *encoded = NULL; + int r; + + assert(f); + assert(key); + + if (!p && l > 0) + return -EINVAL; + + if (l == 0) + return 0; + + encoded = hexmem(p, l); + if (!encoded) + return log_oom_debug(); + + r = serialize_item(f, key, encoded); + if (r < 0) + return r; + + return 1; +} + +int serialize_item_base64mem(FILE *f, const char *key, const void *p, size_t l) { + _cleanup_free_ char *encoded = NULL; + ssize_t len; + int r; + + assert(f); + assert(key); + + if (!p && l > 0) + return -EINVAL; + + if (l == 0) + return 0; + + len = base64mem(p, l, &encoded); + if (len <= 0) + return log_oom_debug(); + + r = serialize_item(f, key, encoded); + if (r < 0) + return r; + + return 1; +} + +int serialize_string_set(FILE *f, const char *key, Set *s) { + const char *e; + int r; + + assert(f); + assert(key); + + if (set_isempty(s)) + return 0; + + /* Serialize as individual items, as each element might contain separators and escapes */ + + SET_FOREACH(e, s) { + r = serialize_item(f, key, e); + if (r < 0) + return r; + } + + return 1; +} + +int serialize_image_policy(FILE *f, const char *key, const ImagePolicy *p) { + _cleanup_free_ char *policy = NULL; + int r; + + assert(f); + assert(key); + + if (!p) + return 0; + + r = image_policy_to_string(p, /* simplify= */ false, &policy); + if (r < 0) + return r; + + r = serialize_item(f, key, policy); + if (r < 0) + return r; + + 
return 1; +} + +int deserialize_read_line(FILE *f, char **ret) { + _cleanup_free_ char *line = NULL; + int r; + + assert(f); + assert(ret); + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read serialization line: %m"); + if (r == 0) { /* eof */ + *ret = NULL; + return 0; + } + + if (isempty(line)) { /* End marker */ + *ret = NULL; + return 0; + } + + *ret = TAKE_PTR(line); + return 1; +} + +int deserialize_fd(FDSet *fds, const char *value) { + _cleanup_close_ int our_fd = -EBADF; + int parsed_fd; + + assert(value); + + parsed_fd = parse_fd(value); + if (parsed_fd < 0) + return log_debug_errno(parsed_fd, "Failed to parse file descriptor serialization: %s", value); + + our_fd = fdset_remove(fds, parsed_fd); /* Take possession of the fd */ + if (our_fd < 0) + return log_debug_errno(our_fd, "Failed to acquire fd from serialization fds: %m"); + + return TAKE_FD(our_fd); +} + +int deserialize_fd_many(FDSet *fds, const char *value, size_t n, int *ret) { + int r, *fd_array = NULL; + size_t m = 0; + + assert(value); + + fd_array = new(int, n); + if (!fd_array) + return -ENOMEM; + + CLEANUP_ARRAY(fd_array, m, close_many_and_free); + + for (;;) { + _cleanup_free_ char *w = NULL; + int fd; + + r = extract_first_word(&value, &w, NULL, 0); + if (r < 0) + return r; + if (r == 0) { + if (m < n) /* Too few */ + return -EINVAL; + + break; + } + + if (m >= n) /* Too many */ + return -EINVAL; + + fd = deserialize_fd(fds, w); + if (fd < 0) + return fd; + + fd_array[m++] = fd; + } + + memcpy(ret, fd_array, m * sizeof(int)); + fd_array = mfree(fd_array); + + return 0; +} + +int deserialize_strv(const char *value, char ***l) { + ssize_t unescaped_len; + char *unescaped; + + assert(l); + assert(value); + + unescaped_len = cunescape(value, 0, &unescaped); + if (unescaped_len < 0) + return unescaped_len; + + return strv_consume(l, unescaped); +} + +int deserialize_usec(const char *value, usec_t *ret) { + int r; + + assert(value); + 
assert(ret); + + r = safe_atou64(value, ret); + if (r < 0) + return log_debug_errno(r, "Failed to parse usec value \"%s\": %m", value); + + return 0; +} + +int deserialize_dual_timestamp(const char *value, dual_timestamp *ret) { + uint64_t a, b; + int r, pos; + + assert(value); + assert(ret); + + pos = strspn(value, WHITESPACE); + if (value[pos] == '-') + return -EINVAL; + pos += strspn(value + pos, DIGITS); + pos += strspn(value + pos, WHITESPACE); + if (value[pos] == '-') + return -EINVAL; + + r = sscanf(value, "%" PRIu64 "%" PRIu64 "%n", &a, &b, &pos); + if (r != 2) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse dual timestamp value \"%s\".", + value); + + if (value[pos] != '\0') + /* trailing garbage */ + return -EINVAL; + + *ret = (dual_timestamp) { + .realtime = a, + .monotonic = b, + }; + + return 0; +} + +int deserialize_environment(const char *value, char ***list) { + _cleanup_free_ char *unescaped = NULL; + ssize_t l; + int r; + + assert(value); + assert(list); + + /* Changes the *environment strv inline. 
*/ + + l = cunescape(value, 0, &unescaped); + if (l < 0) + return log_error_errno(l, "Failed to unescape: %m"); + + r = strv_env_replace_consume(list, TAKE_PTR(unescaped)); + if (r < 0) + return log_error_errno(r, "Failed to append environment variable: %m"); + + return 0; +} + +int deserialize_pidref(FDSet *fds, const char *value, PidRef *ret) { + const char *e; + int r; + + assert(value); + assert(ret); + + e = startswith(value, "@"); + if (e) { + int fd = deserialize_fd(fds, e); + + if (fd < 0) + return fd; + + r = pidref_set_pidfd_consume(ret, fd); + } else { + pid_t pid; + + r = parse_pid(value, &pid); + if (r < 0) + return log_debug_errno(r, "Failed to parse PID: %s", value); + + r = pidref_set_pid(ret, pid); + } + if (r < 0) + return log_debug_errno(r, "Failed to initialize pidref: %m"); + + return 0; +} + +void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value) { + usec_t begin, interval; + unsigned num, burst; + + assert(rl); + assert(name); + assert(value); + + if (sscanf(value, USEC_FMT " " USEC_FMT " %u %u", &begin, &interval, &num, &burst) != 4) + return log_notice("Failed to parse %s, ignoring: %s", name, value); + + /* Preserve the counter only if the configuration didn't change. */ + rl->num = (interval == rl->interval && burst == rl->burst) ? num : 0; + rl->begin = begin; +} + +int open_serialization_fd(const char *ident) { + int fd; + + fd = memfd_create_wrapper(ident, MFD_CLOEXEC | MFD_NOEXEC_SEAL); + if (fd < 0) { + const char *path; + + path = getpid_cached() == 1 ? 
"/run/systemd" : "/tmp"; + fd = open_tmpfile_unlinkable(path, O_RDWR|O_CLOEXEC); + if (fd < 0) + return fd; + + log_debug("Serializing %s to %s.", ident, path); + } else + log_debug("Serializing %s to memfd.", ident); + + return fd; +} + +int open_serialization_file(const char *ident, FILE **ret) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd; + + assert(ret); + + fd = open_serialization_fd(ident); + if (fd < 0) + return fd; + + f = take_fdopen(&fd, "w+"); + if (!f) + return -errno; + + *ret = TAKE_PTR(f); + + return 0; +} diff --git a/src/shared/serialize.h b/src/shared/serialize.h new file mode 100644 index 0000000..355eff9 --- /dev/null +++ b/src/shared/serialize.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "fdset.h" +#include "image-policy.h" +#include "macro.h" +#include "pidref.h" +#include "ratelimit.h" +#include "set.h" +#include "string-util.h" +#include "time-util.h" + +int serialize_item(FILE *f, const char *key, const char *value); +int serialize_item_escaped(FILE *f, const char *key, const char *value); +int serialize_item_format(FILE *f, const char *key, const char *value, ...) 
_printf_(3,4); +int serialize_item_hexmem(FILE *f, const char *key, const void *p, size_t l); +int serialize_item_base64mem(FILE *f, const char *key, const void *p, size_t l); +int serialize_fd(FILE *f, FDSet *fds, const char *key, int fd); +int serialize_fd_many(FILE *f, FDSet *fds, const char *key, const int fd_array[], size_t n_fd_array); +int serialize_usec(FILE *f, const char *key, usec_t usec); +int serialize_dual_timestamp(FILE *f, const char *key, const dual_timestamp *t); +int serialize_strv(FILE *f, const char *key, char **l); +int serialize_pidref(FILE *f, FDSet *fds, const char *key, PidRef *pidref); +int serialize_ratelimit(FILE *f, const char *key, const RateLimit *rl); +int serialize_string_set(FILE *f, const char *key, Set *s); +int serialize_image_policy(FILE *f, const char *key, const ImagePolicy *p); + +static inline int serialize_bool(FILE *f, const char *key, bool b) { + return serialize_item(f, key, yes_no(b)); +} +static inline int serialize_bool_elide(FILE *f, const char *key, bool b) { + return b ? serialize_item(f, key, yes_no(b)) : 0; +} + +static inline int serialize_item_tristate(FILE *f, const char *key, int value) { + return value >= 0 ? 
serialize_item_format(f, key, "%i", value) : 0; +} + +int deserialize_read_line(FILE *f, char **ret); + +int deserialize_fd(FDSet *fds, const char *value); +int deserialize_fd_many(FDSet *fds, const char *value, size_t n, int *ret); +int deserialize_usec(const char *value, usec_t *ret); +int deserialize_dual_timestamp(const char *value, dual_timestamp *ret); +int deserialize_environment(const char *value, char ***environment); +int deserialize_strv(const char *value, char ***l); +int deserialize_pidref(FDSet *fds, const char *value, PidRef *ret); +void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value); + +int open_serialization_fd(const char *ident); +int open_serialization_file(const char *ident, FILE **ret); diff --git a/src/shared/service-util.c b/src/shared/service-util.c new file mode 100644 index 0000000..b0585ba --- /dev/null +++ b/src/shared/service-util.c @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "build.h" +#include "pretty-print.h" +#include "service-util.h" +#include "terminal-util.h" + +static int help(const char *program_path, const char *service, const char *description, bool bus_introspect) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man(service, "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "%s%s%s\n\n" + "This program takes no positional arguments.\n\n" + "%sOptions%s:\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --bus-introspect=PATH Write D-Bus XML introspection data\n" + "\nSee the %s for details.\n" + , program_path + , ansi_highlight(), description, ansi_normal() + , ansi_underline(), ansi_normal() + , link + ); + + return 0; /* No further action */ +} + +int service_parse_argv( + const char *service, + const char *description, + const BusObjectImplementation* const* bus_objects, + int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + 
ARG_BUS_INTROSPECT, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(argv[0], service, description, bus_objects); + + case ARG_VERSION: + return version(); + + case ARG_BUS_INTROSPECT: + return bus_introspect_implementations( + stdout, + optarg, + bus_objects); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program takes no arguments."); + + return 1; /* Further action */ +} diff --git a/src/shared/service-util.h b/src/shared/service-util.h new file mode 100644 index 0000000..360341f --- /dev/null +++ b/src/shared/service-util.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-object.h" + +int service_parse_argv( + const char *service, + const char *description, + const BusObjectImplementation* const* bus_objects, + int argc, char *argv[]); diff --git a/src/shared/sleep-config.c b/src/shared/sleep-config.c new file mode 100644 index 0000000..7282111 --- /dev/null +++ b/src/shared/sleep-config.c @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "constants.h" +#include "device-util.h" +#include "devnum-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "hibernate-util.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "sleep-config.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +#define DEFAULT_SUSPEND_ESTIMATION_USEC (1 * USEC_PER_HOUR) + 
+static const char* const sleep_operation_table[_SLEEP_OPERATION_MAX] = { + [SLEEP_SUSPEND] = "suspend", + [SLEEP_HIBERNATE] = "hibernate", + [SLEEP_HYBRID_SLEEP] = "hybrid-sleep", + [SLEEP_SUSPEND_THEN_HIBERNATE] = "suspend-then-hibernate", +}; + +DEFINE_STRING_TABLE_LOOKUP(sleep_operation, SleepOperation); + +static char* const* const sleep_default_state_table[_SLEEP_OPERATION_CONFIG_MAX] = { + [SLEEP_SUSPEND] = STRV_MAKE("mem", "standby", "freeze"), + [SLEEP_HIBERNATE] = STRV_MAKE("disk"), + [SLEEP_HYBRID_SLEEP] = STRV_MAKE("disk"), +}; + +static char* const* const sleep_default_mode_table[_SLEEP_OPERATION_CONFIG_MAX] = { + /* Not used by SLEEP_SUSPEND */ + [SLEEP_HIBERNATE] = STRV_MAKE("platform", "shutdown"), + [SLEEP_HYBRID_SLEEP] = STRV_MAKE("suspend"), +}; + +SleepConfig* sleep_config_free(SleepConfig *sc) { + if (!sc) + return NULL; + + for (SleepOperation i = 0; i < _SLEEP_OPERATION_CONFIG_MAX; i++) { + strv_free(sc->states[i]); + strv_free(sc->modes[i]); + } + + return mfree(sc); +} + +static int config_parse_sleep_mode( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_strv_free_ char **modes = NULL; + char ***sv = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + modes = strv_new(NULL); + if (!modes) + return log_oom(); + } else { + r = strv_split_full(&modes, rvalue, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_oom(); + } + + return free_and_replace(*sv, modes); +} + +static void sleep_config_validate_state_and_mode(SleepConfig *sc) { + assert(sc); + + /* So we should really not allow setting SuspendState= to 'disk', which means hibernation. 
We have + * SLEEP_HIBERNATE for proper hibernation support, which includes checks for resume support (through + * EFI variable or resume= kernel command line option). It's simply not sensible to call the suspend + * operation but eventually do an unsafe hibernation. */ + if (strv_contains(sc->states[SLEEP_SUSPEND], "disk")) { + strv_remove(sc->states[SLEEP_SUSPEND], "disk"); + log_warning("Sleep state 'disk' is not supported by operation %s, ignoring.", + sleep_operation_to_string(SLEEP_SUSPEND)); + } + assert(!sc->modes[SLEEP_SUSPEND]); + + /* People should use hybrid-sleep instead of setting HibernateMode=suspend. Warn about it but don't + * drop it in this case. */ + if (strv_contains(sc->modes[SLEEP_HIBERNATE], "suspend")) + log_warning("Sleep mode 'suspend' should not be used by operation %s. Please use %s instead.", + sleep_operation_to_string(SLEEP_HIBERNATE), sleep_operation_to_string(SLEEP_HYBRID_SLEEP)); +} + +int parse_sleep_config(SleepConfig **ret) { + _cleanup_(sleep_config_freep) SleepConfig *sc = NULL; + int allow_suspend = -1, allow_hibernate = -1, allow_s2h = -1, allow_hybrid_sleep = -1; + + assert(ret); + + sc = new(SleepConfig, 1); + if (!sc) + return log_oom(); + + *sc = (SleepConfig) { + .hibernate_delay_usec = USEC_INFINITY, + }; + + const ConfigTableItem items[] = { + { "Sleep", "AllowSuspend", config_parse_tristate, 0, &allow_suspend }, + { "Sleep", "AllowHibernation", config_parse_tristate, 0, &allow_hibernate }, + { "Sleep", "AllowSuspendThenHibernate", config_parse_tristate, 0, &allow_s2h }, + { "Sleep", "AllowHybridSleep", config_parse_tristate, 0, &allow_hybrid_sleep }, + + { "Sleep", "SuspendState", config_parse_strv, 0, sc->states + SLEEP_SUSPEND }, + { "Sleep", "SuspendMode", config_parse_warn_compat, DISABLED_LEGACY, NULL }, + + { "Sleep", "HibernateState", config_parse_warn_compat, DISABLED_LEGACY, NULL }, + { "Sleep", "HibernateMode", config_parse_sleep_mode, 0, sc->modes + SLEEP_HIBERNATE }, + + { "Sleep", "HybridSleepState", 
config_parse_warn_compat, DISABLED_LEGACY, NULL }, + { "Sleep", "HybridSleepMode", config_parse_warn_compat, DISABLED_LEGACY, NULL }, + + { "Sleep", "HibernateDelaySec", config_parse_sec, 0, &sc->hibernate_delay_usec }, + { "Sleep", "SuspendEstimationSec", config_parse_sec, 0, &sc->suspend_estimation_usec }, + {} + }; + + (void) config_parse_config_file("sleep.conf", "Sleep\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, NULL); + + /* use default values unless set */ + sc->allow[SLEEP_SUSPEND] = allow_suspend != 0; + sc->allow[SLEEP_HIBERNATE] = allow_hibernate != 0; + sc->allow[SLEEP_HYBRID_SLEEP] = allow_hybrid_sleep >= 0 ? allow_hybrid_sleep + : (allow_suspend != 0 && allow_hibernate != 0); + sc->allow[SLEEP_SUSPEND_THEN_HIBERNATE] = allow_s2h >= 0 ? allow_s2h + : (allow_suspend != 0 && allow_hibernate != 0); + + for (SleepOperation i = 0; i < _SLEEP_OPERATION_CONFIG_MAX; i++) { + if (!sc->states[i] && sleep_default_state_table[i]) { + sc->states[i] = strv_copy(sleep_default_state_table[i]); + if (!sc->states[i]) + return log_oom(); + } + + if (!sc->modes[i] && sleep_default_mode_table[i]) { + sc->modes[i] = strv_copy(sleep_default_mode_table[i]); + if (!sc->modes[i]) + return log_oom(); + } + } + + if (sc->suspend_estimation_usec == 0) + sc->suspend_estimation_usec = DEFAULT_SUSPEND_ESTIMATION_USEC; + + sleep_config_validate_state_and_mode(sc); + + *ret = TAKE_PTR(sc); + return 0; +} + +int sleep_state_supported(char **states) { + _cleanup_free_ char *supported_sysfs = NULL; + const char *found; + int r; + + if (strv_isempty(states)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMSG), "No sleep state configured."); + + if (access("/sys/power/state", W_OK) < 0) + return log_debug_errno(errno, "/sys/power/state is not writable: %m"); + + r = read_one_line_file("/sys/power/state", &supported_sysfs); + if (r < 0) + return log_debug_errno(r, "Failed to read /sys/power/state: %m"); + + r = string_contains_word_strv(supported_sysfs, NULL, states, &found); + 
if (r < 0) + return log_debug_errno(r, "Failed to parse /sys/power/state: %m"); + if (r > 0) { + log_debug("Sleep state '%s' is supported by kernel.", found); + return true; + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *joined = strv_join(states, " "); + log_debug("None of the configured sleep states are supported by kernel: %s", strnull(joined)); + } + return false; +} + +int sleep_mode_supported(char **modes) { + _cleanup_free_ char *supported_sysfs = NULL; + int r; + + /* Unlike state, kernel has its own default choice if not configured */ + if (strv_isempty(modes)) { + log_debug("No sleep mode configured, using kernel default."); + return true; + } + + if (access("/sys/power/disk", W_OK) < 0) + return log_debug_errno(errno, "/sys/power/disk is not writable: %m"); + + r = read_one_line_file("/sys/power/disk", &supported_sysfs); + if (r < 0) + return log_debug_errno(r, "Failed to read /sys/power/disk: %m"); + + for (const char *p = supported_sysfs;;) { + _cleanup_free_ char *word = NULL; + char *mode; + size_t l; + + r = extract_first_word(&p, &word, NULL, 0); + if (r < 0) + return log_debug_errno(r, "Failed to parse /sys/power/disk: %m"); + if (r == 0) + break; + + mode = word; + l = strlen(word); + + if (mode[0] == '[' && mode[l - 1] == ']') { + mode[l - 1] = '\0'; + mode++; + } + + if (strv_contains(modes, mode)) { + log_debug("Disk sleep mode '%s' is supported by kernel.", mode); + return true; + } + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *joined = strv_join(modes, " "); + log_debug("None of the configured hibernation power modes are supported by kernel: %s", strnull(joined)); + } + return false; +} + +static int sleep_supported_internal( + const SleepConfig *sleep_config, + SleepOperation operation, + bool check_allowed, + SleepSupport *ret_support); + +static int s2h_supported(const SleepConfig *sleep_config, SleepSupport *ret_support) { + + static const SleepOperation operations[] = { + SLEEP_SUSPEND, + SLEEP_HIBERNATE, + }; + + 
SleepSupport support; + int r; + + assert(sleep_config); + assert(ret_support); + + if (!clock_supported(CLOCK_BOOTTIME_ALARM)) { + log_debug("CLOCK_BOOTTIME_ALARM is not supported, can't perform %s.", sleep_operation_to_string(SLEEP_SUSPEND_THEN_HIBERNATE)); + *ret_support = SLEEP_ALARM_NOT_SUPPORTED; + return false; + } + + FOREACH_ARRAY(i, operations, ELEMENTSOF(operations)) { + r = sleep_supported_internal(sleep_config, *i, /* check_allowed = */ false, &support); + if (r < 0) + return r; + if (r == 0) { + log_debug("Sleep operation %s is not supported, can't perform %s.", + sleep_operation_to_string(*i), sleep_operation_to_string(SLEEP_SUSPEND_THEN_HIBERNATE)); + *ret_support = support; + return false; + } + } + + assert(support == SLEEP_SUPPORTED); + *ret_support = support; + + return true; +} + +static int sleep_supported_internal( + const SleepConfig *sleep_config, + SleepOperation operation, + bool check_allowed, + SleepSupport *ret_support) { + + int r; + + assert(sleep_config); + assert(operation >= 0); + assert(operation < _SLEEP_OPERATION_MAX); + assert(ret_support); + + if (check_allowed && !sleep_config->allow[operation]) { + log_debug("Sleep operation %s is disabled by configuration.", sleep_operation_to_string(operation)); + *ret_support = SLEEP_DISABLED; + return false; + } + + if (operation == SLEEP_SUSPEND_THEN_HIBERNATE) + return s2h_supported(sleep_config, ret_support); + + assert(operation < _SLEEP_OPERATION_CONFIG_MAX); + + r = sleep_state_supported(sleep_config->states[operation]); + if (r == -ENOMSG) { + *ret_support = SLEEP_NOT_CONFIGURED; + return false; + } + if (r < 0) + return r; + if (r == 0) { + *ret_support = SLEEP_STATE_OR_MODE_NOT_SUPPORTED; + return false; + } + + if (sleep_operation_is_hibernation(operation)) { + r = sleep_mode_supported(sleep_config->modes[operation]); + if (r < 0) + return r; + if (r == 0) { + *ret_support = SLEEP_STATE_OR_MODE_NOT_SUPPORTED; + return false; + } + + r = hibernation_is_safe(); + if (r == 
-ENOTRECOVERABLE) { + *ret_support = SLEEP_RESUME_NOT_SUPPORTED; + return false; + } + if (r == -ENOSPC) { + *ret_support = SLEEP_NOT_ENOUGH_SWAP_SPACE; + return false; + } + if (r < 0) + return r; + } else + assert(!sleep_config->modes[operation]); + + *ret_support = SLEEP_SUPPORTED; + return true; +} + +int sleep_supported_full(SleepOperation operation, SleepSupport *ret_support) { + _cleanup_(sleep_config_freep) SleepConfig *sleep_config = NULL; + SleepSupport support; + int r; + + assert(operation >= 0); + assert(operation < _SLEEP_OPERATION_MAX); + + r = parse_sleep_config(&sleep_config); + if (r < 0) + return r; + + r = sleep_supported_internal(sleep_config, operation, /* check_allowed = */ true, &support); + if (r < 0) + return r; + + assert((r > 0) == (support == SLEEP_SUPPORTED)); + + if (ret_support) + *ret_support = support; + + return r; +} diff --git a/src/shared/sleep-config.h b/src/shared/sleep-config.h new file mode 100644 index 0000000..bc5aeb9 --- /dev/null +++ b/src/shared/sleep-config.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "time-util.h" + +typedef enum SleepOperation { + SLEEP_SUSPEND, + SLEEP_HIBERNATE, + SLEEP_HYBRID_SLEEP, + _SLEEP_OPERATION_CONFIG_MAX, + /* The operations above require configuration for mode and state. The ones below are "combined" + * operations that use config from those individual operations. 
*/ + + SLEEP_SUSPEND_THEN_HIBERNATE, + + _SLEEP_OPERATION_MAX, + _SLEEP_OPERATION_INVALID = -EINVAL, +} SleepOperation; + +const char* sleep_operation_to_string(SleepOperation s) _const_; +SleepOperation sleep_operation_from_string(const char *s) _pure_; + +static inline bool sleep_operation_is_hibernation(SleepOperation operation) { + return IN_SET(operation, SLEEP_HIBERNATE, SLEEP_HYBRID_SLEEP); +} + +typedef struct SleepConfig { + bool allow[_SLEEP_OPERATION_MAX]; + + char **states[_SLEEP_OPERATION_CONFIG_MAX]; + char **modes[_SLEEP_OPERATION_CONFIG_MAX]; /* Power mode after writing hibernation image */ + + usec_t hibernate_delay_usec; + usec_t suspend_estimation_usec; +} SleepConfig; + +SleepConfig* sleep_config_free(SleepConfig *sc); +DEFINE_TRIVIAL_CLEANUP_FUNC(SleepConfig*, sleep_config_free); + +int parse_sleep_config(SleepConfig **sleep_config); + +typedef enum SleepSupport { + SLEEP_SUPPORTED, + SLEEP_DISABLED, /* Disabled in SleepConfig.allow */ + SLEEP_NOT_CONFIGURED, /* SleepConfig.states is not configured */ + SLEEP_STATE_OR_MODE_NOT_SUPPORTED, /* SleepConfig.states/modes are not supported by kernel */ + SLEEP_RESUME_NOT_SUPPORTED, + SLEEP_NOT_ENOUGH_SWAP_SPACE, + SLEEP_ALARM_NOT_SUPPORTED, /* CLOCK_BOOTTIME_ALARM is unsupported by kernel (only used by s2h) */ +} SleepSupport; + +int sleep_supported_full(SleepOperation operation, SleepSupport *ret_support); +static inline int sleep_supported(SleepOperation operation) { + return sleep_supported_full(operation, NULL); +} + +/* Only for test-sleep-config */ +int sleep_state_supported(char **states); +int sleep_mode_supported(char **modes); diff --git a/src/shared/smack-util.c b/src/shared/smack-util.c new file mode 100644 index 0000000..1f88e72 --- /dev/null +++ b/src/shared/smack-util.c @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2013 Intel Corporation + + Author: Auke Kok +***/ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" 
+#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "label.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "process-util.h" +#include "smack-util.h" +#include "stdio-util.h" +#include "string-table.h" +#include "xattr-util.h" + +#if ENABLE_SMACK +bool mac_smack_use(void) { + static int cached_use = -1; + + if (cached_use < 0) + cached_use = access("/sys/fs/smackfs/", F_OK) >= 0; + + return cached_use; +} + +static const char* const smack_attr_table[_SMACK_ATTR_MAX] = { + [SMACK_ATTR_ACCESS] = "security.SMACK64", + [SMACK_ATTR_EXEC] = "security.SMACK64EXEC", + [SMACK_ATTR_MMAP] = "security.SMACK64MMAP", + [SMACK_ATTR_TRANSMUTE] = "security.SMACK64TRANSMUTE", + [SMACK_ATTR_IPIN] = "security.SMACK64IPIN", + [SMACK_ATTR_IPOUT] = "security.SMACK64IPOUT", +}; + +DEFINE_STRING_TABLE_LOOKUP(smack_attr, SmackAttr); + +int mac_smack_read(const char *path, SmackAttr attr, char **label) { + assert(path); + assert(attr >= 0 && attr < _SMACK_ATTR_MAX); + assert(label); + + if (!mac_smack_use()) + return 0; + + return getxattr_malloc(path, smack_attr_to_string(attr), label); +} + +int mac_smack_read_fd(int fd, SmackAttr attr, char **label) { + assert(fd >= 0); + assert(attr >= 0 && attr < _SMACK_ATTR_MAX); + assert(label); + + if (!mac_smack_use()) + return 0; + + return fgetxattr_malloc(fd, smack_attr_to_string(attr), label); +} + +int mac_smack_apply_at(int dir_fd, const char *path, SmackAttr attr, const char *label) { + _cleanup_close_ int fd = -EBADF; + + assert(path); + assert(attr >= 0 && attr < _SMACK_ATTR_MAX); + + if (!mac_smack_use()) + return 0; + + fd = openat(dir_fd, path, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (fd < 0) + return -errno; + + return mac_smack_apply_fd(fd, attr, label); +} + +int mac_smack_apply_fd(int fd, SmackAttr attr, const char *label) { + int r; + + assert(fd >= 0); + assert(attr >= 0 && attr < _SMACK_ATTR_MAX); + + if (!mac_smack_use()) + return 0; + + if (label) + r = 
setxattr(FORMAT_PROC_FD_PATH(fd), smack_attr_to_string(attr), label, strlen(label), 0); + else + r = removexattr(FORMAT_PROC_FD_PATH(fd), smack_attr_to_string(attr)); + if (r < 0) + return -errno; + + return 0; +} + +int mac_smack_apply_pid(pid_t pid, const char *label) { + const char *p; + int r; + + assert(label); + + if (!mac_smack_use()) + return 0; + + p = procfs_file_alloca(pid, "attr/current"); + r = write_string_file(p, label, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + + return r; +} + +static int smack_fix_fd( + int fd, + const char *label_path, + LabelFixFlags flags) { + + const char *label; + struct stat st; + int r; + + /* The caller should have done the sanity checks. */ + assert(fd >= 0); + assert(label_path); + assert(path_is_absolute(label_path)); + + /* Path must be in /dev. */ + if (!path_startswith(label_path, "/dev")) + return 0; + + if (fstat(fd, &st) < 0) + return -errno; + + /* + * Label directories and character devices "*". + * Label symlinks "_". + * Don't change anything else. 
+ */ + + if (S_ISDIR(st.st_mode)) + label = SMACK_STAR_LABEL; + else if (S_ISLNK(st.st_mode)) + label = SMACK_FLOOR_LABEL; + else if (S_ISCHR(st.st_mode)) + label = SMACK_STAR_LABEL; + else + return 0; + + if (setxattr(FORMAT_PROC_FD_PATH(fd), "security.SMACK64", label, strlen(label), 0) < 0) { + _cleanup_free_ char *old_label = NULL; + + r = -errno; + + /* If the FS doesn't support labels, then exit without warning */ + if (ERRNO_IS_NOT_SUPPORTED(r)) + return 0; + + /* It the FS is read-only and we were told to ignore failures caused by that, suppress error */ + if (r == -EROFS && (flags & LABEL_IGNORE_EROFS)) + return 0; + + /* If the old label is identical to the new one, suppress any kind of error */ + if (lgetxattr_malloc(FORMAT_PROC_FD_PATH(fd), "security.SMACK64", &old_label) >= 0 && + streq(old_label, label)) + return 0; + + return log_debug_errno(r, "Unable to fix SMACK label of %s: %m", label_path); + } + + return 0; +} + +int mac_smack_fix_full( + int atfd, + const char *inode_path, + const char *label_path, + LabelFixFlags flags) { + + _cleanup_close_ int opened_fd = -EBADF; + _cleanup_free_ char *p = NULL; + int r, inode_fd; + + assert(atfd >= 0 || atfd == AT_FDCWD); + assert(atfd >= 0 || inode_path); + + if (!mac_smack_use()) + return 0; + + if (inode_path) { + opened_fd = openat(atfd, inode_path, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (opened_fd < 0) { + if ((flags & LABEL_IGNORE_ENOENT) && errno == ENOENT) + return 0; + + return -errno; + } + inode_fd = opened_fd; + } else + inode_fd = atfd; + + if (!label_path) { + if (path_is_absolute(inode_path)) + label_path = inode_path; + else { + r = fd_get_path(inode_fd, &p); + if (r < 0) + return r; + + label_path = p; + } + } + + return smack_fix_fd(inode_fd, label_path, flags); +} + +int mac_smack_copy(const char *dest, const char *src) { + int r; + _cleanup_free_ char *label = NULL; + + assert(dest); + assert(src); + + r = mac_smack_read(src, SMACK_ATTR_ACCESS, &label); + if (r < 0) + return r; + + r = 
mac_smack_apply(dest, SMACK_ATTR_ACCESS, label); + if (r < 0) + return r; + + return r; +} + +#else +bool mac_smack_use(void) { + return false; +} + +int mac_smack_read(const char *path, SmackAttr attr, char **label) { + return -EOPNOTSUPP; +} + +int mac_smack_read_fd(int fd, SmackAttr attr, char **label) { + return -EOPNOTSUPP; +} + +int mac_smack_apply_at(int dir_fd, const char *path, SmackAttr attr, const char *label) { + return 0; +} + +int mac_smack_apply_fd(int fd, SmackAttr attr, const char *label) { + return 0; +} + +int mac_smack_apply_pid(pid_t pid, const char *label) { + return 0; +} + +int mac_smack_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags) { + return 0; +} + +int mac_smack_copy(const char *dest, const char *src) { + return 0; +} +#endif + +int renameat_and_apply_smack_floor_label(int fdf, const char *from, int fdt, const char *to) { + + assert(fdf >= 0 || fdf == AT_FDCWD); + assert(fdt >= 0 || fdt == AT_FDCWD); + + if (renameat(fdf, from, fdt, to) < 0) + return -errno; + +#if HAVE_SMACK_RUN_LABEL + return mac_smack_apply_at(fdt, to, SMACK_ATTR_ACCESS, SMACK_FLOOR_LABEL); +#else + return 0; +#endif +} + +static int mac_smack_label_pre(int dir_fd, const char *path, mode_t mode) { + return 0; +} + +static int mac_smack_label_post(int dir_fd, const char *path) { + return mac_smack_fix_full(dir_fd, path, NULL, 0); +} + +int mac_smack_init(void) { + static const LabelOps label_ops = { + .pre = mac_smack_label_pre, + .post = mac_smack_label_post, + }; + + if (!mac_smack_use()) + return 0; + + return label_ops_set(&label_ops); +} diff --git a/src/shared/smack-util.h b/src/shared/smack-util.h new file mode 100644 index 0000000..f6ed2ec --- /dev/null +++ b/src/shared/smack-util.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2013 Intel Corporation + + Author: Auke Kok +***/ + +#include +#include + +#include "label-util.h" +#include "macro.h" + +#define 
SMACK_FLOOR_LABEL "_" +#define SMACK_STAR_LABEL "*" + +typedef enum SmackAttr { + SMACK_ATTR_ACCESS, + SMACK_ATTR_EXEC, + SMACK_ATTR_MMAP, + SMACK_ATTR_TRANSMUTE, + SMACK_ATTR_IPIN, + SMACK_ATTR_IPOUT, + _SMACK_ATTR_MAX, + _SMACK_ATTR_INVALID = -EINVAL, +} SmackAttr; + +bool mac_smack_use(void); +int mac_smack_init(void); + +int mac_smack_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags); +static inline int mac_smack_fix(const char *path, LabelFixFlags flags) { + return mac_smack_fix_full(AT_FDCWD, path, path, flags); +} + +const char* smack_attr_to_string(SmackAttr i) _const_; +SmackAttr smack_attr_from_string(const char *s) _pure_; +int mac_smack_read(const char *path, SmackAttr attr, char **label); +int mac_smack_read_fd(int fd, SmackAttr attr, char **label); +int mac_smack_apply_at(int dir_fd, const char *path, SmackAttr attr, const char *label); +static inline int mac_smack_apply(const char *path, SmackAttr attr, const char *label) { + return mac_smack_apply_at(AT_FDCWD, path, attr, label); +} +int mac_smack_apply_fd(int fd, SmackAttr attr, const char *label); +int mac_smack_apply_pid(pid_t pid, const char *label); +int mac_smack_copy(const char *dest, const char *src); + +int renameat_and_apply_smack_floor_label(int fdf, const char *from, int fdt, const char *to); +static inline int rename_and_apply_smack_floor_label(const char *from, const char *to) { + return renameat_and_apply_smack_floor_label(AT_FDCWD, from, AT_FDCWD, to); +} diff --git a/src/shared/socket-label.c b/src/shared/socket-label.c new file mode 100644 index 0000000..b86a6ad --- /dev/null +++ b/src/shared/socket-label.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "missing_socket.h" +#include "mkdir-label.h" +#include "selinux-util.h" +#include 
/* Creates a socket for the address 'a', configures it, binds it and, if the address type can accept
 * connections, puts it into listening mode.
 *
 *   a               address to bind to; verified with socket_address_verify() first
 *   flags           OR-ed into a->type when calling socket()
 *   backlog         passed to listen()
 *   only            for AF_INET6 only: unless SOCKET_ADDRESS_DEFAULT, IPV6_V6ONLY is set to
 *                   (only == SOCKET_ADDRESS_IPV6_ONLY)
 *   bind_to_device  for AF_INET/AF_INET6 only: interface name to bind to; failure is fatal
 *   reuse_port, free_bind, transparent
 *                   for AF_INET/AF_INET6 only: optional socket options; failures are only logged
 *                   as warnings
 *   directory_mode  mode for parent directories created for path-based sockets
 *   socket_mode     access mode for the socket node (enforced through the umask during bind)
 *   label           if non-NULL, SELinux socket creation label to use while calling socket()
 *
 * Returns the new file descriptor on success, or a negative errno-style error. The fd is declared
 * _cleanup_close_ and only handed to the caller through TAKE_FD() on the success path. */
int socket_address_listen(
                const SocketAddress *a,
                int flags,
                int backlog,
                SocketAddressBindIPv6Only only,
                const char *bind_to_device,
                bool reuse_port,
                bool free_bind,
                bool transparent,
                mode_t directory_mode,
                mode_t socket_mode,
                const char *label) {

        _cleanup_close_ int fd = -EBADF;
        const char *p;
        int r;

        assert(a);

        r = socket_address_verify(a, true);
        if (r < 0)
                return r;

        /* Refuse IPv6 addresses early if IPv6 is reported as unsupported. */
        if (socket_address_family(a) == AF_INET6 && !socket_ipv6_is_supported())
                return -EAFNOSUPPORT;

        /* The SELinux creation label is set up right before socket() ... */
        if (label) {
                r = mac_selinux_create_socket_prepare(label);
                if (r < 0)
                        return r;
        }

        fd = RET_NERRNO(socket(socket_address_family(a), a->type | flags, a->protocol));

        /* ... and cleared again right after, whether or not socket() succeeded. The result of
         * socket() is therefore only checked after the label has been reset. */
        if (label)
                mac_selinux_create_socket_clear();

        if (fd < 0)
                return fd;

        if (socket_address_family(a) == AF_INET6 && only != SOCKET_ADDRESS_DEFAULT) {
                r = setsockopt_int(fd, IPPROTO_IPV6, IPV6_V6ONLY, only == SOCKET_ADDRESS_IPV6_ONLY);
                if (r < 0)
                        return r;
        }

        /* IP-only options. Binding to a device is mandatory once requested, the remaining options
         * are best effort: a failure is logged and otherwise ignored. */
        if (IN_SET(socket_address_family(a), AF_INET, AF_INET6)) {
                if (bind_to_device) {
                        r = socket_bind_to_ifname(fd, bind_to_device);
                        if (r < 0)
                                return r;
                }

                if (reuse_port) {
                        r = setsockopt_int(fd, SOL_SOCKET, SO_REUSEPORT, true);
                        if (r < 0)
                                log_warning_errno(r, "SO_REUSEPORT failed: %m");
                }

                if (free_bind) {
                        r = socket_set_freebind(fd, socket_address_family(a), true);
                        if (r < 0)
                                log_warning_errno(r, "IP_FREEBIND/IPV6_FREEBIND failed: %m");
                }

                if (transparent) {
                        r = socket_set_transparent(fd, socket_address_family(a), true);
                        if (r < 0)
                                log_warning_errno(r, "IP_TRANSPARENT/IPV6_TRANSPARENT failed: %m");
                }
        }

        /* SO_REUSEADDR is set for every address family, and failing to set it is fatal. */
        r = setsockopt_int(fd, SOL_SOCKET, SO_REUSEADDR, true);
        if (r < 0)
                return r;

        /* p is non-NULL only for addresses that have a file system path. */
        p = socket_address_get_path(a);
        if (p) {
                /* Create parents */
                (void) mkdir_parents_label(p, directory_mode);

                /* Enforce the right access mode for the socket */
                WITH_UMASK(~socket_mode) {
                        r = mac_selinux_bind(fd, &a->sockaddr.sa, a->size);
                        if (r == -EADDRINUSE) {
                                /* Unlink and try again */

                                if (unlink(p) < 0)
                                        return r; /* didn't work, return original error */

                                /* Exactly one retry; a second -EADDRINUSE is returned as is. */
                                r = mac_selinux_bind(fd, &a->sockaddr.sa, a->size);
                        }
                        if (r < 0)
                                return r;
                }
        } else {
                if (bind(fd, &a->sockaddr.sa, a->size) < 0)
                        return -errno;
        }

        /* listen() is skipped for address types that cannot accept connections. */
        if (socket_address_can_accept(a))
                if (listen(fd, backlog) < 0)
                        return -errno;

        /* Let's trigger an inotify event on the socket node, so that anyone waiting for this socket to be connectable
         * gets notified */
        if (p)
                (void) touch(p);

        return TAKE_FD(fd);
}
*/ + if (r >= 0) { + /* Just a port */ + if (socket_ipv6_is_supported()) + *a = (SocketAddress) { + .sockaddr.in6 = { + .sin6_family = AF_INET6, + .sin6_port = htobe16(port), + .sin6_addr = in6addr_any, + }, + .size = sizeof(struct sockaddr_in6), + }; + else + *a = (SocketAddress) { + .sockaddr.in = { + .sin_family = AF_INET, + .sin_port = htobe16(port), + .sin_addr.s_addr = INADDR_ANY, + }, + .size = sizeof(struct sockaddr_in), + }; + + } else { + union in_addr_union address; + int family, ifindex; + + r = in_addr_port_ifindex_name_from_string_auto(s, &family, &address, &port, &ifindex, NULL); + if (r < 0) + return r; + + if (port == 0) /* No port, no go. */ + return -EINVAL; + + if (family == AF_INET) + *a = (SocketAddress) { + .sockaddr.in = { + .sin_family = AF_INET, + .sin_addr = address.in, + .sin_port = htobe16(port), + }, + .size = sizeof(struct sockaddr_in), + }; + else if (family == AF_INET6) + *a = (SocketAddress) { + .sockaddr.in6 = { + .sin6_family = AF_INET6, + .sin6_addr = address.in6, + .sin6_port = htobe16(port), + .sin6_scope_id = ifindex, + }, + .size = sizeof(struct sockaddr_in6), + }; + else + assert_not_reached(); + } + + return 0; +} + +int socket_address_parse_and_warn(SocketAddress *a, const char *s) { + SocketAddress b; + int r; + + /* Similar to socket_address_parse() but warns for IPv6 sockets when we don't support them. 
*/ + + r = socket_address_parse(&b, s); + if (r < 0) + return r; + + if (!socket_ipv6_is_supported() && b.sockaddr.sa.sa_family == AF_INET6) { + log_warning("Binding to IPv6 address not available since kernel does not support IPv6."); + return -EAFNOSUPPORT; + } + + *a = b; + return 0; +} + +int socket_address_parse_netlink(SocketAddress *a, const char *s) { + _cleanup_free_ char *word = NULL; + unsigned group = 0; + int family, r; + + assert(a); + assert(s); + + r = extract_first_word(&s, &word, NULL, 0); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + family = netlink_family_from_string(word); + if (family < 0) + return -EINVAL; + + if (!isempty(s)) { + r = safe_atou(s, &group); + if (r < 0) + return r; + } + + *a = (SocketAddress) { + .type = SOCK_RAW, + .sockaddr.nl.nl_family = AF_NETLINK, + .sockaddr.nl.nl_groups = group, + .protocol = family, + .size = sizeof(struct sockaddr_nl), + }; + + return 0; +} + +bool socket_address_is(const SocketAddress *a, const char *s, int type) { + struct SocketAddress b; + + assert(a); + assert(s); + + if (socket_address_parse(&b, s) < 0) + return false; + + b.type = type; + + return socket_address_equal(a, &b); +} + +bool socket_address_is_netlink(const SocketAddress *a, const char *s) { + struct SocketAddress b; + + assert(a); + assert(s); + + if (socket_address_parse_netlink(&b, s) < 0) + return false; + + return socket_address_equal(a, &b); +} + +int make_socket_fd(int log_level, const char* address, int type, int flags) { + SocketAddress a; + int fd, r; + + r = socket_address_parse(&a, address); + if (r < 0) + return log_error_errno(r, "Failed to parse socket address \"%s\": %m", address); + + a.type = type; + + fd = socket_address_listen(&a, type | flags, SOMAXCONN_DELUXE, SOCKET_ADDRESS_DEFAULT, + NULL, false, false, false, 0755, 0644, NULL); + if (fd < 0 || log_get_max_level() >= log_level) { + _cleanup_free_ char *p = NULL; + + r = socket_address_print(&a, &p); + if (r < 0) + return log_error_errno(r, 
"socket_address_print(): %m"); + + if (fd < 0) + log_error_errno(fd, "Failed to listen on %s: %m", p); + else + log_full(log_level, "Listening on %s", p); + } + + return fd; +} + +int in_addr_port_ifindex_name_from_string_auto( + const char *s, + int *ret_family, + union in_addr_union *ret_address, + uint16_t *ret_port, + int *ret_ifindex, + char **ret_server_name) { + + _cleanup_free_ char *buf1 = NULL, *buf2 = NULL, *name = NULL; + int family, ifindex = 0, r; + union in_addr_union a; + uint16_t port = 0; + const char *m; + + assert(s); + + /* This accepts the following: + * 192.168.0.1:53#example.com + * [2001:4860:4860::8888]:53%eth0#example.com + * + * If ret_port is NULL, then the port cannot be specified. + * If ret_ifindex is NULL, then the interface index cannot be specified. + * If ret_server_name is NULL, then server_name cannot be specified. + * + * ret_family is always AF_INET or AF_INET6. + */ + + m = strchr(s, '#'); + if (m) { + if (!ret_server_name) + return -EINVAL; + + if (isempty(m + 1)) + return -EINVAL; + + name = strdup(m + 1); + if (!name) + return -ENOMEM; + + s = buf1 = strndup(s, m - s); + if (!buf1) + return -ENOMEM; + } + + m = strchr(s, '%'); + if (m) { + if (!ret_ifindex) + return -EINVAL; + + if (isempty(m + 1)) + return -EINVAL; + + if (!ifname_valid_full(m + 1, IFNAME_VALID_ALTERNATIVE | IFNAME_VALID_NUMERIC)) + return -EINVAL; /* We want to return -EINVAL for syntactically invalid names, + * and -ENODEV for valid but nonexistent interfaces. 
*/ + + ifindex = rtnl_resolve_interface(NULL, m + 1); + if (ifindex < 0) + return ifindex; + + s = buf2 = strndup(s, m - s); + if (!buf2) + return -ENOMEM; + } + + m = strrchr(s, ':'); + if (m) { + if (*s == '[') { + _cleanup_free_ char *ip_str = NULL; + + if (!ret_port) + return -EINVAL; + + if (*(m - 1) != ']') + return -EINVAL; + + family = AF_INET6; + + r = parse_ip_port(m + 1, &port); + if (r < 0) + return r; + + ip_str = strndup(s + 1, m - s - 2); + if (!ip_str) + return -ENOMEM; + + r = in_addr_from_string(family, ip_str, &a); + if (r < 0) + return r; + } else { + /* First try to parse the string as IPv6 address without port number */ + r = in_addr_from_string(AF_INET6, s, &a); + if (r < 0) { + /* Then the input should be IPv4 address with port number */ + _cleanup_free_ char *ip_str = NULL; + + if (!ret_port) + return -EINVAL; + + family = AF_INET; + + ip_str = strndup(s, m - s); + if (!ip_str) + return -ENOMEM; + + r = in_addr_from_string(family, ip_str, &a); + if (r < 0) + return r; + + r = parse_ip_port(m + 1, &port); + if (r < 0) + return r; + } else + family = AF_INET6; + } + } else { + family = AF_INET; + r = in_addr_from_string(family, s, &a); + if (r < 0) + return r; + } + + if (ret_family) + *ret_family = family; + if (ret_address) + *ret_address = a; + if (ret_port) + *ret_port = port; + if (ret_ifindex) + *ret_ifindex = ifindex; + if (ret_server_name) + *ret_server_name = TAKE_PTR(name); + + return r; +} + +struct in_addr_full *in_addr_full_free(struct in_addr_full *a) { + if (!a) + return NULL; + + free(a->server_name); + free(a->cached_server_string); + return mfree(a); +} + +int in_addr_full_new( + int family, + const union in_addr_union *a, + uint16_t port, + int ifindex, + const char *server_name, + struct in_addr_full **ret) { + + _cleanup_free_ char *name = NULL; + struct in_addr_full *x; + + assert(ret); + + if (!isempty(server_name)) { + name = strdup(server_name); + if (!name) + return -ENOMEM; + } + + x = new(struct in_addr_full, 1); + 
if (!x) + return -ENOMEM; + + *x = (struct in_addr_full) { + .family = family, + .address = *a, + .port = port, + .ifindex = ifindex, + .server_name = TAKE_PTR(name), + }; + + *ret = x; + return 0; +} + +int in_addr_full_new_from_string(const char *s, struct in_addr_full **ret) { + _cleanup_free_ char *server_name = NULL; + int family, ifindex, r; + union in_addr_union a; + uint16_t port; + + assert(s); + + r = in_addr_port_ifindex_name_from_string_auto(s, &family, &a, &port, &ifindex, &server_name); + if (r < 0) + return r; + + return in_addr_full_new(family, &a, port, ifindex, server_name, ret); +} + +const char *in_addr_full_to_string(struct in_addr_full *a) { + assert(a); + + if (!a->cached_server_string) + (void) in_addr_port_ifindex_name_to_string( + a->family, + &a->address, + a->port, + a->ifindex, + a->server_name, + &a->cached_server_string); + + return a->cached_server_string; +} diff --git a/src/shared/socket-netlink.h b/src/shared/socket-netlink.h new file mode 100644 index 0000000..6256a83 --- /dev/null +++ b/src/shared/socket-netlink.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "in-addr-util.h" +#include "macro.h" +#include "socket-util.h" + +int make_socket_fd(int log_level, const char* address, int type, int flags); + +int socket_address_parse(SocketAddress *a, const char *s); +int socket_address_parse_and_warn(SocketAddress *a, const char *s); +int socket_address_parse_netlink(SocketAddress *a, const char *s); + +bool socket_address_is(const SocketAddress *a, const char *s, int type); +bool socket_address_is_netlink(const SocketAddress *a, const char *s); + +int in_addr_port_ifindex_name_from_string_auto( + const char *s, + int *ret_family, + union in_addr_union *ret_address, + uint16_t *ret_port, + int *ret_ifindex, + char **ret_server_name); +static inline int in_addr_ifindex_name_from_string_auto(const char *s, int *family, union in_addr_union *ret, int *ifindex, char **server_name) { + return 
in_addr_port_ifindex_name_from_string_auto(s, family, ret, NULL, ifindex, server_name); +} +static inline int in_addr_ifindex_from_string_auto(const char *s, int *family, union in_addr_union *ret, int *ifindex) { + return in_addr_ifindex_name_from_string_auto(s, family, ret, ifindex, NULL); +} + +struct in_addr_full { + int family; + union in_addr_union address; + uint16_t port; + int ifindex; + char *server_name; + char *cached_server_string; /* Should not be handled directly, but through in_addr_full_to_string(). */ +}; + +struct in_addr_full *in_addr_full_free(struct in_addr_full *a); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct in_addr_full*, in_addr_full_free); +int in_addr_full_new(int family, const union in_addr_union *a, uint16_t port, int ifindex, const char *server_name, struct in_addr_full **ret); +int in_addr_full_new_from_string(const char *s, struct in_addr_full **ret); +const char *in_addr_full_to_string(struct in_addr_full *a); diff --git a/src/shared/spawn-ask-password-agent.c b/src/shared/spawn-ask-password-agent.c new file mode 100644 index 0000000..d34cfff --- /dev/null +++ b/src/shared/spawn-ask-password-agent.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "exec-util.h" +#include "log.h" +#include "process-util.h" +#include "spawn-ask-password-agent.h" + +static pid_t agent_pid = 0; + +int ask_password_agent_open(void) { + int r; + + if (agent_pid > 0) + return 0; + + /* We check STDIN here, not STDOUT, since this is about input, + * not output */ + if (!isatty(STDIN_FILENO)) + return 0; + + if (!is_main_thread()) + return -EPERM; + + r = fork_agent("(sd-askpwagent)", + NULL, 0, + &agent_pid, + SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH, + SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH, "--watch", NULL); + if (r < 0) + return log_error_errno(r, "Failed to fork TTY ask password agent: %m"); + + return 1; +} + +void ask_password_agent_close(void) { + + if (agent_pid <= 0) + return; + + /* 
Inform agent that we are done */ + sigterm_wait(TAKE_PID(agent_pid)); +} + +int ask_password_agent_open_if_enabled(BusTransport transport, bool ask_password) { + + /* Open the ask password agent as a child process if necessary */ + + if (transport != BUS_TRANSPORT_LOCAL) + return 0; + + if (!ask_password) + return 0; + + return ask_password_agent_open(); +} diff --git a/src/shared/spawn-ask-password-agent.h b/src/shared/spawn-ask-password-agent.h new file mode 100644 index 0000000..a76cdb1 --- /dev/null +++ b/src/shared/spawn-ask-password-agent.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bus-util.h" + +int ask_password_agent_open(void); +void ask_password_agent_close(void); + +int ask_password_agent_open_if_enabled(BusTransport transport, bool ask_password); diff --git a/src/shared/spawn-polkit-agent.c b/src/shared/spawn-polkit-agent.c new file mode 100644 index 0000000..ce3c5fb --- /dev/null +++ b/src/shared/spawn-polkit-agent.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "exec-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "log.h" +#include "macro.h" +#include "process-util.h" +#include "spawn-polkit-agent.h" +#include "stdio-util.h" +#include "time-util.h" + +#if ENABLE_POLKIT +static pid_t agent_pid = 0; + +int polkit_agent_open(void) { + char notify_fd[DECIMAL_STR_MAX(int) + 1]; + int pipe_fd[2], r; + + if (agent_pid > 0) + return 0; + + /* Clients that run as root don't need to activate/query polkit */ + if (geteuid() == 0) + return 0; + + /* We check STDIN here, not STDOUT, since this is about input, not output */ + if (!isatty(STDIN_FILENO)) + return 0; + + if (!is_main_thread()) + return -EPERM; + + if (pipe2(pipe_fd, 0) < 0) + return -errno; + + xsprintf(notify_fd, "%i", pipe_fd[1]); + + r = fork_agent("(polkit-agent)", + &pipe_fd[1], 1, + &agent_pid, + POLKIT_AGENT_BINARY_PATH, + 
POLKIT_AGENT_BINARY_PATH, "--notify-fd", notify_fd, "--fallback", NULL); + + /* Close the writing side, because that's the one for the agent */ + safe_close(pipe_fd[1]); + + if (r < 0) + log_error_errno(r, "Failed to fork TTY ask password agent: %m"); + else + /* Wait until the agent closes the fd */ + (void) fd_wait_for_event(pipe_fd[0], POLLHUP, USEC_INFINITY); + + safe_close(pipe_fd[0]); + + return r; +} + +void polkit_agent_close(void) { + + if (agent_pid <= 0) + return; + + /* Inform agent that we are done */ + sigterm_wait(TAKE_PID(agent_pid)); +} + +#else + +int polkit_agent_open(void) { + return 0; +} + +void polkit_agent_close(void) { +} + +#endif + +int polkit_agent_open_if_enabled(BusTransport transport, bool ask_password) { + + /* Open the polkit agent as a child process if necessary */ + + if (transport != BUS_TRANSPORT_LOCAL) + return 0; + + if (!ask_password) + return 0; + + return polkit_agent_open(); +} diff --git a/src/shared/spawn-polkit-agent.h b/src/shared/spawn-polkit-agent.h new file mode 100644 index 0000000..325dfdd --- /dev/null +++ b/src/shared/spawn-polkit-agent.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bus-util.h" + +int polkit_agent_open(void); +void polkit_agent_close(void); + +int polkit_agent_open_if_enabled(BusTransport transport, bool ask_password); diff --git a/src/shared/specifier.c b/src/shared/specifier.c new file mode 100644 index 0000000..e5a1f94 --- /dev/null +++ b/src/shared/specifier.c @@ -0,0 +1,498 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "architecture.h" +#include "chase.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "macro.h" +#include "os-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "specifier.h" +#include 
/* Expands "%x" style specifiers in 'text' and returns the result as a newly allocated string.
 *
 *   text        input string
 *   max_length  upper bound for the length of the output (excluding the trailing NUL)
 *   table       specifier table, terminated by an entry whose 'specifier' field is 0
 *   root        passed through unmodified to each lookup callback
 *   userdata    passed through unmodified to each lookup callback
 *   ret         receives the expanded string on success; the caller owns it
 *
 * "%%" yields a literal "%". "%" followed by a character from POSSIBLE_SPECIFIERS that the table has
 * no lookup for is an error (-EBADSLT). "%" followed by any other character is copied verbatim, and
 * a stray "%" at the very end of the string is kept too.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENAMETOOLONG if the output grows beyond
 * max_length, -EBADSLT for unknown specifiers, or the negative error returned by a lookup callback. */
int specifier_printf(const char *text, size_t max_length, const Specifier table[], const char *root, const void *userdata, char **ret) {
        _cleanup_free_ char *result = NULL;
        bool percent = false;
        size_t l;
        char *t;
        int r;

        assert(ret);
        assert(text);
        assert(table);

        /* l counts the input characters that have not been fully processed yet (it is decremented
         * once per loop iteration); t is the write position inside result. The initial buffer is
         * large enough for the case where nothing gets expanded. */
        l = strlen(text);
        if (!GREEDY_REALLOC(result, l + 1))
                return -ENOMEM;
        t = result;

        for (const char *f = text; *f != '\0'; f++, l--) {
                if (percent) {
                        /* The previous character was an unescaped '%': *f selects the specifier. */
                        percent = false;

                        if (*f == '%')
                                *(t++) = '%';
                        else {
                                const Specifier *i;

                                /* Linear search; if nothing matches, i ends up at the terminating
                                 * entry. NOTE(review): the check below then reads ->lookup of that
                                 * terminator, so tables presumably have to end in an all-zero
                                 * entry; confirm against the table definitions. */
                                for (i = table; i->specifier; i++)
                                        if (i->specifier == *f)
                                                break;

                                if (i->lookup) {
                                        _cleanup_free_ char *w = NULL;
                                        size_t k, j;

                                        r = i->lookup(i->specifier, i->data, root, userdata, &w);
                                        if (r < 0)
                                                return r;
                                        /* An empty or NULL expansion contributes nothing. Note that
                                         * 'continue' still runs the loop's f++, l-- step. */
                                        if (isempty(w))
                                                continue;

                                        /* Remember the write offset rather than the pointer, since
                                         * the buffer may move when it is enlarged. The new size
                                         * covers what was written so far (j), the expansion (k),
                                         * the input still to be processed (l) and the trailing
                                         * NUL. */
                                        j = t - result;
                                        k = strlen(w);

                                        if (!GREEDY_REALLOC(result, j + k + l + 1))
                                                return -ENOMEM;
                                        memcpy(result + j, w, k);
                                        t = result + j + k;
                                } else if (strchr(POSSIBLE_SPECIFIERS, *f))
                                        /* Oops, an unknown specifier. */
                                        return -EBADSLT;
                                else {
                                        /* Not something that could ever be a specifier: keep both
                                         * the '%' and the character as they are. */
                                        *(t++) = '%';
                                        *(t++) = *f;
                                }
                        }
                } else if (*f == '%')
                        percent = true;
                else
                        *(t++) = *f;

                /* Enforce the length limit after every processed input character. */
                if ((size_t) (t - result) > max_length)
                        return -ENAMETOOLONG;
        }

        /* If string ended with a stray %, also end with % */
        if (percent) {
                *(t++) = '%';
                if ((size_t) (t - result) > max_length)
                        return -ENAMETOOLONG;
        }
        *(t++) = 0;

        *ret = TAKE_PTR(result);
        return 0;
}
char **ret) { + const uint64_t *n = ASSERT_PTR(data); + + return asprintf(ret, "%" PRIu64, *n) < 0 ? -ENOMEM : 0; +} + +int specifier_machine_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + sd_id128_t id; + int r; + + assert(ret); + + r = id128_get_machine(root, &id); + if (r < 0) /* Translate error for missing /etc/machine-id file to EUNATCH. */ + return r == -ENOENT ? -EUNATCH : r; + + return specifier_id128(specifier, &id, root, userdata, ret); +} + +int specifier_boot_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + sd_id128_t id; + int r; + + assert(ret); + + r = sd_id128_get_boot(&id); + if (r < 0) + return r; + + return specifier_id128(specifier, &id, root, userdata, ret); +} + +int specifier_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + char *n; + + assert(ret); + + n = gethostname_malloc(); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int specifier_short_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + char *n; + + assert(ret); + + n = gethostname_short_malloc(); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int specifier_pretty_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + char *n = NULL; + + assert(ret); + + if (get_pretty_hostname(&n) < 0) { + n = gethostname_short_malloc(); + if (!n) + return -ENOMEM; + } + + *ret = n; + return 0; +} + +int specifier_kernel_release(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + struct utsname uts; + char *n; + + assert(ret); + + if (uname(&uts) < 0) + return -errno; + + n = strdup(uts.release); + if (!n) + return -ENOMEM; + + *ret = n; + return 0; +} + +int specifier_architecture(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + char *t; + + assert(ret); + + t = 
strdup(architecture_to_string(uname_architecture())); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +/* Note: fields in /etc/os-release might quite possibly be missing, even if everything is entirely valid + * otherwise. We'll return an empty value or NULL in that case from the functions below. But if the + * os-release file is missing, we'll return -EUNATCH. This means that something is seriously wrong with the + * installation. */ + +static int parse_os_release_specifier(const char *root, const char *id, char **ret) { + _cleanup_free_ char *v = NULL; + int r; + + assert(ret); + + r = parse_os_release(root, id, &v); + if (r >= 0) + /* parse_os_release() calls parse_env_file() which only sets the return value for + * entries found. Let's make sure we set the return value in all cases. */ + *ret = TAKE_PTR(v); + + /* Translate error for missing os-release file to EUNATCH. */ + return r == -ENOENT ? -EUNATCH : r; +} + +int specifier_os_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + return parse_os_release_specifier(root, "ID", ret); +} + +int specifier_os_version_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + return parse_os_release_specifier(root, "VERSION_ID", ret); +} + +int specifier_os_build_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + return parse_os_release_specifier(root, "BUILD_ID", ret); +} + +int specifier_os_variant_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + return parse_os_release_specifier(root, "VARIANT_ID", ret); +} + +int specifier_os_image_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + return parse_os_release_specifier(root, "IMAGE_ID", ret); +} + +int specifier_os_image_version(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + return 
parse_os_release_specifier(root, "IMAGE_VERSION", ret); +} + +int specifier_group_name(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + RuntimeScope scope = PTR_TO_INT(data); + char *t; + + assert(ret); + + if (scope == RUNTIME_SCOPE_GLOBAL) + return -EINVAL; + + t = gid_to_name(scope == RUNTIME_SCOPE_USER ? getgid() : 0); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +int specifier_group_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + RuntimeScope scope = PTR_TO_INT(data); + gid_t gid; + + assert(ret); + + if (scope == RUNTIME_SCOPE_GLOBAL) + return -EINVAL; + + gid = scope == RUNTIME_SCOPE_USER ? getgid() : 0; + + if (asprintf(ret, GID_FMT, gid) < 0) /* a gid_t is formatted with GID_FMT, not UID_FMT */ + return -ENOMEM; + + return 0; +} + +int specifier_user_name(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + RuntimeScope scope = PTR_TO_INT(data); + uid_t uid; + char *t; + + assert(ret); + + if (scope == RUNTIME_SCOPE_GLOBAL) + return -EINVAL; + + uid = scope == RUNTIME_SCOPE_USER ? getuid() : 0; + + /* If we are UID 0 (root), this will not result in NSS, otherwise it might. This is good, as we want + * to be able to run this in PID 1, where our user ID is 0, but where NSS lookups are not allowed. + + * We don't use getusername_malloc() here, because we don't want to look at $USER, to remain + * consistent with specifier_user_id() below. + */ + + t = uid_to_name(uid); + if (!t) + return -ENOMEM; + + *ret = t; + return 0; +} + +int specifier_user_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + RuntimeScope scope = PTR_TO_INT(data); + uid_t uid; + + assert(ret); + + if (scope == RUNTIME_SCOPE_GLOBAL) + return -EINVAL; + + uid = scope == RUNTIME_SCOPE_USER ?
getuid() : 0; + + if (asprintf(ret, UID_FMT, uid) < 0) + return -ENOMEM; + + return 0; +} + +int specifier_user_home(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + assert(ret); + + /* On PID 1 (which runs as root) this will not result in NSS, + * which is good. See above */ + + return get_home_dir(ret); +} + +int specifier_user_shell(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + assert(ret); + + /* On PID 1 (which runs as root) this will not result in NSS, + * which is good. See above */ + + return get_shell(ret); +} + +int specifier_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const char *p; + char *copy; + int r; + + assert(ret); + + if (root) /* If root dir is set, don't honour $TMP or similar */ + p = "/tmp"; + else { + r = tmp_dir(&p); + if (r < 0) + return r; + } + copy = strdup(p); + if (!copy) + return -ENOMEM; + + *ret = copy; + return 0; +} + +int specifier_var_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const char *p; + char *copy; + int r; + + assert(ret); + + if (root) + p = "/var/tmp"; + else { + r = var_tmp_dir(&p); + if (r < 0) + return r; + } + copy = strdup(p); + if (!copy) + return -ENOMEM; + + *ret = copy; + return 0; +} + +int specifier_escape_strv(char **l, char ***ret) { + _cleanup_strv_free_ char **z = NULL; + char **p, **q; + + assert(ret); + + if (strv_isempty(l)) { + *ret = NULL; + return 0; + } + + z = new(char*, strv_length(l)+1); + if (!z) + return -ENOMEM; + + for (p = l, q = z; *p; p++, q++) { + + *q = specifier_escape(*p); + if (!*q) + return -ENOMEM; + } + + *q = NULL; + *ret = TAKE_PTR(z); + + return 0; +} + +const Specifier system_and_tmp_specifier_table[] = { + COMMON_SYSTEM_SPECIFIERS, + COMMON_TMP_SPECIFIERS, + {} +}; diff --git a/src/shared/specifier.h b/src/shared/specifier.h new file mode 100644 index 0000000..df72bdc --- /dev/null 
+++ b/src/shared/specifier.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "string-util.h" + +typedef int (*SpecifierCallback)(char specifier, const void *data, const char *root, const void *userdata, char **ret); + +typedef struct Specifier { + const char specifier; + const SpecifierCallback lookup; + const void *data; +} Specifier; + +int specifier_printf(const char *text, size_t max_length, const Specifier table[], const char *root, const void *userdata, char **ret); + +int specifier_string(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_real_path(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_real_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_id128(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_uuid(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_uint64(char specifier, const void *data, const char *root, const void *userdata, char **ret); + +int specifier_machine_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_boot_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_short_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_pretty_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_kernel_release(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_architecture(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int 
specifier_os_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_os_version_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_os_build_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_os_variant_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_os_image_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_os_image_version(char specifier, const void *data, const char *root, const void *userdata, char **ret); + +int specifier_group_name(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_group_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_user_name(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_user_id(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_user_home(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_user_shell(char specifier, const void *data, const char *root, const void *userdata, char **ret); + +int specifier_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret); +int specifier_var_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret); + +/* Typically, in places where one of the above specifier is to be resolved the other similar ones are to be + * resolved, too. Hence let's define common macros for the relevant array entries. 
+ * + * COMMON_SYSTEM_SPECIFIERS: + * %a: the native userspace architecture + * %A: the OS image version, according to /etc/os-release + * %b: the boot ID of the running system + * %B: the OS build ID, according to /etc/os-release + * %H: the hostname of the running system + * %l: the short hostname of the running system + * %q: the 'pretty' hostname as per /etc/machine-info + * %m: the machine ID of the running system + * %M: the OS image ID, according to /etc/os-release + * %o: the OS ID according to /etc/os-release + * %v: the kernel version + * %w: the OS version ID, according to /etc/os-release + * %W: the OS variant ID, according to /etc/os-release + * + * COMMON_CREDS_SPECIFIERS: + * %g: the groupname of the running user + * %G: the GID of the running user + * %u: the username of the running user + * %U: the UID of the running user + * + * COMMON_TMP_SPECIFIERS: + * %T: the temporary directory (e.g. /tmp, or $TMPDIR, $TEMP, $TMP) + * %V: the temporary directory for large, persistent stuff (e.g. 
/var/tmp, or $TMPDIR, $TEMP, $TMP) + */ + +#define COMMON_SYSTEM_SPECIFIERS \ + { 'a', specifier_architecture, NULL }, \ + { 'A', specifier_os_image_version, NULL }, \ + { 'b', specifier_boot_id, NULL }, \ + { 'B', specifier_os_build_id, NULL }, \ + { 'H', specifier_hostname, NULL }, \ + { 'l', specifier_short_hostname, NULL }, \ + { 'q', specifier_pretty_hostname, NULL }, \ + { 'm', specifier_machine_id, NULL }, \ + { 'M', specifier_os_image_id, NULL }, \ + { 'o', specifier_os_id, NULL }, \ + { 'v', specifier_kernel_release, NULL }, \ + { 'w', specifier_os_version_id, NULL }, \ + { 'W', specifier_os_variant_id, NULL } + +#define COMMON_CREDS_SPECIFIERS(scope) \ + { 'g', specifier_group_name, INT_TO_PTR(scope) }, \ + { 'G', specifier_group_id, INT_TO_PTR(scope) }, \ + { 'u', specifier_user_name, INT_TO_PTR(scope) }, \ + { 'U', specifier_user_id, INT_TO_PTR(scope) } + +#define COMMON_TMP_SPECIFIERS \ + { 'T', specifier_tmp_dir, NULL }, \ + { 'V', specifier_var_tmp_dir, NULL } + +static inline char* specifier_escape(const char *string) { + return strreplace(string, "%", "%%"); +} + +int specifier_escape_strv(char **l, char ***ret); + +/* A generic specifier table consisting of COMMON_SYSTEM_SPECIFIERS and COMMON_TMP_SPECIFIERS */ +extern const Specifier system_and_tmp_specifier_table[]; diff --git a/src/shared/switch-root.c b/src/shared/switch-root.c new file mode 100644 index 0000000..b620156 --- /dev/null +++ b/src/shared/switch-root.c @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "base-filesystem.h" +#include "chase.h" +#include "creds-util.h" +#include "fd-util.h" +#include "initrd-util.h" +#include "log.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "switch-root.h" 
+#include "user-util.h" + +int switch_root(const char *new_root, + const char *old_root_after, /* path below the new root, where to place the old root after the transition; may be NULL to unmount it */ + SwitchRootFlags flags) { + + /* Stuff mounted below /run/ we don't save on soft reboot, as it might have lost its relevance, i.e. + * credentials, removable media and such, we rather want that the new boot mounts this fresh. But on + * the switch from initrd we do use MS_REC, as it is expected that mounts set up in /run/ are + * maintained. */ + static const struct { + const char *path; + unsigned long mount_flags; /* Flags to apply if SWITCH_ROOT_RECURSIVE_RUN is unset */ + unsigned long mount_flags_recursive_run; /* Flags to apply if SWITCH_ROOT_RECURSIVE_RUN is set (0 if shall be skipped) */ + } transfer_table[] = { + { "/dev", MS_BIND|MS_REC, MS_BIND|MS_REC }, /* Recursive, because we want to save the original /dev/shm/ + /dev/pts/ and similar */ + { "/sys", MS_BIND|MS_REC, MS_BIND|MS_REC }, /* Similar, we want to retain various API VFS, or the cgroupv1 /sys/fs/cgroup/ tree */ + { "/proc", MS_BIND|MS_REC, MS_BIND|MS_REC }, /* Similar */ + { "/run", MS_BIND, MS_BIND|MS_REC }, /* Recursive except on soft reboot, see above */ + { SYSTEM_CREDENTIALS_DIRECTORY, MS_BIND, 0 /* skip! */ }, /* Credentials passed into the system should survive */ + { ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, MS_BIND, 0 /* skip! */ }, /* Similar */ + { "/run/host", MS_BIND|MS_REC, 0 /* skip! 
*/ }, /* Host supplied hierarchy should also survive */ + }; + + _cleanup_close_ int old_root_fd = -EBADF, new_root_fd = -EBADF; + _cleanup_free_ char *resolved_old_root_after = NULL; + int r, istmp; + + assert(new_root); + + /* Check if we shall remove the contents of the old root */ + old_root_fd = open("/", O_DIRECTORY|O_CLOEXEC); + if (old_root_fd < 0) + return log_error_errno(errno, "Failed to open root directory: %m"); + + new_root_fd = open(new_root, O_DIRECTORY|O_CLOEXEC); + if (new_root_fd < 0) + return log_error_errno(errno, "Failed to open target directory '%s': %m", new_root); + + r = inode_same_at(old_root_fd, "", new_root_fd, "", AT_EMPTY_PATH); + if (r < 0) + return log_error_errno(r, "Failed to determine if old and new root directory are the same: %m"); + if (r > 0) { + log_debug("Skipping switch root, as old and new root directory are the same."); + return 0; + } + + /* Make the new root directory a mount point if it isn't */ + r = fd_make_mount_point(new_root_fd); + if (r < 0) + return log_error_errno(r, "Failed to make new root directory a mount point: %m"); + if (r > 0) { + int fd; + + /* When the path was not a mount point, then we need to reopen the path, otherwise, it still + * points to the underlying directory. 
*/ + + fd = open(new_root, O_DIRECTORY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to reopen target directory '%s': %m", new_root); + + close_and_replace(new_root_fd, fd); + } + + if (FLAGS_SET(flags, SWITCH_ROOT_DESTROY_OLD_ROOT)) { + istmp = fd_is_temporary_fs(old_root_fd); + if (istmp < 0) + return log_error_errno(istmp, "Failed to stat root directory: %m"); + if (istmp > 0) + log_debug("Root directory is on tmpfs, will do cleanup later."); + } else + istmp = -1; /* don't know */ + + if (old_root_after) { + /* Determine where we shall place the old root after the transition */ + r = chase(old_root_after, new_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved_old_root_after, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, old_root_after); + if (r == 0) /* Doesn't exist yet. Let's create it */ + (void) mkdir_p_label(resolved_old_root_after, 0755); + } + + /* We are about to unmount various file systems with MNT_DETACH (either explicitly via umount() or + * indirectly via pivot_root()), and thus do not synchronously wait for them to be fully sync'ed — + * all while making them invisible/inaccessible in the file system tree for later code. That makes + * sync'ing them then difficult. Let's hence issue a manual sync() here, so that we at least can + * guarantee all file systems are in a good state before entering this state. */ + if (!FLAGS_SET(flags, SWITCH_ROOT_DONT_SYNC)) + sync(); + + /* Work-around for kernel design: the kernel refuses MS_MOVE if any file systems are mounted + * MS_SHARED. Hence remount them MS_PRIVATE here as a work-around. + * + * https://bugzilla.redhat.com/show_bug.cgi?id=847418 */ + if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) + return log_error_errno(errno, "Failed to set \"/\" mount propagation to private: %m"); + + /* Do not fail if base_filesystem_create() fails. Not all switch roots are like base_filesystem_create() wants + * them to look like.
They might even boot, if they are RO and don't have the FS layout. Just ignore the error + * and switch_root() nevertheless. */ + (void) base_filesystem_create_fd(new_root_fd, new_root, UID_INVALID, GID_INVALID); + + FOREACH_ARRAY(transfer, transfer_table, ELEMENTSOF(transfer_table)) { + _cleanup_free_ char *chased = NULL; + unsigned long mount_flags; + + mount_flags = FLAGS_SET(flags, SWITCH_ROOT_RECURSIVE_RUN) ? transfer->mount_flags_recursive_run : transfer->mount_flags; + if (mount_flags == 0) /* skip if zero */ + continue; + + if (access(transfer->path, F_OK) < 0) { + log_debug_errno(errno, "Path '%s' to move to target root directory, not found, ignoring: %m", transfer->path); + continue; + } + + r = chase(transfer->path, new_root, CHASE_PREFIX_ROOT, &chased, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, transfer->path); + + /* Let's see if it is a mount point already. */ + r = path_is_mount_point(chased, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", chased); + if (r > 0) /* If it is already mounted, then do nothing */ + continue; + + r = mount_nofollow_verbose(LOG_ERR, transfer->path, chased, NULL, mount_flags, NULL); + if (r < 0) + return r; + } + + if (fchdir(new_root_fd) < 0) + return log_error_errno(errno, "Failed to change directory to %s: %m", new_root); + + /* We first try a pivot_root() so that we can umount the old root dir. In many cases (i.e. 
where rootfs is /), + * that's not possible however, and hence we simply overmount root */ + if (resolved_old_root_after) + r = RET_NERRNO(pivot_root(".", resolved_old_root_after)); + else { + r = RET_NERRNO(pivot_root(".", ".")); + if (r >= 0) { + /* Now unmount the upper of the two stacked file systems */ + if (umount2(".", MNT_DETACH) < 0) + return log_error_errno(errno, "Failed to unmount the old root: %m"); + } + } + if (r < 0) { + log_debug_errno(r, "Pivoting root file system failed, moving mounts instead: %m"); + + if (resolved_old_root_after) { + r = mount_nofollow_verbose(LOG_ERR, "/", resolved_old_root_after, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + } + + /* If we have to use MS_MOVE let's first try to get rid of *all* mounts we can, with the + * exception of the path we want to switch to, plus everything leading to it and within + * it. This is necessary because unlike pivot_root() just moving the mount to the root via + * MS_MOVE won't magically unmount anything below it. Once the chroot() succeeds the mounts + * below would still be around but invisible to us, because not accessible via + * /proc/self/mountinfo. Hence, let's clean everything up first, as long as we still can. */ + (void) umount_recursive_full(NULL, MNT_DETACH, STRV_MAKE(new_root)); + + if (mount(".", "/", NULL, MS_MOVE, NULL) < 0) + return log_error_errno(errno, "Failed to move %s to /: %m", new_root); + + if (chroot(".") < 0) + return log_error_errno(errno, "Failed to change root: %m"); + + if (chdir(".") < 0) + return log_error_errno(errno, "Failed to change directory: %m"); + } + + if (istmp > 0) { + struct stat rb; + + if (fstat(old_root_fd, &rb) < 0) + return log_error_errno(errno, "Failed to stat old root directory: %m"); + + /* Note: the below won't operate on non-memory file systems (i.e. 
only on tmpfs, ramfs), and + * it will stop at mount boundaries */ + (void) rm_rf_children(TAKE_FD(old_root_fd), 0, &rb); /* takes possession of the dir fd, even on failure */ + } + + return 0; +} diff --git a/src/shared/switch-root.h b/src/shared/switch-root.h new file mode 100644 index 0000000..ba0d280 --- /dev/null +++ b/src/shared/switch-root.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef enum SwitchRootFlags { + SWITCH_ROOT_DESTROY_OLD_ROOT = 1 << 0, /* rm -rf old root when switching – under the condition + * that it is backed by non-persistent tmpfs/ramfs/… */ + SWITCH_ROOT_DONT_SYNC = 1 << 1, /* don't call sync() immediately before switching root */ + SWITCH_ROOT_RECURSIVE_RUN = 1 << 2, /* move /run/ with MS_REC from old to new root */ +} SwitchRootFlags; + +int switch_root(const char *new_root, const char *old_root_after, SwitchRootFlags flags); diff --git a/src/shared/test-tables.h b/src/shared/test-tables.h new file mode 100644 index 0000000..3f20318 --- /dev/null +++ b/src/shared/test-tables.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include +#include + +#include "string-util.h" + +#define _test_table(name, lookup, reverse, size, sparse) \ + for (int64_t _i = -EINVAL, _boring = 0; _i < size + 1; _i++) { \ + const char* _val; \ + int64_t _rev; \ + \ + _val = lookup(_i); \ + if (_val) { \ + _rev = reverse(_val); \ + _boring = 0; \ + } else { \ + _rev = reverse("--no-such--value----"); \ + _boring += _i >= 0; \ + } \ + if (_boring == 0 || _i == size) \ + printf("%s: %" PRIi64 " → %s → %" PRIi64 "\n", name, _i, strnull(_val), _rev); \ + else if (_boring == 1) \ + printf("%*s ...\n", (int) strlen(name), ""); \ + \ + if (_i >= 0 && _i < size) { \ + if (sparse) \ + assert_se(_rev == _i || _rev == -EINVAL); \ + else \ + assert_se(_val && _rev == _i); \ + } else \ + assert_se(!_val && _rev == -EINVAL); \ + } + +#define 
test_table(lower, upper) \ + _test_table(STRINGIFY(lower), lower##_to_string, lower##_from_string, _##upper##_MAX, false) + +#define test_table_sparse(lower, upper) \ + _test_table(STRINGIFY(lower), lower##_to_string, lower##_from_string, _##upper##_MAX, true) diff --git a/src/shared/tests.c b/src/shared/tests.c new file mode 100644 index 0000000..3882a18 --- /dev/null +++ b/src/shared/tests.c @@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "bus-wait-for-jobs.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "env-file.h" +#include "env-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "namespace-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +char* setup_fake_runtime_dir(void) { + char t[] = "/tmp/fake-xdg-runtime-XXXXXX", *p; + + assert_se(mkdtemp(t)); + assert_se(setenv("XDG_RUNTIME_DIR", t, 1) >= 0); + assert_se(p = strdup(t)); + + return p; +} + +static void load_testdata_env(void) { + static bool called = false; + _cleanup_free_ char *s = NULL, *d = NULL, *envpath = NULL; + _cleanup_strv_free_ char **pairs = NULL; + int r; + + if (called) + return; + called = true; + + assert_se(readlink_and_make_absolute("/proc/self/exe", &s) >= 0); + assert_se(path_extract_directory(s, &d) >= 0); + assert_se(envpath = path_join(d, "systemd-runtest.env")); + + r = load_env_file_pairs(NULL, envpath, &pairs); + if (r < 0) { + log_debug_errno(r, "Reading %s failed: %m", envpath); + return; + } + + STRV_FOREACH_PAIR(k, v, pairs) + assert_se(setenv(*k, *v, 0) >= 0); +} + +int get_testdata_dir(const char *suffix, char **ret) { + const char *dir; + char *p; + + load_testdata_env(); + + /* if the env var is set, use that */ 
+ dir = getenv("SYSTEMD_TEST_DATA"); + if (!dir) + dir = SYSTEMD_TEST_DATA; + if (access(dir, F_OK) < 0) + return log_error_errno(errno, "ERROR: $SYSTEMD_TEST_DATA directory [%s] not accessible: %m", dir); + + p = path_join(dir, suffix); + if (!p) + return log_oom(); + + *ret = p; + return 0; +} + +const char* get_catalog_dir(void) { + const char *env; + + load_testdata_env(); + + /* if the env var is set, use that */ + env = getenv("SYSTEMD_CATALOG_DIR"); + if (!env) + env = SYSTEMD_CATALOG_DIR; + if (access(env, F_OK) < 0) { + fprintf(stderr, "ERROR: $SYSTEMD_CATALOG_DIR directory [%s] does not exist\n", env); + exit(EXIT_FAILURE); + } + return env; +} + +bool slow_tests_enabled(void) { + int r; + + r = getenv_bool("SYSTEMD_SLOW_TESTS"); + if (r >= 0) + return r; + + if (r != -ENXIO) + log_warning_errno(r, "Cannot parse $SYSTEMD_SLOW_TESTS, ignoring."); + return SYSTEMD_SLOW_TESTS_DEFAULT; +} + +void test_setup_logging(int level) { + log_set_max_level(level); + log_parse_environment(); + log_open(); +} + +int write_tmpfile(char *pattern, const char *contents) { + _cleanup_close_ int fd = -EBADF; + + assert(pattern); + assert(contents); + + fd = mkostemp_safe(pattern); + if (fd < 0) + return fd; + + ssize_t l = strlen(contents); + errno = 0; + if (write(fd, contents, l) != l) + return errno_or_else(EIO); + return 0; +} + +bool have_namespaces(void) { + siginfo_t si = {}; + pid_t pid; + + /* Checks whether namespaces are available. In some cases they aren't. We do this by calling unshare(), and we + * do so in a child process in order not to affect our own process. 
*/ + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + /* child */ + if (detach_mount_namespace() < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + assert_se(waitid(P_PID, pid, &si, WEXITED) >= 0); + assert_se(si.si_code == CLD_EXITED); + + if (si.si_status == EXIT_SUCCESS) + return true; + + if (si.si_status == EXIT_FAILURE) + return false; + + assert_not_reached(); +} + +bool can_memlock(void) { + /* Let's see if we can mlock() a larger blob of memory. BPF programs are charged against + * RLIMIT_MEMLOCK, hence let's first make sure we can lock memory at all, and skip the test if we + * cannot. Why not check RLIMIT_MEMLOCK explicitly? Because in container environments the + * RLIMIT_MEMLOCK value we see might not match the RLIMIT_MEMLOCK value actually in effect. */ + + void *p = mmap(NULL, CAN_MEMLOCK_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0); + if (p == MAP_FAILED) + return false; + + bool b = mlock(p, CAN_MEMLOCK_SIZE) >= 0; + if (b) + assert_se(munlock(p, CAN_MEMLOCK_SIZE) >= 0); + + assert_se(munmap(p, CAN_MEMLOCK_SIZE) >= 0); + return b; +} + +static int allocate_scope(void) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *scope = NULL; + const char *object; + int r; + + /* Let's try to run this test in a scope of its own, with delegation turned on, so that PID 1 doesn't + * interfere with our cgroup management. 
*/ + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + + if (asprintf(&scope, "%s-%" PRIx64 ".scope", program_invocation_short_name, random_u64()) < 0) + return log_oom(); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + /* Name and Mode */ + r = sd_bus_message_append(m, "ss", scope, "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, (uint32_t) getpid_cached()); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)", "Delegate", "b", 1); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)", "CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* Auxiliary units */ + r = sd_bus_message_append(m, "a(sa(sv))", 0); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, false, NULL); + if (r < 0) + return r; + + return 0; +} + +static int enter_cgroup(char **ret_cgroup, bool enter_subroot) { + _cleanup_free_ char *cgroup_root = NULL, *cgroup_subroot = NULL; + CGroupMask supported; + int r; + + r = allocate_scope(); + if (r < 0) + log_warning_errno(r, "Couldn't allocate a scope unit for this test, proceeding without."); + + r = cg_pid_get_path(NULL, 
0, &cgroup_root); + if (r == -ENOMEDIUM) + return log_warning_errno(r, "cg_pid_get_path(NULL, 0, ...) failed: %m"); + assert_se(r >= 0); /* runtime result: must stay checked even when assert() is compiled out (NDEBUG) */ + + if (enter_subroot) + assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0); + else { + cgroup_subroot = strdup(cgroup_root); + assert_se(cgroup_subroot != NULL); + } + + assert_se(cg_mask_supported(&supported) >= 0); + + /* If this fails, then we don't mind as the later cgroup operations will fail too, and it's fine if + * we handle any errors at that point. */ + + r = cg_create_everywhere(supported, _CGROUP_MASK_ALL, cgroup_subroot); + if (r < 0) + return r; + + r = cg_attach_everywhere(supported, cgroup_subroot, 0, NULL, NULL); + if (r < 0) + return r; + + if (ret_cgroup) + *ret_cgroup = TAKE_PTR(cgroup_subroot); + + return 0; +} + +int enter_cgroup_subroot(char **ret_cgroup) { + return enter_cgroup(ret_cgroup, true); +} + +int enter_cgroup_root(char **ret_cgroup) { + return enter_cgroup(ret_cgroup, false); +} + +const char *ci_environment(void) { + /* We return a string because we might want to provide multiple bits of information later on: not + * just the general CI environment type, but also whether we're sanitizing or not, etc. The caller is + * expected to use strstr on the returned value. */ + static const char *ans = POINTER_MAX; + int r; + + if (ans != POINTER_MAX) + return ans; + + /* We allow specifying the environment with $CITYPE. Nobody uses this so far, but we are ready.
*/ + const char *citype = getenv("CITYPE"); + if (!isempty(citype)) + return (ans = citype); + + if (getenv_bool("TRAVIS") > 0) + return (ans = "travis"); + if (getenv_bool("SEMAPHORE") > 0) + return (ans = "semaphore"); + if (getenv_bool("GITHUB_ACTIONS") > 0) + return (ans = "github-actions"); + if (getenv("AUTOPKGTEST_ARTIFACTS") || getenv("AUTOPKGTEST_TMP")) + return (ans = "autopkgtest"); + if (getenv("SALSA_CI_IMAGES")) + return (ans = "salsa-ci"); + + FOREACH_STRING(var, "CI", "CONTINUOUS_INTEGRATION") { + /* Those vars are booleans according to Semaphore and Travis docs: + * https://docs.travis-ci.com/user/environment-variables/#default-environment-variables + * https://docs.semaphoreci.com/ci-cd-environment/environment-variables/#ci + */ + r = getenv_bool(var); + if (r > 0) + return (ans = "unknown"); /* Some other unknown thing */ + if (r == 0) + return (ans = NULL); + } + + return (ans = NULL); +} diff --git a/src/shared/tests.h b/src/shared/tests.h new file mode 100644 index 0000000..d76cf2e --- /dev/null +++ b/src/shared/tests.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-daemon.h" + +#include "argv-util.h" +#include "macro.h" +#include "static-destruct.h" +#include "strv.h" + +static inline bool manager_errno_skip_test(int r) { + return IN_SET(abs(r), + EPERM, + EACCES, + EADDRINUSE, + EHOSTDOWN, + ENOENT, + ENOMEDIUM /* cannot determine cgroup */ + ); +} + +char* setup_fake_runtime_dir(void); +int enter_cgroup_subroot(char **ret_cgroup); +int enter_cgroup_root(char **ret_cgroup); +int get_testdata_dir(const char *suffix, char **ret); +const char* get_catalog_dir(void); +bool slow_tests_enabled(void); +void test_setup_logging(int level); + +#define log_tests_skipped(fmt, ...) \ + ({ \ + log_notice("%s: " fmt ", skipping tests.", \ + program_invocation_short_name, \ + ##__VA_ARGS__); \ + EXIT_TEST_SKIP; \ + }) + +#define log_tests_skipped_errno(error, fmt, ...)
\ + ({ \ + log_notice_errno(error, \ + "%s: " fmt ", skipping tests: %m", \ + program_invocation_short_name, \ + ##__VA_ARGS__); \ + EXIT_TEST_SKIP; \ + }) + +int write_tmpfile(char *pattern, const char *contents); + +bool have_namespaces(void); + +/* We use the small but non-trivial limit here */ +#define CAN_MEMLOCK_SIZE (512 * 1024U) +bool can_memlock(void); + +/* Define void* buffer and size_t length variables from a hex string. */ +#define DEFINE_HEX_PTR(name, hex) \ + _cleanup_free_ void *name = NULL; \ + size_t name##_len = 0; \ + assert_se(unhexmem(hex, strlen_ptr(hex), &name, &name##_len) >= 0); + +#define TEST_REQ_RUNNING_SYSTEMD(x) \ + if (sd_booted() > 0) { \ + x; \ + } else { \ + printf("systemd not booted, skipping '%s'\n", #x); \ + } + +/* Provide a convenient way to check if we're running in CI. */ +const char *ci_environment(void); + +typedef struct TestFunc { + union f { + void (*void_func)(void); + int (*int_func)(void); + } f; + const char * const name; + bool has_ret:1; + bool sd_booted:1; +} TestFunc; + +/* See static-destruct.h for an explanation of how this works. */ +#define REGISTER_TEST(func, ...) \ + _Pragma("GCC diagnostic ignored \"-Wattributes\"") \ + _section_("SYSTEMD_TEST_TABLE") _alignptr_ _used_ _retain_ _variable_no_sanitize_address_ \ + static const TestFunc UNIQ_T(static_test_table_entry, UNIQ) = { \ + .f = (union f) &(func), \ + .name = STRINGIFY(func), \ + .has_ret = __builtin_types_compatible_p(typeof((union f){}.int_func), typeof(&(func))), \ + ##__VA_ARGS__ \ + } + +extern const TestFunc _weak_ __start_SYSTEMD_TEST_TABLE[]; +extern const TestFunc _weak_ __stop_SYSTEMD_TEST_TABLE[]; + +#define TEST(name, ...) \ + static void test_##name(void); \ + REGISTER_TEST(test_##name, ##__VA_ARGS__); \ + static void test_##name(void) + +#define TEST_RET(name, ...) 
\ + static int test_##name(void); \ + REGISTER_TEST(test_##name, ##__VA_ARGS__); \ + static int test_##name(void) + +#define TEST_LOG_FUNC() \ + log_info("/* %s() */", __func__) + +static inline int run_test_table(void) { + _cleanup_strv_free_ char **tests = NULL; + int r = EXIT_SUCCESS; + bool ran = false; + const char *e; + + if (!__start_SYSTEMD_TEST_TABLE) + return r; + + e = getenv("TESTFUNCS"); + if (e) { + r = strv_split_full(&tests, e, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse $TESTFUNCS: %m"); + } + + for (const TestFunc *t = ALIGN_PTR(__start_SYSTEMD_TEST_TABLE); + t + 1 <= __stop_SYSTEMD_TEST_TABLE; + t = ALIGN_PTR(t + 1)) { + + if (tests && !strv_contains(tests, t->name)) + continue; + + if (t->sd_booted && sd_booted() <= 0) { + log_info("/* systemd not booted, skipping %s */", t->name); + if (t->has_ret && r == EXIT_SUCCESS) + r = EXIT_TEST_SKIP; + } else { + log_info("/* %s */", t->name); + + if (t->has_ret) { + int r2 = t->f.int_func(); + if (r == EXIT_SUCCESS) + r = r2; + } else + t->f.void_func(); + } + + ran = true; + } + + if (!ran) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "No matching tests found."); + + return r; +} + +#define DEFINE_TEST_MAIN_FULL(log_level, intro, outro) \ + int main(int argc, char *argv[]) { \ + int (*_intro)(void) = intro; \ + int (*_outro)(void) = outro; \ + int _r, _q; \ + test_setup_logging(log_level); \ + save_argc_argv(argc, argv); \ + _r = _intro ? _intro() : EXIT_SUCCESS; \ + if (_r == EXIT_SUCCESS) \ + _r = run_test_table(); \ + _q = _outro ? 
_outro() : EXIT_SUCCESS; \ + static_destruct(); \ + if (_r < 0) \ + return EXIT_FAILURE; \ + if (_r != EXIT_SUCCESS) \ + return _r; \ + if (_q < 0) \ + return EXIT_FAILURE; \ + return _q; \ + } + +#define DEFINE_TEST_MAIN_WITH_INTRO(log_level, intro) \ + DEFINE_TEST_MAIN_FULL(log_level, intro, NULL) +#define DEFINE_TEST_MAIN(log_level) \ + DEFINE_TEST_MAIN_FULL(log_level, NULL, NULL) diff --git a/src/shared/tmpfile-util-label.c b/src/shared/tmpfile-util-label.c new file mode 100644 index 0000000..a5f364c --- /dev/null +++ b/src/shared/tmpfile-util-label.c @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "selinux-util.h" +#include "tmpfile-util-label.h" +#include "tmpfile-util.h" + +int fopen_temporary_at_label( + int dir_fd, + const char *target, + const char *path, + FILE **f, + char **temp_path) { + + int r; + + assert(dir_fd >= 0 || dir_fd == AT_FDCWD); + assert(path); + + r = mac_selinux_create_file_prepare_at(dir_fd, target, S_IFREG); + if (r < 0) + return r; + + r = fopen_temporary_at(dir_fd, path, f, temp_path); + + mac_selinux_create_file_clear(); + + return r; +} diff --git a/src/shared/tmpfile-util-label.h b/src/shared/tmpfile-util-label.h new file mode 100644 index 0000000..68ab075 --- /dev/null +++ b/src/shared/tmpfile-util-label.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +/* These functions are split out of tmpfile-util.h (and not for example just flags to the functions they + * wrap) in order to optimize linking: this way, -lselinux is needed only for the callers of these functions + * that need selinux, but not for all. 
*/ + +int fopen_temporary_at_label(int dir_fd, const char *target, const char *path, FILE **f, char **temp_path); +static inline int fopen_temporary_label(const char *target, const char *path, FILE **f, char **temp_path) { + return fopen_temporary_at_label(AT_FDCWD, target, path, f, temp_path); +} diff --git a/src/shared/tomoyo-util.c b/src/shared/tomoyo-util.c new file mode 100644 index 0000000..2347179 --- /dev/null +++ b/src/shared/tomoyo-util.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "tomoyo-util.h" + +bool mac_tomoyo_use(void) { + static int cached_use = -1; + + if (cached_use < 0) + cached_use = (access("/sys/kernel/security/tomoyo/version", + F_OK) == 0); + + return cached_use; +} diff --git a/src/shared/tomoyo-util.h b/src/shared/tomoyo-util.h new file mode 100644 index 0000000..a6ee7d4 --- /dev/null +++ b/src/shared/tomoyo-util.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +bool mac_tomoyo_use(void); diff --git a/src/shared/tpm2-event-log.c b/src/shared/tpm2-event-log.c new file mode 100644 index 0000000..2e23846 --- /dev/null +++ b/src/shared/tpm2-event-log.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "tpm2-event-log.h" + +#include "sort-util.h" + +typedef struct tpm2_log_event_type_info { + uint32_t event_type; + const char *name; +} tpm2_log_event_type_info; + +static tpm2_log_event_type_info tpm2_log_event_type_table[] = { + /* Unfortunately the types are defined all over the place, hence we are not using a dense table + * here. + * + * Keep this sorted by event type, so that we can do bisection! 
*/ + { EV_PREBOOT_CERT, "preboot-cert" }, + { EV_POST_CODE, "post-code" }, + { EV_NO_ACTION, "no-action" }, + { EV_SEPARATOR, "separator" }, + { EV_ACTION, "action" }, + { EV_EVENT_TAG, "event-tag" }, + { EV_S_CRTM_CONTENTS, "s-crtm-contents" }, + { EV_S_CRTM_VERSION, "s-crtm-version" }, + { EV_CPU_MICROCODE, "cpu-microcode" }, + { EV_PLATFORM_CONFIG_FLAGS, "platform-config-flags" }, + { EV_TABLE_OF_DEVICES, "table-of-devices" }, + { EV_COMPACT_HASH, "compact-hash" }, + { EV_IPL, "ipl" }, + { EV_IPL_PARTITION_DATA, "ipl-partition-data" }, + { EV_NONHOST_CODE, "nonhost-code" }, + { EV_NONHOST_CONFIG, "nonhost-config" }, + { EV_NONHOST_INFO, "nonhost-info" }, + { EV_OMIT_BOOT_DEVICE_EVENTS, "omit-boot-device-events" }, + /* omitting EV_EFI_EVENT_BASE, since it's not an event, but just a base value for other events */ + { EV_EFI_VARIABLE_DRIVER_CONFIG, "efi-variable-driver-config" }, + { EV_EFI_VARIABLE_BOOT, "efi-variable-boot" }, + { EV_EFI_BOOT_SERVICES_APPLICATION, "efi-boot-services-application" }, + { EV_EFI_BOOT_SERVICES_DRIVER, "efi-boot-services-driver" }, + { EV_EFI_RUNTIME_SERVICES_DRIVER, "efi-runtime-services-driver" }, + { EV_EFI_GPT_EVENT, "efi-gpt-event" }, + { EV_EFI_ACTION, "efi-action" }, + { EV_EFI_PLATFORM_FIRMWARE_BLOB, "efi-platform-firmware-blob" }, + { EV_EFI_HANDOFF_TABLES, "efi-handoff-tables" }, + { EV_EFI_PLATFORM_FIRMWARE_BLOB2, "efi-platform-firmware-blob2" }, + { EV_EFI_HANDOFF_TABLES2, "efi-handoff-tables2" }, + { EV_EFI_VARIABLE_BOOT2, "efi-variable-boot2" }, + { EV_EFI_HCRTM_EVENT, "efi-hcrtm-event" }, + { EV_EFI_VARIABLE_AUTHORITY, "efi-variable-authority" }, + { EV_EFI_SPDM_FIRMWARE_BLOB, "efi-spdm-firmware-blob" }, + { EV_EFI_SPDM_FIRMWARE_CONFIG, "efi-spdm-firmware-config" }, +}; + +static int tpm2_log_event_type_info_cmp(const tpm2_log_event_type_info *a, const tpm2_log_event_type_info *b) { + return CMP(ASSERT_PTR(a)->event_type, ASSERT_PTR(b)->event_type); +} + +const char *tpm2_log_event_type_to_string(uint32_t type) { + +
tpm2_log_event_type_info *found, key = { + .event_type = type, + }; + + found = typesafe_bsearch(&key, tpm2_log_event_type_table, ELEMENTSOF(tpm2_log_event_type_table), tpm2_log_event_type_info_cmp); + + return found ? found->name : NULL; +} diff --git a/src/shared/tpm2-event-log.h b/src/shared/tpm2-event-log.h new file mode 100644 index 0000000..916b805 --- /dev/null +++ b/src/shared/tpm2-event-log.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "tpm2-util.h" + +/* Definitions as per "TCG PC Client Specific Platform Firmware Profile Specification" + * (https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/), + * section 10.4.1 "Event Types" (at least in version 1.05 Revision 23 of the spec) */ +#ifndef EV_PREBOOT_CERT +#define EV_PREBOOT_CERT UINT32_C(0x00000000) +#define EV_POST_CODE UINT32_C(0x00000001) +#define EV_NO_ACTION UINT32_C(0x00000003) +#define EV_SEPARATOR UINT32_C(0x00000004) +#define EV_ACTION UINT32_C(0x00000005) +#define EV_EVENT_TAG UINT32_C(0x00000006) +#define EV_S_CRTM_CONTENTS UINT32_C(0x00000007) +#define EV_S_CRTM_VERSION UINT32_C(0x00000008) +#define EV_CPU_MICROCODE UINT32_C(0x00000009) +#define EV_PLATFORM_CONFIG_FLAGS UINT32_C(0x0000000a) +#define EV_TABLE_OF_DEVICES UINT32_C(0x0000000b) +#define EV_COMPACT_HASH UINT32_C(0x0000000c) +#define EV_IPL UINT32_C(0x0000000d) +#define EV_IPL_PARTITION_DATA UINT32_C(0x0000000e) +#define EV_NONHOST_CODE UINT32_C(0x0000000f) +#define EV_NONHOST_CONFIG UINT32_C(0x00000010) +#define EV_NONHOST_INFO UINT32_C(0x00000011) +#define EV_OMIT_BOOT_DEVICE_EVENTS UINT32_C(0x00000012) +#define EV_EFI_EVENT_BASE UINT32_C(0x80000000) +#define EV_EFI_VARIABLE_DRIVER_CONFIG UINT32_C(0x80000001) +#define EV_EFI_VARIABLE_BOOT UINT32_C(0x80000002) +#define EV_EFI_BOOT_SERVICES_APPLICATION UINT32_C(0x80000003) +#define EV_EFI_BOOT_SERVICES_DRIVER UINT32_C(0x80000004) +#define EV_EFI_RUNTIME_SERVICES_DRIVER 
UINT32_C(0x80000005) +#define EV_EFI_GPT_EVENT UINT32_C(0x80000006) +#define EV_EFI_ACTION UINT32_C(0x80000007) +#define EV_EFI_PLATFORM_FIRMWARE_BLOB UINT32_C(0x80000008) +#define EV_EFI_HANDOFF_TABLES UINT32_C(0x80000009) +#define EV_EFI_PLATFORM_FIRMWARE_BLOB2 UINT32_C(0x8000000A) +#define EV_EFI_HANDOFF_TABLES2 UINT32_C(0x8000000B) +#define EV_EFI_VARIABLE_BOOT2 UINT32_C(0x8000000C) +#define EV_EFI_HCRTM_EVENT UINT32_C(0x80000010) +#define EV_EFI_VARIABLE_AUTHORITY UINT32_C(0x800000E0) +#define EV_EFI_SPDM_FIRMWARE_BLOB UINT32_C(0x800000E1) +#define EV_EFI_SPDM_FIRMWARE_CONFIG UINT32_C(0x800000E2) +#endif + +/* Defined in drivers/firmware/efi/libstub/efistub.h in the Linux kernel sources */ +#ifndef INITRD_EVENT_TAG_ID +#define INITRD_EVENT_TAG_ID UINT32_C(0x8F3B22EC) +#endif + +#ifndef LOAD_OPTIONS_EVENT_TAG_ID +#define LOAD_OPTIONS_EVENT_TAG_ID UINT32_C(0x8F3B22ED) +#endif + +const char *tpm2_log_event_type_to_string(uint32_t type) _const_; + +#if HAVE_TPM2 + +/* UEFI event log data structures */ +typedef struct _packed_ TCG_PCClientPCREvent { + uint32_t pcrIndex; + uint32_t eventType; + uint8_t digest[20]; + uint32_t eventDataSize; + uint32_t event[]; +} TCG_PCClientPCREvent; + +typedef struct _packed_ packed_TPMT_HA { + uint16_t hashAlg; + TPMU_HA digest; +} packed_TPMT_HA; + +typedef struct _packed_ packed_TPML_DIGEST_VALUES { + uint32_t count; + packed_TPMT_HA digests[]; +} packed_TPML_DIGEST_VALUES; + +typedef struct _packed_ TCG_PCR_EVENT2 { + uint32_t pcrIndex; + uint32_t eventType; + packed_TPML_DIGEST_VALUES digests; + /* … */ +} TCG_PCR_EVENT2; + +typedef struct _packed_ TCG_EfiSpecIdEventAlgorithmSize { + uint16_t algorithmId; + uint16_t digestSize; +} TCG_EfiSpecIdEventAlgorithmSize; + +typedef struct _packed_ tdTCG_EfiSpecIdEvent { + uint8_t signature[16]; + uint32_t platformClass; + uint8_t specVersionMinor; + uint8_t specVersionMajor; + uint8_t specErrata; + uint8_t uintnSize; + uint32_t numberOfAlgorithms; + TCG_EfiSpecIdEventAlgorithmSize 
digestSizes[]; + /* … */ +} TCG_EfiSpecIDEvent; + +typedef struct _packed_ UEFI_VARIABLE_DATA { + uint8_t variableName[16]; + uint64_t unicodeNameLength; + uint64_t variableDataLength; + char16_t unicodeName[]; + /* … */ +} UEFI_VARIABLE_DATA; + +typedef struct _packed_ TCG_PCClientTaggedEvent{ + uint32_t taggedEventID; + uint32_t taggedEventDataSize; + uint8_t taggedEventData[]; +} TCG_PCClientTaggedEvent; + +typedef struct _packed_ packed_EFI_DEVICE_PATH { + uint8_t type; + uint8_t subType; + uint16_t length; + uint8_t path[]; +} packed_EFI_DEVICE_PATH; + +typedef struct _packed_ UEFI_IMAGE_LOAD_EVENT { + uint64_t imageLocationInMemory; + uint64_t imageLengthInMemory; + uint64_t imageLinkTimeAddress; + uint64_t lengthOfDevicePath; + packed_EFI_DEVICE_PATH devicePath[]; +} UEFI_IMAGE_LOAD_EVENT; + +typedef struct _packed_ UEFI_PLATFORM_FIRMWARE_BLOB { + uint64_t blobBase; + uint64_t blobLength; +} UEFI_PLATFORM_FIRMWARE_BLOB; + +#endif diff --git a/src/shared/tpm2-util.c b/src/shared/tpm2-util.c new file mode 100644 index 0000000..30b4f57 --- /dev/null +++ b/src/shared/tpm2-util.c @@ -0,0 +1,7664 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "constants.h" +#include "cryptsetup-util.h" +#include "dirent-util.h" +#include "dlfcn-util.h" +#include "efi-api.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "hmac.h" +#include "initrd-util.h" +#include "io-util.h" +#include "lock-util.h" +#include "log.h" +#include "logarithm.h" +#include "memory-util.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "random-util.h" +#include "sha256.h" +#include "sort-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "sync-util.h" +#include "time-util.h" +#include "tpm2-util.h" +#include "virt.h" + +#if HAVE_TPM2 +static void *libtss2_esys_dl = NULL; +static void 
*libtss2_rc_dl = NULL; +static void *libtss2_mu_dl = NULL; + +static TSS2_RC (*sym_Esys_Create)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE_CREATE *inSensitive, const TPM2B_PUBLIC *inPublic, const TPM2B_DATA *outsideInfo, const TPML_PCR_SELECTION *creationPCR, TPM2B_PRIVATE **outPrivate, TPM2B_PUBLIC **outPublic, TPM2B_CREATION_DATA **creationData, TPM2B_DIGEST **creationHash, TPMT_TK_CREATION **creationTicket) = NULL; +static TSS2_RC (*sym_Esys_CreateLoaded)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE_CREATE *inSensitive, const TPM2B_TEMPLATE *inPublic, ESYS_TR *objectHandle, TPM2B_PRIVATE **outPrivate, TPM2B_PUBLIC **outPublic) = NULL; +static TSS2_RC (*sym_Esys_CreatePrimary)(ESYS_CONTEXT *esysContext, ESYS_TR primaryHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE_CREATE *inSensitive, const TPM2B_PUBLIC *inPublic, const TPM2B_DATA *outsideInfo, const TPML_PCR_SELECTION *creationPCR, ESYS_TR *objectHandle, TPM2B_PUBLIC **outPublic, TPM2B_CREATION_DATA **creationData, TPM2B_DIGEST **creationHash, TPMT_TK_CREATION **creationTicket) = NULL; +static TSS2_RC (*sym_Esys_EvictControl)(ESYS_CONTEXT *esysContext, ESYS_TR auth, ESYS_TR objectHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPMI_DH_PERSISTENT persistentHandle, ESYS_TR *newObjectHandle) = NULL; +static void (*sym_Esys_Finalize)(ESYS_CONTEXT **context) = NULL; +static TSS2_RC (*sym_Esys_FlushContext)(ESYS_CONTEXT *esysContext, ESYS_TR flushHandle) = NULL; +static void (*sym_Esys_Free)(void *ptr) = NULL; +static TSS2_RC (*sym_Esys_GetCapability)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2_CAP capability, UINT32 property, UINT32 propertyCount, TPMI_YES_NO *moreData, TPMS_CAPABILITY_DATA **capabilityData) = NULL; +static TSS2_RC (*sym_Esys_GetRandom)(ESYS_CONTEXT 
*esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, UINT16 bytesRequested, TPM2B_DIGEST **randomBytes) = NULL; +static TSS2_RC (*sym_Esys_Import)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DATA *encryptionKey, const TPM2B_PUBLIC *objectPublic, const TPM2B_PRIVATE *duplicate, const TPM2B_ENCRYPTED_SECRET *inSymSeed, const TPMT_SYM_DEF_OBJECT *symmetricAlg, TPM2B_PRIVATE **outPrivate) = NULL; +static TSS2_RC (*sym_Esys_Initialize)(ESYS_CONTEXT **esys_context, TSS2_TCTI_CONTEXT *tcti, TSS2_ABI_VERSION *abiVersion) = NULL; +static TSS2_RC (*sym_Esys_Load)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_PRIVATE *inPrivate, const TPM2B_PUBLIC *inPublic, ESYS_TR *objectHandle) = NULL; +static TSS2_RC (*sym_Esys_LoadExternal)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE *inPrivate, const TPM2B_PUBLIC *inPublic, ESYS_TR hierarchy, ESYS_TR *objectHandle) = NULL; +static TSS2_RC (*sym_Esys_NV_DefineSpace)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_AUTH *auth, const TPM2B_NV_PUBLIC *publicInfo, ESYS_TR *nvHandle); +static TSS2_RC (*sym_Esys_NV_UndefineSpace)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR nvIndex, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3); +static TSS2_RC (*sym_Esys_NV_Write)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR nvIndex, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_MAX_NV_BUFFER *data, UINT16 offset); +static TSS2_RC (*sym_Esys_PCR_Extend)(ESYS_CONTEXT *esysContext, ESYS_TR pcrHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPML_DIGEST_VALUES *digests) = NULL; +static TSS2_RC (*sym_Esys_PCR_Read)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1,ESYS_TR shandle2, ESYS_TR shandle3, const TPML_PCR_SELECTION 
*pcrSelectionIn, UINT32 *pcrUpdateCounter, TPML_PCR_SELECTION **pcrSelectionOut, TPML_DIGEST **pcrValues) = NULL; +static TSS2_RC (*sym_Esys_PolicyAuthValue)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3) = NULL; +static TSS2_RC (*sym_Esys_PolicyAuthorize)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DIGEST *approvedPolicy, const TPM2B_NONCE *policyRef, const TPM2B_NAME *keySign, const TPMT_TK_VERIFIED *checkTicket) = NULL; +static TSS2_RC (*sym_Esys_PolicyAuthorizeNV)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR nvIndex, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3); +static TSS2_RC (*sym_Esys_PolicyGetDigest)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2B_DIGEST **policyDigest) = NULL; +static TSS2_RC (*sym_Esys_PolicyOR)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPML_DIGEST *pHashList) = NULL; +static TSS2_RC (*sym_Esys_PolicyPCR)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DIGEST *pcrDigest, const TPML_PCR_SELECTION *pcrs) = NULL; +static TSS2_RC (*sym_Esys_ReadPublic)(ESYS_CONTEXT *esysContext, ESYS_TR objectHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2B_PUBLIC **outPublic, TPM2B_NAME **name, TPM2B_NAME **qualifiedName) = NULL; +static TSS2_RC (*sym_Esys_StartAuthSession)(ESYS_CONTEXT *esysContext, ESYS_TR tpmKey, ESYS_TR bind, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_NONCE *nonceCaller, TPM2_SE sessionType, const TPMT_SYM_DEF *symmetric, TPMI_ALG_HASH authHash, ESYS_TR *sessionHandle) = NULL; +static TSS2_RC (*sym_Esys_Startup)(ESYS_CONTEXT *esysContext, TPM2_SU startupType) = NULL; +static TSS2_RC (*sym_Esys_TestParms)(ESYS_CONTEXT *esysContext, 
ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPMT_PUBLIC_PARMS *parameters) = NULL; +static TSS2_RC (*sym_Esys_TR_Close)(ESYS_CONTEXT *esys_context, ESYS_TR *rsrc_handle) = NULL; +static TSS2_RC (*sym_Esys_TR_Deserialize)(ESYS_CONTEXT *esys_context, uint8_t const *buffer, size_t buffer_size, ESYS_TR *esys_handle) = NULL; +static TSS2_RC (*sym_Esys_TR_FromTPMPublic)(ESYS_CONTEXT *esysContext, TPM2_HANDLE tpm_handle, ESYS_TR optionalSession1, ESYS_TR optionalSession2, ESYS_TR optionalSession3, ESYS_TR *object) = NULL; +static TSS2_RC (*sym_Esys_TR_GetName)(ESYS_CONTEXT *esysContext, ESYS_TR handle, TPM2B_NAME **name) = NULL; +static TSS2_RC (*sym_Esys_TR_GetTpmHandle)(ESYS_CONTEXT *esys_context, ESYS_TR esys_handle, TPM2_HANDLE *tpm_handle) = NULL; +static TSS2_RC (*sym_Esys_TR_Serialize)(ESYS_CONTEXT *esys_context, ESYS_TR object, uint8_t **buffer, size_t *buffer_size) = NULL; +static TSS2_RC (*sym_Esys_TR_SetAuth)(ESYS_CONTEXT *esysContext, ESYS_TR handle, TPM2B_AUTH const *authValue) = NULL; +static TSS2_RC (*sym_Esys_TRSess_GetAttributes)(ESYS_CONTEXT *esysContext, ESYS_TR session, TPMA_SESSION *flags) = NULL; +static TSS2_RC (*sym_Esys_TRSess_SetAttributes)(ESYS_CONTEXT *esysContext, ESYS_TR session, TPMA_SESSION flags, TPMA_SESSION mask) = NULL; +static TSS2_RC (*sym_Esys_Unseal)(ESYS_CONTEXT *esysContext, ESYS_TR itemHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2B_SENSITIVE_DATA **outData) = NULL; +static TSS2_RC (*sym_Esys_VerifySignature)(ESYS_CONTEXT *esysContext, ESYS_TR keyHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DIGEST *digest, const TPMT_SIGNATURE *signature, TPMT_TK_VERIFIED **validation) = NULL; + +static TSS2_RC (*sym_Tss2_MU_TPM2_CC_Marshal)(TPM2_CC src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2_HANDLE_Marshal)(TPM2_HANDLE src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC 
(*sym_Tss2_MU_TPM2B_DIGEST_Marshal)(TPM2B_DIGEST const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Marshal)(TPM2B_ENCRYPTED_SECRET const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_ENCRYPTED_SECRET *dest) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_NAME_Marshal)(TPM2B_NAME const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_PRIVATE_Marshal)(TPM2B_PRIVATE const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_PRIVATE_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_PRIVATE *dest) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_PUBLIC_Marshal)(TPM2B_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_PUBLIC *dest) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_SENSITIVE_Marshal)(TPM2B_SENSITIVE const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPML_PCR_SELECTION_Marshal)(TPML_PCR_SELECTION const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPMS_NV_PUBLIC_Marshal)(TPMS_NV_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_NV_PUBLIC_Marshal)(TPM2B_NV_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPM2B_NV_PUBLIC_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_NV_PUBLIC *dest) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPMS_ECC_POINT_Marshal)(TPMS_ECC_POINT const *src, uint8_t buffer[], size_t buffer_size, 
size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPMT_HA_Marshal)(TPMT_HA const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_TPMT_PUBLIC_Marshal)(TPMT_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; +static TSS2_RC (*sym_Tss2_MU_UINT32_Marshal)(UINT32 src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL; + +static const char* (*sym_Tss2_RC_Decode)(TSS2_RC rc) = NULL; + +int dlopen_tpm2(void) { + int r; + + r = dlopen_many_sym_or_warn( + &libtss2_esys_dl, "libtss2-esys.so.0", LOG_DEBUG, + DLSYM_ARG(Esys_Create), + DLSYM_ARG(Esys_CreateLoaded), + DLSYM_ARG(Esys_CreatePrimary), + DLSYM_ARG(Esys_EvictControl), + DLSYM_ARG(Esys_Finalize), + DLSYM_ARG(Esys_FlushContext), + DLSYM_ARG(Esys_Free), + DLSYM_ARG(Esys_GetCapability), + DLSYM_ARG(Esys_GetRandom), + DLSYM_ARG(Esys_Import), + DLSYM_ARG(Esys_Initialize), + DLSYM_ARG(Esys_Load), + DLSYM_ARG(Esys_LoadExternal), + DLSYM_ARG(Esys_NV_DefineSpace), + DLSYM_ARG(Esys_NV_UndefineSpace), + DLSYM_ARG(Esys_NV_Write), + DLSYM_ARG(Esys_PCR_Extend), + DLSYM_ARG(Esys_PCR_Read), + DLSYM_ARG(Esys_PolicyAuthValue), + DLSYM_ARG(Esys_PolicyAuthorize), + DLSYM_ARG(Esys_PolicyAuthorizeNV), + DLSYM_ARG(Esys_PolicyGetDigest), + DLSYM_ARG(Esys_PolicyOR), + DLSYM_ARG(Esys_PolicyPCR), + DLSYM_ARG(Esys_ReadPublic), + DLSYM_ARG(Esys_StartAuthSession), + DLSYM_ARG(Esys_Startup), + DLSYM_ARG(Esys_TestParms), + DLSYM_ARG(Esys_TR_Close), + DLSYM_ARG(Esys_TR_Deserialize), + DLSYM_ARG(Esys_TR_FromTPMPublic), + DLSYM_ARG(Esys_TR_GetName), + DLSYM_ARG(Esys_TR_Serialize), + DLSYM_ARG(Esys_TR_SetAuth), + DLSYM_ARG(Esys_TRSess_GetAttributes), + DLSYM_ARG(Esys_TRSess_SetAttributes), + DLSYM_ARG(Esys_Unseal), + DLSYM_ARG(Esys_VerifySignature)); + if (r < 0) + return r; + + /* Esys_TR_GetTpmHandle was added to tpm2-tss in version 2.4.0. Once we can set a minimum tpm2-tss + * version of 2.4.0 this sym can be moved up to the normal list above. 
*/ + r = dlsym_many_or_warn(libtss2_esys_dl, LOG_DEBUG, DLSYM_ARG_FORCE(Esys_TR_GetTpmHandle)); + if (r < 0) + log_debug("libtss2-esys too old, does not include Esys_TR_GetTpmHandle."); + + r = dlopen_many_sym_or_warn( + &libtss2_rc_dl, "libtss2-rc.so.0", LOG_DEBUG, + DLSYM_ARG(Tss2_RC_Decode)); + if (r < 0) + return r; + + return dlopen_many_sym_or_warn( + &libtss2_mu_dl, "libtss2-mu.so.0", LOG_DEBUG, + DLSYM_ARG(Tss2_MU_TPM2_CC_Marshal), + DLSYM_ARG(Tss2_MU_TPM2_HANDLE_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_DIGEST_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_ENCRYPTED_SECRET_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_ENCRYPTED_SECRET_Unmarshal), + DLSYM_ARG(Tss2_MU_TPM2B_NAME_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_PRIVATE_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_PRIVATE_Unmarshal), + DLSYM_ARG(Tss2_MU_TPM2B_PUBLIC_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_PUBLIC_Unmarshal), + DLSYM_ARG(Tss2_MU_TPM2B_SENSITIVE_Marshal), + DLSYM_ARG(Tss2_MU_TPML_PCR_SELECTION_Marshal), + DLSYM_ARG(Tss2_MU_TPMS_NV_PUBLIC_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_NV_PUBLIC_Marshal), + DLSYM_ARG(Tss2_MU_TPM2B_NV_PUBLIC_Unmarshal), + DLSYM_ARG(Tss2_MU_TPMS_ECC_POINT_Marshal), + DLSYM_ARG(Tss2_MU_TPMT_HA_Marshal), + DLSYM_ARG(Tss2_MU_TPMT_PUBLIC_Marshal), + DLSYM_ARG(Tss2_MU_UINT32_Marshal)); +} + +void Esys_Freep(void *p) { + if (*(void**) p) + sym_Esys_Free(*(void**) p); +} + +/* Get a specific TPM capability (or capabilities). + * + * Returns 0 if there are no more capability properties of the requested type, or 1 if there are more, or < 0 + * on any error. Both 0 and 1 indicate this completed successfully, but do not indicate how many capability + * properties were provided in 'ret_capability_data'. To find the number of provided properties, check the + * specific type's 'count' field (e.g. for TPM2_CAP_ALGS, check ret_capability_data->algorithms.count). + * + * This calls TPM2_GetCapability() and does not alter the provided data, so it is important to understand how + * that TPM function works. 
It is recommended to check the TCG TPM specification Part 3 ("Commands") section + * on TPM2_GetCapability() for full details, but a short summary is: if this returns 0, all available + * properties have been provided in ret_capability_data, or no properties were available. If this returns 1, + * there are between 1 and "count" properties provided in ret_capability_data, and there are more available. + * Note that this may provide less than "count" properties even if the TPM has more available. Also, each + * capability category may have more specific requirements than described here; see the spec for exact + * details. */ +static int tpm2_get_capability( + Tpm2Context *c, + TPM2_CAP capability, + uint32_t property, + uint32_t count, + TPMU_CAPABILITIES *ret_capability_data) { + + _cleanup_(Esys_Freep) TPMS_CAPABILITY_DATA *capabilities = NULL; + TPMI_YES_NO more; + TSS2_RC rc; + + assert(c); + + log_debug("Getting TPM2 capability 0x%04" PRIx32 " property 0x%04" PRIx32 " count %" PRIu32 ".", + capability, property, count); + + rc = sym_Esys_GetCapability( + c->esys_context, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + capability, + property, + count, + &more, + &capabilities); + if (rc == TPM2_RC_VALUE) + return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), + "Requested TPM2 capability 0x%04" PRIx32 " property 0x%04" PRIx32 " apparently doesn't exist: %s", + capability, property, sym_Tss2_RC_Decode(rc)); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to get TPM2 capability 0x%04" PRIx32 " property 0x%04" PRIx32 ": %s", + capability, property, sym_Tss2_RC_Decode(rc)); + if (capabilities->capability != capability) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "TPM provided wrong capability: 0x%04" PRIx32 " instead of 0x%04" PRIx32 ".", + capabilities->capability, capability); + + if (ret_capability_data) + *ret_capability_data = capabilities->data; + + return more == TPM2_YES; +} + +#define 
TPMA_CC_TO_TPM2_CC(cca) (((cca) & TPMA_CC_COMMANDINDEX_MASK) >> TPMA_CC_COMMANDINDEX_SHIFT) + +static int tpm2_cache_capabilities(Tpm2Context *c) { + TPMU_CAPABILITIES capability; + int r; + + assert(c); + + /* Cache the algorithms. The spec indicates supported algorithms can only be modified during runtime + * by the SetAlgorithmSet() command. Unfortunately, the spec doesn't require a TPM reinitialization + * after changing the algorithm set (unless the PCR algorithms are changed). However, the spec also + * indicates the TPM behavior after SetAlgorithmSet() is "vendor-dependent", giving the example of + * flushing sessions and objects, erasing policies, etc. So, if the algorithm set is programmatically + * changed while we are performing some operation, it's reasonable to assume it will break us even if + * we don't cache the algorithms, thus they should be "safe" to cache. */ + TPM2_ALG_ID current_alg = TPM2_ALG_FIRST; + for (;;) { + r = tpm2_get_capability( + c, + TPM2_CAP_ALGS, + (uint32_t) current_alg, /* The spec states to cast TPM2_ALG_ID to uint32_t. */ + TPM2_MAX_CAP_ALGS, + &capability); + if (r < 0) + return r; + + TPML_ALG_PROPERTY algorithms = capability.algorithms; + + /* We should never get 0; the TPM must support some algorithms, and it must not set 'more' if + * there are no more. */ + assert(algorithms.count > 0); + + if (!GREEDY_REALLOC_APPEND( + c->capability_algorithms, + c->n_capability_algorithms, + algorithms.algProperties, + algorithms.count)) + return log_oom_debug(); + + if (r == 0) + break; + + /* Set current_alg to alg id after last alg id the TPM provided */ + current_alg = algorithms.algProperties[algorithms.count - 1].alg + 1; + } + + /* Cache the command capabilities. The spec isn't actually clear if commands can be added/removed + * while running, but that would be crazy, so let's hope it is not possible. 
*/ + TPM2_CC current_cc = TPM2_CC_FIRST; + for (;;) { + r = tpm2_get_capability( + c, + TPM2_CAP_COMMANDS, + current_cc, + TPM2_MAX_CAP_CC, + &capability); + if (r < 0) + return r; + + TPML_CCA commands = capability.command; + + /* We should never get 0; the TPM must support some commands, and it must not set 'more' if + * there are no more. */ + assert(commands.count > 0); + + if (!GREEDY_REALLOC_APPEND( + c->capability_commands, + c->n_capability_commands, + commands.commandAttributes, + commands.count)) + return log_oom_debug(); + + if (r == 0) + break; + + /* Set current_cc to index after last cc the TPM provided */ + current_cc = TPMA_CC_TO_TPM2_CC(commands.commandAttributes[commands.count - 1]) + 1; + } + + /* Cache the ECC curves. The spec isn't actually clear if ECC curves can be added/removed + * while running, but that would be crazy, so let's hope it is not possible. */ + TPM2_ECC_CURVE current_ecc_curve = TPM2_ECC_NONE; + for (;;) { + r = tpm2_get_capability( + c, + TPM2_CAP_ECC_CURVES, + current_ecc_curve, + TPM2_MAX_ECC_CURVES, + &capability); + if (r == -ENXIO) /* If the TPM doesn't support ECC, it might return TPM2_RC_VALUE rather than capability.eccCurves == 0 */ + break; + if (r < 0) + return r; + + TPML_ECC_CURVE ecc_curves = capability.eccCurves; + + /* ECC support isn't required */ + if (ecc_curves.count == 0) + break; + + if (!GREEDY_REALLOC_APPEND( + c->capability_ecc_curves, + c->n_capability_ecc_curves, + ecc_curves.eccCurves, + ecc_curves.count)) + return log_oom_debug(); + + if (r == 0) + break; + + /* Set current_ecc_curve to index after last ecc curve the TPM provided */ + current_ecc_curve = ecc_curves.eccCurves[ecc_curves.count - 1] + 1; + } + + /* Cache the PCR capabilities, which are safe to cache, as the only way they can change is + * TPM2_PCR_Allocate(), which changes the allocation after the next _TPM_Init(). 
If the TPM is + * reinitialized while we are using it, all our context and sessions will be invalid, so we can + * safely assume the TPM PCR allocation will not change while we are using it. */ + r = tpm2_get_capability( + c, + TPM2_CAP_PCRS, + /* property= */ 0, + /* count= */ 1, + &capability); + if (r < 0) + return r; + if (r == 1) + /* This should never happen. Part 3 ("Commands") of the TCG TPM2 spec in the section for + * TPM2_GetCapability states: "TPM_CAP_PCRS – Returns the current allocation of PCR in a + * TPML_PCR_SELECTION. The property parameter shall be zero. The TPM will always respond to + * this command with the full PCR allocation and moreData will be NO." */ + log_debug("TPM bug: reported multiple PCR sets; using only first set."); + c->capability_pcrs = capability.assignedPCR; + + return 0; +} + +/* Get the TPMA_ALGORITHM for a TPM2_ALG_ID. Returns true if the TPM supports the algorithm and the + * TPMA_ALGORITHM is provided, otherwise false. */ +static bool tpm2_get_capability_alg(Tpm2Context *c, TPM2_ALG_ID alg, TPMA_ALGORITHM *ret) { + assert(c); + + FOREACH_ARRAY(alg_prop, c->capability_algorithms, c->n_capability_algorithms) + if (alg_prop->alg == alg) { + if (ret) + *ret = alg_prop->algProperties; + return true; + } + + log_debug("TPM does not support alg 0x%02" PRIx16 ".", alg); + if (ret) + *ret = 0; + + return false; +} + +bool tpm2_supports_alg(Tpm2Context *c, TPM2_ALG_ID alg) { + return tpm2_get_capability_alg(c, alg, NULL); +} + +/* Get the TPMA_CC for a TPM2_CC. Returns true if the TPM supports the command and the TPMA_CC is provided, + * otherwise false. 
*/ +static bool tpm2_get_capability_command(Tpm2Context *c, TPM2_CC command, TPMA_CC *ret) { + assert(c); + + FOREACH_ARRAY(cca, c->capability_commands, c->n_capability_commands) + if (TPMA_CC_TO_TPM2_CC(*cca) == command) { + if (ret) + *ret = *cca; + return true; + } + + log_debug("TPM does not support command 0x%04" PRIx32 ".", command); + if (ret) + *ret = 0; + + return false; +} + +bool tpm2_supports_command(Tpm2Context *c, TPM2_CC command) { + return tpm2_get_capability_command(c, command, NULL); +} + +/* Returns true if the TPM supports the ECC curve, otherwise false. */ +bool tpm2_supports_ecc_curve(Tpm2Context *c, TPM2_ECC_CURVE ecc_curve) { + assert(c); + + FOREACH_ARRAY(curve, c->capability_ecc_curves, c->n_capability_ecc_curves) + if (*curve == ecc_curve) + return true; + + log_debug("TPM does not support ECC curve 0x%" PRIx16 ".", ecc_curve); + return false; +} + +/* Query the TPM for populated handles. + * + * This provides an array of handle indexes populated in the TPM, starting at the requested handle. The array will + * contain only populated handle addresses (which might not include the requested handle). The number of + * handles will be no more than the 'max' number requested. This will not search past the end of the handle + * range (i.e. handle & 0xff000000). + * + * Returns 0 if all populated handles in the range (starting at the requested handle) were provided (or no + * handles were in the range), or 1 if there are more populated handles in the range, or < 0 on any error. 
*/ +static int tpm2_get_capability_handles( + Tpm2Context *c, + TPM2_HANDLE start, + size_t max, + TPM2_HANDLE **ret_handles, + size_t *ret_n_handles) { + + _cleanup_free_ TPM2_HANDLE *handles = NULL; + size_t n_handles = 0; + TPM2_HANDLE current = start; + int r = 0; + + assert(c); + assert(ret_handles); + assert(ret_n_handles); + + max = MIN(max, UINT32_MAX); + + while (max > 0) { + TPMU_CAPABILITIES capability; + r = tpm2_get_capability(c, TPM2_CAP_HANDLES, current, (uint32_t) max, &capability); + if (r < 0) + return r; + + TPML_HANDLE handle_list = capability.handles; + if (handle_list.count == 0) + break; + + assert(handle_list.count <= max); + + if (n_handles > SIZE_MAX - handle_list.count) + return log_oom_debug(); + + if (!GREEDY_REALLOC_APPEND(handles, n_handles, handle_list.handle, handle_list.count)) + return log_oom_debug(); + + max -= handle_list.count; + + /* Update current to the handle index after the last handle in the list. */ + current = handles[n_handles - 1] + 1; + + if (r == 0) + /* No more handles in this range. */ + break; + } + + *ret_handles = TAKE_PTR(handles); + *ret_n_handles = n_handles; + + return r; +} + +#define TPM2_HANDLE_RANGE(h) ((TPM2_HANDLE)((h) & TPM2_HR_RANGE_MASK)) +#define TPM2_HANDLE_TYPE(h) ((TPM2_HT)(TPM2_HANDLE_RANGE(h) >> TPM2_HR_SHIFT)) + +/* Returns 1 if the handle is populated in the TPM, 0 if not, and < 0 on any error. */ +static int tpm2_get_capability_handle(Tpm2Context *c, TPM2_HANDLE handle) { + _cleanup_free_ TPM2_HANDLE *handles = NULL; + size_t n_handles = 0; + int r; + + r = tpm2_get_capability_handles(c, handle, 1, &handles, &n_handles); + if (r < 0) + return r; + + return n_handles == 0 ? false : handles[0] == handle; +} + +/* Returns 1 if the TPM supports the parms, or 0 if the TPM does not support the parms. 
*/ +bool tpm2_test_parms(Tpm2Context *c, TPMI_ALG_PUBLIC alg, const TPMU_PUBLIC_PARMS *parms) { + TSS2_RC rc; + + assert(c); + assert(parms); + + TPMT_PUBLIC_PARMS parameters = { + .type = alg, + .parameters = *parms, + }; + + rc = sym_Esys_TestParms(c->esys_context, ESYS_TR_NONE, ESYS_TR_NONE, ESYS_TR_NONE, ¶meters); + if (rc != TSS2_RC_SUCCESS) + /* The spec says if the parms are not supported the TPM returns "...the appropriate + * unmarshaling error if a parameter is not valid". Since the spec (currently) defines 15 + * unmarshaling errors, instead of checking for them all here, let's just assume any error + * indicates unsupported parms, and log the specific error text. */ + log_debug("TPM does not support tested parms: %s", sym_Tss2_RC_Decode(rc)); + + return rc == TSS2_RC_SUCCESS; +} + +static bool tpm2_supports_tpmt_public(Tpm2Context *c, const TPMT_PUBLIC *public) { + assert(c); + assert(public); + + return tpm2_test_parms(c, public->type, &public->parameters); +} + +static bool tpm2_supports_tpmt_sym_def_object(Tpm2Context *c, const TPMT_SYM_DEF_OBJECT *parameters) { + assert(c); + assert(parameters); + + TPMU_PUBLIC_PARMS parms = { + .symDetail.sym = *parameters, + }; + + return tpm2_test_parms(c, TPM2_ALG_SYMCIPHER, &parms); +} + +static bool tpm2_supports_tpmt_sym_def(Tpm2Context *c, const TPMT_SYM_DEF *parameters) { + assert(c); + assert(parameters); + + /* Unfortunately, TPMT_SYM_DEF and TPMT_SYM_DEF_OBEJECT are separately defined, even though they are + * functionally identical. 
*/ + TPMT_SYM_DEF_OBJECT object = { + .algorithm = parameters->algorithm, + .keyBits = parameters->keyBits, + .mode = parameters->mode, + }; + + return tpm2_supports_tpmt_sym_def_object(c, &object); +} + +static Tpm2Context *tpm2_context_free(Tpm2Context *c) { + if (!c) + return NULL; + + if (c->esys_context) + sym_Esys_Finalize(&c->esys_context); + + c->tcti_context = mfree(c->tcti_context); + c->tcti_dl = safe_dlclose(c->tcti_dl); + + c->capability_algorithms = mfree(c->capability_algorithms); + c->capability_commands = mfree(c->capability_commands); + c->capability_ecc_curves = mfree(c->capability_ecc_curves); + + return mfree(c); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(Tpm2Context, tpm2_context, tpm2_context_free); + +static const TPMT_SYM_DEF SESSION_TEMPLATE_SYM_AES_128_CFB = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, /* The spec requires sessions to use CFB. */ +}; + +int tpm2_context_new(const char *device, Tpm2Context **ret_context) { + _cleanup_(tpm2_context_unrefp) Tpm2Context *context = NULL; + TSS2_RC rc; + int r; + + assert(ret_context); + + context = new(Tpm2Context, 1); + if (!context) + return log_oom_debug(); + + *context = (Tpm2Context) { + .n_ref = 1, + }; + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + if (!device) { + device = secure_getenv("SYSTEMD_TPM2_DEVICE"); + if (device) + /* Setting the env var to an empty string forces tpm2-tss' own device picking + * logic to be used. */ + device = empty_to_null(device); + else + /* If nothing was specified explicitly, we'll use a hardcoded default: the "device" tcti + * driver and the "/dev/tpmrm0" device. We do this since on some distributions the tpm2-abrmd + * might be used and we really don't want that, since it is a system service and that creates + * various ordering issues/deadlocks during early boot. 
*/ + device = "device:/dev/tpmrm0"; + } + + if (device) { + const char *param, *driver, *fn; + const TSS2_TCTI_INFO* info; + TSS2_TCTI_INFO_FUNC func; + size_t sz = 0; + + param = strchr(device, ':'); + if (param) { + /* Syntax #1: Pair of driver string and arbitrary parameter */ + driver = strndupa_safe(device, param - device); + if (isempty(driver)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 driver name is empty, refusing."); + + param++; + } else if (path_is_absolute(device) && path_is_valid(device)) { + /* Syntax #2: TPM device node */ + driver = "device"; + param = device; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid TPM2 driver string, refusing."); + + log_debug("Using TPM2 TCTI driver '%s' with device '%s'.", driver, param); + + fn = strjoina("libtss2-tcti-", driver, ".so.0"); + + /* Better safe than sorry, let's refuse strings that cannot possibly be valid driver early, before going to disk. */ + if (!filename_is_valid(fn)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 driver name '%s' not valid, refusing.", driver); + + context->tcti_dl = dlopen(fn, RTLD_NOW); + if (!context->tcti_dl) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to load %s: %s", fn, dlerror()); + + func = dlsym(context->tcti_dl, TSS2_TCTI_INFO_SYMBOL); + if (!func) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to find TCTI info symbol " TSS2_TCTI_INFO_SYMBOL ": %s", + dlerror()); + + info = func(); + if (!info) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Unable to get TCTI info data."); + + log_debug("Loaded TCTI module '%s' (%s) [Version %" PRIu32 "]", info->name, info->description, info->version); + + rc = info->init(NULL, &sz, NULL); + if (rc != TPM2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to initialize TCTI context: %s", sym_Tss2_RC_Decode(rc)); + + context->tcti_context = malloc0(sz); + if (!context->tcti_context) + return log_oom_debug(); 
+ + rc = info->init(context->tcti_context, &sz, param); + if (rc != TPM2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to initialize TCTI context: %s", sym_Tss2_RC_Decode(rc)); + } + + rc = sym_Esys_Initialize(&context->esys_context, context->tcti_context, NULL); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to initialize TPM context: %s", sym_Tss2_RC_Decode(rc)); + + rc = sym_Esys_Startup(context->esys_context, TPM2_SU_CLEAR); + if (rc == TPM2_RC_INITIALIZE) + log_debug("TPM already started up."); + else if (rc == TSS2_RC_SUCCESS) + log_debug("TPM successfully started up."); + else + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to start up TPM: %s", sym_Tss2_RC_Decode(rc)); + + r = tpm2_cache_capabilities(context); + if (r < 0) + return log_debug_errno(r, "Failed to cache TPM capabilities: %m"); + + /* We require AES and CFB support for session encryption. */ + if (!tpm2_supports_alg(context, TPM2_ALG_AES)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support AES."); + + if (!tpm2_supports_alg(context, TPM2_ALG_CFB)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support CFB."); + + if (!tpm2_supports_tpmt_sym_def(context, &SESSION_TEMPLATE_SYM_AES_128_CFB)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support AES-128-CFB."); + + *ret_context = TAKE_PTR(context); + + return 0; +} + +static void tpm2_handle_cleanup(ESYS_CONTEXT *esys_context, ESYS_TR esys_handle, bool flush) { + TSS2_RC rc; + + if (!esys_context || esys_handle == ESYS_TR_NONE) + return; + + /* Closing the handle removes its reference from the esys_context, but leaves the corresponding + * handle in the actual TPM. Flushing the handle removes its reference from the esys_context as well + * as removing its corresponding handle from the actual TPM. 
*/ + if (flush) + rc = sym_Esys_FlushContext(esys_context, esys_handle); + else + /* We can't use Esys_TR_Close() because the tpm2-tss library does not use reference counting + * for handles, and a single Esys_TR_Close() will remove the handle (internal to the tpm2-tss + * library) that might be in use by other code that is using the same ESYS_CONTEXT. This + * directly affects us; for example the src/test/test-tpm2.c test function + * check_seal_unseal() will encounter this issue and will result in a failure when trying to + * cleanup (i.e. Esys_FlushContext) the transient primary key that the test function + * generates. However, not calling Esys_TR_Close() here should be ok, since any leaked handle + * references will be cleaned up when we free our ESYS_CONTEXT. + * + * An upstream bug is open here: https://github.com/tpm2-software/tpm2-tss/issues/2693 */ + rc = TSS2_RC_SUCCESS; // FIXME: restore sym_Esys_TR_Close() use once tpm2-tss is fixed and adopted widely enough + if (rc != TSS2_RC_SUCCESS) + /* We ignore failures here (besides debug logging), since this is called in error paths, + * where we cannot do anything about failures anymore. And when it is called in successful + * codepaths by this time we already did what we wanted to do, and got the results we wanted + * so there's no reason to make this fail more loudly than necessary. */ + log_debug("Failed to %s TPM handle, ignoring: %s", flush ? 
"flush" : "close", sym_Tss2_RC_Decode(rc)); +} + +Tpm2Handle *tpm2_handle_free(Tpm2Handle *handle) { + if (!handle) + return NULL; + + _cleanup_(tpm2_context_unrefp) Tpm2Context *context = (Tpm2Context*)handle->tpm2_context; + if (context) + tpm2_handle_cleanup(context->esys_context, handle->esys_handle, handle->flush); + + return mfree(handle); +} + +int tpm2_handle_new(Tpm2Context *context, Tpm2Handle **ret_handle) { + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + + assert(ret_handle); + + handle = new(Tpm2Handle, 1); + if (!handle) + return log_oom_debug(); + + *handle = (Tpm2Handle) { + .tpm2_context = tpm2_context_ref(context), + .esys_handle = ESYS_TR_NONE, + .flush = true, + }; + + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +static int tpm2_read_public( + Tpm2Context *c, + const Tpm2Handle *session, + const Tpm2Handle *handle, + TPM2B_PUBLIC **ret_public, + TPM2B_NAME **ret_name, + TPM2B_NAME **ret_qname) { + + TSS2_RC rc; + + assert(c); + assert(handle); + + rc = sym_Esys_ReadPublic( + c->esys_context, + handle->esys_handle, + session ? session->esys_handle : ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + ret_public, + ret_name, + ret_qname); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to read public info: %s", sym_Tss2_RC_Decode(rc)); + + return 0; +} + +/* Create a Tpm2Handle object that references a pre-existing handle in the TPM, at the handle index provided. + * This should be used only for persistent, transient, or NV handles; and the handle must already exist in + * the TPM at the specified handle index. The handle index should not be 0. Returns 1 if found, 0 if the + * index is empty, or < 0 on error. Also see tpm2_get_srk() below; the SRK is a commonly used persistent + * Tpm2Handle. 
*/ +int tpm2_index_to_handle( + Tpm2Context *c, + TPM2_HANDLE index, + const Tpm2Handle *session, + TPM2B_PUBLIC **ret_public, + TPM2B_NAME **ret_name, + TPM2B_NAME **ret_qname, + Tpm2Handle **ret_handle) { + + TSS2_RC rc; + int r; + + assert(c); + + /* Only allow persistent, transient, or NV index handle types. */ + switch (TPM2_HANDLE_TYPE(index)) { + case TPM2_HT_PERSISTENT: + case TPM2_HT_NV_INDEX: + case TPM2_HT_TRANSIENT: + break; + case TPM2_HT_PCR: + /* PCR handles are referenced by their actual index number and do not need a Tpm2Handle */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid handle 0x%08" PRIx32 " (in PCR range).", index); + case TPM2_HT_HMAC_SESSION: + case TPM2_HT_POLICY_SESSION: + /* Session indexes are only used internally by tpm2-tss (or lower code) */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid handle 0x%08" PRIx32 " (in session range).", index); + case TPM2_HT_PERMANENT: + /* Permanent handles are defined, e.g. ESYS_TR_RH_OWNER. */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid handle 0x%08" PRIx32 " (in permanent range).", index); + default: + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid handle 0x%08" PRIx32 " (in unknown range).", index); + } + + /* For transient handles, the kernel tpm "resource manager" (i.e. /dev/tpmrm0) performs mapping + * which breaks GetCapability requests, so only check GetCapability if it's not a transient handle. 
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218009 */ + if (TPM2_HANDLE_TYPE(index) != TPM2_HT_TRANSIENT) { // FIXME: once kernel bug is fixed, check transient handles too + r = tpm2_get_capability_handle(c, index); + if (r < 0) + return r; + if (r == 0) { + log_debug("TPM handle 0x%08" PRIx32 " not populated.", index); + if (ret_public) + *ret_public = NULL; + if (ret_name) + *ret_name = NULL; + if (ret_qname) + *ret_qname = NULL; + if (ret_handle) + *ret_handle = NULL; + return 0; + } + } + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_handle_new(c, &handle); + if (r < 0) + return r; + + /* Since we didn't create this handle in the TPM (this is only creating an ESYS_TR handle for the + * pre-existing TPM handle), we shouldn't flush (or evict) it on cleanup. */ + handle->flush = false; + + rc = sym_Esys_TR_FromTPMPublic( + c->esys_context, + index, + session ? session->esys_handle : ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + &handle->esys_handle); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to read public info: %s", sym_Tss2_RC_Decode(rc)); + + if (ret_public || ret_name || ret_qname) { + r = tpm2_read_public(c, session, handle, ret_public, ret_name, ret_qname); + if (r < 0) + return r; + } + + if (ret_handle) + *ret_handle = TAKE_PTR(handle); + + return 1; +} + +/* Get the handle index for the provided Tpm2Handle. */ +int tpm2_index_from_handle(Tpm2Context *c, const Tpm2Handle *handle, TPM2_HANDLE *ret_index) { + TSS2_RC rc; + + assert(c); + assert(handle); + assert(ret_index); + + /* Esys_TR_GetTpmHandle was added to tpm2-tss in version 2.4.0. Once we can set a minimum tpm2-tss + * version of 2.4.0 this check can be removed. 
*/ + if (!sym_Esys_TR_GetTpmHandle) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "libtss2-esys too old, does not include Esys_TR_GetTpmHandle."); + + rc = sym_Esys_TR_GetTpmHandle(c->esys_context, handle->esys_handle, ret_index); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to get handle index: %s", sym_Tss2_RC_Decode(rc)); + + return 0; +} + +/* Copy an object in the TPM at a transient handle to a persistent handle. + * + * The provided transient handle must exist in the TPM in the transient range. The persistent handle may be 0 + * or any handle in the persistent range. If 0, this will try each handle in the persistent range, in + * ascending order, until an available one is found. If non-zero, only the requested persistent handle will + * be used. + * + * Note that the persistent handle parameter is an handle index (i.e. number), while the transient handle is + * a Tpm2Handle object. The returned persistent handle will be a Tpm2Handle object that is located in the TPM + * at the requested persistent handle index (or the first available if none was requested). + * + * Returns 1 if the object was successfully persisted, or 0 if there is already a key at the requested + * handle, or < 0 on error. Theoretically, this would also return 0 if no specific persistent handle is + * requested but all persistent handles are used, but it is extremely unlikely the TPM has enough internal + * memory to store the entire persistent range, in which case an error will be returned if the TPM is out of + * memory for persistent storage. The persistent handle is only provided when returning 1. 
*/ +static int tpm2_persist_handle( + Tpm2Context *c, + const Tpm2Handle *transient_handle, + const Tpm2Handle *session, + TPMI_DH_PERSISTENT persistent_handle_index, + Tpm2Handle **ret_persistent_handle) { + + /* We don't use TPM2_PERSISTENT_FIRST and TPM2_PERSISTENT_LAST here due to: + * https://github.com/systemd/systemd/pull/27713#issuecomment-1591864753 */ + TPMI_DH_PERSISTENT first = UINT32_C(0x81000000), last = UINT32_C(0x81ffffff); + TSS2_RC rc; + int r; + + assert(c); + assert(transient_handle); + + /* If persistent handle index specified, only try that. */ + if (persistent_handle_index != 0) { + if (TPM2_HANDLE_TYPE(persistent_handle_index) != TPM2_HT_PERSISTENT) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Handle not in persistent range: 0x%x", persistent_handle_index); + + first = last = persistent_handle_index; + } + + for (TPMI_DH_PERSISTENT requested = first; requested <= last; requested++) { + _cleanup_(tpm2_handle_freep) Tpm2Handle *persistent_handle = NULL; + r = tpm2_handle_new(c, &persistent_handle); + if (r < 0) + return r; + + /* Since this is a persistent handle, don't flush it. */ + persistent_handle->flush = false; + + rc = sym_Esys_EvictControl( + c->esys_context, + ESYS_TR_RH_OWNER, + transient_handle->esys_handle, + session ? 
session->esys_handle : ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + requested, + &persistent_handle->esys_handle); + if (rc == TSS2_RC_SUCCESS) { + if (ret_persistent_handle) + *ret_persistent_handle = TAKE_PTR(persistent_handle); + + return 1; + } + if (rc != TPM2_RC_NV_DEFINED) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to persist handle: %s", sym_Tss2_RC_Decode(rc)); + } + + if (ret_persistent_handle) + *ret_persistent_handle = NULL; + + return 0; +} + +#define TPM2_CREDIT_RANDOM_FLAG_PATH "/run/systemd/tpm-rng-credited" + +static int tpm2_credit_random(Tpm2Context *c) { + size_t rps, done = 0; + TSS2_RC rc; + usec_t t; + int r; + + assert(c); + + /* Pulls some entropy from the TPM and adds it into the kernel RNG pool. That way we can say that the + * key we will ultimately generate with the kernel random pool is at least as good as the TPM's RNG, + * but likely better. Note that we don't trust the TPM RNG very much, hence do not actually credit + * any entropy. */ + + if (access(TPM2_CREDIT_RANDOM_FLAG_PATH, F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to detect if '" TPM2_CREDIT_RANDOM_FLAG_PATH "' exists, ignoring: %m"); + } else { + log_debug("Not adding TPM2 entropy to the kernel random pool again."); + return 0; /* Already done */ + } + + t = now(CLOCK_MONOTONIC); + + for (rps = random_pool_size(); rps > 0;) { + _cleanup_(Esys_Freep) TPM2B_DIGEST *buffer = NULL; + + rc = sym_Esys_GetRandom( + c->esys_context, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + MIN(rps, 32U), /* 32 is supposedly a safe choice, given that AES 256bit keys are this long, and TPM2 baseline requires support for those. 
*/ + &buffer); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to acquire entropy from TPM: %s", sym_Tss2_RC_Decode(rc)); + + if (buffer->size == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Zero-sized entropy returned from TPM."); + + r = random_write_entropy(-1, buffer->buffer, buffer->size, /* credit= */ false); + if (r < 0) + return log_debug_errno(r, "Failed wo write entropy to kernel: %m"); + + done += buffer->size; + rps = LESS_BY(rps, buffer->size); + } + + log_debug("Added %zu bytes of TPM2 entropy to the kernel random pool in %s.", done, FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - t, 0)); + + r = touch(TPM2_CREDIT_RANDOM_FLAG_PATH); + if (r < 0) + log_debug_errno(r, "Failed to touch '" TPM2_CREDIT_RANDOM_FLAG_PATH "', ignoring: %m"); + + return 0; +} + +/* Get one of the legacy primary key templates. + * + * The legacy templates should only be used for older sealed data that did not use the SRK. Instead of a + * persistent SRK, a transient key was created to seal the data and then flushed; and the exact same template + * must be used to recreate the same transient key to unseal the data. The alg parameter must be TPM2_ALG_RSA + * or TPM2_ALG_ECC. This does not check if the alg is actually supported on this TPM. */ +static int tpm2_get_legacy_template(TPMI_ALG_PUBLIC alg, TPMT_PUBLIC *ret_template) { + /* Do not modify. */ + static const TPMT_PUBLIC legacy_ecc = { + .type = TPM2_ALG_ECC, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = + TPMA_OBJECT_RESTRICTED| + TPMA_OBJECT_DECRYPT| + TPMA_OBJECT_FIXEDTPM| + TPMA_OBJECT_FIXEDPARENT| + TPMA_OBJECT_SENSITIVEDATAORIGIN| + TPMA_OBJECT_USERWITHAUTH, + .parameters.eccDetail = { + .symmetric = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, + }, + .scheme.scheme = TPM2_ALG_NULL, + .curveID = TPM2_ECC_NIST_P256, + .kdf.scheme = TPM2_ALG_NULL, + }, + }; + + /* Do not modify. 
*/ + static const TPMT_PUBLIC legacy_rsa = { + .type = TPM2_ALG_RSA, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = TPMA_OBJECT_RESTRICTED|TPMA_OBJECT_DECRYPT|TPMA_OBJECT_FIXEDTPM|TPMA_OBJECT_FIXEDPARENT|TPMA_OBJECT_SENSITIVEDATAORIGIN|TPMA_OBJECT_USERWITHAUTH, + .parameters.rsaDetail = { + .symmetric = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, + }, + .scheme.scheme = TPM2_ALG_NULL, + .keyBits = 2048, + }, + }; + + assert(ret_template); + + if (alg == TPM2_ALG_ECC) + *ret_template = legacy_ecc; + else if (alg == TPM2_ALG_RSA) + *ret_template = legacy_rsa; + else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Unsupported legacy SRK alg: 0x%x", alg); + + return 0; +} + +/* Get a Storage Root Key (SRK) template. + * + * The SRK template values are recommended by the "TCG TPM v2.0 Provisioning Guidance" document in section + * 7.5.1 "Storage Primary Key (SRK) Templates", referencing "TCG EK Credential Profile for TPM Family 2.0". + * The EK Credential Profile version 2.0 provides only a single template each for RSA and ECC, while later EK + * Credential Profile versions provide more templates, and keep the original templates as "L-1" (for RSA) and + * "L-2" (for ECC). + * + * https://trustedcomputinggroup.org/resource/tcg-tpm-v2-0-provisioning-guidance + * https://trustedcomputinggroup.org/resource/http-trustedcomputinggroup-org-wp-content-uploads-tcg-ek-credential-profile + * + * These templates are only needed to create a new persistent SRK (or a new transient key that is + * SRK-compatible). Preferably, the TPM should contain a shared SRK located at the reserved shared SRK handle + * (see TPM2_SRK_HANDLE in tpm2-util.h, and tpm2_get_srk() below). + * + * Returns 0 if the specified algorithm is ECC or RSA, otherwise -EOPNOTSUPP. */ +int tpm2_get_srk_template(TPMI_ALG_PUBLIC alg, TPMT_PUBLIC *ret_template) { + /* The attributes are the same between ECC and RSA templates. 
This has the changes specified in the + * Provisioning Guidance document, specifically: + * TPMA_OBJECT_USERWITHAUTH is added. + * TPMA_OBJECT_ADMINWITHPOLICY is removed. + * TPMA_OBJECT_NODA is added. */ + TPMA_OBJECT srk_attributes = + TPMA_OBJECT_DECRYPT | + TPMA_OBJECT_FIXEDPARENT | + TPMA_OBJECT_FIXEDTPM | + TPMA_OBJECT_NODA | + TPMA_OBJECT_RESTRICTED | + TPMA_OBJECT_SENSITIVEDATAORIGIN | + TPMA_OBJECT_USERWITHAUTH; + + /* The symmetric configuration is the same between ECC and RSA templates. */ + TPMT_SYM_DEF_OBJECT srk_symmetric = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, + }; + + /* Both templates have an empty authPolicy as specified by the Provisioning Guidance document. */ + + /* From the EK Credential Profile template "L-2". */ + TPMT_PUBLIC srk_ecc = { + .type = TPM2_ALG_ECC, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = srk_attributes, + .parameters.eccDetail = { + .symmetric = srk_symmetric, + .scheme.scheme = TPM2_ALG_NULL, + .curveID = TPM2_ECC_NIST_P256, + .kdf.scheme = TPM2_ALG_NULL, + }, + }; + + /* From the EK Credential Profile template "L-1". */ + TPMT_PUBLIC srk_rsa = { + .type = TPM2_ALG_RSA, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = srk_attributes, + .parameters.rsaDetail = { + .symmetric = srk_symmetric, + .scheme.scheme = TPM2_ALG_NULL, + .keyBits = 2048, + }, + }; + + assert(ret_template); + + switch (alg) { + case TPM2_ALG_ECC: + *ret_template = srk_ecc; + return 0; + case TPM2_ALG_RSA: + *ret_template = srk_rsa; + return 0; + } + + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "No SRK for algorithm 0x%" PRIx16, alg); +} + +/* Get the best supported SRK template. ECC is preferred, then RSA. 
*/ +int tpm2_get_best_srk_template(Tpm2Context *c, TPMT_PUBLIC *ret_template) { + TPMT_PUBLIC template; + int r; + + assert(c); + assert(ret_template); + + r = tpm2_get_srk_template(TPM2_ALG_ECC, &template); + if (r < 0) + return r; + + if (!tpm2_supports_alg(c, TPM2_ALG_ECC)) + log_debug("TPM does not support ECC."); + else if (!tpm2_supports_ecc_curve(c, template.parameters.eccDetail.curveID)) + log_debug("TPM does not support ECC-NIST-P256 curve."); + else if (!tpm2_supports_tpmt_public(c, &template)) + log_debug("TPM does not support SRK ECC template L-2."); + else { + *ret_template = template; + return 0; + } + + r = tpm2_get_srk_template(TPM2_ALG_RSA, &template); + if (r < 0) + return r; + + if (!tpm2_supports_alg(c, TPM2_ALG_RSA)) + log_debug("TPM does not support RSA."); + else if (!tpm2_supports_tpmt_public(c, &template)) + log_debug("TPM does not support SRK RSA template L-1."); + else { + *ret_template = template; + return 0; + } + + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "TPM does not support either SRK template L-1 (RSA) or L-2 (ECC)."); +} + +/* Get the SRK. Returns 1 if SRK is found, 0 if there is no SRK, or < 0 on error. Also see + * tpm2_get_or_create_srk() below. */ +int tpm2_get_srk( + Tpm2Context *c, + const Tpm2Handle *session, + TPM2B_PUBLIC **ret_public, + TPM2B_NAME **ret_name, + TPM2B_NAME **ret_qname, + Tpm2Handle **ret_handle) { + + return tpm2_index_to_handle(c, TPM2_SRK_HANDLE, session, ret_public, ret_name, ret_qname, ret_handle); +} + +/* Get the SRK, creating one if needed. Returns 1 if a new SRK was created and persisted, 0 if an SRK already + * exists, or < 0 on error. 
*/
int tpm2_get_or_create_srk(
                Tpm2Context *c,
                const Tpm2Handle *session,
                TPM2B_PUBLIC **ret_public,
                TPM2B_NAME **ret_name,
                TPM2B_NAME **ret_qname,
                Tpm2Handle **ret_handle) {

        int r;

        /* Fast path: an SRK is already present. */
        r = tpm2_get_srk(c, session, ret_public, ret_name, ret_qname, ret_handle);
        if (r < 0)
                return r;
        if (r == 1)
                return 0; /* 0 → SRK already set up */

        /* Nothing found, hence pick the best template the TPM supports and create a transient primary
         * key from it. */
        TPM2B_PUBLIC srk_template = {
                .size = sizeof(TPMT_PUBLIC),
        };
        r = tpm2_get_best_srk_template(c, &srk_template.publicArea);
        if (r < 0)
                return log_debug_errno(r, "Could not get best SRK template: %m");

        _cleanup_(tpm2_handle_freep) Tpm2Handle *transient_srk = NULL;
        r = tpm2_create_primary(
                        c,
                        session,
                        &srk_template,
                        /* sensitive= */ NULL,
                        /* ret_public= */ NULL,
                        &transient_srk);
        if (r < 0)
                return r;

        /* Make the transient key persistent at the SRK index. This is done without locking: if several
         * threads race to persist an SRK, only one will succeed (r == 1) while the rest will fail
         * (r == 0). Either way every thread picks up the persistent SRK in the lookup below. */
        r = tpm2_persist_handle(
                        c,
                        transient_srk,
                        session,
                        TPM2_SRK_HANDLE,
                        /* ret_persistent_handle= */ NULL);
        if (r < 0)
                return r;

        /* Now the lookup is expected to succeed. */
        r = tpm2_get_srk(c, session, ret_public, ret_name, ret_qname, ret_handle);
        if (r < 0)
                return r;
        if (r == 0)
                /* This should never happen. */
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "SRK we just persisted couldn't be found.");

        return 1; /* > 0 → SRK newly set up */
}

/* Utility functions for TPMS_PCR_SELECTION. */

/* Convert a TPMS_PCR_SELECTION object to a mask.
*/ +uint32_t tpm2_tpms_pcr_selection_to_mask(const TPMS_PCR_SELECTION *s) { + assert(s); + assert(s->sizeofSelect <= sizeof(s->pcrSelect)); + + uint32_t mask = 0; + for (unsigned i = 0; i < s->sizeofSelect; i++) + SET_FLAG(mask, (uint32_t)s->pcrSelect[i] << (i * 8), true); + return mask; +} + +/* Convert a mask and hash alg to a TPMS_PCR_SELECTION object. */ +void tpm2_tpms_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash_alg, TPMS_PCR_SELECTION *ret) { + assert(ret); + + /* This is currently hardcoded at 24 PCRs, above. */ + if (!TPM2_PCR_MASK_VALID(mask)) + log_debug("PCR mask selections (%x) out of range, ignoring.", + mask & ~((uint32_t)TPM2_PCRS_MASK)); + + *ret = (TPMS_PCR_SELECTION){ + .hash = hash_alg, + .sizeofSelect = TPM2_PCRS_MAX / 8, + .pcrSelect[0] = mask & 0xff, + .pcrSelect[1] = (mask >> 8) & 0xff, + .pcrSelect[2] = (mask >> 16) & 0xff, + }; +} + +/* Test if all bits in the mask are set in the TPMS_PCR_SELECTION. */ +bool tpm2_tpms_pcr_selection_has_mask(const TPMS_PCR_SELECTION *s, uint32_t mask) { + assert(s); + + return FLAGS_SET(tpm2_tpms_pcr_selection_to_mask(s), mask); +} + +static void tpm2_tpms_pcr_selection_update_mask(TPMS_PCR_SELECTION *s, uint32_t mask, bool b) { + assert(s); + + tpm2_tpms_pcr_selection_from_mask(UPDATE_FLAG(tpm2_tpms_pcr_selection_to_mask(s), mask, b), s->hash, s); +} + +/* Add all PCR selections in the mask. */ +void tpm2_tpms_pcr_selection_add_mask(TPMS_PCR_SELECTION *s, uint32_t mask) { + tpm2_tpms_pcr_selection_update_mask(s, mask, 1); +} + +/* Remove all PCR selections in the mask. */ +void tpm2_tpms_pcr_selection_sub_mask(TPMS_PCR_SELECTION *s, uint32_t mask) { + tpm2_tpms_pcr_selection_update_mask(s, mask, 0); +} + +/* Add all PCR selections in 'b' to 'a'. Both must have the same hash alg. 
*/ +void tpm2_tpms_pcr_selection_add(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b) { + assert(a); + assert(b); + assert(a->hash == b->hash); + + tpm2_tpms_pcr_selection_add_mask(a, tpm2_tpms_pcr_selection_to_mask(b)); +} + +/* Remove all PCR selections in 'b' from 'a'. Both must have the same hash alg. */ +void tpm2_tpms_pcr_selection_sub(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b) { + assert(a); + assert(b); + assert(a->hash == b->hash); + + tpm2_tpms_pcr_selection_sub_mask(a, tpm2_tpms_pcr_selection_to_mask(b)); +} + +/* Move all PCR selections in 'b' to 'a'. Both must have the same hash alg. */ +void tpm2_tpms_pcr_selection_move(TPMS_PCR_SELECTION *a, TPMS_PCR_SELECTION *b) { + if (a == b) + return; + + tpm2_tpms_pcr_selection_add(a, b); + tpm2_tpms_pcr_selection_from_mask(0, b->hash, b); +} + +#define FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml) \ + _FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml, UNIQ_T(l, UNIQ)) +#define _FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml, l) \ + for (typeof(tpml) (l) = (tpml); (l); (l) = NULL) \ + FOREACH_ARRAY(tpms, (l)->pcrSelections, (l)->count) + +#define FOREACH_PCR_IN_TPMS_PCR_SELECTION(pcr, tpms) \ + FOREACH_PCR_IN_MASK(pcr, tpm2_tpms_pcr_selection_to_mask(tpms)) + +#define FOREACH_PCR_IN_TPML_PCR_SELECTION(pcr, tpms, tpml) \ + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml) \ + FOREACH_PCR_IN_TPMS_PCR_SELECTION(pcr, tpms) + +char *tpm2_tpms_pcr_selection_to_string(const TPMS_PCR_SELECTION *s) { + assert(s); + + const char *algstr = strna(tpm2_hash_alg_to_string(s->hash)); + + _cleanup_free_ char *mask = tpm2_pcr_mask_to_string(tpm2_tpms_pcr_selection_to_mask(s)); + if (!mask) + return NULL; + + return strjoin(algstr, "(", mask, ")"); +} + +size_t tpm2_tpms_pcr_selection_weight(const TPMS_PCR_SELECTION *s) { + assert(s); + + return popcount(tpm2_tpms_pcr_selection_to_mask(s)); +} + +/* Utility functions for TPML_PCR_SELECTION. 
*/ + +/* Remove the (0-based) index entry from 'l', shift all following entries, and update the count. */ +static void tpm2_tpml_pcr_selection_remove_index(TPML_PCR_SELECTION *l, uint32_t index) { + assert(l); + assert(l->count <= ELEMENTSOF(l->pcrSelections)); + assert(index < l->count); + + size_t s = l->count - (index + 1); + memmove(&l->pcrSelections[index], &l->pcrSelections[index + 1], s * sizeof(l->pcrSelections[0])); + l->count--; +} + +/* Get a TPMS_PCR_SELECTION from a TPML_PCR_SELECTION for the given hash alg. Returns NULL if there is no + * entry for the hash alg. This guarantees the returned entry contains all the PCR selections for the given + * hash alg, which may require modifying the TPML_PCR_SELECTION by removing duplicate entries. */ +static TPMS_PCR_SELECTION *tpm2_tpml_pcr_selection_get_tpms_pcr_selection( + TPML_PCR_SELECTION *l, + TPMI_ALG_HASH hash_alg) { + + assert(l); + assert(l->count <= ELEMENTSOF(l->pcrSelections)); + + TPMS_PCR_SELECTION *selection = NULL; + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(s, l) + if (s->hash == hash_alg) { + selection = s; + break; + } + + if (!selection) + return NULL; + + /* Iterate backwards through the entries, removing any other entries for the hash alg. */ + for (uint32_t i = l->count - 1; i > 0; i--) { + TPMS_PCR_SELECTION *s = &l->pcrSelections[i]; + + if (selection == s) + break; + + if (s->hash == hash_alg) { + tpm2_tpms_pcr_selection_move(selection, s); + tpm2_tpml_pcr_selection_remove_index(l, i); + } + } + + return selection; +} + +/* Combine all duplicate (same hash alg) TPMS_PCR_SELECTION entries in 'l'. */ +static void tpm2_tpml_pcr_selection_cleanup(TPML_PCR_SELECTION *l) { + /* Can't use FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION() because we might modify l->count */ + for (uint32_t i = 0; i < l->count; i++) + /* This removes all duplicate TPMS_PCR_SELECTION entries for this hash. 
*/ + (void) tpm2_tpml_pcr_selection_get_tpms_pcr_selection(l, l->pcrSelections[i].hash); +} + +/* Convert a TPML_PCR_SELECTION object to a mask. Returns empty mask (i.e. 0) if 'hash_alg' is not in the object. */ +uint32_t tpm2_tpml_pcr_selection_to_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash_alg) { + assert(l); + + /* Make a copy, as tpm2_tpml_pcr_selection_get_tpms_pcr_selection() will modify the object if there + * are multiple entries with the requested hash alg. */ + TPML_PCR_SELECTION lcopy = *l; + + TPMS_PCR_SELECTION *s; + s = tpm2_tpml_pcr_selection_get_tpms_pcr_selection(&lcopy, hash_alg); + if (!s) + return 0; + + return tpm2_tpms_pcr_selection_to_mask(s); +} + +/* Convert a mask and hash alg to a TPML_PCR_SELECTION object. */ +void tpm2_tpml_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash_alg, TPML_PCR_SELECTION *ret) { + assert(ret); + + TPMS_PCR_SELECTION s; + tpm2_tpms_pcr_selection_from_mask(mask, hash_alg, &s); + + *ret = (TPML_PCR_SELECTION){ + .count = 1, + .pcrSelections[0] = s, + }; +} + +/* Add the PCR selections in 's' to the corresponding hash alg TPMS_PCR_SELECTION entry in 'l'. Adds a new + * TPMS_PCR_SELECTION entry for the hash alg if needed. This may modify the TPML_PCR_SELECTION by combining + * entries with the same hash alg. */ +void tpm2_tpml_pcr_selection_add_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s) { + assert(l); + assert(s); + + if (tpm2_tpms_pcr_selection_is_empty(s)) + return; + + TPMS_PCR_SELECTION *selection = tpm2_tpml_pcr_selection_get_tpms_pcr_selection(l, s->hash); + if (selection) { + tpm2_tpms_pcr_selection_add(selection, s); + return; + } + + /* It's already broken if the count is higher than the array has size for. */ + assert(l->count <= ELEMENTSOF(l->pcrSelections)); + + /* If full, the cleanup should result in at least one available entry. 
*/ + if (l->count == ELEMENTSOF(l->pcrSelections)) + tpm2_tpml_pcr_selection_cleanup(l); + + assert(l->count < ELEMENTSOF(l->pcrSelections)); + l->pcrSelections[l->count++] = *s; +} + +/* Remove the PCR selections in 's' from the corresponding hash alg TPMS_PCR_SELECTION entry in 'l'. This + * will combine all entries for 's->hash' in 'l'. */ +void tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s) { + assert(l); + assert(s); + + if (tpm2_tpms_pcr_selection_is_empty(s)) + return; + + TPMS_PCR_SELECTION *selection = tpm2_tpml_pcr_selection_get_tpms_pcr_selection(l, s->hash); + if (selection) + tpm2_tpms_pcr_selection_sub(selection, s); +} + +/* Test if all bits in the mask for the hash are set in the TPML_PCR_SELECTION. */ +bool tpm2_tpml_pcr_selection_has_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask) { + assert(l); + + return FLAGS_SET(tpm2_tpml_pcr_selection_to_mask(l, hash), mask); +} + +/* Add the PCR selections in the mask, with the provided hash. */ +void tpm2_tpml_pcr_selection_add_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask) { + TPMS_PCR_SELECTION tpms; + + assert(l); + + tpm2_tpms_pcr_selection_from_mask(mask, hash, &tpms); + tpm2_tpml_pcr_selection_add_tpms_pcr_selection(l, &tpms); +} + +/* Remove the PCR selections in the mask, with the provided hash. */ +void tpm2_tpml_pcr_selection_sub_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask) { + TPMS_PCR_SELECTION tpms; + + assert(l); + + tpm2_tpms_pcr_selection_from_mask(mask, hash, &tpms); + tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(l, &tpms); +} + +/* Add all PCR selections in 'b' to 'a'. */ +void tpm2_tpml_pcr_selection_add(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b) { + assert(a); + assert(b); + + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(selection_b, b) + tpm2_tpml_pcr_selection_add_tpms_pcr_selection(a, selection_b); +} + +/* Remove all PCR selections in 'b' from 'a'. 
*/ +void tpm2_tpml_pcr_selection_sub(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b) { + assert(a); + assert(b); + + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(selection_b, b) + tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(a, selection_b); +} + +char *tpm2_tpml_pcr_selection_to_string(const TPML_PCR_SELECTION *l) { + assert(l); + + _cleanup_free_ char *banks = NULL; + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(s, l) { + if (tpm2_tpms_pcr_selection_is_empty(s)) + continue; + + _cleanup_free_ char *str = tpm2_tpms_pcr_selection_to_string(s); + if (!str || !strextend_with_separator(&banks, ",", str)) + return NULL; + } + + return strjoin("[", strempty(banks), "]"); +} + +size_t tpm2_tpml_pcr_selection_weight(const TPML_PCR_SELECTION *l) { + assert(l); + assert(l->count <= ELEMENTSOF(l->pcrSelections)); + + size_t weight = 0; + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(s, l) { + size_t w = tpm2_tpms_pcr_selection_weight(s); + assert(weight <= SIZE_MAX - w); + weight += w; + } + + return weight; +} + +bool tpm2_pcr_value_valid(const Tpm2PCRValue *pcr_value) { + int r; + + if (!pcr_value) + return false; + + if (!TPM2_PCR_INDEX_VALID(pcr_value->index)) { + log_debug("PCR index %u invalid.", pcr_value->index); + return false; + } + + /* If it contains a value, the value size must match the hash size. */ + if (pcr_value->value.size > 0) { + r = tpm2_hash_alg_to_size(pcr_value->hash); + if (r < 0) + return false; + + if (pcr_value->value.size != (size_t) r) { + log_debug("PCR hash 0x%" PRIx16 " expected size %d does not match actual size %" PRIu16 ".", + pcr_value->hash, r, pcr_value->value.size); + return false; + } + } + + return true; +} + +/* Verify all entries are valid, and consistent with each other. The requirements for consistency are: + * + * 1) all entries must be sorted in ascending order (e.g. using tpm2_sort_pcr_values()) + * 2) all entries must be unique, i.e. 
there cannot be 2 entries with the same hash and index + * + * Returns true if all entries are valid (or if no entries are provided), false otherwise. + */ +bool tpm2_pcr_values_valid(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + if (!pcr_values && n_pcr_values > 0) + return false; + + const Tpm2PCRValue *previous = NULL; + FOREACH_ARRAY(current, pcr_values, n_pcr_values) { + if (!tpm2_pcr_value_valid(current)) + return false; + + if (!previous) { + previous = current; + continue; + } + + /* Hashes must be sorted in ascending order */ + if (current->hash < previous->hash) { + log_debug("PCR values not in ascending order, hash %" PRIu16 " is after %" PRIu16 ".", + current->hash, previous->hash); + return false; + } + + if (current->hash == previous->hash) { + /* Indexes (for the same hash) must be sorted in ascending order */ + if (current->index < previous->index) { + log_debug("PCR values not in ascending order, hash %" PRIu16 " index %u is after %u.", + current->hash, current->index, previous->index); + return false; + } + + /* Indexes (for the same hash) must not be duplicates */ + if (current->index == previous->index) { + log_debug("PCR values contain duplicates for hash %" PRIu16 " index %u.", + current->hash, previous->index); + return false; + } + } + } + + return true; +} + +/* Returns true if any of the provided PCR values has an actual hash value included, false otherwise. */ +bool tpm2_pcr_values_has_any_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + assert(pcr_values || n_pcr_values == 0); + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) + if (v->value.size > 0) + return true; + + return false; +} + +/* Returns true if all of the provided PCR values has an actual hash value included, false otherwise. 
*/ +bool tpm2_pcr_values_has_all_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + assert(pcr_values || n_pcr_values == 0); + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) + if (v->value.size == 0) + return false; + + return true; +} + +static int cmp_pcr_values(const Tpm2PCRValue *a, const Tpm2PCRValue *b) { + assert(a); + assert(b); + + return CMP(a->hash, b->hash) ?: CMP(a->index, b->index); +} + +/* Sort the array of Tpm2PCRValue entries in-place. This sorts first in ascending order of hash algorithm + * (sorting simply by the TPM2 hash algorithm number), and then sorting by pcr index. */ +void tpm2_sort_pcr_values(Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + typesafe_qsort(pcr_values, n_pcr_values, cmp_pcr_values); +} + +int tpm2_pcr_values_from_mask(uint32_t mask, TPMI_ALG_HASH hash, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values) { + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values = 0; + + assert(ret_pcr_values); + assert(ret_n_pcr_values); + + FOREACH_PCR_IN_MASK(index, mask) + if (!GREEDY_REALLOC_APPEND( + pcr_values, + n_pcr_values, + &TPM2_PCR_VALUE_MAKE(index, hash, {}), + 1)) + return log_oom_debug(); + + *ret_pcr_values = TAKE_PTR(pcr_values); + *ret_n_pcr_values = n_pcr_values; + + return 0; +} + +int tpm2_pcr_values_to_mask(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPMI_ALG_HASH hash, uint32_t *ret_mask) { + uint32_t mask = 0; + + assert(pcr_values || n_pcr_values == 0); + assert(ret_mask); + + if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PCR values."); + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) + if (v->hash == hash) + SET_BIT(mask, v->index); + + *ret_mask = mask; + + return 0; +} + +int tpm2_tpml_pcr_selection_from_pcr_values( + const Tpm2PCRValue *pcr_values, + size_t n_pcr_values, + TPML_PCR_SELECTION *ret_selection, + TPM2B_DIGEST **ret_values, + size_t *ret_n_values) { + + TPML_PCR_SELECTION selection = {}; + 
_cleanup_free_ TPM2B_DIGEST *values = NULL; + size_t n_values = 0; + + assert(pcr_values || n_pcr_values == 0); + + if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "PCR values are not valid."); + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) { + tpm2_tpml_pcr_selection_add_mask(&selection, v->hash, INDEX_TO_MASK(uint32_t, v->index)); + + if (!GREEDY_REALLOC_APPEND(values, n_values, &v->value, 1)) + return log_oom_debug(); + } + + if (ret_selection) + *ret_selection = selection; + if (ret_values) + *ret_values = TAKE_PTR(values); + if (ret_n_values) + *ret_n_values = n_values; + + return 0; +} + +/* Count the number of different hash algorithms for all the entries. */ +int tpm2_pcr_values_hash_count(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, size_t *ret_count) { + TPML_PCR_SELECTION selection; + int r; + + assert(pcr_values); + assert(ret_count); + + r = tpm2_tpml_pcr_selection_from_pcr_values( + pcr_values, + n_pcr_values, + &selection, + /* ret_values= */ NULL, + /* ret_n_values= */ NULL); + if (r < 0) + return r; + + *ret_count = selection.count; + + return 0; +} + +/* Parse a string argument into a Tpm2PCRValue object. + * + * The format is [:hash[=value]] where index is the index number (or name) of the PCR, e.g. 0 (or + * platform-code), hash is the name of the hash algorithm (e.g. sha256) and value is the hex hash digest + * value, optionally with a leading 0x. This does not check for validity of the fields. 
*/ +int tpm2_pcr_value_from_string(const char *arg, Tpm2PCRValue *ret_pcr_value) { + Tpm2PCRValue pcr_value = {}; + const char *p = arg; + int r; + + assert(arg); + assert(ret_pcr_value); + + _cleanup_free_ char *index = NULL; + r = extract_first_word(&p, &index, ":", /* flags= */ 0); + if (r < 1) + return log_debug_errno(r, "Could not parse pcr value '%s': %m", p); + + r = tpm2_pcr_index_from_string(index); + if (r < 0) + return log_debug_errno(r, "Invalid pcr index '%s': %m", index); + pcr_value.index = (unsigned) r; + + if (!isempty(p)) { + _cleanup_free_ char *hash = NULL; + r = extract_first_word(&p, &hash, "=", /* flags= */ 0); + if (r < 1) + return log_debug_errno(r, "Could not parse pcr hash algorithm '%s': %m", p); + + r = tpm2_hash_alg_from_string(hash); + if (r < 0) + return log_debug_errno(r, "Invalid pcr hash algorithm '%s': %m", hash); + pcr_value.hash = (TPMI_ALG_HASH) r; + + if (!isempty(p)) { + /* Remove leading 0x if present */ + p = startswith_no_case(p, "0x") ?: p; + + _cleanup_free_ void *buf = NULL; + size_t buf_size = 0; + r = unhexmem(p, SIZE_MAX, &buf, &buf_size); + if (r < 0) + return log_debug_errno(r, "Invalid pcr hash value '%s': %m", p); + + r = TPM2B_DIGEST_CHECK_SIZE(buf_size); + if (r < 0) + return log_debug_errno(r, "PCR hash value size %zu too large.", buf_size); + + pcr_value.value = TPM2B_DIGEST_MAKE(buf, buf_size); + } + } + + *ret_pcr_value = pcr_value; + + return 0; +} + +/* Return a string for the PCR value. The format is described in tpm2_pcr_value_from_string(). Note that if + * the hash algorithm is not recognized, neither hash name nor hash digest value is included in the + * string. This does not check for validity. */ +char *tpm2_pcr_value_to_string(const Tpm2PCRValue *pcr_value) { + _cleanup_free_ char *index = NULL, *value = NULL; + + if (asprintf(&index, "%u", pcr_value->index) < 0) + return NULL; + + const char *hash = pcr_value->hash > 0 ? 
tpm2_hash_alg_to_string(pcr_value->hash) : NULL; + + if (hash && pcr_value->value.size > 0) { + value = hexmem(pcr_value->value.buffer, pcr_value->value.size); + if (!value) + return NULL; + } + + return strjoin(index, hash ? ":" : "", strempty(hash), value ? "=" : "", strempty(value)); +} + +/* Parse a string argument into an array of Tpm2PCRValue objects. + * + * The format is zero or more entries separated by ',' or '+'. The format of each entry is described in + * tpm2_pcr_value_from_string(). This does not check for validity of the entries. */ +int tpm2_pcr_values_from_string(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values) { + const char *p = arg; + int r; + + assert(arg); + assert(ret_pcr_values); + assert(ret_n_pcr_values); + + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values = 0; + + for (;;) { + _cleanup_free_ char *pcr_arg = NULL; + r = extract_first_word(&p, &pcr_arg, ",+", /* flags= */ 0); + if (r < 0) + return log_debug_errno(r, "Could not parse pcr values '%s': %m", p); + if (r == 0) + break; + + Tpm2PCRValue pcr_value; + r = tpm2_pcr_value_from_string(pcr_arg, &pcr_value); + if (r < 0) + return r; + + if (!GREEDY_REALLOC_APPEND(pcr_values, n_pcr_values, &pcr_value, 1)) + return log_oom_debug(); + } + + *ret_pcr_values = TAKE_PTR(pcr_values); + *ret_n_pcr_values = n_pcr_values; + + return 0; +} + +/* Return a string representing the array of PCR values. The format is as described in + * tpm2_pcr_values_from_string(). This does not check for validity. */ +char *tpm2_pcr_values_to_string(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + _cleanup_free_ char *s = NULL; + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) { + _cleanup_free_ char *pcrstr = tpm2_pcr_value_to_string(v); + if (!pcrstr || !strextend_with_separator(&s, "+", pcrstr)) + return NULL; + } + + return s ? 
TAKE_PTR(s) : strdup(""); +} + +void tpm2_log_debug_tpml_pcr_selection(const TPML_PCR_SELECTION *l, const char *msg) { + if (!DEBUG_LOGGING || !l) + return; + + _cleanup_free_ char *s = tpm2_tpml_pcr_selection_to_string(l); + log_debug("%s: %s", msg ?: "PCR selection", strna(s)); +} + +void tpm2_log_debug_pcr_value(const Tpm2PCRValue *pcr_value, const char *msg) { + if (!DEBUG_LOGGING || !pcr_value) + return; + + _cleanup_free_ char *s = tpm2_pcr_value_to_string(pcr_value); + log_debug("%s: %s", msg ?: "PCR value", strna(s)); +} + +void tpm2_log_debug_buffer(const void *buffer, size_t size, const char *msg) { + if (!DEBUG_LOGGING || !buffer || size == 0) + return; + + _cleanup_free_ char *h = hexmem(buffer, size); + log_debug("%s: %s", msg ?: "Buffer", strna(h)); +} + +void tpm2_log_debug_digest(const TPM2B_DIGEST *digest, const char *msg) { + if (digest) + tpm2_log_debug_buffer(digest->buffer, digest->size, msg ?: "Digest"); +} + +void tpm2_log_debug_name(const TPM2B_NAME *name, const char *msg) { + if (name) + tpm2_log_debug_buffer(name->name, name->size, msg ?: "Name"); +} + +static int tpm2_get_policy_digest( + Tpm2Context *c, + const Tpm2Handle *session, + TPM2B_DIGEST **ret_policy_digest) { + + TSS2_RC rc; + + if (!DEBUG_LOGGING && !ret_policy_digest) + return 0; + + assert(c); + assert(session); + + log_debug("Acquiring policy digest."); + + _cleanup_(Esys_Freep) TPM2B_DIGEST *policy_digest = NULL; + rc = sym_Esys_PolicyGetDigest( + c->esys_context, + session->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + &policy_digest); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to get policy digest from TPM: %s", sym_Tss2_RC_Decode(rc)); + + tpm2_log_debug_digest(policy_digest, "Session policy digest"); + + if (ret_policy_digest) + *ret_policy_digest = TAKE_PTR(policy_digest); + + return 0; +} + +int tpm2_create_primary( + Tpm2Context *c, + const Tpm2Handle *session, + const TPM2B_PUBLIC *template, + 
const TPM2B_SENSITIVE_CREATE *sensitive, + TPM2B_PUBLIC **ret_public, + Tpm2Handle **ret_handle) { + + usec_t ts; + TSS2_RC rc; + int r; + + assert(c); + assert(template); + + log_debug("Creating primary key on TPM."); + + ts = now(CLOCK_MONOTONIC); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_handle_new(c, &handle); + if (r < 0) + return r; + + _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; + rc = sym_Esys_CreatePrimary( + c->esys_context, + ESYS_TR_RH_OWNER, + session ? session->esys_handle : ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + sensitive ? sensitive : &(TPM2B_SENSITIVE_CREATE) {}, + template, + /* outsideInfo= */ NULL, + &(TPML_PCR_SELECTION) {}, + &handle->esys_handle, + &public, + /* creationData= */ NULL, + /* creationHash= */ NULL, + /* creationTicket= */ NULL); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to generate primary key in TPM: %s", + sym_Tss2_RC_Decode(rc)); + + log_debug("Successfully created primary key on TPM in %s.", + FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - ts, USEC_PER_MSEC)); + + if (ret_public) + *ret_public = TAKE_PTR(public); + if (ret_handle) + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +/* Create a TPM object. Do not use this to create primary keys, because some HW TPMs refuse to allow that; + * instead use tpm2_create_primary(). */ +int tpm2_create(Tpm2Context *c, + const Tpm2Handle *parent, + const Tpm2Handle *session, + const TPMT_PUBLIC *template, + const TPMS_SENSITIVE_CREATE *sensitive, + TPM2B_PUBLIC **ret_public, + TPM2B_PRIVATE **ret_private) { + + usec_t ts; + TSS2_RC rc; + + assert(c); + assert(parent); + assert(template); + + log_debug("Creating object on TPM."); + + ts = now(CLOCK_MONOTONIC); + + TPM2B_PUBLIC tpm2b_public = { + .size = sizeof(*template) - sizeof(template->unique), + .publicArea = *template, + }; + + /* Zero the unique area. 
*/ + zero(tpm2b_public.publicArea.unique); + + TPM2B_SENSITIVE_CREATE tpm2b_sensitive; + if (sensitive) + tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) { + .size = sizeof(*sensitive), + .sensitive = *sensitive, + }; + else + tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) {}; + + _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; + _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL; + rc = sym_Esys_Create( + c->esys_context, + parent->esys_handle, + session ? session->esys_handle : ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + &tpm2b_sensitive, + &tpm2b_public, + /* outsideInfo= */ NULL, + &(TPML_PCR_SELECTION) {}, + &private, + &public, + /* creationData= */ NULL, + /* creationHash= */ NULL, + /* creationTicket= */ NULL); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to generate object in TPM: %s", + sym_Tss2_RC_Decode(rc)); + + log_debug("Successfully created object on TPM in %s.", + FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - ts, USEC_PER_MSEC)); + + if (ret_public) + *ret_public = TAKE_PTR(public); + if (ret_private) + *ret_private = TAKE_PTR(private); + + return 0; +} + +int tpm2_load( + Tpm2Context *c, + const Tpm2Handle *parent, + const Tpm2Handle *session, + const TPM2B_PUBLIC *public, + const TPM2B_PRIVATE *private, + Tpm2Handle **ret_handle) { + + TSS2_RC rc; + int r; + + assert(c); + assert(public); + assert(private); + assert(ret_handle); + + log_debug("Loading object into TPM."); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_handle_new(c, &handle); + if (r < 0) + return r; + + rc = sym_Esys_Load( + c->esys_context, + parent ? parent->esys_handle : ESYS_TR_RH_OWNER, + session ? 
session->esys_handle : ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + private, + public, + &handle->esys_handle); + if (rc == TPM2_RC_LOCKOUT) + return log_debug_errno(SYNTHETIC_ERRNO(ENOLCK), + "TPM2 device is in dictionary attack lockout mode."); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to load key into TPM: %s", sym_Tss2_RC_Decode(rc)); + + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +static int tpm2_load_external( + Tpm2Context *c, + const Tpm2Handle *session, + const TPM2B_PUBLIC *public, + const TPM2B_SENSITIVE *private, + Tpm2Handle **ret_handle) { + + TSS2_RC rc; + int r; + + assert(c); + assert(ret_handle); + + log_debug("Loading external key into TPM."); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_handle_new(c, &handle); + if (r < 0) + return r; + + rc = sym_Esys_LoadExternal( + c->esys_context, + session ? session->esys_handle : ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + private, + public, +#if HAVE_TSS2_ESYS3 + /* tpm2-tss >= 3.0.0 requires a ESYS_TR_RH_* constant specifying the requested + * hierarchy, older versions need TPM2_RH_* instead. */ + ESYS_TR_RH_OWNER, +#else + TPM2_RH_OWNER, +#endif + &handle->esys_handle); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to load public key into TPM: %s", sym_Tss2_RC_Decode(rc)); + + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +/* This calls TPM2_CreateLoaded() directly, without checking if the TPM supports it. Callers should instead + * use tpm2_create_loaded(). 
*/ +static int _tpm2_create_loaded( + Tpm2Context *c, + const Tpm2Handle *parent, + const Tpm2Handle *session, + const TPMT_PUBLIC *template, + const TPMS_SENSITIVE_CREATE *sensitive, + TPM2B_PUBLIC **ret_public, + TPM2B_PRIVATE **ret_private, + Tpm2Handle **ret_handle) { + + usec_t ts; + TSS2_RC rc; + int r; + + assert(c); + assert(parent); + assert(template); + + log_debug("Creating loaded object on TPM."); + + ts = now(CLOCK_MONOTONIC); + + /* Copy the input template and zero the unique area. */ + TPMT_PUBLIC template_copy = *template; + zero(template_copy.unique); + + TPM2B_TEMPLATE tpm2b_template; + size_t size = 0; + rc = sym_Tss2_MU_TPMT_PUBLIC_Marshal( + &template_copy, + tpm2b_template.buffer, + sizeof(tpm2b_template.buffer), + &size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal public key template: %s", sym_Tss2_RC_Decode(rc)); + assert(size <= UINT16_MAX); + tpm2b_template.size = size; + + TPM2B_SENSITIVE_CREATE tpm2b_sensitive; + if (sensitive) + tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) { + .size = sizeof(*sensitive), + .sensitive = *sensitive, + }; + else + tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) {}; + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_handle_new(c, &handle); + if (r < 0) + return r; + + _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; + _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL; + rc = sym_Esys_CreateLoaded( + c->esys_context, + parent->esys_handle, + session ? 
session->esys_handle : ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + &tpm2b_sensitive, + &tpm2b_template, + &handle->esys_handle, + &private, + &public); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to generate loaded object in TPM: %s", + sym_Tss2_RC_Decode(rc)); + + log_debug("Successfully created loaded object on TPM in %s.", + FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - ts, USEC_PER_MSEC)); + + if (ret_public) + *ret_public = TAKE_PTR(public); + if (ret_private) + *ret_private = TAKE_PTR(private); + if (ret_handle) + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +/* This calls TPM2_CreateLoaded() if the TPM supports it, otherwise it calls TPM2_Create() and TPM2_Load() + * separately. Do not use this to create primary keys, because some HW TPMs refuse to allow that; instead use + * tpm2_create_primary(). */ +int tpm2_create_loaded( + Tpm2Context *c, + const Tpm2Handle *parent, + const Tpm2Handle *session, + const TPMT_PUBLIC *template, + const TPMS_SENSITIVE_CREATE *sensitive, + TPM2B_PUBLIC **ret_public, + TPM2B_PRIVATE **ret_private, + Tpm2Handle **ret_handle) { + + int r; + + if (tpm2_supports_command(c, TPM2_CC_CreateLoaded)) + return _tpm2_create_loaded(c, parent, session, template, sensitive, ret_public, ret_private, ret_handle); + + /* Unfortunately, this TPM doesn't support CreateLoaded (added at spec revision 130) so we need to + * create and load manually. 
*/ + _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; + _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL; + r = tpm2_create(c, parent, session, template, sensitive, &public, &private); + if (r < 0) + return r; + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_load(c, parent, session, public, private, &handle); + if (r < 0) + return r; + + if (ret_public) + *ret_public = TAKE_PTR(public); + if (ret_private) + *ret_private = TAKE_PTR(private); + if (ret_handle) + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +static int tpm2_marshal_private(const TPM2B_PRIVATE *private, void **ret, size_t *ret_size) { + size_t max_size = sizeof(*private), blob_size = 0; + _cleanup_free_ void *blob = NULL; + TSS2_RC rc; + + assert(private); + assert(ret); + assert(ret_size); + + blob = malloc0(max_size); + if (!blob) + return log_oom_debug(); + + rc = sym_Tss2_MU_TPM2B_PRIVATE_Marshal(private, blob, max_size, &blob_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal private key: %s", sym_Tss2_RC_Decode(rc)); + + *ret = TAKE_PTR(blob); + *ret_size = blob_size; + return 0; +} + +static int tpm2_unmarshal_private(const void *data, size_t size, TPM2B_PRIVATE *ret_private) { + TPM2B_PRIVATE private = {}; + size_t offset = 0; + TSS2_RC rc; + + assert(data || size == 0); + assert(ret_private); + + rc = sym_Tss2_MU_TPM2B_PRIVATE_Unmarshal(data, size, &offset, &private); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unmarshal private key: %s", sym_Tss2_RC_Decode(rc)); + if (offset != size) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Garbage at end of private key marshal data."); + + *ret_private = private; + return 0; +} + +int tpm2_marshal_public(const TPM2B_PUBLIC *public, void **ret, size_t *ret_size) { + size_t max_size = sizeof(*public), blob_size = 0; + _cleanup_free_ void *blob = NULL; + TSS2_RC rc; + + assert(public); + 
assert(ret); + assert(ret_size); + + blob = malloc0(max_size); + if (!blob) + return log_oom_debug(); + + rc = sym_Tss2_MU_TPM2B_PUBLIC_Marshal(public, blob, max_size, &blob_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal public key: %s", sym_Tss2_RC_Decode(rc)); + + *ret = TAKE_PTR(blob); + *ret_size = blob_size; + return 0; +} + +static int tpm2_unmarshal_public(const void *data, size_t size, TPM2B_PUBLIC *ret_public) { + TPM2B_PUBLIC public = {}; + size_t offset = 0; + TSS2_RC rc; + + assert(data || size == 0); + assert(ret_public); + + rc = sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal(data, size, &offset, &public); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unmarshal public key: %s", sym_Tss2_RC_Decode(rc)); + if (offset != size) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Garbage at end of public key marshal data."); + + *ret_public = public; + return 0; +} + +int tpm2_marshal_nv_public(const TPM2B_NV_PUBLIC *nv_public, void **ret, size_t *ret_size) { + size_t max_size = sizeof(*nv_public), blob_size = 0; + _cleanup_free_ void *blob = NULL; + TSS2_RC rc; + + assert(nv_public); + assert(ret); + assert(ret_size); + + blob = malloc0(max_size); + if (!blob) + return log_oom_debug(); + + rc = sym_Tss2_MU_TPM2B_NV_PUBLIC_Marshal(nv_public, blob, max_size, &blob_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal NV public structure: %s", sym_Tss2_RC_Decode(rc)); + + *ret = TAKE_PTR(blob); + *ret_size = blob_size; + return 0; +} + +int tpm2_unmarshal_nv_public(const void *data, size_t size, TPM2B_NV_PUBLIC *ret_nv_public) { + TPM2B_NV_PUBLIC nv_public = {}; + size_t offset = 0; + TSS2_RC rc; + + assert(data || size == 0); + assert(ret_nv_public); + + rc = sym_Tss2_MU_TPM2B_NV_PUBLIC_Unmarshal(data, size, &offset, &nv_public); + if (rc != TSS2_RC_SUCCESS) + return 
log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unmarshal NV public structure: %s", sym_Tss2_RC_Decode(rc)); + if (offset != size) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Garbage at end of NV public structure marshal data."); + + *ret_nv_public = nv_public; + return 0; +} + +static int tpm2_import( + Tpm2Context *c, + const Tpm2Handle *parent, + const Tpm2Handle *session, + const TPM2B_PUBLIC *public, + const TPM2B_PRIVATE *private, + const TPM2B_ENCRYPTED_SECRET *seed, + const TPM2B_DATA *encryption_key, + const TPMT_SYM_DEF_OBJECT *symmetric, + TPM2B_PRIVATE **ret_private) { + + TSS2_RC rc; + + assert(c); + assert(parent); + assert(!!encryption_key == !!symmetric); + assert(public); + assert(private); + assert(seed); + assert(ret_private); + + log_debug("Importing key into TPM."); + + rc = sym_Esys_Import( + c->esys_context, + parent->esys_handle, + session ? session->esys_handle : ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + encryption_key, + public, + private, + seed, + symmetric ?: &(TPMT_SYM_DEF_OBJECT){ .algorithm = TPM2_ALG_NULL, }, + ret_private); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to import key into TPM: %s", sym_Tss2_RC_Decode(rc)); + + return 0; +} + +/* Read hash values from the specified PCR selection. Provides a Tpm2PCRValue array that contains all + * requested PCR values, in the order provided by the TPM. Normally, the provided pcr values will match + * exactly what is in the provided selection, but the TPM may ignore some selected PCRs (for example, if an + * unimplemented PCR index is requested), in which case those PCRs will be absent from the provided pcr + * values. 
*/ +int tpm2_pcr_read( + Tpm2Context *c, + const TPML_PCR_SELECTION *pcr_selection, + Tpm2PCRValue **ret_pcr_values, + size_t *ret_n_pcr_values) { + + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values = 0; + TSS2_RC rc; + + assert(c); + assert(pcr_selection); + assert(ret_pcr_values); + assert(ret_n_pcr_values); + + TPML_PCR_SELECTION remaining = *pcr_selection; + while (!tpm2_tpml_pcr_selection_is_empty(&remaining)) { + _cleanup_(Esys_Freep) TPML_PCR_SELECTION *current_read = NULL; + _cleanup_(Esys_Freep) TPML_DIGEST *current_values = NULL; + + tpm2_log_debug_tpml_pcr_selection(&remaining, "Reading PCR selection"); + + /* Unfortunately, PCR_Read will not return more than 8 values. */ + rc = sym_Esys_PCR_Read( + c->esys_context, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + &remaining, + NULL, + ¤t_read, + ¤t_values); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to read TPM2 PCRs: %s", sym_Tss2_RC_Decode(rc)); + + tpm2_log_debug_tpml_pcr_selection(current_read, "Read PCR selection"); + + if (tpm2_tpml_pcr_selection_is_empty(current_read)) { + log_debug("TPM2 refused to read possibly unimplemented PCRs, ignoring."); + break; + } + + unsigned i = 0; + FOREACH_PCR_IN_TPML_PCR_SELECTION(index, tpms, current_read) { + assert(i < current_values->count); + Tpm2PCRValue pcr_value = { + .index = index, + .hash = tpms->hash, + .value = current_values->digests[i++], + }; + + tpm2_log_debug_pcr_value(&pcr_value, /* msg= */ NULL); + + if (!GREEDY_REALLOC_APPEND(pcr_values, n_pcr_values, &pcr_value, 1)) + return log_oom_debug(); + } + assert(i == current_values->count); + + tpm2_tpml_pcr_selection_sub(&remaining, current_read); + } + + tpm2_sort_pcr_values(pcr_values, n_pcr_values); + + if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "PCR values read from TPM are not valid."); + + *ret_pcr_values = TAKE_PTR(pcr_values); + *ret_n_pcr_values 
= n_pcr_values; + + return 0; +} + +/* Read the PCR value for each TPM2PCRValue entry in the array that does not have a value set. If all entries + * have an unset hash (i.e. hash == 0), this first detects the "best" PCR bank to use; otherwise, all entries + * must have a valid hash set. All entries must have a valid index. If this cannot read a PCR value for all + * appropriate entries, this returns an error. This does not check the array for validity. */ +int tpm2_pcr_read_missing_values(Tpm2Context *c, Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + TPMI_ALG_HASH pcr_bank = 0; + int r; + + assert(c); + assert(pcr_values || n_pcr_values == 0); + + if (n_pcr_values > 0) { + size_t hash_count; + r = tpm2_pcr_values_hash_count(pcr_values, n_pcr_values, &hash_count); + if (r < 0) + return log_debug_errno(r, "Could not get hash count from pcr values: %m"); + + if (hash_count == 1 && pcr_values[0].hash == 0) { + uint32_t mask; + r = tpm2_pcr_values_to_mask(pcr_values, n_pcr_values, 0, &mask); + if (r < 0) + return r; + + r = tpm2_get_best_pcr_bank(c, mask, &pcr_bank); + if (r < 0) + return r; + } + } + + FOREACH_ARRAY(v, pcr_values, n_pcr_values) { + if (v->hash == 0) + v->hash = pcr_bank; + + if (v->value.size > 0) + continue; + + TPML_PCR_SELECTION selection; + r = tpm2_tpml_pcr_selection_from_pcr_values(v, 1, &selection, NULL, NULL); + if (r < 0) + return r; + + _cleanup_free_ Tpm2PCRValue *read_values = NULL; + size_t n_read_values; + r = tpm2_pcr_read(c, &selection, &read_values, &n_read_values); + if (r < 0) + return r; + + if (n_read_values == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Could not read PCR hash 0x%" PRIu16 " index %u", + v->hash, v->index); + + assert(n_read_values == 1); + assert(read_values[0].hash == v->hash); + assert(read_values[0].index == v->index); + + v->value = read_values[0].value; + } + + return 0; +} + +static int tpm2_pcr_mask_good( + Tpm2Context *c, + TPMI_ALG_HASH bank, + uint32_t mask) { + + 
TPML_PCR_SELECTION selection; + int r; + + assert(c); + + /* So we have the problem that some systems might have working TPM2 chips, but the firmware doesn't + * actually measure into them, or only into a suboptimal bank. If so, the PCRs should be all zero or + * all 0xFF. Detect that, so that we can warn and maybe pick a better bank. */ + + tpm2_tpml_pcr_selection_from_mask(mask, bank, &selection); + + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values; + r = tpm2_pcr_read(c, &selection, &pcr_values, &n_pcr_values); + if (r < 0) + return r; + + /* If at least one of the selected PCR values is something other than all 0x00 or all 0xFF we are happy. */ + FOREACH_ARRAY(v, pcr_values, n_pcr_values) + if (!memeqbyte(0x00, v->value.buffer, v->value.size) && + !memeqbyte(0xFF, v->value.buffer, v->value.size)) + return true; + + return false; +} + +static int tpm2_bank_has24(const TPMS_PCR_SELECTION *selection) { + + assert(selection); + + /* As per https://trustedcomputinggroup.org/wp-content/uploads/TCG_PCClient_PFP_r1p05_v23_pub.pdf a + * TPM2 on a Client PC must have at least 24 PCRs. If this TPM has less, just skip over it. */ + if (selection->sizeofSelect < TPM2_PCRS_MAX/8) { + log_debug("Skipping TPM2 PCR bank %s with fewer than 24 PCRs.", + strna(tpm2_hash_alg_to_string(selection->hash))); + return false; + } + + assert_cc(TPM2_PCRS_MAX % 8 == 0); + + /* It's not enough to check how many PCRs there are, we also need to check that the 24 are + * enabled for this bank. Otherwise this TPM doesn't qualify. 
*/ + bool valid = true; + for (size_t j = 0; j < TPM2_PCRS_MAX/8; j++) + if (selection->pcrSelect[j] != 0xFF) { + valid = false; + break; + } + + if (!valid) + log_debug("TPM2 PCR bank %s has fewer than 24 PCR bits enabled, ignoring.", + strna(tpm2_hash_alg_to_string(selection->hash))); + + return valid; +} + +int tpm2_get_best_pcr_bank( + Tpm2Context *c, + uint32_t pcr_mask, + TPMI_ALG_HASH *ret) { + + TPMI_ALG_HASH supported_hash = 0, hash_with_valid_pcr = 0; + int r; + + assert(c); + assert(ret); + + if (pcr_mask == 0) { + log_debug("Asked to pick best PCR bank but no PCRs selected we could derive this from. Defaulting to SHA256."); + *ret = TPM2_ALG_SHA256; /* if no PCRs are selected this doesn't matter anyway... */ + return 0; + } + + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(selection, &c->capability_pcrs) { + TPMI_ALG_HASH hash = selection->hash; + int good; + + /* For now we are only interested in the SHA1 and SHA256 banks */ + if (!IN_SET(hash, TPM2_ALG_SHA256, TPM2_ALG_SHA1)) + continue; + + r = tpm2_bank_has24(selection); + if (r < 0) + return r; + if (!r) + continue; + + good = tpm2_pcr_mask_good(c, hash, pcr_mask); + if (good < 0) + return good; + + if (hash == TPM2_ALG_SHA256) { + supported_hash = TPM2_ALG_SHA256; + if (good) { + /* Great, SHA256 is supported and has initialized PCR values, we are done. */ + hash_with_valid_pcr = TPM2_ALG_SHA256; + break; + } + } else { + assert(hash == TPM2_ALG_SHA1); + + if (supported_hash == 0) + supported_hash = TPM2_ALG_SHA1; + + if (good && hash_with_valid_pcr == 0) + hash_with_valid_pcr = TPM2_ALG_SHA1; + } + } + + /* We preferably pick SHA256, but only if its PCRs are initialized or neither the SHA1 nor the SHA256 + * PCRs are initialized. If SHA256 is not supported but SHA1 is and its PCRs are too, we prefer + * SHA1. + * + * We log at LOG_NOTICE level whenever we end up using the SHA1 bank or when the PCRs we bind to are + * not initialized. 
*/ + + if (hash_with_valid_pcr == TPM2_ALG_SHA256) { + assert(supported_hash == TPM2_ALG_SHA256); + log_debug("TPM2 device supports SHA256 PCR bank and SHA256 PCRs are valid, yay!"); + *ret = TPM2_ALG_SHA256; + } else if (hash_with_valid_pcr == TPM2_ALG_SHA1) { + if (supported_hash == TPM2_ALG_SHA256) + log_notice("TPM2 device supports both SHA1 and SHA256 PCR banks, but only SHA1 PCRs are valid, falling back to SHA1 bank. This reduces the security level substantially."); + else { + assert(supported_hash == TPM2_ALG_SHA1); + log_notice("TPM2 device lacks support for SHA256 PCR bank, but SHA1 bank is supported and SHA1 PCRs are valid, falling back to SHA1 bank. This reduces the security level substantially."); + } + + *ret = TPM2_ALG_SHA1; + } else if (supported_hash == TPM2_ALG_SHA256) { + log_notice("TPM2 device supports SHA256 PCR bank but none of the selected PCRs are valid! Firmware apparently did not initialize any of the selected PCRs. Proceeding anyway with SHA256 bank. PCR policy effectively unenforced!"); + *ret = TPM2_ALG_SHA256; + } else if (supported_hash == TPM2_ALG_SHA1) { + log_notice("TPM2 device lacks support for SHA256 bank, but SHA1 bank is supported, but none of the selected PCRs are valid! Firmware apparently did not initialize any of the selected PCRs. Proceeding anyway with SHA1 bank. PCR policy effectively unenforced!"); + *ret = TPM2_ALG_SHA1; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "TPM2 module supports neither SHA1 nor SHA256 PCR banks, cannot operate."); + + return 0; +} + +int tpm2_get_good_pcr_banks( + Tpm2Context *c, + uint32_t pcr_mask, + TPMI_ALG_HASH **ret) { + + _cleanup_free_ TPMI_ALG_HASH *good_banks = NULL, *fallback_banks = NULL; + size_t n_good_banks = 0, n_fallback_banks = 0; + int r; + + assert(c); + assert(ret); + + FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(selection, &c->capability_pcrs) { + TPMI_ALG_HASH hash = selection->hash; + + /* Let's see if this bank is superficially OK, i.e. 
has at least 24 enabled registers */ + r = tpm2_bank_has24(selection); + if (r < 0) + return r; + if (!r) + continue; + + /* Let's now see if this bank has any of the selected PCRs actually initialized */ + r = tpm2_pcr_mask_good(c, hash, pcr_mask); + if (r < 0) + return r; + + if (n_good_banks + n_fallback_banks >= INT_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "Too many good TPM2 banks?"); + + if (r) { + if (!GREEDY_REALLOC(good_banks, n_good_banks+1)) + return log_oom_debug(); + + good_banks[n_good_banks++] = hash; + } else { + if (!GREEDY_REALLOC(fallback_banks, n_fallback_banks+1)) + return log_oom_debug(); + + fallback_banks[n_fallback_banks++] = hash; + } + } + + /* Preferably, use the good banks (i.e. the ones the PCR values are actually initialized so + * far). Otherwise use the fallback banks (i.e. which exist and are enabled, but so far not used. */ + if (n_good_banks > 0) { + log_debug("Found %zu fully initialized TPM2 banks.", n_good_banks); + *ret = TAKE_PTR(good_banks); + return (int) n_good_banks; + } + if (n_fallback_banks > 0) { + log_debug("Found %zu enabled but un-initialized TPM2 banks.", n_fallback_banks); + *ret = TAKE_PTR(fallback_banks); + return (int) n_fallback_banks; + } + + /* No suitable banks found. 
*/ + *ret = NULL; + return 0; +} + +int tpm2_get_good_pcr_banks_strv( + Tpm2Context *c, + uint32_t pcr_mask, + char ***ret) { + +#if HAVE_OPENSSL + _cleanup_free_ TPMI_ALG_HASH *algs = NULL; + _cleanup_strv_free_ char **l = NULL; + int n_algs; + + assert(c); + assert(ret); + + n_algs = tpm2_get_good_pcr_banks(c, pcr_mask, &algs); + if (n_algs < 0) + return n_algs; + + FOREACH_ARRAY(a, algs, n_algs) { + _cleanup_free_ char *n = NULL; + const EVP_MD *implementation; + const char *salg; + + salg = tpm2_hash_alg_to_string(*a); + if (!salg) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM2 operates with unknown PCR algorithm, can't measure."); + + implementation = EVP_get_digestbyname(salg); + if (!implementation) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM2 operates with unsupported PCR algorithm, can't measure."); + + n = strdup(ASSERT_PTR(EVP_MD_name(implementation))); + if (!n) + return log_oom_debug(); + + ascii_strlower(n); /* OpenSSL uses uppercase digest names, we prefer them lower case. */ + + if (strv_consume(&l, TAKE_PTR(n)) < 0) + return log_oom_debug(); + } + + *ret = TAKE_PTR(l); + return 0; +#else /* HAVE_OPENSSL */ + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled."); +#endif +} + +/* Hash data into the digest. + * + * If 'extend' is true, the hashing operation starts with the existing digest hash (and the digest is + * required to have a hash and its size must be correct). If 'extend' is false, the digest size is + * initialized to the correct size for 'alg' and the hashing operation does not include any existing digest + * hash. If 'extend' is false and no data is provided, the digest is initialized to a zero digest. + * + * On success, the digest hash will be updated with the hashing operation result and the digest size will be + * correct for 'alg'. + * + * This currently only provides SHA256, so 'alg' must be TPM2_ALG_SHA256. 
*/
int tpm2_digest_many(
                TPMI_ALG_HASH alg,
                TPM2B_DIGEST *digest,
                const struct iovec data[],
                size_t n_data,
                bool extend) {

        assert(digest);
        assert(data || n_data == 0);

        if (alg != TPM2_ALG_SHA256)
                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                       "Hash algorithm not supported: 0x%x", alg);

        /* When extending, the digest passed in must already be a complete SHA256 value. */
        if (extend && digest->size != SHA256_DIGEST_SIZE)
                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                       "Digest size 0x%x, require 0x%x",
                                       digest->size, (unsigned)SHA256_DIGEST_SIZE);

        /* Only SHA256 is handled here, hence the output buffer size can be verified at build time. */
        assert_cc(sizeof(digest->buffer) >= SHA256_DIGEST_SIZE);

        /* The hashing state is wiped again when we leave this function. */
        struct sha256_ctx hash_state;
        CLEANUP_ERASE(hash_state);

        sha256_init_ctx(&hash_state);

        if (!extend) {
                /* Start from scratch: an all-zero digest of the right size. */
                *digest = (TPM2B_DIGEST) {
                        .size = SHA256_DIGEST_SIZE,
                };

                /* Neither a previous value nor new data to hash, so the zero digest is the result. */
                if (n_data == 0)
                        return 0;
        } else
                /* The previous digest value is the first thing fed into the hash. */
                sha256_process_bytes(digest->buffer, digest->size, &hash_state);

        for (size_t i = 0; i < n_data; i++)
                sha256_process_bytes(data[i].iov_base, data[i].iov_len, &hash_state);

        sha256_finish_ctx(&hash_state, digest->buffer);

        return 0;
}

/* Same as tpm2_digest_many() but data is contained in TPM2B_DIGEST[]. The digests may be any size digests. 
*/
int tpm2_digest_many_digests(
                TPMI_ALG_HASH alg,
                TPM2B_DIGEST *digest,
                const TPM2B_DIGEST data[],
                size_t n_data,
                bool extend) {

        assert(data || n_data == 0);

        /* One iovec per input digest; each just references the digest's buffer, nothing is copied. */
        _cleanup_free_ struct iovec *vec = new(struct iovec, n_data);
        if (!vec)
                return log_oom_debug();

        const TPM2B_DIGEST *src = data;
        for (struct iovec *dst = vec; dst < vec + n_data; dst++, src++)
                *dst = IOVEC_MAKE((void*) src->buffer, src->size);

        /* The actual hashing is left to the iovec based variant. */
        return tpm2_digest_many(alg, digest, vec, n_data, extend);
}

/* This hashes the provided pin into a digest value, but also verifies that the final byte is not 0, because
 * the TPM specification Part 1 ("Architecture") section Authorization Values (subsection "Authorization Size
 * Convention") states "Trailing octets of zero are to be removed from any string before it is used as an
 * authValue". Since the TPM doesn't know if the auth value is a "string" or just a hash digest, any hash
 * digest that randomly happens to end in 0 must have the final 0(s) trimmed.
 *
 * This is required at 2 points. First, when setting the authValue during creation of new sealed objects, in
 * tpm2_seal(). This only applies to newly created objects, of course. Second, when using a previously
 * created sealed object that has an authValue set, we use the sealed objects as the session bind key. This
 * requires calling SetAuth so tpm2-tss can correctly calculate the HMAC to use for the encryption session.
 *
 * TPM implementations will perform the trimming for any authValue for existing sealed objects, so the
 * tpm2-tss library must also perform the trimming before HMAC calculation, but it does not yet; this bug is
 * open to add the trimming: https://github.com/tpm2-software/tpm2-tss/issues/2664
 *
 * Until our minimum tpm2-tss version contains a fix for that bug, we must perform the trimming
 * ourselves. 
Note that since we are trimming, which is exactly what a TPM implementation would do, this will + * work for both existing objects with a authValue ending in 0(s) as well as new sealed objects we create, + * which we will trim the 0(s) from before sending to the TPM. + */ +static void tpm2_trim_auth_value(TPM2B_AUTH *auth) { + bool trimmed = false; + + assert(auth); + + while (auth->size > 0 && auth->buffer[auth->size - 1] == 0) { + trimmed = true; + auth->size--; + } + + if (trimmed) + log_debug("authValue ends in 0, trimming as required by the TPM2 specification Part 1 section 'HMAC Computation' authValue Note 2."); +} + +int tpm2_get_pin_auth(TPMI_ALG_HASH hash, const char *pin, TPM2B_AUTH *ret_auth) { + TPM2B_AUTH auth = {}; + int r; + + assert(pin); + assert(ret_auth); + + r = tpm2_digest_buffer(hash, &auth, pin, strlen(pin), /* extend= */ false); + if (r < 0) + return r; + + tpm2_trim_auth_value(&auth); + + *ret_auth = TAKE_STRUCT(auth); + + return 0; +} + +int tpm2_set_auth_binary(Tpm2Context *c, const Tpm2Handle *handle, const TPM2B_AUTH *auth) { + TSS2_RC rc; + + assert(c); + assert(handle); + + if (!auth) + return 0; + + rc = sym_Esys_TR_SetAuth(c->esys_context, handle->esys_handle, auth); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to load PIN in TPM: %s", sym_Tss2_RC_Decode(rc)); + + return 0; +} + +int tpm2_set_auth(Tpm2Context *c, const Tpm2Handle *handle, const char *pin) { + TPM2B_AUTH auth = {}; + int r; + + assert(c); + assert(handle); + + if (!pin) + return 0; + + CLEANUP_ERASE(auth); + + r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &auth); + if (r < 0) + return r; + + return tpm2_set_auth_binary(c, handle, &auth); +} + +static bool tpm2_is_encryption_session(Tpm2Context *c, const Tpm2Handle *session) { + TPMA_SESSION flags = 0; + TSS2_RC rc; + + assert(c); + assert(session); + + rc = sym_Esys_TRSess_GetAttributes(c->esys_context, session->esys_handle, &flags); + if (rc != TSS2_RC_SUCCESS) + 
return false; + + return (flags & TPMA_SESSION_DECRYPT) && (flags & TPMA_SESSION_ENCRYPT); +} + +int tpm2_make_encryption_session( + Tpm2Context *c, + const Tpm2Handle *primary, + const Tpm2Handle *bind_key, + Tpm2Handle **ret_session) { + + const TPMA_SESSION sessionAttributes = TPMA_SESSION_DECRYPT | TPMA_SESSION_ENCRYPT | + TPMA_SESSION_CONTINUESESSION; + TSS2_RC rc; + int r; + + assert(c); + assert(primary); + assert(ret_session); + + log_debug("Starting HMAC encryption session."); + + /* Start a salted, unbound HMAC session with a well-known key (e.g. primary key) as tpmKey, which + * means that the random salt will be encrypted with the well-known key. That way, only the TPM can + * recover the salt, which is then used for key derivation. */ + _cleanup_(tpm2_handle_freep) Tpm2Handle *session = NULL; + r = tpm2_handle_new(c, &session); + if (r < 0) + return r; + + rc = sym_Esys_StartAuthSession( + c->esys_context, + primary->esys_handle, + bind_key ? bind_key->esys_handle : ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + NULL, + TPM2_SE_HMAC, + &SESSION_TEMPLATE_SYM_AES_128_CFB, + TPM2_ALG_SHA256, + &session->esys_handle); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to open session in TPM: %s", sym_Tss2_RC_Decode(rc)); + + /* Enable parameter encryption/decryption with AES in CFB mode. Together with HMAC digests (which are + * always used for sessions), this provides confidentiality, integrity and replay protection for + * operations that use this session. 
*/ + rc = sym_Esys_TRSess_SetAttributes(c->esys_context, session->esys_handle, sessionAttributes, 0xff); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to configure TPM session: %s", sym_Tss2_RC_Decode(rc)); + + *ret_session = TAKE_PTR(session); + + return 0; +} + +int tpm2_make_policy_session( + Tpm2Context *c, + const Tpm2Handle *primary, + const Tpm2Handle *encryption_session, + Tpm2Handle **ret_session) { + + TSS2_RC rc; + int r; + + assert(c); + assert(primary); + assert(encryption_session); + assert(ret_session); + + if (!tpm2_is_encryption_session(c, encryption_session)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Missing encryption session"); + + log_debug("Starting policy session."); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *session = NULL; + r = tpm2_handle_new(c, &session); + if (r < 0) + return r; + + rc = sym_Esys_StartAuthSession( + c->esys_context, + primary->esys_handle, + ESYS_TR_NONE, + encryption_session->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + NULL, + TPM2_SE_POLICY, + &SESSION_TEMPLATE_SYM_AES_128_CFB, + TPM2_ALG_SHA256, + &session->esys_handle); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to open session in TPM: %s", sym_Tss2_RC_Decode(rc)); + + *ret_session = TAKE_PTR(session); + + return 0; +} + +static int find_signature( + JsonVariant *v, + const TPML_PCR_SELECTION *pcr_selection, + const void *fp, + size_t fp_size, + const void *policy, + size_t policy_size, + void *ret_signature, + size_t *ret_signature_size) { + +#if HAVE_OPENSSL + JsonVariant *b, *i; + const char *k; + int r; + + /* Searches for a signature blob in the specified JSON object. Search keys are PCR bank, PCR mask, + * public key, and policy digest. 
*/ + + if (!json_variant_is_object(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature is not a JSON object."); + + uint16_t pcr_bank = pcr_selection->pcrSelections[0].hash; + uint32_t pcr_mask = tpm2_tpml_pcr_selection_to_mask(pcr_selection, pcr_bank); + + k = tpm2_hash_alg_to_string(pcr_bank); + if (!k) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Don't know PCR bank %" PRIu16, pcr_bank); + + /* First, find field by bank */ + b = json_variant_by_key(v, k); + if (!b) + return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), "Signature lacks data for PCR bank '%s'.", k); + + if (!json_variant_is_array(b)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Bank data is not a JSON array."); + + /* Now iterate through all signatures known for this bank */ + JSON_VARIANT_ARRAY_FOREACH(i, b) { + _cleanup_free_ void *fpj_data = NULL, *polj_data = NULL; + JsonVariant *maskj, *fpj, *sigj, *polj; + size_t fpj_size, polj_size; + uint32_t parsed_mask; + + if (!json_variant_is_object(i)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Bank data element is not a JSON object"); + + /* Check if the PCR mask matches our expectations */ + maskj = json_variant_by_key(i, "pcrs"); + if (!maskj) + continue; + + r = tpm2_parse_pcr_json_array(maskj, &parsed_mask); + if (r < 0) + return log_debug_errno(r, "Failed to parse JSON PCR mask"); + + if (parsed_mask != pcr_mask) + continue; /* Not for this PCR mask */ + + /* Then check if this is for the public key we operate with */ + fpj = json_variant_by_key(i, "pkfp"); + if (!fpj) + continue; + + r = json_variant_unhex(fpj, &fpj_data, &fpj_size); + if (r < 0) + return log_debug_errno(r, "Failed to decode fingerprint in JSON data: %m"); + + if (memcmp_nn(fp, fp_size, fpj_data, fpj_size) != 0) + continue; /* Not for this public key */ + + /* Finally, check if this is for the PCR policy we expect this to be */ + polj = json_variant_by_key(i, "pol"); + if (!polj) + continue; + + r = json_variant_unhex(polj, &polj_data, 
&polj_size); + if (r < 0) + return log_debug_errno(r, "Failed to decode policy hash JSON data: %m"); + + if (memcmp_nn(policy, policy_size, polj_data, polj_size) != 0) + continue; + + /* This entry matches all our expectations, now return the signature included in it */ + sigj = json_variant_by_key(i, "sig"); + if (!sigj) + continue; + + return json_variant_unbase64(sigj, ret_signature, ret_signature_size); + } + + return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), "Couldn't find signature for this PCR bank, PCR index and public key."); +#else /* HAVE_OPENSSL */ + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled."); +#endif +} + +/* Calculates the "name" of a public key. + * + * As specified in TPM2 spec "Part 1: Architecture", a key's "name" is its nameAlg value followed by a hash + * of its TPM2 public area, all properly marshalled. This allows a key's "name" to be dependent not only on + * the key fingerprint, but also on the TPM2-specific fields that associated with the key (i.e. all fields in + * TPMT_PUBLIC). Note that this means an existing key may not change any of its TPMT_PUBLIC fields, since + * that would also change the key name. + * + * Since we (currently) hardcode to always using SHA256 for hashing, this returns an error if the public key + * nameAlg is not TPM2_ALG_SHA256. 
*/
int tpm2_calculate_pubkey_name(const TPMT_PUBLIC *public, TPM2B_NAME *ret_name) {
        TSS2_RC rc;
        int r;

        assert(public);
        assert(ret_name);

        r = dlopen_tpm2();
        if (r < 0)
                return log_debug_errno(r, "TPM2 support not installed: %m");

        if (public->nameAlg != TPM2_ALG_SHA256)
                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                       "Unsupported nameAlg: 0x%x",
                                       public->nameAlg);

        _cleanup_free_ uint8_t *buf = NULL;
        size_t size = 0;

        /* The buffer is sized after the in-memory structure and handed to the marshaller as its limit. */
        buf = (uint8_t*) new(TPMT_PUBLIC, 1);
        if (!buf)
                return log_oom_debug();

        rc = sym_Tss2_MU_TPMT_PUBLIC_Marshal(public, buf, sizeof(TPMT_PUBLIC), &size);
        if (rc != TSS2_RC_SUCCESS)
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
                                       "Failed to marshal public key: %s", sym_Tss2_RC_Decode(rc));

        /* Hash the marshalled public area... */
        TPM2B_DIGEST digest = {};
        r = tpm2_digest_buffer(TPM2_ALG_SHA256, &digest, buf, size, /* extend= */ false);
        if (r < 0)
                return r;

        /* ...and prefix it with the hash algorithm by marshalling it as TPMT_HA. */
        TPMT_HA ha = {
                .hashAlg = TPM2_ALG_SHA256,
        };
        assert(digest.size <= sizeof(ha.digest.sha256));
        memcpy_safe(ha.digest.sha256, digest.buffer, digest.size);

        /* Zero-initialize, so that the bytes beyond name.size that get copied to the caller below are
         * well-defined rather than leftover stack contents. */
        TPM2B_NAME name = {};
        size = 0;
        rc = sym_Tss2_MU_TPMT_HA_Marshal(&ha, name.name, sizeof(name.name), &size);
        if (rc != TSS2_RC_SUCCESS)
                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
                                       "Failed to marshal key name: %s", sym_Tss2_RC_Decode(rc));
        name.size = size;

        tpm2_log_debug_name(&name, "Calculated public key name");

        *ret_name = name;

        return 0;
}

/* Get the "name" of a key from the TPM.
 *
 * The "name" of a key is explained above in tpm2_calculate_pubkey_name().
 *
 * The handle must reference a key already present in the TPM. It may be either a public key only, or a
 * public/private keypair.
*/ +static int tpm2_get_name( + Tpm2Context *c, + const Tpm2Handle *handle, + TPM2B_NAME **ret_name) { + + _cleanup_(Esys_Freep) TPM2B_NAME *name = NULL; + TSS2_RC rc; + + assert(c); + assert(handle); + assert(ret_name); + + rc = sym_Esys_TR_GetName(c->esys_context, handle->esys_handle, &name); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to get name of public key from TPM: %s", sym_Tss2_RC_Decode(rc)); + + tpm2_log_debug_name(name, "Object name"); + + *ret_name = TAKE_PTR(name); + + return 0; +} + +int tpm2_calculate_nv_index_name(const TPMS_NV_PUBLIC *nvpublic, TPM2B_NAME *ret_name) { + TSS2_RC rc; + int r; + + assert(nvpublic); + assert(ret_name); + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + if (nvpublic->nameAlg != TPM2_ALG_SHA256) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Unsupported nameAlg: 0x%x", + nvpublic->nameAlg); + + _cleanup_free_ uint8_t *buf = NULL; + size_t size = 0; + + buf = (uint8_t*) new(TPMS_NV_PUBLIC, 1); + if (!buf) + return log_oom_debug(); + + rc = sym_Tss2_MU_TPMS_NV_PUBLIC_Marshal(nvpublic, buf, sizeof(TPMS_NV_PUBLIC), &size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal NV index: %s", sym_Tss2_RC_Decode(rc)); + + TPM2B_DIGEST digest = {}; + r = tpm2_digest_buffer(TPM2_ALG_SHA256, &digest, buf, size, /* extend= */ false); + if (r < 0) + return r; + + TPMT_HA ha = { + .hashAlg = TPM2_ALG_SHA256, + }; + assert(digest.size <= sizeof(ha.digest.sha256)); + memcpy_safe(ha.digest.sha256, digest.buffer, digest.size); + + TPM2B_NAME name; + size = 0; + rc = sym_Tss2_MU_TPMT_HA_Marshal(&ha, name.name, sizeof(name.name), &size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal NV index name: %s", sym_Tss2_RC_Decode(rc)); + name.size = size; + + tpm2_log_debug_name(&name, "Calculated NV index name"); + + 
*ret_name = name; + + return 0; +} + +/* Extend 'digest' with the PolicyAuthValue calculated hash. */ +int tpm2_calculate_policy_auth_value(TPM2B_DIGEST *digest) { + TPM2_CC command = TPM2_CC_PolicyAuthValue; + TSS2_RC rc; + int r; + + assert(digest); + assert(digest->size == SHA256_DIGEST_SIZE); + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + uint8_t buf[sizeof(command)]; + size_t offset = 0; + + rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal PolicyAuthValue command: %s", sym_Tss2_RC_Decode(rc)); + + if (offset != sizeof(command)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Offset 0x%zx wrong after marshalling PolicyAuthValue command", offset); + + r = tpm2_digest_buffer(TPM2_ALG_SHA256, digest, buf, offset, /* extend= */ true); + if (r < 0) + return r; + + tpm2_log_debug_digest(digest, "PolicyAuthValue calculated digest"); + + return 0; +} + +int tpm2_policy_auth_value( + Tpm2Context *c, + const Tpm2Handle *session, + TPM2B_DIGEST **ret_policy_digest) { + + TSS2_RC rc; + + assert(c); + assert(session); + + log_debug("Submitting AuthValue policy."); + + rc = sym_Esys_PolicyAuthValue( + c->esys_context, + session->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to add authValue policy to TPM: %s", + sym_Tss2_RC_Decode(rc)); + + return tpm2_get_policy_digest(c, session, ret_policy_digest); +} + +int tpm2_calculate_policy_authorize_nv( + const TPM2B_NV_PUBLIC *public_info, + TPM2B_DIGEST *digest) { + TPM2_CC command = TPM2_CC_PolicyAuthorizeNV; + TSS2_RC rc; + int r; + + assert(public_info); + assert(digest); + assert(digest->size == SHA256_DIGEST_SIZE); + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + 
uint8_t buf[sizeof(command)]; + size_t offset = 0; + + rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal PolicyAuthorizeNV command: %s", sym_Tss2_RC_Decode(rc)); + + if (offset != sizeof(command)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Offset 0x%zx wrong after marshalling PolicyAuthorizeNV command", offset); + + TPM2B_NV_PUBLIC public_info_copy = *public_info; /* Make a copy, since we must set TPMA_NV_WRITTEN for the calculation */ + public_info_copy.nvPublic.attributes |= TPMA_NV_WRITTEN; + + TPM2B_NAME name = {}; + r = tpm2_calculate_nv_index_name(&public_info_copy.nvPublic, &name); + if (r < 0) + return r; + + struct iovec data[] = { + IOVEC_MAKE(buf, offset), + IOVEC_MAKE(name.name, name.size), + }; + + r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, ELEMENTSOF(data), /* extend= */ true); + if (r < 0) + return r; + + tpm2_log_debug_digest(digest, "PolicyAuthorizeNV calculated digest"); + + return 0; +} + +int tpm2_policy_authorize_nv( + Tpm2Context *c, + const Tpm2Handle *session, + const Tpm2Handle *nv_handle, + TPM2B_DIGEST **ret_policy_digest) { + + TSS2_RC rc; + + assert(c); + assert(session); + + log_debug("Submitting AuthorizeNV policy."); + + rc = sym_Esys_PolicyAuthorizeNV( + c->esys_context, + ESYS_TR_RH_OWNER, + nv_handle->esys_handle, + session->esys_handle, + ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to add AuthorizeNV policy to TPM: %s", + sym_Tss2_RC_Decode(rc)); + + return tpm2_get_policy_digest(c, session, ret_policy_digest); +} + +int tpm2_policy_or( + Tpm2Context *c, + const Tpm2Handle *session, + const TPM2B_DIGEST *branches, size_t n_branches, + TPM2B_DIGEST **ret_policy_digest) { + + TPML_DIGEST hash_list; + TSS2_RC rc; + + assert(c); + assert(session); + + if (n_branches > 
ELEMENTSOF(hash_list.digests)) + return -EOPNOTSUPP; + + log_debug("Submitting OR policy."); + + hash_list = (TPML_DIGEST) { + .count = n_branches, + }; + + memcpy(hash_list.digests, branches, n_branches * sizeof(TPM2B_DIGEST)); + + if (DEBUG_LOGGING) + for (size_t i = 0; i < hash_list.count; i++) { + _cleanup_free_ char *h = hexmem(hash_list.digests[i].buffer, hash_list.digests[i].size); + log_debug("Submitting OR Branch #%zu: %s", i, h); + } + + rc = sym_Esys_PolicyOR( + c->esys_context, + session->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + &hash_list); + if (rc != TSS2_RC_SUCCESS) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to add OR policy to TPM: %s", + sym_Tss2_RC_Decode(rc)); + + return tpm2_get_policy_digest(c, session, ret_policy_digest); +} + +/* Extend 'digest' with the PolicyOR calculated hash. */ +int tpm2_calculate_policy_or(const TPM2B_DIGEST *branches, size_t n_branches, TPM2B_DIGEST *digest) { + TPM2_CC command = TPM2_CC_PolicyOR; + TSS2_RC rc; + int r; + + assert(digest); + assert(digest->size == SHA256_DIGEST_SIZE); + + if (n_branches == 0) + return -EINVAL; + if (n_branches == 1) + log_warning("PolicyOR with a single branch submitted, this is weird."); + if (n_branches > 8) + return -E2BIG; + + r = dlopen_tpm2(); + if (r < 0) + return log_error_errno(r, "TPM2 support not installed: %m"); + + uint8_t buf[sizeof(command)]; + size_t offset = 0; + + rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset); + if (rc != TSS2_RC_SUCCESS) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal PolicyOR command: %s", sym_Tss2_RC_Decode(rc)); + + if (offset != sizeof(command)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Offset 0x%zx wrong after marshalling PolicyOR command", offset); + _cleanup_free_ struct iovec *data = new(struct iovec, 1 + n_branches); + if (!data) + return log_oom(); + + data[0] = IOVEC_MAKE(buf, offset); + for (size_t i = 0; i < 
n_branches; i++) { + data[1 + i] = IOVEC_MAKE((void*) branches[i].buffer, branches[i].size); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *h = hexmem(branches[i].buffer, branches[i].size); + log_debug("OR Branch #%zu: %s", i, h); + } + } + + /* PolicyOR does not use the previous hash value; we must zero and then extend it. */ + zero(digest->buffer); + + r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, 1 + n_branches, /* extend= */ true); + if (r < 0) + return r; + + tpm2_log_debug_digest(digest, "PolicyOR calculated digest"); + + return 0; +} + +/* Extend 'digest' with the PolicyPCR calculated hash. */ +int tpm2_calculate_policy_pcr( + const Tpm2PCRValue *pcr_values, + size_t n_pcr_values, + TPM2B_DIGEST *digest) { + + TPM2_CC command = TPM2_CC_PolicyPCR; + TSS2_RC rc; + int r; + + assert(pcr_values || n_pcr_values == 0); + assert(digest); + assert(digest->size == SHA256_DIGEST_SIZE); + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + TPML_PCR_SELECTION pcr_selection; + _cleanup_free_ TPM2B_DIGEST *values = NULL; + size_t n_values; + r = tpm2_tpml_pcr_selection_from_pcr_values(pcr_values, n_pcr_values, &pcr_selection, &values, &n_values); + if (r < 0) + return log_debug_errno(r, "Could not convert PCR values to TPML_PCR_SELECTION: %m"); + + TPM2B_DIGEST hash = {}; + r = tpm2_digest_many_digests(TPM2_ALG_SHA256, &hash, values, n_values, /* extend= */ false); + if (r < 0) + return r; + + _cleanup_free_ uint8_t *buf = NULL; + size_t size = 0, maxsize = sizeof(command) + sizeof(pcr_selection); + + buf = malloc(maxsize); + if (!buf) + return log_oom_debug(); + + rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, maxsize, &size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal PolicyPCR command: %s", sym_Tss2_RC_Decode(rc)); + + rc = sym_Tss2_MU_TPML_PCR_SELECTION_Marshal(&pcr_selection, buf, maxsize, &size); + if (rc != TSS2_RC_SUCCESS) + return 
log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal PCR selection: %s", sym_Tss2_RC_Decode(rc)); + + struct iovec data[] = { + IOVEC_MAKE(buf, size), + IOVEC_MAKE(hash.buffer, hash.size), + }; + r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, ELEMENTSOF(data), /* extend= */ true); + if (r < 0) + return r; + + tpm2_log_debug_digest(digest, "PolicyPCR calculated digest"); + + return 0; +} + +int tpm2_policy_pcr( + Tpm2Context *c, + const Tpm2Handle *session, + const TPML_PCR_SELECTION *pcr_selection, + TPM2B_DIGEST **ret_policy_digest) { + + TSS2_RC rc; + + assert(c); + assert(session); + assert(pcr_selection); + + log_debug("Submitting PCR hash policy."); + + rc = sym_Esys_PolicyPCR( + c->esys_context, + session->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + NULL, + pcr_selection); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to add PCR policy to TPM: %s", sym_Tss2_RC_Decode(rc)); + + return tpm2_get_policy_digest(c, session, ret_policy_digest); +} + +/* Extend 'digest' with the PolicyAuthorize calculated hash. 
*/ +int tpm2_calculate_policy_authorize( + const TPM2B_PUBLIC *public, + const TPM2B_DIGEST *policy_ref, + TPM2B_DIGEST *digest) { + + TPM2_CC command = TPM2_CC_PolicyAuthorize; + TSS2_RC rc; + int r; + + assert(public); + assert(digest); + assert(digest->size == SHA256_DIGEST_SIZE); + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + uint8_t buf[sizeof(command)]; + size_t offset = 0; + + rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal PolicyAuthorize command: %s", sym_Tss2_RC_Decode(rc)); + + if (offset != sizeof(command)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Offset 0x%zx wrong after marshalling PolicyAuthorize command", offset); + + TPM2B_NAME name = {}; + r = tpm2_calculate_pubkey_name(&public->publicArea, &name); + if (r < 0) + return r; + + /* PolicyAuthorize does not use the previous hash value; we must zero and then extend it. */ + zero(digest->buffer); + + struct iovec data[] = { + IOVEC_MAKE(buf, offset), + IOVEC_MAKE(name.name, name.size), + }; + r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, ELEMENTSOF(data), /* extend= */ true); + if (r < 0) + return r; + + /* PolicyAuthorize requires hashing twice; this is either an extension or rehashing. 
*/ + if (policy_ref) + r = tpm2_digest_many_digests(TPM2_ALG_SHA256, digest, policy_ref, 1, /* extend= */ true); + else + r = tpm2_digest_rehash(TPM2_ALG_SHA256, digest); + if (r < 0) + return r; + + tpm2_log_debug_digest(digest, "PolicyAuthorize calculated digest"); + + return 0; +} + +static int tpm2_policy_authorize( + Tpm2Context *c, + const Tpm2Handle *session, + TPML_PCR_SELECTION *pcr_selection, + const TPM2B_PUBLIC *public, + const void *fp, + size_t fp_size, + JsonVariant *signature_json, + TPM2B_DIGEST **ret_policy_digest) { + + TSS2_RC rc; + int r; + + assert(c); + assert(session); + assert(pcr_selection); + assert(public); + assert(fp && fp_size > 0); + + log_debug("Adding PCR signature policy."); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *pubkey_handle = NULL; + r = tpm2_load_external(c, NULL, public, NULL, &pubkey_handle); + if (r < 0) + return r; + + /* Acquire the "name" of what we just loaded */ + _cleanup_(Esys_Freep) TPM2B_NAME *pubkey_name = NULL; + r = tpm2_get_name(c, pubkey_handle, &pubkey_name); + if (r < 0) + return r; + + /* If we have a signature, proceed with verifying the PCR digest */ + const TPMT_TK_VERIFIED *check_ticket; + _cleanup_(Esys_Freep) TPMT_TK_VERIFIED *check_ticket_buffer = NULL; + _cleanup_(Esys_Freep) TPM2B_DIGEST *approved_policy = NULL; + if (signature_json) { + r = tpm2_policy_pcr( + c, + session, + pcr_selection, + &approved_policy); + if (r < 0) + return r; + + _cleanup_free_ void *signature_raw = NULL; + size_t signature_size; + + r = find_signature( + signature_json, + pcr_selection, + fp, fp_size, + approved_policy->buffer, + approved_policy->size, + &signature_raw, + &signature_size); + if (r < 0) + return r; + + /* TPM2_VerifySignature() will only verify the RSA part of the RSA+SHA256 signature, + * hence we need to do the SHA256 part ourselves, first */ + TPM2B_DIGEST signature_hash = *approved_policy; + r = tpm2_digest_rehash(TPM2_ALG_SHA256, &signature_hash); + if (r < 0) + return r; + + r = 
TPM2B_PUBLIC_KEY_RSA_CHECK_SIZE(signature_size); + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Signature larger than buffer."); + + TPMT_SIGNATURE policy_signature = { + .sigAlg = TPM2_ALG_RSASSA, + .signature.rsassa = { + .hash = TPM2_ALG_SHA256, + .sig = TPM2B_PUBLIC_KEY_RSA_MAKE(signature_raw, signature_size), + }, + }; + + rc = sym_Esys_VerifySignature( + c->esys_context, + pubkey_handle->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + &signature_hash, + &policy_signature, + &check_ticket_buffer); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to validate signature in TPM: %s", sym_Tss2_RC_Decode(rc)); + + check_ticket = check_ticket_buffer; + } else { + /* When enrolling, we pass a NULL ticket */ + static const TPMT_TK_VERIFIED check_ticket_null = { + .tag = TPM2_ST_VERIFIED, + .hierarchy = TPM2_RH_OWNER, + }; + + check_ticket = &check_ticket_null; + } + + rc = sym_Esys_PolicyAuthorize( + c->esys_context, + session->esys_handle, + ESYS_TR_NONE, + ESYS_TR_NONE, + ESYS_TR_NONE, + approved_policy, + /* policyRef= */ &(const TPM2B_NONCE) {}, + pubkey_name, + check_ticket); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to push Authorize policy into TPM: %s", sym_Tss2_RC_Decode(rc)); + + return tpm2_get_policy_digest(c, session, ret_policy_digest); +} + +/* Extend 'digest' with the calculated policy hash. 
*/ +int tpm2_calculate_sealing_policy( + const Tpm2PCRValue *pcr_values, + size_t n_pcr_values, + const TPM2B_PUBLIC *public, + bool use_pin, + const Tpm2PCRLockPolicy *pcrlock_policy, + TPM2B_DIGEST *digest) { + + int r; + + assert(pcr_values || n_pcr_values == 0); + assert(digest); + + if (public && pcrlock_policy) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Policies with both signed PCR and pcrlock are currently not supported."); + + if (public) { + r = tpm2_calculate_policy_authorize(public, NULL, digest); + if (r < 0) + return r; + } + + if (pcrlock_policy) { + TPM2B_NV_PUBLIC nv_public; + + r = tpm2_unmarshal_nv_public( + pcrlock_policy->nv_public.iov_base, + pcrlock_policy->nv_public.iov_len, + &nv_public); + if (r < 0) + return r; + + r = tpm2_calculate_policy_authorize_nv(&nv_public, digest); + if (r < 0) + return r; + } + + if (n_pcr_values > 0) { + r = tpm2_calculate_policy_pcr(pcr_values, n_pcr_values, digest); + if (r < 0) + return r; + } + + if (use_pin) { + r = tpm2_calculate_policy_auth_value(digest); + if (r < 0) + return r; + } + + return 0; +} + +static int tpm2_build_sealing_policy( + Tpm2Context *c, + const Tpm2Handle *session, + uint32_t hash_pcr_mask, + uint16_t pcr_bank, + const TPM2B_PUBLIC *public, + const void *fp, + size_t fp_size, + uint32_t pubkey_pcr_mask, + JsonVariant *signature_json, + bool use_pin, + const Tpm2PCRLockPolicy *pcrlock_policy, + TPM2B_DIGEST **ret_policy_digest) { + + int r; + + assert(c); + assert(session); + assert(pubkey_pcr_mask == 0 || public); + + log_debug("Building sealing policy."); + + if ((hash_pcr_mask | pubkey_pcr_mask) != 0) { + r = tpm2_pcr_mask_good(c, pcr_bank, hash_pcr_mask|pubkey_pcr_mask); + if (r < 0) + return r; + if (r == 0) + log_debug("Selected TPM2 PCRs are not initialized on this system."); + } + + if (pubkey_pcr_mask != 0 && pcrlock_policy) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Policies with both signed PCR and pcrlock are currently not supported."); + + if 
(pubkey_pcr_mask != 0) { + TPML_PCR_SELECTION pcr_selection; + tpm2_tpml_pcr_selection_from_mask(pubkey_pcr_mask, (TPMI_ALG_HASH)pcr_bank, &pcr_selection); + r = tpm2_policy_authorize(c, session, &pcr_selection, public, fp, fp_size, signature_json, NULL); + if (r < 0) + return r; + } + + if (pcrlock_policy) { + _cleanup_(tpm2_handle_freep) Tpm2Handle *nv_handle = NULL; + + r = tpm2_policy_super_pcr( + c, + session, + &pcrlock_policy->prediction, + pcrlock_policy->algorithm); + if (r < 0) + return r; + + r = tpm2_deserialize( + c, + pcrlock_policy->nv_handle.iov_base, + pcrlock_policy->nv_handle.iov_len, + &nv_handle); + if (r < 0) + return r; + + r = tpm2_policy_authorize_nv( + c, + session, + nv_handle, + NULL); + if (r < 0) + return r; + } + + if (hash_pcr_mask != 0) { + TPML_PCR_SELECTION pcr_selection; + tpm2_tpml_pcr_selection_from_mask(hash_pcr_mask, (TPMI_ALG_HASH)pcr_bank, &pcr_selection); + r = tpm2_policy_pcr(c, session, &pcr_selection, NULL); + if (r < 0) + return r; + } + + if (use_pin) { + r = tpm2_policy_auth_value(c, session, NULL); + if (r < 0) + return r; + } + + r = tpm2_get_policy_digest(c, session, ret_policy_digest); + if (r < 0) + return r; + + return 0; +} + +#if HAVE_OPENSSL +static const struct { + TPM2_ECC_CURVE tpm2_ecc_curve_id; + int openssl_ecc_curve_id; +} tpm2_openssl_ecc_curve_table[] = { + { TPM2_ECC_NIST_P192, NID_X9_62_prime192v1, }, + { TPM2_ECC_NIST_P224, NID_secp224r1, }, + { TPM2_ECC_NIST_P256, NID_X9_62_prime256v1, }, + { TPM2_ECC_NIST_P384, NID_secp384r1, }, + { TPM2_ECC_NIST_P521, NID_secp521r1, }, + { TPM2_ECC_SM2_P256, NID_sm2, }, +}; + +static int tpm2_ecc_curve_from_openssl_curve_id(int openssl_ecc_curve_id, TPM2_ECC_CURVE *ret) { + assert(ret); + + FOREACH_ARRAY(t, tpm2_openssl_ecc_curve_table, ELEMENTSOF(tpm2_openssl_ecc_curve_table)) + if (t->openssl_ecc_curve_id == openssl_ecc_curve_id) { + *ret = t->tpm2_ecc_curve_id; + return 0; + } + + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "OpenSSL ECC curve id 
%d not supported.", openssl_ecc_curve_id); +} + +static int tpm2_ecc_curve_to_openssl_curve_id(TPM2_ECC_CURVE tpm2_ecc_curve_id, int *ret) { + assert(ret); + + FOREACH_ARRAY(t, tpm2_openssl_ecc_curve_table, ELEMENTSOF(tpm2_openssl_ecc_curve_table)) + if (t->tpm2_ecc_curve_id == tpm2_ecc_curve_id) { + *ret = t->openssl_ecc_curve_id; + return 0; + } + + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "TPM2 ECC curve %u not supported.", tpm2_ecc_curve_id); +} + +#define TPM2_RSA_DEFAULT_EXPONENT UINT32_C(0x10001) + +int tpm2_tpm2b_public_to_openssl_pkey(const TPM2B_PUBLIC *public, EVP_PKEY **ret) { + int r; + + assert(public); + assert(ret); + + const TPMT_PUBLIC *p = &public->publicArea; + switch (p->type) { + case TPM2_ALG_ECC: { + int curve_id; + r = tpm2_ecc_curve_to_openssl_curve_id(p->parameters.eccDetail.curveID, &curve_id); + if (r < 0) + return r; + + const TPMS_ECC_POINT *point = &p->unique.ecc; + return ecc_pkey_from_curve_x_y( + curve_id, + point->x.buffer, + point->x.size, + point->y.buffer, + point->y.size, + ret); + } + case TPM2_ALG_RSA: { + /* TPM specification Part 2 ("Structures") section for TPMS_RSA_PARAMS states "An exponent of + * zero indicates that the exponent is the default of 2^16 + 1". */ + uint32_t exponent = htobe32(p->parameters.rsaDetail.exponent ?: TPM2_RSA_DEFAULT_EXPONENT); + return rsa_pkey_from_n_e( + p->unique.rsa.buffer, + p->unique.rsa.size, + &exponent, + sizeof(exponent), + ret); + } + default: + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "TPM2 asymmetric algorithm 0x%" PRIx16 " not supported.", p->type); + } +} + +/* Be careful before changing anything in this function, as the TPM key "name" is calculated using the entire + * TPMT_PUBLIC (after marshalling), and that "name" is used (for example) to calculate the policy hash for + * the Authorize policy. 
So we must ensure this conversion of a PEM to TPM2B_PUBLIC does not change the + * "name", because it would break unsealing of previously-sealed objects that used (for example) + * tpm2_calculate_policy_authorize(). See bug #30546. */ +int tpm2_tpm2b_public_from_openssl_pkey(const EVP_PKEY *pkey, TPM2B_PUBLIC *ret) { + int key_id, r; + + assert(pkey); + assert(ret); + + TPMT_PUBLIC public = { + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = TPMA_OBJECT_DECRYPT | TPMA_OBJECT_SIGN_ENCRYPT | TPMA_OBJECT_USERWITHAUTH, + .parameters.asymDetail = { + .symmetric.algorithm = TPM2_ALG_NULL, + .scheme.scheme = TPM2_ALG_NULL, + }, + }; + +#if OPENSSL_VERSION_MAJOR >= 3 + key_id = EVP_PKEY_get_id(pkey); +#else + key_id = EVP_PKEY_id(pkey); +#endif + + switch (key_id) { + case EVP_PKEY_EC: { + public.type = TPM2_ALG_ECC; + + int curve_id; + _cleanup_free_ void *x = NULL, *y = NULL; + size_t x_size, y_size; + r = ecc_pkey_to_curve_x_y(pkey, &curve_id, &x, &x_size, &y, &y_size); + if (r < 0) + return log_debug_errno(r, "Could not get ECC key curve/x/y: %m"); + + TPM2_ECC_CURVE curve; + r = tpm2_ecc_curve_from_openssl_curve_id(curve_id, &curve); + if (r < 0) + return r; + + public.parameters.eccDetail.curveID = curve; + + public.parameters.eccDetail.kdf.scheme = TPM2_ALG_NULL; + + r = TPM2B_ECC_PARAMETER_CHECK_SIZE(x_size); + if (r < 0) + return log_debug_errno(r, "ECC key x size %zu too large.", x_size); + + public.unique.ecc.x = TPM2B_ECC_PARAMETER_MAKE(x, x_size); + + r = TPM2B_ECC_PARAMETER_CHECK_SIZE(y_size); + if (r < 0) + return log_debug_errno(r, "ECC key y size %zu too large.", y_size); + + public.unique.ecc.y = TPM2B_ECC_PARAMETER_MAKE(y, y_size); + + break; + } + case EVP_PKEY_RSA: { + public.type = TPM2_ALG_RSA; + + _cleanup_free_ void *n = NULL, *e = NULL; + size_t n_size, e_size; + r = rsa_pkey_to_n_e(pkey, &n, &n_size, &e, &e_size); + if (r < 0) + return log_debug_errno(r, "Could not get RSA key n/e: %m"); + + r = TPM2B_PUBLIC_KEY_RSA_CHECK_SIZE(n_size); + if (r 
< 0) + return log_debug_errno(r, "RSA key n size %zu too large.", n_size); + + public.unique.rsa = TPM2B_PUBLIC_KEY_RSA_MAKE(n, n_size); + public.parameters.rsaDetail.keyBits = n_size * 8; + + if (sizeof(uint32_t) < e_size) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "RSA key e size %zu too large.", e_size); + + uint32_t exponent = 0; + memcpy(&exponent, e, e_size); + exponent = be32toh(exponent) >> (32 - e_size * 8); + + /* TPM specification Part 2 ("Structures") section for TPMS_RSA_PARAMS states "An exponent of + * zero indicates that the exponent is the default of 2^16 + 1". However, we have no reason + * to special case it in our PEM->TPM2B_PUBLIC conversion, and doing so could break backwards + * compatibility, so even if it is the "default" value of 0x10001, we do not set it to 0. */ + public.parameters.rsaDetail.exponent = exponent; + + break; + } + default: + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "EVP_PKEY type %d not supported.", key_id); + } + + *ret = (TPM2B_PUBLIC) { + .size = sizeof(public), + .publicArea = public, + }; + + return 0; +} +#endif + +int tpm2_tpm2b_public_to_fingerprint( + const TPM2B_PUBLIC *public, + void **ret_fingerprint, + size_t *ret_fingerprint_size) { + +#if HAVE_OPENSSL + int r; + + assert(public); + assert(ret_fingerprint); + assert(ret_fingerprint_size); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + r = tpm2_tpm2b_public_to_openssl_pkey(public, &pkey); + if (r < 0) + return r; + + /* Hardcode fingerprint to SHA256 */ + return pubkey_fingerprint(pkey, EVP_sha256(), ret_fingerprint, ret_fingerprint_size); +#else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled."); +#endif +} + +int tpm2_tpm2b_public_from_pem(const void *pem, size_t pem_size, TPM2B_PUBLIC *ret) { +#if HAVE_OPENSSL + int r; + + assert(pem); + assert(ret); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + r = openssl_pkey_from_pem(pem, pem_size, &pkey); + if (r < 0) + return r; + + return 
tpm2_tpm2b_public_from_openssl_pkey(pkey, ret); +#else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled."); +#endif +} + +/* Marshal the public, private, and seed objects into a single nonstandard 'blob'. The public and private + * objects are required, while the seed is optional. This is not a (publicly) standard format, this is + * specific to how we currently store the sealed object. This 'blob' can be unmarshalled by + * tpm2_unmarshal_blob(). */ +int tpm2_marshal_blob( + const TPM2B_PUBLIC *public, + const TPM2B_PRIVATE *private, + const TPM2B_ENCRYPTED_SECRET *seed, + void **ret_blob, + size_t *ret_blob_size) { + + TSS2_RC rc; + + assert(public); + assert(private); + assert(ret_blob); + assert(ret_blob_size); + + size_t max_size = sizeof(*private) + sizeof(*public); + if (seed) + max_size += sizeof(*seed); + + _cleanup_free_ void *blob = malloc(max_size); + if (!blob) + return log_oom_debug(); + + size_t blob_size = 0; + rc = sym_Tss2_MU_TPM2B_PRIVATE_Marshal(private, blob, max_size, &blob_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal private key: %s", sym_Tss2_RC_Decode(rc)); + + rc = sym_Tss2_MU_TPM2B_PUBLIC_Marshal(public, blob, max_size, &blob_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal public key: %s", sym_Tss2_RC_Decode(rc)); + + if (seed) { + rc = sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Marshal(seed, blob, max_size, &blob_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal encrypted seed: %s", sym_Tss2_RC_Decode(rc)); + } + + *ret_blob = TAKE_PTR(blob); + *ret_blob_size = blob_size; + + return 0; +} + +/* Unmarshal the 'blob' into public, private, and seed objects. The public and private objects are required + * in the 'blob', while the seed is optional. 
This is not a (publicly) standard format, this is specific to + * how we currently store the sealed object. This expects the 'blob' to have been created by + * tpm2_marshal_blob(). */ +int tpm2_unmarshal_blob( + const void *blob, + size_t blob_size, + TPM2B_PUBLIC *ret_public, + TPM2B_PRIVATE *ret_private, + TPM2B_ENCRYPTED_SECRET *ret_seed) { + + TSS2_RC rc; + + assert(blob); + assert(ret_public); + assert(ret_private); + assert(ret_seed); + + TPM2B_PRIVATE private = {}; + size_t offset = 0; + rc = sym_Tss2_MU_TPM2B_PRIVATE_Unmarshal(blob, blob_size, &offset, &private); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unmarshal private key: %s", sym_Tss2_RC_Decode(rc)); + + TPM2B_PUBLIC public = {}; + rc = sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal(blob, blob_size, &offset, &public); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unmarshal public key: %s", sym_Tss2_RC_Decode(rc)); + + TPM2B_ENCRYPTED_SECRET seed = {}; + if (blob_size > offset) { + rc = sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Unmarshal(blob, blob_size, &offset, &seed); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unmarshal encrypted seed: %s", sym_Tss2_RC_Decode(rc)); + } + + *ret_public = public; + *ret_private = private; + *ret_seed = seed; + + return 0; +} + +/* Calculate a serialized handle. Once the upstream tpm2-tss library provides an api to do this, we can + * remove this function. 
The addition of this functionality in tpm2-tss may be tracked here: + * https://github.com/tpm2-software/tpm2-tss/issues/2575 */ +int tpm2_calculate_serialize( + TPM2_HANDLE handle, + const TPM2B_NAME *name, + const TPM2B_PUBLIC *public, + void **ret_serialized, + size_t *ret_serialized_size) { + + TSS2_RC rc; + + assert(name); + assert(public); + assert(ret_serialized); + assert(ret_serialized_size); + + size_t max_size = sizeof(TPM2_HANDLE) + sizeof(TPM2B_NAME) + sizeof(uint32_t) + sizeof(TPM2B_PUBLIC); + _cleanup_free_ void *serialized = malloc(max_size); + if (!serialized) + return log_oom_debug(); + + size_t serialized_size = 0; + rc = sym_Tss2_MU_TPM2_HANDLE_Marshal(handle, serialized, max_size, &serialized_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal tpm handle: %s", sym_Tss2_RC_Decode(rc)); + + rc = sym_Tss2_MU_TPM2B_NAME_Marshal(name, serialized, max_size, &serialized_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal name: %s", sym_Tss2_RC_Decode(rc)); + + /* This is defined (non-publicly) in the tpm2-tss source as IESYSC_KEY_RSRC, to a value of "1". */ + rc = sym_Tss2_MU_UINT32_Marshal(UINT32_C(1), serialized, max_size, &serialized_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal esys resource id: %s", sym_Tss2_RC_Decode(rc)); + + rc = sym_Tss2_MU_TPM2B_PUBLIC_Marshal(public, serialized, max_size, &serialized_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal public: %s", sym_Tss2_RC_Decode(rc)); + + *ret_serialized = TAKE_PTR(serialized); + *ret_serialized_size = serialized_size; + + return 0; +} + +/* Serialize a handle. This produces a binary object that can be later deserialized (by the same TPM), even + * across restarts of the TPM or reboots (assuming the handle is persistent). 
*/ +int tpm2_serialize( + Tpm2Context *c, + const Tpm2Handle *handle, + void **ret_serialized, + size_t *ret_serialized_size) { + + TSS2_RC rc; + + assert(c); + assert(handle); + assert(ret_serialized); + assert(ret_serialized_size); + + _cleanup_(Esys_Freep) unsigned char *serialized = NULL; + size_t size = 0; + rc = sym_Esys_TR_Serialize(c->esys_context, handle->esys_handle, &serialized, &size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to serialize: %s", sym_Tss2_RC_Decode(rc)); + + *ret_serialized = TAKE_PTR(serialized); + *ret_serialized_size = size; + + return 0; +} + +int tpm2_deserialize( + Tpm2Context *c, + const void *serialized, + size_t serialized_size, + Tpm2Handle **ret_handle) { + + TSS2_RC rc; + int r; + + assert(c); + assert(serialized); + assert(ret_handle); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + r = tpm2_handle_new(c, &handle); + if (r < 0) + return r; + + /* Since this is an existing handle in the TPM we should not implicitly flush it. */ + handle->flush = false; + + rc = sym_Esys_TR_Deserialize(c->esys_context, serialized, serialized_size, &handle->esys_handle); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to deserialize: %s", sym_Tss2_RC_Decode(rc)); + + *ret_handle = TAKE_PTR(handle); + + return 0; +} + +#if HAVE_OPENSSL + +/* KDFa() as defined by the TPM spec. 
*/
static int tpm2_kdfa(
                TPMI_ALG_HASH hash_alg,
                const void *key,
                size_t key_len,
                const char *label,
                const void *context,
                size_t context_len,
                size_t bits,
                void **ret_key,
                size_t *ret_key_len) {

        int r;

        assert(key);
        assert(label);
        assert(context || context_len == 0);
        assert(bits > 0);
        assert(bits <= SIZE_MAX - 7);
        assert(ret_key);
        assert(ret_key_len);

        log_debug("Calculating KDFa().");

        /* Output length in whole bytes; a trailing partial byte is handled after derivation. */
        size_t n_bytes = DIV_ROUND_UP(bits, 8);

        const char *digest_name = tpm2_hash_alg_to_string(hash_alg);
        if (!digest_name)
                return -EOPNOTSUPP;

        _cleanup_free_ void *derived = NULL;
        r = kdf_kb_hmac_derive(
                        "COUNTER",
                        digest_name,
                        key,
                        key_len,
                        label,
                        strlen(label),
                        context,
                        context_len,
                        /* seed= */ NULL,
                        /* seed_len= */ 0,
                        n_bytes,
                        &derived);
        if (r < 0)
                return r;

        /* If the number of bits results in a partial byte, the TPM spec requires we zero the unrequested
         * bits in the MSB (i.e. at index 0). From the spec Part 1 ("Architecture") section on Key
         * Derivation Function, specifically KDFa():
         *
         * "The implied return from this function is a sequence of octets with a length equal to (bits + 7) /
         * 8. If bits is not an even multiple of 8, then the returned value occupies the least significant
         * bits of the returned octet array, and the additional, high-order bits in the 0th octet are
         * CLEAR. The unused bits of the most significant octet (MSO) are masked off and not shifted." */
        size_t leftover_bits = bits % 8;
        if (leftover_bits > 0) {
                uint8_t *first_octet = derived;
                *first_octet &= 0xffu >> (8 - leftover_bits);
        }

        *ret_key = TAKE_PTR(derived);
        *ret_key_len = n_bytes;

        return 0;
}

/* KDFe() as defined by the TPM spec.
*/ +static int tpm2_kdfe( + TPMI_ALG_HASH hash_alg, + const void *shared_secret, + size_t shared_secret_len, + const char *label, + const void *context_u, + size_t context_u_size, + const void *context_v, + size_t context_v_size, + size_t bits, + void **ret_key, + size_t *ret_key_len) { + + int r; + + assert(shared_secret); + assert(label); + assert(context_u); + assert(context_v); + assert(bits > 0); + assert(bits <= SIZE_MAX - 7); + assert(ret_key); + assert(ret_key_len); + + log_debug("Calculating KDFe()."); + + size_t len = DIV_ROUND_UP(bits, 8); + + const char *hash_alg_name = tpm2_hash_alg_to_string(hash_alg); + if (!hash_alg_name) + return -EOPNOTSUPP; + + size_t info_len = strlen(label) + 1 + context_u_size + context_v_size; + _cleanup_free_ void *info = malloc(info_len); + if (!info) + return log_oom_debug(); + + void *end = mempcpy(mempcpy(stpcpy(info, label) + 1, context_u, context_u_size), context_v, context_v_size); + /* assert we copied exactly the right amount that we allocated */ + assert(end > info && (uintptr_t) end - (uintptr_t) info == info_len); + + _cleanup_free_ void *buf = NULL; + r = kdf_ss_derive( + hash_alg_name, + shared_secret, + shared_secret_len, + /* salt= */ NULL, + /* salt_size= */ 0, + info, + info_len, + len, + &buf); + if (r < 0) + return r; + + *ret_key = TAKE_PTR(buf); + *ret_key_len = len; + + return 0; +} + +static int tpm2_calculate_seal_public( + const TPM2B_PUBLIC *parent, + const TPMA_OBJECT *attributes, + const TPM2B_DIGEST *policy, + const TPM2B_DIGEST *seed, + const void *secret, + size_t secret_size, + TPM2B_PUBLIC *ret) { + + int r; + + assert(parent); + assert(seed); + assert(secret); + assert(ret); + + log_debug("Calculating public part of sealed object."); + + struct iovec data[] = { + IOVEC_MAKE((void*) seed->buffer, seed->size), + IOVEC_MAKE((void*) secret, secret_size), + }; + TPM2B_DIGEST unique; + r = tpm2_digest_many( + parent->publicArea.nameAlg, + &unique, + data, + ELEMENTSOF(data), + /* extend= */ 
false); + if (r < 0) + return r; + + *ret = (TPM2B_PUBLIC) { + .size = sizeof(TPMT_PUBLIC), + .publicArea = { + .type = TPM2_ALG_KEYEDHASH, + .nameAlg = parent->publicArea.nameAlg, + .objectAttributes = attributes ? *attributes : 0, + .authPolicy = policy ? *policy : TPM2B_DIGEST_MAKE(NULL, unique.size), + .parameters.keyedHashDetail.scheme.scheme = TPM2_ALG_NULL, + .unique.keyedHash = unique, + }, + }; + + return 0; +} + +static int tpm2_calculate_seal_private( + const TPM2B_PUBLIC *parent, + const TPM2B_NAME *name, + const char *pin, + const TPM2B_DIGEST *seed, + const void *secret, + size_t secret_size, + TPM2B_PRIVATE *ret) { + + TSS2_RC rc; + int r; + + assert(parent); + assert(name); + assert(seed); + assert(secret); + assert(ret); + + log_debug("Calculating private part of sealed object."); + + _cleanup_free_ void *storage_key = NULL; + size_t storage_key_size; + r = tpm2_kdfa(parent->publicArea.nameAlg, + seed->buffer, + seed->size, + "STORAGE", + name->name, + name->size, + (size_t) parent->publicArea.parameters.asymDetail.symmetric.keyBits.sym, + &storage_key, + &storage_key_size); + if (r < 0) + return log_debug_errno(r, "Could not calculate storage key KDFa: %m"); + + r = tpm2_hash_alg_to_size(parent->publicArea.nameAlg); + if (r < 0) + return -EOPNOTSUPP; + + size_t bits = (size_t) r * 8; + + _cleanup_free_ void *integrity_key = NULL; + size_t integrity_key_size; + r = tpm2_kdfa(parent->publicArea.nameAlg, + seed->buffer, + seed->size, + "INTEGRITY", + /* context= */ NULL, + /* n_context= */ 0, + bits, + &integrity_key, + &integrity_key_size); + if (r < 0) + return log_debug_errno(r, "Could not calculate integrity key KDFa: %m"); + + TPM2B_AUTH auth = {}; + if (pin) { + r = tpm2_get_pin_auth(parent->publicArea.nameAlg, pin, &auth); + if (r < 0) + return r; + } + + TPM2B_SENSITIVE sensitive = { + .size = sizeof(TPMT_SENSITIVE), + .sensitiveArea = { + .sensitiveType = TPM2_ALG_KEYEDHASH, + .authValue = auth, + .seedValue = *seed, + .sensitive.bits = 
TPM2B_SENSITIVE_DATA_MAKE(secret, secret_size), + }, + }; + + _cleanup_free_ void *marshalled_sensitive = malloc(sizeof(sensitive)); + if (!marshalled_sensitive) + return log_oom_debug(); + + size_t marshalled_sensitive_size = 0; + rc = sym_Tss2_MU_TPM2B_SENSITIVE_Marshal( + &sensitive, + marshalled_sensitive, + sizeof(sensitive), + &marshalled_sensitive_size); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal sensitive: %s", sym_Tss2_RC_Decode(rc)); + + const char *sym_alg = tpm2_sym_alg_to_string(parent->publicArea.parameters.asymDetail.symmetric.algorithm); + if (!sym_alg) + return -EOPNOTSUPP; + + const char *sym_mode = tpm2_sym_mode_to_string(parent->publicArea.parameters.asymDetail.symmetric.mode.sym); + if (!sym_mode) + return -EOPNOTSUPP; + + _cleanup_free_ void *encrypted_sensitive = NULL; + size_t encrypted_sensitive_size; + r = openssl_cipher( + sym_alg, + parent->publicArea.parameters.asymDetail.symmetric.keyBits.sym, + sym_mode, + storage_key, storage_key_size, + /* iv= */ NULL, /* n_iv= */ 0, + marshalled_sensitive, marshalled_sensitive_size, + &encrypted_sensitive, &encrypted_sensitive_size); + if (r < 0) + return r; + + const char *hash_alg_name = tpm2_hash_alg_to_string(parent->publicArea.nameAlg); + if (!hash_alg_name) + return -EOPNOTSUPP; + + _cleanup_free_ void *hmac_buffer = NULL; + size_t hmac_size = 0; + struct iovec hmac_data[] = { + IOVEC_MAKE((void*) encrypted_sensitive, encrypted_sensitive_size), + IOVEC_MAKE((void*) name->name, name->size), + }; + r = openssl_hmac_many( + hash_alg_name, + integrity_key, + integrity_key_size, + hmac_data, + ELEMENTSOF(hmac_data), + &hmac_buffer, + &hmac_size); + if (r < 0) + return r; + + TPM2B_DIGEST outer_hmac = TPM2B_DIGEST_MAKE(hmac_buffer, hmac_size); + + TPM2B_PRIVATE private = {}; + size_t private_size = 0; + rc = sym_Tss2_MU_TPM2B_DIGEST_Marshal( + &outer_hmac, + private.buffer, + sizeof(private.buffer), + &private_size); + if (rc != 
TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal digest: %s", sym_Tss2_RC_Decode(rc)); + private.size = private_size; + + assert(sizeof(private.buffer) - private.size >= encrypted_sensitive_size); + memcpy_safe(&private.buffer[private.size], encrypted_sensitive, encrypted_sensitive_size); + private.size += encrypted_sensitive_size; + + *ret = private; + + return 0; +} + +static int tpm2_calculate_seal_rsa_seed( + const TPM2B_PUBLIC *parent, + void **ret_seed, + size_t *ret_seed_size, + void **ret_encrypted_seed, + size_t *ret_encrypted_seed_size) { + + int r; + + assert(parent); + assert(ret_seed); + assert(ret_seed_size); + assert(ret_encrypted_seed); + assert(ret_encrypted_seed_size); + + log_debug("Calculating encrypted seed for RSA sealed object."); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *parent_pkey = NULL; + r = tpm2_tpm2b_public_to_openssl_pkey(parent, &parent_pkey); + if (r < 0) + return log_debug_errno(r, "Could not convert TPM2B_PUBLIC to OpenSSL PKEY: %m"); + + r = tpm2_hash_alg_to_size(parent->publicArea.nameAlg); + if (r < 0) + return -EOPNOTSUPP; + + size_t seed_size = (size_t) r; + + _cleanup_free_ void *seed = malloc(seed_size); + if (!seed) + return log_oom_debug(); + + r = crypto_random_bytes(seed, seed_size); + if (r < 0) + return log_debug_errno(r, "Failed to generate random seed: %m"); + + const char *hash_alg_name = tpm2_hash_alg_to_string(parent->publicArea.nameAlg); + if (!hash_alg_name) + return -EOPNOTSUPP; + + _cleanup_free_ void *encrypted_seed = NULL; + size_t encrypted_seed_size; + r = rsa_oaep_encrypt_bytes( + parent_pkey, + hash_alg_name, + "DUPLICATE", + seed, + seed_size, + &encrypted_seed, + &encrypted_seed_size); + if (r < 0) + return log_debug_errno(r, "Could not RSA-OAEP encrypt random seed: %m"); + + *ret_seed = TAKE_PTR(seed); + *ret_seed_size = seed_size; + *ret_encrypted_seed = TAKE_PTR(encrypted_seed); + *ret_encrypted_seed_size = encrypted_seed_size; + + return 0; +} + 
+static int tpm2_calculate_seal_ecc_seed( + const TPM2B_PUBLIC *parent, + void **ret_seed, + size_t *ret_seed_size, + void **ret_encrypted_seed, + size_t *ret_encrypted_seed_size) { + + TSS2_RC rc; + int r; + + assert(parent); + assert(ret_seed); + assert(ret_seed_size); + assert(ret_encrypted_seed); + assert(ret_encrypted_seed_size); + + log_debug("Calculating encrypted seed for ECC sealed object."); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *parent_pkey = NULL; + r = tpm2_tpm2b_public_to_openssl_pkey(parent, &parent_pkey); + if (r < 0) + return log_debug_errno(r, "Could not convert TPM2B_PUBLIC to OpenSSL PKEY: %m"); + + int curve_id; + r = ecc_pkey_to_curve_x_y( + parent_pkey, + &curve_id, + /* ret_x= */ NULL, /* ret_x_size= */ 0, + /* ret_y= */ NULL, /* ret_y_size= */ 0); + if (r < 0) + return r; + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + r = ecc_pkey_new(curve_id, &pkey); + if (r < 0) + return r; + + _cleanup_free_ void *shared_secret = NULL; + size_t shared_secret_size; + r = ecc_ecdh(pkey, parent_pkey, &shared_secret, &shared_secret_size); + if (r < 0) + return log_debug_errno(r, "Could not generate ECC shared secret: %m"); + + _cleanup_free_ void *x = NULL, *y = NULL; + size_t x_size, y_size; + r = ecc_pkey_to_curve_x_y(pkey, /* curve_id= */ NULL, &x, &x_size, &y, &y_size); + if (r < 0) + return log_debug_errno(r, "Could not get ECC get x/y: %m"); + + r = TPM2B_ECC_PARAMETER_CHECK_SIZE(x_size); + if (r < 0) + return log_debug_errno(r, "ECC point x size %zu is too large: %m", x_size); + + r = TPM2B_ECC_PARAMETER_CHECK_SIZE(y_size); + if (r < 0) + return log_debug_errno(r, "ECC point y size %zu is too large: %m", y_size); + + TPMS_ECC_POINT point = { + .x = TPM2B_ECC_PARAMETER_MAKE(x, x_size), + .y = TPM2B_ECC_PARAMETER_MAKE(y, y_size), + }; + + _cleanup_free_ void *encrypted_seed = malloc(sizeof(point)); + if (!encrypted_seed) + return log_oom_debug(); + + size_t encrypted_seed_size = 0; + rc = sym_Tss2_MU_TPMS_ECC_POINT_Marshal(&point, 
encrypted_seed, sizeof(point), &encrypted_seed_size); + if (rc != TPM2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal ECC point: %s", sym_Tss2_RC_Decode(rc)); + + r = tpm2_hash_alg_to_size(parent->publicArea.nameAlg); + if (r < 0) + return -EOPNOTSUPP; + + size_t bits = (size_t) r * 8; + + _cleanup_free_ void *seed = NULL; + size_t seed_size; + r = tpm2_kdfe(parent->publicArea.nameAlg, + shared_secret, + shared_secret_size, + "DUPLICATE", + x, + x_size, + parent->publicArea.unique.ecc.x.buffer, + parent->publicArea.unique.ecc.x.size, + bits, + &seed, + &seed_size); + if (r < 0) + return log_debug_errno(r, "Could not calculate KDFe: %m"); + + *ret_seed = TAKE_PTR(seed); + *ret_seed_size = seed_size; + *ret_encrypted_seed = TAKE_PTR(encrypted_seed); + *ret_encrypted_seed_size = encrypted_seed_size; + + return 0; +} + +static int tpm2_calculate_seal_seed( + const TPM2B_PUBLIC *parent, + TPM2B_DIGEST *ret_seed, + TPM2B_ENCRYPTED_SECRET *ret_encrypted_seed) { + + int r; + + assert(parent); + assert(ret_seed); + assert(ret_encrypted_seed); + + log_debug("Calculating encrypted seed for sealed object."); + + _cleanup_free_ void *seed = NULL, *encrypted_seed = NULL; + size_t seed_size, encrypted_seed_size; + if (parent->publicArea.type == TPM2_ALG_RSA) + r = tpm2_calculate_seal_rsa_seed(parent, &seed, &seed_size, &encrypted_seed, &encrypted_seed_size); + else if (parent->publicArea.type == TPM2_ALG_ECC) + r = tpm2_calculate_seal_ecc_seed(parent, &seed, &seed_size, &encrypted_seed, &encrypted_seed_size); + else + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Unsupported parent key type 0x%" PRIx16, parent->publicArea.type); + if (r < 0) + return log_debug_errno(r, "Could not calculate encrypted seed: %m"); + + *ret_seed = TPM2B_DIGEST_MAKE(seed, seed_size); + *ret_encrypted_seed = TPM2B_ENCRYPTED_SECRET_MAKE(encrypted_seed, encrypted_seed_size); + + return 0; +} + +#endif /* HAVE_OPENSSL */ + +int tpm2_calculate_seal( + 
TPM2_HANDLE parent_handle, + const TPM2B_PUBLIC *parent_public, + const TPMA_OBJECT *attributes, + const void *secret, + size_t secret_size, + const TPM2B_DIGEST *policy, + const char *pin, + void **ret_secret, + size_t *ret_secret_size, + void **ret_blob, + size_t *ret_blob_size, + void **ret_serialized_parent, + size_t *ret_serialized_parent_size) { + +#if HAVE_OPENSSL + int r; + + assert(parent_public); + assert(secret || secret_size == 0); + assert(secret || ret_secret); + assert(!(secret && ret_secret)); /* Either provide a secret, or we create one, but not both */ + assert(ret_blob); + assert(ret_blob_size); + assert(ret_serialized_parent); + assert(ret_serialized_parent_size); + + log_debug("Calculating sealed object."); + + /* Default to the SRK. */ + if (parent_handle == 0) + parent_handle = TPM2_SRK_HANDLE; + + switch (TPM2_HANDLE_TYPE(parent_handle)) { + case TPM2_HT_PERSISTENT: + case TPM2_HT_NV_INDEX: + break; + case TPM2_HT_TRANSIENT: + log_warning("Handle is transient, sealed secret may not be recoverable."); + break; + default: + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Handle 0x%" PRIx32 " not persistent, transient, or NV.", + parent_handle); + } + + _cleanup_(erase_and_freep) void *generated_secret = NULL; + if (!secret) { + /* No secret provided, generate a random secret. We use SHA256 digest length, though it can + * be up to TPM2_MAX_SEALED_DATA. The secret length is not limited to the nameAlg hash + * size. 
*/ + secret_size = TPM2_SHA256_DIGEST_SIZE; + generated_secret = malloc(secret_size); + if (!generated_secret) + return log_oom_debug(); + + r = crypto_random_bytes(generated_secret, secret_size); + if (r < 0) + return log_debug_errno(r, "Failed to generate secret key: %m"); + + secret = generated_secret; + } + + if (secret_size > TPM2_MAX_SEALED_DATA) + return log_debug_errno(SYNTHETIC_ERRNO(EOVERFLOW), + "Secret size %zu too large, limit is %d bytes.", + secret_size, TPM2_MAX_SEALED_DATA); + + TPM2B_DIGEST random_seed; + TPM2B_ENCRYPTED_SECRET seed; + r = tpm2_calculate_seal_seed(parent_public, &random_seed, &seed); + if (r < 0) + return r; + + TPM2B_PUBLIC public; + r = tpm2_calculate_seal_public(parent_public, attributes, policy, &random_seed, secret, secret_size, &public); + if (r < 0) + return r; + + TPM2B_NAME name; + r = tpm2_calculate_pubkey_name(&public.publicArea, &name); + if (r < 0) + return r; + + TPM2B_PRIVATE private; + r = tpm2_calculate_seal_private(parent_public, &name, pin, &random_seed, secret, secret_size, &private); + if (r < 0) + return r; + + _cleanup_free_ void *blob = NULL; + size_t blob_size; + r = tpm2_marshal_blob(&public, &private, &seed, &blob, &blob_size); + if (r < 0) + return log_debug_errno(r, "Could not create sealed blob: %m"); + + TPM2B_NAME parent_name; + r = tpm2_calculate_pubkey_name(&parent_public->publicArea, &parent_name); + if (r < 0) + return r; + + _cleanup_free_ void *serialized_parent = NULL; + size_t serialized_parent_size; + r = tpm2_calculate_serialize( + parent_handle, + &parent_name, + parent_public, + &serialized_parent, + &serialized_parent_size); + if (r < 0) + return r; + + if (ret_secret) + *ret_secret = TAKE_PTR(generated_secret); + if (ret_secret_size) + *ret_secret_size = secret_size; + *ret_blob = TAKE_PTR(blob); + *ret_blob_size = blob_size; + *ret_serialized_parent = TAKE_PTR(serialized_parent); + *ret_serialized_parent_size = serialized_parent_size; + + return 0; +#else /* HAVE_OPENSSL */ + return 
log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled."); +#endif +} + +int tpm2_seal(Tpm2Context *c, + uint32_t seal_key_handle, + const TPM2B_DIGEST *policy, + const char *pin, + void **ret_secret, + size_t *ret_secret_size, + void **ret_blob, + size_t *ret_blob_size, + uint16_t *ret_primary_alg, + void **ret_srk_buf, + size_t *ret_srk_buf_size) { + + uint16_t primary_alg = 0; + int r; + + assert(ret_secret); + assert(ret_secret_size); + assert(ret_blob); + assert(ret_blob_size); + + /* So here's what we do here: we connect to the TPM2 chip. It persistently contains a "seed" key that + * is randomized when the TPM2 is first initialized or reset and remains stable across boots. We + * generate a "primary" key pair derived from that (ECC if possible, RSA as fallback). Given the seed + * remains fixed this will result in the same key pair whenever we specify the exact same parameters + * for it. We then create a PCR-bound policy session, which calculates a hash on the current PCR + * values of the indexes we specify. We then generate a randomized key on the host (which is the key + * we actually enroll in the LUKS2 keyslots), which we upload into the TPM2, where it is encrypted + * with the "primary" key, taking the PCR policy session into account. We then download the encrypted + * key from the TPM2 ("sealing") and marshall it into binary form, which is ultimately placed in the + * LUKS2 JSON header. + * + * The TPM2 "seed" key and "primary" keys never leave the TPM2 chip (and cannot be extracted at + * all). The random key we enroll in LUKS2 we generate on the host using the Linux random device. It + * is stored in the LUKS2 JSON only in encrypted form with the "primary" key of the TPM2 chip, thus + * binding the unlocking to the TPM2 chip. */ + + usec_t start = now(CLOCK_MONOTONIC); + + /* We use a keyed hash object (i.e. HMAC) to store the secret key we want to use for unlocking the + * LUKS2 volume with. 
We don't ever use for HMAC/keyed hash operations however, we just use it + * because it's a key type that is universally supported and suitable for symmetric binary blobs. */ + TPMT_PUBLIC hmac_template = { + .type = TPM2_ALG_KEYEDHASH, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = TPMA_OBJECT_FIXEDTPM | TPMA_OBJECT_FIXEDPARENT, + .parameters.keyedHashDetail.scheme.scheme = TPM2_ALG_NULL, + .unique.keyedHash.size = SHA256_DIGEST_SIZE, + .authPolicy = policy ? *policy : TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE), + }; + + TPMS_SENSITIVE_CREATE hmac_sensitive = { + .data.size = hmac_template.unique.keyedHash.size, + }; + + CLEANUP_ERASE(hmac_sensitive); + + if (pin) { + r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &hmac_sensitive.userAuth); + if (r < 0) + return r; + } + + assert(sizeof(hmac_sensitive.data.buffer) >= hmac_sensitive.data.size); + + (void) tpm2_credit_random(c); + + log_debug("Generating secret key data."); + + r = crypto_random_bytes(hmac_sensitive.data.buffer, hmac_sensitive.data.size); + if (r < 0) + return log_debug_errno(r, "Failed to generate secret key: %m"); + + _cleanup_(tpm2_handle_freep) Tpm2Handle *primary_handle = NULL; + if (ret_srk_buf) { + _cleanup_(Esys_Freep) TPM2B_PUBLIC *primary_public = NULL; + + if (IN_SET(seal_key_handle, 0, TPM2_SRK_HANDLE)) { + r = tpm2_get_or_create_srk( + c, + /* session= */ NULL, + &primary_public, + /* ret_name= */ NULL, + /* ret_qname= */ NULL, + &primary_handle); + if (r < 0) + return r; + } else if (IN_SET(TPM2_HANDLE_TYPE(seal_key_handle), TPM2_HT_TRANSIENT, TPM2_HT_PERSISTENT)) { + r = tpm2_index_to_handle( + c, + seal_key_handle, + /* session= */ NULL, + &primary_public, + /* ret_name= */ NULL, + /* ret_qname= */ NULL, + &primary_handle); + if (r < 0) + return r; + if (r == 0) + /* We do NOT automatically create anything other than the SRK */ + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "No handle found at index 0x%" PRIx32, seal_key_handle); + } else + return 
log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Seal key handle 0x%" PRIx32 " is neither transient nor persistent.", + seal_key_handle); + + primary_alg = primary_public->publicArea.type; + } else { + if (seal_key_handle != 0) + log_debug("Using primary alg sealing, but seal key handle also provided; ignoring seal key handle."); + + /* TODO: force all callers to provide ret_srk_buf, so we can stop sealing with the legacy templates. */ + primary_alg = TPM2_ALG_ECC; + + TPM2B_PUBLIC template = { + .size = sizeof(TPMT_PUBLIC), + }; + r = tpm2_get_legacy_template(primary_alg, &template.publicArea); + if (r < 0) + return log_debug_errno(r, "Could not get legacy ECC template: %m"); + + if (!tpm2_supports_tpmt_public(c, &template.publicArea)) { + primary_alg = TPM2_ALG_RSA; + + r = tpm2_get_legacy_template(primary_alg, &template.publicArea); + if (r < 0) + return log_debug_errno(r, "Could not get legacy RSA template: %m"); + + if (!tpm2_supports_tpmt_public(c, &template.publicArea)) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "TPM does not support either ECC or RSA legacy template."); + } + + r = tpm2_create_primary( + c, + /* session= */ NULL, + &template, + /* sensitive= */ NULL, + /* ret_public= */ NULL, + &primary_handle); + if (r < 0) + return r; + } + + _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL; + r = tpm2_make_encryption_session(c, primary_handle, /* bind_key= */ NULL, &encryption_session); + if (r < 0) + return r; + + _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; + _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL; + r = tpm2_create(c, primary_handle, encryption_session, &hmac_template, &hmac_sensitive, &public, &private); + if (r < 0) + return r; + + _cleanup_(erase_and_freep) void *secret = NULL; + secret = memdup(hmac_sensitive.data.buffer, hmac_sensitive.data.size); + if (!secret) + return log_oom_debug(); + + log_debug("Marshalling private and public part of HMAC key."); + + _cleanup_free_ void *blob = NULL; + size_t 
blob_size = 0; + r = tpm2_marshal_blob(public, private, /* seed= */ NULL, &blob, &blob_size); + if (r < 0) + return log_debug_errno(r, "Could not create sealed blob: %m"); + + if (DEBUG_LOGGING) + log_debug("Completed TPM2 key sealing in %s.", FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - start, 1)); + + _cleanup_free_ void *srk_buf = NULL; + size_t srk_buf_size = 0; + if (ret_srk_buf) { + _cleanup_(Esys_Freep) void *tmp = NULL; + r = tpm2_serialize(c, primary_handle, &tmp, &srk_buf_size); + if (r < 0) + return r; + + /* + * make a copy since we don't want the caller to understand that + * ESYS allocated the pointer. It would make tracking what deallocator + * to use for srk_buf in which context a PITA. + */ + srk_buf = memdup(tmp, srk_buf_size); + if (!srk_buf) + return log_oom_debug(); + + *ret_srk_buf = TAKE_PTR(srk_buf); + *ret_srk_buf_size = srk_buf_size; + } + + *ret_secret = TAKE_PTR(secret); + *ret_secret_size = hmac_sensitive.data.size; + *ret_blob = TAKE_PTR(blob); + *ret_blob_size = blob_size; + + if (ret_primary_alg) + *ret_primary_alg = primary_alg; + + return 0; +} + +#define RETRY_UNSEAL_MAX 30u + +int tpm2_unseal(Tpm2Context *c, + uint32_t hash_pcr_mask, + uint16_t pcr_bank, + const void *pubkey, + size_t pubkey_size, + uint32_t pubkey_pcr_mask, + JsonVariant *signature, + const char *pin, + const Tpm2PCRLockPolicy *pcrlock_policy, + uint16_t primary_alg, + const void *blob, + size_t blob_size, + const void *known_policy_hash, + size_t known_policy_hash_size, + const void *srk_buf, + size_t srk_buf_size, + void **ret_secret, + size_t *ret_secret_size) { + + TSS2_RC rc; + int r; + + assert(blob); + assert(blob_size > 0); + assert(known_policy_hash_size == 0 || known_policy_hash); + assert(pubkey_size == 0 || pubkey); + assert(ret_secret); + assert(ret_secret_size); + + assert(TPM2_PCR_MASK_VALID(hash_pcr_mask)); + assert(TPM2_PCR_MASK_VALID(pubkey_pcr_mask)); + + /* So here's what we do here: We connect to the TPM2 chip. 
As we do when sealing we generate a + * "primary" key on the TPM2 chip, with the same parameters as well as a PCR-bound policy session. + * Given we pass the same parameters, this will result in the same "primary" key, and same policy + * hash (the latter of course, only if the PCR values didn't change in between). We unmarshal the + * encrypted key we stored in the LUKS2 JSON token header and upload it into the TPM2, where it is + * decrypted if the seed and the PCR policy were right ("unsealing"). We then download the result, + * and use it to unlock the LUKS2 volume. */ + + usec_t start = now(CLOCK_MONOTONIC); + + TPM2B_PUBLIC public; + TPM2B_PRIVATE private; + TPM2B_ENCRYPTED_SECRET seed = {}; + r = tpm2_unmarshal_blob(blob, blob_size, &public, &private, &seed); + if (r < 0) + return log_debug_errno(r, "Could not extract parts from blob: %m"); + + /* Older code did not save the pcr_bank, and unsealing needed to detect the best pcr bank to use, + * so we need to handle that legacy situation. */ + if (pcr_bank == UINT16_MAX) { + r = tpm2_get_best_pcr_bank(c, hash_pcr_mask|pubkey_pcr_mask, &pcr_bank); + if (r < 0) + return r; + } + + _cleanup_(tpm2_handle_freep) Tpm2Handle *primary_handle = NULL; + if (srk_buf) { + r = tpm2_deserialize(c, srk_buf, srk_buf_size, &primary_handle); + if (r < 0) + return r; + } else if (primary_alg != 0) { + TPM2B_PUBLIC template = { + .size = sizeof(TPMT_PUBLIC), + }; + r = tpm2_get_legacy_template(primary_alg, &template.publicArea); + if (r < 0) + return log_debug_errno(r, "Could not get legacy template: %m"); + + r = tpm2_create_primary( + c, + /* session= */ NULL, + &template, + /* sensitive= */ NULL, + /* ret_public= */ NULL, + &primary_handle); + if (r < 0) + return r; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "No SRK or primary alg provided."); + + if (seed.size > 0) { + /* This is a calculated (or duplicated) sealed object, and must be imported. 
*/ + _cleanup_free_ TPM2B_PRIVATE *imported_private = NULL; + r = tpm2_import(c, + primary_handle, + /* session= */ NULL, + &public, + &private, + &seed, + /* encryption_key= */ NULL, + /* symmetric= */ NULL, + &imported_private); + if (r < 0) + return r; + + private = *imported_private; + } + + log_debug("Loading HMAC key into TPM."); + + /* + * Nothing sensitive on the bus, no need for encryption. Even if an attacker + * gives you back a different key, the session initiation will fail. In the + * SRK model, the tpmKey is verified. In the non-srk model, with pin, the bindKey + * provides protections. + */ + _cleanup_(tpm2_handle_freep) Tpm2Handle *hmac_key = NULL; + r = tpm2_load(c, primary_handle, NULL, &public, &private, &hmac_key); + if (r < 0) + return r; + + TPM2B_PUBLIC pubkey_tpm2b; + _cleanup_free_ void *fp = NULL; + size_t fp_size = 0; + if (pubkey) { + r = tpm2_tpm2b_public_from_pem(pubkey, pubkey_size, &pubkey_tpm2b); + if (r < 0) + return log_debug_errno(r, "Could not create TPMT_PUBLIC: %m"); + + r = tpm2_tpm2b_public_to_fingerprint(&pubkey_tpm2b, &fp, &fp_size); + if (r < 0) + return log_debug_errno(r, "Could not get key fingerprint: %m"); + } + + /* + * if a pin is set for the seal object, use it to bind the session + * key to that object. This prevents active bus interposers from + * faking a TPM and seeing the unsealed value. An active interposer + * could fake a TPM, satisfying the encrypted session, and just + * forward everything to the *real* TPM. 
+ */ + r = tpm2_set_auth(c, hmac_key, pin); + if (r < 0) + return r; + + _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL; + r = tpm2_make_encryption_session(c, primary_handle, hmac_key, &encryption_session); + if (r < 0) + return r; + + _cleanup_(Esys_Freep) TPM2B_SENSITIVE_DATA* unsealed = NULL; + for (unsigned i = RETRY_UNSEAL_MAX;; i--) { + _cleanup_(tpm2_handle_freep) Tpm2Handle *policy_session = NULL; + _cleanup_(Esys_Freep) TPM2B_DIGEST *policy_digest = NULL; + r = tpm2_make_policy_session( + c, + primary_handle, + encryption_session, + &policy_session); + if (r < 0) + return r; + + r = tpm2_build_sealing_policy( + c, + policy_session, + hash_pcr_mask, + pcr_bank, + pubkey ? &pubkey_tpm2b : NULL, + fp, fp_size, + pubkey_pcr_mask, + signature, + !!pin, + pcrlock_policy, + &policy_digest); + if (r < 0) + return r; + + /* If we know the policy hash to expect, and it doesn't match, we can shortcut things here, and not + * wait until the TPM2 tells us to go away. */ + if (known_policy_hash_size > 0 && + memcmp_nn(policy_digest->buffer, policy_digest->size, known_policy_hash, known_policy_hash_size) != 0) { + +#if HAVE_OPENSSL + if (pubkey_size > 0 && + pubkey_tpm2b.publicArea.type == TPM2_ALG_RSA && + pubkey_tpm2b.publicArea.parameters.rsaDetail.exponent == TPM2_RSA_DEFAULT_EXPONENT) { + /* Due to bug #30546, if using RSA pubkey with the default exponent, we may + * need to set the exponent to the TPM special-case value of 0 and retry. 
*/ + log_debug("Policy hash mismatch, retrying with RSA pubkey exponent set to 0."); + pubkey_tpm2b.publicArea.parameters.rsaDetail.exponent = 0; + continue; + } else +#endif + return log_debug_errno(SYNTHETIC_ERRNO(EPERM), + "Current policy digest does not match stored policy digest, cancelling " + "TPM2 authentication attempt."); + } + + log_debug("Unsealing HMAC key."); + + rc = sym_Esys_Unseal( + c->esys_context, + hmac_key->esys_handle, + policy_session->esys_handle, + encryption_session->esys_handle, /* use HMAC session to enable parameter encryption */ + ESYS_TR_NONE, + &unsealed); + if (rc == TSS2_RC_SUCCESS) + break; + if (rc != TPM2_RC_PCR_CHANGED || i == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to unseal HMAC key in TPM: %s", sym_Tss2_RC_Decode(rc)); + log_debug("A PCR value changed during the TPM2 policy session, restarting HMAC key unsealing (%u tries left).", i); + } + + _cleanup_(erase_and_freep) char *secret = NULL; + secret = memdup(unsealed->buffer, unsealed->size); + explicit_bzero_safe(unsealed->buffer, unsealed->size); + if (!secret) + return log_oom_debug(); + + if (DEBUG_LOGGING) + log_debug("Completed TPM2 key unsealing in %s.", FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - start, 1)); + + *ret_secret = TAKE_PTR(secret); + *ret_secret_size = unsealed->size; + + return 0; +} + +static TPM2_HANDLE generate_random_nv_index(void) { + return TPM2_NV_INDEX_FIRST + (TPM2_HANDLE) random_u64_range(TPM2_NV_INDEX_LAST - TPM2_NV_INDEX_FIRST + 1); +} + +int tpm2_define_policy_nv_index( + Tpm2Context *c, + const Tpm2Handle *session, + TPM2_HANDLE requested_nv_index, + const TPM2B_DIGEST *write_policy, + const char *pin, + const TPM2B_AUTH *auth, + TPM2_HANDLE *ret_nv_index, + Tpm2Handle **ret_nv_handle, + TPM2B_NV_PUBLIC *ret_nv_public) { + + _cleanup_(tpm2_handle_freep) Tpm2Handle *new_handle = NULL; + TSS2_RC rc; + int r; + + assert(c); + assert(pin || auth); + + r = tpm2_handle_new(c, &new_handle); + if (r < 0) + return r; + + 
new_handle->flush = false; /* This is a persistent NV index, don't flush hence */ + + TPM2B_AUTH _auth = {}; + CLEANUP_ERASE(_auth); + + if (!auth) { + r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &_auth); + if (r < 0) + return r; + + auth = &_auth; + } + + for (unsigned try = 0; try < 25U; try++) { + TPM2_HANDLE nv_index; + + if (requested_nv_index != 0) + nv_index = requested_nv_index; + else + nv_index = generate_random_nv_index(); + + TPM2B_NV_PUBLIC public_info = { + .size = sizeof_field(TPM2B_NV_PUBLIC, nvPublic), + .nvPublic = { + .nvIndex = nv_index, + .nameAlg = TPM2_ALG_SHA256, + .attributes = TPM2_NT_ORDINARY | TPMA_NV_WRITEALL | TPMA_NV_POLICYWRITE | TPMA_NV_OWNERREAD, + .dataSize = offsetof(TPMT_HA, digest) + tpm2_hash_alg_to_size(TPM2_ALG_SHA256), + }, + }; + + if (write_policy) + public_info.nvPublic.authPolicy = *write_policy; + + rc = sym_Esys_NV_DefineSpace( + c->esys_context, + /* authHandle= */ ESYS_TR_RH_OWNER, + /* shandle1= */ session ? session->esys_handle : ESYS_TR_PASSWORD, + /* shandle2= */ ESYS_TR_NONE, + /* shandle3= */ ESYS_TR_NONE, + auth, + &public_info, + &new_handle->esys_handle); + + if (rc == TSS2_RC_SUCCESS) { + log_debug("NV Index 0x%" PRIx32 " successfully allocated.", nv_index); + + if (ret_nv_index) + *ret_nv_index = nv_index; + + if (ret_nv_handle) + *ret_nv_handle = TAKE_PTR(new_handle); + + if (ret_nv_public) + *ret_nv_public = public_info; + + return 0; + } + if (rc != TPM2_RC_NV_DEFINED) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to allocate NV index: %s", sym_Tss2_RC_Decode(rc)); + + if (requested_nv_index != 0) { + assert(nv_index == requested_nv_index); + return log_debug_errno(SYNTHETIC_ERRNO(EEXIST), + "Requested NV index 0x%" PRIx32 " already taken.", requested_nv_index); + } + + log_debug("NV index 0x%" PRIu32 " already taken, trying another one (%u tries left)", nv_index, try); + } + + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Too many attempts trying to allocate NV 
index: %s", sym_Tss2_RC_Decode(rc)); +} + +int tpm2_write_policy_nv_index( + Tpm2Context *c, + const Tpm2Handle *policy_session, + TPM2_HANDLE nv_index, + const Tpm2Handle *nv_handle, + const TPM2B_DIGEST *policy_digest) { + + TSS2_RC rc; + + assert(c); + assert(policy_session); + assert(nv_handle); + assert(policy_digest); + + if (policy_digest->size != tpm2_hash_alg_to_size(TPM2_ALG_SHA256)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Policy to store in NV index has wrong size."); + + TPMT_HA ha = { + .hashAlg = TPM2_ALG_SHA256, + }; + assert(policy_digest->size <= sizeof_field(TPMT_HA, digest)); + memcpy_safe(&ha.digest, policy_digest->buffer, policy_digest->size); + + TPM2B_MAX_NV_BUFFER buffer = {}; + size_t written = 0; + rc = sym_Tss2_MU_TPMT_HA_Marshal(&ha, buffer.buffer, sizeof(buffer.buffer), &written); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to marshal policy digest."); + + buffer.size = written; + + rc = sym_Esys_NV_Write( + c->esys_context, + /* authHandle= */ nv_handle->esys_handle, + /* nvIndex= */ nv_handle->esys_handle, + /* shandle1= */ policy_session->esys_handle, + /* shandle2= */ ESYS_TR_NONE, + /* shandle3= */ ESYS_TR_NONE, + &buffer, + /* offset= */ 0); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to write NV index: %s", sym_Tss2_RC_Decode(rc)); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *h = NULL; + h = hexmem(policy_digest->buffer, policy_digest->size); + log_debug("Written policy digest %s to NV index 0x%x", strnull(h), nv_index); + } + + return 0; +} + +int tpm2_undefine_policy_nv_index( + Tpm2Context *c, + const Tpm2Handle *session, + TPM2_HANDLE nv_index, + const Tpm2Handle *nv_handle) { + + TSS2_RC rc; + + assert(c); + assert(nv_handle); + + rc = sym_Esys_NV_UndefineSpace( + c->esys_context, + /* authHandle= */ ESYS_TR_RH_OWNER, + /* nvIndex= */ nv_handle->esys_handle, + /* shandle1= */ session ? 
session->esys_handle : ESYS_TR_NONE, + /* shandle2= */ ESYS_TR_NONE, + /* shandle3= */ ESYS_TR_NONE); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to undefine NV index: %s", sym_Tss2_RC_Decode(rc)); + + log_debug("Undefined NV index 0x%x", nv_index); + return 0; +} + +int tpm2_seal_data( + Tpm2Context *c, + const struct iovec *data, + const Tpm2Handle *primary_handle, + const Tpm2Handle *encryption_session, + const TPM2B_DIGEST *policy, + struct iovec *ret_public, + struct iovec *ret_private) { + + int r; + + assert(c); + assert(data); + assert(primary_handle); + + /* This is a generic version of tpm2_seal(), that doesn't imply any policy or any specific + * combination of the two keypairs in their marshalling. tpm2_seal() is somewhat specific to the FDE + * usecase. We probably should migrate tpm2_seal() to use tpm2_seal_data() eventually. */ + + if (data->iov_len >= sizeof_field(TPMS_SENSITIVE_CREATE, data.buffer)) + return -E2BIG; + + TPMT_PUBLIC hmac_template = { + .type = TPM2_ALG_KEYEDHASH, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = TPMA_OBJECT_FIXEDTPM | TPMA_OBJECT_FIXEDPARENT, + .parameters.keyedHashDetail.scheme.scheme = TPM2_ALG_NULL, + .unique.keyedHash.size = data->iov_len, + .authPolicy = policy ? 
*policy : TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE), + }; + + TPMS_SENSITIVE_CREATE hmac_sensitive = { + .data.size = hmac_template.unique.keyedHash.size, + }; + + CLEANUP_ERASE(hmac_sensitive); + + memcpy_safe(hmac_sensitive.data.buffer, data->iov_base, data->iov_len); + + _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL; + _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL; + r = tpm2_create(c, primary_handle, encryption_session, &hmac_template, &hmac_sensitive, &public, &private); + if (r < 0) + return r; + + _cleanup_(iovec_done) struct iovec public_blob = {}, private_blob = {}; + + r = tpm2_marshal_private(private, &private_blob.iov_base, &private_blob.iov_len); + if (r < 0) + return r; + + r = tpm2_marshal_public(public, &public_blob.iov_base, &public_blob.iov_len); + if (r < 0) + return r; + + if (ret_public) + *ret_public = TAKE_STRUCT(public_blob); + if (ret_private) + *ret_private = TAKE_STRUCT(private_blob); + + return 0; +} + +int tpm2_unseal_data( + Tpm2Context *c, + const struct iovec *public_blob, + const struct iovec *private_blob, + const Tpm2Handle *primary_handle, + const Tpm2Handle *policy_session, + const Tpm2Handle *encryption_session, + struct iovec *ret_data) { + + TSS2_RC rc; + int r; + + assert(c); + assert(public_blob); + assert(private_blob); + assert(primary_handle); + + TPM2B_PUBLIC public; + r = tpm2_unmarshal_public(public_blob->iov_base, public_blob->iov_len, &public); + if (r < 0) + return r; + + TPM2B_PRIVATE private; + r = tpm2_unmarshal_private(private_blob->iov_base, private_blob->iov_len, &private); + if (r < 0) + return r; + + _cleanup_(tpm2_handle_freep) Tpm2Handle *what = NULL; + r = tpm2_load(c, primary_handle, NULL, &public, &private, &what); + if (r < 0) + return r; + + _cleanup_(Esys_Freep) TPM2B_SENSITIVE_DATA* unsealed = NULL; + rc = sym_Esys_Unseal( + c->esys_context, + what->esys_handle, + policy_session ? policy_session->esys_handle : ESYS_TR_NONE, + encryption_session ? 
/* Prints a table (columns: path, device, driver) with one row per entry found in /sys/class/tpmrm.
 * A missing /sys/class/tpmrm directory is not an error. If the table has at most one row (NOTE:
 * presumably table_get_rows() counts the header row — confirm) an informational message is logged
 * instead. Without TPM2 support compiled in this fails with -EOPNOTSUPP. */
int tpm2_list_devices(void) {
#if HAVE_TPM2
        _cleanup_(table_unrefp) Table *table = NULL;
        _cleanup_closedir_ DIR *dir = NULL;
        int r;

        r = dlopen_tpm2();
        if (r < 0)
                return log_error_errno(r, "TPM2 support is not installed.");

        table = table_new("path", "device", "driver");
        if (!table)
                return log_oom();

        dir = opendir("/sys/class/tpmrm");
        if (!dir) {
                log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR, errno, "Failed to open /sys/class/tpmrm: %m");
                if (errno != ENOENT)
                        return -errno;
        }

        /* Only entered if the directory could be opened; left once all entries are consumed */
        while (dir) {
                _cleanup_free_ char *device_path = NULL, *device = NULL, *driver_path = NULL, *driver = NULL, *node = NULL;
                struct dirent *entry;

                entry = readdir_no_dot(dir);
                if (!entry)
                        break;

                device_path = path_join("/sys/class/tpmrm", entry->d_name, "device");
                if (!device_path)
                        return log_oom();

                /* Failing to resolve either symlink merely leaves the respective cell empty */
                r = readlink_malloc(device_path, &device);
                if (r >= 0) {
                        driver_path = path_join(device_path, "driver");
                        if (!driver_path)
                                return log_oom();

                        r = readlink_malloc(driver_path, &driver);
                        if (r < 0)
                                log_debug_errno(r, "Failed to read driver symlink %s, ignoring: %m", driver_path);
                } else
                        log_debug_errno(r, "Failed to read device symlink %s, ignoring: %m", device_path);

                node = path_join("/dev", entry->d_name);
                if (!node)
                        return log_oom();

                r = table_add_many(
                                table,
                                TABLE_PATH, node,
                                TABLE_STRING, device ? last_path_component(device) : NULL,
                                TABLE_STRING, driver ? last_path_component(driver) : NULL);
                if (r < 0)
                        return table_log_add_error(r);
        }

        if (table_get_rows(table) <= 1) {
                log_info("No suitable TPM2 devices found.");
                return 0;
        }

        r = table_print(table, stdout);
        if (r < 0)
                return log_error_errno(r, "Failed to show device table: %m");

        return 0;
#else
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                               "TPM2 not supported on this build.");
#endif
}

/* Looks for exactly one entry in /sys/class/tpmrm and returns the matching /dev path in *ret (newly
 * allocated, owned by the caller). Returns 0 on success, -ENOTUNIQ if more than one entry exists,
 * -ENODEV if there is none (or the directory is missing), -EOPNOTSUPP without TPM2 support. */
int tpm2_find_device_auto(char **ret) {
#if HAVE_TPM2
        _cleanup_free_ char *found = NULL;
        _cleanup_closedir_ DIR *dir = NULL;
        int r;

        r = dlopen_tpm2();
        if (r < 0)
                return log_debug_errno(r, "TPM2 support is not installed.");

        dir = opendir("/sys/class/tpmrm");
        if (!dir) {
                log_debug_errno(errno, "Failed to open /sys/class/tpmrm: %m");
                if (errno != ENOENT)
                        return -errno;
        } else
                for (struct dirent *entry; (entry = readdir_no_dot(dir)); ) {
                        /* A second entry means we cannot pick automatically */
                        if (found)
                                return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
                                                       "More than one TPM2 (tpmrm) device found.");

                        found = path_join("/dev", entry->d_name);
                        if (!found)
                                return log_oom_debug();
                }

        if (!found)
                return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), "No TPM2 (tpmrm) device found.");

        *ret = TAKE_PTR(found);
        return 0;
#else
        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                               "TPM2 not supported on this build.");
#endif
}
"/sys/kernel/security/tpm0/binary_bios_measurements"; +} + +#if HAVE_OPENSSL +static int tpm2_userspace_log_open(void) { + _cleanup_close_ int fd = -EBADF; + struct stat st; + const char *e; + int r; + + e = tpm2_userspace_log_path(); + (void) mkdir_parents(e, 0755); + + /* We use access mode 0600 here (even though the measurements should not strictly be confidential), + * because we use BSD file locking on it, and if anyone but root can access the file they can also + * lock it, which we want to avoid. */ + fd = open(e, O_CREAT|O_WRONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0600); + if (fd < 0) + return log_debug_errno(errno, "Failed to open TPM log file '%s' for writing, ignoring: %m", e); + + if (flock(fd, LOCK_EX) < 0) + return log_debug_errno(errno, "Failed to lock TPM log file '%s', ignoring: %m", e); + + if (fstat(fd, &st) < 0) + return log_debug_errno(errno, "Failed to fstat TPM log file '%s', ignoring: %m", e); + + r = stat_verify_regular(&st); + if (r < 0) + return log_debug_errno(r, "TPM log file '%s' is not regular, ignoring: %m", e); + + /* We set the sticky bit when we are about to append to the log file. We'll unset it afterwards + * again. If we manage to take a lock on a file that has it set we know we didn't write it fully and + * it is corrupted. Ideally we'd like to use user xattrs for this, but unfortunately tmpfs (which is + * our assumed backend fs) doesn't know user xattrs. 
*/ + if (st.st_mode & S_ISVTX) + return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "TPM log file '%s' aborted, ignoring.", e); + + if (fchmod(fd, 0600 | S_ISVTX) < 0) + return log_debug_errno(errno, "Failed to chmod() TPM log file '%s', ignoring: %m", e); + + return TAKE_FD(fd); +} + +static int tpm2_userspace_log( + int fd, + unsigned pcr_index, + const TPML_DIGEST_VALUES *values, + Tpm2UserspaceEventType event_type, + const char *description) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *array = NULL; + _cleanup_free_ char *f = NULL; + sd_id128_t boot_id; + int r; + + assert(values); + assert(values->count > 0); + + /* We maintain a local PCR measurement log. This implements a subset of the TCG Canonical Event Log + * Format – the JSON flavour – + * (https://trustedcomputinggroup.org/resource/canonical-event-log-format/), but departs in certain + * ways from it, specifically: + * + * - We don't write out a recnum. It's a bit too vaguely defined which means we'd have to read + * through the whole logs (include firmware logs) before knowing what the next value is we should + * use. Hence we simply don't write this out as append-time, and instead expect a consumer to add + * it in when it uses the data. + * + * - We write this out in RFC 7464 application/json-seq rather than as a JSON array. Writing this as + * JSON array would mean that for each appending we'd have to read the whole log file fully into + * memory before writing it out again. We prefer a strictly append-only write pattern however. (RFC + * 7464 is what jq --seq eats.) Conversion into a proper JSON array is trivial. + * + * It should be possible to convert this format in a relatively straight-forward way into the + * official TCG Canonical Event Log Format on read, by simply adding in a few more fields that can be + * determined from the full dataset. 
+ * + * We set the 'content_type' field to "systemd" to make clear this data is generated by us, and + * include various interesting fields in the 'content' subobject, including a CLOCK_BOOTTIME + * timestamp which can be used to order this measurement against possibly other measurements + * independently done by other subsystems on the system. + */ + + if (fd < 0) /* Apparently tpm2_local_log_open() failed earlier, let's not complain again */ + return 0; + + for (size_t i = 0; i < values->count; i++) { + const EVP_MD *implementation; + const char *a; + + assert_se(a = tpm2_hash_alg_to_string(values->digests[i].hashAlg)); + assert_se(implementation = EVP_get_digestbyname(a)); + + r = json_variant_append_arrayb( + &array, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("hashAlg", a), + JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(&values->digests[i].digest, EVP_MD_size(implementation))))); + if (r < 0) + return log_debug_errno(r, "Failed to append digest object to JSON array: %m"); + } + + assert(array); + + r = sd_id128_get_boot(&boot_id); + if (r < 0) + return log_debug_errno(r, "Failed to acquire boot ID: %m"); + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(pcr_index)), + JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(array)), + JSON_BUILD_PAIR("content_type", JSON_BUILD_STRING("systemd")), + JSON_BUILD_PAIR("content", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_CONDITION(description, "string", JSON_BUILD_STRING(description)), + JSON_BUILD_PAIR("bootId", JSON_BUILD_ID128(boot_id)), + JSON_BUILD_PAIR("timestamp", JSON_BUILD_UNSIGNED(now(CLOCK_BOOTTIME))), + JSON_BUILD_PAIR_CONDITION(event_type >= 0, "eventType", JSON_BUILD_STRING(tpm2_userspace_event_type_to_string(event_type))))))); + if (r < 0) + return log_debug_errno(r, "Failed to build log record JSON: %m"); + + r = json_variant_format(v, JSON_FORMAT_SEQ, &f); + if (r < 0) + return log_debug_errno(r, "Failed to format JSON: %m"); + + if (lseek(fd, 0, SEEK_END) < 0) + return 
log_debug_errno(errno, "Failed to seek to end of JSON log: %m"); + + r = loop_write(fd, f, SIZE_MAX); + if (r < 0) + return log_debug_errno(r, "Failed to write JSON data to log: %m"); + + if (fsync(fd) < 0) + return log_debug_errno(errno, "Failed to sync JSON data: %m"); + + /* Unset S_ISVTX again */ + if (fchmod(fd, 0600) < 0) + return log_debug_errno(errno, "Failed to chmod() TPM log file, ignoring: %m"); + + r = fsync_full(fd); + if (r < 0) + return log_debug_errno(r, "Failed to sync JSON log: %m"); + + return 1; +} +#endif + +int tpm2_extend_bytes( + Tpm2Context *c, + char **banks, + unsigned pcr_index, + const void *data, + size_t data_size, + const void *secret, + size_t secret_size, + Tpm2UserspaceEventType event_type, + const char *description) { + +#if HAVE_OPENSSL + _cleanup_close_ int log_fd = -EBADF; + TPML_DIGEST_VALUES values = {}; + TSS2_RC rc; + + assert(c); + assert(data || data_size == 0); + assert(secret || secret_size == 0); + + if (data_size == SIZE_MAX) + data_size = strlen(data); + if (secret_size == SIZE_MAX) + secret_size = strlen(secret); + + if (pcr_index >= TPM2_PCRS_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Can't measure into unsupported PCR %u, refusing.", pcr_index); + + if (strv_isempty(banks)) + return 0; + + STRV_FOREACH(bank, banks) { + const EVP_MD *implementation; + int id; + + assert_se(implementation = EVP_get_digestbyname(*bank)); + + if (values.count >= ELEMENTSOF(values.digests)) + return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "Too many banks selected."); + + if ((size_t) EVP_MD_size(implementation) > sizeof(values.digests[values.count].digest)) + return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "Hash result too large for TPM2."); + + id = tpm2_hash_alg_from_string(EVP_MD_name(implementation)); + if (id < 0) + return log_debug_errno(id, "Can't map hash name to TPM2."); + + values.digests[values.count].hashAlg = id; + + /* So here's a twist: sometimes we want to measure secrets (e.g. 
root file system volume + * key), but we'd rather not leak a literal hash of the secret to the TPM (given that the + * wire is unprotected, and some other subsystem might use the simple, literal hash of the + * secret for other purposes, maybe because it needs a shorter secret derived from it for + * some unrelated purpose, who knows). Hence we instead measure an HMAC signature of a + * private non-secret string instead. */ + if (secret_size > 0) { + if (!HMAC(implementation, secret, secret_size, data, data_size, (unsigned char*) &values.digests[values.count].digest, NULL)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to calculate HMAC of data to measure."); + } else if (EVP_Digest(data, data_size, (unsigned char*) &values.digests[values.count].digest, NULL, implementation, NULL) != 1) + return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to hash data to measure."); + + values.count++; + } + + /* Open + lock the log file *before* we start measuring, so that no one else can come between our log + * and our measurement and change either */ + log_fd = tpm2_userspace_log_open(); + + rc = sym_Esys_PCR_Extend( + c->esys_context, + ESYS_TR_PCR0 + pcr_index, + ESYS_TR_PASSWORD, + ESYS_TR_NONE, + ESYS_TR_NONE, + &values); + if (rc != TSS2_RC_SUCCESS) + return log_debug_errno( + SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Failed to measure into PCR %u: %s", + pcr_index, + sym_Tss2_RC_Decode(rc)); + + /* Now, write what we just extended to the log, too. 
*/ + (void) tpm2_userspace_log(log_fd, pcr_index, &values, event_type, description); + + return 0; +#else /* HAVE_OPENSSL */ + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled."); +#endif +} + +const uint16_t tpm2_hash_algorithms[] = { + TPM2_ALG_SHA1, + TPM2_ALG_SHA256, + TPM2_ALG_SHA384, + TPM2_ALG_SHA512, + 0, +}; + +assert_cc(ELEMENTSOF(tpm2_hash_algorithms) == TPM2_N_HASH_ALGORITHMS + 1); + +static size_t tpm2_hash_algorithm_index(uint16_t algorithm) { + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) + if (tpm2_hash_algorithms[i] == algorithm) + return i; + + return SIZE_MAX; +} + +TPM2B_DIGEST *tpm2_pcr_prediction_result_get_hash(Tpm2PCRPredictionResult *result, uint16_t alg) { + size_t alg_idx; + + assert(result); + + alg_idx = tpm2_hash_algorithm_index(alg); + if (alg_idx == SIZE_MAX) /* Algorithm not known? */ + return NULL; + + if (result->hash[alg_idx].size <= 0) /* No hash value for this algorithm? */ + return NULL; + + return result->hash + alg_idx; +} + +void tpm2_pcr_prediction_done(Tpm2PCRPrediction *p) { + assert(p); + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) + ordered_set_free(p->results[pcr]); +} + +static void tpm2_pcr_prediction_result_hash_func(const Tpm2PCRPredictionResult *banks, struct siphash *state) { + assert(banks); + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) + siphash24_compress_safe(banks->hash[i].buffer, banks->hash[i].size, state); +} + +static int tpm2_pcr_prediction_result_compare_func(const Tpm2PCRPredictionResult *a, const Tpm2PCRPredictionResult *b) { + int r; + + assert(a); + assert(b); + + for (size_t i = 0; i < TPM2_N_HASH_ALGORITHMS; i++) { + r = memcmp_nn(a->hash[i].buffer, a->hash[i].size, + b->hash[i].buffer, b->hash[i].size); + if (r != 0) + return r; + } + + return 0; +} + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + tpm2_pcr_prediction_result_hash_ops, + Tpm2PCRPredictionResult, + tpm2_pcr_prediction_result_hash_func, + tpm2_pcr_prediction_result_compare_func, 
+ Tpm2PCRPredictionResult, + free); + +static Tpm2PCRPredictionResult *find_prediction_result_by_algorithm(OrderedSet *set, Tpm2PCRPredictionResult *result, size_t alg_idx) { + Tpm2PCRPredictionResult *f; + + assert(result); + assert(alg_idx != SIZE_MAX); + + f = ordered_set_get(set, result); /* Full match? */ + if (f) + return f; + + /* If this doesn't match full, then see if there an entry that at least matches by the relevant + * algorithm (we are fine if predictions are "incomplete" in some algorithms) */ + + ORDERED_SET_FOREACH(f, set) + if (memcmp_nn(result->hash[alg_idx].buffer, result->hash[alg_idx].size, + f->hash[alg_idx].buffer, f->hash[alg_idx].size) == 0) + return f; + + return NULL; +} + +bool tpm2_pcr_prediction_equal( + Tpm2PCRPrediction *a, + Tpm2PCRPrediction *b, + uint16_t algorithm) { + + if (a == b) + return true; + if (!a || !b) + return false; + + if (a->pcrs != b->pcrs) + return false; + + size_t alg_idx = tpm2_hash_algorithm_index(algorithm); + if (alg_idx == SIZE_MAX) + return false; + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + Tpm2PCRPredictionResult *banks; + + ORDERED_SET_FOREACH(banks, a->results[pcr]) + if (!find_prediction_result_by_algorithm(b->results[pcr], banks, alg_idx)) + return false; + + ORDERED_SET_FOREACH(banks, b->results[pcr]) + if (!find_prediction_result_by_algorithm(a->results[pcr], banks, alg_idx)) + return false; + } + + return true; +} + +int tpm2_pcr_prediction_to_json( + const Tpm2PCRPrediction *prediction, + uint16_t algorithm, + JsonVariant **ret) { + + _cleanup_(json_variant_unrefp) JsonVariant *aj = NULL; + int r; + + assert(prediction); + assert(ret); + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + _cleanup_(json_variant_unrefp) JsonVariant *vj = NULL; + Tpm2PCRPredictionResult *banks; + + if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr)) + continue; + + ORDERED_SET_FOREACH(banks, prediction->results[pcr]) { + + TPM2B_DIGEST *hash = tpm2_pcr_prediction_result_get_hash(banks, 
algorithm); + if (!hash) + continue; + + r = json_variant_append_arrayb( + &vj, + JSON_BUILD_HEX(hash->buffer, hash->size)); + if (r < 0) + return log_error_errno(r, "Failed to append hash variant to JSON array: %m"); + } + + if (!vj) + continue; + + r = json_variant_append_arrayb( + &aj, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_INTEGER("pcr", pcr), + JSON_BUILD_PAIR_VARIANT("values", vj))); + if (r < 0) + return log_error_errno(r, "Failed to append PCR variants to JSON array: %m"); + } + + if (!aj) { + r = json_variant_new_array(&aj, NULL, 0); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(aj); + return 0; +} + +int tpm2_pcr_prediction_from_json( + Tpm2PCRPrediction *prediction, + uint16_t algorithm, + JsonVariant *aj) { + + int r; + + assert(prediction); + + size_t alg_index = tpm2_hash_algorithm_index(algorithm); + assert(alg_index < TPM2_N_HASH_ALGORITHMS); + + if (!json_variant_is_array(aj)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR variant array is not an array."); + + JsonVariant *pcr; + JSON_VARIANT_ARRAY_FOREACH(pcr, aj) { + JsonVariant *nr, *values; + + nr = json_variant_by_key(pcr, "pcr"); + if (!nr) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry lacks PCR index field"); + + if (!json_variant_is_unsigned(nr) || + json_variant_unsigned(nr) >= TPM2_PCRS_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry PCR index is not an integer in the range 0…23"); + + values = json_variant_by_key(pcr, "values"); + if (!values) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry lacks values field"); + + if (!json_variant_is_array(values)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry values field is not an array"); + + prediction->pcrs |= UINT32_C(1) << json_variant_unsigned(nr); + + JsonVariant *v; + JSON_VARIANT_ARRAY_FOREACH(v, values) { + _cleanup_free_ void *buffer = NULL; + size_t size; + + r = json_variant_unhex(v, &buffer, &size); + if (r < 0) + return 
log_error_errno(r, "Failed to decode PCR policy array hash value"); + + if (size <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "PCR policy array hash value is zero."); + + if (size > sizeof_field(TPM2B_DIGEST, buffer)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "PCR policy array hash value is too large."); + + _cleanup_free_ Tpm2PCRPredictionResult *banks = new0(Tpm2PCRPredictionResult, 1); + if (!banks) + return log_oom(); + + memcpy(banks->hash[alg_index].buffer, buffer, size); + banks->hash[alg_index].size = size; + + r = ordered_set_ensure_put(prediction->results + json_variant_unsigned(nr), &tpm2_pcr_prediction_result_hash_ops, banks); + if (r == -EEXIST) /* Let's allow duplicates */ + continue; + if (r < 0) + return log_error_errno(r, "Failed to insert result into set: %m"); + + TAKE_PTR(banks); + } + } + + return 0; +} + +int tpm2_calculate_policy_super_pcr( + Tpm2PCRPrediction *prediction, + uint16_t algorithm, + TPM2B_DIGEST *pcr_policy) { + + int r; + + assert_se(prediction); + assert_se(pcr_policy); + + /* Start with a zero policy if not specified otherwise. 
*/ + TPM2B_DIGEST super_pcr_policy_digest = *pcr_policy; + + /* First we look for all PCRs that have exactly one allowed hash value, and generate a single PolicyPCR policy from them */ + _cleanup_free_ Tpm2PCRValue *single_values = NULL; + size_t n_single_values = 0; + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr)) + continue; + + if (ordered_set_size(prediction->results[pcr]) != 1) + continue; + + log_debug("Including PCR %" PRIu32 " in single value PolicyPCR expression", pcr); + + Tpm2PCRPredictionResult *banks = ASSERT_PTR(ordered_set_first(prediction->results[pcr])); + + TPM2B_DIGEST *hash = tpm2_pcr_prediction_result_get_hash(banks, algorithm); + if (!hash) + continue; + + if (!GREEDY_REALLOC(single_values, n_single_values + 1)) + return -ENOMEM; + + single_values[n_single_values++] = TPM2_PCR_VALUE_MAKE(pcr, algorithm, *hash); + } + + if (n_single_values > 0) { + /* Evolve policy based on the expected PCR value for what we found. */ + r = tpm2_calculate_policy_pcr( + single_values, + n_single_values, + &super_pcr_policy_digest); + if (r < 0) + return r; + } + + /* Now deal with the PCRs for which we have variants, i.e. more than one allowed values */ + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + _cleanup_free_ TPM2B_DIGEST *pcr_policy_digest_variants = NULL; + size_t n_pcr_policy_digest_variants = 0; + Tpm2PCRPredictionResult *banks; + + if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr)) + continue; + + if (ordered_set_size(prediction->results[pcr]) <= 1) /* We only care for PCRs with 2 or more variants in this loop */ + continue; + + if (ordered_set_size(prediction->results[pcr]) > 8) + return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "PCR policies with more than 8 alternatives per PCR are currently not supported."); + + ORDERED_SET_FOREACH(banks, prediction->results[pcr]) { + /* Start from the super PCR policy from the previous PCR we looked at so far. 
*/ + TPM2B_DIGEST pcr_policy_digest = super_pcr_policy_digest; + + TPM2B_DIGEST *hash = tpm2_pcr_prediction_result_get_hash(banks, algorithm); + if (!hash) + continue; + + /* Evolve it based on the expected PCR value for this PCR */ + r = tpm2_calculate_policy_pcr( + &TPM2_PCR_VALUE_MAKE( + pcr, + algorithm, + *hash), + /* n_pcr_values= */ 1, + &pcr_policy_digest); + if (r < 0) + return r; + + /* Store away this new variant */ + if (!GREEDY_REALLOC(pcr_policy_digest_variants, n_pcr_policy_digest_variants + 1)) + return log_oom(); + + pcr_policy_digest_variants[n_pcr_policy_digest_variants++] = pcr_policy_digest; + + log_debug("Calculated PCR policy variant %zu for PCR %" PRIu32, n_pcr_policy_digest_variants, pcr); + } + + assert_se(n_pcr_policy_digest_variants >= 2); + assert_se(n_pcr_policy_digest_variants <= 8); + + /* Now combine all our variant into one OR policy */ + r = tpm2_calculate_policy_or( + pcr_policy_digest_variants, + n_pcr_policy_digest_variants, + &super_pcr_policy_digest); + if (r < 0) + return r; + + log_debug("Combined %zu variants in OR policy.", n_pcr_policy_digest_variants); + } + + *pcr_policy = super_pcr_policy_digest; + return 0; +} + +int tpm2_policy_super_pcr( + Tpm2Context *c, + const Tpm2Handle *session, + const Tpm2PCRPrediction *prediction, + uint16_t algorithm) { + + int r; + + assert_se(c); + assert_se(session); + assert_se(prediction); + + TPM2B_DIGEST previous_policy_digest = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + + uint32_t single_value_pcrs = 0; + + /* Look for all PCRs that have only a singled allowed hash value, and synthesize a single PolicyPCR policy item for them */ + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr)) + continue; + + if (ordered_set_size(prediction->results[pcr]) != 1) + continue; + + log_debug("Including PCR %" PRIu32 " in single value PolicyPCR expression", pcr); + + single_value_pcrs |= UINT32_C(1) << pcr; + } + + if 
(single_value_pcrs != 0) { + TPML_PCR_SELECTION pcr_selection; + tpm2_tpml_pcr_selection_from_mask(single_value_pcrs, algorithm, &pcr_selection); + + _cleanup_free_ TPM2B_DIGEST *current_policy_digest = NULL; + r = tpm2_policy_pcr( + c, + session, + &pcr_selection, + &current_policy_digest); + if (r < 0) + return r; + + previous_policy_digest = *current_policy_digest; + } + + for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) { + size_t n_branches; + + if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr)) + continue; + + n_branches = ordered_set_size(prediction->results[pcr]); + if (n_branches < 1 || n_branches > 8) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Number of variants per PCR not in range 1…8"); + + if (n_branches == 1) /* Single choice PCRs are already covered by the loop above */ + continue; + + log_debug("Submitting PCR/OR policy for PCR %" PRIu32, pcr); + + TPML_PCR_SELECTION pcr_selection; + tpm2_tpml_pcr_selection_from_mask(UINT32_C(1) << pcr, algorithm, &pcr_selection); + + _cleanup_free_ TPM2B_DIGEST *current_policy_digest = NULL; + r = tpm2_policy_pcr( + c, + session, + &pcr_selection, + &current_policy_digest); + if (r < 0) + return r; + + _cleanup_free_ TPM2B_DIGEST *branches = NULL; + branches = new0(TPM2B_DIGEST, n_branches); + if (!branches) + return log_oom(); + + Tpm2PCRPredictionResult *banks; + size_t i = 0; + ORDERED_SET_FOREACH(banks, prediction->results[pcr]) { + TPM2B_DIGEST pcr_policy_digest = previous_policy_digest; + + TPM2B_DIGEST *hash = tpm2_pcr_prediction_result_get_hash(banks, algorithm); + if (!hash) + continue; + + /* Evolve it based on the expected PCR value for this PCR */ + r = tpm2_calculate_policy_pcr( + &TPM2_PCR_VALUE_MAKE( + pcr, + algorithm, + *hash), + /* n_pcr_values= */ 1, + &pcr_policy_digest); + if (r < 0) + return r; + + branches[i++] = pcr_policy_digest; + } + + assert_se(i == n_branches); + + current_policy_digest = mfree(current_policy_digest); + r = tpm2_policy_or( + c, + session, + branches, + n_branches, +
&current_policy_digest); + if (r < 0) + return r; + + previous_policy_digest = *current_policy_digest; + } + + return 0; +} + +void tpm2_pcrlock_policy_done(Tpm2PCRLockPolicy *data) { + assert(data); + + data->prediction_json = json_variant_unref(data->prediction_json); + tpm2_pcr_prediction_done(&data->prediction); + iovec_done(&data->nv_handle); + iovec_done(&data->nv_public); + iovec_done(&data->srk_handle); + iovec_done(&data->pin_public); + iovec_done(&data->pin_private); +} + +static int json_dispatch_tpm2_algorithm(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + uint16_t *algorithm = ASSERT_PTR(userdata); + int r; + + r = tpm2_hash_alg_from_string(json_variant_string(variant)); + if (r < 0 || tpm2_hash_algorithm_index(r) == SIZE_MAX) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid hash algorithm: %s", json_variant_string(variant)); + + *algorithm = r; + return 0; +} + +int tpm2_pcrlock_search_file(const char *path, FILE **ret_file, char **ret_path) { + static const char search[] = + "/run/systemd\0" + "/var/lib/systemd\0"; + + int r; + + if (!path) + path = "pcrlock.json"; + + r = search_and_fopen_nulstr(path, ret_file ?
"re" : NULL, NULL, search, ret_file, ret_path); + if (r < 0) + return log_debug_errno(r, "Failed to find TPM2 pcrlock policy file '%s': %m", path); + + return 0; +} + +int tpm2_pcrlock_policy_load( + const char *path, + Tpm2PCRLockPolicy *ret_policy) { + + _cleanup_free_ char *discovered_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + r = tpm2_pcrlock_search_file(path, &f, &discovered_path); + if (r == -ENOENT) { + *ret_policy = (Tpm2PCRLockPolicy) {}; + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to load TPM2 pcrlock policy file: %m"); + + _cleanup_(json_variant_unrefp) JsonVariant *configuration_json = NULL; + r = json_parse_file( + f, + discovered_path, + /* flags = */ 0, + &configuration_json, + /* ret_line= */ NULL, + /* ret_column= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse existing pcrlock policy file '%s': %m", discovered_path); + + JsonDispatch policy_dispatch[] = { + { "pcrBank", JSON_VARIANT_STRING, json_dispatch_tpm2_algorithm, offsetof(Tpm2PCRLockPolicy, algorithm), JSON_MANDATORY }, + { "pcrValues", JSON_VARIANT_ARRAY, json_dispatch_variant, offsetof(Tpm2PCRLockPolicy, prediction_json), JSON_MANDATORY }, + { "nvIndex", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint32, offsetof(Tpm2PCRLockPolicy, nv_index), JSON_MANDATORY }, + { "nvHandle", JSON_VARIANT_STRING, json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, nv_handle), JSON_MANDATORY }, + { "nvPublic", JSON_VARIANT_STRING, json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, nv_public), JSON_MANDATORY }, + { "srkHandle", JSON_VARIANT_STRING, json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, srk_handle), JSON_MANDATORY }, + { "pinPublic", JSON_VARIANT_STRING, json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, pin_public), JSON_MANDATORY }, + { "pinPrivate", JSON_VARIANT_STRING, json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, pin_private), JSON_MANDATORY }, + {} + }; + + _cleanup_(tpm2_pcrlock_policy_done) 
Tpm2PCRLockPolicy policy = {}; + + r = json_dispatch(configuration_json, policy_dispatch, JSON_LOG, &policy); + if (r < 0) + return r; + + r = tpm2_pcr_prediction_from_json(&policy.prediction, policy.algorithm, policy.prediction_json); + if (r < 0) + return r; + + *ret_policy = TAKE_STRUCT(policy); + return 1; +} + +int tpm2_load_public_key_file(const char *path, TPM2B_PUBLIC *ret) { + _cleanup_free_ char *device_key_buffer = NULL; + TPM2B_PUBLIC device_key_public = {}; + size_t device_key_buffer_size; + TSS2_RC rc; + int r; + + assert(path); + assert(ret); + + r = dlopen_tpm2(); + if (r < 0) + return log_debug_errno(r, "TPM2 support not installed: %m"); + + r = read_full_file(path, &device_key_buffer, &device_key_buffer_size); + if (r < 0) + return log_error_errno(r, "Failed to read device key from file '%s': %m", path); + + size_t offset = 0; + rc = sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal( + (uint8_t*) device_key_buffer, + device_key_buffer_size, + &offset, + &device_key_public); + if (rc != TSS2_RC_SUCCESS) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Could not unmarshal public key from file."); + + assert(offset <= device_key_buffer_size); + if (offset != device_key_buffer_size) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Found %zu bytes of trailing garbage in public key file.", + device_key_buffer_size - offset); + + *ret = device_key_public; + return 0; +} +#endif + +char *tpm2_pcr_mask_to_string(uint32_t mask) { + _cleanup_free_ char *s = NULL; + + FOREACH_PCR_IN_MASK(n, mask) + if (strextendf_with_separator(&s, "+", "%d", n) < 0) + return NULL; + + if (!s) + return strdup(""); + + return TAKE_PTR(s); +} + +int tpm2_make_pcr_json_array(uint32_t pcr_mask, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *a = NULL; + int r; + + assert(ret); + + for (size_t i = 0; i < TPM2_PCRS_MAX; i++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + if ((pcr_mask & (UINT32_C(1) << i)) == 0) + continue; + + r = 
json_variant_new_integer(&e, i); + if (r < 0) + return r; + + r = json_variant_append_array(&a, e); + if (r < 0) + return r; + } + + if (!a) + return json_variant_new_array(ret, NULL, 0); + + *ret = TAKE_PTR(a); + return 0; +} + +int tpm2_parse_pcr_json_array(JsonVariant *v, uint32_t *ret) { + JsonVariant *e; + uint32_t mask = 0; + + if (!json_variant_is_array(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR array is not a JSON array."); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + uint64_t u; + + if (!json_variant_is_unsigned(e)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR is not an unsigned integer."); + + u = json_variant_unsigned(e); + if (u >= TPM2_PCRS_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR number out of range: %" PRIu64, u); + + mask |= UINT32_C(1) << u; + } + + if (ret) + *ret = mask; + + return 0; +} + +int tpm2_make_luks2_json( + int keyslot, + uint32_t hash_pcr_mask, + uint16_t pcr_bank, + const void *pubkey, + size_t pubkey_size, + uint32_t pubkey_pcr_mask, + uint16_t primary_alg, + const void *blob, + size_t blob_size, + const void *policy_hash, + size_t policy_hash_size, + const void *salt, + size_t salt_size, + const void *srk_buf, + size_t srk_buf_size, + TPM2Flags flags, + JsonVariant **ret) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *hmj = NULL, *pkmj = NULL; + _cleanup_free_ char *keyslot_as_string = NULL; + int r; + + assert(blob || blob_size == 0); + assert(policy_hash || policy_hash_size == 0); + assert(pubkey || pubkey_size == 0); + + if (asprintf(&keyslot_as_string, "%i", keyslot) < 0) + return -ENOMEM; + + r = tpm2_make_pcr_json_array(hash_pcr_mask, &hmj); + if (r < 0) + return r; + + if (pubkey_pcr_mask != 0) { + r = tpm2_make_pcr_json_array(pubkey_pcr_mask, &pkmj); + if (r < 0) + return r; + } + + /* Note: We made the mistake of using "-" in the field names, which isn't particular compatible with + * other programming languages. Let's not make things worse though, i.e. 
future additions to the JSON + * object should use "_" rather than "-" in field names. */ + + r = json_build(&v, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("systemd-tpm2")), + JSON_BUILD_PAIR("keyslots", JSON_BUILD_ARRAY(JSON_BUILD_STRING(keyslot_as_string))), + JSON_BUILD_PAIR("tpm2-blob", JSON_BUILD_BASE64(blob, blob_size)), + JSON_BUILD_PAIR("tpm2-pcrs", JSON_BUILD_VARIANT(hmj)), + JSON_BUILD_PAIR_CONDITION(!!tpm2_hash_alg_to_string(pcr_bank), "tpm2-pcr-bank", JSON_BUILD_STRING(tpm2_hash_alg_to_string(pcr_bank))), + JSON_BUILD_PAIR_CONDITION(!!tpm2_asym_alg_to_string(primary_alg), "tpm2-primary-alg", JSON_BUILD_STRING(tpm2_asym_alg_to_string(primary_alg))), + JSON_BUILD_PAIR("tpm2-policy-hash", JSON_BUILD_HEX(policy_hash, policy_hash_size)), + JSON_BUILD_PAIR("tpm2-pin", JSON_BUILD_BOOLEAN(flags & TPM2_FLAGS_USE_PIN)), + JSON_BUILD_PAIR("tpm2_pcrlock", JSON_BUILD_BOOLEAN(flags & TPM2_FLAGS_USE_PCRLOCK)), + JSON_BUILD_PAIR_CONDITION(pubkey_pcr_mask != 0, "tpm2_pubkey_pcrs", JSON_BUILD_VARIANT(pkmj)), + JSON_BUILD_PAIR_CONDITION(pubkey_pcr_mask != 0, "tpm2_pubkey", JSON_BUILD_BASE64(pubkey, pubkey_size)), + JSON_BUILD_PAIR_CONDITION(salt, "tpm2_salt", JSON_BUILD_BASE64(salt, salt_size)), + JSON_BUILD_PAIR_CONDITION(srk_buf, "tpm2_srk", JSON_BUILD_BASE64(srk_buf, srk_buf_size)))); + if (r < 0) + return r; + + if (ret) + *ret = TAKE_PTR(v); + + return keyslot; +} + +int tpm2_parse_luks2_json( + JsonVariant *v, + int *ret_keyslot, + uint32_t *ret_hash_pcr_mask, + uint16_t *ret_pcr_bank, + void **ret_pubkey, + size_t *ret_pubkey_size, + uint32_t *ret_pubkey_pcr_mask, + uint16_t *ret_primary_alg, + void **ret_blob, + size_t *ret_blob_size, + void **ret_policy_hash, + size_t *ret_policy_hash_size, + void **ret_salt, + size_t *ret_salt_size, + void **ret_srk_buf, + size_t *ret_srk_buf_size, + TPM2Flags *ret_flags) { + + _cleanup_free_ void *blob = NULL, *policy_hash = NULL, *pubkey = NULL, *salt = NULL, *srk_buf = NULL; + size_t blob_size = 0, 
policy_hash_size = 0, pubkey_size = 0, salt_size = 0, srk_buf_size = 0; + uint32_t hash_pcr_mask = 0, pubkey_pcr_mask = 0; + uint16_t primary_alg = TPM2_ALG_ECC; /* ECC was the only supported algorithm in systemd < 250, use that as implied default, for compatibility */ + uint16_t pcr_bank = UINT16_MAX; /* default: pick automatically */ + int r, keyslot = -1; + TPM2Flags flags = 0; + JsonVariant *w; + + assert(v); + + if (ret_keyslot) { + keyslot = cryptsetup_get_keyslot_from_token(v); + if (keyslot < 0) { + /* Return a recognizable error when parsing this field, so that callers can handle parsing + * errors of the keyslots field gracefully, since it's not 'owned' by us, but by the LUKS2 + * spec */ + log_debug_errno(keyslot, "Failed to extract keyslot index from TPM2 JSON data token, skipping: %m"); + return -EUCLEAN; + } + } + + w = json_variant_by_key(v, "tpm2-pcrs"); + if (!w) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 token data lacks 'tpm2-pcrs' field."); + + r = tpm2_parse_pcr_json_array(w, &hash_pcr_mask); + if (r < 0) + return log_debug_errno(r, "Failed to parse TPM2 PCR mask: %m"); + + /* The bank field is optional, since it was added in systemd 250 only. Before the bank was hardcoded + * to SHA256. */ + w = json_variant_by_key(v, "tpm2-pcr-bank"); + if (w) { + /* The PCR bank field is optional */ + + if (!json_variant_is_string(w)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR bank is not a string."); + + r = tpm2_hash_alg_from_string(json_variant_string(w)); + if (r < 0) + return log_debug_errno(r, "TPM2 PCR bank invalid or not supported: %s", json_variant_string(w)); + + pcr_bank = r; + } + + /* The primary key algorithm field is optional, since it was also added in systemd 250 only. Before + * the algorithm was hardcoded to ECC. 
*/ + w = json_variant_by_key(v, "tpm2-primary-alg"); + if (w) { + /* The primary key algorithm is optional */ + + if (!json_variant_is_string(w)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 primary key algorithm is not a string."); + + r = tpm2_asym_alg_from_string(json_variant_string(w)); + if (r < 0) + return log_debug_errno(r, "TPM2 asymmetric algorithm invalid or not supported: %s", json_variant_string(w)); + + primary_alg = r; + } + + w = json_variant_by_key(v, "tpm2-blob"); + if (!w) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 token data lacks 'tpm2-blob' field."); + + r = json_variant_unbase64(w, &blob, &blob_size); + if (r < 0) + return log_debug_errno(r, "Invalid base64 data in 'tpm2-blob' field."); + + w = json_variant_by_key(v, "tpm2-policy-hash"); + if (!w) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 token data lacks 'tpm2-policy-hash' field."); + + r = json_variant_unhex(w, &policy_hash, &policy_hash_size); + if (r < 0) + return log_debug_errno(r, "Invalid base64 data in 'tpm2-policy-hash' field."); + + w = json_variant_by_key(v, "tpm2-pin"); + if (w) { + if (!json_variant_is_boolean(w)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PIN policy is not a boolean."); + + SET_FLAG(flags, TPM2_FLAGS_USE_PIN, json_variant_boolean(w)); + } + + w = json_variant_by_key(v, "tpm2_pcrlock"); + if (w) { + if (!json_variant_is_boolean(w)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 pclock policy is not a boolean."); + + SET_FLAG(flags, TPM2_FLAGS_USE_PCRLOCK, json_variant_boolean(w)); + } + + w = json_variant_by_key(v, "tpm2_salt"); + if (w) { + r = json_variant_unbase64(w, &salt, &salt_size); + if (r < 0) + return log_debug_errno(r, "Invalid base64 data in 'tpm2_salt' field."); + } + + w = json_variant_by_key(v, "tpm2_pubkey_pcrs"); + if (w) { + r = tpm2_parse_pcr_json_array(w, &pubkey_pcr_mask); + if (r < 0) + return r; + } + + w = json_variant_by_key(v, "tpm2_pubkey"); + if (w) { + r = 
json_variant_unbase64(w, &pubkey, &pubkey_size); + if (r < 0) + return log_debug_errno(r, "Failed to decode PCR public key."); + } else if (pubkey_pcr_mask != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Public key PCR mask set, but not public key included in JSON data, refusing."); + + w = json_variant_by_key(v, "tpm2_srk"); + if (w) { + r = json_variant_unbase64(w, &srk_buf, &srk_buf_size); + if (r < 0) + return log_debug_errno(r, "Invalid base64 data in 'tpm2_srk' field."); + } + + if (ret_keyslot) + *ret_keyslot = keyslot; + if (ret_hash_pcr_mask) + *ret_hash_pcr_mask = hash_pcr_mask; + if (ret_pcr_bank) + *ret_pcr_bank = pcr_bank; + if (ret_pubkey) + *ret_pubkey = TAKE_PTR(pubkey); + if (ret_pubkey_size) + *ret_pubkey_size = pubkey_size; + if (ret_pubkey_pcr_mask) + *ret_pubkey_pcr_mask = pubkey_pcr_mask; + if (ret_primary_alg) + *ret_primary_alg = primary_alg; + if (ret_blob) + *ret_blob = TAKE_PTR(blob); + if (ret_blob_size) + *ret_blob_size = blob_size; + if (ret_policy_hash) + *ret_policy_hash = TAKE_PTR(policy_hash); + if (ret_policy_hash_size) + *ret_policy_hash_size = policy_hash_size; + if (ret_salt) + *ret_salt = TAKE_PTR(salt); + if (ret_salt_size) + *ret_salt_size = salt_size; + if (ret_flags) + *ret_flags = flags; + if (ret_srk_buf) + *ret_srk_buf = TAKE_PTR(srk_buf); + if (ret_srk_buf_size) + *ret_srk_buf_size = srk_buf_size; + + return 0; +} + +int tpm2_hash_alg_to_size(uint16_t alg) { + switch (alg) { + case TPM2_ALG_SHA1: + return 20; + case TPM2_ALG_SHA256: + return 32; + case TPM2_ALG_SHA384: + return 48; + case TPM2_ALG_SHA512: + return 64; + default: + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown hash algorithm id 0x%" PRIx16, alg); + } +} + +const char *tpm2_hash_alg_to_string(uint16_t alg) { + switch (alg) { + case TPM2_ALG_SHA1: + return "sha1"; + case TPM2_ALG_SHA256: + return "sha256"; + case TPM2_ALG_SHA384: + return "sha384"; + case TPM2_ALG_SHA512: + return "sha512"; + default: + log_debug("Unknown hash 
algorithm id 0x%" PRIx16, alg); + return NULL; + } +} + +int tpm2_hash_alg_from_string(const char *alg) { + if (strcaseeq_ptr(alg, "sha1")) + return TPM2_ALG_SHA1; + if (strcaseeq_ptr(alg, "sha256")) + return TPM2_ALG_SHA256; + if (strcaseeq_ptr(alg, "sha384")) + return TPM2_ALG_SHA384; + if (strcaseeq_ptr(alg, "sha512")) + return TPM2_ALG_SHA512; + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown hash algorithm name '%s'", alg); +} + +const char *tpm2_asym_alg_to_string(uint16_t alg) { + switch (alg) { + case TPM2_ALG_ECC: + return "ecc"; + case TPM2_ALG_RSA: + return "rsa"; + default: + log_debug("Unknown asymmetric algorithm id 0x%" PRIx16, alg); + return NULL; + } +} + +int tpm2_asym_alg_from_string(const char *alg) { + if (strcaseeq_ptr(alg, "ecc")) + return TPM2_ALG_ECC; + if (strcaseeq_ptr(alg, "rsa")) + return TPM2_ALG_RSA; + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown asymmetric algorithm name '%s'", alg); +} + +const char *tpm2_sym_alg_to_string(uint16_t alg) { + switch (alg) { +#if HAVE_TPM2 + case TPM2_ALG_AES: + return "aes"; +#endif + default: + log_debug("Unknown symmetric algorithm id 0x%" PRIx16, alg); + return NULL; + } +} + +int tpm2_sym_alg_from_string(const char *alg) { +#if HAVE_TPM2 + if (strcaseeq_ptr(alg, "aes")) + return TPM2_ALG_AES; +#endif + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown symmetric algorithm name '%s'", alg); +} + +const char *tpm2_sym_mode_to_string(uint16_t mode) { + switch (mode) { +#if HAVE_TPM2 + case TPM2_ALG_CTR: + return "ctr"; + case TPM2_ALG_OFB: + return "ofb"; + case TPM2_ALG_CBC: + return "cbc"; + case TPM2_ALG_CFB: + return "cfb"; + case TPM2_ALG_ECB: + return "ecb"; +#endif + default: + log_debug("Unknown symmetric mode id 0x%" PRIx16, mode); + return NULL; + } +} + +int tpm2_sym_mode_from_string(const char *mode) { +#if HAVE_TPM2 + if (strcaseeq_ptr(mode, "ctr")) + return TPM2_ALG_CTR; + if (strcaseeq_ptr(mode, "ofb")) + return TPM2_ALG_OFB; + if (strcaseeq_ptr(mode, "cbc")) 
+ return TPM2_ALG_CBC; + if (strcaseeq_ptr(mode, "cfb")) + return TPM2_ALG_CFB; + if (strcaseeq_ptr(mode, "ecb")) + return TPM2_ALG_ECB; +#endif + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown symmetric mode name '%s'", mode); +} + +Tpm2Support tpm2_support(void) { + Tpm2Support support = TPM2_SUPPORT_NONE; + int r; + + if (detect_container() <= 0) { + /* Check if there's a /dev/tpmrm* device via sysfs. If we run in a container we likely just + * got the host sysfs mounted. Since devices are generally not virtualized for containers, + * let's assume containers never have a TPM, at least for now. */ + + r = dir_is_empty("/sys/class/tpmrm", /* ignore_hidden_or_backup= */ false); + if (r < 0) { + if (r != -ENOENT) + log_debug_errno(r, "Unable to test whether /sys/class/tpmrm/ exists and is populated, assuming it is not: %m"); + } else if (r == 0) /* populated! */ + support |= TPM2_SUPPORT_SUBSYSTEM|TPM2_SUPPORT_DRIVER; + else + /* If the directory exists but is empty, we know the subsystem is enabled but no + * driver has been loaded yet. */ + support |= TPM2_SUPPORT_SUBSYSTEM; + } + + if (efi_has_tpm2()) + support |= TPM2_SUPPORT_FIRMWARE; + +#if HAVE_TPM2 + support |= TPM2_SUPPORT_SYSTEM; + + r = dlopen_tpm2(); + if (r >= 0) + support |= TPM2_SUPPORT_LIBRARIES; +#endif + + return support; +} + +#if HAVE_TPM2 +static void tpm2_pcr_values_apply_default_hash_alg(Tpm2PCRValue *pcr_values, size_t n_pcr_values) { + TPMI_ALG_HASH default_hash = 0; + FOREACH_ARRAY(v, pcr_values, n_pcr_values) + if (v->hash != 0) { + default_hash = v->hash; + break; + } + + if (default_hash != 0) + FOREACH_ARRAY(v, pcr_values, n_pcr_values) + if (v->hash == 0) + v->hash = default_hash; +} +#endif + +/* The following tpm2_parse_pcr_argument*() functions all log errors, to match the behavior of system-wide + * parse_*_argument() functions. */ + +/* Parse the PCR selection/value arg(s) and return a corresponding array of Tpm2PCRValue objects. 
+ * + * The format is the same as tpm2_pcr_values_from_string(). The first provided entry with a hash algorithm + * set will be used as the 'default' hash algorithm. All entries with an unset hash algorithm will be updated + * with the 'default' hash algorithm. The resulting array will be sorted and checked for validity. + * + * This will replace *ret_pcr_values with the new array of pcr values; to append to an existing array, use + * tpm2_parse_pcr_argument_append(). */ +int tpm2_parse_pcr_argument(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values) { +#if HAVE_TPM2 + int r; + + assert(arg); + assert(ret_pcr_values); + assert(ret_n_pcr_values); + + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values = 0; + r = tpm2_pcr_values_from_string(arg, &pcr_values, &n_pcr_values); + if (r < 0) + return log_error_errno(r, "Could not parse PCR values from '%s': %m", arg); + + tpm2_pcr_values_apply_default_hash_alg(pcr_values, n_pcr_values); + + tpm2_sort_pcr_values(pcr_values, n_pcr_values); + + if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Parsed PCR values are not valid."); + + *ret_pcr_values = TAKE_PTR(pcr_values); + *ret_n_pcr_values = n_pcr_values; + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support is disabled."); +#endif +} + +/* Same as tpm2_parse_pcr_argument(), but the pcr values array is appended to. If the provided pcr values + * array is not NULL, it must point to an allocated pcr values array and the provided number of pcr values + * must be correct. + * + * Note that 'arg' is parsed into a new array of pcr values independently of any previous pcr values, + * including application of the default hash algorithm. 
Then the two arrays are combined, the default hash + * algorithm check applied again (in case either the previous or current array had no default hash + * algorithm), and then the resulting array is sorted and rechecked for validity. */ +int tpm2_parse_pcr_argument_append(const char *arg, Tpm2PCRValue **pcr_values, size_t *n_pcr_values) { +#if HAVE_TPM2 + int r; + + assert(arg); + assert(pcr_values); + assert(n_pcr_values); + + _cleanup_free_ Tpm2PCRValue *more_pcr_values = NULL; + size_t n_more_pcr_values; + r = tpm2_parse_pcr_argument(arg, &more_pcr_values, &n_more_pcr_values); + if (r < 0) + return r; + + /* If we got previous values, append them. */ + if (*pcr_values && !GREEDY_REALLOC_APPEND(more_pcr_values, n_more_pcr_values, *pcr_values, *n_pcr_values)) + return log_oom(); + + tpm2_pcr_values_apply_default_hash_alg(more_pcr_values, n_more_pcr_values); + + tpm2_sort_pcr_values(more_pcr_values, n_more_pcr_values); + + if (!tpm2_pcr_values_valid(more_pcr_values, n_more_pcr_values)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Parsed PCR values are not valid."); + + SWAP_TWO(*pcr_values, more_pcr_values); + *n_pcr_values = n_more_pcr_values; + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support is disabled."); +#endif +} + +/* Same as tpm2_parse_pcr_argument() but converts the pcr values to a pcr mask. If more than one hash + * algorithm is included in the pcr values array this results in error. This retains the previous behavior of + * tpm2_parse_pcr_argument() of clearing the mask if 'arg' is empty, replacing the mask if it is set to + * UINT32_MAX, and or-ing the mask otherwise. 
*/ +int tpm2_parse_pcr_argument_to_mask(const char *arg, uint32_t *ret_mask) { +#if HAVE_TPM2 + _cleanup_free_ Tpm2PCRValue *pcr_values = NULL; + size_t n_pcr_values; + int r; + + assert(arg); + assert(ret_mask); + + r = tpm2_parse_pcr_argument(arg, &pcr_values, &n_pcr_values); + if (r < 0) + return r; + + if (n_pcr_values == 0) { + /* This retains the previous behavior of clearing the mask if the arg is empty */ + *ret_mask = 0; + return 0; + } + + size_t hash_count; + r = tpm2_pcr_values_hash_count(pcr_values, n_pcr_values, &hash_count); + if (r < 0) + return log_error_errno(r, "Could not get hash count from pcr values: %m"); + + if (hash_count > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Multiple PCR hash banks selected."); + + uint32_t new_mask; + r = tpm2_pcr_values_to_mask(pcr_values, n_pcr_values, pcr_values[0].hash, &new_mask); + if (r < 0) + return log_error_errno(r, "Could not get pcr values mask: %m"); + + if (*ret_mask == UINT32_MAX) + *ret_mask = new_mask; + else + *ret_mask |= new_mask; + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support is disabled."); +#endif +} + +int tpm2_load_pcr_signature(const char *path, JsonVariant **ret) { + _cleanup_strv_free_ char **search = NULL; + _cleanup_free_ char *discovered_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + /* Tries to load a JSON PCR signature file. Takes an absolute path, a simple file name or NULL. In + * the latter two cases searches in /etc/, /usr/lib/, /run/, as usual. */ + + search = strv_split_nulstr(CONF_PATHS_NULSTR("systemd")); + if (!search) + return log_oom_debug(); + + if (!path) { + /* If no path is specified, then look for "tpm2-pcr-signature.json" automatically. Also, in + * this case include /.extra/ in the search path, but only in this case, and if we run in the + * initrd. We don't want to be too eager here, after all /.extra/ is untrusted territory. 
*/ + + path = "tpm2-pcr-signature.json"; + + if (in_initrd()) + if (strv_extend(&search, "/.extra") < 0) + return log_oom_debug(); + } + + r = search_and_fopen(path, "re", NULL, (const char**) search, &f, &discovered_path); + if (r < 0) + return log_debug_errno(r, "Failed to find TPM PCR signature file '%s': %m", path); + + r = json_parse_file(f, discovered_path, 0, ret, NULL, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to parse TPM PCR signature JSON object '%s': %m", discovered_path); + + return 0; +} + +int tpm2_load_pcr_public_key(const char *path, void **ret_pubkey, size_t *ret_pubkey_size) { + _cleanup_free_ char *discovered_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + /* Tries to load a PCR public key file. Takes an absolute path, a simple file name or NULL. In the + * latter two cases searches in /etc/, /usr/lib/, /run/, as usual. */ + + if (!path) + path = "tpm2-pcr-public-key.pem"; + + r = search_and_fopen(path, "re", NULL, (const char**) CONF_PATHS_STRV("systemd"), &f, &discovered_path); + if (r < 0) + return log_debug_errno(r, "Failed to find TPM PCR public key file '%s': %m", path); + + r = read_full_stream(f, (char**) ret_pubkey, ret_pubkey_size); + if (r < 0) + return log_debug_errno(r, "Failed to load TPM PCR public key PEM file '%s': %m", discovered_path); + + return 0; +} + +#define PBKDF2_HMAC_SHA256_ITERATIONS 10000 + +/* + * Implements PBKDF2 HMAC SHA256 for a derived keylen of 32 + * bytes and for PBKDF2_HMAC_SHA256_ITERATIONS count. 
+ * I found the wikipedia entry relevant and it contains links to + * relevant RFCs: + * - https://en.wikipedia.org/wiki/PBKDF2 + * - https://www.rfc-editor.org/rfc/rfc2898#section-5.2 + */ +int tpm2_util_pbkdf2_hmac_sha256(const void *pass, + size_t passlen, + const void *salt, + size_t saltlen, + uint8_t ret_key[static SHA256_DIGEST_SIZE]) { + + uint8_t _cleanup_(erase_and_freep) *buffer = NULL; + uint8_t u[SHA256_DIGEST_SIZE]; + + /* To keep this simple, since derived KeyLen (dkLen in docs) + * Is the same as the hash output, we don't need multiple + * blocks. Part of the algorithm is to add the block count + * in, but this can be hardcoded to 1. + */ + static const uint8_t block_cnt[] = { 0, 0, 0, 1 }; + + assert (salt); + assert (saltlen > 0); + assert (saltlen <= (SIZE_MAX - sizeof(block_cnt))); + assert (passlen > 0); + + /* + * Build a buffer of salt + block_cnt and hmac_sha256 it we + * do this as we don't have a context builder for HMAC_SHA256. + */ + buffer = malloc(saltlen + sizeof(block_cnt)); + if (!buffer) + return -ENOMEM; + + memcpy(buffer, salt, saltlen); + memcpy(&buffer[saltlen], block_cnt, sizeof(block_cnt)); + + hmac_sha256(pass, passlen, buffer, saltlen + sizeof(block_cnt), u); + + /* dk needs to be an unmodified u as u gets modified in the loop */ + memcpy(ret_key, u, SHA256_DIGEST_SIZE); + uint8_t *dk = ret_key; + + for (size_t i = 1; i < PBKDF2_HMAC_SHA256_ITERATIONS; i++) { + hmac_sha256(pass, passlen, u, sizeof(u), u); + + for (size_t j=0; j < sizeof(u); j++) + dk[j] ^= u[j]; + } + + return 0; +} + +static const char* const tpm2_pcr_index_table[_TPM2_PCR_INDEX_MAX_DEFINED] = { + [TPM2_PCR_PLATFORM_CODE] = "platform-code", + [TPM2_PCR_PLATFORM_CONFIG] = "platform-config", + [TPM2_PCR_EXTERNAL_CODE] = "external-code", + [TPM2_PCR_EXTERNAL_CONFIG] = "external-config", + [TPM2_PCR_BOOT_LOADER_CODE] = "boot-loader-code", + [TPM2_PCR_BOOT_LOADER_CONFIG] = "boot-loader-config", + [TPM2_PCR_HOST_PLATFORM] = "host-platform", + 
[TPM2_PCR_SECURE_BOOT_POLICY] = "secure-boot-policy", + [TPM2_PCR_KERNEL_INITRD] = "kernel-initrd", + [TPM2_PCR_IMA] = "ima", + [TPM2_PCR_KERNEL_BOOT] = "kernel-boot", + [TPM2_PCR_KERNEL_CONFIG] = "kernel-config", + [TPM2_PCR_SYSEXTS] = "sysexts", + [TPM2_PCR_SHIM_POLICY] = "shim-policy", + [TPM2_PCR_SYSTEM_IDENTITY] = "system-identity", + [TPM2_PCR_DEBUG] = "debug", + [TPM2_PCR_APPLICATION_SUPPORT] = "application-support", +}; + +DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_FALLBACK(tpm2_pcr_index, int, TPM2_PCRS_MAX - 1); +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(tpm2_pcr_index, int); diff --git a/src/shared/tpm2-util.h b/src/shared/tpm2-util.h new file mode 100644 index 0000000..55d7481 --- /dev/null +++ b/src/shared/tpm2-util.h @@ -0,0 +1,478 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bitfield.h" +#include "io-util.h" +#include "json.h" +#include "macro.h" +#include "openssl-util.h" +#include "ordered-set.h" +#include "sha256.h" +#include "tpm2-pcr.h" + +typedef enum TPM2Flags { + TPM2_FLAGS_USE_PIN = 1 << 0, + TPM2_FLAGS_USE_PCRLOCK = 1 << 1, +} TPM2Flags; + +/* As per https://trustedcomputinggroup.org/wp-content/uploads/TCG_PCClient_PFP_r1p05_v23_pub.pdf a + * TPM2 on a Client PC must have at least 24 PCRs. This hardcodes our expectation of 24. */ +#define TPM2_PCRS_MAX 24U +#define TPM2_PCRS_MASK ((UINT32_C(1) << TPM2_PCRS_MAX) - 1) + +/* The SRK handle is defined in the Provisioning Guidance document (see above) in the table "Reserved Handles + * for TPM Provisioning Fundamental Elements". The SRK is useful because it is "shared", meaning it has no + * authValue nor authPolicy set, and thus may be used by anyone on the system to generate derived keys or + * seal secrets. This is useful if the TPM has an auth (password) set for the 'owner hierarchy', which would + * prevent users from generating primary transient keys, unless they knew the owner hierarchy auth. 
See + * the Provisioning Guidance document for more details. */ +#define TPM2_SRK_HANDLE UINT32_C(0x81000001) + +/* The TPM specification limits sealed data to MAX_SYM_DATA. Unfortunately, tpm2-tss incorrectly + * defines this value as 256; the TPM specification Part 2 ("Structures") section + * "TPMU_SENSITIVE_CREATE" states "For interoperability, MAX_SYM_DATA should be 128." */ +#define TPM2_MAX_SEALED_DATA UINT16_C(128) + +static inline bool TPM2_PCR_INDEX_VALID(unsigned pcr) { + return pcr < TPM2_PCRS_MAX; +} +static inline bool TPM2_PCR_MASK_VALID(uint32_t pcr_mask) { + return pcr_mask <= TPM2_PCRS_MASK; +} + +#define FOREACH_PCR_IN_MASK(pcr, mask) BIT_FOREACH(pcr, mask) + +#define TPM2_N_HASH_ALGORITHMS 4U + +#if HAVE_TPM2 + +#include +#include +#include + +int dlopen_tpm2(void); + +typedef struct { + unsigned n_ref; + + void *tcti_dl; + TSS2_TCTI_CONTEXT *tcti_context; + ESYS_CONTEXT *esys_context; + + /* Some selected cached capabilities of the TPM */ + TPMS_ALG_PROPERTY *capability_algorithms; + size_t n_capability_algorithms; + TPMA_CC *capability_commands; + size_t n_capability_commands; + TPM2_ECC_CURVE *capability_ecc_curves; + size_t n_capability_ecc_curves; + TPML_PCR_SELECTION capability_pcrs; +} Tpm2Context; + +int tpm2_context_new(const char *device, Tpm2Context **ret_context); +Tpm2Context *tpm2_context_ref(Tpm2Context *context); +Tpm2Context *tpm2_context_unref(Tpm2Context *context); +DEFINE_TRIVIAL_CLEANUP_FUNC(Tpm2Context*, tpm2_context_unref); + +typedef struct { + Tpm2Context *tpm2_context; + ESYS_TR esys_handle; + + bool flush; +} Tpm2Handle; + +#define _tpm2_handle(c, h) { .tpm2_context = (c), .esys_handle = (h), } +static const Tpm2Handle TPM2_HANDLE_NONE = _tpm2_handle(NULL, ESYS_TR_NONE); + +void Esys_Freep(void *p); + +int tpm2_handle_new(Tpm2Context *context, Tpm2Handle **ret_handle); +Tpm2Handle *tpm2_handle_free(Tpm2Handle *handle); +DEFINE_TRIVIAL_CLEANUP_FUNC(Tpm2Handle*, tpm2_handle_free); + +typedef struct { + unsigned index; + 
TPMI_ALG_HASH hash; + TPM2B_DIGEST value; +} Tpm2PCRValue; + +#define TPM2_PCR_VALUE_MAKE(i, h, v) \ + (Tpm2PCRValue) { \ + .index = (i), \ + .hash = (h), \ + .value = ((TPM2B_DIGEST) v), \ + } + +bool tpm2_pcr_value_valid(const Tpm2PCRValue *pcr_value); +bool tpm2_pcr_values_has_any_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values); +bool tpm2_pcr_values_has_all_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values); +int tpm2_pcr_value_from_string(const char *arg, Tpm2PCRValue *ret_pcr_value); +char *tpm2_pcr_value_to_string(const Tpm2PCRValue *pcr_value); + +bool tpm2_pcr_values_valid(const Tpm2PCRValue *pcr_values, size_t n_pcr_values); +void tpm2_sort_pcr_values(Tpm2PCRValue *pcr_values, size_t n_pcr_values); +int tpm2_pcr_values_from_mask(uint32_t mask, TPMI_ALG_HASH hash, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values); +int tpm2_pcr_values_to_mask(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPMI_ALG_HASH hash, uint32_t *ret_mask); +int tpm2_pcr_values_from_string(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values); +char *tpm2_pcr_values_to_string(const Tpm2PCRValue *pcr_values, size_t n_pcr_values); +int tpm2_pcr_values_hash_count(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, size_t *ret_count); +int tpm2_tpml_pcr_selection_from_pcr_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPML_PCR_SELECTION *ret_selection, TPM2B_DIGEST **ret_values, size_t *ret_n_values); + +int tpm2_make_encryption_session(Tpm2Context *c, const Tpm2Handle *primary, const Tpm2Handle *bind_key, Tpm2Handle **ret_session); + +int tpm2_create_primary(Tpm2Context *c, const Tpm2Handle *session, const TPM2B_PUBLIC *template, const TPM2B_SENSITIVE_CREATE *sensitive, TPM2B_PUBLIC **ret_public, Tpm2Handle **ret_handle); +int tpm2_create(Tpm2Context *c, const Tpm2Handle *parent, const Tpm2Handle *session, const TPMT_PUBLIC *template, const TPMS_SENSITIVE_CREATE *sensitive, TPM2B_PUBLIC **ret_public, TPM2B_PRIVATE 
**ret_private); +int tpm2_create_loaded(Tpm2Context *c, const Tpm2Handle *parent, const Tpm2Handle *session, const TPMT_PUBLIC *template, const TPMS_SENSITIVE_CREATE *sensitive, TPM2B_PUBLIC **ret_public, TPM2B_PRIVATE **ret_private, Tpm2Handle **ret_handle); +int tpm2_load(Tpm2Context *c, const Tpm2Handle *parent, const Tpm2Handle *session, const TPM2B_PUBLIC *public, const TPM2B_PRIVATE *private, Tpm2Handle **ret_handle); +int tpm2_marshal_public(const TPM2B_PUBLIC *public, void **ret, size_t *ret_size); +int tpm2_marshal_nv_public(const TPM2B_NV_PUBLIC *nv_public, void **ret, size_t *ret_size); +int tpm2_unmarshal_nv_public(const void *data, size_t size, TPM2B_NV_PUBLIC *ret_nv_public); +int tpm2_marshal_blob(const TPM2B_PUBLIC *public, const TPM2B_PRIVATE *private, const TPM2B_ENCRYPTED_SECRET *seed, void **ret_blob, size_t *ret_blob_size); +int tpm2_unmarshal_blob(const void *blob, size_t blob_size, TPM2B_PUBLIC *ret_public, TPM2B_PRIVATE *ret_private, TPM2B_ENCRYPTED_SECRET *ret_seed); + +bool tpm2_supports_alg(Tpm2Context *c, TPM2_ALG_ID alg); +bool tpm2_supports_command(Tpm2Context *c, TPM2_CC command); +bool tpm2_supports_ecc_curve(Tpm2Context *c, TPM2_ECC_CURVE ecc_curve); + +bool tpm2_test_parms(Tpm2Context *c, TPMI_ALG_PUBLIC alg, const TPMU_PUBLIC_PARMS *parms); + +int tpm2_get_good_pcr_banks(Tpm2Context *c, uint32_t pcr_mask, TPMI_ALG_HASH **ret_banks); +int tpm2_get_good_pcr_banks_strv(Tpm2Context *c, uint32_t pcr_mask, char ***ret); +int tpm2_get_best_pcr_bank(Tpm2Context *c, uint32_t pcr_mask, TPMI_ALG_HASH *ret); + +const char *tpm2_userspace_log_path(void); +const char *tpm2_firmware_log_path(void); + +typedef enum Tpm2UserspaceEventType { + TPM2_EVENT_PHASE, + TPM2_EVENT_FILESYSTEM, + TPM2_EVENT_VOLUME_KEY, + TPM2_EVENT_MACHINE_ID, + _TPM2_USERSPACE_EVENT_TYPE_MAX, + _TPM2_USERSPACE_EVENT_TYPE_INVALID = -EINVAL, +} Tpm2UserspaceEventType; + +const char* tpm2_userspace_event_type_to_string(Tpm2UserspaceEventType type) _const_; 
+Tpm2UserspaceEventType tpm2_userspace_event_type_from_string(const char *s) _pure_; + +int tpm2_extend_bytes(Tpm2Context *c, char **banks, unsigned pcr_index, const void *data, size_t data_size, const void *secret, size_t secret_size, Tpm2UserspaceEventType event, const char *description); + +uint32_t tpm2_tpms_pcr_selection_to_mask(const TPMS_PCR_SELECTION *s); +void tpm2_tpms_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash, TPMS_PCR_SELECTION *ret); +bool tpm2_tpms_pcr_selection_has_mask(const TPMS_PCR_SELECTION *s, uint32_t mask); +void tpm2_tpms_pcr_selection_add_mask(TPMS_PCR_SELECTION *s, uint32_t mask); +void tpm2_tpms_pcr_selection_sub_mask(TPMS_PCR_SELECTION *s, uint32_t mask); +void tpm2_tpms_pcr_selection_add(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b); +void tpm2_tpms_pcr_selection_sub(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b); +void tpm2_tpms_pcr_selection_move(TPMS_PCR_SELECTION *a, TPMS_PCR_SELECTION *b); +char *tpm2_tpms_pcr_selection_to_string(const TPMS_PCR_SELECTION *s); +size_t tpm2_tpms_pcr_selection_weight(const TPMS_PCR_SELECTION *s); +#define tpm2_tpms_pcr_selection_is_empty(s) (tpm2_tpms_pcr_selection_weight(s) == 0) + +uint32_t tpm2_tpml_pcr_selection_to_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash); +void tpm2_tpml_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash, TPML_PCR_SELECTION *ret); +bool tpm2_tpml_pcr_selection_has_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask); +void tpm2_tpml_pcr_selection_add_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask); +void tpm2_tpml_pcr_selection_sub_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask); +void tpm2_tpml_pcr_selection_add_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s); +void tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s); +void tpm2_tpml_pcr_selection_add(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b); +void 
tpm2_tpml_pcr_selection_sub(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b); +char *tpm2_tpml_pcr_selection_to_string(const TPML_PCR_SELECTION *l); +size_t tpm2_tpml_pcr_selection_weight(const TPML_PCR_SELECTION *l); +#define tpm2_tpml_pcr_selection_is_empty(l) (tpm2_tpml_pcr_selection_weight(l) == 0) + +int tpm2_digest_many(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest, const struct iovec data[], size_t count, bool extend); +static inline int tpm2_digest_buffer(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest, const void *data, size_t len, bool extend) { + return tpm2_digest_many(alg, digest, &IOVEC_MAKE((void*) data, len), 1, extend); +} +int tpm2_digest_many_digests(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest, const TPM2B_DIGEST data[], size_t count, bool extend); +static inline int tpm2_digest_rehash(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest) { + return tpm2_digest_many(alg, digest, NULL, 0, true); +} +static inline int tpm2_digest_init(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest) { + return tpm2_digest_many(alg, digest, NULL, 0, false); +} + +void tpm2_log_debug_tpml_pcr_selection(const TPML_PCR_SELECTION *l, const char *msg); +void tpm2_log_debug_pcr_value(const Tpm2PCRValue *pcr_value, const char *msg); +void tpm2_log_debug_buffer(const void *buffer, size_t size, const char *msg); +void tpm2_log_debug_digest(const TPM2B_DIGEST *digest, const char *msg); +void tpm2_log_debug_name(const TPM2B_NAME *name, const char *msg); + +typedef struct Tpm2PCRPredictionResult { + TPM2B_DIGEST hash[TPM2_N_HASH_ALGORITHMS]; /* a hash for each potential algorithm */ +} Tpm2PCRPredictionResult; + +TPM2B_DIGEST *tpm2_pcr_prediction_result_get_hash(Tpm2PCRPredictionResult *result, uint16_t alg); + +/* A structure encapsulating a full set of PCR predictions with alternatives. This can be converted into a + * series of PolicyOR + PolicyPCR items for the TPM. 
*/
+typedef struct Tpm2PCRPrediction {
+        uint32_t pcrs;   /* A mask of pcrs included */
+        OrderedSet* results[TPM2_PCRS_MAX]; /* set of Tpm2PCRPredictionResult objects, one for each PCR */
+} Tpm2PCRPrediction;
+
+void tpm2_pcr_prediction_done(Tpm2PCRPrediction *p);
+
+extern const struct hash_ops tpm2_pcr_prediction_result_hash_ops;
+
+bool tpm2_pcr_prediction_equal(Tpm2PCRPrediction *a, Tpm2PCRPrediction *b, uint16_t algorithm);
+
+int tpm2_pcr_prediction_to_json(const Tpm2PCRPrediction *prediction, uint16_t algorithm, JsonVariant **ret);
+int tpm2_pcr_prediction_from_json(Tpm2PCRPrediction *prediction, uint16_t algorithm, JsonVariant *aj);
+
+/* A structure encapsulating all metadata stored for a pcrlock policy on disk */
+typedef struct Tpm2PCRLockPolicy {
+        /* The below is the fixed metadata encoding information about the NV index we store the
+         * PolicyAuthorizeNV policy in, as well as a pinned SRK, and the encrypted PIN to use for writing to
+         * the NV Index. */
+        uint16_t algorithm;
+        uint32_t nv_index;
+        struct iovec nv_handle;
+        struct iovec nv_public;
+        struct iovec srk_handle;
+        struct iovec pin_public;
+        struct iovec pin_private;
+
+        /* The below contains the current prediction whose resulting policy is stored in the NV
+         * index. Once in JSON and once in parsed form. When the policy is updated the fields below are
+         * changed, the fields above remain fixed.
*/ + JsonVariant *prediction_json; + Tpm2PCRPrediction prediction; +} Tpm2PCRLockPolicy; + +void tpm2_pcrlock_policy_done(Tpm2PCRLockPolicy *data); +int tpm2_pcrlock_search_file(const char *path, FILE **ret_file, char **ret_path); +int tpm2_pcrlock_policy_load(const char *path, Tpm2PCRLockPolicy *ret_policy); + +int tpm2_index_to_handle(Tpm2Context *c, TPM2_HANDLE index, const Tpm2Handle *session, TPM2B_PUBLIC **ret_public, TPM2B_NAME **ret_name, TPM2B_NAME **ret_qname, Tpm2Handle **ret_handle); +int tpm2_index_from_handle(Tpm2Context *c, const Tpm2Handle *handle, TPM2_HANDLE *ret_index); + +int tpm2_pcr_read(Tpm2Context *c, const TPML_PCR_SELECTION *pcr_selection, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values); +int tpm2_pcr_read_missing_values(Tpm2Context *c, Tpm2PCRValue *pcr_values, size_t n_pcr_values); + +int tpm2_get_pin_auth(TPMI_ALG_HASH hash, const char *pin, TPM2B_AUTH *ret_auth); +int tpm2_set_auth(Tpm2Context *c, const Tpm2Handle *handle, const char *pin); +int tpm2_set_auth_binary(Tpm2Context *c, const Tpm2Handle *handle, const TPM2B_AUTH *auth); + +int tpm2_make_policy_session(Tpm2Context *c, const Tpm2Handle *primary, const Tpm2Handle *encryption_session, Tpm2Handle **ret_session); + +int tpm2_policy_auth_value(Tpm2Context *c, const Tpm2Handle *session, TPM2B_DIGEST **ret_policy_digest); +int tpm2_policy_authorize_nv(Tpm2Context *c, const Tpm2Handle *session, const Tpm2Handle *nv_handle, TPM2B_DIGEST **ret_policy_digest); +int tpm2_policy_pcr(Tpm2Context *c, const Tpm2Handle *session, const TPML_PCR_SELECTION *pcr_selection, TPM2B_DIGEST **ret_policy_digest); +int tpm2_policy_or(Tpm2Context *c, const Tpm2Handle *session, const TPM2B_DIGEST *branches, size_t n_branches, TPM2B_DIGEST **ret_policy_digest); +int tpm2_policy_super_pcr(Tpm2Context *c, const Tpm2Handle *session, const Tpm2PCRPrediction *prediction, uint16_t algorithm); + +int tpm2_calculate_pubkey_name(const TPMT_PUBLIC *public, TPM2B_NAME *ret_name); +int 
tpm2_calculate_nv_index_name(const TPMS_NV_PUBLIC *nvpublic, TPM2B_NAME *ret_name); + +int tpm2_calculate_policy_auth_value(TPM2B_DIGEST *digest); +int tpm2_calculate_policy_authorize(const TPM2B_PUBLIC *public, const TPM2B_DIGEST *policy_ref, TPM2B_DIGEST *digest); +int tpm2_calculate_policy_authorize_nv(const TPM2B_NV_PUBLIC *public, TPM2B_DIGEST *digest); +int tpm2_calculate_policy_pcr(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPM2B_DIGEST *digest); +int tpm2_calculate_policy_or(const TPM2B_DIGEST *branches, size_t n_branches, TPM2B_DIGEST *digest); +int tpm2_calculate_policy_super_pcr(Tpm2PCRPrediction *prediction, uint16_t algorithm, TPM2B_DIGEST *pcr_policy); +int tpm2_calculate_serialize(TPM2_HANDLE handle, const TPM2B_NAME *name, const TPM2B_PUBLIC *public, void **ret_serialized, size_t *ret_serialized_size); +int tpm2_calculate_sealing_policy(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, const TPM2B_PUBLIC *public, bool use_pin, const Tpm2PCRLockPolicy *policy, TPM2B_DIGEST *digest); +int tpm2_calculate_seal(TPM2_HANDLE parent_handle, const TPM2B_PUBLIC *parent_public, const TPMA_OBJECT *attributes, const void *secret, size_t secret_size, const TPM2B_DIGEST *policy, const char *pin, void **ret_secret, size_t *ret_secret_size, void **ret_blob, size_t *ret_blob_size, void **ret_serialized_parent, size_t *ret_serialized_parent_size); + +int tpm2_get_srk_template(TPMI_ALG_PUBLIC alg, TPMT_PUBLIC *ret_template); +int tpm2_get_best_srk_template(Tpm2Context *c, TPMT_PUBLIC *ret_template); + +int tpm2_get_srk(Tpm2Context *c, const Tpm2Handle *session, TPM2B_PUBLIC **ret_public, TPM2B_NAME **ret_name, TPM2B_NAME **ret_qname, Tpm2Handle **ret_handle); +int tpm2_get_or_create_srk(Tpm2Context *c, const Tpm2Handle *session, TPM2B_PUBLIC **ret_public, TPM2B_NAME **ret_name, TPM2B_NAME **ret_qname, Tpm2Handle **ret_handle); + +int tpm2_seal(Tpm2Context *c, uint32_t seal_key_handle, const TPM2B_DIGEST *policy, const char *pin, void **ret_secret, size_t 
*ret_secret_size, void **ret_blob, size_t *ret_blob_size, uint16_t *ret_primary_alg, void **ret_srk_buf, size_t *ret_srk_buf_size); +int tpm2_unseal(Tpm2Context *c, uint32_t hash_pcr_mask, uint16_t pcr_bank, const void *pubkey, size_t pubkey_size, uint32_t pubkey_pcr_mask, JsonVariant *signature, const char *pin, const Tpm2PCRLockPolicy *pcrlock_policy, uint16_t primary_alg, const void *blob, size_t blob_size, const void *policy_hash, size_t policy_hash_size, const void *srk_buf, size_t srk_buf_size, void **ret_secret, size_t *ret_secret_size); + +#if HAVE_OPENSSL +int tpm2_tpm2b_public_to_openssl_pkey(const TPM2B_PUBLIC *public, EVP_PKEY **ret); +int tpm2_tpm2b_public_from_openssl_pkey(const EVP_PKEY *pkey, TPM2B_PUBLIC *ret); +#endif + +int tpm2_tpm2b_public_from_pem(const void *pem, size_t pem_size, TPM2B_PUBLIC *ret); +int tpm2_tpm2b_public_to_fingerprint(const TPM2B_PUBLIC *public, void **ret_fingerprint, size_t *ret_fingerprint_size); + +int tpm2_define_policy_nv_index(Tpm2Context *c, const Tpm2Handle *session, TPM2_HANDLE requested_nv_index, const TPM2B_DIGEST *write_policy, const char *pin, const TPM2B_AUTH *auth, TPM2_HANDLE *ret_nv_index, Tpm2Handle **ret_nv_handle, TPM2B_NV_PUBLIC *ret_nv_public); +int tpm2_write_policy_nv_index(Tpm2Context *c, const Tpm2Handle *policy_session, TPM2_HANDLE nv_index, const Tpm2Handle *nv_handle, const TPM2B_DIGEST *policy_digest); +int tpm2_undefine_policy_nv_index(Tpm2Context *c, const Tpm2Handle *session, TPM2_HANDLE nv_index, const Tpm2Handle *nv_handle); + +int tpm2_seal_data(Tpm2Context *c, const struct iovec *data, const Tpm2Handle *primary_handle, const Tpm2Handle *encryption_session, const TPM2B_DIGEST *policy, struct iovec *ret_public, struct iovec *ret_private); +int tpm2_unseal_data(Tpm2Context *c, const struct iovec *public, const struct iovec *private, const Tpm2Handle *primary_handle, const Tpm2Handle *policy_session, const Tpm2Handle *encryption_session, struct iovec *ret_data); + +int 
tpm2_serialize(Tpm2Context *c, const Tpm2Handle *handle, void **ret_serialized, size_t *ret_serialized_size); +int tpm2_deserialize(Tpm2Context *c, const void *serialized, size_t serialized_size, Tpm2Handle **ret_handle); + +int tpm2_load_public_key_file(const char *path, TPM2B_PUBLIC *ret); + +/* The tpm2-tss library has many structs that are simply a combination of an array (or object) and + * size. These macros allow easily initializing or assigning instances of such structs from an existing + * buffer/object and size, while also checking the size for safety with the struct buffer/object size. If the + * provided buffer/object is NULL, the resulting struct's buffer/object will be 0s. If the provided size is + * larger than the struct's buffer/object size, this results in assertion failure; to check the size, use one + * of the TPM2B_*_CHECK_SIZE() macros. */ +#define TPM2B_AUTH_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_AUTH, buffer, size) +#define TPM2B_DATA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_DATA, buffer, size) +#define TPM2B_DIGEST_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_DIGEST, buffer, size) +#define TPM2B_ECC_PARAMETER_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_ECC_PARAMETER, buffer, size) +#define TPM2B_ENCRYPTED_SECRET_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_ENCRYPTED_SECRET, secret, size) +#define TPM2B_MAX_BUFFER_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_MAX_BUFFER, buffer, size) +#define TPM2B_NAME_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_NAME, name, size) +#define TPM2B_PRIVATE_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_PRIVATE, buffer, size) +#define TPM2B_PRIVATE_KEY_RSA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_PRIVATE_KEY_RSA, buffer, size) +#define TPM2B_PUBLIC_KEY_RSA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_PUBLIC_KEY_RSA, buffer, size) +#define TPM2B_SENSITIVE_DATA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_SENSITIVE_DATA, buffer, size) +#define 
TPM2B_BUF_SIZE_STRUCT_MAKE(buf, size, struct_type, buffer_field, size_field) \ + _TPM2B_BUF_SIZE_STRUCT_MAKE(buf, size, UNIQ, struct_type, buffer_field, size_field) +#define _TPM2B_BUF_SIZE_STRUCT_MAKE(buf, size, uniq, struct_type, buffer_field, size_field) \ + ({ \ + typeof(buf) UNIQ_T(BUF, uniq) = (buf); \ + typeof(size) UNIQ_T(SIZE, uniq) = (size); \ + struct_type UNIQ_T(STRUCT, uniq) = { .size_field = UNIQ_T(SIZE, uniq), }; \ + assert(sizeof(UNIQ_T(STRUCT, uniq).buffer_field) >= (size_t) UNIQ_T(SIZE, uniq)); \ + if (UNIQ_T(BUF, uniq)) \ + memcpy_safe(UNIQ_T(STRUCT, uniq).buffer_field, UNIQ_T(BUF, uniq), UNIQ_T(SIZE, uniq)); \ + UNIQ_T(STRUCT, uniq); \ + }) + +/* Check if the size will fit in the TPM2B struct buffer. Returns 0 if the size will fit, otherwise this logs + * a debug message and returns < 0. */ +#define TPM2B_AUTH_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_AUTH, buffer) +#define TPM2B_DATA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_DATA, buffer) +#define TPM2B_DIGEST_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_DIGEST, buffer) +#define TPM2B_ECC_PARAMETER_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_ECC_PARAMETER, buffer) +#define TPM2B_ENCRYPTED_SECRET_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_ENCRYPTED_SECRET, buffer) +#define TPM2B_MAX_BUFFER_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_MAX_BUFFER, buffer) +#define TPM2B_NAME_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_NAME, name) +#define TPM2B_PRIVATE_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_PRIVATE, buffer) +#define TPM2B_PRIVATE_KEY_RSA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_PRIVATE_KEY_RSA, buffer) +#define TPM2B_PUBLIC_KEY_RSA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_PUBLIC_KEY_RSA, buffer) +#define TPM2B_SENSITIVE_DATA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_SENSITIVE_DATA, buffer) +#define TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(size, struct_type, 
buffer_field) \ + _TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(size, UNIQ, struct_type, buffer_field) +#define _TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(size, uniq, struct_type, buffer_field) \ + ({ \ + size_t UNIQ_T(SIZE, uniq) = (size_t) (size); \ + size_t UNIQ_T(BUFSIZE, uniq) = sizeof_field(struct_type, buffer_field); \ + UNIQ_T(BUFSIZE, uniq) < UNIQ_T(SIZE, uniq) ? \ + log_debug_errno(SYNTHETIC_ERRNO(EINVAL), \ + "Size %zu larger than " #struct_type " buffer size %zu.", \ + UNIQ_T(SIZE, uniq), UNIQ_T(BUFSIZE, uniq)) : \ + 0; \ + }) + +#else /* HAVE_TPM2 */ +typedef struct {} Tpm2Context; +typedef struct {} Tpm2Handle; +typedef struct {} Tpm2PCRValue; + +#define TPM2_PCR_VALUE_MAKE(i, h, v) (Tpm2PCRValue) {} + +static inline int tpm2_pcrlock_search_file(const char *path, FILE **ret_file, char **ret_path) { + return -ENOENT; +} + +#endif /* HAVE_TPM2 */ + +int tpm2_list_devices(void); +int tpm2_find_device_auto(char **ret); + +int tpm2_make_pcr_json_array(uint32_t pcr_mask, JsonVariant **ret); +int tpm2_parse_pcr_json_array(JsonVariant *v, uint32_t *ret); + +int tpm2_make_luks2_json(int keyslot, uint32_t hash_pcr_mask, uint16_t pcr_bank, const void *pubkey, size_t pubkey_size, uint32_t pubkey_pcr_mask, uint16_t primary_alg, const void *blob, size_t blob_size, const void *policy_hash, size_t policy_hash_size, const void *salt, size_t salt_size, const void *srk_buf, size_t srk_buf_size, TPM2Flags flags, JsonVariant **ret); +int tpm2_parse_luks2_json(JsonVariant *v, int *ret_keyslot, uint32_t *ret_hash_pcr_mask, uint16_t *ret_pcr_bank, void **ret_pubkey, size_t *ret_pubkey_size, uint32_t *ret_pubkey_pcr_mask, uint16_t *ret_primary_alg, void **ret_blob, size_t *ret_blob_size, void **ret_policy_hash, size_t *ret_policy_hash_size, void **ret_salt, size_t *ret_salt_size, void **ret_srk_buf, size_t *ret_srk_buf_size, TPM2Flags *ret_flags); + +/* Default to PCR 7 only */ +#define TPM2_PCR_INDEX_DEFAULT UINT32_C(7) +#define TPM2_PCR_MASK_DEFAULT INDEX_TO_MASK(uint32_t, 
TPM2_PCR_INDEX_DEFAULT)
+
+/* We want the helpers below to work also if TPM2 libs are not available, hence define these algorithm
+ * identifiers (four hash algorithms plus ECC and RSA) if they are missing. */
+#ifndef TPM2_ALG_SHA1
+#define TPM2_ALG_SHA1 0x4
+#endif
+
+#ifndef TPM2_ALG_SHA256
+#define TPM2_ALG_SHA256 0xB
+#endif
+
+#ifndef TPM2_ALG_SHA384
+#define TPM2_ALG_SHA384 0xC
+#endif
+
+#ifndef TPM2_ALG_SHA512
+#define TPM2_ALG_SHA512 0xD
+#endif
+
+#ifndef TPM2_ALG_ECC
+#define TPM2_ALG_ECC 0x23
+#endif
+
+#ifndef TPM2_ALG_RSA
+#define TPM2_ALG_RSA 0x1
+#endif
+
+int tpm2_hash_alg_to_size(uint16_t alg);
+
+const char *tpm2_hash_alg_to_string(uint16_t alg) _const_;
+int tpm2_hash_alg_from_string(const char *alg) _pure_;
+
+const char *tpm2_asym_alg_to_string(uint16_t alg) _const_;
+int tpm2_asym_alg_from_string(const char *alg) _pure_;
+
+const char *tpm2_sym_alg_to_string(uint16_t alg) _const_;
+int tpm2_sym_alg_from_string(const char *alg) _pure_;
+
+const char *tpm2_sym_mode_to_string(uint16_t mode) _const_;
+int tpm2_sym_mode_from_string(const char *mode) _pure_;
+
+char *tpm2_pcr_mask_to_string(uint32_t mask);
+
+extern const uint16_t tpm2_hash_algorithms[];
+
+typedef struct {
+        uint32_t search_pcr_mask;
+        const char *device;
+        const char *signature_path;
+        const char *pcrlock_path;
+} systemd_tpm2_plugin_params;
+
+typedef enum Tpm2Support {
+        /* NOTE! The systemd-creds tool returns these flags 1:1 as exit status. Hence these flags are pretty
+         * much ABI! Hence, be extra careful when changing/extending these definitions.
*/ + TPM2_SUPPORT_NONE = 0, /* no support */ + TPM2_SUPPORT_FIRMWARE = 1 << 0, /* firmware reports TPM2 was used */ + TPM2_SUPPORT_DRIVER = 1 << 1, /* the kernel has a driver loaded for it */ + TPM2_SUPPORT_SYSTEM = 1 << 2, /* we support it ourselves */ + TPM2_SUPPORT_SUBSYSTEM = 1 << 3, /* the kernel has the tpm subsystem enabled */ + TPM2_SUPPORT_LIBRARIES = 1 << 4, /* we can dlopen the tpm2 libraries */ + TPM2_SUPPORT_FULL = TPM2_SUPPORT_FIRMWARE|TPM2_SUPPORT_DRIVER|TPM2_SUPPORT_SYSTEM|TPM2_SUPPORT_SUBSYSTEM|TPM2_SUPPORT_LIBRARIES, +} Tpm2Support; + +Tpm2Support tpm2_support(void); + +int tpm2_parse_pcr_argument(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values); +int tpm2_parse_pcr_argument_append(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values); +int tpm2_parse_pcr_argument_to_mask(const char *arg, uint32_t *mask); + +int tpm2_load_pcr_signature(const char *path, JsonVariant **ret); +int tpm2_load_pcr_public_key(const char *path, void **ret_pubkey, size_t *ret_pubkey_size); + +int tpm2_util_pbkdf2_hmac_sha256(const void *pass, + size_t passlen, + const void *salt, + size_t saltlen, + uint8_t res[static SHA256_DIGEST_SIZE]); + +enum { + /* Additional defines for the PCR index naming enum from "fundamental/tpm2-pcr.h" */ + _TPM2_PCR_INDEX_MAX_DEFINED = TPM2_PCRS_MAX, + _TPM2_PCR_INDEX_INVALID = -EINVAL, +}; + +int tpm2_pcr_index_from_string(const char *s) _pure_; +const char *tpm2_pcr_index_to_string(int pcr) _const_; diff --git a/src/shared/udev-util.c b/src/shared/udev-util.c new file mode 100644 index 0000000..cf28ba8 --- /dev/null +++ b/src/shared/udev-util.c @@ -0,0 +1,439 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "device-nodes.h" +#include "device-private.h" +#include "device-util.h" +#include "env-file.h" +#include "errno-util.h" +#include "fd-util.h" +#include "id128-util.h" +#include "log.h" +#include "macro.h" +#include 
"parse-util.h"
+#include "path-util.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "udev-util.h"
+#include "utf8.h"
+
+/* Applies a log level given as a string (udev_parse_config() below passes the "udev_log" value here).
+ * One pair of enclosing double or single quotes is stripped first: the closing quote is overwritten with
+ * NUL, which is why the argument is not const, and the opening quote is skipped. Returns 0 without doing
+ * anything if isempty() reports the string as empty, otherwise the result of
+ * log_set_max_level_from_string(). */
+int udev_set_max_log_level(char *str) {
+        size_t n;
+
+        /* This may modify input string. */
+
+        if (isempty(str))
+                return 0;
+
+        /* unquote */
+        n = strlen(str);
+        if (n >= 2 &&
+            ((str[0] == '"' && str[n - 1] == '"') ||
+             (str[0] == '\'' && str[n - 1] == '\''))) {
+                str[n - 1] = '\0';
+                str++;
+        }
+
+        /* we set the udev log level here explicitly, this is supposed
+         * to regulate the code in libudev/ and udev/. */
+        return log_set_max_level_from_string(str);
+}
+
+/* Reads the "udev_log" setting from /etc/udev/udev.conf and applies it via udev_set_max_log_level().
+ * A missing file (-ENOENT) is not an error; any other parse_env_file() failure is returned as is. A value
+ * that cannot be applied is only logged as a warning, and 0 is returned nonetheless. */
+int udev_parse_config(void) {
+        _cleanup_free_ char *log_val = NULL;
+        int r;
+
+        r = parse_env_file(NULL, "/etc/udev/udev.conf",
+                           "udev_log", &log_val);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return r;
+
+        r = udev_set_max_log_level(log_val);
+        if (r < 0)
+                log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r,
+                           "Failed to set udev log level '%s', ignoring: %m", log_val);
+
+        return 0;
+}
+
+/* Context handed to device_monitor_handler(): sysname and/or devlink name the device to wait for, and
+ * device receives a reference to the device from the first matching event. */
+struct DeviceMonitorData {
+        const char *sysname;
+        const char *devlink;
+        sd_device *device;
+};
+
+/* Cleanup helper for struct DeviceMonitorData: drops the device reference. The two strings are not freed
+ * here. */
+static void device_monitor_data_free(struct DeviceMonitorData *d) {
+        assert(d);
+
+        sd_device_unref(d->device);
+}
+
+/* Device monitor callback. REMOVE events are ignored. For any other event whose sysname equals
+ * data->sysname, or where one of the device's devlinks or its devname equals data->devlink, a reference to
+ * the device is stored in data->device and the event loop is asked to exit with code 0. Returns 0 for
+ * events that do not match. */
+static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) {
+        struct DeviceMonitorData *data = ASSERT_PTR(userdata);
+        const char *sysname;
+
+        assert(device);
+        assert(data->sysname || data->devlink);
+        assert(!data->device);
+
+        /* Ignore REMOVE events here. We are waiting for initialization after all, not de-initialization. We
+         * might see a REMOVE event from an earlier use of the device (devices by the same name are recycled
+         * by the kernel after all), which we should not get confused by. After all we cannot distinguish use
+         * cycles of the devices, as the udev queue is entirely asynchronous.
+         *
+         * If we see a REMOVE event here for the use cycle we actually care about then we won't notice of
+         * course, but that should be OK, given the timeout logic used on the wait loop: this will be noticed
+         * by means of -ETIMEDOUT. Thus we won't notice immediately, but eventually, and that should be
+         * sufficient for an error path that should regularly not happen.
+         *
+         * (And yes, we only need to special case REMOVE. It's the only "negative" event type, where a device
+         * ceases to exist. All other event types are "positive": the device exists and is registered in the
+         * udev database, thus whenever we see the event, we can consider it initialized.) */
+        if (device_for_action(device, SD_DEVICE_REMOVE))
+                return 0;
+
+        if (data->sysname && sd_device_get_sysname(device, &sysname) >= 0 && streq(sysname, data->sysname))
+                goto found;
+
+        if (data->devlink) {
+                const char *devlink;
+
+                FOREACH_DEVICE_DEVLINK(device, link)
+                        if (path_equal(link, data->devlink))
+                                goto found;
+
+                if (sd_device_get_devname(device, &devlink) >= 0 && path_equal(devlink, data->devlink))
+                        goto found;
+        }
+
+        return 0;
+
+found:
+        data->device = sd_device_ref(device);
+        return sd_event_exit(sd_device_monitor_get_event(monitor), 0);
+}
+
+/* Waits until a device is reported as initialized by sd_device_get_is_initialized().
+ *
+ * _device:      the device to wait for, or NULL if it is identified by devlink instead.
+ * devlink:      path to match against the devlinks/devname of incoming events, or NULL.
+ * subsystem:    subsystem to restrict the monitor to; required when _device is NULL (see the assert), and
+ *               otherwise looked up from the device if not given.
+ * timeout_usec: relative timeout, or USEC_INFINITY to wait without one.
+ * ret:          if non-NULL, receives a reference to the initialized device.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. The initialization state is checked
+ * once up front and once more after the monitor has been started, before entering the event loop. */
+static int device_wait_for_initialization_internal(
+                sd_device *_device,
+                const char *devlink,
+                const char *subsystem,
+                usec_t timeout_usec,
+                sd_device **ret) {
+
+        _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        /* Ensure that if !_device && devlink, device gets unrefd on errors since it will be new */
+        _cleanup_(sd_device_unrefp) sd_device *device = sd_device_ref(_device);
+        _cleanup_(device_monitor_data_free) struct DeviceMonitorData data = {
+                .devlink = devlink,
+        };
+        int r;
+
+        assert(device || (subsystem && devlink));
+
+        /* Devlink might already exist, if it does get the device to use the sysname filtering */
+        if (!device && devlink) {
+                r = sd_device_new_from_devname(&device, devlink);
+                if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r))
+                        return log_error_errno(r, "Failed to create sd-device object from %s: %m", devlink);
+        }
+
+        if (device) {
+                /* Fast path: nothing to wait for */
+                if (sd_device_get_is_initialized(device) > 0) {
+                        if (ret)
+                                *ret = sd_device_ref(device);
+                        return 0;
+                }
+                /* We need either the sysname or the devlink for filtering */
+                assert_se(sd_device_get_sysname(device, &data.sysname) >= 0 || devlink);
+        }
+
+        /* Wait until the device is initialized, so that we can get access to the ID_PATH property */
+
+        /* NOTE(review): the message below speaks of the "default event", while the call made here is
+         * sd_event_new(). */
+        r = sd_event_new(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get default event: %m");
+
+        r = sd_device_monitor_new(&monitor);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire monitor: %m");
+
+        /* No subsystem given by the caller: try to take it from the device; -ENOENT is tolerated */
+        if (device && !subsystem) {
+                r = sd_device_get_subsystem(device, &subsystem);
+                if (r < 0 && r != -ENOENT)
+                        return log_device_error_errno(device, r, "Failed to get subsystem: %m");
+        }
+
+        if (subsystem) {
+                r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, subsystem, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add %s subsystem match to monitor: %m", subsystem);
+        }
+
+        /* Give the monitor a description; this is best effort, failures are ignored */
+        _cleanup_free_ char *desc = NULL;
+        const char *sysname = NULL;
+        if (device)
+                (void) sd_device_get_sysname(device, &sysname);
+
+        desc = strjoin(sysname ?: subsystem, devlink ? ":" : ":initialization", devlink);
+        if (desc)
+                (void) sd_device_monitor_set_description(monitor, desc);
+
+        r = sd_device_monitor_attach_event(monitor, event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach event to device monitor: %m");
+
+        r = sd_device_monitor_start(monitor, device_monitor_handler, &data);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start device monitor: %m");
+
+        /* No callback is installed for the timer; -ETIMEDOUT is passed as userdata instead.
+         * NOTE(review): presumably sd-event then exits the loop with that value as exit code, which is how
+         * the timeout surfaces as an error from sd_event_loop() below; confirm against sd_event_add_time(3). */
+        if (timeout_usec != USEC_INFINITY) {
+                r = sd_event_add_time_relative(
+                                event, NULL,
+                                CLOCK_MONOTONIC, timeout_usec, 0,
+                                NULL, INT_TO_PTR(-ETIMEDOUT));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add timeout event source: %m");
+        }
+
+        /* Check again, maybe things changed. Udev will re-read the db if the device wasn't initialized yet. */
+        if (!device && devlink) {
+                r = sd_device_new_from_devname(&device, devlink);
+                if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r))
+                        return log_error_errno(r, "Failed to create sd-device object from %s: %m", devlink);
+        }
+        if (device && sd_device_get_is_initialized(device) > 0) {
+                if (ret)
+                        *ret = sd_device_ref(device);
+                return 0;
+        }
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to wait for device to be initialized: %m");
+
+        /* Hand the reference taken by the handler to the caller; if the caller does not want it, the
+         * cleanup handler of data drops it. */
+        if (ret)
+                *ret = TAKE_PTR(data.device);
+        return 0;
+}
+
+/* Waits for the given device object to become initialized; wrapper that passes no devlink. */
+int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout_usec, sd_device **ret) {
+        return device_wait_for_initialization_internal(device, NULL, subsystem, timeout_usec, ret);
+}
+
+/* Waits for a device identified by a devlink (or device node path) to become initialized; wrapper that
+ * passes no device object. */
+int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t timeout_usec, sd_device **ret) {
+        return device_wait_for_initialization_internal(NULL, devlink, subsystem, timeout_usec, ret);
+}
+
+/* Returns true if the device has an ID_RENAMING property, false if the lookup yields -ENOENT, and the
+ * negative error of sd_device_get_property_value() otherwise. */
+int device_is_renaming(sd_device *dev) {
+        int r;
+
+        assert(dev);
+
+        r = sd_device_get_property_value(dev, "ID_RENAMING", NULL);
+        if (r == -ENOENT)
+                return false;
+        if (r < 0)
+                return r;
+
+        return true;
+}
+
+bool device_for_action(sd_device *dev, sd_device_action_t a) {
+
sd_device_action_t b; + + assert(dev); + + if (a < 0) + return false; + + if (sd_device_get_action(dev, &b) < 0) + return false; + + return a == b; +} + +void log_device_uevent(sd_device *device, const char *str) { + sd_device_action_t action = _SD_DEVICE_ACTION_INVALID; + sd_id128_t event_id = SD_ID128_NULL; + uint64_t seqnum = 0; + + if (!DEBUG_LOGGING) + return; + + (void) sd_device_get_seqnum(device, &seqnum); + (void) sd_device_get_action(device, &action); + (void) sd_device_get_trigger_uuid(device, &event_id); + log_device_debug(device, "%s%s(SEQNUM=%"PRIu64", ACTION=%s%s%s)", + strempty(str), isempty(str) ? "" : " ", + seqnum, strna(device_action_to_string(action)), + sd_id128_is_null(event_id) ? "" : ", UUID=", + sd_id128_is_null(event_id) ? "" : SD_ID128_TO_UUID_STRING(event_id)); +} + +size_t udev_replace_whitespace(const char *str, char *to, size_t len) { + bool is_space = false; + size_t i, j; + + assert(str); + assert(to); + + /* Copy from 'str' to 'to', while removing all leading and trailing whitespace, and replacing + * each run of consecutive whitespace with a single underscore. The chars from 'str' are copied + * up to the \0 at the end of the string, or at most 'len' chars. This appends \0 to 'to', at + * the end of the copied characters. + * + * If 'len' chars are copied into 'to', the final \0 is placed at len+1 (i.e. 'to[len] = \0'), + * so the 'to' buffer must have at least len+1 chars available. + * + * Note this may be called with 'str' == 'to', i.e. to replace whitespace in-place in a buffer. + * This function can handle that situation. + * + * Note that only 'len' characters are read from 'str'. 
/* Sanitizes 'str' in place so that it only consists of acceptable characters.
 *
 * Left untouched are: characters accepted by allow_listed_char_for_devnode() for the given 'allow'
 * list, "\x" hex escape introducers, and valid multi-byte UTF-8 sequences. If 'allow' contains a
 * space, any other whitespace character is turned into an ordinary space. Everything else is
 * replaced by '_'.
 *
 * Returns the number of bytes that were overwritten (whitespace normalizations included). */
size_t udev_replace_chars(char *str, const char *allow) {
        size_t i = 0, replaced = 0;
        bool space_allowed;

        assert(str);

        /* Whether whitespace may be normalized to ' ' does not change while we iterate. */
        space_allowed = allow && strchr(allow, ' ');

        while (str[i] != '\0') {
                int len;

                if (allow_listed_char_for_devnode(str[i], allow)) {
                        i++;
                        continue;
                }

                /* accept hex encoding */
                if (str[i] == '\\' && str[i+1] == 'x') {
                        i += 2;
                        continue;
                }

                /* accept valid utf8 */
                len = utf8_encoded_valid_unichar(str + i, SIZE_MAX);
                if (len > 1) {
                        i += len;
                        continue;
                }

                /* If space is allowed, replace whitespace with ordinary space.
                 *
                 * Bytes >= 0x80 that are not part of a valid UTF-8 sequence end up here. Where
                 * plain 'char' is signed they are negative, and passing a negative value other
                 * than EOF to isspace() is undefined behaviour, hence the cast. */
                if (space_allowed && isspace((unsigned char) str[i])) {
                        str[i] = ' ';
                        i++;
                        replaced++;
                        continue;
                }

                /* everything else is replaced with '_' */
                str[i] = '_';
                i++;
                replaced++;
        }

        return replaced;
}
*/ + + if (cache >= 0) + return cache; + + return (cache = (path_is_read_only_fs("/sys/") <= 0)); +} + +int device_get_vendor_string(sd_device *device, const char **ret) { + int r; + + assert(device); + + FOREACH_STRING(field, "ID_VENDOR_FROM_DATABASE", "ID_VENDOR") { + r = sd_device_get_property_value(device, field, ret); + if (r != -ENOENT) + return r; + } + + return -ENOENT; +} + +int device_get_model_string(sd_device *device, const char **ret) { + int r; + + assert(device); + + FOREACH_STRING(field, "ID_MODEL_FROM_DATABASE", "ID_MODEL") { + r = sd_device_get_property_value(device, field, ret); + if (r != -ENOENT) + return r; + } + + return -ENOENT; +} + +int device_get_property_value_with_fallback( + sd_device *device, + const char *prop, + Hashmap *extra_props, + const char **ret) { + const char *value; + int r; + + assert(device); + assert(prop); + assert(ret); + + r = sd_device_get_property_value(device, prop, &value); + if (r < 0) { + if (r != -ENOENT) + return r; + + value = hashmap_get(extra_props, prop); + if (!value) + return -ENOENT; + } + + *ret = value; + + return 1; +} diff --git a/src/shared/udev-util.h b/src/shared/udev-util.h new file mode 100644 index 0000000..651d335 --- /dev/null +++ b/src/shared/udev-util.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-device.h" + +#include "hashmap.h" +#include "time-util.h" + +int udev_set_max_log_level(char *str); +int udev_parse_config(void); + +int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout_usec, sd_device **ret); +int device_wait_for_devlink(const char *path, const char *subsystem, usec_t timeout_usec, sd_device **ret); +int device_is_renaming(sd_device *dev); + +bool device_for_action(sd_device *dev, sd_device_action_t action); + +void log_device_uevent(sd_device *device, const char *str); + +size_t udev_replace_whitespace(const char *str, char *to, size_t len); +size_t udev_replace_chars(char *str, const 
char *allow); + +int udev_queue_is_empty(void); + +bool udev_available(void); + +int device_get_vendor_string(sd_device *device, const char **ret); +int device_get_model_string(sd_device *device, const char **ret); + +int device_get_property_value_with_fallback( + sd_device *device, + const char *prop, + Hashmap *extra_props, + const char **ret); diff --git a/src/shared/user-record-nss.c b/src/shared/user-record-nss.c new file mode 100644 index 0000000..414a493 --- /dev/null +++ b/src/shared/user-record-nss.c @@ -0,0 +1,529 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "errno-util.h" +#include "format-util.h" +#include "libcrypt-util.h" +#include "strv.h" +#include "user-record-nss.h" +#include "user-util.h" +#include "utf8.h" + +#define SET_IF(field, condition, value, fallback) \ + field = (condition) ? (value) : (fallback) + +static const char* utf8_only(const char *s) { + return s && utf8_is_valid(s) ? s : NULL; +} + +static int strv_extend_strv_utf8_only(char ***dst, char **src, bool filter_duplicates) { + _cleanup_free_ char **t = NULL; + size_t l, j = 0; + + /* First, do a shallow copy of s, filtering for only valid utf-8 strings */ + l = strv_length(src); + t = new(char*, l + 1); + if (!t) + return -ENOMEM; + + for (size_t i = 0; i < l; i++) + if (utf8_is_valid(src[i])) + t[j++] = src[i]; + if (j == 0) + return 0; + + t[j] = NULL; + return strv_extend_strv(dst, t, filter_duplicates); +} + +int nss_passwd_to_user_record( + const struct passwd *pwd, + const struct spwd *spwd, + UserRecord **ret) { + + _cleanup_(user_record_unrefp) UserRecord *hr = NULL; + int r; + + assert(pwd); + + if (isempty(pwd->pw_name)) + return -EINVAL; + + if (spwd && !streq_ptr(spwd->sp_namp, pwd->pw_name)) + return -EINVAL; + + hr = user_record_new(); + if (!hr) + return -ENOMEM; + + r = free_and_strdup(&hr->user_name, pwd->pw_name); + if (r < 0) + return r; + + /* Some bad NSS modules synthesize GECOS fields with embedded ":" or "\n" characters, which are not + * 
something we can output in /etc/passwd compatible format, since these are record separators + * there. We normally refuse that, but we need to maintain compatibility with arbitrary NSS modules, + * hence let's do what glibc does: mangle the data to fit the format. */ + if (isempty(pwd->pw_gecos) || streq_ptr(pwd->pw_gecos, hr->user_name)) + hr->real_name = mfree(hr->real_name); + else if (valid_gecos(pwd->pw_gecos)) { + r = free_and_strdup(&hr->real_name, pwd->pw_gecos); + if (r < 0) + return r; + } else { + _cleanup_free_ char *mangled = NULL; + + mangled = mangle_gecos(pwd->pw_gecos); + if (!mangled) + return -ENOMEM; + + free_and_replace(hr->real_name, mangled); + } + + r = free_and_strdup(&hr->home_directory, utf8_only(empty_to_null(pwd->pw_dir))); + if (r < 0) + return r; + + r = free_and_strdup(&hr->shell, utf8_only(empty_to_null(pwd->pw_shell))); + if (r < 0) + return r; + + hr->uid = pwd->pw_uid; + hr->gid = pwd->pw_gid; + + if (spwd && + looks_like_hashed_password(utf8_only(spwd->sp_pwdp))) { /* Ignore locked, disabled, and mojibake passwords */ + strv_free_erase(hr->hashed_password); + hr->hashed_password = strv_new(spwd->sp_pwdp); + if (!hr->hashed_password) + return -ENOMEM; + } else + hr->hashed_password = strv_free_erase(hr->hashed_password); + + /* shadow-utils suggests using "chage -E 0" (or -E 1, depending on which man page you check) + * for locking a whole account, hence check for that. Note that it also defines a way to lock + * just a password instead of the whole account, but that's mostly pointless in times of + * password-less authorization, hence let's not bother. 
*/ + + SET_IF(hr->locked, + spwd && spwd->sp_expire >= 0, + spwd->sp_expire <= 1, -1); + + SET_IF(hr->not_after_usec, + spwd && spwd->sp_expire > 1 && (uint64_t) spwd->sp_expire < (UINT64_MAX-1)/USEC_PER_DAY, + spwd->sp_expire * USEC_PER_DAY, UINT64_MAX); + + SET_IF(hr->password_change_now, + spwd && spwd->sp_lstchg >= 0, + spwd->sp_lstchg == 0, -1); + + SET_IF(hr->last_password_change_usec, + spwd && spwd->sp_lstchg > 0 && (uint64_t) spwd->sp_lstchg <= (UINT64_MAX-1)/USEC_PER_DAY, + spwd->sp_lstchg * USEC_PER_DAY, UINT64_MAX); + + SET_IF(hr->password_change_min_usec, + spwd && spwd->sp_min > 0 && (uint64_t) spwd->sp_min <= (UINT64_MAX-1)/USEC_PER_DAY, + spwd->sp_min * USEC_PER_DAY, UINT64_MAX); + + SET_IF(hr->password_change_max_usec, + spwd && spwd->sp_max > 0 && (uint64_t) spwd->sp_max <= (UINT64_MAX-1)/USEC_PER_DAY, + spwd->sp_max * USEC_PER_DAY, UINT64_MAX); + + SET_IF(hr->password_change_warn_usec, + spwd && spwd->sp_warn > 0 && (uint64_t) spwd->sp_warn <= (UINT64_MAX-1)/USEC_PER_DAY, + spwd->sp_warn * USEC_PER_DAY, UINT64_MAX); + + SET_IF(hr->password_change_inactive_usec, + spwd && spwd->sp_inact > 0 && (uint64_t) spwd->sp_inact <= (UINT64_MAX-1)/USEC_PER_DAY, + spwd->sp_inact * USEC_PER_DAY, UINT64_MAX); + + hr->json = json_variant_unref(hr->json); + r = json_build(&hr->json, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(hr->user_name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(hr->uid)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(hr->gid)), + JSON_BUILD_PAIR_CONDITION(hr->real_name, "realName", JSON_BUILD_STRING(hr->real_name)), + JSON_BUILD_PAIR_CONDITION(hr->home_directory, "homeDirectory", JSON_BUILD_STRING(hr->home_directory)), + JSON_BUILD_PAIR_CONDITION(hr->shell, "shell", JSON_BUILD_STRING(hr->shell)), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(hr->hashed_password), "privileged", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRV(hr->hashed_password)))), + JSON_BUILD_PAIR_CONDITION(hr->locked >= 0, 
"locked", JSON_BUILD_BOOLEAN(hr->locked)), + JSON_BUILD_PAIR_CONDITION(hr->not_after_usec != UINT64_MAX, "notAfterUSec", JSON_BUILD_UNSIGNED(hr->not_after_usec)), + JSON_BUILD_PAIR_CONDITION(hr->password_change_now >= 0, "passwordChangeNow", JSON_BUILD_BOOLEAN(hr->password_change_now)), + JSON_BUILD_PAIR_CONDITION(hr->last_password_change_usec != UINT64_MAX, "lastPasswordChangeUSec", JSON_BUILD_UNSIGNED(hr->last_password_change_usec)), + JSON_BUILD_PAIR_CONDITION(hr->password_change_min_usec != UINT64_MAX, "passwordChangeMinUSec", JSON_BUILD_UNSIGNED(hr->password_change_min_usec)), + JSON_BUILD_PAIR_CONDITION(hr->password_change_max_usec != UINT64_MAX, "passwordChangeMaxUSec", JSON_BUILD_UNSIGNED(hr->password_change_max_usec)), + JSON_BUILD_PAIR_CONDITION(hr->password_change_warn_usec != UINT64_MAX, "passwordChangeWarnUSec", JSON_BUILD_UNSIGNED(hr->password_change_warn_usec)), + JSON_BUILD_PAIR_CONDITION(hr->password_change_inactive_usec != UINT64_MAX, "passwordChangeInactiveUSec", JSON_BUILD_UNSIGNED(hr->password_change_inactive_usec)))); + + if (r < 0) + return r; + + hr->mask = USER_RECORD_REGULAR | + (!strv_isempty(hr->hashed_password) ? USER_RECORD_PRIVILEGED : 0); + + if (ret) + *ret = TAKE_PTR(hr); + return 0; +} + +int nss_spwd_for_passwd(const struct passwd *pwd, struct spwd *ret_spwd, char **ret_buffer) { + size_t buflen = 4096; + int r; + + assert(pwd); + assert(ret_spwd); + assert(ret_buffer); + + for (;;) { + _cleanup_free_ char *buf = NULL; + struct spwd spwd, *result; + + buf = malloc(buflen); + if (!buf) + return -ENOMEM; + + r = getspnam_r(pwd->pw_name, &spwd, buf, buflen, &result); + if (r == 0) { + if (!result) + return -ESRCH; + + *ret_spwd = *result; + *ret_buffer = TAKE_PTR(buf); + return 0; + } + if (r < 0) + return -EIO; /* Weird, this should not return negative! 
*/ + if (r != ERANGE) + return -r; + + if (buflen > SIZE_MAX / 2) + return -ERANGE; + + buflen *= 2; + buf = mfree(buf); + } +} + +int nss_user_record_by_name( + const char *name, + bool with_shadow, + UserRecord **ret) { + + _cleanup_free_ char *buf = NULL, *sbuf = NULL; + struct passwd pwd, *result; + bool incomplete = false; + size_t buflen = 4096; + struct spwd spwd, *sresult = NULL; + int r; + + assert(name); + + for (;;) { + buf = malloc(buflen); + if (!buf) + return -ENOMEM; + + r = getpwnam_r(name, &pwd, buf, buflen, &result); + if (r == 0) { + if (!result) + return -ESRCH; + + break; + } + + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getpwnam_r() returned a negative value"); + if (r != ERANGE) + return -r; + + if (buflen > SIZE_MAX / 2) + return -ERANGE; + + buflen *= 2; + buf = mfree(buf); + } + + if (with_shadow) { + r = nss_spwd_for_passwd(result, &spwd, &sbuf); + if (r < 0) { + log_debug_errno(r, "Failed to do shadow lookup for user %s, ignoring: %m", name); + incomplete = ERRNO_IS_PRIVILEGE(r); + } else + sresult = &spwd; + } else + incomplete = true; + + r = nss_passwd_to_user_record(result, sresult, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->incomplete = incomplete; + return 0; +} + +int nss_user_record_by_uid( + uid_t uid, + bool with_shadow, + UserRecord **ret) { + + _cleanup_free_ char *buf = NULL, *sbuf = NULL; + struct passwd pwd, *result; + bool incomplete = false; + size_t buflen = 4096; + struct spwd spwd, *sresult = NULL; + int r; + + for (;;) { + buf = malloc(buflen); + if (!buf) + return -ENOMEM; + + r = getpwuid_r(uid, &pwd, buf, buflen, &result); + if (r == 0) { + if (!result) + return -ESRCH; + + break; + } + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getpwuid_r() returned a negative value"); + if (r != ERANGE) + return -r; + + if (buflen > SIZE_MAX / 2) + return -ERANGE; + + buflen *= 2; + buf = mfree(buf); + } + + if (with_shadow) { + r = nss_spwd_for_passwd(result, &spwd, &sbuf); + if (r < 
0) { + log_debug_errno(r, "Failed to do shadow lookup for UID " UID_FMT ", ignoring: %m", uid); + incomplete = ERRNO_IS_PRIVILEGE(r); + } else + sresult = &spwd; + } else + incomplete = true; + + r = nss_passwd_to_user_record(result, sresult, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->incomplete = incomplete; + return 0; +} + +int nss_group_to_group_record( + const struct group *grp, + const struct sgrp *sgrp, + GroupRecord **ret) { + + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + int r; + + assert(grp); + + if (isempty(grp->gr_name)) + return -EINVAL; + + if (sgrp && !streq_ptr(sgrp->sg_namp, grp->gr_name)) + return -EINVAL; + + g = group_record_new(); + if (!g) + return -ENOMEM; + + g->group_name = strdup(grp->gr_name); + if (!g->group_name) + return -ENOMEM; + + r = strv_extend_strv_utf8_only(&g->members, grp->gr_mem, false); + if (r < 0) + return r; + + g->gid = grp->gr_gid; + + if (sgrp) { + if (looks_like_hashed_password(utf8_only(sgrp->sg_passwd))) { + g->hashed_password = strv_new(sgrp->sg_passwd); + if (!g->hashed_password) + return -ENOMEM; + } + + r = strv_extend_strv_utf8_only(&g->members, sgrp->sg_mem, true); + if (r < 0) + return r; + + r = strv_extend_strv_utf8_only(&g->administrators, sgrp->sg_adm, false); + if (r < 0) + return r; + } + + r = json_build(&g->json, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(g->group_name)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(g->gid)), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(g->members), "members", JSON_BUILD_STRV(g->members)), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(g->hashed_password), "privileged", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRV(g->hashed_password)))), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(g->administrators), "administrators", JSON_BUILD_STRV(g->administrators)))); + if (r < 0) + return r; + + g->mask = USER_RECORD_REGULAR | + (!strv_isempty(g->hashed_password) ? 
USER_RECORD_PRIVILEGED : 0); + + if (ret) + *ret = TAKE_PTR(g); + return 0; +} + +int nss_sgrp_for_group(const struct group *grp, struct sgrp *ret_sgrp, char **ret_buffer) { + size_t buflen = 4096; + int r; + + assert(grp); + assert(ret_sgrp); + assert(ret_buffer); + + for (;;) { + _cleanup_free_ char *buf = NULL; + struct sgrp sgrp, *result; + + buf = malloc(buflen); + if (!buf) + return -ENOMEM; + + r = getsgnam_r(grp->gr_name, &sgrp, buf, buflen, &result); + if (r == 0) { + if (!result) + return -ESRCH; + + *ret_sgrp = *result; + *ret_buffer = TAKE_PTR(buf); + return 0; + } + if (r < 0) + return -EIO; /* Weird, this should not return negative! */ + if (r != ERANGE) + return -r; + + if (buflen > SIZE_MAX / 2) + return -ERANGE; + + buflen *= 2; + buf = mfree(buf); + } +} + +int nss_group_record_by_name( + const char *name, + bool with_shadow, + GroupRecord **ret) { + + _cleanup_free_ char *buf = NULL, *sbuf = NULL; + struct group grp, *result; + bool incomplete = false; + size_t buflen = 4096; + struct sgrp sgrp, *sresult = NULL; + int r; + + assert(name); + + for (;;) { + buf = malloc(buflen); + if (!buf) + return -ENOMEM; + + r = getgrnam_r(name, &grp, buf, buflen, &result); + if (r == 0) { + if (!result) + return -ESRCH; + + break; + } + + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getgrnam_r() returned a negative value"); + if (r != ERANGE) + return -r; + if (buflen > SIZE_MAX / 2) + return -ERANGE; + + buflen *= 2; + buf = mfree(buf); + } + + if (with_shadow) { + r = nss_sgrp_for_group(result, &sgrp, &sbuf); + if (r < 0) { + log_debug_errno(r, "Failed to do shadow lookup for group %s, ignoring: %m", result->gr_name); + incomplete = ERRNO_IS_PRIVILEGE(r); + } else + sresult = &sgrp; + } else + incomplete = true; + + r = nss_group_to_group_record(result, sresult, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->incomplete = incomplete; + return 0; +} + +int nss_group_record_by_gid( + gid_t gid, + bool with_shadow, + GroupRecord **ret) { + + 
_cleanup_free_ char *buf = NULL, *sbuf = NULL; + struct group grp, *result; + bool incomplete = false; + size_t buflen = 4096; + struct sgrp sgrp, *sresult = NULL; + int r; + + for (;;) { + buf = malloc(buflen); + if (!buf) + return -ENOMEM; + + r = getgrgid_r(gid, &grp, buf, buflen, &result); + if (r == 0) { + if (!result) + return -ESRCH; + break; + } + + if (r < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getgrgid_r() returned a negative value"); + if (r != ERANGE) + return -r; + if (buflen > SIZE_MAX / 2) + return -ERANGE; + + buflen *= 2; + buf = mfree(buf); + } + + if (with_shadow) { + r = nss_sgrp_for_group(result, &sgrp, &sbuf); + if (r < 0) { + log_debug_errno(r, "Failed to do shadow lookup for group %s, ignoring: %m", result->gr_name); + incomplete = ERRNO_IS_PRIVILEGE(r); + } else + sresult = &sgrp; + } else + incomplete = true; + + r = nss_group_to_group_record(result, sresult, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->incomplete = incomplete; + return 0; +} diff --git a/src/shared/user-record-nss.h b/src/shared/user-record-nss.h new file mode 100644 index 0000000..22ab04d --- /dev/null +++ b/src/shared/user-record-nss.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include +#include + +#include "group-record.h" +#include "user-record.h" + +/* Synthesize UserRecord and GroupRecord objects from NSS data */ + +int nss_passwd_to_user_record(const struct passwd *pwd, const struct spwd *spwd, UserRecord **ret); +int nss_spwd_for_passwd(const struct passwd *pwd, struct spwd *ret_spwd, char **ret_buffer); + +int nss_user_record_by_name(const char *name, bool with_shadow, UserRecord **ret); +int nss_user_record_by_uid(uid_t uid, bool with_shadow, UserRecord **ret); + +int nss_group_to_group_record(const struct group *grp, const struct sgrp *sgrp, GroupRecord **ret); +int nss_sgrp_for_group(const struct group *grp, struct sgrp *ret_sgrp, char **ret_buffer); + +int 
nss_group_record_by_name(const char *name, bool with_shadow, GroupRecord **ret); +int nss_group_record_by_gid(gid_t gid, bool with_shadow, GroupRecord **ret); diff --git a/src/shared/user-record-show.c b/src/shared/user-record-show.c new file mode 100644 index 0000000..28fa7a8 --- /dev/null +++ b/src/shared/user-record-show.c @@ -0,0 +1,601 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "cap-list.h" +#include "format-util.h" +#include "fs-util.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "user-record-show.h" +#include "user-util.h" +#include "userdb.h" + +const char *user_record_state_color(const char *state) { + if (STR_IN_SET(state, "unfixated", "absent")) + return ansi_grey(); + else if (streq(state, "active")) + return ansi_highlight_green(); + else if (STR_IN_SET(state, "locked", "dirty")) + return ansi_highlight_yellow(); + + return NULL; +} + +void user_record_show(UserRecord *hr, bool show_full_group_info) { + const char *hd, *ip, *shell; + UserStorage storage; + usec_t t; + size_t k; + int r, b; + + printf(" User name: %s\n", + user_record_user_name_and_realm(hr)); + + if (hr->state) { + const char *color; + + color = user_record_state_color(hr->state); + + printf(" State: %s%s%s\n", + strempty(color), hr->state, color ? 
ansi_normal() : ""); + } + + printf(" Disposition: %s\n", user_disposition_to_string(user_record_disposition(hr))); + + if (hr->last_change_usec != USEC_INFINITY) { + printf(" Last Change: %s\n", FORMAT_TIMESTAMP(hr->last_change_usec)); + + if (hr->last_change_usec > now(CLOCK_REALTIME)) + printf(" %sModification time lies in the future, system clock wrong?%s\n", + ansi_highlight_yellow(), ansi_normal()); + } + + if (hr->last_password_change_usec != USEC_INFINITY && + hr->last_password_change_usec != hr->last_change_usec) + printf(" Last Passw.: %s\n", FORMAT_TIMESTAMP(hr->last_password_change_usec)); + + r = user_record_test_blocked(hr); + switch (r) { + + case -ENOLCK: + printf(" Login OK: %sno%s (record is locked)\n", ansi_highlight_red(), ansi_normal()); + break; + + case -EL2HLT: + printf(" Login OK: %sno%s (record not valid yet))\n", ansi_highlight_red(), ansi_normal()); + break; + + case -EL3HLT: + printf(" Login OK: %sno%s (record not valid anymore))\n", ansi_highlight_red(), ansi_normal()); + break; + + case -ESTALE: + default: { + usec_t y; + + if (r < 0 && r != -ESTALE) { + errno = -r; + printf(" Login OK: %sno%s (%m)\n", ansi_highlight_red(), ansi_normal()); + break; + } + + if (is_nologin_shell(user_record_shell(hr))) { + printf(" Login OK: %sno%s (nologin shell)\n", ansi_highlight_red(), ansi_normal()); + break; + } + + y = user_record_ratelimit_next_try(hr); + if (y != USEC_INFINITY && y > now(CLOCK_REALTIME)) { + printf(" Login OK: %sno%s (ratelimit)\n", ansi_highlight_red(), ansi_normal()); + break; + } + + printf(" Login OK: %syes%s\n", ansi_highlight_green(), ansi_normal()); + break; + }} + + r = user_record_test_password_change_required(hr); + switch (r) { + + case -EKEYREVOKED: + printf(" Password OK: %schange now%s\n", ansi_highlight_yellow(), ansi_normal()); + break; + + case -EOWNERDEAD: + printf(" Password OK: %sexpired%s (change now!)\n", ansi_highlight_yellow(), ansi_normal()); + break; + + case -EKEYREJECTED: + printf(" Password OK: 
%sexpired%s (for good)\n", ansi_highlight_red(), ansi_normal()); + break; + + case -EKEYEXPIRED: + printf(" Password OK: %sexpires soon%s\n", ansi_highlight_yellow(), ansi_normal()); + break; + + case -ENETDOWN: + printf(" Password OK: %sno timestamp%s\n", ansi_highlight_red(), ansi_normal()); + break; + + case -EROFS: + printf(" Password OK: %schange not permitted%s\n", ansi_highlight_yellow(), ansi_normal()); + break; + + case -ESTALE: + printf(" Password OK: %slast password change in future%s\n", ansi_highlight_yellow(), ansi_normal()); + break; + + default: + if (r < 0) { + errno = -r; + printf(" Password OK: %sno%s (%m)\n", ansi_highlight_yellow(), ansi_normal()); + break; + } + + if (strv_isempty(hr->hashed_password)) { + if (hr->incomplete) /* Record might be incomplete, due to privs */ + break; + printf(" Password OK: %sno%s (none set)\n", ansi_highlight(), ansi_normal()); + break; + } + if (strv_contains(hr->hashed_password, "")) { + printf(" Password OK: %sno%s (empty set)\n", ansi_highlight_red(), ansi_normal()); + break; + } + bool has_valid_passwords = false; + STRV_FOREACH(p, hr->hashed_password) + if (!hashed_password_is_locked_or_invalid(*p)) { + has_valid_passwords = true; + break; + } + if (has_valid_passwords) + printf(" Password OK: %syes%s\n", ansi_highlight_green(), ansi_normal()); + else + printf(" Password OK: %sno%s (locked)\n", ansi_highlight(), ansi_normal()); + } + if (uid_is_valid(hr->uid)) + printf(" UID: " UID_FMT "\n", hr->uid); + if (gid_is_valid(hr->gid)) { + if (show_full_group_info) { + _cleanup_(group_record_unrefp) GroupRecord *gr = NULL; + + r = groupdb_by_gid(hr->gid, 0, &gr); + if (r < 0) { + errno = -r; + printf(" GID: " GID_FMT " (unresolvable: %m)\n", hr->gid); + } else + printf(" GID: " GID_FMT " (%s)\n", hr->gid, gr->group_name); + } else + printf(" GID: " GID_FMT "\n", hr->gid); + } else if (uid_is_valid(hr->uid)) /* Show UID as GID if not separately configured */ + printf(" GID: " GID_FMT "\n", (gid_t) hr->uid); + + 
if (show_full_group_info) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + + r = membershipdb_by_user(hr->user_name, 0, &iterator); + if (r < 0) { + errno = -r; + printf(" Aux. Groups: (can't acquire: %m)\n"); + } else { + const char *prefix = " Aux. Groups:"; + + for (;;) { + _cleanup_free_ char *group = NULL; + + r = membershipdb_iterator_get(iterator, NULL, &group); + if (r == -ESRCH) + break; + if (r < 0) { + errno = -r; + printf("%s (can't iterate: %m)\n", prefix); + break; + } + + printf("%s %s\n", prefix, group); + prefix = " "; + } + } + } + + if (hr->real_name && !streq(hr->real_name, hr->user_name)) + printf(" Real Name: %s\n", hr->real_name); + + hd = user_record_home_directory(hr); + if (hd) + printf(" Directory: %s\n", hd); + + storage = user_record_storage(hr); + if (storage >= 0) /* Let's be political, and clarify which storage we like, and which we don't. About CIFS we don't complain. */ + printf(" Storage: %s%s\n", user_storage_to_string(storage), + storage == USER_LUKS ? " (strong encryption)" : + storage == USER_FSCRYPT ? " (weak encryption)" : + IN_SET(storage, USER_DIRECTORY, USER_SUBVOLUME) ? " (no encryption)" : ""); + + ip = user_record_image_path(hr); + if (ip && !streq_ptr(ip, hd)) + printf(" Image Path: %s\n", ip); + + b = user_record_removable(hr); + if (b >= 0) + printf(" Removable: %s\n", yes_no(b)); + + shell = user_record_shell(hr); + if (shell) + printf(" Shell: %s\n", shell); + + if (hr->email_address) + printf(" Email: %s\n", hr->email_address); + if (hr->location) + printf(" Location: %s\n", hr->location); + if (hr->password_hint) + printf(" Passw. Hint: %s\n", hr->password_hint); + if (hr->icon_name) + printf(" Icon Name: %s\n", hr->icon_name); + + if (hr->time_zone) + printf(" Time Zone: %s\n", hr->time_zone); + + if (hr->preferred_language) + printf(" Language: %s\n", hr->preferred_language); + + if (!strv_isempty(hr->environment)) + STRV_FOREACH(i, hr->environment) { + printf(i == hr->environment ? 
+ " Environment: %s\n" : + " %s\n", *i); + } + + if (hr->locked >= 0) + printf(" Locked: %s\n", yes_no(hr->locked)); + + if (hr->not_before_usec != UINT64_MAX) + printf(" Not Before: %s\n", FORMAT_TIMESTAMP(hr->not_before_usec)); + + if (hr->not_after_usec != UINT64_MAX) + printf(" Not After: %s\n", FORMAT_TIMESTAMP(hr->not_after_usec)); + + if (hr->umask != MODE_INVALID) + printf(" UMask: 0%03o\n", hr->umask); + + if (nice_is_valid(hr->nice_level)) + printf(" Nice: %i\n", hr->nice_level); + + for (int j = 0; j < _RLIMIT_MAX; j++) { + if (hr->rlimits[j]) + printf(" Limit: RLIMIT_%s=%" PRIu64 ":%" PRIu64 "\n", + rlimit_to_string(j), (uint64_t) hr->rlimits[j]->rlim_cur, (uint64_t) hr->rlimits[j]->rlim_max); + } + + if (hr->tasks_max != UINT64_MAX) + printf(" Tasks Max: %" PRIu64 "\n", hr->tasks_max); + + if (hr->memory_high != UINT64_MAX) + printf(" Memory High: %s\n", FORMAT_BYTES(hr->memory_high)); + + if (hr->memory_max != UINT64_MAX) + printf(" Memory Max: %s\n", FORMAT_BYTES(hr->memory_max)); + + if (hr->cpu_weight == CGROUP_WEIGHT_IDLE) + printf(" CPU Weight: %s\n", "idle"); + else if (hr->cpu_weight != UINT64_MAX) + printf(" CPU Weight: %" PRIu64 "\n", hr->cpu_weight); + + if (hr->io_weight != UINT64_MAX) + printf(" IO Weight: %" PRIu64 "\n", hr->io_weight); + + if (hr->access_mode != MODE_INVALID) + printf(" Access Mode: 0%03o\n", user_record_access_mode(hr)); + + uint64_t caps = user_record_capability_bounding_set(hr); + if (caps != UINT64_MAX) { + _cleanup_free_ char *scaps = NULL; + + (void) capability_set_to_string_negative(caps, &scaps); + printf(" Bound. 
Caps: %s\n", strna(scaps)); + } + + caps = user_record_capability_ambient_set(hr); + if (caps != UINT64_MAX) { + _cleanup_free_ char *scaps = NULL; + + (void) capability_set_to_string(caps, &scaps); + printf("Ambient Caps: %s\n", strna(scaps)); + } + + if (storage == USER_LUKS) { + printf("LUKS Discard: online=%s offline=%s\n", yes_no(user_record_luks_discard(hr)), yes_no(user_record_luks_offline_discard(hr))); + + if (!sd_id128_is_null(hr->luks_uuid)) + printf(" LUKS UUID: " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(hr->luks_uuid)); + if (!sd_id128_is_null(hr->partition_uuid)) + printf(" Part UUID: " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(hr->partition_uuid)); + if (!sd_id128_is_null(hr->file_system_uuid)) + printf(" FS UUID: " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(hr->file_system_uuid)); + + if (hr->file_system_type) + printf(" File System: %s\n", user_record_file_system_type(hr)); + + if (hr->luks_extra_mount_options) + printf("LUKS MntOpts: %s\n", hr->luks_extra_mount_options); + + if (hr->luks_cipher) + printf(" LUKS Cipher: %s\n", hr->luks_cipher); + if (hr->luks_cipher_mode) + printf(" Cipher Mode: %s\n", hr->luks_cipher_mode); + if (hr->luks_volume_key_size != UINT64_MAX) + printf(" Volume Key: %" PRIu64 "bit\n", hr->luks_volume_key_size * 8); + + if (hr->luks_pbkdf_type) + printf(" PBKDF Type: %s\n", hr->luks_pbkdf_type); + if (hr->luks_pbkdf_hash_algorithm) + printf(" PBKDF Hash: %s\n", hr->luks_pbkdf_hash_algorithm); + if (hr->luks_pbkdf_force_iterations != UINT64_MAX) + printf(" PBKDF Iters: %" PRIu64 "\n", hr->luks_pbkdf_force_iterations); + if (hr->luks_pbkdf_time_cost_usec != UINT64_MAX) + printf(" PBKDF Time: %s\n", FORMAT_TIMESPAN(hr->luks_pbkdf_time_cost_usec, 0)); + if (hr->luks_pbkdf_memory_cost != UINT64_MAX) + printf(" PBKDF Bytes: %s\n", FORMAT_BYTES(hr->luks_pbkdf_memory_cost)); + + if (hr->luks_pbkdf_parallel_threads != UINT64_MAX) + printf("PBKDF Thread: %" PRIu64 "\n", hr->luks_pbkdf_parallel_threads); + 
if (hr->luks_sector_size != UINT64_MAX) + printf(" Sector Size: %" PRIu64 "\n", hr->luks_sector_size); + + } else if (storage == USER_CIFS) { + + if (hr->cifs_service) + printf("CIFS Service: %s\n", hr->cifs_service); + + if (hr->cifs_extra_mount_options) + printf("CIFS MntOpts: %s\n", hr->cifs_extra_mount_options); + } + + if (hr->cifs_user_name) + printf(" CIFS User: %s\n", user_record_cifs_user_name(hr)); + if (hr->cifs_domain) + printf(" CIFS Domain: %s\n", hr->cifs_domain); + + if (storage != USER_CLASSIC) + printf(" Mount Flags: %s %s %s\n", + hr->nosuid ? "nosuid" : "suid", + hr->nodev ? "nodev" : "dev", + hr->noexec ? "noexec" : "exec"); + + if (hr->skeleton_directory) + printf(" Skel. Dir.: %s\n", user_record_skeleton_directory(hr)); + + if (hr->disk_size != UINT64_MAX) + printf(" Disk Size: %s\n", FORMAT_BYTES(hr->disk_size)); + + if (hr->disk_usage != UINT64_MAX) { + if (hr->disk_size != UINT64_MAX) { + unsigned permille; + + permille = (unsigned) DIV_ROUND_UP(hr->disk_usage * 1000U, hr->disk_size); /* Round up! */ + printf(" Disk Usage: %s (= %u.%01u%%)\n", + FORMAT_BYTES(hr->disk_usage), + permille / 10, permille % 10); + } else + printf(" Disk Usage: %s\n", FORMAT_BYTES(hr->disk_usage)); + } + + if (hr->disk_free != UINT64_MAX) { + if (hr->disk_size != UINT64_MAX) { + const char *color_on, *color_off; + unsigned permille; + + permille = (unsigned) ((hr->disk_free * 1000U) / hr->disk_size); /* Round down! */ + + /* Color the output red or yellow if we are below 10% resp. 25% free. 
Because 10% and + * 25% can be a lot of space still, let's additionally make some absolute + * restrictions: 1G and 2G */ + if (permille <= 100U && + hr->disk_free < 1024U*1024U*1024U /* 1G */) { + color_on = ansi_highlight_red(); + color_off = ansi_normal(); + } else if (permille <= 250U && + hr->disk_free < 2U*1024U*1024U*1024U /* 2G */) { + color_on = ansi_highlight_yellow(); + color_off = ansi_normal(); + } else + color_on = color_off = ""; + + printf(" Disk Free: %s%s (= %u.%01u%%)%s\n", + color_on, + FORMAT_BYTES(hr->disk_free), + permille / 10, permille % 10, + color_off); + } else + printf(" Disk Free: %s\n", FORMAT_BYTES(hr->disk_free)); + } + + if (hr->disk_floor != UINT64_MAX) + printf(" Disk Floor: %s\n", FORMAT_BYTES(hr->disk_floor)); + + if (hr->disk_ceiling != UINT64_MAX) + printf("Disk Ceiling: %s\n", FORMAT_BYTES(hr->disk_ceiling)); + + if (hr->good_authentication_counter != UINT64_MAX) + printf(" Good Auth.: %" PRIu64 "\n", hr->good_authentication_counter); + + if (hr->last_good_authentication_usec != UINT64_MAX) + printf(" Last Good: %s\n", FORMAT_TIMESTAMP(hr->last_good_authentication_usec)); + + if (hr->bad_authentication_counter != UINT64_MAX) + printf(" Bad Auth.: %" PRIu64 "\n", hr->bad_authentication_counter); + + if (hr->last_bad_authentication_usec != UINT64_MAX) + printf(" Last Bad: %s\n", FORMAT_TIMESTAMP(hr->last_bad_authentication_usec)); + + t = user_record_ratelimit_next_try(hr); + if (t != USEC_INFINITY) { + usec_t n = now(CLOCK_REALTIME); + + if (t <= n) + printf(" Next Try: anytime\n"); + else + printf(" Next Try: %sin %s%s\n", + ansi_highlight_red(), + FORMAT_TIMESPAN(t - n, USEC_PER_SEC), + ansi_normal()); + } + + if (storage != USER_CLASSIC) + printf(" Auth. 
Limit: %" PRIu64 " attempts per %s\n", user_record_ratelimit_burst(hr), + FORMAT_TIMESPAN(user_record_ratelimit_interval_usec(hr), 0)); + + if (hr->enforce_password_policy >= 0) + printf(" Passwd Pol.: %s\n", yes_no(hr->enforce_password_policy)); + + if (hr->password_change_min_usec != UINT64_MAX || + hr->password_change_max_usec != UINT64_MAX || + hr->password_change_warn_usec != UINT64_MAX || + hr->password_change_inactive_usec != UINT64_MAX) { + + printf(" Passwd Chg.:"); + + if (hr->password_change_min_usec != UINT64_MAX) { + printf(" min %s", FORMAT_TIMESPAN(hr->password_change_min_usec, 0)); + + if (hr->password_change_max_usec != UINT64_MAX) + printf(" …"); + } + + if (hr->password_change_max_usec != UINT64_MAX) + printf(" max %s", FORMAT_TIMESPAN(hr->password_change_max_usec, 0)); + + if (hr->password_change_warn_usec != UINT64_MAX) + printf("/warn %s", FORMAT_TIMESPAN(hr->password_change_warn_usec, 0)); + + if (hr->password_change_inactive_usec != UINT64_MAX) + printf("/inactive %s", FORMAT_TIMESPAN(hr->password_change_inactive_usec, 0)); + + printf("\n"); + } + + if (hr->password_change_now >= 0) + printf("Pas. Ch. Now: %s\n", yes_no(hr->password_change_now)); + + if (hr->drop_caches >= 0 || user_record_drop_caches(hr)) + printf(" Drop Caches: %s\n", yes_no(user_record_drop_caches(hr))); + + if (hr->auto_resize_mode >= 0) + printf(" Auto Resize: %s\n", auto_resize_mode_to_string(user_record_auto_resize_mode(hr))); + + if (hr->rebalance_weight != REBALANCE_WEIGHT_UNSET) { + uint64_t rb; + + rb = user_record_rebalance_weight(hr); + if (rb == REBALANCE_WEIGHT_OFF) + printf(" Rebalance: off\n"); + else + printf(" Rebalance: weight %" PRIu64 "\n", rb); + } + + if (!strv_isempty(hr->ssh_authorized_keys)) + printf("SSH Pub. Key: %zu\n", strv_length(hr->ssh_authorized_keys)); + + if (!strv_isempty(hr->pkcs11_token_uri)) + STRV_FOREACH(i, hr->pkcs11_token_uri) + printf(i == hr->pkcs11_token_uri ? 
+ "PKCS11 Token: %s\n" : + " %s\n", *i); + + if (hr->n_fido2_hmac_credential > 0) + printf(" FIDO2 Token: %zu\n", hr->n_fido2_hmac_credential); + + if (!strv_isempty(hr->recovery_key_type)) + printf("Recovery Key: %zu\n", strv_length(hr->recovery_key_type)); + + k = strv_length(hr->hashed_password); + if (k == 0) + printf(" Passwords: %snone%s\n", + user_record_disposition(hr) == USER_REGULAR ? ansi_highlight_yellow() : ansi_normal(), ansi_normal()); + else + printf(" Passwords: %zu\n", k); + + if (hr->signed_locally >= 0) + printf(" Local Sig.: %s\n", yes_no(hr->signed_locally)); + + if (hr->stop_delay_usec != UINT64_MAX) + printf(" Stop Delay: %s\n", FORMAT_TIMESPAN(hr->stop_delay_usec, 0)); + + if (hr->auto_login >= 0) + printf("Autom. Login: %s\n", yes_no(hr->auto_login)); + + if (hr->kill_processes >= 0) + printf(" Kill Proc.: %s\n", yes_no(hr->kill_processes)); + + if (hr->service) + printf(" Service: %s\n", hr->service); +} + +void group_record_show(GroupRecord *gr, bool show_full_user_info) { + int r; + + printf(" Group name: %s\n", + group_record_group_name_and_realm(gr)); + + printf(" Disposition: %s\n", user_disposition_to_string(group_record_disposition(gr))); + + if (gr->last_change_usec != USEC_INFINITY) + printf(" Last Change: %s\n", FORMAT_TIMESTAMP(gr->last_change_usec)); + + if (gid_is_valid(gr->gid)) + printf(" GID: " GID_FMT "\n", gr->gid); + + if (show_full_user_info) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + + r = membershipdb_by_group(gr->group_name, 0, &iterator); + if (r < 0) { + errno = -r; + printf(" Members: (can't acquire: %m)"); + } else { + const char *prefix = " Members:"; + + for (;;) { + _cleanup_free_ char *user = NULL; + + r = membershipdb_iterator_get(iterator, &user, NULL); + if (r == -ESRCH) + break; + if (r < 0) { + errno = -r; + printf("%s (can't iterate: %m\n", prefix); + break; + } + + printf("%s %s\n", prefix, user); + prefix = " "; + } + } + } else { + const char *prefix = " Members:"; + + 
STRV_FOREACH(i, gr->members) { + printf("%s %s\n", prefix, *i); + prefix = " "; + } + } + + if (!strv_isempty(gr->administrators)) { + const char *prefix = " Admins:"; + + STRV_FOREACH(i, gr->administrators) { + printf("%s %s\n", prefix, *i); + prefix = " "; + } + } + + if (gr->description && !streq(gr->description, gr->group_name)) + printf(" Description: %s\n", gr->description); + + if (!strv_isempty(gr->hashed_password)) + printf(" Passwords: %zu\n", strv_length(gr->hashed_password)); + + if (gr->service) + printf(" Service: %s\n", gr->service); +} diff --git a/src/shared/user-record-show.h b/src/shared/user-record-show.h new file mode 100644 index 0000000..dcef065 --- /dev/null +++ b/src/shared/user-record-show.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "user-record.h" +#include "group-record.h" + +const char *user_record_state_color(const char *state); + +void user_record_show(UserRecord *hr, bool show_full_group_info); +void group_record_show(GroupRecord *gr, bool show_full_user_info); diff --git a/src/shared/user-record.c b/src/shared/user-record.c new file mode 100644 index 0000000..3fe3e80 --- /dev/null +++ b/src/shared/user-record.c @@ -0,0 +1,2319 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "cap-list.h" +#include "cgroup-util.h" +#include "dns-domain.h" +#include "env-util.h" +#include "fs-util.h" +#include "glyph-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "memory-util.h" +#include "path-util.h" +#include "pkcs11-util.h" +#include "rlimit-util.h" +#include "string-table.h" +#include "strv.h" +#include "uid-alloc-range.h" +#include "user-record.h" +#include "user-util.h" + +#define DEFAULT_RATELIMIT_BURST 30 +#define DEFAULT_RATELIMIT_INTERVAL_USEC (1*USEC_PER_MINUTE) + +UserRecord* user_record_new(void) { + UserRecord *h; + + h = new(UserRecord, 1); + if (!h) + return NULL; + + *h = (UserRecord) { + .n_ref = 1, + .disposition = 
_USER_DISPOSITION_INVALID, + .last_change_usec = UINT64_MAX, + .last_password_change_usec = UINT64_MAX, + .umask = MODE_INVALID, + .nice_level = INT_MAX, + .not_before_usec = UINT64_MAX, + .not_after_usec = UINT64_MAX, + .locked = -1, + .storage = _USER_STORAGE_INVALID, + .access_mode = MODE_INVALID, + .disk_size = UINT64_MAX, + .disk_size_relative = UINT64_MAX, + .tasks_max = UINT64_MAX, + .memory_high = UINT64_MAX, + .memory_max = UINT64_MAX, + .cpu_weight = UINT64_MAX, + .io_weight = UINT64_MAX, + .uid = UID_INVALID, + .gid = GID_INVALID, + .nodev = true, + .nosuid = true, + .luks_discard = -1, + .luks_offline_discard = -1, + .luks_volume_key_size = UINT64_MAX, + .luks_pbkdf_force_iterations = UINT64_MAX, + .luks_pbkdf_time_cost_usec = UINT64_MAX, + .luks_pbkdf_memory_cost = UINT64_MAX, + .luks_pbkdf_parallel_threads = UINT64_MAX, + .luks_sector_size = UINT64_MAX, + .disk_usage = UINT64_MAX, + .disk_free = UINT64_MAX, + .disk_ceiling = UINT64_MAX, + .disk_floor = UINT64_MAX, + .signed_locally = -1, + .good_authentication_counter = UINT64_MAX, + .bad_authentication_counter = UINT64_MAX, + .last_good_authentication_usec = UINT64_MAX, + .last_bad_authentication_usec = UINT64_MAX, + .ratelimit_begin_usec = UINT64_MAX, + .ratelimit_count = UINT64_MAX, + .ratelimit_interval_usec = UINT64_MAX, + .ratelimit_burst = UINT64_MAX, + .removable = -1, + .enforce_password_policy = -1, + .auto_login = -1, + .stop_delay_usec = UINT64_MAX, + .kill_processes = -1, + .password_change_min_usec = UINT64_MAX, + .password_change_max_usec = UINT64_MAX, + .password_change_warn_usec = UINT64_MAX, + .password_change_inactive_usec = UINT64_MAX, + .password_change_now = -1, + .pkcs11_protected_authentication_path_permitted = -1, + .fido2_user_presence_permitted = -1, + .fido2_user_verification_permitted = -1, + .drop_caches = -1, + .auto_resize_mode = _AUTO_RESIZE_MODE_INVALID, + .rebalance_weight = REBALANCE_WEIGHT_UNSET, + }; + + return h; +} + +static void 
pkcs11_encrypted_key_done(Pkcs11EncryptedKey *k) { + if (!k) + return; + + free(k->uri); + erase_and_free(k->data); + erase_and_free(k->hashed_password); +} + +static void fido2_hmac_credential_done(Fido2HmacCredential *c) { + if (!c) + return; + + free(c->id); +} + +static void fido2_hmac_salt_done(Fido2HmacSalt *s) { + if (!s) + return; + + fido2_hmac_credential_done(&s->credential); + erase_and_free(s->salt); + erase_and_free(s->hashed_password); +} + +static void recovery_key_done(RecoveryKey *k) { + if (!k) + return; + + free(k->type); + erase_and_free(k->hashed_password); +} + +static UserRecord* user_record_free(UserRecord *h) { + if (!h) + return NULL; + + free(h->user_name); + free(h->realm); + free(h->user_name_and_realm_auto); + free(h->real_name); + free(h->email_address); + erase_and_free(h->password_hint); + free(h->location); + free(h->icon_name); + + free(h->shell); + + strv_free(h->environment); + free(h->time_zone); + free(h->preferred_language); + rlimit_free_all(h->rlimits); + + free(h->skeleton_directory); + + strv_free_erase(h->hashed_password); + strv_free_erase(h->ssh_authorized_keys); + strv_free_erase(h->password); + strv_free_erase(h->token_pin); + + free(h->cifs_service); + free(h->cifs_user_name); + free(h->cifs_domain); + free(h->cifs_extra_mount_options); + + free(h->image_path); + free(h->image_path_auto); + free(h->home_directory); + free(h->home_directory_auto); + + strv_free(h->member_of); + strv_free(h->capability_bounding_set); + strv_free(h->capability_ambient_set); + + free(h->file_system_type); + free(h->luks_cipher); + free(h->luks_cipher_mode); + free(h->luks_pbkdf_hash_algorithm); + free(h->luks_pbkdf_type); + free(h->luks_extra_mount_options); + + free(h->state); + free(h->service); + + strv_free(h->pkcs11_token_uri); + for (size_t i = 0; i < h->n_pkcs11_encrypted_key; i++) + pkcs11_encrypted_key_done(h->pkcs11_encrypted_key + i); + free(h->pkcs11_encrypted_key); + + for (size_t i = 0; i < h->n_fido2_hmac_credential; i++) 
+ fido2_hmac_credential_done(h->fido2_hmac_credential + i); + for (size_t i = 0; i < h->n_fido2_hmac_salt; i++) + fido2_hmac_salt_done(h->fido2_hmac_salt + i); + + strv_free(h->recovery_key_type); + for (size_t i = 0; i < h->n_recovery_key; i++) + recovery_key_done(h->recovery_key + i); + + json_variant_unref(h->json); + + return mfree(h); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(UserRecord, user_record, user_record_free); + +int json_dispatch_realm(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + char **s = userdata; + const char *n; + int r; + + if (json_variant_is_null(variant)) { + *s = mfree(*s); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + n = json_variant_string(variant); + r = dns_name_is_valid(n); + if (r < 0) + return json_log(variant, flags, r, "Failed to check if JSON field '%s' is a valid DNS domain.", strna(name)); + if (r == 0) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid DNS domain.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +int json_dispatch_gecos(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + char **s = userdata; + const char *n; + + if (json_variant_is_null(variant)) { + *s = mfree(*s); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + n = json_variant_string(variant); + if (valid_gecos(n)) { + if (free_and_strdup(s, n) < 0) + return json_log_oom(variant, flags); + } else { + _cleanup_free_ char *m = NULL; + + json_log(variant, flags|JSON_DEBUG, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid GECOS compatible string, mangling.", strna(name)); + + m = mangle_gecos(n); + if (!m) 
+ return json_log_oom(variant, flags); + + free_and_replace(*s, m); + } + + return 0; +} + +static int json_dispatch_nice(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + int *nl = userdata; + int64_t m; + + if (json_variant_is_null(variant)) { + *nl = INT_MAX; + return 0; + } + + if (!json_variant_is_integer(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + m = json_variant_integer(variant); + if (m < PRIO_MIN || m >= PRIO_MAX) + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' is not a valid nice level.", strna(name)); + + *nl = m; + return 0; +} + +static int json_dispatch_rlimit_value(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + rlim_t *ret = userdata; + + if (json_variant_is_null(variant)) + *ret = RLIM_INFINITY; + else if (json_variant_is_unsigned(variant)) { + uint64_t w; + + w = json_variant_unsigned(variant); + if (w == RLIM_INFINITY || (uint64_t) w != json_variant_unsigned(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "Resource limit value '%s' is out of range.", name); + + *ret = (rlim_t) w; + } else + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit value '%s' is not an unsigned integer.", name); + + return 0; +} + +static int json_dispatch_rlimits(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + struct rlimit** limits = userdata; + JsonVariant *value; + const char *key; + int r; + + assert_se(limits); + + if (json_variant_is_null(variant)) { + rlimit_free_all(limits); + return 0; + } + + if (!json_variant_is_object(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name)); + + JSON_VARIANT_OBJECT_FOREACH(key, value, variant) { + JsonVariant *jcur, *jmax; + struct rlimit rl; + const char *p; + int l; + + p = startswith(key, "RLIMIT_"); + if 
(!p) + l = -SYNTHETIC_ERRNO(EINVAL); + else + l = rlimit_from_string(p); + if (l < 0) + return json_log(variant, flags, l, "Resource limit '%s' not known.", key); + + if (!json_variant_is_object(value)) + return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' has invalid value.", key); + + if (json_variant_elements(value) != 4) + return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' value is does not have two fields as expected.", key); + + jcur = json_variant_by_key(value, "cur"); + if (!jcur) + return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' lacks 'cur' field.", key); + r = json_dispatch_rlimit_value("cur", jcur, flags, &rl.rlim_cur); + if (r < 0) + return r; + + jmax = json_variant_by_key(value, "max"); + if (!jmax) + return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' lacks 'max' field.", key); + r = json_dispatch_rlimit_value("max", jmax, flags, &rl.rlim_max); + if (r < 0) + return r; + + if (limits[l]) + *(limits[l]) = rl; + else { + limits[l] = newdup(struct rlimit, &rl, 1); + if (!limits[l]) + return log_oom(); + } + } + + return 0; +} + +static int json_dispatch_filename_or_path(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + char **s = ASSERT_PTR(userdata); + const char *n; + int r; + + if (json_variant_is_null(variant)) { + *s = mfree(*s); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + n = json_variant_string(variant); + if (!filename_is_valid(n) && !path_is_normalized(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid file name or normalized path.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +static int json_dispatch_path(const char *name, JsonVariant 
*variant, JsonDispatchFlags flags, void *userdata) { + char **s = userdata; + const char *n; + int r; + + if (json_variant_is_null(variant)) { + *s = mfree(*s); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + n = json_variant_string(variant); + if (!path_is_normalized(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a normalized file system path.", strna(name)); + if (!path_is_absolute(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an absolute file system path.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +static int json_dispatch_home_directory(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + char **s = userdata; + const char *n; + int r; + + if (json_variant_is_null(variant)) { + *s = mfree(*s); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + n = json_variant_string(variant); + if (!valid_home(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid home directory path.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +static int json_dispatch_image_path(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + char **s = userdata; + const char *n; + int r; + + if (json_variant_is_null(variant)) { + *s = mfree(*s); + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + n = json_variant_string(variant); + if (empty_or_root(n) || 
!path_is_valid(n) || !path_is_absolute(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid image path.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +static int json_dispatch_umask(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + mode_t *m = userdata; + uint64_t k; + + if (json_variant_is_null(variant)) { + *m = MODE_INVALID; + return 0; + } + + if (!json_variant_is_unsigned(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a number.", strna(name)); + + k = json_variant_unsigned(variant); + if (k > 0777) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), + "JSON field '%s' outside of valid range 0%s0777.", + strna(name), special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + *m = (mode_t) k; + return 0; +} + +static int json_dispatch_access_mode(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + mode_t *m = userdata; + uint64_t k; + + if (json_variant_is_null(variant)) { + *m = MODE_INVALID; + return 0; + } + + if (!json_variant_is_unsigned(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a number.", strna(name)); + + k = json_variant_unsigned(variant); + if (k > 07777) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), + "JSON field '%s' outside of valid range 0%s07777.", + strna(name), special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + *m = (mode_t) k; + return 0; +} + +static int json_dispatch_environment(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **n = NULL; + char ***l = userdata; + int r; + + if (json_variant_is_null(variant)) { + *l = strv_free(*l); + return 0; + } + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an 
array.", strna(name)); + + for (size_t i = 0; i < json_variant_elements(variant); i++) { + JsonVariant *e; + const char *a; + + e = json_variant_by_index(variant, i); + if (!json_variant_is_string(e)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of strings.", strna(name)); + + assert_se(a = json_variant_string(e)); + + if (!env_assignment_is_valid(a)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of environment variables.", strna(name)); + + r = strv_env_replace_strdup(&n, a); + if (r < 0) + return json_log_oom(variant, flags); + } + + return strv_free_and_replace(*l, n); +} + +int json_dispatch_user_disposition(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + UserDisposition *disposition = userdata, k; + + if (json_variant_is_null(variant)) { + *disposition = _USER_DISPOSITION_INVALID; + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + k = user_disposition_from_string(json_variant_string(variant)); + if (k < 0) + return json_log(variant, flags, k, "Disposition type '%s' not known.", json_variant_string(variant)); + + *disposition = k; + return 0; +} + +static int json_dispatch_storage(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + UserStorage *storage = userdata, k; + + if (json_variant_is_null(variant)) { + *storage = _USER_STORAGE_INVALID; + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + k = user_storage_from_string(json_variant_string(variant)); + if (k < 0) + return json_log(variant, flags, k, "Storage type '%s' not known.", json_variant_string(variant)); + + *storage = k; + return 0; +} + +static int json_dispatch_tasks_or_memory_max(const char *name, 
JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + uint64_t *limit = userdata, k; + + if (json_variant_is_null(variant)) { + *limit = UINT64_MAX; + return 0; + } + + if (!json_variant_is_unsigned(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name)); + + k = json_variant_unsigned(variant); + if (k <= 0 || k >= UINT64_MAX) + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "JSON field '%s' is not in valid range %" PRIu64 "%s%" PRIu64 ".", + strna(name), (uint64_t) 1, special_glyph(SPECIAL_GLYPH_ELLIPSIS), UINT64_MAX-1); + + *limit = k; + return 0; +} + +static int json_dispatch_weight(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + uint64_t *weight = userdata, k; + + if (json_variant_is_null(variant)) { + *weight = UINT64_MAX; + return 0; + } + + if (!json_variant_is_unsigned(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name)); + + k = json_variant_unsigned(variant); + if (k <= CGROUP_WEIGHT_MIN || k >= CGROUP_WEIGHT_MAX) + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "JSON field '%s' is not in valid range %" PRIu64 "%s%" PRIu64 ".", + strna(name), (uint64_t) CGROUP_WEIGHT_MIN, + special_glyph(SPECIAL_GLYPH_ELLIPSIS), (uint64_t) CGROUP_WEIGHT_MAX); + + *weight = k; + return 0; +} + +int json_dispatch_user_group_list(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + char ***list = userdata; + JsonVariant *e; + int r; + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of strings.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string."); + + if 
/* JSON dispatch callback that parses an object of secret fields. It hands 'variant' to json_dispatch()
 * together with the table below, which routes each recognized key into the matching UserRecord member;
 * 'userdata' is forwarded unchanged as the dispatch target and 'name' is not used. Returns whatever
 * json_dispatch() returns. */
static int dispatch_secret(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {

        /* "tokenPin" and "pkcs11Pin" both write to the same token_pin member. */
        static const JsonDispatch secret_dispatch_table[] = {
                { "password", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, password), 0 },
                { "tokenPin", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, token_pin), 0 },
                { "pkcs11Pin", /* legacy alias */ _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, token_pin), 0 },
                { "pkcs11ProtectedAuthenticationPathPermitted", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, pkcs11_protected_authentication_path_permitted), 0 },
                { "fido2UserPresencePermitted", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, fido2_user_presence_permitted), 0 },
                { "fido2UserVerificationPermitted", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, fido2_user_verification_permitted), 0 },
                {},
        };

        return json_dispatch(variant, secret_dispatch_table, flags, userdata);
}
(!pkcs11_uri_valid(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid RFC7512 PKCS#11 URI.", strna(name)); + + r = free_and_strdup(s, n); + if (r < 0) + return json_log(variant, flags, r, "Failed to allocate string: %m"); + + return 0; +} + +static int dispatch_pkcs11_uri_array(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **z = NULL; + char ***l = userdata; + JsonVariant *e; + int r; + + if (json_variant_is_null(variant)) { + *l = strv_free(*l); + return 0; + } + + if (json_variant_is_string(variant)) { + const char *n; + + n = json_variant_string(variant); + if (!pkcs11_uri_valid(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid RFC7512 PKCS#11 URI.", strna(name)); + + z = strv_new(n); + if (!z) + return log_oom(); + + } else { + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string or array of strings.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + const char *n; + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string."); + + n = json_variant_string(e); + if (!pkcs11_uri_valid(n)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element in '%s' is not a valid RFC7512 PKCS#11 URI: %s", strna(name), n); + + r = strv_extend(&z, n); + if (r < 0) + return log_oom(); + } + } + + strv_free_and_replace(*l, z); + return 0; +} + +static int dispatch_pkcs11_key_data(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + Pkcs11EncryptedKey *k = userdata; + size_t l; + void *b; + int r; + + if (json_variant_is_null(variant)) { + k->data = erase_and_free(k->data); + k->size = 0; + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is 
not a string.", strna(name)); + + r = unbase64mem(json_variant_string(variant), SIZE_MAX, &b, &l); + if (r < 0) + return json_log(variant, flags, r, "Failed to decode encrypted PKCS#11 key: %m"); + + erase_and_free(k->data); + k->data = b; + k->size = l; + + return 0; +} + +static int dispatch_pkcs11_key(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + UserRecord *h = userdata; + JsonVariant *e; + int r; + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + Pkcs11EncryptedKey *array, *k; + + static const JsonDispatch pkcs11_key_dispatch_table[] = { + { "uri", JSON_VARIANT_STRING, dispatch_pkcs11_uri, offsetof(Pkcs11EncryptedKey, uri), JSON_MANDATORY }, + { "data", JSON_VARIANT_STRING, dispatch_pkcs11_key_data, 0, JSON_MANDATORY }, + { "hashedPassword", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Pkcs11EncryptedKey, hashed_password), JSON_MANDATORY }, + {}, + }; + + if (!json_variant_is_object(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an object."); + + array = reallocarray(h->pkcs11_encrypted_key, h->n_pkcs11_encrypted_key + 1, sizeof(Pkcs11EncryptedKey)); + if (!array) + return log_oom(); + + h->pkcs11_encrypted_key = array; + k = h->pkcs11_encrypted_key + h->n_pkcs11_encrypted_key; + *k = (Pkcs11EncryptedKey) {}; + + r = json_dispatch(e, pkcs11_key_dispatch_table, flags, k); + if (r < 0) { + pkcs11_encrypted_key_done(k); + return r; + } + + h->n_pkcs11_encrypted_key++; + } + + return 0; +} + +static int dispatch_fido2_hmac_credential(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + Fido2HmacCredential *k = userdata; + size_t l; + void *b; + int r; + + if (json_variant_is_null(variant)) { + k->id = mfree(k->id); + k->size = 0; + return 0; + } + + if (!json_variant_is_string(variant)) + return 
json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + r = unbase64mem(json_variant_string(variant), SIZE_MAX, &b, &l); + if (r < 0) + return json_log(variant, flags, r, "Failed to decode FIDO2 credential ID: %m"); + + free_and_replace(k->id, b); + k->size = l; + + return 0; +} + +static int dispatch_fido2_hmac_credential_array(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + UserRecord *h = userdata; + JsonVariant *e; + int r; + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of strings.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + Fido2HmacCredential *array; + size_t l; + void *b; + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string."); + + array = reallocarray(h->fido2_hmac_credential, h->n_fido2_hmac_credential + 1, sizeof(Fido2HmacCredential)); + if (!array) + return log_oom(); + + r = unbase64mem(json_variant_string(e), SIZE_MAX, &b, &l); + if (r < 0) + return json_log(variant, flags, r, "Failed to decode FIDO2 credential ID: %m"); + + h->fido2_hmac_credential = array; + + h->fido2_hmac_credential[h->n_fido2_hmac_credential++] = (Fido2HmacCredential) { + .id = b, + .size = l, + }; + } + + return 0; +} + +static int dispatch_fido2_hmac_salt_value(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + Fido2HmacSalt *k = userdata; + size_t l; + void *b; + int r; + + if (json_variant_is_null(variant)) { + k->salt = erase_and_free(k->salt); + k->salt_size = 0; + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name)); + + r = unbase64mem(json_variant_string(variant), SIZE_MAX, &b, &l); + if (r < 0) + return json_log(variant, flags, r, "Failed to decode FIDO2 salt: %m"); + + 
erase_and_free(k->salt); + k->salt = b; + k->salt_size = l; + + return 0; +} + +static int dispatch_fido2_hmac_salt(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + UserRecord *h = userdata; + JsonVariant *e; + int r; + + if (!json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + Fido2HmacSalt *array, *k; + + static const JsonDispatch fido2_hmac_salt_dispatch_table[] = { + { "credential", JSON_VARIANT_STRING, dispatch_fido2_hmac_credential, offsetof(Fido2HmacSalt, credential), JSON_MANDATORY }, + { "salt", JSON_VARIANT_STRING, dispatch_fido2_hmac_salt_value, 0, JSON_MANDATORY }, + { "hashedPassword", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Fido2HmacSalt, hashed_password), JSON_MANDATORY }, + { "up", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Fido2HmacSalt, up), 0 }, + { "uv", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Fido2HmacSalt, uv), 0 }, + { "clientPin", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Fido2HmacSalt, client_pin), 0 }, + {}, + }; + + if (!json_variant_is_object(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an object."); + + array = reallocarray(h->fido2_hmac_salt, h->n_fido2_hmac_salt + 1, sizeof(Fido2HmacSalt)); + if (!array) + return log_oom(); + + h->fido2_hmac_salt = array; + k = h->fido2_hmac_salt + h->n_fido2_hmac_salt; + *k = (Fido2HmacSalt) { + .uv = -1, + .up = -1, + .client_pin = -1, + }; + + r = json_dispatch(e, fido2_hmac_salt_dispatch_table, flags, k); + if (r < 0) { + fido2_hmac_salt_done(k); + return r; + } + + h->n_fido2_hmac_salt++; + } + + return 0; +} + +static int dispatch_recovery_key(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + UserRecord *h = userdata; + JsonVariant *e; + int r; + + if (!json_variant_is_array(variant)) + 
return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name)); + + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + RecoveryKey *array, *k; + + static const JsonDispatch recovery_key_dispatch_table[] = { + { "type", JSON_VARIANT_STRING, json_dispatch_string, 0, JSON_MANDATORY }, + { "hashedPassword", JSON_VARIANT_STRING, json_dispatch_string, offsetof(RecoveryKey, hashed_password), JSON_MANDATORY }, + {}, + }; + + if (!json_variant_is_object(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an object."); + + array = reallocarray(h->recovery_key, h->n_recovery_key + 1, sizeof(RecoveryKey)); + if (!array) + return log_oom(); + + h->recovery_key = array; + k = h->recovery_key + h->n_recovery_key; + *k = (RecoveryKey) {}; + + r = json_dispatch(e, recovery_key_dispatch_table, flags, k); + if (r < 0) { + recovery_key_done(k); + return r; + } + + h->n_recovery_key++; + } + + return 0; +} + +static int dispatch_auto_resize_mode(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + AutoResizeMode *mode = userdata, m; + + assert_se(mode); + + if (json_variant_is_null(variant)) { + *mode = _AUTO_RESIZE_MODE_INVALID; + return 0; + } + + if (json_variant_is_boolean(variant)) { + *mode = json_variant_boolean(variant) ? 
AUTO_RESIZE_SHRINK_AND_GROW : AUTO_RESIZE_OFF; + return 0; + } + + if (!json_variant_is_string(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string, boolean or null.", strna(name)); + + m = auto_resize_mode_from_string(json_variant_string(variant)); + if (m < 0) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid automatic resize mode.", strna(name)); + + *mode = m; + return 0; +} + +static int dispatch_rebalance_weight(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + uint64_t *rebalance_weight = userdata; + uintmax_t u; + + assert_se(rebalance_weight); + + if (json_variant_is_null(variant)) { + *rebalance_weight = REBALANCE_WEIGHT_UNSET; + return 0; + } + + if (json_variant_is_boolean(variant)) { + *rebalance_weight = json_variant_boolean(variant) ? REBALANCE_WEIGHT_DEFAULT : REBALANCE_WEIGHT_OFF; + return 0; + } + + if (!json_variant_is_unsigned(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an unsigned integer, boolean or null.", strna(name)); + + u = json_variant_unsigned(variant); + if (u >= REBALANCE_WEIGHT_MIN && u <= REBALANCE_WEIGHT_MAX) + *rebalance_weight = (uint64_t) u; + else if (u == 0) + *rebalance_weight = REBALANCE_WEIGHT_OFF; + else + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "Rebalance weight is out of valid range %" PRIu64 "%s%" PRIu64 ".", + REBALANCE_WEIGHT_MIN, special_glyph(SPECIAL_GLYPH_ELLIPSIS), REBALANCE_WEIGHT_MAX); + + return 0; +} + +static int dispatch_privileged(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch privileged_dispatch_table[] = { + { "passwordHint", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, password_hint), 0 }, + { "hashedPassword", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, hashed_password), JSON_SAFE }, + { 
"sshAuthorizedKeys", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, ssh_authorized_keys), 0 }, + { "pkcs11EncryptedKey", JSON_VARIANT_ARRAY, dispatch_pkcs11_key, 0, 0 }, + { "fido2HmacSalt", JSON_VARIANT_ARRAY, dispatch_fido2_hmac_salt, 0, 0 }, + { "recoveryKey", JSON_VARIANT_ARRAY, dispatch_recovery_key, 0, 0 }, + {}, + }; + + return json_dispatch(variant, privileged_dispatch_table, flags, userdata); +} + +static int dispatch_binding(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch binding_dispatch_table[] = { + { "imagePath", JSON_VARIANT_STRING, json_dispatch_image_path, offsetof(UserRecord, image_path), 0 }, + { "homeDirectory", JSON_VARIANT_STRING, json_dispatch_home_directory, offsetof(UserRecord, home_directory), 0 }, + { "partitionUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, partition_uuid), 0 }, + { "luksUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, luks_uuid), 0 }, + { "fileSystemUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, file_system_uuid), 0 }, + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, uid), 0 }, + { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, gid), 0 }, + { "storage", JSON_VARIANT_STRING, json_dispatch_storage, offsetof(UserRecord, storage), 0 }, + { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE }, + { "luksCipher", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher), JSON_SAFE }, + { "luksCipherMode", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher_mode), JSON_SAFE }, + { "luksVolumeKeySize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_volume_key_size), 0 }, + {}, + }; + + JsonVariant *m; + sd_id128_t mid; + int r; + + if (!variant) + return 0; + + if 
(!json_variant_is_object(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name)); + + r = sd_id128_get_machine(&mid); + if (r < 0) + return json_log(variant, flags, r, "Failed to determine machine ID: %m"); + + m = json_variant_by_key(variant, SD_ID128_TO_STRING(mid)); + if (!m) + return 0; + + return json_dispatch(m, binding_dispatch_table, flags, userdata); +} + +int per_machine_id_match(JsonVariant *ids, JsonDispatchFlags flags) { + sd_id128_t mid; + int r; + + r = sd_id128_get_machine(&mid); + if (r < 0) + return json_log(ids, flags, r, "Failed to acquire machine ID: %m"); + + if (json_variant_is_string(ids)) { + sd_id128_t k; + + r = sd_id128_from_string(json_variant_string(ids), &k); + if (r < 0) { + json_log(ids, flags, r, "%s is not a valid machine ID, ignoring: %m", json_variant_string(ids)); + return 0; + } + + return sd_id128_equal(mid, k); + } + + if (json_variant_is_array(ids)) { + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, ids) { + sd_id128_t k; + + if (!json_variant_is_string(e)) { + json_log(e, flags, 0, "Machine ID is not a string, ignoring: %m"); + continue; + } + + r = sd_id128_from_string(json_variant_string(e), &k); + if (r < 0) { + json_log(e, flags, r, "%s is not a valid machine ID, ignoring: %m", json_variant_string(e)); + continue; + } + + if (sd_id128_equal(mid, k)) + return true; + } + + return false; + } + + json_log(ids, flags, 0, "Machine ID is not a string or array of strings, ignoring: %m"); + return false; +} + +int per_machine_hostname_match(JsonVariant *hns, JsonDispatchFlags flags) { + _cleanup_free_ char *hn = NULL; + int r; + + r = gethostname_strict(&hn); + if (r == -ENXIO) { + json_log(hns, flags, r, "No hostname set, not matching perMachine hostname record: %m"); + return false; + } + if (r < 0) + return json_log(hns, flags, r, "Failed to acquire hostname: %m"); + + if (json_variant_is_string(hns)) + return streq(json_variant_string(hns), hn); + + if 
(json_variant_is_array(hns)) { + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, hns) { + + if (!json_variant_is_string(e)) { + json_log(e, flags, 0, "Hostname is not a string, ignoring: %m"); + continue; + } + + if (streq(json_variant_string(hns), hn)) + return true; + } + + return false; + } + + json_log(hns, flags, 0, "Hostname is not a string or array of strings, ignoring: %m"); + return false; +} + +static int dispatch_per_machine(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch per_machine_dispatch_table[] = { + { "matchMachineId", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 }, + { "matchHostname", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 }, + { "iconName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, icon_name), JSON_SAFE }, + { "location", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, location), 0 }, + { "shell", JSON_VARIANT_STRING, json_dispatch_filename_or_path, offsetof(UserRecord, shell), 0 }, + { "umask", JSON_VARIANT_UNSIGNED, json_dispatch_umask, offsetof(UserRecord, umask), 0 }, + { "environment", JSON_VARIANT_ARRAY, json_dispatch_environment, offsetof(UserRecord, environment), 0 }, + { "timeZone", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, time_zone), JSON_SAFE }, + { "preferredLanguage", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, preferred_language), JSON_SAFE }, + { "niceLevel", _JSON_VARIANT_TYPE_INVALID, json_dispatch_nice, offsetof(UserRecord, nice_level), 0 }, + { "resourceLimits", _JSON_VARIANT_TYPE_INVALID, json_dispatch_rlimits, offsetof(UserRecord, rlimits), 0 }, + { "locked", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, locked), 0 }, + { "notBeforeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_before_usec), 0 }, + { "notAfterUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_after_usec), 0 }, + { "storage", 
JSON_VARIANT_STRING, json_dispatch_storage, offsetof(UserRecord, storage), 0 },
                { "diskSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size), 0 },
                { "diskSizeRelative", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size_relative), 0 },
                { "skeletonDirectory", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, skeleton_directory), 0 },
                { "accessMode", JSON_VARIANT_UNSIGNED, json_dispatch_access_mode, offsetof(UserRecord, access_mode), 0 },
                { "tasksMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, tasks_max), 0 },
                { "memoryHigh", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_high), 0 },
                { "memoryMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_max), 0 },
                { "cpuWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, cpu_weight), 0 },
                { "ioWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, io_weight), 0 },
                { "mountNoDevices", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nodev), 0 },
                { "mountNoSuid", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nosuid), 0 },
                { "mountNoExecute", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, noexec), 0 },
                { "cifsDomain", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_domain), JSON_SAFE },
                { "cifsUserName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_user_name), JSON_SAFE },
                { "cifsService", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_service), JSON_SAFE },
                { "cifsExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_extra_mount_options), 0 },
                { "imagePath", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, image_path), 0 },
                { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, uid), 0 },
                { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, gid), 0 },
                { "memberOf", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(UserRecord, member_of), JSON_RELAX },
                { "capabilityBoundingSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_bounding_set), JSON_SAFE },
                { "capabilityAmbientSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_ambient_set), JSON_SAFE },
                { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE },
                { "partitionUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, partition_uuid), 0 },
                { "luksUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, luks_uuid), 0 },
                { "fileSystemUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, file_system_uuid), 0 },
                { "luksDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_discard), 0 },
                { "luksOfflineDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_offline_discard), 0 },
                { "luksCipher", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher), JSON_SAFE },
                { "luksCipherMode", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher_mode), JSON_SAFE },
                { "luksVolumeKeySize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_volume_key_size), 0 },
                { "luksPbkdfHashAlgorithm", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_hash_algorithm), JSON_SAFE },
                { "luksPbkdfType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_type), JSON_SAFE },
                { "luksPbkdfForceIterations", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_force_iterations), 0 },
                { "luksPbkdfTimeCostUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_time_cost_usec), 0 },
                { "luksPbkdfMemoryCost", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_memory_cost), 0 },
                { "luksPbkdfParallelThreads", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_parallel_threads), 0 },
                { "luksSectorSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_sector_size), 0 },
                { "luksExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_extra_mount_options), 0 },
                { "dropCaches", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, drop_caches), 0 },
                { "autoResizeMode", _JSON_VARIANT_TYPE_INVALID, dispatch_auto_resize_mode, offsetof(UserRecord, auto_resize_mode), 0 },
                { "rebalanceWeight", _JSON_VARIANT_TYPE_INVALID, dispatch_rebalance_weight, offsetof(UserRecord, rebalance_weight), 0 },
                { "rateLimitIntervalUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_interval_usec), 0 },
                { "rateLimitBurst", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_burst), 0 },
                { "enforcePasswordPolicy", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, enforce_password_policy), 0 },
                { "autoLogin", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, auto_login), 0 },
                { "stopDelayUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, stop_delay_usec), 0 },
                { "killProcesses", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, kill_processes), 0 },
                { "passwordChangeMinUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_min_usec), 0 },
                { "passwordChangeMaxUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_max_usec), 0 },
                { "passwordChangeWarnUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_warn_usec), 0 },
                { "passwordChangeInactiveUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_inactive_usec), 0 },
                { "passwordChangeNow", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, password_change_now), 0 },
                { "pkcs11TokenUri", JSON_VARIANT_ARRAY, dispatch_pkcs11_uri_array, offsetof(UserRecord, pkcs11_token_uri), 0 },
                { "fido2HmacCredential", JSON_VARIANT_ARRAY, dispatch_fido2_hmac_credential_array, 0, 0 },
                {},
        };

        JsonVariant *e;
        int r;

        /* A record without this section is fine, there is simply nothing to apply */
        if (!variant)
                return 0;

        if (!json_variant_is_array(variant))
                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));

        JSON_VARIANT_ARRAY_FOREACH(e, variant) {
                JsonVariant *selector;
                int matched = 0;

                if (!json_variant_is_object(e))
                        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name));

                /* First try to match by machine ID… */
                selector = json_variant_by_key(e, "matchMachineId");
                if (selector) {
                        matched = per_machine_id_match(selector, flags);
                        if (matched < 0)
                                return matched;
                }

                /* …and only if that did not select this entry, try the hostname. */
                if (matched == 0) {
                        selector = json_variant_by_key(e, "matchHostname");
                        if (selector) {
                                matched = per_machine_hostname_match(selector, flags);
                                if (matched < 0)
                                        return matched;
                        }
                }

                /* Entries that select this system are applied on top of what is already set */
                if (matched > 0) {
                        r = json_dispatch(e, per_machine_dispatch_table, flags, userdata);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}

/* Dispatcher for the "status" section: an object keyed by machine ID. Only the entry for the
 * local machine ID is parsed into the UserRecord; a missing section or missing entry is not an
 * error. */
static int dispatch_status(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {

        static const JsonDispatch status_dispatch_table[] = {
                { "diskUsage", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_usage), 0 },
                { "diskFree", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_free), 0 },
                { "diskSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size), 0 },
                { "diskCeiling", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_ceiling), 0 },
+ { "diskFloor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_floor), 0 }, + { "state", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, state), JSON_SAFE }, + { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, service), JSON_SAFE }, + { "signedLocally", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, signed_locally), 0 }, + { "goodAuthenticationCounter", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, good_authentication_counter), 0 }, + { "badAuthenticationCounter", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, bad_authentication_counter), 0 }, + { "lastGoodAuthenticationUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_good_authentication_usec), 0 }, + { "lastBadAuthenticationUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_bad_authentication_usec), 0 }, + { "rateLimitBeginUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_begin_usec), 0 }, + { "rateLimitCount", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_count), 0 }, + { "removable", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, removable), 0 }, + { "accessMode", JSON_VARIANT_UNSIGNED, json_dispatch_access_mode, offsetof(UserRecord, access_mode), 0 }, + { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE }, + {}, + }; + + JsonVariant *m; + sd_id128_t mid; + int r; + + if (!variant) + return 0; + + if (!json_variant_is_object(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name)); + + r = sd_id128_get_machine(&mid); + if (r < 0) + return json_log(variant, flags, r, "Failed to determine machine ID: %m"); + + m = json_variant_by_key(variant, SD_ID128_TO_STRING(mid)); + if (!m) + return 0; + 
+ return json_dispatch(m, status_dispatch_table, flags, userdata); +} + +int user_record_build_image_path(UserStorage storage, const char *user_name_and_realm, char **ret) { + const char *suffix; + char *z; + + assert(storage >= 0); + assert(user_name_and_realm); + assert(ret); + + if (storage == USER_LUKS) + suffix = ".home"; + else if (IN_SET(storage, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT)) + suffix = ".homedir"; + else { + *ret = NULL; + return 0; + } + + z = strjoin(get_home_root(), "/", user_name_and_realm, suffix); + if (!z) + return -ENOMEM; + + *ret = path_simplify(z); + return 1; +} + +static int user_record_augment(UserRecord *h, JsonDispatchFlags json_flags) { + int r; + + assert(h); + + if (!FLAGS_SET(h->mask, USER_RECORD_REGULAR)) + return 0; + + assert(h->user_name); + + if (!h->user_name_and_realm_auto && h->realm) { + h->user_name_and_realm_auto = strjoin(h->user_name, "@", h->realm); + if (!h->user_name_and_realm_auto) + return json_log_oom(h->json, json_flags); + } + + /* Let's add in the following automatisms only for regular users, they don't make sense for any others */ + if (user_record_disposition(h) != USER_REGULAR) + return 0; + + if (!h->home_directory && !h->home_directory_auto) { + h->home_directory_auto = path_join(get_home_root(), h->user_name); + if (!h->home_directory_auto) + return json_log_oom(h->json, json_flags); + } + + if (!h->image_path && !h->image_path_auto) { + r = user_record_build_image_path(user_record_storage(h), user_record_user_name_and_realm(h), &h->image_path_auto); + if (r < 0) + return json_log(h->json, json_flags, r, "Failed to determine default image path: %m"); + } + + return 0; +} + +int user_group_record_mangle( + JsonVariant *v, + UserRecordLoadFlags load_flags, + JsonVariant **ret_variant, + UserRecordMask *ret_mask) { + + static const struct { + UserRecordMask mask; + const char *name; + } mask_field[] = { + { USER_RECORD_PRIVILEGED, "privileged" }, + { USER_RECORD_SECRET, "secret" }, + { 
USER_RECORD_BINDING, "binding" }, + { USER_RECORD_PER_MACHINE, "perMachine" }, + { USER_RECORD_STATUS, "status" }, + { USER_RECORD_SIGNATURE, "signature" }, + }; + + JsonDispatchFlags json_flags = USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(load_flags); + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + JsonVariant *array[ELEMENTSOF(mask_field) * 2]; + size_t n_retain = 0; + UserRecordMask m = 0; + int r; + + assert((load_flags & _USER_RECORD_MASK_MAX) == 0); /* detect mistakes when accidentally passing + * UserRecordMask bit masks as UserRecordLoadFlags + * value */ + + assert(v); + assert(ret_variant); + assert(ret_mask); + + /* Note that this function is shared with the group record parser, hence we try to be generic in our + * log message wording here, to cover both cases. */ + + if (!json_variant_is_object(v)) + return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record is not a JSON object, refusing."); + + if (USER_RECORD_ALLOW_MASK(load_flags) == 0) /* allow nothing? */ + return json_log(v, json_flags, SYNTHETIC_ERRNO(EINVAL), "Nothing allowed in record, refusing."); + + if (USER_RECORD_STRIP_MASK(load_flags) == _USER_RECORD_MASK_MAX) /* strip everything? 
*/ + return json_log(v, json_flags, SYNTHETIC_ERRNO(EINVAL), "Stripping everything from record, refusing."); + + /* Check if we have the special sections and if they match our flags set */ + for (size_t i = 0; i < ELEMENTSOF(mask_field); i++) { + JsonVariant *e, *k; + + if (FLAGS_SET(USER_RECORD_STRIP_MASK(load_flags), mask_field[i].mask)) { + if (!w) + w = json_variant_ref(v); + + r = json_variant_filter(&w, STRV_MAKE(mask_field[i].name)); + if (r < 0) + return json_log(w, json_flags, r, "Failed to remove field from variant: %m"); + + continue; + } + + e = json_variant_by_key_full(v, mask_field[i].name, &k); + if (e) { + if (!FLAGS_SET(USER_RECORD_ALLOW_MASK(load_flags), mask_field[i].mask)) + return json_log(e, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record contains '%s' field, which is not allowed.", mask_field[i].name); + + if (FLAGS_SET(load_flags, USER_RECORD_STRIP_REGULAR)) { + array[n_retain++] = k; + array[n_retain++] = e; + } + + m |= mask_field[i].mask; + } else { + if (FLAGS_SET(USER_RECORD_REQUIRE_MASK(load_flags), mask_field[i].mask)) + return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record lacks '%s' field, which is required.", mask_field[i].name); + } + } + + if (FLAGS_SET(load_flags, USER_RECORD_STRIP_REGULAR)) { + /* If we are supposed to strip regular items, then let's instead just allocate a new object + * with just the stuff we need. 
*/ + + w = json_variant_unref(w); + r = json_variant_new_object(&w, array, n_retain); + if (r < 0) + return json_log(v, json_flags, r, "Failed to allocate new object: %m"); + } else + /* And now check if there's anything else in the record */ + for (size_t i = 0; i < json_variant_elements(v); i += 2) { + const char *f; + bool special = false; + + assert_se(f = json_variant_string(json_variant_by_index(v, i))); + + for (size_t j = 0; j < ELEMENTSOF(mask_field); j++) + if (streq(f, mask_field[j].name)) { /* already covered in the loop above */ + special = true; + continue; + } + + if (!special) { + if ((load_flags & (USER_RECORD_ALLOW_REGULAR|USER_RECORD_REQUIRE_REGULAR)) == 0) + return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record contains '%s' field, which is not allowed.", f); + + m |= USER_RECORD_REGULAR; + break; + } + } + + if (FLAGS_SET(load_flags, USER_RECORD_REQUIRE_REGULAR) && !FLAGS_SET(m, USER_RECORD_REGULAR)) + return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record lacks basic identity fields, which are required."); + + if (!FLAGS_SET(load_flags, USER_RECORD_EMPTY_OK) && m == 0) + return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record is empty."); + + if (w) + *ret_variant = TAKE_PTR(w); + else + *ret_variant = json_variant_ref(v); + + *ret_mask = m; + return 0; +} + +int user_record_load(UserRecord *h, JsonVariant *v, UserRecordLoadFlags load_flags) { + + static const JsonDispatch user_dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(UserRecord, user_name), JSON_RELAX}, + { "realm", JSON_VARIANT_STRING, json_dispatch_realm, offsetof(UserRecord, realm), 0 }, + { "realName", JSON_VARIANT_STRING, json_dispatch_gecos, offsetof(UserRecord, real_name), 0 }, + { "emailAddress", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, email_address), JSON_SAFE }, + { "iconName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, icon_name), JSON_SAFE }, + { 
"location", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, location), 0 }, + { "disposition", JSON_VARIANT_STRING, json_dispatch_user_disposition, offsetof(UserRecord, disposition), 0 }, + { "lastChangeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_change_usec), 0 }, + { "lastPasswordChangeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_password_change_usec), 0 }, + { "shell", JSON_VARIANT_STRING, json_dispatch_filename_or_path, offsetof(UserRecord, shell), 0 }, + { "umask", JSON_VARIANT_UNSIGNED, json_dispatch_umask, offsetof(UserRecord, umask), 0 }, + { "environment", JSON_VARIANT_ARRAY, json_dispatch_environment, offsetof(UserRecord, environment), 0 }, + { "timeZone", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, time_zone), JSON_SAFE }, + { "preferredLanguage", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, preferred_language), JSON_SAFE }, + { "niceLevel", _JSON_VARIANT_TYPE_INVALID, json_dispatch_nice, offsetof(UserRecord, nice_level), 0 }, + { "resourceLimits", _JSON_VARIANT_TYPE_INVALID, json_dispatch_rlimits, offsetof(UserRecord, rlimits), 0 }, + { "locked", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, locked), 0 }, + { "notBeforeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_before_usec), 0 }, + { "notAfterUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_after_usec), 0 }, + { "storage", JSON_VARIANT_STRING, json_dispatch_storage, offsetof(UserRecord, storage), 0 }, + { "diskSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size), 0 }, + { "diskSizeRelative", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size_relative), 0 }, + { "skeletonDirectory", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, skeleton_directory), 0 }, + { "accessMode", JSON_VARIANT_UNSIGNED, 
json_dispatch_access_mode, offsetof(UserRecord, access_mode), 0 },
                { "tasksMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, tasks_max), 0 },
                { "memoryHigh", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_high), 0 },
                { "memoryMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_max), 0 },
                { "cpuWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, cpu_weight), 0 },
                { "ioWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, io_weight), 0 },
                { "mountNoDevices", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nodev), 0 },
                { "mountNoSuid", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nosuid), 0 },
                { "mountNoExecute", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, noexec), 0 },
                { "cifsDomain", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_domain), JSON_SAFE },
                { "cifsUserName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_user_name), JSON_SAFE },
                { "cifsService", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_service), JSON_SAFE },
                { "cifsExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_extra_mount_options), 0 },
                { "imagePath", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, image_path), 0 },
                { "homeDirectory", JSON_VARIANT_STRING, json_dispatch_home_directory, offsetof(UserRecord, home_directory), 0 },
                { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, uid), 0 },
                { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, gid), 0 },
                { "memberOf", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(UserRecord, member_of), JSON_RELAX },
                { "capabilityBoundingSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_bounding_set), JSON_SAFE },
                { "capabilityAmbientSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_ambient_set), JSON_SAFE },
                { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE },
                { "partitionUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, partition_uuid), 0 },
                { "luksUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, luks_uuid), 0 },
                { "fileSystemUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, file_system_uuid), 0 },
                { "luksDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_discard), 0 },
                { "luksOfflineDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_offline_discard), 0 },
                { "luksCipher", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher), JSON_SAFE },
                { "luksCipherMode", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher_mode), JSON_SAFE },
                { "luksVolumeKeySize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_volume_key_size), 0 },
                { "luksPbkdfHashAlgorithm", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_hash_algorithm), JSON_SAFE },
                { "luksPbkdfType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_type), JSON_SAFE },
                { "luksPbkdfForceIterations", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_force_iterations), 0 },
                { "luksPbkdfTimeCostUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_time_cost_usec), 0 },
                { "luksPbkdfMemoryCost", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_memory_cost), 0 },
                { "luksPbkdfParallelThreads", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_parallel_threads), 0 },
                { "luksSectorSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_sector_size), 0 },
                { "luksExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_extra_mount_options), 0 },
                { "dropCaches", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, drop_caches), 0 },
                { "autoResizeMode", _JSON_VARIANT_TYPE_INVALID, dispatch_auto_resize_mode, offsetof(UserRecord, auto_resize_mode), 0 },
                { "rebalanceWeight", _JSON_VARIANT_TYPE_INVALID, dispatch_rebalance_weight, offsetof(UserRecord, rebalance_weight), 0 },
                { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, service), JSON_SAFE },
                { "rateLimitIntervalUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_interval_usec), 0 },
                { "rateLimitBurst", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_burst), 0 },
                { "enforcePasswordPolicy", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, enforce_password_policy), 0 },
                { "autoLogin", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, auto_login), 0 },
                { "stopDelayUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, stop_delay_usec), 0 },
                { "killProcesses", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, kill_processes), 0 },
                { "passwordChangeMinUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_min_usec), 0 },
                { "passwordChangeMaxUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_max_usec), 0 },
                { "passwordChangeWarnUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_warn_usec), 0 },
                { "passwordChangeInactiveUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_inactive_usec), 0 },
                { "passwordChangeNow", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, password_change_now), 0 },
                { "pkcs11TokenUri", JSON_VARIANT_ARRAY, dispatch_pkcs11_uri_array, offsetof(UserRecord, pkcs11_token_uri), 0 },
                { "fido2HmacCredential", JSON_VARIANT_ARRAY, dispatch_fido2_hmac_credential_array, 0, 0 },
                { "recoveryKeyType", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, recovery_key_type), 0 },

                { "secret", JSON_VARIANT_OBJECT, dispatch_secret, 0, 0 },
                { "privileged", JSON_VARIANT_OBJECT, dispatch_privileged, 0, 0 },

                /* Ignore the perMachine, binding, status stuff here, and process it later, so that it overrides whatever is set above */
                { "perMachine", JSON_VARIANT_ARRAY, NULL, 0, 0 },
                { "binding", JSON_VARIANT_OBJECT, NULL, 0, 0 },
                { "status", JSON_VARIANT_OBJECT, NULL, 0, 0 },

                /* Ignore 'signature', we check it with explicit accessors instead */
                { "signature", JSON_VARIANT_ARRAY, NULL, 0, 0 },
                {},
        };

        JsonDispatchFlags json_flags = USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(load_flags);
        JsonVariant *section;
        int r;

        assert(h);
        assert(!h->json);

        /* Beware: if anything below fails the record is left behind half-initialized! */

        r = user_group_record_mangle(v, load_flags, &h->json, &h->mask);
        if (r < 0)
                return r;

        r = json_dispatch(h->json, user_dispatch_table, json_flags, h);
        if (r < 0)
                return r;

        /* The table above skipped over 'perMachine', 'binding' and 'status'. They are applied now, in
         * this order, so that what they carry takes precedence over the global settings. */

        section = json_variant_by_key(h->json, "perMachine");
        r = dispatch_per_machine("perMachine", section, json_flags, h);
        if (r < 0)
                return r;

        section = json_variant_by_key(h->json, "binding");
        r = dispatch_binding("binding", section, json_flags, h);
        if (r < 0)
                return r;

        section = json_variant_by_key(h->json, "status");
        r = dispatch_status("status", section, json_flags, h);
        if (r < 0)
                return r;

        if (FLAGS_SET(h->mask, USER_RECORD_REGULAR) && !h->user_name)
                return json_log(h->json, json_flags, SYNTHETIC_ERRNO(EINVAL), "User name field missing, refusing.");

        r = user_record_augment(h, json_flags);
        return r < 0 ? r : 0;
}

/* Builds a JSON object from the variadic arguments (handed to json_buildv()) and loads it as a
 * full user record. On success a new record is returned in *ret and 0 is returned; otherwise a
 * negative errno-style value. */
int user_record_build(UserRecord **ret, ...) {
        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
        _cleanup_(user_record_unrefp) UserRecord *record = NULL;
        va_list args;
        int r;

        assert(ret);

        va_start(args, ret);
        r = json_buildv(&built, args);
        va_end(args);

        if (r < 0)
                return r;

        record = user_record_new();
        if (!record)
                return -ENOMEM;

        r = user_record_load(record, built, USER_RECORD_LOAD_FULL);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(record);
        return 0;
}

/* Returns the joined "user@realm" string if one was prepared, otherwise the plain user name. */
const char *user_record_user_name_and_realm(UserRecord *h) {
        assert(h);

        if (!h->user_name_and_realm_auto) {
                /* No joined string was prepared, which must mean there is no realm */
                assert(!h->realm);
                return h->user_name;
        }

        return h->user_name_and_realm_auto;
}

/* Returns the configured storage type, falling back to USER_CLASSIC if none is set. */
UserStorage user_record_storage(UserRecord *h) {
        assert(h);

        return h->storage >= 0 ? h->storage : USER_CLASSIC;
}

/* Returns the configured file system type, falling back to "btrfs". */
const char *user_record_file_system_type(UserRecord *h) {
        assert(h);

        if (h->file_system_type)
                return h->file_system_type;

        return "btrfs";
}

/* Returns the configured skeleton directory, falling back to "/etc/skel". */
const char *user_record_skeleton_directory(UserRecord *h) {
        assert(h);

        if (h->skeleton_directory)
                return h->skeleton_directory;

        return "/etc/skel";
}

mode_t user_record_access_mode(UserRecord *h) {
        assert(h);

        return h->access_mode != MODE_INVALID ?
h->access_mode : 0700; +} +/* Returns the explicitly configured home directory, else the automatically determined one, else a fixed fallback: "/root" for the root user and "/" for everybody else. Never returns NULL. */ +const char* user_record_home_directory(UserRecord *h) { + assert(h); + + if (h->home_directory) + return h->home_directory; + if (h->home_directory_auto) + return h->home_directory_auto; + + /* The root user is special, hence be special about it */ + if (streq_ptr(h->user_name, "root")) + return "/root"; + + return "/"; +} +/* Returns the explicitly configured image path, else the automatically determined one. If neither is set, the home directory doubles as image path for the classic, directory, subvolume and fscrypt storage types; for every other storage type NULL is returned, hence callers have to check for that. */ +const char *user_record_image_path(UserRecord *h) { + assert(h); + + if (h->image_path) + return h->image_path; + if (h->image_path_auto) + return h->image_path_auto; + + return IN_SET(user_record_storage(h), USER_CLASSIC, USER_DIRECTORY, USER_SUBVOLUME, USER_FSCRYPT) ? user_record_home_directory(h) : NULL; +} +/* Falls back to the regular user name if no separate CIFS user name is configured. */ +const char *user_record_cifs_user_name(UserRecord *h) { + assert(h); + + return h->cifs_user_name ?: h->user_name; +} +/* Translates the nosuid/noexec/nodev booleans of the record into the matching MS_* mount flags. */ +unsigned long user_record_mount_flags(UserRecord *h) { + assert(h); + + return (h->nosuid ? MS_NOSUID : 0) | + (h->noexec ? MS_NOEXEC : 0) | + (h->nodev ? MS_NODEV : 0); +} +/* Returns the configured shell. If none is set: "/bin/sh" for root, DEFAULT_USER_SHELL for users of regular disposition, NOLOGIN for everyone else. */ +const char *user_record_shell(UserRecord *h) { + assert(h); + + if (h->shell) + return h->shell; + + if (streq_ptr(h->user_name, "root")) + return "/bin/sh"; + + if (user_record_disposition(h) == USER_REGULAR) + return DEFAULT_USER_SHELL; + + return NOLOGIN; +} +/* Falls back to the user name if no real name is set. */ +const char *user_record_real_name(UserRecord *h) { + assert(h); + + return h->real_name ?: h->user_name; +} +/* Returns the explicit discard setting if there is one (i.e. the field is non-negative). Otherwise the default is derived from the image path; without an image path the answer is false. */ +bool user_record_luks_discard(UserRecord *h) { + const char *ip; + + assert(h); + + if (h->luks_discard >= 0) + return h->luks_discard; + + ip = user_record_image_path(h); + if (!ip) + return false; + + /* Use discard by default if we are referring to a real block device, but not when operating on a + * loopback device. 
We want to optimize for SSD and flash storage after all, but we should be careful + * when storing stuff on top of regular file systems in loopback files as doing discard then would + * mean thin provisioning and we should not do that willy-nilly since it means we'll risk EIO later + * on should the disk space to back our file systems not be available. */ + + return path_startswith(ip, "/dev/"); +} +/* Returns the explicit offline discard setting if there is one (i.e. the field is non-negative). Otherwise: false without an image path, the online discard mode for images below /dev/, true for everything else. */ +bool user_record_luks_offline_discard(UserRecord *h) { + const char *ip; + + assert(h); + + if (h->luks_offline_discard >= 0) + return h->luks_offline_discard; + + /* Discard while we are logged out should generally be a good idea, except when operating directly on + * physical media, where we should just bind it to the online discard mode. */ + + ip = user_record_image_path(h); + if (!ip) + return false; + + if (path_startswith(ip, "/dev/")) + return user_record_luks_discard(h); + + return true; +} +/* Returns the configured cipher, defaulting to "aes". */ +const char *user_record_luks_cipher(UserRecord *h) { + assert(h); + + return h->luks_cipher ?: "aes"; +} +/* Returns the configured cipher mode, defaulting to "xts-plain64". */ +const char *user_record_luks_cipher_mode(UserRecord *h) { + assert(h); + + return h->luks_cipher_mode ?: "xts-plain64"; +} +/* Returns the configured volume key size, capped at SIZE_MAX. UINT64_MAX in the record means "unset" and selects the default of 256 / 8. */ +uint64_t user_record_luks_volume_key_size(UserRecord *h) { + assert(h); + + /* We return a value here that can be cast without loss into size_t which is what libcryptsetup expects */ + + if (h->luks_volume_key_size == UINT64_MAX) + return 256 / 8; + + return MIN(h->luks_volume_key_size, SIZE_MAX); +} +/* Returns the configured PBKDF type, defaulting to "argon2id". */ +const char* user_record_luks_pbkdf_type(UserRecord *h) { + assert(h); + + return h->luks_pbkdf_type ?: "argon2id"; +} +/* Returns UINT64_MAX if no iteration count is forced, otherwise the configured count clamped to the range 1 to UINT32_MAX. */ +uint64_t user_record_luks_pbkdf_force_iterations(UserRecord *h) { + assert(h); + + /* propagate default "benchmark" mode as itself */ + if (h->luks_pbkdf_force_iterations == UINT64_MAX) + return UINT64_MAX; + + /* clamp everything else to actually accepted number of iterations of libcryptsetup */ + return CLAMP(h->luks_pbkdf_force_iterations, 1U, UINT32_MAX); +} + +uint64_t user_record_luks_pbkdf_time_cost_usec(UserRecord 
*h) { + assert(h); + + /* Returns a value with ms granularity, since that's what libcryptsetup expects. An explicit value is rounded up to the next full millisecond, capped at UINT32_MAX milliseconds and then converted back to usec. */ + + if (h->luks_pbkdf_time_cost_usec == UINT64_MAX) + return 500 * USEC_PER_MSEC; /* We default to 500ms, in contrast to libcryptsetup's 2s, which is just awfully slow on every login */ + + return MIN(DIV_ROUND_UP(h->luks_pbkdf_time_cost_usec, USEC_PER_MSEC), UINT32_MAX) * USEC_PER_MSEC; +} +/* Returns the PBKDF memory cost. UINT64_MAX in the record means "unset" and selects a default that depends on the PBKDF type. */ +uint64_t user_record_luks_pbkdf_memory_cost(UserRecord *h) { + assert(h); + + /* Returns a value with kb granularity, since that's what libcryptsetup expects. An explicit value is rounded up to a multiple of 1024 and capped at UINT32_MAX * 1024. */ + if (h->luks_pbkdf_memory_cost == UINT64_MAX) + return streq(user_record_luks_pbkdf_type(h), "pbkdf2") ? 0 : /* doesn't apply for simple pbkdf2 */ + 64*1024*1024; /* We default to 64M, since this should work on smaller systems too */ + + return MIN(DIV_ROUND_UP(h->luks_pbkdf_memory_cost, 1024), UINT32_MAX) * 1024; +} +/* Returns the number of PBKDF threads. UINT64_MAX in the record means "unset" and selects a default that depends on the PBKDF type. */ +uint64_t user_record_luks_pbkdf_parallel_threads(UserRecord *h) { + assert(h); + + if (h->luks_pbkdf_parallel_threads == UINT64_MAX) + return streq(user_record_luks_pbkdf_type(h), "pbkdf2") ? 
0 : /* doesn't apply for simple pbkdf2 */ + 1; /* We default to 1, since this should work on smaller systems too */ + + return MIN(h->luks_pbkdf_parallel_threads, UINT32_MAX); +} + +uint64_t user_record_luks_sector_size(UserRecord *h) { + assert(h); + + if (h->luks_sector_size == UINT64_MAX) + return 512; + + /* Allow up to 4K due to dm-crypt support and 4K alignment by the homed LUKS backend */ + return CLAMP(UINT64_C(1) << (63 - __builtin_clzl(h->luks_sector_size)), 512U, 4096U); +} + +const char *user_record_luks_pbkdf_hash_algorithm(UserRecord *h) { + assert(h); + + return h->luks_pbkdf_hash_algorithm ?: "sha512"; +} + +gid_t user_record_gid(UserRecord *h) { + assert(h); + + if (gid_is_valid(h->gid)) + return h->gid; + + return (gid_t) h->uid; +} + +UserDisposition user_record_disposition(UserRecord *h) { + assert(h); + + if (h->disposition >= 0) + return h->disposition; + + /* If not declared, derive from UID */ + + if (!uid_is_valid(h->uid)) + return _USER_DISPOSITION_INVALID; + + if (h->uid == 0 || h->uid == UID_NOBODY) + return USER_INTRINSIC; + + if (uid_is_system(h->uid)) + return USER_SYSTEM; + + if (uid_is_dynamic(h->uid)) + return USER_DYNAMIC; + + if (uid_is_container(h->uid)) + return USER_CONTAINER; + + if (h->uid > INT32_MAX) + return USER_RESERVED; + + return USER_REGULAR; +} + +int user_record_removable(UserRecord *h) { + UserStorage storage; + assert(h); + + if (h->removable >= 0) + return h->removable; + + /* Refuse to decide for classic records */ + storage = user_record_storage(h); + if (h->storage < 0 || h->storage == USER_CLASSIC) + return -1; + + /* For now consider only LUKS home directories with a reference by path as removable */ + return storage == USER_LUKS && path_startswith(user_record_image_path(h), "/dev/"); +} + +uint64_t user_record_ratelimit_interval_usec(UserRecord *h) { + assert(h); + + if (h->ratelimit_interval_usec == UINT64_MAX) + return DEFAULT_RATELIMIT_INTERVAL_USEC; + + return h->ratelimit_interval_usec; +} + +uint64_t 
user_record_ratelimit_burst(UserRecord *h) { + assert(h); + + if (h->ratelimit_burst == UINT64_MAX) + return DEFAULT_RATELIMIT_BURST; + + return h->ratelimit_burst; +} + +bool user_record_can_authenticate(UserRecord *h) { + assert(h); + + /* Returns true if there's some form of property configured that the user can authenticate against, i.e. at least one PKCS#11 encrypted key, FIDO2 HMAC salt or hashed password */ + + if (h->n_pkcs11_encrypted_key > 0) + return true; + + if (h->n_fido2_hmac_salt > 0) + return true; + + return !strv_isempty(h->hashed_password); +} +/* Returns the explicit drop_caches setting if there is one (i.e. the field is non-negative), otherwise a default that depends on the storage type. */ +bool user_record_drop_caches(UserRecord *h) { + assert(h); + + if (h->drop_caches >= 0) + return h->drop_caches; + + /* By default drop caches on fscrypt, not otherwise. */ + return user_record_storage(h) == USER_FSCRYPT; +} +/* Returns the explicit auto-resize mode if there is one (i.e. the field is non-negative), otherwise a default that depends on the storage type. */ +AutoResizeMode user_record_auto_resize_mode(UserRecord *h) { + assert(h); + + if (h->auto_resize_mode >= 0) + return h->auto_resize_mode; + /* By default shrink and grow LUKS homes, and leave everything else alone */ + return user_record_storage(h) == USER_LUKS ? AUTO_RESIZE_SHRINK_AND_GROW : AUTO_RESIZE_OFF; +} +/* Returns the configured rebalance weight, with REBALANCE_WEIGHT_UNSET mapped to REBALANCE_WEIGHT_DEFAULT. */ +uint64_t user_record_rebalance_weight(UserRecord *h) { + assert(h); + + if (h->rebalance_weight == REBALANCE_WEIGHT_UNSET) + return REBALANCE_WEIGHT_DEFAULT; + + return h->rebalance_weight; +} +/* Converts a list of capability names into a bit mask, with the bit index being what capability_from_name() returns for the name. Names that function rejects are logged at debug level and otherwise ignored. */ +static uint64_t parse_caps_strv(char **l) { + uint64_t c = 0; + int r; + + STRV_FOREACH(i, l) { + r = capability_from_name(*i); + if (r < 0) + log_debug_errno(r, "Don't know capability '%s', ignoring: %m", *i); + else + c |= UINT64_C(1) << r; + } + + return c; +} + +uint64_t user_record_capability_bounding_set(UserRecord *h) { + assert(h); + + /* Returns UINT64_MAX if no bounding set is configured (!) */ + + if (!h->capability_bounding_set) + return UINT64_MAX; + + return parse_caps_strv(h->capability_bounding_set); +} + +uint64_t user_record_capability_ambient_set(UserRecord *h) { + assert(h); + + /* Returns UINT64_MAX if no ambient set is configured (!) 
*/ + + if (!h->capability_ambient_set) + return UINT64_MAX; + /* A configured ambient set is always masked with the bounding set (which is all-ones if none is configured) */ + return parse_caps_strv(h->capability_ambient_set) & user_record_capability_bounding_set(h); +} + +uint64_t user_record_ratelimit_next_try(UserRecord *h) { + assert(h); + + /* Calculates when it's possible to log in next. Returns: + * + * UINT64_MAX → Nothing known + * 0 → Right away + * Any other → Next time in CLOCK_REALTIME in usec (which could be in the past) + */ + + if (h->ratelimit_begin_usec == UINT64_MAX || + h->ratelimit_count == UINT64_MAX) + return UINT64_MAX; + + if (h->ratelimit_begin_usec > now(CLOCK_REALTIME)) /* If the ratelimit time is in the future, then + * the local clock is probably incorrect. Let's + * not refuse login then. */ + return UINT64_MAX; + /* As long as the burst limit has not been reached, another attempt is fine right away */ + if (h->ratelimit_count < user_record_ratelimit_burst(h)) + return 0; + + return usec_add(h->ratelimit_begin_usec, user_record_ratelimit_interval_usec(h)); +} + +bool user_record_equal(UserRecord *a, UserRecord *b) { + assert(a); + assert(b); + + /* We assume that when a record is modified its JSON data is updated at the same time, hence it's + * sufficient to compare the JSON data. */ + + return json_variant_equal(a->json, b->json); +} +/* Returns true if both records carry the regular section and agree on user name and realm. */ +bool user_record_compatible(UserRecord *a, UserRecord *b) { + assert(a); + assert(b); + + /* If either lacks the regular section, we can't really decide, let's hence say they are + * incompatible. 
*/ + if (!(a->mask & b->mask & USER_RECORD_REGULAR)) + return false; + + return streq_ptr(a->user_name, b->user_name) && + streq_ptr(a->realm, b->realm); +} +/* Compares the last change timestamps of two records: 0 if they are identical, otherwise whatever CMP() returns for them, with a missing timestamp (UINT64_MAX) ordered before any existing one. */ +int user_record_compare_last_change(UserRecord *a, UserRecord *b) { + assert(a); + assert(b); + + if (a->last_change_usec == b->last_change_usec) + return 0; + + /* Always consider a record with a timestamp newer than one without */ + if (a->last_change_usec == UINT64_MAX) + return -1; + if (b->last_change_usec == UINT64_MAX) + return 1; + + return CMP(a->last_change_usec, b->last_change_usec); +} +/* Allocates a new record and loads the JSON data of 'h' into it, using the specified load flags. On success the new record is stored in *ret and 0 is returned, otherwise a negative error code. */ +int user_record_clone(UserRecord *h, UserRecordLoadFlags flags, UserRecord **ret) { + _cleanup_(user_record_unrefp) UserRecord *c = NULL; + int r; + + assert(h); + assert(ret); + + c = user_record_new(); + if (!c) + return -ENOMEM; + + r = user_record_load(c, h->json, flags); + if (r < 0) + return r; + + *ret = TAKE_PTR(c); + return 0; +} +/* Returns a negative error code if creating a stripped copy fails, otherwise the result of user_record_equal() on the (possibly stripped) records. */ +int user_record_masked_equal(UserRecord *a, UserRecord *b, UserRecordMask mask) { + _cleanup_(user_record_unrefp) UserRecord *x = NULL, *y = NULL; + int r; + + assert(a); + assert(b); + + /* Compares the two records, but ignores anything not listed in the specified mask. A record that carries sections outside of the mask is replaced by a temporary clone with those sections stripped; the temporaries are released automatically on return. */ + + if ((a->mask & ~mask) != 0) { + r = user_record_clone(a, USER_RECORD_ALLOW(mask) | USER_RECORD_STRIP(~mask & _USER_RECORD_MASK_MAX) | USER_RECORD_PERMISSIVE, &x); + if (r < 0) + return r; + + a = x; + } + + if ((b->mask & ~mask) != 0) { + r = user_record_clone(b, USER_RECORD_ALLOW(mask) | USER_RECORD_STRIP(~mask & _USER_RECORD_MASK_MAX) | USER_RECORD_PERMISSIVE, &y); + if (r < 0) + return r; + + b = y; + } + + return user_record_equal(a, b); +} + +int user_record_test_blocked(UserRecord *h) { + usec_t n; + + /* Checks whether access to the specified user shall be allowed at the moment. 
Returns: + * + * -ESTALE: Record is from the future + * -ENOLCK: Record is blocked + * -EL2HLT: Record is not valid yet + * -EL3HLT: Record is not valid anymore + * + */ + + assert(h); + /* An explicit lock is checked first, before any of the time based checks; 0 is returned if nothing below objects */ + if (h->locked > 0) + return -ENOLCK; + + n = now(CLOCK_REALTIME); + /* Enforce the validity window; UINT64_MAX means the respective boundary is not set */ + if (h->not_before_usec != UINT64_MAX && n < h->not_before_usec) + return -EL2HLT; + if (h->not_after_usec != UINT64_MAX && n > h->not_after_usec) + return -EL3HLT; + + if (h->last_change_usec != UINT64_MAX && + h->last_change_usec > n) /* Complain during log-ins when the record is from the future */ + return -ESTALE; + + return 0; +} +/* Evaluates the password aging fields of the record (password_change_now, last_password_change_usec and password_change_{min,max,warn,inactive}_usec) against the current CLOCK_REALTIME time. */ +int user_record_test_password_change_required(UserRecord *h) { + bool change_permitted; + usec_t n; + + assert(h); + + /* Checks whether the user must change the password when logging in + + -EKEYREVOKED: Change password now because admin said so + -EOWNERDEAD: Change password now because it expired + -EKEYREJECTED: Password is expired, no changing is allowed + -EKEYEXPIRED: Password is about to expire, warn user + -ENETDOWN: Record has expiration info but no password change timestamp + -EROFS: No password change required nor permitted + -ESTALE: RTC likely incorrect, last password change is in the future + 0: No password change required, but permitted + */ + + /* If a password change request has been set explicitly, it overrides everything */ + if (h->password_change_now > 0) + return -EKEYREVOKED; + + n = now(CLOCK_REALTIME); + + /* Password change in the future? Then our RTC is likely incorrect. This is only reported if at least one of the aging parameters is configured. */ + if (h->last_password_change_usec != UINT64_MAX && + h->last_password_change_usec > n && + (h->password_change_min_usec != UINT64_MAX || + h->password_change_max_usec != UINT64_MAX || + h->password_change_inactive_usec != UINT64_MAX)) + return -ESTALE; + + /* Then, let's check if password changing is currently allowed at all */ + if (h->password_change_min_usec != UINT64_MAX) { + + /* Expiry configured but no password change timestamp known? 
*/ + if (h->last_password_change_usec == UINT64_MAX) + return -ENETDOWN; + /* If adding the minimum age to the timestamp would overflow, the point in time at which changing becomes permitted can never be reached */ + if (h->password_change_min_usec >= UINT64_MAX - h->last_password_change_usec) + change_permitted = false; + else + change_permitted = n >= h->last_password_change_usec + h->password_change_min_usec; + + } else + change_permitted = true; + + /* Let's check whether the password has expired. The second half of the condition skips the whole check if adding the maximum age to the timestamp would overflow. */ + if (!(h->password_change_max_usec == UINT64_MAX || + h->password_change_max_usec >= UINT64_MAX - h->last_password_change_usec)) { + + uint64_t change_before; + + /* Expiry configured but no password change timestamp known? NOTE(review): this looks unreachable. If last_password_change_usec is UINT64_MAX, then UINT64_MAX - last_password_change_usec is 0, the overflow guard of the enclosing condition is always true and this block is never entered. A record with a maximum age but without a change timestamp hence falls through to the final return instead of yielding -ENETDOWN (unless a minimum age is set, too). Confirm whether that is intended before changing it, since it affects whether such users can log in. */ + if (h->last_password_change_usec == UINT64_MAX) + return -ENETDOWN; + + /* Password is in inactive phase? */ + if (h->password_change_inactive_usec != UINT64_MAX && + h->password_change_inactive_usec < UINT64_MAX - h->password_change_max_usec) { + usec_t added; + + added = h->password_change_inactive_usec + h->password_change_max_usec; + if (added < UINT64_MAX - h->last_password_change_usec && + n >= h->last_password_change_usec + added) + return -EKEYREJECTED; + } + + /* Password needs to be changed now? (The addition cannot overflow, the enclosing condition made sure of that.) */ + change_before = h->last_password_change_usec + h->password_change_max_usec; + if (n >= change_before) + return change_permitted ? -EOWNERDEAD : -EKEYREJECTED; + + /* Warn user? If the warning period is longer than the time until expiry, then we are always within it. */ + if (h->password_change_warn_usec != UINT64_MAX && + (change_before < h->password_change_warn_usec || + n >= change_before - h->password_change_warn_usec)) + return change_permitted ? -EKEYEXPIRED : -EROFS; + } + + /* No password changing necessary */ + return change_permitted ? 
0 : -EROFS; +} +/* String names of the UserStorage values, indexed by enum value (presumably what backs user_storage_to_string()/user_storage_from_string() as declared in user-record.h) */ +static const char* const user_storage_table[_USER_STORAGE_MAX] = { + [USER_CLASSIC] = "classic", + [USER_LUKS] = "luks", + [USER_DIRECTORY] = "directory", + [USER_SUBVOLUME] = "subvolume", + [USER_FSCRYPT] = "fscrypt", + [USER_CIFS] = "cifs", +}; + +DEFINE_STRING_TABLE_LOOKUP(user_storage, UserStorage); +/* String names of the UserDisposition values, indexed by enum value */ +static const char* const user_disposition_table[_USER_DISPOSITION_MAX] = { + [USER_INTRINSIC] = "intrinsic", + [USER_SYSTEM] = "system", + [USER_DYNAMIC] = "dynamic", + [USER_REGULAR] = "regular", + [USER_CONTAINER] = "container", + [USER_RESERVED] = "reserved", +}; + +DEFINE_STRING_TABLE_LOOKUP(user_disposition, UserDisposition); +/* String names of the AutoResizeMode values, indexed by enum value */ +static const char* const auto_resize_mode_table[_AUTO_RESIZE_MODE_MAX] = { + [AUTO_RESIZE_OFF] = "off", + [AUTO_RESIZE_GROW] = "grow", + [AUTO_RESIZE_SHRINK_AND_GROW] = "shrink-and-grow", +}; + +DEFINE_STRING_TABLE_LOOKUP(auto_resize_mode, AutoResizeMode); diff --git a/src/shared/user-record.h b/src/shared/user-record.h new file mode 100644 index 0000000..298dc24 --- /dev/null +++ b/src/shared/user-record.h @@ -0,0 +1,450 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-id128.h" + +#include "json.h" +#include "missing_resource.h" +#include "time-util.h" +/* Classification of a user. If a record does not declare one, user_record_disposition() derives it from the UID. */ +typedef enum UserDisposition { + USER_INTRINSIC, /* root and nobody */ + USER_SYSTEM, /* statically allocated users for system services */ + USER_DYNAMIC, /* dynamically allocated users for system services */ + USER_REGULAR, /* regular (typically human users) */ + USER_CONTAINER, /* UID ranges allocated for container uses */ + USER_RESERVED, /* Range above 2^31 */ + _USER_DISPOSITION_MAX, + _USER_DISPOSITION_INVALID = -EINVAL, +} UserDisposition; +/* Note that the enum tag is UserHomeStorage, while the typedef name, which is what the rest of the code uses, is UserStorage. */ +typedef enum UserHomeStorage { + USER_CLASSIC, + USER_LUKS, + USER_DIRECTORY, /* A directory, and a .identity file in it, which USER_CLASSIC lacks */ + USER_SUBVOLUME, + USER_FSCRYPT, + USER_CIFS, + _USER_STORAGE_MAX, + _USER_STORAGE_INVALID = -EINVAL, +} 
UserStorage; + +typedef enum UserRecordMask { + /* The various sections an identity record may have, as bit mask */ + USER_RECORD_REGULAR = 1U << 0, + USER_RECORD_SECRET = 1U << 1, + USER_RECORD_PRIVILEGED = 1U << 2, + USER_RECORD_PER_MACHINE = 1U << 3, + USER_RECORD_BINDING = 1U << 4, + USER_RECORD_STATUS = 1U << 5, + USER_RECORD_SIGNATURE = 1U << 6, + _USER_RECORD_MASK_MAX = (1U << 7)-1 /* all seven section bits set */ +} UserRecordMask; + +typedef enum UserRecordLoadFlags { + /* A set of flags used while loading a user record from JSON data. We leave the lower 7 bits free + * (i.e. the bits UserRecordMask uses), just as a safety precaution so that we can detect borked + * conversions between UserRecordMask and UserRecordLoadFlags. */ + + /* What to require (UserRecordMask shifted left by 7) */ + USER_RECORD_REQUIRE_REGULAR = USER_RECORD_REGULAR << 7, + USER_RECORD_REQUIRE_SECRET = USER_RECORD_SECRET << 7, + USER_RECORD_REQUIRE_PRIVILEGED = USER_RECORD_PRIVILEGED << 7, + USER_RECORD_REQUIRE_PER_MACHINE = USER_RECORD_PER_MACHINE << 7, + USER_RECORD_REQUIRE_BINDING = USER_RECORD_BINDING << 7, + USER_RECORD_REQUIRE_STATUS = USER_RECORD_STATUS << 7, + USER_RECORD_REQUIRE_SIGNATURE = USER_RECORD_SIGNATURE << 7, + + /* What to allow (UserRecordMask shifted left by 14) */ + USER_RECORD_ALLOW_REGULAR = USER_RECORD_REGULAR << 14, + USER_RECORD_ALLOW_SECRET = USER_RECORD_SECRET << 14, + USER_RECORD_ALLOW_PRIVILEGED = USER_RECORD_PRIVILEGED << 14, + USER_RECORD_ALLOW_PER_MACHINE = USER_RECORD_PER_MACHINE << 14, + USER_RECORD_ALLOW_BINDING = USER_RECORD_BINDING << 14, + USER_RECORD_ALLOW_STATUS = USER_RECORD_STATUS << 14, + USER_RECORD_ALLOW_SIGNATURE = USER_RECORD_SIGNATURE << 14, + + /* What to strip (UserRecordMask shifted left by 21) */ + USER_RECORD_STRIP_REGULAR = USER_RECORD_REGULAR << 21, + USER_RECORD_STRIP_SECRET = USER_RECORD_SECRET << 21, + USER_RECORD_STRIP_PRIVILEGED = USER_RECORD_PRIVILEGED << 21, + USER_RECORD_STRIP_PER_MACHINE = USER_RECORD_PER_MACHINE << 21, + USER_RECORD_STRIP_BINDING = USER_RECORD_BINDING << 21, + USER_RECORD_STRIP_STATUS = USER_RECORD_STATUS << 21, + USER_RECORD_STRIP_SIGNATURE = USER_RECORD_SIGNATURE << 21, + 
+ /* Some special combinations that deserve explicit names */ + USER_RECORD_LOAD_FULL = USER_RECORD_REQUIRE_REGULAR | + USER_RECORD_ALLOW_SECRET | + USER_RECORD_ALLOW_PRIVILEGED | + USER_RECORD_ALLOW_PER_MACHINE | + USER_RECORD_ALLOW_BINDING | + USER_RECORD_ALLOW_STATUS | + USER_RECORD_ALLOW_SIGNATURE, + /* Same as USER_RECORD_LOAD_FULL, minus USER_RECORD_ALLOW_SECRET */ + USER_RECORD_LOAD_REFUSE_SECRET = USER_RECORD_REQUIRE_REGULAR | + USER_RECORD_ALLOW_PRIVILEGED | + USER_RECORD_ALLOW_PER_MACHINE | + USER_RECORD_ALLOW_BINDING | + USER_RECORD_ALLOW_STATUS | + USER_RECORD_ALLOW_SIGNATURE, + /* Same as USER_RECORD_LOAD_REFUSE_SECRET, plus USER_RECORD_STRIP_SECRET */ + USER_RECORD_LOAD_MASK_SECRET = USER_RECORD_REQUIRE_REGULAR | + USER_RECORD_ALLOW_PRIVILEGED | + USER_RECORD_ALLOW_PER_MACHINE | + USER_RECORD_ALLOW_BINDING | + USER_RECORD_ALLOW_STATUS | + USER_RECORD_ALLOW_SIGNATURE | + USER_RECORD_STRIP_SECRET, + /* Requires the secret section and strips the six other sections */ + USER_RECORD_EXTRACT_SECRET = USER_RECORD_REQUIRE_SECRET | + USER_RECORD_STRIP_REGULAR | + USER_RECORD_STRIP_PRIVILEGED | + USER_RECORD_STRIP_PER_MACHINE | + USER_RECORD_STRIP_BINDING | + USER_RECORD_STRIP_STATUS | + USER_RECORD_STRIP_SIGNATURE, + /* Requires the regular section, allows the privileged and per-machine sections, nothing else */ + USER_RECORD_LOAD_SIGNABLE = USER_RECORD_REQUIRE_REGULAR | + USER_RECORD_ALLOW_PRIVILEGED | + USER_RECORD_ALLOW_PER_MACHINE, + /* Same as USER_RECORD_LOAD_SIGNABLE, plus strip flags for the four remaining sections */ + USER_RECORD_EXTRACT_SIGNABLE = USER_RECORD_LOAD_SIGNABLE | + USER_RECORD_STRIP_SECRET | + USER_RECORD_STRIP_BINDING | + USER_RECORD_STRIP_STATUS | + USER_RECORD_STRIP_SIGNATURE, + /* Same as USER_RECORD_LOAD_SIGNABLE, plus USER_RECORD_ALLOW_SIGNATURE */ + USER_RECORD_LOAD_EMBEDDED = USER_RECORD_REQUIRE_REGULAR | + USER_RECORD_ALLOW_PRIVILEGED | + USER_RECORD_ALLOW_PER_MACHINE | + USER_RECORD_ALLOW_SIGNATURE, + /* Same as USER_RECORD_LOAD_EMBEDDED, plus strip flags for the three remaining sections */ + USER_RECORD_EXTRACT_EMBEDDED = USER_RECORD_LOAD_EMBEDDED | + USER_RECORD_STRIP_SECRET | + USER_RECORD_STRIP_BINDING | + USER_RECORD_STRIP_STATUS, + + /* Whether to log about loader errors beyond LOG_DEBUG */ + USER_RECORD_LOG = 1U << 28, + + /* Whether to ignore errors and load what we can */ + USER_RECORD_PERMISSIVE = 1U << 29, + + /* Whether an empty record is OK */ + USER_RECORD_EMPTY_OK = 1U << 30, +} UserRecordLoadFlags; + +static inline UserRecordLoadFlags 
USER_RECORD_REQUIRE(UserRecordMask m) { + assert((m & ~_USER_RECORD_MASK_MAX) == 0); + return m << 7; +} +/* Converts a section mask into the matching USER_RECORD_ALLOW_* load flags. */ +static inline UserRecordLoadFlags USER_RECORD_ALLOW(UserRecordMask m) { + assert((m & ~_USER_RECORD_MASK_MAX) == 0); + return m << 14; +} +/* Converts a section mask into the matching USER_RECORD_STRIP_* load flags. */ +static inline UserRecordLoadFlags USER_RECORD_STRIP(UserRecordMask m) { + assert((m & ~_USER_RECORD_MASK_MAX) == 0); + return m << 21; +} +/* Extracts the mask of required sections from a set of load flags. */ +static inline UserRecordMask USER_RECORD_REQUIRE_MASK(UserRecordLoadFlags f) { + return (f >> 7) & _USER_RECORD_MASK_MAX; +} +/* Extracts the mask of allowed sections from a set of load flags. Everything that is required counts as allowed, too. */ +static inline UserRecordMask USER_RECORD_ALLOW_MASK(UserRecordLoadFlags f) { + return ((f >> 14) & _USER_RECORD_MASK_MAX) | USER_RECORD_REQUIRE_MASK(f); +} +/* Extracts the mask of sections to strip from a set of load flags. */ +static inline UserRecordMask USER_RECORD_STRIP_MASK(UserRecordLoadFlags f) { + return (f >> 21) & _USER_RECORD_MASK_MAX; +} +/* Maps USER_RECORD_LOG to JSON_LOG and USER_RECORD_PERMISSIVE to JSON_PERMISSIVE; all other load flags are ignored here. */ +static inline JsonDispatchFlags USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(UserRecordLoadFlags flags) { + return (FLAGS_SET(flags, USER_RECORD_LOG) ? JSON_LOG : 0) | + (FLAGS_SET(flags, USER_RECORD_PERMISSIVE) ? JSON_PERMISSIVE : 0); +} + +typedef struct Pkcs11EncryptedKey { + /* The encrypted passphrase, which can be decrypted with the private key indicated below */ + void *data; + size_t size; + + /* Where to find the private key to decrypt the encrypted passphrase above */ + char *uri; + + /* What to test the decrypted passphrase against to allow access (classic UNIX password hash). Note + * that the decrypted passphrase is also used for unlocking LUKS and fscrypt, and if the account is + * backed by LUKS or fscrypt the hashed password is only an additional layer of authentication, not + * the only. 
*/ + char *hashed_password; +} Pkcs11EncryptedKey; + +typedef struct Fido2HmacCredential { + void *id; + size_t size; +} Fido2HmacCredential; + +typedef struct Fido2HmacSalt { + /* The FIDO2 Credential ID to use */ + Fido2HmacCredential credential; + + /* The FIDO2 salt value */ + void *salt; + size_t salt_size; + + /* What to test the hashed salt value against, usually UNIX password hash here. */ + char *hashed_password; + + /* Whether the 'up', 'uv', 'clientPin' features are enabled. */ + int uv, up, client_pin; +} Fido2HmacSalt; + +typedef struct RecoveryKey { + /* The type of recovery key, must be "modhex64" right now */ + char *type; + + /* A UNIX password hash of the normalized form of modhex64 */ + char *hashed_password; +} RecoveryKey; + +typedef enum AutoResizeMode { + AUTO_RESIZE_OFF, /* no automatic grow/shrink */ + AUTO_RESIZE_GROW, /* grow at login */ + AUTO_RESIZE_SHRINK_AND_GROW, /* shrink at logout + grow at login */ + _AUTO_RESIZE_MODE_MAX, + _AUTO_RESIZE_MODE_INVALID = -EINVAL, +} AutoResizeMode; +/* REBALANCE_WEIGHT_UNSET marks a record without a configured weight; user_record_rebalance_weight() returns REBALANCE_WEIGHT_DEFAULT for it. */ +#define REBALANCE_WEIGHT_OFF UINT64_C(0) +#define REBALANCE_WEIGHT_DEFAULT UINT64_C(100) +#define REBALANCE_WEIGHT_BACKING UINT64_C(20) +#define REBALANCE_WEIGHT_MIN UINT64_C(1) +#define REBALANCE_WEIGHT_MAX UINT64_C(10000) +#define REBALANCE_WEIGHT_UNSET UINT64_MAX + +typedef struct UserRecord { + /* The following three fields are not part of the JSON record */ + unsigned n_ref; + UserRecordMask mask; + bool incomplete; /* incomplete due to security restrictions. 
*/ + + char *user_name; + char *realm; + char *user_name_and_realm_auto; /* the user_name field concatenated with '@' and the realm, if the latter is defined */ + char *real_name; + char *email_address; + char *password_hint; + char *icon_name; + char *location; + + UserDisposition disposition; /* negative if not declared, user_record_disposition() then derives it from the UID */ + uint64_t last_change_usec; /* UINT64_MAX if there is no timestamp */ + uint64_t last_password_change_usec; /* UINT64_MAX if there is no timestamp */ + + char *shell; + mode_t umask; + char **environment; + char *time_zone; + char *preferred_language; + int nice_level; + struct rlimit *rlimits[_RLIMIT_MAX]; + + int locked; /* prohibit activation in general */ + uint64_t not_before_usec; /* prohibit activation before this unix time */ + uint64_t not_after_usec; /* prohibit activation after this unix time */ + + UserStorage storage; /* negative if unset, user_record_storage() then returns USER_CLASSIC */ + uint64_t disk_size; + uint64_t disk_size_relative; /* Disk size, relative to the free bytes of the medium, normalized to UINT32_MAX = 100% */ + char *skeleton_directory; + mode_t access_mode; /* MODE_INVALID if unset */ + AutoResizeMode auto_resize_mode; /* negative if unset */ + uint64_t rebalance_weight; /* REBALANCE_WEIGHT_UNSET if unset */ + + uint64_t tasks_max; + uint64_t memory_high; + uint64_t memory_max; + uint64_t cpu_weight; + uint64_t io_weight; + + bool nosuid; + bool nodev; + bool noexec; + + char **hashed_password; + char **ssh_authorized_keys; + char **password; + char **token_pin; + + char *cifs_domain; + char *cifs_user_name; + char *cifs_service; + char *cifs_extra_mount_options; + + char *image_path; + char *image_path_auto; /* when none is configured explicitly, this is where we place the implicit image */ + char *home_directory; + char *home_directory_auto; /* when none is set explicitly, this is where we place the implicit home directory */ + + uid_t uid; + gid_t gid; /* if this is not valid, user_record_gid() returns the UID instead */ + + char **member_of; + + char *file_system_type; + sd_id128_t partition_uuid; + sd_id128_t luks_uuid; + sd_id128_t file_system_uuid; + + int luks_discard; /* negative if unset */ + int luks_offline_discard; /* negative if unset */ + char *luks_cipher; + char *luks_cipher_mode; + uint64_t luks_volume_key_size; /* UINT64_MAX if unset */ + char *luks_pbkdf_hash_algorithm; + char *luks_pbkdf_type; + 
uint64_t luks_pbkdf_force_iterations; /* UINT64_MAX if unset, as for the four fields that follow */ + uint64_t luks_pbkdf_time_cost_usec; + uint64_t luks_pbkdf_memory_cost; + uint64_t luks_pbkdf_parallel_threads; + uint64_t luks_sector_size; + char *luks_extra_mount_options; + + uint64_t disk_usage; + uint64_t disk_free; + uint64_t disk_ceiling; + uint64_t disk_floor; + + char *state; + char *service; + int signed_locally; + + uint64_t good_authentication_counter; + uint64_t bad_authentication_counter; + uint64_t last_good_authentication_usec; + uint64_t last_bad_authentication_usec; + + uint64_t ratelimit_begin_usec; /* UINT64_MAX if unknown */ + uint64_t ratelimit_count; /* UINT64_MAX if unknown */ + uint64_t ratelimit_interval_usec; /* UINT64_MAX if unset */ + uint64_t ratelimit_burst; /* UINT64_MAX if unset */ + + int removable; /* negative if unset */ + int enforce_password_policy; + int auto_login; + int drop_caches; /* negative if unset */ + + uint64_t stop_delay_usec; /* How long to leave systemd --user around on log-out */ + int kill_processes; /* Whether to kill user processes forcibly on log-out */ + + /* The following exist mostly so that we can cover the full /etc/shadow set of fields. UINT64_MAX means "not set" for the four time spans. */ + uint64_t password_change_min_usec; /* maps to .sp_min */ + uint64_t password_change_max_usec; /* maps to .sp_max */ + uint64_t password_change_warn_usec; /* maps to .sp_warn */ + uint64_t password_change_inactive_usec; /* maps to .sp_inact */ + int password_change_now; /* Require a password change immediately on next login (.sp_lstchg = 0) */ + + char **pkcs11_token_uri; + Pkcs11EncryptedKey *pkcs11_encrypted_key; + size_t n_pkcs11_encrypted_key; + int pkcs11_protected_authentication_path_permitted; + + Fido2HmacCredential *fido2_hmac_credential; + size_t n_fido2_hmac_credential; + Fido2HmacSalt *fido2_hmac_salt; + size_t n_fido2_hmac_salt; + int fido2_user_presence_permitted; + int fido2_user_verification_permitted; + + char **recovery_key_type; + RecoveryKey *recovery_key; + size_t n_recovery_key; + + char **capability_bounding_set; /* capability names; NULL if no bounding set is configured */ + char **capability_ambient_set; /* capability names; NULL if no ambient set is configured */ + + JsonVariant *json; /* the JSON data of the record; this is what user_record_equal() compares and user_record_clone() loads from */ +} UserRecord; + +UserRecord* user_record_new(void); +UserRecord* user_record_ref(UserRecord 
*h); +UserRecord* user_record_unref(UserRecord *h); + +DEFINE_TRIVIAL_CLEANUP_FUNC(UserRecord*, user_record_unref); +/* user_record_load() expects a record that has no JSON data attached yet, and leaves a half-initialized record around on failure. user_record_build() allocates the record itself. */ +int user_record_load(UserRecord *h, JsonVariant *v, UserRecordLoadFlags flags); +int user_record_build(UserRecord **ret, ...); +/* Accessors. Most of them substitute a default if the respective field is not set in the record. */ +const char *user_record_user_name_and_realm(UserRecord *h); +UserStorage user_record_storage(UserRecord *h); +const char *user_record_file_system_type(UserRecord *h); +const char *user_record_skeleton_directory(UserRecord *h); +mode_t user_record_access_mode(UserRecord *h); +const char *user_record_home_directory(UserRecord *h); +const char *user_record_image_path(UserRecord *h); +unsigned long user_record_mount_flags(UserRecord *h); +const char *user_record_cifs_user_name(UserRecord *h); +const char *user_record_shell(UserRecord *h); +const char *user_record_real_name(UserRecord *h); +bool user_record_luks_discard(UserRecord *h); +bool user_record_luks_offline_discard(UserRecord *h); +const char *user_record_luks_cipher(UserRecord *h); +const char *user_record_luks_cipher_mode(UserRecord *h); +uint64_t user_record_luks_volume_key_size(UserRecord *h); +const char* user_record_luks_pbkdf_type(UserRecord *h); +uint64_t user_record_luks_pbkdf_force_iterations(UserRecord *h); +usec_t user_record_luks_pbkdf_time_cost_usec(UserRecord *h); +uint64_t user_record_luks_pbkdf_memory_cost(UserRecord *h); +uint64_t user_record_luks_pbkdf_parallel_threads(UserRecord *h); +uint64_t user_record_luks_sector_size(UserRecord *h); +const char *user_record_luks_pbkdf_hash_algorithm(UserRecord *h); +gid_t user_record_gid(UserRecord *h); +UserDisposition user_record_disposition(UserRecord *h); +int user_record_removable(UserRecord *h); +usec_t user_record_ratelimit_interval_usec(UserRecord *h); +uint64_t user_record_ratelimit_burst(UserRecord *h); +bool user_record_can_authenticate(UserRecord *h); +bool user_record_drop_caches(UserRecord *h); +AutoResizeMode user_record_auto_resize_mode(UserRecord *h); +uint64_t 
user_record_rebalance_weight(UserRecord *h); +uint64_t user_record_capability_bounding_set(UserRecord *h); +uint64_t user_record_capability_ambient_set(UserRecord *h); + +int user_record_build_image_path(UserStorage storage, const char *user_name_and_realm, char **ret); + +bool user_record_equal(UserRecord *a, UserRecord *b); +bool user_record_compatible(UserRecord *a, UserRecord *b); +int user_record_compare_last_change(UserRecord *a, UserRecord *b); + +usec_t user_record_ratelimit_next_try(UserRecord *h); + +int user_record_clone(UserRecord *h, UserRecordLoadFlags flags, UserRecord **ret); +int user_record_masked_equal(UserRecord *a, UserRecord *b, UserRecordMask mask); +/* Both return 0 or a negative error code that tells the reason; the codes are listed in comments inside the implementations. */ +int user_record_test_blocked(UserRecord *h); +int user_record_test_password_change_required(UserRecord *h); + +/* The following are used by group-record.c, that's why we export them here */ +int json_dispatch_realm(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_gecos(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_user_group_list(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); +int json_dispatch_user_disposition(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata); + +int per_machine_id_match(JsonVariant *ids, JsonDispatchFlags flags); +int per_machine_hostname_match(JsonVariant *hns, JsonDispatchFlags flags); +int user_group_record_mangle(JsonVariant *v, UserRecordLoadFlags load_flags, JsonVariant **ret_variant, UserRecordMask *ret_mask); + +const char* user_storage_to_string(UserStorage t) _const_; +UserStorage user_storage_from_string(const char *s) _pure_; + +const char* user_disposition_to_string(UserDisposition t) _const_; +UserDisposition user_disposition_from_string(const char *s) _pure_; + +const char* auto_resize_mode_to_string(AutoResizeMode m) _const_; +AutoResizeMode auto_resize_mode_from_string(const char *s) 
_pure_; diff --git a/src/shared/userdb-dropin.c b/src/shared/userdb-dropin.c new file mode 100644 index 0000000..a2d48fa --- /dev/null +++ b/src/shared/userdb-dropin.c @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "path-util.h" +#include "stdio-util.h" +#include "user-util.h" +#include "userdb-dropin.h" + +static int load_user( + FILE *f, + const char *path, + const char *name, + uid_t uid, + UserDBFlags flags, + UserRecord **ret) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(user_record_unrefp) UserRecord *u = NULL; + bool have_privileged; + int r; + + assert(f); + + r = json_parse_file(f, path, 0, &v, NULL, NULL); + if (r < 0) + return r; + + if (FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW) || !path || !(name || uid_is_valid(uid))) + have_privileged = false; + else { + _cleanup_(json_variant_unrefp) JsonVariant *privileged_v = NULL; + _cleanup_free_ char *d = NULL, *j = NULL; + + /* Let's load the "privileged" section from a companion file. But only if USERDB_SUPPRESS_SHADOW + * is not set. After all, the privileged section kinda takes the role of the data from the + * shadow file, hence it makes sense to use the same flag here. + * + * The general assumption is that whoever provides these records makes the .user file + * world-readable, but the .user-privileged file readable to root and the assigned UID only. But we + * won't verify that here, as it would be too late. 
*/ + + r = path_extract_directory(path, &d); + if (r < 0) + return r; + + if (name) { + j = strjoin(d, "/", name, ".user-privileged"); + if (!j) + return -ENOMEM; + } else { + assert(uid_is_valid(uid)); + if (asprintf(&j, "%s/" UID_FMT ".user-privileged", d, uid) < 0) + return -ENOMEM; + } + + r = json_parse_file(NULL, j, JSON_PARSE_SENSITIVE, &privileged_v, NULL, NULL); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + have_privileged = false; + else if (r == -ENOENT) + have_privileged = true; /* if the privileged file doesn't exist, we are complete */ + else if (r < 0) + return r; + else { + r = json_variant_merge_object(&v, privileged_v); + if (r < 0) + return r; + + have_privileged = true; + } + } + + u = user_record_new(); + if (!u) + return -ENOMEM; + + r = user_record_load( + u, v, + USER_RECORD_REQUIRE_REGULAR| + USER_RECORD_ALLOW_PER_MACHINE| + USER_RECORD_ALLOW_BINDING| + USER_RECORD_ALLOW_SIGNATURE| + (have_privileged ? USER_RECORD_ALLOW_PRIVILEGED : 0)| + USER_RECORD_PERMISSIVE); + if (r < 0) + return r; + + if (name && !streq_ptr(name, u->user_name)) + return -EINVAL; + + if (uid_is_valid(uid) && uid != u->uid) + return -EINVAL; + + u->incomplete = !have_privileged; + + if (ret) + *ret = TAKE_PTR(u); + + return 0; +} + +int dropin_user_record_by_name(const char *name, const char *path, UserDBFlags flags, UserRecord **ret) { + _cleanup_free_ char *found_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(name); + + if (path) { + f = fopen(path, "re"); + if (!f) + return errno == ENOENT ? -ESRCH : -errno; /* We generally want ESRCH to indicate no such user */ + } else { + const char *j; + + j = strjoina(name, ".user"); + if (!filename_is_valid(j)) /* Doesn't qualify as valid filename? 
Then it's definitely not provided as a drop-in */ + return -ESRCH; + + r = search_and_fopen_nulstr(j, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + path = found_path; + } + + return load_user(f, path, name, UID_INVALID, flags, ret); +} + +int dropin_user_record_by_uid(uid_t uid, const char *path, UserDBFlags flags, UserRecord **ret) { + _cleanup_free_ char *found_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(uid_is_valid(uid)); + + if (path) { + f = fopen(path, "re"); + if (!f) + return errno == ENOENT ? -ESRCH : -errno; + } else { + char buf[DECIMAL_STR_MAX(uid_t) + STRLEN(".user") + 1]; + + xsprintf(buf, UID_FMT ".user", uid); + /* Note that we don't bother to validate this as a filename, as this is generated from a decimal + * integer, i.e. is definitely OK as a filename */ + + r = search_and_fopen_nulstr(buf, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + path = found_path; + } + + return load_user(f, path, NULL, uid, flags, ret); +} + +static int load_group( + FILE *f, + const char *path, + const char *name, + gid_t gid, + UserDBFlags flags, + GroupRecord **ret) { + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + bool have_privileged; + int r; + + assert(f); + + r = json_parse_file(f, path, 0, &v, NULL, NULL); + if (r < 0) + return r; + + if (FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW) || !path || !(name || gid_is_valid(gid))) + have_privileged = false; + else { + _cleanup_(json_variant_unrefp) JsonVariant *privileged_v = NULL; + _cleanup_free_ char *d = NULL, *j = NULL; + + r = path_extract_directory(path, &d); + if (r < 0) + return r; + + if (name) { + j = strjoin(d, "/", name, ".group-privileged"); + if (!j) + return -ENOMEM; + } else { + assert(gid_is_valid(gid)); + if (asprintf(&j, "%s/" GID_FMT 
".group-privileged", d, gid) < 0) + return -ENOMEM; + } + + r = json_parse_file(NULL, j, JSON_PARSE_SENSITIVE, &privileged_v, NULL, NULL); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + have_privileged = false; + else if (r == -ENOENT) + have_privileged = true; /* if the privileged file doesn't exist, we are complete */ + else if (r < 0) + return r; + else { + r = json_variant_merge_object(&v, privileged_v); + if (r < 0) + return r; + + have_privileged = true; + } + } + + g = group_record_new(); + if (!g) + return -ENOMEM; + + r = group_record_load( + g, v, + USER_RECORD_REQUIRE_REGULAR| + USER_RECORD_ALLOW_PER_MACHINE| + USER_RECORD_ALLOW_BINDING| + USER_RECORD_ALLOW_SIGNATURE| + (have_privileged ? USER_RECORD_ALLOW_PRIVILEGED : 0)| + USER_RECORD_PERMISSIVE); + if (r < 0) + return r; + + if (name && !streq_ptr(name, g->group_name)) + return -EINVAL; + + if (gid_is_valid(gid) && gid != g->gid) + return -EINVAL; + + g->incomplete = !have_privileged; + + if (ret) + *ret = TAKE_PTR(g); + + return 0; +} + +int dropin_group_record_by_name(const char *name, const char *path, UserDBFlags flags, GroupRecord **ret) { + _cleanup_free_ char *found_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(name); + + if (path) { + f = fopen(path, "re"); + if (!f) + return errno == ENOENT ? -ESRCH : -errno; + } else { + const char *j; + + j = strjoina(name, ".group"); + if (!filename_is_valid(j)) /* Doesn't qualify as valid filename? 
Then it's definitely not provided as a drop-in */ + return -ESRCH; + + r = search_and_fopen_nulstr(j, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + path = found_path; + } + + return load_group(f, path, name, GID_INVALID, flags, ret); +} + +int dropin_group_record_by_gid(gid_t gid, const char *path, UserDBFlags flags, GroupRecord **ret) { + _cleanup_free_ char *found_path = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(gid_is_valid(gid)); + + if (path) { + f = fopen(path, "re"); + if (!f) + return errno == ENOENT ? -ESRCH : -errno; + } else { + char buf[DECIMAL_STR_MAX(gid_t) + STRLEN(".group") + 1]; + + xsprintf(buf, GID_FMT ".group", gid); + + r = search_and_fopen_nulstr(buf, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path); + if (r == -ENOENT) + return -ESRCH; + if (r < 0) + return r; + + path = found_path; + } + + return load_group(f, path, NULL, gid, flags, ret); +} diff --git a/src/shared/userdb-dropin.h b/src/shared/userdb-dropin.h new file mode 100644 index 0000000..3bd1b9c --- /dev/null +++ b/src/shared/userdb-dropin.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "constants.h" +#include "group-record.h" +#include "user-record.h" +#include "userdb.h" + +/* This could be put together with CONF_PATHS_NULSTR, with the exception of the /run/host/ part in the + * middle, which we use here, but not otherwise. 
*/ +#define USERDB_DROPIN_DIR_NULSTR(n) \ + "/etc/" n "\0" \ + "/run/" n "\0" \ + "/run/host/" n "\0" \ + "/usr/local/lib/" n "\0" \ + "/usr/lib/" n "\0" + +int dropin_user_record_by_name(const char *name, const char *path, UserDBFlags flags, UserRecord **ret); +int dropin_user_record_by_uid(uid_t uid, const char *path, UserDBFlags flags, UserRecord **ret); + +int dropin_group_record_by_name(const char *name, const char *path, UserDBFlags flags, GroupRecord **ret); +int dropin_group_record_by_gid(gid_t gid, const char *path, UserDBFlags flags, GroupRecord **ret); diff --git a/src/shared/userdb.c b/src/shared/userdb.c new file mode 100644 index 0000000..f60d48a --- /dev/null +++ b/src/shared/userdb.c @@ -0,0 +1,1465 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "conf-files.h" +#include "dirent-util.h" +#include "dlfcn-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "missing_syscall.h" +#include "parse-util.h" +#include "set.h" +#include "socket-util.h" +#include "strv.h" +#include "user-record-nss.h" +#include "user-util.h" +#include "userdb-dropin.h" +#include "userdb.h" +#include "varlink.h" + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(link_hash_ops, void, trivial_hash_func, trivial_compare_func, Varlink, varlink_unref); + +typedef enum LookupWhat { + LOOKUP_USER, + LOOKUP_GROUP, + LOOKUP_MEMBERSHIP, + _LOOKUP_WHAT_MAX, +} LookupWhat; + +struct UserDBIterator { + LookupWhat what; + UserDBFlags flags; + Set *links; + bool nss_covered:1; + bool nss_iterating:1; + bool dropin_covered:1; + bool synthesize_root:1; + bool synthesize_nobody:1; + bool nss_systemd_blocked:1; + char **dropins; + size_t current_dropin; + int error; + unsigned n_found; + sd_event *event; + UserRecord *found_user; /* when .what == LOOKUP_USER */ + GroupRecord *found_group; /* when .what == LOOKUP_GROUP */ + + char *found_user_name, *found_group_name; /* when .what == LOOKUP_MEMBERSHIP */ + char **members_of_group; + 
size_t index_members_of_group; + char *filter_user_name, *filter_group_name; +}; + +UserDBIterator* userdb_iterator_free(UserDBIterator *iterator) { + if (!iterator) + return NULL; + + set_free(iterator->links); + strv_free(iterator->dropins); + + switch (iterator->what) { + + case LOOKUP_USER: + user_record_unref(iterator->found_user); + + if (iterator->nss_iterating) + endpwent(); + + break; + + case LOOKUP_GROUP: + group_record_unref(iterator->found_group); + + if (iterator->nss_iterating) + endgrent(); + + break; + + case LOOKUP_MEMBERSHIP: + free(iterator->found_user_name); + free(iterator->found_group_name); + strv_free(iterator->members_of_group); + free(iterator->filter_user_name); + free(iterator->filter_group_name); + + if (iterator->nss_iterating) + endgrent(); + + break; + + default: + assert_not_reached(); + } + + sd_event_unref(iterator->event); + + if (iterator->nss_systemd_blocked) + assert_se(userdb_block_nss_systemd(false) >= 0); + + return mfree(iterator); +} + +static UserDBIterator* userdb_iterator_new(LookupWhat what, UserDBFlags flags) { + UserDBIterator *i; + + assert(what >= 0); + assert(what < _LOOKUP_WHAT_MAX); + + i = new(UserDBIterator, 1); + if (!i) + return NULL; + + *i = (UserDBIterator) { + .what = what, + .flags = flags, + .synthesize_root = !FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE), + .synthesize_nobody = !FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE), + }; + + return i; +} + +static int userdb_iterator_block_nss_systemd(UserDBIterator *iterator) { + int r; + + assert(iterator); + + if (iterator->nss_systemd_blocked) + return 0; + + r = userdb_block_nss_systemd(true); + if (r < 0) + return r; + + iterator->nss_systemd_blocked = true; + return 1; +} + +struct user_group_data { + JsonVariant *record; + bool incomplete; +}; + +static void user_group_data_done(struct user_group_data *d) { + json_variant_unref(d->record); +} + +struct membership_data { + char *user_name; + char *group_name; +}; + +static void membership_data_done(struct 
membership_data *d) { + free(d->user_name); + free(d->group_name); +} + +static int userdb_on_query_reply( + Varlink *link, + JsonVariant *parameters, + const char *error_id, + VarlinkReplyFlags flags, + void *userdata) { + + UserDBIterator *iterator = ASSERT_PTR(userdata); + int r; + + if (error_id) { + log_debug("Got lookup error: %s", error_id); + + if (STR_IN_SET(error_id, + "io.systemd.UserDatabase.NoRecordFound", + "io.systemd.UserDatabase.ConflictingRecordFound")) + r = -ESRCH; + else if (streq(error_id, "io.systemd.UserDatabase.ServiceNotAvailable")) + r = -EHOSTDOWN; + else if (streq(error_id, "io.systemd.UserDatabase.EnumerationNotSupported")) + r = -EOPNOTSUPP; + else if (streq(error_id, VARLINK_ERROR_TIMEOUT)) + r = -ETIMEDOUT; + else + r = -EIO; + + goto finish; + } + + switch (iterator->what) { + + case LOOKUP_USER: { + _cleanup_(user_group_data_done) struct user_group_data user_data = {}; + + static const JsonDispatch dispatch_table[] = { + { "record", _JSON_VARIANT_TYPE_INVALID, json_dispatch_variant, offsetof(struct user_group_data, record), 0 }, + { "incomplete", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct user_group_data, incomplete), 0 }, + {} + }; + _cleanup_(user_record_unrefp) UserRecord *hr = NULL; + + assert_se(!iterator->found_user); + + r = json_dispatch(parameters, dispatch_table, 0, &user_data); + if (r < 0) + goto finish; + + if (!user_data.record) { + r = log_debug_errno(SYNTHETIC_ERRNO(EIO), "Reply is missing record key"); + goto finish; + } + + hr = user_record_new(); + if (!hr) { + r = -ENOMEM; + goto finish; + } + + r = user_record_load(hr, user_data.record, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE); + if (r < 0) + goto finish; + + if (!hr->service) { + r = log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "User record does not carry service information, refusing."); + goto finish; + } + + hr->incomplete = user_data.incomplete; + + /* We match the root user by the name since the name is our primary key. 
We match the nobody + * user by UID though, since the name might differ on OSes */ + if (streq_ptr(hr->user_name, "root")) + iterator->synthesize_root = false; + if (hr->uid == UID_NOBODY) + iterator->synthesize_nobody = false; + + iterator->found_user = TAKE_PTR(hr); + iterator->n_found++; + + /* More stuff coming? then let's just exit cleanly here */ + if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + return 0; + + /* Otherwise, let's remove this link and exit cleanly then */ + r = 0; + goto finish; + } + + case LOOKUP_GROUP: { + _cleanup_(user_group_data_done) struct user_group_data group_data = {}; + + static const JsonDispatch dispatch_table[] = { + { "record", _JSON_VARIANT_TYPE_INVALID, json_dispatch_variant, offsetof(struct user_group_data, record), 0 }, + { "incomplete", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct user_group_data, incomplete), 0 }, + {} + }; + _cleanup_(group_record_unrefp) GroupRecord *g = NULL; + + assert_se(!iterator->found_group); + + r = json_dispatch(parameters, dispatch_table, 0, &group_data); + if (r < 0) + goto finish; + + if (!group_data.record) { + r = log_debug_errno(SYNTHETIC_ERRNO(EIO), "Reply is missing record key"); + goto finish; + } + + g = group_record_new(); + if (!g) { + r = -ENOMEM; + goto finish; + } + + r = group_record_load(g, group_data.record, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE); + if (r < 0) + goto finish; + + if (!g->service) { + r = log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Group record does not carry service information, refusing."); + goto finish; + } + + g->incomplete = group_data.incomplete; + + if (streq_ptr(g->group_name, "root")) + iterator->synthesize_root = false; + if (g->gid == GID_NOBODY) + iterator->synthesize_nobody = false; + + iterator->found_group = TAKE_PTR(g); + iterator->n_found++; + + if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + return 0; + + r = 0; + goto finish; + } + + case LOOKUP_MEMBERSHIP: { + _cleanup_(membership_data_done) struct 
membership_data membership_data = {}; + + static const JsonDispatch dispatch_table[] = { + { "userName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(struct membership_data, user_name), JSON_RELAX }, + { "groupName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(struct membership_data, group_name), JSON_RELAX }, + {} + }; + + assert(!iterator->found_user_name); + assert(!iterator->found_group_name); + + r = json_dispatch(parameters, dispatch_table, 0, &membership_data); + if (r < 0) + goto finish; + + iterator->found_user_name = TAKE_PTR(membership_data.user_name); + iterator->found_group_name = TAKE_PTR(membership_data.group_name); + iterator->n_found++; + + if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + return 0; + + r = 0; + goto finish; + } + + default: + assert_not_reached(); + } + +finish: + /* If we got one ESRCH, let that win. This way when we do a wild dump we won't be tripped up by bad + * errors if at least one connection ended cleanly */ + if (r == -ESRCH || iterator->error == 0) + iterator->error = -r; + + assert_se(set_remove(iterator->links, link) == link); + link = varlink_unref(link); + return 0; +} + +static int userdb_connect( + UserDBIterator *iterator, + const char *path, + const char *method, + bool more, + JsonVariant *query) { + + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r; + + assert(iterator); + assert(path); + assert(method); + + r = varlink_connect_address(&vl, path); + if (r < 0) + return log_debug_errno(r, "Unable to connect to %s: %m", path); + + varlink_set_userdata(vl, iterator); + + if (!iterator->event) { + r = sd_event_new(&iterator->event); + if (r < 0) + return log_debug_errno(r, "Unable to allocate event loop: %m"); + } + + r = varlink_attach_event(vl, iterator->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_debug_errno(r, "Failed to attach varlink connection to event loop: %m"); + + (void) varlink_set_description(vl, path); + + r = varlink_bind_reply(vl, 
userdb_on_query_reply); + if (r < 0) + return log_debug_errno(r, "Failed to bind reply callback: %m"); + + if (more) + r = varlink_observe(vl, method, query); + else + r = varlink_invoke(vl, method, query); + if (r < 0) + return log_debug_errno(r, "Failed to invoke varlink method: %m"); + + r = set_ensure_consume(&iterator->links, &link_hash_ops, TAKE_PTR(vl)); + if (r < 0) + return log_debug_errno(r, "Failed to add varlink connection to set: %m"); + return r; +} + +static int userdb_start_query( + UserDBIterator *iterator, + const char *method, + bool more, + JsonVariant *query, + UserDBFlags flags) { + + _cleanup_strv_free_ char **except = NULL, **only = NULL; + _cleanup_closedir_ DIR *d = NULL; + const char *e; + int r, ret = 0; + + assert(iterator); + assert(method); + + if (FLAGS_SET(flags, USERDB_EXCLUDE_VARLINK)) + return -ENOLINK; + + e = getenv("SYSTEMD_BYPASS_USERDB"); + if (e) { + r = parse_boolean(e); + if (r > 0) + return -ENOLINK; + if (r < 0) { + except = strv_split(e, ":"); + if (!except) + return -ENOMEM; + } + } + + e = getenv("SYSTEMD_ONLY_USERDB"); + if (e) { + only = strv_split(e, ":"); + if (!only) + return -ENOMEM; + } + + /* First, let's talk to the multiplexer, if we can */ + if ((flags & (USERDB_AVOID_MULTIPLEXER|USERDB_EXCLUDE_DYNAMIC_USER|USERDB_EXCLUDE_NSS|USERDB_EXCLUDE_DROPIN|USERDB_DONT_SYNTHESIZE)) == 0 && + !strv_contains(except, "io.systemd.Multiplexer") && + (!only || strv_contains(only, "io.systemd.Multiplexer"))) { + _cleanup_(json_variant_unrefp) JsonVariant *patched_query = json_variant_ref(query); + + r = json_variant_set_field_string(&patched_query, "service", "io.systemd.Multiplexer"); + if (r < 0) + return log_debug_errno(r, "Unable to set service JSON field: %m"); + + r = userdb_connect(iterator, "/run/systemd/userdb/io.systemd.Multiplexer", method, more, patched_query); + if (r >= 0) { + iterator->nss_covered = true; /* The multiplexer does NSS */ + iterator->dropin_covered = true; /* It also handles drop-in stuff */ + 
return 0; + } + } + + d = opendir("/run/systemd/userdb/"); + if (!d) { + if (errno == ENOENT) + return -ESRCH; + + return -errno; + } + + FOREACH_DIRENT(de, d, return -errno) { + _cleanup_(json_variant_unrefp) JsonVariant *patched_query = NULL; + _cleanup_free_ char *p = NULL; + bool is_nss, is_dropin; + + if (streq(de->d_name, "io.systemd.Multiplexer")) /* We already tried this above, don't try this again */ + continue; + + if (FLAGS_SET(flags, USERDB_EXCLUDE_DYNAMIC_USER) && + streq(de->d_name, "io.systemd.DynamicUser")) + continue; + + /* Avoid NSS if this is requested. Note that we also skip NSS when we were asked to skip the + * multiplexer, since in that case it's safer to do NSS in the client side emulation below + * (and when we run as part of systemd-userdbd.service we don't want to talk to ourselves + * anyway). */ + is_nss = streq(de->d_name, "io.systemd.NameServiceSwitch"); + if ((flags & (USERDB_EXCLUDE_NSS|USERDB_AVOID_MULTIPLEXER)) && is_nss) + continue; + + /* Similar for the drop-in service */ + is_dropin = streq(de->d_name, "io.systemd.DropIn"); + if ((flags & (USERDB_EXCLUDE_DROPIN|USERDB_AVOID_MULTIPLEXER)) && is_dropin) + continue; + + if (strv_contains(except, de->d_name)) + continue; + + if (only && !strv_contains(only, de->d_name)) + continue; + + p = path_join("/run/systemd/userdb/", de->d_name); + if (!p) + return -ENOMEM; + + patched_query = json_variant_ref(query); + r = json_variant_set_field_string(&patched_query, "service", de->d_name); + if (r < 0) + return log_debug_errno(r, "Unable to set service JSON field: %m"); + + r = userdb_connect(iterator, p, method, more, patched_query); + if (is_nss && r >= 0) /* Turn off fallback NSS + dropin if we found the NSS/dropin service + * and could connect to it */ + iterator->nss_covered = true; + if (is_dropin && r >= 0) + iterator->dropin_covered = true; + + if (ret == 0 && r < 0) + ret = r; + } + + if (set_isempty(iterator->links)) + return ret < 0 ? 
ret : -ESRCH; /* propagate last error we saw if we couldn't connect to anything. */ + + /* We connected to some services, in this case, ignore the ones we failed on */ + return 0; +} + +static int userdb_process( + UserDBIterator *iterator, + UserRecord **ret_user_record, + GroupRecord **ret_group_record, + char **ret_user_name, + char **ret_group_name) { + + int r; + + assert(iterator); + + for (;;) { + if (iterator->what == LOOKUP_USER && iterator->found_user) { + if (ret_user_record) + *ret_user_record = TAKE_PTR(iterator->found_user); + else + iterator->found_user = user_record_unref(iterator->found_user); + + if (ret_group_record) + *ret_group_record = NULL; + if (ret_user_name) + *ret_user_name = NULL; + if (ret_group_name) + *ret_group_name = NULL; + + return 0; + } + + if (iterator->what == LOOKUP_GROUP && iterator->found_group) { + if (ret_group_record) + *ret_group_record = TAKE_PTR(iterator->found_group); + else + iterator->found_group = group_record_unref(iterator->found_group); + + if (ret_user_record) + *ret_user_record = NULL; + if (ret_user_name) + *ret_user_name = NULL; + if (ret_group_name) + *ret_group_name = NULL; + + return 0; + } + + if (iterator->what == LOOKUP_MEMBERSHIP && iterator->found_user_name && iterator->found_group_name) { + if (ret_user_name) + *ret_user_name = TAKE_PTR(iterator->found_user_name); + else + iterator->found_user_name = mfree(iterator->found_user_name); + + if (ret_group_name) + *ret_group_name = TAKE_PTR(iterator->found_group_name); + else + iterator->found_group_name = mfree(iterator->found_group_name); + + if (ret_user_record) + *ret_user_record = NULL; + if (ret_group_record) + *ret_group_record = NULL; + + return 0; + } + + if (set_isempty(iterator->links)) { + if (iterator->error == 0) + return -ESRCH; + + return -abs(iterator->error); + } + + if (!iterator->event) + return -ESRCH; + + r = sd_event_run(iterator->event, UINT64_MAX); + if (r < 0) + return r; + } +} + +static int 
synthetic_root_user_build(UserRecord **ret) { + return user_record_build( + ret, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_CONST_STRING("root")), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(0)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(0)), + JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/root")), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic")))); +} + +static int synthetic_nobody_user_build(UserRecord **ret) { + return user_record_build( + ret, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("userName", JSON_BUILD_CONST_STRING(NOBODY_USER_NAME)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(UID_NOBODY)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)), + JSON_BUILD_PAIR("shell", JSON_BUILD_CONST_STRING(NOLOGIN)), + JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic")))); +} + +int userdb_by_name(const char *name, UserDBFlags flags, UserRecord **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *query = NULL; + int r; + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return -EINVAL; + + r = json_build(&query, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(name)))); + if (r < 0) + return r; + + iterator = userdb_iterator_new(LOOKUP_USER, flags); + if (!iterator) + return -ENOMEM; + + r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetUserRecord", false, query, flags); + if (r >= 0) { + r = userdb_process(iterator, ret, NULL, NULL, NULL); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !iterator->dropin_covered) { + r = dropin_user_record_by_name(name, NULL, flags, ret); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !iterator->nss_covered) { + /* Make sure the NSS lookup doesn't recurse back to us. 
*/ + + r = userdb_iterator_block_nss_systemd(iterator); + if (r >= 0) { + /* Client-side NSS fallback */ + r = nss_user_record_by_name(name, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret); + if (r >= 0) + return r; + } + } + + if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) { + if (streq(name, "root")) + return synthetic_root_user_build(ret); + + if (streq(name, NOBODY_USER_NAME) && synthesize_nobody()) + return synthetic_nobody_user_build(ret); + } + + return r; +} + +int userdb_by_uid(uid_t uid, UserDBFlags flags, UserRecord **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *query = NULL; + int r; + + if (!uid_is_valid(uid)) + return -EINVAL; + + r = json_build(&query, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)))); + if (r < 0) + return r; + + iterator = userdb_iterator_new(LOOKUP_USER, flags); + if (!iterator) + return -ENOMEM; + + r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetUserRecord", false, query, flags); + if (r >= 0) { + r = userdb_process(iterator, ret, NULL, NULL, NULL); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !iterator->dropin_covered) { + r = dropin_user_record_by_uid(uid, NULL, flags, ret); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !iterator->nss_covered) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r >= 0) { + /* Client-side NSS fallback */ + r = nss_user_record_by_uid(uid, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret); + if (r >= 0) + return r; + } + } + + if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) { + if (uid == 0) + return synthetic_root_user_build(ret); + + if (uid == UID_NOBODY && synthesize_nobody()) + return synthetic_nobody_user_build(ret); + } + + return r; +} + +int userdb_all(UserDBFlags flags, UserDBIterator **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + int r, qr; + + assert(ret); + + iterator = 
userdb_iterator_new(LOOKUP_USER, flags); + if (!iterator) + return -ENOMEM; + + qr = userdb_start_query(iterator, "io.systemd.UserDatabase.GetUserRecord", true, NULL, flags); + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && (qr < 0 || !iterator->nss_covered)) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r < 0) + return r; + + setpwent(); + iterator->nss_iterating = true; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && (qr < 0 || !iterator->dropin_covered)) { + r = conf_files_list_nulstr( + &iterator->dropins, + ".user", + NULL, + CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, + USERDB_DROPIN_DIR_NULSTR("userdb")); + if (r < 0) + log_debug_errno(r, "Failed to find user drop-ins, ignoring: %m"); + } + + /* propagate IPC error, but only if there are no drop-ins */ + if (qr < 0 && + !iterator->nss_iterating && + strv_isempty(iterator->dropins)) + return qr; + + *ret = TAKE_PTR(iterator); + return 0; +} + +int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret) { + int r; + + assert(iterator); + assert(iterator->what == LOOKUP_USER); + + if (iterator->nss_iterating) { + struct passwd *pw; + + /* If NSS isn't covered elsewhere, let's iterate through it first, since it probably contains + * the more traditional sources, which are probably good to show first. */ + + pw = getpwent(); + if (pw) { + _cleanup_free_ char *buffer = NULL; + bool incomplete = false; + struct spwd spwd; + + if (streq_ptr(pw->pw_name, "root")) + iterator->synthesize_root = false; + if (pw->pw_uid == UID_NOBODY) + iterator->synthesize_nobody = false; + + if (!FLAGS_SET(iterator->flags, USERDB_SUPPRESS_SHADOW)) { + r = nss_spwd_for_passwd(pw, &spwd, &buffer); + if (r < 0) { + log_debug_errno(r, "Failed to acquire shadow entry for user %s, ignoring: %m", pw->pw_name); + incomplete = ERRNO_IS_PRIVILEGE(r); + } + } else { + r = -EUCLEAN; + incomplete = true; + } + + r = nss_passwd_to_user_record(pw, r >= 0 ? 
&spwd : NULL, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->incomplete = incomplete; + + iterator->n_found++; + return r; + } + + if (errno != 0) + log_debug_errno(errno, "Failure to iterate NSS user database, ignoring: %m"); + + iterator->nss_iterating = false; + endpwent(); + } + + for (; iterator->dropins && iterator->dropins[iterator->current_dropin]; iterator->current_dropin++) { + const char *i = iterator->dropins[iterator->current_dropin]; + _cleanup_free_ char *fn = NULL; + uid_t uid; + char *e; + + /* Next, let's add in the static drop-ins, which are quick to retrieve */ + + r = path_extract_filename(i, &fn); + if (r < 0) + return r; + + e = endswith(fn, ".user"); /* not actually a .user file? Then skip to next */ + if (!e) + continue; + + *e = 0; /* Chop off suffix */ + + if (parse_uid(fn, &uid) < 0) /* not a UID .user file? Then skip to next */ + continue; + + r = dropin_user_record_by_uid(uid, i, iterator->flags, ret); + if (r < 0) { + log_debug_errno(r, "Failed to parse user record for UID " UID_FMT ", ignoring: %m", uid); + continue; /* If we failed to parse this record, let's suppress it from enumeration, + * and continue with the next record. Maybe someone is dropping in files + * and only partially wrote this one. 
*/ + } + + iterator->current_dropin++; /* make sure on the next call of userdb_iterator_get() we continue with the next dropin */ + iterator->n_found++; + return 0; + } + + /* Then, let's return the users provided by varlink IPC */ + r = userdb_process(iterator, ret, NULL, NULL, NULL); + if (r < 0) { + + /* Finally, synthesize root + nobody if not done yet */ + if (iterator->synthesize_root) { + iterator->synthesize_root = false; + iterator->n_found++; + return synthetic_root_user_build(ret); + } + + if (iterator->synthesize_nobody) { + iterator->synthesize_nobody = false; + iterator->n_found++; + return synthetic_nobody_user_build(ret); + } + + /* if we found at least one entry, then ignore errors and indicate that we reached the end */ + if (iterator->n_found > 0) + return -ESRCH; + } + + return r; +} + +static int synthetic_root_group_build(GroupRecord **ret) { + return group_record_build( + ret, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("groupName", JSON_BUILD_CONST_STRING("root")), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(0)), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic")))); +} + +static int synthetic_nobody_group_build(GroupRecord **ret) { + return group_record_build( + ret, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("groupName", JSON_BUILD_CONST_STRING(NOBODY_GROUP_NAME)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)), + JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic")))); +} + +int groupdb_by_name(const char *name, UserDBFlags flags, GroupRecord **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *query = NULL; + int r; + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return -EINVAL; + + r = json_build(&query, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(name)))); + if (r < 0) + return r; + + iterator = userdb_iterator_new(LOOKUP_GROUP, flags); + if (!iterator) + return -ENOMEM; + + r = 
userdb_start_query(iterator, "io.systemd.UserDatabase.GetGroupRecord", false, query, flags); + if (r >= 0) { + r = userdb_process(iterator, NULL, ret, NULL, NULL); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !(iterator && iterator->dropin_covered)) { + r = dropin_group_record_by_name(name, NULL, flags, ret); + if (r >= 0) + return r; + } + + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !(iterator && iterator->nss_covered)) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r >= 0) { + r = nss_group_record_by_name(name, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret); + if (r >= 0) + return r; + } + } + + if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) { + if (streq(name, "root")) + return synthetic_root_group_build(ret); + + if (streq(name, NOBODY_GROUP_NAME) && synthesize_nobody()) + return synthetic_nobody_group_build(ret); + } + + return r; +} + +int groupdb_by_gid(gid_t gid, UserDBFlags flags, GroupRecord **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *query = NULL; + int r; + + if (!gid_is_valid(gid)) + return -EINVAL; + + r = json_build(&query, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)))); + if (r < 0) + return r; + + iterator = userdb_iterator_new(LOOKUP_GROUP, flags); + if (!iterator) + return -ENOMEM; + + r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetGroupRecord", false, query, flags); + if (r >= 0) { + r = userdb_process(iterator, NULL, ret, NULL, NULL); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !(iterator && iterator->dropin_covered)) { + r = dropin_group_record_by_gid(gid, NULL, flags, ret); + if (r >= 0) + return r; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !(iterator && iterator->nss_covered)) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r >= 0) { + r = nss_group_record_by_gid(gid, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), 
ret); + if (r >= 0) + return r; + } + } + + if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) { + if (gid == 0) + return synthetic_root_group_build(ret); + + if (gid == GID_NOBODY && synthesize_nobody()) + return synthetic_nobody_group_build(ret); + } + + return r; +} + +int groupdb_all(UserDBFlags flags, UserDBIterator **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + int r, qr; + + assert(ret); + + iterator = userdb_iterator_new(LOOKUP_GROUP, flags); + if (!iterator) + return -ENOMEM; + + qr = userdb_start_query(iterator, "io.systemd.UserDatabase.GetGroupRecord", true, NULL, flags); + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && (qr < 0 || !iterator->nss_covered)) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r < 0) + return r; + + setgrent(); + iterator->nss_iterating = true; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && (qr < 0 || !iterator->dropin_covered)) { + r = conf_files_list_nulstr( + &iterator->dropins, + ".group", + NULL, + CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, + USERDB_DROPIN_DIR_NULSTR("userdb")); + if (r < 0) + log_debug_errno(r, "Failed to find group drop-ins, ignoring: %m"); + } + + if (qr < 0 && + !iterator->nss_iterating && + strv_isempty(iterator->dropins)) + return qr; + + *ret = TAKE_PTR(iterator); + return 0; +} + +int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret) { + int r; + + assert(iterator); + assert(iterator->what == LOOKUP_GROUP); + + if (iterator->nss_iterating) { + struct group *gr; + + errno = 0; + gr = getgrent(); + if (gr) { + _cleanup_free_ char *buffer = NULL; + bool incomplete = false; + struct sgrp sgrp; + + if (streq_ptr(gr->gr_name, "root")) + iterator->synthesize_root = false; + if (gr->gr_gid == GID_NOBODY) + iterator->synthesize_nobody = false; + + if (!FLAGS_SET(iterator->flags, USERDB_SUPPRESS_SHADOW)) { + r = nss_sgrp_for_group(gr, &sgrp, &buffer); + if (r < 0) { + log_debug_errno(r, "Failed to acquire shadow entry for group %s, ignoring: 
%m", gr->gr_name); + incomplete = ERRNO_IS_PRIVILEGE(r); + } + } else { + r = -EUCLEAN; + incomplete = true; + } + + r = nss_group_to_group_record(gr, r >= 0 ? &sgrp : NULL, ret); + if (r < 0) + return r; + + if (ret) + (*ret)->incomplete = incomplete; + + iterator->n_found++; + return r; + } + + if (errno != 0) + log_debug_errno(errno, "Failure to iterate NSS group database, ignoring: %m"); + + iterator->nss_iterating = false; + endgrent(); + } + + for (; iterator->dropins && iterator->dropins[iterator->current_dropin]; iterator->current_dropin++) { + const char *i = iterator->dropins[iterator->current_dropin]; + _cleanup_free_ char *fn = NULL; + gid_t gid; + char *e; + + r = path_extract_filename(i, &fn); + if (r < 0) + return r; + + e = endswith(fn, ".group"); + if (!e) + continue; + + *e = 0; /* Chop off suffix */ + + if (parse_gid(fn, &gid) < 0) + continue; + + r = dropin_group_record_by_gid(gid, i, iterator->flags, ret); + if (r < 0) { + log_debug_errno(r, "Failed to parse group record for GID " GID_FMT ", ignoring: %m", gid); + continue; + } + + iterator->current_dropin++; + iterator->n_found++; + return 0; + } + + r = userdb_process(iterator, NULL, ret, NULL, NULL); + if (r < 0) { + if (iterator->synthesize_root) { + iterator->synthesize_root = false; + iterator->n_found++; + return synthetic_root_group_build(ret); + } + + if (iterator->synthesize_nobody) { + iterator->synthesize_nobody = false; + iterator->n_found++; + return synthetic_nobody_group_build(ret); + } + + /* if we found at least one entry, then ignore errors and indicate that we reached the end */ + if (iterator->n_found > 0) + return -ESRCH; + } + + return r; +} + +static void discover_membership_dropins(UserDBIterator *i, UserDBFlags flags) { + int r; + + r = conf_files_list_nulstr( + &i->dropins, + ".membership", + NULL, + CONF_FILES_REGULAR|CONF_FILES_BASENAME|CONF_FILES_FILTER_MASKED, + USERDB_DROPIN_DIR_NULSTR("userdb")); + if (r < 0) + log_debug_errno(r, "Failed to find membership 
drop-ins, ignoring: %m"); +} + +int membershipdb_by_user(const char *name, UserDBFlags flags, UserDBIterator **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *query = NULL; + int r, qr; + + assert(ret); + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return -EINVAL; + + r = json_build(&query, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(name)))); + if (r < 0) + return r; + + iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags); + if (!iterator) + return -ENOMEM; + + iterator->filter_user_name = strdup(name); + if (!iterator->filter_user_name) + return -ENOMEM; + + qr = userdb_start_query(iterator, "io.systemd.UserDatabase.GetMemberships", true, query, flags); + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && (qr < 0 || !iterator->nss_covered)) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r < 0) + return r; + + setgrent(); + iterator->nss_iterating = true; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && (qr < 0 || !iterator->dropin_covered)) + discover_membership_dropins(iterator, flags); + + if (qr < 0 && + !iterator->nss_iterating && + strv_isempty(iterator->dropins)) + return qr; + + *ret = TAKE_PTR(iterator); + return 0; +} + +int membershipdb_by_group(const char *name, UserDBFlags flags, UserDBIterator **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *query = NULL; + int r, qr; + + assert(ret); + + if (!valid_user_group_name(name, VALID_USER_RELAX)) + return -EINVAL; + + r = json_build(&query, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(name)))); + if (r < 0) + return r; + + iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags); + if (!iterator) + return -ENOMEM; + + iterator->filter_group_name = strdup(name); + if (!iterator->filter_group_name) + return -ENOMEM; + + qr = userdb_start_query(iterator, 
"io.systemd.UserDatabase.GetMemberships", true, query, flags); + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && (qr < 0 || !iterator->nss_covered)) { + _cleanup_(group_record_unrefp) GroupRecord *gr = NULL; + + r = userdb_iterator_block_nss_systemd(iterator); + if (r < 0) + return r; + + /* We ignore all errors here, since the group might be defined by a userdb native service, and we queried them already above. */ + (void) nss_group_record_by_name(name, false, &gr); + if (gr) { + iterator->members_of_group = strv_copy(gr->members); + if (!iterator->members_of_group) + return -ENOMEM; + + iterator->index_members_of_group = 0; + + iterator->found_group_name = strdup(name); + if (!iterator->found_group_name) + return -ENOMEM; + } + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && (qr < 0 || !iterator->dropin_covered)) + discover_membership_dropins(iterator, flags); + + if (qr < 0 && + strv_isempty(iterator->members_of_group) && + strv_isempty(iterator->dropins)) + return qr; + + *ret = TAKE_PTR(iterator); + return 0; +} + +int membershipdb_all(UserDBFlags flags, UserDBIterator **ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + int r, qr; + + assert(ret); + + iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags); + if (!iterator) + return -ENOMEM; + + qr = userdb_start_query(iterator, "io.systemd.UserDatabase.GetMemberships", true, NULL, flags); + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && (qr < 0 || !iterator->nss_covered)) { + r = userdb_iterator_block_nss_systemd(iterator); + if (r < 0) + return r; + + setgrent(); + iterator->nss_iterating = true; + } + + if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && (qr < 0 || !iterator->dropin_covered)) + discover_membership_dropins(iterator, flags); + + if (qr < 0 && + !iterator->nss_iterating && + strv_isempty(iterator->dropins)) + return qr; + + *ret = TAKE_PTR(iterator); + return 0; +} + +int membershipdb_iterator_get( + UserDBIterator *iterator, + char **ret_user, + char **ret_group) 
{ + + int r; + + assert(iterator); + + for (;;) { + /* If we are iterating through NSS acquire a new group entry if we haven't acquired one yet. */ + if (!iterator->members_of_group) { + struct group *g; + + if (!iterator->nss_iterating) + break; + + assert(!iterator->found_user_name); + do { + errno = 0; + g = getgrent(); + if (!g) { + if (errno != 0) + log_debug_errno(errno, "Failure during NSS group iteration, ignoring: %m"); + break; + } + + } while (iterator->filter_user_name ? !strv_contains(g->gr_mem, iterator->filter_user_name) : + strv_isempty(g->gr_mem)); + + if (g) { + r = free_and_strdup(&iterator->found_group_name, g->gr_name); + if (r < 0) + return r; + + if (iterator->filter_user_name) + iterator->members_of_group = strv_new(iterator->filter_user_name); + else + iterator->members_of_group = strv_copy(g->gr_mem); + if (!iterator->members_of_group) + return -ENOMEM; + + iterator->index_members_of_group = 0; + } else { + iterator->nss_iterating = false; + endgrent(); + break; + } + } + + assert(iterator->found_group_name); + assert(iterator->members_of_group); + assert(!iterator->found_user_name); + + if (iterator->members_of_group[iterator->index_members_of_group]) { + _cleanup_free_ char *cu = NULL, *cg = NULL; + + if (ret_user) { + cu = strdup(iterator->members_of_group[iterator->index_members_of_group]); + if (!cu) + return -ENOMEM; + } + + if (ret_group) { + cg = strdup(iterator->found_group_name); + if (!cg) + return -ENOMEM; + } + + if (ret_user) + *ret_user = TAKE_PTR(cu); + + if (ret_group) + *ret_group = TAKE_PTR(cg); + + iterator->index_members_of_group++; + return 0; + } + + iterator->members_of_group = strv_free(iterator->members_of_group); + iterator->found_group_name = mfree(iterator->found_group_name); + } + + for (; iterator->dropins && iterator->dropins[iterator->current_dropin]; iterator->current_dropin++) { + const char *i = iterator->dropins[iterator->current_dropin], *e, *c; + _cleanup_free_ char *un = NULL, *gn = NULL; + + e = 
endswith(i, ".membership"); + if (!e) + continue; + + c = memchr(i, ':', e - i); + if (!c) + continue; + + un = strndup(i, c - i); + if (!un) + return -ENOMEM; + if (iterator->filter_user_name) { + if (!streq(un, iterator->filter_user_name)) + continue; + } else if (!valid_user_group_name(un, VALID_USER_RELAX)) + continue; + + c++; /* skip over ':' */ + gn = strndup(c, e - c); + if (!gn) + return -ENOMEM; + if (iterator->filter_group_name) { + if (!streq(gn, iterator->filter_group_name)) + continue; + } else if (!valid_user_group_name(gn, VALID_USER_RELAX)) + continue; + + iterator->current_dropin++; + iterator->n_found++; + + if (ret_user) + *ret_user = TAKE_PTR(un); + if (ret_group) + *ret_group = TAKE_PTR(gn); + + return 0; + } + + r = userdb_process(iterator, NULL, NULL, ret_user, ret_group); + if (r < 0 && iterator->n_found > 0) + return -ESRCH; + + return r; +} + +int membershipdb_by_group_strv(const char *name, UserDBFlags flags, char ***ret) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_strv_free_ char **members = NULL; + int r; + + assert(name); + assert(ret); + + r = membershipdb_by_group(name, flags, &iterator); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *user_name = NULL; + + r = membershipdb_iterator_get(iterator, &user_name, NULL); + if (r == -ESRCH) + break; + if (r < 0) + return r; + + r = strv_consume(&members, TAKE_PTR(user_name)); + if (r < 0) + return r; + } + + strv_sort(members); + strv_uniq(members); + + *ret = TAKE_PTR(members); + return 0; +} + +int userdb_block_nss_systemd(int b) { + _cleanup_(dlclosep) void *dl = NULL; + int (*call)(bool b); + + /* Note that we might be called from libnss_systemd.so.2 itself, but that should be fine, really. 
*/ + + dl = dlopen(LIBDIR "/libnss_systemd.so.2", RTLD_LAZY|RTLD_NODELETE); + if (!dl) { + /* If the file isn't installed, don't complain loudly */ + log_debug("Failed to dlopen(libnss_systemd.so.2), ignoring: %s", dlerror()); + return 0; + } + + call = (int (*)(bool b)) dlsym(dl, "_nss_systemd_block"); + if (!call) + /* If the file is installed but lacks the symbol we expect, things are weird, let's complain */ + return log_debug_errno(SYNTHETIC_ERRNO(ELIBBAD), + "Unable to find symbol _nss_systemd_block in libnss_systemd.so.2: %s", dlerror()); + + return call(b); +} diff --git a/src/shared/userdb.h b/src/shared/userdb.h new file mode 100644 index 0000000..75eb4b2 --- /dev/null +++ b/src/shared/userdb.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "group-record.h" +#include "user-record.h" + +/* Inquire local services for user/group records */ + +typedef struct UserDBIterator UserDBIterator; + +UserDBIterator *userdb_iterator_free(UserDBIterator *iterator); +DEFINE_TRIVIAL_CLEANUP_FUNC(UserDBIterator*, userdb_iterator_free); + +typedef enum UserDBFlags { + /* The main sources */ + USERDB_EXCLUDE_NSS = 1 << 0, /* don't do client-side nor server-side NSS */ + USERDB_EXCLUDE_VARLINK = 1 << 1, /* don't talk to any varlink services */ + USERDB_EXCLUDE_DROPIN = 1 << 2, /* don't load drop-in user/group definitions */ + + /* Modifications */ + USERDB_SUPPRESS_SHADOW = 1 << 3, /* don't do client-side shadow calls (server side might happen though) */ + USERDB_EXCLUDE_DYNAMIC_USER = 1 << 4, /* exclude looking up in io.systemd.DynamicUser */ + USERDB_AVOID_MULTIPLEXER = 1 << 5, /* exclude looking up via io.systemd.Multiplexer */ + USERDB_DONT_SYNTHESIZE = 1 << 6, /* don't synthesize root/nobody */ + + /* Combinations */ + USERDB_NSS_ONLY = USERDB_EXCLUDE_VARLINK|USERDB_EXCLUDE_DROPIN|USERDB_DONT_SYNTHESIZE, + USERDB_DROPIN_ONLY = USERDB_EXCLUDE_NSS|USERDB_EXCLUDE_VARLINK|USERDB_DONT_SYNTHESIZE, +} 
UserDBFlags; + +/* Well-known errors we'll return here: + * + * -ESRCH: No such user/group + * -ELINK: Varlink logic turned off (and no other source available) + * -EOPNOTSUPP: Enumeration not supported + * -ETIMEDOUT: Time-out + */ + +int userdb_by_name(const char *name, UserDBFlags flags, UserRecord **ret); +int userdb_by_uid(uid_t uid, UserDBFlags flags, UserRecord **ret); +int userdb_all(UserDBFlags flags, UserDBIterator **ret); +int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret); + +int groupdb_by_name(const char *name, UserDBFlags flags, GroupRecord **ret); +int groupdb_by_gid(gid_t gid, UserDBFlags flags, GroupRecord **ret); +int groupdb_all(UserDBFlags flags, UserDBIterator **ret); +int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret); + +int membershipdb_by_user(const char *name, UserDBFlags flags, UserDBIterator **ret); +int membershipdb_by_group(const char *name, UserDBFlags flags, UserDBIterator **ret); +int membershipdb_all(UserDBFlags flags, UserDBIterator **ret); +int membershipdb_iterator_get(UserDBIterator *iterator, char **user, char **group); +int membershipdb_by_group_strv(const char *name, UserDBFlags flags, char ***ret); + +int userdb_block_nss_systemd(int b); diff --git a/src/shared/utmp-wtmp.c b/src/shared/utmp-wtmp.c new file mode 100644 index 0000000..267b350 --- /dev/null +++ b/src/shared/utmp-wtmp.c @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "hostname-util.h" +#include "macro.h" +#include "memory-util.h" +#include "path-util.h" +#include "string-util.h" +#include "time-util.h" +#include "user-util.h" +#include "utmp-wtmp.h" + +int utmp_get_runlevel(int *runlevel, int *previous) { + _unused_ _cleanup_(utxent_cleanup) bool utmpx = false; + struct utmpx *found, lookup = { .ut_type = RUN_LVL }; + const char *e; + + assert(runlevel); + + /* If 
these values are set in the environment this takes + * precedence. Presumably, sysvinit does this to work around a + * race condition that would otherwise exist where we'd always + * go to disk and hence might read runlevel data that might be + * very new and not apply to the current script being executed. */ + + e = getenv("RUNLEVEL"); + if (!isempty(e)) { + *runlevel = e[0]; + if (previous) + *previous = 0; + + return 0; + } + + if (utmpxname(_PATH_UTMPX) < 0) + return -errno; + + utmpx = utxent_start(); + + found = getutxid(&lookup); + if (!found) + return -errno; + + *runlevel = found->ut_pid & 0xFF; + if (previous) + *previous = (found->ut_pid >> 8) & 0xFF; + + return 0; +} + +static void init_timestamp(struct utmpx *store, usec_t t) { + assert(store); + + if (t <= 0) + t = now(CLOCK_REALTIME); + + store->ut_tv.tv_sec = t / USEC_PER_SEC; + store->ut_tv.tv_usec = t % USEC_PER_SEC; +} + +static void init_entry(struct utmpx *store, usec_t t) { + struct utsname uts = {}; + + assert(store); + + init_timestamp(store, t); + + if (uname(&uts) >= 0) + strncpy(store->ut_host, uts.release, sizeof(store->ut_host)); + + strncpy(store->ut_line, "~", sizeof(store->ut_line)); /* or ~~ ? */ + strncpy(store->ut_id, "~~", sizeof(store->ut_id)); +} + +static int write_entry_utmp(const struct utmpx *store) { + _unused_ _cleanup_(utxent_cleanup) bool utmpx = false; + + assert(store); + + /* utmp is similar to wtmp, but there is only one entry for + * each entry type resp. user; i.e. basically a key/value + * table. */ + + if (utmpxname(_PATH_UTMPX) < 0) + return -errno; + + utmpx = utxent_start(); + + if (pututxline(store)) + return 0; + if (errno == ENOENT) { + /* If utmp/wtmp have been disabled, that's a good thing, hence ignore the error. 
*/ + log_debug_errno(errno, "Not writing utmp: %m"); + return 0; + } + return -errno; +} + +static int write_entry_wtmp(const struct utmpx *store) { + assert(store); + + /* wtmp is a simple append-only file where each entry is + * simply appended to the end; i.e. basically a log. */ + + errno = 0; + updwtmpx(_PATH_WTMPX, store); + if (errno == ENOENT) { + /* If utmp/wtmp have been disabled, that's a good thing, hence ignore the error. */ + log_debug_errno(errno, "Not writing wtmp: %m"); + return 0; + } + if (errno == EROFS) { + log_warning_errno(errno, "Failed to write wtmp record, ignoring: %m"); + return 0; + } + return -errno; +} + +static int write_utmp_wtmp(const struct utmpx *store_utmp, const struct utmpx *store_wtmp) { + int r, s; + + r = write_entry_utmp(store_utmp); + s = write_entry_wtmp(store_wtmp); + return r < 0 ? r : s; +} + +static int write_entry_both(const struct utmpx *store) { + return write_utmp_wtmp(store, store); +} + +int utmp_put_shutdown(void) { + struct utmpx store = {}; + + init_entry(&store, 0); + + store.ut_type = RUN_LVL; + strncpy(store.ut_user, "shutdown", sizeof(store.ut_user)); + + return write_entry_both(&store); +} + +int utmp_put_reboot(usec_t t) { + struct utmpx store = {}; + + init_entry(&store, t); + + store.ut_type = BOOT_TIME; + strncpy(store.ut_user, "reboot", sizeof(store.ut_user)); + + return write_entry_both(&store); +} + +static void copy_suffix(char *buf, size_t buf_size, const char *src) { + size_t l; + + l = strlen(src); + if (l < buf_size) + strncpy(buf, src, buf_size); + else + memcpy(buf, src + l - buf_size, buf_size); +} + +int utmp_put_init_process(const char *id, pid_t pid, pid_t sid, const char *line, int ut_type, const char *user) { + struct utmpx store = { + .ut_type = INIT_PROCESS, + .ut_pid = pid, + .ut_session = sid, + }; + int r; + + assert(id); + assert(ut_type != USER_PROCESS || user); + + init_timestamp(&store, 0); + + /* Copy the whole string if it fits, or just the suffix without the terminating 
NUL. */ + copy_suffix(store.ut_id, sizeof(store.ut_id), id); + + if (line) + strncpy_exact(store.ut_line, line, sizeof(store.ut_line)); + + r = write_entry_both(&store); + if (r < 0) + return r; + + if (IN_SET(ut_type, LOGIN_PROCESS, USER_PROCESS)) { + store.ut_type = LOGIN_PROCESS; + r = write_entry_both(&store); + if (r < 0) + return r; + } + + if (ut_type == USER_PROCESS) { + store.ut_type = USER_PROCESS; + strncpy(store.ut_user, user, sizeof(store.ut_user)-1); + r = write_entry_both(&store); + if (r < 0) + return r; + } + + return 0; +} + +int utmp_put_dead_process(const char *id, pid_t pid, int code, int status) { + _unused_ _cleanup_(utxent_cleanup) bool utmpx = false; + struct utmpx lookup = { + .ut_type = INIT_PROCESS /* looks for DEAD_PROCESS, LOGIN_PROCESS, USER_PROCESS, too */ + }, store, store_wtmp, *found; + + assert(id); + + utmpx = utxent_start(); + + /* Copy the whole string if it fits, or just the suffix without the terminating NUL. */ + copy_suffix(lookup.ut_id, sizeof(lookup.ut_id), id); + + found = getutxid(&lookup); + if (!found) + return 0; + + if (found->ut_pid != pid) + return 0; + + memcpy(&store, found, sizeof(store)); + store.ut_type = DEAD_PROCESS; + store.ut_exit.e_termination = code; + store.ut_exit.e_exit = status; + + zero(store.ut_user); + zero(store.ut_host); + zero(store.ut_tv); + + memcpy(&store_wtmp, &store, sizeof(store_wtmp)); + /* wtmp wants the current time */ + init_timestamp(&store_wtmp, 0); + + return write_utmp_wtmp(&store, &store_wtmp); +} + +int utmp_put_runlevel(int runlevel, int previous) { + struct utmpx store = {}; + int r; + + assert(runlevel > 0); + + if (previous <= 0) { + /* Find the old runlevel automatically */ + + r = utmp_get_runlevel(&previous, NULL); + if (r < 0) { + if (r != -ESRCH) + return r; + + previous = 0; + } + } + + if (previous == runlevel) + return 0; + + init_entry(&store, 0); + + store.ut_type = RUN_LVL; + store.ut_pid = (runlevel & 0xFF) | ((previous & 0xFF) << 8); + strncpy(store.ut_user, 
"runlevel", sizeof(store.ut_user)); + + return write_entry_both(&store); +} diff --git a/src/shared/utmp-wtmp.h b/src/shared/utmp-wtmp.h new file mode 100644 index 0000000..2e04fac --- /dev/null +++ b/src/shared/utmp-wtmp.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "time-util.h" + +#if ENABLE_UTMP +#include + +int utmp_get_runlevel(int *runlevel, int *previous); + +int utmp_put_shutdown(void); +int utmp_put_reboot(usec_t timestamp); +int utmp_put_runlevel(int runlevel, int previous); + +int utmp_put_dead_process(const char *id, pid_t pid, int code, int status); +int utmp_put_init_process(const char *id, pid_t pid, pid_t sid, const char *line, int ut_type, const char *user); + +static inline bool utxent_start(void) { + setutxent(); + return true; +} +static inline void utxent_cleanup(bool *initialized) { + assert(initialized); + if (*initialized) + endutxent(); +} + +#else /* ENABLE_UTMP */ + +static inline int utmp_get_runlevel(int *runlevel, int *previous) { + return -ESRCH; +} +static inline int utmp_put_shutdown(void) { + return 0; +} +static inline int utmp_put_reboot(usec_t timestamp) { + return 0; +} +static inline int utmp_put_runlevel(int runlevel, int previous) { + return 0; +} +static inline int utmp_put_dead_process(const char *id, pid_t pid, int code, int status) { + return 0; +} +static inline int utmp_put_init_process(const char *id, pid_t pid, pid_t sid, const char *line, int ut_type, const char *user) { + return 0; +} + +#endif /* ENABLE_UTMP */ diff --git a/src/shared/varlink-idl.c b/src/shared/varlink-idl.c new file mode 100644 index 0000000..655324c --- /dev/null +++ b/src/shared/varlink-idl.c @@ -0,0 +1,1603 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "memstream-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "varlink-idl.h" +#include "set.h" + +#define DEPTH_MAX 64U + +enum { + COLOR_SYMBOL_TYPE, /* interface, method, type, error */ + 
COLOR_FIELD_TYPE, /* string, bool, … */ + COLOR_IDENTIFIER, + COLOR_MARKS, /* [], ->, ?, … */ + COLOR_RESET, + _COLOR_MAX, +}; + +static int varlink_idl_format_all_fields(FILE *f, const VarlinkSymbol *symbol, VarlinkFieldDirection direction, const char *indent, const char *const colors[static _COLOR_MAX]); + +static int varlink_idl_format_enum_values( + FILE *f, + const VarlinkSymbol *symbol, + const char *indent, + const char *const colors[static _COLOR_MAX]) { + + bool first = true; + + assert(f); + assert(symbol); + assert(symbol->symbol_type == VARLINK_ENUM_TYPE); + + for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) { + + if (first) { + first = false; + fputs("(\n", f); + } else + fputs(",\n", f); + + fputs(strempty(indent), f); + fputs("\t", f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(field->name, f); + fputs(colors[COLOR_RESET], f); + } + + if (first) + fputs("()", f); + else { + fputs("\n", f); + fputs(strempty(indent), f); + fputs(")", f); + } + + return 0; +} + +static int varlink_idl_format_field( + FILE *f, + const VarlinkField *field, + const char *indent, + const char *const colors[static _COLOR_MAX]) { + + assert(f); + assert(field); + + fputs(strempty(indent), f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(field->name, f); + fputs(colors[COLOR_RESET], f); + fputs(": ", f); + + if (FLAGS_SET(field->field_flags, VARLINK_NULLABLE)) { + fputs(colors[COLOR_MARKS], f); + fputs("?", f); + fputs(colors[COLOR_RESET], f); + } + + switch (field->field_flags & (VARLINK_MAP|VARLINK_ARRAY)) { + + case VARLINK_MAP: + fputs(colors[COLOR_MARKS], f); + fputs("[", f); + fputs(colors[COLOR_FIELD_TYPE], f); + fputs("string", f); + fputs(colors[COLOR_MARKS], f); + fputs("]", f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_ARRAY: + fputs(colors[COLOR_MARKS], f); + fputs("[]", f); + fputs(colors[COLOR_RESET], f); + break; + + case 0: + break; + + default: + assert_not_reached(); + } + + 
switch (field->field_type) { + + case VARLINK_BOOL: + fputs(colors[COLOR_FIELD_TYPE], f); + fputs("bool", f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_INT: + fputs(colors[COLOR_FIELD_TYPE], f); + fputs("int", f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_FLOAT: + fputs(colors[COLOR_FIELD_TYPE], f); + fputs("float", f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_STRING: + fputs(colors[COLOR_FIELD_TYPE], f); + fputs("string", f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_OBJECT: + fputs(colors[COLOR_FIELD_TYPE], f); + fputs("object", f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_NAMED_TYPE: + fputs(colors[COLOR_IDENTIFIER], f); + fputs(ASSERT_PTR(field->named_type), f); + fputs(colors[COLOR_RESET], f); + break; + + case VARLINK_STRUCT: + return varlink_idl_format_all_fields(f, ASSERT_PTR(field->symbol), VARLINK_REGULAR, indent, colors); + + case VARLINK_ENUM: + return varlink_idl_format_enum_values(f, ASSERT_PTR(field->symbol), indent, colors); + + default: + assert_not_reached(); + } + + return 0; +} + +static int varlink_idl_format_all_fields( + FILE *f, + const VarlinkSymbol *symbol, + VarlinkFieldDirection filter_direction, + const char *indent, + const char *const colors[static _COLOR_MAX]) { + + _cleanup_free_ char *indent2 = NULL; + bool first = true; + int r; + + assert(f); + assert(symbol); + assert(IN_SET(symbol->symbol_type, VARLINK_STRUCT_TYPE, VARLINK_METHOD, VARLINK_ERROR)); + + indent2 = strjoin(strempty(indent), "\t"); + if (!indent2) + return -ENOMEM; + + for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) { + + if (field->field_direction != filter_direction) + continue; + + if (first) { + first = false; + fputs("(\n", f); + } else + fputs(",\n", f); + + r = varlink_idl_format_field(f, field, indent2, colors); + if (r < 0) + return r; + } + + if (first) + fputs("()", f); + else { + fputs("\n", f); + 
fputs(strempty(indent), f); + fputs(")", f); + } + + return 0; +} + +static int varlink_idl_format_symbol( + FILE *f, + const VarlinkSymbol *symbol, + const char *const colors[static _COLOR_MAX]) { + int r; + + assert(f); + assert(symbol); + + switch (symbol->symbol_type) { + + case VARLINK_ENUM_TYPE: + fputs(colors[COLOR_SYMBOL_TYPE], f); + fputs("type ", f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(symbol->name, f); + fputs(colors[COLOR_RESET], f); + + r = varlink_idl_format_enum_values(f, symbol, /* indent= */ NULL, colors); + break; + + case VARLINK_STRUCT_TYPE: + fputs(colors[COLOR_SYMBOL_TYPE], f); + fputs("type ", f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(symbol->name, f); + fputs(colors[COLOR_RESET], f); + + r = varlink_idl_format_all_fields(f, symbol, VARLINK_REGULAR, /* indent= */ NULL, colors); + break; + + case VARLINK_METHOD: + fputs(colors[COLOR_SYMBOL_TYPE], f); + fputs("method ", f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(symbol->name, f); + fputs(colors[COLOR_RESET], f); + + r = varlink_idl_format_all_fields(f, symbol, VARLINK_INPUT, /* indent= */ NULL, colors); + if (r < 0) + return r; + + fputs(colors[COLOR_MARKS], f); + fputs(" -> ", f); + fputs(colors[COLOR_RESET], f); + + r = varlink_idl_format_all_fields(f, symbol, VARLINK_OUTPUT, /* indent= */ NULL, colors); + break; + + case VARLINK_ERROR: + fputs(colors[COLOR_SYMBOL_TYPE], f); + fputs("error ", f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(symbol->name, f); + fputs(colors[COLOR_RESET], f); + + r = varlink_idl_format_all_fields(f, symbol, VARLINK_REGULAR, /* indent= */ NULL, colors); + break; + + default: + assert_not_reached(); + } + if (r < 0) + return r; + + fputs("\n", f); + return 0; +} + +static int varlink_idl_format_all_symbols( + FILE *f, + const VarlinkInterface *interface, + VarlinkSymbolType filter_type, + const char *const colors[static _COLOR_MAX]) { + + int r; + + assert(f); + assert(interface); + + for (const VarlinkSymbol *const*symbol = 
interface->symbols; *symbol; symbol++) { + + if ((*symbol)->symbol_type != filter_type) + continue; + + fputs("\n", f); + + r = varlink_idl_format_symbol(f, *symbol, colors); + if (r < 0) + return r; + } + + return 0; +} + +int varlink_idl_dump(FILE *f, int use_colors, const VarlinkInterface *interface) { + static const char* const color_table[_COLOR_MAX] = { + [COLOR_SYMBOL_TYPE] = ANSI_HIGHLIGHT_GREEN, + [COLOR_FIELD_TYPE] = ANSI_HIGHLIGHT_BLUE, + [COLOR_IDENTIFIER] = ANSI_NORMAL, + [COLOR_MARKS] = ANSI_HIGHLIGHT_MAGENTA, + [COLOR_RESET] = ANSI_NORMAL, + }; + + static const char* const color_off[_COLOR_MAX] = { + "", "", "", "", "", + }; + + int r; + + assert(interface); + + if (!f) + f = stdout; + + if (use_colors < 0) + use_colors = colors_enabled(); + + const char *const *colors = use_colors ? color_table : color_off; + + fputs(colors[COLOR_SYMBOL_TYPE], f); + fputs("interface ", f); + fputs(colors[COLOR_IDENTIFIER], f); + fputs(ASSERT_PTR(interface->name), f); + fputs(colors[COLOR_RESET], f); + fputs("\n", f); + + for (VarlinkSymbolType t = 0; t < _VARLINK_SYMBOL_TYPE_MAX; t++) { + r = varlink_idl_format_all_symbols(f, interface, t, colors); + if (r < 0) + return r; + } + + return 0; +} + +int varlink_idl_format(const VarlinkInterface *interface, char **ret) { + _cleanup_(memstream_done) MemStream memstream = {}; + int r; + + if (!memstream_init(&memstream)) + return -errno; + + r = varlink_idl_dump(memstream.f, /* use_colors= */ false, interface); + if (r < 0) + return r; + + return memstream_finalize(&memstream, ret, NULL); +} + +static VarlinkSymbol *varlink_symbol_free(VarlinkSymbol *symbol) { + if (!symbol) + return NULL; + + /* See comment in varlink_interface_free() regarding the casting away of `const` */ + + free((char*) symbol->name); + + for (size_t i = 0; symbol->fields[i].field_type != _VARLINK_FIELD_TYPE_END_MARKER; i++) { + VarlinkField *field = symbol->fields + i; + + free((void*) field->name); + free((void*) field->named_type); + + /* The 
symbol pointer might either point to a named symbol, in which case that symbol is + * owned by the interface, or to an anonymous symbol, in which case it is owned by us, and we + * need to free it */ + if (field->symbol && field->field_type != VARLINK_NAMED_TYPE) + varlink_symbol_free((VarlinkSymbol*) field->symbol); + } + + return mfree(symbol); +} + +VarlinkInterface* varlink_interface_free(VarlinkInterface *interface) { + if (!interface) + return NULL; + + /* So here's the thing: in most cases we want that users of this define their interface descriptions + * in C code, and hence the definitions are constant and immutable during the lifecycle of the + * system. Because of that we define all structs with const* pointers. It makes it very nice and + * straight-forward to populate these structs with literal C strings. However, in some not so common + * cases we also want to allocate these structures dynamically on the heap, when parsing interface + * descriptions. But given this should be the exceptional and not the common case, we decided to + * simply cast away the 'const' where needed, even if it is ugly. 
*/ + + free((char*) interface->name); + + for (size_t i = 0; interface->symbols[i]; i++) + varlink_symbol_free((VarlinkSymbol*) interface->symbols[i]); + + return mfree(interface); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkSymbol*, varlink_symbol_free); + +static int varlink_interface_realloc(VarlinkInterface **interface, size_t n_symbols) { + VarlinkInterface *n; + + assert(interface); + + n_symbols ++; /* Space for trailing NULL end marker symbol */ + + /* Overflow check */ + if (n_symbols > (SIZE_MAX - offsetof(VarlinkInterface, symbols)) / sizeof(VarlinkSymbol*)) + return -ENOMEM; + + n = realloc0(*interface, offsetof(VarlinkInterface, symbols) + sizeof(VarlinkSymbol*) * n_symbols); + if (!n) + return -ENOMEM; + + *interface = n; + return 0; +} + +static int varlink_symbol_realloc(VarlinkSymbol **symbol, size_t n_fields) { + VarlinkSymbol *n; + + assert(symbol); + + n_fields ++; /* Space for trailing end marker field */ + + /* Overflow check */ + if (n_fields > (SIZE_MAX - offsetof(VarlinkSymbol, fields)) / sizeof(VarlinkField)) + return -ENOMEM; + + n = realloc0(*symbol, offsetof(VarlinkSymbol, fields) + sizeof(VarlinkField) * n_fields); + if (!n) + return -ENOMEM; + + *symbol = n; + return 0; +} + +#define VALID_CHARS_IDENTIFIER ALPHANUMERICAL "_" +#define VALID_CHARS_RESERVED LOWERCASE_LETTERS +#define VALID_CHARS_INTERFACE_NAME ALPHANUMERICAL ".-" + +static void advance_line_column(const char *p, size_t n, unsigned *line, unsigned *column) { + + assert(p); + assert(line); + assert(column); + + for (; n > 0; p++, n--) { + + if (*p == '\n') { + (*line)++; + *column = 1; + } else + (*column)++; + } +} + +static size_t token_match( + const char *p, + const char *allowed_delimiters, + const char *allowed_chars) { + + /* Checks if the string p begins either with one of the token characters in allowed_delimiters or + * with a string consisting of allowed_chars. 
*/ + + assert(p); + + if (allowed_delimiters && strchr(allowed_delimiters, *p)) + return 1; + + if (!allowed_chars) + return 0; + + return strspn(p, allowed_chars); +} + +static int varlink_idl_subparse_token( + const char **p, + unsigned *line, + unsigned *column, + const char *allowed_delimiters, + const char *allowed_chars, + char **ret_token) { + + _cleanup_free_ char *t = NULL; + size_t l; + + assert(p); + assert(*p); + assert(line); + assert(column); + assert(ret_token); + + if (**p == '\0') { /* eof */ + *ret_token = NULL; + return 0; + } + + l = token_match(*p, allowed_delimiters, allowed_chars); + + /* No token of the permitted character set found? Then let's try to skip over whitespace and try again */ + if (l == 0) { + size_t ll; + + ll = strspn(*p, WHITESPACE); + advance_line_column(*p, ll, line, column); + *p += ll; + + if (**p == '\0') { /* eof */ + *ret_token = NULL; + return 0; + } + + l = token_match(*p, allowed_delimiters, allowed_chars); + if (l == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Couldn't find token of allowed chars '%s' or allowed delimiters '%s'.", strempty(allowed_chars), strempty(allowed_delimiters)); + } + + t = strndup(*p, l); + if (!t) + return -ENOMEM; + + advance_line_column(*p, l, line, column); + *p += l; + + *ret_token = TAKE_PTR(t); + return 1; +} + +/* Skips over the remainder of a comment, i.e. everything up to (but not including) the next newline + * character. The callers invoke this right after consuming the '#' token. The newline itself is left in + * the input, hence the line/column counters are advanced by exactly the number of characters skipped, so + * that the newline is accounted for only once, when the next token parse skips over it as whitespace. + * Always returns 1. */ +static int varlink_idl_subparse_comment( + const char **p, + unsigned *line, + unsigned *column) { + + size_t l; + + assert(p); + assert(*p); + assert(line); + assert(column); + + l = strcspn(*p, NEWLINE); + advance_line_column(*p, l, line, column); /* keep counters in sync with the pointer advance below */ + *p += l; + + return 1; +} + +static int varlink_idl_subparse_whitespace( + const char **p, + unsigned *line, + unsigned *column) { + + size_t l; + + assert(p); + assert(*p); + assert(line); + assert(column); + + l = strspn(*p, WHITESPACE); + advance_line_column(*p, l, line, column); + *p += l; + + return 1; +} + +static int varlink_idl_subparse_struct_or_enum(const char **p, unsigned *line, unsigned *column, 
VarlinkSymbol **symbol, size_t *n_fields, VarlinkFieldDirection direction, unsigned depth); + +static int varlink_idl_subparse_field_type( + const char **p, + unsigned *line, + unsigned *column, + VarlinkField *field, + unsigned depth) { + + size_t l; + int r; + + assert(p); + assert(*p); + assert(line); + assert(field); + + r = varlink_idl_subparse_whitespace(p, line, column); + if (r < 0) + return r; + + if (startswith(*p, "?")) { + field->field_flags |= VARLINK_NULLABLE; + l = 1; + } else { + field->field_flags &= ~VARLINK_NULLABLE; + l = 0; + } + + advance_line_column(*p, l, line, column); + *p += l; + + if (startswith(*p, "[]")) { + l = 2; + field->field_flags = (field->field_flags & ~VARLINK_MAP) | VARLINK_ARRAY; + } else if (startswith(*p, "[string]")) { + l = 8; + field->field_flags = (field->field_flags & ~VARLINK_ARRAY) | VARLINK_MAP; + } else { + l = 0; + field->field_flags = field->field_flags & ~(VARLINK_MAP | VARLINK_ARRAY); + } + + advance_line_column(*p, l, line, column); + *p += l; + + if (startswith(*p, "bool")) { + l = 4; + field->field_type = VARLINK_BOOL; + } else if (startswith(*p, "int")) { + l = 3; + field->field_type = VARLINK_INT; + } else if (startswith(*p, "float")) { + l = 5; + field->field_type = VARLINK_FLOAT; + } else if (startswith(*p, "string")) { + l = 6; + field->field_type = VARLINK_STRING; + } else if (startswith(*p, "object")) { + l = 6; + field->field_type = VARLINK_OBJECT; + } else if (**p == '(') { + _cleanup_(varlink_symbol_freep) VarlinkSymbol *symbol = NULL; + size_t n_fields = 0; + + r = varlink_symbol_realloc(&symbol, n_fields); + if (r < 0) + return r; + + symbol->symbol_type = _VARLINK_SYMBOL_TYPE_INVALID; + + r = varlink_idl_subparse_struct_or_enum( + p, + line, + column, + &symbol, + &n_fields, + VARLINK_REGULAR, + depth + 1); + if (r < 0) + return r; + + if (symbol->symbol_type == VARLINK_STRUCT_TYPE) + field->field_type = VARLINK_STRUCT; + else { + assert(symbol->symbol_type == VARLINK_ENUM_TYPE); + 
field->field_type = VARLINK_ENUM; + } + + field->symbol = TAKE_PTR(symbol); + l = 0; + } else { + _cleanup_free_ char *token = NULL; + + r = varlink_idl_subparse_token(p, line, column, /* valid_tokens= */ NULL, VALID_CHARS_IDENTIFIER, &token); + if (r < 0) + return r; + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + field->named_type = TAKE_PTR(token); + field->field_type = VARLINK_NAMED_TYPE; + l = 0; + } + + advance_line_column(*p, l, line, column); + *p += l; + + return 0; +} + +static int varlink_idl_subparse_struct_or_enum( + const char **p, + unsigned *line, + unsigned *column, + VarlinkSymbol **symbol, + size_t *n_fields, + VarlinkFieldDirection direction, + unsigned depth) { + + enum { + STATE_OPEN, + STATE_NAME, + STATE_COLON, + STATE_COMMA, + STATE_DONE, + } state = STATE_OPEN; + _cleanup_free_ char *field_name = NULL; + const char *allowed_delimiters = "(", *allowed_chars = NULL; + int r; + + assert(p); + assert(*p); + assert(line); + assert(column); + assert(symbol); + assert(*symbol); + assert(n_fields); + + if (depth > DEPTH_MAX) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Maximum nesting depth reached (%u).", *line, *column, DEPTH_MAX); + + while (state != STATE_DONE) { + _cleanup_free_ char *token = NULL; + + r = varlink_idl_subparse_token( + p, + line, + column, + allowed_delimiters, + allowed_chars, + &token); + if (r < 0) + return r; + + switch (state) { + + case STATE_OPEN: + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + if (!streq(token, "(")) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token); + + state = STATE_NAME; + allowed_delimiters = ")"; + allowed_chars = VALID_CHARS_IDENTIFIER; + break; + + case STATE_NAME: + assert(!field_name); + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + if 
(streq(token, ")")) + state = STATE_DONE; + else { + field_name = TAKE_PTR(token); + state = STATE_COLON; + allowed_delimiters = ":,)"; + allowed_chars = NULL; + } + + break; + + case STATE_COLON: + assert(field_name); + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + if (streq(token, ":")) { + VarlinkField *field; + + if ((*symbol)->symbol_type < 0) + (*symbol)->symbol_type = VARLINK_STRUCT_TYPE; + if ((*symbol)->symbol_type == VARLINK_ENUM_TYPE) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Enum with struct fields, refusing.", *line, *column); + + r = varlink_symbol_realloc(symbol, *n_fields + 1); + if (r < 0) + return r; + + field = (*symbol)->fields + (*n_fields)++; + *field = (VarlinkField) { + .name = TAKE_PTR(field_name), + .field_type = _VARLINK_FIELD_TYPE_INVALID, + .field_direction = direction, + }; + + r = varlink_idl_subparse_field_type(p, line, column, field, depth); + if (r < 0) + return r; + + state = STATE_COMMA; + allowed_delimiters = ",)"; + allowed_chars = NULL; + + } else if (STR_IN_SET(token, ",", ")")) { + VarlinkField *field; + + if ((*symbol)->symbol_type < 0) + (*symbol)->symbol_type = VARLINK_ENUM_TYPE; + if ((*symbol)->symbol_type != VARLINK_ENUM_TYPE) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Struct with enum fields, refusing.", *line, *column); + + r = varlink_symbol_realloc(symbol, *n_fields + 1); + if (r < 0) + return r; + + field = (*symbol)->fields + (*n_fields)++; + *field = (VarlinkField) { + .name = TAKE_PTR(field_name), + .field_type = VARLINK_ENUM_VALUE, + }; + + if (streq(token, ",")) { + state = STATE_NAME; + allowed_delimiters = NULL; + allowed_chars = VALID_CHARS_IDENTIFIER; + } else { + assert(streq(token, ")")); + state = STATE_DONE; + } + } else + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token); + + break; + + case STATE_COMMA: + assert(!field_name); + + if (!token) + 
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + if (streq(token, ",")) { + state = STATE_NAME; + allowed_delimiters = NULL; + allowed_chars = VALID_CHARS_IDENTIFIER; + } else if (streq(token, ")")) + state = STATE_DONE; + else + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token); + break; + + default: + assert_not_reached(); + } + } + + /* If we don't know the type of the symbol by now it was an empty () which doesn't allow us to + * determine if we look at an enum or a struct */ + if ((*symbol)->symbol_type < 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Ambiguous empty () enum/struct is not permitted.", *line, *column); + + return 0; +} + +static int varlink_idl_resolve_symbol_types(VarlinkInterface *interface, VarlinkSymbol *symbol) { + assert(interface); + assert(symbol); + + for (VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) { + const VarlinkSymbol *found; + + if (field->field_type != VARLINK_NAMED_TYPE) + continue; + + if (field->symbol) /* Already resolved */ + continue; + + if (!field->named_type) + return log_debug_errno(SYNTHETIC_ERRNO(ENETUNREACH), "Named type field lacking a type name."); + + found = varlink_idl_find_symbol(interface, _VARLINK_SYMBOL_TYPE_INVALID, field->named_type); + if (!found) + return log_debug_errno(SYNTHETIC_ERRNO(ENETUNREACH), "Failed to find type '%s'.", field->named_type); + + if (!IN_SET(found->symbol_type, VARLINK_STRUCT_TYPE, VARLINK_ENUM_TYPE)) + return log_debug_errno(SYNTHETIC_ERRNO(ENETUNREACH), "Symbol '%s' is referenced as type but is not a type.", field->named_type); + + field->symbol = found; + } + + return 0; +} + +static int varlink_idl_resolve_types(VarlinkInterface *interface) { + int r; + + assert(interface); + + for (VarlinkSymbol **symbol = (VarlinkSymbol**) interface->symbols; *symbol; symbol++) { + r = 
varlink_idl_resolve_symbol_types(interface, *symbol); + if (r < 0) + return r; + } + + return 0; +} + +int varlink_idl_parse( + const char *text, + unsigned *line, + unsigned *column, + VarlinkInterface **ret) { + + _cleanup_(varlink_interface_freep) VarlinkInterface *interface = NULL; + _cleanup_(varlink_symbol_freep) VarlinkSymbol *symbol = NULL; + enum { + STATE_PRE_INTERFACE, + STATE_INTERFACE, + STATE_PRE_SYMBOL, + STATE_METHOD, + STATE_METHOD_NAME, + STATE_METHOD_ARROW, + STATE_TYPE, + STATE_TYPE_NAME, + STATE_ERROR, + STATE_ERROR_NAME, + STATE_DONE, + } state = STATE_PRE_INTERFACE; + const char *allowed_delimiters = "#", *allowed_chars = VALID_CHARS_RESERVED; + size_t n_symbols = 0, n_fields = 1; + unsigned _line = 0, _column = 1; + const char **p = &text; + int r; + + if (!line) + line = &_line; + if (!column) + column = &_column; + + while (state != STATE_DONE) { + _cleanup_free_ char *token = NULL; + + r = varlink_idl_subparse_token( + p, + line, + column, + allowed_delimiters, + allowed_chars, + &token); + if (r < 0) + return r; + + switch (state) { + + case STATE_PRE_INTERFACE: + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + if (streq(token, "#")) { + r = varlink_idl_subparse_comment(&text, line, column); + if (r < 0) + return r; + } else if (streq(token, "interface")) { + state = STATE_INTERFACE; + allowed_delimiters = NULL; + allowed_chars = VALID_CHARS_INTERFACE_NAME; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token); + break; + + case STATE_INTERFACE: + assert(!interface); + assert(n_symbols == 0); + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + r = varlink_interface_realloc(&interface, n_symbols); + if (r < 0) + return r; + + interface->name = TAKE_PTR(token); + state = STATE_PRE_SYMBOL; + allowed_delimiters = "#"; + allowed_chars = VALID_CHARS_RESERVED; + 
break; + + case STATE_PRE_SYMBOL: + if (!token) { + state = STATE_DONE; + break; + } + + if (streq(token, "#")) { + r = varlink_idl_subparse_comment(&text, line, column); + if (r < 0) + return r; + } else if (streq(token, "method")) { + state = STATE_METHOD; + allowed_chars = VALID_CHARS_IDENTIFIER; + } else if (streq(token, "type")) { + state = STATE_TYPE; + allowed_chars = VALID_CHARS_IDENTIFIER; + } else if (streq(token, "error")) { + state = STATE_ERROR; + allowed_chars = VALID_CHARS_IDENTIFIER; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token); + + break; + + case STATE_METHOD: + assert(!symbol); + n_fields = 0; + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + r = varlink_symbol_realloc(&symbol, n_fields); + if (r < 0) + return r; + + symbol->symbol_type = VARLINK_METHOD; + symbol->name = TAKE_PTR(token); + + r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_INPUT, 0); + if (r < 0) + return r; + + state = STATE_METHOD_ARROW; + allowed_chars = "->"; + break; + + case STATE_METHOD_ARROW: + assert(symbol); + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + if (!streq(token, "->")) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token); + + r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_OUTPUT, 0); + if (r < 0) + return r; + + r = varlink_interface_realloc(&interface, n_symbols + 1); + if (r < 0) + return r; + + interface->symbols[n_symbols++] = TAKE_PTR(symbol); + + state = STATE_PRE_SYMBOL; + allowed_chars = VALID_CHARS_RESERVED "#"; + break; + + case STATE_TYPE: + assert(!symbol); + n_fields = 0; + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + r = varlink_symbol_realloc(&symbol, 
n_fields); + if (r < 0) + return r; + + symbol->symbol_type = _VARLINK_SYMBOL_TYPE_INVALID; /* don't know yet if enum or struct, will be filled in by varlink_idl_subparse_struct_or_enum() */ + symbol->name = TAKE_PTR(token); + + r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_REGULAR, 0); + if (r < 0) + return r; + + r = varlink_interface_realloc(&interface, n_symbols + 1); + if (r < 0) + return r; + + interface->symbols[n_symbols++] = TAKE_PTR(symbol); + + state = STATE_PRE_SYMBOL; + allowed_chars = VALID_CHARS_RESERVED "#"; + break; + + case STATE_ERROR: + assert(!symbol); + n_fields = 0; + + if (!token) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column); + + r = varlink_symbol_realloc(&symbol, n_fields); + if (r < 0) + return r; + + symbol->symbol_type = VARLINK_ERROR; + symbol->name = TAKE_PTR(token); + + r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_REGULAR, 0); + if (r < 0) + return r; + + r = varlink_interface_realloc(&interface, n_symbols + 1); + if (r < 0) + return r; + + interface->symbols[n_symbols++] = TAKE_PTR(symbol); + + state = STATE_PRE_SYMBOL; + allowed_chars = VALID_CHARS_RESERVED "#"; + break; + + default: + assert_not_reached(); + } + } + + r = varlink_idl_resolve_types(interface); + if (r < 0) + return r; + + *ret = TAKE_PTR(interface); + return 0; +} + +bool varlink_idl_field_name_is_valid(const char *name) { + if (isempty(name)) + return false; + + /* Field names may start with lower or uppercase char, but no numerals or underscore */ + if (!strchr(LETTERS, name[0])) + return false; + + /* Otherwise fields may be alphanumerical or underscore, but no two underscore may immediately follow + * each other or be trailing */ + bool underscore = false; + for (const char *c = name + 1; *c; c++) { + if (*c == '_') { + if (underscore) + return false; + + underscore = true; + continue; + } + + if (!strchr(ALPHANUMERICAL, *c)) + 
return false; + + underscore = false; + } + + if (underscore) + return false; + + return true; +} + +bool varlink_idl_symbol_name_is_valid(const char *name) { + if (isempty(name)) + return false; + + /* We might want to reference VARLINK_STRUCT_TYPE and VARLINK_ENUM_TYPE symbols where we also + * reference native types, hence make sure the native type names are refused as symbol names. */ + if (STR_IN_SET(name, "bool", "int", "float", "string", "object")) + return false; + + /* Symbols must be named with an uppercase letter as first character */ + if (!strchr(UPPERCASE_LETTERS, name[0])) + return false; + + for (const char *c = name + 1; *c; c++) + if (!strchr(ALPHANUMERICAL, *c)) + return false; + + return true; +} + +bool varlink_idl_interface_name_is_valid(const char *name) { + if (isempty(name)) + return false; + + /* Interface names must start with a letter, uppercase or lower case, but nothing else */ + if (!strchr(LETTERS, name[0])) + return false; + + /* Otherwise it may be a series of non-empty dot separated labels, which are alphanumerical and may + * contain single dashes in the middle */ + bool dot = false, dash = false; + for (const char *c = name + 1; *c; c++) { + switch (*c) { + + case '.': + if (dot || dash) + return false; + + dot = true; + break; + + case '-': + if (dot || dash) + return false; + + dash = true; + break; + + default: + if (!strchr(ALPHANUMERICAL, *c)) + return false; + + dot = dash = false; + } + } + + if (dot || dash) + return false; + + return true; +} + +static int varlink_idl_symbol_consistent(const VarlinkInterface *interface, const VarlinkSymbol *symbol, int level); + +static int varlink_idl_field_consistent( + const VarlinkInterface *interface, + const VarlinkSymbol *symbol, + const VarlinkField *field, + int level) { + + const char *symbol_name; + int r; + + assert(interface); + assert(symbol); + assert(field); + assert(field->name); + + symbol_name = symbol->name ?: ""; + + if (field->field_type <= 0 || field->field_type 
>= _VARLINK_FIELD_TYPE_MAX) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Field type for '%s' in symbol '%s' is not valid, refusing.", field->name, symbol_name); + + if (field->field_type == VARLINK_ENUM_VALUE) { + + if (symbol->symbol_type != VARLINK_ENUM_TYPE) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Enum field type for '%s' in non-enum symbol '%s', refusing.", field->name, symbol_name); + + if (field->field_flags != 0) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Enum field '%s' in symbol '%s' has non-zero flags set, refusing.", field->name, symbol_name); + } else { + if (symbol->symbol_type == VARLINK_ENUM_TYPE) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Non-enum field type for '%s' in enum symbol '%s', refusing.", field->name, symbol_name); + + if (!IN_SET(field->field_flags & ~VARLINK_NULLABLE, 0, VARLINK_ARRAY, VARLINK_MAP)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Flags of field '%s' in symbol '%s' is invalid, refusing.", field->name, symbol_name); + } + + if (symbol->symbol_type != VARLINK_METHOD) { + if (field->field_direction != VARLINK_REGULAR) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Direction of '%s' in non-method symbol '%s' not regular, refusing.", field->name, symbol_name); + } else { + if (!IN_SET(field->field_direction, VARLINK_INPUT, VARLINK_OUTPUT)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Direction of '%s' in method symbol '%s' is not input or output, refusing.", field->name, symbol_name); + } + + if (field->symbol) { + if (!IN_SET(field->field_type, VARLINK_STRUCT, VARLINK_ENUM, VARLINK_NAMED_TYPE)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Target symbol for field '%s' in symbol '%s' defined for elemental field, refusing.", field->name, symbol_name); + + if (field->field_type == VARLINK_NAMED_TYPE) { + const VarlinkSymbol *found; + + if (!field->symbol->name || !field->named_type || !streq(field->symbol->name, 
field->named_type)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Resolved symbol name and named type of field '%s' in symbol '%s' do do not match, refusing.", field->name, symbol_name); + + /* If this is a named type, then check if it's properly part of the interface */ + found = varlink_idl_find_symbol(interface, _VARLINK_SYMBOL_TYPE_INVALID, field->symbol->name); + if (!found) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Resolved symbol of named type of field '%s' in symbol '%s' is not part of the interface, refusing.", field->name, symbol_name); + + if (!IN_SET(found->symbol_type, VARLINK_ENUM_TYPE, VARLINK_STRUCT_TYPE)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Resolved symbol of named type of field '%s' in symbol '%s' is not a type, refusing.", field->name, symbol_name); + } else { + /* If this is an anonymous type, then we recursively check if it's consistent, since + * it's not part of the interface, and hence we won't validate it from there. 
*/ + + r = varlink_idl_symbol_consistent(interface, field->symbol, level); + if (r < 0) + return r; + } + + } else { + if (IN_SET(field->field_type, VARLINK_STRUCT, VARLINK_ENUM, VARLINK_NAMED_TYPE)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "No target symbol for field '%s' in symbol '%s' defined for elemental field, refusing.", field->name, symbol_name); + + if (field->named_type) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Unresolved symbol in field '%s' in symbol '%s', refusing.", field->name, symbol_name); + } + + if (field->named_type) { + if (field->field_type != VARLINK_NAMED_TYPE) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Named type set for field '%s' in symbol '%s' but not a named type field, refusing.", field->name, symbol_name); + } else { + if (field->field_type == VARLINK_NAMED_TYPE) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "No named type set for field '%s' in symbol '%s' but field is a named type field, refusing.", field->name, symbol_name); + } + + return 0; +} + +static bool varlink_symbol_is_empty(const VarlinkSymbol *symbol) { + assert(symbol); + + return symbol->fields[0].field_type == _VARLINK_FIELD_TYPE_END_MARKER; +} + +static int varlink_idl_symbol_consistent( + const VarlinkInterface *interface, + const VarlinkSymbol *symbol, + int level) { + + _cleanup_(set_freep) Set *input_set = NULL, *output_set = NULL; + const char *symbol_name; + int r; + + assert(interface); + assert(symbol); + + symbol_name = symbol->name ?: ""; + + if (symbol->symbol_type < 0 || symbol->symbol_type >= _VARLINK_SYMBOL_TYPE_MAX) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Symbol type for '%s' is not valid, refusing.", symbol_name); + + if (IN_SET(symbol->symbol_type, VARLINK_STRUCT_TYPE, VARLINK_ENUM_TYPE) && varlink_symbol_is_empty(symbol)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Symbol '%s' is empty, refusing.", symbol_name); + + for (const VarlinkField *field = 
symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) { + Set **name_set = field->field_direction == VARLINK_OUTPUT ? &output_set : &input_set; /* for the method case we need two separate sets, otherwise we use the same */ + + if (!varlink_idl_field_name_is_valid(field->name)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Field name '%s' in symbol '%s' not valid, refusing.", field->name, symbol_name); + + if (set_contains(*name_set, field->name)) + return log_full_errno(level, SYNTHETIC_ERRNO(ENOTUNIQ), "Field '%s' defined twice in symbol '%s', refusing.", field->name, symbol_name); + + if (set_ensure_put(name_set, &string_hash_ops, field->name) < 0) + return log_oom(); + + r = varlink_idl_field_consistent(interface, symbol, field, level); + if (r < 0) + return r; + } + + return 0; +} + +int varlink_idl_consistent(const VarlinkInterface *interface, int level) { + _cleanup_(set_freep) Set *name_set = NULL; + int r; + + assert(interface); + + if (!varlink_idl_interface_name_is_valid(interface->name)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Interface name '%s' is not valid, refusing.", interface->name); + + for (const VarlinkSymbol *const *symbol = interface->symbols; *symbol; symbol++) { + + if (!varlink_idl_symbol_name_is_valid((*symbol)->name)) + return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Symbol name '%s' is not valid, refusing.", strempty((*symbol)->name)); + + if (set_contains(name_set, (*symbol)->name)) + return log_full_errno(level, SYNTHETIC_ERRNO(ENOTUNIQ), "Symbol '%s' defined twice in interface, refusing.", (*symbol)->name); + + if (set_ensure_put(&name_set, &string_hash_ops, (*symbol)->name) < 0) + return log_oom(); + + r = varlink_idl_symbol_consistent(interface, *symbol, level); + if (r < 0) + return r; + } + + return 0; +} + +static int varlink_idl_validate_symbol(const VarlinkSymbol *symbol, JsonVariant *v, VarlinkFieldDirection direction, const char **bad_field); + +static int 
varlink_idl_validate_field_element_type(const VarlinkField *field, JsonVariant *v) { + assert(field); + + switch (field->field_type) { + + case VARLINK_STRUCT: + case VARLINK_ENUM: + case VARLINK_NAMED_TYPE: + return varlink_idl_validate_symbol(field->symbol, v, VARLINK_REGULAR, NULL); + + case VARLINK_BOOL: + if (!json_variant_is_boolean(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be a bool, but it is not, refusing.", strna(field->name)); + + break; + + case VARLINK_INT: + if (!json_variant_is_integer(v) && !json_variant_is_unsigned(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an int, but it is not, refusing.", strna(field->name)); + + break; + + case VARLINK_FLOAT: + if (!json_variant_is_number(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be a float, but it is not, refusing.", strna(field->name)); + + break; + + case VARLINK_STRING: + if (!json_variant_is_string(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be a string, but it is not, refusing.", strna(field->name)); + + break; + + case VARLINK_OBJECT: + if (!json_variant_is_object(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an object, but it is not, refusing.", strna(field->name)); + + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int varlink_idl_validate_field(const VarlinkField *field, JsonVariant *v) { + int r; + + assert(field); + + if (!v || json_variant_is_null(v)) { + + if (!FLAGS_SET(field->field_flags, VARLINK_NULLABLE)) + return log_debug_errno(SYNTHETIC_ERRNO(ENOANO), "Mandatory field '%s' is null or missing on object, refusing.", strna(field->name)); + + } else if (FLAGS_SET(field->field_flags, VARLINK_ARRAY)) { + JsonVariant *i; + + if (!json_variant_is_array(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an array, but it is not, refusing.", strna(field->name)); + + 
JSON_VARIANT_ARRAY_FOREACH(i, v) { + r = varlink_idl_validate_field_element_type(field, i); + if (r < 0) + return r; + } + + } else if (FLAGS_SET(field->field_flags, VARLINK_MAP)) { + _unused_ const char *k; + JsonVariant *e; + + if (!json_variant_is_object(v)) + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an object, but it is not, refusing.", strna(field->name)); + + JSON_VARIANT_OBJECT_FOREACH(k, e, v) { + r = varlink_idl_validate_field_element_type(field, e); + if (r < 0) + return r; + } + } else { + + r = varlink_idl_validate_field_element_type(field, v); + if (r < 0) + return r; + } + + return 0; +} + +static int varlink_idl_validate_symbol(const VarlinkSymbol *symbol, JsonVariant *v, VarlinkFieldDirection direction, const char **bad_field) { + int r; + + assert(symbol); + + if (!v) { + if (bad_field) + *bad_field = NULL; + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Null object passed, refusing."); + } + + switch (symbol->symbol_type) { + + case VARLINK_ENUM_TYPE: { + bool found = false; + const char *s; + + if (!json_variant_is_string(v)) { + if (bad_field) + *bad_field = symbol->name; + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Passed non-string to enum field '%s', refusing.", strna(symbol->name)); + } + + assert_se(s = json_variant_string(v)); + + for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) { + + assert(field->field_type == VARLINK_ENUM_VALUE); + + if (streq_ptr(field->name, s)) { + found = true; + break; + } + } + + if (!found) { + if (bad_field) + *bad_field = s; + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Passed unrecognized string '%s' to enum field '%s', refusing.", s, strna(symbol->name)); + } + + break; + } + + case VARLINK_STRUCT_TYPE: + case VARLINK_METHOD: + case VARLINK_ERROR: { + if (!json_variant_is_object(v)) { + if (bad_field) + *bad_field = symbol->name; + return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), 
"Passed non-object to field '%s', refusing.", strna(symbol->name)); + } + + for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) { + + if (field->field_direction != direction) + continue; + + r = varlink_idl_validate_field(field, json_variant_by_key(v, field->name)); + if (r < 0) { + if (bad_field) + *bad_field = field->name; + return r; + } + } + + _unused_ JsonVariant *e; + const char *name; + JSON_VARIANT_OBJECT_FOREACH(name, e, v) { + if (!varlink_idl_find_field(symbol, name)) { + if (bad_field) + *bad_field = name; + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Field '%s' not defined for object, refusing.", name); + } + } + + break; + } + + default: + assert_not_reached(); + } + + return 1; /* validated */ +} + +static int varlink_idl_validate_method(const VarlinkSymbol *method, JsonVariant *v, VarlinkFieldDirection direction, const char **bad_field) { + assert(IN_SET(direction, VARLINK_INPUT, VARLINK_OUTPUT)); + + if (!method) + return 0; /* Can't validate */ + if (method->symbol_type != VARLINK_METHOD) + return -EBADMSG; + + return varlink_idl_validate_symbol(method, v, direction, bad_field); +} + +int varlink_idl_validate_method_call(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field) { + return varlink_idl_validate_method(method, v, VARLINK_INPUT, bad_field); +} + +int varlink_idl_validate_method_reply(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field) { + return varlink_idl_validate_method(method, v, VARLINK_OUTPUT, bad_field); +} + +int varlink_idl_validate_error(const VarlinkSymbol *error, JsonVariant *v, const char **bad_field) { + if (!error) + return 0; /* Can't validate */ + if (error->symbol_type != VARLINK_ERROR) + return -EBADMSG; + + return varlink_idl_validate_symbol(error, v, VARLINK_REGULAR, bad_field); +} + +const VarlinkSymbol* varlink_idl_find_symbol( + const VarlinkInterface *interface, + VarlinkSymbolType type, + const char *name) { + + 
assert(interface); + assert(type < _VARLINK_SYMBOL_TYPE_MAX); + + if (isempty(name)) + return NULL; + + for (const VarlinkSymbol *const*symbol = interface->symbols; *symbol; symbol++) { /* the symbols[] array ends at its first NULL entry */ + if (type >= 0 && (*symbol)->symbol_type != type) + continue; /* a negative type disables this filter, so symbols of any type can match */ + + if (streq_ptr((*symbol)->name, name)) + return *symbol; + } + + return NULL; +} + +const VarlinkField* varlink_idl_find_field( /* Returns the field of symbol whose name equals name, or NULL if there is none */ + const VarlinkSymbol *symbol, + const char *name) { + + assert(symbol); + + if (isempty(name)) + return NULL; + + for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) /* fields[] ends at the end marker entry */ + if (streq_ptr(field->name, name)) + return field; + + return NULL; +} diff --git a/src/shared/varlink-idl.h b/src/shared/varlink-idl.h new file mode 100644 index 0000000..140b937 --- /dev/null +++ b/src/shared/varlink-idl.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "json.h" +#include "macro.h" + +/* This implements the Varlink Interface Definition Language ("Varlink IDL"), + * i.e. https://varlink.org/Interface-Definition + * + * Primarily allows encoding static interface definitions in C code, that can be converted to the textual IDL + * format on-the-fly. Can also parse the textual format back to C structures. Validates the interface + * definitions for internal consistency and validates JSON objects against the interface definitions.
*/ + +typedef enum VarlinkSymbolType { + VARLINK_ENUM_TYPE, + VARLINK_STRUCT_TYPE, + VARLINK_METHOD, + VARLINK_ERROR, + _VARLINK_SYMBOL_TYPE_MAX, + _VARLINK_SYMBOL_TYPE_INVALID = -EINVAL, +} VarlinkSymbolType; + +typedef enum VarlinkFieldType { + _VARLINK_FIELD_TYPE_END_MARKER = 0, /* zero type means: this is the last entry in the fields[] array of VarlinkSymbol */ + VARLINK_STRUCT, + VARLINK_ENUM, + VARLINK_NAMED_TYPE, + VARLINK_BOOL, + VARLINK_INT, + VARLINK_FLOAT, + VARLINK_STRING, + VARLINK_OBJECT, + VARLINK_ENUM_VALUE, + _VARLINK_FIELD_TYPE_MAX, + _VARLINK_FIELD_TYPE_INVALID = -EINVAL, +} VarlinkFieldType; + +typedef enum VarlinkFieldDirection { + VARLINK_REGULAR, + VARLINK_INPUT, + VARLINK_OUTPUT, + _VARLINK_FIELD_DIRECTION_MAX, + _VARLINK_FIELD_DIRECTION_INVALID = -EINVAL, +} VarlinkFieldDirection; + +typedef enum VarlinkFieldFlags { + VARLINK_ARRAY = 1 << 0, + VARLINK_MAP = 1 << 1, + VARLINK_NULLABLE = 1 << 2, + _VARLINK_FIELD_FLAGS_MAX = (1 << 3) - 1, + _VARLINK_FIELD_FLAGS_INVALID = -EINVAL, +} VarlinkFieldFlags; + +typedef struct VarlinkField VarlinkField; +typedef struct VarlinkSymbol VarlinkSymbol; +typedef struct VarlinkInterface VarlinkInterface; + +/* Fields are the components making up symbols */ +struct VarlinkField { + const char *name; + VarlinkFieldType field_type; + VarlinkFieldFlags field_flags; + VarlinkFieldDirection field_direction; /* in case of method call fields: whether input or output argument */ + const VarlinkSymbol *symbol; /* VARLINK_STRUCT, VARLINK_ENUM: anonymous symbol that carries the definitions, VARLINK_NAMED_TYPE: resolved symbol */ + const char *named_type; /* VARLINK_NAMED_TYPE */ +}; + +/* Symbols are primary named concepts in an interface, and are methods, errors or named types (either enum or struct). 
*/ +struct VarlinkSymbol { + const char *name; /* most symbols have a name, but sometimes they are created on-the-fly for fields, in which case they are anonymous */ + VarlinkSymbolType symbol_type; + VarlinkField fields[]; /* terminated by an entry whose field_type is _VARLINK_FIELD_TYPE_END_MARKER */ +}; + +/* An interface definition has a name and consists of symbols */ +struct VarlinkInterface { + const char *name; + const VarlinkSymbol *symbols[]; /* terminated by a NULL entry */ +}; + +#define VARLINK_DEFINE_FIELD(_name, _field_type, _field_flags) /* initializer for one VarlinkField; field_direction stays at its zero value, VARLINK_REGULAR */ \ + { .name = #_name, .field_type = (_field_type), .field_flags = (_field_flags) } + +#define VARLINK_DEFINE_FIELD_BY_TYPE(_name, _named_type, _field_flags) /* field whose type is the named symbol vl_type_ ## _named_type, which has to be declared where the macro is used */ \ + { .name = #_name, .field_type = VARLINK_NAMED_TYPE, .named_type = #_named_type, .symbol = &vl_type_ ## _named_type, .field_flags = (_field_flags) } + +#define VARLINK_DEFINE_INPUT(_name, _field_type, _field_flags) /* like VARLINK_DEFINE_FIELD(), but with direction VARLINK_INPUT */ \ + { .name = #_name, .field_type = (_field_type), .field_flags = (_field_flags), .field_direction = VARLINK_INPUT } + +#define VARLINK_DEFINE_INPUT_BY_TYPE(_name, _named_type, _field_flags) /* like VARLINK_DEFINE_FIELD_BY_TYPE(), but with direction VARLINK_INPUT */ \ + { .name = #_name, .field_type = VARLINK_NAMED_TYPE, .named_type = #_named_type, .symbol = &vl_type_ ## _named_type, .field_flags = (_field_flags), .field_direction = VARLINK_INPUT } + +#define VARLINK_DEFINE_OUTPUT(_name, _field_type, _field_flags) /* like VARLINK_DEFINE_FIELD(), but with direction VARLINK_OUTPUT */ \ + { .name = #_name, .field_type = (_field_type), .field_flags = (_field_flags), .field_direction = VARLINK_OUTPUT } + +#define VARLINK_DEFINE_OUTPUT_BY_TYPE(_name, _named_type, _field_flags) /* like VARLINK_DEFINE_FIELD_BY_TYPE(), but with direction VARLINK_OUTPUT */ \ + { .name = #_name, .field_type = VARLINK_NAMED_TYPE, .named_type = #_named_type, .symbol = &vl_type_ ## _named_type, .field_flags = (_field_flags), .field_direction = VARLINK_OUTPUT } + +#define VARLINK_DEFINE_ENUM_VALUE(_name) /* one permitted value of an enum type, stored as a field of type VARLINK_ENUM_VALUE */ \ + { .name = #_name, .field_type = VARLINK_ENUM_VALUE } + +#define VARLINK_DEFINE_METHOD(_name, ...) /* defines the symbol vl_method_ ## _name from the listed fields; no storage class is included, callers may prefix one such as static */ \ + const VarlinkSymbol vl_method_ ## _name = { \ + .name = #_name, \ + .symbol_type = VARLINK_METHOD, \ + .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, /* the final {} is the zero-typed end marker entry */ \ + } + +#define VARLINK_DEFINE_ERROR(_name, ...)
\ + const VarlinkSymbol vl_error_ ## _name = { \ + .name = #_name, \ + .symbol_type = VARLINK_ERROR, \ + .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \ + } + +#define VARLINK_DEFINE_STRUCT_TYPE(_name, ...) \ + const VarlinkSymbol vl_type_ ## _name = { \ + .name = #_name, \ + .symbol_type = VARLINK_STRUCT_TYPE, \ + .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \ + } + +#define VARLINK_DEFINE_ENUM_TYPE(_name, ...) \ + const VarlinkSymbol vl_type_ ## _name = { \ + .name = #_name, \ + .symbol_type = VARLINK_ENUM_TYPE, \ + .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \ + } + +#define VARLINK_DEFINE_INTERFACE(_name, _full_name, ...) \ + const VarlinkInterface vl_interface_ ## _name = { \ + .name = (_full_name), \ + .symbols = { __VA_ARGS__ __VA_OPT__(,) NULL}, \ + } + +int varlink_idl_dump(FILE *f, int use_colors, const VarlinkInterface *interface); +int varlink_idl_format(const VarlinkInterface *interface, char **ret); + +int varlink_idl_parse(const char *text, unsigned *ret_line, unsigned *ret_column, VarlinkInterface **ret); +VarlinkInterface* varlink_interface_free(VarlinkInterface *interface); +DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkInterface*, varlink_interface_free); + +bool varlink_idl_field_name_is_valid(const char *name); +bool varlink_idl_symbol_name_is_valid(const char *name); +bool varlink_idl_interface_name_is_valid(const char *name); + +int varlink_idl_consistent(const VarlinkInterface *interface, int level); + +const VarlinkSymbol* varlink_idl_find_symbol(const VarlinkInterface *interface, VarlinkSymbolType type, const char *name); +const VarlinkField* varlink_idl_find_field(const VarlinkSymbol *symbol, const char *name); + +int varlink_idl_validate_method_call(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field); +int varlink_idl_validate_method_reply(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field); +int varlink_idl_validate_error(const VarlinkSymbol *error, JsonVariant *v, const char **bad_field); diff --git 
a/src/shared/varlink-internal.h b/src/shared/varlink-internal.h new file mode 100644 index 0000000..715202a --- /dev/null +++ b/src/shared/varlink-internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "fdset.h" +#include "varlink.h" + +int varlink_server_serialize(VarlinkServer *s, FILE *f, FDSet *fds); +int varlink_server_deserialize_one(VarlinkServer *s, const char *value, FDSet *fds); diff --git a/src/shared/varlink-io.systemd.Journal.c b/src/shared/varlink-io.systemd.Journal.c new file mode 100644 index 0000000..b93fb72 --- /dev/null +++ b/src/shared/varlink-io.systemd.Journal.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.Journal.h" + +static VARLINK_DEFINE_METHOD(Synchronize); +static VARLINK_DEFINE_METHOD(Rotate); +static VARLINK_DEFINE_METHOD(FlushToVar); +static VARLINK_DEFINE_METHOD(RelinquishVar); + +static VARLINK_DEFINE_ERROR(NotSupportedByNamespaces); + +VARLINK_DEFINE_INTERFACE( + io_systemd_Journal, + "io.systemd.Journal", + &vl_method_Synchronize, + &vl_method_Rotate, + &vl_method_FlushToVar, + &vl_method_RelinquishVar, + &vl_error_NotSupportedByNamespaces); diff --git a/src/shared/varlink-io.systemd.Journal.h b/src/shared/varlink-io.systemd.Journal.h new file mode 100644 index 0000000..0bc94a7 --- /dev/null +++ b/src/shared/varlink-io.systemd.Journal.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_Journal; diff --git a/src/shared/varlink-io.systemd.ManagedOOM.c b/src/shared/varlink-io.systemd.ManagedOOM.c new file mode 100644 index 0000000..d6414b3 --- /dev/null +++ b/src/shared/varlink-io.systemd.ManagedOOM.c @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.ManagedOOM.h" + +/* Pull in vl_type_ControlGroup, since both interfaces need it */ +#include 
"varlink-io.systemd.oom.h" + +/* This is PID1's Varlink service, where PID 1 is the server and oomd is the client. + * + * Compare with io.systemd.oom where the client/server roles of oomd and the service manager are swapped! */ + +static VARLINK_DEFINE_METHOD( + SubscribeManagedOOMCGroups, + VARLINK_DEFINE_OUTPUT_BY_TYPE(cgroups, ControlGroup, VARLINK_ARRAY)); + +static VARLINK_DEFINE_ERROR(SubscriptionTaken); + +VARLINK_DEFINE_INTERFACE( + io_systemd_ManagedOOM, + "io.systemd.ManagedOOM", + &vl_method_SubscribeManagedOOMCGroups, + &vl_type_ControlGroup, + &vl_error_SubscriptionTaken); diff --git a/src/shared/varlink-io.systemd.ManagedOOM.h b/src/shared/varlink-io.systemd.ManagedOOM.h new file mode 100644 index 0000000..2c8bf54 --- /dev/null +++ b/src/shared/varlink-io.systemd.ManagedOOM.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_ManagedOOM; diff --git a/src/shared/varlink-io.systemd.PCRExtend.c b/src/shared/varlink-io.systemd.PCRExtend.c new file mode 100644 index 0000000..37d403f --- /dev/null +++ b/src/shared/varlink-io.systemd.PCRExtend.c @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.PCRExtend.h" + +static VARLINK_DEFINE_METHOD( + Extend, + VARLINK_DEFINE_INPUT(pcr, VARLINK_INT, 0), + VARLINK_DEFINE_INPUT(text, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(data, VARLINK_STRING, VARLINK_NULLABLE)); + +VARLINK_DEFINE_INTERFACE( + io_systemd_PCRExtend, + "io.systemd.PCRExtend", + &vl_method_Extend); diff --git a/src/shared/varlink-io.systemd.PCRExtend.h b/src/shared/varlink-io.systemd.PCRExtend.h new file mode 100644 index 0000000..ffc075a --- /dev/null +++ b/src/shared/varlink-io.systemd.PCRExtend.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_PCRExtend; diff 
--git a/src/shared/varlink-io.systemd.Resolve.Monitor.c b/src/shared/varlink-io.systemd.Resolve.Monitor.c new file mode 100644 index 0000000..d95b613 --- /dev/null +++ b/src/shared/varlink-io.systemd.Resolve.Monitor.c @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.Resolve.Monitor.h" + +VARLINK_DEFINE_STRUCT_TYPE( + ResourceKey, + VARLINK_DEFINE_FIELD(class, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(type, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(name, VARLINK_STRING, 0)); + +VARLINK_DEFINE_STRUCT_TYPE( + ResourceRecord, + VARLINK_DEFINE_FIELD_BY_TYPE(key, ResourceKey, 0), + VARLINK_DEFINE_FIELD(priority, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(weight, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(port, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(name, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(cpu, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(os, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(items, VARLINK_STRING, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(address, VARLINK_INT, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(mname, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(rname, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(serial, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(refresh, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(expire, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(minimum, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(exchange, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(version, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(size, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(horiz_pre, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(vert_pre, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(latitude, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(longitude, VARLINK_INT, 
VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(altitude, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(keyTag, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(algorithm, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(digestType, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(digest, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(fptype, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(fingerprint, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(flags, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(protocol, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(dnskey, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(signer, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(typeCovered, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(labels, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(originalTtl, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(expiration, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(inception, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(signature, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(nextDomain, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(types, VARLINK_INT, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(iterations, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(salt, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(hash, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(certUsage, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(selector, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(matchingType, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(data, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(tag, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(value, VARLINK_STRING, VARLINK_NULLABLE)); + +VARLINK_DEFINE_STRUCT_TYPE( + ResourceRecordArray, + VARLINK_DEFINE_FIELD_BY_TYPE(rr, ResourceRecord, VARLINK_NULLABLE), + 
VARLINK_DEFINE_FIELD(raw, VARLINK_STRING, 0)); + +VARLINK_DEFINE_STRUCT_TYPE( + Answer, + VARLINK_DEFINE_FIELD_BY_TYPE(rr, ResourceRecord, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(raw, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE)); + +VARLINK_DEFINE_METHOD( + SubscribeQueryResults, + /* First reply */ + VARLINK_DEFINE_OUTPUT(ready, VARLINK_BOOL, VARLINK_NULLABLE), + /* Subsequent replies */ + VARLINK_DEFINE_OUTPUT(state, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(rcode, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(errno, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT_BY_TYPE(question, ResourceKey, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_OUTPUT_BY_TYPE(collectedQuestions, ResourceKey, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_OUTPUT_BY_TYPE(answer, Answer, VARLINK_NULLABLE|VARLINK_ARRAY)); + +VARLINK_DEFINE_STRUCT_TYPE( + CacheEntry, + VARLINK_DEFINE_FIELD_BY_TYPE(key, ResourceKey, 0), + VARLINK_DEFINE_FIELD_BY_TYPE(rrs, ResourceRecordArray, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(type, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(until, VARLINK_INT, 0)); + +VARLINK_DEFINE_STRUCT_TYPE( + ScopeCache, + VARLINK_DEFINE_FIELD(protocol, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(family, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(ifname, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD_BY_TYPE(cache, CacheEntry, VARLINK_ARRAY)); + +VARLINK_DEFINE_METHOD( + DumpCache, + VARLINK_DEFINE_OUTPUT_BY_TYPE(dump, ScopeCache, VARLINK_ARRAY)); + +VARLINK_DEFINE_STRUCT_TYPE( + ServerState, + VARLINK_DEFINE_FIELD(Server, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(Type, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(Interface, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(InterfaceIndex, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(VerifiedFeatureLevel, 
VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(PossibleFeatureLevel, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(DNSSECMode, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(DNSSECSupported, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD(ReceivedUDPFragmentMax, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(FailedUDPAttempts, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(FailedTCPAttempts, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(PacketTruncated, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD(PacketBadOpt, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD(PacketRRSIGMissing, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD(PacketInvalid, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD(PacketDoOff, VARLINK_BOOL, 0)); + +VARLINK_DEFINE_METHOD( + DumpServerState, + VARLINK_DEFINE_OUTPUT_BY_TYPE(dump, ServerState, VARLINK_ARRAY)); + +VARLINK_DEFINE_STRUCT_TYPE( + TransactionStatistics, + VARLINK_DEFINE_FIELD(currentTransactions, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(totalTransactions, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(totalTimeouts, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(totalTimeoutsServedStale, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(totalFailedResponses, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(totalFailedResponsesServedStale, VARLINK_INT, 0)); + +VARLINK_DEFINE_STRUCT_TYPE( + CacheStatistics, + VARLINK_DEFINE_FIELD(size, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(hits, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(misses, VARLINK_INT, 0)); + +VARLINK_DEFINE_STRUCT_TYPE( + DnssecStatistics, + VARLINK_DEFINE_FIELD(secure, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(insecure, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(bogus, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(indeterminate, VARLINK_INT, 0)); + +VARLINK_DEFINE_METHOD( + DumpStatistics, + VARLINK_DEFINE_OUTPUT_BY_TYPE(transactions, TransactionStatistics, 0), + VARLINK_DEFINE_OUTPUT_BY_TYPE(cache, CacheStatistics, 0), + VARLINK_DEFINE_OUTPUT_BY_TYPE(dnssec, DnssecStatistics, 0)); + +VARLINK_DEFINE_METHOD(ResetStatistics); + +VARLINK_DEFINE_INTERFACE( + io_systemd_Resolve_Monitor, + 
"io.systemd.Resolve.Monitor", + &vl_method_SubscribeQueryResults, + &vl_method_DumpCache, + &vl_method_DumpServerState, + &vl_method_DumpStatistics, + &vl_method_ResetStatistics, + &vl_type_ResourceKey, + &vl_type_ResourceRecord, + &vl_type_ResourceRecordArray, + &vl_type_Answer, + &vl_type_CacheEntry, + &vl_type_ScopeCache, + &vl_type_TransactionStatistics, + &vl_type_CacheStatistics, + &vl_type_DnssecStatistics, + &vl_type_ServerState); diff --git a/src/shared/varlink-io.systemd.Resolve.Monitor.h b/src/shared/varlink-io.systemd.Resolve.Monitor.h new file mode 100644 index 0000000..a133ec3 --- /dev/null +++ b/src/shared/varlink-io.systemd.Resolve.Monitor.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_Resolve_Monitor; diff --git a/src/shared/varlink-io.systemd.Resolve.c b/src/shared/varlink-io.systemd.Resolve.c new file mode 100644 index 0000000..0d8ad28 --- /dev/null +++ b/src/shared/varlink-io.systemd.Resolve.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.Resolve.h" + +static VARLINK_DEFINE_STRUCT_TYPE( + ResolvedAddress, + VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(family, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(address, VARLINK_INT, VARLINK_ARRAY)); + +static VARLINK_DEFINE_METHOD( + ResolveHostname, + VARLINK_DEFINE_INPUT(ifindex, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(name, VARLINK_STRING, 0), + VARLINK_DEFINE_INPUT(family, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(flags, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT_BY_TYPE(addresses, ResolvedAddress, VARLINK_ARRAY), + VARLINK_DEFINE_OUTPUT(name, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(flags, VARLINK_INT, 0)); + +static VARLINK_DEFINE_STRUCT_TYPE( + ResolvedName, + VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(name, 
VARLINK_STRING, 0)); + +static VARLINK_DEFINE_METHOD( + ResolveAddress, + VARLINK_DEFINE_INPUT(ifindex, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(family, VARLINK_INT, 0), + VARLINK_DEFINE_INPUT(address, VARLINK_INT, VARLINK_ARRAY), + VARLINK_DEFINE_INPUT(flags, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT_BY_TYPE(names, ResolvedName, VARLINK_ARRAY), + VARLINK_DEFINE_OUTPUT(flags, VARLINK_INT, 0)); + +static VARLINK_DEFINE_ERROR(NoNameServers); +static VARLINK_DEFINE_ERROR(NoSuchResourceRecord); +static VARLINK_DEFINE_ERROR(QueryTimedOut); +static VARLINK_DEFINE_ERROR(MaxAttemptsReached); +static VARLINK_DEFINE_ERROR(InvalidReply); +static VARLINK_DEFINE_ERROR(QueryAborted); +static VARLINK_DEFINE_ERROR( + DNSSECValidationFailed, + VARLINK_DEFINE_FIELD(result, VARLINK_STRING, 0)); +static VARLINK_DEFINE_ERROR(NoTrustAnchor); +static VARLINK_DEFINE_ERROR(ResourceRecordTypeUnsupported); +static VARLINK_DEFINE_ERROR(NetworkDown); +static VARLINK_DEFINE_ERROR(NoSource); +static VARLINK_DEFINE_ERROR(StubLoop); +static VARLINK_DEFINE_ERROR( + DNSError, + VARLINK_DEFINE_FIELD(rcode, VARLINK_INT, 0)); +static VARLINK_DEFINE_ERROR(CNAMELoop); +static VARLINK_DEFINE_ERROR(BadAddressSize); + +VARLINK_DEFINE_INTERFACE( + io_systemd_Resolve, + "io.systemd.Resolve", + &vl_method_ResolveHostname, + &vl_method_ResolveAddress, + &vl_type_ResolvedAddress, + &vl_type_ResolvedName, + &vl_error_NoNameServers, + &vl_error_NoSuchResourceRecord, + &vl_error_QueryTimedOut, + &vl_error_MaxAttemptsReached, + &vl_error_InvalidReply, + &vl_error_QueryAborted, + &vl_error_DNSSECValidationFailed, + &vl_error_NoTrustAnchor, + &vl_error_ResourceRecordTypeUnsupported, + &vl_error_NetworkDown, + &vl_error_NoSource, + &vl_error_StubLoop, + &vl_error_DNSError, + &vl_error_CNAMELoop, + &vl_error_BadAddressSize); diff --git a/src/shared/varlink-io.systemd.Resolve.h b/src/shared/varlink-io.systemd.Resolve.h new file mode 100644 index 0000000..5c7ed39 --- /dev/null +++ 
b/src/shared/varlink-io.systemd.Resolve.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_Resolve; diff --git a/src/shared/varlink-io.systemd.UserDatabase.c b/src/shared/varlink-io.systemd.UserDatabase.c new file mode 100644 index 0000000..c10a7d3 --- /dev/null +++ b/src/shared/varlink-io.systemd.UserDatabase.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.UserDatabase.h" + +static VARLINK_DEFINE_METHOD( + GetUserRecord, + VARLINK_DEFINE_INPUT(uid, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(userName, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(service, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(record, VARLINK_OBJECT, 0), + VARLINK_DEFINE_OUTPUT(incomplete, VARLINK_BOOL, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_METHOD( + GetGroupRecord, + VARLINK_DEFINE_INPUT(gid, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(groupName, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(service, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(record, VARLINK_OBJECT, 0), + VARLINK_DEFINE_OUTPUT(incomplete, VARLINK_BOOL, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_METHOD( + GetMemberships, + VARLINK_DEFINE_INPUT(userName, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(groupName, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(service, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(userName, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(groupName, VARLINK_STRING, 0)); + +static VARLINK_DEFINE_ERROR(NoRecordFound); +static VARLINK_DEFINE_ERROR(BadService); +static VARLINK_DEFINE_ERROR(ServiceNotAvailable); +static VARLINK_DEFINE_ERROR(ConflictingRecordNotFound); +static VARLINK_DEFINE_ERROR(EnumerationNotSupported); + +/* As per https://systemd.io/USER_GROUP_API/ */ +VARLINK_DEFINE_INTERFACE( + io_systemd_UserDatabase, + "io.systemd.UserDatabase", + 
&vl_method_GetUserRecord, + &vl_method_GetGroupRecord, + &vl_method_GetMemberships, + &vl_error_NoRecordFound, + &vl_error_BadService, + &vl_error_ServiceNotAvailable, + &vl_error_ConflictingRecordNotFound, + &vl_error_EnumerationNotSupported); diff --git a/src/shared/varlink-io.systemd.UserDatabase.h b/src/shared/varlink-io.systemd.UserDatabase.h new file mode 100644 index 0000000..346ca84 --- /dev/null +++ b/src/shared/varlink-io.systemd.UserDatabase.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_UserDatabase; diff --git a/src/shared/varlink-io.systemd.c b/src/shared/varlink-io.systemd.c new file mode 100644 index 0000000..cdfe9ac --- /dev/null +++ b/src/shared/varlink-io.systemd.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.h" + +/* These are local errors that never cross the wire, and are our own invention */ +static VARLINK_DEFINE_ERROR(Disconnected); +static VARLINK_DEFINE_ERROR(TimedOut); +static VARLINK_DEFINE_ERROR(Protocol); + +/* This one we invented, and use for generically propagating system errors (errno) to clients */ +static VARLINK_DEFINE_ERROR( + System, + VARLINK_DEFINE_FIELD(errno, VARLINK_INT, 0)); + +VARLINK_DEFINE_INTERFACE( + io_systemd, + "io.systemd", + &vl_error_Disconnected, + &vl_error_TimedOut, + &vl_error_Protocol, + &vl_error_System); diff --git a/src/shared/varlink-io.systemd.h b/src/shared/varlink-io.systemd.h new file mode 100644 index 0000000..6c17c6c --- /dev/null +++ b/src/shared/varlink-io.systemd.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd; diff --git a/src/shared/varlink-io.systemd.oom.c b/src/shared/varlink-io.systemd.oom.c new file mode 100644 index 0000000..e1da3fa --- /dev/null +++ b/src/shared/varlink-io.systemd.oom.c @@ -0,0 
+1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.oom.h" + +/* This is oomd's Varlink service, where oomd is server and systemd --user is the client. + * + * Compare with io.systemd.ManagedOOM where the client/server roles of the service manager and oomd are + * swapped! */ + +VARLINK_DEFINE_STRUCT_TYPE( + ControlGroup, + VARLINK_DEFINE_FIELD(mode, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(path, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(property, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(limit, VARLINK_INT, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_METHOD( + ReportManagedOOMCGroups, + VARLINK_DEFINE_INPUT_BY_TYPE(cgroups, ControlGroup, VARLINK_ARRAY)); + +VARLINK_DEFINE_INTERFACE( + io_systemd_oom, + "io.systemd.oom", + &vl_method_ReportManagedOOMCGroups, + &vl_type_ControlGroup); diff --git a/src/shared/varlink-io.systemd.oom.h b/src/shared/varlink-io.systemd.oom.h new file mode 100644 index 0000000..911dbc2 --- /dev/null +++ b/src/shared/varlink-io.systemd.oom.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkSymbol vl_type_ControlGroup; +extern const VarlinkInterface vl_interface_io_systemd_oom; diff --git a/src/shared/varlink-io.systemd.service.c b/src/shared/varlink-io.systemd.service.c new file mode 100644 index 0000000..e9df5de --- /dev/null +++ b/src/shared/varlink-io.systemd.service.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include + +#include "varlink-io.systemd.service.h" + +static VARLINK_DEFINE_METHOD(Ping); + +static VARLINK_DEFINE_METHOD(Reload); + +static VARLINK_DEFINE_METHOD( + SetLogLevel, + VARLINK_DEFINE_INPUT(level, VARLINK_INT, 0)); + +VARLINK_DEFINE_INTERFACE( + io_systemd_service, + "io.systemd.service", + &vl_method_Ping, + &vl_method_Reload, + &vl_method_SetLogLevel); + +int varlink_method_ping(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + 
assert(link); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); /* Ping takes no parameters */ + + log_debug("Received io.systemd.service.Ping"); + + return varlink_reply(link, NULL); +} + +int varlink_method_set_log_level(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { /* Handler for io.systemd.service.SetLogLevel: checks the level parameter and the UID of the peer, then passes the level to log_set_max_level() and sends an empty reply */ + static const JsonDispatch dispatch_table[] = { + { "level", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int64, 0, JSON_MANDATORY }, + {} + }; + + int64_t level; + uid_t uid; + int r; + + assert(link); + assert(parameters); + + /* NOTE: The method does have 1 parameter, but we must compare to 2 here, because + * json_variant_elements() breaks abstraction and exposes internal structure of JsonObject. */ + if (json_variant_elements(parameters) != 2) + return varlink_error_invalid_parameter(link, parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &level); + if (r != 0) + return r; /* anything but plain success is handed back to the caller, positive values included */ + + if (LOG_PRI(level) != level) + return varlink_error_invalid_parameter(link, parameters); /* refuse values that LOG_PRI() does not leave unchanged */ + + r = varlink_get_peer_uid(link, &uid); + if (r < 0) + return r; + + if (uid != getuid() && uid != 0) + return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, parameters); /* only UID 0 and the UID this process runs as are allowed */ + + log_debug("Received io.systemd.service.SetLogLevel(%" PRIi64 ")", level); + + log_set_max_level(level); + + return varlink_reply(link, NULL); +} diff --git a/src/shared/varlink-io.systemd.service.h b/src/shared/varlink-io.systemd.service.h new file mode 100644 index 0000000..bc90ff0 --- /dev/null +++ b/src/shared/varlink-io.systemd.service.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include "varlink.h" +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_service; + +int varlink_method_ping(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata); +int varlink_method_set_log_level(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata); diff --git
a/src/shared/varlink-io.systemd.sysext.c b/src/shared/varlink-io.systemd.sysext.c new file mode 100644 index 0000000..66e3534 --- /dev/null +++ b/src/shared/varlink-io.systemd.sysext.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.sysext.h" + +static VARLINK_DEFINE_ENUM_TYPE( + ImageClass, + VARLINK_DEFINE_ENUM_VALUE(sysext), + VARLINK_DEFINE_ENUM_VALUE(confext)); + +static VARLINK_DEFINE_ENUM_TYPE( + ImageType, + VARLINK_DEFINE_ENUM_VALUE(directory), + VARLINK_DEFINE_ENUM_VALUE(subvolume), + VARLINK_DEFINE_ENUM_VALUE(raw), + VARLINK_DEFINE_ENUM_VALUE(block)); + +static VARLINK_DEFINE_METHOD( + Merge, + VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(force, VARLINK_BOOL, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(noReload, VARLINK_BOOL, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(noexec, VARLINK_BOOL, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_METHOD( + Unmerge, + VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(noReload, VARLINK_BOOL, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_METHOD( + Refresh, + VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(force, VARLINK_BOOL, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(noReload, VARLINK_BOOL, VARLINK_NULLABLE), + VARLINK_DEFINE_INPUT(noexec, VARLINK_BOOL, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_METHOD( + List, + VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT_BY_TYPE(Class, ImageClass, 0), + VARLINK_DEFINE_OUTPUT_BY_TYPE(Type, ImageType, 0), + VARLINK_DEFINE_OUTPUT(Name, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(Path, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(ReadOnly, VARLINK_BOOL, 0), + VARLINK_DEFINE_OUTPUT(CreationTimestamp, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(ModificationTimestamp, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(Usage, VARLINK_INT, 
VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(UsageExclusive, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(Limit, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(LimitExclusive, VARLINK_INT, VARLINK_NULLABLE)); + +static VARLINK_DEFINE_ERROR(NoImagesFound); + +static VARLINK_DEFINE_ERROR( + AlreadyMerged, + VARLINK_DEFINE_FIELD(hierarchy, VARLINK_STRING, 0)); + +VARLINK_DEFINE_INTERFACE( + io_systemd_sysext, + "io.systemd.sysext", + &vl_type_ImageClass, + &vl_type_ImageType, + &vl_method_Merge, + &vl_method_Unmerge, + &vl_method_Refresh, + &vl_method_List, + &vl_error_NoImagesFound, + &vl_error_AlreadyMerged); diff --git a/src/shared/varlink-io.systemd.sysext.h b/src/shared/varlink-io.systemd.sysext.h new file mode 100644 index 0000000..ee649c6 --- /dev/null +++ b/src/shared/varlink-io.systemd.sysext.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_io_systemd_sysext; diff --git a/src/shared/varlink-org.varlink.service.c b/src/shared/varlink-org.varlink.service.c new file mode 100644 index 0000000..e5122c0 --- /dev/null +++ b/src/shared/varlink-org.varlink.service.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-org.varlink.service.h" + +static VARLINK_DEFINE_METHOD( + GetInfo, + VARLINK_DEFINE_OUTPUT(vendor, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(product, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(version, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(url, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(interfaces, VARLINK_STRING, VARLINK_ARRAY)); + +static VARLINK_DEFINE_METHOD( + GetInterfaceDescription, + VARLINK_DEFINE_INPUT(interface, VARLINK_STRING, 0), + VARLINK_DEFINE_OUTPUT(description, VARLINK_STRING, 0)); + +static VARLINK_DEFINE_ERROR( + InterfaceNotFound, + VARLINK_DEFINE_FIELD(interface, VARLINK_STRING, 0)); + +static VARLINK_DEFINE_ERROR( + MethodNotFound, + VARLINK_DEFINE_FIELD(method, 
VARLINK_STRING, 0)); + +static VARLINK_DEFINE_ERROR( + MethodNotImplemented, + VARLINK_DEFINE_FIELD(method, VARLINK_STRING, 0)); + +static VARLINK_DEFINE_ERROR( + InvalidParameter, + VARLINK_DEFINE_FIELD(parameter, VARLINK_STRING, 0)); + +static VARLINK_DEFINE_ERROR(PermissionDenied); + +static VARLINK_DEFINE_ERROR(ExpectedMore); + +/* As per https://varlink.org/Service */ +VARLINK_DEFINE_INTERFACE( + org_varlink_service, + "org.varlink.service", + &vl_method_GetInfo, + &vl_method_GetInterfaceDescription, + &vl_error_InterfaceNotFound, + &vl_error_MethodNotFound, + &vl_error_MethodNotImplemented, + &vl_error_InvalidParameter, + &vl_error_PermissionDenied, + &vl_error_ExpectedMore); diff --git a/src/shared/varlink-org.varlink.service.h b/src/shared/varlink-org.varlink.service.h new file mode 100644 index 0000000..75c55e6 --- /dev/null +++ b/src/shared/varlink-org.varlink.service.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "varlink-idl.h" + +extern const VarlinkInterface vl_interface_org_varlink_service; diff --git a/src/shared/varlink.c b/src/shared/varlink.c new file mode 100644 index 0000000..749b644 --- /dev/null +++ b/src/shared/varlink.c @@ -0,0 +1,3767 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "io-util.h" +#include "iovec-util.h" +#include "list.h" +#include "path-util.h" +#include "process-util.h" +#include "selinux-util.h" +#include "serialize.h" +#include "set.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "varlink.h" +#include "varlink-internal.h" +#include "varlink-org.varlink.service.h" +#include "varlink-io.systemd.h" +#include "version.h" + +#define VARLINK_DEFAULT_CONNECTIONS_MAX 4096U +#define 
/* High-level ("application layer") state of a Varlink connection object. The client side and server side
 * states below are exactly the ones VARLINK_STATE_IS_ALIVE() accepts; the "common" states are only entered
 * while the connection is being torn down. A freshly allocated object starts out as
 * _VARLINK_STATE_INVALID until varlink_set_state() is called for the first time. */
typedef enum VarlinkState {
        /* Client side states */
        VARLINK_IDLE_CLIENT,
        VARLINK_AWAITING_REPLY,
        VARLINK_AWAITING_REPLY_MORE,
        VARLINK_CALLING,
        VARLINK_CALLED,
        VARLINK_PROCESSING_REPLY,

        /* Server side states */
        VARLINK_IDLE_SERVER,
        VARLINK_PROCESSING_METHOD,
        VARLINK_PROCESSING_METHOD_MORE,
        VARLINK_PROCESSING_METHOD_ONEWAY,
        VARLINK_PROCESSED_METHOD,
        VARLINK_PENDING_METHOD,
        VARLINK_PENDING_METHOD_MORE,

        /* Common states (only during shutdown) */
        VARLINK_PENDING_DISCONNECT,
        VARLINK_PENDING_TIMEOUT,
        VARLINK_PROCESSING_DISCONNECT,
        VARLINK_PROCESSING_TIMEOUT,
        VARLINK_PROCESSING_FAILURE,
        VARLINK_DISCONNECTED,

        _VARLINK_STATE_MAX,                   /* number of valid states; also sizes the state name table */
        _VARLINK_STATE_INVALID = -EINVAL,     /* "no state set yet" marker, deliberately negative */
} VarlinkState;
/* One Varlink connection, either the client side or the server side of it. Bundles the protocol state,
 * the socket fd, the input/output byte buffers plus the file descriptors travelling with them, the queue
 * of not-yet-serialized outgoing messages, and the event sources used to drive the connection. */
struct Varlink {
        unsigned n_ref;

        VarlinkServer *server;

        VarlinkState state;
        bool connecting; /* This boolean indicates whether the socket fd we are operating on is currently
                          * processing an asynchronous connect(). In that state we watch the socket for
                          * EPOLLOUT, but we refrain from calling read() or write() on the socket as that
                          * will trigger ENOTCONN. Note that this boolean is kept separate from the
                          * VarlinkState above on purpose: while the connect() is still not complete we
                          * already want to allow queuing of messages and similar. Thus it's nice to keep
                          * these two state concepts separate: the VarlinkState encodes what our own view of
                          * the connection is, i.e. whether we think it's a server, a client, and has
                          * something queued already, while 'connecting' tells us a detail about the
                          * transport used below, that should have no effect on how we otherwise accept and
                          * process operations from the user.
                          *
                          * Or to say this differently: VARLINK_STATE_IS_ALIVE(state) tells you whether the
                          * connection is good to use, even if it might not be fully connected
                          * yet. connecting=true then informs you that actually we are still connecting, and
                          * the connection is actually not established yet and thus any requests you enqueue
                          * now will still work fine but will be queued only, not sent yet, but that
                          * shouldn't stop you from using the connection, since eventually whatever you queue
                          * *will* be sent.
                          *
                          * Or to say this even differently: 'state' is a high-level ("application layer"
                          * high, if you so will) state, while 'connecting' is a low-level ("transport layer"
                          * low, if you so will) state, and while they are not entirely unrelated and
                          * sometimes propagate effects to each other they are only asynchronously connected
                          * at most. */
        unsigned n_pending;

        int fd;

        char *input_buffer; /* valid data starts at input_buffer_index, ends at input_buffer_index+input_buffer_size */
        size_t input_buffer_index;
        size_t input_buffer_size;
        size_t input_buffer_unscanned; /* number of trailing bytes of the valid data not yet searched for a message terminator */

        void *input_control_buffer; /* heap buffer for ancillary data (SCM_RIGHTS) received along with input */
        size_t input_control_buffer_size;

        char *output_buffer; /* valid data starts at output_buffer_index, ends at output_buffer_index+output_buffer_size */
        size_t output_buffer_index;
        size_t output_buffer_size;

        int *input_fds; /* file descriptors associated with the data in input_buffer (for fd passing) */
        size_t n_input_fds;

        int *output_fds; /* file descriptors associated with the data in output_buffer (for fd passing) */
        size_t n_output_fds;

        /* Further messages to output not yet formatted into text, and thus not included in output_buffer
         * yet. We keep them separate from output_buffer, to not violate fd message boundaries: we want that
         * each message that is sent is associated with its fds, and that fds cannot be accidentally
         * associated with preceding or following messages. */
        LIST_HEAD(VarlinkJsonQueueItem, output_queue);
        VarlinkJsonQueueItem *output_queue_tail;

        /* The fds to associate with the next message that is about to be enqueued. The user first pushes the
         * fds it intends to send via varlink_push_fd() into this queue, and then once the message data is
         * submitted we'll combine the fds and the message data into one. */
        int *pushed_fds;
        size_t n_pushed_fds;

        VarlinkReply reply_callback;

        JsonVariant *current; /* the incoming message currently being processed, if any */
        VarlinkSymbol *current_method;

        struct ucred ucred;
        bool ucred_acquired:1;

        bool write_disconnected:1; /* a write to the socket failed with a disconnect error */
        bool read_disconnected:1; /* a read from the socket hit EOF or a disconnect error */
        bool prefer_read_write:1; /* fd is not a socket (ENOTSOCK seen), use read()/write() instead of recv()/send() */
        bool got_pollhup:1;

        bool allow_fd_passing_input:1;
        bool allow_fd_passing_output:1;

        bool output_buffer_sensitive:1; /* whether to erase the output buffer after writing it to the socket */

        int af; /* address family if socket; AF_UNSPEC if not socket; negative if not known */

        usec_t timestamp;
        usec_t timeout;

        void *userdata;
        char *description;

        sd_event *event;
        sd_event_source *io_event_source;
        sd_event_source *time_event_source;
        sd_event_source *quit_event_source;
        sd_event_source *defer_event_source;

        pid_t exec_pid; /* PID of the child forked off by varlink_connect_exec(), if any */
};
varlink_state_table[_VARLINK_STATE_MAX] = { + [VARLINK_IDLE_CLIENT] = "idle-client", + [VARLINK_AWAITING_REPLY] = "awaiting-reply", + [VARLINK_AWAITING_REPLY_MORE] = "awaiting-reply-more", + [VARLINK_CALLING] = "calling", + [VARLINK_CALLED] = "called", + [VARLINK_PROCESSING_REPLY] = "processing-reply", + [VARLINK_IDLE_SERVER] = "idle-server", + [VARLINK_PROCESSING_METHOD] = "processing-method", + [VARLINK_PROCESSING_METHOD_MORE] = "processing-method-more", + [VARLINK_PROCESSING_METHOD_ONEWAY] = "processing-method-oneway", + [VARLINK_PROCESSED_METHOD] = "processed-method", + [VARLINK_PENDING_METHOD] = "pending-method", + [VARLINK_PENDING_METHOD_MORE] = "pending-method-more", + [VARLINK_PENDING_DISCONNECT] = "pending-disconnect", + [VARLINK_PENDING_TIMEOUT] = "pending-timeout", + [VARLINK_PROCESSING_DISCONNECT] = "processing-disconnect", + [VARLINK_PROCESSING_TIMEOUT] = "processing-timeout", + [VARLINK_PROCESSING_FAILURE] = "processing-failure", + [VARLINK_DISCONNECTED] = "disconnected", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(varlink_state, VarlinkState); + +#define varlink_log_errno(v, error, fmt, ...) \ + log_debug_errno(error, "%s: " fmt, varlink_description(v), ##__VA_ARGS__) + +#define varlink_log(v, fmt, ...) \ + log_debug("%s: " fmt, varlink_description(v), ##__VA_ARGS__) + +#define varlink_server_log_errno(s, error, fmt, ...) \ + log_debug_errno(error, "%s: " fmt, varlink_server_description(s), ##__VA_ARGS__) + +#define varlink_server_log(s, fmt, ...) \ + log_debug("%s: " fmt, varlink_server_description(s), ##__VA_ARGS__) + +static int varlink_format_queue(Varlink *v); +static void varlink_server_test_exit_on_idle(VarlinkServer *s); + +static const char *varlink_description(Varlink *v) { + return (v ? v->description : NULL) ?: "varlink"; +} + +static const char *varlink_server_description(VarlinkServer *s) { + return (s ? 
s->description : NULL) ?: "varlink"; +} + +static VarlinkJsonQueueItem *varlink_json_queue_item_free(VarlinkJsonQueueItem *q) { + if (!q) + return NULL; + + json_variant_unref(q->data); + close_many(q->fds, q->n_fds); + + return mfree(q); +} + +static VarlinkJsonQueueItem *varlink_json_queue_item_new(JsonVariant *m, const int fds[], size_t n_fds) { + VarlinkJsonQueueItem *q; + + assert(m); + assert(fds || n_fds == 0); + + q = malloc(offsetof(VarlinkJsonQueueItem, fds) + sizeof(int) * n_fds); + if (!q) + return NULL; + + *q = (VarlinkJsonQueueItem) { + .data = json_variant_ref(m), + .n_fds = n_fds, + }; + + memcpy_safe(q->fds, fds, n_fds * sizeof(int)); + + return TAKE_PTR(q); +} + +static void varlink_set_state(Varlink *v, VarlinkState state) { + assert(v); + assert(state >= 0 && state < _VARLINK_STATE_MAX); + + if (v->state < 0) + varlink_log(v, "Setting state %s", + varlink_state_to_string(state)); + else + varlink_log(v, "Changing state %s %s %s", + varlink_state_to_string(v->state), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + varlink_state_to_string(state)); + + v->state = state; +} + +static int varlink_new(Varlink **ret) { + Varlink *v; + + assert(ret); + + v = new(Varlink, 1); + if (!v) + return -ENOMEM; + + *v = (Varlink) { + .n_ref = 1, + .fd = -EBADF, + + .state = _VARLINK_STATE_INVALID, + + .ucred = UCRED_INVALID, + + .timestamp = USEC_INFINITY, + .timeout = VARLINK_DEFAULT_TIMEOUT_USEC, + + .af = -1, + }; + + *ret = v; + return 0; +} + +int varlink_connect_address(Varlink **ret, const char *address) { + _cleanup_(varlink_unrefp) Varlink *v = NULL; + union sockaddr_union sockaddr; + int r; + + assert_return(ret, -EINVAL); + assert_return(address, -EINVAL); + + r = varlink_new(&v); + if (r < 0) + return log_debug_errno(r, "Failed to create varlink object: %m"); + + v->fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (v->fd < 0) + return log_debug_errno(errno, "Failed to create AF_UNIX socket: %m"); + + v->fd = 
fd_move_above_stdio(v->fd); + v->af = AF_UNIX; + + r = sockaddr_un_set_path(&sockaddr.un, address); + if (r < 0) { + if (r != -ENAMETOOLONG) + return log_debug_errno(r, "Failed to set socket address '%s': %m", address); + + /* This is a file system path, and too long to fit into sockaddr_un. Let's connect via O_PATH + * to this socket. */ + + r = connect_unix_path(v->fd, AT_FDCWD, address); + } else + r = RET_NERRNO(connect(v->fd, &sockaddr.sa, r)); + + if (r < 0) { + if (!IN_SET(r, -EAGAIN, -EINPROGRESS)) + return log_debug_errno(r, "Failed to connect to %s: %m", address); + + v->connecting = true; /* We are asynchronously connecting, i.e. the connect() is being + * processed in the background. As long as that's the case the socket + * is in a special state: it's there, we can poll it for EPOLLOUT, but + * if we attempt to write() to it before we see EPOLLOUT we'll get + * ENOTCONN (and not EAGAIN, like we would for a normal connected + * socket that isn't writable at the moment). Since ENOTCONN on write() + * hence can mean two different things (i.e. connection not complete + * yet vs. already disconnected again), we store as a boolean whether + * we are still in connect(). 
*/ + } + + varlink_set_state(v, VARLINK_IDLE_CLIENT); + + *ret = TAKE_PTR(v); + return 0; +} + +int varlink_connect_exec(Varlink **ret, const char *_command, char **_argv) { + _cleanup_close_pair_ int pair[2] = EBADF_PAIR; + _cleanup_(sigkill_waitp) pid_t pid = 0; + _cleanup_free_ char *command = NULL; + _cleanup_strv_free_ char **argv = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return(_command, -EINVAL); + + /* Copy the strings, in case they point into our own argv[], which we'll invalidate shortly because + * we rename the child process */ + command = strdup(_command); + if (!command) + return -ENOMEM; + + if (strv_isempty(_argv)) + argv = strv_new(command); + else + argv = strv_copy(_argv); + if (!argv) + return -ENOMEM; + + log_debug("Forking off Varlink child process '%s'.", command); + + if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0, pair) < 0) + return log_debug_errno(errno, "Failed to allocate AF_UNIX socket pair: %m"); + + r = safe_fork_full( + "(sd-vlexec)", + /* stdio_fds= */ NULL, + /* except_fds= */ (int[]) { pair[1] }, + /* n_except_fds= */ 1, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, + &pid); + if (r < 0) + return log_debug_errno(r, "Failed to spawn process: %m"); + if (r == 0) { + char spid[DECIMAL_STR_MAX(pid_t)+1]; + const char *setenv_list[] = { + "LISTEN_FDS", "1", + "LISTEN_PID", spid, + "LISTEN_FDNAMES", "varlink", + NULL, NULL, + }; + /* Child */ + + pair[0] = -EBADF; + + r = move_fd(pair[1], 3, /* cloexec= */ false); + if (r < 0) { + log_debug_errno(r, "Failed to move file descriptor to 3: %m"); + _exit(EXIT_FAILURE); + } + + xsprintf(spid, PID_FMT, pid); + + STRV_FOREACH_PAIR(a, b, setenv_list) { + if (setenv(*a, *b, /* override= */ true) < 0) { + log_debug_errno(errno, "Failed to set environment variable '%s': %m", *a); + _exit(EXIT_FAILURE); + } + } + + execvp(command, argv); + log_debug_errno(r, "Failed to invoke process '%s': %m", 
command); + _exit(EXIT_FAILURE); + } + + pair[1] = safe_close(pair[1]); + + Varlink *v; + r = varlink_new(&v); + if (r < 0) + return log_debug_errno(r, "Failed to create varlink object: %m"); + + v->fd = TAKE_FD(pair[0]); + v->af = AF_UNIX; + v->exec_pid = TAKE_PID(pid); + varlink_set_state(v, VARLINK_IDLE_CLIENT); + + *ret = v; + return 0; +} + +int varlink_connect_url(Varlink **ret, const char *url) { + _cleanup_free_ char *c = NULL; + const char *p; + bool exec; + int r; + + assert_return(ret, -EINVAL); + assert_return(url, -EINVAL); + + // FIXME: Add support for vsock:, ssh-exec:, ssh-unix: URL schemes here. (The latter with OpenSSH + // 9.4's -W switch for referencing remote AF_UNIX sockets.) + + /* The Varlink URL scheme is a bit underdefined. We support only the unix: transport for now, plus an + * exec: transport we made up ourselves. Strictly speaking this shouldn't even be called URL, since + * it has nothing to do with Internet URLs by RFC. */ + + p = startswith(url, "unix:"); + if (p) + exec = false; + else { + p = startswith(url, "exec:"); + if (!p) + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "URL scheme not supported."); + + exec = true; + } + + /* The varlink.org reference C library supports more than just file system paths. We might want to + * support that one day too. For now simply refuse that. 
*/ + if (p[strcspn(p, ";?#")] != '\0') + return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "URL parameterization with ';', '?', '#' not supported."); + + if (exec || p[0] != '@') { /* no validity checks for abstract namespace */ + + if (!path_is_absolute(p)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path not absolute, refusing."); + + r = path_simplify_alloc(p, &c); + if (r < 0) + return r; + + if (!path_is_normalized(c)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path is not normalized, refusing."); + } + + if (exec) + return varlink_connect_exec(ret, c, NULL); + + return varlink_connect_address(ret, c ?: p); +} + +int varlink_connect_fd(Varlink **ret, int fd) { + Varlink *v; + int r; + + assert_return(ret, -EINVAL); + assert_return(fd >= 0, -EBADF); + + r = fd_nonblock(fd, true); + if (r < 0) + return log_debug_errno(r, "Failed to make fd %d nonblocking: %m", fd); + + r = varlink_new(&v); + if (r < 0) + return log_debug_errno(r, "Failed to create varlink object: %m"); + + v->fd = fd; + v->af = -1, + varlink_set_state(v, VARLINK_IDLE_CLIENT); + + /* Note that if this function is called we assume the passed socket (if it is one) is already + * properly connected, i.e. any asynchronous connect() done on it already completed. Because of that + * we'll not set the 'connecting' boolean here, i.e. we don't need to avoid write()ing to the socket + * until the connection is fully set up. Behaviour here is hence a bit different from + * varlink_connect_address() above, as there we do handle asynchronous connections ourselves and + * avoid doing write() on it before we saw EPOLLOUT for the first time. 
*/ + + *ret = v; + return 0; +} + +static void varlink_detach_event_sources(Varlink *v) { + assert(v); + + v->io_event_source = sd_event_source_disable_unref(v->io_event_source); + v->time_event_source = sd_event_source_disable_unref(v->time_event_source); + v->quit_event_source = sd_event_source_disable_unref(v->quit_event_source); + v->defer_event_source = sd_event_source_disable_unref(v->defer_event_source); +} + +static void varlink_clear_current(Varlink *v) { + assert(v); + + /* Clears the currently processed incoming message */ + v->current = json_variant_unref(v->current); + v->current_method = NULL; + + close_many(v->input_fds, v->n_input_fds); + v->input_fds = mfree(v->input_fds); + v->n_input_fds = 0; +} + +static void varlink_clear(Varlink *v) { + assert(v); + + varlink_detach_event_sources(v); + + v->fd = safe_close(v->fd); + + varlink_clear_current(v); + + v->input_buffer = mfree(v->input_buffer); + v->output_buffer = v->output_buffer_sensitive ? erase_and_free(v->output_buffer) : mfree(v->output_buffer); + + v->input_control_buffer = mfree(v->input_control_buffer); + v->input_control_buffer_size = 0; + + close_many(v->output_fds, v->n_output_fds); + v->output_fds = mfree(v->output_fds); + v->n_output_fds = 0; + + close_many(v->pushed_fds, v->n_pushed_fds); + v->pushed_fds = mfree(v->pushed_fds); + v->n_pushed_fds = 0; + + LIST_CLEAR(queue, v->output_queue, varlink_json_queue_item_free); + v->output_queue_tail = NULL; + + v->event = sd_event_unref(v->event); + + if (v->exec_pid > 0) { + sigterm_wait(v->exec_pid); + v->exec_pid = 0; + } +} + +static Varlink* varlink_destroy(Varlink *v) { + if (!v) + return NULL; + + /* If this is called the server object must already been unreffed here. Why that? 
/* Checks whether the connection should now be considered terminated, and if so moves it into
 * VARLINK_PENDING_DISCONNECT. Returns 1 if the state was changed, 0 if the connection stays as it is. */
static int varlink_test_disconnect(Varlink *v) {
        assert(v);

        /* Tests whether the connection has been terminated. We are careful to not stop processing it
         * prematurely, since we want to handle half-open connections as well as possible and want to flush
         * out and read data before we close down if we can. */

        /* Already disconnected? */
        if (!VARLINK_STATE_IS_ALIVE(v->state))
                return 0;

        /* Wait until connection setup is complete, i.e. until asynchronous connect() completes */
        if (v->connecting)
                return 0;

        /* Still something to write and we can write? Stay around */
        if (v->output_buffer_size > 0 && !v->write_disconnected)
                return 0;

        /* Both sides gone already? Then there's no need to stick around */
        if (v->read_disconnected && v->write_disconnected)
                goto disconnect;

        /* If we are waiting for incoming data but the read side is shut down, disconnect. */
        if (IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING, VARLINK_IDLE_SERVER) && v->read_disconnected)
                goto disconnect;

        /* Similarly, if we are a client that hasn't written anything yet but the write side is dead, also
         * disconnect. We also explicitly check for POLLHUP here since we likely won't notice the write side
         * being down if we never wrote anything. */
        if (v->state == VARLINK_IDLE_CLIENT && (v->write_disconnected || v->got_pollhup))
                goto disconnect;

        /* We are on the server side and still want to send out more replies, but we saw POLLHUP already, and
         * either got no buffered bytes to write anymore or already saw a write error. In that case we should
         * shut down the varlink link. */
        if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE) && (v->write_disconnected || v->output_buffer_size == 0) && v->got_pollhup)
                goto disconnect;

        return 0;

disconnect:
        varlink_set_state(v, VARLINK_PENDING_DISCONNECT);
        return 1;
}
*/ + bool prefer_write = v->prefer_read_write; + if (!prefer_write) { + n = send(v->fd, v->output_buffer + v->output_buffer_index, v->output_buffer_size, MSG_DONTWAIT|MSG_NOSIGNAL); + if (n < 0 && errno == ENOTSOCK) + prefer_write = v->prefer_read_write = true; + } + if (prefer_write) + n = write(v->fd, v->output_buffer + v->output_buffer_index, v->output_buffer_size); + } + if (n < 0) { + if (errno == EAGAIN) + return 0; + + if (ERRNO_IS_DISCONNECT(errno)) { + /* If we get informed about a disconnect on write, then let's remember that, but not + * act on it just yet. Let's wait for read() to report the issue first. */ + v->write_disconnected = true; + return 1; + } + + return -errno; + } + + if (v->output_buffer_sensitive) + explicit_bzero_safe(v->output_buffer + v->output_buffer_index, n); + + v->output_buffer_size -= n; + + if (v->output_buffer_size == 0) { + v->output_buffer_index = 0; + v->output_buffer_sensitive = false; /* We can reset the sensitive flag once the buffer is empty */ + } else + v->output_buffer_index += n; + + close_many(v->output_fds, v->n_output_fds); + v->n_output_fds = 0; + + v->timestamp = now(CLOCK_MONOTONIC); + return 1; +} + +#define VARLINK_FDS_MAX (16U*1024U) + +static int varlink_read(Varlink *v) { + struct iovec iov; + struct msghdr mh; + size_t rs; + ssize_t n; + void *p; + + assert(v); + + if (!IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING, VARLINK_IDLE_SERVER)) + return 0; + if (v->connecting) /* read() on a socket while we are in connect() will fail with EINVAL, hence exit early here */ + return 0; + if (v->current) + return 0; + if (v->input_buffer_unscanned > 0) + return 0; + if (v->read_disconnected) + return 0; + + if (v->input_buffer_size >= VARLINK_BUFFER_MAX) + return -ENOBUFS; + + assert(v->fd >= 0); + + if (MALLOC_SIZEOF_SAFE(v->input_buffer) <= v->input_buffer_index + v->input_buffer_size) { + size_t add; + + add = MIN(VARLINK_BUFFER_MAX - v->input_buffer_size, 
VARLINK_READ_SIZE); + + if (v->input_buffer_index == 0) { + + if (!GREEDY_REALLOC(v->input_buffer, v->input_buffer_size + add)) + return -ENOMEM; + + } else { + char *b; + + b = new(char, v->input_buffer_size + add); + if (!b) + return -ENOMEM; + + memcpy(b, v->input_buffer + v->input_buffer_index, v->input_buffer_size); + + free_and_replace(v->input_buffer, b); + v->input_buffer_index = 0; + } + } + + p = v->input_buffer + v->input_buffer_index + v->input_buffer_size; + rs = MALLOC_SIZEOF_SAFE(v->input_buffer) - (v->input_buffer_index + v->input_buffer_size); + + if (v->allow_fd_passing_input) { + iov = IOVEC_MAKE(p, rs); + + /* Allocate the fd buffer on the heap, since we need a lot of space potentially */ + if (!v->input_control_buffer) { + v->input_control_buffer_size = CMSG_SPACE(sizeof(int) * VARLINK_FDS_MAX); + v->input_control_buffer = malloc(v->input_control_buffer_size); + if (!v->input_control_buffer) + return -ENOMEM; + } + + mh = (struct msghdr) { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = v->input_control_buffer, + .msg_controllen = v->input_control_buffer_size, + }; + + n = recvmsg_safe(v->fd, &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + } else { + bool prefer_read = v->prefer_read_write; + if (!prefer_read) { + n = recv(v->fd, p, rs, MSG_DONTWAIT); + if (n < 0 && errno == ENOTSOCK) + prefer_read = v->prefer_read_write = true; + } + if (prefer_read) + n = read(v->fd, p, rs); + } + if (n < 0) { + if (errno == EAGAIN) + return 0; + + if (ERRNO_IS_DISCONNECT(errno)) { + v->read_disconnected = true; + return 1; + } + + return -errno; + } + if (n == 0) { /* EOF */ + + if (v->allow_fd_passing_input) + cmsg_close_all(&mh); + + v->read_disconnected = true; + return 1; + } + + if (v->allow_fd_passing_input) { + struct cmsghdr* cmsg; + + cmsg = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1); + if (cmsg) { + size_t add; + + /* We only allow file descriptors to be passed along with the first byte of a + * message. 
If they are passed with any other byte this is a protocol violation. */ + if (v->input_buffer_size != 0) { + cmsg_close_all(&mh); + return -EPROTO; + } + + add = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + if (add > INT_MAX - v->n_input_fds) { + cmsg_close_all(&mh); + return -EBADF; + } + + if (!GREEDY_REALLOC(v->input_fds, v->n_input_fds + add)) { + cmsg_close_all(&mh); + return -ENOMEM; + } + + memcpy_safe(v->input_fds + v->n_input_fds, CMSG_TYPED_DATA(cmsg, int), add * sizeof(int)); + v->n_input_fds += add; + } + } + + v->input_buffer_size += n; + v->input_buffer_unscanned += n; + + return 1; +} + +static int varlink_parse_message(Varlink *v) { + const char *e, *begin; + size_t sz; + int r; + + assert(v); + + if (v->current) + return 0; + if (v->input_buffer_unscanned <= 0) + return 0; + + assert(v->input_buffer_unscanned <= v->input_buffer_size); + assert(v->input_buffer_index + v->input_buffer_size <= MALLOC_SIZEOF_SAFE(v->input_buffer)); + + begin = v->input_buffer + v->input_buffer_index; + + e = memchr(begin + v->input_buffer_size - v->input_buffer_unscanned, 0, v->input_buffer_unscanned); + if (!e) { + v->input_buffer_unscanned = 0; + return 0; + } + + sz = e - begin + 1; + + varlink_log(v, "New incoming message: %s", begin); /* FIXME: should we output the whole message here before validation? + * This may produce a non-printable journal entry if the message + * is invalid. We may also expose privileged information. */ + + r = json_parse(begin, 0, &v->current, NULL, NULL); + if (r < 0) { + /* If we encounter a parse failure flush all data. We cannot possibly recover from this, + * hence drop all buffered data now. 
*/ + v->input_buffer_index = v->input_buffer_size = v->input_buffer_unscanned = 0; + return varlink_log_errno(v, r, "Failed to parse JSON: %m"); + } + + v->input_buffer_size -= sz; + + if (v->input_buffer_size == 0) + v->input_buffer_index = 0; + else + v->input_buffer_index += sz; + + v->input_buffer_unscanned = v->input_buffer_size; + return 1; +} + +static int varlink_test_timeout(Varlink *v) { + assert(v); + + if (!IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING)) + return 0; + if (v->timeout == USEC_INFINITY) + return 0; + + if (now(CLOCK_MONOTONIC) < usec_add(v->timestamp, v->timeout)) + return 0; + + varlink_set_state(v, VARLINK_PENDING_TIMEOUT); + + return 1; +} + +static int varlink_dispatch_local_error(Varlink *v, const char *error) { + int r; + + assert(v); + assert(error); + + if (!v->reply_callback) + return 0; + + r = v->reply_callback(v, NULL, error, VARLINK_REPLY_ERROR|VARLINK_REPLY_LOCAL, v->userdata); + if (r < 0) + log_debug_errno(r, "Reply callback returned error, ignoring: %m"); + + return 1; +} + +static int varlink_dispatch_timeout(Varlink *v) { + assert(v); + + if (v->state != VARLINK_PENDING_TIMEOUT) + return 0; + + varlink_set_state(v, VARLINK_PROCESSING_TIMEOUT); + varlink_dispatch_local_error(v, VARLINK_ERROR_TIMEOUT); + varlink_close(v); + + return 1; +} + +static int varlink_dispatch_disconnect(Varlink *v) { + assert(v); + + if (v->state != VARLINK_PENDING_DISCONNECT) + return 0; + + varlink_set_state(v, VARLINK_PROCESSING_DISCONNECT); + varlink_dispatch_local_error(v, VARLINK_ERROR_DISCONNECTED); + varlink_close(v); + + return 1; +} + +static int varlink_sanitize_parameters(JsonVariant **v) { + int r; + + assert(v); + + /* Varlink always wants a parameters list, hence make one if the caller doesn't want any */ + if (!*v) + return json_variant_new_object(v, NULL, 0); + if (json_variant_is_null(*v)) { + JsonVariant *empty; + + r = json_variant_new_object(&empty, NULL, 0); + if (r < 0) + return r; + 
+ json_variant_unref(*v); + *v = empty; + return 0; + } + if (!json_variant_is_object(*v)) + return -EINVAL; + + return 0; +} + +static int varlink_dispatch_reply(Varlink *v) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + VarlinkReplyFlags flags = 0; + const char *error = NULL; + JsonVariant *e; + const char *k; + int r; + + assert(v); + + if (!IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING)) + return 0; + if (!v->current) + return 0; + + assert(v->n_pending > 0); + + if (!json_variant_is_object(v->current)) + goto invalid; + + JSON_VARIANT_OBJECT_FOREACH(k, e, v->current) { + + if (streq(k, "error")) { + if (error) + goto invalid; + if (!json_variant_is_string(e)) + goto invalid; + + error = json_variant_string(e); + flags |= VARLINK_REPLY_ERROR; + + } else if (streq(k, "parameters")) { + if (parameters) + goto invalid; + if (!json_variant_is_object(e) && !json_variant_is_null(e)) + goto invalid; + + parameters = json_variant_ref(e); + + } else if (streq(k, "continues")) { + if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + goto invalid; + + if (!json_variant_is_boolean(e)) + goto invalid; + + if (json_variant_boolean(e)) + flags |= VARLINK_REPLY_CONTINUES; + } else + goto invalid; + } + + /* Replies with 'continue' set are only OK if we set 'more' when the method call was initiated */ + if (v->state != VARLINK_AWAITING_REPLY_MORE && FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + goto invalid; + + /* An error is final */ + if (error && FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + goto invalid; + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + goto invalid; + + if (IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE)) { + varlink_set_state(v, VARLINK_PROCESSING_REPLY); + + if (v->reply_callback) { + r = v->reply_callback(v, parameters, error, flags, v->userdata); + if (r < 0) + log_debug_errno(r, "Reply callback returned error, ignoring: %m"); + } + + varlink_clear_current(v); + + 
if (v->state == VARLINK_PROCESSING_REPLY) { + + assert(v->n_pending > 0); + + if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + v->n_pending--; + + varlink_set_state(v, + FLAGS_SET(flags, VARLINK_REPLY_CONTINUES) ? VARLINK_AWAITING_REPLY_MORE : + v->n_pending == 0 ? VARLINK_IDLE_CLIENT : VARLINK_AWAITING_REPLY); + } + } else { + assert(v->state == VARLINK_CALLING); + varlink_set_state(v, VARLINK_CALLED); + } + + return 1; + +invalid: + varlink_set_state(v, VARLINK_PROCESSING_FAILURE); + varlink_dispatch_local_error(v, VARLINK_ERROR_PROTOCOL); + varlink_close(v); + + return 1; +} + +static int generic_method_get_info( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + + _cleanup_strv_free_ char **interfaces = NULL; + _cleanup_free_ char *product = NULL; + int r; + + assert(link); + + if (json_variant_elements(parameters) != 0) + return varlink_errorb(link, VARLINK_ERROR_INVALID_PARAMETER, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_VARIANT("parameter", json_variant_by_index(parameters, 0)))); + + product = strjoin("systemd (", program_invocation_short_name, ")"); + if (!product) + return -ENOMEM; + + VarlinkInterface *interface; + HASHMAP_FOREACH(interface, ASSERT_PTR(link->server)->interfaces) { + r = strv_extend(&interfaces, interface->name); + if (r < 0) + return r; + } + + strv_sort(interfaces); + + return varlink_replyb(link, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("vendor", "The systemd Project"), + JSON_BUILD_PAIR_STRING("product", product), + JSON_BUILD_PAIR_STRING("version", STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")"), + JSON_BUILD_PAIR_STRING("url", "https://systemd.io/"), + JSON_BUILD_PAIR_STRV("interfaces", interfaces))); +} + +static int generic_method_get_interface_description( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + + static const struct JsonDispatch dispatch_table[] = { + { "interface", JSON_VARIANT_STRING, json_dispatch_const_string, 0, 
JSON_MANDATORY }, + {} + }; + _cleanup_free_ char *text = NULL; + const VarlinkInterface *interface; + const char *name = NULL; + int r; + + assert(link); + + r = json_dispatch(parameters, dispatch_table, 0, &name); + if (r < 0) + return r; + + interface = hashmap_get(ASSERT_PTR(link->server)->interfaces, name); + if (!interface) + return varlink_errorb(link, VARLINK_ERROR_INTERFACE_NOT_FOUND, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("interface", name))); + + r = varlink_idl_format(interface, &text); + if (r < 0) + return r; + + return varlink_replyb(link, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("description", text))); +} + +static int varlink_dispatch_method(Varlink *v) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + VarlinkMethodFlags flags = 0; + const char *method = NULL; + JsonVariant *e; + VarlinkMethod callback; + const char *k; + int r; + + assert(v); + + if (v->state != VARLINK_IDLE_SERVER) + return 0; + if (!v->current) + return 0; + + if (!json_variant_is_object(v->current)) + goto invalid; + + JSON_VARIANT_OBJECT_FOREACH(k, e, v->current) { + + if (streq(k, "method")) { + if (method) + goto invalid; + if (!json_variant_is_string(e)) + goto invalid; + + method = json_variant_string(e); + + } else if (streq(k, "parameters")) { + if (parameters) + goto invalid; + if (!json_variant_is_object(e) && !json_variant_is_null(e)) + goto invalid; + + parameters = json_variant_ref(e); + + } else if (streq(k, "oneway")) { + + if ((flags & (VARLINK_METHOD_ONEWAY|VARLINK_METHOD_MORE)) != 0) + goto invalid; + + if (!json_variant_is_boolean(e)) + goto invalid; + + if (json_variant_boolean(e)) + flags |= VARLINK_METHOD_ONEWAY; + + } else if (streq(k, "more")) { + + if ((flags & (VARLINK_METHOD_ONEWAY|VARLINK_METHOD_MORE)) != 0) + goto invalid; + + if (!json_variant_is_boolean(e)) + goto invalid; + + if (json_variant_boolean(e)) + flags |= VARLINK_METHOD_MORE; + + } else + goto invalid; + } + + if (!method) + goto invalid; + + r = 
varlink_sanitize_parameters(¶meters); + if (r < 0) + goto fail; + + varlink_set_state(v, (flags & VARLINK_METHOD_MORE) ? VARLINK_PROCESSING_METHOD_MORE : + (flags & VARLINK_METHOD_ONEWAY) ? VARLINK_PROCESSING_METHOD_ONEWAY : + VARLINK_PROCESSING_METHOD); + + assert(v->server); + + /* First consult user supplied method implementations */ + callback = hashmap_get(v->server->methods, method); + if (!callback) { + if (streq(method, "org.varlink.service.GetInfo")) + callback = generic_method_get_info; + else if (streq(method, "org.varlink.service.GetInterfaceDescription")) + callback = generic_method_get_interface_description; + } + + if (callback) { + bool invalid = false; + + v->current_method = hashmap_get(v->server->symbols, method); + if (!v->current_method) + log_debug("No interface description defined for method '%s', not validating.", method); + else { + const char *bad_field; + + r = varlink_idl_validate_method_call(v->current_method, parameters, &bad_field); + if (r < 0) { + log_debug_errno(r, "Parameters for method %s() didn't pass validation on field '%s': %m", method, strna(bad_field)); + + if (!FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) { + r = varlink_errorb(v, VARLINK_ERROR_INVALID_PARAMETER, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("parameter", bad_field))); + if (r < 0) + return r; + } + invalid = true; + } + } + + if (!invalid) { + r = callback(v, parameters, flags, v->userdata); + if (r < 0) { + log_debug_errno(r, "Callback for %s returned error: %m", method); + + /* We got an error back from the callback. Propagate it to the client if the method call remains unanswered. 
*/ + if (v->state == VARLINK_PROCESSED_METHOD) + r = 0; /* already processed */ + else if (!FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) { + r = varlink_error_errno(v, r); + if (r < 0) + return r; + } + } + } + } else if (!FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) { + r = varlink_errorb(v, VARLINK_ERROR_METHOD_NOT_FOUND, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)))); + if (r < 0) + return r; + } else + r = 0; + + switch (v->state) { + + case VARLINK_PROCESSED_METHOD: /* Method call is fully processed */ + case VARLINK_PROCESSING_METHOD_ONEWAY: /* ditto */ + varlink_clear_current(v); + varlink_set_state(v, VARLINK_IDLE_SERVER); + break; + + case VARLINK_PROCESSING_METHOD: /* Method call wasn't replied to, will be replied to later */ + varlink_set_state(v, VARLINK_PENDING_METHOD); + break; + + case VARLINK_PROCESSING_METHOD_MORE: /* No reply for a "more" message was sent, more to come */ + varlink_set_state(v, VARLINK_PENDING_METHOD_MORE); + break; + + default: + assert_not_reached(); + } + + return r; + +invalid: + r = -EINVAL; + +fail: + varlink_set_state(v, VARLINK_PROCESSING_FAILURE); + varlink_dispatch_local_error(v, VARLINK_ERROR_PROTOCOL); + varlink_close(v); + + return r; +} + +int varlink_process(Varlink *v) { + int r; + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + varlink_ref(v); + + r = varlink_write(v); + if (r < 0) + varlink_log_errno(v, r, "Write failed: %m"); + if (r != 0) + goto finish; + + r = varlink_dispatch_reply(v); + if (r < 0) + varlink_log_errno(v, r, "Reply dispatch failed: %m"); + if (r != 0) + goto finish; + + r = varlink_dispatch_method(v); + if (r < 0) + varlink_log_errno(v, r, "Method dispatch failed: %m"); + if (r != 0) + goto finish; + + r = varlink_parse_message(v); + if (r < 0) + varlink_log_errno(v, r, "Message parsing failed: %m"); + if (r != 0) + goto finish; + + r = varlink_read(v); + if (r < 0) + 
varlink_log_errno(v, r, "Read failed: %m"); + if (r != 0) + goto finish; + + r = varlink_test_disconnect(v); + assert(r >= 0); + if (r != 0) + goto finish; + + r = varlink_dispatch_disconnect(v); + assert(r >= 0); + if (r != 0) + goto finish; + + r = varlink_test_timeout(v); + assert(r >= 0); + if (r != 0) + goto finish; + + r = varlink_dispatch_timeout(v); + assert(r >= 0); + if (r != 0) + goto finish; + +finish: + if (r >= 0 && v->defer_event_source) { + int q; + + /* If we did some processing, make sure we are called again soon */ + q = sd_event_source_set_enabled(v->defer_event_source, r > 0 ? SD_EVENT_ON : SD_EVENT_OFF); + if (q < 0) + r = varlink_log_errno(v, q, "Failed to enable deferred event source: %m"); + } + + if (r < 0) { + if (VARLINK_STATE_IS_ALIVE(v->state)) + /* Initiate disconnection */ + varlink_set_state(v, VARLINK_PENDING_DISCONNECT); + else + /* We failed while disconnecting, in that case close right away */ + varlink_close(v); + } + + varlink_unref(v); + return r; +} + +static void handle_revents(Varlink *v, int revents) { + assert(v); + + if (v->connecting) { + /* If we have seen POLLOUT or POLLHUP on a socket we are asynchronously waiting a connect() + * to complete on, we know we are ready. We don't read the connection error here though, + * we'll get the error on the next read() or write(). */ + if ((revents & (POLLOUT|POLLHUP)) == 0) + return; + + varlink_log(v, "Asynchronous connection completed."); + v->connecting = false; + } else { + /* Note that we don't care much about POLLIN/POLLOUT here, we'll just try reading and writing + * what we can. However, we do care about POLLHUP to detect connection termination even if we + * momentarily don't want to read nor write anything. 
*/ + + if (!FLAGS_SET(revents, POLLHUP)) + return; + + varlink_log(v, "Got POLLHUP from socket."); + v->got_pollhup = true; + } +} + +int varlink_wait(Varlink *v, usec_t timeout) { + int r, fd, events; + usec_t t; + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + r = varlink_get_timeout(v, &t); + if (r < 0) + return r; + if (t != USEC_INFINITY) { + usec_t n; + + n = now(CLOCK_MONOTONIC); + if (t < n) + t = 0; + else + t = usec_sub_unsigned(t, n); + } + + if (timeout != USEC_INFINITY && + (t == USEC_INFINITY || timeout < t)) + t = timeout; + + fd = varlink_get_fd(v); + if (fd < 0) + return fd; + + events = varlink_get_events(v); + if (events < 0) + return events; + + r = fd_wait_for_event(fd, events, t); + if (ERRNO_IS_NEG_TRANSIENT(r)) /* Treat EINTR as not a timeout, but also nothing happened, and + * the caller gets a chance to call back into us */ + return 1; + if (r <= 0) + return r; + + handle_revents(v, r); + return 1; +} + +int varlink_is_idle(Varlink *v) { + assert_return(v, -EINVAL); + + /* Returns true if there's nothing pending on the connection anymore, i.e. 
we processed all incoming + * or outgoing messages fully, or finished disconnection */ + + return IN_SET(v->state, VARLINK_DISCONNECTED, VARLINK_IDLE_CLIENT, VARLINK_IDLE_SERVER); +} + +int varlink_get_fd(Varlink *v) { + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + if (v->fd < 0) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBADF), "No valid fd."); + + return v->fd; +} + +int varlink_get_events(Varlink *v) { + int ret = 0; + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + if (v->connecting) /* When processing an asynchronous connect(), we only wait for EPOLLOUT, which + * tells us that the connection is now complete. Before that we should neither + * write() or read() from the fd. */ + return EPOLLOUT; + + if (!v->read_disconnected && + IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING, VARLINK_IDLE_SERVER) && + !v->current && + v->input_buffer_unscanned <= 0) + ret |= EPOLLIN; + + if (!v->write_disconnected && + v->output_buffer_size > 0) + ret |= EPOLLOUT; + + return ret; +} + +int varlink_get_timeout(Varlink *v, usec_t *ret) { + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + if (IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING) && + v->timeout != USEC_INFINITY) { + if (ret) + *ret = usec_add(v->timestamp, v->timeout); + return 1; + } else { + if (ret) + *ret = USEC_INFINITY; + return 0; + } +} + +int varlink_flush(Varlink *v) { + int ret = 0, r; + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + for (;;) { + if (v->output_buffer_size == 0) + break; + if (v->write_disconnected) + return 
-ECONNRESET; + + r = varlink_write(v); + if (r < 0) + return r; + if (r > 0) { + ret = 1; + continue; + } + + r = fd_wait_for_event(v->fd, POLLOUT, USEC_INFINITY); + if (ERRNO_IS_NEG_TRANSIENT(r)) + continue; + if (r < 0) + return varlink_log_errno(v, r, "Poll failed on fd: %m"); + assert(r > 0); + + handle_revents(v, r); + } + + return ret; +} + +static void varlink_detach_server(Varlink *v) { + VarlinkServer *saved_server; + assert(v); + + if (!v->server) + return; + + if (v->server->by_uid && + v->ucred_acquired && + uid_is_valid(v->ucred.uid)) { + unsigned c; + + c = PTR_TO_UINT(hashmap_get(v->server->by_uid, UID_TO_PTR(v->ucred.uid))); + assert(c > 0); + + if (c == 1) + (void) hashmap_remove(v->server->by_uid, UID_TO_PTR(v->ucred.uid)); + else + (void) hashmap_replace(v->server->by_uid, UID_TO_PTR(v->ucred.uid), UINT_TO_PTR(c - 1)); + } + + assert(v->server->n_connections > 0); + v->server->n_connections--; + + /* If this is a connection associated to a server, then let's disconnect the server and the + * connection from each other. This drops the dangling reference that connect_callback() set up. But + * before we release the references, let's call the disconnection callback if it is defined. 
*/ + + saved_server = TAKE_PTR(v->server); + + if (saved_server->disconnect_callback) + saved_server->disconnect_callback(saved_server, v, saved_server->userdata); + + varlink_server_test_exit_on_idle(saved_server); + varlink_server_unref(saved_server); + varlink_unref(v); +} + +int varlink_close(Varlink *v) { + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return 0; + + varlink_set_state(v, VARLINK_DISCONNECTED); + + /* Let's take a reference first, since varlink_detach_server() might drop the final (dangling) ref + * which would destroy us before we can call varlink_clear() */ + varlink_ref(v); + varlink_detach_server(v); + varlink_clear(v); + varlink_unref(v); + + return 1; +} + +Varlink* varlink_close_unref(Varlink *v) { + if (!v) + return NULL; + + (void) varlink_close(v); + return varlink_unref(v); +} + +Varlink* varlink_flush_close_unref(Varlink *v) { + if (!v) + return NULL; + + (void) varlink_flush(v); + return varlink_close_unref(v); +} + +static int varlink_format_json(Varlink *v, JsonVariant *m) { + _cleanup_(erase_and_freep) char *text = NULL; + int r; + + assert(v); + assert(m); + + r = json_variant_format(m, 0, &text); + if (r < 0) + return r; + assert(text[r] == '\0'); + + if (v->output_buffer_size + r + 1 > VARLINK_BUFFER_MAX) + return -ENOBUFS; + + varlink_log(v, "Sending message: %s", text); + + if (v->output_buffer_size == 0) { + + free_and_replace(v->output_buffer, text); + + v->output_buffer_size = r + 1; + v->output_buffer_index = 0; + + } else if (v->output_buffer_index == 0) { + + if (!GREEDY_REALLOC(v->output_buffer, v->output_buffer_size + r + 1)) + return -ENOMEM; + + memcpy(v->output_buffer + v->output_buffer_size, text, r + 1); + v->output_buffer_size += r + 1; + } else { + char *n; + const size_t new_size = v->output_buffer_size + r + 1; + + n = new(char, new_size); + if (!n) + return -ENOMEM; + + memcpy(mempcpy(n, v->output_buffer + v->output_buffer_index, v->output_buffer_size), text, r + 1); + + 
free_and_replace(v->output_buffer, n); + v->output_buffer_size = new_size; + v->output_buffer_index = 0; + } + + if (json_variant_is_sensitive(m)) + v->output_buffer_sensitive = true; /* Propagate sensitive flag */ + else + text = mfree(text); /* No point in the erase_and_free() destructor declared above */ + + return 0; +} + +static int varlink_enqueue_json(Varlink *v, JsonVariant *m) { + VarlinkJsonQueueItem *q; + + assert(v); + assert(m); + + /* If there are no file descriptors to be queued and no queue entries yet we can shortcut things and + * append this entry directly to the output buffer */ + if (v->n_pushed_fds == 0 && !v->output_queue) + return varlink_format_json(v, m); + + /* Otherwise add a queue entry for this */ + q = varlink_json_queue_item_new(m, v->pushed_fds, v->n_pushed_fds); + if (!q) + return -ENOMEM; + + v->n_pushed_fds = 0; /* fds now belong to the queue entry */ + + LIST_INSERT_AFTER(queue, v->output_queue, v->output_queue_tail, q); + v->output_queue_tail = q; + return 0; +} + +static int varlink_format_queue(Varlink *v) { + int r; + + assert(v); + + /* Takes entries out of the output queue and formats them into the output buffer. But only if this + * would not corrupt our fd message boundaries */ + + while (v->output_queue) { + _cleanup_free_ int *array = NULL; + VarlinkJsonQueueItem *q = v->output_queue; + + if (v->n_output_fds > 0) /* unwritten fds? 
if we'd add more we'd corrupt the fd message boundaries, hence wait */ + return 0; + + if (q->n_fds > 0) { + array = newdup(int, q->fds, q->n_fds); + if (!array) + return -ENOMEM; + } + + r = varlink_format_json(v, q->data); + if (r < 0) + return r; + + /* Take possession of the queue element's fds */ + free(v->output_fds); + v->output_fds = TAKE_PTR(array); + v->n_output_fds = q->n_fds; + q->n_fds = 0; + + LIST_REMOVE(queue, v->output_queue, q); + if (!v->output_queue) + v->output_queue_tail = NULL; + + varlink_json_queue_item_free(q); + } + + return 0; +} + +int varlink_send(Varlink *v, const char *method, JsonVariant *parameters) { + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + assert_return(method, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + /* We allow enqueuing multiple method calls at once! */ + if (!IN_SET(v->state, VARLINK_IDLE_CLIENT, VARLINK_AWAITING_REPLY)) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)), + JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)), + JSON_BUILD_PAIR("oneway", JSON_BUILD_BOOLEAN(true)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + /* No state change here, this is one-way only after all */ + v->timestamp = now(CLOCK_MONOTONIC); + return 0; +} + +int varlink_sendb(Varlink *v, const char *method, ...) 
{ + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, method); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_send(v, method, parameters); +} + +int varlink_invoke(Varlink *v, const char *method, JsonVariant *parameters) { + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + assert_return(method, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + /* We allow enqueuing multiple method calls at once! */ + if (!IN_SET(v->state, VARLINK_IDLE_CLIENT, VARLINK_AWAITING_REPLY)) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)), + JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + varlink_set_state(v, VARLINK_AWAITING_REPLY); + v->n_pending++; + v->timestamp = now(CLOCK_MONOTONIC); + + return 0; +} + +int varlink_invokeb(Varlink *v, const char *method, ...) 
{ + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, method); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_invoke(v, method, parameters); +} + +int varlink_observe(Varlink *v, const char *method, JsonVariant *parameters) { + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + assert_return(method, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + /* Note that we don't allow enqueuing multiple method calls when we are in more/continues mode! We + * thus insist on an idle client here. */ + if (v->state != VARLINK_IDLE_CLIENT) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)), + JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)), + JSON_BUILD_PAIR("more", JSON_BUILD_BOOLEAN(true)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + varlink_set_state(v, VARLINK_AWAITING_REPLY_MORE); + v->n_pending++; + v->timestamp = now(CLOCK_MONOTONIC); + + return 0; +} + +int varlink_observeb(Varlink *v, const char *method, ...) 
{ + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, method); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_observe(v, method, parameters); +} + +int varlink_call( + Varlink *v, + const char *method, + JsonVariant *parameters, + JsonVariant **ret_parameters, + const char **ret_error_id, + VarlinkReplyFlags *ret_flags) { + + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + assert_return(method, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + if (v->state != VARLINK_IDLE_CLIENT) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + assert(v->n_pending == 0); /* n_pending can't be > 0 if we are in VARLINK_IDLE_CLIENT state */ + + /* If there was still a reply pinned from a previous call, now it's the time to get rid of it, so + * that we can assign a new reply shortly. 
*/ + varlink_clear_current(v); + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)), + JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + varlink_set_state(v, VARLINK_CALLING); + v->n_pending++; + v->timestamp = now(CLOCK_MONOTONIC); + + while (v->state == VARLINK_CALLING) { + + r = varlink_process(v); + if (r < 0) + return r; + if (r > 0) + continue; + + r = varlink_wait(v, USEC_INFINITY); + if (r < 0) + return r; + } + + switch (v->state) { + + case VARLINK_CALLED: + assert(v->current); + + varlink_set_state(v, VARLINK_IDLE_CLIENT); + assert(v->n_pending == 1); + v->n_pending--; + + if (ret_parameters) + *ret_parameters = json_variant_by_key(v->current, "parameters"); + if (ret_error_id) + *ret_error_id = json_variant_string(json_variant_by_key(v->current, "error")); + if (ret_flags) + *ret_flags = 0; + + return 1; + + case VARLINK_PENDING_DISCONNECT: + case VARLINK_DISCONNECTED: + return varlink_log_errno(v, SYNTHETIC_ERRNO(ECONNRESET), "Connection was closed."); + + case VARLINK_PENDING_TIMEOUT: + return varlink_log_errno(v, SYNTHETIC_ERRNO(ETIME), "Connection timed out."); + + default: + assert_not_reached(); + } +} + +int varlink_callb( + Varlink *v, + const char *method, + JsonVariant **ret_parameters, + const char **ret_error_id, + VarlinkReplyFlags *ret_flags, ...) 
{ + + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, ret_flags); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_call(v, method, parameters, ret_parameters, ret_error_id, ret_flags); +} + +static void varlink_collect_context_free(VarlinkCollectContext *cc) { + assert(cc); + + json_variant_unref(cc->parameters); + free((char *)cc->error_id); +} + +static int collect_callback( + Varlink *v, + JsonVariant *parameters, + const char *error_id, + VarlinkReplyFlags flags, + void *userdata) { + + VarlinkCollectContext *context = ASSERT_PTR(userdata); + int r; + + assert(v); + + context->flags = flags; + /* If we hit an error, we will drop all collected replies and just return the error_id and flags in varlink_collect() */ + if (error_id) { + context->error_id = error_id; + return 0; + } + + r = json_variant_append_array(&context->parameters, parameters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to append JSON object to array: %m"); + + return 1; +} + +int varlink_collect( + Varlink *v, + const char *method, + JsonVariant *parameters, + JsonVariant **ret_parameters, + const char **ret_error_id, + VarlinkReplyFlags *ret_flags) { + + _cleanup_(varlink_collect_context_free) VarlinkCollectContext context = {}; + int r; + + assert_return(v, -EINVAL); + assert_return(method, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + if (v->state != VARLINK_IDLE_CLIENT) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + assert(v->n_pending == 0); /* n_pending can't be > 0 if we are in VARLINK_IDLE_CLIENT state */ + + /* If there was still a reply pinned from a previous call, now it's the time to get rid of it, so + * that we can assign a new reply shortly. 
*/ + varlink_clear_current(v); + + r = varlink_bind_reply(v, collect_callback); + if (r < 0) + return varlink_log_errno(v, r, "Failed to bind collect callback"); + + varlink_set_userdata(v, &context); + r = varlink_observe(v, method, parameters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to collect varlink method: %m"); + + while (v->state == VARLINK_AWAITING_REPLY_MORE) { + + r = varlink_process(v); + if (r < 0) + return r; + + /* If we get an error from any of the replies, return immediately with just the error_id and flags*/ + if (context.error_id) { + if (ret_error_id) + *ret_error_id = TAKE_PTR(context.error_id); + if (ret_flags) + *ret_flags = context.flags; + return 0; + } + + if (r > 0) + continue; + + r = varlink_wait(v, USEC_INFINITY); + if (r < 0) + return r; + } + + switch (v->state) { + + case VARLINK_IDLE_CLIENT: + break; + + case VARLINK_PENDING_DISCONNECT: + case VARLINK_DISCONNECTED: + return varlink_log_errno(v, SYNTHETIC_ERRNO(ECONNRESET), "Connection was closed."); + + case VARLINK_PENDING_TIMEOUT: + return varlink_log_errno(v, SYNTHETIC_ERRNO(ETIME), "Connection timed out."); + + default: + assert_not_reached(); + } + + if (ret_parameters) + *ret_parameters = TAKE_PTR(context.parameters); + if (ret_error_id) + *ret_error_id = TAKE_PTR(context.error_id); + if (ret_flags) + *ret_flags = context.flags; + return 1; +} + +int varlink_collectb( + Varlink *v, + const char *method, + JsonVariant **ret_parameters, + const char **ret_error_id, + VarlinkReplyFlags *ret_flags, ...) 
{ + + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, ret_flags); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_collect(v, method, parameters, ret_parameters, ret_error_id, ret_flags); +} + +int varlink_reply(Varlink *v, JsonVariant *parameters) { + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return -ENOTCONN; + if (!IN_SET(v->state, + VARLINK_PROCESSING_METHOD, VARLINK_PROCESSING_METHOD_MORE, + VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE)) + return -EBUSY; + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + if (v->current_method) { + const char *bad_field = NULL; + + r = varlink_idl_validate_method_reply(v->current_method, parameters, &bad_field); + if (r < 0) + log_debug_errno(r, "Return parameters for method reply %s() didn't pass validation on field '%s', ignoring: %m", v->current_method->name, strna(bad_field)); + } + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE)) { + /* We just replied to a method call that was let hanging for a while (i.e. we were outside of + * the varlink_dispatch_method() stack frame), which means with this reply we are ready to + * process further messages. 
*/ + varlink_clear_current(v); + varlink_set_state(v, VARLINK_IDLE_SERVER); + } else + /* We replied to a method call from within the varlink_dispatch_method() stack frame), which + * means we should it handle the rest of the state engine. */ + varlink_set_state(v, VARLINK_PROCESSED_METHOD); + + return 1; +} + +int varlink_replyb(Varlink *v, ...) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, v); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return r; + + return varlink_reply(v, parameters); +} + +int varlink_error(Varlink *v, const char *error_id, JsonVariant *parameters) { + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + assert_return(error_id, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + if (!IN_SET(v->state, + VARLINK_PROCESSING_METHOD, VARLINK_PROCESSING_METHOD_MORE, + VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE)) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + /* Reset the list of pushed file descriptors before sending an error reply. We do this here to + * simplify code that puts together a complex reply message with fds, and half-way something + * fails. In that case the pushed fds need to be flushed out again. Under the assumption that it + * never makes sense to send fds along with errors we simply flush them out here beforehand, so that + * the callers don't need to do this explicitly. 
*/ + varlink_reset_fds(v); + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("error", JSON_BUILD_STRING(error_id)), + JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + VarlinkSymbol *symbol = hashmap_get(v->server->symbols, error_id); + if (!symbol) + log_debug("No interface description defined for error '%s', not validating.", error_id); + else { + const char *bad_field = NULL; + + r = varlink_idl_validate_error(symbol, parameters, &bad_field); + if (r < 0) + log_debug_errno(r, "Parameters for error %s didn't pass validation on field '%s', ignoring: %m", error_id, strna(bad_field)); + } + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE)) { + varlink_clear_current(v); + varlink_set_state(v, VARLINK_IDLE_SERVER); + } else + varlink_set_state(v, VARLINK_PROCESSED_METHOD); + + return 1; +} + +int varlink_errorb(Varlink *v, const char *error_id, ...) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + assert_return(error_id, -EINVAL); + + va_start(ap, error_id); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_error(v, error_id, parameters); +} + +int varlink_error_invalid_parameter(Varlink *v, JsonVariant *parameters) { + int r; + + assert_return(v, -EINVAL); + assert_return(parameters, -EINVAL); + + /* We expect to be called in one of two ways: the 'parameters' argument is a string variant in which + * case it is the parameter key name that is invalid. 
Or the 'parameters' argument is an object + * variant in which case we'll pull out the first key. The latter mode is useful in functions that + * don't expect any arguments. */ + + /* varlink_error(...) expects a json object as the third parameter. Passing a string variant causes + * parameter sanitization to fail, and it returns -EINVAL. */ + + if (json_variant_is_string(parameters)) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters_obj = NULL; + + r = json_build(¶meters_obj, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("parameter", JSON_BUILD_VARIANT(parameters)))); + if (r < 0) + return r; + + return varlink_error(v, VARLINK_ERROR_INVALID_PARAMETER, parameters_obj); + } + + if (json_variant_is_object(parameters) && + json_variant_elements(parameters) > 0) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters_obj = NULL; + + r = json_build(¶meters_obj, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("parameter", JSON_BUILD_VARIANT(json_variant_by_index(parameters, 0))))); + if (r < 0) + return r; + + return varlink_error(v, VARLINK_ERROR_INVALID_PARAMETER, parameters_obj); + } + + return -EINVAL; +} + +int varlink_error_errno(Varlink *v, int error) { + return varlink_errorb( + v, + VARLINK_ERROR_SYSTEM, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("errno", JSON_BUILD_INTEGER(abs(error))))); +} + +int varlink_notify(Varlink *v, JsonVariant *parameters) { + _cleanup_(json_variant_unrefp) JsonVariant *m = NULL; + int r; + + assert_return(v, -EINVAL); + + if (v->state == VARLINK_DISCONNECTED) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected."); + + /* If we want to reply with a notify connection but the caller didn't set "more", then return an + * error indicating that we expected to be called with "more" set */ + if (IN_SET(v->state, VARLINK_PROCESSING_METHOD, VARLINK_PENDING_METHOD)) + return varlink_error(v, VARLINK_ERROR_EXPECTED_MORE, NULL); + + if (!IN_SET(v->state, VARLINK_PROCESSING_METHOD_MORE, VARLINK_PENDING_METHOD_MORE)) + return 
varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy."); + + r = varlink_sanitize_parameters(¶meters); + if (r < 0) + return varlink_log_errno(v, r, "Failed to sanitize parameters: %m"); + + r = json_build(&m, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)), + JSON_BUILD_PAIR("continues", JSON_BUILD_BOOLEAN(true)))); + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + if (v->current_method) { + const char *bad_field = NULL; + + r = varlink_idl_validate_method_reply(v->current_method, parameters, &bad_field); + if (r < 0) + log_debug_errno(r, "Return parameters for method reply %s() didn't pass validation on field '%s', ignoring: %m", v->current_method->name, strna(bad_field)); + } + + r = varlink_enqueue_json(v, m); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enqueue json message: %m"); + + /* No state change, as more is coming */ + return 1; +} + +int varlink_notifyb(Varlink *v, ...) { + _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL; + va_list ap; + int r; + + assert_return(v, -EINVAL); + + va_start(ap, v); + r = json_buildv(¶meters, ap); + va_end(ap); + + if (r < 0) + return varlink_log_errno(v, r, "Failed to build json message: %m"); + + return varlink_notify(v, parameters); +} + +int varlink_dispatch(Varlink *v, JsonVariant *parameters, const JsonDispatch table[], void *userdata) { + const char *bad_field = NULL; + int r; + + assert_return(v, -EINVAL); + assert_return(table, -EINVAL); + + /* A wrapper around json_dispatch_full() that returns a nice InvalidParameter error if we hit a problem with some field. 
*/ + + r = json_dispatch_full(parameters, table, /* bad= */ NULL, /* flags= */ 0, userdata, &bad_field); + if (r < 0) { + if (bad_field) + return varlink_errorb(v, VARLINK_ERROR_INVALID_PARAMETER, + JSON_BUILD_OBJECT(JSON_BUILD_PAIR("parameter", JSON_BUILD_STRING(bad_field)))); + return r; + } + + return 0; +} + +int varlink_bind_reply(Varlink *v, VarlinkReply callback) { + assert_return(v, -EINVAL); + + if (callback && v->reply_callback && callback != v->reply_callback) + return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "A different callback was already set."); + + v->reply_callback = callback; + + return 0; +} + +void* varlink_set_userdata(Varlink *v, void *userdata) { + void *old; + + assert_return(v, NULL); + + old = v->userdata; + v->userdata = userdata; + + return old; +} + +void* varlink_get_userdata(Varlink *v) { + assert_return(v, NULL); + + return v->userdata; +} + +static int varlink_acquire_ucred(Varlink *v) { + int r; + + assert(v); + + if (v->ucred_acquired) + return 0; + + r = getpeercred(v->fd, &v->ucred); + if (r < 0) + return r; + + v->ucred_acquired = true; + return 0; +} + +int varlink_get_peer_uid(Varlink *v, uid_t *ret) { + int r; + + assert_return(v, -EINVAL); + assert_return(ret, -EINVAL); + + r = varlink_acquire_ucred(v); + if (r < 0) + return varlink_log_errno(v, r, "Failed to acquire credentials: %m"); + + if (!uid_is_valid(v->ucred.uid)) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENODATA), "Peer uid is invalid."); + + *ret = v->ucred.uid; + return 0; +} + +int varlink_get_peer_pid(Varlink *v, pid_t *ret) { + int r; + + assert_return(v, -EINVAL); + assert_return(ret, -EINVAL); + + r = varlink_acquire_ucred(v); + if (r < 0) + return varlink_log_errno(v, r, "Failed to acquire credentials: %m"); + + if (!pid_is_valid(v->ucred.pid)) + return varlink_log_errno(v, SYNTHETIC_ERRNO(ENODATA), "Peer uid is invalid."); + + *ret = v->ucred.pid; + return 0; +} + +int varlink_set_relative_timeout(Varlink *v, usec_t timeout) { + assert_return(v, 
-EINVAL); + assert_return(timeout > 0, -EINVAL); + + v->timeout = timeout; + return 0; +} + +VarlinkServer *varlink_get_server(Varlink *v) { + assert_return(v, NULL); + + return v->server; +} + +int varlink_set_description(Varlink *v, const char *description) { + assert_return(v, -EINVAL); + + return free_and_strdup(&v->description, description); +} + +static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Varlink *v = ASSERT_PTR(userdata); + + assert(s); + + handle_revents(v, revents); + (void) varlink_process(v); + + return 1; +} + +static int time_callback(sd_event_source *s, uint64_t usec, void *userdata) { + Varlink *v = ASSERT_PTR(userdata); + + assert(s); + + (void) varlink_process(v); + return 1; +} + +static int defer_callback(sd_event_source *s, void *userdata) { + Varlink *v = ASSERT_PTR(userdata); + + assert(s); + + (void) varlink_process(v); + return 1; +} + +static int prepare_callback(sd_event_source *s, void *userdata) { + Varlink *v = ASSERT_PTR(userdata); + int r, e; + usec_t until; + bool have_timeout; + + assert(s); + + e = varlink_get_events(v); + if (e < 0) + return e; + + r = sd_event_source_set_io_events(v->io_event_source, e); + if (r < 0) + return varlink_log_errno(v, r, "Failed to set source events: %m"); + + r = varlink_get_timeout(v, &until); + if (r < 0) + return r; + have_timeout = r > 0; + + if (have_timeout) { + r = sd_event_source_set_time(v->time_event_source, until); + if (r < 0) + return varlink_log_errno(v, r, "Failed to set source time: %m"); + } + + r = sd_event_source_set_enabled(v->time_event_source, have_timeout ? 
SD_EVENT_ON : SD_EVENT_OFF); + if (r < 0) + return varlink_log_errno(v, r, "Failed to enable event source: %m"); + + return 1; +} + +static int quit_callback(sd_event_source *event, void *userdata) { + Varlink *v = ASSERT_PTR(userdata); + + assert(event); + + varlink_flush(v); + varlink_close(v); + + return 1; +} + +int varlink_attach_event(Varlink *v, sd_event *e, int64_t priority) { + int r; + + assert_return(v, -EINVAL); + assert_return(!v->event, -EBUSY); + + if (e) + v->event = sd_event_ref(e); + else { + r = sd_event_default(&v->event); + if (r < 0) + return varlink_log_errno(v, r, "Failed to create event source: %m"); + } + + r = sd_event_add_time(v->event, &v->time_event_source, CLOCK_MONOTONIC, 0, 0, time_callback, v); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(v->time_event_source, priority); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(v->time_event_source, "varlink-time"); + + r = sd_event_add_exit(v->event, &v->quit_event_source, quit_callback, v); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(v->quit_event_source, priority); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(v->quit_event_source, "varlink-quit"); + + r = sd_event_add_io(v->event, &v->io_event_source, v->fd, 0, io_callback, v); + if (r < 0) + goto fail; + + r = sd_event_source_set_prepare(v->io_event_source, prepare_callback); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(v->io_event_source, priority); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(v->io_event_source, "varlink-io"); + + r = sd_event_add_defer(v->event, &v->defer_event_source, defer_callback, v); + if (r < 0) + goto fail; + + r = sd_event_source_set_priority(v->defer_event_source, priority); + if (r < 0) + goto fail; + + (void) sd_event_source_set_description(v->defer_event_source, "varlink-defer"); + + return 0; + +fail: + varlink_log_errno(v, r, "Failed to setup event source: %m"); + 
varlink_detach_event(v); + return r; +} + +void varlink_detach_event(Varlink *v) { + if (!v) + return; + + varlink_detach_event_sources(v); + + v->event = sd_event_unref(v->event); +} + +sd_event *varlink_get_event(Varlink *v) { + assert_return(v, NULL); + + return v->event; +} + +int varlink_push_fd(Varlink *v, int fd) { + int i; + + assert_return(v, -EINVAL); + assert_return(fd >= 0, -EBADF); + + /* Takes an fd to send along with the *next* varlink message sent via this varlink connection. This + * takes ownership of the specified fd. Use varlink_dup_fd() below to duplicate the fd first. */ + + if (!v->allow_fd_passing_output) + return -EPERM; + + if (v->n_pushed_fds >= INT_MAX) + return -ENOMEM; + + if (!GREEDY_REALLOC(v->pushed_fds, v->n_pushed_fds + 1)) + return -ENOMEM; + + i = (int) v->n_pushed_fds; + v->pushed_fds[v->n_pushed_fds++] = fd; + return i; +} + +int varlink_dup_fd(Varlink *v, int fd) { + _cleanup_close_ int dp = -1; + int r; + + assert_return(v, -EINVAL); + assert_return(fd >= 0, -EBADF); + + /* Like varlink_push_fd() but duplicates the specified fd instead of taking possession of it */ + + dp = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (dp < 0) + return -errno; + + r = varlink_push_fd(v, dp); + if (r < 0) + return r; + + TAKE_FD(dp); + return r; +} + +int varlink_reset_fds(Varlink *v) { + assert_return(v, -EINVAL); + + /* Closes all currently pending fds to send. This may be used whenever the caller is in the process + * of putting together a message with fds, and then eventually something fails and they need to + * rollback the fds. Note that this is implicitly called whenever an error reply is sent, see above. */ + + close_many(v->output_fds, v->n_output_fds); + v->n_output_fds = 0; + return 0; +} + +int varlink_peek_fd(Varlink *v, size_t i) { + assert_return(v, -EINVAL); + + /* Returns one of the file descriptors that were received along with the current message. 
This does + * not duplicate the fd nor invalidate it, it hence remains in our possession. */ + + if (!v->allow_fd_passing_input) + return -EPERM; + + if (i >= v->n_input_fds) + return -ENXIO; + + return v->input_fds[i]; +} + +int varlink_take_fd(Varlink *v, size_t i) { + assert_return(v, -EINVAL); + + /* Similar to varlink_peek_fd() but the file descriptor's ownership is passed to the caller, and + * we'll invalidate the reference to it under our possession. If called twice in a row will return + * -EBADF */ + + if (!v->allow_fd_passing_input) + return -EPERM; + + if (i >= v->n_input_fds) + return -ENXIO; + + return TAKE_FD(v->input_fds[i]); +} + +static int verify_unix_socket(Varlink *v) { + assert(v); + + if (v->af < 0) { + struct stat st; + + if (fstat(v->fd, &st) < 0) + return -errno; + if (!S_ISSOCK(st.st_mode)) { + v->af = AF_UNSPEC; + return -ENOTSOCK; + } + + v->af = socket_get_family(v->fd); + if (v->af < 0) + return v->af; + } + + return v->af == AF_UNIX ? 0 : -ENOMEDIUM; +} + +int varlink_set_allow_fd_passing_input(Varlink *v, bool b) { + int r; + + assert_return(v, -EINVAL); + + if (v->allow_fd_passing_input == b) + return 0; + + if (!b) { + v->allow_fd_passing_input = false; + return 1; + } + + r = verify_unix_socket(v); + if (r < 0) + return r; + + v->allow_fd_passing_input = true; + return 0; +} + +int varlink_set_allow_fd_passing_output(Varlink *v, bool b) { + int r; + + assert_return(v, -EINVAL); + + if (v->allow_fd_passing_output == b) + return 0; + + if (!b) { + v->allow_fd_passing_output = false; + return 1; + } + + r = verify_unix_socket(v); + if (r < 0) + return r; + + v->allow_fd_passing_output = true; + return 0; +} + +int varlink_server_new(VarlinkServer **ret, VarlinkServerFlags flags) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert_return(ret, -EINVAL); + assert_return((flags & ~_VARLINK_SERVER_FLAGS_ALL) == 0, -EINVAL); + + s = new(VarlinkServer, 1); + if (!s) + return log_oom_debug(); + + *s = 
(VarlinkServer) { + .n_ref = 1, + .flags = flags, + .connections_max = varlink_server_connections_max(NULL), + .connections_per_uid_max = varlink_server_connections_per_uid_max(NULL), + }; + + r = varlink_server_add_interface_many( + s, + &vl_interface_io_systemd, + &vl_interface_org_varlink_service); + if (r < 0) + return r; + + *ret = TAKE_PTR(s); + return 0; +} + +static VarlinkServer* varlink_server_destroy(VarlinkServer *s) { + char *m; + + if (!s) + return NULL; + + varlink_server_shutdown(s); + + while ((m = hashmap_steal_first_key(s->methods))) + free(m); + + hashmap_free(s->methods); + hashmap_free(s->interfaces); + hashmap_free(s->symbols); + hashmap_free(s->by_uid); + + sd_event_unref(s->event); + + free(s->description); + + return mfree(s); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(VarlinkServer, varlink_server, varlink_server_destroy); + +static int validate_connection(VarlinkServer *server, const struct ucred *ucred) { + int allowed = -1; + + assert(server); + assert(ucred); + + if (FLAGS_SET(server->flags, VARLINK_SERVER_ROOT_ONLY)) + allowed = ucred->uid == 0; + + if (FLAGS_SET(server->flags, VARLINK_SERVER_MYSELF_ONLY)) + allowed = allowed > 0 || ucred->uid == getuid(); + + if (allowed == 0) { /* Allow access when it is explicitly allowed or when neither + * VARLINK_SERVER_ROOT_ONLY nor VARLINK_SERVER_MYSELF_ONLY are specified. 
*/ + varlink_server_log(server, "Unprivileged client attempted connection, refusing."); + return 0; + } + + if (server->n_connections >= server->connections_max) { + varlink_server_log(server, "Connection limit of %u reached, refusing.", server->connections_max); + return 0; + } + + if (FLAGS_SET(server->flags, VARLINK_SERVER_ACCOUNT_UID)) { + unsigned c; + + if (!uid_is_valid(ucred->uid)) { + varlink_server_log(server, "Client with invalid UID attempted connection, refusing."); + return 0; + } + + c = PTR_TO_UINT(hashmap_get(server->by_uid, UID_TO_PTR(ucred->uid))); + if (c >= server->connections_per_uid_max) { + varlink_server_log(server, "Per-UID connection limit of %u reached, refusing.", + server->connections_per_uid_max); + return 0; + } + } + + return 1; +} + +static int count_connection(VarlinkServer *server, const struct ucred *ucred) { + unsigned c; + int r; + + assert(server); + assert(ucred); + + server->n_connections++; + + if (FLAGS_SET(server->flags, VARLINK_SERVER_ACCOUNT_UID)) { + r = hashmap_ensure_allocated(&server->by_uid, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to allocate UID hash table: %m"); + + c = PTR_TO_UINT(hashmap_get(server->by_uid, UID_TO_PTR(ucred->uid))); + + varlink_server_log(server, "Connections of user " UID_FMT ": %u (of %u max)", + ucred->uid, c, server->connections_per_uid_max); + + r = hashmap_replace(server->by_uid, UID_TO_PTR(ucred->uid), UINT_TO_PTR(c + 1)); + if (r < 0) + return log_debug_errno(r, "Failed to increment counter in UID hash table: %m"); + } + + return 0; +} + +int varlink_server_add_connection(VarlinkServer *server, int fd, Varlink **ret) { + _cleanup_(varlink_unrefp) Varlink *v = NULL; + struct ucred ucred = UCRED_INVALID; + bool ucred_acquired; + int r; + + assert_return(server, -EINVAL); + assert_return(fd >= 0, -EBADF); + + if ((server->flags & (VARLINK_SERVER_ROOT_ONLY|VARLINK_SERVER_ACCOUNT_UID)) != 0) { + r = getpeercred(fd, &ucred); + if (r < 0) + return 
varlink_server_log_errno(server, r, "Failed to acquire peer credentials of incoming socket, refusing: %m"); + + ucred_acquired = true; + + r = validate_connection(server, &ucred); + if (r < 0) + return r; + if (r == 0) + return -EPERM; + } else + ucred_acquired = false; + + r = varlink_new(&v); + if (r < 0) + return varlink_server_log_errno(server, r, "Failed to allocate connection object: %m"); + + r = count_connection(server, &ucred); + if (r < 0) + return r; + + v->fd = fd; + if (server->flags & VARLINK_SERVER_INHERIT_USERDATA) + v->userdata = server->userdata; + + if (ucred_acquired) { + v->ucred = ucred; + v->ucred_acquired = true; + } + + _cleanup_free_ char *desc = NULL; + if (asprintf(&desc, "%s-%i", server->description ?: "varlink", v->fd) >= 0) + v->description = TAKE_PTR(desc); + + /* Link up the server and the connection, and take reference in both directions. Note that the + * reference on the connection is left dangling. It will be dropped when the connection is closed, + * which happens in varlink_close(), including in the event loop quit callback. 
*/ + v->server = varlink_server_ref(server); + varlink_ref(v); + + varlink_set_state(v, VARLINK_IDLE_SERVER); + + if (server->event) { + r = varlink_attach_event(v, server->event, server->event_priority); + if (r < 0) { + varlink_log_errno(v, r, "Failed to attach new connection: %m"); + v->fd = -EBADF; /* take the fd out of the connection again */ + varlink_close(v); + return r; + } + } + + if (ret) + *ret = v; + + return 0; +} + +static VarlinkServerSocket *varlink_server_socket_free(VarlinkServerSocket *ss) { + if (!ss) + return NULL; + + free(ss->address); + return mfree(ss); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkServerSocket *, varlink_server_socket_free); + +static int connect_callback(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + VarlinkServerSocket *ss = ASSERT_PTR(userdata); + _cleanup_close_ int cfd = -EBADF; + Varlink *v = NULL; + int r; + + assert(source); + + varlink_server_log(ss->server, "New incoming connection."); + + cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (cfd < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return varlink_server_log_errno(ss->server, errno, "Failed to accept incoming socket: %m"); + } + + r = varlink_server_add_connection(ss->server, cfd, &v); + if (r < 0) + return 0; + + TAKE_FD(cfd); + + if (ss->server->connect_callback) { + r = ss->server->connect_callback(ss->server, v, ss->server->userdata); + if (r < 0) { + varlink_log_errno(v, r, "Connection callback returned error, disconnecting client: %m"); + varlink_close(v); + return 0; + } + } + + return 0; +} + +static int varlink_server_create_listen_fd_socket(VarlinkServer *s, int fd, VarlinkServerSocket **ret_ss) { + _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *ss = NULL; + int r; + + assert(s); + assert(fd >= 0); + assert(ret_ss); + + ss = new(VarlinkServerSocket, 1); + if (!ss) + return log_oom_debug(); + + *ss = (VarlinkServerSocket) { + .server = s, + .fd = fd, + }; + + if (s->event) { + r = 
sd_event_add_io(s->event, &ss->event_source, fd, EPOLLIN, connect_callback, ss); + if (r < 0) + return r; + + r = sd_event_source_set_priority(ss->event_source, s->event_priority); + if (r < 0) + return r; + } + + *ret_ss = TAKE_PTR(ss); + return 0; +} + +int varlink_server_listen_fd(VarlinkServer *s, int fd) { + _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *ss = NULL; + int r; + + assert_return(s, -EINVAL); + assert_return(fd >= 0, -EBADF); + + r = fd_nonblock(fd, true); + if (r < 0) + return r; + + r = fd_cloexec(fd, true); + if (r < 0) + return r; + + r = varlink_server_create_listen_fd_socket(s, fd, &ss); + if (r < 0) + return r; + + LIST_PREPEND(sockets, s->sockets, TAKE_PTR(ss)); + return 0; +} + +int varlink_server_listen_address(VarlinkServer *s, const char *address, mode_t m) { + _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *ss = NULL; + union sockaddr_union sockaddr; + socklen_t sockaddr_len; + _cleanup_close_ int fd = -EBADF; + int r; + + assert_return(s, -EINVAL); + assert_return(address, -EINVAL); + assert_return((m & ~0777) == 0, -EINVAL); + + r = sockaddr_un_set_path(&sockaddr.un, address); + if (r < 0) + return r; + sockaddr_len = r; + + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + fd = fd_move_above_stdio(fd); + + (void) sockaddr_un_unlink(&sockaddr.un); + + WITH_UMASK(~m & 0777) { + r = mac_selinux_bind(fd, &sockaddr.sa, sockaddr_len); + if (r < 0) + return r; + } + + if (listen(fd, SOMAXCONN_DELUXE) < 0) + return -errno; + + r = varlink_server_create_listen_fd_socket(s, fd, &ss); + if (r < 0) + return r; + + r = free_and_strdup(&ss->address, address); + if (r < 0) + return r; + + LIST_PREPEND(sockets, s->sockets, TAKE_PTR(ss)); + TAKE_FD(fd); + return 0; +} + +int varlink_server_listen_auto(VarlinkServer *s) { + _cleanup_strv_free_ char **names = NULL; + int r, n = 0; + + assert_return(s, -EINVAL); + + /* Adds all passed fds marked as "varlink" to our varlink 
server. These fds can either refer to a + * listening socket or to a connection socket. + * + * See https://varlink.org/#activation for the environment variables this is backed by and the + * recommended "varlink" identifier in $LISTEN_FDNAMES. */ + + r = sd_listen_fds_with_names(/* unset_environment= */ false, &names); + if (r < 0) + return r; + + for (int i = 0; i < r; i++) { + int b, fd; + socklen_t l = sizeof(b); + + if (!streq(names[i], "varlink")) + continue; + + fd = SD_LISTEN_FDS_START + i; + + if (getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &b, &l) < 0) + return -errno; + + assert(l == sizeof(b)); + + if (b) /* Listening socket? */ + r = varlink_server_listen_fd(s, fd); + else /* Otherwise assume connection socket */ + r = varlink_server_add_connection(s, fd, NULL); + if (r < 0) + return r; + + n++; + } + + return n; +} + +void* varlink_server_set_userdata(VarlinkServer *s, void *userdata) { + void *ret; + + assert_return(s, NULL); + + ret = s->userdata; + s->userdata = userdata; + + return ret; +} + +void* varlink_server_get_userdata(VarlinkServer *s) { + assert_return(s, NULL); + + return s->userdata; +} + +int varlink_server_loop_auto(VarlinkServer *server) { + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int r; + + assert_return(server, -EINVAL); + assert_return(!server->event, -EBUSY); + + /* Runs a Varlink service event loop populated with a passed fd. Exits on the last connection. 
*/ + + r = sd_event_new(&event); + if (r < 0) + return r; + + r = varlink_server_set_exit_on_idle(server, true); + if (r < 0) + return r; + + r = varlink_server_attach_event(server, event, 0); + if (r < 0) + return r; + + r = varlink_server_listen_auto(server); + if (r < 0) + return r; + + return sd_event_loop(event); +} + +static VarlinkServerSocket* varlink_server_socket_destroy(VarlinkServerSocket *ss) { + if (!ss) + return NULL; + + if (ss->server) + LIST_REMOVE(sockets, ss->server->sockets, ss); + + sd_event_source_disable_unref(ss->event_source); + + free(ss->address); + safe_close(ss->fd); + + return mfree(ss); +} + +int varlink_server_shutdown(VarlinkServer *s) { + assert_return(s, -EINVAL); + + while (s->sockets) + varlink_server_socket_destroy(s->sockets); + + return 0; +} + +static void varlink_server_test_exit_on_idle(VarlinkServer *s) { + assert(s); + + if (s->exit_on_idle && s->event && s->n_connections == 0) + (void) sd_event_exit(s->event, 0); +} + +int varlink_server_set_exit_on_idle(VarlinkServer *s, bool b) { + assert_return(s, -EINVAL); + + s->exit_on_idle = b; + varlink_server_test_exit_on_idle(s); + return 0; +} + +static int varlink_server_add_socket_event_source(VarlinkServer *s, VarlinkServerSocket *ss, int64_t priority) { + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL; + int r; + + assert(s); + assert(s->event); + assert(ss); + assert(ss->fd >= 0); + assert(!ss->event_source); + + r = sd_event_add_io(s->event, &es, ss->fd, EPOLLIN, connect_callback, ss); + if (r < 0) + return r; + + r = sd_event_source_set_priority(es, priority); + if (r < 0) + return r; + + ss->event_source = TAKE_PTR(es); + return 0; +} + +int varlink_server_attach_event(VarlinkServer *s, sd_event *e, int64_t priority) { + int r; + + assert_return(s, -EINVAL); + assert_return(!s->event, -EBUSY); + + if (e) + s->event = sd_event_ref(e); + else { + r = sd_event_default(&s->event); + if (r < 0) + return r; + } + + LIST_FOREACH(sockets, ss, s->sockets) { + r 
= varlink_server_add_socket_event_source(s, ss, priority); + if (r < 0) + goto fail; + } + + s->event_priority = priority; + return 0; + +fail: + varlink_server_detach_event(s); + return r; +} + +int varlink_server_detach_event(VarlinkServer *s) { + assert_return(s, -EINVAL); + + LIST_FOREACH(sockets, ss, s->sockets) + ss->event_source = sd_event_source_disable_unref(ss->event_source); + + sd_event_unref(s->event); + return 0; +} + +sd_event *varlink_server_get_event(VarlinkServer *s) { + assert_return(s, NULL); + + return s->event; +} + +static bool varlink_symbol_in_interface(const char *method, const char *interface) { + const char *p; + + assert(method); + assert(interface); + + p = startswith(method, interface); + if (!p) + return false; + + if (*p != '.') + return false; + + return !strchr(p+1, '.'); +} + +int varlink_server_bind_method(VarlinkServer *s, const char *method, VarlinkMethod callback) { + _cleanup_free_ char *m = NULL; + int r; + + assert_return(s, -EINVAL); + assert_return(method, -EINVAL); + assert_return(callback, -EINVAL); + + if (varlink_symbol_in_interface(method, "org.varlink.service") || + varlink_symbol_in_interface(method, "io.systemd")) + return log_debug_errno(SYNTHETIC_ERRNO(EEXIST), "Cannot bind server to '%s'.", method); + + m = strdup(method); + if (!m) + return log_oom_debug(); + + r = hashmap_ensure_put(&s->methods, &string_hash_ops, m, callback); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0) + return log_debug_errno(r, "Failed to register callback: %m"); + if (r > 0) + TAKE_PTR(m); + + return 0; +} + +int varlink_server_bind_method_many_internal(VarlinkServer *s, ...) 
{ + va_list ap; + int r = 0; + + assert_return(s, -EINVAL); + + va_start(ap, s); + for (;;) { + VarlinkMethod callback; + const char *method; + + method = va_arg(ap, const char *); + if (!method) + break; + + callback = va_arg(ap, VarlinkMethod); + + r = varlink_server_bind_method(s, method, callback); + if (r < 0) + break; + } + va_end(ap); + + return r; +} + +int varlink_server_bind_connect(VarlinkServer *s, VarlinkConnect callback) { + assert_return(s, -EINVAL); + + if (callback && s->connect_callback && callback != s->connect_callback) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "A different callback was already set."); + + s->connect_callback = callback; + return 0; +} + +int varlink_server_bind_disconnect(VarlinkServer *s, VarlinkDisconnect callback) { + assert_return(s, -EINVAL); + + if (callback && s->disconnect_callback && callback != s->disconnect_callback) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "A different callback was already set."); + + s->disconnect_callback = callback; + return 0; +} + +int varlink_server_add_interface(VarlinkServer *s, const VarlinkInterface *interface) { + int r; + + assert_return(s, -EINVAL); + assert_return(interface, -EINVAL); + assert_return(interface->name, -EINVAL); + + if (hashmap_contains(s->interfaces, interface->name)) + return log_debug_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate registration of interface '%s'.", interface->name); + + r = hashmap_ensure_put(&s->interfaces, &string_hash_ops, interface->name, (void*) interface); + if (r < 0) + return r; + + for (const VarlinkSymbol *const*symbol = interface->symbols; *symbol; symbol++) { + _cleanup_free_ char *j = NULL; + + /* We only ever want to validate method calls/replies and errors against the interface + * definitions, hence don't bother with the type symbols */ + if (!IN_SET((*symbol)->symbol_type, VARLINK_METHOD, VARLINK_ERROR)) + continue; + + j = strjoin(interface->name, ".", (*symbol)->name); + if (!j) + return -ENOMEM; + + r = 
hashmap_ensure_put(&s->symbols, &string_hash_ops_free, j, (void*) *symbol); + if (r < 0) + return r; + + TAKE_PTR(j); + } + + return 0; +} + +int varlink_server_add_interface_many_internal(VarlinkServer *s, ...) { + va_list ap; + int r = 0; + + assert_return(s, -EINVAL); + + va_start(ap, s); + for (;;) { + const VarlinkInterface *interface = va_arg(ap, const VarlinkInterface*); + if (!interface) + break; + + r = varlink_server_add_interface(s, interface); + if (r < 0) + break; + } + va_end(ap); + + return r; +} + +unsigned varlink_server_connections_max(VarlinkServer *s) { + int dts; + + /* If a server is specified, return the setting for that server, otherwise the default value */ + if (s) + return s->connections_max; + + dts = getdtablesize(); + assert_se(dts > 0); + + /* Make sure we never use up more than ¾th of RLIMIT_NOFILE for IPC */ + if (VARLINK_DEFAULT_CONNECTIONS_MAX > (unsigned) dts / 4 * 3) + return dts / 4 * 3; + + return VARLINK_DEFAULT_CONNECTIONS_MAX; +} + +unsigned varlink_server_connections_per_uid_max(VarlinkServer *s) { + unsigned m; + + if (s) + return s->connections_per_uid_max; + + /* Make sure to never use up more than ¾th of available connections for a single user */ + m = varlink_server_connections_max(NULL); + if (VARLINK_DEFAULT_CONNECTIONS_PER_UID_MAX > m) + return m / 4 * 3; + + return VARLINK_DEFAULT_CONNECTIONS_PER_UID_MAX; +} + +int varlink_server_set_connections_per_uid_max(VarlinkServer *s, unsigned m) { + assert_return(s, -EINVAL); + assert_return(m > 0, -EINVAL); + + s->connections_per_uid_max = m; + return 0; +} + +int varlink_server_set_connections_max(VarlinkServer *s, unsigned m) { + assert_return(s, -EINVAL); + assert_return(m > 0, -EINVAL); + + s->connections_max = m; + return 0; +} + +unsigned varlink_server_current_connections(VarlinkServer *s) { + assert_return(s, UINT_MAX); + + return s->n_connections; +} + +int varlink_server_set_description(VarlinkServer *s, const char *description) { + assert_return(s, -EINVAL); + 
+ return free_and_strdup(&s->description, description); +} + +int varlink_server_serialize(VarlinkServer *s, FILE *f, FDSet *fds) { + assert(f); + assert(fds); + + if (!s) + return 0; + + LIST_FOREACH(sockets, ss, s->sockets) { + int copy; + + assert(ss->address); + assert(ss->fd >= 0); + + fprintf(f, "varlink-server-socket-address=%s", ss->address); + + /* If we fail to serialize the fd, it will be considered an error during deserialization */ + copy = fdset_put_dup(fds, ss->fd); + if (copy < 0) + return copy; + + fprintf(f, " varlink-server-socket-fd=%i", copy); + + fputc('\n', f); + } + + return 0; +} + +int varlink_server_deserialize_one(VarlinkServer *s, const char *value, FDSet *fds) { + _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *ss = NULL; + _cleanup_free_ char *address = NULL; + const char *v = ASSERT_PTR(value); + int r, fd = -EBADF; + char *buf; + size_t n; + + assert(s); + assert(fds); + + n = strcspn(v, " "); + address = strndup(v, n); + if (!address) + return log_oom_debug(); + + if (v[n] != ' ') + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to deserialize VarlinkServerSocket: %s: %m", value); + v = startswith(v + n + 1, "varlink-server-socket-fd="); + if (!v) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to deserialize VarlinkServerSocket fd %s: %m", value); + + n = strcspn(v, " "); + buf = strndupa_safe(v, n); + + fd = parse_fd(buf); + if (fd < 0) + return log_debug_errno(fd, "Unable to parse VarlinkServerSocket varlink-server-socket-fd=%s: %m", buf); + if (!fdset_contains(fds, fd)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADF), + "VarlinkServerSocket varlink-server-socket-fd= has unknown fd %d: %m", fd); + + ss = new(VarlinkServerSocket, 1); + if (!ss) + return log_oom_debug(); + + *ss = (VarlinkServerSocket) { + .server = s, + .address = TAKE_PTR(address), + .fd = fdset_remove(fds, fd), + }; + + r = varlink_server_add_socket_event_source(s, ss, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return 
log_debug_errno(r, "Failed to add VarlinkServerSocket event source to the event loop: %m"); + + LIST_PREPEND(sockets, s->sockets, TAKE_PTR(ss)); + return 0; +} + +int varlink_invocation(VarlinkInvocationFlags flags) { + _cleanup_strv_free_ char **names = NULL; + int r, b; + socklen_t l = sizeof(b); + + /* Returns true if this is a "pure" varlink server invocation, i.e. with one fd passed. */ + + r = sd_listen_fds_with_names(/* unset_environment= */ false, &names); + if (r < 0) + return r; + if (r == 0) + return false; + if (r > 1) + return -ETOOMANYREFS; + + if (!strv_equal(names, STRV_MAKE("varlink"))) + return false; + + if (FLAGS_SET(flags, VARLINK_ALLOW_LISTEN|VARLINK_ALLOW_ACCEPT)) /* Both flags set? Then allow everything */ + return true; + + if ((flags & (VARLINK_ALLOW_LISTEN|VARLINK_ALLOW_ACCEPT)) == 0) /* Neither is set, then fail */ + return -EISCONN; + + if (getsockopt(SD_LISTEN_FDS_START, SOL_SOCKET, SO_ACCEPTCONN, &b, &l) < 0) + return -errno; + + assert(l == sizeof(b)); + + if (!FLAGS_SET(flags, b ? VARLINK_ALLOW_LISTEN : VARLINK_ALLOW_ACCEPT)) + return -EISCONN; + + return true; +} diff --git a/src/shared/varlink.h b/src/shared/varlink.h new file mode 100644 index 0000000..6ec708a --- /dev/null +++ b/src/shared/varlink.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-event.h" + +#include "json.h" +#include "time-util.h" +#include "varlink-idl.h" + +/* A minimal Varlink implementation. We only implement the minimal, obvious bits here though. No validation, + * no introspection, no name service, just the stuff actually needed. + * + * You might wonder why we aren't using libvarlink here? Varlink is a very simple protocol, which allows us + * to write our own implementation relatively easily. However, the main reasons are these: + * + * • We want to use our own JSON subsystem, with all the benefits that brings (i.e. 
accurate unsigned+signed
 *   64-bit integers, full fuzzing, logging during parsing and so on). If we'd want to use that with
 *   libvarlink we'd have to serialize and deserialize all the time from its own representation which is
 *   inefficient and nasty.
 *
 * • We want integration into sd-event, but also synchronous event-loop-less operation
 *
 * • We need proper per-UID accounting and access control, since we want to allow communication between
 *   unprivileged clients and privileged servers.
 *
 * • And of course, we don't want the name service and introspection stuff for now (though that might
 *   change).
 */

/* Opaque connection and server objects; their layout is private to the implementation file. */
typedef struct Varlink Varlink;
typedef struct VarlinkServer VarlinkServer;

/* Bit flags handed to VarlinkReply callbacks. */
typedef enum VarlinkReplyFlags {
        VARLINK_REPLY_ERROR     = 1 << 0,
        VARLINK_REPLY_CONTINUES = 1 << 1,
        VARLINK_REPLY_LOCAL     = 1 << 2,
} VarlinkReplyFlags;

/* Bit flags handed to VarlinkMethod callbacks. */
typedef enum VarlinkMethodFlags {
        VARLINK_METHOD_ONEWAY = 1 << 0,
        /* NOTE(review): "2 << 1" evaluates to 4 (bit 2), so bit 1 is left unused. This looks like a typo for
         * "1 << 1", but is harmless as long as the flag is only ever referenced by name. Confirm before
         * changing the value. */
        VARLINK_METHOD_MORE   = 2 << 1,
} VarlinkMethodFlags;

typedef enum VarlinkServerFlags {
        VARLINK_SERVER_ROOT_ONLY        = 1 << 0, /* Only accessible by root */
        VARLINK_SERVER_MYSELF_ONLY      = 1 << 1, /* Only accessible by our own UID */
        VARLINK_SERVER_ACCOUNT_UID      = 1 << 2, /* Do per user accounting */
        VARLINK_SERVER_INHERIT_USERDATA = 1 << 3, /* Initialize Varlink connection userdata from VarlinkServer userdata */
        _VARLINK_SERVER_FLAGS_ALL       = (1 << 4) - 1, /* Mask covering the four flag bits above */
} VarlinkServerFlags;

/* Callback signatures: method handler, reply handler, and the server-side connect/disconnect hooks. */
typedef int (*VarlinkMethod)(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
typedef int (*VarlinkReply)(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata);
typedef int (*VarlinkConnect)(VarlinkServer *server, Varlink *link, void *userdata);
typedef void (*VarlinkDisconnect)(VarlinkServer *server, Varlink *link, void *userdata);

int varlink_connect_address(Varlink **ret, const char *address);
int varlink_connect_exec(Varlink **ret, const char *command, char **argv);
int
varlink_connect_url(Varlink **ret, const char *url); +int varlink_connect_fd(Varlink **ret, int fd); + +Varlink* varlink_ref(Varlink *link); +Varlink* varlink_unref(Varlink *v); + +int varlink_get_fd(Varlink *v); +int varlink_get_events(Varlink *v); +int varlink_get_timeout(Varlink *v, usec_t *ret); + +int varlink_attach_event(Varlink *v, sd_event *e, int64_t priority); +void varlink_detach_event(Varlink *v); +sd_event *varlink_get_event(Varlink *v); + +int varlink_process(Varlink *v); +int varlink_wait(Varlink *v, usec_t timeout); + +int varlink_is_idle(Varlink *v); + +int varlink_flush(Varlink *v); +int varlink_close(Varlink *v); + +Varlink* varlink_flush_close_unref(Varlink *v); +Varlink* varlink_close_unref(Varlink *v); + +/* Enqueue method call, not expecting a reply */ +int varlink_send(Varlink *v, const char *method, JsonVariant *parameters); +int varlink_sendb(Varlink *v, const char *method, ...); + +/* Send method call and wait for reply */ +int varlink_call(Varlink *v, const char *method, JsonVariant *parameters, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags); +int varlink_callb(Varlink *v, const char *method, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags, ...); + +/* Send method call and begin collecting all 'more' replies into an array, finishing when a final reply is sent */ +int varlink_collect(Varlink *v, const char *method, JsonVariant *parameters, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags); +int varlink_collectb(Varlink *v, const char *method, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags, ...); + +/* Enqueue method call, expect a reply, which is eventually delivered to the reply callback */ +int varlink_invoke(Varlink *v, const char *method, JsonVariant *parameters); +int varlink_invokeb(Varlink *v, const char *method, ...); + +/* Enqueue method call, expect a reply now, and possibly 
more later, which are all delivered to the reply callback */ +int varlink_observe(Varlink *v, const char *method, JsonVariant *parameters); +int varlink_observeb(Varlink *v, const char *method, ...); + +/* Enqueue a final reply */ +int varlink_reply(Varlink *v, JsonVariant *parameters); +int varlink_replyb(Varlink *v, ...); + +/* Enqueue a (final) error */ +int varlink_error(Varlink *v, const char *error_id, JsonVariant *parameters); +int varlink_errorb(Varlink *v, const char *error_id, ...); +int varlink_error_invalid_parameter(Varlink *v, JsonVariant *parameters); +int varlink_error_errno(Varlink *v, int error); + +/* Enqueue a "more" reply */ +int varlink_notify(Varlink *v, JsonVariant *parameters); +int varlink_notifyb(Varlink *v, ...); + +/* Parsing incoming data via json_dispatch() and generate a nice error on parse errors */ +int varlink_dispatch(Varlink *v, JsonVariant *parameters, const JsonDispatch table[], void *userdata); + +/* Write outgoing fds into the socket (to be associated with the next enqueued message) */ +int varlink_push_fd(Varlink *v, int fd); +int varlink_dup_fd(Varlink *v, int fd); +int varlink_reset_fds(Varlink *v); + +/* Read incoming fds from the socket (associated with the currently handled message) */ +int varlink_peek_fd(Varlink *v, size_t i); +int varlink_take_fd(Varlink *v, size_t i); + +int varlink_set_allow_fd_passing_input(Varlink *v, bool b); +int varlink_set_allow_fd_passing_output(Varlink *v, bool b); + +/* Bind a disconnect, reply or timeout callback */ +int varlink_bind_reply(Varlink *v, VarlinkReply reply); + +void* varlink_set_userdata(Varlink *v, void *userdata); +void* varlink_get_userdata(Varlink *v); + +int varlink_get_peer_uid(Varlink *v, uid_t *ret); +int varlink_get_peer_pid(Varlink *v, pid_t *ret); + +int varlink_set_relative_timeout(Varlink *v, usec_t usec); + +VarlinkServer* varlink_get_server(Varlink *v); + +int varlink_set_description(Varlink *v, const char *d); + +/* Create a varlink server */ +int 
varlink_server_new(VarlinkServer **ret, VarlinkServerFlags flags); +VarlinkServer *varlink_server_ref(VarlinkServer *s); +VarlinkServer *varlink_server_unref(VarlinkServer *s); + +/* Add addresses or fds to listen on */ +int varlink_server_listen_address(VarlinkServer *s, const char *address, mode_t mode); +int varlink_server_listen_fd(VarlinkServer *s, int fd); +int varlink_server_listen_auto(VarlinkServer *s); +int varlink_server_add_connection(VarlinkServer *s, int fd, Varlink **ret); + +/* Bind callbacks */ +int varlink_server_bind_method(VarlinkServer *s, const char *method, VarlinkMethod callback); +int varlink_server_bind_method_many_internal(VarlinkServer *s, ...); +#define varlink_server_bind_method_many(s, ...) varlink_server_bind_method_many_internal(s, __VA_ARGS__, NULL) +int varlink_server_bind_connect(VarlinkServer *s, VarlinkConnect connect); +int varlink_server_bind_disconnect(VarlinkServer *s, VarlinkDisconnect disconnect); + +/* Add interface definition */ +int varlink_server_add_interface(VarlinkServer *s, const VarlinkInterface *interface); +int varlink_server_add_interface_many_internal(VarlinkServer *s, ...); +#define varlink_server_add_interface_many(s, ...) 
varlink_server_add_interface_many_internal(s, __VA_ARGS__, NULL)

void* varlink_server_set_userdata(VarlinkServer *s, void *userdata);
void* varlink_server_get_userdata(VarlinkServer *s);

int varlink_server_attach_event(VarlinkServer *v, sd_event *e, int64_t priority);
int varlink_server_detach_event(VarlinkServer *v);
sd_event *varlink_server_get_event(VarlinkServer *v);

int varlink_server_loop_auto(VarlinkServer *server);

int varlink_server_shutdown(VarlinkServer *server);

int varlink_server_set_exit_on_idle(VarlinkServer *s, bool b);

/* Query the connection limits. With s == NULL these return the built-in default rather than a per-server
 * setting. */
unsigned varlink_server_connections_max(VarlinkServer *s);
unsigned varlink_server_connections_per_uid_max(VarlinkServer *s);

/* Override the connection limits of a server; a limit of zero is refused with -EINVAL. */
int varlink_server_set_connections_per_uid_max(VarlinkServer *s, unsigned m);
int varlink_server_set_connections_max(VarlinkServer *s, unsigned m);

/* Returns the server's current connection counter, UINT_MAX if s == NULL. */
unsigned varlink_server_current_connections(VarlinkServer *s);

int varlink_server_set_description(VarlinkServer *s, const char *description);

/* Selects which kind of passed socket varlink_invocation() accepts. */
typedef enum VarlinkInvocationFlags {
        VARLINK_ALLOW_LISTEN = 1 << 0, /* Accept a socket for which SO_ACCEPTCONN reports true */
        VARLINK_ALLOW_ACCEPT = 1 << 1, /* Accept a socket for which SO_ACCEPTCONN reports false */
        _VARLINK_SERVER_INVOCATION_FLAGS_MAX = (1 << 2) - 1,
        _VARLINK_SERVER_INVOCATION_FLAGS_INVALID = -EINVAL,
} VarlinkInvocationFlags;

int varlink_invocation(VarlinkInvocationFlags flags);

DEFINE_TRIVIAL_CLEANUP_FUNC(Varlink *, varlink_unref);
DEFINE_TRIVIAL_CLEANUP_FUNC(Varlink *, varlink_close_unref);
DEFINE_TRIVIAL_CLEANUP_FUNC(Varlink *, varlink_flush_close_unref);
DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkServer *, varlink_server_unref);

/* These are local errors that never cross the wire, and are our own invention */
#define VARLINK_ERROR_DISCONNECTED "io.systemd.Disconnected"
#define VARLINK_ERROR_TIMEOUT "io.systemd.TimedOut"
#define VARLINK_ERROR_PROTOCOL "io.systemd.Protocol"

/* This one we invented, and use for generically propagating system errors (errno) to clients */
#define VARLINK_ERROR_SYSTEM "io.systemd.System"

/* These are errors defined in the Varlink
spec */ +#define VARLINK_ERROR_INTERFACE_NOT_FOUND "org.varlink.service.InterfaceNotFound" +#define VARLINK_ERROR_METHOD_NOT_FOUND "org.varlink.service.MethodNotFound" +#define VARLINK_ERROR_METHOD_NOT_IMPLEMENTED "org.varlink.service.MethodNotImplemented" +#define VARLINK_ERROR_INVALID_PARAMETER "org.varlink.service.InvalidParameter" + +/* These are errors we came up with and squatted the namespace with */ +#define VARLINK_ERROR_PERMISSION_DENIED "org.varlink.service.PermissionDenied" +#define VARLINK_ERROR_EXPECTED_MORE "org.varlink.service.ExpectedMore" diff --git a/src/shared/verb-log-control.c b/src/shared/verb-log-control.c new file mode 100644 index 0000000..555fb9f --- /dev/null +++ b/src/shared/verb-log-control.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "bus-error.h" +#include "log.h" +#include "strv.h" +#include "syslog-util.h" +#include "verb-log-control.h" + +int verb_log_control_common(sd_bus *bus, const char *destination, const char *verb, const char *value) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool level = endswith(verb, "log-level"); + const BusLocator bloc = { + .destination = destination, + .path = "/org/freedesktop/LogControl1", + .interface = "org.freedesktop.LogControl1", + }; + int r; + + assert(bus); + assert(endswith(verb, "log-level") || endswith(verb, "log-target")); + + if (value) { + if (level) { + r = log_level_from_string(value); + if (r < 0) + return log_error_errno(r, "\"%s\" is not a valid log level.", value); + } + + r = bus_set_property(bus, &bloc, + level ? "LogLevel" : "LogTarget", + &error, "s", value); + if (r < 0) + return log_error_errno(r, "Failed to set log %s of %s to %s: %s", + level ? "level" : "target", + bloc.destination, value, bus_error_message(&error, r)); + } else { + _cleanup_free_ char *t = NULL; + + r = bus_get_property_string(bus, &bloc, + level ? 
"LogLevel" : "LogTarget", + &error, &t); + if (r < 0) + return log_error_errno(r, "Failed to get log %s of %s: %s", + level ? "level" : "target", + bloc.destination, bus_error_message(&error, r)); + puts(t); + } + + return 0; +} diff --git a/src/shared/verb-log-control.h b/src/shared/verb-log-control.h new file mode 100644 index 0000000..b9e7cdd --- /dev/null +++ b/src/shared/verb-log-control.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "bus-locator.h" + +int verb_log_control_common(sd_bus *bus, const char *destination, const char *verb, const char *value); diff --git a/src/shared/verbs.c b/src/shared/verbs.c new file mode 100644 index 0000000..a010952 --- /dev/null +++ b/src/shared/verbs.c @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "env-util.h" +#include "log.h" +#include "macro.h" +#include "process-util.h" +#include "string-util.h" +#include "verbs.h" +#include "virt.h" + +/* Wraps running_in_chroot() which is used in various places, but also adds an environment variable check so external + * processes can reliably force this on. + */ +bool running_in_chroot_or_offline(void) { + int r; + + /* Added to support use cases like rpm-ostree, where from %post scripts we only want to execute "preset", but + * not "start"/"restart" for example. + * + * See docs/ENVIRONMENT.md for docs. + */ + r = getenv_bool("SYSTEMD_OFFLINE"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_OFFLINE: %m"); + else if (r >= 0) + return r > 0; + + /* We've had this condition check for a long time which basically checks for legacy chroot case like Fedora's + * "mock", which is used for package builds. 
We don't want to try to start systemd services there, since + * without --new-chroot we don't even have systemd running, and even if we did, adding a concept of background + * daemons to builds would be an enormous change, requiring considering things like how the journal output is + * handled, etc. And there's really not a use case today for a build talking to a service. + * + * Note this call itself also looks for a different variable SYSTEMD_IGNORE_CHROOT=1. + */ + r = running_in_chroot(); + if (r < 0) + log_debug_errno(r, "running_in_chroot(): %m"); + + return r > 0; +} + +const Verb* verbs_find_verb(const char *name, const Verb verbs[]) { + assert(verbs); + + for (size_t i = 0; verbs[i].dispatch; i++) + if (name ? streq(name, verbs[i].verb) : FLAGS_SET(verbs[i].flags, VERB_DEFAULT)) + return verbs + i; + + /* At the end of the list? */ + return NULL; +} + +static const Verb* verbs_find_prefix_verb(const char *name, const Verb verbs[]) { + size_t best_distance = SIZE_MAX; + const Verb *best = NULL; + + assert(verbs); + + if (!name) + return NULL; + + for (size_t i = 0; verbs[i].dispatch; i++) { + const char *e; + size_t l; + + e = startswith(verbs[i].verb, name); + if (!e) + continue; + + l = strlen(e); + if (l < best_distance) { + best_distance = l; + best = verbs + i; + } + } + + return best; +} + +static const Verb* verbs_find_closest_verb(const char *name, const Verb verbs[]) { + ssize_t best_distance = SSIZE_MAX; + const Verb *best = NULL; + + assert(verbs); + + if (!name) + return NULL; + + for (size_t i = 0; verbs[i].dispatch; i++) { + ssize_t distance; + + distance = strlevenshtein(verbs[i].verb, name); + if (distance < 0) { + log_debug_errno(distance, "Failed to determine Levenshtein distance between %s and %s: %m", verbs[i].verb, name); + return NULL; + } + + if (distance > 5) /* If the distance is just too far off, don't make a bad suggestion */ + continue; + + if (distance < best_distance) { + best_distance = distance; + best = verbs + i; + } + } + 
+ return best; +} + +int dispatch_verb(int argc, char *argv[], const Verb verbs[], void *userdata) { + const Verb *verb; + const char *name; + int left; + + assert(verbs); + assert(verbs[0].dispatch); + assert(argc >= 0); + assert(argv); + assert(argc >= optind); + + left = argc - optind; + argv += optind; + optind = 0; + name = argv[0]; + + verb = verbs_find_verb(name, verbs); + if (!verb) { + if (name) { + /* Be helperful to the user, and give a hint what the user might have wanted to + * type. We search with two mechanisms: a simple prefix match and – if that didn't + * yield results –, a Levenshtein word distance based match. */ + verb = verbs_find_prefix_verb(name, verbs); + if (!verb) + verb = verbs_find_closest_verb(name, verbs); + if (verb) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown command verb '%s', did you mean '%s'?", name, verb->verb); + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown command verb '%s'.", name); + } + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Command verb required."); + } + + if (!name) + left = 1; + + if (verb->min_args != VERB_ANY && + (unsigned) left < verb->min_args) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too few arguments."); + + if (verb->max_args != VERB_ANY && + (unsigned) left > verb->max_args) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many arguments."); + + if ((verb->flags & VERB_ONLINE_ONLY) && running_in_chroot_or_offline()) { + log_info("Running in chroot, ignoring command '%s'", name ?: verb->verb); + return 0; + } + + if (!name) + return verb->dispatch(1, STRV_MAKE(verb->verb), userdata); + + return verb->dispatch(left, argv, userdata); + } diff --git a/src/shared/verbs.h b/src/shared/verbs.h new file mode 100644 index 0000000..03819e3 --- /dev/null +++ b/src/shared/verbs.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#define VERB_ANY (UINT_MAX) + +typedef enum VerbFlags { + VERB_DEFAULT = 1 << 0, /* The 
verb to run if no verb is specified */ + VERB_ONLINE_ONLY = 1 << 1, /* Just do nothing when running in chroot or offline */ +} VerbFlags; + +typedef struct { + const char *verb; + unsigned min_args, max_args; + VerbFlags flags; + int (* const dispatch)(int argc, char *argv[], void *userdata); +} Verb; + +bool running_in_chroot_or_offline(void); + +const Verb* verbs_find_verb(const char *name, const Verb verbs[]); +int dispatch_verb(int argc, char *argv[], const Verb verbs[], void *userdata); diff --git a/src/shared/vlan-util.c b/src/shared/vlan-util.c new file mode 100644 index 0000000..17f2d39 --- /dev/null +++ b/src/shared/vlan-util.c @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "parse-util.h" +#include "string-util.h" +#include "vlan-util.h" + +int parse_vlanid(const char *p, uint16_t *ret) { + uint16_t id; + int r; + + assert(p); + assert(ret); + + r = safe_atou16(p, &id); + if (r < 0) + return r; + if (!vlanid_is_valid(id)) + return -ERANGE; + + *ret = id; + return 0; +} + +int parse_vid_range(const char *p, uint16_t *vid, uint16_t *vid_end) { + unsigned lower, upper; + int r; + + r = parse_range(p, &lower, &upper); + if (r < 0) + return r; + + if (lower > VLANID_MAX || upper > VLANID_MAX || lower > upper) + return -EINVAL; + + *vid = lower; + *vid_end = upper; + return 0; +} + +int config_parse_default_port_vlanid( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + uint16_t *id = ASSERT_PTR(data); + + assert(lvalue); + assert(rvalue); + + if (streq(rvalue, "none")) { + *id = 0; + return 0; + } + + return config_parse_vlanid(unit, filename, line, section, section_line, + lvalue, ltype, rvalue, data, userdata); +} + +int config_parse_vlanid( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + 
const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint16_t *id = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_vlanid(rvalue, id); + if (r == -ERANGE) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "VLAN identifier outside of valid range 0…4094, ignoring: %s", rvalue); + return 0; + } + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse VLAN identifier value, ignoring: %s", rvalue); + return 0; + } + + return 0; +} diff --git a/src/shared/vlan-util.h b/src/shared/vlan-util.h new file mode 100644 index 0000000..0336908 --- /dev/null +++ b/src/shared/vlan-util.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "conf-parser.h" + +#define VLANID_MAX 4094 +#define VLANID_INVALID UINT16_MAX + +/* Note that we permit VLAN Id 0 here, as that is apparently OK by the Linux kernel */ +static inline bool vlanid_is_valid(uint16_t id) { + return id <= VLANID_MAX; +} + +int parse_vlanid(const char *p, uint16_t *ret); +int parse_vid_range(const char *p, uint16_t *vid, uint16_t *vid_end); + +CONFIG_PARSER_PROTOTYPE(config_parse_default_port_vlanid); +CONFIG_PARSER_PROTOTYPE(config_parse_vlanid); diff --git a/src/shared/volatile-util.c b/src/shared/volatile-util.c new file mode 100644 index 0000000..5138edb --- /dev/null +++ b/src/shared/volatile-util.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "macro.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "string-table.h" +#include "string-util.h" +#include "volatile-util.h" + +int query_volatile_mode(VolatileMode *ret) { + _cleanup_free_ char *mode = NULL; + int r; + + r = proc_cmdline_get_key("systemd.volatile", PROC_CMDLINE_VALUE_OPTIONAL, &mode); + if (r < 0) + return r; + if (r == 0) { + *ret = VOLATILE_NO; + return 0; + } + + if (mode) { + 
VolatileMode m; + + m = volatile_mode_from_string(mode); + if (m < 0) + return m; + + *ret = m; + } else + *ret = VOLATILE_YES; + + return 1; +} + +static const char* const volatile_mode_table[_VOLATILE_MODE_MAX] = { + [VOLATILE_NO] = "no", + [VOLATILE_YES] = "yes", + [VOLATILE_STATE] = "state", + [VOLATILE_OVERLAY] = "overlay", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(volatile_mode, VolatileMode, VOLATILE_YES); diff --git a/src/shared/volatile-util.h b/src/shared/volatile-util.h new file mode 100644 index 0000000..6e0206d --- /dev/null +++ b/src/shared/volatile-util.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +typedef enum VolatileMode { + VOLATILE_NO, + VOLATILE_YES, + VOLATILE_STATE, + VOLATILE_OVERLAY, + _VOLATILE_MODE_MAX, + _VOLATILE_MODE_INVALID = -EINVAL, +} VolatileMode; + +VolatileMode volatile_mode_from_string(const char *s); +const char* volatile_mode_to_string(VolatileMode m); + +int query_volatile_mode(VolatileMode *ret); diff --git a/src/shared/wall.c b/src/shared/wall.c new file mode 100644 index 0000000..d5900ef --- /dev/null +++ b/src/shared/wall.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-login.h" + +#include "errno-util.h" +#include "fd-util.h" +#include "hostname-util.h" +#include "io-util.h" +#include "path-util.h" +#include "string-util.h" +#include "terminal-util.h" +#include "user-util.h" +#include "utmp-wtmp.h" +#include "wall.h" + +#if ENABLE_UTMP || ENABLE_LOGIND + +#define TIMEOUT_USEC (50 * USEC_PER_MSEC) + +static int write_to_terminal(const char *tty, const char *message) { + _cleanup_close_ int fd = -EBADF; + + assert(tty); + assert(message); + + fd = open(tty, O_WRONLY|O_NONBLOCK|O_NOCTTY|O_CLOEXEC); + if (fd < 0) + return -errno; + if (!isatty(fd)) + return -ENOTTY; + + return loop_write_full(fd, message, SIZE_MAX, TIMEOUT_USEC); +} + +static int wall_utmp( + const char *message, + bool (*match_tty)(const char 
*tty, bool is_local, void *userdata), + void *userdata) { + +#if ENABLE_UTMP + _unused_ _cleanup_(utxent_cleanup) bool utmpx = false; + struct utmpx *u; + int r = 0; + + assert(message); + + /* libc's setutxent() unfortunately doesn't inform us about success, i.e. whether /var/run/utmp + * exists. Hence we have to check manually first. */ + if (access(_PATH_UTMPX, F_OK) < 0) { + if (errno == ENOENT) + return -ENOPROTOOPT; + + return -errno; + } + + utmpx = utxent_start(); + + while ((u = getutxent())) { + _cleanup_free_ char *p = NULL; + const char *tty_path; + bool is_local; + + if (u->ut_type != USER_PROCESS || isempty(u->ut_user)) + continue; + + /* This access is fine, because strlen("/dev/") < 32 (UT_LINESIZE) */ + if (path_startswith(u->ut_line, "/dev/")) + tty_path = u->ut_line; + else { + if (asprintf(&p, "/dev/%.*s", (int) sizeof(u->ut_line), u->ut_line) < 0) + return -ENOMEM; + + tty_path = p; + } + + /* It seems that the address field is always set for remote logins. For local logins and + * other local entries, we get [0,0,0,0]. 
*/ + is_local = eqzero(u->ut_addr_v6); + + if (!match_tty || match_tty(tty_path, is_local, userdata)) + RET_GATHER(r, write_to_terminal(tty_path, message)); + } + + return r; + +#else + return -ENOPROTOOPT; +#endif +} + +static int wall_logind( + const char *message, + bool (*match_tty)(const char *tty, bool is_local, void *userdata), + void *userdata) { + +#if ENABLE_LOGIND + _cleanup_strv_free_ char **sessions = NULL; + int r; + + assert(message); + + r = sd_get_sessions(&sessions); + if (r <= 0) + return r; + + r = 0; + + STRV_FOREACH(s, sessions) { + _cleanup_free_ char *tty_path = NULL, *tty = NULL, *rhost = NULL; + bool is_local; + int q; + + q = sd_session_get_tty(*s, &tty); + if (IN_SET(q, -ENXIO, -ENODATA)) + continue; + if (q < 0) + return RET_GATHER(r, q); + + tty_path = strjoin("/dev/", tty); + if (!tty_path) + return -ENOMEM; + + (void) sd_session_get_remote_host(*s, &rhost); + is_local = !rhost; + + if (!match_tty || match_tty(tty_path, is_local, userdata)) + RET_GATHER(r, write_to_terminal(tty_path, message)); + } + + return r; + +#else + return -ENOPROTOOPT; +#endif +} + +int wall( + const char *message, + const char *username, + const char *origin_tty, + bool (*match_tty)(const char *tty, bool is_local, void *userdata), + void *userdata) { + + _cleanup_free_ char *text = NULL, *hostname = NULL, *username_alloc = NULL, *stdin_tty = NULL; + int r; + + assert(message); + + hostname = gethostname_malloc(); + if (!hostname) + return -ENOMEM; + + if (!username) { + username_alloc = getlogname_malloc(); + if (!username_alloc) + return -ENOMEM; + + username = username_alloc; + } + + if (!origin_tty) { + (void) getttyname_harder(STDIN_FILENO, &stdin_tty); + origin_tty = stdin_tty; + } + + if (asprintf(&text, + "\r\n" + "Broadcast message from %s@%s%s%s (%s):\r\n\r\n" + "%s\r\n\r\n", + username, hostname, + origin_tty ? 
" on " : "", strempty(origin_tty), + FORMAT_TIMESTAMP(now(CLOCK_REALTIME)), + message) < 0) + return -ENOMEM; + + r = wall_utmp(text, match_tty, userdata); + if (r == -ENOPROTOOPT) + r = wall_logind(text, match_tty, userdata); + + return r == -ENOPROTOOPT ? 0 : r; +} + +#endif diff --git a/src/shared/wall.h b/src/shared/wall.h new file mode 100644 index 0000000..2964277 --- /dev/null +++ b/src/shared/wall.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#if ENABLE_UTMP || ENABLE_LOGIND + +int wall( + const char *message, + const char *username, + const char *origin_tty, + bool (*match_tty)(const char *tty, bool is_local, void *userdata), + void *userdata); + +#else + +static inline int wall( + const char *message, + const char *username, + const char *origin_tty, + bool (*match_tty)(const char *tty, bool is_local, void *userdata), + void *userdata) { + + return 0; +} + +#endif diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c new file mode 100644 index 0000000..2d79f71 --- /dev/null +++ b/src/shared/watchdog.c @@ -0,0 +1,504 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "devnum-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "log.h" +#include "path-util.h" +#include "string-util.h" +#include "time-util.h" +#include "watchdog.h" + +static int watchdog_fd = -EBADF; +static char *watchdog_device = NULL; +static usec_t watchdog_timeout = 0; /* 0 → close device and USEC_INFINITY → don't change timeout */ +static usec_t watchdog_pretimeout = 0; /* 0 → disable pretimeout and USEC_INFINITY → don't change pretimeout */ +static usec_t watchdog_last_ping = USEC_INFINITY; +static bool watchdog_supports_pretimeout = false; /* Depends on kernel state that might change at runtime */ +static char *watchdog_pretimeout_governor = NULL; + +/* Starting from kernel version 4.5, the maximum allowable watchdog 
timeout is + * UINT_MAX/1000U seconds (since internal calculations are done in milliseconds + * using unsigned integers). However, the kernel's userspace API for the watchdog + * uses signed integers for its ioctl parameters (even for timeout values and + * bit flags) so this is why we must consider the maximum signed integer value + * as well. + */ +#define WATCHDOG_TIMEOUT_MAX_SEC (CONST_MIN(UINT_MAX/1000U, (unsigned)INT_MAX)) + +#define WATCHDOG_GOV_NAME_MAXLEN 20 /* From the kernel watchdog driver */ + +static int saturated_usec_to_sec(usec_t val) { + usec_t t = DIV_ROUND_UP(val, USEC_PER_SEC); + return MIN(t, (usec_t) WATCHDOG_TIMEOUT_MAX_SEC); /* Saturate to watchdog max */ +} + +static int get_watchdog_sysfs_path(const char *filename, char **ret_path) { + struct stat st; + + if (watchdog_fd < 0) + return -EBADF; + + if (fstat(watchdog_fd, &st)) + return -errno; + + if (!S_ISCHR(st.st_mode)) + return -EBADF; + + if (asprintf(ret_path, "/sys/dev/char/"DEVNUM_FORMAT_STR"/%s", DEVNUM_FORMAT_VAL(st.st_rdev), filename) < 0) + return -ENOMEM; + + return 0; +} + +static int get_pretimeout_governor(char **ret_gov) { + _cleanup_free_ char *sys_fn = NULL; + int r; + + r = get_watchdog_sysfs_path("pretimeout_governor", &sys_fn); + if (r < 0) + return r; + + log_info("Watchdog: reading from %s", sys_fn); + + r = read_virtual_file(sys_fn, WATCHDOG_GOV_NAME_MAXLEN - 1, ret_gov, NULL); + if (r < 0) + return r; + + delete_trailing_chars(*ret_gov, WHITESPACE); + + return 0; +} + +static int set_pretimeout_governor(const char *governor) { + _cleanup_free_ char *sys_fn = NULL; + int r; + + if (isempty(governor)) + return 0; /* Nothing to do */ + + r = get_watchdog_sysfs_path("pretimeout_governor", &sys_fn); + if (r < 0) + return r; + + log_info("Watchdog: setting pretimeout_governor to '%s' via '%s'", governor, sys_fn); + + r = write_string_file(sys_fn, + governor, + WRITE_STRING_FILE_DISABLE_BUFFER | WRITE_STRING_FILE_VERIFY_ON_FAILURE | 
WRITE_STRING_FILE_VERIFY_IGNORE_NEWLINE); + if (r < 0) + return log_error_errno(r, "Failed to set pretimeout_governor to '%s': %m", governor); + + return r; +} + +static int watchdog_set_enable(bool enable) { + int flags = enable ? WDIOS_ENABLECARD : WDIOS_DISABLECARD; + + assert(watchdog_fd >= 0); + + if (ioctl(watchdog_fd, WDIOC_SETOPTIONS, &flags) < 0) { + if (!enable) + return log_warning_errno(errno, "Failed to disable hardware watchdog, ignoring: %m"); + + /* ENOTTY means the watchdog is always enabled so we're fine */ + log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to enable hardware watchdog, ignoring: %m"); + if (!ERRNO_IS_NOT_SUPPORTED(errno)) + return -errno; + } + + return 0; +} + +static int watchdog_read_timeout(void) { + int sec = 0; + + assert(watchdog_fd >= 0); + + if (ioctl(watchdog_fd, WDIOC_GETTIMEOUT, &sec) < 0) + return -errno; + + assert(sec > 0); + watchdog_timeout = sec * USEC_PER_SEC; + + return 0; +} + +static int watchdog_set_timeout(void) { + int sec; + + assert(watchdog_fd >= 0); + assert(timestamp_is_set(watchdog_timeout)); + + sec = saturated_usec_to_sec(watchdog_timeout); + + if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &sec) < 0) + return -errno; + + assert(sec > 0); /* buggy driver ? */ + watchdog_timeout = sec * USEC_PER_SEC; + + return 0; +} + +static int watchdog_read_pretimeout(void) { + int sec = 0; + + assert(watchdog_fd >= 0); + + if (ioctl(watchdog_fd, WDIOC_GETPRETIMEOUT, &sec) < 0) { + watchdog_pretimeout = 0; + return log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? 
LOG_DEBUG : LOG_WARNING, errno, "Failed to get pretimeout value, ignoring: %m"); + } + + watchdog_pretimeout = sec * USEC_PER_SEC; + + return 0; +} + +static int watchdog_set_pretimeout(void) { + int sec; + + assert(watchdog_fd >= 0); + assert(watchdog_pretimeout != USEC_INFINITY); + + sec = saturated_usec_to_sec(watchdog_pretimeout); + + if (ioctl(watchdog_fd, WDIOC_SETPRETIMEOUT, &sec) < 0) { + watchdog_pretimeout = 0; + + if (ERRNO_IS_NOT_SUPPORTED(errno)) { + log_info("Watchdog does not support pretimeouts."); + return 0; + } + + return log_error_errno(errno, "Failed to set pretimeout to %s: %m", FORMAT_TIMESPAN(sec * USEC_PER_SEC, USEC_PER_SEC)); /* 'sec' is in whole seconds; FORMAT_TIMESPAN() is fed usec values elsewhere in this file, so scale it */ + } + + /* The set ioctl does not return the actual value set so get it now. */ + (void) watchdog_read_pretimeout(); + + return 0; +} + +usec_t watchdog_get_last_ping(clockid_t clock) { + return map_clock_usec(watchdog_last_ping, CLOCK_BOOTTIME, clock); +} + +static int watchdog_ping_now(void) { + assert(watchdog_fd >= 0); + + if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) < 0) + return log_warning_errno(errno, "Failed to ping hardware watchdog, ignoring: %m"); + + watchdog_last_ping = now(CLOCK_BOOTTIME); + + return 0; +} + +static int update_pretimeout(void) { + _cleanup_free_ char *governor = NULL; + int r, t_sec, pt_sec; + + if (watchdog_fd < 0) + return 0; + + if (watchdog_timeout == USEC_INFINITY || watchdog_pretimeout == USEC_INFINITY) + return 0; + + if (!watchdog_supports_pretimeout && watchdog_pretimeout == 0) + return 0; /* Nothing to do */ + + /* The configuration changed, do not assume it can still work, as the module(s) + * might have been unloaded. 
*/ + watchdog_supports_pretimeout = false; + + /* Update the pretimeout governor as well */ + (void) set_pretimeout_governor(watchdog_pretimeout_governor); + + r = get_pretimeout_governor(&governor); + if (r < 0) + return log_warning_errno(r, "Watchdog: failed to read pretimeout governor: %m"); + if (isempty(governor)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "Watchdog: no pretimeout governor detected - is the required kernel module loaded?"); + + /* If we have a pretimeout governor, then pretimeout is supported. Without a governor + * pretimeout does not work at all. + * Note that this might require a kernel module that is not autoloaded, so we don't + * cache this, but we check every time the configuration changes. */ + watchdog_supports_pretimeout = true; + + /* Determine if the pretimeout is valid for the current watchdog timeout. */ + t_sec = saturated_usec_to_sec(watchdog_timeout); + pt_sec = saturated_usec_to_sec(watchdog_pretimeout); + if (pt_sec >= t_sec) { + r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot set watchdog pretimeout to %is (%s watchdog timeout of %is)", + pt_sec, pt_sec == t_sec ? 
"same as" : "longer than", t_sec); + (void) watchdog_read_pretimeout(); + } else + r = watchdog_set_pretimeout(); + + if (watchdog_pretimeout == 0) + log_info("Watchdog pretimeout is disabled."); + else + log_info("Watchdog running with a pretimeout of %s with governor '%s'.", + FORMAT_TIMESPAN(watchdog_pretimeout, 0), + governor); + + return r; +} + +static int update_timeout(void) { + int r; + usec_t previous_timeout; + + assert(watchdog_timeout > 0); + + if (watchdog_fd < 0) + return 0; + + previous_timeout = watchdog_timeout; + + if (watchdog_timeout != USEC_INFINITY) { + r = watchdog_set_timeout(); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r)) + return log_error_errno(r, "Failed to set timeout to %s: %m", + FORMAT_TIMESPAN(watchdog_timeout, 0)); + + log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout."); + watchdog_timeout = USEC_INFINITY; + } + } + + if (watchdog_timeout == USEC_INFINITY) { + r = watchdog_read_timeout(); + if (r < 0) { + if (!ERRNO_IS_NOT_SUPPORTED(r)) + return log_error_errno(r, "Failed to query watchdog HW timeout: %m"); + log_info("Reading watchdog timeout is not supported, reusing the configured timeout."); + watchdog_timeout = previous_timeout; + } + } + + /* If the watchdog timeout was changed, the pretimeout could have been + * changed as well by the driver or the kernel so we need to update the + * pretimeout now. Or if the watchdog is being configured for the first + * time, we want to configure the pretimeout before it is enabled. */ + (void) update_pretimeout(); + + r = watchdog_set_enable(true); + if (r < 0) + return r; + + log_info("Watchdog running with a timeout of %s.", FORMAT_TIMESPAN(watchdog_timeout, 0)); + + return watchdog_ping_now(); +} + +static int open_watchdog(void) { + struct watchdog_info ident; + char **try_order; + int r; + + if (watchdog_fd >= 0) + return 0; + + /* Let's prefer new-style /dev/watchdog0 (i.e. kernel 3.5+) over classic /dev/watchdog. 
The former + * has the benefit that we can easily find the matching directory in sysfs from it, as the relevant + * sysfs attributes can only be found via /sys/dev/char/: if the new-style device + * major/minor is used, not the old-style. */ + try_order = !watchdog_device || PATH_IN_SET(watchdog_device, "/dev/watchdog", "/dev/watchdog0") ? + STRV_MAKE("/dev/watchdog0", "/dev/watchdog") : STRV_MAKE(watchdog_device); + + STRV_FOREACH(wd, try_order) { + watchdog_fd = open(*wd, O_WRONLY|O_CLOEXEC); + if (watchdog_fd >= 0) { + if (free_and_strdup(&watchdog_device, *wd) < 0) { + r = log_oom_debug(); + goto close_and_fail; + } + + break; + } + + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to open watchdog device %s: %m", *wd); + } + + if (watchdog_fd < 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to open watchdog device %s: %m", watchdog_device ?: "auto"); + + if (ioctl(watchdog_fd, WDIOC_GETSUPPORT, &ident) < 0) + log_debug_errno(errno, "Hardware watchdog %s does not support WDIOC_GETSUPPORT ioctl, ignoring: %m", watchdog_device); + else + log_info("Using hardware watchdog '%s', version %x, device %s", + ident.identity, + ident.firmware_version, + watchdog_device); + + r = update_timeout(); + if (r < 0) + goto close_and_fail; + + return 0; + +close_and_fail: + watchdog_close(/* disarm= */ true); + return r; +} + +const char *watchdog_get_device(void) { + return watchdog_device; +} + +int watchdog_set_device(const char *path) { + int r; + + r = free_and_strdup(&watchdog_device, path); + if (r > 0) /* watchdog_device changed */ + watchdog_close(/* disarm= */ true); + + return r; +} + +int watchdog_setup(usec_t timeout) { + usec_t previous_timeout; + int r; + + /* timeout=0 closes the device whereas passing timeout=USEC_INFINITY opens it (if needed) + * without configuring any particular timeout and thus reuses the programmed value (therefore + * it's a nop if the device is already opened). 
*/ + + if (timeout == 0) { + watchdog_close(true); + return 0; + } + + /* Let's shortcut duplicated requests */ + if (watchdog_fd >= 0 && (timeout == watchdog_timeout || timeout == USEC_INFINITY)) + return 0; + + /* Initialize the watchdog timeout with the caller value. This value is going to be updated by + * update_timeout() with the closest value supported by the driver */ + previous_timeout = watchdog_timeout; + watchdog_timeout = timeout; + + if (watchdog_fd < 0) + return open_watchdog(); + + r = update_timeout(); + if (r < 0) + watchdog_timeout = previous_timeout; + + return r; +} + +int watchdog_setup_pretimeout(usec_t timeout) { + /* timeout=0 disables the pretimeout whereas timeout=USEC_INFINITY is a nop. */ + if ((watchdog_fd >= 0 && timeout == watchdog_pretimeout) || timeout == USEC_INFINITY) + return 0; + + /* Initialize the watchdog pretimeout with the caller value. This value is + * going to be updated by update_pretimeout() with the running value, + * even if it fails to update the pretimeout. */ + watchdog_pretimeout = timeout; + + return update_pretimeout(); +} + +int watchdog_setup_pretimeout_governor(const char *governor) { + if (free_and_strdup(&watchdog_pretimeout_governor, governor) < 0) + return -ENOMEM; + + return set_pretimeout_governor(watchdog_pretimeout_governor); +} + +static usec_t calc_timeout(void) { + /* Calculate the effective timeout which accounts for the watchdog + * pretimeout if configured and supported. 
*/ + if (watchdog_supports_pretimeout && timestamp_is_set(watchdog_pretimeout) && watchdog_timeout >= watchdog_pretimeout) + return watchdog_timeout - watchdog_pretimeout; + else + return watchdog_timeout; +} + +usec_t watchdog_runtime_wait(void) { + usec_t timeout = calc_timeout(); + if (!timestamp_is_set(timeout)) + return USEC_INFINITY; + + /* Sleep half the watchdog timeout since the last successful ping at most */ + if (timestamp_is_set(watchdog_last_ping)) { + usec_t ntime = now(CLOCK_BOOTTIME); + + assert(ntime >= watchdog_last_ping); + return usec_sub_unsigned(watchdog_last_ping + (timeout / 2), ntime); + } + + return timeout / 2; +} + +int watchdog_ping(void) { + usec_t ntime, timeout; + + if (watchdog_timeout == 0) + return 0; + + if (watchdog_fd < 0) + /* open_watchdog() will automatically ping the device for us if necessary */ + return open_watchdog(); + + ntime = now(CLOCK_BOOTTIME); + timeout = calc_timeout(); + + /* Never ping earlier than watchdog_timeout/4 and try to ping + * by watchdog_timeout/2 plus scheduling latencies at the latest */ + if (timestamp_is_set(watchdog_last_ping)) { + assert(ntime >= watchdog_last_ping); + if ((ntime - watchdog_last_ping) < (timeout / 4)) + return 0; + } + + return watchdog_ping_now(); +} + +void watchdog_close(bool disarm) { + + /* Once closed, pinging the device becomes a NOP and we request a new + * call to watchdog_setup() to open the device again. 
*/ + watchdog_timeout = 0; + + if (watchdog_fd < 0) + return; + + if (disarm) { + (void) watchdog_set_enable(false); + + /* To be sure, use magic close logic, too */ + for (;;) { + static const char v = 'V'; + + if (write(watchdog_fd, &v, 1) > 0) + break; + + if (errno != EINTR) { + log_warning_errno(errno, "Failed to disarm watchdog timer, ignoring: %m"); + break; + } + } + } + + watchdog_fd = safe_close(watchdog_fd); +} diff --git a/src/shared/watchdog.h b/src/shared/watchdog.h new file mode 100644 index 0000000..a490183 --- /dev/null +++ b/src/shared/watchdog.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "time-util.h" + +const char *watchdog_get_device(void); +usec_t watchdog_get_last_ping(clockid_t clock); + +int watchdog_set_device(const char *path); +int watchdog_setup(usec_t timeout); +int watchdog_setup_pretimeout(usec_t usec); +int watchdog_setup_pretimeout_governor(const char *governor); +int watchdog_ping(void); +void watchdog_close(bool disarm); +usec_t watchdog_runtime_wait(void); + +static inline void watchdog_free_device(void) { + (void) watchdog_set_device(NULL); +} diff --git a/src/shared/web-util.c b/src/shared/web-util.c new file mode 100644 index 0000000..39a300f --- /dev/null +++ b/src/shared/web-util.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "string-util.h" +#include "strv.h" +#include "utf8.h" +#include "web-util.h" + +bool http_etag_is_valid(const char *etag) { + if (isempty(etag)) + return false; + + if (!endswith(etag, "\"")) + return false; + + if (!STARTSWITH_SET(etag, "\"", "W/\"")) + return false; + + return true; +} + +bool http_url_is_valid(const char *url) { + const char *p; + + if (isempty(url)) + return false; + + p = STARTSWITH_SET(url, "http://", "https://"); + if (!p) + return false; + + if (isempty(p)) + return false; + + return ascii_is_valid(p); +} + +bool file_url_is_valid(const char *url) { + const char *p; + + 
if (isempty(url)) + return false; + + p = startswith(url, "file:/"); + if (isempty(p)) + return false; + + return ascii_is_valid(p); +} + +bool documentation_url_is_valid(const char *url) { + const char *p; + + if (isempty(url)) + return false; + + if (http_url_is_valid(url) || file_url_is_valid(url)) + return true; + + p = STARTSWITH_SET(url, "info:", "man:"); + if (isempty(p)) + return false; + + return ascii_is_valid(p); +} diff --git a/src/shared/web-util.h b/src/shared/web-util.h new file mode 100644 index 0000000..88b4897 --- /dev/null +++ b/src/shared/web-util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "macro.h" + +bool http_url_is_valid(const char *url) _pure_; +bool file_url_is_valid(const char *url) _pure_; + +bool documentation_url_is_valid(const char *url) _pure_; + +bool http_etag_is_valid(const char *etag); diff --git a/src/shared/wifi-util.c b/src/shared/wifi-util.c new file mode 100644 index 0000000..d4e6dca --- /dev/null +++ b/src/shared/wifi-util.c @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "log.h" +#include "string-table.h" +#include "string-util.h" +#include "wifi-util.h" + +int wifi_get_interface(sd_netlink *genl, int ifindex, enum nl80211_iftype *ret_iftype, char **ret_ssid) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL; + _cleanup_free_ char *ssid = NULL; + const char *family; + uint32_t iftype; + size_t len; + int r; + + assert(genl); + assert(ifindex > 0); + + r = sd_genl_message_new(genl, NL80211_GENL_NAME, NL80211_CMD_GET_INTERFACE, &m); + if (r < 0) + return log_debug_errno(r, "Failed to create generic netlink message: %m"); + + r = sd_netlink_message_append_u32(m, NL80211_ATTR_IFINDEX, ifindex); + if (r < 0) + return log_debug_errno(r, "Could not append NL80211_ATTR_IFINDEX attribute: %m"); + + r = sd_netlink_call(genl, m, 0, &reply); + if (r == -ENODEV) { + /* For obsolete WEXT driver. 
*/ + log_debug_errno(r, "Failed to request information about wifi interface %d. " + "The device doesn't seem to have nl80211 interface. Ignoring.", + ifindex); + goto nodata; + } + if (r < 0) + return log_debug_errno(r, "Failed to request information about wifi interface %d: %m", ifindex); + if (!reply) { + log_debug("No reply received to request for information about wifi interface %d, ignoring.", ifindex); + goto nodata; + } + + r = sd_netlink_message_get_errno(reply); + if (r < 0) + return log_debug_errno(r, "Failed to get information about wifi interface %d: %m", ifindex); + + r = sd_genl_message_get_family_name(genl, reply, &family); + if (r < 0) + return log_debug_errno(r, "Failed to determine genl family: %m"); + if (!streq(family, NL80211_GENL_NAME)) { + log_debug("Received message of unexpected genl family '%s', ignoring.", family); + goto nodata; + } + + r = sd_netlink_message_read_u32(reply, NL80211_ATTR_IFTYPE, &iftype); + if (r < 0) + return log_debug_errno(r, "Failed to get NL80211_ATTR_IFTYPE attribute: %m"); + + r = sd_netlink_message_read_data_suffix0(reply, NL80211_ATTR_SSID, &len, (void**) &ssid); + if (r < 0 && r != -ENODATA) + return log_debug_errno(r, "Failed to get NL80211_ATTR_SSID attribute: %m"); + if (r >= 0) { + if (len == 0) { + log_debug("SSID has zero length, ignoring it."); + ssid = mfree(ssid); + } else if (strlen_ptr(ssid) != len) { + log_debug("SSID contains NUL characters, ignoring it."); + ssid = mfree(ssid); + } + } + + if (ret_iftype) + *ret_iftype = iftype; + + if (ret_ssid) + *ret_ssid = TAKE_PTR(ssid); + + return 1; + +nodata: + if (ret_iftype) + *ret_iftype = 0; + if (ret_ssid) + *ret_ssid = NULL; + return 0; +} + +int wifi_get_station(sd_netlink *genl, int ifindex, struct ether_addr *ret_bssid) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL; + const char *family; + int r; + + assert(genl); + assert(ifindex > 0); + assert(ret_bssid); + + r = sd_genl_message_new(genl, NL80211_GENL_NAME, 
NL80211_CMD_GET_STATION, &m); + if (r < 0) + return log_debug_errno(r, "Failed to create generic netlink message: %m"); + + r = sd_netlink_message_set_flags(m, NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP); + if (r < 0) + return log_debug_errno(r, "Failed to set dump flag: %m"); + + r = sd_netlink_message_append_u32(m, NL80211_ATTR_IFINDEX, ifindex); + if (r < 0) + return log_debug_errno(r, "Could not append NL80211_ATTR_IFINDEX attribute: %m"); + + r = sd_netlink_call(genl, m, 0, &reply); + if (r < 0) + return log_debug_errno(r, "Failed to request information about wifi station: %m"); + if (!reply) { + log_debug("No reply received to request for information about wifi station, ignoring."); + goto nodata; + } + + r = sd_netlink_message_get_errno(reply); + if (r < 0) + return log_debug_errno(r, "Failed to get information about wifi station: %m"); + + r = sd_genl_message_get_family_name(genl, reply, &family); + if (r < 0) + return log_debug_errno(r, "Failed to determine genl family: %m"); + if (!streq(family, NL80211_GENL_NAME)) { + log_debug("Received message of unexpected genl family '%s', ignoring.", family); + goto nodata; + } + + r = sd_netlink_message_read_ether_addr(reply, NL80211_ATTR_MAC, ret_bssid); + if (r == -ENODATA) + goto nodata; + if (r < 0) + return log_debug_errno(r, "Failed to get NL80211_ATTR_MAC attribute: %m"); + + return 1; + +nodata: + *ret_bssid = ETHER_ADDR_NULL; + return 0; +} + +static const char * const nl80211_iftype_table[NUM_NL80211_IFTYPES] = { + [NL80211_IFTYPE_ADHOC] = "ad-hoc", + [NL80211_IFTYPE_STATION] = "station", + [NL80211_IFTYPE_AP] = "ap", + [NL80211_IFTYPE_AP_VLAN] = "ap-vlan", + [NL80211_IFTYPE_WDS] = "wds", + [NL80211_IFTYPE_MONITOR] = "monitor", + [NL80211_IFTYPE_MESH_POINT] = "mesh-point", + [NL80211_IFTYPE_P2P_CLIENT] = "p2p-client", + [NL80211_IFTYPE_P2P_GO] = "p2p-go", + [NL80211_IFTYPE_P2P_DEVICE] = "p2p-device", + [NL80211_IFTYPE_OCB] = "ocb", + [NL80211_IFTYPE_NAN] = "nan", +}; + 
+DEFINE_STRING_TABLE_LOOKUP(nl80211_iftype, enum nl80211_iftype); + +static const char * const nl80211_cmd_table[__NL80211_CMD_AFTER_LAST] = { + [NL80211_CMD_GET_WIPHY] = "get_wiphy", + [NL80211_CMD_SET_WIPHY] = "set_wiphy", + [NL80211_CMD_NEW_WIPHY] = "new_wiphy", + [NL80211_CMD_DEL_WIPHY] = "del_wiphy", + [NL80211_CMD_GET_INTERFACE] = "get_interface", + [NL80211_CMD_SET_INTERFACE] = "set_interface", + [NL80211_CMD_NEW_INTERFACE] = "new_interface", + [NL80211_CMD_DEL_INTERFACE] = "del_interface", + [NL80211_CMD_GET_KEY] = "get_key", + [NL80211_CMD_SET_KEY] = "set_key", + [NL80211_CMD_NEW_KEY] = "new_key", + [NL80211_CMD_DEL_KEY] = "del_key", + [NL80211_CMD_GET_BEACON] = "get_beacon", + [NL80211_CMD_SET_BEACON] = "set_beacon", + [NL80211_CMD_START_AP] = "start_ap", + [NL80211_CMD_STOP_AP] = "stop_ap", + [NL80211_CMD_GET_STATION] = "get_station", + [NL80211_CMD_SET_STATION] = "set_station", + [NL80211_CMD_NEW_STATION] = "new_station", + [NL80211_CMD_DEL_STATION] = "del_station", + [NL80211_CMD_GET_MPATH] = "get_mpath", + [NL80211_CMD_SET_MPATH] = "set_mpath", + [NL80211_CMD_NEW_MPATH] = "new_mpath", + [NL80211_CMD_DEL_MPATH] = "del_mpath", + [NL80211_CMD_SET_BSS] = "set_bss", + [NL80211_CMD_SET_REG] = "set_reg", + [NL80211_CMD_REQ_SET_REG] = "req_set_reg", + [NL80211_CMD_GET_MESH_CONFIG] = "get_mesh_config", + [NL80211_CMD_SET_MESH_CONFIG] = "set_mesh_config", + [NL80211_CMD_SET_MGMT_EXTRA_IE] = "set_mgmt_extra_ie", + [NL80211_CMD_GET_REG] = "get_reg", + [NL80211_CMD_GET_SCAN] = "get_scan", + [NL80211_CMD_TRIGGER_SCAN] = "trigger_scan", + [NL80211_CMD_NEW_SCAN_RESULTS] = "new_scan_results", + [NL80211_CMD_SCAN_ABORTED] = "scan_aborted", + [NL80211_CMD_REG_CHANGE] = "reg_change", + [NL80211_CMD_AUTHENTICATE] = "authenticate", + [NL80211_CMD_ASSOCIATE] = "associate", + [NL80211_CMD_DEAUTHENTICATE] = "deauthenticate", + [NL80211_CMD_DISASSOCIATE] = "disassociate", + [NL80211_CMD_MICHAEL_MIC_FAILURE] = "michael_mic_failure", + [NL80211_CMD_REG_BEACON_HINT] = 
"reg_beacon_hint", + [NL80211_CMD_JOIN_IBSS] = "join_ibss", + [NL80211_CMD_LEAVE_IBSS] = "leave_ibss", + [NL80211_CMD_TESTMODE] = "testmode", + [NL80211_CMD_CONNECT] = "connect", + [NL80211_CMD_ROAM] = "roam", + [NL80211_CMD_DISCONNECT] = "disconnect", + [NL80211_CMD_SET_WIPHY_NETNS] = "set_wiphy_netns", + [NL80211_CMD_GET_SURVEY] = "get_survey", + [NL80211_CMD_NEW_SURVEY_RESULTS] = "new_survey_results", + [NL80211_CMD_SET_PMKSA] = "set_pmksa", + [NL80211_CMD_DEL_PMKSA] = "del_pmksa", + [NL80211_CMD_FLUSH_PMKSA] = "flush_pmksa", + [NL80211_CMD_REMAIN_ON_CHANNEL] = "remain_on_channel", + [NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL] = "cancel_remain_on_channel", + [NL80211_CMD_SET_TX_BITRATE_MASK] = "set_tx_bitrate_mask", + [NL80211_CMD_REGISTER_FRAME] = "register_frame", + [NL80211_CMD_FRAME] = "frame", + [NL80211_CMD_FRAME_TX_STATUS] = "frame_tx_status", + [NL80211_CMD_SET_POWER_SAVE] = "set_power_save", + [NL80211_CMD_GET_POWER_SAVE] = "get_power_save", + [NL80211_CMD_SET_CQM] = "set_cqm", + [NL80211_CMD_NOTIFY_CQM] = "notify_cqm", + [NL80211_CMD_SET_CHANNEL] = "set_channel", + [NL80211_CMD_SET_WDS_PEER] = "set_wds_peer", + [NL80211_CMD_FRAME_WAIT_CANCEL] = "frame_wait_cancel", + [NL80211_CMD_JOIN_MESH] = "join_mesh", + [NL80211_CMD_LEAVE_MESH] = "leave_mesh", + [NL80211_CMD_UNPROT_DEAUTHENTICATE] = "unprot_deauthenticate", + [NL80211_CMD_UNPROT_DISASSOCIATE] = "unprot_disassociate", + [NL80211_CMD_NEW_PEER_CANDIDATE] = "new_peer_candidate", + [NL80211_CMD_GET_WOWLAN] = "get_wowlan", + [NL80211_CMD_SET_WOWLAN] = "set_wowlan", + [NL80211_CMD_START_SCHED_SCAN] = "start_sched_scan", + [NL80211_CMD_STOP_SCHED_SCAN] = "stop_sched_scan", + [NL80211_CMD_SCHED_SCAN_RESULTS] = "sched_scan_results", + [NL80211_CMD_SCHED_SCAN_STOPPED] = "sched_scan_stopped", + [NL80211_CMD_SET_REKEY_OFFLOAD] = "set_rekey_offload", + [NL80211_CMD_PMKSA_CANDIDATE] = "pmksa_candidate", + [NL80211_CMD_TDLS_OPER] = "tdls_oper", + [NL80211_CMD_TDLS_MGMT] = "tdls_mgmt", + [NL80211_CMD_UNEXPECTED_FRAME] = 
"unexpected_frame", + [NL80211_CMD_PROBE_CLIENT] = "probe_client", + [NL80211_CMD_REGISTER_BEACONS] = "register_beacons", + [NL80211_CMD_UNEXPECTED_4ADDR_FRAME] = "unexpected_4addr_frame", + [NL80211_CMD_SET_NOACK_MAP] = "set_noack_map", + [NL80211_CMD_CH_SWITCH_NOTIFY] = "ch_switch_notify", + [NL80211_CMD_START_P2P_DEVICE] = "start_p2p_device", + [NL80211_CMD_STOP_P2P_DEVICE] = "stop_p2p_device", + [NL80211_CMD_CONN_FAILED] = "conn_failed", + [NL80211_CMD_SET_MCAST_RATE] = "set_mcast_rate", + [NL80211_CMD_SET_MAC_ACL] = "set_mac_acl", + [NL80211_CMD_RADAR_DETECT] = "radar_detect", + [NL80211_CMD_GET_PROTOCOL_FEATURES] = "get_protocol_features", + [NL80211_CMD_UPDATE_FT_IES] = "update_ft_ies", + [NL80211_CMD_FT_EVENT] = "ft_event", + [NL80211_CMD_CRIT_PROTOCOL_START] = "crit_protocol_start", + [NL80211_CMD_CRIT_PROTOCOL_STOP] = "crit_protocol_stop", + [NL80211_CMD_GET_COALESCE] = "get_coalesce", + [NL80211_CMD_SET_COALESCE] = "set_coalesce", + [NL80211_CMD_CHANNEL_SWITCH] = "channel_switch", + [NL80211_CMD_VENDOR] = "vendor", + [NL80211_CMD_SET_QOS_MAP] = "set_qos_map", + [NL80211_CMD_ADD_TX_TS] = "add_tx_ts", + [NL80211_CMD_DEL_TX_TS] = "del_tx_ts", + [NL80211_CMD_GET_MPP] = "get_mpp", + [NL80211_CMD_JOIN_OCB] = "join_ocb", + [NL80211_CMD_LEAVE_OCB] = "leave_ocb", + [NL80211_CMD_CH_SWITCH_STARTED_NOTIFY] = "ch_switch_started_notify", + [NL80211_CMD_TDLS_CHANNEL_SWITCH] = "tdls_channel_switch", + [NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH] = "tdls_cancel_channel_switch", + [NL80211_CMD_WIPHY_REG_CHANGE] = "wiphy_reg_change", + [NL80211_CMD_ABORT_SCAN] = "abort_scan", + [NL80211_CMD_START_NAN] = "start_nan", + [NL80211_CMD_STOP_NAN] = "stop_nan", + [NL80211_CMD_ADD_NAN_FUNCTION] = "add_nan_function", + [NL80211_CMD_DEL_NAN_FUNCTION] = "del_nan_function", + [NL80211_CMD_CHANGE_NAN_CONFIG] = "change_nan_config", + [NL80211_CMD_NAN_MATCH] = "nan_match", + [NL80211_CMD_SET_MULTICAST_TO_UNICAST] = "set_multicast_to_unicast", + [NL80211_CMD_UPDATE_CONNECT_PARAMS] = 
"update_connect_params", + [NL80211_CMD_SET_PMK] = "set_pmk", + [NL80211_CMD_DEL_PMK] = "del_pmk", + [NL80211_CMD_PORT_AUTHORIZED] = "port_authorized", + [NL80211_CMD_RELOAD_REGDB] = "reload_regdb", + [NL80211_CMD_EXTERNAL_AUTH] = "external_auth", + [NL80211_CMD_STA_OPMODE_CHANGED] = "sta_opmode_changed", + [NL80211_CMD_CONTROL_PORT_FRAME] = "control_port_frame", + [NL80211_CMD_GET_FTM_RESPONDER_STATS] = "get_ftm_responder_stats", + [NL80211_CMD_PEER_MEASUREMENT_START] = "peer_measurement_start", + [NL80211_CMD_PEER_MEASUREMENT_RESULT] = "peer_measurement_result", + [NL80211_CMD_PEER_MEASUREMENT_COMPLETE] = "peer_measurement_complete", + [NL80211_CMD_NOTIFY_RADAR] = "notify_radar", + [NL80211_CMD_UPDATE_OWE_INFO] = "update_owe_info", + [NL80211_CMD_PROBE_MESH_LINK] = "probe_mesh_link", + [NL80211_CMD_SET_TID_CONFIG] = "set_tid_config", + [NL80211_CMD_UNPROT_BEACON] = "unprot_beacon", + [NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS] = "control_port_frame_tx_status", + [NL80211_CMD_SET_SAR_SPECS] = "set_sar_specs", + [NL80211_CMD_OBSS_COLOR_COLLISION] = "obss_color_collision", + [NL80211_CMD_COLOR_CHANGE_REQUEST] = "color_change_request", + [NL80211_CMD_COLOR_CHANGE_STARTED] = "color_change_started", + [NL80211_CMD_COLOR_CHANGE_ABORTED] = "color_change_aborted", + [NL80211_CMD_COLOR_CHANGE_COMPLETED] = "color_change_completed", +}; + +DEFINE_STRING_TABLE_LOOKUP_TO_STRING(nl80211_cmd, int); diff --git a/src/shared/wifi-util.h b/src/shared/wifi-util.h new file mode 100644 index 0000000..a762fbc --- /dev/null +++ b/src/shared/wifi-util.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include + +#include "sd-netlink.h" + +#include "ether-addr-util.h" + +int wifi_get_interface(sd_netlink *genl, int ifindex, enum nl80211_iftype *ret_iftype, char **ret_ssid); +int wifi_get_station(sd_netlink *genl, int ifindex, struct ether_addr *ret_bssid); + +const char *nl80211_iftype_to_string(enum nl80211_iftype iftype) _const_; +enum nl80211_iftype 
nl80211_iftype_from_string(const char *s) _pure_; +const char *nl80211_cmd_to_string(int cmd) _const_; diff --git a/src/shared/xml.c b/src/shared/xml.c new file mode 100644 index 0000000..3b1fb41 --- /dev/null +++ b/src/shared/xml.c @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "macro.h" +#include "string-util.h" +#include "xml.h" + +enum { + STATE_NULL, + STATE_TEXT, + STATE_TAG, + STATE_ATTRIBUTE, +}; + +static void inc_lines(unsigned *line, const char *s, size_t n) { + const char *p = s; + + if (!line) + return; + + for (;;) { + const char *f; + + f = memchr(p, '\n', n); + if (!f) + return; + + n -= (f - p) + 1; + p = f + 1; + (*line)++; + } +} + +/* We don't actually do real XML here. We only read a simplistic + * subset, that is a bit less strict than XML and lacks all the more + * complex features, like entities, or namespaces. However, we do + * support some HTML5-like simplifications */ + +int xml_tokenize(const char **p, char **name, void **state, unsigned *line) { + const char *c, *e, *b; + char *ret; + int t; + + assert(p); + assert(*p); + assert(name); + assert(state); + + t = PTR_TO_INT(*state); + c = *p; + + if (t == STATE_NULL) { + if (line) + *line = 1; + t = STATE_TEXT; + } + + for (;;) { + if (*c == 0) + return XML_END; + + switch (t) { + + case STATE_TEXT: { + int x; + + e = strchrnul(c, '<'); + if (e > c) { + /* More text... 
*/ + ret = strndup(c, e - c); + if (!ret) + return -ENOMEM; + + inc_lines(line, c, e - c); + + *name = ret; + *p = e; + *state = INT_TO_PTR(STATE_TEXT); + + return XML_TEXT; + } + + assert(*e == '<'); + b = c + 1; + + if (startswith(b, "!--")) { + /* A comment */ + e = strstrafter(b + 3, "-->"); + if (!e) + return -EINVAL; + + inc_lines(line, b, e - b); + + c = e; + continue; + } + + if (*b == '?') { + /* Processing instruction */ + + e = strstrafter(b + 1, "?>"); + if (!e) + return -EINVAL; + + inc_lines(line, b, e - b); + + c = e; + continue; + } + + if (*b == '!') { + /* DTD */ + + e = strchr(b + 1, '>'); + if (!e) + return -EINVAL; + + inc_lines(line, b, e + 1 - b); + + c = e + 1; + continue; + } + + if (*b == '/') { + /* A closing tag */ + x = XML_TAG_CLOSE; + b++; + } else + x = XML_TAG_OPEN; + + e = strpbrk(b, WHITESPACE "/>"); + if (!e) + return -EINVAL; + + ret = strndup(b, e - b); + if (!ret) + return -ENOMEM; + + *name = ret; + *p = e; + *state = INT_TO_PTR(STATE_TAG); + + return x; + } + + case STATE_TAG: + + b = c + strspn(c, WHITESPACE); + if (*b == 0) + return -EINVAL; + + inc_lines(line, c, b - c); + + e = b + strcspn(b, WHITESPACE "=/>"); + if (e > b) { + /* An attribute */ + + ret = strndup(b, e - b); + if (!ret) + return -ENOMEM; + + *name = ret; + *p = e; + *state = INT_TO_PTR(STATE_ATTRIBUTE); + + return XML_ATTRIBUTE_NAME; + } + + if (startswith(b, "/>")) { + /* An empty tag */ + + *name = NULL; /* For empty tags we return a NULL name, the caller must be prepared for that */ + *p = b + 2; + *state = INT_TO_PTR(STATE_TEXT); + + return XML_TAG_CLOSE_EMPTY; + } + + if (*b != '>') + return -EINVAL; + + c = b + 1; + t = STATE_TEXT; + continue; + + case STATE_ATTRIBUTE: + + if (*c == '=') { + c++; + + if (IN_SET(*c, '\'', '"')) { + /* Tag with a quoted value */ + + e = strchr(c+1, *c); + if (!e) + return -EINVAL; + + inc_lines(line, c, e - c); + + ret = strndup(c+1, e - c - 1); + if (!ret) + return -ENOMEM; + + *name = ret; + *p = e + 1; + *state = 
INT_TO_PTR(STATE_TAG); + + return XML_ATTRIBUTE_VALUE; + + } + + /* Tag with a value without quotes */ + + b = strpbrk(c, WHITESPACE ">"); + if (!b) + b = c; + + ret = strndup(c, b - c); + if (!ret) + return -ENOMEM; + + *name = ret; + *p = b; + *state = INT_TO_PTR(STATE_TAG); + return XML_ATTRIBUTE_VALUE; + } + + t = STATE_TAG; + continue; + } + + } + + assert_not_reached(); +} diff --git a/src/shared/xml.h b/src/shared/xml.h new file mode 100644 index 0000000..217b3b0 --- /dev/null +++ b/src/shared/xml.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +enum { + XML_END, + XML_TEXT, + XML_TAG_OPEN, + XML_TAG_CLOSE, + XML_TAG_CLOSE_EMPTY, + XML_ATTRIBUTE_NAME, + XML_ATTRIBUTE_VALUE, +}; + +int xml_tokenize(const char **p, char **name, void **state, unsigned *line); diff --git a/src/shutdown/detach-dm.c b/src/shutdown/detach-dm.c new file mode 100644 index 0000000..8b8f72d --- /dev/null +++ b/src/shutdown/detach-dm.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "detach-dm.h" +#include "device-util.h" +#include "devnum-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "sync-util.h" + +typedef struct DeviceMapper { + char *path; + dev_t devnum; + LIST_FIELDS(struct DeviceMapper, device_mapper); +} DeviceMapper; + +static void device_mapper_free(DeviceMapper **head, DeviceMapper *m) { + assert(head); + assert(m); + + LIST_REMOVE(device_mapper, *head, m); + + free(m->path); + free(m); +} + +static void device_mapper_list_free(DeviceMapper **head) { + assert(head); + + while (*head) + device_mapper_free(head, *head); +} + +static int dm_list_get(DeviceMapper **head) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(head); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + 
r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "block", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysname(e, "dm-*"); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + _cleanup_free_ char *p = NULL; + const char *dn; + DeviceMapper *m; + dev_t devnum; + + if (sd_device_get_devnum(d, &devnum) < 0 || + sd_device_get_devname(d, &dn) < 0) + continue; + + p = strdup(dn); + if (!p) + return -ENOMEM; + + m = new(DeviceMapper, 1); + if (!m) + return -ENOMEM; + + *m = (DeviceMapper) { + .path = TAKE_PTR(p), + .devnum = devnum, + }; + + LIST_PREPEND(device_mapper, *head, m); + } + + return 0; +} + +static int delete_dm(DeviceMapper *m) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(m); + assert(major(m->devnum) != 0); + assert(m->path); + + fd = open("/dev/mapper/control", O_RDWR|O_CLOEXEC); + if (fd < 0) + return -errno; + + r = fsync_path_at(AT_FDCWD, m->path); + if (r < 0) + log_debug_errno(r, "Failed to sync DM block device %s, ignoring: %m", m->path); + + return RET_NERRNO(ioctl(fd, DM_DEV_REMOVE, &(struct dm_ioctl) { + .version = { + DM_VERSION_MAJOR, + DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL + }, + .data_size = sizeof(struct dm_ioctl), + .dev = m->devnum, + })); +} + +static int dm_points_list_detach(DeviceMapper **head, bool *changed, bool last_try) { + int n_failed = 0, r; + dev_t rootdev = 0, usrdev = 0; + + assert(head); + assert(changed); + + (void) get_block_device("/", &rootdev); + (void) get_block_device("/usr", &usrdev); + + LIST_FOREACH(device_mapper, m, *head) { + if ((major(rootdev) != 0 && rootdev == m->devnum) || + (major(usrdev) != 0 && usrdev == m->devnum)) { + log_debug("Not detaching DM %s that backs the OS itself, skipping.", m->path); + n_failed ++; + continue; + } + + log_info("Detaching DM %s (" DEVNUM_FORMAT_STR ").", m->path, DEVNUM_FORMAT_VAL(m->devnum)); + r = delete_dm(m); + if (r < 0) { + log_full_errno(last_try ? 
LOG_ERR : LOG_INFO, r, "Could not detach DM %s: %m", m->path); + n_failed++; + continue; + } + + *changed = true; + device_mapper_free(head, m); + } + + return n_failed; +} + +int dm_detach_all(bool *changed, bool last_try) { + _cleanup_(device_mapper_list_free) LIST_HEAD(DeviceMapper, dm_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(dm_list_head); + + r = dm_list_get(&dm_list_head); + if (r < 0) + return r; + + return dm_points_list_detach(&dm_list_head, changed, last_try); +} diff --git a/src/shutdown/detach-dm.h b/src/shutdown/detach-dm.h new file mode 100644 index 0000000..b5f50a3 --- /dev/null +++ b/src/shutdown/detach-dm.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include + +int dm_detach_all(bool *changed, bool last_try); diff --git a/src/shutdown/detach-loopback.c b/src/shutdown/detach-loopback.c new file mode 100644 index 0000000..267509f --- /dev/null +++ b/src/shutdown/detach-loopback.c @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include +#include +#include + +#if HAVE_VALGRIND_MEMCHECK_H +#include +#endif + +#include "sd-device.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "detach-loopback.h" +#include "device-util.h" +#include "fd-util.h" + +typedef struct LoopbackDevice { + char *path; + dev_t devnum; + LIST_FIELDS(struct LoopbackDevice, loopback_device); +} LoopbackDevice; + +static void loopback_device_free(LoopbackDevice **head, LoopbackDevice *m) { + assert(head); + assert(m); + + LIST_REMOVE(loopback_device, *head, m); + + free(m->path); + free(m); +} + +static void loopback_device_list_free(LoopbackDevice **head) { + assert(head); + + while (*head) + loopback_device_free(head, *head); +} + +static int loopback_list_get(LoopbackDevice **head) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; 
+ + assert(head); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "block", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysname(e, "loop*"); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysattr(e, "loop/backing_file", NULL, true); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + _cleanup_free_ char *p = NULL; + const char *dn; + LoopbackDevice *lb; + dev_t devnum; + + if (sd_device_get_devnum(d, &devnum) < 0 || + sd_device_get_devname(d, &dn) < 0) + continue; + + p = strdup(dn); + if (!p) + return -ENOMEM; + + lb = new(LoopbackDevice, 1); + if (!lb) + return -ENOMEM; + + *lb = (LoopbackDevice) { + .path = TAKE_PTR(p), + .devnum = devnum, + }; + + LIST_PREPEND(loopback_device, *head, lb); + } + + return 0; +} + +static int delete_loopback(const char *device) { + _cleanup_close_ int fd = -EBADF; + struct loop_info64 info; + + assert(device); + + fd = open(device, O_RDONLY|O_CLOEXEC); + if (fd < 0) { + log_debug_errno(errno, "Failed to open loopback device %s: %m", device); + return errno == ENOENT ? 0 : -errno; + } + + /* Loopback block devices don't sync in-flight blocks when we clear the fd, hence sync explicitly + * first */ + if (fsync(fd) < 0) + log_debug_errno(errno, "Failed to sync loop block device %s, ignoring: %m", device); + + if (ioctl(fd, LOOP_CLR_FD, 0) < 0) { + if (errno == ENXIO) /* Nothing bound, didn't do anything */ + return 0; + + if (errno != EBUSY) + return log_debug_errno(errno, "Failed to clear loopback device %s: %m", device); + + if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) { + if (errno == ENXIO) /* What? Suddenly detached after all? That's fine by us then. 
*/ + return 1; + + log_debug_errno(errno, "Failed to invoke LOOP_GET_STATUS64 on loopback device %s, ignoring: %m", device); + return -EBUSY; /* propagate original error */ + } + +#if HAVE_VALGRIND_MEMCHECK_H + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + + if (FLAGS_SET(info.lo_flags, LO_FLAGS_AUTOCLEAR)) /* someone else already set LO_FLAGS_AUTOCLEAR for us? fine by us */ + return -EBUSY; /* propagate original error */ + + info.lo_flags |= LO_FLAGS_AUTOCLEAR; + if (ioctl(fd, LOOP_SET_STATUS64, &info) < 0) { + if (errno == ENXIO) /* Suddenly detached after all? Fine by us */ + return 1; + + log_debug_errno(errno, "Failed to set LO_FLAGS_AUTOCLEAR flag for loop device %s, ignoring: %m", device); + } else + log_debug("Successfully set LO_FLAGS_AUTOCLEAR flag for loop device %s.", device); + + return -EBUSY; + } + + if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) { + /* If the LOOP_CLR_FD above succeeded we'll see ENXIO here. */ + if (errno == ENXIO) + log_debug("Successfully detached loopback device %s.", device); + else + log_debug_errno(errno, "Failed to invoke LOOP_GET_STATUS64 on loopback device %s, ignoring: %m", device); /* the LOOP_CLR_FD at least worked, let's hope for the best */ + + return 1; + } + +#if HAVE_VALGRIND_MEMCHECK_H + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + + /* Linux makes LOOP_CLR_FD succeed whenever LO_FLAGS_AUTOCLEAR is set without actually doing + * anything. Very confusing. Let's hence not claim we did anything in this case. 
*/ + if (FLAGS_SET(info.lo_flags, LO_FLAGS_AUTOCLEAR)) + log_debug("Successfully called LOOP_CLR_FD on a loopback device %s with autoclear set, which is a NOP.", device); + else + log_debug("Weird, LOOP_CLR_FD succeeded but the device is still attached on %s.", device); + + return -EBUSY; /* Nothing changed, the device is still attached, hence it apparently is still busy */ +} + +static int loopback_points_list_detach(LoopbackDevice **head, bool *changed, bool last_try) { + int n_failed = 0, r; + dev_t rootdev = 0, usrdev = 0; + + assert(head); + assert(changed); + + (void) get_block_device_harder("/", &rootdev); + (void) block_get_whole_disk(rootdev, &rootdev); + + (void) get_block_device_harder("/usr", &usrdev); + (void) block_get_whole_disk(usrdev, &usrdev); + + LIST_FOREACH(loopback_device, m, *head) { + if ((major(rootdev) != 0 && rootdev == m->devnum) || + (major(usrdev) != 0 && usrdev == m->devnum)) { + log_debug("Not detaching loopback device %s that backs the OS itself, skipping.", m->path); + n_failed++; + continue; + } + + log_info("Detaching loopback %s.", m->path); + r = delete_loopback(m->path); + if (r < 0) { + log_full_errno(last_try ? 
LOG_ERR : LOG_INFO, r, "Could not detach loopback %s: %m", m->path); + n_failed++; + continue; + } + if (r > 0) + *changed = true; + + loopback_device_free(head, m); + } + + return n_failed; +} + +int loopback_detach_all(bool *changed, bool last_try) { + _cleanup_(loopback_device_list_free) LIST_HEAD(LoopbackDevice, loopback_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(loopback_list_head); + + r = loopback_list_get(&loopback_list_head); + if (r < 0) + return r; + + return loopback_points_list_detach(&loopback_list_head, changed, last_try); +} diff --git a/src/shutdown/detach-loopback.h b/src/shutdown/detach-loopback.h new file mode 100644 index 0000000..d6d73f3 --- /dev/null +++ b/src/shutdown/detach-loopback.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include + +int loopback_detach_all(bool *changed, bool last_try); diff --git a/src/shutdown/detach-md.c b/src/shutdown/detach-md.c new file mode 100644 index 0000000..cf3130d --- /dev/null +++ b/src/shutdown/detach-md.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "detach-md.h" +#include "device-util.h" +#include "devnum-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "string-util.h" + +typedef struct RaidDevice { + char *path; + dev_t devnum; + LIST_FIELDS(struct RaidDevice, raid_device); +} RaidDevice; + +static void raid_device_free(RaidDevice **head, RaidDevice *m) { + assert(head); + assert(m); + + LIST_REMOVE(raid_device, *head, m); + + free(m->path); + free(m); +} + +static void raid_device_list_free(RaidDevice **head) { + assert(head); + + while (*head) + raid_device_free(head, *head); +} + +static int md_list_get(RaidDevice **head) { + 
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + int r; + + assert(head); + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, "block", true); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysname(e, "md*"); + if (r < 0) + return r; + + /* Filter out partitions. */ + r = sd_device_enumerator_add_match_property(e, "DEVTYPE", "disk"); + if (r < 0) + return r; + + FOREACH_DEVICE(e, d) { + _cleanup_free_ char *p = NULL; + const char *dn, *md_level; + RaidDevice *m; + dev_t devnum; + + r = sd_device_get_devname(d, &dn); + if (r < 0) { + log_device_warning_errno(d, r, "Failed to get name of enumerated device, ignoring: %m"); + continue; + } + + r = sd_device_get_devnum(d, &devnum); + if (r < 0) { + log_device_warning_errno(d, r, "Failed to get devno of enumerated device '%s', ignoring device: %m", dn); + continue; + } + + /* MD "containers" are a special type of MD devices, used for external metadata. Since they + * don't provide RAID functionality in themselves we don't need to stop them. Note that the + * MD_LEVEL udev property is set by mdadm in userspace, which is an optional package. Hence + * let's handle gracefully if the property is missing. */ + + r = sd_device_get_property_value(d, "MD_LEVEL", &md_level); + if (r < 0) + log_device_full_errno(d, + r == -ENOENT ? 
LOG_DEBUG : LOG_WARNING, + r, + "Failed to get MD_LEVEL property for %s, assuming regular MD device, not a container: %m", dn); + else if (streq(md_level, "container")) { + log_device_debug(d, "Skipping MD device '%s' because it is a container MD device.", dn); + continue; + } + + p = strdup(dn); + if (!p) + return -ENOMEM; + + m = new(RaidDevice, 1); + if (!m) + return -ENOMEM; + + *m = (RaidDevice) { + .path = TAKE_PTR(p), + .devnum = devnum, + }; + + LIST_PREPEND(raid_device, *head, m); + } + + return 0; +} + +static int delete_md(RaidDevice *m) { + _cleanup_close_ int fd = -EBADF; + + assert(m); + assert(major(m->devnum) != 0); + assert(m->path); + + fd = open(m->path, O_RDONLY|O_CLOEXEC|O_EXCL); + if (fd < 0) + return -errno; + + if (fsync(fd) < 0) + log_debug_errno(errno, "Failed to sync MD block device %s, ignoring: %m", m->path); + + return RET_NERRNO(ioctl(fd, STOP_ARRAY, NULL)); +} + +static int md_points_list_detach(RaidDevice **head, bool *changed, bool last_try) { + int n_failed = 0, r; + dev_t rootdev = 0, usrdev = 0; + + assert(head); + assert(changed); + + (void) get_block_device("/", &rootdev); + (void) get_block_device("/usr", &usrdev); + + LIST_FOREACH(raid_device, m, *head) { + if ((major(rootdev) != 0 && rootdev == m->devnum) || + (major(usrdev) != 0 && usrdev == m->devnum)) { + log_debug("Not detaching MD %s that backs the OS itself, skipping.", m->path); + n_failed ++; + continue; + } + + log_info("Stopping MD %s (" DEVNUM_FORMAT_STR ").", m->path, DEVNUM_FORMAT_VAL(m->devnum)); + r = delete_md(m); + if (r < 0) { + log_full_errno(last_try ? 
LOG_ERR : LOG_INFO, r, "Could not stop MD %s: %m", m->path); + n_failed++; + continue; + } + + *changed = true; + raid_device_free(head, m); + } + + return n_failed; +} + +int md_detach_all(bool *changed, bool last_try) { + _cleanup_(raid_device_list_free) LIST_HEAD(RaidDevice, md_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(md_list_head); + + r = md_list_get(&md_list_head); + if (r < 0) + return r; + + return md_points_list_detach(&md_list_head, changed, last_try); +} diff --git a/src/shutdown/detach-md.h b/src/shutdown/detach-md.h new file mode 100644 index 0000000..3784598 --- /dev/null +++ b/src/shutdown/detach-md.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include + +int md_detach_all(bool *changed, bool last_try); diff --git a/src/shutdown/detach-swap.c b/src/shutdown/detach-swap.c new file mode 100644 index 0000000..fd7dcdf --- /dev/null +++ b/src/shutdown/detach-swap.c @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include + +#include "alloc-util.h" +#include "detach-swap.h" +#include "libmount-util.h" + +static void swap_device_free(SwapDevice **head, SwapDevice *m) { + assert(head); + assert(m); + + LIST_REMOVE(swap_device, *head, m); + + free(m->path); + free(m); +} + +void swap_devices_list_free(SwapDevice **head) { + assert(head); + + while (*head) + swap_device_free(head, *head); +} + +int swap_list_get(const char *swaps, SwapDevice **head) { + _cleanup_(mnt_free_tablep) struct libmnt_table *t = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *i = NULL; + int r; + + assert(head); + + t = mnt_new_table(); + i = mnt_new_iter(MNT_ITER_FORWARD); + if (!t || !i) + return log_oom(); + + r = mnt_table_parse_swaps(t, swaps); + if (r == -ENOENT) /* no /proc/swaps is fine */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to parse %s: %m", swaps 
?: "/proc/swaps"); + + for (;;) { + struct libmnt_fs *fs; + _cleanup_free_ SwapDevice *swap = NULL; + const char *source; + + r = mnt_table_next_fs(t, i, &fs); + if (r == 1) /* EOF */ + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from %s: %m", swaps ?: "/proc/swaps"); + + source = mnt_fs_get_source(fs); + if (!source) + continue; + + swap = new0(SwapDevice, 1); + if (!swap) + return log_oom(); + + swap->path = strdup(source); + if (!swap->path) + return log_oom(); + + LIST_PREPEND(swap_device, *head, TAKE_PTR(swap)); + } + + return 0; +} + +static int swap_points_list_off(SwapDevice **head, bool *changed) { + int n_failed = 0; + + assert(head); + assert(changed); + + LIST_FOREACH(swap_device, m, *head) { + log_info("Deactivating swap %s.", m->path); + if (swapoff(m->path) < 0) { + log_warning_errno(errno, "Could not deactivate swap %s: %m", m->path); + n_failed++; + continue; + } + + *changed = true; + swap_device_free(head, m); + } + + return n_failed; +} + +int swapoff_all(bool *changed) { + _cleanup_(swap_devices_list_free) LIST_HEAD(SwapDevice, swap_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(swap_list_head); + + r = swap_list_get(NULL, &swap_list_head); + if (r < 0) + return r; + + return swap_points_list_off(&swap_list_head, changed); +} diff --git a/src/shutdown/detach-swap.h b/src/shutdown/detach-swap.h new file mode 100644 index 0000000..1ebf5eb --- /dev/null +++ b/src/shutdown/detach-swap.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include + +#include "list.h" + +int swapoff_all(bool *changed); + +/* This is exported just for testing */ +typedef struct SwapDevice { + char *path; + LIST_FIELDS(struct SwapDevice, swap_device); +} SwapDevice; + +int swap_list_get(const char *swaps, SwapDevice **head); +void swap_devices_list_free(SwapDevice **head); diff --git a/src/shutdown/meson.build 
b/src/shutdown/meson.build new file mode 100644 index 0000000..219f9fd --- /dev/null +++ b/src/shutdown/meson.build @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_shutdown_sources = files( + 'detach-dm.c', + 'detach-loopback.c', + 'detach-md.c', + 'detach-swap.c', + 'shutdown.c', + 'umount.c', +) + +executables += [ + libexec_template + { + 'name' : 'systemd-shutdown', + 'sources' : systemd_shutdown_sources, + 'dependencies' : libmount, + }, + libexec_template + { + 'name' : 'systemd-shutdown.standalone', + 'sources' : systemd_shutdown_sources, + 'c_args' : '-DSTANDALONE', + 'link_with' : [ + libbasic, + libshared_static, + libsystemd_static, + ], + 'dependencies' : libmount, + 'build_by_default' : have_standalone_binaries, + 'install' : have_standalone_binaries, + }, + test_template + { + 'sources' : files( + 'test-umount.c', + 'detach-swap.c', + 'umount.c', + ), + 'dependencies' : libmount, + }, +] diff --git a/src/shutdown/shutdown.c b/src/shutdown/shutdown.c new file mode 100644 index 0000000..b709078 --- /dev/null +++ b/src/shutdown/shutdown.c @@ -0,0 +1,663 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "async.h" +#include "binfmt-util.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "constants.h" +#include "coredump-util.h" +#include "detach-dm.h" +#include "detach-loopback.h" +#include "detach-md.h" +#include "detach-swap.h" +#include "errno-util.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "getopt-defs.h" +#include "initrd-util.h" +#include "killall.h" +#include "log.h" +#include "parse-util.h" +#include "process-util.h" +#include "reboot-util.h" +#include "rlimit-util.h" +#include "signal-util.h" +#include "string-util.h" 
+#include "switch-root.h" +#include "sysctl-util.h" +#include "terminal-util.h" +#include "umount.h" +#include "virt.h" +#include "watchdog.h" + +#define SYNC_PROGRESS_ATTEMPTS 3 +#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC) + +static char* arg_verb; +static uint8_t arg_exit_code; +static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC; + +static int parse_argv(int argc, char *argv[]) { + enum { + COMMON_GETOPT_ARGS, + SHUTDOWN_GETOPT_ARGS, + }; + + static const struct option options[] = { + COMMON_GETOPT_OPTIONS, + SHUTDOWN_GETOPT_OPTIONS, + {} + }; + + int c, r; + + assert(argc >= 1); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + + /* "-" prevents getopt from permuting argv[] and moving the verb away + * from argv[1]. Our interface to initrd promises it'll be there. */ + while ((c = getopt_long(argc, argv, "-", options, NULL)) >= 0) + switch (c) { + + case ARG_LOG_LEVEL: + r = log_set_max_level_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log level %s, ignoring: %m", optarg); + + break; + + case ARG_LOG_TARGET: + r = log_set_target_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log target %s, ignoring: %m", optarg); + + break; + + case ARG_LOG_COLOR: + + if (optarg) { + r = log_show_color_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log color setting %s, ignoring: %m", optarg); + } else + log_show_color(true); + + break; + + case ARG_LOG_LOCATION: + if (optarg) { + r = log_show_location_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log location setting %s, ignoring: %m", optarg); + } else + log_show_location(true); + + break; + + case ARG_LOG_TIME: + + if (optarg) { + r = log_show_time_from_string(optarg); + if (r < 0) + log_error_errno(r, "Failed to parse log time setting %s, ignoring: %m", optarg); + 
} else + log_show_time(true); + + break; + + case ARG_EXIT_CODE: + r = safe_atou8(optarg, &arg_exit_code); + if (r < 0) + log_error_errno(r, "Failed to parse exit code %s, ignoring: %m", optarg); + + break; + + case ARG_TIMEOUT: + r = parse_sec(optarg, &arg_timeout); + if (r < 0) + log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring: %m", optarg); + + break; + + case '\001': + if (!arg_verb) + arg_verb = optarg; + else + log_error("Excess arguments, ignoring"); + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (!arg_verb) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Verb argument missing."); + + return 0; +} + +static int switch_root_initramfs(void) { + /* Do not detach the old root, because /run/initramfs/shutdown needs to access it. + * + * Disable sync() during switch-root, we after all sync'ed here plenty, and a dumb sync (as opposed + * to the "smart" sync() we did here that looks at progress parameters) would defeat much of our + * efforts here. As the new root will be /run/initramfs/, it is not necessary to mount /run/ + * recursively. */ + return switch_root( + /* new_root= */ "/run/initramfs", + /* old_root_after= */ "/oldroot", + /* flags= */ SWITCH_ROOT_DONT_SYNC); +} + +/* Read the following fields from /proc/meminfo: + * + * NFS_Unstable + * Writeback + * Dirty + * + * Return true if the sum of these fields is smaller than the previous + * value input. For all other issues, report the failure and indicate that + * the sync is not making progress. 
+ */ +static int sync_making_progress(unsigned long long *prev_dirty) { + _cleanup_fclose_ FILE *f = NULL; + unsigned long long val = 0; + int ret; + + f = fopen("/proc/meminfo", "re"); + if (!f) + return log_warning_errno(errno, "Failed to open /proc/meminfo: %m"); + + for (;;) { + _cleanup_free_ char *line = NULL; + unsigned long long ull = 0; + int q; + + q = read_line(f, LONG_LINE_MAX, &line); + if (q < 0) + return log_warning_errno(q, "Failed to parse /proc/meminfo: %m"); + if (q == 0) + break; + + if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:")) + continue; + + errno = 0; + if (sscanf(line, "%*s %llu %*s", &ull) != 1) { + if (errno != 0) + log_warning_errno(errno, "Failed to parse /proc/meminfo: %m"); + else + log_warning("Failed to parse /proc/meminfo"); + + return false; + } + + val += ull; + } + + ret = *prev_dirty > val; + *prev_dirty = val; + return ret; +} + +static void sync_with_progress(void) { + unsigned long long dirty = ULLONG_MAX; + unsigned checks; + pid_t pid; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + /* Due to the possibility of the sync operation hanging, we fork a child process and monitor + * the progress. If the timeout lapses, the assumption is that the particular sync stalled. */ + + r = asynchronous_sync(&pid); + if (r < 0) { + log_error_errno(r, "Failed to fork sync(): %m"); + return; + } + + log_info("Syncing filesystems and block devices."); + + /* Start monitoring the sync operation. If more than + * SYNC_PROGRESS_ATTEMPTS lapse without progress being made, + * we assume that the sync is stalled */ + for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) { + r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC); + if (r == 0) + /* Sync finished without error. 
+ * (The sync itself does not return an error code) */ + return; + else if (r == -ETIMEDOUT) { + /* Reset the check counter if the "Dirty" value is + * decreasing */ + if (sync_making_progress(&dirty) > 0) + checks = 0; + } else { + log_error_errno(r, "Failed to sync filesystems and block devices: %m"); + return; + } + } + + /* Only reached in the event of a timeout. We should issue a kill + * to the stray process. */ + log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid); + (void) kill(pid, SIGKILL); +} + +static int read_current_sysctl_printk_log_level(void) { + _cleanup_free_ char *sysctl_printk_vals = NULL, *sysctl_printk_curr = NULL; + int current_lvl; + const char *p; + int r; + + r = sysctl_read("kernel/printk", &sysctl_printk_vals); + if (r < 0) + return log_debug_errno(r, "Cannot read sysctl kernel.printk: %m"); + + p = sysctl_printk_vals; + r = extract_first_word(&p, &sysctl_printk_curr, NULL, 0); + if (r < 0) + return log_debug_errno(r, "Failed to split out kernel printk priority: %m"); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Short read while reading kernel.printk sysctl"); + + r = safe_atoi(sysctl_printk_curr, &current_lvl); + if (r < 0) + return log_debug_errno(r, "Failed to parse kernel.printk sysctl: %s", sysctl_printk_vals); + + return current_lvl; +} + +static void bump_sysctl_printk_log_level(int min_level) { + int current_lvl, r; + + /* Set the logging level to be able to see messages with log level smaller or equal to min_level */ + + current_lvl = read_current_sysctl_printk_log_level(); + if (current_lvl < 0 || current_lvl >= min_level + 1) + return; + + r = sysctl_writef("kernel/printk", "%i", min_level + 1); + if (r < 0) + log_debug_errno(r, "Failed to bump kernel.printk to %i: %m", min_level + 1); +} + +static void init_watchdog(void) { + const char *s; + int r; + + s = getenv("WATCHDOG_DEVICE"); + if (s) { + r = watchdog_set_device(s); + if (r < 0) + log_warning_errno(r, 
"Failed to set watchdog device to %s, ignoring: %m", s); + } + + s = getenv("WATCHDOG_USEC"); + if (s) { + usec_t usec; + + r = safe_atou64(s, &usec); + if (r < 0) + log_warning_errno(r, "Failed to parse watchdog timeout '%s', ignoring: %m", s); + else + (void) watchdog_setup(usec); + } +} + +int main(int argc, char *argv[]) { + static const char* const dirs[] = { + SYSTEM_SHUTDOWN_PATH, + NULL + }; + _cleanup_free_ char *cgroup = NULL; + char *arguments[3]; + int cmd, r; + + /* Close random fds we might have get passed, just for paranoia, before we open any new fds, for + * example for logging. After all this tool's purpose is about detaching any pinned resources, and + * open file descriptors are the primary way to pin resources. Note that we don't really expect any + * fds to be passed here. */ + (void) close_all_fds(NULL, 0); + + /* The log target defaults to console, but the original systemd process will pass its log target in through a + * command line argument, which will override this default. Also, ensure we'll never log to the journal or + * syslog, as these logging daemons are either already dead or will die very soon. 
*/ + + log_set_target(LOG_TARGET_CONSOLE); + log_set_prohibit_ipc(true); + log_parse_environment(); + + if (getpid_cached() == 1) + log_set_always_reopen_console(true); + + r = parse_argv(argc, argv); + if (r < 0) + goto error; + + log_open(); + + umask(0022); + + if (getpid_cached() != 1) { + r = log_error_errno(SYNTHETIC_ERRNO(EPERM), "Not executed by init (PID 1)."); + goto error; + } + + if (streq(arg_verb, "reboot")) + cmd = RB_AUTOBOOT; + else if (streq(arg_verb, "poweroff")) + cmd = RB_POWER_OFF; + else if (streq(arg_verb, "halt")) + cmd = RB_HALT_SYSTEM; + else if (streq(arg_verb, "kexec")) + cmd = LINUX_REBOOT_CMD_KEXEC; + else if (streq(arg_verb, "exit")) + cmd = 0; /* ignored, just checking that arg_verb is valid */ + else { + r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown action '%s'.", arg_verb); + goto error; + } + + /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with + * a mechanism to pick up systemd's exit status in the VM. Note that we execute this as early as + * possible since otherwise we might shut down the VM before the AF_VSOCK buffers have been flushed. + * While this doesn't guarantee the message will arrive, in practice we do enough work after this + * that the message should always arrive on the host */ + (void) sd_notifyf(0, "EXIT_STATUS=%i", arg_exit_code); + + (void) cg_get_root_path(&cgroup); + bool in_container = detect_container() > 0; + + /* If the logging messages are going to KMSG, and if we are not running from a container, then try to + * update the sysctl kernel.printk current value in order to see "info" messages; This current log + * level is not updated if already big enough. 
+ */ + if (!in_container && + IN_SET(log_get_target(), + LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_SYSLOG_OR_KMSG, + LOG_TARGET_KMSG)) + bump_sysctl_printk_log_level(LOG_WARNING); + + init_watchdog(); + + /* Lock us into memory */ + (void) mlockall(MCL_CURRENT|MCL_FUTURE); + + /* We need to make mounts private so that we can MS_MOVE in unmount_all(). Kernel does not allow + * MS_MOVE when parent mountpoints have shared propagation. */ + if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) + log_warning_errno(errno, "Failed to make mounts private, ignoring: %m"); + + /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that + * slow IO is processed here already and the final process killing spree is not impacted by processes + * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will + * result. */ + if (!in_container) + sync_with_progress(); + + disable_coredumps(); + disable_binfmt(); + + log_info("Sending SIGTERM to remaining processes..."); + broadcast_signal(SIGTERM, true, true, arg_timeout); + + log_info("Sending SIGKILL to remaining processes..."); + broadcast_signal(SIGKILL, true, false, arg_timeout); + + bool need_umount = !in_container, need_swapoff = !in_container, need_loop_detach = !in_container, + need_dm_detach = !in_container, need_md_detach = !in_container, can_initrd, last_try = false; + can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0; + + /* Unmount all mountpoints, swaps, and loopback devices */ + for (;;) { + bool changed = false; + + (void) watchdog_ping(); + + /* Let's trim the cgroup tree on each iteration so that we leave an empty cgroup tree around, + * so that container managers get a nice notify event when we are down */ + if (cgroup) + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false); + + if (need_umount) { + log_info("Unmounting file systems."); + r = 
umount_all(&changed, last_try); + if (r == 0) { + need_umount = false; + log_info("All filesystems unmounted."); + } else if (r > 0) + log_info("Not all file systems unmounted, %d left.", r); + else + log_error_errno(r, "Unable to unmount file systems: %m"); + } + + if (need_swapoff) { + log_info("Deactivating swaps."); + r = swapoff_all(&changed); + if (r == 0) { + need_swapoff = false; + log_info("All swaps deactivated."); + } else if (r > 0) + log_info("Not all swaps deactivated, %d left.", r); + else + log_error_errno(r, "Unable to deactivate swaps: %m"); + } + + if (need_loop_detach) { + log_info("Detaching loop devices."); + r = loopback_detach_all(&changed, last_try); + if (r == 0) { + need_loop_detach = false; + log_info("All loop devices detached."); + } else if (r > 0) + log_info("Not all loop devices detached, %d left.", r); + else + log_error_errno(r, "Unable to detach loop devices: %m"); + } + + if (need_md_detach) { + log_info("Stopping MD devices."); + r = md_detach_all(&changed, last_try); + if (r == 0) { + need_md_detach = false; + log_info("All MD devices stopped."); + } else if (r > 0) + log_info("Not all MD devices stopped, %d left.", r); + else + log_error_errno(r, "Unable to stop MD devices: %m"); + } + + if (need_dm_detach) { + log_info("Detaching DM devices."); + r = dm_detach_all(&changed, last_try); + if (r == 0) { + need_dm_detach = false; + log_info("All DM devices detached."); + } else if (r > 0) + log_info("Not all DM devices detached, %d left.", r); + else + log_error_errno(r, "Unable to detach DM devices: %m"); + } + + if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach + && !need_md_detach) { + log_info("All filesystems, swaps, loop devices, MD devices and DM devices detached."); + /* Yay, done */ + break; + } + + if (!changed && !last_try && !can_initrd) { + /* There are things we cannot get rid of. Loop one more time in which we will log + * with higher priority to inform the user. 
Note that we don't need to do this if + * there is an initrd to switch to, because that one is likely to get rid of the + * remaining mounts. If not, it will log about them. */ + last_try = true; + continue; + } + + /* If in this iteration we didn't manage to unmount/deactivate anything, we simply give up */ + if (!changed) { + log_info("Cannot finalize remaining%s%s%s%s%s continuing.", + need_umount ? " file systems," : "", + need_swapoff ? " swap devices," : "", + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : "", + need_md_detach ? " MD devices," : ""); + break; + } + + log_debug("Couldn't finalize remaining %s%s%s%s%s trying again.", + need_umount ? " file systems," : "", + need_swapoff ? " swap devices," : "", + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : "", + need_md_detach ? " MD devices," : ""); + } + + /* We're done with the watchdog. Note that the watchdog is explicitly not stopped here. It remains + * active to guard against any issues during the rest of the shutdown sequence. */ + watchdog_free_device(); + + arguments[0] = NULL; /* Filled in by execute_directories(), when needed */ + arguments[1] = arg_verb; + arguments[2] = NULL; + (void) execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + + (void) rlimit_nofile_safe(); + + if (can_initrd) { + r = switch_root_initramfs(); + if (r >= 0) { + argv[0] = (char*) "/shutdown"; + + (void) setsid(); + (void) make_console_stdio(); + + log_info("Successfully changed into root pivot.\n" + "Returning to initrd..."); + + execv("/shutdown", argv); + log_error_errno(errno, "Failed to execute shutdown binary: %m"); + } else + log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m"); + } + + if (need_umount || need_swapoff || need_loop_detach || need_dm_detach || need_md_detach) + log_error("Unable to finalize remaining%s%s%s%s%s ignoring.", + need_umount ? 
" file systems," : "", + need_swapoff ? " swap devices," : "", + need_loop_detach ? " loop devices," : "", + need_dm_detach ? " DM devices," : "", + need_md_detach ? " MD devices," : ""); + + /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need + * to be sync'ed explicitly in advance. So let's do this here, but not needlessly slow down + * containers. Note that we sync'ed things already once above, but we did some more work since then + * which might have caused IO, hence let's do it once more. Do not remove this sync, data corruption + * will result. */ + if (!in_container) + sync_with_progress(); + + if (streq(arg_verb, "exit")) { + if (in_container) { + log_info("Exiting container."); + return arg_exit_code; + } + + cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */ + } + + switch (cmd) { + + case LINUX_REBOOT_CMD_KEXEC: + + if (!in_container) { + /* We cheat and exec kexec to avoid doing all its work */ + log_info("Rebooting with kexec."); + + r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL); + if (r == 0) { + const char * const args[] = { + KEXEC, "-e", NULL + }; + + /* Child */ + + execv(args[0], (char * const *) args); + log_debug_errno(errno, "Failed to execute '" KEXEC "' binary, proceeding with reboot(RB_KEXEC): %m"); + + /* execv failed (kexec binary missing?), so try simply reboot(RB_KEXEC) */ + (void) reboot(cmd); + _exit(EXIT_FAILURE); + } + + /* If we are still running, then the kexec can't have worked, let's fall through */ + } + + cmd = RB_AUTOBOOT; + _fallthrough_; + + case RB_AUTOBOOT: + (void) reboot_with_parameter(REBOOT_LOG); + log_info("Rebooting."); + break; + + case RB_POWER_OFF: + log_info("Powering off."); + break; + + case RB_HALT_SYSTEM: + log_info("Halting system."); + break; + + default: + assert_not_reached(); + } + + (void) reboot(cmd); + if (ERRNO_IS_PRIVILEGE(errno) && in_container) { + /* If we are in a 
container, and we lacked CAP_SYS_BOOT just exit, this will kill our + * container for good. */ + log_info("Exiting container."); + return EXIT_SUCCESS; + } + + r = log_error_errno(errno, "Failed to invoke reboot(): %m"); + + error: + log_struct_errno(LOG_EMERG, r, + LOG_MESSAGE("Critical error while doing system shutdown: %m"), + "MESSAGE_ID=" SD_MESSAGE_SHUTDOWN_ERROR_STR); + freeze(); +} diff --git a/src/shutdown/test-umount.c b/src/shutdown/test-umount.c new file mode 100644 index 0000000..93da2e0 --- /dev/null +++ b/src/shutdown/test-umount.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "detach-swap.h" +#include "errno-util.h" +#include "log.h" +#include "path-util.h" +#include "string-util.h" +#include "tests.h" +#include "umount.h" + +static void test_mount_points_list_one(const char *fname) { + _cleanup_(mount_points_list_free) LIST_HEAD(MountPoint, mp_list_head); + _cleanup_free_ char *testdata_fname = NULL; + + log_info("/* %s(\"%s\") */", __func__, fname ?: "/proc/self/mountinfo"); + + if (fname) { + assert_se(get_testdata_dir(fname, &testdata_fname) >= 0); + fname = testdata_fname; + } + + LIST_HEAD_INIT(mp_list_head); + assert_se(mount_points_list_get(fname, &mp_list_head) >= 0); + + LIST_FOREACH(mount_point, m, mp_list_head) + log_debug("path=%s o=%s f=0x%lx try-ro=%s", + m->path, + strempty(m->remount_options), + m->remount_flags, + yes_no(m->try_remount_ro)); +} + +TEST(mount_points_list) { + test_mount_points_list_one(NULL); + test_mount_points_list_one("/test-umount/empty.mountinfo"); + test_mount_points_list_one("/test-umount/garbled.mountinfo"); + test_mount_points_list_one("/test-umount/rhbug-1554943.mountinfo"); +} + +static void test_swap_list_one(const char *fname) { + _cleanup_(swap_devices_list_free) LIST_HEAD(SwapDevice, sd_list_head); + _cleanup_free_ char *testdata_fname = NULL; + int r; + + log_info("/* %s(\"%s\") */", __func__, fname ?: "/proc/swaps"); + + if (fname) { + 
assert_se(get_testdata_dir(fname, &testdata_fname) >= 0); + fname = testdata_fname; + } + + LIST_HEAD_INIT(sd_list_head); + r = swap_list_get(fname, &sd_list_head); + if (ERRNO_IS_PRIVILEGE(r)) + return; + assert_se(r >= 0); + + LIST_FOREACH(swap_device, m, sd_list_head) + log_debug("path=%s", m->path); +} + +TEST(swap_list) { + test_swap_list_one(NULL); + test_swap_list_one("/test-umount/example.swaps"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/shutdown/umount.c b/src/shutdown/umount.c new file mode 100644 index 0000000..1a9b99d --- /dev/null +++ b/src/shutdown/umount.c @@ -0,0 +1,494 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "fstab-util.h" +#include "libmount-util.h" +#include "mkdir.h" +#include "mount-setup.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "random-util.h" +#include "signal-util.h" +#include "umount.h" +#include "virt.h" + +static void mount_point_free(MountPoint **head, MountPoint *m) { + assert(head); + assert(m); + + LIST_REMOVE(mount_point, *head, m); + + free(m->path); + free(m->remount_options); + free(m); +} + +void mount_points_list_free(MountPoint **head) { + assert(head); + + while (*head) + mount_point_free(head, *head); +} + +int mount_points_list_get(const char *mountinfo, MountPoint **head) { + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + int r; + + assert(head); + + r = libmount_parse(mountinfo, NULL, &table, &iter); + if (r < 0) + return log_error_errno(r, "Failed to parse %s: %m", mountinfo ?: "/proc/self/mountinfo"); + + for (;;) { + _cleanup_free_ char *options = NULL, *remount_options = NULL; + 
struct libmnt_fs *fs; + const char *path, *fstype; + unsigned long remount_flags = 0u; + bool try_remount_ro, is_api_vfs; + _cleanup_free_ MountPoint *m = NULL; + + r = mnt_table_next_fs(table, iter, &fs); + if (r == 1) /* EOF */ + break; + if (r < 0) + return log_error_errno(r, "Failed to get next entry from %s: %m", mountinfo ?: "/proc/self/mountinfo"); + + path = mnt_fs_get_target(fs); + if (!path) + continue; + + fstype = mnt_fs_get_fstype(fs); + + /* Combine the generic VFS options with the FS-specific options. Duplicates are not a problem + * here, because the only options that should come up twice are typically ro/rw, which are + * turned into MS_RDONLY or the inversion of it. + * + * Even if there are duplicates later in mount_option_mangle() they shouldn't hurt anyways as + * they override each other. */ + if (!strextend_with_separator(&options, ",", mnt_fs_get_vfs_options(fs))) + return log_oom(); + if (!strextend_with_separator(&options, ",", mnt_fs_get_fs_options(fs))) + return log_oom(); + + /* Ignore mount points we can't unmount because they are API or because we are keeping them + * open (like /dev/console). Also, ignore all mounts below API file systems, since they are + * likely virtual too, and hence not worth spending time on. Also, in unprivileged containers + * we might lack the rights to unmount these things, hence don't bother. */ + if (mount_point_is_api(path) || + mount_point_ignore(path) || + PATH_STARTSWITH_SET(path, "/dev", "/sys", "/proc")) + continue; + + is_api_vfs = fstype_is_api_vfs(fstype); + + /* If we are in a container, don't attempt to read-only mount anything as that brings no real + * benefits, but might confuse the host, as we remount the superblock here, not the bind + * mount. + * + * If the filesystem is a network fs, also skip the remount. It brings no value (we cannot + * leave a "dirty fs") and could hang if the network is down. 
Note that umount2() is more + * careful and will not hang because of the network being down. */ + try_remount_ro = detect_container() <= 0 && + !fstype_is_network(fstype) && + !is_api_vfs && + !fstype_is_ro(fstype) && + !fstab_test_yes_no_option(options, "ro\0rw\0"); + + if (try_remount_ro) { + /* mount(2) states that mount flags and options need to be exactly the same as they + * were when the filesystem was mounted, except for the desired changes. So we + * reconstruct both here and adjust them for the later remount call too. */ + + r = mnt_fs_get_propagation(fs, &remount_flags); + if (r < 0) { + log_warning_errno(r, "mnt_fs_get_propagation() failed for %s, ignoring: %m", path); + continue; + } + + r = mount_option_mangle(options, remount_flags, &remount_flags, &remount_options); + if (r < 0) { + log_warning_errno(r, "mount_option_mangle failed for %s, ignoring: %m", path); + continue; + } + + /* MS_BIND is special. If it is provided it will only make the mount-point + * read-only. If left out, the super block itself is remounted, which we want. */ + remount_flags = (remount_flags|MS_REMOUNT|MS_RDONLY) & ~MS_BIND; + } + + m = new(MountPoint, 1); + if (!m) + return log_oom(); + + r = libmount_is_leaf(table, fs); + if (r < 0) + return log_error_errno(r, "Failed to get children mounts for %s from %s: %m", path, mountinfo ?: "/proc/self/mountinfo"); + bool leaf = r; + + *m = (MountPoint) { + .remount_options = remount_options, + .remount_flags = remount_flags, + .try_remount_ro = try_remount_ro, + + /* Unmount sysfs/procfs/… lazily, since syncing doesn't matter there, and it's OK if + * something keeps an fd open to it. 
*/ + .umount_lazily = is_api_vfs, + .leaf = leaf, + }; + + m->path = strdup(path); + if (!m->path) + return log_oom(); + + TAKE_PTR(remount_options); + + LIST_PREPEND(mount_point, *head, TAKE_PTR(m)); + } + + return 0; +} + +static bool nonunmountable_path(const char *path) { + assert(path); + + return PATH_IN_SET(path, "/", "/usr") || + path_startswith(path, "/run/initramfs"); +} + +static void log_umount_blockers(const char *mnt) { + _cleanup_free_ char *blockers = NULL; + int r; + + _cleanup_closedir_ DIR *dir = opendir("/proc"); + if (!dir) + return (void) log_warning_errno(errno, "Failed to open /proc/: %m"); + + FOREACH_DIRENT_ALL(de, dir, break) { + if (!IN_SET(de->d_type, DT_DIR, DT_UNKNOWN)) + continue; + + pid_t pid; + if (parse_pid(de->d_name, &pid) < 0) + continue; + + _cleanup_free_ char *fdp = path_join(de->d_name, "fd"); + if (!fdp) + return (void) log_oom(); + + _cleanup_closedir_ DIR *fd_dir = xopendirat(dirfd(dir), fdp, 0); + if (!fd_dir) { + if (errno != ENOENT) /* process gone by now? 
*/ + log_debug_errno(errno, "Failed to open /proc/%s/, ignoring: %m",fdp); + continue; + } + + bool culprit = false; + FOREACH_DIRENT(fd_de, fd_dir, break) { + _cleanup_free_ char *open_file = NULL; + + r = readlinkat_malloc(dirfd(fd_dir), fd_de->d_name, &open_file); + if (r < 0) { + if (r != -ENOENT) /* fd closed by now */ + log_debug_errno(r, "Failed to read link /proc/%s/%s, ignoring: %m", fdp, fd_de->d_name); + continue; + } + + if (path_startswith(open_file, mnt)) { + culprit = true; + break; + } + } + + if (!culprit) + continue; + + _cleanup_free_ char *comm = NULL; + r = pid_get_comm(pid, &comm); + if (r < 0) { + if (r != -ESRCH) /* process gone by now */ + log_debug_errno(r, "Failed to read process name of PID " PID_FMT ": %m", pid); + continue; + } + + if (!strextend_with_separator(&blockers, ", ", comm)) + return (void) log_oom(); + + if (!strextend(&blockers, "(", de->d_name, ")")) + return (void) log_oom(); + } + + if (blockers) + log_warning("Unmounting '%s' blocked by: %s", mnt, blockers); +} + +static int remount_with_timeout(MountPoint *m, bool last_try) { + _cleanup_close_pair_ int pfd[2] = EBADF_PAIR; + _cleanup_(sigkill_nowaitp) pid_t pid = 0; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + assert(m); + + r = pipe2(pfd, O_CLOEXEC|O_NONBLOCK); + if (r < 0) + return r; + + /* Due to the possibility of a remount operation hanging, we fork a child process and set a + * timeout. If the timeout lapses, the assumption is that the particular remount failed. */ + r = safe_fork_full("(sd-remount)", + NULL, + pfd, ELEMENTSOF(pfd), + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_REOPEN_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + pfd[0] = safe_close(pfd[0]); + + log_info("Remounting '%s' read-only with options '%s'.", m->path, strempty(m->remount_options)); + + /* Start the mount operation here in the child */ + r = mount(NULL, m->path, NULL, m->remount_flags, m->remount_options); + if (r < 0) + log_full_errno(last_try ? 
LOG_ERR : LOG_INFO, + errno, + "Failed to remount '%s' read-only: %m", + m->path); + + (void) write(pfd[1], &r, sizeof(r)); /* try to send errno up */ + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + pfd[1] = safe_close(pfd[1]); + + r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC); + if (r == -ETIMEDOUT) + log_error_errno(r, "Remounting '%s' timed out, issuing SIGKILL to PID " PID_FMT ".", m->path, pid); + else if (r == -EPROTO) { + /* Try to read error code from child */ + if (read(pfd[0], &r, sizeof(r)) == sizeof(r)) + log_debug_errno(r, "Remounting '%s' failed abnormally, child process " PID_FMT " failed: %m", m->path, pid); + else + r = log_debug_errno(EPROTO, "Remounting '%s' failed abnormally, child process " PID_FMT " aborted or exited non-zero.", m->path, pid); + TAKE_PID(pid); /* child exited (just not as we expected) hence don't kill anymore */ + } else if (r < 0) + log_error_errno(r, "Remounting '%s' failed unexpectedly, couldn't wait for child process " PID_FMT ": %m", m->path, pid); + + return r; +} + +static int umount_with_timeout(MountPoint *m, bool last_try) { + _cleanup_close_pair_ int pfd[2] = EBADF_PAIR; + _cleanup_(sigkill_nowaitp) pid_t pid = 0; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + assert(m); + + r = pipe2(pfd, O_CLOEXEC|O_NONBLOCK); + if (r < 0) + return r; + + /* Due to the possibility of a umount operation hanging, we fork a child process and set a + * timeout. If the timeout lapses, the assumption is that the particular umount failed. */ + r = safe_fork_full("(sd-umount)", + NULL, + pfd, ELEMENTSOF(pfd), + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_REOPEN_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + pfd[0] = safe_close(pfd[0]); + + log_info("Unmounting '%s'.", m->path); + + /* Start the mount operation here in the child Using MNT_FORCE causes some filesystems + * (e.g. FUSE and NFS and other network filesystems) to abort any pending requests and return + * -EIO rather than blocking indefinitely. 
If the filesystem is "busy", this may allow + * processes to die, thus making the filesystem less busy so the unmount might succeed + * (rather than return EBUSY). */ + r = RET_NERRNO(umount2(m->path, + UMOUNT_NOFOLLOW | /* Don't follow symlinks: this should never happen unless our mount list was wrong */ + (m->umount_lazily ? MNT_DETACH : MNT_FORCE))); + if (r < 0) { + log_full_errno(last_try ? LOG_ERR : LOG_INFO, r, "Failed to unmount %s: %m", m->path); + + if (r == -EBUSY && last_try) + log_umount_blockers(m->path); + } + + (void) write(pfd[1], &r, sizeof(r)); /* try to send errno up */ + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + pfd[1] = safe_close(pfd[1]); + + r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC); + if (r == -ETIMEDOUT) + log_error_errno(r, "Unmounting '%s' timed out, issuing SIGKILL to PID " PID_FMT ".", m->path, pid); + else if (r == -EPROTO) { + /* Try to read error code from child */ + if (read(pfd[0], &r, sizeof(r)) == sizeof(r)) + log_debug_errno(r, "Unmounting '%s' failed abnormally, child process " PID_FMT " failed: %m", m->path, pid); + else + r = log_debug_errno(EPROTO, "Unmounting '%s' failed abnormally, child process " PID_FMT " aborted or exited non-zero.", m->path, pid); + TAKE_PID(pid); /* It died, but abnormally, no purpose in killing */ + } else if (r < 0) + log_error_errno(r, "Unmounting '%s' failed unexpectedly, couldn't wait for child process " PID_FMT ": %m", m->path, pid); + + return r; +} + +/* This includes remounting readonly, which changes the kernel mount options. Therefore the list passed to + * this function is invalidated, and should not be reused. 
*/ +static int mount_points_list_umount(MountPoint **head, bool *changed, bool last_try) { + int n_failed = 0, r; + _cleanup_free_ char *resolved_mounts_path = NULL; + + assert(head); + assert(changed); + + LIST_FOREACH(mount_point, m, *head) { + if (m->try_remount_ro) { + /* We always try to remount directories read-only first, before we go on and umount + * them. + * + * Mount points can be stacked. If a mount point is stacked below / or /usr, we + * cannot umount or remount it directly, since there is no way to refer to the + * underlying mount. There's nothing we can do about it for the general case, but we + * can do something about it if it is aliased somewhere else via a bind mount. If we + * explicitly remount the super block of that alias read-only we hence should be + * relatively safe regarding keeping a dirty fs we cannot otherwise see. + * + * Since the remount can hang in the instance of remote filesystems, we remount + * asynchronously and skip the subsequent umount if it fails. */ + if (remount_with_timeout(m, last_try) < 0) { + /* Remount failed, but try unmounting anyway, + * unless this is a mount point we want to skip. */ + if (nonunmountable_path(m->path)) { + n_failed++; + continue; + } + } + } + + /* Skip / and /usr since we cannot unmount that anyway, since we are running from it. They + * have already been remounted ro. */ + if (nonunmountable_path(m->path)) + continue; + + /* Trying to umount */ + r = umount_with_timeout(m, last_try); + if (r < 0) + n_failed++; + else + *changed = true; + + /* If a mount is busy, we move it to not keep parent mount points busy. + * If a mount point is not a leaf, moving it would invalidate our mount table. + * More moving will occur in next iteration with a fresh mount table. + */ + if (r != -EBUSY || !m->leaf) + continue; + + _cleanup_free_ char *dirname = NULL; + + r = path_extract_directory(m->path, &dirname); + if (r < 0) { + n_failed++; + log_full_errno(last_try ? 
LOG_ERR : LOG_INFO, r, "Cannot find directory for %s: %m", m->path); + continue; + } + + /* We need to canonicalize /run/shutdown/mounts. We cannot compare inodes, since /run + * might be bind mounted somewhere we want to unmount. And we need to move all mounts in + * /run/shutdown/mounts from there. + */ + if (!resolved_mounts_path) + (void) chase("/run/shutdown/mounts", NULL, 0, &resolved_mounts_path, NULL); + if (!path_equal(dirname, resolved_mounts_path)) { + char newpath[STRLEN("/run/shutdown/mounts/") + 16 + 1]; + + xsprintf(newpath, "/run/shutdown/mounts/%016" PRIx64, random_u64()); + + /* on error of is_dir, assume directory */ + if (is_dir(m->path, true) != 0) { + r = mkdir_p(newpath, 0000); + if (r < 0) { + log_full_errno(last_try ? LOG_ERR : LOG_INFO, r, "Could not create directory %s: %m", newpath); + continue; + } + } else { + r = touch_file(newpath, /* parents= */ true, USEC_INFINITY, UID_INVALID, GID_INVALID, 0700); + if (r < 0) { + log_full_errno(last_try ? LOG_ERR : LOG_INFO, r, "Could not create file %s: %m", newpath); + continue; + } + } + + log_info("Moving mount %s to %s.", m->path, newpath); + + r = RET_NERRNO(mount(m->path, newpath, NULL, MS_MOVE, NULL)); + if (r < 0) { + n_failed++; + log_full_errno(last_try ? LOG_ERR : LOG_INFO, r, "Could not move %s to %s: %m", m->path, newpath); + } else + *changed = true; + } + } + + return n_failed; +} + +static int umount_all_once(bool *changed, bool last_try) { + _cleanup_(mount_points_list_free) LIST_HEAD(MountPoint, mp_list_head); + int r; + + assert(changed); + + LIST_HEAD_INIT(mp_list_head); + r = mount_points_list_get(NULL, &mp_list_head); + if (r < 0) + return r; + + return mount_points_list_umount(&mp_list_head, changed, last_try); +} + +int umount_all(bool *changed, bool last_try) { + bool umount_changed; + int r; + + assert(changed); + + /* Retry umount, until nothing can be umounted anymore. Mounts are processed in order, newest + * first. 
The retries are needed when an old mount has been moved, to a path inside a newer mount. */ + do { + umount_changed = false; + + r = umount_all_once(&umount_changed, last_try); + if (umount_changed) + *changed = true; + } while (umount_changed); + + return r; +} diff --git a/src/shutdown/umount.h b/src/shutdown/umount.h new file mode 100644 index 0000000..f8f9ae8 --- /dev/null +++ b/src/shutdown/umount.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +/*** + Copyright © 2010 ProFUSION embedded systems +***/ + +#include + +#include "list.h" + +int umount_all(bool *changed, bool last_try); + +/* This is exported just for testing */ +typedef struct MountPoint { + char *path; + char *remount_options; + unsigned long remount_flags; + bool try_remount_ro; + bool umount_lazily; + bool leaf; + LIST_FIELDS(struct MountPoint, mount_point); +} MountPoint; + +int mount_points_list_get(const char *mountinfo, MountPoint **head); +void mount_points_list_free(MountPoint **head); diff --git a/src/sleep/battery-capacity.c b/src/sleep/battery-capacity.c new file mode 100644 index 0000000..62a0746 --- /dev/null +++ b/src/sleep/battery-capacity.c @@ -0,0 +1,384 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-device.h" + +#include "battery-capacity.h" +#include "battery-util.h" +#include "device-private.h" +#include "device-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "hexdecoct.h" +#include "id128-util.h" +#include "parse-util.h" +#include "siphash24.h" + +#define DISCHARGE_RATE_FILEPATH "/var/lib/systemd/sleep/battery_discharge_percentage_rate_per_hour" +#define BATTERY_DISCHARGE_RATE_HASH_KEY SD_ID128_MAKE(5f,9a,20,18,38,76,46,07,8d,36,58,0b,bb,c4,e0,63) + +static void *CAPACITY_TO_PTR(int capacity) { + assert(capacity >= 0); + assert(capacity <= 100); + return INT_TO_PTR(capacity + 1); +} + +static int PTR_TO_CAPACITY(void *p) { + int capacity = PTR_TO_INT(p) - 1; + assert(capacity >= 
0); + assert(capacity <= 100); + return capacity; +} + +static int siphash24_compress_device_sysattr( + sd_device *dev, + const char *attr, + struct siphash *state) { + + const char *x; + int r; + + assert(dev); + assert(attr); + assert(state); + + r = sd_device_get_sysattr_value(dev, attr, &x); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to read '%s' attribute: %m", attr); + + if (!isempty(x)) + siphash24_compress_string(x, state); + + return 0; +} + +static int siphash24_compress_id128( + int (*getter)(sd_id128_t *ret), + const char *name, + struct siphash *state) { + + sd_id128_t id; + int r; + + assert(getter); + assert(name); + assert(state); + + r = getter(&id); + if (r < 0) + return log_debug_errno(r, "Failed to get %s ID: %m", name); + + siphash24_compress(&id, sizeof(sd_id128_t), state); + return 0; +} + +/* Read system and battery identifier from specific location and generate hash of it */ +static uint64_t system_battery_identifier_hash(sd_device *dev) { + struct siphash state; + + assert(dev); + + siphash24_init(&state, BATTERY_DISCHARGE_RATE_HASH_KEY.bytes); + + (void) siphash24_compress_device_sysattr(dev, "manufacturer", &state); + (void) siphash24_compress_device_sysattr(dev, "model_name", &state); + (void) siphash24_compress_device_sysattr(dev, "serial_number", &state); + (void) siphash24_compress_id128(sd_id128_get_machine, "machine", &state); + (void) siphash24_compress_id128(id128_get_product, "product", &state); + + return siphash24_finalize(&state); +} + +/* Return success if battery percentage discharge rate per hour is in the range 1–199 */ +static bool battery_discharge_rate_is_valid(int battery_discharge_rate) { + return battery_discharge_rate > 0 && battery_discharge_rate < 200; +} + +/* Battery percentage discharge rate per hour is read from specific file. 
It is stored along with system + * and battery identifier hash to maintain the integrity of discharge rate value */ +static int get_battery_discharge_rate(sd_device *dev, int *ret) { /* Stores the rate in *ret and returns 0 when a line matching this device's hash is found; returns -ENOENT when no line matches; open/read/parse failures are logged and returned as errors. */ + _cleanup_fclose_ FILE *f = NULL; + uint64_t current_hash_id; + const char *p; + int r; + + assert(dev); + assert(ret); + + f = fopen(DISCHARGE_RATE_FILEPATH, "re"); + if (!f) + return log_debug_errno(errno, "Failed to read discharge rate from " DISCHARGE_RATE_FILEPATH ": %m"); + + current_hash_id = system_battery_identifier_hash(dev); /* hash to look for among the stored records */ + + for (;;) { + _cleanup_free_ char *stored_hash_id = NULL, *stored_discharge_rate = NULL, *line = NULL; + uint64_t hash_id; + int discharge_rate; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_debug_errno(r, "Failed to read discharge rate from " DISCHARGE_RATE_FILEPATH ": %m"); + if (r == 0) /* nothing more to read (presumably end of file), stop searching */ + break; + + p = line; + r = extract_many_words(&p, NULL, 0, &stored_hash_id, &stored_discharge_rate, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to parse hash_id and discharge_rate read from " DISCHARGE_RATE_FILEPATH ": %m"); + if (r != 2) /* each record must yield exactly two words: the hash and the rate */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid number of items fetched from " DISCHARGE_RATE_FILEPATH); + + r = safe_atou64(stored_hash_id, &hash_id); + if (r < 0) + return log_debug_errno(r, "Failed to parse hash ID read from " DISCHARGE_RATE_FILEPATH " location: %m"); + + if (current_hash_id != hash_id) + /* matching device not found, move to next line */ + continue; + + r = safe_atoi(stored_discharge_rate, &discharge_rate); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse discharge rate read from " DISCHARGE_RATE_FILEPATH ": %m"); + + if (!battery_discharge_rate_is_valid(discharge_rate)) /* a matching record with an out-of-range rate is an error, not a miss */ + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ERANGE), "Invalid battery discharge percentage rate per hour: %m"); + + *ret = discharge_rate; + return 0; /* matching device found, exit iteration */ + } + + return -ENOENT; /* loop ended without any record carrying this device's hash */ +} + +/* Write battery percentage discharge
rate per hour along with system and battery identifier hash to file */ +static int put_battery_discharge_rate(int estimated_battery_discharge_rate, uint64_t system_hash_id, bool trunc) { + int r; + + if (!battery_discharge_rate_is_valid(estimated_battery_discharge_rate)) + return log_debug_errno(SYNTHETIC_ERRNO(ERANGE), + "Invalid battery discharge rate %d%% per hour: %m", + estimated_battery_discharge_rate); + + r = write_string_filef( + DISCHARGE_RATE_FILEPATH, + WRITE_STRING_FILE_CREATE | WRITE_STRING_FILE_MKDIR_0755 | (trunc ? WRITE_STRING_FILE_TRUNCATE : 0), + "%"PRIu64" %d", + system_hash_id, + estimated_battery_discharge_rate); + if (r < 0) + return log_debug_errno(r, "Failed to update %s: %m", DISCHARGE_RATE_FILEPATH); + + log_debug("Estimated discharge rate %d%% per hour successfully saved to %s", estimated_battery_discharge_rate, DISCHARGE_RATE_FILEPATH); + + return 0; +} + +/* Store current capacity of each battery before suspension and timestamp */ +int fetch_batteries_capacity_by_name(Hashmap **ret) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_hashmap_free_ Hashmap *batteries_capacity_by_name = NULL; + int r; + + assert(ret); + + batteries_capacity_by_name = hashmap_new(&string_hash_ops_free); + if (!batteries_capacity_by_name) + return log_oom_debug(); + + r = battery_enumerator_new(&e); + if (r < 0) + return log_debug_errno(r, "Failed to initialize battery enumerator: %m"); + + FOREACH_DEVICE(e, dev) { + _cleanup_free_ char *battery_name_copy = NULL; + const char *battery_name; + int battery_capacity; + + battery_capacity = r = battery_read_capacity_percentage(dev); + if (r < 0) + continue; + + r = sd_device_get_property_value(dev, "POWER_SUPPLY_NAME", &battery_name); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to get POWER_SUPPLY_NAME property, ignoring: %m"); + continue; + } + + battery_name_copy = strdup(battery_name); + if (!battery_name_copy) + return log_oom_debug(); + + r = 
hashmap_put(batteries_capacity_by_name, battery_name_copy, CAPACITY_TO_PTR(battery_capacity)); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to store battery capacity: %m"); + + TAKE_PTR(battery_name_copy); + } + + *ret = TAKE_PTR(batteries_capacity_by_name); + + return 0; +} + +int get_capacity_by_name(Hashmap *capacities_by_name, const char *name) { + void *p; + + assert(capacities_by_name); + assert(name); + + p = hashmap_get(capacities_by_name, name); + if (!p) + return -ENOENT; + + return PTR_TO_CAPACITY(p); +} + +/* Estimate battery discharge rate using stored previous and current capacity over timestamp difference */ +int estimate_battery_discharge_rate_per_hour( + Hashmap *last_capacity, + Hashmap *current_capacity, + usec_t before_timestamp, + usec_t after_timestamp) { + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + bool trunc = true; + int r; + + assert(last_capacity); + assert(current_capacity); + assert(before_timestamp < after_timestamp); + + r = battery_enumerator_new(&e); + if (r < 0) + return log_debug_errno(r, "Failed to initialize battery enumerator: %m"); + + FOREACH_DEVICE(e, dev) { + int battery_last_capacity, battery_current_capacity, battery_discharge_rate; + const char *battery_name; + uint64_t system_hash_id; + + r = sd_device_get_property_value(dev, "POWER_SUPPLY_NAME", &battery_name); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to read battery name, ignoring: %m"); + continue; + } + + battery_last_capacity = get_capacity_by_name(last_capacity, battery_name); + if (battery_last_capacity < 0) + continue; + + battery_current_capacity = get_capacity_by_name(current_capacity, battery_name); + if (battery_current_capacity < 0) + continue; + + if (battery_current_capacity >= battery_last_capacity) { + log_device_debug(dev, "Battery was not discharged during suspension"); + continue; + } + + system_hash_id = system_battery_identifier_hash(dev); + + log_device_debug(dev, + "%d%% was 
discharged in %s. Estimating discharge rate...", + battery_last_capacity - battery_current_capacity, + FORMAT_TIMESPAN(after_timestamp - before_timestamp, USEC_PER_SEC)); + + battery_discharge_rate = (battery_last_capacity - battery_current_capacity) * USEC_PER_HOUR / (after_timestamp - before_timestamp); + r = put_battery_discharge_rate(battery_discharge_rate, system_hash_id, trunc); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to update battery discharge rate, ignoring: %m"); + else + trunc = false; + } + + return 0; +} + +/* Calculate the suspend interval for each battery and then return their sum */ +int get_total_suspend_interval(Hashmap *last_capacity, usec_t *ret) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + usec_t total_suspend_interval = 0; + int r; + + assert(last_capacity); + assert(ret); + + r = battery_enumerator_new(&e); + if (r < 0) + return log_debug_errno(r, "Failed to initialize battery enumerator: %m"); + + FOREACH_DEVICE(e, dev) { + int battery_last_capacity, previous_discharge_rate = 0; + const char *battery_name; + usec_t suspend_interval; + + r = sd_device_get_property_value(dev, "POWER_SUPPLY_NAME", &battery_name); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to read battery name, ignoring: %m"); + continue; + } + + battery_last_capacity = get_capacity_by_name(last_capacity, battery_name); + if (battery_last_capacity <= 0) + continue; + + r = get_battery_discharge_rate(dev, &previous_discharge_rate); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to get discharge rate, ignoring: %m"); + continue; + } + + if (previous_discharge_rate == 0) + continue; + + if (battery_last_capacity * 2 <= previous_discharge_rate) { + log_device_debug(dev, "Current battery capacity percentage too low compared to discharge rate"); + continue; + } + suspend_interval = battery_last_capacity * USEC_PER_HOUR / previous_discharge_rate; + + total_suspend_interval = usec_add(total_suspend_interval, 
suspend_interval); + } + /* Previous discharge rate is stored in per hour basis converted to usec. + * Subtract 30 minutes from the result to keep a buffer of 30 minutes before battery gets critical */ + total_suspend_interval = usec_sub_unsigned(total_suspend_interval, 30 * USEC_PER_MINUTE); + if (total_suspend_interval == 0) + return -ENOENT; + + *ret = total_suspend_interval; + + return 0; +} + +/* Return true if all batteries have acpi_btp support */ +int battery_trip_point_alarm_exists(void) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + bool has_battery = false; + int r; + + r = battery_enumerator_new(&e); + if (r < 0) + return log_debug_errno(r, "Failed to initialize battery enumerator: %m"); + + FOREACH_DEVICE(e, dev) { + const char *alarm_attr; + int has_alarm; + + has_battery = true; + + r = sd_device_get_sysattr_value(dev, "alarm", &alarm_attr); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to read battery alarm attribute: %m"); + + r = safe_atoi(alarm_attr, &has_alarm); + if (r < 0) + return log_device_debug_errno(dev, r, + "Failed to parse battery alarm attribute '%s': %m", + alarm_attr); + if (has_alarm <= 0) + return false; + } + + return has_battery; +} diff --git a/src/sleep/battery-capacity.h b/src/sleep/battery-capacity.h new file mode 100644 index 0000000..df7b06c --- /dev/null +++ b/src/sleep/battery-capacity.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" +#include "time-util.h" + +int fetch_batteries_capacity_by_name(Hashmap **ret_current_capacity); +int get_capacity_by_name(Hashmap *capacities_by_name, const char *name); + +int get_total_suspend_interval(Hashmap *last_capacity, usec_t *ret); + +int estimate_battery_discharge_rate_per_hour( + Hashmap *last_capacity, + Hashmap *current_capacity, + usec_t before_timestamp, + usec_t after_timestamp); + +int battery_trip_point_alarm_exists(void); diff --git a/src/sleep/meson.build 
b/src/sleep/meson.build new file mode 100644 index 0000000..fc0037e --- /dev/null +++ b/src/sleep/meson.build @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-sleep', + 'sources' : files( + 'sleep.c', + 'battery-capacity.c', + ), + }, + test_template + { + 'sources' : files( + 'test-battery-capacity.c', + 'battery-capacity.c', + ), + }, +] + +if install_sysconfdir_samples + install_data('sleep.conf', + install_dir : pkgconfigfiledir) +endif diff --git a/src/sleep/sleep.c b/src/sleep/sleep.c new file mode 100644 index 0000000..21062b2 --- /dev/null +++ b/src/sleep/sleep.c @@ -0,0 +1,651 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2010-2017 Canonical + Copyright © 2018 Dell Inc. +***/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-device.h" +#include "sd-id128.h" +#include "sd-messages.h" + +#include "battery-capacity.h" +#include "battery-util.h" +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "constants.h" +#include "devnum-util.h" +#include "efivars.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "hibernate-util.h" +#include "id128-util.h" +#include "io-util.h" +#include "json.h" +#include "log.h" +#include "main-func.h" +#include "os-util.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "sleep-config.h" +#include "special.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" + +#define DEFAULT_HIBERNATE_DELAY_USEC_NO_BATTERY (2 * USEC_PER_HOUR) + +static SleepOperation arg_operation = _SLEEP_OPERATION_INVALID; + +static int write_efi_hibernate_location(const HibernationDevice *hibernation_device, bool required) { + int log_level = required ? 
LOG_ERR : LOG_DEBUG; + +#if ENABLE_EFI + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_free_ char *formatted = NULL, *id = NULL, *image_id = NULL, + *version_id = NULL, *image_version = NULL; + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + const char *uuid_str; + sd_id128_t uuid; + struct utsname uts = {}; + int r, log_level_ignore = required ? LOG_WARNING : LOG_DEBUG; + + assert(hibernation_device); + + if (!is_efi_boot()) + return log_full_errno(log_level, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Not an EFI boot, passing HibernateLocation via EFI variable is not possible."); + + r = sd_device_new_from_devnum(&device, 'b', hibernation_device->devno); + if (r < 0) + return log_full_errno(log_level, r, "Failed to create sd-device object for '%s': %m", + hibernation_device->path); + + r = sd_device_get_property_value(device, "ID_FS_UUID", &uuid_str); + if (r < 0) + return log_full_errno(log_level, r, "Failed to get filesystem UUID for device '%s': %m", + hibernation_device->path); + + r = sd_id128_from_string(uuid_str, &uuid); + if (r < 0) + return log_full_errno(log_level, r, "Failed to parse ID_FS_UUID '%s' for device '%s': %m", + uuid_str, hibernation_device->path); + + if (uname(&uts) < 0) + log_full_errno(log_level_ignore, errno, "Failed to get kernel info, ignoring: %m"); + + r = parse_os_release(NULL, + "ID", &id, + "IMAGE_ID", &image_id, + "VERSION_ID", &version_id, + "IMAGE_VERSION", &image_version); + if (r < 0) + log_full_errno(log_level_ignore, r, "Failed to parse os-release, ignoring: %m"); + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UUID("uuid", uuid), + JSON_BUILD_PAIR_UNSIGNED("offset", hibernation_device->offset), + JSON_BUILD_PAIR_CONDITION(!isempty(uts.release), "kernelVersion", JSON_BUILD_STRING(uts.release)), + JSON_BUILD_PAIR_CONDITION(id, "osReleaseId", JSON_BUILD_STRING(id)), + JSON_BUILD_PAIR_CONDITION(image_id, "osReleaseImageId", JSON_BUILD_STRING(image_id)), + JSON_BUILD_PAIR_CONDITION(version_id, 
"osReleaseVersionId", JSON_BUILD_STRING(version_id)), + JSON_BUILD_PAIR_CONDITION(image_version, "osReleaseImageVersion", JSON_BUILD_STRING(image_version)))); + if (r < 0) + return log_full_errno(log_level, r, "Failed to build JSON object: %m"); + + r = json_variant_format(v, 0, &formatted); + if (r < 0) + return log_full_errno(log_level, r, "Failed to format JSON object: %m"); + + r = efi_set_variable_string(EFI_SYSTEMD_VARIABLE(HibernateLocation), formatted); + if (r < 0) + return log_full_errno(log_level, r, "Failed to set EFI variable HibernateLocation: %m"); + + log_debug("Set EFI variable HibernateLocation to '%s'.", formatted); + return 0; +#else + return log_full_errno(log_level, SYNTHETIC_ERRNO(EOPNOTSUPP), + "EFI support not enabled, passing HibernateLocation via EFI variable is not possible."); +#endif +} + +static int write_state(int fd, char * const *states) { + int r = 0; + + assert(fd >= 0); + assert(states); + + STRV_FOREACH(state, states) { + _cleanup_fclose_ FILE *f = NULL; + int k; + + k = fdopen_independent(fd, "we", &f); + if (k < 0) + return RET_GATHER(r, k); + + k = write_string_stream(f, *state, WRITE_STRING_FILE_DISABLE_BUFFER); + if (k >= 0) { + log_debug("Using sleep state '%s'.", *state); + return 0; + } + + RET_GATHER(r, log_debug_errno(k, "Failed to write '%s' to /sys/power/state: %m", *state)); + } + + return r; +} + +static int write_mode(char * const *modes) { + int r = 0; + + STRV_FOREACH(mode, modes) { + int k; + + k = write_string_file("/sys/power/disk", *mode, WRITE_STRING_FILE_DISABLE_BUFFER); + if (k >= 0) { + log_debug("Using sleep disk mode '%s'.", *mode); + return 0; + } + + RET_GATHER(r, log_debug_errno(k, "Failed to write '%s' to /sys/power/disk: %m", *mode)); + } + + return r; +} + +static int lock_all_homes(void) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + /* 
Let's synchronously lock all home directories managed by homed that have been marked for it. This + * way the key material required to access these volumes is hopefully removed from memory. */ + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to system bus: %m"); + + r = bus_message_new_method_call(bus, &m, bus_home_mgr, "LockAllHomes"); + if (r < 0) + return bus_log_create_error(r); + + /* If homed is not running it can't have any home directories active either. */ + r = sd_bus_message_set_auto_start(m, false); + if (r < 0) + return log_error_errno(r, "Failed to disable auto-start of LockAllHomes() message: %m"); + + r = sd_bus_call(bus, m, DEFAULT_TIMEOUT_USEC, &error, NULL); + if (r < 0) { + if (!bus_error_is_unknown_service(&error)) + return log_error_errno(r, "Failed to lock home directories: %s", bus_error_message(&error, r)); + + log_debug("systemd-homed is not running, locking of home directories skipped."); + } else + log_debug("Successfully requested locking of all home directories."); + return 0; +} + +static int execute( + const SleepConfig *sleep_config, + SleepOperation operation, + const char *action) { + + const char *arguments[] = { + NULL, + "pre", + /* NB: we use 'arg_operation' instead of 'operation' here, as we want to communicate the overall + * operation here, not the specific one, in case of s2h. 
*/ + sleep_operation_to_string(arg_operation), + NULL + }; + static const char* const dirs[] = { + SYSTEM_SLEEP_PATH, + NULL + }; + + _cleanup_(hibernation_device_done) HibernationDevice hibernation_device = {}; + _cleanup_close_ int state_fd = -EBADF; + int r; + + assert(sleep_config); + assert(operation >= 0); + assert(operation < _SLEEP_OPERATION_CONFIG_MAX); /* Others are handled by execute_s2h() instead */ + + if (strv_isempty(sleep_config->states[operation])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "No sleep states configured for sleep operation %s, can't sleep.", + sleep_operation_to_string(operation)); + + /* This file is opened first, so that if we hit an error, we can abort before modifying any state. */ + state_fd = open("/sys/power/state", O_WRONLY|O_CLOEXEC); + if (state_fd < 0) + return -errno; + + /* Configure hibernation settings if we are supposed to hibernate */ + if (sleep_operation_is_hibernation(operation)) { + bool resume_set; + + r = find_suitable_hibernation_device(&hibernation_device); + if (r < 0) + return log_error_errno(r, "Failed to find location to hibernate to: %m"); + resume_set = r > 0; + + r = write_efi_hibernate_location(&hibernation_device, !resume_set); + if (!resume_set) { + if (r == -EOPNOTSUPP) + return log_error_errno(r, "No valid 'resume=' option found, refusing to hibernate."); + if (r < 0) + return r; + + r = write_resume_config(hibernation_device.devno, hibernation_device.offset, hibernation_device.path); + if (r < 0) + goto fail; + } + + r = write_mode(sleep_config->modes[operation]); + if (r < 0) { + log_error_errno(r, "Failed to write mode to /sys/power/disk: %m"); + goto fail; + } + } + + /* Pass an action string to the call-outs. This is mostly our operation string, except if the + * hibernate step of s-t-h fails, in which case we communicate that with a separate action. 
*/ + if (!action) + action = sleep_operation_to_string(operation); + + if (setenv("SYSTEMD_SLEEP_ACTION", action, /* overwrite = */ 1) < 0) + log_warning_errno(errno, "Failed to set SYSTEMD_SLEEP_ACTION=%s, ignoring: %m", action); + + (void) execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, (char **) arguments, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + (void) lock_all_homes(); + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_SLEEP_START_STR, + LOG_MESSAGE("Performing sleep operation '%s'...", sleep_operation_to_string(operation)), + "SLEEP=%s", sleep_operation_to_string(arg_operation)); + + r = write_state(state_fd, sleep_config->states[operation]); + if (r < 0) + log_struct_errno(LOG_ERR, r, + "MESSAGE_ID=" SD_MESSAGE_SLEEP_STOP_STR, + LOG_MESSAGE("Failed to put system to sleep. System resumed again: %m"), + "SLEEP=%s", sleep_operation_to_string(arg_operation)); + else + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_SLEEP_STOP_STR, + LOG_MESSAGE("System returned from sleep operation '%s'.", sleep_operation_to_string(arg_operation)), + "SLEEP=%s", sleep_operation_to_string(arg_operation)); + + arguments[1] = "post"; + (void) execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, (char **) arguments, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + + if (r >= 0) + return 0; + +fail: + if (sleep_operation_is_hibernation(operation) && is_efi_boot()) + (void) efi_set_variable(EFI_SYSTEMD_VARIABLE(HibernateLocation), NULL, 0); + + return r; +} + +/* Return true if wakeup type is APM timer */ +static int check_wakeup_type(void) { + static const char dmi_object_path[] = "/sys/firmware/dmi/entries/1-0/raw"; + uint8_t wakeup_type_byte, tablesize; + _cleanup_free_ char *buf = NULL; + size_t bufsize; + int r; + + /* implementation via dmi/entries */ + r = read_full_virtual_file(dmi_object_path, &buf, &bufsize); + if (r < 0) + return log_debug_errno(r, "Unable to read %s: %m", dmi_object_path); + if (bufsize < 25) + return 
log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Only read %zu bytes from %s (expected 25)", + bufsize, dmi_object_path); + + /* index 1 stores the size of table */ + tablesize = (uint8_t) buf[1]; + if (tablesize < 25) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Table size less than the index[0x18] where waketype byte is available."); + + wakeup_type_byte = (uint8_t) buf[24]; + /* 0 is Reserved and 8 is AC Power Restored. As per table 12 in + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.4.0.pdf */ + if (wakeup_type_byte >= 128) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Expected value in range 0-127"); + + if (wakeup_type_byte == 3) { + log_debug("DMI BIOS System Information indicates wakeup type is APM Timer"); + return true; + } + + return false; +} + +static int custom_timer_suspend(const SleepConfig *sleep_config) { + usec_t hibernate_timestamp; + int r; + + assert(sleep_config); + + hibernate_timestamp = usec_add(now(CLOCK_BOOTTIME), sleep_config->hibernate_delay_usec); + + while (battery_is_discharging_and_low() == 0) { + _cleanup_hashmap_free_ Hashmap *last_capacity = NULL, *current_capacity = NULL; + _cleanup_close_ int tfd = -EBADF; + struct itimerspec ts = {}; + usec_t suspend_interval; + bool woken_by_timer; + + tfd = timerfd_create(CLOCK_BOOTTIME_ALARM, TFD_NONBLOCK|TFD_CLOEXEC); + if (tfd < 0) + return log_error_errno(errno, "Error creating timerfd: %m"); + + /* Store current battery capacity before suspension */ + r = fetch_batteries_capacity_by_name(&last_capacity); + if (r < 0) + return log_error_errno(r, "Error fetching battery capacity percentage: %m"); + + if (hashmap_isempty(last_capacity)) + /* In case of no battery, system suspend interval will be set to HibernateDelaySec= or 2 hours. */ + suspend_interval = timestamp_is_set(hibernate_timestamp) + ? 
sleep_config->hibernate_delay_usec : DEFAULT_HIBERNATE_DELAY_USEC_NO_BATTERY; + else { + r = get_total_suspend_interval(last_capacity, &suspend_interval); + if (r < 0) { + log_debug_errno(r, "Failed to estimate suspend interval using previous discharge rate, ignoring: %m"); + /* In case of any errors, especially when we do not know the battery + * discharging rate, system suspend interval will be set to + * SuspendEstimationSec=. */ + suspend_interval = sleep_config->suspend_estimation_usec; + } + } + + /* Do not suspend more than HibernateDelaySec= */ + usec_t before_timestamp = now(CLOCK_BOOTTIME); + suspend_interval = MIN(suspend_interval, usec_sub_unsigned(hibernate_timestamp, before_timestamp)); + if (suspend_interval <= 0) + break; /* system should hibernate */ + + log_debug("Set timerfd wake alarm for %s", FORMAT_TIMESPAN(suspend_interval, USEC_PER_SEC)); + /* Wake alarm for system with or without battery to hibernate or estimate discharge rate whichever is applicable */ + timespec_store(&ts.it_value, suspend_interval); + + if (timerfd_settime(tfd, 0, &ts, NULL) < 0) + return log_error_errno(errno, "Error setting battery estimate timer: %m"); + + r = execute(sleep_config, SLEEP_SUSPEND, NULL); + if (r < 0) + return r; + + r = fd_wait_for_event(tfd, POLLIN, 0); + if (r < 0) + return log_error_errno(r, "Error polling timerfd: %m"); + /* Store fd_wait status */ + woken_by_timer = FLAGS_SET(r, POLLIN); + + r = fetch_batteries_capacity_by_name(&current_capacity); + if (r < 0 || hashmap_isempty(current_capacity)) { + /* In case of no battery or error while getting charge level, no need to measure + * discharge rate. Instead the system should wake up if it is manual wakeup or + * hibernate if this is a timer wakeup. 
*/ + if (r < 0) + log_debug_errno(r, "Battery capacity percentage unavailable, cannot estimate discharge rate: %m"); + else + log_debug("No battery found."); + if (!woken_by_timer) + return 0; + break; + } + + usec_t after_timestamp = now(CLOCK_BOOTTIME); + log_debug("Attempting to estimate battery discharge rate after wakeup from %s sleep", + FORMAT_TIMESPAN(after_timestamp - before_timestamp, USEC_PER_HOUR)); + + if (after_timestamp != before_timestamp) { + r = estimate_battery_discharge_rate_per_hour(last_capacity, current_capacity, before_timestamp, after_timestamp); + if (r < 0) + log_warning_errno(r, "Failed to estimate and update battery discharge rate, ignoring: %m"); + } else + log_debug("System woke up too early to estimate discharge rate"); + + if (!woken_by_timer) + /* Return as manual wakeup done. This also will return in case battery was charged during suspension */ + return 0; + + r = check_wakeup_type(); + if (r < 0) + log_debug_errno(r, "Failed to check hardware wakeup type, ignoring: %m"); + if (r > 0) { + log_debug("wakeup type is APM timer"); + /* system should hibernate */ + break; + } + } + + return 1; +} + +/* Freeze when invoked and thaw on cleanup */ +static int freeze_thaw_user_slice(const char **method) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + if (!method || !*method) + return 0; + + r = bus_connect_system_systemd(&bus); + if (r < 0) + return log_debug_errno(r, "Failed to open connection to systemd: %m"); + + (void) sd_bus_set_method_call_timeout(bus, FREEZE_TIMEOUT); + + r = bus_call_method(bus, bus_systemd_mgr, *method, &error, NULL, "s", SPECIAL_USER_SLICE); + if (r < 0) + return log_debug_errno(r, "Failed to execute operation: %s", bus_error_message(&error, r)); + + return 1; +} + +static int execute_s2h(const SleepConfig *sleep_config) { + _unused_ _cleanup_(freeze_thaw_user_slice) const char *auto_method_thaw = "ThawUnit"; + int 
r; + + assert(sleep_config); + + r = freeze_thaw_user_slice(&(const char*) { "FreezeUnit" }); + if (r < 0) + log_debug_errno(r, "Failed to freeze unit user.slice, ignoring: %m"); + + /* Only check if we have automated battery alarms if HibernateDelaySec= is not set, as in that case + * we'll busy poll for the configured interval instead */ + if (!timestamp_is_set(sleep_config->hibernate_delay_usec)) { + r = check_wakeup_type(); + if (r < 0) + log_debug_errno(r, "Failed to check hardware wakeup type, ignoring: %m"); + else { + r = battery_trip_point_alarm_exists(); + if (r < 0) + log_debug_errno(r, "Failed to check whether acpi_btp support is enabled or not, ignoring: %m"); + } + } else + r = 0; /* Force fallback path */ + + if (r > 0) { /* If we have both wakeup alarms and battery trip point support, use them */ + log_debug("Attempting to suspend..."); + r = execute(sleep_config, SLEEP_SUSPEND, NULL); + if (r < 0) + return r; + + r = check_wakeup_type(); + if (r < 0) + return log_debug_errno(r, "Failed to check hardware wakeup type: %m"); + + if (r == 0) + /* For APM Timer wakeup, system should hibernate else wakeup */ + return 0; + } else { + r = custom_timer_suspend(sleep_config); + if (r < 0) + return log_debug_errno(r, "Suspend cycle with manual battery discharge rate estimation failed: %m"); + if (r == 0) + /* manual wakeup */ + return 0; + } + /* For above custom timer, if 1 is returned, system will directly hibernate */ + + log_debug("Attempting to hibernate"); + r = execute(sleep_config, SLEEP_HIBERNATE, NULL); + if (r < 0) { + log_notice("Couldn't hibernate, will try to suspend again."); + + r = execute(sleep_config, SLEEP_SUSPEND, "suspend-after-failed-hibernate"); + if (r < 0) + return r; + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-suspend.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s COMMAND\n\n" + "Suspend the system, hibernate the system, or 
both.\n\n" + " -h --help Show this help and exit\n" + " --version Print version string and exit\n" + "\nCommands:\n" + " suspend Suspend the system\n" + " hibernate Hibernate the system\n" + " hybrid-sleep Both hibernate and suspend the system\n" + " suspend-then-hibernate Initially suspend and then hibernate\n" + " the system after a fixed period of time or\n" + " when battery is low\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + + } + + if (argc - optind != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Usage: %s COMMAND", + program_invocation_short_name); + + arg_operation = sleep_operation_from_string(argv[optind]); + if (arg_operation < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown command '%s'.", argv[optind]); + + return 1 /* work to do */; +} + +static int run(int argc, char *argv[]) { + _cleanup_(sleep_config_freep) SleepConfig *sleep_config = NULL; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = parse_sleep_config(&sleep_config); + if (r < 0) + return r; + + if (!sleep_config->allow[arg_operation]) + return log_error_errno(SYNTHETIC_ERRNO(EACCES), + "Sleep operation \"%s\" is disabled by configuration, refusing.", + sleep_operation_to_string(arg_operation)); + + switch (arg_operation) { + + case SLEEP_SUSPEND_THEN_HIBERNATE: + r = execute_s2h(sleep_config); + break; + + case SLEEP_HYBRID_SLEEP: + r = execute(sleep_config, SLEEP_HYBRID_SLEEP, 
NULL); + if (r < 0) { + /* If we can't hybrid sleep, then let's try to suspend at least. After all, the user + * asked us to do both: suspend + hibernate, and it's almost certainly the + * hibernation that failed, hence still do the other thing, the suspend. */ + + log_notice_errno(r, "Couldn't hybrid sleep, will try to suspend instead: %m"); + + r = execute(sleep_config, SLEEP_SUSPEND, "suspend-after-failed-hybrid-sleep"); + } + + break; + + default: + r = execute(sleep_config, arg_operation, NULL); + break; + + } + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/sleep/sleep.conf b/src/sleep/sleep.conf new file mode 100644 index 0000000..fad95b3 --- /dev/null +++ b/src/sleep/sleep.conf @@ -0,0 +1,27 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/sleep.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. +# +# Use 'systemd-analyze cat-config systemd/sleep.conf' to display the full config. +# +# See systemd-sleep.conf(5) for details. 
+ +[Sleep] +#AllowSuspend=yes +#AllowHibernation=yes +#AllowSuspendThenHibernate=yes +#AllowHybridSleep=yes +#SuspendState=mem standby freeze +#HibernateMode=platform shutdown +#HibernateDelaySec= +#SuspendEstimationSec=60min diff --git a/src/sleep/test-battery-capacity.c b/src/sleep/test-battery-capacity.c new file mode 100644 index 0000000..1b3422a --- /dev/null +++ b/src/sleep/test-battery-capacity.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "battery-capacity.h" +#include "errno-util.h" +#include "hashmap.h" +#include "log.h" +#include "tests.h" + +TEST(fetch_batteries_capacity_by_name) { + _cleanup_hashmap_free_ Hashmap *capacity = NULL; + int r; + + assert_se(fetch_batteries_capacity_by_name(&capacity) >= 0); + log_debug("fetch_batteries_capacity_by_name: %u entries", hashmap_size(capacity)); + + const char *name; + void *cap; + HASHMAP_FOREACH_KEY(cap, name, capacity) { + assert(cap); /* Anything non-null is fine. */ + log_info("Battery %s: capacity = %i", name, get_capacity_by_name(capacity, name)); + } + + for (int i = 0; i < 2; i++) { + usec_t interval; + + if (i > 0) + sleep(1); + + r = get_total_suspend_interval(capacity, &interval); + assert_se(r >= 0 || r == -ENOENT); + log_info("%d: get_total_suspend_interval: %s", i, + r < 0 ? 
STRERROR(r) : FORMAT_TIMESPAN(interval, USEC_PER_SEC)); + } +} + +static int intro(void) { + if (getuid() != 0) + log_warning("This program is unlikely to work for unprivileged users"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/socket-activate/meson.build b/src/socket-activate/meson.build new file mode 100644 index 0000000..a4d18b5 --- /dev/null +++ b/src/socket-activate/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-socket-activate', + 'public' : true, + 'sources' : files('socket-activate.c'), + 'dependencies' : threads, + }, +] diff --git a/src/socket-activate/socket-activate.c b/src/socket-activate/socket-activate.c new file mode 100644 index 0000000..78ecb29 --- /dev/null +++ b/src/socket-activate/socket-activate.c @@ -0,0 +1,495 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "build.h" +#include "env-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "pretty-print.h" +#include "process-util.h" +#include "signal-util.h" +#include "socket-netlink.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" + +static char **arg_listen = NULL; +static bool arg_accept = false; +static int arg_socket_type = SOCK_STREAM; +static char **arg_args = NULL; +static char **arg_setenv = NULL; +static char **arg_fdnames = NULL; +static bool arg_inetd = false; + +static int add_epoll(int epoll_fd, int fd) { + struct epoll_event ev = { + .events = EPOLLIN, + .data.fd = fd, + }; + + assert(epoll_fd >= 0); + assert(fd >= 0); + + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) + return log_error_errno(errno, "Failed to add event on epoll fd:%d for fd:%d: %m", epoll_fd, fd); + + 
return 0; +} + +static int open_sockets(int *ret_epoll_fd, bool accept) { + _cleanup_close_ int epoll_fd = -EBADF; + int n, r, count = 0; + + assert(ret_epoll_fd); + + n = sd_listen_fds(true); + if (n < 0) + return log_error_errno(n, "Failed to read listening file descriptors from environment: %m"); + if (n > 0) { + log_info("Received %i descriptors via the environment.", n); + + for (int fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) { + r = fd_cloexec(fd, arg_accept); + if (r < 0) + return r; + + count++; + } + } + + /* Close logging and all other descriptors */ + if (arg_listen) { + _cleanup_free_ int *except = new(int, n); + if (!except) + return log_oom(); + + for (int i = 0; i < n; i++) + except[i] = SD_LISTEN_FDS_START + i; + + log_close(); + log_set_open_when_needed(true); + log_settle_target(); + + r = close_all_fds(except, n); + if (r < 0) + return log_error_errno(r, "Failed to close all file descriptors: %m"); + } + + /* Note: we leak some fd's on error here. It doesn't matter much, since the program will exit + * immediately anyway, but would be a pain to fix. 
*/ + + STRV_FOREACH(address, arg_listen) { + r = make_socket_fd(LOG_DEBUG, *address, arg_socket_type, (arg_accept * SOCK_CLOEXEC)); + if (r < 0) + return log_error_errno(r, "Failed to open '%s': %m", *address); + + assert(r == SD_LISTEN_FDS_START + count); + count++; + } + + if (arg_listen) { + log_open(); + log_set_open_when_needed(false); + } + + epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (epoll_fd < 0) + return log_error_errno(errno, "Failed to create epoll object: %m"); + + for (int fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + count; fd++) { + _cleanup_free_ char *name = NULL; + + getsockname_pretty(fd, &name); + log_info("Listening on %s as %i.", strna(name), fd); + + r = add_epoll(epoll_fd, fd); + if (r < 0) + return r; + } + + *ret_epoll_fd = TAKE_FD(epoll_fd); + return count; +} + +static int exec_process(const char *name, char **argv, int start_fd, size_t n_fds) { + _cleanup_strv_free_ char **envp = NULL; + int r; + + if (arg_inetd && n_fds != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--inetd only supported for single file descriptors."); + + FOREACH_STRING(var, "TERM", "PATH", "USER", "HOME") { + const char *n; + + n = strv_find_prefix(environ, var); + if (!n) + continue; + + r = strv_extend(&envp, n); + if (r < 0) + return r; + } + + if (arg_inetd) { + assert(n_fds == 1); + + r = rearrange_stdio(start_fd, start_fd, STDERR_FILENO); /* invalidates start_fd on success + error */ + if (r < 0) + return log_error_errno(r, "Failed to move fd to stdin+stdout: %m"); + + } else { + if (start_fd != SD_LISTEN_FDS_START) { + assert(n_fds == 1); + + if (dup2(start_fd, SD_LISTEN_FDS_START) < 0) + return log_error_errno(errno, "Failed to dup connection: %m"); + + safe_close(start_fd); + } + + r = strv_extendf(&envp, "LISTEN_FDS=%zu", n_fds); + if (r < 0) + return r; + + r = strv_extendf(&envp, "LISTEN_PID=" PID_FMT, getpid_cached()); + if (r < 0) + return r; + + if (arg_fdnames) { + _cleanup_free_ char *names = NULL; + size_t len; + + len = 
strv_length(arg_fdnames); + if (len == 1) + for (size_t i = 1; i < n_fds; i++) { + r = strv_extend(&arg_fdnames, arg_fdnames[0]); + if (r < 0) + return log_oom(); + } + else if (len != n_fds) + log_warning("The number of fd names is different than number of fds: %zu vs %zu", len, n_fds); + + names = strv_join(arg_fdnames, ":"); + if (!names) + return log_oom(); + + char *t = strjoin("LISTEN_FDNAMES=", names); + if (!t) + return log_oom(); + + r = strv_consume(&envp, t); + if (r < 0) + return r; + } + } + + STRV_FOREACH(s, arg_setenv) { + r = strv_env_replace_strdup(&envp, *s); + if (r < 0) + return r; + } + + _cleanup_free_ char *joined = strv_join(argv, " "); + if (!joined) + return log_oom(); + + log_info("Execing %s (%s)", name, joined); + execvpe(name, argv, envp); + + return log_error_errno(errno, "Failed to execp %s (%s): %m", name, joined); +} + +static int fork_and_exec_process(const char *child, char **argv, int fd) { + _cleanup_free_ char *joined = NULL; + pid_t child_pid; + int r; + + joined = strv_join(argv, " "); + if (!joined) + return log_oom(); + + r = safe_fork("(activate)", + FORK_RESET_SIGNALS | FORK_DEATHSIG_SIGTERM | FORK_RLIMIT_NOFILE_SAFE | FORK_LOG, + &child_pid); + if (r < 0) + return r; + if (r == 0) { + /* In the child */ + exec_process(child, argv, fd, 1); + _exit(EXIT_FAILURE); + } + + log_info("Spawned %s (%s) as PID " PID_FMT ".", child, joined, child_pid); + return 0; +} + +static int do_accept(const char *name, char **argv, int fd) { + _cleanup_free_ char *local = NULL, *peer = NULL; + _cleanup_close_ int fd_accepted = -EBADF; + + fd_accepted = accept4(fd, NULL, NULL, 0); + if (fd_accepted < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return log_error_errno(errno, "Failed to accept connection on fd:%d: %m", fd); + } + + (void) getsockname_pretty(fd_accepted, &local); + (void) getpeername_pretty(fd_accepted, true, &peer); + log_info("Connection from %s to %s", strna(peer), strna(local)); + + return 
fork_and_exec_process(name, argv, fd_accepted); +} + +/* SIGCHLD handler. */ +static void sigchld_hdl(int sig) { + PROTECT_ERRNO; + + for (;;) { + siginfo_t si; + int r; + + si.si_pid = 0; + r = waitid(P_ALL, 0, &si, WEXITED | WNOHANG); + if (r < 0) { + if (errno != ECHILD) + log_error_errno(errno, "Failed to reap children: %m"); + return; + } + if (si.si_pid == 0) + return; + + log_info("Child %d died with code %d", si.si_pid, si.si_status); + } +} + +static int install_chld_handler(void) { + static const struct sigaction act = { + .sa_flags = SA_NOCLDSTOP | SA_RESTART, + .sa_handler = sigchld_hdl, + }; + + if (sigaction(SIGCHLD, &act, 0) < 0) + return log_error_errno(errno, "Failed to install SIGCHLD handler: %m"); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-socket-activate", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n" + "\n%sListen on sockets and launch child on connection.%s\n" + "\nOptions:\n" + " -h --help Show this help and exit\n" + " --version Print version string and exit\n" + " -l --listen=ADDR Listen for raw connections at ADDR\n" + " -d --datagram Listen on datagram instead of stream socket\n" + " --seqpacket Listen on SOCK_SEQPACKET instead of stream socket\n" + " -a --accept Spawn separate child for each connection\n" + " -E --setenv=NAME[=VALUE] Pass an environment variable to children\n" + " --fdname=NAME[:NAME...] 
Specify names for file descriptors\n" + " --inetd Enable inetd file descriptor passing protocol\n" + "\nNote: file descriptors from sd_listen_fds() will be passed through.\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_FDNAME, + ARG_SEQPACKET, + ARG_INETD, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "datagram", no_argument, NULL, 'd' }, + { "seqpacket", no_argument, NULL, ARG_SEQPACKET }, + { "listen", required_argument, NULL, 'l' }, + { "accept", no_argument, NULL, 'a' }, + { "setenv", required_argument, NULL, 'E' }, + { "environment", required_argument, NULL, 'E' }, /* legacy alias */ + { "fdname", required_argument, NULL, ARG_FDNAME }, + { "inetd", no_argument, NULL, ARG_INETD }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). 
*/ + optind = 0; + while ((c = getopt_long(argc, argv, "+hl:aE:d", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'l': + r = strv_extend(&arg_listen, optarg); + if (r < 0) + return log_oom(); + + break; + + case 'd': + if (arg_socket_type == SOCK_SEQPACKET) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--datagram may not be combined with --seqpacket."); + + arg_socket_type = SOCK_DGRAM; + break; + + case ARG_SEQPACKET: + if (arg_socket_type == SOCK_DGRAM) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--seqpacket may not be combined with --datagram."); + + arg_socket_type = SOCK_SEQPACKET; + break; + + case 'a': + arg_accept = true; + break; + + case 'E': + r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg); + if (r < 0) + return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg); + break; + + case ARG_FDNAME: { + _cleanup_strv_free_ char **names = NULL; + + names = strv_split(optarg, ":"); + if (!names) + return log_oom(); + + STRV_FOREACH(s, names) + if (!fdname_is_valid(*s)) { + _cleanup_free_ char *esc = NULL; + + esc = cescape(*s); + log_warning("File descriptor name \"%s\" is not valid.", esc); + } + + /* Empty optargs means one empty name */ + r = strv_extend_strv(&arg_fdnames, + strv_isempty(names) ? STRV_MAKE("") : names, + false); + if (r < 0) + return log_error_errno(r, "strv_extend_strv: %m"); + break; + } + + case ARG_INETD: + arg_inetd = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind == argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: command to execute is missing.", + program_invocation_short_name); + + if (arg_socket_type == SOCK_DGRAM && arg_accept) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Datagram sockets do not accept connections. 
" + "The --datagram and --accept options may not be combined."); + + arg_args = argv + optind; + + return 1 /* work to do */; +} + +static int run(int argc, char **argv) { + _cleanup_close_ int epoll_fd = -EBADF; + _cleanup_strv_free_ char **exec_argv = NULL; + int r, n; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + exec_argv = strv_copy(arg_args); + if (!exec_argv) + return log_oom(); + + assert(!strv_isempty(exec_argv)); + + r = install_chld_handler(); + if (r < 0) + return r; + + n = open_sockets(&epoll_fd, arg_accept); + if (n < 0) + return n; + if (n == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "No sockets to listen on specified or passed in."); + + for (;;) { + struct epoll_event event; + + if (epoll_wait(epoll_fd, &event, 1, -1) < 0) { + if (errno == EINTR) + continue; + + return log_error_errno(errno, "epoll_wait() failed: %m"); + } + + log_info("Communication attempt on fd %i.", event.data.fd); + if (arg_accept) { + r = do_accept(exec_argv[0], exec_argv, event.data.fd); + if (r < 0) + return r; + } else + break; + } + + return exec_process(exec_argv[0], exec_argv, SD_LISTEN_FDS_START, (size_t) n); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/socket-proxy/meson.build b/src/socket-proxy/meson.build new file mode 100644 index 0000000..52d63a8 --- /dev/null +++ b/src/socket-proxy/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-socket-proxyd', + 'public' : true, + 'sources' : files('socket-proxyd.c'), + 'dependencies' : threads, + }, +] diff --git a/src/socket-proxy/socket-proxyd.c b/src/socket-proxy/socket-proxyd.c new file mode 100644 index 0000000..287fd6c --- /dev/null +++ b/src/socket-proxy/socket-proxyd.c @@ -0,0 +1,722 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"sd-daemon.h" +#include "sd-event.h" +#include "sd-resolve.h" + +#include "alloc-util.h" +#include "build.h" +#include "daemon-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "main-func.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "resolve-private.h" +#include "set.h" +#include "socket-util.h" +#include "string-util.h" + +#define BUFFER_SIZE (256 * 1024) + +static unsigned arg_connections_max = 256; +static const char *arg_remote_host = NULL; +static usec_t arg_exit_idle_time = USEC_INFINITY; + +typedef struct Context { + sd_event *event; + sd_resolve *resolve; + sd_event_source *idle_time; + + Set *listen; + Set *connections; +} Context; + +typedef struct Connection { + Context *context; + + int server_fd, client_fd; + int server_to_client_buffer[2]; /* a pipe */ + int client_to_server_buffer[2]; /* a pipe */ + + size_t server_to_client_buffer_full, client_to_server_buffer_full; + size_t server_to_client_buffer_size, client_to_server_buffer_size; + + sd_event_source *server_event_source, *client_event_source; + + sd_resolve_query *resolve_query; +} Connection; + +static void connection_free(Connection *c) { + assert(c); + + if (c->context) + set_remove(c->context->connections, c); + + sd_event_source_unref(c->server_event_source); + sd_event_source_unref(c->client_event_source); + + safe_close(c->server_fd); + safe_close(c->client_fd); + + safe_close_pair(c->server_to_client_buffer); + safe_close_pair(c->client_to_server_buffer); + + sd_resolve_query_unref(c->resolve_query); + + free(c); +} + +static int idle_time_cb(sd_event_source *s, uint64_t usec, void *userdata) { + Context *c = userdata; + int r; + + if (!set_isempty(c->connections)) { + log_warning("Idle timer fired even though there are connections, ignoring"); + return 0; + } + + r = sd_event_exit(c->event, 0); + if (r < 0) { + log_warning_errno(r, "Error while stopping event loop, ignoring: %m"); + return 0; + } + return 0; +} 
/* Frees the connection and, if --exit-idle-time= was given (arg_exit_idle_time is finite) and no
 * connection is left in context->connections, arms the idle timer: an existing timer source is
 * re-programmed relative to now and re-enabled as oneshot, otherwise a new one is created with
 * idle_time_cb() as handler. Returns 0 on success, or a negative errno-style value if the timer
 * could not be (re)armed. */
static int connection_release(Connection *c) {
        /* Grab the context pointer first: 'c' is freed by connection_free() right below. */
        Context *context = ASSERT_PTR(ASSERT_PTR(c)->context);
        int r;

        connection_free(c);

        if (arg_exit_idle_time < USEC_INFINITY && set_isempty(context->connections)) {
                if (context->idle_time) {
                        /* Timer source already exists: just move its deadline and re-enable it. */
                        r = sd_event_source_set_time_relative(context->idle_time, arg_exit_idle_time);
                        if (r < 0)
                                return log_error_errno(r, "Error while setting idle time: %m");

                        r = sd_event_source_set_enabled(context->idle_time, SD_EVENT_ONESHOT);
                        if (r < 0)
                                return log_error_errno(r, "Error while enabling idle time: %m");
                } else {
                        r = sd_event_add_time_relative(
                                        context->event, &context->idle_time, CLOCK_MONOTONIC,
                                        arg_exit_idle_time, 0, idle_time_cb, context);
                        if (r < 0)
                                return log_error_errno(r, "Failed to create idle timer: %m");
                }
        }

        return 0;
}

/* Releases everything the Context references: the set of listening event sources, the set of
 * connections (each destroyed via connection_free()), the event loop, the resolver and the idle
 * timer source. The Context structure itself is not freed here. */
static void context_clear(Context *context) {
        assert(context);

        set_free_with_destructor(context->listen, sd_event_source_unref);
        set_free_with_destructor(context->connections, connection_free);

        sd_event_unref(context->event);
        sd_resolve_unref(context->resolve);
        sd_event_source_unref(context->idle_time);
}

/* Creates the pipe used as an in-kernel buffer for one direction of a connection, unless
 * buffer[0] already holds a valid fd. On success the pipe capacity reported by F_GETPIPE_SZ is
 * stored in *sz. Returns 0 on success, negative errno-style value on failure. */
static int connection_create_pipes(Connection *c, int buffer[static 2], size_t *sz) {
        int r;

        assert(c);
        assert(buffer);
        assert(sz);

        if (buffer[0] >= 0)
                return 0;

        r = pipe2(buffer, O_CLOEXEC|O_NONBLOCK);
        if (r < 0)
                return log_error_errno(errno, "Failed to allocate pipe buffer: %m");

        /* Best effort: ask for BUFFER_SIZE, the result is deliberately ignored and the size the
         * kernel actually granted is queried right afterwards. */
        (void) fcntl(buffer[0], F_SETPIPE_SZ, BUFFER_SIZE);

        r = fcntl(buffer[0], F_GETPIPE_SZ);
        if (r < 0)
                return log_error_errno(errno, "Failed to get pipe buffer size: %m");

        assert(r > 0);
        *sz = r;

        return 0;
}

/* Moves data from *from into the pipe 'buffer' and from the pipe on to *to; *full tracks the
 * number of bytes currently queued in the pipe and *sz is the pipe capacity. */
static int connection_shovel(
                Connection *c,
                int *from, int buffer[2], int *to,
                size_t *full, size_t *sz,
                sd_event_source **from_source, sd_event_source **to_source) {

        bool shoveled;

        assert(c);
        assert(from);
        assert(buffer);
        assert(buffer[0] >= 0);
        assert(buffer[1] >= 0);
        assert(to);
assert(full); + assert(sz); + assert(from_source); + assert(to_source); + + do { + ssize_t z; + + shoveled = false; + + if (*full < *sz && *from >= 0 && *to >= 0) { + z = splice(*from, NULL, buffer[1], NULL, *sz - *full, SPLICE_F_MOVE|SPLICE_F_NONBLOCK); + if (z > 0) { + *full += z; + shoveled = true; + } else if (z == 0 || ERRNO_IS_DISCONNECT(errno)) { + *from_source = sd_event_source_unref(*from_source); + *from = safe_close(*from); + } else if (!ERRNO_IS_TRANSIENT(errno)) + return log_error_errno(errno, "Failed to splice: %m"); + } + + if (*full > 0 && *to >= 0) { + z = splice(buffer[0], NULL, *to, NULL, *full, SPLICE_F_MOVE|SPLICE_F_NONBLOCK); + if (z > 0) { + *full -= z; + shoveled = true; + } else if (z == 0 || ERRNO_IS_DISCONNECT(errno)) { + *to_source = sd_event_source_unref(*to_source); + *to = safe_close(*to); + } else if (!ERRNO_IS_TRANSIENT(errno)) + return log_error_errno(errno, "Failed to splice: %m"); + } + } while (shoveled); + + return 0; +} + +static int connection_enable_event_sources(Connection *c); + +static int traffic_cb(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Connection *c = ASSERT_PTR(userdata); + int r; + + assert(s); + assert(fd >= 0); + + r = connection_shovel(c, + &c->server_fd, c->server_to_client_buffer, &c->client_fd, + &c->server_to_client_buffer_full, &c->server_to_client_buffer_size, + &c->server_event_source, &c->client_event_source); + if (r < 0) + goto quit; + + r = connection_shovel(c, + &c->client_fd, c->client_to_server_buffer, &c->server_fd, + &c->client_to_server_buffer_full, &c->client_to_server_buffer_size, + &c->client_event_source, &c->server_event_source); + if (r < 0) + goto quit; + + /* EOF on both sides? */ + if (c->server_fd < 0 && c->client_fd < 0) + goto quit; + + /* Server closed, and all data written to client? */ + if (c->server_fd < 0 && c->server_to_client_buffer_full <= 0) + goto quit; + + /* Client closed, and all data written to server? 
*/ + if (c->client_fd < 0 && c->client_to_server_buffer_full <= 0) + goto quit; + + r = connection_enable_event_sources(c); + if (r < 0) + goto quit; + + return 1; + +quit: + connection_release(c); + return 0; /* ignore errors, continue serving */ +} + +static int connection_enable_event_sources(Connection *c) { + uint32_t a = 0, b = 0; + int r; + + assert(c); + + if (c->server_to_client_buffer_full > 0) + b |= EPOLLOUT; + if (c->server_to_client_buffer_full < c->server_to_client_buffer_size) + a |= EPOLLIN; + + if (c->client_to_server_buffer_full > 0) + a |= EPOLLOUT; + if (c->client_to_server_buffer_full < c->client_to_server_buffer_size) + b |= EPOLLIN; + + if (c->server_event_source) + r = sd_event_source_set_io_events(c->server_event_source, a); + else if (c->server_fd >= 0) + r = sd_event_add_io(c->context->event, &c->server_event_source, c->server_fd, a, traffic_cb, c); + else + r = 0; + + if (r < 0) + return log_error_errno(r, "Failed to set up server event source: %m"); + + if (c->client_event_source) + r = sd_event_source_set_io_events(c->client_event_source, b); + else if (c->client_fd >= 0) + r = sd_event_add_io(c->context->event, &c->client_event_source, c->client_fd, b, traffic_cb, c); + else + r = 0; + + if (r < 0) + return log_error_errno(r, "Failed to set up client event source: %m"); + + return 0; +} + +static int connection_complete(Connection *c) { + int r; + + assert(c); + + r = connection_create_pipes(c, c->server_to_client_buffer, &c->server_to_client_buffer_size); + if (r < 0) + goto fail; + + r = connection_create_pipes(c, c->client_to_server_buffer, &c->client_to_server_buffer_size); + if (r < 0) + goto fail; + + r = connection_enable_event_sources(c); + if (r < 0) + goto fail; + + return 0; + +fail: + connection_release(c); + return 0; /* ignore errors, continue serving */ +} + +static int connect_cb(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Connection *c = ASSERT_PTR(userdata); + socklen_t solen; + int error, r; + 
+ assert(s); + assert(fd >= 0); + + solen = sizeof(error); + r = getsockopt(fd, SOL_SOCKET, SO_ERROR, &error, &solen); + if (r < 0) { + log_error_errno(errno, "Failed to issue SO_ERROR: %m"); + goto fail; + } + + if (error != 0) { + log_error_errno(error, "Failed to connect to remote host: %m"); + goto fail; + } + + c->client_event_source = sd_event_source_unref(c->client_event_source); + + return connection_complete(c); + +fail: + connection_release(c); + return 0; /* ignore errors, continue serving */ +} + +static int connection_start(Connection *c, struct sockaddr *sa, socklen_t salen) { + int r; + + assert(c); + assert(sa); + assert(salen); + + c->client_fd = socket(sa->sa_family, SOCK_STREAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0); + if (c->client_fd < 0) { + log_error_errno(errno, "Failed to get remote socket: %m"); + goto fail; + } + + r = connect(c->client_fd, sa, salen); + if (r < 0) { + if (errno == EINPROGRESS) { + r = sd_event_add_io(c->context->event, &c->client_event_source, c->client_fd, EPOLLOUT, connect_cb, c); + if (r < 0) { + log_error_errno(r, "Failed to add connection socket: %m"); + goto fail; + } + + r = sd_event_source_set_enabled(c->client_event_source, SD_EVENT_ONESHOT); + if (r < 0) { + log_error_errno(r, "Failed to enable oneshot event source: %m"); + goto fail; + } + } else { + log_error_errno(errno, "Failed to connect to remote host: %m"); + goto fail; + } + } else { + r = connection_complete(c); + if (r < 0) + goto fail; + } + + return 0; + +fail: + connection_release(c); + return 0; /* ignore errors, continue serving */ +} + +static int resolve_handler(sd_resolve_query *q, int ret, const struct addrinfo *ai, Connection *c) { + assert(q); + assert(c); + + if (ret != 0) { + log_error("Failed to resolve host: %s", gai_strerror(ret)); + goto fail; + } + + c->resolve_query = sd_resolve_query_unref(c->resolve_query); + + return connection_start(c, ai->ai_addr, ai->ai_addrlen); + +fail: + connection_release(c); + return 0; /* ignore errors, continue 
serving */ +} + +static int resolve_remote(Connection *c) { + + static const struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM, + }; + + const char *node, *service; + int r; + + if (IN_SET(arg_remote_host[0], '/', '@')) { + union sockaddr_union sa; + int sa_len; + + r = sockaddr_un_set_path(&sa.un, arg_remote_host); + if (r < 0) { + log_error_errno(r, "Specified address doesn't fit in an AF_UNIX address, refusing: %m"); + goto fail; + } + sa_len = r; + + return connection_start(c, &sa.sa, sa_len); + } + + service = strrchr(arg_remote_host, ':'); + if (service) { + node = strndupa_safe(arg_remote_host, + service - arg_remote_host); + service++; + } else { + node = arg_remote_host; + service = "80"; + } + + log_debug("Looking up address info for %s:%s", node, service); + r = resolve_getaddrinfo(c->context->resolve, &c->resolve_query, node, service, &hints, resolve_handler, NULL, c); + if (r < 0) { + log_error_errno(r, "Failed to resolve remote host: %m"); + goto fail; + } + + return 0; + +fail: + connection_release(c); + return 0; /* ignore errors, continue serving */ +} + +static int add_connection_socket(Context *context, int fd) { + Connection *c; + int r; + + assert(context); + assert(fd >= 0); + + if (set_size(context->connections) > arg_connections_max) { + log_warning("Hit connection limit, refusing connection."); + safe_close(fd); + return 0; + } + + if (context->idle_time) { + r = sd_event_source_set_enabled(context->idle_time, SD_EVENT_OFF); + if (r < 0) + log_warning_errno(r, "Unable to disable idle timer, continuing: %m"); + } + + c = new(Connection, 1); + if (!c) { + log_oom(); + return 0; + } + + *c = (Connection) { + .context = context, + .server_fd = fd, + .client_fd = -EBADF, + .server_to_client_buffer = EBADF_PAIR, + .client_to_server_buffer = EBADF_PAIR, + }; + + r = set_ensure_put(&context->connections, NULL, c); + if (r < 0) { + free(c); + log_oom(); + return 0; + } + + return resolve_remote(c); +} + +static int 
accept_cb(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_free_ char *peer = NULL; + Context *context = ASSERT_PTR(userdata); + int nfd = -EBADF, r; + + assert(s); + assert(fd >= 0); + assert(revents & EPOLLIN); + + nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC); + if (nfd < 0) { + if (!ERRNO_IS_ACCEPT_AGAIN(errno)) + log_warning_errno(errno, "Failed to accept() socket: %m"); + } else { + (void) getpeername_pretty(nfd, true, &peer); + log_debug("New connection from %s", strna(peer)); + + r = add_connection_socket(context, nfd); + if (r < 0) { + log_warning_errno(r, "Failed to accept connection, ignoring: %m"); + safe_close(nfd); + } + } + + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Error while re-enabling listener with ONESHOT: %m"); + + return 1; +} + +static int add_listen_socket(Context *context, int fd) { + sd_event_source *source; + int r; + + assert(context); + assert(fd >= 0); + + r = sd_is_socket(fd, 0, SOCK_STREAM, 1); + if (r < 0) + return log_error_errno(r, "Failed to determine socket type: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Passed in socket is not a stream socket."); + + r = fd_nonblock(fd, true); + if (r < 0) + return log_error_errno(r, "Failed to mark file descriptor non-blocking: %m"); + + r = sd_event_add_io(context->event, &source, fd, EPOLLIN, accept_cb, context); + if (r < 0) + return log_error_errno(r, "Failed to add event source: %m"); + + r = set_ensure_put(&context->listen, NULL, source); + if (r < 0) { + sd_event_source_unref(source); + return log_error_errno(r, "Failed to add source to set: %m"); + } + + r = sd_event_source_set_exit_on_failure(source, true); + if (r < 0) + return log_error_errno(r, "Failed to enable exit-on-failure logic: %m"); + + /* Set the watcher to oneshot in case other processes are also + * watching to accept(). 
*/ + r = sd_event_source_set_enabled(source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable oneshot mode: %m"); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + _cleanup_free_ char *time_link = NULL; + int r; + + r = terminal_urlify_man("systemd-socket-proxyd", "8", &link); + if (r < 0) + return log_oom(); + r = terminal_urlify_man("systemd.time", "7", &time_link); + if (r < 0) + return log_oom(); + + printf("%1$s [HOST:PORT]\n" + "%1$s [SOCKET]\n\n" + "Bidirectionally proxy local sockets to another (possibly remote) socket.\n\n" + " -c --connections-max= Set the maximum number of connections to be accepted\n" + " --exit-idle-time= Exit when without a connection for this duration. See\n" + " the %3$s for time span format\n" + " -h --help Show this help\n" + " --version Show package version\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + time_link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_EXIT_IDLE, + ARG_IGNORE_ENV + }; + + static const struct option options[] = { + { "connections-max", required_argument, NULL, 'c' }, + { "exit-idle-time", required_argument, NULL, ARG_EXIT_IDLE }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "c:h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'c': + r = safe_atou(optarg, &arg_connections_max); + if (r < 0) { + log_error("Failed to parse --connections-max= argument: %s", optarg); + return r; + } + + if (arg_connections_max < 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Connection limit is too low."); + + break; + + case ARG_EXIT_IDLE: + r = parse_sec(optarg, &arg_exit_idle_time); + if (r < 0) + return log_error_errno(r, "Failed to parse 
--exit-idle-time= argument: %s", optarg); + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Not enough parameters."); + + if (argc != optind+1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many parameters."); + + arg_remote_host = argv[optind]; + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(context_clear) Context context = {}; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL; + int r, n, fd; + + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = sd_event_default(&context.event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = sd_resolve_default(&context.resolve); + if (r < 0) + return log_error_errno(r, "Failed to allocate resolver: %m"); + + r = sd_resolve_attach_event(context.resolve, context.event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach resolver: %m"); + + sd_event_set_watchdog(context.event, true); + + r = sd_listen_fds(1); + if (r < 0) + return log_error_errno(r, "Failed to receive sockets from parent."); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Didn't get any sockets passed in."); + + n = r; + + for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) { + r = add_listen_socket(&context, fd); + if (r < 0) + return r; + } + + notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + r = sd_event_loop(context.event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/stdio-bridge/meson.build b/src/stdio-bridge/meson.build new file mode 100644 index 0000000..99662b1 --- /dev/null +++ b/src/stdio-bridge/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-stdio-bridge', + 
'public' : true, + 'sources' : files('stdio-bridge.c'), + }, +] diff --git a/src/stdio-bridge/stdio-bridge.c b/src/stdio-bridge/stdio-bridge.c new file mode 100644 index 0000000..fe551cf --- /dev/null +++ b/src/stdio-bridge/stdio-bridge.c @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "build.h" +#include "bus-internal.h" +#include "bus-util.h" +#include "errno-util.h" +#include "io-util.h" +#include "log.h" +#include "main-func.h" + +#define DEFAULT_BUS_PATH "unix:path=/run/dbus/system_bus_socket" + +static const char *arg_bus_path = DEFAULT_BUS_PATH; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + +static int help(void) { + printf("%s [OPTIONS...]\n\n" + "Forward messages between a pipe or socket and a D-Bus bus.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " -p --bus-path=PATH Path to the bus address (default: %s)\n" + " --system Connect to system bus\n" + " --user Connect to user bus\n" + " -M --machine=CONTAINER Name of local container to connect to\n", + program_invocation_short_name, DEFAULT_BUS_PATH); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_MACHINE, + ARG_USER, + ARG_SYSTEM, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "bus-path", required_argument, NULL, 'p' }, + { "user", no_argument, NULL, ARG_USER }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "machine", required_argument, NULL, 'M' }, + {}, + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hp:M:", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_USER: + 
arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + + case ARG_SYSTEM: + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + break; + + case 'p': + arg_bus_path = optarg; + break; + + case 'M': + arg_bus_path = optarg; + arg_transport = BUS_TRANSPORT_MACHINE; + break; + + case '?': + return -EINVAL; + + default: + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown option code %c", c); + } + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *a = NULL, *b = NULL; + sd_id128_t server_id; + bool is_unix; + int r, in_fd, out_fd; + + log_set_target(LOG_TARGET_JOURNAL_OR_KMSG); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = sd_listen_fds(0); + if (r == 0) { + in_fd = STDIN_FILENO; + out_fd = STDOUT_FILENO; + } else if (r == 1) { + in_fd = SD_LISTEN_FDS_START; + out_fd = SD_LISTEN_FDS_START; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "More than one file descriptor was passed."); + + is_unix = + sd_is_socket(in_fd, AF_UNIX, 0, 0) > 0 && + sd_is_socket(out_fd, AF_UNIX, 0, 0) > 0; + + r = sd_bus_new(&a); + if (r < 0) + return log_error_errno(r, "Failed to allocate bus: %m"); + + if (arg_transport == BUS_TRANSPORT_MACHINE) + r = bus_set_address_machine(a, arg_runtime_scope, arg_bus_path); + else + r = sd_bus_set_address(a, arg_bus_path); + if (r < 0) + return log_error_errno(r, "Failed to set address to connect to: %m"); + + r = sd_bus_negotiate_fds(a, is_unix); + if (r < 0) + return log_error_errno(r, "Failed to set FD negotiation: %m"); + + r = sd_bus_start(a); + if (r < 0) + return log_error_errno(r, "Failed to start bus client: %m"); + + r = sd_bus_get_bus_id(a, &server_id); + if (r < 0) + return log_error_errno(r, "Failed to get server ID: %m"); + + r = sd_bus_new(&b); + if (r < 0) + return log_error_errno(r, "Failed to allocate bus: %m"); + + r = sd_bus_set_fd(b, in_fd, out_fd); + if (r < 0) + return log_error_errno(r, "Failed to set fds: 
%m"); + + r = sd_bus_set_server(b, 1, server_id); + if (r < 0) + return log_error_errno(r, "Failed to set server mode: %m"); + + r = sd_bus_negotiate_fds(b, is_unix); + if (r < 0) + return log_error_errno(r, "Failed to set FD negotiation: %m"); + + r = sd_bus_set_anonymous(b, true); + if (r < 0) + return log_error_errno(r, "Failed to set anonymous authentication: %m"); + + r = sd_bus_start(b); + if (r < 0) + return log_error_errno(r, "Failed to start bus client: %m"); + + for (;;) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int events_a, events_b, fd; + usec_t timeout_a, timeout_b, t; + + assert_cc(sizeof(usec_t) == sizeof(uint64_t)); + + r = sd_bus_process(a, &m); + if (ERRNO_IS_NEG_DISCONNECT(r)) /* Treat 'connection reset by peer' as clean exit condition */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to process bus a: %m"); + if (m) { + if (sd_bus_message_is_signal(m, "org.freedesktop.DBus.Local", "Disconnected")) + return 0; + + r = sd_bus_send(b, m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send message: %m"); + } + + if (r > 0) + continue; + + r = sd_bus_process(b, &m); + if (ERRNO_IS_NEG_DISCONNECT(r)) /* Treat 'connection reset by peer' as clean exit condition */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to process bus: %m"); + if (m) { + if (sd_bus_message_is_signal(m, "org.freedesktop.DBus.Local", "Disconnected")) + return 0; + + r = sd_bus_send(a, m, NULL); + if (r < 0) + return log_error_errno(r, "Failed to send message: %m"); + } + + if (r > 0) + continue; + + fd = sd_bus_get_fd(a); + if (fd < 0) + return log_error_errno(fd, "Failed to get fd: %m"); + + events_a = sd_bus_get_events(a); + if (events_a < 0) + return log_error_errno(events_a, "Failed to get events mask: %m"); + + r = sd_bus_get_timeout(a, &timeout_a); + if (r < 0) + return log_error_errno(r, "Failed to get timeout: %m"); + + events_b = sd_bus_get_events(b); + if (events_b < 0) + return log_error_errno(events_b, 
"Failed to get events mask: %m"); + + r = sd_bus_get_timeout(b, &timeout_b); + if (r < 0) + return log_error_errno(r, "Failed to get timeout: %m"); + + t = usec_sub_unsigned(MIN(timeout_a, timeout_b), now(CLOCK_MONOTONIC)); + + struct pollfd p[3] = { + { .fd = fd, .events = events_a }, + { .fd = STDIN_FILENO, .events = events_b & POLLIN }, + { .fd = STDOUT_FILENO, .events = events_b & POLLOUT }, + }; + + r = ppoll_usec(p, ELEMENTSOF(p), t); + if (r < 0 && !ERRNO_IS_TRANSIENT(r)) /* don't be bothered by signals, i.e. EINTR */ + return log_error_errno(r, "ppoll() failed: %m"); + } +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/storagetm/meson.build b/src/storagetm/meson.build new file mode 100644 index 0000000..f95210a --- /dev/null +++ b/src/storagetm/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-storagetm', + 'conditions' : [ + 'ENABLE_STORAGETM', + ], + 'sources' : files('storagetm.c'), + }, +] diff --git a/src/storagetm/storagetm.c b/src/storagetm/storagetm.c new file mode 100644 index 0000000..16d4fb0 --- /dev/null +++ b/src/storagetm/storagetm.c @@ -0,0 +1,1244 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "blockdev-util.h" +#include "build.h" +#include "daemon-util.h" +#include "device-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "id128-util.h" +#include "local-addresses.h" +#include "loop-util.h" +#include "main-func.h" +#include "os-util.h" +#include "parse-argument.h" +#include "path-util.h" +#include "plymouth-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "random-util.h" +#include "recurse-dir.h" +#include "socket-util.h" +#include "terminal-util.h" +#include "udev-util.h" + +static char **arg_devices = NULL; +static char *arg_nqn = NULL; +static int arg_all = 0; + 
+STATIC_DESTRUCTOR_REGISTER(arg_devices, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_nqn, freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-storagetm", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [DEVICE...]\n" + "\n%sExpose a block device or regular file as NVMe-TCP volume.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --nqn=STRING Select NQN (NVMe Qualified Name)\n" + " -a --all Expose all devices\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_NQN = 0x100, + ARG_VERSION, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "nqn", required_argument, NULL, ARG_NQN }, + { "all", no_argument, NULL, 'a' }, + {} + }; + + int r, c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "ha", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NQN: + if (!filename_is_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "NQN invalid: %s", optarg); + + if (free_and_strdup(&arg_nqn, optarg) < 0) + return log_oom(); + + break; + + case 'a': + arg_all++; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_all > 0) { + if (argc > optind) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expects no further arguments if --all/-a is specified."); + } else { + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expecting device name or --all/-a."); + + for (int i = optind; i < argc; i++) + if (!path_is_valid(argv[i])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid path: %s", argv[i]); + + arg_devices = strv_copy(argv + optind); + } + 
+ if (!arg_nqn) { + sd_id128_t id; + + r = sd_id128_get_machine_app_specific(SD_ID128_MAKE(b4,f9,4e,52,b8,e2,45,db,88,84,6e,2e,c3,f4,ef,18), &id); + if (r < 0) + return log_error_errno(r, "Failed to get machine ID: %m"); + + /* See NVM Express Base Specification 2.0c, 4.5 "NVMe Qualified Names" */ + if (asprintf(&arg_nqn, "nqn.2023-10.io.systemd:storagetm." SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)) < 0) + return log_oom(); + } + + return 1; +} + +typedef struct NvmeSubsystem { + char *name; + struct stat device_stat; + int device_fd; + int nvme_all_subsystems_fd; /* The /sys/kernel/config/nvmet/subsystems/ dir, that contains all subsystems */ + int nvme_our_subsystem_fd; /* Our private subsystem dir below it. */ + char *device; +} NvmeSubsystem; + +static NvmeSubsystem* nvme_subsystem_free(NvmeSubsystem *s) { + if (!s) + return NULL; + + free(s->name); + safe_close(s->nvme_all_subsystems_fd); + safe_close(s->nvme_our_subsystem_fd); + safe_close(s->device_fd); + free(s->device); + + return mfree(s); +} + +static int nvme_subsystem_unlink(NvmeSubsystem *s) { + int r; + + assert(s); + + if (s->nvme_our_subsystem_fd >= 0) { + _cleanup_close_ int namespaces_fd = -EBADF; + + namespaces_fd = openat(s->nvme_our_subsystem_fd, "namespaces", O_CLOEXEC|O_DIRECTORY|O_RDONLY); + if (namespaces_fd < 0) + log_warning_errno(errno, "Failed to open 'namespaces' directory of subsystem '%s': %m", s->name); + else { + _cleanup_free_ DirectoryEntries *de = NULL; + + r = readdir_all(namespaces_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + log_warning_errno(r, "Failed to read 'namespaces' dir of subsystem '%s', ignoring: %m", s->name); + else { + FOREACH_ARRAY(ee, de->entries, de->n_entries) { + _cleanup_free_ char *enable_fn = NULL; + const struct dirent *e = *ee; + + enable_fn = path_join(e->d_name, "enable"); + if (!enable_fn) + return log_oom(); + + r = write_string_file_at(namespaces_fd, enable_fn, "0", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + 
log_warning_errno(r, "Failed to disable namespace '%s' of NVME subsystem '%s', ignoring: %m", e->d_name, s->name); + + if (unlinkat(namespaces_fd, e->d_name, AT_REMOVEDIR) < 0 && errno != ENOENT) + log_warning_errno(errno, "Failed to remove namespace '%s' of NVME subsystem '%s', ignoring: %m", e->d_name, s->name); + } + } + } + + s->nvme_our_subsystem_fd = safe_close(s->nvme_our_subsystem_fd); + } + + if (s->nvme_all_subsystems_fd >= 0 && s->name) { + if (unlinkat(s->nvme_all_subsystems_fd, s->name, AT_REMOVEDIR) < 0 && errno != ENOENT) + log_warning_errno(errno, "Failed to remove NVME subsystem '%s', ignoring: %m", s->name); + + s->nvme_all_subsystems_fd = safe_close(s->nvme_all_subsystems_fd); /* Invalidate the subsystems/ dir fd, to remember we unlinked the thing already */ + + log_info("NVME subsystem '%s' removed.", s->name); + } + + return 0; +} + +static NvmeSubsystem *nvme_subsystem_destroy(NvmeSubsystem *s) { + if (!s) + return NULL; + + (void) nvme_subsystem_unlink(s); + + return nvme_subsystem_free(s); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(NvmeSubsystem*, nvme_subsystem_destroy); + +static int nvme_subsystem_write_metadata(int subsystem_fd, sd_device *device) { + _cleanup_free_ char *image_id = NULL, *image_version = NULL, *os_id = NULL, *os_version = NULL, *combined_model = NULL, *synthetic_serial = NULL; + const char *hwmodel = NULL, *hwserial = NULL, *w; + int r; + + assert(subsystem_fd >= 0); + + (void) parse_os_release( + /* root= */ NULL, + "IMAGE_ID", &image_id, + "IMAGE_VERSION", &image_version, + "ID", &os_id, + "VERSION_ID", &os_version); + + if (device) { + (void) device_get_model_string(device, &hwmodel); + (void) sd_device_get_property_value(device, "ID_SERIAL_SHORT", &hwserial); + } + + w = secure_getenv("SYSTEMD_NVME_MODEL"); + if (!w) { + if (hwmodel && (image_id || os_id)) { + if (asprintf(&combined_model, "%s (%s)", hwmodel, image_id ?: os_id) < 0) + return log_oom(); + w = combined_model; + } else + w = hwmodel ?: image_id ?: os_id; + } + 
if (w) { + _cleanup_free_ char *truncated = strndup(w, 40); /* kernel refuses more than 40 chars (as per nvme spec) */ + + /* The default string stored in 'attr_model' is "Linux" btw. */ + r = write_string_file_at(subsystem_fd, "attr_model", truncated, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set model of subsystem to '%s', ignoring: %m", w); + } + + w = secure_getenv("SYSTEMD_NVME_FIRMWARE"); + if (!w) + w = image_version ?: os_version; + if (w) { + _cleanup_free_ char *truncated = strndup(w, 8); /* kernel refuses more than 8 chars (as per nvme spec) */ + if (!truncated) + return log_oom(); + + /* The default string stored in 'attr_firmware' is `uname -r` btw, but truncated to 8 chars. */ + r = write_string_file_at(subsystem_fd, "attr_firmware", truncated, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set model of subsystem to '%s', ignoring: %m", truncated); + } + + w = secure_getenv("SYSTEMD_NVME_SERIAL"); + if (!w) { + if (hwserial) + w = hwserial; + else { + sd_id128_t mid; + + r = sd_id128_get_machine_app_specific(SD_ID128_MAKE(39,7f,4d,bf,1e,bf,46,6d,b3,cb,45,b8,0d,49,5b,c1), &mid); + if (r < 0) + log_warning_errno(r, "Failed to get machine ID, ignoring: %m"); + else { + if (asprintf(&synthetic_serial, SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(mid)) < 0) + return log_oom(); + w = synthetic_serial; + } + } + } + if (w) { + _cleanup_free_ char *truncated = strndup(w, 20); /* kernel refuses more than 20 chars (as per nvme spec) */ + if (!truncated) + return log_oom(); + + r = write_string_file_at(subsystem_fd, "attr_serial", truncated, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set serial of subsystem to '%s', ignoring: %m", truncated); + } + + return 0; +} + +static int nvme_namespace_write_metadata(int namespace_fd, sd_device *device, const char *node) { + sd_id128_t id = SD_ID128_NULL; + const char *e; + int r; + + assert(namespace_fd >= 0); + + 
e = secure_getenv("SYSTEMD_NVME_UUID"); + if (e) { + r = sd_id128_from_string(e, &id); + if (r < 0) + log_warning_errno(r, "Failed to parse $SYSTEMD_NVME_UUID, ignoring: %s", e); + } + + if (sd_id128_is_null(id)) { + const char *serial = NULL; + sd_id128_t mid = SD_ID128_NULL; + + /* We combine machine ID and ID_SERIAL and hash a UUID from it */ + + if (device) { + (void) sd_device_get_property_value(device, "ID_SERIAL", &serial); + if (!serial) + (void) sd_device_get_devname(device, &serial); + } + if (!serial) + serial = node; + + r = sd_id128_get_machine(&mid); + if (r < 0) + log_warning_errno(r, "Failed to get machine ID, ignoring: %m"); + + size_t l = sizeof(mid) + strlen_ptr(serial); + _cleanup_free_ void *j = malloc(l + 1); + if (!j) + return log_oom(); + + strcpy(mempcpy(j, &mid, sizeof(mid)), strempty(serial)); + + id = id128_digest(j, l); + } + + r = write_string_file_at(namespace_fd, "device_uuid", SD_ID128_TO_UUID_STRING(id), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to set uuid of namespace to '%s', ignoring: %m", SD_ID128_TO_UUID_STRING(id)); + + return 0; +} + +static int nvme_subsystem_add(const char *node, int consumed_fd, sd_device *device, NvmeSubsystem **ret) { + _cleanup_(sd_device_unrefp) sd_device *allocated_device = NULL; + _cleanup_close_ int fd = consumed_fd; /* always take possession of the fd */ + int r; + + assert(node); + assert(ret); + + _cleanup_free_ char *fname = NULL; + r = path_extract_filename(node, &fname); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from path: %s", node); + + _cleanup_free_ char *j = NULL; + j = strjoin(arg_nqn, ".", fname); + if (!j) + return log_oom(); + + if (fd < 0) { + fd = RET_NERRNO(open(node, O_RDONLY|O_CLOEXEC|O_NONBLOCK)); + if (fd < 0) + return log_error_errno(fd, "Failed to open '%s': %m", node); + } + + struct stat st; + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat '%s': %m", node); + if 
(S_ISBLK(st.st_mode)) { + if (!device) { + r = sd_device_new_from_devnum(&allocated_device, 'b', st.st_rdev); + if (r < 0) + return log_error_errno(r, "Failed to get device information for device '%s': %m", node); + + device = allocated_device; + } + } else { + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Not a block device or regular file, refusing: %s", node); + } + + /* Let's lock this device continuously while we are operating on it */ + r = lock_generic_with_timeout(fd, LOCK_BSD, LOCK_EX, 10 * USEC_PER_SEC); + if (r < 0) + return log_error_errno(r, "Failed to lock block device: %m"); + + _cleanup_close_ int subsystems_fd = -EBADF; + subsystems_fd = RET_NERRNO(open("/sys/kernel/config/nvmet/subsystems", O_DIRECTORY|O_CLOEXEC|O_RDONLY)); + if (subsystems_fd < 0) + return log_error_errno(subsystems_fd, "Failed to open /sys/kernel/config/nvmet/subsystems: %m"); + + _cleanup_close_ int subsystem_fd = -EBADF; + subsystem_fd = open_mkdir_at(subsystems_fd, j, O_EXCL|O_RDONLY|O_CLOEXEC, 0777); + if (subsystem_fd < 0) + return log_error_errno(subsystem_fd, "Failed to create NVME subsystem '%s': %m", j); + + r = write_string_file_at(subsystem_fd, "attr_allow_any_host", "1", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set 'attr_allow_any_host' flag: %m"); + + (void) nvme_subsystem_write_metadata(subsystem_fd, device); + + _cleanup_close_ int namespace_fd = -EBADF; + namespace_fd = open_mkdir_at(subsystem_fd, "namespaces/1", O_EXCL|O_RDONLY|O_CLOEXEC, 0777); + if (namespace_fd < 0) + return log_error_errno(namespace_fd, "Failed to create NVME namespace '1': %m"); + + (void) nvme_namespace_write_metadata(namespace_fd, device, node); + + /* We use /proc/$PID/fd/$FD rather than /proc/self/fd/$FD, because this string is visible to others + * via configfs, and by including the PID it's clear to who the stuff belongs. 
*/ + r = write_string_file_at(namespace_fd, "device_path", FORMAT_PROC_PID_FD_PATH(0, fd), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write 'device_path' attribute: %m"); + + r = write_string_file_at(namespace_fd, "enable", "1", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write 'enable' attribute: %m"); + + _cleanup_(nvme_subsystem_destroyp) NvmeSubsystem *subsys = NULL; + + subsys = new(NvmeSubsystem, 1); + if (!subsys) + return log_oom(); + + *subsys = (NvmeSubsystem) { + .name = TAKE_PTR(j), + .device_fd = TAKE_FD(fd), + .nvme_all_subsystems_fd = TAKE_FD(subsystems_fd), + .nvme_our_subsystem_fd = TAKE_FD(subsystem_fd), + .device_stat = st, + }; + + subsys->device = strdup(node); + if (!subsys->device) + return log_oom(); + + *ret = TAKE_PTR(subsys); + return 0; +} + +typedef struct NvmePort { + uint16_t portnr; /* used for both the IP and the NVME port number */ + + int nvme_port_fd; + int nvme_ports_fd; + + int ip_family; +} NvmePort; + +static NvmePort *nvme_port_free(NvmePort *p) { + if (!p) + return NULL; + + safe_close(p->nvme_port_fd); + safe_close(p->nvme_ports_fd); + + return mfree(p); +} + +/* Removes every entry of the port's 'subsystems' directory, then the port directory itself, closing each fd once it has been used. Returns 1 if the port directory was removed, 0 if it was already gone or no ports fd was open, and otherwise the value returned by log_oom()/log_warning_errno(). */ +static int nvme_port_unlink(NvmePort *p) { + int r, ret = 0; + + assert(p); + + if (p->nvme_port_fd >= 0) { + _cleanup_close_ int subsystems_dir_fd = -EBADF; + + subsystems_dir_fd = openat(p->nvme_port_fd, "subsystems", O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (subsystems_dir_fd < 0) + log_warning_errno(errno, "Failed to open 'subsystems' dir of port %" PRIu16 ", ignoring: %m", p->portnr); + else { + _cleanup_free_ DirectoryEntries *de = NULL; + + r = readdir_all(subsystems_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + log_warning_errno(r, "Failed to read 'subsystems' dir of port %" PRIu16 ", ignoring: %m", p->portnr); + else + FOREACH_ARRAY(ee, de->entries, de->n_entries) { + const struct dirent *e = *ee; + + if (unlinkat(subsystems_dir_fd, e->d_name, 0) < 0 && 
errno != ENOENT) + log_warning_errno(errno, "Failed to remove 'subsystems' symlink '%s' of port %" PRIu16 ", ignoring: %m", e->d_name, p->portnr); + } + } + + p->nvme_port_fd = safe_close(p->nvme_port_fd); + } + + if (p->nvme_ports_fd >= 0) { + _cleanup_free_ char *fn = NULL; + if (asprintf(&fn, "%" PRIu16, p->portnr) < 0) + return log_oom(); + + if (unlinkat(p->nvme_ports_fd, fn, AT_REMOVEDIR) < 0) { + if (errno == ENOENT) + ret = 0; + else + ret = log_warning_errno(errno, "Failed to remove port %" PRIu16 ", ignoring: %m", p->portnr); + } else + ret = 1; + + p->nvme_ports_fd = safe_close(p->nvme_ports_fd); + } + + return ret; +} + +static NvmePort *nvme_port_destroy(NvmePort *p) { + if (!p) + return NULL; + + (void) nvme_port_unlink(p); + + return nvme_port_free(p); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(NvmePort*, nvme_port_destroy); + +static int nvme_port_add_portnr( + int ports_fd, + uint16_t portnr, + int ip_family, + int *ret_fd) { + + int r; + + assert(ports_fd >= 0); + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + assert(ret_fd); + + _cleanup_free_ char *fname = NULL; + if (asprintf(&fname, "%" PRIu16, portnr) < 0) + return log_oom(); + + _cleanup_close_ int port_fd = -EBADF; + port_fd = open_mkdir_at(ports_fd, fname, O_EXCL|O_RDONLY|O_CLOEXEC, 0777); + if (port_fd < 0) { + if (port_fd != -EEXIST) + return log_error_errno(port_fd, "Failed to create port %" PRIu16 ": %m", portnr); + + *ret_fd = -EBADF; + return 0; + } + + r = write_string_file_at(port_fd, "addr_adrfam", af_to_ipv4_ipv6(ip_family), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set address family on NVME port %" PRIu16 ": %m", portnr); + + r = write_string_file_at(port_fd, "addr_trtype", "tcp", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set transport type on NVME port %" PRIu16 ": %m", portnr); + + r = write_string_file_at(port_fd, "addr_trsvcid", fname, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return 
log_error_errno(r, "Failed to set IP port on NVME port %" PRIu16 ": %m", portnr); + + r = write_string_file_at(port_fd, "addr_traddr", ip_family == AF_INET6 ? "::" : "0.0.0.0", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set IP address on NVME port %" PRIu16 ": %m", portnr); + + *ret_fd = TAKE_FD(port_fd); + return 1; +} + +static uint16_t calculate_start_port(const char *name, int ip_family) { + struct siphash state; + uint16_t nr; + + assert(name); + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + + /* Use some fixed key Lennart pulled from /dev/urandom, so that we are deterministic */ + siphash24_init(&state, SD_ID128_MAKE(d1,0b,67,b5,e2,b7,4a,91,8d,6b,27,b6,35,c1,9f,d9).bytes); + siphash24_compress_string(name, &state); + siphash24_compress(&ip_family, sizeof(ip_family), &state); + + nr = 1024U + siphash24_finalize(&state) % (0xFFFFU - 1024U); + SET_FLAG(nr, 1, ip_family == AF_INET6); /* Lowest bit reflects family */ + + return nr; +} + +static uint16_t calculate_next_port(int ip_family) { + uint16_t nr; + + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + + nr = 1024U + random_u64_range(0xFFFFU - 1024U); + SET_FLAG(nr, 1, ip_family == AF_INET6); /* Lowest bit reflects family */ + + return nr; +} + +static int nvme_port_add(const char *name, int ip_family, NvmePort **ret) { + int r; + + assert(name); + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + assert(ret); + + _cleanup_close_ int ports_fd = -EBADF; + ports_fd = RET_NERRNO(open("/sys/kernel/config/nvmet/ports", O_DIRECTORY|O_RDONLY|O_CLOEXEC)); + if (ports_fd < 0) + return log_error_errno(ports_fd, "Failed to open /sys/kernel/config/nvmet/ports: %m"); + + _cleanup_close_ int port_fd = -EBADF; + uint16_t portnr = calculate_start_port(name, ip_family); + for (unsigned attempt = 0;; attempt++) { + r = nvme_port_add_portnr(ports_fd, portnr, ip_family, &port_fd); + if (r < 0) + return r; + if (r > 0) + break; + + if (attempt > 16) + return 
log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Can't find free NVME port after %u attempts.", attempt); + + log_debug_errno(port_fd, "NVME port %" PRIu16 " exists already, randomizing port.", portnr); + + portnr = calculate_next_port(ip_family); + } + + _cleanup_(nvme_port_destroyp) NvmePort *p = new(NvmePort, 1); + if (!p) + return log_oom(); + + *p = (NvmePort) { + .portnr = portnr, + .nvme_ports_fd = TAKE_FD(ports_fd), + .nvme_port_fd = TAKE_FD(port_fd), + .ip_family = ip_family, + }; + + *ret = TAKE_PTR(p); + return 0; +} + +static int nvme_port_link_subsystem(NvmePort *port, NvmeSubsystem *subsys) { + assert(port); + assert(subsys); + + _cleanup_free_ char *target = NULL, *linkname = NULL; + target = path_join("/sys/kernel/config/nvmet/subsystems", subsys->name); + if (!target) + return log_oom(); + + linkname = path_join("subsystems", subsys->name); + if (!linkname) + return log_oom(); + + if (symlinkat(target, port->nvme_port_fd, linkname) < 0) + return log_error_errno(errno, "Failed to link subsystem '%s' to port %" PRIu16 ": %m", subsys->name, port->portnr); + + return 0; +} + +static int nvme_port_unlink_subsystem(NvmePort *port, NvmeSubsystem *subsys) { + assert(port); + assert(subsys); + + _cleanup_free_ char *linkname = NULL; + linkname = path_join("subsystems", subsys->name); + if (!linkname) + return log_oom(); + + if (unlinkat(port->nvme_port_fd, linkname, 0) < 0 && errno != ENOENT) + return log_error_errno(errno, "Failed to unlink subsystem '%s' to port %" PRIu16 ": %m", subsys->name, port->portnr); + + return 0; +} + +static int nvme_subsystem_report(NvmeSubsystem *subsystem, NvmePort *ipv4, NvmePort *ipv6) { + assert(subsystem); + + _cleanup_free_ struct local_address *addresses = NULL; + int n_addresses; + n_addresses = local_addresses(NULL, 0, AF_UNSPEC, &addresses); + if (n_addresses < 0) + return log_error_errno(n_addresses, "Failed to determine local IP addresses: %m"); + + log_notice("NVMe-TCP: %s %s%s%s (%s)", + 
special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_COMPUTER_DISK) : "", emoji_enabled() ? " " : "", + subsystem->name, subsystem->device); + + FOREACH_ARRAY(a, addresses, n_addresses) { + NvmePort *port = a->family == AF_INET ? ipv4 : ipv6; + + if (!port) + continue; + + log_info(" %s Try for specific device: nvme connect -t tcp -n '%s' -a %s -s %" PRIu16, + special_glyph(a >= addresses + (n_addresses - 1) ? SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH), + subsystem->name, + IN_ADDR_TO_STRING(a->family, &a->address), + port->portnr); + } + + return 0; +} + +static int plymouth_send_text(const char *text) { + _cleanup_free_ char *plymouth_message = NULL; + int c, r; + + assert(text); + + c = asprintf(&plymouth_message, + "M\x02%c%s%c" + "A%c", /* pause spinner */ + (int) strlen(text) + 1, text, '\x00', + '\x00'); + if (c < 0) + return log_oom(); + + r = plymouth_send_raw(plymouth_message, c, SOCK_NONBLOCK); + if (r < 0) + return log_full_errno(ERRNO_IS_NO_PLYMOUTH(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to communicate with plymouth, ignoring: %m"); + + return 0; +} + +static int plymouth_notify_port(NvmePort *port, struct local_address *a) { + _cleanup_free_ char *m = NULL; + + if (!port || !a) + return 0; + + if (asprintf(&m, "nvme connect-all -t tcp -a %s -s %" PRIu16, IN_ADDR_TO_STRING(a->family, &a->address), port->portnr) < 0) + return log_oom(); + + return plymouth_send_text(m); +} + +static int nvme_port_report(NvmePort *port, bool *plymouth_done) { + if (!port) + return 0; + + _cleanup_free_ struct local_address *addresses = NULL; + int n_addresses; + n_addresses = local_addresses(NULL, 0, port->ip_family, &addresses); + if (n_addresses < 0) + return log_error_errno(n_addresses, "Failed to determine local IP addresses: %m"); + + log_notice("NVMe-TCP: %s %s%sListening on %s (port %" PRIu16 ")", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + emoji_enabled() ? 
special_glyph(SPECIAL_GLYPH_WORLD) : "", emoji_enabled() ? " " : "", + af_to_ipv4_ipv6(port->ip_family), + port->portnr); + + FOREACH_ARRAY(a, addresses, n_addresses) + log_info(" %s Try for all devices: nvme connect-all -t tcp -a %s -s %" PRIu16, + special_glyph(a >= addresses + (n_addresses - 1) ? SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH), + IN_ADDR_TO_STRING(a->family, &a->address), + port->portnr); + + if (plymouth_done && !*plymouth_done) { + (void) plymouth_notify_port(port, n_addresses > 0 ? addresses : NULL); + *plymouth_done = n_addresses > 0; + } + + return 0; +} + +typedef struct Context { + Hashmap *subsystems; + NvmePort *ipv4_port, *ipv6_port; + + bool display_refresh_scheduled; +} Context; + +static void device_hash_func(const struct stat *q, struct siphash *state) { + assert(q); + + mode_t m = q->st_mode & S_IFMT; + siphash24_compress(&m, sizeof(m), state); + + if (S_ISBLK(q->st_mode) || S_ISCHR(q->st_mode)) { + siphash24_compress(&q->st_rdev, sizeof(q->st_rdev), state); + return; + } + + return inode_hash_func(q, state); +} + +static int device_compare_func(const struct stat *a, const struct stat *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->st_mode & S_IFMT, b->st_mode & S_IFMT); + if (r != 0) + return r; + + if (S_ISBLK(a->st_mode) || S_ISCHR(a->st_mode)) { + r = CMP(major(a->st_rdev), major(b->st_rdev)); + if (r != 0) + return r; + + r = CMP(minor(a->st_rdev), minor(b->st_rdev)); + if (r != 0) + return r; + + return 0; + } + + return inode_compare_func(a, b); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + nvme_subsystem_hash_ops, + struct stat, + device_hash_func, + device_compare_func, + NvmeSubsystem, + nvme_subsystem_destroy); + +static void context_done(Context *c) { + assert(c); + + c->ipv4_port = nvme_port_destroy(c->ipv4_port); + c->ipv6_port = nvme_port_destroy(c->ipv6_port); + + c->subsystems = hashmap_free(c->subsystems); +} + +static void device_track_back(sd_device *d, sd_device **ret) { + int r; + + 
assert(d); + assert(ret); + + const char *devname = NULL; + (void) sd_device_get_devname(d, &devname); + + _cleanup_(sd_device_unrefp) sd_device *d_originating = NULL; + r = block_device_get_originating(d, &d_originating); + if (r < 0) + log_device_debug_errno(d, r, "Failed to get originating device for '%s', ignoring: %m", strna(devname)); + + sd_device *d_whole = NULL; + r = block_device_get_whole_disk(d_originating ?: d, &d_whole); /* does not ref returned device */ + if (r < 0) + log_device_debug_errno(d, r, "Failed to get whole device for '%s', ignoring: %m", strna(devname)); + + *ret = d_whole ? sd_device_ref(d_whole) : d_originating ? TAKE_PTR(d_originating) : sd_device_ref(d); +} + +static int device_is_same(sd_device *a, sd_device *b) { + dev_t devnum_a, devnum_b; + int r; + + assert(a); + assert(b); + + r = sd_device_get_devnum(a, &devnum_a); + if (r < 0) + return r; + + r = sd_device_get_devnum(b, &devnum_b); + if (r < 0) + return r; + + return devnum_a == devnum_b; +} + +static bool device_is_allowed(sd_device *d) { + int r; + + assert(d); + + if (arg_all >= 2) /* If --all is specified twice we allow even the root fs to shared */ + return true; + + const char *devname; + r = sd_device_get_devname(d, &devname); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device name: %m"); + + dev_t root_devnum; + r = get_block_device("/", &root_devnum); + if (r < 0) { + log_warning_errno(r, "Failed to get backing device of the root file system: %m"); + return false; /* Better safe */ + } + if (root_devnum == 0) /* Not backed by a block device? 
*/ + return true; + + _cleanup_(sd_device_unrefp) sd_device *root_device = NULL; + r = sd_device_new_from_devnum(&root_device, 'b', root_devnum); + if (r < 0) { + log_warning_errno(r, "Failed to get root block device, assuming device '%s' is same as root device: %m", devname); + return false; + } + + _cleanup_(sd_device_unrefp) sd_device *whole_root_device = NULL; + device_track_back(root_device, &whole_root_device); + + _cleanup_(sd_device_unrefp) sd_device *whole_d = NULL; + device_track_back(d, &whole_d); + + r = device_is_same(whole_root_device, whole_d); + if (r < 0) { + log_warning_errno(r, "Failed to determine if root device and device '%s' are the same, assuming they are: %m", devname); + return false; /* Better safe */ + } + + return !r; +} + +static int device_added(Context *c, sd_device *device) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(c); + assert(device); + + const char *sysname; + r = sd_device_get_sysname(device, &sysname); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get device name: %m"); + + log_device_debug(device, "new block device '%s'", sysname); + + if (STARTSWITH_SET(sysname, "loop", "zram")) /* Ignore some devices */ + return 0; + + const char *devname; + r = sd_device_get_devname(device, &devname); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get device node path: %m"); + + struct stat lookup_key = { + .st_mode = S_IFBLK, + }; + + r = sd_device_get_devnum(device, &lookup_key.st_rdev); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get major/minor from device: %m"); + + if (hashmap_contains(c->subsystems, &lookup_key)) { + log_debug("Device '%s' already seen.", devname); + return 0; + } + + if (!device_is_allowed(device)) { + log_device_debug(device, "Not exposing device '%s', as it is backed by root disk.", devname); + return 0; + } + + fd = sd_device_open(device, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) { + log_device_warning_errno(device, fd, "Failed to 
open newly acquired device '%s', ignoring device: %m", devname); + return 0; + } + + _cleanup_(nvme_subsystem_destroyp) NvmeSubsystem *s = NULL; + r = nvme_subsystem_add(devname, TAKE_FD(fd), device, &s); + if (r < 0) + return r; + + if (c->ipv4_port) { + r = nvme_port_link_subsystem(c->ipv4_port, s); + if (r < 0) + return r; + } + + if (c->ipv6_port) { + r = nvme_port_link_subsystem(c->ipv6_port, s); + if (r < 0) + return r; + } + + r = hashmap_ensure_put(&c->subsystems, &nvme_subsystem_hash_ops, &s->device_stat, s); + if (r < 0) + return log_error_errno(r, "Failed to add subsystem to hash table: %m"); + + (void) nvme_subsystem_report(s, c->ipv4_port, c->ipv6_port); + + TAKE_PTR(s); + return 1; +} + +static int device_removed(Context *c, sd_device *device) { + int r; + + assert(device); + + struct stat lookup_key = { + .st_mode = S_IFBLK, + }; + + r = sd_device_get_devnum(device, &lookup_key.st_rdev); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get major/minor from device: %m"); + + NvmeSubsystem *s = hashmap_remove(c->subsystems, &lookup_key); + if (!s) + return 0; + + log_device_debug(device, "removed block device '%s'", s->name); + + if (c->ipv4_port) + (void) nvme_port_unlink_subsystem(c->ipv4_port, s); + if (c->ipv6_port) + (void) nvme_port_unlink_subsystem(c->ipv6_port, s); + + s = nvme_subsystem_destroy(s); + return 1; +} + +static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Context *c = ASSERT_PTR(userdata); + + if (device_for_action(device, SD_DEVICE_REMOVE)) + device_removed(c, device); + else + device_added(c, device); + + return 0; +} + +static int on_display_refresh(sd_event_source *s, uint64_t usec, void *userdata) { + Context *c = ASSERT_PTR(userdata); + + assert(s); + + c->display_refresh_scheduled = false; + + if (isatty(STDERR_FILENO) > 0) + fputs(ANSI_HOME_CLEAR, stderr); + + /* If we have both IPv4 and IPv6, we display IPv4 info via Plymouth, since it doesn't have much + * 
space, and IPv4 is simply shorter (and easy to type off screen) */ + + bool plymouth_done = false; + (void) nvme_port_report(c->ipv4_port, &plymouth_done); + (void) nvme_port_report(c->ipv6_port, &plymouth_done); + + if (!plymouth_done) + (void) plymouth_send_text("Network disconnected."); + + NvmeSubsystem *i; + HASHMAP_FOREACH(i, c->subsystems) + (void) nvme_subsystem_report(i, c->ipv4_port, c->ipv6_port); + + return 0; +} + +static int on_address_change(sd_netlink *rtnl, sd_netlink_message *mm, void *userdata) { + Context *c = ASSERT_PTR(userdata); + int r, family; + + assert(rtnl); + assert(mm); + + r = sd_rtnl_message_addr_get_family(mm, &family); + if (r < 0) { + log_warning_errno(r, "Failed to get address family from netlink address message, ignoring: %m"); + return 0; + } + + if (!c->display_refresh_scheduled) { + r = sd_event_add_time_relative( + sd_netlink_get_event(rtnl), + /* ret_slot= */ NULL, + CLOCK_MONOTONIC, + 750 * USEC_PER_MSEC, + 0, + on_display_refresh, + c); + if (r < 0) + log_warning_errno(r, "Failed to schedule display refresh, ignoring: %m"); + else + c->display_refresh_scheduled = true; + } + + return 0; +} + +static int run(int argc, char* argv[]) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(context_done) Context context = {}; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = sd_event_new(&event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = sd_event_set_signal_exit(event, true); + if (r < 0) + return log_error_errno(r, "Failed to install exit signal handlers: %m"); + + STRV_FOREACH(i, arg_devices) { + _cleanup_(nvme_subsystem_destroyp) NvmeSubsystem *subsys = NULL; + + r = nvme_subsystem_add(*i, -EBADF, /* device= */ NULL, &subsys); + if (r < 0) + return r; + + r = hashmap_ensure_put(&context.subsystems, 
&nvme_subsystem_hash_ops, &subsys->device_stat, subsys); + if (r == -EEXIST) { + log_warning_errno(r, "Duplicate device '%s' specified, skipping: %m", *i); + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to add subsystem to hash table: %m"); + + TAKE_PTR(subsys); + } + + r = nvme_port_add(arg_nqn, AF_INET, &context.ipv4_port); + if (r < 0) + return r; + + bool plymouth_done = false; + nvme_port_report(context.ipv4_port, &plymouth_done); + + if (socket_ipv6_is_enabled()) { + r = nvme_port_add(arg_nqn, AF_INET6, &context.ipv6_port); + if (r < 0) + return r; + + nvme_port_report(context.ipv6_port, &plymouth_done); + } + + if (!plymouth_done) + (void) plymouth_send_text("Network disconnected."); + + NvmeSubsystem *i; + HASHMAP_FOREACH(i, context.subsystems) { + if (context.ipv4_port) { + r = nvme_port_link_subsystem(context.ipv4_port, i); + if (r < 0) + return r; + } + + if (context.ipv6_port) { + r = nvme_port_link_subsystem(context.ipv6_port, i); + if (r < 0) + return r; + } + + (void) nvme_subsystem_report(i, context.ipv4_port, context.ipv6_port); + } + + if (arg_all > 0) { + r = sd_device_monitor_new(&monitor); + if (r < 0) + return log_error_errno(r, "Failed to allocate device monitor: %m"); + + r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", "disk"); + if (r < 0) + return log_error_errno(r, "Failed to configure device monitor match: %m"); + + r = sd_device_monitor_attach_event(monitor, event); + if (r < 0) + return log_error_errno(r, "Failed to attach device monitor to event loop: %m"); + + r = sd_device_monitor_start(monitor, device_monitor_handler, &context); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *enumerator = NULL; + r = sd_device_enumerator_new(&enumerator); + if (r < 0) + return log_error_errno(r, "Failed to allocate enumerator: %m"); + + r = sd_device_enumerator_add_match_subsystem(enumerator, "block", /* 
match= */ true); + if (r < 0) + return log_error_errno(r, "Failed to match block devices: %m"); + + r = sd_device_enumerator_add_match_property(enumerator, "DEVTYPE", "disk"); + if (r < 0) + return log_error_errno(r, "Failed to match whole block devices: %m"); + + r = sd_device_enumerator_add_nomatch_sysname(enumerator, "loop*"); + if (r < 0) + return log_error_errno(r, "Failed to exclude loop devices: %m"); + + FOREACH_DEVICE(enumerator, device) + device_added(&context, device); + } + + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = sd_netlink_attach_event(rtnl, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach netlink socket to event loop: %m"); + + r = sd_netlink_add_match(rtnl, /* ret_slot= */ NULL, RTM_NEWADDR, on_address_change, /* destroy_callback= */ NULL, &context, "storagetm-newaddr"); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR events: %m"); + + r = sd_netlink_add_match(rtnl, /* ret_slot= */ NULL, RTM_DELADDR, on_address_change, /* destroy_callback= */ NULL, &context, "storagetm-deladdr"); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_DELADDR events: %m"); + + if (isatty(0) > 0) + log_info("Hit Ctrl-C to exit target mode."); + + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = + notify_start("READY=1\n" + "STATUS=Exposing disks in target mode...", + NOTIFY_STOPPING); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + log_info("Exiting target mode."); + return r; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/sulogin-shell/meson.build b/src/sulogin-shell/meson.build new file mode 100644 index 0000000..34b2b32 --- /dev/null +++ b/src/sulogin-shell/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ 
+ libexec_template + { + 'name' : 'systemd-sulogin-shell', + 'sources' : files('sulogin-shell.c'), + }, +] diff --git a/src/sulogin-shell/sulogin-shell.c b/src/sulogin-shell/sulogin-shell.c new file mode 100644 index 0000000..b26663d --- /dev/null +++ b/src/sulogin-shell/sulogin-shell.c @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2017 Felipe Sateler +***/ + +#include +#include + +#include "sd-bus.h" + +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "constants.h" +#include "env-util.h" +#include "initrd-util.h" +#include "log.h" +#include "main-func.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "signal-util.h" +#include "special.h" +#include "unit-def.h" + +static int target_is_inactive(sd_bus *bus, const char *target) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *path = NULL, *state = NULL; + int r; + + path = unit_dbus_path_from_name(target); + if (!path) + return log_oom(); + + r = sd_bus_get_property_string(bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "ActiveState", + &error, + &state); + if (r < 0) + return log_error_errno(r, "Failed to retrieve unit state: %s", bus_error_message(&error, r)); + + return streq_ptr(state, "inactive"); +} + +static int start_target(sd_bus *bus, const char *target) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + log_info("Starting %s", target); + + /* Start this unit only if we can replace basic.target with it */ + r = bus_call_method( + bus, + bus_systemd_mgr, + "StartUnit", + &error, + NULL, + "ss", target, "isolate"); + + if (r < 0) + return log_error_errno(r, "Failed to start %s: %s", target, bus_error_message(&error, r)); + + return 0; +} + +static int fork_wait(const char* const cmdline[]) { + pid_t pid; + int r; + + r = safe_fork("(sulogin)", 
FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + execv(cmdline[0], (char**) cmdline); + log_error_errno(errno, "Failed to execute %s: %m", cmdline[0]); + _exit(EXIT_FAILURE); /* Operational error */ + } + + return wait_for_terminate_and_check(cmdline[0], pid, WAIT_LOG_ABNORMAL); +} + +static void print_mode(const char* mode) { + printf("You are in %s mode. After logging in, type \"journalctl -xb\" to view\n" + "system logs, \"systemctl reboot\" to reboot, or \"exit\"\n" "to continue bootup.\n", mode); + fflush(stdout); +} + +static int run(int argc, char *argv[]) { + const char* sulogin_cmdline[] = { + SULOGIN, + NULL, /* --force */ + NULL + }; + bool force = false; + int r; + + log_setup(); + + print_mode(argc > 1 ? argv[1] : ""); + + if (getenv_bool("SYSTEMD_SULOGIN_FORCE") > 0) + force = true; + + if (!force) { + /* We look the argument in the kernel cmdline under the same name as the environment variable + * to express that this is not supported at the same level as the regular kernel cmdline + * switches. */ + r = proc_cmdline_get_bool("SYSTEMD_SULOGIN_FORCE", /* flags = */ 0, &force); + if (r < 0) + log_debug_errno(r, "Failed to parse SYSTEMD_SULOGIN_FORCE from kernel command line, ignoring: %m"); + } + + if (force) + /* allows passwordless logins if root account is locked. */ + sulogin_cmdline[1] = "--force"; + + for (;;) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + + (void) fork_wait(sulogin_cmdline); + + r = bus_connect_system_systemd(&bus); + if (r < 0) { + log_warning_errno(r, "Failed to get D-Bus connection: %m"); + goto fallback; + } + + log_info("Reloading system manager configuration."); + r = bus_service_manager_reload(bus); + if (r < 0) + goto fallback; + + const char *target = in_initrd() ? 
SPECIAL_INITRD_TARGET : SPECIAL_DEFAULT_TARGET; + + r = target_is_inactive(bus, target); + if (r < 0) + goto fallback; + if (!r) { + log_warning("%s is not inactive. Please review the %s setting.", target, target); + goto fallback; + } + + if (start_target(bus, target) >= 0) + break; + + fallback: + log_warning("Fallback to the single-user shell."); + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/sysctl/meson.build b/src/sysctl/meson.build new file mode 100644 index 0000000..1fb85c2 --- /dev/null +++ b/src/sysctl/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-sysctl', + 'public' : true, + 'sources' : files('sysctl.c'), + }, +] diff --git a/src/sysctl/sysctl.c b/src/sysctl/sysctl.c new file mode 100644 index 0000000..aac965f --- /dev/null +++ b/src/sysctl/sysctl.c @@ -0,0 +1,490 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "build.h" +#include "conf-files.h" +#include "constants.h" +#include "creds-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "glob-util.h" +#include "hashmap.h" +#include "log.h" +#include "main-func.h" +#include "pager.h" +#include "path-util.h" +#include "pretty-print.h" +#include "string-util.h" +#include "strv.h" +#include "sysctl-util.h" + +static char **arg_prefixes = NULL; +static CatFlags arg_cat_flags = CAT_CONFIG_OFF; +static bool arg_strict = false; +static PagerFlags arg_pager_flags = 0; + +STATIC_DESTRUCTOR_REGISTER(arg_prefixes, strv_freep); + +typedef struct Option { + char *key; + char *value; + bool ignore_failure; +} Option; + +static Option *option_free(Option *o) { + if (!o) + return NULL; + + free(o->key); + free(o->value); + + return mfree(o); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Option*, option_free); +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(option_hash_ops, char, string_hash_func, 
string_compare_func, Option, option_free); + +static bool test_prefix(const char *p) { + if (strv_isempty(arg_prefixes)) + return true; + + return path_startswith_strv(p, arg_prefixes); +} + +static Option *option_new( + const char *key, + const char *value, + bool ignore_failure) { + + _cleanup_(option_freep) Option *o = NULL; + + assert(key); + + o = new(Option, 1); + if (!o) + return NULL; + + *o = (Option) { + .key = strdup(key), + .value = value ? strdup(value) : NULL, + .ignore_failure = ignore_failure, + }; + + if (!o->key) + return NULL; + if (value && !o->value) + return NULL; + + return TAKE_PTR(o); +} + +static int sysctl_write_or_warn(const char *key, const char *value, bool ignore_failure, bool ignore_enoent) { + int r; + + r = sysctl_write(key, value); + if (r < 0) { + /* Proceed without failing if ignore_failure is true. + * If the sysctl is not available in the kernel or we are running with reduced privileges and + * cannot write it, then log about the issue, and proceed without failing. Unless strict mode + * (arg_strict = true) is enabled, in which case we should fail. (EROFS is treated as a + * permission problem here, since that's how container managers usually protected their + * sysctls.) + * In all other cases log an error and make the tool fail. 
*/ + if (ignore_failure || (!arg_strict && (r == -EROFS || ERRNO_IS_PRIVILEGE(r)))) + log_debug_errno(r, "Couldn't write '%s' to '%s', ignoring: %m", value, key); + else if (ignore_enoent && r == -ENOENT) + log_warning_errno(r, "Couldn't write '%s' to '%s', ignoring: %m", value, key); + else + return log_error_errno(r, "Couldn't write '%s' to '%s': %m", value, key); + } + + return 0; +} + +static int apply_glob_option_with_prefix(OrderedHashmap *sysctl_options, Option *option, const char *prefix) { + _cleanup_strv_free_ char **paths = NULL; + _cleanup_free_ char *pattern = NULL; + int r; + + assert(sysctl_options); + assert(option); + + if (prefix) { + _cleanup_free_ char *key = NULL; + + r = path_glob_can_match(option->key, prefix, &key); + if (r < 0) + return log_error_errno(r, "Failed to check if the glob '%s' matches prefix '%s': %m", + option->key, prefix); + if (r == 0) { + log_debug("The glob '%s' does not match prefix '%s'.", option->key, prefix); + return 0; + } + + log_debug("The glob '%s' is prefixed with '%s': '%s'", option->key, prefix, key); + + if (!string_is_glob(key)) { + /* The prefixed pattern is not glob anymore. Let's skip to call glob(). 
*/ + if (ordered_hashmap_contains(sysctl_options, key)) { + log_debug("Not setting %s (explicit setting exists).", key); + return 0; + } + + return sysctl_write_or_warn(key, option->value, + /* ignore_failure = */ option->ignore_failure, + /* ignore_enoent = */ true); + } + + pattern = path_join("/proc/sys", key); + } else + pattern = path_join("/proc/sys", option->key); + if (!pattern) + return log_oom(); + + r = glob_extend(&paths, pattern, GLOB_NOCHECK); + if (r < 0) { + if (r == -ENOENT) { + log_debug("No match for glob: %s", option->key); + return 0; + } + if (option->ignore_failure || ERRNO_IS_PRIVILEGE(r)) { + log_debug_errno(r, "Failed to resolve glob '%s', ignoring: %m", option->key); + return 0; + } else + return log_error_errno(r, "Couldn't resolve glob '%s': %m", option->key); + } + + STRV_FOREACH(s, paths) { + const char *key; + + assert_se(key = path_startswith(*s, "/proc/sys")); + + if (ordered_hashmap_contains(sysctl_options, key)) { + log_debug("Not setting %s (explicit setting exists).", key); + continue; + } + + RET_GATHER(r, + sysctl_write_or_warn(key, option->value, + /* ignore_failure = */ option->ignore_failure, + /* ignore_enoent = */ !arg_strict)); + } + + return r; +} + +static int apply_glob_option(OrderedHashmap *sysctl_options, Option *option) { + int r = 0; + + if (strv_isempty(arg_prefixes)) + return apply_glob_option_with_prefix(sysctl_options, option, NULL); + + STRV_FOREACH(i, arg_prefixes) + RET_GATHER(r, apply_glob_option_with_prefix(sysctl_options, option, *i)); + return r; +} + +static int apply_all(OrderedHashmap *sysctl_options) { + Option *option; + int r = 0; + + ORDERED_HASHMAP_FOREACH(option, sysctl_options) { + int k; + + /* Ignore "negative match" options, they are there only to exclude stuff from globs. 
*/ + if (!option->value) + continue; + + if (string_is_glob(option->key)) + k = apply_glob_option(sysctl_options, option); + else + k = sysctl_write_or_warn(option->key, option->value, + /* ignore_failure = */ option->ignore_failure, + /* ignore_enoent = */ !arg_strict); + RET_GATHER(r, k); + } + + return r; +} + +static int parse_file(OrderedHashmap **sysctl_options, const char *path, bool ignore_enoent) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *pp = NULL; + unsigned c = 0; + int r; + + assert(path); + + r = search_and_fopen(path, "re", NULL, (const char**) CONF_PATHS_STRV("sysctl.d"), &f, &pp); + if (r < 0) { + if (ignore_enoent && r == -ENOENT) + return 0; + + return log_error_errno(r, "Failed to open file '%s', ignoring: %m", path); + } + + log_debug("Parsing %s", pp); + for (;;) { + _cleanup_(option_freep) Option *new_option = NULL; + _cleanup_free_ char *l = NULL; + bool ignore_failure = false; + Option *existing; + char *value; + int k; + + k = read_stripped_line(f, LONG_LINE_MAX, &l); + if (k == 0) + break; + if (k < 0) + return log_error_errno(k, "Failed to read file '%s', ignoring: %m", pp); + + c++; + + if (isempty(l)) + continue; + if (strchr(COMMENTS, l[0])) + continue; + + char *p = l; + value = strchr(p, '='); + if (value) { + if (p[0] == '-') { + ignore_failure = true; + p++; + } + + *value = 0; + value++; + value = strstrip(value); + + } else { + if (p[0] == '-') + /* We have a "negative match" option. Let's continue with value==NULL. */ + p++; + else { + log_syntax(NULL, LOG_WARNING, pp, c, 0, + "Line is not an assignment, ignoring: %s", p); + if (r == 0) + r = -EINVAL; + continue; + } + } + + p = strstrip(p); + p = sysctl_normalize(p); + + /* We can't filter out globs at this point, we'll need to do that later. 
*/ + if (!string_is_glob(p) && + !test_prefix(p)) + continue; + + existing = ordered_hashmap_get(*sysctl_options, p); + if (existing) { + if (streq_ptr(value, existing->value)) { + existing->ignore_failure = existing->ignore_failure || ignore_failure; + continue; + } + + log_debug("Overwriting earlier assignment of %s at '%s:%u'.", p, pp, c); + option_free(ordered_hashmap_remove(*sysctl_options, p)); + } + + new_option = option_new(p, value, ignore_failure); + if (!new_option) + return log_oom(); + + k = ordered_hashmap_ensure_put(sysctl_options, &option_hash_ops, new_option->key, new_option); + if (k < 0) + return log_error_errno(k, "Failed to add sysctl variable %s to hashmap: %m", p); + + TAKE_PTR(new_option); + } + + return r; +} + +static int read_credential_lines(OrderedHashmap **sysctl_options) { + _cleanup_free_ char *j = NULL; + const char *d; + int r; + + r = get_credentials_dir(&d); + if (r == -ENXIO) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to get credentials directory: %m"); + + j = path_join(d, "sysctl.extra"); + if (!j) + return log_oom(); + + (void) parse_file(sysctl_options, j, /* ignore_enoent= */ true); + return 0; +} + +static int cat_config(char **files) { + pager_open(arg_pager_flags); + + return cat_files(NULL, files, arg_cat_flags); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-sysctl.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
[CONFIGURATION FILE...]\n\n" + "Applies kernel sysctl settings.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --cat-config Show configuration files\n" + " --tldr Show non-comment parts of configuration\n" + " --prefix=PATH Only apply rules with the specified prefix\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_CAT_CONFIG, + ARG_TLDR, + ARG_PREFIX, + ARG_NO_PAGER, + ARG_STRICT, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "cat-config", no_argument, NULL, ARG_CAT_CONFIG }, + { "tldr", no_argument, NULL, ARG_TLDR }, + { "prefix", required_argument, NULL, ARG_PREFIX }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "strict", no_argument, NULL, ARG_STRICT }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_CAT_CONFIG: + arg_cat_flags = CAT_CONFIG_ON; + break; + + case ARG_TLDR: + arg_cat_flags = CAT_TLDR; + break; + + case ARG_PREFIX: { + const char *s; + char *p; + + /* We used to require people to specify absolute paths + * in /proc/sys in the past. This is kinda useless, but + * we need to keep compatibility. We now support any + * sysctl name available. 
*/ + sysctl_normalize(optarg); + + s = path_startswith(optarg, "/proc/sys"); + p = strdup(s ?: optarg); + if (!p) + return log_oom(); + + if (strv_consume(&arg_prefixes, p) < 0) + return log_oom(); + + break; + } + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_STRICT: + arg_strict = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_cat_flags != CAT_CONFIG_OFF && argc > optind) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Positional arguments are not allowed with --cat-config/--tldr."); + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_ordered_hashmap_free_ OrderedHashmap *sysctl_options = NULL; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + log_setup(); + + umask(0022); + + if (argc > optind) { + r = 0; + + for (int i = optind; i < argc; i++) + RET_GATHER(r, parse_file(&sysctl_options, argv[i], false)); + + } else { + _cleanup_strv_free_ char **files = NULL; + + r = conf_files_list_strv(&files, ".conf", NULL, 0, (const char**) CONF_PATHS_STRV("sysctl.d")); + if (r < 0) + return log_error_errno(r, "Failed to enumerate sysctl.d files: %m"); + + if (arg_cat_flags != CAT_CONFIG_OFF) + return cat_config(files); + + STRV_FOREACH(f, files) + RET_GATHER(r, parse_file(&sysctl_options, *f, true)); + + RET_GATHER(r, read_credential_lines(&sysctl_options)); + } + + RET_GATHER(r, apply_all(sysctl_options)); + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/sysext/meson.build b/src/sysext/meson.build new file mode 100644 index 0000000..2983970 --- /dev/null +++ b/src/sysext/meson.build @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-sysext', + 'public' : true, + 'conditions' : ['ENABLE_SYSEXT'], + 'sources' : files('sysext.c'), + }, +] + +if conf.get('ENABLE_SYSEXT') == 1 + meson.add_install_script(sh, '-c', ln_s.format(bindir / 'systemd-sysext', + 
bindir / 'systemd-confext')) +endif diff --git a/src/sysext/sysext.c b/src/sysext/sysext.c new file mode 100644 index 0000000..8dc515e --- /dev/null +++ b/src/sysext/sysext.c @@ -0,0 +1,1568 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "build.h" +#include "bus-locator.h" +#include "bus-error.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "capability-util.h" +#include "chase.h" +#include "constants.h" +#include "devnum-util.h" +#include "discover-image.h" +#include "dissect-image.h" +#include "env-util.h" +#include "escape.h" +#include "extension-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-table.h" +#include "fs-util.h" +#include "hashmap.h" +#include "initrd-util.h" +#include "log.h" +#include "main-func.h" +#include "missing_magic.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "os-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "sort-util.h" +#include "terminal-util.h" +#include "user-util.h" +#include "varlink.h" +#include "varlink-io.systemd.sysext.h" +#include "verbs.h" + +static char **arg_hierarchies = NULL; /* "/usr" + "/opt" by default for sysext and /etc by default for confext */ +static char *arg_root = NULL; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static bool arg_force = false; +static bool arg_no_reload = false; +static int arg_noexec = -1; +static ImagePolicy *arg_image_policy = NULL; +static bool arg_varlink = false; + +/* Is set to IMAGE_CONFEXT when systemd is called with the confext functionality instead of the default */ +static ImageClass arg_image_class = IMAGE_SYSEXT; + +STATIC_DESTRUCTOR_REGISTER(arg_hierarchies, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_root, 
freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +/* Helper struct for naming simplicity and reusability */ +static const struct { + const char *dot_directory_name; + const char *directory_name; + const char *short_identifier; + const char *short_identifier_plural; + const char *level_env; + const char *scope_env; + const char *name_env; + const ImagePolicy *default_image_policy; + unsigned long default_mount_flags; +} image_class_info[_IMAGE_CLASS_MAX] = { + [IMAGE_SYSEXT] = { + .dot_directory_name = ".systemd-sysext", + .directory_name = "systemd-sysext", + .short_identifier = "sysext", + .short_identifier_plural = "extensions", + .level_env = "SYSEXT_LEVEL", + .scope_env = "SYSEXT_SCOPE", + .name_env = "SYSTEMD_SYSEXT_HIERARCHIES", + .default_image_policy = &image_policy_sysext, + .default_mount_flags = MS_RDONLY|MS_NODEV, + }, + [IMAGE_CONFEXT] = { + .dot_directory_name = ".systemd-confext", + .directory_name = "systemd-confext", + .short_identifier = "confext", + .short_identifier_plural = "confexts", + .level_env = "CONFEXT_LEVEL", + .scope_env = "CONFEXT_SCOPE", + .name_env = "SYSTEMD_CONFEXT_HIERARCHIES", + .default_image_policy = &image_policy_confext, + .default_mount_flags = MS_RDONLY|MS_NODEV|MS_NOSUID|MS_NOEXEC, + } +}; + +static int is_our_mount_point( + ImageClass image_class, + const char *p) { + + _cleanup_free_ char *buf = NULL, *f = NULL; + struct stat st; + dev_t dev; + int r; + + assert(p); + + r = path_is_mount_point(p, NULL, 0); + if (r == -ENOENT) { + log_debug_errno(r, "Hierarchy '%s' doesn't exist.", p); + return false; + } + if (r < 0) + return log_error_errno(r, "Failed to determine whether '%s' is a mount point: %m", p); + if (r == 0) { + log_debug("Hierarchy '%s' is not a mount point, skipping.", p); + return false; + } + + /* So we know now that it's a mount point. 
Now let's check if it's one of ours, so that we don't + * accidentally unmount the user's own /usr/ but just the mounts we established ourselves. We do this + * check by looking into the metadata directory we place in merged mounts: if the file + * ../dev contains the major/minor device pair of the mount we have a good reason to + * believe this is one of our mounts. This thorough check has the benefit that we aren't easily + * confused if people tar up one of our merged trees and untar them elsewhere where we might mistake + * them for a live sysext tree. */ + + f = path_join(p, image_class_info[image_class].dot_directory_name, "dev"); + if (!f) + return log_oom(); + + r = read_one_line_file(f, &buf); + if (r == -ENOENT) { + log_debug("Hierarchy '%s' does not carry a %s/dev file, not a merged tree.", p, image_class_info[image_class].dot_directory_name); + return false; + } + if (r < 0) + return log_error_errno(r, "Failed to determine whether hierarchy '%s' contains '%s/dev': %m", p, image_class_info[image_class].dot_directory_name); + + r = parse_devnum(buf, &dev); + if (r < 0) + return log_error_errno(r, "Failed to parse device major/minor stored in '%s/dev' file on '%s': %m", image_class_info[image_class].dot_directory_name, p); + + if (lstat(p, &st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", p); + + if (st.st_dev != dev) { + log_debug("Hierarchy '%s' reports a different device major/minor than what we are seeing, assuming offline copy.", p); + return false; + } + + return true; +} + +static int need_reload( + ImageClass image_class, + char **hierarchies, + bool no_reload) { + + /* Parse the mounted images to find out if we need to reload the daemon. 
*/ + int r; + + if (no_reload) + return false; + + STRV_FOREACH(p, hierarchies) { + _cleanup_free_ char *f = NULL, *buf = NULL, *resolved = NULL; + _cleanup_strv_free_ char **mounted_extensions = NULL; + + r = chase(*p, arg_root, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r == -ENOENT) { + log_debug_errno(r, "Hierarchy '%s%s' does not exist, ignoring.", strempty(arg_root), *p); + continue; + } + if (r < 0) { + log_warning_errno(r, "Failed to resolve path to hierarchy '%s%s': %m, ignoring.", strempty(arg_root), *p); + continue; + } + + r = is_our_mount_point(image_class, resolved); + if (r < 0) + return r; + if (!r) + continue; + + f = path_join(resolved, image_class_info[image_class].dot_directory_name, image_class_info[image_class].short_identifier_plural); + if (!f) + return log_oom(); + + r = read_full_file(f, &buf, NULL); + if (r < 0) + return log_error_errno(r, "Failed to open '%s': %m", f); + + mounted_extensions = strv_split_newlines(buf); + if (!mounted_extensions) + return log_oom(); + + STRV_FOREACH(extension, mounted_extensions) { + _cleanup_strv_free_ char **extension_release = NULL; + const char *extension_reload_manager = NULL; + int b; + + r = load_extension_release_pairs(arg_root, image_class, *extension, /* relax_extension_release_check */ true, &extension_release); + if (r < 0) { + log_debug_errno(r, "Failed to parse extension-release metadata of %s, ignoring: %m", *extension); + continue; + } + + extension_reload_manager = strv_env_pairs_get(extension_release, "EXTENSION_RELOAD_MANAGER"); + if (isempty(extension_reload_manager)) + continue; + + b = parse_boolean(extension_reload_manager); + if (b < 0) { + log_warning_errno(b, "Failed to parse the extension metadata to know if the manager needs to be reloaded, ignoring: %m"); + continue; + } + + if (b) + /* If at least one extension wants a reload, we reload. 
*/ + return true; + } + } + + return false; +} + +static int daemon_reload(void) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + r = bus_connect_system_systemd(&bus); + if (r < 0) + return log_error_errno(r, "Failed to get D-Bus connection: %m"); + + return bus_service_manager_reload(bus); +} + +static int unmerge_hierarchy( + ImageClass image_class, + const char *p) { + + int r; + + assert(p); + + for (;;) { + /* We only unmount /usr/ if it is a mount point and really one of ours, in order not to break + * systems where /usr/ is a mount point of its own already. */ + + r = is_our_mount_point(image_class, p); + if (r < 0) + return r; + if (r == 0) + break; + + r = umount_verbose(LOG_ERR, p, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to unmount file system '%s': %m", p); + + log_info("Unmerged '%s'.", p); + } + + return 0; +} + +static int unmerge( + ImageClass image_class, + char **hierarchies, + bool no_reload) { + + int r, ret = 0; + bool need_to_reload; + + r = need_reload(image_class, hierarchies, no_reload); + if (r < 0) + return r; + need_to_reload = r > 0; + + STRV_FOREACH(p, hierarchies) { + _cleanup_free_ char *resolved = NULL; + + r = chase(*p, arg_root, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r == -ENOENT) { + log_debug_errno(r, "Hierarchy '%s%s' does not exist, ignoring.", strempty(arg_root), *p); + continue; + } + if (r < 0) { + log_error_errno(r, "Failed to resolve path to hierarchy '%s%s': %m", strempty(arg_root), *p); + if (ret == 0) + ret = r; + + continue; + } + + r = unmerge_hierarchy(image_class, resolved); + if (r < 0 && ret == 0) + ret = r; + } + + if (need_to_reload) { + r = daemon_reload(); + if (r < 0) + return r; + } + + return ret; +} + +static int verb_unmerge(int argc, char **argv, void *userdata) { + int r; + + r = have_effective_cap(CAP_SYS_ADMIN); + if (r < 0) + return log_error_errno(r, "Failed to check if we have enough privileges: %m"); + if (r == 0) + return 
log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be privileged."); + + return unmerge(arg_image_class, + arg_hierarchies, + arg_no_reload); +} + +static int parse_image_class_parameter(Varlink *link, const char *value, ImageClass *image_class, char ***hierarchies) { + _cleanup_strv_free_ char **h = NULL; + ImageClass c; + int r; + + assert(link); + assert(image_class); + + if (!value) + return 0; + + c = image_class_from_string(value); + if (!IN_SET(c, IMAGE_SYSEXT, IMAGE_CONFEXT)) + return varlink_errorb(link, VARLINK_ERROR_INVALID_PARAMETER, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("parameter", "class"))); + + if (hierarchies) { + r = parse_env_extension_hierarchies(&h, image_class_info[c].name_env); + if (r < 0) + return log_error_errno(r, "Failed to parse environment variable: %m"); + + strv_free_and_replace(*hierarchies, h); + } + + *image_class = c; + return 0; +} + +typedef struct MethodUnmergeParameters { + const char *class; + int no_reload; +} MethodUnmergeParameters; + +static int vl_method_unmerge(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "class", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(MethodUnmergeParameters, class), 0 }, + { "noReload", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(MethodUnmergeParameters, no_reload), 0 }, + {} + }; + MethodUnmergeParameters p = { + .no_reload = -1, + }; + _cleanup_strv_free_ char **hierarchies = NULL; + ImageClass image_class = arg_image_class; + int r; + + assert(link); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + r = parse_image_class_parameter(link, p.class, &image_class, &hierarchies); + if (r < 0) + return r; + + r = unmerge(image_class, + hierarchies ?: arg_hierarchies, + p.no_reload >= 0 ? 
p.no_reload : arg_no_reload); + if (r < 0) + return r; + + return varlink_reply(link, NULL); +} + +static int verb_status(int argc, char **argv, void *userdata) { + _cleanup_(table_unrefp) Table *t = NULL; + int r, ret = 0; + + t = table_new("hierarchy", "extensions", "since"); + if (!t) + return log_oom(); + + table_set_ersatz_string(t, TABLE_ERSATZ_DASH); + + STRV_FOREACH(p, arg_hierarchies) { + _cleanup_free_ char *resolved = NULL, *f = NULL, *buf = NULL; + _cleanup_strv_free_ char **l = NULL; + struct stat st; + + r = chase(*p, arg_root, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r == -ENOENT) { + log_debug_errno(r, "Hierarchy '%s%s' does not exist, ignoring.", strempty(arg_root), *p); + continue; + } + if (r < 0) { + log_error_errno(r, "Failed to resolve path to hierarchy '%s%s': %m", strempty(arg_root), *p); + goto inner_fail; + } + + r = is_our_mount_point(arg_image_class, resolved); + if (r < 0) + goto inner_fail; + if (r == 0) { + r = table_add_many( + t, + TABLE_PATH, *p, + TABLE_STRING, "none", + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY); + if (r < 0) + return table_log_add_error(r); + + continue; + } + + f = path_join(resolved, image_class_info[arg_image_class].dot_directory_name, image_class_info[arg_image_class].short_identifier_plural); + if (!f) + return log_oom(); + + r = read_full_file(f, &buf, NULL); + if (r < 0) + return log_error_errno(r, "Failed to open '%s': %m", f); + + l = strv_split_newlines(buf); + if (!l) + return log_oom(); + + if (stat(*p, &st) < 0) + return log_error_errno(errno, "Failed to stat() '%s': %m", *p); + + r = table_add_many( + t, + TABLE_PATH, *p, + TABLE_STRV, l, + TABLE_TIMESTAMP, timespec_load(&st.st_mtim)); + if (r < 0) + return table_log_add_error(r); + + continue; + + inner_fail: + if (ret == 0) + ret = r; + } + + (void) table_set_sort(t, (size_t) 0); + + r = table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend); + if (r < 0) + return r; + + return ret; +} + +static int mount_overlayfs( + 
ImageClass image_class, + int noexec, + const char *where, + char **layers) { + + _cleanup_free_ char *options = NULL; + bool separator = false; + unsigned long flags; + int r; + + assert(where); + + options = strdup("lowerdir="); + if (!options) + return log_oom(); + + STRV_FOREACH(l, layers) { + _cleanup_free_ char *escaped = NULL; + + escaped = shell_escape(*l, ",:"); + if (!escaped) + return log_oom(); + + if (!strextend(&options, separator ? ":" : "", escaped)) + return log_oom(); + + separator = true; + } + + flags = image_class_info[image_class].default_mount_flags; + if (noexec >= 0) + SET_FLAG(flags, MS_NOEXEC, noexec); + + /* Now mount the actual overlayfs */ + r = mount_nofollow_verbose(LOG_ERR, image_class_info[image_class].short_identifier, where, "overlay", flags, options); + if (r < 0) + return r; + + return 0; +} + +static int merge_hierarchy( + ImageClass image_class, + const char *hierarchy, + int noexec, + char **extensions, + char **paths, + const char *meta_path, + const char *overlay_path) { + + _cleanup_free_ char *resolved_hierarchy = NULL, *f = NULL, *buf = NULL; + _cleanup_strv_free_ char **layers = NULL; + struct stat st; + int r; + + assert(hierarchy); + assert(meta_path); + assert(overlay_path); + + /* Resolve the path of the host's version of the hierarchy, i.e. what we want to use as lowest layer + * in the overlayfs stack. 
*/ + r = chase(hierarchy, arg_root, CHASE_PREFIX_ROOT, &resolved_hierarchy, NULL); + if (r == -ENOENT) + log_debug_errno(r, "Hierarchy '%s' on host doesn't exist, not merging.", hierarchy); + else if (r < 0) + return log_error_errno(r, "Failed to resolve host hierarchy '%s': %m", hierarchy); + else { + r = dir_is_empty(resolved_hierarchy, /* ignore_hidden_or_backup= */ false); + if (r < 0) + return log_error_errno(r, "Failed to check if host hierarchy '%s' is empty: %m", resolved_hierarchy); + if (r > 0) { + log_debug("Host hierarchy '%s' is empty, not merging.", resolved_hierarchy); + resolved_hierarchy = mfree(resolved_hierarchy); + } + } + + /* Let's generate a metadata file that lists all extensions we took into account for this + * hierarchy. We include this in the final fs, to make things nicely discoverable and + * recognizable. */ + f = path_join(meta_path, image_class_info[image_class].dot_directory_name, image_class_info[image_class].short_identifier_plural); + if (!f) + return log_oom(); + + buf = strv_join(extensions, "\n"); + if (!buf) + return log_oom(); + + r = write_string_file(f, buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755); + if (r < 0) + return log_error_errno(r, "Failed to write extension meta file '%s': %m", f); + + /* Put the meta path (i.e. 
our synthesized stuff) at the top of the layer stack */ + layers = strv_new(meta_path); + if (!layers) + return log_oom(); + + /* Put the extensions in the middle */ + STRV_FOREACH(p, paths) { + _cleanup_free_ char *resolved = NULL; + + r = chase(hierarchy, *p, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r == -ENOENT) { + log_debug_errno(r, "Hierarchy '%s' in extension '%s' doesn't exist, not merging.", hierarchy, *p); + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to resolve hierarchy '%s' in extension '%s': %m", hierarchy, *p); + + r = dir_is_empty(resolved, /* ignore_hidden_or_backup= */ false); + if (r < 0) + return log_error_errno(r, "Failed to check if hierarchy '%s' in extension '%s' is empty: %m", resolved, *p); + if (r > 0) { + log_debug("Hierarchy '%s' in extension '%s' is empty, not merging.", hierarchy, *p); + continue; + } + + r = strv_consume(&layers, TAKE_PTR(resolved)); + if (r < 0) + return log_oom(); + } + + if (!layers[1]) /* No extension with files in this hierarchy? Then don't do anything. */ + return 0; + + if (resolved_hierarchy) { + /* Add the host hierarchy as last (lowest) layer in the stack */ + r = strv_consume(&layers, TAKE_PTR(resolved_hierarchy)); + if (r < 0) + return log_oom(); + } + + r = mkdir_p(overlay_path, 0700); + if (r < 0) + return log_error_errno(r, "Failed to make directory '%s': %m", overlay_path); + + r = mount_overlayfs(image_class, noexec, overlay_path, layers); + if (r < 0) + return r; + + /* The overlayfs superblock is read-only. Let's also mark the bind mount read-only. Extra turbo safety 😎 */ + r = bind_remount_recursive(overlay_path, MS_RDONLY, MS_RDONLY, NULL); + if (r < 0) + return log_error_errno(r, "Failed to make bind mount '%s' read-only: %m", overlay_path); + + /* Now we have mounted the new file system. Let's now figure out its .st_dev field, and make that + * available in the metadata directory. 
This is useful to detect whether the metadata dir actually + * belongs to the fs it is found on: if .st_dev of the top-level mount matches it, it's pretty likely + * we are looking at a live tree, and not an unpacked tar or so of one. */ + if (stat(overlay_path, &st) < 0) + return log_error_errno(errno, "Failed to stat mount '%s': %m", overlay_path); + + free(f); + f = path_join(meta_path, image_class_info[image_class].dot_directory_name, "dev"); + if (!f) + return log_oom(); + + r = write_string_file(f, FORMAT_DEVNUM(st.st_dev), WRITE_STRING_FILE_CREATE); + if (r < 0) + return log_error_errno(r, "Failed to write '%s': %m", f); + + /* Make sure the top-level dir has an mtime marking the point we established the merge */ + if (utimensat(AT_FDCWD, meta_path, NULL, AT_SYMLINK_NOFOLLOW) < 0) + return log_error_errno(errno, "Failed fix mtime of '%s': %m", meta_path); + + return 1; +} + +static int strverscmp_improvedp(char *const* a, char *const* b) { + /* usable in qsort() for sorting a string array with strverscmp_improved() */ + return strverscmp_improved(*a, *b); +} + +static const ImagePolicy *pick_image_policy(const Image *img) { + assert(img); + assert(img->path); + + /* Explicitly specified policy always wins */ + if (arg_image_policy) + return arg_image_policy; + + /* If located in /.extra/sysext/ in the initrd, then it was placed there by systemd-stub, and was + * picked up from an untrusted ESP. Thus, require a stricter policy by default for them. (For the + * other directories we assume the appropriate level of trust was already established.) 
*/
+
+        if (in_initrd() && path_startswith(img->path, "/.extra/sysext/"))
+                return &image_policy_sysext_strict;
+
+        return image_class_info[img->class].default_image_policy;
+}
+/* Does the actual merge work. merge() runs this in a forked child that has its own mount namespace and
+ * passes "/run/systemd/sysext" as workspace.
+ *
+ * Every image in images is mounted below workspace: directories and subvolumes as read-only bind
+ * mounts, raw and block images through a loop device plus image dissection. Images that fail validation
+ * are counted in n_ignored and skipped, unless force is set. The accepted extensions are then sorted by
+ * version, the current merge of each hierarchy is undone, a new overlayfs per hierarchy is set up inside
+ * the workspace, and finally each result is bind mounted onto the resolved hierarchy path.
+ *
+ * Returns < 0 on failure, 0 if no usable extension was found, 1 once the hierarchies are in place. */
+static int merge_subprocess(
+                ImageClass image_class,
+                char **hierarchies,
+                bool force,
+                int noexec,
+                Hashmap *images,
+                const char *workspace) {
+
+        _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL, *host_os_release_api_level = NULL, *buf = NULL;
+        _cleanup_strv_free_ char **extensions = NULL, **paths = NULL;
+        size_t n_extensions = 0;
+        unsigned n_ignored = 0;
+        Image *img;
+        int r;
+
+        /* Mark the whole of /run as MS_SLAVE, so that we can mount stuff below it that doesn't show up on
+         * the host otherwise. */
+        r = mount_nofollow_verbose(LOG_ERR, NULL, "/run", NULL, MS_SLAVE|MS_REC, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to remount /run/ MS_SLAVE: %m");
+
+        /* Let's create the workspace if it's missing */
+        r = mkdir_p(workspace, 0700);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create '%s': %m", workspace);
+
+        /* Let's mount a tmpfs to our workspace. This way we don't need to clean up the inodes we mount over,
+         * but let the kernel do that entirely automatically, once our namespace dies. Note that this file
+         * system won't be visible to anyone but us, since we opened our own namespace and then made the
+         * /run/ hierarchy (which our workspace is contained in) MS_SLAVE, see above. */
+        r = mount_nofollow_verbose(LOG_ERR, image_class_info[image_class].short_identifier, workspace, "tmpfs", 0, "mode=0700");
+        if (r < 0)
+                return r;
+
+        /* Acquire host OS release info, so that we can compare it with the extension's data */
+        r = parse_os_release(
+                        arg_root,
+                        "ID", &host_os_release_id,
+                        "VERSION_ID", &host_os_release_version_id,
+                        image_class_info[image_class].level_env, &host_os_release_api_level);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(arg_root));
+        if (isempty(host_os_release_id)) /* Only ID is mandatory; the other two fields are passed on as found */
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m",
+                                       empty_to_root(arg_root));
+
+        /* Let's now mount all images */
+        HASHMAP_FOREACH(img, images) {
+                _cleanup_free_ char *p = NULL;
+
+                p = path_join(workspace, image_class_info[image_class].short_identifier_plural, img->name);
+                if (!p)
+                        return log_oom();
+
+                r = mkdir_p(p, 0700);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create %s: %m", p);
+
+                switch (img->type) {
+                case IMAGE_DIRECTORY:
+                case IMAGE_SUBVOLUME:
+                        /* NOTE(review): the check below runs on p, i.e. the mount point created just above,
+                         * before img->path has been bind mounted onto it. Confirm this is the intended path. */
+                        if (!force) {
+                                r = extension_has_forbidden_content(p);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0) {
+                                        n_ignored++;
+                                        continue;
+                                }
+                        }
+
+                        r = mount_nofollow_verbose(LOG_ERR, img->path, p, NULL, MS_BIND, NULL);
+                        if (r < 0)
+                                return r;
+
+                        /* Make this a read-only bind mount */
+                        r = bind_remount_recursive(p, MS_RDONLY, MS_RDONLY, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to make bind mount '%s' read-only: %m", p);
+
+                        break;
+
+                case IMAGE_RAW:
+                case IMAGE_BLOCK: {
+                        _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
+                        _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
+                        _cleanup_(verity_settings_done) VeritySettings verity_settings = VERITY_SETTINGS_DEFAULT;
+                        DissectImageFlags flags =
+                                DISSECT_IMAGE_READ_ONLY |
+                                DISSECT_IMAGE_GENERIC_ROOT |
+                                DISSECT_IMAGE_REQUIRE_ROOT |
+                                DISSECT_IMAGE_MOUNT_ROOT_ONLY |
+                                DISSECT_IMAGE_USR_NO_ROOT |
+                                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+                                DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+
+                        r = verity_settings_load(&verity_settings, img->path, NULL, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to read verity artifacts for %s: %m", img->path);
+
+                        if (verity_settings.data_path)
+                                flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
+
+                        if (!force) /* Validation is requested from the dissection code only when not forcing */
+                                flags |= DISSECT_IMAGE_VALIDATE_OS_EXT;
+
+                        r = loop_device_make_by_path(
+                                        img->path,
+                                        O_RDONLY,
+                                        /* sector_size= */ UINT32_MAX,
+                                        FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+                                        LOCK_SH,
+                                        &d);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set up loopback device for %s: %m", img->path);
+
+                        r = dissect_loop_device_and_warn(
+                                        d,
+                                        &verity_settings,
+                                        /* mount_options= */ NULL,
+                                        pick_image_policy(img),
+                                        flags,
+                                        &m);
+                        if (r < 0)
+                                return r;
+
+                        r = dissected_image_load_verity_sig_partition(
+                                        m,
+                                        d->fd,
+                                        &verity_settings);
+                        if (r < 0)
+                                return r;
+
+                        r = dissected_image_decrypt_interactively(
+                                        m, NULL,
+                                        &verity_settings,
+                                        flags);
+                        if (r < 0)
+                                return r;
+
+                        r = dissected_image_mount_and_warn(
+                                        m,
+                                        p,
+                                        /* uid_shift= */ UID_INVALID,
+                                        /* uid_range= */ UID_INVALID,
+                                        /* userns_fd= */ -EBADF,
+                                        flags);
+                        if (r < 0 && r != -ENOMEDIUM)
+                                return r;
+                        if (r == -ENOMEDIUM && !force) { /* Not fatal: count the image as ignored and move on. Presumably this is how a failed DISSECT_IMAGE_VALIDATE_OS_EXT check is reported - TODO confirm */
+                                n_ignored++;
+                                continue;
+                        }
+
+                        r = dissected_image_relinquish(m);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+                        break;
+                }
+                default:
+                        assert_not_reached();
+                }
+
+                if (force)
+                        log_debug("Force mode enabled, skipping version validation.");
+                else {
+                        r = extension_release_validate(
+                                        img->name,
+                                        host_os_release_id,
+                                        host_os_release_version_id,
+                                        host_os_release_api_level,
+                                        in_initrd() ? "initrd" : "system",
+                                        image_extension_release(img, image_class),
+                                        image_class);
+                        if (r < 0)
+                                return r;
+                        if (r == 0) { /* 0 is handled as: does not match this host, skip the image */
+                                n_ignored++;
+                                continue;
+                        }
+                }
+
+                /* Nice! This one is an extension we want. */
+                r = strv_extend(&extensions, img->name);
+                if (r < 0)
+                        return log_oom();
+
+                n_extensions ++;
+        }
+
+        /* Nothing left? Then shortcut things */
+        if (n_extensions == 0) {
+                if (n_ignored > 0)
+                        log_info("No suitable extensions found (%u ignored due to incompatible image(s)).", n_ignored);
+                else
+                        log_info("No extensions found.");
+                return 0;
+        }
+
+        /* Order by version sort with strverscmp_improved() */
+        typesafe_qsort(extensions, n_extensions, strverscmp_improvedp);
+
+        buf = strv_join(extensions, "', '");
+        if (!buf)
+                return log_oom();
+
+        log_info("Using extensions '%s'.", buf);
+
+        /* Build table of extension paths (in reverse order) */
+        paths = new0(char*, n_extensions + 1); /* One extra slot; paths is released as a string vector, so presumably it must stay NULL-terminated - TODO confirm */
+        if (!paths)
+                return log_oom();
+
+        for (size_t k = 0; k < n_extensions; k++) {
+                _cleanup_free_ char *p = NULL;
+
+                assert_se(img = hashmap_get(images, extensions[n_extensions - 1 - k])); /* Walks the sorted list from its end, hence the reversal */
+
+                p = path_join(workspace, image_class_info[image_class].short_identifier_plural, img->name);
+                if (!p)
+                        return log_oom();
+
+                paths[k] = TAKE_PTR(p);
+        }
+
+        /* Let's now unmerge the status quo ante, since to build the new overlayfs we need a reference to the
+         * underlying fs. */
+        STRV_FOREACH(h, hierarchies) {
+                _cleanup_free_ char *resolved = NULL;
+
+                r = chase(*h, arg_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to resolve hierarchy '%s%s': %m", strempty(arg_root), *h);
+
+                r = unmerge_hierarchy(image_class, resolved);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Create overlayfs mounts for all hierarchies */
+        STRV_FOREACH(h, hierarchies) {
+                _cleanup_free_ char *meta_path = NULL, *overlay_path = NULL;
+
+                meta_path = path_join(workspace, "meta", *h); /* The place where to store metadata about this instance */
+                if (!meta_path)
+                        return log_oom();
+
+                overlay_path = path_join(workspace, "overlay", *h); /* The resulting overlayfs instance */
+                if (!overlay_path)
+                        return log_oom();
+
+                r = merge_hierarchy(
+                                image_class,
+                                *h,
+                                noexec,
+                                extensions,
+                                paths,
+                                meta_path,
+                                overlay_path);
+                if (r < 0)
+                        return r;
+        }
+
+        /* And move them all into place. This is where things appear in the host namespace */
+        STRV_FOREACH(h, hierarchies) {
+                _cleanup_free_ char *p = NULL, *resolved = NULL;
+
+                p = path_join(workspace, "overlay", *h); /* Same path as overlay_path in the loop above */
+                if (!p)
+                        return log_oom();
+
+                if (laccess(p, F_OK) < 0) {
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to check if '%s' exists: %m", p);
+
+                        /* Hierarchy apparently was empty in all extensions, and wasn't mounted, ignoring.
*/ + continue; + } + + r = chase(*h, arg_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve hierarchy '%s%s': %m", strempty(arg_root), *h); + + r = mkdir_p(resolved, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create hierarchy mount point '%s': %m", resolved); + + r = mount_nofollow_verbose(LOG_ERR, p, resolved, NULL, MS_BIND, NULL); + if (r < 0) + return r; + + log_info("Merged extensions into '%s'.", resolved); + } + + return 1; +} + +static int merge(ImageClass image_class, + char **hierarchies, + bool force, + bool no_reload, + int noexec, + Hashmap *images) { + pid_t pid; + int r; + + r = safe_fork("(sd-merge)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_NEW_MOUNTNS, &pid); + if (r < 0) + return log_error_errno(r, "Failed to fork off child: %m"); + if (r == 0) { + /* Child with its own mount namespace */ + + r = merge_subprocess(image_class, hierarchies, force, noexec, images, "/run/systemd/sysext"); + if (r < 0) + _exit(EXIT_FAILURE); + + /* Our namespace ceases to exist here, also implicitly detaching all temporary mounts we + * created below /run. Nice! */ + + _exit(r > 0 ? 
EXIT_SUCCESS : 123); /* 123 means: didn't find any extensions */ + } + + r = wait_for_terminate_and_check("(sd-merge)", pid, WAIT_LOG_ABNORMAL); + if (r < 0) + return r; + + if (r == 123) /* exit code 123 means: didn't do anything */ + return 0; + + r = need_reload(image_class, hierarchies, no_reload); + if (r < 0) + return r; + if (r > 0) { + r = daemon_reload(); + if (r < 0) + return r; + } + + return 1; +} + +static int image_discover_and_read_metadata( + ImageClass image_class, + Hashmap **ret_images) { + _cleanup_hashmap_free_ Hashmap *images = NULL; + Image *img; + int r; + + assert(ret_images); + + images = hashmap_new(&image_hash_ops); + if (!images) + return log_oom(); + + r = image_discover(image_class, arg_root, images); + if (r < 0) + return log_error_errno(r, "Failed to discover images: %m"); + + HASHMAP_FOREACH(img, images) { + r = image_read_metadata(img, image_class_info[image_class].default_image_policy); + if (r < 0) + return log_error_errno(r, "Failed to read metadata for image %s: %m", img->name); + } + + *ret_images = TAKE_PTR(images); + + return 0; +} + +static int look_for_merged_hierarchies( + ImageClass image_class, + char **hierarchies, + const char **ret_which) { + int r; + + assert(ret_which); + + /* In merge mode fail if things are already merged. (In --refresh mode below we'll unmerge if we find + * things are already merged...) 
*/ + STRV_FOREACH(p, hierarchies) { + _cleanup_free_ char *resolved = NULL; + + r = chase(*p, arg_root, CHASE_PREFIX_ROOT, &resolved, NULL); + if (r == -ENOENT) { + log_debug_errno(r, "Hierarchy '%s%s' does not exist, ignoring.", strempty(arg_root), *p); + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to resolve path to hierarchy '%s%s': %m", strempty(arg_root), *p); + + r = is_our_mount_point(image_class, resolved); + if (r < 0) + return r; + if (r > 0) { + *ret_which = *p; + return 1; + } + } + + *ret_which = NULL; + return 0; +} + +static int verb_merge(int argc, char **argv, void *userdata) { + _cleanup_hashmap_free_ Hashmap *images = NULL; + const char *which; + int r; + + r = have_effective_cap(CAP_SYS_ADMIN); + if (r < 0) + return log_error_errno(r, "Failed to check if we have enough privileges: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be privileged."); + + r = image_discover_and_read_metadata(arg_image_class, &images); + if (r < 0) + return r; + + r = look_for_merged_hierarchies(arg_image_class, arg_hierarchies, &which); + if (r < 0) + return r; + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Hierarchy '%s' is already merged.", which); + + return merge(arg_image_class, + arg_hierarchies, + arg_force, + arg_no_reload, + arg_noexec, + images); +} + +typedef struct MethodMergeParameters { + const char *class; + int force; + int no_reload; + int noexec; +} MethodMergeParameters; + +static int parse_merge_parameters(Varlink *link, JsonVariant *parameters, MethodMergeParameters *p) { + + static const JsonDispatch dispatch_table[] = { + { "class", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(MethodMergeParameters, class), 0 }, + { "force", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(MethodMergeParameters, force), 0 }, + { "noReload", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(MethodMergeParameters, no_reload), 0 }, + { "noexec", JSON_VARIANT_BOOLEAN, 
json_dispatch_boolean, offsetof(MethodMergeParameters, noexec), 0 }, + {} + }; + + assert(link); + assert(parameters); + assert(p); + + return varlink_dispatch(link, parameters, dispatch_table, p); +} + +static int vl_method_merge(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_hashmap_free_ Hashmap *images = NULL; + MethodMergeParameters p = { + .force = -1, + .no_reload = -1, + .noexec = -1, + }; + _cleanup_strv_free_ char **hierarchies = NULL; + ImageClass image_class = arg_image_class; + int r; + + assert(link); + + r = parse_merge_parameters(link, parameters, &p); + if (r != 0) + return r; + + r = parse_image_class_parameter(link, p.class, &image_class, &hierarchies); + if (r < 0) + return r; + + r = image_discover_and_read_metadata(image_class, &images); + if (r < 0) + return r; + + const char *which; + r = look_for_merged_hierarchies( + image_class, + hierarchies ?: arg_hierarchies, + &which); + if (r < 0) + return r; + if (r > 0) + return varlink_errorb(link, "io.systemd.sysext.AlreadyMerged", JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("hierarchy", which))); + + r = merge(image_class, + hierarchies ?: arg_hierarchies, + p.force >= 0 ? p.force : arg_force, + p.no_reload >= 0 ? p.no_reload : arg_no_reload, + p.noexec >= 0 ? p.noexec : arg_noexec, + images); + if (r < 0) + return r; + + return varlink_reply(link, NULL); +} + +static int refresh( + ImageClass image_class, + char **hierarchies, + bool force, + bool no_reload, + int noexec) { + + _cleanup_hashmap_free_ Hashmap *images = NULL; + int r; + + r = image_discover_and_read_metadata(image_class, &images); + if (r < 0) + return r; + + /* Returns > 0 if it did something, i.e. a new overlayfs is mounted now. When it does so it + * implicitly unmounts any overlayfs placed there before. Returns == 0 if it did nothing, i.e. no + * extension images found. In this case the old overlayfs remains in place if there was one. 
*/
+        r = merge(image_class, hierarchies, force, no_reload, noexec, images);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* No images found? Then unmerge. The goal of --refresh is after all that after having
+                 * called there's a guarantee that the merge status matches the installed extensions.
+                 * That guarantee does not hold if unmerging fails, hence propagate the error instead of
+                 * reporting success. */
+                r = unmerge(image_class, hierarchies, no_reload);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Net result here is that:
+         *
+         * 1. If an overlayfs was mounted before and no extensions exist anymore, we'll have unmerged things.
+         *
+         * 2. If an overlayfs was mounted before, and there are still extensions installed, we'll have
+         *    unmerged and then merged things again.
+         *
+         * 3. If an overlayfs so far wasn't mounted, and there are extensions installed, we'll have it
+         *    mounted now.
+         *
+         * 4. If there was no overlayfs mount so far, and no extensions installed, we implement a NOP.
+         */
+
+        return 0;
+}
+
+/* Implements the "refresh" verb: requires CAP_SYS_ADMIN, then calls refresh() with the command line
+ * settings. */
+static int verb_refresh(int argc, char **argv, void *userdata) {
+        int r;
+
+        r = have_effective_cap(CAP_SYS_ADMIN);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if we have enough privileges: %m");
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be privileged.");
+
+        return refresh(arg_image_class,
+                       arg_hierarchies,
+                       arg_force,
+                       arg_no_reload,
+                       arg_noexec);
+}
+
+/* Varlink method io.systemd.sysext.Refresh. Takes the same parameters as the Merge method; every
+ * tristate that is still negative after parsing falls back to the command line setting. */
+static int vl_method_refresh(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        MethodMergeParameters p = {
+                .force = -1,
+                .no_reload = -1,
+                .noexec = -1,
+        };
+        _cleanup_strv_free_ char **hierarchies = NULL;
+        ImageClass image_class = arg_image_class;
+        int r;
+
+        assert(link);
+
+        r = parse_merge_parameters(link, parameters, &p);
+        if (r != 0)
+                return r;
+
+        r = parse_image_class_parameter(link, p.class, &image_class, &hierarchies);
+        if (r < 0)
+                return r;
+
+        r = refresh(image_class,
+                    hierarchies ?: arg_hierarchies,
+                    p.force >= 0 ? p.force : arg_force,
+                    p.no_reload >= 0 ? p.no_reload : arg_no_reload,
+                    p.noexec >= 0 ?
p.noexec : arg_noexec);
+        if (r < 0)
+                return r;
+
+        return varlink_reply(link, NULL);
+}
+/* Implements the "list" verb: discovers the images of arg_image_class below arg_root and prints them as
+ * a table with the columns name, type, path and time, sorted on the first column. If the JSON_FORMAT_OFF
+ * flag is set and nothing was found, only a log message is emitted. Returns 0 or the result of
+ * table_print_with_pager() on success paths, and a negative error on failure. */
+static int verb_list(int argc, char **argv, void *userdata) {
+        _cleanup_hashmap_free_ Hashmap *images = NULL;
+        _cleanup_(table_unrefp) Table *t = NULL;
+        Image *img;
+        int r;
+
+        images = hashmap_new(&image_hash_ops);
+        if (!images)
+                return log_oom();
+
+        r = image_discover(arg_image_class, arg_root, images);
+        if (r < 0)
+                return log_error_errno(r, "Failed to discover images: %m");
+
+        if ((arg_json_format_flags & JSON_FORMAT_OFF) && hashmap_isempty(images)) { /* Skipped when JSON output is on; the table is printed even if it is empty then */
+                log_info("No OS extensions found.");
+                return 0;
+        }
+
+        t = table_new("name", "type", "path", "time");
+        if (!t)
+                return log_oom();
+
+        HASHMAP_FOREACH(img, images) {
+                r = table_add_many(
+                                t,
+                                TABLE_STRING, img->name,
+                                TABLE_STRING, image_type_to_string(img->type),
+                                TABLE_PATH, img->path,
+                                TABLE_TIMESTAMP, img->mtime != 0 ? img->mtime : img->crtime); /* Falls back to crtime when mtime is zero */
+                if (r < 0)
+                        return table_log_add_error(r);
+        }
+
+        (void) table_set_sort(t, (size_t) 0); /* Failure to set the sort column is deliberately ignored */
+
+        return table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend);
+}
+/* Parameters of the Varlink List method. class stays NULL if the client did not pass it. */
+typedef struct MethodListParameters {
+        const char *class;
+} MethodListParameters;
+/* Varlink method io.systemd.sysext.List. Discovers the images of the requested class (arg_image_class
+ * unless the class parameter overrides it) and sends one JSON object per image: all but the last one via
+ * varlink_notify(), the last one via varlink_reply(). If there is no image at all the
+ * io.systemd.sysext.NoImagesFound error is returned. */
+static int vl_method_list(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "class", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(MethodListParameters, class), 0 },
+                {}
+        };
+        MethodListParameters p = {
+        };
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_hashmap_free_ Hashmap *images = NULL;
+        ImageClass image_class = arg_image_class;
+        Image *img;
+        int r;
+
+        assert(link);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        r = parse_image_class_parameter(link, p.class, &image_class, NULL); /* NULL: the hierarchies of the class are not needed for listing */
+        if (r < 0)
+                return r;
+
+        images = hashmap_new(&image_hash_ops);
+        if (!images)
+                return -ENOMEM;
+
+        r = image_discover(image_class,
arg_root, images);
+        if (r < 0)
+                return r;
+
+        HASHMAP_FOREACH(img, images) { /* v always holds the most recently converted image that was not sent yet */
+                if (v) {
+                        /* Send previous item with more=true */
+                        r = varlink_notify(link, v);
+                        if (r < 0)
+                                return r;
+                }
+
+                v = json_variant_unref(v);
+
+                r = image_to_json(img, &v);
+                if (r < 0)
+                        return r;
+        }
+
+        if (v) /* Send final item with more=false */
+                return varlink_reply(link, v);
+
+        return varlink_error(link, "io.systemd.sysext.NoImagesFound", NULL);
+}
+/* Prints the usage text to stdout. Returns 0, or the result of log_oom() if the man page link cannot be
+ * generated. Also registered as the "help" verb, hence the verb signature. */
+static int verb_help(int argc, char **argv, void *userdata) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        r = terminal_urlify_man("systemd-sysext", "8", &link);
+        if (r < 0)
+                return log_oom();
+        /* NOTE(review): the literals below carry single spaces where column alignment would be expected;
+         * check them against the pristine source before relying on the layout of the help text. */
+        printf("%1$s [OPTIONS...] COMMAND\n"
+               "\n%5$sMerge extension images into /usr/ and /opt/ hierarchies for\n"
+               " sysext and into the /etc/ hierarchy for confext.%6$s\n"
+               " status Show current merge status (default)\n"
+               " merge Merge extensions into relevant hierarchies\n"
+               " unmerge Unmerge extensions from relevant hierarchies\n"
+               " refresh Unmerge/merge extensions again\n"
+               " list List installed extensions\n"
+               " -h --help Show this help\n"
+               " --version Show package version\n"
+               "\n%3$sOptions:%4$s\n"
+               " --no-pager Do not pipe output into a pager\n"
+               " --no-legend Do not show the headers and footers\n"
+               " --root=PATH Operate relative to root path\n"
+               " --json=pretty|short|off\n"
+               " Generate JSON output\n"
+               " --force Ignore version incompatibilities\n"
+               " --no-reload Do not reload the service manager\n"
+               " --image-policy=POLICY\n"
+               " Specify disk image dissection policy\n"
+               " --noexec=BOOL Whether to mount extension overlay with noexec\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name, /* %1$s */
+               link, /* %2$s */
+               ansi_underline(), /* %3$s */
+               ansi_normal(), /* %4$s */
+               ansi_highlight(), /* %5$s */
+               ansi_normal()); /* %6$s */
+
+        return 0;
+}
+/* Parses the command line into the arg_* variables and checks whether we were invoked as a Varlink
+ * service. Returns 1 if the caller shall go on; run() returns any value <= 0 as is, which covers errors
+ * (< 0) as well as --help, whose handler returns 0. */
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_NO_LEGEND,
+                ARG_ROOT,
+                ARG_JSON,
+                ARG_FORCE,
+                ARG_IMAGE_POLICY,
+                ARG_NOEXEC,
+                ARG_NO_RELOAD,
+        };
+
+        static const struct option options[] = {
+                { "help", no_argument, NULL, 'h' },
+                { "version", no_argument, NULL, ARG_VERSION },
+                { "no-pager", no_argument, NULL, ARG_NO_PAGER },
+                { "no-legend", no_argument, NULL, ARG_NO_LEGEND },
+                { "root", required_argument, NULL, ARG_ROOT },
+                { "json", required_argument, NULL, ARG_JSON },
+                { "force", no_argument, NULL, ARG_FORCE },
+                { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
+                { "noexec", required_argument, NULL, ARG_NOEXEC },
+                { "no-reload", no_argument, NULL, ARG_NO_RELOAD },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return verb_help(argc, argv, NULL);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_NO_LEGEND:
+                        arg_legend = false;
+                        break;
+
+                case ARG_ROOT:
+                        r = parse_path_argument(optarg, false, &arg_root);
+                        if (r < 0)
+                                return r;
+                        /* If --root= is provided, do not reload the service manager */
+                        arg_no_reload = true;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0) /* Unlike the other options this also stops on 0; presumably that means the argument was handled completely already - TODO confirm */
+                                return r;
+
+                        break;
+
+                case ARG_FORCE:
+                        arg_force = true;
+                        break;
+
+                case ARG_IMAGE_POLICY:
+                        r = parse_image_policy_argument(optarg, &arg_image_policy);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_NOEXEC:
+                        r = parse_boolean_argument("--noexec", optarg, NULL);
+                        if (r < 0)
+                                return r;
+
+                        arg_noexec = r; /* The parsed boolean is taken from the return value, no output pointer is passed */
+                        break;
+
+                case ARG_NO_RELOAD:
+                        arg_no_reload = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        r = varlink_invocation(VARLINK_ALLOW_ACCEPT);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check if invoked in Varlink mode: %m");
+        if (r > 0)
+                arg_varlink = true;
+
+        return 1;
+}
+/* Dispatches to the verb named on the command line; "status" is flagged VERB_DEFAULT. */
+static int sysext_main(int argc, char *argv[]) {
+
+        static const Verb verbs[] = {
+                { "status", VERB_ANY, 1, VERB_DEFAULT, verb_status },
+                { "merge", VERB_ANY, 1, 0, verb_merge },
+                { "unmerge", VERB_ANY, 1, 0, verb_unmerge },
+                { "refresh", VERB_ANY, 1, 0, verb_refresh },
+                { "list", VERB_ANY, 1, 0, verb_list },
+                { "help", VERB_ANY, 1, 0, verb_help },
+                {}
+        };
+
+        return dispatch_verb(argc, argv, verbs, NULL);
+}
+/* Program logic: selects the image class from the name we were invoked as (IMAGE_CONFEXT for
+ * "systemd-confext", IMAGE_SYSEXT otherwise), parses the command line and the hierarchies environment
+ * variable, and then either serves the Varlink interface or dispatches the requested verb. */
+static int run(int argc, char *argv[]) {
+        int r;
+
+        log_setup();
+
+        arg_image_class = invoked_as(argv, "systemd-confext") ? IMAGE_CONFEXT : IMAGE_SYSEXT;
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        /* For debugging purposes it might make sense to do this for other hierarchies than /usr/ and
+         * /opt/, but let's make that a hacker/debugging feature, i.e. env var instead of cmdline
+         * switch. */
+        r = parse_env_extension_hierarchies(&arg_hierarchies, image_class_info[arg_image_class].name_env);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse environment variable: %m");
+
+        if (arg_varlink) {
+                _cleanup_(varlink_server_unrefp) VarlinkServer *varlink_server = NULL;
+
+                /* Invocation as Varlink service */
+
+                r = varlink_server_new(&varlink_server, VARLINK_SERVER_ROOT_ONLY);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate Varlink server: %m");
+
+                r = varlink_server_add_interface(varlink_server, &vl_interface_io_systemd_sysext);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add Varlink interface: %m");
+
+                r = varlink_server_bind_method_many(
+                                varlink_server,
+                                "io.systemd.sysext.Merge", vl_method_merge,
+                                "io.systemd.sysext.Unmerge", vl_method_unmerge,
+                                "io.systemd.sysext.Refresh", vl_method_refresh,
+                                "io.systemd.sysext.List", vl_method_list);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to bind Varlink methods: %m");
+
+                r = varlink_server_loop_auto(varlink_server);
+                if (r == -EPERM) /* Gets its own message; presumably a consequence of VARLINK_SERVER_ROOT_ONLY above - TODO confirm */
+                        return log_error_errno(r, "Invoked by unprivileged Varlink peer, refusing.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to run Varlink event loop: %m");
+
+                return EXIT_SUCCESS;
+        }
+
+        return sysext_main(argc, argv);
+}
+
+DEFINE_MAIN_FUNCTION(run); diff --git a/src/system-update-generator/meson.build b/src/system-update-generator/meson.build new file mode 100644 index 0000000..cc62919 --- /dev/null +++ b/src/system-update-generator/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + generator_template + { + 'name' : 'systemd-system-update-generator', + 'sources' : files('system-update-generator.c'), + }, +] diff --git a/src/system-update-generator/system-update-generator.c b/src/system-update-generator/system-update-generator.c new file mode 100644 index 0000000..a1782d5 --- /dev/null +++ b/src/system-update-generator/system-update-generator.c @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fs-util.h" +#include "generator.h" +#include "initrd-util.h" +#include "log.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "special.h" +#include "string-util.h" +#include "unit-file.h" + +/* + * Implements the logic described in systemd.offline-updates(7). + */ + +static const char *arg_dest = NULL; + +static int generate_symlink(void) { + FOREACH_STRING(p, "/system-update", "/etc/system-update") { + if (laccess(p, F_OK) >= 0) { + _cleanup_free_ char *j = NULL; + + j = path_join(arg_dest, SPECIAL_DEFAULT_TARGET); + if (!j) + return log_oom(); + + if (symlink(SYSTEM_DATA_UNIT_DIR "/system-update.target", j) < 0) + return log_error_errno(errno, "Failed to create symlink %s: %m", j); + + return 1; + } + + if (errno != ENOENT) + log_warning_errno(errno, "Failed to check if %s symlink exists, ignoring: %m", p); + } + + return 0; +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + assert(key); + + /* Check if a run level is specified on the kernel command line. The + * command line has higher priority than any on-disk configuration, so + * it'll make any symlink we create moot. 
+ */ + + if (streq(key, "systemd.unit") && !proc_cmdline_value_missing(key, value)) + log_warning("Offline system update overridden by kernel command line systemd.unit= setting"); + else if (!value && runlevel_to_target(key)) + log_warning("Offline system update overridden by runlevel \"%s\" on the kernel command line", key); + + return 0; +} + +static int run(const char *dest, const char *dest_early, const char *dest_late) { + int r; + + assert_se(arg_dest = dest_early); + + if (in_initrd()) { + log_debug("Skipping generator, running in the initrd."); + return EXIT_SUCCESS; + } + + r = generate_symlink(); + if (r <= 0) + return r; + + /* We parse the command line only to emit warnings. */ + r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + + return 0; +} + +DEFINE_MAIN_GENERATOR_FUNCTION(run); diff --git a/src/systemctl/fuzz-systemctl-parse-argv.c b/src/systemctl/fuzz-systemctl-parse-argv.c new file mode 100644 index 0000000..9ea8f7a --- /dev/null +++ b/src/systemctl/fuzz-systemctl-parse-argv.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "env-util.h" +#include "fd-util.h" +#include "fuzz.h" +#include "nulstr-util.h" +#include "selinux-util.h" +#include "static-destruct.h" +#include "stdio-util.h" +#include "strv.h" +#include "systemctl.h" +#include "systemctl-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_strv_free_ char **argv = NULL; + _cleanup_close_ int orig_stdout_fd = -EBADF; + int r; + + if (size > 16*1024) + return 0; /* See the comment below about the limit for strv_length(). */ + + fuzz_setup_logging(); + + arg_pager_flags = PAGER_DISABLE; /* We shouldn't execute the pager */ + + argv = strv_parse_nulstr((const char *)data, size); + if (!argv) + return log_oom(); + + if (!argv[0]) + return 0; /* argv[0] should always be present, but may be zero-length. 
*/ + if (strv_length(argv) > 1024) + return 0; /* oss-fuzz reports timeouts which are caused by appending to a very long strv. + * The code is indeed not very efficient, but it's designed for normal command-line + * use, where we don't expect more than a dozen of entries. The fact that it is + * slow with ~100k entries is not particularly interesting. Let's just refuse such + * long command lines. */ + + if (getenv_bool("SYSTEMD_FUZZ_OUTPUT") <= 0) { + orig_stdout_fd = fcntl(fileno(stdout), F_DUPFD_CLOEXEC, 3); + if (orig_stdout_fd < 0) + log_warning_errno(errno, "Failed to duplicate fd 1: %m"); + else + assert_se(freopen("/dev/null", "w", stdout)); + + opterr = 0; /* do not print errors */ + } + + optind = 0; /* this tells the getopt machinery to reinitialize */ + + r = systemctl_dispatch_parse_argv(strv_length(argv), argv); + if (r < 0) + log_error_errno(r, "Failed to parse args: %m"); + else + log_info(r == 0 ? "Done!" : "Action!"); + + if (orig_stdout_fd >= 0) + assert_se(freopen(FORMAT_PROC_FD_PATH(orig_stdout_fd), "w", stdout)); + + release_busses(); /* We open the bus for communication with logind. + * It needs to be closed to avoid apparent leaks. */ + + mac_selinux_finish(); + + /* Call static destructors to do global state cleanup. We do it here, and not in fuzz-main.c so that + * any global state is destroyed between fuzzer runs. 
*/ + static_destruct(); + + return 0; +} diff --git a/src/systemctl/meson.build b/src/systemctl/meson.build new file mode 100644 index 0000000..255c639 --- /dev/null +++ b/src/systemctl/meson.build @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemctl_sources = files( + 'systemctl-add-dependency.c', + 'systemctl-cancel-job.c', + 'systemctl-clean-or-freeze.c', + 'systemctl-compat-halt.c', + 'systemctl-compat-runlevel.c', + 'systemctl-compat-shutdown.c', + 'systemctl-compat-telinit.c', + 'systemctl-daemon-reload.c', + 'systemctl-edit.c', + 'systemctl-enable.c', + 'systemctl-is-active.c', + 'systemctl-is-enabled.c', + 'systemctl-is-system-running.c', + 'systemctl-kill.c', + 'systemctl-list-dependencies.c', + 'systemctl-list-jobs.c', + 'systemctl-list-machines.c', + 'systemctl-list-unit-files.c', + 'systemctl-list-units.c', + 'systemctl-log-setting.c', + 'systemctl-logind.c', + 'systemctl-mount.c', + 'systemctl-preset-all.c', + 'systemctl-reset-failed.c', + 'systemctl-service-watchdogs.c', + 'systemctl-set-default.c', + 'systemctl-set-environment.c', + 'systemctl-set-property.c', + 'systemctl-show.c', + 'systemctl-start-special.c', + 'systemctl-start-unit.c', + 'systemctl-switch-root.c', + 'systemctl-sysv-compat.c', + 'systemctl-trivial-method.c', + 'systemctl-util.c', + 'systemctl-whoami.c', + 'systemctl.c', +) + +if get_option('link-systemctl-shared') + systemctl_link_with = [libshared] +else + systemctl_link_with = [libsystemd_static, + libshared_static, + libbasic_gcrypt] +endif + +executables += [ + executable_template + { + 'name' : 'systemctl', + 'public' : true, + 'sources' : systemctl_sources, + 'link_with' : systemctl_link_with, + 'dependencies' : [ + libcap, + liblz4, + libselinux, + libxz, + libzstd, + threads, + ], + }, + fuzz_template + { + 'sources' : [ + files('fuzz-systemctl-parse-argv.c'), + systemctl_sources, + ], + 'link_with' : systemctl_link_with, + 'c_args' : ['-DFUZZ_SYSTEMCTL_PARSE_ARGV'], + }, +] + +foreach alias : 
(['halt', 'poweroff', 'reboot', 'shutdown'] + + (conf.get('HAVE_SYSV_COMPAT') == 1 ? ['runlevel', 'telinit'] : [])) + install_emptydir(sbindir) + meson.add_install_script(sh, '-c', + ln_s.format(bindir / 'systemctl', + sbindir / alias)) +endforeach diff --git a/src/systemctl/systemctl-add-dependency.c b/src/systemctl/systemctl-add-dependency.c new file mode 100644 index 0000000..8df25b5 --- /dev/null +++ b/src/systemctl/systemctl-add-dependency.c @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl-add-dependency.h" +#include "systemctl-daemon-reload.h" +#include "systemctl-util.h" +#include "systemctl.h" + +int verb_add_dependency(int argc, char *argv[], void *userdata) { + _cleanup_strv_free_ char **names = NULL; + _cleanup_free_ char *target = NULL; + const char *verb = argv[0]; + UnitDependency dep; + int r; + + if (!argv[1]) + return 0; + + r = unit_name_mangle_with_suffix(argv[1], "as target", + arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, + ".target", &target); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + r = mangle_names("as dependency", strv_skip(argv, 2), &names); + if (r < 0) + return r; + + if (streq(verb, "add-wants")) + dep = UNIT_WANTS; + else if (streq(verb, "add-requires")) + dep = UNIT_REQUIRES; + else + assert_not_reached(); + + if (install_client_side()) { + InstallChange *changes = NULL; + size_t n_changes = 0; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + r = unit_file_add_dependency(arg_runtime_scope, unit_file_flags_from_args(), arg_root, names, target, dep, &changes, &n_changes); + install_changes_dump(r, "add dependency on", changes, n_changes, arg_quiet); + if (r < 0) + return r; + } else { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "AddDependencyUnitFiles"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, names); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "ssbb", target, unit_dependency_to_string(dep), arg_runtime, arg_force); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to add dependency: %s", bus_error_message(&error, r)); + + r = bus_deserialize_and_dump_unit_file_changes(reply, arg_quiet); + if (r < 0) + return r; + + if (!arg_no_reload) { + r = daemon_reload(ACTION_RELOAD, /* graceful= */ false); + if (r < 0) + return r; + } + } + + return 0; +} diff --git a/src/systemctl/systemctl-add-dependency.h b/src/systemctl/systemctl-add-dependency.h new file mode 100644 index 0000000..11e5c82 --- /dev/null +++ b/src/systemctl/systemctl-add-dependency.h @@ -0,0 
+1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_add_dependency(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-cancel-job.c b/src/systemctl/systemctl-cancel-job.c new file mode 100644 index 0000000..e9f34c1 --- /dev/null +++ b/src/systemctl/systemctl-cancel-job.c @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "parse-util.h" +#include "systemctl-cancel-job.h" +#include "systemctl-trivial-method.h" +#include "systemctl-util.h" +#include "systemctl.h" + +int verb_cancel(int argc, char *argv[], void *userdata) { + sd_bus *bus; + int r; + + if (argc <= 1) /* Shortcut to trivial_method() if no argument is given */ + return verb_trivial_method(argc, argv, userdata); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + r = 0; + + STRV_FOREACH(name, strv_skip(argv, 1)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + uint32_t id; + int q; + + q = safe_atou32(*name, &id); + if (q < 0) + return log_error_errno(q, "Failed to parse job id \"%s\": %m", *name); + + q = bus_call_method(bus, bus_systemd_mgr, "CancelJob", &error, NULL, "u", id); + if (q < 0) { + log_warning_errno(q, "Failed to cancel job %"PRIu32", ignoring: %s", + id, bus_error_message(&error, q)); + RET_GATHER(r, q); + } + } + + return r; +} diff --git a/src/systemctl/systemctl-cancel-job.h b/src/systemctl/systemctl-cancel-job.h new file mode 100644 index 0000000..397e515 --- /dev/null +++ b/src/systemctl/systemctl-cancel-job.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_cancel(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-clean-or-freeze.c b/src/systemctl/systemctl-clean-or-freeze.c new file mode 100644 index 0000000..40d5f6d --- /dev/null +++ b/src/systemctl/systemctl-clean-or-freeze.c @@ -0,0 +1,100 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-wait-for-units.h" +#include "systemctl-clean-or-freeze.h" +#include "systemctl-util.h" +#include "systemctl.h" + +int verb_clean_or_freeze(int argc, char *argv[], void *userdata) { + _cleanup_(bus_wait_for_units_freep) BusWaitForUnits *w = NULL; + _cleanup_strv_free_ char **names = NULL; + int r, ret = EXIT_SUCCESS; + const char *method; + sd_bus *bus; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + if (!arg_clean_what) { + arg_clean_what = strv_new("cache", "runtime", "fdstore"); + if (!arg_clean_what) + return log_oom(); + } + + r = expand_unit_names(bus, strv_skip(argv, 1), NULL, &names, NULL); + if (r < 0) + return log_error_errno(r, "Failed to expand names: %m"); + + if (!arg_no_block) { + r = bus_wait_for_units_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Failed to allocate unit waiter: %m"); + } + + if (streq(argv[0], "clean")) + method = "CleanUnit"; + else if (streq(argv[0], "freeze")) + method = "FreezeUnit"; + else if (streq(argv[0], "thaw")) + method = "ThawUnit"; + else + assert_not_reached(); + + STRV_FOREACH(name, names) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + + if (w) { + /* If we shall wait for the cleaning to complete, let's add a ref on the unit first */ + r = bus_call_method(bus, bus_systemd_mgr, "RefUnit", &error, NULL, "s", *name); + if (r < 0) { + log_error_errno(r, "Failed to add reference to unit %s: %s", *name, bus_error_message(&error, r)); + if (ret == EXIT_SUCCESS) + ret = r; + continue; + } + } + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, method); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", *name); + if (r < 0) + return bus_log_create_error(r); + + if (streq(method, "CleanUnit")) { + r = 
sd_bus_message_append_strv(m, arg_clean_what); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_call(bus, m, 0, &error, NULL); + if (r < 0) { + log_error_errno(r, "Failed to %s unit %s: %s", argv[0], *name, bus_error_message(&error, r)); + if (ret == EXIT_SUCCESS) + ret = r; + /* Skip waiting on every unit whose call failed, not just the first one */ + continue; + } + + if (w) { + r = bus_wait_for_units_add_unit(w, *name, BUS_WAIT_REFFED|BUS_WAIT_FOR_MAINTENANCE_END, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to watch unit %s: %m", *name); + } + } + + r = bus_wait_for_units_run(w); + if (r < 0) + return log_error_errno(r, "Failed to wait for units: %m"); + if (r == BUS_WAIT_FAILURE) + ret = EXIT_FAILURE; + + return ret; +} diff --git a/src/systemctl/systemctl-clean-or-freeze.h b/src/systemctl/systemctl-clean-or-freeze.h new file mode 100644 index 0000000..5f2bca4 --- /dev/null +++ b/src/systemctl/systemctl-clean-or-freeze.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_clean_or_freeze(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-compat-halt.c b/src/systemctl/systemctl-compat-halt.c new file mode 100644 index 0000000..4f6e304 --- /dev/null +++ b/src/systemctl/systemctl-compat-halt.c @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-daemon.h" + +#include "alloc-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "reboot-util.h" +#include "systemctl-compat-halt.h" +#include "systemctl-compat-telinit.h" +#include "systemctl-logind.h" +#include "systemctl-start-unit.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" +#include "utmp-wtmp.h" + +static int halt_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("halt", "8", &link); + if (r < 0) + return log_oom(); + + /* Note: if you are tempted to add new command line switches here, please do not. 
Let this + * compatibility command rest in peace. Its interface is not even owned by us as much as it is by + * sysvinit. If you add something new, add it to "systemctl halt", "systemctl reboot", "systemctl + * poweroff" instead. */ + + printf("%s [OPTIONS...]%s\n" + "\n%s%s the system.%s\n" + "\nOptions:\n" + " --help Show this help\n" + " --halt Halt the machine\n" + " -p --poweroff Switch off the machine\n" + " --reboot Reboot the machine\n" + " -f --force Force immediate halt/power-off/reboot\n" + " -w --wtmp-only Don't halt/power-off/reboot, just write wtmp record\n" + " -d --no-wtmp Don't write wtmp record\n" + " --no-wall Don't send wall message before halt/power-off/reboot\n" + "\n%sThis is a compatibility interface, please use the more powerful 'systemctl %s' command instead.%s\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + arg_action == ACTION_REBOOT ? " [ARG]" : "", + ansi_highlight(), arg_action == ACTION_REBOOT ? "Reboot" : + arg_action == ACTION_POWEROFF ? "Power off" : + "Halt", ansi_normal(), + ansi_highlight_red(), arg_action == ACTION_REBOOT ? "reboot" : + arg_action == ACTION_POWEROFF ? 
"poweroff" : + "halt", ansi_normal(), + link); + + return 0; +} + +int halt_parse_argv(int argc, char *argv[]) { + enum { + ARG_HELP = 0x100, + ARG_HALT, + ARG_REBOOT, + ARG_NO_WALL + }; + + static const struct option options[] = { + { "help", no_argument, NULL, ARG_HELP }, + { "halt", no_argument, NULL, ARG_HALT }, + { "poweroff", no_argument, NULL, 'p' }, + { "reboot", no_argument, NULL, ARG_REBOOT }, + { "force", no_argument, NULL, 'f' }, + { "wtmp-only", no_argument, NULL, 'w' }, + { "no-wtmp", no_argument, NULL, 'd' }, + { "no-sync", no_argument, NULL, 'n' }, + { "no-wall", no_argument, NULL, ARG_NO_WALL }, + {} + }; + + int c, r, runlevel; + + assert(argc >= 0); + assert(argv); + + /* called in sysvinit system as last command in shutdown/reboot so this is always forceful */ + if (utmp_get_runlevel(&runlevel, NULL) >= 0) + if (IN_SET(runlevel, '0', '6')) + arg_force = 2; + + while ((c = getopt_long(argc, argv, "pfwdnih", options, NULL)) >= 0) + switch (c) { + + case ARG_HELP: + return halt_help(); + + case ARG_HALT: + arg_action = ACTION_HALT; + break; + + case 'p': + if (arg_action != ACTION_REBOOT) + arg_action = ACTION_POWEROFF; + break; + + case ARG_REBOOT: + arg_action = ACTION_REBOOT; + break; + + case 'f': + arg_force = 2; + break; + + case 'w': + arg_dry_run = true; + break; + + case 'd': + arg_no_wtmp = true; + break; + + case 'n': + arg_no_sync = true; + break; + + case ARG_NO_WALL: + arg_no_wall = true; + break; + + case 'i': + case 'h': + /* Compatibility nops */ + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_action == ACTION_REBOOT && (argc == optind || argc == optind + 1)) { + r = update_reboot_parameter_and_warn(argc == optind + 1 ? 
argv[optind] : NULL, false); + if (r < 0) + return r; + } else if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many arguments."); + + return 1; +} + +int halt_main(void) { + int r; + + if (arg_force == 0) { + /* always try logind first */ + if (arg_when > 0) + r = logind_schedule_shutdown(arg_action); + else { + r = logind_check_inhibitors(arg_action); + if (r < 0) + return r; + + r = logind_reboot(arg_action); + } + if (r >= 0) + return r; + if (IN_SET(r, -EACCES, -EOPNOTSUPP, -EINPROGRESS)) + /* Requested operation requires auth, is not supported on the local system or already in + * progress */ + return r; + /* on all other errors, try low-level operation */ + + /* In order to minimize the difference between operation with and without logind, we explicitly + * enable non-blocking mode for this, as logind's shutdown operations are always non-blocking. */ + arg_no_block = true; + + if (!arg_dry_run) + return start_with_fallback(); + } + + if (geteuid() != 0) { + (void) must_be_root(); + return -EPERM; + } + + if (!arg_no_wtmp) { + if (sd_booted() > 0) + log_debug("Not writing utmp record, assuming that systemd-update-utmp is used."); + else { + r = utmp_put_shutdown(); + if (r < 0) + log_warning_errno(r, "Failed to write utmp record: %m"); + } + } + + if (arg_dry_run) + return 0; + + r = halt_now(arg_action); + return log_error_errno(r, "Failed to %s: %m", action_table[arg_action].verb); +} diff --git a/src/systemctl/systemctl-compat-halt.h b/src/systemctl/systemctl-compat-halt.h new file mode 100644 index 0000000..85b9dda --- /dev/null +++ b/src/systemctl/systemctl-compat-halt.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int halt_parse_argv(int argc, char *argv[]); + +int halt_main(void); diff --git a/src/systemctl/systemctl-compat-runlevel.c b/src/systemctl/systemctl-compat-runlevel.c new file mode 100644 index 0000000..04b6b76 --- /dev/null +++ b/src/systemctl/systemctl-compat-runlevel.c @@ 
-0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "pretty-print.h" +#include "systemctl-compat-runlevel.h" +#include "systemctl.h" +#include "terminal-util.h" +#include "utmp-wtmp.h" + +static int runlevel_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("runlevel", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n" + "\n%sPrints the previous and current runlevel of the init system.%s\n" + "\nOptions:\n" + " --help Show this help\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +int runlevel_parse_argv(int argc, char *argv[]) { + enum { + ARG_HELP = 0x100, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, ARG_HELP }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "", options, NULL)) >= 0) + switch (c) { + + case ARG_HELP: + return runlevel_help(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many arguments."); + + return 1; +} + +int runlevel_main(void) { + int r, runlevel, previous; + + r = utmp_get_runlevel(&runlevel, &previous); + if (r < 0) { + puts("unknown"); + return r; + } + + printf("%c %c\n", + previous <= 0 ? 'N' : previous, + runlevel <= 0 ? 
'N' : runlevel); + + return 0; +} diff --git a/src/systemctl/systemctl-compat-runlevel.h b/src/systemctl/systemctl-compat-runlevel.h new file mode 100644 index 0000000..658524b --- /dev/null +++ b/src/systemctl/systemctl-compat-runlevel.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int runlevel_parse_argv(int argc, char *argv[]); + +int runlevel_main(void); diff --git a/src/systemctl/systemctl-compat-shutdown.c b/src/systemctl/systemctl-compat-shutdown.c new file mode 100644 index 0000000..881d00e --- /dev/null +++ b/src/systemctl/systemctl-compat-shutdown.c @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "pretty-print.h" +#include "reboot-util.h" +#include "systemctl-compat-shutdown.h" +#include "systemctl-sysv-compat.h" +#include "systemctl.h" +#include "terminal-util.h" + +static int shutdown_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("shutdown", "8", &link); + if (r < 0) + return log_oom(); + + /* Note: if you are tempted to add new command line switches here, please do not. Let this + * compatibility command rest in peace. Its interface is not even owned by us as much as it is by + * sysvinit. If you add something new, add it to "systemctl halt", "systemctl reboot", "systemctl + * poweroff" instead. */ + + printf("%s [OPTIONS...] 
[TIME] [WALL...]\n" + "\n%sShut down the system.%s\n" + "\nOptions:\n" + " --help Show this help\n" + " -H --halt Halt the machine\n" + " -P --poweroff Power-off the machine\n" + " -r --reboot Reboot the machine\n" + " -h Equivalent to --poweroff, overridden by --halt\n" + " -k Don't halt/power-off/reboot, just send warnings\n" + " --no-wall Don't send wall message before halt/power-off/reboot\n" + " -c Cancel a pending shutdown\n" + " --show Show pending shutdown\n" + "\n%sThis is a compatibility interface, please use the more powerful 'systemctl halt',\n" + "'systemctl poweroff', 'systemctl reboot' commands instead.%s\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), ansi_normal(), + ansi_highlight_red(), ansi_normal(), + link); + + return 0; +} + +int shutdown_parse_argv(int argc, char *argv[]) { + enum { + ARG_HELP = 0x100, + ARG_NO_WALL, + ARG_SHOW + }; + + static const struct option options[] = { + { "help", no_argument, NULL, ARG_HELP }, + { "halt", no_argument, NULL, 'H' }, + { "poweroff", no_argument, NULL, 'P' }, + { "reboot", no_argument, NULL, 'r' }, + { "kexec", no_argument, NULL, 'K' }, /* not documented extension */ + { "no-wall", no_argument, NULL, ARG_NO_WALL }, + { "show", no_argument, NULL, ARG_SHOW }, + {} + }; + + char **wall = NULL; + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "HPrhkKat:fFc", options, NULL)) >= 0) + switch (c) { + + case ARG_HELP: + return shutdown_help(); + + case 'H': + arg_action = ACTION_HALT; + break; + + case 'P': + arg_action = ACTION_POWEROFF; + break; + + case 'r': + if (kexec_loaded()) + arg_action = ACTION_KEXEC; + else + arg_action = ACTION_REBOOT; + break; + + case 'K': + arg_action = ACTION_KEXEC; + break; + + case 'h': + if (arg_action != ACTION_HALT) + arg_action = ACTION_POWEROFF; + break; + + case 'k': + arg_dry_run = true; + break; + + case ARG_NO_WALL: + arg_no_wall = true; + break; + + case 'a': + case 't': /* Note that 
we also ignore any passed argument to -t, not just the -t itself */ + case 'f': + case 'F': + /* Compatibility nops */ + break; + + case 'c': + arg_action = ACTION_CANCEL_SHUTDOWN; + break; + + case ARG_SHOW: + arg_action = ACTION_SHOW_SHUTDOWN; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (argc > optind && arg_action != ACTION_CANCEL_SHUTDOWN) { + r = parse_shutdown_time_spec(argv[optind], &arg_when); + if (r < 0) { + log_error("Failed to parse time specification: %s", argv[optind]); + return r; + } + } else + arg_when = now(CLOCK_REALTIME) + USEC_PER_MINUTE; + + if (argc > optind && arg_action == ACTION_CANCEL_SHUTDOWN) + /* No time argument for shutdown cancel */ + wall = argv + optind; + else if (argc > optind + 1) + /* We skip the time argument */ + wall = argv + optind + 1; + + if (wall) { + char **copy = strv_copy(wall); + if (!copy) + return log_oom(); + strv_free_and_replace(arg_wall, copy); + } + + optind = argc; + + return 1; +} diff --git a/src/systemctl/systemctl-compat-shutdown.h b/src/systemctl/systemctl-compat-shutdown.h new file mode 100644 index 0000000..7acf941 --- /dev/null +++ b/src/systemctl/systemctl-compat-shutdown.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int shutdown_parse_argv(int argc, char *argv[]); diff --git a/src/systemctl/systemctl-compat-telinit.c b/src/systemctl/systemctl-compat-telinit.c new file mode 100644 index 0000000..20325e5 --- /dev/null +++ b/src/systemctl/systemctl-compat-telinit.c @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "pretty-print.h" +#include "rlimit-util.h" +#include "systemctl-compat-telinit.h" +#include "systemctl-daemon-reload.h" +#include "systemctl-start-unit.h" +#include "systemctl-sysv-compat.h" +#include "systemctl.h" +#include "terminal-util.h" + +static int telinit_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = 
terminal_urlify_man("telinit", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND\n\n" + "%sSend control commands to the init daemon.%s\n" + "\nCommands:\n" + " 0 Power-off the machine\n" + " 6 Reboot the machine\n" + " 2, 3, 4, 5 Start runlevelX.target unit\n" + " 1, s, S Enter rescue mode\n" + " q, Q Reload init daemon configuration\n" + " u, U Reexecute init daemon\n" + "\nOptions:\n" + " --help Show this help\n" + " --no-wall Don't send wall message before halt/power-off/reboot\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +int telinit_parse_argv(int argc, char *argv[]) { + enum { + ARG_HELP = 0x100, + ARG_NO_WALL + }; + + static const struct option options[] = { + { "help", no_argument, NULL, ARG_HELP }, + { "no-wall", no_argument, NULL, ARG_NO_WALL }, + {} + }; + + static const struct { + char from; + enum action to; + } table[] = { + { '0', ACTION_POWEROFF }, + { '6', ACTION_REBOOT }, + { '1', ACTION_RESCUE }, + { '2', ACTION_RUNLEVEL2 }, + { '3', ACTION_RUNLEVEL3 }, + { '4', ACTION_RUNLEVEL4 }, + { '5', ACTION_RUNLEVEL5 }, + { 's', ACTION_RESCUE }, + { 'S', ACTION_RESCUE }, + { 'q', ACTION_RELOAD }, + { 'Q', ACTION_RELOAD }, + { 'u', ACTION_REEXEC }, + { 'U', ACTION_REEXEC } + }; + + unsigned i; + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "", options, NULL)) >= 0) + switch (c) { + + case ARG_HELP: + return telinit_help(); + + case ARG_NO_WALL: + arg_no_wall = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: required argument missing.", + program_invocation_short_name); + + if (optind + 1 < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many arguments."); + + if (strlen(argv[optind]) != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Expected single 
character argument."); + + for (i = 0; i < ELEMENTSOF(table); i++) + if (table[i].from == argv[optind][0]) + break; + + if (i >= ELEMENTSOF(table)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown command '%s'.", argv[optind]); + + arg_action = table[i].to; + + optind++; + + return 1; +} + +int start_with_fallback(void) { + int r; + + /* First, try systemd via D-Bus. */ + r = verb_start(0, NULL, NULL); + if (r == 0) + return 0; + +#if HAVE_SYSV_COMPAT + /* Nothing else worked, so let's try /dev/initctl */ + if (talk_initctl(action_to_runlevel()) > 0) + return 0; +#endif + + return log_error_errno(r, "Failed to talk to init daemon: %m"); +} + +int reload_with_fallback(void) { + + assert(IN_SET(arg_action, ACTION_RELOAD, ACTION_REEXEC)); + + /* First, try systemd via D-Bus */ + if (daemon_reload(arg_action, /* graceful= */ true) > 0) + return 0; + + /* That didn't work, so let's try signals */ + if (kill(1, arg_action == ACTION_RELOAD ? SIGHUP : SIGTERM) < 0) + return log_error_errno(errno, "kill() failed: %m"); + + return 0; +} + +int exec_telinit(char *argv[]) { + (void) rlimit_nofile_safe(); + (void) execv(TELINIT, argv); + + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Couldn't find an alternative telinit implementation to spawn."); +} diff --git a/src/systemctl/systemctl-compat-telinit.h b/src/systemctl/systemctl-compat-telinit.h new file mode 100644 index 0000000..783c387 --- /dev/null +++ b/src/systemctl/systemctl-compat-telinit.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int telinit_parse_argv(int argc, char *argv[]); +int start_with_fallback(void); +int reload_with_fallback(void); +int exec_telinit(char *argv[]); diff --git a/src/systemctl/systemctl-daemon-reload.c b/src/systemctl/systemctl-daemon-reload.c new file mode 100644 index 0000000..7e9b456 --- /dev/null +++ b/src/systemctl/systemctl-daemon-reload.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" 
+#include "bus-locator.h"
+#include "systemctl-daemon-reload.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+/* Asks the manager over the bus to reload its configuration (ACTION_RELOAD) or re-execute (ACTION_REEXEC). */
+int daemon_reload(enum action action, bool graceful) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        const char *method; /* name of the manager method to call, selected from 'action' below */
+        sd_bus *bus;
+        int r;
+        /* Returns 1 on success, 0 if the call failed but 'graceful' is set, negative on every other failure. */
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r; /* returned as is, even when 'graceful' is set */
+
+        polkit_agent_open_maybe();
+
+        switch (action) {
+
+        case ACTION_RELOAD:
+                method = "Reload";
+                break;
+
+        case ACTION_REEXEC:
+                method = "Reexecute";
+                break;
+
+        default:
+                return -EINVAL; /* only the two actions above are handled by this function */
+        }
+
+        r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, method);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        /* Reloading the daemon may take long, hence set a longer timeout here */
+        r = sd_bus_call(bus, m, DAEMON_RELOAD_TIMEOUT_SEC, &error, NULL);
+
+        /* On reexecution, we expect a disconnect, not a reply */
+        if (IN_SET(r, -ETIMEDOUT, -ECONNRESET) && action == ACTION_REEXEC)
+                return 1; /* a timeout or connection reset counts as success for ACTION_REEXEC only */
+        if (r < 0) {
+                if (graceful) { /* If graceful mode is selected, debug log, but don't fail */
+                        log_debug_errno(r, "%s daemon failed via the bus, ignoring: %s",
+                                        method, bus_error_message(&error, r));
+                        return 0;
+                }
+
+                return log_error_errno(r, "%s daemon failed: %s",
+                                       method, bus_error_message(&error, r));
+        }
+
+        return 1;
+}
+/* Verb handler: maps argv[0] ("daemon-reexec" or "daemon-reload") to an action and runs daemon_reload(). */
+int verb_daemon_reload(int argc, char *argv[], void *userdata) {
+        enum action a;
+        int r;
+
+        assert(argc >= 1);
+
+        if (streq(argv[0], "daemon-reexec"))
+                a = ACTION_REEXEC;
+        else if (streq(argv[0], "daemon-reload"))
+                a = ACTION_RELOAD;
+        else
+                assert_not_reached();
+
+        r = daemon_reload(a, /* graceful= */ false);
+        if (r < 0)
+                return r;
+
+        return 0; /* daemon_reload() returns 1 on success, this verb reports plain 0 */
+}
diff --git a/src/systemctl/systemctl-daemon-reload.h b/src/systemctl/systemctl-daemon-reload.h
new file mode 100644
index 0000000..ced34ce
--- /dev/null
+++ b/src/systemctl/systemctl-daemon-reload.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma
once
+
+#include "systemctl.h"
+
+int daemon_reload(enum action, bool graceful);
+
+int verb_daemon_reload(int argc, char *argv[], void *userdata);
diff --git a/src/systemctl/systemctl-edit.c b/src/systemctl/systemctl-edit.c
new file mode 100644
index 0000000..367afa2
--- /dev/null
+++ b/src/systemctl/systemctl-edit.c
@@ -0,0 +1,388 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-error.h"
+#include "edit-util.h"
+#include "fs-util.h"
+#include "pager.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "selinux-util.h"
+#include "systemctl-daemon-reload.h"
+#include "systemctl-edit.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+#include "terminal-util.h"
+
+/* Verb handler for "systemctl cat": prints the fragment and drop-in files of every matching
+ * unit through cat_files(). Masked units and units that could not be loaded are reported with
+ * a comment line and skipped. A unit for which unit_find_paths() returns 0 is skipped as well,
+ * and -ENOENT is then returned once all remaining units have been processed successfully. */
+int verb_cat(int argc, char *argv[], void *userdata) {
+        _cleanup_hashmap_free_ Hashmap *cached_name_map = NULL, *cached_id_map = NULL;
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        _cleanup_strv_free_ char **names = NULL;
+        sd_bus *bus;
+        bool first = true;
+        int r, rc = 0;
+
+        /* Include all units by default — i.e. continue as if the --all option was used */
+        if (strv_isempty(arg_states))
+                arg_all = true;
+
+        if (arg_transport != BUS_TRANSPORT_LOCAL)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot remotely cat units.");
+
+        r = lookup_paths_init_or_warn(&lp, arg_runtime_scope, 0, arg_root);
+        if (r < 0)
+                return r;
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        r = expand_unit_names(bus, strv_skip(argv, 1), NULL, &names, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to expand names: %m");
+
+        r = maybe_extend_with_unit_dependencies(bus, &names);
+        if (r < 0)
+                return r;
+
+        pager_open(arg_pager_flags);
+
+        STRV_FOREACH(name, names) {
+                _cleanup_free_ char *fragment_path = NULL;
+                _cleanup_strv_free_ char **dropin_paths = NULL;
+
+                r = unit_find_paths(bus, *name, &lp, false, &cached_name_map, &cached_id_map, &fragment_path, &dropin_paths);
+                if (r == -ERFKILL) {
+                        printf("%s# Unit %s is masked%s.\n",
+                               ansi_highlight_magenta(),
+                               *name,
+                               ansi_normal());
+                        continue;
+                }
+                if (r == -EKEYREJECTED) {
+                        printf("%s# Unit %s could not be loaded.%s\n",
+                               ansi_highlight_magenta(),
+                               *name,
+                               ansi_normal());
+                        continue;
+                }
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* Skip units which have no on-disk counterpart, but propagate the error to the
+                         * user */
+                        rc = -ENOENT;
+                        continue;
+                }
+
+                if (first)
+                        first = false;
+                else
+                        puts("");
+
+                if (need_daemon_reload(bus, *name) > 0) /* ignore errors (<0), this is informational output */
+                        fprintf(stderr,
+                                "%s# Warning: %s changed on disk, the version systemd has loaded is outdated.\n"
+                                "%s# This output shows the current version of the unit's original fragment and drop-in files.\n"
+                                "%s# If fragments or drop-ins were added or removed, they are not properly reflected in this output.\n"
+                                "%s# Run 'systemctl%s daemon-reload' to reload units.%s\n",
+                                ansi_highlight_red(),
+                                *name,
+                                ansi_highlight_red(),
+                                ansi_highlight_red(),
+                                ansi_highlight_red(),
+                                arg_runtime_scope == RUNTIME_SCOPE_SYSTEM ? "" : " --user",
+                                ansi_normal());
+
+                r = cat_files(fragment_path, dropin_paths, /* flags= */ CAT_FORMAT_HAS_SECTIONS);
+                if (r < 0)
+                        return r;
+        }
+
+        return rc;
+}
+
+/* Determines the path of the file to edit for 'name' and stores it, newly allocated, in
+ * *ret_path: below lp->runtime_config if arg_runtime is set, below lp->persistent_config
+ * otherwise. With arg_runtime set the request is refused with -EEXIST if the persistent
+ * file already exists, as explained by the error message below. */
+static int get_file_to_edit(
+                const LookupPaths *lp,
+                const char *name,
+                char **ret_path) {
+
+        _cleanup_free_ char *path = NULL;
+
+        assert(lp);
+        assert(name);
+        assert(ret_path);
+
+        path = path_join(lp->persistent_config, name);
+        if (!path)
+                return log_oom();
+
+        if (arg_runtime) {
+                _cleanup_free_ char *run = NULL;
+
+                run = path_join(lp->runtime_config, name);
+                if (!run)
+                        return log_oom();
+
+                if (access(path, F_OK) >= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                               "Refusing to create \"%s\" because it would be overridden by \"%s\" anyway.",
+                                               run, path);
+
+                *ret_path = TAKE_PTR(run);
+        } else
+                *ret_path = TAKE_PTR(path);
+
+        return 0;
+}
+
+/* Registers the file named by 'unit_name' followed by 'suffix' (NULL for no suffix) in
+ * 'context' as a file to create, handing 'original_unit_paths' on to edit_files_add(). */
+static int unit_file_create_new(
+                EditFileContext *context,
+                const LookupPaths *lp,
+                const char *unit_name,
+                const char *suffix,
+                char * const *original_unit_paths) {
+
+        _cleanup_free_ char *unit = NULL, *new_path = NULL;
+        int r;
+
+        assert(context);
+        assert(lp);
+        assert(unit_name);
+
+        unit = strjoin(unit_name, suffix);
+        if (!unit)
+                return log_oom();
+
+        r = get_file_to_edit(lp, unit, &new_path);
+        if (r < 0)
+                return r;
+
+        return edit_files_add(context, new_path, NULL, original_unit_paths);
+}
+
+/* Registers the file to edit for 'unit_name' in 'context', handing 'fragment_path' to
+ * edit_files_add() (presumably as the original to copy from). If a different file already exists
+ * at the target path the user is asked; any answer but 'y' skips the unit with -EKEYREJECTED. */
+static int unit_file_create_copy(
+                EditFileContext *context,
+                const LookupPaths *lp,
+                const char *unit_name,
+                const char *fragment_path) {
+
+        _cleanup_free_ char *new_path = NULL;
+        int r;
+
+        assert(context);
+        assert(lp);
+        assert(fragment_path);
+        assert(unit_name);
+
+        r = get_file_to_edit(lp, unit_name, &new_path);
+        if (r < 0)
+                return r;
+
+        if (!path_equal(fragment_path, new_path) && access(new_path, F_OK) >= 0) {
+                char response;
+
+                r = ask_char(&response, "yn", "\"%s\" already exists. Overwrite with \"%s\"? [(y)es, (n)o] ", new_path, fragment_path);
+                if (r < 0)
+                        return r;
+
+                if (response != 'y')
+                        return log_warning_errno(SYNTHETIC_ERRNO(EKEYREJECTED), "%s skipped.", unit_name);
+        }
+
+        return edit_files_add(context, new_path, fragment_path, NULL);
+}
+
+/* Resolves each unit in 'names' and registers the file to edit in 'context': a copy of the
+ * unit file when arg_full is set, otherwise a drop-in named after arg_drop_in (default
+ * "override.conf"). Units without a unit file are only created if arg_force is set. */
+static int find_paths_to_edit(
+                sd_bus *bus,
+                EditFileContext *context,
+                char **names) {
+
+        _cleanup_hashmap_free_ Hashmap *cached_name_map = NULL, *cached_id_map = NULL;
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        _cleanup_free_ char *drop_in_alloc = NULL, *suffix = NULL;
+        const char *drop_in;
+        int r;
+
+        assert(bus);
+        assert(context);
+        assert(names);
+
+        if (isempty(arg_drop_in))
+                drop_in = "override.conf";
+        else if (!endswith(arg_drop_in, ".conf")) {
+                drop_in_alloc = strjoin(arg_drop_in, ".conf");
+                if (!drop_in_alloc)
+                        return log_oom();
+
+                drop_in = drop_in_alloc;
+        } else
+                drop_in = arg_drop_in;
+
+        if (!filename_is_valid(drop_in))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid drop-in file name '%s'.", drop_in);
+
+        suffix = strjoin(".d/", drop_in);
+        if (!suffix)
+                return log_oom();
+
+        /* Use the logging variant, as verb_cat() does, so that a failure here is not silent */
+        r = lookup_paths_init_or_warn(&lp, arg_runtime_scope, 0, arg_root);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(name, names) {
+                _cleanup_free_ char *path = NULL;
+                _cleanup_strv_free_ char **unit_paths = NULL;
+
+                r = unit_find_paths(bus, *name, &lp, /* force_client_side= */ false, &cached_name_map, &cached_id_map, &path, &unit_paths);
+                if (r == -EKEYREJECTED) {
+                        /* If loading of the unit failed server side complete, then the server won't tell us
+                         * the unit file path. In that case, find the file client side. */
+
+                        log_debug_errno(r, "Unit '%s' was not loaded correctly, retrying client-side.", *name);
+                        r = unit_find_paths(bus, *name, &lp, /* force_client_side= */ true, &cached_name_map, &cached_id_map, &path, &unit_paths);
+                }
+                if (r == -ERFKILL)
+                        return log_error_errno(r, "Unit '%s' masked, cannot edit.", *name);
+                if (r < 0)
+                        return r; /* Already logged by unit_find_paths() */
+
+                if (!path) {
+                        if (!arg_force) {
+                                log_info("Run 'systemctl edit%s --force --full %s' to create a new unit.",
+                                         arg_runtime_scope == RUNTIME_SCOPE_GLOBAL ? " --global" :
+                                         arg_runtime_scope == RUNTIME_SCOPE_USER ? " --user" : "",
+                                         *name);
+                                return -ENOENT;
+                        }
+
+                        /* Create a new unit from scratch */
+                        r = unit_file_create_new(
+                                        context,
+                                        &lp,
+                                        *name,
+                                        arg_full ? NULL : suffix,
+                                        NULL);
+                } else {
+                        _cleanup_free_ char *unit_name = NULL;
+
+                        r = path_extract_filename(path, &unit_name);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to extract unit name from path '%s': %m", path);
+
+                        /* We follow unit aliases, but we need to propagate the instance */
+                        if (unit_name_is_valid(*name, UNIT_NAME_INSTANCE) &&
+                            unit_name_is_valid(unit_name, UNIT_NAME_TEMPLATE)) {
+                                _cleanup_free_ char *instance = NULL, *tmp_name = NULL;
+
+                                r = unit_name_to_instance(*name, &instance);
+                                if (r < 0)
+                                        return r;
+
+                                r = unit_name_replace_instance(unit_name, instance, &tmp_name);
+                                if (r < 0)
+                                        return r;
+
+                                free_and_replace(unit_name, tmp_name);
+                        }
+
+                        if (arg_full)
+                                r = unit_file_create_copy(
+                                                context,
+                                                &lp,
+                                                unit_name,
+                                                path);
+                        else {
+                                r = strv_prepend(&unit_paths, path);
+                                if (r < 0)
+                                        return log_oom();
+
+                                r = unit_file_create_new(
+                                                context,
+                                                &lp,
+                                                unit_name,
+                                                suffix,
+                                                unit_paths);
+                        }
+                }
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Verb handler for "systemctl edit": requires a tty and a local transport, refuses masked
+ * units, collects the files via find_paths_to_edit() and hands them to
+ * do_edit_files_and_install(); afterwards the manager is reloaded unless that is disabled. */
+int verb_edit(int argc, char *argv[], void *userdata) {
+        _cleanup_(edit_file_context_done) EditFileContext context = {
+                .marker_start = DROPIN_MARKER_START,
+                .marker_end = DROPIN_MARKER_END,
+                .remove_parent = !arg_full,
+                .overwrite_with_origin = true,
+        };
+        _cleanup_strv_free_ char **names = NULL;
+        sd_bus *bus;
+        int r;
+
+        if (!on_tty())
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot edit units if not on a tty.");
+
+        if (arg_transport != BUS_TRANSPORT_LOCAL)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot edit units remotely.");
+
+        r = mac_init();
+        if (r < 0)
+                return r;
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        r = expand_unit_names(bus, strv_skip(argv, 1), NULL, &names, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to expand names: %m");
+        if (strv_isempty(names))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "No units matched the specified patterns.");
+
+        STRV_FOREACH(tmp, names) {
+                r = unit_is_masked(bus, *tmp);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot edit %s: unit is masked.", *tmp);
+        }
+
+        r = find_paths_to_edit(bus, &context, names);
+        if (r < 0)
+                return r;
+
+        r = do_edit_files_and_install(&context);
+        if (r < 0)
+                return r;
+
+        if (!arg_no_reload && !install_client_side()) {
+                r = daemon_reload(ACTION_RELOAD, /* graceful= */ false);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
diff --git a/src/systemctl/systemctl-edit.h b/src/systemctl/systemctl-edit.h
new file mode 100644
index 0000000..10dac5c
--- /dev/null
+++ b/src/systemctl/systemctl-edit.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_cat(int argc, char *argv[], void *userdata);
+int verb_edit(int argc, char *argv[], void *userdata);
diff --git a/src/systemctl/systemctl-enable.c b/src/systemctl/systemctl-enable.c
new file mode 100644
index 0000000..7d9b7c7
--- /dev/null
+++ b/src/systemctl/systemctl-enable.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "locale-util.h"
+#include "path-util.h"
+#include "systemctl-daemon-reload.h"
+#include "systemctl-enable.h"
+#include "systemctl-start-unit.h"
+#include "systemctl-sysv-compat.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+
+/* Replaces every non-absolute entry of 'names' by an absolute path relative to the current
+ * directory. Fails with -EINVAL if arg_root is set or the entry contains no '/'. */
+static int normalize_filenames(char **names) {
+        int r;
+
+        STRV_FOREACH(u, names)
+                if (!path_is_absolute(*u)) {
+                        char* normalized_path;
+
+                        if (!isempty(arg_root))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Non-absolute paths are not allowed when --root is used: %s",
+                                                       *u);
+
+                        if (!strchr(*u, '/'))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Link argument must contain at least one directory separator.\n"
+                                                       "If you intended to link a file in the current directory, try ./%s instead.",
+                                                       *u);
+
+                        r = path_make_absolute_cwd(*u, &normalized_path);
+                        if (r < 0)
+                                return r;
+
+                        free_and_replace(*u, normalized_path);
+                }
+
+        return 0;
+}
+
+/* Replaces every entry of 'names' that is_path() accepts by its basename, and emits a single
+ * warning if at least one entry was replaced. */
+static int normalize_names(char **names) {
+        bool was_path = false;
+
+        STRV_FOREACH(u, names) {
+                int r;
+
+                if (!is_path(*u))
+                        continue;
+
+                r = free_and_strdup(u, basename(*u));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to normalize unit file path: %m");
+
+                was_path = true;
+        }
+
+        if (was_path)
+                log_warning("Warning: Can't execute disable on the unit file path. Proceeding with the unit name.");
+
+        return 0;
+}
+
+/* Shared verb handler for enable, disable, reenable, link, preset, mask, unmask and revert
+ * (selected by argv[0]). The change is applied client-side or requested from the manager over
+ * the bus. If arg_now is set, enable/disable/mask additionally start or stop the units through
+ * verb_start(), whose result then becomes the return value of this function. */
+int verb_enable(int argc, char *argv[], void *userdata) {
+        _cleanup_strv_free_ char **names = NULL;
+        const char *verb = argv[0];
+        int carries_install_info = -1;
+        bool ignore_carries_install_info = arg_quiet || arg_no_warn;
+        int r;
+
+        if (!argv[1])
+                return 0;
+
+        r = mangle_names("to enable", strv_skip(argv, 1), &names);
+        if (r < 0)
+                return r;
+
+        r = enable_sysv_units(verb, names);
+        if (r < 0)
+                return r;
+
+        /* If the operation was fully executed by the SysV compat, let's finish early */
+        if (strv_isempty(names)) {
+                if (arg_no_reload || install_client_side())
+                        return 0;
+
+                r = daemon_reload(ACTION_RELOAD, /* graceful= */ false);
+                return r > 0 ? 0 : r;
+        }
+
+        if (streq(verb, "disable")) {
+                r = normalize_names(names);
+                if (r < 0)
+                        return r;
+        }
+
+        if (streq(verb, "link")) {
+                r = normalize_filenames(names);
+                if (r < 0)
+                        return r;
+        }
+
+        if (install_client_side()) {
+                UnitFileFlags flags;
+                InstallChange *changes = NULL;
+                size_t n_changes = 0;
+
+                CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+                flags = unit_file_flags_from_args();
+                if (streq(verb, "enable")) {
+                        r = unit_file_enable(arg_runtime_scope, flags, arg_root, names, &changes, &n_changes);
+                        carries_install_info = r;
+                } else if (streq(verb, "disable")) {
+                        r = unit_file_disable(arg_runtime_scope, flags, arg_root, names, &changes, &n_changes);
+                        carries_install_info = r;
+                } else if (streq(verb, "reenable")) {
+                        r = unit_file_reenable(arg_runtime_scope, flags, arg_root, names, &changes, &n_changes);
+                        carries_install_info = r;
+                } else if (streq(verb, "link"))
+                        r = unit_file_link(arg_runtime_scope, flags, arg_root, names, &changes, &n_changes);
+                else if (streq(verb, "preset"))
+                        r = unit_file_preset(arg_runtime_scope, flags, arg_root, names, arg_preset_mode, &changes, &n_changes);
+                else if (streq(verb, "mask"))
+                        r = unit_file_mask(arg_runtime_scope, flags, arg_root, names, &changes, &n_changes);
+                else if (streq(verb, "unmask"))
+                        r = unit_file_unmask(arg_runtime_scope, flags, arg_root, names, &changes, &n_changes);
+                else if (streq(verb, "revert"))
+                        r = unit_file_revert(arg_runtime_scope, arg_root, names, &changes, &n_changes);
+                else
+                        assert_not_reached();
+
+                install_changes_dump(r, verb, changes, n_changes, arg_quiet);
+                if (r < 0)
+                        return r;
+        } else {
+                _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL;
+                _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+                bool expect_carries_install_info = false;
+                bool send_runtime = true, send_force = true, send_preset_mode = false;
+                const char *method, *warn_trigger_operation = NULL;
+                bool warn_trigger_ignore_masked = true; /* suppress "used uninitialized" warning */
+                sd_bus *bus;
+
+                if (STR_IN_SET(verb, "mask", "unmask")) {
+                        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+
+                        r = lookup_paths_init_or_warn(&lp, arg_runtime_scope, 0, arg_root);
+                        if (r < 0)
+                                return r;
+
+                        STRV_FOREACH(name, names) {
+                                r = unit_exists(&lp, *name);
+                                if (r < 0)
+                                        return r;
+                                if (r == 0)
+                                        log_notice("Unit %s does not exist, proceeding anyway.", *name);
+                        }
+                }
+
+                r = acquire_bus(BUS_MANAGER, &bus);
+                if (r < 0)
+                        return r;
+
+                polkit_agent_open_maybe();
+
+                if (streq(verb, "enable")) {
+                        method = "EnableUnitFiles";
+                        expect_carries_install_info = true;
+                } else if (streq(verb, "disable")) {
+                        method = "DisableUnitFilesWithFlagsAndInstallInfo";
+                        expect_carries_install_info = true;
+                        send_force = false;
+
+                        warn_trigger_operation = "Disabling";
+                        warn_trigger_ignore_masked = true;
+                } else if (streq(verb, "reenable")) {
+                        method = "ReenableUnitFiles";
+                        expect_carries_install_info = true;
+                } else if (streq(verb, "link"))
+                        method = "LinkUnitFiles";
+                else if (streq(verb, "preset")) {
+
+                        if (arg_preset_mode != UNIT_FILE_PRESET_FULL) {
+                                method = "PresetUnitFilesWithMode";
+                                send_preset_mode = true;
+                        } else
+                                method = "PresetUnitFiles";
+
+                        expect_carries_install_info = true;
+                        ignore_carries_install_info = true;
+                } else if (streq(verb, "mask")) {
+                        method = "MaskUnitFiles";
+
+                        warn_trigger_operation = "Masking";
+                        warn_trigger_ignore_masked = false;
+                } else if (streq(verb, "unmask")) {
+                        method = "UnmaskUnitFiles";
+                        send_force = false;
+                } else if (streq(verb, "revert")) {
+                        method = "RevertUnitFiles";
+                        send_runtime = send_force = false;
+                } else
+                        assert_not_reached();
+
+                r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, method);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_strv(m, names);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                if (send_preset_mode) {
+                        r = sd_bus_message_append(m, "s", unit_file_preset_mode_to_string(arg_preset_mode));
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                if (send_runtime) {
+                        if (streq(method, "DisableUnitFilesWithFlagsAndInstallInfo"))
+                                r = sd_bus_message_append(m, "t", arg_runtime ? (uint64_t) UNIT_FILE_RUNTIME : UINT64_C(0));
+                        else
+                                r = sd_bus_message_append(m, "b", arg_runtime);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                if (send_force) {
+                        r = sd_bus_message_append(m, "b", arg_force);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_call(bus, m, 0, &error, &reply);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to %s unit: %s", verb, bus_error_message(&error, r));
+
+                if (expect_carries_install_info) {
+                        r = sd_bus_message_read(reply, "b", &carries_install_info);
+                        if (r < 0)
+                                return bus_log_parse_error(r);
+                }
+
+                r = bus_deserialize_and_dump_unit_file_changes(reply, arg_quiet);
+                if (r < 0)
+                        return r;
+
+                /* Try to reload if enabled */
+                if (!arg_no_reload) {
+                        r = daemon_reload(ACTION_RELOAD, /* graceful= */ false);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (warn_trigger_operation && !arg_quiet && !arg_no_warn)
+                        STRV_FOREACH(unit, names)
+                                warn_triggering_units(bus, *unit, warn_trigger_operation, warn_trigger_ignore_masked);
+        }
+
+        if (carries_install_info == 0 && !ignore_carries_install_info)
+                log_notice("The unit files have no installation config (WantedBy=, RequiredBy=, UpheldBy=,\n"
+                           "Also=, or Alias= settings in the [Install] section, and DefaultInstance= for\n"
+                           "template units). This means they are not meant to be enabled or disabled using systemctl.\n"
+                           " \n" /* trick: the space is needed so that the line does not get stripped from output */
+                           "Possible reasons for having these kinds of units are:\n"
+                           "%1$s A unit may be statically enabled by being symlinked from another unit's\n"
+                           " .wants/, .requires/, or .upholds/ directory.\n"
+                           "%1$s A unit's purpose may be to act as a helper for some other unit which has\n"
+                           " a requirement dependency on it.\n"
+                           "%1$s A unit may be started when needed via activation (socket, path, timer,\n"
+                           " D-Bus, udev, scripted systemctl call, ...).\n"
+                           "%1$s In case of template units, the unit is meant to be enabled with some\n"
+                           " instance name specified.",
+                           special_glyph(SPECIAL_GLYPH_BULLET));
+
+        if (streq(verb, "disable") && arg_runtime_scope == RUNTIME_SCOPE_USER && !arg_quiet && !arg_no_warn) {
+                /* If some of the units are disabled in user scope but still enabled in global scope,
+                 * we emit a warning for that. */
+
+                /* No strv_free here, strings are owned by 'names' */
+                _cleanup_free_ char **enabled_in_global_scope = NULL;
+
+                STRV_FOREACH(name, names) {
+                        UnitFileState state;
+
+                        r = unit_file_get_state(RUNTIME_SCOPE_GLOBAL, arg_root, *name, &state);
+                        if (r == -ENOENT)
+                                continue;
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to get unit file state for %s: %m", *name);
+
+                        if (IN_SET(state, UNIT_FILE_ENABLED, UNIT_FILE_ENABLED_RUNTIME)) {
+                                r = strv_push(&enabled_in_global_scope, *name);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                }
+
+                if (!strv_isempty(enabled_in_global_scope)) {
+                        _cleanup_free_ char *joined = NULL;
+
+                        joined = strv_join(enabled_in_global_scope, ", ");
+                        if (!joined)
+                                return log_oom();
+
+                        log_notice("The following unit files have been enabled in global scope. This means\n"
+                                   "they will still be started automatically after a successful disablement\n"
+                                   "in user scope:\n"
+                                   "%s",
+                                   joined);
+                }
+        }
+
+        if (arg_now && STR_IN_SET(argv[0], "enable", "disable", "mask")) {
+                sd_bus *bus;
+                size_t len, i;
+
+                r = acquire_bus(BUS_MANAGER, &bus);
+                if (r < 0)
+                        return r;
+
+                len = strv_length(names);
+                {
+                        char *new_args[len + 2];
+
+                        new_args[0] = (char*) (streq(argv[0], "enable") ? "start" : "stop");
+                        for (i = 0; i < len; i++)
+                                new_args[i + 1] = basename(names[i]);
+                        new_args[i + 1] = NULL;
+
+                        /* Propagate the result, so that a failed start/stop is not reported as success */
+                        return verb_start(len + 1, new_args, userdata);
+                }
+        }
+
+        return 0;
+}
diff --git a/src/systemctl/systemctl-enable.h b/src/systemctl/systemctl-enable.h
new file mode 100644
index 0000000..f04bbcd
--- /dev/null
+++ b/src/systemctl/systemctl-enable.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_enable(int argc, char *argv[], void *userdata);
diff --git a/src/systemctl/systemctl-is-active.c b/src/systemctl/systemctl-is-active.c
new file mode 100644
index 0000000..596320a
--- /dev/null
+++ b/src/systemctl/systemctl-is-active.c
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "pretty-print.h"
+#include "syslog-util.h"
+#include "systemctl-is-active.h"
+#include "systemctl-sysv-compat.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+
+static int check_unit_generic(int code, const UnitActiveState good_states[], size_t nb_states, char **args) {
+        _cleanup_strv_free_ char **names = NULL;
+        UnitActiveState active_state;
+        sd_bus *bus;
+        bool not_found = true, ok = false;
+        int r;
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        r = expand_unit_names(bus, args, NULL, &names, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to expand names: %m");
+
+        STRV_FOREACH(name, names) {
+                _cleanup_free_ char *load_state = NULL;
+
+                r = get_state_one_unit(bus, *name, &active_state);
+                if (r < 0)
+
return r; + + r = unit_load_state(bus, *name, &load_state); + if (r < 0) + return r; + + if (!arg_quiet) + puts(unit_active_state_to_string(active_state)); + + FOREACH_ARRAY(good_state, good_states, nb_states) + if (active_state == *good_state) { + ok = true; + break; + } + + if (!streq(load_state, "not-found")) + not_found = false; + } + + /* We use LSB code 4 ("program or service status is unknown") when the corresponding unit file doesn't exist. */ + return ok ? EXIT_SUCCESS : not_found ? EXIT_PROGRAM_OR_SERVICES_STATUS_UNKNOWN : code; +} + +int verb_is_active(int argc, char *argv[], void *userdata) { + + static const UnitActiveState states[] = { + UNIT_ACTIVE, + UNIT_RELOADING, + }; + + /* According to LSB: 3, "program is not running" */ + return check_unit_generic(EXIT_PROGRAM_NOT_RUNNING, states, ELEMENTSOF(states), strv_skip(argv, 1)); +} + +int verb_is_failed(int argc, char *argv[], void *userdata) { + + static const UnitActiveState states[] = { + UNIT_FAILED, + }; + + int r; + + if (argc > 1) + return check_unit_generic(EXIT_PROGRAM_DEAD_AND_PID_EXISTS, states, ELEMENTSOF(states), strv_skip(argv, 1)); + + /* If no unit is provided, we check SystemState property of the manager, i.e. whether there're failed + * units. */ + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *state = NULL; + sd_bus *bus; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_get_property_string(bus, bus_systemd_mgr, "SystemState", &error, &state); + if (r < 0) + return log_error_errno(r, "Failed to query system state: %s", bus_error_message(&error, r)); + + if (!arg_quiet) + puts(state); + + return streq(state, "degraded") ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/src/systemctl/systemctl-is-active.h b/src/systemctl/systemctl-is-active.h new file mode 100644 index 0000000..950f29a --- /dev/null +++ b/src/systemctl/systemctl-is-active.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_is_active(int argc, char *argv[], void *userdata); +int verb_is_failed(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-is-enabled.c b/src/systemctl/systemctl-is-enabled.c new file mode 100644 index 0000000..01d924f --- /dev/null +++ b/src/systemctl/systemctl-is-enabled.c @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl-is-enabled.h" +#include "systemctl-sysv-compat.h" +#include "systemctl-util.h" +#include "systemctl.h" + +static int show_installation_targets_client_side(const char *name) { + InstallChange *changes = NULL; + size_t n_changes = 0; + UnitFileFlags flags; + char **p; + int r; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + p = STRV_MAKE(name); + flags = UNIT_FILE_DRY_RUN | + (arg_runtime ? 
UNIT_FILE_RUNTIME : 0); + + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, flags, NULL, p, &changes, &n_changes); + if (r < 0) + return log_error_errno(r, "Failed to get file links for %s: %m", name); + + FOREACH_ARRAY(c, changes, n_changes) + if (c->type == INSTALL_CHANGE_UNLINK) + printf(" %s\n", c->path); + + return 0; +} + +static int show_installation_targets(sd_bus *bus, const char *name) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *link; + int r; + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnitFileLinks", &error, &reply, "sb", name, arg_runtime); + if (r < 0) + return log_error_errno(r, "Failed to get unit file links for %s: %s", name, bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "s"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "s", &link)) > 0) + printf(" %s\n", link); + + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +int verb_is_enabled(int argc, char *argv[], void *userdata) { + _cleanup_strv_free_ char **names = NULL; + bool not_found, enabled; + int r; + + r = mangle_names("to check", strv_skip(argv, 1), &names); + if (r < 0) + return r; + + r = enable_sysv_units(argv[0], names); + if (r < 0) + return r; + + not_found = r == 0; /* Doesn't have SysV support or SYSV_UNIT_NOT_FOUND */ + enabled = r == SYSV_UNIT_ENABLED; + + if (install_client_side()) { + STRV_FOREACH(name, names) { + UnitFileState state; + + r = unit_file_get_state(arg_runtime_scope, arg_root, *name, &state); + if (r == -ENOENT) { + if (!arg_quiet) + puts("not-found"); + continue; + } else if (r < 0) + return log_error_errno(r, "Failed to get unit file state for %s: %m", *name); + else + not_found = false; + + if (IN_SET(state, + UNIT_FILE_ENABLED, + UNIT_FILE_ENABLED_RUNTIME, + 
UNIT_FILE_STATIC, + UNIT_FILE_ALIAS, + UNIT_FILE_INDIRECT, + UNIT_FILE_GENERATED)) + enabled = true; + + if (!arg_quiet) { + puts(unit_file_state_to_string(state)); + if (arg_full) { + r = show_installation_targets_client_side(*name); + if (r < 0) + return r; + } + } + } + + r = 0; + } else { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + STRV_FOREACH(name, names) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *s; + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnitFileState", &error, &reply, "s", *name); + if (r == -ENOENT) { + sd_bus_error_free(&error); + + if (!arg_quiet) + puts("not-found"); + continue; + } else if (r < 0) + return log_error_errno(r, + "Failed to get unit file state for %s: %s", + *name, + bus_error_message(&error, r)); + else + not_found = false; + + r = sd_bus_message_read(reply, "s", &s); + if (r < 0) + return bus_log_parse_error(r); + + if (STR_IN_SET(s, "enabled", "enabled-runtime", "static", "alias", "indirect", "generated")) + enabled = true; + + if (!arg_quiet) { + puts(s); + if (arg_full) { + r = show_installation_targets(bus, *name); + if (r < 0) + return r; + } + } + } + } + + return enabled ? EXIT_SUCCESS : not_found ? 
EXIT_PROGRAM_OR_SERVICES_STATUS_UNKNOWN : EXIT_FAILURE; +} diff --git a/src/systemctl/systemctl-is-enabled.h b/src/systemctl/systemctl-is-enabled.h new file mode 100644 index 0000000..96dff95 --- /dev/null +++ b/src/systemctl/systemctl-is-enabled.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_is_enabled(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-is-system-running.c b/src/systemctl/systemctl-is-system-running.c new file mode 100644 index 0000000..59be6a7 --- /dev/null +++ b/src/systemctl/systemctl-is-system-running.c @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-event.h" +#include "sd-daemon.h" + +#include "systemctl-util.h" +#include "systemctl-is-system-running.h" +#include "virt.h" +#include "systemctl.h" +#include "bus-util.h" +#include "bus-locator.h" +#include "bus-error.h" + +static int match_startup_finished(sd_bus_message *m, void *userdata, sd_bus_error *error) { + char **state = ASSERT_PTR(userdata); + int r; + + r = bus_get_property_string(sd_bus_message_get_bus(m), bus_systemd_mgr, "SystemState", NULL, state); + + sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), r); + return 0; +} + +int verb_is_system_running(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot *slot_startup_finished = NULL; + _cleanup_(sd_event_unrefp) sd_event* event = NULL; + _cleanup_free_ char *state = NULL; + sd_bus *bus; + int r; + + if (!isempty(arg_root) || running_in_chroot() > 0 || (arg_transport == BUS_TRANSPORT_LOCAL && !sd_booted())) { + if (!arg_quiet) + puts("offline"); + return EXIT_FAILURE; + } + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + if (arg_wait) { + r = sd_event_default(&event); + if (r >= 0) + r = sd_bus_attach_event(bus, event, 0); + if (r >= 0) + r = bus_match_signal_async( + bus, + &slot_startup_finished, 
+ bus_systemd_mgr, + "StartupFinished", + match_startup_finished, NULL, &state); + if (r < 0) { + log_warning_errno(r, "Failed to request match for StartupFinished: %m"); + arg_wait = false; + } + } + + r = bus_get_property_string(bus, bus_systemd_mgr, "SystemState", &error, &state); + if (r < 0) { + log_warning_errno(r, "Failed to query system state: %s", bus_error_message(&error, r)); + + if (!arg_quiet) + puts("unknown"); + return EXIT_FAILURE; + } + + if (arg_wait && STR_IN_SET(state, "initializing", "starting")) { + r = sd_event_loop(event); + if (r < 0) { + log_warning_errno(r, "Failed to get property from event loop: %m"); + if (!arg_quiet) + puts("unknown"); + return EXIT_FAILURE; + } + } + + if (!arg_quiet) + puts(state); + + return streq(state, "running") ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/src/systemctl/systemctl-is-system-running.h b/src/systemctl/systemctl-is-system-running.h new file mode 100644 index 0000000..de86211 --- /dev/null +++ b/src/systemctl/systemctl-is-system-running.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_is_system_running(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-kill.c b/src/systemctl/systemctl-kill.c new file mode 100644 index 0000000..c4c6096 --- /dev/null +++ b/src/systemctl/systemctl-kill.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl-kill.h" +#include "systemctl-util.h" +#include "systemctl.h" + +int verb_kill(int argc, char *argv[], void *userdata) { + _cleanup_strv_free_ char **names = NULL; + const char *kill_whom; + sd_bus *bus; + int r, q; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + kill_whom = arg_kill_whom ?: "all"; + + /* --fail was specified */ + if (streq(arg_job_mode(), "fail")) + kill_whom = strjoina(kill_whom, "-fail"); + + r = expand_unit_names(bus, strv_skip(argv, 1), 
NULL, &names, NULL); + if (r < 0) + return log_error_errno(r, "Failed to expand names: %m"); + + STRV_FOREACH(name, names) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + if (arg_kill_value_set) + q = bus_call_method( + bus, + bus_systemd_mgr, + "QueueSignalUnit", + &error, + NULL, + "ssii", *name, kill_whom, arg_signal, arg_kill_value); + else + q = bus_call_method( + bus, + bus_systemd_mgr, + "KillUnit", + &error, + NULL, + "ssi", *name, kill_whom, arg_signal); + if (q < 0) { + log_error_errno(q, "Failed to kill unit %s: %s", *name, bus_error_message(&error, q)); + if (r == 0) + r = q; + } + } + + return r; +} diff --git a/src/systemctl/systemctl-kill.h b/src/systemctl/systemctl-kill.h new file mode 100644 index 0000000..88b2eae --- /dev/null +++ b/src/systemctl/systemctl-kill.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_kill(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-list-dependencies.c b/src/systemctl/systemctl-list-dependencies.c new file mode 100644 index 0000000..a9121f1 --- /dev/null +++ b/src/systemctl/systemctl-list-dependencies.c @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "locale-util.h" +#include "sort-util.h" +#include "special.h" +#include "systemctl-list-dependencies.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" + +static int list_dependencies_print(const char *name, UnitActiveState state, int level, unsigned branches, bool last) { + _cleanup_free_ char *n = NULL; + size_t max_len = MAX(columns(),20u); + size_t len = 0; + + if (arg_plain || state == _UNIT_ACTIVE_STATE_INVALID) + printf(" "); + else { + const char *on; + + switch (state) { + case UNIT_ACTIVE: + case UNIT_RELOADING: + case UNIT_ACTIVATING: + on = ansi_highlight_green(); + break; + + case UNIT_INACTIVE: + case UNIT_DEACTIVATING: + on = ansi_normal(); + break; + + default: + on = ansi_highlight_red(); + 
break; + } + + printf("%s%s%s ", on, special_glyph(unit_active_state_to_glyph(state)), ansi_normal()); + } + + if (!arg_plain) { + for (int i = level - 1; i >= 0; i--) { + len += 2; + if (len > max_len - 3 && !arg_full) { + printf("%s...\n",max_len % 2 ? "" : " "); + return 0; + } + printf("%s", special_glyph(branches & (1 << i) ? SPECIAL_GLYPH_TREE_VERTICAL : SPECIAL_GLYPH_TREE_SPACE)); + } + len += 2; + + if (len > max_len - 3 && !arg_full) { + printf("%s...\n",max_len % 2 ? "" : " "); + return 0; + } + + printf("%s", special_glyph(last ? SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH)); + } + + if (arg_full) { + printf("%s\n", name); + return 0; + } + + n = ellipsize(name, max_len-len, 100); + if (!n) + return log_oom(); + + printf("%s\n", n); + return 0; +} + +static int list_dependencies_compare(char * const *a, char * const *b) { + if (unit_name_to_type(*a) == UNIT_TARGET && unit_name_to_type(*b) != UNIT_TARGET) + return 1; + if (unit_name_to_type(*a) != UNIT_TARGET && unit_name_to_type(*b) == UNIT_TARGET) + return -1; + + return strcasecmp(*a, *b); +} + +static int list_dependencies_one( + sd_bus *bus, + const char *name, + int level, + char ***units, + unsigned branches) { + + _cleanup_strv_free_ char **deps = NULL; + int r; + bool circular = false; + + assert(bus); + assert(name); + assert(units); + + r = strv_extend(units, name); + if (r < 0) + return log_oom(); + + r = unit_get_dependencies(bus, name, &deps); + if (r < 0) + return r; + + typesafe_qsort(deps, strv_length(deps), list_dependencies_compare); + + STRV_FOREACH(c, deps) { + _cleanup_free_ char *load_state = NULL, *sub_state = NULL; + UnitActiveState active_state; + + if (strv_contains(*units, *c)) { + circular = true; + continue; + } + + if (arg_types && !strv_contains(arg_types, unit_type_suffix(*c))) + continue; + + r = get_state_one_unit(bus, *c, &active_state); + if (r < 0) + return r; + + if (arg_states) { + r = unit_load_state(bus, *c, &load_state); + if (r < 0) + return r; + + r = 
get_sub_state_one_unit(bus, *c, &sub_state); + if (r < 0) + return r; + + if (!strv_overlap(arg_states, STRV_MAKE(unit_active_state_to_string(active_state), load_state, sub_state))) + continue; + } + + r = list_dependencies_print(*c, active_state, level, branches, /* last = */ c[1] == NULL && !circular); + if (r < 0) + return r; + + if (arg_all || unit_name_to_type(*c) == UNIT_TARGET) { + r = list_dependencies_one(bus, *c, level + 1, units, (branches << 1) | (c[1] == NULL ? 0 : 1)); + if (r < 0) + return r; + } + } + + if (circular && !arg_plain) { + r = list_dependencies_print("...", _UNIT_ACTIVE_STATE_INVALID, level, branches, /* last = */ true); + if (r < 0) + return r; + } + + if (!arg_plain) + strv_remove(*units, name); + + return 0; +} + +int verb_list_dependencies(int argc, char *argv[], void *userdata) { + _cleanup_strv_free_ char **units = NULL, **done = NULL; + char **patterns; + sd_bus *bus; + int r; + + /* We won't be able to preserve the tree structure if --type= or --state= is used */ + arg_plain = arg_plain || arg_types || arg_states; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + patterns = strv_skip(argv, 1); + if (strv_isempty(patterns)) { + units = strv_new(SPECIAL_DEFAULT_TARGET); + if (!units) + return log_oom(); + } else { + r = expand_unit_names(bus, patterns, NULL, &units, NULL); + if (r < 0) + return log_error_errno(r, "Failed to expand names: %m"); + } + + pager_open(arg_pager_flags); + + STRV_FOREACH(u, units) { + if (u != units) + puts(""); + + puts(*u); + r = list_dependencies_one(bus, *u, 0, &done, 0); + if (r < 0) + return r; + } + + return 0; +} diff --git a/src/systemctl/systemctl-list-dependencies.h b/src/systemctl/systemctl-list-dependencies.h new file mode 100644 index 0000000..1e68a5f --- /dev/null +++ b/src/systemctl/systemctl-list-dependencies.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_list_dependencies(int argc, char *argv[], void *userdata); diff --git 
a/src/systemctl/systemctl-list-jobs.c b/src/systemctl/systemctl-list-jobs.c new file mode 100644 index 0000000..a752173 --- /dev/null +++ b/src/systemctl/systemctl-list-jobs.c @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "locale-util.h" +#include "systemctl-list-jobs.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" + +static int output_waiting_jobs(sd_bus *bus, Table *table, uint32_t id, const char *method, const char *prefix) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *name, *type; + uint32_t other_id; + int r; + + assert(bus); + + r = bus_call_method(bus, bus_systemd_mgr, method, &error, &reply, "u", id); + if (r < 0) + return log_debug_errno(r, "Failed to get waiting jobs for job %" PRIu32, id); + + r = sd_bus_message_enter_container(reply, 'a', "(usssoo)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(usssoo)", &other_id, &name, &type, NULL, NULL, NULL)) > 0) { + _cleanup_free_ char *row = NULL; + int rc; + + if (asprintf(&row, "%s %u (%s/%s)", prefix, other_id, name, type) < 0) + return log_oom(); + + rc = table_add_many(table, + TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_RIGHT), + TABLE_STRING, row, + TABLE_EMPTY, + TABLE_EMPTY); + if (rc < 0) + return table_log_add_error(r); + } + + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +struct job_info { + uint32_t id; + const char *name, *type, *state; +}; + +static int output_jobs_list(sd_bus *bus, const struct job_info* jobs, unsigned n, bool skipped) { + _cleanup_(table_unrefp) Table *table = NULL; + const char *on, *off; + int r; + + assert(n == 0 || jobs); + + if (n == 0) { + if (arg_legend != 0) { + on = ansi_highlight_green(); + 
off = ansi_normal(); + + printf("%sNo jobs %s.%s\n", on, skipped ? "listed" : "running", off); + } + return 0; + } + + pager_open(arg_pager_flags); + + table = table_new("job", "unit", "type", "state"); + if (!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (const struct job_info *j = jobs; j < jobs + n; j++) { + if (streq(j->state, "running")) + on = ansi_highlight(); + else + on = ""; + + r = table_add_many(table, + TABLE_UINT, j->id, + TABLE_STRING, j->name, + TABLE_SET_COLOR, on, + TABLE_STRING, j->type, + TABLE_STRING, j->state, + TABLE_SET_COLOR, on); + if (r < 0) + return table_log_add_error(r); + + if (arg_jobs_after) + output_waiting_jobs(bus, table, j->id, "GetJobAfter", "\twaiting for job"); + if (arg_jobs_before) + output_waiting_jobs(bus, table, j->id, "GetJobBefore", "\tblocking job"); + } + + r = table_print(table, NULL); + if (r < 0) + return log_error_errno(r, "Failed to print the table: %m"); + + if (arg_legend != 0) { + on = ansi_highlight(); + off = ansi_normal(); + + printf("\n%s%u jobs listed%s.\n", on, n, off); + } + + return 0; +} + +static bool output_show_job(struct job_info *job, char **patterns) { + return strv_fnmatch_or_empty(patterns, job->name, FNM_NOESCAPE); +} + +int verb_list_jobs(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ struct job_info *jobs = NULL; + const char *name, *type, *state; + bool skipped = false; + unsigned c = 0; + sd_bus *bus; + uint32_t id; + int r; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_systemd_mgr, "ListJobs", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to list jobs: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, 
'a', "(usssoo)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(usssoo)", &id, &name, &type, &state, NULL, NULL)) > 0) { + struct job_info job = { id, name, type, state }; + + if (!output_show_job(&job, strv_skip(argv, 1))) { + skipped = true; + continue; + } + + if (!GREEDY_REALLOC(jobs, c + 1)) + return log_oom(); + + jobs[c++] = job; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + pager_open(arg_pager_flags); + + return output_jobs_list(bus, jobs, c, skipped); +} diff --git a/src/systemctl/systemctl-list-jobs.h b/src/systemctl/systemctl-list-jobs.h new file mode 100644 index 0000000..b10ec79 --- /dev/null +++ b/src/systemctl/systemctl-list-jobs.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_list_jobs(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-list-machines.c b/src/systemctl/systemctl-list-machines.c new file mode 100644 index 0000000..4407d25 --- /dev/null +++ b/src/systemctl/systemctl-list-machines.c @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-login.h" + +#include "bus-map-properties.h" +#include "hostname-util.h" +#include "locale-util.h" +#include "memory-util.h" +#include "sort-util.h" +#include "systemctl-list-machines.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" + +const struct bus_properties_map machine_info_property_map[] = { + /* Might good to keep same order here as in bus_manager_vtable[], server side */ + { "Version", "s", NULL, offsetof(struct machine_info, version) }, + { "Tainted", "s", NULL, offsetof(struct machine_info, tainted) }, + { "UserspaceTimestamp", "t", NULL, offsetof(struct machine_info, timestamp) }, + { "NNames", "u", NULL, offsetof(struct machine_info, n_names) }, + { "NFailedUnits", "u", NULL, offsetof(struct 
machine_info, n_failed_units) }, + { "NJobs", "u", NULL, offsetof(struct machine_info, n_jobs) }, + { "ControlGroup", "s", NULL, offsetof(struct machine_info, control_group) }, + { "SystemState", "s", NULL, offsetof(struct machine_info, state) }, + {} +}; + +void machine_info_clear(struct machine_info *info) { + assert(info); + + free(info->name); + free(info->version); + free(info->tainted); + free(info->control_group); + free(info->state); + zero(*info); +} + +static void free_machines_list(struct machine_info *machine_infos, int n) { + if (!machine_infos) + return; + + for (int i = 0; i < n; i++) + machine_info_clear(&machine_infos[i]); + + free(machine_infos); +} + +static int compare_machine_info(const struct machine_info *a, const struct machine_info *b) { + int r; + + r = CMP(b->is_host, a->is_host); + if (r != 0) + return r; + + return strcasecmp(a->name, b->name); +} + +static int get_machine_properties(sd_bus *bus, struct machine_info *mi) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *container = NULL; + int r; + + assert(mi); + + if (!bus) { + r = sd_bus_open_system_machine(&container, mi->name); + if (r < 0) + return r; + + bus = container; + } + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + machine_info_property_map, + BUS_MAP_STRDUP, + NULL, + NULL, + mi); + if (r < 0) + return r; + + return 0; +} + +static bool output_show_machine(const char *name, char **patterns) { + return strv_fnmatch_or_empty(patterns, name, FNM_NOESCAPE); +} + +static int get_machine_list( + sd_bus *bus, + struct machine_info **_machine_infos, + char **patterns) { + + struct machine_info *machine_infos = NULL; + _cleanup_strv_free_ char **m = NULL; + _cleanup_free_ char *hn = NULL; + int c = 0, r; + + hn = gethostname_malloc(); + if (!hn) + return log_oom(); + + if (output_show_machine(hn, patterns)) { + if (!GREEDY_REALLOC0(machine_infos, c+1)) + return log_oom(); + + machine_infos[c].is_host = true; + 
machine_infos[c].name = TAKE_PTR(hn); + + (void) get_machine_properties(bus, &machine_infos[c]); + c++; + } + + r = sd_get_machine_names(&m); + if (r < 0) + return log_error_errno(r, "Failed to get machine list: %m"); + + STRV_FOREACH(i, m) { + _cleanup_free_ char *class = NULL; + + if (!output_show_machine(*i, patterns)) + continue; + + sd_machine_get_class(*i, &class); + if (!streq_ptr(class, "container")) + continue; + + if (!GREEDY_REALLOC0(machine_infos, c+1)) { + free_machines_list(machine_infos, c); + return log_oom(); + } + + machine_infos[c].is_host = false; + machine_infos[c].name = strdup(*i); + if (!machine_infos[c].name) { + free_machines_list(machine_infos, c); + return log_oom(); + } + + (void) get_machine_properties(NULL, &machine_infos[c]); + c++; + } + + *_machine_infos = machine_infos; + return c; +} + +static int output_machines_list(struct machine_info *machine_infos, unsigned n) { + _cleanup_(table_unrefp) Table *table = NULL; + bool state_missing = false; + int r; + + assert(machine_infos || n == 0); + + table = table_new("", "name", "state", "failed", "jobs"); + if (!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_plain) { + /* Hide the 'glyph' column when --plain is requested */ + r = table_hide_column_from_display(table, 0); + if (r < 0) + return log_error_errno(r, "Failed to hide column: %m"); + } + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (struct machine_info *m = machine_infos; m < machine_infos + n; m++) { + _cleanup_free_ char *mname = NULL; + const char *on_state = "", *on_failed = ""; + bool circle = false; + + if (streq_ptr(m->state, "degraded")) { + on_state = ansi_highlight_red(); + circle = true; + } else if (!streq_ptr(m->state, "running")) { + on_state = ansi_highlight_yellow(); + circle = true; + } + + if (m->n_failed_units > 0) + on_failed = ansi_highlight_red(); + else + on_failed = ""; + + if (!m->state) + state_missing = 
true; + + if (m->is_host) + mname = strjoin(strna(m->name), " (host)"); + + r = table_add_many(table, + TABLE_STRING, circle ? special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE) : " ", + TABLE_SET_COLOR, on_state, + TABLE_STRING, m->is_host ? mname : strna(m->name), + TABLE_STRING, strna(m->state), + TABLE_SET_COLOR, on_state, + TABLE_UINT32, m->n_failed_units, + TABLE_SET_COLOR, on_failed, + TABLE_UINT32, m->n_jobs); + if (r < 0) + return table_log_add_error(r); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) { + printf("\n"); + if (state_missing && geteuid() != 0) + printf("Notice: some information only available to privileged users was not shown.\n"); + printf("%u machines listed.\n", n); + } + + return 0; +} + +int verb_list_machines(int argc, char *argv[], void *userdata) { + struct machine_info *machine_infos = NULL; + sd_bus *bus; + int r, rc; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = get_machine_list(bus, &machine_infos, strv_skip(argv, 1)); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + typesafe_qsort(machine_infos, r, compare_machine_info); + rc = output_machines_list(machine_infos, r); + free_machines_list(machine_infos, r); + + return rc; +} diff --git a/src/systemctl/systemctl-list-machines.h b/src/systemctl/systemctl-list-machines.h new file mode 100644 index 0000000..95a6859 --- /dev/null +++ b/src/systemctl/systemctl-list-machines.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "bus-map-properties.h" +#include "time-util.h" + +int verb_list_machines(int argc, char *argv[], void *userdata); + +struct machine_info { + bool is_host; + char *name; + char *version; + char *tainted; + uint64_t timestamp; + uint32_t n_names; + uint32_t n_failed_units; + uint32_t n_jobs; + char *control_group; + char *state; +}; + +void machine_info_clear(struct machine_info *info); + +extern const struct bus_properties_map 
machine_info_property_map[]; diff --git a/src/systemctl/systemctl-list-unit-files.c b/src/systemctl/systemctl-list-unit-files.c new file mode 100644 index 0000000..fc1ad98 --- /dev/null +++ b/src/systemctl/systemctl-list-unit-files.c @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "sort-util.h" +#include "systemctl-list-unit-files.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" + +static int compare_unit_file_list(const UnitFileList *a, const UnitFileList *b) { + const char *d1, *d2; + + d1 = strrchr(a->path, '.'); + d2 = strrchr(b->path, '.'); + + if (d1 && d2) { + int r; + + r = strcasecmp(d1, d2); + if (r != 0) + return r; + } + + return strcasecmp(basename(a->path), basename(b->path)); +} + +static bool output_show_unit_file(const UnitFileList *u, char **states, char **patterns) { + assert(u); + + if (!strv_fnmatch_or_empty(patterns, basename(u->path), FNM_NOESCAPE)) + return false; + + if (!strv_isempty(arg_types)) { + const char *dot; + + dot = strrchr(u->path, '.'); + if (!dot) + return false; + + if (!strv_contains(arg_types, dot+1)) + return false; + } + + if (!strv_isempty(states) && + !strv_contains(states, unit_file_state_to_string(u->state))) + return false; + + return true; +} + +static const char* preset_action_to_color(PresetAction action, bool underline) { + assert(action >= 0); + + switch (action) { + case PRESET_ENABLE: + return underline ? ansi_highlight_green_underline() : ansi_highlight_green(); + case PRESET_DISABLE: + return underline ? ansi_highlight_red_underline() : ansi_highlight_red(); + case PRESET_IGNORE: + return underline ? 
ansi_highlight_yellow_underline() : ansi_highlight_yellow(); + default: + return NULL; + } +} + +static int output_unit_file_list(const UnitFileList *units, unsigned c) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_(unit_file_presets_done) UnitFilePresets presets = {}; + int r; + + table = table_new("unit file", "state", "preset"); + if (!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + for (const UnitFileList *u = units; u < units + c; u++) { + const char *on_underline = NULL, *on_unit_color = NULL, *id; + bool underline; + + underline = u + 1 < units + c && + !streq(unit_type_suffix(u->path), unit_type_suffix((u + 1)->path)); + + if (underline) + on_underline = ansi_underline(); + + if (IN_SET(u->state, + UNIT_FILE_MASKED, + UNIT_FILE_MASKED_RUNTIME, + UNIT_FILE_DISABLED, + UNIT_FILE_BAD)) + on_unit_color = underline ? ansi_highlight_red_underline() : ansi_highlight_red(); + else if (IN_SET(u->state, + UNIT_FILE_ENABLED, + UNIT_FILE_ALIAS)) + on_unit_color = underline ? ansi_highlight_green_underline() : ansi_highlight_green(); + else + on_unit_color = on_underline; + + id = basename(u->path); + + r = table_add_many(table, + TABLE_STRING, id, + TABLE_SET_BOTH_COLORS, strempty(on_underline), + TABLE_STRING, unit_file_state_to_string(u->state), + TABLE_SET_BOTH_COLORS, strempty(on_unit_color)); + if (r < 0) + return table_log_add_error(r); + + if (show_preset_for_state(u->state)) { + const char *on_preset_color = underline ? 
on_underline : ansi_normal(); + + r = unit_file_query_preset(arg_runtime_scope, arg_root, id, &presets); + if (r >= 0) + on_preset_color = preset_action_to_color(r, underline); + + r = table_add_many(table, + TABLE_STRING, strna(preset_action_past_tense_to_string(r)), + TABLE_SET_BOTH_COLORS, strempty(on_preset_color)); + } else + r = table_add_many(table, + TABLE_EMPTY, + TABLE_SET_BOTH_COLORS, underline ? ansi_grey_underline() : ansi_grey()); + if (r < 0) + return table_log_add_error(r); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) + printf("\n%u unit files listed.\n", c); + + return 0; +} + +int verb_list_unit_files(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ UnitFileList *units = NULL; + _cleanup_hashmap_free_ Hashmap *h = NULL; + unsigned c = 0; + const char *state; + char *path; + int r; + bool fallback = false; + + if (install_client_side()) { + UnitFileList *u; + unsigned n_units; + + h = hashmap_new(&unit_file_list_hash_ops_free); + if (!h) + return log_oom(); + + r = unit_file_get_list(arg_runtime_scope, arg_root, h, arg_states, strv_skip(argv, 1)); + if (r < 0) + return log_error_errno(r, "Failed to get unit file list: %m"); + + n_units = hashmap_size(h); + + units = new(UnitFileList, n_units); + if (!units) + return log_oom(); + + HASHMAP_FOREACH(u, h) { + if (!output_show_unit_file(u, NULL, NULL)) + continue; + + units[c++] = *u; + } + + assert(c <= n_units); + } else { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "ListUnitFilesByPatterns"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, arg_states); + if (r < 0) + return bus_log_create_error(r); + + if 
(arg_with_dependencies) { + _cleanup_strv_free_ char **names_with_deps = NULL; + + r = append_unit_dependencies(bus, strv_skip(argv, 1), &names_with_deps); + if (r < 0) + return log_error_errno(r, "Failed to append unit dependencies: %m"); + + r = sd_bus_message_append_strv(m, names_with_deps); + if (r < 0) + return bus_log_create_error(r); + } else { + r = sd_bus_message_append_strv(m, strv_skip(argv, 1)); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0 && sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)) { + /* Fallback to legacy ListUnitFiles method */ + fallback = true; + log_debug_errno(r, "Failed to list unit files: %s Falling back to ListUnitsFiles method.", bus_error_message(&error, r)); + m = sd_bus_message_unref(m); + sd_bus_error_free(&error); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "ListUnitFiles"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + } + if (r < 0) + return log_error_errno(r, "Failed to list unit files: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(ss)", &path, &state)) > 0) { + + if (!GREEDY_REALLOC(units, c + 1)) + return log_oom(); + + units[c] = (struct UnitFileList) { + path, + unit_file_state_from_string(state) + }; + + if (output_show_unit_file(&units[c], + fallback ? arg_states : NULL, + fallback ? 
strv_skip(argv, 1) : NULL)) + c++; + + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + } + + pager_open(arg_pager_flags); + + typesafe_qsort(units, c, compare_unit_file_list); + r = output_unit_file_list(units, c); + if (r < 0) + return r; + + if (c == 0) + return -ENOENT; + + return 0; +} diff --git a/src/systemctl/systemctl-list-unit-files.h b/src/systemctl/systemctl-list-unit-files.h new file mode 100644 index 0000000..4819fbd --- /dev/null +++ b/src/systemctl/systemctl-list-unit-files.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_list_unit_files(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-list-units.c b/src/systemctl/systemctl-list-units.c new file mode 100644 index 0000000..fbc04b7 --- /dev/null +++ b/src/systemctl/systemctl-list-units.c @@ -0,0 +1,1191 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-login.h" + +#include "bus-error.h" +#include "bus-locator.h" +#include "format-table.h" +#include "locale-util.h" +#include "path-util.h" +#include "set.h" +#include "sort-util.h" +#include "systemctl-list-units.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" + +static int get_unit_list_recursive( + sd_bus *bus, + char **patterns, + UnitInfo **ret_unit_infos, + Set **ret_replies) { + + _cleanup_free_ UnitInfo *unit_infos = NULL; + _cleanup_set_free_ Set *replies = NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int c, r; + + assert(bus); + assert(ret_replies); + assert(ret_unit_infos); + + c = get_unit_list(bus, NULL, patterns, &unit_infos, 0, &reply); + if (c < 0) + return c; + + r = set_ensure_consume(&replies, &bus_message_hash_ops, TAKE_PTR(reply)); + if (r < 0) + return log_oom(); + + if (arg_recursive) { + _cleanup_strv_free_ char **machines = NULL; + + r = sd_get_machine_names(&machines); + if (r < 0) + 
return log_error_errno(r, "Failed to get machine names: %m"); + + STRV_FOREACH(i, machines) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *container = NULL; + int k; + + r = sd_bus_open_system_machine(&container, *i); + if (r < 0) { + log_warning_errno(r, "Failed to connect to container %s, ignoring: %m", *i); + continue; + } + + k = get_unit_list(container, *i, patterns, &unit_infos, c, &reply); + if (k < 0) + return k; + + c = k; + + r = set_consume(replies, TAKE_PTR(reply)); + if (r < 0) + return log_oom(); + } + } + + *ret_unit_infos = TAKE_PTR(unit_infos); + *ret_replies = TAKE_PTR(replies); + + return c; +} + +static void output_legend(const char *type, size_t n_items) { + const char *on, *off; + + assert(type); + + on = n_items > 0 ? ansi_highlight() : ansi_highlight_red(); + off = ansi_normal(); + + printf("\n%s%zu %ss listed.%s\n", on, n_items, type, off); + if (!arg_all) + printf("Pass --all to see loaded but inactive %ss, too.\n", type); +} + +static int table_add_triggered(Table *table, char **triggered) { + assert(table); + + if (strv_isempty(triggered)) + return table_add_cell(table, NULL, TABLE_EMPTY, NULL); + else if (strv_length(triggered) == 1) + return table_add_cell(table, NULL, TABLE_STRING, triggered[0]); + else + /* This should never happen, currently our socket units can only trigger a + * single unit. But let's handle this anyway, who knows what the future + * brings? */ + return table_add_cell(table, NULL, TABLE_STRV, triggered); +} + +static char *format_unit_id(const char *unit, const char *machine) { + assert(unit); + + return machine ? 
strjoin(machine, ":", unit) : strdup(unit); +} + +static int output_units_list(const UnitInfo *unit_infos, size_t c) { + _cleanup_(table_unrefp) Table *table = NULL; + size_t job_count = 0; + int r; + + table = table_new("", "unit", "load", "active", "sub", "job", "description"); + if (!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_plain) { + /* Hide the 'glyph' column when --plain is requested */ + r = table_hide_column_from_display(table, 0); + if (r < 0) + return log_error_errno(r, "Failed to hide column: %m"); + } + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + FOREACH_ARRAY(u, unit_infos, c) { + _cleanup_free_ char *id = NULL; + const char *on_underline = "", *on_loaded = "", *on_active = "", *on_circle = ""; + bool circle = false, underline = false; + + if (u + 1 < unit_infos + c && + !streq(unit_type_suffix(u->id), unit_type_suffix((u + 1)->id))) { + on_underline = ansi_underline(); + underline = true; + } + + if (STR_IN_SET(u->load_state, "error", "not-found", "bad-setting", "masked") && !arg_plain) { + on_circle = underline ? ansi_highlight_yellow_underline() : ansi_highlight_yellow(); + circle = true; + on_loaded = underline ? ansi_highlight_red_underline() : ansi_highlight_red(); + } else if (streq(u->active_state, "failed") && !arg_plain) { + on_circle = underline ? ansi_highlight_red_underline() : ansi_highlight_red(); + circle = true; + on_active = underline ? ansi_highlight_red_underline() : ansi_highlight_red(); + } else { + on_circle = on_underline; + on_active = on_underline; + on_loaded = on_underline; + } + + id = format_unit_id(u->id, u->machine); + if (!id) + return log_oom(); + + r = table_add_many(table, + TABLE_STRING, circle ? 
special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE) : " ", + TABLE_SET_BOTH_COLORS, on_circle, + TABLE_STRING, id, + TABLE_SET_BOTH_COLORS, on_active, + TABLE_STRING, u->load_state, + TABLE_SET_BOTH_COLORS, on_loaded, + TABLE_STRING, u->active_state, + TABLE_SET_BOTH_COLORS, on_active, + TABLE_STRING, u->sub_state, + TABLE_SET_BOTH_COLORS, on_active, + TABLE_STRING, u->job_id ? u->job_type: "", + TABLE_SET_BOTH_COLORS, on_underline, + TABLE_STRING, u->description, + TABLE_SET_BOTH_COLORS, on_underline); + if (r < 0) + return table_log_add_error(r); + + /* Count units with a pending job so that the JOB column and its legend entry can be dropped when there are none */ if (u->job_id != 0) + job_count++; + } + + if (job_count == 0) { + /* There's no data in the JOB column, so let's hide it */ + r = table_hide_column_from_display(table, 5); + if (r < 0) + return log_error_errno(r, "Failed to hide column: %m"); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) { + const char *on, *off; + /* NOTE(review): the -1 presumably discounts the table's header row - confirm against table_get_rows() */ size_t records = table_get_rows(table) - 1; + + if (records > 0) { + printf("\n" + "%1$sLegend: LOAD %2$s Reflects whether the unit definition was properly loaded.%3$s\n" + "%1$s ACTIVE %2$s The high-level unit activation state, i.e. generalization of SUB.%3$s\n" + "%1$s SUB %2$s The low-level unit activation state, values depend on unit type.%3$s\n", + ansi_grey(), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + ansi_normal()); + if (job_count > 0) + printf("%s JOB %s Pending job for the unit.%s\n", + ansi_grey(), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + ansi_normal()); + } + + putchar('\n'); + + on = records > 0 ? 
ansi_highlight() : ansi_highlight_red(); + off = ansi_normal(); + + /* The --all hint is only printed when neither --all nor a state filter is in effect */ if (arg_all || strv_contains(arg_states, "inactive")) + printf("%s%zu loaded units listed.%s\n" + "%sTo show all installed unit files use 'systemctl list-unit-files'.%s\n", + on, records, off, + ansi_grey(), ansi_normal()); + else if (!arg_states) + printf("%s%zu loaded units listed.%s %sPass --all to see loaded but inactive units, too.%s\n" + "%sTo show all installed unit files use 'systemctl list-unit-files'.%s\n", + on, records, off, + ansi_grey(), ansi_normal(), ansi_grey(), ansi_normal()); + else + printf("%zu loaded units listed.\n", records); + } + + return 0; +} + +/* Handler for listing units: collects the units matching the command line arguments (first expanded through append_unit_dependencies() when arg_with_dependencies is set), sorts them and prints them. Returns a negative value on failure, otherwise the result of output_units_list(). */ int verb_list_units(int argc, char *argv[], void *userdata) { + _cleanup_free_ UnitInfo *unit_infos = NULL; + _cleanup_set_free_ Set *replies = NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + if (arg_with_dependencies) { + _cleanup_strv_free_ char **names = NULL; + + r = append_unit_dependencies(bus, strv_skip(argv, 1), &names); + if (r < 0) + return r; + + r = get_unit_list_recursive(bus, names, &unit_infos, &replies); + if (r < 0) + return r; + } else { + r = get_unit_list_recursive(bus, strv_skip(argv, 1), &unit_infos, &replies); + if (r < 0) + return r; + } + + /* At this point r is non-negative and is used as the number of entries in unit_infos */ typesafe_qsort(unit_infos, r, unit_info_compare); + return output_units_list(unit_infos, r); +} + +/* Reads the "Triggers" property of the unit object at 'path' into *ret. Logs and returns a negative value on failure, 0 on success. */ static int get_triggered_units( + sd_bus *bus, + const char* path, + char*** ret) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + assert(path); + assert(ret); + + r = sd_bus_get_property_strv( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "Triggers", + &error, + ret); + if (r < 0) + return log_error_errno(r, "Failed to determine triggers: %s", bus_error_message(&error, r)); + + return 0; +} + +typedef struct SocketInfo { + const char *machine; + const char* id; + + char* type; + char* path; /* 
absolute path or socket address */ + + /* Note: triggered is a list here, although it almost certainly will always be one + * unit. Nevertheless, dbus API allows for multiple values, so let's follow that. */ + char** triggered; +} SocketInfo; + +/* Frees the type, path and triggered members of every entry and then the array itself; machine and id are not freed here. */ static void socket_info_array_free(SocketInfo *sockets, size_t n_sockets) { + assert(sockets || n_sockets == 0); + + FOREACH_ARRAY(s, sockets, n_sockets) { + free(s->type); + free(s->path); + strv_free(s->triggered); + } + + free(sockets); +} + +/* Sort order: machine (case-insensitive), then whether the path is absolute, then the path itself (path_compare() for absolute paths, strcmp() otherwise), then the type. */ static int socket_info_compare(const SocketInfo *a, const SocketInfo *b) { + int r; + + assert(a); + assert(b); + + r = strcasecmp_ptr(a->machine, b->machine); + if (r != 0) + return r; + + r = CMP(path_is_absolute(a->path), path_is_absolute(b->path)); + if (r != 0) + return r; + + r = path_is_absolute(a->path) ? path_compare(a->path, b->path) : strcmp(a->path, b->path); + if (r != 0) + return r; + + return strcmp(a->type, b->type); +} + +/* Appends one SocketInfo per entry of the unit's "Listen" property to *sockets and advances *n_sockets. Units whose id does not end in ".socket" are skipped and 0 is returned. */ static int socket_info_add( + sd_bus *bus, + const UnitInfo *u, + SocketInfo **sockets, + size_t *n_sockets) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_strv_free_ char **triggered = NULL; + const char *type, *path; + int r; + + assert(bus); + assert(u); + assert(sockets); + assert(n_sockets); + + if (!endswith(u->id, ".socket")) + return 0; + + r = get_triggered_units(bus, u->unit_path, &triggered); + if (r < 0) + return r; + + r = sd_bus_get_property( + bus, + "org.freedesktop.systemd1", + u->unit_path, + "org.freedesktop.systemd1.Socket", + "Listen", + &error, + &reply, + "a(ss)"); + if (r < 0) + return log_error_errno(r, "Failed to get list of listening sockets: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(ss)", &type, &path)) > 0) { + _cleanup_free_ char *type_dup = NULL, *path_dup = 
NULL; + _cleanup_strv_free_ char **triggered_dup = NULL; + + /* Every entry gets its own copies of type, path and the triggered list; machine and id are stored as plain pointers taken from u */ type_dup = strdup(type); + if (!type_dup) + return log_oom(); + + path_dup = strdup(path); + if (!path_dup) + return log_oom(); + + triggered_dup = strv_copy(triggered); + if (!triggered_dup) + return log_oom(); + + if (!GREEDY_REALLOC(*sockets, *n_sockets + 1)) + return log_oom(); + + (*sockets)[(*n_sockets)++] = (SocketInfo) { + .machine = u->machine, + .id = u->id, + .type = TAKE_PTR(type_dup), + .path = TAKE_PTR(path_dup), + .triggered = TAKE_PTR(triggered_dup), + }; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +/* Prints the n_sockets entries as a table with listen, type, unit and activates columns, followed by the legend unless arg_legend is 0. Returns 0 on success, a negative value on failure. */ static int output_sockets_list(const SocketInfo *sockets, size_t n_sockets) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(sockets || n_sockets == 0); + + table = table_new("listen", "type", "unit", "activates"); + if (!table) + return log_oom(); + + if (!arg_show_types) { + /* Hide the second (TYPE) column */ + r = table_set_display(table, (size_t) 0, (size_t) 2, (size_t) 3); + if (r < 0) + return log_error_errno(r, "Failed to set columns to display: %m"); + } + + table_set_header(table, arg_legend != 0); + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + FOREACH_ARRAY(s, sockets, n_sockets) { + _cleanup_free_ char *unit = NULL; + + unit = format_unit_id(s->id, s->machine); + if (!unit) + return log_oom(); + + r = table_add_many(table, + TABLE_STRING, s->path, + TABLE_STRING, s->type, + TABLE_STRING, unit); + if (r < 0) + return table_log_add_error(r); + + r = table_add_triggered(table, s->triggered); + if (r < 0) + return table_log_add_error(r); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) + output_legend("socket", n_sockets); + + return 0; +} + +int verb_list_sockets(int argc, char *argv[], void *userdata) { + _cleanup_set_free_ Set *replies = NULL; + 
_cleanup_strv_free_ char **sockets_with_suffix = NULL; + _cleanup_free_ UnitInfo *unit_infos = NULL; + SocketInfo *sockets = NULL; + size_t n_sockets = 0; + sd_bus *bus; + int r; + + CLEANUP_ARRAY(sockets, n_sockets, socket_info_array_free); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + r = expand_unit_names(bus, strv_skip(argv, 1), ".socket", &sockets_with_suffix, NULL); + if (r < 0) + return r; + + if (argc == 1 || sockets_with_suffix) { + int n; + + n = get_unit_list_recursive(bus, sockets_with_suffix, &unit_infos, &replies); + if (n < 0) + return n; + + FOREACH_ARRAY(u, unit_infos, n) { + r = socket_info_add(bus, u, &sockets, &n_sockets); + if (r < 0) + return r; + } + } + + typesafe_qsort(sockets, n_sockets, socket_info_compare); + output_sockets_list(sockets, n_sockets); + + return 0; +} + +static int get_next_elapse( + sd_bus *bus, + const char *path, + dual_timestamp *next) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + dual_timestamp t; + int r; + + assert(bus); + assert(path); + assert(next); + + r = sd_bus_get_property_trivial( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Timer", + "NextElapseUSecMonotonic", + &error, + 't', + &t.monotonic); + if (r < 0) + return log_error_errno(r, "Failed to get next elapse time: %s", bus_error_message(&error, r)); + + r = sd_bus_get_property_trivial( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Timer", + "NextElapseUSecRealtime", + &error, + 't', + &t.realtime); + if (r < 0) + return log_error_errno(r, "Failed to get next elapse time: %s", bus_error_message(&error, r)); + + *next = t; + return 0; +} + +static int get_last_trigger( + sd_bus *bus, + const char *path, + dual_timestamp *last) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + dual_timestamp t; + int r; + + assert(bus); + assert(path); + assert(last); + + r = sd_bus_get_property_trivial( + 
bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Timer", + "LastTriggerUSec", + &error, + 't', + &t.realtime); + if (r < 0) + return log_error_errno(r, "Failed to get last trigger time: %s", bus_error_message(&error, r)); + + r = sd_bus_get_property_trivial( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Timer", + "LastTriggerUSecMonotonic", + &error, + 't', + &t.monotonic); + if (r < 0) + return log_error_errno(r, "Failed to get last trigger time: %s", bus_error_message(&error, r)); + + /* The output is only written once both properties were read successfully */ *last = t; + return 0; +} + +/* One row of the timer listing. */ typedef struct TimerInfo { + const char* machine; + const char* id; + usec_t next_elapse; + dual_timestamp last_trigger; + char **triggered; +} TimerInfo; + +/* Frees the triggered list of every entry and then the array itself; machine and id are not freed here. */ static void timer_info_array_free(TimerInfo *timers, size_t n_timers) { + assert(timers || n_timers == 0); + + FOREACH_ARRAY(t, timers, n_timers) + strv_free(t->triggered); + + free(timers); +} + +/* Sort order: machine (case-insensitive), then next elapse time, then unit id. */ static int timer_info_compare(const TimerInfo *a, const TimerInfo *b) { + int r; + + assert(a); + assert(b); + + r = strcasecmp_ptr(a->machine, b->machine); + if (r != 0) + return r; + + r = CMP(a->next_elapse, b->next_elapse); + if (r != 0) + return r; + + return strcmp(a->id, b->id); +} + +/* Prints the n_timers entries as a table with next, left, last, passed, unit and activates columns, followed by the legend unless arg_legend is 0. */ static int output_timers_list(const TimerInfo *timers, size_t n_timers) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(timers || n_timers == 0); + + table = table_new("next", "left", "last", "passed", "unit", "activates"); + if (!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + /* NOTE(review): presumably addresses the header cells of the "left" and "passed" columns - confirm the argument order of table_get_cell(); failures are explicitly ignored */ (void) table_set_align_percent(table, table_get_cell(table, 0, 1), 100); + (void) table_set_align_percent(table, table_get_cell(table, 0, 3), 100); + + FOREACH_ARRAY(t, timers, n_timers) { + _cleanup_free_ char *unit = NULL; + + unit = format_unit_id(t->id, t->machine); + if (!unit) + return log_oom(); + + r = table_add_many(table, + TABLE_TIMESTAMP, 
t->next_elapse, + TABLE_TIMESTAMP_LEFT, t->next_elapse, + TABLE_TIMESTAMP, t->last_trigger.realtime, + TABLE_TIMESTAMP_RELATIVE_MONOTONIC, t->last_trigger.monotonic, + TABLE_STRING, unit); + if (r < 0) + return table_log_add_error(r); + + r = table_add_triggered(table, t->triggered); + if (r < 0) + return table_log_add_error(r); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) + output_legend("timer", n_timers); + + return 0; +} + +/* Computes the next elapse time on the realtime clock. A set monotonic elapse time is translated to realtime using 'nw' as the reference point; if a realtime elapse time is set too, the smaller of the two is returned. Without a monotonic elapse time, next->realtime is returned unchanged. */ usec_t calc_next_elapse(const dual_timestamp *nw, const dual_timestamp *next) { + usec_t next_elapse; + + assert(nw); + assert(next); + + if (timestamp_is_set(next->monotonic)) { + usec_t converted; + + /* Always subtract the smaller monotonic stamp from the larger one (presumably because usec_t is unsigned - confirm) */ if (next->monotonic > nw->monotonic) + converted = nw->realtime + (next->monotonic - nw->monotonic); + else + converted = nw->realtime - (nw->monotonic - next->monotonic); + + if (timestamp_is_set(next->realtime)) + next_elapse = MIN(converted, next->realtime); + else + next_elapse = converted; + + } else + next_elapse = next->realtime; + + return next_elapse; +} + +/* Appends a TimerInfo for the given unit to *timers and advances *n_timers. Units whose id does not end in ".timer" are skipped and 0 is returned. */ static int add_timer_info( + sd_bus *bus, + const UnitInfo *u, + const dual_timestamp *nw, + TimerInfo **timers, + size_t *n_timers) { + + _cleanup_strv_free_ char **triggered = NULL; + dual_timestamp next, last; + usec_t m; + int r; + + assert(bus); + assert(u); + assert(nw); + assert(timers); + assert(n_timers); + + if (!endswith(u->id, ".timer")) + return 0; + + r = get_triggered_units(bus, u->unit_path, &triggered); + if (r < 0) + return r; + + r = get_next_elapse(bus, u->unit_path, &next); + if (r < 0) + return r; + + r = get_last_trigger(bus, u->unit_path, &last); + if (r < 0) + return r; + + m = calc_next_elapse(nw, &next); + + if (!GREEDY_REALLOC(*timers, *n_timers + 1)) + return log_oom(); + + /* The triggered list is moved into the new entry; machine and id are stored as plain pointers taken from u */ (*timers)[(*n_timers)++] = (TimerInfo) { + .machine = u->machine, + .id = u->id, + .next_elapse = m, + .last_trigger = last, + .triggered = TAKE_PTR(triggered), + }; + + return 0; +} + +int verb_list_timers(int argc, char *argv[], void 
*userdata) { + _cleanup_set_free_ Set *replies = NULL; + _cleanup_strv_free_ char **timers_with_suffix = NULL; + _cleanup_free_ UnitInfo *unit_infos = NULL; + TimerInfo *timers = NULL; + size_t n_timers = 0; + sd_bus *bus; + int r; + + CLEANUP_ARRAY(timers, n_timers, timer_info_array_free); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + r = expand_unit_names(bus, strv_skip(argv, 1), ".timer", &timers_with_suffix, NULL); + if (r < 0) + return r; + + if (argc == 1 || timers_with_suffix) { + dual_timestamp nw; + int n; + + n = get_unit_list_recursive(bus, timers_with_suffix, &unit_infos, &replies); + if (n < 0) + return n; + + dual_timestamp_now(&nw); + + FOREACH_ARRAY(u, unit_infos, n) { + r = add_timer_info(bus, u, &nw, &timers, &n_timers); + if (r < 0) + return r; + } + } + + typesafe_qsort(timers, n_timers, timer_info_compare); + output_timers_list(timers, n_timers); + + return 0; +} + +typedef struct AutomountInfo { + const char *machine; + const char *id; + char *what; + char *where; + usec_t timeout_idle_usec; + bool mounted; +} AutomountInfo; + +static void automount_info_array_free(AutomountInfo *automounts, size_t n_automounts) { + assert(automounts || n_automounts == 0); + + FOREACH_ARRAY(i, automounts, n_automounts) { + free(i->what); + free(i->where); + } + + free(automounts); +} + +static int automount_info_compare(const AutomountInfo *a, const AutomountInfo *b) { + int r; + + assert(a); + assert(b); + + r = strcasecmp_ptr(a->machine, b->machine); + if (r != 0) + return r; + + return path_compare(a->where, b->where); +} + +static int automount_info_add( + sd_bus* bus, + const UnitInfo *info, + AutomountInfo **automounts, + size_t *n_automounts) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *mount = NULL, *mount_path = NULL, *where = NULL, *what = NULL, *state = NULL; + uint64_t timeout_idle_usec; + BusLocator locator; + int r; + + assert(bus); + 
assert(info); + assert(automounts); + assert(n_automounts); + + /* Only units whose id ends in ".automount" are of interest; everything else is skipped without error */ if (!endswith(info->id, ".automount")) + return 0; + + locator = (BusLocator) { + .destination = "org.freedesktop.systemd1", + .path = info->unit_path, + .interface = "org.freedesktop.systemd1.Automount", + }; + + r = bus_get_property_string(bus, &locator, "Where", &error, &where); + if (r < 0) + return log_error_errno(r, "Failed to get automount target: %s", bus_error_message(&error, r)); + + r = bus_get_property_trivial(bus, &locator, "TimeoutIdleUSec", &error, 't', &timeout_idle_usec); + if (r < 0) + return log_error_errno(r, "Failed to get idle timeout: %s", bus_error_message(&error, r)); + + /* Derive the name and the D-Bus object path of the .mount unit for the same mount point; the remaining properties are read from that unit */ r = unit_name_from_path(where, ".mount", &mount); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name from path: %m"); + + mount_path = unit_dbus_path_from_name(mount); + if (!mount_path) + return log_oom(); + + locator.path = mount_path; + locator.interface = "org.freedesktop.systemd1.Mount"; + + r = bus_get_property_string(bus, &locator, "What", &error, &what); + if (r < 0) + return log_error_errno(r, "Failed to get mount source: %s", bus_error_message(&error, r)); + + locator.interface = "org.freedesktop.systemd1.Unit"; + + r = bus_get_property_string(bus, &locator, "ActiveState", &error, &state); + if (r < 0) + return log_error_errno(r, "Failed to get mount state: %s", bus_error_message(&error, r)); + + if (!GREEDY_REALLOC(*automounts, *n_automounts + 1)) + return log_oom(); + + /* what and where are moved into the new entry; the entry counts as mounted when the mount unit's ActiveState is "active" */ (*automounts)[(*n_automounts)++] = (AutomountInfo) { + .machine = info->machine, + .id = info->id, + .what = TAKE_PTR(what), + .where = TAKE_PTR(where), + .timeout_idle_usec = timeout_idle_usec, + .mounted = streq_ptr(state, "active"), + }; + + return 0; +} + +/* Prints the n_infos entries as a table with what, where, mounted, idle timeout and unit columns. */ static int output_automounts_list(const AutomountInfo *infos, size_t n_infos) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(infos || n_infos == 0); + + table = table_new("what", "where", "mounted", "idle timeout", "unit"); + if 
(!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + FOREACH_ARRAY(info, infos, n_infos) { + _cleanup_free_ char *unit = NULL; + + unit = format_unit_id(info->id, info->machine); + if (!unit) + return log_oom(); + + r = table_add_many(table, + TABLE_STRING, info->what, + TABLE_STRING, info->where, + TABLE_BOOLEAN, info->mounted); + if (r < 0) + return table_log_add_error(r); + + /* An unset idle timeout is shown as an empty cell rather than as a time span */ if (timestamp_is_set(info->timeout_idle_usec)) + r = table_add_cell(table, NULL, TABLE_TIMESPAN_MSEC, &info->timeout_idle_usec); + else + r = table_add_cell(table, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_STRING, unit); + if (r < 0) + return table_log_add_error(r); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) + output_legend("automount", n_infos); + + return 0; +} + +/* Handler for listing automounts: expands the given names or patterns to ".automount" units, collects one AutomountInfo per matching unit, sorts the result and prints it. Returns 0 on success, a negative value on failure. */ int verb_list_automounts(int argc, char *argv[], void *userdata) { + _cleanup_set_free_ Set *replies = NULL; + _cleanup_strv_free_ char **names = NULL; + _cleanup_free_ UnitInfo *unit_infos = NULL; + AutomountInfo *automounts = NULL; + size_t n_automounts = 0; + sd_bus *bus; + int r; + + CLEANUP_ARRAY(automounts, n_automounts, automount_info_array_free); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + r = expand_unit_names(bus, strv_skip(argv, 1), ".automount", &names, NULL); + if (r < 0) + return r; + + /* Query the manager when no arguments were given or when expand_unit_names() produced a name list. The result array 'automounts' is still NULL here, so it must not be used as the condition. */ if (argc == 1 || names) { + int n; + + n = get_unit_list_recursive(bus, names, &unit_infos, &replies); + if (n < 0) + return n; + + FOREACH_ARRAY(u, unit_infos, n) { + r = automount_info_add(bus, u, &automounts, &n_automounts); + if (r < 0) + return r; + } + + } + + typesafe_qsort(automounts, n_automounts, automount_info_compare); + /* Hand a failure to render the table back to the caller instead of always reporting success */ r = output_automounts_list(automounts, n_automounts); + + return r; +} + +typedef struct PathInfo { + const 
char *machine; + const char *id; + + /* path, condition and triggered are released by path_info_array_free(); machine and id are not. */ char *path; + char *condition; + + /* Note: triggered is a list here, although it almost certainly will always be one + * unit. Nevertheless, dbus API allows for multiple values, so let's follow that. */ + char** triggered; +} PathInfo; + +/* Sort order for PathInfo entries: machine (case-insensitive), then watched path, then condition, then unit id (case-insensitive). */ static int path_info_compare(const PathInfo *a, const PathInfo *b) { + int r; + + assert(a); + assert(b); + + r = strcasecmp_ptr(a->machine, b->machine); + if (r != 0) + return r; + + r = path_compare(a->path, b->path); + if (r != 0) + return r; + + r = strcmp(a->condition, b->condition); + if (r != 0) + return r; + + return strcasecmp_ptr(a->id, b->id); +} + +/* Frees condition, path and triggered of each of the n_paths entries and then the array itself. */ static void path_info_array_free(PathInfo *paths, size_t n_paths) { + assert(paths || n_paths == 0); + + FOREACH_ARRAY(p, paths, n_paths) { + free(p->condition); + free(p->path); + strv_free(p->triggered); + } + + free(paths); +} + +/* If u is a ".path" unit, reads its "Paths" property and appends one PathInfo per (condition, path) pair to *paths, bumping *n_paths. Every appended entry gets its own copy of the list of triggered units. Returns 0 on success (also when u is not a .path unit) or a negative errno-style value. */ static int path_info_add( + sd_bus *bus, + const struct UnitInfo *u, + PathInfo **paths, + size_t *n_paths) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_strv_free_ char **triggered = NULL; + const char *condition, *path; + int r; + + assert(bus); + assert(u); + assert(paths); + assert(n_paths); + + if (!endswith(u->id, ".path")) + return 0; + + r = get_triggered_units(bus, u->unit_path, &triggered); + if (r < 0) + return r; + + r = sd_bus_get_property(bus, + "org.freedesktop.systemd1", + u->unit_path, + "org.freedesktop.systemd1.Path", + "Paths", + &error, + &reply, + "a(ss)"); + if (r < 0) + return log_error_errno(r, "Failed to get paths: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(ss)", &condition, &path)) > 0) { + _cleanup_free_ char *condition_dup = NULL, *path_dup = NULL; + _cleanup_strv_free_ char **triggered_dup = NULL; + + condition_dup = 
strdup(condition); + if (!condition_dup) + return log_oom(); + + path_dup = strdup(path); + if (!path_dup) + return log_oom(); + + triggered_dup = strv_copy(triggered); + if (!triggered_dup) + return log_oom(); + + if (!GREEDY_REALLOC(*paths, *n_paths + 1)) + return log_oom(); + + /* The duplicated strings are handed over to the new array entry; machine and id keep pointing into u. */ (*paths)[(*n_paths)++] = (PathInfo) { + .machine = u->machine, + .id = u->id, + .condition = TAKE_PTR(condition_dup), + .path = TAKE_PTR(path_dup), + .triggered = TAKE_PTR(triggered_dup), + }; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +/* Renders the n_paths entries of paths as a table with the columns "path", "condition", "unit" and "activates", then prints the legend unless arg_legend is 0. Returns 0 on success or a negative errno-style value. */ static int output_paths_list(const PathInfo *paths, size_t n_paths) { + _cleanup_(table_unrefp) Table *table = NULL; + int r; + + assert(paths || n_paths == 0); + + table = table_new("path", "condition", "unit", "activates"); + if (!table) + return log_oom(); + + table_set_header(table, arg_legend != 0); + if (arg_full) + table_set_width(table, 0); + + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + + FOREACH_ARRAY(p, paths, n_paths) { + _cleanup_free_ char *unit = NULL; + + unit = format_unit_id(p->id, p->machine); + if (!unit) + return log_oom(); + + r = table_add_many(table, + TABLE_STRING, p->path, + TABLE_STRING, p->condition, + TABLE_STRING, unit); + if (r < 0) + return table_log_add_error(r); + + r = table_add_triggered(table, p->triggered); + if (r < 0) + return table_log_add_error(r); + } + + r = output_table(table); + if (r < 0) + return r; + + if (arg_legend != 0) + output_legend("path", n_paths); + + return 0; +} + +/* "list-paths" verb: expands the unit names/patterns given in argv[1..] with the ".path" suffix, collects a PathInfo for every watched path of the matching units, sorts the entries and prints them. */ int verb_list_paths(int argc, char *argv[], void *userdata) { + _cleanup_set_free_ Set *replies = NULL; + _cleanup_strv_free_ char **units = NULL; + _cleanup_free_ UnitInfo *unit_infos = NULL; + PathInfo *paths = NULL; + size_t n_paths = 0; + sd_bus *bus; + int r; + + CLEANUP_ARRAY(paths, n_paths, path_info_array_free); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + 
pager_open(arg_pager_flags); + + r = expand_unit_names(bus, strv_skip(argv, 1), ".path", &units, NULL); + if (r < 0) + return r; + + /* Query units when no argument was given or when the arguments expanded to at least one name. */ if (argc == 1 || units) { + int n; + + n = get_unit_list_recursive(bus, units, &unit_infos, &replies); + if (n < 0) + return n; + + FOREACH_ARRAY(u, unit_infos, n) { + r = path_info_add(bus, u, &paths, &n_paths); + if (r < 0) + return r; + } + } + + typesafe_qsort(paths, n_paths, path_info_compare); + /* NOTE(review): the result of output_paths_list() is not propagated to the caller. */ output_paths_list(paths, n_paths); + + return 0; +} diff --git a/src/systemctl/systemctl-list-units.h b/src/systemctl/systemctl-list-units.h new file mode 100644 index 0000000..cb19054 --- /dev/null +++ b/src/systemctl/systemctl-list-units.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_list_units(int argc, char *argv[], void *userdata); +int verb_list_sockets(int argc, char *argv[], void *userdata); +int verb_list_timers(int argc, char *argv[], void *userdata); +int verb_list_automounts(int argc, char *argv[], void *userdata); +int verb_list_paths(int argc, char *argv[], void *userdata); + +usec_t calc_next_elapse(const dual_timestamp *nw, const dual_timestamp *next); diff --git a/src/systemctl/systemctl-log-setting.c b/src/systemctl/systemctl-log-setting.c new file mode 100644 index 0000000..88b2e49 --- /dev/null +++ b/src/systemctl/systemctl-log-setting.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "pretty-print.h" +#include "syslog-util.h" +#include "systemctl-log-setting.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "verb-log-control.h" + +/* Unless arg_quiet is set, logs a hint that the service has to declare BusName= and implement the D-Bus interface, pointing at the org.freedesktop.LogControl1(5) man page (as a terminal link when one could be generated). The name parameter is not used by the body. */ static void give_log_control1_hint(const char *name) { + _cleanup_free_ char *link = NULL; + + if (arg_quiet) + return; + + (void) terminal_urlify_man("org.freedesktop.LogControl1", "5", &link); + + log_notice("Hint: the service must declare BusName= and implement the appropriate D-Bus interface.\n" + " See the %s for details.", link ?: 
"org.freedesktop.LogControl1(5) man page"); +} + +/* Verb handler that forwards the verb name (argv[0]) and the optional value (argv[1]) to verb_log_control_common(), addressed to "org.freedesktop.systemd1". */ int verb_log_setting(int argc, char *argv[], void *userdata) { + sd_bus *bus; + int r; + + assert(argc >= 1 && argc <= 2); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + return verb_log_control_common(bus, "org.freedesktop.systemd1", argv[0], argv[1]); +} + +/* Reads the BusName property of the service unit called name and stores a newly allocated copy in *ret_dbus_name. Returns 0 on success, -ENOLINK (after logging an error and a hint) if the property is empty, or another negative errno-style value on failure. */ static int service_name_to_dbus(sd_bus *bus, const char *name, char **ret_dbus_name) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *bus_name = NULL; + int r; + + /* First, look for the BusName= property */ + _cleanup_free_ char *dbus_path = unit_dbus_path_from_name(name); + if (!dbus_path) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Service", + "BusName", + &error, + &bus_name); + if (r < 0) + return log_error_errno(r, "Failed to obtain BusName= property of %s: %s", + name, bus_error_message(&error, r)); + + if (isempty(bus_name)) { + log_error("Unit %s doesn't declare BusName=.", name); + give_log_control1_hint(name); + return -ENOLINK; + } + + *ret_dbus_name = TAKE_PTR(bus_name); + return 0; +} + +/* Verb handler that mangles argv[1] into a ".service" unit name, resolves the bus name of that service and forwards the verb name (argv[0]) and the optional value (argv[2]) to verb_log_control_common() on that bus name. */ int verb_service_log_setting(int argc, char *argv[], void *userdata) { + sd_bus *bus; + _cleanup_free_ char *unit = NULL, *dbus_name = NULL; + int r; + + assert(argc >= 2 && argc <= 3); + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = unit_name_mangle_with_suffix(argv[1], argv[0], + arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, + ".service", &unit); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + r = service_name_to_dbus(bus, unit, &dbus_name); + if (r < 0) + return r; + + r = verb_log_control_common(bus, dbus_name, argv[0], argv[2]); + + /* On -EBADR, additionally print the LogControl1 hint. */ if (r == -EBADR) + give_log_control1_hint(dbus_name); + + return r; +} diff --git a/src/systemctl/systemctl-log-setting.h b/src/systemctl/systemctl-log-setting.h new file mode 100644 index 0000000..910d6c8 --- /dev/null +++ b/src/systemctl/systemctl-log-setting.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_log_setting(int argc, char *argv[], void *userdata); +int verb_service_log_setting(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-logind.c b/src/systemctl/systemctl-logind.c new file mode 100644 index 0000000..268e528 --- /dev/null +++ b/src/systemctl/systemctl-logind.c @@ -0,0 +1,449 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-login.h" + +#include "bus-error.h" +#include "bus-locator.h" +#include "login-util.h" +#include "process-util.h" +#include "systemctl-logind.h" +#include "systemctl-start-unit.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" +#include "user-util.h" + +/* Joins arg_wall with spaces and passes the result, together with !arg_no_wall, to logind's SetWallMessage method. In dry-run mode the message is only logged. Returns 0 on success (and always when built without logind support) or a negative errno-style value; the callers in this file discard the result. */ static int logind_set_wall_message(sd_bus *bus) { +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *m = NULL; + int r; + + assert(bus); + + m = strv_join(arg_wall, " "); + if (!m) + return log_oom(); + + log_debug("%s wall message \"%s\".", arg_dry_run ? 
"Would set" : "Setting", m); + if (arg_dry_run) + return 0; + + r = bus_call_method(bus, bus_login_mgr, "SetWallMessage", &error, NULL, "sb", m, !arg_no_wall); + if (r < 0) + return log_warning_errno(r, "Failed to set wall message, ignoring: %s", bus_error_message(&error, r)); +#endif + return 0; +} + +/* Ask systemd-logind, which might grant access to unprivileged users through polkit */ +/* Calls the "<Method>WithFlags" variant of the logind method mapped to action a and falls back to the plain method if the variant is reported as unknown. Returns 0 on success (or in dry-run mode), -EINVAL if no method is mapped for a, -ENOSYS when built without logind support, or another negative errno-style value. */ int logind_reboot(enum action a) { +#if ENABLE_LOGIND + static const char* actions[_ACTION_MAX] = { + [ACTION_POWEROFF] = "PowerOff", + [ACTION_REBOOT] = "Reboot", + [ACTION_KEXEC] = "Reboot", + [ACTION_SOFT_REBOOT] = "Reboot", + [ACTION_HALT] = "Halt", + [ACTION_SUSPEND] = "Suspend", + [ACTION_HIBERNATE] = "Hibernate", + [ACTION_HYBRID_SLEEP] = "HybridSleep", + [ACTION_SUSPEND_THEN_HIBERNATE] = "SuspendThenHibernate", + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + uint64_t flags = 0; + sd_bus *bus; + int r; + + assert(a >= 0); + assert(a < _ACTION_MAX); + + if (!actions[a]) + return -EINVAL; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + (void) logind_set_wall_message(bus); + + const char *method_with_flags = strjoina(actions[a], "WithFlags"); + + log_debug("%s org.freedesktop.login1.Manager %s dbus call.", + arg_dry_run ? 
"Would execute" : "Executing", method_with_flags); + + if (arg_dry_run) + return 0; + + SET_FLAG(flags, SD_LOGIND_ROOT_CHECK_INHIBITORS, arg_check_inhibitors > 0); + /* For a plain reboot the kexec and soft-reboot variants are requested as well, unless the respective SYSTEMCTL_SKIP_AUTO_* environment variable evaluates to true. */ SET_FLAG(flags, + SD_LOGIND_REBOOT_VIA_KEXEC, + a == ACTION_KEXEC || (a == ACTION_REBOOT && getenv_bool("SYSTEMCTL_SKIP_AUTO_KEXEC") <= 0)); + SET_FLAG(flags, + SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP, + a == ACTION_REBOOT && getenv_bool("SYSTEMCTL_SKIP_AUTO_SOFT_REBOOT") <= 0); + SET_FLAG(flags, SD_LOGIND_SOFT_REBOOT, a == ACTION_SOFT_REBOOT); + + r = bus_call_method(bus, bus_login_mgr, method_with_flags, &error, NULL, "t", flags); + /* If the call was rejected as having invalid arguments while the soft-reboot-if-nextroot flag was set, retry once without that flag. */ if (r < 0 && FLAGS_SET(flags, SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP) && + sd_bus_error_has_name(&error, SD_BUS_ERROR_INVALID_ARGS)) { + sd_bus_error_free(&error); + r = bus_call_method( + bus, + bus_login_mgr, + method_with_flags, + &error, + NULL, + "t", + flags & ~SD_LOGIND_SOFT_REBOOT_IF_NEXTROOT_SET_UP); + } + if (r >= 0) + return 0; + if (!sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)) + return log_error_errno(r, "Call to %s failed: %s", actions[a], bus_error_message(&error, r)); + + /* Fall back to original methods in case there is an older version of systemd-logind */ + log_debug("Method %s not available: %s. 
Falling back to %s", method_with_flags, bus_error_message(&error, r), actions[a]); + sd_bus_error_free(&error); + + r = bus_call_method(bus, bus_login_mgr, actions[a], &error, NULL, "b", arg_ask_password); + if (r < 0) + return log_error_errno(r, "Call to %s failed: %s", actions[a], bus_error_message(&error, r)); + + return 0; +#else + return -ENOSYS; +#endif +} + +/* Looks for reasons not to carry out action a: inhibitors in "block" mode that cover the action, and sessions of other users. Each finding is logged as a warning; if there is at least one, an error is logged and -EPERM is returned. Returns 0 when nothing was found, when the check is skipped by the early-return conditions below, or when built without logind support. */ int logind_check_inhibitors(enum action a) { +#if ENABLE_LOGIND + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_strv_free_ char **sessions = NULL; + const char *what, *who, *why, *mode; + uint32_t uid, pid; + sd_bus *bus; + unsigned c = 0; + int r; + + assert(a >= 0); + assert(a < _ACTION_MAX); + + if (arg_check_inhibitors == 0 || arg_force > 0) + return 0; + + if (arg_when > 0) + return 0; + + /* A negative arg_check_inhibitors skips the check for root and when not on a TTY. */ if (arg_check_inhibitors < 0) { + if (geteuid() == 0) + return 0; + + if (!on_tty()) + return 0; + } + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return 0; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_login_mgr, "ListInhibitors", NULL, &reply, NULL); + if (r < 0) + /* If logind is not around, then there are no inhibitors... */ + return 0; + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssuu)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(reply, "(ssssuu)", &what, &who, &why, &mode, &uid, &pid)) > 0) { + _cleanup_free_ char *comm = NULL, *user = NULL; + _cleanup_strv_free_ char **sv = NULL; + + /* Only inhibitors in "block" mode are considered. */ if (!streq(mode, "block")) + continue; + + sv = strv_split(what, ":"); + if (!sv) + return log_oom(); + + if (!pid_is_valid((pid_t) pid)) + return log_error_errno(SYNTHETIC_ERRNO(ERANGE), "Invalid PID "PID_FMT".", (pid_t) pid); + + /* Halt, poweroff, reboot and kexec are matched against "shutdown" inhibitors, every other action against "sleep" inhibitors. */ if (!strv_contains(sv, + IN_SET(a, + ACTION_HALT, + ACTION_POWEROFF, + ACTION_REBOOT, + ACTION_KEXEC) ? 
"shutdown" : "sleep")) + continue; + + (void) pid_get_comm(pid, &comm); + user = uid_to_name(uid); + + log_warning("Operation inhibited by \"%s\" (PID "PID_FMT" \"%s\", user %s), reason is \"%s\".", + who, (pid_t) pid, strna(comm), strna(user), why); + + c++; + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + /* Check for current sessions */ + sd_get_sessions(&sessions); + STRV_FOREACH(s, sessions) { + _cleanup_free_ char *type = NULL, *tty = NULL, *seat = NULL, *user = NULL, *service = NULL, *class = NULL; + + /* Skip sessions whose UID cannot be determined and sessions of the calling user. */ if (sd_session_get_uid(*s, &uid) < 0 || uid == getuid()) + continue; + + if (sd_session_get_class(*s, &class) < 0 || !streq(class, "user")) + continue; + + if (sd_session_get_type(*s, &type) < 0 || !STR_IN_SET(type, "x11", "wayland", "tty", "mir")) + continue; + + sd_session_get_tty(*s, &tty); + sd_session_get_seat(*s, &seat); + sd_session_get_service(*s, &service); + user = uid_to_name(uid); + + /* Name the session by its TTY, else its seat, else its service. */ log_warning("User %s is logged in on %s.", strna(user), isempty(tty) ? (isempty(seat) ? 
strna(service) : seat) : tty); + c++; + } + + if (c <= 0) + return 0; + + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Please retry operation after closing inhibitors and logging out other users.\n" + "Alternatively, ignore inhibitors and users with 'systemctl %s -i'.", + action_table[a].verb); +#else + return 0; +#endif +} + +/* If arg_firmware_setup is set, calls logind's SetRebootToFirmwareSetup method with true. Returns 0 on success or when the option is not set, -ENOSYS when built without logind support, or another negative errno-style value. */ int prepare_firmware_setup(void) { + + if (!arg_firmware_setup) + return 0; + +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_login_mgr, "SetRebootToFirmwareSetup", &error, NULL, "b", true); + if (r < 0) + return log_error_errno(r, "Cannot indicate to EFI to boot into setup mode: %s", bus_error_message(&error, r)); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Booting into firmware setup not supported."); +#endif +} + +/* Unless arg_boot_loader_menu is USEC_INFINITY, passes it to logind's SetRebootToBootLoaderMenu method. Returns 0 on success or when nothing had to be done, -ENOSYS when built without logind support, or another negative errno-style value. */ int prepare_boot_loader_menu(void) { + + if (arg_boot_loader_menu == USEC_INFINITY) + return 0; + +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_login_mgr, "SetRebootToBootLoaderMenu", &error, NULL, "t", arg_boot_loader_menu); + if (r < 0) + return log_error_errno(r, "Cannot indicate to boot loader to enter boot loader entry menu: %s", bus_error_message(&error, r)); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Booting into boot loader menu not supported."); +#endif +} + +/* If arg_boot_loader_entry is set, passes it to logind's SetRebootToBootLoaderEntry method. Returns 0 on success or when the option is not set, -ENOSYS when built without logind support, or another negative errno-style value. */ int prepare_boot_loader_entry(void) { + + if (!arg_boot_loader_entry) + return 0; + +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_login_mgr, "SetRebootToBootLoaderEntry", &error, NULL, "s", arg_boot_loader_entry); + if (r < 
0) + return log_error_errno(r, "Cannot set boot into loader entry '%s': %s", arg_boot_loader_entry, bus_error_message(&error, r)); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Booting into boot loader entry not supported."); +#endif +} + +/* Asks logind (ScheduleShutdown) to run the verb associated with action a at time arg_when, after setting the wall message. Unless arg_quiet is set the resulting schedule is shown. Returns 0 on success, -EINVAL if the action has no verb, -ENOSYS when built without logind support, or another negative errno-style value. */ int logind_schedule_shutdown(enum action a) { +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *action; + sd_bus *bus; + int r; + + assert(a >= 0); + assert(a < _ACTION_MAX); + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + action = action_table[a].verb; + if (!action) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Scheduling not supported for this action."); + + /* In dry-run mode the action string sent to logind is prefixed with "dry-". */ if (arg_dry_run) + action = strjoina("dry-", action); + + (void) logind_set_wall_message(bus); + + r = bus_call_method(bus, bus_login_mgr, "ScheduleShutdown", &error, NULL, "st", action, arg_when); + if (r < 0) + return log_warning_errno(r, "Failed to schedule shutdown: %s", bus_error_message(&error, r)); + + if (!arg_quiet) + logind_show_shutdown(); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Cannot schedule shutdown without logind support, proceeding with immediate shutdown."); +#endif +} + +/* Sets the wall message and calls logind's CancelScheduledShutdown method. Returns 0 on success, -ENOSYS when built without logind support, or another negative errno-style value. */ int logind_cancel_shutdown(void) { +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + (void) logind_set_wall_message(bus); + + r = bus_call_method(bus, bus_login_mgr, "CancelScheduledShutdown", &error, NULL, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to talk to logind, shutdown hasn't been cancelled: %s", bus_error_message(&error, r)); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Not compiled with logind support, cannot cancel scheduled shutdowns."); +#endif +} + +/* Reads logind's ScheduledShutdown property and logs which action is scheduled for which time and how to cancel it. Returns 0 on success, -ENODATA if nothing is scheduled, -ENOSYS when built without logind support, or another negative errno-style value. */ int logind_show_shutdown(void) { +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = 
SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + sd_bus *bus; + const char *action, *pretty_action; + uint64_t elapse; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = bus_get_property(bus, bus_login_mgr, "ScheduledShutdown", &error, &reply, "(st)"); + if (r < 0) + return log_error_errno(r, "Failed to query scheduled shutdown: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "(st)", &action, &elapse); + if (r < 0) + return r; + + if (isempty(action)) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), "No scheduled shutdown."); + + if (STR_IN_SET(action, "halt", "poweroff", "exit")) + pretty_action = "Shutdown"; + else if (streq(action, "kexec")) + pretty_action = "Reboot via kexec"; + else if (streq(action, "reboot")) + pretty_action = "Reboot"; + else /* If we don't recognize the action string, we'll show it as-is */ + pretty_action = action; + + if (arg_action == ACTION_SYSTEMCTL) + log_info("%s scheduled for %s, use 'systemctl %s --when=cancel' to cancel.", + pretty_action, + FORMAT_TIMESTAMP_STYLE(elapse, arg_timestamp_style), + action); + else + log_info("%s scheduled for %s, use 'shutdown -c' to cancel.", + pretty_action, + FORMAT_TIMESTAMP_STYLE(elapse, arg_timestamp_style)); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Not compiled with logind support, cannot show scheduled shutdowns."); +#endif +} + +int help_boot_loader_entry(void) { +#if ENABLE_LOGIND + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_strv_free_ char **l = NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + r = bus_get_property_strv(bus, bus_login_mgr, "BootLoaderEntries", &error, &l); + if (r < 0) + return log_error_errno(r, "Failed to enumerate boot loader entries: %s", bus_error_message(&error, r)); + + if (strv_isempty(l)) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), "No boot loader 
entries discovered."); + + STRV_FOREACH(i, l) + puts(*i); + + return 0; +#else + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "Not compiled with logind support, cannot display boot loader entries."); +#endif +} diff --git a/src/systemctl/systemctl-logind.h b/src/systemctl/systemctl-logind.h new file mode 100644 index 0000000..516f749 --- /dev/null +++ b/src/systemctl/systemctl-logind.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "systemctl.h" + +int logind_reboot(enum action a); +int logind_check_inhibitors(enum action a); + +int prepare_firmware_setup(void); +int prepare_boot_loader_menu(void); +int prepare_boot_loader_entry(void); + +int logind_schedule_shutdown(enum action a); +int logind_cancel_shutdown(void); +int logind_show_shutdown(void); + +int help_boot_loader_entry(void); diff --git a/src/systemctl/systemctl-mount.c b/src/systemctl/systemctl-mount.c new file mode 100644 index 0000000..d9ad332 --- /dev/null +++ b/src/systemctl/systemctl-mount.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "dissect-image.h" +#include "systemctl-mount.h" +#include "systemctl-util.h" +#include "systemctl.h" + +/* Verb handler that calls the manager's BindMountUnit method with the mangled unit name from argv[1], argv[2] and argv[3] as the two path arguments, and the arg_read_only and arg_mkdir settings. Returns 0 on success or a negative errno-style value. */ int verb_bind(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *n = NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + r = unit_name_mangle(argv[1], arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, &n); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + r = bus_call_method( + bus, + bus_systemd_mgr, + "BindMountUnit", + &error, + NULL, + "sssbb", + n, + argv[2], + argv[3], + arg_read_only, + arg_mkdir); + if (r < 0) + return log_error_errno(r, "Failed to bind mount: %s", bus_error_message(&error, r)); + + return 0; +} + +/* Verb handler that calls the manager's MountImageUnit method with the mangled unit name from argv[1], the source argv[2], the destination argv[3], the arg_read_only and arg_mkdir settings, and an array of (partition, options) pairs built from the optional argv[4]. Returns 0 on success or a negative errno-style value. */ int verb_mount_image(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *unit = argv[1], *src = argv[2], *dest = argv[3]; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *n = NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + r = unit_name_mangle(unit, arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN, &n); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + r = bus_message_new_method_call( + bus, + &m, + bus_systemd_mgr, + "MountImageUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "sssbb", + n, + src, + dest, + arg_read_only, + arg_mkdir); + if (r < 0) + return bus_log_create_error(r); + + /* The options array is always opened and closed, so it is sent empty when argv[4] is absent. */ r = sd_bus_message_open_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + if (argc > 4) { + _cleanup_free_ char *partition = NULL, *mount_options = NULL; + const char *options = argv[4]; + + /* Split argv[4] at ":" into up to two words. NOTE(review): the failure path below returns without a log message of its own - confirm whether that is intended. */ r = extract_many_words(&options, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + if (r < 0) + return r; + /* Single set of options, applying to the root partition/single filesystem */ + if (r == 1) { + r = sd_bus_message_append(m, "(ss)", "root", partition); + if (r < 0) + return bus_log_create_error(r); + } else if (r > 1) { + if (partition_designator_from_string(partition) < 0) + return bus_log_create_error(-EINVAL); + + r = sd_bus_message_append(m, "(ss)", partition, mount_options); + if (r < 0) + 
return bus_log_create_error(r); + } + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, -1, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to mount image: %s", bus_error_message(&error, r)); + + return 0; +} diff --git a/src/systemctl/systemctl-mount.h b/src/systemctl/systemctl-mount.h new file mode 100644 index 0000000..b2d0750 --- /dev/null +++ b/src/systemctl/systemctl-mount.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_bind(int argc, char *argv[], void *userdata); +int verb_mount_image(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-preset-all.c b/src/systemctl/systemctl-preset-all.c new file mode 100644 index 0000000..b55f8e3 --- /dev/null +++ b/src/systemctl/systemctl-preset-all.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl-daemon-reload.h" +#include "systemctl-preset-all.h" +#include "systemctl-util.h" +#include "systemctl.h" + +int verb_preset_all(int argc, char *argv[], void *userdata) { + int r; + + if (install_client_side()) { + InstallChange *changes = NULL; + size_t n_changes = 0; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + r = unit_file_preset_all(arg_runtime_scope, unit_file_flags_from_args(), arg_root, arg_preset_mode, &changes, &n_changes); + install_changes_dump(r, "preset", changes, n_changes, arg_quiet); + + if (r > 0) + r = 0; + } else { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + sd_bus *bus; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + r = bus_call_method( + bus, + bus_systemd_mgr, + "PresetAllUnitFiles", + &error, + &reply, + "sbb", + unit_file_preset_mode_to_string(arg_preset_mode), + arg_runtime, + arg_force); + if 
(r < 0) + return log_error_errno(r, "Failed to preset all units: %s", bus_error_message(&error, r)); + + r = bus_deserialize_and_dump_unit_file_changes(reply, arg_quiet); + if (r < 0) + return r; + + /* Reload the manager afterwards unless arg_no_reload is set. */ if (!arg_no_reload) { + r = daemon_reload(ACTION_RELOAD, /* graceful= */ false); + if (r < 0) + return r; + } + } + + return 0; +} diff --git a/src/systemctl/systemctl-preset-all.h b/src/systemctl/systemctl-preset-all.h new file mode 100644 index 0000000..4631e7e --- /dev/null +++ b/src/systemctl/systemctl-preset-all.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_preset_all(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-reset-failed.c b/src/systemctl/systemctl-reset-failed.c new file mode 100644 index 0000000..1ca0533 --- /dev/null +++ b/src/systemctl/systemctl-reset-failed.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl-reset-failed.h" +#include "systemctl-trivial-method.h" +#include "systemctl-util.h" +#include "systemctl.h" + +/* "reset-failed" verb: without arguments delegates to verb_trivial_method(); otherwise expands the given names and calls the manager's ResetFailedUnit method for each of them. All names are processed even if some calls fail; the first error encountered is returned, 0 if there was none. */ int verb_reset_failed(int argc, char *argv[], void *userdata) { + _cleanup_strv_free_ char **names = NULL; + sd_bus *bus; + int r, q; + + if (argc <= 1) /* Shortcut to trivial_method() if no argument is given */ + return verb_trivial_method(argc, argv, userdata); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + r = expand_unit_names(bus, strv_skip(argv, 1), NULL, &names, NULL); + if (r < 0) + return log_error_errno(r, "Failed to expand names: %m"); + + STRV_FOREACH(name, names) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + q = bus_call_method(bus, bus_systemd_mgr, "ResetFailedUnit", &error, NULL, "s", *name); + if (q < 0) { + log_error_errno(q, "Failed to reset failed state of unit %s: %s", *name, bus_error_message(&error, q)); + /* Remember only the first failure. */ if (r == 0) + r = q; + } + } + + return r; +} diff --git 
a/src/systemctl/systemctl-reset-failed.h b/src/systemctl/systemctl-reset-failed.h new file mode 100644 index 0000000..5da0659 --- /dev/null +++ b/src/systemctl/systemctl-reset-failed.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_reset_failed(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-service-watchdogs.c b/src/systemctl/systemctl-service-watchdogs.c new file mode 100644 index 0000000..620f46a --- /dev/null +++ b/src/systemctl/systemctl-service-watchdogs.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "parse-util.h" +#include "systemctl-service-watchdogs.h" +#include "systemctl-util.h" +#include "systemctl.h" + +/* "service-watchdogs" verb: without an argument prints the manager's ServiceWatchdogs property as yes/no; with one argument parses it as a boolean and sets the property to that value. Returns 0 on success or a negative errno-style value. */ int verb_service_watchdogs(int argc, char *argv[], void *userdata) { + sd_bus *bus; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int b, r; + + assert(argv); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + if (argc == 1) { + /* get ServiceWatchdogs */ + r = bus_get_property_trivial(bus, bus_systemd_mgr, "ServiceWatchdogs", &error, 'b', &b); + if (r < 0) + return log_error_errno(r, "Failed to get service-watchdog state: %s", bus_error_message(&error, r)); + + printf("%s\n", yes_no(!!b)); + + } else { + /* set ServiceWatchdogs */ + assert(argc == 2); + + b = parse_boolean(argv[1]); + if (b < 0) + return log_error_errno(b, "Failed to parse service-watchdogs argument: %m"); + + r = bus_set_property(bus, bus_systemd_mgr, "ServiceWatchdogs", &error, "b", b); + if (r < 0) + return log_error_errno(r, "Failed to set service-watchdog state: %s", bus_error_message(&error, r)); + } + + return 0; +} diff --git a/src/systemctl/systemctl-service-watchdogs.h b/src/systemctl/systemctl-service-watchdogs.h new file mode 100644 index 0000000..2f59f5a --- /dev/null +++ b/src/systemctl/systemctl-service-watchdogs.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ +#pragma once + +int verb_service_watchdogs(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-set-default.c b/src/systemctl/systemctl-set-default.c new file mode 100644 index 0000000..58c2bc3 --- /dev/null +++ b/src/systemctl/systemctl-set-default.c @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "proc-cmdline.h" +#include "systemctl-daemon-reload.h" +#include "systemctl-set-default.h" +#include "systemctl-util.h" +#include "systemctl.h" + +/* Callback passed to proc_cmdline_parse(): data is a char** that receives a copy of the kernel command line key that selects a unit - either "systemd.unit" carrying a valid unit name, or a value-less word for which runlevel_to_target() returns a target. Note that the key, not the value, is what gets stored. */ static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + char **ret = data; + + if (streq(key, "systemd.unit")) { + if (proc_cmdline_value_missing(key, value)) + return 0; + if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) { + log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value); + return 0; + } + + return free_and_strdup_warn(ret, key); + + } else if (!value) { + if (runlevel_to_target(key)) + return free_and_strdup_warn(ret, key); + } + + return 0; +} + +/* Unless arg_quiet or arg_root is set, parses the kernel command line and logs a notice if it carries a switch that overrides the default unit. A parse failure is only logged at debug level. */ static void emit_cmdline_warning(void) { + if (arg_quiet || arg_root) + /* don't bother checking the command line if we're operating on a container */ + return; + + _cleanup_free_ char *override = NULL; + int r; + + r = proc_cmdline_parse(parse_proc_cmdline_item, &override, 0); + if (r < 0) + log_debug_errno(r, "Failed to parse kernel command line, ignoring: %m"); + if (override) + log_notice("Note: found \"%s\" on the kernel command line, which overrides the default unit.", + override); +} + +/* Stores the name of the default target in *ret_name, obtained through unit_file_get_default() when operating client side and through the manager's GetDefaultTarget method otherwise. Returns a negative errno-style value on failure. */ static int determine_default(char **ret_name) { + int r; + + if (install_client_side()) { + r = unit_file_get_default(arg_runtime_scope, arg_root, ret_name); + /* -ERFKILL gets a dedicated message saying that the unit file is masked. */ if (r == -ERFKILL) + return log_error_errno(r, "Failed to get default target: Unit file is masked."); + if (r < 0) + return log_error_errno(r, "Failed to get default target: %m"); + return 0; + + } else { + sd_bus *bus; + 
_cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *name; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_systemd_mgr, "GetDefaultTarget", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to get default target: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &name); + if (r < 0) + return bus_log_parse_error(r); + + return free_and_strdup_warn(ret_name, name); + } +} + +int verb_get_default(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *name = NULL; + int r; + + r = determine_default(&name); + if (r < 0) + return r; + + printf("%s\n", name); + + emit_cmdline_warning(); + + return 0; +} + +int verb_set_default(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *unit = NULL; + int r; + + assert(argc >= 2); + assert(argv); + + r = unit_name_mangle_with_suffix(argv[1], "set-default", + arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, + ".target", &unit); + if (r < 0) + return log_error_errno(r, "Failed to mangle unit name: %m"); + + if (install_client_side()) { + InstallChange *changes = NULL; + size_t n_changes = 0; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); + + r = unit_file_set_default(arg_runtime_scope, UNIT_FILE_FORCE, arg_root, unit, &changes, &n_changes); + install_changes_dump(r, "set default", changes, n_changes, arg_quiet); + if (r < 0) + return r; + } else { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + sd_bus *bus; + + polkit_agent_open_maybe(); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_systemd_mgr, "SetDefaultTarget", &error, &reply, "sb", unit, 1); + if (r < 0) + return log_error_errno(r, "Failed to set default target: %s", bus_error_message(&error, r)); + + r = bus_deserialize_and_dump_unit_file_changes(reply, arg_quiet); + if (r < 0) + return r; + + /* Try to reload if enabled */ + if (!arg_no_reload) { + r = daemon_reload(ACTION_RELOAD, /* graceful= */ false); + if (r < 0) + return r; + } + } + + emit_cmdline_warning(); + + if (!arg_quiet) { + _cleanup_free_ char *final = NULL; + + r = determine_default(&final); + if (r < 0) + return r; + + if (!streq(final, unit)) + log_notice("Note: \"%s\" is the default unit (possibly a runtime override).", final); + } + + return 0; +} diff --git a/src/systemctl/systemctl-set-default.h b/src/systemctl/systemctl-set-default.h new file mode 100644 index 0000000..7873e12 --- /dev/null +++ b/src/systemctl/systemctl-set-default.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_get_default(int argc, char *argv[], void *userdata); +int verb_set_default(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-set-environment.c b/src/systemctl/systemctl-set-environment.c new file mode 100644 index 
0000000..55d1160
--- /dev/null
+++ b/src/systemctl/systemctl-set-environment.c
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "env-util.h"
+#include "escape.h"
+#include "systemctl-set-environment.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+
+static int json_transform_message(sd_bus_message *m, JsonVariant **ret) { /* Reads "NAME=VALUE" strings from m and stores in *ret a JSON variant with one string field per NAME. Returns 0 on success, negative on failure. */
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        const char *text;
+        int r;
+
+        assert(m);
+        assert(ret);
+
+        while ((r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &text)) > 0) {
+                _cleanup_free_ char *n = NULL;
+                const char *sep;
+
+                sep = strchr(text, '=');
+                if (!sep)
+                        return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                               "Invalid environment block");
+
+                n = strndup(text, sep - text); /* n is the name part in front of '=' */
+                if (!n)
+                        return log_oom();
+
+                sep++; /* step over '=' so sep points at the value */
+
+                r = json_variant_set_field_string(&v, n, sep);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set JSON field '%s' to '%s': %m", n, sep);
+        }
+        if (r < 0) /* the loop stopped on a read error rather than on r == 0 */
+                return bus_log_parse_error(r);
+
+        *ret = TAKE_PTR(v);
+        return 0;
+}
+
+static int print_variable(const char *s) { /* Prints one "NAME=VALUE" assignment to stdout, with VALUE passed through shell_maybe_quote() */
+        const char *sep;
+        _cleanup_free_ char *esc = NULL;
+
+        sep = strchr(s, '=');
+        if (!sep)
+                return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                       "Invalid environment block");
+
+        esc = shell_maybe_quote(sep + 1, SHELL_ESCAPE_POSIX);
+        if (!esc)
+                return log_oom();
+
+        printf("%.*s=%s\n", (int)(sep-s), s, esc); /* %.*s limits output to the name part in front of '=' */
+        return 0;
+}
+
+int verb_show_environment(int argc, char *argv[], void *userdata) { /* Verb handler: fetches the manager's "Environment" string array and prints it, as JSON when arg_output is a JSON mode and as one assignment per line otherwise */
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        const char *text;
+        sd_bus *bus;
+        int r;
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        pager_open(arg_pager_flags);
+
+        r = bus_get_property(bus, bus_systemd_mgr, "Environment", &error, &reply, "as");
+        if (r < 0)
+                return log_error_errno(r, "Failed to get environment: %s", bus_error_message(&error, r));
+
+        r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "s");
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        if (OUTPUT_MODE_IS_JSON(arg_output)) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+                r = json_transform_message(reply, &v);
+                if (r < 0)
+                        return r;
+
+                json_variant_dump(v, output_mode_to_json_format_flags(arg_output), stdout, NULL);
+        } else {
+                while ((r = sd_bus_message_read_basic(reply, SD_BUS_TYPE_STRING, &text)) > 0) {
+                        r = print_variable(text);
+                        if (r < 0)
+                                return r;
+                }
+                if (r < 0)
+                        return bus_log_parse_error(r);
+        }
+
+        r = sd_bus_message_exit_container(reply);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        return 0;
+}
+
+static void invalid_callback(const char *p, void *userdata) { /* Callback passed to strv_env_clean_with_callback(): logs the entry p at debug level, C-escaped when cescape() succeeds */
+        _cleanup_free_ char *t = cescape(p);
+
+        log_debug("Ignoring invalid environment assignment \"%s\".", strnull(t));
+}
+
+int verb_set_environment(int argc, char *argv[], void *userdata) { /* Verb handler shared by two verbs: argv[0] selects the SetEnvironment or UnsetEnvironment manager method, and the remaining arguments are sent as a string array */
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        const char *method;
+        sd_bus *bus;
+        int r;
+
+        assert(argc > 1);
+        assert(argv);
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        polkit_agent_open_maybe();
+
+        method = streq(argv[0], "set-environment")
+                ? "SetEnvironment"
+                : "UnsetEnvironment";
+
+        r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, method);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_strv(m, strv_skip(argv, 1));
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, m, 0, &error, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set environment: %s", bus_error_message(&error, r)); /* NOTE(review): the same "set" wording is logged when method is UnsetEnvironment */
+
+        return 0;
+}
+
+int verb_import_environment(int argc, char *argv[], void *userdata) { /* Verb handler: sends variables from this process's environ to the manager via SetEnvironment - all of them when no names are given (logged as deprecated), otherwise only the named ones that are set */
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        sd_bus *bus;
+        int r;
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        polkit_agent_open_maybe();
+
+        r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetEnvironment");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        if (argc < 2) {
+                log_warning("Calling import-environment without a list of variable names is deprecated.");
+
+                _cleanup_strv_free_ char **copy = strv_copy(environ);
+                if (!copy)
+                        return log_oom();
+
+                strv_env_clean_with_callback(copy, invalid_callback, NULL);
+
+                STRV_FOREACH(e, copy)
+                        if (string_has_cc(*e, NULL))
+                                log_notice("Environment variable $%.*s contains control characters, importing anyway.",
+                                           (int) strcspn(*e, "="), *e);
+
+                r = sd_bus_message_append_strv(m, copy); /* result is checked after the if/else below */
+
+        } else {
+                r = sd_bus_message_open_container(m, 'a', "s");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                STRV_FOREACH(a, strv_skip(argv, 1)) {
+
+                        if (!env_name_is_valid(*a))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Not a valid environment variable name: %s", *a);
+
+                        bool found = false;
+                        STRV_FOREACH(b, environ) {
+                                const char *eq;
+
+                                eq = startswith(*b, *a);
+                                if (eq && *eq == '=') { /* the name must be followed directly by '=' so that a longer name sharing the prefix does not match */
+                                        if (string_has_cc(eq + 1, NULL))
+                                                log_notice("Environment variable $%.*s contains control characters, importing anyway.",
+                                                           (int) (eq - *b), *b);
+
+                                        r = sd_bus_message_append(m, "s", *b); /* the whole NAME=VALUE entry is appended */
+                                        if (r < 0)
+                                                return bus_log_create_error(r);
+
+                                        found = true;
+                                        break;
+                                }
+                        }
+
+                        if (!found)
+                                log_notice("Environment variable $%s not set, ignoring.", *a);
+                }
+
+                r = sd_bus_message_close_container(m);
+        }
+        if (r < 0) /* covers the append_strv() result of the first branch and close_container() of the second */
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, m, 0, &error, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to import environment: %s", bus_error_message(&error, r));
+
+        return 0;
+}
diff --git a/src/systemctl/systemctl-set-environment.h b/src/systemctl/systemctl-set-environment.h
new file mode 100644
index 0000000..404258a
--- /dev/null
+++ b/src/systemctl/systemctl-set-environment.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_show_environment(int argc, char *argv[], void *userdata);
+int verb_set_environment(int argc, char *argv[], void *userdata);
+int verb_import_environment(int argc, char *argv[], void *userdata);
diff --git a/src/systemctl/systemctl-set-property.c b/src/systemctl/systemctl-set-property.c
new file mode 100644
index 0000000..5f4b810
--- /dev/null
+++ b/src/systemctl/systemctl-set-property.c
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "systemctl-set-property.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+
+static int set_property_one(sd_bus *bus, const char *name, char **properties) { /* Calls the manager's SetUnitProperties method for the unit name, sending arg_runtime and the assignments in properties. Returns 0 on success, negative on failure. */
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        int r;
+
+        r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        UnitType t = unit_name_to_type(name); /* passed to bus_append_unit_property_assignment_many() below */
+        if (t < 0)
+                return log_error_errno(t, "Invalid unit type: %s", name);
+
+        r = sd_bus_message_append(m, "sb", name, arg_runtime);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_ARRAY, "(sv)");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = bus_append_unit_property_assignment_many(m, t, properties);
+        if (r < 0) /* returned as is, without an additional log call here */
+                return r;
+
+        r = sd_bus_message_close_container(m);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_call(bus, m, 0, &error, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set unit properties on %s: %s",
+                                       name, bus_error_message(&error, r));
+
+        return 0;
+}
+
+int verb_set_property(int argc, char *argv[], void *userdata) { /* Verb handler: expands argv[1] into unit names and applies the assignments starting at argv[2] to each of them */
+        sd_bus *bus;
+        _cleanup_strv_free_ char **names = NULL;
+        int r;
+
+        r = acquire_bus(BUS_MANAGER, &bus);
+        if (r < 0)
+                return r;
+
+        polkit_agent_open_maybe();
+
+        r = expand_unit_names(bus, STRV_MAKE(argv[1]), NULL, &names, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to expand '%s' into names: %m", argv[1]);
+
+        r = 0;
+        STRV_FOREACH(name, names)
+                RET_GATHER(r, set_property_one(bus, *name, strv_skip(argv, 2))); /* every name is attempted; NOTE(review): presumably RET_GATHER keeps the first error in r - confirm */
+        return r;
+}
diff --git a/src/systemctl/systemctl-set-property.h b/src/systemctl/systemctl-set-property.h
new file mode 100644
index 0000000..0892291
--- /dev/null
+++ b/src/systemctl/systemctl-set-property.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int verb_set_property(int argc, char *argv[], void *userdata);
diff --git a/src/systemctl/systemctl-show.c b/src/systemctl/systemctl-show.c
new file mode 100644
index 0000000..e7fabcf
--- /dev/null
+++ b/src/systemctl/systemctl-show.c
@@ -0,0 +1,2503 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include /* NOTE(review): the header name is missing here - it looks lost in extraction; verify against the upstream file */
+
+#include "af-list.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-map-properties.h"
+#include "bus-print-properties.h"
+#include "bus-unit-procs.h"
+#include "cgroup-show.h"
+#include "cpu-set-util.h"
+#include "errno-util.h"
+#include "exec-util.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "hexdecoct.h"
+#include "hostname-util.h"
+#include "in-addr-util.h"
+#include "ip-protocol-list.h"
+#include "journal-file.h"
+#include "list.h"
+#include "locale-util.h"
+#include "memory-util.h"
+#include "numa-util.h"
+#include "open-file.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "signal-util.h"
+#include "sort-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "systemctl-list-machines.h"
+#include "systemctl-list-units.h"
+#include "systemctl-show.h"
+#include "systemctl-sysv-compat.h"
+#include "systemctl-util.h"
+#include "systemctl.h"
+#include "terminal-util.h"
+#include "utf8.h"
+
+static OutputFlags get_output_flags(void) { /* Assembles an OutputFlags value from global settings: each flag is multiplied by a condition (expected to evaluate to 0 or 1) so it is included only when that condition holds */
+        return
+                FLAGS_SET(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) * OUTPUT_SHOW_ALL |
+                (arg_full || !on_tty() || pager_have()) * OUTPUT_FULL_WIDTH |
+                colors_enabled() * OUTPUT_COLOR |
+                !arg_quiet * OUTPUT_WARN_CUTOFF;
+}
+
+typedef struct ExecStatusInfo { /* One exec command entry as read by exec_status_info_deserialize(); released with exec_status_info_free() */
+        char *name; /* not filled in by exec_status_info_deserialize(), but freed by exec_status_info_free() */
+
+        char *path;
+        char **argv;
+
+        bool ignore; /* from the 'b' field, or derived from EXEC_COMMAND_IGNORE_FAILURE in flags for the Ex variant */
+
+        usec_t start_timestamp;
+        usec_t exit_timestamp;
+        pid_t pid;
+        int code;
+        int status;
+
+        ExecCommandFlags flags; /* only assigned when the Ex variant is deserialized */
+
+        LIST_FIELDS(struct ExecStatusInfo, exec_status_info_list);
+} ExecStatusInfo;
+
+static void exec_status_info_free(ExecStatusInfo *i) { /* Frees i together with its name, path and argv; i must not be NULL */
+        assert(i);
+
+        free(i->name);
+        free(i->path);
+        strv_free(i->argv);
+        free(i);
+}
+
+static int exec_status_info_deserialize(sd_bus_message *m, ExecStatusInfo *i, bool is_ex_prop) { /* Reads one struct from m into i. is_ex_prop selects the "sasasttttuii" layout (options as string array) instead of "sasbttttuii" (ignore flag as boolean). Returns 1 if a struct was read, 0 if entering the container yielded 0, negative on error. */
+        _cleanup_strv_free_ char **ex_opts = NULL;
+        uint64_t start_timestamp, exit_timestamp, start_timestamp_monotonic, exit_timestamp_monotonic;
+        const char *path;
+        uint32_t pid;
+        int32_t code, status;
+        int ignore, r;
+
+        assert(m);
+        assert(i);
+
+        r = sd_bus_message_enter_container(m, SD_BUS_TYPE_STRUCT, is_ex_prop ? "sasasttttuii" : "sasbttttuii");
+        if (r < 0)
+                return bus_log_parse_error(r);
+        if (r == 0)
+                return 0;
+
+        r = sd_bus_message_read(m, "s", &path);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        i->path = strdup(path);
+        if (!i->path)
+                return log_oom();
+
+        r = sd_bus_message_read_strv(m, &i->argv);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        r = is_ex_prop ? sd_bus_message_read_strv(m, &ex_opts) : sd_bus_message_read(m, "b", &ignore); /* third field differs between the two layouts; ignore is only written in the non-Ex case */
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        r = sd_bus_message_read(m,
+                                "ttttuii",
+                                &start_timestamp, &start_timestamp_monotonic,
+                                &exit_timestamp, &exit_timestamp_monotonic,
+                                &pid,
+                                &code, &status); /* the two monotonic timestamps are read but not stored in i */
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        if (is_ex_prop) {
+                r = exec_command_flags_from_strv(ex_opts, &i->flags);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to convert strv to ExecCommandFlags: %m");
+
+                i->ignore = FLAGS_SET(i->flags, EXEC_COMMAND_IGNORE_FAILURE);
+        } else
+                i->ignore = ignore;
+
+        i->start_timestamp = (usec_t) start_timestamp;
+        i->exit_timestamp = (usec_t) exit_timestamp;
+        i->pid = (pid_t) pid;
+        i->code = code;
+        i->status = status;
+
+        r = sd_bus_message_exit_container(m);
+        if (r < 0)
+                return bus_log_parse_error(r);
+
+        return 1;
+}
+
+typedef struct UnitCondition { /* One condition entry, linked into UnitStatusInfo.conditions */
+        char *name;
+        char *param;
+        bool trigger;
+        bool negate;
+        int tristate; /* print_status_info() reports entries with tristate < 0 as "was not met" */
+
+        LIST_FIELDS(struct UnitCondition, conditions);
+} UnitCondition;
+
+static UnitCondition* unit_condition_free(UnitCondition *c) { /* NULL-safe destructor: frees name, param and c itself, returning the result of mfree(c) */
+        if (!c)
+                return NULL;
+
+        free(c->name);
+        free(c->param);
+        return mfree(c);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(UnitCondition*, unit_condition_free);
+
+typedef struct UnitStatusInfo { /* Per-unit values consumed by print_status_info(); the members released by unit_status_info_done() are the char** arrays and the two lists */
+        const char *id;
+        const char *load_state;
+        const char *active_state;
+        const char *freezer_state;
+        const char *sub_state;
+        const char *unit_file_state;
+        const char *unit_file_preset;
+
+        const char *description;
+        const char *following;
+
+        char **documentation;
+
+        const char *fragment_path;
+        const char *source_path;
+        const char *control_group;
+
+        char **dropin_paths;
+
+        char **triggered_by;
+        char **triggers;
+
+        const char *load_error;
+        const char *result;
+
+        usec_t inactive_exit_timestamp;
+        usec_t inactive_exit_timestamp_monotonic;
+        usec_t active_enter_timestamp;
+        usec_t active_exit_timestamp;
+        usec_t inactive_enter_timestamp;
+
+        uint64_t runtime_max_sec;
+
+        bool need_daemon_reload;
+        bool transient;
+
+        /* Service */
+        pid_t main_pid;
+        pid_t control_pid;
+        const char *status_text;
+        const char *pid_file;
+        bool running:1;
+        int status_errno;
+
+        uint32_t fd_store_max;
+        uint32_t n_fd_store;
+
+        usec_t start_timestamp;
+        usec_t exit_timestamp;
+
+        int exit_code, exit_status;
+
+        const char *log_namespace;
+
+        usec_t condition_timestamp;
+        bool condition_result;
+        LIST_HEAD(UnitCondition, conditions);
+
+        usec_t assert_timestamp;
+        bool assert_result;
+        bool failed_assert_trigger;
+        bool failed_assert_negate;
+        const char *failed_assert;
+        const char *failed_assert_parameter;
+        usec_t next_elapse_real;
+        usec_t next_elapse_monotonic;
+
+        /* Socket */
+        unsigned n_accepted;
+        unsigned n_connections;
+        unsigned n_refused;
+        bool accept;
+
+        /* Pairs of type, path */
+        char **listen;
+
+        /* Device */
+        const char *sysfs_path;
+
+        /* Mount, Automount */
+        const char *where;
+
+        /* Swap */
+        const char *what;
+
+        /* CGroup */
+        uint64_t memory_current;
+        uint64_t memory_peak;
+        uint64_t memory_swap_current;
+        uint64_t memory_swap_peak;
+        uint64_t memory_zswap_current;
+        uint64_t memory_min;
+        uint64_t memory_low;
+        uint64_t startup_memory_low;
+        uint64_t memory_high;
+        uint64_t startup_memory_high;
+        uint64_t memory_max;
+        uint64_t startup_memory_max;
+        uint64_t memory_swap_max;
+        uint64_t startup_memory_swap_max;
+        uint64_t memory_zswap_max;
+        uint64_t startup_memory_zswap_max;
+        uint64_t memory_limit;
+        uint64_t memory_available;
+        uint64_t cpu_usage_nsec;
+        uint64_t tasks_current;
+        uint64_t tasks_max;
+        uint64_t ip_ingress_bytes;
+        uint64_t ip_egress_bytes;
+        uint64_t io_read_bytes;
+        uint64_t io_write_bytes;
+
+        uint64_t default_memory_min;
+        uint64_t default_memory_low;
+        uint64_t default_startup_memory_low;
+
+        LIST_HEAD(ExecStatusInfo, exec_status_info_list);
+} UnitStatusInfo;
+
+static void unit_status_info_done(UnitStatusInfo *info) { /* Releases the char** arrays and both lists of info; the const char* members and info itself are not freed here */
+        strv_free(info->documentation);
+        strv_free(info->dropin_paths);
+        strv_free(info->triggered_by);
+        strv_free(info->triggers);
+        strv_free(info->listen);
+
+        LIST_CLEAR(conditions, info->conditions, unit_condition_free);
+        LIST_CLEAR(exec_status_info_list, info->exec_status_info_list, exec_status_info_free);
+}
+
+static void format_active_state(const char *active_state, const char **active_on, const char **active_off) { /* Picks the on/off strings that wrap an active state: highlight red for "failed", highlight green for "active"/"reloading", empty strings otherwise. NOTE(review): unlike format_enable_state(), the out pointers are not asserted. */
+        if (streq_ptr(active_state, "failed")) {
+                *active_on = ansi_highlight_red();
+                *active_off = ansi_normal();
+        } else if (STRPTR_IN_SET(active_state, "active", "reloading")) {
+                *active_on = ansi_highlight_green();
+                *active_off = ansi_normal();
+        } else
+                *active_on = *active_off = "";
+}
+
+static void format_enable_state(const char *enable_state, const char **enable_on, const char **enable_off) { /* Picks the on/off strings that wrap an enablement state: highlight yellow for "disabled", highlight green for "enabled", empty strings otherwise */
+        assert(enable_on);
+        assert(enable_off);
+
+        if (streq_ptr(enable_state, "disabled")) {
+                *enable_on = ansi_highlight_yellow();
+                *enable_off = ansi_normal();
+        } else if (streq_ptr(enable_state, "enabled")) {
+                *enable_on = ansi_highlight_green();
+                *enable_off = ansi_normal();
+        } else
+                *enable_on = *enable_off = "";
+}
+
+static void print_status_info(
+                sd_bus *bus,
+                UnitStatusInfo *i,
+                bool *ellipsized) {
+
+        const char *active_on, *active_off, *on, *off, *ss, *fs;
+        const char *enable_on, *enable_off, *preset_on, *preset_off;
+        _cleanup_free_ char *formatted_path = NULL;
+        usec_t timestamp;
+        const char *path;
+        int r;
+
+        assert(i);
+
+        /* This shows pretty information about a unit.
See print_property() for a low-level property + * printer */ + + format_active_state(i->active_state, &active_on, &active_off); + format_enable_state(i->unit_file_state, &enable_on, &enable_off); + format_enable_state(i->unit_file_preset, &preset_on, &preset_off); + + const SpecialGlyph glyph = unit_active_state_to_glyph(unit_active_state_from_string(i->active_state)); + + printf("%s%s%s %s", active_on, special_glyph(glyph), active_off, strna(i->id)); + + if (i->description && !streq_ptr(i->id, i->description)) + printf(" - %s", i->description); + + printf("\n"); + + if (i->following) + printf(" Follows: unit currently follows state of %s\n", i->following); + + if (STRPTR_IN_SET(i->load_state, "error", "not-found", "bad-setting")) { + on = ansi_highlight_red(); + off = ansi_normal(); + } else + on = off = ""; + + path = i->source_path ?: i->fragment_path; + if (path && terminal_urlify_path(path, NULL, &formatted_path) >= 0) + path = formatted_path; + + if (!isempty(i->load_error)) + printf(" Loaded: %s%s%s (Reason: %s)\n", + on, strna(i->load_state), off, i->load_error); + else if (path && !isempty(i->unit_file_state)) { + bool show_preset = !isempty(i->unit_file_preset) && + show_preset_for_state(unit_file_state_from_string(i->unit_file_state)); + + printf(" Loaded: %s%s%s (%s; %s%s%s%s%s%s%s)\n", + on, strna(i->load_state), off, + path, + enable_on, i->unit_file_state, enable_off, + show_preset ? "; preset: " : "", + preset_on, show_preset ? i->unit_file_preset : "", preset_off); + + } else if (path) + printf(" Loaded: %s%s%s (%s)\n", + on, strna(i->load_state), off, path); + else + printf(" Loaded: %s%s%s\n", + on, strna(i->load_state), off); + + if (i->transient) + printf(" Transient: yes\n"); + + if (!strv_isempty(i->dropin_paths)) { + _cleanup_free_ char *dir = NULL; + bool last = false; + + STRV_FOREACH(dropin, i->dropin_paths) { + _cleanup_free_ char *dropin_formatted = NULL; + const char *df; + + if (!dir || last) { + printf(dir ? 
" " : + " Drop-In: "); + + dir = mfree(dir); + + r = path_extract_directory(*dropin, &dir); + if (r < 0) { + log_error_errno(r, "Failed to extract directory of '%s': %m", *dropin); + break; + } + + printf("%s\n" + " %s", dir, + special_glyph(SPECIAL_GLYPH_TREE_RIGHT)); + } + + last = ! (*(dropin + 1) && startswith(*(dropin + 1), dir)); + + if (terminal_urlify_path(*dropin, basename(*dropin), &dropin_formatted) >= 0) + df = dropin_formatted; + else + df = *dropin; + + printf("%s%s", df, last ? "\n" : ", "); + } + } + + ss = streq_ptr(i->active_state, i->sub_state) ? NULL : i->sub_state; + if (ss) + printf(" Active: %s%s (%s)%s", + active_on, strna(i->active_state), ss, active_off); + else + printf(" Active: %s%s%s", + active_on, strna(i->active_state), active_off); + + fs = !isempty(i->freezer_state) && !streq(i->freezer_state, "running") ? i->freezer_state : NULL; + if (fs) + printf(" %s(%s)%s", ansi_highlight_yellow(), fs, ansi_normal()); + + if (!isempty(i->result) && !streq(i->result, "success")) + printf(" (Result: %s)", i->result); + + timestamp = STRPTR_IN_SET(i->active_state, "active", "reloading") ? i->active_enter_timestamp : + STRPTR_IN_SET(i->active_state, "inactive", "failed") ? i->inactive_enter_timestamp : + STRPTR_IN_SET(i->active_state, "activating") ? 
i->inactive_exit_timestamp : + i->active_exit_timestamp; + + if (timestamp_is_set(timestamp)) { + printf(" since %s; %s\n", + FORMAT_TIMESTAMP_STYLE(timestamp, arg_timestamp_style), + FORMAT_TIMESTAMP_RELATIVE(timestamp)); + if (streq_ptr(i->active_state, "active") && i->runtime_max_sec < USEC_INFINITY) { + usec_t until_timestamp; + + until_timestamp = usec_add(timestamp, i->runtime_max_sec); + printf(" Until: %s; %s\n", + FORMAT_TIMESTAMP_STYLE(until_timestamp, arg_timestamp_style), + FORMAT_TIMESTAMP_RELATIVE(until_timestamp)); + } + + if (!endswith(i->id, ".target") && + STRPTR_IN_SET(i->active_state, "inactive", "failed") && + timestamp_is_set(i->active_enter_timestamp) && + timestamp_is_set(i->active_exit_timestamp) && + i->active_exit_timestamp >= i->active_enter_timestamp) { + + usec_t duration; + + duration = i->active_exit_timestamp - i->active_enter_timestamp; + printf(" Duration: %s\n", FORMAT_TIMESPAN(duration, MSEC_PER_SEC)); + } + } else + printf("\n"); + + STRV_FOREACH(t, i->triggered_by) { + UnitActiveState state = _UNIT_ACTIVE_STATE_INVALID; + + (void) get_state_one_unit(bus, *t, &state); + format_active_state(unit_active_state_to_string(state), &on, &off); + + printf("%s %s%s%s %s\n", + t == i->triggered_by ? 
"TriggeredBy:" : " ", + on, special_glyph(unit_active_state_to_glyph(state)), off, + *t); + } + + if (endswith(i->id, ".timer")) { + dual_timestamp nw, next = {i->next_elapse_real, i->next_elapse_monotonic}; + usec_t next_elapse; + + dual_timestamp_now(&nw); + next_elapse = calc_next_elapse(&nw, &next); + + if (timestamp_is_set(next_elapse)) + printf(" Trigger: %s; %s\n", + FORMAT_TIMESTAMP_STYLE(next_elapse, arg_timestamp_style), + FORMAT_TIMESTAMP_RELATIVE(next_elapse)); + else + printf(" Trigger: n/a\n"); + } + + STRV_FOREACH(t, i->triggers) { + UnitActiveState state = _UNIT_ACTIVE_STATE_INVALID; + + (void) get_state_one_unit(bus, *t, &state); + format_active_state(unit_active_state_to_string(state), &on, &off); + + printf("%s %s%s%s %s\n", + t == i->triggers ? " Triggers:" : " ", + on, special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE), off, + *t); + } + + if (!i->condition_result && i->condition_timestamp > 0) { + int n = 0; + + printf(" Condition: start %scondition unmet%s at %s; %s\n", + ansi_highlight_yellow(), ansi_normal(), + FORMAT_TIMESTAMP_STYLE(i->condition_timestamp, arg_timestamp_style), + FORMAT_TIMESTAMP_RELATIVE(i->condition_timestamp)); + + LIST_FOREACH(conditions, c, i->conditions) + if (c->tristate < 0) + n++; + + LIST_FOREACH(conditions, c, i->conditions) + if (c->tristate < 0) + printf(" %s %s=%s%s%s was not met\n", + --n ? special_glyph(SPECIAL_GLYPH_TREE_BRANCH) : special_glyph(SPECIAL_GLYPH_TREE_RIGHT), + c->name, + c->trigger ? "|" : "", + c->negate ? "!" : "", + c->param); + } + + if (!i->assert_result && i->assert_timestamp > 0) { + printf(" Assert: start %sassertion failed%s at %s; %s\n", + ansi_highlight_red(), ansi_normal(), + FORMAT_TIMESTAMP_STYLE(i->assert_timestamp, arg_timestamp_style), + FORMAT_TIMESTAMP_RELATIVE(i->assert_timestamp)); + if (i->failed_assert_trigger) + printf(" none of the trigger assertions were met\n"); + else if (i->failed_assert) + printf(" %s=%s%s was not met\n", + i->failed_assert, + i->failed_assert_negate ? "!" 
: "", + i->failed_assert_parameter); + } + + if (i->sysfs_path) + printf(" Device: %s\n", i->sysfs_path); + if (i->where) + printf(" Where: %s\n", i->where); + if (i->what) + printf(" What: %s\n", i->what); + + STRV_FOREACH(t, i->documentation) { + _cleanup_free_ char *formatted = NULL; + const char *q; + + if (terminal_urlify(*t, NULL, &formatted) >= 0) + q = formatted; + else + q = *t; + + printf(" %*s %s\n", 9, t == i->documentation ? "Docs:" : "", q); + } + + STRV_FOREACH_PAIR(t, t2, i->listen) + printf(" %*s %s (%s)\n", 9, t == i->listen ? "Listen:" : "", *t2, *t); + + if (i->accept) { + printf(" Accepted: %u; Connected: %u;", i->n_accepted, i->n_connections); + if (i->n_refused) + printf(" Refused: %u", i->n_refused); + printf("\n"); + } + + LIST_FOREACH(exec_status_info_list, p, i->exec_status_info_list) { + _cleanup_free_ char *argv = NULL; + bool good; + + /* Only show exited processes here */ + if (p->code == 0) + continue; + + /* Don't print ExecXYZEx= properties here since it will appear as a + * duplicate of the non-Ex= variant. */ + if (endswith(p->name, "Ex")) + continue; + + argv = strv_join(p->argv, " "); + printf(" Process: "PID_FMT" %s=%s ", p->pid, p->name, strna(argv)); + + good = is_clean_exit(p->code, p->status, EXIT_CLEAN_DAEMON, NULL); + if (!good) { + on = p->ignore ? 
ansi_highlight_yellow() : ansi_highlight_red(); + off = ansi_normal(); + } else + on = off = ""; + + printf("%s(code=%s, ", on, sigchld_code_to_string(p->code)); + + if (p->code == CLD_EXITED) { + const char *c; + + printf("status=%i", p->status); + + c = exit_status_to_string(p->status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD); + if (c) + printf("/%s", c); + + } else + printf("signal=%s", signal_to_string(p->status)); + + printf(")%s\n", off); + + if (i->main_pid == p->pid && + i->start_timestamp == p->start_timestamp && + i->exit_timestamp == p->start_timestamp) + /* Let's not show this twice */ + i->main_pid = 0; + + if (p->pid == i->control_pid) + i->control_pid = 0; + } + + if (i->main_pid > 0 || i->control_pid > 0) { + if (i->main_pid > 0) { + printf(" Main PID: "PID_FMT, i->main_pid); + + if (i->running) { + + if (arg_transport == BUS_TRANSPORT_LOCAL) { + _cleanup_free_ char *comm = NULL; + + (void) pid_get_comm(i->main_pid, &comm); + if (comm) + printf(" (%s)", comm); + } + + } else if (i->exit_code > 0) { + printf(" (code=%s, ", sigchld_code_to_string(i->exit_code)); + + if (i->exit_code == CLD_EXITED) { + const char *c; + + printf("status=%i", i->exit_status); + + c = exit_status_to_string(i->exit_status, + EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD); + if (c) + printf("/%s", c); + + } else + printf("signal=%s", signal_to_string(i->exit_status)); + printf(")"); + } + } + + if (i->control_pid > 0) { + _cleanup_free_ char *c = NULL; + + if (i->main_pid > 0) + fputs("; Control PID: ", stdout); + else + fputs(" Cntrl PID: ", stdout); /* if first in column, abbreviated so it fits alignment */ + + printf(PID_FMT, i->control_pid); + + if (arg_transport == BUS_TRANSPORT_LOCAL) { + (void) pid_get_comm(i->control_pid, &c); + if (c) + printf(" (%s)", c); + } + } + + printf("\n"); + } + + if (i->status_text) + printf(" Status: \"%s%s%s\"\n", ansi_highlight_cyan(), i->status_text, ansi_normal()); + if (i->status_errno > 0) { + errno = i->status_errno; + printf(" Error: %i 
(%m)\n", i->status_errno); + } + + if (i->ip_ingress_bytes != UINT64_MAX && i->ip_egress_bytes != UINT64_MAX) + printf(" IP: %s in, %s out\n", + FORMAT_BYTES(i->ip_ingress_bytes), + FORMAT_BYTES(i->ip_egress_bytes)); + + if (i->io_read_bytes != UINT64_MAX && i->io_write_bytes != UINT64_MAX) + printf(" IO: %s read, %s written\n", + FORMAT_BYTES(i->io_read_bytes), + FORMAT_BYTES(i->io_write_bytes)); + + if (i->tasks_current != UINT64_MAX) { + printf(" Tasks: %" PRIu64, i->tasks_current); + + if (i->tasks_max != UINT64_MAX) + printf(" (limit: %" PRIu64 ")\n", i->tasks_max); + else + printf("\n"); + } + + if (i->n_fd_store > 0 || i->fd_store_max > 0) + printf(" FD Store: %u%s (limit: %u)%s\n", i->n_fd_store, ansi_grey(), i->fd_store_max, ansi_normal()); + + if (i->memory_current != UINT64_MAX) { + printf(" Memory: %s", FORMAT_BYTES(i->memory_current)); + + /* Only show current swap if it ever was non-zero or is currently non-zero. In both cases + memory_swap_peak will be non-zero (and not CGROUP_LIMIT_MAX). + Only show the available memory if it was artificially limited. 
*/ + bool show_memory_swap = !IN_SET(i->memory_swap_peak, 0, CGROUP_LIMIT_MAX), + show_memory_zswap_current = !IN_SET(i->memory_zswap_current, 0, CGROUP_LIMIT_MAX), + show_memory_available = i->memory_high != CGROUP_LIMIT_MAX || i->memory_max != CGROUP_LIMIT_MAX; + if (i->memory_peak != CGROUP_LIMIT_MAX || + show_memory_swap || + show_memory_zswap_current || + show_memory_available || + i->memory_min > 0 || + i->memory_low > 0 || i->startup_memory_low > 0 || + i->memory_high != CGROUP_LIMIT_MAX || i->startup_memory_high != CGROUP_LIMIT_MAX || + i->memory_max != CGROUP_LIMIT_MAX || i->startup_memory_max != CGROUP_LIMIT_MAX || + i->memory_swap_max != CGROUP_LIMIT_MAX || i->startup_memory_swap_max != CGROUP_LIMIT_MAX || + i->memory_zswap_max != CGROUP_LIMIT_MAX || i->startup_memory_zswap_max != CGROUP_LIMIT_MAX || + i->memory_available != CGROUP_LIMIT_MAX || + i->memory_limit != CGROUP_LIMIT_MAX) { + const char *prefix = ""; + + printf(" ("); + if (i->memory_min > 0) { + printf("%smin: %s", prefix, FORMAT_BYTES_CGROUP_PROTECTION(i->memory_min)); + prefix = " "; + } + if (i->memory_low > 0) { + printf("%slow: %s", prefix, FORMAT_BYTES_CGROUP_PROTECTION(i->memory_low)); + prefix = " "; + } + if (i->startup_memory_low > 0) { + printf("%slow (startup): %s", prefix, FORMAT_BYTES_CGROUP_PROTECTION(i->startup_memory_low)); + prefix = " "; + } + if (i->memory_high != CGROUP_LIMIT_MAX) { + printf("%shigh: %s", prefix, FORMAT_BYTES(i->memory_high)); + prefix = " "; + } + if (i->startup_memory_high != CGROUP_LIMIT_MAX) { + printf("%shigh (startup): %s", prefix, FORMAT_BYTES(i->startup_memory_high)); + prefix = " "; + } + if (i->memory_max != CGROUP_LIMIT_MAX) { + printf("%smax: %s", prefix, FORMAT_BYTES(i->memory_max)); + prefix = " "; + } + if (i->startup_memory_max != CGROUP_LIMIT_MAX) { + printf("%smax (startup): %s", prefix, FORMAT_BYTES(i->startup_memory_max)); + prefix = " "; + } + if (i->memory_swap_max != CGROUP_LIMIT_MAX) { + printf("%sswap max: %s", prefix, 
FORMAT_BYTES(i->memory_swap_max)); + prefix = " "; + } + if (i->startup_memory_swap_max != CGROUP_LIMIT_MAX) { + printf("%sswap max (startup): %s", prefix, FORMAT_BYTES(i->startup_memory_swap_max)); + prefix = " "; + } + if (i->memory_zswap_max != CGROUP_LIMIT_MAX) { + printf("%szswap max: %s", prefix, FORMAT_BYTES(i->memory_zswap_max)); + prefix = " "; + } + if (i->startup_memory_zswap_max != CGROUP_LIMIT_MAX) { + printf("%szswap max (startup): %s", prefix, FORMAT_BYTES(i->startup_memory_zswap_max)); + prefix = " "; + } + if (i->memory_limit != CGROUP_LIMIT_MAX) { + printf("%slimit: %s", prefix, FORMAT_BYTES(i->memory_limit)); + prefix = " "; + } + if (show_memory_available) { + printf("%savailable: %s", prefix, FORMAT_BYTES(i->memory_available)); + prefix = " "; + } + if (i->memory_peak != CGROUP_LIMIT_MAX) { + printf("%speak: %s", prefix, FORMAT_BYTES(i->memory_peak)); + prefix = " "; + } + if (show_memory_swap) { + printf("%sswap: %s swap peak: %s", prefix, + FORMAT_BYTES(i->memory_swap_current), FORMAT_BYTES(i->memory_swap_peak)); + prefix = " "; + } + if (show_memory_zswap_current) { + printf("%szswap: %s", prefix, FORMAT_BYTES(i->memory_zswap_current)); + prefix = " "; + } + printf(")"); + } + printf("\n"); + } + + if (i->cpu_usage_nsec != UINT64_MAX) + printf(" CPU: %s\n", FORMAT_TIMESPAN(i->cpu_usage_nsec / NSEC_PER_USEC, USEC_PER_MSEC)); + + if (i->control_group) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + static const char prefix[] = " "; + unsigned c; + + printf(" CGroup: %s\n", i->control_group); + + c = LESS_BY(columns(), strlen(prefix)); + + r = unit_show_processes(bus, i->id, i->control_group, prefix, c, get_output_flags(), &error); + if (r == -EBADR && arg_transport == BUS_TRANSPORT_LOCAL) { + unsigned k = 0; + pid_t extra[2]; + + /* Fallback for older systemd versions where the GetUnitProcesses() call is not yet available */ + + if (i->main_pid > 0) + extra[k++] = i->main_pid; + + if (i->control_pid > 0) + extra[k++] 
= i->control_pid;
+
+                        show_cgroup_and_extra(SYSTEMD_CGROUP_CONTROLLER, i->control_group, prefix, c, extra, k, get_output_flags());
+                } else if (r < 0)
+                        log_warning_errno(r, "Failed to dump process list for '%s', ignoring: %s",
+                                          i->id, bus_error_message(&error, r));
+        }
+
+        if (i->id && arg_transport == BUS_TRANSPORT_LOCAL)
+                show_journal_by_unit(
+                                stdout,
+                                i->id,
+                                i->log_namespace,
+                                arg_output,
+                                0,
+                                i->inactive_exit_timestamp_monotonic,
+                                arg_lines,
+                                getuid(),
+                                get_output_flags() | OUTPUT_BEGIN_NEWLINE,
+                                SD_JOURNAL_LOCAL_ONLY,
+                                arg_runtime_scope == RUNTIME_SCOPE_SYSTEM,
+                                ellipsized);
+
+        if (i->need_daemon_reload)
+                warn_unit_file_changed(i->id);
+}
+/* Show the documentation entries listed in i->documentation. Entries starting with "man:" are
+ * handed to show_man_page(); every other entry is printed as an "Additional documentation:" line.
+ * If i->documentation is unset, an informational message is logged and nothing else happens. */
+static void show_unit_help(UnitStatusInfo *i) {
+        bool previous_man_page = false;
+
+        assert(i);
+
+        if (!i->documentation) {
+                log_info("Documentation for %s not known.", i->id);
+                return;
+        }
+
+        STRV_FOREACH(doc, i->documentation) {
+                const char *p;
+
+                p = startswith(*doc, "man:");
+                /* Print an empty separator line before a man page unless it is the very first
+                 * entry, and before a non-man entry that directly follows a man page. */
+                if (p ? doc != i->documentation : previous_man_page) {
+                        puts("");
+                        fflush(stdout);
+                }
+
+                previous_man_page = p;
+
+                if (p)
+                        show_man_page(p, /* null_stdio= */ false);
+                else {
+                        _cleanup_free_ char *t = NULL;
+
+                        if ((p = startswith(*doc, "file://")))
+                                (void) terminal_urlify_path(p, NULL, &t);
+                        /* Preference order: t if terminal_urlify_path() set it, else the part after
+                         * "file://", else the entry exactly as listed. */
+                        printf("Additional documentation: %s\n", t ?: p ?: *doc);
+                }
+        }
+}
+/* Property mapper: reads one uint32 ("u") from the message and stores it in i->main_pid, where
+ * userdata is the UnitStatusInfo. i->running is set to whether that value is non-zero. Returns 0 on
+ * success, or the negative value returned by sd_bus_message_read(). */
+static int map_main_pid(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) {
+        UnitStatusInfo *i = userdata;
+        uint32_t u;
+        int r;
+
+        r = sd_bus_message_read(m, "u", &u);
+        if (r < 0)
+                return r;
+
+        i->main_pid = (pid_t) u;
+        i->running = u > 0;
+
+        return 0;
+}
+/* Property mapper: reads an "(ss)" struct, passing NULL for the first string and keeping the
+ * second one. If that second string is non-empty, its pointer is stored in *userdata (a
+ * const char **); the string is not duplicated here. NOTE(review): the stored pointer presumably
+ * stays valid only as long as the message does -- confirm against the caller. */
+static int map_load_error(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) {
+        const char *message, **p = userdata;
+        int r;
+
+        r = sd_bus_message_read(m, "(ss)", NULL, &message);
+        if (r < 0)
+                return r;
+
+        if (!isempty(message))
+                *p = message;
+
+        return 0;
+}
+/* Property mapper: walks an array of "(ss)" pairs and appends both strings of every pair, first
+ * string then second, to the string vector that userdata points to. */
+static int map_listen(sd_bus *bus, const
char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + const char *type, *path; + char ***p = userdata; + int r; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(m, "(ss)", &type, &path)) > 0) { + + r = strv_extend(p, type); + if (r < 0) + return r; + + r = strv_extend(p, path); + if (r < 0) + return r; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int map_conditions(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + UnitStatusInfo *i = userdata; + const char *cond, *param; + int trigger, negate; + int32_t state; + int r; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sbbsi)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(m, "(sbbsi)", &cond, &trigger, &negate, ¶m, &state)) > 0) { + _cleanup_(unit_condition_freep) UnitCondition *c = NULL; + + c = new(UnitCondition, 1); + if (!c) + return -ENOMEM; + + *c = (UnitCondition) { + .name = strdup(cond), + .param = strdup(param), + .trigger = trigger, + .negate = negate, + .tristate = state, + }; + + if (!c->name || !c->param) + return -ENOMEM; + + LIST_PREPEND(conditions, i->conditions, TAKE_PTR(c)); + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int map_asserts(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + UnitStatusInfo *i = userdata; + const char *cond, *param; + int trigger, negate; + int32_t state; + int r; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sbbsi)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(m, "(sbbsi)", &cond, &trigger, &negate, ¶m, &state)) > 0) { + if (state < 0 && (!trigger || !i->failed_assert)) { + i->failed_assert = cond; + i->failed_assert_trigger = trigger; + i->failed_assert_negate = negate; 
+ i->failed_assert_parameter = param; + } + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int map_exec(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + _cleanup_free_ ExecStatusInfo *info = NULL; + ExecStatusInfo *last; + UnitStatusInfo *i = userdata; + bool is_ex_prop = endswith(member, "Ex"); + int r; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, is_ex_prop ? "(sasasttttuii)" : "(sasbttttuii)"); + if (r < 0) + return r; + + info = new0(ExecStatusInfo, 1); + if (!info) + return -ENOMEM; + + last = LIST_FIND_TAIL(exec_status_info_list, i->exec_status_info_list); + + while ((r = exec_status_info_deserialize(m, info, is_ex_prop)) > 0) { + + info->name = strdup(member); + if (!info->name) + return -ENOMEM; + + LIST_INSERT_AFTER(exec_status_info_list, i->exec_status_info_list, last, info); + last = info; + + info = new0(ExecStatusInfo, 1); + if (!info) + return -ENOMEM; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + return 0; +} + +static int print_property(const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags) { + char bus_type; + const char *contents; + int r; + + assert(name); + assert(m); + + /* This is a low-level property printer, see print_status_info() for the nicer output */ + + r = sd_bus_message_peek_type(m, &bus_type, &contents); + if (r < 0) + return r; + + switch (bus_type) { + + case SD_BUS_TYPE_INT32: + if (endswith(name, "ActionExitStatus")) { + int32_t i; + + r = sd_bus_message_read_basic(m, bus_type, &i); + if (r < 0) + return r; + + if (i >= 0 && i <= 255) + bus_print_property_valuef(name, expected_value, flags, "%"PRIi32, i); + else if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY)) + bus_print_property_value(name, expected_value, flags, "[not set]"); + + return 1; + } else if (streq(name, "NUMAPolicy")) { + int32_t i; + + r 
= sd_bus_message_read_basic(m, bus_type, &i); + if (r < 0) + return r; + + bus_print_property_valuef(name, expected_value, flags, "%s", strna(mpol_to_string(i))); + + return 1; + } + break; + + case SD_BUS_TYPE_UINT64: + if (endswith(name, "Timestamp")) { /* uint64 "*Timestamp" properties are printed formatted per arg_timestamp_style */ + uint64_t timestamp; + + r = sd_bus_message_read_basic(m, bus_type, &timestamp); + if (r < 0) + return r; + + bus_print_property_value(name, expected_value, flags, FORMAT_TIMESTAMP_STYLE(timestamp, arg_timestamp_style)); + + return 1; + } + break; + + case SD_BUS_TYPE_STRUCT: + + if (contents[0] == SD_BUS_TYPE_UINT32 && streq(name, "Job")) { + uint32_t u; + + r = sd_bus_message_read(m, "(uo)", &u, NULL); + if (r < 0) + return bus_log_parse_error(r); + + if (u > 0) + bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u); + else + bus_print_property_value(name, expected_value, flags, NULL); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRING && streq(name, "Unit")) { + const char *s; + + r = sd_bus_message_read(m, "(so)", &s, NULL); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, s); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRING && streq(name, "LoadError")) { + const char *a = NULL, *b = NULL; + + r = sd_bus_message_read(m, "(ss)", &a, &b); + if (r < 0) + return bus_log_parse_error(r); + + if (!isempty(a) || !isempty(b)) + bus_print_property_valuef(name, expected_value, flags, "%s \"%s\"", strempty(a), strempty(b)); + else + bus_print_property_value(name, expected_value, flags, NULL); + + return 1; + + } else if (STR_IN_SET(name, "SystemCallFilter", "SystemCallLog", "RestrictAddressFamilies", "RestrictNetworkInterfaces", "RestrictFileSystems")) { + _cleanup_strv_free_ char **l = NULL; + int allow_list; + + r = sd_bus_message_enter_container(m, 'r', "bas"); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(m, "b", &allow_list); + if (r < 0) + return bus_log_parse_error(r); + + r =
sd_bus_message_read_strv(m, &l); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || allow_list || !strv_isempty(l)) { + bool first = true; + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) { + fputs(name, stdout); + fputc('=', stdout); + } + + if (!allow_list) + fputc('~', stdout); + + STRV_FOREACH(i, l) { + if (first) + first = false; + else + fputc(' ', stdout); + + fputs(*i, stdout); + } + fputc('\n', stdout); + } + + return 1; + + } else if (STR_IN_SET(name, "SELinuxContext", "AppArmorProfile", "SmackProcessLabel")) { + int ignore; + const char *s; + + r = sd_bus_message_read(m, "(bs)", &ignore, &s); + if (r < 0) + return bus_log_parse_error(r); + + if (!isempty(s)) + bus_print_property_valuef(name, expected_value, flags, "%s%s", ignore ? "-" : "", s); + else + bus_print_property_value(name, expected_value, flags, NULL); + + return 1; + + } else if (endswith(name, "ExitStatus") && streq(contents, "aiai")) { + const int32_t *status, *signal; + size_t n_status, n_signal; + + r = sd_bus_message_enter_container(m, 'r', "aiai"); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(m, 'i', (const void **) &status, &n_status); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(m, 'i', (const void **) &signal, &n_signal); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + n_status /= sizeof(int32_t); + n_signal /= sizeof(int32_t); + + if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || n_status > 0 || n_signal > 0) { + bool first = true; + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) { + fputs(name, stdout); + fputc('=', stdout); + } + + for (size_t i = 0; i < n_status; i++) { + if (first) + first = false; + else + fputc(' ', stdout); + + printf("%"PRIi32, status[i]); 
+ } + + for (size_t i = 0; i < n_signal; i++) { + const char *str; + + str = signal_to_string((int) signal[i]); + + if (first) + first = false; + else + fputc(' ', stdout); + + if (str) + fputs(str, stdout); + else + printf("%"PRIi32, signal[i]); /* no name known: print the signal number itself (status[] is a separate array and may be shorter) */ + } + + fputc('\n', stdout); + } + return 1; + } + + break; + + case SD_BUS_TYPE_ARRAY: + + if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "EnvironmentFiles")) { + const char *path; + int ignore; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sb)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(sb)", &path, &ignore)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s (ignore_errors=%s)", path, yes_no(ignore)); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "Paths")) { + const char *type, *path; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &type, &path)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s (%s)", path, type); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "Listen")) { + const char *type, *path; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &type, &path)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s (%s)", path, type); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN &&
streq(name, "TimersMonotonic")) { + const char *base; + uint64_t v, next_elapse; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(stt)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(stt)", &base, &v, &next_elapse)) > 0) + bus_print_property_valuef(name, expected_value, flags, + "{ %s=%s ; next_elapse=%s }", + base, + strna(FORMAT_TIMESPAN(v, 0)), + strna(FORMAT_TIMESPAN(next_elapse, 0))); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "TimersCalendar")) { + const char *base, *spec; + uint64_t next_elapse; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sst)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(sst)", &base, &spec, &next_elapse)) > 0) + bus_print_property_valuef(name, expected_value, flags, + "{ %s=%s ; next_elapse=%s }", base, spec, + FORMAT_TIMESTAMP_STYLE(next_elapse, arg_timestamp_style)); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && startswith(name, "Exec")) { + ExecStatusInfo info = {}; + bool is_ex_prop = endswith(name, "Ex"); + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, is_ex_prop ? 
"(sasasttttuii)" : "(sasbttttuii)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = exec_status_info_deserialize(m, &info, is_ex_prop)) > 0) { + _cleanup_strv_free_ char **optv = NULL; + _cleanup_free_ char *tt = NULL, *o = NULL; + + tt = strv_join(info.argv, " "); + + if (is_ex_prop) { + r = exec_command_flags_to_strv(info.flags, &optv); + if (r < 0) + return log_error_errno(r, "Failed to convert ExecCommandFlags to strv: %m"); + + o = strv_join(optv, " "); + + bus_print_property_valuef(name, expected_value, flags, + "{ path=%s ; argv[]=%s ; flags=%s ; start_time=[%s] ; stop_time=[%s] ; pid="PID_FMT" ; code=%s ; status=%i%s%s }", + strna(info.path), + strna(tt), + strna(o), + strna(FORMAT_TIMESTAMP_STYLE(info.start_timestamp, arg_timestamp_style)), + strna(FORMAT_TIMESTAMP_STYLE(info.exit_timestamp, arg_timestamp_style)), + info.pid, + sigchld_code_to_string(info.code), + info.status, + info.code == CLD_EXITED ? "" : "/", + strempty(info.code == CLD_EXITED ? NULL : signal_to_string(info.status))); + } else + bus_print_property_valuef(name, expected_value, flags, + "{ path=%s ; argv[]=%s ; ignore_errors=%s ; start_time=[%s] ; stop_time=[%s] ; pid="PID_FMT" ; code=%s ; status=%i%s%s }", + strna(info.path), + strna(tt), + yes_no(info.ignore), + strna(FORMAT_TIMESTAMP_STYLE(info.start_timestamp, arg_timestamp_style)), + strna(FORMAT_TIMESTAMP_STYLE(info.exit_timestamp, arg_timestamp_style)), + info.pid, + sigchld_code_to_string(info.code), + info.status, + info.code == CLD_EXITED ? "" : "/", + strempty(info.code == CLD_EXITED ? 
NULL : signal_to_string(info.status))); + + free(info.path); + strv_free(info.argv); + zero(info); + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "DeviceAllow")) { + const char *path, *rwm; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &path, &rwm)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s %s", strna(path), strna(rwm)); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && + STR_IN_SET(name, "IODeviceWeight", "BlockIODeviceWeight")) { + const char *path; + uint64_t weight; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(st)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(st)", &path, &weight)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s %"PRIu64, strna(path), weight); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && + (cgroup_io_limit_type_from_string(name) >= 0 || + STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth"))) { + const char *path; + uint64_t bandwidth; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(st)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(st)", &path, &bandwidth)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s %"PRIu64, strna(path), bandwidth); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if 
(contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && + streq(name, "IODeviceLatencyTargetUSec")) { + const char *path; + uint64_t target; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(st)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(st)", &path, &target)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s %s", strna(path), + FORMAT_TIMESPAN(target, 1)); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + + } else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "StandardInputData", "RootHashSignature")) { + _cleanup_free_ char *h = NULL; + const void *p; + size_t sz; + ssize_t n; + + r = sd_bus_message_read_array(m, 'y', &p, &sz); + if (r < 0) + return bus_log_parse_error(r); + + n = base64mem(p, sz, &h); + if (n < 0) + return log_oom(); + + bus_print_property_value(name, expected_value, flags, h); + + return 1; + + } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) { + _cleanup_free_ char *addresses = NULL; + + r = sd_bus_message_enter_container(m, 'a', "(iayu)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + uint32_t prefixlen; + int32_t family; + const void *ap; + size_t an; + + r = sd_bus_message_enter_container(m, 'r', "iayu"); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = sd_bus_message_read(m, "i", &family); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read_array(m, 'y', &ap, &an); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_read(m, "u", &prefixlen); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + if (!IN_SET(family, AF_INET, AF_INET6)) + continue; + + if (an != FAMILY_ADDRESS_SIZE(family)) + continue; + + if (prefixlen > FAMILY_ADDRESS_SIZE(family) * 8) + continue; + + if 
(!strextend_with_separator(&addresses, " ", + IN_ADDR_PREFIX_TO_STRING(family, ap, prefixlen))) + return log_oom(); + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, addresses); + + return 1; + + } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) { + _cleanup_free_ char *paths = NULL; + const char *source, *dest; + int ignore_enoent; + uint64_t rbind; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ssbt)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ssbt)", &source, &dest, &ignore_enoent, &rbind)) > 0) { + _cleanup_free_ char *str = NULL; + + if (isempty(source)) + continue; + + if (asprintf(&str, "%s%s%s%s%s", + ignore_enoent ? "-" : "", + source, + isempty(dest) ? "" : ":", + strempty(dest), + rbind == MS_REC ? ":rbind" : "") < 0) + return log_oom(); + + if (!strextend_with_separator(&paths, " ", str)) + return log_oom(); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, paths); + + return 1; + + } else if (streq(name, "TemporaryFileSystem")) { + _cleanup_free_ char *paths = NULL; + const char *target, *option; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &target, &option)) > 0) { + _cleanup_free_ char *str = NULL; + + if (isempty(target)) + continue; + + if (asprintf(&str, "%s%s%s", target, isempty(option) ? 
"" : ":", strempty(option)) < 0) + return log_oom(); + + if (!strextend_with_separator(&paths, " ", str)) + return log_oom(); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, paths); + + return 1; + + } else if (streq(name, "LogExtraFields")) { + _cleanup_free_ char *fields = NULL; + const void *p; + size_t sz; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "ay"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read_array(m, 'y', &p, &sz)) > 0) { + _cleanup_free_ char *str = NULL; + const char *eq; + + if (memchr(p, 0, sz)) + continue; + + eq = memchr(p, '=', sz); + if (!eq) + continue; + + if (!journal_field_valid(p, eq - (const char*) p, false)) + continue; + + str = malloc(sz + 1); + if (!str) + return log_oom(); + + memcpy(str, p, sz); + str[sz] = '\0'; + + if (!utf8_is_valid(str)) + continue; + + if (!strextend_with_separator(&fields, " ", str)) + return log_oom(); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, fields); + + return 1; + } else if (contents[0] == SD_BUS_TYPE_BYTE && + STR_IN_SET(name, + "CPUAffinity", "NUMAMask", "AllowedCPUs", "AllowedMemoryNodes", + "EffectiveCPUs", "EffectiveMemoryNodes")) { + + _cleanup_free_ char *affinity = NULL; + _cleanup_(cpu_set_reset) CPUSet set = {}; + const void *a; + size_t n; + + r = sd_bus_message_read_array(m, 'y', &a, &n); + if (r < 0) + return bus_log_parse_error(r); + + r = cpu_set_from_dbus(a, n, &set); + if (r < 0) + return log_error_errno(r, "Failed to deserialize %s: %m", name); + + affinity = cpu_set_to_range_string(&set); + if (!affinity) + return log_oom(); + + bus_print_property_value(name, expected_value, flags, affinity); + + return 1; + } else if (streq(name, 
"LogFilterPatterns")) { + int is_allowlist; + const char *pattern; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(bs)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(bs)", &is_allowlist, &pattern)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s%s", is_allowlist ? "" : "~", pattern); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + } else if (streq(name, "MountImages")) { + _cleanup_free_ char *paths = NULL; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ssba(ss))"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + _cleanup_free_ char *str = NULL; + const char *source, *destination, *partition, *mount_options; + int ignore_enoent; + + r = sd_bus_message_enter_container(m, 'r', "ssba(ss)"); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = sd_bus_message_read(m, "ssb", &source, &destination, &ignore_enoent); + if (r < 0) + return bus_log_parse_error(r); + + str = strjoin(ignore_enoent ? 
"-" : "", + source, + ":", + destination); + if (!str) + return log_oom(); + + r = sd_bus_message_enter_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &partition, &mount_options)) > 0) + if (!strextend_with_separator(&str, ":", partition, mount_options)) + return log_oom(); + if (r < 0) + return bus_log_parse_error(r); + + if (!strextend_with_separator(&paths, " ", str)) + return log_oom(); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, paths); + + return 1; + + } else if (streq(name, "ExtensionImages")) { + _cleanup_free_ char *paths = NULL; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sba(ss))"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + _cleanup_free_ char *str = NULL; + const char *source, *partition, *mount_options; + int ignore_enoent; + + r = sd_bus_message_enter_container(m, 'r', "sba(ss)"); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + r = sd_bus_message_read(m, "sb", &source, &ignore_enoent); + if (r < 0) + return bus_log_parse_error(r); + + str = strjoin(ignore_enoent ? 
"-" : "", source); + if (!str) + return log_oom(); + + r = sd_bus_message_enter_container(m, 'a', "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &partition, &mount_options)) > 0) + if (!strextend_with_separator(&str, ":", partition, mount_options)) + return log_oom(); + if (r < 0) + return bus_log_parse_error(r); + + if (!strextend_with_separator(&paths, " ", str)) + return log_oom(); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + } + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + bus_print_property_value(name, expected_value, flags, paths); + + return 1; + + } else if (streq(name, "BPFProgram")) { + const char *a, *p; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &a, &p)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s:%s", a, p); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + } else if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) { + uint16_t nr_ports, port_min; + int32_t af, ip_protocol; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(iiqq)"); + if (r < 0) + return bus_log_parse_error(r); + while ((r = sd_bus_message_read(m, "(iiqq)", &af, &ip_protocol, &nr_ports, &port_min)) > 0) { + const char *family, *colon1, *protocol = "", *colon2 = ""; + + family = strempty(af_to_ipv4_ipv6(af)); + colon1 = isempty(family) ? 
"" : ":"; + + if (ip_protocol != 0) { + protocol = ip_protocol_to_tcp_udp(ip_protocol); + colon2 = ":"; /* separate the protocol from the port part, as colon1 does for the family */ + } + + if (nr_ports == 0) + bus_print_property_valuef(name, expected_value, flags, "%s%s%s%sany", + family, colon1, protocol, colon2); + else if (nr_ports == 1) + bus_print_property_valuef( + name, expected_value, flags, "%s%s%s%s%hu", + family, colon1, protocol, colon2, port_min); + else + bus_print_property_valuef( + name, expected_value, flags, "%s%s%s%s%hu-%hu", + family, colon1, protocol, colon2, port_min, + (uint16_t) (port_min + nr_ports - 1)); + } + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + } else if (STR_IN_SET(name, "StateDirectorySymlink", "RuntimeDirectorySymlink", "CacheDirectorySymlink", "LogsDirectorySymlink")) { + const char *a, *p; + uint64_t symlink_flags; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sst)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(sst)", &a, &p, &symlink_flags)) > 0) + bus_print_property_valuef(name, expected_value, flags, "%s:%s", a, p); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + } else if (contents[0] == SD_BUS_TYPE_STRUCT_BEGIN && streq(name, "OpenFile")) { + char *path, *fdname; + uint64_t offlags; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sst)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(sst)", &path, &fdname, &offlags)) > 0) { + _cleanup_free_ char *ofs = NULL; + + r = open_file_to_string( + &(OpenFile){ + .path = path, + .fdname = fdname, + .flags = offlags, + }, + &ofs); + if (r < 0) + return log_error_errno( + r, "Failed to convert OpenFile= value to string: %m"); + + bus_print_property_value(name, expected_value, flags, ofs); + } + if (r < 0) + return bus_log_parse_error(r); + + r
= sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; + } + + break; + } + + return 0; +} + +typedef enum SystemctlShowMode{ + SYSTEMCTL_SHOW_PROPERTIES, + SYSTEMCTL_SHOW_STATUS, + SYSTEMCTL_SHOW_HELP, + _SYSTEMCTL_SHOW_MODE_MAX, + _SYSTEMCTL_SHOW_MODE_INVALID = -EINVAL, +} SystemctlShowMode; + +static const char* const systemctl_show_mode_table[_SYSTEMCTL_SHOW_MODE_MAX] = { + [SYSTEMCTL_SHOW_PROPERTIES] = "show", + [SYSTEMCTL_SHOW_STATUS] = "status", + [SYSTEMCTL_SHOW_HELP] = "help", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(systemctl_show_mode, SystemctlShowMode); + +static int show_one( + sd_bus *bus, + const char *path, + const char *unit, + SystemctlShowMode show_mode, + bool *new_line, + bool *ellipsized) { + + static const struct bus_properties_map property_map[] = { + { "Id", "s", NULL, offsetof(UnitStatusInfo, id) }, + { "LoadState", "s", NULL, offsetof(UnitStatusInfo, load_state) }, + { "ActiveState", "s", NULL, offsetof(UnitStatusInfo, active_state) }, + { "FreezerState", "s", NULL, offsetof(UnitStatusInfo, freezer_state) }, + { "Documentation", "as", NULL, offsetof(UnitStatusInfo, documentation) }, + {} + }, status_map[] = { + { "Id", "s", NULL, offsetof(UnitStatusInfo, id) }, + { "LoadState", "s", NULL, offsetof(UnitStatusInfo, load_state) }, + { "ActiveState", "s", NULL, offsetof(UnitStatusInfo, active_state) }, + { "FreezerState", "s", NULL, offsetof(UnitStatusInfo, freezer_state) }, + { "SubState", "s", NULL, offsetof(UnitStatusInfo, sub_state) }, + { "UnitFileState", "s", NULL, offsetof(UnitStatusInfo, unit_file_state) }, + { "UnitFilePreset", "s", NULL, offsetof(UnitStatusInfo, unit_file_preset) }, + { "Description", "s", NULL, offsetof(UnitStatusInfo, description) }, + { "Following", "s", NULL, offsetof(UnitStatusInfo, following) }, + { "Documentation", "as", NULL, offsetof(UnitStatusInfo, documentation) }, + { "FragmentPath", "s", NULL, offsetof(UnitStatusInfo, fragment_path) }, + { 
"SourcePath", "s", NULL, offsetof(UnitStatusInfo, source_path) }, + { "ControlGroup", "s", NULL, offsetof(UnitStatusInfo, control_group) }, + { "DropInPaths", "as", NULL, offsetof(UnitStatusInfo, dropin_paths) }, + { "LoadError", "(ss)", map_load_error, offsetof(UnitStatusInfo, load_error) }, + { "Result", "s", NULL, offsetof(UnitStatusInfo, result) }, + { "TriggeredBy", "as", NULL, offsetof(UnitStatusInfo, triggered_by) }, + { "Triggers", "as", NULL, offsetof(UnitStatusInfo, triggers) }, + { "InactiveExitTimestamp", "t", NULL, offsetof(UnitStatusInfo, inactive_exit_timestamp) }, + { "InactiveExitTimestampMonotonic", "t", NULL, offsetof(UnitStatusInfo, inactive_exit_timestamp_monotonic) }, + { "ActiveEnterTimestamp", "t", NULL, offsetof(UnitStatusInfo, active_enter_timestamp) }, + { "ActiveExitTimestamp", "t", NULL, offsetof(UnitStatusInfo, active_exit_timestamp) }, + { "RuntimeMaxUSec", "t", NULL, offsetof(UnitStatusInfo, runtime_max_sec) }, + { "InactiveEnterTimestamp", "t", NULL, offsetof(UnitStatusInfo, inactive_enter_timestamp) }, + { "NeedDaemonReload", "b", NULL, offsetof(UnitStatusInfo, need_daemon_reload) }, + { "Transient", "b", NULL, offsetof(UnitStatusInfo, transient) }, + { "ExecMainPID", "u", NULL, offsetof(UnitStatusInfo, main_pid) }, + { "MainPID", "u", map_main_pid, 0 }, + { "ControlPID", "u", NULL, offsetof(UnitStatusInfo, control_pid) }, + { "StatusText", "s", NULL, offsetof(UnitStatusInfo, status_text) }, + { "PIDFile", "s", NULL, offsetof(UnitStatusInfo, pid_file) }, + { "StatusErrno", "i", NULL, offsetof(UnitStatusInfo, status_errno) }, + { "FileDescriptorStoreMax", "u", NULL, offsetof(UnitStatusInfo, fd_store_max) }, + { "NFileDescriptorStore", "u", NULL, offsetof(UnitStatusInfo, n_fd_store) }, + { "ExecMainStartTimestamp", "t", NULL, offsetof(UnitStatusInfo, start_timestamp) }, + { "ExecMainExitTimestamp", "t", NULL, offsetof(UnitStatusInfo, exit_timestamp) }, + { "ExecMainCode", "i", NULL, offsetof(UnitStatusInfo, exit_code) }, + { 
"ExecMainStatus", "i", NULL, offsetof(UnitStatusInfo, exit_status) }, + { "LogNamespace", "s", NULL, offsetof(UnitStatusInfo, log_namespace) }, + { "ConditionTimestamp", "t", NULL, offsetof(UnitStatusInfo, condition_timestamp) }, + { "ConditionResult", "b", NULL, offsetof(UnitStatusInfo, condition_result) }, + { "Conditions", "a(sbbsi)", map_conditions, 0 }, + { "AssertTimestamp", "t", NULL, offsetof(UnitStatusInfo, assert_timestamp) }, + { "AssertResult", "b", NULL, offsetof(UnitStatusInfo, assert_result) }, + { "Asserts", "a(sbbsi)", map_asserts, 0 }, + { "NextElapseUSecRealtime", "t", NULL, offsetof(UnitStatusInfo, next_elapse_real) }, + { "NextElapseUSecMonotonic", "t", NULL, offsetof(UnitStatusInfo, next_elapse_monotonic) }, + { "NAccepted", "u", NULL, offsetof(UnitStatusInfo, n_accepted) }, + { "NConnections", "u", NULL, offsetof(UnitStatusInfo, n_connections) }, + { "NRefused", "u", NULL, offsetof(UnitStatusInfo, n_refused) }, + { "Accept", "b", NULL, offsetof(UnitStatusInfo, accept) }, + { "Listen", "a(ss)", map_listen, offsetof(UnitStatusInfo, listen) }, + { "SysFSPath", "s", NULL, offsetof(UnitStatusInfo, sysfs_path) }, + { "Where", "s", NULL, offsetof(UnitStatusInfo, where) }, + { "What", "s", NULL, offsetof(UnitStatusInfo, what) }, + { "MemoryCurrent", "t", NULL, offsetof(UnitStatusInfo, memory_current) }, + { "MemoryPeak", "t", NULL, offsetof(UnitStatusInfo, memory_peak) }, + { "MemorySwapCurrent", "t", NULL, offsetof(UnitStatusInfo, memory_swap_current) }, + { "MemorySwapPeak", "t", NULL, offsetof(UnitStatusInfo, memory_swap_peak) }, + { "MemoryZSwapCurrent", "t", NULL, offsetof(UnitStatusInfo, memory_zswap_current) }, + { "MemoryAvailable", "t", NULL, offsetof(UnitStatusInfo, memory_available) }, + { "DefaultMemoryMin", "t", NULL, offsetof(UnitStatusInfo, default_memory_min) }, + { "DefaultMemoryLow", "t", NULL, offsetof(UnitStatusInfo, default_memory_low) }, + { "DefaultStartupMemoryLow", "t", NULL, offsetof(UnitStatusInfo, 
default_startup_memory_low) }, + { "MemoryMin", "t", NULL, offsetof(UnitStatusInfo, memory_min) }, + { "MemoryLow", "t", NULL, offsetof(UnitStatusInfo, memory_low) }, + { "StartupMemoryLow", "t", NULL, offsetof(UnitStatusInfo, startup_memory_low) }, + { "MemoryHigh", "t", NULL, offsetof(UnitStatusInfo, memory_high) }, + { "StartupMemoryHigh", "t", NULL, offsetof(UnitStatusInfo, startup_memory_high) }, + { "MemoryMax", "t", NULL, offsetof(UnitStatusInfo, memory_max) }, + { "StartupMemoryMax", "t", NULL, offsetof(UnitStatusInfo, startup_memory_max) }, + { "MemorySwapMax", "t", NULL, offsetof(UnitStatusInfo, memory_swap_max) }, + { "StartupMemorySwapMax", "t", NULL, offsetof(UnitStatusInfo, startup_memory_swap_max) }, + { "MemoryZSwapMax", "t", NULL, offsetof(UnitStatusInfo, memory_zswap_max) }, + { "StartupMemoryZSwapMax", "t", NULL, offsetof(UnitStatusInfo, startup_memory_zswap_max) }, + { "MemoryLimit", "t", NULL, offsetof(UnitStatusInfo, memory_limit) }, + { "CPUUsageNSec", "t", NULL, offsetof(UnitStatusInfo, cpu_usage_nsec) }, + { "TasksCurrent", "t", NULL, offsetof(UnitStatusInfo, tasks_current) }, + { "TasksMax", "t", NULL, offsetof(UnitStatusInfo, tasks_max) }, + { "IPIngressBytes", "t", NULL, offsetof(UnitStatusInfo, ip_ingress_bytes) }, + { "IPEgressBytes", "t", NULL, offsetof(UnitStatusInfo, ip_egress_bytes) }, + { "IOReadBytes", "t", NULL, offsetof(UnitStatusInfo, io_read_bytes) }, + { "IOWriteBytes", "t", NULL, offsetof(UnitStatusInfo, io_write_bytes) }, + { "ExecCondition", "a(sasbttttuii)", map_exec, 0 }, + { "ExecConditionEx", "a(sasasttttuii)", map_exec, 0 }, + { "ExecStartPre", "a(sasbttttuii)", map_exec, 0 }, + { "ExecStartPreEx", "a(sasasttttuii)", map_exec, 0 }, + { "ExecStart", "a(sasbttttuii)", map_exec, 0 }, + { "ExecStartEx", "a(sasasttttuii)", map_exec, 0 }, + { "ExecStartPost", "a(sasbttttuii)", map_exec, 0 }, + { "ExecStartPostEx", "a(sasasttttuii)", map_exec, 0 }, + { "ExecReload", "a(sasbttttuii)", map_exec, 0 }, + { "ExecReloadEx", 
"a(sasasttttuii)", map_exec, 0 }, + { "ExecStopPre", "a(sasbttttuii)", map_exec, 0 }, + { "ExecStop", "a(sasbttttuii)", map_exec, 0 }, + { "ExecStopEx", "a(sasasttttuii)", map_exec, 0 }, + { "ExecStopPost", "a(sasbttttuii)", map_exec, 0 }, + { "ExecStopPostEx", "a(sasasttttuii)", map_exec, 0 }, + {} + }; + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_set_free_ Set *found_properties = NULL; + _cleanup_(unit_status_info_done) UnitStatusInfo info = { + .runtime_max_sec = USEC_INFINITY, + .memory_current = UINT64_MAX, + .memory_high = CGROUP_LIMIT_MAX, + .startup_memory_high = CGROUP_LIMIT_MAX, + .memory_max = CGROUP_LIMIT_MAX, + .startup_memory_max = CGROUP_LIMIT_MAX, + .memory_swap_max = CGROUP_LIMIT_MAX, + .startup_memory_swap_max = CGROUP_LIMIT_MAX, + .memory_zswap_max = CGROUP_LIMIT_MAX, + .startup_memory_zswap_max = CGROUP_LIMIT_MAX, + .memory_limit = CGROUP_LIMIT_MAX, + .memory_peak = CGROUP_LIMIT_MAX, + .memory_swap_current = CGROUP_LIMIT_MAX, + .memory_swap_peak = CGROUP_LIMIT_MAX, + .memory_zswap_current = CGROUP_LIMIT_MAX, + .memory_available = CGROUP_LIMIT_MAX, + .cpu_usage_nsec = UINT64_MAX, + .tasks_current = UINT64_MAX, + .tasks_max = UINT64_MAX, + .ip_ingress_bytes = UINT64_MAX, + .ip_egress_bytes = UINT64_MAX, + .io_read_bytes = UINT64_MAX, + .io_write_bytes = UINT64_MAX, + }; + int r; + + assert(path); + assert(new_line); + + log_debug("Showing one %s", path); + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + path, + show_mode == SYSTEMCTL_SHOW_STATUS ? status_map : property_map, + BUS_MAP_BOOLEAN_AS_BOOL, + &error, + &reply, + &info); + if (r < 0) + return log_error_errno(r, "Failed to get properties: %s", bus_error_message(&error, r)); + + if (unit && streq_ptr(info.load_state, "not-found") && streq_ptr(info.active_state, "inactive")) { + log_full(show_mode == SYSTEMCTL_SHOW_PROPERTIES ? 
LOG_DEBUG : LOG_ERR, + "Unit %s could not be found.", unit); + + if (show_mode == SYSTEMCTL_SHOW_STATUS) + return EXIT_PROGRAM_OR_SERVICES_STATUS_UNKNOWN; + if (show_mode == SYSTEMCTL_SHOW_HELP) + return -ENOENT; + } + + if (*new_line) + printf("\n"); + + *new_line = true; + + if (show_mode == SYSTEMCTL_SHOW_STATUS) { + print_status_info(bus, &info, ellipsized); + + if (info.active_state && !STR_IN_SET(info.active_state, "active", "reloading")) + return EXIT_PROGRAM_NOT_RUNNING; + + return EXIT_PROGRAM_RUNNING_OR_SERVICE_OK; + + } else if (show_mode == SYSTEMCTL_SHOW_HELP) { + show_unit_help(&info); + return 0; + } + + r = sd_bus_message_rewind(reply, true); + if (r < 0) + return log_error_errno(r, "Failed to rewind: %s", bus_error_message(&error, r)); + + r = bus_message_print_all_properties(reply, print_property, arg_properties, arg_print_flags, &found_properties); + if (r < 0) + return bus_log_parse_error(r); + + STRV_FOREACH(pp, arg_properties) + if (!set_contains(found_properties, *pp)) + log_debug("Property %s does not exist.", *pp); + + return 0; +} + +static int get_unit_dbus_path_by_pid_fallback( + sd_bus *bus, + uint32_t pid, + char **ret_path, + char **ret_unit) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *path = NULL, *unit = NULL; + char *p; + int r; + + assert(bus); + assert(ret_path); + assert(ret_unit); + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnitByPID", &error, &reply, "u", pid); + if (r < 0) + return log_error_errno(r, "Failed to get unit for PID %"PRIu32": %s", pid, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &p); + if (r < 0) + return bus_log_parse_error(r); + + path = strdup(p); + if (!path) + return log_oom(); + + r = unit_name_from_dbus_path(path, &unit); + if (r < 0) + return log_oom(); + + *ret_unit = TAKE_PTR(unit); + *ret_path = TAKE_PTR(path); + + return 0; +} + +static int 
get_unit_dbus_path_by_pid( + sd_bus *bus, + uint32_t pid, + char **ret_path, + char **ret_unit) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *path = NULL, *unit = NULL; + _cleanup_close_ int pidfd = -EBADF; + char *p, *u; + int r; + + assert(bus); + assert(ret_path); + assert(ret_unit); + + /* First, try to send a PIDFD across the wire, so that we can pin the process and there's no race + * condition possible while we wait for the D-Bus reply. If we either don't have PIDFD support in + * the kernel or the new D-Bus method is not available, then fallback to the older method that + * sends the numeric PID. */ + + pidfd = pidfd_open(pid, 0); + if (pidfd < 0 && ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return get_unit_dbus_path_by_pid_fallback(bus, pid, ret_path, ret_unit); + if (pidfd < 0) + return log_error_errno(errno, "Failed to open PID %"PRIu32": %m", pid); + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnitByPIDFD", &error, &reply, "h", pidfd); + if (r < 0 && sd_bus_error_has_name(&error, SD_BUS_ERROR_UNKNOWN_METHOD)) + return get_unit_dbus_path_by_pid_fallback(bus, pid, ret_path, ret_unit); + if (r < 0) + return log_error_errno(r, "Failed to get unit for PID %"PRIu32": %s", pid, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "os", &p, &u); + if (r < 0) + return bus_log_parse_error(r); + + path = strdup(p); + if (!path) + return log_oom(); + + unit = strdup(u); + if (!unit) + return log_oom(); + + *ret_unit = TAKE_PTR(unit); + *ret_path = TAKE_PTR(path); + + return 0; +} + +static int show_all( + sd_bus *bus, + SystemctlShowMode show_mode, + bool *new_line, + bool *ellipsized) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ UnitInfo *unit_infos = NULL; + unsigned c; + int r, ret = 0; + + r = get_unit_list(bus, NULL, NULL, &unit_infos, 0, &reply); + if (r < 0) + return 
r; + + pager_open(arg_pager_flags); + + c = (unsigned) r; + + typesafe_qsort(unit_infos, c, unit_info_compare); + + for (const UnitInfo *u = unit_infos; u < unit_infos + c; u++) { + _cleanup_free_ char *p = NULL; + + p = unit_dbus_path_from_name(u->id); + if (!p) + return log_oom(); + + r = show_one(bus, p, u->id, show_mode, new_line, ellipsized); + if (r < 0) + return r; + if (r > 0 && ret == 0) + ret = r; + } + + return ret; +} + +static int show_system_status(sd_bus *bus) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(machine_info_clear) struct machine_info mi = {}; + static const char prefix[] = " "; + _cleanup_free_ char *hn = NULL; + const char *on, *off; + unsigned c; + int r; + + hn = gethostname_malloc(); + if (!hn) + return log_oom(); + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + machine_info_property_map, + BUS_MAP_STRDUP, + &error, + NULL, + &mi); + if (r < 0) + return log_error_errno(r, "Failed to read server status: %s", bus_error_message(&error, r)); + + if (streq_ptr(mi.state, "degraded")) { + on = ansi_highlight_red(); + off = ansi_normal(); + } else if (streq_ptr(mi.state, "running")) { + on = ansi_highlight_green(); + off = ansi_normal(); + } else { + on = ansi_highlight_yellow(); + off = ansi_normal(); + } + + printf("%s%s%s %s\n", on, special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE), off, arg_host ?: hn); + + printf(" State: %s%s%s\n", + on, strna(mi.state), off); + + printf(" Units: %" PRIu32 " loaded (incl. 
loaded aliases)\n", mi.n_names); + printf(" Jobs: %" PRIu32 " queued\n", mi.n_jobs); + printf(" Failed: %" PRIu32 " units\n", mi.n_failed_units); + + printf(" Since: %s; %s\n", + FORMAT_TIMESTAMP_STYLE(mi.timestamp, arg_timestamp_style), + FORMAT_TIMESTAMP_RELATIVE(mi.timestamp)); + + printf(" systemd: %s\n", mi.version); + + if (!isempty(mi.tainted)) + printf(" Tainted: %s%s%s\n", ansi_highlight_yellow(), mi.tainted, ansi_normal()); + + printf(" CGroup: %s\n", empty_to_root(mi.control_group)); + + c = LESS_BY(columns(), strlen(prefix)); + + r = unit_show_processes(bus, SPECIAL_ROOT_SLICE, mi.control_group, prefix, c, get_output_flags(), &error); + if (r == -EBADR && arg_transport == BUS_TRANSPORT_LOCAL) /* Compatibility for really old systemd versions */ + show_cgroup(SYSTEMD_CGROUP_CONTROLLER, strempty(mi.control_group), prefix, c, get_output_flags()); + else if (r < 0) + log_warning_errno(r, "Failed to dump process list for '%s', ignoring: %s", + arg_host ?: hn, bus_error_message(&error, r)); + + return 0; +} + +int verb_show(int argc, char *argv[], void *userdata) { + bool new_line = false, ellipsized = false; + SystemctlShowMode show_mode; + int r, ret = 0; + sd_bus *bus; + + assert(argv); + + show_mode = systemctl_show_mode_from_string(argv[0]); + if (show_mode < 0) + return log_error_errno(show_mode, "Invalid argument '%s'.", argv[0]); + + if (show_mode == SYSTEMCTL_SHOW_HELP && argc <= 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "'help' command expects one or more unit names.\n" + "(Alternatively, help for systemctl itself may be shown with --help)"); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + if (argc <= 1) { + /* If no argument or filter is specified inspect the manager itself: + * systemctl status → we show status of the manager + * systemctl status --all → status of the manager + status of all units + * systemctl status --state=… → status of units in listed states + * systemctl status 
--type=… → status of units of listed types + * systemctl status --failed → status of failed units, mirroring systemctl list-units --failed + */ + + if (!arg_states && !arg_types) { + if (show_mode == SYSTEMCTL_SHOW_PROPERTIES) + /* systemctl show --all → show properties of the manager */ + return show_one(bus, "/org/freedesktop/systemd1", NULL, show_mode, &new_line, &ellipsized); + + r = show_system_status(bus); + if (r < 0) + return r; + + new_line = true; + } + + if (arg_all || arg_states || arg_types) + ret = show_all(bus, show_mode, &new_line, &ellipsized); + } else { + _cleanup_free_ char **patterns = NULL; + + STRV_FOREACH(name, strv_skip(argv, 1)) { + _cleanup_free_ char *path = NULL, *unit = NULL; + uint32_t id; + + if (safe_atou32(*name, &id) < 0) { + if (strv_push(&patterns, *name) < 0) + return log_oom(); + + continue; + } else if (show_mode == SYSTEMCTL_SHOW_PROPERTIES) { + /* Interpret as job id */ + if (asprintf(&path, "/org/freedesktop/systemd1/job/%u", id) < 0) + return log_oom(); + + } else { + /* Interpret as PID */ + r = get_unit_dbus_path_by_pid(bus, id, &path, &unit); + if (r < 0) { + ret = r; + continue; + } + } + + r = show_one(bus, path, unit, show_mode, &new_line, &ellipsized); + if (r < 0) + return r; + if (r > 0 && ret == 0) + ret = r; + } + + if (!strv_isempty(patterns)) { + _cleanup_strv_free_ char **names = NULL; + + r = expand_unit_names(bus, patterns, NULL, &names, NULL); + if (r < 0) + return log_error_errno(r, "Failed to expand names: %m"); + + r = maybe_extend_with_unit_dependencies(bus, &names); + if (r < 0) + return r; + + STRV_FOREACH(name, names) { + _cleanup_free_ char *path = NULL; + + path = unit_dbus_path_from_name(*name); + if (!path) + return log_oom(); + + r = show_one(bus, path, *name, show_mode, &new_line, &ellipsized); + if (r < 0) + return r; + if (r > 0 && ret == 0) + ret = r; + } + } + } + + if (ellipsized && !arg_quiet) + printf("Hint: Some lines were ellipsized, use -l to show in full.\n"); + + return ret; +} 
diff --git a/src/systemctl/systemctl-show.h b/src/systemctl/systemctl-show.h new file mode 100644 index 0000000..5aeed51 --- /dev/null +++ b/src/systemctl/systemctl-show.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_show(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-start-special.c b/src/systemctl/systemctl-start-special.c new file mode 100644 index 0000000..d23ce36 --- /dev/null +++ b/src/systemctl/systemctl-start-special.c @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "bootspec.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "efivars.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "reboot-util.h" +#include "systemctl-logind.h" +#include "systemctl-start-special.h" +#include "systemctl-start-unit.h" +#include "systemctl-trivial-method.h" +#include "systemctl-util.h" +#include "systemctl.h" + +static int load_kexec_kernel(void) { + _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL; + _cleanup_free_ char *kernel = NULL, *initrd = NULL, *options = NULL; + const BootEntry *e; + pid_t pid; + int r; + + if (kexec_loaded()) { + log_debug("Kexec kernel already loaded."); + return 0; + } + + if (access(KEXEC, X_OK) < 0) + return log_error_errno(errno, KEXEC" is not available: %m"); + + r = boot_config_load_auto(&config, NULL, NULL); + if (r == -ENOKEY) + /* The call doesn't log about ENOKEY, let's do so here. */ + return log_error_errno(r, + "No kexec kernel loaded and autodetection failed.\n%s", + is_efi_boot() + ? "Cannot automatically load kernel: ESP mount point not found." 
+ : "Automatic loading works only on systems booted with EFI."); + if (r < 0) + return r; + + r = boot_config_select_special_entries(&config, /* skip_efivars= */ false); + if (r < 0) + return r; + + e = boot_config_default_entry(&config); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "No boot loader entry suitable as default, refusing to guess."); + + log_debug("Found default boot loader entry in file \"%s\"", e->path); + + if (!e->kernel) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Boot entry does not refer to Linux kernel, which is not supported currently."); + if (strv_length(e->initrd) > 1) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Boot entry specifies multiple initrds, which is not supported currently."); + + kernel = path_join(e->root, e->kernel); + if (!kernel) + return log_oom(); + + if (!strv_isempty(e->initrd)) { + initrd = path_join(e->root, e->initrd[0]); + if (!initrd) + return log_oom(); + } + + options = strv_join(e->options, " "); + if (!options) + return log_oom(); + + log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, + "%s "KEXEC" --load \"%s\" --append \"%s\"%s%s%s", + arg_dry_run ? "Would run" : "Running", + kernel, + options, + initrd ? " --initrd \"" : NULL, strempty(initrd), initrd ? "\"" : ""); + if (arg_dry_run) + return 0; + + r = safe_fork("(kexec)", FORK_WAIT|FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + const char* const args[] = { + KEXEC, + "--load", kernel, + "--append", options, + initrd ? 
"--initrd" : NULL, initrd, + NULL + }; + + /* Child */ + execv(args[0], (char * const *) args); + _exit(EXIT_FAILURE); + } + + return 0; +} + +static int set_exit_code(uint8_t code) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus; + int r; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_call_method(bus, bus_systemd_mgr, "SetExitCode", &error, NULL, "y", code); + if (r < 0) + return log_error_errno(r, "Failed to set exit code: %s", bus_error_message(&error, r)); + + return 0; +} + +int verb_start_special(int argc, char *argv[], void *userdata) { + bool termination_action; /* An action that terminates the system, can be performed also by signal. */ + enum action a; + int r; + + assert(argv); + + a = verb_to_action(argv[0]); + + r = logind_check_inhibitors(a); + if (r < 0) + return r; + + if (arg_force >= 2) { + r = must_be_root(); + if (r < 0) + return r; + } + + termination_action = IN_SET(a, ACTION_HALT, ACTION_POWEROFF, ACTION_REBOOT); + + if (termination_action) { + r = prepare_firmware_setup(); + if (r < 0) + return r; + + r = prepare_boot_loader_menu(); + if (r < 0) + return r; + + r = prepare_boot_loader_entry(); + if (r < 0) + return r; + } + + if (a == ACTION_REBOOT) { + if (arg_reboot_argument) { + r = update_reboot_parameter_and_warn(arg_reboot_argument, false); + if (r < 0) + return r; + } + + } else if (a == ACTION_KEXEC) { + r = load_kexec_kernel(); + if (r < 0 && arg_force >= 1) + log_notice("Failed to load kexec kernel, continuing without."); + else if (r < 0) + return r; + + } else if (a == ACTION_EXIT && argc > 1) { + uint8_t code; + + /* If the exit code is not given on the command line, don't reset it to zero: just keep it as + * it might have been set previously. 
*/ + + r = safe_atou8(argv[1], &code); + if (r < 0) + return log_error_errno(r, "Invalid exit code."); + + r = set_exit_code(code); + if (r < 0) + return r; + } + + if (termination_action && arg_force >= 2) + return halt_now(a); + + if (arg_force >= 1 && + (termination_action || IN_SET(a, ACTION_KEXEC, ACTION_EXIT))) + r = verb_trivial_method(argc, argv, userdata); + else { + /* First try logind, to allow authentication with polkit */ + switch (a) { + + case ACTION_POWEROFF: + case ACTION_REBOOT: + case ACTION_KEXEC: + case ACTION_HALT: + case ACTION_SOFT_REBOOT: + if (arg_when == 0) + r = logind_reboot(a); + else if (arg_when != USEC_INFINITY) + r = logind_schedule_shutdown(a); + else /* arg_when == USEC_INFINITY */ + r = logind_cancel_shutdown(); + if (r >= 0 || IN_SET(r, -EACCES, -EOPNOTSUPP, -EINPROGRESS)) + /* The latter indicates that the requested operation requires auth, + * is not supported or already in progress, in which cases we ignore the error. */ + return r; + + /* On all other errors, try low-level operation. In order to minimize the difference + * between operation with and without logind, we explicitly enable non-blocking mode + * for this, as logind's shutdown operations are always non-blocking. */ + arg_no_block = true; + break; + + case ACTION_SUSPEND: + case ACTION_HIBERNATE: + case ACTION_HYBRID_SLEEP: + case ACTION_SUSPEND_THEN_HIBERNATE: + + r = logind_reboot(a); + if (r >= 0 || IN_SET(r, -EACCES, -EOPNOTSUPP, -EINPROGRESS)) + return r; + + arg_no_block = true; + break; + + case ACTION_EXIT: + /* Since exit is so close in behaviour to power-off/reboot, let's also make + * it asynchronous, in order to not confuse the user needlessly with unexpected + * behaviour. 
*/ + arg_no_block = true; + break; + + default: + ; + } + + r = verb_start(argc, argv, userdata); + } + + if (termination_action && arg_force < 2 && + IN_SET(r, -ENOENT, -ETIMEDOUT)) + log_notice("It is possible to perform action directly, see discussion of --force --force in man:systemctl(1)."); + + return r; +} + +int verb_start_system_special(int argc, char *argv[], void *userdata) { + /* Like start_special above, but raises an error when running in user mode */ + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Bad action for %s mode.", + runtime_scope_cmdline_option_to_string(arg_runtime_scope)); + + return verb_start_special(argc, argv, userdata); +} diff --git a/src/systemctl/systemctl-start-special.h b/src/systemctl/systemctl-start-special.h new file mode 100644 index 0000000..9396321 --- /dev/null +++ b/src/systemctl/systemctl-start-special.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_start_special(int argc, char *argv[], void *userdata); +int verb_start_system_special(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-start-unit.c b/src/systemctl/systemctl-start-unit.c new file mode 100644 index 0000000..6927e97 --- /dev/null +++ b/src/systemctl/systemctl-start-unit.c @@ -0,0 +1,409 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" + +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "bus-wait-for-jobs.h" +#include "bus-wait-for-units.h" +#include "macro.h" +#include "special.h" +#include "string-util.h" +#include "systemctl-start-unit.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" + +static const struct { + const char *verb; /* systemctl verb */ + const char *method; /* Name of the specific D-Bus method */ + const char *job_type; /* Job type when passing to the generic EnqueueUnitJob() method */ +} 
unit_actions[] = { + { "start", "StartUnit", "start" }, + { "stop", "StopUnit", "stop" }, + { "condstop", "StopUnit", "stop" }, /* legacy alias */ + { "reload", "ReloadUnit", "reload" }, + { "restart", "RestartUnit", "restart" }, + { "try-restart", "TryRestartUnit", "try-restart" }, + { "condrestart", "TryRestartUnit", "try-restart" }, /* legacy alias */ + { "reload-or-restart", "ReloadOrRestartUnit", "reload-or-restart" }, + { "try-reload-or-restart", "ReloadOrTryRestartUnit", "reload-or-try-restart" }, + { "reload-or-try-restart", "ReloadOrTryRestartUnit", "reload-or-try-restart" }, /* legacy alias */ + { "condreload", "ReloadOrTryRestartUnit", "reload-or-try-restart" }, /* legacy alias */ + { "force-reload", "ReloadOrTryRestartUnit", "reload-or-try-restart" }, /* legacy alias */ +}; + +static const char *verb_to_method(const char *verb) { + for (size_t i = 0; i < ELEMENTSOF(unit_actions); i++) + if (streq_ptr(unit_actions[i].verb, verb)) + return unit_actions[i].method; + + return "StartUnit"; +} + +static const char *verb_to_job_type(const char *verb) { + for (size_t i = 0; i < ELEMENTSOF(unit_actions); i++) + if (streq_ptr(unit_actions[i].verb, verb)) + return unit_actions[i].job_type; + + return "start"; +} + +static int start_unit_one( + sd_bus *bus, + const char *method, /* When using classic per-job bus methods */ + const char *job_type, /* When using new-style EnqueueUnitJob() */ + const char *name, + const char *mode, + sd_bus_error *error, + BusWaitForJobs *w, + BusWaitForUnits *wu) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *path; + bool done = false; + int r; + + assert(method); + assert(name); + assert(mode); + assert(error); + + log_debug("%s dbus call org.freedesktop.systemd1.Manager %s(%s, %s)", + arg_dry_run ? 
"Would execute" : "Executing", + method, name, mode); + + if (arg_dry_run) + return 0; + + if (arg_show_transaction) { + _cleanup_(sd_bus_error_free) sd_bus_error enqueue_error = SD_BUS_ERROR_NULL; + + /* Use the new, fancy EnqueueUnitJob() API if the user wants us to print the transaction */ + r = bus_call_method( + bus, + bus_systemd_mgr, + "EnqueueUnitJob", + &enqueue_error, + &reply, + "sss", + name, job_type, mode); + if (r < 0) { + if (!sd_bus_error_has_name(&enqueue_error, SD_BUS_ERROR_UNKNOWN_METHOD)) { + (void) sd_bus_error_move(error, &enqueue_error); + goto fail; + } + + /* Hmm, the API is not yet available. Let's use the classic API instead (see below). */ + log_notice("--show-transaction not supported by this service manager, proceeding without."); + } else { + const char *u, *jt; + uint32_t id; + + r = sd_bus_message_read(reply, "uosos", &id, &path, &u, NULL, &jt); + if (r < 0) + return bus_log_parse_error(r); + + log_info("Enqueued anchor job %" PRIu32 " %s/%s.", id, u, jt); + + r = sd_bus_message_enter_container(reply, 'a', "(uosos)"); + if (r < 0) + return bus_log_parse_error(r); + for (;;) { + r = sd_bus_message_read(reply, "(uosos)", &id, NULL, &u, NULL, &jt); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + log_info("Enqueued auxiliary job %" PRIu32 " %s/%s.", id, u, jt); + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + done = true; + } + } + + if (!done) { + r = bus_call_method(bus, bus_systemd_mgr, method, error, &reply, "ss", name, mode); + if (r < 0) + goto fail; + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + } + + if (need_daemon_reload(bus, name) > 0) + warn_unit_file_changed(name); + + if (w) { + log_debug("Adding %s to the set", path); + r = bus_wait_for_jobs_add(w, path); + if (r < 0) + return log_error_errno(r, "Failed to watch job for %s: %m", name); + } + + if (wu) { + r = bus_wait_for_units_add_unit(wu, name, 
BUS_WAIT_FOR_INACTIVE|BUS_WAIT_NO_JOB, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to watch unit %s: %m", name); + } + + return 0; + +fail: + /* There's always a fallback possible for legacy actions. */ + if (arg_action != ACTION_SYSTEMCTL) + return r; + + if (sd_bus_error_has_name(error, BUS_ERROR_UNIT_MASKED) && + STR_IN_SET(method, "TryRestartUnit", "ReloadOrTryRestartUnit")) { + /* Ignore masked unit if try-* is requested */ + + log_debug_errno(r, "Failed to %s %s, ignoring: %s", job_type, name, bus_error_message(error, r)); + return 0; + } + + log_error_errno(r, "Failed to %s %s: %s", job_type, name, bus_error_message(error, r)); + + if (!sd_bus_error_has_names(error, BUS_ERROR_NO_SUCH_UNIT, + BUS_ERROR_UNIT_MASKED, + BUS_ERROR_JOB_TYPE_NOT_APPLICABLE)) + log_error("See %s logs and 'systemctl%s status%s %s' for details.", + runtime_scope_to_string(arg_runtime_scope), + arg_runtime_scope == RUNTIME_SCOPE_SYSTEM ? "" : " --user", + name[0] == '-' ? " --" : "", + name); + + return r; +} + +static int enqueue_marked_jobs( + sd_bus *bus, + BusWaitForJobs *w) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + log_debug("%s dbus call org.freedesktop.systemd1.Manager EnqueueMarkedJobs()", + arg_dry_run ? 
"Would execute" : "Executing"); + + if (arg_dry_run) + return 0; + + r = bus_call_method(bus, bus_systemd_mgr, "EnqueueMarkedJobs", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to start jobs: %s", bus_error_message(&error, r)); + + _cleanup_strv_free_ char **paths = NULL; + r = sd_bus_message_read_strv(reply, &paths); + if (r < 0) + return bus_log_parse_error(r); + + if (w) + STRV_FOREACH(path, paths) { + log_debug("Adding %s to the set", *path); + r = bus_wait_for_jobs_add(w, *path); + if (r < 0) + return log_error_errno(r, "Failed to watch job %s: %m", *path); + } + + return 0; +} + +const struct action_metadata action_table[_ACTION_MAX] = { + [ACTION_HALT] = { SPECIAL_HALT_TARGET, "halt", "replace-irreversibly" }, + [ACTION_POWEROFF] = { SPECIAL_POWEROFF_TARGET, "poweroff", "replace-irreversibly" }, + [ACTION_REBOOT] = { SPECIAL_REBOOT_TARGET, "reboot", "replace-irreversibly" }, + [ACTION_KEXEC] = { SPECIAL_KEXEC_TARGET, "kexec", "replace-irreversibly" }, + [ACTION_SOFT_REBOOT] = { SPECIAL_SOFT_REBOOT_TARGET, "soft-reboot", "replace-irreversibly" }, + [ACTION_RUNLEVEL2] = { SPECIAL_MULTI_USER_TARGET, NULL, "isolate" }, + [ACTION_RUNLEVEL3] = { SPECIAL_MULTI_USER_TARGET, NULL, "isolate" }, + [ACTION_RUNLEVEL4] = { SPECIAL_MULTI_USER_TARGET, NULL, "isolate" }, + [ACTION_RUNLEVEL5] = { SPECIAL_GRAPHICAL_TARGET, NULL, "isolate" }, + [ACTION_RESCUE] = { SPECIAL_RESCUE_TARGET, "rescue", "isolate" }, + [ACTION_EMERGENCY] = { SPECIAL_EMERGENCY_TARGET, "emergency", "isolate" }, + [ACTION_DEFAULT] = { SPECIAL_DEFAULT_TARGET, "default", "isolate" }, + [ACTION_EXIT] = { SPECIAL_EXIT_TARGET, "exit", "replace-irreversibly" }, + [ACTION_SUSPEND] = { SPECIAL_SUSPEND_TARGET, "suspend", "replace-irreversibly" }, + [ACTION_HIBERNATE] = { SPECIAL_HIBERNATE_TARGET, "hibernate", "replace-irreversibly" }, + [ACTION_HYBRID_SLEEP] = { SPECIAL_HYBRID_SLEEP_TARGET, "hybrid-sleep", "replace-irreversibly" }, + [ACTION_SUSPEND_THEN_HIBERNATE] = { 
SPECIAL_SUSPEND_THEN_HIBERNATE_TARGET, "suspend-then-hibernate", "replace-irreversibly" }, +}; + +enum action verb_to_action(const char *verb) { + for (enum action i = 0; i < _ACTION_MAX; i++) + if (streq_ptr(action_table[i].verb, verb)) + return i; + + return _ACTION_INVALID; +} + +static const char** make_extra_args(const char *extra_args[static 4]) { + size_t n = 0; + + assert(extra_args); + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + extra_args[n++] = "--user"; + + if (arg_transport == BUS_TRANSPORT_REMOTE) { + extra_args[n++] = "-H"; + extra_args[n++] = arg_host; + } else if (arg_transport == BUS_TRANSPORT_MACHINE) { + extra_args[n++] = "-M"; + extra_args[n++] = arg_host; + } else + assert(arg_transport == BUS_TRANSPORT_LOCAL); + + extra_args[n] = NULL; + return extra_args; +} + +int verb_start(int argc, char *argv[], void *userdata) { + _cleanup_(bus_wait_for_units_freep) BusWaitForUnits *wu = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + const char *method, *job_type, *mode, *one_name, *suffix = NULL; + _cleanup_free_ char **stopped_units = NULL; /* Do not use _cleanup_strv_free_ */ + _cleanup_strv_free_ char **names = NULL; + int r, ret = EXIT_SUCCESS; + sd_bus *bus; + + if (arg_wait && !STR_IN_SET(argv[0], "start", "restart")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--wait may only be used with the 'start' or 'restart' commands."); + + /* We cannot do sender tracking on the private bus, so we need the full one for RefUnit to implement + * --wait */ + r = acquire_bus(arg_wait ? 
BUS_FULL : BUS_MANAGER, &bus); + if (r < 0) + return r; + + ask_password_agent_open_maybe(); + polkit_agent_open_maybe(); + + if (arg_action == ACTION_SYSTEMCTL) { + enum action action; + + action = verb_to_action(argv[0]); + + if (action != _ACTION_INVALID) { + /* A command in style "systemctl reboot", "systemctl poweroff", … */ + method = "StartUnit"; + job_type = "start"; + mode = action_table[action].mode; + one_name = action_table[action].target; + } else { + if (streq(argv[0], "isolate")) { + /* A "systemctl isolate …" command */ + method = "StartUnit"; + job_type = "start"; + mode = "isolate"; + suffix = ".target"; + } else if (!arg_marked) { + /* A command in style of "systemctl start …", "systemctl stop …" and so on */ + method = verb_to_method(argv[0]); + job_type = verb_to_job_type(argv[0]); + mode = arg_job_mode(); + } else + method = job_type = mode = NULL; + + one_name = NULL; + } + } else { + /* A SysV legacy command such as "halt", "reboot", "poweroff", … */ + assert(arg_action >= 0 && arg_action < _ACTION_MAX); + assert(action_table[arg_action].target); + assert(action_table[arg_action].mode); + + method = "StartUnit"; + job_type = "start"; + mode = action_table[arg_action].mode; + one_name = action_table[arg_action].target; + } + + if (one_name) { + names = strv_new(one_name); + if (!names) + return log_oom(); + } else if (!arg_marked) { + bool expanded; + + r = expand_unit_names(bus, strv_skip(argv, 1), suffix, &names, &expanded); + if (r < 0) + return log_error_errno(r, "Failed to expand names: %m"); + + if (!arg_all && expanded && streq(job_type, "start") && !arg_quiet) { + log_warning("Warning: %ssystemctl start called with a glob pattern.%s", + ansi_highlight_red(), + ansi_normal()); + log_notice("Hint: unit globs expand to loaded units, so start will usually have no effect.\n" + " Passing --all will also load units which are pulled in by other units.\n" + " See systemctl(1) for more details."); + } + } + + if (!arg_no_block) { + r = 
bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch jobs: %m"); + } + + if (arg_wait) { + r = bus_call_method_async(bus, NULL, bus_systemd_mgr, "Subscribe", NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to enable subscription: %m"); + + r = bus_wait_for_units_new(bus, &wu); + if (r < 0) + return log_error_errno(r, "Failed to allocate unit watch context: %m"); + } + + if (arg_marked) + ret = enqueue_marked_jobs(bus, w); + else + STRV_FOREACH(name, names) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + r = start_unit_one(bus, method, job_type, *name, mode, &error, w, wu); + if (ret == EXIT_SUCCESS && r < 0) + ret = translate_bus_error_to_exit_status(r, &error); + + if (r >= 0 && streq(method, "StopUnit")) { + r = strv_push(&stopped_units, *name); + if (r < 0) + return log_oom(); + } + } + + if (!arg_no_block) { + const char* extra_args[4]; + + r = bus_wait_for_jobs(w, arg_quiet, make_extra_args(extra_args)); + if (r < 0) + return r; + + /* When stopping units, warn if they can still be triggered by + * another active unit (socket, path, timer) */ + if (!arg_quiet && !arg_no_warn) + STRV_FOREACH(unit, stopped_units) + warn_triggering_units(bus, *unit, "Stopping", /* ignore_masked = */ true); + } + + if (arg_wait) { + r = bus_wait_for_units_run(wu); + if (r < 0) + return log_error_errno(r, "Failed to wait for units: %m"); + if (r == BUS_WAIT_FAILURE && ret == EXIT_SUCCESS) + ret = EXIT_FAILURE; + } + + return ret; +} diff --git a/src/systemctl/systemctl-start-unit.h b/src/systemctl/systemctl-start-unit.h new file mode 100644 index 0000000..2865016 --- /dev/null +++ b/src/systemctl/systemctl-start-unit.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "systemctl.h" + +int verb_start(int argc, char *argv[], void *userdata); + +struct action_metadata { + const char *target; + const char *verb; + const char *mode; +}; + +extern const struct 
action_metadata action_table[_ACTION_MAX]; + +enum action verb_to_action(const char *verb); diff --git a/src/systemctl/systemctl-switch-root.c b/src/systemctl/systemctl-switch-root.c new file mode 100644 index 0000000..ae4a1a7 --- /dev/null +++ b/src/systemctl/systemctl-switch-root.c @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "argv-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "chase.h" +#include "fd-util.h" +#include "initrd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "signal-util.h" +#include "stat-util.h" +#include "systemctl.h" +#include "systemctl-switch-root.h" +#include "systemctl-util.h" + +static int same_file_in_root( + const char *root, + const char *a, + const char *b) { + + struct stat sta, stb; + int r; + + r = chase_and_stat(a, root, CHASE_PREFIX_ROOT, NULL, &sta); + if (r < 0) + return r; + + r = chase_and_stat(b, root, CHASE_PREFIX_ROOT, NULL, &stb); + if (r < 0) + return r; + + return stat_inode_same(&sta, &stb); +} + +int verb_switch_root(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *cmdline_init = NULL; + const char *root, *init; + sd_bus *bus; + int r; + + if (arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot switch root remotely."); + + if (argc > 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many arguments."); + + if (argc >= 2) { + root = argv[1]; + + if (!path_is_valid(root)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid root path: %s", root); + + if (!path_is_absolute(root)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root path is not absolute: %s", root); + + r = path_is_root(root); + if (r < 0) + return log_error_errno(r, "Failed to check if switch-root directory '%s' is current root: %m", root); + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot 
switch to current root directory: %s", root); + } else + root = "/sysroot"; + + if (!in_initrd()) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Not in initrd, refusing switch-root operation."); + + if (argc >= 3) + init = argv[2]; + else { + r = proc_cmdline_get_key("init", 0, &cmdline_init); + if (r < 0) + log_debug_errno(r, "Failed to parse /proc/cmdline: %m"); + + init = cmdline_init; + } + + init = empty_to_null(init); + if (init) { + if (!path_is_valid(init)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid path to init binary: %s", init); + if (!path_is_absolute(init)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path to init binary is not absolute: %s", init); + + /* If the passed init is actually the same as the systemd binary, then let's suppress it. */ + if (same_file_in_root(root, SYSTEMD_BINARY_PATH, init) > 0) + init = NULL; + } + + /* Instruct PID1 to exclude us from its killing spree applied during the transition. Otherwise we + * would exit with a failure status even though the switch to the new root has succeeded. */ + assert(saved_argv); + assert(saved_argv[0]); + saved_argv[0][0] = '@'; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + /* If we are slow to exit after the root switch, the new systemd instance will send us a signal to + * terminate. Just ignore it and exit normally. This way the unit does not end up as failed. 
*/ + r = ignore_signals(SIGTERM); + if (r < 0) + log_warning_errno(r, "Failed to change disposition of SIGTERM to ignore: %m"); + + log_debug("Switching root - root: %s; init: %s", root, strna(init)); + + r = bus_call_method(bus, bus_systemd_mgr, "SwitchRoot", &error, NULL, "ss", root, init); + if (r < 0) { + (void) default_signals(SIGTERM); + + return log_error_errno(r, "Failed to switch root: %s", bus_error_message(&error, r)); + } + + return 0; +} diff --git a/src/systemctl/systemctl-switch-root.h b/src/systemctl/systemctl-switch-root.h new file mode 100644 index 0000000..e9ba12b --- /dev/null +++ b/src/systemctl/systemctl-switch-root.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_switch_root(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-sysv-compat.c b/src/systemctl/systemctl-sysv-compat.c new file mode 100644 index 0000000..2aa1ec6 --- /dev/null +++ b/src/systemctl/systemctl-sysv-compat.c @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "env-util.h" +#include "fd-util.h" +#include "initreq.h" +#include "install.h" +#include "io-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "strv.h" +#include "systemctl-sysv-compat.h" +#include "systemctl.h" + +int talk_initctl(char rl) { +#if HAVE_SYSV_COMPAT + _cleanup_close_ int fd = -EBADF; + const char *path; + int r; + + /* Try to switch to the specified SysV runlevel. Returns == 0 if the operation does not apply on this + * system, and > 0 on success. 
*/ + + if (rl == 0) + return 0; + + FOREACH_STRING(_path, "/run/initctl", "/dev/initctl") { + path = _path; + + fd = open(path, O_WRONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd < 0 && errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", path); + if (fd >= 0) + break; + } + if (fd < 0) + return 0; + + struct init_request request = { + .magic = INIT_MAGIC, + .sleeptime = 0, + .cmd = INIT_CMD_RUNLVL, + .runlevel = rl, + }; + + r = loop_write(fd, &request, sizeof(request)); + if (r < 0) + return log_error_errno(r, "Failed to write to %s: %m", path); + + return 1; +#else + return -EOPNOTSUPP; +#endif +} + +int parse_shutdown_time_spec(const char *t, usec_t *ret) { + assert(t); + assert(ret); + + if (streq(t, "now")) + *ret = 0; + else if (!strchr(t, ':')) { + uint64_t u; + + if (safe_atou64(t, &u) < 0) + return -EINVAL; + + *ret = now(CLOCK_REALTIME) + USEC_PER_MINUTE * u; + } else { + char *e = NULL; + long hour, minute; + struct tm tm = {}; + time_t s; + usec_t n; + + errno = 0; + hour = strtol(t, &e, 10); + if (errno > 0 || *e != ':' || hour < 0 || hour > 23) + return -EINVAL; + + minute = strtol(e+1, &e, 10); + if (errno > 0 || *e != 0 || minute < 0 || minute > 59) + return -EINVAL; + + n = now(CLOCK_REALTIME); + s = (time_t) (n / USEC_PER_SEC); + + assert_se(localtime_r(&s, &tm)); + + tm.tm_hour = (int) hour; + tm.tm_min = (int) minute; + tm.tm_sec = 0; + + s = mktime(&tm); + assert(s >= 0); + + *ret = (usec_t) s * USEC_PER_SEC; + + while (*ret <= n) + *ret += USEC_PER_DAY; + } + + return 0; +} + +int enable_sysv_units(const char *verb, char **args) { + int r = 0; + +#if HAVE_SYSV_COMPAT + _cleanup_(lookup_paths_free) LookupPaths paths = {}; + unsigned f = 0; + SysVUnitEnableState enable_state = SYSV_UNIT_NOT_FOUND; + + /* Processes all SysV units, and reshuffles the array so that afterwards only the native units remain */ + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + return 0; + + if (getenv_bool("SYSTEMCTL_SKIP_SYSV") > 0) + return 0; 
+ + if (!STR_IN_SET(verb, + "enable", + "disable", + "is-enabled")) + return 0; + + r = lookup_paths_init_or_warn(&paths, arg_runtime_scope, LOOKUP_PATHS_EXCLUDE_GENERATED, arg_root); + if (r < 0) + return r; + + r = 0; + while (args[f]) { + + const char *argv[] = { + LIBEXECDIR "/systemd-sysv-install", + NULL, /* --root= */ + NULL, /* verb */ + NULL, /* service */ + NULL, + }; + + _cleanup_free_ char *p = NULL, *q = NULL, *l = NULL, *v = NULL; + bool found_native = false, found_sysv; + const char *name; + unsigned c = 1; + pid_t pid; + int j; + + name = args[f++]; + + if (!endswith(name, ".service")) + continue; + + if (path_is_absolute(name)) + continue; + + j = unit_file_exists(arg_runtime_scope, &paths, name); + if (j < 0 && !IN_SET(j, -ELOOP, -ERFKILL, -EADDRNOTAVAIL)) + return log_error_errno(j, "Failed to look up unit file state: %m"); + found_native = j != 0; + + /* If we have both a native unit and a SysV script, enable/disable them both (below); for + * is-enabled, prefer the native unit */ + if (found_native && streq(verb, "is-enabled")) + continue; + + p = path_join(arg_root, SYSTEM_SYSVINIT_PATH, name); + if (!p) + return log_oom(); + + p[strlen(p) - STRLEN(".service")] = 0; + found_sysv = access(p, F_OK) >= 0; + if (!found_sysv) + continue; + + if (!arg_quiet) { + if (found_native) + log_info("Synchronizing state of %s with SysV service script with %s.", name, argv[0]); + else + log_info("%s is not a native service, redirecting to systemd-sysv-install.", name); + } + + if (!isempty(arg_root)) { + q = strjoin("--root=", arg_root); + if (!q) + return log_oom(); + + argv[c++] = q; + } + + /* Let's copy the verb, since it's still pointing directly into the original argv[] array we + * got passed, but safe_fork() is likely going to rewrite that for the new child */ + v = strdup(verb); + if (!v) + return log_oom(); + + argv[c++] = v; + argv[c++] = basename(p); + argv[c] = NULL; + + l = strv_join((char**)argv, " "); + if (!l) + return log_oom(); + + if 
(!arg_quiet) + log_info("Executing: %s", l); + + j = safe_fork("(sysv-install)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (j < 0) + return j; + if (j == 0) { + /* Child */ + execv(argv[0], (char**) argv); + log_error_errno(errno, "Failed to execute %s: %m", argv[0]); + _exit(EXIT_FAILURE); + } + + j = wait_for_terminate_and_check("sysv-install", pid, WAIT_LOG_ABNORMAL); + if (j < 0) + return j; + if (streq(verb, "is-enabled")) { + if (j == EXIT_SUCCESS) { + if (!arg_quiet) + puts("enabled"); + enable_state = SYSV_UNIT_ENABLED; + } else { + if (!arg_quiet) + puts("disabled"); + if (enable_state != SYSV_UNIT_ENABLED) + enable_state = SYSV_UNIT_DISABLED; + } + + } else if (j != EXIT_SUCCESS) + return -EBADE; /* We don't warn here, under the assumption the script already showed an explanation */ + + if (found_native) + continue; + + /* Remove this entry, so that we don't try enabling it as native unit */ + assert(f > 0); + f--; + assert(args[f] == name); + strv_remove(args + f, name); + } + + if (streq(verb, "is-enabled")) + return enable_state; +#endif + return r; +} + +int action_to_runlevel(void) { +#if HAVE_SYSV_COMPAT + static const char table[_ACTION_MAX] = { + [ACTION_HALT] = '0', + [ACTION_POWEROFF] = '0', + [ACTION_REBOOT] = '6', + [ACTION_RUNLEVEL2] = '2', + [ACTION_RUNLEVEL3] = '3', + [ACTION_RUNLEVEL4] = '4', + [ACTION_RUNLEVEL5] = '5', + [ACTION_RESCUE] = '1' + }; + + assert(arg_action >= 0 && arg_action < _ACTION_MAX); + return table[arg_action]; +#else + return -EOPNOTSUPP; +#endif +} diff --git a/src/systemctl/systemctl-sysv-compat.h b/src/systemctl/systemctl-sysv-compat.h new file mode 100644 index 0000000..05db6ec --- /dev/null +++ b/src/systemctl/systemctl-sysv-compat.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "time-util.h" + +int talk_initctl(char runlevel); + +int parse_shutdown_time_spec(const char *t, usec_t *ret); + +/* The init script exit codes 
for the LSB 'status' verb. (This is different from the 'start' verb, whose exit + codes are defined in exit-status.h.) + + 0 program is running or service is OK + 1 program is dead and /var/run pid file exists + 2 program is dead and /var/lock lock file exists + 3 program is not running + 4 program or service status is unknown + 5-99 reserved for future LSB use + 100-149 reserved for distribution use + 150-199 reserved for application use + 200-254 reserved + + https://refspecs.linuxbase.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html +*/ +enum { + EXIT_PROGRAM_RUNNING_OR_SERVICE_OK = 0, + EXIT_PROGRAM_DEAD_AND_PID_EXISTS = 1, + EXIT_PROGRAM_DEAD_AND_LOCK_FILE_EXISTS = 2, + EXIT_PROGRAM_NOT_RUNNING = 3, + EXIT_PROGRAM_OR_SERVICES_STATUS_UNKNOWN = 4, +}; + +typedef enum SysVUnitEnableState { + SYSV_UNIT_NOT_FOUND = 0, + SYSV_UNIT_DISABLED, + SYSV_UNIT_ENABLED, +} SysVUnitEnableState; + +int enable_sysv_units(const char *verb, char **args); + +int action_to_runlevel(void) _pure_; diff --git a/src/systemctl/systemctl-trivial-method.c b/src/systemctl/systemctl-trivial-method.c new file mode 100644 index 0000000..02a2912 --- /dev/null +++ b/src/systemctl/systemctl-trivial-method.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl-trivial-method.h" +#include "systemctl-util.h" +#include "systemctl.h" + +/* A generic implementation for cases we just need to invoke a simple method call on the Manager object. */ + +int verb_trivial_method(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + const char *method; + sd_bus *bus; + int r; + + if (arg_dry_run) + return 0; + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + polkit_agent_open_maybe(); + + method = + streq(argv[0], "clear-jobs") || + streq(argv[0], "cancel") ? "ClearJobs" : + streq(argv[0], "reset-failed") ? 
"ResetFailed" : + streq(argv[0], "halt") ? "Halt" : + streq(argv[0], "reboot") ? "Reboot" : + streq(argv[0], "kexec") ? "KExec" : + streq(argv[0], "soft-reboot") ? "SoftReboot" : + streq(argv[0], "exit") ? "Exit" : + /* poweroff */ "PowerOff"; + + r = bus_call_method(bus, bus_systemd_mgr, method, &error, NULL, NULL); + if (r < 0 && arg_action == ACTION_SYSTEMCTL) + return log_error_errno(r, "Failed to execute operation: %s", bus_error_message(&error, r)); + + /* Note that for the legacy commands (i.e. those with action != ACTION_SYSTEMCTL) we support + * fallbacks to the old ways of doing things, hence don't log any error in that case here. */ + + return r < 0 ? r : 0; +} diff --git a/src/systemctl/systemctl-trivial-method.h b/src/systemctl/systemctl-trivial-method.h new file mode 100644 index 0000000..d36b480 --- /dev/null +++ b/src/systemctl/systemctl-trivial-method.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_trivial_method(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl-util.c b/src/systemctl/systemctl-util.c new file mode 100644 index 0000000..2498725 --- /dev/null +++ b/src/systemctl/systemctl-util.c @@ -0,0 +1,996 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-bus.h" +#include "sd-daemon.h" + +#include "bus-common-errors.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-unit-util.h" +#include "chase.h" +#include "dropin.h" +#include "env-util.h" +#include "exit-status.h" +#include "fs-util.h" +#include "glob-util.h" +#include "macro.h" +#include "path-util.h" +#include "reboot-util.h" +#include "set.h" +#include "spawn-ask-password-agent.h" +#include "spawn-polkit-agent.h" +#include "stat-util.h" +#include "systemctl-util.h" +#include "systemctl.h" +#include "terminal-util.h" +#include "verbs.h" + +static sd_bus *buses[_BUS_FOCUS_MAX] = {}; + +int acquire_bus(BusFocus focus, sd_bus **ret) { + int r; + + 
assert(focus < _BUS_FOCUS_MAX); + assert(ret); + + if (!IN_SET(arg_runtime_scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--global is not supported for this operation."); + + /* We only go directly to the manager, if we are using a local transport */ + if (arg_transport != BUS_TRANSPORT_LOCAL) + focus = BUS_FULL; + + if (getenv_bool("SYSTEMCTL_FORCE_BUS") > 0) + focus = BUS_FULL; + + if (!buses[focus]) { + if (focus == BUS_MANAGER) + r = bus_connect_transport_systemd(arg_transport, arg_host, arg_runtime_scope, &buses[focus]); + else + r = bus_connect_transport(arg_transport, arg_host, arg_runtime_scope, &buses[focus]); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + (void) sd_bus_set_allow_interactive_authorization(buses[focus], arg_ask_password); + } + + *ret = buses[focus]; + return 0; +} + +void release_busses(void) { + for (BusFocus w = 0; w < _BUS_FOCUS_MAX; w++) + buses[w] = sd_bus_flush_close_unref(buses[w]); +} + +void ask_password_agent_open_maybe(void) { + /* Open the password agent as a child process if necessary */ + + if (arg_dry_run) + return; + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + return; + + ask_password_agent_open_if_enabled(arg_transport, arg_ask_password); +} + +void polkit_agent_open_maybe(void) { + /* Open the polkit agent as a child process if necessary */ + + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + return; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); +} + +int translate_bus_error_to_exit_status(int r, const sd_bus_error *error) { + assert(error); + + if (!sd_bus_error_is_set(error)) + return r; + + if (sd_bus_error_has_names(error, SD_BUS_ERROR_ACCESS_DENIED, + BUS_ERROR_ONLY_BY_DEPENDENCY, + BUS_ERROR_NO_ISOLATION, + BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE)) + return EXIT_NOPERMISSION; + + if (sd_bus_error_has_name(error, BUS_ERROR_NO_SUCH_UNIT)) + return EXIT_NOTINSTALLED; + + if (sd_bus_error_has_names(error, 
BUS_ERROR_JOB_TYPE_NOT_APPLICABLE, + SD_BUS_ERROR_NOT_SUPPORTED)) + return EXIT_NOTIMPLEMENTED; + + if (sd_bus_error_has_name(error, BUS_ERROR_LOAD_FAILED)) + return EXIT_NOTCONFIGURED; + + if (r != 0) + return r; + + return EXIT_FAILURE; +} + +int get_state_one_unit(sd_bus *bus, const char *unit, UnitActiveState *ret_active_state) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *buf = NULL, *dbus_path = NULL; + UnitActiveState state; + int r; + + assert(bus); + assert(unit); + assert(ret_active_state); + + dbus_path = unit_dbus_path_from_name(unit); + if (!dbus_path) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Unit", + "ActiveState", + &error, + &buf); + if (r < 0) + return log_error_errno(r, "Failed to retrieve unit state: %s", bus_error_message(&error, r)); + + state = unit_active_state_from_string(buf); + if (state < 0) + return log_error_errno(state, "Invalid unit state '%s' for: %s", buf, unit); + + *ret_active_state = state; + return 0; +} + +int get_sub_state_one_unit(sd_bus *bus, const char *unit, char **ret_sub_state) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *sub_state = NULL, *dbus_path = NULL; + int r; + + assert(bus); + assert(unit); + assert(ret_sub_state); + + dbus_path = unit_dbus_path_from_name(unit); + if (!dbus_path) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Unit", + "SubState", + &error, + &sub_state); + if (r < 0) + return log_error_errno(r, "Failed to retrieve unit sub state: %s", bus_error_message(&error, r)); + + *ret_sub_state = TAKE_PTR(sub_state); + return 0; +} + +int get_unit_list( + sd_bus *bus, + const char *machine, + char **patterns, + UnitInfo **unit_infos, + int c, + sd_bus_message **ret_reply) { + + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = 
NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + bool fallback = false; + + assert(bus); + assert(unit_infos); + assert(ret_reply); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "ListUnitsByPatterns"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, arg_states); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, patterns); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0 && (sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_METHOD, + SD_BUS_ERROR_ACCESS_DENIED))) { + /* Fallback to legacy ListUnitsFiltered method */ + fallback = true; + log_debug_errno(r, "Failed to list units: %s Falling back to ListUnitsFiltered method.", bus_error_message(&error, r)); + m = sd_bus_message_unref(m); + sd_bus_error_free(&error); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "ListUnitsFiltered"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, arg_states); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + } + if (r < 0) + return log_error_errno(r, "Failed to list units: %s", bus_error_message(&error, r)); + + r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); + if (r < 0) + return bus_log_parse_error(r); + + for (;;) { + UnitInfo u; + + r = bus_parse_unit_info(reply, &u); + if (r < 0) + return bus_log_parse_error(r); + if (r == 0) + break; + + u.machine = machine; + + if (!output_show_unit(&u, fallback ? 
patterns : NULL)) + continue; + + if (!GREEDY_REALLOC(*unit_infos, c+1)) + return log_oom(); + + (*unit_infos)[c++] = u; + } + + r = sd_bus_message_exit_container(reply); + if (r < 0) + return bus_log_parse_error(r); + + *ret_reply = TAKE_PTR(reply); + return c; +} + +int expand_unit_names(sd_bus *bus, char **names, const char* suffix, char ***ret, bool *ret_expanded) { + _cleanup_strv_free_ char **mangled = NULL, **globs = NULL; + int r; + + assert(bus); + assert(ret); + + STRV_FOREACH(name, names) { + UnitNameMangle options = UNIT_NAME_MANGLE_GLOB | (arg_quiet ? 0 : UNIT_NAME_MANGLE_WARN); + char *t; + + r = unit_name_mangle_with_suffix(*name, NULL, options, suffix ?: ".service", &t); + if (r < 0) + return log_error_errno(r, "Failed to mangle name: %m"); + + if (string_is_glob(t)) + r = strv_consume(&globs, t); + else + r = strv_consume(&mangled, t); + if (r < 0) + return log_oom(); + } + + /* Query the manager only if any of the names are a glob, since this is fairly expensive */ + bool expanded = !strv_isempty(globs); + if (expanded) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ UnitInfo *unit_infos = NULL; + size_t n; + + r = get_unit_list(bus, NULL, globs, &unit_infos, 0, &reply); + if (r < 0) + return r; + + n = strv_length(mangled); + + for (int i = 0; i < r; i++) { + if (!GREEDY_REALLOC(mangled, n+2)) + return log_oom(); + + mangled[n] = strdup(unit_infos[i].id); + if (!mangled[n]) + return log_oom(); + + mangled[++n] = NULL; + } + } + + if (ret_expanded) + *ret_expanded = expanded; + + *ret = TAKE_PTR(mangled); + return 0; +} + +int get_active_triggering_units(sd_bus *bus, const char *unit, bool ignore_masked, char ***ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_strv_free_ char **triggered_by = NULL, **active = NULL; + _cleanup_free_ char *name = NULL, *dbus_path = NULL; + int r; + + assert(bus); + assert(unit); + assert(ret); + + r = unit_name_mangle(unit, 0, &name); + if 
(r < 0) + return r; + + if (ignore_masked) { + r = unit_is_masked(bus, name); + if (r < 0) + return r; + if (r > 0) { + *ret = NULL; + return 0; + } + } + + dbus_path = unit_dbus_path_from_name(name); + if (!dbus_path) + return -ENOMEM; + + r = sd_bus_get_property_strv( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Unit", + "TriggeredBy", + &error, + &triggered_by); + if (r < 0) + return log_debug_errno(r, "Failed to get TriggeredBy property of unit '%s': %s", + name, bus_error_message(&error, r)); + + STRV_FOREACH(i, triggered_by) { + UnitActiveState active_state; + + r = get_state_one_unit(bus, *i, &active_state); + if (r < 0) + return r; + + if (!IN_SET(active_state, UNIT_ACTIVE, UNIT_RELOADING)) + continue; + + r = strv_extend(&active, *i); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(active); + return 0; +} + +void warn_triggering_units(sd_bus *bus, const char *unit, const char *operation, bool ignore_masked) { + _cleanup_strv_free_ char **triggered_by = NULL; + _cleanup_free_ char *joined = NULL; + int r; + + assert(bus); + assert(unit); + assert(operation); + + r = get_active_triggering_units(bus, unit, ignore_masked, &triggered_by); + if (r < 0) { + log_warning_errno(r, + "Failed to get triggering units for '%s', ignoring: %m", unit); + return; + } + + if (strv_isempty(triggered_by)) + return; + + joined = strv_join(triggered_by, ", "); + if (!joined) + return (void) log_oom(); + + log_warning("%s '%s', but its triggering units are still active:\n" + "%s", + operation, unit, joined); +} + +int need_daemon_reload(sd_bus *bus, const char *unit) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *path; + int b, r; + + /* We ignore all errors here, since this is used to show a + * warning only */ + + /* We don't use unit_dbus_path_from_name() directly since we + * don't want to load the unit if it isn't loaded. 
*/ + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnit", NULL, &reply, "s", unit); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return r; + + r = sd_bus_get_property_trivial( + bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "NeedDaemonReload", + NULL, + 'b', &b); + if (r < 0) + return r; + + return b; +} + +void warn_unit_file_changed(const char *unit) { + assert(unit); + + log_warning("Warning: The unit file, source configuration file or drop-ins of %s changed on disk. Run 'systemctl%s daemon-reload' to reload units.", + unit, + arg_runtime_scope == RUNTIME_SCOPE_SYSTEM ? "" : " --user"); +} + +int unit_file_find_path(LookupPaths *lp, const char *unit_name, char **ret_unit_path) { + assert(lp); + assert(unit_name); + + STRV_FOREACH(p, lp->search_path) { + _cleanup_free_ char *path = NULL, *lpath = NULL; + int r; + + path = path_join(*p, unit_name); + if (!path) + return log_oom(); + + r = chase(path, arg_root, 0, &lpath, NULL); + if (r == -ENOENT) + continue; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to access path \"%s\": %m", path); + + if (ret_unit_path) + *ret_unit_path = TAKE_PTR(lpath); + + return 1; + } + + if (ret_unit_path) + *ret_unit_path = NULL; + + return 0; +} + +int unit_find_paths( + sd_bus *bus, + const char *unit_name, + LookupPaths *lp, + bool force_client_side, + Hashmap **cached_name_map, + Hashmap **cached_id_map, + char **ret_fragment_path, + char ***ret_dropin_paths) { + + _cleanup_strv_free_ char **dropins = NULL; + _cleanup_free_ char *path = NULL; + int r; + + /** + * Finds where the unit is defined on disk. Returns 0 if the unit is not found. Returns 1 if it is + * found, and sets: + * + * - the path to the unit in *ret_fragment_path, if it exists on disk, + * + * - and a strv of existing drop-ins in *ret_dropin_paths, if the arg is not NULL and any dropins + * were found. 
+ * + * Returns -ERFKILL if the unit is masked, and -EKEYREJECTED if the unit file could not be loaded for + * some reason (the latter only applies if we are going through the service manager). As special + * exception it won't log for these two error cases. + */ + + assert(unit_name); + assert(ret_fragment_path); + assert(lp); + + /* Go via the bus to acquire the path, unless we are explicitly told not to, or when the unit name is a template */ + if (!force_client_side && + !install_client_side() && + !unit_name_is_valid(unit_name, UNIT_NAME_TEMPLATE)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *load_state = NULL, *dbus_path = NULL; + + dbus_path = unit_dbus_path_from_name(unit_name); + if (!dbus_path) + return log_oom(); + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Unit", + "LoadState", + &error, + &load_state); + if (r < 0) + return log_error_errno(r, "Failed to get LoadState: %s", bus_error_message(&error, r)); + + if (streq(load_state, "masked")) + return -ERFKILL; /* special case: no logging */ + if (streq(load_state, "not-found")) { + r = 0; + goto finish; + } + if (!STR_IN_SET(load_state, "loaded", "bad-setting")) + return -EKEYREJECTED; /* special case: no logging */ + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Unit", + "FragmentPath", + &error, + &path); + if (r < 0) + return log_error_errno(r, "Failed to get FragmentPath: %s", bus_error_message(&error, r)); + + if (ret_dropin_paths) { + r = sd_bus_get_property_strv( + bus, + "org.freedesktop.systemd1", + dbus_path, + "org.freedesktop.systemd1.Unit", + "DropInPaths", + &error, + &dropins); + if (r < 0) + return log_error_errno(r, "Failed to get DropInPaths: %s", bus_error_message(&error, r)); + } + } else { + const char *_path; + _cleanup_set_free_free_ Set *names = NULL; + + if (!*cached_name_map) { + r = 
unit_file_build_name_map(lp, NULL, cached_id_map, cached_name_map, NULL); + if (r < 0) + return r; + } + + r = unit_file_find_fragment(*cached_id_map, *cached_name_map, unit_name, &_path, &names); + if (r < 0) + return log_error_errno(r, "Failed to find fragment for '%s': %m", unit_name); + + if (_path) { + path = strdup(_path); + if (!path) + return log_oom(); + } + + if (ret_dropin_paths) { + r = unit_file_find_dropin_paths(arg_root, lp->search_path, NULL, + ".d", ".conf", + NULL, names, &dropins); + if (r < 0) + return r; + } + } + + finish: + if (isempty(path)) { + *ret_fragment_path = NULL; + r = 0; + } else { + *ret_fragment_path = TAKE_PTR(path); + r = 1; + } + + if (ret_dropin_paths) { + if (!strv_isempty(dropins)) { + *ret_dropin_paths = TAKE_PTR(dropins); + r = 1; + } else + *ret_dropin_paths = NULL; + } + + if (r == 0 && !arg_force) + log_error("No files found for %s.", unit_name); + + return r; +} + +static int unit_find_template_path( + const char *unit_name, + LookupPaths *lp, + char **ret_fragment_path, + char **ret_template) { + + _cleanup_free_ char *t = NULL, *f = NULL; + int r; + + /* Returns 1 if a fragment was found, 0 if not found, negative on error. */ + + r = unit_file_find_path(lp, unit_name, &f); + if (r < 0) + return r; + if (r > 0) { + if (ret_fragment_path) + *ret_fragment_path = TAKE_PTR(f); + if (ret_template) + *ret_template = NULL; + return r; /* found a real unit */ + } + + r = unit_name_template(unit_name, &t); + if (r == -EINVAL) { + if (ret_fragment_path) + *ret_fragment_path = NULL; + if (ret_template) + *ret_template = NULL; + + return 0; /* not a template, does not exist */ + } + if (r < 0) + return log_error_errno(r, "Failed to determine template name: %m"); + + r = unit_file_find_path(lp, t, ret_fragment_path); + if (r < 0) + return r; + + if (ret_template) + *ret_template = r > 0 ? 
TAKE_PTR(t) : NULL; + + return r; +} + +int unit_is_masked(sd_bus *bus, const char *unit) { + _cleanup_free_ char *load_state = NULL; + int r; + + assert(bus); + assert(unit); + + if (unit_name_is_valid(unit, UNIT_NAME_TEMPLATE)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *state; + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnitFileState", &error, &reply, "s", unit); + if (r < 0) + return log_debug_errno(r, "Failed to get UnitFileState for '%s': %s", + unit, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "s", &state); + if (r < 0) + return bus_log_parse_error_debug(r); + + return STR_IN_SET(state, "masked", "masked-runtime"); + } + + r = unit_load_state(bus, unit, &load_state); + if (r < 0) + return r; + + return streq(load_state, "masked"); +} + +int unit_exists(LookupPaths *lp, const char *unit) { + typedef struct UnitStateInfo { + const char *load_state; + const char *active_state; + } UnitStateInfo; + + static const struct bus_properties_map property_map[] = { + { "LoadState", "s", NULL, offsetof(UnitStateInfo, load_state) }, + { "ActiveState", "s", NULL, offsetof(UnitStateInfo, active_state) }, + {}, + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_free_ char *path = NULL; + UnitStateInfo info = {}; + sd_bus *bus; + int r; + + if (unit_name_is_valid(unit, UNIT_NAME_TEMPLATE)) + return unit_find_template_path(unit, lp, NULL, NULL); + + path = unit_dbus_path_from_name(unit); + if (!path) + return log_oom(); + + r = acquire_bus(BUS_MANAGER, &bus); + if (r < 0) + return r; + + r = bus_map_all_properties(bus, "org.freedesktop.systemd1", path, property_map, 0, &error, &m, &info); + if (r < 0) + return log_error_errno(r, "Failed to get properties: %s", bus_error_message(&error, r)); + + return !streq_ptr(info.load_state, "not-found") || 
!streq_ptr(info.active_state, "inactive"); +} + + +int append_unit_dependencies(sd_bus *bus, char **names, char ***ret) { + _cleanup_strv_free_ char **with_deps = NULL; + + assert(bus); + assert(ret); + + STRV_FOREACH(name, names) { + _cleanup_strv_free_ char **deps = NULL; + + if (strv_extend(&with_deps, *name) < 0) + return log_oom(); + + (void) unit_get_dependencies(bus, *name, &deps); + + if (strv_extend_strv(&with_deps, deps, true) < 0) + return log_oom(); + } + + *ret = TAKE_PTR(with_deps); + + return 0; +} + +int maybe_extend_with_unit_dependencies(sd_bus *bus, char ***list) { + _cleanup_strv_free_ char **list_with_deps = NULL; + int r; + + assert(bus); + assert(list); + + if (!arg_with_dependencies) + return 0; + + r = append_unit_dependencies(bus, *list, &list_with_deps); + if (r < 0) + return log_error_errno(r, "Failed to append unit dependencies: %m"); + + strv_free(*list); + *list = TAKE_PTR(list_with_deps); + return 0; +} + +int unit_get_dependencies(sd_bus *bus, const char *name, char ***ret) { + _cleanup_strv_free_ char **deps = NULL; + + static const struct bus_properties_map map[_DEPENDENCY_MAX][7] = { + [DEPENDENCY_FORWARD] = { + { "Requires", "as", NULL, 0 }, + { "Requisite", "as", NULL, 0 }, + { "Wants", "as", NULL, 0 }, + { "ConsistsOf", "as", NULL, 0 }, + { "BindsTo", "as", NULL, 0 }, + { "Upholds", "as", NULL, 0 }, + {} + }, + [DEPENDENCY_REVERSE] = { + { "RequiredBy", "as", NULL, 0 }, + { "RequisiteOf", "as", NULL, 0 }, + { "WantedBy", "as", NULL, 0 }, + { "PartOf", "as", NULL, 0 }, + { "BoundBy", "as", NULL, 0 }, + { "UpheldBy", "as", NULL, 0 }, + {} + }, + [DEPENDENCY_AFTER] = { + { "After", "as", NULL, 0 }, + {} + }, + [DEPENDENCY_BEFORE] = { + { "Before", "as", NULL, 0 }, + {} + }, + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *dbus_path = NULL; + int r; + + assert(bus); + assert(name); + assert(ret); + + dbus_path = unit_dbus_path_from_name(name); + if (!dbus_path) + return 
log_oom(); + + r = bus_map_all_properties(bus, + "org.freedesktop.systemd1", + dbus_path, + map[arg_dependency], + BUS_MAP_STRDUP, + &error, + NULL, + &deps); + if (r < 0) + return log_error_errno(r, "Failed to get properties of %s: %s", name, bus_error_message(&error, r)); + + strv_uniq(deps); /* Sometimes a unit might have multiple deps on the other unit, + * but we still want to show it just once. */ + *ret = TAKE_PTR(deps); + + return 0; +} + +const char* unit_type_suffix(const char *unit) { + const char *dot; + + dot = strrchr(unit, '.'); + if (!dot) + return ""; + + return dot + 1; +} + +bool output_show_unit(const UnitInfo *u, char **patterns) { + assert(u); + + if (!strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE)) + return false; + + if (arg_types && !strv_contains(arg_types, unit_type_suffix(u->id))) + return false; + + if (arg_all) + return true; + + /* Note that '--all' is not purely a state filter, but also a filter that hides units that "follow" + * other units (which is used for device units that appear under different names). */ + if (!isempty(u->following)) + return false; + + if (!strv_isempty(arg_states)) + return true; + + /* By default show all units except the ones in inactive state and with no pending job */ + if (u->job_id > 0) + return true; + + if (streq(u->active_state, "inactive")) + return false; + + return true; +} + +bool install_client_side(void) { + /* Decides when to execute enable/disable/... operations client-side rather than server-side. 
*/ + + if (running_in_chroot_or_offline()) + return true; + + if (sd_booted() <= 0) + return true; + + if (!isempty(arg_root)) + return true; + + if (arg_runtime_scope == RUNTIME_SCOPE_GLOBAL) + return true; + + /* Unsupported environment variable, mostly for debugging purposes */ + if (getenv_bool("SYSTEMCTL_INSTALL_CLIENT_SIDE") > 0) + return true; + + return false; +} + +int output_table(Table *table) { + int r; + + assert(table); + + if (OUTPUT_MODE_IS_JSON(arg_output)) + r = table_print_json(table, NULL, output_mode_to_json_format_flags(arg_output) | JSON_FORMAT_COLOR_AUTO); + else + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +bool show_preset_for_state(UnitFileState state) { + /* Don't show preset state in those unit file states, it'll only confuse users. */ + return !IN_SET(state, + UNIT_FILE_ALIAS, + UNIT_FILE_STATIC, + UNIT_FILE_GENERATED, + UNIT_FILE_TRANSIENT); +} + +UnitFileFlags unit_file_flags_from_args(void) { + return (arg_runtime ? UNIT_FILE_RUNTIME : 0) | + (arg_force ? UNIT_FILE_FORCE : 0); +} + +int mangle_names(const char *operation, char **original_names, char ***ret_mangled_names) { + _cleanup_strv_free_ char **l = NULL; + char **i; + int r; + + assert(ret_mangled_names); + + l = i = new(char*, strv_length(original_names) + 1); + if (!l) + return log_oom(); + + STRV_FOREACH(name, original_names) { + + /* When enabling units qualified path names are OK, too, hence allow them explicitly. */ + + if (is_path(*name)) + r = path_make_absolute_cwd(*name, i); + else + r = unit_name_mangle_with_suffix(*name, operation, + arg_quiet ? 
0 : UNIT_NAME_MANGLE_WARN, + ".service", i); + if (r < 0) { + *i = NULL; + return log_error_errno(r, "Failed to mangle unit name or path '%s': %m", *name); + } + + i++; + } + + *i = NULL; + *ret_mangled_names = TAKE_PTR(l); + + return 0; +} + +int halt_now(enum action a) { + /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need + * to be synced explicitly in advance. */ + if (!arg_no_sync && !arg_dry_run) + sync(); + + /* Make sure C-A-D is handled by the kernel from this point on... */ + if (!arg_dry_run) + (void) reboot(RB_ENABLE_CAD); + + switch (a) { + + case ACTION_HALT: + if (!arg_quiet) + log_info("Halting."); + if (arg_dry_run) + return 0; + (void) reboot(RB_HALT_SYSTEM); + return -errno; + + case ACTION_POWEROFF: + if (!arg_quiet) + log_info("Powering off."); + if (arg_dry_run) + return 0; + (void) reboot(RB_POWER_OFF); + return -errno; + + case ACTION_KEXEC: + case ACTION_REBOOT: + return reboot_with_parameter(REBOOT_FALLBACK | + (arg_quiet ? 0 : REBOOT_LOG) | + (arg_dry_run ? 
REBOOT_DRY_RUN : 0)); + + default: + assert_not_reached(); + } +} diff --git a/src/systemctl/systemctl-util.h b/src/systemctl/systemctl-util.h new file mode 100644 index 0000000..7bddef0 --- /dev/null +++ b/src/systemctl/systemctl-util.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" + +#include "bus-unit-util.h" +#include "format-table.h" +#include "systemctl.h" + +typedef enum BusFocus { + BUS_FULL, /* The full bus indicated via --system or --user */ + BUS_MANAGER, /* The manager itself, possibly directly, possibly via the bus */ + _BUS_FOCUS_MAX +} BusFocus; + +int acquire_bus(BusFocus focus, sd_bus **ret); +void release_busses(void); + +void ask_password_agent_open_maybe(void); +void polkit_agent_open_maybe(void); + +int translate_bus_error_to_exit_status(int r, const sd_bus_error *error); + +int get_state_one_unit(sd_bus *bus, const char *unit, UnitActiveState *ret_active_state); +int get_sub_state_one_unit(sd_bus *bus, const char *unit, char **ret_sub_state); +int get_unit_list(sd_bus *bus, const char *machine, char **patterns, UnitInfo **unit_infos, int c, sd_bus_message **ret_reply); +int expand_unit_names(sd_bus *bus, char **names, const char* suffix, char ***ret, bool *ret_expanded); + +int get_active_triggering_units(sd_bus *bus, const char *unit, bool ignore_masked, char ***ret); +void warn_triggering_units(sd_bus *bus, const char *unit, const char *operation, bool ignore_masked); + +int need_daemon_reload(sd_bus *bus, const char *unit); + +void warn_unit_file_changed(const char *unit); + +int append_unit_dependencies(sd_bus *bus, char **names, char ***ret); +int maybe_extend_with_unit_dependencies(sd_bus *bus, char ***list); + +int unit_file_find_path(LookupPaths *lp, const char *unit_name, char **ret_unit_path); +int unit_find_paths(sd_bus *bus, const char *unit_name, LookupPaths *lp, bool force_client_side, Hashmap **cached_id_map, Hashmap **cached_name_map, char **ret_fragment_path, char 
***ret_dropin_paths); + +int unit_is_masked(sd_bus *bus, const char *unit); +int unit_exists(LookupPaths *lp, const char *unit); + +int unit_get_dependencies(sd_bus *bus, const char *name, char ***ret); + +const char* unit_type_suffix(const char *unit); +bool output_show_unit(const UnitInfo *u, char **patterns); + +bool install_client_side(void); + +int output_table(Table *table); + +bool show_preset_for_state(UnitFileState state); + +int mangle_names(const char *operation, char **original_names, char ***ret_mangled_names); + +UnitFileFlags unit_file_flags_from_args(void); + +int halt_now(enum action a); diff --git a/src/systemctl/systemctl-whoami.c b/src/systemctl/systemctl-whoami.c new file mode 100644 index 0000000..4ee6592 --- /dev/null +++ b/src/systemctl/systemctl-whoami.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-error.h" +#include "bus-locator.h" +#include "systemctl.h" +#include "systemctl-util.h" +#include "systemctl-whoami.h" +#include "parse-util.h" + +static int lookup_pid(sd_bus *bus, pid_t pid) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_free_ char *unit = NULL; + const char *path; + int r; + + r = bus_call_method(bus, bus_systemd_mgr, "GetUnitByPID", &error, &reply, "u", (uint32_t) pid); + if (r < 0) + return log_error_errno(r, "Failed to get unit for ourselves: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + r = unit_name_from_dbus_path(path, &unit); + if (r < 0) + return log_error_errno(r, "Failed to extract unit name from D-Bus object path '%s': %m", path); + + printf("%s\n", unit); + return 0; +} + +int verb_whoami(int argc, char *argv[], void *userdata) { + sd_bus *bus; + int r; + + r = acquire_bus(BUS_FULL, &bus); + if (r < 0) + return r; + + char **pids = strv_skip(argv, 1); + + if (strv_isempty(pids)) { + + if 
(arg_transport != BUS_TRANSPORT_LOCAL) + return log_error_errno(SYNTHETIC_ERRNO(EREMOTE), "Refusing to look up local PID on remote host."); + + return lookup_pid(bus, 0); + } else { + int ret = 0; + + STRV_FOREACH(p, pids) { + pid_t pid; + + r = parse_pid(*p, &pid); + if (r < 0) { + log_error_errno(r, "Failed to parse PID: %s", *p); + if (ret >= 0) + ret = r; + continue; + } + + r = lookup_pid(bus, pid); + if (r < 0 && ret >= 0) + ret = r; + } + + return ret; + } +} diff --git a/src/systemctl/systemctl-whoami.h b/src/systemctl/systemctl-whoami.h new file mode 100644 index 0000000..abdd13b --- /dev/null +++ b/src/systemctl/systemctl-whoami.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +int verb_whoami(int argc, char *argv[], void *userdata); diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c new file mode 100644 index 0000000..dd6f6c9 --- /dev/null +++ b/src/systemctl/systemctl.c @@ -0,0 +1,1348 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-daemon.h" + +#include "build.h" +#include "bus-util.h" +#include "dissect-image.h" +#include "install.h" +#include "main-func.h" +#include "mount-util.h" +#include "output-mode.h" +#include "pager.h" +#include "parse-argument.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "reboot-util.h" +#include "rlimit-util.h" +#include "sigbus.h" +#include "signal-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "systemctl-add-dependency.h" +#include "systemctl-cancel-job.h" +#include "systemctl-clean-or-freeze.h" +#include "systemctl-compat-halt.h" +#include "systemctl-compat-runlevel.h" +#include "systemctl-compat-shutdown.h" +#include "systemctl-compat-telinit.h" +#include "systemctl-daemon-reload.h" +#include "systemctl-edit.h" +#include "systemctl-enable.h" +#include "systemctl-is-active.h" +#include "systemctl-is-enabled.h" +#include "systemctl-is-system-running.h" 
+#include "systemctl-kill.h" +#include "systemctl-list-dependencies.h" +#include "systemctl-list-jobs.h" +#include "systemctl-list-machines.h" +#include "systemctl-list-unit-files.h" +#include "systemctl-list-units.h" +#include "systemctl-log-setting.h" +#include "systemctl-logind.h" +#include "systemctl-mount.h" +#include "systemctl-preset-all.h" +#include "systemctl-reset-failed.h" +#include "systemctl-service-watchdogs.h" +#include "systemctl-set-default.h" +#include "systemctl-set-environment.h" +#include "systemctl-set-property.h" +#include "systemctl-show.h" +#include "systemctl-start-special.h" +#include "systemctl-start-unit.h" +#include "systemctl-switch-root.h" +#include "systemctl-sysv-compat.h" +#include "systemctl-trivial-method.h" +#include "systemctl-util.h" +#include "systemctl-whoami.h" +#include "systemctl.h" +#include "terminal-util.h" +#include "time-util.h" +#include "verbs.h" +#include "virt.h" + +char **arg_types = NULL; +char **arg_states = NULL; +char **arg_properties = NULL; +bool arg_all = false; +enum dependency arg_dependency = DEPENDENCY_FORWARD; +const char *_arg_job_mode = NULL; +RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; +bool arg_wait = false; +bool arg_no_block = false; +int arg_legend = -1; /* -1: true, unless --quiet is passed, 1: true */ +PagerFlags arg_pager_flags = 0; +bool arg_no_wtmp = false; +bool arg_no_sync = false; +bool arg_no_wall = false; +bool arg_no_reload = false; +BusPrintPropertyFlags arg_print_flags = 0; +bool arg_show_types = false; +int arg_check_inhibitors = -1; +bool arg_dry_run = false; +bool arg_quiet = false; +bool arg_no_warn = false; +bool arg_full = false; +bool arg_recursive = false; +bool arg_with_dependencies = false; +bool arg_show_transaction = false; +int arg_force = 0; +bool arg_ask_password = false; +bool arg_runtime = false; +UnitFilePresetMode arg_preset_mode = UNIT_FILE_PRESET_FULL; +char **arg_wall = NULL; +const char *arg_kill_whom = NULL; +int arg_signal = SIGTERM; +int 
arg_kill_value; +bool arg_kill_value_set = false; +char *arg_root = NULL; +char *arg_image = NULL; +usec_t arg_when = 0; +const char *arg_reboot_argument = NULL; +enum action arg_action = ACTION_SYSTEMCTL; +BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +const char *arg_host = NULL; +unsigned arg_lines = 10; +OutputMode arg_output = OUTPUT_SHORT; +bool arg_plain = false; +bool arg_firmware_setup = false; +usec_t arg_boot_loader_menu = USEC_INFINITY; +const char *arg_boot_loader_entry = NULL; +bool arg_now = false; +bool arg_jobs_before = false; +bool arg_jobs_after = false; +char **arg_clean_what = NULL; +TimestampStyle arg_timestamp_style = TIMESTAMP_PRETTY; +bool arg_read_only = false; +bool arg_mkdir = false; +bool arg_marked = false; +const char *arg_drop_in = NULL; +ImagePolicy *arg_image_policy = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_types, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_states, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_properties, strv_freep); +STATIC_DESTRUCTOR_REGISTER(_arg_job_mode, unsetp); +STATIC_DESTRUCTOR_REGISTER(arg_wall, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_kill_whom, unsetp); +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_reboot_argument, unsetp); +STATIC_DESTRUCTOR_REGISTER(arg_host, unsetp); +STATIC_DESTRUCTOR_REGISTER(arg_boot_loader_entry, unsetp); +STATIC_DESTRUCTOR_REGISTER(arg_clean_what, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_drop_in, unsetp); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +static int systemctl_help(void) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("systemctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND ...\n\n" + "%5$sQuery or send control commands to the system manager.%6$s\n" + "\n%3$sUnit Commands:%4$s\n" + " list-units [PATTERN...] 
List units currently in memory\n" + " list-automounts [PATTERN...] List automount units currently in memory,\n" + " ordered by path\n" + " list-paths [PATTERN...] List path units currently in memory,\n" + " ordered by path\n" + " list-sockets [PATTERN...] List socket units currently in memory,\n" + " ordered by address\n" + " list-timers [PATTERN...] List timer units currently in memory,\n" + " ordered by next elapse\n" + " is-active PATTERN... Check whether units are active\n" + " is-failed [PATTERN...] Check whether units are failed or\n" + " system is in degraded state\n" + " status [PATTERN...|PID...] Show runtime status of one or more units\n" + " show [PATTERN...|JOB...] Show properties of one or more\n" + " units/jobs or the manager\n" + " cat PATTERN... Show files and drop-ins of specified units\n" + " help PATTERN...|PID... Show manual for one or more units\n" + " list-dependencies [UNIT...] Recursively show units which are required\n" + " or wanted by the units or by which those\n" + " units are required or wanted\n" + " start UNIT... Start (activate) one or more units\n" + " stop UNIT... Stop (deactivate) one or more units\n" + " reload UNIT... Reload one or more units\n" + " restart UNIT... Start or restart one or more units\n" + " try-restart UNIT... Restart one or more units if active\n" + " reload-or-restart UNIT... Reload one or more units if possible,\n" + " otherwise start or restart\n" + " try-reload-or-restart UNIT... If active, reload one or more units,\n" + " if supported, otherwise restart\n" + " isolate UNIT Start one unit and stop all others\n" + " kill UNIT... Send signal to processes of a unit\n" + " clean UNIT... Clean runtime, cache, state, logs or\n" + " configuration of unit\n" + " freeze PATTERN... Freeze execution of unit processes\n" + " thaw PATTERN... Resume execution of a frozen unit\n" + " set-property UNIT PROPERTY=VALUE... 
Sets one or more properties of a unit\n" + " bind UNIT PATH [PATH] Bind-mount a path from the host into a\n" + " unit's namespace\n" + " mount-image UNIT PATH [PATH [OPTS]] Mount an image from the host into a\n" + " unit's namespace\n" + " service-log-level SERVICE [LEVEL] Get/set logging threshold for service\n" + " service-log-target SERVICE [TARGET] Get/set logging target for service\n" + " reset-failed [PATTERN...] Reset failed state for all, one, or more\n" + " units\n" + " whoami [PID...] Return unit caller or specified PIDs are\n" + " part of\n" + "\n%3$sUnit File Commands:%4$s\n" + " list-unit-files [PATTERN...] List installed unit files\n" + " enable [UNIT...|PATH...] Enable one or more unit files\n" + " disable UNIT... Disable one or more unit files\n" + " reenable UNIT... Reenable one or more unit files\n" + " preset UNIT... Enable/disable one or more unit files\n" + " based on preset configuration\n" + " preset-all Enable/disable all unit files based on\n" + " preset configuration\n" + " is-enabled UNIT... Check whether unit files are enabled\n" + " mask UNIT... Mask one or more units\n" + " unmask UNIT... Unmask one or more units\n" + " link PATH... Link one or more units files into\n" + " the search path\n" + " revert UNIT... Revert one or more unit files to vendor\n" + " version\n" + " add-wants TARGET UNIT... Add 'Wants' dependency for the target\n" + " on specified one or more units\n" + " add-requires TARGET UNIT... Add 'Requires' dependency for the target\n" + " on specified one or more units\n" + " edit UNIT... Edit one or more unit files\n" + " get-default Get the name of the default target\n" + " set-default TARGET Set the default target\n" + "\n%3$sMachine Commands:%4$s\n" + " list-machines [PATTERN...] List local containers and host\n" + "\n%3$sJob Commands:%4$s\n" + " list-jobs [PATTERN...] List jobs\n" + " cancel [JOB...] 
Cancel all, one, or more jobs\n" + "\n%3$sEnvironment Commands:%4$s\n" + " show-environment Dump environment\n" + " set-environment VARIABLE=VALUE... Set one or more environment variables\n" + " unset-environment VARIABLE... Unset one or more environment variables\n" + " import-environment VARIABLE... Import all or some environment variables\n" + "\n%3$sManager State Commands:%4$s\n" + " daemon-reload Reload systemd manager configuration\n" + " daemon-reexec Reexecute systemd manager\n" + " log-level [LEVEL] Get/set logging threshold for manager\n" + " log-target [TARGET] Get/set logging target for manager\n" + " service-watchdogs [BOOL] Get/set service watchdog state\n" + "\n%3$sSystem Commands:%4$s\n" + " is-system-running Check whether system is fully running\n" + " default Enter system default mode\n" + " rescue Enter system rescue mode\n" + " emergency Enter system emergency mode\n" + " halt Shut down and halt the system\n" + " poweroff Shut down and power-off the system\n" + " reboot Shut down and reboot the system\n" + " kexec Shut down and reboot the system with kexec\n" + " soft-reboot Shut down and reboot userspace\n" + " exit [EXIT_CODE] Request user instance or container exit\n" + " switch-root [ROOT [INIT]] Change to a different root file system\n" + " suspend Suspend the system\n" + " hibernate Hibernate the system\n" + " hybrid-sleep Hibernate and suspend the system\n" + " suspend-then-hibernate Suspend the system, wake after a period of\n" + " time, and hibernate" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --system Connect to system manager\n" + " --user Connect to user service manager\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on a local container\n" + " -t --type=TYPE List units of a particular type\n" + " --state=STATE List units with particular LOAD or SUB or ACTIVE state\n" + " --failed Shortcut for --state=failed\n" + " -p --property=NAME 
Show only properties by this name\n" + " -P NAME Equivalent to --value --property=NAME\n" + " -a --all Show all properties/all units currently in memory,\n" + " including dead/empty ones. To list all units installed\n" + " on the system, use 'list-unit-files' instead.\n" + " -l --full Don't ellipsize unit names on output\n" + " -r --recursive Show unit list of host and local containers\n" + " --reverse Show reverse dependencies with 'list-dependencies'\n" + " --with-dependencies Show unit dependencies with 'status', 'cat',\n" + " 'list-units', and 'list-unit-files'.\n" + " --job-mode=MODE Specify how to deal with already queued jobs, when\n" + " queueing a new job\n" + " -T --show-transaction When enqueuing a unit job, show full transaction\n" + " --show-types When showing sockets, explicitly show their type\n" + " --value When showing properties, only print the value\n" + " --check-inhibitors=MODE\n" + " Whether to check inhibitors before shutting down,\n" + " sleeping, or hibernating\n" + " -i Shortcut for --check-inhibitors=no\n" + " --kill-whom=WHOM Whom to send signal to\n" + " --kill-value=INT Signal value to enqueue\n" + " -s --signal=SIGNAL Which signal to send\n" + " --what=RESOURCES Which types of resources to remove\n" + " --now Start or stop unit after enabling or disabling it\n" + " --dry-run Only print what would be done\n" + " Currently supported by verbs: halt, poweroff, reboot,\n" + " kexec, soft-reboot, suspend, hibernate, \n" + " suspend-then-hibernate, hybrid-sleep, default,\n" + " rescue, emergency, and exit.\n" + " -q --quiet Suppress output\n" + " --no-warn Suppress several warnings shown by default\n" + " --wait For (re)start, wait until service stopped again\n" + " For is-system-running, wait until startup is completed\n" + " --no-block Do not wait until operation finished\n" + " --no-wall Don't send wall message before halt/power-off/reboot\n" + " --no-reload Don't reload daemon after en-/dis-abling unit files\n" + " --legend=BOOL 
Enable/disable the legend (column headers and hints)\n" + " --no-pager Do not pipe output into a pager\n" + " --no-ask-password Do not ask for system passwords\n" + " --global Edit/enable/disable/mask default user unit files\n" + " globally\n" + " --runtime Edit/enable/disable/mask unit files temporarily until\n" + " next reboot\n" + " -f --force When enabling unit files, override existing symlinks\n" + " When shutting down, execute action immediately\n" + " --preset-mode= Apply only enable, only disable, or all presets\n" + " --root=PATH Edit/enable/disable/mask unit files in the specified\n" + " root directory\n" + " --image=PATH Edit/enable/disable/mask unit files in the specified\n" + " disk image\n" + " --image-policy=POLICY\n" + " Specify disk image dissection policy\n" + " -n --lines=INTEGER Number of journal entries to show\n" + " -o --output=STRING Change journal output mode (short, short-precise,\n" + " short-iso, short-iso-precise, short-full,\n" + " short-monotonic, short-unix, short-delta,\n" + " verbose, export, json, json-pretty, json-sse, cat)\n" + " --firmware-setup Tell the firmware to show the setup menu on next boot\n" + " --boot-loader-menu=TIME\n" + " Boot into boot loader menu on next boot\n" + " --boot-loader-entry=NAME\n" + " Boot into a specific boot loader entry on next boot\n" + " --plain Print unit dependencies as a list instead of a tree\n" + " --timestamp=FORMAT Change format of printed timestamps (pretty, unix,\n" + " us, utc, us+utc)\n" + " --read-only Create read-only bind mount\n" + " --mkdir Create directory before mounting, if missing\n" + " --marked Restart/reload previously marked units\n" + " --drop-in=NAME Edit unit files using the specified drop-in file name\n" + " --when=TIME Schedule halt/power-off/reboot/kexec action after\n" + " a certain timestamp\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + 
+static void help_types(void) { + if (arg_legend != 0) + puts("Available unit types:"); + + DUMP_STRING_TABLE(unit_type, UnitType, _UNIT_TYPE_MAX); +} + +static void help_states(void) { + if (arg_legend != 0) + puts("Available unit load states:"); + DUMP_STRING_TABLE(unit_load_state, UnitLoadState, _UNIT_LOAD_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable unit active states:"); + DUMP_STRING_TABLE(unit_active_state, UnitActiveState, _UNIT_ACTIVE_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable unit file states:"); + DUMP_STRING_TABLE(unit_file_state, UnitFileState, _UNIT_FILE_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable automount unit substates:"); + DUMP_STRING_TABLE(automount_state, AutomountState, _AUTOMOUNT_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable device unit substates:"); + DUMP_STRING_TABLE(device_state, DeviceState, _DEVICE_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable mount unit substates:"); + DUMP_STRING_TABLE(mount_state, MountState, _MOUNT_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable path unit substates:"); + DUMP_STRING_TABLE(path_state, PathState, _PATH_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable scope unit substates:"); + DUMP_STRING_TABLE(scope_state, ScopeState, _SCOPE_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable service unit substates:"); + DUMP_STRING_TABLE(service_state, ServiceState, _SERVICE_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable slice unit substates:"); + DUMP_STRING_TABLE(slice_state, SliceState, _SLICE_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable socket unit substates:"); + DUMP_STRING_TABLE(socket_state, SocketState, _SOCKET_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable swap unit substates:"); + DUMP_STRING_TABLE(swap_state, SwapState, _SWAP_STATE_MAX); + + if (arg_legend != 0) + puts("\nAvailable target unit substates:"); + DUMP_STRING_TABLE(target_state, TargetState, _TARGET_STATE_MAX); + + if (arg_legend 
!= 0) + puts("\nAvailable timer unit substates:"); + DUMP_STRING_TABLE(timer_state, TimerState, _TIMER_STATE_MAX); +} + +static int systemctl_parse_argv(int argc, char *argv[]) { + enum { + ARG_FAIL = 0x100, /* compatibility only */ + ARG_REVERSE, + ARG_AFTER, + ARG_BEFORE, + ARG_CHECK_INHIBITORS, + ARG_DRY_RUN, + ARG_SHOW_TYPES, + ARG_IRREVERSIBLE, /* compatibility only */ + ARG_IGNORE_DEPENDENCIES, /* compatibility only */ + ARG_VALUE, + ARG_VERSION, + ARG_USER, + ARG_SYSTEM, + ARG_GLOBAL, + ARG_NO_BLOCK, + ARG_LEGEND, + ARG_NO_LEGEND, /* compatibility only */ + ARG_NO_PAGER, + ARG_NO_WALL, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_NO_RELOAD, + ARG_KILL_WHOM, + ARG_KILL_VALUE, + ARG_NO_ASK_PASSWORD, + ARG_FAILED, + ARG_RUNTIME, + ARG_PLAIN, + ARG_STATE, + ARG_JOB_MODE, + ARG_PRESET_MODE, + ARG_FIRMWARE_SETUP, + ARG_BOOT_LOADER_MENU, + ARG_BOOT_LOADER_ENTRY, + ARG_NOW, + ARG_MESSAGE, + ARG_WITH_DEPENDENCIES, + ARG_WAIT, + ARG_WHAT, + ARG_REBOOT_ARG, + ARG_TIMESTAMP_STYLE, + ARG_READ_ONLY, + ARG_MKDIR, + ARG_MARKED, + ARG_NO_WARN, + ARG_DROP_IN, + ARG_WHEN, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "type", required_argument, NULL, 't' }, + { "property", required_argument, NULL, 'p' }, + { "all", no_argument, NULL, 'a' }, + { "reverse", no_argument, NULL, ARG_REVERSE }, + { "after", no_argument, NULL, ARG_AFTER }, + { "before", no_argument, NULL, ARG_BEFORE }, + { "show-types", no_argument, NULL, ARG_SHOW_TYPES }, + { "failed", no_argument, NULL, ARG_FAILED }, + { "full", no_argument, NULL, 'l' }, + { "job-mode", required_argument, NULL, ARG_JOB_MODE }, + { "fail", no_argument, NULL, ARG_FAIL }, /* compatibility only */ + { "irreversible", no_argument, NULL, ARG_IRREVERSIBLE }, /* compatibility only */ + { "ignore-dependencies", no_argument, NULL, ARG_IGNORE_DEPENDENCIES }, /* compatibility only */ + { "ignore-inhibitors", no_argument, NULL, 'i' }, /* 
compatibility only */ + { "check-inhibitors", required_argument, NULL, ARG_CHECK_INHIBITORS }, + { "value", no_argument, NULL, ARG_VALUE }, + { "user", no_argument, NULL, ARG_USER }, + { "system", no_argument, NULL, ARG_SYSTEM }, + { "global", no_argument, NULL, ARG_GLOBAL }, + { "wait", no_argument, NULL, ARG_WAIT }, + { "no-block", no_argument, NULL, ARG_NO_BLOCK }, + { "legend", required_argument, NULL, ARG_LEGEND }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, /* compatibility only */ + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-wall", no_argument, NULL, ARG_NO_WALL }, + { "dry-run", no_argument, NULL, ARG_DRY_RUN }, + { "quiet", no_argument, NULL, 'q' }, + { "no-warn", no_argument, NULL, ARG_NO_WARN }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "force", no_argument, NULL, 'f' }, + { "no-reload", no_argument, NULL, ARG_NO_RELOAD }, + { "kill-whom", required_argument, NULL, ARG_KILL_WHOM }, + { "kill-value", required_argument, NULL, ARG_KILL_VALUE }, + { "signal", required_argument, NULL, 's' }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "host", required_argument, NULL, 'H' }, + { "machine", required_argument, NULL, 'M' }, + { "runtime", no_argument, NULL, ARG_RUNTIME }, + { "lines", required_argument, NULL, 'n' }, + { "output", required_argument, NULL, 'o' }, + { "plain", no_argument, NULL, ARG_PLAIN }, + { "state", required_argument, NULL, ARG_STATE }, + { "recursive", no_argument, NULL, 'r' }, + { "with-dependencies", no_argument, NULL, ARG_WITH_DEPENDENCIES }, + { "preset-mode", required_argument, NULL, ARG_PRESET_MODE }, + { "firmware-setup", no_argument, NULL, ARG_FIRMWARE_SETUP }, + { "boot-loader-menu", required_argument, NULL, ARG_BOOT_LOADER_MENU }, + { "boot-loader-entry", required_argument, NULL, ARG_BOOT_LOADER_ENTRY }, + { "now", no_argument, NULL, ARG_NOW }, + { "message", 
required_argument, NULL, ARG_MESSAGE }, + { "show-transaction", no_argument, NULL, 'T' }, + { "what", required_argument, NULL, ARG_WHAT }, + { "reboot-argument", required_argument, NULL, ARG_REBOOT_ARG }, + { "timestamp", required_argument, NULL, ARG_TIMESTAMP_STYLE }, + { "read-only", no_argument, NULL, ARG_READ_ONLY }, + { "mkdir", no_argument, NULL, ARG_MKDIR }, + { "marked", no_argument, NULL, ARG_MARKED }, + { "drop-in", required_argument, NULL, ARG_DROP_IN }, + { "when", required_argument, NULL, ARG_WHEN }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + /* We default to allowing interactive authorization only in systemctl (not in the legacy commands) */ + arg_ask_password = true; + + while ((c = getopt_long(argc, argv, "ht:p:P:alqfs:H:M:n:o:iTr.::", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return systemctl_help(); + + case ARG_VERSION: + return version(); + + case 't': + if (isempty(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--type= requires arguments."); + + for (const char *p = optarg;;) { + _cleanup_free_ char *type = NULL; + + r = extract_first_word(&p, &type, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse type: %s", optarg); + if (r == 0) + break; + + if (streq(type, "help")) { + help_types(); + return 0; + } + + if (unit_type_from_string(type) >= 0) { + if (strv_consume(&arg_types, TAKE_PTR(type)) < 0) + return log_oom(); + continue; + } + + /* It's much nicer to use --state= for load states, but let's support this in + * --types= too for compatibility with old versions */ + if (unit_load_state_from_string(type) >= 0) { + if (strv_consume(&arg_states, TAKE_PTR(type)) < 0) + return log_oom(); + continue; + } + + log_error("Unknown unit type or load state '%s'.", type); + return log_info_errno(SYNTHETIC_ERRNO(EINVAL), + "Use -t help to see a list of allowed values."); + } + + break; + + case 'P': + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_ONLY_VALUE, true); + _fallthrough_; + + 
case 'p': + /* Make sure that if the empty property list was specified, we won't show any + properties. */ + if (isempty(optarg) && !arg_properties) { + arg_properties = new0(char*, 1); + if (!arg_properties) + return log_oom(); + } else + for (const char *p = optarg;;) { + _cleanup_free_ char *prop = NULL; + + r = extract_first_word(&p, &prop, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse property: %s", optarg); + if (r == 0) + break; + + if (strv_consume(&arg_properties, TAKE_PTR(prop)) < 0) + return log_oom(); + } + + /* If the user asked for a particular property, show it, even if it is empty. */ + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + + break; + + case 'a': + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + arg_all = true; + break; + + case ARG_REVERSE: + arg_dependency = DEPENDENCY_REVERSE; + break; + + case ARG_AFTER: + arg_dependency = DEPENDENCY_AFTER; + arg_jobs_after = true; + break; + + case ARG_BEFORE: + arg_dependency = DEPENDENCY_BEFORE; + arg_jobs_before = true; + break; + + case ARG_SHOW_TYPES: + arg_show_types = true; + break; + + case ARG_VALUE: + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_ONLY_VALUE, true); + break; + + case ARG_JOB_MODE: + _arg_job_mode = optarg; + break; + + case ARG_FAIL: + _arg_job_mode = "fail"; + break; + + case ARG_IRREVERSIBLE: + _arg_job_mode = "replace-irreversibly"; + break; + + case ARG_IGNORE_DEPENDENCIES: + _arg_job_mode = "ignore-dependencies"; + break; + + case ARG_USER: + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + + case ARG_SYSTEM: + arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; + break; + + case ARG_GLOBAL: + arg_runtime_scope = RUNTIME_SCOPE_GLOBAL; + break; + + case ARG_WAIT: + arg_wait = true; + break; + + case ARG_NO_BLOCK: + arg_no_block = true; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_LEGEND: + r = parse_boolean_argument("--legend", optarg, NULL); + if (r < 0) + return r; + arg_legend = r; + break; 
+ + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_WALL: + arg_no_wall = true; + break; + + case ARG_ROOT: + r = parse_path_argument(optarg, false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case 'l': + arg_full = true; + break; + + case ARG_FAILED: + if (strv_extend(&arg_states, "failed") < 0) + return log_oom(); + + break; + + case ARG_DRY_RUN: + arg_dry_run = true; + break; + + case 'q': + arg_quiet = true; + + if (arg_legend < 0) + arg_legend = false; + + break; + + case 'f': + arg_force++; + break; + + case ARG_NO_RELOAD: + arg_no_reload = true; + break; + + case ARG_KILL_WHOM: + arg_kill_whom = optarg; + break; + + case ARG_KILL_VALUE: { + unsigned u; + + if (isempty(optarg)) { + arg_kill_value_set = false; + break; /* An empty value only resets the setting; keep parsing instead of exiting with success. */ + } + + /* First, try to parse unsigned, so that we can support the prefixes 0x, 0o, 0b */ + r = safe_atou_full(optarg, 0, &u); + if (r < 0) + /* If this didn't work, try as signed integer, without those prefixes */ + r = safe_atoi(optarg, &arg_kill_value); + else if (u > INT_MAX) + r = -ERANGE; + else + arg_kill_value = (int) u; + if (r < 0) + return log_error_errno(r, "Unable to parse signal queue value: %s", optarg); + + arg_kill_value_set = true; + break; + } + + case 's': + r = parse_signal_argument(optarg, &arg_signal); + if (r <= 0) + return r; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case ARG_RUNTIME: + arg_runtime = true; + break; + + case 'n': + if (safe_atou(optarg, &arg_lines) < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse lines '%s'",
+ optarg); + break; + + case 'o': + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(output_mode, OutputMode, _OUTPUT_MODE_MAX); + return 0; + } + + arg_output = output_mode_from_string(optarg); + if (arg_output < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown output '%s'.", + optarg); + + if (OUTPUT_MODE_IS_JSON(arg_output)) { + arg_legend = false; + arg_plain = true; + } + break; + + case 'i': + arg_check_inhibitors = 0; + break; + + case ARG_CHECK_INHIBITORS: + r = parse_tristate_full(optarg, "auto", &arg_check_inhibitors); + if (r < 0) + return log_error_errno(r, "Failed to parse --check-inhibitors= argument: %s", optarg); + break; + + case ARG_PLAIN: + arg_plain = true; + break; + + case ARG_FIRMWARE_SETUP: + arg_firmware_setup = true; + break; + + case ARG_BOOT_LOADER_MENU: + + r = parse_sec(optarg, &arg_boot_loader_menu); + if (r < 0) + return log_error_errno(r, "Failed to parse --boot-loader-menu= argument '%s': %m", optarg); + + break; + + case ARG_BOOT_LOADER_ENTRY: + + if (streq(optarg, "help")) { /* Yes, this means, "help" is not a valid boot loader entry name we can deal with */ + r = help_boot_loader_entry(); + if (r < 0) + return r; + + return 0; + } + + arg_boot_loader_entry = empty_to_null(optarg); + break; + + case ARG_STATE: + if (isempty(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--state= requires arguments."); + + for (const char *p = optarg;;) { + _cleanup_free_ char *s = NULL; + + r = extract_first_word(&p, &s, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse state: %s", optarg); + if (r == 0) + break; + + if (streq(s, "help")) { + help_states(); + return 0; + } + + if (strv_consume(&arg_states, TAKE_PTR(s)) < 0) + return log_oom(); + } + break; + + case 'r': + if (geteuid() != 0) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "--recursive requires root privileges."); + + arg_recursive = true; + break; + + case ARG_PRESET_MODE: + if (streq(optarg, "help")) { + 
DUMP_STRING_TABLE(unit_file_preset_mode, UnitFilePresetMode, _UNIT_FILE_PRESET_MODE_MAX); + return 0; + } + + arg_preset_mode = unit_file_preset_mode_from_string(optarg); + if (arg_preset_mode < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to parse preset mode: %s.", optarg); + + break; + + case ARG_NOW: + arg_now = true; + break; + + case ARG_MESSAGE: + if (strv_extend(&arg_wall, optarg) < 0) + return log_oom(); + break; + + case 'T': + arg_show_transaction = true; + break; + + case ARG_WITH_DEPENDENCIES: + arg_with_dependencies = true; + break; + + case ARG_WHAT: + if (isempty(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--what= requires arguments."); + + for (const char *p = optarg;;) { + _cleanup_free_ char *k = NULL; + + r = extract_first_word(&p, &k, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to parse directory type: %s", optarg); + if (r == 0) + break; + + if (streq(k, "help")) { + puts("runtime\n" + "state\n" + "cache\n" + "logs\n" + "configuration\n" + "fdstore"); + return 0; + } + + r = strv_consume(&arg_clean_what, TAKE_PTR(k)); + if (r < 0) + return log_oom(); + } + + break; + + case ARG_REBOOT_ARG: + arg_reboot_argument = optarg; + break; + + case ARG_TIMESTAMP_STYLE: + if (streq(optarg, "help")) { + DUMP_STRING_TABLE(timestamp_style, TimestampStyle, _TIMESTAMP_STYLE_MAX); + return 0; + } + + arg_timestamp_style = timestamp_style_from_string(optarg); + if (arg_timestamp_style < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid value: %s.", optarg); + + break; + + case ARG_READ_ONLY: + arg_read_only = true; + break; + + case ARG_MKDIR: + arg_mkdir = true; + break; + + case ARG_MARKED: + arg_marked = true; + break; + + case ARG_NO_WARN: + arg_no_warn = true; + break; + + case ARG_DROP_IN: + arg_drop_in = optarg; + break; + + case ARG_WHEN: + if (streq(optarg, "show")) { + r = logind_show_shutdown(); + if (r < 0 && r != -ENODATA) + return r; + + return 0; + } + + if (STR_IN_SET(optarg, "", 
"cancel")) { + arg_when = USEC_INFINITY; + break; + } + + r = parse_timestamp(optarg, &arg_when); + if (r < 0) + return log_error_errno(r, "Failed to parse --when= argument '%s': %m", optarg); + + if (!timestamp_is_set(arg_when)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid timestamp '%s' specified for --when=.", optarg); + + break; + + case '.': + /* Output an error mimicking getopt, and print a hint afterwards */ + log_error("%s: invalid option -- '.'", program_invocation_name); + log_notice("Hint: to specify units starting with a dash, use \"--\":\n" + " %s [OPTIONS...] COMMAND -- -.%s ...", + program_invocation_name, optarg ?: "mount"); + _fallthrough_; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + /* If we are in --user mode, there's no point in talking to PolicyKit or the infra to query system + * passwords */ + if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + arg_ask_password = false; + + if (arg_transport == BUS_TRANSPORT_REMOTE && arg_runtime_scope != RUNTIME_SCOPE_SYSTEM) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Cannot access user instance remotely."); + + if (arg_wait && arg_no_block) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--wait may not be combined with --no-block."); + + bool do_reload_or_restart = streq_ptr(argv[optind], "reload-or-restart"); + if (arg_marked) { + if (!do_reload_or_restart) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--marked may only be used with 'reload-or-restart'."); + if (optind + 1 < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "No additional arguments allowed with 'reload-or-restart --marked'."); + if (arg_wait) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--marked --wait is not supported."); + if (arg_show_transaction) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--marked --show-transaction is not supported."); + + } else if (do_reload_or_restart) { + if (optind + 1 >= argc) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "List of units to restart/reload is required."); + } + + if (arg_image && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + + return 1; +} + +int systemctl_dispatch_parse_argv(int argc, char *argv[]) { + assert(argc >= 0); + assert(argv); + + if (invoked_as(argv, "halt")) { + arg_action = ACTION_HALT; + return halt_parse_argv(argc, argv); + + } else if (invoked_as(argv, "poweroff")) { + arg_action = ACTION_POWEROFF; + return halt_parse_argv(argc, argv); + + } else if (invoked_as(argv, "reboot")) { + arg_action = ACTION_REBOOT; + return halt_parse_argv(argc, argv); + + } else if (invoked_as(argv, "shutdown")) { + arg_action = ACTION_POWEROFF; + return shutdown_parse_argv(argc, argv); + + } else if (invoked_as(argv, "init")) { + + /* Matches invocations as "init" as well as "telinit", which are synonymous when run + * as PID != 1 on SysV. + * + * On SysV "telinit" was the official command to communicate with PID 1, but "init" would + * redirect itself to "telinit" if called with PID != 1. We follow the same logic here still, + * though we add one level of indirection, as we implement "telinit" in "systemctl". Hence, + * for us if you invoke "init" you get "systemd", but it will execve() "systemctl" + * immediately with argv[] unmodified if PID is != 1. If you invoke "telinit" you directly + * get "systemctl". In both cases we shall do the same thing, which is why we do + * invoked_as(argv, "init") here, as a quick way to match both. + * + * Also see redirect_telinit() in src/core/main.c. */ + + if (sd_booted() > 0) { + arg_action = _ACTION_INVALID; + return telinit_parse_argv(argc, argv); + } else { + /* Hmm, so some other init system is running, we need to forward this request to it. 
+ */ + arg_action = ACTION_TELINIT; + return 1; + } + + } else if (invoked_as(argv, "runlevel")) { + arg_action = ACTION_RUNLEVEL; + return runlevel_parse_argv(argc, argv); + } + + arg_action = ACTION_SYSTEMCTL; + return systemctl_parse_argv(argc, argv); +} + +#ifndef FUZZ_SYSTEMCTL_PARSE_ARGV +static int systemctl_main(int argc, char *argv[]) { + static const Verb verbs[] = { + { "list-units", VERB_ANY, VERB_ANY, VERB_DEFAULT|VERB_ONLINE_ONLY, verb_list_units }, + { "list-unit-files", VERB_ANY, VERB_ANY, 0, verb_list_unit_files }, + { "list-automounts", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_automounts }, + { "list-paths", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_paths }, + { "list-sockets", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_sockets }, + { "list-timers", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_timers }, + { "list-jobs", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_jobs }, + { "list-machines", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_machines }, + { "clear-jobs", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_trivial_method }, + { "cancel", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_cancel }, + { "start", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "stop", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "condstop", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, /* For compatibility with ALTLinux */ + { "reload", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "restart", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "try-restart", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "reload-or-restart", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "reload-or-try-restart", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, /* For compatibility with old systemctl <= 228 */ + { "try-reload-or-restart", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, + { "force-reload", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, /* For compatibility with SysV */ + { "condreload", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, /* For 
compatibility with ALTLinux */ + { "condrestart", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_start }, /* For compatibility with RH */ + { "isolate", 2, 2, VERB_ONLINE_ONLY, verb_start }, + { "kill", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_kill }, + { "clean", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_clean_or_freeze }, + { "freeze", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_clean_or_freeze }, + { "thaw", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_clean_or_freeze }, + { "is-active", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_is_active }, + { "check", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_is_active }, /* deprecated alias of is-active */ + { "is-failed", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_is_failed }, + { "show", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_show }, + { "cat", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_cat }, + { "status", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_show }, + { "help", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_show }, + { "daemon-reload", 1, 1, VERB_ONLINE_ONLY, verb_daemon_reload }, + { "daemon-reexec", 1, 1, VERB_ONLINE_ONLY, verb_daemon_reload }, + { "log-level", VERB_ANY, 2, VERB_ONLINE_ONLY, verb_log_setting }, + { "log-target", VERB_ANY, 2, VERB_ONLINE_ONLY, verb_log_setting }, + { "service-log-level", 2, 3, VERB_ONLINE_ONLY, verb_service_log_setting }, + { "service-log-target", 2, 3, VERB_ONLINE_ONLY, verb_service_log_setting }, + { "service-watchdogs", VERB_ANY, 2, VERB_ONLINE_ONLY, verb_service_watchdogs }, + { "show-environment", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_show_environment }, + { "set-environment", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_set_environment }, + { "unset-environment", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_set_environment }, + { "import-environment", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_import_environment }, + { "halt", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "poweroff", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "reboot", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "kexec", 
VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "soft-reboot", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "suspend", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "hibernate", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "hybrid-sleep", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "suspend-then-hibernate",VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "default", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_special }, + { "rescue", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "emergency", VERB_ANY, 1, VERB_ONLINE_ONLY, verb_start_system_special }, + { "exit", VERB_ANY, 2, VERB_ONLINE_ONLY, verb_start_special }, + { "reset-failed", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_reset_failed }, + { "enable", 2, VERB_ANY, 0, verb_enable }, + { "disable", 2, VERB_ANY, 0, verb_enable }, + { "is-enabled", 2, VERB_ANY, 0, verb_is_enabled }, + { "reenable", 2, VERB_ANY, 0, verb_enable }, + { "preset", 2, VERB_ANY, 0, verb_enable }, + { "preset-all", VERB_ANY, 1, 0, verb_preset_all }, + { "mask", 2, VERB_ANY, 0, verb_enable }, + { "unmask", 2, VERB_ANY, 0, verb_enable }, + { "link", 2, VERB_ANY, 0, verb_enable }, + { "revert", 2, VERB_ANY, 0, verb_enable }, + { "switch-root", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_switch_root }, + { "list-dependencies", VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_list_dependencies }, + { "set-default", 2, 2, 0, verb_set_default }, + { "get-default", VERB_ANY, 1, 0, verb_get_default }, + { "set-property", 3, VERB_ANY, VERB_ONLINE_ONLY, verb_set_property }, + { "is-system-running", VERB_ANY, 1, 0, verb_is_system_running }, + { "add-wants", 3, VERB_ANY, 0, verb_add_dependency }, + { "add-requires", 3, VERB_ANY, 0, verb_add_dependency }, + { "edit", 2, VERB_ANY, VERB_ONLINE_ONLY, verb_edit }, + { "bind", 3, 4, VERB_ONLINE_ONLY, verb_bind }, + { "mount-image", 4, 5, VERB_ONLINE_ONLY, verb_mount_image }, + { "whoami", 
VERB_ANY, VERB_ANY, VERB_ONLINE_ONLY, verb_whoami }, + {} + }; + + const Verb *verb = verbs_find_verb(argv[optind], verbs); + if (verb && (verb->flags & VERB_ONLINE_ONLY) && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Verb '%s' cannot be used with --root= or --image=.", + argv[optind] ?: verb->verb); + + return dispatch_verb(argc, argv, verbs, NULL); +} + +static int run(int argc, char *argv[]) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + int r; + + setlocale(LC_ALL, ""); + log_setup(); + + /* The journal merging logic potentially needs a lot of fds. */ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + sigbus_install(); + + r = systemctl_dispatch_parse_argv(argc, argv); + if (r <= 0) + goto finish; + + if (proc_mounted() == 0) + log_full(arg_no_warn ? LOG_DEBUG : LOG_WARNING, + "%s%s/proc/ is not mounted. This is not a supported mode of operation. Please fix\n" + "your invocation environment to mount /proc/ and /sys/ properly. Proceeding anyway.\n" + "Your mileage may vary.", + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_WARNING_SIGN) : "", + emoji_enabled() ? 
" " : ""); + + if (arg_action != ACTION_SYSTEMCTL && running_in_chroot() > 0) { + if (!arg_quiet) + log_info("Running in chroot, ignoring request."); + r = 0; + goto finish; + } + + /* systemctl_main() will print an error message for the bus connection, but only if it needs to */ + + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_VALIDATE_OS, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } + + switch (arg_action) { + + case ACTION_SYSTEMCTL: + r = systemctl_main(argc, argv); + break; + + /* Legacy command aliases set arg_action. They provide some fallbacks, e.g. to tell sysvinit to + * reboot after you have installed systemd binaries. */ + + case ACTION_HALT: + case ACTION_POWEROFF: + case ACTION_REBOOT: + case ACTION_KEXEC: + r = halt_main(); + break; + + case ACTION_RUNLEVEL2: + case ACTION_RUNLEVEL3: + case ACTION_RUNLEVEL4: + case ACTION_RUNLEVEL5: + case ACTION_RESCUE: + r = start_with_fallback(); + break; + + case ACTION_RELOAD: + case ACTION_REEXEC: + r = reload_with_fallback(); + break; + + case ACTION_CANCEL_SHUTDOWN: + r = logind_cancel_shutdown(); + break; + + case ACTION_SHOW_SHUTDOWN: + r = logind_show_shutdown(); + break; + + case ACTION_RUNLEVEL: + r = runlevel_main(); + break; + + case ACTION_TELINIT: + r = exec_telinit(argv); + break; + + case ACTION_EXIT: + case ACTION_SUSPEND: + case ACTION_HIBERNATE: + case ACTION_HYBRID_SLEEP: + case ACTION_SUSPEND_THEN_HIBERNATE: + case ACTION_EMERGENCY: + case ACTION_DEFAULT: + /* systemctl verbs with no equivalent in the legacy commands. These cannot appear in + * arg_action. Fall through. 
*/ + + case _ACTION_INVALID: + default: + assert_not_reached(); + } + +finish: + release_busses(); + + /* Note that we return r here, not 0, so that we can implement the LSB-like return codes */ + return r; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); +#endif diff --git a/src/systemctl/systemctl.h b/src/systemctl/systemctl.h new file mode 100644 index 0000000..e8ba8f7 --- /dev/null +++ b/src/systemctl/systemctl.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "bus-print-properties.h" +#include "bus-util.h" +#include "image-policy.h" +#include "install.h" +#include "output-mode.h" +#include "pager.h" + +enum action { + ACTION_SYSTEMCTL, + ACTION_HALT, + ACTION_POWEROFF, + ACTION_REBOOT, + ACTION_KEXEC, + ACTION_SOFT_REBOOT, + ACTION_EXIT, + ACTION_SUSPEND, + ACTION_HIBERNATE, + ACTION_HYBRID_SLEEP, + ACTION_SUSPEND_THEN_HIBERNATE, + ACTION_RUNLEVEL2, + ACTION_RUNLEVEL3, + ACTION_RUNLEVEL4, + ACTION_RUNLEVEL5, + ACTION_RESCUE, + ACTION_EMERGENCY, + ACTION_DEFAULT, + ACTION_RELOAD, + ACTION_REEXEC, + ACTION_RUNLEVEL, + ACTION_TELINIT, + ACTION_CANCEL_SHUTDOWN, + ACTION_SHOW_SHUTDOWN, + _ACTION_MAX, + _ACTION_INVALID = -EINVAL, +}; + +enum dependency { + DEPENDENCY_FORWARD, + DEPENDENCY_REVERSE, + DEPENDENCY_AFTER, + DEPENDENCY_BEFORE, + _DEPENDENCY_MAX +}; + +extern char **arg_types; +extern char **arg_states; +extern char **arg_properties; +extern bool arg_all; +extern enum dependency arg_dependency; +extern const char *_arg_job_mode; +extern RuntimeScope arg_runtime_scope; +extern bool arg_wait; +extern bool arg_no_block; +extern int arg_legend; +extern PagerFlags arg_pager_flags; +extern bool arg_no_wtmp; +extern bool arg_no_sync; +extern bool arg_no_wall; +extern bool arg_no_reload; +extern BusPrintPropertyFlags arg_print_flags; +extern bool arg_show_types; +extern int arg_check_inhibitors; +extern bool arg_dry_run; +extern bool arg_quiet; +extern bool arg_no_warn; +extern bool arg_full; +extern
bool arg_recursive; +extern bool arg_with_dependencies; +extern bool arg_show_transaction; +extern int arg_force; +extern bool arg_ask_password; +extern bool arg_runtime; +extern UnitFilePresetMode arg_preset_mode; +extern char **arg_wall; +extern const char *arg_kill_whom; +extern int arg_signal; +extern int arg_kill_value; +extern bool arg_kill_value_set; +extern char *arg_root; +extern usec_t arg_when; +extern const char *arg_reboot_argument; +extern enum action arg_action; +extern BusTransport arg_transport; +extern const char *arg_host; +extern unsigned arg_lines; +extern OutputMode arg_output; +extern bool arg_plain; +extern bool arg_firmware_setup; +extern usec_t arg_boot_loader_menu; +extern const char *arg_boot_loader_entry; +extern bool arg_now; +extern bool arg_jobs_before; +extern bool arg_jobs_after; +extern char **arg_clean_what; +extern TimestampStyle arg_timestamp_style; +extern bool arg_read_only; +extern bool arg_mkdir; +extern bool arg_marked; +extern const char *arg_drop_in; +extern ImagePolicy *arg_image_policy; + +static inline const char* arg_job_mode(void) { + return _arg_job_mode ?: "replace"; +} + +int systemctl_dispatch_parse_argv(int argc, char *argv[]); diff --git a/src/systemctl/systemd-sysv-install.SKELETON b/src/systemctl/systemd-sysv-install.SKELETON new file mode 100755 index 0000000..cb58d82 --- /dev/null +++ b/src/systemctl/systemd-sysv-install.SKELETON @@ -0,0 +1,51 @@ +#!/bin/sh +# SPDX-License-Identifier: MIT-0 +# +# This script is called by "systemctl enable/disable" when the given unit is a +# SysV init.d script. It needs to call the distribution's mechanism for +# enabling/disabling those, such as chkconfig, update-rc.d, or similar. This +# can optionally take a --root argument for enabling a SysV init script +# in a chroot or similar. 
+set -e + +usage() { + echo "Usage: $0 [--root=path] enable|disable|is-enabled <sysv script name>" >&2 + exit 1 +} + +unset ROOT + +# parse options +eval set -- "$(getopt -o r: --long root: -- "$@")" +while true; do + case "$1" in + -r|--root) + ROOT="$2" + shift 2 ;; + --) shift ; break ;; + *) usage ;; + esac +done + +NAME="$2" +[ -n "$NAME" ] || usage + +case "$1" in + enable) + # call the command to enable SysV init script $NAME here + # (consider optional $ROOT) + echo "IMPLEMENT ME: enabling SysV init.d script $NAME" + ;; + disable) + # call the command to disable SysV init script $NAME here + # (consider optional $ROOT) + echo "IMPLEMENT ME: disabling SysV init.d script $NAME" + ;; + is-enabled) + # exit with 0 if $NAME is enabled, non-zero if it is disabled + # (consider optional $ROOT) + echo "IMPLEMENT ME: checking SysV init.d script $NAME" + ;; + *) + usage ;; +esac diff --git a/src/systemd/_sd-common.h b/src/systemd/_sd-common.h new file mode 100644 index 0000000..d4381d9 --- /dev/null +++ b/src/systemd/_sd-common.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdcommonhfoo +#define foosdcommonhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +/* This is a private header; never even think of including this directly!
*/ + +#if defined(__INCLUDE_LEVEL__) && __INCLUDE_LEVEL__ <= 1 && !defined(__COVERITY__) +# error "Do not include _sd-common.h directly; it is a private header." +#endif + +typedef void (*_sd_destroy_t)(void *userdata); + +#ifndef _sd_printf_ +# if __GNUC__ >= 4 +# define _sd_printf_(a,b) __attribute__((__format__(printf, a, b))) +# else +# define _sd_printf_(a,b) +# endif +#endif + +#ifndef _sd_sentinel_ +# define _sd_sentinel_ __attribute__((__sentinel__)) +#endif + +#ifndef _sd_packed_ +# define _sd_packed_ __attribute__((__packed__)) +#endif + +#ifndef _sd_pure_ +# define _sd_pure_ __attribute__((__pure__)) +#endif + +/* Note that strictly speaking __deprecated__ has been available before GCC 6. However, starting with GCC 6 + * it also works on enum values, which we are interested in. Since this is a developer-facing feature anyway + * (as opposed to build engineer-facing), let's hence conditionalize this to gcc 6, given that the developers + * are probably going to use something newer anyway. 
*/ +#ifndef _sd_deprecated_ +# if __GNUC__ >= 6 +# define _sd_deprecated_ __attribute__((__deprecated__)) +# else +# define _sd_deprecated_ +# endif +#endif + +#ifndef _SD_STRINGIFY +# define _SD_XSTRINGIFY(x) #x +# define _SD_STRINGIFY(x) _SD_XSTRINGIFY(x) +#endif + +#ifndef _SD_BEGIN_DECLARATIONS +# ifdef __cplusplus +# define _SD_BEGIN_DECLARATIONS \ + extern "C" { \ + struct _sd_useless_struct_to_allow_trailing_semicolon_ +# else +# define _SD_BEGIN_DECLARATIONS \ + struct _sd_useless_struct_to_allow_trailing_semicolon_ +# endif +#endif + +#ifndef _SD_END_DECLARATIONS +# ifdef __cplusplus +# define _SD_END_DECLARATIONS \ + } \ + struct _sd_useless_cpp_struct_to_allow_trailing_semicolon_ +# else +# define _SD_END_DECLARATIONS \ + struct _sd_useless_struct_to_allow_trailing_semicolon_ +# endif +#endif + +#ifndef _SD_ARRAY_STATIC +# if __STDC_VERSION__ >= 199901L && !defined(__cplusplus) +# define _SD_ARRAY_STATIC static +# else +# define _SD_ARRAY_STATIC +# endif +#endif + +#define _SD_DEFINE_POINTER_CLEANUP_FUNC(type, func) \ + static __inline__ void func##p(type **p) { \ + if (*p) \ + func(*p); \ + } \ + struct _sd_useless_struct_to_allow_trailing_semicolon_ + +/* The following macro should be used in all public enums, to force 64-bit wideness on them, so that we can + * freely extend them later on, without breaking compatibility. 
*/ +#define _SD_ENUM_FORCE_S64(id) \ + _SD_##id##_INT64_MIN = INT64_MIN, \ + _SD_##id##_INT64_MAX = INT64_MAX + +#endif diff --git a/src/systemd/meson.build b/src/systemd/meson.build new file mode 100644 index 0000000..a9cdcd2 --- /dev/null +++ b/src/systemd/meson.build @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +_systemd_headers = [ + 'sd-bus.h', + 'sd-bus-protocol.h', + 'sd-bus-vtable.h', + 'sd-daemon.h', + 'sd-device.h', + 'sd-event.h', + 'sd-gpt.h', + 'sd-hwdb.h', + 'sd-id128.h', + 'sd-journal.h', + 'sd-login.h', + 'sd-messages.h', + 'sd-path.h', +] + +# https://github.com/mesonbuild/meson/issues/1633 +systemd_headers = files(_systemd_headers) + +_not_installed_headers = [ + 'sd-dhcp-client.h', + 'sd-dhcp-lease.h', + 'sd-dhcp-option.h', + 'sd-dhcp-protocol.h', + 'sd-dhcp-server.h', + 'sd-dhcp6-client.h', + 'sd-dhcp6-lease.h', + 'sd-dhcp6-option.h', + 'sd-dhcp6-protocol.h', + 'sd-ipv4acd.h', + 'sd-ipv4ll.h', + 'sd-lldp-rx.h', + 'sd-lldp-tx.h', + 'sd-lldp.h', + 'sd-ndisc.h', + 'sd-netlink.h', + 'sd-network.h', + 'sd-radv.h', + 'sd-resolve.h', +] + +install_headers( + systemd_headers, + '_sd-common.h', + subdir : 'systemd') + +############################################################ + +if want_tests == 'false' + subdir_done() +endif + +opts = [['c'], + ['c', '-ansi'], + ['c', '-std=iso9899:1990'], + ['c', '-std=iso9899:2011']] + +if cc.has_argument('-std=iso9899:2017') + opts += [['c', '-std=iso9899:2017']] +endif + +if cc.has_argument('-std=c2x') + opts += [['c', '-std=c2x']] +endif + +if cxx_cmd != '' + opts += [['c++'], + ['c++', '-std=c++98'], + ['c++', '-std=c++11']] + if cxx.has_argument('-std=c++14') + opts += [['c++', '-std=c++14']] + endif + if cxx.has_argument('-std=c++17') + opts += [['c++', '-std=c++17']] + endif + if cxx.has_argument('-std=c++20') + opts += [['c++', '-std=c++20']] + endif + if cxx.has_argument('-std=c++23') + opts += [['c++', '-std=c++23']] + endif +endif + +foreach header : _systemd_headers + 
_not_installed_headers + [libudev_h_path] + foreach opt : opts + std_name = opt.length() == 2 ? '_'.join(opt[1].split(':')) : '' + test('cc-' + fs.name(header) + '_' + opt[0] + std_name, + env, + suite : 'headers', + args : [cc.cmd_array(), + '-c', + '-x', opt, + '-Wall', + '-Wextra', + '-Werror', + '-pedantic', + '-Wno-long-long', + '-Wno-variadic-macros', + '-include', meson.current_source_dir() / header, + '-o/dev/null', + '/dev/null']) + endforeach +endforeach diff --git a/src/systemd/sd-bus-protocol.h b/src/systemd/sd-bus-protocol.h new file mode 100644 index 0000000..25c9ab3 --- /dev/null +++ b/src/systemd/sd-bus-protocol.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdbusprotocolhfoo +#define foosdbusprotocolhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>.
+***/ + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* Types of message */ + +enum { + _SD_BUS_MESSAGE_TYPE_INVALID = 0, + SD_BUS_MESSAGE_METHOD_CALL, + SD_BUS_MESSAGE_METHOD_RETURN, + SD_BUS_MESSAGE_METHOD_ERROR, + SD_BUS_MESSAGE_SIGNAL, + _SD_BUS_MESSAGE_TYPE_MAX +}; + +/* Primitive types */ + +enum { + _SD_BUS_TYPE_INVALID = 0, + SD_BUS_TYPE_BYTE = 'y', + SD_BUS_TYPE_BOOLEAN = 'b', + SD_BUS_TYPE_INT16 = 'n', + SD_BUS_TYPE_UINT16 = 'q', + SD_BUS_TYPE_INT32 = 'i', + SD_BUS_TYPE_UINT32 = 'u', + SD_BUS_TYPE_INT64 = 'x', + SD_BUS_TYPE_UINT64 = 't', + SD_BUS_TYPE_DOUBLE = 'd', + SD_BUS_TYPE_STRING = 's', + SD_BUS_TYPE_OBJECT_PATH = 'o', + SD_BUS_TYPE_SIGNATURE = 'g', + SD_BUS_TYPE_UNIX_FD = 'h', + SD_BUS_TYPE_ARRAY = 'a', + SD_BUS_TYPE_VARIANT = 'v', + SD_BUS_TYPE_STRUCT = 'r', /* not actually used in signatures */ + SD_BUS_TYPE_STRUCT_BEGIN = '(', + SD_BUS_TYPE_STRUCT_END = ')', + SD_BUS_TYPE_DICT_ENTRY = 'e', /* not actually used in signatures */ + SD_BUS_TYPE_DICT_ENTRY_BEGIN = '{', + SD_BUS_TYPE_DICT_ENTRY_END = '}' +}; + +/* Well-known errors. Note that this is only a sanitized subset of the + * errors that the reference implementation generates. 
*/ + +#define SD_BUS_ERROR_FAILED "org.freedesktop.DBus.Error.Failed" +#define SD_BUS_ERROR_NO_MEMORY "org.freedesktop.DBus.Error.NoMemory" +#define SD_BUS_ERROR_SERVICE_UNKNOWN "org.freedesktop.DBus.Error.ServiceUnknown" +#define SD_BUS_ERROR_NAME_HAS_NO_OWNER "org.freedesktop.DBus.Error.NameHasNoOwner" +#define SD_BUS_ERROR_NO_REPLY "org.freedesktop.DBus.Error.NoReply" +#define SD_BUS_ERROR_IO_ERROR "org.freedesktop.DBus.Error.IOError" +#define SD_BUS_ERROR_BAD_ADDRESS "org.freedesktop.DBus.Error.BadAddress" +#define SD_BUS_ERROR_NOT_SUPPORTED "org.freedesktop.DBus.Error.NotSupported" +#define SD_BUS_ERROR_LIMITS_EXCEEDED "org.freedesktop.DBus.Error.LimitsExceeded" +#define SD_BUS_ERROR_ACCESS_DENIED "org.freedesktop.DBus.Error.AccessDenied" +#define SD_BUS_ERROR_AUTH_FAILED "org.freedesktop.DBus.Error.AuthFailed" +#define SD_BUS_ERROR_NO_SERVER "org.freedesktop.DBus.Error.NoServer" +#define SD_BUS_ERROR_TIMEOUT "org.freedesktop.DBus.Error.Timeout" +#define SD_BUS_ERROR_NO_NETWORK "org.freedesktop.DBus.Error.NoNetwork" +#define SD_BUS_ERROR_ADDRESS_IN_USE "org.freedesktop.DBus.Error.AddressInUse" +#define SD_BUS_ERROR_DISCONNECTED "org.freedesktop.DBus.Error.Disconnected" +#define SD_BUS_ERROR_INVALID_ARGS "org.freedesktop.DBus.Error.InvalidArgs" +#define SD_BUS_ERROR_FILE_NOT_FOUND "org.freedesktop.DBus.Error.FileNotFound" +#define SD_BUS_ERROR_FILE_EXISTS "org.freedesktop.DBus.Error.FileExists" +#define SD_BUS_ERROR_UNKNOWN_METHOD "org.freedesktop.DBus.Error.UnknownMethod" +#define SD_BUS_ERROR_UNKNOWN_OBJECT "org.freedesktop.DBus.Error.UnknownObject" +#define SD_BUS_ERROR_UNKNOWN_INTERFACE "org.freedesktop.DBus.Error.UnknownInterface" +#define SD_BUS_ERROR_UNKNOWN_PROPERTY "org.freedesktop.DBus.Error.UnknownProperty" +#define SD_BUS_ERROR_PROPERTY_READ_ONLY "org.freedesktop.DBus.Error.PropertyReadOnly" +#define SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN "org.freedesktop.DBus.Error.UnixProcessIdUnknown" +#define SD_BUS_ERROR_INVALID_SIGNATURE 
"org.freedesktop.DBus.Error.InvalidSignature" +#define SD_BUS_ERROR_INCONSISTENT_MESSAGE "org.freedesktop.DBus.Error.InconsistentMessage" +#define SD_BUS_ERROR_TIMED_OUT "org.freedesktop.DBus.Error.TimedOut" +#define SD_BUS_ERROR_MATCH_RULE_NOT_FOUND "org.freedesktop.DBus.Error.MatchRuleNotFound" +#define SD_BUS_ERROR_MATCH_RULE_INVALID "org.freedesktop.DBus.Error.MatchRuleInvalid" +#define SD_BUS_ERROR_INTERACTIVE_AUTHORIZATION_REQUIRED "org.freedesktop.DBus.Error.InteractiveAuthorizationRequired" +#define SD_BUS_ERROR_INVALID_FILE_CONTENT "org.freedesktop.DBus.Error.InvalidFileContent" +#define SD_BUS_ERROR_SELINUX_SECURITY_CONTEXT_UNKNOWN "org.freedesktop.DBus.Error.SELinuxSecurityContextUnknown" +#define SD_BUS_ERROR_OBJECT_PATH_IN_USE "org.freedesktop.DBus.Error.ObjectPathInUse" + +/* https://dbus.freedesktop.org/doc/dbus-specification.html#message-protocol-marshaling-signature */ +#define SD_BUS_MAXIMUM_SIGNATURE_LENGTH 255 + +/* https://dbus.freedesktop.org/doc/dbus-specification.html#message-protocol-names */ +#define SD_BUS_MAXIMUM_NAME_LENGTH 255 + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-bus-vtable.h b/src/systemd/sd-bus-vtable.h new file mode 100644 index 0000000..5e80ea8 --- /dev/null +++ b/src/systemd/sd-bus-vtable.h @@ -0,0 +1,353 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdbusvtablehfoo +#define foosdbusvtablehfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_bus_vtable sd_bus_vtable; + +#include "sd-bus.h" + +enum { + _SD_BUS_VTABLE_START = '<', + _SD_BUS_VTABLE_END = '>', + _SD_BUS_VTABLE_METHOD = 'M', + _SD_BUS_VTABLE_SIGNAL = 'S', + _SD_BUS_VTABLE_PROPERTY = 'P', + _SD_BUS_VTABLE_WRITABLE_PROPERTY = 'W' +}; + +__extension__ enum { + SD_BUS_VTABLE_DEPRECATED = 1ULL << 0, + SD_BUS_VTABLE_HIDDEN = 1ULL << 1, + SD_BUS_VTABLE_UNPRIVILEGED = 1ULL << 2, + SD_BUS_VTABLE_METHOD_NO_REPLY = 1ULL << 3, + SD_BUS_VTABLE_PROPERTY_CONST = 1ULL << 4, + SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE = 1ULL << 5, + SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION = 1ULL << 6, + SD_BUS_VTABLE_PROPERTY_EXPLICIT = 1ULL << 7, + SD_BUS_VTABLE_SENSITIVE = 1ULL << 8, /* covers both directions: method call + reply */ + SD_BUS_VTABLE_ABSOLUTE_OFFSET = 1ULL << 9, + _SD_BUS_VTABLE_CAPABILITY_MASK = 0xFFFFULL << 40 +}; + +#define SD_BUS_VTABLE_CAPABILITY(x) ((uint64_t) (((x)+1) & 0xFFFF) << 40) + +enum { + _SD_BUS_VTABLE_PARAM_NAMES = 1 << 0 +}; + +extern const unsigned sd_bus_object_vtable_format; + +/* Note: unused areas in the sd_bus_vtable[] array must be initialized to 0. The structure contains an embedded + * union, and the compiler is NOT required to initialize the unused areas of the union when the rest of the + * structure is initialized. Normally the array is defined as read-only data, in which case the linker places + * it in the BSS section, which is always fully initialized, so this is not a concern. But if the array is + * created on the stack or on the heap, care must be taken to initialize the unused areas, for example by + * first memsetting the whole region to zero before filling the data in.
*/ + +struct sd_bus_vtable { + /* Please do not initialize this structure directly, use the + * macros below instead */ + + __extension__ uint8_t type:8; + __extension__ uint64_t flags:56; + union { + struct { + size_t element_size; + uint64_t features; + const unsigned *vtable_format_reference; + } start; + struct { + /* This field exists only to make sure we have something to initialize in + * SD_BUS_VTABLE_END in a way that is both compatible with pedantic versions of C and + * C++. It's unused otherwise. */ + size_t _reserved; + } end; + struct { + const char *member; + const char *signature; + const char *result; + sd_bus_message_handler_t handler; + size_t offset; + const char *names; + } method; + struct { + const char *member; + const char *signature; + const char *names; + } signal; + struct { + const char *member; + const char *signature; + sd_bus_property_get_t get; + sd_bus_property_set_t set; + size_t offset; + } property; + } x; +}; + +#define SD_BUS_VTABLE_START(_flags) \ + { \ + .type = _SD_BUS_VTABLE_START, \ + .flags = _flags, \ + .x = { \ + .start = { \ + .element_size = sizeof(sd_bus_vtable), \ + .features = _SD_BUS_VTABLE_PARAM_NAMES, \ + .vtable_format_reference = &sd_bus_object_vtable_format, \ + }, \ + }, \ + } + +/* helper macro to format method and signal parameters, one at a time */ +#define SD_BUS_PARAM(x) #x "\0" + +#define SD_BUS_METHOD_WITH_NAMES_OFFSET(_member, _signature, _in_names, _result, _out_names, _handler, _offset, _flags) \ + { \ + .type = _SD_BUS_VTABLE_METHOD, \ + .flags = _flags, \ + .x = { \ + .method = { \ + .member = _member, \ + .signature = _signature, \ + .result = _result, \ + .handler = _handler, \ + .offset = _offset, \ + .names = _in_names _out_names, \ + }, \ + }, \ + } +#define SD_BUS_METHOD_WITH_OFFSET(_member, _signature, _result, _handler, _offset, _flags) \ + SD_BUS_METHOD_WITH_NAMES_OFFSET(_member, _signature, "", _result, "", _handler, _offset, _flags) +#define SD_BUS_METHOD_WITH_NAMES(_member, 
_signature, _in_names, _result, _out_names, _handler, _flags) \ + SD_BUS_METHOD_WITH_NAMES_OFFSET(_member, _signature, _in_names, _result, _out_names, _handler, 0, _flags) +#define SD_BUS_METHOD(_member, _signature, _result, _handler, _flags) \ + SD_BUS_METHOD_WITH_NAMES_OFFSET(_member, _signature, "", _result, "", _handler, 0, _flags) + +#define SD_BUS_SIGNAL_WITH_NAMES(_member, _signature, _out_names, _flags) \ + { \ + .type = _SD_BUS_VTABLE_SIGNAL, \ + .flags = _flags, \ + .x = { \ + .signal = { \ + .member = _member, \ + .signature = _signature, \ + .names = _out_names, \ + }, \ + }, \ + } +#define SD_BUS_SIGNAL(_member, _signature, _flags) \ + SD_BUS_SIGNAL_WITH_NAMES(_member, _signature, "", _flags) + +#define SD_BUS_PROPERTY(_member, _signature, _get, _offset, _flags) \ + { \ + .type = _SD_BUS_VTABLE_PROPERTY, \ + .flags = _flags, \ + .x = { \ + .property = { \ + .member = _member, \ + .signature = _signature, \ + .get = _get, \ + .set = NULL, \ + .offset = _offset, \ + }, \ + }, \ + } + +#define SD_BUS_WRITABLE_PROPERTY(_member, _signature, _get, _set, _offset, _flags) \ + { \ + .type = _SD_BUS_VTABLE_WRITABLE_PROPERTY, \ + .flags = _flags, \ + .x = { \ + .property = { \ + .member = _member, \ + .signature = _signature, \ + .get = _get, \ + .set = _set, \ + .offset = _offset, \ + }, \ + }, \ + } + +#define SD_BUS_VTABLE_END \ + { \ + .type = _SD_BUS_VTABLE_END, \ + .flags = 0, \ + .x = { \ + .end = { \ + ._reserved = 0, \ + }, \ + }, \ + } + +#define _SD_ECHO(X) X +#define _SD_CONCAT(X) #X "\0" + +#define _SD_VARARGS_FOREACH_SEQ(_01, _02, _03, _04, _05, _06, _07, _08, _09, _10, \ + _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, \ + _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, \ + _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, \ + NAME, ...) 
NAME + +#define _SD_VARARGS_FOREACH_EVEN_01(FN, X) FN(X) +#define _SD_VARARGS_FOREACH_EVEN_02(FN, X, Y) FN(X) +#define _SD_VARARGS_FOREACH_EVEN_04(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_02(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_06(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_04(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_08(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_06(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_10(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_08(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_12(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_10(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_14(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_12(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_16(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_14(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_18(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_16(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_20(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_18(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_22(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_20(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_24(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_22(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_26(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_24(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_28(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_26(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_30(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_28(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_32(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_30(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_34(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_32(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_36(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_34(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_38(FN, X, Y, ...) 
FN(X) _SD_VARARGS_FOREACH_EVEN_36(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_40(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_38(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_42(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_40(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_44(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_42(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_46(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_44(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_48(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_46(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_EVEN_50(FN, X, Y, ...) FN(X) _SD_VARARGS_FOREACH_EVEN_48(FN, __VA_ARGS__) + +#define _SD_VARARGS_FOREACH_EVEN(FN, ...) \ + _SD_VARARGS_FOREACH_SEQ(__VA_ARGS__, \ + _SD_VARARGS_FOREACH_EVEN_50, _SD_VARARGS_FOREACH_EVEN_49, \ + _SD_VARARGS_FOREACH_EVEN_48, _SD_VARARGS_FOREACH_EVEN_47, \ + _SD_VARARGS_FOREACH_EVEN_46, _SD_VARARGS_FOREACH_EVEN_45, \ + _SD_VARARGS_FOREACH_EVEN_44, _SD_VARARGS_FOREACH_EVEN_43, \ + _SD_VARARGS_FOREACH_EVEN_42, _SD_VARARGS_FOREACH_EVEN_41, \ + _SD_VARARGS_FOREACH_EVEN_40, _SD_VARARGS_FOREACH_EVEN_39, \ + _SD_VARARGS_FOREACH_EVEN_38, _SD_VARARGS_FOREACH_EVEN_37, \ + _SD_VARARGS_FOREACH_EVEN_36, _SD_VARARGS_FOREACH_EVEN_35, \ + _SD_VARARGS_FOREACH_EVEN_34, _SD_VARARGS_FOREACH_EVEN_33, \ + _SD_VARARGS_FOREACH_EVEN_32, _SD_VARARGS_FOREACH_EVEN_31, \ + _SD_VARARGS_FOREACH_EVEN_30, _SD_VARARGS_FOREACH_EVEN_29, \ + _SD_VARARGS_FOREACH_EVEN_28, _SD_VARARGS_FOREACH_EVEN_27, \ + _SD_VARARGS_FOREACH_EVEN_26, _SD_VARARGS_FOREACH_EVEN_25, \ + _SD_VARARGS_FOREACH_EVEN_24, _SD_VARARGS_FOREACH_EVEN_23, \ + _SD_VARARGS_FOREACH_EVEN_22, _SD_VARARGS_FOREACH_EVEN_21, \ + _SD_VARARGS_FOREACH_EVEN_20, _SD_VARARGS_FOREACH_EVEN_19, \ + _SD_VARARGS_FOREACH_EVEN_18, _SD_VARARGS_FOREACH_EVEN_17, \ + _SD_VARARGS_FOREACH_EVEN_16, _SD_VARARGS_FOREACH_EVEN_15, \ + _SD_VARARGS_FOREACH_EVEN_14, _SD_VARARGS_FOREACH_EVEN_13, \ + _SD_VARARGS_FOREACH_EVEN_12, _SD_VARARGS_FOREACH_EVEN_11, \ + 
_SD_VARARGS_FOREACH_EVEN_10, _SD_VARARGS_FOREACH_EVEN_09, \ + _SD_VARARGS_FOREACH_EVEN_08, _SD_VARARGS_FOREACH_EVEN_07, \ + _SD_VARARGS_FOREACH_EVEN_06, _SD_VARARGS_FOREACH_EVEN_05, \ + _SD_VARARGS_FOREACH_EVEN_04, _SD_VARARGS_FOREACH_EVEN_03, \ + _SD_VARARGS_FOREACH_EVEN_02, _SD_VARARGS_FOREACH_EVEN_01) \ + (FN, __VA_ARGS__) + +#define _SD_VARARGS_FOREACH_ODD_01(FN, X) +#define _SD_VARARGS_FOREACH_ODD_02(FN, X, Y) FN(Y) +#define _SD_VARARGS_FOREACH_ODD_04(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_02(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_06(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_04(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_08(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_06(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_10(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_08(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_12(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_10(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_14(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_12(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_16(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_14(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_18(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_16(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_20(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_18(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_22(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_20(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_24(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_22(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_26(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_24(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_28(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_26(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_30(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_28(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_32(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_30(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_34(FN, X, Y, ...) 
FN(Y) _SD_VARARGS_FOREACH_ODD_32(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_36(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_34(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_38(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_36(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_40(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_38(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_42(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_40(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_44(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_42(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_46(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_44(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_48(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_46(FN, __VA_ARGS__) +#define _SD_VARARGS_FOREACH_ODD_50(FN, X, Y, ...) FN(Y) _SD_VARARGS_FOREACH_ODD_48(FN, __VA_ARGS__) + +#define _SD_VARARGS_FOREACH_ODD(FN, ...) \ + _SD_VARARGS_FOREACH_SEQ(__VA_ARGS__, \ + _SD_VARARGS_FOREACH_ODD_50, _SD_VARARGS_FOREACH_ODD_49, \ + _SD_VARARGS_FOREACH_ODD_48, _SD_VARARGS_FOREACH_ODD_47, \ + _SD_VARARGS_FOREACH_ODD_46, _SD_VARARGS_FOREACH_ODD_45, \ + _SD_VARARGS_FOREACH_ODD_44, _SD_VARARGS_FOREACH_ODD_43, \ + _SD_VARARGS_FOREACH_ODD_42, _SD_VARARGS_FOREACH_ODD_41, \ + _SD_VARARGS_FOREACH_ODD_40, _SD_VARARGS_FOREACH_ODD_39, \ + _SD_VARARGS_FOREACH_ODD_38, _SD_VARARGS_FOREACH_ODD_37, \ + _SD_VARARGS_FOREACH_ODD_36, _SD_VARARGS_FOREACH_ODD_35, \ + _SD_VARARGS_FOREACH_ODD_34, _SD_VARARGS_FOREACH_ODD_33, \ + _SD_VARARGS_FOREACH_ODD_32, _SD_VARARGS_FOREACH_ODD_31, \ + _SD_VARARGS_FOREACH_ODD_30, _SD_VARARGS_FOREACH_ODD_29, \ + _SD_VARARGS_FOREACH_ODD_28, _SD_VARARGS_FOREACH_ODD_27, \ + _SD_VARARGS_FOREACH_ODD_26, _SD_VARARGS_FOREACH_ODD_25, \ + _SD_VARARGS_FOREACH_ODD_24, _SD_VARARGS_FOREACH_ODD_23, \ + _SD_VARARGS_FOREACH_ODD_22, _SD_VARARGS_FOREACH_ODD_21, \ + _SD_VARARGS_FOREACH_ODD_20, _SD_VARARGS_FOREACH_ODD_19, \ + _SD_VARARGS_FOREACH_ODD_18, _SD_VARARGS_FOREACH_ODD_17, \ + _SD_VARARGS_FOREACH_ODD_16, 
_SD_VARARGS_FOREACH_ODD_15, \ + _SD_VARARGS_FOREACH_ODD_14, _SD_VARARGS_FOREACH_ODD_13, \ + _SD_VARARGS_FOREACH_ODD_12, _SD_VARARGS_FOREACH_ODD_11, \ + _SD_VARARGS_FOREACH_ODD_10, _SD_VARARGS_FOREACH_ODD_09, \ + _SD_VARARGS_FOREACH_ODD_08, _SD_VARARGS_FOREACH_ODD_07, \ + _SD_VARARGS_FOREACH_ODD_06, _SD_VARARGS_FOREACH_ODD_05, \ + _SD_VARARGS_FOREACH_ODD_04, _SD_VARARGS_FOREACH_ODD_03, \ + _SD_VARARGS_FOREACH_ODD_02, _SD_VARARGS_FOREACH_ODD_01) \ + (FN, __VA_ARGS__) + +#define SD_BUS_ARGS(...) __VA_ARGS__ +#define SD_BUS_RESULT(...) __VA_ARGS__ + +#define SD_BUS_NO_ARGS SD_BUS_ARGS(NULL) +#define SD_BUS_NO_RESULT SD_BUS_RESULT(NULL) + +#define SD_BUS_METHOD_WITH_ARGS(_member, _args, _result, _handler, _flags) \ + SD_BUS_METHOD_WITH_NAMES(_member, \ + _SD_VARARGS_FOREACH_EVEN(_SD_ECHO, _args), \ + _SD_VARARGS_FOREACH_ODD(_SD_CONCAT, _args), \ + _SD_VARARGS_FOREACH_EVEN(_SD_ECHO, _result), \ + _SD_VARARGS_FOREACH_ODD(_SD_CONCAT, _result) "\0", \ + _handler, _flags) + +#define SD_BUS_METHOD_WITH_ARGS_OFFSET(_member, _args, _result, _handler, _offset, _flags) \ + SD_BUS_METHOD_WITH_NAMES_OFFSET(_member, \ + _SD_VARARGS_FOREACH_EVEN(_SD_ECHO, _args), \ + _SD_VARARGS_FOREACH_ODD(_SD_CONCAT, _args), \ + _SD_VARARGS_FOREACH_EVEN(_SD_ECHO, _result), \ + _SD_VARARGS_FOREACH_ODD(_SD_CONCAT, _result) "\0", \ + _handler, _offset, _flags) + +#define SD_BUS_SIGNAL_WITH_ARGS(_member, _args, _flags) \ + SD_BUS_SIGNAL_WITH_NAMES(_member, \ + _SD_VARARGS_FOREACH_EVEN(_SD_ECHO, _args), \ + _SD_VARARGS_FOREACH_ODD(_SD_CONCAT, _args) "\0", \ + _flags) + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-bus.h b/src/systemd/sd-bus.h new file mode 100644 index 0000000..bd3da36 --- /dev/null +++ b/src/systemd/sd-bus.h @@ -0,0 +1,541 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdbushfoo +#define foosdbushfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + 
the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include +#include +#include + +#include "sd-event.h" +#include "sd-id128.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +#define SD_BUS_DEFAULT ((sd_bus *) 1) +#define SD_BUS_DEFAULT_USER ((sd_bus *) 2) +#define SD_BUS_DEFAULT_SYSTEM ((sd_bus *) 3) + +/* Types */ + +typedef struct sd_bus sd_bus; +typedef struct sd_bus_message sd_bus_message; +typedef struct sd_bus_slot sd_bus_slot; +typedef struct sd_bus_creds sd_bus_creds; +typedef struct sd_bus_track sd_bus_track; + +typedef struct { + const char *name; + const char *message; + int _need_free; +} sd_bus_error; + +typedef struct { + const char *name; + int code; +} sd_bus_error_map; + +/* Flags */ + +__extension__ enum { + SD_BUS_CREDS_PID = 1ULL << 0, + SD_BUS_CREDS_TID = 1ULL << 1, + SD_BUS_CREDS_PPID = 1ULL << 2, + SD_BUS_CREDS_UID = 1ULL << 3, + SD_BUS_CREDS_EUID = 1ULL << 4, + SD_BUS_CREDS_SUID = 1ULL << 5, + SD_BUS_CREDS_FSUID = 1ULL << 6, + SD_BUS_CREDS_GID = 1ULL << 7, + SD_BUS_CREDS_EGID = 1ULL << 8, + SD_BUS_CREDS_SGID = 1ULL << 9, + SD_BUS_CREDS_FSGID = 1ULL << 10, + SD_BUS_CREDS_SUPPLEMENTARY_GIDS = 1ULL << 11, + SD_BUS_CREDS_COMM = 1ULL << 12, + SD_BUS_CREDS_TID_COMM = 1ULL << 13, + SD_BUS_CREDS_EXE = 1ULL << 14, + SD_BUS_CREDS_CMDLINE = 1ULL << 15, + SD_BUS_CREDS_CGROUP = 1ULL << 16, + SD_BUS_CREDS_UNIT = 1ULL << 17, + SD_BUS_CREDS_SLICE = 1ULL << 18, + SD_BUS_CREDS_USER_UNIT = 1ULL << 19, + SD_BUS_CREDS_USER_SLICE = 1ULL << 20, + SD_BUS_CREDS_SESSION = 1ULL << 21, + SD_BUS_CREDS_OWNER_UID = 1ULL << 22, 
+ SD_BUS_CREDS_EFFECTIVE_CAPS = 1ULL << 23, + SD_BUS_CREDS_PERMITTED_CAPS = 1ULL << 24, + SD_BUS_CREDS_INHERITABLE_CAPS = 1ULL << 25, + SD_BUS_CREDS_BOUNDING_CAPS = 1ULL << 26, + SD_BUS_CREDS_SELINUX_CONTEXT = 1ULL << 27, + SD_BUS_CREDS_AUDIT_SESSION_ID = 1ULL << 28, + SD_BUS_CREDS_AUDIT_LOGIN_UID = 1ULL << 29, + SD_BUS_CREDS_TTY = 1ULL << 30, + SD_BUS_CREDS_UNIQUE_NAME = 1ULL << 31, + SD_BUS_CREDS_WELL_KNOWN_NAMES = 1ULL << 32, + SD_BUS_CREDS_DESCRIPTION = 1ULL << 33, + SD_BUS_CREDS_AUGMENT = 1ULL << 63, /* special flag, if on sd-bus will augment creds struct, in a potentially race-full way. */ + _SD_BUS_CREDS_ALL = (1ULL << 34) -1 /* bits 0..33 set: every field flag from SD_BUS_CREDS_PID through SD_BUS_CREDS_DESCRIPTION, but not SD_BUS_CREDS_AUGMENT (bit 63) */ +}; + +__extension__ enum { + SD_BUS_NAME_REPLACE_EXISTING = 1ULL << 0, + SD_BUS_NAME_ALLOW_REPLACEMENT = 1ULL << 1, + SD_BUS_NAME_QUEUE = 1ULL << 2 /* NOTE(review): presumably the values accepted by the flags parameter of sd_bus_request_name*() - confirm against the implementation */ +}; + +__extension__ enum { + SD_BUS_MESSAGE_DUMP_WITH_HEADER = 1ULL << 0, + SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY = 1ULL << 1, + _SD_BUS_MESSAGE_DUMP_KNOWN_FLAGS = SD_BUS_MESSAGE_DUMP_WITH_HEADER | SD_BUS_MESSAGE_DUMP_SUBTREE_ONLY /* bitwise OR of the two dump flags above */ +}; + +/* Callbacks: function-pointer types returning int; all except sd_bus_track_handler_t take a trailing sd_bus_error *ret_error parameter, and all take an opaque void *userdata */ + +typedef int (*sd_bus_message_handler_t)(sd_bus_message *m, void *userdata, sd_bus_error *ret_error); +typedef int (*sd_bus_property_get_t) (sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +typedef int (*sd_bus_property_set_t) (sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *ret_error); +typedef int (*sd_bus_object_find_t) (sd_bus *bus, const char *path, const char *interface, void *userdata, void **ret_found, sd_bus_error *ret_error); +typedef int (*sd_bus_node_enumerator_t) (sd_bus *bus, const char *prefix, void *userdata, char ***ret_nodes, sd_bus_error *ret_error); +typedef int (*sd_bus_track_handler_t) (sd_bus_track *track, void *userdata); +typedef _sd_destroy_t sd_bus_destroy_t; /* alias of _sd_destroy_t, which is not declared in this header (presumably it comes from _sd-common.h - confirm) */ + +#include "sd-bus-protocol.h" +#include "sd-bus-vtable.h" + +/* Naming */ + +int 
sd_bus_interface_name_is_valid(const char *p); +int sd_bus_service_name_is_valid(const char *p); +int sd_bus_member_name_is_valid(const char *p); +int sd_bus_object_path_is_valid(const char *p); + +/* Connections */ + +int sd_bus_default(sd_bus **ret); +int sd_bus_default_user(sd_bus **ret); +int sd_bus_default_system(sd_bus **ret); + +int sd_bus_open(sd_bus **ret); +int sd_bus_open_with_description(sd_bus **ret, const char *description); +int sd_bus_open_user(sd_bus **ret); +int sd_bus_open_user_with_description(sd_bus **ret, const char *description); +int sd_bus_open_user_machine(sd_bus **ret, const char *machine); +int sd_bus_open_system(sd_bus **ret); +int sd_bus_open_system_with_description(sd_bus **ret, const char *description); +int sd_bus_open_system_remote(sd_bus **ret, const char *host); +int sd_bus_open_system_machine(sd_bus **ret, const char *machine); + +int sd_bus_new(sd_bus **ret); + +int sd_bus_set_address(sd_bus *bus, const char *address); +int sd_bus_set_fd(sd_bus *bus, int input_fd, int output_fd); +int sd_bus_set_exec(sd_bus *bus, const char *path, char *const *argv); +int sd_bus_get_address(sd_bus *bus, const char **address); +int sd_bus_set_bus_client(sd_bus *bus, int b); +int sd_bus_is_bus_client(sd_bus *bus); +int sd_bus_set_server(sd_bus *bus, int b, sd_id128_t bus_id); +int sd_bus_is_server(sd_bus *bus); +int sd_bus_set_anonymous(sd_bus *bus, int b); +int sd_bus_is_anonymous(sd_bus *bus); +int sd_bus_set_trusted(sd_bus *bus, int b); +int sd_bus_is_trusted(sd_bus *bus); +int sd_bus_set_monitor(sd_bus *bus, int b); +int sd_bus_is_monitor(sd_bus *bus); +int sd_bus_set_description(sd_bus *bus, const char *description); +int sd_bus_get_description(sd_bus *bus, const char **description); +int sd_bus_negotiate_creds(sd_bus *bus, int b, uint64_t creds_mask); +int sd_bus_negotiate_timestamp(sd_bus *bus, int b); +int sd_bus_negotiate_fds(sd_bus *bus, int b); +int sd_bus_can_send(sd_bus *bus, char type); +int sd_bus_get_creds_mask(sd_bus *bus, 
uint64_t *creds_mask); +int sd_bus_set_allow_interactive_authorization(sd_bus *bus, int b); +int sd_bus_get_allow_interactive_authorization(sd_bus *bus); +int sd_bus_set_exit_on_disconnect(sd_bus *bus, int b); +int sd_bus_get_exit_on_disconnect(sd_bus *bus); +int sd_bus_set_close_on_exit(sd_bus *bus, int b); +int sd_bus_get_close_on_exit(sd_bus *bus); +int sd_bus_set_watch_bind(sd_bus *bus, int b); +int sd_bus_get_watch_bind(sd_bus *bus); +int sd_bus_set_connected_signal(sd_bus *bus, int b); +int sd_bus_get_connected_signal(sd_bus *bus); +int sd_bus_set_sender(sd_bus *bus, const char *sender); +int sd_bus_get_sender(sd_bus *bus, const char **ret); + +int sd_bus_start(sd_bus *bus); + +int sd_bus_try_close(sd_bus *bus) _sd_deprecated_; +void sd_bus_close(sd_bus *bus); + +sd_bus* sd_bus_ref(sd_bus *bus); +sd_bus* sd_bus_unref(sd_bus *bus); +sd_bus* sd_bus_close_unref(sd_bus *bus); +sd_bus* sd_bus_flush_close_unref(sd_bus *bus); + +void sd_bus_default_flush_close(void); + +int sd_bus_is_open(sd_bus *bus); +int sd_bus_is_ready(sd_bus *bus); + +int sd_bus_get_bus_id(sd_bus *bus, sd_id128_t *id); +int sd_bus_get_scope(sd_bus *bus, const char **scope); +int sd_bus_get_tid(sd_bus *bus, pid_t *tid); +int sd_bus_get_owner_creds(sd_bus *bus, uint64_t creds_mask, sd_bus_creds **ret); + +int sd_bus_send(sd_bus *bus, sd_bus_message *m, uint64_t *cookie); +int sd_bus_send_to(sd_bus *bus, sd_bus_message *m, const char *destination, uint64_t *cookie); +int sd_bus_call(sd_bus *bus, sd_bus_message *m, uint64_t usec, sd_bus_error *ret_error, sd_bus_message **reply); +int sd_bus_call_async(sd_bus *bus, sd_bus_slot **slot, sd_bus_message *m, sd_bus_message_handler_t callback, void *userdata, uint64_t usec); + +int sd_bus_get_fd(sd_bus *bus); +int sd_bus_get_events(sd_bus *bus); +int sd_bus_get_timeout(sd_bus *bus, uint64_t *timeout_usec); +int sd_bus_process(sd_bus *bus, sd_bus_message **r); +int sd_bus_process_priority(sd_bus *bus, int64_t max_priority, sd_bus_message **r) 
_sd_deprecated_; +int sd_bus_wait(sd_bus *bus, uint64_t timeout_usec); +int sd_bus_flush(sd_bus *bus); +int sd_bus_enqueue_for_read(sd_bus *bus, sd_bus_message *m); + +sd_bus_slot* sd_bus_get_current_slot(sd_bus *bus); +sd_bus_message* sd_bus_get_current_message(sd_bus *bus); +sd_bus_message_handler_t sd_bus_get_current_handler(sd_bus *bus); +void* sd_bus_get_current_userdata(sd_bus *bus); + +int sd_bus_attach_event(sd_bus *bus, sd_event *e, int priority); +int sd_bus_detach_event(sd_bus *bus); +sd_event* sd_bus_get_event(sd_bus *bus); + +int sd_bus_get_n_queued_read(sd_bus *bus, uint64_t *ret); +int sd_bus_get_n_queued_write(sd_bus *bus, uint64_t *ret); + +int sd_bus_set_method_call_timeout(sd_bus *bus, uint64_t usec); +int sd_bus_get_method_call_timeout(sd_bus *bus, uint64_t *ret); + +int sd_bus_add_filter(sd_bus *bus, sd_bus_slot **slot, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_add_match(sd_bus *bus, sd_bus_slot **slot, const char *match, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_add_match_async(sd_bus *bus, sd_bus_slot **slot, const char *match, sd_bus_message_handler_t callback, sd_bus_message_handler_t install_callback, void *userdata); +int sd_bus_add_object(sd_bus *bus, sd_bus_slot **slot, const char *path, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_add_fallback(sd_bus *bus, sd_bus_slot **slot, const char *prefix, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_add_object_vtable(sd_bus *bus, sd_bus_slot **slot, const char *path, const char *interface, const sd_bus_vtable *vtable, void *userdata); +int sd_bus_add_fallback_vtable(sd_bus *bus, sd_bus_slot **slot, const char *prefix, const char *interface, const sd_bus_vtable *vtable, sd_bus_object_find_t find, void *userdata); +int sd_bus_add_node_enumerator(sd_bus *bus, sd_bus_slot **slot, const char *path, sd_bus_node_enumerator_t callback, void *userdata); +int sd_bus_add_object_manager(sd_bus *bus, sd_bus_slot **slot, const 
char *path); + +/* Slot object */ + +sd_bus_slot* sd_bus_slot_ref(sd_bus_slot *slot); +sd_bus_slot* sd_bus_slot_unref(sd_bus_slot *slot); + +sd_bus* sd_bus_slot_get_bus(sd_bus_slot *slot); +void* sd_bus_slot_get_userdata(sd_bus_slot *slot); +void* sd_bus_slot_set_userdata(sd_bus_slot *slot, void *userdata); +int sd_bus_slot_set_description(sd_bus_slot *slot, const char *description); +int sd_bus_slot_get_description(sd_bus_slot *slot, const char **description); +int sd_bus_slot_get_floating(sd_bus_slot *slot); +int sd_bus_slot_set_floating(sd_bus_slot *slot, int b); +int sd_bus_slot_set_destroy_callback(sd_bus_slot *s, sd_bus_destroy_t callback); +int sd_bus_slot_get_destroy_callback(sd_bus_slot *s, sd_bus_destroy_t *callback); + +sd_bus_message* sd_bus_slot_get_current_message(sd_bus_slot *slot); +sd_bus_message_handler_t sd_bus_slot_get_current_handler(sd_bus_slot *slot); +void* sd_bus_slot_get_current_userdata(sd_bus_slot *slot); + +/* Message object */ + +int sd_bus_message_new(sd_bus *bus, sd_bus_message **m, uint8_t type); +int sd_bus_message_new_signal(sd_bus *bus, sd_bus_message **m, const char *path, const char *interface, const char *member); +int sd_bus_message_new_signal_to(sd_bus *bus, sd_bus_message **m, const char *destination, const char *path, const char *interface, const char *member); +int sd_bus_message_new_method_call(sd_bus *bus, sd_bus_message **m, const char *destination, const char *path, const char *interface, const char *member); +int sd_bus_message_new_method_return(sd_bus_message *call, sd_bus_message **m); +int sd_bus_message_new_method_error(sd_bus_message *call, sd_bus_message **m, const sd_bus_error *e); +int sd_bus_message_new_method_errorf(sd_bus_message *call, sd_bus_message **m, const char *name, const char *format, ...) 
_sd_printf_(4, 5); +int sd_bus_message_new_method_errno(sd_bus_message *call, sd_bus_message **m, int error, const sd_bus_error *e); +int sd_bus_message_new_method_errnof(sd_bus_message *call, sd_bus_message **m, int error, const char *format, ...) _sd_printf_(4, 5); + +sd_bus_message* sd_bus_message_ref(sd_bus_message *m); +sd_bus_message* sd_bus_message_unref(sd_bus_message *m); + +int sd_bus_message_seal(sd_bus_message *m, uint64_t cookie, uint64_t timeout_usec); + +int sd_bus_message_get_type(sd_bus_message *m, uint8_t *type); +int sd_bus_message_get_cookie(sd_bus_message *m, uint64_t *cookie); +int sd_bus_message_get_reply_cookie(sd_bus_message *m, uint64_t *cookie); +int sd_bus_message_get_priority(sd_bus_message *m, int64_t *priority) _sd_deprecated_; + +int sd_bus_message_get_expect_reply(sd_bus_message *m); +int sd_bus_message_get_auto_start(sd_bus_message *m); +int sd_bus_message_get_allow_interactive_authorization(sd_bus_message *m); + +const char* sd_bus_message_get_signature(sd_bus_message *m, int complete); +const char* sd_bus_message_get_path(sd_bus_message *m); +const char* sd_bus_message_get_interface(sd_bus_message *m); +const char* sd_bus_message_get_member(sd_bus_message *m); +const char* sd_bus_message_get_destination(sd_bus_message *m); +const char* sd_bus_message_get_sender(sd_bus_message *m); +const sd_bus_error* sd_bus_message_get_error(sd_bus_message *m); +int sd_bus_message_get_errno(sd_bus_message *m); + +int sd_bus_message_get_monotonic_usec(sd_bus_message *m, uint64_t *usec); +int sd_bus_message_get_realtime_usec(sd_bus_message *m, uint64_t *usec); +int sd_bus_message_get_seqnum(sd_bus_message *m, uint64_t *seqnum); + +sd_bus* sd_bus_message_get_bus(sd_bus_message *m); +sd_bus_creds* sd_bus_message_get_creds(sd_bus_message *m); /* do not unref the result */ + +int sd_bus_message_is_signal(sd_bus_message *m, const char *interface, const char *member); +int sd_bus_message_is_method_call(sd_bus_message *m, const char *interface, const 
char *member); +int sd_bus_message_is_method_error(sd_bus_message *m, const char *name); +int sd_bus_message_is_empty(sd_bus_message *m); +int sd_bus_message_has_signature(sd_bus_message *m, const char *signature); + +int sd_bus_message_set_expect_reply(sd_bus_message *m, int b); +int sd_bus_message_set_auto_start(sd_bus_message *m, int b); +int sd_bus_message_set_allow_interactive_authorization(sd_bus_message *m, int b); + +int sd_bus_message_set_destination(sd_bus_message *m, const char *destination); +int sd_bus_message_set_sender(sd_bus_message *m, const char *sender); +int sd_bus_message_set_priority(sd_bus_message *m, int64_t priority) _sd_deprecated_; + +int sd_bus_message_append(sd_bus_message *m, const char *types, ...); +int sd_bus_message_appendv(sd_bus_message *m, const char *types, va_list ap); +int sd_bus_message_append_basic(sd_bus_message *m, char type, const void *p); +int sd_bus_message_append_array(sd_bus_message *m, char type, const void *ptr, size_t size); +int sd_bus_message_append_array_space(sd_bus_message *m, char type, size_t size, void **ptr); +int sd_bus_message_append_array_iovec(sd_bus_message *m, char type, const struct iovec *iov, unsigned n); +int sd_bus_message_append_array_memfd(sd_bus_message *m, char type, int memfd, uint64_t offset, uint64_t size); +int sd_bus_message_append_string_space(sd_bus_message *m, size_t size, char **s); +int sd_bus_message_append_string_iovec(sd_bus_message *m, const struct iovec *iov, unsigned n); +int sd_bus_message_append_string_memfd(sd_bus_message *m, int memfd, uint64_t offset, uint64_t size); +int sd_bus_message_append_strv(sd_bus_message *m, char **l); +int sd_bus_message_open_container(sd_bus_message *m, char type, const char *contents); +int sd_bus_message_close_container(sd_bus_message *m); +int sd_bus_message_copy(sd_bus_message *m, sd_bus_message *source, int all); + +int sd_bus_message_read(sd_bus_message *m, const char *types, ...); +int sd_bus_message_readv(sd_bus_message *m, const 
char *types, va_list ap); +int sd_bus_message_read_basic(sd_bus_message *m, char type, void *p); +int sd_bus_message_read_array(sd_bus_message *m, char type, const void **ptr, size_t *size); +int sd_bus_message_read_strv(sd_bus_message *m, char ***l); /* free the result! */ +int sd_bus_message_read_strv_extend(sd_bus_message *m, char ***l); +int sd_bus_message_skip(sd_bus_message *m, const char *types); +int sd_bus_message_enter_container(sd_bus_message *m, char type, const char *contents); +int sd_bus_message_exit_container(sd_bus_message *m); +int sd_bus_message_peek_type(sd_bus_message *m, char *type, const char **contents); +int sd_bus_message_verify_type(sd_bus_message *m, char type, const char *contents); +int sd_bus_message_at_end(sd_bus_message *m, int complete); +int sd_bus_message_rewind(sd_bus_message *m, int complete); +int sd_bus_message_sensitive(sd_bus_message *m); + +int sd_bus_message_dump(sd_bus_message *m, FILE *f, uint64_t flags); + +/* Bus management */ + +int sd_bus_get_unique_name(sd_bus *bus, const char **unique); +int sd_bus_request_name(sd_bus *bus, const char *name, uint64_t flags); +int sd_bus_request_name_async(sd_bus *bus, sd_bus_slot **ret_slot, const char *name, uint64_t flags, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_release_name(sd_bus *bus, const char *name); +int sd_bus_release_name_async(sd_bus *bus, sd_bus_slot **ret_slot, const char *name, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_list_names(sd_bus *bus, char ***acquired, char ***activatable); /* free the results */ +int sd_bus_get_name_creds(sd_bus *bus, const char *name, uint64_t mask, sd_bus_creds **creds); /* unref the result! 
*/ +int sd_bus_get_name_machine_id(sd_bus *bus, const char *name, sd_id128_t *machine); + +/* Convenience calls */ + +int sd_bus_message_send(sd_bus_message *m); +int sd_bus_call_methodv(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, sd_bus_message **reply, const char *types, va_list ap); +int sd_bus_call_method(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, sd_bus_message **reply, const char *types, ...); +int sd_bus_call_method_asyncv(sd_bus *bus, sd_bus_slot **slot, const char *destination, const char *path, const char *interface, const char *member, sd_bus_message_handler_t callback, void *userdata, const char *types, va_list ap); +int sd_bus_call_method_async(sd_bus *bus, sd_bus_slot **slot, const char *destination, const char *path, const char *interface, const char *member, sd_bus_message_handler_t callback, void *userdata, const char *types, ...); +int sd_bus_get_property(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, sd_bus_message **reply, const char *type); +int sd_bus_get_property_trivial(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, char type, void *ret_ptr); +int sd_bus_get_property_string(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, char **ret); /* free the result! */ +int sd_bus_get_property_strv(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, char ***ret); /* free the result! 
*/ +int sd_bus_set_propertyv(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, const char *type, va_list ap); +int sd_bus_set_property(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, sd_bus_error *ret_error, const char *type, ...); + +int sd_bus_reply_method_returnv(sd_bus_message *call, const char *types, va_list ap); +int sd_bus_reply_method_return(sd_bus_message *call, const char *types, ...); +int sd_bus_reply_method_error(sd_bus_message *call, const sd_bus_error *e); +int sd_bus_reply_method_errorfv(sd_bus_message *call, const char *name, const char *format, va_list ap) _sd_printf_(3, 0); +int sd_bus_reply_method_errorf(sd_bus_message *call, const char *name, const char *format, ...) _sd_printf_(3, 4); +int sd_bus_reply_method_errno(sd_bus_message *call, int error, const sd_bus_error *e); +int sd_bus_reply_method_errnofv(sd_bus_message *call, int error, const char *format, va_list ap) _sd_printf_(3, 0); +int sd_bus_reply_method_errnof(sd_bus_message *call, int error, const char *format, ...) _sd_printf_(3, 4); + +int sd_bus_emit_signalv(sd_bus *bus, const char *path, const char *interface, const char *member, const char *types, va_list ap); +int sd_bus_emit_signal(sd_bus *bus, const char *path, const char *interface, const char *member, const char *types, ...); +int sd_bus_emit_signal_tov(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, const char *types, va_list ap); +int sd_bus_emit_signal_to(sd_bus *bus, const char *destination, const char *path, const char *interface, const char *member, const char *types, ...); + +int sd_bus_emit_properties_changed_strv(sd_bus *bus, const char *path, const char *interface, char **names); +int sd_bus_emit_properties_changed(sd_bus *bus, const char *path, const char *interface, const char *name, ...) 
_sd_sentinel_; + +int sd_bus_emit_object_added(sd_bus *bus, const char *path); +int sd_bus_emit_object_removed(sd_bus *bus, const char *path); +int sd_bus_emit_interfaces_added_strv(sd_bus *bus, const char *path, char **interfaces); +int sd_bus_emit_interfaces_added(sd_bus *bus, const char *path, const char *interface, ...) _sd_sentinel_; +int sd_bus_emit_interfaces_removed_strv(sd_bus *bus, const char *path, char **interfaces); +int sd_bus_emit_interfaces_removed(sd_bus *bus, const char *path, const char *interface, ...) _sd_sentinel_; + +int sd_bus_query_sender_creds(sd_bus_message *m, uint64_t mask, sd_bus_creds **creds); +int sd_bus_query_sender_privilege(sd_bus_message *m, int capability); + +int sd_bus_match_signal(sd_bus *bus, sd_bus_slot **ret, const char *sender, const char *path, const char *interface, const char *member, sd_bus_message_handler_t callback, void *userdata); +int sd_bus_match_signal_async(sd_bus *bus, sd_bus_slot **ret, const char *sender, const char *path, const char *interface, const char *member, sd_bus_message_handler_t match_callback, sd_bus_message_handler_t add_callback, void *userdata); + +/* Credential handling */ + +int sd_bus_creds_new_from_pid(sd_bus_creds **ret, pid_t pid, uint64_t creds_mask); +sd_bus_creds* sd_bus_creds_ref(sd_bus_creds *c); +sd_bus_creds* sd_bus_creds_unref(sd_bus_creds *c); +uint64_t sd_bus_creds_get_mask(const sd_bus_creds *c); +uint64_t sd_bus_creds_get_augmented_mask(const sd_bus_creds *c); + +int sd_bus_creds_get_pid(sd_bus_creds *c, pid_t *pid); +int sd_bus_creds_get_ppid(sd_bus_creds *c, pid_t *ppid); +int sd_bus_creds_get_tid(sd_bus_creds *c, pid_t *tid); +int sd_bus_creds_get_uid(sd_bus_creds *c, uid_t *uid); +int sd_bus_creds_get_euid(sd_bus_creds *c, uid_t *euid); +int sd_bus_creds_get_suid(sd_bus_creds *c, uid_t *suid); +int sd_bus_creds_get_fsuid(sd_bus_creds *c, uid_t *fsuid); +int sd_bus_creds_get_gid(sd_bus_creds *c, gid_t *gid); +int sd_bus_creds_get_egid(sd_bus_creds *c, gid_t *egid); +int 
sd_bus_creds_get_sgid(sd_bus_creds *c, gid_t *sgid); +int sd_bus_creds_get_fsgid(sd_bus_creds *c, gid_t *fsgid); +int sd_bus_creds_get_supplementary_gids(sd_bus_creds *c, const gid_t **gids); +int sd_bus_creds_get_comm(sd_bus_creds *c, const char **comm); +int sd_bus_creds_get_tid_comm(sd_bus_creds *c, const char **comm); +int sd_bus_creds_get_exe(sd_bus_creds *c, const char **exe); +int sd_bus_creds_get_cmdline(sd_bus_creds *c, char ***cmdline); +int sd_bus_creds_get_cgroup(sd_bus_creds *c, const char **cgroup); +int sd_bus_creds_get_unit(sd_bus_creds *c, const char **unit); +int sd_bus_creds_get_slice(sd_bus_creds *c, const char **slice); +int sd_bus_creds_get_user_unit(sd_bus_creds *c, const char **unit); +int sd_bus_creds_get_user_slice(sd_bus_creds *c, const char **slice); +int sd_bus_creds_get_session(sd_bus_creds *c, const char **session); +int sd_bus_creds_get_owner_uid(sd_bus_creds *c, uid_t *uid); +int sd_bus_creds_has_effective_cap(sd_bus_creds *c, int capability); +int sd_bus_creds_has_permitted_cap(sd_bus_creds *c, int capability); +int sd_bus_creds_has_inheritable_cap(sd_bus_creds *c, int capability); +int sd_bus_creds_has_bounding_cap(sd_bus_creds *c, int capability); +int sd_bus_creds_get_selinux_context(sd_bus_creds *c, const char **context); +int sd_bus_creds_get_audit_session_id(sd_bus_creds *c, uint32_t *sessionid); +int sd_bus_creds_get_audit_login_uid(sd_bus_creds *c, uid_t *loginuid); +int sd_bus_creds_get_tty(sd_bus_creds *c, const char **tty); +int sd_bus_creds_get_unique_name(sd_bus_creds *c, const char **name); +int sd_bus_creds_get_well_known_names(sd_bus_creds *c, char ***names); +int sd_bus_creds_get_description(sd_bus_creds *c, const char **name); + +/* Error structures */ + +#define SD_BUS_ERROR_MAKE_CONST(name, message) ((const sd_bus_error) {(name), (message), 0}) +#define SD_BUS_ERROR_NULL SD_BUS_ERROR_MAKE_CONST(NULL, NULL) + +void sd_bus_error_free(sd_bus_error *e); +int sd_bus_error_set(sd_bus_error *e, const char *name, const 
char *message); +int sd_bus_error_setf(sd_bus_error *e, const char *name, const char *format, ...) _sd_printf_(3, 4); +int sd_bus_error_setfv(sd_bus_error *e, const char *name, const char *format, va_list ap) _sd_printf_(3,0); + +int sd_bus_error_set_const(sd_bus_error *e, const char *name, const char *message); +int sd_bus_error_set_errno(sd_bus_error *e, int error); +int sd_bus_error_set_errnof(sd_bus_error *e, int error, const char *format, ...) _sd_printf_(3, 4); +int sd_bus_error_set_errnofv(sd_bus_error *e, int error, const char *format, va_list ap) _sd_printf_(3,0); +int sd_bus_error_get_errno(const sd_bus_error *e); +int sd_bus_error_copy(sd_bus_error *dest, const sd_bus_error *e); +int sd_bus_error_move(sd_bus_error *dest, sd_bus_error *e); +int sd_bus_error_is_set(const sd_bus_error *e); +int sd_bus_error_has_name(const sd_bus_error *e, const char *name); +int sd_bus_error_has_names_sentinel(const sd_bus_error *e, ...) _sd_sentinel_; +#define sd_bus_error_has_names(e, ...) sd_bus_error_has_names_sentinel(e, __VA_ARGS__, NULL) /* wrapper that appends the terminating NULL to the argument list of the _sd_sentinel_ variadic function above */ + +#define SD_BUS_ERROR_MAP(_name, _code) \ + { \ + .name = _name, \ + .code = _code, \ + } /* expands to a designated initializer setting the .name and .code members (the two fields of sd_bus_error_map) */ +#define SD_BUS_ERROR_MAP_END \ + { \ + .name = NULL, \ + .code = - 'x', \ + } /* end-of-table entry: NULL name, and the negated character constant 'x' as code */ + +int sd_bus_error_add_map(const sd_bus_error_map *map); + +/* Auxiliary macros: each expands to a bare comma-separated list - the literal 16 followed by the 16 elements of (x).bytes (READ variant: their addresses); presumably meant to be spliced into the variadic arguments of sd_bus_message_append()/sd_bus_message_read() - confirm. Note that x is evaluated 16 times. */ + +#define SD_BUS_MESSAGE_APPEND_ID128(x) 16, \ + (x).bytes[0], (x).bytes[1], (x).bytes[2], (x).bytes[3], \ + (x).bytes[4], (x).bytes[5], (x).bytes[6], (x).bytes[7], \ + (x).bytes[8], (x).bytes[9], (x).bytes[10], (x).bytes[11], \ + (x).bytes[12], (x).bytes[13], (x).bytes[14], (x).bytes[15] + +#define SD_BUS_MESSAGE_READ_ID128(x) 16, \ + &(x).bytes[0], &(x).bytes[1], &(x).bytes[2], &(x).bytes[3], \ + &(x).bytes[4], &(x).bytes[5], &(x).bytes[6], &(x).bytes[7], \ + &(x).bytes[8], &(x).bytes[9], &(x).bytes[10], &(x).bytes[11], \ + &(x).bytes[12], &(x).bytes[13], &(x).bytes[14], &(x).bytes[15] + +/* Label escaping */ + +int sd_bus_path_encode(const char *prefix, const char *external_id, 
char **ret_path); +int sd_bus_path_encode_many(char **out, const char *path_template, ...); +int sd_bus_path_decode(const char *path, const char *prefix, char **ret_external_id); +int sd_bus_path_decode_many(const char *path, const char *path_template, ...); + +/* Tracking peers */ + +int sd_bus_track_new(sd_bus *bus, sd_bus_track **track, sd_bus_track_handler_t handler, void *userdata); +sd_bus_track* sd_bus_track_ref(sd_bus_track *track); +sd_bus_track* sd_bus_track_unref(sd_bus_track *track); + +sd_bus* sd_bus_track_get_bus(sd_bus_track *track); +void* sd_bus_track_get_userdata(sd_bus_track *track); +void* sd_bus_track_set_userdata(sd_bus_track *track, void *userdata); + +int sd_bus_track_add_sender(sd_bus_track *track, sd_bus_message *m); +int sd_bus_track_remove_sender(sd_bus_track *track, sd_bus_message *m); +int sd_bus_track_add_name(sd_bus_track *track, const char *name); +int sd_bus_track_remove_name(sd_bus_track *track, const char *name); + +int sd_bus_track_set_recursive(sd_bus_track *track, int b); +int sd_bus_track_get_recursive(sd_bus_track *track); + +unsigned sd_bus_track_count(sd_bus_track *track); +int sd_bus_track_count_sender(sd_bus_track *track, sd_bus_message *m); +int sd_bus_track_count_name(sd_bus_track *track, const char *name); + +const char* sd_bus_track_contains(sd_bus_track *track, const char *name); +const char* sd_bus_track_first(sd_bus_track *track); +const char* sd_bus_track_next(sd_bus_track *track); + +int sd_bus_track_set_destroy_callback(sd_bus_track *s, sd_bus_destroy_t callback); +int sd_bus_track_get_destroy_callback(sd_bus_track *s, sd_bus_destroy_t *ret); + +/* Define helpers so that __attribute__((cleanup(sd_bus_unrefp))) and similar may be used. 
*/ +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus, sd_bus_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus, sd_bus_close_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus, sd_bus_flush_close_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus_slot, sd_bus_slot_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus_message, sd_bus_message_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus_creds, sd_bus_creds_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_bus_track, sd_bus_track_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-daemon.h b/src/systemd/sd-daemon.h new file mode 100644 index 0000000..595b6f3 --- /dev/null +++ b/src/systemd/sd-daemon.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddaemonhfoo +#define foosddaemonhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include <inttypes.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* + The following functionality is provided: + + - Support for logging with log levels on stderr + - File descriptor passing for socket-based activation + - Daemon startup and status notification + - Detection of systemd boots + + See sd-daemon(3) for more information. +*/ + +/* + Log levels for usage on stderr: + + fprintf(stderr, SD_NOTICE "Hello World!\n"); + + This is similar to printk() usage in the kernel.
+*/ +#define SD_EMERG "<0>" /* system is unusable */ +#define SD_ALERT "<1>" /* action must be taken immediately */ +#define SD_CRIT "<2>" /* critical conditions */ +#define SD_ERR "<3>" /* error conditions */ +#define SD_WARNING "<4>" /* warning conditions */ +#define SD_NOTICE "<5>" /* normal but significant condition */ +#define SD_INFO "<6>" /* informational */ +#define SD_DEBUG "<7>" /* debug-level messages */ + +/* The first passed file descriptor is fd 3 */ +#define SD_LISTEN_FDS_START 3 + +/* + Returns how many file descriptors have been passed, or a negative + errno code on failure. Optionally, removes the $LISTEN_FDS and + $LISTEN_PID variables from the environment (recommended, but + problematic in threaded environments). If r is the return value of + this function you'll find the file descriptors passed as fds + SD_LISTEN_FDS_START to SD_LISTEN_FDS_START+r-1. Returns a negative + errno style error code on failure. This function call ensures that + the FD_CLOEXEC flag is set for the passed file descriptors, to make + sure they are not passed on to child processes. If FD_CLOEXEC shall + not be set, the caller needs to unset it after this call for all file + descriptors that are used. + + See sd_listen_fds(3) for more information. +*/ +int sd_listen_fds(int unset_environment); + +int sd_listen_fds_with_names(int unset_environment, char ***names); + +/* + Helper call for identifying a passed file descriptor. Returns 1 if + the file descriptor is a FIFO in the file system stored under the + specified path, 0 otherwise. If path is NULL a path name check will + not be done and the call only verifies if the file descriptor + refers to a FIFO. Returns a negative errno style error code on + failure. + + See sd_is_fifo(3) for more information. +*/ +int sd_is_fifo(int fd, const char *path); + +/* + Helper call for identifying a passed file descriptor.
Returns 1 if + the file descriptor is a special character device on the file + system stored under the specified path, 0 otherwise. + If path is NULL a path name check will not be done and the call + only verifies if the file descriptor refers to a special character device. + Returns a negative errno style error code on failure. + + See sd_is_special(3) for more information. +*/ +int sd_is_special(int fd, const char *path); + +/* + Helper call for identifying a passed file descriptor. Returns 1 if + the file descriptor is a socket of the specified family (AF_INET, + ...) and type (SOCK_DGRAM, SOCK_STREAM, ...), 0 otherwise. If + family is 0 a socket family check will not be done. If type is 0 a + socket type check will not be done and the call only verifies if + the file descriptor refers to a socket. If listening is > 0 it is + verified that the socket is in listening mode (i.e. listen() has + been called). If listening is == 0 it is verified that the socket is + not in listening mode. If listening is < 0 no listening mode check + is done. Returns a negative errno style error code on failure. + + See sd_is_socket(3) for more information. +*/ +int sd_is_socket(int fd, int family, int type, int listening); + +/* + Helper call for identifying a passed file descriptor. Returns 1 if + the file descriptor is an Internet socket, of the specified family + (either AF_INET or AF_INET6) and the specified type (SOCK_DGRAM, + SOCK_STREAM, ...), 0 otherwise. If family is 0 a socket family + check is not done. If type is 0 a socket type check will not be + done. If port is 0 a socket port check will not be done. The + listening flag is used the same way as in sd_is_socket(). Returns a + negative errno style error code on failure. + + See sd_is_socket_inet(3) for more information. +*/ +int sd_is_socket_inet(int fd, int family, int type, int listening, uint16_t port); + +/* + Helper call for identifying a passed file descriptor.
Returns 1 if the + file descriptor is an Internet socket of the specified type + (SOCK_DGRAM, SOCK_STREAM, ...), and if the address of the socket is + the same as the address specified by addr. The listening flag is used + the same way as in sd_is_socket(). Returns a negative errno style + error code on failure. + + See sd_is_socket_sockaddr(3) for more information. +*/ +int sd_is_socket_sockaddr(int fd, int type, const struct sockaddr* addr, unsigned addr_len, int listening); + +/* + Helper call for identifying a passed file descriptor. Returns 1 if + the file descriptor is an AF_UNIX socket of the specified type + (SOCK_DGRAM, SOCK_STREAM, ...) and path, 0 otherwise. If type is 0 + a socket type check will not be done. If path is NULL a socket path + check will not be done. For normal AF_UNIX sockets set length to + 0. For abstract namespace sockets set length to the length of the + socket name (including the initial 0 byte), and pass the full + socket path in path (including the initial 0 byte). The listening + flag is used the same way as in sd_is_socket(). Returns a negative + errno style error code on failure. + + See sd_is_socket_unix(3) for more information. +*/ +int sd_is_socket_unix(int fd, int type, int listening, const char *path, size_t length); + +/* + Helper call for identifying a passed file descriptor. Returns 1 if + the file descriptor is a POSIX Message Queue of the specified name, + 0 otherwise. If path is NULL a message queue name check is not + done. Returns a negative errno style error code on failure. + + See sd_is_mq(3) for more information. +*/ +int sd_is_mq(int fd, const char *path); + +/* + Informs systemd about changed daemon state. This takes a number of + newline separated environment-style variable assignments in a + string. The following variables are known: + + MAINPID=... The main PID of a daemon, in case systemd did not + fork off the process itself. 
Example: "MAINPID=4711" + + READY=1 Tells systemd that daemon startup or daemon reload + is finished (only relevant for services of Type=notify). + The passed argument is a boolean "1" or "0". Since there + is little value in signaling non-readiness the only + value daemons should send is "READY=1". + + RELOADING=1 Tell systemd that the daemon began reloading its + configuration. When the configuration has been + reloaded completely, READY=1 should be sent to inform + systemd about this. + + STOPPING=1 Tells systemd that the daemon is about to go down. + + STATUS=... Passes a single-line status string back to systemd + that describes the daemon state. This is free-form + and can be used for various purposes: general state + feedback, fsck-like programs could pass completion + percentages and failing programs could pass a human + readable error message. Example: "STATUS=Completed + 66% of file system check..." + + NOTIFYACCESS=... + Reset the access to the service status notification socket. + Example: "NOTIFYACCESS=main" + + ERRNO=... If a daemon fails, the errno-style error code, + formatted as string. Example: "ERRNO=2" for ENOENT. + + BUSERROR=... If a daemon fails, the D-Bus error-style error + code. Example: "BUSERROR=org.freedesktop.DBus.Error.TimedOut" + + WATCHDOG=1 Tells systemd to update the watchdog timestamp. + Services using this feature should do this in + regular intervals. A watchdog framework can use the + timestamps to detect failed services. Also see + sd_watchdog_enabled() below. + + WATCHDOG_USEC=... + Reset watchdog_usec value during runtime. + To reset watchdog_usec value, start the service again. + Example: "WATCHDOG_USEC=20000000" + + FDSTORE=1 Store the file descriptors passed along with the + message in the per-service file descriptor store, + and pass them to the main process again on next + invocation. This variable is only supported with + sd_pid_notify_with_fds(). 
+ + FDSTOREREMOVE=1 + Remove one or more file descriptors from the file + descriptor store, identified by the name specified + in FDNAME=, see below. + + FDNAME= A name to assign to new file descriptors stored in the + file descriptor store, or the name of the file descriptors + to remove in case of FDSTOREREMOVE=1. + + Daemons can choose to send additional variables. However, it is + recommended to prefix variable names not listed above with X_. + + Returns a negative errno-style error code on failure. Returns > 0 + if systemd could be notified, 0 if it couldn't possibly because + systemd is not running. + + Example: When a daemon finished starting up, it could issue this + call to notify systemd about it: + + sd_notify(0, "READY=1"); + + See sd_notifyf() for more complete examples. + + See sd_notify(3) for more information. +*/ +int sd_notify(int unset_environment, const char *state); + +/* + Similar to sd_notify() but takes a format string. + + Example 1: A daemon could send the following after initialization: + + sd_notifyf(0, "READY=1\n" + "STATUS=Processing requests...\n" + "MAINPID=%lu", + (unsigned long) getpid()); + + Example 2: A daemon could send the following shortly before + exiting, on failure: + + sd_notifyf(0, "STATUS=Failed to start up: %s\n" + "ERRNO=%i", + strerror_r(errnum, (char[1024]){}, 1024), + errnum); + + See sd_notifyf(3) for more information. +*/ +int sd_notifyf(int unset_environment, const char *format, ...) _sd_printf_(2,3); + +/* + Similar to sd_notify(), but send the message on behalf of another + process, if the appropriate permissions are available. +*/ +int sd_pid_notify(pid_t pid, int unset_environment, const char *state); + +/* + Similar to sd_notifyf(), but send the message on behalf of another + process, if the appropriate permissions are available. +*/ +int sd_pid_notifyf(pid_t pid, int unset_environment, const char *format, ...) 
_sd_printf_(3,4); + +/* + Similar to sd_pid_notify(), but also passes the specified fd array + to the service manager for storage. This is particularly useful for + FDSTORE=1 messages. +*/ +int sd_pid_notify_with_fds(pid_t pid, int unset_environment, const char *state, const int *fds, unsigned n_fds); + +/* + Combination of sd_pid_notifyf() and sd_pid_notify_with_fds() +*/ +int sd_pid_notifyf_with_fds(pid_t pid, int unset_environment, const int *fds, size_t n_fds, const char *format, ...) _sd_printf_(5,6); + +/* + Returns > 0 if synchronization with systemd succeeded. Returns < 0 + on error. Returns 0 if $NOTIFY_SOCKET was not set. Note that the + timeout parameter of this function call takes the timeout in μs, and + will be passed to ppoll(2), hence the behaviour will be similar to + ppoll(2). This function can be called after sending a status message + to systemd, if one needs to synchronize against reception of the + status messages sent before this call is made. Therefore, this + cannot be used to know if the status message was processed + successfully, but to only synchronize against its consumption. +*/ +int sd_notify_barrier(int unset_environment, uint64_t timeout); + +/* + Just like sd_notify_barrier() but also takes a PID to send the barrier message from. +*/ +int sd_pid_notify_barrier(pid_t pid, int unset_environment, uint64_t timeout); + +/* + Returns > 0 if the system was booted with systemd. Returns < 0 on + error. Returns 0 if the system was not booted with systemd. Note + that all of the functions above handle non-systemd boots just + fine. You should NOT protect them with a call to this function. Also + note that this function checks whether the system, not the user + session is controlled by systemd. However the functions above work + for both user and system services. + + See sd_booted(3) for more information. 
+*/ +int sd_booted(void); + +/* + Returns > 0 if the service manager expects watchdog keep-alive + events to be sent regularly via sd_notify(0, "WATCHDOG=1"). Returns + 0 if it does not expect this. If the usec argument is non-NULL + returns the watchdog timeout in μs after which the service manager + will act on a process that has not sent a watchdog keep alive + message. This function is useful to implement services that + recognize automatically if they are being run under supervision of + systemd with WatchdogSec= set. It is recommended for clients to + generate keep-alive pings via sd_notify(0, "WATCHDOG=1") every half + of the returned time. + + See sd_watchdog_enabled(3) for more information. +*/ +int sd_watchdog_enabled(int unset_environment, uint64_t *usec); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-device.h b/src/systemd/sd-device.h new file mode 100644 index 0000000..b67ec0f --- /dev/null +++ b/src/systemd/sd-device.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddevicehfoo +#define foosddevicehfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>.
+***/ + +#include <errno.h> +#include <inttypes.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/types.h> + +#include "sd-event.h" +#include "sd-id128.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_device sd_device; +typedef struct sd_device_enumerator sd_device_enumerator; +typedef struct sd_device_monitor sd_device_monitor; + +__extension__ typedef enum sd_device_action_t { + SD_DEVICE_ADD, + SD_DEVICE_REMOVE, + SD_DEVICE_CHANGE, + SD_DEVICE_MOVE, + SD_DEVICE_ONLINE, + SD_DEVICE_OFFLINE, + SD_DEVICE_BIND, + SD_DEVICE_UNBIND, + _SD_DEVICE_ACTION_MAX, + _SD_DEVICE_ACTION_INVALID = -EINVAL, + _SD_ENUM_FORCE_S64(DEVICE_ACTION) +} sd_device_action_t; + +/* callback */ + +typedef int (*sd_device_monitor_handler_t)(sd_device_monitor *m, sd_device *device, void *userdata); + +/* device */ + +sd_device *sd_device_ref(sd_device *device); +sd_device *sd_device_unref(sd_device *device); + +int sd_device_new_from_syspath(sd_device **ret, const char *syspath); +int sd_device_new_from_devnum(sd_device **ret, char type, dev_t devnum); +int sd_device_new_from_subsystem_sysname(sd_device **ret, const char *subsystem, const char *sysname); +int sd_device_new_from_device_id(sd_device **ret, const char *id); +int sd_device_new_from_stat_rdev(sd_device **ret, const struct stat *st); +int sd_device_new_from_devname(sd_device **ret, const char *devname); +int sd_device_new_from_path(sd_device **ret, const char *path); +int sd_device_new_from_ifname(sd_device **ret, const char *ifname); +int sd_device_new_from_ifindex(sd_device **ret, int ifindex); + +int sd_device_new_child(sd_device **ret, sd_device *device, const char *suffix); + +int sd_device_get_parent(sd_device *child, sd_device **ret); +int sd_device_get_parent_with_subsystem_devtype(sd_device *child, const char *subsystem, const char *devtype, sd_device **ret); + +int sd_device_get_syspath(sd_device *device, const char **ret); +int sd_device_get_subsystem(sd_device *device, const char **ret); +int sd_device_get_devtype(sd_device *device, const
char **ret); +int sd_device_get_devnum(sd_device *device, dev_t *devnum); +int sd_device_get_ifindex(sd_device *device, int *ifindex); +int sd_device_get_driver(sd_device *device, const char **ret); +int sd_device_get_devpath(sd_device *device, const char **ret); +int sd_device_get_devname(sd_device *device, const char **ret); +int sd_device_get_sysname(sd_device *device, const char **ret); +int sd_device_get_sysnum(sd_device *device, const char **ret); +int sd_device_get_action(sd_device *device, sd_device_action_t *ret); +int sd_device_get_seqnum(sd_device *device, uint64_t *ret); +int sd_device_get_diskseq(sd_device *device, uint64_t *ret); + +int sd_device_get_is_initialized(sd_device *device); +int sd_device_get_usec_initialized(sd_device *device, uint64_t *ret); +int sd_device_get_usec_since_initialized(sd_device *device, uint64_t *ret); + +const char *sd_device_get_tag_first(sd_device *device); +const char *sd_device_get_tag_next(sd_device *device); +const char *sd_device_get_current_tag_first(sd_device *device); +const char *sd_device_get_current_tag_next(sd_device *device); +const char *sd_device_get_devlink_first(sd_device *device); +const char *sd_device_get_devlink_next(sd_device *device); +const char *sd_device_get_property_first(sd_device *device, const char **value); +const char *sd_device_get_property_next(sd_device *device, const char **value); +const char *sd_device_get_sysattr_first(sd_device *device); +const char *sd_device_get_sysattr_next(sd_device *device); +sd_device *sd_device_get_child_first(sd_device *device, const char **ret_suffix); +sd_device *sd_device_get_child_next(sd_device *device, const char **ret_suffix); + +int sd_device_has_tag(sd_device *device, const char *tag); +int sd_device_has_current_tag(sd_device *device, const char *tag); +int sd_device_get_property_value(sd_device *device, const char *key, const char **value); +int sd_device_get_trigger_uuid(sd_device *device, sd_id128_t *ret); +int 
sd_device_get_sysattr_value(sd_device *device, const char *sysattr, const char **_value); + +int sd_device_set_sysattr_value(sd_device *device, const char *sysattr, const char *value); +int sd_device_set_sysattr_valuef(sd_device *device, const char *sysattr, const char *format, ...) _sd_printf_(3, 4); +int sd_device_trigger(sd_device *device, sd_device_action_t action); +int sd_device_trigger_with_uuid(sd_device *device, sd_device_action_t action, sd_id128_t *ret_uuid); +int sd_device_open(sd_device *device, int flags); + +/* device enumerator */ + +int sd_device_enumerator_new(sd_device_enumerator **ret); +sd_device_enumerator *sd_device_enumerator_ref(sd_device_enumerator *enumerator); +sd_device_enumerator *sd_device_enumerator_unref(sd_device_enumerator *enumerator); + +sd_device *sd_device_enumerator_get_device_first(sd_device_enumerator *enumerator); +sd_device *sd_device_enumerator_get_device_next(sd_device_enumerator *enumerator); +sd_device *sd_device_enumerator_get_subsystem_first(sd_device_enumerator *enumerator); +sd_device *sd_device_enumerator_get_subsystem_next(sd_device_enumerator *enumerator); + +int sd_device_enumerator_add_match_subsystem(sd_device_enumerator *enumerator, const char *subsystem, int match); +int sd_device_enumerator_add_match_sysattr(sd_device_enumerator *enumerator, const char *sysattr, const char *value, int match); +int sd_device_enumerator_add_match_property(sd_device_enumerator *enumerator, const char *property, const char *value); +int sd_device_enumerator_add_match_property_required(sd_device_enumerator *enumerator, const char *property, const char *value); +int sd_device_enumerator_add_match_sysname(sd_device_enumerator *enumerator, const char *sysname); +int sd_device_enumerator_add_nomatch_sysname(sd_device_enumerator *enumerator, const char *sysname); +int sd_device_enumerator_add_match_tag(sd_device_enumerator *enumerator, const char *tag); +int sd_device_enumerator_add_match_parent(sd_device_enumerator *enumerator, 
sd_device *parent); +int sd_device_enumerator_allow_uninitialized(sd_device_enumerator *enumerator); + +/* device monitor */ + +int sd_device_monitor_new(sd_device_monitor **ret); +sd_device_monitor *sd_device_monitor_ref(sd_device_monitor *m); +sd_device_monitor *sd_device_monitor_unref(sd_device_monitor *m); + +int sd_device_monitor_set_receive_buffer_size(sd_device_monitor *m, size_t size); +int sd_device_monitor_attach_event(sd_device_monitor *m, sd_event *event); +int sd_device_monitor_detach_event(sd_device_monitor *m); +sd_event *sd_device_monitor_get_event(sd_device_monitor *m); +sd_event_source *sd_device_monitor_get_event_source(sd_device_monitor *m); +int sd_device_monitor_set_description(sd_device_monitor *m, const char *description); +int sd_device_monitor_get_description(sd_device_monitor *m, const char **ret); +int sd_device_monitor_start(sd_device_monitor *m, sd_device_monitor_handler_t callback, void *userdata); +int sd_device_monitor_stop(sd_device_monitor *m); + +int sd_device_monitor_filter_add_match_subsystem_devtype(sd_device_monitor *m, const char *subsystem, const char *devtype); +int sd_device_monitor_filter_add_match_tag(sd_device_monitor *m, const char *tag); +int sd_device_monitor_filter_add_match_sysattr(sd_device_monitor *m, const char *sysattr, const char *value, int match); +int sd_device_monitor_filter_add_match_parent(sd_device_monitor *m, sd_device *device, int match); +int sd_device_monitor_filter_update(sd_device_monitor *m); +int sd_device_monitor_filter_remove(sd_device_monitor *m); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_device, sd_device_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_device_enumerator, sd_device_enumerator_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_device_monitor, sd_device_monitor_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-dhcp-client.h b/src/systemd/sd-dhcp-client.h new file mode 100644 index 0000000..3a8abc8 --- /dev/null +++ b/src/systemd/sd-dhcp-client.h @@ -0,0 +1,182 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddhcpclienthfoo +#define foosddhcpclienthfoo + +/*** + Copyright © 2013 Intel Corporation. All rights reserved. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include +#include +#include +#include +#include + +#include "sd-device.h" +#include "sd-dhcp-lease.h" +#include "sd-dhcp-option.h" +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +enum { + SD_DHCP_CLIENT_EVENT_STOP = 0, + SD_DHCP_CLIENT_EVENT_IP_ACQUIRE = 1, + SD_DHCP_CLIENT_EVENT_IP_CHANGE = 2, + SD_DHCP_CLIENT_EVENT_EXPIRED = 3, + SD_DHCP_CLIENT_EVENT_RENEW = 4, + SD_DHCP_CLIENT_EVENT_SELECTING = 5, + SD_DHCP_CLIENT_EVENT_TRANSIENT_FAILURE = 6 /* Sent when we have not received a reply after the first few attempts. + * The client may want to start acquiring link-local addresses.
*/ +}; + +typedef struct sd_dhcp_client sd_dhcp_client; + +typedef int (*sd_dhcp_client_callback_t)(sd_dhcp_client *client, int event, void *userdata); +int sd_dhcp_client_set_callback( + sd_dhcp_client *client, + sd_dhcp_client_callback_t cb, + void *userdata); + +int sd_dhcp_client_set_request_option( + sd_dhcp_client *client, + uint8_t option); +int sd_dhcp_client_set_request_address( + sd_dhcp_client *client, + const struct in_addr *last_address); +int sd_dhcp_client_set_request_broadcast( + sd_dhcp_client *client, + int broadcast); +int sd_dhcp_client_set_ifindex( + sd_dhcp_client *client, + int interface_index); +int sd_dhcp_client_set_ifname( + sd_dhcp_client *client, + const char *interface_name); +int sd_dhcp_client_get_ifname(sd_dhcp_client *client, const char **ret); +int sd_dhcp_client_set_mac( + sd_dhcp_client *client, + const uint8_t *hw_addr, + const uint8_t *bcast_addr, + size_t addr_len, + uint16_t arp_type); +int sd_dhcp_client_set_client_id( + sd_dhcp_client *client, + uint8_t type, + const uint8_t *data, + size_t data_len); +__extension__ int sd_dhcp_client_set_iaid_duid_llt( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid, + uint64_t llt_time); +__extension__ int sd_dhcp_client_set_iaid_duid_ll( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid); +__extension__ int sd_dhcp_client_set_iaid_duid_en( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid); +__extension__ int sd_dhcp_client_set_iaid_duid_uuid( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid); +__extension__ int sd_dhcp_client_set_iaid_duid_raw( + sd_dhcp_client *client, + bool iaid_set, + uint32_t iaid, + uint16_t duid_type, + const uint8_t *duid, + size_t duid_len); +__extension__ int sd_dhcp_client_set_rapid_commit( + sd_dhcp_client *client, + bool rapid_commit); +int sd_dhcp_client_get_client_id( + sd_dhcp_client *client, + uint8_t *ret_type, + const uint8_t **ret_data, + size_t *ret_data_len); +int sd_dhcp_client_set_mtu( + sd_dhcp_client 
*client, + uint32_t mtu); +int sd_dhcp_client_set_max_attempts( + sd_dhcp_client *client, + uint64_t attempt); +int sd_dhcp_client_set_client_port( + sd_dhcp_client *client, + uint16_t port); +int sd_dhcp_client_set_hostname( + sd_dhcp_client *client, + const char *hostname); +int sd_dhcp_client_set_vendor_class_identifier( + sd_dhcp_client *client, + const char *vci); +int sd_dhcp_client_set_mud_url( + sd_dhcp_client *client, + const char *mudurl); +int sd_dhcp_client_set_user_class( + sd_dhcp_client *client, + char * const *user_class); +int sd_dhcp_client_get_lease( + sd_dhcp_client *client, + sd_dhcp_lease **ret); +int sd_dhcp_client_set_service_type( + sd_dhcp_client *client, + int type); +int sd_dhcp_client_set_socket_priority( + sd_dhcp_client *client, + int so_priority); +int sd_dhcp_client_set_fallback_lease_lifetime( + sd_dhcp_client *client, + uint64_t fallback_lease_lifetime); + +int sd_dhcp_client_add_option(sd_dhcp_client *client, sd_dhcp_option *v); +int sd_dhcp_client_add_vendor_option(sd_dhcp_client *client, sd_dhcp_option *v); + +int sd_dhcp_client_is_running(sd_dhcp_client *client); +int sd_dhcp_client_stop(sd_dhcp_client *client); +int sd_dhcp_client_start(sd_dhcp_client *client); +int sd_dhcp_client_send_release(sd_dhcp_client *client); +int sd_dhcp_client_send_decline(sd_dhcp_client *client); +int sd_dhcp_client_send_renew(sd_dhcp_client *client); +int sd_dhcp_client_set_ipv6_connectivity(sd_dhcp_client *client, int have); +int sd_dhcp_client_interrupt_ipv6_only_mode(sd_dhcp_client *client); + +sd_dhcp_client *sd_dhcp_client_ref(sd_dhcp_client *client); +sd_dhcp_client *sd_dhcp_client_unref(sd_dhcp_client *client); + +/* NOTE: anonymize parameter is used to initialize PRL memory with different + * options when using RFC7844 Anonymity Profiles */ +int sd_dhcp_client_new(sd_dhcp_client **ret, int anonymize); + +int sd_dhcp_client_id_to_string(const void *data, size_t len, char **ret); + +int sd_dhcp_client_attach_event( + sd_dhcp_client *client, 
+ sd_event *event, + int64_t priority); +int sd_dhcp_client_detach_event(sd_dhcp_client *client); +sd_event *sd_dhcp_client_get_event(sd_dhcp_client *client); +int sd_dhcp_client_attach_device(sd_dhcp_client *client, sd_device *dev); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp_client, sd_dhcp_client_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-dhcp-lease.h b/src/systemd/sd-dhcp-lease.h new file mode 100644 index 0000000..1ef53cc --- /dev/null +++ b/src/systemd/sd-dhcp-lease.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddhcpleasehfoo +#define foosddhcpleasehfoo + +/*** + Copyright © 2013 Intel Corporation. All rights reserved. + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>.
+***/ + +#include +#include +#include +#include +#include + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_dhcp_lease sd_dhcp_lease; +typedef struct sd_dhcp_route sd_dhcp_route; + +sd_dhcp_lease *sd_dhcp_lease_ref(sd_dhcp_lease *lease); +sd_dhcp_lease *sd_dhcp_lease_unref(sd_dhcp_lease *lease); + +__extension__ typedef enum sd_dhcp_lease_server_type_t { + SD_DHCP_LEASE_DNS, + SD_DHCP_LEASE_NTP, + SD_DHCP_LEASE_SIP, + SD_DHCP_LEASE_POP3, + SD_DHCP_LEASE_SMTP, + SD_DHCP_LEASE_LPR, + _SD_DHCP_LEASE_SERVER_TYPE_MAX, + _SD_DHCP_LEASE_SERVER_TYPE_INVALID = -EINVAL, + _SD_ENUM_FORCE_S64(DHCP_LEASE_SERVER_TYPE) +} sd_dhcp_lease_server_type_t; + +int sd_dhcp_lease_get_address(sd_dhcp_lease *lease, struct in_addr *addr); +int sd_dhcp_lease_get_timestamp(sd_dhcp_lease *lease, clockid_t clock, uint64_t *ret); +int sd_dhcp_lease_get_lifetime(sd_dhcp_lease *lease, uint64_t *ret); +int sd_dhcp_lease_get_t1(sd_dhcp_lease *lease, uint64_t *ret); +int sd_dhcp_lease_get_t2(sd_dhcp_lease *lease, uint64_t *ret); +int sd_dhcp_lease_get_lifetime_timestamp(sd_dhcp_lease *lease, clockid_t clock, uint64_t *ret); +int sd_dhcp_lease_get_t1_timestamp(sd_dhcp_lease *lease, clockid_t clock, uint64_t *ret); +int sd_dhcp_lease_get_t2_timestamp(sd_dhcp_lease *lease, clockid_t clock, uint64_t *ret); +int sd_dhcp_lease_get_broadcast(sd_dhcp_lease *lease, struct in_addr *addr); +int sd_dhcp_lease_get_netmask(sd_dhcp_lease *lease, struct in_addr *addr); +int sd_dhcp_lease_get_prefix(sd_dhcp_lease *lease, struct in_addr *ret_prefix, uint8_t *ret_prefixlen); +int sd_dhcp_lease_get_router(sd_dhcp_lease *lease, const struct in_addr **addr); +int sd_dhcp_lease_get_next_server(sd_dhcp_lease *lease, struct in_addr *addr); +int sd_dhcp_lease_get_server_identifier(sd_dhcp_lease *lease, struct in_addr *addr); +int sd_dhcp_lease_get_servers(sd_dhcp_lease *lease, sd_dhcp_lease_server_type_t what, const struct in_addr **addr); +int sd_dhcp_lease_get_dns(sd_dhcp_lease *lease, const struct 
in_addr **addr); +int sd_dhcp_lease_get_ntp(sd_dhcp_lease *lease, const struct in_addr **addr); +int sd_dhcp_lease_get_sip(sd_dhcp_lease *lease, const struct in_addr **addr); +int sd_dhcp_lease_get_pop3(sd_dhcp_lease *lease, const struct in_addr **addr); +int sd_dhcp_lease_get_smtp(sd_dhcp_lease *lease, const struct in_addr **addr); +int sd_dhcp_lease_get_lpr(sd_dhcp_lease *lease, const struct in_addr **addr); +int sd_dhcp_lease_get_mtu(sd_dhcp_lease *lease, uint16_t *mtu); +int sd_dhcp_lease_get_domainname(sd_dhcp_lease *lease, const char **domainname); +int sd_dhcp_lease_get_search_domains(sd_dhcp_lease *lease, char ***domains); +int sd_dhcp_lease_get_hostname(sd_dhcp_lease *lease, const char **hostname); +int sd_dhcp_lease_get_root_path(sd_dhcp_lease *lease, const char **root_path); +int sd_dhcp_lease_get_captive_portal(sd_dhcp_lease *lease, const char **captive_portal); +int sd_dhcp_lease_get_static_routes(sd_dhcp_lease *lease, sd_dhcp_route ***ret); +int sd_dhcp_lease_get_classless_routes(sd_dhcp_lease *lease, sd_dhcp_route ***ret); +int sd_dhcp_lease_get_vendor_specific(sd_dhcp_lease *lease, const void **data, size_t *data_len); +int sd_dhcp_lease_get_client_id(sd_dhcp_lease *lease, const void **client_id, size_t *client_id_len); +int sd_dhcp_lease_get_timezone(sd_dhcp_lease *lease, const char **timezone); +int sd_dhcp_lease_get_6rd( + sd_dhcp_lease *lease, + uint8_t *ret_ipv4masklen, + uint8_t *ret_prefixlen, + struct in6_addr *ret_prefix, + const struct in_addr **ret_br_addresses, + size_t *ret_n_br_addresses); +int sd_dhcp_lease_has_6rd(sd_dhcp_lease *lease); + +int sd_dhcp_route_get_destination(sd_dhcp_route *route, struct in_addr *destination); +int sd_dhcp_route_get_destination_prefix_length(sd_dhcp_route *route, uint8_t *length); +int sd_dhcp_route_get_gateway(sd_dhcp_route *route, struct in_addr *gateway); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp_lease, sd_dhcp_lease_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git 
a/src/systemd/sd-dhcp-option.h b/src/systemd/sd-dhcp-option.h new file mode 100644 index 0000000..1486ec7 --- /dev/null +++ b/src/systemd/sd-dhcp-option.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddhcpoptionhfoo +#define foosddhcpoptionhfoo + +/*** + Copyright © 2013 Intel Corporation. All rights reserved. + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include +#include + +#include "sd-dhcp-protocol.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_dhcp_option sd_dhcp_option; + +int sd_dhcp_option_new(uint8_t option, const void *data, size_t length, sd_dhcp_option **ret); +sd_dhcp_option *sd_dhcp_option_ref(sd_dhcp_option *ra); +sd_dhcp_option *sd_dhcp_option_unref(sd_dhcp_option *ra); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp_option, sd_dhcp_option_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-dhcp-protocol.h b/src/systemd/sd-dhcp-protocol.h new file mode 100644 index 0000000..d8b7537 --- /dev/null +++ b/src/systemd/sd-dhcp-protocol.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddhcpprotocolhfoo +#define foosddhcpprotocolhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later
version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* https://www.iana.org/assignments/bootp-dhcp-parameters/bootp-dhcp-parameters.xhtml#options */ +enum { + SD_DHCP_OPTION_PAD = 0, /* [RFC2132] */ + SD_DHCP_OPTION_SUBNET_MASK = 1, /* [RFC2132] */ + SD_DHCP_OPTION_TIME_OFFSET = 2, /* [RFC2132], deprecated by 100 and 101 */ + SD_DHCP_OPTION_ROUTER = 3, /* [RFC2132] */ + SD_DHCP_OPTION_TIME_SERVER = 4, /* [RFC2132] */ + SD_DHCP_OPTION_NAME_SERVER = 5, /* [RFC2132] */ + SD_DHCP_OPTION_DOMAIN_NAME_SERVER = 6, /* [RFC2132] */ + SD_DHCP_OPTION_LOG_SERVER = 7, /* [RFC2132] */ + SD_DHCP_OPTION_QUOTES_SERVER = 8, /* [RFC2132] */ + SD_DHCP_OPTION_LPR_SERVER = 9, /* [RFC2132] */ + SD_DHCP_OPTION_IMPRESS_SERVER = 10, /* [RFC2132] */ + SD_DHCP_OPTION_RLP_SERVER = 11, /* [RFC2132] */ + SD_DHCP_OPTION_HOST_NAME = 12, /* [RFC2132] */ + SD_DHCP_OPTION_BOOT_FILE_SIZE = 13, /* [RFC2132] */ + SD_DHCP_OPTION_MERIT_DUMP_FILE = 14, /* [RFC2132] */ + SD_DHCP_OPTION_DOMAIN_NAME = 15, /* [RFC2132] */ + SD_DHCP_OPTION_SWAP_SERVER = 16, /* [RFC2132] */ + SD_DHCP_OPTION_ROOT_PATH = 17, /* [RFC2132] */ + SD_DHCP_OPTION_EXTENSION_FILE = 18, /* [RFC2132] */ + SD_DHCP_OPTION_FORWARD = 19, /* [RFC2132] */ + SD_DHCP_OPTION_SOURCE_ROUTE = 20, /* [RFC2132] */ + SD_DHCP_OPTION_POLICY_FILTER = 21, /* [RFC2132] */ + SD_DHCP_OPTION_MAX_DATAGRAM_ASSEMBLY = 22, /* [RFC2132] */ + SD_DHCP_OPTION_DEFAULT_IP_TTL = 23, /* [RFC2132] */ + SD_DHCP_OPTION_MTU_TIMEOUT = 24, /* [RFC2132] */ + SD_DHCP_OPTION_MTU_PLATEAU = 25, /* [RFC2132] */ + SD_DHCP_OPTION_MTU_INTERFACE = 26, /* [RFC2132] */ + SD_DHCP_OPTION_MTU_SUBNET = 27, /* [RFC2132] */ +
SD_DHCP_OPTION_BROADCAST = 28, /* [RFC2132] */ + SD_DHCP_OPTION_MASK_DISCOVERY = 29, /* [RFC2132] */ + SD_DHCP_OPTION_MASK_SUPPLIER = 30, /* [RFC2132] */ + SD_DHCP_OPTION_ROUTER_DISCOVERY = 31, /* [RFC2132] */ + SD_DHCP_OPTION_ROUTER_REQUEST = 32, /* [RFC2132] */ + SD_DHCP_OPTION_STATIC_ROUTE = 33, /* [RFC2132] */ + SD_DHCP_OPTION_TRAILERS = 34, /* [RFC2132] */ + SD_DHCP_OPTION_ARP_TIMEOUT = 35, /* [RFC2132] */ + SD_DHCP_OPTION_ETHERNET = 36, /* [RFC2132] */ + SD_DHCP_OPTION_DEFAULT_TCP_TTL = 37, /* [RFC2132] */ + SD_DHCP_OPTION_KEEPALIVE_TIME = 38, /* [RFC2132] */ + SD_DHCP_OPTION_KEEPALIVE_DATA = 39, /* [RFC2132] */ + SD_DHCP_OPTION_NIS_DOMAIN = 40, /* [RFC2132] */ + SD_DHCP_OPTION_NIS_SERVER = 41, /* [RFC2132] */ + SD_DHCP_OPTION_NTP_SERVER = 42, /* [RFC2132] */ + SD_DHCP_OPTION_VENDOR_SPECIFIC = 43, /* [RFC2132] */ + SD_DHCP_OPTION_NETBIOS_NAME_SERVER = 44, /* [RFC2132] */ + SD_DHCP_OPTION_NETBIOS_DIST_SERVER = 45, /* [RFC2132] */ + SD_DHCP_OPTION_NETBIOS_NODE_TYPE = 46, /* [RFC2132] */ + SD_DHCP_OPTION_NETBIOS_SCOPE = 47, /* [RFC2132] */ + SD_DHCP_OPTION_X_WINDOW_FONT = 48, /* [RFC2132] */ + SD_DHCP_OPTION_X_WINDOW_MANAGER = 49, /* [RFC2132] */ + SD_DHCP_OPTION_REQUESTED_IP_ADDRESS = 50, /* [RFC2132] */ + SD_DHCP_OPTION_IP_ADDRESS_LEASE_TIME = 51, /* [RFC2132] */ + SD_DHCP_OPTION_OVERLOAD = 52, /* [RFC2132] */ + SD_DHCP_OPTION_MESSAGE_TYPE = 53, /* [RFC2132] */ + SD_DHCP_OPTION_SERVER_IDENTIFIER = 54, /* [RFC2132] */ + SD_DHCP_OPTION_PARAMETER_REQUEST_LIST = 55, /* [RFC2132] */ + SD_DHCP_OPTION_ERROR_MESSAGE = 56, /* [RFC2132] */ + SD_DHCP_OPTION_MAXIMUM_MESSAGE_SIZE = 57, /* [RFC2132] */ + SD_DHCP_OPTION_RENEWAL_TIME = 58, /* [RFC2132] */ + SD_DHCP_OPTION_REBINDING_TIME = 59, /* [RFC2132] */ + SD_DHCP_OPTION_VENDOR_CLASS_IDENTIFIER = 60, /* [RFC2132] */ + SD_DHCP_OPTION_CLIENT_IDENTIFIER = 61, /* [RFC2132] */ + SD_DHCP_OPTION_NETWARE_IP_DOMAIN = 62, /* [RFC2242] */ + SD_DHCP_OPTION_NETWARE_IP_OPTION = 63, /* [RFC2242] */ + SD_DHCP_OPTION_NIS_DOMAIN_NAME = 64, 
/* [RFC2132] */ + SD_DHCP_OPTION_NIS_SERVER_ADDR = 65, /* [RFC2132] */ + SD_DHCP_OPTION_BOOT_SERVER_NAME = 66, /* [RFC2132] */ + SD_DHCP_OPTION_BOOT_FILENAME = 67, /* [RFC2132] */ + SD_DHCP_OPTION_HOME_AGENT_ADDRESSES = 68, /* [RFC2132] */ + SD_DHCP_OPTION_SMTP_SERVER = 69, /* [RFC2132] */ + SD_DHCP_OPTION_POP3_SERVER = 70, /* [RFC2132] */ + SD_DHCP_OPTION_NNTP_SERVER = 71, /* [RFC2132] */ + SD_DHCP_OPTION_WWW_SERVER = 72, /* [RFC2132] */ + SD_DHCP_OPTION_FINGER_SERVER = 73, /* [RFC2132] */ + SD_DHCP_OPTION_IRC_SERVER = 74, /* [RFC2132] */ + SD_DHCP_OPTION_STREETTALK_SERVER = 75, /* [RFC2132] */ + SD_DHCP_OPTION_STDA_SERVER = 76, /* [RFC2132] */ + SD_DHCP_OPTION_USER_CLASS = 77, /* [RFC3004] */ + SD_DHCP_OPTION_DIRECTORY_AGENT = 78, /* [RFC2610] */ + SD_DHCP_OPTION_SERVICE_SCOPE = 79, /* [RFC2610] */ + SD_DHCP_OPTION_RAPID_COMMIT = 80, /* [RFC4039] */ + SD_DHCP_OPTION_FQDN = 81, /* [RFC4702] */ + SD_DHCP_OPTION_RELAY_AGENT_INFORMATION = 82, /* [RFC3046] */ + SD_DHCP_OPTION_ISNS = 83, /* [RFC4174] */ + /* option code 84 is unassigned [RFC3679] */ + SD_DHCP_OPTION_NDS_SERVER = 85, /* [RFC2241] */ + SD_DHCP_OPTION_NDS_TREE_NAME = 86, /* [RFC2241] */ + SD_DHCP_OPTION_NDS_CONTEXT = 87, /* [RFC2241] */ + SD_DHCP_OPTION_BCMCS_CONTROLLER_DOMAIN_NAME = 88, /* [RFC4280] */ + SD_DHCP_OPTION_BCMCS_CONTROLLER_ADDRESS = 89, /* [RFC4280] */ + SD_DHCP_OPTION_AUTHENTICATION = 90, /* [RFC3118] */ + SD_DHCP_OPTION_CLIENT_LAST_TRANSACTION_TIME = 91, /* [RFC4388] */ + SD_DHCP_OPTION_ASSOCIATED_IP = 92, /* [RFC4388] */ + SD_DHCP_OPTION_CLIENT_SYSTEM = 93, /* [RFC4578] */ + SD_DHCP_OPTION_CLIENT_NDI = 94, /* [RFC4578] */ + SD_DHCP_OPTION_LDAP = 95, /* [RFC3679] */ + /* option code 96 is unassigned [RFC3679] */ + SD_DHCP_OPTION_UUID = 97, /* [RFC4578] */ + SD_DHCP_OPTION_USER_AUTHENTICATION = 98, /* [RFC2485] */ + SD_DHCP_OPTION_GEOCONF_CIVIC = 99, /* [RFC4776] */ + SD_DHCP_OPTION_POSIX_TIMEZONE = 100, /* [RFC4833] */ + SD_DHCP_OPTION_TZDB_TIMEZONE = 101, /* [RFC4833] */ + /* option codes 
102-107 are unassigned [RFC3679] */ + SD_DHCP_OPTION_IPV6_ONLY_PREFERRED = 108, /* [RFC8925] */ + SD_DHCP_OPTION_DHCP4O6_SOURCE_ADDRESS = 109, /* [RFC8539] */ + /* option codes 110-111 are unassigned [RFC3679] */ + SD_DHCP_OPTION_NETINFO_ADDRESS = 112, /* [RFC3679] */ + SD_DHCP_OPTION_NETINFO_TAG = 113, /* [RFC3679] */ + SD_DHCP_OPTION_DHCP_CAPTIVE_PORTAL = 114, /* [RFC8910] */ + /* option code 115 is unassigned [RFC3679] */ + SD_DHCP_OPTION_AUTO_CONFIG = 116, /* [RFC2563] */ + SD_DHCP_OPTION_NAME_SERVICE_SEARCH = 117, /* [RFC2937] */ + SD_DHCP_OPTION_SUBNET_SELECTION = 118, /* [RFC3011] */ + SD_DHCP_OPTION_DOMAIN_SEARCH = 119, /* [RFC3397] */ + SD_DHCP_OPTION_SIP_SERVER = 120, /* [RFC3361] */ + SD_DHCP_OPTION_CLASSLESS_STATIC_ROUTE = 121, /* [RFC3442] */ + SD_DHCP_OPTION_CABLELABS_CLIENT_CONFIGURATION = 122, /* [RFC3495] */ + SD_DHCP_OPTION_GEOCONF = 123, /* [RFC6225] */ + SD_DHCP_OPTION_VENDOR_CLASS = 124, /* [RFC3925] */ + SD_DHCP_OPTION_VENDOR_SPECIFIC_INFORMATION = 125, /* [RFC3925] */ + /* option codes 126-127 are unassigned [RFC3679] */ + /* option codes 128-135 are assigned to use by PXE, but they are vendor specific [RFC4578] */ + SD_DHCP_OPTION_PANA_AGENT = 136, /* [RFC5192] */ + SD_DHCP_OPTION_LOST_SERVER_FQDN = 137, /* [RFC5223] */ + SD_DHCP_OPTION_CAPWAP_AC_ADDRESS = 138, /* [RFC5417] */ + SD_DHCP_OPTION_MOS_ADDRESS = 139, /* [RFC5678] */ + SD_DHCP_OPTION_MOS_FQDN = 140, /* [RFC5678] */ + SD_DHCP_OPTION_SIP_SERVICE_DOMAIN = 141, /* [RFC6011] */ + SD_DHCP_OPTION_ANDSF_ADDRESS = 142, /* [RFC6153] */ + SD_DHCP_OPTION_SZTP_REDIRECT = 143, /* [RFC8572] */ + SD_DHCP_OPTION_GEOLOC = 144, /* [RFC6225] */ + SD_DHCP_OPTION_FORCERENEW_NONCE_CAPABLE = 145, /* [RFC6704] */ + SD_DHCP_OPTION_RDNSS_SELECTION = 146, /* [RFC6731] */ + SD_DHCP_OPTION_DOTS_RI = 147, /* [RFC8973] */ + SD_DHCP_OPTION_DOTS_ADDRESS = 148, /* [RFC8973] */ + /* option code 149 is unassigned [RFC3942] */ + SD_DHCP_OPTION_TFTP_SERVER_ADDRESS = 150, /* [RFC5859] */ + SD_DHCP_OPTION_STATUS_CODE = 
151, /* [RFC6926] */ + SD_DHCP_OPTION_BASE_TIME = 152, /* [RFC6926] */ + SD_DHCP_OPTION_START_TIME_OF_STATE = 153, /* [RFC6926] */ + SD_DHCP_OPTION_QUERY_START_TIME = 154, /* [RFC6926] */ + SD_DHCP_OPTION_QUERY_END_TIME = 155, /* [RFC6926] */ + SD_DHCP_OPTION_DHCP_STATE = 156, /* [RFC6926] */ + SD_DHCP_OPTION_DATA_SOURCE = 157, /* [RFC6926] */ + SD_DHCP_OPTION_PCP_SERVER = 158, /* [RFC7291] */ + SD_DHCP_OPTION_PORT_PARAMS = 159, /* [RFC7618] */ + /* option code 160 is unassigned [RFC7710][RFC8910] */ + SD_DHCP_OPTION_MUD_URL = 161, /* [RFC8520] */ + /* option codes 162-174 are unassigned [RFC3942] */ + /* option codes 175-177 are temporarily assigned. */ + /* option codes 178-207 are unassigned [RFC3942] */ + SD_DHCP_OPTION_PXELINUX_MAGIC = 208, /* [RFC5071] Deprecated */ + SD_DHCP_OPTION_CONFIGURATION_FILE = 209, /* [RFC5071] */ + SD_DHCP_OPTION_PATH_PREFIX = 210, /* [RFC5071] */ + SD_DHCP_OPTION_REBOOT_TIME = 211, /* [RFC5071] */ + SD_DHCP_OPTION_6RD = 212, /* [RFC5969] */ + SD_DHCP_OPTION_ACCESS_DOMAIN = 213, /* [RFC5986] */ + /* option codes 214-219 are unassigned */ + SD_DHCP_OPTION_SUBNET_ALLOCATION = 220, /* [RFC6656] */ + SD_DHCP_OPTION_VIRTUAL_SUBNET_SELECTION = 221, /* [RFC6607] */ + /* option codes 222-223 are unassigned [RFC3942] */ + /* option codes 224-254 are reserved for private use */ + SD_DHCP_OPTION_PRIVATE_BASE = 224, + SD_DHCP_OPTION_PRIVATE_CLASSLESS_STATIC_ROUTE = 249, /* [RFC7844] */ + SD_DHCP_OPTION_PRIVATE_PROXY_AUTODISCOVERY = 252, /* [RFC7844] */ + SD_DHCP_OPTION_PRIVATE_LAST = 254, + SD_DHCP_OPTION_END = 255 /* [RFC2132] */ +}; + +/* Suboptions for SD_DHCP_OPTION_RELAY_AGENT_INFORMATION option */ +enum { + SD_DHCP_RELAY_AGENT_CIRCUIT_ID = 1, + SD_DHCP_RELAY_AGENT_REMOTE_ID = 2 +}; + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-dhcp-server.h b/src/systemd/sd-dhcp-server.h new file mode 100644 index 0000000..feafa5d --- /dev/null +++ b/src/systemd/sd-dhcp-server.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier:
LGPL-2.1-or-later */ +#ifndef foosddhcpserverhfoo +#define foosddhcpserverhfoo + +/*** + Copyright © 2013 Intel Corporation. All rights reserved. + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include +#include + +#include "sd-dhcp-lease.h" +#include "sd-dhcp-option.h" +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_dhcp_server sd_dhcp_server; + +enum { + SD_DHCP_SERVER_EVENT_LEASE_CHANGED = 1 << 0 +}; + +int sd_dhcp_server_new(sd_dhcp_server **ret, int ifindex); + +int sd_dhcp_server_set_ifname(sd_dhcp_server *server, const char *ifname); +int sd_dhcp_server_get_ifname(sd_dhcp_server *server, const char **ret); + +sd_dhcp_server *sd_dhcp_server_ref(sd_dhcp_server *server); +sd_dhcp_server *sd_dhcp_server_unref(sd_dhcp_server *server); + +int sd_dhcp_server_attach_event(sd_dhcp_server *server, sd_event *event, int64_t priority); +int sd_dhcp_server_detach_event(sd_dhcp_server *server); +sd_event *sd_dhcp_server_get_event(sd_dhcp_server *server); + +typedef void (*sd_dhcp_server_callback_t)(sd_dhcp_server *server, uint64_t event, void *userdata); + +int sd_dhcp_server_set_callback(sd_dhcp_server *server, sd_dhcp_server_callback_t cb, void *userdata); + +int sd_dhcp_server_is_running(sd_dhcp_server *server); + +int sd_dhcp_server_start(sd_dhcp_server *server); +int sd_dhcp_server_stop(sd_dhcp_server *server); + +int
sd_dhcp_server_configure_pool(sd_dhcp_server *server, const struct in_addr *address, unsigned char prefixlen, uint32_t offset, uint32_t size); + +int sd_dhcp_server_set_boot_server_address(sd_dhcp_server *server, const struct in_addr *address); +int sd_dhcp_server_set_boot_server_name(sd_dhcp_server *server, const char *name); +int sd_dhcp_server_set_boot_filename(sd_dhcp_server *server, const char *filename); +int sd_dhcp_server_set_bind_to_interface(sd_dhcp_server *server, int enabled); +int sd_dhcp_server_set_timezone(sd_dhcp_server *server, const char *timezone); +int sd_dhcp_server_set_router(sd_dhcp_server *server, const struct in_addr *address); + +int sd_dhcp_server_set_servers( + sd_dhcp_server *server, + sd_dhcp_lease_server_type_t what, + const struct in_addr addresses[], + size_t n_addresses); + +int sd_dhcp_server_set_lpr(sd_dhcp_server *server, const struct in_addr lpr[], size_t n); +int sd_dhcp_server_set_dns(sd_dhcp_server *server, const struct in_addr dns[], size_t n); +int sd_dhcp_server_set_ntp(sd_dhcp_server *server, const struct in_addr ntp[], size_t n); +int sd_dhcp_server_set_sip(sd_dhcp_server *server, const struct in_addr sip[], size_t n); +int sd_dhcp_server_set_pop3(sd_dhcp_server *server, const struct in_addr pop3[], size_t n); +int sd_dhcp_server_set_smtp(sd_dhcp_server *server, const struct in_addr smtp[], size_t n); + +int sd_dhcp_server_add_option(sd_dhcp_server *server, sd_dhcp_option *v); +int sd_dhcp_server_add_vendor_option(sd_dhcp_server *server, sd_dhcp_option *v); +int sd_dhcp_server_set_static_lease(sd_dhcp_server *server, const struct in_addr *address, uint8_t *client_id, size_t client_id_size); + +int sd_dhcp_server_set_max_lease_time(sd_dhcp_server *server, uint64_t t); +int sd_dhcp_server_set_default_lease_time(sd_dhcp_server *server, uint64_t t); +int sd_dhcp_server_set_ipv6_only_preferred_usec(sd_dhcp_server *server, uint64_t t); +int sd_dhcp_server_set_rapid_commit(sd_dhcp_server *server, int enabled); + +int 
sd_dhcp_server_forcerenew(sd_dhcp_server *server); + +int sd_dhcp_server_is_in_relay_mode(sd_dhcp_server *server); +int sd_dhcp_server_set_relay_target(sd_dhcp_server *server, const struct in_addr* address); +int sd_dhcp_server_set_relay_agent_information(sd_dhcp_server *server, const char* circuit_id, const char* remote_id); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp_server, sd_dhcp_server_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-dhcp6-client.h b/src/systemd/sd-dhcp6-client.h new file mode 100644 index 0000000..0ceadb8 --- /dev/null +++ b/src/systemd/sd-dhcp6-client.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosddhcp6clienthfoo +#define foosddhcp6clienthfoo + +/*** + Copyright © 2014 Intel Corporation. All rights reserved. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/
+
+#include /* NOTE(review): the header name is missing on the bare #include lines here - looks stripped during extraction; restore from upstream. */
+#include
+#include
+
+#include "sd-device.h"
+#include "sd-dhcp6-lease.h"
+#include "sd-dhcp6-option.h"
+#include "sd-event.h"
+
+#include "_sd-common.h"
+
+_SD_BEGIN_DECLARATIONS;
+/* Client event codes (note the gap between 0 and 10); presumably the values passed as 'event' to sd_dhcp6_client_callback_t - verify against the implementation. */
+enum {
+        SD_DHCP6_CLIENT_EVENT_STOP                      = 0,
+        SD_DHCP6_CLIENT_EVENT_RESEND_EXPIRE             = 10,
+        SD_DHCP6_CLIENT_EVENT_RETRANS_MAX               = 11,
+        SD_DHCP6_CLIENT_EVENT_IP_ACQUIRE                = 12,
+        SD_DHCP6_CLIENT_EVENT_INFORMATION_REQUEST       = 13
+};
+/* Opaque DHCPv6 client handle: the struct is only forward-declared in this header. */
+typedef struct sd_dhcp6_client sd_dhcp6_client;
+/* Callback type accepted by sd_dhcp6_client_set_callback(); 'userdata' is presumably the pointer given at registration - TODO confirm. */
+typedef void (*sd_dhcp6_client_callback_t)(sd_dhcp6_client *client, int event, void *userdata);
+int sd_dhcp6_client_set_callback(
+                sd_dhcp6_client *client,
+                sd_dhcp6_client_callback_t cb,
+                void *userdata);
+
+int sd_dhcp6_client_set_ifindex(
+                sd_dhcp6_client *client,
+                int interface_index);
+int sd_dhcp6_client_set_ifname(
+                sd_dhcp6_client *client,
+                const char *interface_name);
+int sd_dhcp6_client_get_ifname(sd_dhcp6_client *client, const char **ret);
+int sd_dhcp6_client_set_local_address(
+                sd_dhcp6_client *client,
+                const struct in6_addr *local_address);
+int sd_dhcp6_client_set_mac(
+                sd_dhcp6_client *client,
+                const uint8_t *addr,
+                size_t addr_len,
+                uint16_t arp_type);
+int sd_dhcp6_client_set_duid_llt(sd_dhcp6_client *client, uint64_t llt_time);
+int sd_dhcp6_client_set_duid_ll(sd_dhcp6_client *client);
+int sd_dhcp6_client_set_duid_en(sd_dhcp6_client *client);
+int sd_dhcp6_client_set_duid_uuid(sd_dhcp6_client *client);
+int sd_dhcp6_client_set_duid_raw(sd_dhcp6_client *client, uint16_t duid_type, const uint8_t *duid, size_t duid_len);
+int sd_dhcp6_client_set_iaid(
+                sd_dhcp6_client *client,
+                uint32_t iaid);
+int sd_dhcp6_client_get_iaid(
+                sd_dhcp6_client *client,
+                uint32_t *iaid);
+int sd_dhcp6_client_duid_as_string(
+                sd_dhcp6_client *client,
+                char **duid);
+int sd_dhcp6_client_set_fqdn(
+                sd_dhcp6_client *client,
+                const char *fqdn);
+int sd_dhcp6_client_set_information_request(
+                sd_dhcp6_client *client,
+                int enabled);
+int sd_dhcp6_client_get_information_request(
+                sd_dhcp6_client *client,
+                int *enabled);
+int sd_dhcp6_client_set_request_option(
+                sd_dhcp6_client *client,
+                uint16_t option);
+int sd_dhcp6_client_set_request_mud_url(
+                sd_dhcp6_client *client,
+                const char *mudurl);
+int sd_dhcp6_client_set_request_user_class(
+                sd_dhcp6_client *client,
+                char * const *user_class);
+int sd_dhcp6_client_set_request_vendor_class(
+                sd_dhcp6_client *client,
+                char * const *vendor_class);
+int sd_dhcp6_client_set_prefix_delegation_hint(
+                sd_dhcp6_client *client,
+                uint8_t prefixlen,
+                const struct in6_addr *pd_prefix);
+int sd_dhcp6_client_get_prefix_delegation(sd_dhcp6_client *client,
+                                          int *delegation);
+int sd_dhcp6_client_set_prefix_delegation(sd_dhcp6_client *client,
+                                          int delegation);
+int sd_dhcp6_client_get_address_request(sd_dhcp6_client *client,
+                                        int *request);
+int sd_dhcp6_client_set_address_request(sd_dhcp6_client *client,
+                                        int request);
+int sd_dhcp6_client_add_vendor_option(sd_dhcp6_client *client,
+                                      sd_dhcp6_option *v);
+int sd_dhcp6_client_set_rapid_commit(sd_dhcp6_client *client, int enable);
+int sd_dhcp6_client_set_send_release(sd_dhcp6_client *client, int enable);
+
+int sd_dhcp6_client_get_lease(
+                sd_dhcp6_client *client,
+                sd_dhcp6_lease **ret);
+
+int sd_dhcp6_client_add_option(sd_dhcp6_client *client, sd_dhcp6_option *v);
+
+int sd_dhcp6_client_stop(sd_dhcp6_client *client);
+int sd_dhcp6_client_start(sd_dhcp6_client *client);
+int sd_dhcp6_client_is_running(sd_dhcp6_client *client);
+int sd_dhcp6_client_attach_event(
+                sd_dhcp6_client *client,
+                sd_event *event,
+                int64_t priority);
+int sd_dhcp6_client_detach_event(sd_dhcp6_client *client);
+sd_event *sd_dhcp6_client_get_event(sd_dhcp6_client *client);
+int sd_dhcp6_client_attach_device(sd_dhcp6_client *client, sd_device *dev);
+sd_dhcp6_client *sd_dhcp6_client_ref(sd_dhcp6_client *client);
+sd_dhcp6_client *sd_dhcp6_client_unref(sd_dhcp6_client *client);
+int sd_dhcp6_client_new(sd_dhcp6_client **ret);
+
+_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp6_client, sd_dhcp6_client_unref);
+
+_SD_END_DECLARATIONS;
+
+#endif
diff --git a/src/systemd/sd-dhcp6-lease.h b/src/systemd/sd-dhcp6-lease.h
new file mode 100644
index 0000000..e18d578
--- /dev/null
+++ b/src/systemd/sd-dhcp6-lease.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#ifndef foosddhcp6leasehfoo
+#define foosddhcp6leasehfoo
+
+/***
+  Copyright © 2014-2015 Intel Corporation. All rights reserved.
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see .
+***/
+
+#include /* NOTE(review): the header name is missing on the bare #include lines here - looks stripped during extraction; restore from upstream. */
+#include
+#include
+
+#include "sd-dhcp6-option.h"
+
+#include "_sd-common.h"
+
+_SD_BEGIN_DECLARATIONS;
+/* Opaque DHCPv6 lease handle: the struct is only forward-declared in this header. */
+typedef struct sd_dhcp6_lease sd_dhcp6_lease;
+/* Lease timing getters writing a uint64_t result; the *_timestamp() variants additionally take a clockid_t (units and clock semantics not visible here - verify). */
+int sd_dhcp6_lease_get_timestamp(sd_dhcp6_lease *lease, clockid_t clock, uint64_t *ret);
+int sd_dhcp6_lease_get_t1(sd_dhcp6_lease *lease, uint64_t *ret);
+int sd_dhcp6_lease_get_t1_timestamp(sd_dhcp6_lease *lease, clockid_t clock, uint64_t *ret);
+int sd_dhcp6_lease_get_t2(sd_dhcp6_lease *lease, uint64_t *ret);
+int sd_dhcp6_lease_get_t2_timestamp(sd_dhcp6_lease *lease, clockid_t clock, uint64_t *ret);
+int sd_dhcp6_lease_get_valid_lifetime(sd_dhcp6_lease *lease, uint64_t *ret);
+int sd_dhcp6_lease_get_valid_lifetime_timestamp(sd_dhcp6_lease *lease, clockid_t clock, uint64_t *ret);
+int sd_dhcp6_lease_get_server_address(sd_dhcp6_lease *lease, struct in6_addr *ret);
+/* Address iteration: presumably _reset() rewinds, _next() advances, and the get_address*() calls read the current entry - TODO confirm. */
+int sd_dhcp6_lease_address_iterator_reset(sd_dhcp6_lease *lease);
+int sd_dhcp6_lease_address_iterator_next(sd_dhcp6_lease *lease);
+int sd_dhcp6_lease_get_address(
+                sd_dhcp6_lease *lease,
+                struct in6_addr *ret);
+int sd_dhcp6_lease_get_address_lifetime(
+                sd_dhcp6_lease *lease,
+                uint64_t *ret_lifetime_preferred,
+                uint64_t *ret_lifetime_valid);
+int sd_dhcp6_lease_get_address_lifetime_timestamp(
+                sd_dhcp6_lease *lease,
+                clockid_t clock,
+                uint64_t *ret_lifetime_preferred,
+                uint64_t *ret_lifetime_valid);
+int sd_dhcp6_lease_has_address(sd_dhcp6_lease *lease);
+/* Delegated-prefix ("pd") iteration; mirrors the address iterator prototypes above with a prefix plus prefix length instead of a single address. */
+int sd_dhcp6_lease_pd_iterator_reset(sd_dhcp6_lease *lease);
+int sd_dhcp6_lease_pd_iterator_next(sd_dhcp6_lease *lease);
+int sd_dhcp6_lease_get_pd_prefix(
+                sd_dhcp6_lease *lease,
+                struct in6_addr *ret_prefix,
+                uint8_t *ret_prefix_length);
+int sd_dhcp6_lease_get_pd_lifetime(
+                sd_dhcp6_lease *lease,
+                uint64_t *ret_lifetime_preferred,
+                uint64_t *ret_lifetime_valid);
+int sd_dhcp6_lease_get_pd_lifetime_timestamp(
+                sd_dhcp6_lease *lease,
+                clockid_t clock,
+                uint64_t *ret_lifetime_preferred,
+                uint64_t *ret_lifetime_valid);
+int sd_dhcp6_lease_has_pd_prefix(sd_dhcp6_lease *lease);
+
+int sd_dhcp6_lease_get_dns(sd_dhcp6_lease *lease, const struct in6_addr **ret);
+int sd_dhcp6_lease_get_domains(sd_dhcp6_lease *lease, char ***ret);
+int sd_dhcp6_lease_get_ntp_addrs(sd_dhcp6_lease *lease, const struct in6_addr **ret);
+int sd_dhcp6_lease_get_ntp_fqdn(sd_dhcp6_lease *lease, char ***ret);
+int sd_dhcp6_lease_get_fqdn(sd_dhcp6_lease *lease, const char **ret);
+int sd_dhcp6_lease_get_captive_portal(sd_dhcp6_lease *lease, const char **ret);
+int sd_dhcp6_lease_get_vendor_options(sd_dhcp6_lease *lease, sd_dhcp6_option ***ret);
+
+sd_dhcp6_lease *sd_dhcp6_lease_ref(sd_dhcp6_lease *lease);
+sd_dhcp6_lease *sd_dhcp6_lease_unref(sd_dhcp6_lease *lease);
+/* Presumably defines sd_dhcp6_lease_unrefp() for use with __attribute__((cleanup(...))) - verify against the macro in _sd-common.h. */
+_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp6_lease, sd_dhcp6_lease_unref);
+
+_SD_END_DECLARATIONS;
+
+#endif
diff --git a/src/systemd/sd-dhcp6-option.h b/src/systemd/sd-dhcp6-option.h
new file mode 100644
index 0000000..3201242
--- /dev/null
+++ b/src/systemd/sd-dhcp6-option.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#ifndef foosddhcp6optionhfoo
+#define foosddhcp6optionhfoo
+
+/***
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see .
+***/
+
+#include /* NOTE(review): the header name is missing on the bare #include lines here - looks stripped during extraction; restore from upstream. */
+#include
+
+#include "sd-dhcp6-protocol.h"
+
+#include "_sd-common.h"
+
+_SD_BEGIN_DECLARATIONS;
+/* Opaque DHCPv6 option handle: the struct is only forward-declared in this header. */
+typedef struct sd_dhcp6_option sd_dhcp6_option;
+/* Constructor: takes an option code, a data buffer with its length and an enterprise identifier, and hands the new object back through 'ret'. Whether 'data' is copied is not visible here - verify. */
+int sd_dhcp6_option_new(uint16_t option, const void *data, size_t length, uint32_t enterprise_identifier, sd_dhcp6_option **ret);
+sd_dhcp6_option *sd_dhcp6_option_ref(sd_dhcp6_option *ra); /* NOTE(review): parameter is named 'ra' although it is an sd_dhcp6_option - looks like a copy/paste leftover; harmless in a prototype. */
+sd_dhcp6_option *sd_dhcp6_option_unref(sd_dhcp6_option *ra);
+
+_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_dhcp6_option, sd_dhcp6_option_unref);
+
+_SD_END_DECLARATIONS;
+
+#endif
diff --git a/src/systemd/sd-dhcp6-protocol.h b/src/systemd/sd-dhcp6-protocol.h
new file mode 100644
index 0000000..78c80f7
--- /dev/null
+++ b/src/systemd/sd-dhcp6-protocol.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#ifndef foosddhcp6protocolhfoo
+#define foosddhcp6protocolhfoo
+
+/***
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see .
+***/
+
+#include "_sd-common.h"
+
+_SD_BEGIN_DECLARATIONS;
+/* DHCPv6 option codes 1-143 as SD_DHCP6_OPTION_* constants. The URL below names the registry; each entry's trailing comment names its RFC; codes 10 and 35 are deliberately absent (unassigned). */
+/* https://www.iana.org/assignments/dhcpv6-parameters/dhcpv6-parameters.xhtml#dhcpv6-parameters-2 */
+enum {
+        SD_DHCP6_OPTION_CLIENTID                   = 1,  /* RFC 8415 */
+        SD_DHCP6_OPTION_SERVERID                   = 2,  /* RFC 8415 */
+        SD_DHCP6_OPTION_IA_NA                      = 3,  /* RFC 8415 */
+        SD_DHCP6_OPTION_IA_TA                      = 4,  /* RFC 8415 */
+        SD_DHCP6_OPTION_IAADDR                     = 5,  /* RFC 8415 */
+        SD_DHCP6_OPTION_ORO                        = 6,  /* RFC 8415 */
+        SD_DHCP6_OPTION_PREFERENCE                 = 7,  /* RFC 8415 */
+        SD_DHCP6_OPTION_ELAPSED_TIME               = 8,  /* RFC 8415 */
+        SD_DHCP6_OPTION_RELAY_MSG                  = 9,  /* RFC 8415 */
+        /* option code 10 is unassigned */
+        SD_DHCP6_OPTION_AUTH                       = 11,  /* RFC 8415 */
+        SD_DHCP6_OPTION_UNICAST                    = 12,  /* RFC 8415 */
+        SD_DHCP6_OPTION_STATUS_CODE                = 13,  /* RFC 8415 */
+        SD_DHCP6_OPTION_RAPID_COMMIT               = 14,  /* RFC 8415 */
+        SD_DHCP6_OPTION_USER_CLASS                 = 15,  /* RFC 8415 */
+        SD_DHCP6_OPTION_VENDOR_CLASS               = 16,  /* RFC 8415 */
+        SD_DHCP6_OPTION_VENDOR_OPTS                = 17,  /* RFC 8415 */
+        SD_DHCP6_OPTION_INTERFACE_ID               = 18,  /* RFC 8415 */
+        SD_DHCP6_OPTION_RECONF_MSG                 = 19,  /* RFC 8415 */
+        SD_DHCP6_OPTION_RECONF_ACCEPT              = 20,  /* RFC 8415 */
+        SD_DHCP6_OPTION_SIP_SERVER_DOMAIN_NAME     = 21,  /* RFC 3319 */
+        SD_DHCP6_OPTION_SIP_SERVER_ADDRESS         = 22,  /* RFC 3319 */
+        SD_DHCP6_OPTION_DNS_SERVER                 = 23,  /* RFC 3646 */
+        SD_DHCP6_OPTION_DOMAIN                     = 24,  /* RFC 3646 */
+        SD_DHCP6_OPTION_IA_PD                      = 25,  /* RFC 3633, RFC 8415 */
+        SD_DHCP6_OPTION_IA_PD_PREFIX               = 26,  /* RFC 3633, RFC 8415 */
+        SD_DHCP6_OPTION_NIS_SERVER                 = 27,  /* RFC 3898 */
+        SD_DHCP6_OPTION_NISP_SERVER                = 28,  /* RFC 3898 */
+        SD_DHCP6_OPTION_NIS_DOMAIN_NAME            = 29,  /* RFC 3898 */
+        SD_DHCP6_OPTION_NISP_DOMAIN_NAME           = 30,  /* RFC 3898 */
+        SD_DHCP6_OPTION_SNTP_SERVER                = 31,  /* RFC 4075, deprecated */
+        SD_DHCP6_OPTION_INFORMATION_REFRESH_TIME   = 32,  /* RFC 4242, 8415, sec. 21.23 */
+        SD_DHCP6_OPTION_BCMCS_SERVER_D             = 33,  /* RFC 4280 */
+        SD_DHCP6_OPTION_BCMCS_SERVER_A             = 34,  /* RFC 4280 */
+        /* option code 35 is unassigned */
+        SD_DHCP6_OPTION_GEOCONF_CIVIC              = 36,  /* RFC 4776 */
+        SD_DHCP6_OPTION_REMOTE_ID                  = 37,  /* RFC 4649 */
+        SD_DHCP6_OPTION_SUBSCRIBER_ID              = 38,  /* RFC 4580 */
+        SD_DHCP6_OPTION_CLIENT_FQDN                = 39,  /* RFC 4704 */
+        SD_DHCP6_OPTION_PANA_AGENT                 = 40,  /* RFC 5192 */
+        SD_DHCP6_OPTION_POSIX_TIMEZONE             = 41,  /* RFC 4833 */
+        SD_DHCP6_OPTION_TZDB_TIMEZONE              = 42,  /* RFC 4833 */
+        SD_DHCP6_OPTION_ERO                        = 43,  /* RFC 4994 */
+        SD_DHCP6_OPTION_LQ_QUERY                   = 44,  /* RFC 5007 */
+        SD_DHCP6_OPTION_CLIENT_DATA                = 45,  /* RFC 5007 */
+        SD_DHCP6_OPTION_CLT_TIME                   = 46,  /* RFC 5007 */
+        SD_DHCP6_OPTION_LQ_RELAY_DATA              = 47,  /* RFC 5007 */
+        SD_DHCP6_OPTION_LQ_CLIENT_LINK             = 48,  /* RFC 5007 */
+        SD_DHCP6_OPTION_MIP6_HNIDF                 = 49,  /* RFC 6610 */
+        SD_DHCP6_OPTION_MIP6_VDINF                 = 50,  /* RFC 6610 */
+        SD_DHCP6_OPTION_V6_LOST                    = 51,  /* RFC 5223 */
+        SD_DHCP6_OPTION_CAPWAP_AC_V6               = 52,  /* RFC 5417 */
+        SD_DHCP6_OPTION_RELAY_ID                   = 53,  /* RFC 5460 */
+        SD_DHCP6_OPTION_IPV6_ADDRESS_MOS           = 54,  /* RFC 5678 */
+        SD_DHCP6_OPTION_IPV6_FQDN_MOS              = 55,  /* RFC 5678 */
+        SD_DHCP6_OPTION_NTP_SERVER                 = 56,  /* RFC 5908 */
+        SD_DHCP6_OPTION_V6_ACCESS_DOMAIN           = 57,  /* RFC 5986 */
+        SD_DHCP6_OPTION_SIP_UA_CS_LIST             = 58,  /* RFC 6011 */
+        SD_DHCP6_OPTION_BOOTFILE_URL               = 59,  /* RFC 5970 */
+        SD_DHCP6_OPTION_BOOTFILE_PARAM             = 60,  /* RFC 5970 */
+        SD_DHCP6_OPTION_CLIENT_ARCH_TYPE           = 61,  /* RFC 5970 */
+        SD_DHCP6_OPTION_NII                        = 62,  /* RFC 5970 */
+        SD_DHCP6_OPTION_GEOLOCATION                = 63,  /* RFC 6225 */
+        SD_DHCP6_OPTION_AFTR_NAME                  = 64,  /* RFC 6334 */
+        SD_DHCP6_OPTION_ERP_LOCAL_DOMAIN_NAME      = 65,  /* RFC 6440 */
+        SD_DHCP6_OPTION_RSOO                       = 66,  /* RFC 6422 */
+        SD_DHCP6_OPTION_PD_EXCLUDE                 = 67,  /* RFC 6603 */
+        SD_DHCP6_OPTION_VSS                        = 68,  /* RFC 6607 */
+        SD_DHCP6_OPTION_MIP6_IDINF                 = 69,  /* RFC 6610 */
+        SD_DHCP6_OPTION_MIP6_UDINF                 = 70,  /* RFC 6610 */
+        SD_DHCP6_OPTION_MIP6_HNP                   = 71,  /* RFC 6610 */
+        SD_DHCP6_OPTION_MIP6_HAA                   = 72,  /* RFC 6610 */
+        SD_DHCP6_OPTION_MIP6_HAF                   = 73,  /* RFC 6610 */
+        SD_DHCP6_OPTION_RDNSS_SELECTION            = 74,  /* RFC 6731 */
+        SD_DHCP6_OPTION_KRB_PRINCIPAL_NAME         = 75,  /* RFC 6784 */
+        SD_DHCP6_OPTION_KRB_REALM_NAME             = 76,  /* RFC 6784 */
+        SD_DHCP6_OPTION_KRB_DEFAULT_REALM_NAME     = 77,  /* RFC 6784 */
+        SD_DHCP6_OPTION_KRB_KDC                    = 78,  /* RFC 6784 */
+        SD_DHCP6_OPTION_CLIENT_LINKLAYER_ADDR      = 79,  /* RFC 6939 */
+        SD_DHCP6_OPTION_LINK_ADDRESS               = 80,  /* RFC 6977 */
+        SD_DHCP6_OPTION_RADIUS                     = 81,  /* RFC 7037 */
+        SD_DHCP6_OPTION_SOL_MAX_RT                 = 82,  /* RFC 7083, RFC 8415 */
+        SD_DHCP6_OPTION_INF_MAX_RT                 = 83,  /* RFC 7083, RFC 8415 */
+        SD_DHCP6_OPTION_ADDRSEL                    = 84,  /* RFC 7078 */
+        SD_DHCP6_OPTION_ADDRSEL_TABLE              = 85,  /* RFC 7078 */
+        SD_DHCP6_OPTION_V6_PCP_SERVER              = 86,  /* RFC 7291 */
+        SD_DHCP6_OPTION_DHCPV4_MSG                 = 87,  /* RFC 7341 */
+        SD_DHCP6_OPTION_DHCP4_O_DHCP6_SERVER       = 88,  /* RFC 7341 */
+        SD_DHCP6_OPTION_S46_RULE                   = 89,  /* RFC 7598 */
+        SD_DHCP6_OPTION_S46_BR                     = 90,  /* RFC 7598, RFC 8539 */
+        SD_DHCP6_OPTION_S46_DMR                    = 91,  /* RFC 7598 */
+        SD_DHCP6_OPTION_S46_V4V6BIND               = 92,  /* RFC 7598 */
+        SD_DHCP6_OPTION_S46_PORTPARAMS             = 93,  /* RFC 7598 */
+        SD_DHCP6_OPTION_S46_CONT_MAPE              = 94,  /* RFC 7598 */
+        SD_DHCP6_OPTION_S46_CONT_MAPT              = 95,  /* RFC 7598 */
+        SD_DHCP6_OPTION_S46_CONT_LW                = 96,  /* RFC 7598 */
+        SD_DHCP6_OPTION_4RD                        = 97,  /* RFC 7600 */
+        SD_DHCP6_OPTION_4RD_MAP_RULE               = 98,  /* RFC 7600 */
+        SD_DHCP6_OPTION_4RD_NON_MAP_RULE           = 99,  /* RFC 7600 */
+        SD_DHCP6_OPTION_LQ_BASE_TIME               = 100, /* RFC 7653 */
+        SD_DHCP6_OPTION_LQ_START_TIME              = 101, /* RFC 7653 */
+        SD_DHCP6_OPTION_LQ_END_TIME                = 102, /* RFC 7653 */
+        SD_DHCP6_OPTION_CAPTIVE_PORTAL             = 103, /* RFC 8910 */
+        SD_DHCP6_OPTION_MPL_PARAMETERS             = 104, /* RFC 7774 */
+        SD_DHCP6_OPTION_ANI_ATT                    = 105, /* RFC 7839 */
+        SD_DHCP6_OPTION_ANI_NETWORK_NAME           = 106, /* RFC 7839 */
+        SD_DHCP6_OPTION_ANI_AP_NAME                = 107, /* RFC 7839 */
+        SD_DHCP6_OPTION_ANI_AP_BSSID               = 108, /* RFC 7839 */
+        SD_DHCP6_OPTION_ANI_OPERATOR_ID            = 109, /* RFC 7839 */
+        SD_DHCP6_OPTION_ANI_OPERATOR_REALM         = 110, /* RFC 7839 */
+        SD_DHCP6_OPTION_S46_PRIORITY               = 111, /* RFC 8026 */
+        SD_DHCP6_OPTION_MUD_URL_V6                 = 112, /* RFC 8520 */
+        SD_DHCP6_OPTION_V6_PREFIX64                = 113, /* RFC 8115 */
+        SD_DHCP6_OPTION_F_BINDING_STATUS           = 114, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_CONNECT_FLAGS            = 115, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_DNS_REMOVAL_INFO         = 116, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_DNS_HOST_NAME            = 117, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_DNS_ZONE_NAME            = 118, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_DNS_FLAGS                = 119, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_EXPIRATION_TIME          = 120, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_MAX_UNACKED_BNDUPD       = 121, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_MCLT                     = 122, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_PARTNER_LIFETIME         = 123, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_PARTNER_LIFETIME_SENT    = 124, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_PARTNER_DOWN_TIME        = 125, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_PARTNER_RAW_CLT_TIME     = 126, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_PROTOCOL_VERSION         = 127, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_KEEPALIVE_TIME           = 128, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_RECONFIGURE_DATA         = 129, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_RELATIONSHIP_NAME        = 130, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_SERVER_FLAGS             = 131, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_SERVER_STATE             = 132, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_START_TIME_OF_STATE      = 133, /* RFC 8156 */
+        SD_DHCP6_OPTION_F_STATE_EXPIRATION_TIME    = 134, /* RFC 8156 */
+        SD_DHCP6_OPTION_RELAY_PORT                 = 135, /* RFC 8357 */
+        SD_DHCP6_OPTION_V6_SZTP_REDIRECT           = 136, /* RFC 8572 */
+        SD_DHCP6_OPTION_S46_BIND_IPV6_PREFIX       = 137, /* RFC 8539 */
+        SD_DHCP6_OPTION_IA_LL                      = 138, /* RFC 8947 */
+        SD_DHCP6_OPTION_LLADDR                     = 139, /* RFC 8947 */
+        SD_DHCP6_OPTION_SLAP_QUAD                  = 140, /* RFC 8948 */
+        SD_DHCP6_OPTION_V6_DOTS_RI                 = 141, /* RFC 8973 */
+        SD_DHCP6_OPTION_V6_DOTS_ADDRESS            = 142, /* RFC 8973 */
+        SD_DHCP6_OPTION_IPV6_ADDRESS_ANDSF         = 143  /* RFC 6153 */
+        /* option codes 144-65535 are unassigned */
+};
+
+_SD_END_DECLARATIONS;
+
+#endif
diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h
new file mode 100644
index 0000000..49d6975
--- /dev/null
+++ b/src/systemd/sd-event.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#ifndef foosdeventhfoo
+#define foosdeventhfoo
+
+/***
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see .
+***/
+
+#include /* NOTE(review): the header name is missing on the bare #include lines here - looks stripped during extraction; restore from upstream. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "_sd-common.h"
+
+/*
+  Why is this better than pure epoll?
+
+  - Supports event source prioritization
+  - Scales better with a large number of time events because it does not require one timerfd each
+  - Automatically tries to coalesce timer events system-wide
+  - Handles signals, child PIDs, inotify events
+  - Supports systemd-style automatic watchdog event generation
+*/
+
+_SD_BEGIN_DECLARATIONS;
+/* Sentinel pointer value (the integer 1 cast to sd_event *), not a real object address; presumably accepted where an event loop is expected to mean "the default one" - verify where it is consumed. */
+#define SD_EVENT_DEFAULT ((sd_event *) 1)
+/* Opaque handles: both structs are only forward-declared in this header. */
+typedef struct sd_event sd_event;
+typedef struct sd_event_source sd_event_source;
+/* Enablement states; presumably the values used with sd_event_source_get_enabled()/sd_event_source_set_enabled() - TODO confirm. */
+enum {
+        SD_EVENT_OFF = 0,
+        SD_EVENT_ON = 1,
+        SD_EVENT_ONESHOT = -1
+};
+/* Event loop states, numbered implicitly from 0 in declaration order; presumably what sd_event_get_state() reports - TODO confirm. */
+enum {
+        SD_EVENT_INITIAL,
+        SD_EVENT_ARMED,
+        SD_EVENT_PENDING,
+        SD_EVENT_RUNNING,
+        SD_EVENT_EXITING,
+        SD_EVENT_FINISHED,
+        SD_EVENT_PREPARING
+};
+/* Named reference points on the priority scale; presumably for the int64_t taken by sd_event_source_set_priority() - TODO confirm. */
+enum {
+        /* And everything in-between and outside is good too */
+        SD_EVENT_PRIORITY_IMPORTANT = -100,
+        SD_EVENT_PRIORITY_NORMAL = 0,
+        SD_EVENT_PRIORITY_IDLE = 100
+};
+/* Single-bit flag (bit 30); presumably OR-ed into the 'sig' argument of sd_event_add_signal() - verify against the implementation. */
+#define SD_EVENT_SIGNAL_PROCMASK (1 << 30)
+/* Callback signatures, one per event source kind; each receives the source and a userdata pointer. Without _GNU_SOURCE or _POSIX_C_SOURCE >= 199309L the child handler type degrades to void*. */
+typedef int (*sd_event_handler_t)(sd_event_source *s, void *userdata);
+typedef int (*sd_event_io_handler_t)(sd_event_source *s, int fd, uint32_t revents, void *userdata);
+typedef int (*sd_event_time_handler_t)(sd_event_source *s, uint64_t usec, void *userdata);
+typedef int (*sd_event_signal_handler_t)(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata);
+#if defined _GNU_SOURCE || (defined _POSIX_C_SOURCE && _POSIX_C_SOURCE >= 199309L)
+typedef int (*sd_event_child_handler_t)(sd_event_source *s, const siginfo_t *si, void *userdata);
+#else
+typedef void* sd_event_child_handler_t;
+#endif
+typedef int (*sd_event_inotify_handler_t)(sd_event_source *s, const struct inotify_event *event, void *userdata);
+typedef _sd_destroy_t sd_event_destroy_t;
+
+int sd_event_default(sd_event **e);
+
+int sd_event_new(sd_event **e);
+sd_event* sd_event_ref(sd_event *e);
+sd_event* sd_event_unref(sd_event *e);
+/* Source constructors: each takes the loop, an sd_event_source ** slot for the new source, kind-specific arguments, then the callback and its userdata. */
+int sd_event_add_io(sd_event *e, sd_event_source **s, int fd, uint32_t events, sd_event_io_handler_t callback, void *userdata);
+int sd_event_add_time(sd_event *e, sd_event_source **s, clockid_t clock, uint64_t usec, uint64_t accuracy, sd_event_time_handler_t callback, void *userdata);
+int sd_event_add_time_relative(sd_event *e, sd_event_source **s, clockid_t clock, uint64_t usec, uint64_t accuracy, sd_event_time_handler_t callback, void *userdata);
+int sd_event_add_signal(sd_event *e, sd_event_source **s, int sig, sd_event_signal_handler_t callback, void *userdata);
+int sd_event_add_child(sd_event *e, sd_event_source **s, pid_t pid, int options, sd_event_child_handler_t callback, void *userdata);
+int sd_event_add_child_pidfd(sd_event *e, sd_event_source **s, int pidfd, int options, sd_event_child_handler_t callback, void *userdata);
+int sd_event_add_inotify(sd_event *e, sd_event_source **s, const char *path, uint32_t mask, sd_event_inotify_handler_t callback, void *userdata);
+int sd_event_add_inotify_fd(sd_event *e, sd_event_source **s, int fd, uint32_t mask, sd_event_inotify_handler_t callback, void *userdata);
+int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
+int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
+int sd_event_add_exit(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
+int sd_event_add_memory_pressure(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
+
+int sd_event_prepare(sd_event *e);
+int sd_event_wait(sd_event *e, uint64_t usec);
+int sd_event_dispatch(sd_event *e);
+int sd_event_run(sd_event *e, uint64_t usec);
+int sd_event_loop(sd_event *e);
+int sd_event_exit(sd_event *e, int code);
+
+int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec);
+
+int sd_event_get_fd(sd_event *e);
+int sd_event_get_state(sd_event *e);
+int sd_event_get_tid(sd_event *e, pid_t *tid);
+int sd_event_get_exit_code(sd_event *e, int *code);
+int sd_event_set_watchdog(sd_event *e, int b);
+int
sd_event_get_watchdog(sd_event *e);
+int sd_event_get_iteration(sd_event *e, uint64_t *ret);
+int sd_event_set_signal_exit(sd_event *e, int b);
+/* Reference counting for event sources; sd_event_source_disable_unref() presumably disables the source before dropping the reference - TODO confirm. */
+sd_event_source* sd_event_source_ref(sd_event_source *s);
+sd_event_source* sd_event_source_unref(sd_event_source *s);
+sd_event_source* sd_event_source_disable_unref(sd_event_source *s);
+
+sd_event *sd_event_source_get_event(sd_event_source *s);
+void* sd_event_source_get_userdata(sd_event_source *s);
+void* sd_event_source_set_userdata(sd_event_source *s, void *userdata);
+/* Per-source getters/setters. Getters with a pointer parameter write their result through it; the others return their answer directly as int. */
+int sd_event_source_set_description(sd_event_source *s, const char *description);
+int sd_event_source_get_description(sd_event_source *s, const char **description);
+int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback);
+int sd_event_source_get_pending(sd_event_source *s);
+int sd_event_source_get_priority(sd_event_source *s, int64_t *priority);
+int sd_event_source_set_priority(sd_event_source *s, int64_t priority);
+int sd_event_source_get_enabled(sd_event_source *s, int *enabled);
+int sd_event_source_set_enabled(sd_event_source *s, int enabled);
+int sd_event_source_get_io_fd(sd_event_source *s);
+int sd_event_source_set_io_fd(sd_event_source *s, int fd);
+int sd_event_source_get_io_fd_own(sd_event_source *s);
+int sd_event_source_set_io_fd_own(sd_event_source *s, int own);
+int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events);
+int sd_event_source_set_io_events(sd_event_source *s, uint32_t events);
+int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents);
+int sd_event_source_get_time(sd_event_source *s, uint64_t *usec);
+int sd_event_source_set_time(sd_event_source *s, uint64_t usec);
+int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec);
+int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec);
+int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec);
+int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock);
+int sd_event_source_get_signal(sd_event_source *s);
+int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid);
+int sd_event_source_get_child_pidfd(sd_event_source *s);
+int sd_event_source_get_child_pidfd_own(sd_event_source *s);
+int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own);
+int sd_event_source_get_child_process_own(sd_event_source *s);
+int sd_event_source_set_child_process_own(sd_event_source *s, int own);
+#if defined _GNU_SOURCE || (defined _POSIX_C_SOURCE && _POSIX_C_SOURCE >= 199309L)
+int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags);
+#else /* Fallback prototype for the same function with 'si' typed as const void *, used when the feature-test condition above is false. */
+int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags);
+#endif
+int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret);
+int sd_event_source_set_memory_pressure_type(sd_event_source *e, const char *ty); /* NOTE(review): the source parameter is named 'e' here but 's' in every sibling prototype; harmless, but inconsistent. */
+int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec);
+int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback);
+int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret);
+int sd_event_source_get_floating(sd_event_source *s);
+int sd_event_source_set_floating(sd_event_source *s, int b);
+int sd_event_source_get_exit_on_failure(sd_event_source *s);
+int sd_event_source_set_exit_on_failure(sd_event_source *s, int b);
+int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval_usec, unsigned burst);
+int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval_usec, unsigned *ret_burst);
+int sd_event_source_is_ratelimited(sd_event_source *s);
+int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback);
+int sd_event_source_leave_ratelimit(sd_event_source *s);
+
+int sd_event_trim_memory(void);
+
+/* Define helpers so that __attribute__((cleanup(sd_event_unrefp))) and similar may be used. */
+_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event, sd_event_unref);
+_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event_source, sd_event_source_unref);
+_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event_source, sd_event_source_disable_unref);
+
+_SD_END_DECLARATIONS;
+
+#endif
diff --git a/src/systemd/sd-gpt.h b/src/systemd/sd-gpt.h
new file mode 100644
index 0000000..7ffa57a
--- /dev/null
+++ b/src/systemd/sd-gpt.h
@@ -0,0 +1,369 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#ifndef foosdgpthfoo
+#define foosdgpthfoo
+
+/***
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see .
+***/ + +#include "sd-id128.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +#define SD_GPT_ROOT_ALPHA SD_ID128_MAKE(65,23,f8,ae,3e,b1,4e,2a,a0,5a,18,b6,95,ae,65,6f) +#define SD_GPT_ROOT_ARC SD_ID128_MAKE(d2,7f,46,ed,29,19,4c,b8,bd,25,95,31,f3,c1,65,34) +#define SD_GPT_ROOT_ARM SD_ID128_MAKE(69,da,d7,10,2c,e4,4e,3c,b1,6c,21,a1,d4,9a,be,d3) +#define SD_GPT_ROOT_ARM64 SD_ID128_MAKE(b9,21,b0,45,1d,f0,41,c3,af,44,4c,6f,28,0d,3f,ae) +#define SD_GPT_ROOT_IA64 SD_ID128_MAKE(99,3d,8d,3d,f8,0e,42,25,85,5a,9d,af,8e,d7,ea,97) +#define SD_GPT_ROOT_LOONGARCH64 SD_ID128_MAKE(77,05,58,00,79,2c,4f,94,b3,9a,98,c9,1b,76,2b,b6) +#define SD_GPT_ROOT_MIPS SD_ID128_MAKE(e9,43,45,44,6e,2c,47,cc,ba,e2,12,d6,de,af,b4,4c) +#define SD_GPT_ROOT_MIPS64 SD_ID128_MAKE(d1,13,af,76,80,ef,41,b4,bd,b6,0c,ff,4d,3d,4a,25) +#define SD_GPT_ROOT_MIPS_LE SD_ID128_MAKE(37,c5,8c,8a,d9,13,41,56,a2,5f,48,b1,b6,4e,07,f0) +#define SD_GPT_ROOT_MIPS64_LE SD_ID128_MAKE(70,0b,da,43,7a,34,45,07,b1,79,ee,b9,3d,7a,7c,a3) +#define SD_GPT_ROOT_PARISC SD_ID128_MAKE(1a,ac,db,3b,54,44,41,38,bd,9e,e5,c2,23,9b,23,46) +#define SD_GPT_ROOT_PPC SD_ID128_MAKE(1d,e3,f1,ef,fa,98,47,b5,8d,cd,4a,86,0a,65,4d,78) +#define SD_GPT_ROOT_PPC64 SD_ID128_MAKE(91,2a,de,1d,a8,39,49,13,89,64,a1,0e,ee,08,fb,d2) +#define SD_GPT_ROOT_PPC64_LE SD_ID128_MAKE(c3,1c,45,e6,3f,39,41,2e,80,fb,48,09,c4,98,05,99) +#define SD_GPT_ROOT_RISCV32 SD_ID128_MAKE(60,d5,a7,fe,8e,7d,43,5c,b7,14,3d,d8,16,21,44,e1) +#define SD_GPT_ROOT_RISCV64 SD_ID128_MAKE(72,ec,70,a6,cf,74,40,e6,bd,49,4b,da,08,e8,f2,24) +#define SD_GPT_ROOT_S390 SD_ID128_MAKE(08,a7,ac,ea,62,4c,4a,20,91,e8,6e,0f,a6,7d,23,f9) +#define SD_GPT_ROOT_S390X SD_ID128_MAKE(5e,ea,d9,a9,fe,09,4a,1e,a1,d7,52,0d,00,53,13,06) +#define SD_GPT_ROOT_TILEGX SD_ID128_MAKE(c5,0c,dd,70,38,62,4c,c3,90,e1,80,9a,8c,93,ee,2c) +#define SD_GPT_ROOT_X86 SD_ID128_MAKE(44,47,95,40,f2,97,41,b2,9a,f7,d1,31,d5,f0,45,8a) +#define SD_GPT_ROOT_X86_64 SD_ID128_MAKE(4f,68,bc,e3,e8,cd,4d,b1,96,e7,fb,ca,f9,84,b7,09) +#define 
SD_GPT_USR_ALPHA SD_ID128_MAKE(e1,8c,f0,8c,33,ec,4c,0d,82,46,c6,c6,fb,3d,a0,24) +#define SD_GPT_USR_ARC SD_ID128_MAKE(79,78,a6,83,63,16,49,22,bb,ee,38,bf,f5,a2,fe,cc) +#define SD_GPT_USR_ARM SD_ID128_MAKE(7d,03,59,a3,02,b3,4f,0a,86,5c,65,44,03,e7,06,25) +#define SD_GPT_USR_ARM64 SD_ID128_MAKE(b0,e0,10,50,ee,5f,43,90,94,9a,91,01,b1,71,04,e9) +#define SD_GPT_USR_IA64 SD_ID128_MAKE(43,01,d2,a6,4e,3b,4b,2a,bb,94,9e,0b,2c,42,25,ea) +#define SD_GPT_USR_LOONGARCH64 SD_ID128_MAKE(e6,11,c7,02,57,5c,4c,be,9a,46,43,4f,a0,bf,7e,3f) +#define SD_GPT_USR_MIPS SD_ID128_MAKE(77,3b,2a,bc,2a,99,43,98,8b,f5,03,ba,ac,40,d0,2b) +#define SD_GPT_USR_MIPS64 SD_ID128_MAKE(57,e1,39,58,73,31,43,65,8e,6e,35,ee,ee,17,c6,1b) +#define SD_GPT_USR_MIPS_LE SD_ID128_MAKE(0f,48,68,e9,99,52,47,06,97,9f,3e,d3,a4,73,e9,47) +#define SD_GPT_USR_MIPS64_LE SD_ID128_MAKE(c9,7c,1f,32,ba,06,40,b4,9f,22,23,60,61,b0,8a,a8) +#define SD_GPT_USR_PARISC SD_ID128_MAKE(dc,4a,44,80,69,17,42,62,a4,ec,db,93,84,94,9f,25) +#define SD_GPT_USR_PPC SD_ID128_MAKE(7d,14,fe,c5,cc,71,41,5d,9d,6c,06,bf,0b,3c,3e,af) +#define SD_GPT_USR_PPC64 SD_ID128_MAKE(2c,97,39,e2,f0,68,46,b3,9f,d0,01,c5,a9,af,bc,ca) +#define SD_GPT_USR_PPC64_LE SD_ID128_MAKE(15,bb,03,af,77,e7,4d,4a,b1,2b,c0,d0,84,f7,49,1c) +#define SD_GPT_USR_RISCV32 SD_ID128_MAKE(b9,33,fb,22,5c,3f,4f,91,af,90,e2,bb,0f,a5,07,02) +#define SD_GPT_USR_RISCV64 SD_ID128_MAKE(be,ae,c3,4b,84,42,43,9b,a4,0b,98,43,81,ed,09,7d) +#define SD_GPT_USR_S390 SD_ID128_MAKE(cd,0f,86,9b,d0,fb,4c,a0,b1,41,9e,a8,7c,c7,8d,66) +#define SD_GPT_USR_S390X SD_ID128_MAKE(8a,4f,57,70,50,aa,4e,d3,87,4a,99,b7,10,db,6f,ea) +#define SD_GPT_USR_TILEGX SD_ID128_MAKE(55,49,70,29,c7,c1,44,cc,aa,39,81,5e,d1,55,86,30) +#define SD_GPT_USR_X86 SD_ID128_MAKE(75,25,0d,76,8c,c6,45,8e,bd,66,bd,47,cc,81,a8,12) +#define SD_GPT_USR_X86_64 SD_ID128_MAKE(84,84,68,0c,95,21,48,c6,9c,11,b0,72,06,56,f6,9e) + +/* Verity partitions for the root partitions above (we only define them for the root and /usr partitions, + * because only 
they are commonly read-only and hence suitable for verity). */ +#define SD_GPT_ROOT_ALPHA_VERITY SD_ID128_MAKE(fc,56,d9,e9,e6,e5,4c,06,be,32,e7,44,07,ce,09,a5) +#define SD_GPT_ROOT_ARC_VERITY SD_ID128_MAKE(24,b2,d9,75,0f,97,45,21,af,a1,cd,53,1e,42,1b,8d) +#define SD_GPT_ROOT_ARM_VERITY SD_ID128_MAKE(73,86,cd,f2,20,3c,47,a9,a4,98,f2,ec,ce,45,a2,d6) +#define SD_GPT_ROOT_ARM64_VERITY SD_ID128_MAKE(df,33,00,ce,d6,9f,4c,92,97,8c,9b,fb,0f,38,d8,20) +#define SD_GPT_ROOT_IA64_VERITY SD_ID128_MAKE(86,ed,10,d5,b6,07,45,bb,89,57,d3,50,f2,3d,05,71) +#define SD_GPT_ROOT_LOONGARCH64_VERITY SD_ID128_MAKE(f3,39,3b,22,e9,af,46,13,a9,48,9d,3b,fb,d0,c5,35) +#define SD_GPT_ROOT_MIPS_VERITY SD_ID128_MAKE(7a,43,07,99,f7,11,4c,7e,8e,5b,1d,68,5b,d4,86,07) +#define SD_GPT_ROOT_MIPS64_VERITY SD_ID128_MAKE(57,95,36,f8,6a,33,40,55,a9,5a,df,2d,5e,2c,42,a8) +#define SD_GPT_ROOT_MIPS_LE_VERITY SD_ID128_MAKE(d7,d1,50,d2,2a,04,4a,33,8f,12,16,65,12,05,ff,7b) +#define SD_GPT_ROOT_MIPS64_LE_VERITY SD_ID128_MAKE(16,b4,17,f8,3e,06,4f,57,8d,d2,9b,52,32,f4,1a,a6) +#define SD_GPT_ROOT_PARISC_VERITY SD_ID128_MAKE(d2,12,a4,30,fb,c5,49,f9,a9,83,a7,fe,ef,2b,8d,0e) +#define SD_GPT_ROOT_PPC64_LE_VERITY SD_ID128_MAKE(90,6b,d9,44,45,89,4a,ae,a4,e4,dd,98,39,17,44,6a) +#define SD_GPT_ROOT_PPC64_VERITY SD_ID128_MAKE(92,25,a9,a3,3c,19,4d,89,b4,f6,ee,ff,88,f1,76,31) +#define SD_GPT_ROOT_PPC_VERITY SD_ID128_MAKE(98,cf,e6,49,15,88,46,dc,b2,f0,ad,d1,47,42,49,25) +#define SD_GPT_ROOT_RISCV32_VERITY SD_ID128_MAKE(ae,02,53,be,11,67,40,07,ac,68,43,92,6c,14,c5,de) +#define SD_GPT_ROOT_RISCV64_VERITY SD_ID128_MAKE(b6,ed,55,82,44,0b,42,09,b8,da,5f,f7,c4,19,ea,3d) +#define SD_GPT_ROOT_S390_VERITY SD_ID128_MAKE(7a,c6,3b,47,b2,5c,46,3b,8d,f8,b4,a9,4e,6c,90,e1) +#define SD_GPT_ROOT_S390X_VERITY SD_ID128_MAKE(b3,25,bf,be,c7,be,4a,b8,83,57,13,9e,65,2d,2f,6b) +#define SD_GPT_ROOT_TILEGX_VERITY SD_ID128_MAKE(96,60,61,ec,28,e4,4b,2e,b4,a5,1f,0a,82,5a,1d,84) +#define SD_GPT_ROOT_X86_64_VERITY 
SD_ID128_MAKE(2c,73,57,ed,eb,d2,46,d9,ae,c1,23,d4,37,ec,2b,f5) +#define SD_GPT_ROOT_X86_VERITY SD_ID128_MAKE(d1,3c,5d,3b,b5,d1,42,2a,b2,9f,94,54,fd,c8,9d,76) +#define SD_GPT_USR_ALPHA_VERITY SD_ID128_MAKE(8c,ce,0d,25,c0,d0,4a,44,bd,87,46,33,1b,f1,df,67) +#define SD_GPT_USR_ARC_VERITY SD_ID128_MAKE(fc,a0,59,8c,d8,80,45,91,8c,16,4e,da,05,c7,34,7c) +#define SD_GPT_USR_ARM_VERITY SD_ID128_MAKE(c2,15,d7,51,7b,cd,46,49,be,90,66,27,49,0a,4c,05) +#define SD_GPT_USR_ARM64_VERITY SD_ID128_MAKE(6e,11,a4,e7,fb,ca,4d,ed,b9,e9,e1,a5,12,bb,66,4e) +#define SD_GPT_USR_IA64_VERITY SD_ID128_MAKE(6a,49,1e,03,3b,e7,45,45,8e,38,83,32,0e,0e,a8,80) +#define SD_GPT_USR_LOONGARCH64_VERITY SD_ID128_MAKE(f4,6b,2c,26,59,ae,48,f0,91,06,c5,0e,d4,7f,67,3d) +#define SD_GPT_USR_MIPS_VERITY SD_ID128_MAKE(6e,5a,1b,c8,d2,23,49,b7,bc,a8,37,a5,fc,ce,b9,96) +#define SD_GPT_USR_MIPS64_VERITY SD_ID128_MAKE(81,cf,9d,90,74,58,4d,f4,8d,cf,c8,a3,a4,04,f0,9b) +#define SD_GPT_USR_MIPS_LE_VERITY SD_ID128_MAKE(46,b9,8d,8d,b5,5c,4e,8f,aa,b3,37,fc,a7,f8,07,52) +#define SD_GPT_USR_MIPS64_LE_VERITY SD_ID128_MAKE(3c,3d,61,fe,b5,f3,41,4d,bb,71,87,39,a6,94,a4,ef) +#define SD_GPT_USR_PARISC_VERITY SD_ID128_MAKE(58,43,d6,18,ec,37,48,d7,9f,12,ce,a8,e0,87,68,b2) +#define SD_GPT_USR_PPC64_LE_VERITY SD_ID128_MAKE(ee,2b,99,83,21,e8,41,53,86,d9,b6,90,1a,54,d1,ce) +#define SD_GPT_USR_PPC64_VERITY SD_ID128_MAKE(bd,b5,28,a5,a2,59,47,5f,a8,7d,da,53,fa,73,6a,07) +#define SD_GPT_USR_PPC_VERITY SD_ID128_MAKE(df,76,5d,00,27,0e,49,e5,bc,75,f4,7b,b2,11,8b,09) +#define SD_GPT_USR_RISCV32_VERITY SD_ID128_MAKE(cb,1e,e4,e3,8c,d0,41,36,a0,a4,aa,61,a3,2e,87,30) +#define SD_GPT_USR_RISCV64_VERITY SD_ID128_MAKE(8f,10,56,be,9b,05,47,c4,81,d6,be,53,12,8e,5b,54) +#define SD_GPT_USR_S390_VERITY SD_ID128_MAKE(b6,63,c6,18,e7,bc,4d,6d,90,aa,11,b7,56,bb,17,97) +#define SD_GPT_USR_S390X_VERITY SD_ID128_MAKE(31,74,1c,c4,1a,2a,41,11,a5,81,e0,0b,44,7d,2d,06) +#define SD_GPT_USR_TILEGX_VERITY SD_ID128_MAKE(2f,b4,bf,56,07,fa,42,da,81,32,6b,13,9f,20,26,ae) 
+#define SD_GPT_USR_X86_64_VERITY SD_ID128_MAKE(77,ff,5f,63,e7,b6,46,33,ac,f4,15,65,b8,64,c0,e6) +#define SD_GPT_USR_X86_VERITY SD_ID128_MAKE(8f,46,1b,0d,14,ee,4e,81,9a,a9,04,9b,6f,b9,7a,bd) + +/* PKCS#7 Signatures for the Verity Root Hashes */ +#define SD_GPT_ROOT_ALPHA_VERITY_SIG SD_ID128_MAKE(d4,64,95,b7,a0,53,41,4f,80,f7,70,0c,99,92,1e,f8) +#define SD_GPT_ROOT_ARC_VERITY_SIG SD_ID128_MAKE(14,3a,70,ba,cb,d3,4f,06,91,9f,6c,05,68,3a,78,bc) +#define SD_GPT_ROOT_ARM_VERITY_SIG SD_ID128_MAKE(42,b0,45,5f,eb,11,49,1d,98,d3,56,14,5b,a9,d0,37) +#define SD_GPT_ROOT_ARM64_VERITY_SIG SD_ID128_MAKE(6d,b6,9d,e6,29,f4,47,58,a7,a5,96,21,90,f0,0c,e3) +#define SD_GPT_ROOT_IA64_VERITY_SIG SD_ID128_MAKE(e9,8b,36,ee,32,ba,48,82,9b,12,0c,e1,46,55,f4,6a) +#define SD_GPT_ROOT_LOONGARCH64_VERITY_SIG SD_ID128_MAKE(5a,fb,67,eb,ec,c8,4f,85,ae,8e,ac,1e,7c,50,e7,d0) +#define SD_GPT_ROOT_MIPS_VERITY_SIG SD_ID128_MAKE(bb,a2,10,a2,9c,5d,45,ee,9e,87,ff,2c,cb,d0,02,d0) +#define SD_GPT_ROOT_MIPS64_VERITY_SIG SD_ID128_MAKE(43,ce,94,d4,0f,3d,49,99,82,50,b9,de,af,d9,8e,6e) +#define SD_GPT_ROOT_MIPS_LE_VERITY_SIG SD_ID128_MAKE(c9,19,cc,1f,44,56,4e,ff,91,8c,f7,5e,94,52,5c,a5) +#define SD_GPT_ROOT_MIPS64_LE_VERITY_SIG SD_ID128_MAKE(90,4e,58,ef,5c,65,4a,31,9c,57,6a,f5,fc,7c,5d,e7) +#define SD_GPT_ROOT_PARISC_VERITY_SIG SD_ID128_MAKE(15,de,61,70,65,d3,43,1c,91,6e,b0,dc,d8,39,3f,25) +#define SD_GPT_ROOT_PPC64_LE_VERITY_SIG SD_ID128_MAKE(d4,a2,36,e7,e8,73,4c,07,bf,1d,bf,6c,f7,f1,c3,c6) +#define SD_GPT_ROOT_PPC64_VERITY_SIG SD_ID128_MAKE(f5,e2,c2,0c,45,b2,4f,fa,bc,e9,2a,60,73,7e,1a,af) +#define SD_GPT_ROOT_PPC_VERITY_SIG SD_ID128_MAKE(1b,31,b5,aa,ad,d9,46,3a,b2,ed,bd,46,7f,c8,57,e7) +#define SD_GPT_ROOT_RISCV32_VERITY_SIG SD_ID128_MAKE(3a,11,2a,75,87,29,43,80,b4,cf,76,4d,79,93,44,48) +#define SD_GPT_ROOT_RISCV64_VERITY_SIG SD_ID128_MAKE(ef,e0,f0,87,ea,8d,44,69,82,1a,4c,2a,96,a8,38,6a) +#define SD_GPT_ROOT_S390_VERITY_SIG SD_ID128_MAKE(34,82,38,8e,42,54,43,5a,a2,41,76,6a,06,5f,99,60) +#define 
SD_GPT_ROOT_S390X_VERITY_SIG SD_ID128_MAKE(c8,01,87,a5,73,a3,49,1a,90,1a,01,7c,3f,a9,53,e9) +#define SD_GPT_ROOT_TILEGX_VERITY_SIG SD_ID128_MAKE(b3,67,14,39,97,b0,4a,53,90,f7,2d,5a,8f,3a,d4,7b) +#define SD_GPT_ROOT_X86_64_VERITY_SIG SD_ID128_MAKE(41,09,2b,05,9f,c8,45,23,99,4f,2d,ef,04,08,b1,76) +#define SD_GPT_ROOT_X86_VERITY_SIG SD_ID128_MAKE(59,96,fc,05,10,9c,48,de,80,8b,23,fa,08,30,b6,76) +#define SD_GPT_USR_ALPHA_VERITY_SIG SD_ID128_MAKE(5c,6e,1c,76,07,6a,45,7a,a0,fe,f3,b4,cd,21,ce,6e) +#define SD_GPT_USR_ARC_VERITY_SIG SD_ID128_MAKE(94,f9,a9,a1,99,71,42,7a,a4,00,50,cb,29,7f,0f,35) +#define SD_GPT_USR_ARM_VERITY_SIG SD_ID128_MAKE(d7,ff,81,2f,37,d1,49,02,a8,10,d7,6b,a5,7b,97,5a) +#define SD_GPT_USR_ARM64_VERITY_SIG SD_ID128_MAKE(c2,3c,e4,ff,44,bd,4b,00,b2,d4,b4,1b,34,19,e0,2a) +#define SD_GPT_USR_IA64_VERITY_SIG SD_ID128_MAKE(8d,e5,8b,c2,2a,43,46,0d,b1,4e,a7,6e,4a,17,b4,7f) +#define SD_GPT_USR_LOONGARCH64_VERITY_SIG SD_ID128_MAKE(b0,24,f3,15,d3,30,44,4c,84,61,44,bb,de,52,4e,99) +#define SD_GPT_USR_MIPS_VERITY_SIG SD_ID128_MAKE(97,ae,15,8d,f2,16,49,7b,80,57,f7,f9,05,77,0f,54) +#define SD_GPT_USR_MIPS64_VERITY_SIG SD_ID128_MAKE(05,81,6c,e2,dd,40,4a,c6,a6,1d,37,d3,2d,c1,ba,7d) +#define SD_GPT_USR_MIPS_LE_VERITY_SIG SD_ID128_MAKE(3e,23,ca,0b,a4,bc,4b,4e,80,87,5a,b6,a2,6a,a8,a9) +#define SD_GPT_USR_MIPS64_LE_VERITY_SIG SD_ID128_MAKE(f2,c2,c7,ee,ad,cc,43,51,b5,c6,ee,98,16,b6,6e,16) +#define SD_GPT_USR_PARISC_VERITY_SIG SD_ID128_MAKE(45,0d,d7,d1,32,24,45,ec,9c,f2,a4,3a,34,6d,71,ee) +#define SD_GPT_USR_PPC64_LE_VERITY_SIG SD_ID128_MAKE(c8,bf,bd,1e,26,8e,45,21,8b,ba,bf,31,4c,39,95,57) +#define SD_GPT_USR_PPC64_VERITY_SIG SD_ID128_MAKE(0b,88,88,63,d7,f8,4d,9e,97,66,23,9f,ce,4d,58,af) +#define SD_GPT_USR_PPC_VERITY_SIG SD_ID128_MAKE(70,07,89,1d,d3,71,4a,80,86,a4,5c,b8,75,b9,30,2e) +#define SD_GPT_USR_RISCV32_VERITY_SIG SD_ID128_MAKE(c3,83,6a,13,31,37,45,ba,b5,83,b1,6c,50,fe,5e,b4) +#define SD_GPT_USR_RISCV64_VERITY_SIG 
SD_ID128_MAKE(d2,f9,00,0a,7a,18,45,3f,b5,cd,4d,32,f7,7a,7b,32) +#define SD_GPT_USR_S390_VERITY_SIG SD_ID128_MAKE(17,44,0e,4f,a8,d0,46,7f,a4,6e,39,12,ae,6e,f2,c5) +#define SD_GPT_USR_S390X_VERITY_SIG SD_ID128_MAKE(3f,32,48,16,66,7b,46,ae,86,ee,9b,0c,0c,6c,11,b4) +#define SD_GPT_USR_TILEGX_VERITY_SIG SD_ID128_MAKE(4e,de,75,e2,6c,cc,4c,c8,b9,c7,70,33,4b,08,75,10) +#define SD_GPT_USR_X86_64_VERITY_SIG SD_ID128_MAKE(e7,bb,33,fb,06,cf,4e,81,82,73,e5,43,b4,13,e2,e2) +#define SD_GPT_USR_X86_VERITY_SIG SD_ID128_MAKE(97,4a,71,c0,de,41,43,c3,be,5d,5c,5c,cd,1a,d2,c0) + +#define SD_GPT_ESP SD_ID128_MAKE(c1,2a,73,28,f8,1f,11,d2,ba,4b,00,a0,c9,3e,c9,3b) +#define SD_GPT_ESP_STR SD_ID128_MAKE_UUID_STR(c1,2a,73,28,f8,1f,11,d2,ba,4b,00,a0,c9,3e,c9,3b) +#define SD_GPT_XBOOTLDR SD_ID128_MAKE(bc,13,c2,ff,59,e6,42,62,a3,52,b2,75,fd,6f,71,72) +#define SD_GPT_XBOOTLDR_STR SD_ID128_MAKE_UUID_STR(bc,13,c2,ff,59,e6,42,62,a3,52,b2,75,fd,6f,71,72) +#define SD_GPT_SWAP SD_ID128_MAKE(06,57,fd,6d,a4,ab,43,c4,84,e5,09,33,c8,4b,4f,4f) +#define SD_GPT_SWAP_STR SD_ID128_MAKE_UUID_STR(06,57,fd,6d,a4,ab,43,c4,84,e5,09,33,c8,4b,4f,4f) +#define SD_GPT_HOME SD_ID128_MAKE(93,3a,c7,e1,2e,b4,4f,13,b8,44,0e,14,e2,ae,f9,15) +#define SD_GPT_HOME_STR SD_ID128_MAKE_UUID_STR(93,3a,c7,e1,2e,b4,4f,13,b8,44,0e,14,e2,ae,f9,15) +#define SD_GPT_SRV SD_ID128_MAKE(3b,8f,84,25,20,e0,4f,3b,90,7f,1a,25,a7,6f,98,e8) +#define SD_GPT_SRV_STR SD_ID128_MAKE_UUID_STR(3b,8f,84,25,20,e0,4f,3b,90,7f,1a,25,a7,6f,98,e8) +#define SD_GPT_VAR SD_ID128_MAKE(4d,21,b0,16,b5,34,45,c2,a9,fb,5c,16,e0,91,fd,2d) +#define SD_GPT_VAR_STR SD_ID128_MAKE_UUID_STR(4d,21,b0,16,b5,34,45,c2,a9,fb,5c,16,e0,91,fd,2d) +#define SD_GPT_TMP SD_ID128_MAKE(7e,c6,f5,57,3b,c5,4a,ca,b2,93,16,ef,5d,f6,39,d1) +#define SD_GPT_TMP_STR SD_ID128_MAKE_UUID_STR(7e,c6,f5,57,3b,c5,4a,ca,b2,93,16,ef,5d,f6,39,d1) +#define SD_GPT_USER_HOME SD_ID128_MAKE(77,3f,91,ef,66,d4,49,b5,bd,83,d6,83,bf,40,ad,16) +#define SD_GPT_USER_HOME_STR 
SD_ID128_MAKE_UUID_STR(77,3f,91,ef,66,d4,49,b5,bd,83,d6,83,bf,40,ad,16) +#define SD_GPT_LINUX_GENERIC SD_ID128_MAKE(0f,c6,3d,af,84,83,47,72,8e,79,3d,69,d8,47,7d,e4) +#define SD_GPT_LINUX_GENERIC_STR SD_ID128_MAKE_UUID_STR(0f,c6,3d,af,84,83,47,72,8e,79,3d,69,d8,47,7d,e4) + +/* Maintain same order as above */ +#if defined(__alpha__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_ALPHA +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_ALPHA_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_ALPHA_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_ALPHA +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_ALPHA_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_ALPHA_VERITY_SIG + +#elif defined(__arc__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_ARC +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_ARC_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_ARC_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_ARC +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_ARC_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_ARC_VERITY_SIG + +#elif defined(__aarch64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_ARM64 +# define SD_GPT_ROOT_SECONDARY SD_GPT_ROOT_ARM +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_ARM64_VERITY +# define SD_GPT_ROOT_SECONDARY_VERITY SD_GPT_ROOT_ARM_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_ARM64_VERITY_SIG +# define SD_GPT_ROOT_SECONDARY_VERITY_SIG SD_GPT_ROOT_ARM_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_ARM64 +# define SD_GPT_USR_SECONDARY SD_GPT_USR_ARM +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_ARM64_VERITY +# define SD_GPT_USR_SECONDARY_VERITY SD_GPT_USR_ARM_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_ARM64_VERITY_SIG +# define SD_GPT_USR_SECONDARY_VERITY_SIG SD_GPT_USR_ARM_VERITY_SIG +#elif defined(__arm__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_ARM +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_ARM_VERITY +# 
define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_ARM_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_ARM +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_ARM_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_ARM_VERITY_SIG + +#elif defined(__ia64__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_IA64 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_IA64_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_IA64_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_IA64 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_IA64_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_IA64_VERITY_SIG + +#elif defined(__loongarch_lp64) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_LOONGARCH64 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_LOONGARCH64_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_LOONGARCH64_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_LOONGARCH64 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_LOONGARCH64_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_LOONGARCH64_VERITY_SIG + +#elif defined(__mips__) && !defined(__mips64) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_MIPS +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_MIPS_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_MIPS_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_MIPS +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_MIPS_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_MIPS_VERITY_SIG +#elif defined(__mips64) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_MIPS64 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_MIPS64_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_MIPS64_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_MIPS64 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_MIPS64_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_MIPS64_VERITY_SIG + +#elif defined(__mips__) && !defined(__mips64) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SD_GPT_ROOT_NATIVE 
SD_GPT_ROOT_MIPS_LE +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_MIPS_LE_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_MIPS_LE_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_MIPS_LE +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_MIPS_LE_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_MIPS_LE_VERITY_SIG +#elif defined(__mips64) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_MIPS64_LE +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_MIPS64_LE_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_MIPS64_LE_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_MIPS64_LE +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_MIPS64_LE_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_MIPS64_LE_VERITY_SIG + +#elif defined(__parisc__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_PARISC +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_PARISC_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_PARISC_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_PARISC +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_PARISC_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_PARISC_VERITY_SIG + +#elif defined(__powerpc__) && defined(__PPC64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_PPC64_LE +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_PPC64_LE_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_PPC64_LE_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_PPC64_LE +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_PPC64_LE_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_PPC64_LE_VERITY_SIG +#elif defined(__powerpc__) && defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_PPC64 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_PPC64_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_PPC64_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_PPC64 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_PPC64_VERITY +# 
define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_PPC64_VERITY_SIG +#elif defined(__powerpc__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_PPC +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_PPC_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_PPC_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_PPC +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_PPC_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_PPC_VERITY_SIG + +#elif defined(__riscv) && __riscv_xlen == 32 +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_RISCV32 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_RISCV32_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_RISCV32_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_RISCV32 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_RISCV32_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_RISCV32_VERITY_SIG +#elif defined(__riscv) && __riscv_xlen == 64 +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_RISCV64 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_RISCV64_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_RISCV64_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_RISCV64 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_RISCV64_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_RISCV64_VERITY_SIG + +#elif defined(__s390__) && !defined(__s390x__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_S390 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_S390_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_S390_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_S390 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_S390_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_S390_VERITY_SIG + +#elif defined(__s390x__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_S390X +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_S390X_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_S390X_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_S390X +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_S390X_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG 
SD_GPT_USR_S390X_VERITY_SIG + +#elif defined(__tilegx__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_TILEGX +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_TILEGX_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_TILEGX_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_TILEGX +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_TILEGX_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_TILEGX_VERITY_SIG + +#elif defined(__x86_64__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_X86_64 +# define SD_GPT_ROOT_SECONDARY SD_GPT_ROOT_X86 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_X86_64_VERITY +# define SD_GPT_ROOT_SECONDARY_VERITY SD_GPT_ROOT_X86_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_X86_64_VERITY_SIG +# define SD_GPT_ROOT_SECONDARY_VERITY_SIG SD_GPT_ROOT_X86_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_X86_64 +# define SD_GPT_USR_SECONDARY SD_GPT_USR_X86 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_X86_64_VERITY +# define SD_GPT_USR_SECONDARY_VERITY SD_GPT_USR_X86_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_X86_64_VERITY_SIG +# define SD_GPT_USR_SECONDARY_VERITY_SIG SD_GPT_USR_X86_VERITY_SIG +#elif defined(__i386__) +# define SD_GPT_ROOT_NATIVE SD_GPT_ROOT_X86 +# define SD_GPT_ROOT_NATIVE_VERITY SD_GPT_ROOT_X86_VERITY +# define SD_GPT_ROOT_NATIVE_VERITY_SIG SD_GPT_ROOT_X86_VERITY_SIG +# define SD_GPT_USR_NATIVE SD_GPT_USR_X86 +# define SD_GPT_USR_NATIVE_VERITY SD_GPT_USR_X86_VERITY +# define SD_GPT_USR_NATIVE_VERITY_SIG SD_GPT_USR_X86_VERITY_SIG +#endif + +/* Partition attributes defined by the UEFI specification. */ +#define SD_GPT_FLAG_REQUIRED_PARTITION (UINT64_C(1) << 0) +#define SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL (UINT64_C(1) << 1) +#define SD_GPT_FLAG_LEGACY_BIOS_BOOTABLE (UINT64_C(1) << 2) + +/* Flags we recognize on the root, usr, xbootldr, swap, home, srv, var, tmp partitions when doing + * auto-discovery. 
+ * + * The first two happen to be identical to what Microsoft defines for its own Basic Data Partitions + * in "winioctl.h": GPT_BASIC_DATA_ATTRIBUTE_READ_ONLY, GPT_BASIC_DATA_ATTRIBUTE_NO_DRIVE_LETTER. + */ +#define SD_GPT_FLAG_READ_ONLY (UINT64_C(1) << 60) +#define SD_GPT_FLAG_NO_AUTO (UINT64_C(1) << 63) +#define SD_GPT_FLAG_GROWFS (UINT64_C(1) << 59) + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-hwdb.h b/src/systemd/sd-hwdb.h new file mode 100644 index 0000000..ff880f1 --- /dev/null +++ b/src/systemd/sd-hwdb.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdhwdbhfoo +#define foosdhwdbhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>.
+***/ + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_hwdb sd_hwdb; /* opaque handle: only this forward declaration is exposed */ + +sd_hwdb *sd_hwdb_ref(sd_hwdb *hwdb); +sd_hwdb *sd_hwdb_unref(sd_hwdb *hwdb); + +int sd_hwdb_new(sd_hwdb **ret); +int sd_hwdb_new_from_path(const char *path, sd_hwdb **ret); + +int sd_hwdb_get(sd_hwdb *hwdb, const char *modalias, const char *key, const char **value); + +int sd_hwdb_seek(sd_hwdb *hwdb, const char *modalias); +int sd_hwdb_enumerate(sd_hwdb *hwdb, const char **key, const char **value); + +/* the inverse condition avoids ambiguity of dangling 'else' after the macro; the loop body is skipped entirely when sd_hwdb_seek() returns < 0, iteration continues while sd_hwdb_enumerate() returns > 0, and key/value are passed by address so they must be lvalues */ +#define SD_HWDB_FOREACH_PROPERTY(hwdb, modalias, key, value) \ + if (sd_hwdb_seek(hwdb, modalias) < 0) { } \ + else while (sd_hwdb_enumerate(hwdb, &(key), &(value)) > 0) + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_hwdb, sd_hwdb_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-id128.h b/src/systemd/sd-id128.h new file mode 100644 index 0000000..a984a9d --- /dev/null +++ b/src/systemd/sd-id128.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdid128hfoo +#define foosdid128hfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include <inttypes.h> /* uint8_t, uint64_t, UINT64_C */ +#include <stdarg.h> /* va_list, va_start, va_arg, va_end */ +#include <string.h> /* NOTE(review): third header name reconstructed after its operand was stripped; confirm against upstream */ + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* 128-bit ID APIs. See sd-id128(3) for more information.
+ */ + +typedef union sd_id128 sd_id128_t; + +union sd_id128 { + uint8_t bytes[16]; /* the 128 bits viewed byte by byte */ + uint64_t qwords[2]; /* the same storage viewed as two 64-bit words */ +}; + +#define SD_ID128_STRING_MAX 33U /* 32 hex digits plus the terminating NUL */ +#define SD_ID128_UUID_STRING_MAX 37U /* 32 hex digits, 4 dashes and the terminating NUL */ + +char *sd_id128_to_string(sd_id128_t id, char s[_SD_ARRAY_STATIC SD_ID128_STRING_MAX]); +char *sd_id128_to_uuid_string(sd_id128_t id, char s[_SD_ARRAY_STATIC SD_ID128_UUID_STRING_MAX]); +int sd_id128_from_string(const char *s, sd_id128_t *ret); + +#define SD_ID128_TO_STRING(id) sd_id128_to_string((id), (char[SD_ID128_STRING_MAX]) {}) /* formats into an unnamed compound-literal buffer, so the result is only usable inside the enclosing block */ +#define SD_ID128_TO_UUID_STRING(id) sd_id128_to_uuid_string((id), (char[SD_ID128_UUID_STRING_MAX]) {}) /* same, with the larger UUID-sized buffer */ + +int sd_id128_randomize(sd_id128_t *ret); + +int sd_id128_get_machine(sd_id128_t *ret); +int sd_id128_get_boot(sd_id128_t *ret); +int sd_id128_get_invocation(sd_id128_t *ret); + +int sd_id128_get_app_specific(sd_id128_t base, sd_id128_t app_id, sd_id128_t *ret); +int sd_id128_get_machine_app_specific(sd_id128_t app_id, sd_id128_t *ret); +int sd_id128_get_boot_app_specific(sd_id128_t app_id, sd_id128_t *ret); + +#define SD_ID128_ARRAY(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) /* braced initializer; 0x is token-pasted onto each argument, so pass bare hex digits without a prefix */ \ + { .bytes = { 0x##v0, 0x##v1, 0x##v2, 0x##v3, 0x##v4, 0x##v5, 0x##v6, 0x##v7, \ + 0x##v8, 0x##v9, 0x##v10, 0x##v11, 0x##v12, 0x##v13, 0x##v14, 0x##v15 }} + +#define SD_ID128_MAKE(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) /* same bytes wrapped in a const sd_id128_t compound literal, usable as an expression */ \ + ((const sd_id128_t) SD_ID128_ARRAY(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) + +/* Note that SD_ID128_FORMAT_VAL will evaluate the passed argument 16 + * times.
It is hence not a good idea to call this macro with an + * expensive function as parameter or an expression with side + * effects */ + +#define SD_ID128_FORMAT_STR "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x" +#define SD_ID128_FORMAT_VAL(x) (x).bytes[0], (x).bytes[1], (x).bytes[2], (x).bytes[3], (x).bytes[4], (x).bytes[5], (x).bytes[6], (x).bytes[7], (x).bytes[8], (x).bytes[9], (x).bytes[10], (x).bytes[11], (x).bytes[12], (x).bytes[13], (x).bytes[14], (x).bytes[15] + +/* Like SD_ID128_FORMAT_STR, but formats as UUID, not in plain format (Strictly Big Endian byte order, + * i.e. treats everything as RFC4122 Variant 1 UUIDs, even if variant says otherwise, but matching other + * Linux userspace behaviour.) */ +#define SD_ID128_UUID_FORMAT_STR "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x" + +#define SD_ID128_CONST_STR(x) \ + ((const char[SD_ID128_STRING_MAX]) { \ + ((x).bytes[0] >> 4) >= 10 ? 'a' + ((x).bytes[0] >> 4) - 10 : '0' + ((x).bytes[0] >> 4), \ + ((x).bytes[0] & 15) >= 10 ? 'a' + ((x).bytes[0] & 15) - 10 : '0' + ((x).bytes[0] & 15), \ + ((x).bytes[1] >> 4) >= 10 ? 'a' + ((x).bytes[1] >> 4) - 10 : '0' + ((x).bytes[1] >> 4), \ + ((x).bytes[1] & 15) >= 10 ? 'a' + ((x).bytes[1] & 15) - 10 : '0' + ((x).bytes[1] & 15), \ + ((x).bytes[2] >> 4) >= 10 ? 'a' + ((x).bytes[2] >> 4) - 10 : '0' + ((x).bytes[2] >> 4), \ + ((x).bytes[2] & 15) >= 10 ? 'a' + ((x).bytes[2] & 15) - 10 : '0' + ((x).bytes[2] & 15), \ + ((x).bytes[3] >> 4) >= 10 ? 'a' + ((x).bytes[3] >> 4) - 10 : '0' + ((x).bytes[3] >> 4), \ + ((x).bytes[3] & 15) >= 10 ? 'a' + ((x).bytes[3] & 15) - 10 : '0' + ((x).bytes[3] & 15), \ + ((x).bytes[4] >> 4) >= 10 ? 'a' + ((x).bytes[4] >> 4) - 10 : '0' + ((x).bytes[4] >> 4), \ + ((x).bytes[4] & 15) >= 10 ? 'a' + ((x).bytes[4] & 15) - 10 : '0' + ((x).bytes[4] & 15), \ + ((x).bytes[5] >> 4) >= 10 ? 'a' + ((x).bytes[5] >> 4) - 10 : '0' + ((x).bytes[5] >> 4), \ + ((x).bytes[5] & 15) >= 10 ? 
'a' + ((x).bytes[5] & 15) - 10 : '0' + ((x).bytes[5] & 15), \ + ((x).bytes[6] >> 4) >= 10 ? 'a' + ((x).bytes[6] >> 4) - 10 : '0' + ((x).bytes[6] >> 4), \ + ((x).bytes[6] & 15) >= 10 ? 'a' + ((x).bytes[6] & 15) - 10 : '0' + ((x).bytes[6] & 15), \ + ((x).bytes[7] >> 4) >= 10 ? 'a' + ((x).bytes[7] >> 4) - 10 : '0' + ((x).bytes[7] >> 4), \ + ((x).bytes[7] & 15) >= 10 ? 'a' + ((x).bytes[7] & 15) - 10 : '0' + ((x).bytes[7] & 15), \ + ((x).bytes[8] >> 4) >= 10 ? 'a' + ((x).bytes[8] >> 4) - 10 : '0' + ((x).bytes[8] >> 4), \ + ((x).bytes[8] & 15) >= 10 ? 'a' + ((x).bytes[8] & 15) - 10 : '0' + ((x).bytes[8] & 15), \ + ((x).bytes[9] >> 4) >= 10 ? 'a' + ((x).bytes[9] >> 4) - 10 : '0' + ((x).bytes[9] >> 4), \ + ((x).bytes[9] & 15) >= 10 ? 'a' + ((x).bytes[9] & 15) - 10 : '0' + ((x).bytes[9] & 15), \ + ((x).bytes[10] >> 4) >= 10 ? 'a' + ((x).bytes[10] >> 4) - 10 : '0' + ((x).bytes[10] >> 4), \ + ((x).bytes[10] & 15) >= 10 ? 'a' + ((x).bytes[10] & 15) - 10 : '0' + ((x).bytes[10] & 15), \ + ((x).bytes[11] >> 4) >= 10 ? 'a' + ((x).bytes[11] >> 4) - 10 : '0' + ((x).bytes[11] >> 4), \ + ((x).bytes[11] & 15) >= 10 ? 'a' + ((x).bytes[11] & 15) - 10 : '0' + ((x).bytes[11] & 15), \ + ((x).bytes[12] >> 4) >= 10 ? 'a' + ((x).bytes[12] >> 4) - 10 : '0' + ((x).bytes[12] >> 4), \ + ((x).bytes[12] & 15) >= 10 ? 'a' + ((x).bytes[12] & 15) - 10 : '0' + ((x).bytes[12] & 15), \ + ((x).bytes[13] >> 4) >= 10 ? 'a' + ((x).bytes[13] >> 4) - 10 : '0' + ((x).bytes[13] >> 4), \ + ((x).bytes[13] & 15) >= 10 ? 'a' + ((x).bytes[13] & 15) - 10 : '0' + ((x).bytes[13] & 15), \ + ((x).bytes[14] >> 4) >= 10 ? 'a' + ((x).bytes[14] >> 4) - 10 : '0' + ((x).bytes[14] >> 4), \ + ((x).bytes[14] & 15) >= 10 ? 'a' + ((x).bytes[14] & 15) - 10 : '0' + ((x).bytes[14] & 15), \ + ((x).bytes[15] >> 4) >= 10 ? 'a' + ((x).bytes[15] >> 4) - 10 : '0' + ((x).bytes[15] >> 4), \ + ((x).bytes[15] & 15) >= 10 ? 
'a' + ((x).bytes[15] & 15) - 10 : '0' + ((x).bytes[15] & 15), \ + 0 }) + +#define SD_ID128_MAKE_STR(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) /* stringizes the 16 tokens; adjacent literals then concatenate into one string */ \ + #a #b #c #d #e #f #g #h #i #j #k #l #m #n #o #p + +#define SD_ID128_MAKE_UUID_STR(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) /* same, with dashes after the 4th, 6th, 8th and 10th byte */ \ + #a #b #c #d "-" #e #f "-" #g #h "-" #i #j "-" #k #l #m #n #o #p + +_sd_pure_ static __inline__ int sd_id128_equal(sd_id128_t a, sd_id128_t b) { /* returns 1 when both 64-bit halves match, 0 otherwise */ + return a.qwords[0] == b.qwords[0] && a.qwords[1] == b.qwords[1]; +} + +int sd_id128_string_equal(const char *s, sd_id128_t id); + +_sd_pure_ static __inline__ int sd_id128_is_null(sd_id128_t a) { /* returns 1 when every bit is zero */ + return a.qwords[0] == 0 && a.qwords[1] == 0; +} + +_sd_pure_ static __inline__ int sd_id128_is_allf(sd_id128_t a) { /* returns 1 when every bit is one */ + return a.qwords[0] == UINT64_C(0xFFFFFFFFFFFFFFFF) && a.qwords[1] == UINT64_C(0xFFFFFFFFFFFFFFFF); +} + +#define SD_ID128_NULL ((const sd_id128_t) { .qwords = { 0, 0 }}) +#define SD_ID128_ALLF ((const sd_id128_t) { .qwords = { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF) }}) + +_sd_pure_ static __inline__ int sd_id128_in_setv(sd_id128_t a, va_list ap) { /* returns 1 if a equals one of the sd_id128_t values read from ap, 0 otherwise */ + for (;;) { /* leaves only on a match or on an all-zero entry, so the list must end with a null ID */ + sd_id128_t b = va_arg(ap, sd_id128_t); + + if (sd_id128_is_null(b)) /* checked before the comparison, so a null ID is never reported as a member */ + return 0; + + if (sd_id128_equal(a, b)) + return 1; + } +} + +_sd_pure_ static __inline__ int sd_id128_in_set_sentinel(sd_id128_t a, ...) { /* variadic front end: the caller supplies the terminating null ID itself */ + va_list ap; + int r; + + va_start(ap, a); + r = sd_id128_in_setv(a, ap); + va_end(ap); + + return r; +} + +#define sd_id128_in_set(a, ...) /* appends SD_ID128_NULL as the terminator so callers list only the candidates */ \ + sd_id128_in_set_sentinel(a, ##__VA_ARGS__, SD_ID128_NULL) + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-ipv4acd.h b/src/systemd/sd-ipv4acd.h new file mode 100644 index 0000000..6be5770 --- /dev/null +++ b/src/systemd/sd-ipv4acd.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdipv4acdfoo +#define foosdipv4acdfoo + +/*** + Copyright © 2014 Axis Communications AB. All rights reserved.
+ systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include <net/ethernet.h> /* struct ether_addr */ +#include <netinet/in.h> /* struct in_addr */ +#include <stdbool.h> /* bool, used by sd_ipv4acd_start() */ + +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +enum { /* NOTE(review): presumably the values delivered as the int event argument of sd_ipv4acd_callback_t; confirm in the implementation */ + SD_IPV4ACD_EVENT_STOP = 0, + SD_IPV4ACD_EVENT_BIND = 1, + SD_IPV4ACD_EVENT_CONFLICT = 2 +}; + +typedef struct sd_ipv4acd sd_ipv4acd; /* opaque handle: only this forward declaration is exposed */ +typedef void (*sd_ipv4acd_callback_t)(sd_ipv4acd *acd, int event, void *userdata); +typedef int (*sd_ipv4acd_check_mac_callback_t)(sd_ipv4acd *acd, const struct ether_addr *mac, void *userdata); + +int sd_ipv4acd_detach_event(sd_ipv4acd *acd); +int sd_ipv4acd_attach_event(sd_ipv4acd *acd, sd_event *event, int64_t priority); +int sd_ipv4acd_get_address(sd_ipv4acd *acd, struct in_addr *address); +int sd_ipv4acd_set_callback(sd_ipv4acd *acd, sd_ipv4acd_callback_t cb, void *userdata); +int sd_ipv4acd_set_check_mac_callback(sd_ipv4acd *acd, sd_ipv4acd_check_mac_callback_t cb, void *userdata); +int sd_ipv4acd_set_mac(sd_ipv4acd *acd, const struct ether_addr *addr); +int sd_ipv4acd_set_ifindex(sd_ipv4acd *acd, int interface_index); +int sd_ipv4acd_get_ifindex(sd_ipv4acd *acd); +int sd_ipv4acd_set_ifname(sd_ipv4acd *acd, const char *interface_name); +int sd_ipv4acd_get_ifname(sd_ipv4acd *acd, const char **ret); +int sd_ipv4acd_set_address(sd_ipv4acd *acd, const struct in_addr *address); +int sd_ipv4acd_is_running(sd_ipv4acd *acd); +int sd_ipv4acd_is_bound(sd_ipv4acd *acd); +__extension__ int
sd_ipv4acd_start(sd_ipv4acd *acd, bool reset_conflicts); +int sd_ipv4acd_stop(sd_ipv4acd *acd); +sd_ipv4acd *sd_ipv4acd_ref(sd_ipv4acd *acd); +sd_ipv4acd *sd_ipv4acd_unref(sd_ipv4acd *acd); +int sd_ipv4acd_new(sd_ipv4acd **ret); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_ipv4acd, sd_ipv4acd_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-ipv4ll.h b/src/systemd/sd-ipv4ll.h new file mode 100644 index 0000000..35e4679 --- /dev/null +++ b/src/systemd/sd-ipv4ll.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdipv4llfoo +#define foosdipv4llfoo + +/*** + Copyright © 2014 Axis Communications AB. All rights reserved. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include + +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +enum { + SD_IPV4LL_EVENT_STOP = 0, + SD_IPV4LL_EVENT_BIND = 1, + SD_IPV4LL_EVENT_CONFLICT = 2 +}; + +typedef struct sd_ipv4ll sd_ipv4ll; +typedef void (*sd_ipv4ll_callback_t)(sd_ipv4ll *ll, int event, void *userdata); +typedef int (*sd_ipv4ll_check_mac_callback_t)(sd_ipv4ll *ll, const struct ether_addr *mac, void *userdata); + +int sd_ipv4ll_detach_event(sd_ipv4ll *ll); +int sd_ipv4ll_attach_event(sd_ipv4ll *ll, sd_event *event, int64_t priority); +int sd_ipv4ll_get_address(sd_ipv4ll *ll, struct in_addr *address); +int sd_ipv4ll_set_callback(sd_ipv4ll *ll, sd_ipv4ll_callback_t cb, void *userdata); +int sd_ipv4ll_set_check_mac_callback(sd_ipv4ll *ll, sd_ipv4ll_check_mac_callback_t cb, void *userdata); +int sd_ipv4ll_set_mac(sd_ipv4ll *ll, const struct ether_addr *addr); +int sd_ipv4ll_set_ifindex(sd_ipv4ll *ll, int interface_index); +int sd_ipv4ll_get_ifindex(sd_ipv4ll *ll); +int sd_ipv4ll_set_ifname(sd_ipv4ll *ll, const char *interface_name); +int sd_ipv4ll_get_ifname(sd_ipv4ll *ll, const char **ret); +int sd_ipv4ll_set_address(sd_ipv4ll *ll, const struct in_addr *address); +int sd_ipv4ll_set_address_seed(sd_ipv4ll *ll, uint64_t seed); +int sd_ipv4ll_is_running(sd_ipv4ll *ll); +int sd_ipv4ll_restart(sd_ipv4ll *ll); +int sd_ipv4ll_start(sd_ipv4ll *ll); +int sd_ipv4ll_stop(sd_ipv4ll *ll); +sd_ipv4ll *sd_ipv4ll_ref(sd_ipv4ll *ll); +sd_ipv4ll *sd_ipv4ll_unref(sd_ipv4ll *ll); +int sd_ipv4ll_new(sd_ipv4ll **ret); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_ipv4ll, sd_ipv4ll_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-journal.h b/src/systemd/sd-journal.h new file mode 100644 index 0000000..7d2d75d --- /dev/null +++ b/src/systemd/sd-journal.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdjournalhfoo +#define foosdjournalhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under 
the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "_sd-common.h" + +/* Journal APIs. See sd-journal(3) for more information. */ + +_SD_BEGIN_DECLARATIONS; + +/* Write to daemon */ +int sd_journal_print(int priority, const char *format, ...) _sd_printf_(2, 3); +int sd_journal_printv(int priority, const char *format, va_list ap) _sd_printf_(2, 0); +int sd_journal_send(const char *format, ...) _sd_printf_(1, 0) _sd_sentinel_; +int sd_journal_sendv(const struct iovec *iov, int n); +int sd_journal_perror(const char *message); + +/* Used by the macros below. You probably don't want to call this directly. */ +int sd_journal_print_with_location(int priority, const char *file, const char *line, const char *func, const char *format, ...) _sd_printf_(5, 6); +int sd_journal_printv_with_location(int priority, const char *file, const char *line, const char *func, const char *format, va_list ap) _sd_printf_(5, 0); +int sd_journal_send_with_location(const char *file, const char *line, const char *func, const char *format, ...) 
_sd_printf_(4, 0) _sd_sentinel_; +int sd_journal_sendv_with_location(const char *file, const char *line, const char *func, const struct iovec *iov, int n); +int sd_journal_perror_with_location(const char *file, const char *line, const char *func, const char *message); + +/* implicitly add code location to messages sent, if this is enabled */ +#ifndef SD_JOURNAL_SUPPRESS_LOCATION + +#define sd_journal_print(priority, ...) sd_journal_print_with_location(priority, "CODE_FILE=" __FILE__, "CODE_LINE=" _SD_STRINGIFY(__LINE__), __func__, __VA_ARGS__) +#define sd_journal_printv(priority, format, ap) sd_journal_printv_with_location(priority, "CODE_FILE=" __FILE__, "CODE_LINE=" _SD_STRINGIFY(__LINE__), __func__, format, ap) +#define sd_journal_send(...) sd_journal_send_with_location("CODE_FILE=" __FILE__, "CODE_LINE=" _SD_STRINGIFY(__LINE__), __func__, __VA_ARGS__) +#define sd_journal_sendv(iovec, n) sd_journal_sendv_with_location("CODE_FILE=" __FILE__, "CODE_LINE=" _SD_STRINGIFY(__LINE__), __func__, iovec, n) +#define sd_journal_perror(message) sd_journal_perror_with_location("CODE_FILE=" __FILE__, "CODE_LINE=" _SD_STRINGIFY(__LINE__), __func__, message) + +#endif + +int sd_journal_stream_fd(const char *identifier, int priority, int level_prefix); + +/* Browse journal stream */ + +typedef struct sd_journal sd_journal; + +/* Open flags */ +enum { + SD_JOURNAL_LOCAL_ONLY = 1 << 0, + SD_JOURNAL_RUNTIME_ONLY = 1 << 1, + SD_JOURNAL_SYSTEM = 1 << 2, + SD_JOURNAL_CURRENT_USER = 1 << 3, + SD_JOURNAL_OS_ROOT = 1 << 4, + SD_JOURNAL_ALL_NAMESPACES = 1 << 5, /* Show all namespaces, not just the default or specified one */ + SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE = 1 << 6, /* Show default namespace in addition to specified one */ + SD_JOURNAL_TAKE_DIRECTORY_FD = 1 << 7, /* sd_journal_open_directory_fd() will take ownership of the provided file descriptor. 
*/ + + SD_JOURNAL_SYSTEM_ONLY _sd_deprecated_ = SD_JOURNAL_SYSTEM /* old name */ +}; + +/* Wakeup event types */ +enum { + SD_JOURNAL_NOP, + SD_JOURNAL_APPEND, + SD_JOURNAL_INVALIDATE +}; + +int sd_journal_open(sd_journal **ret, int flags); +int sd_journal_open_namespace(sd_journal **ret, const char *name_space, int flags); +int sd_journal_open_directory(sd_journal **ret, const char *path, int flags); +int sd_journal_open_directory_fd(sd_journal **ret, int fd, int flags); +int sd_journal_open_files(sd_journal **ret, const char **paths, int flags); +int sd_journal_open_files_fd(sd_journal **ret, int fds[], unsigned n_fds, int flags); +int sd_journal_open_container(sd_journal **ret, const char *machine, int flags) _sd_deprecated_; +void sd_journal_close(sd_journal *j); + +int sd_journal_previous(sd_journal *j); +int sd_journal_next(sd_journal *j); +int sd_journal_step_one(sd_journal *j, int advanced); + +int sd_journal_previous_skip(sd_journal *j, uint64_t skip); +int sd_journal_next_skip(sd_journal *j, uint64_t skip); + +int sd_journal_get_realtime_usec(sd_journal *j, uint64_t *ret); +int sd_journal_get_monotonic_usec(sd_journal *j, uint64_t *ret, sd_id128_t *ret_boot_id); +int sd_journal_get_seqnum(sd_journal *j, uint64_t *ret_seqnum, sd_id128_t *ret_seqnum_id); + +int sd_journal_set_data_threshold(sd_journal *j, size_t sz); +int sd_journal_get_data_threshold(sd_journal *j, size_t *sz); + +int sd_journal_get_data(sd_journal *j, const char *field, const void **data, size_t *l); +int sd_journal_enumerate_data(sd_journal *j, const void **data, size_t *l); +int sd_journal_enumerate_available_data(sd_journal *j, const void **data, size_t *l); +void sd_journal_restart_data(sd_journal *j); + +int sd_journal_add_match(sd_journal *j, const void *data, size_t size); +int sd_journal_add_disjunction(sd_journal *j); +int sd_journal_add_conjunction(sd_journal *j); +void sd_journal_flush_matches(sd_journal *j); + +int sd_journal_seek_head(sd_journal *j); +int 
sd_journal_seek_tail(sd_journal *j); +int sd_journal_seek_monotonic_usec(sd_journal *j, sd_id128_t boot_id, uint64_t usec); +int sd_journal_seek_realtime_usec(sd_journal *j, uint64_t usec); +int sd_journal_seek_cursor(sd_journal *j, const char *cursor); + +int sd_journal_get_cursor(sd_journal *j, char **cursor); +int sd_journal_test_cursor(sd_journal *j, const char *cursor); + +int sd_journal_get_cutoff_realtime_usec(sd_journal *j, uint64_t *from, uint64_t *to); +int sd_journal_get_cutoff_monotonic_usec(sd_journal *j, const sd_id128_t boot_id, uint64_t *from, uint64_t *to); + +int sd_journal_get_usage(sd_journal *j, uint64_t *bytes); + +int sd_journal_query_unique(sd_journal *j, const char *field); +int sd_journal_enumerate_unique(sd_journal *j, const void **data, size_t *l); +int sd_journal_enumerate_available_unique(sd_journal *j, const void **data, size_t *l); +void sd_journal_restart_unique(sd_journal *j); + +int sd_journal_enumerate_fields(sd_journal *j, const char **field); +void sd_journal_restart_fields(sd_journal *j); + +int sd_journal_get_fd(sd_journal *j); +int sd_journal_get_events(sd_journal *j); +int sd_journal_get_timeout(sd_journal *j, uint64_t *timeout_usec); +int sd_journal_process(sd_journal *j); +int sd_journal_wait(sd_journal *j, uint64_t timeout_usec); +int sd_journal_reliable_fd(sd_journal *j); + +int sd_journal_get_catalog(sd_journal *j, char **text); +int sd_journal_get_catalog_for_message_id(sd_id128_t id, char **text); + +int sd_journal_has_runtime_files(sd_journal *j); +int sd_journal_has_persistent_files(sd_journal *j); + +/* The inverse condition avoids ambiguity of dangling 'else' after the macro */ +#define SD_JOURNAL_FOREACH(j) \ + if (sd_journal_seek_head(j) < 0) { } \ + else while (sd_journal_next(j) > 0) + +/* The inverse condition avoids ambiguity of dangling 'else' after the macro */ +#define SD_JOURNAL_FOREACH_BACKWARDS(j) \ + if (sd_journal_seek_tail(j) < 0) { } \ + else while (sd_journal_previous(j) > 0) + +/* Iterate 
through all available data fields of the current journal entry */ +#define SD_JOURNAL_FOREACH_DATA(j, data, l) \ + for (sd_journal_restart_data(j); sd_journal_enumerate_available_data((j), &(data), &(l)) > 0; ) + +/* Iterate through all available values of a specific field */ +#define SD_JOURNAL_FOREACH_UNIQUE(j, data, l) \ + for (sd_journal_restart_unique(j); sd_journal_enumerate_available_unique((j), &(data), &(l)) > 0; ) + +/* Iterate through all known field names */ +#define SD_JOURNAL_FOREACH_FIELD(j, field) \ + for (sd_journal_restart_fields(j); sd_journal_enumerate_fields((j), &(field)) > 0; ) + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_journal, sd_journal_close); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-lldp-rx.h b/src/systemd/sd-lldp-rx.h new file mode 100644 index 0000000..504d7f5 --- /dev/null +++ b/src/systemd/sd-lldp-rx.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdlldprxhfoo +#define foosdlldprxhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include +#include +#include + +#include "sd-event.h" +#include "sd-lldp.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_lldp_rx sd_lldp_rx; +typedef struct sd_lldp_neighbor sd_lldp_neighbor; + +__extension__ typedef enum sd_lldp_rx_event_t { + SD_LLDP_RX_EVENT_ADDED, + SD_LLDP_RX_EVENT_REMOVED, + SD_LLDP_RX_EVENT_UPDATED, + SD_LLDP_RX_EVENT_REFRESHED, + _SD_LLDP_RX_EVENT_MAX, + _SD_LLDP_RX_EVENT_INVALID = -EINVAL, + _SD_ENUM_FORCE_S64(LLDP_RX_EVENT) +} sd_lldp_rx_event_t; + +typedef void (*sd_lldp_rx_callback_t)(sd_lldp_rx *lldp_rx, sd_lldp_rx_event_t event, sd_lldp_neighbor *n, void *userdata); + +int sd_lldp_rx_new(sd_lldp_rx **ret); +sd_lldp_rx *sd_lldp_rx_ref(sd_lldp_rx *lldp_rx); +sd_lldp_rx *sd_lldp_rx_unref(sd_lldp_rx *lldp_rx); + +int sd_lldp_rx_start(sd_lldp_rx *lldp_rx); +int sd_lldp_rx_stop(sd_lldp_rx *lldp_rx); +int sd_lldp_rx_is_running(sd_lldp_rx *lldp_rx); + +int sd_lldp_rx_attach_event(sd_lldp_rx *lldp_rx, sd_event *event, int64_t priority); +int sd_lldp_rx_detach_event(sd_lldp_rx *lldp_rx); +sd_event *sd_lldp_rx_get_event(sd_lldp_rx *lldp_rx); + +int sd_lldp_rx_set_callback(sd_lldp_rx *lldp_rx, sd_lldp_rx_callback_t cb, void *userdata); +int sd_lldp_rx_set_ifindex(sd_lldp_rx *lldp_rx, int ifindex); +int sd_lldp_rx_set_ifname(sd_lldp_rx *lldp_rx, const char *ifname); +int sd_lldp_rx_get_ifname(sd_lldp_rx *lldp_rx, const char **ret); + +/* Controls how much and what to store in the neighbors database */ +int sd_lldp_rx_set_neighbors_max(sd_lldp_rx *lldp_rx, uint64_t n); +int sd_lldp_rx_match_capabilities(sd_lldp_rx *lldp_rx, uint16_t mask); +int sd_lldp_rx_set_filter_address(sd_lldp_rx *lldp_rx, const struct ether_addr *address); + +int sd_lldp_rx_get_neighbors(sd_lldp_rx *lldp_rx, sd_lldp_neighbor ***neighbors); + +int sd_lldp_neighbor_from_raw(sd_lldp_neighbor **ret, const void *raw, size_t raw_size); +sd_lldp_neighbor *sd_lldp_neighbor_ref(sd_lldp_neighbor *n); +sd_lldp_neighbor 
*sd_lldp_neighbor_unref(sd_lldp_neighbor *n); + +/* Access to LLDP frame metadata */ +int sd_lldp_neighbor_get_source_address(sd_lldp_neighbor *n, struct ether_addr* address); +int sd_lldp_neighbor_get_destination_address(sd_lldp_neighbor *n, struct ether_addr* address); +int sd_lldp_neighbor_get_timestamp(sd_lldp_neighbor *n, clockid_t clock, uint64_t *ret); +int sd_lldp_neighbor_get_raw(sd_lldp_neighbor *n, const void **ret, size_t *size); + +/* High-level, direct, parsed out field access. These fields exist at most once, hence may be queried directly. */ +int sd_lldp_neighbor_get_chassis_id(sd_lldp_neighbor *n, uint8_t *type, const void **ret, size_t *size); +int sd_lldp_neighbor_get_chassis_id_as_string(sd_lldp_neighbor *n, const char **ret); +int sd_lldp_neighbor_get_port_id(sd_lldp_neighbor *n, uint8_t *type, const void **ret, size_t *size); +int sd_lldp_neighbor_get_port_id_as_string(sd_lldp_neighbor *n, const char **ret); +int sd_lldp_neighbor_get_ttl(sd_lldp_neighbor *n, uint16_t *ret_sec); +int sd_lldp_neighbor_get_system_name(sd_lldp_neighbor *n, const char **ret); +int sd_lldp_neighbor_get_system_description(sd_lldp_neighbor *n, const char **ret); +int sd_lldp_neighbor_get_port_description(sd_lldp_neighbor *n, const char **ret); +int sd_lldp_neighbor_get_mud_url(sd_lldp_neighbor *n, const char **ret); +int sd_lldp_neighbor_get_system_capabilities(sd_lldp_neighbor *n, uint16_t *ret); +int sd_lldp_neighbor_get_enabled_capabilities(sd_lldp_neighbor *n, uint16_t *ret); + +/* Low-level, iterative TLV access. This is for everything else, it iteratively goes through all available TLVs + * (including the ones covered with the calls above), and allows multiple TLVs for the same fields. 
*/ +int sd_lldp_neighbor_tlv_rewind(sd_lldp_neighbor *n); +int sd_lldp_neighbor_tlv_next(sd_lldp_neighbor *n); +int sd_lldp_neighbor_tlv_get_type(sd_lldp_neighbor *n, uint8_t *type); +int sd_lldp_neighbor_tlv_is_type(sd_lldp_neighbor *n, uint8_t type); +int sd_lldp_neighbor_tlv_get_oui(sd_lldp_neighbor *n, uint8_t oui[_SD_ARRAY_STATIC 3], uint8_t *subtype); +int sd_lldp_neighbor_tlv_is_oui(sd_lldp_neighbor *n, const uint8_t oui[_SD_ARRAY_STATIC 3], uint8_t subtype); +int sd_lldp_neighbor_tlv_get_raw(sd_lldp_neighbor *n, const void **ret, size_t *size); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_lldp_rx, sd_lldp_rx_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_lldp_neighbor, sd_lldp_neighbor_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-lldp-tx.h b/src/systemd/sd-lldp-tx.h new file mode 100644 index 0000000..2eeb6b6 --- /dev/null +++ b/src/systemd/sd-lldp-tx.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdlldptxhfoo +#define foosdlldptxhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include +#include +#include + +#include "sd-event.h" +#include "sd-lldp.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_lldp_tx sd_lldp_tx; + +__extension__ typedef enum sd_lldp_multicast_mode_t { + SD_LLDP_MULTICAST_MODE_NEAREST_BRIDGE, + SD_LLDP_MULTICAST_MODE_NON_TPMR_BRIDGE, + SD_LLDP_MULTICAST_MODE_CUSTOMER_BRIDGE, + _SD_LLDP_MULTICAST_MODE_MAX, + _SD_LLDP_MULTICAST_MODE_INVALID = -EINVAL, + _SD_ENUM_FORCE_S64(LLDP_TX_MODE) +} sd_lldp_multicast_mode_t; + +int sd_lldp_tx_new(sd_lldp_tx **ret); +sd_lldp_tx *sd_lldp_tx_ref(sd_lldp_tx *lldp_tx); +sd_lldp_tx *sd_lldp_tx_unref(sd_lldp_tx *lldp_tx); + +int sd_lldp_tx_start(sd_lldp_tx *lldp_tx); +int sd_lldp_tx_stop(sd_lldp_tx *lldp_tx); +int sd_lldp_tx_is_running(sd_lldp_tx *lldp_tx); + +int sd_lldp_tx_attach_event(sd_lldp_tx *lldp_tx, sd_event *event, int64_t priority); +int sd_lldp_tx_detach_event(sd_lldp_tx *lldp_tx); + +int sd_lldp_tx_set_ifindex(sd_lldp_tx *lldp_tx, int ifindex); +int sd_lldp_tx_set_ifname(sd_lldp_tx *lldp_tx, const char *ifname); +int sd_lldp_tx_get_ifname(sd_lldp_tx *lldp_tx, const char **ret); + +int sd_lldp_tx_set_multicast_mode(sd_lldp_tx *lldp_tx, sd_lldp_multicast_mode_t mode); +int sd_lldp_tx_set_hwaddr(sd_lldp_tx *lldp_tx, const struct ether_addr *hwaddr); +int sd_lldp_tx_set_port_description(sd_lldp_tx *lldp_tx, const char *port_description); +int sd_lldp_tx_set_hostname(sd_lldp_tx *lldp_tx, const char *hostname); +int sd_lldp_tx_set_pretty_hostname(sd_lldp_tx *lldp_tx, const char *pretty_hostname); +int sd_lldp_tx_set_mud_url(sd_lldp_tx *lldp_tx, const char *mud_url); +int sd_lldp_tx_set_capabilities(sd_lldp_tx *lldp_tx, uint16_t supported, uint16_t enabled); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_lldp_tx, sd_lldp_tx_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-lldp.h b/src/systemd/sd-lldp.h new file mode 100644 index 0000000..4069c5b --- /dev/null +++ b/src/systemd/sd-lldp.h @@ -0,0 +1,123 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdlldphfoo +#define foosdlldphfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* IEEE 802.1AB-2009 Clause 8: TLV Types */ +enum { + SD_LLDP_TYPE_END = 0, + SD_LLDP_TYPE_CHASSIS_ID = 1, + SD_LLDP_TYPE_PORT_ID = 2, + SD_LLDP_TYPE_TTL = 3, + SD_LLDP_TYPE_PORT_DESCRIPTION = 4, + SD_LLDP_TYPE_SYSTEM_NAME = 5, + SD_LLDP_TYPE_SYSTEM_DESCRIPTION = 6, + SD_LLDP_TYPE_SYSTEM_CAPABILITIES = 7, + SD_LLDP_TYPE_MGMT_ADDRESS = 8, + SD_LLDP_TYPE_PRIVATE = 127 +}; + +/* IEEE 802.1AB-2009 Clause 8.5.2: Chassis subtypes */ +enum { + SD_LLDP_CHASSIS_SUBTYPE_RESERVED = 0, + SD_LLDP_CHASSIS_SUBTYPE_CHASSIS_COMPONENT = 1, + SD_LLDP_CHASSIS_SUBTYPE_INTERFACE_ALIAS = 2, + SD_LLDP_CHASSIS_SUBTYPE_PORT_COMPONENT = 3, + SD_LLDP_CHASSIS_SUBTYPE_MAC_ADDRESS = 4, + SD_LLDP_CHASSIS_SUBTYPE_NETWORK_ADDRESS = 5, + SD_LLDP_CHASSIS_SUBTYPE_INTERFACE_NAME = 6, + SD_LLDP_CHASSIS_SUBTYPE_LOCALLY_ASSIGNED = 7 +}; + +/* IEEE 802.1AB-2009 Clause 8.5.3: Port subtype */ +enum { + SD_LLDP_PORT_SUBTYPE_RESERVED = 0, + SD_LLDP_PORT_SUBTYPE_INTERFACE_ALIAS = 1, + SD_LLDP_PORT_SUBTYPE_PORT_COMPONENT = 2, + SD_LLDP_PORT_SUBTYPE_MAC_ADDRESS = 3, + SD_LLDP_PORT_SUBTYPE_NETWORK_ADDRESS = 4, + SD_LLDP_PORT_SUBTYPE_INTERFACE_NAME = 5, + SD_LLDP_PORT_SUBTYPE_AGENT_CIRCUIT_ID = 6, + SD_LLDP_PORT_SUBTYPE_LOCALLY_ASSIGNED = 7 +}; 
+ +/* IEEE 802.1AB-2009 Clause 8.5.8: System capabilities */ +enum { + SD_LLDP_SYSTEM_CAPABILITIES_OTHER = 1 << 0, + SD_LLDP_SYSTEM_CAPABILITIES_REPEATER = 1 << 1, + SD_LLDP_SYSTEM_CAPABILITIES_BRIDGE = 1 << 2, + SD_LLDP_SYSTEM_CAPABILITIES_WLAN_AP = 1 << 3, + SD_LLDP_SYSTEM_CAPABILITIES_ROUTER = 1 << 4, + SD_LLDP_SYSTEM_CAPABILITIES_PHONE = 1 << 5, + SD_LLDP_SYSTEM_CAPABILITIES_DOCSIS = 1 << 6, + SD_LLDP_SYSTEM_CAPABILITIES_STATION = 1 << 7, + SD_LLDP_SYSTEM_CAPABILITIES_CVLAN = 1 << 8, + SD_LLDP_SYSTEM_CAPABILITIES_SVLAN = 1 << 9, + SD_LLDP_SYSTEM_CAPABILITIES_TPMR = 1 << 10 +}; + +#define SD_LLDP_SYSTEM_CAPABILITIES_ALL UINT16_MAX + +#define SD_LLDP_SYSTEM_CAPABILITIES_ALL_ROUTERS \ + ((uint16_t) \ + (SD_LLDP_SYSTEM_CAPABILITIES_REPEATER | \ + SD_LLDP_SYSTEM_CAPABILITIES_BRIDGE | \ + SD_LLDP_SYSTEM_CAPABILITIES_WLAN_AP | \ + SD_LLDP_SYSTEM_CAPABILITIES_ROUTER | \ + SD_LLDP_SYSTEM_CAPABILITIES_DOCSIS | \ + SD_LLDP_SYSTEM_CAPABILITIES_CVLAN | \ + SD_LLDP_SYSTEM_CAPABILITIES_SVLAN | \ + SD_LLDP_SYSTEM_CAPABILITIES_TPMR)) + +#define SD_LLDP_OUI_802_1 (const uint8_t[]) { 0x00, 0x80, 0xc2 } +#define SD_LLDP_OUI_802_3 (const uint8_t[]) { 0x00, 0x12, 0x0f } + +#define _SD_LLDP_OUI_IANA 0x00, 0x00, 0x5E +#define SD_LLDP_OUI_IANA (const uint8_t[]) { _SD_LLDP_OUI_IANA } + +#define SD_LLDP_OUI_IANA_SUBTYPE_MUD 0x01 +#define SD_LLDP_OUI_IANA_MUD \ + (const uint8_t[]) { _SD_LLDP_OUI_IANA, SD_LLDP_OUI_IANA_SUBTYPE_MUD } + +/* IEEE 802.1AB-2009 Annex E */ +enum { + SD_LLDP_OUI_802_1_SUBTYPE_PORT_VLAN_ID = 1, + SD_LLDP_OUI_802_1_SUBTYPE_PORT_PROTOCOL_VLAN_ID = 2, + SD_LLDP_OUI_802_1_SUBTYPE_VLAN_NAME = 3, + SD_LLDP_OUI_802_1_SUBTYPE_PROTOCOL_IDENTITY = 4, + SD_LLDP_OUI_802_1_SUBTYPE_VID_USAGE_DIGEST = 5, + SD_LLDP_OUI_802_1_SUBTYPE_MANAGEMENT_VID = 6, + SD_LLDP_OUI_802_1_SUBTYPE_LINK_AGGREGATION = 7 +}; + +/* IEEE 802.1AB-2009 Annex F */ +enum { + SD_LLDP_OUI_802_3_SUBTYPE_MAC_PHY_CONFIG_STATUS = 1, + SD_LLDP_OUI_802_3_SUBTYPE_POWER_VIA_MDI = 2, + 
SD_LLDP_OUI_802_3_SUBTYPE_LINK_AGGREGATION = 3, + SD_LLDP_OUI_802_3_SUBTYPE_MAXIMUM_FRAME_SIZE = 4 +}; + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-login.h b/src/systemd/sd-login.h new file mode 100644 index 0000000..c84f2c0 --- /dev/null +++ b/src/systemd/sd-login.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdloginhfoo +#define foosdloginhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include + +#include "_sd-common.h" + +/* + * A few points: + * + * Instead of returning an empty string array or empty uid array, we + * may return NULL. + * + * Free the data the library returns with libc free(). String arrays + * are NULL terminated, and you need to free the array itself, in + * addition to the strings contained. + * + * We return error codes as negative errno, kernel-style. On success, we + * return 0 or positive. + * + * These functions access data in /proc, /sys/fs/cgroup, and /run. All + * of these are virtual file systems; therefore, accesses are + * relatively cheap. + * + * See sd-login(3) for more information. + */ + +_SD_BEGIN_DECLARATIONS; + +/* Get session from PID. Note that 'shared' processes of a user are + * not attached to a session, but only attached to a user. This will + * return an error for system processes and 'shared' processes of a + * user. 
*/ +int sd_pid_get_session(pid_t pid, char **session); + +/* Get UID of the owner of the session of the PID (or in case the + * process is a 'shared' user process, the UID of that user is + * returned). This will not return the UID of the process, but rather + * the UID of the owner of the cgroup that the process is in. This will + * return an error for system processes. */ +int sd_pid_get_owner_uid(pid_t pid, uid_t *uid); + +/* Get systemd non-slice unit (i.e. service) name from PID, for system + * services. This will return an error for non-service processes. */ +int sd_pid_get_unit(pid_t pid, char **unit); + +/* Get systemd non-slice unit (i.e. service) name from PID, for user + * services. This will return an error for non-user-service + * processes. */ +int sd_pid_get_user_unit(pid_t pid, char **unit); + +/* Get slice name from PID. */ +int sd_pid_get_slice(pid_t pid, char **slice); + +/* Get user slice name from PID. */ +int sd_pid_get_user_slice(pid_t pid, char **slice); + +/* Get machine name from PID, for processes assigned to a VM or + * container. This will return an error for non-machine processes. */ +int sd_pid_get_machine_name(pid_t pid, char **machine); + +/* Get the control group from a PID, relative to the root of the + * hierarchy. */ +int sd_pid_get_cgroup(pid_t pid, char **cgroup); + +/* Equivalent to the corresponding sd_pid_get* functions, but take a + * PIDFD instead of a PID, to ensure there can be no possible PID + * recycle issues before/after the calls. 
*/ +int sd_pidfd_get_session(int pidfd, char **session); +int sd_pidfd_get_owner_uid(int pidfd, uid_t *uid); +int sd_pidfd_get_unit(int pidfd, char **unit); +int sd_pidfd_get_user_unit(int pidfd, char **unit); +int sd_pidfd_get_slice(int pidfd, char **slice); +int sd_pidfd_get_user_slice(int pidfd, char **slice); +int sd_pidfd_get_machine_name(int pidfd, char **machine); +int sd_pidfd_get_cgroup(int pidfd, char **cgroup); + +/* Similar to sd_pid_get_session(), but retrieves data about the peer + * of a connected AF_UNIX socket */ +int sd_peer_get_session(int fd, char **session); + +/* Similar to sd_pid_get_owner_uid(), but retrieves data about the peer of + * a connected AF_UNIX socket */ +int sd_peer_get_owner_uid(int fd, uid_t *uid); + +/* Similar to sd_pid_get_unit(), but retrieves data about the peer of + * a connected AF_UNIX socket */ +int sd_peer_get_unit(int fd, char **unit); + +/* Similar to sd_pid_get_user_unit(), but retrieves data about the peer of + * a connected AF_UNIX socket */ +int sd_peer_get_user_unit(int fd, char **unit); + +/* Similar to sd_pid_get_slice(), but retrieves data about the peer of + * a connected AF_UNIX socket */ +int sd_peer_get_slice(int fd, char **slice); + +/* Similar to sd_pid_get_user_slice(), but retrieves data about the peer of + * a connected AF_UNIX socket */ +int sd_peer_get_user_slice(int fd, char **slice); + +/* Similar to sd_pid_get_machine_name(), but retrieves data about the + * peer of a connected AF_UNIX socket */ +int sd_peer_get_machine_name(int fd, char **machine); + +/* Similar to sd_pid_get_cgroup(), but retrieves data about the peer + * of a connected AF_UNIX socket. */ +int sd_peer_get_cgroup(int fd, char **cgroup); + +/* Get state from UID. 
Possible states: offline, lingering, online, active, closing */ +int sd_uid_get_state(uid_t uid, char **state); + +/* Return primary session of user, if there is any */ +int sd_uid_get_display(uid_t uid, char **session); + +/* Determine the login time of user */ +int sd_uid_get_login_time(uid_t uid, uint64_t *usec); + +/* Return 1 if UID has session on seat. If require_active is true, this will + * look for active sessions only. */ +int sd_uid_is_on_seat(uid_t uid, int require_active, const char *seat); + +/* Return sessions of user. If require_active is true, this will look for + * active sessions only. Returns the number of sessions. + * If sessions is NULL, this will just return the number of sessions. */ +int sd_uid_get_sessions(uid_t uid, int require_active, char ***sessions); + +/* Return seats the user is on. If require_active is true, this will look for + * active seats only. Returns the number of seats. + * If seats is NULL, this will just return the number of seats. */ +int sd_uid_get_seats(uid_t uid, int require_active, char ***seats); + +/* Return 1 if the session is active. */ +int sd_session_is_active(const char *session); + +/* Return 1 if the session is remote. */ +int sd_session_is_remote(const char *session); + +/* Get state from session. Possible states: online, active, closing. + * This function is a more generic version of sd_session_is_active(). */ +int sd_session_get_state(const char *session, char **state); + +/* Determine user ID of session */ +int sd_session_get_uid(const char *session, uid_t *uid); + +/* Determine username of session */ +int sd_session_get_username(const char *session, char **username); + +/* Determine seat of session */ +int sd_session_get_seat(const char *session, char **seat); + +/* Determine the start time of session */ +int sd_session_get_start_time(const char *session, uint64_t *usec); + +/* Determine the (PAM) service name this session was registered by.
*/ +int sd_session_get_service(const char *session, char **service); + +/* Determine the type of this session, i.e. one of "tty", "x11", "wayland", "mir" or "unspecified". */ +int sd_session_get_type(const char *session, char **type); + +/* Determine the class of this session, i.e. one of "user", "greeter" or "lock-screen". */ +int sd_session_get_class(const char *session, char **clazz); + +/* Determine the desktop brand of this session, i.e. something like "GNOME", "KDE" or "systemd-console". */ +int sd_session_get_desktop(const char *session, char **desktop); + +/* Determine the X11 display of this session. */ +int sd_session_get_display(const char *session, char **display); + +/* Determine the leader process of this session. */ +int sd_session_get_leader(const char *session, pid_t *leader); + +/* Determine the remote host of this session. */ +int sd_session_get_remote_host(const char *session, char **remote_host); + +/* Determine the remote user of this session (if provided by PAM). */ +int sd_session_get_remote_user(const char *session, char **remote_user); + +/* Determine the TTY of this session. */ +int sd_session_get_tty(const char *session, char **display); + +/* Determine the VT number of this session. */ +int sd_session_get_vt(const char *session, unsigned *vtnr); + +/* Return active session and user of seat */ +int sd_seat_get_active(const char *seat, char **session, uid_t *uid); + +/* Return sessions and users on seat. Returns number of sessions. + * If sessions is NULL, this returns only the number of sessions. */ +int sd_seat_get_sessions( + const char *seat, + char ***ret_sessions, + uid_t **ret_uids, + unsigned *ret_n_uids); + +/* Return whether the seat is multi-session capable */ +int sd_seat_can_multi_session(const char *seat) _sd_deprecated_; + +/* Return whether the seat is TTY capable, i.e. suitable for showing console UIs */ +int sd_seat_can_tty(const char *seat); + +/* Return whether the seat is graphics capable, i.e. 
suitable for showing graphical UIs */ +int sd_seat_can_graphical(const char *seat); + +/* Return the class of machine */ +int sd_machine_get_class(const char *machine, char **clazz); + +/* Return the list of host-side network interface indices of a machine */ +int sd_machine_get_ifindices(const char *machine, int **ret_ifindices); + +/* Get all seats, store in *seats. Returns the number of seats. If + * seats is NULL, this only returns the number of seats. */ +int sd_get_seats(char ***seats); + +/* Get all sessions, store in *sessions. Returns the number of + * sessions. If sessions is NULL, this only returns the number of sessions. */ +int sd_get_sessions(char ***sessions); + +/* Get all logged in users, store in *users. Returns the number of + * users. If users is NULL, this only returns the number of users. */ +int sd_get_uids(uid_t **users); + +/* Get all running virtual machines/containers */ +int sd_get_machine_names(char ***machines); + +/* Monitor object */ +typedef struct sd_login_monitor sd_login_monitor; + +/* Create a new monitor. Category must be NULL, "seat", "session", + * "uid", or "machine" to get monitor events for the specific category + * (or all). */ +int sd_login_monitor_new(const char *category, sd_login_monitor** ret); + +/* Destroys the passed monitor. Returns NULL. 
*/ +sd_login_monitor* sd_login_monitor_unref(sd_login_monitor *m); + +/* Flushes the monitor */ +int sd_login_monitor_flush(sd_login_monitor *m); + +/* Get FD from monitor */ +int sd_login_monitor_get_fd(sd_login_monitor *m); + +/* Get poll() mask to monitor */ +int sd_login_monitor_get_events(sd_login_monitor *m); + +/* Get timeout for poll(), as usec value relative to CLOCK_MONOTONIC's epoch */ +int sd_login_monitor_get_timeout(sd_login_monitor *m, uint64_t *timeout_usec); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_login_monitor, sd_login_monitor_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h new file mode 100644 index 0000000..e3f6806 --- /dev/null +++ b/src/systemd/sd-messages.h @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdmessageshfoo +#define foosdmessageshfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include "sd-id128.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* Hey! If you add a new message here, you *must* also update the message catalog with an appropriate explanation */ + +/* And if you add a new ID here, make sure to generate a random one with "systemd-id128 new". Do not use any + * other IDs, and do not count them up manually. 
*/ + +#define SD_MESSAGE_JOURNAL_START SD_ID128_MAKE(f7,73,79,a8,49,0b,40,8b,be,5f,69,40,50,5a,77,7b) +#define SD_MESSAGE_JOURNAL_START_STR SD_ID128_MAKE_STR(f7,73,79,a8,49,0b,40,8b,be,5f,69,40,50,5a,77,7b) +#define SD_MESSAGE_JOURNAL_STOP SD_ID128_MAKE(d9,3f,b3,c9,c2,4d,45,1a,97,ce,a6,15,ce,59,c0,0b) +#define SD_MESSAGE_JOURNAL_STOP_STR SD_ID128_MAKE_STR(d9,3f,b3,c9,c2,4d,45,1a,97,ce,a6,15,ce,59,c0,0b) +#define SD_MESSAGE_JOURNAL_DROPPED SD_ID128_MAKE(a5,96,d6,fe,7b,fa,49,94,82,8e,72,30,9e,95,d6,1e) +#define SD_MESSAGE_JOURNAL_DROPPED_STR SD_ID128_MAKE_STR(a5,96,d6,fe,7b,fa,49,94,82,8e,72,30,9e,95,d6,1e) +#define SD_MESSAGE_JOURNAL_MISSED SD_ID128_MAKE(e9,bf,28,e6,e8,34,48,1b,b6,f4,8f,54,8a,d1,36,06) +#define SD_MESSAGE_JOURNAL_MISSED_STR SD_ID128_MAKE_STR(e9,bf,28,e6,e8,34,48,1b,b6,f4,8f,54,8a,d1,36,06) +#define SD_MESSAGE_JOURNAL_USAGE SD_ID128_MAKE(ec,38,7f,57,7b,84,4b,8f,a9,48,f3,3c,ad,9a,75,e6) +#define SD_MESSAGE_JOURNAL_USAGE_STR SD_ID128_MAKE_STR(ec,38,7f,57,7b,84,4b,8f,a9,48,f3,3c,ad,9a,75,e6) + +#define SD_MESSAGE_COREDUMP SD_ID128_MAKE(fc,2e,22,bc,6e,e6,47,b6,b9,07,29,ab,34,a2,50,b1) +#define SD_MESSAGE_COREDUMP_STR SD_ID128_MAKE_STR(fc,2e,22,bc,6e,e6,47,b6,b9,07,29,ab,34,a2,50,b1) +#define SD_MESSAGE_TRUNCATED_CORE SD_ID128_MAKE(5a,ad,d8,e9,54,dc,4b,1a,8c,95,4d,63,fd,9e,11,37) +#define SD_MESSAGE_TRUNCATED_CORE_STR SD_ID128_MAKE_STR(5a,ad,d8,e9,54,dc,4b,1a,8c,95,4d,63,fd,9e,11,37) +#define SD_MESSAGE_BACKTRACE SD_ID128_MAKE(1f,4e,0a,44,a8,86,49,93,9a,ae,a3,4f,c6,da,8c,95) +#define SD_MESSAGE_BACKTRACE_STR SD_ID128_MAKE_STR(1f,4e,0a,44,a8,86,49,93,9a,ae,a3,4f,c6,da,8c,95) + +#define SD_MESSAGE_SESSION_START SD_ID128_MAKE(8d,45,62,0c,1a,43,48,db,b1,74,10,da,57,c6,0c,66) +#define SD_MESSAGE_SESSION_START_STR SD_ID128_MAKE_STR(8d,45,62,0c,1a,43,48,db,b1,74,10,da,57,c6,0c,66) +#define SD_MESSAGE_SESSION_STOP SD_ID128_MAKE(33,54,93,94,24,b4,45,6d,98,02,ca,83,33,ed,42,4a) +#define SD_MESSAGE_SESSION_STOP_STR 
SD_ID128_MAKE_STR(33,54,93,94,24,b4,45,6d,98,02,ca,83,33,ed,42,4a) +#define SD_MESSAGE_SEAT_START SD_ID128_MAKE(fc,be,fc,5d,a2,3d,42,80,93,f9,7c,82,a9,29,0f,7b) +#define SD_MESSAGE_SEAT_START_STR SD_ID128_MAKE_STR(fc,be,fc,5d,a2,3d,42,80,93,f9,7c,82,a9,29,0f,7b) +#define SD_MESSAGE_SEAT_STOP SD_ID128_MAKE(e7,85,2b,fe,46,78,4e,d0,ac,cd,e0,4b,c8,64,c2,d5) +#define SD_MESSAGE_SEAT_STOP_STR SD_ID128_MAKE_STR(e7,85,2b,fe,46,78,4e,d0,ac,cd,e0,4b,c8,64,c2,d5) +#define SD_MESSAGE_MACHINE_START SD_ID128_MAKE(24,d8,d4,45,25,73,40,24,96,06,83,81,a6,31,2d,f2) +#define SD_MESSAGE_MACHINE_START_STR SD_ID128_MAKE_STR(24,d8,d4,45,25,73,40,24,96,06,83,81,a6,31,2d,f2) +#define SD_MESSAGE_MACHINE_STOP SD_ID128_MAKE(58,43,2b,d3,ba,ce,47,7c,b5,14,b5,63,81,b8,a7,58) +#define SD_MESSAGE_MACHINE_STOP_STR SD_ID128_MAKE_STR(58,43,2b,d3,ba,ce,47,7c,b5,14,b5,63,81,b8,a7,58) + +#define SD_MESSAGE_TIME_CHANGE SD_ID128_MAKE(c7,a7,87,07,9b,35,4e,aa,a9,e7,7b,37,18,93,cd,27) +#define SD_MESSAGE_TIME_CHANGE_STR SD_ID128_MAKE_STR(c7,a7,87,07,9b,35,4e,aa,a9,e7,7b,37,18,93,cd,27) +#define SD_MESSAGE_TIMEZONE_CHANGE SD_ID128_MAKE(45,f8,2f,4a,ef,7a,4b,bf,94,2c,e8,61,d1,f2,09,90) +#define SD_MESSAGE_TIMEZONE_CHANGE_STR SD_ID128_MAKE_STR(45,f8,2f,4a,ef,7a,4b,bf,94,2c,e8,61,d1,f2,09,90) + +#define SD_MESSAGE_TAINTED SD_ID128_MAKE(50,87,6a,9d,b0,0f,4c,40,bd,e1,a2,ad,38,1c,3a,1b) +#define SD_MESSAGE_TAINTED_STR SD_ID128_MAKE_STR(50,87,6a,9d,b0,0f,4c,40,bd,e1,a2,ad,38,1c,3a,1b) +#define SD_MESSAGE_STARTUP_FINISHED SD_ID128_MAKE(b0,7a,24,9c,d0,24,41,4a,82,dd,00,cd,18,13,78,ff) +#define SD_MESSAGE_STARTUP_FINISHED_STR SD_ID128_MAKE_STR(b0,7a,24,9c,d0,24,41,4a,82,dd,00,cd,18,13,78,ff) +#define SD_MESSAGE_USER_STARTUP_FINISHED SD_ID128_MAKE(ee,d0,0a,68,ff,d8,4e,31,88,21,05,fd,97,3a,bd,d1) +#define SD_MESSAGE_USER_STARTUP_FINISHED_STR SD_ID128_MAKE_STR(ee,d0,0a,68,ff,d8,4e,31,88,21,05,fd,97,3a,bd,d1) + +#define SD_MESSAGE_SLEEP_START SD_ID128_MAKE(6b,bd,95,ee,97,79,41,e4,97,c4,8b,e2,7c,25,41,28) +#define 
SD_MESSAGE_SLEEP_START_STR SD_ID128_MAKE_STR(6b,bd,95,ee,97,79,41,e4,97,c4,8b,e2,7c,25,41,28) +#define SD_MESSAGE_SLEEP_STOP SD_ID128_MAKE(88,11,e6,df,2a,8e,40,f5,8a,94,ce,a2,6f,8e,bf,14) +#define SD_MESSAGE_SLEEP_STOP_STR SD_ID128_MAKE_STR(88,11,e6,df,2a,8e,40,f5,8a,94,ce,a2,6f,8e,bf,14) + +#define SD_MESSAGE_SHUTDOWN SD_ID128_MAKE(98,26,88,66,d1,d5,4a,49,9c,4e,98,92,1d,93,bc,40) +#define SD_MESSAGE_SHUTDOWN_STR SD_ID128_MAKE_STR(98,26,88,66,d1,d5,4a,49,9c,4e,98,92,1d,93,bc,40) + +#define SD_MESSAGE_FACTORY_RESET SD_ID128_MAKE(c1,4a,af,76,ec,28,4a,5f,a1,f1,05,f8,8d,fb,06,1c) +#define SD_MESSAGE_FACTORY_RESET_STR SD_ID128_MAKE_STR(c1,4a,af,76,ec,28,4a,5f,a1,f1,05,f8,8d,fb,06,1c) + +#define SD_MESSAGE_CRASH_EXIT SD_ID128_MAKE(d9,ec,5e,95,e4,b6,46,aa,ae,a2,fd,05,21,4e,db,da) +#define SD_MESSAGE_CRASH_EXIT_STR SD_ID128_MAKE_STR(d9,ec,5e,95,e4,b6,46,aa,ae,a2,fd,05,21,4e,db,da) +#define SD_MESSAGE_CRASH_FAILED SD_ID128_MAKE(3e,d0,16,3e,86,8a,44,17,ab,8b,9e,21,04,07,a9,6c) +#define SD_MESSAGE_CRASH_FAILED_STR SD_ID128_MAKE_STR(3e,d0,16,3e,86,8a,44,17,ab,8b,9e,21,04,07,a9,6c) +#define SD_MESSAGE_CRASH_FREEZE SD_ID128_MAKE(64,5c,73,55,37,63,4a,e0,a3,2b,15,a7,c6,cb,a7,d4) +#define SD_MESSAGE_CRASH_FREEZE_STR SD_ID128_MAKE_STR(64,5c,73,55,37,63,4a,e0,a3,2b,15,a7,c6,cb,a7,d4) + +#define SD_MESSAGE_CRASH_NO_COREDUMP SD_ID128_MAKE(5a,dd,b3,a0,6a,73,4d,33,96,b7,94,bf,98,fb,2d,01) +#define SD_MESSAGE_CRASH_NO_COREDUMP_STR SD_ID128_MAKE_STR(5a,dd,b3,a0,6a,73,4d,33,96,b7,94,bf,98,fb,2d,01) +#define SD_MESSAGE_CRASH_NO_FORK SD_ID128_MAKE(5c,9e,98,de,4a,b9,4c,6a,9d,04,d0,ad,79,3b,d9,03) +#define SD_MESSAGE_CRASH_NO_FORK_STR SD_ID128_MAKE_STR(5c,9e,98,de,4a,b9,4c,6a,9d,04,d0,ad,79,3b,d9,03) +#define SD_MESSAGE_CRASH_UNKNOWN_SIGNAL SD_ID128_MAKE(5e,6f,1f,5e,4d,b6,4a,0e,ae,e3,36,82,49,d2,0b,94) +#define SD_MESSAGE_CRASH_UNKNOWN_SIGNAL_STR SD_ID128_MAKE_STR(5e,6f,1f,5e,4d,b6,4a,0e,ae,e3,36,82,49,d2,0b,94) +#define SD_MESSAGE_CRASH_SYSTEMD_SIGNAL 
SD_ID128_MAKE(83,f8,4b,35,ee,26,4f,74,a3,89,6a,97,17,af,34,cb) +#define SD_MESSAGE_CRASH_SYSTEMD_SIGNAL_STR SD_ID128_MAKE_STR(83,f8,4b,35,ee,26,4f,74,a3,89,6a,97,17,af,34,cb) +#define SD_MESSAGE_CRASH_PROCESS_SIGNAL SD_ID128_MAKE(3a,73,a9,8b,af,5b,4b,19,99,29,e3,22,6c,0b,e7,83) +#define SD_MESSAGE_CRASH_PROCESS_SIGNAL_STR SD_ID128_MAKE_STR(3a,73,a9,8b,af,5b,4b,19,99,29,e3,22,6c,0b,e7,83) +#define SD_MESSAGE_CRASH_WAITPID_FAILED SD_ID128_MAKE(2e,d1,8d,4f,78,ca,47,f0,a9,bc,25,27,1c,26,ad,b4) +#define SD_MESSAGE_CRASH_WAITPID_FAILED_STR SD_ID128_MAKE_STR(2e,d1,8d,4f,78,ca,47,f0,a9,bc,25,27,1c,26,ad,b4) +#define SD_MESSAGE_CRASH_COREDUMP_FAILED SD_ID128_MAKE(56,b1,cd,96,f2,42,46,c5,b6,07,66,6f,da,95,23,56) +#define SD_MESSAGE_CRASH_COREDUMP_FAILED_STR SD_ID128_MAKE_STR(56,b1,cd,96,f2,42,46,c5,b6,07,66,6f,da,95,23,56) +#define SD_MESSAGE_CRASH_COREDUMP_PID SD_ID128_MAKE(4a,c7,56,6d,4d,75,48,f4,98,1f,62,9a,28,f0,f8,29) +#define SD_MESSAGE_CRASH_COREDUMP_PID_STR SD_ID128_MAKE_STR(4a,c7,56,6d,4d,75,48,f4,98,1f,62,9a,28,f0,f8,29) +#define SD_MESSAGE_CRASH_SHELL_FORK_FAILED SD_ID128_MAKE(38,e8,b1,e0,39,ad,46,92,91,b1,8b,44,c5,53,a5,b7) +#define SD_MESSAGE_CRASH_SHELL_FORK_FAILED_STR SD_ID128_MAKE_STR(38,e8,b1,e0,39,ad,46,92,91,b1,8b,44,c5,53,a5,b7) +#define SD_MESSAGE_CRASH_EXECLE_FAILED SD_ID128_MAKE(87,27,29,b4,7d,be,47,3e,b7,68,cc,ec,d4,77,be,da) +#define SD_MESSAGE_CRASH_EXECLE_FAILED_STR SD_ID128_MAKE_STR(87,27,29,b4,7d,be,47,3e,b7,68,cc,ec,d4,77,be,da) + +#define SD_MESSAGE_SELINUX_FAILED SD_ID128_MAKE(65,8a,67,ad,c1,c9,40,b3,b3,31,6e,7e,86,28,83,4a) +#define SD_MESSAGE_SELINUX_FAILED_STR SD_ID128_MAKE_STR(65,8a,67,ad,c1,c9,40,b3,b3,31,6e,7e,86,28,83,4a) + +#define SD_MESSAGE_BATTERY_LOW_WARNING SD_ID128_MAKE(e6,f4,56,bd,92,00,4d,95,80,16,0b,22,07,55,51,86) +#define SD_MESSAGE_BATTERY_LOW_WARNING_STR SD_ID128_MAKE_STR(e6,f4,56,bd,92,00,4d,95,80,16,0b,22,07,55,51,86) +#define SD_MESSAGE_BATTERY_LOW_POWEROFF SD_ID128_MAKE(26,74,37,d3,3f,dd,41,09,9a,d7,62,21,cc,24,a3,35) 
+#define SD_MESSAGE_BATTERY_LOW_POWEROFF_STR SD_ID128_MAKE_STR(26,74,37,d3,3f,dd,41,09,9a,d7,62,21,cc,24,a3,35) + +#define SD_MESSAGE_CORE_MAINLOOP_FAILED SD_ID128_MAKE(79,e0,5b,67,bc,45,45,d1,92,2f,e4,71,07,ee,60,c5) +#define SD_MESSAGE_CORE_MAINLOOP_FAILED_STR SD_ID128_MAKE_STR(79,e0,5b,67,bc,45,45,d1,92,2f,e4,71,07,ee,60,c5) +#define SD_MESSAGE_CORE_NO_XDGDIR_PATH SD_ID128_MAKE(db,b1,36,b1,0e,f4,45,7b,a4,7a,79,5d,62,f1,08,c9) +#define SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR SD_ID128_MAKE_STR(db,b1,36,b1,0e,f4,45,7b,a4,7a,79,5d,62,f1,08,c9) +#define SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER SD_ID128_MAKE(ed,15,8c,2d,f8,88,4f,a5,84,ee,ad,2d,90,2c,10,32) +#define SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR SD_ID128_MAKE_STR(ed,15,8c,2d,f8,88,4f,a5,84,ee,ad,2d,90,2c,10,32) +#define SD_MESSAGE_CORE_CAPABILITY_BOUNDING SD_ID128_MAKE(42,69,5b,50,0d,f0,48,29,8b,ee,37,15,9c,aa,9f,2e) +#define SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR SD_ID128_MAKE_STR(42,69,5b,50,0d,f0,48,29,8b,ee,37,15,9c,aa,9f,2e) +#define SD_MESSAGE_CORE_DISABLE_PRIVILEGES SD_ID128_MAKE(bf,c2,43,07,24,ab,44,49,97,35,b4,f9,4c,ca,92,95) +#define SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR SD_ID128_MAKE_STR(bf,c2,43,07,24,ab,44,49,97,35,b4,f9,4c,ca,92,95) +#define SD_MESSAGE_CORE_START_TARGET_FAILED SD_ID128_MAKE(59,28,8a,f5,23,be,43,a2,8d,49,4e,41,e2,6e,45,10) +#define SD_MESSAGE_CORE_START_TARGET_FAILED_STR SD_ID128_MAKE_STR(59,28,8a,f5,23,be,43,a2,8d,49,4e,41,e2,6e,45,10) +#define SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED SD_ID128_MAKE(68,9b,4f,cc,97,b4,48,6e,a5,da,92,db,69,c9,e3,14) +#define SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR SD_ID128_MAKE_STR(68,9b,4f,cc,97,b4,48,6e,a5,da,92,db,69,c9,e3,14) +#define SD_MESSAGE_CORE_FD_SET_FAILED SD_ID128_MAKE(5e,d8,36,f1,76,6f,4a,8a,9f,c5,da,45,aa,e2,3b,29) +#define SD_MESSAGE_CORE_FD_SET_FAILED_STR SD_ID128_MAKE_STR(5e,d8,36,f1,76,6f,4a,8a,9f,c5,da,45,aa,e2,3b,29) +#define SD_MESSAGE_CORE_PID1_ENVIRONMENT SD_ID128_MAKE(6a,40,fb,fb,d2,ba,4b,8d,b0,2f,b4,0c,9c,d0,90,d7) 
+#define SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR SD_ID128_MAKE_STR(6a,40,fb,fb,d2,ba,4b,8d,b0,2f,b4,0c,9c,d0,90,d7) +#define SD_MESSAGE_CORE_MANAGER_ALLOCATE SD_ID128_MAKE(0e,54,47,09,84,ac,41,96,89,74,3d,95,7a,11,9e,2e) +#define SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR SD_ID128_MAKE_STR(0e,54,47,09,84,ac,41,96,89,74,3d,95,7a,11,9e,2e) + +#define SD_MESSAGE_SMACK_FAILED_WRITE SD_ID128_MAKE(d6,7f,a9,f8,47,aa,4b,04,8a,2a,e3,35,35,33,1a,db) +#define SD_MESSAGE_SMACK_FAILED_WRITE_STR SD_ID128_MAKE_STR(d6,7f,a9,f8,47,aa,4b,04,8a,2a,e3,35,35,33,1a,db) + +#define SD_MESSAGE_SHUTDOWN_ERROR SD_ID128_MAKE(af,55,a6,f7,5b,54,44,31,b7,26,49,f3,6f,f6,d6,2c) +#define SD_MESSAGE_SHUTDOWN_ERROR_STR SD_ID128_MAKE_STR(af,55,a6,f7,5b,54,44,31,b7,26,49,f3,6f,f6,d6,2c) + +#define SD_MESSAGE_VALGRIND_HELPER_FORK SD_ID128_MAKE(d1,8e,03,39,ef,b2,4a,06,8d,9c,10,60,22,10,48,c2) +#define SD_MESSAGE_VALGRIND_HELPER_FORK_STR SD_ID128_MAKE_STR(d1,8e,03,39,ef,b2,4a,06,8d,9c,10,60,22,10,48,c2) + +/* The messages below are actually about jobs, not really about units, the macros are misleadingly named. + * Moreover SD_MESSAGE_UNIT_FAILED is not actually about a failing unit but about a failed start job. A job + * either finishes with SD_MESSAGE_UNIT_STARTED or with SD_MESSAGE_UNIT_FAILED hence. 
*/ +#define SD_MESSAGE_UNIT_STARTING SD_ID128_MAKE(7d,49,58,e8,42,da,4a,75,8f,6c,1c,dc,7b,36,dc,c5) +#define SD_MESSAGE_UNIT_STARTING_STR SD_ID128_MAKE_STR(7d,49,58,e8,42,da,4a,75,8f,6c,1c,dc,7b,36,dc,c5) +#define SD_MESSAGE_UNIT_STARTED SD_ID128_MAKE(39,f5,34,79,d3,a0,45,ac,8e,11,78,62,48,23,1f,bf) +#define SD_MESSAGE_UNIT_STARTED_STR SD_ID128_MAKE_STR(39,f5,34,79,d3,a0,45,ac,8e,11,78,62,48,23,1f,bf) +#define SD_MESSAGE_UNIT_FAILED SD_ID128_MAKE(be,02,cf,68,55,d2,42,8b,a4,0d,f7,e9,d0,22,f0,3d) +#define SD_MESSAGE_UNIT_FAILED_STR SD_ID128_MAKE_STR(be,02,cf,68,55,d2,42,8b,a4,0d,f7,e9,d0,22,f0,3d) +#define SD_MESSAGE_UNIT_STOPPING SD_ID128_MAKE(de,5b,42,6a,63,be,47,a7,b6,ac,3e,aa,c8,2e,2f,6f) +#define SD_MESSAGE_UNIT_STOPPING_STR SD_ID128_MAKE_STR(de,5b,42,6a,63,be,47,a7,b6,ac,3e,aa,c8,2e,2f,6f) +#define SD_MESSAGE_UNIT_STOPPED SD_ID128_MAKE(9d,1a,aa,27,d6,01,40,bd,96,36,54,38,aa,d2,02,86) +#define SD_MESSAGE_UNIT_STOPPED_STR SD_ID128_MAKE_STR(9d,1a,aa,27,d6,01,40,bd,96,36,54,38,aa,d2,02,86) +#define SD_MESSAGE_UNIT_RELOADING SD_ID128_MAKE(d3,4d,03,7f,ff,18,47,e6,ae,66,9a,37,0e,69,47,25) +#define SD_MESSAGE_UNIT_RELOADING_STR SD_ID128_MAKE_STR(d3,4d,03,7f,ff,18,47,e6,ae,66,9a,37,0e,69,47,25) +#define SD_MESSAGE_UNIT_RELOADED SD_ID128_MAKE(7b,05,eb,c6,68,38,42,22,ba,a8,88,11,79,cf,da,54) +#define SD_MESSAGE_UNIT_RELOADED_STR SD_ID128_MAKE_STR(7b,05,eb,c6,68,38,42,22,ba,a8,88,11,79,cf,da,54) + +#define SD_MESSAGE_UNIT_RESTART_SCHEDULED SD_ID128_MAKE(5e,b0,34,94,b6,58,48,70,a5,36,b3,37,29,08,09,b3) +#define SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR SD_ID128_MAKE_STR(5e,b0,34,94,b6,58,48,70,a5,36,b3,37,29,08,09,b3) + +#define SD_MESSAGE_UNIT_RESOURCES SD_ID128_MAKE(ae,8f,7b,86,6b,03,47,b9,af,31,fe,1c,80,b1,27,c0) +#define SD_MESSAGE_UNIT_RESOURCES_STR SD_ID128_MAKE_STR(ae,8f,7b,86,6b,03,47,b9,af,31,fe,1c,80,b1,27,c0) + +#define SD_MESSAGE_UNIT_SUCCESS SD_ID128_MAKE(7a,d2,d1,89,f7,e9,4e,70,a3,8c,78,13,54,91,24,48) +#define SD_MESSAGE_UNIT_SUCCESS_STR 
SD_ID128_MAKE_STR(7a,d2,d1,89,f7,e9,4e,70,a3,8c,78,13,54,91,24,48) +#define SD_MESSAGE_UNIT_SKIPPED SD_ID128_MAKE(0e,42,84,a0,ca,ca,4b,fc,81,c0,bb,67,86,97,26,73) +#define SD_MESSAGE_UNIT_SKIPPED_STR SD_ID128_MAKE_STR(0e,42,84,a0,ca,ca,4b,fc,81,c0,bb,67,86,97,26,73) +#define SD_MESSAGE_UNIT_FAILURE_RESULT SD_ID128_MAKE(d9,b3,73,ed,55,a6,4f,eb,82,42,e0,2d,be,79,a4,9c) +#define SD_MESSAGE_UNIT_FAILURE_RESULT_STR SD_ID128_MAKE_STR(d9,b3,73,ed,55,a6,4f,eb,82,42,e0,2d,be,79,a4,9c) + +#define SD_MESSAGE_SPAWN_FAILED SD_ID128_MAKE(64,12,57,65,1c,1b,4e,c9,a8,62,4d,7a,40,a9,e1,e7) +#define SD_MESSAGE_SPAWN_FAILED_STR SD_ID128_MAKE_STR(64,12,57,65,1c,1b,4e,c9,a8,62,4d,7a,40,a9,e1,e7) + +#define SD_MESSAGE_UNIT_PROCESS_EXIT SD_ID128_MAKE(98,e3,22,20,3f,7a,4e,d2,90,d0,9f,e0,3c,09,fe,15) +#define SD_MESSAGE_UNIT_PROCESS_EXIT_STR SD_ID128_MAKE_STR(98,e3,22,20,3f,7a,4e,d2,90,d0,9f,e0,3c,09,fe,15) + +#define SD_MESSAGE_FORWARD_SYSLOG_MISSED SD_ID128_MAKE(00,27,22,9c,a0,64,41,81,a7,6c,4e,92,45,8a,fa,2e) +#define SD_MESSAGE_FORWARD_SYSLOG_MISSED_STR SD_ID128_MAKE_STR(00,27,22,9c,a0,64,41,81,a7,6c,4e,92,45,8a,fa,2e) + +#define SD_MESSAGE_OVERMOUNTING SD_ID128_MAKE(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7) +#define SD_MESSAGE_OVERMOUNTING_STR SD_ID128_MAKE_STR(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7) + +#define SD_MESSAGE_UNIT_OOMD_KILL SD_ID128_MAKE(d9,89,61,1b,15,e4,4c,9d,bf,31,e3,c8,12,56,e4,ed) +#define SD_MESSAGE_UNIT_OOMD_KILL_STR SD_ID128_MAKE_STR(d9,89,61,1b,15,e4,4c,9d,bf,31,e3,c8,12,56,e4,ed) + +#define SD_MESSAGE_UNIT_OUT_OF_MEMORY SD_ID128_MAKE(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef) +#define SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR SD_ID128_MAKE_STR(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef) + +#define SD_MESSAGE_LID_OPENED SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f) +#define SD_MESSAGE_LID_OPENED_STR SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f) +#define SD_MESSAGE_LID_CLOSED 
SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,70) +#define SD_MESSAGE_LID_CLOSED_STR SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,70) +#define SD_MESSAGE_SYSTEM_DOCKED SD_ID128_MAKE(f5,f4,16,b8,62,07,4b,28,92,7a,48,c3,ba,7d,51,ff) +#define SD_MESSAGE_SYSTEM_DOCKED_STR SD_ID128_MAKE_STR(f5,f4,16,b8,62,07,4b,28,92,7a,48,c3,ba,7d,51,ff) +#define SD_MESSAGE_SYSTEM_UNDOCKED SD_ID128_MAKE(51,e1,71,bd,58,52,48,56,81,10,14,4c,51,7c,ca,53) +#define SD_MESSAGE_SYSTEM_UNDOCKED_STR SD_ID128_MAKE_STR(51,e1,71,bd,58,52,48,56,81,10,14,4c,51,7c,ca,53) +#define SD_MESSAGE_POWER_KEY SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,71) +#define SD_MESSAGE_POWER_KEY_STR SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,71) +#define SD_MESSAGE_POWER_KEY_LONG_PRESS SD_ID128_MAKE(3e,01,17,10,1e,b2,43,c1,b9,a5,0d,b3,49,4a,b1,0b) +#define SD_MESSAGE_POWER_KEY_LONG_PRESS_STR SD_ID128_MAKE_STR(3e,01,17,10,1e,b2,43,c1,b9,a5,0d,b3,49,4a,b1,0b) +#define SD_MESSAGE_REBOOT_KEY SD_ID128_MAKE(9f,a9,d2,c0,12,13,4e,c3,85,45,1f,fe,31,6f,97,d0) +#define SD_MESSAGE_REBOOT_KEY_STR SD_ID128_MAKE_STR(9f,a9,d2,c0,12,13,4e,c3,85,45,1f,fe,31,6f,97,d0) +#define SD_MESSAGE_REBOOT_KEY_LONG_PRESS SD_ID128_MAKE(f1,c5,9a,58,c9,d9,43,66,89,65,c3,37,ca,ec,59,75) +#define SD_MESSAGE_REBOOT_KEY_LONG_PRESS_STR SD_ID128_MAKE_STR(f1,c5,9a,58,c9,d9,43,66,89,65,c3,37,ca,ec,59,75) +#define SD_MESSAGE_SUSPEND_KEY SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,72) +#define SD_MESSAGE_SUSPEND_KEY_STR SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,72) +#define SD_MESSAGE_SUSPEND_KEY_LONG_PRESS SD_ID128_MAKE(bf,da,f6,d3,12,ab,40,07,bc,1f,e4,0a,15,df,78,e8) +#define SD_MESSAGE_SUSPEND_KEY_LONG_PRESS_STR SD_ID128_MAKE_STR(bf,da,f6,d3,12,ab,40,07,bc,1f,e4,0a,15,df,78,e8) +#define SD_MESSAGE_HIBERNATE_KEY SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,73) +#define SD_MESSAGE_HIBERNATE_KEY_STR 
SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,73) +#define SD_MESSAGE_HIBERNATE_KEY_LONG_PRESS SD_ID128_MAKE(16,78,36,df,6f,7f,42,8e,98,14,72,27,b2,dc,89,45) +#define SD_MESSAGE_HIBERNATE_KEY_LONG_PRESS_STR SD_ID128_MAKE_STR(16,78,36,df,6f,7f,42,8e,98,14,72,27,b2,dc,89,45) + +#define SD_MESSAGE_INVALID_CONFIGURATION SD_ID128_MAKE(c7,72,d2,4e,9a,88,4c,be,b9,ea,12,62,5c,30,6c,01) +#define SD_MESSAGE_INVALID_CONFIGURATION_STR SD_ID128_MAKE_STR(c7,72,d2,4e,9a,88,4c,be,b9,ea,12,62,5c,30,6c,01) + +#define SD_MESSAGE_DNSSEC_FAILURE SD_ID128_MAKE(16,75,d7,f1,72,17,40,98,b1,10,8b,f8,c7,dc,8f,5d) +#define SD_MESSAGE_DNSSEC_FAILURE_STR SD_ID128_MAKE_STR(16,75,d7,f1,72,17,40,98,b1,10,8b,f8,c7,dc,8f,5d) +#define SD_MESSAGE_DNSSEC_TRUST_ANCHOR_REVOKED SD_ID128_MAKE(4d,44,08,cf,d0,d1,44,85,91,84,d1,e6,5d,7c,8a,65) +#define SD_MESSAGE_DNSSEC_TRUST_ANCHOR_REVOKED_STR SD_ID128_MAKE_STR(4d,44,08,cf,d0,d1,44,85,91,84,d1,e6,5d,7c,8a,65) +#define SD_MESSAGE_DNSSEC_DOWNGRADE SD_ID128_MAKE(36,db,2d,fa,5a,90,45,e1,bd,4a,f5,f9,3e,1c,f0,57) +#define SD_MESSAGE_DNSSEC_DOWNGRADE_STR SD_ID128_MAKE_STR(36,db,2d,fa,5a,90,45,e1,bd,4a,f5,f9,3e,1c,f0,57) + +#define SD_MESSAGE_UNSAFE_USER_NAME SD_ID128_MAKE(b6,1f,da,c6,12,e9,4b,91,82,28,5b,99,88,43,06,1f) +#define SD_MESSAGE_UNSAFE_USER_NAME_STR SD_ID128_MAKE_STR(b6,1f,da,c6,12,e9,4b,91,82,28,5b,99,88,43,06,1f) + +#define SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE SD_ID128_MAKE(1b,3b,b9,40,37,f0,4b,bf,81,02,8e,13,5a,12,d2,93) +#define SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE_STR SD_ID128_MAKE_STR(1b,3b,b9,40,37,f0,4b,bf,81,02,8e,13,5a,12,d2,93) +#define SD_MESSAGE_DEVICE_PATH_NOT_SUITABLE SD_ID128_MAKE(01,01,90,13,8f,49,4e,29,a0,ef,66,69,74,95,31,aa) +#define SD_MESSAGE_DEVICE_PATH_NOT_SUITABLE_STR SD_ID128_MAKE_STR(01,01,90,13,8f,49,4e,29,a0,ef,66,69,74,95,31,aa) + +#define SD_MESSAGE_NOBODY_USER_UNSUITABLE SD_ID128_MAKE(b4,80,32,5f,9c,39,4a,7b,80,2c,23,1e,51,a2,75,2c) +#define SD_MESSAGE_NOBODY_USER_UNSUITABLE_STR 
SD_ID128_MAKE_STR(b4,80,32,5f,9c,39,4a,7b,80,2c,23,1e,51,a2,75,2c) + +#define SD_MESSAGE_SYSTEMD_UDEV_SETTLE_DEPRECATED SD_ID128_MAKE(1c,04,54,c1,bd,22,41,e0,ac,6f,ef,b4,bc,63,14,33) +#define SD_MESSAGE_SYSTEMD_UDEV_SETTLE_DEPRECATED_STR SD_ID128_MAKE_STR(1c,04,54,c1,bd,22,41,e0,ac,6f,ef,b4,bc,63,14,33) + +#define SD_MESSAGE_TIME_SYNC SD_ID128_MAKE(7c,8a,41,f3,7b,76,49,41,a0,e1,78,0b,1b,e2,f0,37) +#define SD_MESSAGE_TIME_SYNC_STR SD_ID128_MAKE_STR(7c,8a,41,f3,7b,76,49,41,a0,e1,78,0b,1b,e2,f0,37) + +#define SD_MESSAGE_TIME_BUMP SD_ID128_MAKE(7d,b7,3c,8a,f0,d9,4e,eb,82,2a,e0,43,23,fe,6a,b6) +#define SD_MESSAGE_TIME_BUMP_STR SD_ID128_MAKE_STR(7d,b7,3c,8a,f0,d9,4e,eb,82,2a,e0,43,23,fe,6a,b6) + +#define SD_MESSAGE_SHUTDOWN_SCHEDULED SD_ID128_MAKE(9e,70,66,27,9d,c8,40,3d,a7,9c,e4,b1,a6,90,64,b2) +#define SD_MESSAGE_SHUTDOWN_SCHEDULED_STR SD_ID128_MAKE_STR(9e,70,66,27,9d,c8,40,3d,a7,9c,e4,b1,a6,90,64,b2) + +#define SD_MESSAGE_SHUTDOWN_CANCELED SD_ID128_MAKE(24,9f,6f,b9,e6,e2,42,8c,96,f3,f0,87,56,81,ff,a3) +#define SD_MESSAGE_SHUTDOWN_CANCELED_STR SD_ID128_MAKE_STR(24,9f,6f,b9,e6,e2,42,8c,96,f3,f0,87,56,81,ff,a3) + +#define SD_MESSAGE_TPM_PCR_EXTEND SD_ID128_MAKE(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab) +#define SD_MESSAGE_TPM_PCR_EXTEND_STR SD_ID128_MAKE_STR(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab) + +#define SD_MESSAGE_MEMORY_TRIM SD_ID128_MAKE(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21) +#define SD_MESSAGE_MEMORY_TRIM_STR SD_ID128_MAKE_STR(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21) + +#define SD_MESSAGE_SYSV_GENERATOR_DEPRECATED SD_ID128_MAKE(a8,fa,8d,ac,db,1d,44,3e,95,03,b8,be,36,7a,6a,db) +#define SD_MESSAGE_SYSV_GENERATOR_DEPRECATED_STR SD_ID128_MAKE_STR(a8,fa,8d,ac,db,1d,44,3e,95,03,b8,be,36,7a,6a,db) + +#define SD_MESSAGE_PORTABLE_ATTACHED SD_ID128_MAKE(18,7c,62,eb,1e,7f,46,3b,b5,30,39,4f,52,cb,09,0f) +#define SD_MESSAGE_PORTABLE_ATTACHED_STR SD_ID128_MAKE_STR(18,7c,62,eb,1e,7f,46,3b,b5,30,39,4f,52,cb,09,0f) +#define 
SD_MESSAGE_PORTABLE_DETACHED SD_ID128_MAKE(76,c5,c7,54,d6,28,49,0d,8e,cb,a4,c9,d0,42,11,2b) +#define SD_MESSAGE_PORTABLE_DETACHED_STR SD_ID128_MAKE_STR(76,c5,c7,54,d6,28,49,0d,8e,cb,a4,c9,d0,42,11,2b) + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-ndisc.h b/src/systemd/sd-ndisc.h new file mode 100644 index 0000000..3f93e3a --- /dev/null +++ b/src/systemd/sd-ndisc.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdndiscfoo +#define foosdndiscfoo + +/*** + Copyright © 2014 Intel Corporation. All rights reserved. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include +#include +#include +#include + +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* Neighbor Discovery Options, RFC 4861, Section 4.6 and + * https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml#icmpv6-parameters-5 */ +enum { + SD_NDISC_OPTION_SOURCE_LL_ADDRESS = 1, + SD_NDISC_OPTION_TARGET_LL_ADDRESS = 2, + SD_NDISC_OPTION_PREFIX_INFORMATION = 3, + SD_NDISC_OPTION_MTU = 5, + SD_NDISC_OPTION_ROUTE_INFORMATION = 24, + SD_NDISC_OPTION_RDNSS = 25, + SD_NDISC_OPTION_FLAGS_EXTENSION = 26, + SD_NDISC_OPTION_DNSSL = 31, + SD_NDISC_OPTION_CAPTIVE_PORTAL = 37, + SD_NDISC_OPTION_PREF64 = 38 +}; + +/* Route preference, RFC 4191, Section 2.1 */ +enum { + SD_NDISC_PREFERENCE_LOW = 3U, + SD_NDISC_PREFERENCE_MEDIUM = 0U, + SD_NDISC_PREFERENCE_HIGH = 1U +}; + +typedef struct sd_ndisc sd_ndisc; +typedef struct sd_ndisc_router sd_ndisc_router; + +__extension__ typedef enum sd_ndisc_event_t { + SD_NDISC_EVENT_TIMEOUT, + SD_NDISC_EVENT_ROUTER, + _SD_NDISC_EVENT_MAX, + _SD_NDISC_EVENT_INVALID = -EINVAL, + _SD_ENUM_FORCE_S64(NDISC_EVENT) +} sd_ndisc_event_t; + +typedef void (*sd_ndisc_callback_t)(sd_ndisc *nd, sd_ndisc_event_t event, sd_ndisc_router *rt, void *userdata); + +int sd_ndisc_new(sd_ndisc **ret); +sd_ndisc *sd_ndisc_ref(sd_ndisc *nd); +sd_ndisc *sd_ndisc_unref(sd_ndisc *nd); + +int sd_ndisc_start(sd_ndisc *nd); +int sd_ndisc_stop(sd_ndisc *nd); + +int sd_ndisc_attach_event(sd_ndisc *nd, sd_event *event, int64_t priority); +int sd_ndisc_detach_event(sd_ndisc *nd); +sd_event *sd_ndisc_get_event(sd_ndisc *nd); + +int sd_ndisc_set_callback(sd_ndisc *nd, sd_ndisc_callback_t cb, void *userdata); +int sd_ndisc_set_ifindex(sd_ndisc *nd, int interface_index); +int sd_ndisc_set_ifname(sd_ndisc *nd, const char *interface_name); +int sd_ndisc_get_ifname(sd_ndisc *nd, const char **ret); +int sd_ndisc_set_mac(sd_ndisc *nd, const struct ether_addr *mac_addr); + +sd_ndisc_router 
*sd_ndisc_router_ref(sd_ndisc_router *rt); +sd_ndisc_router *sd_ndisc_router_unref(sd_ndisc_router *rt); + +int sd_ndisc_router_get_address(sd_ndisc_router *rt, struct in6_addr *ret); +int sd_ndisc_router_get_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); +int sd_ndisc_router_get_raw(sd_ndisc_router *rt, const void **ret, size_t *ret_size); + +int sd_ndisc_router_get_hop_limit(sd_ndisc_router *rt, uint8_t *ret); +int sd_ndisc_router_get_icmp6_ratelimit(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_get_flags(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_get_preference(sd_ndisc_router *rt, unsigned *ret); +int sd_ndisc_router_get_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_get_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); +int sd_ndisc_router_get_mtu(sd_ndisc_router *rt, uint32_t *ret); + +/* Generic option access */ +int sd_ndisc_router_option_rewind(sd_ndisc_router *rt); +int sd_ndisc_router_option_next(sd_ndisc_router *rt); +int sd_ndisc_router_option_get_type(sd_ndisc_router *rt, uint8_t *ret); +int sd_ndisc_router_option_is_type(sd_ndisc_router *rt, uint8_t type); +int sd_ndisc_router_option_get_raw(sd_ndisc_router *rt, const void **ret, size_t *ret_size); + +/* Specific option access: SD_NDISC_OPTION_PREFIX_INFORMATION */ +int sd_ndisc_router_prefix_get_valid_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_prefix_get_valid_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); +int sd_ndisc_router_prefix_get_preferred_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_prefix_get_preferred_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); +int sd_ndisc_router_prefix_get_flags(sd_ndisc_router *rt, uint8_t *ret); +int sd_ndisc_router_prefix_get_address(sd_ndisc_router *rt, struct in6_addr *ret); +int sd_ndisc_router_prefix_get_prefixlen(sd_ndisc_router *rt, unsigned *ret); + +/* Specific option 
access: SD_NDISC_OPTION_ROUTE_INFORMATION */ +int sd_ndisc_router_route_get_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_route_get_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); +int sd_ndisc_router_route_get_address(sd_ndisc_router *rt, struct in6_addr *ret); +int sd_ndisc_router_route_get_prefixlen(sd_ndisc_router *rt, unsigned *ret); +int sd_ndisc_router_route_get_preference(sd_ndisc_router *rt, unsigned *ret); + +/* Specific option access: SD_NDISC_OPTION_RDNSS */ +int sd_ndisc_router_rdnss_get_addresses(sd_ndisc_router *rt, const struct in6_addr **ret); +int sd_ndisc_router_rdnss_get_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_rdnss_get_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); + +/* Specific option access: SD_NDISC_OPTION_DNSSL */ +int sd_ndisc_router_dnssl_get_domains(sd_ndisc_router *rt, char ***ret); +int sd_ndisc_router_dnssl_get_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_dnssl_get_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); + +/* Specific option access: SD_NDISC_OPTION_CAPTIVE_PORTAL */ +int sd_ndisc_router_captive_portal_get_uri(sd_ndisc_router *rt, const char **ret, size_t *ret_size); + +/* Specific option access: SD_NDISC_OPTION_PREF64 */ +int sd_ndisc_router_prefix64_get_prefix(sd_ndisc_router *rt, struct in6_addr *ret); +int sd_ndisc_router_prefix64_get_prefixlen(sd_ndisc_router *rt, unsigned *ret); +int sd_ndisc_router_prefix64_get_lifetime(sd_ndisc_router *rt, uint64_t *ret); +int sd_ndisc_router_prefix64_get_lifetime_timestamp(sd_ndisc_router *rt, clockid_t clock, uint64_t *ret); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_ndisc, sd_ndisc_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_ndisc_router, sd_ndisc_router_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-netlink.h b/src/systemd/sd-netlink.h new file mode 100644 index 0000000..4119c45 --- /dev/null +++ 
b/src/systemd/sd-netlink.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdnetlinkhfoo +#define foosdnetlinkhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_netlink sd_netlink; +typedef struct sd_netlink_message sd_netlink_message; +typedef struct sd_netlink_slot sd_netlink_slot; + +/* callback */ +typedef int (*sd_netlink_message_handler_t)(sd_netlink *nl, sd_netlink_message *m, void *userdata); +typedef _sd_destroy_t sd_netlink_destroy_t; + +/* bus */ +int sd_netlink_open(sd_netlink **ret); +int sd_netlink_open_fd(sd_netlink **ret, int fd); +int sd_netlink_increase_rxbuf(sd_netlink *nl, const size_t size); + +sd_netlink *sd_netlink_ref(sd_netlink *nl); +sd_netlink *sd_netlink_unref(sd_netlink *nl); + +int sd_netlink_send(sd_netlink *nl, sd_netlink_message *message, uint32_t *serial); +int sd_netlink_call_async(sd_netlink *nl, sd_netlink_slot **ret_slot, sd_netlink_message *message, + sd_netlink_message_handler_t callback, sd_netlink_destroy_t destroy_callback, + void *userdata, uint64_t usec, const char *description); +int sd_netlink_call(sd_netlink *nl, sd_netlink_message *message, uint64_t timeout, + sd_netlink_message **reply); +int sd_netlink_read(sd_netlink *nl, uint32_t serial, uint64_t 
timeout, sd_netlink_message **reply); + +int sd_netlink_get_events(sd_netlink *nl); +int sd_netlink_get_timeout(sd_netlink *nl, uint64_t *timeout); +int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret); +int sd_netlink_wait(sd_netlink *nl, uint64_t timeout); + +int sd_netlink_add_match(sd_netlink *nl, sd_netlink_slot **ret_slot, uint16_t match, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, const char *description); + +int sd_netlink_attach_event(sd_netlink *nl, sd_event *e, int64_t priority); +int sd_netlink_detach_event(sd_netlink *nl); +sd_event* sd_netlink_get_event(sd_netlink *nl); +int sd_netlink_attach_filter(sd_netlink *nl, size_t len, const struct sock_filter *filter); + +/* Message construction */ +int sd_netlink_message_append_string(sd_netlink_message *m, uint16_t attr_type, const char *data); +int sd_netlink_message_append_strv(sd_netlink_message *m, uint16_t attr_type, const char* const *data); +int sd_netlink_message_append_flag(sd_netlink_message *m, uint16_t attr_type); +int sd_netlink_message_append_u8(sd_netlink_message *m, uint16_t attr_type, uint8_t data); +int sd_netlink_message_append_u16(sd_netlink_message *m, uint16_t attr_type, uint16_t data); +int sd_netlink_message_append_u32(sd_netlink_message *m, uint16_t attr_type, uint32_t data); +int sd_netlink_message_append_u64(sd_netlink_message *m, uint16_t attr_type, uint64_t data); +int sd_netlink_message_append_s8(sd_netlink_message *m, uint16_t attr_type, int8_t data); +int sd_netlink_message_append_s16(sd_netlink_message *m, uint16_t attr_type, int16_t data); +int sd_netlink_message_append_s32(sd_netlink_message *m, uint16_t attr_type, int32_t data); +int sd_netlink_message_append_s64(sd_netlink_message *m, uint16_t attr_type, int64_t data); +int sd_netlink_message_append_data(sd_netlink_message *m, uint16_t attr_type, const void *data, size_t len); +int sd_netlink_message_append_container_data( + sd_netlink_message *m, + 
uint16_t container_type, + uint16_t attr_type, + const void *data, + size_t len); +int sd_netlink_message_append_in_addr(sd_netlink_message *m, uint16_t attr_type, const struct in_addr *data); +int sd_netlink_message_append_in6_addr(sd_netlink_message *m, uint16_t attr_type, const struct in6_addr *data); +int sd_netlink_message_append_sockaddr_in(sd_netlink_message *m, uint16_t attr_type, const struct sockaddr_in *data); +int sd_netlink_message_append_sockaddr_in6(sd_netlink_message *m, uint16_t attr_type, const struct sockaddr_in6 *data); +int sd_netlink_message_append_ether_addr(sd_netlink_message *m, uint16_t attr_type, const struct ether_addr *data); +int sd_netlink_message_append_cache_info(sd_netlink_message *m, uint16_t attr_type, const struct ifa_cacheinfo *info); + +int sd_netlink_message_open_container(sd_netlink_message *m, uint16_t attr_type); +int sd_netlink_message_open_container_union(sd_netlink_message *m, uint16_t attr_type, const char *key); +int sd_netlink_message_close_container(sd_netlink_message *m); + +int sd_netlink_message_open_array(sd_netlink_message *m, uint16_t type); +int sd_netlink_message_cancel_array(sd_netlink_message *m); + +/* Reading messages */ +int sd_netlink_message_read(sd_netlink_message *m, uint16_t attr_type, size_t size, void *data); +int sd_netlink_message_read_data(sd_netlink_message *m, uint16_t attr_type, size_t *ret_size, void **ret_data); +int sd_netlink_message_read_data_suffix0(sd_netlink_message *m, uint16_t attr_type, size_t *ret_size, void **ret_data); +int sd_netlink_message_read_string_strdup(sd_netlink_message *m, uint16_t attr_type, char **data); +int sd_netlink_message_read_string(sd_netlink_message *m, uint16_t attr_type, const char **data); +int sd_netlink_message_read_strv(sd_netlink_message *m, uint16_t container_type, uint16_t attr_type, char ***ret); +int sd_netlink_message_read_u8(sd_netlink_message *m, uint16_t attr_type, uint8_t *data); +int sd_netlink_message_read_u16(sd_netlink_message *m, 
uint16_t attr_type, uint16_t *data); +int sd_netlink_message_read_u32(sd_netlink_message *m, uint16_t attr_type, uint32_t *data); +int sd_netlink_message_read_ether_addr(sd_netlink_message *m, uint16_t attr_type, struct ether_addr *data); +int sd_netlink_message_read_cache_info(sd_netlink_message *m, uint16_t attr_type, struct ifa_cacheinfo *info); +int sd_netlink_message_read_in_addr(sd_netlink_message *m, uint16_t attr_type, struct in_addr *data); +int sd_netlink_message_read_in6_addr(sd_netlink_message *m, uint16_t attr_type, struct in6_addr *data); +int sd_netlink_message_has_flag(sd_netlink_message *m, uint16_t attr_type); +int sd_netlink_message_enter_container(sd_netlink_message *m, uint16_t attr_type); +int sd_netlink_message_enter_array(sd_netlink_message *m, uint16_t attr_type); +int sd_netlink_message_exit_container(sd_netlink_message *m); + +int sd_netlink_message_rewind(sd_netlink_message *m, sd_netlink *nl); + +sd_netlink_message *sd_netlink_message_next(sd_netlink_message *m); + +sd_netlink_message *sd_netlink_message_ref(sd_netlink_message *m); +sd_netlink_message *sd_netlink_message_unref(sd_netlink_message *m); + +int sd_netlink_message_set_request_dump(sd_netlink_message *m, int dump); +int sd_netlink_message_is_error(sd_netlink_message *m); +int sd_netlink_message_get_errno(sd_netlink_message *m); +int sd_netlink_message_get_type(sd_netlink_message *m, uint16_t *type); +int sd_netlink_message_set_flags(sd_netlink_message *m, uint16_t flags); +int sd_netlink_message_is_broadcast(sd_netlink_message *m); +int sd_netlink_message_get_max_attribute(sd_netlink_message *m, uint16_t *ret); + +/* rtnl */ +int sd_rtnl_message_get_family(sd_netlink_message *m, int *family); + +int sd_rtnl_message_new_addr(sd_netlink *nl, sd_netlink_message **ret, uint16_t msg_type, int index, int family); +int sd_rtnl_message_new_addr_update(sd_netlink *nl, sd_netlink_message **ret, int index, int family); +int sd_rtnl_message_addr_set_prefixlen(sd_netlink_message *m, 
unsigned char prefixlen); +int sd_rtnl_message_addr_set_scope(sd_netlink_message *m, unsigned char scope); +int sd_rtnl_message_addr_set_flags(sd_netlink_message *m, unsigned char flags); +int sd_rtnl_message_addr_get_family(sd_netlink_message *m, int *family); +int sd_rtnl_message_addr_get_prefixlen(sd_netlink_message *m, unsigned char *ret_prefixlen); +int sd_rtnl_message_addr_get_scope(sd_netlink_message *m, unsigned char *ret_scope); +int sd_rtnl_message_addr_get_flags(sd_netlink_message *m, unsigned char *ret_flags); +int sd_rtnl_message_addr_get_ifindex(sd_netlink_message *m, int *ret_ifindex); + +int sd_rtnl_message_new_link(sd_netlink *nl, sd_netlink_message **ret, uint16_t msg_type, int index); +int sd_rtnl_message_link_set_flags(sd_netlink_message *m, unsigned flags, unsigned change); +int sd_rtnl_message_link_set_type(sd_netlink_message *m, unsigned type); +int sd_rtnl_message_link_set_family(sd_netlink_message *m, unsigned family); +int sd_rtnl_message_link_get_ifindex(sd_netlink_message *m, int *ifindex); +int sd_rtnl_message_link_get_flags(sd_netlink_message *m, unsigned *flags); +int sd_rtnl_message_link_get_type(sd_netlink_message *m, unsigned short *type); + +int sd_rtnl_message_new_route(sd_netlink *nl, sd_netlink_message **ret, uint16_t nlmsg_type, int rtm_family, unsigned char rtm_protocol); +int sd_rtnl_message_route_set_dst_prefixlen(sd_netlink_message *m, unsigned char prefixlen); +int sd_rtnl_message_route_set_src_prefixlen(sd_netlink_message *m, unsigned char prefixlen); +int sd_rtnl_message_route_set_scope(sd_netlink_message *m, unsigned char scope); +int sd_rtnl_message_route_set_flags(sd_netlink_message *m, unsigned flags); +int sd_rtnl_message_route_set_table(sd_netlink_message *m, unsigned char table); +int sd_rtnl_message_route_set_type(sd_netlink_message *m, unsigned char type); +int sd_rtnl_message_route_get_flags(sd_netlink_message *m, unsigned *flags); +int sd_rtnl_message_route_get_family(sd_netlink_message *m, int *family); +int 
sd_rtnl_message_route_get_protocol(sd_netlink_message *m, unsigned char *protocol); +int sd_rtnl_message_route_get_scope(sd_netlink_message *m, unsigned char *scope); +int sd_rtnl_message_route_get_tos(sd_netlink_message *m, unsigned char *tos); +int sd_rtnl_message_route_get_table(sd_netlink_message *m, unsigned char *table); +int sd_rtnl_message_route_get_dst_prefixlen(sd_netlink_message *m, unsigned char *dst_len); +int sd_rtnl_message_route_get_src_prefixlen(sd_netlink_message *m, unsigned char *src_len); +int sd_rtnl_message_route_get_type(sd_netlink_message *m, unsigned char *type); + +int sd_rtnl_message_new_nexthop(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, int nh_family, unsigned char nh_protocol); +int sd_rtnl_message_nexthop_set_flags(sd_netlink_message *m, uint8_t flags); +int sd_rtnl_message_nexthop_get_flags(sd_netlink_message *m, uint8_t *ret); +int sd_rtnl_message_nexthop_get_family(sd_netlink_message *m, uint8_t *family); +int sd_rtnl_message_nexthop_get_protocol(sd_netlink_message *m, uint8_t *protocol); + +int sd_rtnl_message_new_neigh(sd_netlink *nl, sd_netlink_message **ret, uint16_t nlmsg_type, int index, int nda_family); +int sd_rtnl_message_neigh_set_flags(sd_netlink_message *m, uint8_t flags); +int sd_rtnl_message_neigh_set_state(sd_netlink_message *m, uint16_t state); +int sd_rtnl_message_neigh_get_family(sd_netlink_message *m, int *family); +int sd_rtnl_message_neigh_get_ifindex(sd_netlink_message *m, int *index); +int sd_rtnl_message_neigh_get_state(sd_netlink_message *m, uint16_t *state); +int sd_rtnl_message_neigh_get_flags(sd_netlink_message *m, uint8_t *flags); + +int sd_rtnl_message_new_addrlabel(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, int ifindex, int ifal_family); +int sd_rtnl_message_addrlabel_set_prefixlen(sd_netlink_message *m, unsigned char prefixlen); +int sd_rtnl_message_addrlabel_get_prefixlen(sd_netlink_message *m, unsigned char *prefixlen); + +int 
sd_rtnl_message_new_routing_policy_rule(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, int ifal_family); +int sd_rtnl_message_routing_policy_rule_set_tos(sd_netlink_message *m, uint8_t tos); +int sd_rtnl_message_routing_policy_rule_get_tos(sd_netlink_message *m, uint8_t *tos); +int sd_rtnl_message_routing_policy_rule_set_table(sd_netlink_message *m, uint8_t table); +int sd_rtnl_message_routing_policy_rule_get_table(sd_netlink_message *m, uint8_t *table); +int sd_rtnl_message_routing_policy_rule_set_fib_src_prefixlen(sd_netlink_message *m, uint8_t len); +int sd_rtnl_message_routing_policy_rule_get_fib_src_prefixlen(sd_netlink_message *m, uint8_t *len); +int sd_rtnl_message_routing_policy_rule_set_fib_dst_prefixlen(sd_netlink_message *m, uint8_t len); +int sd_rtnl_message_routing_policy_rule_get_fib_dst_prefixlen(sd_netlink_message *m, uint8_t *len); +int sd_rtnl_message_routing_policy_rule_set_fib_type(sd_netlink_message *m, uint8_t type); +int sd_rtnl_message_routing_policy_rule_get_fib_type(sd_netlink_message *m, uint8_t *type); +int sd_rtnl_message_routing_policy_rule_set_flags(sd_netlink_message *m, uint32_t flags); +int sd_rtnl_message_routing_policy_rule_get_flags(sd_netlink_message *m, uint32_t *flags); + +int sd_rtnl_message_new_traffic_control(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, + int ifindex, uint32_t handle, uint32_t parent); +int sd_rtnl_message_traffic_control_get_ifindex(sd_netlink_message *m, int *ret); +int sd_rtnl_message_traffic_control_get_handle(sd_netlink_message *m, uint32_t *ret); +int sd_rtnl_message_traffic_control_get_parent(sd_netlink_message *m, uint32_t *ret); + +int sd_rtnl_message_new_mdb(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, int mdb_ifindex); + +/* genl */ +int sd_genl_socket_open(sd_netlink **ret); +int sd_genl_message_new(sd_netlink *genl, const char *family_name, uint8_t cmd, sd_netlink_message **ret); +int sd_genl_message_get_family_name(sd_netlink *genl, 
sd_netlink_message *m, const char **ret); +int sd_genl_message_get_command(sd_netlink *genl, sd_netlink_message *m, uint8_t *ret); +int sd_genl_add_match(sd_netlink *nl, sd_netlink_slot **ret_slot, const char *family_name, + const char *multicast_group_name, uint8_t command, + sd_netlink_message_handler_t callback, + sd_netlink_destroy_t destroy_callback, + void *userdata, const char *description); + +/* slot */ +sd_netlink_slot *sd_netlink_slot_ref(sd_netlink_slot *slot); +sd_netlink_slot *sd_netlink_slot_unref(sd_netlink_slot *slot); + +sd_netlink *sd_netlink_slot_get_netlink(sd_netlink_slot *slot); +void *sd_netlink_slot_get_userdata(sd_netlink_slot *slot); +void *sd_netlink_slot_set_userdata(sd_netlink_slot *slot, void *userdata); +int sd_netlink_slot_get_destroy_callback(sd_netlink_slot *slot, sd_netlink_destroy_t *callback); +int sd_netlink_slot_set_destroy_callback(sd_netlink_slot *slot, sd_netlink_destroy_t callback); +int sd_netlink_slot_get_floating(sd_netlink_slot *slot); +int sd_netlink_slot_set_floating(sd_netlink_slot *slot, int b); +int sd_netlink_slot_get_description(sd_netlink_slot *slot, const char **description); +int sd_netlink_slot_set_description(sd_netlink_slot *slot, const char *description); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_netlink, sd_netlink_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_netlink_message, sd_netlink_message_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_netlink_slot, sd_netlink_slot_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-network.h b/src/systemd/sd-network.h new file mode 100644 index 0000000..d292719 --- /dev/null +++ b/src/systemd/sd-network.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdnetworkhfoo +#define foosdnetworkhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your 
option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <https://www.gnu.org/licenses/>. +***/ + +#include <inttypes.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "_sd-common.h" + +/* + * A few points: + * + * Instead of returning an empty string array or empty integer array, we + * may return NULL. + * + * Free the data the library returns with libc free(). String arrays + * are NULL terminated, and you need to free the array itself in + * addition to the strings contained. + * + * We return error codes as negative errno, kernel-style. On success, we + * return 0 or positive. + * + * These functions access data in /run. This is a virtual file system; + * therefore, accesses are relatively cheap. + * + * See sd-network(3) for more information. + */ + +_SD_BEGIN_DECLARATIONS; + +/* Get overall operational state + * Possible states: down, up, dormant, carrier, degraded, routable + * Possible return codes: + * -ENODATA: networkd is not aware of any links + */ +int sd_network_get_operational_state(char **ret); +int sd_network_get_carrier_state(char **ret); +int sd_network_get_address_state(char **ret); +int sd_network_get_ipv4_address_state(char **ret); +int sd_network_get_ipv6_address_state(char **ret); +int sd_network_get_online_state(char **ret); + +/* Get DNS entries for all links. These are string representations of + * IP addresses */ +int sd_network_get_dns(char ***ret); + +/* Get NTP entries for all links. These are domain names or string + * representations of IP addresses */ +int sd_network_get_ntp(char ***ret); + +/* Get the search domains for all links. */ +int sd_network_get_search_domains(char ***ret); + +/* Get the route domains for all links.
*/ +int sd_network_get_route_domains(char ***ret); + +/* Get setup state from ifindex. + * Possible states: + * pending: udev is still processing the link, we don't yet know if we will manage it + * failed: networkd failed to manage the link + * configuring: in the process of retrieving configuration or configuring the link + * configured: link configured successfully + * unmanaged: networkd is not handling the link + * linger: the link is gone, but has not yet been dropped by networkd + * Possible return codes: + * -ENODATA: networkd is not aware of the link + */ +int sd_network_link_get_setup_state(int ifindex, char **ret); + +/* Get operational state from ifindex. + * Possible states: + * off: the device is powered down + * no-carrier: the device is powered up, but it does not yet have a carrier + * dormant: the device has a carrier, but is not yet ready for normal traffic + * carrier: the link has a carrier + * degraded: the link has carrier and addresses valid on the local link configured + * routable: the link has carrier and routable address configured + * Possible return codes: + * -ENODATA: networkd is not aware of the link + */ +int sd_network_link_get_operational_state(int ifindex, char **ret); +int sd_network_link_get_required_operstate_for_online(int ifindex, char **ret); +int sd_network_link_get_required_family_for_online(int ifindex, char **ret); +int sd_network_link_get_carrier_state(int ifindex, char **ret); +int sd_network_link_get_address_state(int ifindex, char **ret); +int sd_network_link_get_ipv4_address_state(int ifindex, char **ret); +int sd_network_link_get_ipv6_address_state(int ifindex, char **ret); +int sd_network_link_get_online_state(int ifindex, char **ret); + +/* Indicates whether the network is relevant to being online. 
+ * Possible return codes: + * 0: the connection is not required + * 1: the connection is required to consider the system online + * <0: networkd is not aware of the link + */ +int sd_network_link_get_required_for_online(int ifindex); + +/* Get activation policy for ifindex. + * Possible values are as specified for ActivationPolicy= + */ +int sd_network_link_get_activation_policy(int ifindex, char **ret); + +/* Get path to .network file applied to link */ +int sd_network_link_get_network_file(int ifindex, char **ret); + +/* Get paths to .network file dropins applied to link */ +int sd_network_link_get_network_file_dropins(int ifindex, char ***ret); + +/* Get DNS entries for a given link. These are string representations of + * IP addresses */ +int sd_network_link_get_dns(int ifindex, char ***ret); + +/* Get NTP entries for a given link. These are domain names or string + * representations of IP addresses */ +int sd_network_link_get_ntp(int ifindex, char ***ret); + +/* Get SIP entries for a given link. These are string + * representations of IP addresses */ +int sd_network_link_get_sip(int ifindex, char ***ret); + +/* Get the captive portal address for a given link. */ +int sd_network_link_get_captive_portal(int ifindex, char **ret); + +/* Indicates whether or not LLMNR should be enabled for the link + * Possible levels of support: yes, no, resolve + * Possible return codes: + * -ENODATA: networkd is not aware of the link + */ +int sd_network_link_get_llmnr(int ifindex, char **ret); + +/* Indicates whether or not MulticastDNS should be enabled for the + * link. + * Possible levels of support: yes, no, resolve + * Possible return codes: + * -ENODATA: networkd is not aware of the link + */ +int sd_network_link_get_mdns(int ifindex, char **ret); + +/* Indicates whether or not DNS-over-TLS should be enabled for the + * link. 
+ * Possible levels of support: yes, no, opportunistic + * Possible return codes: + * -ENODATA: networkd is not aware of the link + */ +int sd_network_link_get_dns_over_tls(int ifindex, char **ret); + +/* Indicates whether or not DNSSEC should be enabled for the link + * Possible levels of support: yes, no, allow-downgrade + * Possible return codes: + * -ENODATA: networkd is not aware of the link + */ +int sd_network_link_get_dnssec(int ifindex, char **ret); + +/* Returns the list of per-interface DNSSEC negative trust anchors + * Possible return codes: + * -ENODATA: networkd is not aware of the link, or has no such data + */ +int sd_network_link_get_dnssec_negative_trust_anchors(int ifindex, char ***ret); + +/* Get the search DNS domain names for a given link. */ +int sd_network_link_get_search_domains(int ifindex, char ***ret); + +/* Get the route DNS domain names for a given link. */ +int sd_network_link_get_route_domains(int ifindex, char ***ret); + +/* Get whether this link shall be used as 'default route' for DNS queries */ +int sd_network_link_get_dns_default_route(int ifindex); + +/* Get the carrier interface indexes to which the current link is bound. */ +int sd_network_link_get_carrier_bound_to(int ifindex, int **ret); + +/* Get the carrier interface indexes that are bound to the current link. */ +int sd_network_link_get_carrier_bound_by(int ifindex, int **ret); + +/* Get DHCPv6 client IAID for a given link. */ +int sd_network_link_get_dhcp6_client_iaid_string(int ifindex, char **ret); + +/* Get DHCPv6 client DUID for a given link. */ +int sd_network_link_get_dhcp6_client_duid_string(int ifindex, char **ret); + +int sd_network_link_get_stat(int ifindex, struct stat *ret); + +/* Monitor object */ +typedef struct sd_network_monitor sd_network_monitor; + +/* Create a new monitor. Category must be NULL, "links" or "leases". */ +int sd_network_monitor_new(sd_network_monitor **ret, const char *category); + +/* Destroys the passed monitor. Returns NULL.
*/ +sd_network_monitor* sd_network_monitor_unref(sd_network_monitor *m); + +/* Flushes the monitor */ +int sd_network_monitor_flush(sd_network_monitor *m); + +/* Get FD from monitor */ +int sd_network_monitor_get_fd(sd_network_monitor *m); + +/* Get poll() mask to monitor */ +int sd_network_monitor_get_events(sd_network_monitor *m); + +/* Get timeout for poll(), as usec value relative to CLOCK_MONOTONIC's epoch */ +int sd_network_monitor_get_timeout(sd_network_monitor *m, uint64_t *ret_usec); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_network_monitor, sd_network_monitor_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-path.h b/src/systemd/sd-path.h new file mode 100644 index 0000000..fcd90aa --- /dev/null +++ b/src/systemd/sd-path.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdpathhfoo +#define foosdpathhfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +enum { + /* Temporary files */ + SD_PATH_TEMPORARY, + SD_PATH_TEMPORARY_LARGE, + + /* Vendor supplied data */ + SD_PATH_SYSTEM_BINARIES, + SD_PATH_SYSTEM_INCLUDE, + SD_PATH_SYSTEM_LIBRARY_PRIVATE, + SD_PATH_SYSTEM_LIBRARY_ARCH, + SD_PATH_SYSTEM_SHARED, + SD_PATH_SYSTEM_CONFIGURATION_FACTORY, + SD_PATH_SYSTEM_STATE_FACTORY, + + /* System configuration, runtime, state, ... 
*/ + SD_PATH_SYSTEM_CONFIGURATION, + SD_PATH_SYSTEM_RUNTIME, + SD_PATH_SYSTEM_RUNTIME_LOGS, + SD_PATH_SYSTEM_STATE_PRIVATE, + SD_PATH_SYSTEM_STATE_LOGS, + SD_PATH_SYSTEM_STATE_CACHE, + SD_PATH_SYSTEM_STATE_SPOOL, + + /* Vendor supplied data */ + SD_PATH_USER_BINARIES, + SD_PATH_USER_LIBRARY_PRIVATE, + SD_PATH_USER_LIBRARY_ARCH, + SD_PATH_USER_SHARED, + + /* User configuration, state, runtime ... */ + SD_PATH_USER_CONFIGURATION, + SD_PATH_USER_RUNTIME, + SD_PATH_USER_STATE_CACHE, + /* → SD_PATH_USER_STATE_PRIVATE is added at the bottom */ + + /* User resources */ + SD_PATH_USER, /* $HOME itself */ + SD_PATH_USER_DOCUMENTS, + SD_PATH_USER_MUSIC, + SD_PATH_USER_PICTURES, + SD_PATH_USER_VIDEOS, + SD_PATH_USER_DOWNLOAD, + SD_PATH_USER_PUBLIC, + SD_PATH_USER_TEMPLATES, + SD_PATH_USER_DESKTOP, + + /* Search paths */ + SD_PATH_SEARCH_BINARIES, + SD_PATH_SEARCH_BINARIES_DEFAULT, + SD_PATH_SEARCH_LIBRARY_PRIVATE, + SD_PATH_SEARCH_LIBRARY_ARCH, + SD_PATH_SEARCH_SHARED, + SD_PATH_SEARCH_CONFIGURATION_FACTORY, + SD_PATH_SEARCH_STATE_FACTORY, + SD_PATH_SEARCH_CONFIGURATION, + + /* Various systemd paths, generally mirroring systemd.pc — Except we drop the "dir" suffix (and + * replaces "path" by "search"), since this API is about dirs/paths anyway, and contains "path" + * already in the prefix */ + SD_PATH_SYSTEMD_UTIL, + + SD_PATH_SYSTEMD_SYSTEM_UNIT, + SD_PATH_SYSTEMD_SYSTEM_PRESET, + SD_PATH_SYSTEMD_SYSTEM_CONF, + SD_PATH_SYSTEMD_USER_UNIT, + SD_PATH_SYSTEMD_USER_PRESET, + SD_PATH_SYSTEMD_USER_CONF, + + SD_PATH_SYSTEMD_SEARCH_SYSTEM_UNIT, + SD_PATH_SYSTEMD_SEARCH_USER_UNIT, + + SD_PATH_SYSTEMD_SYSTEM_GENERATOR, + SD_PATH_SYSTEMD_USER_GENERATOR, + SD_PATH_SYSTEMD_SEARCH_SYSTEM_GENERATOR, + SD_PATH_SYSTEMD_SEARCH_USER_GENERATOR, + + SD_PATH_SYSTEMD_SLEEP, + SD_PATH_SYSTEMD_SHUTDOWN, + + SD_PATH_TMPFILES, + SD_PATH_SYSUSERS, + SD_PATH_SYSCTL, + SD_PATH_BINFMT, + SD_PATH_MODULES_LOAD, + SD_PATH_CATALOG, + + /* systemd-networkd search paths */ + SD_PATH_SYSTEMD_SEARCH_NETWORK, + + 
/* systemd environment generators */ + SD_PATH_SYSTEMD_SYSTEM_ENVIRONMENT_GENERATOR, + SD_PATH_SYSTEMD_USER_ENVIRONMENT_GENERATOR, + SD_PATH_SYSTEMD_SEARCH_SYSTEM_ENVIRONMENT_GENERATOR, + SD_PATH_SYSTEMD_SEARCH_USER_ENVIRONMENT_GENERATOR, + + SD_PATH_USER_STATE_PRIVATE, + + _SD_PATH_MAX +}; + +int sd_path_lookup(uint64_t type, const char *suffix, char **path); +int sd_path_lookup_strv(uint64_t type, const char *suffix, char ***paths); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-radv.h b/src/systemd/sd-radv.h new file mode 100644 index 0000000..8ea0838 --- /dev/null +++ b/src/systemd/sd-radv.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdradvfoo +#define foosdradvfoo + +/*** + Copyright © 2017 Intel Corporation. All rights reserved. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include +#include +#include + +#include "_sd-common.h" +#include "sd-event.h" +#include "sd-ndisc.h" + +_SD_BEGIN_DECLARATIONS; + +typedef struct sd_radv sd_radv; +typedef struct sd_radv_prefix sd_radv_prefix; +typedef struct sd_radv_route_prefix sd_radv_route_prefix; +typedef struct sd_radv_pref64_prefix sd_radv_pref64_prefix; + +/* Router Advertisement */ +int sd_radv_new(sd_radv **ret); +sd_radv *sd_radv_ref(sd_radv *ra); +sd_radv *sd_radv_unref(sd_radv *ra); + +int sd_radv_attach_event(sd_radv *ra, sd_event *event, int64_t priority); +int sd_radv_detach_event(sd_radv *nd); +sd_event *sd_radv_get_event(sd_radv *ra); + +int sd_radv_start(sd_radv *ra); +int sd_radv_stop(sd_radv *ra); +int sd_radv_is_running(sd_radv *ra); + +int sd_radv_set_ifindex(sd_radv *ra, int interface_index); +int sd_radv_set_ifname(sd_radv *ra, const char *interface_name); +int sd_radv_get_ifname(sd_radv *ra, const char **ret); +int sd_radv_set_mac(sd_radv *ra, const struct ether_addr *mac_addr); +int sd_radv_set_mtu(sd_radv *ra, uint32_t mtu); +int sd_radv_set_hop_limit(sd_radv *ra, uint8_t hop_limit); +int sd_radv_set_retransmit(sd_radv *ra, uint64_t usec); +int sd_radv_set_router_lifetime(sd_radv *ra, uint64_t usec); +int sd_radv_set_managed_information(sd_radv *ra, int managed); +int sd_radv_set_other_information(sd_radv *ra, int other); +int sd_radv_set_preference(sd_radv *ra, unsigned preference); +int sd_radv_add_prefix(sd_radv *ra, sd_radv_prefix *p); +int sd_radv_add_route_prefix(sd_radv *ra, sd_radv_route_prefix *p); +int sd_radv_add_pref64_prefix(sd_radv *ra, sd_radv_pref64_prefix *p); +void sd_radv_remove_prefix(sd_radv *ra, const struct in6_addr *prefix, unsigned char prefixlen); +int sd_radv_set_rdnss(sd_radv *ra, uint64_t lifetime_usec, + const struct in6_addr *dns, size_t n_dns); +int sd_radv_set_dnssl(sd_radv *ra, uint64_t lifetime_usec, char **search_list); + +/* Advertised prefixes */ +int sd_radv_prefix_new(sd_radv_prefix **ret); +sd_radv_prefix 
*sd_radv_prefix_ref(sd_radv_prefix *ra); +sd_radv_prefix *sd_radv_prefix_unref(sd_radv_prefix *ra); + +int sd_radv_prefix_set_prefix(sd_radv_prefix *p, const struct in6_addr *in6_addr, + unsigned char prefixlen); +int sd_radv_prefix_get_prefix(sd_radv_prefix *p, struct in6_addr *ret_in6_addr, + unsigned char *ret_prefixlen); +int sd_radv_prefix_set_onlink(sd_radv_prefix *p, int onlink); +int sd_radv_prefix_set_address_autoconfiguration(sd_radv_prefix *p, + int address_autoconfiguration); +int sd_radv_prefix_set_valid_lifetime(sd_radv_prefix *p, uint64_t lifetime_usec, uint64_t valid_until); +int sd_radv_prefix_set_preferred_lifetime(sd_radv_prefix *p, uint64_t lifetime_usec, uint64_t valid_until); + +int sd_radv_route_prefix_new(sd_radv_route_prefix **ret); +sd_radv_route_prefix *sd_radv_route_prefix_ref(sd_radv_route_prefix *ra); +sd_radv_route_prefix *sd_radv_route_prefix_unref(sd_radv_route_prefix *ra); + +int sd_radv_route_prefix_set_prefix(sd_radv_route_prefix *p, const struct in6_addr *in6_addr, unsigned char prefixlen); +int sd_radv_route_prefix_set_lifetime(sd_radv_route_prefix *p, uint64_t lifetime_usec, uint64_t valid_until); + +int sd_radv_pref64_prefix_new(sd_radv_pref64_prefix **ret); +int sd_radv_pref64_prefix_set_prefix(sd_radv_pref64_prefix *p, const struct in6_addr *prefix, + uint8_t prefixlen, uint64_t lifetime_usec); +sd_radv_pref64_prefix *sd_radv_pref64_prefix_ref(sd_radv_pref64_prefix *ra); +sd_radv_pref64_prefix *sd_radv_pref64_prefix_unref(sd_radv_pref64_prefix *ra); + +/* Mobile IPv6 extension: Home Agent Info. 
*/ +int sd_radv_set_home_agent_information(sd_radv *ra, int home_agent); +int sd_radv_set_home_agent_preference(sd_radv *ra, uint16_t preference); +int sd_radv_set_home_agent_lifetime(sd_radv *ra, uint64_t usec); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_radv, sd_radv_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_radv_prefix, sd_radv_prefix_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_radv_route_prefix, sd_radv_route_prefix_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_radv_pref64_prefix, sd_radv_pref64_prefix_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-resolve.h b/src/systemd/sd-resolve.h new file mode 100644 index 0000000..f5dfae8 --- /dev/null +++ b/src/systemd/sd-resolve.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdresolvehfoo +#define foosdresolvehfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +/* 'struct addrinfo' needs _GNU_SOURCE */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#include +#include +#include +#include + +#include "sd-event.h" + +#include "_sd-common.h" + +_SD_BEGIN_DECLARATIONS; + +/* An opaque sd-resolve session structure */ +typedef struct sd_resolve sd_resolve; + +/* An opaque sd-resolve query structure */ +typedef struct sd_resolve_query sd_resolve_query; + +/* A callback on completion */ +typedef int (*sd_resolve_getaddrinfo_handler_t)(sd_resolve_query *q, int ret, const struct addrinfo *ai, void *userdata); +typedef int (*sd_resolve_getnameinfo_handler_t)(sd_resolve_query *q, int ret, const char *host, const char *serv, void *userdata); +typedef _sd_destroy_t sd_resolve_destroy_t; + +enum { + SD_RESOLVE_GET_HOST = 1 << 0, + SD_RESOLVE_GET_SERVICE = 1 << 1, + SD_RESOLVE_GET_BOTH = SD_RESOLVE_GET_HOST | SD_RESOLVE_GET_SERVICE +}; + +int sd_resolve_default(sd_resolve **ret); + +/* Allocate a new sd-resolve session. */ +int sd_resolve_new(sd_resolve **ret); + +/* Free a sd-resolve session. This destroys all attached + * sd_resolve_query objects automatically. */ +sd_resolve* sd_resolve_unref(sd_resolve *resolve); +sd_resolve* sd_resolve_ref(sd_resolve *resolve); + +/* Return the UNIX file descriptor to poll() for events on. Use this + * function to integrate sd-resolve with your custom main loop. */ +int sd_resolve_get_fd(sd_resolve *resolve); + +/* Return the poll() events (a combination of flags like POLLIN, + * POLLOUT, ...) to check for. */ +int sd_resolve_get_events(sd_resolve *resolve); + +/* Return the poll() timeout to pass. Returns UINT64_MAX as + * timeout if no timeout is needed. */ +int sd_resolve_get_timeout(sd_resolve *resolve, uint64_t *timeout_usec); + +/* Process pending responses. After this function is called, you can + * get the next completed query object(s) using + * sd_resolve_get_next(). */ +int sd_resolve_process(sd_resolve *resolve); + +/* Wait for a resolve event to complete. 
*/ +int sd_resolve_wait(sd_resolve *resolve, uint64_t timeout_usec); + +int sd_resolve_get_tid(sd_resolve *resolve, pid_t *tid); + +int sd_resolve_attach_event(sd_resolve *resolve, sd_event *e, int64_t priority); +int sd_resolve_detach_event(sd_resolve *resolve); +sd_event *sd_resolve_get_event(sd_resolve *resolve); + +/* Issue a name-to-address query on the specified session. The + * arguments are compatible with those of libc's + * getaddrinfo(3). The function returns a new query object. When the + * query is completed, the results are passed to the specified + * callback. */ +int sd_resolve_getaddrinfo(sd_resolve *resolve, sd_resolve_query **q, const char *node, const char *service, const struct addrinfo *hints, sd_resolve_getaddrinfo_handler_t callback, void *userdata); + +/* Issue an address-to-name query on the specified session. The + * arguments are compatible with those of libc's + * getnameinfo(3). The function returns a new query object. When the + * query is completed, the results are passed to the specified + * callback. Set SD_RESOLVE_GET_HOST (resp. SD_RESOLVE_GET_SERVICE) in + * 'get' if you want to query the hostname (resp. the service name). */ +int sd_resolve_getnameinfo(sd_resolve *resolve, sd_resolve_query **q, const struct sockaddr *sa, socklen_t salen, int flags, uint64_t get, sd_resolve_getnameinfo_handler_t callback, void *userdata); + +sd_resolve_query *sd_resolve_query_ref(sd_resolve_query *q); +sd_resolve_query *sd_resolve_query_unref(sd_resolve_query *q); + +/* Returns non-zero when the query operation specified by q has been completed.
*/ +int sd_resolve_query_is_done(sd_resolve_query *q); + +void *sd_resolve_query_get_userdata(sd_resolve_query *q); +void *sd_resolve_query_set_userdata(sd_resolve_query *q, void *userdata); +int sd_resolve_query_get_destroy_callback(sd_resolve_query *q, sd_resolve_destroy_t *destroy_callback); +int sd_resolve_query_set_destroy_callback(sd_resolve_query *q, sd_resolve_destroy_t destroy_callback); +int sd_resolve_query_get_floating(sd_resolve_query *q); +int sd_resolve_query_set_floating(sd_resolve_query *q, int b); + +sd_resolve *sd_resolve_query_get_resolve(sd_resolve_query *q); + +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_resolve, sd_resolve_unref); +_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_resolve_query, sd_resolve_query_unref); + +_SD_END_DECLARATIONS; + +#endif diff --git a/src/systemd/sd-utf8.h b/src/systemd/sd-utf8.h new file mode 100644 index 0000000..556b215 --- /dev/null +++ b/src/systemd/sd-utf8.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#ifndef foosdutf8hfoo +#define foosdutf8hfoo + +/*** + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/
+
+#include "_sd-common.h"
+
+_SD_BEGIN_DECLARATIONS;
+
+_sd_pure_ const char *sd_utf8_is_valid(const char *s);
+_sd_pure_ const char *sd_ascii_is_valid(const char *s);
+
+_SD_END_DECLARATIONS;
+
+#endif
diff --git a/src/sysupdate/meson.build b/src/sysupdate/meson.build
new file mode 100644
index 0000000..417b47a
--- /dev/null
+++ b/src/sysupdate/meson.build
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+systemd_sysupdate_sources = files(
+        'sysupdate-cache.c',
+        'sysupdate-instance.c',
+        'sysupdate-partition.c',
+        'sysupdate-pattern.c',
+        'sysupdate-resource.c',
+        'sysupdate-transfer.c',
+        'sysupdate-update-set.c',
+        'sysupdate.c',
+)
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-sysupdate',
+                'public' : true,
+                'conditions' : ['ENABLE_SYSUPDATE'],
+                'sources' : systemd_sysupdate_sources,
+                'link_with' : [
+                        libshared,
+                        libshared_fdisk,
+                ],
+                'dependencies' : [
+                        libblkid,
+                        libfdisk,
+                        libopenssl,
+                        threads,
+                ],
+        },
+]
diff --git a/src/sysupdate/sysupdate-cache.c b/src/sysupdate/sysupdate-cache.c
new file mode 100644
index 0000000..8dad3ee
--- /dev/null
+++ b/src/sysupdate/sysupdate-cache.c
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "memory-util.h"
+#include "sysupdate-cache.h"
+
+#define WEB_CACHE_ENTRIES_MAX 64U
+#define WEB_CACHE_ITEM_SIZE_MAX (64U*1024U*1024U)
+
+static WebCacheItem* web_cache_item_free(WebCacheItem *i) {
+        if (!i)
+                return NULL;
+
+        free(i->url);
+        return mfree(i);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(WebCacheItem*, web_cache_item_free);
+
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(web_cache_hash_ops, char, string_hash_func, string_compare_func, WebCacheItem, web_cache_item_free);
+
+/* Stores a copy of the specified data in the cache under the given URL, replacing any entry previously
+ * cached for that URL. Returns 1 if a new item was stored, 0 if an item with the same URL, verification
+ * state and contents is already cached, -E2BIG if the data is larger than WEB_CACHE_ITEM_SIZE_MAX, -ENOSPC
+ * if the cache is full, and another negative errno-style error on allocation or hashmap failure. */
+int web_cache_add_item(
+                Hashmap **web_cache,
+                const char *url,
+                bool verified,
+                const void *data,
+                size_t size) {
+
+        _cleanup_(web_cache_item_freep) WebCacheItem *item = NULL;
+        _cleanup_free_ char *u = NULL;
+        WebCacheItem *existing;
+        int r;
+
+        assert(web_cache);
+        assert(url);
+        assert(data || size == 0);
+
+        if (size > WEB_CACHE_ITEM_SIZE_MAX)
+                return -E2BIG;
+
+        /* The item looked up here remains owned by the hashmap, hence keep it in a plain pointer instead of
+         * the auto-freed 'item' variable: otherwise any of the early returns below would free an entry the
+         * hashmap still references. */
+        existing = web_cache_get_item(*web_cache, url, verified);
+        if (existing && memcmp_nn(existing->data, existing->size, data, size) == 0)
+                return 0;
+
+        if (hashmap_size(*web_cache) >= (size_t) (WEB_CACHE_ENTRIES_MAX + !!hashmap_get(*web_cache, url)))
+                return -ENOSPC;
+
+        r = hashmap_ensure_allocated(web_cache, &web_cache_hash_ops);
+        if (r < 0)
+                return r;
+
+        u = strdup(url);
+        if (!u)
+                return -ENOMEM;
+
+        item = malloc(offsetof(WebCacheItem, data) + size + 1);
+        if (!item)
+                return -ENOMEM;
+
+        *item = (WebCacheItem) {
+                .url = TAKE_PTR(u),
+                .size = size,
+                .verified = verified,
+        };
+
+        /* Just to be extra paranoid, let's NUL terminate the downloaded buffer */
+        *(uint8_t*) mempcpy(item->data, data, size) = 0;
+
+        web_cache_item_free(hashmap_remove(*web_cache, url));
+
+        r = hashmap_put(*web_cache, item->url, item);
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(item);
+        return 1;
+}
+
+/* Looks up the cached item for the URL. Returns NULL if there is none, or if its verification state differs
+ * from the requested one. The returned item remains owned by the cache. */
+WebCacheItem* web_cache_get_item(Hashmap *web_cache, const char *url, bool verified) {
+        WebCacheItem *i;
+
+        i = hashmap_get(web_cache, url);
+        if (!i)
+                return NULL;
+
+        if (i->verified != verified)
+                return NULL;
+
+        return i;
+}
diff --git a/src/sysupdate/sysupdate-cache.h b/src/sysupdate/sysupdate-cache.h
new file mode 100644
index 0000000..d6a7897
--- /dev/null
+++ b/src/sysupdate/sysupdate-cache.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hashmap.h"
+
+typedef struct WebCacheItem {
+        char *url;
+        bool verified;
+        size_t size;
+        uint8_t data[];
+} WebCacheItem;
+
+/* A simple in-memory cache for downloaded manifests. Very likely multiple transfers will use the same
+ * manifest URLs, hence let's make sure we only download them once within each sysupdate invocation. 
*/ + +int web_cache_add_item(Hashmap **cache, const char *url, bool verified, const void *data, size_t size); + +WebCacheItem* web_cache_get_item(Hashmap *cache, const char *url, bool verified); diff --git a/src/sysupdate/sysupdate-instance.c b/src/sysupdate/sysupdate-instance.c new file mode 100644 index 0000000..16bfab9 --- /dev/null +++ b/src/sysupdate/sysupdate-instance.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sysupdate-instance.h" + +void instance_metadata_destroy(InstanceMetadata *m) { + assert(m); + free(m->version); +} + +int instance_new( + Resource *rr, + const char *path, + const InstanceMetadata *f, + Instance **ret) { + + _cleanup_(instance_freep) Instance *i = NULL; + _cleanup_free_ char *p = NULL, *v = NULL; + + assert(rr); + assert(path); + assert(f); + assert(f->version); + assert(ret); + + p = strdup(path); + if (!p) + return log_oom(); + + v = strdup(f->version); + if (!v) + return log_oom(); + + i = new(Instance, 1); + if (!i) + return log_oom(); + + *i = (Instance) { + .resource = rr, + .metadata = *f, + .path = TAKE_PTR(p), + .partition_info = PARTITION_INFO_NULL, + }; + + i->metadata.version = TAKE_PTR(v); + + *ret = TAKE_PTR(i); + return 0; +} + +Instance *instance_free(Instance *i) { + if (!i) + return NULL; + + instance_metadata_destroy(&i->metadata); + + free(i->path); + partition_info_destroy(&i->partition_info); + + return mfree(i); +} diff --git a/src/sysupdate/sysupdate-instance.h b/src/sysupdate/sysupdate-instance.h new file mode 100644 index 0000000..2860d29 --- /dev/null +++ b/src/sysupdate/sysupdate-instance.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-id128.h" + +#include "fs-util.h" +#include "time-util.h" + +typedef struct InstanceMetadata InstanceMetadata; +typedef struct Instance Instance; + +#include "sysupdate-resource.h" +#include "sysupdate-partition.h" + +struct 
InstanceMetadata { + /* Various bits of metadata for each instance, that is either derived from the filename/GPT label or + * from metadata of the file/partition itself */ + char *version; + sd_id128_t partition_uuid; + bool partition_uuid_set; + uint64_t partition_flags; /* GPT partition flags */ + bool partition_flags_set; + usec_t mtime; + mode_t mode; + uint64_t size; /* uncompressed size of the file */ + uint64_t tries_done, tries_left; /* for boot assessment counters */ + int no_auto; + int read_only; + int growfs; + uint8_t sha256sum[32]; /* SHA256 sum of the download (i.e. compressed) file */ + bool sha256sum_set; +}; + +#define INSTANCE_METADATA_NULL \ + { \ + .mtime = USEC_INFINITY, \ + .mode = MODE_INVALID, \ + .size = UINT64_MAX, \ + .tries_done = UINT64_MAX, \ + .tries_left = UINT64_MAX, \ + .no_auto = -1, \ + .read_only = -1, \ + .growfs = -1, \ + } + +struct Instance { + /* A pointer back to the resource this belongs to */ + Resource *resource; + + /* Metadata of this version */ + InstanceMetadata metadata; + + /* Where we found the instance */ + char *path; + PartitionInfo partition_info; +}; + +void instance_metadata_destroy(InstanceMetadata *m); + +int instance_new(Resource *rr, const char *path, const InstanceMetadata *f, Instance **ret); +Instance *instance_free(Instance *i); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Instance*, instance_free); diff --git a/src/sysupdate/sysupdate-partition.c b/src/sysupdate/sysupdate-partition.c new file mode 100644 index 0000000..6f8e072 --- /dev/null +++ b/src/sysupdate/sysupdate-partition.c @@ -0,0 +1,284 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "extract-word.h" +#include "gpt.h" +#include "id128-util.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "sysupdate-partition.h" + +void partition_info_destroy(PartitionInfo *p) { + assert(p); + + p->label = mfree(p->label); + p->device = mfree(p->device); +} + +int 
read_partition_info( + struct fdisk_context *c, + struct fdisk_table *t, + size_t i, + PartitionInfo *ret) { + + _cleanup_free_ char *label_copy = NULL, *device = NULL; + const char *label; + struct fdisk_partition *p; + uint64_t start, size, flags; + unsigned long ssz; + sd_id128_t ptid, id; + GptPartitionType type; + size_t partno; + int r; + + assert(c); + assert(t); + assert(ret); + + p = fdisk_table_get_partition(t, i); + if (!p) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to read partition metadata: %m"); + + if (fdisk_partition_is_used(p) <= 0) { + *ret = (PartitionInfo) PARTITION_INFO_NULL; + return 0; /* not found! */ + } + + if (fdisk_partition_has_partno(p) <= 0 || + fdisk_partition_has_start(p) <= 0 || + fdisk_partition_has_size(p) <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Found a partition without a number, position or size."); + + partno = fdisk_partition_get_partno(p); + + start = fdisk_partition_get_start(p); + ssz = fdisk_get_sector_size(c); + assert(start <= UINT64_MAX / ssz); + start *= ssz; + + size = fdisk_partition_get_size(p); + assert(size <= UINT64_MAX / ssz); + size *= ssz; + + label = fdisk_partition_get_name(p); + if (!label) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Found a partition without a label."); + + r = fdisk_partition_get_type_as_id128(p, &ptid); + if (r < 0) + return log_error_errno(r, "Failed to read partition type UUID: %m"); + + r = fdisk_partition_get_uuid_as_id128(p, &id); + if (r < 0) + return log_error_errno(r, "Failed to read partition UUID: %m"); + + r = fdisk_partition_get_attrs_as_uint64(p, &flags); + if (r < 0) + return log_error_errno(r, "Failed to get partition flags: %m"); + + r = fdisk_partition_to_string(p, c, FDISK_FIELD_DEVICE, &device); + if (r != 0) + return log_error_errno(r, "Failed to get partition device name: %m"); + + label_copy = strdup(label); + if (!label_copy) + return log_oom(); + + type = gpt_partition_type_from_uuid(ptid); + + *ret = (PartitionInfo) { + 
.partno = partno,
+                .start = start,
+                .size = size,
+                .flags = flags,
+                .type = ptid,
+                .uuid = id,
+                .label = TAKE_PTR(label_copy),
+                .device = TAKE_PTR(device),
+                .no_auto = FLAGS_SET(flags, SD_GPT_FLAG_NO_AUTO) && gpt_partition_type_knows_no_auto(type),
+                .read_only = FLAGS_SET(flags, SD_GPT_FLAG_READ_ONLY) && gpt_partition_type_knows_read_only(type),
+                .growfs = FLAGS_SET(flags, SD_GPT_FLAG_GROWFS) && gpt_partition_type_knows_growfs(type),
+        };
+
+        return 1; /* found! */
+}
+
+/* Searches the GPT partition table of 'device' for the smallest unused partition (i.e. one labelled
+ * "_empty") that is at least 'space' bytes large (any size if 'space' is UINT64_MAX) and, if
+ * 'partition_type' is non-NULL, has that partition type UUID. On success returns 0 and passes ownership of
+ * the partition's info to *ret; returns -ENOSPC if no such partition exists. */
+int find_suitable_partition(
+                const char *device,
+                uint64_t space,
+                sd_id128_t *partition_type,
+                PartitionInfo *ret) {
+
+        _cleanup_(partition_info_destroy) PartitionInfo smallest = PARTITION_INFO_NULL;
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL;
+        _cleanup_(fdisk_unref_tablep) struct fdisk_table *t = NULL;
+        size_t n_partitions;
+        int r;
+
+        assert(device);
+        assert(ret);
+
+        r = fdisk_new_context_at(AT_FDCWD, device, /* read_only= */ true, /* sector_size= */ UINT32_MAX, &c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create fdisk context from '%s': %m", device);
+
+        if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT))
+                return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), "Disk %s has no GPT disk label, not suitable.", device);
+
+        r = fdisk_get_partitions(c, &t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire partition table: %m");
+
+        n_partitions = fdisk_table_get_nents(t);
+        for (size_t i = 0; i < n_partitions; i++) {
+                _cleanup_(partition_info_destroy) PartitionInfo pinfo = PARTITION_INFO_NULL;
+
+                r = read_partition_info(c, t, i, &pinfo);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* not assigned */
+                        continue;
+
+                /* Filter out non-matching partition types */
+                if (partition_type && !sd_id128_equal(pinfo.type, *partition_type))
+                        continue;
+
+                if (!streq_ptr(pinfo.label, "_empty")) /* used */
+                        continue;
+
+                if (space != UINT64_MAX && pinfo.size < space) /* too small */
+                        continue;
+
+                if (smallest.partno != SIZE_MAX && smallest.size <= pinfo.size) /* already found smaller */
+                        continue;
+
+                /* Release the strings of the candidate we are about to replace: a plain struct assignment
+                 * would overwrite its label/device pointers and thereby leak them. */
+                partition_info_destroy(&smallest);
+                smallest = pinfo;
+                pinfo = (PartitionInfo) PARTITION_INFO_NULL;
+        }
+
+        if (smallest.partno == SIZE_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), "No available partition of a suitable size found.");
+
+        *ret = smallest;
+        smallest = (PartitionInfo) PARTITION_INFO_NULL;
+
+        return 0;
+}
+
+int patch_partition(
+                const char *device,
+                const PartitionInfo *info,
+                PartitionChange change) {
+
+        _cleanup_(fdisk_unref_partitionp) struct fdisk_partition *pa = NULL;
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL;
+        bool tweak_no_auto, tweak_read_only, tweak_growfs;
+        GptPartitionType type;
+        int r, fd;
+
+        assert(device);
+        assert(info);
+        assert(change <= _PARTITION_CHANGE_MAX);
+
+        if (change == 0) /* Nothing to do */
+                return 0;
+
+        r = fdisk_new_context_at(AT_FDCWD, device, /* read_only= */ false, /* sector_size= */ UINT32_MAX, &c);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create fdisk context from '%s': %m", device);
+
+        assert_se((fd = fdisk_get_devfd(c)) >= 0);
+
+        /* Make sure udev doesn't read the device while we make changes (this lock is released automatically
+         * by the kernel when the fd is closed, i.e. when the fdisk context is freed, hence no explicit
+         * unlock by us here anywhere.) 
*/ + if (flock(fd, LOCK_EX) < 0) + return log_error_errno(errno, "Failed to lock block device '%s': %m", device); + + if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT)) + return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), "Disk %s has no GPT disk label, not suitable.", device); + + r = fdisk_get_partition(c, info->partno, &pa); + if (r < 0) + return log_error_errno(r, "Failed to read partition %zu of GPT label of '%s': %m", info->partno, device); + + if (change & PARTITION_LABEL) { + r = fdisk_partition_set_name(pa, info->label); + if (r < 0) + return log_error_errno(r, "Failed to update partition label: %m"); + } + + if (change & PARTITION_UUID) { + r = fdisk_partition_set_uuid(pa, SD_ID128_TO_UUID_STRING(info->uuid)); + if (r < 0) + return log_error_errno(r, "Failed to update partition UUID: %m"); + } + + type = gpt_partition_type_from_uuid(info->type); + + /* Tweak the read-only flag, but only if supported by the partition type */ + tweak_no_auto = + FLAGS_SET(change, PARTITION_NO_AUTO) && + gpt_partition_type_knows_no_auto(type); + tweak_read_only = + FLAGS_SET(change, PARTITION_READ_ONLY) && + gpt_partition_type_knows_read_only(type); + tweak_growfs = + FLAGS_SET(change, PARTITION_GROWFS) && + gpt_partition_type_knows_growfs(type); + + if (change & PARTITION_FLAGS) { + uint64_t flags; + + /* Update the full flags parameter, and import the read-only flag into it */ + + flags = info->flags; + if (tweak_no_auto) + SET_FLAG(flags, SD_GPT_FLAG_NO_AUTO, info->no_auto); + if (tweak_read_only) + SET_FLAG(flags, SD_GPT_FLAG_READ_ONLY, info->read_only); + if (tweak_growfs) + SET_FLAG(flags, SD_GPT_FLAG_GROWFS, info->growfs); + + r = fdisk_partition_set_attrs_as_uint64(pa, flags); + if (r < 0) + return log_error_errno(r, "Failed to update partition flags: %m"); + + } else if (tweak_no_auto || tweak_read_only || tweak_growfs) { + uint64_t old_flags, new_flags; + + /* So we aren't supposed to update the full flags parameter, but we are supposed to update + * the RO flag of 
it. */ + + r = fdisk_partition_get_attrs_as_uint64(pa, &old_flags); + if (r < 0) + return log_error_errno(r, "Failed to get old partition flags: %m"); + + new_flags = old_flags; + if (tweak_no_auto) + SET_FLAG(new_flags, SD_GPT_FLAG_NO_AUTO, info->no_auto); + if (tweak_read_only) + SET_FLAG(new_flags, SD_GPT_FLAG_READ_ONLY, info->read_only); + if (tweak_growfs) + SET_FLAG(new_flags, SD_GPT_FLAG_GROWFS, info->growfs); + + if (new_flags != old_flags) { + r = fdisk_partition_set_attrs_as_uint64(pa, new_flags); + if (r < 0) + return log_error_errno(r, "Failed to update partition flags: %m"); + } + } + + r = fdisk_set_partition(c, info->partno, pa); + if (r < 0) + return log_error_errno(r, "Failed to update partition: %m"); + + r = fdisk_write_disklabel(c); + if (r < 0) + return log_error_errno(r, "Failed to write updated partition table: %m"); + + return 0; +} diff --git a/src/sysupdate/sysupdate-partition.h b/src/sysupdate/sysupdate-partition.h new file mode 100644 index 0000000..672eb93 --- /dev/null +++ b/src/sysupdate/sysupdate-partition.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +#include "sd-id128.h" + +#include "fdisk-util.h" +#include "macro.h" + +typedef struct PartitionInfo PartitionInfo; + +typedef enum PartitionChange { + PARTITION_FLAGS = 1 << 0, + PARTITION_NO_AUTO = 1 << 1, + PARTITION_READ_ONLY = 1 << 2, + PARTITION_GROWFS = 1 << 3, + PARTITION_UUID = 1 << 4, + PARTITION_LABEL = 1 << 5, + _PARTITION_CHANGE_MAX = (1 << 6) - 1, /* all of the above */ + _PARTITION_CHANGE_INVALID = -EINVAL, +} PartitionChange; + +struct PartitionInfo { + size_t partno; + uint64_t start, size; + uint64_t flags; + sd_id128_t type, uuid; + char *label; + char *device; /* Note that this might point to some non-existing path in case we operate on a loopback file */ + bool no_auto:1; + bool read_only:1; + bool growfs:1; +}; + +#define PARTITION_INFO_NULL \ + { \ + .partno = SIZE_MAX, \ + .start = UINT64_MAX, \ + 
.size = UINT64_MAX, \ + } + +void partition_info_destroy(PartitionInfo *p); + +int read_partition_info(struct fdisk_context *c, struct fdisk_table *t, size_t i, PartitionInfo *ret); + +int find_suitable_partition(const char *device, uint64_t space, sd_id128_t *partition_type, PartitionInfo *ret); +int patch_partition(const char *device, const PartitionInfo *info, PartitionChange change); diff --git a/src/sysupdate/sysupdate-pattern.c b/src/sysupdate/sysupdate-pattern.c new file mode 100644 index 0000000..ff018d8 --- /dev/null +++ b/src/sysupdate/sysupdate-pattern.c @@ -0,0 +1,643 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "hexdecoct.h" +#include "list.h" +#include "parse-util.h" +#include "path-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "sysupdate-pattern.h" + +typedef enum PatternElementType { + PATTERN_LITERAL, + PATTERN_VERSION, + PATTERN_PARTITION_UUID, + PATTERN_PARTITION_FLAGS, + PATTERN_MTIME, + PATTERN_MODE, + PATTERN_SIZE, + PATTERN_TRIES_DONE, + PATTERN_TRIES_LEFT, + PATTERN_NO_AUTO, + PATTERN_READ_ONLY, + PATTERN_GROWFS, + PATTERN_SHA256SUM, + PATTERN_SLASH, + _PATTERN_ELEMENT_TYPE_MAX, + _PATTERN_ELEMENT_TYPE_INVALID = -EINVAL, +} PatternElementType; + +typedef struct PatternElement PatternElement; + +struct PatternElement { + PatternElementType type; + LIST_FIELDS(PatternElement, elements); + char literal[]; +}; + +static PatternElement *pattern_element_free_all(PatternElement *e) { + LIST_CLEAR(elements, e, free); + + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(PatternElement*, pattern_element_free_all); + +static PatternElementType pattern_element_type_from_char(char c) { + switch (c) { + case 'v': + return PATTERN_VERSION; + case 'u': + return PATTERN_PARTITION_UUID; + case 'f': + return PATTERN_PARTITION_FLAGS; + case 't': + return PATTERN_MTIME; + case 'm': + return PATTERN_MODE; + case 's': + return PATTERN_SIZE; + case 'd': + return PATTERN_TRIES_DONE; + case 'l': + 
return PATTERN_TRIES_LEFT; + case 'a': + return PATTERN_NO_AUTO; + case 'r': + return PATTERN_READ_ONLY; + case 'g': + return PATTERN_GROWFS; + case 'h': + return PATTERN_SHA256SUM; + default: + return _PATTERN_ELEMENT_TYPE_INVALID; + } +} + +static bool valid_char(char x) { + + /* Let's refuse control characters here, and let's reserve some characters typically used in pattern + * languages so that we can use them later, possibly. */ + + if ((unsigned) x < ' ' || x >= 127) + return false; + + return !IN_SET(x, '$', '*', '?', '[', ']', '!', '\\', '|'); +} + +static int pattern_split( + const char *pattern, + PatternElement **ret) { + + _cleanup_(pattern_element_free_allp) PatternElement *first = NULL; + bool at = false, last_literal = true, last_slash = false; + PatternElement *last = NULL; + uint64_t mask_found = 0; + size_t l, k = 0; + + assert(pattern); + + l = strlen(pattern); + + for (const char *e = pattern; *e != 0; e++) { + if (*e == '@') { + if (!at) { + at = true; + continue; + } + + /* Two at signs in a sequence, write out one */ + at = false; + + } else if (at) { + PatternElementType t; + uint64_t bit; + + t = pattern_element_type_from_char(*e); + if (t < 0) + return log_debug_errno(t, "Unknown pattern field marker '@%c'.", *e); + + bit = UINT64_C(1) << t; + if (mask_found & bit) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Pattern field marker '@%c' appears twice in pattern.", *e); + + /* We insist that two pattern field markers are separated by some literal string that + * we can use to separate the fields when parsing. 
*/
+                        if (!last_literal && !last_slash)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Found two pattern field markers without separating literal.");
+
+                        if (ret) {
+                                PatternElement *z;
+
+                                z = malloc(offsetof(PatternElement, literal));
+                                if (!z)
+                                        return -ENOMEM;
+
+                                z->type = t;
+                                LIST_INSERT_AFTER(elements, first, last, z);
+                                last = z;
+                        }
+
+                        mask_found |= bit;
+                        last_slash = last_literal = at = false;
+                        continue;
+                }
+
+                if (*e == '/') {
+                        if (ret) {
+                                PatternElement *z;
+
+                                z = malloc(offsetof(PatternElement, literal));
+                                if (!z)
+                                        return -ENOMEM;
+
+                                z->type = PATTERN_SLASH;
+                                LIST_INSERT_AFTER(elements, first, last, z);
+                                last = z;
+                        }
+
+                        last_literal = false;
+                        last_slash = true;
+                        continue ;
+                }
+
+                if (!valid_char(*e))
+                        return log_debug_errno(
+                                        SYNTHETIC_ERRNO(EBADRQC),
+                                        "Invalid character 0x%0x in pattern, refusing.",
+                                        (unsigned) *e);
+
+                last_literal = true;
+                last_slash = false;
+
+                if (!ret)
+                        continue;
+
+                if (!last || last->type != PATTERN_LITERAL) {
+                        PatternElement *z;
+
+                        z = malloc0(offsetof(PatternElement, literal) + l + 1); /* l is an upper bound to all literal elements */
+                        if (!z)
+                                return -ENOMEM;
+
+                        z->type = PATTERN_LITERAL;
+                        k = 0;
+
+                        LIST_INSERT_AFTER(elements, first, last, z);
+                        last = z;
+                }
+
+                assert(last);
+                assert(last->type == PATTERN_LITERAL);
+
+                last->literal[k++] = *e;
+        }
+
+        if (at)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Trailing @ character found, refusing.");
+        if (!(mask_found & (UINT64_C(1) << PATTERN_VERSION)))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Version field marker '@v' not specified in pattern, refusing.");
+
+        if (ret)
+                *ret = TAKE_PTR(first);
+
+        return 0;
+}
+
+/* Matches the string 's' against 'pattern' and extracts the field values encoded in it. Returns
+ * PATTERN_MATCH_YES on a match, PATTERN_MATCH_NO if the string does not fit the pattern or a field value
+ * fails validation, PATTERN_MATCH_RETRY if the string ends where the pattern expects a '/', or a negative
+ * errno-style error. On PATTERN_MATCH_YES ownership of the extracted metadata is passed to *ret (if
+ * non-NULL); on PATTERN_MATCH_NO and PATTERN_MATCH_RETRY *ret is reset to INSTANCE_METADATA_NULL. */
+int pattern_match(const char *pattern, const char *s, InstanceMetadata *ret) {
+        _cleanup_(instance_metadata_destroy) InstanceMetadata found = INSTANCE_METADATA_NULL;
+        _cleanup_(pattern_element_free_allp) PatternElement *elements = NULL;
+        const char *p;
+        int r;
+
+        assert(pattern);
+        assert(s);
+
+        r = pattern_split(pattern, &elements);
+        if (r < 0)
+                return r;
+
+        p = s;
+        LIST_FOREACH(elements, e, elements) {
+                _cleanup_free_ char *t = NULL;
+                const char *n;
+
+                if (e->type == PATTERN_SLASH) {
+                        if (*p == '/') {
+                                ++p;
+                                continue;
+                        } else if (*p == '\0')
+                                goto retry;
+                        else
+                                goto nope;
+                }
+
+                if (e->type == PATTERN_LITERAL) {
+                        const char *k;
+
+                        /* Skip literal fields */
+                        k = startswith(p, e->literal);
+                        if (!k)
+                                goto nope;
+
+                        p = k;
+                        continue;
+                }
+
+                if (e->elements_next) {
+                        /* The next element must be literal, as we use it to determine where to split */
+                        assert(e->elements_next->type == PATTERN_LITERAL);
+
+                        n = strstr(p, e->elements_next->literal);
+                        if (!n)
+                                goto nope;
+
+                } else
+                        /* End of the string */
+                        assert_se(n = strchr(p, 0));
+                t = strndup(p, n - p);
+                if (!t)
+                        return -ENOMEM;
+
+                switch (e->type) {
+
+                case PATTERN_VERSION:
+                        if (!version_is_valid(t)) {
+                                log_debug("Version string is not valid, refusing: %s", t);
+                                goto nope;
+                        }
+
+                        assert(!found.version);
+                        found.version = TAKE_PTR(t);
+                        break;
+
+                case PATTERN_PARTITION_UUID: {
+                        sd_id128_t id;
+
+                        if (sd_id128_from_string(t, &id) < 0)
+                                goto nope;
+
+                        assert(!found.partition_uuid_set);
+                        found.partition_uuid = id;
+                        found.partition_uuid_set = true;
+                        break;
+                }
+
+                case PATTERN_PARTITION_FLAGS: {
+                        uint64_t f;
+
+                        if (safe_atoux64(t, &f) < 0)
+                                goto nope;
+
+                        if (found.partition_flags_set && found.partition_flags != f)
+                                goto nope;
+
+                        assert(!found.partition_flags_set);
+                        found.partition_flags = f;
+                        found.partition_flags_set = true;
+                        break;
+                }
+
+                case PATTERN_MTIME: {
+                        uint64_t v;
+
+                        if (safe_atou64(t, &v) < 0)
+                                goto nope;
+                        if (v == USEC_INFINITY) /* Don't permit our internal special infinity value */
+                                goto nope;
+                        if (v / 1000000U > TIME_T_MAX) /* Make sure this fits in a timespec structure */
+                                goto nope;
+
+                        assert(found.mtime == USEC_INFINITY);
+                        found.mtime = v;
+                        break;
+                }
+
+                case PATTERN_MODE: {
+                        mode_t m;
+
+                        r = parse_mode(t, &m);
+                        if (r < 0)
+                                goto nope;
+                        if (m & ~0775) /* Don't allow world-writable files or suid files to be generated this way */
+                                goto nope;
+
+                        assert(found.mode == MODE_INVALID);
+                        found.mode = m;
+                        break;
+                }
+
+                case PATTERN_SIZE: {
+                        uint64_t u;
+
+                        r = safe_atou64(t, &u);
+                        if (r < 0)
+                                goto nope;
+                        if (u == UINT64_MAX)
+                                goto nope;
+
+                        assert(found.size == UINT64_MAX);
+                        found.size = u;
+                        break;
+                }
+
+                case PATTERN_TRIES_DONE: {
+                        uint64_t u;
+
+                        r = safe_atou64(t, &u);
+                        if (r < 0)
+                                goto nope;
+                        if (u == UINT64_MAX)
+                                goto nope;
+
+                        assert(found.tries_done == UINT64_MAX);
+                        found.tries_done = u;
+                        break;
+                }
+
+                case PATTERN_TRIES_LEFT: {
+                        uint64_t u;
+
+                        r = safe_atou64(t, &u);
+                        if (r < 0)
+                                goto nope;
+                        if (u == UINT64_MAX)
+                                goto nope;
+
+                        assert(found.tries_left == UINT64_MAX);
+                        found.tries_left = u;
+                        break;
+                }
+
+                case PATTERN_NO_AUTO:
+                        r = parse_boolean(t);
+                        if (r < 0)
+                                goto nope;
+
+                        assert(found.no_auto < 0);
+                        found.no_auto = r;
+                        break;
+
+                case PATTERN_READ_ONLY:
+                        r = parse_boolean(t);
+                        if (r < 0)
+                                goto nope;
+
+                        assert(found.read_only < 0);
+                        found.read_only = r;
+                        break;
+
+                case PATTERN_GROWFS:
+                        r = parse_boolean(t);
+                        if (r < 0)
+                                goto nope;
+
+                        assert(found.growfs < 0);
+                        found.growfs = r;
+                        break;
+
+                case PATTERN_SHA256SUM: {
+                        _cleanup_free_ void *d = NULL;
+                        size_t l;
+
+                        if (strlen(t) != sizeof(found.sha256sum) * 2)
+                                goto nope;
+
+                        r = unhexmem(t, sizeof(found.sha256sum) * 2, &d, &l);
+                        if (r == -ENOMEM)
+                                return r;
+                        if (r < 0)
+                                goto nope;
+
+                        assert(!found.sha256sum_set);
+                        assert(l == sizeof(found.sha256sum));
+                        memcpy(found.sha256sum, d, l);
+                        found.sha256sum_set = true;
+                        break;
+                }
+
+                default:
+                        /* Slash and literal elements are handled before the switch, all field types above.
+                         * (Asserting on a string literal, as done here before, can never trigger.) */
+                        assert_not_reached();
+                }
+
+                p = n;
+        }
+
+        if (ret) {
+                *ret = found;
+                found = (InstanceMetadata) INSTANCE_METADATA_NULL;
+        }
+
+        return PATTERN_MATCH_YES;
+
+nope:
+        if (ret)
+                *ret = (InstanceMetadata) INSTANCE_METADATA_NULL;
+
+        return PATTERN_MATCH_NO;
+
+retry:
+        if (ret)
+                *ret = (InstanceMetadata) INSTANCE_METADATA_NULL;
+
+        return PATTERN_MATCH_RETRY;
+}
+
+int pattern_match_many(char **patterns, const char *s, InstanceMetadata *ret) {
+        _cleanup_(instance_metadata_destroy) InstanceMetadata found = INSTANCE_METADATA_NULL;
+        int r;
+
+        STRV_FOREACH(p, patterns) {
+                r = pattern_match(*p, s, &found);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        if (ret) {
+                                *ret = found;
+                                found = (InstanceMetadata) INSTANCE_METADATA_NULL;
+                        }
+
+                        return r;
+                }
+        }
+
+        if (ret)
+                *ret = (InstanceMetadata) INSTANCE_METADATA_NULL;
+
+        return PATTERN_MATCH_NO;
+}
+
+int pattern_valid(const char *pattern) {
+        int r;
+
+        r = pattern_split(pattern, NULL);
+        if (r == -EINVAL)
+                return false;
+        if (r < 0)
+                return r;
+
+        return true;
+}
+
+int pattern_format(
+                const char *pattern,
+                const InstanceMetadata *fields,
+                char **ret) {
+
+        _cleanup_(pattern_element_free_allp) PatternElement *elements = NULL;
+        _cleanup_free_ char *j = NULL;
+        int r;
+
+        assert(pattern);
+        assert(fields);
+        assert(ret);
+
+        r = pattern_split(pattern, &elements);
+        if (r < 0)
+                return r;
+
+        LIST_FOREACH(elements, e, elements) {
+
+                switch (e->type) {
+
+                case PATTERN_SLASH:
+                        if (!strextend(&j, "/"))
+                                return -ENOMEM;
+
+                        break;
+
+                case PATTERN_LITERAL:
+                        if (!strextend(&j, e->literal))
+                                return -ENOMEM;
+
+                        break;
+
+                case PATTERN_VERSION:
+                        if (!fields->version)
+                                return -ENXIO;
+
+                        if (!strextend(&j, fields->version))
+                                return -ENOMEM;
+                        break;
+
+                case PATTERN_PARTITION_UUID: {
+                        char formatted[SD_ID128_STRING_MAX];
+
+                        if (!fields->partition_uuid_set)
+                                return -ENXIO;
+
+                        if (!strextend(&j, sd_id128_to_string(fields->partition_uuid, formatted)))
+                                return -ENOMEM;
+
+                        break;
+                }
+
+                case PATTERN_PARTITION_FLAGS:
+                        if (!fields->partition_flags_set)
+                                return -ENXIO;
+
+                        r = strextendf(&j, "%" PRIx64, fields->partition_flags);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case PATTERN_MTIME:
+                        if (fields->mtime == USEC_INFINITY)
+                                return -ENXIO;
+
+                        r = strextendf(&j, "%" PRIu64, fields->mtime);
+                        if (r < 0)
+                                return r;
+
+                        break;
+
+                case 
PATTERN_MODE: + if (fields->mode == MODE_INVALID) + return -ENXIO; + + r = strextendf(&j, "%03o", fields->mode); + if (r < 0) + return r; + + break; + + case PATTERN_SIZE: + if (fields->size == UINT64_MAX) + return -ENXIO; + + r = strextendf(&j, "%" PRIu64, fields->size); + if (r < 0) + return r; + break; + + case PATTERN_TRIES_DONE: + if (fields->tries_done == UINT64_MAX) + return -ENXIO; + + r = strextendf(&j, "%" PRIu64, fields->tries_done); + if (r < 0) + return r; + break; + + case PATTERN_TRIES_LEFT: + if (fields->tries_left == UINT64_MAX) + return -ENXIO; + + r = strextendf(&j, "%" PRIu64, fields->tries_left); + if (r < 0) + return r; + break; + + case PATTERN_NO_AUTO: + if (fields->no_auto < 0) + return -ENXIO; + + if (!strextend(&j, one_zero(fields->no_auto))) + return -ENOMEM; + + break; + + case PATTERN_READ_ONLY: + if (fields->read_only < 0) + return -ENXIO; + + if (!strextend(&j, one_zero(fields->read_only))) + return -ENOMEM; + + break; + + case PATTERN_GROWFS: + if (fields->growfs < 0) + return -ENXIO; + + if (!strextend(&j, one_zero(fields->growfs))) + return -ENOMEM; + + break; + + case PATTERN_SHA256SUM: { + _cleanup_free_ char *h = NULL; + + if (!fields->sha256sum_set) + return -ENXIO; + + h = hexmem(fields->sha256sum, sizeof(fields->sha256sum)); + if (!h) + return -ENOMEM; + + if (!strextend(&j, h)) + return -ENOMEM; + + break; + } + + default: + assert_not_reached(); + } + } + + *ret = TAKE_PTR(j); + return 0; +} diff --git a/src/sysupdate/sysupdate-pattern.h b/src/sysupdate/sysupdate-pattern.h new file mode 100644 index 0000000..e8ea104 --- /dev/null +++ b/src/sysupdate/sysupdate-pattern.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sysupdate-instance.h" +#include "time-util.h" + +enum { + PATTERN_MATCH_NO, + PATTERN_MATCH_YES, + PATTERN_MATCH_RETRY, +}; + +int pattern_match(const char *pattern, const char *s, InstanceMetadata *ret); +int pattern_match_many(char **patterns, const 
char *s, InstanceMetadata *ret); +int pattern_valid(const char *pattern); +int pattern_format(const char *pattern, const InstanceMetadata *fields, char **ret); diff --git a/src/sysupdate/sysupdate-resource.c b/src/sysupdate/sysupdate-resource.c new file mode 100644 index 0000000..e4bdd88 --- /dev/null +++ b/src/sysupdate/sysupdate-resource.c @@ -0,0 +1,707 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "chase.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "find-esp.h" +#include "glyph-util.h" +#include "gpt.h" +#include "hexdecoct.h" +#include "import-util.h" +#include "macro.h" +#include "process-util.h" +#include "sort-util.h" +#include "string-table.h" +#include "sysupdate-cache.h" +#include "sysupdate-instance.h" +#include "sysupdate-pattern.h" +#include "sysupdate-resource.h" +#include "sysupdate.h" +#include "utf8.h" + +void resource_destroy(Resource *rr) { + assert(rr); + + free(rr->path); + strv_free(rr->patterns); + + for (size_t i = 0; i < rr->n_instances; i++) + instance_free(rr->instances[i]); + free(rr->instances); +} + +static int resource_add_instance( + Resource *rr, + const char *path, + const InstanceMetadata *f, + Instance **ret) { + + Instance *i; + int r; + + assert(rr); + assert(path); + assert(f); + assert(f->version); + + if (!GREEDY_REALLOC(rr->instances, rr->n_instances + 1)) + return log_oom(); + + r = instance_new(rr, path, f, &i); + if (r < 0) + return r; + + rr->instances[rr->n_instances++] = i; + + if (ret) + *ret = i; + + return 0; +} + +static int resource_load_from_directory_recursive( + Resource *rr, + DIR* d, + const char* relpath, + mode_t m) { + int r; + + for (;;) { + _cleanup_(instance_metadata_destroy) InstanceMetadata extracted_fields = INSTANCE_METADATA_NULL; + _cleanup_free_ char *joined = NULL, *rel_joined = 
NULL; + Instance *instance; + struct dirent *de; + struct stat st; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + return log_error_errno(errno, "Failed to read directory '%s': %m", rr->path); + break; + } + + switch (de->d_type) { + + case DT_UNKNOWN: + break; + + case DT_DIR: + if (!IN_SET(m, S_IFDIR, S_IFREG)) + continue; + + break; + + case DT_REG: + if (m != S_IFREG) + continue; + break; + + default: + continue; + } + + if (fstatat(dirfd(d), de->d_name, &st, AT_NO_AUTOMOUNT) < 0) { + if (errno == ENOENT) /* Gone by now? */ + continue; + + return log_error_errno(errno, "Failed to stat %s/%s: %m", rr->path, de->d_name); + } + + if (!(S_ISDIR(st.st_mode) && S_ISREG(m)) && ((st.st_mode & S_IFMT) != m)) + continue; + + rel_joined = path_join(relpath, de->d_name); + if (!rel_joined) + return log_oom(); + + r = pattern_match_many(rr->patterns, rel_joined, &extracted_fields); + if (r == PATTERN_MATCH_RETRY) { + _cleanup_closedir_ DIR *subdir = NULL; + + subdir = xopendirat(dirfd(d), rel_joined, 0); + if (!subdir) + continue; + + r = resource_load_from_directory_recursive(rr, subdir, rel_joined, m); + if (r < 0) + return r; + if (r == 0) + continue; + } + else if (r < 0) + return log_error_errno(r, "Failed to match pattern: %m"); + else if (r == PATTERN_MATCH_NO) + continue; + + if (de->d_type == DT_DIR && m != S_IFDIR) + continue; + + joined = path_join(rr->path, rel_joined); + if (!joined) + return log_oom(); + + r = resource_add_instance(rr, joined, &extracted_fields, &instance); + if (r < 0) + return r; + + /* Inherit these from the source, if not explicitly overwritten */ + if (instance->metadata.mtime == USEC_INFINITY) + instance->metadata.mtime = timespec_load(&st.st_mtim) ?: USEC_INFINITY; + + if (instance->metadata.mode == MODE_INVALID) + instance->metadata.mode = st.st_mode & 0775; /* mask out world-writability and suid and stuff, for safety */ + } + + return 0; +} + +static int resource_load_from_directory( + Resource *rr, + mode_t 
m) { + _cleanup_closedir_ DIR *d = NULL; + + assert(rr); + assert(IN_SET(rr->type, RESOURCE_TAR, RESOURCE_REGULAR_FILE, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME)); + assert(IN_SET(m, S_IFREG, S_IFDIR)); + + d = opendir(rr->path); + if (!d) { + if (errno == ENOENT) { + log_debug_errno(errno, "Directory %s does not exist, not loading any resources: %m", rr->path); + return 0; + } + + return log_error_errno(errno, "Failed to open directory '%s': %m", rr->path); + } + + return resource_load_from_directory_recursive(rr, d, NULL, m); +} + +static int resource_load_from_blockdev(Resource *rr) { + _cleanup_(fdisk_unref_contextp) struct fdisk_context *c = NULL; + _cleanup_(fdisk_unref_tablep) struct fdisk_table *t = NULL; + size_t n_partitions; + int r; + + assert(rr); + + r = fdisk_new_context_at(AT_FDCWD, rr->path, /* read_only= */ true, /* sector_size= */ UINT32_MAX, &c); + if (r < 0) + return log_error_errno(r, "Failed to create fdisk context from '%s': %m", rr->path); + + if (!fdisk_is_labeltype(c, FDISK_DISKLABEL_GPT)) + return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON), "Disk %s has no GPT disk label, not suitable.", rr->path); + + r = fdisk_get_partitions(c, &t); + if (r < 0) + return log_error_errno(r, "Failed to acquire partition table: %m"); + + n_partitions = fdisk_table_get_nents(t); + for (size_t i = 0; i < n_partitions; i++) { + _cleanup_(instance_metadata_destroy) InstanceMetadata extracted_fields = INSTANCE_METADATA_NULL; + _cleanup_(partition_info_destroy) PartitionInfo pinfo = PARTITION_INFO_NULL; + Instance *instance; + + r = read_partition_info(c, t, i, &pinfo); + if (r < 0) + return r; + if (r == 0) /* not assigned */ + continue; + + /* Check if partition type matches */ + if (rr->partition_type_set && !sd_id128_equal(pinfo.type, rr->partition_type.uuid)) + continue; + + /* A label of "_empty" means "not used so far" for us */ + if (streq_ptr(pinfo.label, "_empty")) { + rr->n_empty++; + continue; + } + + r = pattern_match_many(rr->patterns, pinfo.label, 
&extracted_fields); + if (r < 0) + return log_error_errno(r, "Failed to match pattern: %m"); + if (IN_SET(r, PATTERN_MATCH_NO, PATTERN_MATCH_RETRY)) + continue; + + r = resource_add_instance(rr, pinfo.device, &extracted_fields, &instance); + if (r < 0) + return r; + + instance->partition_info = pinfo; + pinfo = (PartitionInfo) PARTITION_INFO_NULL; + + /* Inherit data from source if not configured explicitly */ + if (!instance->metadata.partition_uuid_set) { + instance->metadata.partition_uuid = instance->partition_info.uuid; + instance->metadata.partition_uuid_set = true; + } + + if (!instance->metadata.partition_flags_set) { + instance->metadata.partition_flags = instance->partition_info.flags; + instance->metadata.partition_flags_set = true; + } + + if (instance->metadata.read_only < 0) + instance->metadata.read_only = instance->partition_info.read_only; + } + + return 0; +} + +static int download_manifest( + const char *url, + bool verify_signature, + char **ret_buffer, + size_t *ret_size) { + + _cleanup_free_ char *buffer = NULL, *suffixed_url = NULL; + _cleanup_close_pair_ int pfd[2] = EBADF_PAIR; + _cleanup_fclose_ FILE *manifest = NULL; + size_t size = 0; + pid_t pid; + int r; + + assert(url); + assert(ret_buffer); + assert(ret_size); + + /* Download a SHA256SUMS file as manifest */ + + r = import_url_append_component(url, "SHA256SUMS", &suffixed_url); + if (r < 0) + return log_error_errno(r, "Failed to append SHA256SUMS to URL: %m"); + + if (pipe2(pfd, O_CLOEXEC) < 0) + return log_error_errno(errno, "Failed to allocate pipe: %m"); + + log_info("%s Acquiring manifest file %s%s", special_glyph(SPECIAL_GLYPH_DOWNLOAD), + suffixed_url, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = safe_fork_full("(sd-pull)", + (int[]) { -EBADF, pfd[1], STDERR_FILENO }, + NULL, 0, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG, + &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + + const char *cmdline[] = { + 
"systemd-pull", + "raw", + "--direct", /* just download the specified URL, don't download anything else */ + "--verify", verify_signature ? "signature" : "no", /* verify the manifest file */ + suffixed_url, + "-", /* write to stdout */ + NULL + }; + + execv(pull_binary_path(), (char *const*) cmdline); + log_error_errno(errno, "Failed to execute %s tool: %m", pull_binary_path()); + _exit(EXIT_FAILURE); + }; + + pfd[1] = safe_close(pfd[1]); + + /* We'll first load the entire manifest into memory before parsing it. That's because the + * systemd-pull tool can validate the download only after its completion, but still pass the data to + * us as it runs. We thus need to check the return value of the process *before* parsing, to be + * reasonably safe. */ + + manifest = fdopen(pfd[0], "r"); + if (!manifest) + return log_error_errno(errno, "Failed allocate FILE object for manifest file: %m"); + + TAKE_FD(pfd[0]); + + r = read_full_stream(manifest, &buffer, &size); + if (r < 0) + return log_error_errno(r, "Failed to read manifest file from child: %m"); + + manifest = safe_fclose(manifest); + + r = wait_for_terminate_and_check("(sd-pull)", pid, WAIT_LOG); + if (r < 0) + return r; + if (r != 0) + return -EPROTO; + + *ret_buffer = TAKE_PTR(buffer); + *ret_size = size; + + return 0; +} + +static int resource_load_from_web( + Resource *rr, + bool verify, + Hashmap **web_cache) { + + size_t manifest_size = 0, left = 0; + _cleanup_free_ char *buf = NULL; + const char *manifest, *p; + size_t line_nr = 1; + WebCacheItem *ci; + int r; + + assert(rr); + + ci = web_cache ? 
web_cache_get_item(*web_cache, rr->path, verify) : NULL; + if (ci) { + log_debug("Manifest web cache hit for %s.", rr->path); + + manifest = (char*) ci->data; + manifest_size = ci->size; + } else { + log_debug("Manifest web cache miss for %s.", rr->path); + + r = download_manifest(rr->path, verify, &buf, &manifest_size); + if (r < 0) + return r; + + manifest = buf; + } + + if (memchr(manifest, 0, manifest_size)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Manifest file has embedded NUL byte, refusing."); + if (!utf8_is_valid_n(manifest, manifest_size)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Manifest file is not valid UTF-8, refusing."); + + p = manifest; + left = manifest_size; + + while (left > 0) { + _cleanup_(instance_metadata_destroy) InstanceMetadata extracted_fields = INSTANCE_METADATA_NULL; + _cleanup_free_ char *fn = NULL; + _cleanup_free_ void *h = NULL; + Instance *instance; + const char *e; + size_t hlen; + + /* 64 character hash + separator + filename + newline */ + if (left < 67) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Corrupt manifest at line %zu, refusing.", line_nr); + + if (p[0] == '\\') + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "File names with escapes not supported in manifest at line %zu, refusing.", line_nr); + + r = unhexmem(p, 64, &h, &hlen); + if (r < 0) + return log_error_errno(r, "Failed to parse digest at manifest line %zu, refusing.", line_nr); + + p += 64, left -= 64; + + if (*p != ' ') + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing space separator at manifest line %zu, refusing.", line_nr); + p++, left--; + + if (!IN_SET(*p, '*', ' ')) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing binary/text input marker at manifest line %zu, refusing.", line_nr); + p++, left--; + + e = memchr(p, '\n', left); + if (!e) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Truncated manifest file at line %zu, refusing.", line_nr); + if (e == p) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Empty filename specified at manifest line %zu, refusing.", line_nr); + + fn = strndup(p, e - p); + if (!fn) + return log_oom(); + + if (!filename_is_valid(fn)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid filename specified at manifest line %zu, refusing.", line_nr); + if (string_has_cc(fn, NULL)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Filename contains control characters at manifest line %zu, refusing.", line_nr); + + r = pattern_match_many(rr->patterns, fn, &extracted_fields); + if (r < 0) + return log_error_errno(r, "Failed to match pattern: %m"); + if (r == PATTERN_MATCH_YES) { + _cleanup_free_ char *path = NULL; + + r = import_url_append_component(rr->path, fn, &path); + if (r < 0) + return log_error_errno(r, "Failed to build instance URL: %m"); + + r = resource_add_instance(rr, path, &extracted_fields, &instance); + if (r < 0) + return r; + + assert(hlen == sizeof(instance->metadata.sha256sum)); + + if (instance->metadata.sha256sum_set) { + if (memcmp(instance->metadata.sha256sum, h, hlen) != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "SHA256 sum parsed from filename and manifest don't match at line %zu, refusing.", line_nr); + } else { + memcpy(instance->metadata.sha256sum, h, hlen); + instance->metadata.sha256sum_set = true; + } + } + + left -= (e - p) + 1; + p = e + 1; + + line_nr++; + } + + if (!ci && web_cache) { + r = web_cache_add_item(web_cache, rr->path, verify, manifest, manifest_size); + if (r < 0) + log_debug_errno(r, "Failed to add manifest '%s' to cache, ignoring: %m", rr->path); + else + log_debug("Added manifest '%s' to cache.", rr->path); + } + + return 0; +} + +static int instance_cmp(Instance *const*a, Instance *const*b) { + int r; + + assert(a); + assert(b); + assert(*a); + assert(*b); + assert((*a)->metadata.version); + assert((*b)->metadata.version); + + /* Newest version at the beginning */ + r = strverscmp_improved((*a)->metadata.version, 
(*b)->metadata.version); + if (r != 0) + return -r; + + /* Instances don't have to be uniquely named (uniqueness on partition tables is not enforced at all, + * and since we allow multiple matching patterns not even in directories they are unique). Hence + * let's order by path as secondary ordering key. */ + return path_compare((*a)->path, (*b)->path); +} + +int resource_load_instances(Resource *rr, bool verify, Hashmap **web_cache) { + int r; + + assert(rr); + + switch (rr->type) { + + case RESOURCE_TAR: + case RESOURCE_REGULAR_FILE: + r = resource_load_from_directory(rr, S_IFREG); + break; + + case RESOURCE_DIRECTORY: + case RESOURCE_SUBVOLUME: + r = resource_load_from_directory(rr, S_IFDIR); + break; + + case RESOURCE_PARTITION: + r = resource_load_from_blockdev(rr); + break; + + case RESOURCE_URL_FILE: + case RESOURCE_URL_TAR: + r = resource_load_from_web(rr, verify, web_cache); + break; + + default: + assert_not_reached(); + } + if (r < 0) + return r; + + typesafe_qsort(rr->instances, rr->n_instances, instance_cmp); + return 0; +} + +Instance* resource_find_instance(Resource *rr, const char *version) { + Instance key = { + .metadata.version = (char*) version, + }, *k = &key; + + Instance **found; + found = typesafe_bsearch(&k, rr->instances, rr->n_instances, instance_cmp); + if (!found) + return NULL; + + return *found; +} + +int resource_resolve_path( + Resource *rr, + const char *root, + const char *node) { + + _cleanup_free_ char *p = NULL; + dev_t d; + int r; + + assert(rr); + + if (IN_SET(rr->path_relative_to, PATH_RELATIVE_TO_ESP, PATH_RELATIVE_TO_XBOOTLDR, PATH_RELATIVE_TO_BOOT) && + !IN_SET(rr->type, RESOURCE_REGULAR_FILE, RESOURCE_DIRECTORY)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Paths relative to %s are only allowed for regular-file or directory resources.", + path_relative_to_to_string(rr->path_relative_to)); + + if (rr->path_auto) { + struct stat orig_root_stats; + + /* NB: If the root mount has been replaced by some form of 
volatile file system (overlayfs), + * the original root block device node is symlinked in /run/systemd/volatile-root. Let's + * follow that link here. If that doesn't exist, we check the backing device of "/usr". We + * don't actually check the backing device of the root fs "/", in order to support + * environments where the root fs is a tmpfs, and the OS itself placed exclusively in + * /usr/. */ + + if (rr->type != RESOURCE_PARTITION) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Automatic root path discovery only supported for partition resources."); + + if (node) { /* If --image= is specified, directly use the loopback device */ + r = free_and_strdup_warn(&rr->path, node); + if (r < 0) + return r; + + return 0; + } + + if (root) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Block device is not allowed when using --root= mode."); + + r = stat("/run/systemd/volatile-root", &orig_root_stats); + if (r < 0) { + if (errno == ENOENT) /* volatile-root not found */ + r = get_block_device_harder("/usr/", &d); + else + return log_error_errno(r, "Failed to stat /run/systemd/volatile-root: %m"); + } else if (!S_ISBLK(orig_root_stats.st_mode)) /* symlink was present but not block device */ + return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "/run/systemd/volatile-root is not linked to a block device."); + else /* symlink was present and a block device */ + d = orig_root_stats.st_rdev; + + } else if (rr->type == RESOURCE_PARTITION) { + _cleanup_close_ int fd = -EBADF, real_fd = -EBADF; + _cleanup_free_ char *resolved = NULL; + struct stat st; + + r = chase(rr->path, root, CHASE_PREFIX_ROOT, &resolved, &fd); + if (r < 0) + return log_error_errno(r, "Failed to resolve '%s': %m", rr->path); + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", resolved); + + if (S_ISBLK(st.st_mode) && root) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "When using --root= or --image= access to device nodes is prohibited."); + + if 
(S_ISREG(st.st_mode) || S_ISBLK(st.st_mode)) { + /* Not a directory, hence no need to find backing block device for the path */ + free_and_replace(rr->path, resolved); + return 0; + } + + if (!S_ISDIR(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "Target path '%s' does not refer to regular file, directory or block device, refusing.", rr->path); + + if (node) { /* If --image= is specified all file systems are backed by the same loopback device, hence shortcut things. */ + r = free_and_strdup_warn(&rr->path, node); + if (r < 0) + return r; + + return 0; + } + + real_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (real_fd < 0) + return log_error_errno(real_fd, "Failed to convert O_PATH file descriptor for %s to regular file descriptor: %m", rr->path); + + r = get_block_device_harder_fd(fd, &d); + + } else if (RESOURCE_IS_FILESYSTEM(rr->type)) { + _cleanup_free_ char *resolved = NULL, *relative_to = NULL; + ChaseFlags chase_flags = CHASE_PREFIX_ROOT; + + if (rr->path_relative_to == PATH_RELATIVE_TO_ROOT) { + relative_to = strdup(empty_to_root(root)); + if (!relative_to) + return log_oom(); + } else { /* boot, esp, or xbootldr */ + r = 0; + if (IN_SET(rr->path_relative_to, PATH_RELATIVE_TO_BOOT, PATH_RELATIVE_TO_XBOOTLDR)) + r = find_xbootldr_and_warn(root, NULL, /* unprivileged_mode= */ -1, &relative_to, NULL, NULL); + if (r == -ENOKEY || rr->path_relative_to == PATH_RELATIVE_TO_ESP) + r = find_esp_and_warn(root, NULL, -1, &relative_to, NULL, NULL, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve $BOOT: %m"); + log_debug("Resolved $BOOT to '%s'", relative_to); + + /* Since this partition is read from EFI, there should be no symlinks */ + chase_flags |= CHASE_PROHIBIT_SYMLINKS; + } + + r = chase(rr->path, relative_to, chase_flags, &resolved, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve '%s' (relative to '%s'): %m", rr->path, relative_to); + + free_and_replace(rr->path, resolved); + return 0; 
+ } else + return 0; /* Otherwise assume there's nothing to resolve */ + + if (r < 0) + return log_error_errno(r, "Failed to determine block device of file system: %m"); + + r = block_get_whole_disk(d, &d); + if (r < 0) + return log_error_errno(r, "Failed to find whole disk device for partition backing file system: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "File system is not placed on a partition block device, cannot determine whole block device backing root file system."); + + r = devname_from_devnum(S_IFBLK, d, &p); + if (r < 0) + return r; + + if (rr->path) + log_info("Automatically discovered block device '%s' from '%s'.", p, rr->path); + else + log_info("Automatically discovered root block device '%s'.", p); + + free_and_replace(rr->path, p); + return 1; +} + +static const char *resource_type_table[_RESOURCE_TYPE_MAX] = { + [RESOURCE_URL_FILE] = "url-file", + [RESOURCE_URL_TAR] = "url-tar", + [RESOURCE_TAR] = "tar", + [RESOURCE_PARTITION] = "partition", + [RESOURCE_REGULAR_FILE] = "regular-file", + [RESOURCE_DIRECTORY] = "directory", + [RESOURCE_SUBVOLUME] = "subvolume", +}; + +DEFINE_STRING_TABLE_LOOKUP(resource_type, ResourceType); + +static const char *path_relative_to_table[_PATH_RELATIVE_TO_MAX] = { + [PATH_RELATIVE_TO_ROOT] = "root", + [PATH_RELATIVE_TO_ESP] = "esp", + [PATH_RELATIVE_TO_XBOOTLDR] = "xbootldr", + [PATH_RELATIVE_TO_BOOT] = "boot", +}; + +DEFINE_STRING_TABLE_LOOKUP(path_relative_to, PathRelativeTo); diff --git a/src/sysupdate/sysupdate-resource.h b/src/sysupdate/sysupdate-resource.h new file mode 100644 index 0000000..51f74cf --- /dev/null +++ b/src/sysupdate/sysupdate-resource.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "gpt.h" +#include "hashmap.h" +#include "macro.h" + +/* Forward declare this type so that the headers below can use it */ +typedef struct Resource Resource; + +#include "sysupdate-instance.h" + +typedef enum 
ResourceType { + RESOURCE_URL_FILE, + RESOURCE_URL_TAR, + RESOURCE_TAR, + RESOURCE_PARTITION, + RESOURCE_REGULAR_FILE, + RESOURCE_DIRECTORY, + RESOURCE_SUBVOLUME, + _RESOURCE_TYPE_MAX, + _RESOURCE_TYPE_INVALID = -EINVAL, +} ResourceType; + +static inline bool RESOURCE_IS_SOURCE(ResourceType t) { + return IN_SET(t, + RESOURCE_URL_FILE, + RESOURCE_URL_TAR, + RESOURCE_TAR, + RESOURCE_REGULAR_FILE, + RESOURCE_DIRECTORY, + RESOURCE_SUBVOLUME); +} + +static inline bool RESOURCE_IS_TARGET(ResourceType t) { + return IN_SET(t, + RESOURCE_PARTITION, + RESOURCE_REGULAR_FILE, + RESOURCE_DIRECTORY, + RESOURCE_SUBVOLUME); +} + +/* Returns true for all resources that deal with file system objects, i.e. where we operate on top of the + * file system layer, instead of below. */ +static inline bool RESOURCE_IS_FILESYSTEM(ResourceType t) { + return IN_SET(t, + RESOURCE_TAR, + RESOURCE_REGULAR_FILE, + RESOURCE_DIRECTORY, + RESOURCE_SUBVOLUME); +} + +static inline bool RESOURCE_IS_TAR(ResourceType t) { + return IN_SET(t, + RESOURCE_TAR, + RESOURCE_URL_TAR); +} + +static inline bool RESOURCE_IS_URL(ResourceType t) { + return IN_SET(t, + RESOURCE_URL_TAR, + RESOURCE_URL_FILE); +} + +typedef enum PathRelativeTo { + /* Please make sure to follow the naming of the corresponding PartitionDesignator enum values, + * where this makes sense, like for the following three. */ + PATH_RELATIVE_TO_ROOT, + PATH_RELATIVE_TO_ESP, + PATH_RELATIVE_TO_XBOOTLDR, + PATH_RELATIVE_TO_BOOT, /* Refers to $BOOT from the BLS. 
/* Describes one side (source or target) of a transfer: what kind of object it is, where to look for
 * its instances and which instances were found there. Release the contents with resource_destroy(). */
struct Resource {
        ResourceType type;

        /* Where to look for instances, and what to match precisely */
        char *path; /* file system path, block device node or URL; rewritten by resource_resolve_path() */
        bool path_auto; /* automatically find root path (only available if target resource, not source resource) */
        PathRelativeTo path_relative_to; /* what 'path' is relative to: root, esp, xbootldr or boot */
        char **patterns; /* NULL terminated list of match patterns, tried via pattern_match_many() */
        GptPartitionType partition_type; /* only partitions of this type are considered… */
        bool partition_type_set; /* …if this is true */

        /* All instances of this resource we found */
        Instance **instances; /* sorted by resource_load_instances(), newest version first */
        size_t n_instances;

        /* If this is a partition resource (RESOURCE_PARTITION), then how many partition slots are currently unassigned, that we can use */
        size_t n_empty; /* counts the partitions labelled "_empty" */
};
"strv.h" +#include "sync-util.h" +#include "sysupdate-pattern.h" +#include "sysupdate-resource.h" +#include "sysupdate-transfer.h" +#include "sysupdate.h" +#include "tmpfile-util.h" +#include "web-util.h" + +/* Default value for InstancesMax= for fs object targets */ +#define DEFAULT_FILE_INSTANCES_MAX 3 + +Transfer *transfer_free(Transfer *t) { + if (!t) + return NULL; + + t->temporary_path = rm_rf_subvolume_and_free(t->temporary_path); + + free(t->definition_path); + free(t->min_version); + strv_free(t->protected_versions); + free(t->current_symlink); + free(t->final_path); + + partition_info_destroy(&t->partition_info); + + resource_destroy(&t->source); + resource_destroy(&t->target); + + return mfree(t); +} + +Transfer *transfer_new(void) { + Transfer *t; + + t = new(Transfer, 1); + if (!t) + return NULL; + + *t = (Transfer) { + .source.type = _RESOURCE_TYPE_INVALID, + .target.type = _RESOURCE_TYPE_INVALID, + .remove_temporary = true, + .mode = MODE_INVALID, + .tries_left = UINT64_MAX, + .tries_done = UINT64_MAX, + .verify = true, + + /* the three flags, as configured by the user */ + .no_auto = -1, + .read_only = -1, + .growfs = -1, + + /* the read only flag, as ultimately determined */ + .install_read_only = -1, + + .partition_info = PARTITION_INFO_NULL, + }; + + return t; +} + +static const Specifier specifier_table[] = { + COMMON_SYSTEM_SPECIFIERS, + COMMON_TMP_SPECIFIERS, + {} +}; + +static int config_parse_protect_version( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + char ***protected_versions = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = specifier_printf(rvalue, NAME_MAX, specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in ProtectVersion=, ignoring: %s", 
rvalue); + return 0; + } + + if (!version_is_valid(resolved)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "ProtectVersion= string is not valid, ignoring: %s", resolved); + return 0; + } + + r = strv_extend(protected_versions, resolved); + if (r < 0) + return log_oom(); + + return 0; +} + +static int config_parse_min_version( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + char **version = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = specifier_printf(rvalue, NAME_MAX, specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in MinVersion=, ignoring: %s", rvalue); + return 0; + } + + if (!version_is_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "MinVersion= string is not valid, ignoring: %s", resolved); + return 0; + } + + return free_and_replace(*version, resolved); +} + +static int config_parse_current_symlink( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *resolved = NULL; + char **current_symlink = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = specifier_printf(rvalue, NAME_MAX, specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in CurrentSymlink=, ignoring: %s", rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue); + if (r < 0) + return 0; + + return free_and_replace(*current_symlink, resolved); +} + +static int config_parse_instances_max( + const char *unit, + const char *filename, + unsigned line, + const char *section, + 
unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t *instances_max = data, i; + int r; + + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + *instances_max = 0; /* Revert to default logic, see transfer_read_definition() */ + return 0; + } + + r = safe_atou64(rvalue, &i); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse InstancesMax= value, ignoring: %s", rvalue); + return 0; + } + + if (i < 2) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "InstancesMax= value must be at least 2, bumping: %s", rvalue); + *instances_max = 2; + } else + *instances_max = i; + + return 0; +} + +static int config_parse_resource_pattern( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***patterns = ASSERT_PTR(data); + int r; + + assert(rvalue); + + if (isempty(rvalue)) { + *patterns = strv_free(*patterns); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL, *resolved = NULL; + + r = extract_first_word(&rvalue, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to extract first pattern from MatchPattern=, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + r = specifier_printf(word, NAME_MAX, specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in MatchPattern=, ignoring: %s", rvalue); + return 0; + } + + if (!pattern_valid(resolved)) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), + "MatchPattern= string is not valid, refusing: %s", resolved); + + r = strv_consume(patterns, TAKE_PTR(resolved)); + if (r < 0) + return log_oom(); + } + + strv_uniq(*patterns); + return 0; +} + 
+static int config_parse_resource_path( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + _cleanup_free_ char *resolved = NULL; + Resource *rr = ASSERT_PTR(data); + int r; + + assert(rvalue); + + if (streq(rvalue, "auto")) { + rr->path_auto = true; + rr->path = mfree(rr->path); + return 0; + } + + r = specifier_printf(rvalue, PATH_MAX-1, specifier_table, arg_root, NULL, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to expand specifiers in Path=, ignoring: %s", rvalue); + return 0; + } + + /* Note that we don't validate the path as being absolute or normalized. We'll do that in + * transfer_read_definition() as we might not know yet whether Path refers to a URL or a file system + * path. */ + + rr->path_auto = false; + return free_and_replace(rr->path, resolved); +} + +static DEFINE_CONFIG_PARSE_ENUM(config_parse_resource_type, resource_type, ResourceType, "Invalid resource type"); + +static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_resource_path_relto, path_relative_to, PathRelativeTo, + PATH_RELATIVE_TO_ROOT, "Invalid PathRelativeTo= value"); + +static int config_parse_resource_ptype( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Resource *rr = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = gpt_partition_type_from_string(rvalue, &rr->partition_type); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed parse partition type, ignoring: %s", rvalue); + return 0; + } + + rr->partition_type_set = true; + return 0; +} + +static int config_parse_partition_uuid( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char 
*lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Transfer *t = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = sd_id128_from_string(rvalue, &t->partition_uuid); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed parse partition UUID, ignoring: %s", rvalue); + return 0; + } + + t->partition_uuid_set = true; + return 0; +} + +static int config_parse_partition_flags( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Transfer *t = ASSERT_PTR(data); + int r; + + assert(rvalue); + + r = safe_atou64(rvalue, &t->partition_flags); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed parse partition flags, ignoring: %s", rvalue); + return 0; + } + + t->partition_flags_set = true; + return 0; +} + +int transfer_read_definition(Transfer *t, const char *path) { + int r; + + assert(t); + assert(path); + + ConfigTableItem table[] = { + { "Transfer", "MinVersion", config_parse_min_version, 0, &t->min_version }, + { "Transfer", "ProtectVersion", config_parse_protect_version, 0, &t->protected_versions }, + { "Transfer", "Verify", config_parse_bool, 0, &t->verify }, + { "Source", "Type", config_parse_resource_type, 0, &t->source.type }, + { "Source", "Path", config_parse_resource_path, 0, &t->source }, + { "Source", "PathRelativeTo", config_parse_resource_path_relto, 0, &t->source.path_relative_to }, + { "Source", "MatchPattern", config_parse_resource_pattern, 0, &t->source.patterns }, + { "Target", "Type", config_parse_resource_type, 0, &t->target.type }, + { "Target", "Path", config_parse_resource_path, 0, &t->target }, + { "Target", "PathRelativeTo", config_parse_resource_path_relto, 0, &t->target.path_relative_to }, + { "Target", "MatchPattern", config_parse_resource_pattern, 0, &t->target.patterns }, + { "Target", "MatchPartitionType", 
config_parse_resource_ptype, 0, &t->target }, + { "Target", "PartitionUUID", config_parse_partition_uuid, 0, t }, + { "Target", "PartitionFlags", config_parse_partition_flags, 0, t }, + { "Target", "PartitionNoAuto", config_parse_tristate, 0, &t->no_auto }, + { "Target", "PartitionGrowFileSystem", config_parse_tristate, 0, &t->growfs }, + { "Target", "ReadOnly", config_parse_tristate, 0, &t->read_only }, + { "Target", "Mode", config_parse_mode, 0, &t->mode }, + { "Target", "TriesLeft", config_parse_uint64, 0, &t->tries_left }, + { "Target", "TriesDone", config_parse_uint64, 0, &t->tries_done }, + { "Target", "InstancesMax", config_parse_instances_max, 0, &t->instances_max }, + { "Target", "RemoveTemporary", config_parse_bool, 0, &t->remove_temporary }, + { "Target", "CurrentSymlink", config_parse_current_symlink, 0, &t->current_symlink }, + {} + }; + + r = config_parse(NULL, path, NULL, + "Transfer\0" + "Source\0" + "Target\0", + config_item_table_lookup, table, + CONFIG_PARSE_WARN, + t, + NULL); + if (r < 0) + return r; + + if (!RESOURCE_IS_SOURCE(t->source.type)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Source Type= must be one of url-file, url-tar, tar, regular-file, directory, subvolume."); + + if (t->target.type < 0) { + switch (t->source.type) { + + case RESOURCE_URL_FILE: + case RESOURCE_REGULAR_FILE: + t->target.type = + t->target.path && path_startswith(t->target.path, "/dev/") ? 
+ RESOURCE_PARTITION : RESOURCE_REGULAR_FILE; + break; + + case RESOURCE_URL_TAR: + case RESOURCE_TAR: + case RESOURCE_DIRECTORY: + t->target.type = RESOURCE_DIRECTORY; + break; + + case RESOURCE_SUBVOLUME: + t->target.type = RESOURCE_SUBVOLUME; + break; + + default: + assert_not_reached(); + } + } + + if (!RESOURCE_IS_TARGET(t->target.type)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Target Type= must be one of partition, regular-file, directory, subvolume."); + + /* File-like sources (url-file, partition, regular-file) may only feed partition or regular-file targets, + * and tree-like sources (url-tar, tar, directory, subvolume) only directory or subvolume targets. The + * message names the target type first and the source type second, hence pass them in that order. */ + if ((IN_SET(t->source.type, RESOURCE_URL_FILE, RESOURCE_PARTITION, RESOURCE_REGULAR_FILE) && + !IN_SET(t->target.type, RESOURCE_PARTITION, RESOURCE_REGULAR_FILE)) || + (IN_SET(t->source.type, RESOURCE_URL_TAR, RESOURCE_TAR, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME) && + !IN_SET(t->target.type, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME))) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Target type '%s' is incompatible with source type '%s', refusing.", + resource_type_to_string(t->target.type), resource_type_to_string(t->source.type)); + + if (!t->source.path && !t->source.path_auto) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Source specification lacks Path=."); + + if (t->source.path) { + if (RESOURCE_IS_FILESYSTEM(t->source.type) || t->source.type == RESOURCE_PARTITION) + if (!path_is_absolute(t->source.path) || !path_is_normalized(t->source.path)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Source path is not a normalized, absolute path: %s", t->source.path); + + /* We unofficially support file:// in addition to http:// and https:// for url + * sources. That's mostly for testing, since it relieves us from having to set up a HTTP + * server, and CURL abstracts this away from us thankfully.
*/ + if (RESOURCE_IS_URL(t->source.type)) + if (!http_url_is_valid(t->source.path) && !file_url_is_valid(t->source.path)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Source path is not a valid HTTP or HTTPS URL: %s", t->source.path); + } + + if (strv_isempty(t->source.patterns)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Source specification lacks MatchPattern=."); + + if (!t->target.path && !t->target.path_auto) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Target specification lacks Path= field."); + + if (t->target.path && + (!path_is_absolute(t->target.path) || !path_is_normalized(t->target.path))) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Target path is not a normalized, absolute path: %s", t->target.path); + + if (strv_isempty(t->target.patterns)) { + strv_free(t->target.patterns); + t->target.patterns = strv_copy(t->source.patterns); + if (!t->target.patterns) + return log_oom(); + } + + if (t->current_symlink && !RESOURCE_IS_FILESYSTEM(t->target.type) && !path_is_absolute(t->current_symlink)) + return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL), + "Current symlink must be absolute path if target is partition: %s", t->current_symlink); + + /* When no instance limit is set, use all available partition slots in case of partitions, or 3 in case of fs objects */ + if (t->instances_max == 0) + t->instances_max = t->target.type == RESOURCE_PARTITION ? UINT64_MAX : DEFAULT_FILE_INSTANCES_MAX; + + return 0; +} + +int transfer_resolve_paths( + Transfer *t, + const char *root, + const char *node) { + + int r; + + /* If Path=auto is used in [Source] or [Target] sections, let's automatically detect the path of the + * block device to use. Moreover, if this path points to a directory but we need a block device, + * automatically determine the backing block device, so that users can reference block devices by + * mount point. 
*/ + + assert(t); + + r = resource_resolve_path(&t->source, root, node); + if (r < 0) + return r; + + r = resource_resolve_path(&t->target, root, node); + if (r < 0) + return r; + + return 0; +} + +static void transfer_remove_temporary(Transfer *t) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(t); + + if (!t->remove_temporary) + return; + + if (!IN_SET(t->target.type, RESOURCE_REGULAR_FILE, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME)) + return; + + /* Removes all temporary files/dirs from previous runs in the target directory, i.e. all those starting with '.#' */ + + d = opendir(t->target.path); + if (!d) { + if (errno == ENOENT) + return; + + log_debug_errno(errno, "Failed to open target directory '%s', ignoring: %m", t->target.path); + return; + } + + for (;;) { + struct dirent *de; + + errno = 0; + de = readdir_no_dot(d); + if (!de) { + if (errno != 0) + log_debug_errno(errno, "Failed to read target directory '%s', ignoring: %m", t->target.path); + break; + } + + if (!startswith(de->d_name, ".#")) + continue; + + r = rm_rf_child(dirfd(d), de->d_name, REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_CHMOD); + if (r == -ENOENT) + continue; + if (r < 0) { + log_warning_errno(r, "Failed to remove temporary resource instance '%s/%s', ignoring: %m", t->target.path, de->d_name); + continue; + } + + log_debug("Removed temporary resource instance '%s/%s'.", t->target.path, de->d_name); + } +} + +int transfer_vacuum( + Transfer *t, + uint64_t space, + const char *extra_protected_version) { + + uint64_t instances_max, limit; + int r, count = 0; + + assert(t); + + transfer_remove_temporary(t); + + /* First, calculate how many instances to keep, based on the instance limit — but keep at least one */ + + instances_max = arg_instances_max != UINT64_MAX ? arg_instances_max : t->instances_max; + assert(instances_max >= 1); + if (instances_max == UINT64_MAX) /* Keep infinite instances? 
*/ + limit = UINT64_MAX; + else if (space > instances_max) + return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Asked to delete more instances than total maximum allowed number of instances, refusing."); + else if (space == instances_max) + return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Asked to delete all possible instances, can't allow that. One instance must always remain."); + else + limit = instances_max - space; + + if (t->target.type == RESOURCE_PARTITION) { + uint64_t rm, remain; + + /* If we are looking at a partition table, we also have to take into account how many + * partition slots of the right type are available */ + + if (t->target.n_empty + t->target.n_instances < 2) + return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Partition table has less than two partition slots of the right type " SD_ID128_UUID_FORMAT_STR " (%s), refusing.", + SD_ID128_FORMAT_VAL(t->target.partition_type.uuid), + gpt_partition_type_uuid_to_string(t->target.partition_type.uuid)); + if (space > t->target.n_empty + t->target.n_instances) + return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Partition table does not have enough partition slots of right type " SD_ID128_UUID_FORMAT_STR " (%s) for operation.", + SD_ID128_FORMAT_VAL(t->target.partition_type.uuid), + gpt_partition_type_uuid_to_string(t->target.partition_type.uuid)); + if (space == t->target.n_empty + t->target.n_instances) + return log_error_errno(SYNTHETIC_ERRNO(ENOSPC), + "Asked to empty all partition table slots of the right type " SD_ID128_UUID_FORMAT_STR " (%s), can't allow that. 
One instance must always remain.", + SD_ID128_FORMAT_VAL(t->target.partition_type.uuid), + gpt_partition_type_uuid_to_string(t->target.partition_type.uuid)); + + rm = LESS_BY(space, t->target.n_empty); + remain = LESS_BY(t->target.n_instances, rm); + limit = MIN(limit, remain); + } + + while (t->target.n_instances > limit) { + Instance *oldest; + size_t p = t->target.n_instances - 1; + + for (;;) { + oldest = t->target.instances[p]; + assert(oldest); + + /* If this is listed among the protected versions, then let's not remove it */ + if (!strv_contains(t->protected_versions, oldest->metadata.version) && + (!extra_protected_version || !streq(extra_protected_version, oldest->metadata.version))) + break; + + log_debug("Version '%s' is protected, not removing.", oldest->metadata.version); + if (p == 0) { + oldest = NULL; + break; + } + + p--; + } + + if (!oldest) /* Nothing more to remove */ + break; + + assert(oldest->resource); + + log_info("%s Removing old '%s' (%s).", special_glyph(SPECIAL_GLYPH_RECYCLING), oldest->path, resource_type_to_string(oldest->resource->type)); + + switch (t->target.type) { + + case RESOURCE_REGULAR_FILE: + case RESOURCE_DIRECTORY: + case RESOURCE_SUBVOLUME: + r = rm_rf(oldest->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_MISSING_OK|REMOVE_CHMOD); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to make room, deleting '%s' failed: %m", oldest->path); + + (void) rmdir_parents(oldest->path, t->target.path); + + break; + + case RESOURCE_PARTITION: { + PartitionInfo pinfo = oldest->partition_info; + + /* label "_empty" means "no contents" for our purposes */ + pinfo.label = (char*) "_empty"; + + r = patch_partition(t->target.path, &pinfo, PARTITION_LABEL); + if (r < 0) + return r; + + t->target.n_empty++; + break; + } + + default: + assert_not_reached(); + break; + } + + instance_free(oldest); + memmove(t->target.instances + p, t->target.instances + p + 1, (t->target.n_instances - p - 1) * sizeof(Instance*)); + 
t->target.n_instances--; + + count++; + } + + return count; +} + +static void compile_pattern_fields( + const Transfer *t, + const Instance *i, + InstanceMetadata *ret) { + + assert(t); + assert(i); + assert(ret); + + *ret = (InstanceMetadata) { + .version = i->metadata.version, + + /* We generally prefer explicitly configured values for the transfer over those automatically + * derived from the source instance. Also, if the source is a tar archive, then let's not + * patch mtime/mode and use the one embedded in the tar file */ + .partition_uuid = t->partition_uuid_set ? t->partition_uuid : i->metadata.partition_uuid, + .partition_uuid_set = t->partition_uuid_set || i->metadata.partition_uuid_set, + .partition_flags = t->partition_flags_set ? t->partition_flags : i->metadata.partition_flags, + .partition_flags_set = t->partition_flags_set || i->metadata.partition_flags_set, + .mtime = RESOURCE_IS_TAR(i->resource->type) ? USEC_INFINITY : i->metadata.mtime, + .mode = t->mode != MODE_INVALID ? t->mode : (RESOURCE_IS_TAR(i->resource->type) ? MODE_INVALID : i->metadata.mode), + .size = i->metadata.size, + .tries_done = t->tries_done != UINT64_MAX ? t->tries_done : + i->metadata.tries_done != UINT64_MAX ? i->metadata.tries_done : 0, + .tries_left = t->tries_left != UINT64_MAX ? t->tries_left : + i->metadata.tries_left != UINT64_MAX ? i->metadata.tries_left : 3, + .no_auto = t->no_auto >= 0 ? t->no_auto : i->metadata.no_auto, + .read_only = t->read_only >= 0 ? t->read_only : i->metadata.read_only, + .growfs = t->growfs >= 0 ? 
t->growfs : i->metadata.growfs, + .sha256sum_set = i->metadata.sha256sum_set, + }; + + memcpy(ret->sha256sum, i->metadata.sha256sum, sizeof(ret->sha256sum)); +} + +static int run_helper( + const char *name, + const char *path, + const char * const cmdline[]) { + + int r; + + assert(name); + assert(path); + assert(cmdline); + + r = safe_fork(name, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + + execv(path, (char *const*) cmdline); + log_error_errno(errno, "Failed to execute %s tool: %m", path); + _exit(EXIT_FAILURE); + } + + return 0; +} + +int transfer_acquire_instance(Transfer *t, Instance *i) { + _cleanup_free_ char *formatted_pattern = NULL, *digest = NULL; + char offset[DECIMAL_STR_MAX(uint64_t)+1], max_size[DECIMAL_STR_MAX(uint64_t)+1]; + const char *where = NULL; + InstanceMetadata f; + Instance *existing; + int r; + + assert(t); + assert(i); + assert(i->resource); + assert(t == container_of(i->resource, Transfer, source)); + + /* Does this instance already exist in the target? 
Then we don't need to acquire anything */ + existing = resource_find_instance(&t->target, i->metadata.version); + if (existing) { + log_info("No need to acquire '%s', already installed.", i->path); + return 0; + } + + assert(!t->final_path); + assert(!t->temporary_path); + assert(!strv_isempty(t->target.patterns)); + + /* Format the target name using the first pattern specified */ + compile_pattern_fields(t, i, &f); + r = pattern_format(t->target.patterns[0], &f, &formatted_pattern); + if (r < 0) + return log_error_errno(r, "Failed to format target pattern: %m"); + + if (RESOURCE_IS_FILESYSTEM(t->target.type)) { + + if (!path_is_valid_full(formatted_pattern, /* accept_dot_dot = */ false)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Formatted pattern is not suitable as file name, refusing: %s", formatted_pattern); + + t->final_path = path_join(t->target.path, formatted_pattern); + if (!t->final_path) + return log_oom(); + + r = mkdir_parents(t->final_path, 0755); + if (r < 0) + return log_error_errno(r, "Cannot create target directory: %m"); + + r = tempfn_random(t->final_path, "sysupdate", &t->temporary_path); + if (r < 0) + return log_error_errno(r, "Failed to generate temporary target path: %m"); + + where = t->final_path; + } + + if (t->target.type == RESOURCE_PARTITION) { + r = gpt_partition_label_valid(formatted_pattern); + if (r < 0) + return log_error_errno(r, "Failed to determine if formatted pattern is suitable as GPT partition label: %s", formatted_pattern); + if (!r) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Formatted pattern is not suitable as GPT partition label, refusing: %s", formatted_pattern); + + r = find_suitable_partition( + t->target.path, + i->metadata.size, + t->target.partition_type_set ? 
&t->target.partition_type.uuid : NULL, + &t->partition_info); + if (r < 0) + return r; + + xsprintf(offset, "%" PRIu64, t->partition_info.start); + xsprintf(max_size, "%" PRIu64, t->partition_info.size); + + where = t->partition_info.device; + } + + assert(where); + + log_info("%s Acquiring %s %s %s...", special_glyph(SPECIAL_GLYPH_DOWNLOAD), i->path, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), where); + + if (RESOURCE_IS_URL(i->resource->type)) { + /* For URL sources we require the SHA256 sum to be known so that we can validate the + * download. */ + + if (!i->metadata.sha256sum_set) + return log_error_errno(r, "SHA256 checksum not known for download '%s', refusing.", i->path); + + digest = hexmem(i->metadata.sha256sum, sizeof(i->metadata.sha256sum)); + if (!digest) + return log_oom(); + } + + switch (i->resource->type) { /* Source */ + + case RESOURCE_REGULAR_FILE: + + switch (t->target.type) { /* Target */ + + case RESOURCE_REGULAR_FILE: + + /* regular file → regular file (why fork off systemd-import for such a simple file + * copy case? implicit decompression mostly, and thus also sandboxing. Also, the + * importer has some tricks up its sleeve, such as sparse file generation, which we + * want to take benefit of, too.) */ + + r = run_helper("(sd-import-raw)", + import_binary_path(), + (const char* const[]) { + "systemd-import", + "raw", + "--direct", /* just copy/unpack the specified file, don't do anything else */ + arg_sync ? "--sync=yes" : "--sync=no", + i->path, + t->temporary_path, + NULL + }); + break; + + case RESOURCE_PARTITION: + + /* regular file → partition */ + + r = run_helper("(sd-import-raw)", + import_binary_path(), + (const char* const[]) { + "systemd-import", + "raw", + "--direct", /* just copy/unpack the specified file, don't do anything else */ + "--offset", offset, + "--size-max", max_size, + arg_sync ? 
"--sync=yes" : "--sync=no", + i->path, + t->target.path, + NULL + }); + break; + + default: + assert_not_reached(); + } + + break; + + case RESOURCE_DIRECTORY: + case RESOURCE_SUBVOLUME: + assert(IN_SET(t->target.type, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME)); + + /* directory/subvolume → directory/subvolume */ + + r = run_helper("(sd-import-fs)", + import_fs_binary_path(), + (const char* const[]) { + "systemd-import-fs", + "run", + "--direct", /* just untar the specified file, don't do anything else */ + arg_sync ? "--sync=yes" : "--sync=no", + t->target.type == RESOURCE_SUBVOLUME ? "--btrfs-subvol=yes" : "--btrfs-subvol=no", + i->path, + t->temporary_path, + NULL + }); + break; + + case RESOURCE_TAR: + assert(IN_SET(t->target.type, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME)); + + /* tar → directory/subvolume */ + + r = run_helper("(sd-import-tar)", + import_binary_path(), + (const char* const[]) { + "systemd-import", + "tar", + "--direct", /* just untar the specified file, don't do anything else */ + arg_sync ? "--sync=yes" : "--sync=no", + t->target.type == RESOURCE_SUBVOLUME ? "--btrfs-subvol=yes" : "--btrfs-subvol=no", + i->path, + t->temporary_path, + NULL + }); + break; + + case RESOURCE_URL_FILE: + + switch (t->target.type) { + + case RESOURCE_REGULAR_FILE: + + /* url file → regular file */ + + r = run_helper("(sd-pull-raw)", + pull_binary_path(), + (const char* const[]) { + "systemd-pull", + "raw", + "--direct", /* just download the specified URL, don't download anything else */ + "--verify", digest, /* validate by explicit SHA256 sum */ + arg_sync ? 
"--sync=yes" : "--sync=no", + i->path, + t->temporary_path, + NULL + }); + break; + + case RESOURCE_PARTITION: + + /* url file → partition */ + + r = run_helper("(sd-pull-raw)", + pull_binary_path(), + (const char* const[]) { + "systemd-pull", + "raw", + "--direct", /* just download the specified URL, don't download anything else */ + "--verify", digest, /* validate by explicit SHA256 sum */ + "--offset", offset, + "--size-max", max_size, + arg_sync ? "--sync=yes" : "--sync=no", + i->path, + t->target.path, + NULL + }); + break; + + default: + assert_not_reached(); + } + + break; + + case RESOURCE_URL_TAR: + assert(IN_SET(t->target.type, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME)); + + r = run_helper("(sd-pull-tar)", + pull_binary_path(), + (const char*const[]) { + "systemd-pull", + "tar", + "--direct", /* just download the specified URL, don't download anything else */ + "--verify", digest, /* validate by explicit SHA256 sum */ + t->target.type == RESOURCE_SUBVOLUME ? "--btrfs-subvol=yes" : "--btrfs-subvol=no", + arg_sync ? "--sync=yes" : "--sync=no", + i->path, + t->temporary_path, + NULL + }); + break; + + default: + assert_not_reached(); + } + if (r < 0) + return r; + + if (RESOURCE_IS_FILESYSTEM(t->target.type)) { + bool need_sync = false; + assert(t->temporary_path); + + /* Apply file attributes if set */ + if (f.mtime != USEC_INFINITY) { + struct timespec ts; + + timespec_store(&ts, f.mtime); + + if (utimensat(AT_FDCWD, t->temporary_path, (struct timespec[2]) { ts, ts }, AT_SYMLINK_NOFOLLOW) < 0) + return log_error_errno(errno, "Failed to adjust mtime of '%s': %m", t->temporary_path); + + need_sync = true; + } + + if (f.mode != MODE_INVALID) { + /* Try with AT_SYMLINK_NOFOLLOW first, because it's the safe thing to do. Older + * kernels don't support that however, in that case we fall back to chmod(). Not as + * safe, but shouldn't be a problem, given that we don't create symlinks here. 
*/ + if (fchmodat(AT_FDCWD, t->temporary_path, f.mode, AT_SYMLINK_NOFOLLOW) < 0 && + (!ERRNO_IS_NOT_SUPPORTED(errno) || chmod(t->temporary_path, f.mode) < 0)) + return log_error_errno(errno, "Failed to adjust mode of '%s': %m", t->temporary_path); + + need_sync = true; + } + + /* Synchronize */ + if (arg_sync && need_sync) { + if (t->target.type == RESOURCE_REGULAR_FILE) + r = fsync_path_and_parent_at(AT_FDCWD, t->temporary_path); + else { + assert(IN_SET(t->target.type, RESOURCE_DIRECTORY, RESOURCE_SUBVOLUME)); + r = syncfs_path(AT_FDCWD, t->temporary_path); + } + if (r < 0) + return log_error_errno(r, "Failed to synchronize file system backing '%s': %m", t->temporary_path); + } + + t->install_read_only = f.read_only; + } + + if (t->target.type == RESOURCE_PARTITION) { + free_and_replace(t->partition_info.label, formatted_pattern); + t->partition_change = PARTITION_LABEL; + + if (f.partition_uuid_set) { + t->partition_info.uuid = f.partition_uuid; + t->partition_change |= PARTITION_UUID; + } + + if (f.partition_flags_set) { + t->partition_info.flags = f.partition_flags; + t->partition_change |= PARTITION_FLAGS; + } + + if (f.no_auto >= 0) { + t->partition_info.no_auto = f.no_auto; + t->partition_change |= PARTITION_NO_AUTO; + } + + if (f.read_only >= 0) { + t->partition_info.read_only = f.read_only; + t->partition_change |= PARTITION_READ_ONLY; + } + + if (f.growfs >= 0) { + t->partition_info.growfs = f.growfs; + t->partition_change |= PARTITION_GROWFS; + } + } + + /* For regular file cases the only step left is to install the file in place, which install_file() + * will do via rename(). For partition cases the only step left is to update the partition table, + * which is done at the same place. 
*/ + + log_info("Successfully acquired '%s'.", i->path); + return 0; +} + +int transfer_install_instance( + Transfer *t, + Instance *i, + const char *root) { + + int r; + + assert(t); + assert(i); + assert(i->resource); + assert(t == container_of(i->resource, Transfer, source)); + + if (t->temporary_path) { + assert(RESOURCE_IS_FILESYSTEM(t->target.type)); + assert(t->final_path); + + r = install_file(AT_FDCWD, t->temporary_path, + AT_FDCWD, t->final_path, + INSTALL_REPLACE| + (t->install_read_only > 0 ? INSTALL_READ_ONLY : 0)| + (t->target.type == RESOURCE_REGULAR_FILE ? INSTALL_FSYNC_FULL : INSTALL_SYNCFS)); + if (r < 0) + return log_error_errno(r, "Failed to move '%s' into place: %m", t->final_path); + + log_info("Successfully installed '%s' (%s) as '%s' (%s).", + i->path, + resource_type_to_string(i->resource->type), + t->final_path, + resource_type_to_string(t->target.type)); + + t->temporary_path = mfree(t->temporary_path); + } + + if (t->partition_change != 0) { + assert(t->target.type == RESOURCE_PARTITION); + + r = patch_partition( + t->target.path, + &t->partition_info, + t->partition_change); + if (r < 0) + return r; + + log_info("Successfully installed '%s' (%s) as '%s' (%s).", + i->path, + resource_type_to_string(i->resource->type), + t->partition_info.device, + resource_type_to_string(t->target.type)); + } + + if (t->current_symlink) { + _cleanup_free_ char *buf = NULL, *parent = NULL, *relative = NULL, *resolved = NULL; + const char *link_path, *link_target; + bool resolve_link_path = false; + + if (RESOURCE_IS_FILESYSTEM(t->target.type)) { + + assert(t->target.path); + + if (path_is_absolute(t->current_symlink)) { + link_path = t->current_symlink; + resolve_link_path = true; + } else { + buf = path_make_absolute(t->current_symlink, t->target.path); + if (!buf) + return log_oom(); + + link_path = buf; + } + + link_target = t->final_path; + + } else if (t->target.type == RESOURCE_PARTITION) { + + assert(path_is_absolute(t->current_symlink)); + + 
link_path = t->current_symlink; + link_target = t->partition_info.device; + + resolve_link_path = true; + } else + assert_not_reached(); + + if (resolve_link_path && root) { + r = chase(link_path, root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve current symlink path '%s': %m", link_path); + + link_path = resolved; + } + + if (link_target) { + r = path_extract_directory(link_path, &parent); + if (r < 0) + return log_error_errno(r, "Failed to extract directory of target path '%s': %m", link_path); + + r = path_make_relative(parent, link_target, &relative); + if (r < 0) + return log_error_errno(r, "Failed to make symlink path '%s' relative to '%s': %m", link_target, parent); + + r = symlink_atomic(relative, link_path); + if (r < 0) + return log_error_errno(r, "Failed to update current symlink '%s' %s '%s': %m", + link_path, + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + relative); + + log_info("Updated symlink '%s' %s '%s'.", + link_path, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), relative); + } + } + + return 0; +} diff --git a/src/sysupdate/sysupdate-transfer.h b/src/sysupdate/sysupdate-transfer.h new file mode 100644 index 0000000..b0c2a6e --- /dev/null +++ b/src/sysupdate/sysupdate-transfer.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +#include "sd-id128.h" + +/* Forward declare this type so that the headers below can use it */ +typedef struct Transfer Transfer; + +#include "sysupdate-partition.h" +#include "sysupdate-resource.h" + +struct Transfer { + char *definition_path; + char *min_version; + char **protected_versions; + char *current_symlink; + bool verify; + + Resource source, target; + + uint64_t instances_max; + bool remove_temporary; + + /* When creating a new partition/file, optionally override these attributes explicitly */ + sd_id128_t partition_uuid; + bool partition_uuid_set; + uint64_t partition_flags; + bool 
partition_flags_set; + mode_t mode; + uint64_t tries_left, tries_done; + int no_auto; + int read_only; + int growfs; + + /* If we create a new file/dir/subvol in the fs, the temporary and final path we create it under, as well as the read-only flag for it */ + char *temporary_path; + char *final_path; + int install_read_only; + + /* If we write to a partition in a partition table, the metrics of it */ + PartitionInfo partition_info; + PartitionChange partition_change; +}; + +Transfer *transfer_new(void); + +Transfer *transfer_free(Transfer *t); +DEFINE_TRIVIAL_CLEANUP_FUNC(Transfer*, transfer_free); + +int transfer_read_definition(Transfer *t, const char *path); + +int transfer_resolve_paths(Transfer *t, const char *root, const char *node); + +int transfer_vacuum(Transfer *t, uint64_t space, const char *extra_protected_version); + +int transfer_acquire_instance(Transfer *t, Instance *i); + +int transfer_install_instance(Transfer *t, Instance *i, const char *root); diff --git a/src/sysupdate/sysupdate-update-set.c b/src/sysupdate/sysupdate-update-set.c new file mode 100644 index 0000000..6d6051d --- /dev/null +++ b/src/sysupdate/sysupdate-update-set.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "glyph-util.h" +#include "string-util.h" +#include "sysupdate-update-set.h" +#include "terminal-util.h" + +UpdateSet *update_set_free(UpdateSet *us) { + if (!us) + return NULL; + + free(us->version); + free(us->instances); /* The objects referenced by this array are freed via resource_free(), not us */ + + return mfree(us); +} + +int update_set_cmp(UpdateSet *const*a, UpdateSet *const*b) { + assert(a); + assert(b); + assert(*a); + assert(*b); + assert((*a)->version); + assert((*b)->version); + + /* Newest version at the beginning */ + return -strverscmp_improved((*a)->version, (*b)->version); +} + +const char *update_set_flags_to_color(UpdateSetFlags flags) { + + if (flags == 0 || (flags & UPDATE_OBSOLETE)) + return 
(flags & UPDATE_NEWEST) ? ansi_highlight_grey() : ansi_grey(); + + if (FLAGS_SET(flags, UPDATE_INSTALLED|UPDATE_NEWEST)) + return ansi_highlight(); + + if (FLAGS_SET(flags, UPDATE_INSTALLED|UPDATE_PROTECTED)) + return ansi_highlight_magenta(); + + if ((flags & (UPDATE_AVAILABLE|UPDATE_INSTALLED|UPDATE_NEWEST|UPDATE_OBSOLETE)) == (UPDATE_AVAILABLE|UPDATE_NEWEST)) + return ansi_highlight_green(); + + return NULL; +} + +const char *update_set_flags_to_glyph(UpdateSetFlags flags) { + + if (flags == 0 || (flags & UPDATE_OBSOLETE)) + return special_glyph(SPECIAL_GLYPH_MULTIPLICATION_SIGN); + + if (FLAGS_SET(flags, UPDATE_INSTALLED|UPDATE_NEWEST)) + return special_glyph(SPECIAL_GLYPH_BLACK_CIRCLE); + + if (FLAGS_SET(flags, UPDATE_INSTALLED|UPDATE_PROTECTED)) + return special_glyph(SPECIAL_GLYPH_WHITE_CIRCLE); + + if ((flags & (UPDATE_AVAILABLE|UPDATE_INSTALLED|UPDATE_NEWEST|UPDATE_OBSOLETE)) == (UPDATE_AVAILABLE|UPDATE_NEWEST)) + return special_glyph(SPECIAL_GLYPH_CIRCLE_ARROW); + + return " "; +} diff --git a/src/sysupdate/sysupdate-update-set.h b/src/sysupdate/sysupdate-update-set.h new file mode 100644 index 0000000..5dd94bc --- /dev/null +++ b/src/sysupdate/sysupdate-update-set.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include +#include + +typedef struct UpdateSet UpdateSet; + +#include "sysupdate-instance.h" + +typedef enum UpdateSetFlags { + UPDATE_NEWEST = 1 << 0, + UPDATE_AVAILABLE = 1 << 1, + UPDATE_INSTALLED = 1 << 2, + UPDATE_OBSOLETE = 1 << 3, + UPDATE_PROTECTED = 1 << 4, +} UpdateSetFlags; + +struct UpdateSet { + UpdateSetFlags flags; + char *version; + Instance **instances; + size_t n_instances; +}; + +UpdateSet *update_set_free(UpdateSet *us); + +int update_set_cmp(UpdateSet *const*a, UpdateSet *const*b); + +const char *update_set_flags_to_color(UpdateSetFlags flags); +const char *update_set_flags_to_glyph(UpdateSetFlags flags); diff --git a/src/sysupdate/sysupdate-util.c 
b/src/sysupdate/sysupdate-util.c new file mode 100644 index 0000000..eacc592 --- /dev/null +++ b/src/sysupdate/sysupdate-util.c @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "path-util.h" +#include "sysupdate-util.h" diff --git a/src/sysupdate/sysupdate.c b/src/sysupdate/sysupdate.c new file mode 100644 index 0000000..023eaac --- /dev/null +++ b/src/sysupdate/sysupdate.c @@ -0,0 +1,1416 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "chase.h" +#include "conf-files.h" +#include "constants.h" +#include "dirent-util.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "format-table.h" +#include "glyph-util.h" +#include "hexdecoct.h" +#include "login-util.h" +#include "main-func.h" +#include "mount-util.h" +#include "os-util.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "set.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" +#include "sysupdate-transfer.h" +#include "sysupdate-update-set.h" +#include "sysupdate.h" +#include "terminal-util.h" +#include "utf8.h" +#include "verbs.h" + +static char *arg_definitions = NULL; +bool arg_sync = true; +uint64_t arg_instances_max = UINT64_MAX; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +char *arg_root = NULL; +static char *arg_image = NULL; +static bool arg_reboot = false; +static char *arg_component = NULL; +static int arg_verify = -1; +static ImagePolicy *arg_image_policy = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_definitions, freep); +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_component, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +typedef struct Context { + 
Transfer **transfers; + size_t n_transfers; + + UpdateSet **update_sets; + size_t n_update_sets; + + UpdateSet *newest_installed, *candidate; + + Hashmap *web_cache; /* Cache for downloaded resources, keyed by URL */ +} Context; + +static Context *context_free(Context *c) { + if (!c) + return NULL; + + for (size_t i = 0; i < c->n_transfers; i++) + transfer_free(c->transfers[i]); + free(c->transfers); + + for (size_t i = 0; i < c->n_update_sets; i++) + update_set_free(c->update_sets[i]); + free(c->update_sets); + + hashmap_free(c->web_cache); + + return mfree(c); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Context*, context_free); + +static Context *context_new(void) { + /* For now, no fields to initialize non-zero */ + return new0(Context, 1); +} + +static int context_read_definitions( + Context *c, + const char *directory, + const char *component, + const char *root, + const char *node) { + + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(c); + + if (directory) + r = conf_files_list_strv(&files, ".conf", NULL, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, (const char**) STRV_MAKE(directory)); + else if (component) { + _cleanup_strv_free_ char **n = NULL; + char **l = CONF_PATHS_STRV(""); + size_t k = 0; + + n = new0(char*, strv_length(l) + 1); + if (!n) + return log_oom(); + + STRV_FOREACH(i, l) { + char *j; + + j = strjoin(*i, "sysupdate.", component, ".d"); + if (!j) + return log_oom(); + + n[k++] = j; + } + + r = conf_files_list_strv(&files, ".conf", root, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, (const char**) n); + } else + r = conf_files_list_strv(&files, ".conf", root, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, (const char**) CONF_PATHS_STRV("sysupdate.d")); + if (r < 0) + return log_error_errno(r, "Failed to enumerate *.conf files: %m"); + + STRV_FOREACH(f, files) { + _cleanup_(transfer_freep) Transfer *t = NULL; + + if (!GREEDY_REALLOC(c->transfers, c->n_transfers + 1)) + return log_oom(); + + t = transfer_new(); + if (!t) + return log_oom(); + + 
t->definition_path = strdup(*f); + if (!t->definition_path) + return log_oom(); + + r = transfer_read_definition(t, *f); + if (r < 0) + return r; + + c->transfers[c->n_transfers++] = TAKE_PTR(t); + } + + if (c->n_transfers == 0) { + if (arg_component) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "No transfer definitions for component '%s' found.", arg_component); + + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "No transfer definitions found."); + } + + for (size_t i = 0; i < c->n_transfers; i++) { + r = transfer_resolve_paths(c->transfers[i], root, node); + if (r < 0) + return r; + } + + return 0; +} + +static int context_load_installed_instances(Context *c) { + int r; + + assert(c); + + log_info("Discovering installed instances%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + for (size_t i = 0; i < c->n_transfers; i++) { + r = resource_load_instances( + &c->transfers[i]->target, + arg_verify >= 0 ? arg_verify : c->transfers[i]->verify, + &c->web_cache); + if (r < 0) + return r; + } + + return 0; +} + +static int context_load_available_instances(Context *c) { + int r; + + assert(c); + + log_info("Discovering available instances%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + for (size_t i = 0; i < c->n_transfers; i++) { + assert(c->transfers[i]); + + r = resource_load_instances( + &c->transfers[i]->source, + arg_verify >= 0 ? 
arg_verify : c->transfers[i]->verify, + &c->web_cache); + if (r < 0) + return r; + } + + return 0; +} + +static int context_discover_update_sets_by_flag(Context *c, UpdateSetFlags flags) { + _cleanup_free_ Instance **cursor_instances = NULL; + _cleanup_free_ char *boundary = NULL; + bool newest_found = false; + int r; + + assert(c); + assert(IN_SET(flags, UPDATE_AVAILABLE, UPDATE_INSTALLED)); + + for (;;) { + bool incomplete = false, exists = false; + UpdateSetFlags extra_flags = 0; + _cleanup_free_ char *cursor = NULL; + UpdateSet *us = NULL; + + for (size_t k = 0; k < c->n_transfers; k++) { + Transfer *t = c->transfers[k]; + bool cursor_found = false; + Resource *rr; + + assert(t); + + if (flags == UPDATE_AVAILABLE) + rr = &t->source; + else { + assert(flags == UPDATE_INSTALLED); + rr = &t->target; + } + + for (size_t j = 0; j < rr->n_instances; j++) { + Instance *i = rr->instances[j]; + + assert(i); + + /* Is the instance we are looking at equal or newer than the boundary? If so, we + * already checked this version, and it wasn't complete, let's ignore it. */ + if (boundary && strverscmp_improved(i->metadata.version, boundary) >= 0) + continue; + + if (cursor) { + if (strverscmp_improved(i->metadata.version, cursor) != 0) + continue; + } else { + cursor = strdup(i->metadata.version); + if (!cursor) + return log_oom(); + } + + cursor_found = true; + + if (!cursor_instances) { + cursor_instances = new(Instance*, c->n_transfers); + if (!cursor_instances) + return -ENOMEM; + } + cursor_instances[k] = i; + break; + } + + if (!cursor) /* No suitable instance beyond the boundary found? Then we are done! */ + break; + + if (!cursor_found) { + /* Hmm, we didn't find the version indicated by 'cursor' among the instances + * of this transfer, let's skip it. 
*/ + incomplete = true; + break; + } + + if (t->min_version && strverscmp_improved(t->min_version, cursor) > 0) + extra_flags |= UPDATE_OBSOLETE; + + if (strv_contains(t->protected_versions, cursor)) + extra_flags |= UPDATE_PROTECTED; + } + + if (!cursor) /* EOL */ + break; + + r = free_and_strdup_warn(&boundary, cursor); + if (r < 0) + return r; + + if (incomplete) /* One transfer was missing this version, ignore the whole thing */ + continue; + + /* See if we already have this update set in our table */ + for (size_t i = 0; i < c->n_update_sets; i++) { + if (strverscmp_improved(c->update_sets[i]->version, cursor) != 0) + continue; + + /* We only store the instances we found first, but we remember we also found it again */ + c->update_sets[i]->flags |= flags | extra_flags; + exists = true; + newest_found = true; + break; + } + + if (exists) + continue; + + /* Doesn't exist yet, let's add it */ + if (!GREEDY_REALLOC(c->update_sets, c->n_update_sets + 1)) + return log_oom(); + + us = new(UpdateSet, 1); + if (!us) + return log_oom(); + + *us = (UpdateSet) { + .flags = flags | (newest_found ? 0 : UPDATE_NEWEST) | extra_flags, + .version = TAKE_PTR(cursor), + .instances = TAKE_PTR(cursor_instances), + .n_instances = c->n_transfers, + }; + + c->update_sets[c->n_update_sets++] = us; + + newest_found = true; + + /* Remember which one is the newest installed */ + if ((us->flags & (UPDATE_NEWEST|UPDATE_INSTALLED)) == (UPDATE_NEWEST|UPDATE_INSTALLED)) + c->newest_installed = us; + + /* Remember which is the newest non-obsolete, available (and not installed) version, which we declare the "candidate" */ + if ((us->flags & (UPDATE_NEWEST|UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_OBSOLETE)) == (UPDATE_NEWEST|UPDATE_AVAILABLE)) + c->candidate = us; + } + + /* Newest installed is newer than or equal to candidate? 
Then suppress the candidate */ + if (c->newest_installed && c->candidate && strverscmp_improved(c->newest_installed->version, c->candidate->version) >= 0) + c->candidate = NULL; + + return 0; +} + +static int context_discover_update_sets(Context *c) { + int r; + + assert(c); + + log_info("Determining installed update sets%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = context_discover_update_sets_by_flag(c, UPDATE_INSTALLED); + if (r < 0) + return r; + + log_info("Determining available update sets%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = context_discover_update_sets_by_flag(c, UPDATE_AVAILABLE); + if (r < 0) + return r; + + typesafe_qsort(c->update_sets, c->n_update_sets, update_set_cmp); + return 0; +} + +static const char *update_set_flags_to_string(UpdateSetFlags flags) { + + switch ((unsigned) flags) { + + case 0: + return "n/a"; + + case UPDATE_INSTALLED|UPDATE_NEWEST: + case UPDATE_INSTALLED|UPDATE_NEWEST|UPDATE_PROTECTED: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_NEWEST: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_NEWEST|UPDATE_PROTECTED: + return "current"; + + case UPDATE_AVAILABLE|UPDATE_NEWEST: + case UPDATE_AVAILABLE|UPDATE_NEWEST|UPDATE_PROTECTED: + return "candidate"; + + case UPDATE_INSTALLED: + case UPDATE_INSTALLED|UPDATE_AVAILABLE: + return "installed"; + + case UPDATE_INSTALLED|UPDATE_PROTECTED: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_PROTECTED: + return "protected"; + + case UPDATE_AVAILABLE: + case UPDATE_AVAILABLE|UPDATE_PROTECTED: + return "available"; + + case UPDATE_INSTALLED|UPDATE_OBSOLETE|UPDATE_NEWEST: + case UPDATE_INSTALLED|UPDATE_OBSOLETE|UPDATE_NEWEST|UPDATE_PROTECTED: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_OBSOLETE|UPDATE_NEWEST: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_OBSOLETE|UPDATE_NEWEST|UPDATE_PROTECTED: + return "current+obsolete"; + + case UPDATE_INSTALLED|UPDATE_OBSOLETE: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_OBSOLETE: + return "installed+obsolete"; + + case 
UPDATE_INSTALLED|UPDATE_OBSOLETE|UPDATE_PROTECTED: + case UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_OBSOLETE|UPDATE_PROTECTED: + return "protected+obsolete"; + + case UPDATE_AVAILABLE|UPDATE_OBSOLETE: + case UPDATE_AVAILABLE|UPDATE_OBSOLETE|UPDATE_PROTECTED: + case UPDATE_AVAILABLE|UPDATE_OBSOLETE|UPDATE_NEWEST: + case UPDATE_AVAILABLE|UPDATE_OBSOLETE|UPDATE_NEWEST|UPDATE_PROTECTED: + return "available+obsolete"; + + default: + assert_not_reached(); + } +} + + +static int context_show_table(Context *c) { + _cleanup_(table_unrefp) Table *t = NULL; + int r; + + assert(c); + + t = table_new("", "version", "installed", "available", "assessment"); + if (!t) + return log_oom(); + + (void) table_set_align_percent(t, table_get_cell(t, 0, 0), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 2), 50); + (void) table_set_align_percent(t, table_get_cell(t, 0, 3), 50); + + for (size_t i = 0; i < c->n_update_sets; i++) { + UpdateSet *us = c->update_sets[i]; + const char *color; + + color = update_set_flags_to_color(us->flags); + + r = table_add_many(t, + TABLE_STRING, update_set_flags_to_glyph(us->flags), + TABLE_SET_COLOR, color, + TABLE_STRING, us->version, + TABLE_SET_COLOR, color, + TABLE_STRING, special_glyph_check_mark_space(FLAGS_SET(us->flags, UPDATE_INSTALLED)), + TABLE_SET_COLOR, color, + TABLE_STRING, special_glyph_check_mark_space(FLAGS_SET(us->flags, UPDATE_AVAILABLE)), + TABLE_SET_COLOR, color, + TABLE_STRING, update_set_flags_to_string(us->flags), + TABLE_SET_COLOR, color); + if (r < 0) + return table_log_add_error(r); + } + + return table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend); +} + +static UpdateSet *context_update_set_by_version(Context *c, const char *version) { + assert(c); + assert(version); + + for (size_t i = 0; i < c->n_update_sets; i++) + if (streq(c->update_sets[i]->version, version)) + return c->update_sets[i]; + + return NULL; +} + +static int context_show_version(Context *c, const char *version) { + bool 
show_fs_columns = false, show_partition_columns = false, + have_fs_attributes = false, have_partition_attributes = false, + have_size = false, have_tries = false, have_no_auto = false, + have_read_only = false, have_growfs = false, have_sha256 = false; + _cleanup_(table_unrefp) Table *t = NULL; + UpdateSet *us; + int r; + + assert(c); + assert(version); + + us = context_update_set_by_version(c, version); + if (!us) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Update '%s' not found.", version); + + if (arg_json_format_flags & (JSON_FORMAT_OFF|JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO)) + (void) pager_open(arg_pager_flags); + + if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) + printf("%s%s%s Version: %s\n" + " State: %s%s%s\n" + "Installed: %s%s\n" + "Available: %s%s\n" + "Protected: %s%s%s\n" + " Obsolete: %s%s%s\n\n", + strempty(update_set_flags_to_color(us->flags)), update_set_flags_to_glyph(us->flags), ansi_normal(), us->version, + strempty(update_set_flags_to_color(us->flags)), update_set_flags_to_string(us->flags), ansi_normal(), + yes_no(us->flags & UPDATE_INSTALLED), FLAGS_SET(us->flags, UPDATE_INSTALLED|UPDATE_NEWEST) ? " (newest)" : "", + yes_no(us->flags & UPDATE_AVAILABLE), (us->flags & (UPDATE_INSTALLED|UPDATE_AVAILABLE|UPDATE_NEWEST)) == (UPDATE_AVAILABLE|UPDATE_NEWEST) ? " (newest)" : "", + FLAGS_SET(us->flags, UPDATE_INSTALLED|UPDATE_PROTECTED) ? ansi_highlight() : "", yes_no(FLAGS_SET(us->flags, UPDATE_INSTALLED|UPDATE_PROTECTED)), ansi_normal(), + us->flags & UPDATE_OBSOLETE ? 
ansi_highlight_red() : "", yes_no(us->flags & UPDATE_OBSOLETE), ansi_normal()); + + + t = table_new("type", "path", "ptuuid", "ptflags", "mtime", "mode", "size", "tries-done", "tries-left", "noauto", "ro", "growfs", "sha256"); + if (!t) + return log_oom(); + + (void) table_set_align_percent(t, table_get_cell(t, 0, 3), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 4), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 5), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 6), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 7), 100); + (void) table_set_align_percent(t, table_get_cell(t, 0, 8), 100); + table_set_ersatz_string(t, TABLE_ERSATZ_DASH); + + /* Determine if the target will make use of partition/fs attributes for any of the transfers */ + for (size_t n = 0; n < c->n_transfers; n++) { + Transfer *tr = c->transfers[n]; + + if (tr->target.type == RESOURCE_PARTITION) + show_partition_columns = true; + if (RESOURCE_IS_FILESYSTEM(tr->target.type)) + show_fs_columns = true; + } + + for (size_t n = 0; n < us->n_instances; n++) { + Instance *i = us->instances[n]; + + r = table_add_many(t, + TABLE_STRING, resource_type_to_string(i->resource->type), + TABLE_PATH, i->path); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.partition_uuid_set) { + have_partition_attributes = true; + r = table_add_cell(t, NULL, TABLE_UUID, &i->metadata.partition_uuid); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.partition_flags_set) { + have_partition_attributes = true; + r = table_add_cell(t, NULL, TABLE_UINT64_HEX, &i->metadata.partition_flags); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.mtime != USEC_INFINITY) { + have_fs_attributes = true; + r = table_add_cell(t, NULL, TABLE_TIMESTAMP, &i->metadata.mtime); + } else + r = table_add_cell(t, NULL, 
TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.mode != MODE_INVALID) { + have_fs_attributes = true; + r = table_add_cell(t, NULL, TABLE_MODE, &i->metadata.mode); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.size != UINT64_MAX) { + have_size = true; + r = table_add_cell(t, NULL, TABLE_SIZE, &i->metadata.size); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.tries_done != UINT64_MAX) { + have_tries = true; + r = table_add_cell(t, NULL, TABLE_UINT64, &i->metadata.tries_done); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.tries_left != UINT64_MAX) { + have_tries = true; + r = table_add_cell(t, NULL, TABLE_UINT64, &i->metadata.tries_left); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.no_auto >= 0) { + bool b; + + have_no_auto = true; + b = i->metadata.no_auto; + r = table_add_cell(t, NULL, TABLE_BOOLEAN, &b); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + if (i->metadata.read_only >= 0) { + bool b; + + have_read_only = true; + b = i->metadata.read_only; + r = table_add_cell(t, NULL, TABLE_BOOLEAN, &b); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.growfs >= 0) { + bool b; + + have_growfs = true; + b = i->metadata.growfs; + r = table_add_cell(t, NULL, TABLE_BOOLEAN, &b); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + + if (i->metadata.sha256sum_set) { + _cleanup_free_ char *formatted = NULL; + + have_sha256 = true; + + formatted = hexmem(i->metadata.sha256sum, sizeof(i->metadata.sha256sum)); + if (!formatted) + return 
log_oom(); + + r = table_add_cell(t, NULL, TABLE_STRING, formatted); + } else + r = table_add_cell(t, NULL, TABLE_EMPTY, NULL); + if (r < 0) + return table_log_add_error(r); + } + + /* Hide the fs/partition columns if we don't have any data to show there */ + if (!have_fs_attributes) + show_fs_columns = false; + if (!have_partition_attributes) + show_partition_columns = false; + + if (!show_partition_columns) + (void) table_hide_column_from_display(t, 2, 3); + if (!show_fs_columns) + (void) table_hide_column_from_display(t, 4, 5); + if (!have_size) + (void) table_hide_column_from_display(t, 6); + if (!have_tries) + (void) table_hide_column_from_display(t, 7, 8); + if (!have_no_auto) + (void) table_hide_column_from_display(t, 9); + if (!have_read_only) + (void) table_hide_column_from_display(t, 10); + if (!have_growfs) + (void) table_hide_column_from_display(t, 11); + if (!have_sha256) + (void) table_hide_column_from_display(t, 12); + + return table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend); +} + +static int context_vacuum( + Context *c, + uint64_t space, + const char *extra_protected_version) { + + int r, count = 0; + + assert(c); + + if (space == 0) + log_info("Making room%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + else + log_info("Making room for %" PRIu64 " updates%s", space,special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + for (size_t i = 0; i < c->n_transfers; i++) { + r = transfer_vacuum(c->transfers[i], space, extra_protected_version); + if (r < 0) + return r; + + count = MAX(count, r); + } + + if (count > 0) + log_info("Removed %i instances.", count); + else + log_info("Removed no instances."); + + return 0; +} + +static int context_make_offline(Context **ret, const char *node) { + _cleanup_(context_freep) Context* context = NULL; + int r; + + assert(ret); + + /* Allocates a context object and initializes everything we can initialize offline, i.e. without + * checking on the update source (i.e. 
the Internet) what versions are available */ + + context = context_new(); + if (!context) + return log_oom(); + + r = context_read_definitions(context, arg_definitions, arg_component, arg_root, node); + if (r < 0) + return r; + + r = context_load_installed_instances(context); + if (r < 0) + return r; + + *ret = TAKE_PTR(context); + return 0; +} + +static int context_make_online(Context **ret, const char *node) { + _cleanup_(context_freep) Context* context = NULL; + int r; + + assert(ret); + + /* Like context_make_offline(), but also communicates with the update source looking for new + * versions. */ + + r = context_make_offline(&context, node); + if (r < 0) + return r; + + r = context_load_available_instances(context); + if (r < 0) + return r; + + r = context_discover_update_sets(context); + if (r < 0) + return r; + + *ret = TAKE_PTR(context); + return 0; +} + +static int context_apply( + Context *c, + const char *version, + UpdateSet **ret_applied) { + + UpdateSet *us = NULL; + int r; + + assert(c); + + if (version) { + us = context_update_set_by_version(c, version); + if (!us) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Update '%s' not found.", version); + } else { + if (!c->candidate) { + log_info("No update needed."); + + if (ret_applied) + *ret_applied = NULL; + + return 0; + } + + us = c->candidate; + } + + if (FLAGS_SET(us->flags, UPDATE_INSTALLED)) { + log_info("Selected update '%s' is already installed. 
Skipping update.", us->version); + + if (ret_applied) + *ret_applied = NULL; + + return 0; + } + if (!FLAGS_SET(us->flags, UPDATE_AVAILABLE)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected update '%s' is not available, refusing.", us->version); + if (FLAGS_SET(us->flags, UPDATE_OBSOLETE)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected update '%s' is obsolete, refusing.", us->version); + + assert((us->flags & (UPDATE_AVAILABLE|UPDATE_INSTALLED|UPDATE_OBSOLETE)) == UPDATE_AVAILABLE); + + if (!FLAGS_SET(us->flags, UPDATE_NEWEST)) + log_notice("Selected update '%s' is not the newest, proceeding anyway.", us->version); + if (c->newest_installed && strverscmp_improved(c->newest_installed->version, us->version) > 0) + log_notice("Selected update '%s' is older than newest installed version, proceeding anyway.", us->version); + + log_info("Selected update '%s' for install.", us->version); + + (void) sd_notifyf(false, + "STATUS=Making room for '%s'.", us->version); + + /* Let's make some room. We make sure for each transfer we have one free space to fill. While + * removing stuff we'll protect the version we are trying to acquire. Why that? 
Maybe an earlier + * download succeeded already, in which case we shouldn't remove it just to acquire it again */ + r = context_vacuum( + c, + /* space = */ 1, + /* extra_protected_version = */ us->version); + if (r < 0) + return r; + + if (arg_sync) + sync(); + + (void) sd_notifyf(false, + "STATUS=Updating to '%s'.\n", us->version); + + /* There should now be one instance picked for each transfer, and the order is the same */ + assert(us->n_instances == c->n_transfers); + + for (size_t i = 0; i < c->n_transfers; i++) { + r = transfer_acquire_instance(c->transfers[i], us->instances[i]); + if (r < 0) + return r; + } + + if (arg_sync) + sync(); + + for (size_t i = 0; i < c->n_transfers; i++) { + r = transfer_install_instance(c->transfers[i], us->instances[i], arg_root); + if (r < 0) + return r; + } + + log_info("%s Successfully installed update '%s'.", special_glyph(SPECIAL_GLYPH_SPARKLES), us->version); + + if (ret_applied) + *ret_applied = us; + + return 1; +} + +static int reboot_now(void) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL; + int r; + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to open bus connection: %m"); + + r = bus_call_method(bus, bus_login_mgr, "RebootWithFlags", &error, NULL, "t", + (uint64_t) SD_LOGIND_ROOT_CHECK_INHIBITORS); + if (r < 0) + return log_error_errno(r, "Failed to issue reboot request: %s", bus_error_message(&error, r)); + + return 0; +} + +static int process_image( + bool ro, + char **ret_mounted_dir, + LoopDevice **ret_loop_device) { + + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; + int r; + + assert(ret_mounted_dir); + assert(ret_loop_device); + + if (!arg_image) + return 0; + + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + (ro ? 
DISSECT_IMAGE_READ_ONLY : 0) | + DISSECT_IMAGE_FSCK | + DISSECT_IMAGE_MKDIR | + DISSECT_IMAGE_GROWFS | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + + *ret_mounted_dir = TAKE_PTR(mounted_dir); + *ret_loop_device = TAKE_PTR(loop_device); + + return 0; +} + +static int verb_list(int argc, char **argv, void *userdata) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL; + _cleanup_(context_freep) Context* context = NULL; + const char *version; + int r; + + assert(argc <= 2); + version = argc >= 2 ? argv[1] : NULL; + + r = process_image(/* ro= */ true, &mounted_dir, &loop_device); + if (r < 0) + return r; + + r = context_make_online(&context, loop_device ? loop_device->node : NULL); + if (r < 0) + return r; + + if (version) + return context_show_version(context, version); + else + return context_show_table(context); +} + +static int verb_check_new(int argc, char **argv, void *userdata) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL; + _cleanup_(context_freep) Context* context = NULL; + int r; + + assert(argc <= 1); + + r = process_image(/* ro= */ true, &mounted_dir, &loop_device); + if (r < 0) + return r; + + r = context_make_online(&context, loop_device ? 
loop_device->node : NULL); + if (r < 0) + return r; + + if (!context->candidate) { + log_debug("No candidate found."); + return EXIT_FAILURE; + } + + puts(context->candidate->version); + return EXIT_SUCCESS; +} + +static int verb_vacuum(int argc, char **argv, void *userdata) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL; + _cleanup_(context_freep) Context* context = NULL; + int r; + + assert(argc <= 1); + + r = process_image(/* ro= */ false, &mounted_dir, &loop_device); + if (r < 0) + return r; + + r = context_make_offline(&context, loop_device ? loop_device->node : NULL); + if (r < 0) + return r; + + return context_vacuum(context, 0, NULL); +} + +static int verb_update(int argc, char **argv, void *userdata) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL; + _cleanup_(context_freep) Context* context = NULL; + _cleanup_free_ char *booted_version = NULL; + UpdateSet *applied = NULL; + const char *version; + int r; + + assert(argc <= 2); + version = argc >= 2 ? argv[1] : NULL; + + if (arg_reboot) { + /* If automatic reboot on completion is requested, let's first determine the currently booted image */ + + r = parse_os_release(arg_root, "IMAGE_VERSION", &booted_version); + if (r < 0) + return log_error_errno(r, "Failed to parse /etc/os-release: %m"); + if (!booted_version) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), "/etc/os-release lacks IMAGE_VERSION field."); + } + + r = process_image(/* ro= */ false, &mounted_dir, &loop_device); + if (r < 0) + return r; + + r = context_make_online(&context, loop_device ? 
loop_device->node : NULL); + if (r < 0) + return r; + + r = context_apply(context, version, &applied); + if (r < 0) + return r; + + if (r > 0 && arg_reboot) { + assert(applied); + assert(booted_version); + + if (strverscmp_improved(applied->version, booted_version) > 0) { + log_notice("Newly installed version is newer than booted version, rebooting."); + return reboot_now(); + } + + log_info("Booted version is newer or identical to newly installed version, not rebooting."); + } + + return 0; +} + +static int verb_pending_or_reboot(int argc, char **argv, void *userdata) { + _cleanup_(context_freep) Context* context = NULL; + _cleanup_free_ char *booted_version = NULL; + int r; + + assert(argc == 1); + + if (arg_image || arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "The --root=/--image= switches may not be combined with the '%s' operation.", argv[0]); + + r = context_make_offline(&context, NULL); + if (r < 0) + return r; + + log_info("Determining installed update sets%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = context_discover_update_sets_by_flag(context, UPDATE_INSTALLED); + if (r < 0) + return r; + if (!context->newest_installed) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), "Couldn't find any suitable installed versions."); + + r = parse_os_release(arg_root, "IMAGE_VERSION", &booted_version); + if (r < 0) /* yes, arg_root is NULL here, but we have to pass something, and it's a lot more readable + * if we see what the first argument is about */ + return log_error_errno(r, "Failed to parse /etc/os-release: %m"); + if (!booted_version) + return log_error_errno(SYNTHETIC_ERRNO(ENODATA), "/etc/os-release lacks IMAGE_VERSION= field."); + + r = strverscmp_improved(context->newest_installed->version, booted_version); + if (r > 0) { + log_notice("Newest installed version '%s' is newer than booted version '%s'.%s", + context->newest_installed->version, booted_version, + streq(argv[0], "pending") ? " Reboot recommended." 
: ""); + + if (streq(argv[0], "reboot")) + return reboot_now(); + + return EXIT_SUCCESS; + } else if (r == 0) + log_info("Newest installed version '%s' matches booted version '%s'.", + context->newest_installed->version, booted_version); + else + log_warning("Newest installed version '%s' is older than booted version '%s'.", + context->newest_installed->version, booted_version); + + if (streq(argv[0], "pending")) /* When called as 'pending' tell the caller via failure exit code that there's nothing newer installed */ + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +static int component_name_valid(const char *c) { + _cleanup_free_ char *j = NULL; + + /* See if the specified string enclosed in the directory prefix+suffix would be a valid file name */ + + if (isempty(c)) + return false; + + if (string_has_cc(c, NULL)) + return false; + + if (!utf8_is_valid(c)) + return false; + + j = strjoin("sysupdate.", c, ".d"); + if (!j) + return -ENOMEM; + + return filename_is_valid(j); +} + +static int verb_components(int argc, char **argv, void *userdata) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted_dir = NULL; + _cleanup_set_free_ Set *names = NULL; + _cleanup_free_ char **z = NULL; /* We use simple free() rather than strv_free() here, since set_free() will free the strings for us */ + char **l = CONF_PATHS_STRV(""); + bool has_default_component = false; + int r; + + assert(argc <= 1); + + r = process_image(/* ro= */ false, &mounted_dir, &loop_device); + if (r < 0) + return r; + + STRV_FOREACH(i, l) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_free_ char *p = NULL; + + r = chase_and_opendir(*i, arg_root, CHASE_PREFIX_ROOT, &p, &d); + if (r == -ENOENT) + continue; + if (r < 0) + return log_error_errno(r, "Failed to open directory '%s': %m", *i); + + for (;;) { + _cleanup_free_ char *n = NULL; + struct dirent *de; + const char *e, *a; + + de = readdir_ensure_type(d); + if (!de) { + if (errno != 
0) + return log_error_errno(errno, "Failed to enumerate directory '%s': %m", p); + + break; + } + + if (de->d_type != DT_DIR) + continue; + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (streq(de->d_name, "sysupdate.d")) { + has_default_component = true; + continue; + } + + e = startswith(de->d_name, "sysupdate."); + if (!e) + continue; + + a = endswith(e, ".d"); + if (!a) + continue; + + n = strndup(e, a - e); + if (!n) + return log_oom(); + + r = component_name_valid(n); + if (r < 0) + return log_error_errno(r, "Unable to validate component name: %m"); + if (r == 0) + continue; + + r = set_ensure_consume(&names, &string_hash_ops_free, TAKE_PTR(n)); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Failed to add component to set: %m"); + } + } + + if (!has_default_component && set_isempty(names)) { + log_info("No components defined."); + return 0; + } + + z = set_get_strv(names); + if (!z) + return log_oom(); + + strv_sort(z); + + /* The plain "sysupdate.d" directory carries no component name of its own, hence list it under a fixed, highlighted label. */ + if (has_default_component) + printf("%s<default>%s\n", + ansi_highlight(), ansi_normal()); + + STRV_FOREACH(i, z) + puts(*i); + + return 0; +} + +static int verb_help(int argc, char **argv, void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-sysupdate", "8", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] 
[VERSION]\n" + "\n%5$sUpdate OS images.%6$s\n" + "\n%3$sCommands:%4$s\n" + " list [VERSION] Show installed and available versions\n" + " check-new Check if there's a new version available\n" + " update [VERSION] Install new version now\n" + " vacuum Make room, by deleting old versions\n" + " pending Report whether a newer version is installed than\n" + " currently booted\n" + " reboot Reboot if a newer version is installed than booted\n" + " components Show list of components\n" + " -h --help Show this help\n" + " --version Show package version\n" + "\n%3$sOptions:%4$s\n" + " -C --component=NAME Select component to update\n" + " --definitions=DIR Find transfer definitions in specified directory\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY\n" + " Specify disk image dissection policy\n" + " -m --instances-max=INT How many instances to maintain\n" + " --sync=BOOL Controls whether to sync data to disk\n" + " --verify=BOOL Force signature verification on or off\n" + " --reboot Reboot after updating to newer version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --json=pretty|short|off\n" + " Generate JSON output\n" + "\nSee the %2$s for details.\n" + , program_invocation_short_name + , link + , ansi_underline(), ansi_normal() + , ansi_highlight(), ansi_normal() + ); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_SYNC, + ARG_DEFINITIONS, + ARG_JSON, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_REBOOT, + ARG_VERIFY, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "definitions", required_argument, NULL, 
ARG_DEFINITIONS }, + { "instances-max", required_argument, NULL, 'm' }, + { "sync", required_argument, NULL, ARG_SYNC }, + { "json", required_argument, NULL, ARG_JSON }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "reboot", no_argument, NULL, ARG_REBOOT }, + { "component", required_argument, NULL, 'C' }, + { "verify", required_argument, NULL, ARG_VERIFY }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hm:C:", options, NULL)) >= 0) { + + switch (c) { + + case 'h': + return verb_help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case 'm': + r = safe_atou64(optarg, &arg_instances_max); + if (r < 0) + return log_error_errno(r, "Failed to parse --instances-max= parameter: %s", optarg); + + break; + + case ARG_SYNC: + r = parse_boolean_argument("--sync=", optarg, &arg_sync); + if (r < 0) + return r; + break; + + case ARG_DEFINITIONS: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_definitions); + if (r < 0) + return r; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + + break; + + case ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case ARG_REBOOT: + arg_reboot = true; + break; + + case 'C': + if (isempty(optarg)) { + arg_component = mfree(arg_component); + break; + } + + r = component_name_valid(optarg); + if (r < 0) + return log_error_errno(r, 
"Failed to determine if component name is valid: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Component name invalid: %s", optarg); + + r = free_and_strdup_warn(&arg_component, optarg); + if (r < 0) + return r; + + break; + + case ARG_VERIFY: { + bool b; + + r = parse_boolean_argument("--verify=", optarg, &b); + if (r < 0) + return r; + + arg_verify = b; + break; + } + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + } + + if (arg_image && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + + if ((arg_image || arg_root) && arg_reboot) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "The --reboot switch may not be combined with --root= or --image=."); + + if (arg_definitions && arg_component) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "The --definitions= and --component= switches may not be combined."); + + return 1; +} + +static int sysupdate_main(int argc, char *argv[]) { + + static const Verb verbs[] = { + { "list", VERB_ANY, 2, VERB_DEFAULT, verb_list }, + { "components", VERB_ANY, 1, 0, verb_components }, + { "check-new", VERB_ANY, 1, 0, verb_check_new }, + { "update", VERB_ANY, 2, 0, verb_update }, + { "vacuum", VERB_ANY, 1, 0, verb_vacuum }, + { "reboot", 1, 1, 0, verb_pending_or_reboot }, + { "pending", 1, 1, 0, verb_pending_or_reboot }, + { "help", VERB_ANY, 1, 0, verb_help }, + {} + }; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +static int run(int argc, char *argv[]) { + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return sysupdate_main(argc, argv); +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/sysupdate/sysupdate.h b/src/sysupdate/sysupdate.h new file mode 100644 index 0000000..6d387b7 --- /dev/null +++ b/src/sysupdate/sysupdate.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include 
+#include + +extern bool arg_sync; +extern uint64_t arg_instances_max; +extern char *arg_root; + +static inline const char* import_binary_path(void) { + return secure_getenv("SYSTEMD_IMPORT_PATH") ?: SYSTEMD_IMPORT_PATH; +} + +static inline const char* import_fs_binary_path(void) { + return secure_getenv("SYSTEMD_IMPORT_FS_PATH") ?: SYSTEMD_IMPORT_FS_PATH; +} + +static inline const char *pull_binary_path(void) { + return secure_getenv("SYSTEMD_PULL_PATH") ?: SYSTEMD_PULL_PATH; +} diff --git a/src/sysusers/meson.build b/src/sysusers/meson.build new file mode 100644 index 0000000..fcb291d --- /dev/null +++ b/src/sysusers/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-sysusers', + 'public' : true, + 'conditions' : ['ENABLE_SYSUSERS'], + 'sources' : files('sysusers.c'), + }, + executable_template + { + 'name' : 'systemd-sysusers.standalone', + 'public' : have_standalone_binaries, + 'conditions' : ['ENABLE_SYSUSERS'], + 'sources' : files('sysusers.c'), + 'c_args' : '-DSTANDALONE', + 'link_with' : [ + libbasic, + libbasic_gcrypt, + libshared_static, + libsystemd_static, + ], + 'build_by_default' : have_standalone_binaries, + 'install' : have_standalone_binaries, + }, +] diff --git a/src/sysusers/sysusers.c b/src/sysusers/sysusers.c new file mode 100644 index 0000000..514f3c7 --- /dev/null +++ b/src/sysusers/sysusers.c @@ -0,0 +1,2394 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "build.h" +#include "chase.h" +#include "conf-files.h" +#include "constants.h" +#include "copy.h" +#include "creds-util.h" +#include "dissect-image.h" +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "libcrypt-util.h" +#include "main-func.h" +#include "memory-util.h" +#include "mount-util.h" +#include "nscd-flush.h" +#include "pager.h" 
+#include "parse-argument.h" +#include "path-util.h" +#include "pretty-print.h" +#include "selinux-util.h" +#include "set.h" +#include "smack-util.h" +#include "specifier.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "sync-util.h" +#include "tmpfile-util-label.h" +#include "uid-alloc-range.h" +#include "uid-range.h" +#include "user-util.h" +#include "utf8.h" + +typedef enum ItemType { + ADD_USER = 'u', + ADD_GROUP = 'g', + ADD_MEMBER = 'm', + ADD_RANGE = 'r', +} ItemType; + +static const char* item_type_to_string(ItemType t) { + switch (t) { + case ADD_USER: + return "user"; + case ADD_GROUP: + return "group"; + case ADD_MEMBER: + return "member"; + case ADD_RANGE: + return "range"; + default: + assert_not_reached(); + } +} + +typedef struct Item { + ItemType type; + + char *name; + char *group_name; + char *uid_path; + char *gid_path; + char *description; + char *home; + char *shell; + + gid_t gid; + uid_t uid; + + char *filename; + unsigned line; + + bool gid_set; + + /* When set the group with the specified GID must exist + * and the check if a UID clashes with the GID is skipped. 
+ */ + bool id_set_strict; + + bool uid_set; + + bool todo_user; + bool todo_group; +} Item; + +static char *arg_root = NULL; +static char *arg_image = NULL; +static CatFlags arg_cat_flags = CAT_CONFIG_OFF; +static const char *arg_replace = NULL; +static bool arg_dry_run = false; +static bool arg_inline = false; +static PagerFlags arg_pager_flags = 0; +static ImagePolicy *arg_image_policy = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +typedef struct Context { + OrderedHashmap *users, *groups; + OrderedHashmap *todo_uids, *todo_gids; + OrderedHashmap *members; + + Hashmap *database_by_uid, *database_by_username; + Hashmap *database_by_gid, *database_by_groupname; + + /* A helper set to hold names that are used by database_by_{uid,gid,username,groupname} above. */ + Set *names; + + uid_t search_uid; + UidRange *uid_range; + + UGIDAllocationRange login_defs; + bool login_defs_need_warning; +} Context; + +static void context_done(Context *c) { + assert(c); + + ordered_hashmap_free(c->groups); + ordered_hashmap_free(c->users); + ordered_hashmap_free(c->members); + ordered_hashmap_free(c->todo_uids); + ordered_hashmap_free(c->todo_gids); + + hashmap_free(c->database_by_uid); + hashmap_free(c->database_by_username); + hashmap_free(c->database_by_gid); + hashmap_free(c->database_by_groupname); + + set_free_free(c->names); + uid_range_free(c->uid_range); +} + +static int errno_is_not_exists(int code) { + /* See getpwnam(3) and getgrnam(3): those codes and others can be returned if the user or group are + * not found. */ + return IN_SET(code, 0, ENOENT, ESRCH, EBADF, EPERM); +} + +/* Note: the lifetime of the compound literal is the immediately surrounding block, + * see C11 §6.5.2.5, and + * https://stackoverflow.com/questions/34880638/compound-literal-lifetime-and-if-blocks */ +#define FORMAT_UID(is_set, uid) \ + ((is_set) ? 
snprintf_ok((char[DECIMAL_STR_MAX(uid_t)]){}, DECIMAL_STR_MAX(uid_t), UID_FMT, uid) : "(unset)") +#define FORMAT_GID(is_set, gid) \ + ((is_set) ? snprintf_ok((char[DECIMAL_STR_MAX(gid_t)]){}, DECIMAL_STR_MAX(gid_t), GID_FMT, gid) : "(unset)") + +static void maybe_emit_login_defs_warning(Context *c) { + assert(c); + + if (!c->login_defs_need_warning) + return; + + if (c->login_defs.system_alloc_uid_min != SYSTEM_ALLOC_UID_MIN || + c->login_defs.system_uid_max != SYSTEM_UID_MAX) + log_warning("login.defs specifies UID allocation range "UID_FMT"–"UID_FMT + " that is different than the built-in defaults ("UID_FMT"–"UID_FMT")", + c->login_defs.system_alloc_uid_min, c->login_defs.system_uid_max, + (uid_t) SYSTEM_ALLOC_UID_MIN, (uid_t) SYSTEM_UID_MAX); + if (c->login_defs.system_alloc_gid_min != SYSTEM_ALLOC_GID_MIN || + c->login_defs.system_gid_max != SYSTEM_GID_MAX) + log_warning("login.defs specifies GID allocation range "GID_FMT"–"GID_FMT + " that is different than the built-in defaults ("GID_FMT"–"GID_FMT")", + c->login_defs.system_alloc_gid_min, c->login_defs.system_gid_max, + (gid_t) SYSTEM_ALLOC_GID_MIN, (gid_t) SYSTEM_GID_MAX); + + c->login_defs_need_warning = false; +} + +static int load_user_database(Context *c) { + _cleanup_fclose_ FILE *f = NULL; + const char *passwd_path; + struct passwd *pw; + int r; + + assert(c); + + passwd_path = prefix_roota(arg_root, "/etc/passwd"); + f = fopen(passwd_path, "re"); + if (!f) + return errno == ENOENT ? 0 : -errno; + + r = hashmap_ensure_allocated(&c->database_by_username, &string_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&c->database_by_uid, NULL); + if (r < 0) + return r; + + /* Note that we use NULL, i.e. trivial_hash_ops here, so identical strings can exist in the set. 
*/ + r = set_ensure_allocated(&c->names, NULL); + if (r < 0) + return r; + + while ((r = fgetpwent_sane(f, &pw)) > 0) { + + char *n = strdup(pw->pw_name); + if (!n) + return -ENOMEM; + + r = set_consume(c->names, n); + if (r < 0) + return r; + assert(r > 0); /* The set uses pointer comparisons, so n must not be in the set. */ + + r = hashmap_put(c->database_by_username, n, UID_TO_PTR(pw->pw_uid)); + if (r == -EEXIST) + log_debug_errno(r, "%s: user '%s' is listed twice, ignoring duplicate uid.", + passwd_path, n); + else if (r < 0) + return r; + + r = hashmap_put(c->database_by_uid, UID_TO_PTR(pw->pw_uid), n); + if (r == -EEXIST) + log_debug_errno(r, "%s: uid "UID_FMT" is listed twice, ignoring duplicate name.", + passwd_path, pw->pw_uid); + else if (r < 0) + return r; + } + return r; +} + +static int load_group_database(Context *c) { + _cleanup_fclose_ FILE *f = NULL; + const char *group_path; + struct group *gr; + int r; + + assert(c); + + group_path = prefix_roota(arg_root, "/etc/group"); + f = fopen(group_path, "re"); + if (!f) + return errno == ENOENT ? 0 : -errno; + + r = hashmap_ensure_allocated(&c->database_by_groupname, &string_hash_ops); + if (r < 0) + return r; + + r = hashmap_ensure_allocated(&c->database_by_gid, NULL); + if (r < 0) + return r; + + /* Note that we use NULL, i.e. trivial_hash_ops here, so identical strings can exist in the set. */ + r = set_ensure_allocated(&c->names, NULL); + if (r < 0) + return r; + + while ((r = fgetgrent_sane(f, &gr)) > 0) { + + char *n = strdup(gr->gr_name); + if (!n) + return -ENOMEM; + + r = set_consume(c->names, n); + if (r < 0) + return r; + assert(r > 0); /* The set uses pointer comparisons, so n must not be in the set. 
*/ + + r = hashmap_put(c->database_by_groupname, n, GID_TO_PTR(gr->gr_gid)); + if (r == -EEXIST) + log_debug_errno(r, "%s: group '%s' is listed twice, ignoring duplicate gid.", + group_path, n); + else if (r < 0) + return r; + + r = hashmap_put(c->database_by_gid, GID_TO_PTR(gr->gr_gid), n); + if (r == -EEXIST) + log_debug_errno(r, "%s: gid "GID_FMT" is listed twice, ignoring duplicate name.", + group_path, gr->gr_gid); + else if (r < 0) + return r; + } + return r; +} + +static int make_backup(const char *target, const char *x) { + _cleanup_(unlink_and_freep) char *dst_tmp = NULL; + _cleanup_fclose_ FILE *dst = NULL; + _cleanup_close_ int src = -EBADF; + const char *backup; + struct stat st; + int r; + + assert(target); + assert(x); + + src = open(x, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (src < 0) { + if (errno == ENOENT) /* No backup necessary... */ + return 0; + + return -errno; + } + + if (fstat(src, &st) < 0) + return -errno; + + r = fopen_temporary_label( + target, /* The path for which to the look up the label */ + x, /* Where we want the file actually to end up */ + &dst, /* The temporary file we write to */ + &dst_tmp); + if (r < 0) + return r; + + r = copy_bytes(src, fileno(dst), UINT64_MAX, COPY_REFLINK); + if (r < 0) + return r; + + backup = strjoina(x, "-"); + + /* Copy over the access mask. Don't fail on chmod() or chown(). If it stays owned by us and/or + * unreadable by others, then it isn't too bad... 
*/ + r = fchmod_and_chown_with_fallback(fileno(dst), dst_tmp, st.st_mode & 07777, st.st_uid, st.st_gid); + if (r < 0) + log_warning_errno(r, "Failed to change access mode or ownership of %s: %m", backup); + + if (futimens(fileno(dst), (const struct timespec[2]) { st.st_atim, st.st_mtim }) < 0) + log_warning_errno(errno, "Failed to fix access and modification time of %s: %m", backup); + + r = fsync_full(fileno(dst)); + if (r < 0) + return r; + + if (rename(dst_tmp, backup) < 0) + return errno; + + dst_tmp = mfree(dst_tmp); /* disable the unlink_and_freep() hook now that the file has been renamed */ + return 0; +} + +static int putgrent_with_members( + Context *c, + const struct group *gr, + FILE *group) { + + char **a; + + assert(c); + assert(gr); + assert(group); + + a = ordered_hashmap_get(c->members, gr->gr_name); + if (a) { + _cleanup_strv_free_ char **l = NULL; + bool added = false; + + l = strv_copy(gr->gr_mem); + if (!l) + return -ENOMEM; + + STRV_FOREACH(i, a) { + if (strv_contains(l, *i)) + continue; + + if (strv_extend(&l, *i) < 0) + return -ENOMEM; + + added = true; + } + + if (added) { + struct group t; + int r; + + strv_uniq(l); + strv_sort(l); + + t = *gr; + t.gr_mem = l; + + r = putgrent_sane(&t, group); + return r < 0 ? r : 1; + } + } + + return putgrent_sane(gr, group); +} + +#if ENABLE_GSHADOW +static int putsgent_with_members( + Context *c, + const struct sgrp *sg, + FILE *gshadow) { + + char **a; + + assert(sg); + assert(gshadow); + + a = ordered_hashmap_get(c->members, sg->sg_namp); + if (a) { + _cleanup_strv_free_ char **l = NULL; + bool added = false; + + l = strv_copy(sg->sg_mem); + if (!l) + return -ENOMEM; + + STRV_FOREACH(i, a) { + if (strv_contains(l, *i)) + continue; + + if (strv_extend(&l, *i) < 0) + return -ENOMEM; + + added = true; + } + + if (added) { + struct sgrp t; + int r; + + strv_uniq(l); + strv_sort(l); + + t = *sg; + t.sg_mem = l; + + r = putsgent_sane(&t, gshadow); + return r < 0 ? 
r : 1; + } + } + + return putsgent_sane(sg, gshadow); +} +#endif + +static const char* pick_shell(const Item *i) { + if (i->type != ADD_USER) + return NULL; + if (i->shell) + return i->shell; + if (i->uid_set && i->uid == 0) + return default_root_shell(arg_root); + return NOLOGIN; +} + +static int write_temporary_passwd( + Context *c, + const char *passwd_path, + FILE **ret_tmpfile, + char **ret_tmpfile_path) { + + _cleanup_fclose_ FILE *original = NULL, *passwd = NULL; + _cleanup_(unlink_and_freep) char *passwd_tmp = NULL; + struct passwd *pw = NULL; + Item *i; + int r; + + assert(c); + + if (ordered_hashmap_isempty(c->todo_uids)) + return 0; + + if (arg_dry_run) { + log_info("Would write /etc/passwd%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + return 0; + } + + r = fopen_temporary_label("/etc/passwd", passwd_path, &passwd, &passwd_tmp); + if (r < 0) + return log_debug_errno(r, "Failed to open temporary copy of %s: %m", passwd_path); + + original = fopen(passwd_path, "re"); + if (original) { + + /* Allow fallback path for when /proc is not mounted. On any normal system /proc will be + * mounted, but e.g. when 'dnf --installroot' is used, it might not be. There is no security + * relevance here, since the environment is ultimately trusted, and not requiring /proc makes + * it easier to depend on sysusers in packaging scripts and suchlike. 
*/ + r = copy_rights_with_fallback(fileno(original), fileno(passwd), passwd_tmp); + if (r < 0) + return log_debug_errno(r, "Failed to copy permissions from %s to %s: %m", + passwd_path, passwd_tmp); + + while ((r = fgetpwent_sane(original, &pw)) > 0) { + i = ordered_hashmap_get(c->users, pw->pw_name); + if (i && i->todo_user) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: User \"%s\" already exists.", + passwd_path, pw->pw_name); + + if (ordered_hashmap_contains(c->todo_uids, UID_TO_PTR(pw->pw_uid))) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: Detected collision for UID " UID_FMT ".", + passwd_path, pw->pw_uid); + + /* Make sure we keep the NIS entries (if any) at the end. */ + if (IN_SET(pw->pw_name[0], '+', '-')) + break; + + r = putpwent_sane(pw, passwd); + if (r < 0) + return log_debug_errno(r, "Failed to add existing user \"%s\" to temporary passwd file: %m", + pw->pw_name); + } + if (r < 0) + return log_debug_errno(r, "Failed to read %s: %m", passwd_path); + + } else { + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to open %s: %m", passwd_path); + if (fchmod(fileno(passwd), 0644) < 0) + return log_debug_errno(errno, "Failed to fchmod %s: %m", passwd_tmp); + } + + ORDERED_HASHMAP_FOREACH(i, c->todo_uids) { + _cleanup_free_ char *creds_shell = NULL, *cn = NULL; + + struct passwd n = { + .pw_name = i->name, + .pw_uid = i->uid, + .pw_gid = i->gid, + .pw_gecos = (char*) strempty(i->description), + + /* "x" means the password is stored in the shadow file */ + .pw_passwd = (char*) PASSWORD_SEE_SHADOW, + + /* We default to the root directory as home */ + .pw_dir = i->home ?: (char*) "/", + + /* Initialize the shell to nologin, with one exception: + * for root we patch in something special */ + .pw_shell = (char*) pick_shell(i), + }; + + /* Try to pick up the shell for this account via the credentials logic */ + cn = strjoin("passwd.shell.", i->name); + if (!cn) + return -ENOMEM; + + r = read_credential(cn, (void**) &creds_shell, 
NULL); + if (r < 0) + log_debug_errno(r, "Couldn't read credential '%s', ignoring: %m", cn); + else + n.pw_shell = creds_shell; + + r = putpwent_sane(&n, passwd); + if (r < 0) + return log_debug_errno(r, "Failed to add new user \"%s\" to temporary passwd file: %m", + i->name); + } + + /* Append the remaining NIS entries if any */ + while (pw) { + r = putpwent_sane(pw, passwd); + if (r < 0) + return log_debug_errno(r, "Failed to add existing user \"%s\" to temporary passwd file: %m", + pw->pw_name); + + r = fgetpwent_sane(original, &pw); + if (r < 0) + return log_debug_errno(r, "Failed to read %s: %m", passwd_path); + if (r == 0) + break; + } + + r = fflush_sync_and_check(passwd); + if (r < 0) + return log_debug_errno(r, "Failed to flush %s: %m", passwd_tmp); + + *ret_tmpfile = TAKE_PTR(passwd); + *ret_tmpfile_path = TAKE_PTR(passwd_tmp); + + return 0; +} + +static usec_t epoch_or_now(void) { + uint64_t epoch; + + if (getenv_uint64_secure("SOURCE_DATE_EPOCH", &epoch) >= 0) { + if (epoch > UINT64_MAX/USEC_PER_SEC) /* Overflow check */ + return USEC_INFINITY; + return (usec_t) epoch * USEC_PER_SEC; + } + + return now(CLOCK_REALTIME); +} + +static int write_temporary_shadow( + Context *c, + const char *shadow_path, + FILE **ret_tmpfile, + char **ret_tmpfile_path) { + + _cleanup_fclose_ FILE *original = NULL, *shadow = NULL; + _cleanup_(unlink_and_freep) char *shadow_tmp = NULL; + struct spwd *sp = NULL; + long lstchg; + Item *i; + int r; + + assert(c); + + if (ordered_hashmap_isempty(c->todo_uids)) + return 0; + + if (arg_dry_run) { + log_info("Would write /etc/shadow%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + return 0; + } + + r = fopen_temporary_label("/etc/shadow", shadow_path, &shadow, &shadow_tmp); + if (r < 0) + return log_debug_errno(r, "Failed to open temporary copy of %s: %m", shadow_path); + + lstchg = (long) (epoch_or_now() / USEC_PER_DAY); + + original = fopen(shadow_path, "re"); + if (original) { + + r = copy_rights_with_fallback(fileno(original), 
fileno(shadow), shadow_tmp); + if (r < 0) + return log_debug_errno(r, "Failed to copy permissions from %s to %s: %m", + shadow_path, shadow_tmp); + + while ((r = fgetspent_sane(original, &sp)) > 0) { + i = ordered_hashmap_get(c->users, sp->sp_namp); + if (i && i->todo_user) { + /* we will update the existing entry */ + sp->sp_lstchg = lstchg; + + /* only the /etc/shadow stage is left, so we can + * safely remove the item from the todo set */ + i->todo_user = false; + ordered_hashmap_remove(c->todo_uids, UID_TO_PTR(i->uid)); + } + + /* Make sure we keep the NIS entries (if any) at the end. */ + if (IN_SET(sp->sp_namp[0], '+', '-')) + break; + + r = putspent_sane(sp, shadow); + if (r < 0) + return log_debug_errno(r, "Failed to add existing user \"%s\" to temporary shadow file: %m", + sp->sp_namp); + + } + if (r < 0) + return log_debug_errno(r, "Failed to read %s: %m", shadow_path); + + } else { + if (errno != ENOENT) + return log_debug_errno(errno, "Failed to open %s: %m", shadow_path); + if (fchmod(fileno(shadow), 0000) < 0) + return log_debug_errno(errno, "Failed to fchmod %s: %m", shadow_tmp); + } + + ORDERED_HASHMAP_FOREACH(i, c->todo_uids) { + _cleanup_(erase_and_freep) char *creds_password = NULL; + bool is_hashed; + + struct spwd n = { + .sp_namp = i->name, + .sp_lstchg = lstchg, + .sp_min = -1, + .sp_max = -1, + .sp_warn = -1, + .sp_inact = -1, + .sp_expire = -1, + .sp_flag = ULONG_MAX, /* this appears to be what everybody does ... 
*/ + }; + + r = get_credential_user_password(i->name, &creds_password, &is_hashed); + if (r < 0) + log_debug_errno(r, "Couldn't read password credential for user '%s', ignoring: %m", i->name); + + if (creds_password && !is_hashed) { + _cleanup_(erase_and_freep) char* plaintext_password = TAKE_PTR(creds_password); + r = hash_password(plaintext_password, &creds_password); + if (r < 0) + return log_debug_errno(r, "Failed to hash password: %m"); + } + + if (creds_password) + n.sp_pwdp = creds_password; + else if (streq(i->name, "root")) + /* Let firstboot set the password later */ + n.sp_pwdp = (char*) PASSWORD_UNPROVISIONED; + else + n.sp_pwdp = (char*) PASSWORD_LOCKED_AND_INVALID; + + r = putspent_sane(&n, shadow); + if (r < 0) + return log_debug_errno(r, "Failed to add new user \"%s\" to temporary shadow file: %m", + i->name); + } + + /* Append the remaining NIS entries if any */ + while (sp) { + r = putspent_sane(sp, shadow); + if (r < 0) + return log_debug_errno(r, "Failed to add existing user \"%s\" to temporary shadow file: %m", + sp->sp_namp); + + r = fgetspent_sane(original, &sp); + if (r < 0) + return log_debug_errno(r, "Failed to read %s: %m", shadow_path); + if (r == 0) + break; + } + if (!IN_SET(errno, 0, ENOENT)) + return -errno; + + r = fflush_sync_and_check(shadow); + if (r < 0) + return log_debug_errno(r, "Failed to flush %s: %m", shadow_tmp); + + *ret_tmpfile = TAKE_PTR(shadow); + *ret_tmpfile_path = TAKE_PTR(shadow_tmp); + + return 0; +} + +static int write_temporary_group( + Context *c, + const char *group_path, + FILE **ret_tmpfile, + char **ret_tmpfile_path) { + + _cleanup_fclose_ FILE *original = NULL, *group = NULL; + _cleanup_(unlink_and_freep) char *group_tmp = NULL; + bool group_changed = false; + struct group *gr = NULL; + Item *i; + int r; + + assert(c); + + if (ordered_hashmap_isempty(c->todo_gids) && ordered_hashmap_isempty(c->members)) + return 0; + + if (arg_dry_run) { + log_info("Would write /etc/group%s", 
special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + return 0; + } + + r = fopen_temporary_label("/etc/group", group_path, &group, &group_tmp); + if (r < 0) + return log_error_errno(r, "Failed to open temporary copy of %s: %m", group_path); + + original = fopen(group_path, "re"); + if (original) { + + r = copy_rights_with_fallback(fileno(original), fileno(group), group_tmp); + if (r < 0) + return log_error_errno(r, "Failed to copy permissions from %s to %s: %m", + group_path, group_tmp); + + while ((r = fgetgrent_sane(original, &gr)) > 0) { + /* Safety checks against name and GID collisions. Normally, + * this should be unnecessary, but given that we look at the + * entries anyway here, let's make an extra verification + * step that we don't generate duplicate entries. */ + + i = ordered_hashmap_get(c->groups, gr->gr_name); + if (i && i->todo_group) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: Group \"%s\" already exists.", + group_path, gr->gr_name); + + if (ordered_hashmap_contains(c->todo_gids, GID_TO_PTR(gr->gr_gid))) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: Detected collision for GID " GID_FMT ".", + group_path, gr->gr_gid); + + /* Make sure we keep the NIS entries (if any) at the end. 
*/ + if (IN_SET(gr->gr_name[0], '+', '-')) + break; + + r = putgrent_with_members(c, gr, group); + if (r < 0) + return log_error_errno(r, "Failed to add existing group \"%s\" to temporary group file: %m", + gr->gr_name); + if (r > 0) + group_changed = true; + } + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", group_path); + + } else { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", group_path); + if (fchmod(fileno(group), 0644) < 0) + return log_error_errno(errno, "Failed to fchmod %s: %m", group_tmp); + } + + ORDERED_HASHMAP_FOREACH(i, c->todo_gids) { + struct group n = { + .gr_name = i->name, + .gr_gid = i->gid, + .gr_passwd = (char*) PASSWORD_SEE_SHADOW, + }; + + r = putgrent_with_members(c, &n, group); + if (r < 0) + return log_error_errno(r, "Failed to add new group \"%s\" to temporary group file: %m", + gr->gr_name); + + group_changed = true; + } + + /* Append the remaining NIS entries if any */ + while (gr) { + r = putgrent_sane(gr, group); + if (r < 0) + return log_error_errno(r, "Failed to add existing group \"%s\" to temporary group file: %m", + gr->gr_name); + + r = fgetgrent_sane(original, &gr); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", group_path); + if (r == 0) + break; + } + + r = fflush_sync_and_check(group); + if (r < 0) + return log_error_errno(r, "Failed to flush %s: %m", group_tmp); + + if (group_changed) { + *ret_tmpfile = TAKE_PTR(group); + *ret_tmpfile_path = TAKE_PTR(group_tmp); + } + return 0; +} + +static int write_temporary_gshadow( + Context *c, + const char * gshadow_path, + FILE **ret_tmpfile, + char **ret_tmpfile_path) { + +#if ENABLE_GSHADOW + _cleanup_fclose_ FILE *original = NULL, *gshadow = NULL; + _cleanup_(unlink_and_freep) char *gshadow_tmp = NULL; + bool group_changed = false; + Item *i; + int r; + + assert(c); + + if (ordered_hashmap_isempty(c->todo_gids) && ordered_hashmap_isempty(c->members)) + return 0; + + if (arg_dry_run) { + log_info("Would write 
/etc/gshadow%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + return 0; + } + + r = fopen_temporary_label("/etc/gshadow", gshadow_path, &gshadow, &gshadow_tmp); + if (r < 0) + return log_error_errno(r, "Failed to open temporary copy of %s: %m", gshadow_path); + + original = fopen(gshadow_path, "re"); + if (original) { + struct sgrp *sg; + + r = copy_rights_with_fallback(fileno(original), fileno(gshadow), gshadow_tmp); + if (r < 0) + return log_error_errno(r, "Failed to copy permissions from %s to %s: %m", + gshadow_path, gshadow_tmp); + + while ((r = fgetsgent_sane(original, &sg)) > 0) { + + i = ordered_hashmap_get(c->groups, sg->sg_namp); + if (i && i->todo_group) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), + "%s: Group \"%s\" already exists.", + gshadow_path, sg->sg_namp); + + r = putsgent_with_members(c, sg, gshadow); + if (r < 0) + return log_error_errno(r, "Failed to add existing group \"%s\" to temporary gshadow file: %m", + sg->sg_namp); + if (r > 0) + group_changed = true; + } + if (r < 0) + return r; + + } else { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", gshadow_path); + if (fchmod(fileno(gshadow), 0000) < 0) + return log_error_errno(errno, "Failed to fchmod %s: %m", gshadow_tmp); + } + + ORDERED_HASHMAP_FOREACH(i, c->todo_gids) { + struct sgrp n = { + .sg_namp = i->name, + .sg_passwd = (char*) PASSWORD_LOCKED_AND_INVALID, + }; + + r = putsgent_with_members(c, &n, gshadow); + if (r < 0) + return log_error_errno(r, "Failed to add new group \"%s\" to temporary gshadow file: %m", + n.sg_namp); + + group_changed = true; + } + + r = fflush_sync_and_check(gshadow); + if (r < 0) + return log_error_errno(r, "Failed to flush %s: %m", gshadow_tmp); + + if (group_changed) { + *ret_tmpfile = TAKE_PTR(gshadow); + *ret_tmpfile_path = TAKE_PTR(gshadow_tmp); + } +#endif + return 0; +} + +static int write_files(Context *c) { + _cleanup_fclose_ FILE *passwd = NULL, *group = NULL, *shadow = NULL, *gshadow = NULL; + 
_cleanup_(unlink_and_freep) char *passwd_tmp = NULL, *group_tmp = NULL, *shadow_tmp = NULL, *gshadow_tmp = NULL; + int r; + + const char + *passwd_path = prefix_roota(arg_root, "/etc/passwd"), + *shadow_path = prefix_roota(arg_root, "/etc/shadow"), + *group_path = prefix_roota(arg_root, "/etc/group"), + *gshadow_path = prefix_roota(arg_root, "/etc/gshadow"); + + assert(c); + + r = write_temporary_group(c, group_path, &group, &group_tmp); + if (r < 0) + return r; + + r = write_temporary_gshadow(c, gshadow_path, &gshadow, &gshadow_tmp); + if (r < 0) + return r; + + r = write_temporary_passwd(c, passwd_path, &passwd, &passwd_tmp); + if (r < 0) + return r; + + r = write_temporary_shadow(c, shadow_path, &shadow, &shadow_tmp); + if (r < 0) + return r; + + /* Make a backup of the old files */ + if (group) { + r = make_backup("/etc/group", group_path); + if (r < 0) + return log_error_errno(r, "Failed to backup %s: %m", group_path); + } + if (gshadow) { + r = make_backup("/etc/gshadow", gshadow_path); + if (r < 0) + return log_error_errno(r, "Failed to backup %s: %m", gshadow_path); + } + + if (passwd) { + r = make_backup("/etc/passwd", passwd_path); + if (r < 0) + return log_error_errno(r, "Failed to backup %s: %m", passwd_path); + } + if (shadow) { + r = make_backup("/etc/shadow", shadow_path); + if (r < 0) + return log_error_errno(r, "Failed to backup %s: %m", shadow_path); + } + + /* And make the new files count */ + if (group) { + r = rename_and_apply_smack_floor_label(group_tmp, group_path); + if (r < 0) + return log_error_errno(r, "Failed to rename %s to %s: %m", + group_tmp, group_path); + group_tmp = mfree(group_tmp); + + if (!arg_root && !arg_image) + (void) nscd_flush_cache(STRV_MAKE("group")); + } + if (gshadow) { + r = rename_and_apply_smack_floor_label(gshadow_tmp, gshadow_path); + if (r < 0) + return log_error_errno(r, "Failed to rename %s to %s: %m", + gshadow_tmp, gshadow_path); + + gshadow_tmp = mfree(gshadow_tmp); + } + + if (passwd) { + r = 
rename_and_apply_smack_floor_label(passwd_tmp, passwd_path); + if (r < 0) + return log_error_errno(r, "Failed to rename %s to %s: %m", + passwd_tmp, passwd_path); + + passwd_tmp = mfree(passwd_tmp); + + if (!arg_root && !arg_image) + (void) nscd_flush_cache(STRV_MAKE("passwd")); + } + if (shadow) { + r = rename_and_apply_smack_floor_label(shadow_tmp, shadow_path); + if (r < 0) + return log_error_errno(r, "Failed to rename %s to %s: %m", + shadow_tmp, shadow_path); + + shadow_tmp = mfree(shadow_tmp); + } + + return 0; +} + +static int uid_is_ok( + Context *c, + uid_t uid, + const char *name, + bool check_with_gid) { + + assert(c); + + /* Let's see if we already have assigned the UID a second time */ + if (ordered_hashmap_get(c->todo_uids, UID_TO_PTR(uid))) + return 0; + + /* Try to avoid using uids that are already used by a group + * that doesn't have the same name as our new user. */ + if (check_with_gid) { + Item *i; + + i = ordered_hashmap_get(c->todo_gids, GID_TO_PTR(uid)); + if (i && !streq(i->name, name)) + return 0; + } + + /* Let's check the files directly */ + if (hashmap_contains(c->database_by_uid, UID_TO_PTR(uid))) + return 0; + + if (check_with_gid) { + const char *n; + + n = hashmap_get(c->database_by_gid, GID_TO_PTR(uid)); + if (n && !streq(n, name)) + return 0; + } + + /* Let's also check via NSS, to avoid UID clashes over LDAP and such, just in case */ + if (!arg_root) { + struct passwd *p; + struct group *g; + + errno = 0; + p = getpwuid(uid); + if (p) + return 0; + if (!IN_SET(errno, 0, ENOENT)) + return -errno; + + if (check_with_gid) { + errno = 0; + g = getgrgid((gid_t) uid); + if (g) { + if (!streq(g->gr_name, name)) + return 0; + } else if (!IN_SET(errno, 0, ENOENT)) + return -errno; + } + } + + return 1; +} + +static int root_stat(const char *p, struct stat *st) { + const char *fix; + + fix = prefix_roota(arg_root, p); + return RET_NERRNO(stat(fix, st)); +} + +static int read_id_from_file(Item *i, uid_t *ret_uid, gid_t *ret_gid) { + struct 
stat st; + bool found_uid = false, found_gid = false; + uid_t uid = 0; + gid_t gid = 0; + + assert(i); + + /* First, try to get the GID directly */ + if (ret_gid && i->gid_path && root_stat(i->gid_path, &st) >= 0) { + gid = st.st_gid; + found_gid = true; + } + + /* Then, try to get the UID directly */ + if ((ret_uid || (ret_gid && !found_gid)) + && i->uid_path + && root_stat(i->uid_path, &st) >= 0) { + + uid = st.st_uid; + found_uid = true; + + /* If we need the gid, but had no success yet, also derive it from the UID path */ + if (ret_gid && !found_gid) { + gid = st.st_gid; + found_gid = true; + } + } + + /* If that didn't work yet, then let's reuse the GID as UID */ + if (ret_uid && !found_uid && i->gid_path) { + + if (found_gid) { + uid = (uid_t) gid; + found_uid = true; + } else if (root_stat(i->gid_path, &st) >= 0) { + uid = (uid_t) st.st_gid; + found_uid = true; + } + } + + if (ret_uid) { + if (!found_uid) + return 0; + + *ret_uid = uid; + } + + if (ret_gid) { + if (!found_gid) + return 0; + + *ret_gid = gid; + } + + return 1; +} + +static int add_user(Context *c, Item *i) { + void *z; + int r; + + assert(c); + assert(i); + + /* Check the database directly */ + z = hashmap_get(c->database_by_username, i->name); + if (z) { + log_debug("User %s already exists.", i->name); + i->uid = PTR_TO_UID(z); + i->uid_set = true; + return 0; + } + + if (!arg_root) { + struct passwd *p; + + /* Also check NSS */ + errno = 0; + p = getpwnam(i->name); + if (p) { + log_debug("User %s already exists.", i->name); + i->uid = p->pw_uid; + i->uid_set = true; + + r = free_and_strdup(&i->description, p->pw_gecos); + if (r < 0) + return log_oom(); + + return 0; + } + if (!errno_is_not_exists(errno)) + return log_error_errno(errno, "Failed to check if user %s already exists: %m", i->name); + } + + /* Try to use the suggested numeric UID */ + if (i->uid_set) { + r = uid_is_ok(c, i->uid, i->name, !i->id_set_strict); + if (r < 0) + return log_error_errno(r, "Failed to verify UID " UID_FMT 
": %m", i->uid); + if (r == 0) { + log_info("Suggested user ID " UID_FMT " for %s already used.", i->uid, i->name); + i->uid_set = false; + } + } + + /* If that didn't work, try to read it from the specified path */ + if (!i->uid_set) { + uid_t candidate; + + if (read_id_from_file(i, &candidate, NULL) > 0) { + + if (candidate <= 0 || !uid_range_contains(c->uid_range, candidate)) + log_debug("User ID " UID_FMT " of file not suitable for %s.", candidate, i->name); + else { + r = uid_is_ok(c, candidate, i->name, true); + if (r < 0) + return log_error_errno(r, "Failed to verify UID " UID_FMT ": %m", i->uid); + else if (r > 0) { + i->uid = candidate; + i->uid_set = true; + } else + log_debug("User ID " UID_FMT " of file for %s is already used.", candidate, i->name); + } + } + } + + /* Otherwise, try to reuse the group ID */ + if (!i->uid_set && i->gid_set) { + r = uid_is_ok(c, (uid_t) i->gid, i->name, true); + if (r < 0) + return log_error_errno(r, "Failed to verify UID " UID_FMT ": %m", i->uid); + if (r > 0) { + i->uid = (uid_t) i->gid; + i->uid_set = true; + } + } + + /* And if that didn't work either, let's try to find a free one */ + if (!i->uid_set) { + maybe_emit_login_defs_warning(c); + + for (;;) { + r = uid_range_next_lower(c->uid_range, &c->search_uid); + if (r < 0) + return log_error_errno(r, "No free user ID available for %s.", i->name); + + r = uid_is_ok(c, c->search_uid, i->name, true); + if (r < 0) + return log_error_errno(r, "Failed to verify UID " UID_FMT ": %m", i->uid); + else if (r > 0) + break; + } + + i->uid_set = true; + i->uid = c->search_uid; + } + + r = ordered_hashmap_ensure_put(&c->todo_uids, NULL, UID_TO_PTR(i->uid), i); + if (r == -EEXIST) + return log_error_errno(r, "Requested user %s with UID " UID_FMT " and gid" GID_FMT " to be created is duplicated " + "or conflicts with another user.", i->name, i->uid, i->gid); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to store user %s with UID " UID_FMT " 
and GID " GID_FMT " to be created: %m", + i->name, i->uid, i->gid); + + i->todo_user = true; + log_info("Creating user '%s' (%s) with UID " UID_FMT " and GID " GID_FMT ".", + i->name, strna(i->description), i->uid, i->gid); + + return 0; +} + +static int gid_is_ok( + Context *c, + gid_t gid, + const char *groupname, + bool check_with_uid) { + + struct group *g; + struct passwd *p; + Item *user; + char *username; + + assert(c); + assert(groupname); + + if (ordered_hashmap_get(c->todo_gids, GID_TO_PTR(gid))) + return 0; + + /* Avoid reusing gids that are already used by a different user */ + if (check_with_uid) { + user = ordered_hashmap_get(c->todo_uids, UID_TO_PTR(gid)); + if (user && !streq(user->name, groupname)) + return 0; + } + + if (hashmap_contains(c->database_by_gid, GID_TO_PTR(gid))) + return 0; + + if (check_with_uid) { + username = hashmap_get(c->database_by_uid, UID_TO_PTR(gid)); + if (username && !streq(username, groupname)) + return 0; + } + + if (!arg_root) { + errno = 0; + g = getgrgid(gid); + if (g) + return 0; + if (!IN_SET(errno, 0, ENOENT)) + return -errno; + + if (check_with_uid) { + errno = 0; + p = getpwuid((uid_t) gid); + if (p) + return 0; + if (!IN_SET(errno, 0, ENOENT)) + return -errno; + } + } + + return 1; +} + +static int get_gid_by_name( + Context *c, + const char *name, + gid_t *ret_gid) { + + void *z; + + assert(c); + assert(ret_gid); + + /* Check the database directly */ + z = hashmap_get(c->database_by_groupname, name); + if (z) { + *ret_gid = PTR_TO_GID(z); + return 0; + } + + /* Also check NSS */ + if (!arg_root) { + struct group *g; + + errno = 0; + g = getgrnam(name); + if (g) { + *ret_gid = g->gr_gid; + return 0; + } + if (!errno_is_not_exists(errno)) + return log_error_errno(errno, "Failed to check if group %s already exists: %m", name); + } + + return -ENOENT; +} + +static int add_group(Context *c, Item *i) { + int r; + + assert(c); + assert(i); + + r = get_gid_by_name(c, i->name, &i->gid); + if (r != -ENOENT) { + if (r < 
0) + return r; + log_debug("Group %s already exists.", i->name); + i->gid_set = true; + return 0; + } + + /* Try to use the suggested numeric GID */ + if (i->gid_set) { + r = gid_is_ok(c, i->gid, i->name, false); + if (r < 0) + return log_error_errno(r, "Failed to verify GID " GID_FMT ": %m", i->gid); + if (i->id_set_strict) { + /* If we require the GID to already exist we can return here: + * r > 0: means the GID does not exist -> fail + * r == 0: means the GID exists -> nothing more to do. + */ + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Failed to create %s: please create GID " GID_FMT, + i->name, i->gid); + if (r == 0) + return 0; + } + if (r == 0) { + log_info("Suggested group ID " GID_FMT " for %s already used.", i->gid, i->name); + i->gid_set = false; + } + } + + /* Try to reuse the numeric uid, if there's one */ + if (!i->gid_set && i->uid_set) { + r = gid_is_ok(c, (gid_t) i->uid, i->name, true); + if (r < 0) + return log_error_errno(r, "Failed to verify GID " GID_FMT ": %m", i->gid); + if (r > 0) { + i->gid = (gid_t) i->uid; + i->gid_set = true; + } + } + + /* If that didn't work, try to read it from the specified path */ + if (!i->gid_set) { + gid_t candidate; + + if (read_id_from_file(i, NULL, &candidate) > 0) { + + if (candidate <= 0 || !uid_range_contains(c->uid_range, candidate)) + log_debug("Group ID " GID_FMT " of file not suitable for %s.", candidate, i->name); + else { + r = gid_is_ok(c, candidate, i->name, true); + if (r < 0) + return log_error_errno(r, "Failed to verify GID " GID_FMT ": %m", i->gid); + else if (r > 0) { + i->gid = candidate; + i->gid_set = true; + } else + log_debug("Group ID " GID_FMT " of file for %s already used.", candidate, i->name); + } + } + } + + /* And if that didn't work either, let's try to find a free one */ + if (!i->gid_set) { + maybe_emit_login_defs_warning(c); + + for (;;) { + /* We look for new GIDs in the UID pool! 
*/ + r = uid_range_next_lower(c->uid_range, &c->search_uid); + if (r < 0) + return log_error_errno(r, "No free group ID available for %s.", i->name); + + r = gid_is_ok(c, c->search_uid, i->name, true); + if (r < 0) + return log_error_errno(r, "Failed to verify GID " GID_FMT ": %m", i->gid); + else if (r > 0) + break; + } + + i->gid_set = true; + i->gid = c->search_uid; + } + + r = ordered_hashmap_ensure_put(&c->todo_gids, NULL, GID_TO_PTR(i->gid), i); + if (r == -EEXIST) + return log_error_errno(r, "Requested group %s with GID "GID_FMT " to be created is duplicated or conflicts with another user.", i->name, i->gid); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to store group %s with GID " GID_FMT " to be created: %m", i->name, i->gid); + + i->todo_group = true; + log_info("Creating group '%s' with GID " GID_FMT ".", i->name, i->gid); + + return 0; +} + +static int process_item(Context *c, Item *i) { + int r; + + assert(c); + assert(i); + + switch (i->type) { + + case ADD_USER: { + Item *j = NULL; + + if (!i->gid_set) + j = ordered_hashmap_get(c->groups, i->group_name ?: i->name); + + if (j && j->todo_group) { + /* When a group with the target name is already in queue, + * use the information about the group and do not create + * duplicated group entry. */ + i->gid_set = j->gid_set; + i->gid = j->gid; + i->id_set_strict = true; + } else if (i->group_name) { + /* When a group name was given instead of a GID and it's + * not in queue, then it must already exist. 
*/ + r = get_gid_by_name(c, i->group_name, &i->gid); + if (r < 0) + return log_error_errno(r, "Group %s not found.", i->group_name); + i->gid_set = true; + i->id_set_strict = true; + } else { + r = add_group(c, i); + if (r < 0) + return r; + } + + return add_user(c, i); + } + + case ADD_GROUP: + return add_group(c, i); + + default: + assert_not_reached(); + } +} + +static Item* item_free(Item *i) { + if (!i) + return NULL; + + free(i->name); + free(i->group_name); + free(i->uid_path); + free(i->gid_path); + free(i->description); + free(i->home); + free(i->shell); + free(i->filename); + return mfree(i); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Item*, item_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(item_hash_ops, char, string_hash_func, string_compare_func, Item, item_free); + +static Item* item_new(ItemType type, const char *name, const char *filename, unsigned line) { + assert(name); + assert(!!filename == (line > 0)); + + _cleanup_(item_freep) Item *new = new(Item, 1); + if (!new) + return NULL; + + *new = (Item) { + .type = type, + .line = line, + }; + + if (free_and_strdup(&new->name, name) < 0 || + free_and_strdup(&new->filename, filename) < 0) + return NULL; + + return TAKE_PTR(new); +} + +static int add_implicit(Context *c) { + char *g, **l; + int r; + + assert(c); + + /* Implicitly create additional users and groups, if they were listed in "m" lines */ + ORDERED_HASHMAP_FOREACH_KEY(l, g, c->members) { + STRV_FOREACH(m, l) + if (!ordered_hashmap_get(c->users, *m)) { + _cleanup_(item_freep) Item *j = + item_new(ADD_USER, *m, /* filename= */ NULL, /* line= */ 0); + if (!j) + return log_oom(); + + r = ordered_hashmap_ensure_put(&c->users, &item_hash_ops, j->name, j); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to add implicit user '%s': %m", j->name); + + log_debug("Adding implicit user '%s' due to m line", j->name); + TAKE_PTR(j); + } + + if (!(ordered_hashmap_get(c->users, g) || + ordered_hashmap_get(c->groups, 
g))) { + _cleanup_(item_freep) Item *j = + item_new(ADD_GROUP, g, /* filename= */ NULL, /* line= */ 0); + if (!j) + return log_oom(); + + r = ordered_hashmap_ensure_put(&c->groups, &item_hash_ops, j->name, j); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to add implicit group '%s': %m", j->name); + + log_debug("Adding implicit group '%s' due to m line", j->name); + TAKE_PTR(j); + } + } + + return 0; +} + +static int item_equivalent(Item *a, Item *b) { + int r; + + assert(a); + assert(b); + + if (a->type != b->type) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because types differ"); + return false; + } + + if (!streq_ptr(a->name, b->name)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because names differ ('%s' vs. '%s')", + a->name, b->name); + return false; + } + + /* Paths were simplified previously, so we can use streq. */ + if (!streq_ptr(a->uid_path, b->uid_path)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because UID paths differ (%s vs. %s)", + a->uid_path ?: "(unset)", b->uid_path ?: "(unset)"); + return false; + } + + if (!streq_ptr(a->gid_path, b->gid_path)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because GID paths differ (%s vs. %s)", + a->gid_path ?: "(unset)", b->gid_path ?: "(unset)"); + return false; + } + + if (!streq_ptr(a->description, b->description)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because descriptions differ ('%s' vs. '%s')", + strempty(a->description), strempty(b->description)); + return false; + } + + if ((a->uid_set != b->uid_set) || + (a->uid_set && a->uid != b->uid)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because UIDs differ (%s vs. 
%s)", + FORMAT_UID(a->uid_set, a->uid), FORMAT_UID(b->uid_set, b->uid)); + return false; + } + + if ((a->gid_set != b->gid_set) || + (a->gid_set && a->gid != b->gid)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because GIDs differ (%s vs. %s)", + FORMAT_GID(a->gid_set, a->gid), FORMAT_GID(b->gid_set, b->gid)); + return false; + } + + if (!streq_ptr(a->home, b->home)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because home directories differ ('%s' vs. '%s')", + strempty(a->description), strempty(b->description)); + return false; + } + + /* Check if the two paths refer to the same file. + * If the paths are equal (after normalization), it's obviously the same file. + * If both paths specify a nologin shell, treat them as the same (e.g. /bin/true and /bin/false). + * Otherwise, try to resolve the paths, and see if we get the same result, (e.g. /sbin/nologin and + * /usr/sbin/nologin). + * If we can't resolve something, treat different paths as different. */ + + const char *a_shell = pick_shell(a), + *b_shell = pick_shell(b); + if (!path_equal_ptr(a_shell, b_shell) && + !(is_nologin_shell(a_shell) && is_nologin_shell(b_shell))) { + _cleanup_free_ char *pa = NULL, *pb = NULL; + + r = chase(a_shell, arg_root, CHASE_PREFIX_ROOT | CHASE_NONEXISTENT, &pa, NULL); + if (r < 0) { + log_full_errno(ERRNO_IS_RESOURCE(r) ? LOG_ERR : LOG_DEBUG, + r, "Failed to look up path '%s%s%s': %m", + strempty(arg_root), arg_root ? "/" : "", a_shell); + return ERRNO_IS_RESOURCE(r) ? r : false; + } + + r = chase(b_shell, arg_root, CHASE_PREFIX_ROOT | CHASE_NONEXISTENT, &pb, NULL); + if (r < 0) { + log_full_errno(ERRNO_IS_RESOURCE(r) ? LOG_ERR : LOG_DEBUG, + r, "Failed to look up path '%s%s%s': %m", + strempty(arg_root), arg_root ? "/" : "", b_shell); + return ERRNO_IS_RESOURCE(r) ? 
r : false; + } + + if (!path_equal(pa, pb)) { + log_syntax(NULL, LOG_DEBUG, a->filename, a->line, 0, + "Item not equivalent because shells differ ('%s' vs. '%s')", + pa, pb); + return false; + } + } + + return true; +} + +static int parse_line( + Context *c, + const char *fname, + unsigned line, + const char *buffer) { + + _cleanup_free_ char *action = NULL, + *name = NULL, *resolved_name = NULL, + *id = NULL, *resolved_id = NULL, + *description = NULL, *resolved_description = NULL, + *home = NULL, *resolved_home = NULL, + *shell = NULL, *resolved_shell = NULL; + _cleanup_(item_freep) Item *i = NULL; + Item *existing; + OrderedHashmap *h; + int r; + const char *p; + + assert(c); + assert(fname); + assert(line >= 1); + assert(buffer); + + /* Parse columns */ + p = buffer; + r = extract_many_words(&p, NULL, EXTRACT_UNQUOTE, + &action, &name, &id, &description, &home, &shell, NULL); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, "Syntax error."); + if (r < 2) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Missing action and name columns."); + if (!isempty(p)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Trailing garbage."); + + /* Verify action */ + if (strlen(action) != 1) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Unknown modifier '%s'.", action); + + if (!IN_SET(action[0], ADD_USER, ADD_GROUP, ADD_MEMBER, ADD_RANGE)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), + "Unknown command type '%c'.", action[0]); + + /* Verify name */ + if (empty_or_dash(name)) + name = mfree(name); + + if (name) { + r = specifier_printf(name, NAME_MAX, system_and_tmp_specifier_table, arg_root, NULL, &resolved_name); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to replace specifiers in '%s': %m", name); + + if (!valid_user_group_name(resolved_name, 0)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + 
"'%s' is not a valid user or group name.", resolved_name); + } + + /* Verify id */ + if (empty_or_dash(id)) + id = mfree(id); + + if (id) { + r = specifier_printf(id, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &resolved_id); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to replace specifiers in '%s': %m", name); + } + + /* Verify description */ + if (empty_or_dash(description)) + description = mfree(description); + + if (description) { + r = specifier_printf(description, LONG_LINE_MAX, system_and_tmp_specifier_table, arg_root, NULL, &resolved_description); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to replace specifiers in '%s': %m", description); + + if (!valid_gecos(resolved_description)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "'%s' is not a valid GECOS field.", resolved_description); + } + + /* Verify home */ + if (empty_or_dash(home)) + home = mfree(home); + + if (home) { + r = specifier_printf(home, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &resolved_home); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to replace specifiers in '%s': %m", home); + + path_simplify(resolved_home); + + if (!valid_home(resolved_home)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "'%s' is not a valid home directory field.", resolved_home); + } + + /* Verify shell */ + if (empty_or_dash(shell)) + shell = mfree(shell); + + if (shell) { + r = specifier_printf(shell, PATH_MAX-1, system_and_tmp_specifier_table, arg_root, NULL, &resolved_shell); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to replace specifiers in '%s': %m", shell); + + path_simplify(resolved_shell); + + if (!valid_shell(resolved_shell)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "'%s' is not a valid login shell field.", resolved_shell); + } + + switch (action[0]) { + + case 
ADD_RANGE: + if (resolved_name) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type 'r' don't take a name field."); + + if (!resolved_id) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type 'r' require an ID range in the third field."); + + if (description || home || shell) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type '%c' don't take a %s field.", + action[0], + description ? "GECOS" : home ? "home directory" : "login shell"); + + r = uid_range_add_str(&c->uid_range, resolved_id); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Invalid UID range %s.", resolved_id); + + return 0; + + case ADD_MEMBER: { + /* Try to extend an existing member or group item */ + if (!name) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type 'm' require a user name in the second field."); + + if (!resolved_id) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type 'm' require a group name in the third field."); + + if (!valid_user_group_name(resolved_id, 0)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "'%s' is not a valid user or group name.", resolved_id); + + if (description || home || shell) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type '%c' don't take a %s field.", + action[0], + description ? "GECOS" : home ? 
"home directory" : "login shell"); + + r = string_strv_ordered_hashmap_put(&c->members, resolved_id, resolved_name); + if (r < 0) + return log_error_errno(r, "Failed to store mapping for %s: %m", resolved_id); + + return 0; + } + + case ADD_USER: + if (!name) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type 'u' require a user name in the second field."); + + r = ordered_hashmap_ensure_allocated(&c->users, &item_hash_ops); + if (r < 0) + return log_oom(); + + i = item_new(ADD_USER, resolved_name, fname, line); + if (!i) + return log_oom(); + + if (resolved_id) { + if (path_is_absolute(resolved_id)) + i->uid_path = path_simplify(TAKE_PTR(resolved_id)); + else { + _cleanup_free_ char *uid = NULL, *gid = NULL; + if (split_pair(resolved_id, ":", &uid, &gid) == 0) { + r = parse_gid(gid, &i->gid); + if (r < 0) { + if (valid_user_group_name(gid, 0)) + i->group_name = TAKE_PTR(gid); + else + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to parse GID: '%s': %m", id); + } else { + i->gid_set = true; + i->id_set_strict = true; + } + free_and_replace(resolved_id, uid); + } + if (!streq(resolved_id, "-")) { + r = parse_uid(resolved_id, &i->uid); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to parse UID: '%s': %m", id); + i->uid_set = true; + } + } + } + + i->description = TAKE_PTR(resolved_description); + i->home = TAKE_PTR(resolved_home); + i->shell = TAKE_PTR(resolved_shell); + + h = c->users; + break; + + case ADD_GROUP: + if (!name) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type 'g' require a user name in the second field."); + + if (description || home || shell) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), + "Lines of type '%c' don't take a %s field.", + action[0], + description ? "GECOS" : home ? 
"home directory" : "login shell"); + + r = ordered_hashmap_ensure_allocated(&c->groups, &item_hash_ops); + if (r < 0) + return log_oom(); + + i = item_new(ADD_GROUP, resolved_name, fname, line); + if (!i) + return log_oom(); + + if (resolved_id) { + if (path_is_absolute(resolved_id)) + i->gid_path = path_simplify(TAKE_PTR(resolved_id)); + else { + r = parse_gid(resolved_id, &i->gid); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, + "Failed to parse GID: '%s': %m", id); + + i->gid_set = true; + } + } + + h = c->groups; + break; + + default: + assert_not_reached(); + } + + existing = ordered_hashmap_get(h, i->name); + if (existing) { + /* Two functionally-equivalent items are fine */ + r = item_equivalent(i, existing); + if (r < 0) + return r; + if (r == 0) { + if (existing->filename) + log_syntax(NULL, LOG_WARNING, fname, line, 0, + "Conflict with earlier configuration for %s '%s' in %s:%u, ignoring line.", + item_type_to_string(i->type), + i->name, + existing->filename, existing->line); + else + log_syntax(NULL, LOG_WARNING, fname, line, 0, + "Conflict with earlier configuration for %s '%s', ignoring line.", + item_type_to_string(i->type), + i->name); + } + + return 0; + } + + r = ordered_hashmap_put(h, i->name, i); + if (r < 0) + return log_oom(); + + i = NULL; + return 0; +} + +static int read_config_file(Context *c, const char *fn, bool ignore_enoent) { + _cleanup_fclose_ FILE *rf = NULL; + _cleanup_free_ char *pp = NULL; + FILE *f = NULL; + unsigned v = 0; + int r = 0; + + assert(c); + assert(fn); + + if (streq(fn, "-")) + f = stdin; + else { + r = search_and_fopen(fn, "re", arg_root, (const char**) CONF_PATHS_STRV("sysusers.d"), &rf, &pp); + if (r < 0) { + if (ignore_enoent && r == -ENOENT) + return 0; + + return log_error_errno(r, "Failed to open '%s', ignoring: %m", fn); + } + + f = rf; + fn = pp; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + int k; + + k = read_stripped_line(f, LONG_LINE_MAX, &line); + if (k < 0) + return 
log_error_errno(k, "Failed to read '%s': %m", fn); + if (k == 0) + break; + + v++; + + if (IN_SET(line[0], 0, '#')) + continue; + + k = parse_line(c, fn, v, line); + if (k < 0 && r == 0) + r = k; + } + + if (ferror(f)) { + log_error_errno(errno, "Failed to read from file %s: %m", fn); + if (r == 0) + r = -EIO; + } + + return r; +} + +static int cat_config(void) { + _cleanup_strv_free_ char **files = NULL; + int r; + + r = conf_files_list_with_replacement(arg_root, CONF_PATHS_STRV("sysusers.d"), arg_replace, &files, NULL); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + return cat_files(NULL, files, arg_cat_flags); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-sysusers.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [CONFIGURATION FILE...]\n\n" + "Creates system user accounts.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --cat-config Show configuration files\n" + " --tldr Show non-comment parts of configuration\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + " --replace=PATH Treat arguments as replacement for PATH\n" + " --dry-run Just print what would be done\n" + " --inline Treat arguments as configuration lines\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_CAT_CONFIG, + ARG_TLDR, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_REPLACE, + ARG_DRY_RUN, + ARG_INLINE, + ARG_NO_PAGER, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "cat-config", no_argument, NULL, ARG_CAT_CONFIG }, + { "tldr", 
no_argument, NULL, ARG_TLDR }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "replace", required_argument, NULL, ARG_REPLACE }, + { "dry-run", no_argument, NULL, ARG_DRY_RUN }, + { "inline", no_argument, NULL, ARG_INLINE }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_CAT_CONFIG: + arg_cat_flags = CAT_CONFIG_ON; + break; + + case ARG_TLDR: + arg_cat_flags = CAT_TLDR; + break; + + case ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: +#ifdef STANDALONE + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "This systemd-sysusers version is compiled without support for --image=."); +#else + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + break; +#endif + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case ARG_REPLACE: + if (!path_is_absolute(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "The argument to --replace= must be an absolute path."); + if (!endswith(optarg, ".conf")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "The argument to --replace= must have the extension '.conf'."); + + arg_replace = optarg; + break; + + case ARG_DRY_RUN: + arg_dry_run = true; + break; + + case ARG_INLINE: + arg_inline = true; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_replace && arg_cat_flags != CAT_CONFIG_OFF) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option 
--replace= is not supported with --cat-config/--tldr."); + + if (arg_replace && optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "When --replace= is given, some configuration items must be specified."); + + if (arg_image && arg_root) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Use either --root= or --image=, the combination of both is not supported."); + + return 1; +} + +static int parse_arguments(Context *c, char **args) { + unsigned pos = 1; + int r; + + assert(c); + + STRV_FOREACH(arg, args) { + if (arg_inline) + /* Use (argument):n, where n==1 for the first positional arg */ + r = parse_line(c, "(argument)", pos, *arg); + else + r = read_config_file(c, *arg, /* ignore_enoent= */ false); + if (r < 0) + return r; + + pos++; + } + + return 0; +} + +static int read_config_files(Context *c, char **args) { + _cleanup_strv_free_ char **files = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(c); + + r = conf_files_list_with_replacement(arg_root, CONF_PATHS_STRV("sysusers.d"), arg_replace, &files, &p); + if (r < 0) + return r; + + STRV_FOREACH(f, files) + if (p && path_equal(*f, p)) { + log_debug("Parsing arguments at position \"%s\"%s", *f, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = parse_arguments(c, args); + if (r < 0) + return r; + } else { + log_debug("Reading config file \"%s\"%s", *f, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + /* Just warn, ignore result otherwise */ + (void) read_config_file(c, *f, /* ignore_enoent= */ true); + } + + return 0; +} + +static int read_credential_lines(Context *c) { + _cleanup_free_ char *j = NULL; + const char *d; + int r; + + assert(c); + + r = get_credentials_dir(&d); + if (r == -ENXIO) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to get credentials directory: %m"); + + j = path_join(d, "sysusers.extra"); + if (!j) + return log_oom(); + + (void) read_config_file(c, j, /* ignore_enoent= */ true); + return 0; +} + +static int run(int argc, char *argv[]) { +#ifndef 
STANDALONE + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; +#endif + _cleanup_close_ int lock = -EBADF; + _cleanup_(context_done) Context c = { + .search_uid = UID_INVALID, + }; + + Item *i; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + log_setup(); + + if (arg_cat_flags != CAT_CONFIG_OFF) + return cat_config(); + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + +#ifndef STANDALONE + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_VALIDATE_OS | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_FSCK | + DISSECT_IMAGE_GROWFS, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } +#else + assert(!arg_image); +#endif + + /* If command line arguments are specified along with --replace, read all configuration files and + * insert the positional arguments at the specified place. Otherwise, if command line arguments are + * specified, execute just them, and finally, without --replace= or any positional arguments, just + * read configuration and execute it. */ + if (arg_replace || optind >= argc) + r = read_config_files(&c, argv + optind); + else + r = parse_arguments(&c, argv + optind); + if (r < 0) + return r; + + r = read_credential_lines(&c); + if (r < 0) + return r; + + /* Let's tell nss-systemd not to synthesize the "root" and "nobody" entries for it, so that our + * detection whether the names or UID/GID area already used otherwise doesn't get confused. After + * all, even though nss-systemd synthesizes these users/groups, they should still appear in + * /etc/passwd and /etc/group, as the synthesizing logic is merely supposed to be fallback for cases + * where we run with a completely unpopulated /etc. 
*/ + if (setenv("SYSTEMD_NSS_BYPASS_SYNTHETIC", "1", 1) < 0) + return log_error_errno(errno, "Failed to set SYSTEMD_NSS_BYPASS_SYNTHETIC environment variable: %m"); + + if (!c.uid_range) { + /* Default to default range of SYSTEMD_UID_MIN..SYSTEM_UID_MAX. */ + r = read_login_defs(&c.login_defs, NULL, arg_root); + if (r < 0) + return log_error_errno(r, "Failed to read %s%s: %m", + strempty(arg_root), "/etc/login.defs"); + + c.login_defs_need_warning = true; + + /* We pick a range that very conservative: we look at compiled-in maximum and the value in + * /etc/login.defs. That way the UIDs/GIDs which we allocate will be interpreted correctly, + * even if /etc/login.defs is removed later. (The bottom bound doesn't matter much, since + * it's only used during allocation, so we use the configured value directly). */ + uid_t begin = c.login_defs.system_alloc_uid_min, + end = MIN3((uid_t) SYSTEM_UID_MAX, c.login_defs.system_uid_max, c.login_defs.system_gid_max); + if (begin < end) { + r = uid_range_add(&c.uid_range, begin, end - begin + 1); + if (r < 0) + return log_oom(); + } + } + + r = add_implicit(&c); + if (r < 0) + return r; + + if (!arg_dry_run) { + lock = take_etc_passwd_lock(arg_root); + if (lock < 0) + return log_error_errno(lock, "Failed to take /etc/passwd lock: %m"); + } + + r = load_user_database(&c); + if (r < 0) + return log_error_errno(r, "Failed to load user database: %m"); + + r = load_group_database(&c); + if (r < 0) + return log_error_errno(r, "Failed to read group database: %m"); + + ORDERED_HASHMAP_FOREACH(i, c.groups) + (void) process_item(&c, i); + + ORDERED_HASHMAP_FOREACH(i, c.users) + (void) process_item(&c, i); + + return write_files(&c); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/sysv-generator/meson.build b/src/sysv-generator/meson.build new file mode 100644 index 0000000..4e89439 --- /dev/null +++ b/src/sysv-generator/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + generator_template + { 
+ 'name' : 'systemd-sysv-generator', + 'conditions' : ['HAVE_SYSV_COMPAT'], + 'sources' : files('sysv-generator.c'), + }, +] diff --git a/src/sysv-generator/sysv-generator.c b/src/sysv-generator/sysv-generator.c new file mode 100644 index 0000000..4485e2e --- /dev/null +++ b/src/sysv-generator/sysv-generator.c @@ -0,0 +1,935 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-messages.h" + +#include "alloc-util.h" +#include "dirent-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "generator.h" +#include "hashmap.h" +#include "hexdecoct.h" +#include "initrd-util.h" +#include "install.h" +#include "log.h" +#include "main-func.h" +#include "mkdir.h" +#include "path-lookup.h" +#include "path-util.h" +#include "set.h" +#include "special.h" +#include "specifier.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" + +/* 🚨 Note: this generator is deprecated! Please do not add new features! Instead, please port remaining SysV + * scripts over to native unit files! Thank you! 🚨 */ + +static const struct { + const char *path; + const char *target; +} rcnd_table[] = { + /* Standard SysV runlevels for start-up */ + { "rc1.d", SPECIAL_RESCUE_TARGET }, + { "rc2.d", SPECIAL_MULTI_USER_TARGET }, + { "rc3.d", SPECIAL_MULTI_USER_TARGET }, + { "rc4.d", SPECIAL_MULTI_USER_TARGET }, + { "rc5.d", SPECIAL_GRAPHICAL_TARGET }, + + /* We ignore the SysV runlevels for shutdown here, as SysV services get default dependencies anyway, and that + * means they are shut down anyway at system power off if running. 
*/ +}; + +static const char *arg_dest = NULL; + +typedef struct SysvStub { + char *name; + char *path; + char *description; + int sysv_start_priority; + char *pid_file; + char **before; + char **after; + char **wants; + char **wanted_by; + bool has_lsb; + bool reload; + bool loaded; +} SysvStub; + +static SysvStub* free_sysvstub(SysvStub *s) { + if (!s) + return NULL; + + free(s->name); + free(s->path); + free(s->description); + free(s->pid_file); + strv_free(s->before); + strv_free(s->after); + strv_free(s->wants); + strv_free(s->wanted_by); + return mfree(s); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(SysvStub*, free_sysvstub); + +static void free_sysvstub_hashmapp(Hashmap **h) { + hashmap_free_with_destructor(*h, free_sysvstub); +} + +static int add_alias(const char *service, const char *alias) { + _cleanup_free_ char *link = NULL; + + assert(service); + assert(alias); + + link = path_join(arg_dest, alias); + if (!link) + return -ENOMEM; + + if (symlink(service, link) < 0) { + if (errno == EEXIST) + return 0; + + return -errno; + } + + return 1; +} + +static int generate_unit_file(SysvStub *s) { + _cleanup_free_ char *path_escaped = NULL, *unit = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(s); + + if (!s->loaded) + return 0; + + path_escaped = specifier_escape(s->path); + if (!path_escaped) + return log_oom(); + + unit = path_join(arg_dest, s->name); + if (!unit) + return log_oom(); + + /* We might already have a symlink with the same name from a Provides:, + * or from backup files like /etc/init.d/foo.bak. 
/* Check whether a line advertises a "reload" verb.
 *
 * The line matches if it contains, compared case-insensitively, the word "reload" delimited on the
 * left by '{' or '|' and on the right by '|', '}' or '"'. */
static bool usage_contains_reload(const char *line) {
        static const char *const needles[] = {
                "{reload|",
                "{reload}",
                "{reload\"",
                "|reload|",
                "|reload}",
                "|reload\"",
        };

        for (size_t k = 0; k < sizeof(needles) / sizeof(needles[0]); k++)
                if (strcasestr(line, needles[k]))
                        return true;

        return false;
}
*/ + + /* Facilities starting with $ are most likely targets */ + if (*name == '$') { + r = unit_name_build(n, NULL, ".target", ret); + if (r < 0) + return log_error_errno(r, "[%s:%u] Could not build name for facility %s: %m", s->path, line, name); + + return 1; + } + + /* Strip ".sh" suffix from file name for comparison */ + e = endswith(filename, ".sh"); + if (e) + *e = '\0'; + + /* Names equaling the file name of the services are redundant */ + if (streq_ptr(n, filename)) { + *ret = NULL; + return 0; + } + + /* Everything else we assume to be normal service names */ + m = sysv_translate_name(n); + if (!m) + return log_oom(); + + *ret = m; + return 1; +} + +static int handle_provides(SysvStub *s, unsigned line, const char *full_text, const char *text) { + int r; + + assert(s); + assert(full_text); + assert(text); + + for (;;) { + _cleanup_free_ char *word = NULL, *m = NULL; + + r = extract_first_word(&text, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX); + if (r < 0) + return log_error_errno(r, "[%s:%u] Failed to parse word from provides string: %m", s->path, line); + if (r == 0) + break; + + r = sysv_translate_facility(s, line, word, &m); + if (r <= 0) /* continue on error */ + continue; + + switch (unit_name_to_type(m)) { + + case UNIT_SERVICE: + log_debug("Adding Provides: alias '%s' for '%s'", m, s->name); + r = add_alias(s->name, m); + if (r < 0) + log_warning_errno(r, "[%s:%u] Failed to add LSB Provides name %s, ignoring: %m", s->path, line, m); + break; + + case UNIT_TARGET: + + /* NB: SysV targets which are provided by a + * service are pulled in by the services, as + * an indication that the generic service is + * now available. This is strictly one-way. + * The targets do NOT pull in SysV services! 
*/ + + r = strv_extend(&s->before, m); + if (r < 0) + return log_oom(); + + r = strv_extend(&s->wants, m); + if (r < 0) + return log_oom(); + + if (streq(m, SPECIAL_NETWORK_ONLINE_TARGET)) { + r = strv_extend(&s->before, SPECIAL_NETWORK_TARGET); + if (r < 0) + return log_oom(); + r = strv_extend(&s->wants, SPECIAL_NETWORK_TARGET); + if (r < 0) + return log_oom(); + } + + break; + + case _UNIT_TYPE_INVALID: + log_warning("Unit name '%s' is invalid", m); + break; + + default: + log_warning("Unknown unit type for unit '%s'", m); + } + } + + return 0; +} + +static int handle_dependencies(SysvStub *s, unsigned line, const char *full_text, const char *text) { + int r; + + assert(s); + assert(full_text); + assert(text); + + for (;;) { + _cleanup_free_ char *word = NULL, *m = NULL; + bool is_before; + + r = extract_first_word(&text, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX); + if (r < 0) + return log_error_errno(r, "[%s:%u] Failed to parse word from provides string: %m", s->path, line); + if (r == 0) + break; + + r = sysv_translate_facility(s, line, word, &m); + if (r <= 0) /* continue on error */ + continue; + + is_before = startswith_no_case(full_text, "X-Start-Before:"); + + if (streq(m, SPECIAL_NETWORK_ONLINE_TARGET) && !is_before) { + /* the network-online target is special, as it needs to be actively pulled in */ + r = strv_extend(&s->after, m); + if (r < 0) + return log_oom(); + + r = strv_extend(&s->wants, m); + } else + r = strv_extend(is_before ? 
&s->before : &s->after, m); + if (r < 0) + return log_oom(); + } + + return 0; +} + +static int load_sysv(SysvStub *s) { + _cleanup_fclose_ FILE *f = NULL; + unsigned line = 0; + int r; + enum { + NORMAL, + DESCRIPTION, + LSB, + LSB_DESCRIPTION, + USAGE_CONTINUATION + } state = NORMAL; + _cleanup_free_ char *short_description = NULL, *long_description = NULL, *chkconfig_description = NULL; + char *description; + bool supports_reload = false; + + assert(s); + + f = fopen(s->path, "re"); + if (!f) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open %s: %m", s->path); + } + + log_debug("Loading SysV script %s", s->path); + + for (;;) { + _cleanup_free_ char *l = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &l); + if (r < 0) + return log_error_errno(r, "Failed to read configuration file '%s': %m", s->path); + if (r == 0) + break; + + line++; + + if (l[0] != '#') { + /* Try to figure out whether this init script supports + * the reload operation. This heuristic looks for + * "Usage" lines which include the reload option. 
*/ + if (state == USAGE_CONTINUATION || + (state == NORMAL && strcasestr(l, "usage"))) { + if (usage_contains_reload(l)) { + supports_reload = true; + state = NORMAL; + } else if (endswith(l, "\\")) + state = USAGE_CONTINUATION; + else + state = NORMAL; + } + + continue; + } + + if (state == NORMAL && streq(l, "### BEGIN INIT INFO")) { + state = LSB; + s->has_lsb = true; + continue; + } + + if (IN_SET(state, LSB_DESCRIPTION, LSB) && streq(l, "### END INIT INFO")) { + state = NORMAL; + continue; + } + + char *t = l + 1; + t += strspn(t, WHITESPACE); + + if (state == NORMAL) { + + /* Try to parse Red Hat style description */ + + if (startswith_no_case(t, "description:")) { + + size_t k; + const char *j; + + k = strlen(t); + if (k > 0 && t[k-1] == '\\') { + state = DESCRIPTION; + t[k-1] = 0; + } + + j = empty_to_null(strstrip(t+12)); + + r = free_and_strdup(&chkconfig_description, j); + if (r < 0) + return log_oom(); + + } else if (startswith_no_case(t, "pidfile:")) { + const char *fn; + + state = NORMAL; + + fn = strstrip(t+8); + if (!path_is_absolute(fn)) { + log_error("[%s:%u] PID file not absolute. 
Ignoring.", s->path, line); + continue; + } + + r = free_and_strdup(&s->pid_file, fn); + if (r < 0) + return log_oom(); + } + + } else if (state == DESCRIPTION) { + + /* Try to parse Red Hat style description + * continuation */ + + size_t k; + const char *j; + + k = strlen(t); + if (k > 0 && t[k-1] == '\\') + t[k-1] = 0; + else + state = NORMAL; + + j = strstrip(t); + if (!isempty(j) && !strextend_with_separator(&chkconfig_description, " ", j)) + return log_oom(); + + } else if (IN_SET(state, LSB, LSB_DESCRIPTION)) { + + if (startswith_no_case(t, "Provides:")) { + state = LSB; + + r = handle_provides(s, line, t, t + 9); + if (r < 0) + return r; + + } else if (startswith_no_case(t, "Required-Start:") || + startswith_no_case(t, "Should-Start:") || + startswith_no_case(t, "X-Start-Before:") || + startswith_no_case(t, "X-Start-After:")) { + + state = LSB; + + r = handle_dependencies(s, line, t, strchr(t, ':') + 1); + if (r < 0) + return r; + + } else if (startswith_no_case(t, "Description:")) { + const char *j; + + state = LSB_DESCRIPTION; + + j = empty_to_null(strstrip(t+12)); + + r = free_and_strdup(&long_description, j); + if (r < 0) + return log_oom(); + + } else if (startswith_no_case(t, "Short-Description:")) { + const char *j; + + state = LSB; + + j = empty_to_null(strstrip(t+18)); + + r = free_and_strdup(&short_description, j); + if (r < 0) + return log_oom(); + + } else if (state == LSB_DESCRIPTION) { + + if (startswith(l, "#\t") || startswith(l, "# ")) { + const char *j; + + j = strstrip(t); + if (!isempty(j) && !strextend_with_separator(&long_description, " ", j)) + return log_oom(); + } else + state = LSB; + } + } + } + + s->reload = supports_reload; + + /* We use the long description only if + * no short description is set. 
*/ + + if (short_description) + description = short_description; + else if (chkconfig_description) + description = chkconfig_description; + else if (long_description) + description = long_description; + else + description = NULL; + + if (description) { + char *d; + + d = strjoin(s->has_lsb ? "LSB: " : "SYSV: ", description); + if (!d) + return log_oom(); + + s->description = d; + } + + s->loaded = true; + return 0; +} + +static int fix_order(SysvStub *s, Hashmap *all_services) { + SysvStub *other; + int r; + + assert(s); + + if (!s->loaded) + return 0; + + if (s->sysv_start_priority < 0) + return 0; + + HASHMAP_FOREACH(other, all_services) { + if (s == other) + continue; + + if (!other->loaded) + continue; + + if (other->sysv_start_priority < 0) + continue; + + /* If both units have modern headers we don't care + * about the priorities */ + if (s->has_lsb && other->has_lsb) + continue; + + if (other->sysv_start_priority < s->sysv_start_priority) { + r = strv_extend(&s->after, other->name); + if (r < 0) + return log_oom(); + + } else if (other->sysv_start_priority > s->sysv_start_priority) { + r = strv_extend(&s->before, other->name); + if (r < 0) + return log_oom(); + } else + continue; + + /* FIXME: Maybe we should compare the name here lexicographically? 
*/ + } + + return 0; +} + +static int acquire_search_path(const char *def, const char *envvar, char ***ret) { + _cleanup_strv_free_ char **l = NULL; + const char *e; + int r; + + assert(def); + assert(envvar); + + e = getenv(envvar); + if (e) { + r = path_split_and_make_absolute(e, &l); + if (r < 0) + return log_error_errno(r, "Failed to make $%s search path absolute: %m", envvar); + } + + if (strv_isempty(l)) { + strv_free(l); + + l = strv_new(def); + if (!l) + return log_oom(); + } + + if (!path_strv_resolve_uniq(l, NULL)) + return log_oom(); + + *ret = TAKE_PTR(l); + + return 0; +} + +static int enumerate_sysv(const LookupPaths *lp, Hashmap *all_services) { + _cleanup_strv_free_ char **sysvinit_path = NULL; + int r; + + assert(lp); + + r = acquire_search_path(SYSTEM_SYSVINIT_PATH, "SYSTEMD_SYSVINIT_PATH", &sysvinit_path); + if (r < 0) + return r; + + STRV_FOREACH(path, sysvinit_path) { + _cleanup_closedir_ DIR *d = NULL; + + d = opendir(*path); + if (!d) { + if (errno != ENOENT) + log_warning_errno(errno, "Opening %s failed, ignoring: %m", *path); + continue; + } + + FOREACH_DIRENT(de, d, log_error_errno(errno, "Failed to enumerate directory %s, ignoring: %m", *path)) { + _cleanup_free_ char *fpath = NULL, *name = NULL; + _cleanup_(free_sysvstubp) SysvStub *service = NULL; + struct stat st; + + if (fstatat(dirfd(d), de->d_name, &st, 0) < 0) { + log_warning_errno(errno, "stat() failed on %s/%s, ignoring: %m", *path, de->d_name); + continue; + } + + if (!(st.st_mode & S_IXUSR)) + continue; + + if (!S_ISREG(st.st_mode)) + continue; + + name = sysv_translate_name(de->d_name); + if (!name) + return log_oom(); + + if (hashmap_contains(all_services, name)) + continue; + + r = unit_file_exists(RUNTIME_SCOPE_SYSTEM, lp, name); + if (r < 0 && !IN_SET(r, -ELOOP, -ERFKILL, -EADDRNOTAVAIL)) { + log_debug_errno(r, "Failed to detect whether %s exists, skipping: %m", name); + continue; + } else if (r != 0) { + log_debug("Native unit for %s already exists, skipping.", name); + 
continue; + } + + fpath = path_join(*path, de->d_name); + if (!fpath) + return log_oom(); + + log_struct(LOG_WARNING, + LOG_MESSAGE("SysV service '%s' lacks a native systemd unit file. " + "%s Automatically generating a unit file for compatibility. Please update package to include a native systemd unit file, in order to make it safe, robust and future-proof. " + "%s This compatibility logic is deprecated, expect removal soon. %s", + fpath, + special_glyph(SPECIAL_GLYPH_RECYCLING), + special_glyph(SPECIAL_GLYPH_WARNING_SIGN), special_glyph(SPECIAL_GLYPH_WARNING_SIGN)), + "MESSAGE_ID=" SD_MESSAGE_SYSV_GENERATOR_DEPRECATED_STR, + "SYSVSCRIPT=%s", fpath, + "UNIT=%s", name); + + service = new(SysvStub, 1); + if (!service) + return log_oom(); + + *service = (SysvStub) { + .sysv_start_priority = -1, + .name = TAKE_PTR(name), + .path = TAKE_PTR(fpath), + }; + + r = hashmap_put(all_services, service->name, service); + if (r < 0) + return log_oom(); + + TAKE_PTR(service); + } + } + + return 0; +} + +static int set_dependencies_from_rcnd(const LookupPaths *lp, Hashmap *all_services) { + Set *runlevel_services[ELEMENTSOF(rcnd_table)] = {}; + _cleanup_strv_free_ char **sysvrcnd_path = NULL; + SysvStub *service; + int r; + + assert(lp); + + r = acquire_search_path(SYSTEM_SYSVRCND_PATH, "SYSTEMD_SYSVRCND_PATH", &sysvrcnd_path); + if (r < 0) + return r; + + STRV_FOREACH(p, sysvrcnd_path) + for (unsigned i = 0; i < ELEMENTSOF(rcnd_table); i ++) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_free_ char *path = NULL; + + path = path_join(*p, rcnd_table[i].path); + if (!path) { + r = log_oom(); + goto finish; + } + + d = opendir(path); + if (!d) { + if (errno != ENOENT) + log_warning_errno(errno, "Opening %s failed, ignoring: %m", path); + + continue; + } + + FOREACH_DIRENT(de, d, log_warning_errno(errno, "Failed to enumerate directory %s, ignoring: %m", path)) { + _cleanup_free_ char *name = NULL, *fpath = NULL; + int a, b; + + if (de->d_name[0] != 'S') + continue; + + if 
(strlen(de->d_name) < 4) + continue; + + a = undecchar(de->d_name[1]); + b = undecchar(de->d_name[2]); + + if (a < 0 || b < 0) + continue; + + fpath = path_join(*p, de->d_name); + if (!fpath) { + r = log_oom(); + goto finish; + } + + name = sysv_translate_name(de->d_name + 3); + if (!name) { + r = log_oom(); + goto finish; + } + + service = hashmap_get(all_services, name); + if (!service) { + log_debug("Ignoring %s symlink in %s, not generating %s.", de->d_name, rcnd_table[i].path, name); + continue; + } + + service->sysv_start_priority = MAX(a*10 + b, service->sysv_start_priority); + + r = set_ensure_put(&runlevel_services[i], NULL, service); + if (r < 0) { + log_oom(); + goto finish; + } + } + } + + for (unsigned i = 0; i < ELEMENTSOF(rcnd_table); i++) + SET_FOREACH(service, runlevel_services[i]) { + r = strv_extend(&service->before, rcnd_table[i].target); + if (r < 0) { + log_oom(); + goto finish; + } + r = strv_extend(&service->wanted_by, rcnd_table[i].target); + if (r < 0) { + log_oom(); + goto finish; + } + } + + r = 0; + +finish: + for (unsigned i = 0; i < ELEMENTSOF(rcnd_table); i++) + set_free(runlevel_services[i]); + + return r; +} + +static int run(const char *dest, const char *dest_early, const char *dest_late) { + _cleanup_(free_sysvstub_hashmapp) Hashmap *all_services = NULL; + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + SysvStub *service; + int r; + + if (in_initrd()) { + log_debug("Skipping generator, running in the initrd."); + return EXIT_SUCCESS; + } + + assert_se(arg_dest = dest_late); + + r = lookup_paths_init_or_warn(&lp, RUNTIME_SCOPE_SYSTEM, LOOKUP_PATHS_EXCLUDE_GENERATED, NULL); + if (r < 0) + return r; + + all_services = hashmap_new(&string_hash_ops); + if (!all_services) + return log_oom(); + + r = enumerate_sysv(&lp, all_services); + if (r < 0) + return r; + + r = set_dependencies_from_rcnd(&lp, all_services); + if (r < 0) + return r; + + HASHMAP_FOREACH(service, all_services) + (void) load_sysv(service); + + 
def process_source_file(file):
    """Print one C table entry for every public symbol defined in a source file.

    Each line of ``file`` is checked against every rule below, in order; a line may
    satisfy more than one rule, and every match prints its entries:

    * ``_public_`` function definitions        -> ``{ "name", name }``
    * ``_public_`` variable definitions        -> ``{ "name", &name }``
    * ``DEFINE_PUBLIC_TRIVIAL_REF_FUNC``       -> ``{ "name_ref", name_ref }``
    * ``DEFINE_PUBLIC_TRIVIAL_UNREF_FUNC``     -> ``{ "name_unref", name_unref }``
    * ``DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC`` -> both of the above
    """
    # (pattern, capture group holding the symbol name, output templates)
    rules = (
        # Functions
        (r'^_public_\s+(\S+\s+)+\**(\w+)\s*\(', 2,
         (' {{ "{0}", {0} }},',)),
        # Variables
        (r'^_public_\s+(\S+\s+)+\**(\w+)\s*=', 2,
         (' {{ "{0}", &{0} }},',)),
        # Functions defined through a macro
        (r'^DEFINE_PUBLIC_TRIVIAL_REF_FUNC\([^,]+,\s*(\w+)\s*\)', 1,
         (' {{ "{0}_ref", {0}_ref }},',)),
        (r'^DEFINE_PUBLIC_TRIVIAL_UNREF_FUNC\([^,]+,\s*(\w+)\s*,', 1,
         (' {{ "{0}_unref", {0}_unref }},',)),
        (r"^DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC\([^,]+,\s*(\w+)\s*,", 1,
         (' {{ "{0}_ref", {0}_ref }},',
          ' {{ "{0}_unref", {0}_unref }},')),
    )

    for text in file:
        for pattern, group, templates in rules:
            found = re.search(pattern, text)
            if found is None:
                continue
            for template in templates:
                print(template.format(found[group]))
+with open(sys.argv[1], "r") as f: + process_sym_file(f) + +print(''' {} +}, symbols_from_source[] = {''') + +for dirpath, _, filenames in sorted(os.walk(sys.argv[2])): + for filename in sorted(filenames): + if not filename.endswith(".c") and not filename.endswith(".h"): + continue + with open(os.path.join(dirpath, filename), "r") as f: + process_source_file(f) + +print(''' {} +}; + +static int sort_callback(const void *a, const void *b) { + const struct symbol *x = a, *y = b; + return strcmp(x->name, y->name); +} + +int main(void) { + size_t i, j; + + qsort(symbols_from_sym, sizeof(symbols_from_sym)/sizeof(symbols_from_sym[0])-1, sizeof(symbols_from_sym[0]), sort_callback); + qsort(symbols_from_source, sizeof(symbols_from_source)/sizeof(symbols_from_source[0])-1, sizeof(symbols_from_source[0]), sort_callback); + + puts("From symbol file:"); + for (i = 0; symbols_from_sym[i].name; i++) + printf("%p: %s\\n", symbols_from_sym[i].symbol, symbols_from_sym[i].name); + + puts("\\nFrom source files:"); + for (j = 0; symbols_from_source[j].name; j++) + printf("%p: %s\\n", symbols_from_source[j].symbol, symbols_from_source[j].name); + + puts(""); + printf("Found %zu symbols from symbol file.\\n", i); + printf("Found %zu symbols from source files.\\n", j); + + for (i = 0; symbols_from_sym[i].name; i++) { + struct symbol*n = bsearch(symbols_from_sym+i, symbols_from_source, sizeof(symbols_from_source)/sizeof(symbols_from_source[0])-1, sizeof(symbols_from_source[0]), sort_callback); + if (!n) + printf("Found in symbol file, but not in sources: %s\\n", symbols_from_sym[i].name); + } + + for (j = 0; symbols_from_source[j].name; j++) { + struct symbol*n = bsearch(symbols_from_source+j, symbols_from_source, sizeof(symbols_from_sym)/sizeof(symbols_from_sym[0])-1, sizeof(symbols_from_sym[0]), sort_callback); + if (!n) + printf("Found in sources, but not in symbol file: %s\\n", symbols_from_source[i].name); + } + + return i == j ? 
EXIT_SUCCESS : EXIT_FAILURE; +}''') diff --git a/src/test/meson.build b/src/test/meson.build new file mode 100644 index 0000000..cce90d7 --- /dev/null +++ b/src/test/meson.build @@ -0,0 +1,597 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +awkscript = 'test-hashmap-ordered.awk' +test_hashmap_ordered_c = custom_target( + 'test-hashmap-ordered.c', + input : [awkscript, 'test-hashmap-plain.c'], + output : 'test-hashmap-ordered.c', + command : [awk, '-f', '@INPUT0@', '@INPUT1@'], + capture : true, + build_by_default : want_tests != 'false') + +path = run_command(sh, '-c', 'echo "$PATH"', check: true).stdout().strip() +test_env = environment() +test_env.set('SYSTEMD_LANGUAGE_FALLBACK_MAP', language_fallback_map) +test_env.set('PATH', project_build_root + ':' + path) +test_env.set('PROJECT_BUILD_ROOT', project_build_root) +test_env.set('SYSTEMD_SLOW_TESTS', slow_tests ? '1' : '0') + +if efi_addon != '' + test_env.set('EFI_ADDON', efi_addon) +endif + +############################################################ + +generate_sym_test_py = find_program('generate-sym-test.py') + +test_libsystemd_sym_c = custom_target( + 'test-libsystemd-sym.c', + input : [libsystemd_sym_path] + systemd_headers + libsystemd_sources, + output : 'test-libsystemd-sym.c', + command : [generate_sym_test_py, libsystemd_sym_path, libsystemd_dir_path] + systemd_headers, + capture : true, + build_by_default : want_tests != 'false') + +test_libudev_sym_c = custom_target( + 'test-libudev-sym.c', + input : [libudev_sym_path, libudev_h_path] + libudev_sources, + output : 'test-libudev-sym.c', + command : [generate_sym_test_py, libudev_sym_path, libudev_dir_path, libudev_h_path], + capture : true, + build_by_default : want_tests != 'false') + +############################################################ + +simple_tests += files( + 'test-alloc-util.c', + 'test-architecture.c', + 'test-argv-util.c', + 'test-barrier.c', + 'test-bitfield.c', + 'test-bitmap.c', + 'test-blockdev-util.c', + 'test-bootspec.c', 
+ 'test-bus-util.c', + 'test-calendarspec.c', + 'test-cgroup-setup.c', + 'test-cgroup-util.c', + 'test-cgroup.c', + 'test-chase.c', + 'test-clock.c', + 'test-compare-operator.c', + 'test-condition.c', + 'test-conf-files.c', + 'test-conf-parser.c', + 'test-copy.c', + 'test-coredump-util.c', + 'test-cpu-set-util.c', + 'test-creds.c', + 'test-daemon.c', + 'test-data-fd-util.c', + 'test-date.c', + 'test-dev-setup.c', + 'test-device-nodes.c', + 'test-devnum-util.c', + 'test-dns-domain.c', + 'test-ellipsize.c', + 'test-env-file.c', + 'test-env-util.c', + 'test-errno-util.c', + 'test-escape.c', + 'test-ether-addr-util.c', + 'test-exec-util.c', + 'test-execve.c', + 'test-exit-status.c', + 'test-extract-word.c', + 'test-fdset.c', + 'test-fiemap.c', + 'test-fileio.c', + 'test-firewall-util.c', + 'test-format-table.c', + 'test-format-util.c', + 'test-fs-util.c', + 'test-fstab-util.c', + 'test-glob-util.c', + 'test-gpt.c', + 'test-gunicode.c', + 'test-hash-funcs.c', + 'test-hexdecoct.c', + 'test-hmac.c', + 'test-hostname-setup.c', + 'test-hostname-util.c', + 'test-id128.c', + 'test-image-policy.c', + 'test-import-util.c', + 'test-in-addr-prefix-util.c', + 'test-in-addr-util.c', + 'test-install-file.c', + 'test-install-root.c', + 'test-io-util.c', + 'test-journal-importer.c', + 'test-kbd-util.c', + 'test-limits-util.c', + 'test-list.c', + 'test-local-addresses.c', + 'test-locale-util.c', + 'test-lock-util.c', + 'test-log.c', + 'test-logarithm.c', + 'test-macro.c', + 'test-memfd-util.c', + 'test-memory-util.c', + 'test-mempool.c', + 'test-memstream-util.c', + 'test-mkdir.c', + 'test-modhex.c', + 'test-mountpoint-util.c', + 'test-net-naming-scheme.c', + 'test-nulstr-util.c', + 'test-open-file.c', + 'test-ordered-set.c', + 'test-os-util.c', + 'test-parse-argument.c', + 'test-parse-helpers.c', + 'test-path-lookup.c', + 'test-path-util.c', + 'test-percent-util.c', + 'test-pretty-print.c', + 'test-prioq.c', + 'test-proc-cmdline.c', + 'test-procfs-util.c', + 'test-psi-util.c', + 
'test-ratelimit.c', + 'test-raw-clone.c', + 'test-recurse-dir.c', + 'test-replace-var.c', + 'test-rlimit-util.c', + 'test-rm-rf.c', + 'test-sd-hwdb.c', + 'test-sd-path.c', + 'test-secure-bits.c', + 'test-selinux.c', + 'test-serialize.c', + 'test-set.c', + 'test-sha256.c', + 'test-sigbus.c', + 'test-signal-util.c', + 'test-siphash24.c', + 'test-sleep-config.c', + 'test-socket-netlink.c', + 'test-socket-util.c', + 'test-specifier.c', + 'test-stat-util.c', + 'test-static-destruct.c', + 'test-strbuf.c', + 'test-string-util.c', + 'test-strip-tab-ansi.c', + 'test-strv.c', + 'test-strxcpyx.c', + 'test-sysctl-util.c', + 'test-terminal-util.c', + 'test-tmpfile-util.c', + 'test-udev-util.c', + 'test-uid-alloc-range.c', + 'test-uid-range.c', + 'test-umask-util.c', + 'test-unaligned.c', + 'test-unit-file.c', + 'test-user-util.c', + 'test-utf8.c', + 'test-verbs.c', + 'test-web-util.c', + 'test-xattr-util.c', + 'test-xml.c', +) + +############################################################ + +common_test_dependencies = [ + libblkid, + libmount, + librt, + libseccomp, + libselinux, + threads, +] + +executables += [ + test_template + { + 'sources' : files('test-acl-util.c'), + 'conditions' : ['HAVE_ACL'], + }, + test_template + { + 'sources' : files('test-af-list.c') + + generated_gperf_headers, + }, + test_template + { + 'sources' : files('test-arphrd-util.c') + + generated_gperf_headers, + }, + test_template + { + 'sources' : files('test-ask-password-api.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-async.c'), + 'timeout' : 120, + }, + test_template + { + 'sources' : files('test-boot-timestamps.c'), + 'conditions' : ['ENABLE_EFI'], + }, + test_template + { + 'sources' : files('test-btrfs.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-btrfs-physical-offset.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-cap-list.c') + + generated_gperf_headers, + 'dependencies' : libcap, + }, + 
test_template + { + 'sources' : files('test-capability.c'), + 'dependencies' : libcap, + }, + test_template + { + 'sources' : files('test-chase-manual.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-compress-benchmark.c'), + 'link_with' : [ + libbasic_compress, + libshared, + ], + 'timeout' : 90, + }, + test_template + { + 'sources' : files('test-compress.c'), + 'link_with' : [ + libbasic_compress, + libshared, + ], + }, + test_template + { + 'sources' : files('test-cryptolib.c'), + 'dependencies' : lib_openssl_or_gcrypt, + 'conditions' : ['HAVE_OPENSSL_OR_GCRYPT'], + }, + test_template + { + 'sources' : files('test-dlopen-so.c'), + 'dependencies' : libp11kit_cflags + }, + test_template + { + # only static linking apart from libdl, to make sure that the + # module is linked to all libraries that it uses. + 'sources' : files('test-dlopen.c'), + 'link_with' : libbasic, + 'dependencies' : libdl, + 'install' : false, + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-errno-list.c') + + generated_gperf_headers, + }, + test_template + { + 'sources' : files('test-fd-util.c'), + 'dependencies' : libseccomp, + }, + test_template + { + 'sources' : files( + 'test-hashmap.c', + 'test-hashmap-plain.c', + ) + [ + test_hashmap_ordered_c, + ], + 'timeout' : 180, + }, + test_template + { + 'sources' : files('test-ip-protocol-list.c') + + shared_generated_gperf_headers, + }, + test_template + { + 'sources' : files('test-ipcrm.c'), + 'type' : 'unsafe', + }, + test_template + { + 'sources' : files('test-json.c'), + 'dependencies' : libm, + }, + test_template + { + 'sources' : files('test-libcrypt-util.c'), + 'dependencies' : libcrypt, + 'timeout' : 120, + }, + test_template + { + 'sources' : files('test-libmount.c'), + 'dependencies' : [ + libmount, + threads, + ], + }, + test_template + { + 'sources' : files('test-loopback.c'), + 'dependencies' : common_test_dependencies, + }, + test_template + { + 'sources' : 
files('test-math-util.c'), + 'dependencies' : libm, + }, + test_template + { + 'sources' : files('test-mempress.c'), + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('test-mount-util.c'), + 'dependencies' : libmount, + }, + test_template + { + 'sources' : files('test-netlink-manual.c'), + 'dependencies' : libkmod, + 'conditions' : ['HAVE_KMOD'], + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-nft-set.c'), + 'type' : 'manual', + }, + test_template + { + 'sources' : files('test-nscd-flush.c'), + 'conditions' : ['ENABLE_NSCD'], + 'type' : 'manual', + }, + test_template + { + 'sources' : files( + 'test-nss-hosts.c', + 'nss-test-util.c', + ), + 'dependencies' : libdl, + 'conditions' : ['ENABLE_NSS'], + 'timeout' : 120, + }, + test_template + { + 'sources' : files( + 'test-nss-users.c', + 'nss-test-util.c', + ), + 'dependencies' : libdl, + 'conditions' : ['ENABLE_NSS'], + }, + test_template + { + 'sources' : files('test-openssl.c'), + 'dependencies' : libopenssl, + 'conditions' : ['HAVE_OPENSSL'], + }, + test_template + { + 'sources' : files('test-parse-util.c'), + 'dependencies' : libm, + }, + test_template + { + 'sources' : files('test-process-util.c'), + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('test-qrcode-util.c'), + 'dependencies' : libdl, + }, + test_template + { + 'sources' : files('test-random-util.c'), + 'dependencies' : libm, + 'timeout' : 120, + }, + test_template + { + 'sources' : files('test-sbat.c'), + 'conditions' : ['ENABLE_BOOTLOADER'], + 'c_args' : '-I@0@'.format(efi_config_h_dir), + }, + test_template + { + 'sources' : files('test-seccomp.c'), + 'dependencies' : libseccomp, + 'conditions' : ['HAVE_SECCOMP'], + }, + test_template + { + 'sources' : files('test-set-disable-mempool.c'), + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('test-sizeof.c'), + 'link_with' : libbasic, + }, + test_template + { + 'sources' : files('test-time-util.c'), + 'timeout' 
: 120, + }, + test_template + { + 'sources' : files('test-tpm2.c'), + 'dependencies' : libopenssl, + 'timeout' : 120, + }, + test_template + { + 'sources' : files('test-utmp.c'), + 'conditions' : ['ENABLE_UTMP'], + }, + test_template + { + 'sources' : files('test-varlink.c'), + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('test-varlink-idl.c'), + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('test-watchdog.c'), + 'type' : 'unsafe', + }, + + # Symbol tests + test_template + { + 'name' : 'test-libsystemd-sym', + 'sources' : test_libsystemd_sym_c, + 'link_with' : libsystemd, + 'suite' : 'libsystemd', + }, + test_template + { + 'name' : 'test-libsystemd-static-sym', + 'sources' : test_libsystemd_sym_c, + 'link_with' : install_libsystemd_static, + 'build_by_default' : want_tests != 'false' and static_libsystemd != 'false', + 'install' : install_tests and static_libsystemd != 'false', + 'suite' : 'libsystemd', + }, + test_template + { + 'name' : 'test-libudev-sym', + 'sources' : test_libudev_sym_c, + 'include_directories' : libudev_includes, + 'c_args' : ['-Wno-deprecated-declarations'] + test_cflags, + 'link_with' : libudev, + 'suite' : 'libudev', + }, + test_template + { + 'name' : 'test-libudev-static-sym', + 'sources' : test_libudev_sym_c, + 'include_directories' : libudev_includes, + 'c_args' : ['-Wno-deprecated-declarations'] + test_cflags, + 'link_with' : install_libudev_static, + 'build_by_default' : want_tests != 'false' and static_libudev != 'false', + 'install' : install_tests and static_libudev != 'false', + 'suite' : 'libudev', + }, + + # Tests that link to libcore, i.e. tests for pid1 code. 
+ core_test_template + { + 'sources' : files('test-bpf-devices.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-bpf-firewall.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-bpf-foreign-programs.c'), + }, + core_test_template + { + 'sources' : files('test-bpf-lsm.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-cgroup-cpu.c'), + }, + core_test_template + { + 'sources' : files('test-cgroup-mask.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-cgroup-unit-default.c'), + }, + core_test_template + { + 'sources' : files('test-chown-rec.c'), + }, + core_test_template + { + 'sources' : files('test-core-unit.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-emergency-action.c'), + }, + core_test_template + { + 'sources' : files('test-engine.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-execute.c'), + 'dependencies' : common_test_dependencies, + 'timeout' : 360, + }, + core_test_template + { + 'sources' : files('test-install.c'), + 'type' : 'manual', + }, + core_test_template + { + 'sources' : files('test-job-type.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-load-fragment.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-loop-block.c'), + 'dependencies' : [threads, libblkid], + 'parallel' : false, + }, + core_test_template + { + 'sources' : files('test-manager.c'), + }, + core_test_template + { + 'sources' : files('test-namespace.c'), + 'dependencies' : [ + libblkid, + threads, + ], + }, + core_test_template + { + 'sources' : files('test-ns.c'), + 'dependencies' : common_test_dependencies, + 'type' : 'manual', + }, + 
core_test_template + { + 'sources' : files('test-path.c'), + 'dependencies' : common_test_dependencies, + 'timeout' : 120, + }, + core_test_template + { + 'sources' : files('test-sched-prio.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-socket-bind.c'), + 'dependencies' : libdl, + 'conditions' : ['BPF_FRAMEWORK'], + }, + core_test_template + { + 'sources' : files('test-tables.c'), + }, + core_test_template + { + 'sources' : files('test-unit-name.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-unit-serialize.c'), + 'dependencies' : common_test_dependencies, + }, + core_test_template + { + 'sources' : files('test-watch-pid.c'), + 'dependencies' : common_test_dependencies, + }, + + # Tests from other directories that have link_with deps that were not defined earlier + test_template + { + 'sources' : files('../libsystemd/sd-bus/test-bus-error.c'), + 'link_with' : [ + libshared_static, + libsystemd_static, + ], + }, + test_template + { + 'sources' : files('../libsystemd/sd-device/test-sd-device-thread.c'), + 'link_with' : libsystemd, + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('../libudev/test-udev-device-thread.c'), + 'link_with' : libudev, + 'dependencies' : threads, + }, + test_template + { + 'sources' : files('../libudev/test-libudev.c'), + 'link_with' : [ + libshared, + libudev_basic, + ], + }, +] diff --git a/src/test/nss-test-util.c b/src/test/nss-test-util.c new file mode 100644 index 0000000..20643f8 --- /dev/null +++ b/src/test/nss-test-util.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "nss-test-util.h" +#include "string-util.h" + +const char* nss_status_to_string(enum nss_status status, char *buf, size_t buf_len) { + switch (status) { + case NSS_STATUS_TRYAGAIN: + return "NSS_STATUS_TRYAGAIN"; + case NSS_STATUS_UNAVAIL: + return 
"NSS_STATUS_UNAVAIL"; + case NSS_STATUS_NOTFOUND: + return "NSS_STATUS_NOTFOUND"; + case NSS_STATUS_SUCCESS: + return "NSS_STATUS_SUCCESS"; + case NSS_STATUS_RETURN: + return "NSS_STATUS_RETURN"; + default: + (void) snprintf(buf, buf_len, "%i", status); + return buf; + } +}; + +void* nss_open_handle(const char *dir, const char *module, int flags) { + const char *path = NULL; + void *handle; + + if (dir) + path = strjoina(dir, "/libnss_", module, ".so.2"); + if (!path || access(path, F_OK) < 0) + path = strjoina("libnss_", module, ".so.2"); + + log_debug("Using %s", path); + handle = dlopen(path, flags); + if (!handle) + log_error("Failed to load module %s: %s", module, dlerror()); + return handle; +} diff --git a/src/test/nss-test-util.h b/src/test/nss-test-util.h new file mode 100644 index 0000000..f081e64 --- /dev/null +++ b/src/test/nss-test-util.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include +#include + +const char* nss_status_to_string(enum nss_status status, char *buf, size_t buf_len); +void* nss_open_handle(const char *dir, const char *module, int flags); diff --git a/src/test/test-acl-util.c b/src/test/test-acl-util.c new file mode 100644 index 0000000..eb9678a --- /dev/null +++ b/src/test/test-acl-util.c @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "acl-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "user-util.h" + +TEST_RET(add_acls_for_user) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-empty.XXXXXX"; + _cleanup_close_ int fd = -EBADF; + char *cmd; + uid_t uid; + int r; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + /* Use the mode that user journal files use */ + assert_se(fchmod(fd, 0640) == 0); + + cmd = strjoina("ls -l ", fn); + assert_se(system(cmd) == 0); + + cmd = 
strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + if (getuid() == 0) { + const char *nobody = NOBODY_USER_NAME; + r = get_user_creds(&nobody, &uid, NULL, NULL, NULL, 0); + if (r < 0) + uid = 0; + } else + uid = getuid(); + + r = fd_add_uid_acl_permission(fd, uid, ACL_READ); + if (ERRNO_IS_NOT_SUPPORTED(r)) + return log_tests_skipped("no ACL support on /tmp"); + + log_info_errno(r, "fd_add_uid_acl_permission(%i, "UID_FMT", ACL_READ): %m", fd, uid); + assert_se(r >= 0); + + cmd = strjoina("ls -l ", fn); + assert_se(system(cmd) == 0); + + cmd = strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + /* set the acls again */ + + r = fd_add_uid_acl_permission(fd, uid, ACL_READ); + assert_se(r >= 0); + + cmd = strjoina("ls -l ", fn); + assert_se(system(cmd) == 0); + + cmd = strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + return 0; +} + +TEST(fd_acl_make_read_only) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-empty.XXXXXX"; + _cleanup_close_ int fd = -EBADF; + const char *cmd; + struct stat st; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + /* make it more exciting */ + (void) fd_add_uid_acl_permission(fd, 1, ACL_READ|ACL_WRITE|ACL_EXECUTE); + + assert_se(fstat(fd, &st) >= 0); + assert_se((st.st_mode & 0200) == 0200); + + cmd = strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + cmd = strjoina("stat ", fn); + assert_se(system(cmd) == 0); + + log_info("read-only"); + assert_se(fd_acl_make_read_only(fd)); + + assert_se(fstat(fd, &st) >= 0); + assert_se((st.st_mode & 0222) == 0000); + + cmd = strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + cmd = strjoina("stat ", fn); + assert_se(system(cmd) == 0); + + log_info("writable"); + assert_se(fd_acl_make_writable(fd)); + + assert_se(fstat(fd, &st) >= 0); + assert_se((st.st_mode & 0222) == 0200); + + cmd = strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + cmd = strjoina("stat ", fn); + assert_se(system(cmd) == 0); + + 
log_info("read-only"); + assert_se(fd_acl_make_read_only(fd)); + + assert_se(fstat(fd, &st) >= 0); + assert_se((st.st_mode & 0222) == 0000); + + cmd = strjoina("getfacl -p ", fn); + assert_se(system(cmd) == 0); + + cmd = strjoina("stat ", fn); + assert_se(system(cmd) == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-af-list.c b/src/test/test-af-list.c new file mode 100644 index 0000000..45655d7 --- /dev/null +++ b/src/test/test-af-list.c @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "macro.h" +#include "string-util.h" +#include "tests.h" + +_unused_ +static const struct af_name* lookup_af(register const char *str, register GPERF_LEN_TYPE len); + +#include "af-from-name.h" +#include "af-list.h" +#include "af-to-name.h" + +TEST(af_list) { + for (unsigned i = 0; i < ELEMENTSOF(af_names); i++) { + if (af_names[i]) { + assert_se(streq(af_to_name(i), af_names[i])); + assert_se(af_from_name(af_names[i]) == (int) i); + } + } + + assert_se(af_to_name(af_max()) == NULL); + assert_se(af_to_name(-1) == NULL); + assert_se(af_from_name("huddlduddl") == -EINVAL); + assert_se(af_from_name("") == -EINVAL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-alloc-util.c b/src/test/test-alloc-util.c new file mode 100644 index 0000000..24cb5f7 --- /dev/null +++ b/src/test/test-alloc-util.c @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "macro.h" +#include "memory-util.h" +#include "random-util.h" +#include "tests.h" + +TEST(alloca) { + static const uint8_t zero[997] = { }; + char *t; + + t = alloca_align(17, 512); + assert_se(!((uintptr_t)t & 0xff)); + memzero(t, 17); + + t = alloca0_align(997, 1024); + assert_se(!((uintptr_t)t & 0x1ff)); + assert_se(!memcmp(t, zero, 997)); +} + +TEST(GREEDY_REALLOC) { + _cleanup_free_ int *a = NULL, *b = NULL, *c = NULL; + size_t i, j, n_c = 0; + + /* Give valgrind a chance to verify our realloc() 
operations */ + + for (i = 0; i < 20480; i++) { + assert_se(GREEDY_REALLOC(a, i + 1)); + assert_se(MALLOC_ELEMENTSOF(a) >= i + 1); + assert_se(MALLOC_SIZEOF_SAFE(a) >= (i + 1) * sizeof(int)); + a[i] = (int) i; + assert_se(GREEDY_REALLOC(a, i / 2)); + assert_se(MALLOC_ELEMENTSOF(a) >= i / 2); + assert_se(MALLOC_SIZEOF_SAFE(a) >= (i / 2) * sizeof(int)); + } + + for (j = 0; j < i / 2; j++) + assert_se(a[j] == (int) j); + + for (i = 30; i < 20480; i += 7) { + assert_se(GREEDY_REALLOC(b, i + 1)); + assert_se(MALLOC_ELEMENTSOF(b) >= i + 1); + assert_se(MALLOC_SIZEOF_SAFE(b) >= (i + 1) * sizeof(int)); + b[i] = (int) i; + assert_se(GREEDY_REALLOC(b, i / 2)); + assert_se(MALLOC_ELEMENTSOF(b) >= i / 2); + assert_se(MALLOC_SIZEOF_SAFE(b) >= (i / 2) * sizeof(int)); + } + + for (j = 30; j < i / 2; j += 7) + assert_se(b[j] == (int) j); + + size_t n_from = 10; + int from[n_from]; + for (i = 0; i < 2048; i++) { + for (j = 0; j < n_from; j++) + from[j] = n_from * i + j; + + _cleanup_free_ int *before = NULL; + size_t n_before = 0; + assert_se(GREEDY_REALLOC_APPEND(before, n_before, c, n_c)); + assert_se(before); + assert_se(n_before == n_c); + assert_se(memcmp_safe(c, before, n_c) == 0); + + assert_se(GREEDY_REALLOC_APPEND(c, n_c, from, n_from)); + assert_se(n_c == n_before + n_from); + assert_se(MALLOC_ELEMENTSOF(c) >= n_c); + assert_se(MALLOC_SIZEOF_SAFE(c) >= n_c * sizeof(int)); + assert_se(memcmp_safe(c, before, n_before) == 0); + assert_se(memcmp_safe(&c[n_before], from, n_from) == 0); + + before = mfree(before); + assert_se(!before); + n_before = 0; + assert_se(GREEDY_REALLOC_APPEND(before, n_before, c, n_c)); + assert_se(before); + assert_se(n_before == n_c); + assert_se(memcmp_safe(c, before, n_c) == 0); + + assert_se(GREEDY_REALLOC_APPEND(c, n_c, NULL, 0)); + assert_se(c); + assert_se(n_c == n_before); + assert_se(MALLOC_ELEMENTSOF(c) >= n_c); + assert_se(MALLOC_SIZEOF_SAFE(c) >= n_c * sizeof(int)); + assert_se(memcmp_safe(c, before, n_c) == 0); + } + + for (j = 0; j < i * 
n_from; j++) + assert_se(c[j] == (int) j); +} + +TEST(memdup_multiply_and_greedy_realloc) { + static const int org[] = { 1, 2, 3 }; + _cleanup_free_ int *dup; + size_t i; + int *p; + + dup = memdup_suffix0_multiply(org, 3, sizeof(int)); + assert_se(dup); + assert_se(dup[0] == 1); + assert_se(dup[1] == 2); + assert_se(dup[2] == 3); + assert_se(((uint8_t*) dup)[sizeof(int) * 3] == 0); + free(dup); + + dup = memdup_multiply(org, 3, sizeof(int)); + assert_se(dup); + assert_se(dup[0] == 1); + assert_se(dup[1] == 2); + assert_se(dup[2] == 3); + + memzero(dup + 3, malloc_usable_size(dup) - sizeof(int) * 3); + + p = dup; + assert_se(GREEDY_REALLOC0(dup, 2) == p); + + p = GREEDY_REALLOC0(dup, 10); + assert_se(p == dup); + assert_se(MALLOC_ELEMENTSOF(p) >= 10); + assert_se(p[0] == 1); + assert_se(p[1] == 2); + assert_se(p[2] == 3); + for (i = 3; i < MALLOC_ELEMENTSOF(p); i++) + assert_se(p[i] == 0); +} + +TEST(bool_assign) { + bool b, c, *cp = &c, d, e, f, g, h; + + b = 123; + *cp = -11; + d = 0xF & 0xFF; + e = b & d; + f = 0x0; + g = cp; /* cast from pointer */ + h = NULL; /* cast from pointer */ + + assert_se(b); + assert_se(c); + assert_se(d); + assert_se(e); + assert_se(!f); + assert_se(g); + assert_se(!h); +} + +static int cleanup_counter = 0; + +static void cleanup1(void *a) { + log_info("%s(%p)", __func__, a); + assert_se(++cleanup_counter == *(int*) a); +} +static void cleanup2(void *a) { + log_info("%s(%p)", __func__, a); + assert_se(++cleanup_counter == *(int*) a); +} +static void cleanup3(void *a) { + log_info("%s(%p)", __func__, a); + assert_se(++cleanup_counter == *(int*) a); +} + +TEST(cleanup_order) { + _cleanup_(cleanup1) int x1 = 4, x2 = 3; + _cleanup_(cleanup3) int z = 2; + _cleanup_(cleanup2) int y = 1; + log_debug("x1: %p", &x1); + log_debug("x2: %p", &x2); + log_debug("y: %p", &y); + log_debug("z: %p", &z); +} + +TEST(auto_erase_memory) { + _cleanup_(erase_and_freep) uint8_t *p1, *p2; + + /* print address of p2, else e.g. 
clang-11 will optimize it out */ + log_debug("p1: %p p2: %p", &p1, &p2); + + assert_se(p1 = new(uint8_t, 4703)); /* use prime size, to ensure that there will be free space at the + * end of the allocation, since malloc() enforces alignment */ + assert_se(p2 = new(uint8_t, 4703)); + + assert_se(crypto_random_bytes(p1, 4703) == 0); + + /* before we exit the scope, do something with this data, so that the compiler won't optimize this away */ + memcpy(p2, p1, 4703); + for (size_t i = 0; i < 4703; i++) + assert_se(p1[i] == p2[i]); +} + +#define TEST_SIZES(f, n) \ + do { \ + log_debug("requested=%zu vs. malloc_size=%zu vs. gcc_size=%zu", \ + n * sizeof(*f), \ + malloc_usable_size(f), \ + __builtin_object_size(f, 0)); \ + assert_se(MALLOC_ELEMENTSOF(f) >= n); \ + assert_se(MALLOC_SIZEOF_SAFE(f) >= sizeof(*f) * n); \ + assert_se(malloc_usable_size(f) >= sizeof(*f) * n); \ + assert_se(__builtin_object_size(f, 0) >= sizeof(*f) * n); \ + } while (false) + +TEST(malloc_size_safe) { + _cleanup_free_ uint32_t *f = NULL; + size_t n = 4711; + + /* Let's check the macros and built-ins work on NULL and return the expected values */ + assert_se(MALLOC_ELEMENTSOF((float*) NULL) == 0); + assert_se(MALLOC_SIZEOF_SAFE((float*) NULL) == 0); + assert_se(malloc_usable_size(NULL) == 0); /* as per man page, this is safe and defined */ + assert_se(__builtin_object_size(NULL, 0) == SIZE_MAX); /* as per docs SIZE_MAX is returned for pointers where the size isn't known */ + + /* Then, let's try these macros once with constant size values, so that __builtin_object_size() + * definitely can work (as long as -O2 is used when compiling) */ + assert_se(f = new(uint32_t, n)); + TEST_SIZES(f, n); + + /* Finally, let's use some dynamically sized allocations, to make sure this doesn't deteriorate */ + for (unsigned i = 0; i < 50; i++) { + _cleanup_free_ uint64_t *g = NULL; + size_t m; + + m = random_u64_range(16*1024); + assert_se(g = new(uint64_t, m)); + TEST_SIZES(g, m); + } +} + 
+DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-architecture.c b/src/test/test-architecture.c new file mode 100644 index 0000000..8731e1c --- /dev/null +++ b/src/test/test-architecture.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "architecture.h" +#include "errno-util.h" +#include "log.h" +#include "tests.h" +#include "virt.h" + +int main(int argc, char *argv[]) { + Virtualization v; + Architecture a; + const char *p; + + test_setup_logging(LOG_INFO); + + assert_se(architecture_from_string("") < 0); + assert_se(architecture_from_string(NULL) < 0); + assert_se(architecture_from_string("hoge") < 0); + assert_se(architecture_to_string(-1) == NULL); + assert_se(architecture_from_string(architecture_to_string(0)) == 0); + assert_se(architecture_from_string(architecture_to_string(1)) == 1); + + v = detect_virtualization(); + if (ERRNO_IS_NEG_PRIVILEGE(v)) + return log_tests_skipped("Cannot detect virtualization"); + + assert_se(v >= 0); + + log_info("virtualization=%s id=%s", + VIRTUALIZATION_IS_CONTAINER(v) ? "container" : + VIRTUALIZATION_IS_VM(v) ? 
"vm" : "n/a", + virtualization_to_string(v)); + + a = uname_architecture(); + assert_se(a >= 0); + + p = architecture_to_string(a); + assert_se(p); + log_info("uname architecture=%s", p); + assert_se(architecture_from_string(p) == a); + + a = native_architecture(); + assert_se(a >= 0); + + p = architecture_to_string(a); + assert_se(p); + log_info("native architecture=%s", p); + assert_se(architecture_from_string(p) == a); + + log_info("primary library architecture=" LIB_ARCH_TUPLE); + + return 0; +} diff --git a/src/test/test-argv-util.c b/src/test/test-argv-util.c new file mode 100644 index 0000000..5bf2903 --- /dev/null +++ b/src/test/test-argv-util.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#if HAVE_VALGRIND_VALGRIND_H +# include +#endif + +#include "argv-util.h" +#include "missing_sched.h" +#include "process-util.h" +#include "tests.h" +#include "virt.h" + +static void test_rename_process_now(const char *p, int ret) { + _cleanup_free_ char *comm = NULL, *cmdline = NULL; + int r; + + log_info("/* %s(%s) */", __func__, p); + + r = rename_process(p); + assert_se(r == ret || + (ret == 0 && r >= 0) || + (ret > 0 && r > 0)); + + log_debug_errno(r, "rename_process(%s): %m", p); + + if (r < 0) + return; + +#if HAVE_VALGRIND_VALGRIND_H + /* see above, valgrind is weird, we can't verify what we are doing here */ + if (RUNNING_ON_VALGRIND) + return; +#endif + + assert_se(pid_get_comm(0, &comm) >= 0); + log_debug("comm = <%s>", comm); + assert_se(strneq(comm, p, TASK_COMM_LEN-1)); + /* We expect comm to be at most 16 bytes (TASK_COMM_LEN). The kernel may raise this limit in the + * future. We'd only check the initial part, at least until we recompile, but this will still pass. 
*/ + + r = pid_get_cmdline(0, SIZE_MAX, 0, &cmdline); + assert_se(r >= 0); + /* we cannot expect cmdline to be renamed properly without privileges */ + if (geteuid() == 0) { + if (r == 0 && detect_container() > 0) + log_info("cmdline = <%s> (not verified, Running in unprivileged container?)", cmdline); + else { + log_info("cmdline = <%s> (expected <%.*s>)", cmdline, (int) strlen("test-process-util"), p); + + bool skip = cmdline[0] == '"'; /* A shortcut to check if the string is quoted */ + + assert_se(strneq(cmdline + skip, p, strlen("test-process-util"))); /* NOTE(review): only the length of this literal is used as comparison bound; the name looks like a leftover from test-process-util - confirm */ + assert_se(startswith(cmdline + skip, p)); + } + } else + log_info("cmdline = <%s> (not verified)", cmdline); +} + +static void test_rename_process_one(const char *p, int ret) { + siginfo_t si; + pid_t pid; + + log_info("/* %s(%s) */", __func__, p); + + pid = fork(); /* the rename and its checks run in a separate child process */ + assert_se(pid >= 0); + + if (pid == 0) { + /* child */ + test_rename_process_now(p, ret); + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate(pid, &si) >= 0); + assert_se(si.si_code == CLD_EXITED); /* the child must have exited on its own, with EXIT_SUCCESS */ + assert_se(si.si_status == EXIT_SUCCESS); +} + +TEST(rename_process_invalid) { + assert_se(rename_process(NULL) == -EINVAL); /* NULL and empty names are both rejected with -EINVAL */ + assert_se(rename_process("") == -EINVAL); +} + +TEST(rename_process_multi) { + pid_t pid; + + pid = fork(); + assert_se(pid >= 0); + + if (pid > 0) { + siginfo_t si; + + assert_se(wait_for_terminate(pid, &si) >= 0); + assert_se(si.si_code == CLD_EXITED); + assert_se(si.si_status == EXIT_SUCCESS); + + return; + } + + /* child */ + test_rename_process_now("one", 1); + test_rename_process_now("more", 0); /* longer than "one", hence truncated */ + (void) setresuid(99, 99, 99); /* change uid when running privileged */ + test_rename_process_now("time!", 0); + test_rename_process_now("0", 1); /* shorter than "one", should fit */ + _exit(EXIT_SUCCESS); +} + +TEST(rename_process) { + test_rename_process_one("foo", 1); /* should always fit */ + test_rename_process_one("this is a really really long process name, followed by some 
more words", 0); /* unlikely to fit */ + test_rename_process_one("1234567", 1); /* should always fit */ +} + +TEST(argv_help) { + assert_se(argv_looks_like_help(1, STRV_MAKE("program"))); /* a bare program name without arguments is expected to count as a help request */ + assert_se(argv_looks_like_help(2, STRV_MAKE("program", "help"))); + assert_se(argv_looks_like_help(3, STRV_MAKE("program", "arg1", "--help"))); + assert_se(argv_looks_like_help(4, STRV_MAKE("program", "arg1", "arg2", "-h"))); + assert_se(!argv_looks_like_help(2, STRV_MAKE("program", "arg1"))); + assert_se(!argv_looks_like_help(4, STRV_MAKE("program", "arg1", "arg2", "--h"))); /* "--h" is not accepted as a help switch */ +} + +static int intro(void) { + log_show_color(true); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-arphrd-util.c b/src/test/test-arphrd-util.c new file mode 100644 index 0000000..d8dd464 --- /dev/null +++ b/src/test/test-arphrd-util.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "arphrd-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(arphrd) { + for (int i = 0; i <= ARPHRD_VOID + 1; i++) { + const char *name; + + name = arphrd_to_name(i); + if (name) { /* values without a name are simply skipped */ + log_info("%i: %s", i, name); + + assert_se(arphrd_from_name(name) == i); /* every known name must map back to its value */ + } + } + + assert_se(arphrd_to_name(ARPHRD_VOID + 1) == NULL); + assert_se(arphrd_from_name("huddlduddl") == -EINVAL); /* unknown and empty names are rejected with -EINVAL */ + assert_se(arphrd_from_name("") == -EINVAL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-ask-password-api.c b/src/test/test-ask-password-api.c new file mode 100644 index 0000000..b24159e --- /dev/null +++ b/src/test/test-ask-password-api.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "ask-password-api.h" +#include "strv.h" +#include "tests.h" + +TEST(ask_password) { + int r; + _cleanup_strv_free_ char **ret = NULL; + + r = ask_password_tty(-1, "hello?", "da key", 0, ASK_PASSWORD_CONSOLE_COLOR, NULL, &ret); + if (r == -ECANCELED) + assert_se(ret == NULL); /* nothing may be returned when the query was cancelled */ + else { + assert_se(r >= 0); + 
assert_se(strv_length(ret) == 1); + log_info("Got \"%s\"", *ret); + } +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-async.c b/src/test/test-async.c new file mode 100644 index 0000000..75bc4d8 --- /dev/null +++ b/src/test/test-async.c @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "async.h" +#include "fs-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(asynchronous_sync) { + assert_se(asynchronous_sync(NULL) >= 0); +} + +TEST(asynchronous_close) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-asynchronous_close.XXXXXX"; + int fd, r; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + asynchronous_close(fd); + + sleep(1); /* wait before checking that the descriptor is really gone */ + + assert_se(fcntl(fd, F_GETFD) == -1); + assert_se(errno == EBADF); /* i.e. fd no longer refers to an open file */ + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, NULL); + assert_se(r >= 0); /* assert_se() like every other check in this file, so it is enforced in all builds */ + + if (r == 0) { + /* child */ + + assert_se(make_reaper_process(true) >= 0); /* call with side effects: must be checked in all builds */ + + fd = open("/dev/null", O_RDONLY|O_CLOEXEC); + assert_se(fd >= 0); + asynchronous_close(fd); + + sleep(1); /* same check as above, now from the (subreaper) child */ + + assert_se(fcntl(fd, F_GETFD) == -1); + assert_se(errno == EBADF); + + _exit(EXIT_SUCCESS); + } +} + +TEST(asynchronous_rm_rf) { + _cleanup_free_ char *t = NULL, *k = NULL; + int r; + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + assert_se(k = path_join(t, "somefile")); + assert_se(touch(k) >= 0); + assert_se(asynchronous_rm_rf(t, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + /* Do this once more, from a subreaper. 
Which is nice, because we can watch the async child even + * though detached */ + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL); + assert_se(r >= 0); + + if (r == 0) { + _cleanup_free_ char *tt = NULL, *kk = NULL; + + /* child */ + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); /* SIGCHLD is blocked; exited children are collected with waitid() below */ + assert_se(make_reaper_process(true) >= 0); + + assert_se(mkdtemp_malloc(NULL, &tt) >= 0); + assert_se(kk = path_join(tt, "somefile")); + assert_se(touch(kk) >= 0); + assert_se(asynchronous_rm_rf(tt, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + for (;;) { + siginfo_t si = {}; + + assert_se(waitid(P_ALL, 0, &si, WEXITED) >= 0); /* wait for any child to exit */ + + if (access(tt, F_OK) < 0) { + assert_se(errno == ENOENT); /* the directory is gone, so the removal has happened */ + break; + } + + /* wasn't the rm_rf() call. let's wait longer */ + } + + _exit(EXIT_SUCCESS); + } +} + + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-barrier.c b/src/test/test-barrier.c new file mode 100644 index 0000000..7e8bfc0 --- /dev/null +++ b/src/test/test-barrier.c @@ -0,0 +1,441 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* + * IPC barrier tests + * These tests verify the correct behavior of the IPC Barrier implementation. + * Note that the tests use alarm-timers to verify dead-locks and timeouts. These + * might not work on slow machines where 20ms are too short to perform specific + * operations (though, very unlikely). In case that turns out true, we have to + * increase it at the slight cost of lengthening the test duration on other machines. + */ + +#include +#include +#include +#include + +#include "barrier.h" +#include "errno-util.h" +#include "tests.h" +#include "time-util.h" +#include "virt.h" + +/* 20ms to test deadlocks; All timings use multiples of this constant as + * alarm/sleep timers. If this timeout is too small for slow machines to perform + * the requested operations, we have to increase it. 
On an i7 this works fine + * with 1ms base-time, so 20ms should be just fine for everyone. */ +#define BASE_TIME (20 * USEC_PER_MSEC) + +static void set_alarm(usec_t usecs) { + struct itimerval v = { }; /* it_interval stays zero, so the timer fires at most once */ + + timeval_store(&v.it_value, usecs); + assert_se(setitimer(ITIMER_REAL, &v, NULL) >= 0); /* a zero it_value (usecs == 0) disarms the timer, see setitimer(2) */ +} + +#define TEST_BARRIER(_FUNCTION, _CHILD_CODE, _WAIT_CHILD, _PARENT_CODE, _WAIT_PARENT) /* forks one process per role, then runs the two wait snippets in the test process */ \ + TEST(_FUNCTION) { \ + Barrier b = BARRIER_NULL; \ + pid_t pid1, pid2; \ + \ + assert_se(barrier_create(&b) >= 0); \ + assert_se(b.me > 0); \ + assert_se(b.them > 0); \ + assert_se(b.pipe[0] > 0); \ + assert_se(b.pipe[1] > 0); \ + \ + pid1 = fork(); \ + assert_se(pid1 >= 0); \ + if (pid1 == 0) { \ + barrier_set_role(&b, BARRIER_CHILD); \ + { _CHILD_CODE; } \ + exit(42); /* 42 is the status TEST_BARRIER_WAIT_SUCCESS() checks for */ \ + } \ + \ + pid2 = fork(); \ + assert_se(pid2 >= 0); \ + if (pid2 == 0) { \ + barrier_set_role(&b, BARRIER_PARENT); \ + { _PARENT_CODE; } \ + exit(42); \ + } \ + \ + barrier_destroy(&b); /* the test process only waits; both barrier roles live in the forked processes */ \ + set_alarm(999999); /* guard timer of just under one second around both waits */ \ + { _WAIT_CHILD; } \ + { _WAIT_PARENT; } \ + set_alarm(0); \ + } + +#define TEST_BARRIER_WAIT_SUCCESS(_pid) \ + ({ \ + int pidr, status; \ + pidr = waitpid(_pid, &status, 0); \ + assert_se(pidr == _pid); \ + assert_se(WIFEXITED(status)); \ + assert_se(WEXITSTATUS(status) == 42); /* the process reached the exit(42) in TEST_BARRIER() */ \ + }) + +#define TEST_BARRIER_WAIT_ALARM(_pid) \ + ({ \ + int pidr, status; \ + pidr = waitpid(_pid, &status, 0); \ + assert_se(pidr == _pid); \ + assert_se(WIFSIGNALED(status)); \ + assert_se(WTERMSIG(status) == SIGALRM); /* the process must have been killed by SIGALRM */ \ + }) + +/* + * Test basic sync points + * This places a barrier in both processes and waits synchronously for them. + * The timeout makes sure the sync works as expected. The usleep_safe() on one side + * makes sure the exit of the parent does not overwrite previous barriers. Due + * to the usleep_safe(), we know that the parent already exited, thus there's a + * pending HUP on the pipe. However, the barrier_sync() prefers reads on the + * eventfd, thus we can safely wait on the barrier. 
+ */ +TEST_BARRIER(barrier_sync, + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + usleep_safe(BASE_TIME * 2); /* by now the other side has placed its barrier and exited */ + assert_se(barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test wait_next() + * This places a barrier in the parent and syncs on it. The child sleeps while + * the parent places the barrier and then waits for a barrier. The wait will + * succeed as the child hasn't read the parent's barrier, yet. The following + * barrier and sync synchronize the exit. + */ +TEST_BARRIER(barrier_wait_next, + ({ + usleep_safe(BASE_TIME); + set_alarm(BASE_TIME * 10); + assert_se(barrier_wait_next(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME * 4); + assert_se(barrier_place(&b)); + assert_se(barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test wait_next() multiple times + * This places two barriers in the parent and waits for the child to exit. The + * child sleeps 20ms so both barriers _should_ be in place. It then waits for + * the parent to place the next barrier twice. The first call will fetch both + * barriers and return. However, the second call will stall as the parent does + * not place a 3rd barrier (the sleep caught two barriers). wait_next() does + * not look at barrier-links so this stall is expected. Thus this test times + * out. 
+ */ +TEST_BARRIER(barrier_wait_next_twice, + ({ + usleep_safe(BASE_TIME); + set_alarm(BASE_TIME); + assert_se(barrier_wait_next(&b)); + assert_se(barrier_wait_next(&b)); + assert_se(0); /* not reached: the second wait is expected to stall until SIGALRM ends the process */ + }), + TEST_BARRIER_WAIT_ALARM(pid1), + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + usleep_safe(BASE_TIME * 4); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test wait_next() with local barriers + * This is the same as test_barrier_wait_next_twice, but places local barriers + * between both waits. This does not have any effect on the wait so it times out + * like the other test. + */ +TEST_BARRIER(barrier_wait_next_twice_local, + ({ + usleep_safe(BASE_TIME); + set_alarm(BASE_TIME); + assert_se(barrier_wait_next(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_wait_next(&b)); + assert_se(0); /* not reached, same stall as in the previous test */ + }), + TEST_BARRIER_WAIT_ALARM(pid1), + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + usleep_safe(BASE_TIME * 4); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test wait_next() with sync_next() + * This is again the same as test_barrier_wait_next_twice but uses a + * synced wait as the second wait. This works just fine because the local state + * has no barriers placed, therefore, the remote is always in sync. + */ +TEST_BARRIER(barrier_wait_next_twice_sync, + ({ + usleep_safe(BASE_TIME); + set_alarm(BASE_TIME); + assert_se(barrier_wait_next(&b)); + assert_se(barrier_sync_next(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test wait_next() with sync_next() and local barriers + * This is again the same as test_barrier_wait_next_twice_local but uses a + * synced wait as the second wait. This works just fine because the local state + * is in sync with the remote. 
+ */ +TEST_BARRIER(barrier_wait_next_twice_local_sync, + ({ + usleep_safe(BASE_TIME); + set_alarm(BASE_TIME); + assert_se(barrier_wait_next(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_sync_next(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test sync_next() and sync() + * This tests sync_*() synchronizations and makes sure they work fine if the + * local state is behind the remote state. + */ +TEST_BARRIER(barrier_sync_next, + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_sync_next(&b)); + assert_se(barrier_sync(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_sync_next(&b)); + assert_se(barrier_sync_next(&b)); + assert_se(barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME * 10); + usleep_safe(BASE_TIME); + assert_se(barrier_place(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test sync_next() and sync() with local barriers + * This tests timeouts when sync_*() is used while local barriers are placed but the + * remote didn't place any. + */ +TEST_BARRIER(barrier_sync_next_local, + ({ + set_alarm(BASE_TIME); + assert_se(barrier_place(&b)); + assert_se(barrier_sync_next(&b)); + assert_se(0); /* not reached: the sync is expected to stall until SIGALRM ends the process */ + }), + TEST_BARRIER_WAIT_ALARM(pid1), + ({ + usleep_safe(BASE_TIME * 2); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test sync_next() and sync() with local barriers and abortion + * This is the same as test_barrier_sync_next_local but aborts the sync in the + * parent. Therefore, the sync_next() succeeds just fine due to the abortion. 
+ */ +TEST_BARRIER(barrier_sync_next_local_abort, + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(!barrier_sync_next(&b)); /* returns (with false) instead of stalling */ + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + assert_se(barrier_abort(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test matched wait_abortion() + * This runs wait_abortion() with remote abortion. + */ +TEST_BARRIER(barrier_wait_abortion, + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_wait_abortion(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + assert_se(barrier_abort(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test unmatched wait_abortion() + * This runs wait_abortion() without any remote abortion going on. It thus must + * time out. + */ +TEST_BARRIER(barrier_wait_abortion_unmatched, + ({ + set_alarm(BASE_TIME); + assert_se(barrier_wait_abortion(&b)); + assert_se(0); /* not reached: the wait is expected to stall until SIGALRM ends the process */ + }), + TEST_BARRIER_WAIT_ALARM(pid1), + ({ + usleep_safe(BASE_TIME * 2); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test matched wait_abortion() with local abortion + * This runs wait_abortion() with local and remote abortion. + */ +TEST_BARRIER(barrier_wait_abortion_local, + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_abort(&b)); + assert_se(!barrier_wait_abortion(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + assert_se(barrier_abort(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test unmatched wait_abortion() with local abortion + * This runs wait_abortion() with only local abortion. This must time out. + */ +TEST_BARRIER(barrier_wait_abortion_local_unmatched, + ({ + set_alarm(BASE_TIME); + assert_se(barrier_abort(&b)); + assert_se(!barrier_wait_abortion(&b)); + assert_se(0); + }), + TEST_BARRIER_WAIT_ALARM(pid1), + ({ + usleep_safe(BASE_TIME * 2); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test child exit + * Place barrier and sync with the child. The child only exit()s, which should + * cause an implicit abortion and wake the parent. 
+ */ +TEST_BARRIER(barrier_exit, + ({ /* intentionally empty: the child exits right away */ + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME * 10); + assert_se(barrier_place(&b)); + assert_se(!barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + +/* + * Test child exit with sleep + * Same as test_barrier_exit but verifies the test really works due to the + * child-exit. We add a usleep_safe() which triggers the alarm in the parent and + * causes the test to time out. + */ +TEST_BARRIER(barrier_no_exit, + ({ + usleep_safe(BASE_TIME * 2); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + set_alarm(BASE_TIME); + assert_se(barrier_place(&b)); + assert_se(!barrier_sync(&b)); + }), + TEST_BARRIER_WAIT_ALARM(pid2)); + +/* + * Test pending exit against sync + * The parent places a barrier *and* exits. The sleep in the child (BASE_TIME * 2) + * guarantees both are pending. However, our logic prefers pending barriers over + * pending exit-abortions (unlike normal abortions), thus the wait_next() must + * succeed, same for the sync_next() as our local barrier-count is smaller than + * the remote. Once we place a barrier our count is equal, so the sync still + * succeeds. Only if we place one more barrier, we're ahead of the remote, thus + * we will fail due to HUP on the pipe. + */ +TEST_BARRIER(barrier_pending_exit, + ({ + set_alarm(BASE_TIME * 4); + usleep_safe(BASE_TIME * 2); + assert_se(barrier_wait_next(&b)); + assert_se(barrier_sync_next(&b)); + assert_se(barrier_place(&b)); + assert_se(barrier_sync_next(&b)); + assert_se(barrier_place(&b)); + assert_se(!barrier_sync_next(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid1), + ({ + assert_se(barrier_place(&b)); + }), + TEST_BARRIER_WAIT_SUCCESS(pid2)); + + +static int intro(void) { + if (!slow_tests_enabled()) + return log_tests_skipped("slow tests are disabled"); + + /* + * This test uses real-time alarms and sleeps to test for CPU races explicitly. This is highly + * fragile if your system is under load. 
We already increased the BASE_TIME value to make the tests + * more robust, but that just makes the test take significantly longer. Given the recent issues when + * running the test in virtualized environments, limit it to bare metal machines only, to minimize + * false-positives in CIs. + */ + + Virtualization v = detect_virtualization(); + if (ERRNO_IS_NEG_PRIVILEGE(v)) + return log_tests_skipped("Cannot detect virtualization"); + + if (v != VIRTUALIZATION_NONE) /* anything but VIRTUALIZATION_NONE skips the whole test program */ + return log_tests_skipped("This test requires a baremetal machine"); + + return EXIT_SUCCESS; + } + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-bitfield.c b/src/test/test-bitfield.c new file mode 100644 index 0000000..f26b423 --- /dev/null +++ b/src/test/test-bitfield.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "bitfield.h" +#include "log.h" +#include "tests.h" + +#define TEST_BITS(bits, v, ...) \ + ({ \ + assert_se((!!BITS_SET(bits, ##__VA_ARGS__)) == v); \ + assert_se((!!BITS_SET(~(bits), ##__VA_ARGS__)) == !v); /* the inverted value must give the opposite answer */ \ + }) +#define TEST_BIT(bits, v, i) \ + ({ \ + assert_se((!!BIT_SET(bits, i)) == v); \ + assert_se((!!BIT_SET(~(bits), i)) == !v); \ + TEST_BITS(bits, v, i); /* the multi-index macro must agree for a single index */ \ + }) + +#define TEST_BIT_SET(bits, i) TEST_BIT(bits, 1, i) +#define TEST_BIT_CLEAR(bits, i) TEST_BIT(bits, 0, i) + +#define TEST_BITS_SET(bits, ...) TEST_BITS(bits, 1, ##__VA_ARGS__) +#define TEST_BITS_CLEAR(bits, ...) 
TEST_BITS(bits, 0, ##__VA_ARGS__) + +TEST(bits) { + int count; + + /* Test uint8_t */ + TEST_BIT_SET(0x81, 0); + TEST_BIT_SET(0x81, 7); + TEST_BITS_SET(0x81, 0, 7); + TEST_BIT_CLEAR(0x81, 4); + TEST_BIT_CLEAR(0x81, 6); + TEST_BITS_CLEAR(0x81, 1, 2, 3, 4, 5, 6); + uint8_t expected8 = 0; + BIT_FOREACH(i, 0x81) + expected8 |= UINT8_C(1) << i; + assert_se(expected8 == 0x81); /* OR-ing every index the iteration yields must rebuild the value */ + uint8_t u8 = 0x91; + TEST_BIT_SET(u8, 4); + TEST_BITS_SET(u8, 0, 4, 7); + TEST_BIT_CLEAR(u8, 2); + TEST_BITS_CLEAR(u8, 1, 2, 3, 5, 6); + SET_BIT(u8, 1); + TEST_BITS_SET(u8, 0, 1, 4, 7); + TEST_BITS_CLEAR(u8, 2, 3, 5, 6); + SET_BITS(u8, 3, 5); + TEST_BITS_SET(u8, 0, 1, 3, 4, 5, 7); + TEST_BITS_CLEAR(u8, 2, 6); + CLEAR_BIT(u8, 4); + TEST_BITS_SET(u8, 0, 1, 3, 5, 7); + TEST_BITS_CLEAR(u8, 2, 4, 6); + CLEAR_BITS(u8, 1); + CLEAR_BITS(u8, 0, 7); + TEST_BITS_SET(u8, 3, 5); + TEST_BITS_CLEAR(u8, 0, 1, 2, 4, 6, 7); + expected8 = 0; + BIT_FOREACH(i, u8) + expected8 |= UINT8_C(1) << i; + assert_se(expected8 == u8); + u8 = 0; + TEST_BITS_CLEAR(u8, 0, 1, 2, 3, 4, 5, 6, 7); + BIT_FOREACH(i, u8) + assert_se(0); /* never entered: no bit is set */ + u8 = ~u8; + TEST_BITS_SET(u8, 0, 1, 2, 3, 4, 5, 6, 7); + count = 0; + BIT_FOREACH(i, u8) + count++; + assert_se(count == 8); + uint8_t _u8 = u8; + SET_BITS(u8); + assert_se(_u8 == u8); /* SET_BITS()/CLEAR_BITS() without indices must leave the value unchanged */ + CLEAR_BITS(u8); + assert_se(_u8 == u8); + + /* Test uint16_t */ + TEST_BIT_SET(0x1f81, 10); + TEST_BITS_SET(0x1f81, 0, 7, 8, 9, 10, 11, 12); + TEST_BIT_CLEAR(0x1f81, 13); + TEST_BITS_CLEAR(0x1f81, 1, 2, 3, 4, 5, 6, 13, 14, 15); + uint16_t expected16 = 0; + BIT_FOREACH(i, 0x1f81) + expected16 |= UINT16_C(1) << i; + assert_se(expected16 == 0x1f81); + uint16_t u16 = 0xf060; + TEST_BIT_SET(u16, 12); + TEST_BITS_SET(u16, 5, 6, 12, 13, 14, 15); + TEST_BIT_CLEAR(u16, 9); + TEST_BITS_CLEAR(u16, 0, 1, 2, 3, 4, 7, 8, 9, 10, 11); + SET_BITS(u16, 1, 8); + TEST_BITS_SET(u16, 1, 5, 6, 8, 12, 13, 14, 15); + TEST_BITS_CLEAR(u16, 0, 2, 3, 4, 7, 9, 10, 11); + CLEAR_BITS(u16, 13, 14); + TEST_BITS_SET(u16, 1, 5, 6, 8, 12, 15); + 
TEST_BITS_CLEAR(u16, 0, 2, 3, 4, 7, 9, 10, 11, 13, 14); + expected16 = 0; + BIT_FOREACH(i, u16) + expected16 |= UINT16_C(1) << i; + assert_se(expected16 == u16); + u16 = 0; + TEST_BITS_CLEAR(u16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BIT_FOREACH(i, u16) + assert_se(0); + u16 = ~u16; + TEST_BITS_SET(u16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + count = 0; + BIT_FOREACH(i, u16) + count++; + assert_se(count == 16); /* with all bits set the iteration must visit every index once */ + uint16_t _u16 = u16; + SET_BITS(u16); + assert_se(_u16 == u16); + CLEAR_BITS(u16); + assert_se(_u16 == u16); + + /* Test uint32_t */ + TEST_BIT_SET(0x80224f10, 11); + TEST_BITS_SET(0x80224f10, 4, 8, 9, 10, 11, 14, 17, 21, 31); + TEST_BIT_CLEAR(0x80224f10, 28); + TEST_BITS_CLEAR(0x80224f10, 0, 1, 2, 3, 5, 6, 7, 12, 13, 15, 16, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30); + uint32_t expected32 = 0; + BIT_FOREACH(i, 0x80224f10) + expected32 |= UINT32_C(1) << i; + assert_se(expected32 == 0x80224f10); + uint32_t u32 = 0x605e0388; + TEST_BIT_SET(u32, 3); + TEST_BIT_SET(u32, 30); + TEST_BITS_SET(u32, 3, 7, 8, 9, 17, 18, 19, 20, 22, 29, 30); + TEST_BIT_CLEAR(u32, 0); + TEST_BIT_CLEAR(u32, 31); + TEST_BITS_CLEAR(u32, 0, 1, 2, 4, 5, 6, 10, 11, 12, 13, 14, 15, 16, 21, 23, 24, 25, 26, 27, 28, 31); + SET_BITS(u32, 1, 25, 26); + TEST_BITS_SET(u32, 1, 3, 7, 8, 9, 17, 18, 19, 20, 22, 25, 26, 29, 30); + TEST_BITS_CLEAR(u32, 0, 2, 4, 5, 6, 10, 11, 12, 13, 14, 15, 16, 21, 23, 24, 27, 28, 31); + CLEAR_BITS(u32, 29, 17, 1); /* indices are deliberately passed out of order here */ + TEST_BITS_SET(u32, 3, 7, 8, 9, 18, 19, 20, 22, 25, 26, 30); + TEST_BITS_CLEAR(u32, 0, 1, 2, 4, 5, 6, 10, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 27, 28, 29, 31); + expected32 = 0; + BIT_FOREACH(i, u32) + expected32 |= UINT32_C(1) << i; + assert_se(expected32 == u32); + u32 = 0; + TEST_BITS_CLEAR(u32, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + BIT_FOREACH(i, u32) + assert_se(0); + u32 = ~u32; + TEST_BITS_SET(u32, 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + count = 0; + BIT_FOREACH(i, u32) + count++; + assert_se(count == 32); + uint32_t _u32 = u32; + SET_BITS(u32); + assert_se(_u32 == u32); + CLEAR_BITS(u32); + assert_se(_u32 == u32); + + /* Test uint64_t */ + TEST_BIT_SET(0x18ba1400f4857460, 60); + TEST_BITS_SET(0x18ba1400f4857460, 5, 6, 10, 12, 13, 14, 16, 18, 23, 26, 28, 29, 30, 31, 42, 44, 49, 51, 52, 53, 55, 59, 60); + TEST_BIT_CLEAR(UINT64_C(0x18ba1400f4857460), 0); + TEST_BIT_CLEAR(UINT64_C(0x18ba1400f4857460), 63); + TEST_BITS_CLEAR(UINT64_C(0x18ba1400f4857460), 0, 1, 2, 3, 4, 7, 8, 9, 11, 15, 17, 19, 20, 21, 22, 24, 25, 27, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 45, 46, 47, 48, 50, 54, 56, 57, 58, 61, 62, 63); + uint64_t expected64 = 0; + BIT_FOREACH(i, 0x18ba1400f4857460) + expected64 |= UINT64_C(1) << i; + assert_se(expected64 == 0x18ba1400f4857460); + uint64_t u64 = 0xa90e2d8507a65739; + TEST_BIT_SET(u64, 0); + TEST_BIT_SET(u64, 63); + TEST_BITS_SET(u64, 0, 3, 4, 5, 8, 9, 10, 12, 14, 17, 18, 21, 23, 24, 25, 26, 32, 34, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61, 63); + TEST_BIT_CLEAR(u64, 1); + TEST_BITS_CLEAR(u64, 1, 2, 6, 7, 11, 13, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 33, 35, 36, 37, 38, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60, 62); + SET_BIT(u64, 1); + TEST_BITS_SET(u64, 0, 1, 3, 4, 5, 8, 9, 10, 12, 14, 17, 18, 21, 23, 24, 25, 26, 32, 34, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61, 63); + TEST_BITS_CLEAR(u64, 2, 6, 7, 11, 13, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 33, 35, 36, 37, 38, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60, 62); + CLEAR_BIT(u64, 63); + TEST_BITS_SET(u64, 0, 1, 3, 4, 5, 8, 9, 10, 12, 14, 17, 18, 21, 23, 24, 25, 26, 32, 34, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61); + TEST_BITS_CLEAR(u64, 2, 6, 7, 11, 13, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 33, 35, 36, 37, 38, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60, 62, 63); + SET_BIT(u64, 62); + TEST_BITS_SET(u64, 0, 1, 3, 
4, 5, 8, 9, 10, 12, 14, 17, 18, 21, 23, 24, 25, 26, 32, 34, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61, 62); + TEST_BITS_CLEAR(u64, 2, 6, 7, 11, 13, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 33, 35, 36, 37, 38, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60, 63); + SET_BITS(u64, 63, 62, 7, 13, 38, 40); /* 62 and 40 are already set at this point */ + TEST_BITS_SET(u64, 0, 1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 17, 18, 21, 23, 24, 25, 26, 32, 34, 38, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61, 62, 63); + TEST_BITS_CLEAR(u64, 2, 6, 11, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 33, 35, 36, 37, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60); + CLEAR_BIT(u64, 32); + TEST_BITS_SET(u64, 0, 1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 17, 18, 21, 23, 24, 25, 26, 34, 38, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61, 62, 63); + TEST_BITS_CLEAR(u64, 2, 6, 11, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60); + CLEAR_BITS(u64, 0, 2, 11, 63, 32, 58); /* 2, 11, 32 and 58 are already clear at this point */ + TEST_BITS_SET(u64, 1, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 17, 18, 21, 23, 24, 25, 26, 34, 38, 39, 40, 42, 43, 45, 49, 50, 51, 56, 59, 61, 62); + TEST_BITS_CLEAR(u64, 0, 2, 6, 11, 15, 16, 19, 20, 22, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 41, 44, 46, 47, 48, 52, 53, 54, 55, 57, 58, 60, 63); + expected64 = 0; + BIT_FOREACH(i, u64) + expected64 |= UINT64_C(1) << i; + assert_se(expected64 == u64); + u64 = 0; + TEST_BITS_CLEAR(u64, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + BIT_FOREACH(i, u64) + assert_se(0); + u64 = ~u64; + TEST_BITS_SET(u64, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + count = 0; + BIT_FOREACH(i, u64) + count++; + 
assert_se(count == 64); + uint64_t _u64 = u64; + SET_BITS(u64); + assert_se(_u64 == u64); + CLEAR_BITS(u64); + assert_se(_u64 == u64); + + /* Verify these use cases are constant-folded. */ +#if !defined(__clang__) || (__clang_major__ >= 13) + /* Clang 11 and 12 (and possibly older) do not grok those; skip them. */ + assert_cc(__builtin_constant_p(INDEX_TO_MASK(uint8_t, 1))); + assert_cc(__builtin_constant_p(INDEX_TO_MASK(uint16_t, 1))); + assert_cc(__builtin_constant_p(INDEX_TO_MASK(uint32_t, 1))); + assert_cc(__builtin_constant_p(INDEX_TO_MASK(uint64_t, 1))); + + assert_cc(__builtin_constant_p(BIT_SET((uint8_t)2, 1))); + assert_cc(__builtin_constant_p(BIT_SET((uint16_t)2, 1))); + assert_cc(__builtin_constant_p(BIT_SET((uint32_t)2, 1))); + assert_cc(__builtin_constant_p(BIT_SET((uint64_t)2, 1))); +#endif +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-bitmap.c b/src/test/test-bitmap.c new file mode 100644 index 0000000..8acf833 --- /dev/null +++ b/src/test/test-bitmap.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bitmap.h" +#include "tests.h" + +int main(int argc, const char *argv[]) { + _cleanup_bitmap_free_ Bitmap *b = NULL, *b2 = NULL; + unsigned n = UINT_MAX, i = 0; + + test_setup_logging(LOG_DEBUG); + + b = bitmap_new(); + assert_se(b); + + assert_se(bitmap_ensure_allocated(&b) == 0); + b = bitmap_free(b); + assert_se(bitmap_ensure_allocated(&b) == 0); + + assert_se(bitmap_isset(b, 0) == false); /* a fresh bitmap has no bit set */ + assert_se(bitmap_isset(b, 1) == false); + assert_se(bitmap_isset(b, 256) == false); + assert_se(bitmap_isclear(b) == true); + + assert_se(bitmap_set(b, 0) == 0); /* set, verify, unset, verify - repeated below for bits 1 and 256 */ + assert_se(bitmap_isset(b, 0) == true); + assert_se(bitmap_isclear(b) == false); + bitmap_unset(b, 0); + assert_se(bitmap_isset(b, 0) == false); + assert_se(bitmap_isclear(b) == true); + + assert_se(bitmap_set(b, 1) == 0); + assert_se(bitmap_isset(b, 1) == true); + assert_se(bitmap_isclear(b) == false); + bitmap_unset(b, 1); + assert_se(bitmap_isset(b, 1) == 
false); + assert_se(bitmap_isclear(b) == true); + + assert_se(bitmap_set(b, 256) == 0); + assert_se(bitmap_isset(b, 256) == true); + assert_se(bitmap_isclear(b) == false); + bitmap_unset(b, 256); + assert_se(bitmap_isset(b, 256) == false); + assert_se(bitmap_isclear(b) == true); + + assert_se(bitmap_set(b, 32) == 0); + bitmap_unset(b, 0); /* unsetting a bit that is not set must not disturb bit 32 */ + assert_se(bitmap_isset(b, 32) == true); + bitmap_unset(b, 32); + + BITMAP_FOREACH(n, NULL) + assert_not_reached(); /* iterating a NULL bitmap must not yield any bit */ + + assert_se(bitmap_set(b, 0) == 0); + assert_se(bitmap_set(b, 1) == 0); + assert_se(bitmap_set(b, 256) == 0); + + BITMAP_FOREACH(n, b) { + assert_se(n == i); /* expect exactly 0, 1, 256, in this order */ + if (i == 0) + i = 1; + else if (i == 1) + i = 256; + else if (i == 256) + i = UINT_MAX; + } + + assert_se(i == UINT_MAX); /* all three bits were visited */ + + i = 0; /* iterate a second time: the result must be the same */ + + BITMAP_FOREACH(n, b) { + assert_se(n == i); + if (i == 0) + i = 1; + else if (i == 1) + i = 256; + else if (i == 256) + i = UINT_MAX; + } + + assert_se(i == UINT_MAX); + + b2 = bitmap_copy(b); + assert_se(b2); + assert_se(bitmap_equal(b, b2) == true); + assert_se(bitmap_equal(b, b) == true); + assert_se(bitmap_equal(b, NULL) == false); + assert_se(bitmap_equal(NULL, b) == false); + assert_se(bitmap_equal(NULL, NULL) == true); + + bitmap_clear(b); + assert_se(bitmap_isclear(b) == true); + assert_se(bitmap_equal(b, b2) == false); /* the copy still holds its bits */ + b2 = bitmap_free(b2); + + assert_se(bitmap_set(b, UINT_MAX) == -ERANGE); /* UINT_MAX is rejected as out of range */ + + b = bitmap_free(b); + assert_se(bitmap_ensure_allocated(&b) == 0); + assert_se(bitmap_ensure_allocated(&b2) == 0); + + assert_se(bitmap_equal(b, b2)); + assert_se(bitmap_set(b, 0) == 0); + bitmap_unset(b, 0); + assert_se(bitmap_equal(b, b2)); /* set followed by unset compares equal to a fresh bitmap */ + + assert_se(bitmap_set(b, 1) == 0); + bitmap_clear(b); + assert_se(bitmap_equal(b, b2)); + + assert_se(bitmap_set(b, 0) == 0); + assert_se(bitmap_set(b2, 0) == 0); + assert_se(bitmap_equal(b, b2)); + + return 0; +} diff --git a/src/test/test-blockdev-util.c b/src/test/test-blockdev-util.c new file mode 100644 index 0000000..134386c --- /dev/null +++ b/src/test/test-blockdev-util.c 
@@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "blockdev-util.h" +#include "errno-util.h" +#include "tests.h" +/* Runs path_is_encrypted() on p and checks the result against expect: a negative expect accepts any non-error result, otherwise "r > 0" has to agree with "expect > 0". Returns early, without asserting anything, on -ENOENT, -ELOOP or a privilege error. */ +static void test_path_is_encrypted_one(const char *p, int expect) { + int r; + + r = path_is_encrypted(p); + if (IN_SET(r, -ENOENT, -ELOOP) || ERRNO_IS_NEG_PRIVILEGE(r)) + /* This might fail, if btrfs is used and we run in a container. In that case we cannot + * resolve the device node paths that BTRFS_IOC_DEV_INFO returns, because the device nodes + * are unlikely to exist in the container. But if we can't stat() them we cannot determine + * the dev_t of them, and thus cannot figure out if they are encrypted. Hence let's just + * ignore ENOENT here. Also skip the test if we lack privileges. + * ELOOP might happen if the mount point is a symlink, as seen under + * some rpm-ostree distros. */ + return; + assert_se(r >= 0); + + log_info("%s encrypted: %s", p, yes_no(r)); + + assert_se(expect < 0 || ((r > 0) == (expect > 0))); +} + +TEST(path_is_encrypted) { + int booted = sd_booted(); /* If this is run in build environments such as koji, /dev/ might be a + * regular fs. Don't assume too much if not running under systemd. */ + + log_info("/* %s (sd_booted=%d) */", __func__, booted); + + test_path_is_encrypted_one("/home", -1); + test_path_is_encrypted_one("/var", -1); + test_path_is_encrypted_one("/", -1); + test_path_is_encrypted_one("/proc", false); + test_path_is_encrypted_one("/sys", false); + test_path_is_encrypted_one("/dev", booted > 0 ?
false : -1); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-boot-timestamps.c b/src/test/test-boot-timestamps.c new file mode 100644 index 0000000..c3e4876 --- /dev/null +++ b/src/test/test-boot-timestamps.c @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "acpi-fpdt.h" +#include "boot-timestamps.h" +#include "efi-loader.h" +#include "errno-util.h" +#include "log.h" +#include "tests.h" +/* Reads the loader start/exit times via acpi_get_boot_usec() and logs them. Returns 1 when the values were read, 0 when the read failed with -ENOENT, -ENODATA, -ERANGE or a privilege error, and the negative error code for any other failure. */ +static int test_acpi_fpdt(void) { + usec_t loader_start, loader_exit; + int r; + + r = acpi_get_boot_usec(&loader_start, &loader_exit); + if (r < 0) { + bool ok = IN_SET(r, -ENOENT, -ENODATA, -ERANGE) || ERRNO_IS_PRIVILEGE(r); + /* Failures from the tolerated set are logged at debug level, all others at error level. */ + log_full_errno(ok ? LOG_DEBUG : LOG_ERR, r, "Failed to read ACPI FPDT: %m"); + return ok ? 0 : r; + } + + log_info("ACPI FPDT: loader start=%s exit=%s duration=%s", + FORMAT_TIMESPAN(loader_start, USEC_PER_MSEC), + FORMAT_TIMESPAN(loader_exit, USEC_PER_MSEC), + FORMAT_TIMESPAN(loader_exit - loader_start, USEC_PER_MSEC)); + return 1; +} +/* Same contract as test_acpi_fpdt(), but reads the times via efi_loader_get_boot_usec(); the tolerated failures here are -ENOENT, -EOPNOTSUPP and privilege errors. */ +static int test_efi_loader(void) { + usec_t loader_start, loader_exit; + int r; + + r = efi_loader_get_boot_usec(&loader_start, &loader_exit); + if (r < 0) { + bool ok = IN_SET(r, -ENOENT, -EOPNOTSUPP) || ERRNO_IS_PRIVILEGE(r); + /* Failures from the tolerated set are logged at debug level, all others at error level. */ + log_full_errno(ok ? LOG_DEBUG : LOG_ERR, r, "Failed to read EFI loader data: %m"); + return ok ? 0 : r; + } + + log_info("EFI Loader: start=%s exit=%s duration=%s", + FORMAT_TIMESPAN(loader_start, USEC_PER_MSEC), + FORMAT_TIMESPAN(loader_exit, USEC_PER_MSEC), + FORMAT_TIMESPAN(loader_exit - loader_start, USEC_PER_MSEC)); + return 1; +} + +static int test_boot_timestamps(void) { + dual_timestamp fw, l, k; + int r; + + dual_timestamp_from_monotonic(&k, 0); + + r = boot_timestamps(NULL, &fw, &l); + if (r < 0) { + bool ok = IN_SET(r, -ENOENT, -EOPNOTSUPP) || ERRNO_IS_PRIVILEGE(r); + + log_full_errno(ok ? LOG_DEBUG : LOG_ERR, r, "Failed to read variables: %m"); + return ok ?
0 : r; + } + + log_info("Firmware began %s before kernel.", FORMAT_TIMESPAN(fw.monotonic, 0)); + log_info("Loader began %s before kernel.", FORMAT_TIMESPAN(l.monotonic, 0)); + log_info("Firmware began %s.", FORMAT_TIMESTAMP(fw.realtime)); + log_info("Loader began %s.", FORMAT_TIMESTAMP(l.realtime)); + log_info("Kernel began %s.", FORMAT_TIMESTAMP(k.realtime)); + return 1; +} + +int main(int argc, char* argv[]) { + int p, q, r; + + test_setup_logging(LOG_DEBUG); + + p = test_acpi_fpdt(); + assert_se(p >= 0); + q = test_efi_loader(); + assert_se(q >= 0); + r = test_boot_timestamps(); + assert_se(r >= 0); + + if (p == 0 && q == 0 && r == 0) + return log_tests_skipped("access to firmware variables not possible"); + + return EXIT_SUCCESS; +} diff --git a/src/test/test-bootspec.c b/src/test/test-bootspec.c new file mode 100644 index 0000000..18611fc --- /dev/null +++ b/src/test/test-bootspec.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bootspec.h" +#include "fileio.h" +#include "path-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST_RET(bootspec_sort) { + + static const struct { + const char *fname; + const char *contents; + } entries[] = { + { + .fname = "a-10.conf", + .contents = + "title A\n" + "version 10\n" + "machine-id dd235d00696545768f6f693bfd23b15f\n", + }, + { + .fname = "a-5.conf", + .contents = + "title A\n" + "version 5\n" + "machine-id dd235d00696545768f6f693bfd23b15f\n", + }, + { + .fname = "b.conf", + .contents = + "title B\n" + "version 3\n" + "machine-id b75451ad92f94feeab50b0b442768dbd\n", + }, + { + .fname = "c.conf", + .contents = + "title C\n" + "sort-key xxxx\n" + "version 5\n" + "machine-id 309de666fd5044268a9a26541ac93176\n", + }, + { + .fname = "cx.conf", + .contents = + "title C\n" + "sort-key xxxx\n" + "version 10\n" + "machine-id 309de666fd5044268a9a26541ac93176\n", + }, + { + .fname = "d.conf", + .contents = + "title D\n" + "sort-key kkkk\n" + "version 100\n" + 
"machine-id 81c6e3147cf544c19006af023e22b292\n", + }, + }; + + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL; + + assert_se(mkdtemp_malloc("/tmp/bootspec-testXXXXXX", &d) >= 0); + + for (size_t i = 0; i < ELEMENTSOF(entries); i++) { + _cleanup_free_ char *j = NULL; + + j = path_join(d, "/loader/entries/", entries[i].fname); + assert_se(j); + + assert_se(write_string_file(j, entries[i].contents, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + } + + assert_se(boot_config_load(&config, d, NULL) >= 0); + + assert_se(config.n_entries == 6); + + /* First, because has sort key, and its the lowest one */ + assert_se(streq(config.entries[0].id, "d.conf")); + + /* These two have a sort key, and newest must be first */ + assert_se(streq(config.entries[1].id, "cx.conf")); + assert_se(streq(config.entries[2].id, "c.conf")); + + /* The following ones have no sort key, hence order by version compared ids, lowest first */ + assert_se(streq(config.entries[3].id, "b.conf")); + assert_se(streq(config.entries[4].id, "a-10.conf")); + assert_se(streq(config.entries[5].id, "a-5.conf")); + + return 0; +} + +static void test_extract_tries_one(const char *fname, int ret, const char *stripped, unsigned tries_left, unsigned tries_done) { + _cleanup_free_ char *p = NULL; + unsigned l, d; + + assert_se(boot_filename_extract_tries(fname, &p, &l, &d) == ret); + if (ret < 0) + return; + + assert_se(streq_ptr(p, stripped)); + assert_se(l == tries_left); + assert_se(d == tries_done); +} + +TEST_RET(bootspec_extract_tries) { + test_extract_tries_one("foo.conf", 0, "foo.conf", UINT_MAX, UINT_MAX); + + test_extract_tries_one("foo+0.conf", 0, "foo.conf", 0, UINT_MAX); + test_extract_tries_one("foo+1.conf", 0, "foo.conf", 1, UINT_MAX); + test_extract_tries_one("foo+2.conf", 0, "foo.conf", 2, UINT_MAX); + test_extract_tries_one("foo+33.conf", 0, "foo.conf", 33, UINT_MAX); + + assert_cc(INT_MAX == INT32_MAX); + 
test_extract_tries_one("foo+2147483647.conf", 0, "foo.conf", 2147483647, UINT_MAX); + test_extract_tries_one("foo+2147483648.conf", -ERANGE, NULL, UINT_MAX, UINT_MAX); + + test_extract_tries_one("foo+33-0.conf", 0, "foo.conf", 33, 0); + test_extract_tries_one("foo+33-1.conf", 0, "foo.conf", 33, 1); + test_extract_tries_one("foo+33-107.conf", 0, "foo.conf", 33, 107); + test_extract_tries_one("foo+33-107.efi", 0, "foo.efi", 33, 107); + test_extract_tries_one("foo+33-2147483647.conf", 0, "foo.conf", 33, 2147483647); + test_extract_tries_one("foo+33-2147483648.conf", -ERANGE, NULL, UINT_MAX, UINT_MAX); + + test_extract_tries_one("foo+007-000008.conf", 0, "foo.conf", 7, 8); + + test_extract_tries_one("foo-1.conf", 0, "foo-1.conf", UINT_MAX, UINT_MAX); + test_extract_tries_one("foo-999.conf", 0, "foo-999.conf", UINT_MAX, UINT_MAX); + test_extract_tries_one("foo-.conf", 0, "foo-.conf", UINT_MAX, UINT_MAX); + + test_extract_tries_one("foo+.conf", 0, "foo+.conf", UINT_MAX, UINT_MAX); + test_extract_tries_one("+.conf", 0, "+.conf", UINT_MAX, UINT_MAX); + test_extract_tries_one("-.conf", 0, "-.conf", UINT_MAX, UINT_MAX); + test_extract_tries_one("", 0, "", UINT_MAX, UINT_MAX); + + test_extract_tries_one("+1.", 0, ".", 1, UINT_MAX); + test_extract_tries_one("+1-7.", 0, ".", 1, 7); + + test_extract_tries_one("some+name+24324-22.efi", 0, "some+name.efi", 24324, 22); + test_extract_tries_one("sels+2-3+7-6.", 0, "sels+2-3.", 7, 6); + test_extract_tries_one("a+1-2..", 0, "a+1-2..", UINT_MAX, UINT_MAX); + test_extract_tries_one("ses.sgesge.+4-1.efi", 0, "ses.sgesge..efi", 4, 1); + test_extract_tries_one("abc+0x4.conf", 0, "abc+0x4.conf", UINT_MAX, UINT_MAX); + test_extract_tries_one("def+1-0x3.conf", 0, "def+1-0x3.conf", UINT_MAX, UINT_MAX); + + return 0; +} + +TEST_RET(bootspec_boot_config_find_entry) { + + static const struct { + const char *fname; + const char *contents; + } entries[] = { + { + .fname = "a-10.conf", + .contents = + "title A\n" + "version 10\n" + "machine-id 
dd235d00696545768f6f693bfd23b15f\n", + }, + { + .fname = "a-05.conf", + .contents = + "title A\n" + "version 10\n" + "machine-id dd235d00696545768f6f693bfd23b15f\n", + }, + }; + + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL; + + assert_se(mkdtemp_malloc("/tmp/bootspec-testXXXXXX", &d) >= 0); + + for (size_t i = 0; i < ELEMENTSOF(entries); i++) { + _cleanup_free_ char *j = NULL; + + j = path_join(d, "/loader/entries/", entries[i].fname); + assert_se(j); + + assert_se(write_string_file(j, entries[i].contents, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + } + + assert_se(boot_config_load(&config, d, NULL) >= 0); + assert_se(config.n_entries == 2); + + // Test finding the first entry + BootEntry *entry = boot_config_find_entry(&config, "a-10.conf"); + assert_se(entry && streq(entry->id, "a-10.conf")); + + // Test finding the second entry + entry = boot_config_find_entry(&config, "a-05.conf"); + assert_se(entry && streq(entry->id, "a-05.conf")); + + // Test finding a non-existent entry + entry = boot_config_find_entry(&config, "nonexistent.conf"); + assert_se(entry == NULL); + + // Test case-insensitivity + entry = boot_config_find_entry(&config, "A-10.CONF"); + assert_se(entry && streq(entry->id, "a-10.conf")); + + + return 0; +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-bpf-devices.c b/src/test/test-bpf-devices.c new file mode 100644 index 0000000..4bd606e --- /dev/null +++ b/src/test/test-bpf-devices.c @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-devices.h" +#include "bpf-program.h" +#include "cgroup-setup.h" +#include "errno-list.h" +#include "fd-util.h" +#include "fs-util.h" +#include "path-util.h" +#include "tests.h" + +static void test_policy_closed(const char *cgroup_path, BPFProgram **installed_prog) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + 
unsigned wrong = 0; + int r; + + log_info("/* %s */", __func__); + + r = bpf_devices_cgroup_init(&prog, CGROUP_DEVICE_POLICY_CLOSED, true); + assert_se(r >= 0); + + r = bpf_devices_allow_list_static(prog, cgroup_path); + assert_se(r >= 0); + + r = bpf_devices_apply_policy(&prog, CGROUP_DEVICE_POLICY_CLOSED, true, cgroup_path, installed_prog); + assert_se(r >= 0); + + FOREACH_STRING(s, "/dev/null", + "/dev/zero", + "/dev/full", + "/dev/random", + "/dev/urandom", + "/dev/tty", + "/dev/ptmx") { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd < 0 && errno == EPERM; + /* We ignore errors other than EPERM, e.g. ENOENT or ENXIO */ + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? errno_to_name(errno) : "-"); + wrong += fd2 < 0 && errno == EPERM; + } + assert_se(wrong == 0); +} + +static void test_policy_strict(const char *cgroup_path, BPFProgram **installed_prog) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + unsigned wrong = 0; + int r; + + log_info("/* %s */", __func__); + + r = bpf_devices_cgroup_init(&prog, CGROUP_DEVICE_POLICY_STRICT, true); + assert_se(r >= 0); + + r = bpf_devices_allow_list_device(prog, cgroup_path, "/dev/null", CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + assert_se(r >= 0); + + r = bpf_devices_allow_list_device(prog, cgroup_path, "/dev/random", CGROUP_DEVICE_READ); + assert_se(r >= 0); + + r = bpf_devices_allow_list_device(prog, cgroup_path, "/dev/zero", CGROUP_DEVICE_WRITE); + assert_se(r >= 0); + + r = bpf_devices_apply_policy(&prog, CGROUP_DEVICE_POLICY_STRICT, true, cgroup_path, installed_prog); + assert_se(r >= 0); + + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/null"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? 
errno_to_name(errno) : "-"); + wrong += fd < 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? errno_to_name(errno) : "-"); + wrong += fd2 < 0; + } + + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/random"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd < 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? errno_to_name(errno) : "-"); + wrong += fd2 >= 0; + } + + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/zero"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd >= 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? errno_to_name(errno) : "-"); + wrong += fd2 < 0; + } + + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/full"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd >= 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? 
errno_to_name(errno) : "-"); + wrong += fd2 >= 0; + } + + assert_se(wrong == 0); +} + +static void test_policy_allow_list_major(const char *pattern, const char *cgroup_path, BPFProgram **installed_prog) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + unsigned wrong = 0; + int r; + + log_info("/* %s(%s) */", __func__, pattern); + + r = bpf_devices_cgroup_init(&prog, CGROUP_DEVICE_POLICY_STRICT, true); + assert_se(r >= 0); + + r = bpf_devices_allow_list_major(prog, cgroup_path, pattern, 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + assert_se(r >= 0); + + r = bpf_devices_apply_policy(&prog, CGROUP_DEVICE_POLICY_STRICT, true, cgroup_path, installed_prog); + assert_se(r >= 0); + + /* /dev/null, /dev/full have major==1, /dev/tty has major==5 */ + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/null"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd < 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? errno_to_name(errno) : "-"); + wrong += fd2 < 0; + } + + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/full"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd < 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? errno_to_name(errno) : "-"); + wrong += fd2 < 0; + } + + { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *s = "/dev/tty"; + + fd = open(s, O_CLOEXEC|O_RDONLY|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + wrong += fd >= 0; + + fd2 = open(s, O_CLOEXEC|O_WRONLY|O_NOCTTY); + log_debug("open(%s, \"w\") = %d/%s", s, fd2, fd2 < 0 ? 
errno_to_name(errno) : "-"); + wrong += fd2 >= 0; + } + + assert_se(wrong == 0); +} +/* Builds a strict device policy that allow-lists the major pattern "*" of the given device type for read and write, applies it to cgroup_path, and then checks that a read-write open() of /dev/null succeeds when type is 'c' and fails for any other type. */ +static void test_policy_allow_list_major_star(char type, const char *cgroup_path, BPFProgram **installed_prog) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + unsigned wrong = 0; + int r; + + log_info("/* %s(type=%c) */", __func__, type); + + r = bpf_devices_cgroup_init(&prog, CGROUP_DEVICE_POLICY_STRICT, true); + assert_se(r >= 0); + + r = bpf_devices_allow_list_major(prog, cgroup_path, "*", type, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + assert_se(r >= 0); + + r = bpf_devices_apply_policy(&prog, CGROUP_DEVICE_POLICY_STRICT, true, cgroup_path, installed_prog); + assert_se(r >= 0); + + { + _cleanup_close_ int fd = -EBADF; + const char *s = "/dev/null"; + /* "wrong" counts every open() outcome that contradicts the installed policy. */ + fd = open(s, O_CLOEXEC|O_RDWR|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ? errno_to_name(errno) : "-"); + if (type == 'c') + wrong += fd < 0; + else + wrong += fd >= 0; + } + + assert_se(wrong == 0); +} + +static void test_policy_empty(bool add_mismatched, const char *cgroup_path, BPFProgram **installed_prog) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + unsigned wrong = 0; + int r; + + log_info("/* %s(add_mismatched=%s) */", __func__, yes_no(add_mismatched)); + + r = bpf_devices_cgroup_init(&prog, CGROUP_DEVICE_POLICY_STRICT, add_mismatched); + assert_se(r >= 0); + + if (add_mismatched) { + r = bpf_devices_allow_list_major(prog, cgroup_path, "foobarxxx", 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); + assert_se(r < 0); + } + + r = bpf_devices_apply_policy(&prog, CGROUP_DEVICE_POLICY_STRICT, false, cgroup_path, installed_prog); + assert_se(r >= 0); + + { + _cleanup_close_ int fd = -EBADF; + const char *s = "/dev/null"; + + fd = open(s, O_CLOEXEC|O_RDWR|O_NOCTTY); + log_debug("open(%s, \"r\") = %d/%s", s, fd, fd < 0 ?
errno_to_name(errno) : "-"); + wrong += fd >= 0; + } + + assert_se(wrong == 0); +} + + +int main(int argc, char *argv[]) { + _cleanup_free_ char *cgroup = NULL, *parent = NULL; + _cleanup_(rmdir_and_freep) char *controller_path = NULL; + CGroupMask supported; + struct rlimit rl; + int r; + + test_setup_logging(LOG_DEBUG); + + assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0); + rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE); + (void) setrlimit(RLIMIT_MEMLOCK, &rl); + + r = cg_all_unified(); + if (r <= 0) + return log_tests_skipped("We don't seem to be running with unified cgroup hierarchy"); + + if (!can_memlock()) + return log_tests_skipped("Can't use mlock()"); + + r = enter_cgroup_subroot(&cgroup); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + if (r < 0) + return log_tests_skipped_errno(r, "Failed to prepare cgroup subtree"); + + r = bpf_devices_supported(); + if (r == 0) + return log_tests_skipped("BPF device filter not supported"); + assert_se(r == 1); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &controller_path); + assert_se(r >= 0); + + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + + test_policy_closed(cgroup, &prog); + test_policy_strict(cgroup, &prog); + + test_policy_allow_list_major("mem", cgroup, &prog); + test_policy_allow_list_major("1", cgroup, &prog); + + test_policy_allow_list_major_star('c', cgroup, &prog); + test_policy_allow_list_major_star('b', cgroup, &prog); + + test_policy_empty(false, cgroup, &prog); + test_policy_empty(true, cgroup, &prog); + + assert_se(path_extract_directory(cgroup, &parent) >= 0); + + assert_se(cg_mask_supported(&supported) >= 0); + r = cg_attach_everywhere(supported, parent, 0, NULL, NULL); + assert_se(r >= 0); + + return 0; +} diff --git a/src/test/test-bpf-firewall.c b/src/test/test-bpf-firewall.c new file mode 100644 index 0000000..c4175bc --- /dev/null +++ b/src/test/test-bpf-firewall.c @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include +#include +#include + +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "in-addr-prefix-util.h" +#include "load-fragment.h" +#include "manager.h" +#include "memory-util.h" +#include "rm-rf.h" +#include "service.h" +#include "tests.h" +#include "unit-serialize.h" +#include "virt.h" + +int main(int argc, char *argv[]) { + const struct bpf_insn exit_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), /* drop */ + BPF_EXIT_INSN() + }; + + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + CGroupContext *cc = NULL; + _cleanup_(bpf_program_freep) BPFProgram *p = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + Unit *u; + char log_buf[65535]; + struct rlimit rl; + int r; + union bpf_attr attr; + bool test_custom_filter = false; + const char *test_prog = "/sys/fs/bpf/test-dropper"; + + test_setup_logging(LOG_DEBUG); + + if (detect_container() > 0) + return log_tests_skipped("test-bpf-firewall fails inside LXC and Docker containers: https://github.com/systemd/systemd/issues/9666"); + + assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0); + rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE); + (void) setrlimit(RLIMIT_MEMLOCK, &rl); + + if (!can_memlock()) + return log_tests_skipped("Can't use mlock()"); + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + _cleanup_free_ char *unit_dir = NULL; + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, "sd_trivial", &p); + assert_se(r == 0); + + r = bpf_program_add_instructions(p, exit_insn, ELEMENTSOF(exit_insn)); + assert_se(r == 0); + + r = bpf_firewall_supported(); + if (r == BPF_FIREWALL_UNSUPPORTED) + return log_tests_skipped("BPF firewalling not supported"); + assert_se(r > 0); + + if (r == BPF_FIREWALL_SUPPORTED_WITH_MULTI) { + log_notice("BPF firewalling 
with BPF_F_ALLOW_MULTI supported. Yay!"); + test_custom_filter = true; + } else + log_notice("BPF firewalling (though without BPF_F_ALLOW_MULTI) supported. Good."); + + r = bpf_program_load_kernel(p, log_buf, ELEMENTSOF(log_buf)); + assert_se(r >= 0); + + if (test_custom_filter) { + zero(attr); + attr.pathname = PTR_TO_UINT64(test_prog); + attr.bpf_fd = p->kernel_fd; + attr.file_flags = 0; + + (void) unlink(test_prog); + + r = bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); + if (r < 0) { + log_warning_errno(errno, "BPF object pinning failed, will not run custom filter test: %m"); + test_custom_filter = false; + } + } + + p = bpf_program_free(p); + + /* The simple tests succeeded. Now let's try full unit-based use-case. */ + + assert_se(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "foo.service") == 0); + assert_se(cc = unit_get_cgroup_context(u)); + u->perpetual = true; + + cc->ip_accounting = true; + + assert_se(config_parse_in_addr_prefixes(u->id, "filename", 1, "Service", 1, "IPAddressAllow", 0, "10.0.1.0/24", &cc->ip_address_allow, NULL) == 0); + assert_se(config_parse_in_addr_prefixes(u->id, "filename", 1, "Service", 1, "IPAddressAllow", 0, "127.0.0.2", &cc->ip_address_allow, NULL) == 0); + assert_se(config_parse_in_addr_prefixes(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "127.0.0.3", &cc->ip_address_deny, NULL) == 0); + assert_se(config_parse_in_addr_prefixes(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "10.0.3.2/24", &cc->ip_address_deny, NULL) == 0); + assert_se(config_parse_in_addr_prefixes(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "127.0.0.1/25", &cc->ip_address_deny, NULL) == 0); + assert_se(config_parse_in_addr_prefixes(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "127.0.0.4", &cc->ip_address_deny, NULL) == 0); + + assert_se(set_size(cc->ip_address_allow) == 
2); + assert_se(set_size(cc->ip_address_deny) == 4); + + /* The deny list is defined redundantly, let's ensure it will be properly reduced */ + assert_se(in_addr_prefixes_reduce(cc->ip_address_allow) >= 0); + assert_se(in_addr_prefixes_reduce(cc->ip_address_deny) >= 0); + + assert_se(set_size(cc->ip_address_allow) == 2); + assert_se(set_size(cc->ip_address_deny) == 2); + + assert_se(set_contains(cc->ip_address_allow, &(struct in_addr_prefix) { + .family = AF_INET, + .address.in.s_addr = htobe32((UINT32_C(10) << 24) | (UINT32_C(1) << 8)), + .prefixlen = 24 })); + assert_se(set_contains(cc->ip_address_allow, &(struct in_addr_prefix) { + .family = AF_INET, + .address.in.s_addr = htobe32(0x7f000002), + .prefixlen = 32 })); + assert_se(set_contains(cc->ip_address_deny, &(struct in_addr_prefix) { + .family = AF_INET, + .address.in.s_addr = htobe32(0x7f000000), + .prefixlen = 25 })); + assert_se(set_contains(cc->ip_address_deny, &(struct in_addr_prefix) { + .family = AF_INET, + .address.in.s_addr = htobe32((UINT32_C(10) << 24) | (UINT32_C(3) << 8)), + .prefixlen = 24 })); + + assert_se(config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "/bin/ping -c 1 127.0.0.2 -W 5", SERVICE(u)->exec_command, u) == 0); + assert_se(config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "/bin/ping -c 1 127.0.0.3 -W 5", SERVICE(u)->exec_command, u) == 0); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]); + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next); + assert_se(!SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->command_next); + + SERVICE(u)->type = SERVICE_ONESHOT; + u->load_state = UNIT_LOADED; + + unit_dump(u, stdout, NULL); + + r = bpf_firewall_compile(u); + if (IN_SET(r, -ENOTTY, -ENOSYS, -EPERM)) + return log_tests_skipped("Kernel doesn't support the necessary bpf bits (masked out via seccomp?)"); + assert_se(r >= 0); + + assert_se(u->ip_bpf_ingress); + 
assert_se(u->ip_bpf_egress); + + r = bpf_program_load_kernel(u->ip_bpf_ingress, log_buf, ELEMENTSOF(log_buf)); + + log_notice("log:"); + log_notice("-------"); + log_notice("%s", log_buf); + log_notice("-------"); + + assert_se(r >= 0); + + r = bpf_program_load_kernel(u->ip_bpf_egress, log_buf, ELEMENTSOF(log_buf)); + + log_notice("log:"); + log_notice("-------"); + log_notice("%s", log_buf); + log_notice("-------"); + + assert_se(r >= 0); + + assert_se(unit_start(u, NULL) >= 0); + + while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) + assert_se(sd_event_run(m->event, UINT64_MAX) >= 0); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code == CLD_EXITED && + SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.status == EXIT_SUCCESS); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->exec_status.code != CLD_EXITED || + SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->exec_status.status != EXIT_SUCCESS); + + if (test_custom_filter) { + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "custom-filter.service") == 0); + assert_se(cc = unit_get_cgroup_context(u)); + u->perpetual = true; + + cc->ip_accounting = true; + + assert_se(config_parse_ip_filter_bpf_progs(u->id, "filename", 1, "Service", 1, "IPIngressFilterPath", 0, test_prog, &cc->ip_filters_ingress, u) == 0); + assert_se(config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "-/bin/ping -c 1 127.0.0.1 -W 5", SERVICE(u)->exec_command, u) == 0); + + SERVICE(u)->type = SERVICE_ONESHOT; + u->load_state = UNIT_LOADED; + + assert_se(unit_start(u, NULL) >= 0); + + while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) + assert_se(sd_event_run(m->event, UINT64_MAX) >= 0); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code != CLD_EXITED || + SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.status != EXIT_SUCCESS); + + (void) unlink(test_prog); 
+ assert_se(SERVICE(u)->state == SERVICE_DEAD); + } + + return 0; +} diff --git a/src/test/test-bpf-foreign-programs.c b/src/test/test-bpf-foreign-programs.c new file mode 100644 index 0000000..35c7e0d --- /dev/null +++ b/src/test/test-bpf-foreign-programs.c @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "bpf-foreign.h" +#include "load-fragment.h" +#include "manager.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "service.h" +#include "tests.h" +#include "unit.h" +#include "virt.h" + +struct Test { /* one test case: unit option to set, BPF program/attach type, bpffs pin path */ + const char *option_name; + enum bpf_prog_type prog_type; + enum bpf_attach_type attach_type; + const char *bpffs_path; +}; + +typedef struct Test Test; + +#define BPFFS_PATH(prog_suffix) ("/sys/fs/bpf/test-bpf-foreing-" # prog_suffix) +static const Test single_prog[] = { + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, +}; +static const Test path_split_test[] = { + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("path:split:test"), + }, +}; + +static const Test same_prog_same_hook[] = { + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock"), + } +}; + +static const Test multi_prog_same_hook[] = { + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock-0"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = 
BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock-1"), + } +}; + +static const Test same_prog_multi_hook[] = { + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_EGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + } +}; + +static const Test same_prog_multi_option_0[] = { + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, + { + .option_name = "IPIngressFilterPath", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + } +}; + +static const Test same_prog_multi_option_1[] = { + { + .option_name = "IPEgressFilterPath", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_EGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_EGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + } +}; +#undef BPFFS_PATH + +static int bpf_foreign_test_to_string(enum bpf_attach_type attach_type, const char *bpffs_path, char **ret_str) { /* stores "<attach type name>:<bpffs_path>" in *ret_str; always returns 0 */ + const char *s = NULL; + + assert_se(bpffs_path); + assert_se(ret_str); + + assert_se(s = bpf_cgroup_attach_type_to_string(attach_type)); + assert_se(*ret_str = strjoin(s, ":", bpffs_path)); + + return 0; +} + +static char **unlink_paths_and_free(char **paths) { /* unlinks every path in the vector, then frees the vector */ + STRV_FOREACH(i, paths) + (void) unlink(*i); + + return strv_free(paths); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(char **, unlink_paths_and_free); + +static int pin_programs(Unit *u, CGroupContext *cc, const Test *test_suite, size_t test_suite_size, char ***paths_ret) { /* loads a trivial two-instruction program per entry and pins it at the entry's bpffs path (each distinct path once); on success passes the list of pinned paths to *paths_ret */ + _cleanup_(unlink_paths_and_freep) char **bpffs_paths = NULL; + static const 
struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN() + }; + char log_buf[0xffff]; + int r; + + assert_se(paths_ret); + + for (size_t i = 0; i < test_suite_size; i++) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + _cleanup_free_ char *str = NULL; + + r = bpf_foreign_test_to_string(test_suite[i].attach_type, test_suite[i].bpffs_path, &str); + if (r < 0) + return log_error_errno(r, "Failed to convert program to string"); + + r = bpf_program_new(test_suite[i].prog_type, "sd_trivial", &prog); + if (r < 0) + return log_error_errno(r, "Failed to create program '%s'", str); + + r = bpf_program_add_instructions(prog, trivial, ELEMENTSOF(trivial)); + if (r < 0) + return log_error_errno(r, "Failed to add trivial instructions for '%s'", str); + + r = bpf_program_load_kernel(prog, log_buf, ELEMENTSOF(log_buf)); + if (r < 0) + return log_error_errno(r, "Failed to load BPF program '%s'", str); + + if (strv_contains(bpffs_paths, test_suite[i].bpffs_path)) /* this path was already pinned by an earlier entry */ + continue; + + r = strv_extend(&bpffs_paths, test_suite[i].bpffs_path); + if (r < 0) + return log_error_errno(r, "Failed to put path into a vector: %m"); + + r = bpf_program_pin(prog->kernel_fd, test_suite[i].bpffs_path); + if (r < 0) + return log_error_errno(r, "Failed to pin BPF program '%s'", str); + } + + *paths_ret = TAKE_PTR(bpffs_paths); + return 0; +} + +static int test_bpf_cgroup_programs(Manager *m, const char *unit_name, const Test *test_suite, size_t test_suite_size) { /* pins the test programs, applies each entry's option to a new oneshot service unit, runs it and requires that ExecStart exited and the service ended up SERVICE_DEAD; returns < 0 on any failure */ + _cleanup_(unlink_paths_and_freep) char **bpffs_paths = NULL; + _cleanup_(unit_freep) Unit *u = NULL; + CGroupContext *cc = NULL; + int cld_code, r; + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, unit_name) == 0); + assert_se(cc = unit_get_cgroup_context(u)); + + r = pin_programs(u, cc, test_suite, test_suite_size, &bpffs_paths); + if (r < 0) + return log_error_errno(r, "Failed to pin programs: %m"); + + for (size_t i = 0; i < test_suite_size; i++) { + if 
(streq(test_suite[i].option_name, "BPFProgram")) { + _cleanup_free_ char *option = NULL; + r = bpf_foreign_test_to_string(test_suite[i].attach_type, test_suite[i].bpffs_path, &option); + if (r < 0) + return log_error_errno(r, "Failed to compose option string: %m"); + r = config_parse_bpf_foreign_program( + u->id, "filename", 1, "Service", 1, test_suite[i].option_name, 0, option, cc, u); + + if (r < 0) + return log_error_errno(r, "Failed to parse option string '%s': %m", option); + } else if (STR_IN_SET(test_suite[i].option_name, "IPIngressFilterPath", "IPEgressFilterPath")) { + const char *option = test_suite[i].bpffs_path; + void *paths = NULL; + + if (streq(test_suite[i].option_name, "IPIngressFilterPath")) + paths = &cc->ip_filters_ingress; + else + paths = &cc->ip_filters_egress; + + r = config_parse_ip_filter_bpf_progs( + u->id, "filename", 1, "Service", 1, test_suite[i].option_name, 0, option, paths, u); + if (r < 0) + return log_error_errno(r, "Failed to parse option string '%s': %m", option); + } + } + + r = config_parse_exec( + u->id, + "filename", + 1, + "Service", + 1, + "ExecStart", + SERVICE_EXEC_START, + "-/bin/ping -c 5 127.0.0.1 -W 1", + SERVICE(u)->exec_command, + u); + if (r < 0) + return log_error_errno(r, "Failed to parse ExecStart"); + + SERVICE(u)->type = SERVICE_ONESHOT; + u->load_state = UNIT_LOADED; + + r = unit_start(u, NULL); + if (r < 0) + return log_error_errno(r, "Unit start failed %m"); + + while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) { + r = sd_event_run(m->event, UINT64_MAX); + if (r < 0) + return log_error_errno(r, "Event run failed %m"); /* sd_event_run() reports failure through its return value, not errno */ + } + + cld_code = SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code; + if (cld_code != CLD_EXITED) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "Child didn't exit normally, code='%s'", sigchld_code_to_string(cld_code)); + + if (SERVICE(u)->state != SERVICE_DEAD) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Service is not dead"); + + return r; +} + 
+int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_free_ char *unit_dir = NULL; + struct rlimit rl; + int r; + + test_setup_logging(LOG_DEBUG); + + if (detect_container() > 0) + return log_tests_skipped("test-bpf fails inside LXC and Docker containers: https://github.com/systemd/systemd/issues/9666"); + + if (getuid() != 0) + return log_tests_skipped("not running as root"); + + assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0); + rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE); + (void) setrlimit_closest(RLIMIT_MEMLOCK, &rl); + + if (!can_memlock()) + return log_tests_skipped("Can't use mlock()"); + + r = cg_all_unified(); + if (r <= 0) + return log_tests_skipped("Unified hierarchy is required"); + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + assert_se(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(test_bpf_cgroup_programs(m, + "single_prog.service", single_prog, ELEMENTSOF(single_prog)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "multi_prog_same_hook.service", + multi_prog_same_hook, ELEMENTSOF(multi_prog_same_hook)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_multi_hook.service", + same_prog_multi_hook, ELEMENTSOF(same_prog_multi_hook)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_multi_option_0.service", + same_prog_multi_option_0, ELEMENTSOF(same_prog_multi_option_0)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_multi_option_1.service", + same_prog_multi_option_1, ELEMENTSOF(same_prog_multi_option_1)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_same_hook.service", + 
same_prog_same_hook, + ELEMENTSOF(same_prog_same_hook)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "path_split_test.service", + path_split_test, + ELEMENTSOF(path_split_test)) >= 0); + return 0; +} diff --git a/src/test/test-bpf-lsm.c b/src/test/test-bpf-lsm.c new file mode 100644 index 0000000..42ea64c --- /dev/null +++ b/src/test/test-bpf-lsm.c @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-lsm.h" +#include "load-fragment.h" +#include "manager.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "service.h" +#include "strv.h" +#include "tests.h" +#include "unit.h" +#include "virt.h" + +static int test_restrict_filesystems(Manager *m, const char *unit_name, const char *file_path, char **allowed_filesystems) { /* runs "cat <file_path>" in a new oneshot service configured with the given RestrictFileSystems= entries; returns 0 if ExecStart exited and the service reached SERVICE_DEAD, a negative value otherwise */ + _cleanup_free_ char *exec_start = NULL; + _cleanup_(unit_freep) Unit *u = NULL; + ExecContext *ec = NULL; + int cld_code, r; + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, unit_name) == 0); + assert_se(ec = unit_get_exec_context(u)); + + STRV_FOREACH(allow_filesystem, allowed_filesystems) { + r = config_parse_restrict_filesystems( + u->id, "filename", 1, "Service", 1, "RestrictFileSystems", 0, + *allow_filesystem, ec, u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to parse RestrictFileSystems: %m"); + } + + assert_se(exec_start = strjoin("cat ", file_path)); + r = config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", + SERVICE_EXEC_START, exec_start, SERVICE(u)->exec_command, u); + if (r < 0) + return log_error_errno(r, "Failed to parse ExecStart"); + + SERVICE(u)->type = SERVICE_ONESHOT; + u->load_state = UNIT_LOADED; + + r = unit_start(u, NULL); + if (r < 0) + return log_error_errno(r, "Unit start failed %m"); + + while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) { + r = sd_event_run(m->event, UINT64_MAX); + if (r < 0) + return log_error_errno(r, "Event run failed %m"); /* sd_event_run() reports failure through its return value, not errno */ + } + + cld_code = 
SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code; + if (cld_code != CLD_EXITED) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "ExecStart didn't exited, code='%s'", sigchld_code_to_string(cld_code)); + + if (SERVICE(u)->state != SERVICE_DEAD) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Service is not dead"); + + return 0; +} + +int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_free_ char *unit_dir = NULL; + struct rlimit rl; + int r; + + test_setup_logging(LOG_DEBUG); + + assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0); + rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE); + (void) setrlimit_closest(RLIMIT_MEMLOCK, &rl); + + if (!can_memlock()) + return log_tests_skipped("Can't use mlock()"); + + if (!lsm_bpf_supported(/* initialize = */ true)) + return log_tests_skipped("LSM BPF hooks are not supported"); + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_BASIC, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + /* We need to enable access to the filesystem where the binary is so we + * add @common-block */ + assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/tracing/printk_formats", STRV_MAKE("@common-block")) < 0); + assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/tracing/printk_formats", STRV_MAKE("tracefs", "@common-block")) >= 0); + assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/tracing/printk_formats", STRV_MAKE("tracefs", "@common-block", "~tracefs")) < 0); + assert_se(test_restrict_filesystems(m, 
"restrict_filesystems_test.service", "/sys/kernel/debug/sleep_time", STRV_MAKE("@common-block")) < 0); + assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/debug/sleep_time", STRV_MAKE("debugfs", "@common-block")) >= 0); + assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/debug/sleep_time", STRV_MAKE("~debugfs")) < 0); + + return 0; +} diff --git a/src/test/test-btrfs-physical-offset.c b/src/test/test-btrfs-physical-offset.c new file mode 100644 index 0000000..221c08e --- /dev/null +++ b/src/test/test-btrfs-physical-offset.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "btrfs-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "log.h" +#include "memory-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_close_ int fd = -EBADF; + uint64_t offset; + int r; + + assert(argc == 2); + assert(!isempty(argv[1])); + + test_setup_logging(LOG_DEBUG); + + fd = open(argv[1], O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + log_error_errno(errno, "Failed to open '%s': %m", argv[1]); + return EXIT_FAILURE; + } + + r = btrfs_get_file_physical_offset_fd(fd, &offset); + if (r < 0) { + log_error_errno(r, "Failed to get physical offset of '%s': %m", argv[1]); + return EXIT_FAILURE; + } + + printf("%" PRIu64 "\n", offset / page_size()); + return EXIT_SUCCESS; +} diff --git a/src/test/test-btrfs.c b/src/test/test-btrfs.c new file mode 100644 index 0000000..205142e --- /dev/null +++ b/src/test/test-btrfs.c @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "btrfs-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "fileio.h" +#include "format-util.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + BtrfsQuotaInfo quota; + int r, fd; + + test_setup_logging(LOG_DEBUG); + + fd = open("/", 
O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd < 0) + log_error_errno(errno, "Failed to open root directory: %m"); + else { + BtrfsSubvolInfo info; + + r = btrfs_subvol_get_info_fd(fd, 0, &info); + if (r < 0) + log_error_errno(r, "Failed to get subvolume info: %m"); + else { + log_info("otime: %s", FORMAT_TIMESTAMP(info.otime)); + log_info("read-only (search): %s", yes_no(info.read_only)); + } + + r = btrfs_qgroup_get_quota_fd(fd, 0, &quota); + if (r < 0) + log_error_errno(r, "Failed to get quota info: %m"); + else { + log_info("referenced: %s", strna(FORMAT_BYTES(quota.referenced))); + log_info("exclusive: %s", strna(FORMAT_BYTES(quota.exclusive))); + log_info("referenced_max: %s", strna(FORMAT_BYTES(quota.referenced_max))); + log_info("exclusive_max: %s", strna(FORMAT_BYTES(quota.exclusive_max))); + } + + r = btrfs_subvol_get_read_only_fd(fd); + if (r < 0) + log_error_errno(r, "Failed to get read only flag: %m"); + else + log_info("read-only (ioctl): %s", yes_no(r)); + + safe_close(fd); + } + + r = btrfs_subvol_make(AT_FDCWD, "/xxxtest"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + r = write_string_file("/xxxtest/file", "ljsadhfljasdkfhlkjdsfha", WRITE_STRING_FILE_CREATE); + if (r < 0) + log_error_errno(r, "Failed to write file: %m"); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, "/xxxtest", AT_FDCWD, "/xxxtest2", 0); + if (r < 0) + log_error_errno(r, "Failed to make snapshot: %m"); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, "/xxxtest", AT_FDCWD, "/xxxtest3", BTRFS_SNAPSHOT_READ_ONLY); + if (r < 0) + log_error_errno(r, "Failed to make snapshot: %m"); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, "/xxxtest", AT_FDCWD, "/xxxtest4", BTRFS_SNAPSHOT_LOCK_BSD); + if (r < 0) + log_error_errno(r, "Failed to make snapshot: %m"); + if (r >= 0) + assert_se(xopenat_lock(AT_FDCWD, "/xxxtest4", 0, 0, 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + + safe_close(r); + + r = btrfs_subvol_remove("/xxxtest", BTRFS_REMOVE_QUOTA); + if (r < 0) + log_error_errno(r, "Failed to 
remove subvolume: %m"); + + r = btrfs_subvol_remove("/xxxtest2", BTRFS_REMOVE_QUOTA); + if (r < 0) + log_error_errno(r, "Failed to remove subvolume: %m"); + + r = btrfs_subvol_remove("/xxxtest3", BTRFS_REMOVE_QUOTA); + if (r < 0) + log_error_errno(r, "Failed to remove subvolume: %m"); + + r = btrfs_subvol_remove("/xxxtest4", BTRFS_REMOVE_QUOTA); + if (r < 0) + log_error_errno(r, "Failed to remove subvolume: %m"); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, "/etc", AT_FDCWD, "/etc2", + BTRFS_SNAPSHOT_READ_ONLY|BTRFS_SNAPSHOT_FALLBACK_COPY); + if (r < 0) + log_error_errno(r, "Failed to make snapshot: %m"); + + r = btrfs_subvol_remove("/etc2", BTRFS_REMOVE_QUOTA); + if (r < 0) + log_error_errno(r, "Failed to remove subvolume: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxrectest"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxrectest/xxxrectest2"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxrectest/xxxrectest3"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxrectest/xxxrectest3/sub"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + if (mkdir("/xxxrectest/dir", 0755) < 0) + log_error_errno(errno, "Failed to make directory: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxrectest/dir/xxxrectest4"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + if (mkdir("/xxxrectest/dir/xxxrectest4/dir", 0755) < 0) + log_error_errno(errno, "Failed to make directory: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxrectest/dir/xxxrectest4/dir/xxxrectest5"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + if (mkdir("/xxxrectest/mnt", 0755) < 0) + log_error_errno(errno, "Failed to make directory: %m"); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, "/xxxrectest", AT_FDCWD, "/xxxrectest2", BTRFS_SNAPSHOT_RECURSIVE); + if (r < 0) + 
log_error_errno(r, "Failed to snapshot subvolume: %m"); + + r = btrfs_subvol_remove("/xxxrectest", BTRFS_REMOVE_QUOTA|BTRFS_REMOVE_RECURSIVE); + if (r < 0) + log_error_errno(r, "Failed to recursively remove subvolume: %m"); + + r = btrfs_subvol_remove("/xxxrectest2", BTRFS_REMOVE_QUOTA|BTRFS_REMOVE_RECURSIVE); + if (r < 0) + log_error_errno(r, "Failed to recursively remove subvolume: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxquotatest"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + r = btrfs_subvol_auto_qgroup("/xxxquotatest", 0, true); + if (r < 0) + log_error_errno(r, "Failed to set up auto qgroup: %m"); + + r = btrfs_subvol_make(AT_FDCWD, "/xxxquotatest/beneath"); + if (r < 0) + log_error_errno(r, "Failed to make subvolume: %m"); + + r = btrfs_subvol_auto_qgroup("/xxxquotatest/beneath", 0, false); + if (r < 0) + log_error_errno(r, "Failed to set up auto qgroup: %m"); + + r = btrfs_qgroup_set_limit("/xxxquotatest/beneath", 0, 4ULL * 1024 * 1024 * 1024); + if (r < 0) + log_error_errno(r, "Failed to set up quota limit: %m"); + + r = btrfs_subvol_set_subtree_quota_limit("/xxxquotatest", 0, 5ULL * 1024 * 1024 * 1024); + if (r < 0) + log_error_errno(r, "Failed to set up quota limit: %m"); + + r = btrfs_subvol_snapshot_at(AT_FDCWD, "/xxxquotatest", AT_FDCWD, "/xxxquotatest2", + BTRFS_SNAPSHOT_RECURSIVE|BTRFS_SNAPSHOT_QUOTA); + if (r < 0) + log_error_errno(r, "Failed to set up snapshot: %m"); + + r = btrfs_qgroup_get_quota("/xxxquotatest2/beneath", 0, &quota); + if (r < 0) + log_error_errno(r, "Failed to query quota: %m"); + + if (r >= 0) + assert_se(quota.referenced_max == 4ULL * 1024 * 1024 * 1024); + + r = btrfs_subvol_get_subtree_quota("/xxxquotatest2", 0, &quota); + if (r < 0) + log_error_errno(r, "Failed to query quota: %m"); + + if (r >= 0) + assert_se(quota.referenced_max == 5ULL * 1024 * 1024 * 1024); + + r = btrfs_subvol_remove("/xxxquotatest", BTRFS_REMOVE_QUOTA|BTRFS_REMOVE_RECURSIVE); + if (r < 0) + log_error_errno(r, "Failed remove 
subvolume: %m"); + + r = btrfs_subvol_remove("/xxxquotatest2", BTRFS_REMOVE_QUOTA|BTRFS_REMOVE_RECURSIVE); + if (r < 0) + log_error_errno(r, "Failed remove subvolume: %m"); + + return 0; +} diff --git a/src/test/test-bus-util.c b/src/test/test-bus-util.c new file mode 100644 index 0000000..2f52bca --- /dev/null +++ b/src/test/test-bus-util.c @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bus-util.h" +#include "log.h" +#include "tests.h" + +static int callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) { + return 1; +} + +static void destroy_callback(void *userdata) { + int *n_called = userdata; + + (*n_called) ++; +} + +TEST(destroy_callback) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + sd_bus_slot *slot = NULL; + sd_bus_destroy_t t; + + int r, n_called = 0; + + r = bus_open_system_watch_bind_with_description(&bus, "test-bus"); + if (r < 0) { + log_error_errno(r, "Failed to connect to bus: %m"); + return; + } + + r = sd_bus_request_name_async(bus, &slot, "org.freedesktop.systemd.test-bus-util", 0, callback, &n_called); + assert_se(r == 1); + + assert_se(sd_bus_slot_get_destroy_callback(slot, NULL) == 0); + assert_se(sd_bus_slot_get_destroy_callback(slot, &t) == 0); + + assert_se(sd_bus_slot_set_destroy_callback(slot, destroy_callback) == 0); + assert_se(sd_bus_slot_get_destroy_callback(slot, NULL) == 1); + assert_se(sd_bus_slot_get_destroy_callback(slot, &t) == 1); + assert_se(t == destroy_callback); + + /* Force cleanup so we can look at n_called */ + assert_se(n_called == 0); + sd_bus_slot_unref(slot); + assert_se(n_called == 1); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-calendarspec.c b/src/test/test-calendarspec.c new file mode 100644 index 0000000..18a0f8f --- /dev/null +++ b/src/test/test-calendarspec.c @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "calendarspec.h" +#include "env-util.h" +#include "errno-util.h" 
+#include "string-util.h" +#include "tests.h" + +static void _test_one(int line, const char *input, const char *output) { + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + _cleanup_free_ char *p = NULL, *q = NULL; + usec_t u; + int r; + + r = calendar_spec_from_string(input, &c); + if (r < 0) + log_error_errno(r, "Failed to parse \"%s\": %m", input); + assert_se(r >= 0); + + assert_se(calendar_spec_to_string(c, &p) >= 0); + log_info("line %d: \"%s\" → \"%s\"%s%s", line, input, p, + !streq(p, output) ? " expected:" : "", + !streq(p, output) ? output : ""); + + assert_se(streq(p, output)); + + u = now(CLOCK_REALTIME); + r = calendar_spec_next_usec(c, u, &u); + log_info("Next: %s", r < 0 ? STRERROR(r) : FORMAT_TIMESTAMP(u)); + c = calendar_spec_free(c); + + assert_se(calendar_spec_from_string(p, &c) >= 0); + assert_se(calendar_spec_to_string(c, &q) >= 0); + + assert_se(streq(q, p)); +} +#define test_one(input, output) _test_one(__LINE__, input, output) + +static void _test_next(int line, const char *input, const char *new_tz, usec_t after, usec_t expect) { + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + usec_t u; + char *old_tz; + int r; + + old_tz = getenv("TZ"); + if (old_tz) + old_tz = strdupa_safe(old_tz); + + if (!isempty(new_tz)) + new_tz = strjoina(":", new_tz); + + assert_se(set_unset_env("TZ", new_tz, true) == 0); + tzset(); + + assert_se(calendar_spec_from_string(input, &c) >= 0); + + log_info("line %d: \"%s\" new_tz=%s", line, input, strnull(new_tz)); + + u = after; + r = calendar_spec_next_usec(c, after, &u); + log_info("At: %s", r < 0 ? 
STRERROR(r) : FORMAT_TIMESTAMP_STYLE(u, TIMESTAMP_US)); + if (expect != USEC_INFINITY) + assert_se(r >= 0 && u == expect); + else + assert_se(r == -ENOENT); + + assert_se(set_unset_env("TZ", old_tz, true) == 0); + tzset(); +} +#define test_next(input, new_tz, after, expect) _test_next(__LINE__, input,new_tz,after,expect) + +TEST(timestamp) { + char buf[FORMAT_TIMESTAMP_MAX]; + _cleanup_free_ char *t = NULL; + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + usec_t x, y; + + /* Ensure that a timestamp is also a valid calendar specification. Convert forth and back */ + + x = now(CLOCK_REALTIME); + + assert_se(format_timestamp_style(buf, sizeof buf, x, TIMESTAMP_US)); + log_info("%s", buf); + assert_se(calendar_spec_from_string(buf, &c) >= 0); + assert_se(calendar_spec_to_string(c, &t) >= 0); + log_info("%s", t); + + assert_se(parse_timestamp(t, &y) >= 0); + assert_se(y == x); +} + +TEST(hourly_bug_4031) { + _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL; + usec_t n, u, w; + int r; + + assert_se(calendar_spec_from_string("hourly", &c) >= 0); + n = now(CLOCK_REALTIME); + assert_se((r = calendar_spec_next_usec(c, n, &u)) >= 0); + + log_info("Now: %s (%"PRIu64")", FORMAT_TIMESTAMP_STYLE(n, TIMESTAMP_US), n); + log_info("Next hourly: %s (%"PRIu64")", r < 0 ? STRERROR(r) : FORMAT_TIMESTAMP_STYLE(u, TIMESTAMP_US), u); + + assert_se((r = calendar_spec_next_usec(c, u, &w)) >= 0); + log_info("Next hourly: %s (%"PRIu64")", r < 0 ? 
STRERROR(r) : FORMAT_TIMESTAMP_STYLE(w, TIMESTAMP_US), w); + + assert_se(n < u); + assert_se(u <= n + USEC_PER_HOUR); + assert_se(u < w); + assert_se(w <= u + USEC_PER_HOUR); +} + +TEST(calendar_spec_one) { + test_one("Sat,Thu,Mon-Wed,Sat-Sun", "Mon..Thu,Sat,Sun *-*-* 00:00:00"); + test_one("Sat,Thu,Mon..Wed,Sat..Sun", "Mon..Thu,Sat,Sun *-*-* 00:00:00"); + test_one("Mon,Sun 12-*-* 2,1:23", "Mon,Sun 2012-*-* 01,02:23:00"); + test_one("Wed *-1", "Wed *-*-01 00:00:00"); + test_one("Wed-Wed,Wed *-1", "Wed *-*-01 00:00:00"); + test_one("Wed..Wed,Wed *-1", "Wed *-*-01 00:00:00"); + test_one("Wed, 17:48", "Wed *-*-* 17:48:00"); + test_one("Wednesday,", "Wed *-*-* 00:00:00"); + test_one("Wed-Sat,Tue 12-10-15 1:2:3", "Tue..Sat 2012-10-15 01:02:03"); + test_one("Wed..Sat,Tue 12-10-15 1:2:3", "Tue..Sat 2012-10-15 01:02:03"); + test_one("*-*-7 0:0:0", "*-*-07 00:00:00"); + test_one("10-15", "*-10-15 00:00:00"); + test_one("monday *-12-* 17:00", "Mon *-12-* 17:00:00"); + test_one("Mon,Fri *-*-3,1,2 *:30:45", "Mon,Fri *-*-01,02,03 *:30:45"); + test_one("12,14,13,12:20,10,30", "*-*-* 12,13,14:10,20,30:00"); + test_one("mon,fri *-1/2-1,3 *:30:45", "Mon,Fri *-01/2-01,03 *:30:45"); + test_one("03-05 08:05:40", "*-03-05 08:05:40"); + test_one("08:05:40", "*-*-* 08:05:40"); + test_one("05:40", "*-*-* 05:40:00"); + test_one("Sat,Sun 12-05 08:05:40", "Sat,Sun *-12-05 08:05:40"); + test_one("Sat,Sun 08:05:40", "Sat,Sun *-*-* 08:05:40"); + test_one("2003-03-05 05:40", "2003-03-05 05:40:00"); + test_one("2003-03-05", "2003-03-05 00:00:00"); + test_one("03-05", "*-03-05 00:00:00"); + test_one("hourly", "*-*-* *:00:00"); + test_one("daily", "*-*-* 00:00:00"); + test_one("monthly", "*-*-01 00:00:00"); + test_one("weekly", "Mon *-*-* 00:00:00"); + test_one("minutely", "*-*-* *:*:00"); + test_one("quarterly", "*-01,04,07,10-01 00:00:00"); + test_one("semi-annually", "*-01,07-01 00:00:00"); + test_one("annually", "*-01-01 00:00:00"); + test_one("*:2/3", "*-*-* *:02/3:00"); + test_one("2015-10-25 
01:00:00 uTc", "2015-10-25 01:00:00 UTC"); + test_one("2015-10-25 01:00:00 Asia/Vladivostok", "2015-10-25 01:00:00 Asia/Vladivostok"); + test_one("weekly Pacific/Auckland", "Mon *-*-* 00:00:00 Pacific/Auckland"); + test_one("2016-03-27 03:17:00.4200005", "2016-03-27 03:17:00.420001"); + test_one("2016-03-27 03:17:00/0.42", "2016-03-27 03:17:00/0.420000"); + test_one("9..11,13:00,30", "*-*-* 09..11,13:00,30:00"); + test_one("1..3-1..3 1..3:1..3", "*-01..03-01..03 01..03:01..03:00"); + test_one("00:00:1.125..2.125", "*-*-* 00:00:01.125000..02.125000"); + test_one("00:00:1.0..3.8", "*-*-* 00:00:01..03"); + test_one("00:00:01..03", "*-*-* 00:00:01..03"); + test_one("00:00:01/2,02..03", "*-*-* 00:00:01/2,02..03"); + test_one("*:4,30:0..3", "*-*-* *:04,30:00..03"); + test_one("*:4,30:0/1", "*-*-* *:04,30:*"); + test_one("*:4,30:0/1,3,5", "*-*-* *:04,30:*"); + test_one("*-*~1 Utc", "*-*~01 00:00:00 UTC"); + test_one("*-*~05,3 ", "*-*~03,05 00:00:00"); + test_one("*-*~* 00:00:00", "*-*-* 00:00:00"); + test_one("Monday", "Mon *-*-* 00:00:00"); + test_one("Monday *-*-*", "Mon *-*-* 00:00:00"); + test_one("*-*-*", "*-*-* 00:00:00"); + test_one("*:*:*", "*-*-* *:*:*"); + test_one("*:*", "*-*-* *:*:00"); + test_one("12:*", "*-*-* 12:*:00"); + test_one("*:30", "*-*-* *:30:00"); + test_one("93..00-*-*", "1993..2000-*-* 00:00:00"); + test_one("00..07-*-*", "2000..2007-*-* 00:00:00"); + test_one("*:20..39/5", "*-*-* *:20..35/5:00"); + test_one("00:00:20..40/1", "*-*-* 00:00:20..40"); + test_one("*~03/1,03..05", "*-*~03/1,03..05 00:00:00"); + /* UNIX timestamps are always UTC */ + test_one("@1493187147", "2017-04-26 06:12:27 UTC"); + test_one("@1493187147 UTC", "2017-04-26 06:12:27 UTC"); + test_one("@0", "1970-01-01 00:00:00 UTC"); + test_one("@0 UTC", "1970-01-01 00:00:00 UTC"); + test_one("*:05..05", "*-*-* *:05:00"); + test_one("*:05..10/6", "*-*-* *:05:00"); +} + +TEST(calendar_spec_next) { + test_next("2016-03-27 03:17:00", "", 12345, 1459048620000000); + test_next("2016-03-27 
03:17:00", "CET", 12345, 1459041420000000); + test_next("2016-03-27 03:17:00", "EET", 12345, -1); + test_next("2016-03-27 03:17:00 UTC", NULL, 12345, 1459048620000000); + test_next("2016-03-27 03:17:00 UTC", "", 12345, 1459048620000000); + test_next("2016-03-27 03:17:00 UTC", "CET", 12345, 1459048620000000); + test_next("2016-03-27 03:17:00 UTC", "EET", 12345, 1459048620000000); + test_next("2016-03-27 03:17:00.420000001 UTC", "EET", 12345, 1459048620420000); + test_next("2016-03-27 03:17:00.4200005 UTC", "EET", 12345, 1459048620420001); + test_next("2015-11-13 09:11:23.42", "EET", 12345, 1447398683420000); + test_next("2015-11-13 09:11:23.42/1.77", "EET", 1447398683420000, 1447398685190000); + test_next("2015-11-13 09:11:23.42/1.77", "EET", 1447398683419999, 1447398683420000); + test_next("Sun 16:00:00", "CET", 1456041600123456, 1456066800000000); + test_next("*-04-31", "", 12345, -1); + test_next("2016-02~01 UTC", "", 12345, 1456704000000000); + test_next("Mon 2017-05~01..07 UTC", "", 12345, 1496016000000000); + test_next("Mon 2017-05~07/1 UTC", "", 12345, 1496016000000000); + test_next("*-*-01/5 04:00:00 UTC", "", 1646010000000000, 1646107200000000); + test_next("*-01/7-01 04:00:00 UTC", "", 1664607600000000, 1672545600000000); + test_next("2017-08-06 9,11,13,15,17:00 UTC", "", 1502029800000000, 1502031600000000); + test_next("2017-08-06 9..17/2:00 UTC", "", 1502029800000000, 1502031600000000); + test_next("2016-12-* 3..21/6:00 UTC", "", 1482613200000001, 1482634800000000); + test_next("2017-09-24 03:30:00 Pacific/Auckland", "", 12345, 1506177000000000); + /* Due to daylight saving time - 2017-09-24 02:30:00 does not exist */ + test_next("2017-09-24 02:30:00 Pacific/Auckland", "", 12345, -1); + test_next("2017-04-02 02:30:00 Pacific/Auckland", "", 12345, 1491053400000000); + /* Confirm that even though it's a time change here (backward) 02:30 happens only once */ + test_next("2017-04-02 02:30:00 Pacific/Auckland", "", 1491053400000000, -1); + 
test_next("2017-04-02 03:30:00 Pacific/Auckland", "", 12345, 1491060600000000); + /* Confirm that timezones in the Spec work regardless of current timezone */ + test_next("2017-09-09 20:42:00 Pacific/Auckland", "", 12345, 1504946520000000); + test_next("2017-09-09 20:42:00 Pacific/Auckland", "EET", 12345, 1504946520000000); + /* Check that we don't start looping if mktime() moves us backwards */ + test_next("Sun *-*-* 01:00:00 Europe/Dublin", "", 1616412478000000, 1617494400000000); + test_next("Sun *-*-* 01:00:00 Europe/Dublin", "IST", 1616412478000000, 1617494400000000); +} + +TEST(calendar_spec_from_string) { + CalendarSpec *c; + + assert_se(calendar_spec_from_string("test", &c) == -EINVAL); + assert_se(calendar_spec_from_string(" utc", &c) == -EINVAL); + assert_se(calendar_spec_from_string(" ", &c) == -EINVAL); + assert_se(calendar_spec_from_string("", &c) == -EINVAL); + assert_se(calendar_spec_from_string("7", &c) == -EINVAL); + assert_se(calendar_spec_from_string("121212:1:2", &c) == -EINVAL); + assert_se(calendar_spec_from_string("2000-03-05.23 00:00:00", &c) == -EINVAL); + assert_se(calendar_spec_from_string("2000-03-05 00:00.1:00", &c) == -EINVAL); + assert_se(calendar_spec_from_string("00:00:00/0.00000001", &c) == -ERANGE); + assert_se(calendar_spec_from_string("00:00:00.0..00.9", &c) == -EINVAL); + assert_se(calendar_spec_from_string("2016~11-22", &c) == -EINVAL); + assert_se(calendar_spec_from_string("*-*~5/5", &c) == -EINVAL); + assert_se(calendar_spec_from_string("Monday.. 
12:00", &c) == -EINVAL); + assert_se(calendar_spec_from_string("Monday..", &c) == -EINVAL); + assert_se(calendar_spec_from_string("-00:+00/-5", &c) == -EINVAL); + assert_se(calendar_spec_from_string("00:+00/-5", &c) == -EINVAL); + assert_se(calendar_spec_from_string("2016- 11- 24 12: 30: 00", &c) == -EINVAL); + assert_se(calendar_spec_from_string("*~29", &c) == -EINVAL); + assert_se(calendar_spec_from_string("*~16..31", &c) == -EINVAL); + assert_se(calendar_spec_from_string("12..1/2-*", &c) == -EINVAL); + assert_se(calendar_spec_from_string("20/4:00", &c) == -EINVAL); + assert_se(calendar_spec_from_string("00:00/60", &c) == -EINVAL); + assert_se(calendar_spec_from_string("00:00:2300", &c) == -ERANGE); + assert_se(calendar_spec_from_string("00:00:18446744073709551615", &c) == -ERANGE); + assert_se(calendar_spec_from_string("@88588582097858858", &c) == -ERANGE); + assert_se(calendar_spec_from_string("*:4,30:*,5", &c) == -EINVAL); + assert_se(calendar_spec_from_string("*:4,30:5,*", &c) == -EINVAL); + assert_se(calendar_spec_from_string("*:4,30:*\n", &c) == -EINVAL); +} + +static int intro(void) { + /* Tests have hard-coded results that do not expect a specific timezone to be set by the caller */ + assert_se(unsetenv("TZ") >= 0); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-cap-list.c b/src/test/test-cap-list.c new file mode 100644 index 0000000..a9cbf69 --- /dev/null +++ b/src/test/test-cap-list.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "cap-list.h" +#include "capability-util.h" +#include "parse-util.h" +#include "random-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +/* verify the capability parser */ +TEST(cap_list) { + assert_se(!capability_to_name(-1)); + assert_se(!capability_to_name(capability_list_length())); + assert_se(!capability_to_name(63)); + assert_se(!capability_to_name(64)); + + 
assert_se(!CAPABILITY_TO_STRING(-1)); + if (capability_list_length() <= 62) + assert_se(streq(CAPABILITY_TO_STRING(62), "0x3e")); + assert_se(!CAPABILITY_TO_STRING(64)); + + for (int i = 0; i < capability_list_length(); i++) { + const char *n; + + assert_se(n = capability_to_name(i)); + assert_se(capability_from_name(n) == i); + printf("%s = %i\n", n, i); + + assert_se(streq(CAPABILITY_TO_STRING(i), n)); + } + + assert_se(capability_from_name("asdfbsd") == -EINVAL); + assert_se(capability_from_name("CAP_AUDIT_READ") == CAP_AUDIT_READ); + assert_se(capability_from_name("cap_audit_read") == CAP_AUDIT_READ); + assert_se(capability_from_name("cAp_aUdIt_rEAd") == CAP_AUDIT_READ); + assert_se(capability_from_name("0") == 0); + assert_se(capability_from_name("15") == 15); + assert_se(capability_from_name("62") == 62); + assert_se(capability_from_name("63") == -EINVAL); + assert_se(capability_from_name("64") == -EINVAL); + assert_se(capability_from_name("-1") == -EINVAL); + + for (int i = 0; i < capability_list_length(); i++) { + _cleanup_cap_free_charp_ char *a = NULL; + const char *b; + unsigned u; + + assert_se(a = cap_to_name(i)); + + /* quit the loop as soon as libcap starts returning + * numeric ids, formatted as strings */ + if (safe_atou(a, &u) >= 0) + break; + + assert_se(b = capability_to_name(i)); + + printf("%s vs. 
%s\n", a, b); + + assert_se(strcasecmp(a, b) == 0); + } +} + +static void test_capability_set_one(uint64_t c, const char *t) { + _cleanup_free_ char *t1 = NULL; + uint64_t c1, c_masked = c & all_capabilities(); + + assert_se(capability_set_to_string(c, &t1) == 0); + assert_se(streq(t1, t)); + + assert_se(capability_set_from_string(t1, &c1) > 0); + assert_se(c1 == c_masked); + + free(t1); + assert_se(t1 = strjoin("'cap_chown cap_dac_override' \"cap_setgid cap_setuid\"", t, + " hogehoge foobar 18446744073709551616 3.14 -3 ", t)); + assert_se(capability_set_from_string(t1, &c1) == 0); + assert_se(c1 == c_masked); +} + +TEST(capability_set_from_string) { + uint64_t c; + + assert_se(capability_set_from_string(NULL, &c) > 0); + assert_se(c == 0); + + assert_se(capability_set_from_string("", &c) > 0); + assert_se(c == 0); + + assert_se(capability_set_from_string("0", &c) > 0); + assert_se(c == UINT64_C(1)); + + assert_se(capability_set_from_string("1", &c) > 0); + assert_se(c == UINT64_C(1) << 1); + + assert_se(capability_set_from_string("0 1 2 3", &c) > 0); + assert_se(c == (UINT64_C(1) << 4) - 1); +} + +static void test_capability_set_to_strv_one(uint64_t m, char **l) { + _cleanup_strv_free_ char **b = NULL; + + assert_se(capability_set_to_strv(m, &b) >= 0); + assert_se(strv_equal(l, b)); +} + +TEST(capability_set_to_strv) { + test_capability_set_to_strv_one(0, STRV_MAKE(NULL)); + test_capability_set_to_strv_one(UINT64_C(1) << CAP_MKNOD, STRV_MAKE("cap_mknod")); + test_capability_set_to_strv_one((UINT64_C(1) << CAP_MKNOD) | + (UINT64_C(1) << CAP_NET_BIND_SERVICE), STRV_MAKE("cap_net_bind_service", "cap_mknod")); + test_capability_set_to_strv_one((UINT64_C(1) << CAP_MKNOD) | + (UINT64_C(1) << CAP_NET_BIND_SERVICE) | + (UINT64_C(1) << CAP_IPC_OWNER), STRV_MAKE("cap_net_bind_service", "cap_ipc_owner", "cap_mknod")); +} + +static void test_capability_set_to_string_invalid(uint64_t invalid_cap_set) { + uint64_t c; + + test_capability_set_one(invalid_cap_set, ""); + + c = 
(UINT64_C(1) << CAP_DAC_OVERRIDE | invalid_cap_set); + test_capability_set_one(c, "cap_dac_override"); + + c = (UINT64_C(1) << CAP_CHOWN | + UINT64_C(1) << CAP_DAC_OVERRIDE | + UINT64_C(1) << CAP_DAC_READ_SEARCH | + UINT64_C(1) << CAP_FOWNER | + UINT64_C(1) << CAP_SETGID | + UINT64_C(1) << CAP_SETUID | + UINT64_C(1) << CAP_SYS_PTRACE | + UINT64_C(1) << CAP_SYS_ADMIN | + UINT64_C(1) << CAP_AUDIT_CONTROL | + UINT64_C(1) << CAP_MAC_OVERRIDE | + UINT64_C(1) << CAP_SYSLOG | + invalid_cap_set); + test_capability_set_one(c, ("cap_chown cap_dac_override cap_dac_read_search cap_fowner " + "cap_setgid cap_setuid cap_sys_ptrace cap_sys_admin " + "cap_audit_control cap_mac_override cap_syslog")); +} + +TEST(capability_set_to_string) { + test_capability_set_to_string_invalid(0); + + /* once the kernel supports 62 caps, there are no 'invalid' numbers + * for us to test with */ + if (cap_last_cap() < 62) + test_capability_set_to_string_invalid(all_capabilities() + 1); +} + +TEST(capability_set_to_string_negative) { + + for (unsigned i = 0; i < 150; i++) { + _cleanup_free_ char *a = NULL, *b = NULL; + + uint64_t m = + random_u64() % (UINT64_C(1) << (cap_last_cap() + 1)); + + assert_se(capability_set_to_string(m, &a) >= 0); + assert_se(capability_set_to_string_negative(m, &b) >= 0); + + printf("%s (%zu) → ", a, strlen(a)); + + if (streq(a, b)) + printf("same\n"); + else + printf("%s (%zu)\n", b, strlen(b)); + + assert_se(strlen(b) <= strlen(a)); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-capability.c b/src/test/test-capability.c new file mode 100644 index 0000000..e8a0569 --- /dev/null +++ b/src/test/test-capability.c @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#define TEST_CAPABILITY_C + +#include "alloc-util.h" +#include "capability-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "macro.h" +#include "missing_prctl.h" +#include 
"parse-util.h" +#include "process-util.h" +#include "string-util.h" +#include "tests.h" + +static uid_t test_uid = -1; +static gid_t test_gid = -1; + +#if HAS_FEATURE_ADDRESS_SANITIZER +/* Keep CAP_SYS_PTRACE when running under Address Sanitizer */ +static const uint64_t test_flags = UINT64_C(1) << CAP_SYS_PTRACE; +#else +/* We keep CAP_DAC_OVERRIDE to avoid errors with gcov when doing test coverage */ +static const uint64_t test_flags = UINT64_C(1) << CAP_DAC_OVERRIDE; +#endif + +/* verify cap_last_cap() against /proc/sys/kernel/cap_last_cap */ +static void test_last_cap_file(void) { + _cleanup_free_ char *content = NULL; + unsigned long val = 0; + int r; + + r = read_one_line_file("/proc/sys/kernel/cap_last_cap", &content); + if (r == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(r)) /* kernel pre 3.2 or no access */ + return; + assert_se(r >= 0); + + r = safe_atolu(content, &val); + assert_se(r >= 0); + assert_se(val != 0); + assert_se(val == cap_last_cap()); +} + +/* verify cap_last_cap() against syscall probing */ +static void test_last_cap_probe(void) { + unsigned long p = (unsigned long)CAP_LAST_CAP; + + if (prctl(PR_CAPBSET_READ, p) < 0) { + for (p--; p > 0; p --) + if (prctl(PR_CAPBSET_READ, p) >= 0) + break; + } else { + for (;; p++) + if (prctl(PR_CAPBSET_READ, p+1) < 0) + break; + } + + assert_se(p != 0); + assert_se(p == cap_last_cap()); +} + +static void fork_test(void (*test_func)(void)) { + pid_t pid = 0; + + pid = fork(); + assert_se(pid >= 0); + if (pid == 0) { + test_func(); + exit(EXIT_SUCCESS); + } else if (pid > 0) { + int status; + + assert_se(waitpid(pid, &status, 0) > 0); + assert_se(WIFEXITED(status) && WEXITSTATUS(status) == 0); + } +} + +static void show_capabilities(void) { + cap_t caps; + char *text; + + caps = cap_get_proc(); + assert_se(caps); + + text = cap_to_text(caps, NULL); + assert_se(text); + + log_info("Capabilities:%s", text); + cap_free(caps); + cap_free(text); +} + +static int setup_tests(bool *run_ambient) { + struct passwd *nobody; 
+ int r; + + nobody = getpwnam(NOBODY_USER_NAME); + if (!nobody) + return log_warning_errno(SYNTHETIC_ERRNO(ENOENT), "Couldn't find 'nobody' user: %m"); + + test_uid = nobody->pw_uid; + test_gid = nobody->pw_gid; + + r = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0); + /* There's support for PR_CAP_AMBIENT if the prctl() call succeeded or error code was something else + * than EINVAL. The EINVAL check should be good enough to rule out false positives. */ + *run_ambient = r >= 0 || errno != EINVAL; + + return 0; +} + +static void test_drop_privileges_keep_net_raw(void) { + int sock; + + sock = socket(AF_INET, SOCK_RAW, IPPROTO_UDP); + assert_se(sock >= 0); + safe_close(sock); + + assert_se(drop_privileges(test_uid, test_gid, test_flags | (1ULL << CAP_NET_RAW)) >= 0); + assert_se(getuid() == test_uid); + assert_se(getgid() == test_gid); + show_capabilities(); + + sock = socket(AF_INET, SOCK_RAW, IPPROTO_UDP); + assert_se(sock >= 0); + safe_close(sock); +} + +static void test_drop_privileges_dontkeep_net_raw(void) { + int sock; + + sock = socket(AF_INET, SOCK_RAW, IPPROTO_UDP); + assert_se(sock >= 0); + safe_close(sock); + + assert_se(drop_privileges(test_uid, test_gid, test_flags) >= 0); + assert_se(getuid() == test_uid); + assert_se(getgid() == test_gid); + show_capabilities(); + + sock = socket(AF_INET, SOCK_RAW, IPPROTO_UDP); + assert_se(sock < 0); +} + +static void test_drop_privileges_fail(void) { + assert_se(drop_privileges(test_uid, test_gid, test_flags) >= 0); + assert_se(getuid() == test_uid); + assert_se(getgid() == test_gid); + + assert_se(drop_privileges(test_uid, test_gid, test_flags) < 0); + assert_se(drop_privileges(0, 0, test_flags) < 0); +} + +static void test_drop_privileges(void) { + fork_test(test_drop_privileges_fail); + + if (have_effective_cap(CAP_NET_RAW) <= 0) /* The remaining two tests only work if we have CAP_NET_RAW + * in the first place. If we are run in some restricted + * container environment we might not. 
*/ + return; + + fork_test(test_drop_privileges_keep_net_raw); + fork_test(test_drop_privileges_dontkeep_net_raw); +} + +static void test_have_effective_cap(void) { + assert_se(have_effective_cap(CAP_KILL) > 0); + assert_se(have_effective_cap(CAP_CHOWN) > 0); + + assert_se(drop_privileges(test_uid, test_gid, test_flags | (1ULL << CAP_KILL)) >= 0); + assert_se(getuid() == test_uid); + assert_se(getgid() == test_gid); + + assert_se(have_effective_cap(CAP_KILL) > 0); + assert_se(have_effective_cap(CAP_CHOWN) == 0); +} + +static void test_update_inherited_set(void) { + cap_t caps; + uint64_t set = 0; + cap_flag_value_t fv; + + caps = cap_get_proc(); + assert_se(caps); + + set = (UINT64_C(1) << CAP_CHOWN); + + assert_se(!capability_update_inherited_set(caps, set)); + assert_se(!cap_get_flag(caps, CAP_CHOWN, CAP_INHERITABLE, &fv)); + assert_se(fv == CAP_SET); + + cap_free(caps); +} + +static void test_apply_ambient_caps(void) { + cap_t caps; + uint64_t set = 0; + cap_flag_value_t fv; + + assert_se(prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0) == 0); + + set = (UINT64_C(1) << CAP_CHOWN); + + assert_se(!capability_ambient_set_apply(set, true)); + + caps = cap_get_proc(); + assert_se(caps); + assert_se(!cap_get_flag(caps, CAP_CHOWN, CAP_INHERITABLE, &fv)); + assert_se(fv == CAP_SET); + cap_free(caps); + + assert_se(prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0) == 1); + + assert_se(!capability_ambient_set_apply(0, true)); + caps = cap_get_proc(); + assert_se(caps); + assert_se(!cap_get_flag(caps, CAP_CHOWN, CAP_INHERITABLE, &fv)); + assert_se(fv == CAP_CLEAR); + cap_free(caps); + + assert_se(prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0) == 0); +} + +static void test_ensure_cap_64_bit(void) { + _cleanup_free_ char *content = NULL; + unsigned long p = 0; + int r; + + r = read_one_line_file("/proc/sys/kernel/cap_last_cap", &content); + if (r == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(r)) /* kernel pre 3.2 or no access */ + return; + 
assert_se(r >= 0); + + assert_se(safe_atolu(content, &p) >= 0); + + /* If caps don't fit into 64-bit anymore, we have a problem, fail the test. */ + assert_se(p <= 63); + + /* Also check for the header definition */ + assert_cc(CAP_LAST_CAP <= 63); +} + +static void test_capability_get_ambient(void) { + uint64_t c; + int r; + + assert_se(capability_get_ambient(&c) >= 0); + + r = safe_fork("(getambient)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL); + assert_se(r >= 0); + + if (r == 0) { + int x, y; + /* child */ + assert_se(capability_get_ambient(&c) >= 0); + + x = capability_ambient_set_apply( + (UINT64_C(1) << CAP_MKNOD)| + (UINT64_C(1) << CAP_LINUX_IMMUTABLE), + /* also_inherit= */ true); + assert_se(x >= 0 || ERRNO_IS_PRIVILEGE(x)); + + assert_se(capability_get_ambient(&c) >= 0); + assert_se(x < 0 || FLAGS_SET(c, UINT64_C(1) << CAP_MKNOD)); + assert_se(x < 0 || FLAGS_SET(c, UINT64_C(1) << CAP_LINUX_IMMUTABLE)); + assert_se(x < 0 || !FLAGS_SET(c, UINT64_C(1) << CAP_SETPCAP)); + + y = capability_bounding_set_drop( + ((UINT64_C(1) << CAP_LINUX_IMMUTABLE)| + (UINT64_C(1) << CAP_SETPCAP)), + /* right_now= */ true); + assert_se(y >= 0 || ERRNO_IS_PRIVILEGE(y)); + + assert_se(capability_get_ambient(&c) >= 0); + assert_se(x < 0 || y < 0 || !FLAGS_SET(c, UINT64_C(1) << CAP_MKNOD)); + assert_se(x < 0 || y < 0 || FLAGS_SET(c, UINT64_C(1) << CAP_LINUX_IMMUTABLE)); + assert_se(x < 0 || y < 0 || !FLAGS_SET(c, UINT64_C(1) << CAP_SETPCAP)); + + y = capability_bounding_set_drop( + (UINT64_C(1) << CAP_SETPCAP), + /* right_now= */ true); + assert_se(y >= 0 || ERRNO_IS_PRIVILEGE(y)); + + assert_se(capability_get_ambient(&c) >= 0); + assert_se(x < 0 || y < 0 || !FLAGS_SET(c, UINT64_C(1) << CAP_MKNOD)); + assert_se(x < 0 || y < 0 || !FLAGS_SET(c, UINT64_C(1) << CAP_LINUX_IMMUTABLE)); + assert_se(x < 0 || y < 0 || !FLAGS_SET(c, UINT64_C(1) << CAP_SETPCAP)); + + _exit(EXIT_SUCCESS); + } +} + +int main(int argc, char *argv[]) { + bool run_ambient; + + 
test_setup_logging(LOG_DEBUG); + + test_ensure_cap_64_bit(); + + test_last_cap_file(); + test_last_cap_probe(); + + log_info("have ambient caps: %s", yes_no(ambient_capabilities_supported())); + + if (getuid() != 0) + return log_tests_skipped("not running as root"); + + if (setup_tests(&run_ambient) < 0) + return log_tests_skipped("setup failed"); + + show_capabilities(); + + test_drop_privileges(); + test_update_inherited_set(); + + fork_test(test_have_effective_cap); + + if (run_ambient) + fork_test(test_apply_ambient_caps); + + test_capability_get_ambient(); + + return 0; +} diff --git a/src/test/test-cgroup-cpu.c b/src/test/test-cgroup-cpu.c new file mode 100644 index 0000000..fcf84d3 --- /dev/null +++ b/src/test/test-cgroup-cpu.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "cgroup.h" +#include "log.h" +#include "tests.h" + +TEST(group_cpu_adjust_period) { + /* Period 1ms, quota 40% -> Period 2.5ms */ + assert_se(2500 == cgroup_cpu_adjust_period(USEC_PER_MSEC, 400 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 10ms, quota 10% -> keep. */ + assert_se(10 * USEC_PER_MSEC == cgroup_cpu_adjust_period(10 * USEC_PER_MSEC, 100 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 1ms, quota 1000% -> keep. */ + assert_se(USEC_PER_MSEC == cgroup_cpu_adjust_period(USEC_PER_MSEC, 10000 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 100ms, quota 30% -> keep. */ + assert_se(100 * USEC_PER_MSEC == cgroup_cpu_adjust_period(100 * USEC_PER_MSEC, 300 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 5s, quota 40% -> adjust to 1s. */ + assert_se(USEC_PER_SEC == cgroup_cpu_adjust_period(5 * USEC_PER_SEC, 400 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 2s, quota 250% -> adjust to 1s. */ + assert_se(USEC_PER_SEC == cgroup_cpu_adjust_period(2 * USEC_PER_SEC, 2500 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 10us, quota 5,000,000% -> adjust to 1ms. 
*/ + assert_se(USEC_PER_MSEC == cgroup_cpu_adjust_period(10, 50000000 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 10ms, quota 50,000% -> keep. */ + assert_se(10 * USEC_PER_MSEC == cgroup_cpu_adjust_period(10 * USEC_PER_MSEC, 500000 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 10ms, quota 1% -> adjust to 100ms. */ + assert_se(100 * USEC_PER_MSEC == cgroup_cpu_adjust_period(10 * USEC_PER_MSEC, 10 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 10ms, quota .001% -> adjust to 1s. */ + assert_se(1 * USEC_PER_SEC == cgroup_cpu_adjust_period(10 * USEC_PER_MSEC, 10, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 0ms, quota 200% -> adjust to 1ms. */ + assert_se(1 * USEC_PER_MSEC == cgroup_cpu_adjust_period(0, 2 * USEC_PER_SEC, USEC_PER_MSEC, USEC_PER_SEC)); + /* Period 0ms, quota 40% -> adjust to 2.5ms. */ + assert_se(2500 == cgroup_cpu_adjust_period(0, 400 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-cgroup-mask.c b/src/test/test-cgroup-mask.c new file mode 100644 index 0000000..bfc8fac --- /dev/null +++ b/src/test/test-cgroup-mask.c @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "cgroup.h" +#include "cgroup-util.h" +#include "macro.h" +#include "manager.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "unit.h" + +#define ASSERT_CGROUP_MASK(got, expected) \ + log_cgroup_mask(got, expected); \ + assert_se(got == expected) + +#define ASSERT_CGROUP_MASK_JOINED(got, expected) ASSERT_CGROUP_MASK(got, CGROUP_MASK_EXTEND_JOINED(expected)) + +static void log_cgroup_mask(CGroupMask got, CGroupMask expected) { + _cleanup_free_ char *e_store = NULL, *g_store = NULL; + + assert_se(cg_mask_to_string(expected, &e_store) >= 0); + log_info("Expected mask: %s", e_store); + assert_se(cg_mask_to_string(got, &g_store) >= 0); + log_info("Got mask: %s", g_store); +} + +TEST_RET(cgroup_mask, .sd_booted = true) { + 
_cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep, *nomem_parent, *nomem_leaf; + int r; + CGroupMask cpu_accounting_mask = get_cpu_accounting_mask(); + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + /* Prepare the manager. */ + _cleanup_free_ char *unit_dir = NULL; + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m); + if (IN_SET(r, -EPERM, -EACCES)) { + log_error_errno(r, "manager_new: %m"); + return log_tests_skipped("cannot create manager"); + } + + assert_se(r >= 0); + + /* Turn off all kinds of default accounting, so that we can + * verify the masks resulting of our configuration and nothing + * else. */ + m->defaults.cpu_accounting = + m->defaults.memory_accounting = + m->defaults.blockio_accounting = + m->defaults.io_accounting = + m->defaults.tasks_accounting = false; + m->defaults.tasks_max = CGROUP_TASKS_MAX_UNSET; + + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + /* Load units and verify hierarchy. 
*/ + assert_se(manager_load_startable_unit_or_warn(m, "parent.slice", NULL, &parent) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "son.service", NULL, &son) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "daughter.service", NULL, &daughter) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "grandchild.service", NULL, &grandchild) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "parent-deep.slice", NULL, &parent_deep) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "nomem.slice", NULL, &nomem_parent) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "nomemleaf.service", NULL, &nomem_leaf) >= 0); + assert_se(UNIT_GET_SLICE(son) == parent); + assert_se(UNIT_GET_SLICE(daughter) == parent); + assert_se(UNIT_GET_SLICE(parent_deep) == parent); + assert_se(UNIT_GET_SLICE(grandchild) == parent_deep); + assert_se(UNIT_GET_SLICE(nomem_leaf) == nomem_parent); + root = UNIT_GET_SLICE(parent); + assert_se(UNIT_GET_SLICE(nomem_parent) == root); + + /* Verify per-unit cgroups settings. 
*/ + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(son), CGROUP_MASK_CPU); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(daughter), cpu_accounting_mask); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(grandchild), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent_deep), CGROUP_MASK_MEMORY); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_parent), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(root), 0); + + /* Verify aggregation of member masks */ + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(son), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(daughter), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(grandchild), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent_deep), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_leaf), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + + /* Verify aggregation of sibling masks. 
*/ + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(son), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(daughter), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(grandchild), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent_deep), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_parent), (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + + /* Verify aggregation of target masks. 
*/ + ASSERT_CGROUP_MASK(unit_get_target_mask(son), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(daughter), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(grandchild), 0); + ASSERT_CGROUP_MASK(unit_get_target_mask(parent_deep), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_leaf), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(root), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + + /* Verify aggregation of enable masks. 
*/ + ASSERT_CGROUP_MASK(unit_get_enable_mask(son), 0); + ASSERT_CGROUP_MASK(unit_get_enable_mask(daughter), 0); + ASSERT_CGROUP_MASK(unit_get_enable_mask(grandchild), 0); + ASSERT_CGROUP_MASK(unit_get_enable_mask(parent_deep), 0); + ASSERT_CGROUP_MASK(unit_get_enable_mask(parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_enable_mask(nomem_parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_enable_mask(nomem_leaf), 0); + ASSERT_CGROUP_MASK(unit_get_enable_mask(root), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + + return 0; +} + +static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) { + _cleanup_free_ char *b = NULL; + + assert_se(cg_mask_to_string(mask, &b) >= 0); + assert_se(streq_ptr(b, t)); +} + +TEST(cg_mask_to_string) { + test_cg_mask_to_string_one(0, NULL); + test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct cpuset io blkio memory devices pids bpf-firewall bpf-devices bpf-foreign bpf-socket-bind bpf-restrict-network-interfaces"); + test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu"); + test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct"); + test_cg_mask_to_string_one(CGROUP_MASK_CPUSET, "cpuset"); + test_cg_mask_to_string_one(CGROUP_MASK_IO, "io"); + test_cg_mask_to_string_one(CGROUP_MASK_BLKIO, "blkio"); + test_cg_mask_to_string_one(CGROUP_MASK_MEMORY, "memory"); + test_cg_mask_to_string_one(CGROUP_MASK_DEVICES, "devices"); + test_cg_mask_to_string_one(CGROUP_MASK_PIDS, "pids"); + test_cg_mask_to_string_one(CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT, "cpu cpuacct"); + test_cg_mask_to_string_one(CGROUP_MASK_CPU|CGROUP_MASK_PIDS, "cpu pids"); + test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT|CGROUP_MASK_PIDS, "cpuacct pids"); + 
test_cg_mask_to_string_one(CGROUP_MASK_DEVICES|CGROUP_MASK_PIDS, "devices pids"); + test_cg_mask_to_string_one(CGROUP_MASK_IO|CGROUP_MASK_BLKIO, "io blkio"); +} + +static void cgroup_device_permissions_test_normalize(const char *a, const char *b) { + assert_se(streq_ptr(cgroup_device_permissions_to_string(cgroup_device_permissions_from_string(a)), b)); +} + +TEST(cgroup_device_permissions) { + for (CGroupDevicePermissions p = 0; p < _CGROUP_DEVICE_PERMISSIONS_MAX; p++) { + const char *s; + + assert_se(s = cgroup_device_permissions_to_string(p)); + assert_se(cgroup_device_permissions_from_string(s) == p); + } + + cgroup_device_permissions_test_normalize("", ""); + cgroup_device_permissions_test_normalize("rw", "rw"); + cgroup_device_permissions_test_normalize("wr", "rw"); + cgroup_device_permissions_test_normalize("wwrr", "rw"); + cgroup_device_permissions_test_normalize("mmmmmmmmmmmmmm", "m"); + cgroup_device_permissions_test_normalize("mmmmrrrrmmmwwmwmwmwmwmrmrmr", "rwm"); + + assert_se(cgroup_device_permissions_from_string(NULL) == -EINVAL); + assert_se(cgroup_device_permissions_from_string("rwq") == -EINVAL); + assert_se(cgroup_device_permissions_from_string("RW") == -EINVAL); + assert_se(cgroup_device_permissions_from_string("") == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-cgroup-setup.c b/src/test/test-cgroup-setup.c new file mode 100644 index 0000000..e669e9b --- /dev/null +++ b/src/test/test-cgroup-setup.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "cgroup-setup.h" +#include "errno-util.h" +#include "log.h" +#include "proc-cmdline.h" +#include "string-util.h" +#include "tests.h" + +static void test_is_wanted_print_one(bool header) { + _cleanup_free_ char *cmdline = NULL; + + log_info("-- %s --", __func__); + assert_se(proc_cmdline(&cmdline) >= 0); + log_info("cmdline: %s", cmdline); + if (header) { + log_info("default-hierarchy=" DEFAULT_HIERARCHY_NAME); + (void) 
system("findmnt -n /sys/fs/cgroup"); + } + + log_info("is_unified_wanted() → %s", yes_no(cg_is_unified_wanted())); + log_info("is_hybrid_wanted() → %s", yes_no(cg_is_hybrid_wanted())); + log_info("is_legacy_wanted() → %s", yes_no(cg_is_legacy_wanted())); + log_info(" "); +} + +TEST(is_wanted_print) { + test_is_wanted_print_one(true); + test_is_wanted_print_one(false); /* run twice to test caching */ +} + +TEST(is_wanted) { + assert_se(setenv("SYSTEMD_PROC_CMDLINE", + "systemd.unified_cgroup_hierarchy", 1) >= 0); + test_is_wanted_print_one(false); + + assert_se(setenv("SYSTEMD_PROC_CMDLINE", + "systemd.unified_cgroup_hierarchy=0", 1) >= 0); + test_is_wanted_print_one(false); + + assert_se(setenv("SYSTEMD_PROC_CMDLINE", + "systemd.unified_cgroup_hierarchy=0 " + "systemd.legacy_systemd_cgroup_controller", 1) >= 0); + test_is_wanted_print_one(false); + + assert_se(setenv("SYSTEMD_PROC_CMDLINE", + "systemd.unified_cgroup_hierarchy=0 " + "systemd.legacy_systemd_cgroup_controller=0", 1) >= 0); + test_is_wanted_print_one(false); + + /* cgroup_no_v1=all implies unified cgroup hierarchy, unless otherwise + * explicitly specified. 
*/ + assert_se(setenv("SYSTEMD_PROC_CMDLINE", + "cgroup_no_v1=all", 1) >= 0); + test_is_wanted_print_one(false); + + assert_se(setenv("SYSTEMD_PROC_CMDLINE", + "cgroup_no_v1=all " + "systemd.unified_cgroup_hierarchy=0", 1) >= 0); + test_is_wanted_print_one(false); +} + +static int intro(void) { + if (access("/proc/cmdline", R_OK) < 0 && ERRNO_IS_PRIVILEGE(errno)) + return log_tests_skipped("can't read /proc/cmdline"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-cgroup-unit-default.c b/src/test/test-cgroup-unit-default.c new file mode 100644 index 0000000..62618ce --- /dev/null +++ b/src/test/test-cgroup-unit-default.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "cgroup.h" +#include "manager.h" +#include "rm-rf.h" +#include "tests.h" +#include "unit.h" + +TEST_RET(default_memory_low, .sd_booted = true) { + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + Unit *root, *dml, + *dml_passthrough, *dml_passthrough_empty, *dml_passthrough_set_dml, *dml_passthrough_set_ml, + *dml_override, *dml_override_empty, + *dml_discard, *dml_discard_empty, *dml_discard_set_ml; + uint64_t dml_tree_default; + int r; + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + _cleanup_free_ char *unit_dir = NULL; + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m); + if (IN_SET(r, -EPERM, -EACCES)) { + log_error_errno(r, "manager_new: %m"); + return log_tests_skipped("cannot create manager"); + } + + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + /* dml.slice has DefaultMemoryLow=50. Beyond that, individual subhierarchies look like this: + * + * 1. 
dml-passthrough.slice sets MemoryLow=100. This should not affect its children, as only + * DefaultMemoryLow is propagated, not MemoryLow. As such, all leaf services should end up with + * memory.low as 50, inherited from dml.slice, *except* for dml-passthrough-set-ml.service, which + * should have the value of 0, as it has MemoryLow explicitly set. + * + * ┌───────────┐ + * │ dml.slice │ + * └─────┬─────┘ + * MemoryLow=100 + * ┌───────────┴───────────┐ + * │ dml-passthrough.slice │ + * └───────────┬───────────┘ + * ┌───────────────────────────────────┼───────────────────────────────────┐ + * no new settings DefaultMemoryLow=15 MemoryLow=0 + * ┌───────────────┴───────────────┐ ┌────────────────┴────────────────┐ ┌───────────────┴────────────────┐ + * │ dml-passthrough-empty.service │ │ dml-passthrough-set-dml.service │ │ dml-passthrough-set-ml.service │ + * └───────────────────────────────┘ └─────────────────────────────────┘ └────────────────────────────────┘ + * + * 2. dml-override.slice sets DefaultMemoryLow=10. As such, dml-override-empty.service should also + * end up with a memory.low of 10. dml-override.slice should still have a memory.low of 50. + * + * ┌───────────┐ + * │ dml.slice │ + * └─────┬─────┘ + * DefaultMemoryLow=10 + * ┌─────────┴──────────┐ + * │ dml-override.slice │ + * └─────────┬──────────┘ + * no new settings + * ┌─────────────┴──────────────┐ + * │ dml-override-empty.service │ + * └────────────────────────────┘ + * + * 3. dml-discard.slice sets DefaultMemoryLow= with no rvalue. As such, + * dml-discard-empty.service should end up with a value of 0. + * dml-discard-set-ml.service sets MemoryLow=15, and as such should have that override the + * reset DefaultMemoryLow value. dml-discard.slice should still have an eventual memory.low of 50. 
+ * + * ┌───────────┐ + * │ dml.slice │ + * └─────┬─────┘ + * DefaultMemoryLow= + * ┌─────────┴─────────┐ + * │ dml-discard.slice │ + * └─────────┬─────────┘ + * ┌──────────────┴───────────────┐ + * no new settings MemoryLow=15 + * ┌─────────────┴─────────────┐ ┌─────────────┴──────────────┐ + * │ dml-discard-empty.service │ │ dml-discard-set-ml.service │ + * └───────────────────────────┘ └────────────────────────────┘ + */ + assert_se(manager_load_startable_unit_or_warn(m, "dml.slice", NULL, &dml) >= 0); + + assert_se(manager_load_startable_unit_or_warn(m, "dml-passthrough.slice", NULL, &dml_passthrough) >= 0); + assert_se(UNIT_GET_SLICE(dml_passthrough) == dml); + assert_se(manager_load_startable_unit_or_warn(m, "dml-passthrough-empty.service", NULL, &dml_passthrough_empty) >= 0); + assert_se(UNIT_GET_SLICE(dml_passthrough_empty) == dml_passthrough); + assert_se(manager_load_startable_unit_or_warn(m, "dml-passthrough-set-dml.service", NULL, &dml_passthrough_set_dml) >= 0); + assert_se(UNIT_GET_SLICE(dml_passthrough_set_dml) == dml_passthrough); + assert_se(manager_load_startable_unit_or_warn(m, "dml-passthrough-set-ml.service", NULL, &dml_passthrough_set_ml) >= 0); + assert_se(UNIT_GET_SLICE(dml_passthrough_set_ml) == dml_passthrough); + + assert_se(manager_load_startable_unit_or_warn(m, "dml-override.slice", NULL, &dml_override) >= 0); + assert_se(UNIT_GET_SLICE(dml_override) == dml); + assert_se(manager_load_startable_unit_or_warn(m, "dml-override-empty.service", NULL, &dml_override_empty) >= 0); + assert_se(UNIT_GET_SLICE(dml_override_empty) == dml_override); + + assert_se(manager_load_startable_unit_or_warn(m, "dml-discard.slice", NULL, &dml_discard) >= 0); + assert_se(UNIT_GET_SLICE(dml_discard) == dml); + assert_se(manager_load_startable_unit_or_warn(m, "dml-discard-empty.service", NULL, &dml_discard_empty) >= 0); + assert_se(UNIT_GET_SLICE(dml_discard_empty) == dml_discard); + assert_se(manager_load_startable_unit_or_warn(m, "dml-discard-set-ml.service", 
NULL, &dml_discard_set_ml) >= 0); + assert_se(UNIT_GET_SLICE(dml_discard_set_ml) == dml_discard); + + assert_se(root = UNIT_GET_SLICE(dml)); + assert_se(!UNIT_GET_SLICE(root)); + + assert_se(unit_get_ancestor_memory_low(root) == CGROUP_LIMIT_MIN); + + assert_se(unit_get_ancestor_memory_low(dml) == CGROUP_LIMIT_MIN); + dml_tree_default = unit_get_cgroup_context(dml)->default_memory_low; + assert_se(dml_tree_default == 50); + + assert_se(unit_get_ancestor_memory_low(dml_passthrough) == 100); + assert_se(unit_get_ancestor_memory_low(dml_passthrough_empty) == dml_tree_default); + assert_se(unit_get_ancestor_memory_low(dml_passthrough_set_dml) == 50); + assert_se(unit_get_ancestor_memory_low(dml_passthrough_set_ml) == 0); + + assert_se(unit_get_ancestor_memory_low(dml_override) == dml_tree_default); + assert_se(unit_get_ancestor_memory_low(dml_override_empty) == 10); + + assert_se(unit_get_ancestor_memory_low(dml_discard) == dml_tree_default); + assert_se(unit_get_ancestor_memory_low(dml_discard_empty) == CGROUP_LIMIT_MIN); + assert_se(unit_get_ancestor_memory_low(dml_discard_set_ml) == 15); + + return 0; +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c new file mode 100644 index 0000000..51f52d9 --- /dev/null +++ b/src/test/test-cgroup-util.c @@ -0,0 +1,466 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "dirent-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "special.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "user-util.h" + +static void check_p_d_u(const char *path, int code, const char *result) { + _cleanup_free_ char *unit = NULL; + int r; + + r = cg_path_decode_unit(path, &unit); + printf("%s: %s → %s %d expected %s %d\n", __func__, path, unit, r, strnull(result), 
code); + assert_se(r == code); + assert_se(streq_ptr(unit, result)); +} + +TEST(path_decode_unit) { + check_p_d_u("getty@tty2.service", 0, "getty@tty2.service"); + check_p_d_u("getty@tty2.service/", 0, "getty@tty2.service"); + check_p_d_u("getty@tty2.service/xxx", 0, "getty@tty2.service"); + check_p_d_u("getty@.service/", -ENXIO, NULL); + check_p_d_u("getty@.service", -ENXIO, NULL); + check_p_d_u("getty.service", 0, "getty.service"); + check_p_d_u("getty", -ENXIO, NULL); + check_p_d_u("getty/waldo", -ENXIO, NULL); + check_p_d_u("_cpu.service", 0, "cpu.service"); +} + +static void check_p_g_u(const char *path, int code, const char *result) { + _cleanup_free_ char *unit = NULL; + int r; + + r = cg_path_get_unit(path, &unit); + printf("%s: %s → %s %d expected %s %d\n", __func__, path, unit, r, strnull(result), code); + assert_se(r == code); + assert_se(streq_ptr(unit, result)); +} + +TEST(path_get_unit) { + check_p_g_u("/system.slice/foobar.service/sdfdsaf", 0, "foobar.service"); + check_p_g_u("/system.slice/getty@tty5.service", 0, "getty@tty5.service"); + check_p_g_u("/system.slice/getty@tty5.service/aaa/bbb", 0, "getty@tty5.service"); + check_p_g_u("/system.slice/getty@tty5.service/", 0, "getty@tty5.service"); + check_p_g_u("/system.slice/getty@tty6.service/tty5", 0, "getty@tty6.service"); + check_p_g_u("sadfdsafsda", -ENXIO, NULL); + check_p_g_u("/system.slice/getty####@tty6.service/xxx", -ENXIO, NULL); + check_p_g_u("/system.slice/system-waldo.slice/foobar.service/sdfdsaf", 0, "foobar.service"); + check_p_g_u("/system.slice/system-waldo.slice/_cpu.service/sdfdsaf", 0, "cpu.service"); + check_p_g_u("/user.slice/user-1000.slice/user@1000.service/server.service", 0, "user@1000.service"); + check_p_g_u("/user.slice/user-1000.slice/user@.service/server.service", -ENXIO, NULL); +} + +static void check_p_g_u_p(const char *path, int code, const char *result) { + _cleanup_free_ char *unit_path = NULL; + int r; + + r = cg_path_get_unit_path(path, &unit_path); + printf("%s: 
%s → %s %d expected %s %d\n", __func__, path, unit_path, r, strnull(result), code); + assert_se(r == code); + assert_se(streq_ptr(unit_path, result)); +} + +TEST(path_get_unit_path) { + check_p_g_u_p("/system.slice/foobar.service/sdfdsaf", 0, "/system.slice/foobar.service"); + check_p_g_u_p("/system.slice/getty@tty5.service", 0, "/system.slice/getty@tty5.service"); + check_p_g_u_p("/system.slice/getty@tty5.service/aaa/bbb", 0, "/system.slice/getty@tty5.service"); + check_p_g_u_p("/system.slice/getty@tty5.service/", 0, "/system.slice/getty@tty5.service"); + check_p_g_u_p("/system.slice/getty@tty6.service/tty5", 0, "/system.slice/getty@tty6.service"); + check_p_g_u_p("sadfdsafsda", -ENXIO, NULL); + check_p_g_u_p("/system.slice/getty####@tty6.service/xxx", -ENXIO, NULL); + check_p_g_u_p("/system.slice/system-waldo.slice/foobar.service/sdfdsaf", 0, "/system.slice/system-waldo.slice/foobar.service"); + check_p_g_u_p("/system.slice/system-waldo.slice/_cpu.service/sdfdsaf", 0, "/system.slice/system-waldo.slice/_cpu.service"); + check_p_g_u_p("/system.slice/system-waldo.slice/_cpu.service", 0, "/system.slice/system-waldo.slice/_cpu.service"); + check_p_g_u_p("/user.slice/user-1000.slice/user@1000.service/server.service", 0, "/user.slice/user-1000.slice/user@1000.service"); + check_p_g_u_p("/user.slice/user-1000.slice/user@.service/server.service", -ENXIO, NULL); + check_p_g_u_p("/user.slice/_user-1000.slice/user@1000.service/foobar.slice/foobar@pie.service", 0, "/user.slice/_user-1000.slice/user@1000.service"); + check_p_g_u_p("/_session-2.scope/_foobar@pie.service/pa/po", 0, "/_session-2.scope"); +} + +static void check_p_g_u_u(const char *path, int code, const char *result) { + _cleanup_free_ char *unit = NULL; + int r; + + r = cg_path_get_user_unit(path, &unit); + printf("%s: %s → %s %d expected %s %d\n", __func__, path, unit, r, strnull(result), code); + assert_se(r == code); + assert_se(streq_ptr(unit, result)); +} + +TEST(path_get_user_unit) { + 
check_p_g_u_u("/user.slice/user-1000.slice/session-2.scope/foobar.service", 0, "foobar.service"); + check_p_g_u_u("/user.slice/user-1000.slice/session-2.scope/waldo.slice/foobar.service", 0, "foobar.service"); + check_p_g_u_u("/user.slice/user-1002.slice/session-2.scope/foobar.service/waldo", 0, "foobar.service"); + check_p_g_u_u("/user.slice/user-1000.slice/session-2.scope/foobar.service/waldo/uuuux", 0, "foobar.service"); + check_p_g_u_u("/user.slice/user-1000.slice/session-2.scope/waldo/waldo/uuuux", -ENXIO, NULL); + check_p_g_u_u("/user.slice/user-1000.slice/session-2.scope/foobar@pie.service/pa/po", 0, "foobar@pie.service"); + check_p_g_u_u("/session-2.scope/foobar@pie.service/pa/po", 0, "foobar@pie.service"); + check_p_g_u_u("/xyz.slice/xyz-waldo.slice/session-77.scope/foobar@pie.service/pa/po", 0, "foobar@pie.service"); + check_p_g_u_u("/meh.service", -ENXIO, NULL); + check_p_g_u_u("/session-3.scope/_cpu.service", 0, "cpu.service"); + check_p_g_u_u("/user.slice/user-1000.slice/user@1000.service/server.service", 0, "server.service"); + check_p_g_u_u("/user.slice/user-1000.slice/user@1000.service/foobar.slice/foobar@pie.service", 0, "foobar@pie.service"); + check_p_g_u_u("/user.slice/user-1000.slice/user@.service/server.service", -ENXIO, NULL); +} + +static void check_p_g_s(const char *path, int code, const char *result) { + _cleanup_free_ char *s = NULL; + + assert_se(cg_path_get_session(path, &s) == code); + assert_se(streq_ptr(s, result)); +} + +TEST(path_get_session) { + check_p_g_s("/user.slice/user-1000.slice/session-2.scope/foobar.service", 0, "2"); + check_p_g_s("/session-3.scope", 0, "3"); + check_p_g_s("/session-.scope", -ENXIO, NULL); + check_p_g_s("", -ENXIO, NULL); +} + +static void check_p_g_o_u(const char *path, int code, uid_t result) { + uid_t uid = 0; + + assert_se(cg_path_get_owner_uid(path, &uid) == code); + assert_se(uid == result); +} + +TEST(path_get_owner_uid) { + 
check_p_g_o_u("/user.slice/user-1000.slice/session-2.scope/foobar.service", 0, 1000); + check_p_g_o_u("/user.slice/user-1006.slice", 0, 1006); + check_p_g_o_u("", -ENXIO, 0); +} + +static void check_p_g_slice(const char *path, int code, const char *result) { + _cleanup_free_ char *s = NULL; + + assert_se(cg_path_get_slice(path, &s) == code); + assert_se(streq_ptr(s, result)); +} + +TEST(path_get_slice) { + check_p_g_slice("/user.slice", 0, "user.slice"); + check_p_g_slice("/foobar", 0, SPECIAL_ROOT_SLICE); + check_p_g_slice("/user.slice/user-waldo.slice", 0, "user-waldo.slice"); + check_p_g_slice("", 0, SPECIAL_ROOT_SLICE); + check_p_g_slice("foobar", 0, SPECIAL_ROOT_SLICE); + check_p_g_slice("foobar.slice", 0, "foobar.slice"); + check_p_g_slice("foo.slice/foo-bar.slice/waldo.service", 0, "foo-bar.slice"); +} + +static void check_p_g_u_slice(const char *path, int code, const char *result) { + _cleanup_free_ char *s = NULL; + + assert_se(cg_path_get_user_slice(path, &s) == code); + assert_se(streq_ptr(s, result)); +} + +TEST(path_get_user_slice) { + check_p_g_u_slice("/user.slice", -ENXIO, NULL); + check_p_g_u_slice("/foobar", -ENXIO, NULL); + check_p_g_u_slice("/user.slice/user-waldo.slice", -ENXIO, NULL); + check_p_g_u_slice("", -ENXIO, NULL); + check_p_g_u_slice("foobar", -ENXIO, NULL); + check_p_g_u_slice("foobar.slice", -ENXIO, NULL); + check_p_g_u_slice("foo.slice/foo-bar.slice/waldo.service", -ENXIO, NULL); + + check_p_g_u_slice("foo.slice/foo-bar.slice/user@1000.service", 0, SPECIAL_ROOT_SLICE); + check_p_g_u_slice("foo.slice/foo-bar.slice/user@1000.service/", 0, SPECIAL_ROOT_SLICE); + check_p_g_u_slice("foo.slice/foo-bar.slice/user@1000.service///", 0, SPECIAL_ROOT_SLICE); + check_p_g_u_slice("foo.slice/foo-bar.slice/user@1000.service/waldo.service", 0, SPECIAL_ROOT_SLICE); + check_p_g_u_slice("foo.slice/foo-bar.slice/user@1000.service/piep.slice/foo.service", 0, "piep.slice"); + 
check_p_g_u_slice("/foo.slice//foo-bar.slice/user@1000.service/piep.slice//piep-pap.slice//foo.service", 0, "piep-pap.slice"); +} + +TEST(get_paths, .sd_booted = true) { + _cleanup_free_ char *a = NULL; + + assert_se(cg_get_root_path(&a) >= 0); + log_info("Root = %s", a); +} + +TEST(proc) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert_se(proc_dir_open(&d) >= 0); + + for (;;) { + _cleanup_free_ char *path = NULL, *path_shifted = NULL, *session = NULL, *unit = NULL, *user_unit = NULL, *machine = NULL, *slice = NULL; + _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; + uid_t uid = UID_INVALID; + + r = proc_dir_read_pidref(d, &pid); + assert_se(r >= 0); + + if (r == 0) + break; + + if (pidref_is_kernel_thread(&pid) != 0) + continue; + + cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid.pid, &path); + cg_pid_get_path_shifted(pid.pid, NULL, &path_shifted); + cg_pid_get_owner_uid(pid.pid, &uid); + cg_pid_get_session(pid.pid, &session); + cg_pid_get_unit(pid.pid, &unit); + cg_pid_get_user_unit(pid.pid, &user_unit); + cg_pid_get_machine_name(pid.pid, &machine); + cg_pid_get_slice(pid.pid, &slice); + + printf(PID_FMT"\t%s\t%s\t"UID_FMT"\t%s\t%s\t%s\t%s\t%s\n", + pid.pid, + path, + path_shifted, + uid, + session, + unit, + user_unit, + machine, + slice); + } +} + +static void test_escape_one(const char *s, const char *expected) { + _cleanup_free_ char *b = NULL; + + assert_se(s); + assert_se(expected); + + assert_se(cg_escape(s, &b) >= 0); + assert_se(streq(b, expected)); + + assert_se(streq(cg_unescape(b), s)); + + assert_se(filename_is_valid(b)); + assert_se(!cg_needs_escape(s) || b[0] == '_'); +} + +TEST(escape, .sd_booted = true) { + test_escape_one("foobar", "foobar"); + test_escape_one(".foobar", "_.foobar"); + test_escape_one("foobar.service", "foobar.service"); + test_escape_one("cgroup.service", "_cgroup.service"); + test_escape_one("tasks", "_tasks"); + if (access("/sys/fs/cgroup/cpu", F_OK) == 0) + test_escape_one("cpu.service", "_cpu.service"); + 
test_escape_one("_foobar", "__foobar"); + test_escape_one("", "_"); + test_escape_one("_", "__"); + test_escape_one(".", "_."); +} + +TEST(controller_is_valid) { + assert_se(cg_controller_is_valid("foobar")); + assert_se(cg_controller_is_valid("foo_bar")); + assert_se(cg_controller_is_valid("name=foo")); + assert_se(!cg_controller_is_valid("")); + assert_se(!cg_controller_is_valid("name=")); + assert_se(!cg_controller_is_valid("=")); + assert_se(!cg_controller_is_valid("cpu,cpuacct")); + assert_se(!cg_controller_is_valid("_")); + assert_se(!cg_controller_is_valid("_foobar")); + assert_se(!cg_controller_is_valid("tatü")); +} + +static void test_slice_to_path_one(const char *unit, const char *path, int error) { + _cleanup_free_ char *ret = NULL; + int r; + + log_info("unit: %s", unit); + + r = cg_slice_to_path(unit, &ret); + log_info("actual: %s / %d", strnull(ret), r); + log_info("expect: %s / %d", strnull(path), error); + assert_se(r == error); + assert_se(streq_ptr(ret, path)); +} + +TEST(slice_to_path) { + test_slice_to_path_one("foobar.slice", "foobar.slice", 0); + test_slice_to_path_one("foobar-waldo.slice", "foobar.slice/foobar-waldo.slice", 0); + test_slice_to_path_one("foobar-waldo.service", NULL, -EINVAL); + test_slice_to_path_one(SPECIAL_ROOT_SLICE, "", 0); + test_slice_to_path_one("--.slice", NULL, -EINVAL); + test_slice_to_path_one("-", NULL, -EINVAL); + test_slice_to_path_one("-foo-.slice", NULL, -EINVAL); + test_slice_to_path_one("-foo.slice", NULL, -EINVAL); + test_slice_to_path_one("foo-.slice", NULL, -EINVAL); + test_slice_to_path_one("foo--bar.slice", NULL, -EINVAL); + test_slice_to_path_one("foo.slice/foo--bar.slice", NULL, -EINVAL); + test_slice_to_path_one("a-b.slice", "a.slice/a-b.slice", 0); + test_slice_to_path_one("a-b-c-d-e.slice", "a.slice/a-b.slice/a-b-c.slice/a-b-c-d.slice/a-b-c-d-e.slice", 0); + + test_slice_to_path_one("foobar@.slice", NULL, -EINVAL); + test_slice_to_path_one("foobar@waldo.slice", NULL, -EINVAL); + 
test_slice_to_path_one("foobar@waldo.service", NULL, -EINVAL); + test_slice_to_path_one("-foo@-.slice", NULL, -EINVAL); + test_slice_to_path_one("-foo@.slice", NULL, -EINVAL); + test_slice_to_path_one("foo@-.slice", NULL, -EINVAL); + test_slice_to_path_one("foo@@bar.slice", NULL, -EINVAL); + test_slice_to_path_one("foo.slice/foo@@bar.slice", NULL, -EINVAL); +} + +static void test_shift_path_one(const char *raw, const char *root, const char *shifted) { + const char *s = NULL; + + assert_se(cg_shift_path(raw, root, &s) >= 0); + assert_se(streq(s, shifted)); +} + +TEST(shift_path) { + test_shift_path_one("/foobar/waldo", "/", "/foobar/waldo"); + test_shift_path_one("/foobar/waldo", "", "/foobar/waldo"); + test_shift_path_one("/foobar/waldo", "/foobar", "/waldo"); + test_shift_path_one("/foobar/waldo", "/hogehoge", "/foobar/waldo"); +} + +TEST(mask_supported, .sd_booted = true) { + CGroupMask m; + CGroupController c; + + assert_se(cg_mask_supported(&m) >= 0); + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) + printf("'%s' is supported: %s\n", cgroup_controller_to_string(c), yes_no(m & CGROUP_CONTROLLER_TO_MASK(c))); +} + +TEST(is_cgroup_fs, .sd_booted = true) { + struct statfs sfs; + assert_se(statfs("/sys/fs/cgroup", &sfs) == 0); + if (is_temporary_fs(&sfs)) + assert_se(statfs("/sys/fs/cgroup/systemd", &sfs) == 0); + assert_se(is_cgroup_fs(&sfs)); +} + +TEST(fd_is_cgroup_fs, .sd_booted = true) { + int fd; + + fd = open("/sys/fs/cgroup", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + assert_se(fd >= 0); + if (fd_is_temporary_fs(fd)) { + fd = safe_close(fd); + fd = open("/sys/fs/cgroup/systemd", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW); + assert_se(fd >= 0); + } + assert_se(fd_is_cgroup_fs(fd)); + fd = safe_close(fd); +} + +TEST(cg_tests) { + int all, hybrid, systemd, r; + + r = cg_unified(); + if (r == -ENOMEDIUM) { + log_tests_skipped("cgroup not mounted"); + return; + } + assert_se(r >= 0); + + all = cg_all_unified(); + assert_se(IN_SET(all, 0, 1)); + + hybrid = 
cg_hybrid_unified(); + assert_se(IN_SET(hybrid, 0, 1)); + + systemd = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + assert_se(IN_SET(systemd, 0, 1)); + + if (all) { + assert_se(systemd); + assert_se(!hybrid); + + } else if (hybrid) { + assert_se(systemd); + assert_se(!all); + + } else + assert_se(!systemd); +} + +TEST(cg_get_keyed_attribute) { + _cleanup_free_ char *val = NULL; + char *vals3[3] = {}, *vals3a[3] = {}; + int i, r; + + r = cg_get_keyed_attribute("cpu", "/init.scope", "no_such_file", STRV_MAKE("no_such_attr"), &val); + if (r == -ENOMEDIUM || ERRNO_IS_PRIVILEGE(r)) { + log_info_errno(r, "Skipping most of %s, /sys/fs/cgroup not accessible: %m", __func__); + return; + } + + assert_se(r == -ENOENT); + assert_se(val == NULL); + + if (access("/sys/fs/cgroup/init.scope/cpu.stat", R_OK) < 0) { + log_info_errno(errno, "Skipping most of %s, /init.scope/cpu.stat not accessible: %m", __func__); + return; + } + + assert_se(cg_get_keyed_attribute("cpu", "/init.scope", "cpu.stat", STRV_MAKE("no_such_attr"), &val) == -ENXIO); + assert_se(cg_get_keyed_attribute_graceful("cpu", "/init.scope", "cpu.stat", STRV_MAKE("no_such_attr"), &val) == 0); + assert_se(val == NULL); + + assert_se(cg_get_keyed_attribute("cpu", "/init.scope", "cpu.stat", STRV_MAKE("usage_usec"), &val) == 0); + val = mfree(val); + + assert_se(cg_get_keyed_attribute_graceful("cpu", "/init.scope", "cpu.stat", STRV_MAKE("usage_usec"), &val) == 1); + log_info("cpu /init.scope cpu.stat [usage_usec] → \"%s\"", val); + + assert_se(cg_get_keyed_attribute("cpu", "/init.scope", "cpu.stat", STRV_MAKE("usage_usec", "no_such_attr"), vals3) == -ENXIO); + assert_se(cg_get_keyed_attribute_graceful("cpu", "/init.scope", "cpu.stat", STRV_MAKE("usage_usec", "no_such_attr"), vals3) == 1); + assert_se(vals3[0] && !vals3[1]); + free(vals3[0]); + + assert_se(cg_get_keyed_attribute("cpu", "/init.scope", "cpu.stat", STRV_MAKE("usage_usec", "usage_usec"), vals3) == -ENXIO); + 
assert_se(cg_get_keyed_attribute_graceful("cpu", "/init.scope", "cpu.stat", STRV_MAKE("usage_usec", "usage_usec"), vals3) == 1); + assert_se(vals3[0] && !vals3[1]); + free(vals3[0]); + + assert_se(cg_get_keyed_attribute("cpu", "/init.scope", "cpu.stat", + STRV_MAKE("usage_usec", "user_usec", "system_usec"), vals3) == 0); + for (i = 0; i < 3; i++) + free(vals3[i]); + + assert_se(cg_get_keyed_attribute_graceful("cpu", "/init.scope", "cpu.stat", + STRV_MAKE("usage_usec", "user_usec", "system_usec"), vals3) == 3); + log_info("cpu /init.scope cpu.stat [usage_usec user_usec system_usec] → \"%s\", \"%s\", \"%s\"", + vals3[0], vals3[1], vals3[2]); + + assert_se(cg_get_keyed_attribute("cpu", "/init.scope", "cpu.stat", + STRV_MAKE("system_usec", "user_usec", "usage_usec"), vals3a) == 0); + for (i = 0; i < 3; i++) + free(vals3a[i]); + + assert_se(cg_get_keyed_attribute_graceful("cpu", "/init.scope", "cpu.stat", + STRV_MAKE("system_usec", "user_usec", "usage_usec"), vals3a) == 3); + log_info("cpu /init.scope cpu.stat [system_usec user_usec usage_usec] → \"%s\", \"%s\", \"%s\"", + vals3a[0], vals3a[1], vals3a[2]); + + for (i = 0; i < 3; i++) { + free(vals3[i]); + free(vals3a[i]); + } +} + +TEST(bfq_weight_conversion) { + assert_se(BFQ_WEIGHT(1) == 1); + assert_se(BFQ_WEIGHT(50) == 50); + assert_se(BFQ_WEIGHT(100) == 100); + assert_se(BFQ_WEIGHT(500) == 136); + assert_se(BFQ_WEIGHT(5000) == 545); + assert_se(BFQ_WEIGHT(10000) == 1000); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-cgroup.c b/src/test/test-cgroup.c new file mode 100644 index 0000000..0fbd635 --- /dev/null +++ b/src/test/test-cgroup.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "errno-util.h" +#include "path-util.h" +#include "process-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(cg_split_spec) { + char *c, *p; + + assert_se(cg_split_spec("foobar:/", &c, &p) == 0); + 
assert_se(streq(c, "foobar")); + assert_se(streq(p, "/")); + c = mfree(c); + p = mfree(p); + + assert_se(cg_split_spec("foobar:", &c, &p) == 0); + c = mfree(c); + p = mfree(p); + + assert_se(cg_split_spec("foobar:asdfd", &c, &p) < 0); + assert_se(cg_split_spec(":///", &c, &p) < 0); + assert_se(cg_split_spec(":", &c, &p) < 0); + assert_se(cg_split_spec("", &c, &p) < 0); + assert_se(cg_split_spec("fo/obar:/", &c, &p) < 0); + + assert_se(cg_split_spec("/", &c, &p) >= 0); + assert_se(c == NULL); + assert_se(streq(p, "/")); + p = mfree(p); + + assert_se(cg_split_spec("foo", &c, &p) >= 0); + assert_se(streq(c, "foo")); + assert_se(p == NULL); + c = mfree(c); +} + +TEST(cg_create) { + int r; + + r = cg_unified_cached(false); + if (r == -ENOMEDIUM) { + log_tests_skipped("cgroup not mounted"); + return; + } + assert_se(r >= 0); + + _cleanup_free_ char *here = NULL; + assert_se(cg_pid_get_path_shifted(0, NULL, &here) >= 0); + + const char *test_a = prefix_roota(here, "/test-a"), + *test_b = prefix_roota(here, "/test-b"), + *test_c = prefix_roota(here, "/test-b/test-c"), + *test_d = prefix_roota(here, "/test-b/test-d"); + char *path; + + log_info("Paths for test:\n%s\n%s", test_a, test_b); + + /* Possibly clean up left-overs from aborted previous runs */ + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, test_a, /* delete_root= */ true); + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, test_b, /* delete_root= */ true); + + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, test_a); + if (IN_SET(r, -EPERM, -EACCES, -EROFS)) { + log_info_errno(r, "Skipping %s: %m", __func__); + return; + } + + assert_se(r == 1); + assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, test_a) == 0); + assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, test_b) == 1); + assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, test_c) == 1); + assert_se(cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, test_b, 0) == 0); + + assert_se(cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, getpid_cached(), &path) == 0); + assert_se(streq(path, test_b)); + 
free(path); + + assert_se(cg_attach(SYSTEMD_CGROUP_CONTROLLER, test_a, 0) == 0); + + assert_se(cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, getpid_cached(), &path) == 0); + assert_se(path_equal(path, test_a)); + free(path); + + assert_se(cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, test_d, 0) == 1); + + assert_se(cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, getpid_cached(), &path) == 0); + assert_se(path_equal(path, test_d)); + free(path); + + assert_se(cg_get_path(SYSTEMD_CGROUP_CONTROLLER, test_d, NULL, &path) == 0); + log_debug("test_d: %s", path); + const char *full_d; + if (cg_all_unified()) + full_d = strjoina("/sys/fs/cgroup", test_d); + else if (cg_hybrid_unified()) + full_d = strjoina("/sys/fs/cgroup/unified", test_d); + else + full_d = strjoina("/sys/fs/cgroup/systemd", test_d); + assert_se(path_equal(path, full_d)); + free(path); + + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, test_a) > 0); + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, test_b) > 0); + assert_se(cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, test_a) > 0); + assert_se(cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, test_b) == 0); + + assert_se(cg_kill_recursive(test_a, 0, 0, NULL, NULL, NULL) == 0); + assert_se(cg_kill_recursive(test_b, 0, 0, NULL, NULL, NULL) > 0); + + assert_se(cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, test_b, SYSTEMD_CGROUP_CONTROLLER, test_a, 0) > 0); + + assert_se(cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, test_a) == 0); + assert_se(cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, test_b) > 0); + + assert_se(cg_kill_recursive(test_a, 0, 0, NULL, NULL, NULL) > 0); + assert_se(cg_kill_recursive(test_b, 0, 0, NULL, NULL, NULL) == 0); + + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, test_b, false); + + assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, test_b) == 0); + assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, test_a) < 0); + assert_se(cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, test_a, SYSTEMD_CGROUP_CONTROLLER, here, 0) > 0); + 
assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, test_a) == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-chase-manual.c b/src/test/test-chase-manual.c new file mode 100644 index 0000000..475f089 --- /dev/null +++ b/src/test/test-chase-manual.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include + +#include "chase.h" +#include "fd-util.h" +#include "log.h" +#include "main-func.h" +#include "tests.h" + +static char *arg_root = NULL; +static int arg_flags = 0; +static bool arg_open = false; + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_ROOT = 0x1000, + ARG_OPEN, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "root", required_argument, NULL, ARG_ROOT }, + { "open", no_argument, NULL, ARG_OPEN }, + + { "prefix-root", no_argument, NULL, CHASE_PREFIX_ROOT }, + { "nonexistent", no_argument, NULL, CHASE_NONEXISTENT }, + { "no_autofs", no_argument, NULL, CHASE_NO_AUTOFS }, + { "safe", no_argument, NULL, CHASE_SAFE }, + { "trail-slash", no_argument, NULL, CHASE_TRAIL_SLASH }, + { "step", no_argument, NULL, CHASE_STEP }, + { "nofollow", no_argument, NULL, CHASE_NOFOLLOW }, + { "warn", no_argument, NULL, CHASE_WARN }, + {} + }; + + int c; + + assert_se(argc >= 0); + assert_se(argv); + + while ((c = getopt_long(argc, argv, "", options, NULL)) >= 0) + switch (c) { + + case 'h': + printf("Syntax:\n" + " %s [OPTION...] 
path...\n" + "Options:\n" + , argv[0]); + for (size_t i = 0; i < ELEMENTSOF(options) - 1; i++) + printf(" --%s\n", options[i].name); + return 0; + + case ARG_ROOT: + arg_root = optarg; + break; + + case ARG_OPEN: + arg_open = true; + break; + + case CHASE_PREFIX_ROOT: + case CHASE_NONEXISTENT: + case CHASE_NO_AUTOFS: + case CHASE_SAFE: + case CHASE_TRAIL_SLASH: + case CHASE_STEP: + case CHASE_NOFOLLOW: + case CHASE_WARN: + arg_flags |= c; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind == argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "At least one argument is required."); + + return 1; +} + +static int run(int argc, char **argv) { + int r; + + test_setup_logging(LOG_DEBUG); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + for (int i = optind; i < argc; i++) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + + printf("%s ", argv[i]); + fflush(stdout); + + r = chase(argv[i], arg_root, arg_flags, &p, arg_open ? 
&fd : NULL); + if (r < 0) + log_error_errno(r, "failed: %m"); + else { + log_info("→ %s", p); + if (arg_open) + assert_se(fd >= 0); + else + assert_se(fd == -EBADF); + } + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-chase.c b/src/test/test-chase.c new file mode 100644 index 0000000..dbbc99b --- /dev/null +++ b/src/test/test-chase.c @@ -0,0 +1,756 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "chase.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "id128-util.h" +#include "mkdir.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static const char *arg_test_dir = NULL; + +static void test_chase_extract_filename_one(const char *path, const char *root, const char *expected) { + _cleanup_free_ char *ret1 = NULL, *ret2 = NULL, *fname = NULL; + + log_debug("/* %s(path=%s, root=%s) */", __func__, path, strnull(root)); + + assert_se(chase(path, root, CHASE_EXTRACT_FILENAME, &ret1, NULL) > 0); + assert_se(streq(ret1, expected)); + + assert_se(chase(path, root, 0, &ret2, NULL) > 0); + assert_se(chase_extract_filename(ret2, root, &fname) >= 0); + assert_se(streq(fname, expected)); +} + +TEST(chase) { + _cleanup_free_ char *result = NULL, *pwd = NULL; + _cleanup_close_ int pfd = -EBADF; + char *temp; + const char *top, *p, *pslash, *q, *qslash; + struct stat st; + int r; + + temp = strjoina(arg_test_dir ?: "/tmp", "/test-chase.XXXXXX"); + assert_se(mkdtemp(temp)); + + top = strjoina(temp, "/top"); + assert_se(mkdir(top, 0700) >= 0); + + p = strjoina(top, "/dot"); + if (symlink(".", p) < 0) { + assert_se(IN_SET(errno, EINVAL, ENOSYS, ENOTTY, EPERM)); + log_tests_skipped_errno(errno, "symlink() not possible"); + goto cleanup; + }; + + p = strjoina(top, "/dotdot"); + assert_se(symlink("..", p) >= 0); + + p = strjoina(top, "/dotdota"); + assert_se(symlink("../a", p) >= 0); + + p = 
strjoina(temp, "/a"); + assert_se(symlink("b", p) >= 0); + + p = strjoina(temp, "/b"); + assert_se(symlink("/usr", p) >= 0); + + p = strjoina(temp, "/start"); + assert_se(symlink("top/dot/dotdota", p) >= 0); + + /* Paths that use symlinks underneath the "root" */ + + r = chase(p, NULL, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, "/usr")); + result = mfree(result); + + r = chase(p, "/.//../../../", 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, "/usr")); + result = mfree(result); + + pslash = strjoina(p, "/"); + r = chase(pslash, NULL, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, "/usr/")); + result = mfree(result); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r == -ENOENT); + + r = chase(pslash, temp, 0, &result, NULL); + assert_se(r == -ENOENT); + + q = strjoina(temp, "/usr"); + + r = chase(p, temp, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + assert_se(path_equal(result, q)); + result = mfree(result); + + qslash = strjoina(q, "/"); + + r = chase(pslash, temp, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + assert_se(path_equal(result, qslash)); + result = mfree(result); + + assert_se(mkdir(q, 0700) >= 0); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, q)); + result = mfree(result); + + r = chase(pslash, temp, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, qslash)); + result = mfree(result); + + p = strjoina(temp, "/slash"); + assert_se(symlink("/", p) >= 0); + + r = chase(p, NULL, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, "/")); + result = mfree(result); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, temp)); + result = mfree(result); + + /* Tests for CHASE_EXTRACT_FILENAME and chase_extract_filename() */ + + p = strjoina(temp, "/start"); + pslash = strjoina(p, "/"); + test_chase_extract_filename_one(p, NULL, "usr"); + 
test_chase_extract_filename_one(pslash, NULL, "usr"); + test_chase_extract_filename_one(p, temp, "usr"); + test_chase_extract_filename_one(pslash, temp, "usr"); + + p = strjoina(temp, "/slash"); + test_chase_extract_filename_one(p, NULL, "."); + test_chase_extract_filename_one(p, temp, "."); + + /* Paths that would "escape" outside of the "root" */ + + p = strjoina(temp, "/6dots"); + assert_se(symlink("../../..", p) >= 0); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r > 0 && path_equal(result, temp)); + result = mfree(result); + + p = strjoina(temp, "/6dotsusr"); + assert_se(symlink("../../../usr", p) >= 0); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r > 0 && path_equal(result, q)); + result = mfree(result); + + p = strjoina(temp, "/top/8dotsusr"); + assert_se(symlink("../../../../usr", p) >= 0); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r > 0 && path_equal(result, q)); + result = mfree(result); + + /* Paths that contain repeated slashes */ + + p = strjoina(temp, "/slashslash"); + assert_se(symlink("///usr///", p) >= 0); + + r = chase(p, NULL, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, "/usr")); + assert_se(streq(result, "/usr")); /* we guarantee that we drop redundant slashes */ + result = mfree(result); + + r = chase(p, temp, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, q)); + result = mfree(result); + + /* Paths underneath the "root" with different UIDs while using CHASE_SAFE */ + + if (geteuid() == 0) { + p = strjoina(temp, "/user"); + assert_se(mkdir(p, 0755) >= 0); + assert_se(chown(p, UID_NOBODY, GID_NOBODY) >= 0); + + q = strjoina(temp, "/user/root"); + assert_se(mkdir(q, 0755) >= 0); + + p = strjoina(q, "/link"); + assert_se(symlink("/", p) >= 0); + + /* Fail when user-owned directories contain root-owned subdirectories. 
*/ + r = chase(p, temp, CHASE_SAFE, &result, NULL); + assert_se(r == -ENOLINK); + result = mfree(result); + + /* Allow this when the user-owned directories are all in the "root". */ + r = chase(p, q, CHASE_SAFE, &result, NULL); + assert_se(r > 0); + result = mfree(result); + } + + /* Paths using . */ + + r = chase("/etc/./.././", NULL, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(result, "/")); + result = mfree(result); + + r = chase("/etc/./.././", "/etc", 0, &result, NULL); + assert_se(r > 0 && path_equal(result, "/etc")); + result = mfree(result); + + r = chase("/../.././//../../etc", NULL, 0, &result, NULL); + assert_se(r > 0); + assert_se(streq(result, "/etc")); + result = mfree(result); + + r = chase("/../.././//../../test-chase.fsldajfl", NULL, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + assert_se(streq(result, "/test-chase.fsldajfl")); + result = mfree(result); + + r = chase("/../.././//../../etc", "/", CHASE_PREFIX_ROOT, &result, NULL); + assert_se(r > 0); + assert_se(streq(result, "/etc")); + result = mfree(result); + + r = chase("/../.././//../../test-chase.fsldajfl", "/", CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + assert_se(streq(result, "/test-chase.fsldajfl")); + result = mfree(result); + + r = chase("/etc/machine-id/foo", NULL, 0, &result, NULL); + assert_se(IN_SET(r, -ENOTDIR, -ENOENT)); + result = mfree(result); + + /* Path that loops back to self */ + + p = strjoina(temp, "/recursive-symlink"); + assert_se(symlink("recursive-symlink", p) >= 0); + r = chase(p, NULL, 0, &result, NULL); + assert_se(r == -ELOOP); + + /* Path which doesn't exist */ + + p = strjoina(temp, "/idontexist"); + r = chase(p, NULL, 0, &result, NULL); + assert_se(r == -ENOENT); + + r = chase(p, NULL, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + assert_se(path_equal(result, p)); + result = mfree(result); + + p = strjoina(temp, "/idontexist/meneither"); + r = chase(p, NULL, 0, &result, NULL); + 
assert_se(r == -ENOENT); + + r = chase(p, NULL, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + assert_se(path_equal(result, p)); + result = mfree(result); + + /* Relative paths */ + + assert_se(safe_getcwd(&pwd) >= 0); + + assert_se(chdir(temp) >= 0); + + p = "this/is/a/relative/path"; + r = chase(p, NULL, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + + p = strjoina(temp, "/", p); + assert_se(path_equal(result, p)); + result = mfree(result); + + p = "this/is/a/relative/path"; + r = chase(p, temp, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == 0); + + p = strjoina(temp, "/", p); + assert_se(path_equal(result, p)); + result = mfree(result); + + assert_se(chdir(pwd) >= 0); + + /* Path which doesn't exist, but contains weird stuff */ + + p = strjoina(temp, "/idontexist/.."); + r = chase(p, NULL, 0, &result, NULL); + assert_se(r == -ENOENT); + + r = chase(p, NULL, CHASE_NONEXISTENT, &result, NULL); + assert_se(r == -ENOENT); + + p = strjoina(temp, "/target"); + q = strjoina(temp, "/top"); + assert_se(symlink(q, p) >= 0); + p = strjoina(temp, "/target/idontexist"); + r = chase(p, NULL, 0, &result, NULL); + assert_se(r == -ENOENT); + + if (geteuid() == 0) { + p = strjoina(temp, "/priv1"); + assert_se(mkdir(p, 0755) >= 0); + + q = strjoina(p, "/priv2"); + assert_se(mkdir(q, 0755) >= 0); + + assert_se(chase(q, NULL, CHASE_SAFE, NULL, NULL) >= 0); + + assert_se(chown(q, UID_NOBODY, GID_NOBODY) >= 0); + assert_se(chase(q, NULL, CHASE_SAFE, NULL, NULL) >= 0); + + assert_se(chown(p, UID_NOBODY, GID_NOBODY) >= 0); + assert_se(chase(q, NULL, CHASE_SAFE, NULL, NULL) >= 0); + + assert_se(chown(q, 0, 0) >= 0); + assert_se(chase(q, NULL, CHASE_SAFE, NULL, NULL) == -ENOLINK); + + assert_se(rmdir(q) >= 0); + assert_se(symlink("/etc/passwd", q) >= 0); + assert_se(chase(q, NULL, CHASE_SAFE, NULL, NULL) == -ENOLINK); + + assert_se(chown(p, 0, 0) >= 0); + assert_se(chase(q, NULL, CHASE_SAFE, NULL, NULL) >= 0); + } + + p = strjoina(temp, "/machine-id-test"); + 
assert_se(symlink("/usr/../etc/./machine-id", p) >= 0); + + r = chase(p, NULL, 0, NULL, &pfd); + if (r != -ENOENT && sd_id128_get_machine(NULL) >= 0) { + _cleanup_close_ int fd = -EBADF; + sd_id128_t a, b; + + assert_se(pfd >= 0); + + fd = fd_reopen(pfd, O_RDONLY|O_CLOEXEC); + assert_se(fd >= 0); + safe_close(pfd); + + assert_se(id128_read_fd(fd, ID128_FORMAT_PLAIN, &a) >= 0); + assert_se(sd_id128_get_machine(&b) >= 0); + assert_se(sd_id128_equal(a, b)); + } + + assert_se(lstat(p, &st) >= 0); + r = chase_and_unlink(p, NULL, 0, 0, &result); + assert_se(r == 0); + assert_se(path_equal(result, p)); + result = mfree(result); + assert_se(lstat(p, &st) == -1 && errno == ENOENT); + + /* Test CHASE_NOFOLLOW */ + + p = strjoina(temp, "/target"); + q = strjoina(temp, "/symlink"); + assert_se(symlink(p, q) >= 0); + r = chase(q, NULL, CHASE_NOFOLLOW, &result, &pfd); + assert_se(r >= 0); + assert_se(pfd >= 0); + assert_se(path_equal(result, q)); + assert_se(fstat(pfd, &st) >= 0); + assert_se(S_ISLNK(st.st_mode)); + result = mfree(result); + pfd = safe_close(pfd); + + /* s1 -> s2 -> nonexistent */ + q = strjoina(temp, "/s1"); + assert_se(symlink("s2", q) >= 0); + p = strjoina(temp, "/s2"); + assert_se(symlink("nonexistent", p) >= 0); + r = chase(q, NULL, CHASE_NOFOLLOW, &result, &pfd); + assert_se(r >= 0); + assert_se(pfd >= 0); + assert_se(path_equal(result, q)); + assert_se(fstat(pfd, &st) >= 0); + assert_se(S_ISLNK(st.st_mode)); + result = mfree(result); + pfd = safe_close(pfd); + + /* Test CHASE_STEP */ + + p = strjoina(temp, "/start"); + r = chase(p, NULL, CHASE_STEP, &result, NULL); + assert_se(r == 0); + p = strjoina(temp, "/top/dot/dotdota"); + assert_se(streq(p, result)); + result = mfree(result); + + r = chase(p, NULL, CHASE_STEP, &result, NULL); + assert_se(r == 0); + p = strjoina(temp, "/top/dotdota"); + assert_se(streq(p, result)); + result = mfree(result); + + r = chase(p, NULL, CHASE_STEP, &result, NULL); + assert_se(r == 0); + p = strjoina(temp, "/top/../a"); + 
assert_se(streq(p, result)); + result = mfree(result); + + r = chase(p, NULL, CHASE_STEP, &result, NULL); + assert_se(r == 0); + p = strjoina(temp, "/a"); + assert_se(streq(p, result)); + result = mfree(result); + + r = chase(p, NULL, CHASE_STEP, &result, NULL); + assert_se(r == 0); + p = strjoina(temp, "/b"); + assert_se(streq(p, result)); + result = mfree(result); + + r = chase(p, NULL, CHASE_STEP, &result, NULL); + assert_se(r == 0); + assert_se(streq("/usr", result)); + result = mfree(result); + + r = chase("/usr", NULL, CHASE_STEP, &result, NULL); + assert_se(r > 0); + assert_se(streq("/usr", result)); + result = mfree(result); + + /* Make sure that symlinks in the "root" path are not resolved, but those below are */ + p = strjoina("/etc/..", temp, "/self"); + assert_se(symlink(".", p) >= 0); + q = strjoina(p, "/top/dot/dotdota"); + r = chase(q, p, 0, &result, NULL); + assert_se(r > 0); + assert_se(path_equal(path_startswith(result, p), "usr")); + result = mfree(result); + + /* Test CHASE_PROHIBIT_SYMLINKS */ + + assert_se(chase("top/dot", temp, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, NULL, NULL) == -EREMCHG); + assert_se(chase("top/dot", temp, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_WARN, NULL, NULL) == -EREMCHG); + assert_se(chase("top/dotdot", temp, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, NULL, NULL) == -EREMCHG); + assert_se(chase("top/dotdot", temp, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_WARN, NULL, NULL) == -EREMCHG); + assert_se(chase("top/dot/dot", temp, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, NULL, NULL) == -EREMCHG); + assert_se(chase("top/dot/dot", temp, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_WARN, NULL, NULL) == -EREMCHG); + + cleanup: + assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); +} + +TEST(chaseat) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *result = NULL; + _cleanup_closedir_ DIR *dir = NULL; + _cleanup_fclose_ 
FILE *f = NULL; + struct stat st; + const char *p; + + assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); + + /* Test that AT_FDCWD with CHASE_AT_RESOLVE_IN_ROOT resolves against / and not the current working + * directory. */ + + assert_se(symlinkat("/usr", tfd, "abc") >= 0); + + p = strjoina(t, "/abc"); + assert_se(chaseat(AT_FDCWD, p, CHASE_AT_RESOLVE_IN_ROOT, &result, NULL) >= 0); + assert_se(streq(result, "/usr")); + result = mfree(result); + + /* If the file descriptor points to the root directory, the result will be absolute. */ + + fd = open("/", O_CLOEXEC | O_DIRECTORY | O_PATH); + assert_se(fd >= 0); + + assert_se(chaseat(fd, p, 0, &result, NULL) >= 0); + assert_se(streq(result, "/usr")); + result = mfree(result); + + assert_se(chaseat(fd, p, CHASE_AT_RESOLVE_IN_ROOT, &result, NULL) >= 0); + assert_se(streq(result, "/usr")); + result = mfree(result); + + fd = safe_close(fd); + + /* If the file descriptor does not point to the root directory, the result will be relative + * unless the result is outside of the specified file descriptor. */ + + assert_se(chaseat(tfd, "abc", 0, &result, NULL) >= 0); + assert_se(streq(result, "/usr")); + result = mfree(result); + + assert_se(chaseat(tfd, "/abc", 0, &result, NULL) >= 0); + assert_se(streq(result, "/usr")); + result = mfree(result); + + assert_se(chaseat(tfd, "abc", CHASE_AT_RESOLVE_IN_ROOT, NULL, NULL) == -ENOENT); + assert_se(chaseat(tfd, "/abc", CHASE_AT_RESOLVE_IN_ROOT, NULL, NULL) == -ENOENT); + + assert_se(chaseat(tfd, "abc", CHASE_AT_RESOLVE_IN_ROOT | CHASE_NONEXISTENT, &result, NULL) >= 0); + assert_se(streq(result, "usr")); + result = mfree(result); + + assert_se(chaseat(tfd, "/abc", CHASE_AT_RESOLVE_IN_ROOT | CHASE_NONEXISTENT, &result, NULL) >= 0); + assert_se(streq(result, "usr")); + result = mfree(result); + + /* Test that absolute path or not are the same when resolving relative to a directory file + * descriptor and that we always get a relative path back. 
*/ + + /* The assignment is parenthesized on purpose: '>=' binds tighter than '=', so without the parentheses fd would receive the 0/1 result of the comparison instead of the descriptor returned by openat(), and the safe_close() below would close the wrong descriptor. */ + assert_se((fd = openat(tfd, "def", O_CREAT|O_CLOEXEC, 0700)) >= 0); + fd = safe_close(fd); + assert_se(symlinkat("/def", tfd, "qed") >= 0); + assert_se(chaseat(tfd, "qed", CHASE_AT_RESOLVE_IN_ROOT, &result, NULL) >= 0); + assert_se(streq(result, "def")); + result = mfree(result); + assert_se(chaseat(tfd, "/qed", CHASE_AT_RESOLVE_IN_ROOT, &result, NULL) >= 0); + assert_se(streq(result, "def")); + result = mfree(result); + + /* Valid directory file descriptor without CHASE_AT_RESOLVE_IN_ROOT should resolve symlinks against + * host's root. */ + assert_se(chaseat(tfd, "/qed", 0, NULL, NULL) == -ENOENT); + + /* Test CHASE_PARENT */ + + assert_se((fd = open_mkdir_at(tfd, "chase", O_CLOEXEC, 0755)) >= 0); + assert_se(symlinkat("/def", fd, "parent") >= 0); + fd = safe_close(fd); + + /* Make sure that when we chase a symlink parent directory, that we chase the parent directory of the + * symlink target and not the symlink itself. But if we add CHASE_NOFOLLOW, we get the parent + * directory of the symlink itself. 
*/ + + assert_se(chaseat(tfd, "chase/parent", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT, &result, &fd) >= 0); + assert_se(faccessat(fd, "def", F_OK, 0) >= 0); + assert_se(streq(result, "def")); + fd = safe_close(fd); + result = mfree(result); + + assert_se(chaseat(tfd, "chase/parent", CHASE_AT_RESOLVE_IN_ROOT|CHASE_PARENT|CHASE_NOFOLLOW, &result, &fd) >= 0); + assert_se(faccessat(fd, "parent", F_OK, AT_SYMLINK_NOFOLLOW) >= 0); + assert_se(streq(result, "chase/parent")); + fd = safe_close(fd); + result = mfree(result); + + assert_se(chaseat(tfd, "chase", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT, &result, &fd) >= 0); + assert_se(faccessat(fd, "chase", F_OK, 0) >= 0); + assert_se(streq(result, "chase")); + fd = safe_close(fd); + result = mfree(result); + + assert_se(chaseat(tfd, "/", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT, &result, NULL) >= 0); + assert_se(streq(result, ".")); + result = mfree(result); + + assert_se(chaseat(tfd, ".", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT, &result, NULL) >= 0); + assert_se(streq(result, ".")); + result = mfree(result); + + /* Test CHASE_MKDIR_0755 */ + + assert_se(chaseat(tfd, "m/k/d/i/r", CHASE_MKDIR_0755|CHASE_NONEXISTENT, &result, NULL) >= 0); + assert_se(faccessat(tfd, "m/k/d/i", F_OK, 0) >= 0); + assert_se(RET_NERRNO(faccessat(tfd, "m/k/d/i/r", F_OK, 0)) == -ENOENT); + assert_se(streq(result, "m/k/d/i/r")); + result = mfree(result); + + assert_se(chaseat(tfd, "m/../q", CHASE_MKDIR_0755|CHASE_NONEXISTENT, &result, NULL) >= 0); + assert_se(faccessat(tfd, "m", F_OK, 0) >= 0); + assert_se(RET_NERRNO(faccessat(tfd, "q", F_OK, 0)) == -ENOENT); + assert_se(streq(result, "q")); + result = mfree(result); + + assert_se(chaseat(tfd, "i/../p", CHASE_MKDIR_0755|CHASE_NONEXISTENT, NULL, NULL) == -ENOENT); + + /* Test CHASE_EXTRACT_FILENAME */ + + assert_se(chaseat(tfd, "chase/parent", CHASE_AT_RESOLVE_IN_ROOT|CHASE_PARENT|CHASE_NOFOLLOW|CHASE_EXTRACT_FILENAME, &result, &fd) >= 0); + assert_se(faccessat(fd, result, F_OK, AT_SYMLINK_NOFOLLOW) >= 0); + 
assert_se(streq(result, "parent")); + fd = safe_close(fd); + result = mfree(result); + + assert_se(chaseat(tfd, "chase", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT|CHASE_EXTRACT_FILENAME, &result, &fd) >= 0); + assert_se(faccessat(fd, result, F_OK, 0) >= 0); + assert_se(streq(result, "chase")); + fd = safe_close(fd); + result = mfree(result); + + assert_se(chaseat(tfd, "/", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT|CHASE_EXTRACT_FILENAME, &result, NULL) >= 0); + assert_se(streq(result, ".")); + result = mfree(result); + + assert_se(chaseat(tfd, ".", CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT|CHASE_EXTRACT_FILENAME, &result, NULL) >= 0); + assert_se(streq(result, ".")); + result = mfree(result); + + assert_se(chaseat(tfd, NULL, CHASE_PARENT|CHASE_AT_RESOLVE_IN_ROOT|CHASE_EXTRACT_FILENAME, &result, NULL) >= 0); + assert_se(streq(result, ".")); + result = mfree(result); + + /* Test chase_and_openat() */ + + fd = chase_and_openat(tfd, "o/p/e/n/f/i/l/e", CHASE_MKDIR_0755, O_CREAT|O_EXCL|O_CLOEXEC, NULL); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) >= 0); + fd = safe_close(fd); + + fd = chase_and_openat(tfd, "o/p/e/n/d/i/r", CHASE_MKDIR_0755, O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, NULL); + assert_se(fd >= 0); + assert_se(fd_verify_directory(fd) >= 0); + fd = safe_close(fd); + + fd = chase_and_openat(tfd, NULL, CHASE_PARENT|CHASE_EXTRACT_FILENAME, O_PATH|O_DIRECTORY|O_CLOEXEC, &result); + assert_se(fd >= 0); + assert_se(streq(result, ".")); + fd = safe_close(fd); + result = mfree(result); + + /* Test chase_and_openatdir() */ + + assert_se(chase_and_opendirat(tfd, "o/p/e/n/d/i", 0, &result, &dir) >= 0); + FOREACH_DIRENT(de, dir, assert_not_reached()) + assert_se(streq(de->d_name, "r")); + assert_se(streq(result, "o/p/e/n/d/i")); + result = mfree(result); + + /* Test chase_and_statat() */ + + assert_se(chase_and_statat(tfd, "o/p", 0, &result, &st) >= 0); + assert_se(stat_verify_directory(&st) >= 0); + assert_se(streq(result, "o/p")); + result = mfree(result); + + /* Test 
chase_and_accessat() */ + + assert_se(chase_and_accessat(tfd, "o/p/e", 0, F_OK, &result) >= 0); + assert_se(streq(result, "o/p/e")); + result = mfree(result); + + /* Test chase_and_fopenat_unlocked() */ + + assert_se(chase_and_fopenat_unlocked(tfd, "o/p/e/n/f/i/l/e", 0, "re", &result, &f) >= 0); + assert_se(fread(&(char[1]) {}, 1, 1, f) == 0); + assert_se(feof(f)); + f = safe_fclose(f); + assert_se(streq(result, "o/p/e/n/f/i/l/e")); + result = mfree(result); + + /* Test chase_and_unlinkat() */ + + assert_se(chase_and_unlinkat(tfd, "o/p/e/n/f/i/l/e", 0, 0, &result) >= 0); + assert_se(streq(result, "o/p/e/n/f/i/l/e")); + result = mfree(result); + + /* Test chase_and_open_parent_at() */ + + assert_se((fd = chase_and_open_parent_at(tfd, "chase/parent", CHASE_AT_RESOLVE_IN_ROOT|CHASE_NOFOLLOW, &result)) >= 0); + assert_se(faccessat(fd, result, F_OK, AT_SYMLINK_NOFOLLOW) >= 0); + assert_se(streq(result, "parent")); + fd = safe_close(fd); + result = mfree(result); + + assert_se((fd = chase_and_open_parent_at(tfd, "chase", CHASE_AT_RESOLVE_IN_ROOT, &result)) >= 0); + assert_se(faccessat(fd, result, F_OK, 0) >= 0); + assert_se(streq(result, "chase")); + fd = safe_close(fd); + result = mfree(result); + + assert_se((fd = chase_and_open_parent_at(tfd, "/", CHASE_AT_RESOLVE_IN_ROOT, &result)) >= 0); + assert_se(streq(result, ".")); + fd = safe_close(fd); + result = mfree(result); + + assert_se((fd = chase_and_open_parent_at(tfd, ".", CHASE_AT_RESOLVE_IN_ROOT, &result)) >= 0); + assert_se(streq(result, ".")); + fd = safe_close(fd); + result = mfree(result); +} + +TEST(chaseat_prefix_root) { + _cleanup_free_ char *cwd = NULL, *ret = NULL, *expected = NULL; + + assert_se(safe_getcwd(&cwd) >= 0); + + assert_se(chaseat_prefix_root("/hoge", NULL, &ret) >= 0); + assert_se(streq(ret, "/hoge")); + + ret = mfree(ret); + + assert_se(chaseat_prefix_root("/hoge", "a/b/c", &ret) >= 0); + assert_se(streq(ret, "/hoge")); + + ret = mfree(ret); + + assert_se(chaseat_prefix_root("hoge", 
"/a/b//./c///", &ret) >= 0); + assert_se(streq(ret, "/a/b/c/hoge")); + + ret = mfree(ret); + + assert_se(chaseat_prefix_root("hoge", "a/b//./c///", &ret) >= 0); + assert_se(expected = path_join(cwd, "a/b/c/hoge")); + assert_se(streq(ret, expected)); + + ret = mfree(ret); + expected = mfree(expected); + + assert_se(chaseat_prefix_root("./hoge/aaa/../././b", "/a/b//./c///", &ret) >= 0); + assert_se(streq(ret, "/a/b/c/hoge/aaa/../././b")); + + ret = mfree(ret); + + assert_se(chaseat_prefix_root("./hoge/aaa/../././b", "a/b//./c///", &ret) >= 0); + assert_se(expected = path_join(cwd, "a/b/c/hoge/aaa/../././b")); + assert_se(streq(ret, expected)); +} + +TEST(trailing_dot_dot) { + _cleanup_free_ char *path = NULL, *fdpath = NULL; + _cleanup_close_ int fd = -EBADF; + + assert_se(chase("/usr/..", NULL, CHASE_PARENT, &path, &fd) >= 0); + assert_se(path_equal(path, "/")); + assert_se(fd_get_path(fd, &fdpath) >= 0); + assert_se(path_equal(fdpath, "/")); + + path = mfree(path); + fdpath = mfree(fdpath); + fd = safe_close(fd); + + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + _cleanup_free_ char *sub = ASSERT_PTR(path_join(t, "a/b/c/d")); + assert_se(mkdir_p(sub, 0700) >= 0); + _cleanup_free_ char *suffixed = ASSERT_PTR(path_join(sub, "..")); + assert_se(chase(suffixed, NULL, CHASE_PARENT, &path, &fd) >= 0); + _cleanup_free_ char *expected1 = ASSERT_PTR(path_join(t, "a/b/c")); + _cleanup_free_ char *expected2 = ASSERT_PTR(path_join(t, "a/b")); + + assert_se(path_equal(path, expected1)); + assert_se(fd_get_path(fd, &fdpath) >= 0); + assert_se(path_equal(fdpath, expected2)); +} + +static int intro(void) { + arg_test_dir = saved_argv[1]; + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-chown-rec.c b/src/test/test-chown-rec.c new file mode 100644 index 0000000..5d83f59 --- /dev/null +++ b/src/test/test-chown-rec.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later 
*/ + +#include +#include + +#include "alloc-util.h" +#include "chown-recursive.h" +#include "log.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static const uint8_t acl[] = { + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x02, 0x00, 0x07, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x07, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x07, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x20, 0x00, 0x05, 0x00, + 0xff, 0xff, 0xff, 0xff, +}; + +static const uint8_t default_acl[] = { + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x04, 0x00, 0x07, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x08, 0x00, 0x07, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x07, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x20, 0x00, 0x05, 0x00, + 0xff, 0xff, 0xff, 0xff, +}; + +static bool has_xattr(const char *p) { + char buffer[sizeof(acl) * 4]; + + if (lgetxattr(p, "system.posix_acl_access", buffer, sizeof(buffer)) < 0) + return !ERRNO_IS_XATTR_ABSENT(errno); + + return true; +} + +TEST(chown_recursive) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + struct stat st; + const char *p; + const uid_t uid = getuid(); + const gid_t gid = getgid(); + int r; + + umask(022); + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + + p = strjoina(t, "/dir"); + assert_se(mkdir(p, 0777) >= 0); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0755); + assert_se(st.st_uid == uid); + assert_se(st.st_gid == gid); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/symlink"); + assert_se(symlink("../../", p) >= 0); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISLNK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0777); + assert_se(st.st_uid == uid); + assert_se(st.st_gid == gid); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/reg"); + assert_se(mknod(p, S_IFREG|0777, 0) >= 0); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISREG(st.st_mode)); + assert_se((st.st_mode & 
07777) == 0755); + assert_se(st.st_uid == uid); + assert_se(st.st_gid == gid); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/sock"); + assert_se(mknod(p, S_IFSOCK|0777, 0) >= 0); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISSOCK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0755); + assert_se(st.st_uid == uid); + assert_se(st.st_gid == gid); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/fifo"); + assert_se(mknod(p, S_IFIFO|0777, 0) >= 0); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISFIFO(st.st_mode)); + assert_se((st.st_mode & 07777) == 0755); + assert_se(st.st_uid == uid); + assert_se(st.st_gid == gid); + assert_se(!has_xattr(p)); + + /* We now apply an xattr to the dir, and check it again */ + p = strjoina(t, "/dir"); + r = RET_NERRNO(setxattr(p, "system.posix_acl_access", acl, sizeof(acl), 0)); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + return (void) log_tests_skipped_errno(r, "no acl supported on /tmp"); + + assert_se(r >= 0); + assert_se(setxattr(p, "system.posix_acl_default", default_acl, sizeof(default_acl), 0) >= 0); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0775); /* acl change changed the mode too */ + assert_se(st.st_uid == uid); + assert_se(st.st_gid == gid); + assert_se(has_xattr(p)); + + assert_se(path_chown_recursive(t, 1, 2, 07777, 0) >= 0); + + p = strjoina(t, "/dir"); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0775); + assert_se(st.st_uid == 1); + assert_se(st.st_gid == 2); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/symlink"); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISLNK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0777); + assert_se(st.st_uid == 1); + assert_se(st.st_gid == 2); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/reg"); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISREG(st.st_mode)); + assert_se((st.st_mode & 07777) == 0755); + assert_se(st.st_uid == 1); + 
assert_se(st.st_gid == 2); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/sock"); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISSOCK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0755); + assert_se(st.st_uid == 1); + assert_se(st.st_gid == 2); + assert_se(!has_xattr(p)); + + p = strjoina(t, "/dir/fifo"); + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISFIFO(st.st_mode)); + assert_se((st.st_mode & 07777) == 0755); + assert_se(st.st_uid == 1); + assert_se(st.st_gid == 2); + assert_se(!has_xattr(p)); +} + +static int intro(void) { + if (geteuid() != 0) + return log_tests_skipped("not running as root"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-clock.c b/src/test/test-clock.c new file mode 100644 index 0000000..123831a --- /dev/null +++ b/src/test/test-clock.c @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2016 Canonical Ltd. +***/ + +#include +#include + +#include "clock-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(clock_is_localtime) { + _cleanup_(unlink_tempfilep) char adjtime[] = "/tmp/test-adjtime.XXXXXX"; + _cleanup_fclose_ FILE* f = NULL; + + static const struct scenario { + const char* contents; + int expected_result; + } scenarios[] = { + /* adjtime configures UTC */ + {"0.0 0 0\n0\nUTC\n", 0}, + /* adjtime configures local time */ + {"0.0 0 0\n0\nLOCAL\n", 1}, + /* no final EOL */ + {"0.0 0 0\n0\nUTC", 0}, + {"0.0 0 0\n0\nLOCAL", 1}, + /* empty value -> defaults to UTC */ + {"0.0 0 0\n0\n", 0}, + /* unknown value -> defaults to UTC */ + {"0.0 0 0\n0\nFOO\n", 0}, + /* no third line */ + {"0.0 0 0", 0}, + {"0.0 0 0\n", 0}, + {"0.0 0 0\n0", 0}, + }; + + /* without an adjtime file we default to UTC */ + assert_se(clock_is_localtime("/nonexisting/adjtime") == 0); + + assert_se(fmkostemp_safe(adjtime, "w", &f) == 0); + 
log_info("adjtime test file: %s", adjtime); + + for (size_t i = 0; i < ELEMENTSOF(scenarios); ++i) { + log_info("scenario #%zu:, expected result %i", i, scenarios[i].expected_result); + log_info("%s", scenarios[i].contents); + rewind(f); + assert_se(ftruncate(fileno(f), 0) == 0); + assert_se(write_string_stream(f, scenarios[i].contents, WRITE_STRING_FILE_AVOID_NEWLINE) == 0); + assert_se(clock_is_localtime(adjtime) == scenarios[i].expected_result); + } +} + +/* Test with the real /etc/adjtime */ +TEST(clock_is_localtime_system) { + int r; + r = clock_is_localtime(NULL); + + if (access("/etc/adjtime", R_OK) == 0) { + log_info("/etc/adjtime is readable, clock_is_localtime() == %i", r); + /* if /etc/adjtime exists we expect some answer, no error or + * crash */ + assert_se(IN_SET(r, 0, 1)); + } else + /* default is UTC if there is no /etc/adjtime */ + assert_se(r == 0 || ERRNO_IS_PRIVILEGE(r)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-compare-operator.c b/src/test/test-compare-operator.c new file mode 100644 index 0000000..3d8f46f --- /dev/null +++ b/src/test/test-compare-operator.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "compare-operator.h" +#include "tests.h" + +TEST(parse_compare_operator) { + const char *str_a = "<>version"; + assert_se(parse_compare_operator(&str_a, 0) == COMPARE_UNEQUAL); + const char *str_b = "$=version"; + assert_se(parse_compare_operator(&str_b, 0) == _COMPARE_OPERATOR_INVALID); + assert_se(parse_compare_operator(&str_b, COMPARE_ALLOW_FNMATCH) == COMPARE_FNMATCH_EQUAL); + const char *str_c = "eq oranges"; + assert_se(parse_compare_operator(&str_c, 0) == _COMPARE_OPERATOR_INVALID); + assert_se(parse_compare_operator(&str_c, COMPARE_ALLOW_TEXTUAL) == COMPARE_EQUAL); + const char *str_d = ""; + assert_se(parse_compare_operator(&str_d, 0) == _COMPARE_OPERATOR_INVALID); + const char *str_e = "!=!="; /* parse_compare_operator() moves the pointer */ + 
assert_se(parse_compare_operator(&str_e, COMPARE_EQUAL_BY_STRING) == COMPARE_STRING_UNEQUAL); + assert_se(parse_compare_operator(&str_e, 0) == COMPARE_UNEQUAL); + assert_se(parse_compare_operator(&str_e, 0) == _COMPARE_OPERATOR_INVALID); +} + +TEST(test_order) { + assert_se(!test_order(5, COMPARE_LOWER)); + assert_se(!test_order(5, COMPARE_LOWER_OR_EQUAL)); + assert_se(!test_order(5, COMPARE_EQUAL)); + assert_se(test_order(5, COMPARE_UNEQUAL)); + assert_se(test_order(5, COMPARE_GREATER_OR_EQUAL)); + assert_se(test_order(5, COMPARE_GREATER)); + assert_se(test_order(5, COMPARE_STRING_EQUAL) == -EINVAL); +} + +TEST(version_or_fnmatch_compare) { + assert_se(version_or_fnmatch_compare(COMPARE_STRING_EQUAL, "locale", "locale")); + assert_se(version_or_fnmatch_compare(COMPARE_STRING_UNEQUAL, "locale", "LOCALE")); + assert_se(version_or_fnmatch_compare(COMPARE_FNMATCH_EQUAL, "locaale", "loc*le")); + assert_se(version_or_fnmatch_compare(COMPARE_FNMATCH_UNEQUAL, "locaale", "loc?le")); + assert_se(version_or_fnmatch_compare(COMPARE_GREATER, "local512", "local256")); + assert_se(version_or_fnmatch_compare(COMPARE_LOWER, "local52", "local256")); + assert_se(version_or_fnmatch_compare(_COMPARE_OPERATOR_MAX, "local512", "local256") == -EINVAL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-compress-benchmark.c b/src/test/test-compress-benchmark.c new file mode 100644 index 0000000..1727db8 --- /dev/null +++ b/src/test/test-compress-benchmark.c @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "compress.h" +#include "env-util.h" +#include "macro.h" +#include "memory-util.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "random-util.h" +#include "string-util.h" +#include "tests.h" + +typedef int (compress_t)(const void *src, uint64_t src_size, void *dst, + size_t dst_alloc_size, size_t *dst_size); +typedef int (decompress_t)(const void *src, uint64_t src_size, + void 
**dst, size_t* dst_size, size_t dst_max); + +#if HAVE_COMPRESSION + +static usec_t arg_duration; +static size_t arg_start; + +#define MAX_SIZE (1024*1024LU) +#define PRIME 1048571 /* A prime close enough to one megabyte that mod 4 == 3 */ + +static size_t _permute(size_t x) { + size_t residue; + + if (x >= PRIME) + return x; + + residue = x*x % PRIME; + if (x <= PRIME / 2) + return residue; + else + return PRIME - residue; +} + +static size_t permute(size_t x) { + return _permute((_permute(x) + arg_start) % MAX_SIZE ^ 0xFF345); +} + +/* Allocates a buffer of 'count' bytes and fills it according to 'type': "zeros" (all zero bytes), "simple" (the letters a..z repeated) or "random" (ten equal slices alternating random bytes and zeros). Asserts if the allocation fails or if 'type' is none of the three. The caller owns the returned buffer. */ +static char* make_buf(size_t count, const char *type) { + char *buf; + size_t i; + + buf = malloc(count); + assert_se(buf); + + if (streq(type, "zeros")) + memzero(buf, count); + else if (streq(type, "simple")) + for (i = 0; i < count; i++) + buf[i] = 'a' + i % ('z' - 'a' + 1); + else if (streq(type, "random")) { + size_t step = count / 10; + + random_bytes(buf, step); + memzero(buf + 1*step, step); + random_bytes(buf + 2*step, step); + memzero(buf + 3*step, step); + random_bytes(buf + 4*step, step); + memzero(buf + 5*step, step); + random_bytes(buf + 6*step, step); + memzero(buf + 7*step, step); + random_bytes(buf + 8*step, step); + memzero(buf + 9*step, step); + /* count need not be a multiple of 10: zero the leftover tail so that no byte of the buffer is left uninitialized */ + memzero(buf + 10*step, count - 10*step); + } else + assert_not_reached(); + + return buf; +} + +static void test_compress_decompress(const char* label, const char* type, + compress_t compress, decompress_t decompress) { + usec_t n, n2 = 0; + float dt; + + _cleanup_free_ char *text = NULL, *buf = NULL; + _cleanup_free_ void *buf2 = NULL; + size_t skipped = 0, compressed = 0, total = 0; + + text = make_buf(MAX_SIZE, type); + buf = calloc(MAX_SIZE + 1, 1); + assert_se(text && buf); + + n = now(CLOCK_MONOTONIC); + + for (size_t i = 0; i <= MAX_SIZE; i++) { + size_t j = 0, k = 0, size; + int r; + + size = permute(i); + if (size == 0) + continue; + + log_debug("%s %zu %zu", type, i, size); + + memzero(buf, MIN(size + 1000, MAX_SIZE)); + + r = compress(text, size, buf, size, &j); + /* assume compression must be successful 
except for small or random inputs */ + assert_se(r >= 0 || (size < 2048 && r == -ENOBUFS) || streq(type, "random")); + + /* check for overwrites */ + assert_se(buf[size] == 0); + if (r < 0) { + skipped += size; + continue; + } + + assert_se(j > 0); + if (j >= size) + log_error("%s \"compressed\" %zu -> %zu", label, size, j); + + r = decompress(buf, j, &buf2, &k, 0); + assert_se(r == 0); + assert_se(k == size); + + assert_se(memcmp(text, buf2, size) == 0); + + total += size; + compressed += j; + + n2 = now(CLOCK_MONOTONIC); + if (n2 - n > arg_duration) + break; + } + + dt = (n2-n) / 1e6; + + log_info("%s/%s: compressed & decompressed %zu bytes in %.2fs (%.2fMiB/s), " + "mean compression %.2f%%, skipped %zu bytes", + label, type, total, dt, + total / 1024. / 1024 / dt, + 100 - compressed * 100. / total, + skipped); +} +#endif + +int main(int argc, char *argv[]) { +#if HAVE_COMPRESSION + test_setup_logging(LOG_INFO); + + if (argc >= 2) { + unsigned x; + + assert_se(safe_atou(argv[1], &x) >= 0); + arg_duration = x * USEC_PER_SEC; + } else + arg_duration = slow_tests_enabled() ? 
+ 2 * USEC_PER_SEC : USEC_PER_SEC / 50; + + if (argc == 3) + (void) safe_atozu(argv[2], &arg_start); + else + arg_start = getpid_cached(); + + NULSTR_FOREACH(i, "zeros\0simple\0random\0") { +#if HAVE_XZ + test_compress_decompress("XZ", i, compress_blob_xz, decompress_blob_xz); +#endif +#if HAVE_LZ4 + test_compress_decompress("LZ4", i, compress_blob_lz4, decompress_blob_lz4); +#endif +#if HAVE_ZSTD + test_compress_decompress("ZSTD", i, compress_blob_zstd, decompress_blob_zstd); +#endif + } + return 0; +#else + return log_tests_skipped("No compression feature is enabled"); +#endif +} diff --git a/src/test/test-compress.c b/src/test/test-compress.c new file mode 100644 index 0000000..2f20d00 --- /dev/null +++ b/src/test/test-compress.c @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#if HAVE_LZ4 +#include +#endif + +#include "alloc-util.h" +#include "compress.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "memory-util.h" +#include "path-util.h" +#include "random-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +#if HAVE_XZ +# define XZ_OK 0 +#else +# define XZ_OK -EPROTONOSUPPORT +#endif + +#if HAVE_LZ4 +# define LZ4_OK 0 +#else +# define LZ4_OK -EPROTONOSUPPORT +#endif + +#define HUGE_SIZE (4096*1024) + +typedef int (compress_blob_t)(const void *src, uint64_t src_size, + void *dst, size_t dst_alloc_size, size_t *dst_size); +typedef int (decompress_blob_t)(const void *src, uint64_t src_size, + void **dst, + size_t* dst_size, size_t dst_max); +typedef int (decompress_sw_t)(const void *src, uint64_t src_size, + void **buffer, + const void *prefix, size_t prefix_len, + uint8_t extra); + +typedef int (compress_stream_t)(int fdf, int fdt, uint64_t max_bytes, uint64_t *uncompressed_size); +typedef int (decompress_stream_t)(int fdf, int fdt, uint64_t max_size); + +#if HAVE_COMPRESSION +_unused_ static void test_compress_decompress( + const char *compression, + compress_blob_t compress, + 
decompress_blob_t decompress, + const char *data, + size_t data_len, + bool may_fail) { + + char compressed[512]; + size_t csize; + _cleanup_free_ char *decompressed = NULL; + int r; + + log_info("/* testing %s %s blob compression/decompression */", + compression, data); + + r = compress(data, data_len, compressed, sizeof(compressed), &csize); + if (r == -ENOBUFS) { + log_info_errno(r, "compression failed: %m"); + assert_se(may_fail); + } else { + assert_se(r >= 0); + r = decompress(compressed, csize, + (void **) &decompressed, &csize, 0); + assert_se(r == 0); + assert_se(decompressed); + assert_se(memcmp(decompressed, data, data_len) == 0); + } + + r = decompress("garbage", 7, + (void **) &decompressed, &csize, 0); + assert_se(r < 0); + + /* make sure to have the minimal lz4 compressed size */ + r = decompress("00000000\1g", 9, + (void **) &decompressed, &csize, 0); + assert_se(r < 0); + + r = decompress("\100000000g", 9, + (void **) &decompressed, &csize, 0); + assert_se(r < 0); + + explicit_bzero_safe(decompressed, MALLOC_SIZEOF_SAFE(decompressed)); +} + +_unused_ static void test_decompress_startswith(const char *compression, + compress_blob_t compress, + decompress_sw_t decompress_sw, + const char *data, + size_t data_len, + bool may_fail) { + + char *compressed; + _cleanup_free_ char *compressed1 = NULL, *compressed2 = NULL, *decompressed = NULL; + size_t csize, len; + int r; + + log_info("/* testing decompress_startswith with %s on %.20s text */", + compression, data); + +#define BUFSIZE_1 512 +#define BUFSIZE_2 20000 + + compressed = compressed1 = malloc(BUFSIZE_1); + assert_se(compressed1); + r = compress(data, data_len, compressed, BUFSIZE_1, &csize); + if (r == -ENOBUFS) { + log_info_errno(r, "compression failed: %m"); + assert_se(may_fail); + + compressed = compressed2 = malloc(BUFSIZE_2); + assert_se(compressed2); + r = compress(data, data_len, compressed, BUFSIZE_2, &csize); + } + assert_se(r >= 0); + + len = strlen(data); + + r = 
decompress_sw(compressed, csize, (void **) &decompressed, data, len, '\0'); + assert_se(r > 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, data, len, 'w'); + assert_se(r == 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, "barbarbar", 9, ' '); + assert_se(r == 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, data, len - 1, data[len-1]); + assert_se(r > 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, data, len - 1, 'w'); + assert_se(r == 0); + r = decompress_sw(compressed, csize, (void **) &decompressed, data, len, '\0'); + assert_se(r > 0); +} + +_unused_ static void test_decompress_startswith_short(const char *compression, + compress_blob_t compress, + decompress_sw_t decompress_sw) { + +#define TEXT "HUGE=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + + char buf[1024]; + size_t csize; + int r; + + log_info("/* %s with %s */", __func__, compression); + + r = compress(TEXT, sizeof TEXT, buf, sizeof buf, &csize); + assert_se(r >= 0); + + for (size_t i = 1; i < strlen(TEXT); i++) { + _cleanup_free_ void *buf2 = NULL; + + assert_se(buf2 = malloc(i)); + + assert_se(decompress_sw(buf, csize, &buf2, TEXT, i, TEXT[i]) == 1); + assert_se(decompress_sw(buf, csize, &buf2, TEXT, i, 'y') == 0); + } +} + +_unused_ static void test_compress_stream(const char *compression, + const char *cat, + compress_stream_t compress, + decompress_stream_t decompress, + const char *srcfile) { + + _cleanup_close_ int src = -EBADF, dst = -EBADF, dst2 = -EBADF; + _cleanup_(unlink_tempfilep) char + pattern[] = "/tmp/systemd-test.compressed.XXXXXX", + pattern2[] = "/tmp/systemd-test.compressed.XXXXXX"; + int r; + _cleanup_free_ char *cmd = NULL, *cmd2 = NULL; + struct stat st = {}; + uint64_t uncompressed_size; + + r = find_executable(cat, NULL); + if (r < 0) { + log_error_errno(r, "Skipping %s, could not find %s binary: %m", __func__, cat); + 
return; + } + + log_debug("/* testing %s compression */", compression); + + log_debug("/* create source from %s */", srcfile); + + assert_se((src = open(srcfile, O_RDONLY|O_CLOEXEC)) >= 0); + + log_debug("/* test compression */"); + + assert_se((dst = mkostemp_safe(pattern)) >= 0); + + assert_se(compress(src, dst, -1, &uncompressed_size) >= 0); + + if (cat) { + assert_se(asprintf(&cmd, "%s %s | diff %s -", cat, pattern, srcfile) > 0); + assert_se(system(cmd) == 0); + } + + log_debug("/* test decompression */"); + + assert_se((dst2 = mkostemp_safe(pattern2)) >= 0); + + assert_se(stat(srcfile, &st) == 0); + assert_se((uint64_t)st.st_size == uncompressed_size); + + assert_se(lseek(dst, 0, SEEK_SET) == 0); + r = decompress(dst, dst2, st.st_size); + assert_se(r == 0); + + assert_se(asprintf(&cmd2, "diff %s %s", srcfile, pattern2) > 0); + assert_se(system(cmd2) == 0); + + log_debug("/* test faulty decompression */"); + + assert_se(lseek(dst, 1, SEEK_SET) == 1); + r = decompress(dst, dst2, st.st_size); + assert_se(IN_SET(r, 0, -EBADMSG)); + + assert_se(lseek(dst, 0, SEEK_SET) == 0); + assert_se(lseek(dst2, 0, SEEK_SET) == 0); + r = decompress(dst, dst2, st.st_size - 1); + assert_se(r == -EFBIG); +} +#endif + +#if HAVE_LZ4 +static void test_lz4_decompress_partial(void) { /* Compresses a HUGE_SIZE "HUGE=xxx..." buffer with LZ4, then checks full and partial decompression of it. */ + char buf[20000], buf2[100]; + size_t buf_size = sizeof(buf), compressed; + int r; + _cleanup_free_ char *huge = NULL; + + log_debug("/* %s */", __func__); + + assert_se(huge = malloc(HUGE_SIZE)); + memcpy(huge, "HUGE=", STRLEN("HUGE=")); + memset(&huge[STRLEN("HUGE=")], 'x', HUGE_SIZE - STRLEN("HUGE=") - 1); + huge[HUGE_SIZE - 1] = '\0'; + + r = LZ4_compress_default(huge, buf, HUGE_SIZE, buf_size); + assert_se(r > 0); /* LZ4_compress_default() reports failure as 0, never as a negative value */ + compressed = r; + log_info("Compressed %i → %zu", HUGE_SIZE, compressed); + + r = LZ4_decompress_safe(buf, huge, r, HUGE_SIZE); + assert_se(r >= 0); + log_info("Decompressed → %i", r); + + r = LZ4_decompress_safe_partial(buf, huge, + compressed, + 12, HUGE_SIZE); + assert_se(r >= 0); + 
log_info("Decompressed partial %i/%i → %i", 12, HUGE_SIZE, r); + + for (size_t size = 1; size < sizeof(buf2); size++) { + /* This failed in older lz4s but works in newer ones. */ + r = LZ4_decompress_safe_partial(buf, buf2, compressed, size, size); + log_info("Decompressed partial %zu/%zu → %i (%s)", size, size, r, + r < 0 ? "bad" : "good"); + if (r >= 0 && LZ4_versionNumber() >= 10803) + /* lz4 <= 1.8.2 should fail that test, let's only check for newer ones */ + assert_se(memcmp(buf2, huge, r) == 0); + } +} +#endif + +int main(int argc, char *argv[]) { +#if HAVE_COMPRESSION + _unused_ const char text[] = + "text\0foofoofoofoo AAAA aaaaaaaaa ghost busters barbarbar FFF" + "foofoofoofoo AAAA aaaaaaaaa ghost busters barbarbar FFF"; + + /* The file to test compression on can be specified as the first argument */ + const char *srcfile = argc > 1 ? argv[1] : argv[0]; + + char data[512] = "random\0"; + + _cleanup_free_ char *huge = NULL; + + assert_se(huge = malloc(HUGE_SIZE)); + memcpy(huge, "HUGE=", STRLEN("HUGE=")); + memset(&huge[STRLEN("HUGE=")], 'x', HUGE_SIZE - STRLEN("HUGE=") - 1); + huge[HUGE_SIZE - 1] = '\0'; + + test_setup_logging(LOG_DEBUG); + + random_bytes(data + 7, sizeof(data) - 7); + +#if HAVE_XZ + test_compress_decompress("XZ", compress_blob_xz, decompress_blob_xz, + text, sizeof(text), false); + test_compress_decompress("XZ", compress_blob_xz, decompress_blob_xz, + data, sizeof(data), true); + + test_decompress_startswith("XZ", + compress_blob_xz, decompress_startswith_xz, + text, sizeof(text), false); + test_decompress_startswith("XZ", + compress_blob_xz, decompress_startswith_xz, + data, sizeof(data), true); + test_decompress_startswith("XZ", + compress_blob_xz, decompress_startswith_xz, + huge, HUGE_SIZE, true); + + test_compress_stream("XZ", "xzcat", + compress_stream_xz, decompress_stream_xz, srcfile); + + test_decompress_startswith_short("XZ", compress_blob_xz, decompress_startswith_xz); + +#else + log_info("/* XZ test skipped */"); +#endif + 
+#if HAVE_LZ4 + test_compress_decompress("LZ4", compress_blob_lz4, decompress_blob_lz4, + text, sizeof(text), false); + test_compress_decompress("LZ4", compress_blob_lz4, decompress_blob_lz4, + data, sizeof(data), true); + + test_decompress_startswith("LZ4", + compress_blob_lz4, decompress_startswith_lz4, + text, sizeof(text), false); + test_decompress_startswith("LZ4", + compress_blob_lz4, decompress_startswith_lz4, + data, sizeof(data), true); + test_decompress_startswith("LZ4", + compress_blob_lz4, decompress_startswith_lz4, + huge, HUGE_SIZE, true); + + test_compress_stream("LZ4", "lz4cat", + compress_stream_lz4, decompress_stream_lz4, srcfile); + + test_lz4_decompress_partial(); + + test_decompress_startswith_short("LZ4", compress_blob_lz4, decompress_startswith_lz4); + +#else + log_info("/* LZ4 test skipped */"); +#endif + +#if HAVE_ZSTD + test_compress_decompress("ZSTD", compress_blob_zstd, decompress_blob_zstd, + text, sizeof(text), false); + test_compress_decompress("ZSTD", compress_blob_zstd, decompress_blob_zstd, + data, sizeof(data), true); + + test_decompress_startswith("ZSTD", + compress_blob_zstd, decompress_startswith_zstd, + text, sizeof(text), false); + test_decompress_startswith("ZSTD", + compress_blob_zstd, decompress_startswith_zstd, + data, sizeof(data), true); + test_decompress_startswith("ZSTD", + compress_blob_zstd, decompress_startswith_zstd, + huge, HUGE_SIZE, true); + + test_compress_stream("ZSTD", "zstdcat", + compress_stream_zstd, decompress_stream_zstd, srcfile); + + test_decompress_startswith_short("ZSTD", compress_blob_zstd, decompress_startswith_zstd); +#else + log_info("/* ZSTD test skipped */"); +#endif + + return 0; +#else + return log_tests_skipped("no compression algorithm supported"); +#endif +} diff --git a/src/test/test-condition.c b/src/test/test-condition.c new file mode 100644 index 0000000..bb98761 --- /dev/null +++ b/src/test/test-condition.c @@ -0,0 +1,1496 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + 
+#include +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "apparmor-util.h" +#include "architecture.h" +#include "audit-util.h" +#include "battery-util.h" +#include "cgroup-util.h" +#include "condition.h" +#include "confidential-virt.h" +#include "cpu-set-util.h" +#include "efi-loader.h" +#include "env-util.h" +#include "errno-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "id128-util.h" +#include "ima-util.h" +#include "limits-util.h" +#include "log.h" +#include "macro.h" +#include "nulstr-util.h" +#include "os-util.h" +#include "path-util.h" +#include "process-util.h" +#include "psi-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "set.h" +#include "smack-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "tomoyo-util.h" +#include "uid-alloc-range.h" +#include "user-util.h" +#include "virt.h" + +TEST(condition_test_path) { + Condition *condition; + + condition = condition_new(CONDITION_PATH_EXISTS, "/bin/sh", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_EXISTS, "/bin/s?", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_EXISTS_GLOB, "/bin/s?", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_EXISTS_GLOB, "/bin/s?", false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_EXISTS, "/thiscertainlywontexist", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = 
condition_new(CONDITION_PATH_EXISTS, "/thiscertainlywontexist", false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_DIRECTORY, "/bin", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_DIRECTORY_NOT_EMPTY, "/bin", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_FILE_NOT_EMPTY, "/bin/sh", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_FILE_IS_EXECUTABLE, "/bin/sh", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_FILE_IS_EXECUTABLE, "/etc/passwd", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_MOUNT_POINT, "/proc", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_MOUNT_POINT, "/", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_MOUNT_POINT, "/bin", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_READ_WRITE, "/tmp", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_ENCRYPTED, "/sys", false, false); + 
assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_PATH_IS_SYMBOLIC_LINK, "/dev/stdout", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); +} + +TEST(condition_test_control_group_hierarchy) { + Condition *condition; + int r; + + r = cg_unified(); + if (r == -ENOMEDIUM) { + log_tests_skipped("cgroup not mounted"); + return; + } + assert_se(r >= 0); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, "v1", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == (r < CGROUP_UNIFIED_ALL)); + condition_free(condition); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, "v2", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == (r >= CGROUP_UNIFIED_ALL)); + condition_free(condition); +} + +TEST(condition_test_control_group_controller) { + Condition *condition; + CGroupMask system_mask; + _cleanup_free_ char *controller_name = NULL; + int r; + + r = cg_unified(); + if (r == -ENOMEDIUM) { + log_tests_skipped("cgroup not mounted"); + return; + } + assert_se(r >= 0); + + /* Invalid controllers are ignored */ + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, "thisisnotarealcontroller", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, "thisisnotarealcontroller", false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + assert_se(cg_mask_supported(&system_mask) >= 0); + + /* Individual valid controllers one by one */ + for (CGroupController controller = 0; controller < _CGROUP_CONTROLLER_MAX; controller++) { + const char *local_controller_name = cgroup_controller_to_string(controller); + log_info("chosen 
controller is '%s'", local_controller_name); + if (system_mask & CGROUP_CONTROLLER_TO_MASK(controller)) { + log_info("this controller is available"); + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, local_controller_name, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, local_controller_name, false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + } else { + log_info("this controller is unavailable"); + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, local_controller_name, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, local_controller_name, false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + } + } + + /* Multiple valid controllers at the same time */ + assert_se(cg_mask_to_string(system_mask, &controller_name) >= 0); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, strempty(controller_name), false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_CONTROL_GROUP_CONTROLLER, strempty(controller_name), false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); +} + +TEST(condition_test_ac_power) { + Condition *condition; + + condition = condition_new(CONDITION_AC_POWER, "true", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == on_ac_power()); + condition_free(condition); + + condition = condition_new(CONDITION_AC_POWER, "false", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) 
!= on_ac_power()); + condition_free(condition); + + condition = condition_new(CONDITION_AC_POWER, "false", false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == on_ac_power()); + condition_free(condition); +} + +TEST(condition_test_host) { + _cleanup_free_ char *hostname = NULL; + Condition *condition; + sd_id128_t id; + int r; + + r = sd_id128_get_machine(&id); + if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) + return (void) log_tests_skipped("/etc/machine-id missing"); + assert_se(r >= 0); + + condition = condition_new(CONDITION_HOST, SD_ID128_TO_STRING(id), false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_HOST, "garbage value jjjjjjjjjjjjjj", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_HOST, SD_ID128_TO_STRING(id), false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + hostname = gethostname_malloc(); + assert_se(hostname); + + /* if hostname looks like an id128 then skip testing it */ + if (id128_is_valid(hostname)) + log_notice("hostname is an id128, skipping test"); + else { + condition = condition_new(CONDITION_HOST, hostname, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + } +} + +TEST(condition_test_architecture) { + Condition *condition; + const char *sa; + Architecture a; + + a = uname_architecture(); + assert_se(a >= 0); + + sa = architecture_to_string(a); + assert_se(sa); + + condition = condition_new(CONDITION_ARCHITECTURE, sa, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_ARCHITECTURE, "garbage value", false, false); + assert_se(condition); + 
assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_ARCHITECTURE, sa, false, true); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); +} + +TEST(condition_test_firmware) { + Condition *condition; + + /* Empty parameter */ + condition = condition_new(CONDITION_FIRMWARE, "", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + /* uefi parameter */ + condition = condition_new(CONDITION_FIRMWARE, "uefi", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == is_efi_boot()); + condition_free(condition); +} + +TEST(condition_test_firmware_device_tree) { + Condition *condition; + bool is_device_tree_system; + + /* device-tree parameter */ + is_device_tree_system = (access("/sys/firmware/devicetree/", F_OK) == 0); + + condition = condition_new(CONDITION_FIRMWARE, "device-tree", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == is_device_tree_system); + condition_free(condition); + + /* device-tree-compatible parameter */ + if (!is_device_tree_system) { + condition = condition_new(CONDITION_FIRMWARE, "device-tree-compatible()", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + } else { + _cleanup_free_ char *dtcompat = NULL; + _cleanup_strv_free_ char **dtcompatlist = NULL; + size_t dtcompat_size; + int r; + + r = read_full_virtual_file("/proc/device-tree/compatible", &dtcompat, &dtcompat_size); + if (r < 0) { + condition = condition_new(CONDITION_FIRMWARE, "device-tree-compatible()", false, false); + assert_se(condition); + if (r == -ENOENT) + assert_se(condition_test(condition, environ) == 0); + else + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + return; + } + + dtcompatlist = 
strv_parse_nulstr(dtcompat, dtcompat_size); + + STRV_FOREACH(c, dtcompatlist) { + _cleanup_free_ char *expression = NULL; + + assert_se(expression = strjoin("device-tree-compatible(", *c, ")")); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + } + } +} + +TEST(condition_test_firmware_smbios) { + Condition *condition; + _cleanup_free_ char *bios_vendor = NULL, *bios_version = NULL; + const char *expression; + + /* smbios-field parameter */ + /* Test some malformed smbios-field arguments */ + condition = condition_new(CONDITION_FIRMWARE, "smbios-field()", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_FIRMWARE, "smbios-field(malformed)", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_FIRMWARE, "smbios-field(malformed", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_FIRMWARE, "smbios-field(malformed=)", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_FIRMWARE, "smbios-field(malformed=)", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_FIRMWARE, "smbios-field(not_existing=nothing garbage)", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + /* Test not existing SMBIOS field */ + condition = condition_new(CONDITION_FIRMWARE, "smbios-field(not_existing=nothing)", 
false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + /* Test with bios_vendor, if available */ + if (read_virtual_file("/sys/class/dmi/id/bios_vendor", SIZE_MAX, &bios_vendor, NULL) <= 0) + return; + + /* remove trailing newline */ + strstrip(bios_vendor); + + /* Check if the bios_vendor contains any spaces we should quote */ + const char *quote = strchr(bios_vendor, ' ') ? "\"" : ""; + + /* Test equality / inequality using fnmatch() */ + expression = strjoina("smbios-field(bios_vendor $= ", quote, bios_vendor, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_vendor$=", quote, bios_vendor, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_vendor !$= ", quote, bios_vendor, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_vendor!$=", quote, bios_vendor, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_vendor $= ", quote, bios_vendor, "*", quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + /* Test version comparison with bios_version, if available */ + if 
(read_virtual_file("/sys/class/dmi/id/bios_version", SIZE_MAX, &bios_version, NULL) <= 0) + return; + + /* remove trailing newline */ + strstrip(bios_version); + + /* Check if the bios_version contains any spaces we should quote */ + quote = strchr(bios_version, ' ') ? "\"" : ""; + + expression = strjoina("smbios-field(bios_version = ", quote, bios_version, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_version != ", quote, bios_version, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_version <= ", quote, bios_version, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_version >= ", quote, bios_version, quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_version < ", quote, bios_version, ".1", quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + expression = strjoina("smbios-field(bios_version > ", quote, bios_version, ".1", quote, ")"); + condition = condition_new(CONDITION_FIRMWARE, expression, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); +} + +TEST(condition_test_kernel_command_line) { /* Checks that CONDITION_KERNEL_COMMAND_LINE evaluates to 0 for two parameters not expected on the kernel command line; returns early if the first evaluation fails with a privilege error. */ + Condition 
*condition; + int r; + + condition = condition_new(CONDITION_KERNEL_COMMAND_LINE, "thisreallyshouldntbeonthekernelcommandline", false, false); + assert_se(condition); + r = condition_test(condition, environ); + condition_free(condition); /* released before the early return below, so the unprivileged skip path does not leak it */ + if (ERRNO_IS_PRIVILEGE(r)) + return; + assert_se(r == 0); + + condition = condition_new(CONDITION_KERNEL_COMMAND_LINE, "andthis=neither", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); +} + +TEST(condition_test_kernel_version) { + Condition *condition; + struct utsname u; + const char *v; + + condition = condition_new(CONDITION_KERNEL_VERSION, "*thisreallyshouldntbeinthekernelversion*", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "*", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + /* An artificially empty condition. It evaluates to true, but normally + * such condition cannot be created, because the condition list is reset instead. 
*/ + condition = condition_new(CONDITION_KERNEL_VERSION, "", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + assert_se(uname(&u) >= 0); + + condition = condition_new(CONDITION_KERNEL_VERSION, u.release, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + strshorten(u.release, 4); + strcpy(strchr(u.release, 0), "*"); + + condition = condition_new(CONDITION_KERNEL_VERSION, u.release, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + /* 0.1.2 would be a very very very old kernel */ + condition = condition_new(CONDITION_KERNEL_VERSION, "> 0.1.2", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, ">0.1.2", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "'>0.1.2' '<9.0.0'", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "> 0.1.2 < 9.0.0", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, ">", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, ">= 0.1.2", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "< 0.1.2", false, false); + assert_se(condition); + 
assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "<= 0.1.2", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "= 0.1.2", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + /* 4711.8.15 is a very very very future kernel */ + condition = condition_new(CONDITION_KERNEL_VERSION, "< 4711.8.15", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "<= 4711.8.15", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "= 4711.8.15", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, "> 4711.8.15", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_KERNEL_VERSION, " >= 4711.8.15", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + assert_se(uname(&u) >= 0); + + v = strjoina(">=", u.release); + condition = condition_new(CONDITION_KERNEL_VERSION, v, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + v = strjoina("= ", u.release); + condition = condition_new(CONDITION_KERNEL_VERSION, v, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + v = strjoina("<=", u.release); + condition = 
condition_new(CONDITION_KERNEL_VERSION, v, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + v = strjoina("> ", u.release); /* strictly greater than the running release: expected false */ + condition = condition_new(CONDITION_KERNEL_VERSION, v, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + v = strjoina("< ", u.release); /* strictly less than the running release: expected false */ + condition = condition_new(CONDITION_KERNEL_VERSION, v, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); +} + +TEST(condition_test_credential) { /* Checks CONDITION_CREDENTIAL lookups. The two credential directory variables are saved in d1 and d2, unset for the first checks, then pointed at fresh temporary directories n1 and n2; the saved values are put back at the end of the test. */ + _cleanup_(rm_rf_physical_and_freep) char *n1 = NULL, *n2 = NULL; + _cleanup_free_ char *d1 = NULL, *d2 = NULL, *j = NULL; + Condition *condition; + + assert_se(free_and_strdup(&d1, getenv("CREDENTIALS_DIRECTORY")) >= 0); + assert_se(free_and_strdup(&d2, getenv("ENCRYPTED_CREDENTIALS_DIRECTORY")) >= 0); + + assert_se(unsetenv("CREDENTIALS_DIRECTORY") >= 0); + assert_se(unsetenv("ENCRYPTED_CREDENTIALS_DIRECTORY") >= 0); + + condition = condition_new(CONDITION_CREDENTIAL, "definitelymissing", /* trigger= */ false, /* negate= */ false); /* with both variables unset the lookup is expected to test false */ + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + /* invalid */ + condition = condition_new(CONDITION_CREDENTIAL, "..", /* trigger= */ false, /* negate= */ false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + assert_se(mkdtemp_malloc(NULL, &n1) >= 0); + assert_se(mkdtemp_malloc(NULL, &n2) >= 0); + + assert_se(setenv("CREDENTIALS_DIRECTORY", n1, /* overwrite= */ true) >= 0); + assert_se(setenv("ENCRYPTED_CREDENTIALS_DIRECTORY", n2, /* overwrite= */ true) >= 0); + + condition = condition_new(CONDITION_CREDENTIAL, "stillmissing", /* trigger= */ false, /* negate= */ false); /* both directories exist now but are still empty */ + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + assert_se(j 
= path_join(n1, "existing")); + assert_se(touch(j) >= 0); + assert_se(j); + condition = condition_new(CONDITION_CREDENTIAL, "existing", /* trigger= */ false, /* negate= */ false); /* a file of this name was just created inside n1 */ + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + free(j); /* release the first path by hand before j is assigned again below */ + + assert_se(j = path_join(n2, "existing-encrypted")); + assert_se(touch(j) >= 0); + assert_se(j); + condition = condition_new(CONDITION_CREDENTIAL, "existing-encrypted", /* trigger= */ false, /* negate= */ false); /* a file of this name was just created inside n2 */ + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + assert_se(set_unset_env("CREDENTIALS_DIRECTORY", d1, /* overwrite= */ true) >= 0); /* put back the value saved in d1 at the start of the test */ + assert_se(set_unset_env("ENCRYPTED_CREDENTIALS_DIRECTORY", d2, /* overwrite= */ true) >= 0); /* put back the value saved in d2 at the start of the test */ +} + +#if defined(__i386__) || defined(__x86_64__) +TEST(condition_test_cpufeature) { /* Only compiled on i386 and x86_64. Expects fpu to test true and the two other names to test false. */ + Condition *condition; + + condition = condition_new(CONDITION_CPU_FEATURE, "fpu", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_FEATURE, "somecpufeaturethatreallydoesntmakesense", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_FEATURE, "a", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); +} +#endif + +TEST(condition_test_security) { /* Compares CONDITION_SECURITY results with the matching detection helpers; an unknown name must test false. */ + Condition *condition; + + condition = condition_new(CONDITION_SECURITY, "garbage oifdsjfoidsjoj", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "selinux", false, true); /* last argument is true here; the labelled calls in condition_test_credential name that argument negate, hence the != comparison below */ + assert_se(condition); + assert_se(condition_test(condition, environ) != mac_selinux_use()); + condition_free(condition); + + condition = 
condition_new(CONDITION_SECURITY, "apparmor", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == mac_apparmor_use()); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "tomoyo", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == mac_tomoyo_use()); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "ima", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == use_ima()); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "smack", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == mac_smack_use()); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "audit", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == use_audit()); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "uefi-secureboot", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == is_efi_secure_boot()); + condition_free(condition); + + condition = condition_new(CONDITION_SECURITY, "cvm", false, false); /* expected to test true exactly when the detected type is anything other than CONFIDENTIAL_VIRTUALIZATION_NONE */ + assert_se(condition); + assert_se(condition_test(condition, environ) == + (detect_confidential_virtualization() != CONFIDENTIAL_VIRTUALIZATION_NONE)); + condition_free(condition); +} + +TEST(print_securities) { /* Makes no assertions: logs, at info level, the value of each detection helper that condition_test_security compares against. */ + log_info("------ enabled security technologies ------"); + log_info("SELinux: %s", yes_no(mac_selinux_use())); + log_info("AppArmor: %s", yes_no(mac_apparmor_use())); + log_info("Tomoyo: %s", yes_no(mac_tomoyo_use())); + log_info("IMA: %s", yes_no(use_ima())); + log_info("SMACK: %s", yes_no(mac_smack_use())); + log_info("Audit: %s", yes_no(use_audit())); + log_info("UEFI secure boot: %s", yes_no(is_efi_secure_boot())); + log_info("Confidential VM: %s", yes_no + (detect_confidential_virtualization() != CONFIDENTIAL_VIRTUALIZATION_NONE)); + 
log_info("-------------------------------------------"); +} + +TEST(condition_test_virtualization) { /* Checks CONDITION_VIRTUALIZATION for an unknown name, for the container, vm and private-users classes, and for a list of vm type names where only a non-negative result is required. */ + Condition *condition; + int r; + + condition = condition_new(CONDITION_VIRTUALIZATION, "garbage oifdsjfoidsjoj", false, false); + assert_se(condition); + r = condition_test(condition, environ); + if (ERRNO_IS_PRIVILEGE(r)) /* result falls in the privilege error class: end the test quietly. NOTE(review): condition is not freed on this path */ + return; + log_info("ConditionVirtualization=garbage → %i", r); + assert_se(r == 0); + condition_free(condition); + + condition = condition_new(CONDITION_VIRTUALIZATION, "container", false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionVirtualization=container → %i", r); + assert_se(r == !!detect_container()); /* double negation reduces the helper result to 0 or 1 before comparing */ + condition_free(condition); + + condition = condition_new(CONDITION_VIRTUALIZATION, "vm", false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionVirtualization=vm → %i", r); + assert_se(r == (detect_vm() && !detect_container())); /* vm is only expected to test true when no container is detected as well */ + condition_free(condition); + + condition = condition_new(CONDITION_VIRTUALIZATION, "private-users", false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionVirtualization=private-users → %i", r); + assert_se(r == !!running_in_userns()); + condition_free(condition); + + NULSTR_FOREACH(virt, + "kvm\0" + "amazon\0" + "qemu\0" + "bochs\0" + "xen\0" + "uml\0" + "vmware\0" + "oracle\0" + "microsoft\0" + "zvm\0" + "parallels\0" + "bhyve\0" + "vm_other\0") { /* for each listed name the test only requires that evaluation does not fail */ + + condition = condition_new(CONDITION_VIRTUALIZATION, virt, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionVirtualization=%s → %i", virt, r); + assert_se(r >= 0); + condition_free(condition); + } +} + +TEST(condition_test_user) { /* Checks CONDITION_USER with an unknown name, numeric uids, the current user name, a user name picked from the effective uid that is expected not to match, and @system. */ + Condition *condition; + char* uid; + char* username; + int r; + + condition = condition_new(CONDITION_USER, "garbage oifdsjfoidsjoj", false, false); + assert_se(condition); + r = condition_test(condition, environ); + 
log_info("ConditionUser=garbage → %i", r); + assert_se(r == 0); + condition_free(condition); + + assert_se(asprintf(&uid, "%"PRIu32, UINT32_C(0xFFFF)) > 0); + condition = condition_new(CONDITION_USER, uid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionUser=%s → %i", uid, r); + assert_se(r == 0); + condition_free(condition); + free(uid); + + assert_se(asprintf(&uid, "%u", (unsigned)getuid()) > 0); + condition = condition_new(CONDITION_USER, uid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionUser=%s → %i", uid, r); + assert_se(r > 0); + condition_free(condition); + free(uid); + + assert_se(asprintf(&uid, "%u", (unsigned)getuid()+1) > 0); + condition = condition_new(CONDITION_USER, uid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionUser=%s → %i", uid, r); + assert_se(r == 0); + condition_free(condition); + free(uid); + + username = getusername_malloc(); + assert_se(username); + condition = condition_new(CONDITION_USER, username, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionUser=%s → %i", username, r); + assert_se(r > 0); + condition_free(condition); + free(username); + + username = (char*)(geteuid() == 0 ? 
NOBODY_USER_NAME : "root"); + condition = condition_new(CONDITION_USER, username, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionUser=%s → %i", username, r); + assert_se(r == 0); + condition_free(condition); + + condition = condition_new(CONDITION_USER, "@system", false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionUser=@system → %i", r); + if (uid_is_system(getuid()) || uid_is_system(geteuid())) /* expected result depends on whether the real or the effective uid is a system uid */ + assert_se(r > 0); + else + assert_se(r == 0); + condition_free(condition); +} + +TEST(condition_test_group) { /* Checks CONDITION_GROUP with gid 0xFFFF (expected not to match), the current gid, every supplementary group by number and by name, one past the largest gid seen, and NOBODY_GROUP_NAME or root depending on the effective gid. */ + Condition *condition; + char* gid; + char* groupname; + gid_t *gids, max_gid; + int ngroups_max, ngroups, r, i; + + assert_se(0 < asprintf(&gid, "%u", UINT32_C(0xFFFF))); + condition = condition_new(CONDITION_GROUP, gid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionGroup=%s → %i", gid, r); + assert_se(r == 0); + condition_free(condition); + free(gid); + + assert_se(0 < asprintf(&gid, "%u", getgid())); + condition = condition_new(CONDITION_GROUP, gid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionGroup=%s → %i", gid, r); + assert_se(r > 0); + condition_free(condition); + free(gid); + + ngroups_max = sysconf(_SC_NGROUPS_MAX); + assert_se(ngroups_max > 0); + + gids = newa(gid_t, ngroups_max); + + ngroups = getgroups(ngroups_max, gids); + assert_se(ngroups >= 0); + + max_gid = getgid(); /* running maximum over the current gid and all supplementary gids, used after the loop */ + for (i = 0; i < ngroups; i++) { + _cleanup_free_ char *name = NULL; + + assert_se(0 < asprintf(&gid, "%u", gids[i])); + condition = condition_new(CONDITION_GROUP, gid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionGroup=%s → %i", gid, r); + assert_se(r > 0); + condition_free(condition); + free(gid); + max_gid = gids[i] > max_gid ? 
gids[i] : max_gid; + + name = gid_to_name(gids[i]); + assert_se(name); + if (STR_IN_SET(name, "sbuild", "buildd")) + return; /* Debian package build in chroot, groupnames won't match, skip */ + condition = condition_new(CONDITION_GROUP, name, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionGroup=%s → %i", name, r); + assert_se(r > 0); + condition_free(condition); + max_gid = gids[i] > max_gid ? gids[i] : max_gid; /* NOTE(review): repeats the max_gid update already made earlier in this iteration */ + } + + assert_se(0 < asprintf(&gid, "%u", max_gid+1)); /* one above the largest gid seen, so it is expected to match none of our groups */ + condition = condition_new(CONDITION_GROUP, gid, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionGroup=%s → %i", gid, r); + assert_se(r == 0); + condition_free(condition); + free(gid); + + groupname = (char*)(getegid() == 0 ? NOBODY_GROUP_NAME : "root"); + condition = condition_new(CONDITION_GROUP, groupname, false, false); + assert_se(condition); + r = condition_test(condition, environ); + log_info("ConditionGroup=%s → %i", groupname, r); + assert_se(r == 0); + condition_free(condition); +} + +static void test_condition_test_cpus_one(const char *s, bool result) { /* Helper: builds a CONDITION_CPUS condition from expression s and asserts that testing it gives a non-negative value equal to result. */ + Condition *condition; + int r; + + log_debug("%s=%s", condition_type_to_string(CONDITION_CPUS), s); + + condition = condition_new(CONDITION_CPUS, s, false, false); + assert_se(condition); + + r = condition_test(condition, environ); + assert_se(r >= 0); + assert_se(r == result); + condition_free(condition); +} + +TEST(condition_test_cpus) { /* Compares the CPU count from cpus_in_affinity_mask() against 0, 100000 and itself using each comparison operator. */ + _cleanup_free_ char *t = NULL; + int cpus; + + cpus = cpus_in_affinity_mask(); + assert_se(cpus >= 0); + + test_condition_test_cpus_one("> 0", true); + test_condition_test_cpus_one(">= 0", true); + test_condition_test_cpus_one("!= 0", true); + test_condition_test_cpus_one("<= 0", false); + test_condition_test_cpus_one("< 0", false); + test_condition_test_cpus_one("= 0", false); + + test_condition_test_cpus_one("> 100000", false); + test_condition_test_cpus_one("= 100000", false); + 
test_condition_test_cpus_one(">= 100000", false); + test_condition_test_cpus_one("< 100000", true); + test_condition_test_cpus_one("!= 100000", true); + test_condition_test_cpus_one("<= 100000", true); + + assert_se(asprintf(&t, "= %i", cpus) >= 0); /* from here on the expressions compare against the measured count itself */ + test_condition_test_cpus_one(t, true); + t = mfree(t); /* drop the string before t is reused for the next expression */ + + assert_se(asprintf(&t, "<= %i", cpus) >= 0); + test_condition_test_cpus_one(t, true); + t = mfree(t); + + assert_se(asprintf(&t, ">= %i", cpus) >= 0); + test_condition_test_cpus_one(t, true); + t = mfree(t); + + assert_se(asprintf(&t, "!= %i", cpus) >= 0); + test_condition_test_cpus_one(t, false); + t = mfree(t); + + assert_se(asprintf(&t, "< %i", cpus) >= 0); + test_condition_test_cpus_one(t, false); + t = mfree(t); + + assert_se(asprintf(&t, "> %i", cpus) >= 0); + test_condition_test_cpus_one(t, false); + t = mfree(t); +} + +static void test_condition_test_memory_one(const char *s, bool result) { /* Helper: builds a CONDITION_MEMORY condition from expression s and asserts that testing it gives a non-negative value equal to result. */ + Condition *condition; + int r; + + log_debug("%s=%s", condition_type_to_string(CONDITION_MEMORY), s); + + condition = condition_new(CONDITION_MEMORY, s, false, false); + assert_se(condition); + + r = condition_test(condition, environ); + assert_se(r >= 0); + assert_se(r == result); + condition_free(condition); +} + +TEST(condition_test_memory) { /* Compares physical_memory() against 0, a value near the top of the 64-bit range, 100T in several spellings, and itself, using each comparison operator. */ + _cleanup_free_ char *t = NULL; + uint64_t memory; + + memory = physical_memory(); + + test_condition_test_memory_one("> 0", true); + test_condition_test_memory_one(">= 0", true); + test_condition_test_memory_one("!= 0", true); + test_condition_test_memory_one("<= 0", false); + test_condition_test_memory_one("< 0", false); + test_condition_test_memory_one("= 0", false); + + test_condition_test_memory_one("> 18446744073709547520", false); /* 18446744073709547520 equals 2^64 minus 4096 */ + test_condition_test_memory_one("= 18446744073709547520", false); + test_condition_test_memory_one(">= 18446744073709547520", false); + test_condition_test_memory_one("< 18446744073709547520", true); + test_condition_test_memory_one("!= 18446744073709547520", true); + 
test_condition_test_memory_one("<= 18446744073709547520", true); /* 18446744073709547520 equals 2^64 minus 4096 */ + + test_condition_test_memory_one("> 100T", false); /* the expectations in the next three groups assume the machine has less than 100T of memory */ + test_condition_test_memory_one("= 100T", false); + test_condition_test_memory_one(">= 100T", false); + test_condition_test_memory_one("< 100T", true); + test_condition_test_memory_one("!= 100T", true); + test_condition_test_memory_one("<= 100T", true); + + test_condition_test_memory_one("> 100 T", false); /* same checks with a space between number and unit */ + test_condition_test_memory_one("= 100 T", false); + test_condition_test_memory_one(">= 100 T", false); + test_condition_test_memory_one("< 100 T", true); + test_condition_test_memory_one("!= 100 T", true); + test_condition_test_memory_one("<= 100 T", true); + + test_condition_test_memory_one("> 100 T 1 G", false); /* same checks with two number and unit components in one expression */ + test_condition_test_memory_one("= 100 T 1 G", false); + test_condition_test_memory_one(">= 100 T 1 G", false); + test_condition_test_memory_one("< 100 T 1 G", true); + test_condition_test_memory_one("!= 100 T 1 G", true); + test_condition_test_memory_one("<= 100 T 1 G", true); + + assert_se(asprintf(&t, "= %" PRIu64, memory) >= 0); /* from here on the expressions compare against the measured amount itself */ + test_condition_test_memory_one(t, true); + t = mfree(t); /* drop the string before t is reused for the next expression */ + + assert_se(asprintf(&t, "<= %" PRIu64, memory) >= 0); + test_condition_test_memory_one(t, true); + t = mfree(t); + + assert_se(asprintf(&t, ">= %" PRIu64, memory) >= 0); + test_condition_test_memory_one(t, true); + t = mfree(t); + + assert_se(asprintf(&t, "!= %" PRIu64, memory) >= 0); + test_condition_test_memory_one(t, false); + t = mfree(t); + + assert_se(asprintf(&t, "< %" PRIu64, memory) >= 0); + test_condition_test_memory_one(t, false); + t = mfree(t); + + assert_se(asprintf(&t, "> %" PRIu64, memory) >= 0); + test_condition_test_memory_one(t, false); + t = mfree(t); +} + +static void test_condition_test_environment_one(const char *s, bool result) { /* Helper: builds a CONDITION_ENVIRONMENT condition from expression s and asserts that testing it gives a non-negative value equal to result. */ + Condition *condition; + int r; + + log_debug("%s=%s", condition_type_to_string(CONDITION_ENVIRONMENT), s); + + condition = condition_new(CONDITION_ENVIRONMENT, s, false, false); + 
assert_se(condition); + + r = condition_test(condition, environ); + assert_se(r >= 0); + assert_se(r == result); + condition_free(condition); +} + +TEST(condition_test_environment) { /* Checks name-only and name=value matching of CONDITION_ENVIRONMENT for a variable that is missing and for one that is set. */ + assert_se(setenv("EXISTINGENVVAR", "foo", false) >= 0); /* overwrite is false, so a value already present in the environment is kept */ + + test_condition_test_environment_one("MISSINGENVVAR", false); + test_condition_test_environment_one("MISSINGENVVAR=foo", false); + test_condition_test_environment_one("MISSINGENVVAR=", false); + + test_condition_test_environment_one("EXISTINGENVVAR", true); + test_condition_test_environment_one("EXISTINGENVVAR=foo", true); + test_condition_test_environment_one("EXISTINGENVVAR=bar", false); + test_condition_test_environment_one("EXISTINGENVVAR=", false); +} + +TEST(condition_test_os_release) { /* Checks CONDITION_OS_RELEASE: malformed expressions must yield -EINVAL, an empty string tests true, and expressions built from the first two loaded os-release entries and from VERSION_ID must evaluate as expected. Returns early when the entries cannot be loaded or fewer than two are present. */ + _cleanup_strv_free_ char **os_release_pairs = NULL; + _cleanup_free_ char *version_id = NULL; + const char *key_value_pair; + Condition *condition; + + /* Should not happen, but it's a test so we don't know the environment. */ + if (load_os_release_pairs(NULL, &os_release_pairs) < 0) + return; + if (strv_length(os_release_pairs) < 2) + return; + + condition = condition_new(CONDITION_OS_RELEASE, "_THISHOPEFULLYWONTEXIST=01234 56789", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRONG FORMAT", false, false); /* this and the following WRONG or WRO strings are each expected to be rejected with -EINVAL */ + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRONG!<>=FORMAT", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRONG FORMAT=", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRONG =FORMAT", false, false); + 
assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRONG = FORMAT", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRONGFORMAT= ", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "WRO NG=FORMAT", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == -EINVAL); + condition_free(condition); + + condition = condition_new(CONDITION_OS_RELEASE, "", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + /* load_os_release_pairs() removes quotes, we have to add them back, + * otherwise we get a string: "PRETTY_NAME=Debian GNU/Linux 10 (buster)" + * which is wrong, as the value is not quoted anymore. */ + const char *quote = strchr(os_release_pairs[1], ' ') ? 
"\"" : ""; + key_value_pair = strjoina(os_release_pairs[0], "=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + key_value_pair = strjoina(os_release_pairs[0], "!=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + /* Test fnmatch() operators */ + key_value_pair = strjoina(os_release_pairs[0], "$=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + key_value_pair = strjoina(os_release_pairs[0], "!$=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + /* Some distros (eg: Arch) do not set VERSION_ID */ + if (parse_os_release(NULL, "VERSION_ID", &version_id) <= 0) + return; + + key_value_pair = strjoina("VERSION_ID", "=", version_id); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "!=", version_id); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "<=", version_id); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, 
environ) > 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", ">=", version_id); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "<", version_id, ".1"); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", ">", version_id, ".1"); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "=", version_id, " ", os_release_pairs[0], "=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "!=", version_id, " ", os_release_pairs[0], "=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "=", version_id, " ", os_release_pairs[0], "!=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "!=", version_id, " ", os_release_pairs[0], "!=", quote, os_release_pairs[1], quote); + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + 
assert_se(condition_test(condition, environ) == 0); + condition_free(condition); + + key_value_pair = strjoina("VERSION_ID", "<", version_id, ".1", " ", os_release_pairs[0], "=", quote, os_release_pairs[1], quote); /* two space-separated expressions in one condition string; both are expected to hold here */ + condition = condition_new(CONDITION_OS_RELEASE, key_value_pair, false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); +} + +TEST(condition_test_psi) { /* Checks the CONDITION_MEMORY_PRESSURE, CONDITION_CPU_PRESSURE and CONDITION_IO_PRESSURE expressions: malformed limits must give a negative result, well-formed ones a non-negative one. The slice-qualified checks at the end only run on the unified cgroup hierarchy with the memory and CPU controllers available. */ + Condition *condition; + CGroupMask mask; + int r; + + if (!is_pressure_supported()) /* nothing can be checked without PSI support, so log a notice and skip */ + return (void) log_notice("Pressure Stall Information (PSI) is not supported, skipping %s", __func__); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "", false, false); /* this and the following strings on this line of checks are each expected to be rejected with a negative result */ + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "sbarabau", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "10%sbarabau", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "10% sbarabau", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "-10", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "10%/10min", false, false); /* NOTE(review): rejected although the shape matches accepted forms; presumably 10min is not a supported averaging window, confirm against the parser */ + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "10min/10%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "10% 
5min", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "/5min", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_IO_PRESSURE, "10s / ", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "100%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "0%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "0.0%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "100%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "0%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "0.0%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "0.01%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "0.0%/10sec", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = 
condition_new(CONDITION_CPU_PRESSURE, "100.0% / 1min", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_IO_PRESSURE, "50.0% / 1min", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + r = cg_all_unified(); + if (r < 0) + return (void) log_notice("Failed to determine whether the unified cgroups hierarchy is used, skipping %s", __func__); + if (r == 0) + return (void) log_notice("Requires the unified cgroups hierarchy, skipping %s", __func__); + + if (cg_mask_supported(&mask) < 0) + return (void) log_notice("Failed to get supported cgroup controllers, skipping %s", __func__); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return (void) log_notice("Requires the cgroup memory controller, skipping %s", __func__); + + if (!FLAGS_SET(mask, CGROUP_MASK_CPU)) + return (void) log_notice("Requires the cgroup CPU controller, skipping %s", __func__); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, " : / ", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) < 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "hopefullythisisnotarealone.slice:100% / 10sec", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) > 0); + condition_free(condition); + + condition = condition_new(CONDITION_CPU_PRESSURE, "-.slice:100.0% / 1min", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "-.slice:0.0%/5min", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_MEMORY_PRESSURE, "-.slice:100.0%", false, false); + assert_se(condition); + 
assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); + + condition = condition_new(CONDITION_IO_PRESSURE, "-.slice:0.0%", false, false); + assert_se(condition); + assert_se(condition_test(condition, environ) >= 0); + condition_free(condition); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-conf-files.c b/src/test/test-conf-files.c new file mode 100644 index 0000000..4253490 --- /dev/null +++ b/src/test/test-conf-files.c @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2014 Michael Marineau +***/ + +#include +#include + +#include "alloc-util.h" +#include "conf-files.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "macro.h" +#include "mkdir.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(conf_files_list) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF; + _cleanup_strv_free_ char **result = NULL; + const char *search1, *search2, *search1_a, *search1_b, *search1_c, *search2_aa; + + tfd = mkdtemp_open("/tmp/test-conf-files-XXXXXX", O_PATH, &t); + assert_se(tfd >= 0); /* assert_se() like every other check here, so the test still fails properly if assert() is disabled */ + + assert_se(mkdirat(tfd, "dir1", 0755) >= 0); + assert_se(mkdirat(tfd, "dir2", 0755) >= 0); + + search1 = strjoina(t, "/dir1/"); + search2 = strjoina(t, "/dir2/"); + + FOREACH_STRING(p, "a.conf", "b.conf", "c.foo") { + _cleanup_free_ char *path = NULL; + + assert_se(path = path_join(search1, p)); + assert_se(write_string_file(path, "foobar", WRITE_STRING_FILE_CREATE) >= 0); + } + + assert_se(symlinkat("/dev/null", tfd, "dir1/m.conf") >= 0); + + FOREACH_STRING(p, "a.conf", "aa.conf", "m.conf") { + _cleanup_free_ char *path = NULL; + + assert_se(path = path_join(search2, p)); + assert_se(write_string_file(path, "hogehoge", WRITE_STRING_FILE_CREATE) >= 0); + } + + search1_a = strjoina(search1, "a.conf"); + search1_b = strjoina(search1, "b.conf"); + search1_c = 
strjoina(search1, "c.foo"); + search2_aa = strjoina(search2, "aa.conf"); + + /* search dir1 without suffix */ + assert_se(conf_files_list(&result, NULL, NULL, CONF_FILES_FILTER_MASKED, search1) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search1_b, search1_c))); + + result = strv_free(result); + + assert_se(conf_files_list(&result, NULL, t, CONF_FILES_FILTER_MASKED, "/dir1/") >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search1_b, search1_c))); + + result = strv_free(result); + + assert_se(conf_files_list_at(&result, NULL, AT_FDCWD, CONF_FILES_FILTER_MASKED, search1) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search1_b, search1_c))); + + result = strv_free(result); + + assert_se(conf_files_list_at(&result, NULL, tfd, CONF_FILES_FILTER_MASKED, "/dir1/") >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("dir1/a.conf", "dir1/b.conf", "dir1/c.foo"))); + + result = strv_free(result); + + /* search dir1 with suffix */ + assert_se(conf_files_list(&result, ".conf", NULL, CONF_FILES_FILTER_MASKED, search1) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search1_b))); + + result = strv_free(result); + + assert_se(conf_files_list(&result, ".conf", t, CONF_FILES_FILTER_MASKED, "/dir1/") >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search1_b))); + + result = strv_free(result); + + assert_se(conf_files_list_at(&result, ".conf", AT_FDCWD, CONF_FILES_FILTER_MASKED, search1) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search1_b))); + + result = strv_free(result); + + assert_se(conf_files_list_at(&result, ".conf", tfd, CONF_FILES_FILTER_MASKED, "/dir1/") >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("dir1/a.conf", "dir1/b.conf"))); + + result = strv_free(result); + + /* search two dirs */ + 
assert_se(conf_files_list_strv(&result, ".conf", NULL, CONF_FILES_FILTER_MASKED, STRV_MAKE_CONST(search1, search2)) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search2_aa, search1_b))); + + result = strv_free(result); + + assert_se(conf_files_list_strv(&result, ".conf", t, CONF_FILES_FILTER_MASKED, STRV_MAKE_CONST("/dir1/", "/dir2/")) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search2_aa, search1_b))); + + result = strv_free(result); + + assert_se(conf_files_list_strv_at(&result, ".conf", AT_FDCWD, CONF_FILES_FILTER_MASKED, STRV_MAKE_CONST(search1, search2)) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE(search1_a, search2_aa, search1_b))); + + result = strv_free(result); + + assert_se(conf_files_list_strv_at(&result, ".conf", tfd, CONF_FILES_FILTER_MASKED, STRV_MAKE_CONST("/dir1/", "/dir2/")) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("dir1/a.conf", "dir2/aa.conf", "dir1/b.conf"))); + + result = strv_free(result); + + /* filename only */ + assert_se(conf_files_list_strv(&result, ".conf", NULL, CONF_FILES_FILTER_MASKED | CONF_FILES_BASENAME, STRV_MAKE_CONST(search1, search2)) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("a.conf", "aa.conf", "b.conf"))); + + result = strv_free(result); + + assert_se(conf_files_list_strv(&result, ".conf", t, CONF_FILES_FILTER_MASKED | CONF_FILES_BASENAME, STRV_MAKE_CONST("/dir1/", "/dir2/")) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("a.conf", "aa.conf", "b.conf"))); + + result = strv_free(result); + + assert_se(conf_files_list_strv_at(&result, ".conf", AT_FDCWD, CONF_FILES_FILTER_MASKED | CONF_FILES_BASENAME, STRV_MAKE_CONST(search1, search2)) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("a.conf", "aa.conf", "b.conf"))); + + result = strv_free(result); + + assert_se(conf_files_list_strv_at(&result, ".conf", tfd, 
CONF_FILES_FILTER_MASKED | CONF_FILES_BASENAME, STRV_MAKE_CONST("/dir1/", "/dir2/")) >= 0); + strv_print(result); + assert_se(strv_equal(result, STRV_MAKE("a.conf", "aa.conf", "b.conf"))); +} + +static void test_conf_files_insert_one(const char *root) { + _cleanup_strv_free_ char **s = NULL; + + log_info("/* %s root=%s */", __func__, strempty(root)); + + char **dirs = STRV_MAKE("/dir1", "/dir2", "/dir3"); + + _cleanup_free_ const char + *foo1 = path_join(root, "/dir1/foo.conf"), + *foo2 = path_join(root, "/dir2/foo.conf"), + *bar2 = path_join(root, "/dir2/bar.conf"), + *zzz3 = path_join(root, "/dir3/zzz.conf"), + *whatever = path_join(root, "/whatever.conf"); + + assert_se(conf_files_insert(&s, root, dirs, "/dir2/foo.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(foo2))); + + /* The same file again, https://github.com/systemd/systemd/issues/11124 */ + assert_se(conf_files_insert(&s, root, dirs, "/dir2/foo.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(foo2))); + + /* Lower priority → new entry is ignored */ + assert_se(conf_files_insert(&s, root, dirs, "/dir3/foo.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(foo2))); + + /* Higher priority → new entry replaces */ + assert_se(conf_files_insert(&s, root, dirs, "/dir1/foo.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(foo1))); + + /* Earlier basename */ + assert_se(conf_files_insert(&s, root, dirs, "/dir2/bar.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(bar2, foo1))); + + /* Later basename */ + assert_se(conf_files_insert(&s, root, dirs, "/dir3/zzz.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(bar2, foo1, zzz3))); + + /* All lower priority → all ignored */ + assert_se(conf_files_insert(&s, root, dirs, "/dir3/zzz.conf") == 0); + assert_se(conf_files_insert(&s, root, dirs, "/dir2/bar.conf") == 0); + assert_se(conf_files_insert(&s, root, dirs, "/dir3/bar.conf") == 0); + assert_se(conf_files_insert(&s, root, dirs, "/dir2/foo.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(bar2, foo1, zzz3))); + + 
/* Two entries that don't match any of the directories, but match basename */ + assert_se(conf_files_insert(&s, root, dirs, "/dir4/zzz.conf") == 0); + assert_se(conf_files_insert(&s, root, dirs, "/zzz.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(bar2, foo1, zzz3))); + + /* An entry that doesn't match any of the directories, no match at all */ + assert_se(conf_files_insert(&s, root, dirs, "/whatever.conf") == 0); + assert_se(strv_equal(s, STRV_MAKE(bar2, foo1, whatever, zzz3))); +} + +TEST(conf_files_insert) { + test_conf_files_insert_one(NULL); + test_conf_files_insert_one("/root"); + test_conf_files_insert_one("/root/"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-conf-parser.c b/src/test/test-conf-parser.c new file mode 100644 index 0000000..0acb413 --- /dev/null +++ b/src/test/test-conf-parser.c @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "conf-parser.h" +#include "fd-util.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static void test_config_parse_path_one(const char *rvalue, const char *expected) { + _cleanup_free_ char *path = NULL; + + assert_se(config_parse_path("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &path, NULL) >= 0); + assert_se(streq_ptr(expected, path)); +} + +static void test_config_parse_log_level_one(const char *rvalue, int expected) { + int log_level = 0; + + assert_se(config_parse_log_level("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &log_level, NULL) >= 0); + assert_se(expected == log_level); +} + +static void test_config_parse_log_facility_one(const char *rvalue, int expected) { + int log_facility = 0; + + assert_se(config_parse_log_facility("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &log_facility, NULL) >= 0); + assert_se(expected == log_facility); +} + +static void test_config_parse_iec_size_one(const char *rvalue, size_t 
expected) { + size_t iec_size = 0; + + assert_se(config_parse_iec_size("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &iec_size, NULL) >= 0); + assert_se(expected == iec_size); +} + +static void test_config_parse_si_uint64_one(const char *rvalue, uint64_t expected) { + uint64_t si_uint64 = 0; + + assert_se(config_parse_si_uint64("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &si_uint64, NULL) >= 0); + assert_se(expected == si_uint64); +} + +static void test_config_parse_int_one(const char *rvalue, int expected) { + int v = -1; + + assert_se(config_parse_int("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &v, NULL) >= 0); + assert_se(expected == v); +} + +static void test_config_parse_unsigned_one(const char *rvalue, unsigned expected) { + unsigned v = 0; + + assert_se(config_parse_unsigned("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &v, NULL) >= 0); + assert_se(expected == v); +} + +static void test_config_parse_strv_one(const char *rvalue, char **expected) { + _cleanup_strv_free_ char **strv = NULL; + + assert_se(config_parse_strv("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &strv, NULL) >= 0); + assert_se(strv_equal(expected, strv)); +} + +static void test_config_parse_mode_one(const char *rvalue, mode_t expected) { + mode_t v = 0; + + assert_se(config_parse_mode("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &v, NULL) >= 0); + assert_se(expected == v); +} + +static void test_config_parse_sec_one(const char *rvalue, usec_t expected) { + usec_t v = 0; + + assert_se(config_parse_sec("unit", "filename", 1, "section", 1, "lvalue", 0, rvalue, &v, NULL) >= 0); + assert_se(expected == v); +} + +static void test_config_parse_nsec_one(const char *rvalue, nsec_t expected) { + nsec_t v = 0; + + assert_se(config_parse_nsec("unit", "filename", 1, "nsection", 1, "lvalue", 0, rvalue, &v, NULL) >= 0); + assert_se(expected == v); +} + +TEST(config_parse_path) { + test_config_parse_path_one("/path", "/path"); + 
test_config_parse_path_one("/path//////////", "/path"); + test_config_parse_path_one("///path/foo///bar////bar//", "/path/foo/bar/bar"); + test_config_parse_path_one("/path//./////hogehoge///.", "/path/hogehoge"); + test_config_parse_path_one("/path/\xc3\x80", "/path/\xc3\x80"); + + test_config_parse_path_one("not_absolute/path", NULL); + test_config_parse_path_one("/path/\xc3\x7f", NULL); +} + +TEST(config_parse_log_level) { + test_config_parse_log_level_one("debug", LOG_DEBUG); + test_config_parse_log_level_one("info", LOG_INFO); + + test_config_parse_log_level_one("garbage", 0); +} + +TEST(config_parse_log_facility) { + test_config_parse_log_facility_one("mail", LOG_MAIL); + test_config_parse_log_facility_one("user", LOG_USER); + + test_config_parse_log_facility_one("garbage", 0); +} + +TEST(config_parse_iec_size) { + test_config_parse_iec_size_one("1024", 1024); + test_config_parse_iec_size_one("2K", 2048); + test_config_parse_iec_size_one("10M", 10 * 1024 * 1024); + test_config_parse_iec_size_one("1G", 1 * 1024 * 1024 * 1024); + test_config_parse_iec_size_one("0G", 0); + test_config_parse_iec_size_one("0", 0); + + test_config_parse_iec_size_one("-982", 0); + test_config_parse_iec_size_one("49874444198739873000000G", 0); + test_config_parse_iec_size_one("garbage", 0); +} + +TEST(config_parse_si_uint64) { + test_config_parse_si_uint64_one("1024", 1024); + test_config_parse_si_uint64_one("2K", 2000); + test_config_parse_si_uint64_one("10M", 10 * 1000 * 1000); + test_config_parse_si_uint64_one("1G", 1 * 1000 * 1000 * 1000); + test_config_parse_si_uint64_one("0G", 0); + test_config_parse_si_uint64_one("0", 0); + + test_config_parse_si_uint64_one("-982", 0); + test_config_parse_si_uint64_one("49874444198739873000000G", 0); + test_config_parse_si_uint64_one("garbage", 0); +} + +TEST(config_parse_int) { + test_config_parse_int_one("1024", 1024); + test_config_parse_int_one("-1024", -1024); + test_config_parse_int_one("0", 0); + + 
test_config_parse_int_one("99999999999999999999999999999999999999999999999999999999", -1); + test_config_parse_int_one("-99999999999999999999999999999999999999999999999999999999", -1); + test_config_parse_int_one("1G", -1); + test_config_parse_int_one("garbage", -1); +} + +TEST(config_parse_unsigned) { + test_config_parse_unsigned_one("10241024", 10241024); + test_config_parse_unsigned_one("1024", 1024); + test_config_parse_unsigned_one("0", 0); + + test_config_parse_unsigned_one("99999999999999999999999999999999999999999999999999999999", 0); + test_config_parse_unsigned_one("1G", 0); + test_config_parse_unsigned_one("garbage", 0); + test_config_parse_unsigned_one("1000garbage", 0); +} + +TEST(config_parse_strv) { + test_config_parse_strv_one("", STRV_MAKE_EMPTY); + test_config_parse_strv_one("foo", STRV_MAKE("foo")); + test_config_parse_strv_one("foo bar foo", STRV_MAKE("foo", "bar", "foo")); + test_config_parse_strv_one("\"foo bar\" foo", STRV_MAKE("foo bar", "foo")); + test_config_parse_strv_one("\xc3\x80", STRV_MAKE("\xc3\x80")); + test_config_parse_strv_one("\xc3\x7f", STRV_MAKE("\xc3\x7f")); +} + +TEST(config_parse_mode) { + test_config_parse_mode_one("777", 0777); + test_config_parse_mode_one("644", 0644); + + test_config_parse_mode_one("-777", 0); + test_config_parse_mode_one("999", 0); + test_config_parse_mode_one("garbage", 0); + test_config_parse_mode_one("777garbage", 0); + test_config_parse_mode_one("777 garbage", 0); +} + +TEST(config_parse_sec) { + test_config_parse_sec_one("1", 1 * USEC_PER_SEC); + test_config_parse_sec_one("1s", 1 * USEC_PER_SEC); + test_config_parse_sec_one("100ms", 100 * USEC_PER_MSEC); + test_config_parse_sec_one("5min 20s", 5 * 60 * USEC_PER_SEC + 20 * USEC_PER_SEC); + + test_config_parse_sec_one("-1", 0); + test_config_parse_sec_one("10foo", 0); + test_config_parse_sec_one("garbage", 0); +} + +TEST(config_parse_nsec) { + test_config_parse_nsec_one("1", 1); + test_config_parse_nsec_one("1s", 1 * NSEC_PER_SEC); + 
test_config_parse_nsec_one("100ms", 100 * NSEC_PER_MSEC); + test_config_parse_nsec_one("5min 20s", 5 * 60 * NSEC_PER_SEC + 20 * NSEC_PER_SEC); + + test_config_parse_nsec_one("-1", 0); + test_config_parse_nsec_one("10foo", 0); + test_config_parse_nsec_one("garbage", 0); +} + +TEST(config_parse_iec_uint64) { + uint64_t offset = 0; + assert_se(config_parse_iec_uint64(NULL, "/this/file", 11, "Section", 22, "Size", 0, "4M", &offset, NULL) == 0); + assert_se(offset == 4 * 1024 * 1024); + + assert_se(config_parse_iec_uint64(NULL, "/this/file", 11, "Section", 22, "Size", 0, "4.5M", &offset, NULL) == 0); +} + +#define x10(x) x x x x x x x x x x +#define x100(x) x10(x10(x)) +#define x1000(x) x10(x100(x)) + +static const char* const config_file[] = { + "[Section]\n" + "setting1=1\n", + + "[Section]\n" + "setting1=1", /* no terminating newline */ + + "\n\n\n\n[Section]\n\n\n" + "setting1=1", /* some whitespace, no terminating newline */ + + "[Section]\n" + "[Section]\n" + "setting1=1\n" + "setting1= 2 \t\n" + "setting1= 1\n", /* repeated settings */ + + "[Section]\n" + "[Section]\n" + "setting1=1\n" + "setting1=2\\\n" + " \n" /* empty line breaks continuation */ + "setting1=1\n", /* repeated settings */ + + "[Section]\n" + "setting1=1\\\n" /* normal continuation */ + "2\\\n" + "3\n", + + "[Section]\n" + "#hogehoge\\\n" /* continuation is ignored in comment */ + "setting1=1\\\n" /* normal continuation */ + "2\\\n" + "3\n", + + "[Section]\n" + "setting1=1\\\n" /* normal continuation */ + "#hogehoge\\\n" /* commented out line in continuation is ignored */ + "2\\\n" + "3\n", + + "[Section]\n" + " #hogehoge\\\n" /* whitespaces before comments */ + " setting1=1\\\n" /* whitespaces before key */ + "2\\\n" + "3\n", + + "[Section]\n" + " setting1=1\\\n" /* whitespaces before key */ + " #hogehoge\\\n" /* commented out line prefixed with whitespaces in continuation */ + "2\\\n" + "3\n", + + "[Section]\n" + "setting1=1\\\n" /* continuation with extra trailing backslash at the end */ + 
"2\\\n" + "3\\\n", + + "[Section]\n" + "setting1=1\\\\\\\n" /* continuation with trailing escape symbols */ + "\\\\2\n", /* note that C requires one level of escaping, so the + * parser gets "…1 BS BS BS NL BS BS 2 NL", which + * it translates into "…1 BS BS SP BS BS 2" */ + + "\n[Section]\n\n" + "setting1=" /* a line above LINE_MAX length */ + x1000("ABCD") + "\n", + + "[Section]\n" + "setting1=" /* a line above LINE_MAX length, with continuation */ + x1000("ABCD") "\\\n" + "foobar", + + "[Section]\n" + "setting1=" /* a line above LINE_MAX length, with continuation */ + x1000("ABCD") "\\\n" /* and an extra trailing backslash */ + "foobar\\\n", + + "[Section]\n" + "setting1=" /* a line above the allowed limit: 9 + 1050000 + 1 */ + x1000(x1000("x") x10("abcde")) "\n", + + "[Section]\n" + "setting1=" /* many continuation lines, together above the limit */ + x1000(x1000("x") x10("abcde") "\\\n") "xxx", + + "[Section]\n" + "setting1=2\n" + "[NoWarnSection]\n" + "setting1=3\n" + "[WarnSection]\n" + "setting1=3\n" + "[X-Section]\n" + "setting1=3\n", +}; + +static void test_config_parse_one(unsigned i, const char *s) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-conf-parser.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *setting1 = NULL; + int r; + + const ConfigTableItem items[] = { + { "Section", "setting1", config_parse_string, 0, &setting1}, + {} + }; + + log_info("== %s[%u] ==", __func__, i); + + assert_se(fmkostemp_safe(name, "r+", &f) == 0); + assert_se(fwrite(s, strlen(s), 1, f) == 1); + rewind(f); + + /* + int config_parse(const char *unit, + const char *filename, + FILE *f, + const char *sections, + ConfigItemLookup lookup, + const void *table, + ConfigParseFlags flags, + void *userdata, + struct stat *ret_stat); + */ + + r = config_parse(NULL, name, f, + "Section\0" + "-NoWarnSection\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, + NULL, + NULL); + + switch (i) { + case 0 ... 
4: + assert_se(r == 1); + assert_se(streq(setting1, "1")); + break; + + case 5 ... 10: + assert_se(r == 1); + assert_se(streq(setting1, "1 2 3")); + break; + + case 11: + assert_se(r == 1); + assert_se(streq(setting1, "1\\\\ \\\\2")); + break; + + case 12: + assert_se(r == 1); + assert_se(streq(setting1, x1000("ABCD"))); + break; + + case 13 ... 14: + assert_se(r == 1); + assert_se(streq(setting1, x1000("ABCD") " foobar")); + break; + + case 15 ... 16: + assert_se(r == -ENOBUFS); + assert_se(setting1 == NULL); + break; + + case 17: + assert_se(r == 1); + assert_se(streq(setting1, "2")); + break; + } +} + +TEST(config_parse) { + for (unsigned i = 0; i < ELEMENTSOF(config_file); i++) + test_config_parse_one(i, config_file[i]); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-copy.c b/src/test/test-copy.c new file mode 100644 index 0000000..f3144f0 --- /dev/null +++ b/src/test/test-copy.c @@ -0,0 +1,532 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "copy.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include "log.h" +#include "macro.h" +#include "mkdir.h" +#include "path-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "user-util.h" +#include "xattr-util.h" + +TEST(copy_file) { + _cleanup_free_ char *buf = NULL; + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-copy_file.XXXXXX"; + _cleanup_(unlink_tempfilep) char fn_copy[] = "/tmp/test-copy_file.XXXXXX"; + size_t sz = 0; + int fd; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + close(fd); + + fd = mkostemp_safe(fn_copy); + assert_se(fd >= 0); + close(fd); + + assert_se(write_string_file(fn, "foo bar bar bar foo", WRITE_STRING_FILE_CREATE) == 0); + + assert_se(copy_file(fn, fn_copy, 0, 0644, COPY_REFLINK) == 0); + + 
assert_se(read_full_file(fn_copy, &buf, &sz) == 0); + assert_se(streq(buf, "foo bar bar bar foo\n")); + assert_se(sz == 20); +} + +static bool read_file_at_and_streq(int dir_fd, const char *path, const char *expected) { + _cleanup_free_ char *buf = NULL; + + assert_se(read_full_file_at(dir_fd, path, &buf, NULL) == 0); + return streq(buf, expected); +} + +TEST(copy_tree_replace_file) { + _cleanup_(unlink_and_freep) char *src = NULL, *dst = NULL; /* remove both temporary files again when the test ends */ + + assert_se(tempfn_random("/tmp/test-copy_file.XXXXXX", NULL, &src) >= 0); + assert_se(tempfn_random("/tmp/test-copy_file.XXXXXX", NULL, &dst) >= 0); + + assert_se(write_string_file(src, "bar bar", WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(dst, "foo foo foo", WRITE_STRING_FILE_CREATE) == 0); + + /* The destination exists: without COPY_REPLACE the copy must fail and leave it untouched, with COPY_REPLACE it must be overwritten. */ + + assert_se(copy_tree(src, dst, UID_INVALID, GID_INVALID, COPY_REFLINK, NULL, NULL) == -EEXIST); + + assert_se(read_file_at_and_streq(AT_FDCWD, dst, "foo foo foo\n")); + + assert_se(copy_tree(src, dst, UID_INVALID, GID_INVALID, COPY_REFLINK|COPY_REPLACE, NULL, NULL) == 0); + + assert_se(read_file_at_and_streq(AT_FDCWD, dst, "bar bar\n")); +} + +TEST(copy_tree_replace_dirs) { + _cleanup_(rm_rf_physical_and_freep) char *srcp = NULL, *dstp = NULL; + _cleanup_close_ int src = -EBADF, dst = -EBADF; + + /* Create the random source/destination directories */ + assert_se((src = mkdtemp_open(NULL, 0, &srcp)) >= 0); + assert_se((dst = mkdtemp_open(NULL, 0, &dstp)) >= 0); + + /* Populate some data to differentiate the files. 
*/ + assert_se(write_string_file_at(src, "foo", "src file 1", WRITE_STRING_FILE_CREATE) >= 0); + assert_se(write_string_file_at(src, "bar", "src file 2", WRITE_STRING_FILE_CREATE) == 0); + + assert_se(write_string_file_at(dst, "foo", "dest file 1", WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file_at(dst, "bar", "dest file 2", WRITE_STRING_FILE_CREATE) == 0); + + /* Copying without COPY_REPLACE should fail because the destination file already exists. */ + assert_se(copy_tree_at(src, ".", dst, ".", UID_INVALID, GID_INVALID, COPY_REFLINK, NULL, NULL) == -EEXIST); + + assert_se(read_file_at_and_streq(src, "foo", "src file 1\n")); + assert_se(read_file_at_and_streq(src, "bar", "src file 2\n")); + assert_se(read_file_at_and_streq(dst, "foo", "dest file 1\n")); + assert_se(read_file_at_and_streq(dst, "bar", "dest file 2\n")); + + assert_se(copy_tree_at(src, ".", dst, ".", UID_INVALID, GID_INVALID, COPY_REFLINK|COPY_REPLACE|COPY_MERGE, NULL, NULL) == 0); + + assert_se(read_file_at_and_streq(src, "foo", "src file 1\n")); + assert_se(read_file_at_and_streq(src, "bar", "src file 2\n")); + assert_se(read_file_at_and_streq(dst, "foo", "src file 1\n")); + assert_se(read_file_at_and_streq(dst, "bar", "src file 2\n")); +} + +TEST(copy_file_fd) { + _cleanup_(unlink_tempfilep) char in_fn[] = "/tmp/test-copy-file-fd-XXXXXX"; + _cleanup_(unlink_tempfilep) char out_fn[] = "/tmp/test-copy-file-fd-XXXXXX"; + _cleanup_close_ int in_fd = -EBADF, out_fd = -EBADF; + const char *text = "boohoo\nfoo\n\tbar\n"; + char buf[64] = {}; + + in_fd = mkostemp_safe(in_fn); + assert_se(in_fd >= 0); + out_fd = mkostemp_safe(out_fn); + assert_se(out_fd >= 0); + + assert_se(write_string_file(in_fn, text, WRITE_STRING_FILE_CREATE) == 0); + assert_se(copy_file_fd("/a/file/which/does/not/exist/i/guess", out_fd, COPY_REFLINK) < 0); + assert_se(copy_file_fd(in_fn, out_fd, COPY_REFLINK) >= 0); + assert_se(lseek(out_fd, 0, SEEK_SET) == 0); /* rewind before reading back; lseek() takes (fd, offset, whence) */ + + assert_se(read(out_fd, buf, sizeof buf) == (ssize_t) 
strlen(text)); + assert_se(streq(buf, text)); +} + +TEST(copy_tree) { + _cleanup_hashmap_free_ Hashmap *denylist = NULL; + _cleanup_free_ char *cp = NULL; + char original_dir[] = "/tmp/test-copy_tree/"; + char copy_dir[] = "/tmp/test-copy_tree-copy/"; + char **files = STRV_MAKE("file", "dir1/file", "dir1/dir2/file", "dir1/dir2/dir3/dir4/dir5/file"); + char **symlinks = STRV_MAKE("link", "file", + "link2", "dir1/file"); + char **hardlinks = STRV_MAKE("hlink", "file", + "hlink2", "dir1/file"); + const char *unixsockp, *ignorep; + struct stat st; + int xattr_worked = -1; /* xattr support is optional in temporary directories, hence use it if we can, + * but don't fail if we can't */ + + (void) rm_rf(copy_dir, REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf(original_dir, REMOVE_ROOT|REMOVE_PHYSICAL); + + STRV_FOREACH(p, files) { + _cleanup_free_ char *f = NULL, *c = NULL; + int k; + + assert_se(f = path_join(original_dir, *p)); + + assert_se(write_string_file(f, "file", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) == 0); + + assert_se(base64mem(*p, strlen(*p), &c) >= 0); + + k = setxattr(f, "user.testxattr", c, strlen(c), 0); + assert_se(xattr_worked < 0 || ((k >= 0) == !!xattr_worked)); + xattr_worked = k >= 0; + } + + STRV_FOREACH_PAIR(ll, p, symlinks) { + _cleanup_free_ char *f = NULL, *l = NULL; + + assert_se(f = path_join(original_dir, *p)); + assert_se(l = path_join(original_dir, *ll)); + + assert_se(mkdir_parents(l, 0755) >= 0); + assert_se(symlink(f, l) == 0); + } + + STRV_FOREACH_PAIR(ll, p, hardlinks) { + _cleanup_free_ char *f = NULL, *l = NULL; + + assert_se(f = path_join(original_dir, *p)); + assert_se(l = path_join(original_dir, *ll)); + + assert_se(mkdir_parents(l, 0755) >= 0); + assert_se(link(f, l) == 0); + } + + unixsockp = strjoina(original_dir, "unixsock"); + assert_se(mknod(unixsockp, S_IFSOCK|0644, 0) >= 0); + + ignorep = strjoina(original_dir, "ignore/file"); + assert_se(write_string_file(ignorep, "ignore", 
WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) == 0); + assert_se(RET_NERRNO(stat(ignorep, &st)) >= 0); + assert_se(cp = memdup(&st, sizeof(st))); + assert_se(hashmap_ensure_put(&denylist, &inode_hash_ops, cp, INT_TO_PTR(DENY_INODE)) >= 0); + TAKE_PTR(cp); + + assert_se(copy_tree(original_dir, copy_dir, UID_INVALID, GID_INVALID, COPY_REFLINK|COPY_MERGE|COPY_HARDLINKS, denylist, NULL) == 0); + + STRV_FOREACH(p, files) { + _cleanup_free_ char *buf = NULL, *f = NULL, *c = NULL; + size_t sz; + int k; + + assert_se(f = path_join(copy_dir, *p)); + + assert_se(access(f, F_OK) == 0); + assert_se(read_full_file(f, &buf, &sz) == 0); + assert_se(streq(buf, "file\n")); + + k = lgetxattr_malloc(f, "user.testxattr", &c); + assert_se(xattr_worked < 0 || ((k >= 0) == !!xattr_worked)); + + if (k >= 0) { + _cleanup_free_ char *d = NULL; + + assert_se(base64mem(*p, strlen(*p), &d) >= 0); + assert_se(streq(d, c)); + } + } + + STRV_FOREACH_PAIR(ll, p, symlinks) { + _cleanup_free_ char *target = NULL, *f = NULL, *l = NULL; + + assert_se(f = strjoin(original_dir, *p)); + assert_se(l = strjoin(copy_dir, *ll)); + + assert_se(chase(l, NULL, 0, &target, NULL) == 1); + assert_se(path_equal(f, target)); + } + + STRV_FOREACH_PAIR(ll, p, hardlinks) { + _cleanup_free_ char *f = NULL, *l = NULL; + struct stat a, b; + + assert_se(f = strjoin(copy_dir, *p)); + assert_se(l = strjoin(copy_dir, *ll)); + + assert_se(lstat(f, &a) >= 0); + assert_se(lstat(l, &b) >= 0); + + assert_se(a.st_ino == b.st_ino); + assert_se(a.st_dev == b.st_dev); + } + + unixsockp = strjoina(copy_dir, "unixsock"); + assert_se(stat(unixsockp, &st) >= 0); + assert_se(S_ISSOCK(st.st_mode)); + + assert_se(copy_tree(original_dir, copy_dir, UID_INVALID, GID_INVALID, COPY_REFLINK, denylist, NULL) < 0); + assert_se(copy_tree("/tmp/inexistent/foo/bar/fsdoi", copy_dir, UID_INVALID, GID_INVALID, COPY_REFLINK, denylist, NULL) < 0); + + ignorep = strjoina(copy_dir, "ignore/file"); + assert_se(RET_NERRNO(access(ignorep, F_OK)) == 
-ENOENT); + + (void) rm_rf(copy_dir, REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf(original_dir, REMOVE_ROOT|REMOVE_PHYSICAL); +} + +TEST(copy_bytes) { + _cleanup_close_pair_ int pipefd[2] = EBADF_PAIR; + _cleanup_close_ int infd = -EBADF; + int r, r2; + char buf[1024], buf2[1024]; + + infd = open("/usr/lib/os-release", O_RDONLY|O_CLOEXEC); + if (infd < 0) + infd = open("/etc/os-release", O_RDONLY|O_CLOEXEC); + assert_se(infd >= 0); + + assert_se(pipe2(pipefd, O_CLOEXEC) == 0); + + r = copy_bytes(infd, pipefd[1], UINT64_MAX, 0); + assert_se(r == 0); + + r = read(pipefd[0], buf, sizeof(buf)); + assert_se(r >= 0); + + assert_se(lseek(infd, 0, SEEK_SET) == 0); + r2 = read(infd, buf2, sizeof(buf2)); + assert_se(r == r2); + + assert_se(strneq(buf, buf2, r)); + + /* test copy_bytes with invalid descriptors */ + r = copy_bytes(pipefd[0], pipefd[0], 1, 0); + assert_se(r == -EBADF); + + r = copy_bytes(pipefd[1], pipefd[1], 1, 0); + assert_se(r == -EBADF); + + r = copy_bytes(pipefd[1], infd, 1, 0); + assert_se(r == -EBADF); +} + +static void test_copy_bytes_regular_file_one(const char *src, bool try_reflink, uint64_t max_bytes) { + _cleanup_(unlink_tempfilep) char fn2[] = "/tmp/test-copy-file-XXXXXX"; + _cleanup_(unlink_tempfilep) char fn3[] = "/tmp/test-copy-file-XXXXXX"; + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF, fd3 = -EBADF; + int r; + struct stat buf, buf2, buf3; + + log_info("%s try_reflink=%s max_bytes=%" PRIu64, __func__, yes_no(try_reflink), max_bytes); + + fd = open(src, O_CLOEXEC | O_PATH); + assert_se(fd >= 0); + + fd2 = mkostemp_safe(fn2); + assert_se(fd2 >= 0); + + fd3 = mkostemp_safe(fn3); + assert_se(fd3 >= 0); + + r = copy_bytes(fd, fd2, max_bytes, try_reflink ? 
COPY_REFLINK : 0); + if (max_bytes == UINT64_MAX) + assert_se(r == 0); + else + assert_se(IN_SET(r, 0, 1)); + + assert_se(fstat(fd, &buf) == 0); + assert_se(fstat(fd2, &buf2) == 0); + assert_se((uint64_t) buf2.st_size == MIN((uint64_t) buf.st_size, max_bytes)); + + if (max_bytes < UINT64_MAX) + /* Make sure the file is now higher than max_bytes */ + assert_se(ftruncate(fd2, max_bytes + 1) == 0); + + assert_se(lseek(fd2, 0, SEEK_SET) == 0); + + r = copy_bytes(fd2, fd3, max_bytes, try_reflink ? COPY_REFLINK : 0); + if (max_bytes == UINT64_MAX) + assert_se(r == 0); + else + /* We cannot distinguish between the input being exactly max_bytes + * or longer than max_bytes (without trying to read one more byte, + * or calling stat, or FION_READ, etc, and we don't want to do any + * of that). So we expect "truncation" since we know that file we + * are copying is exactly max_bytes bytes. */ + assert_se(r == 1); + + assert_se(fstat(fd3, &buf3) == 0); + + if (max_bytes == UINT64_MAX) + assert_se(buf3.st_size == buf2.st_size); + else + assert_se((uint64_t) buf3.st_size == max_bytes); +} + +TEST(copy_bytes_regular_file) { + test_copy_bytes_regular_file_one(saved_argv[0], false, UINT64_MAX); + test_copy_bytes_regular_file_one(saved_argv[0], true, UINT64_MAX); + test_copy_bytes_regular_file_one(saved_argv[0], false, 1000); /* smaller than copy buffer size */ + test_copy_bytes_regular_file_one(saved_argv[0], true, 1000); + test_copy_bytes_regular_file_one(saved_argv[0], false, 32000); /* larger than copy buffer size */ + test_copy_bytes_regular_file_one(saved_argv[0], true, 32000); +} + +TEST(copy_atomic) { + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + const char *q; + int r; + + assert_se(mkdtemp_malloc(NULL, &p) >= 0); + + q = strjoina(p, "/fstab"); + + r = copy_file_atomic("/etc/fstab", q, 0644, COPY_REFLINK); + if (r == -ENOENT || ERRNO_IS_PRIVILEGE(r)) + return; + + assert_se(copy_file_atomic("/etc/fstab", q, 0644, COPY_REFLINK) == -EEXIST); + + 
assert_se(copy_file_atomic("/etc/fstab", q, 0644, COPY_REPLACE) >= 0); +} + +TEST(copy_proc) { + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + _cleanup_free_ char *f = NULL, *a = NULL, *b = NULL; + + /* Check if copying data from /proc/ works correctly, i.e. let's see if https://lwn.net/Articles/846403/ is a problem for us */ + + assert_se(mkdtemp_malloc(NULL, &p) >= 0); + assert_se(f = path_join(p, "version")); + assert_se(copy_file("/proc/version", f, 0, MODE_INVALID, 0) >= 0); + + assert_se(read_one_line_file("/proc/version", &a) >= 0); + assert_se(read_one_line_file(f, &b) >= 0); + assert_se(streq(a, b)); + assert_se(!isempty(a)); +} + +TEST_RET(copy_holes) { + _cleanup_(unlink_tempfilep) char fn[] = "/var/tmp/test-copy-hole-fd-XXXXXX"; + _cleanup_(unlink_tempfilep) char fn_copy[] = "/var/tmp/test-copy-hole-fd-XXXXXX"; + struct stat stat; + off_t blksz; + int r, fd, fd_copy; + char *buf; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + fd_copy = mkostemp_safe(fn_copy); + assert_se(fd_copy >= 0); + + r = RET_NERRNO(fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 1)); + if (ERRNO_IS_NOT_SUPPORTED(r)) + return log_tests_skipped("Filesystem doesn't support hole punching"); + assert_se(r >= 0); + + assert_se(fstat(fd, &stat) >= 0); + blksz = stat.st_blksize; + buf = alloca_safe(blksz); + memset(buf, 1, blksz); + + /* We need to make sure to create hole in multiples of the block size, otherwise filesystems (btrfs) + * might silently truncate/extend the holes. */ + + assert_se(lseek(fd, blksz, SEEK_CUR) >= 0); + assert_se(write(fd, buf, blksz) >= 0); + assert_se(lseek(fd, 0, SEEK_END) == 2 * blksz); + /* Only ftruncate() can create holes at the end of a file. */ + assert_se(ftruncate(fd, 3 * blksz) >= 0); + assert_se(lseek(fd, 0, SEEK_SET) >= 0); + + assert_se(copy_bytes(fd, fd_copy, UINT64_MAX, COPY_HOLES) >= 0); + + /* Test that the hole starts at the beginning of the file. 
*/ + assert_se(lseek(fd_copy, 0, SEEK_HOLE) == 0); + /* Test that the hole has the expected size. */ + assert_se(lseek(fd_copy, 0, SEEK_DATA) == blksz); + assert_se(lseek(fd_copy, blksz, SEEK_HOLE) == 2 * blksz); + assert_se(lseek(fd_copy, 2 * blksz, SEEK_DATA) < 0 && errno == ENXIO); + + /* Test that the copied file has the correct size. */ + assert_se(fstat(fd_copy, &stat) >= 0); + assert_se(stat.st_size == 3 * blksz); + + close(fd); + close(fd_copy); + + return 0; +} + +TEST_RET(copy_holes_with_gaps) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF, fd_copy = -EBADF; + struct stat st; + off_t blksz; + char *buf; + int r; + + assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); + assert_se((fd = openat(tfd, "src", O_CREAT | O_RDWR, 0600)) >= 0); + assert_se((fd_copy = openat(tfd, "dst", O_CREAT | O_WRONLY, 0600)) >= 0); + + assert_se(fstat(fd, &st) >= 0); + blksz = st.st_blksize; + buf = alloca_safe(blksz); + memset(buf, 1, blksz); + + /* Create a file with: + * - hole of 1 block + * - data of 2 block + * - hole of 2 blocks + * - data of 1 block + * + * Since sparse files are based on blocks and not bytes, we need to make + * sure that the holes are aligned to the block size. 
+ */ + + r = RET_NERRNO(fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, blksz)); + if (ERRNO_IS_NOT_SUPPORTED(r)) + return log_tests_skipped("Filesystem doesn't support hole punching"); + + assert_se(lseek(fd, blksz, SEEK_CUR) >= 0); + assert_se(loop_write(fd, buf, blksz) >= 0); + assert_se(loop_write(fd, buf, blksz) >= 0); + assert_se(lseek(fd, 2 * blksz, SEEK_CUR) >= 0); + assert_se(loop_write(fd, buf, blksz) >= 0); + assert_se(lseek(fd, 0, SEEK_SET) >= 0); + assert_se(fsync(fd) >= 0); + + /* Copy to the start of the second hole */ + assert_se(copy_bytes(fd, fd_copy, 3 * blksz, COPY_HOLES) >= 0); + assert_se(fstat(fd_copy, &st) >= 0); + assert_se(st.st_size == 3 * blksz); + + /* Copy to the middle of the second hole */ + assert_se(lseek(fd, 0, SEEK_SET) >= 0); + assert_se(lseek(fd_copy, 0, SEEK_SET) >= 0); + assert_se(ftruncate(fd_copy, 0) >= 0); + assert_se(copy_bytes(fd, fd_copy, 4 * blksz, COPY_HOLES) >= 0); + assert_se(fstat(fd_copy, &st) >= 0); + assert_se(st.st_size == 4 * blksz); + + /* Copy to the end of the second hole */ + assert_se(lseek(fd, 0, SEEK_SET) >= 0); + assert_se(lseek(fd_copy, 0, SEEK_SET) >= 0); + assert_se(ftruncate(fd_copy, 0) >= 0); + assert_se(copy_bytes(fd, fd_copy, 5 * blksz, COPY_HOLES) >= 0); + assert_se(fstat(fd_copy, &st) >= 0); + assert_se(st.st_size == 5 * blksz); + + /* Copy everything */ + assert_se(lseek(fd, 0, SEEK_SET) >= 0); + assert_se(lseek(fd_copy, 0, SEEK_SET) >= 0); + assert_se(ftruncate(fd_copy, 0) >= 0); + assert_se(copy_bytes(fd, fd_copy, UINT64_MAX, COPY_HOLES) >= 0); + assert_se(fstat(fd_copy, &st) >= 0); + assert_se(st.st_size == 6 * blksz); + + return 0; +} + +TEST(copy_lock) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF; + + assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); + assert_se(mkdirat(tfd, "abc", 0755) >= 0); + assert_se(write_string_file_at(tfd, "abc/def", "abc", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se((fd = 
copy_directory_at(tfd, "abc", tfd, "qed", COPY_LOCK_BSD)) >= 0); + assert_se(faccessat(tfd, "qed", F_OK, 0) >= 0); + assert_se(faccessat(tfd, "qed/def", F_OK, 0) >= 0); + assert_se(xopenat_lock(tfd, "qed", 0, 0, 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + fd = safe_close(fd); + + assert_se((fd = copy_file_at(tfd, "abc/def", tfd, "poi", 0, 0644, COPY_LOCK_BSD))); + assert_se(read_file_at_and_streq(tfd, "poi", "abc\n")); + assert_se(xopenat_lock(tfd, "poi", 0, 0, 0, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + fd = safe_close(fd); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-core-unit.c b/src/test/test-core-unit.c new file mode 100644 index 0000000..dc108cc --- /dev/null +++ b/src/test/test-core-unit.c @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "escape.h" +#include "tests.h" +#include "unit.h" + +static void test_unit_escape_setting_one( + const char *s, + const char *expected_exec_env, + const char *expected_exec, + const char *expected_c) { + + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL, *d = NULL, + *s_esc = NULL, *a_esc = NULL, *b_esc = NULL, *c_esc = NULL, *d_esc = NULL; + const char *t; + + if (!expected_exec_env) + expected_exec_env = s; + if (!expected_exec) + expected_exec = expected_exec_env; + if (!expected_c) + expected_c = expected_exec; + assert_se(s_esc = cescape(s)); + + assert_se(t = unit_escape_setting(s, 0, &a)); + assert_se(a_esc = cescape(t)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, a_esc); + assert_se(a == NULL); + assert_se(t == s); + + assert_se(t = unit_escape_setting(s, UNIT_ESCAPE_EXEC_SYNTAX_ENV, &b)); + assert_se(b_esc = cescape(t)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, b_esc); + assert_se(b == NULL || streq(b, t)); + assert_se(streq(t, expected_exec_env)); + + assert_se(t = unit_escape_setting(s, UNIT_ESCAPE_EXEC_SYNTAX, &c)); + assert_se(c_esc = cescape(t)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, c_esc); + assert_se(c == 
NULL || streq(c, t)); + assert_se(streq(t, expected_exec)); + + assert_se(t = unit_escape_setting(s, UNIT_ESCAPE_C, &d)); + assert_se(d_esc = cescape(t)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, d_esc); + assert_se(d == NULL || streq(d, t)); + assert_se(streq(t, expected_c)); +} + +TEST(unit_escape_setting) { + test_unit_escape_setting_one("/sbin/sbash", NULL, NULL, NULL); + test_unit_escape_setting_one("$", "$$", "$", "$"); + test_unit_escape_setting_one("$$", "$$$$", "$$", "$$"); + test_unit_escape_setting_one("'", "'", NULL, "\\'"); + test_unit_escape_setting_one("\"", "\\\"", NULL, NULL); + test_unit_escape_setting_one("\t", "\\t", NULL, NULL); + test_unit_escape_setting_one(" ", NULL, NULL, NULL); + test_unit_escape_setting_one("$;'\"\t\n", "$$;'\\\"\\t\\n", "$;'\\\"\\t\\n", "$;\\'\\\"\\t\\n"); +} + +static void test_unit_concat_strv_one( + char **s, + const char *expected_none, + const char *expected_exec_env, + const char *expected_exec, + const char *expected_c) { + + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL, *d = NULL, + *s_ser = NULL, *s_esc = NULL, *a_esc = NULL, *b_esc = NULL, *c_esc = NULL, *d_esc = NULL; + + assert_se(s_ser = strv_join(s, "_")); + assert_se(s_esc = cescape(s_ser)); + if (!expected_exec_env) + expected_exec_env = expected_none; + if (!expected_exec) + expected_exec = expected_none; + if (!expected_c) + expected_c = expected_none; + + assert_se(a = unit_concat_strv(s, 0)); + assert_se(a_esc = cescape(a)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, a_esc); + assert_se(streq(a, expected_none)); + + assert_se(b = unit_concat_strv(s, UNIT_ESCAPE_EXEC_SYNTAX_ENV)); + assert_se(b_esc = cescape(b)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, b_esc); + assert_se(streq(b, expected_exec_env)); + + assert_se(c = unit_concat_strv(s, UNIT_ESCAPE_EXEC_SYNTAX)); + assert_se(c_esc = cescape(c)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, c_esc); + assert_se(streq(c, expected_exec)); + + assert_se(d = 
unit_concat_strv(s, UNIT_ESCAPE_C)); + assert_se(d_esc = cescape(d)); + log_debug("%s: [%s] → [%s]", __func__, s_esc, d_esc); + assert_se(streq(d, expected_c)); +} + +TEST(unit_concat_strv) { + test_unit_concat_strv_one(STRV_MAKE("a", "b", "c"), + "\"a\" \"b\" \"c\"", + NULL, + NULL, + NULL); + test_unit_concat_strv_one(STRV_MAKE("a", " ", "$", "$$", ""), + "\"a\" \" \" \"$\" \"$$\" \"\"", + "\"a\" \" \" \"$$\" \"$$$$\" \"\"", + NULL, + NULL); + test_unit_concat_strv_one(STRV_MAKE("\n", " ", "\t"), + "\"\n\" \" \" \"\t\"", + "\"\\n\" \" \" \"\\t\"", + "\"\\n\" \" \" \"\\t\"", + "\"\\n\" \" \" \"\\t\""); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-coredump-util.c b/src/test/test-coredump-util.c new file mode 100644 index 0000000..4e7f3b4 --- /dev/null +++ b/src/test/test-coredump-util.c @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "coredump-util.h" +#include "fileio.h" +#include "fd-util.h" +#include "format-util.h" +#include "macro.h" +#include "tests.h" + +TEST(coredump_filter_to_from_string) { + for (CoredumpFilter i = 0; i < _COREDUMP_FILTER_MAX; i++) { + const char *n; + + assert_se(n = coredump_filter_to_string(i)); + log_info("0x%x\t%s", 1u << i, n); + assert_se(coredump_filter_from_string(n) == i); + + uint64_t f; + assert_se(coredump_filter_mask_from_string(n, &f) == 0); + assert_se(f == 1u << i); + } +} + +TEST(coredump_filter_mask_from_string) { + uint64_t f; + assert_se(coredump_filter_mask_from_string("default", &f) == 0); + assert_se(f == COREDUMP_FILTER_MASK_DEFAULT); + assert_se(coredump_filter_mask_from_string("all", &f) == 0); + assert_se(f == COREDUMP_FILTER_MASK_ALL); + + assert_se(coredump_filter_mask_from_string(" default\tdefault\tdefault ", &f) == 0); + assert_se(f == COREDUMP_FILTER_MASK_DEFAULT); + + assert_se(coredump_filter_mask_from_string("defaulta", &f) < 0); + assert_se(coredump_filter_mask_from_string("default defaulta default", &f) < 0); + 
assert_se(coredump_filter_mask_from_string("default default defaulta", &f) < 0); + + assert_se(coredump_filter_mask_from_string("private-anonymous default", &f) == 0); + assert_se(f == COREDUMP_FILTER_MASK_DEFAULT); + + assert_se(coredump_filter_mask_from_string("shared-file-backed shared-dax", &f) == 0); + assert_se(f == (1 << COREDUMP_FILTER_SHARED_FILE_BACKED | + 1 << COREDUMP_FILTER_SHARED_DAX)); + + assert_se(coredump_filter_mask_from_string("private-file-backed private-dax 0xF", &f) == 0); + assert_se(f == (1 << COREDUMP_FILTER_PRIVATE_FILE_BACKED | + 1 << COREDUMP_FILTER_PRIVATE_DAX | + 0xF)); + + assert_se(coredump_filter_mask_from_string("11", &f) == 0); + assert_se(f == 0x11); + + assert_se(coredump_filter_mask_from_string("0x1101", &f) == 0); + assert_se(f == 0x1101); + + assert_se(coredump_filter_mask_from_string("0", &f) == 0); + assert_se(f == 0); + + assert_se(coredump_filter_mask_from_string("all", &f) == 0); + assert_se(FLAGS_SET(f, (1 << COREDUMP_FILTER_PRIVATE_ANONYMOUS | + 1 << COREDUMP_FILTER_SHARED_ANONYMOUS | + 1 << COREDUMP_FILTER_PRIVATE_FILE_BACKED | + 1 << COREDUMP_FILTER_SHARED_FILE_BACKED | + 1 << COREDUMP_FILTER_ELF_HEADERS | + 1 << COREDUMP_FILTER_PRIVATE_HUGE | + 1 << COREDUMP_FILTER_SHARED_HUGE | + 1 << COREDUMP_FILTER_PRIVATE_DAX | + 1 << COREDUMP_FILTER_SHARED_DAX))); +} + +static void test_parse_auxv_two( + uint8_t elf_class, + size_t offset, + const char *data, + size_t data_size, + int expect_at_secure, + uid_t expect_uid, + uid_t expect_euid, + gid_t expect_gid, + gid_t expect_egid) { + + int at_secure; + uid_t uid, euid; + gid_t gid, egid; + assert_se(parse_auxv(LOG_ERR, elf_class, data, data_size, + &at_secure, &uid, &euid, &gid, &egid) == 0); + + log_debug("[offset=%zu] at_secure=%d, uid="UID_FMT", euid="UID_FMT", gid="GID_FMT", egid="GID_FMT, + offset, + at_secure, uid, euid, gid, egid); + + assert_se(uid == expect_uid); + assert_se(euid == expect_euid); + assert_se(gid == expect_gid); + assert_se(egid == expect_egid); +} 
+ +static void test_parse_auxv_one( + uint8_t elf_class, + int dir_fd, + const char *filename, + int expect_at_secure, + uid_t expect_uid, + uid_t expect_euid, + gid_t expect_gid, + gid_t expect_egid) { + + _cleanup_free_ char *buf = NULL; /* initialized so the cleanup handler never sees an indeterminate pointer */ + const char *data; + size_t data_size; + log_info("Parsing %s…", filename); + assert_se(read_full_file_at(dir_fd, filename, &buf, &data_size) >= 0); + + for (size_t offset = 0; offset < 8; offset++) { /* feed the same bytes to the parser at start offsets 0..7 */ + _cleanup_free_ char *buf2 = NULL; + + if (offset == 0) + data = buf; + else { + assert_se(buf2 = malloc(offset + data_size)); + memcpy(buf2 + offset, buf, data_size); + data = buf2 + offset; + } + + test_parse_auxv_two(elf_class, offset, data, data_size, + expect_at_secure, expect_uid, expect_euid, expect_gid, expect_egid); + } +} + +TEST(parse_auxv) { + _cleanup_free_ char *dir = NULL; + _cleanup_close_ int dir_fd = -EBADF; + + assert_se(get_testdata_dir("auxv", &dir) >= 0); + dir_fd = open(dir, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH); + assert_se(dir_fd >= 0); + + if (__BYTE_ORDER == __LITTLE_ENDIAN) { + test_parse_auxv_one(ELFCLASS32, dir_fd, "resolved.arm32", 0, 193, 193, 193, 193); + test_parse_auxv_one(ELFCLASS64, dir_fd, "bash.riscv64", 0, 1001, 1001, 1001, 1001); + test_parse_auxv_one(ELFCLASS32, dir_fd, "sleep.i686", 0, 1000, 1000, 1000, 1000); + /* after chgrp and chmod g+s */ + test_parse_auxv_one(ELFCLASS32, dir_fd, "sleep32.i686", 1, 1000, 1000, 1000, 10); + test_parse_auxv_one(ELFCLASS64, dir_fd, "sleep64.amd64", 1, 1000, 1000, 1000, 10); + + test_parse_auxv_one(ELFCLASS64, dir_fd, "sudo.aarch64", 1, 1494200408, 0, 1494200408, 1494200408); + test_parse_auxv_one(ELFCLASS64, dir_fd, "sudo.amd64", 1, 1000, 0, 1000, 1000); + + /* Those run unprivileged, but start as root. 
*/ + test_parse_auxv_one(ELFCLASS64, dir_fd, "dbus-broker-launch.amd64", 0, 0, 0, 0, 0); + test_parse_auxv_one(ELFCLASS64, dir_fd, "dbus-broker-launch.aarch64", 0, 0, 0, 0, 0); + test_parse_auxv_one(ELFCLASS64, dir_fd, "polkitd.aarch64", 0, 0, 0, 0, 0); + } else { + test_parse_auxv_one(ELFCLASS64, dir_fd, "cat.s390x", 0, 3481, 3481, 3481, 3481); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-cpu-set-util.c b/src/test/test-cpu-set-util.c new file mode 100644 index 0000000..a0660f5 --- /dev/null +++ b/src/test/test-cpu-set-util.c @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "cpu-set-util.h" +#include "string-util.h" +#include "tests.h" +#include "macro.h" + +TEST(parse_cpu_set) { + CPUSet c = {}; + _cleanup_free_ char *str = NULL; + int cpu; + + /* Single value */ + assert_se(parse_cpu_set_full("0", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.set); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_ISSET_S(0, c.allocated, c.set)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 1); + + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "0")); + str = mfree(str); + cpu_set_reset(&c); + + /* Simple range (from CPUAffinity example) */ + assert_se(parse_cpu_set_full("1 2 4", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.set); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_ISSET_S(1, c.allocated, c.set)); + assert_se(CPU_ISSET_S(2, c.allocated, c.set)); + assert_se(CPU_ISSET_S(4, c.allocated, c.set)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 3); + + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + 
log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "1-2 4")); + str = mfree(str); + cpu_set_reset(&c); + + /* A more interesting range */ + assert_se(parse_cpu_set_full("0 1 2 3 8 9 10 11", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 8); + for (cpu = 0; cpu < 4; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + for (cpu = 8; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "0-3 8-11")); + str = mfree(str); + cpu_set_reset(&c); + + /* Quoted strings */ + assert_se(parse_cpu_set_full("8 '9' 10 \"11\"", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 4); + for (cpu = 8; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "8-11")); + str = mfree(str); + cpu_set_reset(&c); + + /* Use commas as separators */ + assert_se(parse_cpu_set_full("0,1,2,3 8,9,10,11", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 8); + for (cpu = 0; cpu < 4; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + for (cpu = 8; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + cpu_set_reset(&c); + + /* Commas with spaces (and 
trailing comma, space) */ + assert_se(parse_cpu_set_full("0, 1, 2, 3, 4, 5, 6, 7, 63, ", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 9); + for (cpu = 0; cpu < 8; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + + assert_se(CPU_ISSET_S(63, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "0-7 63")); + str = mfree(str); + cpu_set_reset(&c); + + /* Ranges */ + assert_se(parse_cpu_set_full("0-3,8-11", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 8); + for (cpu = 0; cpu < 4; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + for (cpu = 8; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + cpu_set_reset(&c); + + /* Ranges with trailing comma, space */ + assert_se(parse_cpu_set_full("0-3 8-11, ", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 8); + for (cpu = 0; cpu < 4; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + for (cpu = 8; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "0-3 8-11")); + str = mfree(str); + cpu_set_reset(&c); + + /* Negative range (returns empty cpu_set) */ + assert_se(parse_cpu_set_full("3-0", &c, true, NULL, "fake", 1, 
"CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 0); + cpu_set_reset(&c); + + /* Overlapping ranges */ + assert_se(parse_cpu_set_full("0-7 4-11", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 12); + for (cpu = 0; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "0-11")); + str = mfree(str); + cpu_set_reset(&c); + + /* Mix ranges and individual CPUs */ + assert_se(parse_cpu_set_full("0,2 4-11", &c, true, NULL, "fake", 1, "CPUAffinity") >= 0); + assert_se(c.allocated >= DIV_ROUND_UP(sizeof(__cpu_mask), 8)); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 10); + assert_se(CPU_ISSET_S(0, c.allocated, c.set)); + assert_se(CPU_ISSET_S(2, c.allocated, c.set)); + for (cpu = 4; cpu < 12; cpu++) + assert_se(CPU_ISSET_S(cpu, c.allocated, c.set)); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "0 2 4-11")); + str = mfree(str); + cpu_set_reset(&c); + + /* Garbage */ + assert_se(parse_cpu_set_full("0 1 2 3 garbage", &c, true, NULL, "fake", 1, "CPUAffinity") == -EINVAL); + assert_se(!c.set); + assert_se(c.allocated == 0); + + /* Range with garbage */ + assert_se(parse_cpu_set_full("0-3 8-garbage", &c, true, NULL, "fake", 1, "CPUAffinity") == -EINVAL); + assert_se(!c.set); + assert_se(c.allocated == 0); + + /* Empty string */ + assert_se(parse_cpu_set_full("", &c, true, NULL, "fake", 1, "CPUAffinity") == 0); + assert_se(!c.set); /* empty string returns NULL */ + 
assert_se(c.allocated == 0); + + /* Runaway quoted string */ + assert_se(parse_cpu_set_full("0 1 2 3 \"4 5 6 7 ", &c, true, NULL, "fake", 1, "CPUAffinity") == -EINVAL); + assert_se(!c.set); + assert_se(c.allocated == 0); + + /* Maximum allocation */ + assert_se(parse_cpu_set_full("8000-8191", &c, true, NULL, "fake", 1, "CPUAffinity") == 0); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 192); + assert_se(str = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", str); + str = mfree(str); + assert_se(str = cpu_set_to_range_string(&c)); + log_info("cpu_set_to_range_string: %s", str); + assert_se(streq(str, "8000-8191")); + str = mfree(str); + cpu_set_reset(&c); +} + +TEST(parse_cpu_set_extend) { + CPUSet c = {}; + _cleanup_free_ char *s1 = NULL, *s2 = NULL; + + assert_se(parse_cpu_set_extend("1 3", &c, true, NULL, "fake", 1, "CPUAffinity") == 1); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 2); + assert_se(s1 = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", s1); + + assert_se(parse_cpu_set_extend("4", &c, true, NULL, "fake", 1, "CPUAffinity") == 1); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 3); + assert_se(s2 = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", s2); + + assert_se(parse_cpu_set_extend("", &c, true, NULL, "fake", 1, "CPUAffinity") == 0); + assert_se(!c.set); + assert_se(c.allocated == 0); + log_info("cpu_set_to_string: (null)"); +} + +TEST(cpu_set_to_from_dbus) { + _cleanup_(cpu_set_reset) CPUSet c = {}, c2 = {}; + _cleanup_free_ char *s = NULL; + + assert_se(parse_cpu_set_extend("1 3 8 100-200", &c, true, NULL, "fake", 1, "CPUAffinity") == 1); + assert_se(s = cpu_set_to_string(&c)); + log_info("cpu_set_to_string: %s", s); + assert_se(CPU_COUNT_S(c.allocated, c.set) == 104); + + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + static const char expected[32] = + "\x0A\x01\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\xF0\xFF\xFF\xFF\xFF\xFF\xFF\xFF" + "\xFF\xFF\xFF\xFF\xFF\x01"; + + 
assert_se(cpu_set_to_dbus(&c, &array, &allocated) == 0); + assert_se(array); + assert_se(allocated == c.allocated); + + assert_se(allocated <= sizeof expected); + assert_se(allocated >= DIV_ROUND_UP(201u, 8u)); /* We need at least 201 bits for our mask */ + assert_se(memcmp(array, expected, allocated) == 0); + + assert_se(cpu_set_from_dbus(array, allocated, &c2) == 0); + assert_se(c2.set); + assert_se(c2.allocated == c.allocated); + assert_se(memcmp(c.set, c2.set, c.allocated) == 0); +} + +TEST(cpus_in_affinity_mask) { + int r; + + r = cpus_in_affinity_mask(); + assert_se(r > 0); + log_info("cpus_in_affinity_mask: %d", r); +} + +TEST(print_cpu_alloc_size) { + log_info("CPU_ALLOC_SIZE(1) = %zu", CPU_ALLOC_SIZE(1)); + log_info("CPU_ALLOC_SIZE(9) = %zu", CPU_ALLOC_SIZE(9)); + log_info("CPU_ALLOC_SIZE(64) = %zu", CPU_ALLOC_SIZE(64)); + log_info("CPU_ALLOC_SIZE(65) = %zu", CPU_ALLOC_SIZE(65)); + log_info("CPU_ALLOC_SIZE(1024) = %zu", CPU_ALLOC_SIZE(1024)); + log_info("CPU_ALLOC_SIZE(1025) = %zu", CPU_ALLOC_SIZE(1025)); + log_info("CPU_ALLOC_SIZE(8191) = %zu", CPU_ALLOC_SIZE(8191)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-creds.c b/src/test/test-creds.c new file mode 100644 index 0000000..acb198c --- /dev/null +++ b/src/test/test-creds.c @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "creds-util.h" +#include "fileio.h" +#include "path-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(read_credential_strings) { + _cleanup_free_ char *x = NULL, *y = NULL, *saved = NULL, *p = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_fclose_ FILE *f = NULL; + + const char *e = getenv("CREDENTIALS_DIRECTORY"); + if (e) + assert_se(saved = strdup(e)); + + assert_se(read_credential_strings_many("foo", &x, "bar", &y) == 0); + assert_se(x == NULL); + assert_se(y == NULL); + + assert_se(mkdtemp_malloc(NULL, &tmp) >= 0); + + assert_se(setenv("CREDENTIALS_DIRECTORY", tmp, /* 
override= */ true) >= 0); + + assert_se(read_credential_strings_many("foo", &x, "bar", &y) == 0); + assert_se(x == NULL); + assert_se(y == NULL); + + assert_se(p = path_join(tmp, "bar")); + assert_se(write_string_file(p, "piff", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_AVOID_NEWLINE) >= 0); + + assert_se(read_credential_strings_many("foo", &x, "bar", &y) == 0); + assert_se(x == NULL); + assert_se(streq(y, "piff")); + + assert_se(write_string_file(p, "paff", WRITE_STRING_FILE_TRUNCATE|WRITE_STRING_FILE_AVOID_NEWLINE) >= 0); + + assert_se(read_credential_strings_many("foo", &x, "bar", &y) == 0); + assert_se(x == NULL); + assert_se(streq(y, "piff")); + + p = mfree(p); + assert_se(p = path_join(tmp, "foo")); + assert_se(write_string_file(p, "knurz", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_AVOID_NEWLINE) >= 0); + + assert_se(read_credential_strings_many("foo", &x, "bar", &y) >= 0); + assert_se(streq(x, "knurz")); + assert_se(streq(y, "piff")); + + y = mfree(y); + + assert_se(read_credential_strings_many("foo", &x, "bar", &y) >= 0); + assert_se(streq(x, "knurz")); + assert_se(streq(y, "paff")); + + p = mfree(p); + assert_se(p = path_join(tmp, "bazz")); + assert_se(f = fopen(p, "w")); + assert_se(fwrite("x\0y", 1, 3, f) == 3); /* embedded NUL byte should result in EBADMSG when reading back with read_credential_strings_many() */ + f = safe_fclose(f); + + assert_se(read_credential_strings_many("bazz", &x, "foo", &y) == -EBADMSG); + assert_se(streq(x, "knurz")); + assert_se(streq(y, "paff")); + + if (saved) + assert_se(setenv("CREDENTIALS_DIRECTORY", saved, /* override= */ 1) >= 0); + else + assert_se(unsetenv("CREDENTIALS_DIRECTORY") >= 0); +} + +TEST(credential_name_valid) { + char buf[NAME_MAX+2]; + + assert_se(!credential_name_valid(NULL)); + assert_se(!credential_name_valid("")); + assert_se(!credential_name_valid(".")); + assert_se(!credential_name_valid("..")); + assert_se(!credential_name_valid("foo/bar")); + assert_se(credential_name_valid("foo")); + + 
memset(buf, 'x', sizeof(buf)-1); + buf[sizeof(buf)-1] = 0; + assert_se(!credential_name_valid(buf)); + + buf[sizeof(buf)-2] = 0; + assert_se(credential_name_valid(buf)); +} + +TEST(credential_glob_valid) { + char buf[NAME_MAX+2]; + + assert_se(!credential_glob_valid(NULL)); + assert_se(!credential_glob_valid("")); + assert_se(!credential_glob_valid(".")); + assert_se(!credential_glob_valid("..")); + assert_se(!credential_glob_valid("foo/bar")); + assert_se(credential_glob_valid("foo")); + assert_se(credential_glob_valid("foo*")); + assert_se(credential_glob_valid("x*")); + assert_se(credential_glob_valid("*")); + assert_se(!credential_glob_valid("?")); + assert_se(!credential_glob_valid("*a")); + assert_se(!credential_glob_valid("a?")); + assert_se(!credential_glob_valid("a[abc]")); + assert_se(!credential_glob_valid("a[abc]")); + + memset(buf, 'x', sizeof(buf)-1); + buf[sizeof(buf)-1] = 0; + assert_se(!credential_glob_valid(buf)); + + buf[sizeof(buf)-2] = 0; + assert_se(credential_glob_valid(buf)); + + buf[sizeof(buf)-2] = '*'; + assert_se(credential_glob_valid(buf)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-cryptolib.c b/src/test/test-cryptolib.c new file mode 100644 index 0000000..6202a5d --- /dev/null +++ b/src/test/test-cryptolib.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "gcrypt-util.h" +#include "macro.h" +#include "openssl-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(string_hashsum) { + _cleanup_free_ char *out1 = NULL, *out2 = NULL, *out3 = NULL, *out4 = NULL; + + assert_se(string_hashsum("asdf", 4, + OPENSSL_OR_GCRYPT("SHA224", GCRY_MD_SHA224), + &out1) == 0); + /* echo -n 'asdf' | sha224sum - */ + assert_se(streq(out1, "7872a74bcbf298a1e77d507cd95d4f8d96131cbbd4cdfc571e776c8a")); + + assert_se(string_hashsum("asdf", 4, + OPENSSL_OR_GCRYPT("SHA256", GCRY_MD_SHA256), + &out2) == 0); + /* echo -n 'asdf' | sha256sum - */ + assert_se(streq(out2, 
"f0e4c2f76c58916ec258f246851bea091d14d4247a2fc3e18694461b1816e13b")); + + assert_se(string_hashsum("", 0, + OPENSSL_OR_GCRYPT("SHA224", GCRY_MD_SHA224), + &out3) == 0); + /* echo -n '' | sha224sum - */ + assert_se(streq(out3, "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f")); + + assert_se(string_hashsum("", 0, + OPENSSL_OR_GCRYPT("SHA256", GCRY_MD_SHA256), + &out4) == 0); + /* echo -n '' | sha256sum - */ + assert_se(streq(out4, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-daemon.c b/src/test/test-daemon.c new file mode 100644 index 0000000..b880521 --- /dev/null +++ b/src/test/test-daemon.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-daemon.h" + +#include "parse-util.h" +#include "strv.h" +#include "time-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_strv_free_ char **l = NULL; + int n, i; + usec_t duration = USEC_PER_SEC / 10; + + test_setup_logging(LOG_DEBUG); + + if (argc >= 2) { + unsigned x; + + assert_se(safe_atou(argv[1], &x) >= 0); + duration = x * USEC_PER_SEC; + } + + n = sd_listen_fds_with_names(false, &l); + if (n < 0) { + log_error_errno(n, "Failed to get listening fds: %m"); + return EXIT_FAILURE; + } + + for (i = 0; i < n; i++) + log_info("fd=%i name=%s", SD_LISTEN_FDS_START + i, l[i]); + + sd_notify(0, + "STATUS=Starting up"); + usleep_safe(duration); + + sd_notify(0, + "STATUS=Running\n" + "READY=1"); + usleep_safe(duration); + + sd_notify(0, + "STATUS=Reloading\n" + "RELOADING=1"); + usleep_safe(duration); + + sd_notify(0, + "STATUS=Running\n" + "READY=1"); + usleep_safe(duration); + + sd_notify(0, + "STATUS=Quitting\n" + "STOPPING=1"); + usleep_safe(duration); + + return EXIT_SUCCESS; +} diff --git a/src/test/test-data-fd-util.c b/src/test/test-data-fd-util.c new file mode 100644 index 0000000..aa68132 --- /dev/null +++ b/src/test/test-data-fd-util.c @@ -0,0 +1,148 @@ 
+/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "data-fd-util.h" +#include "fd-util.h" +#include "memory-util.h" +#include "process-util.h" +#include "tests.h" +#include "random-util.h" + +static void test_acquire_data_fd_one(unsigned flags) { + char wbuffer[196*1024 - 7]; + char rbuffer[sizeof(wbuffer)]; + int fd; + + fd = acquire_data_fd("foo", 3, flags); + assert_se(fd >= 0); + + zero(rbuffer); + assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 3); + assert_se(streq(rbuffer, "foo")); + + fd = safe_close(fd); + + fd = acquire_data_fd("", 0, flags); + assert_se(fd >= 0); + + zero(rbuffer); + assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 0); + assert_se(streq(rbuffer, "")); + + fd = safe_close(fd); + + random_bytes(wbuffer, sizeof(wbuffer)); + + fd = acquire_data_fd(wbuffer, sizeof(wbuffer), flags); + assert_se(fd >= 0); + + zero(rbuffer); + assert_se(read(fd, rbuffer, sizeof(rbuffer)) == sizeof(rbuffer)); + assert_se(memcmp(rbuffer, wbuffer, sizeof(rbuffer)) == 0); + + fd = safe_close(fd); +} + +TEST(acquire_data_fd) { + test_acquire_data_fd_one(0); + test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL); + test_acquire_data_fd_one(ACQUIRE_NO_MEMFD); + test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD); + test_acquire_data_fd_one(ACQUIRE_NO_PIPE); + test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_PIPE); + test_acquire_data_fd_one(ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE); + test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE); + test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE|ACQUIRE_NO_TMPFILE); +} + +static void assert_equal_fd(int fd1, int fd2) { + for (;;) { + uint8_t a[4096], b[4096]; + ssize_t x, y; + + x = read(fd1, a, sizeof(a)); + assert_se(x >= 0); + + y = read(fd2, b, sizeof(b)); + assert_se(y >= 0); + + assert_se(x == y); + + if (x == 0) + break; + + assert_se(memcmp(a, b, x) == 0); + } +} + +TEST(copy_data_fd) { + _cleanup_close_ int 
fd1 = -EBADF, fd2 = -EBADF; + _cleanup_close_pair_ int sfd[2] = EBADF_PAIR; + _cleanup_(sigkill_waitp) pid_t pid = -1; + int r; + + fd1 = open("/etc/fstab", O_RDONLY|O_CLOEXEC); + if (fd1 >= 0) { + + fd2 = copy_data_fd(fd1); + assert_se(fd2 >= 0); + + assert_se(lseek(fd1, 0, SEEK_SET) == 0); + assert_equal_fd(fd1, fd2); + } + + fd1 = safe_close(fd1); + fd2 = safe_close(fd2); + + fd1 = acquire_data_fd("hallo", 6, 0); + assert_se(fd1 >= 0); + + fd2 = copy_data_fd(fd1); + assert_se(fd2 >= 0); + + safe_close(fd1); + fd1 = acquire_data_fd("hallo", 6, 0); + assert_se(fd1 >= 0); + + assert_equal_fd(fd1, fd2); + + fd1 = safe_close(fd1); + fd2 = safe_close(fd2); + + assert_se(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, sfd) >= 0); + + r = safe_fork("(sd-pipe)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_LOG, &pid); + assert_se(r >= 0); + + if (r == 0) { + /* child */ + + sfd[0] = safe_close(sfd[0]); + + for (uint64_t i = 0; i < 1536*1024 / sizeof(uint64_t); i++) + assert_se(write(sfd[1], &i, sizeof(i)) == sizeof(i)); + + sfd[1] = safe_close(sfd[1]); + + _exit(EXIT_SUCCESS); + } + + sfd[1] = safe_close(sfd[1]); + + fd2 = copy_data_fd(sfd[0]); + assert_se(fd2 >= 0); + + uint64_t j; + for (uint64_t i = 0; i < 1536*1024 / sizeof(uint64_t); i++) { + assert_se(read(fd2, &j, sizeof(j)) == sizeof(j)); + assert_se(i == j); + } + + assert_se(read(fd2, &j, sizeof(j)) == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-date.c b/src/test/test-date.c new file mode 100644 index 0000000..162ac34 --- /dev/null +++ b/src/test/test-date.c @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "string-util.h" +#include "tests.h" +#include "time-util.h" + +static void test_should_pass(const char *p) { + usec_t t, q; + char buf[FORMAT_TIMESTAMP_MAX], buf_relative[FORMAT_TIMESTAMP_RELATIVE_MAX]; + + log_info("Test: %s", p); + assert_se(parse_timestamp(p, &t) >= 0); + assert_se(format_timestamp_style(buf, sizeof(buf), t, 
TIMESTAMP_US)); + log_info("\"%s\" → \"%s\"", p, buf); + + assert_se(parse_timestamp(buf, &q) >= 0); + if (q != t) + log_error("round-trip failed: \"%s\" → \"%s\"", + buf, FORMAT_TIMESTAMP_STYLE(q, TIMESTAMP_US)); + assert_se(q == t); + + assert_se(format_timestamp_relative(buf_relative, sizeof(buf_relative), t)); + log_info("%s", strna(buf_relative)); +} + +static void test_should_parse(const char *p) { /* p must parse; no round-trip check is done */ + usec_t t; + + log_info("Test: %s", p); + assert_se(parse_timestamp(p, &t) >= 0); + log_info("\"%s\" → \"@%" PRI_USEC "\"", p, t); +} + +static void test_should_fail(const char *p) { /* p must be rejected by parse_timestamp() */ + usec_t t; + int r; + + log_info("Test: %s", p); + r = parse_timestamp(p, &t); + if (r >= 0) + log_info("\"%s\" → \"@%" PRI_USEC "\" (unexpected)", p, t); + else + log_info("parse_timestamp() returns %d (expected)", r); + assert_se(r < 0); +} + +static void test_one(const char *p) { /* p must pass both as given and with " UTC" appended */ + _cleanup_free_ char *with_utc = NULL; + + assert_se(with_utc = strjoin(p, " UTC")); /* fail right here if the string could not be allocated, instead of handing NULL to the checks below */ + test_should_pass(p); + test_should_pass(with_utc); +} + +static void test_one_noutc(const char *p) { /* p must pass as given, but must be rejected once " UTC" is appended */ + _cleanup_free_ char *with_utc = NULL; + + assert_se(with_utc = strjoin(p, " UTC")); /* fail right here if the string could not be allocated, instead of handing NULL to the checks below */ + test_should_pass(p); + test_should_fail(with_utc); +} + +int main(int argc, char *argv[]) { + /* Tests have hard-coded results that do not expect a specific timezone to be set by the caller */ + assert_se(unsetenv("TZ") >= 0); + + test_setup_logging(LOG_DEBUG); + + test_one("17:41"); + test_one("18:42:44"); + test_one("18:42:44.0"); + test_one("18:42:44.999999999999"); + test_one("12-10-02 12:13:14"); + test_one("12-10-2 12:13:14"); + test_one("12-10-03 12:13"); + test_one("2012-12-30 18:42"); + test_one("2012-10-02"); + test_one("Mar 12 12:01:01"); + test_one("Mar 12 12:01:01.687197"); + test_one("Tue 2012-10-02"); + test_one("yesterday"); + test_one("today"); + test_one("tomorrow"); + test_one_noutc("16:20 UTC"); + test_one_noutc("16:20 Asia/Seoul"); + test_one_noutc("tomorrow Asia/Seoul"); + test_one_noutc("2012-12-30 18:42 Asia/Seoul"); + 
test_one_noutc("now"); + test_one_noutc("+2d"); + test_one_noutc("+2y 4d"); + test_one_noutc("5months ago"); + test_one_noutc("@1395716396"); + test_should_parse("1970-1-1 UTC"); + test_should_pass("1970-1-1 00:00:01 UTC"); + test_should_fail("1969-12-31 UTC"); + test_should_fail("-1000y"); + test_should_fail("today UTC UTC"); + test_should_fail("now Asia/Seoul"); + test_should_fail("+2d Asia/Seoul"); + test_should_fail("@1395716396 Asia/Seoul"); +#if SIZEOF_TIME_T == 8 + test_should_pass("9999-12-30 23:59:59 UTC"); + test_should_fail("9999-12-31 00:00:00 UTC"); + test_should_fail("10000-01-01 00:00:00 UTC"); +#elif SIZEOF_TIME_T == 4 + test_should_pass("2038-01-18 03:14:07 UTC"); + test_should_fail("2038-01-18 03:14:08 UTC"); +#endif + + return 0; +} diff --git a/src/test/test-dev-setup.c b/src/test/test-dev-setup.c new file mode 100644 index 0000000..b75576a --- /dev/null +++ b/src/test/test-dev-setup.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "capability-util.h" +#include "dev-setup.h" +#include "fs-util.h" +#include "mkdir.h" +#include "path-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "tmpfile-util.h" + +int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + const char *f; + struct stat st; + + test_setup_logging(LOG_DEBUG); + + if (have_effective_cap(CAP_DAC_OVERRIDE) <= 0) + return log_tests_skipped("missing capability (CAP_DAC_OVERRIDE)"); + + assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0); + + f = prefix_roota(p, "/run/systemd"); + assert_se(mkdir_p(f, 0755) >= 0); + + assert_se(make_inaccessible_nodes(f, 1, 1) >= 0); + + f = prefix_roota(p, "/run/systemd/inaccessible/reg"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISREG(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/dir"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = 
prefix_roota(p, "/run/systemd/inaccessible/fifo"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISFIFO(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/sock"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISSOCK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/chr"); + if (stat(f, &st) < 0) + assert_se(errno == ENOENT); + else { + assert_se(S_ISCHR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + } + + f = prefix_roota(p, "/run/systemd/inaccessible/blk"); + if (stat(f, &st) < 0) + assert_se(errno == ENOENT); + else { + assert_se(S_ISBLK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + } + + return EXIT_SUCCESS; +} diff --git a/src/test/test-device-nodes.c b/src/test/test-device-nodes.c new file mode 100644 index 0000000..36fa2ce --- /dev/null +++ b/src/test/test-device-nodes.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "device-nodes.h" +#include "string-util.h" +#include "tests.h" + +/* helpers for test_encode_devnode_name */ +static char *do_encode_string(const char *in) { + size_t out_len = strlen(in) * 4 + 1; + char *out = malloc(out_len); + + assert_se(out); + assert_se(encode_devnode_name(in, out, out_len) >= 0); + puts(out); + + return out; +} + +static bool expect_encoded_as(const char *in, const char *expected) { + _cleanup_free_ char *encoded = do_encode_string(in); + return streq(encoded, expected); +} + +TEST(encode_devnode_name) { + assert_se(expect_encoded_as("systemd sucks", "systemd\\x20sucks")); + assert_se(expect_encoded_as("pinkiepie", "pinkiepie")); + assert_se(expect_encoded_as("valíd\\ųtf8", "valíd\\x5cųtf8")); + assert_se(expect_encoded_as("s/ash/ng", "s\\x2fash\\x2fng")); + assert_se(expect_encoded_as("/", "\\x2f")); + assert_se(expect_encoded_as("!", "\\x21")); + assert_se(expect_encoded_as("QEMU ", 
"QEMU\\x20\\x20\\x20\\x20")); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-devnum-util.c b/src/test/test-devnum-util.c new file mode 100644 index 0000000..2068e35 --- /dev/null +++ b/src/test/test-devnum-util.c @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "devnum-util.h" +#include "path-util.h" +#include "stat-util.h" +#include "tests.h" + +TEST(parse_devnum) { + dev_t dev; + + assert_se(parse_devnum("", &dev) == -EINVAL); + assert_se(parse_devnum("junk", &dev) == -EINVAL); + assert_se(parse_devnum("0", &dev) == -EINVAL); + assert_se(parse_devnum("5", &dev) == -EINVAL); + assert_se(parse_devnum("5:", &dev) == -EINVAL); + assert_se(parse_devnum(":5", &dev) == -EINVAL); + assert_se(parse_devnum("-1:-1", &dev) == -EINVAL); +#if SIZEOF_DEV_T < 8 + assert_se(parse_devnum("4294967295:4294967295", &dev) == -EINVAL); +#endif + assert_se(parse_devnum("8:11", &dev) >= 0 && major(dev) == 8 && minor(dev) == 11); + assert_se(parse_devnum("0:0", &dev) >= 0 && major(dev) == 0 && minor(dev) == 0); +} + +TEST(device_major_minor_valid) { + /* on glibc dev_t is 64-bit, even though in the kernel it is only 32-bit */ + assert_cc(sizeof(dev_t) == sizeof(uint64_t)); + + assert_se(DEVICE_MAJOR_VALID(0U)); + assert_se(DEVICE_MINOR_VALID(0U)); + + assert_se(DEVICE_MAJOR_VALID(1U)); + assert_se(DEVICE_MINOR_VALID(1U)); + + assert_se(!DEVICE_MAJOR_VALID(-1U)); + assert_se(!DEVICE_MINOR_VALID(-1U)); + + assert_se(DEVICE_MAJOR_VALID(1U << 10)); + assert_se(DEVICE_MINOR_VALID(1U << 10)); + + assert_se(DEVICE_MAJOR_VALID((1U << 12) - 1)); + assert_se(DEVICE_MINOR_VALID((1U << 20) - 1)); + + assert_se(!DEVICE_MAJOR_VALID((1U << 12))); + assert_se(!DEVICE_MINOR_VALID((1U << 20))); + + assert_se(!DEVICE_MAJOR_VALID(1U << 25)); + assert_se(!DEVICE_MINOR_VALID(1U << 25)); + + assert_se(!DEVICE_MAJOR_VALID(UINT32_MAX)); + assert_se(!DEVICE_MINOR_VALID(UINT32_MAX)); + + assert_se(!DEVICE_MAJOR_VALID(UINT64_MAX)); + 
assert_se(!DEVICE_MINOR_VALID(UINT64_MAX)); + + assert_se(DEVICE_MAJOR_VALID(major(0))); + assert_se(DEVICE_MINOR_VALID(minor(0))); +} + +static void test_device_path_make_canonical_one(const char *path) { + _cleanup_free_ char *resolved = NULL, *raw = NULL; + struct stat st; + dev_t devno; + mode_t mode; + int r; + + log_debug("> %s", path); + + if (stat(path, &st) < 0) { + assert_se(errno == ENOENT); + log_notice("Path %s not found, skipping test", path); + return; + } + + r = device_path_make_canonical(st.st_mode, st.st_rdev, &resolved); + if (r == -ENOENT) { + /* maybe /dev/char/x:y and /dev/block/x:y are missing in this test environment, because we + * run in a container or so? */ + log_notice("Device %s cannot be resolved, skipping test", path); + return; + } + + assert_se(r >= 0); + assert_se(path_equal(path, resolved)); + + assert_se(device_path_make_major_minor(st.st_mode, st.st_rdev, &raw) >= 0); + assert_se(device_path_parse_major_minor(raw, &mode, &devno) >= 0); + + assert_se(st.st_rdev == devno); + assert_se((st.st_mode & S_IFMT) == (mode & S_IFMT)); +} + +TEST(device_path_make_canonical) { + test_device_path_make_canonical_one("/dev/null"); + test_device_path_make_canonical_one("/dev/zero"); + test_device_path_make_canonical_one("/dev/full"); + test_device_path_make_canonical_one("/dev/random"); + test_device_path_make_canonical_one("/dev/urandom"); + test_device_path_make_canonical_one("/dev/tty"); + + if (is_device_node("/run/systemd/inaccessible/blk") > 0) { + test_device_path_make_canonical_one("/run/systemd/inaccessible/chr"); + test_device_path_make_canonical_one("/run/systemd/inaccessible/blk"); + } +} + +static void test_devnum_format_str_one(dev_t devnum, const char *s) { + dev_t x; + + assert_se(streq(FORMAT_DEVNUM(devnum), s)); + assert_se(parse_devnum(s, &x) >= 0); + assert_se(x == devnum); +} + +TEST(devnum_format_str) { + test_devnum_format_str_one(makedev(0, 0), "0:0"); + test_devnum_format_str_one(makedev(1, 2), "1:2"); + 
test_devnum_format_str_one(makedev(99, 100), "99:100"); + test_devnum_format_str_one(makedev(4095, 1048575), "4095:1048575"); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-dlopen-so.c b/src/test/test-dlopen-so.c new file mode 100644 index 0000000..e98b8da --- /dev/null +++ b/src/test/test-dlopen-so.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "bpf-dlopen.h" +#include "cryptsetup-util.h" +#include "elf-util.h" +#include "idn-util.h" +#include "libfido2-util.h" +#include "macro.h" +#include "main-func.h" +#include "password-quality-util-passwdqc.h" +#include "password-quality-util-pwquality.h" +#include "pcre2-util.h" +#include "pkcs11-util.h" +#include "qrcode-util.h" +#include "tests.h" +#include "tpm2-util.h" + +static int run(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + /* Try to load each of our weak library dependencies once. This is supposed to help finding cases + * where .so versions change and distributions update, but systemd doesn't have the new so names + * around yet. 
*/ + +#if HAVE_LIBIDN2 || HAVE_LIBIDN + assert_se(dlopen_idn() >= 0); +#endif + +#if HAVE_LIBCRYPTSETUP + assert_se(dlopen_cryptsetup() >= 0); +#endif + +#if HAVE_PASSWDQC + assert_se(dlopen_passwdqc() >= 0); +#endif + +#if HAVE_PWQUALITY + assert_se(dlopen_pwquality() >= 0); +#endif + +#if HAVE_QRENCODE + assert_se(dlopen_qrencode() >= 0); +#endif + +#if HAVE_TPM2 + assert_se(dlopen_tpm2() >= 0); +#endif + +#if HAVE_LIBFIDO2 + assert_se(dlopen_libfido2() >= 0); +#endif + +#if HAVE_LIBBPF + assert_se(dlopen_bpf() >= 0); +#endif + +#if HAVE_ELFUTILS + assert_se(dlopen_dw() >= 0); + assert_se(dlopen_elf() >= 0); +#endif + +#if HAVE_PCRE2 + assert_se(dlopen_pcre2() >= 0); +#endif + +#if HAVE_P11KIT + assert_se(dlopen_p11kit() >= 0); +#endif + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-dlopen.c b/src/test/test-dlopen.c new file mode 100644 index 0000000..9c31537 --- /dev/null +++ b/src/test/test-dlopen.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <dlfcn.h> +#include <stdlib.h> + +#include "macro.h" + +int main(int argc, char **argv) { /* dlopen()s every path given in argv[1..], then dlclose()s them in reverse order */ + void *handles[argc > 1 ? argc - 1 : 1]; /* one slot per path; clamped to 1 since a variable-length array must have a positive size */ + int i; + + for (i = 0; i < argc - 1; i++) + assert_se(handles[i] = dlopen(argv[i + 1], RTLD_NOW)); + + for (i--; i >= 0; i--) /* i is one past the last opened handle here; with no paths it drops to -1 and the loop is skipped */ + assert_se(dlclose(handles[i]) == 0); + + return EXIT_SUCCESS; +} diff --git a/src/test/test-dns-domain.c b/src/test/test-dns-domain.c new file mode 100644 index 0000000..6c107e2 --- /dev/null +++ b/src/test/test-dns-domain.c @@ -0,0 +1,753 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "dns-domain.h" +#include "macro.h" +#include "string-util.h" +#include "tests.h" + +static void test_dns_label_unescape_one(const char *what, const char *expect, size_t buffer_sz, int ret, int ret_ldh) { + char buffer[buffer_sz]; + int r; + const char *w = what; + + log_info("%s, %s, %zu, →%d/%d", what, expect, buffer_sz, ret, ret_ldh); + + r = dns_label_unescape(&w, buffer, buffer_sz, 0); + assert_se(r == ret); + if 
(r >= 0) + assert_se(streq(buffer, expect)); + + w = what; + r = dns_label_unescape(&w, buffer, buffer_sz, DNS_LABEL_LDH); + assert_se(r == ret_ldh); + if (r >= 0) + assert_se(streq(buffer, expect)); + + w = what; + r = dns_label_unescape(&w, buffer, buffer_sz, DNS_LABEL_NO_ESCAPES); + const int ret_noe = strchr(what, '\\') ? -EINVAL : ret; + assert_se(r == ret_noe); + if (r >= 0) + assert_se(streq(buffer, expect)); +} + +TEST(dns_label_unescape) { + test_dns_label_unescape_one("hallo", "hallo", 6, 5, 5); + test_dns_label_unescape_one("hallo", "hallo", 4, -ENOBUFS, -ENOBUFS); + test_dns_label_unescape_one("", "", 10, 0, 0); + test_dns_label_unescape_one("hallo\\.foobar", "hallo.foobar", 20, 12, -EINVAL); + test_dns_label_unescape_one("hallo.foobar", "hallo", 10, 5, 5); + test_dns_label_unescape_one("hallo\n.foobar", "hallo", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_one("hallo\\", "hallo", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_one("hallo\\032 ", "hallo ", 20, 7, -EINVAL); + test_dns_label_unescape_one(".", "", 20, 0, 0); + test_dns_label_unescape_one("..", "", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_one(".foobar", "", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_one("foobar.", "foobar", 20, 6, 6); + test_dns_label_unescape_one("foobar..", "foobar", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_one("foo-bar", "foo-bar", 20, 7, 7); + test_dns_label_unescape_one("foo-", "foo-", 20, 4, -EINVAL); + test_dns_label_unescape_one("-foo", "-foo", 20, 4, -EINVAL); + test_dns_label_unescape_one("-foo-", "-foo-", 20, 5, -EINVAL); + test_dns_label_unescape_one("foo-.", "foo-", 20, 4, -EINVAL); + test_dns_label_unescape_one("foo.-", "foo", 20, 3, 3); + test_dns_label_unescape_one("foo\\032", "foo ", 20, 4, -EINVAL); + test_dns_label_unescape_one("foo\\045", "foo-", 20, 4, -EINVAL); + test_dns_label_unescape_one("głąb", "głąb", 20, 6, -EINVAL); +} + +static void test_dns_name_to_wire_format_one(const char *what, const char *expect, size_t 
buffer_sz, int ret) { + uint8_t buffer[buffer_sz]; + int r; + + log_info("%s, %s, %zu, →%d", what, strnull(expect), buffer_sz, ret); + + r = dns_name_to_wire_format(what, buffer, buffer_sz, false); + assert_se(r == ret); + + if (r >= 0) { + assert(expect); /* for gcc */ + assert_se(memcmp(buffer, expect, r) == 0); + } +} + +TEST(dns_name_to_wire_format) { + static const char out0[] = { 0 }; + static const char out1[] = { 3, 'f', 'o', 'o', 0 }; + static const char out2[] = { 5, 'h', 'a', 'l', 'l', 'o', 3, 'f', 'o', 'o', 3, 'b', 'a', 'r', 0 }; + static const char out3[] = { 4, ' ', 'f', 'o', 'o', 3, 'b', 'a', 'r', 0 }; + static const char out4[] = { 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 9, 'a', '1', '2', '3', '4', '5', '6', '7', '8', + 3, 'a', '1', '2', 0 }; + + test_dns_name_to_wire_format_one("", out0, sizeof(out0), 
sizeof(out0)); + + test_dns_name_to_wire_format_one("foo", out1, sizeof(out1), sizeof(out1)); + test_dns_name_to_wire_format_one("foo", out1, sizeof(out1) + 1, sizeof(out1)); + test_dns_name_to_wire_format_one("foo", out1, sizeof(out1) - 1, -ENOBUFS); + + test_dns_name_to_wire_format_one("hallo.foo.bar", out2, sizeof(out2), sizeof(out2)); + test_dns_name_to_wire_format_one("hallo.foo..bar", NULL, 32, -EINVAL); + + test_dns_name_to_wire_format_one("\\032foo.bar", out3, sizeof(out3), sizeof(out3)); + + test_dns_name_to_wire_format_one("a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a123", NULL, 500, -EINVAL); + test_dns_name_to_wire_format_one("a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12", out4, sizeof(out4), sizeof(out4)); +} + +static void test_dns_label_unescape_suffix_one(const char *what, const char *expect1, const char *expect2, size_t buffer_sz, int ret1, int ret2) { + char buffer[buffer_sz]; + const char *label; + int r; + + log_info("%s, %s, %s, %zu, %d, %d", what, expect1, expect2, buffer_sz, ret1, ret2); + + label = what + strlen(what); + + r = dns_label_unescape_suffix(what, &label, buffer, buffer_sz); + assert_se(r == ret1); + if (r >= 0) + assert_se(streq(buffer, expect1)); + + r = dns_label_unescape_suffix(what, &label, buffer, buffer_sz); + assert_se(r == ret2); + if (r >= 0) + assert_se(streq(buffer, expect2)); +} + +TEST(dns_label_unescape_suffix) { + test_dns_label_unescape_suffix_one("hallo", "hallo", "", 6, 5, 0); + test_dns_label_unescape_suffix_one("hallo", "hallo", "", 4, -ENOBUFS, -ENOBUFS); + test_dns_label_unescape_suffix_one("", "", 
"", 10, 0, 0); + test_dns_label_unescape_suffix_one("hallo\\.foobar", "hallo.foobar", "", 20, 12, 0); + test_dns_label_unescape_suffix_one("hallo.foobar", "foobar", "hallo", 10, 6, 5); + test_dns_label_unescape_suffix_one("hallo.foobar\n", "foobar", "foobar", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_suffix_one("hallo\\", "hallo", "hallo", 20, -EINVAL, -EINVAL); + test_dns_label_unescape_suffix_one("hallo\\032 ", "hallo ", "", 20, 7, 0); + test_dns_label_unescape_suffix_one(".", "", "", 20, 0, 0); + test_dns_label_unescape_suffix_one("..", "", "", 20, 0, -EINVAL); + test_dns_label_unescape_suffix_one(".foobar", "foobar", "", 20, 6, -EINVAL); + test_dns_label_unescape_suffix_one("foobar.", "foobar", "", 20, 6, 0); + test_dns_label_unescape_suffix_one("foo\\\\bar", "foo\\bar", "", 20, 7, 0); + test_dns_label_unescape_suffix_one("foo.bar", "bar", "foo", 20, 3, 3); + test_dns_label_unescape_suffix_one("foo..bar", "bar", "", 20, 3, -EINVAL); + test_dns_label_unescape_suffix_one("foo...bar", "bar", "", 20, 3, -EINVAL); + test_dns_label_unescape_suffix_one("foo\\.bar", "foo.bar", "", 20, 7, 0); + test_dns_label_unescape_suffix_one("foo\\\\.bar", "bar", "foo\\", 20, 3, 4); + test_dns_label_unescape_suffix_one("foo\\\\\\.bar", "foo\\.bar", "", 20, 8, 0); +} + +static void test_dns_label_escape_one(const char *what, size_t l, const char *expect, int ret) { + _cleanup_free_ char *t = NULL; + int r; + + log_info("%s, %zu, %s, →%d", what, l, strnull(expect), ret); + + r = dns_label_escape_new(what, l, &t); + assert_se(r == ret); + + if (r < 0) + return; + + assert_se(streq_ptr(expect, t)); +} + +TEST(dns_label_escape) { + test_dns_label_escape_one("", 0, NULL, -EINVAL); + test_dns_label_escape_one("hallo", 5, "hallo", 5); + test_dns_label_escape_one("hallo", 6, "hallo\\000", 9); + test_dns_label_escape_one("hallo hallo.foobar,waldi", 24, "hallo\\032hallo\\.foobar\\044waldi", 31); +} + +static void test_dns_name_normalize_one(const char *what, const char *expect, int ret) 
{ + _cleanup_free_ char *t = NULL; + int r; + + r = dns_name_normalize(what, 0, &t); + assert_se(r == ret); + + if (r < 0) /* on an expected failure there is no output string to compare */ + return; + + assert_se(streq_ptr(expect, t)); +} + +TEST(dns_name_normalize) { + test_dns_name_normalize_one("", ".", 0); + test_dns_name_normalize_one("f", "f", 0); + test_dns_name_normalize_one("f.waldi", "f.waldi", 0); + test_dns_name_normalize_one("f \\032.waldi", "f\\032\\032.waldi", 0); + test_dns_name_normalize_one("\\000", "\\000", 0); + test_dns_name_normalize_one("..", NULL, -EINVAL); + test_dns_name_normalize_one(".foobar", NULL, -EINVAL); + test_dns_name_normalize_one("foobar.", "foobar", 0); + test_dns_name_normalize_one(".", ".", 0); +} + +static void test_dns_name_equal_one(const char *a, const char *b, int ret) { /* expects dns_name_equal() to return ret for both argument orders */ + int r; + + r = dns_name_equal(a, b); + assert_se(r == ret); + + r = dns_name_equal(b, a); + assert_se(r == ret); +} + +TEST(dns_name_equal) { + test_dns_name_equal_one("", "", true); + test_dns_name_equal_one("x", "x", true); + test_dns_name_equal_one("x", "x.", true); + test_dns_name_equal_one("abc.def", "abc.def", true); + test_dns_name_equal_one("abc.def", "ABC.def", true); + test_dns_name_equal_one("abc.def", "CBA.def", false); + test_dns_name_equal_one("", "xxx", false); + test_dns_name_equal_one("ab", "a", false); + test_dns_name_equal_one("\\000", "\\000", true); + test_dns_name_equal_one(".", "", true); + test_dns_name_equal_one(".", ".", true); + test_dns_name_equal_one("..", "..", -EINVAL); +} + +static void test_dns_name_between_one(const char *a, const char *b, const char *c, int ret) { /* expects dns_name_between(a, b, c) == ret, then checks the call with a and c swapped */ + int r; + + r = dns_name_between(a, b, c); + assert_se(r == ret); + + r = dns_name_between(c, b, a); + if (ret >= 0) /* with the ends swapped the result must be 0, unless a and c are the same name */ + assert_se(r == 0 || dns_name_equal(a, c) > 0); + else /* an error must be reported identically in both orders */ + assert_se(r == ret); +} + +TEST(dns_name_between) { + /* see https://tools.ietf.org/html/rfc4034#section-6.1 + Note that we use "\033.z.example" instead of "\001.z.example" as we + consider the latter invalid */ + 
test_dns_name_between_one("example", "a.example", "yljkjljk.a.example", true); + test_dns_name_between_one("a.example", "yljkjljk.a.example", "Z.a.example", true); + test_dns_name_between_one("yljkjljk.a.example", "Z.a.example", "zABC.a.EXAMPLE", true); + test_dns_name_between_one("Z.a.example", "zABC.a.EXAMPLE", "z.example", true); + test_dns_name_between_one("zABC.a.EXAMPLE", "z.example", "\\033.z.example", true); + test_dns_name_between_one("z.example", "\\033.z.example", "*.z.example", true); + test_dns_name_between_one("\\033.z.example", "*.z.example", "\\200.z.example", true); + test_dns_name_between_one("*.z.example", "\\200.z.example", "example", true); + test_dns_name_between_one("\\200.z.example", "example", "a.example", true); + + test_dns_name_between_one("example", "a.example", "example", true); + test_dns_name_between_one("example", "example", "example", false); + test_dns_name_between_one("example", "example", "yljkjljk.a.example", false); + test_dns_name_between_one("example", "yljkjljk.a.example", "yljkjljk.a.example", false); + test_dns_name_between_one("hkps.pool.sks-keyservers.net", "_pgpkey-https._tcp.hkps.pool.sks-keyservers.net", "ipv4.pool.sks-keyservers.net", true); +} + +static void test_dns_name_endswith_one(const char *a, const char *b, int ret) { + assert_se(dns_name_endswith(a, b) == ret); +} + +TEST(dns_name_endswith) { + test_dns_name_endswith_one("", "", true); + test_dns_name_endswith_one("", "xxx", false); + test_dns_name_endswith_one("xxx", "", true); + test_dns_name_endswith_one("x", "x", true); + test_dns_name_endswith_one("x", "y", false); + test_dns_name_endswith_one("x.y", "y", true); + test_dns_name_endswith_one("x.y", "Y", true); + test_dns_name_endswith_one("x.y", "x", false); + test_dns_name_endswith_one("x.y.z", "Z", true); + test_dns_name_endswith_one("x.y.z", "y.Z", true); + test_dns_name_endswith_one("x.y.z", "x.y.Z", true); + test_dns_name_endswith_one("x.y.z", "waldo", false); + 
test_dns_name_endswith_one("x.y.z.u.v.w", "y.z", false); + test_dns_name_endswith_one("x.y.z.u.v.w", "u.v.w", true); + test_dns_name_endswith_one("x.y\001.z", "waldo", -EINVAL); +} + +static void test_dns_name_startswith_one(const char *a, const char *b, int ret) { + assert_se(dns_name_startswith(a, b) == ret); +} + +TEST(dns_name_startswith) { + test_dns_name_startswith_one("", "", true); + test_dns_name_startswith_one("", "xxx", false); + test_dns_name_startswith_one("xxx", "", true); + test_dns_name_startswith_one("x", "x", true); + test_dns_name_startswith_one("x", "y", false); + test_dns_name_startswith_one("x.y", "x.y", true); + test_dns_name_startswith_one("x.y", "y.x", false); + test_dns_name_startswith_one("x.y", "x", true); + test_dns_name_startswith_one("x.y", "X", true); + test_dns_name_startswith_one("x.y", "y", false); + test_dns_name_startswith_one("x.y", "", true); + test_dns_name_startswith_one("x.y", "X", true); +} + +TEST(dns_name_is_root) { + assert_se(dns_name_is_root("")); + assert_se(dns_name_is_root(".")); + assert_se(!dns_name_is_root("xxx")); + assert_se(!dns_name_is_root("xxx.")); + assert_se(!dns_name_is_root("..")); +} + +TEST(dns_name_is_single_label) { + assert_se(!dns_name_is_single_label("")); + assert_se(!dns_name_is_single_label(".")); + assert_se(!dns_name_is_single_label("..")); + assert_se(dns_name_is_single_label("x")); + assert_se(dns_name_is_single_label("x.")); + assert_se(!dns_name_is_single_label("xx.yy")); +} + +static void test_dns_name_reverse_one(const char *address, const char *name) { + _cleanup_free_ char *p = NULL; + union in_addr_union a, b = {}; + int familya, familyb; + + assert_se(in_addr_from_string_auto(address, &familya, &a) >= 0); + assert_se(dns_name_reverse(familya, &a, &p) >= 0); + assert_se(streq(p, name)); + assert_se(dns_name_address(p, &familyb, &b) > 0); + assert_se(familya == familyb); + assert_se(in_addr_equal(familya, &a, &b)); +} + +TEST(dns_name_reverse) { + 
test_dns_name_reverse_one("47.11.8.15", "15.8.11.47.in-addr.arpa"); + test_dns_name_reverse_one("fe80::47", "7.4.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.e.f.ip6.arpa"); + test_dns_name_reverse_one("127.0.0.1", "1.0.0.127.in-addr.arpa"); + test_dns_name_reverse_one("::1", "1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa"); +} + +static void test_dns_name_concat_one(const char *a, const char *b, int r, const char *result) { /* Calls dns_name_concat(a, b, 0, &p) and checks the return code against r and the produced string against result. */ + _cleanup_free_ char *p = NULL; + + assert_se(dns_name_concat(a, b, 0, &p) == r); + assert_se(streq_ptr(p, result)); +} + +TEST(dns_name_concat) { /* Each row: first name, second name, expected return code, expected result. Covers empty, dot-only and NULL inputs on either side; every row here expects return code 0. */ + test_dns_name_concat_one("", "", 0, "."); + test_dns_name_concat_one(".", "", 0, "."); + test_dns_name_concat_one("", ".", 0, "."); + test_dns_name_concat_one(".", ".", 0, "."); + test_dns_name_concat_one("foo", "bar", 0, "foo.bar"); + test_dns_name_concat_one("foo.foo", "bar.bar", 0, "foo.foo.bar.bar"); + test_dns_name_concat_one("foo", NULL, 0, "foo"); + test_dns_name_concat_one("foo", ".", 0, "foo"); + test_dns_name_concat_one("foo.", "bar.", 0, "foo.bar"); + test_dns_name_concat_one(NULL, NULL, 0, "."); + test_dns_name_concat_one(NULL, ".", 0, "."); + test_dns_name_concat_one(NULL, "foo", 0, "foo"); +} + +static void test_dns_name_is_valid_one(const char *s, int ret, int ret_ldh) { /* Logs the case, then runs s through both validators: ret is the expected result of dns_name_is_valid(), ret_ldh the expected result of dns_name_is_valid_ldh(). Only ret is included in the log line. */ + log_info("%s, →%d", s, ret); + + assert_se(dns_name_is_valid(s) == ret); + assert_se(dns_name_is_valid_ldh(s) == ret_ldh); +} + +TEST(dns_name_is_valid) { /* Each row: name, expected dns_name_is_valid() result, expected dns_name_is_valid_ldh() result. */ + test_dns_name_is_valid_one("foo", 1, 1); + test_dns_name_is_valid_one("foo.", 1, 1); + test_dns_name_is_valid_one("foo..", 0, 0); + test_dns_name_is_valid_one("Foo", 1, 1); + test_dns_name_is_valid_one("foo.bar", 1, 1); + test_dns_name_is_valid_one("foo.bar.baz", 1, 1); + test_dns_name_is_valid_one("", 1, 1); + test_dns_name_is_valid_one("foo..bar", 0, 0); + test_dns_name_is_valid_one(".foo.bar", 0, 0); + test_dns_name_is_valid_one("foo.bar.", 1, 1); + test_dns_name_is_valid_one("foo.bar..", 0, 0); +
test_dns_name_is_valid_one("\\zbar", 0, 0); + test_dns_name_is_valid_one("ä", 1, 0); + test_dns_name_is_valid_one("\n", 0, 0); + + test_dns_name_is_valid_one("dash-", 1, 0); + test_dns_name_is_valid_one("-dash", 1, 0); + test_dns_name_is_valid_one("dash-dash", 1, 1); + test_dns_name_is_valid_one("foo.dash-", 1, 0); + test_dns_name_is_valid_one("foo.-dash", 1, 0); + test_dns_name_is_valid_one("foo.dash-dash", 1, 1); + test_dns_name_is_valid_one("foo.dash-.bar", 1, 0); + test_dns_name_is_valid_one("foo.-dash.bar", 1, 0); + test_dns_name_is_valid_one("foo.dash-dash.bar", 1, 1); + test_dns_name_is_valid_one("dash-.bar", 1, 0); + test_dns_name_is_valid_one("-dash.bar", 1, 0); + test_dns_name_is_valid_one("dash-dash.bar", 1, 1); + test_dns_name_is_valid_one("-.bar", 1, 0); + test_dns_name_is_valid_one("foo.-", 1, 0); + + /* 256 characters */ + test_dns_name_is_valid_one("a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345", 0, 0); + + /* 255 characters */ + test_dns_name_is_valid_one("a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a1234", 0, 0); + + /* 254 characters */ + test_dns_name_is_valid_one("a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a123", 0, 0); + + /* 253 characters */ + 
test_dns_name_is_valid_one("a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12345678.a12", 1, 1); + + /* label of 64 chars length */ + test_dns_name_is_valid_one("a123456789a123456789a123456789a123456789a123456789a123456789a123", 0, 0); + + /* label of 63 chars length */ + test_dns_name_is_valid_one("a123456789a123456789a123456789a123456789a123456789a123456789a12", 1, 1); +} + +TEST(dns_service_name_is_valid) { + assert_se(dns_service_name_is_valid("Lennart's Compüter")); + assert_se(dns_service_name_is_valid("piff.paff")); + + assert_se(!dns_service_name_is_valid(NULL)); + assert_se(!dns_service_name_is_valid("")); + assert_se(!dns_service_name_is_valid("foo\nbar")); + assert_se(!dns_service_name_is_valid("foo\201bar")); + assert_se(!dns_service_name_is_valid("this is an overly long string that is certainly longer than 63 characters")); +} + +TEST(dns_srv_type_is_valid) { + assert_se(dns_srv_type_is_valid("_http._tcp")); + assert_se(dns_srv_type_is_valid("_foo-bar._tcp")); + assert_se(dns_srv_type_is_valid("_w._udp")); + assert_se(dns_srv_type_is_valid("_a800._tcp")); + assert_se(dns_srv_type_is_valid("_a-800._tcp")); + + assert_se(!dns_srv_type_is_valid(NULL)); + assert_se(!dns_srv_type_is_valid("")); + assert_se(!dns_srv_type_is_valid("x")); + assert_se(!dns_srv_type_is_valid("_foo")); + assert_se(!dns_srv_type_is_valid("_tcp")); + assert_se(!dns_srv_type_is_valid("_")); + assert_se(!dns_srv_type_is_valid("_foo.")); + assert_se(!dns_srv_type_is_valid("_föo._tcp")); + assert_se(!dns_srv_type_is_valid("_f\no._tcp")); + assert_se(!dns_srv_type_is_valid("_800._tcp")); + assert_se(!dns_srv_type_is_valid("_-800._tcp")); + assert_se(!dns_srv_type_is_valid("_-foo._tcp")); + assert_se(!dns_srv_type_is_valid("_piep._foo._udp")); +} + +TEST(dnssd_srv_type_is_valid) { + 
assert_se(dnssd_srv_type_is_valid("_http._tcp")); + assert_se(dnssd_srv_type_is_valid("_foo-bar._tcp")); + assert_se(dnssd_srv_type_is_valid("_w._udp")); + assert_se(dnssd_srv_type_is_valid("_a800._tcp")); + assert_se(dnssd_srv_type_is_valid("_a-800._tcp")); + + assert_se(!dnssd_srv_type_is_valid(NULL)); + assert_se(!dnssd_srv_type_is_valid("")); + assert_se(!dnssd_srv_type_is_valid("x")); + assert_se(!dnssd_srv_type_is_valid("_foo")); + assert_se(!dnssd_srv_type_is_valid("_tcp")); + assert_se(!dnssd_srv_type_is_valid("_")); + assert_se(!dnssd_srv_type_is_valid("_foo.")); + assert_se(!dnssd_srv_type_is_valid("_föo._tcp")); + assert_se(!dnssd_srv_type_is_valid("_f\no._tcp")); + assert_se(!dnssd_srv_type_is_valid("_800._tcp")); + assert_se(!dnssd_srv_type_is_valid("_-800._tcp")); + assert_se(!dnssd_srv_type_is_valid("_-foo._tcp")); + assert_se(!dnssd_srv_type_is_valid("_piep._foo._udp")); + assert_se(!dnssd_srv_type_is_valid("_foo._unknown")); /* this row has no counterpart in the dns_srv_type_is_valid() table */ +} + +static void test_dns_service_join_one(const char *a, const char *b, const char *c, int r, const char *d) { /* Joins a, b and c with dns_service_join() and checks return code r and result d. When the join succeeds, the result is split again with dns_service_split() and the three parts are compared with the inputs. */ + _cleanup_free_ char *x = NULL, *y = NULL, *z = NULL, *t = NULL; + + log_info("%s, %s, %s, →%d, %s", strnull(a), strnull(b), strnull(c), r, strnull(d)); + + assert_se(dns_service_join(a, b, c, &t) == r); + assert_se(streq_ptr(t, d)); + + if (r < 0) + return; /* the round-trip check below only applies to joins expected to succeed */ + + assert_se(dns_service_split(t, &x, &y, &z) >= 0); + assert_se(streq_ptr(a, x)); + assert_se(streq_ptr(b, y)); + assert_se(dns_name_equal(c, z) > 0); /* the third part is compared with dns_name_equal() instead of a plain string compare */ +} + +TEST(dns_service_join) { /* Each row: first part, service type, domain, expected return code, expected joined name. The first group lists combinations expected to fail with -EINVAL, the rows after the blank line are expected to succeed. */ + test_dns_service_join_one("", "", "", -EINVAL, NULL); + test_dns_service_join_one("", "_http._tcp", "", -EINVAL, NULL); + test_dns_service_join_one("", "_http._tcp", "foo", -EINVAL, NULL); + test_dns_service_join_one("foo", "", "foo", -EINVAL, NULL); + test_dns_service_join_one("foo", "foo", "foo", -EINVAL, NULL); + + test_dns_service_join_one("foo", "_http._tcp", "", 0, "foo._http._tcp"); + test_dns_service_join_one(NULL, "_http._tcp", "", 0, "_http._tcp"); +
test_dns_service_join_one("foo", "_http._tcp", "foo", 0, "foo._http._tcp.foo"); + test_dns_service_join_one(NULL, "_http._tcp", "foo", 0, "_http._tcp.foo"); + test_dns_service_join_one("Lennart's PC", "_pc._tcp", "foo.bar.com", 0, "Lennart\\039s\\032PC._pc._tcp.foo.bar.com"); + test_dns_service_join_one(NULL, "_pc._tcp", "foo.bar.com", 0, "_pc._tcp.foo.bar.com"); +} + +static void test_dns_service_split_one(const char *joined, const char *a, const char *b, const char *c, int r) { /* Splits joined with dns_service_split(), then checks return code r and the three output strings against a, b and c. On success a consistency check follows, see below. */ + _cleanup_free_ char *x = NULL, *y = NULL, *z = NULL, *t = NULL; + + log_info("%s, %s, %s, %s, →%d", joined, strnull(a), strnull(b), strnull(c), r); + + assert_se(dns_service_split(joined, &x, &y, &z) == r); + assert_se(streq_ptr(x, a)); + assert_se(streq_ptr(y, b)); + assert_se(streq_ptr(z, c)); + + if (r < 0) + return; + + if (y) { /* a second part was returned: joining the three parts again must give a name equal to the input */ + assert_se(dns_service_join(x, y, z, &t) == 0); + assert_se(dns_name_equal(joined, t) > 0); + } else /* no second part: the first part must be unset as well and the third part must equal the whole input */ + assert_se(!x && dns_name_equal(z, joined) > 0); +} + +TEST(dns_service_split) { /* Each row: joined name, expected first part, expected second part, expected third part, expected return code. */ + test_dns_service_split_one("", NULL, NULL, ".", 0); + test_dns_service_split_one("foo", NULL, NULL, "foo", 0); + test_dns_service_split_one("foo.bar", NULL, NULL, "foo.bar", 0); + test_dns_service_split_one("_foo.bar", NULL, NULL, "_foo.bar", 0); + test_dns_service_split_one("_foo._bar", NULL, "_foo._bar", ".", 0); + test_dns_service_split_one("_meh._foo._bar", "_meh", "_foo._bar", ".", 0); + test_dns_service_split_one("Wuff\\032Wuff._foo._bar.waldo.com", "Wuff Wuff", "_foo._bar", "waldo.com", 0); + test_dns_service_split_one("_Q._Q-------------------------------------------------------------", NULL, "_Q._Q-------------------------------------------------------------", ".", 0); +} + +static void test_dns_name_change_suffix_one(const char *name, const char *old_suffix, const char *new_suffix, int r, const char *result) { /* Calls dns_name_change_suffix() on name with old_suffix and new_suffix, then checks return code r and output string result. */ + _cleanup_free_ char *s = NULL; + + log_info("%s, %s, %s, →%s", name, old_suffix, new_suffix, strnull(result)); + + assert_se(dns_name_change_suffix(name, 
old_suffix, new_suffix, &s) == r); + assert_se(streq_ptr(s, result)); +} + +TEST(dns_name_change_suffix) { /* Each row: name, old suffix, new suffix, expected return code, expected result. Rows expecting 1 carry the rewritten name; the last row passes a suffix the name does not end in and expects 0 with no output string. */ + test_dns_name_change_suffix_one("foo.bar", "bar", "waldo", 1, "foo.waldo"); + test_dns_name_change_suffix_one("foo.bar.waldi.quux", "foo.bar.waldi.quux", "piff.paff", 1, "piff.paff"); + test_dns_name_change_suffix_one("foo.bar.waldi.quux", "bar.waldi.quux", "piff.paff", 1, "foo.piff.paff"); + test_dns_name_change_suffix_one("foo.bar.waldi.quux", "waldi.quux", "piff.paff", 1, "foo.bar.piff.paff"); + test_dns_name_change_suffix_one("foo.bar.waldi.quux", "quux", "piff.paff", 1, "foo.bar.waldi.piff.paff"); + test_dns_name_change_suffix_one("foo.bar.waldi.quux", "", "piff.paff", 1, "foo.bar.waldi.quux.piff.paff"); + test_dns_name_change_suffix_one("", "", "piff.paff", 1, "piff.paff"); + test_dns_name_change_suffix_one("", "", "", 1, "."); + test_dns_name_change_suffix_one("a", "b", "c", 0, NULL); +} + +static void test_dns_name_suffix_one(const char *name, unsigned n_labels, const char *result, int ret) { /* Calls dns_name_suffix(name, n_labels, &p), then checks the return value against ret and the string p points to against result. p is a plain const pointer and is not freed here. */ + const char *p = NULL; + + log_info("%s, %u, → %s, %d", name, n_labels, strnull(result), ret); + + assert_se(ret == dns_name_suffix(name, n_labels, &p)); + assert_se(streq_ptr(p, result)); +} + +TEST(dns_name_suffix) { /* Per the rows below: the expected result keeps the last n_labels labels of the name, the expected return value equals the number of labels in front of them, and asking for more labels than the name has expects -EINVAL with no result. */ + test_dns_name_suffix_one("foo.bar", 2, "foo.bar", 0); + test_dns_name_suffix_one("foo.bar", 1, "bar", 1); + test_dns_name_suffix_one("foo.bar", 0, "", 2); + test_dns_name_suffix_one("foo.bar", 3, NULL, -EINVAL); + test_dns_name_suffix_one("foo.bar", 4, NULL, -EINVAL); + + test_dns_name_suffix_one("bar", 1, "bar", 0); + test_dns_name_suffix_one("bar", 0, "", 1); + test_dns_name_suffix_one("bar", 2, NULL, -EINVAL); + test_dns_name_suffix_one("bar", 3, NULL, -EINVAL); + + test_dns_name_suffix_one("", 0, "", 0); + test_dns_name_suffix_one("", 1, NULL, -EINVAL); + test_dns_name_suffix_one("", 2, NULL, -EINVAL); +} + +static void test_dns_name_count_labels_one(const char *name, int n) { /* Logs the case and asserts that dns_name_count_labels(name) returns n; n may be a negative errno value. */ + log_info("%s, →%d", name, n); + + 
assert_se(dns_name_count_labels(name) == n); +} + +TEST(dns_name_count_labels) { /* A trailing dot does not change the expected count; the empty name and a single dot expect 0; two dots expect -EINVAL. */ + test_dns_name_count_labels_one("foo.bar.quux.", 3); + test_dns_name_count_labels_one("foo.bar.quux", 3); + test_dns_name_count_labels_one("foo.bar.", 2); + test_dns_name_count_labels_one("foo.bar", 2); + test_dns_name_count_labels_one("foo.", 1); + test_dns_name_count_labels_one("foo", 1); + test_dns_name_count_labels_one("", 0); + test_dns_name_count_labels_one(".", 0); + test_dns_name_count_labels_one("..", -EINVAL); +} + +static void test_dns_name_equal_skip_one(const char *a, unsigned n_labels, const char *b, int ret) { /* Logs the case and asserts that dns_name_equal_skip(a, n_labels, b) returns ret. */ + log_info("%s, %u, %s, →%d", a, n_labels, b, ret); + + assert_se(dns_name_equal_skip(a, n_labels, b) == ret); +} + +TEST(dns_name_equal_skip) { /* Each row: name a, label count passed as n_labels, name b, expected result. Per the rows, 1 is expected only when a with its first n_labels labels removed equals b; a label count larger than what a has expects 0 rather than an error. */ + test_dns_name_equal_skip_one("foo", 0, "bar", 0); + test_dns_name_equal_skip_one("foo", 0, "foo", 1); + test_dns_name_equal_skip_one("foo", 1, "foo", 0); + test_dns_name_equal_skip_one("foo", 2, "foo", 0); + + test_dns_name_equal_skip_one("foo.bar", 0, "foo.bar", 1); + test_dns_name_equal_skip_one("foo.bar", 1, "foo.bar", 0); + test_dns_name_equal_skip_one("foo.bar", 2, "foo.bar", 0); + test_dns_name_equal_skip_one("foo.bar", 3, "foo.bar", 0); + + test_dns_name_equal_skip_one("foo.bar", 0, "bar", 0); + test_dns_name_equal_skip_one("foo.bar", 1, "bar", 1); + test_dns_name_equal_skip_one("foo.bar", 2, "bar", 0); + test_dns_name_equal_skip_one("foo.bar", 3, "bar", 0); + + test_dns_name_equal_skip_one("foo.bar", 0, "", 0); + test_dns_name_equal_skip_one("foo.bar", 1, "", 0); + test_dns_name_equal_skip_one("foo.bar", 2, "", 1); + test_dns_name_equal_skip_one("foo.bar", 3, "", 0); + + test_dns_name_equal_skip_one("", 0, "", 1); + test_dns_name_equal_skip_one("", 1, "", 0); + test_dns_name_equal_skip_one("", 1, "foo", 0); + test_dns_name_equal_skip_one("", 2, "foo", 0); +} + +TEST(dns_name_compare_func) { /* Names that differ only by a trailing dot or by letter case are expected to compare as equal (0). */ + assert_se(dns_name_compare_func("", "") == 0); + assert_se(dns_name_compare_func("", ".") == 0); + 
assert_se(dns_name_compare_func(".", "") == 0); + assert_se(dns_name_compare_func("foo", "foo.") == 0); + assert_se(dns_name_compare_func("foo.", "foo") == 0); + assert_se(dns_name_compare_func("foo", "foo") == 0); + assert_se(dns_name_compare_func("foo.", "foo.") == 0); + assert_se(dns_name_compare_func("heise.de", "HEISE.DE.") == 0); + + assert_se(dns_name_compare_func("de.", "heise.de") != 0); /* only inequality is checked here, not the sign of the result */ +} + +static void test_dns_name_common_suffix_one(const char *a, const char *b, const char *result) { /* Logs the case, requires dns_name_common_suffix(a, b, &c) to succeed and compares the string c points to with result. c is a plain const pointer and is not freed here. */ + const char *c; + + log_info("%s, %s, →%s", a, b, result); + + assert_se(dns_name_common_suffix(a, b, &c) >= 0); + assert_se(streq(c, result)); +} + +TEST(dns_name_common_suffix) { /* Each row: name a, name b, expected common suffix. The last row mixes letter case; the expected suffix BAR is spelled as in the first argument. */ + test_dns_name_common_suffix_one("", "", ""); + test_dns_name_common_suffix_one("foo", "", ""); + test_dns_name_common_suffix_one("", "foo", ""); + test_dns_name_common_suffix_one("foo", "bar", ""); + test_dns_name_common_suffix_one("bar", "foo", ""); + test_dns_name_common_suffix_one("foo", "foo", "foo"); + test_dns_name_common_suffix_one("quux.foo", "foo", "foo"); + test_dns_name_common_suffix_one("foo", "quux.foo", "foo"); + test_dns_name_common_suffix_one("this.is.a.short.sentence", "this.is.another.short.sentence", "short.sentence"); + test_dns_name_common_suffix_one("FOO.BAR", "tEST.bAR", "BAR"); +} + +static void test_dns_name_apply_idna_one(const char *s, int expected, const char *result) { /* Runs dns_name_apply_idna() on s and logs the outcome next to the expectation. expected is treated as a lower bound for the return value; the converted name is compared with result only when expected is 1. */ + _cleanup_free_ char *buf = NULL; + int r; + + r = dns_name_apply_idna(s, &buf); + log_debug("dns_name_apply_idna: \"%s\" → %d/\"%s\" (expected %d/\"%s\")", + s, r, strnull(buf), expected, strnull(result)); + + /* Different libidn2 versions are more and less accepting + * of underscore-prefixed names. So let's list the lowest + * expected return value. 
*/ + assert_se(r >= expected); + if (expected == 1) + assert_se(dns_name_equal(buf, result) == 1); +} + +TEST(dns_name_apply_idna) { + const int ret = HAVE_LIBIDN2 | HAVE_LIBIDN; /* bitwise OR of the two build-time macros; used as the expected value for the rows that do not depend on which of the two is set */ + + /* IDNA2008 forbids names with hyphens in third and fourth positions + * (https://tools.ietf.org/html/rfc5891#section-4.2.3.1). + * IDNA2003 does not have this restriction + * (https://tools.ietf.org/html/rfc3490#section-5). + * This means that when using libidn we will transform and test more + * labels. If registrars follow IDNA2008 we'll just be performing a + * useless lookup. + */ + const int ret2 = HAVE_LIBIDN; /* expected value for the underscore-prefixed rows and for the row with hyphens in third and fourth position */ + + test_dns_name_apply_idna_one("", ret, ""); + test_dns_name_apply_idna_one("foo", ret, "foo"); + test_dns_name_apply_idna_one("foo.", ret, "foo"); + test_dns_name_apply_idna_one("foo.bar", ret, "foo.bar"); + test_dns_name_apply_idna_one("foo.bar.", ret, "foo.bar"); + test_dns_name_apply_idna_one("föö", ret, "xn--f-1gaa"); + test_dns_name_apply_idna_one("föö.", ret, "xn--f-1gaa"); + test_dns_name_apply_idna_one("föö.bär", ret, "xn--f-1gaa.xn--br-via"); + test_dns_name_apply_idna_one("föö.bär.", ret, "xn--f-1gaa.xn--br-via"); + test_dns_name_apply_idna_one("xn--f-1gaa.xn--br-via", ret, "xn--f-1gaa.xn--br-via"); + + test_dns_name_apply_idna_one("_443._tcp.fedoraproject.org", ret2, + "_443._tcp.fedoraproject.org"); + test_dns_name_apply_idna_one("_443", ret2, "_443"); + test_dns_name_apply_idna_one("gateway", ret, "gateway"); + test_dns_name_apply_idna_one("_gateway", ret2, "_gateway"); + + test_dns_name_apply_idna_one("r3---sn-ab5l6ne7.googlevideo.com", ret2, + ret2 ? 
"r3---sn-ab5l6ne7.googlevideo.com" : ""); +} + +TEST(dns_name_is_valid_or_address) { /* NULL, the empty string and a name with an empty label expect 0; plain host names and the IPv4 and IPv6 literals expect a positive result. */ + assert_se(dns_name_is_valid_or_address(NULL) == 0); + assert_se(dns_name_is_valid_or_address("") == 0); + assert_se(dns_name_is_valid_or_address("foobar") > 0); + assert_se(dns_name_is_valid_or_address("foobar.com") > 0); + assert_se(dns_name_is_valid_or_address("foobar..com") == 0); + assert_se(dns_name_is_valid_or_address("foobar.com.") > 0); + assert_se(dns_name_is_valid_or_address("127.0.0.1") > 0); + assert_se(dns_name_is_valid_or_address("::") > 0); + assert_se(dns_name_is_valid_or_address("::1") > 0); +} + +TEST(dns_name_dot_suffixed) { /* Per the rows below, a name ending in a backslash-escaped dot expects 0, while a name ending in an unescaped dot expects a positive result, even when escaped dots precede it. */ + assert_se(dns_name_dot_suffixed("") == 0); + assert_se(dns_name_dot_suffixed(".") > 0); + assert_se(dns_name_dot_suffixed("foo") == 0); + assert_se(dns_name_dot_suffixed("foo.") > 0); + assert_se(dns_name_dot_suffixed("foo\\..") > 0); + assert_se(dns_name_dot_suffixed("foo\\.") == 0); + assert_se(dns_name_dot_suffixed("foo.bar.") > 0); + assert_se(dns_name_dot_suffixed("foo.bar\\.\\.\\..") > 0); + assert_se(dns_name_dot_suffixed("foo.bar\\.\\.\\.\\.") == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-ellipsize.c b/src/test/test-ellipsize.c new file mode 100644 index 0000000..c272c56 --- /dev/null +++ b/src/test/test-ellipsize.c @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "constants.h" +#include "escape.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tests.h" +#include "utf8.h" + +static void test_ellipsize_mem_one(const char *s, size_t old_length, size_t new_length) { /* Duplicates the first old_length bytes of s with memdup_suffix0() and returns early when that copy is not valid UTF-8. The remainder of the function runs ellipsize_mem() on the copy at 30, 90 and 100 percent and checks the console width of each result. */ + _cleanup_free_ char *n = NULL; + _cleanup_free_ char *t1 = NULL, *t2 = NULL, *t3 = NULL; + char buf[LINE_MAX]; + bool has_wide_chars; + size_t max_width; + + n = memdup_suffix0(s, old_length); + + if (!utf8_is_valid(n)) + /* We don't support invalid sequences… */ + return; + + /* Report out inputs. 
We duplicate the data so that cellescape + * can properly report truncated multibyte sequences. */ + log_info("%s \"%s\" old_length=%zu/%zu new_length=%zu", __func__, + cellescape(buf, sizeof buf, n), + old_length, utf8_console_width(n), + new_length); + + /* To keep this test simple, any case with wide chars starts with this glyph */ + has_wide_chars = startswith(s, "你"); + max_width = MIN(utf8_console_width(n), new_length); + + t1 = ellipsize_mem(n, old_length, new_length, 30); + log_info("30%% → %s utf8_console_width=%zu", t1, utf8_console_width(t1)); + if (!has_wide_chars) + assert_se(utf8_console_width(t1) == max_width); + else + assert_se(utf8_console_width(t1) <= max_width); + + t2 = ellipsize_mem(n, old_length, new_length, 90); + log_info("90%% → %s utf8_console_width=%zu", t2, utf8_console_width(t2)); + if (!has_wide_chars) + assert_se(utf8_console_width(t2) == max_width); + else + assert_se(utf8_console_width(t2) <= max_width); + + t3 = ellipsize_mem(n, old_length, new_length, 100); + log_info("100%% → %s utf8_console_width=%zu", t3, utf8_console_width(t3)); + if (!has_wide_chars) + assert_se(utf8_console_width(t3) == max_width); + else + assert_se(utf8_console_width(t3) <= max_width); + + if (new_length >= old_length) { + assert_se(streq(t1, n)); + assert_se(streq(t2, n)); + assert_se(streq(t3, n)); + } +} + +TEST(ellipsize_mem) { + FOREACH_STRING(s, + "_XXXXXXXXXXX_", /* ASCII */ + "_aąęółśćńżźć_", /* two-byte utf-8 */ + "გამარჯობა", /* multi-byte utf-8 */ + "你好世界", /* wide characters */ + "你გą世óoó界") /* a mix */ + for (ssize_t l = strlen(s); l >= 0; l--) + for (ssize_t k = strlen(s) + 1; k >= 0; k--) + test_ellipsize_mem_one(s, l, k); +} + +static void test_ellipsize_one(const char *p) { + _cleanup_free_ char *t = NULL; + t = ellipsize(p, columns(), 70); + puts(t); + free(t); + t = ellipsize(p, columns(), 0); + puts(t); + free(t); + t = ellipsize(p, columns(), 100); + puts(t); + free(t); + t = ellipsize(p, 0, 50); + puts(t); + free(t); + t = 
ellipsize(p, 1, 50); + puts(t); + free(t); + t = ellipsize(p, 2, 50); + puts(t); + free(t); + t = ellipsize(p, 3, 50); + puts(t); + free(t); + t = ellipsize(p, 4, 50); + puts(t); + free(t); + t = ellipsize(p, 5, 50); + puts(t); +} + +TEST(ellipsize) { + test_ellipsize_one(DIGITS LETTERS DIGITS LETTERS); + test_ellipsize_one("한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어"); + test_ellipsize_one("-日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国"); + test_ellipsize_one("中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国-中国中国中国中国中国中国中国中国中国中国中国中国中国"); + test_ellipsize_one("sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd"); + test_ellipsize_one("🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮"); + test_ellipsize_one("Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."); + test_ellipsize_one("shórt"); +} + +TEST(ellipsize_ansi) { + const char *s = ANSI_HIGHLIGHT_YELLOW_UNDERLINE "yęllow" + ANSI_HIGHLIGHT_GREY_UNDERLINE "grěy" + ANSI_HIGHLIGHT_BLUE_UNDERLINE "blue" + ANSI_NORMAL "nórmął"; + size_t len = strlen(s); + + for (unsigned percent = 0; percent <= 100; percent += 15) + for (ssize_t x = 21; x >= 0; x--) { + _cleanup_free_ char *t = ellipsize_mem(s, len, x, percent); + printf("%02zd: \"%s\"\n", x, t); + assert_se(utf8_is_valid(t)); + + if (DEBUG_LOGGING) { + _cleanup_free_ char *e = cescape(t); + printf(" : \"%s\"\n", e); + } + } +} + +TEST(ellipsize_ansi_cats) { + _cleanup_free_ char *e = NULL, *f = NULL, *g = NULL, *h = NULL; + + /* Make sure we don't cut off in the middle of an ANSI escape sequence. */ + + e = ellipsize("01" ANSI_NORMAL "23", 4, 0); + puts(e); + assert_se(streq(e, "01" ANSI_NORMAL "23")); + f = ellipsize("ab" ANSI_NORMAL "cd", 4, 90); + puts(f); + assert_se(streq(f, "ab" ANSI_NORMAL "cd")); + + g = ellipsize("🐱🐱" ANSI_NORMAL "🐱🐱" ANSI_NORMAL, 5, 0); + puts(g); + assert_se(streq(g, "…" ANSI_NORMAL "🐱🐱" ANSI_NORMAL)); + h = ellipsize("🐱🐱" ANSI_NORMAL "🐱🐱" ANSI_NORMAL, 5, 90); + puts(h); + assert_se(streq(h, "🐱…" ANSI_NORMAL "🐱" ANSI_NORMAL)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-emergency-action.c b/src/test/test-emergency-action.c new file mode 100644 index 0000000..5c0ce7f --- /dev/null +++ b/src/test/test-emergency-action.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "emergency-action.h" +#include "tests.h" + +TEST(parse_emergency_action) { + EmergencyAction x; + + assert_se(parse_emergency_action("none", RUNTIME_SCOPE_USER, &x) == 0); + assert_se(x == EMERGENCY_ACTION_NONE); + assert_se(parse_emergency_action("reboot", RUNTIME_SCOPE_USER, &x) == -EOPNOTSUPP); + assert_se(parse_emergency_action("reboot-force", RUNTIME_SCOPE_USER, &x) == 
-EOPNOTSUPP); + assert_se(parse_emergency_action("reboot-immediate", RUNTIME_SCOPE_USER, &x) == -EOPNOTSUPP); + assert_se(parse_emergency_action("poweroff", RUNTIME_SCOPE_USER, &x) == -EOPNOTSUPP); + assert_se(parse_emergency_action("poweroff-force", RUNTIME_SCOPE_USER, &x) == -EOPNOTSUPP); + assert_se(parse_emergency_action("poweroff-immediate", RUNTIME_SCOPE_USER, &x) == -EOPNOTSUPP); + assert_se(x == EMERGENCY_ACTION_NONE); + assert_se(parse_emergency_action("exit", RUNTIME_SCOPE_USER, &x) == 0); + assert_se(x == EMERGENCY_ACTION_EXIT); + assert_se(parse_emergency_action("exit-force", RUNTIME_SCOPE_USER, &x) == 0); + assert_se(x == EMERGENCY_ACTION_EXIT_FORCE); + assert_se(parse_emergency_action("exit-forcee", RUNTIME_SCOPE_USER, &x) == -EINVAL); + + assert_se(parse_emergency_action("none", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(x == EMERGENCY_ACTION_NONE); + assert_se(parse_emergency_action("reboot", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(x == EMERGENCY_ACTION_REBOOT); + assert_se(parse_emergency_action("reboot-force", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(x == EMERGENCY_ACTION_REBOOT_FORCE); + assert_se(parse_emergency_action("reboot-immediate", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(x == EMERGENCY_ACTION_REBOOT_IMMEDIATE); + assert_se(parse_emergency_action("poweroff", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(x == EMERGENCY_ACTION_POWEROFF); + assert_se(parse_emergency_action("poweroff-force", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(x == EMERGENCY_ACTION_POWEROFF_FORCE); + assert_se(parse_emergency_action("poweroff-immediate", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(parse_emergency_action("exit", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(parse_emergency_action("exit-force", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(parse_emergency_action("exit-forcee", RUNTIME_SCOPE_SYSTEM, &x) == -EINVAL); + assert_se(x == EMERGENCY_ACTION_EXIT_FORCE); + assert_se(parse_emergency_action("kexec", RUNTIME_SCOPE_SYSTEM, &x) == 0); + 
assert_se(parse_emergency_action("kexec-force", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(parse_emergency_action("kexec-forcee", RUNTIME_SCOPE_SYSTEM, &x) == -EINVAL); + assert_se(x == EMERGENCY_ACTION_KEXEC_FORCE); /* checked after the failing parse: x is still expected to hold the value belonging to the preceding successful one */ + assert_se(parse_emergency_action("halt", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(parse_emergency_action("halt-force", RUNTIME_SCOPE_SYSTEM, &x) == 0); + assert_se(parse_emergency_action("halt-forcee", RUNTIME_SCOPE_SYSTEM, &x) == -EINVAL); + assert_se(x == EMERGENCY_ACTION_HALT_FORCE); /* same pattern as above: the misspelled name must not have changed x */ +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-engine.c b/src/test/test-engine.c new file mode 100644 index 0000000..cf77e7c --- /dev/null +++ b/src/test/test-engine.c @@ -0,0 +1,300 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "bus-util.h" +#include "manager.h" +#include "manager-dump.h" +#include "rm-rf.h" +#include "service.h" +#include "slice.h" +#include "special.h" +#include "strv.h" +#include "tests.h" +#include "unit-serialize.h" + +static void verify_dependency_atoms(void) { + UnitDependencyAtom combined = 0, multi_use_atoms = 0; /* combined: union of the atoms of every dependency type visited so far; multi_use_atoms: atoms that were found in more than one dependency type */ + + /* Let's guarantee that our dependency type/atom translation tables are fully correct */ + + for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) { + UnitDependencyAtom a; + UnitDependency reverse; + bool has_superset = false; /* becomes true when some other dependency type carries every atom of d */ + + assert_se((a = unit_dependency_to_atom(d)) >= 0); + + for (UnitDependency t = 0; t < _UNIT_DEPENDENCY_MAX; t++) { + UnitDependencyAtom b; + + if (t == d) + continue; + + assert_se((b = unit_dependency_to_atom(t)) >= 0); + + if ((a & b) == a) { /* every bit set in a is also set in b */ + has_superset = true; + break; + } + } + + reverse = unit_dependency_from_unique_atom(a); + assert_se(reverse == _UNIT_DEPENDENCY_INVALID || reverse >= 0); + + assert_se((reverse < 0) == has_superset); /* If one dependency type is a superset of another, + * then the reverse mapping is not unique, verify + * that. 
*/ + + log_info("Verified dependency type: %s", unit_dependency_to_string(d)); + + multi_use_atoms |= combined & a; /* atoms of d that an earlier dependency type already contributed are recorded as multi-use */ + combined |= a; + } + + /* Make sure all atoms are used, i.e. there's at least one dependency type that references it. */ + assert_se(combined == _UNIT_DEPENDENCY_ATOM_MAX); + + for (UnitDependencyAtom a = 1; a <= _UNIT_DEPENDENCY_ATOM_MAX; a <<= 1) { /* a starts at 1 and is shifted left by one each round, so each iteration looks at a single-bit value */ + + if (multi_use_atoms & a) { + /* If an atom is used by multiple dep types, then mapping the atom to a dependency is + * not unique and *must* fail */ + assert_se(unit_dependency_from_unique_atom(a) == _UNIT_DEPENDENCY_INVALID); + continue; + } + + /* If only a single dep type uses specific atom, let's guarantee our mapping table is + complete, and thus the atom can be mapped to the single dep type that is used. */ + assert_se(unit_dependency_from_unique_atom(a) >= 0); + } +} + +int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error err = SD_BUS_ERROR_NULL; + _cleanup_(manager_freep) Manager *m = NULL; + Unit *a = NULL, *b = NULL, *c = NULL, *d = NULL, *e = NULL, *g = NULL, + *h = NULL, *i = NULL, *a_conj = NULL, *unit_with_multiple_dashes = NULL, *stub = NULL, + *tomato = NULL, *sauce = NULL, *fruit = NULL, *zupa = NULL; + Job *j; + int r; + + test_setup_logging(LOG_DEBUG); + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + /* prepare the test */ + _cleanup_free_ char *unit_dir = NULL; + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m); + if (manager_errno_skip_test(r)) + return log_tests_skipped_errno(r, "manager_new"); + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + printf("Load1:\n"); + assert_se(manager_load_startable_unit_or_warn(m, "a.service", 
NULL, &a) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "b.service", NULL, &b) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "c.service", NULL, &c) >= 0); + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test1: (Trivial)\n"); + r = manager_add_job(m, JOB_START, c, JOB_REPLACE, NULL, &err, &j); + if (sd_bus_error_is_set(&err)) + log_error("error: %s: %s", err.name, err.message); + assert_se(r == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Load2:\n"); + manager_clear_jobs(m); + assert_se(manager_load_startable_unit_or_warn(m, "d.service", NULL, &d) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "e.service", NULL, &e) >= 0); + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test2: (Cyclic Order, Unfixable)\n"); + assert_se(manager_add_job(m, JOB_START, d, JOB_REPLACE, NULL, NULL, &j) == -EDEADLK); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test3: (Cyclic Order, Fixable, Garbage Collector)\n"); + assert_se(manager_add_job(m, JOB_START, e, JOB_REPLACE, NULL, NULL, &j) == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test4: (Identical transaction)\n"); + assert_se(manager_add_job(m, JOB_START, e, JOB_FAIL, NULL, NULL, &j) == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Load3:\n"); + assert_se(manager_load_startable_unit_or_warn(m, "g.service", NULL, &g) >= 0); + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test5: (Colliding transaction, fail)\n"); + assert_se(manager_add_job(m, JOB_START, g, JOB_FAIL, NULL, NULL, &j) == -EDEADLK); + + printf("Test6: (Colliding transaction, replace)\n"); + assert_se(manager_add_job(m, JOB_START, g, JOB_REPLACE, NULL, NULL, &j) == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test7: (Unmergeable job type, fail)\n"); + assert_se(manager_add_job(m, JOB_STOP, g, JOB_FAIL, NULL, NULL, &j) 
== -EDEADLK); + + printf("Test8: (Mergeable job type, fail)\n"); + assert_se(manager_add_job(m, JOB_RESTART, g, JOB_FAIL, NULL, NULL, &j) == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test9: (Unmergeable job type, replace)\n"); + assert_se(manager_add_job(m, JOB_STOP, g, JOB_REPLACE, NULL, NULL, &j) == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Load4:\n"); + assert_se(manager_load_startable_unit_or_warn(m, "h.service", NULL, &h) >= 0); + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test10: (Unmergeable job type of auxiliary job, fail)\n"); + assert_se(manager_add_job(m, JOB_START, h, JOB_FAIL, NULL, NULL, &j) == 0); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Load5:\n"); + manager_clear_jobs(m); + assert_se(manager_load_startable_unit_or_warn(m, "i.service", NULL, &i) >= 0); + SERVICE(a)->state = SERVICE_RUNNING; + SERVICE(d)->state = SERVICE_RUNNING; + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test11: (Start/stop job ordering, execution cycle)\n"); + assert_se(manager_add_job(m, JOB_START, i, JOB_FAIL, NULL, NULL, &j) == 0); + assert_se(unit_has_job_type(a, JOB_STOP)); + assert_se(unit_has_job_type(d, JOB_STOP)); + assert_se(unit_has_job_type(b, JOB_START)); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Load6:\n"); + manager_clear_jobs(m); + assert_se(manager_load_startable_unit_or_warn(m, "a-conj.service", NULL, &a_conj) >= 0); + SERVICE(a)->state = SERVICE_DEAD; + manager_dump_units(m, stdout, /* patterns= */ NULL, "\t"); + + printf("Test12: (Trivial cycle, Unfixable)\n"); + assert_se(manager_add_job(m, JOB_START, a_conj, JOB_REPLACE, NULL, NULL, &j) == -EDEADLK); + manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t"); + + assert_se(!hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), b)); + assert_se(!hashmap_get(unit_get_dependencies(b, UNIT_RELOAD_PROPAGATED_FROM), a)); 
+ assert_se(!hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), c)); + assert_se(!hashmap_get(unit_get_dependencies(c, UNIT_RELOAD_PROPAGATED_FROM), a)); + + assert_se(unit_add_dependency(a, UNIT_PROPAGATES_RELOAD_TO, b, true, UNIT_DEPENDENCY_UDEV) >= 0); + assert_se(unit_add_dependency(a, UNIT_PROPAGATES_RELOAD_TO, c, true, UNIT_DEPENDENCY_PROC_SWAP) >= 0); + + assert_se( hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), b)); + assert_se( hashmap_get(unit_get_dependencies(b, UNIT_RELOAD_PROPAGATED_FROM), a)); + assert_se( hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), c)); + assert_se( hashmap_get(unit_get_dependencies(c, UNIT_RELOAD_PROPAGATED_FROM), a)); + + unit_remove_dependencies(a, UNIT_DEPENDENCY_UDEV); + + assert_se(!hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), b)); + assert_se(!hashmap_get(unit_get_dependencies(b, UNIT_RELOAD_PROPAGATED_FROM), a)); + assert_se( hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), c)); + assert_se( hashmap_get(unit_get_dependencies(c, UNIT_RELOAD_PROPAGATED_FROM), a)); + + unit_remove_dependencies(a, UNIT_DEPENDENCY_PROC_SWAP); + + assert_se(!hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), b)); + assert_se(!hashmap_get(unit_get_dependencies(b, UNIT_RELOAD_PROPAGATED_FROM), a)); + assert_se(!hashmap_get(unit_get_dependencies(a, UNIT_PROPAGATES_RELOAD_TO), c)); + assert_se(!hashmap_get(unit_get_dependencies(c, UNIT_RELOAD_PROPAGATED_FROM), a)); + + assert_se(manager_load_unit(m, "unit-with-multiple-dashes.service", NULL, NULL, &unit_with_multiple_dashes) >= 0); + + assert_se(strv_equal(unit_with_multiple_dashes->documentation, STRV_MAKE("man:test", "man:override2", "man:override3"))); + assert_se(streq_ptr(unit_with_multiple_dashes->description, "override4")); + + /* Now merge a synthetic unit into the existing one */ + assert_se(unit_new_for_name(m, sizeof(Service), "merged.service", &stub) >= 0); + 
assert_se(unit_add_dependency_by_name(stub, UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_FILE) >= 0); + assert_se(unit_add_dependency_by_name(stub, UNIT_AFTER, "quux.target", true, UNIT_DEPENDENCY_FILE) >= 0); + assert_se(unit_add_dependency_by_name(stub, UNIT_AFTER, SPECIAL_ROOT_SLICE, true, UNIT_DEPENDENCY_FILE) >= 0); + assert_se(unit_add_dependency_by_name(stub, UNIT_REQUIRES, "non-existing.mount", true, UNIT_DEPENDENCY_FILE) >= 0); + assert_se(unit_add_dependency_by_name(stub, UNIT_ON_FAILURE, "non-existing-on-failure.target", true, UNIT_DEPENDENCY_FILE) >= 0); + assert_se(unit_add_dependency_by_name(stub, UNIT_ON_SUCCESS, "non-existing-on-success.target", true, UNIT_DEPENDENCY_FILE) >= 0); + + log_info("/* Merging a+stub, dumps before */"); + unit_dump(a, stderr, NULL); + unit_dump(stub, stderr, NULL); + assert_se(unit_merge(a, stub) >= 0); + log_info("/* Dump of merged a+stub */"); + unit_dump(a, stderr, NULL); + + assert_se( unit_has_dependency(a, UNIT_ATOM_AFTER, manager_get_unit(m, SPECIAL_BASIC_TARGET))); + assert_se( unit_has_dependency(a, UNIT_ATOM_AFTER, manager_get_unit(m, "quux.target"))); + assert_se( unit_has_dependency(a, UNIT_ATOM_AFTER, manager_get_unit(m, SPECIAL_ROOT_SLICE))); + assert_se( unit_has_dependency(a, UNIT_ATOM_PULL_IN_START, manager_get_unit(m, "non-existing.mount"))); + assert_se( unit_has_dependency(a, UNIT_ATOM_RETROACTIVE_START_REPLACE, manager_get_unit(m, "non-existing.mount"))); + assert_se( unit_has_dependency(a, UNIT_ATOM_ON_FAILURE, manager_get_unit(m, "non-existing-on-failure.target"))); + assert_se( unit_has_dependency(manager_get_unit(m, "non-existing-on-failure.target"), UNIT_ATOM_ON_FAILURE_OF, a)); + assert_se( unit_has_dependency(a, UNIT_ATOM_ON_SUCCESS, manager_get_unit(m, "non-existing-on-success.target"))); + assert_se( unit_has_dependency(manager_get_unit(m, "non-existing-on-success.target"), UNIT_ATOM_ON_SUCCESS_OF, a)); + assert_se(!unit_has_dependency(a, UNIT_ATOM_ON_FAILURE, manager_get_unit(m, 
"basic.target"))); + assert_se(!unit_has_dependency(a, UNIT_ATOM_ON_SUCCESS, manager_get_unit(m, "basic.target"))); + assert_se(!unit_has_dependency(a, UNIT_ATOM_ON_FAILURE_OF, manager_get_unit(m, "basic.target"))); + assert_se(!unit_has_dependency(a, UNIT_ATOM_ON_SUCCESS_OF, manager_get_unit(m, "basic.target"))); + assert_se(!unit_has_dependency(a, UNIT_ATOM_PROPAGATES_RELOAD_TO, manager_get_unit(m, "non-existing-on-failure.target"))); + + assert_se(unit_has_name(a, "a.service")); + assert_se(unit_has_name(a, "merged.service")); + + unsigned mm = 1; + Unit *other; + + UNIT_FOREACH_DEPENDENCY(other, a, UNIT_ATOM_AFTER) { + mm *= unit_has_name(other, SPECIAL_BASIC_TARGET) ? 3 : 1; + mm *= unit_has_name(other, "quux.target") ? 5 : 1; + mm *= unit_has_name(other, SPECIAL_ROOT_SLICE) ? 7 : 1; + } + + UNIT_FOREACH_DEPENDENCY(other, a, UNIT_ATOM_ON_FAILURE) + mm *= unit_has_name(other, "non-existing-on-failure.target") ? 11 : 1; + + UNIT_FOREACH_DEPENDENCY(other, a, UNIT_ATOM_PULL_IN_START) + mm *= unit_has_name(other, "non-existing.mount") ? 
13 : 1; + + assert_se(mm == 3U*5U*7U*11U*13U); + + verify_dependency_atoms(); + + /* Test adding multiple Slice= dependencies; only the last should remain */ + assert_se(unit_new_for_name(m, sizeof(Service), "tomato.service", &tomato) >= 0); + assert_se(unit_new_for_name(m, sizeof(Slice), "sauce.slice", &sauce) >= 0); + assert_se(unit_new_for_name(m, sizeof(Slice), "fruit.slice", &fruit) >= 0); + assert_se(unit_new_for_name(m, sizeof(Slice), "zupa.slice", &zupa) >= 0); + + unit_set_slice(tomato, sauce); + unit_set_slice(tomato, fruit); + unit_set_slice(tomato, zupa); + + assert_se(UNIT_GET_SLICE(tomato) == zupa); + assert_se(!unit_has_dependency(tomato, UNIT_ATOM_IN_SLICE, sauce)); + assert_se(!unit_has_dependency(tomato, UNIT_ATOM_IN_SLICE, fruit)); + assert_se( unit_has_dependency(tomato, UNIT_ATOM_IN_SLICE, zupa)); + + assert_se(!unit_has_dependency(tomato, UNIT_ATOM_REFERENCES, sauce)); + assert_se(!unit_has_dependency(tomato, UNIT_ATOM_REFERENCES, fruit)); + assert_se( unit_has_dependency(tomato, UNIT_ATOM_REFERENCES, zupa)); + + assert_se(!unit_has_dependency(sauce, UNIT_ATOM_SLICE_OF, tomato)); + assert_se(!unit_has_dependency(fruit, UNIT_ATOM_SLICE_OF, tomato)); + assert_se( unit_has_dependency(zupa, UNIT_ATOM_SLICE_OF, tomato)); + + assert_se(!unit_has_dependency(sauce, UNIT_ATOM_REFERENCED_BY, tomato)); + assert_se(!unit_has_dependency(fruit, UNIT_ATOM_REFERENCED_BY, tomato)); + assert_se( unit_has_dependency(zupa, UNIT_ATOM_REFERENCED_BY, tomato)); + + return 0; +} diff --git a/src/test/test-env-file.c b/src/test/test-env-file.c new file mode 100644 index 0000000..3fc6d62 --- /dev/null +++ b/src/test/test-env-file.c @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "env-file.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "macro.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +/* In case of repeating keys, later entries win. 
*/ + +#define env_file_1 \ + "a=a\n" \ + "a=b\n" \ + "a=b\n" \ + "a=a\n" \ + "b=b\\\n" \ + "c\n" \ + "d= d\\\n" \ + "e \\\n" \ + "f \n" \ + "g=g\\ \n" \ + "h= ąęół\\ śćńźżμ \n" \ + "i=i\\" + +#define env_file_2 \ + "a=a\\\n" + +#define env_file_3 \ + "#SPAMD_ARGS=\"-d --socketpath=/var/lib/bulwark/spamd \\\n" \ + "#--nouser-config \\\n" \ + "normal1=line\\\n" \ + "111\n" \ + ";normal=ignored \\\n" \ + "normal2=line222\n" \ + "normal ignored \\\n" + +#define env_file_4 \ + "# Generated\n" \ + "\n" \ + "HWMON_MODULES=\"coretemp f71882fg\"\n" \ + "\n" \ + "# For compatibility reasons\n" \ + "\n" \ + "MODULE_0=coretemp\n" \ + "MODULE_1=f71882fg" + +#define env_file_5 \ + "a=\n" \ + "b=" + +#define env_file_6 \ + "a=\\ \\n \\t \\x \\y \\' \n" \ + "b= \\$' \n" \ + "c= ' \\n\\t\\$\\`\\\\\n" \ + "' \n" \ + "d= \" \\n\\t\\$\\`\\\\\n" \ + "\" \n" + +TEST(load_env_file_1) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, env_file_1) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == 0); + assert_se(streq(data[0], "a=a")); + assert_se(streq(data[1], "b=bc")); + assert_se(streq(data[2], "d=de f")); + assert_se(streq(data[3], "g=g ")); + assert_se(streq(data[4], "h=ąęół śćńźżμ")); + assert_se(streq(data[5], "i=i")); + assert_se(data[6] == NULL); +} + +TEST(load_env_file_2) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, env_file_2) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == 0); + assert_se(streq(data[0], "a=a")); + assert_se(data[1] == NULL); +} + +TEST(load_env_file_3) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, env_file_3) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == 0); + assert_se(streq(data[0], "normal1=line111")); + 
assert_se(streq(data[1], "normal2=line222")); + assert_se(data[2] == NULL); +} + +TEST(load_env_file_4) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, env_file_4) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == 0); + assert_se(streq(data[0], "HWMON_MODULES=coretemp f71882fg")); + assert_se(streq(data[1], "MODULE_0=coretemp")); + assert_se(streq(data[2], "MODULE_1=f71882fg")); + assert_se(data[3] == NULL); +} + +TEST(load_env_file_5) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, env_file_5) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == 0); + assert_se(streq(data[0], "a=")); + assert_se(streq(data[1], "b=")); + assert_se(data[2] == NULL); +} + +TEST(load_env_file_6) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, env_file_6) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == 0); + assert_se(streq(data[0], "a= n t x y '")); + assert_se(streq(data[1], "b=$'")); + assert_se(streq(data[2], "c= \\n\\t\\$\\`\\\\\n")); + assert_se(streq(data[3], "d= \\n\\t$`\\\n")); + assert_se(data[4] == NULL); +} + +TEST(load_env_file_invalid_utf8) { + /* Test out a couple of assignments where the key/value has an invalid + * UTF-8 character ("noncharacter") + * + * See: https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Non-characters + */ + FOREACH_STRING(s, + "fo\ufffeo=bar", + "foo=b\uffffar", + "baz=hello world\ufffe") { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-load-env-file.XXXXXX"; + assert_se(write_tmpfile(name, s) == 0); + + _cleanup_strv_free_ char **data = NULL; + assert_se(load_env_file(NULL, name, &data) == -EINVAL); + assert_se(!data); + } +} + +TEST(write_and_load_env_file) { + /* Make sure that our 
writer, parser and the shell agree on what our env var files mean */ + + FOREACH_STRING(v, + "obbardc-laptop", + "obbardc\\-laptop", + "obbardc-lap\\top", + "obbardc-lap\\top", + "obbardc-lap\\\\top", + "double\"quote", + "single\'quote", + "dollar$dollar", + "newline\nnewline") { + _cleanup_(unlink_and_freep) char *p = NULL; + _cleanup_strv_free_ char **l = NULL; + _cleanup_free_ char *j = NULL, *w = NULL, *cmd = NULL, *from_shell = NULL; + _cleanup_pclose_ FILE *f = NULL; + size_t sz; + + assert_se(tempfn_random_child(NULL, NULL, &p) >= 0); + + assert_se(j = strjoin("TEST=", v)); + assert_se(write_env_file(AT_FDCWD, p, STRV_MAKE("# header 1", "", "# header 2"), STRV_MAKE(j)) >= 0); + + assert_se(cmd = strjoin(". ", p, " && /bin/echo -n \"$TEST\"")); + assert_se(f = popen(cmd, "re")); + assert_se(read_full_stream(f, &from_shell, &sz) >= 0); + assert_se(sz == strlen(v)); + assert_se(streq(from_shell, v)); + + assert_se(load_env_file(NULL, p, &l) >= 0); + assert_se(strv_equal(l, STRV_MAKE(j))); + + assert_se(parse_env_file(NULL, p, "TEST", &w) >= 0); + assert_se(streq_ptr(w, v)); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-env-util.c b/src/test/test-env-util.c new file mode 100644 index 0000000..dffbad6 --- /dev/null +++ b/src/test/test-env-util.c @@ -0,0 +1,563 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "serialize.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +TEST(strv_env_delete) { + _cleanup_strv_free_ char **a = NULL, **b = NULL, **c = NULL, **d = NULL; + + a = strv_new("FOO=BAR", "WALDO=WALDO", "WALDO=", "PIEP", "SCHLUMPF=SMURF"); + assert_se(a); + + b = strv_new("PIEP", "FOO"); + assert_se(b); + + c = strv_new("SCHLUMPF"); + assert_se(c); + + d = strv_env_delete(a, 2, b, c); + assert_se(d); + + assert_se(streq(d[0], "WALDO=WALDO")); + assert_se(streq(d[1], 
"WALDO=")); + assert_se(strv_length(d) == 2); +} + +TEST(strv_env_get) { + char **l = STRV_MAKE("ONE_OR_TWO=1", "THREE=3", "ONE_OR_TWO=2", "FOUR=4"); + + assert_se(streq(strv_env_get(l, "ONE_OR_TWO"), "2")); + assert_se(streq(strv_env_get(l, "THREE"), "3")); + assert_se(streq(strv_env_get(l, "FOUR"), "4")); +} + +TEST(strv_env_pairs_get) { + char **l = STRV_MAKE("ONE_OR_TWO", "1", "THREE", "3", "ONE_OR_TWO", "2", "FOUR", "4", "FIVE", "5", "SIX", "FIVE", "SEVEN", "7"); + + assert_se(streq(strv_env_pairs_get(l, "ONE_OR_TWO"), "2")); + assert_se(streq(strv_env_pairs_get(l, "THREE"), "3")); + assert_se(streq(strv_env_pairs_get(l, "FOUR"), "4")); + assert_se(streq(strv_env_pairs_get(l, "FIVE"), "5")); +} + +TEST(strv_env_unset) { + _cleanup_strv_free_ char **l = NULL; + + l = strv_new("PIEP", "SCHLUMPF=SMURFF", "NANANANA=YES"); + assert_se(l); + + assert_se(strv_env_unset(l, "SCHLUMPF") == l); + + assert_se(streq(l[0], "PIEP")); + assert_se(streq(l[1], "NANANANA=YES")); + assert_se(strv_length(l) == 2); +} + +TEST(strv_env_merge) { + char **a = STRV_MAKE("FOO=BAR", "WALDO=WALDO", "WALDO=", "PIEP", "SCHLUMPF=SMURF", "EQ==="); + char **b = STRV_MAKE("FOO=KKK", "FOO=", "PIEP=", "SCHLUMPF=SMURFF", "NANANANA=YES"); + + _cleanup_strv_free_ char **r = strv_env_merge(NULL, a, NULL, b, NULL, a, b, b, NULL); + assert_se(r); + assert_se(streq(r[0], "FOO=")); + assert_se(streq(r[1], "WALDO=")); + assert_se(streq(r[2], "PIEP")); + assert_se(streq(r[3], "SCHLUMPF=SMURFF")); + assert_se(streq(r[4], "EQ===")); + assert_se(streq(r[5], "PIEP=")); + assert_se(streq(r[6], "NANANANA=YES")); + assert_se(strv_length(r) == 7); + + assert_se(strv_env_clean(r) == r); + assert_se(streq(r[0], "FOO=")); + assert_se(streq(r[1], "WALDO=")); + assert_se(streq(r[2], "SCHLUMPF=SMURFF")); + assert_se(streq(r[3], "EQ===")); + assert_se(streq(r[4], "PIEP=")); + assert_se(streq(r[5], "NANANANA=YES")); + assert_se(strv_length(r) == 6); +} + +TEST(strv_env_replace_strdup) { + _cleanup_strv_free_ char **a = 
NULL; + + assert_se(strv_env_replace_strdup(&a, "a=a") == 1); + assert_se(strv_env_replace_strdup(&a, "b=b") == 1); + assert_se(strv_env_replace_strdup(&a, "a=A") == 0); + assert_se(strv_env_replace_strdup(&a, "c") == -EINVAL); + + assert_se(strv_length(a) == 2); + strv_sort(a); + assert_se(streq(a[0], "a=A")); + assert_se(streq(a[1], "b=b")); +} + +TEST(strv_env_replace_strdup_passthrough) { + _cleanup_strv_free_ char **a = NULL; + + assert_se(putenv((char*) "a=a") == 0); + assert_se(putenv((char*) "b=") == 0); + assert_se(unsetenv("c") == 0); + + assert_se(strv_env_replace_strdup_passthrough(&a, "a") == 1); + assert_se(strv_env_replace_strdup_passthrough(&a, "b") == 1); + assert_se(strv_env_replace_strdup_passthrough(&a, "c") == 1); + assert_se(strv_env_replace_strdup_passthrough(&a, "a") == 0); + assert_se(strv_env_replace_strdup_passthrough(&a, "$a") == -EINVAL); + + assert_se(strv_length(a) == 3); + assert_se(streq(a[0], "a=a")); + assert_se(streq(a[1], "b=")); + assert_se(streq(a[2], "c=")); +} + +TEST(strv_env_assign) { + _cleanup_strv_free_ char **a = NULL; + + assert_se(strv_env_assign(&a, "a", "a") == 1); + assert_se(strv_env_assign(&a, "b", "b") == 1); + assert_se(strv_env_assign(&a, "a", "A") == 0); + assert_se(strv_env_assign(&a, "b", NULL) == 0); + + assert_se(strv_env_assign(&a, "a=", "B") == -EINVAL); + + assert_se(strv_length(a) == 1); + assert_se(streq(a[0], "a=A")); +} + +TEST(strv_env_assign_many) { + _cleanup_strv_free_ char **a = NULL; + + assert_se(strv_env_assign_many(&a, "a", "a", "b", "b") >= 0); + + assert_se(strv_length(a) == 2); + assert_se(strv_contains(a, "a=a")); + assert_se(strv_contains(a, "b=b")); + + assert_se(strv_env_assign_many(&a, "a", "A", "b", "b", "c", "c") >= 0); + assert_se(strv_length(a) == 3); + assert_se(strv_contains(a, "a=A")); + assert_se(strv_contains(a, "b=b")); + assert_se(strv_contains(a, "c=c")); + + assert_se(strv_env_assign_many(&a, "b", NULL, "c", "C") >= 0); + assert_se(strv_length(a) == 2); + 
assert_se(strv_contains(a, "a=A")); + assert_se(strv_contains(a, "c=C")); + + assert_se(strv_env_assign_many(&a, "a=", "B") == -EINVAL); + assert_se(strv_length(a) == 2); + assert_se(strv_contains(a, "a=A")); + assert_se(strv_contains(a, "c=C")); +} + +TEST(env_strv_get_n) { + const char *_env[] = { + "FOO=NO NO NO", + "FOO=BAR BAR", + "BAR=waldo", + "PATH=unset", + NULL + }; + char **env = (char**) _env; + + assert_se(streq(strv_env_get_n(env, "FOO__", 3, 0), "BAR BAR")); + assert_se(streq(strv_env_get_n(env, "FOO__", 3, REPLACE_ENV_USE_ENVIRONMENT), "BAR BAR")); + assert_se(streq(strv_env_get_n(env, "FOO", 3, 0), "BAR BAR")); + assert_se(streq(strv_env_get_n(env, "FOO", 3, REPLACE_ENV_USE_ENVIRONMENT), "BAR BAR")); + + assert_se(streq(strv_env_get_n(env, "PATH__", 4, 0), "unset")); + assert_se(streq(strv_env_get_n(env, "PATH", 4, 0), "unset")); + assert_se(streq(strv_env_get_n(env, "PATH__", 4, REPLACE_ENV_USE_ENVIRONMENT), "unset")); + assert_se(streq(strv_env_get_n(env, "PATH", 4, REPLACE_ENV_USE_ENVIRONMENT), "unset")); + + env[3] = NULL; /* kill our $PATH */ + + assert_se(!strv_env_get_n(env, "PATH__", 4, 0)); + assert_se(!strv_env_get_n(env, "PATH", 4, 0)); + assert_se(streq_ptr(strv_env_get_n(env, "PATH__", 4, REPLACE_ENV_USE_ENVIRONMENT), + getenv("PATH"))); + assert_se(streq_ptr(strv_env_get_n(env, "PATH", 4, REPLACE_ENV_USE_ENVIRONMENT), + getenv("PATH"))); +} + +static void test_replace_env1(bool braceless) { + log_info("/* %s(braceless=%s) */", __func__, yes_no(braceless)); + + const char *env[] = { + "FOO=BAR BAR", + "BAR=waldo", + NULL + }; + _cleanup_free_ char *t = NULL, *s = NULL, *q = NULL, *r = NULL, *p = NULL; + unsigned flags = REPLACE_ENV_ALLOW_BRACELESS*braceless; + + assert_se(replace_env("FOO=$FOO=${FOO}", (char**) env, flags, &t) >= 0); + assert_se(streq(t, braceless ? "FOO=BAR BAR=BAR BAR" : "FOO=$FOO=BAR BAR")); + + assert_se(replace_env("BAR=$BAR=${BAR}", (char**) env, flags, &s) >= 0); + assert_se(streq(s, braceless ? 
"BAR=waldo=waldo" : "BAR=$BAR=waldo")); + + assert_se(replace_env("BARBAR=$BARBAR=${BARBAR}", (char**) env, flags, &q) >= 0); + assert_se(streq(q, braceless ? "BARBAR==" : "BARBAR=$BARBAR=")); + + assert_se(replace_env("BAR=$BAR$BAR${BAR}${BAR}", (char**) env, flags, &r) >= 0); + assert_se(streq(r, braceless ? "BAR=waldowaldowaldowaldo" : "BAR=$BAR$BARwaldowaldo")); + + assert_se(replace_env("${BAR}$BAR$BAR", (char**) env, flags, &p) >= 0); + assert_se(streq(p, braceless ? "waldowaldowaldo" : "waldo$BAR$BAR")); +} + +static void test_replace_env2(bool extended) { + log_info("/* %s(extended=%s) */", __func__, yes_no(extended)); + + const char *env[] = { + "FOO=foo", + "BAR=bar", + NULL + }; + _cleanup_free_ char *t = NULL, *s = NULL, *q = NULL, *r = NULL, *p = NULL, *x = NULL, *y = NULL; + unsigned flags = REPLACE_ENV_ALLOW_EXTENDED*extended; + + assert_se(replace_env("FOO=${FOO:-${BAR}}", (char**) env, flags, &t) >= 0); + assert_se(streq(t, extended ? "FOO=foo" : "FOO=${FOO:-bar}")); + + assert_se(replace_env("BAR=${XXX:-${BAR}}", (char**) env, flags, &s) >= 0); + assert_se(streq(s, extended ? "BAR=bar" : "BAR=${XXX:-bar}")); + + assert_se(replace_env("XXX=${XXX:+${BAR}}", (char**) env, flags, &q) >= 0); + assert_se(streq(q, extended ? "XXX=" : "XXX=${XXX:+bar}")); + + assert_se(replace_env("FOO=${FOO:+${BAR}}", (char**) env, flags, &r) >= 0); + assert_se(streq(r, extended ? "FOO=bar" : "FOO=${FOO:+bar}")); + + assert_se(replace_env("FOO=${FOO:-${BAR}post}", (char**) env, flags, &p) >= 0); + assert_se(streq(p, extended ? "FOO=foo" : "FOO=${FOO:-barpost}")); + + assert_se(replace_env("XXX=${XXX:+${BAR}post}", (char**) env, flags, &x) >= 0); + assert_se(streq(x, extended ? "XXX=" : "XXX=${XXX:+barpost}")); + + assert_se(replace_env("FOO=${FOO}between${BAR:-baz}", (char**) env, flags, &y) >= 0); + assert_se(streq(y, extended ? 
"FOO=foobetweenbar" : "FOO=foobetween${BAR:-baz}")); +} + +TEST(replace_env) { + test_replace_env1(false); + test_replace_env1(true); + test_replace_env2(false); + test_replace_env2(true); +} + +TEST(replace_env_argv) { + const char *env[] = { + "FOO=BAR BAR", + "BAR=waldo", + NULL + }; + const char *line[] = { + "FOO$FOO", + "FOO$FOOFOO", + "FOO${FOO}$FOO", + "FOO${FOO}", + "${FOO}", + "$FOO", + "$FOO$FOO", + "${FOO}${BAR}", + "${FOO", + "FOO$$${FOO}", + "$$FOO${FOO}", + "${FOO:-${BAR}}", + "${QUUX:-${FOO}}", + "${FOO:+${BAR}}", + "${QUUX:+${BAR}}", + "${FOO:+|${BAR}|}}", + "${FOO:+|${BAR}{|}", + NULL + }; + _cleanup_strv_free_ char **r = NULL; + + assert_se(replace_env_argv((char**) line, (char**) env, &r, NULL, NULL) >= 0); + assert_se(r); + assert_se(streq(r[0], "FOO$FOO")); + assert_se(streq(r[1], "FOO$FOOFOO")); + assert_se(streq(r[2], "FOOBAR BAR$FOO")); + assert_se(streq(r[3], "FOOBAR BAR")); + assert_se(streq(r[4], "BAR BAR")); + assert_se(streq(r[5], "BAR")); + assert_se(streq(r[6], "BAR")); + assert_se(streq(r[7], "BAR BARwaldo")); + assert_se(streq(r[8], "${FOO")); + assert_se(streq(r[9], "FOO$BAR BAR")); + assert_se(streq(r[10], "$FOOBAR BAR")); + assert_se(streq(r[11], "${FOO:-waldo}")); + assert_se(streq(r[12], "${QUUX:-BAR BAR}")); + assert_se(streq(r[13], "${FOO:+waldo}")); + assert_se(streq(r[14], "${QUUX:+waldo}")); + assert_se(streq(r[15], "${FOO:+|waldo|}}")); + assert_se(streq(r[16], "${FOO:+|waldo{|}")); + assert_se(strv_length(r) == 17); +} + +TEST(replace_env_argv_bad) { + + const char *env[] = { + "FOO=BAR BAR", + "BAR=waldo", + NULL + }; + + const char *line[] = { + "$FOO", + "A${FOO}B", + "a${~}${%}b", + "x${}y", + "$UNSET2", + "z${UNSET3}z${UNSET1}z", + "piff${UNSET2}piff", + NULL + }; + + _cleanup_strv_free_ char **bad = NULL, **unset = NULL, **replaced = NULL; + + assert_se(replace_env_argv((char**) line, (char**) env, &replaced, &unset, &bad) >= 0); + + assert_se(strv_equal(replaced, STRV_MAKE( + "BAR", + "BAR", + "ABAR BARB", + 
"ab", + "xy", + "zzz", + "piffpiff"))); + + assert_se(strv_equal(unset, STRV_MAKE( + "UNSET1", + "UNSET2", + "UNSET3"))); + assert_se(strv_equal(bad, STRV_MAKE("", + "%", + "~"))); +} + +TEST(env_clean) { + _cleanup_strv_free_ char **e = strv_new("FOOBAR=WALDO", + "FOOBAR=WALDO", + "FOOBAR", + "F", + "X=", + "F=F", + "=", + "=F", + "", + "0000=000", + "äöüß=abcd", + "abcd=äöüß", + "xyz\n=xyz", + "xyz=xyz\n", + "another=one", + "another=final one", + "CRLF=\r\n", + "LESS_TERMCAP_mb=\x1b[01;31m", + "BASH_FUNC_foo%%=() { echo foo\n}"); + assert_se(e); + assert_se(!strv_env_is_valid(e)); + assert_se(strv_env_clean(e) == e); + assert_se(strv_env_is_valid(e)); + + assert_se(streq(e[0], "FOOBAR=WALDO")); + assert_se(streq(e[1], "X=")); + assert_se(streq(e[2], "F=F")); + assert_se(streq(e[3], "abcd=äöüß")); + assert_se(streq(e[4], "xyz=xyz\n")); + assert_se(streq(e[5], "another=final one")); + assert_se(streq(e[6], "CRLF=\r\n")); + assert_se(streq(e[7], "LESS_TERMCAP_mb=\x1b[01;31m")); + assert_se(e[8] == NULL); +} + +TEST(env_name_is_valid) { + assert_se(env_name_is_valid("test")); + + assert_se(!env_name_is_valid(NULL)); + assert_se(!env_name_is_valid("")); + assert_se(!env_name_is_valid("xxx\a")); + assert_se(!env_name_is_valid("xxx\007b")); + assert_se(!env_name_is_valid("\007\009")); + assert_se(!env_name_is_valid("5_starting_with_a_number_is_wrong")); + assert_se(!env_name_is_valid("#¤%&?_only_numbers_letters_and_underscore_allowed")); +} + +TEST(env_value_is_valid) { + assert_se(env_value_is_valid("")); + assert_se(env_value_is_valid("głąb kapuściany")); + assert_se(env_value_is_valid("printf \"\\x1b]0;\\x07\"")); + assert_se(env_value_is_valid("tab\tcharacter")); + assert_se(env_value_is_valid("new\nline")); + assert_se(env_value_is_valid("Show this?\rNope. Show that!")); + assert_se(env_value_is_valid("new DOS\r\nline")); + + assert_se(!env_value_is_valid("\xc5")); /* A truncated utf-8-encoded "ł". + * We currently disallow that. 
*/ +} + +TEST(env_assignment_is_valid) { + assert_se(env_assignment_is_valid("a=")); + assert_se(env_assignment_is_valid("b=głąb kapuściany")); + assert_se(env_assignment_is_valid("c=\\007\\009\\011")); + assert_se(env_assignment_is_valid("e=printf \"\\x1b]0;\\x07\"")); + assert_se(env_assignment_is_valid("f=tab\tcharacter")); + assert_se(env_assignment_is_valid("g=new\nline")); + + assert_se(!env_assignment_is_valid("=")); + assert_se(!env_assignment_is_valid("a b=")); + assert_se(!env_assignment_is_valid("a =")); + assert_se(!env_assignment_is_valid(" b=")); + /* no dots or dashes: http://tldp.org/LDP/abs/html/gotchas.html */ + assert_se(!env_assignment_is_valid("a.b=")); + assert_se(!env_assignment_is_valid("a-b=")); + assert_se(!env_assignment_is_valid("\007=głąb kapuściany")); + assert_se(!env_assignment_is_valid("c\009=\007\009\011")); + assert_se(!env_assignment_is_valid("głąb=printf \"\x1b]0;\x07\"")); +} + +TEST(putenv_dup) { + assert_se(putenv_dup("A=a1", true) == 0); + assert_se(streq_ptr(getenv("A"), "a1")); + assert_se(putenv_dup("A=a1", true) == 0); + assert_se(streq_ptr(getenv("A"), "a1")); + assert_se(putenv_dup("A=a2", false) == 0); + assert_se(streq_ptr(getenv("A"), "a1")); + assert_se(putenv_dup("A=a2", true) == 0); + assert_se(streq_ptr(getenv("A"), "a2")); +} + +TEST(setenv_systemd_exec_pid) { + _cleanup_free_ char *saved = NULL; + const char *e; + pid_t p; + + e = getenv("SYSTEMD_EXEC_PID"); + if (e) + assert_se(saved = strdup(e)); + + assert_se(unsetenv("SYSTEMD_EXEC_PID") >= 0); + assert_se(setenv_systemd_exec_pid(true) == 0); + assert_se(!getenv("SYSTEMD_EXEC_PID")); + + assert_se(setenv("SYSTEMD_EXEC_PID", "*", 1) >= 0); + assert_se(setenv_systemd_exec_pid(true) == 0); + assert_se(e = getenv("SYSTEMD_EXEC_PID")); + assert_se(streq(e, "*")); + + assert_se(setenv("SYSTEMD_EXEC_PID", "123abc", 1) >= 0); + assert_se(setenv_systemd_exec_pid(true) == 1); + assert_se(e = getenv("SYSTEMD_EXEC_PID")); + assert_se(parse_pid(e, &p) >= 0); + 
assert_se(p == getpid_cached()); + + assert_se(unsetenv("SYSTEMD_EXEC_PID") >= 0); + assert_se(setenv_systemd_exec_pid(false) == 1); + assert_se(e = getenv("SYSTEMD_EXEC_PID")); + assert_se(parse_pid(e, &p) >= 0); + assert_se(p == getpid_cached()); + + assert_se(set_unset_env("SYSTEMD_EXEC_PID", saved, 1) >= 0); +} + +TEST(getenv_steal_erase) { + int r; + + r = safe_fork("(sd-getenvstealerase)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL); + if (r == 0) { + _cleanup_strv_free_ char **l = NULL; + + /* child */ + + assert_se(getenv_steal_erase("thisenvvardefinitelywontexist", NULL) == 0); + + l = strv_new("FOO=BAR", "QUUX=PIFF", "ONE=TWO", "A=B"); + assert_se(strv_length(l) == 4); + + environ = l; + + STRV_FOREACH(e, environ) { + _cleanup_free_ char *n = NULL, *copy1 = NULL, *copy2 = NULL; + char *eq; + + eq = strchr(*e, '='); + if (!eq) + continue; + + n = strndup(*e, eq - *e); + assert_se(n); + + copy1 = strdup(eq + 1); + assert_se(copy1); + + assert_se(streq_ptr(getenv(n), copy1)); + assert_se(getenv(n) == eq + 1); + assert_se(getenv_steal_erase(n, &copy2) > 0); + assert_se(streq_ptr(copy1, copy2)); + assert_se(isempty(eq + 1)); + assert_se(!getenv(n)); + } + + environ = NULL; + l = strv_free(l); + + _exit(EXIT_SUCCESS); + } + + assert_se(r > 0); +} + +TEST(strv_env_name_is_valid) { + assert_se(strv_env_name_is_valid(STRV_MAKE("HOME", "USER", "SHELL", "PATH"))); + assert_se(!strv_env_name_is_valid(STRV_MAKE("", "PATH", "home", "user", "SHELL"))); + assert_se(!strv_env_name_is_valid(STRV_MAKE("HOME", "USER", "SHELL", "USER"))); +} + +TEST(getenv_path_list) { + _cleanup_strv_free_ char **path_list = NULL; + + /* Empty paths */ + FOREACH_STRING(s, "", ":", ":::::", " : ::: :: :") { + assert_se(setenv("TEST_GETENV_PATH_LIST", s, 1) >= 0); + assert_se(getenv_path_list("TEST_GETENV_PATH_LIST", &path_list) == -EINVAL); + assert_se(!path_list); + } + + /* Invalid paths */ + FOREACH_STRING(s, ".", "..", "/../", "/", "/foo/bar/baz/../foo", "foo/bar/baz") { + 
assert_se(setenv("TEST_GETENV_PATH_LIST", s, 1) >= 0); + assert_se(getenv_path_list("TEST_GETENV_PATH_LIST", &path_list) == -EINVAL); + assert_se(!path_list); + } + + /* Valid paths mixed with invalid ones */ + assert_se(setenv("TEST_GETENV_PATH_LIST", "/foo:/bar/baz:/../:/hello", 1) >= 0); + assert_se(getenv_path_list("TEST_GETENV_PATH_LIST", &path_list) == -EINVAL); + assert_se(!path_list); + + /* Finally some valid paths */ + assert_se(setenv("TEST_GETENV_PATH_LIST", "/foo:/bar/baz:/hello/world:/path with spaces:/final", 1) >= 0); + assert_se(getenv_path_list("TEST_GETENV_PATH_LIST", &path_list) >= 0); + assert_se(streq(path_list[0], "/foo")); + assert_se(streq(path_list[1], "/bar/baz")); + assert_se(streq(path_list[2], "/hello/world")); + assert_se(streq(path_list[3], "/path with spaces")); + assert_se(streq(path_list[4], "/final")); + assert_se(path_list[5] == NULL); + + assert_se(unsetenv("TEST_GETENV_PATH_LIST") >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-errno-list.c b/src/test/test-errno-list.c new file mode 100644 index 0000000..f91a1f7 --- /dev/null +++ b/src/test/test-errno-list.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> + +#include "errno-list.h" +#include "errno-to-name.h" +#include "macro.h" +#include "string-util.h" +#include "tests.h" + +TEST(errno_list) { + for (size_t i = 0; i < ELEMENTSOF(errno_names); i++) { + if (errno_names[i]) { + assert_se(streq(errno_to_name(i), errno_names[i])); + assert_se(errno_from_name(errno_names[i]) == (int) i); + } + } + +#ifdef ECANCELLED + /* ECANCELLED is an alias of ECANCELED. */ + assert_se(streq(errno_to_name(ECANCELLED), "ECANCELED")); +#endif + assert_se(streq(errno_to_name(ECANCELED), "ECANCELED")); + +#ifdef EREFUSED + /* EREFUSED is an alias of ECONNREFUSED. 
*/ + assert_se(streq(errno_to_name(EREFUSED), "ECONNREFUSED")); +#endif + assert_se(streq(errno_to_name(ECONNREFUSED), "ECONNREFUSED")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-errno-util.c b/src/test/test-errno-util.c new file mode 100644 index 0000000..376d532 --- /dev/null +++ b/src/test/test-errno-util.c @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "errno-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(strerror_not_threadsafe) { + /* Just check that strerror really is not thread-safe. */ + log_info("strerror(%d) → %s", 200, strerror(200)); + log_info("strerror(%d) → %s", 201, strerror(201)); + log_info("strerror(%d) → %s", INT_MAX, strerror(INT_MAX)); + + log_info("strerror(%d), strerror(%d) → %p, %p", 200, 201, strerror(200), strerror(201)); + + /* This call is not allowed, because the first returned string becomes invalid when + * we call strerror the second time: + * + * log_info("strerror(%d), strerror(%d) → %s, %s", 200, 201, strerror(200), strerror(201)); + */ +} + +TEST(STRERROR) { + /* Just check that STRERROR really is thread-safe. 
*/ + log_info("STRERROR(%d) → %s", 200, STRERROR(200)); + log_info("STRERROR(%d) → %s", 201, STRERROR(201)); + log_info("STRERROR(%d), STRERROR(%d) → %s, %s", 200, 201, STRERROR(200), STRERROR(201)); + + const char *a = STRERROR(200), *b = STRERROR(201); + assert_se(strstr(a, "200")); + assert_se(strstr(b, "201")); + + /* Check with negative values */ + assert_se(streq(a, STRERROR(-200))); + assert_se(streq(b, STRERROR(-201))); + + const char *c = STRERROR(INT_MAX); + char buf[DECIMAL_STR_MAX(int)]; + xsprintf(buf, "%d", INT_MAX); /* INT_MAX is hexadecimal, use printf to convert to decimal */ + log_info("STRERROR(%d) → %s", INT_MAX, c); + assert_se(strstr(c, buf)); +} + +TEST(STRERROR_OR_ELSE) { + log_info("STRERROR_OR_ELSE(0, \"EOF\") → %s", STRERROR_OR_EOF(0)); + log_info("STRERROR_OR_ELSE(EPERM, \"EOF\") → %s", STRERROR_OR_EOF(EPERM)); + log_info("STRERROR_OR_ELSE(-EPERM, \"EOF\") → %s", STRERROR_OR_EOF(-EPERM)); +} + +TEST(PROTECT_ERRNO) { + errno = 12; + { + PROTECT_ERRNO; + errno = 11; + } + assert_se(errno == 12); +} + +static void test_unprotect_errno_inner_function(void) { + PROTECT_ERRNO; + + errno = 2222; +} + +TEST(UNPROTECT_ERRNO) { + errno = 4711; + + PROTECT_ERRNO; + + errno = 815; + + UNPROTECT_ERRNO; + + assert_se(errno == 4711); + + test_unprotect_errno_inner_function(); + + assert_se(errno == 4711); +} + +TEST(RET_GATHER) { + int x = 0, y = 2; + + assert_se(RET_GATHER(x, 5) == 0); + assert_se(RET_GATHER(x, -5) == -5); + assert_se(RET_GATHER(x, -1) == -5); + + assert_se(RET_GATHER(x, y++) == -5); + assert_se(y == 3); +} + +TEST(ERRNO_IS_TRANSIENT) { + assert_se( ERRNO_IS_NEG_TRANSIENT(-EINTR)); + assert_se(!ERRNO_IS_NEG_TRANSIENT(EINTR)); + assert_se( ERRNO_IS_TRANSIENT(-EINTR)); + assert_se( ERRNO_IS_TRANSIENT(EINTR)); + + /* Test with type wider than int */ + ssize_t r = -EAGAIN; + assert_se( ERRNO_IS_NEG_TRANSIENT(r)); + + /* On 64-bit arches, now (int) r == EAGAIN */ + r = SSIZE_MAX - EAGAIN + 1; + assert_se(!ERRNO_IS_NEG_TRANSIENT(r)); + + 
assert_se(!ERRNO_IS_NEG_TRANSIENT(INT_MAX)); + assert_se(!ERRNO_IS_NEG_TRANSIENT(INT_MIN)); + assert_se(!ERRNO_IS_NEG_TRANSIENT(INTMAX_MAX)); + assert_se(!ERRNO_IS_NEG_TRANSIENT(INTMAX_MIN)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-escape.c b/src/test/test-escape.c new file mode 100644 index 0000000..21786ae --- /dev/null +++ b/src/test/test-escape.c @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "escape.h" +#include "macro.h" +#include "tests.h" + +TEST(cescape) { + _cleanup_free_ char *t = NULL; + + assert_se(t = cescape("abc\\\"\b\f\n\r\t\v\a\003\177\234\313")); + assert_se(streq(t, "abc\\\\\\\"\\b\\f\\n\\r\\t\\v\\a\\003\\177\\234\\313")); +} + +TEST(xescape) { + _cleanup_free_ char *t = NULL; + + assert_se(t = xescape("abc\\\"\b\f\n\r\t\v\a\003\177\234\313", "")); + assert_se(streq(t, "abc\\x5c\"\\x08\\x0c\\x0a\\x0d\\x09\\x0b\\x07\\x03\\x7f\\x9c\\xcb")); +} + +static void test_xescape_full_one(bool eight_bits) { + const char* escaped = !eight_bits ? + "a\\x62c\\x5c\"\\x08\\x0c\\x0a\\x0d\\x09\\x0b\\x07\\x03\\x7f\\x9c\\xcb" : + "a\\x62c\\x5c\"\\x08\\x0c\\x0a\\x0d\\x09\\x0b\\x07\\x03\177\234\313"; + const unsigned full_fit = !eight_bits ? 
55 : 46; + XEscapeFlags flags = eight_bits * XESCAPE_8_BIT; + + log_info("/* %s */", __func__); + + for (unsigned i = 0; i < 60; i++) { + _cleanup_free_ char *t = NULL, *q = NULL; + + assert_se(t = xescape_full("abc\\\"\b\f\n\r\t\v\a\003\177\234\313", "b", i, flags)); + + log_info("%02u: <%s>", i, t); + + if (i >= full_fit) + assert_se(streq(t, escaped)); + else if (i >= 3) { + /* We need up to four columns, so up to three columns may be wasted */ + assert_se(strlen(t) == i || strlen(t) == i - 1 || strlen(t) == i - 2 || strlen(t) == i - 3); + assert_se(strneq(t, escaped, i - 3) || strneq(t, escaped, i - 4) || + strneq(t, escaped, i - 5) || strneq(t, escaped, i - 6)); + assert_se(endswith(t, "...")); + } else { + assert_se(strlen(t) == i); + assert_se(strneq(t, "...", i)); + } + + assert_se(q = xescape_full("abc\\\"\b\f\n\r\t\v\a\003\177\234\313", "b", i, + flags | XESCAPE_FORCE_ELLIPSIS)); + + log_info("%02u: <%s>", i, q); + if (i > 0) + assert_se(endswith(q, ".")); + assert_se(strlen(q) <= i); + assert_se(strlen(q) + 3 >= strlen(t)); + } +} + +TEST(xescape_full) { + test_xescape_full_one(false); + test_xescape_full_one(true); +} + +TEST(cunescape) { + _cleanup_free_ char *unescaped = NULL; + + assert_se(cunescape("abc\\\\\\\"\\b\\f\\a\\n\\r\\t\\v\\003\\177\\234\\313\\000\\x00", 0, &unescaped) < 0); + assert_se(cunescape("abc\\\\\\\"\\b\\f\\a\\n\\r\\t\\v\\003\\177\\234\\313\\000\\x00", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "abc\\\"\b\f\a\n\r\t\v\003\177\234\313\\000\\x00")); + unescaped = mfree(unescaped); + + /* incomplete sequences */ + assert_se(cunescape("\\x0", 0, &unescaped) < 0); + assert_se(cunescape("\\x0", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "\\x0")); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\x", 0, &unescaped) < 0); + assert_se(cunescape("\\x", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "\\x")); + unescaped = mfree(unescaped); + + 
assert_se(cunescape("\\", 0, &unescaped) < 0); + assert_se(cunescape("\\", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "\\")); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\11", 0, &unescaped) < 0); + assert_se(cunescape("\\11", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "\\11")); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\1", 0, &unescaped) < 0); + assert_se(cunescape("\\1", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "\\1")); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\u0000", 0, &unescaped) < 0); + assert_se(cunescape("\\u00DF\\U000000df\\u03a0\\U00000041", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "ßßΠA")); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\073", 0, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, ";")); + unescaped = mfree(unescaped); + + assert_se(cunescape("A=A\\\\x0aB", 0, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "A=A\\x0aB")); + unescaped = mfree(unescaped); + + assert_se(cunescape("A=A\\\\x0aB", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "A=A\\x0aB")); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\x00\\x00\\x00", UNESCAPE_ACCEPT_NUL, &unescaped) == 3); + assert_se(memcmp(unescaped, "\0\0\0", 3) == 0); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\u0000\\u0000\\u0000", UNESCAPE_ACCEPT_NUL, &unescaped) == 3); + assert_se(memcmp(unescaped, "\0\0\0", 3) == 0); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\U00000000\\U00000000\\U00000000", UNESCAPE_ACCEPT_NUL, &unescaped) == 3); + assert_se(memcmp(unescaped, "\0\0\0", 3) == 0); + unescaped = mfree(unescaped); + + assert_se(cunescape("\\000\\000\\000", UNESCAPE_ACCEPT_NUL, &unescaped) == 3); + assert_se(memcmp(unescaped, "\0\0\0", 3) == 0); +} + +static void test_shell_escape_one(const char *s, const char *bad, const char *expected) { + _cleanup_free_ char *r = NULL; + 
+ assert_se(r = shell_escape(s, bad)); + log_debug("%s → %s (expected %s)", s, r, expected); + assert_se(streq_ptr(r, expected)); +} + +TEST(shell_escape) { + test_shell_escape_one("", "", ""); + test_shell_escape_one("\\", "", "\\\\"); + test_shell_escape_one("foobar", "", "foobar"); + test_shell_escape_one("foobar", "o", "f\\o\\obar"); + test_shell_escape_one("foo:bar,baz", ",:", "foo\\:bar\\,baz"); + test_shell_escape_one("foo\nbar\nbaz", ",:", "foo\\nbar\\nbaz"); +} + +static void test_shell_maybe_quote_one(const char *s, ShellEscapeFlags flags, const char *expected) { + _cleanup_free_ char *ret = NULL; + + assert_se(ret = shell_maybe_quote(s, flags)); + log_debug("[%s] → [%s] (%s)", s, ret, expected); + assert_se(streq(ret, expected)); +} + +TEST(shell_maybe_quote) { + test_shell_maybe_quote_one("", 0, ""); + test_shell_maybe_quote_one("", SHELL_ESCAPE_EMPTY, "\"\""); + test_shell_maybe_quote_one("", SHELL_ESCAPE_POSIX, ""); + test_shell_maybe_quote_one("", SHELL_ESCAPE_POSIX | SHELL_ESCAPE_EMPTY, "\"\""); + test_shell_maybe_quote_one("\\", 0, "\"\\\\\""); + test_shell_maybe_quote_one("\\", SHELL_ESCAPE_POSIX, "$'\\\\'"); + test_shell_maybe_quote_one("\"", 0, "\"\\\"\""); + test_shell_maybe_quote_one("\"", SHELL_ESCAPE_POSIX, "$'\"'"); + test_shell_maybe_quote_one("foobar", 0, "foobar"); + test_shell_maybe_quote_one("foobar", SHELL_ESCAPE_POSIX, "foobar"); + test_shell_maybe_quote_one("foo bar", 0, "\"foo bar\""); + test_shell_maybe_quote_one("foo bar", SHELL_ESCAPE_POSIX, "$'foo bar'"); + test_shell_maybe_quote_one("foo\tbar", 0, "\"foo\\tbar\""); + test_shell_maybe_quote_one("foo\tbar", SHELL_ESCAPE_POSIX, "$'foo\\tbar'"); + test_shell_maybe_quote_one("foo\nbar", 0, "\"foo\\nbar\""); + test_shell_maybe_quote_one("foo\nbar", SHELL_ESCAPE_POSIX, "$'foo\\nbar'"); + test_shell_maybe_quote_one("foo \"bar\" waldo", 0, "\"foo \\\"bar\\\" waldo\""); + test_shell_maybe_quote_one("foo \"bar\" waldo", SHELL_ESCAPE_POSIX, "$'foo \"bar\" waldo'"); + 
test_shell_maybe_quote_one("foo$bar", 0, "\"foo\\$bar\""); + test_shell_maybe_quote_one("foo$bar", SHELL_ESCAPE_EMPTY, "\"foo\\$bar\""); + test_shell_maybe_quote_one("foo$bar", SHELL_ESCAPE_POSIX, "$'foo$bar'"); + test_shell_maybe_quote_one("foo$bar", SHELL_ESCAPE_POSIX | SHELL_ESCAPE_EMPTY, "$'foo$bar'"); + + /* Exclamation mark is special in the interactive shell, but we don't treat it so. */ + test_shell_maybe_quote_one("foo!bar", 0, "\"foo!bar\""); + test_shell_maybe_quote_one("foo!bar", SHELL_ESCAPE_POSIX, "$'foo!bar'"); + + /* Control characters and unicode */ + test_shell_maybe_quote_one("a\nb\001", 0, "\"a\\nb\\001\""); + test_shell_maybe_quote_one("a\nb\001", SHELL_ESCAPE_POSIX, "$'a\\nb\\001'"); + + test_shell_maybe_quote_one("głąb", 0, "głąb"); + test_shell_maybe_quote_one("głąb", SHELL_ESCAPE_POSIX, "głąb"); + + test_shell_maybe_quote_one("głąb\002\003", 0, "\"głąb\\002\\003\""); + test_shell_maybe_quote_one("głąb\002\003", SHELL_ESCAPE_POSIX, "$'głąb\\002\\003'"); + + test_shell_maybe_quote_one("głąb\002\003rząd", 0, "\"głąb\\002\\003rząd\""); + test_shell_maybe_quote_one("głąb\002\003rząd", SHELL_ESCAPE_POSIX, "$'głąb\\002\\003rząd'"); + + /* Bogus UTF-8 strings */ + test_shell_maybe_quote_one("\250\350", 0, "\"\\250\\350\""); + test_shell_maybe_quote_one("\250\350", SHELL_ESCAPE_POSIX, "$'\\250\\350'"); +} + +static void test_quote_command_line_one(char **argv, const char *expected) { + _cleanup_free_ char *s = NULL; + + assert_se(s = quote_command_line(argv, SHELL_ESCAPE_EMPTY)); + log_info("%s", s); + assert_se(streq(s, expected)); +} + +TEST(quote_command_line) { + test_quote_command_line_one(STRV_MAKE("true", "true"), + "true true"); + test_quote_command_line_one(STRV_MAKE("true", "with a space"), + "true \"with a space\""); + test_quote_command_line_one(STRV_MAKE("true", "with a 'quote'"), + "true \"with a 'quote'\""); + test_quote_command_line_one(STRV_MAKE("true", "with a \"quote\""), + "true \"with a \\\"quote\\\"\""); + 
test_quote_command_line_one(STRV_MAKE("true", "$dollar"), + "true \"\\$dollar\""); +} + +static void test_octescape_one(const char *s, const char *expected) { + _cleanup_free_ char *ret = NULL; + + assert_se(ret = octescape(s, strlen_ptr(s))); + log_debug("octescape(\"%s\") → \"%s\" (expected: \"%s\")", strnull(s), ret, expected); + assert_se(streq(ret, expected)); +} + +TEST(octescape) { + test_octescape_one(NULL, ""); + test_octescape_one("", ""); + test_octescape_one("foo", "foo"); + test_octescape_one("\"\\\"", "\\042\\134\\042"); + test_octescape_one("\123\213\222", "\123\\213\\222"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-ether-addr-util.c b/src/test/test-ether-addr-util.c new file mode 100644 index 0000000..d680f80 --- /dev/null +++ b/src/test/test-ether-addr-util.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "ether-addr-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(ether_addr_helpers) { + struct ether_addr a; + + a = ETHER_ADDR_NULL; + assert_se(ether_addr_is_null(&a)); + assert_se(!ether_addr_is_broadcast(&a)); + assert_se(!ether_addr_is_multicast(&a)); + assert_se(ether_addr_is_unicast(&a)); + assert_se(!ether_addr_is_local(&a)); + assert_se(ether_addr_is_global(&a)); + + memset(a.ether_addr_octet, 0xff, sizeof(a)); + assert_se(!ether_addr_is_null(&a)); + assert_se(ether_addr_is_broadcast(&a)); + assert_se(ether_addr_is_multicast(&a)); + assert_se(!ether_addr_is_unicast(&a)); + assert_se(ether_addr_is_local(&a)); + assert_se(!ether_addr_is_global(&a)); + + a = (struct ether_addr) { { 0x01, 0x23, 0x34, 0x56, 0x78, 0x9a } }; + assert_se(!ether_addr_is_null(&a)); + assert_se(!ether_addr_is_broadcast(&a)); + assert_se(ether_addr_is_multicast(&a)); + assert_se(!ether_addr_is_unicast(&a)); + assert_se(!ether_addr_is_local(&a)); + assert_se(ether_addr_is_global(&a)); +} + +#define INFINIBAD_ADDR_1 ((const struct hw_addr_data){ .length = 20, .infiniband = 
{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20} }) + +TEST(HW_ADDR_TO_STRING) { + const char *s = HW_ADDR_TO_STR(&(const struct hw_addr_data){6}); + log_info("null: %s", s); + + log_info("null×2: %s, %s", + HW_ADDR_TO_STR(&(const struct hw_addr_data){6}), + HW_ADDR_TO_STR(&(const struct hw_addr_data){6})); + log_info("null×3: %s, %s, %s", + HW_ADDR_TO_STR(&(const struct hw_addr_data){6}), + s, + HW_ADDR_TO_STR(&(const struct hw_addr_data){6})); + + log_info("infiniband: %s", HW_ADDR_TO_STR(&INFINIBAD_ADDR_1)); + + /* Let's nest function calls in a stupid way. */ + _cleanup_free_ char *t = NULL; + log_info("infiniband×3: %s\n%14s%s\n%14s%s", + HW_ADDR_TO_STR(&(const struct hw_addr_data){20}), "", + t = strdup(HW_ADDR_TO_STR(&INFINIBAD_ADDR_1)), "", + HW_ADDR_TO_STR(&(const struct hw_addr_data){20})); + + const char *p; + /* Let's use a separate selection statement */ + if ((p = HW_ADDR_TO_STR(&(const struct hw_addr_data){6}))) + log_info("joint: %s, %s", s, p); +} + +static void test_parse_hw_addr_full_one(const char *in, size_t expected_len, const char *expected) { + struct hw_addr_data h; + int r; + + r = parse_hw_addr_full(in, expected_len, &h); + log_debug_errno(r, "parse_hw_addr(\"%s\", len=%zu) → \"%s\" (expected: \"%s\") : %d/%m", + in, expected_len, r >= 0 ? 
HW_ADDR_TO_STR(&h) : "n/a", strna(expected), r); + assert_se((r >= 0) == !!expected); + if (r >= 0) { + if (!IN_SET(expected_len, 0, SIZE_MAX)) + assert_se(h.length == expected_len); + assert_se(streq(HW_ADDR_TO_STR(&h), expected)); + } +} + +TEST(parse_hw_addr) { + /* IPv4 */ + test_parse_hw_addr_full_one("10.0.0.1", 0, "0a:00:00:01"); + test_parse_hw_addr_full_one("10.0.0.1", 4, "0a:00:00:01"); + test_parse_hw_addr_full_one("192.168.0.1", 0, "c0:a8:00:01"); + test_parse_hw_addr_full_one("192.168.0.1", 4, "c0:a8:00:01"); + /* IPv6 */ + test_parse_hw_addr_full_one("::", 0, "00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"); + test_parse_hw_addr_full_one("::", 16, "00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"); + test_parse_hw_addr_full_one("::1", 0, "00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:01"); + test_parse_hw_addr_full_one("::1", 16, "00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:01"); + test_parse_hw_addr_full_one("1234::", 0, "12:34:00:00:00:00:00:00:00:00:00:00:00:00:00:00"); + test_parse_hw_addr_full_one("1234::", 16, "12:34:00:00:00:00:00:00:00:00:00:00:00:00:00:00"); + test_parse_hw_addr_full_one("12:34::56", 0, "00:12:00:34:00:00:00:00:00:00:00:00:00:00:00:56"); + test_parse_hw_addr_full_one("12:34::56", 16, "00:12:00:34:00:00:00:00:00:00:00:00:00:00:00:56"); + test_parse_hw_addr_full_one("12aa:34::56", 0, "12:aa:00:34:00:00:00:00:00:00:00:00:00:00:00:56"); + test_parse_hw_addr_full_one("12aa:34::56", 16, "12:aa:00:34:00:00:00:00:00:00:00:00:00:00:00:56"); + test_parse_hw_addr_full_one("1234:5678:90ab:cdef:1234:5678:90ab:cdef", 0, "12:34:56:78:90:ab:cd:ef:12:34:56:78:90:ab:cd:ef"); + test_parse_hw_addr_full_one("1234:5678:90ab:cdef:1234:5678:90ab:cdef", 16, "12:34:56:78:90:ab:cd:ef:12:34:56:78:90:ab:cd:ef"); + /* Dot */ + test_parse_hw_addr_full_one("12.34", 0, "00:12:00:34"); + test_parse_hw_addr_full_one("12.34", 4, "00:12:00:34"); + test_parse_hw_addr_full_one("12.34", SIZE_MAX, "00:12:00:34"); + test_parse_hw_addr_full_one("12.34.56", 0, 
"00:12:00:34:00:56"); + test_parse_hw_addr_full_one("12.34.56", 6, "00:12:00:34:00:56"); + test_parse_hw_addr_full_one("12.34.56", SIZE_MAX, "00:12:00:34:00:56"); + test_parse_hw_addr_full_one("12.34.56.78", 0, "0c:22:38:4e"); /* IPv4 address */ + test_parse_hw_addr_full_one("12.34.56.78", 4, "0c:22:38:4e"); /* IPv4 address */ + test_parse_hw_addr_full_one("12.34.56.78", 8, "00:12:00:34:00:56:00:78"); + test_parse_hw_addr_full_one("12.34.56.78", SIZE_MAX, "00:12:00:34:00:56:00:78"); + test_parse_hw_addr_full_one("12.34.56.78.90", 0, NULL); + test_parse_hw_addr_full_one("12.34.56.78.90", 10, "00:12:00:34:00:56:00:78:00:90"); + test_parse_hw_addr_full_one("12.34.56.78.90", SIZE_MAX, "00:12:00:34:00:56:00:78:00:90"); + test_parse_hw_addr_full_one("aabb.ccdd", 0, "aa:bb:cc:dd"); + test_parse_hw_addr_full_one("aabb.ccdd", 4, "aa:bb:cc:dd"); + test_parse_hw_addr_full_one("aabb.ccdd", SIZE_MAX, "aa:bb:cc:dd"); + test_parse_hw_addr_full_one("aabb.ccdd.eeff", 0, "aa:bb:cc:dd:ee:ff"); + test_parse_hw_addr_full_one("aabb.ccdd.eeff", 6, "aa:bb:cc:dd:ee:ff"); + test_parse_hw_addr_full_one("aabb.ccdd.eeff", SIZE_MAX, "aa:bb:cc:dd:ee:ff"); + /* Colon */ + test_parse_hw_addr_full_one("12:34", 0, NULL); + test_parse_hw_addr_full_one("12:34", 2, "12:34"); + test_parse_hw_addr_full_one("12:34", SIZE_MAX, "12:34"); + test_parse_hw_addr_full_one("12:34:56:78:90:ab", 0, "12:34:56:78:90:ab"); + test_parse_hw_addr_full_one("12:34:56:78:90:ab", 6, "12:34:56:78:90:ab"); + test_parse_hw_addr_full_one("12:34:56:78:90:ab", SIZE_MAX, "12:34:56:78:90:ab"); + test_parse_hw_addr_full_one("12:34:56:78:90:ab:cd:ef", 0, "00:12:00:34:00:56:00:78:00:90:00:ab:00:cd:00:ef"); /* IPv6 */ + test_parse_hw_addr_full_one("12:34:56:78:90:ab:cd:ef", 8, "12:34:56:78:90:ab:cd:ef"); + test_parse_hw_addr_full_one("12:34:56:78:90:ab:cd:ef", 16, "00:12:00:34:00:56:00:78:00:90:00:ab:00:cd:00:ef"); /* IPv6 */ + test_parse_hw_addr_full_one("12:34:56:78:90:ab:cd:ef", SIZE_MAX, "12:34:56:78:90:ab:cd:ef"); + 
test_parse_hw_addr_full_one("12:34:56:78:90:AB:CD:EF", 0, "00:12:00:34:00:56:00:78:00:90:00:ab:00:cd:00:ef"); /* IPv6 */ + test_parse_hw_addr_full_one("12:34:56:78:90:AB:CD:EF", 8, "12:34:56:78:90:ab:cd:ef"); + test_parse_hw_addr_full_one("12:34:56:78:90:AB:CD:EF", 16, "00:12:00:34:00:56:00:78:00:90:00:ab:00:cd:00:ef"); /* IPv6 */ + test_parse_hw_addr_full_one("12:34:56:78:90:AB:CD:EF", SIZE_MAX, "12:34:56:78:90:ab:cd:ef"); + /* Hyphen */ + test_parse_hw_addr_full_one("12-34", 0, NULL); + test_parse_hw_addr_full_one("12-34", 2, "12:34"); + test_parse_hw_addr_full_one("12-34", SIZE_MAX, "12:34"); + test_parse_hw_addr_full_one("12-34-56-78-90-ab-cd-ef", 0, NULL); + test_parse_hw_addr_full_one("12-34-56-78-90-ab-cd-ef", 8, "12:34:56:78:90:ab:cd:ef"); + test_parse_hw_addr_full_one("12-34-56-78-90-ab-cd-ef", SIZE_MAX, "12:34:56:78:90:ab:cd:ef"); + test_parse_hw_addr_full_one("12-34-56-78-90-AB-CD-EF", 0, NULL); + test_parse_hw_addr_full_one("12-34-56-78-90-AB-CD-EF", 8, "12:34:56:78:90:ab:cd:ef"); + test_parse_hw_addr_full_one("12-34-56-78-90-AB-CD-EF", SIZE_MAX, "12:34:56:78:90:ab:cd:ef"); + + /* Invalid */ + test_parse_hw_addr_full_one("", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("12", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("12.", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("12.34.", SIZE_MAX, NULL); + test_parse_hw_addr_full_one(".12", SIZE_MAX, NULL); + test_parse_hw_addr_full_one(".12.34", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("12.34:56", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("1234:56", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("1234:56", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("12:34:", SIZE_MAX, NULL); + test_parse_hw_addr_full_one(":12:34", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("::1", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("aa:bb-cc", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("aa:xx", SIZE_MAX, NULL); + test_parse_hw_addr_full_one("aa bb", SIZE_MAX, NULL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff 
--git a/src/test/test-exec-util.c b/src/test/test-exec-util.c new file mode 100644 index 0000000..2304f6a --- /dev/null +++ b/src/test/test-exec-util.c @@ -0,0 +1,456 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "copy.h" +#include "constants.h" +#include "env-util.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "log.h" +#include "macro.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static int here = 0, here2 = 0, here3 = 0; +static void *ignore_stdout_args[] = { &here, &here2, &here3 }; + +/* noop handlers, just check that arguments are passed correctly */ +static int ignore_stdout_func(int fd, void *arg) { + assert_se(fd >= 0); + assert_se(arg == &here); + safe_close(fd); + + return 0; +} +static int ignore_stdout_func2(int fd, void *arg) { + assert_se(fd >= 0); + assert_se(arg == &here2); + safe_close(fd); + + return 0; +} +static int ignore_stdout_func3(int fd, void *arg) { + assert_se(fd >= 0); + assert_se(arg == &here3); + safe_close(fd); + + return 0; +} + +static const gather_stdout_callback_t ignore_stdout[] = { + ignore_stdout_func, + ignore_stdout_func2, + ignore_stdout_func3, +}; + +static void test_execute_directory_one(bool gather_stdout) { + _cleanup_(rm_rf_physical_and_freep) char *tmp_lo = NULL, *tmp_hi = NULL; + const char *name, *name2, *name3, + *overridden, *override, + *masked, *mask, + *masked2, *mask2, /* the mask is non-executable */ + *masked2e, *mask2e; /* the mask is executable */ + + log_info("/* %s (%s) */", __func__, gather_stdout ? 
"gathering stdout" : "asynchronous"); + + assert_se(mkdtemp_malloc("/tmp/test-exec-util.lo.XXXXXXX", &tmp_lo) >= 0); + assert_se(mkdtemp_malloc("/tmp/test-exec-util.hi.XXXXXXX", &tmp_hi) >= 0); + + const char * dirs[] = { tmp_hi, tmp_lo, NULL }; + + name = strjoina(tmp_lo, "/script"); + name2 = strjoina(tmp_hi, "/script2"); + name3 = strjoina(tmp_lo, "/useless"); + overridden = strjoina(tmp_lo, "/overridden"); + override = strjoina(tmp_hi, "/overridden"); + masked = strjoina(tmp_lo, "/masked"); + mask = strjoina(tmp_hi, "/masked"); + masked2 = strjoina(tmp_lo, "/masked2"); + mask2 = strjoina(tmp_hi, "/masked2"); + masked2e = strjoina(tmp_lo, "/masked2e"); + mask2e = strjoina(tmp_hi, "/masked2e"); + + assert_se(write_string_file(name, + "#!/bin/sh\necho 'Executing '$0\ntouch $(dirname $0)/it_works", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(name2, + "#!/bin/sh\necho 'Executing '$0\ntouch $(dirname $0)/it_works2", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(overridden, + "#!/bin/sh\necho 'Executing '$0\ntouch $(dirname $0)/failed", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(override, + "#!/bin/sh\necho 'Executing '$0", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(masked, + "#!/bin/sh\necho 'Executing '$0\ntouch $(dirname $0)/failed", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(masked2, + "#!/bin/sh\necho 'Executing '$0\ntouch $(dirname $0)/failed", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(write_string_file(masked2e, + "#!/bin/sh\necho 'Executing '$0\ntouch $(dirname $0)/failed", + WRITE_STRING_FILE_CREATE) == 0); + assert_se(symlink("/dev/null", mask) == 0); + assert_se(touch(mask2) == 0); + assert_se(touch(mask2e) == 0); + assert_se(touch(name3) >= 0); + + assert_se(chmod(name, 0755) == 0); + assert_se(chmod(name2, 0755) == 0); + assert_se(chmod(overridden, 0755) == 0); + assert_se(chmod(override, 0755) == 0); + assert_se(chmod(masked, 0755) == 0); + 
assert_se(chmod(masked2, 0755) == 0); + assert_se(chmod(masked2e, 0755) == 0); + assert_se(chmod(mask2e, 0755) == 0); + + if (access(name, X_OK) < 0 && ERRNO_IS_PRIVILEGE(errno)) + return; + + if (gather_stdout) + execute_directories(dirs, DEFAULT_TIMEOUT_USEC, ignore_stdout, ignore_stdout_args, NULL, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + else + execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, NULL, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + + assert_se(chdir(tmp_lo) == 0); + assert_se(access("it_works", F_OK) >= 0); + assert_se(access("failed", F_OK) < 0); + + assert_se(chdir(tmp_hi) == 0); + assert_se(access("it_works2", F_OK) >= 0); + assert_se(access("failed", F_OK) < 0); +} + +TEST(execute_directory) { + test_execute_directory_one(true); + test_execute_directory_one(false); +} + +TEST(execution_order) { + _cleanup_(rm_rf_physical_and_freep) char *tmp_lo = NULL, *tmp_hi = NULL; + const char *name, *name2, *name3, *overridden, *override, *masked, *mask; + const char *output, *t; + _cleanup_free_ char *contents = NULL; + + assert_se(mkdtemp_malloc("/tmp/test-exec-util-lo.XXXXXXX", &tmp_lo) >= 0); + assert_se(mkdtemp_malloc("/tmp/test-exec-util-hi.XXXXXXX", &tmp_hi) >= 0); + + const char *dirs[] = { tmp_hi, tmp_lo, NULL }; + + output = strjoina(tmp_hi, "/output"); + + log_info("/* %s >>%s */", __func__, output); + + /* write files in "random" order */ + name2 = strjoina(tmp_lo, "/90-bar"); + name = strjoina(tmp_hi, "/80-foo"); + name3 = strjoina(tmp_lo, "/last"); + overridden = strjoina(tmp_lo, "/30-override"); + override = strjoina(tmp_hi, "/30-override"); + masked = strjoina(tmp_lo, "/10-masked"); + mask = strjoina(tmp_hi, "/10-masked"); + + t = strjoina("#!/bin/sh\necho $(basename $0) >>", output); + assert_se(write_string_file(name, t, WRITE_STRING_FILE_CREATE) == 0); + + t = strjoina("#!/bin/sh\necho $(basename $0) >>", output); + assert_se(write_string_file(name2, t, WRITE_STRING_FILE_CREATE) == 0); + + t = 
strjoina("#!/bin/sh\necho $(basename $0) >>", output); + assert_se(write_string_file(name3, t, WRITE_STRING_FILE_CREATE) == 0); + + t = strjoina("#!/bin/sh\necho OVERRIDDEN >>", output); + assert_se(write_string_file(overridden, t, WRITE_STRING_FILE_CREATE) == 0); + + t = strjoina("#!/bin/sh\necho $(basename $0) >>", output); + assert_se(write_string_file(override, t, WRITE_STRING_FILE_CREATE) == 0); + + t = strjoina("#!/bin/sh\necho MASKED >>", output); + assert_se(write_string_file(masked, t, WRITE_STRING_FILE_CREATE) == 0); + + assert_se(symlink("/dev/null", mask) == 0); + + assert_se(chmod(name, 0755) == 0); + assert_se(chmod(name2, 0755) == 0); + assert_se(chmod(name3, 0755) == 0); + assert_se(chmod(overridden, 0755) == 0); + assert_se(chmod(override, 0755) == 0); + assert_se(chmod(masked, 0755) == 0); + + if (access(name, X_OK) < 0 && ERRNO_IS_PRIVILEGE(errno)) + return; + + execute_directories(dirs, DEFAULT_TIMEOUT_USEC, ignore_stdout, ignore_stdout_args, NULL, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS); + + assert_se(read_full_file(output, &contents, NULL) >= 0); + assert_se(streq(contents, "30-override\n80-foo\n90-bar\nlast\n")); +} + +static int gather_stdout_one(int fd, void *arg) { + char ***s = arg, *t; + char buf[128] = {}; + + assert_se(s); + assert_se(read(fd, buf, sizeof buf) >= 0); + safe_close(fd); + + assert_se(t = strndup(buf, sizeof buf)); + assert_se(strv_push(s, t) >= 0); + + return 0; +} +static int gather_stdout_two(int fd, void *arg) { + char ***s = arg; + + STRV_FOREACH(t, *s) + assert_se(write(fd, *t, strlen(*t)) == (ssize_t) strlen(*t)); + safe_close(fd); + + return 0; +} +static int gather_stdout_three(int fd, void *arg) { + char **s = arg; + char buf[128] = {}; + + assert_se(read(fd, buf, sizeof buf - 1) > 0); + safe_close(fd); + assert_se(*s = strndup(buf, sizeof buf)); + + return 0; +} + +const gather_stdout_callback_t gather_stdouts[] = { + gather_stdout_one, + gather_stdout_two, + gather_stdout_three, +}; + 
+/* Runs three scripts from a temporary directory through execute_directories() with the gather_stdouts
+ * callbacks and checks that the collected output is exactly the concatenated stdout of the scripts. */
+TEST(stdout_gathering) {
+        _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL;
+        const char *name, *name2, *name3;
+        int r;
+
+        char **tmp = NULL; /* this is only used in the forked process, no cleanup here */
+        _cleanup_free_ char *output = NULL;
+
+        void* args[] = {&tmp, &tmp, &output};
+
+        assert_se(mkdtemp_malloc("/tmp/test-exec-util.XXXXXXX", &tmpdir) >= 0);
+
+        const char *dirs[] = { tmpdir, NULL };
+
+        /* write files */
+        name = strjoina(tmpdir, "/10-foo");
+        name2 = strjoina(tmpdir, "/20-bar");
+        name3 = strjoina(tmpdir, "/30-last");
+
+        assert_se(write_string_file(name,
+                                    "#!/bin/sh\necho a\necho b\necho c\n",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(write_string_file(name2,
+                                    "#!/bin/sh\necho d\n",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(write_string_file(name3,
+                                    "#!/bin/sh\nsleep 1",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+
+        assert_se(chmod(name, 0755) == 0);
+        assert_se(chmod(name2, 0755) == 0);
+        assert_se(chmod(name3, 0755) == 0);
+
+        /* Skip the test if we are not permitted to execute the scripts we just created. */
+        if (access(name, X_OK) < 0 && ERRNO_IS_PRIVILEGE(errno))
+                return;
+
+        r = execute_directories(dirs, DEFAULT_TIMEOUT_USEC, gather_stdouts, args, NULL, NULL,
+                                EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS);
+        assert_se(r >= 0);
+
+        log_info("got: %s", output);
+
+        assert_se(streq(output, "a\nb\nc\nd\n"));
+}
+
+/* Runs three scripts that print VAR=VALUE assignments (plus a number of invalid lines) through
+ * execute_directories() with the gather_environment callbacks and checks the resulting environment,
+ * first with no environment passed in and then with "PATH=" DEFAULT_PATH passed in. */
+TEST(environment_gathering) {
+        _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL;
+        const char *name, *name2, *name3, *old;
+        int r;
+
+        char **tmp = NULL; /* this is only used in the forked process, no cleanup here */
+        _cleanup_strv_free_ char **env = NULL;
+
+        void* const args[] = { &tmp, &tmp, &env };
+
+        assert_se(mkdtemp_malloc("/tmp/test-exec-util.XXXXXXX", &tmpdir) >= 0);
+
+        const char *dirs[] = { tmpdir, NULL };
+
+        /* write files */
+        name = strjoina(tmpdir, "/10-foo");
+        name2 = strjoina(tmpdir, "/20-bar");
+        name3 = strjoina(tmpdir, "/30-last");
+
+        assert_se(write_string_file(name,
+                                    "#!/bin/sh\n"
+                                    "echo A=23\n",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(write_string_file(name2,
+                                    "#!/bin/sh\n"
+                                    "echo A=22:$A\n\n\n", /* substitution from previous generator */
+                                    WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(write_string_file(name3,
+                                    "#!/bin/sh\n"
+                                    "echo A=$A:24\n"
+                                    "echo B=12\n"
+                                    "echo C=000\n"
+                                    "echo C=001\n" /* variable overwriting */
+                                    /* various invalid entries */
+                                    "echo unset A\n"
+                                    "echo unset A=\n"
+                                    "echo unset A=B\n"
+                                    "echo unset \n"
+                                    "echo A B=C\n"
+                                    "echo A\n"
+                                    /* test variable assignment without newline */
+                                    "echo PATH=$PATH:/no/such/file", /* no newline */
+                                    WRITE_STRING_FILE_CREATE) == 0);
+
+        assert_se(chmod(name, 0755) == 0);
+        assert_se(chmod(name2, 0755) == 0);
+        assert_se(chmod(name3, 0755) == 0);
+
+        /* Do the permission check before touching PATH, so that skipping the test cannot leave the
+         * process environment modified for the tests that run after this one. */
+        if (access(name, X_OK) < 0 && ERRNO_IS_PRIVILEGE(errno))
+                return;
+
+        /* When booting in containers or without initrd there might not be any PATH in the environment and if
+         * there is no PATH /bin/sh built-in PATH may leak and override systemd's DEFAULT_PATH which is not
+         * good. Force our own PATH in environment, to prevent expansion of sh built-in $PATH */
+        old = getenv("PATH");
+        r = setenv("PATH", "no-sh-built-in-path", 1);
+        assert_se(r >= 0);
+
+        r = execute_directories(dirs, DEFAULT_TIMEOUT_USEC, gather_environment, args, NULL, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS);
+        assert_se(r >= 0);
+
+        STRV_FOREACH(p, env)
+                log_info("got env: \"%s\"", *p);
+
+        assert_se(streq(strv_env_get(env, "A"), "22:23:24"));
+        assert_se(streq(strv_env_get(env, "B"), "12"));
+        assert_se(streq(strv_env_get(env, "C"), "001"));
+        assert_se(streq(strv_env_get(env, "PATH"), "no-sh-built-in-path:/no/such/file"));
+
+        /* now retest with "default" path passed in, as created by
+         * manager_default_environment */
+        env = strv_free(env);
+        env = strv_new("PATH=" DEFAULT_PATH);
+        assert_se(env);
+
+        r = execute_directories(dirs, DEFAULT_TIMEOUT_USEC, gather_environment, args, NULL, env, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS);
+        assert_se(r >= 0);
+
+        STRV_FOREACH(p, env)
+                log_info("got env: \"%s\"", *p);
+
+        assert_se(streq(strv_env_get(env, "A"), "22:23:24"));
+        assert_se(streq(strv_env_get(env, "B"), "12"));
+        assert_se(streq(strv_env_get(env, "C"), "001"));
+        assert_se(streq(strv_env_get(env, "PATH"), DEFAULT_PATH ":/no/such/file"));
+
+        /* reset environ PATH */
+        assert_se(set_unset_env("PATH", old, true) == 0);
+}
+
+/* Runs three scripts without callbacks and with EXEC_DIR_NONE (i.e. neither parallel nor ignoring
+ * errors), two of which exit non-zero, and checks which exit status execute_directories() returns. */
+TEST(error_catching) {
+        _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL;
+        const char *name, *name2, *name3;
+        int r;
+
+        assert_se(mkdtemp_malloc("/tmp/test-exec-util.XXXXXXX", &tmpdir) >= 0);
+
+        const char *dirs[] = { tmpdir, NULL };
+
+        /* write files */
+        name = strjoina(tmpdir, "/10-foo");
+        name2 = strjoina(tmpdir, "/20-bar");
+        name3 = strjoina(tmpdir, "/30-last");
+
+        assert_se(write_string_file(name,
+                                    "#!/bin/sh\necho a\necho b\necho c\n",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(write_string_file(name2,
+                                    "#!/bin/sh\nexit 42\n",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(write_string_file(name3,
+                                    "#!/bin/sh\nexit 12",
+                                    WRITE_STRING_FILE_CREATE) == 0);
+
+        assert_se(chmod(name, 0755) == 0);
+        assert_se(chmod(name2, 0755) == 0);
+        assert_se(chmod(name3, 0755) == 0);
+
+        if (access(name, X_OK) < 0 && ERRNO_IS_PRIVILEGE(errno))
+                return;
+
+        r = execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, NULL, NULL, EXEC_DIR_NONE);
+
+        /* we should exit with the error code of the first script that failed */
+        assert_se(r == 42);
+}
+
+/* Checks that a list of valid option names is parsed into the matching flag bits only, and that a
+ * list containing an unknown option name is rejected with -EINVAL. */
+TEST(exec_command_flags_from_strv) {
+        ExecCommandFlags flags = 0;
+        char **valid_strv = STRV_MAKE("no-env-expand", "no-setuid", "ignore-failure");
+        char **invalid_strv = STRV_MAKE("no-env-expand", "no-setuid", "nonexistent-option", "ignore-failure");
+        int r;
+
+        r = exec_command_flags_from_strv(valid_strv, &flags);
+
+        assert_se(r == 0);
+        assert_se(FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND));
+        assert_se(FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID));
+        assert_se(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE));
+        assert_se(!FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC));
+        assert_se(!FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED));
+
+        r = exec_command_flags_from_strv(invalid_strv, &flags);
+
+        assert_se(r == -EINVAL);
+}
+
+/* Checks the reverse conversion: a set of flags yields the expected option names in the expected
+ * order, zero flags yield an empty list, and the invalid flags value is rejected with -EINVAL. */
+TEST(exec_command_flags_to_strv) {
+        _cleanup_strv_free_ char **opts = NULL, **empty_opts = NULL, **invalid_opts = NULL;
+        ExecCommandFlags flags = 0;
+        int r;
+
+        flags |= (EXEC_COMMAND_AMBIENT_MAGIC|EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_IGNORE_FAILURE);
+
+        r = exec_command_flags_to_strv(flags, &opts);
+
+        assert_se(r == 0);
+        assert_se(strv_equal(opts, STRV_MAKE("ignore-failure", "ambient", "no-env-expand")));
+
+        r = exec_command_flags_to_strv(0, &empty_opts);
+
+        assert_se(r == 0);
+        assert_se(strv_equal(empty_opts, STRV_MAKE_EMPTY));
+
+        flags = _EXEC_COMMAND_FLAGS_INVALID;
+
+        r = exec_command_flags_to_strv(flags, &invalid_opts);
+
+        assert_se(r == -EINVAL);
+}
+
+DEFINE_TEST_MAIN(LOG_DEBUG);
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
new file mode 100644
index 0000000..4f6ad5d
--- /dev/null
+++ b/src/test/test-execute.c
@@ -0,0 +1,1550 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+#include
+#include
+#include
+
+#include "sd-event.h"
+
+#include "capability-util.h"
+#include "cpu-set-util.h"
+#include "copy.h"
+#include "dropin.h"
+#include "errno-list.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "macro.h"
+#include "manager.h"
+#include "missing_prctl.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "seccomp-util.h"
+#include "service.h"
+#include "signal-util.h"
+#include "static-destruct.h"
+#include "stat-util.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+
+#define PRIVATE_UNIT_DIR "/run/test-execute-unit-dir"
+
+static char *user_runtime_unit_dir = NULL;
+static bool can_unshare;
+static bool have_net_dummy;
+static bool have_netns;
+static unsigned n_ran_tests = 0;
+
+STATIC_DESTRUCTOR_REGISTER(user_runtime_unit_dir, freep);
+
+typedef void (*test_function_t)(Manager *m);
+
+static int cld_dumped_to_killed(int code) {
+        /* Depending on the system, seccomp version, … some signals might result in dumping, others in plain
+         * killing. Let's ignore the difference here, and map both cases to CLD_KILLED */
+        return code == CLD_DUMPED ? CLD_KILLED : code;
+}
+
+/* Prints the unit id and dumps its exec context, then runs the manager's event loop in 100ms slices
+ * until the service reaches SERVICE_DEAD or SERVICE_FAILED. If that takes longer than two minutes the
+ * unit is sent SIGKILL and the whole test binary exits with EXIT_FAILURE. */
+static void wait_for_service_finish(Manager *m, Unit *unit) {
+        Service *service = NULL;
+        usec_t ts;
+        usec_t timeout = 2 * USEC_PER_MINUTE;
+
+        assert_se(m);
+        assert_se(unit);
+
+        service = SERVICE(unit);
+        printf("%s\n", unit->id);
+        exec_context_dump(&service->exec_context, stdout, "\t");
+        ts = now(CLOCK_MONOTONIC);
+        while (!IN_SET(service->state, SERVICE_DEAD, SERVICE_FAILED)) {
+                int r;
+                usec_t n;
+
+                r = sd_event_run(m->event, 100 * USEC_PER_MSEC);
+                assert_se(r >= 0);
+
+                n = now(CLOCK_MONOTONIC);
+                if (ts + timeout < n) {
+                        log_error("Test timeout when testing %s", unit->id);
+                        r = unit_kill(unit, KILL_ALL, SIGKILL, SI_USER, 0, NULL);
+                        if (r < 0)
+                                log_error_errno(r, "Failed to kill %s: %m", unit->id);
+                        exit(EXIT_FAILURE);
+                }
+        }
+}
+
+/* Waits for the service to finish, dumps the exec status of its main process and aborts unless both
+ * the exit code (with CLD_DUMPED treated like CLD_KILLED) and the exit status match the expected
+ * values. file/line/func identify the caller and are only used in the error messages. */
+static void check_main_result(const char *file, unsigned line, const char *func,
+                              Manager *m, Unit *unit, int status_expected, int code_expected) {
+        Service *service = NULL;
+
+        assert_se(m);
+        assert_se(unit);
+
+        wait_for_service_finish(m, unit);
+
+        service = SERVICE(unit);
+        exec_status_dump(&service->main_exec_status, stdout, "\t");
+
+        if (cld_dumped_to_killed(service->main_exec_status.code) != cld_dumped_to_killed(code_expected)) {
+                /* Same "file:line:func: unit:" prefix as the other two messages below. */
+                log_error("%s:%u:%s: %s: can_unshare=%s: exit code %d, expected %d",
+                          file, line, func, unit->id, yes_no(can_unshare),
+                          service->main_exec_status.code, code_expected);
+                abort();
+        }
+
+        if (service->main_exec_status.status != status_expected) {
+                log_error("%s:%u:%s: %s: can_unshare=%s: exit status %d, expected %d",
+                          file, line, func, unit->id, yes_no(can_unshare),
+                          service->main_exec_status.status, status_expected);
+                abort();
+        }
+}
+
+/* Waits for the service to finish and aborts unless its end result equals result_expected.
+ * file/line/func identify the caller and are only used in the error message. */
+static void check_service_result(const char *file, unsigned line, const char *func,
+                                 Manager *m, Unit *unit, ServiceResult result_expected) {
+        Service *service = NULL;
+
+        assert_se(m);
+        assert_se(unit);
+
+        wait_for_service_finish(m, unit);
+
+        service = SERVICE(unit);
+
+        if (service->result != result_expected) {
+                log_error("%s:%u:%s: %s: can_unshare=%s: service end result %s, expected %s",
+                          file, line, func, unit->id, yes_no(can_unshare),
+                          service_result_to_string(service->result),
+                          service_result_to_string(result_expected));
+                abort();
+        }
+}
+
+/* Returns true only if synthesize_nobody() is true and the user/group database lookups by name and
+ * by id all return entries matching NOBODY_USER_NAME/UID_NOBODY and NOBODY_GROUP_NAME/GID_NOBODY.
+ * The answer is computed once and cached for subsequent calls. */
+static bool check_nobody_user_and_group(void) {
+        static int cache = -1;
+        struct passwd *p;
+        struct group *g;
+
+        if (cache >= 0)
+                return !!cache;
+
+        if (!synthesize_nobody())
+                goto invalid;
+
+        p = getpwnam(NOBODY_USER_NAME);
+        if (!p ||
+            !streq(p->pw_name, NOBODY_USER_NAME) ||
+            p->pw_uid != UID_NOBODY ||
+            p->pw_gid != GID_NOBODY)
+                goto invalid;
+
+        p = getpwuid(UID_NOBODY);
+        if (!p ||
+            !streq(p->pw_name, NOBODY_USER_NAME) ||
+            p->pw_uid != UID_NOBODY ||
+            p->pw_gid != GID_NOBODY)
+                goto invalid;
+
+        g = getgrnam(NOBODY_GROUP_NAME);
+        if (!g ||
+            !streq(g->gr_name, NOBODY_GROUP_NAME) ||
+            g->gr_gid != GID_NOBODY)
+                goto invalid;
+
+        g = getgrgid(GID_NOBODY);
+        if (!g ||
+            !streq(g->gr_name, NOBODY_GROUP_NAME) ||
+            g->gr_gid != GID_NOBODY)
+                goto invalid;
+
+        cache = 1;
+        return true;
+
+invalid:
+        cache = 0;
+        return false;
+}
+
+/* Returns true if a user called 'name' exists and the group its primary GID resolves to carries
+ * the same name. */
+static bool check_user_has_group_with_same_name(const char *name) {
+        struct passwd *p;
+        struct group *g;
+
+        assert_se(name);
+
+        p = getpwnam(name);
+        if (!p ||
+            !streq(p->pw_name, name))
+                return false;
+
+        g = getgrgid(p->pw_gid);
+        if (!g ||
+            !streq(g->gr_name, name))
+                return false;
+
+        return true;
+}
+
+/* Returns true only if every one of the /run/systemd/inaccessible/ nodes listed below exists. */
+static bool is_inaccessible_available(void) {
+        FOREACH_STRING(p,
+                       "/run/systemd/inaccessible/reg",
+                       "/run/systemd/inaccessible/dir",
+                       "/run/systemd/inaccessible/chr",
+                       "/run/systemd/inaccessible/blk",
+                       "/run/systemd/inaccessible/fifo",
+                       "/run/systemd/inaccessible/sock")
+                if (access(p, F_OK) < 0)
+                        return false;
+
+        return true;
+}
+
+/* Recursively starts the slice the unit is in and all slices above it, outermost slice first.
+ * A slice that is already started (-EALREADY) is not treated as an error. */
+static void start_parent_slices(Unit *unit) {
+        Unit *slice;
+
+        slice = UNIT_GET_SLICE(unit);
+        if (slice) {
+                start_parent_slices(slice);
+                int r = unit_start(slice, NULL);
+                assert_se(r >= 0 || r == -EALREADY);
+        }
+}
+
+/* Forks a child that tries unshare(CLONE_NEWUSER) and reports the outcome through its exit status.
+ * Returns true only if the child exited with EXIT_SUCCESS. */
+static bool have_userns_privileges(void) {
+        pid_t pid;
+        int r;
+
+        r = safe_fork("(sd-test-check-userns)",
+                      FORK_RESET_SIGNALS |
+                      FORK_CLOSE_ALL_FDS |
+                      FORK_DEATHSIG_SIGKILL,
+                      &pid);
+        /* Use assert_se() like every other check in this file, so that a failed fork is still caught
+         * in builds where plain assert() is compiled out. */
+        assert_se(r >= 0);
+        if (r == 0) {
+                /* Keep CAP_SYS_ADMIN if we have it to ensure we give an
+                 * accurate result to the caller. Some kernels have a
+                 * kernel.unprivileged_userns_clone sysctl which can be
+                 * configured to make CLONE_NEWUSER require CAP_SYS_ADMIN.
+                 * Additionally, AppArmor may restrict unprivileged user
+                 * namespace creation. */
+                r = capability_bounding_set_drop(UINT64_C(1) << CAP_SYS_ADMIN, /* right_now = */ true);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to drop capabilities: %m");
+                        _exit(2);
+                }
+
+                r = RET_NERRNO(unshare(CLONE_NEWUSER));
+                if (r < 0 && !ERRNO_IS_NEG_PRIVILEGE(r))
+                        log_debug_errno(r, "Failed to create user namespace: %m");
+
+                _exit(r >= 0 ? EXIT_SUCCESS : ERRNO_IS_NEG_PRIVILEGE(r) ? EXIT_FAILURE : 2);
+        }
+
+        /* The exit code records the result of the check:
+         * EXIT_SUCCESS => we can use user namespaces
+         * EXIT_FAILURE => we can NOT use user namespaces
+         * 2 => some other error occurred */
+        r = wait_for_terminate_and_check("(sd-test-check-userns)", pid, 0);
+        if (!IN_SET(r, EXIT_SUCCESS, EXIT_FAILURE))
+                log_debug("Failed to check if user namespaces can be used, assuming not.");
+
+        return r == EXIT_SUCCESS;
+}
+
+/* Loads and starts the named unit (after starting its parent slices), waits for it to finish and
+ * checks the exit status and code of its main process. Counts the run in n_ran_tests. Use through
+ * the test() macro below, which fills in the caller's location. */
+static void _test(const char *file, unsigned line, const char *func,
+                  Manager *m, const char *unit_name, int status_expected, int code_expected) {
+        Unit *unit;
+
+        assert_se(unit_name);
+
+        assert_se(manager_load_startable_unit_or_warn(m, unit_name, NULL, &unit) >= 0);
+        /* We need to start the slices as well otherwise the slice cgroups might be pruned
+         * in on_cgroup_empty_event. */
+        start_parent_slices(unit);
+        assert_se(unit_start(unit, NULL) >= 0);
+        check_main_result(file, line, func, m, unit, status_expected, code_expected);
+
+        ++n_ran_tests;
+}
+#define test(m, unit_name, status_expected, code_expected) \
+        _test(PROJECT_FILE, __LINE__, __func__, m, unit_name, status_expected, code_expected)
+
+/* Loads and starts the named unit, waits for it to finish and checks the service's end result.
+ * Unlike _test() this neither starts the parent slices nor increments n_ran_tests. Use through the
+ * test_service() macro below, which fills in the caller's location. */
+static void _test_service(const char *file, unsigned line, const char *func,
+                          Manager *m, const char *unit_name, ServiceResult result_expected) {
+        Unit *unit;
+
+        assert_se(unit_name);
+
+        assert_se(manager_load_startable_unit_or_warn(m, unit_name, NULL, &unit) >= 0);
+        assert_se(unit_start(unit, NULL) >= 0);
+        check_service_result(file, line, func, m, unit, result_expected);
+}
+#define test_service(m, unit_name, result_expected) \
+        _test_service(PROJECT_FILE, __LINE__, __func__, m, unit_name, result_expected)
+
+/* Creates the two directories the unit binds, runs it, and removes the directories again. */
+static void test_exec_bindpaths(Manager *m) {
+        assert_se(mkdir_p("/tmp/test-exec-bindpaths", 0755) >= 0);
+        assert_se(mkdir_p("/tmp/test-exec-bindreadonlypaths", 0755) >= 0);
+
+        test(m, "exec-bindpaths.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED);
+
+        (void) rm_rf("/tmp/test-exec-bindpaths", REMOVE_ROOT|REMOVE_PHYSICAL);
+        (void) rm_rf("/tmp/test-exec-bindreadonlypaths", REMOVE_ROOT|REMOVE_PHYSICAL);
+}
+
+/* Runs the CPU affinity units, skipping those that need CPUs missing from our own affinity mask. */
+static void test_exec_cpuaffinity(Manager *m) {
+        _cleanup_(cpu_set_reset) CPUSet c = {};
+
+        assert_se(cpu_set_realloc(&c, 8192) >= 0); /* just allocate the maximum possible size */
+        assert_se(sched_getaffinity(0, c.allocated, c.set) >= 0);
+
+        if (!CPU_ISSET_S(0, c.allocated, c.set)) {
+                log_notice("Cannot use CPU 0, skipping %s", __func__);
+                return;
+        }
+
+        test(m, "exec-cpuaffinity1.service", 0, CLD_EXITED);
+        test(m, "exec-cpuaffinity2.service", 0, CLD_EXITED);
+
+        if (!CPU_ISSET_S(1, c.allocated, c.set) ||
+            !CPU_ISSET_S(2, c.allocated, c.set)) {
+                log_notice("Cannot use CPU 1 or 2, skipping remaining tests in %s", __func__);
+                return;
+        }
+
+        test(m, "exec-cpuaffinity3.service", 0, CLD_EXITED);
+}
+
+static void test_exec_credentials(Manager *m) {
+        test(m, "exec-set-credential.service", 0, CLD_EXITED);
+        test(m, "exec-load-credential.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_CREDENTIALS, CLD_EXITED);
+        test(m, "exec-credentials-dir-specifier.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_CREDENTIALS, CLD_EXITED);
+}
+
+static void test_exec_workingdirectory(Manager *m) {
+        assert_se(mkdir_p("/tmp/test-exec_workingdirectory", 0755) >= 0);
+
+        test(m, "exec-workingdirectory.service", 0, CLD_EXITED);
+        test(m, "exec-workingdirectory-trailing-dot.service", 0, CLD_EXITED);
+
+        (void) rm_rf("/tmp/test-exec_workingdirectory", REMOVE_ROOT|REMOVE_PHYSICAL);
+}
+
+/* Runs the unit once with a copy of /bin/ls placed in the search directory (expecting success) and
+ * once more after that directory has been removed (expecting EXIT_EXEC). */
+static void test_exec_execsearchpath(Manager *m) {
+        assert_se(mkdir_p("/tmp/test-exec_execsearchpath", 0755) >= 0);
+
+        assert_se(copy_file("/bin/ls", "/tmp/test-exec_execsearchpath/ls_temp", 0, 0777, COPY_REPLACE) >= 0);
+
+        test(m, "exec-execsearchpath.service", 0, CLD_EXITED);
+
+        assert_se(rm_rf("/tmp/test-exec_execsearchpath", REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
+
+        test(m, "exec-execsearchpath.service", EXIT_EXEC, CLD_EXITED);
+}
+
+static void test_exec_execsearchpath_specifier(Manager *m) {
+        test(m, "exec-execsearchpath-unit-specifier.service", 0, CLD_EXITED);
+}
+
+static void test_exec_execsearchpath_environment(Manager *m) {
+        test(m, "exec-execsearchpath-environment.service", 0, CLD_EXITED);
+        test(m, "exec-execsearchpath-environment-path-set.service", 0, CLD_EXITED);
+}
+
+/* Writes an environment file without and then with a PATH= assignment, runs the matching unit for
+ * each, and removes each file again afterwards. */
+static void test_exec_execsearchpath_environment_files(Manager *m) {
+        static const char path_not_set[] =
+                "VAR1='word1 word2'\n"
+                "VAR2=word3 \n"
+                "# comment1\n"
+                "\n"
+                "; comment2\n"
+                " ; # comment3\n"
+                "line without an equal\n"
+                "VAR3='$word 5 6'\n"
+                "VAR4='new\nline'\n"
+                "VAR5=password\\with\\backslashes";
+
+        static const char path_set[] =
+                "VAR1='word1 word2'\n"
+                "VAR2=word3 \n"
+                "# comment1\n"
+                "\n"
+                "; comment2\n"
+                " ; # comment3\n"
+                "line without an equal\n"
+                "VAR3='$word 5 6'\n"
+                "VAR4='new\nline'\n"
+                "VAR5=password\\with\\backslashes\n"
+                "PATH=/usr";
+
+        int r;
+
+        r = write_string_file("/tmp/test-exec_execsearchpath_environmentfile.conf", path_not_set, WRITE_STRING_FILE_CREATE);
+
+        assert_se(r == 0);
+
+        test(m, "exec-execsearchpath-environmentfile.service", 0, CLD_EXITED);
+
+        /* Remove the file we actually created above (this used to unlink a differently named path
+         * and hence left the file behind in /tmp). */
+        (void) unlink("/tmp/test-exec_execsearchpath_environmentfile.conf");
+
+
+        r = write_string_file("/tmp/test-exec_execsearchpath_environmentfile-set.conf", path_set, WRITE_STRING_FILE_CREATE);
+
+        assert_se(r == 0);
+
+        test(m, "exec-execsearchpath-environmentfile-set.service", 0, CLD_EXITED);
+
+        /* Likewise, remove the second file under the name it was created with. */
+        (void) unlink("/tmp/test-exec_execsearchpath_environmentfile-set.conf");
+}
+
+/* Exports VAR1..VAR5 into our own environment, runs the unit once as is and once more with PATH set
+ * to "/usr", then unsets VAR1..VAR5 and PATH again. */
+static void test_exec_execsearchpath_passenvironment(Manager *m) {
+        assert_se(setenv("VAR1", "word1 word2", 1) == 0);
+        assert_se(setenv("VAR2", "word3", 1) == 0);
+        assert_se(setenv("VAR3", "$word 5 6", 1) == 0);
+        assert_se(setenv("VAR4", "new\nline", 1) == 0);
+        assert_se(setenv("VAR5", "passwordwithbackslashes", 1) == 0);
+
+        test(m, "exec-execsearchpath-passenvironment.service", 0, CLD_EXITED);
+
+        assert_se(setenv("PATH", "/usr", 1) == 0);
+        test(m, "exec-execsearchpath-passenvironment-set.service", 0, CLD_EXITED);
+
+        assert_se(unsetenv("VAR1") == 0);
+        assert_se(unsetenv("VAR2") == 0);
+        assert_se(unsetenv("VAR3") == 0);
+        assert_se(unsetenv("VAR4") == 0);
+        assert_se(unsetenv("VAR5") == 0);
+        assert_se(unsetenv("PATH") == 0);
+}
+
+/* Runs the personality unit matching the architecture we were compiled for, if there is one. */
+static void test_exec_personality(Manager *m) {
+#if defined(__x86_64__)
+        test(m, "exec-personality-x86-64.service", 0, CLD_EXITED);
+
+#elif defined(__s390__)
+        test(m, "exec-personality-s390.service", 0, CLD_EXITED);
+
+#elif defined(__powerpc64__)
+#  if __BYTE_ORDER == __BIG_ENDIAN
+        test(m, "exec-personality-ppc64.service", 0, CLD_EXITED);
+#  else
+        test(m, "exec-personality-ppc64le.service", 0, CLD_EXITED);
+#  endif
+
+#elif defined(__aarch64__)
+        test(m, "exec-personality-aarch64.service", 0, CLD_EXITED);
+
+#elif defined(__i386__)
+        test(m, "exec-personality-x86.service", 0, CLD_EXITED);
+#elif defined(__loongarch_lp64)
+        test(m, "exec-personality-loongarch64.service", 0, CLD_EXITED);
+#else
+        log_notice("Unknown personality, skipping %s", __func__);
+#endif
+}
+
+static void test_exec_ignoresigpipe(Manager *m) {
+        test(m, "exec-ignoresigpipe-yes.service", 0,
CLD_EXITED); + test(m, "exec-ignoresigpipe-no.service", SIGPIPE, CLD_KILLED); +} + +static void test_exec_privatetmp(Manager *m) { + assert_se(touch("/tmp/test-exec_privatetmp") >= 0); + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) { + test(m, "exec-privatetmp-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-privatetmp-disabled-by-prefix.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + } + + test(m, "exec-privatetmp-no.service", 0, CLD_EXITED); + + (void) unlink("/tmp/test-exec_privatetmp"); +} + +static void test_exec_privatedevices(Manager *m) { + int r; + + if (detect_container() > 0) { + log_notice("Testing in container, skipping %s", __func__); + return; + } + if (!is_inaccessible_available()) { + log_notice("Testing without inaccessible, skipping %s", __func__); + return; + } + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) { + test(m, "exec-privatedevices-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + if (access("/dev/kmsg", F_OK) >= 0) + test(m, "exec-privatedevices-bind.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-privatedevices-disabled-by-prefix.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-privatedevices-yes-with-group.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? 
EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + } + + test(m, "exec-privatedevices-no.service", 0, CLD_EXITED); + + /* We use capsh to test if the capabilities are + * properly set, so be sure that it exists */ + r = find_executable("capsh", NULL); + if (r < 0) { + log_notice_errno(r, "Could not find capsh binary, skipping remaining tests in %s: %m", __func__); + return; + } + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) { + test(m, "exec-privatedevices-yes-capability-mknod.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-privatedevices-yes-capability-sys-rawio.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED); + } + + test(m, "exec-privatedevices-no-capability-mknod.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-privatedevices-no-capability-sys-rawio.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED); +} + +static void test_exec_protecthome(Manager *m) { + if (!can_unshare) { + log_notice("Cannot reliably unshare, skipping %s", __func__); + return; + } + + test(m, "exec-protecthome-tmpfs-vs-protectsystem-strict.service", 0, CLD_EXITED); +} + +static void test_exec_protectkernelmodules(Manager *m) { + int r; + + if (detect_container() > 0) { + log_notice("Testing in container, skipping %s", __func__); + return; + } + if (!is_inaccessible_available()) { + log_notice("Testing without inaccessible, skipping %s", __func__); + return; + } + + r = find_executable("capsh", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find capsh binary: %m", __func__); + return; + } + + test(m, "exec-protectkernelmodules-no-capabilities.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED); + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) { + test(m, "exec-protectkernelmodules-yes-capabilities.service", MANAGER_IS_SYSTEM(m) ? 
0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-protectkernelmodules-yes-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + } +} + +static void test_exec_readonlypaths(Manager *m) { + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) + test(m, "exec-readonlypaths-simple.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + + if (path_is_read_only_fs("/var") > 0) { + log_notice("Directory /var is readonly, skipping remaining tests in %s", __func__); + return; + } + + test(m, "exec-readonlypaths.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-readonlypaths-with-bindpaths.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-readonlypaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); +} + +static void test_exec_readwritepaths(Manager *m) { + + if (path_is_read_only_fs("/") > 0) { + log_notice("Root directory is readonly, skipping %s", __func__); + return; + } + + test(m, "exec-readwritepaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); +} + +static void test_exec_inaccessiblepaths(Manager *m) { + + if (!is_inaccessible_available()) { + log_notice("Testing without inaccessible, skipping %s", __func__); + return; + } + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) + test(m, "exec-inaccessiblepaths-sys.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + + if (path_is_read_only_fs("/") > 0) { + log_notice("Root directory is readonly, skipping remaining tests in %s", __func__); + return; + } + + test(m, "exec-inaccessiblepaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? 
EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); +} + +static int on_spawn_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + char **result = userdata; + char buf[4096]; + ssize_t l; + + assert_se(s); + assert_se(fd >= 0); + + l = read(fd, buf, sizeof(buf) - 1); + if (l < 0) { + if (errno == EAGAIN) + goto reenable; + + return 0; + } + if (l == 0) + return 0; + + buf[l] = '\0'; + if (result) + assert_se(strextend(result, buf)); + else + log_error("ldd: %s", buf); + +reenable: + /* Re-enable the event source if we did not encounter EOF */ + assert_se(sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) >= 0); + return 0; +} + +static int on_spawn_timeout(sd_event_source *s, uint64_t usec, void *userdata) { + pid_t *pid = userdata; + + assert_se(pid); + + (void) kill(*pid, SIGKILL); + + return 1; +} + +static int on_spawn_sigchld(sd_event_source *s, const siginfo_t *si, void *userdata) { + int ret = -EIO; + + assert_se(si); + + if (si->si_code == CLD_EXITED) + ret = si->si_status; + + sd_event_exit(sd_event_source_get_event(s), ret); + return 1; +} + +static int find_libraries(const char *exec, char ***ret) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *sigchld_source = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *stdout_source = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *stderr_source = NULL; + _cleanup_close_pair_ int outpipe[2] = EBADF_PAIR, errpipe[2] = EBADF_PAIR; + _cleanup_strv_free_ char **libraries = NULL; + _cleanup_free_ char *result = NULL; + pid_t pid; + int r; + + assert_se(exec); + assert_se(ret); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + + assert_se(pipe2(outpipe, O_NONBLOCK|O_CLOEXEC) == 0); + assert_se(pipe2(errpipe, O_NONBLOCK|O_CLOEXEC) == 0); + + r = safe_fork_full("(spawn-ldd)", + (int[]) { -EBADF, outpipe[1], errpipe[1] }, + NULL, 0, + 
FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG, &pid); + assert_se(r >= 0); + if (r == 0) { + execlp("ldd", "ldd", exec, NULL); + _exit(EXIT_FAILURE); + } + + outpipe[1] = safe_close(outpipe[1]); + errpipe[1] = safe_close(errpipe[1]); + + assert_se(sd_event_new(&e) >= 0); + + assert_se(sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, + 10 * USEC_PER_SEC, USEC_PER_SEC, on_spawn_timeout, &pid) >= 0); + assert_se(sd_event_add_io(e, &stdout_source, outpipe[0], EPOLLIN, on_spawn_io, &result) >= 0); + assert_se(sd_event_source_set_enabled(stdout_source, SD_EVENT_ONESHOT) >= 0); + assert_se(sd_event_add_io(e, &stderr_source, errpipe[0], EPOLLIN, on_spawn_io, NULL) >= 0); + assert_se(sd_event_source_set_enabled(stderr_source, SD_EVENT_ONESHOT) >= 0); + assert_se(sd_event_add_child(e, &sigchld_source, pid, WEXITED, on_spawn_sigchld, NULL) >= 0); + /* SIGCHLD should be processed after IO is complete */ + assert_se(sd_event_source_set_priority(sigchld_source, SD_EVENT_PRIORITY_NORMAL + 1) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + _cleanup_strv_free_ char **v = NULL; + assert_se(strv_split_newlines_full(&v, result, 0) >= 0); + + STRV_FOREACH(q, v) { + _cleanup_free_ char *word = NULL; + const char *p = *q; + + r = extract_first_word(&p, &word, NULL, 0); + assert_se(r >= 0); + if (r == 0) + continue; + + if (path_is_absolute(word)) { + assert_se(strv_consume(&libraries, TAKE_PTR(word)) >= 0); + continue; + } + + word = mfree(word); + r = extract_first_word(&p, &word, NULL, 0); + assert_se(r >= 0); + if (r == 0) + continue; + + if (!streq_ptr(word, "=>")) + continue; + + word = mfree(word); + r = extract_first_word(&p, &word, NULL, 0); + assert_se(r >= 0); + if (r == 0) + continue; + + if (path_is_absolute(word)) { + assert_se(strv_consume(&libraries, TAKE_PTR(word)) >= 0); + continue; + } + } + + *ret = TAKE_PTR(libraries); + return 0; +} + +static void test_exec_mount_apivfs(Manager *m) { + _cleanup_free_ char 
*fullpath_touch = NULL, *fullpath_test = NULL, *data = NULL; + _cleanup_strv_free_ char **libraries = NULL, **libraries_test = NULL; + int r; + + assert_se(user_runtime_unit_dir); + + r = find_executable("ldd", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find 'ldd' command: %m", __func__); + return; + } + r = find_executable("touch", &fullpath_touch); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find 'touch' command: %m", __func__); + return; + } + r = find_executable("test", &fullpath_test); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find 'test' command: %m", __func__); + return; + } + + if (MANAGER_IS_USER(m) && !have_userns_privileges()) + return (void)log_notice("Skipping %s, do not have user namespace privileges", __func__); + + assert_se(find_libraries(fullpath_touch, &libraries) >= 0); + assert_se(find_libraries(fullpath_test, &libraries_test) >= 0); + assert_se(strv_extend_strv(&libraries, libraries_test, true) >= 0); + + assert_se(strextend(&data, "[Service]\n")); + assert_se(strextend(&data, "ExecStart=", fullpath_touch, " /aaa\n")); + assert_se(strextend(&data, "ExecStart=", fullpath_test, " -f /aaa\n")); + assert_se(strextend(&data, "BindReadOnlyPaths=", fullpath_touch, "\n")); + assert_se(strextend(&data, "BindReadOnlyPaths=", fullpath_test, "\n")); + + STRV_FOREACH(p, libraries) + assert_se(strextend(&data, "BindReadOnlyPaths=", *p, "\n")); + + assert_se(write_drop_in(user_runtime_unit_dir, "exec-mount-apivfs-no.service", 10, "bind-mount", data) >= 0); + + assert_se(mkdir_p("/tmp/test-exec-mount-apivfs-no/root", 0755) >= 0); + + test(m, "exec-mount-apivfs-no.service", can_unshare || !MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED); + + (void) rm_rf("/tmp/test-exec-mount-apivfs-no/root", REMOVE_ROOT|REMOVE_PHYSICAL); +} + +static void test_exec_noexecpaths(Manager *m) { + + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) + test(m, "exec-noexecpaths-simple.service", can_unshare ? 
0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); + else + return (void)log_notice("Skipping %s, do not have user namespace privileges", __func__); +} + +static void test_exec_temporaryfilesystem(Manager *m) { + + test(m, "exec-temporaryfilesystem-options.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-temporaryfilesystem-ro.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-temporaryfilesystem-rw.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-temporaryfilesystem-usr.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED); +} + +static void test_exec_systemcallfilter(Manager *m) { +#if HAVE_SECCOMP + int r; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + + test(m, "exec-systemcallfilter-not-failing.service", 0, CLD_EXITED); + test(m, "exec-systemcallfilter-not-failing2.service", 0, CLD_EXITED); + test(m, "exec-systemcallfilter-not-failing3.service", 0, CLD_EXITED); + test(m, "exec-systemcallfilter-failing.service", SIGSYS, CLD_KILLED); + test(m, "exec-systemcallfilter-failing2.service", SIGSYS, CLD_KILLED); + test(m, "exec-systemcallfilter-failing3.service", SIGSYS, CLD_KILLED); + + r = find_executable("python3", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping remaining tests in %s, could not find python3 binary: %m", __func__); + return; + } + + test(m, "exec-systemcallfilter-with-errno-name.service", errno_from_name("EILSEQ"), CLD_EXITED); + test(m, "exec-systemcallfilter-with-errno-number.service", 255, CLD_EXITED); + test(m, "exec-systemcallfilter-with-errno-multi.service", errno_from_name("EILSEQ"), CLD_EXITED); + test(m, "exec-systemcallfilter-with-errno-in-allow-list.service", errno_from_name("EILSEQ"), CLD_EXITED); + test(m, "exec-systemcallfilter-override-error-action.service", SIGSYS, CLD_KILLED); + test(m, "exec-systemcallfilter-override-error-action2.service", errno_from_name("EILSEQ"), 
CLD_EXITED); + + test(m, "exec-systemcallfilter-nonewprivileges.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); + test(m, "exec-systemcallfilter-nonewprivileges-protectclock.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); + + r = find_executable("capsh", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find capsh binary: %m", __func__); + return; + } + + test(m, "exec-systemcallfilter-nonewprivileges-bounding1.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); + test(m, "exec-systemcallfilter-nonewprivileges-bounding2.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); +#endif +} + +static void test_exec_systemcallerrornumber(Manager *m) { +#if HAVE_SECCOMP + int r; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + + r = find_executable("python3", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find python3 binary: %m", __func__); + return; + } + + test(m, "exec-systemcallerrornumber-name.service", errno_from_name("EACCES"), CLD_EXITED); + test(m, "exec-systemcallerrornumber-number.service", 255, CLD_EXITED); +#endif +} + +static void test_exec_restrictnamespaces(Manager *m) { +#if HAVE_SECCOMP + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + + test(m, "exec-restrictnamespaces-no.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-restrictnamespaces-yes.service", 1, CLD_EXITED); + test(m, "exec-restrictnamespaces-mnt.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-restrictnamespaces-mnt-deny-list.service", 1, CLD_EXITED); + test(m, "exec-restrictnamespaces-merge-and.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-restrictnamespaces-merge-or.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-restrictnamespaces-merge-all.service", can_unshare ? 
0 : EXIT_FAILURE, CLD_EXITED); +#endif +} + +static void test_exec_systemcallfilter_system(Manager *m) { +/* Skip this particular test case when running under ASan, as + * LSan intermittently segfaults when accessing memory right + * after the test finishes. Generally, ASan & LSan don't like + * the seccomp stuff. + */ +#if HAVE_SECCOMP && !HAS_FEATURE_ADDRESS_SANITIZER + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + + test(m, "exec-systemcallfilter-system-user.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); + + if (!check_nobody_user_and_group()) { + log_notice("nobody user/group is not synthesized or may conflict to other entries, skipping remaining tests in %s", __func__); + return; + } + + if (!STR_IN_SET(NOBODY_USER_NAME, "nobody", "nfsnobody")) { + log_notice("Unsupported nobody user name '%s', skipping remaining tests in %s", NOBODY_USER_NAME, __func__); + return; + } + + test(m, "exec-systemcallfilter-system-user-" NOBODY_USER_NAME ".service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); +#endif +} + +static void test_exec_user(Manager *m) { + test(m, "exec-user.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); + + if (!check_nobody_user_and_group()) { + log_notice("nobody user/group is not synthesized or may conflict to other entries, skipping remaining tests in %s", __func__); + return; + } + + if (!STR_IN_SET(NOBODY_USER_NAME, "nobody", "nfsnobody")) { + log_notice("Unsupported nobody user name '%s', skipping remaining tests in %s", NOBODY_USER_NAME, __func__); + return; + } + + test(m, "exec-user-" NOBODY_USER_NAME ".service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); +} + +static void test_exec_group(Manager *m) { + test(m, "exec-group.service", MANAGER_IS_SYSTEM(m) ? 
0 : EXIT_GROUP, CLD_EXITED); + + if (!check_nobody_user_and_group()) { + log_notice("nobody user/group is not synthesized or may conflict to other entries, skipping remaining tests in %s", __func__); + return; + } + + if (!STR_IN_SET(NOBODY_GROUP_NAME, "nobody", "nfsnobody", "nogroup")) { + log_notice("Unsupported nobody group name '%s', skipping remaining tests in %s", NOBODY_GROUP_NAME, __func__); + return; + } + + test(m, "exec-group-" NOBODY_GROUP_NAME ".service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); +} + +static void test_exec_supplementarygroups(Manager *m) { + int status = MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP; + test(m, "exec-supplementarygroups.service", status, CLD_EXITED); + test(m, "exec-supplementarygroups-single-group.service", status, CLD_EXITED); + test(m, "exec-supplementarygroups-single-group-user.service", status, CLD_EXITED); + test(m, "exec-supplementarygroups-multiple-groups-default-group-user.service", status, CLD_EXITED); + test(m, "exec-supplementarygroups-multiple-groups-withgid.service", status, CLD_EXITED); + test(m, "exec-supplementarygroups-multiple-groups-withuid.service", status, CLD_EXITED); +} + +static char* private_directory_bad(Manager *m) { + /* This mirrors setup_exec_directory(). */ + + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { + _cleanup_free_ char *p = NULL; + struct stat st; + + assert_se(p = path_join(m->prefix[dt], "private")); + + if (stat(p, &st) >= 0 && + (st.st_mode & (S_IRWXG|S_IRWXO))) + return TAKE_PTR(p); + } + + return NULL; +} + +static void test_exec_dynamicuser(Manager *m) { + _cleanup_free_ char *bad = private_directory_bad(m); + if (bad) { + log_warning("%s: %s has bad permissions, skipping test.", __func__, bad); + return; + } + + if (strstr_ptr(ci_environment(), "github-actions")) { + log_notice("%s: skipping test on GH Actions because of systemd/systemd#10337", __func__); + return; + } + + int status = can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? 
EXIT_NAMESPACE : EXIT_GROUP; + + test(m, "exec-dynamicuser-fixeduser.service", status, CLD_EXITED); + if (check_user_has_group_with_same_name("adm")) + test(m, "exec-dynamicuser-fixeduser-adm.service", status, CLD_EXITED); + if (check_user_has_group_with_same_name("games")) + test(m, "exec-dynamicuser-fixeduser-games.service", status, CLD_EXITED); + test(m, "exec-dynamicuser-fixeduser-one-supplementarygroup.service", status, CLD_EXITED); + test(m, "exec-dynamicuser-supplementarygroups.service", status, CLD_EXITED); + test(m, "exec-dynamicuser-statedir.service", status, CLD_EXITED); + + (void) rm_rf("/var/lib/quux", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/test-dynamicuser-migrate", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/test-dynamicuser-migrate2", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/waldo", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/private/quux", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/private/test-dynamicuser-migrate", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/private/test-dynamicuser-migrate2", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/private/waldo", REMOVE_ROOT|REMOVE_PHYSICAL); + + test(m, "exec-dynamicuser-statedir-migrate-step1.service", 0, CLD_EXITED); + test(m, "exec-dynamicuser-statedir-migrate-step2.service", status, CLD_EXITED); + test(m, "exec-dynamicuser-statedir-migrate-step1.service", 0, CLD_EXITED); + + (void) rm_rf("/var/lib/test-dynamicuser-migrate", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/test-dynamicuser-migrate2", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/private/test-dynamicuser-migrate", REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf("/var/lib/private/test-dynamicuser-migrate2", REMOVE_ROOT|REMOVE_PHYSICAL); + + test(m, "exec-dynamicuser-runtimedirectory1.service", status, CLD_EXITED); + test(m, "exec-dynamicuser-runtimedirectory2.service", status, CLD_EXITED); + test(m, 
"exec-dynamicuser-runtimedirectory3.service", status, CLD_EXITED); +} + +static void test_exec_environment(Manager *m) { + test(m, "exec-environment-no-substitute.service", 0, CLD_EXITED); + test(m, "exec-environment.service", 0, CLD_EXITED); + test(m, "exec-environment-multiple.service", 0, CLD_EXITED); + test(m, "exec-environment-empty.service", 0, CLD_EXITED); +} + +static void test_exec_environmentfile(Manager *m) { + static const char e[] = + "VAR1='word1 word2'\n" + "VAR2=word3 \n" + "# comment1\n" + "\n" + "; comment2\n" + " ; # comment3\n" + "line without an equal\n" + "VAR3='$word 5 6'\n" + "VAR4='new\nline'\n" + "VAR5=password\\with\\backslashes"; + int r; + + r = write_string_file("/tmp/test-exec_environmentfile.conf", e, WRITE_STRING_FILE_CREATE); + assert_se(r == 0); + + test(m, "exec-environmentfile.service", 0, CLD_EXITED); + + (void) unlink("/tmp/test-exec_environmentfile.conf"); +} + +static void test_exec_passenvironment(Manager *m) { + /* test-execute runs under MANAGER_USER which, by default, forwards all + * variables present in the environment, but only those that are + * present _at the time it is created_! + * + * So these PassEnvironment checks are still expected to work, since we + * are ensuring the variables are not present at manager creation (they + * are unset explicitly in main) and are only set here. + * + * This is still a good approximation of how a test for MANAGER_SYSTEM + * would work. 
+ */ + assert_se(setenv("VAR1", "word1 word2", 1) == 0); + assert_se(setenv("VAR2", "word3", 1) == 0); + assert_se(setenv("VAR3", "$word 5 6", 1) == 0); + assert_se(setenv("VAR4", "new\nline", 1) == 0); + assert_se(setenv("VAR5", "passwordwithbackslashes", 1) == 0); + test(m, "exec-passenvironment.service", 0, CLD_EXITED); + test(m, "exec-passenvironment-repeated.service", 0, CLD_EXITED); + test(m, "exec-passenvironment-empty.service", 0, CLD_EXITED); + assert_se(unsetenv("VAR1") == 0); + assert_se(unsetenv("VAR2") == 0); + assert_se(unsetenv("VAR3") == 0); + assert_se(unsetenv("VAR4") == 0); + assert_se(unsetenv("VAR5") == 0); + test(m, "exec-passenvironment-absent.service", 0, CLD_EXITED); +} + +static void test_exec_umask(Manager *m) { + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) { + test(m, "exec-umask-default.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED); + test(m, "exec-umask-0177.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED); + } else + return (void)log_notice("Skipping %s, do not have user namespace privileges", __func__); +} + +static void test_exec_runtimedirectory(Manager *m) { + (void) rm_rf("/run/test-exec_runtimedirectory2", REMOVE_ROOT|REMOVE_PHYSICAL); + test(m, "exec-runtimedirectory.service", 0, CLD_EXITED); + (void) rm_rf("/run/test-exec_runtimedirectory2", REMOVE_ROOT|REMOVE_PHYSICAL); + + test(m, "exec-runtimedirectory-mode.service", 0, CLD_EXITED); + test(m, "exec-runtimedirectory-owner.service", MANAGER_IS_SYSTEM(m) ? 
0 : EXIT_GROUP, CLD_EXITED); + + if (!check_nobody_user_and_group()) { + log_notice("nobody user/group is not synthesized or may conflict to other entries, skipping remaining tests in %s", __func__); + return; + } + + if (!STR_IN_SET(NOBODY_GROUP_NAME, "nobody", "nfsnobody", "nogroup")) { + log_notice("Unsupported nobody group name '%s', skipping remaining tests in %s", NOBODY_GROUP_NAME, __func__); + return; + } + + test(m, "exec-runtimedirectory-owner-" NOBODY_GROUP_NAME ".service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_GROUP, CLD_EXITED); +} + +static void test_exec_capabilityboundingset(Manager *m) { + int r; + + r = find_executable("capsh", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find capsh binary: %m", __func__); + return; + } + + if (have_effective_cap(CAP_CHOWN) <= 0 || + have_effective_cap(CAP_FOWNER) <= 0 || + have_effective_cap(CAP_KILL) <= 0) { + log_notice("Skipping %s, this process does not have enough capabilities", __func__); + return; + } + + test(m, "exec-capabilityboundingset-simple.service", 0, CLD_EXITED); + test(m, "exec-capabilityboundingset-reset.service", 0, CLD_EXITED); + test(m, "exec-capabilityboundingset-merge.service", 0, CLD_EXITED); + test(m, "exec-capabilityboundingset-invert.service", 0, CLD_EXITED); +} + +static void test_exec_basic(Manager *m) { + if (MANAGER_IS_SYSTEM(m) || have_userns_privileges()) + test(m, "exec-basic.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED); + else + return (void)log_notice("Skipping %s, do not have user namespace privileges", __func__); +} + +static void test_exec_ambientcapabilities(Manager *m) { + int r; + + /* Check if the kernel has support for ambient capabilities. Run + * the tests only if that's the case. Clearing all ambient + * capabilities is fine, since we are expecting them to be unset + * in the first place for the tests. 
*/ + r = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0); + if (r < 0 && IN_SET(errno, EINVAL, EOPNOTSUPP, ENOSYS)) { + log_notice("Skipping %s, the kernel does not support ambient capabilities", __func__); + return; + } + + if (have_effective_cap(CAP_CHOWN) <= 0 || + have_effective_cap(CAP_NET_RAW) <= 0) { + log_notice("Skipping %s, this process does not have enough capabilities", __func__); + return; + } + + test(m, "exec-ambientcapabilities.service", 0, CLD_EXITED); + test(m, "exec-ambientcapabilities-merge.service", 0, CLD_EXITED); + + if (have_effective_cap(CAP_SETUID) > 0) + test(m, "exec-ambientcapabilities-dynuser.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED); + + if (!check_nobody_user_and_group()) { + log_notice("nobody user/group is not synthesized or may conflict to other entries, skipping remaining tests in %s", __func__); + return; + } + + if (!STR_IN_SET(NOBODY_USER_NAME, "nobody", "nfsnobody")) { + log_notice("Unsupported nobody user name '%s', skipping remaining tests in %s", NOBODY_USER_NAME, __func__); + return; + } + + test(m, "exec-ambientcapabilities-" NOBODY_USER_NAME ".service", 0, CLD_EXITED); + test(m, "exec-ambientcapabilities-merge-" NOBODY_USER_NAME ".service", 0, CLD_EXITED); +} + +static void test_exec_privatenetwork(Manager *m) { + int r; + + if (!have_net_dummy) + return (void)log_notice("Skipping %s, dummy network interface not available", __func__); + + if (MANAGER_IS_USER(m) && !have_userns_privileges()) + return (void)log_notice("Skipping %s, do not have user namespace privileges", __func__); + + r = find_executable("ip", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find ip binary: %m", __func__); + return; + } + + test(m, "exec-privatenetwork-yes-privatemounts-no.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-privatenetwork-yes-privatemounts-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? 
EXIT_NETWORK : EXIT_NAMESPACE, CLD_EXITED); +} + +static void test_exec_networknamespacepath(Manager *m) { + int r; + + if (!have_net_dummy) + return (void)log_notice("Skipping %s, dummy network interface not available", __func__); + + if (!have_netns) + return (void)log_notice("Skipping %s, network namespace not available", __func__); + + if (MANAGER_IS_USER(m) && !have_userns_privileges()) + return (void)log_notice("Skipping %s, do not have user namespace privileges", __func__); + + r = find_executable("ip", NULL); + if (r < 0) { + log_notice_errno(r, "Skipping %s, could not find ip binary: %m", __func__); + return; + } + + test(m, "exec-networknamespacepath-privatemounts-no.service", MANAGER_IS_SYSTEM(m) ? EXIT_SUCCESS : EXIT_FAILURE, CLD_EXITED); + test(m, "exec-networknamespacepath-privatemounts-yes.service", can_unshare ? EXIT_SUCCESS : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED); +} + +static void test_exec_oomscoreadjust(Manager *m) { + test(m, "exec-oomscoreadjust-positive.service", 0, CLD_EXITED); + + if (detect_container() > 0) { + log_notice("Testing in container, skipping remaining tests in %s", __func__); + return; + } + test(m, "exec-oomscoreadjust-negative.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED); +} + +static void test_exec_ioschedulingclass(Manager *m) { + test(m, "exec-ioschedulingclass-none.service", 0, CLD_EXITED); + test(m, "exec-ioschedulingclass-idle.service", 0, CLD_EXITED); + test(m, "exec-ioschedulingclass-best-effort.service", 0, CLD_EXITED); + + if (detect_container() > 0) { + log_notice("Testing in container, skipping remaining tests in %s", __func__); + return; + } + test(m, "exec-ioschedulingclass-realtime.service", MANAGER_IS_SYSTEM(m) ? 
0 : EXIT_IOPRIO, CLD_EXITED); +} + +static void test_exec_unsetenvironment(Manager *m) { + test(m, "exec-unsetenvironment.service", 0, CLD_EXITED); +} + +static void test_exec_specifier(Manager *m) { + test(m, "exec-specifier.service", 0, CLD_EXITED); + if (MANAGER_IS_SYSTEM(m)) + test(m, "exec-specifier-system.service", 0, CLD_EXITED); + else + test(m, "exec-specifier-user.service", 0, CLD_EXITED); + test(m, "exec-specifier@foo-bar.service", 0, CLD_EXITED); + test(m, "exec-specifier-interpolation.service", 0, CLD_EXITED); +} + +static void test_exec_standardinput(Manager *m) { + test(m, "exec-standardinput-data.service", 0, CLD_EXITED); + test(m, "exec-standardinput-file.service", 0, CLD_EXITED); + test(m, "exec-standardinput-file-cat.service", 0, CLD_EXITED); +} + +static void test_exec_standardoutput(Manager *m) { + test(m, "exec-standardoutput-file.service", 0, CLD_EXITED); +} + +static void test_exec_standardoutput_append(Manager *m) { + test(m, "exec-standardoutput-append.service", 0, CLD_EXITED); +} + +static void test_exec_standardoutput_truncate(Manager *m) { + test(m, "exec-standardoutput-truncate.service", 0, CLD_EXITED); +} + +static void test_exec_condition(Manager *m) { + test_service(m, "exec-condition-failed.service", SERVICE_FAILURE_EXIT_CODE); + test_service(m, "exec-condition-skip.service", SERVICE_SKIP_CONDITION); +} + +static void test_exec_umask_namespace(Manager *m) { + /* exec-specifier-credentials-dir.service creates /run/credentials and enables implicit + * InaccessiblePath= for the directory for all later services with mount namespace. */ + if (!is_inaccessible_available()) { + log_notice("Testing without inaccessible, skipping %s", __func__); + return; + } + test(m, "exec-umask-namespace.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? 
/* Creates a Manager for the given runtime scope and runs the test functions listed in tests[]
 * against it.
 *
 * scope:    runtime scope passed to manager_new(); also reported in the final summary line.
 * patterns: string list handed to strv_fnmatch_or_empty() together with each test's name; entries
 *           for which that call returns false are logged as skipped instead of being run.
 *           (NOTE(review): judging by the helper's name an empty list matches everything — confirm.)
 *
 * Returns early, after logging a skip, when manager_errno_skip_test() accepts the error returned
 * by manager_new(). All other setup failures trip assert_se(). */
static void run_tests(RuntimeScope scope, char **patterns) {
        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
        _cleanup_free_ char *unit_paths = NULL;
        _cleanup_(manager_freep) Manager *m = NULL;
        usec_t start, finish;
        int r;

        /* Table of test functions paired with their stringified names (see the entry() macro).
         * The trailing empty initializer is the terminator the loop below stops at (f == NULL). */
        static const test_entry tests[] = {
                entry(test_exec_basic),
                entry(test_exec_ambientcapabilities),
                entry(test_exec_bindpaths),
                entry(test_exec_capabilityboundingset),
                entry(test_exec_condition),
                entry(test_exec_cpuaffinity),
                entry(test_exec_credentials),
                entry(test_exec_dynamicuser),
                entry(test_exec_environment),
                entry(test_exec_environmentfile),
                entry(test_exec_execsearchpath),
                entry(test_exec_execsearchpath_environment),
                entry(test_exec_execsearchpath_environment_files),
                entry(test_exec_execsearchpath_passenvironment),
                entry(test_exec_execsearchpath_specifier),
                entry(test_exec_group),
                entry(test_exec_ignoresigpipe),
                entry(test_exec_inaccessiblepaths),
                entry(test_exec_ioschedulingclass),
                entry(test_exec_mount_apivfs),
                entry(test_exec_networknamespacepath),
                entry(test_exec_noexecpaths),
                entry(test_exec_oomscoreadjust),
                entry(test_exec_passenvironment),
                entry(test_exec_personality),
                entry(test_exec_privatedevices),
                entry(test_exec_privatenetwork),
                entry(test_exec_privatetmp),
                entry(test_exec_protecthome),
                entry(test_exec_protectkernelmodules),
                entry(test_exec_readonlypaths),
                entry(test_exec_readwritepaths),
                entry(test_exec_restrictnamespaces),
                entry(test_exec_runtimedirectory),
                entry(test_exec_specifier),
                entry(test_exec_standardinput),
                entry(test_exec_standardoutput),
                entry(test_exec_standardoutput_append),
                entry(test_exec_standardoutput_truncate),
                entry(test_exec_supplementarygroups),
                entry(test_exec_systemcallerrornumber),
                entry(test_exec_systemcallfilter),
                entry(test_exec_systemcallfilter_system),
                entry(test_exec_temporaryfilesystem),
                entry(test_exec_umask),
                entry(test_exec_umask_namespace),
                entry(test_exec_unsetenvironment),
                entry(test_exec_user),
                entry(test_exec_workingdirectory),
                {},
        };

        /* Drop user/session related variables before the manager is created. */
        assert_se(unsetenv("USER") == 0);
        assert_se(unsetenv("LOGNAME") == 0);
        assert_se(unsetenv("SHELL") == 0);
        assert_se(unsetenv("HOME") == 0);
        assert_se(unsetenv("TMPDIR") == 0);

        /* Unset VARx, especially, VAR1, VAR2 and VAR3, which are used in the PassEnvironment test cases,
         * otherwise (and if they are present in the environment), `manager_default_environment` will copy
         * them into the default environment which is passed to each created job, which will make the tests
         * that expect those not to be present to fail. */
        assert_se(unsetenv("VAR1") == 0);
        assert_se(unsetenv("VAR2") == 0);
        assert_se(unsetenv("VAR3") == 0);
        assert_se(unsetenv("VAR4") == 0);
        assert_se(unsetenv("VAR5") == 0);

        /* Unit search path: PRIVATE_UNIT_DIR followed by "systemd/user" below the fake runtime
         * directory. user_runtime_unit_dir is not a local of this function; it is assigned here and
         * not released by any cleanup attribute in this scope. */
        assert_se(runtime_dir = setup_fake_runtime_dir());
        assert_se(user_runtime_unit_dir = path_join(runtime_dir, "systemd/user"));
        assert_se(unit_paths = strjoin(PRIVATE_UNIT_DIR, ":", user_runtime_unit_dir));
        assert_se(set_unit_path(unit_paths) >= 0);

        r = manager_new(scope, MANAGER_TEST_RUN_BASIC, &m);
        if (manager_errno_skip_test(r))
                return (void) log_tests_skipped_errno(r, "manager_new");
        assert_se(r >= 0);

        m->defaults.std_output = EXEC_OUTPUT_NULL; /* don't rely on host journald */
        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);

        /* Uncomment below if you want to make debugging logs stored to journal. */
        //manager_override_log_target(m, LOG_TARGET_AUTO);
        //manager_override_log_level(m, LOG_DEBUG);

        /* Measure and print the time that it takes to run tests, excluding startup of the manager object,
         * to try and measure latency of spawning services */
        n_ran_tests = 0; /* counter reported in the summary below; reset per run_tests() invocation */
        start = now(CLOCK_MONOTONIC);

        /* Walk the table until the terminating entry whose function pointer is NULL. */
        for (const test_entry *test = tests; test->f; test++)
                if (strv_fnmatch_or_empty(patterns, test->name, FNM_NOESCAPE))
                        test->f(m);
                else
                        log_info("Skipping %s because it does not match any pattern.", test->name);

        finish = now(CLOCK_MONOTONIC);

        log_info("ran %u tests with %s manager + unshare=%s in: %s",
                 n_ran_tests,
                 scope == RUNTIME_SCOPE_SYSTEM ? "system" : "user",
                 yes_no(can_unshare),
                 FORMAT_TIMESPAN(finish - start, USEC_PER_MSEC));
}
/* Runs the whole suite as system manager with can_unshare set to false, i.e. with the tests
 * expecting unshare() not to work.
 *
 * If have_namespaces() reports false, the suite is run directly in this process. Otherwise, when
 * built with seccomp support and seccomp is available, the suite is run in the process for which
 * prepare_ns() returns 0, after loading a seccomp filter that makes unshare() fail with
 * EOPNOTSUPP. In every other case the test is logged as skipped. */
TEST(run_tests_without_unshare) {
        if (!have_namespaces()) {
                /* unshare() is already filtered. */
                can_unshare = false;
                run_tests(RUNTIME_SCOPE_SYSTEM, strv_skip(saved_argv, 1));
                return;
        }

#if HAVE_SECCOMP
        _cleanup_strv_free_ char **filters = NULL;
        int r;

        /* The following tests are for 1beab8b0d0ff2d7d1436b52d4a0c3d56dc908962. */
        if (!is_seccomp_available())
                return (void) log_tests_skipped("Seccomp not available, cannot run unshare() filtered tests");

        /* safe_fork() clears saved_argv in the child process. Let's copy it. */
        assert_se(filters = strv_copy(strv_skip(saved_argv, 1)));

        if (prepare_ns("(test-execute-without-unshare)") == 0) {
                _cleanup_hashmap_free_ Hashmap *s = NULL;

                /* Build a one-entry syscall set for unshare() and load it with default action ALLOW
                 * and per-syscall action ERRNO(EOPNOTSUPP).
                 * NOTE(review): the key is stored as syscall number + 1, presumably so that syscall
                 * number 0 does not become a NULL key — confirm against
                 * seccomp_load_syscall_filter_set_raw(). */
                r = seccomp_syscall_resolve_name("unshare");
                assert_se(r != __NR_SCMP_ERROR);
                assert_se(hashmap_ensure_put(&s, NULL, UINT32_TO_PTR(r + 1), INT_TO_PTR(-1)) >= 0);
                assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EOPNOTSUPP), true) >= 0);

                /* Check unshare() is actually filtered. */
                assert_se(unshare(CLONE_NEWNS) < 0);
                assert_se(errno == EOPNOTSUPP);

                can_unshare = false;
                run_tests(RUNTIME_SCOPE_SYSTEM, filters);
                /* Leave via _exit() so this process never returns into the caller's test harness. */
                _exit(EXIT_SUCCESS);
        }
#else
        log_tests_skipped("Built without seccomp support, cannot run unshare() filtered tests");
#endif
}
*/ + assert_se(filters = strv_copy(strv_skip(saved_argv, 1))); + + if (prepare_ns("(test-execute-unprivileged)") == 0) { + assert_se(capability_bounding_set_drop(0, /* right_now = */ true) >= 0); + + can_unshare = false; + run_tests(RUNTIME_SCOPE_USER, filters); + _exit(EXIT_SUCCESS); + } +} + +static int intro(void) { +#if HAS_FEATURE_ADDRESS_SANITIZER + if (strstr_ptr(ci_environment(), "travis") || strstr_ptr(ci_environment(), "github-actions")) + return log_tests_skipped("Running on Travis CI/GH Actions under ASan, see https://github.com/systemd/systemd/issues/10696"); +#endif + /* It is needed otherwise cgroup creation fails */ + if (geteuid() != 0 || have_effective_cap(CAP_SYS_ADMIN) <= 0) + return log_tests_skipped("not privileged"); + + if (enter_cgroup_subroot(NULL) == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + if (path_is_read_only_fs("/sys") > 0) + return log_tests_skipped("/sys is mounted read-only"); + + /* Create dummy network interface for testing PrivateNetwork=yes */ + have_net_dummy = system("ip link add dummy-test-exec type dummy") == 0; + + if (have_net_dummy) { + /* Create a network namespace and a dummy interface in it for NetworkNamespacePath= */ + have_netns = system("ip netns add test-execute-netns") == 0; + have_netns = have_netns && system("ip netns exec test-execute-netns ip link add dummy-test-ns type dummy") == 0; + } + + return EXIT_SUCCESS; +} + +static int outro(void) { + if (have_net_dummy) { + (void) system("ip link del dummy-test-exec"); + (void) system("ip netns del test-execute-netns"); + } + + (void) rmdir(PRIVATE_UNIT_DIR); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_FULL(LOG_DEBUG, intro, outro); diff --git a/src/test/test-execve.c b/src/test/test-execve.c new file mode 100644 index 0000000..e7a9a51 --- /dev/null +++ b/src/test/test-execve.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "exec-util.h" +#include "fd-util.h" +#include "log.h" +#include "main-func.h" 
+#include "strv.h" +#include "tests.h" + +/* This program can be used to call programs through fexecve / execveat(…, "", …, AT_EMPTY_PATH), + * when compiled with -Dfexecve=true, and the fallback paths, when -Dfexecve=false. + * + * Example: + * $ strace -e execveat build/test-execve /bin/grep Name /proc/self/status + * execveat(3, "", ["/bin/grep", "Name", "/proc/self/status"], NULL, AT_EMPTY_PATH) = 0 + * Name: 3 + * + * FIXME: use the new kernel api to set COMM properly when the kernel makes that available. + * C.f. ceedbf8185fc7593366679f02d31da63af8c4bd1. + */ + +static int run(int argc, char **argv) { + _cleanup_close_ int fd = -EBADF; + char **args = strv_skip(argv, 1); + int r; + + test_setup_logging(LOG_DEBUG); + + args = !strv_isempty(args) ? args : STRV_MAKE("/bin/true"); + + fd = open(args[0], O_RDONLY | O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "open(%s) failed: %m", args[0]); + + r = fexecve_or_execve(fd, args[0], args, NULL); + assert_se(r < 0); + return log_error_errno(r, "fexecve_or_execve(%s) failed: %m", args[0]); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-exit-status.c b/src/test/test-exit-status.c new file mode 100644 index 0000000..86d3976 --- /dev/null +++ b/src/test/test-exit-status.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "exit-status.h" +#include "string-util.h" +#include "tests.h" + +TEST(exit_status_to_string) { + for (int i = -1; i <= 256; i++) { + const char *s, *class; + + s = exit_status_to_string(i, EXIT_STATUS_FULL); + class = exit_status_class(i); + log_info("%d: %s%s%s%s", + i, s ?: "-", + class ? " (" : "", strempty(class), class ? 
")" : ""); + + if (s) + assert_se(exit_status_from_string(s) == i); + } +} + +TEST(exit_status_from_string) { + assert_se(exit_status_from_string("11") == 11); + assert_se(exit_status_from_string("-1") == -ERANGE); + assert_se(exit_status_from_string("256") == -ERANGE); + assert_se(exit_status_from_string("foo") == -EINVAL); + assert_se(exit_status_from_string("SUCCESS") == 0); + assert_se(exit_status_from_string("FAILURE") == 1); +} + +TEST(exit_status_NUMA_POLICY) { + assert_se(streq(exit_status_to_string(EXIT_NUMA_POLICY, EXIT_STATUS_FULL), "NUMA_POLICY")); + assert_se(streq(exit_status_to_string(EXIT_NUMA_POLICY, EXIT_STATUS_SYSTEMD), "NUMA_POLICY")); + assert_se(!exit_status_to_string(EXIT_NUMA_POLICY, EXIT_STATUS_BSD)); + assert_se(!exit_status_to_string(EXIT_NUMA_POLICY, EXIT_STATUS_LSB)); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-extract-word.c b/src/test/test-extract-word.c new file mode 100644 index 0000000..6e12fbe --- /dev/null +++ b/src/test/test-extract-word.c @@ -0,0 +1,763 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "extract-word.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" + +TEST(extract_first_word) { + const char *p, *original; + char *t; + + p = original = "foobar waldo"; + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "foobar")); + free(t); + assert_se(p == original + 7); + + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "waldo")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word(&p, &t, NULL, 0) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "\"foobar\" \'waldo\'"; + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "\"foobar\"")); + free(t); + assert_se(p == original + 9); + + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "\'waldo\'")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word(&p, &t, 
NULL, 0) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "\"foobar\" \'waldo\'"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "foobar")); + free(t); + assert_se(p == original + 9); + + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "waldo")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word(&p, &t, NULL, 0) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "\""; + assert_se(extract_first_word(&p, &t, NULL, 0) == 1); + assert_se(streq(t, "\"")); + free(t); + assert_se(isempty(p)); + + p = original = "\""; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) == -EINVAL); + assert_se(p == original + 1); + + p = original = "\'"; + assert_se(extract_first_word(&p, &t, NULL, 0) == 1); + assert_se(streq(t, "\'")); + free(t); + assert_se(isempty(p)); + + p = original = "\'"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) == -EINVAL); + assert_se(p == original + 1); + + p = original = "\'fooo"; + assert_se(extract_first_word(&p, &t, NULL, 0) == 1); + assert_se(streq(t, "\'fooo")); + free(t); + assert_se(isempty(p)); + + p = original = "KEY=val \"KEY2=val with space\" \"KEY3=val with \\\"quotation\\\"\""; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) == 1); + assert_se(streq(t, "KEY=val")); + free(t); + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) == 1); + assert_se(streq(t, "KEY2=val with space")); + free(t); + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) == 1); + assert_se(streq(t, "KEY3=val with \"quotation\"")); + free(t); + assert_se(isempty(p)); + + p = original = "KEY=val \"KEY2=val space\" \"KEY3=val with \\\"quotation\\\"\""; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_RETAIN_ESCAPE) == 1); + assert_se(streq(t, "KEY=val")); + free(t); + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_RETAIN_ESCAPE) == 1); + assert_se(streq(t, "\"KEY2=val")); + 
free(t); + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_RETAIN_ESCAPE) == 1); + assert_se(streq(t, "space\"")); + free(t); + assert_se(startswith(p, "\"KEY3=")); + + p = original = "\'fooo"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) == -EINVAL); + assert_se(p == original + 5); + + p = original = "\'fooo"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX) > 0); + assert_se(streq(t, "fooo")); + free(t); + assert_se(isempty(p)); + + p = original = "\"fooo"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX) > 0); + assert_se(streq(t, "fooo")); + free(t); + assert_se(isempty(p)); + + p = original = "yay\'foo\'bar"; + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "yay\'foo\'bar")); + free(t); + assert_se(isempty(p)); + + p = original = "yay\'foo\'bar"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "yayfoobar")); + free(t); + assert_se(isempty(p)); + + p = original = " foobar "; + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "foobar")); + free(t); + assert_se(isempty(p)); + + p = original = " foo\\ba\\x6ar "; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE) > 0); + assert_se(streq(t, "foo\ba\x6ar")); + free(t); + assert_se(isempty(p)); + + p = original = " foo\\ba\\x6ar "; + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "foobax6ar")); + free(t); + assert_se(isempty(p)); + + p = original = " f\\u00f6o \"pi\\U0001F4A9le\" "; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE) > 0); + assert_se(streq(t, "föo")); + free(t); + assert_se(p == original + 13); + + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE) > 0); + assert_se(streq(t, "pi\360\237\222\251le")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_RELAX) > 0); + assert_se(streq(t, "fooo")); + 
free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNESCAPE_RELAX) > 0); + assert_se(streq(t, "fooo\\")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNESCAPE_RELAX|EXTRACT_RELAX) > 0); + assert_se(streq(t, "fooo\\")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX) > 0); + assert_se(streq(t, "fooo\\")); + free(t); + assert_se(isempty(p)); + + p = original = "\"foo\\"; + assert_se(extract_first_word(&p, &t, NULL, 0) == -EINVAL); + assert_se(p == original + 5); + + p = original = "\"foo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX) > 0); + assert_se(streq(t, "foo")); + free(t); + assert_se(isempty(p)); + + p = original = "foo::bar"; + assert_se(extract_first_word(&p, &t, ":", 0) == 1); + assert_se(streq(t, "foo")); + free(t); + assert_se(p == original + 5); + + assert_se(extract_first_word(&p, &t, ":", 0) == 1); + assert_se(streq(t, "bar")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word(&p, &t, ":", 0) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "foo\\:bar::waldo"; + assert_se(extract_first_word(&p, &t, ":", 0) == 1); + assert_se(streq(t, "foo:bar")); + free(t); + assert_se(p == original + 10); + + assert_se(extract_first_word(&p, &t, ":", 0) == 1); + assert_se(streq(t, "waldo")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word(&p, &t, ":", 0) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "\"foo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_UNESCAPE_RELAX) == -EINVAL); + assert_se(p == original + 5); + + p = original = "\"foo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_UNESCAPE_RELAX|EXTRACT_RELAX) > 0); + assert_se(streq(t, "foo\\")); + free(t); + 
assert_se(isempty(p)); + + p = original = "\"foo\\"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX|EXTRACT_RELAX) > 0); + assert_se(streq(t, "foo\\")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_RELAX) > 0); + assert_se(streq(t, "fooo bar")); + free(t); + assert_se(p == original + 10); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNESCAPE_RELAX) > 0); + assert_se(streq(t, "fooo bar")); + free(t); + assert_se(p == original + 10); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNESCAPE_RELAX|EXTRACT_RELAX) > 0); + assert_se(streq(t, "fooo bar")); + free(t); + assert_se(p == original + 10); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE) == -EINVAL); + assert_se(p == original + 5); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX) > 0); + assert_se(streq(t, "fooo\\ bar")); + free(t); + assert_se(p == original + 10); + + p = original = "\\w+@\\K[\\d.]+"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE) == -EINVAL); + assert_se(p == original + 1); + + p = original = "\\w+@\\K[\\d.]+"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX) > 0); + assert_se(streq(t, "\\w+@\\K[\\d.]+")); + free(t); + assert_se(isempty(p)); + + p = original = "\\w+\\b"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_RELAX) > 0); + assert_se(streq(t, "\\w+\b")); + free(t); + assert_se(isempty(p)); + + p = original = "-N ''"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "-N")); + free(t); + assert_se(p == original + 3); + + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "")); + 
free(t); + assert_se(isempty(p)); + + p = original = ":foo\\:bar::waldo:"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(t); + assert_se(streq(t, "")); + free(t); + assert_se(p == original + 1); + + assert_se(extract_first_word(&p, &t, ":", EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(streq(t, "foo:bar")); + free(t); + assert_se(p == original + 10); + + assert_se(extract_first_word(&p, &t, ":", EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(t); + assert_se(streq(t, "")); + free(t); + assert_se(p == original + 11); + + assert_se(extract_first_word(&p, &t, ":", EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(streq(t, "waldo")); + free(t); + assert_se(p == original + 17); + + assert_se(extract_first_word(&p, &t, ":", EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(streq(t, "")); + free(t); + assert_se(p == NULL); + + assert_se(extract_first_word(&p, &t, ":", EXTRACT_DONT_COALESCE_SEPARATORS) == 0); + assert_se(!t); + assert_se(!p); + + p = "foo\\xbar"; + assert_se(extract_first_word(&p, &t, NULL, 0) > 0); + assert_se(streq(t, "fooxbar")); + free(t); + assert_se(p == NULL); + + p = "foo\\xbar"; + assert_se(extract_first_word(&p, &t, NULL, EXTRACT_RETAIN_ESCAPE) > 0); + assert_se(streq(t, "foo\\xbar")); + free(t); + assert_se(p == NULL); + + p = "\\:"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, ":")); + free(t); + assert_se(p == NULL); + + p = "a\\:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a:b")); + free(t); + assert_se(p == NULL); + + p = "a\\ b:c"; + assert_se(extract_first_word(&p, &t, WHITESPACE ":", EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a b")); + free(t); + assert_se(extract_first_word(&p, &t, WHITESPACE ":", EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "c")); + free(t); + assert_se(p == NULL); + + p = "a\\ b:c\\x"; + 
assert_se(extract_first_word(&p, &t, ":", EXTRACT_UNESCAPE_SEPARATORS) == -EINVAL); + + p = "a\\\\ b:c\\\\x"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a\\ b")); + free(t); + assert_se(extract_first_word(&p, &t, ":", EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "c\\x")); + free(t); + assert_se(p == NULL); + + p = "\\:"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, ":")); + free(t); + assert_se(p == NULL); + + p = "a\\:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a:b")); + free(t); + assert_se(p == NULL); + + p = "a\\ b:c"; + assert_se(extract_first_word(&p, &t, WHITESPACE ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a b")); + free(t); + assert_se(extract_first_word(&p, &t, WHITESPACE ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "c")); + free(t); + assert_se(p == NULL); + + p = "a\\ b:c\\x"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == -EINVAL); + + p = "a\\\\ b:c\\\\x"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a\\ b")); + free(t); + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "c\\x")); + free(t); + assert_se(p == NULL); + + p = "\\:"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE) == -EINVAL); + + p = "a\\:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE) == -EINVAL); + assert_se(extract_first_word(&p, &t, ":", EXTRACT_CUNESCAPE) == 1); + assert_se(streq(t, "b")); + free(t); + + p = "a\\ b:c"; + assert_se(extract_first_word(&p, &t, WHITESPACE ":", EXTRACT_CUNESCAPE) == -EINVAL); + assert_se(extract_first_word(&p, &t, WHITESPACE ":", 
EXTRACT_CUNESCAPE) == 1); + assert_se(streq(t, "b")); + free(t); + assert_se(extract_first_word(&p, &t, WHITESPACE ":", EXTRACT_CUNESCAPE) == 1); + assert_se(streq(t, "c")); + free(t); + assert_se(p == NULL); + + p = original = "foobar=\"waldo\"maldo, baldo"; + assert_se(extract_first_word(&p, &t, "=\", ", 0) > 0); + assert_se(streq(t, "foobar")); + free(t); + assert_se(extract_first_word(&p, &t, "=\", ", 0) > 0); + assert_se(streq(t, "waldo")); + free(t); + assert_se(extract_first_word(&p, &t, "=\", ", 0) > 0); + assert_se(streq(t, "maldo")); + free(t); + assert_se(extract_first_word(&p, &t, "=\", ", 0) > 0); + assert_se(streq(t, "baldo")); + free(t); + + p = original = "mode=\"1777\",size=\"10%\",nr_inodes=\"400\"k,uid=\"496,,107\"520,gi\"\"'d=49610,'\"\"7520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\""; + assert_se(extract_first_word(&p, &t, ",", EXTRACT_KEEP_QUOTE) > 0); + assert_se(streq(t, "mode=\"1777\"")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_KEEP_QUOTE) > 0); + assert_se(streq(t, "size=\"10%\"")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_KEEP_QUOTE) > 0); + assert_se(streq(t, "nr_inodes=\"400\"k")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_KEEP_QUOTE) > 0); + assert_se(streq(t, "uid=\"496,,107\"520")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_KEEP_QUOTE) > 0); + assert_se(streq(t, "gi\"\"'d=49610,'\"\"7520")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_KEEP_QUOTE) > 0); + assert_se(streq(t, "context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\"")); + free(t); + + p = original = "mode=\"1777\",size=\"10%\",nr_inodes=\"400\"k,uid=\"496,,107\"520,gi\"\"'d=49610,'\"\"7520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\""; + assert_se(extract_first_word(&p, &t, ",", EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "mode=1777")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_UNQUOTE) > 0); + 
assert_se(streq(t, "size=10%")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "nr_inodes=400k")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "uid=496,,107520")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "gid=49610,7520")); + free(t); + assert_se(extract_first_word(&p, &t, ",", EXTRACT_UNQUOTE) > 0); + assert_se(streq(t, "context=system_u:object_r:svirt_sandbox_file_t:s0:c0,c1")); + free(t); + + p = "a:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_RETAIN_SEPARATORS) == 1); + assert_se(streq(t, "a")); + assert_se(streq(p, ":b")); + free(t); + assert_se(extract_first_word(&p, &t, ":", EXTRACT_RETAIN_SEPARATORS) == 1); + assert_se(streq(t, "b")); + free(t); + + p = "a>:b"; + assert_se(extract_first_word(&p, &t, ">:", EXTRACT_RETAIN_SEPARATORS) == 1); + assert_se(streq(t, "a")); + assert_se(streq(p, ">:b")); + free(t); + assert_se(extract_first_word(&p, &t, ">:", EXTRACT_RETAIN_SEPARATORS) == 1); + assert_se(streq(t, "b")); + free(t); + + p = "a>:b"; + assert_se(extract_first_word(&p, &t, ">:", EXTRACT_RETAIN_SEPARATORS|EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(streq(t, "a")); + assert_se(streq(p, ">:b")); + free(t); + assert_se(extract_first_word(&p, &t, ">:", EXTRACT_RETAIN_SEPARATORS|EXTRACT_DONT_COALESCE_SEPARATORS) == 1); + assert_se(streq(t, "")); + assert_se(streq(p, ">:b")); + free(t); + + p = "a\\:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_RETAIN_SEPARATORS|EXTRACT_RETAIN_ESCAPE) == 1); + assert_se(streq(t, "a\\")); + assert_se(streq(p, ":b")); + free(t); + + p = "a\\:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_RETAIN_SEPARATORS) == 1); + assert_se(streq(t, "a:b")); + assert_se(!p); + free(t); + + p = "a\\:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_RETAIN_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a:b")); + 
assert_se(!p); + free(t); + + p = "a\\:a:b"; + assert_se(extract_first_word(&p, &t, ":", EXTRACT_RETAIN_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS) == 1); + assert_se(streq(t, "a:a")); + assert_se(streq(p, ":b")); + free(t); +} + +TEST(extract_first_word_and_warn) { + const char *p, *original; + char *t; + + p = original = "foobar waldo"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "foobar")); + free(t); + assert_se(p == original + 7); + + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "waldo")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "\"foobar\" \'waldo\'"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "foobar")); + free(t); + assert_se(p == original + 9); + + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "waldo")); + free(t); + assert_se(isempty(p)); + + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) == 0); + assert_se(!t); + assert_se(isempty(p)); + + p = original = "\""; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE, NULL, "fake", 1, original) == -EINVAL); + assert_se(p == original + 1); + + p = original = "\'"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE, NULL, "fake", 1, original) == -EINVAL); + assert_se(p == original + 1); + + p = original = "\'fooo"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE, NULL, "fake", 1, original) == -EINVAL); + assert_se(p == original + 5); + + p = original = "\'fooo"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX, NULL, "fake", 1, original) > 0); + 
assert_se(streq(t, "fooo")); + free(t); + assert_se(isempty(p)); + + p = original = " foo\\ba\\x6ar "; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "foo\ba\x6ar")); + free(t); + assert_se(isempty(p)); + + p = original = " foo\\ba\\x6ar "; + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "foobax6ar")); + free(t); + assert_se(isempty(p)); + + p = original = " f\\u00f6o \"pi\\U0001F4A9le\" "; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "föo")); + free(t); + assert_se(p == original + 13); + + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "pi\360\237\222\251le")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_RELAX, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "fooo")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "fooo\\")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "fooo\\")); + free(t); + assert_se(isempty(p)); + + p = original = "\"foo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE, NULL, "fake", 1, original) == -EINVAL); + assert_se(p == original + 5); + + p = original = "\"foo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_RELAX, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "foo")); + free(t); + assert_se(isempty(p)); + + p = original = "\"foo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, 
EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, NULL, "fake", 1, original) == -EINVAL); + assert_se(p == original + 5); + + p = original = "\"foo\\"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_RELAX, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "foo")); + free(t); + assert_se(isempty(p)); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_RELAX, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "fooo bar")); + free(t); + assert_se(p == original + 10); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, 0, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "fooo bar")); + free(t); + assert_se(p == original + 10); + + p = original = "fooo\\ bar quux"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "fooo\\ bar")); + free(t); + assert_se(p == original + 10); + + p = original = "\\w+@\\K[\\d.]+"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "\\w+@\\K[\\d.]+")); + free(t); + assert_se(isempty(p)); + + p = original = "\\w+\\b"; + assert_se(extract_first_word_and_warn(&p, &t, NULL, EXTRACT_CUNESCAPE, NULL, "fake", 1, original) > 0); + assert_se(streq(t, "\\w+\b")); + free(t); + assert_se(isempty(p)); +} + +TEST(extract_many_words) { + const char *p, *original; + char *a, *b, *c, *d, *e, *f; + + p = original = "foobar waldi piep"; + assert_se(extract_many_words(&p, NULL, 0, &a, &b, &c, NULL) == 3); + assert_se(isempty(p)); + assert_se(streq_ptr(a, "foobar")); + assert_se(streq_ptr(b, "waldi")); + assert_se(streq_ptr(c, "piep")); + free(a); + free(b); + free(c); + + p = original = "foobar:waldi:piep ba1:ba2"; + assert_se(extract_many_words(&p, ":" WHITESPACE, 0, &a, &b, &c, NULL) == 3); + assert_se(!isempty(p)); + assert_se(streq_ptr(a, "foobar")); + assert_se(streq_ptr(b, 
"waldi")); + assert_se(streq_ptr(c, "piep")); + assert_se(extract_many_words(&p, ":" WHITESPACE, 0, &d, &e, &f, NULL) == 2); + assert_se(isempty(p)); + assert_se(streq_ptr(d, "ba1")); + assert_se(streq_ptr(e, "ba2")); + assert_se(isempty(f)); + free(a); + free(b); + free(c); + free(d); + free(e); + free(f); + + p = original = "'foobar' wa\"ld\"i "; + assert_se(extract_many_words(&p, NULL, 0, &a, &b, &c, NULL) == 2); + assert_se(isempty(p)); + assert_se(streq_ptr(a, "'foobar'")); + assert_se(streq_ptr(b, "wa\"ld\"i")); + assert_se(streq_ptr(c, NULL)); + free(a); + free(b); + + p = original = "'foobar' wa\"ld\"i "; + assert_se(extract_many_words(&p, NULL, EXTRACT_UNQUOTE, &a, &b, &c, NULL) == 2); + assert_se(isempty(p)); + assert_se(streq_ptr(a, "foobar")); + assert_se(streq_ptr(b, "waldi")); + assert_se(streq_ptr(c, NULL)); + free(a); + free(b); + + p = original = ""; + assert_se(extract_many_words(&p, NULL, 0, &a, &b, &c, NULL) == 0); + assert_se(isempty(p)); + assert_se(streq_ptr(a, NULL)); + assert_se(streq_ptr(b, NULL)); + assert_se(streq_ptr(c, NULL)); + + p = original = " "; + assert_se(extract_many_words(&p, NULL, 0, &a, &b, &c, NULL) == 0); + assert_se(isempty(p)); + assert_se(streq_ptr(a, NULL)); + assert_se(streq_ptr(b, NULL)); + assert_se(streq_ptr(c, NULL)); + + p = original = "foobar"; + assert_se(extract_many_words(&p, NULL, 0, NULL) == 0); + assert_se(p == original); + + p = original = "foobar waldi"; + assert_se(extract_many_words(&p, NULL, 0, &a, NULL) == 1); + assert_se(p == original+7); + assert_se(streq_ptr(a, "foobar")); + free(a); + + p = original = " foobar "; + assert_se(extract_many_words(&p, NULL, 0, &a, NULL) == 1); + assert_se(isempty(p)); + assert_se(streq_ptr(a, "foobar")); + free(a); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-fd-util.c b/src/test/test-fd-util.c new file mode 100644 index 0000000..021d4b4 --- /dev/null +++ b/src/test/test-fd-util.c @@ -0,0 +1,765 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + 
+#include +#include +#include +#include + +#include "alloc-util.h" +#include "data-fd-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "macro.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "mkdir.h" +#include "mount-util.h" +#include "namespace-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "seccomp-util.h" +#include "serialize.h" +#include "stat-util.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(close_many) { + int fds[3]; + _cleanup_(unlink_tempfilep) char name0[] = "/tmp/test-close-many.XXXXXX"; + _cleanup_(unlink_tempfilep) char name1[] = "/tmp/test-close-many.XXXXXX"; + _cleanup_(unlink_tempfilep) char name2[] = "/tmp/test-close-many.XXXXXX"; + + /* Create three temporary files; fail early if any could not be created, so that the checks below operate on real fds. */ + fds[0] = mkostemp_safe(name0); + assert_se(fds[0] >= 0); + fds[1] = mkostemp_safe(name1); + assert_se(fds[1] >= 0); + fds[2] = mkostemp_safe(name2); + assert_se(fds[2] >= 0); + + /* Only the first two entries are passed, so the third fd must stay open. */ + close_many(fds, 2); + + assert_se(fcntl(fds[0], F_GETFD) == -1); + assert_se(fcntl(fds[1], F_GETFD) == -1); + assert_se(fcntl(fds[2], F_GETFD) >= 0); + + safe_close(fds[2]); +} + +TEST(close_nointr) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-test-close_nointr.XXXXXX"; + int fd; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se(close_nointr(fd) >= 0); + assert_se(close_nointr(fd) < 0); +} + +TEST(same_fd) { + /* Start from invalid fds so the cleanup handlers never see indeterminate values. */ + _cleanup_close_pair_ int p[2] = EBADF_PAIR; + _cleanup_close_ int a = -EBADF, b = -EBADF, c = -EBADF; + + assert_se(pipe2(p, O_CLOEXEC) >= 0); + assert_se((a = fcntl(p[0], F_DUPFD, 3)) >= 0); + assert_se((b = open("/dev/null", O_RDONLY|O_CLOEXEC)) >= 0); + assert_se((c = fcntl(a, F_DUPFD, 3)) >= 0); + + assert_se(same_fd(p[0], p[0]) > 0); + assert_se(same_fd(p[1], p[1]) > 0); + assert_se(same_fd(a, a) > 0); + assert_se(same_fd(b, b) > 0); + + assert_se(same_fd(a, p[0]) > 0); + assert_se(same_fd(p[0], a) > 0); + assert_se(same_fd(c, p[0]) > 0); + assert_se(same_fd(p[0], c) > 0); + assert_se(same_fd(a, c) > 0); + assert_se(same_fd(c, a) > 0); +
+ assert_se(same_fd(p[0], p[1]) == 0); + assert_se(same_fd(p[1], p[0]) == 0); + assert_se(same_fd(p[0], b) == 0); + assert_se(same_fd(b, p[0]) == 0); + assert_se(same_fd(p[1], a) == 0); + assert_se(same_fd(a, p[1]) == 0); + assert_se(same_fd(p[1], b) == 0); + assert_se(same_fd(b, p[1]) == 0); + + assert_se(same_fd(a, b) == 0); + assert_se(same_fd(b, a) == 0); +} + +TEST(open_serialization_fd) { + _cleanup_close_ int fd = -EBADF; + + fd = open_serialization_fd("test"); + assert_se(fd >= 0); + + assert_se(write(fd, "test\n", 5) == 5); +} + +TEST(open_serialization_file) { + _cleanup_fclose_ FILE *f = NULL; + int r; + + r = open_serialization_file("test", &f); + assert_se(r >= 0); + assert_se(f); + + assert_se(fwrite("test\n", 1, 5, f) == 5); +} + +TEST(fd_move_above_stdio) { + int original_stdin, new_fd; + + /* Park a copy of the real stdin above the stdio range so it can be restored at the end. */ + original_stdin = fcntl(0, F_DUPFD, 3); + assert_se(original_stdin >= 3); + /* close_nointr() reports failure as a negative errno-style value, hence compare against -EBADF. */ + assert_se(close_nointr(0) != -EBADF); + + new_fd = open("/dev/null", O_RDONLY); + assert_se(new_fd == 0); + + new_fd = fd_move_above_stdio(new_fd); + assert_se(new_fd >= 3); + + assert_se(dup(original_stdin) == 0); + assert_se(close_nointr(original_stdin) != -EBADF); + assert_se(close_nointr(new_fd) != -EBADF); +} + +TEST(rearrange_stdio) { + pid_t pid; + int r; + + r = safe_fork("rearrange", FORK_WAIT|FORK_LOG, &pid); + assert_se(r >= 0); + + if (r == 0) { + _cleanup_free_ char *path = NULL; + char buffer[10]; + + /* Child */ + + safe_close(STDERR_FILENO); /* Let's close an fd <= 2, to make it more interesting */ + + assert_se(rearrange_stdio(-EBADF, -EBADF, -EBADF) >= 0); + + assert_se(fd_get_path(STDIN_FILENO, &path) >= 0); + assert_se(path_equal(path, "/dev/null")); + path = mfree(path); + + assert_se(fd_get_path(STDOUT_FILENO, &path) >= 0); + assert_se(path_equal(path, "/dev/null")); + path = mfree(path); + + /* stderr was closed before rearrange_stdio() ran, so it must now point to /dev/null as well. */ + assert_se(fd_get_path(STDERR_FILENO, &path) >= 0); + assert_se(path_equal(path, "/dev/null")); + path = mfree(path); + + safe_close(STDIN_FILENO); + safe_close(STDOUT_FILENO); +
safe_close(STDERR_FILENO); + + { + int pair[2]; + assert_se(pipe(pair) >= 0); + assert_se(pair[0] == 0); + assert_se(pair[1] == 1); + assert_se(fd_move_above_stdio(0) == 3); + } + assert_se(open("/dev/full", O_WRONLY|O_CLOEXEC) == 0); + assert_se(acquire_data_fd("foobar", 6, 0) == 2); + + assert_se(rearrange_stdio(2, 0, 1) >= 0); + + assert_se(write(1, "x", 1) < 0 && errno == ENOSPC); + assert_se(write(2, "z", 1) == 1); + assert_se(read(3, buffer, sizeof(buffer)) == 1); + assert_se(buffer[0] == 'z'); + assert_se(read(0, buffer, sizeof(buffer)) == 6); + assert_se(memcmp(buffer, "foobar", 6) == 0); + + assert_se(rearrange_stdio(-EBADF, 1, 2) >= 0); + assert_se(write(1, "a", 1) < 0 && errno == ENOSPC); + assert_se(write(2, "y", 1) == 1); + assert_se(read(3, buffer, sizeof(buffer)) == 1); + assert_se(buffer[0] == 'y'); + + assert_se(fd_get_path(0, &path) >= 0); + assert_se(path_equal(path, "/dev/null")); + path = mfree(path); + + _exit(EXIT_SUCCESS); + } +} + +TEST(read_nr_open) { + log_info("nr-open: %i", read_nr_open()); +} + +static size_t validate_fds( + bool opened, + const int *fds, + size_t n_fds) { + + size_t c = 0; + + /* Validates that fds in the specified array are one of the following three: + * + * 1. < 0 (test is skipped) or + * 2. opened (if 'opened' param is true) or + * 3. 
closed (if 'opened' param is false) + */ + + for (size_t i = 0; i < n_fds; i++) { + if (fds[i] < 0) + continue; + + if (opened) + assert_se(fcntl(fds[i], F_GETFD) >= 0); + else + assert_se(fcntl(fds[i], F_GETFD) < 0 && errno == EBADF); + + c++; + } + + return c; /* Return number of fds >= 0 in the array */ +} + +static void test_close_all_fds_inner(void) { + _cleanup_free_ int *fds = NULL, *keep = NULL; + size_t n_fds, n_keep; + int max_fd; + + log_info("/* %s */", __func__); + + rlimit_nofile_bump(-1); + + max_fd = get_max_fd(); + assert_se(max_fd > 10); + + if (max_fd > 7000) { + /* If the worst fallback is activated we need to iterate through all possible fds, hence, + * let's lower the limit a small bit, so that we don't run for too long. Yes, this undoes the + * rlimit_nofile_bump() call above partially. */ + + (void) setrlimit_closest(RLIMIT_NOFILE, &(struct rlimit) { 7000, 7000 }); + max_fd = 7000; + } + + /* Try to use 5000 fds, but when we can't bump the rlimit to make that happen use the whole limit minus 10 */ + n_fds = MIN(((size_t) max_fd & ~1U) - 10U, 5000U); + assert_se((n_fds & 1U) == 0U); /* make sure even number of fds */ + + /* Allocate the determined number of fds, always two at a time */ + assert_se(fds = new(int, n_fds)); + for (size_t i = 0; i < n_fds; i += 2) + assert_se(pipe2(fds + i, O_CLOEXEC) >= 0); + + /* Validate this worked */ + assert_se(validate_fds(true, fds, n_fds) == n_fds); + + /* Randomized number of fds to keep, but at most every second */ + n_keep = (random_u64() % (n_fds / 2)); + + /* Now randomly select a number of fds from the array above to keep */ + assert_se(keep = new(int, n_keep)); + for (size_t k = 0; k < n_keep; k++) { + for (;;) { + size_t p; + + p = random_u64() % n_fds; + if (fds[p] >= 0) { + keep[k] = TAKE_FD(fds[p]); + break; + } + } + } + + /* Check that all fds from both arrays are still open, and test how many in each are >= 0 */ + assert_se(validate_fds(true, fds, n_fds) == n_fds - n_keep); + 
assert_se(validate_fds(true, keep, n_keep) == n_keep); + + /* Close logging fd first, so that we don't confuse it by closing its fd */ + log_close(); + log_set_open_when_needed(true); + log_settle_target(); + + /* Close all but the ones to keep */ + assert_se(close_all_fds(keep, n_keep) >= 0); + + assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep); + assert_se(validate_fds(true, keep, n_keep) == n_keep); + + /* Close everything else too! */ + assert_se(close_all_fds(NULL, 0) >= 0); + + assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep); + assert_se(validate_fds(false, keep, n_keep) == n_keep); + + log_set_open_when_needed(false); + log_open(); +} + +static int seccomp_prohibit_close_range(void) { +#if HAVE_SECCOMP && defined(__SNR_close_range) + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int r; + + r = seccomp_init_for_arch(&seccomp, SCMP_ARCH_NATIVE, SCMP_ACT_ALLOW); + if (r < 0) + return log_warning_errno(r, "Failed to acquire seccomp context, ignoring: %m"); + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(close_range), + 0); + if (r < 0) + return log_warning_errno(r, "Failed to add close_range() rule, ignoring: %m"); + + r = seccomp_load(seccomp); + if (r < 0) + return log_warning_errno(r, "Failed to apply close_range() restrictions, ignoring: %m"); + + return 0; +#else + return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Seccomp support or close_range() syscall definition not available."); +#endif +} + +TEST(close_all_fds) { + int r; + + /* Runs the test four times. Once as is. Once with close_range() syscall blocked via seccomp, once + * with /proc/ overmounted, and once with the combination of both. This should trigger all fallbacks + * in the close_range_all() function. 
*/ + + r = safe_fork("(caf-plain)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL); + if (r == 0) { + test_close_all_fds_inner(); + _exit(EXIT_SUCCESS); + } + assert_se(r >= 0); + + if (geteuid() != 0) + return (void) log_tests_skipped("Lacking privileges for test with close_range() blocked and /proc/ overmounted"); + + r = safe_fork("(caf-noproc)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL); + if (r == 0) { + r = mount_nofollow_verbose(LOG_WARNING, "tmpfs", "/proc", "tmpfs", 0, NULL); + if (r < 0) + log_notice("Overmounting /proc/ didn't work, skipping close_all_fds() with masked /proc/."); + else + test_close_all_fds_inner(); + _exit(EXIT_SUCCESS); + } + assert_se(r >= 0); + + if (!is_seccomp_available()) + return (void) log_tests_skipped("Seccomp not available"); + + r = safe_fork("(caf-seccomp)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL); + if (r == 0) { + r = seccomp_prohibit_close_range(); + if (r < 0) + log_notice("Applying seccomp filter didn't work, skipping close_all_fds() test with masked close_range()."); + else + test_close_all_fds_inner(); + + _exit(EXIT_SUCCESS); + } + assert_se(r >= 0); + + r = safe_fork("(caf-scnp)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL); + if (r == 0) { + r = seccomp_prohibit_close_range(); + if (r < 0) + log_notice("Applying seccomp filter didn't work, skipping close_all_fds() test with masked close_range()."); + else { + r = mount_nofollow_verbose(LOG_WARNING, "tmpfs", "/proc", "tmpfs", 0, NULL); + if (r < 0) + log_notice("Overmounting /proc/ didn't work, skipping close_all_fds() with masked /proc/."); + else + test_close_all_fds_inner(); + } + + /* At this point the inner test has either run with both fallbacks forced, or was skipped with a notice. */ + _exit(EXIT_SUCCESS); + } + assert_se(r >= 0); +} + +TEST(format_proc_fd_path) { + assert_se(streq_ptr(FORMAT_PROC_FD_PATH(0), "/proc/self/fd/0")); +
assert_se(streq_ptr(FORMAT_PROC_FD_PATH(1), "/proc/self/fd/1")); + assert_se(streq_ptr(FORMAT_PROC_FD_PATH(2), "/proc/self/fd/2")); + assert_se(streq_ptr(FORMAT_PROC_FD_PATH(3), "/proc/self/fd/3")); + assert_se(streq_ptr(FORMAT_PROC_FD_PATH(2147483647), "/proc/self/fd/2147483647")); +} + +TEST(fd_reopen) { + _cleanup_close_ int fd1 = -EBADF, fd2 = -EBADF; + struct stat st1, st2; + int fl; + + /* Test this with a directory */ + fd1 = open("/proc", O_DIRECTORY|O_PATH|O_CLOEXEC); + assert_se(fd1 >= 0); + + assert_se(fstat(fd1, &st1) >= 0); + assert_se(S_ISDIR(st1.st_mode)); + + fl = fcntl(fd1, F_GETFL); + assert_se(fl >= 0); + assert_se(FLAGS_SET(fl, O_DIRECTORY)); + assert_se(FLAGS_SET(fl, O_PATH)); + + /* fd_reopen() with O_NOFOLLOW will systematically fail, since it is implemented via a symlink in /proc/self/fd/ */ + assert_se(fd_reopen(fd1, O_RDONLY|O_CLOEXEC|O_NOFOLLOW) == -ELOOP); + assert_se(fd_reopen(fd1, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW) == -ELOOP); + + fd2 = fd_reopen(fd1, O_RDONLY|O_DIRECTORY|O_CLOEXEC); /* drop the O_PATH */ + assert_se(fd2 >= 0); + + assert_se(fstat(fd2, &st2) >= 0); + assert_se(S_ISDIR(st2.st_mode)); + assert_se(stat_inode_same(&st1, &st2)); + + fl = fcntl(fd2, F_GETFL); + assert_se(fl >= 0); + assert_se(FLAGS_SET(fl, O_DIRECTORY)); + assert_se(!FLAGS_SET(fl, O_PATH)); + + safe_close(fd1); + + fd1 = fd_reopen(fd2, O_DIRECTORY|O_PATH|O_CLOEXEC); /* reacquire the O_PATH */ + assert_se(fd1 >= 0); + + assert_se(fstat(fd1, &st1) >= 0); + assert_se(S_ISDIR(st1.st_mode)); + assert_se(stat_inode_same(&st1, &st2)); + + fl = fcntl(fd1, F_GETFL); + assert_se(fl >= 0); + assert_se(FLAGS_SET(fl, O_DIRECTORY)); + assert_se(FLAGS_SET(fl, O_PATH)); + + safe_close(fd1); + + /* And now, test this with a file. 
*/ + fd1 = open("/proc/version", O_PATH|O_CLOEXEC); + assert_se(fd1 >= 0); + + assert_se(fstat(fd1, &st1) >= 0); + assert_se(S_ISREG(st1.st_mode)); + + fl = fcntl(fd1, F_GETFL); + assert_se(fl >= 0); + assert_se(!FLAGS_SET(fl, O_DIRECTORY)); + assert_se(FLAGS_SET(fl, O_PATH)); + + assert_se(fd_reopen(fd1, O_RDONLY|O_DIRECTORY|O_CLOEXEC) == -ENOTDIR); + fd2 = fd_reopen(fd1, O_RDONLY|O_CLOEXEC); /* drop the O_PATH */ + assert_se(fd2 >= 0); + + assert_se(fstat(fd2, &st2) >= 0); + assert_se(S_ISREG(st2.st_mode)); + assert_se(stat_inode_same(&st1, &st2)); + + fl = fcntl(fd2, F_GETFL); + assert_se(fl >= 0); + assert_se(!FLAGS_SET(fl, O_DIRECTORY)); + assert_se(!FLAGS_SET(fl, O_PATH)); + + safe_close(fd1); + + assert_se(fd_reopen(fd2, O_DIRECTORY|O_PATH|O_CLOEXEC) == -ENOTDIR); + fd1 = fd_reopen(fd2, O_PATH|O_CLOEXEC); /* reacquire the O_PATH */ + assert_se(fd1 >= 0); + + assert_se(fstat(fd1, &st1) >= 0); + assert_se(S_ISREG(st1.st_mode)); + assert_se(stat_inode_same(&st1, &st2)); + + fl = fcntl(fd1, F_GETFL); + assert_se(fl >= 0); + assert_se(!FLAGS_SET(fl, O_DIRECTORY)); + assert_se(FLAGS_SET(fl, O_PATH)); + + /* Also check the right error is generated if the fd is already closed */ + safe_close(fd1); + assert_se(fd_reopen(fd1, O_RDONLY|O_CLOEXEC) == -EBADF); + fd1 = -EBADF; + + /* Validate what happens if we reopen a symlink */ + fd1 = open("/proc/self", O_PATH|O_CLOEXEC|O_NOFOLLOW); + assert_se(fd1 >= 0); + assert_se(fstat(fd1, &st1) >= 0); + assert_se(S_ISLNK(st1.st_mode)); + + fd2 = fd_reopen(fd1, O_PATH|O_CLOEXEC); + assert_se(fd2 >= 0); + assert_se(fstat(fd2, &st2) >= 0); + assert_se(S_ISLNK(st2.st_mode)); + assert_se(stat_inode_same(&st1, &st2)); + fd2 = safe_close(fd2); + + /* So here's the thing: if we have an O_PATH fd to a symlink, we *cannot* convert it to a regular fd + * with that. i.e. you cannot have the VFS follow a symlink pinned via an O_PATH fd. 
*/ + assert_se(fd_reopen(fd1, O_RDONLY|O_CLOEXEC) == -ELOOP); +} + +TEST(fd_reopen_condition) { + _cleanup_close_ int fd1 = -EBADF, fd3 = -EBADF; + int fd2, fl; + + /* Open without O_PATH */ + fd1 = open("/usr/", O_RDONLY|O_DIRECTORY|O_CLOEXEC); + assert_se(fd1 >= 0); + + fl = fcntl(fd1, F_GETFL); + assert_se(FLAGS_SET(fl, O_DIRECTORY)); + assert_se(!FLAGS_SET(fl, O_PATH)); + + fd2 = fd_reopen_condition(fd1, O_DIRECTORY, O_DIRECTORY|O_PATH, &fd3); + assert_se(fd2 == fd1); + assert_se(fd3 < 0); + + /* Switch on O_PATH */ + fd2 = fd_reopen_condition(fd1, O_DIRECTORY|O_PATH, O_DIRECTORY|O_PATH, &fd3); + assert_se(fd2 != fd1); + assert_se(fd3 == fd2); + + fl = fcntl(fd2, F_GETFL); + assert_se(FLAGS_SET(fl, O_DIRECTORY)); + assert_se(FLAGS_SET(fl, O_PATH)); + + close_and_replace(fd1, fd3); + + fd2 = fd_reopen_condition(fd1, O_DIRECTORY|O_PATH, O_DIRECTORY|O_PATH, &fd3); + assert_se(fd2 == fd1); + assert_se(fd3 < 0); + + /* Switch off O_PATH again */ + fd2 = fd_reopen_condition(fd1, O_DIRECTORY, O_DIRECTORY|O_PATH, &fd3); + assert_se(fd2 != fd1); + assert_se(fd3 == fd2); + + fl = fcntl(fd2, F_GETFL); + assert_se(FLAGS_SET(fl, O_DIRECTORY)); + assert_se(!FLAGS_SET(fl, O_PATH)); + + close_and_replace(fd1, fd3); + + fd2 = fd_reopen_condition(fd1, O_DIRECTORY, O_DIRECTORY|O_PATH, &fd3); + assert_se(fd2 == fd1); + assert_se(fd3 < 0); +} + +TEST(take_fd) { + _cleanup_close_ int fd1 = -EBADF, fd2 = -EBADF; + int array[2] = EBADF_PAIR, i = 0; + + assert_se(fd1 == -EBADF); + assert_se(fd2 == -EBADF); + + fd1 = eventfd(0, EFD_CLOEXEC); + assert_se(fd1 >= 0); + + fd2 = TAKE_FD(fd1); + assert_se(fd1 == -EBADF); + assert_se(fd2 >= 0); + + assert_se(array[0] == -EBADF); + assert_se(array[1] == -EBADF); + + array[0] = TAKE_FD(fd2); + assert_se(fd1 == -EBADF); + assert_se(fd2 == -EBADF); + assert_se(array[0] >= 0); + assert_se(array[1] == -EBADF); + + array[1] = TAKE_FD(array[i]); + assert_se(array[0] == -EBADF); + assert_se(array[1] >= 0); + + i = 1 - i; + array[0] = TAKE_FD(*(array + 
i)); + assert_se(array[0] >= 0); + assert_se(array[1] == -EBADF); + + i = 1 - i; + fd1 = TAKE_FD(array[i]); + assert_se(fd1 >= 0); + assert_se(array[0] == -EBADF); + assert_se(array[1] == -EBADF); +} + +TEST(dir_fd_is_root) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert_se(dir_fd_is_root_or_cwd(AT_FDCWD) > 0); + + assert_se((fd = open("/", O_CLOEXEC|O_PATH|O_DIRECTORY|O_NOFOLLOW)) >= 0); + assert_se(dir_fd_is_root(fd) > 0); + assert_se(dir_fd_is_root_or_cwd(fd) > 0); + + fd = safe_close(fd); + + assert_se((fd = open("/usr", O_CLOEXEC|O_PATH|O_DIRECTORY|O_NOFOLLOW)) >= 0); + assert_se(dir_fd_is_root(fd) == 0); + assert_se(dir_fd_is_root_or_cwd(fd) == 0); + + r = detach_mount_namespace(); + if (r < 0) + return (void) log_tests_skipped_errno(r, "Failed to detach mount namespace"); + + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_free_ char *x = NULL, *y = NULL; + + assert_se(mkdtemp_malloc("/tmp/test-mkdir-XXXXXX", &tmp) >= 0); + assert_se(x = path_join(tmp, "x")); + assert_se(y = path_join(tmp, "x/y")); + assert_se(mkdir_p(y, 0755) >= 0); + assert_se(mount_nofollow_verbose(LOG_DEBUG, x, y, NULL, MS_BIND, NULL) >= 0); + + fd = safe_close(fd); + + assert_se((fd = open(tmp, O_CLOEXEC|O_PATH|O_DIRECTORY|O_NOFOLLOW)) >= 0); + assert_se(dir_fd_is_root(fd) == 0); + assert_se(dir_fd_is_root_or_cwd(fd) == 0); + + fd = safe_close(fd); + + assert_se((fd = open(x, O_CLOEXEC|O_PATH|O_DIRECTORY|O_NOFOLLOW)) >= 0); + assert_se(dir_fd_is_root(fd) == 0); + assert_se(dir_fd_is_root_or_cwd(fd) == 0); + + fd = safe_close(fd); + + assert_se((fd = open(y, O_CLOEXEC|O_PATH|O_DIRECTORY|O_NOFOLLOW)) >= 0); + assert_se(dir_fd_is_root(fd) == 0); + assert_se(dir_fd_is_root_or_cwd(fd) == 0); +} + +TEST(fd_get_path) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *p = NULL, *q = NULL, *saved_cwd = NULL; + + tfd = mkdtemp_open(NULL, O_PATH, &t); + assert_se(tfd >= 0); + 
assert_se(fd_get_path(tfd, &p) >= 0); + assert_se(streq(p, t)); + + p = mfree(p); + + assert_se(safe_getcwd(&saved_cwd) >= 0); + assert_se(chdir(t) >= 0); + + assert_se(fd_get_path(AT_FDCWD, &p) >= 0); + assert_se(streq(p, t)); + + p = mfree(p); + + assert_se(q = path_join(t, "regular")); + assert_se(touch(q) >= 0); + assert_se(mkdirat_parents(tfd, "subdir/symlink", 0755) >= 0); + assert_se(symlinkat("../regular", tfd, "subdir/symlink") >= 0); + assert_se(symlinkat("subdir", tfd, "symdir") >= 0); + + fd = openat(tfd, "regular", O_CLOEXEC|O_PATH); + assert_se(fd >= 0); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(AT_FDCWD, "regular", O_CLOEXEC|O_PATH); + assert_se(fd >= 0); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(tfd, "subdir/symlink", O_CLOEXEC|O_PATH); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) >= 0); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(AT_FDCWD, "subdir/symlink", O_CLOEXEC|O_PATH); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) >= 0); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(tfd, "symdir//./symlink", O_CLOEXEC|O_PATH); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) >= 0); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(AT_FDCWD, "symdir//./symlink", O_CLOEXEC|O_PATH); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) >= 0); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + q = mfree(q); + fd = safe_close(fd); + + assert_se(q = path_join(t, "subdir/symlink")); + fd = openat(tfd, "subdir/symlink", O_CLOEXEC|O_PATH|O_NOFOLLOW); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) == -ELOOP); + 
assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(AT_FDCWD, "subdir/symlink", O_CLOEXEC|O_PATH|O_NOFOLLOW); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) == -ELOOP); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(tfd, "symdir//./symlink", O_CLOEXEC|O_PATH|O_NOFOLLOW); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) == -ELOOP); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + p = mfree(p); + fd = safe_close(fd); + + fd = openat(AT_FDCWD, "symdir//./symlink", O_CLOEXEC|O_PATH|O_NOFOLLOW); + assert_se(fd >= 0); + assert_se(fd_verify_regular(fd) == -ELOOP); + assert_se(fd_get_path(fd, &p) >= 0); + assert_se(streq(p, q)); + + assert_se(chdir(saved_cwd) >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-fdset.c b/src/test/test-fdset.c new file mode 100644 index 0000000..8f00e59 --- /dev/null +++ b/src/test/test-fdset.c @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "fdset.h" +#include "fs-util.h" +#include "macro.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(fdset_new_fill) { + _cleanup_fdset_free_ FDSet *fdset = NULL; + int fd = -EBADF, flags; + + log_close(); + log_set_open_when_needed(true); + + fd = open("/dev/null", O_CLOEXEC|O_RDONLY); + assert_se(fd >= 0); + + assert_se(fdset_new_fill(/* filter_cloexec= */ -1, &fdset) >= 0); + assert_se(fdset_contains(fdset, fd)); + fdset = fdset_free(fdset); + assert_se(fcntl(fd, F_GETFD) < 0); + assert_se(errno == EBADF); + + fd = open("/dev/null", O_CLOEXEC|O_RDONLY); + assert_se(fd >= 0); + + assert_se(fdset_new_fill(/* filter_cloexec= */ 0, &fdset) >= 0); + assert_se(!fdset_contains(fdset, fd)); + fdset = fdset_free(fdset); + assert_se(fcntl(fd, F_GETFD) >= 0); + + assert_se(fdset_new_fill(/* filter_cloexec= */ 1, &fdset) >= 0); + 
assert_se(fdset_contains(fdset, fd)); + fdset = fdset_free(fdset); + assert_se(fcntl(fd, F_GETFD) < 0); + assert_se(errno == EBADF); + + fd = open("/dev/null", O_RDONLY); + assert_se(fd >= 0); + + assert_se(fdset_new_fill(/* filter_cloexec= */ 1, &fdset) >= 0); + assert_se(!fdset_contains(fdset, fd)); + fdset = fdset_free(fdset); + assert_se(fcntl(fd, F_GETFD) >= 0); + + assert_se(fdset_new_fill(/* filter_cloexec= */ 0, &fdset) >= 0); + assert_se(fdset_contains(fdset, fd)); + flags = fcntl(fd, F_GETFD); + assert_se(flags >= 0); + assert_se(FLAGS_SET(flags, FD_CLOEXEC)); + fdset = fdset_free(fdset); + assert_se(fcntl(fd, F_GETFD) < 0); + assert_se(errno == EBADF); + + log_open(); +} + +TEST(fdset_put_dup) { + _cleanup_close_ int fd = -EBADF; + int copyfd = -EBADF; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_put_dup.XXXXXX"; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + copyfd = fdset_put_dup(fdset, fd); + assert_se(copyfd >= 0 && copyfd != fd); + assert_se(fdset_contains(fdset, copyfd)); + assert_se(!fdset_contains(fdset, fd)); +} + +TEST(fdset_cloexec) { + /* Stores a temporary file's fd in an FDSet, then checks that fdset_cloexec() first clears and then sets FD_CLOEXEC on that fd. */ + int fd = -EBADF; + _cleanup_fdset_free_ FDSet *fdset = NULL; + int flags = -1; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_cloexec.XXXXXX"; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + /* Compare against 0 like the other fdset_put() callers in this file; a bare truth test would also pass on a negative return. */ + assert_se(fdset_put(fdset, fd) >= 0); + + assert_se(fdset_cloexec(fdset, false) >= 0); + flags = fcntl(fd, F_GETFD); + assert_se(flags >= 0); + assert_se(!(flags & FD_CLOEXEC)); + + assert_se(fdset_cloexec(fdset, true) >= 0); + flags = fcntl(fd, F_GETFD); + assert_se(flags >= 0); + assert_se(flags & FD_CLOEXEC); +} + +TEST(fdset_close_others) { + int fd = -EBADF; + int copyfd = -EBADF; + _cleanup_fdset_free_ FDSet *fdset = NULL; + int flags = -1; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_close_others.XXXXXX"; + + fd =
mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + copyfd = fdset_put_dup(fdset, fd); + assert_se(copyfd >= 0); + + assert_se(fdset_close_others(fdset) >= 0); + flags = fcntl(fd, F_GETFD); + assert_se(flags < 0); + flags = fcntl(copyfd, F_GETFD); + assert_se(flags >= 0); +} + +TEST(fdset_remove) { + _cleanup_close_ int fd = -EBADF; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_remove.XXXXXX"; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + assert_se(fdset_put(fdset, fd) >= 0); + assert_se(fdset_remove(fdset, fd) >= 0); + assert_se(!fdset_contains(fdset, fd)); + + assert_se(fcntl(fd, F_GETFD) >= 0); +} + +TEST(fdset_iterate) { + int fd = -EBADF; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_iterate.XXXXXX"; + int c = 0; + int a; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + assert_se(fdset_put(fdset, fd) >= 0); + assert_se(fdset_put(fdset, fd) >= 0); + assert_se(fdset_put(fdset, fd) >= 0); + + FDSET_FOREACH(a, fdset) { + c++; + assert_se(a == fd); + } + assert_se(c == 1); +} + +TEST(fdset_isempty) { + int fd; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_isempty.XXXXXX"; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + + assert_se(fdset_isempty(fdset)); + assert_se(fdset_put(fdset, fd) >= 0); + assert_se(!fdset_isempty(fdset)); +} + +TEST(fdset_steal_first) { + int fd; + _cleanup_fdset_free_ FDSet *fdset = NULL; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fdset_steal_first.XXXXXX"; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + + fdset = fdset_new(); + assert_se(fdset); + + assert_se(fdset_steal_first(fdset) < 0); + assert_se(fdset_put(fdset, fd) >= 0); + 
assert_se(fdset_steal_first(fdset) == fd); + assert_se(fdset_steal_first(fdset) < 0); + assert_se(fdset_put(fdset, fd) >= 0); +} + +TEST(fdset_new_array) { + int fds[] = {10, 11, 12, 13}; + _cleanup_fdset_free_ FDSet *fdset = NULL; + + assert_se(fdset_new_array(&fdset, fds, 4) >= 0); + assert_se(fdset_size(fdset) == 4); + assert_se(fdset_contains(fdset, 10)); + assert_se(fdset_contains(fdset, 11)); + assert_se(fdset_contains(fdset, 12)); + assert_se(fdset_contains(fdset, 13)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-fiemap.c b/src/test/test-fiemap.c new file mode 100644 index 0000000..380638b --- /dev/null +++ b/src/test/test-fiemap.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "errno-util.h" +#include "fd-util.h" +#include "log.h" +#include "hibernate-util.h" +#include "tests.h" + +static int test_fiemap_one(const char *path) { + _cleanup_free_ struct fiemap *fiemap = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + log_info("/* %s */", __func__); + + fd = open(path, O_RDONLY | O_CLOEXEC | O_NONBLOCK); + if (fd < 0) + return log_error_errno(errno, "failed to open %s: %m", path); + r = read_fiemap(fd, &fiemap); + if (r == -EOPNOTSUPP) + exit(log_tests_skipped("Not supported")); + if (r < 0) + return log_error_errno(r, "Unable to read extent map for '%s': %m", path); + log_info("extent map information for %s:", path); + log_info("\t start: %" PRIu64, (uint64_t) fiemap->fm_start); + log_info("\t length: %" PRIu64, (uint64_t) fiemap->fm_length); + log_info("\t flags: %" PRIu32, fiemap->fm_flags); + log_info("\t number of mapped extents: %" PRIu32, fiemap->fm_mapped_extents); + log_info("\t extent count: %" PRIu32, fiemap->fm_extent_count); + if (fiemap->fm_extent_count > 0) + log_info("\t first extent location: %" PRIu64, + (uint64_t) (fiemap->fm_extents[0].fe_physical / page_size())); + + return 0; +} + +TEST_RET(fiemap) { + int r = 0; + + 
assert_se(test_fiemap_one(saved_argv[0]) == 0); + for (int i = 1; i < saved_argc; i++) { + int k = test_fiemap_one(saved_argv[i]); + if (r == 0) + r = k; + } + + return r; +} + +static int intro(void) { + if (getuid() != 0) + log_warning("This program is unlikely to work for unprivileged users"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-fileio.c b/src/test/test-fileio.c new file mode 100644 index 0000000..ad98a92 --- /dev/null +++ b/src/test/test-fileio.c @@ -0,0 +1,1151 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "ctype.h" +#include "env-file.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "iovec-util.h" +#include "memfd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(parse_env_file) { + _cleanup_(unlink_tempfilep) char + t[] = "/tmp/test-fileio-in-XXXXXX", + p[] = "/tmp/test-fileio-out-XXXXXX"; + FILE *f; + _cleanup_free_ char *one = NULL, *two = NULL, *three = NULL, *four = NULL, *five = NULL, + *six = NULL, *seven = NULL, *eight = NULL, *nine = NULL, *ten = NULL, + *eleven = NULL, *twelve = NULL, *thirteen = NULL; + _cleanup_strv_free_ char **a = NULL, **b = NULL; + unsigned k; + int r; + + assert_se(fmkostemp_safe(t, "w", &f) == 0); + fputs("one=BAR \n" + "# comment\n" + " # comment \n" + " ; comment \n" + " two = bar \n" + "invalid line\n" + "invalid line #comment\n" + "three = \"333\n" + "xxxx\"\n" + "four = \'44\\\"44\'\n" + "five = \"55\\\"55\" \"FIVE\" cinco \n" + "six = seis sechs\\\n" + " sis\n" + "seven=\"sevenval\" #nocomment\n" + "eight=eightval #nocomment\n" + "export nine=nineval\n" + "ten=ignored\n" + "ten=ignored\n" 
+ "ten=\n" + "eleven=\\value\n" + "twelve=\"\\value\"\n" + "thirteen='\\value'", f); + + fflush(f); + fclose(f); + + r = load_env_file(NULL, t, &a); + assert_se(r >= 0); + + STRV_FOREACH(i, a) + log_info("Got: <%s>", *i); + + assert_se(streq_ptr(a[0], "one=BAR")); + assert_se(streq_ptr(a[1], "two=bar")); + assert_se(streq_ptr(a[2], "three=333\nxxxx")); + assert_se(streq_ptr(a[3], "four=44\\\"44")); + assert_se(streq_ptr(a[4], "five=55\"55FIVEcinco")); + assert_se(streq_ptr(a[5], "six=seis sechs sis")); + assert_se(streq_ptr(a[6], "seven=sevenval#nocomment")); + assert_se(streq_ptr(a[7], "eight=eightval #nocomment")); + assert_se(streq_ptr(a[8], "export nine=nineval")); + assert_se(streq_ptr(a[9], "ten=")); + assert_se(streq_ptr(a[10], "eleven=value")); + assert_se(streq_ptr(a[11], "twelve=\\value")); + assert_se(streq_ptr(a[12], "thirteen=\\value")); + assert_se(a[13] == NULL); + + strv_env_clean(a); + + /* NOTE(review): b has not been loaded yet at this point (the only load_env_file(..., &b) call in this test comes after this loop), so the comparison below presumably never iterates -- confirm whether it was meant to run after b is loaded. */ + k = 0; + STRV_FOREACH(i, b) { + log_info("Got2: <%s>", *i); + assert_se(streq(*i, a[k++])); + } + + /* Parse the same file again, this time extracting each key into its own variable. */ + r = parse_env_file( + NULL, t, + "one", &one, + "two", &two, + "three", &three, + "four", &four, + "five", &five, + "six", &six, + "seven", &seven, + "eight", &eight, + "export nine", &nine, + "ten", &ten, + "eleven", &eleven, + "twelve", &twelve, + "thirteen", &thirteen); + assert_se(r == 0); + + log_info("one=[%s]", strna(one)); + log_info("two=[%s]", strna(two)); + log_info("three=[%s]", strna(three)); + log_info("four=[%s]", strna(four)); + log_info("five=[%s]", strna(five)); + log_info("six=[%s]", strna(six)); + log_info("seven=[%s]", strna(seven)); + log_info("eight=[%s]", strna(eight)); + log_info("export nine=[%s]", strna(nine)); + /* Log the value that was actually parsed for "ten", not the one for "export nine". */ + log_info("ten=[%s]", strna(ten)); + log_info("eleven=[%s]", strna(eleven)); + log_info("twelve=[%s]", strna(twelve)); + log_info("thirteen=[%s]", strna(thirteen)); + + assert_se(streq(one, "BAR")); + assert_se(streq(two, "bar")); + assert_se(streq(three, "333\nxxxx")); + assert_se(streq(four, "44\\\"44")); + assert_se(streq(five,
"55\"55FIVEcinco")); + assert_se(streq(six, "seis sechs sis")); + assert_se(streq(seven, "sevenval#nocomment")); + assert_se(streq(eight, "eightval #nocomment")); + assert_se(streq(nine, "nineval")); + assert_se(ten == NULL); + assert_se(streq(eleven, "value")); + assert_se(streq(twelve, "\\value")); + assert_se(streq(thirteen, "\\value")); + + { + /* prepare a temporary file to write the environment to */ + _cleanup_close_ int fd = mkostemp_safe(p); + assert_se(fd >= 0); + } + + r = write_env_file(AT_FDCWD, p, NULL, a); + assert_se(r >= 0); + + r = load_env_file(NULL, p, &b); + assert_se(r >= 0); +} + +static void test_one_shell_var(const char *file, const char *variable, const char *value) { + _cleanup_free_ char *cmd = NULL, *from_shell = NULL; + _cleanup_pclose_ FILE *f = NULL; + size_t sz; + + assert_se(cmd = strjoin(". ", file, " && /bin/echo -n \"$", variable, "\"")); + assert_se(f = popen(cmd, "re")); + assert_se(read_full_stream(f, &from_shell, &sz) >= 0); + assert_se(sz == strlen(value)); + assert_se(streq(from_shell, value)); +} + +TEST(parse_multiline_env_file) { + _cleanup_(unlink_tempfilep) char + t[] = "/tmp/test-fileio-in-XXXXXX", + p[] = "/tmp/test-fileio-out-XXXXXX"; + FILE *f; + _cleanup_strv_free_ char **a = NULL, **b = NULL; + int r; + + assert_se(fmkostemp_safe(t, "w", &f) == 0); + fputs("one=BAR\\\n" + "\\ \\ \\ \\ VAR\\\n" + "\\\tGAR\n" + "#comment\n" + "two=\"bar\\\n" + " var\\\n" + "\tgar\"\n" + "#comment\n" + "tri=\"bar \\\n" + " var \\\n" + "\tgar \"\n", f); + + assert_se(fflush_and_check(f) >= 0); + fclose(f); + + test_one_shell_var(t, "one", "BAR VAR\tGAR"); + test_one_shell_var(t, "two", "bar var\tgar"); + test_one_shell_var(t, "tri", "bar var \tgar "); + + r = load_env_file(NULL, t, &a); + assert_se(r >= 0); + + STRV_FOREACH(i, a) + log_info("Got: <%s>", *i); + + assert_se(streq_ptr(a[0], "one=BAR VAR\tGAR")); + assert_se(streq_ptr(a[1], "two=bar var\tgar")); + assert_se(streq_ptr(a[2], "tri=bar var \tgar ")); + assert_se(a[3] == 
NULL); + + { + _cleanup_close_ int fd = mkostemp_safe(p); + assert_se(fd >= 0); + } + + r = write_env_file(AT_FDCWD, p, NULL, a); + assert_se(r >= 0); + + r = load_env_file(NULL, p, &b); + assert_se(r >= 0); +} + +TEST(merge_env_file) { + _cleanup_(unlink_tempfilep) char t[] = "/tmp/test-fileio-XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **a = NULL; + int r; + + assert_se(fmkostemp_safe(t, "w", &f) == 0); + log_info("/* %s (%s) */", __func__, t); + + r = write_string_stream(f, + "one=1 \n" + "twelve=${one}2\n" + "twentyone=2${one}\n" + "one=2\n" + "twentytwo=2${one}\n" + "xxx_minus_three=$xxx - 3\n" + "xxx=0x$one$one$one\n" + "yyy=${one:-fallback}\n" + "zzz=${one:+replacement}\n" + "zzzz=${foobar:-${nothing}}\n" + "zzzzz=${nothing:+${nothing}}\n" + , WRITE_STRING_FILE_AVOID_NEWLINE); + assert_se(r >= 0); + + r = merge_env_file(&a, NULL, t); + assert_se(r >= 0); + strv_sort(a); + + STRV_FOREACH(i, a) + log_info("Got: <%s>", *i); + + assert_se(streq(a[0], "one=2")); + assert_se(streq(a[1], "twelve=12")); + assert_se(streq(a[2], "twentyone=21")); + assert_se(streq(a[3], "twentytwo=22")); + assert_se(streq(a[4], "xxx=0x222")); + assert_se(streq(a[5], "xxx_minus_three= - 3")); + assert_se(streq(a[6], "yyy=2")); + assert_se(streq(a[7], "zzz=replacement")); + assert_se(streq(a[8], "zzzz=")); + assert_se(streq(a[9], "zzzzz=")); + assert_se(a[10] == NULL); + + r = merge_env_file(&a, NULL, t); + assert_se(r >= 0); + strv_sort(a); + + STRV_FOREACH(i, a) + log_info("Got2: <%s>", *i); + + assert_se(streq(a[0], "one=2")); + assert_se(streq(a[1], "twelve=12")); + assert_se(streq(a[2], "twentyone=21")); + assert_se(streq(a[3], "twentytwo=22")); + assert_se(streq(a[4], "xxx=0x222")); + assert_se(streq(a[5], "xxx_minus_three=0x222 - 3")); + assert_se(streq(a[6], "yyy=2")); + assert_se(streq(a[7], "zzz=replacement")); + assert_se(streq(a[8], "zzzz=")); + assert_se(streq(a[9], "zzzzz=")); + assert_se(a[10] == NULL); +} + +TEST(merge_env_file_invalid) { + 
_cleanup_(unlink_tempfilep) char t[] = "/tmp/test-fileio-XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **a = NULL; + int r; + + assert_se(fmkostemp_safe(t, "w", &f) == 0); + log_info("/* %s (%s) */", __func__, t); + + r = write_string_stream(f, + "unset one \n" + "unset one= \n" + "unset one=1 \n" + "one \n" + "one = \n" + "one two =\n" + "\x20two=\n" + "#comment=comment\n" + ";comment2=comment2\n" + "#\n" + "\n\n" /* empty line */ + , WRITE_STRING_FILE_AVOID_NEWLINE); + assert_se(r >= 0); + + r = merge_env_file(&a, NULL, t); + assert_se(r >= 0); + + STRV_FOREACH(i, a) + log_info("Got: <%s>", *i); + + assert_se(strv_isempty(a)); +} + +TEST(executable_is_script) { + _cleanup_(unlink_tempfilep) char t[] = "/tmp/test-fileio-XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + char *command; + int r; + + assert_se(fmkostemp_safe(t, "w", &f) == 0); + fputs("#! /bin/script -a -b \ngoo goo", f); + fflush(f); + + r = executable_is_script(t, &command); + assert_se(r > 0); + assert_se(streq(command, "/bin/script")); + free(command); + + r = executable_is_script("/bin/sh", &command); + assert_se(r == 0); + + r = executable_is_script("/usr/bin/yum", &command); + if (r > 0) { + assert_se(startswith(command, "/")); + free(command); + } +} + +TEST(status_field) { + _cleanup_free_ char *p = NULL, *s = NULL, *z = NULL; + unsigned long long total = 0, buffers = 0; + int r; + + r = get_proc_field("/proc/meminfo", "MemTotal", WHITESPACE, &p); + if (r != -ENOENT) { + assert_se(r == 0); + puts(p); + assert_se(safe_atollu(p, &total) == 0); + } + + r = get_proc_field("/proc/meminfo", "Buffers", WHITESPACE, &s); + if (r != -ENOENT) { + assert_se(r == 0); + puts(s); + assert_se(safe_atollu(s, &buffers) == 0); + } + + if (p) + assert_se(buffers < total); + + /* Seccomp should be a good test for field full of zeros. 
*/ + r = get_proc_field("/proc/meminfo", "Seccomp", WHITESPACE, &z); + if (r != -ENOENT) { + assert_se(r == 0); + puts(z); + assert_se(safe_atollu(z, &buffers) == 0); + } +} + +TEST(capeff) { + /* Queries the effective capability string for pid 0 (presumably the calling process) and pid 1, and checks that it is a non-empty run of hex digits, optionally followed by whitespace. */ + for (int pid = 0; pid < 2; pid++) { + _cleanup_free_ char *capeff = NULL; + int r, p; + + /* Use the loop variable, so that the second iteration really looks at a different process. */ + r = get_process_capeff(pid, &capeff); + /* capeff stays NULL on failure; strna() keeps a NULL pointer away from %s. */ + log_info("capeff: '%s' (r=%d)", strna(capeff), r); + + /* Not being able to read the information (in particular for a process that is not ours) is not a test failure. */ + if (IN_SET(r, -ENOENT, -EPERM, -EACCES)) + return; + + assert_se(r == 0); + assert_se(*capeff); + p = capeff[strspn(capeff, HEXDIGITS)]; + assert_se(!p || isspace(p)); + } +} + +TEST(read_one_line_file) { + /* Grows a temporary file step by step ("", "x", "x\n", "x\n\n") and checks the return value and the line read by read_one_line_file() after each step. */ + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-fileio-1lf-XXXXXX"; + int fd; + _cleanup_fclose_ FILE *f = NULL; + /* Cleanup-annotated pointers are initialized so that they never hold an indeterminate value. */ + _cleanup_free_ char *buf = NULL, *buf2 = NULL, *buf3 = NULL, *buf4 = NULL, *buf5 = NULL; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + f = fdopen(fd, "we"); + assert_se(f); + + assert_se(read_one_line_file(fn, &buf) == 0); + assert_se(streq_ptr(buf, "")); + assert_se(read_one_line_file(fn, &buf2) == 0); + assert_se(streq_ptr(buf2, "")); + + assert_se(write_string_stream(f, "x", WRITE_STRING_FILE_AVOID_NEWLINE) >= 0); + fflush(f); + + assert_se(read_one_line_file(fn, &buf3) == 1); + assert_se(streq_ptr(buf3, "x")); + + assert_se(write_string_stream(f, "\n", WRITE_STRING_FILE_AVOID_NEWLINE) >= 0); + fflush(f); + + assert_se(read_one_line_file(fn, &buf4) == 2); + assert_se(streq_ptr(buf4, "x")); + + assert_se(write_string_stream(f, "\n", WRITE_STRING_FILE_AVOID_NEWLINE) >= 0); + fflush(f); + + assert_se(read_one_line_file(fn, &buf5) == 2); + assert_se(streq_ptr(buf5, "x")); +} + +TEST(write_string_stream) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-write_string_stream-XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + int fd; + char buf[64]; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + f = fdopen(fd, "r"); + assert_se(f); + assert_se(write_string_stream(f, "boohoo", 0) < 0); + f = safe_fclose(f); + + f = fopen(fn, "r+"); + assert_se(f); + + assert_se(write_string_stream(f, "boohoo", 0) == 0); + rewind(f); +
+ assert_se(fgets(buf, sizeof(buf), f)); + assert_se(streq(buf, "boohoo\n")); + f = safe_fclose(f); + + f = fopen(fn, "w+"); + assert_se(f); + + assert_se(write_string_stream(f, "boohoo", WRITE_STRING_FILE_AVOID_NEWLINE) == 0); + rewind(f); + + assert_se(fgets(buf, sizeof(buf), f)); + printf(">%s<", buf); + assert_se(streq(buf, "boohoo")); +} + +TEST(write_string_file) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-write_string_file-XXXXXX"; + char buf[64] = {}; + _cleanup_close_ int fd = -EBADF; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + assert_se(write_string_file(fn, "boohoo", WRITE_STRING_FILE_CREATE) == 0); + + assert_se(read(fd, buf, sizeof(buf)) == 7); + assert_se(streq(buf, "boohoo\n")); +} + +TEST(write_string_file_no_create) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-write_string_file_no_create-XXXXXX"; + _cleanup_close_ int fd = -EBADF; + char buf[64] = {}; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + assert_se(write_string_file("/a/file/which/does/not/exists/i/guess", "boohoo", 0) < 0); + assert_se(write_string_file(fn, "boohoo", 0) == 0); + + assert_se(read(fd, buf, sizeof buf) == (ssize_t) strlen("boohoo\n")); + assert_se(streq(buf, "boohoo\n")); +} + +TEST(write_string_file_verify) { + _cleanup_free_ char *buf = NULL, *buf2 = NULL; + int r; + + r = read_one_line_file("/proc/version", &buf); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + return; + assert_se(r >= 0); + assert_se(buf2 = strjoin(buf, "\n")); + + r = write_string_file("/proc/version", buf, 0); + assert_se(IN_SET(r, -EACCES, -EIO)); + r = write_string_file("/proc/version", buf2, 0); + assert_se(IN_SET(r, -EACCES, -EIO)); + + assert_se(write_string_file("/proc/version", buf, WRITE_STRING_FILE_VERIFY_ON_FAILURE) == 0); + assert_se(write_string_file("/proc/version", buf2, WRITE_STRING_FILE_VERIFY_ON_FAILURE) == 0); + + r = write_string_file("/proc/version", buf, WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_AVOID_NEWLINE); + assert_se(IN_SET(r, -EACCES, 
-EIO)); + assert_se(write_string_file("/proc/version", buf2, WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_AVOID_NEWLINE) == 0); +} + +static void check_file_pairs_one(char **l) { + assert_se(l); + assert_se(strv_length(l) == 14); + + STRV_FOREACH_PAIR(k, v, l) { + assert_se(STR_IN_SET(*k, "NAME", "ID", "PRETTY_NAME", "ANSI_COLOR", "HOME_URL", "SUPPORT_URL", "BUG_REPORT_URL")); + printf("%s=%s\n", *k, *v); + assert_se(!streq(*k, "NAME") || streq(*v, "Arch Linux")); + assert_se(!streq(*k, "ID") || streq(*v, "arch")); + assert_se(!streq(*k, "PRETTY_NAME") || streq(*v, "Arch Linux")); + assert_se(!streq(*k, "ANSI_COLOR") || streq(*v, "0;36")); + assert_se(!streq(*k, "HOME_URL") || streq(*v, "https://www.archlinux.org/")); + assert_se(!streq(*k, "SUPPORT_URL") || streq(*v, "https://bbs.archlinux.org/")); + assert_se(!streq(*k, "BUG_REPORT_URL") || streq(*v, "https://bugs.archlinux.org/")); + } +} + +TEST(load_env_file_pairs) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-load_env_file_pairs-XXXXXX"; + int fd, r; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **l = NULL; + + fd = mkostemp_safe(fn); + assert_se(fd >= 0); + + r = write_string_file(fn, + "NAME=\"Arch Linux\"\n" + "ID=arch\n" + "PRETTY_NAME=\"Arch Linux\"\n" + "ANSI_COLOR=\"0;36\"\n" + "HOME_URL=\"https://www.archlinux.org/\"\n" + "SUPPORT_URL=\"https://bbs.archlinux.org/\"\n" + "BUG_REPORT_URL=\"https://bugs.archlinux.org/\"\n", + WRITE_STRING_FILE_CREATE); + assert_se(r == 0); + + r = load_env_file_pairs_fd(fd, fn, &l); + assert_se(r >= 0); + check_file_pairs_one(l); + l = strv_free(l); + + f = fdopen(fd, "r"); + assert_se(f); + + r = load_env_file_pairs(f, fn, &l); + assert_se(r >= 0); + check_file_pairs_one(l); +} + +TEST(search_and_fopen) { + static const char* const dirs[] = { + "/tmp/foo/bar", + "/tmp", + NULL + }; + char name[] = "/tmp/test-search_and_fopen.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + 
const char *e; + int r; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + fd = safe_close(fd); + + r = search_and_fopen(basename(name), "re", NULL, (const char**) dirs, &f, &p); + assert_se(r >= 0); + assert_se(e = path_startswith(p, "/tmp/")); + assert_se(streq(basename(name), e)); + f = safe_fclose(f); + p = mfree(p); + + r = search_and_fopen(basename(name), NULL, NULL, (const char**) dirs, NULL, &p); + assert_se(r >= 0); + assert_se(e = path_startswith(p, "/tmp/")); + assert_se(streq(basename(name), e)); + p = mfree(p); + + r = search_and_fopen(name, "re", NULL, (const char**) dirs, &f, &p); + assert_se(r >= 0); + assert_se(path_equal(name, p)); + f = safe_fclose(f); + p = mfree(p); + + r = search_and_fopen(name, NULL, NULL, (const char**) dirs, NULL, &p); + assert_se(r >= 0); + assert_se(path_equal(name, p)); + p = mfree(p); + + r = search_and_fopen(basename(name), "re", "/", (const char**) dirs, &f, &p); + assert_se(r >= 0); + assert_se(e = path_startswith(p, "/tmp/")); + assert_se(streq(basename(name), e)); + f = safe_fclose(f); + p = mfree(p); + + r = search_and_fopen(basename(name), NULL, "/", (const char**) dirs, NULL, &p); + assert_se(r >= 0); + assert_se(e = path_startswith(p, "/tmp/")); + assert_se(streq(basename(name), e)); + p = mfree(p); + + r = search_and_fopen("/a/file/which/does/not/exist/i/guess", "re", NULL, (const char**) dirs, &f, &p); + assert_se(r == -ENOENT); + r = search_and_fopen("/a/file/which/does/not/exist/i/guess", NULL, NULL, (const char**) dirs, NULL, &p); + assert_se(r == -ENOENT); + r = search_and_fopen("afilewhichdoesnotexistiguess", "re", NULL, (const char**) dirs, &f, &p); + assert_se(r == -ENOENT); + r = search_and_fopen("afilewhichdoesnotexistiguess", NULL, NULL, (const char**) dirs, NULL, &p); + assert_se(r == -ENOENT); + + r = unlink(name); + assert_se(r == 0); + + r = search_and_fopen(basename(name), "re", NULL, (const char**) dirs, &f, &p); + assert_se(r == -ENOENT); + r = search_and_fopen(basename(name), NULL, NULL, 
(const char**) dirs, NULL, &p); + assert_se(r == -ENOENT); +} + +TEST(search_and_fopen_nulstr) { + static const char dirs[] = + "/tmp/foo/bar\0" + "/tmp\0"; + + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-search_and_fopen.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + const char *e; + int r; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + fd = safe_close(fd); + + r = search_and_fopen_nulstr(basename(name), "re", NULL, dirs, &f, &p); + assert_se(r >= 0); + assert_se(e = path_startswith(p, "/tmp/")); + assert_se(streq(basename(name), e)); + f = safe_fclose(f); + p = mfree(p); + + r = search_and_fopen_nulstr(name, "re", NULL, dirs, &f, &p); + assert_se(r >= 0); + assert_se(path_equal(name, p)); + f = safe_fclose(f); + p = mfree(p); + + r = search_and_fopen_nulstr("/a/file/which/does/not/exist/i/guess", "re", NULL, dirs, &f, &p); + assert_se(r == -ENOENT); + r = search_and_fopen_nulstr("afilewhichdoesnotexistiguess", "re", NULL, dirs, &f, &p); + assert_se(r == -ENOENT); + + r = unlink(name); + assert_se(r == 0); + + r = search_and_fopen_nulstr(basename(name), "re", NULL, dirs, &f, &p); + assert_se(r == -ENOENT); +} + +/* Writes the iovec entries below to a fresh temporary file with writev() and checks that read_full_file() returns their concatenation. */ +TEST(writing_tmpfile) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-systemd_writing_tmpfile.XXXXXX"; + _cleanup_free_ char *contents = NULL; + size_t size; + _cleanup_close_ int fd = -EBADF; + int r; + + struct iovec iov[] = { + IOVEC_MAKE_STRING("abc\n"), + IOVEC_MAKE_STRING(ALPHANUMERICAL "\n"), + IOVEC_MAKE_STRING(""), + }; + + fd = mkostemp_safe(name); + /* Stop here if the temporary file could not be created, rather than handing a negative fd to writev() */ + assert_se(fd >= 0); + printf("tmpfile: %s", name); + + /* Derive the entry count from the array so it cannot drift from the initializer above */ + r = writev(fd, iov, ELEMENTSOF(iov)); + assert_se(r >= 0); + + r = read_full_file(name, &contents, &size); + assert_se(r == 0); + printf("contents: %s", contents); + assert_se(streq(contents, "abc\n" ALPHANUMERICAL "\n")); +} + +TEST(tempfn) { + char *ret = NULL, *p; + + assert_se(tempfn_xxxxxx("/foo/bar/waldo", NULL, &ret) >= 0); + assert_se(streq_ptr(ret, "/foo/bar/.#waldoXXXXXX")); + 
free(ret); + + assert_se(tempfn_xxxxxx("/foo/bar/waldo", "[miau]", &ret) >= 0); + assert_se(streq_ptr(ret, "/foo/bar/.#[miau]waldoXXXXXX")); + free(ret); + + assert_se(tempfn_random("/foo/bar/waldo", NULL, &ret) >= 0); + assert_se(p = startswith(ret, "/foo/bar/.#waldo")); + assert_se(strlen(p) == 16); + assert_se(in_charset(p, "0123456789abcdef")); + free(ret); + + assert_se(tempfn_random("/foo/bar/waldo", "[wuff]", &ret) >= 0); + assert_se(p = startswith(ret, "/foo/bar/.#[wuff]waldo")); + assert_se(strlen(p) == 16); + assert_se(in_charset(p, "0123456789abcdef")); + free(ret); + + assert_se(tempfn_random_child("/foo/bar/waldo", NULL, &ret) >= 0); + assert_se(p = startswith(ret, "/foo/bar/waldo/.#")); + assert_se(strlen(p) == 16); + assert_se(in_charset(p, "0123456789abcdef")); + free(ret); + + assert_se(tempfn_random_child("/foo/bar/waldo", "[kikiriki]", &ret) >= 0); + assert_se(p = startswith(ret, "/foo/bar/waldo/.#[kikiriki]")); + assert_se(strlen(p) == 16); + assert_se(in_charset(p, "0123456789abcdef")); + free(ret); +} + +static const char chars[] = + "Aąę„”\n루\377"; + +DISABLE_WARNING_TYPE_LIMITS; + +TEST(fgetc) { + _cleanup_fclose_ FILE *f = NULL; + char c; + + assert_se(f = fmemopen_unlocked((void*) chars, sizeof(chars), "r")); + + for (size_t i = 0; i < sizeof(chars); i++) { + assert_se(safe_fgetc(f, &c) == 1); + assert_se(c == chars[i]); + + if (ungetc(c, f) == EOF) { + /* EOF is -1, and hence we can't push value 255 in this way – if char is signed */ + assert_se(c == (char) EOF); + assert_se(CHAR_MIN == -128); /* verify that char is signed on this platform */ + } else { + assert_se(safe_fgetc(f, &c) == 1); + assert_se(c == chars[i]); + } + + /* But it works when we push it properly cast */ + assert_se(ungetc((unsigned char) c, f) != EOF); + assert_se(safe_fgetc(f, &c) == 1); + assert_se(c == chars[i]); + } + + assert_se(safe_fgetc(f, &c) == 0); +} + +REENABLE_WARNING; + +static const char buffer[] = + "Some test data\n" + "루Non-ascii chars: ąę„”\n" + 
"terminators\r\n" + "and even more\n\r" + "now the same with a NUL\n\0" + "and more\r\0" + "and even more\r\n\0" + "and yet even more\n\r\0" + "With newlines, and a NUL byte\0" + "\n" + "an empty line\n" + "an ignored line\n" + "and a very long line that is supposed to be truncated, because it is so long\n"; + +static void test_read_line_one_file(FILE *f) { + _cleanup_free_ char *line = NULL; + + assert_se(read_line(f, SIZE_MAX, &line) == 15 && streq(line, "Some test data")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) > 0 && streq(line, "루Non-ascii chars: ąę„”")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) == 13 && streq(line, "terminators")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) == 15 && streq(line, "and even more")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) == 25 && streq(line, "now the same with a NUL")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) == 10 && streq(line, "and more")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) == 16 && streq(line, "and even more")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, &line) == 20 && streq(line, "and yet even more")); + line = mfree(line); + + assert_se(read_line(f, 1024, &line) == 30 && streq(line, "With newlines, and a NUL byte")); + line = mfree(line); + + assert_se(read_line(f, 1024, &line) == 1 && streq(line, "")); + line = mfree(line); + + assert_se(read_line(f, 1024, &line) == 14 && streq(line, "an empty line")); + line = mfree(line); + + assert_se(read_line(f, SIZE_MAX, NULL) == 16); + + assert_se(read_line(f, 16, &line) == -ENOBUFS); + line = mfree(line); + + /* read_line() stopped when it hit the limit, that means when we continue reading we'll read at the first + * character after the previous limit. Let's make use of that to continue our test. 
*/ + assert_se(read_line(f, 1024, &line) == 62 && streq(line, "line that is supposed to be truncated, because it is so long")); + line = mfree(line); + + assert_se(read_line(f, 1024, &line) == 0 && streq(line, "")); +} + +TEST(read_line1) { + _cleanup_fclose_ FILE *f = NULL; + + assert_se(f = fmemopen_unlocked((void*) buffer, sizeof(buffer), "r")); + test_read_line_one_file(f); +} + +TEST(read_line2) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-fileio.XXXXXX"; + int fd; + _cleanup_fclose_ FILE *f = NULL; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se((size_t) write(fd, buffer, sizeof(buffer)) == sizeof(buffer)); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(f = fdopen(fd, "r")); + + test_read_line_one_file(f); +} + +TEST(read_line3) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *line = NULL; + int r; + + f = fopen("/proc/uptime", "re"); + if (!f && IN_SET(errno, ENOENT, EPERM)) + return; + assert_se(f); + + r = read_line(f, LINE_MAX, &line); + assert_se(r >= 0); + if (r == 0) + assert_se(line && isempty(line)); + else + assert_se((size_t) r == strlen(line) + 1); + assert_se(read_line(f, LINE_MAX, NULL) == 0); +} + +TEST(read_line4) { + static const struct { + size_t length; + const char *string; + } eof_endings[] = { + /* Each of these will be followed by EOF and should generate the one same single string */ + { 3, "foo" }, + { 4, "foo\n" }, + { 4, "foo\r" }, + { 4, "foo\0" }, + { 5, "foo\n\0" }, + { 5, "foo\r\0" }, + { 5, "foo\r\n" }, + { 5, "foo\n\r" }, + { 6, "foo\r\n\0" }, + { 6, "foo\n\r\0" }, + }; + + int r; + + for (size_t i = 0; i < ELEMENTSOF(eof_endings); i++) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *s = NULL; + + assert_se(f = fmemopen_unlocked((void*) eof_endings[i].string, eof_endings[i].length, "r")); + + r = read_line(f, SIZE_MAX, &s); + assert_se((size_t) r == eof_endings[i].length); + assert_se(streq_ptr(s, "foo")); + + assert_se(read_line(f, SIZE_MAX, NULL) == 0); /* Ensure we 
hit EOF */ + } +} + +TEST(read_nul_string) { + static const char test[] = "string nr. 1\0" + "string nr. 2\n\0" + "\377empty string follows\0" + "\0" + "final string\n is empty\0" + "\0"; + + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *s = NULL; + + assert_se(f = fmemopen_unlocked((void*) test, sizeof(test)-1, "r")); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 13 && streq_ptr(s, "string nr. 1")); + s = mfree(s); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 14 && streq_ptr(s, "string nr. 2\n")); + s = mfree(s); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 22 && streq_ptr(s, "\377empty string follows")); + s = mfree(s); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 1 && streq_ptr(s, "")); + s = mfree(s); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 23 && streq_ptr(s, "final string\n is empty")); + s = mfree(s); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 1 && streq_ptr(s, "")); + s = mfree(s); + + assert_se(read_nul_string(f, LONG_LINE_MAX, &s) == 0 && streq_ptr(s, "")); +} + +TEST(read_full_file_socket) { + _cleanup_(rm_rf_physical_and_freep) char *z = NULL; + _cleanup_close_ int listener = -EBADF; + _cleanup_free_ char *data = NULL, *clientname = NULL; + union sockaddr_union sa; + const char *j, *jj; + size_t size; + pid_t pid; + int r; + + listener = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(listener >= 0); + + assert_se(mkdtemp_malloc(NULL, &z) >= 0); + j = strjoina(z, "/socket"); + + assert_se(sockaddr_un_set_path(&sa.un, j) >= 0); + + assert_se(bind(listener, &sa.sa, SOCKADDR_UN_LEN(sa.un)) >= 0); + assert_se(listen(listener, 1) >= 0); + + /* Make sure the socket doesn't fit into a struct sockaddr_un, but we can still access it */ + jj = strjoina(z, "/a_very_long_patha_very_long_patha_very_long_patha_very_long_patha_very_long_patha_very_long_patha_very_long_patha_very_long_path"); + assert_se(strlen(jj) > sizeof_field(struct sockaddr_un, sun_path)); + assert_se(rename(j, 
jj) >= 0); + + /* Bind the *client* socket to some randomized name, to verify that this works correctly. */ + assert_se(asprintf(&clientname, "@%" PRIx64 "/test-bindname", random_u64()) >= 0); + + r = safe_fork("(server)", FORK_DEATHSIG_SIGTERM|FORK_LOG, &pid); + assert_se(r >= 0); + if (r == 0) { + union sockaddr_union peer = {}; + socklen_t peerlen = sizeof(peer); + _cleanup_close_ int rfd = -EBADF; + /* child */ + + rfd = accept4(listener, NULL, 0, SOCK_CLOEXEC); + assert_se(rfd >= 0); + + assert_se(getpeername(rfd, &peer.sa, &peerlen) >= 0); + + assert_se(peer.un.sun_family == AF_UNIX); + assert_se(peerlen > offsetof(struct sockaddr_un, sun_path)); + assert_se(peer.un.sun_path[0] == 0); + assert_se(streq(peer.un.sun_path + 1, clientname + 1)); + +#define TEST_STR "This is a test\nreally." + + assert_se(write(rfd, TEST_STR, strlen(TEST_STR)) == strlen(TEST_STR)); + _exit(EXIT_SUCCESS); + } + + assert_se(read_full_file_full(AT_FDCWD, jj, UINT64_MAX, SIZE_MAX, 0, NULL, &data, &size) == -ENXIO); + assert_se(read_full_file_full(AT_FDCWD, jj, UINT64_MAX, SIZE_MAX, READ_FULL_FILE_CONNECT_SOCKET, clientname, &data, &size) >= 0); + assert_se(size == strlen(TEST_STR)); + assert_se(streq(data, TEST_STR)); + + assert_se(wait_for_terminate_and_check("(server)", pid, WAIT_LOG) >= 0); +#undef TEST_STR +} + +TEST(read_full_file_offset_size) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(unlink_and_freep) char *fn = NULL; + _cleanup_free_ char *rbuf = NULL; + size_t rbuf_size; + uint8_t buf[4711]; + + random_bytes(buf, sizeof(buf)); + + assert_se(tempfn_random_child(NULL, NULL, &fn) >= 0); + assert_se(f = fopen(fn, "we")); + assert_se(fwrite(buf, 1, sizeof(buf), f) == sizeof(buf)); + assert_se(fflush_and_check(f) >= 0); + + assert_se(read_full_file_full(AT_FDCWD, fn, UINT64_MAX, SIZE_MAX, 0, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == sizeof(buf)); + assert_se(memcmp(buf, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, 
fn, UINT64_MAX, 128, 0, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == 128); + assert_se(memcmp(buf, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, UINT64_MAX, 128, READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) == -E2BIG); + assert_se(read_full_file_full(AT_FDCWD, fn, UINT64_MAX, sizeof(buf)-1, READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) == -E2BIG); + assert_se(read_full_file_full(AT_FDCWD, fn, UINT64_MAX, sizeof(buf), READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == sizeof(buf)); + assert_se(memcmp(buf, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, 47, 128, READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) == -E2BIG); + assert_se(read_full_file_full(AT_FDCWD, fn, 47, sizeof(buf)-47-1, READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) == -E2BIG); + assert_se(read_full_file_full(AT_FDCWD, fn, 47, sizeof(buf)-47, READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == sizeof(buf)-47); + assert_se(memcmp(buf+47, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, UINT64_MAX, sizeof(buf)+1, READ_FULL_FILE_FAIL_WHEN_LARGER, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == sizeof(buf)); + assert_se(memcmp(buf, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, 1234, SIZE_MAX, 0, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == sizeof(buf) - 1234); + assert_se(memcmp(buf + 1234, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, 2345, 777, 0, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == 777); + assert_se(memcmp(buf + 2345, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, 4700, 20, 0, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == 11); + 
assert_se(memcmp(buf + 4700, rbuf, rbuf_size) == 0); + rbuf = mfree(rbuf); + + assert_se(read_full_file_full(AT_FDCWD, fn, 10000, 99, 0, NULL, &rbuf, &rbuf_size) >= 0); + assert_se(rbuf_size == 0); + rbuf = mfree(rbuf); +} + +static void test_read_virtual_file_one(size_t max_size) { + int r; + + log_info("/* %s (max_size=%zu) */", __func__, max_size); + + FOREACH_STRING(filename, + "/proc/1/cmdline", + "/etc/nsswitch.conf", + "/sys/kernel/uevent_seqnum", + "/proc/kcore", + "/proc/kallsyms", + "/proc/self/exe", + "/proc/self/pagemap") { + + _cleanup_free_ char *buf = NULL; + size_t size = 0; + + r = read_virtual_file(filename, max_size, &buf, &size); + if (r < 0) { + log_info_errno(r, "read_virtual_file(\"%s\", %zu): %m", filename, max_size); + assert_se(ERRNO_IS_PRIVILEGE(r) || /* /proc/kcore is not accessible to unpriv */ + IN_SET(r, + -ENOENT, /* Some of the files might be absent */ + -EINVAL, /* too small reads from /proc/self/pagemap trigger EINVAL */ + -EFBIG, /* /proc/kcore and /proc/self/pagemap should be too large */ + -EBADF)); /* /proc/kcore is masked when we are running in docker. */ + } else + log_info("read_virtual_file(\"%s\", %zu): %s (%zu bytes)", filename, max_size, r ? 
"non-truncated" : "truncated", size); + } +} + +TEST(read_virtual_file) { + test_read_virtual_file_one(0); + test_read_virtual_file_one(1); + test_read_virtual_file_one(2); + test_read_virtual_file_one(20); + test_read_virtual_file_one(4096); + test_read_virtual_file_one(4097); + test_read_virtual_file_one(SIZE_MAX); +} + +TEST(fdopen_independent) { +#define TEST_TEXT "this is some random test text we are going to write to a memfd" + _cleanup_close_ int fd = -EBADF; + _cleanup_fclose_ FILE *f = NULL; + char buf[STRLEN(TEST_TEXT) + 1]; + + fd = memfd_new("fdopen_independent"); + if (fd < 0) { + assert_se(ERRNO_IS_NOT_SUPPORTED(fd)); + return; + } + + assert_se(write(fd, TEST_TEXT, strlen(TEST_TEXT)) == strlen(TEST_TEXT)); + /* we'll leave the read offset at the end of the memfd, the fdopen_independent() descriptors should + * start at the beginning anyway */ + + assert_se(fdopen_independent(fd, "re", &f) >= 0); + zero(buf); + assert_se(fread(buf, 1, sizeof(buf), f) == strlen(TEST_TEXT)); + assert_se(streq(buf, TEST_TEXT)); + assert_se((fcntl(fileno(f), F_GETFL) & O_ACCMODE) == O_RDONLY); + assert_se(FLAGS_SET(fcntl(fileno(f), F_GETFD), FD_CLOEXEC)); + f = safe_fclose(f); + + assert_se(fdopen_independent(fd, "r", &f) >= 0); + zero(buf); + assert_se(fread(buf, 1, sizeof(buf), f) == strlen(TEST_TEXT)); + assert_se(streq(buf, TEST_TEXT)); + assert_se((fcntl(fileno(f), F_GETFL) & O_ACCMODE) == O_RDONLY); + assert_se(!FLAGS_SET(fcntl(fileno(f), F_GETFD), FD_CLOEXEC)); + f = safe_fclose(f); + + assert_se(fdopen_independent(fd, "r+e", &f) >= 0); + zero(buf); + assert_se(fread(buf, 1, sizeof(buf), f) == strlen(TEST_TEXT)); + assert_se(streq(buf, TEST_TEXT)); + assert_se((fcntl(fileno(f), F_GETFL) & O_ACCMODE) == O_RDWR); + assert_se(FLAGS_SET(fcntl(fileno(f), F_GETFD), FD_CLOEXEC)); + f = safe_fclose(f); +} + + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-firewall-util.c b/src/test/test-firewall-util.c new file mode 100644 index 0000000..3f47a30 --- /dev/null +++ 
b/src/test/test-firewall-util.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "firewall-util.h" +#include "firewall-util-private.h" +#include "log.h" +#include "random-util.h" +#include "socket-util.h" +#include "tests.h" + +/* Exercises fw_add_masquerade() and fw_add_local_dnat() for AF_INET6 with the boolean argument set to both true and false; returns early if socket_ipv6_is_supported() is false or the first DNAT call yields -EOPNOTSUPP. */ +static void test_v6(FirewallContext *ctx) { + union in_addr_union u1, u2, u3; + uint8_t prefixlen; + int r; + + assert_se(ctx); + + log_info("/* %s(backend=%s) */", __func__, firewall_backend_to_string(ctx->backend)); + + /* Log and return as two statements: this function is void, so it must not return an expression (same form as the DNAT skip below) */ + if (!socket_ipv6_is_supported()) { + log_info("IPv6 is not supported by kernel, skipping tests."); + return; + } + + assert_se(in_addr_from_string(AF_INET6, "dead::beef", &u1) >= 0); + assert_se(in_addr_from_string(AF_INET6, "1c3::c01d", &u2) >= 0); + + /* Random prefix length in the range 8..128, paired with a random address */ + prefixlen = random_u64_range(128 + 1 - 8) + 8; + random_bytes(&u3, sizeof(u3)); + + assert_se(fw_add_masquerade(&ctx, true, AF_INET6, &u1, 128) >= 0); + assert_se(fw_add_masquerade(&ctx, false, AF_INET6, &u1, 128) >= 0); + assert_se(fw_add_masquerade(&ctx, true, AF_INET6, &u1, 64) >= 0); + assert_se(fw_add_masquerade(&ctx, false, AF_INET6, &u1, 64) >= 0); + assert_se(fw_add_masquerade(&ctx, true, AF_INET6, &u3, prefixlen) >= 0); + assert_se(fw_add_masquerade(&ctx, false, AF_INET6, &u3, prefixlen) >= 0); + + r = fw_add_local_dnat(&ctx, true, AF_INET6, IPPROTO_TCP, 4711, &u1, 815, NULL); + if (r == -EOPNOTSUPP) { + log_info("IPv6 DNAT seems not supported, skipping the following tests."); + return; + } + assert_se(r >= 0); + + assert_se(fw_add_local_dnat(&ctx, true, AF_INET6, IPPROTO_TCP, 4711, &u2, 815, &u1) >= 0); + assert_se(fw_add_local_dnat(&ctx, false, AF_INET6, IPPROTO_TCP, 4711, &u2, 815, NULL) >= 0); + +} + +/* Parses str as an AF_INET address into *u (asserting success) and returns u, so the call can be used directly as a function argument. */ +static union in_addr_union *parse_addr(const char *str, union in_addr_union *u) { + assert_se(str); + assert_se(u); + assert_se(in_addr_from_string(AF_INET, str, u) >= 0); + return u; +} + +static bool test_v4(FirewallContext *ctx) { + union in_addr_union u, v; + int r; + + assert_se(ctx); + + log_info("/* %s(backend=%s) */", 
__func__, firewall_backend_to_string(ctx->backend)); + +#if HAVE_LIBIPTC + if (ctx->backend == FW_BACKEND_IPTABLES && fw_iptables_init_nat(NULL) < 0) { + log_debug("iptables backend is used, but nat table is not enabled, skipping tests"); + return false; + } +#endif + + assert_se(fw_add_masquerade(&ctx, true, AF_INET, NULL, 0) == -EINVAL); + assert_se(fw_add_masquerade(&ctx, true, AF_INET, parse_addr("10.1.2.0", &u), 0) == -EINVAL); + + r = fw_add_masquerade(&ctx, true, AF_INET, parse_addr("10.1.2.3", &u), 32); + if (r < 0) { + bool ignore = IN_SET(r, -EPERM, -EOPNOTSUPP, -ENOPROTOOPT); + + log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r, + "Failed to add IPv4 masquerade%s: %m", + ignore ? ", skipping following tests" : ""); + + if (ignore) + return false; + } + assert_se(r >= 0); + + assert_se(fw_add_masquerade(&ctx, true, AF_INET, parse_addr("10.0.2.0", &u), 28) >= 0); + assert_se(fw_add_masquerade(&ctx, false, AF_INET, parse_addr("10.0.2.0", &u), 28) >= 0); + assert_se(fw_add_masquerade(&ctx, false, AF_INET, parse_addr("10.1.2.3", &u), 32) >= 0); + assert_se(fw_add_local_dnat(&ctx, true, AF_INET, IPPROTO_TCP, 4711, parse_addr("1.2.3.4", &u), 815, NULL) >= 0); + assert_se(fw_add_local_dnat(&ctx, true, AF_INET, IPPROTO_TCP, 4711, parse_addr("1.2.3.4", &u), 815, NULL) >= 0); + assert_se(fw_add_local_dnat(&ctx, true, AF_INET, IPPROTO_TCP, 4711, parse_addr("1.2.3.5", &u), 815, parse_addr("1.2.3.4", &v)) >= 0); + assert_se(fw_add_local_dnat(&ctx, false, AF_INET, IPPROTO_TCP, 4711, parse_addr("1.2.3.5", &u), 815, NULL) >= 0); + + return true; +} + +int main(int argc, char *argv[]) { + _cleanup_(fw_ctx_freep) FirewallContext *ctx = NULL; + + test_setup_logging(LOG_DEBUG); + + if (getuid() != 0) + return log_tests_skipped("not root"); + + assert_se(fw_ctx_new(&ctx) >= 0); + assert_se(ctx); + + if (ctx->backend == FW_BACKEND_NONE) + return log_tests_skipped("no firewall backend supported"); + + if (test_v4(ctx) && ctx->backend == FW_BACKEND_NFTABLES) + test_v6(ctx); + +#if 
HAVE_LIBIPTC + if (ctx->backend != FW_BACKEND_IPTABLES) { + ctx->backend = FW_BACKEND_IPTABLES; + test_v4(ctx); + } +#endif + + return 0; +} diff --git a/src/test/test-format-table.c b/src/test/test-format-table.c new file mode 100644 index 0000000..7d544b1 --- /dev/null +++ b/src/test/test-format-table.c @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "alloc-util.h" +#include "format-table.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "time-util.h" + +/* Formats a single six-column row with the table width capped at 75 and compares the rendered text, in which the "created" cell is ellipsized. */ +TEST(issue_9549) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(table = table_new("name", "type", "ro", "usage", "created", "modified")); + assert_se(table_set_align_percent(table, TABLE_HEADER_CELL(3), 100) >= 0); + assert_se(table_add_many(table, + TABLE_STRING, "foooo", + TABLE_STRING, "raw", + TABLE_BOOLEAN, false, + TABLE_SIZE, (uint64_t) (673.7*1024*1024), + TABLE_STRING, "Wed 2018-07-11 00:10:33 JST", + TABLE_STRING, "Wed 2018-07-11 00:16:00 JST") >= 0); + + table_set_width(table, 75); + assert_se(table_format(table, &formatted) >= 0); + + printf("%s\n", formatted); + assert_se(streq(formatted, + "NAME TYPE RO USAGE CREATED MODIFIED\n" + "foooo raw no 673.6M Wed 2018-07-11 00:10:33 J… Wed 2018-07-11 00:16:00 JST\n" + )); +} + +TEST(multiline) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(table = table_new("foo", "bar")); + + assert_se(table_set_align_percent(table, TABLE_HEADER_CELL(1), 100) >= 0); + + assert_se(table_add_many(table, + TABLE_STRING, "three\ndifferent\nlines", + TABLE_STRING, "two\nlines\n") >= 0); + + table_set_cell_height_max(table, 1); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three… two…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 2); + assert_se(table_format(table, &formatted) >= 0); + 
fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different… lines\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 3); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, SIZE_MAX); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n")); + formatted = mfree(formatted); + + assert_se(table_add_many(table, + TABLE_STRING, "short", + TABLE_STRING, "a\npair") >= 0); + + assert_se(table_add_many(table, + TABLE_STRING, "short2\n", + TABLE_STRING, "a\nfour\nline\ncell") >= 0); + + table_set_cell_height_max(table, 1); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three… two…\n" + "short a…\n" + "short2 a…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 2); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different… lines\n" + "short a\n" + " pair\n" + "short2 a\n" + " four…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 3); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n" + "short a\n" + " pair\n" + "short2 a\n" + " four\n" + " line…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, SIZE_MAX); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n" + "short a\n" + " pair\n" + "short2 a\n" + " four\n" + " line\n" + " 
cell\n")); + formatted = mfree(formatted); +} + +TEST(strv) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(table = table_new("foo", "bar")); + + assert_se(table_set_align_percent(table, TABLE_HEADER_CELL(1), 100) >= 0); + + assert_se(table_add_many(table, + TABLE_STRV, STRV_MAKE("three", "different", "lines"), + TABLE_STRV, STRV_MAKE("two", "lines")) >= 0); + + table_set_cell_height_max(table, 1); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three… two…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 2); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different… lines\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 3); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, SIZE_MAX); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n")); + formatted = mfree(formatted); + + assert_se(table_add_many(table, + TABLE_STRING, "short", + TABLE_STRV, STRV_MAKE("a", "pair")) >= 0); + + assert_se(table_add_many(table, + TABLE_STRV, STRV_MAKE("short2"), + TABLE_STRV, STRV_MAKE("a", "four", "line", "cell")) >= 0); + + table_set_cell_height_max(table, 1); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three… two…\n" + "short a…\n" + "short2 a…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 2); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + 
assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different… lines\n" + "short a\n" + " pair\n" + "short2 a\n" + " four…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 3); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n" + "short a\n" + " pair\n" + "short2 a\n" + " four\n" + " line…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, SIZE_MAX); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three two\n" + "different lines\n" + "lines \n" + "short a\n" + " pair\n" + "short2 a\n" + " four\n" + " line\n" + " cell\n")); + formatted = mfree(formatted); +} + +TEST(strv_wrapped) { + _cleanup_(table_unrefp) Table *table = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(table = table_new("foo", "bar")); + + assert_se(table_set_align_percent(table, TABLE_HEADER_CELL(1), 100) >= 0); + + assert_se(table_add_many(table, + TABLE_STRV_WRAPPED, STRV_MAKE("three", "different", "lines"), + TABLE_STRV_WRAPPED, STRV_MAKE("two", "lines")) >= 0); + + table_set_cell_height_max(table, 1); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different lines two lines\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 2); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different lines two lines\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 3); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different lines two lines\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, SIZE_MAX); + 
assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different lines two lines\n")); + formatted = mfree(formatted); + + assert_se(table_add_many(table, + TABLE_STRING, "short", + TABLE_STRV_WRAPPED, STRV_MAKE("a", "pair")) >= 0); + + assert_se(table_add_many(table, + TABLE_STRV_WRAPPED, STRV_MAKE("short2"), + TABLE_STRV_WRAPPED, STRV_MAKE("a", "eight", "line", "ćęłł", + "___5___", "___6___", "___7___", "___8___")) >= 0); + + table_set_cell_height_max(table, 1); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different… two lines\n" + "short a pair\n" + "short2 a eight line ćęłł…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 2); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different two lines\n" + "lines \n" + "short a pair\n" + "short2 a eight line ćęłł\n" + " ___5___ ___6___…\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, 3); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different two lines\n" + "lines \n" + "short a pair\n" + "short2 a eight line ćęłł\n" + " ___5___ ___6___\n" + " ___7___ ___8___\n")); + formatted = mfree(formatted); + + table_set_cell_height_max(table, SIZE_MAX); + assert_se(table_format(table, &formatted) >= 0); + fputs(formatted, stdout); + assert_se(streq(formatted, + "FOO BAR\n" + "three different two lines\n" + "lines \n" + "short a pair\n" + "short2 a eight line ćęłł\n" + " ___5___ ___6___\n" + " ___7___ ___8___\n")); + formatted = mfree(formatted); +} + +TEST(json) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL; + _cleanup_(table_unrefp) Table *t = NULL; + + assert_se(t = table_new("foo bar", "quux", "piep miau")); + 
assert_se(table_set_json_field_name(t, 2, "zzz") >= 0); + + assert_se(table_add_many(t, + TABLE_STRING, "v1", + TABLE_UINT64, UINT64_C(4711), + TABLE_BOOLEAN, true) >= 0); + + assert_se(table_add_many(t, + TABLE_STRV, STRV_MAKE("a", "b", "c"), + TABLE_EMPTY, + TABLE_MODE, 0755) >= 0); + + assert_se(table_to_json(t, &v) >= 0); + + assert_se(json_build(&w, + JSON_BUILD_ARRAY( + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("foo_bar", JSON_BUILD_CONST_STRING("v1")), + JSON_BUILD_PAIR("quux", JSON_BUILD_UNSIGNED(4711)), + JSON_BUILD_PAIR("zzz", JSON_BUILD_BOOLEAN(true))), + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("foo_bar", JSON_BUILD_STRV(STRV_MAKE("a", "b", "c"))), + JSON_BUILD_PAIR("quux", JSON_BUILD_NULL), + JSON_BUILD_PAIR("zzz", JSON_BUILD_UNSIGNED(0755))))) >= 0); + + assert_se(json_variant_equal(v, w)); +} + +TEST(table) { + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(t = table_new("one", "two", "three", "four")); + + assert_se(table_set_align_percent(t, TABLE_HEADER_CELL(3), 100) >= 0); + + assert_se(table_add_many(t, + TABLE_STRING, "xxx", + TABLE_STRING, "yyy", + TABLE_BOOLEAN, true, + TABLE_INT, -1) >= 0); + + assert_se(table_add_many(t, + TABLE_STRING, "a long field", + TABLE_STRING, "yyy", + TABLE_SET_UPPERCASE, 1, + TABLE_BOOLEAN, false, + TABLE_INT, -999999) >= 0); + + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "ONE TWO THREE FOUR\n" + "xxx yyy yes -1\n" + "a long field YYY no -999999\n")); + + formatted = mfree(formatted); + + table_set_width(t, 40); + + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "ONE TWO THREE FOUR\n" + "xxx yyy yes -1\n" + "a long field YYY no -999999\n")); + + formatted = mfree(formatted); + + table_set_width(t, 15); + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "ONE TWO TH… FO…\n" + "xxx yyy yes -1\n" + 
"a … YYY no -9…\n")); + + formatted = mfree(formatted); + + table_set_width(t, 5); + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "… … … …\n" + "… … … …\n" + "… … … …\n")); + + formatted = mfree(formatted); + + table_set_width(t, 3); + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "… … … …\n" + "… … … …\n" + "… … … …\n")); + + formatted = mfree(formatted); + + table_set_width(t, SIZE_MAX); + assert_se(table_set_sort(t, (size_t) 0, (size_t) 2, SIZE_MAX) >= 0); + + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "ONE TWO THREE FOUR\n" + "a long field YYY no -999999\n" + "xxx yyy yes -1\n")); + + formatted = mfree(formatted); + + table_set_header(t, false); + + assert_se(table_add_many(t, + TABLE_STRING, "fäää", + TABLE_STRING, "uuu", + TABLE_BOOLEAN, true, + TABLE_INT, 42) >= 0); + + assert_se(table_add_many(t, + TABLE_STRING, "fäää", + TABLE_STRING, "zzz", + TABLE_BOOLEAN, false, + TABLE_INT, 0) >= 0); + + assert_se(table_add_many(t, + TABLE_EMPTY, + TABLE_SIZE, (uint64_t) 4711, + TABLE_TIMESPAN, (usec_t) 5*USEC_PER_MINUTE, + TABLE_INT64, (int64_t) -123456789) >= 0); /* this cell is expected to render as -123456789 below, so pass it as a signed 64-bit value rather than a wrapped-around unsigned one */ + + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + assert_se(streq(formatted, + "a long field YYY no -999999\n" + "fäää zzz no 0\n" + "fäää uuu yes 42\n" + "xxx yyy yes -1\n" + " 4.6K 5min -123456789\n")); + + formatted = mfree(formatted); + + assert_se(table_set_display(t, (size_t) 2, (size_t) 0, (size_t) 2, (size_t) 0, (size_t) 0, SIZE_MAX) >= 0); + + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + + if (isatty(STDOUT_FILENO)) + assert_se(streq(formatted, + "no a long f… no a long f… a long fi…\n" + "no fäää no fäää fäää\n" + "yes fäää yes fäää fäää\n" + "yes xxx yes xxx xxx\n" + "5min 5min \n")); + else + assert_se(streq(formatted, + "no a long field no a long 
field a long field\n" + "no fäää no fäää fäää\n" + "yes fäää yes fäää fäää\n" + "yes xxx yes xxx xxx\n" + "5min 5min \n")); +} + +TEST(vertical) { + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(t = table_new_vertical()); + + assert_se(table_add_many(t, + TABLE_FIELD, "pfft aa", TABLE_STRING, "foo", + TABLE_FIELD, "uuu o", TABLE_SIZE, UINT64_C(1024), + TABLE_FIELD, "lllllllllllo", TABLE_STRING, "jjjjjjjjjjjjjjjjj") >= 0); + + assert_se(table_set_json_field_name(t, 1, "dimpfelmoser") >= 0); + + assert_se(table_format(t, &formatted) >= 0); + + assert_se(streq(formatted, + " pfft aa: foo\n" + " uuu o: 1.0K\n" + "lllllllllllo: jjjjjjjjjjjjjjjjj\n")); + + _cleanup_(json_variant_unrefp) JsonVariant *a = NULL, *b = NULL; + assert_se(table_to_json(t, &a) >= 0); + + assert_se(json_build(&b, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("pfft_aa", JSON_BUILD_STRING("foo")), + JSON_BUILD_PAIR("dimpfelmoser", JSON_BUILD_UNSIGNED(1024)), + JSON_BUILD_PAIR("lllllllllllo", JSON_BUILD_STRING("jjjjjjjjjjjjjjjjj")))) >= 0); + + assert_se(json_variant_equal(a, b)); +} + +TEST(path_basename) { + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(t = table_new("x")); + + table_set_header(t, false); + + assert_se(table_add_many(t, + TABLE_PATH_BASENAME, "/foo/bar", + TABLE_PATH_BASENAME, "/quux/bar", + TABLE_PATH_BASENAME, "/foo/baz") >= 0); + + assert_se(table_format(t, &formatted) >= 0); + + assert_se(streq(formatted, "bar\nbar\nbaz\n")); +} + +TEST(dup_cell) { + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_free_ char *formatted = NULL; + + assert_se(t = table_new("foo", "bar", "x", "baz", ".", "%", "!", "~", "+")); + table_set_width(t, 75); + + assert_se(table_add_many(t, + TABLE_STRING, "hello", + TABLE_UINT8, UINT8_C(42), + TABLE_UINT16, UINT16_C(666), + TABLE_UINT32, UINT32_C(253), + TABLE_PERCENT, 0, + TABLE_PATH_BASENAME, "/foo/bar", + TABLE_STRING, "aaa", + TABLE_STRING, "bbb", + 
TABLE_STRING, "ccc") >= 0); + + /* Add the second row by duping cells */ + for (size_t i = 0; i < table_get_columns(t); i++) + assert_se(table_dup_cell(t, table_get_cell(t, 1, i)) >= 0); + + /* Another row, but dupe the last three strings from the same cell */ + assert_se(table_add_many(t, + TABLE_STRING, "aaa", + TABLE_UINT8, UINT8_C(0), + TABLE_UINT16, UINT16_C(65535), + TABLE_UINT32, UINT32_C(4294967295), + TABLE_PERCENT, 100, + TABLE_PATH_BASENAME, "../") >= 0); + + for (size_t i = 6; i < table_get_columns(t); i++) + assert_se(table_dup_cell(t, table_get_cell(t, 2, 0)) >= 0); + + assert_se(table_format(t, &formatted) >= 0); + printf("%s\n", formatted); + assert_se(streq(formatted, + "FOO BAR X BAZ . % ! ~ +\n" + "hello 42 666 253 0% bar aaa bbb ccc\n" + "hello 42 666 253 0% bar aaa bbb ccc\n" + "aaa 0 65535 4294967295 100% ../ hello hello hello\n")); +} + +static int intro(void) { + assert_se(setenv("SYSTEMD_COLORS", "0", 1) >= 0); + assert_se(setenv("COLUMNS", "40", 1) >= 0); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-format-util.c b/src/test/test-format-util.c new file mode 100644 index 0000000..3063509 --- /dev/null +++ b/src/test/test-format-util.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "format-util.h" +#include "macro.h" +#include "string-util.h" +#include "tests.h" +#include "uchar.h" + +/* Do some basic checks on STRLEN() and DECIMAL_STR_MAX() */ +assert_cc(STRLEN("") == 0); +assert_cc(STRLEN("a") == 1); +assert_cc(STRLEN("123") == 3); +assert_cc(STRLEN(u8"") == 0); +assert_cc(STRLEN(u8"a") == 1); +assert_cc(STRLEN(u8"123") == 3); +assert_cc(STRLEN(u"") == 0); +assert_cc(STRLEN(u"a") == sizeof(char16_t)); +assert_cc(STRLEN(u"123") == 3 * sizeof(char16_t)); +assert_cc(STRLEN(U"") == 0); +assert_cc(STRLEN(U"a") == sizeof(char32_t)); +assert_cc(STRLEN(U"123") == 3 * sizeof(char32_t)); +assert_cc(STRLEN(L"") == 0); +assert_cc(STRLEN(L"a") == sizeof(wchar_t)); 
+assert_cc(STRLEN(L"123") == 3 * sizeof(wchar_t)); +assert_cc(DECIMAL_STR_MAX(uint8_t) == STRLEN("255")+1); +assert_cc(DECIMAL_STR_MAX(int8_t) == STRLEN("-127")+1); +assert_cc(DECIMAL_STR_MAX(uint64_t) == STRLEN("18446744073709551615")+1); +assert_cc(DECIMAL_STR_MAX(int64_t) == CONST_MAX(STRLEN("-9223372036854775808"), STRLEN("9223372036854775807"))+1); +assert_cc(DECIMAL_STR_MAX(signed char) == STRLEN("-127")+1); +assert_cc(DECIMAL_STR_MAX(unsigned char) == STRLEN("255")+1); +assert_cc(CONST_MAX(DECIMAL_STR_MAX(int8_t), STRLEN("xxx")) == 5); + +static void test_format_bytes_one(uint64_t val, bool trailing_B, const char *iec_with_p, const char *iec_without_p, + const char *si_with_p, const char *si_without_p) { + char buf[FORMAT_BYTES_MAX]; + + assert_se(streq_ptr(format_bytes_full(buf, sizeof buf, val, FORMAT_BYTES_USE_IEC | FORMAT_BYTES_BELOW_POINT | (trailing_B ? FORMAT_BYTES_TRAILING_B : 0)), iec_with_p)); + assert_se(streq_ptr(format_bytes_full(buf, sizeof buf, val, FORMAT_BYTES_USE_IEC | (trailing_B ? FORMAT_BYTES_TRAILING_B : 0)), iec_without_p)); + assert_se(streq_ptr(format_bytes_full(buf, sizeof buf, val, FORMAT_BYTES_BELOW_POINT | (trailing_B ? FORMAT_BYTES_TRAILING_B : 0)), si_with_p)); + assert_se(streq_ptr(format_bytes_full(buf, sizeof buf, val, trailing_B ? 
FORMAT_BYTES_TRAILING_B : 0), si_without_p)); +} + +TEST(format_bytes) { + test_format_bytes_one(900, true, "900B", "900B", "900B", "900B"); + test_format_bytes_one(900, false, "900", "900", "900", "900"); + test_format_bytes_one(1023, true, "1023B", "1023B", "1.0K", "1K"); + test_format_bytes_one(1023, false, "1023", "1023", "1.0K", "1K"); + test_format_bytes_one(1024, true, "1.0K", "1K", "1.0K", "1K"); + test_format_bytes_one(1024, false, "1.0K", "1K", "1.0K", "1K"); + test_format_bytes_one(1100, true, "1.0K", "1K", "1.1K", "1K"); + test_format_bytes_one(1500, true, "1.4K", "1K", "1.5K", "1K"); + test_format_bytes_one(UINT64_C(3)*1024*1024, true, "3.0M", "3M", "3.1M", "3M"); + test_format_bytes_one(UINT64_C(3)*1024*1024*1024, true, "3.0G", "3G", "3.2G", "3G"); + test_format_bytes_one(UINT64_C(3)*1024*1024*1024*1024, true, "3.0T", "3T", "3.2T", "3T"); + test_format_bytes_one(UINT64_C(3)*1024*1024*1024*1024*1024, true, "3.0P", "3P", "3.3P", "3P"); + test_format_bytes_one(UINT64_C(3)*1024*1024*1024*1024*1024*1024, true, "3.0E", "3E", "3.4E", "3E"); + test_format_bytes_one(UINT64_MAX, true, NULL, NULL, NULL, NULL); + test_format_bytes_one(UINT64_MAX, false, NULL, NULL, NULL, NULL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c new file mode 100644 index 0000000..ef335b4 --- /dev/null +++ b/src/test/test-fs-util.c @@ -0,0 +1,796 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "copy.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "macro.h" +#include "mkdir.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "sync-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "virt.h" + +static const 
char *arg_test_dir = NULL; + +TEST(readlink_and_make_absolute) { + const char *tempdir, *name, *name2, *name_alias; + _cleanup_free_ char *r1 = NULL, *r2 = NULL, *pwd = NULL; + + tempdir = strjoina(arg_test_dir ?: "/tmp", "/test-readlink_and_make_absolute"); + name = strjoina(tempdir, "/original"); + name2 = "test-readlink_and_make_absolute/original"; + name_alias = strjoina(arg_test_dir ?: "/tmp", "/test-readlink_and_make_absolute-alias"); + + assert_se(mkdir_safe(tempdir, 0755, getuid(), getgid(), MKDIR_WARN_MODE) >= 0); + assert_se(touch(name) >= 0); + + if (symlink(name, name_alias) < 0) { + assert_se(IN_SET(errno, EINVAL, ENOSYS, ENOTTY, EPERM)); + log_tests_skipped_errno(errno, "symlink() not possible"); + } else { + assert_se(readlink_and_make_absolute(name_alias, &r1) >= 0); + assert_se(streq(r1, name)); + assert_se(unlink(name_alias) >= 0); + + assert_se(safe_getcwd(&pwd) >= 0); + + assert_se(chdir(tempdir) >= 0); + assert_se(symlink(name2, name_alias) >= 0); + assert_se(readlink_and_make_absolute(name_alias, &r2) >= 0); + assert_se(streq(r2, name)); + assert_se(unlink(name_alias) >= 0); + + assert_se(chdir(pwd) >= 0); + } + + assert_se(rm_rf(tempdir, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); +} + +TEST(get_files_in_directory) { + _cleanup_strv_free_ char **l = NULL, **t = NULL; + + assert_se(get_files_in_directory(arg_test_dir ?: "/tmp", &l) >= 0); + assert_se(get_files_in_directory(".", &t) >= 0); + assert_se(get_files_in_directory(".", NULL) >= 0); +} + +TEST(var_tmp) { + _cleanup_free_ char *tmpdir_backup = NULL, *temp_backup = NULL, *tmp_backup = NULL; + const char *tmp_dir = NULL, *t; + + t = getenv("TMPDIR"); + if (t) { + tmpdir_backup = strdup(t); + assert_se(tmpdir_backup); + } + + t = getenv("TEMP"); + if (t) { + temp_backup = strdup(t); + assert_se(temp_backup); + } + + t = getenv("TMP"); + if (t) { + tmp_backup = strdup(t); + assert_se(tmp_backup); + } + + assert_se(unsetenv("TMPDIR") >= 0); + assert_se(unsetenv("TEMP") >= 0); + 
assert_se(unsetenv("TMP") >= 0); + + assert_se(var_tmp_dir(&tmp_dir) >= 0); + assert_se(streq(tmp_dir, "/var/tmp")); + + assert_se(setenv("TMPDIR", "/tmp", true) >= 0); + assert_se(streq(getenv("TMPDIR"), "/tmp")); + + assert_se(var_tmp_dir(&tmp_dir) >= 0); + assert_se(streq(tmp_dir, "/tmp")); + + assert_se(setenv("TMPDIR", "/88_does_not_exist_88", true) >= 0); + assert_se(streq(getenv("TMPDIR"), "/88_does_not_exist_88")); + + assert_se(var_tmp_dir(&tmp_dir) >= 0); + assert_se(streq(tmp_dir, "/var/tmp")); + + if (tmpdir_backup) { + assert_se(setenv("TMPDIR", tmpdir_backup, true) >= 0); + assert_se(streq(getenv("TMPDIR"), tmpdir_backup)); + } + + if (temp_backup) { + assert_se(setenv("TEMP", temp_backup, true) >= 0); + assert_se(streq(getenv("TEMP"), temp_backup)); + } + + if (tmp_backup) { + assert_se(setenv("TMP", tmp_backup, true) >= 0); + assert_se(streq(getenv("TMP"), tmp_backup)); + } +} + +TEST(dot_or_dot_dot) { + assert_se(!dot_or_dot_dot(NULL)); + assert_se(!dot_or_dot_dot("")); + assert_se(!dot_or_dot_dot("xxx")); + assert_se(dot_or_dot_dot(".")); + assert_se(dot_or_dot_dot("..")); + assert_se(!dot_or_dot_dot(".foo")); + assert_se(!dot_or_dot_dot("..foo")); +} + +TEST(access_fd) { + _cleanup_(rmdir_and_freep) char *p = NULL; + _cleanup_close_ int fd = -EBADF; + const char *a; + + a = strjoina(arg_test_dir ?: "/tmp", "/access-fd.XXXXXX"); + assert_se(mkdtemp_malloc(a, &p) >= 0); + + fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC); + assert_se(fd >= 0); + + assert_se(access_fd(fd, R_OK) >= 0); + assert_se(access_fd(fd, F_OK) >= 0); + assert_se(access_fd(fd, W_OK) >= 0); + + assert_se(fchmod(fd, 0000) >= 0); + + assert_se(access_fd(fd, F_OK) >= 0); + + if (geteuid() == 0) { + assert_se(access_fd(fd, R_OK) >= 0); + assert_se(access_fd(fd, W_OK) >= 0); + } else { + assert_se(access_fd(fd, R_OK) == -EACCES); + assert_se(access_fd(fd, W_OK) == -EACCES); + } +} + +TEST(touch_file) { + uid_t test_uid, test_gid; + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + 
struct stat st; + const char *a; + usec_t test_mtime; + int r; + + test_uid = geteuid() == 0 ? 65534 : getuid(); + test_gid = geteuid() == 0 ? 65534 : getgid(); + + test_mtime = usec_sub_unsigned(now(CLOCK_REALTIME), USEC_PER_WEEK); + + a = strjoina(arg_test_dir ?: "/dev/shm", "/touch-file-XXXXXX"); + assert_se(mkdtemp_malloc(a, &p) >= 0); + + a = strjoina(p, "/regular"); + r = touch_file(a, false, test_mtime, test_uid, test_gid, 0640); + if (r < 0) { + assert_se(IN_SET(r, -EINVAL, -ENOSYS, -ENOTTY, -EPERM)); + log_tests_skipped_errno(r, "touch_file() not possible"); /* touch_file() reports failure via its negative return value; errno may be stale here */ + return; + } + + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISREG(st.st_mode)); + assert_se((st.st_mode & 0777) == 0640); + assert_se(timespec_load(&st.st_mtim) == test_mtime); + + a = strjoina(p, "/dir"); + assert_se(mkdir(a, 0775) >= 0); + assert_se(touch_file(a, false, test_mtime, test_uid, test_gid, 0640) >= 0); + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 0777) == 0640); + assert_se(timespec_load(&st.st_mtim) == test_mtime); + + a = strjoina(p, "/fifo"); + assert_se(mkfifo(a, 0775) >= 0); + assert_se(touch_file(a, false, test_mtime, test_uid, test_gid, 0640) >= 0); + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISFIFO(st.st_mode)); + assert_se((st.st_mode & 0777) == 0640); + assert_se(timespec_load(&st.st_mtim) == test_mtime); + + a = strjoina(p, "/sock"); + assert_se(mknod(a, 0775 | S_IFSOCK, 0) >= 0); + assert_se(touch_file(a, false, test_mtime, test_uid, test_gid, 0640) >= 0); + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISSOCK(st.st_mode)); + assert_se((st.st_mode & 0777) == 0640); + assert_se(timespec_load(&st.st_mtim) == test_mtime); + + if (geteuid() 
== 0) { + a = strjoina(p, "/bdev"); + r = mknod(a, 0775 | S_IFBLK, makedev(0, 0)); + if (r < 0 && errno == EPERM && detect_container() > 0) { + log_notice("Running in unprivileged container? Skipping remaining tests in %s", __func__); + return; + } + assert_se(r >= 0); + assert_se(touch_file(a, false, test_mtime, test_uid, test_gid, 0640) >= 0); + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISBLK(st.st_mode)); + assert_se((st.st_mode & 0777) == 0640); + assert_se(timespec_load(&st.st_mtim) == test_mtime); + + a = strjoina(p, "/cdev"); + assert_se(mknod(a, 0775 | S_IFCHR, makedev(0, 0)) >= 0); + assert_se(touch_file(a, false, test_mtime, test_uid, test_gid, 0640) >= 0); + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISCHR(st.st_mode)); + assert_se((st.st_mode & 0777) == 0640); + assert_se(timespec_load(&st.st_mtim) == test_mtime); + } + + a = strjoina(p, "/lnk"); + assert_se(symlink("target", a) >= 0); + assert_se(touch_file(a, false, test_mtime, test_uid, test_gid, 0640) >= 0); + assert_se(lstat(a, &st) >= 0); + assert_se(st.st_uid == test_uid); + assert_se(st.st_gid == test_gid); + assert_se(S_ISLNK(st.st_mode)); + assert_se(timespec_load(&st.st_mtim) == test_mtime); +} + +TEST(unlinkat_deallocate) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + + assert_se(tempfn_random_child(arg_test_dir, "unlink-deallocation", &p) >= 0); + + fd = open(p, O_WRONLY|O_CLOEXEC|O_CREAT|O_EXCL, 0600); + assert_se(fd >= 0); + + assert_se(write(fd, "hallo\n", 6) == 6); + + assert_se(fstat(fd, &st) >= 0); + assert_se(st.st_size == 6); + assert_se(st.st_blocks > 0); + assert_se(st.st_nlink == 1); + + assert_se(unlinkat_deallocate(AT_FDCWD, p, UNLINK_ERASE) >= 0); + + assert_se(fstat(fd, &st) >= 0); + assert_se(IN_SET(st.st_size, 0, 6)); /* depending on whether hole punching worked the size will be 6 + 
(it worked) or 0 (we had to resort to truncation) */ + assert_se(st.st_blocks == 0); + assert_se(st.st_nlink == 0); +} + +TEST(fsync_directory_of_file) { + _cleanup_close_ int fd = -EBADF; + + fd = open_tmpfile_unlinkable(arg_test_dir, O_RDWR); + assert_se(fd >= 0); + + assert_se(fsync_directory_of_file(fd) >= 0); +} + +TEST(rename_noreplace) { + static const char* const table[] = { + "/reg", + "/dir", + "/fifo", + "/socket", + "/symlink", + NULL + }; + + _cleanup_(rm_rf_physical_and_freep) char *z = NULL; + const char *j = NULL; + + if (arg_test_dir) + j = strjoina(arg_test_dir, "/testXXXXXX"); + assert_se(mkdtemp_malloc(j, &z) >= 0); + + j = strjoina(z, table[0]); + assert_se(touch(j) >= 0); + + j = strjoina(z, table[1]); + assert_se(mkdir(j, 0777) >= 0); + + j = strjoina(z, table[2]); + (void) mkfifo(j, 0777); + + j = strjoina(z, table[3]); + (void) mknod(j, S_IFSOCK | 0777, 0); + + j = strjoina(z, table[4]); + (void) symlink("foobar", j); + + STRV_FOREACH(a, table) { + _cleanup_free_ char *x = NULL, *y = NULL; + + x = strjoin(z, *a); + assert_se(x); + + if (access(x, F_OK) < 0) { + assert_se(errno == ENOENT); + continue; + } + + STRV_FOREACH(b, table) { + _cleanup_free_ char *w = NULL; + + w = strjoin(z, *b); + assert_se(w); + + if (access(w, F_OK) < 0) { + assert_se(errno == ENOENT); + continue; + } + + assert_se(rename_noreplace(AT_FDCWD, x, AT_FDCWD, w) == -EEXIST); + } + + y = strjoin(z, "/somethingelse"); + assert_se(y); + + assert_se(rename_noreplace(AT_FDCWD, x, AT_FDCWD, y) >= 0); + assert_se(rename_noreplace(AT_FDCWD, y, AT_FDCWD, x) >= 0); + } +} + +TEST(chmod_and_chown) { + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + struct stat st; + const char *p; + + if (geteuid() != 0) + return; + + BLOCK_WITH_UMASK(0000); + + assert_se(mkdtemp_malloc(NULL, &d) >= 0); + + p = strjoina(d, "/reg"); + assert_se(mknod(p, S_IFREG | 0123, 0) >= 0); + + assert_se(chmod_and_chown(p, S_IFREG | 0321, 1, 2) >= 0); + assert_se(chmod_and_chown(p, S_IFDIR | 0555, 3, 
4) == -EINVAL); + + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISREG(st.st_mode)); + assert_se((st.st_mode & 07777) == 0321); + + p = strjoina(d, "/dir"); + assert_se(mkdir(p, 0123) >= 0); + + assert_se(chmod_and_chown(p, S_IFDIR | 0321, 1, 2) >= 0); + assert_se(chmod_and_chown(p, S_IFREG | 0555, 3, 4) == -EINVAL); + + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0321); + + p = strjoina(d, "/lnk"); + assert_se(symlink("idontexist", p) >= 0); + + assert_se(chmod_and_chown(p, S_IFLNK | 0321, 1, 2) >= 0); + assert_se(chmod_and_chown(p, S_IFREG | 0555, 3, 4) == -EINVAL); + assert_se(chmod_and_chown(p, S_IFDIR | 0555, 3, 4) == -EINVAL); + + assert_se(lstat(p, &st) >= 0); + assert_se(S_ISLNK(st.st_mode)); +} + +static void create_binary_file(const char *p, const void *data, size_t l) { + _cleanup_close_ int fd = -EBADF; + + fd = open(p, O_CREAT|O_WRONLY|O_EXCL|O_CLOEXEC, 0600); + assert_se(fd >= 0); + assert_se(write(fd, data, l) == (ssize_t) l); +} + +TEST(conservative_rename) { + _cleanup_(unlink_and_freep) char *p = NULL; + _cleanup_free_ char *q = NULL; + size_t l = 16*1024 + random_u64() % (32 * 1024); /* some randomly sized buffer 16k…48k */ + uint8_t buffer[l+1]; + + random_bytes(buffer, l); + + assert_se(tempfn_random_child(NULL, NULL, &p) >= 0); + create_binary_file(p, buffer, l); + + assert_se(tempfn_random_child(NULL, NULL, &q) >= 0); + + /* Check that the hardlinked "copy" is detected */ + assert_se(link(p, q) >= 0); + assert_se(conservative_renameat(AT_FDCWD, q, AT_FDCWD, p) == 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); + + /* Check that a manual copy is detected */ + assert_se(copy_file(p, q, 0, MODE_INVALID, COPY_REFLINK) >= 0); + assert_se(conservative_renameat(AT_FDCWD, q, AT_FDCWD, p) == 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); + + /* Check that a manual new writeout is also detected */ + create_binary_file(q, buffer, l); + assert_se(conservative_renameat(AT_FDCWD, 
q, AT_FDCWD, p) == 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); + + /* Check that a minimally changed version is detected */ + buffer[47] = ~buffer[47]; + create_binary_file(q, buffer, l); + assert_se(conservative_renameat(AT_FDCWD, q, AT_FDCWD, p) > 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); + + /* Check that this really is new updated version */ + create_binary_file(q, buffer, l); + assert_se(conservative_renameat(AT_FDCWD, q, AT_FDCWD, p) == 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); + + /* Make sure we detect extended files */ + buffer[l++] = 47; + create_binary_file(q, buffer, l); + assert_se(conservative_renameat(AT_FDCWD, q, AT_FDCWD, p) > 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); + + /* Make sure we detect truncated files */ + l--; + create_binary_file(q, buffer, l); + assert_se(conservative_renameat(AT_FDCWD, q, AT_FDCWD, p) > 0); + assert_se(access(q, F_OK) < 0 && errno == ENOENT); +} + +static void test_rmdir_parents_one( + const char *prefix, + const char *path, + const char *stop, + int expected, + const char *test_exist, + const char *test_nonexist_subdir) { + + const char *p, *s; + + log_debug("/* %s(%s, %s) */", __func__, path, stop); + + p = strjoina(prefix, path); + s = strjoina(prefix, stop); + + if (expected >= 0) + assert_se(mkdir_parents(p, 0700) >= 0); + + assert_se(rmdir_parents(p, s) == expected); + + if (expected >= 0) { + const char *e, *f; + + e = strjoina(prefix, test_exist); + f = strjoina(e, test_nonexist_subdir); + + assert_se(access(e, F_OK) >= 0); + assert_se(access(f, F_OK) < 0); + } +} + +TEST(rmdir_parents) { + char *temp; + + temp = strjoina(arg_test_dir ?: "/tmp", "/test-rmdir.XXXXXX"); + assert_se(mkdtemp(temp)); + + test_rmdir_parents_one(temp, "/aaa/../hoge/foo", "/hoge/foo", -EINVAL, NULL, NULL); + test_rmdir_parents_one(temp, "/aaa/bbb/ccc", "/hoge/../aaa", -EINVAL, NULL, NULL); + + test_rmdir_parents_one(temp, "/aaa/bbb/ccc/ddd/eee", "/aaa/bbb/ccc/ddd", 0, 
"/aaa/bbb/ccc/ddd", "/eee"); + test_rmdir_parents_one(temp, "/aaa/bbb/ccc/ddd/eee", "/aaa/bbb/ccc", 0, "/aaa/bbb/ccc", "/ddd"); + test_rmdir_parents_one(temp, "/aaa/bbb/ccc/ddd/eee", "/aaa/bbb", 0, "/aaa/bbb", "/ccc"); + test_rmdir_parents_one(temp, "/aaa/bbb/ccc/ddd/eee", "/aaa", 0, "/aaa", "/bbb"); + test_rmdir_parents_one(temp, "/aaa/bbb/ccc/ddd/eee", "/", 0, "/", "/aaa"); + + test_rmdir_parents_one(temp, "/aaa/bbb/ccc/ddd/eee", "/aaa/hoge/foo", 0, "/aaa", "/bbb"); + test_rmdir_parents_one(temp, "/aaa////bbb/.//ccc//ddd/eee///./.", "///././aaa/.", 0, "/aaa", "/bbb"); + + assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); +} + +static void test_parse_cifs_service_one(const char *f, const char *h, const char *s, const char *d, int ret) { + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL; + + assert_se(parse_cifs_service(f, &a, &b, &c) == ret); + assert_se(streq_ptr(a, h)); + assert_se(streq_ptr(b, s)); + assert_se(streq_ptr(c, d)); +} + +TEST(parse_cifs_service) { + test_parse_cifs_service_one("//foo/bar/baz", "foo", "bar", "baz", 0); + test_parse_cifs_service_one("\\\\foo\\bar\\baz", "foo", "bar", "baz", 0); + test_parse_cifs_service_one("//foo/bar", "foo", "bar", NULL, 0); + test_parse_cifs_service_one("\\\\foo\\bar", "foo", "bar", NULL, 0); + test_parse_cifs_service_one("//foo/bar/baz/uuu", "foo", "bar", "baz/uuu", 0); + test_parse_cifs_service_one("\\\\foo\\bar\\baz\\uuu", "foo", "bar", "baz/uuu", 0); + + test_parse_cifs_service_one(NULL, NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("", NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("abc", NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("abc/cde/efg", NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("//foo/bar/baz/..", NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("//foo///", NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("//foo/.", NULL, NULL, NULL, -EINVAL); + test_parse_cifs_service_one("//foo/a/.", NULL, NULL, NULL, -EINVAL); + 
test_parse_cifs_service_one("//./a", NULL, NULL, NULL, -EINVAL); +} + +TEST(open_mkdir_at) { + _cleanup_close_ int fd = -EBADF, subdir_fd = -EBADF, subsubdir_fd = -EBADF; + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + struct stat sta, stb; + + assert_se(open_mkdir_at(AT_FDCWD, "/", O_EXCL|O_CLOEXEC, 0) == -EEXIST); + assert_se(open_mkdir_at(AT_FDCWD, ".", O_EXCL|O_CLOEXEC, 0) == -EEXIST); + + fd = open_mkdir_at(AT_FDCWD, "/", O_CLOEXEC, 0); + assert_se(fd >= 0); + assert_se(stat("/", &sta) >= 0); + assert_se(fstat(fd, &stb) >= 0); + assert_se(stat_inode_same(&sta, &stb)); + fd = safe_close(fd); + + fd = open_mkdir_at(AT_FDCWD, ".", O_CLOEXEC, 0); + assert_se(stat(".", &sta) >= 0); + assert_se(fstat(fd, &stb) >= 0); + assert_se(stat_inode_same(&sta, &stb)); + fd = safe_close(fd); + + assert_se(open_mkdir_at(AT_FDCWD, "/proc", O_EXCL|O_CLOEXEC, 0) == -EEXIST); + + fd = open_mkdir_at(AT_FDCWD, "/proc", O_CLOEXEC, 0); + assert_se(fd >= 0); + fd = safe_close(fd); + + assert_se(open_mkdir_at(AT_FDCWD, "/bin/sh", O_EXCL|O_CLOEXEC, 0) == -EEXIST); + assert_se(open_mkdir_at(AT_FDCWD, "/bin/sh", O_CLOEXEC, 0) == -EEXIST); + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + + assert_se(open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0) == -EEXIST); + assert_se(open_mkdir_at(AT_FDCWD, t, O_PATH|O_EXCL|O_CLOEXEC, 0) == -EEXIST); + + fd = open_mkdir_at(AT_FDCWD, t, O_CLOEXEC, 0000); + assert_se(fd >= 0); + fd = safe_close(fd); + + fd = open_mkdir_at(AT_FDCWD, t, O_PATH|O_CLOEXEC, 0000); + assert_se(fd >= 0); + + subdir_fd = open_mkdir_at(fd, "xxx", O_PATH|O_EXCL|O_CLOEXEC, 0700); + assert_se(subdir_fd >= 0); + + assert_se(open_mkdir_at(fd, "xxx", O_PATH|O_EXCL|O_CLOEXEC, 0) == -EEXIST); + + subsubdir_fd = open_mkdir_at(subdir_fd, "yyy", O_EXCL|O_CLOEXEC, 0700); + assert_se(subsubdir_fd >= 0); + subsubdir_fd = safe_close(subsubdir_fd); + + assert_se(open_mkdir_at(subdir_fd, "yyy", O_EXCL|O_CLOEXEC, 0) == -EEXIST); + + assert_se(open_mkdir_at(fd, "xxx/yyy", O_EXCL|O_CLOEXEC, 0) 
== -EEXIST); + + subsubdir_fd = open_mkdir_at(fd, "xxx/yyy", O_CLOEXEC, 0700); + assert_se(subsubdir_fd >= 0); +} + +TEST(openat_report_new) { + _cleanup_free_ char *j = NULL; + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + _cleanup_close_ int fd = -EBADF; + bool b; + + assert_se(mkdtemp_malloc(NULL, &d) >= 0); + + j = path_join(d, "test"); + assert_se(j); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(b); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(!b); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(!b); + + assert_se(unlink(j) >= 0); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(b); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(!b); + + assert_se(unlink(j) >= 0); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, NULL); + assert_se(fd >= 0); + fd = safe_close(fd); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(!b); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(!b); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT|O_EXCL, 0666, &b); + assert_se(fd == -EEXIST); + + assert_se(unlink(j) >= 0); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR, 0666, &b); + assert_se(fd == -ENOENT); + + fd = openat_report_new(AT_FDCWD, j, O_RDWR|O_CREAT|O_EXCL, 0666, &b); + assert_se(fd >= 0); + fd = safe_close(fd); + assert_se(b); +} + +TEST(xopenat) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF, fd2 = -EBADF; + + assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); + + /* Test 
that xopenat() creates directories if O_DIRECTORY is specified. */ + + assert_se((fd = xopenat(tfd, "abc", O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, 0, 0755)) >= 0); + assert_se((fd_verify_directory(fd) >= 0)); + fd = safe_close(fd); + + assert_se(xopenat(tfd, "abc", O_DIRECTORY|O_CREAT|O_EXCL|O_CLOEXEC, 0, 0755) == -EEXIST); + + assert_se((fd = xopenat(tfd, "abc", O_DIRECTORY|O_CREAT|O_CLOEXEC, 0, 0755)) >= 0); + assert_se((fd_verify_directory(fd) >= 0)); + fd = safe_close(fd); + + /* Test that xopenat() creates regular files if O_DIRECTORY is not specified. */ + + assert_se((fd = xopenat(tfd, "def", O_CREAT|O_EXCL|O_CLOEXEC, 0, 0644)) >= 0); + assert_se(fd_verify_regular(fd) >= 0); + fd = safe_close(fd); + + /* Test that we can reopen an existing fd with xopenat() by specifying an empty path. */ + + assert_se((fd = xopenat(tfd, "def", O_PATH|O_CLOEXEC, 0, 0)) >= 0); + assert_se((fd2 = xopenat(fd, "", O_RDWR|O_CLOEXEC, 0, 0644)) >= 0); +} + +TEST(xopenat_lock) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF; + siginfo_t si; + + assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); + + /* Test that we can acquire an exclusive lock on a directory in one process, remove the directory, + * and close the file descriptor and still properly create the directory and acquire the lock in + * another process. 
*/ + + fd = xopenat_lock(tfd, "abc", O_CREAT|O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX); + assert_se(fd >= 0); + assert_se(faccessat(tfd, "abc", F_OK, 0) >= 0); + assert_se(fd_verify_directory(fd) >= 0); + assert_se(xopenat_lock(tfd, "abc", O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + + pid_t pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + safe_close(fd); + + fd = xopenat_lock(tfd, "abc", O_CREAT|O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX); + assert_se(fd >= 0); + assert_se(faccessat(tfd, "abc", F_OK, 0) >= 0); + assert_se(fd_verify_directory(fd) >= 0); + assert_se(xopenat_lock(tfd, "abc", O_DIRECTORY|O_CLOEXEC, 0, 0755, LOCK_BSD, LOCK_EX|LOCK_NB) == -EAGAIN); + + _exit(EXIT_SUCCESS); + } + + /* We need to give the child process some time to get past the xopenat() call in xopenat_lock() and + * block in the call to lock_generic() waiting for the lock to become free. We can't modify + * xopenat_lock() to signal an eventfd to let us know when that has happened, so we just sleep for a + * little and assume that's enough time for the child process to get along far enough. It doesn't + * matter if it doesn't get far enough, in that case we just won't trigger the fallback logic in + * xopenat_lock(), but the test will still succeed. 
*/ + assert_se(usleep_safe(20 * USEC_PER_MSEC) >= 0); + + assert_se(unlinkat(tfd, "abc", AT_REMOVEDIR) >= 0); + fd = safe_close(fd); + + assert_se(wait_for_terminate(pid, &si) >= 0); + assert_se(si.si_code == CLD_EXITED); + + assert_se(xopenat_lock(tfd, "abc", 0, 0, 0755, LOCK_POSIX, LOCK_EX) == -EBADF); + assert_se(xopenat_lock(tfd, "def", O_DIRECTORY, 0, 0755, LOCK_POSIX, LOCK_EX) == -EBADF); +} + +static int intro(void) { + arg_test_dir = saved_argv[1]; + return EXIT_SUCCESS; +} + +TEST(readlinkat_malloc) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *p = NULL, *q = NULL; + const char *expect = "hgoehogefoobar"; + + tfd = mkdtemp_open(NULL, O_PATH, &t); + assert_se(tfd >= 0); + + assert_se(symlinkat(expect, tfd, "linkname") >= 0); + + assert_se(readlinkat_malloc(tfd, "linkname", &p) >= 0); + assert_se(streq(p, expect)); + p = mfree(p); + + fd = openat(tfd, "linkname", O_PATH | O_NOFOLLOW | O_CLOEXEC); + assert_se(fd >= 0); + assert_se(readlinkat_malloc(fd, NULL, &p) >= 0); + assert_se(streq(p, expect)); + p = mfree(p); + assert_se(readlinkat_malloc(fd, "", &p) >= 0); + assert_se(streq(p, expect)); + p = mfree(p); + fd = safe_close(fd); + + assert_se(q = path_join(t, "linkname")); + assert_se(readlinkat_malloc(AT_FDCWD, q, &p) >= 0); + assert_se(streq(p, expect)); + p = mfree(p); + assert_se(readlinkat_malloc(INT_MAX, q, &p) >= 0); + assert_se(streq(p, expect)); + p = mfree(p); + q = mfree(q); +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-fstab-util.c b/src/test/test-fstab-util.c new file mode 100644 index 0000000..89365b0 --- /dev/null +++ b/src/test/test-fstab-util.c @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fstab-util.h" +#include "log.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +/* +int fstab_filter_options( + const char *opts, + const char 
*names, + const char **ret_namefound, + const char **ret_value, + const char **ret_values, + char **ret_filtered); +*/ + +static void do_fstab_filter_options(const char *opts, + const char *remove, + int r_expected, + int r_values_expected, + const char *name_expected, + const char *value_expected, + const char *values_expected, + const char *filtered_expected) { + int r; + const char *name; + _cleanup_free_ char *value = NULL, *filtered = NULL, *joined = NULL; + _cleanup_strv_free_ char **values = NULL; + + /* test mode which returns the last value */ + + r = fstab_filter_options(opts, remove, &name, &value, NULL, &filtered); + log_info("1: \"%s\" → %d, \"%s\", \"%s\", \"%s\", expected %d, \"%s\", \"%s\", \"%s\"", + opts, r, strnull(name), value, filtered, + r_expected, strnull(name_expected), strnull(value_expected), filtered_expected ?: opts); + assert_se(r == r_expected); + assert_se(streq_ptr(name, name_expected)); + assert_se(streq_ptr(value, value_expected)); + assert_se(streq_ptr(filtered, filtered_expected ?: opts)); + + /* test mode which returns all the values */ + + r = fstab_filter_options(opts, remove, &name, NULL, &values, NULL); + assert_se(joined = strv_join(values, ":")); + log_info("2: \"%s\" → %d, \"%s\", \"%s\", expected %d, \"%s\", \"%s\"", + opts, r, strnull(name), joined, + r_values_expected, strnull(name_expected), strnull(values_expected)); + assert_se(r == r_values_expected); + assert_se(streq_ptr(name, r_values_expected > 0 ? 
name_expected : NULL)); + assert_se(streq_ptr(joined, values_expected)); + + /* also test the malloc-less mode */ + r = fstab_filter_options(opts, remove, &name, NULL, NULL, NULL); + log_info("3: \"%s\" → %d, \"%s\", expected %d, \"%s\"\n-", + opts, r, strnull(name), + r_expected, strnull(name_expected)); + assert_se(r == r_expected); + assert_se(streq_ptr(name, name_expected)); +} + +TEST(fstab_filter_options) { + do_fstab_filter_options("opt=0", "opt\0x-opt\0", 1, 1, "opt", "0", "0", ""); + do_fstab_filter_options("opt=0", "x-opt\0opt\0", 1, 1, "opt", "0", "0", ""); + do_fstab_filter_options("opt", "opt\0x-opt\0", 1, 0, "opt", NULL, "", ""); + do_fstab_filter_options("opt", "x-opt\0opt\0", 1, 0, "opt", NULL, "", ""); + do_fstab_filter_options("x-opt", "x-opt\0opt\0", 1, 0, "x-opt", NULL, "", ""); + + do_fstab_filter_options("opt=0,other", "opt\0x-opt\0", 1, 1, "opt", "0", "0", "other"); + do_fstab_filter_options("opt=0,other", "x-opt\0opt\0", 1, 1, "opt", "0", "0", "other"); + do_fstab_filter_options("opt,other", "opt\0x-opt\0", 1, 0, "opt", NULL, "", "other"); + do_fstab_filter_options("opt,other", "x-opt\0opt\0", 1, 0, "opt", NULL, "", "other"); + do_fstab_filter_options("x-opt,other", "opt\0x-opt\0", 1, 0, "x-opt", NULL, "", "other"); + + do_fstab_filter_options("opt=0\\,1,other", "opt\0x-opt\0", 1, 1, "opt", "0,1", "0,1", "other"); + do_fstab_filter_options("opt=0,other,x-opt\\,foobar", "x-opt\0opt\0", 1, 1, "opt", "0", "0", "other,x-opt\\,foobar"); + do_fstab_filter_options("opt,other,x-opt\\,part", "opt\0x-opt\0", 1, 0, "opt", NULL, "", "other,x-opt\\,part"); + do_fstab_filter_options("opt,other,part\\,x-opt", "x-opt\0opt\0", 1, 0, "opt", NULL, "", "other,part\\,x-opt"); + do_fstab_filter_options("opt,other\\,\\,\\,opt,x-part", "opt\0x-opt\0", 1, 0, "opt", NULL, "", "other\\,\\,\\,opt,x-part"); + + do_fstab_filter_options("opto=0,other", "opt\0x-opt\0", 0, 0, NULL, NULL, "", NULL); + do_fstab_filter_options("opto,other", "opt\0x-opt\0", 0, 0, NULL, NULL, 
"", NULL); + do_fstab_filter_options("x-opto,other", "opt\0x-opt\0", 0, 0, NULL, NULL, "", NULL); + + do_fstab_filter_options("first,opt=0", "opt\0x-opt\0", 1, 1, "opt", "0", "0", "first"); + do_fstab_filter_options("first=1,opt=0", "opt\0x-opt\0", 1, 1, "opt", "0", "0", "first=1"); + do_fstab_filter_options("first,opt=", "opt\0x-opt\0", 1, 1, "opt", "", "", "first"); + do_fstab_filter_options("first=1,opt", "opt\0x-opt\0", 1, 0, "opt", NULL, "", "first=1"); + do_fstab_filter_options("first=1,x-opt", "opt\0x-opt\0", 1, 0, "x-opt", NULL, "", "first=1"); + + do_fstab_filter_options("first,opt=0,last=1", "opt\0x-opt\0", 1, 1, "opt", "0", "0", "first,last=1"); + do_fstab_filter_options("first=1,opt=0,last=2", "x-opt\0opt\0", 1, 1, "opt", "0", "0", "first=1,last=2"); + do_fstab_filter_options("first,opt,last", "opt\0", 1, 0, "opt", NULL, "", "first,last"); + do_fstab_filter_options("first=1,opt,last", "x-opt\0opt\0", 1, 0, "opt", NULL, "", "first=1,last"); + do_fstab_filter_options("first=,opt,last", "opt\0noopt\0", 1, 0, "opt", NULL, "", "first=,last"); + + /* check repeated options */ + do_fstab_filter_options("first,opt=0,noopt=1,last=1", "opt\0noopt\0", 1, 1, "noopt", "1", "0:1", "first,last=1"); + do_fstab_filter_options("first=1,opt=0,last=2,opt=1", "opt\0", 1, 1, "opt", "1", "0:1", "first=1,last=2"); + do_fstab_filter_options("x-opt=0,x-opt=1", "opt\0x-opt\0", 1, 1, "x-opt", "1", "0:1", ""); + do_fstab_filter_options("opt=0,x-opt=1", "opt\0x-opt\0", 1, 1, "x-opt", "1", "0:1", ""); + do_fstab_filter_options("opt=0,opt=1,opt=,opt=,opt=2", "opt\0noopt\0", 1, 1, "opt", "2", "0:1:::2", ""); + + /* check that semicolons are not misinterpreted */ + do_fstab_filter_options("opt=0;", "opt\0", 1, 1, "opt", "0;", "0;", ""); + do_fstab_filter_options("opt;=0", "x-opt\0opt\0noopt\0x-noopt\0", 0, 0, NULL, NULL, "", NULL); + do_fstab_filter_options("opt;", "opt\0x-opt\0", 0, 0, NULL, NULL, "", NULL); + + /* check that spaces are not misinterpreted */ + 
do_fstab_filter_options("opt=0 ", "opt\0", 1, 1, "opt", "0 ", "0 ", ""); + do_fstab_filter_options("opt =0", "x-opt\0opt\0noopt\0x-noopt\0", 0, 0, NULL, NULL, "", NULL); + do_fstab_filter_options(" opt ", "opt\0x-opt\0", 0, 0, NULL, NULL, "", NULL); + + /* check function with NULL args */ + do_fstab_filter_options(NULL, "opt\0", 0, 0, NULL, NULL, "", ""); + do_fstab_filter_options("", "opt\0", 0, 0, NULL, NULL, "", ""); + + /* unnecessary comma separators */ + do_fstab_filter_options("opt=x,,,,", "opt\0", 1, 1, "opt", "x", "x", ""); + do_fstab_filter_options(",,,opt=x,,,,", "opt\0", 1, 1, "opt", "x", "x", ""); + + /* escaped characters */ + do_fstab_filter_options("opt1=\\\\,opt2=\\xff", "opt1\0", 1, 1, "opt1", "\\", "\\", "opt2=\\xff"); + do_fstab_filter_options("opt1=\\\\,opt2=\\xff", "opt2\0", 1, 1, "opt2", "\\xff", "\\xff", "opt1=\\"); +} + +TEST(fstab_find_pri) { + int pri = -1; + + assert_se(fstab_find_pri("pri", &pri) == 0); + assert_se(pri == -1); + + assert_se(fstab_find_pri("pri=11", &pri) == 1); + assert_se(pri == 11); + + assert_se(fstab_find_pri("pri=-2", &pri) == 1); + assert_se(pri == -2); + + assert_se(fstab_find_pri("opt,pri=12,opt", &pri) == 1); + assert_se(pri == 12); + + assert_se(fstab_find_pri("opt,opt,pri=12,pri=13", &pri) == 1); + assert_se(pri == 13); +} + +TEST(fstab_yes_no_option) { + assert_se(fstab_test_yes_no_option("nofail,fail,nofail", "nofail\0fail\0") == true); + assert_se(fstab_test_yes_no_option("nofail,nofail,fail", "nofail\0fail\0") == false); + assert_se(fstab_test_yes_no_option("abc,cde,afail", "nofail\0fail\0") == false); + assert_se(fstab_test_yes_no_option("nofail,fail=0,nofail=0", "nofail\0fail\0") == true); + assert_se(fstab_test_yes_no_option("nofail,nofail=0,fail=0", "nofail\0fail\0") == false); +} + +TEST(fstab_node_to_udev_node) { + char *n; + + n = fstab_node_to_udev_node("LABEL=applé/jack"); + puts(n); + assert_se(streq(n, "/dev/disk/by-label/applé\\x2fjack")); + free(n); + + n = 
fstab_node_to_udev_node("PARTLABEL=pinkié pie"); + puts(n); + assert_se(streq(n, "/dev/disk/by-partlabel/pinkié\\x20pie")); + free(n); + + n = fstab_node_to_udev_node("UUID=037b9d94-148e-4ee4-8d38-67bfe15bb535"); + puts(n); + assert_se(streq(n, "/dev/disk/by-uuid/037b9d94-148e-4ee4-8d38-67bfe15bb535")); + free(n); + + n = fstab_node_to_udev_node("PARTUUID=037b9d94-148e-4ee4-8d38-67bfe15bb535"); + puts(n); + assert_se(streq(n, "/dev/disk/by-partuuid/037b9d94-148e-4ee4-8d38-67bfe15bb535")); + free(n); + + n = fstab_node_to_udev_node("PONIES=awesome"); + puts(n); + assert_se(streq(n, "PONIES=awesome")); + free(n); + + n = fstab_node_to_udev_node("/dev/xda1"); + puts(n); + assert_se(streq(n, "/dev/xda1")); + free(n); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-glob-util.c b/src/test/test-glob-util.c new file mode 100644 index 0000000..9b3e73c --- /dev/null +++ b/src/test/test-glob-util.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "dirent-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "macro.h" +#include "rm-rf.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(glob_first) { + char *first, name[] = "/tmp/test-glob_first.XXXXXX"; + int fd = -EBADF; + int r; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + close(fd); + + r = glob_first("/tmp/test-glob_first*", &first); + assert_se(r == 1); + assert_se(streq(name, first)); + first = mfree(first); + + r = unlink(name); + assert_se(r == 0); + r = glob_first("/tmp/test-glob_first*", &first); + assert_se(r == 0); + assert_se(first == NULL); +} + +TEST(glob_exists) { + char name[] = "/tmp/test-glob_exists.XXXXXX"; + int fd = -EBADF; + int r; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + close(fd); + + r = glob_exists("/tmp/test-glob_exists*"); + assert_se(r == 1); + + r = unlink(name); + assert_se(r == 0); + r = glob_exists("/tmp/test-glob_exists*"); + assert_se(r == 0); +} + 
+static void closedir_wrapper(void* v) { + (void) closedir(v); +} + +TEST(glob_no_dot) { + char template[] = "/tmp/test-glob-util.XXXXXXX"; + const char *fn; + + _cleanup_globfree_ glob_t g = { + .gl_closedir = closedir_wrapper, + .gl_readdir = (struct dirent *(*)(void *)) readdir_no_dot, + .gl_opendir = (void *(*)(const char *)) opendir, + .gl_lstat = lstat, + .gl_stat = stat, + }; + + int r; + + assert_se(mkdtemp(template)); + + fn = strjoina(template, "/*"); + r = glob(fn, GLOB_NOSORT|GLOB_BRACE|GLOB_ALTDIRFUNC, NULL, &g); + assert_se(r == GLOB_NOMATCH); + + fn = strjoina(template, "/.*"); + r = glob(fn, GLOB_NOSORT|GLOB_BRACE|GLOB_ALTDIRFUNC, NULL, &g); + assert_se(r == GLOB_NOMATCH); + + (void) rm_rf(template, REMOVE_ROOT|REMOVE_PHYSICAL); +} + +TEST(safe_glob) { + char template[] = "/tmp/test-glob-util.XXXXXXX"; + const char *fn, *fn2, *fname; + + _cleanup_globfree_ glob_t g = {}; + int r; + + assert_se(mkdtemp(template)); + + fn = strjoina(template, "/*"); + r = safe_glob(fn, 0, &g); + assert_se(r == -ENOENT); + + fn2 = strjoina(template, "/.*"); + r = safe_glob(fn2, GLOB_NOSORT|GLOB_BRACE, &g); + assert_se(r == -ENOENT); + + fname = strjoina(template, "/.foobar"); + assert_se(touch(fname) == 0); + + r = safe_glob(fn, 0, &g); + assert_se(r == -ENOENT); + + r = safe_glob(fn2, GLOB_NOSORT|GLOB_BRACE, &g); + assert_se(r == 0); + assert_se(g.gl_pathc == 1); + assert_se(streq(g.gl_pathv[0], fname)); + assert_se(g.gl_pathv[1] == NULL); + + (void) rm_rf(template, REMOVE_ROOT|REMOVE_PHYSICAL); +} + +static void test_glob_non_glob_prefix_one(const char *path, const char *expected) { + _cleanup_free_ char *t = NULL; /* initialized so the cleanup handler never sees an indeterminate pointer */ + + assert_se(glob_non_glob_prefix(path, &t) == 0); + assert_se(streq(t, expected)); +} + +TEST(glob_non_glob) { + test_glob_non_glob_prefix_one("/tmp/.X11-*", "/tmp/"); + test_glob_non_glob_prefix_one("/tmp/*", "/tmp/"); + test_glob_non_glob_prefix_one("/tmp*", "/"); + test_glob_non_glob_prefix_one("/tmp/*/whatever", "/tmp/"); + 
test_glob_non_glob_prefix_one("/tmp/*/whatever?", "/tmp/"); + test_glob_non_glob_prefix_one("/?", "/"); + + char *x; + assert_se(glob_non_glob_prefix("?", &x) == -ENOENT); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-gpt.c b/src/test/test-gpt.c new file mode 100644 index 0000000..fa5923e --- /dev/null +++ b/src/test/test-gpt.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "architecture.h" +#include "glyph-util.h" +#include "gpt.h" +#include "log.h" +#include "pretty-print.h" +#include "strv.h" +#include "terminal-util.h" +#include "tests.h" + +TEST(gpt_types_against_architectures) { + int r; + + /* Dumps a table indicating for which architectures we know we have matching GPT partition + * types. Also validates whether we can properly categorize the entries. */ + + FOREACH_STRING(prefix, "root-", "usr-") + for (Architecture a = 0; a < _ARCHITECTURE_MAX; a++) + FOREACH_STRING(suffix, "", "-verity", "-verity-sig") { + _cleanup_free_ char *joined = NULL; + GptPartitionType type; + + joined = strjoin(prefix, architecture_to_string(a), suffix); + if (!joined) + return (void) log_oom(); + + r = gpt_partition_type_from_string(joined, &type); + if (r < 0) { + printf("%s %s\n", RED_CROSS_MARK(), joined); + continue; + } + + printf("%s %s\n", GREEN_CHECK_MARK(), joined); + + if (streq(prefix, "root-") && streq(suffix, "")) + assert_se(type.designator == PARTITION_ROOT); + if (streq(prefix, "root-") && streq(suffix, "-verity")) + assert_se(type.designator == PARTITION_ROOT_VERITY); + if (streq(prefix, "usr-") && streq(suffix, "")) + assert_se(type.designator == PARTITION_USR); + if (streq(prefix, "usr-") && streq(suffix, "-verity")) + assert_se(type.designator == PARTITION_USR_VERITY); + + assert_se(type.arch == a); + } +} + +TEST(verity_mappings) { + for (PartitionDesignator p = 0; p < _PARTITION_DESIGNATOR_MAX; p++) { + PartitionDesignator q; + + q = partition_verity_of(p); + assert_se(q < 0 || partition_verity_to_data(q) == 
p); + + q = partition_verity_sig_of(p); + assert_se(q < 0 || partition_verity_sig_to_data(q) == p); + + q = partition_verity_to_data(p); + assert_se(q < 0 || partition_verity_of(q) == p); + + q = partition_verity_sig_to_data(p); + assert_se(q < 0 || partition_verity_sig_of(q) == p); + } +} + +TEST(type_alias_same) { + /* Check that the partition type table is consistent, i.e. all aliases of the same partition type + * carry the same metadata */ + + for (const GptPartitionType *t = gpt_partition_type_table; t->name; t++) { + GptPartitionType x, y; + + x = gpt_partition_type_from_uuid(t->uuid); /* search first by uuid */ + assert_se(gpt_partition_type_from_string(t->name, &y) >= 0); /* search first by name */ + + assert_se(t->arch == x.arch); + assert_se(t->arch == y.arch); + assert_se(t->designator == x.designator); + assert_se(t->designator == y.designator); + } +} + +TEST(override_architecture) { + GptPartitionType x, y; + + assert_se(gpt_partition_type_from_string("root-x86-64", &x) >= 0); + assert_se(x.arch == ARCHITECTURE_X86_64); + + assert_se(gpt_partition_type_from_string("root-arm64", &y) >= 0); + assert_se(y.arch == ARCHITECTURE_ARM64); /* assert_se(), like every other check here, so the test is not compiled out under NDEBUG */ + + x = gpt_partition_type_override_architecture(x, ARCHITECTURE_ARM64); + assert_se(x.arch == y.arch); + assert_se(x.designator == y.designator); + assert_se(sd_id128_equal(x.uuid, y.uuid)); + assert_se(streq(x.name, y.name)); + + /* If the partition type does not have an architecture, nothing should change. 
*/ + + assert_se(gpt_partition_type_from_string("esp", &x) >= 0); + y = x; + + x = gpt_partition_type_override_architecture(x, ARCHITECTURE_ARM64); + assert_se(x.arch == y.arch); + assert_se(x.designator == y.designator); + assert_se(sd_id128_equal(x.uuid, y.uuid)); + assert_se(streq(x.name, y.name)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-gunicode.c b/src/test/test-gunicode.c new file mode 100644 index 0000000..1836cdc --- /dev/null +++ b/src/test/test-gunicode.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "gunicode.h" +#include "tests.h" +#include "utf8.h" + +TEST(unichar_iswide) { + char32_t c; + int r; + + /* FIXME: the cats are wide, but we get this wrong */ + for (const char *narrow = "abX_…ąęµ!" "😼😿🙀😸😻"; *narrow; narrow += r) { + r = utf8_encoded_to_unichar(narrow, &c); + assert_se(r > 0); /* validate the decode before c is read */ + bool w = unichar_iswide(c); + assert_se(!w); + } + + for (const char *wide = "🐱/¥"; *wide; wide += r) { + r = utf8_encoded_to_unichar(wide, &c); + assert_se(r > 0); /* validate the decode before c is read */ + bool w = unichar_iswide(c); + assert_se(w); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-hash-funcs.c b/src/test/test-hash-funcs.c new file mode 100644 index 0000000..f5166c1 --- /dev/null +++ b/src/test/test-hash-funcs.c @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "tests.h" +#include "hash-funcs.h" +#include "set.h" + +TEST(path_hash_set) { + /* The goal is to make sure that non-simplified path are hashed as expected, + * and that we don't need to simplify them beforehand. 
*/ + + /* No freeing of keys, we operate on static strings here… */ + _cleanup_set_free_ Set *set = NULL; + + assert_se(set_isempty(set)); + assert_se(set_ensure_put(&set, &path_hash_ops, "foo") == 1); + assert_se(set_ensure_put(&set, &path_hash_ops, "foo") == 0); + assert_se(set_ensure_put(&set, &path_hash_ops, "bar") == 1); + assert_se(set_ensure_put(&set, &path_hash_ops, "bar") == 0); + assert_se(set_ensure_put(&set, &path_hash_ops, "/foo") == 1); + assert_se(set_ensure_put(&set, &path_hash_ops, "/bar") == 1); + assert_se(set_ensure_put(&set, &path_hash_ops, "/foo/.") == 0); + assert_se(set_ensure_put(&set, &path_hash_ops, "/./bar/./.") == 0); + + assert_se(set_contains(set, "foo")); + assert_se(set_contains(set, "bar")); + assert_se(set_contains(set, "./foo")); + assert_se(set_contains(set, "./foo/.")); + assert_se(set_contains(set, "./bar")); + assert_se(set_contains(set, "./bar/.")); + assert_se(set_contains(set, "/foo")); + assert_se(set_contains(set, "/bar")); + assert_se(set_contains(set, "//./foo")); + assert_se(set_contains(set, "///./foo/.")); + assert_se(set_contains(set, "////./bar")); + assert_se(set_contains(set, "/////./bar/.")); + + assert_se(set_contains(set, "foo/")); + assert_se(set_contains(set, "bar/")); + assert_se(set_contains(set, "./foo/")); + assert_se(set_contains(set, "./foo/./")); + assert_se(set_contains(set, "./bar/")); + assert_se(set_contains(set, "./bar/./")); + assert_se(set_contains(set, "/foo/")); + assert_se(set_contains(set, "/bar/")); + assert_se(set_contains(set, "//./foo/")); + assert_se(set_contains(set, "///./foo/./")); + assert_se(set_contains(set, "////./bar/")); + assert_se(set_contains(set, "/////./bar/./")); + + assert_se(!set_contains(set, "foo.")); + assert_se(!set_contains(set, ".bar")); + assert_se(!set_contains(set, "./foo.")); + assert_se(!set_contains(set, "./.foo/.")); + assert_se(!set_contains(set, "../bar")); + assert_se(!set_contains(set, "./bar/..")); + assert_se(!set_contains(set, "./foo..")); + 
assert_se(!set_contains(set, "/..bar")); + assert_se(!set_contains(set, "//../foo")); + assert_se(!set_contains(set, "///../foo/.")); + assert_se(!set_contains(set, "////../bar")); + assert_se(!set_contains(set, "/////../bar/.")); + + assert_se(!set_contains(set, "foo./")); + assert_se(!set_contains(set, ".bar/")); + assert_se(!set_contains(set, "./foo./")); + assert_se(!set_contains(set, "./.foo/./")); + assert_se(!set_contains(set, "../bar/")); + assert_se(!set_contains(set, "./bar/../")); + assert_se(!set_contains(set, "./foo../")); + assert_se(!set_contains(set, "/..bar/")); + assert_se(!set_contains(set, "//../foo/")); + assert_se(!set_contains(set, "///../foo/./")); + assert_se(!set_contains(set, "////../bar/")); + assert_se(!set_contains(set, "/////../bar/./")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-hashmap-ordered.awk b/src/test/test-hashmap-ordered.awk new file mode 100644 index 0000000..88ffc25 --- /dev/null +++ b/src/test/test-hashmap-ordered.awk @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +BEGIN { + print "/* GENERATED FILE */"; + print "#define ORDERED" +} +{ + if (!match($0, "^#include")) + gsub(/hashmap/, "ordered_hashmap"); + gsub(/HASHMAP/, "ORDERED_HASHMAP"); + gsub(/Hashmap/, "OrderedHashmap"); + print +} diff --git a/src/test/test-hashmap-plain.c b/src/test/test-hashmap-plain.c new file mode 100644 index 0000000..152b1c0 --- /dev/null +++ b/src/test/test-hashmap-plain.c @@ -0,0 +1,1008 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "hashmap.h" +#include "log.h" +#include "nulstr-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "time-util.h" + +TEST(hashmap_replace) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + _cleanup_free_ char *val1 = NULL, *val2 = NULL, *val3 = NULL, *val4 = NULL, *val5 = NULL; + char *r; + + m = hashmap_new(&string_hash_ops); + + val1 = strdup("val1"); + assert_se(val1); + val2 = 
strdup("val2"); + assert_se(val2); + val3 = strdup("val3"); + assert_se(val3); + val4 = strdup("val4"); + assert_se(val4); + val5 = strdup("val5"); + assert_se(val5); + + hashmap_put(m, "key 1", val1); + hashmap_put(m, "key 2", val2); + hashmap_put(m, "key 3", val3); + hashmap_put(m, "key 4", val4); + + hashmap_replace(m, "key 3", val1); + r = hashmap_get(m, "key 3"); + assert_se(streq(r, "val1")); + + hashmap_replace(m, "key 5", val5); + r = hashmap_get(m, "key 5"); + assert_se(streq(r, "val5")); +} + +TEST(hashmap_copy) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + _cleanup_hashmap_free_free_ Hashmap *copy = NULL; + char *val1, *val2, *val3, *val4, *r; + + val1 = strdup("val1"); + assert_se(val1); + val2 = strdup("val2"); + assert_se(val2); + val3 = strdup("val3"); + assert_se(val3); + val4 = strdup("val4"); + assert_se(val4); + + m = hashmap_new(&string_hash_ops); + + hashmap_put(m, "key 1", val1); + hashmap_put(m, "key 2", val2); + hashmap_put(m, "key 3", val3); + hashmap_put(m, "key 4", val4); + + copy = hashmap_copy(m); + + r = hashmap_get(copy, "key 1"); + assert_se(streq(r, "val1")); + r = hashmap_get(copy, "key 2"); + assert_se(streq(r, "val2")); + r = hashmap_get(copy, "key 3"); + assert_se(streq(r, "val3")); + r = hashmap_get(copy, "key 4"); + assert_se(streq(r, "val4")); +} + +TEST(hashmap_get_strv) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + _cleanup_strv_free_ char **strv = NULL; + char *val1, *val2, *val3, *val4; + + val1 = strdup("val1"); + assert_se(val1); + val2 = strdup("val2"); + assert_se(val2); + val3 = strdup("val3"); + assert_se(val3); + val4 = strdup("val4"); + assert_se(val4); + + m = hashmap_new(&string_hash_ops); + + hashmap_put(m, "key 1", val1); + hashmap_put(m, "key 2", val2); + hashmap_put(m, "key 3", val3); + hashmap_put(m, "key 4", val4); + + strv = hashmap_get_strv(m); + +#ifndef ORDERED + strv = strv_sort(strv); +#endif + + assert_se(streq(strv[0], "val1")); + assert_se(streq(strv[1], "val2")); + assert_se(streq(strv[2], 
"val3")); + assert_se(streq(strv[3], "val4")); +} + +TEST(hashmap_move_one) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL, *n = NULL; + char *val1, *val2, *val3, *val4, *r; + + val1 = strdup("val1"); + assert_se(val1); + val2 = strdup("val2"); + assert_se(val2); + val3 = strdup("val3"); + assert_se(val3); + val4 = strdup("val4"); + assert_se(val4); + + m = hashmap_new(&string_hash_ops); + n = hashmap_new(&string_hash_ops); + + hashmap_put(m, "key 1", val1); + hashmap_put(m, "key 2", val2); + hashmap_put(m, "key 3", val3); + hashmap_put(m, "key 4", val4); + + assert_se(hashmap_move_one(n, NULL, "key 3") == -ENOENT); + assert_se(hashmap_move_one(n, m, "key 5") == -ENOENT); + assert_se(hashmap_move_one(n, m, "key 3") == 0); + assert_se(hashmap_move_one(n, m, "key 4") == 0); + + r = hashmap_get(n, "key 3"); + assert_se(r && streq(r, "val3")); + r = hashmap_get(n, "key 4"); + assert_se(r && streq(r, "val4")); + r = hashmap_get(m, "key 3"); + assert_se(!r); + + assert_se(hashmap_move_one(n, m, "key 3") == -EEXIST); +} + +TEST(hashmap_move) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL, *n = NULL; + char *val1, *val2, *val3, *val4, *r; + + val1 = strdup("val1"); + assert_se(val1); + val2 = strdup("val2"); + assert_se(val2); + val3 = strdup("val3"); + assert_se(val3); + val4 = strdup("val4"); + assert_se(val4); + + m = hashmap_new(&string_hash_ops); + n = hashmap_new(&string_hash_ops); + + hashmap_put(n, "key 1", strdup(val1)); + hashmap_put(m, "key 1", val1); + hashmap_put(m, "key 2", val2); + hashmap_put(m, "key 3", val3); + hashmap_put(m, "key 4", val4); + + assert_se(hashmap_move(n, NULL) == 0); + assert_se(hashmap_move(n, m) == 0); + + assert_se(hashmap_size(m) == 1); + r = hashmap_get(m, "key 1"); + assert_se(r && streq(r, "val1")); + + r = hashmap_get(n, "key 1"); + assert_se(r && streq(r, "val1")); + r = hashmap_get(n, "key 2"); + assert_se(r && streq(r, "val2")); + r = hashmap_get(n, "key 3"); + assert_se(r && streq(r, "val3")); + r = hashmap_get(n, "key 
4"); + assert_se(r && streq(r, "val4")); +} + +TEST(hashmap_update) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + _cleanup_free_ char *val1 = NULL, *val2 = NULL; + char *r; + + m = hashmap_new(&string_hash_ops); + val1 = strdup("old_value"); + assert_se(val1); + val2 = strdup("new_value"); + assert_se(val2); + + hashmap_put(m, "key 1", val1); + r = hashmap_get(m, "key 1"); + assert_se(streq(r, "old_value")); + + assert_se(hashmap_update(m, "key 2", val2) == -ENOENT); + r = hashmap_get(m, "key 1"); + assert_se(streq(r, "old_value")); + + assert_se(hashmap_update(m, "key 1", val2) == 0); + r = hashmap_get(m, "key 1"); + assert_se(streq(r, "new_value")); +} + +TEST(hashmap_put) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + int valid_hashmap_put; + void *val1 = (void*) "val 1"; + void *val2 = (void*) "val 2"; + _cleanup_free_ char* key1 = NULL; + + assert_se(hashmap_ensure_allocated(&m, &string_hash_ops) == 1); + assert_se(m); + + valid_hashmap_put = hashmap_put(m, "key 1", val1); + assert_se(valid_hashmap_put == 1); + assert_se(hashmap_put(m, "key 1", val1) == 0); + assert_se(hashmap_put(m, "key 1", val2) == -EEXIST); + key1 = strdup("key 1"); + assert_se(hashmap_put(m, key1, val1) == 0); + assert_se(hashmap_put(m, key1, val2) == -EEXIST); +} + +TEST(hashmap_remove1) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + char *r; + + r = hashmap_remove(NULL, "key 1"); + assert_se(r == NULL); + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + r = hashmap_remove(m, "no such key"); + assert_se(r == NULL); + + hashmap_put(m, "key 1", (void*) "val 1"); + hashmap_put(m, "key 2", (void*) "val 2"); + + r = hashmap_remove(m, "key 1"); + assert_se(streq(r, "val 1")); + + r = hashmap_get(m, "key 2"); + assert_se(streq(r, "val 2")); + assert_se(!hashmap_get(m, "key 1")); +} + +TEST(hashmap_remove2) { + _cleanup_hashmap_free_free_free_ Hashmap *m = NULL; + char key1[] = "key 1"; + char key2[] = "key 2"; + char val1[] = "val 1"; + char val2[] = "val 2"; + void *r, *r2; + + r = 
hashmap_remove2(NULL, "key 1", &r2); + assert_se(r == NULL); + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + r = hashmap_remove2(m, "no such key", &r2); + assert_se(r == NULL); + + hashmap_put(m, strdup(key1), strdup(val1)); + hashmap_put(m, strdup(key2), strdup(val2)); + + r = hashmap_remove2(m, key1, &r2); + assert_se(streq(r, val1)); + assert_se(streq(r2, key1)); + free(r); + free(r2); + + r = hashmap_get(m, key2); + assert_se(streq(r, val2)); + assert_se(!hashmap_get(m, key1)); +} + +TEST(hashmap_remove_value) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + char *r; + + char val1[] = "val 1"; + char val2[] = "val 2"; + + r = hashmap_remove_value(NULL, "key 1", val1); + assert_se(r == NULL); + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + r = hashmap_remove_value(m, "key 1", val1); + assert_se(r == NULL); + + hashmap_put(m, "key 1", val1); + hashmap_put(m, "key 2", val2); + + r = hashmap_remove_value(m, "key 1", val1); + assert_se(streq(r, "val 1")); + + r = hashmap_get(m, "key 2"); + assert_se(streq(r, "val 2")); + assert_se(!hashmap_get(m, "key 1")); + + r = hashmap_remove_value(m, "key 2", val1); + assert_se(r == NULL); + + r = hashmap_get(m, "key 2"); + assert_se(streq(r, "val 2")); + assert_se(!hashmap_get(m, "key 1")); +} + +TEST(hashmap_remove_and_put) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + int valid; + char *r; + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + valid = hashmap_remove_and_put(m, "invalid key", "new key", NULL); + assert_se(valid == -ENOENT); + + valid = hashmap_put(m, "key 1", (void*) (const char *) "val 1"); + assert_se(valid == 1); + + valid = hashmap_remove_and_put(NULL, "key 1", "key 2", (void*) (const char *) "val 2"); + assert_se(valid == -ENOENT); + + valid = hashmap_remove_and_put(m, "key 1", "key 2", (void*) (const char *) "val 2"); + assert_se(valid == 0); + + r = hashmap_get(m, "key 2"); + assert_se(streq(r, "val 2")); + assert_se(!hashmap_get(m, "key 1")); + + valid = hashmap_put(m, "key 
3", (void*) (const char *) "val 3"); + assert_se(valid == 1); + valid = hashmap_remove_and_put(m, "key 3", "key 2", (void*) (const char *) "val 2"); + assert_se(valid == -EEXIST); +} + +TEST(hashmap_remove_and_replace) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + int valid; + void *key1 = UINT_TO_PTR(1); + void *key2 = UINT_TO_PTR(2); + void *key3 = UINT_TO_PTR(3); + void *r; + int i, j; + + m = hashmap_new(&trivial_hash_ops); + assert_se(m); + + valid = hashmap_remove_and_replace(m, key1, key2, NULL); + assert_se(valid == -ENOENT); + + valid = hashmap_put(m, key1, key1); + assert_se(valid == 1); + + valid = hashmap_remove_and_replace(NULL, key1, key2, key2); + assert_se(valid == -ENOENT); + + valid = hashmap_remove_and_replace(m, key1, key2, key2); + assert_se(valid == 0); + + r = hashmap_get(m, key2); + assert_se(r == key2); + assert_se(!hashmap_get(m, key1)); + + valid = hashmap_put(m, key3, key3); + assert_se(valid == 1); + valid = hashmap_remove_and_replace(m, key3, key2, key2); + assert_se(valid == 0); + r = hashmap_get(m, key2); + assert_se(r == key2); + assert_se(!hashmap_get(m, key3)); + + /* Repeat this test several times to increase the chance of hitting + * the less likely case in hashmap_remove_and_replace where it + * compensates for the backward shift. 
*/ + for (i = 0; i < 20; i++) { + hashmap_clear(m); + + for (j = 1; j < 7; j++) + hashmap_put(m, UINT_TO_PTR(10*i + j), UINT_TO_PTR(10*i + j)); + valid = hashmap_remove_and_replace(m, UINT_TO_PTR(10*i + 1), + UINT_TO_PTR(10*i + 2), + UINT_TO_PTR(10*i + 2)); + assert_se(valid == 0); + assert_se(!hashmap_get(m, UINT_TO_PTR(10*i + 1))); + for (j = 2; j < 7; j++) { + r = hashmap_get(m, UINT_TO_PTR(10*i + j)); + assert_se(r == UINT_TO_PTR(10*i + j)); + } + } +} + +TEST(hashmap_ensure_allocated) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + int r; + + r = hashmap_ensure_allocated(&m, &string_hash_ops); + assert_se(r == 1); + + r = hashmap_ensure_allocated(&m, &string_hash_ops); + assert_se(r == 0); + + /* different hash ops shouldn't matter at this point */ + r = hashmap_ensure_allocated(&m, &trivial_hash_ops); + assert_se(r == 0); +} + +TEST(hashmap_foreach_key) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + bool key_found[] = { false, false, false, false }; + const char *s; + const char *key; + static const char key_table[] = + "key 1\0" + "key 2\0" + "key 3\0" + "key 4\0"; + + m = hashmap_new(&string_hash_ops); + + NULSTR_FOREACH(k, key_table) + hashmap_put(m, k, (void*) (const char*) "my dummy val"); + + HASHMAP_FOREACH_KEY(s, key, m) { + assert_se(s); + if (!key_found[0] && streq(key, "key 1")) + key_found[0] = true; + else if (!key_found[1] && streq(key, "key 2")) + key_found[1] = true; + else if (!key_found[2] && streq(key, "key 3")) + key_found[2] = true; + else if (!key_found[3] && streq(key, "fail")) + key_found[3] = true; + } + + assert_se(m); + assert_se(key_found[0] && key_found[1] && key_found[2] && !key_found[3]); +} + +TEST(hashmap_foreach) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL; + bool value_found[] = { false, false, false, false }; + char *val1, *val2, *val3, *val4, *s; + unsigned count; + + val1 = strdup("my val1"); + assert_se(val1); + val2 = strdup("my val2"); + assert_se(val2); + val3 = strdup("my val3"); + assert_se(val3); + val4 = 
strdup("my val4"); + assert_se(val4); + + count = 0; + HASHMAP_FOREACH(s, m) + count++; + assert_se(count == 0); + + m = hashmap_new(&string_hash_ops); + + count = 0; + HASHMAP_FOREACH(s, m) + count++; + assert_se(count == 0); + + hashmap_put(m, "Key 1", val1); + hashmap_put(m, "Key 2", val2); + hashmap_put(m, "Key 3", val3); + hashmap_put(m, "Key 4", val4); + + HASHMAP_FOREACH(s, m) { + if (!value_found[0] && streq(s, val1)) + value_found[0] = true; + else if (!value_found[1] && streq(s, val2)) + value_found[1] = true; + else if (!value_found[2] && streq(s, val3)) + value_found[2] = true; + else if (!value_found[3] && streq(s, val4)) + value_found[3] = true; + } + + assert_se(m); + assert_se(value_found[0] && value_found[1] && value_found[2] && value_found[3]); +} + +TEST(hashmap_merge) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL; + _cleanup_hashmap_free_ Hashmap *n = NULL; + char *val1, *val2, *val3, *val4, *r; + + val1 = strdup("my val1"); + assert_se(val1); + val2 = strdup("my val2"); + assert_se(val2); + val3 = strdup("my val3"); + assert_se(val3); + val4 = strdup("my val4"); + assert_se(val4); + + m = hashmap_new(&string_hash_ops); + n = hashmap_new(&string_hash_ops); + + hashmap_put(m, "Key 1", val1); + hashmap_put(m, "Key 2", val2); + hashmap_put(n, "Key 3", val3); + hashmap_put(n, "Key 4", val4); + + assert_se(hashmap_merge(m, n) == 0); + r = hashmap_get(m, "Key 3"); + assert_se(r && streq(r, "my val3")); + r = hashmap_get(m, "Key 4"); + assert_se(r && streq(r, "my val4")); + + assert_se(m); + assert_se(n); +} + +TEST(hashmap_contains) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL; + char *val1; + + val1 = strdup("my val"); + assert_se(val1); + + m = hashmap_new(&string_hash_ops); + + assert_se(!hashmap_contains(m, "Key 1")); + hashmap_put(m, "Key 1", val1); + assert_se(hashmap_contains(m, "Key 1")); + assert_se(!hashmap_contains(m, "Key 2")); + + assert_se(!hashmap_contains(NULL, "Key 1")); + + assert_se(m); +} + +TEST(hashmap_isempty) { + 
_cleanup_hashmap_free_free_ Hashmap *m = NULL; + char *val1; + + val1 = strdup("my val"); + assert_se(val1); + + m = hashmap_new(&string_hash_ops); + + assert_se(hashmap_isempty(m)); + hashmap_put(m, "Key 1", val1); + assert_se(!hashmap_isempty(m)); + + assert_se(m); +} + +TEST(hashmap_size) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL; + char *val1, *val2, *val3, *val4; + + val1 = strdup("my val"); + assert_se(val1); + val2 = strdup("my val"); + assert_se(val2); + val3 = strdup("my val"); + assert_se(val3); + val4 = strdup("my val"); + assert_se(val4); + + assert_se(hashmap_size(NULL) == 0); + assert_se(hashmap_buckets(NULL) == 0); + + m = hashmap_new(&string_hash_ops); + + hashmap_put(m, "Key 1", val1); + hashmap_put(m, "Key 2", val2); + hashmap_put(m, "Key 3", val3); + hashmap_put(m, "Key 4", val4); + + assert_se(m); + assert_se(hashmap_size(m) == 4); + assert_se(hashmap_buckets(m) >= 4); +} + +TEST(hashmap_get) { + _cleanup_hashmap_free_free_ Hashmap *m = NULL; + char *r; + char *val; + + val = strdup("my val"); + assert_se(val); + + r = hashmap_get(NULL, "Key 1"); + assert_se(r == NULL); + + m = hashmap_new(&string_hash_ops); + + hashmap_put(m, "Key 1", val); + + r = hashmap_get(m, "Key 1"); + assert_se(streq(r, val)); + + r = hashmap_get(m, "no such key"); + assert_se(r == NULL); + + assert_se(m); +} + +TEST(hashmap_get2) { + _cleanup_(hashmap_free_free_freep) Hashmap *m = NULL; + char *r; + char *val; + char key_orig[] = "Key 1"; + void *key_copy; + + val = strdup("my val"); + assert_se(val); + + key_copy = strdup(key_orig); + assert_se(key_copy); + + r = hashmap_get2(NULL, key_orig, &key_copy); + assert_se(r == NULL); + + m = hashmap_new(&string_hash_ops); + + hashmap_put(m, key_copy, val); + key_copy = NULL; + + r = hashmap_get2(m, key_orig, &key_copy); + assert_se(streq(r, val)); + assert_se(key_orig != key_copy); + assert_se(streq(key_orig, key_copy)); + + r = hashmap_get2(m, "no such key", NULL); + assert_se(r == NULL); + + assert_se(m); +} + 
+static void crippled_hashmap_func(const void *p, struct siphash *state) { + return trivial_hash_func(INT_TO_PTR(PTR_TO_INT(p) & 0xff), state); +} + +static const struct hash_ops crippled_hashmap_ops = { + .hash = crippled_hashmap_func, + .compare = trivial_compare_func, +}; + +TEST(hashmap_many) { + Hashmap *h; + unsigned i, j; + void *v, *k; + bool slow = slow_tests_enabled(); + const struct { + const char *title; + const struct hash_ops *ops; + unsigned n_entries; + } tests[] = { + { "trivial_hashmap_ops", NULL, slow ? 1 << 20 : 240 }, + { "crippled_hashmap_ops", &crippled_hashmap_ops, slow ? 1 << 14 : 140 }, + }; + + log_info("/* %s (%s) */", __func__, slow ? "slow" : "fast"); + + for (j = 0; j < ELEMENTSOF(tests); j++) { + usec_t ts = now(CLOCK_MONOTONIC), n; + + assert_se(h = hashmap_new(tests[j].ops)); + + for (i = 1; i < tests[j].n_entries*3; i+=3) { + assert_se(hashmap_put(h, UINT_TO_PTR(i), UINT_TO_PTR(i)) >= 0); + assert_se(PTR_TO_UINT(hashmap_get(h, UINT_TO_PTR(i))) == i); + } + + for (i = 1; i < tests[j].n_entries*3; i++) + assert_se(hashmap_contains(h, UINT_TO_PTR(i)) == (i % 3 == 1)); + + log_info("%s %u <= %u * 0.8 = %g", + tests[j].title, hashmap_size(h), hashmap_buckets(h), hashmap_buckets(h) * 0.8); + + assert_se(hashmap_size(h) <= hashmap_buckets(h) * 0.8); + assert_se(hashmap_size(h) == tests[j].n_entries); + + while (!hashmap_isempty(h)) { + k = hashmap_first_key(h); + v = hashmap_remove(h, k); + assert_se(v == k); + } + + hashmap_free(h); + + n = now(CLOCK_MONOTONIC); + log_info("test took %s", FORMAT_TIMESPAN(n - ts, 0)); + } +} + +extern unsigned custom_counter; +extern const struct hash_ops boring_hash_ops, custom_hash_ops; + +TEST(hashmap_free) { + Hashmap *h; + bool slow = slow_tests_enabled(); + usec_t ts, n; + unsigned n_entries = slow ? 
1 << 20 : 240; + + const struct { + const char *title; + const struct hash_ops *ops; + unsigned expect_counter; + } tests[] = { + { "string_hash_ops", &boring_hash_ops, 2 * n_entries}, + { "custom_free_hash_ops", &custom_hash_ops, 0 }, + }; + + log_info("/* %s (%s, %u entries) */", __func__, slow ? "slow" : "fast", n_entries); + + for (unsigned j = 0; j < ELEMENTSOF(tests); j++) { + ts = now(CLOCK_MONOTONIC); + assert_se(h = hashmap_new(tests[j].ops)); + + custom_counter = 0; + for (unsigned i = 0; i < n_entries; i++) { + char s[DECIMAL_STR_MAX(unsigned)]; + char *k, *v; + + xsprintf(s, "%u", i); + assert_se(k = strdup(s)); + assert_se(v = strdup(s)); + custom_counter += 2; + + assert_se(hashmap_put(h, k, v) >= 0); + } + + hashmap_free(h); + + n = now(CLOCK_MONOTONIC); + log_info("%s test took %s", tests[j].title, FORMAT_TIMESPAN(n - ts, 0)); + + assert_se(custom_counter == tests[j].expect_counter); + } +} + +typedef struct Item { + int seen; +} Item; +static void item_seen(Item *item) { + item->seen++; +} + +TEST(hashmap_free_with_destructor) { + Hashmap *m; + struct Item items[4] = {}; + unsigned i; + + assert_se(m = hashmap_new(NULL)); + for (i = 0; i < ELEMENTSOF(items) - 1; i++) + assert_se(hashmap_put(m, INT_TO_PTR(i), items + i) == 1); + + m = hashmap_free_with_destructor(m, item_seen); + assert_se(items[0].seen == 1); + assert_se(items[1].seen == 1); + assert_se(items[2].seen == 1); + assert_se(items[3].seen == 0); +} + +TEST(hashmap_first) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + assert_se(!hashmap_first(m)); + assert_se(hashmap_put(m, "key 1", (void*) "val 1") == 1); + assert_se(streq(hashmap_first(m), "val 1")); + assert_se(hashmap_put(m, "key 2", (void*) "val 2") == 1); +#ifdef ORDERED + assert_se(streq(hashmap_first(m), "val 1")); + assert_se(hashmap_remove(m, "key 1")); + assert_se(streq(hashmap_first(m), "val 2")); +#endif +} + +TEST(hashmap_first_key) { + _cleanup_hashmap_free_ Hashmap 
*m = NULL; + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + assert_se(!hashmap_first_key(m)); + assert_se(hashmap_put(m, "key 1", NULL) == 1); + assert_se(streq(hashmap_first_key(m), "key 1")); + assert_se(hashmap_put(m, "key 2", NULL) == 1); +#ifdef ORDERED + assert_se(streq(hashmap_first_key(m), "key 1")); + assert_se(hashmap_remove(m, "key 1") == NULL); + assert_se(streq(hashmap_first_key(m), "key 2")); +#endif +} + +TEST(hashmap_steal_first_key) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + assert_se(!hashmap_steal_first_key(m)); + assert_se(hashmap_put(m, "key 1", NULL) == 1); + assert_se(streq(hashmap_steal_first_key(m), "key 1")); + + assert_se(hashmap_isempty(m)); +} + +TEST(hashmap_steal_first) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + int seen[3] = {}; + char *val; + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + assert_se(hashmap_put(m, "key 1", (void*) "1") == 1); + assert_se(hashmap_put(m, "key 2", (void*) "22") == 1); + assert_se(hashmap_put(m, "key 3", (void*) "333") == 1); + + while ((val = hashmap_steal_first(m))) + seen[strlen(val) - 1]++; + + assert_se(seen[0] == 1 && seen[1] == 1 && seen[2] == 1); + + assert_se(hashmap_isempty(m)); +} + +TEST(hashmap_clear_free_free) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + + m = hashmap_new(&string_hash_ops); + assert_se(m); + + assert_se(hashmap_put(m, strdup("key 1"), NULL) == 1); + assert_se(hashmap_put(m, strdup("key 2"), NULL) == 1); + assert_se(hashmap_put(m, strdup("key 3"), NULL) == 1); + + hashmap_clear_free_free(m); + assert_se(hashmap_isempty(m)); + + assert_se(hashmap_put(m, strdup("key 1"), strdup("value 1")) == 1); + assert_se(hashmap_put(m, strdup("key 2"), strdup("value 2")) == 1); + assert_se(hashmap_put(m, strdup("key 3"), strdup("value 3")) == 1); + + hashmap_clear_free_free(m); + assert_se(hashmap_isempty(m)); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(test_hash_ops_key, char, string_hash_func, 
string_compare_func, free); +DEFINE_PRIVATE_HASH_OPS_FULL(test_hash_ops_full, char, string_hash_func, string_compare_func, free, char, free); + +TEST(hashmap_clear_free_with_destructor) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + + m = hashmap_new(&test_hash_ops_key); + assert_se(m); + + assert_se(hashmap_put(m, strdup("key 1"), NULL) == 1); + assert_se(hashmap_put(m, strdup("key 2"), NULL) == 1); + assert_se(hashmap_put(m, strdup("key 3"), NULL) == 1); + + hashmap_clear_free(m); + assert_se(hashmap_isempty(m)); + m = hashmap_free(m); + + m = hashmap_new(&test_hash_ops_full); + assert_se(m); + + assert_se(hashmap_put(m, strdup("key 1"), strdup("value 1")) == 1); + assert_se(hashmap_put(m, strdup("key 2"), strdup("value 2")) == 1); + assert_se(hashmap_put(m, strdup("key 3"), strdup("value 3")) == 1); + + hashmap_clear_free(m); + assert_se(hashmap_isempty(m)); +} + +TEST(hashmap_reserve) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + + m = hashmap_new(&string_hash_ops); + + assert_se(hashmap_reserve(m, 1) == 0); + assert_se(hashmap_buckets(m) < 1000); + assert_se(hashmap_reserve(m, 1000) == 0); + assert_se(hashmap_buckets(m) >= 1000); + assert_se(hashmap_isempty(m)); + + assert_se(hashmap_put(m, "key 1", (void*) "val 1") == 1); + + assert_se(hashmap_reserve(m, UINT_MAX) == -ENOMEM); + assert_se(hashmap_reserve(m, UINT_MAX - 1) == -ENOMEM); +} + +TEST(path_hashmap) { + _cleanup_hashmap_free_ Hashmap *h = NULL; + + assert_se(h = hashmap_new(&path_hash_ops)); + + assert_se(hashmap_put(h, "foo", INT_TO_PTR(1)) >= 0); + assert_se(hashmap_put(h, "/foo", INT_TO_PTR(2)) >= 0); + assert_se(hashmap_put(h, "//foo", INT_TO_PTR(3)) == -EEXIST); + assert_se(hashmap_put(h, "//foox/", INT_TO_PTR(4)) >= 0); + assert_se(hashmap_put(h, "/foox////", INT_TO_PTR(5)) == -EEXIST); + assert_se(hashmap_put(h, "//././/foox//.//.", INT_TO_PTR(5)) == -EEXIST); + assert_se(hashmap_put(h, "foo//////bar/quux//", INT_TO_PTR(6)) >= 0); + assert_se(hashmap_put(h, "foo/bar//quux/", INT_TO_PTR(8)) 
== -EEXIST); + assert_se(hashmap_put(h, "foo./ba.r//.quux/", INT_TO_PTR(9)) >= 0); + assert_se(hashmap_put(h, "foo./ba.r//.//.quux///./", INT_TO_PTR(10)) == -EEXIST); + + /* Lookups: every equivalent spelling must return the value stored by the first successful insert. */ assert_se(hashmap_get(h, "foo") == INT_TO_PTR(1)); + assert_se(hashmap_get(h, "foo/") == INT_TO_PTR(1)); + assert_se(hashmap_get(h, "foo////") == INT_TO_PTR(1)); + assert_se(hashmap_get(h, "/foo") == INT_TO_PTR(2)); + assert_se(hashmap_get(h, "//foo") == INT_TO_PTR(2)); + assert_se(hashmap_get(h, "/////foo////") == INT_TO_PTR(2)); + assert_se(hashmap_get(h, "/////foox////") == INT_TO_PTR(4)); + assert_se(hashmap_get(h, "/.///./foox//.//") == INT_TO_PTR(4)); + assert_se(hashmap_get(h, "/foox/") == INT_TO_PTR(4)); + assert_se(hashmap_get(h, "/foox") == INT_TO_PTR(4)); + /* The relative spelling was never inserted, so it must not be found. */ assert_se(!hashmap_get(h, "foox")); + assert_se(hashmap_get(h, "foo/bar/quux") == INT_TO_PTR(6)); + assert_se(hashmap_get(h, "foo////bar////quux/////") == INT_TO_PTR(6)); + assert_se(!hashmap_get(h, "/foo////bar////quux/////")); + assert_se(hashmap_get(h, "foo././//ba.r////.quux///.//.") == INT_TO_PTR(9)); +} + +/* Expects string_strv_hashmap_put() to return 1 when a value is new for the key and 0 when that key/value pair is already stored, checked for two separate keys. The value looked up for each key must be strv_equal() to the list bar, BAR. */ TEST(string_strv_hashmap) { + _cleanup_hashmap_free_ Hashmap *m = NULL; + char **s; + + assert_se(string_strv_hashmap_put(&m, "foo", "bar") == 1); + assert_se(string_strv_hashmap_put(&m, "foo", "bar") == 0); + assert_se(string_strv_hashmap_put(&m, "foo", "BAR") == 1); + assert_se(string_strv_hashmap_put(&m, "foo", "BAR") == 0); + assert_se(string_strv_hashmap_put(&m, "foo", "bar") == 0); + assert_se(hashmap_contains(m, "foo")); + + s = hashmap_get(m, "foo"); + assert_se(strv_equal(s, STRV_MAKE("bar", "BAR"))); + + assert_se(string_strv_hashmap_put(&m, "xxx", "bar") == 1); + assert_se(string_strv_hashmap_put(&m, "xxx", "bar") == 0); + assert_se(string_strv_hashmap_put(&m, "xxx", "BAR") == 1); + assert_se(string_strv_hashmap_put(&m, "xxx", "BAR") == 0); + assert_se(string_strv_hashmap_put(&m, "xxx", "bar") == 0); + assert_se(hashmap_contains(m, "xxx")); + + s = hashmap_get(m, "xxx"); + assert_se(strv_equal(s, STRV_MAKE("bar", 
"BAR"))); +} + +/* Checks hashmap_dump_sorted() in two passes. In each pass an empty map must yield n == 0 and a NULL array, and after three insertions the returned array must be byte-identical to expected[]. The first pass uses string keys (key 0 to key 2); the second uses a map created with NULL hash ops and the integer keys 333, 222, 111 inserted in descending order, so the expected output follows ascending key order rather than insertion order. */ TEST(hashmap_dump_sorted) { + static void * const expected[] = { UINT_TO_PTR(123U), UINT_TO_PTR(12U), UINT_TO_PTR(345U), }; + _cleanup_hashmap_free_ Hashmap *m = NULL; + _cleanup_free_ void **vals = NULL; + size_t n; + + assert_se(m = hashmap_new(&string_hash_ops)); + + assert_se(hashmap_dump_sorted(m, &vals, &n) >= 0); + assert_se(n == 0); + assert_se(!vals); + + assert_se(hashmap_put(m, "key 0", expected[0]) == 1); + assert_se(hashmap_put(m, "key 1", expected[1]) == 1); + assert_se(hashmap_put(m, "key 2", expected[2]) == 1); + + assert_se(hashmap_dump_sorted(m, &vals, &n) >= 0); + assert_se(n == ELEMENTSOF(expected)); + assert_se(memcmp(vals, expected, n * sizeof(void*)) == 0); + + /* Drop the first result and map before both variables are reused for the second pass. */ vals = mfree(vals); + m = hashmap_free(m); + + assert_se(m = hashmap_new(NULL)); + + assert_se(hashmap_dump_sorted(m, &vals, &n) >= 0); + assert_se(n == 0); + assert_se(!vals); + + assert_se(hashmap_put(m, UINT_TO_PTR(333U), expected[2]) == 1); + assert_se(hashmap_put(m, UINT_TO_PTR(222U), expected[1]) == 1); + assert_se(hashmap_put(m, UINT_TO_PTR(111U), expected[0]) == 1); + + assert_se(hashmap_dump_sorted(m, &vals, &n) >= 0); + assert_se(n == ELEMENTSOF(expected)); + assert_se(memcmp(vals, expected, n * sizeof(void*)) == 0); +} + +/* Signal to test-hashmap.c that tests from this compilation unit were run. 
*/ +extern int n_extern_tests_run; +/* Increments n_extern_tests_run, the counter declared extern on the preceding line. */ TEST(ensure_extern_hashmap_tests) { + n_extern_tests_run++; +} diff --git a/src/test/test-hashmap.c b/src/test/test-hashmap.c new file mode 100644 index 0000000..5daa0e6 --- /dev/null +++ b/src/test/test-hashmap.c @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "hashmap.h" +#include "string-util.h" +#include "tests.h" + +unsigned custom_counter = 0; +/* Destructor handed to custom_hash_ops below: decrements custom_counter, then frees p. */ static void custom_destruct(void* p) { + custom_counter--; + free(p); +} + +DEFINE_HASH_OPS_FULL(boring_hash_ops, char, string_hash_func, string_compare_func, free, char, free); +DEFINE_HASH_OPS_FULL(custom_hash_ops, char, string_hash_func, string_compare_func, custom_destruct, char, custom_destruct); + +/* Inserts the keys -2 to 2, each mapped to key + 10, and expects ordered_hashmap_next(m, key) to return the value of the entry inserted right after it (key + 11). NULL is expected for the last key, for a NULL map and for a key that was never inserted. */ TEST(ordered_hashmap_next) { + _cleanup_ordered_hashmap_free_ OrderedHashmap *m = NULL; + int i; + + assert_se(m = ordered_hashmap_new(NULL)); + for (i = -2; i <= 2; i++) + assert_se(ordered_hashmap_put(m, INT_TO_PTR(i), INT_TO_PTR(i+10)) == 1); + for (i = -2; i <= 1; i++) + assert_se(ordered_hashmap_next(m, INT_TO_PTR(i)) == INT_TO_PTR(i+11)); + assert_se(!ordered_hashmap_next(m, INT_TO_PTR(2))); + assert_se(!ordered_hashmap_next(NULL, INT_TO_PTR(1))); + assert_se(!ordered_hashmap_next(m, INT_TO_PTR(3))); +} + +/* Expects uint64_compare_func() to return 0, -1 and 1 for an equal, an ascending and a descending pair built from 0x100 and 0x101. */ TEST(uint64_compare_func) { + const uint64_t a = 0x100, b = 0x101; + + assert_se(uint64_compare_func(&a, &a) == 0); + assert_se(uint64_compare_func(&a, &b) == -1); + assert_se(uint64_compare_func(&b, &a) == 1); +} + +/* Expects trivial_compare_func() to return 0, -1 and 1 for an equal, an ascending and a descending pair, with the characters a and b passed as pointer-cast integers. */ TEST(trivial_compare_func) { + assert_se(trivial_compare_func(INT_TO_PTR('a'), INT_TO_PTR('a')) == 0); + assert_se(trivial_compare_func(INT_TO_PTR('a'), INT_TO_PTR('b')) == -1); + assert_se(trivial_compare_func(INT_TO_PTR('b'), INT_TO_PTR('a')) == 1); +} + +/* Only checks zero versus non-zero: differing strings must compare as non-zero, identical strings as 0. */ TEST(string_compare_func) { + assert_se(string_compare_func("fred", "wilma") != 0); + assert_se(string_compare_func("fred", "fred") == 0); +} + +/* Helper for the iterated cache test: fetches the key and value arrays from the cache and asserts that they match, position by position and in total count, what HASHMAP_FOREACH_KEY() yields for the map. */ static void compare_cache(Hashmap *map, IteratedCache *cache) { + const void **keys = NULL, **values = NULL; + unsigned num, idx; + 
void *k, *v; + + assert_se(iterated_cache_get(cache, &keys, &values, &num) == 0); + assert_se(num == 0 || keys); /* the arrays may only be NULL when the cache reports zero entries */ + assert_se(num == 0 || values); + + idx = 0; /* walk the map and the cached arrays in lockstep */ + HASHMAP_FOREACH_KEY(v, k, map) { + assert_se(v == values[idx]); + assert_se(k == keys[idx]); + + idx++; + } + + assert_se(idx == num); /* the cache holds exactly as many entries as the iteration produced */ +} + +TEST(iterated_cache) { /* Uses compare_cache() to check that an IteratedCache stays in step with its hashmap across insertions, removals and hashmap_clear(). */ + Hashmap *m; + IteratedCache *c; + + assert_se(m = hashmap_new(NULL)); + assert_se(c = hashmap_iterated_cache_new(m)); + compare_cache(m, c); + + for (int stage = 0; stage < 100; stage++) { /* each stage inserts 100 keys of its own (stage * 1000 + i), every key mapped to key + 777 */ + + for (int i = 0; i < 100; i++) { + int foo = stage * 1000 + i; + + assert_se(hashmap_put(m, INT_TO_PTR(foo), INT_TO_PTR(foo + 777)) == 1); + } + + compare_cache(m, c); + + if (!(stage % 10)) { /* every tenth stage (0, 10, ..., 90) removes the keys it has just inserted */ + for (int i = 0; i < 100; i++) { + int foo = stage * 1000 + i; + + assert_se(hashmap_remove(m, INT_TO_PTR(foo)) == INT_TO_PTR(foo + 777)); + } + + compare_cache(m, c); + } + } + + hashmap_clear(m); + compare_cache(m, c); + + assert_se(hashmap_free(m) == NULL); + assert_se(iterated_cache_free(c) == NULL); +} + +TEST(hashmap_put_strdup) { /* Asserted results: 1 when a new pair is stored, 0 when the identical key/value pair is already present, -EEXIST when the key is present with a different value. */ + _cleanup_hashmap_free_ Hashmap *m = NULL; + char *s; + + /* We don't have ordered_hashmap_put_strdup() yet. If it is added, + * these tests should be moved to test-hashmap-plain.c. 
*/ + + assert_se(hashmap_put_strdup(&m, "foo", "bar") == 1); + assert_se(hashmap_put_strdup(&m, "foo", "bar") == 0); + assert_se(hashmap_put_strdup(&m, "foo", "BAR") == -EEXIST); + assert_se(hashmap_put_strdup(&m, "foo", "bar") == 0); + assert_se(hashmap_contains(m, "foo")); + + s = hashmap_get(m, "foo"); + assert_se(streq(s, "bar")); + + assert_se(hashmap_put_strdup(&m, "xxx", "bar") == 1); + assert_se(hashmap_put_strdup(&m, "xxx", "bar") == 0); + assert_se(hashmap_put_strdup(&m, "xxx", "BAR") == -EEXIST); + assert_se(hashmap_put_strdup(&m, "xxx", "bar") == 0); + assert_se(hashmap_contains(m, "xxx")); + + s = hashmap_get(m, "xxx"); + assert_se(streq(s, "bar")); +} + +TEST(hashmap_put_strdup_null) { /* Same checks with NULL values: a NULL value can be stored and read back, and NULL versus non-NULL for the same key is reported as -EEXIST. */ + _cleanup_hashmap_free_ Hashmap *m = NULL; + char *s; + + assert_se(hashmap_put_strdup(&m, "foo", "bar") == 1); + assert_se(hashmap_put_strdup(&m, "foo", "bar") == 0); + assert_se(hashmap_put_strdup(&m, "foo", NULL) == -EEXIST); + assert_se(hashmap_put_strdup(&m, "foo", "bar") == 0); + assert_se(hashmap_contains(m, "foo")); + + s = hashmap_get(m, "foo"); + assert_se(streq(s, "bar")); + + assert_se(hashmap_put_strdup(&m, "xxx", NULL) == 1); + assert_se(hashmap_put_strdup(&m, "xxx", "bar") == -EEXIST); + assert_se(hashmap_put_strdup(&m, "xxx", NULL) == 0); + assert_se(hashmap_contains(m, "xxx")); + + s = hashmap_get(m, "xxx"); + assert_se(s == NULL); +} + +/* This test program also runs the tests in test-hashmap-plain.c and those in test-hashmap-ordered.c, which is generated + * from test-hashmap-plain.c. Hashmap tests should be added to test-hashmap-plain.c, and here only if + * they don't apply to ordered hashmaps. */ + +/* This variable allows us to assert that the tests from different compilation units were actually run. */ +int n_extern_tests_run = 0; + +static int intro(void) { + assert_se(n_extern_tests_run == 0); + return EXIT_SUCCESS; +} + +static int outro(void) { + /* Ensure hashmap and ordered_hashmap were tested. 
*/ + assert_se(n_extern_tests_run == 2); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_FULL(LOG_INFO, intro, outro); diff --git a/src/test/test-hexdecoct.c b/src/test/test-hexdecoct.c new file mode 100644 index 0000000..f884008 --- /dev/null +++ b/src/test/test-hexdecoct.c @@ -0,0 +1,548 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "hexdecoct.h" +#include "macro.h" +#include "random-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(hexchar) { + assert_se(hexchar(0xa) == 'a'); + assert_se(hexchar(0x0) == '0'); +} + +TEST(unhexchar) { + assert_se(unhexchar('a') == 0xA); + assert_se(unhexchar('A') == 0xA); + assert_se(unhexchar('0') == 0x0); +} + +TEST(base32hexchar) { + assert_se(base32hexchar(0) == '0'); + assert_se(base32hexchar(9) == '9'); + assert_se(base32hexchar(10) == 'A'); + assert_se(base32hexchar(31) == 'V'); +} + +TEST(unbase32hexchar) { + assert_se(unbase32hexchar('0') == 0); + assert_se(unbase32hexchar('9') == 9); + assert_se(unbase32hexchar('A') == 10); + assert_se(unbase32hexchar('V') == 31); + assert_se(unbase32hexchar('=') == -EINVAL); +} + +TEST(base64char) { + assert_se(base64char(0) == 'A'); + assert_se(base64char(26) == 'a'); + assert_se(base64char(63) == '/'); +} + +TEST(unbase64char) { + assert_se(unbase64char('A') == 0); + assert_se(unbase64char('Z') == 25); + assert_se(unbase64char('a') == 26); + assert_se(unbase64char('z') == 51); + assert_se(unbase64char('0') == 52); + assert_se(unbase64char('9') == 61); + assert_se(unbase64char('+') == 62); + assert_se(unbase64char('/') == 63); + assert_se(unbase64char('=') == -EINVAL); +} + +TEST(octchar) { + assert_se(octchar(00) == '0'); + assert_se(octchar(07) == '7'); +} + +TEST(unoctchar) { + assert_se(unoctchar('0') == 00); + assert_se(unoctchar('7') == 07); +} + +TEST(decchar) { + assert_se(decchar(0) == '0'); + assert_se(decchar(9) == '9'); +} + +TEST(undecchar) { + assert_se(undecchar('0') == 0); + assert_se(undecchar('9') == 
9); +} + +static void test_hexmem_one(const char *in, const char *expected) { /* Encodes in with hexmem(), requires the text to equal expected, then decodes that text with unhexmem() and compares the bytes with in. */ + _cleanup_free_ char *result = NULL; + _cleanup_free_ void *mem = NULL; + size_t len; + + assert_se(result = hexmem(in, strlen_ptr(in))); + log_debug("hexmem(\"%s\") → \"%s\" (expected: \"%s\")", strnull(in), result, expected); + assert_se(streq(result, expected)); + + assert_se(unhexmem(result, SIZE_MAX, &mem, &len) >= 0); + assert_se(memcmp_safe(mem, in, len) == 0); +} + +TEST(hexmem) { + test_hexmem_one(NULL, ""); + test_hexmem_one("", ""); + test_hexmem_one("foo", "666f6f"); +} + +static void test_unhexmem_one(const char *s, size_t l, int retval) { /* Requires unhexmem(s, l, ...) to return retval. When retval is 0 the decoded bytes are re-encoded with hexmem() and must equal the first l characters of s with every WHITESPACE character deleted. */ + _cleanup_free_ char *hex = NULL; + _cleanup_free_ void *mem = NULL; + size_t len; + + assert_se(unhexmem(s, l, &mem, &len) == retval); + if (retval == 0) { + char *answer; + + if (l == SIZE_MAX) /* SIZE_MAX was passed: use the string's real length for the comparison below */ + l = strlen(s); + + assert_se(hex = hexmem(mem, len)); + answer = strndupa_safe(strempty(s), l); + assert_se(streq(delete_chars(answer, WHITESPACE), hex)); + } +} + +TEST(unhexmem) { + const char *hex = "efa2149213"; + const char *hex_space = " e f a\n 2\r 14\n\r\t9\t2 \n1\r3 \r\r\t"; + const char *hex_invalid = "efa214921o"; + + test_unhexmem_one(NULL, 0, 0); + test_unhexmem_one("", 0, 0); + test_unhexmem_one("", SIZE_MAX, 0); + test_unhexmem_one(" \n \t\r \t\t \n\n\n", SIZE_MAX, 0); + test_unhexmem_one(hex_invalid, strlen(hex_invalid), -EINVAL); + test_unhexmem_one(hex_invalid, (size_t) - 1, -EINVAL); /* (size_t) -1 equals SIZE_MAX */ + test_unhexmem_one(hex, strlen(hex) - 1, -EPIPE); /* only 9 of the 10 hex digits are passed, an odd count */ + test_unhexmem_one(hex, strlen(hex), 0); + test_unhexmem_one(hex, SIZE_MAX, 0); + test_unhexmem_one(hex_space, strlen(hex_space), 0); + test_unhexmem_one(hex_space, SIZE_MAX, 0); +} + +/* https://tools.ietf.org/html/rfc4648#section-10 */ +TEST(base32hexmem) { + char *b32; + + b32 = base32hexmem("", STRLEN(""), true); + assert_se(b32); + assert_se(streq(b32, "")); + free(b32); + + b32 = base32hexmem("f", STRLEN("f"), true); + assert_se(b32); + assert_se(streq(b32, "CO======")); + free(b32); + + 
b32 = base32hexmem("fo", STRLEN("fo"), true); + assert_se(b32); + assert_se(streq(b32, "CPNG====")); + free(b32); + + b32 = base32hexmem("foo", STRLEN("foo"), true); + assert_se(b32); + assert_se(streq(b32, "CPNMU===")); + free(b32); + + b32 = base32hexmem("foob", STRLEN("foob"), true); + assert_se(b32); + assert_se(streq(b32, "CPNMUOG=")); + free(b32); + + b32 = base32hexmem("fooba", STRLEN("fooba"), true); + assert_se(b32); + assert_se(streq(b32, "CPNMUOJ1")); + free(b32); + + b32 = base32hexmem("foobar", STRLEN("foobar"), true); + assert_se(b32); + assert_se(streq(b32, "CPNMUOJ1E8======")); + free(b32); + + b32 = base32hexmem("", STRLEN(""), false); + assert_se(b32); + assert_se(streq(b32, "")); + free(b32); + + b32 = base32hexmem("f", STRLEN("f"), false); + assert_se(b32); + assert_se(streq(b32, "CO")); + free(b32); + + b32 = base32hexmem("fo", STRLEN("fo"), false); + assert_se(b32); + assert_se(streq(b32, "CPNG")); + free(b32); + + b32 = base32hexmem("foo", STRLEN("foo"), false); + assert_se(b32); + assert_se(streq(b32, "CPNMU")); + free(b32); + + b32 = base32hexmem("foob", STRLEN("foob"), false); + assert_se(b32); + assert_se(streq(b32, "CPNMUOG")); + free(b32); + + b32 = base32hexmem("fooba", STRLEN("fooba"), false); + assert_se(b32); + assert_se(streq(b32, "CPNMUOJ1")); + free(b32); + + b32 = base32hexmem("foobar", STRLEN("foobar"), false); + assert_se(b32); + assert_se(streq(b32, "CPNMUOJ1E8")); + free(b32); +} + +static void test_unbase32hexmem_one(const char *hex, bool padding, int retval, const char *ans) { + _cleanup_free_ void *mem = NULL; + size_t len; + + assert_se(unbase32hexmem(hex, SIZE_MAX, padding, &mem, &len) == retval); + if (retval == 0) { + char *str; + + str = strndupa_safe(mem, len); + assert_se(streq(str, ans)); + } +} + +TEST(unbase32hexmem) { + test_unbase32hexmem_one("", true, 0, ""); + + test_unbase32hexmem_one("CO======", true, 0, "f"); + test_unbase32hexmem_one("CPNG====", true, 0, "fo"); + test_unbase32hexmem_one("CPNMU===", true, 
0, "foo"); + test_unbase32hexmem_one("CPNMUOG=", true, 0, "foob"); + test_unbase32hexmem_one("CPNMUOJ1", true, 0, "fooba"); + test_unbase32hexmem_one("CPNMUOJ1E8======", true, 0, "foobar"); + + test_unbase32hexmem_one("A", true, -EINVAL, NULL); + test_unbase32hexmem_one("A=======", true, -EINVAL, NULL); + test_unbase32hexmem_one("AAA=====", true, -EINVAL, NULL); + test_unbase32hexmem_one("AAAAAA==", true, -EINVAL, NULL); + test_unbase32hexmem_one("AB======", true, -EINVAL, NULL); + test_unbase32hexmem_one("AAAB====", true, -EINVAL, NULL); + test_unbase32hexmem_one("AAAAB===", true, -EINVAL, NULL); + test_unbase32hexmem_one("AAAAAAB=", true, -EINVAL, NULL); + + test_unbase32hexmem_one("XPNMUOJ1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CXNMUOJ1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CPXMUOJ1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CPNXUOJ1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CPNMXOJ1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CPNMUXJ1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CPNMUOX1", true, -EINVAL, NULL); + test_unbase32hexmem_one("CPNMUOJX", true, -EINVAL, NULL); + + test_unbase32hexmem_one("", false, 0, ""); + test_unbase32hexmem_one("CO", false, 0, "f"); + test_unbase32hexmem_one("CPNG", false, 0, "fo"); + test_unbase32hexmem_one("CPNMU", false, 0, "foo"); + test_unbase32hexmem_one("CPNMUOG", false, 0, "foob"); + test_unbase32hexmem_one("CPNMUOJ1", false, 0, "fooba"); + test_unbase32hexmem_one("CPNMUOJ1E8", false, 0, "foobar"); + test_unbase32hexmem_one("CPNMUOG=", false, -EINVAL, NULL); + test_unbase32hexmem_one("CPNMUOJ1E8======", false, -EINVAL, NULL); + + test_unbase32hexmem_one("A", false, -EINVAL, NULL); + test_unbase32hexmem_one("AAA", false, -EINVAL, NULL); + test_unbase32hexmem_one("AAAAAA", false, -EINVAL, NULL); + test_unbase32hexmem_one("AB", false, -EINVAL, NULL); + test_unbase32hexmem_one("AAAB", false, -EINVAL, NULL); + test_unbase32hexmem_one("AAAAB", false, -EINVAL, NULL); + 
test_unbase32hexmem_one("AAAAAAB", false, -EINVAL, NULL); +} + +/* https://tools.ietf.org/html/rfc4648#section-10 */ +TEST(base64mem) { + char *b64; + + assert_se(base64mem("", STRLEN(""), &b64) == 0); + assert_se(streq(b64, "")); + free(b64); + + assert_se(base64mem("f", STRLEN("f"), &b64) == 4); + assert_se(streq(b64, "Zg==")); + free(b64); + + assert_se(base64mem("fo", STRLEN("fo"), &b64) == 4); + assert_se(streq(b64, "Zm8=")); + free(b64); + + assert_se(base64mem("foo", STRLEN("foo"), &b64) == 4); + assert_se(streq(b64, "Zm9v")); + free(b64); + + assert_se(base64mem("foob", STRLEN("foob"), &b64) == 8); + assert_se(streq(b64, "Zm9vYg==")); + free(b64); + + assert_se(base64mem("fooba", STRLEN("fooba"), &b64) == 8); + assert_se(streq(b64, "Zm9vYmE=")); + free(b64); + + assert_se(base64mem("foobar", STRLEN("foobar"), &b64) == 8); + assert_se(streq(b64, "Zm9vYmFy")); + free(b64); +} + +TEST(base64mem_linebreak) { + uint8_t data[4096]; + + for (size_t i = 0; i < 20; i++) { + _cleanup_free_ char *encoded = NULL; + _cleanup_free_ void *decoded = NULL; + size_t decoded_size; + uint64_t n, m; + ssize_t l; + + /* Try a bunch of differently sized blobs */ + n = random_u64_range(sizeof(data)); + random_bytes(data, n); + + /* Break at various different columns */ + m = 1 + random_u64_range(n + 5); + + l = base64mem_full(data, n, m, &encoded); + assert_se(l >= 0); + assert_se(encoded); + assert_se((size_t) l == strlen(encoded)); + + assert_se(unbase64mem(encoded, SIZE_MAX, &decoded, &decoded_size) >= 0); + assert_se(decoded_size == n); + assert_se(memcmp(data, decoded, n) == 0); + + /* Also try in secure mode */ + decoded = mfree(decoded); + decoded_size = 0; + assert_se(unbase64mem_full(encoded, SIZE_MAX, /* secure= */ true, &decoded, &decoded_size) >= 0); + assert_se(decoded_size == n); + assert_se(memcmp(data, decoded, n) == 0); + + for (size_t j = 0; j < (size_t) l; j++) + assert_se((encoded[j] == '\n') == (j % (m + 1) == m)); + } +} + +static void 
test_base64_append_one(char **buf, size_t *len, const char *in, const char *expected) { + ssize_t new_len; + + new_len = base64_append(buf, *len, in, strlen_ptr(in), 8, 12); + assert_se(new_len >= 0); + log_debug("base64_append_one(\"%s\")\nresult:\n%s\nexpected:\n%s", in, strnull(*buf), strnull(expected)); + assert_se((size_t) new_len == strlen_ptr(*buf)); + assert_se(streq_ptr(*buf, expected)); + *len = new_len; +} + +TEST(base64_append) { + _cleanup_free_ char *buf = NULL; + size_t len = 0; + + test_base64_append_one(&buf, &len, "", NULL); + test_base64_append_one(&buf, &len, "f", + "Zg=="); + test_base64_append_one(&buf, &len, "fo", + "Zg== Zm8="); + test_base64_append_one(&buf, &len, "foo", + "Zg== Zm8=\n" + " Zm9v"); + test_base64_append_one(&buf, &len, "foob", + "Zg== Zm8=\n" + " Zm9v\n" + " Zm9v\n" + " Yg=="); + test_base64_append_one(&buf, &len, "fooba", + "Zg== Zm8=\n" + " Zm9v\n" + " Zm9v\n" + " Yg==\n" + " Zm9v\n" + " YmE="); + test_base64_append_one(&buf, &len, "foobar", + "Zg== Zm8=\n" + " Zm9v\n" + " Zm9v\n" + " Yg==\n" + " Zm9v\n" + " YmE=\n" + " Zm9v\n" + " YmFy"); + + assert_se(free_and_strdup(&buf, "hogehogehogehoge") >= 0); + len = strlen(buf); + + test_base64_append_one(&buf, &len, "", + "hogehogehogehoge"); + test_base64_append_one(&buf, &len, "f", + "hogehogehogehoge\n" + " Zg=="); + test_base64_append_one(&buf, &len, "fo", + "hogehogehogehoge\n" + " Zg==\n" + " Zm8="); + test_base64_append_one(&buf, &len, "foo", + "hogehogehogehoge\n" + " Zg==\n" + " Zm8=\n" + " Zm9v"); + test_base64_append_one(&buf, &len, "foob", + "hogehogehogehoge\n" + " Zg==\n" + " Zm8=\n" + " Zm9v\n" + " Zm9v\n" + " Yg=="); + test_base64_append_one(&buf, &len, "fooba", + "hogehogehogehoge\n" + " Zg==\n" + " Zm8=\n" + " Zm9v\n" + " Zm9v\n" + " Yg==\n" + " Zm9v\n" + " YmE="); + test_base64_append_one(&buf, &len, "foobar", + "hogehogehogehoge\n" + " Zg==\n" + " Zm8=\n" + " Zm9v\n" + " Zm9v\n" + " Yg==\n" + " Zm9v\n" + " YmE=\n" + " Zm9v\n" + " YmFy"); + + 
assert_se(free_and_strdup(&buf, "hogehogehogehoge") >= 0); + len = strlen(buf); + + test_base64_append_one(&buf, &len, "foobarfoobarfoobarfoobar", + "hogehogehogehoge\n" + " Zm9v\n" + " YmFy\n" + " Zm9v\n" + " YmFy\n" + " Zm9v\n" + " YmFy\n" + " Zm9v\n" + " YmFy"); + + assert_se(free_and_strdup(&buf, "aaa") >= 0); + len = strlen(buf); + + test_base64_append_one(&buf, &len, "foobarfoobarfoobarfoobar", + "aaa Zm9vYmFy\n" + " Zm9vYmFy\n" + " Zm9vYmFy\n" + " Zm9vYmFy"); +} + +static void test_unbase64mem_one(const char *input, const char *output, int ret) { + _cleanup_free_ void *buffer = NULL; + size_t size = 0; + + assert_se(unbase64mem(input, SIZE_MAX, &buffer, &size) == ret); + if (ret >= 0) { + assert_se(size == strlen(output)); + assert_se(memcmp(buffer, output, size) == 0); + assert_se(((char*) buffer)[size] == 0); + } + + /* also try in secure mode */ + buffer = mfree(buffer); + size = 0; + + assert_se(unbase64mem_full(input, SIZE_MAX, /* secure=*/ true, &buffer, &size) == ret); + if (ret >= 0) { + assert_se(size == strlen(output)); + assert_se(memcmp(buffer, output, size) == 0); + assert_se(((char*) buffer)[size] == 0); + } +} + +TEST(unbase64mem) { + + test_unbase64mem_one("", "", 0); + test_unbase64mem_one("Zg==", "f", 0); + test_unbase64mem_one("Zm8=", "fo", 0); + test_unbase64mem_one("Zm9v", "foo", 0); + test_unbase64mem_one("Zm9vYg==", "foob", 0); + test_unbase64mem_one("Zm9vYmE=", "fooba", 0); + test_unbase64mem_one("Zm9vYmFy", "foobar", 0); + + test_unbase64mem_one(" ", "", 0); + test_unbase64mem_one(" \n\r ", "", 0); + test_unbase64mem_one(" Zg\n== ", "f", 0); + test_unbase64mem_one(" Zm 8=\r", "fo", 0); + test_unbase64mem_one(" Zm9\n\r\r\nv ", "foo", 0); + test_unbase64mem_one(" Z m9vYg==\n\r", "foob", 0); + test_unbase64mem_one(" Zm 9vYmE= ", "fooba", 0); + test_unbase64mem_one(" Z m9v YmFy ", "foobar", 0); + + test_unbase64mem_one("A", NULL, -EPIPE); + test_unbase64mem_one("A====", NULL, -EINVAL); + test_unbase64mem_one("AAB==", NULL, -EINVAL); + 
test_unbase64mem_one(" A A A B = ", NULL, -EINVAL); + test_unbase64mem_one(" Z m 8 = q u u x ", NULL, -ENAMETOOLONG); +} + +TEST(hexdump) { + uint8_t data[146]; + unsigned i; + + hexdump(stdout, NULL, 0); + hexdump(stdout, "", 0); + hexdump(stdout, "", 1); + hexdump(stdout, "x", 1); + hexdump(stdout, "x", 2); + hexdump(stdout, "foobar", 7); + hexdump(stdout, "f\nobar", 7); + hexdump(stdout, "xxxxxxxxxxxxxxxxxxxxyz", 23); + + for (i = 0; i < ELEMENTSOF(data); i++) + data[i] = i*2; + + hexdump(stdout, data, sizeof(data)); +} + +TEST(base64withwithouturl) { + static const uint8_t plaintext[] = { + 0xcc, 0xa1, 0x72, 0x22, 0xae, 0xda, 0x66, 0x7e, 0x04, 0xa6, 0xe0, 0x82, + 0x9a, 0x97, 0x05, 0xf6, 0x33, 0xe0, 0x0f, 0xc2, 0x45, 0x13, 0x58, 0x3f, + 0xc5, 0xf4, 0xf4, 0x31, 0xab, 0x3c, 0x5f, 0x83, 0x34, 0x5b, 0x27, 0x32, + 0x8a, 0x04, 0x6c, 0x43, 0x82, 0x07, 0xe3, 0x2c, 0xac, 0xb9, 0xfb, 0xac, + 0xd0, 0x03, 0x91, 0x42, 0xcb, 0xa4, 0xde, 0x87, 0x86, 0x85, 0x10, 0xbb, + 0xb7, 0x5b, 0x4b, 0xc8, 0xa0, 0xf4, 0x22, 0x1d, 0x15, 0x71, 0x87, 0x9d, + 0xbf, 0x9f, 0xa9, 0xf1, 0xee, 0xa2, 0xb6, 0xaa, 0xc8, 0xc3, 0x37, 0x9c, + 0xbb, 0xdf, 0x3e, 0xac, 0xdc, 0x94, 0x54, 0x38, 0x56, 0x07, 0x34, 0xb4, + 0x3c, 0xcc, 0x31, 0x13 + }; + + _cleanup_free_ void *buffer = NULL; + size_t size; + + /* This is regular base64 */ + assert_se(unbase64mem("zKFyIq7aZn4EpuCCmpcF9jPgD8JFE1g/xfT0Mas8X4M0WycyigRsQ4IH4yysufus0AORQsuk3oeGhRC7t1tLyKD0Ih0VcYedv5+p8e6itqrIwzecu98+rNyUVDhWBzS0PMwxEw==", SIZE_MAX, &buffer, &size) >= 0); + assert_se(memcmp_nn(plaintext, sizeof(plaintext), buffer, size) == 0); + buffer = mfree(buffer); + + /* This is the same but in base64url */ + assert_se(unbase64mem("zKFyIq7aZn4EpuCCmpcF9jPgD8JFE1g_xfT0Mas8X4M0WycyigRsQ4IH4yysufus0AORQsuk3oeGhRC7t1tLyKD0Ih0VcYedv5-p8e6itqrIwzecu98-rNyUVDhWBzS0PMwxEw==", SIZE_MAX, &buffer, &size) >= 0); + assert_se(memcmp_nn(plaintext, sizeof(plaintext), buffer, size) == 0); + + /* Hint: use xxd -i to generate the static C array from some data, and 
basenc --base64 + basenc + * --base64url to generate the correctly encoded base64 strings */ +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-hmac.c b/src/test/test-hmac.c new file mode 100644 index 0000000..1b788b1 --- /dev/null +++ b/src/test/test-hmac.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "hexdecoct.h" +#include "hmac.h" +#include "string-util.h" +#include "tests.h" + +static void hmac_sha256_by_string(const char *key, const char *value, uint8_t res[static SHA256_DIGEST_SIZE]) { /* Calls hmac_sha256() with key and value given as NUL-terminated strings; their lengths come from strlen(). */ + hmac_sha256(key, strlen(key), value, strlen(value), res); +} + +TEST(hmac) { + uint8_t result[SHA256_DIGEST_SIZE]; + char *hex_result = NULL; + + /* Results compared with output of 'echo -n "$input" | openssl dgst -sha256 -hmac "$key"', where $key is the first and $input the second argument of each call below */ + + hmac_sha256_by_string("waldo", + "", + result); + hex_result = hexmem(result, sizeof(result)); + assert_se(streq_ptr(hex_result, "cadd5e42114351181f3abff477641d88efb57d2b5641a1e5c6d623363a6d3bad")); + hex_result = mfree(hex_result); + + hmac_sha256_by_string("waldo", + "baldohaldo", + result); + hex_result = hexmem(result, sizeof(result)); + assert_se(streq_ptr(hex_result, "c47ad5031ba21605e52c6ca68090d66a2dd5ccf84efa4bace15361a8cba63cda")); + hex_result = mfree(hex_result); + + hmac_sha256_by_string("waldo", + "baldo haldo", + result); + hex_result = hexmem(result, sizeof(result)); + assert_se(streq_ptr(hex_result, "4e8974ad6c08b98cc2519cd1e27aa7195769fcf86db1dd7ceaab4d44c490ad69")); + hex_result = mfree(hex_result); + + hmac_sha256_by_string("waldo", + "baldo 4e8974ad6c08b98cc2519cd1e27aa7195769fcf86db1dd7ceaab4d44c490ad69 haldo", + result); + hex_result = hexmem(result, sizeof(result)); + assert_se(streq_ptr(hex_result, "039f3df430b19753ffb493e5b90708f75c5210b63c6bcbef3374eb3f0a3f97f7")); + hex_result = mfree(hex_result); + + hmac_sha256_by_string("4e8974ad6c08b98cc2519cd1e27aa7195769fcf86db1dd7ceaab4d44c490ad69", + "baldo haldo", + result); + hex_result = hexmem(result, sizeof(result)); + 
assert_se(streq_ptr(hex_result, "c4cfaf48077cbb0bbd177a09e59ec4c248f4ca771503410f5b54b98d88d2f47b")); + hex_result = mfree(hex_result); + + hmac_sha256_by_string("4e8974ad6c08b98cc2519cd1e27aa7195769fcf86db1dd7ceaab4d44c490ad69", + "supercalifragilisticexpialidocious", + result); + hex_result = hexmem(result, sizeof(result)); + assert_se(streq_ptr(hex_result, "2c059e7a63c4c3b23f47966a65fd2f8a2f5d7161e2e90d78ff68866b5c375cb7")); + hex_result = mfree(hex_result); + + hmac_sha256_by_string("4e8974ad6c08b98cc2519cd1e27aa7195769fcf86db1dd7ceaab4d44c490ad69c47ad5031ba21605e52c6ca68090d66a2dd5ccf84efa4bace15361a8cba63cda", + "supercalifragilisticexpialidocious", + result); + hex_result = hexmem(result, sizeof(result)); + assert_se(streq_ptr(hex_result, "1dd1d1d45b9d9f9673dc9983c968c46ff3168e03cfeb4156a219eba1af4cff5f")); + hex_result = mfree(hex_result); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-hostname-setup.c b/src/test/test-hostname-setup.c new file mode 100644 index 0000000..94e5ece --- /dev/null +++ b/src/test/test-hostname-setup.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hostname-setup.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(read_etc_hostname) { /* Writes several sample hostname files to a temporary path and checks what read_etc_hostname() returns and stores for each. */ + _cleanup_(unlink_tempfilep) char path[] = "/tmp/hostname.XXXXXX"; + char *hostname; + int fd; + + fd = mkostemp_safe(path); + assert_se(fd >= 0); /* 0 is a valid descriptor, so only a negative value is treated as failure */ + close(fd); + + /* simple hostname */ + assert_se(write_string_file(path, "foo", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_etc_hostname(path, &hostname) == 0); + assert_se(streq(hostname, "foo")); + hostname = mfree(hostname); + + /* with comment */ + assert_se(write_string_file(path, "# comment\nfoo", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_etc_hostname(path, &hostname) == 0); + assert_se(hostname); + assert_se(streq(hostname, "foo")); + hostname = mfree(hostname); + + /* with 
comment and extra whitespace */ + assert_se(write_string_file(path, "# comment\n\n foo ", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_etc_hostname(path, &hostname) == 0); + assert_se(hostname); + assert_se(streq(hostname, "foo")); + hostname = mfree(hostname); + + /* cleans up name */ + assert_se(write_string_file(path, "!foo/bar.com", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_etc_hostname(path, &hostname) == 0); + assert_se(hostname); + assert_se(streq(hostname, "foobar.com")); + hostname = mfree(hostname); + + /* no value set */ + hostname = (char*) 0x1234; + assert_se(write_string_file(path, "# nothing here\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_etc_hostname(path, &hostname) == -ENOENT); + assert_se(hostname == (char*) 0x1234); /* does not touch argument on error */ + + /* nonexisting file */ + assert_se(read_etc_hostname("/non/existing", &hostname) == -ENOENT); + assert_se(hostname == (char*) 0x1234); /* does not touch argument on error */ +} + +TEST(hostname_setup) { + hostname_setup(false); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-hostname-util.c b/src/test/test-hostname-util.c new file mode 100644 index 0000000..77e9a19 --- /dev/null +++ b/src/test/test-hostname-util.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fileio.h" +#include "hostname-util.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(hostname_is_valid) { + assert_se(hostname_is_valid("foobar", 0)); + assert_se(hostname_is_valid("foobar.com", 0)); + assert_se(!hostname_is_valid("foobar.com.", 0)); + assert_se(hostname_is_valid("fooBAR", 0)); + assert_se(hostname_is_valid("fooBAR.com", 0)); + assert_se(!hostname_is_valid("fooBAR.", 0)); + assert_se(!hostname_is_valid("fooBAR.com.", 0)); + assert_se(!hostname_is_valid("fööbar", 0)); + assert_se(!hostname_is_valid("", 0)); + assert_se(!hostname_is_valid(".", 0)); + 
assert_se(!hostname_is_valid("..", 0)); + assert_se(!hostname_is_valid("foobar.", 0)); + assert_se(!hostname_is_valid(".foobar", 0)); + assert_se(!hostname_is_valid("foo..bar", 0)); + assert_se(!hostname_is_valid("foo.bar..", 0)); + assert_se(!hostname_is_valid("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", 0)); + assert_se(!hostname_is_valid("au-xph5-rvgrdsb5hcxc-47et3a5vvkrc-server-wyoz4elpdpe3.openstack.local", 0)); + + assert_se(hostname_is_valid("foobar", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(hostname_is_valid("foobar.com", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(hostname_is_valid("foobar.com.", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(hostname_is_valid("fooBAR", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(hostname_is_valid("fooBAR.com", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("fooBAR.", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(hostname_is_valid("fooBAR.com.", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("fööbar", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid(".", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("..", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("foobar.", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid(".foobar", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("foo..bar", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("foo.bar..", VALID_HOSTNAME_TRAILING_DOT)); + assert_se(!hostname_is_valid("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", VALID_HOSTNAME_TRAILING_DOT)); +} + +TEST(hostname_cleanup) { + char *s; + + s = strdupa_safe("foobar"); + assert_se(streq(hostname_cleanup(s), "foobar")); + s = strdupa_safe("foobar.com"); + assert_se(streq(hostname_cleanup(s), "foobar.com")); + s = strdupa_safe("foobar.com."); + 
assert_se(streq(hostname_cleanup(s), "foobar.com")); + s = strdupa_safe("foo-bar.-com-."); + assert_se(streq(hostname_cleanup(s), "foo-bar.com")); + s = strdupa_safe("foo-bar-.-com-."); + assert_se(streq(hostname_cleanup(s), "foo-bar--com")); + s = strdupa_safe("--foo-bar.-com"); + assert_se(streq(hostname_cleanup(s), "foo-bar.com")); + s = strdupa_safe("fooBAR"); + assert_se(streq(hostname_cleanup(s), "fooBAR")); + s = strdupa_safe("fooBAR.com"); + assert_se(streq(hostname_cleanup(s), "fooBAR.com")); + s = strdupa_safe("fooBAR."); + assert_se(streq(hostname_cleanup(s), "fooBAR")); + s = strdupa_safe("fooBAR.com."); + assert_se(streq(hostname_cleanup(s), "fooBAR.com")); + s = strdupa_safe("fööbar"); + assert_se(streq(hostname_cleanup(s), "fbar")); + s = strdupa_safe(""); + assert_se(isempty(hostname_cleanup(s))); + s = strdupa_safe("."); + assert_se(isempty(hostname_cleanup(s))); + s = strdupa_safe(".."); + assert_se(isempty(hostname_cleanup(s))); + s = strdupa_safe("foobar."); + assert_se(streq(hostname_cleanup(s), "foobar")); + s = strdupa_safe(".foobar"); + assert_se(streq(hostname_cleanup(s), "foobar")); + s = strdupa_safe("foo..bar"); + assert_se(streq(hostname_cleanup(s), "foo.bar")); + s = strdupa_safe("foo.bar.."); + assert_se(streq(hostname_cleanup(s), "foo.bar")); + s = strdupa_safe("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + assert_se(streq(hostname_cleanup(s), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")); + s = strdupa_safe("xxxx........xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + assert_se(streq(hostname_cleanup(s), "xxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")); +} + +TEST(hostname_malloc) { + _cleanup_free_ char *h = NULL, *l = NULL; + + assert_se(h = gethostname_malloc()); + log_info("hostname_malloc: \"%s\"", h); + + assert_se(l = gethostname_short_malloc()); + 
log_info("hostname_short_malloc: \"%s\"", l); +} + +TEST(default_hostname) { + if (!hostname_is_valid(FALLBACK_HOSTNAME, 0)) { + log_error("Configured fallback hostname \"%s\" is not valid.", FALLBACK_HOSTNAME); + exit(EXIT_FAILURE); + } + + _cleanup_free_ char *n = get_default_hostname(); + assert_se(n); + log_info("get_default_hostname: \"%s\"", n); + assert_se(hostname_is_valid(n, 0)); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-id128.c b/src/test/test-id128.c new file mode 100644 index 0000000..ae7df27 --- /dev/null +++ b/src/test/test-id128.c @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-id128.h" + +#include "alloc-util.h" +#include "fd-util.h" +#include "id128-util.h" +#include "macro.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +#define ID128_WALDI SD_ID128_MAKE(01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f, 10) +#define STR_WALDI "0102030405060708090a0b0c0d0e0f10" +#define UUID_WALDI "01020304-0506-0708-090a-0b0c0d0e0f10" +#define STR_NULL "00000000000000000000000000000000" + +TEST(id128) { + sd_id128_t id, id2; + char t[SD_ID128_STRING_MAX], q[SD_ID128_UUID_STRING_MAX]; + _cleanup_free_ char *b = NULL; + _cleanup_close_ int fd = -EBADF; + + assert_se(sd_id128_randomize(&id) == 0); + printf("random: %s\n", sd_id128_to_string(id, t)); + + assert_se(sd_id128_from_string(t, &id2) == 0); + assert_se(sd_id128_equal(id, id2)); + assert_se(sd_id128_in_set(id, id)); + assert_se(sd_id128_in_set(id, id2)); + assert_se(sd_id128_in_set(id, id2, id)); + assert_se(sd_id128_in_set(id, ID128_WALDI, id)); + assert_se(!sd_id128_in_set(id)); + assert_se(!sd_id128_in_set(id, ID128_WALDI)); + assert_se(!sd_id128_in_set(id, ID128_WALDI, ID128_WALDI)); + + if (sd_booted() > 0 && sd_id128_get_machine(NULL) >= 0) { + assert_se(sd_id128_get_machine(&id) == 0); + printf("machine: 
%s\n", sd_id128_to_string(id, t)); + + assert_se(sd_id128_get_boot(&id) == 0); + printf("boot: %s\n", sd_id128_to_string(id, t)); + } + + printf("waldi: %s\n", sd_id128_to_string(ID128_WALDI, t)); + assert_se(streq(t, STR_WALDI)); + + assert_se(asprintf(&b, SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(ID128_WALDI)) == 32); + printf("waldi2: %s\n", b); + assert_se(streq(t, b)); + + printf("waldi3: %s\n", sd_id128_to_uuid_string(ID128_WALDI, q)); + assert_se(streq(q, UUID_WALDI)); + + b = mfree(b); + assert_se(asprintf(&b, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(ID128_WALDI)) == 36); + printf("waldi4: %s\n", b); + assert_se(streq(q, b)); + + assert_se(sd_id128_from_string(STR_WALDI, &id) >= 0); + assert_se(sd_id128_equal(id, ID128_WALDI)); + + assert_se(sd_id128_from_string(UUID_WALDI, &id) >= 0); + assert_se(sd_id128_equal(id, ID128_WALDI)); + + assert_se(sd_id128_from_string("", &id) < 0); + assert_se(sd_id128_from_string("01020304-0506-0708-090a-0b0c0d0e0f101", &id) < 0); + assert_se(sd_id128_from_string("01020304-0506-0708-090a-0b0c0d0e0f10-", &id) < 0); + assert_se(sd_id128_from_string("01020304-0506-0708-090a0b0c0d0e0f10", &id) < 0); + assert_se(sd_id128_from_string("010203040506-0708-090a-0b0c0d0e0f10", &id) < 0); + + assert_se(id128_from_string_nonzero(STR_WALDI, &id) == 0); + assert_se(id128_from_string_nonzero(STR_NULL, &id) == -ENXIO); + assert_se(id128_from_string_nonzero("01020304-0506-0708-090a-0b0c0d0e0f101", &id) < 0); + assert_se(id128_from_string_nonzero("01020304-0506-0708-090a-0b0c0d0e0f10-", &id) < 0); + assert_se(id128_from_string_nonzero("01020304-0506-0708-090a0b0c0d0e0f10", &id) < 0); + assert_se(id128_from_string_nonzero("010203040506-0708-090a-0b0c0d0e0f10", &id) < 0); + + assert_se(id128_is_valid(STR_WALDI)); + assert_se(id128_is_valid(UUID_WALDI)); + assert_se(!id128_is_valid("")); + assert_se(!id128_is_valid("01020304-0506-0708-090a-0b0c0d0e0f101")); + assert_se(!id128_is_valid("01020304-0506-0708-090a-0b0c0d0e0f10-")); + 
assert_se(!id128_is_valid("01020304-0506-0708-090a0b0c0d0e0f10")); + assert_se(!id128_is_valid("010203040506-0708-090a-0b0c0d0e0f10")); + + fd = open_tmpfile_unlinkable(NULL, O_RDWR|O_CLOEXEC); + assert_se(fd >= 0); + + /* First, write as UUID */ + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(id128_write_fd(fd, ID128_FORMAT_UUID, id) >= 0); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_PLAIN, &id2) == -EUCLEAN); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_UUID, &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_ANY, &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + + /* Second, write as plain */ + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(id128_write_fd(fd, ID128_FORMAT_PLAIN, id) >= 0); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_UUID, &id2) == -EUCLEAN); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_PLAIN, &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_ANY, &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + + /* Third, write plain without trailing newline */ + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(write(fd, sd_id128_to_string(id, t), 32) == 32); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_UUID, &id2) == -EUCLEAN); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_PLAIN, &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + + /* Fourth, write UUID without trailing newline */ + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); 
+ + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(write(fd, sd_id128_to_uuid_string(id, q), 36) == 36); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_PLAIN, &id2) == -EUCLEAN); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_UUID, &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + + /* Fifth, tests for "uninitialized" */ + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + assert_se(write(fd, "uninitialized", STRLEN("uninitialized")) == STRLEN("uninitialized")); + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_ANY, NULL) == -ENOPKG); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + assert_se(write(fd, "uninitialized\n", STRLEN("uninitialized\n")) == STRLEN("uninitialized\n")); + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_ANY, NULL) == -ENOPKG); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + assert_se(write(fd, "uninitialized\nfoo", STRLEN("uninitialized\nfoo")) == STRLEN("uninitialized\nfoo")); + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_ANY, NULL) == -EUCLEAN); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + assert_se(write(fd, "uninit", STRLEN("uninit")) == STRLEN("uninit")); + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(id128_read_fd(fd, ID128_FORMAT_ANY, NULL) == -EUCLEAN); + + /* build/systemd-id128 -a f03daaeb1c334b43a732172944bf772e show 51df0b4bc3b04c9780e299b98ca373b8 */ + assert_se(sd_id128_get_app_specific(SD_ID128_MAKE(51,df,0b,4b,c3,b0,4c,97,80,e2,99,b9,8c,a3,73,b8), + SD_ID128_MAKE(f0,3d,aa,eb,1c,33,4b,43,a7,32,17,29,44,bf,77,2e), &id) >= 0); + assert_se(sd_id128_equal(id, SD_ID128_MAKE(1d,ee,59,54,e7,5c,4d,6f,b9,6c,c6,c0,4c,a1,8a,86))); + + if (sd_booted() > 0 && sd_id128_get_machine(NULL) >= 0) { + 
assert_se(sd_id128_get_machine_app_specific(SD_ID128_MAKE(f0,3d,aa,eb,1c,33,4b,43,a7,32,17,29,44,bf,77,2e), &id) >= 0); + assert_se(sd_id128_get_machine_app_specific(SD_ID128_MAKE(f0,3d,aa,eb,1c,33,4b,43,a7,32,17,29,44,bf,77,2e), &id2) >= 0); + assert_se(sd_id128_equal(id, id2)); + assert_se(sd_id128_get_machine_app_specific(SD_ID128_MAKE(51,df,0b,4b,c3,b0,4c,97,80,e2,99,b9,8c,a3,73,b8), &id2) >= 0); + assert_se(!sd_id128_equal(id, id2)); + } + + /* Check return values */ + assert_se(sd_id128_get_app_specific(SD_ID128_ALLF, SD_ID128_NULL, &id) == -ENXIO); + assert_se(sd_id128_get_app_specific(SD_ID128_NULL, SD_ID128_ALLF, &id) == 0); +} + +TEST(sd_id128_get_invocation) { + sd_id128_t id; + int r; + + /* Query the invocation ID */ + r = sd_id128_get_invocation(&id); + if (r < 0) + log_warning_errno(r, "Failed to get invocation ID, ignoring: %m"); + else + log_info("Invocation ID: " SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)); +} + +TEST(benchmark_sd_id128_get_machine_app_specific) { + unsigned iterations = slow_tests_enabled() ? 
1000000 : 1000; + usec_t t, q; + + if (sd_id128_get_machine(NULL) < 0) + return (void) log_tests_skipped("/etc/machine-id is not initialized"); + + log_info("/* %s (%u iterations) */", __func__, iterations); + + sd_id128_t id = ID128_WALDI, id2; + + t = now(CLOCK_MONOTONIC); + + for (unsigned i = 0; i < iterations; i++) { + id.qwords[1] = i; + + assert_se(sd_id128_get_machine_app_specific(id, &id2) >= 0); + } + + q = now(CLOCK_MONOTONIC) - t; + + log_info("%lf μs each", (double) q / iterations); +} + +TEST(id128_at) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF; + _cleanup_free_ char *p = NULL; + sd_id128_t id, i; + + tfd = mkdtemp_open(NULL, O_PATH, &t); + assert_se(tfd >= 0); + assert_se(mkdirat(tfd, "etc", 0755) >= 0); + assert_se(symlinkat("etc", tfd, "etc2") >= 0); + assert_se(symlinkat("machine-id", tfd, "etc/hoge-id") >= 0); + + assert_se(sd_id128_randomize(&id) == 0); + + assert_se(id128_write_at(tfd, "etc/machine-id", ID128_FORMAT_PLAIN, id) >= 0); + if (geteuid() == 0) + assert_se(id128_write_at(tfd, "etc/machine-id", ID128_FORMAT_PLAIN, id) >= 0); + else + assert_se(id128_write_at(tfd, "etc/machine-id", ID128_FORMAT_PLAIN, id) == -EACCES); + assert_se(unlinkat(tfd, "etc/machine-id", 0) >= 0); + assert_se(id128_write_at(tfd, "etc2/machine-id", ID128_FORMAT_PLAIN, id) >= 0); + assert_se(unlinkat(tfd, "etc/machine-id", 0) >= 0); + assert_se(id128_write_at(tfd, "etc/hoge-id", ID128_FORMAT_PLAIN, id) >= 0); + assert_se(unlinkat(tfd, "etc/machine-id", 0) >= 0); + assert_se(id128_write_at(tfd, "etc2/hoge-id", ID128_FORMAT_PLAIN, id) >= 0); + + /* id128_read_at() */ + i = SD_ID128_NULL; /* Not necessary in real code, but for testing that the id is really assigned. 
*/ + assert_se(id128_read_at(tfd, "etc/machine-id", ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + i = SD_ID128_NULL; + assert_se(id128_read_at(tfd, "etc2/machine-id", ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + i = SD_ID128_NULL; + assert_se(id128_read_at(tfd, "etc/hoge-id", ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + i = SD_ID128_NULL; + assert_se(id128_read_at(tfd, "etc2/hoge-id", ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + /* id128_read() */ + assert_se(p = path_join(t, "/etc/machine-id")); + + i = SD_ID128_NULL; + assert_se(id128_read(p, ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + free(p); + assert_se(p = path_join(t, "/etc2/machine-id")); + + i = SD_ID128_NULL; + assert_se(id128_read(p, ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + free(p); + assert_se(p = path_join(t, "/etc/hoge-id")); + + i = SD_ID128_NULL; + assert_se(id128_read(p, ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + free(p); + assert_se(p = path_join(t, "/etc2/hoge-id")); + + i = SD_ID128_NULL; + assert_se(id128_read(p, ID128_FORMAT_PLAIN, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + /* id128_get_machine_at() */ + i = SD_ID128_NULL; + assert_se(id128_get_machine_at(tfd, &i) >= 0); + assert_se(sd_id128_equal(id, i)); + + /* id128_get_machine() */ + i = SD_ID128_NULL; + assert_se(id128_get_machine(t, &i) >= 0); + assert_se(sd_id128_equal(id, i)); +} + +TEST(ID128_REFUSE_NULL) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF; + sd_id128_t id; + + tfd = mkdtemp_open(NULL, O_PATH, &t); + assert_se(tfd >= 0); + + assert_se(id128_write_at(tfd, "zero-id", ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, (sd_id128_t) {}) == -ENOMEDIUM); + assert_se(unlinkat(tfd, "zero-id", 0) >= 0); + assert_se(id128_write_at(tfd, "zero-id", ID128_FORMAT_PLAIN, (sd_id128_t) {}) >= 0); + + 
assert_se(sd_id128_randomize(&id) == 0); + assert_se(!sd_id128_equal(id, SD_ID128_NULL)); + assert_se(id128_read_at(tfd, "zero-id", ID128_FORMAT_PLAIN, &id) >= 0); + assert_se(sd_id128_equal(id, SD_ID128_NULL)); + + assert_se(id128_read_at(tfd, "zero-id", ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, &id) == -ENOMEDIUM); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-image-policy.c b/src/test/test-image-policy.c new file mode 100644 index 0000000..d9fe556 --- /dev/null +++ b/src/test/test-image-policy.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "image-policy.h" +#include "pretty-print.h" +#include "string-util.h" +#include "tests.h" +#include "pager.h" + +static void test_policy(const ImagePolicy *p, const char *name) { + _cleanup_free_ char *as_string = NULL, *as_string_simplified = NULL; + _cleanup_free_ ImagePolicy *parsed = NULL; + + assert_se(image_policy_to_string(p, /* simplified= */ false, &as_string) >= 0); + assert_se(image_policy_to_string(p, /* simplified= */ true, &as_string_simplified) >= 0); + + printf("%s%s", ansi_underline(), name); + + if (!streq(as_string_simplified, name)) { + printf(" → %s", as_string_simplified); + + if (!streq(as_string, as_string_simplified)) + printf(" (aka %s)", as_string); + } + + printf("%s\n", ansi_normal()); + + assert_se(image_policy_from_string(as_string, &parsed) >= 0); + assert_se(image_policy_equal(p, parsed)); + parsed = image_policy_free(parsed); + + assert_se(image_policy_from_string(as_string_simplified, &parsed) >= 0); + assert_se(image_policy_equivalent(p, parsed)); + parsed = image_policy_free(parsed); + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + _cleanup_free_ char *k = NULL; + PartitionPolicyFlags f; + + f = image_policy_get(p, d); + if (f < 0) { + f = image_policy_get_exhaustively(p, d); + assert_se(f >= 0); + assert_se(partition_policy_flags_to_string(f, /* simplified= */ true, &k) >= 0); + + printf("%s\t%s → n/a (exhaustively: 
%s)%s\n", ansi_grey(), partition_designator_to_string(d), k, ansi_normal()); + } else { + assert_se(partition_policy_flags_to_string(f, /* simplified= */ true, &k) >= 0); + printf("\t%s → %s\n", partition_designator_to_string(d), k); + } + } + + _cleanup_free_ char *w = NULL; + assert_se(partition_policy_flags_to_string(image_policy_default(p), /* simplified= */ true, &w) >= 0); + printf("\tdefault → %s\n", w); +} + +static void test_policy_string(const char *t) { + _cleanup_free_ ImagePolicy *parsed = NULL; + + assert_se(image_policy_from_string(t, &parsed) >= 0); + test_policy(parsed, t); +} + +static void test_policy_equiv(const char *s, bool (*func)(const ImagePolicy *p)) { + _cleanup_(image_policy_freep) ImagePolicy *p = NULL; + + assert_se(image_policy_from_string(s, &p) >= 0); + + assert_se(func(p)); + assert_se(func == image_policy_equiv_ignore || !image_policy_equiv_ignore(p)); + assert_se(func == image_policy_equiv_allow || !image_policy_equiv_allow(p)); + assert_se(func == image_policy_equiv_deny || !image_policy_equiv_deny(p)); +} + +TEST_RET(test_image_policy_to_string) { + test_policy(&image_policy_allow, "*"); + test_policy(&image_policy_ignore, "-"); + test_policy(&image_policy_deny, "~"); + test_policy(&image_policy_sysext, "sysext"); + test_policy(&image_policy_sysext_strict, "sysext-strict"); + test_policy(&image_policy_confext, "confext"); + test_policy(&image_policy_container, "container"); + test_policy(&image_policy_host, "host"); + test_policy(&image_policy_service, "service"); + test_policy(NULL, "null"); + + test_policy_string(""); + test_policy_string("-"); + test_policy_string("*"); + test_policy_string("~"); + test_policy_string("swap=open"); + test_policy_string("swap=open:root=signed"); + test_policy_string("swap=open:root=signed+read-only-on+growfs-off:=absent"); + test_policy_string("=-"); + test_policy_string("="); + + test_policy_equiv("", image_policy_equiv_ignore); + test_policy_equiv("-", image_policy_equiv_ignore); + 
test_policy_equiv("*", image_policy_equiv_allow); + test_policy_equiv("~", image_policy_equiv_deny); + test_policy_equiv("=absent", image_policy_equiv_deny); + test_policy_equiv("=open", image_policy_equiv_allow); + test_policy_equiv("=verity+signed+encrypted+unprotected+unused+absent", image_policy_equiv_allow); + test_policy_equiv("=signed+verity+encrypted+unused+unprotected+absent", image_policy_equiv_allow); + test_policy_equiv("=ignore", image_policy_equiv_ignore); + test_policy_equiv("=absent+unused", image_policy_equiv_ignore); + test_policy_equiv("=unused+absent", image_policy_equiv_ignore); + test_policy_equiv("root=ignore:=ignore", image_policy_equiv_ignore); + + assert_se(image_policy_from_string("pfft", NULL) == -EINVAL); + assert_se(image_policy_from_string("öäüß", NULL) == -EINVAL); + assert_se(image_policy_from_string(":", NULL) == -EINVAL); + assert_se(image_policy_from_string("a=", NULL) == -EBADSLT); + assert_se(image_policy_from_string("=a", NULL) == -EBADRQC); + assert_se(image_policy_from_string("==", NULL) == -EBADRQC); + assert_se(image_policy_from_string("root=verity:root=encrypted", NULL) == -ENOTUNIQ); + assert_se(image_policy_from_string("root=grbl", NULL) == -EBADRQC); + assert_se(image_policy_from_string("wowza=grbl", NULL) == -EBADSLT); + + return 0; +} + +TEST(extend) { + assert_se(partition_policy_flags_extend(0) == _PARTITION_POLICY_MASK); + assert_se(partition_policy_flags_extend(_PARTITION_POLICY_MASK) == _PARTITION_POLICY_MASK); + assert_se(partition_policy_flags_extend(PARTITION_POLICY_UNPROTECTED) == (PARTITION_POLICY_UNPROTECTED|_PARTITION_POLICY_PFLAGS_MASK)); + assert_se(partition_policy_flags_extend(PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_READ_ONLY_ON) == (PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_READ_ONLY_ON|_PARTITION_POLICY_GROWFS_MASK)); + assert_se(partition_policy_flags_extend(PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_READ_ONLY_ON|PARTITION_POLICY_GROWFS_OFF) == 
(PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_READ_ONLY_ON|PARTITION_POLICY_GROWFS_OFF)); + assert_se(partition_policy_flags_extend(PARTITION_POLICY_GROWFS_ON) == (PARTITION_POLICY_GROWFS_ON|_PARTITION_POLICY_USE_MASK|_PARTITION_POLICY_READ_ONLY_MASK)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-import-util.c b/src/test/test-import-util.c new file mode 100644 index 0000000..7930fe5 --- /dev/null +++ b/src/test/test-import-util.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "import-util.h" +#include "log.h" +#include "string-util.h" +#include "tests.h" + +static void test_import_url_last_component_one(const char *input, const char *output, int ret) { + _cleanup_free_ char *s = NULL; + + assert_se(import_url_last_component(input, &s) == ret); + assert_se(streq_ptr(output, s)); +} + +TEST(import_url_last_component) { + test_import_url_last_component_one("https://foobar/waldo/quux", "quux", 0); + test_import_url_last_component_one("https://foobar/waldo/quux/", "quux", 0); + test_import_url_last_component_one("https://foobar/waldo/", "waldo", 0); + test_import_url_last_component_one("https://foobar/", NULL, -EADDRNOTAVAIL); + test_import_url_last_component_one("https://foobar", NULL, -EADDRNOTAVAIL); + test_import_url_last_component_one("https://foobar/waldo/quux?foo=bar", "quux", 0); + test_import_url_last_component_one("https://foobar/waldo/quux/?foo=bar", "quux", 0); + test_import_url_last_component_one("https://foobar/waldo/quux/?foo=bar#piep", "quux", 0); + test_import_url_last_component_one("https://foobar/waldo/quux/#piep", "quux", 0); + test_import_url_last_component_one("https://foobar/waldo/quux#piep", "quux", 0); + test_import_url_last_component_one("https://", NULL, -EINVAL); + test_import_url_last_component_one("", NULL, -EINVAL); + test_import_url_last_component_one(":", NULL, -EINVAL); + test_import_url_last_component_one(":/", NULL, -EINVAL); + 
test_import_url_last_component_one("x:/", NULL, -EINVAL); + test_import_url_last_component_one("x:y", NULL, -EADDRNOTAVAIL); + test_import_url_last_component_one("x:y/z", "z", 0); +} + +static void test_import_url_change_suffix_one(const char *input, size_t n, const char *suffix, const char *output, int ret) { + _cleanup_free_ char *s = NULL; + + assert_se(import_url_change_suffix(input, n, suffix, &s) == ret); + assert_se(streq_ptr(output, s)); +} + +TEST(import_url_change_suffix) { + test_import_url_change_suffix_one("https://foobar/waldo/quux", 1, "wuff", "https://foobar/waldo/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux/", 1, "wuff", "https://foobar/waldo/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux///?mief", 1, "wuff", "https://foobar/waldo/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux///?mief#opopo", 1, "wuff", "https://foobar/waldo/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux/quff", 2, "wuff", "https://foobar/waldo/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux/quff/", 2, "wuff", "https://foobar/waldo/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux/quff", 0, "wuff", "https://foobar/waldo/quux/quff/wuff", 0); + test_import_url_change_suffix_one("https://foobar/waldo/quux/quff?aa?bb##4", 0, "wuff", "https://foobar/waldo/quux/quff/wuff", 0); + test_import_url_change_suffix_one("https://", 0, "wuff", NULL, -EINVAL); + test_import_url_change_suffix_one("", 0, "wuff", NULL, -EINVAL); + test_import_url_change_suffix_one(":", 0, "wuff", NULL, -EINVAL); + test_import_url_change_suffix_one(":/", 0, "wuff", NULL, -EINVAL); + test_import_url_change_suffix_one("x:/", 0, "wuff", NULL, -EINVAL); + test_import_url_change_suffix_one("x:y", 0, "wuff", "x:y/wuff", 0); + test_import_url_change_suffix_one("x:y/z", 0, "wuff", "x:y/z/wuff", 0); + test_import_url_change_suffix_one("x:y/z/", 0, "wuff", "x:y/z/wuff", 
0); + test_import_url_change_suffix_one("x:y/z/", 1, "wuff", "x:y/wuff", 0); + test_import_url_change_suffix_one("x:y/z/", 2, "wuff", "x:y/wuff", 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-in-addr-prefix-util.c b/src/test/test-in-addr-prefix-util.c new file mode 100644 index 0000000..661ca8f --- /dev/null +++ b/src/test/test-in-addr-prefix-util.c @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "in-addr-prefix-util.h" +#include "tests.h" + +static void test_in_addr_prefix_to_string_one(int f, const char *addr, unsigned prefixlen) { + union in_addr_union ua; + assert_se(in_addr_from_string(f, addr, &ua) >= 0); + + const char *r = IN_ADDR_PREFIX_TO_STRING(f, &ua, prefixlen); + assert_se(r); + printf("%s: %s/%u == %s\n", __func__, addr, prefixlen, r); + assert_se(startswith(r, addr)); + + assert_se(streq(r, IN_ADDR_PREFIX_TO_STRING(f, &ua, prefixlen))); + assert_se(streq(IN_ADDR_PREFIX_TO_STRING(f, &ua, prefixlen), r)); +} + +TEST(in_addr_to_string_prefix) { + test_in_addr_prefix_to_string_one(AF_INET, "192.168.0.1", 0); + test_in_addr_prefix_to_string_one(AF_INET, "192.168.0.1", 1); + test_in_addr_prefix_to_string_one(AF_INET, "192.168.0.1", 31); + test_in_addr_prefix_to_string_one(AF_INET, "192.168.0.1", 32); + test_in_addr_prefix_to_string_one(AF_INET, "192.168.0.1", 256); + test_in_addr_prefix_to_string_one(AF_INET, "10.11.12.13", UINT_MAX); + test_in_addr_prefix_to_string_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 0); + test_in_addr_prefix_to_string_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", UINT_MAX); + test_in_addr_prefix_to_string_one(AF_INET6, "::1", 11); + test_in_addr_prefix_to_string_one(AF_INET6, "fe80::", 33); +} + +static void test_config_parse_in_addr_prefixes_one(int family, const union in_addr_union *addr, uint8_t prefixlen, Set **prefixes) { + const char *str = IN_ADDR_PREFIX_TO_STRING(family, addr, prefixlen); + assert_se(str); + + 
assert_se(config_parse_in_addr_prefixes("unit", "filename", 1, "Service", 1, "IPAddressAllow", 0, str, prefixes, NULL) >= 0); + + assert_se(streq(str, IN_ADDR_PREFIX_TO_STRING(family, addr, prefixlen))); + assert_se(streq(IN_ADDR_PREFIX_TO_STRING(family, addr, prefixlen), str)); +} + +static void test_config_parse_in_addr_prefixes(Set **ret) { + _cleanup_set_free_ Set *prefixes = NULL; + + log_info("/* %s() */", __func__); + + for (uint32_t i = 0; i < 256; i++) { + /* ipv4 link-local address */ + test_config_parse_in_addr_prefixes_one(AF_INET, &(union in_addr_union) { + .in.s_addr = htobe32((UINT32_C(169) << 24) | + (UINT32_C(254) << 16) | + (i << 8)), + }, 24, &prefixes); + + /* ipv6 multicast address */ + test_config_parse_in_addr_prefixes_one(AF_INET6, &(union in_addr_union) { + .in6.s6_addr[0] = 0xff, + .in6.s6_addr[1] = i, + }, 16, &prefixes); + + for (uint32_t j = 0; j < 256; j++) { + test_config_parse_in_addr_prefixes_one(AF_INET, &(union in_addr_union) { + .in.s_addr = htobe32((UINT32_C(169) << 24) | + (UINT32_C(254) << 16) | + (i << 8) | j), + }, 32, &prefixes); + + test_config_parse_in_addr_prefixes_one(AF_INET6, &(union in_addr_union) { + .in6.s6_addr[0] = 0xff, + .in6.s6_addr[1] = i, + .in6.s6_addr[2] = j, + }, 24, &prefixes); + } + } + + *ret = TAKE_PTR(prefixes); +} + +static void test_in_addr_prefixes_reduce(Set *prefixes) { + log_info("/* %s() */", __func__); + + assert_se(set_size(prefixes) == 2 * 256 * 257); + assert_se(!in_addr_prefixes_is_any(prefixes)); + + assert_se(in_addr_prefixes_reduce(prefixes) >= 0); + assert_se(set_size(prefixes) == 2 * 256); + assert_se(!in_addr_prefixes_is_any(prefixes)); + + assert_se(config_parse_in_addr_prefixes("unit", "filename", 1, "Service", 1, "IPAddressAllow", 0, "link-local", &prefixes, NULL) == 0); + assert_se(set_size(prefixes) == 2 * 256 + 2); + assert_se(!in_addr_prefixes_is_any(prefixes)); + + assert_se(in_addr_prefixes_reduce(prefixes) >= 0); + assert_se(set_size(prefixes) == 256 + 2); + 
assert_se(!in_addr_prefixes_is_any(prefixes)); + + assert_se(config_parse_in_addr_prefixes("unit", "filename", 1, "Service", 1, "IPAddressAllow", 0, "multicast", &prefixes, NULL) == 0); + assert_se(set_size(prefixes) == 256 + 4); + assert_se(!in_addr_prefixes_is_any(prefixes)); + + assert_se(in_addr_prefixes_reduce(prefixes) >= 0); + assert_se(set_size(prefixes) == 4); + assert_se(!in_addr_prefixes_is_any(prefixes)); + + assert_se(config_parse_in_addr_prefixes("unit", "filename", 1, "Service", 1, "IPAddressAllow", 0, "any", &prefixes, NULL) == 0); + assert_se(set_size(prefixes) == 6); + assert_se(in_addr_prefixes_is_any(prefixes)); + + assert_se(in_addr_prefixes_reduce(prefixes) >= 0); + assert_se(set_size(prefixes) == 2); + assert_se(in_addr_prefixes_is_any(prefixes)); +} + +TEST(in_addr_prefixes) { + _cleanup_set_free_ Set *prefixes = NULL; + + test_config_parse_in_addr_prefixes(&prefixes); + test_in_addr_prefixes_reduce(prefixes); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-in-addr-util.c b/src/test/test-in-addr-util.c new file mode 100644 index 0000000..93ab1c5 --- /dev/null +++ b/src/test/test-in-addr-util.c @@ -0,0 +1,408 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "in-addr-util.h" +#include "strv.h" +#include "tests.h" + +static void test_in_addr_prefix_from_string_one( + const char *p, + int family, + int ret, + const union in_addr_union *u, + unsigned char prefixlen, + int ret_refuse, + unsigned char prefixlen_refuse) { + + union in_addr_union q; + unsigned char l; + int f, r; + + r = in_addr_prefix_from_string(p, family, &q, &l); + assert_se(r == ret); + + if (r < 0) + return; + + assert_se(in_addr_equal(family, &q, u)); + assert_se(l == prefixlen); + + r = in_addr_prefix_from_string_auto(p, &f, &q, &l); + assert_se(r >= 0); + + assert_se(f == family); + assert_se(in_addr_equal(family, &q, u)); + assert_se(l == prefixlen); + + r = in_addr_prefix_from_string_auto_internal(p, PREFIXLEN_REFUSE, &f, 
&q, &l); + assert_se(r == ret_refuse); + + if (r >= 0) { + assert_se(f == family); + assert_se(in_addr_equal(family, &q, u)); + assert_se(l == prefixlen_refuse); + } +} + +TEST(in_addr_prefix_from_string) { + test_in_addr_prefix_from_string_one("", AF_INET, -EINVAL, NULL, 0, -EINVAL, 0); + test_in_addr_prefix_from_string_one("/", AF_INET, -EINVAL, NULL, 0, -EINVAL, 0); + test_in_addr_prefix_from_string_one("/8", AF_INET, -EINVAL, NULL, 0, -EINVAL, 0); + test_in_addr_prefix_from_string_one("1.2.3.4", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 32, -ENOANO, 0); + test_in_addr_prefix_from_string_one("1.2.3.4/0", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 0, 0, 0); + test_in_addr_prefix_from_string_one("1.2.3.4/1", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 1, 0, 1); + test_in_addr_prefix_from_string_one("1.2.3.4/2", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 2, 0, 2); + test_in_addr_prefix_from_string_one("1.2.3.4/32", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 32, 0, 32); + test_in_addr_prefix_from_string_one("1.2.3.4/33", AF_INET, -ERANGE, NULL, 0, -ERANGE, 0); + test_in_addr_prefix_from_string_one("1.2.3.4/-1", AF_INET, -ERANGE, NULL, 0, -ERANGE, 0); + test_in_addr_prefix_from_string_one("::1", AF_INET, -EINVAL, NULL, 0, -EINVAL, 0); + + test_in_addr_prefix_from_string_one("", AF_INET6, -EINVAL, NULL, 0, -EINVAL, 0); + test_in_addr_prefix_from_string_one("/", AF_INET6, -EINVAL, NULL, 0, -EINVAL, 0); + test_in_addr_prefix_from_string_one("/8", AF_INET6, -EINVAL, NULL, 0, -EINVAL, 0); + test_in_addr_prefix_from_string_one("::1", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 128, -ENOANO, 0); + test_in_addr_prefix_from_string_one("::1/0", AF_INET6, 0, &(union in_addr_union) { .in6 
= IN6ADDR_LOOPBACK_INIT }, 0, 0, 0); + test_in_addr_prefix_from_string_one("::1/1", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 1, 0, 1); + test_in_addr_prefix_from_string_one("::1/2", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 2, 0, 2); + test_in_addr_prefix_from_string_one("::1/32", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 32, 0, 32); + test_in_addr_prefix_from_string_one("::1/33", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 33, 0, 33); + test_in_addr_prefix_from_string_one("::1/64", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 64, 0, 64); + test_in_addr_prefix_from_string_one("::1/128", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 128, 0, 128); + test_in_addr_prefix_from_string_one("::1/129", AF_INET6, -ERANGE, NULL, 0, -ERANGE, 0); + test_in_addr_prefix_from_string_one("::1/-1", AF_INET6, -ERANGE, NULL, 0, -ERANGE, 0); +} + +static void test_in_addr_prefix_to_string_valid(int family, const char *p) { + union in_addr_union u; + unsigned char l; + + log_info("%s: %s", __func__, p); + + assert_se(in_addr_prefix_from_string(p, family, &u, &l) >= 0); + assert_se(streq(p, IN_ADDR_PREFIX_TO_STRING(family, &u, l))); +} + +static void test_in_addr_prefix_to_string_unoptimized(int family, const char *p) { + union in_addr_union u1, u2; + unsigned char len1, len2; + + log_info("%s: %s", __func__, p); + + assert_se(in_addr_prefix_from_string(p, family, &u1, &len1) >= 0); + const char *str1 = IN_ADDR_PREFIX_TO_STRING(family, &u1, len1); + assert_se(str1); + assert_se(in_addr_prefix_from_string(str1, family, &u2, &len2) >= 0); + const char *str2 = IN_ADDR_PREFIX_TO_STRING(family, &u2, len2); + assert_se(str2); + + assert_se(streq(str1, str2)); + assert_se(len1 == len2); + assert_se(in_addr_equal(family, &u1, &u2) > 0); +} + +TEST(in_addr_prefix_to_string) { + test_in_addr_prefix_to_string_valid(AF_INET, "0.0.0.0/32"); + 
test_in_addr_prefix_to_string_valid(AF_INET, "1.2.3.4/0"); + test_in_addr_prefix_to_string_valid(AF_INET, "1.2.3.4/24"); + test_in_addr_prefix_to_string_valid(AF_INET, "1.2.3.4/32"); + test_in_addr_prefix_to_string_valid(AF_INET, "255.255.255.255/32"); + + test_in_addr_prefix_to_string_valid(AF_INET6, "::1/128"); + test_in_addr_prefix_to_string_valid(AF_INET6, "fd00:abcd::1/64"); + test_in_addr_prefix_to_string_valid(AF_INET6, "fd00:abcd::1234:1/64"); + test_in_addr_prefix_to_string_valid(AF_INET6, "1111:2222:3333:4444:5555:6666:7777:8888/128"); + + test_in_addr_prefix_to_string_unoptimized(AF_INET, "0.0.0.0"); + test_in_addr_prefix_to_string_unoptimized(AF_INET, "192.168.0.1"); + + test_in_addr_prefix_to_string_unoptimized(AF_INET6, "fd00:0000:0000:0000:0000:0000:0000:0001/64"); + test_in_addr_prefix_to_string_unoptimized(AF_INET6, "fd00:1111::0000:2222:3333:4444:0001/64"); +} + +TEST(in_addr_random_prefix) { + _cleanup_free_ char *str = NULL; + union in_addr_union a; + + assert_se(in_addr_from_string(AF_INET, "192.168.10.1", &a) >= 0); + + assert_se(in_addr_random_prefix(AF_INET, &a, 31, 32) >= 0); + assert_se(in_addr_to_string(AF_INET, &a, &str) >= 0); + assert_se(STR_IN_SET(str, "192.168.10.0", "192.168.10.1")); + str = mfree(str); + + assert_se(in_addr_random_prefix(AF_INET, &a, 24, 26) >= 0); + assert_se(in_addr_to_string(AF_INET, &a, &str) >= 0); + assert_se(startswith(str, "192.168.10.")); + str = mfree(str); + + assert_se(in_addr_random_prefix(AF_INET, &a, 16, 24) >= 0); + assert_se(in_addr_to_string(AF_INET, &a, &str) >= 0); + assert_se(fnmatch("192.168.[0-9]*.0", str, 0) == 0); + str = mfree(str); + + assert_se(in_addr_random_prefix(AF_INET, &a, 8, 24) >= 0); + assert_se(in_addr_to_string(AF_INET, &a, &str) >= 0); + assert_se(fnmatch("192.[0-9]*.[0-9]*.0", str, 0) == 0); + str = mfree(str); + + assert_se(in_addr_random_prefix(AF_INET, &a, 8, 16) >= 0); + assert_se(in_addr_to_string(AF_INET, &a, &str) >= 0); + assert_se(fnmatch("192.[0-9]*.0.0", str, 0) 
== 0); + str = mfree(str); + + assert_se(in_addr_from_string(AF_INET6, "fd00::1", &a) >= 0); + + assert_se(in_addr_random_prefix(AF_INET6, &a, 16, 64) >= 0); + assert_se(in_addr_to_string(AF_INET6, &a, &str) >= 0); + assert_se(startswith(str, "fd00:")); + str = mfree(str); + + assert_se(in_addr_random_prefix(AF_INET6, &a, 8, 16) >= 0); + assert_se(in_addr_to_string(AF_INET6, &a, &str) >= 0); + assert_se(fnmatch("fd??::", str, 0) == 0); + str = mfree(str); +} + +TEST(in_addr_is_null) { + union in_addr_union i = {}; + + assert_se(in_addr_is_null(AF_INET, &i) == true); + assert_se(in_addr_is_null(AF_INET6, &i) == true); + + i.in.s_addr = 0x1000000; + assert_se(in_addr_is_null(AF_INET, &i) == false); + assert_se(in_addr_is_null(AF_INET6, &i) == false); + + assert_se(in_addr_is_null(-1, &i) == -EAFNOSUPPORT); +} + +static void test_in_addr_prefix_intersect_one(unsigned f, const char *a, unsigned apl, const char *b, unsigned bpl, int result) { + union in_addr_union ua, ub; + + assert_se(in_addr_from_string(f, a, &ua) >= 0); + assert_se(in_addr_from_string(f, b, &ub) >= 0); + + assert_se(in_addr_prefix_intersect(f, &ua, apl, &ub, bpl) == result); +} + +TEST(in_addr_prefix_intersect) { + test_in_addr_prefix_intersect_one(AF_INET, "255.255.255.255", 32, "255.255.255.254", 32, 0); + test_in_addr_prefix_intersect_one(AF_INET, "255.255.255.255", 0, "255.255.255.255", 32, 1); + test_in_addr_prefix_intersect_one(AF_INET, "0.0.0.0", 0, "47.11.8.15", 32, 1); + + test_in_addr_prefix_intersect_one(AF_INET, "1.1.1.1", 24, "1.1.1.1", 24, 1); + test_in_addr_prefix_intersect_one(AF_INET, "2.2.2.2", 24, "1.1.1.1", 24, 0); + + test_in_addr_prefix_intersect_one(AF_INET, "1.1.1.1", 24, "1.1.1.127", 25, 1); + test_in_addr_prefix_intersect_one(AF_INET, "1.1.1.1", 24, "1.1.1.127", 26, 1); + test_in_addr_prefix_intersect_one(AF_INET, "1.1.1.1", 25, "1.1.1.127", 25, 1); + test_in_addr_prefix_intersect_one(AF_INET, "1.1.1.1", 25, "1.1.1.255", 25, 0); + + 
test_in_addr_prefix_intersect_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 128, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:fffe", 128, 0); + test_in_addr_prefix_intersect_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 0, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 128, 1); + test_in_addr_prefix_intersect_one(AF_INET6, "::", 0, "beef:beef:beef:beef:beef:beef:beef:beef", 128, 1); + + test_in_addr_prefix_intersect_one(AF_INET6, "1::2", 64, "1::2", 64, 1); + test_in_addr_prefix_intersect_one(AF_INET6, "2::2", 64, "1::2", 64, 0); + + test_in_addr_prefix_intersect_one(AF_INET6, "1::1", 120, "1::007f", 121, 1); + test_in_addr_prefix_intersect_one(AF_INET6, "1::1", 120, "1::007f", 122, 1); + test_in_addr_prefix_intersect_one(AF_INET6, "1::1", 121, "1::007f", 121, 1); + test_in_addr_prefix_intersect_one(AF_INET6, "1::1", 121, "1::00ff", 121, 0); +} + +static void test_in_addr_prefix_next_one(unsigned f, const char *before, unsigned pl, const char *after) { /* Applies in_addr_prefix_next() to "before" with prefix length pl. A NULL "after" means the call is expected to fail; otherwise it must succeed and produce "after". */ + union in_addr_union ubefore, uafter, t; + + log_debug("/* %s(%s, prefixlen=%u) */", __func__, before, pl); + + assert_se(in_addr_from_string(f, before, &ubefore) >= 0); + + t = ubefore; /* the call below updates its argument in place, so work on a copy */ + assert_se((in_addr_prefix_next(f, &t, pl) >= 0) == !!after); + + if (after) { + assert_se(in_addr_from_string(f, after, &uafter) >= 0); + assert_se(in_addr_equal(f, &t, &uafter) > 0); + } +} + +TEST(in_addr_prefix_next) { /* Success cases step to the following prefix of the same length; the NULL cases start from the last prefix of the address space and must fail. */ + test_in_addr_prefix_next_one(AF_INET, "192.168.0.0", 24, "192.168.1.0"); + test_in_addr_prefix_next_one(AF_INET, "192.168.0.0", 16, "192.169.0.0"); + test_in_addr_prefix_next_one(AF_INET, "192.168.0.0", 20, "192.168.16.0"); + + test_in_addr_prefix_next_one(AF_INET, "0.0.0.0", 32, "0.0.0.1"); + test_in_addr_prefix_next_one(AF_INET, "255.255.255.254", 32, "255.255.255.255"); + test_in_addr_prefix_next_one(AF_INET, "255.255.255.255", 32, NULL); + test_in_addr_prefix_next_one(AF_INET, "255.255.255.0", 24, NULL); + + test_in_addr_prefix_next_one(AF_INET6, "4400::", 128, "4400::0001"); +
test_in_addr_prefix_next_one(AF_INET6, "4400::", 120, "4400::0100"); + test_in_addr_prefix_next_one(AF_INET6, "4400::", 127, "4400::0002"); + test_in_addr_prefix_next_one(AF_INET6, "4400::", 8, "4500::"); + test_in_addr_prefix_next_one(AF_INET6, "4400::", 7, "4600::"); + + test_in_addr_prefix_next_one(AF_INET6, "::", 128, "::1"); + + test_in_addr_prefix_next_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 128, NULL); + test_in_addr_prefix_next_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ff00", 120, NULL); +} + +static void test_in_addr_prefix_nth_one(unsigned f, const char *before, unsigned pl, uint64_t nth, const char *after) { /* Applies in_addr_prefix_nth() to "before" with prefix length pl and index nth. A NULL "after" means the call is expected to fail; otherwise it must succeed and produce "after". */ + union in_addr_union ubefore, uafter, t; + + log_debug("/* %s(%s, prefixlen=%u, nth=%"PRIu64") */", __func__, before, pl, nth); + + assert_se(in_addr_from_string(f, before, &ubefore) >= 0); + + t = ubefore; /* the call below updates its argument in place, so work on a copy */ + assert_se((in_addr_prefix_nth(f, &t, pl, nth) >= 0) == !!after); + + if (after) { + assert_se(in_addr_from_string(f, after, &uafter) >= 0); + assert_se(in_addr_equal(f, &t, &uafter) > 0); + } +} + +TEST(in_addr_prefix_nth) { /* The nth == 0 cases expect the bits beyond the prefix to be cleared ("192.168.0.123"/24 gives "192.168.0.0"); the NULL cases are expected to fail. */ + test_in_addr_prefix_nth_one(AF_INET, "192.168.0.0", 24, 0, "192.168.0.0"); + test_in_addr_prefix_nth_one(AF_INET, "192.168.0.123", 24, 0, "192.168.0.0"); + test_in_addr_prefix_nth_one(AF_INET, "192.168.0.123", 24, 1, "192.168.1.0"); + test_in_addr_prefix_nth_one(AF_INET, "192.168.0.0", 24, 4, "192.168.4.0"); + test_in_addr_prefix_nth_one(AF_INET, "192.168.0.0", 25, 1, "192.168.0.128"); + test_in_addr_prefix_nth_one(AF_INET, "192.168.255.0", 25, 1, "192.168.255.128"); + test_in_addr_prefix_nth_one(AF_INET, "192.168.255.0", 24, 0, "192.168.255.0"); + test_in_addr_prefix_nth_one(AF_INET, "255.255.255.255", 32, 1, NULL); + test_in_addr_prefix_nth_one(AF_INET, "255.255.255.255", 0, 1, NULL); + + test_in_addr_prefix_nth_one(AF_INET6, "4400::", 8, 1, "4500::"); + test_in_addr_prefix_nth_one(AF_INET6, "4400::", 7, 1, "4600::"); + test_in_addr_prefix_nth_one(AF_INET6, "4400::", 64, 1, "4400:0:0:1::"); +
test_in_addr_prefix_nth_one(AF_INET6, "4400::", 64, 2, "4400:0:0:2::"); + test_in_addr_prefix_nth_one(AF_INET6, "4400::", 64, 0xbad, "4400:0:0:0bad::"); + test_in_addr_prefix_nth_one(AF_INET6, "4400:0:0:ffff::", 64, 1, "4400:0:1::"); + test_in_addr_prefix_nth_one(AF_INET6, "4400::", 56, ((uint64_t)1<<48) -1, "44ff:ffff:ffff:ff00::"); + test_in_addr_prefix_nth_one(AF_INET6, "0000::", 8, 255, "ff00::"); + test_in_addr_prefix_nth_one(AF_INET6, "0000::", 8, 256, NULL); + test_in_addr_prefix_nth_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 128, 1, NULL); + test_in_addr_prefix_nth_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 0, 1, NULL); + test_in_addr_prefix_nth_one(AF_INET6, "1234:5678:90ab:cdef:1234:5678:90ab:cdef", 12, 1, "1240::"); +} + +static void test_in_addr_prefix_range_one( + int family, + const char *in, + unsigned prefixlen, + const char *expected_start, + const char *expected_end) { /* Runs in_addr_prefix_range() on in/prefixlen. The call is expected to succeed exactly when expected_start is non-NULL; each non-NULL expectation is then compared with the corresponding output. */ + + union in_addr_union a, s, e; + + log_debug("/* %s(%s, prefixlen=%u) */", __func__, in, prefixlen); + + assert_se(in_addr_from_string(family, in, &a) >= 0); + assert_se((in_addr_prefix_range(family, &a, prefixlen, &s, &e) >= 0) == !!expected_start); + + if (expected_start) { + union in_addr_union es; + + assert_se(in_addr_from_string(family, expected_start, &es) >= 0); + assert_se(in_addr_equal(family, &s, &es) > 0); + } + if (expected_end) { /* NOTE(review): checked independently of expected_start; the calls in TEST(in_addr_prefix_range) always pass both */ + union in_addr_union ee; + + assert_se(in_addr_from_string(family, expected_end, &ee) >= 0); + assert_se(in_addr_equal(family, &e, &ee) > 0); + } +} + +TEST(in_addr_prefix_range) { /* In this table the expected end is the first address after the prefix, e.g. "192.168.124.0" for "192.168.123.123"/24. */ + test_in_addr_prefix_range_one(AF_INET, "192.168.123.123", 24, "192.168.123.0", "192.168.124.0"); + test_in_addr_prefix_range_one(AF_INET, "192.168.123.123", 16, "192.168.0.0", "192.169.0.0"); + + test_in_addr_prefix_range_one(AF_INET6, "dead:beef::", 64, "dead:beef::", "dead:beef:0:1::"); + test_in_addr_prefix_range_one(AF_INET6, "dead:0:0:beef::", 64, "dead:0:0:beef::", "dead:0:0:bef0::"); + test_in_addr_prefix_range_one(AF_INET6,
"2001::", 48, "2001::", "2001:0:1::"); + test_in_addr_prefix_range_one(AF_INET6, "2001::", 56, "2001::", "2001:0:0:0100::"); + test_in_addr_prefix_range_one(AF_INET6, "2001::", 65, "2001::", "2001::8000:0:0:0"); + test_in_addr_prefix_range_one(AF_INET6, "2001::", 66, "2001::", "2001::4000:0:0:0"); + test_in_addr_prefix_range_one(AF_INET6, "2001::", 127, "2001::", "2001::2"); +} + +static void test_in_addr_to_string_one(int f, const char *addr) { /* Round-trips addr through in_addr_from_string() and in_addr_to_string() and requires the identical text back, also from the IN_ADDR_TO_STRING() macro. */ + union in_addr_union ua; + _cleanup_free_ char *r = NULL; /* initialized like the other _cleanup_free_ locals in this file, so the cleanup handler never sees an indeterminate pointer */ + + assert_se(in_addr_from_string(f, addr, &ua) >= 0); + assert_se(in_addr_to_string(f, &ua, &r) >= 0); + printf("%s: %s == %s\n", __func__, addr, r); + assert_se(streq(addr, r)); + + assert_se(streq(r, IN_ADDR_TO_STRING(f, &ua))); +} + +TEST(in_addr_to_string) { /* The inputs are spelled the way the formatter is expected to print them, since the helper compares with streq(). */ + test_in_addr_to_string_one(AF_INET, "192.168.0.1"); + test_in_addr_to_string_one(AF_INET, "10.11.12.13"); + test_in_addr_to_string_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"); + test_in_addr_to_string_one(AF_INET6, "::1"); + test_in_addr_to_string_one(AF_INET6, "fe80::"); +} + +TEST(in_addr_prefixlen_to_netmask) { /* Compares in_addr_prefixlen_to_netmask() output, formatted as text, against the tables below, which are indexed by prefix length. */ + union in_addr_union addr; + static const char *const ipv4_netmasks[] = { + "0.0.0.0", "128.0.0.0", "192.0.0.0", "224.0.0.0", "240.0.0.0", + "248.0.0.0", "252.0.0.0", "254.0.0.0", "255.0.0.0", + "255.128.0.0", "255.192.0.0", "255.224.0.0", "255.240.0.0", + "255.248.0.0", "255.252.0.0", "255.254.0.0", "255.255.0.0", + "255.255.128.0", "255.255.192.0", "255.255.224.0", "255.255.240.0", + "255.255.248.0", "255.255.252.0", "255.255.254.0", "255.255.255.0", + "255.255.255.128", "255.255.255.192", "255.255.255.224", "255.255.255.240", + "255.255.255.248", "255.255.255.252", "255.255.255.254", "255.255.255.255", + }; + + static const char *const ipv6_netmasks[] = { + [0] = "::", + [1] = "8000::", + [2] = "c000::", + [7] = "fe00::", + [8] = "ff00::", + [9] = "ff80::", + [16] = "ffff::", + [17] = "ffff:8000::", + [32] = "ffff:ffff::", + [33] = "ffff:ffff:8000::", + [64] = "ffff:ffff:ffff:ffff::",
+ [65] = "ffff:ffff:ffff:ffff:8000::", + [96] = "ffff:ffff:ffff:ffff:ffff:ffff::", + [97] = "ffff:ffff:ffff:ffff:ffff:ffff:8000:0", + [127] = "ffff:ffff:ffff:ffff:ffff:ffff:ffff:fffe", + [128] = "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff" + }; + + for (unsigned char prefixlen = 0; prefixlen <= 32; prefixlen++) { /* ipv4_netmasks has one entry for every prefix length 0..32 */ + _cleanup_free_ char *result = NULL; + + assert_se(in_addr_prefixlen_to_netmask(AF_INET, &addr, prefixlen) >= 0); + assert_se(in_addr_to_string(AF_INET, &addr, &result) >= 0); + printf("test_in_addr_prefixlen_to_netmask: %s == %s\n", ipv4_netmasks[prefixlen], result); + assert_se(streq(ipv4_netmasks[prefixlen], result)); + } + + for (unsigned char prefixlen = 0; prefixlen <= 128; prefixlen++) { /* ipv6_netmasks is sparse: lengths without a designated initializer are NULL and are only printed, not compared */ + _cleanup_free_ char *result = NULL; + + assert_se(in_addr_prefixlen_to_netmask(AF_INET6, &addr, prefixlen) >= 0); + assert_se(in_addr_to_string(AF_INET6, &addr, &result) >= 0); + printf("test_in_addr_prefixlen_to_netmask: %s\n", result); + if (ipv6_netmasks[prefixlen]) + assert_se(streq(ipv6_netmasks[prefixlen], result)); + } +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-install-file.c b/src/test/test-install-file.c new file mode 100644 index 0000000..8206eb0 --- /dev/null +++ b/src/test/test-install-file.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fileio.h" +#include "install-file.h" +#include "path-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "umask-util.h" + +TEST(install_file) { /* Exercises install_file() between two paths, "foo" and "bar", inside a temporary directory created by mkdtemp_malloc(). */ + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + _cleanup_free_ char *a = NULL, *b = NULL, *c = NULL; + struct stat stat1, stat2; + + assert_se(mkdtemp_malloc(NULL, &p) >= 0); + assert_se(a = path_join(p, "foo")); + assert_se(b = path_join(p, "bar")); + + WITH_UMASK(0077) + assert_se(write_string_file(a, "wups", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(lstat(a, &stat1) >= 0); /* stat1 records the source file's identity for the st_dev/st_ino comparisons further down */ + assert_se(S_ISREG(stat1.st_mode)); + + assert_se(install_file(AT_FDCWD, a, AT_FDCWD, b, 0) >= 0); +
assert_se(install_file(AT_FDCWD, b, AT_FDCWD, a, INSTALL_FSYNC) >= 0); + + assert_se(write_string_file(b, "ttss", WRITE_STRING_FILE_CREATE) >= 0); + assert_se(install_file(AT_FDCWD, a, AT_FDCWD, b, INSTALL_FSYNC_FULL) == -EEXIST); /* an existing target is refused unless INSTALL_REPLACE is given */ + assert_se(install_file(AT_FDCWD, a, AT_FDCWD, b, INSTALL_FSYNC_FULL|INSTALL_REPLACE) >= 0); + + assert_se(stat(b, &stat2) >= 0); + assert_se(stat1.st_dev == stat2.st_dev); /* same device and inode as the file originally written: the installed file is the same object, not a copy */ + assert_se(stat1.st_ino == stat2.st_ino); + assert_se((stat2.st_mode & 0222) != 0); /* writable */ + + assert_se(install_file(AT_FDCWD, b, AT_FDCWD, a, INSTALL_FSYNC_FULL|INSTALL_REPLACE|INSTALL_READ_ONLY) >= 0); + + assert_se(stat(a, &stat2) >= 0); + assert_se(stat1.st_dev == stat2.st_dev); + assert_se(stat1.st_ino == stat2.st_ino); + assert_se((stat2.st_mode & 0222) == 0); /* read-only */ + + assert_se(mkdir(b, 0755) >= 0); /* from here on b is a directory, populated with a subdirectory, a regular file and a FIFO */ + assert_se(c = path_join(b, "dir")); + assert_se(mkdir(c, 0755) >= 0); + free(c); + assert_se(c = path_join(b, "reg")); + assert_se(mknod(c, S_IFREG|0755, 0) >= 0); + free(c); + assert_se(c = path_join(b, "fifo")); + assert_se(mknod(c, S_IFIFO|0755, 0) >= 0); + + assert_se(install_file(AT_FDCWD, b, AT_FDCWD, a, INSTALL_FSYNC_FULL) == -EEXIST); + assert_se(install_file(AT_FDCWD, b, AT_FDCWD, a, INSTALL_FSYNC_FULL|INSTALL_REPLACE) == 0); + + assert_se(write_string_file(b, "ttss", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(install_file(AT_FDCWD, b, AT_FDCWD, a, INSTALL_FSYNC_FULL) == -EEXIST); + assert_se(install_file(AT_FDCWD, b, AT_FDCWD, a, INSTALL_FSYNC_FULL|INSTALL_REPLACE) == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-install-root.c b/src/test/test-install-root.c new file mode 100644 index 0000000..efd75b2 --- /dev/null +++ b/src/test/test-install-root.c @@ -0,0 +1,1299 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "alloc-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "install.h" +#include "mkdir.h" +#include "rm-rf.h" +#include "special.h" +#include "string-util.h"
+#include "tests.h" +#include "tmpfile-util.h" + +static char *root = NULL; + +STATIC_DESTRUCTOR_REGISTER(root, rm_rf_physical_and_freep); + +TEST(basic_mask_and_enable) { + const char *p; + UnitFileState state; + InstallChange *changes = NULL; + size_t n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "e.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "f.service", NULL) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/a.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/b.service"); + assert_se(symlink("a.service", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + p = strjoina(root, "/usr/lib/systemd/system/c.service"); + assert_se(symlink("/usr/lib/systemd/system/a.service", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + p = strjoina(root, "/usr/lib/systemd/system/d.service"); + assert_se(symlink("c.service", p) >= 0); + + /* This one is interesting, as d follows a 
relative, then an absolute symlink */ + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + assert_se(unit_file_mask(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/dev/null")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/a.service"); + assert_se(streq(changes[0].path, p)); + + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", &state) >= 0 && state == UNIT_FILE_MASKED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", &state) >= 0 && state == UNIT_FILE_MASKED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", &state) >= 0 && state == UNIT_FILE_MASKED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", &state) >= 0 && state == UNIT_FILE_MASKED); + + /* Enabling a masked unit should fail! 
*/ + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) == -ERFKILL); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_unmask(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/a.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/a.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/a.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + /* Enabling it again should succeed but be a NOP */ + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 0); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = 
strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/a.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + /* Disabling a disabled unit must succeed but be a NOP */ + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("a.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 0); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + /* Let's enable this indirectly via a symlink */ + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("d.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/a.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/a.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + /* Let's try to reenable */ + + 
assert_se(unit_file_reenable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("b.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/a.service"); + assert_se(streq(changes[0].path, p)); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[1].source, "/usr/lib/systemd/system/a.service")); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "a.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "b.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "c.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "d.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + /* Test masking with relative symlinks */ + + p = strjoina(root, "/usr/lib/systemd/system/e.service"); + assert_se(symlink("../../../../../../dev/null", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "e.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "e.service", &state) >= 0 && state == UNIT_FILE_MASKED); + + assert_se(unlink(p) == 0); + assert_se(symlink("/usr/../dev/null", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "e.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "e.service", &state) >= 0 && state == UNIT_FILE_MASKED); + + assert_se(unlink(p) == 0); + + /* Test enabling with unknown dependency target */ + + p = strjoina(root, "/usr/lib/systemd/system/f.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=x.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + 
assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "f.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "f.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("f.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/f.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/x.target.wants/f.service"); + assert_se(streq(changes[0].path, p)); + assert_se(changes[1].type == INSTALL_CHANGE_DESTINATION_NOT_PRESENT); + p = strjoina(root, "/usr/lib/systemd/system/f.service"); + assert_se(streq(changes[1].source, p)); + assert_se(streq(changes[1].path, "x.target")); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "f.service", &state) >= 0 && state == UNIT_FILE_ENABLED); +} + +TEST(linked_units) { + const char *p, *q; + UnitFileState state; + InstallChange *changes = NULL; + size_t n_changes = 0, i; + + /* + * We'll test three cases here: + * + * a) a unit file in /opt, that we use "systemctl link" and + * "systemctl enable" on to make it available to the system + * + * b) a unit file in /opt, that is statically linked into + * /usr/lib/systemd/system, that "enable" should work on + * correctly. + * + * c) a unit file in /opt, that is linked into + * /etc/systemd/system, and where "enable" should result in + * -ELOOP, since using information from /etc to generate + * information in /etc should not be allowed. 
+ */ + + p = strjoina(root, "/opt/linked.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/opt/linked2.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/opt/linked3.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked2.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked3.service", NULL) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/linked2.service"); + assert_se(symlink("/opt/linked2.service", p) >= 0); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/linked3.service"); + assert_se(symlink("/opt/linked3.service", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked2.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked3.service", &state) >= 0 && state == UNIT_FILE_LINKED); + + /* First, let's link the unit into the search path */ + assert_se(unit_file_link(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("/opt/linked.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/opt/linked.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/linked.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked.service", &state) >= 0 && state == 
UNIT_FILE_LINKED); + + /* Let's unlink it from the search path again */ + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("linked.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/linked.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked.service", NULL) == -ENOENT); + + /* Now, let's not just link it, but also enable it */ + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("/opt/linked.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 2); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/linked.service"); + q = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/linked.service"); + for (i = 0 ; i < n_changes; i++) { + assert_se(changes[i].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[i].source, "/opt/linked.service")); + + if (p && streq(changes[i].path, p)) + p = NULL; + else if (q && streq(changes[i].path, q)) + q = NULL; + else + assert_not_reached(); + } + assert_se(!p && !q); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + + /* And let's unlink it again */ + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("linked.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 2); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/linked.service"); + q = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/linked.service"); + for (i = 0; i < n_changes; i++) { + assert_se(changes[i].type == INSTALL_CHANGE_UNLINK); + + if (p && streq(changes[i].path, p)) + p = NULL; + else if (q && streq(changes[i].path, q)) + q = NULL; + else + assert_not_reached(); 
+ } + assert_se(!p && !q); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "linked.service", NULL) == -ENOENT); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("linked2.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 2); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/linked2.service"); + q = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/linked2.service"); + for (i = 0 ; i < n_changes; i++) { + assert_se(changes[i].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[i].source, "/opt/linked2.service")); + + if (p && streq(changes[i].path, p)) + p = NULL; + else if (q && streq(changes[i].path, q)) + q = NULL; + else + assert_not_reached(); + } + assert_se(!p && !q); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("linked3.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(startswith(changes[0].path, root)); + assert_se(endswith(changes[0].path, "linked3.service")); + assert_se(streq(changes[0].source, "/opt/linked3.service")); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; +} + +TEST(default) { + _cleanup_free_ char *def = NULL; + InstallChange *changes = NULL; + size_t n_changes = 0; + const char *p; + + p = strjoina(root, "/usr/lib/systemd/system/test-default-real.target"); + assert_se(write_string_file(p, "# pretty much empty", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/test-default.target"); + assert_se(symlink("test-default-real.target", p) >= 0); + + assert_se(unit_file_get_default(RUNTIME_SCOPE_SYSTEM, root, &def) == -ENOENT); + + assert_se(unit_file_set_default(RUNTIME_SCOPE_SYSTEM, 0, root, "idontexist.target", &changes, &n_changes) == -ENOENT); + 
assert_se(n_changes == 1); + assert_se(changes[0].type == -ENOENT); + assert_se(streq_ptr(changes[0].path, "idontexist.target")); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_default(RUNTIME_SCOPE_SYSTEM, root, &def) == -ENOENT); + + assert_se(unit_file_set_default(RUNTIME_SCOPE_SYSTEM, 0, root, "test-default.target", &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/test-default-real.target")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR "/" SPECIAL_DEFAULT_TARGET); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_default(RUNTIME_SCOPE_SYSTEM, root, &def) >= 0); + assert_se(streq_ptr(def, "test-default-real.target")); +} + +TEST(add_dependency) { + InstallChange *changes = NULL; + size_t n_changes = 0; + const char *p; + + p = strjoina(root, "/usr/lib/systemd/system/real-add-dependency-test-target.target"); + assert_se(write_string_file(p, "# pretty much empty", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/add-dependency-test-target.target"); + assert_se(symlink("real-add-dependency-test-target.target", p) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/real-add-dependency-test-service.service"); + assert_se(write_string_file(p, "# pretty much empty", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/add-dependency-test-service.service"); + assert_se(symlink("real-add-dependency-test-service.service", p) >= 0); + + assert_se(unit_file_add_dependency(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("add-dependency-test-service.service"), "add-dependency-test-target.target", UNIT_WANTS, &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + 
assert_se(streq(changes[0].source, "/usr/lib/systemd/system/real-add-dependency-test-service.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/real-add-dependency-test-target.target.wants/real-add-dependency-test-service.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; +} + +TEST(template_enable) { + InstallChange *changes = NULL; + size_t n_changes = 0; + UnitFileState state; + const char *p; + + log_info("== %s ==", __func__); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/template@.service"); + assert_se(write_string_file(p, + "[Install]\n" + "DefaultInstance=def\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/template-symlink@.service"); + assert_se(symlink("template@.service", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, 
root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + log_info("== %s with template@.service enabled ==", __func__); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("template@.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/template@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/template@def.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@def.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("template@.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) >= 0 
&& state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + log_info("== %s with template@foo.service enabled ==", __func__); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("template@foo.service"), &changes, &n_changes) >= 0); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/template@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/template@foo.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) >= 0 && state == UNIT_FILE_INDIRECT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, 
STRV_MAKE("template@foo.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@quux.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@quux.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + log_info("== %s with template-symlink@quux.service enabled ==", __func__); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("template-symlink@quux.service"), &changes, &n_changes) >= 0); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/template@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/template@quux.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@.service", &state) >= 0 && 
state == UNIT_FILE_INDIRECT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template@quux.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "template-symlink@quux.service", &state) >= 0 && state == UNIT_FILE_ENABLED); +} + +TEST(indirect) { /* indirecta.service only carries Also=indirectb.service, indirectb.service is WantedBy=multi-user.target, and indirectc.service is a symlink to indirecta.service. Enabling or disabling through the indirectc alias is expected to create or remove only the indirectb.service link in multi-user.target.wants/. */ + InstallChange *changes = NULL; + size_t n_changes = 0; + UnitFileState state; + const char *p; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirecta.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirectb.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirectc.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/indirecta.service"); + assert_se(write_string_file(p, + "[Install]\n" + "Also=indirectb.service\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/indirectb.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/indirectc.service"); + assert_se(symlink("indirecta.service", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirecta.service", &state) >= 0 && state == 
UNIT_FILE_INDIRECT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirectb.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirectc.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("indirectc.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/indirectb.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/indirectb.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirecta.service", &state) >= 0 && state == UNIT_FILE_INDIRECT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirectb.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "indirectc.service", &state) >= 0 && state == UNIT_FILE_ALIAS); + + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("indirectc.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/indirectb.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; +} + +TEST(preset_and_list) { /* Writes three units plus a preset file ("enable *-yes.*", "ignore *-ignore.*", "disable *") and checks unit_file_preset(), unit_file_preset_all() and unit_file_get_list() against it. */ + InstallChange *changes = NULL; + size_t n_changes = 0, i; + const char *p, *q; + UnitFileState state; + bool got_yes = false, got_no = false; + UnitFileList *fl; + _cleanup_hashmap_free_ Hashmap *h = NULL; + + CLEANUP_ARRAY(changes, n_changes, install_changes_free); /* presumably releases whatever is still in changes at scope exit (confirm against the macro definition); the test also frees and resets the array by hand between steps */ + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-yes.service", &state) == -ENOENT); + 
assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-no.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/preset-yes.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/preset-no.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/preset-ignore.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system-preset/test.preset"); + assert_se(write_string_file(p, + "enable *-yes.*\n" + "ignore *-ignore.*\n" + "disable *\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-yes.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-no.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("preset-yes.service"), UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/preset-yes.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/preset-yes.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-yes.service", &state) >= 0 && state == 
UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-no.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("preset-yes.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/preset-yes.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-yes.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-no.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("preset-no.service"), UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(n_changes == 0); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-yes.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-no.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_preset_all(RUNTIME_SCOPE_SYSTEM, 0, root, UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + + assert_se(n_changes > 0); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/preset-yes.service"); + + for (i = 0; i < n_changes; i++) 
{ /* any symlink reported by preset-all must be the preset-yes.service one; every other change must be an unlink */ + + if (changes[i].type == INSTALL_CHANGE_SYMLINK) { + assert_se(streq(changes[i].source, "/usr/lib/systemd/system/preset-yes.service")); + assert_se(streq(changes[i].path, p)); + } else + assert_se(changes[i].type == INSTALL_CHANGE_UNLINK); + } + + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-yes.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-no.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(h = hashmap_new(&unit_file_list_hash_ops_free)); + assert_se(unit_file_get_list(RUNTIME_SCOPE_SYSTEM, root, h, NULL, NULL) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/preset-yes.service"); + q = strjoina(root, "/usr/lib/systemd/system/preset-no.service"); + + HASHMAP_FOREACH(fl, h) { /* each listed entry must report the same state as a direct unit_file_get_state() lookup by basename */ + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, basename(fl->path), &state) >= 0); + assert_se(fl->state == state); + + if (streq(fl->path, p)) { + got_yes = true; + assert_se(fl->state == UNIT_FILE_ENABLED); + } else if (streq(fl->path, q)) { + got_no = true; + assert_se(fl->state == UNIT_FILE_DISABLED); + } else + assert_se(IN_SET(fl->state, UNIT_FILE_DISABLED, UNIT_FILE_STATIC, UNIT_FILE_INDIRECT, UNIT_FILE_ALIAS)); + } + + assert_se(got_yes && got_no); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("preset-ignore.service"), &changes, &n_changes) >= 0); + assert_se(unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("preset-ignore.service"), UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "preset-ignore.service", &state) >= 0 && state == UNIT_FILE_ENABLED); /* the unit matched by the "ignore" preset line is expected to stay enabled after the preset run */ +} + +TEST(revert) { /* Checks unit_file_revert() on xx.service in three situations: nothing to revert (0 changes), an override file in SYSTEM_CONFIG_UNIT_DIR (1 unlink), and a drop-in under xx.service.d/ (2 unlinks: the drop-in file first, then its directory). */ + const char *p; + UnitFileState state; + InstallChange *changes = NULL; + 
size_t n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "xx.service", NULL) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "yy.service", NULL) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/xx.service"); + assert_se(write_string_file(p, "# Empty\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "xx.service", NULL) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "xx.service", &state) >= 0 && state == UNIT_FILE_STATIC); + + /* Initially there's nothing to revert */ + assert_se(unit_file_revert(RUNTIME_SCOPE_SYSTEM, root, STRV_MAKE("xx.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 0); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/xx.service"); + assert_se(write_string_file(p, "# Empty override\n", WRITE_STRING_FILE_CREATE) >= 0); + + /* Revert the override file */ + assert_se(unit_file_revert(RUNTIME_SCOPE_SYSTEM, root, STRV_MAKE("xx.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/xx.service.d/dropin.conf"); + assert_se(write_string_file(p, "# Empty dropin\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + /* Revert the dropin file */ + assert_se(unit_file_revert(RUNTIME_SCOPE_SYSTEM, root, STRV_MAKE("xx.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + assert_se(streq(changes[0].path, p)); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/xx.service.d"); + assert_se(changes[1].type == INSTALL_CHANGE_UNLINK); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; 
n_changes = 0; +} + +TEST(preset_order) { /* The preset file lists "enable prefix-1.service", "disable prefix-*.service" and "enable prefix-2.service" in that order. The assertions expect prefix-1 to be enabled by a preset run while prefix-2 stays disabled with no changes reported, which is consistent with the first matching line winning. */ + InstallChange *changes = NULL; + size_t n_changes = 0; + const char *p; + UnitFileState state; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-1.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-2.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/prefix-1.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/prefix-2.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system-preset/test.preset"); + assert_se(write_string_file(p, + "enable prefix-1.service\n" + "disable prefix-*.service\n" + "enable prefix-2.service\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-1.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-2.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("prefix-1.service"), UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/prefix-1.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/prefix-1.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-1.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-2.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + 
assert_se(unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("prefix-2.service"), UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(n_changes == 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-1.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "prefix-2.service", &state) >= 0 && state == UNIT_FILE_DISABLED); +} + +TEST(static_instance) { /* With only the template file present, both the template and the foo instance report UNIT_FILE_DISABLED. Once static-instance@foo.service exists in the unit directory as a symlink to the template, that instance is expected to report UNIT_FILE_STATIC while the template stays UNIT_FILE_DISABLED. */ + UnitFileState state; + const char *p; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "static-instance@.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "static-instance@foo.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/static-instance@.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "static-instance@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "static-instance@foo.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/static-instance@foo.service"); + assert_se(symlink("static-instance@.service", p) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "static-instance@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "static-instance@foo.service", &state) >= 0 && state == UNIT_FILE_STATIC); +} + +TEST(with_dropin) { /* Checks that [Install] settings placed in dropin.conf files are honoured by unit_file_enable(): WantedBy= from a drop-in yields a second .wants/ symlink, and Also= from a drop-in pulls in a second unit. Unit file and drop-in are placed in differing combinations of /usr/lib/systemd/system and SYSTEM_CONFIG_UNIT_DIR. */ + const char *p; + UnitFileState state; + InstallChange *changes = NULL; + size_t n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-1.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, 
"with-dropin-3.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-4a.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-4b.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-1.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-1.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=graphical.target\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-1.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/with-dropin-2.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-2.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=graphical.target\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-3.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/with-dropin-3.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=graphical.target\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-4a.service"); + 
assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/with-dropin-4a.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "Also=with-dropin-4b.service\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-4a.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-4b.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-4b.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-1.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-1.service")); + assert_se(streq(changes[1].source, "/usr/lib/systemd/system/with-dropin-1.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-1.service"); + assert_se(streq(changes[0].path, p)); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/graphical.target.wants/with-dropin-1.service"); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-2.service"), &changes, &n_changes) == 1); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + 
assert_se(streq(changes[0].source, SYSTEM_CONFIG_UNIT_DIR"/with-dropin-2.service")); + assert_se(streq(changes[1].source, SYSTEM_CONFIG_UNIT_DIR"/with-dropin-2.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-2.service"); + assert_se(streq(changes[0].path, p)); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/graphical.target.wants/with-dropin-2.service"); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-3.service"), &changes, &n_changes) == 1); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-3.service")); + assert_se(streq(changes[1].source, "/usr/lib/systemd/system/with-dropin-3.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-3.service"); + assert_se(streq(changes[0].path, p)); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/graphical.target.wants/with-dropin-3.service"); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-4a.service"), &changes, &n_changes) == 2); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3.service", &state) >= 0 && state == UNIT_FILE_ENABLED); /* NOTE(review): this re-checks with-dropin-3 right after enabling with-dropin-4a, which looks like a copy-paste leftover; 4a and 4b are only verified by the state checks at the end of the test */ + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-4a.service")); + assert_se(streq(changes[1].source, 
"/usr/lib/systemd/system/with-dropin-4b.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-4a.service"); + assert_se(streq(changes[0].path, p)); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-4b.service"); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-1.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-4a.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-4b.service", &state) >= 0 && state == UNIT_FILE_ENABLED); +} + +TEST(with_dropin_template) { /* Template variant of the drop-in test. A drop-in for the template itself (with-dropin-1@.service.d) applies to an enabled instance; a drop-in for one instance (with-dropin-2@instance-1.service.d) applies to that instance only; and DefaultInstance= set in a drop-in (instance-2) is expected to replace the value from the unit file (instance-1). */ + const char *p; + UnitFileState state; + InstallChange *changes = NULL; + size_t n_changes = 0; + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-1@.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2@.service", &state) == -ENOENT); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3@.service", &state) == -ENOENT); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-1@.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-1@.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=graphical.target\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, 
"with-dropin-1@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-2@.service"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-2@instance-1.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "WantedBy=graphical.target\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-3@.service"); + assert_se(write_string_file(p, + "[Install]\n" + "DefaultInstance=instance-1\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/with-dropin-3@.service.d/dropin.conf"); + assert_se(write_string_file(p, + "[Install]\n" + "DefaultInstance=instance-2\n", WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MKDIR_0755) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-1@instance-1.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-1@.service")); + assert_se(streq(changes[1].source, "/usr/lib/systemd/system/with-dropin-1@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-1@instance-1.service"); + assert_se(streq(changes[0].path, p)); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/graphical.target.wants/with-dropin-1@instance-1.service"); + assert_se(streq(changes[1].path, p)); + 
install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-2@instance-1.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 2); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(changes[1].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-2@.service")); + assert_se(streq(changes[1].source, "/usr/lib/systemd/system/with-dropin-2@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-2@instance-1.service"); + assert_se(streq(changes[0].path, p)); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/graphical.target.wants/with-dropin-2@instance-1.service"); + assert_se(streq(changes[1].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-2@instance-2.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-2@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-2@instance-2.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + assert_se(unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("with-dropin-3@.service"), &changes, &n_changes) == 1); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + assert_se(streq(changes[0].source, "/usr/lib/systemd/system/with-dropin-3@.service")); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/with-dropin-3@instance-2.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + 
assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-1@instance-1.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2@instance-1.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-2@instance-2.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3@instance-1.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "with-dropin-3@instance-2.service", &state) >= 0 && state == UNIT_FILE_ENABLED); +} + +/* Presets that carry an explicit instance list: with "enable foo@.service bar0 bar1 bartest" in the preset file, exactly the listed instances of the template must get enabled, both when one instance is preset on its own and when everything is preset at once; the DefaultInstance= ("def") must stay disabled. */ TEST(preset_multiple_instances) { + InstallChange *changes = NULL; + size_t n_changes = 0; + const char *p; + UnitFileState state; + + /* Set up template service files and preset file */ + p = strjoina(root, "/usr/lib/systemd/system/foo@.service"); + assert_se(write_string_file(p, + "[Install]\n" + "DefaultInstance=def\n" + "WantedBy=multi-user.target\n", WRITE_STRING_FILE_CREATE) >= 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + p = strjoina(root, "/usr/lib/systemd/system-preset/test.preset"); + assert_se(write_string_file(p, + "enable foo@.service bar0 bar1 bartest\n" + "enable emptylist@.service\n" /* This line ensures the old functionality for templated units (no instance list) still works */ + "disable *\n" , WRITE_STRING_FILE_CREATE) >= 0); + + /* Writing the preset file alone must not change anything: the listed instance is still disabled */ assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bar0.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + /* Preset a single instantiated unit specified in the list */ + assert_se(unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("foo@bar0.service"), UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bar0.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + 
assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_SYMLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/foo@bar0.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + /* Disable the instance again so that the preset-all checks below start from "everything disabled" */ assert_se(unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, root, STRV_MAKE("foo@bar0.service"), &changes, &n_changes) >= 0); + assert_se(n_changes == 1); + assert_se(changes[0].type == INSTALL_CHANGE_UNLINK); + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/multi-user.target.wants/foo@bar0.service"); + assert_se(streq(changes[0].path, p)); + install_changes_free(changes, n_changes); + changes = NULL; n_changes = 0; + + /* Check for preset-all case, only instances on the list should be enabled, not including the default instance */ + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bar1.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bartest.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + + assert_se(unit_file_preset_all(RUNTIME_SCOPE_SYSTEM, 0, root, UNIT_FILE_PRESET_FULL, &changes, &n_changes) >= 0); + assert_se(n_changes > 0); + + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@def.service", &state) >= 0 && state == UNIT_FILE_DISABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bar0.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bar1.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + assert_se(unit_file_get_state(RUNTIME_SCOPE_SYSTEM, root, "foo@bartest.service", &state) >= 0 && state == UNIT_FILE_ENABLED); + + install_changes_free(changes, n_changes); +} + +/* Runs unit_file_verify_alias() for unit i and the given alias, logs the outcome, and asserts both the return code (expected) and the alias string handed back through the output parameter (updated_name; NULL when nothing is expected back). */ static void verify_one( + const InstallInfo *i, + const char *alias, + int expected, + const 
char *updated_name) { + int r; + static const InstallInfo *last_info = NULL; /* unit of the previous call; static so the "-- name --" banner below is logged only when the unit changes */ + _cleanup_free_ char *alias2 = NULL; + + if (i != last_info) + log_info("-- %s --", (last_info = i)->name); + + r = unit_file_verify_alias(i, alias, &alias2, NULL, NULL); + log_info_errno(r, "alias %s ← %s: %d/%m (expected %d)%s%s%s", + i->name, alias, r, expected, + alias2 ? " [" : "", strempty(alias2), + alias2 ? "]" : ""); + assert_se(r == expected); + + /* This is a test for "instance propagation". This propagation matters mostly for WantedBy= and + * RequiredBy= settings, and less so for Alias=. The only case where it should happen is when we have + * an Alias=alias@.service and an instantiated template template@instance. In that case the instance name + * should be propagated into the alias as alias@instance. */ + assert_se(streq_ptr(alias2, updated_name)); +} + +/* Table-style test: every verify_one() call below names a unit, a candidate alias, the expected return code (0 or -EXDEV in all cases here) and the alias expected back after instance propagation (NULL for none). The five units cover a plain service, a bare template, a template with DefaultInstance=, an instantiated template, and an instantiated template with DefaultInstance=. */ TEST(verify_alias) { + const InstallInfo + plain_service = { .name = (char*) "plain.service" }, + bare_template = { .name = (char*) "template1@.service" }, + di_template = { .name = (char*) "template2@.service", + .default_instance = (char*) "di" }, + inst_template = { .name = (char*) "template3@inst.service" }, + di_inst_template = { .name = (char*) "template4@inst.service", + .default_instance = (char*) "di" }; + + verify_one(&plain_service, "alias.service", 0, NULL); + verify_one(&plain_service, "alias.socket", -EXDEV, NULL); + verify_one(&plain_service, "alias@.service", -EXDEV, NULL); + verify_one(&plain_service, "alias@inst.service", -EXDEV, NULL); + + /* Setting WantedBy= and RequiredBy= through Alias= is supported for the sake of backwards + * compatibility. 
*/ + verify_one(&plain_service, "foo.target.wants/plain.service", 0, NULL); + verify_one(&plain_service, "foo.target.wants/plain.socket", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.wants/plain@.service", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.wants/service", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.requires/plain.service", 0, NULL); + verify_one(&plain_service, "foo.target.requires/plain.socket", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.requires/plain@.service", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.requires/service", -EXDEV, NULL); + verify_one(&plain_service, "asdf.requires/plain.service", -EXDEV, NULL); /* invalid unit name component */ + /* The newly-added UpheldBy= (.upholds/) and other suffixes should be rejected */ + verify_one(&plain_service, "foo.target.upholds/plain.service", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.upholds/plain.socket", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.upholds/plain@.service", -EXDEV, NULL); + verify_one(&plain_service, "foo.target.upholds/service", -EXDEV, NULL); + verify_one(&plain_service, "foo.service/plain.service", -EXDEV, NULL); /* missing dir suffix */ + verify_one(&plain_service, "foo.target.conf/plain.service", -EXDEV, NULL); + + verify_one(&bare_template, "alias.service", -EXDEV, NULL); + verify_one(&bare_template, "alias.socket", -EXDEV, NULL); + verify_one(&bare_template, "alias@.socket", -EXDEV, NULL); + verify_one(&bare_template, "alias@inst.socket", -EXDEV, NULL); + /* A general alias alias@.service → template1@.service. */ + verify_one(&bare_template, "alias@.service", 0, NULL); + /* Only a specific instance is aliased, see the discussion in https://github.com/systemd/systemd/pull/13119. 
*/ + verify_one(&bare_template, "alias@inst.service", 0, NULL); + verify_one(&bare_template, "foo.target.wants/plain.service", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.wants/plain.socket", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.wants/plain@.service", -EXDEV, NULL); + /* Name mismatch: we cannot allow this, because plain@foo.service would be pulled in by foo.target, + * but would not be resolveable on its own, since systemd doesn't know how to load the fragment. */ + verify_one(&bare_template, "foo.target.wants/plain@foo.service", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.wants/template1@foo.service", 0, NULL); + verify_one(&bare_template, "foo.target.wants/service", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.requires/plain.service", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.requires/plain.socket", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.requires/plain@.service", -EXDEV, NULL); /* instance missing */ + verify_one(&bare_template, "foo.target.requires/template1@inst.service", 0, NULL); + verify_one(&bare_template, "foo.target.requires/service", -EXDEV, NULL); + verify_one(&bare_template, "foo.target.conf/plain.service", -EXDEV, NULL); + verify_one(&bare_template, "FOO@.target.requires/plain@.service", -EXDEV, NULL); /* template name mismatch */ + verify_one(&bare_template, "FOO@inst.target.requires/plain@.service", -EXDEV, NULL); + verify_one(&bare_template, "FOO@inst.target.requires/plain@inst.service", -EXDEV, NULL); + verify_one(&bare_template, "FOO@.target.requires/template1@.service", 0, NULL); /* instance propagated */ + verify_one(&bare_template, "FOO@inst.target.requires/template1@.service", -EXDEV, NULL); /* instance missing */ + verify_one(&bare_template, "FOO@inst.target.requires/template1@inst.service", 0, NULL); /* instance provided */ + + verify_one(&di_template, "alias.service", -EXDEV, NULL); + verify_one(&di_template, "alias.socket", -EXDEV, NULL); + 
verify_one(&di_template, "alias@.socket", -EXDEV, NULL); + verify_one(&di_template, "alias@inst.socket", -EXDEV, NULL); + verify_one(&di_template, "alias@inst.service", 0, NULL); + verify_one(&di_template, "alias@.service", 0, NULL); + verify_one(&di_template, "alias@di.service", 0, NULL); + verify_one(&di_template, "foo.target.wants/plain.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.wants/plain.socket", -EXDEV, NULL); + verify_one(&di_template, "foo.target.wants/plain@.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.wants/plain@di.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.wants/template2@di.service", 0, NULL); + verify_one(&di_template, "foo.target.wants/service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.requires/plain.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.requires/plain.socket", -EXDEV, NULL); + verify_one(&di_template, "foo.target.requires/plain@.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.requires/plain@di.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.requires/plain@foo.service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.requires/template2@.service", -EXDEV, NULL); /* instance missing */ + verify_one(&di_template, "foo.target.requires/template2@di.service", 0, NULL); + verify_one(&di_template, "foo.target.requires/service", -EXDEV, NULL); + verify_one(&di_template, "foo.target.conf/plain.service", -EXDEV, NULL); + + verify_one(&inst_template, "alias.service", -EXDEV, NULL); + verify_one(&inst_template, "alias.socket", -EXDEV, NULL); + verify_one(&inst_template, "alias@.socket", -EXDEV, NULL); + verify_one(&inst_template, "alias@inst.socket", -EXDEV, NULL); + verify_one(&inst_template, "alias@inst.service", 0, NULL); + verify_one(&inst_template, "alias@.service", 0, "alias@inst.service"); + verify_one(&inst_template, "alias@di.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/plain.service", 
-EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/plain.socket", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/plain@.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/plain@di.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/plain@inst.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/template3@foo.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.wants/template3@inst.service", 0, NULL); + verify_one(&inst_template, "bar.target.wants/service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/plain.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/plain.socket", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/plain@.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/plain@di.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/plain@inst.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/template3@foo.service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.requires/template3@inst.service", 0, NULL); + verify_one(&inst_template, "bar.target.requires/service", -EXDEV, NULL); + verify_one(&inst_template, "bar.target.conf/plain.service", -EXDEV, NULL); + verify_one(&inst_template, "BAR@.target.requires/plain@.service", -EXDEV, NULL); /* template name mismatch */ + verify_one(&inst_template, "BAR@inst.target.requires/plain@.service", -EXDEV, NULL); + verify_one(&inst_template, "BAR@inst.target.requires/plain@inst.service", -EXDEV, NULL); + verify_one(&inst_template, "BAR@.target.requires/template3@.service", -EXDEV, NULL); /* instance missing */ + verify_one(&inst_template, "BAR@inst.target.requires/template3@.service", -EXDEV, NULL); /* instance missing */ + verify_one(&inst_template, "BAR@inst.target.requires/template3@inst.service", 0, NULL); /* instance provided */ + verify_one(&inst_template, 
"BAR@inst.target.requires/template3@ins2.service", -EXDEV, NULL); /* instance mismatch */ + + /* explicit alias overrides DefaultInstance */ + verify_one(&di_inst_template, "alias.service", -EXDEV, NULL); + verify_one(&di_inst_template, "alias.socket", -EXDEV, NULL); + verify_one(&di_inst_template, "alias@.socket", -EXDEV, NULL); + verify_one(&di_inst_template, "alias@inst.socket", -EXDEV, NULL); + verify_one(&di_inst_template, "alias@inst.service", 0, NULL); + verify_one(&di_inst_template, "alias@.service", 0, "alias@inst.service"); + verify_one(&di_inst_template, "alias@di.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/plain.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/plain.socket", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/plain@.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/plain@di.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/template4@foo.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/template4@inst.service", 0, NULL); + verify_one(&di_inst_template, "goo.target.wants/template4@di.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.wants/service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/plain.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/plain.socket", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/plain@.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/plain@di.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/plain@inst.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/template4@foo.service", -EXDEV, NULL); + verify_one(&di_inst_template, "goo.target.requires/template4@inst.service", 0, NULL); + verify_one(&di_inst_template, "goo.target.requires/service", -EXDEV, NULL); + verify_one(&di_inst_template, 
"goo.target.conf/plain.service", -EXDEV, NULL); +} + +/* Test-suite setup hook: creates a fresh temporary root under /tmp (its path is stored in the file-level variable root), creates the unit, runtime, /opt and preset directories inside it, and writes two placeholder target units that the tests refer to in WantedBy=. Every step is asserted; returns EXIT_SUCCESS. */ static int intro(void) { + const char *p; + + assert_se(mkdtemp_malloc("/tmp/rootXXXXXX", &root) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/"); + assert_se(mkdir_p(p, 0755) >= 0); + + p = strjoina(root, SYSTEM_CONFIG_UNIT_DIR"/"); + assert_se(mkdir_p(p, 0755) >= 0); + + p = strjoina(root, "/run/systemd/system/"); + assert_se(mkdir_p(p, 0755) >= 0); + + p = strjoina(root, "/opt/"); + assert_se(mkdir_p(p, 0755) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system-preset/"); + assert_se(mkdir_p(p, 0755) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/multi-user.target"); + assert_se(write_string_file(p, "# pretty much empty", WRITE_STRING_FILE_CREATE) >= 0); + + p = strjoina(root, "/usr/lib/systemd/system/graphical.target"); + assert_se(write_string_file(p, "# pretty much empty", WRITE_STRING_FILE_CREATE) >= 0); + + return EXIT_SUCCESS; +} + + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-install.c b/src/test/test-install.c new file mode 100644 index 0000000..b54252e --- /dev/null +++ b/src/test/test-install.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include /* NOTE(review): these two #include directives have no header name; the <...> operands appear to have been lost when this text was extracted — restore them before building */ + +#include "install.h" +#include "tests.h" + +/* Prints each recorded change as the equivalent shell command: "rm" for INSTALL_CHANGE_UNLINK, "ln -s" for INSTALL_CHANGE_SYMLINK; any other change type is silently skipped. c may be NULL only when n is 0. NOTE(review): main() passes a size_t count, which is narrowed to unsigned here. */ static void dump_changes(InstallChange *c, unsigned n) { + unsigned i; + + assert_se(n == 0 || c); + + for (i = 0; i < n; i++) { + if (c[i].type == INSTALL_CHANGE_UNLINK) + printf("rm '%s'\n", c[i].path); + else if (c[i].type == INSTALL_CHANGE_SYMLINK) + printf("ln -s '%s' '%s'\n", c[i].source, c[i].path); + } +} + +/* Walks one unit given by name (files) and one given by absolute path (files2) through enable, disable, mask, unmask, link, reenable and preset, asserting the unit file state after each step. All calls pass NULL as the root directory. NOTE(review): this looks like it operates on the live system configuration and needs the named units to exist — confirm it is only meant to be run manually. */ int main(int argc, char* argv[]) { + _cleanup_hashmap_free_ Hashmap *h = NULL; + UnitFileList *p; + int r; + const char *const files[] = { "avahi-daemon.service", NULL }; + const char *const files2[] = { "/home/lennart/test.service", NULL }; + InstallChange *changes = NULL; + size_t n_changes = 0; + UnitFileState state = 0; + + test_setup_logging(LOG_DEBUG); + + /* NOTE(review): the result of hashmap_new() is passed on to unit_file_get_list() without a NULL check */ h = hashmap_new(&unit_file_list_hash_ops_free); + r = 
unit_file_get_list(RUNTIME_SCOPE_SYSTEM, NULL, h, NULL, NULL); + assert_se(r == 0); + + HASHMAP_FOREACH(p, h) { + UnitFileState s = _UNIT_FILE_STATE_INVALID; + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(p->path), &s); + + assert_se((r < 0 && p->state == UNIT_FILE_BAD) || + (p->state == s)); + + fprintf(stderr, "%s (%s)\n", + p->path, + unit_file_state_to_string(p->state)); + } + + log_info("/*** enable **/"); + + r = unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + log_info("/*** enable2 **/"); + + r = unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_ENABLED); + + log_info("/*** disable ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_DISABLED); + + log_info("/*** mask ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_mask(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + log_info("/*** mask2 ***/"); + r = unit_file_mask(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_MASKED); + + log_info("/*** unmask ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_unmask(RUNTIME_SCOPE_SYSTEM, 0, NULL, 
(char**) files, &changes, &n_changes); + assert_se(r >= 0); + log_info("/*** unmask2 ***/"); + r = unit_file_unmask(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_DISABLED); + + log_info("/*** mask ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_mask(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_MASKED); + + log_info("/*** disable ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + log_info("/*** disable2 ***/"); + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_MASKED); + + log_info("/*** umask ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_unmask(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, files[0], &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_DISABLED); + + log_info("/*** enable files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_enable(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files2, &changes, &n_changes); + assert_se(r >= 0); + + 
dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_ENABLED); + + log_info("/*** disable files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, NULL, STRV_MAKE(basename(files2[0])), &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r < 0); + + log_info("/*** link files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_link(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files2, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_LINKED); + + log_info("/*** disable files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, NULL, STRV_MAKE(basename(files2[0])), &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r < 0); + + log_info("/*** link files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_link(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files2, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_LINKED); + + log_info("/*** reenable files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_reenable(RUNTIME_SCOPE_SYSTEM, 0, NULL, 
(char**) files2, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_ENABLED); + + log_info("/*** disable files2 ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_disable(RUNTIME_SCOPE_SYSTEM, 0, NULL, STRV_MAKE(basename(files2[0])), &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files2[0]), &state); + assert_se(r < 0); + log_info("/*** preset files ***/"); + changes = NULL; + n_changes = 0; + + r = unit_file_preset(RUNTIME_SCOPE_SYSTEM, 0, NULL, (char**) files, UNIT_FILE_PRESET_FULL, &changes, &n_changes); + assert_se(r >= 0); + + dump_changes(changes, n_changes); + install_changes_free(changes, n_changes); + + r = unit_file_get_state(RUNTIME_SCOPE_SYSTEM, NULL, basename(files[0]), &state); + assert_se(r >= 0); + assert_se(state == UNIT_FILE_ENABLED); + + return 0; +} diff --git a/src/test/test-io-util.c b/src/test/test-io-util.c new file mode 100644 index 0000000..fa3ef86 --- /dev/null +++ b/src/test/test-io-util.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include /* NOTE(review): these three #include directives have no header name; the <...> operands appear to have been lost when this text was extracted — restore them before building */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "macro.h" +#include "tests.h" + +/* Rewrites the file behind fd from scratch with sparse_write() and checks that reading it back yields exactly the n bytes of buffer. */ static void test_sparse_write_one(int fd, const char *buffer, size_t n) { + char check[n]; + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(ftruncate(fd, 0) >= 0); + assert_se(sparse_write(fd, buffer, n, 4) == (ssize_t) n); + + /* The file offset must have advanced by the full n bytes; the ftruncate() then sets the file length to n. NOTE(review): presumably needed because sparse_write() may skip over trailing zero bytes instead of writing them — confirm against its implementation. */ assert_se(lseek(fd, 0, SEEK_CUR) == (off_t) n); + assert_se(ftruncate(fd, n) >= 0); + + assert_se(lseek(fd, 0, SEEK_SET) == 0); + assert_se(read(fd, check, n) == (ssize_t) n); + + assert_se(memcmp(buffer, check, n) == 0); +} + +/* Feeds test_sparse_write_one() buffers with runs of zero bytes at the start, in the middle and at the end, all through one unlinked temporary file. */ TEST(sparse_write) { + 
const char test_a[] = "test"; + const char test_b[] = "\0\0\0\0test\0\0\0\0"; + const char test_c[] = "\0\0test\0\0\0\0"; + const char test_d[] = "\0\0test\0\0\0test\0\0\0\0test\0\0\0\0\0test\0\0\0test\0\0\0\0test\0\0\0\0\0\0\0\0"; + const char test_e[] = "test\0\0\0\0test"; + _cleanup_close_ int fd = -EBADF; + char fn[] = "/tmp/sparseXXXXXX"; + + fd = mkostemp(fn, O_CLOEXEC); + assert_se(fd >= 0); + (void) unlink(fn); + + test_sparse_write_one(fd, test_a, sizeof(test_a)); + test_sparse_write_one(fd, test_b, sizeof(test_b)); + test_sparse_write_one(fd, test_c, sizeof(test_c)); + test_sparse_write_one(fd, test_d, sizeof(test_d)); + test_sparse_write_one(fd, test_e, sizeof(test_e)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-ip-protocol-list.c b/src/test/test-ip-protocol-list.c new file mode 100644 index 0000000..dfff015 --- /dev/null +++ b/src/test/test-ip-protocol-list.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "macro.h" +#include "ip-protocol-list.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tests.h" + +static void test_int(int i) { + char str[DECIMAL_STR_MAX(int)]; + + assert_se(ip_protocol_from_name(ip_protocol_to_name(i)) == i); + + xsprintf(str, "%i", i); + assert_se(ip_protocol_from_name(ip_protocol_to_name(parse_ip_protocol(str))) == i); +} + +static void test_int_fail(int i, int error) { + char str[DECIMAL_STR_MAX(int)]; + + assert_se(!ip_protocol_to_name(i)); + + xsprintf(str, "%i", i); + assert_se(parse_ip_protocol(str) == error); +} + +static void test_str(const char *s) { + assert_se(streq(ip_protocol_to_name(ip_protocol_from_name(s)), s)); + assert_se(streq(ip_protocol_to_name(parse_ip_protocol(s)), s)); +} + +static void test_str_fail(const char *s, int error) { + assert_se(ip_protocol_from_name(s) == -EINVAL); + assert_se(parse_ip_protocol(s) == error); +} + +TEST(integer) { + test_int(IPPROTO_TCP); + test_int(IPPROTO_DCCP); + test_int_fail(-1, -ERANGE); + 
test_int_fail(1024 * 1024, -EPROTONOSUPPORT); +} + +TEST(string) { /* Known protocol names must round-trip; an unknown name and out-of-range numeric strings must fail with the listed errors. */ + test_str("sctp"); + test_str("udp"); + test_str_fail("hoge", -EINVAL); + test_str_fail("-1", -ERANGE); + test_str_fail("1000000000", -EPROTONOSUPPORT); +} + +TEST(parse_ip_protocol) { /* Direct checks of parse_ip_protocol() on names, numbers and invalid input. */ + assert_se(parse_ip_protocol("sctp") == IPPROTO_SCTP); + assert_se(parse_ip_protocol("ScTp") == IPPROTO_SCTP); /* mixed-case spelling is accepted */ + assert_se(parse_ip_protocol("ip") == IPPROTO_IP); + assert_se(parse_ip_protocol("") == IPPROTO_IP); /* the empty string yields IPPROTO_IP */ + assert_se(parse_ip_protocol("1") == 1); + assert_se(parse_ip_protocol("0") == 0); + assert_se(parse_ip_protocol("-10") == -ERANGE); + assert_se(parse_ip_protocol("100000000") == -EPROTONOSUPPORT); +} + +TEST(parse_ip_protocol_full) { /* Same kind of checks for parse_ip_protocol_full() with its second argument set to true. */ + assert_se(parse_ip_protocol_full("-1", true) == -ERANGE); + assert_se(parse_ip_protocol_full("0", true) == 0); + assert_se(parse_ip_protocol_full("11", true) == 11); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-ipcrm.c b/src/test/test-ipcrm.c new file mode 100644 index 0000000..238f0bf --- /dev/null +++ b/src/test/test-ipcrm.c @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "clean-ipc.h" +#include "errno-util.h" +#include "main-func.h" +#include "tests.h" +#include "user-util.h" + +static int run(int argc, char *argv[]) { /* Resolves a user name to a UID and calls clean_ipc_by_uid() on it; returns that call's result unless the test is skipped. */ + uid_t uid; + int r; + const char* name = argv[1] ?: NOBODY_USER_NAME; /* user name from the first argument, NOBODY_USER_NAME when none is given */ + + test_setup_logging(LOG_INFO); + + r = get_user_creds(&name, &uid, NULL, NULL, NULL, 0); /* only the UID is requested */ + if (r == -ESRCH) /* an unresolvable user is reported as a skipped test, not as a failure */ + return log_tests_skipped("Failed to resolve user"); + if (r < 0) + return log_error_errno(r, "Failed to resolve \"%s\": %m", name); + + r = clean_ipc_by_uid(uid); + if (ERRNO_IS_PRIVILEGE(r)) /* a privilege error is reported as a skip as well */ + return log_tests_skipped("No privileges"); + + return r; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/test/test-job-type.c b/src/test/test-job-type.c new file mode 100644 index 0000000..519fff7 --- /dev/null +++ b/src/test/test-job-type.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + 
+#include + +#include "service.h" +#include "tests.h" +#include "unit.h" + +int main(int argc, char *argv[]) { + const ServiceState test_states[] = { SERVICE_DEAD, SERVICE_RUNNING }; + + test_setup_logging(LOG_DEBUG); + + for (size_t i = 0; i < ELEMENTSOF(test_states); i++) { + /* fake a unit */ + Service s = { + .meta.load_state = UNIT_LOADED, + .type = SERVICE_SIMPLE, + .state = test_states[i], + }; + Unit *u = UNIT(&s); + + printf("\nWith collapsing for service state %s\n" + "=========================================\n", service_state_to_string(s.state)); + for (JobType a = 0; a < _JOB_TYPE_MAX_MERGING; a++) { + for (JobType b = 0; b < _JOB_TYPE_MAX_MERGING; b++) { + + JobType ab = a; + bool merged_ab = job_type_merge_and_collapse(&ab, b, u) >= 0; + + if (!job_type_is_mergeable(a, b)) { + assert_se(!merged_ab); + printf("Not mergeable: %s + %s\n", job_type_to_string(a), job_type_to_string(b)); + continue; + } + + assert_se(merged_ab); + printf("%s + %s = %s\n", job_type_to_string(a), job_type_to_string(b), job_type_to_string(ab)); + + for (JobType c = 0; c < _JOB_TYPE_MAX_MERGING; c++) { + + /* Verify transitivity of mergeability of job types */ + assert_se(!job_type_is_mergeable(a, b) || + !job_type_is_mergeable(b, c) || + job_type_is_mergeable(a, c)); + + /* Verify that merged entries can be merged with the same entries + * they can be merged with separately */ + assert_se(!job_type_is_mergeable(a, c) || job_type_is_mergeable(ab, c)); + assert_se(!job_type_is_mergeable(b, c) || job_type_is_mergeable(ab, c)); + + /* Verify that if a merged with b is not mergeable with c, then + * either a or b is not mergeable with c either. 
*/ + assert_se(job_type_is_mergeable(ab, c) || !job_type_is_mergeable(a, c) || !job_type_is_mergeable(b, c)); + + JobType bc = b; + if (job_type_merge_and_collapse(&bc, c, u) >= 0) { + + /* Verify associativity */ + + JobType ab_c = ab; + assert_se(job_type_merge_and_collapse(&ab_c, c, u) == 0); + + JobType bc_a = bc; + assert_se(job_type_merge_and_collapse(&bc_a, a, u) == 0); + + JobType a_bc = a; + assert_se(job_type_merge_and_collapse(&a_bc, bc, u) == 0); + + assert_se(ab_c == bc_a); + assert_se(ab_c == a_bc); + + printf("%s + %s + %s = %s\n", job_type_to_string(a), job_type_to_string(b), job_type_to_string(c), job_type_to_string(ab_c)); + } + } + } + } + } + + return 0; +} diff --git a/src/test/test-journal-importer.c b/src/test/test-journal-importer.c new file mode 100644 index 0000000..ec9e49e --- /dev/null +++ b/src/test/test-journal-importer.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "log.h" +#include "journal-importer.h" +#include "path-util.h" +#include "string-util.h" +#include "tests.h" + +static void assert_iovec_entry(const struct iovec *iovec, const char* content) { + assert_se(strlen(content) == iovec->iov_len); + assert_se(memcmp(content, iovec->iov_base, iovec->iov_len) == 0); +} + +#define COREDUMP_PROC_GROUP \ + "COREDUMP_PROC_CGROUP=1:name=systemd:/\n" \ + "0::/user.slice/user-1002.slice/user@1002.service/gnome-terminal-server.service\n" + +TEST(basic_parsing) { + _cleanup_(journal_importer_cleanup) JournalImporter imp = JOURNAL_IMPORTER_INIT(-1); + _cleanup_free_ char *journal_data_path = NULL; + int r; + + assert_se(get_testdata_dir("journal-data/journal-1.txt", &journal_data_path) >= 0); + imp.fd = open(journal_data_path, O_RDONLY|O_CLOEXEC); + assert_se(imp.fd >= 0); + + do + r = journal_importer_process_data(&imp); + while (r == 0 && !journal_importer_eof(&imp)); + assert_se(r == 1); + + /* We read one entry, so we should get EOF on next read, 
but not yet */ + assert_se(!journal_importer_eof(&imp)); + + assert_se(imp.iovw.count == 6); + assert_iovec_entry(&imp.iovw.iovec[0], "_BOOT_ID=1531fd22ec84429e85ae888b12fadb91"); + assert_iovec_entry(&imp.iovw.iovec[1], "_TRANSPORT=journal"); + assert_iovec_entry(&imp.iovw.iovec[2], COREDUMP_PROC_GROUP); + assert_iovec_entry(&imp.iovw.iovec[3], "COREDUMP_RLIMIT=-1"); + assert_iovec_entry(&imp.iovw.iovec[4], COREDUMP_PROC_GROUP); + assert_iovec_entry(&imp.iovw.iovec[5], "_SOURCE_REALTIME_TIMESTAMP=1478389147837945"); + + /* Let's check if we get EOF now */ + r = journal_importer_process_data(&imp); + assert_se(r == 0); + assert_se(journal_importer_eof(&imp)); +} + +TEST(bad_input) { + _cleanup_(journal_importer_cleanup) JournalImporter imp = JOURNAL_IMPORTER_INIT(-1); + _cleanup_free_ char *journal_data_path = NULL; + int r; + + assert_se(get_testdata_dir("journal-data/journal-1.txt", &journal_data_path) >= 0); + imp.fd = open(journal_data_path, O_RDONLY|O_CLOEXEC); + assert_se(imp.fd >= 0); + + do + r = journal_importer_process_data(&imp); + while (!journal_importer_eof(&imp)); + assert_se(r == 0); /* If we don't have enough input, 0 is returned */ + + assert_se(journal_importer_eof(&imp)); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-json.c b/src/test/test-json.c new file mode 100644 index 0000000..c120a70 --- /dev/null +++ b/src/test/test-json.c @@ -0,0 +1,816 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "json-internal.h" +#include "json.h" +#include "math-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +static void test_tokenizer_one(const char *data, ...) 
{ + unsigned line = 0, column = 0; + void *state = NULL; + va_list ap; + + _cleanup_free_ char *cdata = NULL; + assert_se(cdata = cescape(data)); + log_info("/* %s data=\"%s\" */", __func__, cdata); + + va_start(ap, data); + + for (;;) { + unsigned token_line, token_column; + _cleanup_free_ char *str = NULL; + JsonValue v = JSON_VALUE_NULL; + int t, tt; + + t = json_tokenize(&data, &str, &v, &token_line, &token_column, &state, &line, &column); + tt = va_arg(ap, int); + + assert_se(t == tt); + + if (t == JSON_TOKEN_END || t < 0) + break; + + else if (t == JSON_TOKEN_STRING) { + const char *nn; + + nn = va_arg(ap, const char *); + assert_se(streq_ptr(nn, str)); + + } else if (t == JSON_TOKEN_REAL) { + double d; + + d = va_arg(ap, double); + + assert_se(fabs(d - v.real) < 1e-10 || + fabs((d - v.real) / v.real) < 1e-10); + + } else if (t == JSON_TOKEN_INTEGER) { + int64_t i; + + i = va_arg(ap, int64_t); + assert_se(i == v.integer); + + } else if (t == JSON_TOKEN_UNSIGNED) { + uint64_t u; + + u = va_arg(ap, uint64_t); + assert_se(u == v.unsig); + + } else if (t == JSON_TOKEN_BOOLEAN) { + bool b; + + b = va_arg(ap, int); + assert_se(b == v.boolean); + } + } + + va_end(ap); +} + +typedef void (*Test)(JsonVariant *); + +static void test_variant_one(const char *data, Test test) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL; + _cleanup_free_ char *s = NULL; + int r; + + _cleanup_free_ char *cdata; + assert_se(cdata = cescape(data)); + log_info("/* %s data=\"%s\" */", __func__, cdata); + + r = json_parse(data, 0, &v, NULL, NULL); + assert_se(r == 0); + assert_se(v); + + r = json_variant_format(v, 0, &s); + assert_se(r >= 0); + assert_se(s); + assert_se((size_t) r == strlen(s)); + + log_info("formatted normally: %s", s); + + r = json_parse(data, JSON_PARSE_SENSITIVE, &w, NULL, NULL); + assert_se(r == 0); + assert_se(w); + assert_se(json_variant_has_type(v, json_variant_type(w))); + assert_se(json_variant_has_type(w, json_variant_type(v))); + 
assert_se(json_variant_equal(v, w)); + + s = mfree(s); + w = json_variant_unref(w); + + r = json_variant_format(v, JSON_FORMAT_PRETTY, &s); + assert_se(r >= 0); + assert_se(s); + assert_se((size_t) r == strlen(s)); + + log_info("formatted prettily:\n%s", s); + + r = json_parse(data, 0, &w, NULL, NULL); + assert_se(r == 0); + assert_se(w); + + assert_se(json_variant_has_type(v, json_variant_type(w))); + assert_se(json_variant_has_type(w, json_variant_type(v))); + assert_se(json_variant_equal(v, w)); + + s = mfree(s); + r = json_variant_format(v, JSON_FORMAT_COLOR, &s); + assert_se(r >= 0); + assert_se(s); + assert_se((size_t) r == strlen(s)); + printf("Normal with color: %s\n", s); + + s = mfree(s); + r = json_variant_format(v, JSON_FORMAT_COLOR|JSON_FORMAT_PRETTY, &s); + assert_se(r >= 0); + assert_se(s); + assert_se((size_t) r == strlen(s)); + printf("Pretty with color:\n%s\n", s); + + if (test) + test(v); +} + +static void test_1(JsonVariant *v) { + JsonVariant *p, *q; + unsigned i; + + log_info("/* %s */", __func__); + + /* 3 keys + 3 values */ + assert_se(json_variant_elements(v) == 6); + + /* has k */ + p = json_variant_by_key(v, "k"); + assert_se(p && json_variant_type(p) == JSON_VARIANT_STRING); + + /* k equals v */ + assert_se(streq(json_variant_string(p), "v")); + + /* has foo */ + p = json_variant_by_key(v, "foo"); + assert_se(p && json_variant_type(p) == JSON_VARIANT_ARRAY && json_variant_elements(p) == 3); + + /* check foo[0] = 1, foo[1] = 2, foo[2] = 3 */ + for (i = 0; i < 3; ++i) { + q = json_variant_by_index(p, i); + assert_se(q && json_variant_type(q) == JSON_VARIANT_UNSIGNED && json_variant_unsigned(q) == (i+1)); + assert_se(q && json_variant_has_type(q, JSON_VARIANT_INTEGER) && json_variant_integer(q) == (i+1)); + } + + /* has bar */ + p = json_variant_by_key(v, "bar"); + assert_se(p && json_variant_type(p) == JSON_VARIANT_OBJECT && json_variant_elements(p) == 2); + + /* zap is null */ + q = json_variant_by_key(p, "zap"); + assert_se(q && 
json_variant_type(q) == JSON_VARIANT_NULL); +} + +static void test_2(JsonVariant *v) { + JsonVariant *p, *q; + + log_info("/* %s */", __func__); + + /* 2 keys + 2 values */ + assert_se(json_variant_elements(v) == 4); + + /* has mutant */ + p = json_variant_by_key(v, "mutant"); + assert_se(p && json_variant_type(p) == JSON_VARIANT_ARRAY && json_variant_elements(p) == 4); + + /* mutant[0] == 1 */ + q = json_variant_by_index(p, 0); + assert_se(q && json_variant_type(q) == JSON_VARIANT_UNSIGNED && json_variant_unsigned(q) == 1); + assert_se(q && json_variant_has_type(q, JSON_VARIANT_INTEGER) && json_variant_integer(q) == 1); + + /* mutant[1] == null */ + q = json_variant_by_index(p, 1); + assert_se(q && json_variant_type(q) == JSON_VARIANT_NULL); + + /* mutant[2] == "1" */ + q = json_variant_by_index(p, 2); + assert_se(q && json_variant_type(q) == JSON_VARIANT_STRING && streq(json_variant_string(q), "1")); + + /* mutant[3] == JSON_VARIANT_OBJECT */ + q = json_variant_by_index(p, 3); + assert_se(q && json_variant_type(q) == JSON_VARIANT_OBJECT && json_variant_elements(q) == 2); + + /* has 1 */ + p = json_variant_by_key(q, "1"); + assert_se(p && json_variant_type(p) == JSON_VARIANT_ARRAY && json_variant_elements(p) == 2); + + /* "1"[0] == 1 */ + q = json_variant_by_index(p, 0); + assert_se(q && json_variant_type(q) == JSON_VARIANT_UNSIGNED && json_variant_unsigned(q) == 1); + assert_se(q && json_variant_has_type(q, JSON_VARIANT_INTEGER) && json_variant_integer(q) == 1); + + /* "1"[1] == "1" */ + q = json_variant_by_index(p, 1); + assert_se(q && json_variant_type(q) == JSON_VARIANT_STRING && streq(json_variant_string(q), "1")); + + /* has thisisaverylongproperty */ + p = json_variant_by_key(v, "thisisaverylongproperty"); + assert_se(p && json_variant_type(p) == JSON_VARIANT_REAL && fabs(json_variant_real(p) - 1.27) < 0.001); +} + +static void test_zeroes(JsonVariant *v) { + /* Make sure zero is how we expect it. 
*/ + log_info("/* %s */", __func__); + + assert_se(json_variant_elements(v) == 13); + + for (size_t i = 0; i < json_variant_elements(v); i++) { + JsonVariant *w; + size_t j; + + assert_se(w = json_variant_by_index(v, i)); + + assert_se(json_variant_integer(w) == 0); + assert_se(json_variant_unsigned(w) == 0U); + + assert_se(iszero_safe(json_variant_real(w))); + + assert_se(json_variant_is_integer(w)); + assert_se(json_variant_is_unsigned(w)); + assert_se(json_variant_is_real(w)); + assert_se(json_variant_is_number(w)); + + assert_se(!json_variant_is_negative(w)); + + assert_se(IN_SET(json_variant_type(w), JSON_VARIANT_INTEGER, JSON_VARIANT_UNSIGNED, JSON_VARIANT_REAL)); + + for (j = 0; j < json_variant_elements(v); j++) { + JsonVariant *q; + + assert_se(q = json_variant_by_index(v, j)); + + assert_se(json_variant_equal(w, q)); + } + } +} + +TEST(build) { + _cleanup_(json_variant_unrefp) JsonVariant *a = NULL, *b = NULL; + _cleanup_free_ char *s = NULL, *t = NULL; + + assert_se(json_build(&a, JSON_BUILD_STRING("hallo")) >= 0); + assert_se(json_build(&b, JSON_BUILD_LITERAL(" \"hallo\" ")) >= 0); + assert_se(json_variant_equal(a, b)); + + b = json_variant_unref(b); + + assert_se(json_build(&b, JSON_BUILD_VARIANT(a)) >= 0); + assert_se(json_variant_equal(a, b)); + + b = json_variant_unref(b); + assert_se(json_build(&b, JSON_BUILD_STRING("pief")) >= 0); + assert_se(!json_variant_equal(a, b)); + + a = json_variant_unref(a); + b = json_variant_unref(b); + + assert_se(json_build(&a, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("one", JSON_BUILD_INTEGER(7)), + JSON_BUILD_PAIR("two", JSON_BUILD_REAL(2.0)), + JSON_BUILD_PAIR("three", JSON_BUILD_INTEGER(0)))) >= 0); + + assert_se(json_build(&b, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("two", JSON_BUILD_INTEGER(2)), + JSON_BUILD_PAIR("three", JSON_BUILD_REAL(0)), + JSON_BUILD_PAIR("one", JSON_BUILD_REAL(7)))) >= 0); + + assert_se(json_variant_equal(a, b)); + + a = json_variant_unref(a); + b = json_variant_unref(b); + + const char* arr_1234[] = 
{"one", "two", "three", "four", NULL}; + assert_se(json_build(&a, JSON_BUILD_ARRAY(JSON_BUILD_OBJECT(JSON_BUILD_PAIR("x", JSON_BUILD_BOOLEAN(true)), + JSON_BUILD_PAIR("y", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("this", JSON_BUILD_NULL)))), + JSON_BUILD_VARIANT(NULL), + JSON_BUILD_LITERAL(NULL), + JSON_BUILD_STRING(NULL), + JSON_BUILD_NULL, + JSON_BUILD_INTEGER(77), + JSON_BUILD_ARRAY(JSON_BUILD_VARIANT(JSON_VARIANT_STRING_CONST("foobar")), + JSON_BUILD_VARIANT(JSON_VARIANT_STRING_CONST("zzz"))), + JSON_BUILD_STRV((char**) arr_1234))) >= 0); + + assert_se(json_variant_format(a, 0, &s) >= 0); + log_info("GOT: %s", s); + assert_se(json_parse(s, 0, &b, NULL, NULL) >= 0); + assert_se(json_variant_equal(a, b)); + + a = json_variant_unref(a); + b = json_variant_unref(b); + + assert_se(json_build(&a, JSON_BUILD_REAL(M_PI)) >= 0); + + s = mfree(s); + assert_se(json_variant_format(a, 0, &s) >= 0); + log_info("GOT: %s", s); + assert_se(json_parse(s, 0, &b, NULL, NULL) >= 0); + assert_se(json_variant_format(b, 0, &t) >= 0); + log_info("GOT: %s", t); + + assert_se(streq(s, t)); + + a = json_variant_unref(a); + b = json_variant_unref(b); + + assert_se(json_build(&a, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("x", JSON_BUILD_STRING("y")), + JSON_BUILD_PAIR("z", JSON_BUILD_CONST_STRING("a")), + JSON_BUILD_PAIR("b", JSON_BUILD_CONST_STRING("c")) + )) >= 0); + + assert_se(json_build(&b, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("x", JSON_BUILD_STRING("y")), + JSON_BUILD_PAIR_CONDITION(false, "p", JSON_BUILD_STRING("q")), + JSON_BUILD_PAIR_CONDITION(true, "z", JSON_BUILD_CONST_STRING("a")), + JSON_BUILD_PAIR_CONDITION(false, "j", JSON_BUILD_ARRAY(JSON_BUILD_STRING("k"), JSON_BUILD_CONST_STRING("u"), JSON_BUILD_CONST_STRING("i"))), + JSON_BUILD_PAIR("b", JSON_BUILD_CONST_STRING("c")) + )) >= 0); + + assert_se(json_variant_equal(a, b)); +} + +TEST(json_parse_file_empty) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + 
assert_se(fopen_unlocked("/dev/null", "re", &f) >= 0); + assert_se(json_parse_file(f, "waldo", 0, &v, NULL, NULL) == -ENODATA); + assert_se(v == NULL); +} + +TEST(json_parse_file_invalid) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + assert_se(f = fmemopen_unlocked((void*) "kookoo", 6, "r")); + assert_se(json_parse_file(f, "waldo", 0, &v, NULL, NULL) == -EINVAL); + assert_se(v == NULL); +} + +TEST(source) { + static const char data[] = + "\n" + "\n" + "{\n" + "\"foo\" : \"bar\", \n" + "\"qüüx\" : [ 1, 2, 3,\n" + "4,\n" + "5 ],\n" + "\"miep\" : { \"hallo\" : 1 },\n" + "\n" + "\"zzzzzz\" \n" + ":\n" + "[ true, \n" + "false, 7.5, {} ]\n" + "}\n"; + + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + printf("--- original begin ---\n" + "%s" + "--- original end ---\n", data); + + assert_se(f = fmemopen_unlocked((void*) data, strlen(data), "r")); + + assert_se(json_parse_file(f, "waldo", 0, &v, NULL, NULL) >= 0); + + printf("--- non-pretty begin ---\n"); + json_variant_dump(v, 0, stdout, NULL); + printf("\n--- non-pretty end ---\n"); + + printf("--- pretty begin ---\n"); + json_variant_dump(v, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR|JSON_FORMAT_SOURCE, stdout, NULL); + printf("--- pretty end ---\n"); +} + +TEST(depth) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + int r; + + v = JSON_VARIANT_STRING_CONST("start"); + + /* Let's verify that the maximum depth checks work */ + + for (unsigned i = 0;; i++) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + + assert_se(i <= UINT16_MAX); + if (i & 1) + r = json_variant_new_array(&w, &v, 1); + else + r = json_variant_new_object(&w, (JsonVariant*[]) { JSON_VARIANT_STRING_CONST("key"), v }, 2); + if (r == -ELNRNG) { + log_info("max depth at %u", i); + break; + } +#if HAS_FEATURE_MEMORY_SANITIZER + /* msan doesn't like the stack nesting to be too deep. Let's quit early. 
*/ + if (i >= 128) { + log_info("quitting early at depth %u", i); + break; + } +#endif + + assert_se(r >= 0); + + json_variant_unref(v); + v = TAKE_PTR(w); + } + + json_variant_dump(v, 0, stdout, NULL); + fputs("\n", stdout); +} + +TEST(normalize) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL; + _cleanup_free_ char *t = NULL; + + assert_se(json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("b", JSON_BUILD_STRING("x")), + JSON_BUILD_PAIR("c", JSON_BUILD_CONST_STRING("y")), + JSON_BUILD_PAIR("a", JSON_BUILD_CONST_STRING("z")))) >= 0); + + assert_se(!json_variant_is_sorted(v)); + assert_se(!json_variant_is_normalized(v)); + + assert_se(json_variant_format(v, 0, &t) >= 0); + assert_se(streq(t, "{\"b\":\"x\",\"c\":\"y\",\"a\":\"z\"}")); + t = mfree(t); + + assert_se(json_build(&w, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("bar", JSON_BUILD_STRING("zzz")), + JSON_BUILD_PAIR("foo", JSON_BUILD_VARIANT(v)))) >= 0); + + assert_se(json_variant_is_sorted(w)); + assert_se(!json_variant_is_normalized(w)); + + assert_se(json_variant_format(w, 0, &t) >= 0); + assert_se(streq(t, "{\"bar\":\"zzz\",\"foo\":{\"b\":\"x\",\"c\":\"y\",\"a\":\"z\"}}")); + t = mfree(t); + + assert_se(json_variant_sort(&v) >= 0); + assert_se(json_variant_is_sorted(v)); + assert_se(json_variant_is_normalized(v)); + + assert_se(json_variant_format(v, 0, &t) >= 0); + assert_se(streq(t, "{\"a\":\"z\",\"b\":\"x\",\"c\":\"y\"}")); + t = mfree(t); + + assert_se(json_variant_normalize(&w) >= 0); + assert_se(json_variant_is_sorted(w)); + assert_se(json_variant_is_normalized(w)); + + assert_se(json_variant_format(w, 0, &t) >= 0); + assert_se(streq(t, "{\"bar\":\"zzz\",\"foo\":{\"a\":\"z\",\"b\":\"x\",\"c\":\"y\"}}")); + t = mfree(t); +} + +TEST(bisect) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + /* Tests the bisection logic in json_variant_by_key() */ + + for (char c = 'z'; c >= 'a'; c--) { + + if ((c % 3) == 0) + continue; + + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; 
+ assert_se(json_variant_new_stringn(&w, (char[4]) { '<', c, c, '>' }, 4) >= 0); + assert_se(json_variant_set_field(&v, (char[2]) { c, 0 }, w) >= 0); + } + + json_variant_dump(v, JSON_FORMAT_COLOR|JSON_FORMAT_PRETTY, NULL, NULL); + + assert_se(!json_variant_is_sorted(v)); + assert_se(!json_variant_is_normalized(v)); + assert_se(json_variant_normalize(&v) >= 0); + assert_se(json_variant_is_sorted(v)); + assert_se(json_variant_is_normalized(v)); + + json_variant_dump(v, JSON_FORMAT_COLOR|JSON_FORMAT_PRETTY, NULL, NULL); + + for (char c = 'a'; c <= 'z'; c++) { + JsonVariant *k; + const char *z; + + k = json_variant_by_key(v, (char[2]) { c, 0 }); + assert_se(!k == ((c % 3) == 0)); + + if (!k) + continue; + + assert_se(json_variant_is_string(k)); + + z = (char[5]){ '<', c, c, '>', 0}; + assert_se(streq(json_variant_string(k), z)); + } +} + +static void test_float_match(JsonVariant *v) { + const double delta = 0.0001; + + assert_se(json_variant_is_array(v)); + assert_se(json_variant_elements(v) == 11); + assert_se(fabs(1.0 - (DBL_MIN / json_variant_real(json_variant_by_index(v, 0)))) <= delta); + assert_se(fabs(1.0 - (DBL_MAX / json_variant_real(json_variant_by_index(v, 1)))) <= delta); + assert_se(json_variant_is_null(json_variant_by_index(v, 2))); /* nan is not supported by json → null */ + assert_se(json_variant_is_null(json_variant_by_index(v, 3))); /* +inf is not supported by json → null */ + assert_se(json_variant_is_null(json_variant_by_index(v, 4))); /* -inf is not supported by json → null */ + assert_se(json_variant_is_null(json_variant_by_index(v, 5)) || + fabs(1.0 - (HUGE_VAL / json_variant_real(json_variant_by_index(v, 5)))) <= delta); /* HUGE_VAL might be +inf, but might also be something else */ + assert_se(json_variant_is_real(json_variant_by_index(v, 6)) && + json_variant_is_integer(json_variant_by_index(v, 6)) && + json_variant_integer(json_variant_by_index(v, 6)) == 0); + assert_se(json_variant_is_real(json_variant_by_index(v, 7)) && + 
json_variant_is_integer(json_variant_by_index(v, 7)) && + json_variant_integer(json_variant_by_index(v, 7)) == 10); + assert_se(json_variant_is_real(json_variant_by_index(v, 8)) && + json_variant_is_integer(json_variant_by_index(v, 8)) && + json_variant_integer(json_variant_by_index(v, 8)) == -10); + assert_se(json_variant_is_real(json_variant_by_index(v, 9)) && + !json_variant_is_integer(json_variant_by_index(v, 9))); + assert_se(fabs(1.0 - (DBL_MIN / 2 / json_variant_real(json_variant_by_index(v, 9)))) <= delta); + assert_se(json_variant_is_real(json_variant_by_index(v, 10)) && + !json_variant_is_integer(json_variant_by_index(v, 10))); + assert_se(fabs(1.0 - (-DBL_MIN / 2 / json_variant_real(json_variant_by_index(v, 10)))) <= delta); +} + +TEST(float) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL; + _cleanup_free_ char *text = NULL; + + assert_se(json_build(&v, JSON_BUILD_ARRAY( + JSON_BUILD_REAL(DBL_MIN), + JSON_BUILD_REAL(DBL_MAX), + JSON_BUILD_REAL(NAN), + JSON_BUILD_REAL(INFINITY), + JSON_BUILD_REAL(-INFINITY), + JSON_BUILD_REAL(HUGE_VAL), + JSON_BUILD_REAL(0), + JSON_BUILD_REAL(10), + JSON_BUILD_REAL(-10), + JSON_BUILD_REAL(DBL_MIN / 2), + JSON_BUILD_REAL(-DBL_MIN / 2))) >= 0); + + json_variant_dump(v, JSON_FORMAT_COLOR|JSON_FORMAT_PRETTY, NULL, NULL); + + test_float_match(v); + + assert_se(json_variant_format(v, 0, &text) >= 0); + assert_se(json_parse(text, 0, &w, NULL, NULL) >= 0); + + json_variant_dump(w, JSON_FORMAT_COLOR|JSON_FORMAT_PRETTY, NULL, NULL); + + test_float_match(w); +} + +static void test_equal_text(JsonVariant *v, const char *text) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + + assert_se(json_parse(text, 0, &w, NULL, NULL) >= 0); + assert_se(json_variant_equal(v, w) || (!v && json_variant_is_null(w))); +} + +TEST(set_field) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + test_equal_text(v, "null"); + assert_se(json_variant_set_field(&v, "foo", NULL) >= 0); + test_equal_text(v, "{\"foo\" : 
null}"); + assert_se(json_variant_set_field(&v, "bar", JSON_VARIANT_STRING_CONST("quux")) >= 0); + test_equal_text(v, "{\"foo\" : null, \"bar\" : \"quux\"}"); + assert_se(json_variant_set_field(&v, "foo", JSON_VARIANT_STRING_CONST("quux2")) >= 0); + test_equal_text(v, "{\"foo\" : \"quux2\", \"bar\" : \"quux\"}"); + assert_se(json_variant_set_field(&v, "bar", NULL) >= 0); + test_equal_text(v, "{\"foo\" : \"quux2\", \"bar\" : null}"); +} + +TEST(tokenizer) { + test_tokenizer_one("x", -EINVAL); + test_tokenizer_one("", JSON_TOKEN_END); + test_tokenizer_one(" ", JSON_TOKEN_END); + test_tokenizer_one("0", JSON_TOKEN_UNSIGNED, (uint64_t) 0, JSON_TOKEN_END); + test_tokenizer_one("-0", JSON_TOKEN_INTEGER, (int64_t) 0, JSON_TOKEN_END); + test_tokenizer_one("1234", JSON_TOKEN_UNSIGNED, (uint64_t) 1234, JSON_TOKEN_END); + test_tokenizer_one("-1234", JSON_TOKEN_INTEGER, (int64_t) -1234, JSON_TOKEN_END); + test_tokenizer_one("18446744073709551615", JSON_TOKEN_UNSIGNED, (uint64_t) UINT64_MAX, JSON_TOKEN_END); + test_tokenizer_one("-9223372036854775808", JSON_TOKEN_INTEGER, (int64_t) INT64_MIN, JSON_TOKEN_END); + test_tokenizer_one("18446744073709551616", JSON_TOKEN_REAL, (double) 18446744073709551616.0L, JSON_TOKEN_END); + test_tokenizer_one("-9223372036854775809", JSON_TOKEN_REAL, (double) -9223372036854775809.0L, JSON_TOKEN_END); + test_tokenizer_one("-1234", JSON_TOKEN_INTEGER, (int64_t) -1234, JSON_TOKEN_END); + test_tokenizer_one("3.141", JSON_TOKEN_REAL, (double) 3.141, JSON_TOKEN_END); + test_tokenizer_one("0.0", JSON_TOKEN_REAL, (double) 0.0, JSON_TOKEN_END); + test_tokenizer_one("7e3", JSON_TOKEN_REAL, (double) 7e3, JSON_TOKEN_END); + test_tokenizer_one("-7e-3", JSON_TOKEN_REAL, (double) -7e-3, JSON_TOKEN_END); + test_tokenizer_one("true", JSON_TOKEN_BOOLEAN, true, JSON_TOKEN_END); + test_tokenizer_one("false", JSON_TOKEN_BOOLEAN, false, JSON_TOKEN_END); + test_tokenizer_one("null", JSON_TOKEN_NULL, JSON_TOKEN_END); + test_tokenizer_one("{}", JSON_TOKEN_OBJECT_OPEN, 
JSON_TOKEN_OBJECT_CLOSE, JSON_TOKEN_END); + test_tokenizer_one("\t {\n} \n", JSON_TOKEN_OBJECT_OPEN, JSON_TOKEN_OBJECT_CLOSE, JSON_TOKEN_END); + test_tokenizer_one("[]", JSON_TOKEN_ARRAY_OPEN, JSON_TOKEN_ARRAY_CLOSE, JSON_TOKEN_END); + test_tokenizer_one("\t [] \n\n", JSON_TOKEN_ARRAY_OPEN, JSON_TOKEN_ARRAY_CLOSE, JSON_TOKEN_END); + test_tokenizer_one("\"\"", JSON_TOKEN_STRING, "", JSON_TOKEN_END); + test_tokenizer_one("\"foo\"", JSON_TOKEN_STRING, "foo", JSON_TOKEN_END); + test_tokenizer_one("\"foo\\nfoo\"", JSON_TOKEN_STRING, "foo\nfoo", JSON_TOKEN_END); + test_tokenizer_one("{\"foo\" : \"bar\"}", JSON_TOKEN_OBJECT_OPEN, JSON_TOKEN_STRING, "foo", JSON_TOKEN_COLON, JSON_TOKEN_STRING, "bar", JSON_TOKEN_OBJECT_CLOSE, JSON_TOKEN_END); + test_tokenizer_one("{\"foo\" : [true, false]}", JSON_TOKEN_OBJECT_OPEN, JSON_TOKEN_STRING, "foo", JSON_TOKEN_COLON, JSON_TOKEN_ARRAY_OPEN, JSON_TOKEN_BOOLEAN, true, JSON_TOKEN_COMMA, JSON_TOKEN_BOOLEAN, false, JSON_TOKEN_ARRAY_CLOSE, JSON_TOKEN_OBJECT_CLOSE, JSON_TOKEN_END); + test_tokenizer_one("\"\xef\xbf\xbd\"", JSON_TOKEN_STRING, "\xef\xbf\xbd", JSON_TOKEN_END); + test_tokenizer_one("\"\\ufffd\"", JSON_TOKEN_STRING, "\xef\xbf\xbd", JSON_TOKEN_END); + test_tokenizer_one("\"\\uf\"", -EINVAL); + test_tokenizer_one("\"\\ud800a\"", -EINVAL); + test_tokenizer_one("\"\\udc00\\udc00\"", -EINVAL); + test_tokenizer_one("\"\\ud801\\udc37\"", JSON_TOKEN_STRING, "\xf0\x90\x90\xb7", JSON_TOKEN_END); + + test_tokenizer_one("[1, 2, -3]", JSON_TOKEN_ARRAY_OPEN, JSON_TOKEN_UNSIGNED, (uint64_t) 1, JSON_TOKEN_COMMA, JSON_TOKEN_UNSIGNED, (uint64_t) 2, JSON_TOKEN_COMMA, JSON_TOKEN_INTEGER, (int64_t) -3, JSON_TOKEN_ARRAY_CLOSE, JSON_TOKEN_END); +} + +TEST(variant) { + test_variant_one("{\"k\": \"v\", \"foo\": [1, 2, 3], \"bar\": {\"zap\": null}}", test_1); + test_variant_one("{\"mutant\": [1, null, \"1\", {\"1\": [1, \"1\"]}], \"thisisaverylongproperty\": 1.27}", test_2); + test_variant_one("{\"foo\" : 
\"\\u0935\\u093f\\u0935\\u0947\\u0915\\u0916\\u094d\\u092f\\u093e\\u0924\\u093f\\u0930\\u0935\\u093f\\u092a\\u094d\\u0932\\u0935\\u093e\\u0020\\u0939\\u093e\\u0928\\u094b\\u092a\\u093e\\u092f\\u0903\\u0964\"}", NULL); + + test_variant_one("[ 0, -0, 0.0, -0.0, 0.000, -0.000, 0e0, -0e0, 0e+0, -0e-0, 0e-0, -0e000, 0e+000 ]", test_zeroes); +} + +TEST(json_variant_merge_objectb) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL; + + assert_se(json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("b", JSON_BUILD_STRING("x")), + JSON_BUILD_PAIR("c", JSON_BUILD_CONST_STRING("y")), + JSON_BUILD_PAIR("a", JSON_BUILD_CONST_STRING("z")))) >= 0); + + assert_se(json_variant_merge_objectb(&w, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("b", JSON_BUILD_STRING("x")))) >= 0); + assert_se(json_variant_merge_objectb(&w, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("c", JSON_BUILD_STRING("y")))) >= 0); + assert_se(json_variant_merge_objectb(&w, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("a", JSON_BUILD_STRING("z")))) >= 0); + + assert_se(json_variant_equal(v, w)); +} + +static void json_array_append_with_source_one(bool source) { + _cleanup_(json_variant_unrefp) JsonVariant *a, *b; + + /* Parse two sources, each with a different name and line/column numbers */ + + assert_se(json_parse_with_source(" [41]", source ? "string 1" : NULL, 0, + &a, NULL, NULL) >= 0); + assert_se(json_parse_with_source("\n\n [42]", source ? "string 2" : NULL, 0, + &b, NULL, NULL) >= 0); + + assert_se(json_variant_is_array(a)); + assert_se(json_variant_elements(a) == 1); + assert_se(json_variant_is_array(b)); + assert_se(json_variant_elements(b) == 1); + + /* Verify source information */ + + const char *s1, *s2; + unsigned line1, col1, line2, col2; + assert_se(json_variant_get_source(a, &s1, &line1, &col1) >= 0); + assert_se(json_variant_get_source(b, &s2, &line2, &col2) >= 0); + + assert_se(streq_ptr(s1, source ? "string 1" : NULL)); + assert_se(streq_ptr(s2, source ? 
"string 2" : NULL)); + assert_se(line1 == 1); + assert_se(col1 == 2); + assert_se(line2 == 3); + assert_se(col2 == 4); + + /* Append one elem from the second array (and source) to the first. */ + + JsonVariant *elem; + assert_se(elem = json_variant_by_index(b, 0)); + assert_se(json_variant_is_integer(elem)); + assert_se(json_variant_elements(elem) == 0); + + assert_se(json_variant_append_array(&a, elem) >= 0); + + assert_se(json_variant_is_array(a)); + assert_se(json_variant_elements(a) == 2); + + /* Verify that source information was propagated correctly */ + + assert_se(json_variant_get_source(elem, &s1, &line1, &col1) >= 0); + assert_se(elem = json_variant_by_index(a, 1)); + assert_se(json_variant_get_source(elem, &s2, &line2, &col2) >= 0); + + assert_se(streq_ptr(s1, source ? "string 2" : NULL)); + assert_se(streq_ptr(s2, source ? "string 2" : NULL)); + assert_se(line1 == 3); + assert_se(col1 == 5); + assert_se(line2 == 3); + assert_se(col2 == 5); +} + +TEST(json_array_append_with_source) { + json_array_append_with_source_one(true); +} + +TEST(json_array_append_without_source) { + json_array_append_with_source_one(false); +} + +TEST(json_array_append_nodup) { + _cleanup_(json_variant_unrefp) JsonVariant *l = NULL, *s = NULL, *wd = NULL, *nd = NULL; + + assert_se(json_build(&l, JSON_BUILD_STRV(STRV_MAKE("foo", "bar", "baz", "bar", "baz", "foo", "qux", "baz"))) >= 0); + assert_se(json_build(&s, JSON_BUILD_STRV(STRV_MAKE("foo", "bar", "baz", "qux"))) >= 0); + + assert_se(!json_variant_equal(l, s)); + assert_se(json_variant_elements(l) == 8); + assert_se(json_variant_elements(s) == 4); + + JsonVariant *i; + JSON_VARIANT_ARRAY_FOREACH(i, l) { + assert_se(json_variant_append_array(&wd, i) >= 0); + assert_se(json_variant_append_array_nodup(&nd, i) >= 0); + } + + assert_se(json_variant_elements(wd) == 8); + assert_se(json_variant_equal(l, wd)); + assert_se(!json_variant_equal(s, wd)); + + assert_se(json_variant_elements(nd) == 4); + assert_se(!json_variant_equal(l, 
nd)); + assert_se(json_variant_equal(s, nd)); +} + +TEST(json_dispatch) { + struct foobar { + uint64_t a, b; + int64_t c, d; + uint32_t e, f; + int32_t g, h; + uint16_t i, j; + int16_t k, l; + } foobar = {}; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + assert_se(json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("a", JSON_BUILD_UNSIGNED(UINT64_MAX)), + JSON_BUILD_PAIR("b", JSON_BUILD_STRING("18446744073709551615")), + JSON_BUILD_PAIR("c", JSON_BUILD_INTEGER(INT64_MIN)), + JSON_BUILD_PAIR("d", JSON_BUILD_STRING("-9223372036854775808")), + JSON_BUILD_PAIR("e", JSON_BUILD_UNSIGNED(UINT32_MAX)), + JSON_BUILD_PAIR("f", JSON_BUILD_STRING("4294967295")), + JSON_BUILD_PAIR("g", JSON_BUILD_INTEGER(INT32_MIN)), + JSON_BUILD_PAIR("h", JSON_BUILD_STRING("-2147483648")), + JSON_BUILD_PAIR("i", JSON_BUILD_UNSIGNED(UINT16_MAX)), + JSON_BUILD_PAIR("j", JSON_BUILD_STRING("65535")), + JSON_BUILD_PAIR("k", JSON_BUILD_INTEGER(INT16_MIN)), + JSON_BUILD_PAIR("l", JSON_BUILD_STRING("-32768")))) >= 0); + + assert_se(json_variant_dump(v, JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO, stdout, /* prefix= */ NULL) >= 0); + + JsonDispatch table[] = { + { "a", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct foobar, a) }, + { "b", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(struct foobar, b) }, + { "c", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int64, offsetof(struct foobar, c) }, + { "d", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int64, offsetof(struct foobar, d) }, + { "e", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint32, offsetof(struct foobar, e) }, + { "f", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint32, offsetof(struct foobar, f) }, + { "g", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int32, offsetof(struct foobar, g) }, + { "h", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int32, offsetof(struct foobar, h) }, + { "i", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint16, offsetof(struct foobar, i) }, + { "j", _JSON_VARIANT_TYPE_INVALID, 
json_dispatch_uint16, offsetof(struct foobar, j) }, + { "k", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int16, offsetof(struct foobar, k) }, + { "l", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int16, offsetof(struct foobar, l) }, + {} + }; + + assert_se(json_dispatch(v, table, JSON_LOG, &foobar) >= 0); + + assert_se(foobar.a == UINT64_MAX); + assert_se(foobar.b == UINT64_MAX); + assert_se(foobar.c == INT64_MIN); + assert_se(foobar.d == INT64_MIN); + + assert_se(foobar.e == UINT32_MAX); + assert_se(foobar.f == UINT32_MAX); + assert_se(foobar.g == INT32_MIN); + assert_se(foobar.h == INT32_MIN); + + assert_se(foobar.i == UINT16_MAX); + assert_se(foobar.j == UINT16_MAX); + assert_se(foobar.k == INT16_MIN); + assert_se(foobar.l == INT16_MIN); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-kbd-util.c b/src/test/test-kbd-util.c new file mode 100644 index 0000000..0a166c6 --- /dev/null +++ b/src/test/test-kbd-util.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "kbd-util.h" +#include "log.h" +#include "strv.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_strv_free_ char **maps = NULL; + int r; + + log_show_color(true); + test_setup_logging(LOG_DEBUG); + + r = get_keymaps(&maps); + if (r < 0) { + log_error_errno(r, "Failed to acquire keymaps: %m"); + return 0; + } + + STRV_FOREACH(m, maps) { + log_info("Found keymap: %s", *m); + assert_se(keymap_exists(*m) > 0); + } + + return 0; +} diff --git a/src/test/test-libcrypt-util.c b/src/test/test-libcrypt-util.c new file mode 100644 index 0000000..f88a9f9 --- /dev/null +++ b/src/test/test-libcrypt-util.c @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if HAVE_CRYPT_H +# include +#else +# include +#endif + +#include "strv.h" +#include "tests.h" +#include "libcrypt-util.h" + +static void test_crypt_preferred_method(void) { + log_info("/* %s */", __func__); + + log_info("crypt_preferred_method: %s", +#if HAVE_CRYPT_PREFERRED_METHOD + 
crypt_preferred_method() +#else + "(not available)" +#endif + ); +} + +static void test_make_salt(void) { + log_info("/* %s */", __func__); + + for (int i = 0; i < 10; i++) { + _cleanup_free_ char *t; + + assert_se(make_salt(&t) == 0); + log_info("%s", t); + } +} + +static int test_hash_password(void) { + log_info("/* %s */", __func__); + + /* As a warm-up exercise, check if we can hash passwords. */ + + bool have_sane_hash = false; + + FOREACH_STRING(hash, + "ew3bU1.hoKk4o", + "$1$gc5rWpTB$wK1aul1PyBn9AX1z93stk1", + "$2b$12$BlqcGkB/7BFvNMXKGxDea.5/8D6FTny.cbNcHW/tqcrcyo6ZJd8u2", + "$5$lGhDrcrao9zb5oIK$05KlOVG3ocknx/ThreqXE/gk.XzFFBMTksc4t2CPDUD", + "$6$c7wB/3GiRk0VHf7e$zXJ7hN0aLZapE.iO4mn/oHu6.prsXTUG/5k1AxpgR85ELolyAcaIGRgzfwJs3isTChMDBjnthZyaMCfCNxo9I.", + "$y$j9T$$9cKOWsAm4m97WiYk61lPPibZpy3oaGPIbsL4koRe/XD") { + int b; + + b = test_password_one(hash, "ppp"); + log_info("%s: %s", hash, yes_no(b)); +#if defined(XCRYPT_VERSION_MAJOR) + /* xcrypt is supposed to always implement all methods. 
*/ + assert_se(b); +#endif + + if (b && IN_SET(hash[1], '6', 'y')) + have_sane_hash = true; + } + + return have_sane_hash; +} + +static void test_hash_password_full(void) { + log_info("/* %s */", __func__); + + _cleanup_free_ void *cd_data = NULL; + int cd_size = 0; + + log_info("sizeof(struct crypt_data): %zu bytes", sizeof(struct crypt_data)); + + for (unsigned c = 0; c < 2; c++) + FOREACH_STRING(i, "abc123", "h⸿sło") { + _cleanup_free_ char *hashed; + + if (c == 0) + assert_se(hash_password_full(i, &cd_data, &cd_size, &hashed) == 0); + else + assert_se(hash_password_full(i, NULL, NULL, &hashed) == 0); + log_debug("\"%s\" → \"%s\"", i, hashed); + log_info("crypt_r[a] buffer size: %i bytes", cd_size); + + assert_se(test_password_one(hashed, i) == true); + assert_se(test_password_one(i, hashed) <= 0); /* We get an error for non-utf8 */ + assert_se(test_password_one(hashed, "foobar") == false); + assert_se(test_password_many(STRV_MAKE(hashed), i) == true); + assert_se(test_password_many(STRV_MAKE(hashed), "foobar") == false); + assert_se(test_password_many(STRV_MAKE(hashed, hashed, hashed), "foobar") == false); + assert_se(test_password_many(STRV_MAKE("$y$j9T$dlCXwkX0GC5L6B8Gf.4PN/$VCyEH", + hashed, + "$y$j9T$SAayASazWZIQeJd9AS02m/$"), + i) == true); + assert_se(test_password_many(STRV_MAKE("$W$j9T$dlCXwkX0GC5L6B8Gf.4PN/$VCyEH", /* no such method exists... */ + hashed, + "$y$j9T$SAayASazWZIQeJd9AS02m/$"), + i) == true); + assert_se(test_password_many(STRV_MAKE("$y$j9T$dlCXwkX0GC5L6B8Gf.4PN/$VCyEH", + hashed, + "$y$j9T$SAayASazWZIQeJd9AS02m/$"), + "") == false); + assert_se(test_password_many(STRV_MAKE("$W$j9T$dlCXwkX0GC5L6B8Gf.4PN/$VCyEH", /* no such method exists... 
*/ + hashed, + "$y$j9T$SAayASazWZIQeJd9AS02m/$"), + "") == false); + } +} + +int main(int argc, char *argv[]) { + test_setup_logging(LOG_DEBUG); + +#if defined(__powerpc__) && !defined(XCRYPT_VERSION_MAJOR) + return log_tests_skipped("crypt_r() causes a buffer overflow on ppc64el, see https://github.com/systemd/systemd/pull/16981#issuecomment-691203787"); +#endif + + test_crypt_preferred_method(); + test_make_salt(); + + if (!test_hash_password()) + return log_tests_skipped("crypt doesn't support yescrypt or sha512crypt"); + + test_hash_password_full(); + + return 0; +} diff --git a/src/test/test-libmount.c b/src/test/test-libmount.c new file mode 100644 index 0000000..9ba428e --- /dev/null +++ b/src/test/test-libmount.c @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "escape.h" +#include "libmount-util.h" +#include "tests.h" + +static void test_libmount_unescaping_one( + const char *title, + const char *string, + bool may_fail, + const char *expected_source, + const char *expected_target) { + /* A test for libmount really */ + int r; + + log_info("/* %s %s */", __func__, title); + + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + _cleanup_fclose_ FILE *f = NULL; + + f = fmemopen((char*) string, strlen(string), "r"); + assert_se(f); + + assert_se(libmount_parse(title, f, &table, &iter) >= 0); + + struct libmnt_fs *fs; + const char *source, *target; + _cleanup_free_ char *x = NULL, *cs = NULL, *s = NULL, *ct = NULL, *t = NULL; + + /* We allow this call and the checks below to fail in some cases. See the case definitions below. 
*/ + + r = mnt_table_next_fs(table, iter, &fs); + if (r != 0 && may_fail) { + log_error_errno(r, "mnt_table_next_fs failed: %m"); + return; + } + assert_se(r == 0); + + assert_se(x = cescape(string)); + + assert_se(source = mnt_fs_get_source(fs)); + assert_se(target = mnt_fs_get_target(fs)); + + assert_se(cs = cescape(source)); + assert_se(ct = cescape(target)); + + assert_se(cunescape(source, UNESCAPE_RELAX, &s) >= 0); + assert_se(cunescape(target, UNESCAPE_RELAX, &t) >= 0); + + log_info("from '%s'", x); + log_info("source: '%s'", source); + log_info("source: '%s'", cs); + log_info("source: '%s'", s); + log_info("expected: '%s'", strna(expected_source)); + log_info("target: '%s'", target); + log_info("target: '%s'", ct); + log_info("target: '%s'", t); + log_info("expected: '%s'", strna(expected_target)); + + assert_se(may_fail || streq(source, expected_source)); + assert_se(may_fail || streq(target, expected_target)); + + assert_se(mnt_table_next_fs(table, iter, &fs) == 1); +} + +TEST(libmount_unescaping) { + test_libmount_unescaping_one( + "escaped space + utf8", + "729 38 0:59 / /tmp/„zupa\\040zębowa” rw,relatime shared:395 - tmpfs die\\040Brühe rw,seclabel", + false, + "die Brühe", + "/tmp/„zupa zębowa”" + ); + + test_libmount_unescaping_one( + "escaped newline", + "729 38 0:59 / /tmp/x\\012y rw,relatime shared:395 - tmpfs newline rw,seclabel", + false, + "newline", + "/tmp/x\ny" + ); + + /* The result of "mount -t tmpfs '' /tmp/emptysource". + * This will fail with libmount <= v2.33. + * See https://github.com/karelzak/util-linux/commit/18a52a5094. + */ + test_libmount_unescaping_one( + "empty source", + "760 38 0:60 / /tmp/emptysource rw,relatime shared:410 - tmpfs rw,seclabel", + true, + "", + "/tmp/emptysource" + ); + + /* The kernel leaves \r as is. + * Also see https://github.com/karelzak/util-linux/issues/780. 
+ */ + test_libmount_unescaping_one( + "foo\\rbar", + "790 38 0:61 / /tmp/foo\rbar rw,relatime shared:425 - tmpfs tmpfs rw,seclabel", + true, + "tmpfs", + "/tmp/foo\rbar" + ); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-limits-util.c b/src/test/test-limits-util.c new file mode 100644 index 0000000..3b6c8c0 --- /dev/null +++ b/src/test/test-limits-util.c @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "format-util.h" +#include "limits-util.h" +#include "tests.h" + +TEST(physical_memory) { + uint64_t p; + + p = physical_memory(); + assert_se(p > 0); + assert_se(p < UINT64_MAX); + assert_se(p % page_size() == 0); + + log_info("Memory: %s (%" PRIu64 ")", FORMAT_BYTES(p), p); +} + +TEST(physical_memory_scale) { + uint64_t p; + + p = physical_memory(); + + assert_se(physical_memory_scale(0, 100) == 0); + assert_se(physical_memory_scale(100, 100) == p); + + log_info("Memory original: %" PRIu64, physical_memory()); + log_info("Memory scaled by 50%%: %" PRIu64, physical_memory_scale(50, 100)); + log_info("Memory divided by 2: %" PRIu64, physical_memory() / 2); + log_info("Page size: %zu", page_size()); + + /* There might be an uneven number of pages, hence permit these calculations to be half a page off... 
*/ + assert_se(page_size()/2 + physical_memory_scale(50, 100) - p/2 <= page_size()); + assert_se(physical_memory_scale(200, 100) == p*2); + + assert_se(physical_memory_scale(0, 1) == 0); + assert_se(physical_memory_scale(1, 1) == p); + assert_se(physical_memory_scale(2, 1) == p*2); + + assert_se(physical_memory_scale(0, 2) == 0); + + assert_se(page_size()/2 + physical_memory_scale(1, 2) - p/2 <= page_size()); + assert_se(physical_memory_scale(2, 2) == p); + assert_se(physical_memory_scale(4, 2) == p*2); + + assert_se(physical_memory_scale(0, UINT32_MAX) == 0); + assert_se(physical_memory_scale(UINT32_MAX, UINT32_MAX) == p); + + /* overflow */ + assert_se(physical_memory_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX); +} + +TEST(system_tasks_max) { + uint64_t t; + + t = system_tasks_max(); + assert_se(t > 0); + assert_se(t < UINT64_MAX); + + log_info("Max tasks: %" PRIu64, t); +} + +TEST(system_tasks_max_scale) { + uint64_t t; + + t = system_tasks_max(); + + assert_se(system_tasks_max_scale(0, 100) == 0); + assert_se(system_tasks_max_scale(100, 100) == t); + + assert_se(system_tasks_max_scale(0, 1) == 0); + assert_se(system_tasks_max_scale(1, 1) == t); + assert_se(system_tasks_max_scale(2, 1) == 2*t); + + assert_se(system_tasks_max_scale(0, 2) == 0); + assert_se(system_tasks_max_scale(1, 2) == t/2); + assert_se(system_tasks_max_scale(2, 2) == t); + assert_se(system_tasks_max_scale(3, 2) == (3*t)/2); + assert_se(system_tasks_max_scale(4, 2) == t*2); + + assert_se(system_tasks_max_scale(0, UINT32_MAX) == 0); + assert_se(system_tasks_max_scale((UINT32_MAX-1)/2, UINT32_MAX-1) == t/2); + assert_se(system_tasks_max_scale(UINT32_MAX, UINT32_MAX) == t); + + /* overflow */ + + assert_se(system_tasks_max_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-list.c b/src/test/test-list.c new file mode 100644 index 0000000..87f7c7b --- /dev/null +++ b/src/test/test-list.c @@ -0,0 +1,286 @@ +/* SPDX-License-Identifier: 
/* Exercises the LIST_* macros on a four-element array of stack items (plus a few heap items at the
 * end). After every mutation the next/prev pointers of the affected items are asserted explicitly,
 * so each step depends on the exact list order left behind by the previous one. */
int main(int argc, const char *argv[]) {
        test_setup_logging(LOG_DEBUG);

        size_t i;
        typedef struct list_item {
                LIST_FIELDS(struct list_item, item_list);
        } list_item;
        LIST_HEAD(list_item, head);
        LIST_HEAD(list_item, head2);
        list_item items[4];

        LIST_HEAD_INIT(head);
        LIST_HEAD_INIT(head2);
        assert_se(head == NULL);
        assert_se(head2 == NULL);

        /* Prepend items 0..3 one by one; the resulting order is 3, 2, 1, 0 (verified below). */
        for (i = 0; i < ELEMENTSOF(items); i++) {
                LIST_INIT(item_list, &items[i]);
                assert_se(LIST_JUST_US(item_list, &items[i]));
                assert_se(LIST_PREPEND(item_list, head, &items[i]) == &items[i]);
        }

        /* LIST_FOREACH_OTHERS must visit every item except the one given, whether that one sits in
         * the middle (2), at the tail (0) or at the head (3). */
        i = 0;
        LIST_FOREACH_OTHERS(item_list, cursor, &items[2]) {
                i++;
                assert_se(cursor != &items[2]);
        }
        assert_se(i == ELEMENTSOF(items)-1);

        i = 0;
        LIST_FOREACH_OTHERS(item_list, cursor, &items[0]) {
                i++;
                assert_se(cursor != &items[0]);
        }
        assert_se(i == ELEMENTSOF(items)-1);

        i = 0;
        LIST_FOREACH_OTHERS(item_list, cursor, &items[3]) {
                i++;
                assert_se(cursor != &items[3]);
        }
        assert_se(i == ELEMENTSOF(items)-1);

        assert_se(!LIST_JUST_US(item_list, head));

        /* Order: 3, 2, 1, 0 */
        assert_se(items[0].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[0]);
        assert_se(items[2].item_list_next == &items[1]);
        assert_se(items[3].item_list_next == &items[2]);

        assert_se(items[0].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Head/tail lookup starting from the opposite end of the list. */
        list_item *cursor = LIST_FIND_HEAD(item_list, &items[0]);
        assert_se(cursor == &items[3]);

        cursor = LIST_FIND_TAIL(item_list, &items[3]);
        assert_se(cursor == &items[0]);

        /* Remove item 1 from the middle. Order: 3, 2, 0 */
        assert_se(LIST_REMOVE(item_list, head, &items[1]) == &items[1]);
        assert_se(LIST_JUST_US(item_list, &items[1]));

        assert_se(items[0].item_list_next == NULL);
        assert_se(items[2].item_list_next == &items[0]);
        assert_se(items[3].item_list_next == &items[2]);

        assert_se(items[0].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Re-insert item 1 right after item 3. Order: 3, 1, 2, 0 */
        assert_se(LIST_INSERT_AFTER(item_list, head, &items[3], &items[1]) == &items[1]);
        assert_se(items[0].item_list_next == NULL);
        assert_se(items[2].item_list_next == &items[0]);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);

        assert_se(items[0].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Remove item 1 again. Order: 3, 2, 0 */
        assert_se(LIST_REMOVE(item_list, head, &items[1]) == &items[1]);
        assert_se(LIST_JUST_US(item_list, &items[1]));

        assert_se(items[0].item_list_next == NULL);
        assert_se(items[2].item_list_next == &items[0]);
        assert_se(items[3].item_list_next == &items[2]);

        assert_se(items[0].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Same position, this time expressed as "before item 2". Order: 3, 1, 2, 0 */
        assert_se(LIST_INSERT_BEFORE(item_list, head, &items[2], &items[1]) == &items[1]);
        assert_se(items[0].item_list_next == NULL);
        assert_se(items[2].item_list_next == &items[0]);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);

        assert_se(items[0].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Remove the tail. Order: 3, 1, 2 */
        assert_se(LIST_REMOVE(item_list, head, &items[0]) == &items[0]);
        assert_se(LIST_JUST_US(item_list, &items[0]));

        assert_se(items[2].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);

        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Insert before the current head: item 0 becomes the new head. Order: 0, 3, 1, 2 */
        assert_se(LIST_INSERT_BEFORE(item_list, head, &items[3], &items[0]) == &items[0]);
        assert_se(items[2].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);
        assert_se(items[0].item_list_next == &items[3]);

        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == &items[0]);
        assert_se(items[0].item_list_prev == NULL);
        assert_se(head == &items[0]);

        /* Remove the head again. Order: 3, 1, 2 */
        assert_se(LIST_REMOVE(item_list, head, &items[0]) == &items[0]);
        assert_se(LIST_JUST_US(item_list, &items[0]));

        assert_se(items[2].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);

        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Inserting "before NULL" places the item at the tail. Order: 3, 1, 2, 0 */
        assert_se(LIST_INSERT_BEFORE(item_list, head, NULL, &items[0]) == &items[0]);
        assert_se(items[0].item_list_next == NULL);
        assert_se(items[2].item_list_next == &items[0]);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);

        assert_se(items[0].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Tear the list down one item at a time: first the tail … Order: 3, 1, 2 */
        assert_se(LIST_REMOVE(item_list, head, &items[0]) == &items[0]);
        assert_se(LIST_JUST_US(item_list, &items[0]));

        assert_se(items[2].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[3].item_list_next == &items[1]);

        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* … then the middle one … Order: 3, 2 */
        assert_se(LIST_REMOVE(item_list, head, &items[1]) == &items[1]);
        assert_se(LIST_JUST_US(item_list, &items[1]));

        assert_se(items[2].item_list_next == NULL);
        assert_se(items[3].item_list_next == &items[2]);

        assert_se(items[2].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* … until the head is the only item left … */
        assert_se(LIST_REMOVE(item_list, head, &items[2]) == &items[2]);
        assert_se(LIST_JUST_US(item_list, &items[2]));
        assert_se(LIST_JUST_US(item_list, head));

        /* … and finally the list is empty. */
        assert_se(LIST_REMOVE(item_list, head, &items[3]) == &items[3]);
        assert_se(LIST_JUST_US(item_list, &items[3]));

        assert_se(head == NULL);

        /* Appending instead of prepending yields the natural order 0, 1, 2, 3. */
        for (i = 0; i < ELEMENTSOF(items); i++) {
                assert_se(LIST_JUST_US(item_list, &items[i]));
                assert_se(LIST_APPEND(item_list, head, &items[i]) == &items[i]);
        }

        assert_se(!LIST_JUST_US(item_list, head));

        assert_se(items[0].item_list_next == &items[1]);
        assert_se(items[1].item_list_next == &items[2]);
        assert_se(items[2].item_list_next == &items[3]);
        assert_se(items[3].item_list_next == NULL);

        assert_se(items[0].item_list_prev == NULL);
        assert_se(items[1].item_list_prev == &items[0]);
        assert_se(items[2].item_list_prev == &items[1]);
        assert_se(items[3].item_list_prev == &items[2]);

        for (i = 0; i < ELEMENTSOF(items); i++)
                assert_se(LIST_REMOVE(item_list, head, &items[i]) == &items[i]);

        assert_se(head == NULL);

        /* Build two separate lists: head holds 1, 0 and head2 holds 3, 2. */
        for (i = 0; i < ELEMENTSOF(items) / 2; i++) {
                LIST_INIT(item_list, &items[i]);
                assert_se(LIST_JUST_US(item_list, &items[i]));
                assert_se(LIST_PREPEND(item_list, head, &items[i]) == &items[i]);
        }

        for (i = ELEMENTSOF(items) / 2; i < ELEMENTSOF(items); i++) {
                LIST_INIT(item_list, &items[i]);
                assert_se(LIST_JUST_US(item_list, &items[i]));
                assert_se(LIST_PREPEND(item_list, head2, &items[i]) == &items[i]);
        }

        assert_se(items[0].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[0]);
        assert_se(items[2].item_list_next == NULL);
        assert_se(items[3].item_list_next == &items[2]);

        assert_se(items[0].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == NULL);
        assert_se(items[2].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Joining head onto head2 empties head; head's items follow head2's. Order: 3, 2, 1, 0 */
        assert_se(LIST_JOIN(item_list, head2, head) == head2);
        assert_se(head == NULL);

        assert_se(items[0].item_list_next == NULL);
        assert_se(items[1].item_list_next == &items[0]);
        assert_se(items[2].item_list_next == &items[1]);
        assert_se(items[3].item_list_next == &items[2]);

        assert_se(items[0].item_list_prev == &items[1]);
        assert_se(items[1].item_list_prev == &items[2]);
        assert_se(items[2].item_list_prev == &items[3]);
        assert_se(items[3].item_list_prev == NULL);

        /* Joining into an empty list moves everything over and empties the source. */
        assert_se(LIST_JOIN(item_list, head, head2) == head);
        assert_se(head2 == NULL);
        assert_se(head);

        for (i = 0; i < ELEMENTSOF(items); i++)
                assert_se(LIST_REMOVE(item_list, head, &items[i]) == &items[i]);

        assert_se(head == NULL);

        /* LIST_POP returns items in reverse order of prepending, then NULL once the list is empty. */
        assert_se(LIST_PREPEND(item_list, head, items + 0) == items + 0);
        assert_se(LIST_PREPEND(item_list, head, items + 1) == items + 1);
        assert_se(LIST_PREPEND(item_list, head, items + 2) == items + 2);

        assert_se(LIST_POP(item_list, head) == items + 2);
        assert_se(LIST_POP(item_list, head) == items + 1);
        assert_se(LIST_POP(item_list, head) == items + 0);
        assert_se(LIST_POP(item_list, head) == NULL);

        /* No-op on an empty list */

        LIST_CLEAR(item_list, head, free);

        /* A non-empty list is cleared */

        assert_se(LIST_PREPEND(item_list, head, new0(list_item, 1)));
        assert_se(LIST_PREPEND(item_list, head, new0(list_item, 1)));

        LIST_CLEAR(item_list, head, free);

        assert_se(head == NULL);

        /* A list can be cleared partially */

        /* The stack item is prepended last, so it ends up as head with the two heap items behind it;
         * clearing from head->item_list_next therefore only passes the heap items to free(). */
        assert_se(LIST_PREPEND(item_list, head, new0(list_item, 1)));
        assert_se(LIST_PREPEND(item_list, head, new0(list_item, 1)));
        assert_se(LIST_PREPEND(item_list, head, items + 0) == items + 0);

        LIST_CLEAR(item_list, head->item_list_next, free);

        assert_se(head == items + 0);
        assert_se(head->item_list_next == NULL);

        return 0;
}
0000000..8d2cec0 --- /dev/null +++ b/src/test/test-load-fragment.c @@ -0,0 +1,1105 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "all-units.h" +#include "alloc-util.h" +#include "capability-util.h" +#include "conf-parser.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "hostname-util.h" +#include "install-printf.h" +#include "install.h" +#include "load-fragment.h" +#include "macro.h" +#include "memory-util.h" +#include "open-file.h" +#include "pcre2-util.h" +#include "rm-rf.h" +#include "specifier.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "user-util.h" + +/* Nontrivial value serves as a placeholder to check that parsing function (didn't) change it */ +#define CGROUP_LIMIT_DUMMY 3 + +static char *runtime_dir = NULL; + +STATIC_DESTRUCTOR_REGISTER(runtime_dir, rm_rf_physical_and_freep); + +/* For testing type compatibility. */ +_unused_ ConfigPerfItemLookup unused_lookup = load_fragment_gperf_lookup; + +TEST_RET(unit_file_get_set) { + int r; + _cleanup_hashmap_free_ Hashmap *h = NULL; + UnitFileList *p; + + h = hashmap_new(&unit_file_list_hash_ops_free); + assert_se(h); + + r = unit_file_get_list(RUNTIME_SCOPE_SYSTEM, NULL, h, NULL, NULL); + if (IN_SET(r, -EPERM, -EACCES)) + return log_tests_skipped_errno(r, "unit_file_get_list"); + + log_full_errno(r == 0 ? 
LOG_INFO : LOG_ERR, r, + "unit_file_get_list: %m"); + if (r < 0) + return EXIT_FAILURE; + + HASHMAP_FOREACH(p, h) + printf("%s = %s\n", p->path, unit_file_state_to_string(p->state)); + + return 0; +} + +static void check_execcommand(ExecCommand *c, + const char* path, + const char* argv0, + const char* argv1, + const char* argv2, + bool ignore) { + size_t n; + + assert_se(c); + log_info("expect: \"%s\" [\"%s\" \"%s\" \"%s\"]", + path, argv0 ?: path, strnull(argv1), strnull(argv2)); + n = strv_length(c->argv); + log_info("actual: \"%s\" [\"%s\" \"%s\" \"%s\"]", + c->path, c->argv[0], n > 0 ? c->argv[1] : "(null)", n > 1 ? c->argv[2] : "(null)"); + assert_se(streq(c->path, path)); + assert_se(streq(c->argv[0], argv0 ?: path)); + if (n > 0) + assert_se(streq_ptr(c->argv[1], argv1)); + if (n > 1) + assert_se(streq_ptr(c->argv[2], argv2)); + assert_se(!!(c->flags & EXEC_COMMAND_IGNORE_FAILURE) == ignore); +} + +TEST(config_parse_exec) { + /* int config_parse_exec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) */ + int r; + + ExecCommand *c = NULL, *c1; + const char *ccc; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_(unit_freep) Unit *u = NULL; + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if (manager_errno_skip_test(r)) { + log_notice_errno(r, "Skipping test: manager_new: %m"); + return; + } + + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, sizeof(Service))); + + log_info("/* basic test */"); + r = config_parse_exec(NULL, "fake", 1, "section", 1, + "LValue", 0, "/RValue r1", + &c, u); + assert_se(r >= 0); + check_execcommand(c, "/RValue", "/RValue", "r1", NULL, false); + + r = config_parse_exec(NULL, "fake", 2, "section", 1, + "LValue", 0, "/RValue///slashes r1///", + &c, u); + + log_info("/* test slashes */"); + assert_se(r 
>= 0); + c1 = c->command_next; + check_execcommand(c1, "/RValue/slashes", "/RValue///slashes", "r1///", NULL, false); + + log_info("/* trailing slash */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "/RValue/ argv0 r1", + &c, u); + assert_se(r == -ENOEXEC); + assert_se(c1->command_next == NULL); + + log_info("/* honour_argv0 */"); + r = config_parse_exec(NULL, "fake", 3, "section", 1, + "LValue", 0, "@/RValue///slashes2 ///argv0 r1", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue/slashes2", "///argv0", "r1", NULL, false); + + log_info("/* honour_argv0, no args */"); + r = config_parse_exec(NULL, "fake", 3, "section", 1, + "LValue", 0, "@/RValue", + &c, u); + assert_se(r == -ENOEXEC); + assert_se(c1->command_next == NULL); + + log_info("/* no command, whitespace only, reset */"); + r = config_parse_exec(NULL, "fake", 3, "section", 1, + "LValue", 0, "", + &c, u); + assert_se(r == 0); + assert_se(c == NULL); + + log_info("/* ignore && honour_argv0 */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "-@/RValue///slashes3 argv0a r1", + &c, u); + assert_se(r >= 0); + c1 = c; + check_execcommand(c1, "/RValue/slashes3", "argv0a", "r1", NULL, true); + + log_info("/* ignore && honour_argv0 */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "@-/RValue///slashes4 argv0b r1", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue/slashes4", "argv0b", "r1", NULL, true); + + log_info("/* ignore && ignore */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "--/RValue argv0 r1", + &c, u); + assert_se(r == 0); + assert_se(c1->command_next == NULL); + + log_info("/* ignore && ignore (2) */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "-@-/RValue argv0 r1", + &c, u); + assert_se(r == 0); + assert_se(c1->command_next == NULL); + + log_info("/* semicolon */"); + r = 
config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "-@/RValue argv0 r1 ; " + "/goo/goo boo", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue", "argv0", "r1", NULL, true); + + c1 = c1->command_next; + check_execcommand(c1, "/goo/goo", NULL, "boo", NULL, false); + + log_info("/* two semicolons in a row */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "-@/RValue argv0 r1 ; ; " + "/goo/goo boo", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue", "argv0", "r1", NULL, true); + c1 = c1->command_next; + check_execcommand(c1, "/goo/goo", "/goo/goo", "boo", NULL, false); + + log_info("/* trailing semicolon */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "-@/RValue argv0 r1 ; ", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue", "argv0", "r1", NULL, true); + + assert_se(c1->command_next == NULL); + + log_info("/* trailing semicolon, no whitespace */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "-@/RValue argv0 r1 ;", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue", "argv0", "r1", NULL, true); + + assert_se(c1->command_next == NULL); + + log_info("/* trailing semicolon in single quotes */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "-@/RValue argv0 r1 ';'", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/RValue", "argv0", "r1", ";", true); + + log_info("/* escaped semicolon */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/bin/find \\;", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/bin/find", NULL, ";", NULL, false); + + log_info("/* escaped semicolon with following arg */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/sbin/find \\; /x", + &c, u); + 
assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/sbin/find", NULL, ";", "/x", false); + + log_info("/* escaped semicolon as part of an expression */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/sbin/find \\;x", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/sbin/find", NULL, "\\;x", NULL, false); + + log_info("/* encoded semicolon */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/bin/find \\073", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/bin/find", NULL, ";", NULL, false); + + log_info("/* quoted semicolon */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/bin/find \";\"", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/bin/find", NULL, ";", NULL, false); + + log_info("/* quoted semicolon with following arg */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/sbin/find \";\" /x", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/sbin/find", NULL, ";", "/x", false); + + log_info("/* spaces in the filename */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "\"/PATH WITH SPACES/daemon\" -1 -2", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/PATH WITH SPACES/daemon", NULL, "-1", "-2", false); + + log_info("/* spaces in the filename, no args */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "\"/PATH WITH SPACES/daemon -1 -2\"", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/PATH WITH SPACES/daemon -1 -2", NULL, NULL, NULL, false); + + log_info("/* spaces in the filename, everything quoted */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "\"/PATH WITH SPACES/daemon\" \"-1\" '-2'", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; 
+ check_execcommand(c1, + "/PATH WITH SPACES/daemon", NULL, "-1", "-2", false); + + log_info("/* escaped spaces in the filename */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "\"/PATH\\sWITH\\sSPACES/daemon\" '-1 -2'", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/PATH WITH SPACES/daemon", NULL, "-1 -2", NULL, false); + + log_info("/* escaped spaces in the filename (2) */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "\"/PATH\\x20WITH\\x20SPACES/daemon\" \"-1 -2\"", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/PATH WITH SPACES/daemon", NULL, "-1 -2", NULL, false); + + for (ccc = "abfnrtv\\\'\"x"; *ccc; ccc++) { + /* \\x is an incomplete hexadecimal sequence, invalid because of the slash */ + char path[] = "/path\\X"; + path[sizeof(path) - 2] = *ccc; + + log_info("/* invalid character: \\%c */", *ccc); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, path, + &c, u); + assert_se(r == -ENOEXEC); + assert_se(c1->command_next == NULL); + } + + log_info("/* valid character: \\s */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "/path\\s", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/path ", NULL, NULL, NULL, false); + + log_info("/* quoted backslashes */"); + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, + "/bin/grep '\\w+\\K'", + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, "/bin/grep", NULL, "\\w+\\K", NULL, false); + + log_info("/* trailing backslash: \\ */"); + /* backslash is invalid */ + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "/path\\", + &c, u); + assert_se(r == -ENOEXEC); + assert_se(c1->command_next == NULL); + + log_info("/* missing ending ' */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "/path 'foo", + &c, u); + assert_se(r == 
-ENOEXEC); + assert_se(c1->command_next == NULL); + + log_info("/* missing ending ' with trailing backslash */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "/path 'foo\\", + &c, u); + assert_se(r == -ENOEXEC); + assert_se(c1->command_next == NULL); + + log_info("/* invalid space between modifiers */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "- /path", + &c, u); + assert_se(r == 0); + assert_se(c1->command_next == NULL); + + log_info("/* only modifiers, no path */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "-", + &c, u); + assert_se(r == 0); + assert_se(c1->command_next == NULL); + + log_info("/* long arg */"); /* See issue #22957. */ + + char x[LONG_LINE_MAX-100], *y; + y = mempcpy(x, "/bin/echo ", STRLEN("/bin/echo ")); + memset(y, 'x', sizeof(x) - STRLEN("/bin/echo ") - 1); + x[sizeof(x) - 1] = '\0'; + + r = config_parse_exec(NULL, "fake", 5, "section", 1, + "LValue", 0, x, + &c, u); + assert_se(r >= 0); + c1 = c1->command_next; + check_execcommand(c1, + "/bin/echo", NULL, y, NULL, false); + + log_info("/* empty argument, reset */"); + r = config_parse_exec(NULL, "fake", 4, "section", 1, + "LValue", 0, "", + &c, u); + assert_se(r == 0); + assert_se(c == NULL); + + exec_command_free_list(c); +} + +TEST(config_parse_log_extra_fields) { + /* int config_parse_log_extra_fields( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) */ + + int r; + + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_(unit_freep) Unit *u = NULL; + ExecContext c = {}; + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if (manager_errno_skip_test(r)) { + log_notice_errno(r, "Skipping test: manager_new: %m"); + return; + } + + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, 
sizeof(Service))); + + log_info("/* %s – basic test */", __func__); + r = config_parse_log_extra_fields(NULL, "fake", 1, "section", 1, + "LValue", 0, "FOO=BAR \"QOOF=quux ' ' \"", + &c, u); + assert_se(r >= 0); + assert_se(c.n_log_extra_fields == 2); + assert_se(strneq(c.log_extra_fields[0].iov_base, "FOO=BAR", c.log_extra_fields[0].iov_len)); + assert_se(strneq(c.log_extra_fields[1].iov_base, "QOOF=quux ' ' ", c.log_extra_fields[1].iov_len)); + + log_info("/* %s – add some */", __func__); + r = config_parse_log_extra_fields(NULL, "fake", 1, "section", 1, + "LValue", 0, "FOO2=BAR2 QOOF2=quux ' '", + &c, u); + assert_se(r >= 0); + assert_se(c.n_log_extra_fields == 4); + assert_se(strneq(c.log_extra_fields[0].iov_base, "FOO=BAR", c.log_extra_fields[0].iov_len)); + assert_se(strneq(c.log_extra_fields[1].iov_base, "QOOF=quux ' ' ", c.log_extra_fields[1].iov_len)); + assert_se(strneq(c.log_extra_fields[2].iov_base, "FOO2=BAR2", c.log_extra_fields[2].iov_len)); + assert_se(strneq(c.log_extra_fields[3].iov_base, "QOOF2=quux", c.log_extra_fields[3].iov_len)); + + exec_context_dump(&c, stdout, " --> "); + + log_info("/* %s – reset */", __func__); + r = config_parse_log_extra_fields(NULL, "fake", 1, "section", 1, + "LValue", 0, "", + &c, u); + assert_se(r >= 0); + assert_se(c.n_log_extra_fields == 0); + + exec_context_free_log_extra_fields(&c); + + log_info("/* %s – bye */", __func__); +} + +TEST(install_printf, .sd_booted = true) { + char name[] = "name.service", + path[] = "/run/systemd/system/name.service"; + InstallInfo i = { .name = name, .path = path, }; + InstallInfo i2 = { .name= name, .path = path, }; + char name3[] = "name@inst.service", + path3[] = "/run/systemd/system/name.service"; + InstallInfo i3 = { .name = name3, .path = path3, }; + InstallInfo i4 = { .name = name3, .path = path3, }; + + _cleanup_free_ char *mid = NULL, *bid = NULL, *host = NULL, *gid = NULL, *group = NULL, *uid = NULL, *user = NULL; + + if (sd_id128_get_machine(NULL) >= 0) + 
assert_se(specifier_machine_id('m', NULL, NULL, NULL, &mid) >= 0 && mid); + if (sd_booted() > 0) + assert_se(specifier_boot_id('b', NULL, NULL, NULL, &bid) >= 0 && bid); + assert_se(host = gethostname_malloc()); + assert_se(group = gid_to_name(getgid())); + assert_se(asprintf(&gid, UID_FMT, getgid()) >= 0); + assert_se(user = uid_to_name(getuid())); + assert_se(asprintf(&uid, UID_FMT, getuid()) >= 0); + +#define expect(scope, src, pattern, result) \ + do { \ + _cleanup_free_ char *t = NULL, \ + *d1 = ASSERT_PTR(strdup(i.name)), \ + *d2 = ASSERT_PTR(strdup(i.path)); \ + int r = install_name_printf(scope, &src, pattern, &t); \ + assert_se(result ? r >= 0 : r < 0); \ + memzero(i.name, strlen(i.name)); \ + memzero(i.path, strlen(i.path)); \ + if (result) { \ + printf("%s\n", t); \ + assert_se(streq(t, result)); \ + } else \ + assert_se(!t); \ + strcpy(i.name, d1); \ + strcpy(i.path, d2); \ + } while (false) + + expect(RUNTIME_SCOPE_SYSTEM, i, "%n", "name.service"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%N", "name"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%p", "name"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%i", ""); + expect(RUNTIME_SCOPE_SYSTEM, i, "%j", "name"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%g", "root"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%G", "0"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%u", "root"); + expect(RUNTIME_SCOPE_SYSTEM, i, "%U", "0"); + + expect(RUNTIME_SCOPE_SYSTEM, i, "%m", mid); + expect(RUNTIME_SCOPE_SYSTEM, i, "%b", bid); + expect(RUNTIME_SCOPE_SYSTEM, i, "%H", host); + + expect(RUNTIME_SCOPE_SYSTEM, i2, "%g", "root"); + expect(RUNTIME_SCOPE_SYSTEM, i2, "%G", "0"); + expect(RUNTIME_SCOPE_SYSTEM, i2, "%u", "root"); + expect(RUNTIME_SCOPE_SYSTEM, i2, "%U", "0"); + + expect(RUNTIME_SCOPE_USER, i2, "%g", group); + expect(RUNTIME_SCOPE_USER, i2, "%G", gid); + expect(RUNTIME_SCOPE_USER, i2, "%u", user); + expect(RUNTIME_SCOPE_USER, i2, "%U", uid); + + /* gcc-12.0.1-0.9.fc36.x86_64 insist that streq(…, NULL) is called, + * even though the call is inside of a 
conditional where the pointer is checked. :( */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnonnull" + expect(RUNTIME_SCOPE_GLOBAL, i2, "%g", NULL); + expect(RUNTIME_SCOPE_GLOBAL, i2, "%G", NULL); + expect(RUNTIME_SCOPE_GLOBAL, i2, "%u", NULL); + expect(RUNTIME_SCOPE_GLOBAL, i2, "%U", NULL); +#pragma GCC diagnostic pop + + expect(RUNTIME_SCOPE_SYSTEM, i3, "%n", "name@inst.service"); + expect(RUNTIME_SCOPE_SYSTEM, i3, "%N", "name@inst"); + expect(RUNTIME_SCOPE_SYSTEM, i3, "%p", "name"); + expect(RUNTIME_SCOPE_USER, i3, "%g", group); + expect(RUNTIME_SCOPE_USER, i3, "%G", gid); + expect(RUNTIME_SCOPE_USER, i3, "%u", user); + expect(RUNTIME_SCOPE_USER, i3, "%U", uid); + + expect(RUNTIME_SCOPE_SYSTEM, i3, "%m", mid); + expect(RUNTIME_SCOPE_SYSTEM, i3, "%b", bid); + expect(RUNTIME_SCOPE_SYSTEM, i3, "%H", host); + + expect(RUNTIME_SCOPE_USER, i4, "%g", group); + expect(RUNTIME_SCOPE_USER, i4, "%G", gid); + expect(RUNTIME_SCOPE_USER, i4, "%u", user); + expect(RUNTIME_SCOPE_USER, i4, "%U", uid); +} + +/* Returns a 64-bit mask with only bit number cap set. NOTE(review): the shift is undefined behaviour in C when cap is negative or 64 and above; the callers visible here pass CAP_NET_RAW and CAP_NET_ADMIN only. */ static uint64_t make_cap(int cap) { + return ((uint64_t) 1ULL << (uint64_t) cap); +} + +/* Parses several CapabilityBoundingSet values into the same mask variable and checks the mask after every call, so each check covers the effect of a value on the previously parsed state. */ TEST(config_parse_capability_set) { + /* int config_parse_capability_set( const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata) */ + int r; + uint64_t capability_bounding_set = 0; + + r = config_parse_capability_set(NULL, "fake", 1, "section", 1, + "CapabilityBoundingSet", 0, "CAP_NET_RAW", + &capability_bounding_set, NULL); + assert_se(r >= 0); + assert_se(capability_bounding_set == make_cap(CAP_NET_RAW)); + + /* A second plain value is expected to be added to the mask rather than replace it. */ r = config_parse_capability_set(NULL, "fake", 1, "section", 1, + "CapabilityBoundingSet", 0, "CAP_NET_ADMIN", + &capability_bounding_set, NULL); + assert_se(r >= 0); + assert_se(capability_bounding_set == (make_cap(CAP_NET_RAW) | make_cap(CAP_NET_ADMIN))); + + r = config_parse_capability_set(NULL, "fake", 1, "section", 1, 
+ "CapabilityBoundingSet", 0, "~CAP_NET_ADMIN", + &capability_bounding_set, NULL); + assert_se(r >= 0); + assert_se(capability_bounding_set == make_cap(CAP_NET_RAW)); + + r = config_parse_capability_set(NULL, "fake", 1, "section", 1, + "CapabilityBoundingSet", 0, "", + &capability_bounding_set, NULL); + assert_se(r >= 0); + assert_se(capability_bounding_set == UINT64_C(0)); + + r = config_parse_capability_set(NULL, "fake", 1, "section", 1, + "CapabilityBoundingSet", 0, "~", + &capability_bounding_set, NULL); + assert_se(r >= 0); + assert_se(cap_test_all(capability_bounding_set)); + + capability_bounding_set = 0; + r = config_parse_capability_set(NULL, "fake", 1, "section", 1, + "CapabilityBoundingSet", 0, " 'CAP_NET_RAW' WAT_CAP??? CAP_NET_ADMIN CAP'_trailing_garbage", + &capability_bounding_set, NULL); + assert_se(r >= 0); + assert_se(capability_bounding_set == (make_cap(CAP_NET_RAW) | make_cap(CAP_NET_ADMIN))); +} + +TEST(config_parse_rlimit) { + struct rlimit * rl[_RLIMIT_MAX] = {}; + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "55", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == 55); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == rl[RLIMIT_NOFILE]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "55:66", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == 55); + assert_se(rl[RLIMIT_NOFILE]->rlim_max == 66); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "infinity", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == RLIM_INFINITY); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == rl[RLIMIT_NOFILE]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "infinity:infinity", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + 
assert_se(rl[RLIMIT_NOFILE]->rlim_cur == RLIM_INFINITY); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == rl[RLIMIT_NOFILE]->rlim_max); + + rl[RLIMIT_NOFILE]->rlim_cur = 10; + rl[RLIMIT_NOFILE]->rlim_max = 20; + + /* Invalid values don't change rl */ + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "10:20:30", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == 10); + assert_se(rl[RLIMIT_NOFILE]->rlim_max == 20); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "wat:wat", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == 10); + assert_se(rl[RLIMIT_NOFILE]->rlim_max == 20); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "66:wat", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == 10); + assert_se(rl[RLIMIT_NOFILE]->rlim_max == 20); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitNOFILE", RLIMIT_NOFILE, "200:100", rl, NULL) >= 0); + assert_se(rl[RLIMIT_NOFILE]); + assert_se(rl[RLIMIT_NOFILE]->rlim_cur == 10); + assert_se(rl[RLIMIT_NOFILE]->rlim_max == 20); + + rl[RLIMIT_NOFILE] = mfree(rl[RLIMIT_NOFILE]); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitCPU", RLIMIT_CPU, "56", rl, NULL) >= 0); + assert_se(rl[RLIMIT_CPU]); + assert_se(rl[RLIMIT_CPU]->rlim_cur == 56); + assert_se(rl[RLIMIT_CPU]->rlim_cur == rl[RLIMIT_CPU]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitCPU", RLIMIT_CPU, "57s", rl, NULL) >= 0); + assert_se(rl[RLIMIT_CPU]); + assert_se(rl[RLIMIT_CPU]->rlim_cur == 57); + assert_se(rl[RLIMIT_CPU]->rlim_cur == rl[RLIMIT_CPU]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitCPU", RLIMIT_CPU, "40s:1m", rl, NULL) >= 0); + assert_se(rl[RLIMIT_CPU]); + assert_se(rl[RLIMIT_CPU]->rlim_cur == 40); + 
assert_se(rl[RLIMIT_CPU]->rlim_max == 60); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitCPU", RLIMIT_CPU, "infinity", rl, NULL) >= 0); + assert_se(rl[RLIMIT_CPU]); + assert_se(rl[RLIMIT_CPU]->rlim_cur == RLIM_INFINITY); + assert_se(rl[RLIMIT_CPU]->rlim_cur == rl[RLIMIT_CPU]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitCPU", RLIMIT_CPU, "1234ms", rl, NULL) >= 0); + assert_se(rl[RLIMIT_CPU]); + assert_se(rl[RLIMIT_CPU]->rlim_cur == 2); + assert_se(rl[RLIMIT_CPU]->rlim_cur == rl[RLIMIT_CPU]->rlim_max); + + rl[RLIMIT_CPU] = mfree(rl[RLIMIT_CPU]); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "58", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == 58); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == rl[RLIMIT_RTTIME]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "58:60", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == 58); + assert_se(rl[RLIMIT_RTTIME]->rlim_max == 60); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "59s", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == 59 * USEC_PER_SEC); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == rl[RLIMIT_RTTIME]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "59s:123s", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == 59 * USEC_PER_SEC); + assert_se(rl[RLIMIT_RTTIME]->rlim_max == 123 * USEC_PER_SEC); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "infinity", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == RLIM_INFINITY); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == rl[RLIMIT_RTTIME]->rlim_max); + + 
assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "infinity:infinity", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == RLIM_INFINITY); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == rl[RLIMIT_RTTIME]->rlim_max); + + assert_se(config_parse_rlimit(NULL, "fake", 1, "section", 1, "LimitRTTIME", RLIMIT_RTTIME, "2345ms", rl, NULL) >= 0); + assert_se(rl[RLIMIT_RTTIME]); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == 2345 * USEC_PER_MSEC); + assert_se(rl[RLIMIT_RTTIME]->rlim_cur == rl[RLIMIT_RTTIME]->rlim_max); + + rl[RLIMIT_RTTIME] = mfree(rl[RLIMIT_RTTIME]); +} + +/* Checks config_parse_pass_environ() against one string vector: a plain list is stored in order, an empty value empties the list, and from a mixed list only normal_name is expected to be kept. */ TEST(config_parse_pass_environ) { + /* int config_parse_pass_environ( const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata) */ + int r; + _cleanup_strv_free_ char **passenv = NULL; + + r = config_parse_pass_environ(NULL, "fake", 1, "section", 1, + "PassEnvironment", 0, "A B", + &passenv, NULL); + assert_se(r >= 0); + assert_se(strv_length(passenv) == 2); + assert_se(streq(passenv[0], "A")); + assert_se(streq(passenv[1], "B")); + + /* An empty value is expected to empty the list built above. */ r = config_parse_pass_environ(NULL, "fake", 1, "section", 1, + "PassEnvironment", 0, "", + &passenv, NULL); + assert_se(r >= 0); + assert_se(strv_isempty(passenv)); + + /* The call must still return a non-negative value although only one of the listed words ends up in the vector. */ r = config_parse_pass_environ(NULL, "fake", 1, "section", 1, + "PassEnvironment", 0, "'invalid name' 'normal_name' A=1 'special_name$$' \\", + &passenv, NULL); + assert_se(r >= 0); + assert_se(strv_length(passenv) == 1); + assert_se(streq(passenv[0], "normal_name")); +} + +/* Checks config_parse_unit_env_file() on a unit named foobar.service: a relative path adds nothing, absolute paths are appended in order, an empty value empties the list and the %n specifier is expanded. */ TEST(config_parse_unit_env_file) { + /* int config_parse_unit_env_file( const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata) */ + + _cleanup_(manager_freep) Manager *m = NULL; + Unit *u; + 
_cleanup_strv_free_ char **files = NULL; + int r; + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if (manager_errno_skip_test(r)) { + log_notice_errno(r, "Skipping test: manager_new: %m"); + return; + } + + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "foobar.service") == 0); + + /* A path that is not absolute is expected to add nothing: r is 0 and the list stays empty. */ r = config_parse_unit_env_file(u->id, "fake", 1, "section", 1, + "EnvironmentFile", 0, "not-absolute", + &files, u); + assert_se(r == 0); + assert_se(strv_isempty(files)); + + /* Absolute paths are expected to be appended in call order. */ r = config_parse_unit_env_file(u->id, "fake", 1, "section", 1, + "EnvironmentFile", 0, "/absolute1", + &files, u); + assert_se(r == 0); + assert_se(strv_length(files) == 1); + + r = config_parse_unit_env_file(u->id, "fake", 1, "section", 1, + "EnvironmentFile", 0, "/absolute2", + &files, u); + assert_se(r == 0); + assert_se(strv_length(files) == 2); + assert_se(streq(files[0], "/absolute1")); + assert_se(streq(files[1], "/absolute2")); + + /* An empty value is expected to empty the list. */ r = config_parse_unit_env_file(u->id, "fake", 1, "section", 1, + "EnvironmentFile", 0, "", + &files, u); + assert_se(r == 0); + assert_se(strv_isempty(files)); + + /* The %n specifier is expected to expand to foobar.service, the name added to the unit above. */ r = config_parse_unit_env_file(u->id, "fake", 1, "section", 1, + "EnvironmentFile", 0, "/path/%n.conf", + &files, u); + assert_se(r == 0); + assert_se(strv_length(files) == 1); + assert_se(streq(files[0], "/path/foobar.service.conf")); +} + +/* Smoke test: calls unit_dump_config_items() with stdout and asserts nothing about the output. */ TEST(unit_dump_config_items) { + unit_dump_config_items(stdout); +} + +/* Table-driven check of config_parse_memory_limit(). Each row gives the setting name, the string to parse, the CGroupContext field to read back and the value expected in that field. */ TEST(config_parse_memory_limit) { + /* int config_parse_memory_limit( const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata) */ + CGroupContext c; + struct limit_test { + const char *limit; + const char *value; + uint64_t *result; + uint64_t expected; + } limit_tests[]= { + { "MemoryMin", "", &c.memory_min, CGROUP_LIMIT_MIN }, + { "MemoryMin", 
"0", &c.memory_min, CGROUP_LIMIT_MIN }, + { "MemoryMin", "10", &c.memory_min, 10 }, + { "MemoryMin", "infinity", &c.memory_min, CGROUP_LIMIT_MAX }, + { "MemoryLow", "", &c.memory_low, CGROUP_LIMIT_MIN }, + { "MemoryLow", "0", &c.memory_low, CGROUP_LIMIT_MIN }, + { "MemoryLow", "10", &c.memory_low, 10 }, + { "MemoryLow", "infinity", &c.memory_low, CGROUP_LIMIT_MAX }, + { "MemoryHigh", "", &c.memory_high, CGROUP_LIMIT_MAX }, + { "MemoryHigh", "0", &c.memory_high, CGROUP_LIMIT_DUMMY }, + { "MemoryHigh", "10", &c.memory_high, 10 }, + { "MemoryHigh", "infinity", &c.memory_high, CGROUP_LIMIT_MAX }, + { "MemoryMax", "", &c.memory_max, CGROUP_LIMIT_MAX }, + { "MemoryMax", "0", &c.memory_max, CGROUP_LIMIT_DUMMY }, + { "MemoryMax", "10", &c.memory_max, 10 }, + { "MemoryMax", "infinity", &c.memory_max, CGROUP_LIMIT_MAX }, + }; + size_t i; + int r; + + for (i = 0; i < ELEMENTSOF(limit_tests); i++) { + c.memory_min = CGROUP_LIMIT_DUMMY; + c.memory_low = CGROUP_LIMIT_DUMMY; + c.memory_high = CGROUP_LIMIT_DUMMY; + c.memory_max = CGROUP_LIMIT_DUMMY; + r = config_parse_memory_limit(NULL, "fake", 1, "section", 1, + limit_tests[i].limit, 1, + limit_tests[i].value, &c, NULL); + log_info("%s=%s\t%"PRIu64"==%"PRIu64, + limit_tests[i].limit, limit_tests[i].value, + *limit_tests[i].result, limit_tests[i].expected); + assert_se(r >= 0); + assert_se(*limit_tests[i].result == limit_tests[i].expected); + } + +} + +TEST(contains_instance_specifier_superset) { + assert_se(contains_instance_specifier_superset("foobar@a%i")); + assert_se(contains_instance_specifier_superset("foobar@%ia")); + assert_se(contains_instance_specifier_superset("foobar@%n")); + assert_se(contains_instance_specifier_superset("foobar@%n.service")); + assert_se(contains_instance_specifier_superset("foobar@%N")); + assert_se(contains_instance_specifier_superset("foobar@%N.service")); + assert_se(contains_instance_specifier_superset("foobar@baz.%N.service")); + assert_se(contains_instance_specifier_superset("@%N.service")); 
+ assert_se(contains_instance_specifier_superset("@%N")); + assert_se(contains_instance_specifier_superset("@%a%N")); + + assert_se(!contains_instance_specifier_superset("foobar@%i.service")); + assert_se(!contains_instance_specifier_superset("foobar%ia.service")); + assert_se(!contains_instance_specifier_superset("foobar@%%n.service")); + assert_se(!contains_instance_specifier_superset("foobar@baz.service")); + assert_se(!contains_instance_specifier_superset("%N.service")); + assert_se(!contains_instance_specifier_superset("%N")); + assert_se(!contains_instance_specifier_superset("@%aN")); + assert_se(!contains_instance_specifier_superset("@%a%b")); +} + +TEST(unit_is_recursive_template_dependency) { + _cleanup_(manager_freep) Manager *m = NULL; + Unit *u; + int r; + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if (manager_errno_skip_test(r)) { + log_notice_errno(r, "Skipping test: manager_new: %m"); + return; + } + + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "foobar@1.service") == 0); + u->fragment_path = strdup("/foobar@.service"); + + assert_se(hashmap_put_strdup(&m->unit_id_map, "foobar@foobar@123.service", "/foobar@.service")); + assert_se(hashmap_put_strdup(&m->unit_id_map, "foobar@foobar@456.service", "/custom.service")); + + /* Test that %n, %N and any extension of %i specifiers in the instance are detected as recursive. 
*/ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@%N.service") == 1); + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@%n.service") == 1); + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@a%i.service") == 1); + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@%ia.service") == 1); + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@%x%n.service") == 1); + /* Test that %i on its own is not detected as recursive. */ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@%i.service") == 0); + /* Test that a specifier other than %i, %n and %N is not detected as recursive. */ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@%xn.service") == 0); + /* Test that an expanded specifier is not detected as recursive. */ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.service", "foobar@foobar@123.service") == 0); + /* Test that a dependency with a custom fragment path is not detected as recursive. */ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@456.service", "foobar@%n.service") == 0); + /* Test that a dependency without a fragment path is not detected as recursive. */ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@789.service", "foobar@%n.service") == 0); + /* Test that a dependency with a different prefix is not detected as recursive. */ + assert_se(unit_is_likely_recursive_template_dependency(u, "quux@foobar@123.service", "quux@%n.service") == 0); + /* Test that a dependency of a different type is not detected as recursive. 
*/ + assert_se(unit_is_likely_recursive_template_dependency(u, "foobar@foobar@123.mount", "foobar@%n.mount") == 0); +} + +#define TEST_PATTERN(_regex, _allowed_patterns_count, _denied_patterns_count) \ + { \ + .regex = _regex, \ + .allowed_patterns_count = _allowed_patterns_count, \ + .denied_patterns_count = _denied_patterns_count \ + } + +/* Feeds a sequence of LogFilterPatterns values to config_parse_log_filter_patterns() on one ExecContext and checks the sizes of the allowed and denied pattern sets after every value. */ TEST(config_parse_log_filter_patterns) { + ExecContext c = {}; + + /* Each row built with TEST_PATTERN holds the value to parse followed by the expected sizes of the allowed and the denied set afterwards. Rows are applied in order to the same context, so counts carry over from row to row; the rows with an empty value expect both sets to be empty again. */ static const struct { + const char *regex; + size_t allowed_patterns_count; + size_t denied_patterns_count; + } regex_tests[] = { + TEST_PATTERN("", 0, 0), + TEST_PATTERN(".*", 1, 0), + TEST_PATTERN("~.*", 1, 1), + TEST_PATTERN("", 0, 0), + TEST_PATTERN("~.*", 0, 1), + TEST_PATTERN("[.*", 0, 1), /* Invalid pattern. */ + TEST_PATTERN(".*gg.*", 1, 1), + TEST_PATTERN("~.*", 1, 1), /* Already in the patterns list. */ + TEST_PATTERN("[.*", 1, 1), /* Invalid pattern. */ + TEST_PATTERN("\\x7ehello", 2, 1), + TEST_PATTERN("", 0, 0), + TEST_PATTERN("~foobar", 0, 1), + }; + + /* Skip the test when dlopen_pcre2() reports that PCRE2 is not supported. */ if (ERRNO_IS_NOT_SUPPORTED(dlopen_pcre2())) + return (void) log_tests_skipped("PCRE2 support is not available"); + + for (size_t i = 0; i < ELEMENTSOF(regex_tests); i++) { + /* Parsing must never fail hard, not even for the rows marked as invalid patterns. */ assert_se(config_parse_log_filter_patterns(NULL, "fake", 1, "section", 1, "LogFilterPatterns", 1, + regex_tests[i].regex, &c, NULL) >= 0); + + assert_se(set_size(c.log_filter_allowed_patterns) == regex_tests[i].allowed_patterns_count); + assert_se(set_size(c.log_filter_denied_patterns) == regex_tests[i].denied_patterns_count); + + /* Ensure `~` is properly removed */ + const char *p; + SET_FOREACH(p, c.log_filter_allowed_patterns) + assert_se(p && p[0] != '~'); + SET_FOREACH(p, c.log_filter_denied_patterns) + assert_se(p && p[0] != '~'); + } + + exec_context_done(&c); +} + +/* Checks config_parse_open_file() with a fully specified value, a value with an empty fdname field and an empty value. Skipped when manager_new() fails with an error accepted by manager_errno_skip_test(). */ TEST(config_parse_open_file) { + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_(unit_freep) Unit *u = NULL; + _cleanup_(open_file_freep) OpenFile *of = NULL; + int r; + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if 
(manager_errno_skip_test(r)) { + log_notice_errno(r, "Skipping test: manager_new: %m"); + return; + } + + assert_se(r >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "foobar.service") == 0); + + /* All three colon-separated fields given: path, fdname and option. */ r = config_parse_open_file(NULL, "fake", 1, "section", 1, + "OpenFile", 0, "/proc/1/ns/mnt:host-mount-namespace:read-only", + &of, u); + assert_se(r >= 0); + assert_se(of); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == OPENFILE_READ_ONLY); + + /* With an empty fdname field the fdname is expected to be mnt, which matches the last component of the path. */ of = open_file_free(of); + r = config_parse_open_file(NULL, "fake", 1, "section", 1, + "OpenFile", 0, "/proc/1/ns/mnt::read-only", + &of, u); + assert_se(r >= 0); + assert_se(of); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "mnt")); + assert_se(of->flags == OPENFILE_READ_ONLY); + + /* After parsing an empty value the pointer is expected to be NULL again. */ r = config_parse_open_file(NULL, "fake", 1, "section", 1, + "OpenFile", 0, "", + &of, u); + assert_se(r >= 0); + assert_se(!of); +} + +/* Setup hook handed to DEFINE_TEST_MAIN_WITH_INTRO below. Returns the result of log_tests_skipped() when enter_cgroup_subroot() reports -ENOMEDIUM; otherwise stores the result of setup_fake_runtime_dir() in runtime_dir, asserts it is non-NULL and returns EXIT_SUCCESS. */ static int intro(void) { + if (enter_cgroup_subroot(NULL) == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + assert_se(runtime_dir = setup_fake_runtime_dir()); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-local-addresses.c b/src/test/test-local-addresses.c new file mode 100644 index 0000000..5a02465 --- /dev/null +++ b/src/test/test-local-addresses.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "in-addr-util.h" +#include "local-addresses.h" +#include "tests.h" + +/* Logs one debug line for each of the n entries of a: address family name, ifindex, scope, metric and the address as formatted by in_addr_to_string(). Formatting failures trip assert_se. */ static void print_local_addresses(struct local_address *a, unsigned n) { + for (unsigned i = 0; i < n; i++) { + _cleanup_free_ char *b = NULL; + + assert_se(in_addr_to_string(a[i].family, &a[i].address, &b) >= 0); + log_debug("%s if%i scope=%i metric=%u address=%s", af_to_name(a[i].family), 
a[i].ifindex, a[i].scope, a[i].metric, b); + } +} + +/* Calls local_addresses() for ifindex 0 and 1 with AF_INET, AF_INET6 and AF_UNSPEC, then local_gateways() and local_outbounds(). Each call only has to return a non-negative count; the returned array is logged with print_local_addresses() and freed before the next call. */ TEST(local_addresses) { + struct local_address *a = NULL; + int n; + + n = local_addresses(NULL, 0, AF_INET, &a); + assert_se(n >= 0); + log_debug("/* Local Addresses(ifindex:0, AF_INET) */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_addresses(NULL, 0, AF_INET6, &a); + assert_se(n >= 0); + log_debug("/* Local Addresses(ifindex:0, AF_INET6) */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_addresses(NULL, 0, AF_UNSPEC, &a); + assert_se(n >= 0); + log_debug("/* Local Addresses(ifindex:0, AF_UNSPEC) */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_addresses(NULL, 1, AF_INET, &a); + assert_se(n >= 0); + log_debug("/* Local Addresses(ifindex:1, AF_INET) */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_addresses(NULL, 1, AF_INET6, &a); + assert_se(n >= 0); + log_debug("/* Local Addresses(ifindex:1, AF_INET6) */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_addresses(NULL, 1, AF_UNSPEC, &a); + assert_se(n >= 0); + log_debug("/* Local Addresses(ifindex:1, AF_UNSPEC) */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_gateways(NULL, 0, AF_UNSPEC, &a); + assert_se(n >= 0); + log_debug("/* Local Gateways */"); + print_local_addresses(a, (unsigned) n); + a = mfree(a); + + n = local_outbounds(NULL, 0, AF_UNSPEC, &a); + assert_se(n >= 0); + log_debug("/* Local Outbounds */"); + print_local_addresses(a, (unsigned) n); + /* Last use of a, so a plain free() without resetting the pointer is enough here. */ free(a); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-locale-util.c b/src/test/test-locale-util.c new file mode 100644 index 0000000..39f71c6 --- /dev/null +++ b/src/test/test-locale-util.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "glyph-util.h" +#include "kbd-util.h" +#include "locale-util.h" +#include "macro.h" +#include "strv.h" +#include "tests.h" + +/* get_locales() must succeed with a non-NULL list, and every entry of it must pass locale_is_valid(). */ TEST(get_locales) { + 
_cleanup_strv_free_ char **locales = NULL; + int r; + + r = get_locales(&locales); + assert_se(r >= 0); + assert_se(locales); + + STRV_FOREACH(p, locales) { + puts(*p); + assert_se(locale_is_valid(*p)); + } +} + +TEST(locale_is_valid) { + assert_se(locale_is_valid("en_EN.utf8")); + assert_se(locale_is_valid("fr_FR.utf8")); + assert_se(locale_is_valid("fr_FR@euro")); + assert_se(locale_is_valid("fi_FI")); + assert_se(locale_is_valid("POSIX")); + assert_se(locale_is_valid("C")); + + assert_se(!locale_is_valid("")); + assert_se(!locale_is_valid("/usr/bin/foo")); + assert_se(!locale_is_valid("\x01gar\x02 bage\x03")); +} + +TEST(locale_is_installed) { + /* Always available */ + assert_se(locale_is_installed("POSIX") > 0); + assert_se(locale_is_installed("C") > 0); + + /* Might, or might not be installed. */ + assert_se(locale_is_installed("en_EN.utf8") >= 0); + assert_se(locale_is_installed("fr_FR.utf8") >= 0); + assert_se(locale_is_installed("fr_FR@euro") >= 0); + assert_se(locale_is_installed("fi_FI") >= 0); + + /* Definitely not valid */ + assert_se(locale_is_installed("") == 0); + assert_se(locale_is_installed("/usr/bin/foo") == 0); + assert_se(locale_is_installed("\x01gar\x02 bage\x03") == 0); + + /* Definitely not installed */ + assert_se(locale_is_installed("zz_ZZ") == 0); +} + +TEST(keymaps) { + _cleanup_strv_free_ char **kmaps = NULL; + int r; + + assert_se(!keymap_is_valid("")); + assert_se(!keymap_is_valid("/usr/bin/foo")); + assert_se(!keymap_is_valid("\x01gar\x02 bage\x03")); + + r = get_keymaps(&kmaps); + if (r == -ENOENT) + return; /* skip test if no keymaps are installed */ + + assert_se(r >= 0); + assert_se(kmaps); + + STRV_FOREACH(p, kmaps) { + puts(*p); + assert_se(keymap_is_valid(*p)); + } + + assert_se(keymap_is_valid("uk")); + assert_se(keymap_is_valid("de-nodeadkeys")); + assert_se(keymap_is_valid("ANSI-dvorak")); + assert_se(keymap_is_valid("unicode")); +} + +#define dump_glyph(x) log_info(STRINGIFY(x) ": %s", special_glyph(x)) 
+TEST(dump_special_glyphs) { /* Logs every special glyph next to its enumerator name; the body contains no run-time assertions of its own. */ + assert_cc(SPECIAL_GLYPH_WORLD + 1 == _SPECIAL_GLYPH_MAX); /* SPECIAL_GLYPH_WORLD has to stay the last enumerator, matching the final dump_glyph() call below */ + + log_info("is_locale_utf8: %s", yes_no(is_locale_utf8())); + + dump_glyph(SPECIAL_GLYPH_TREE_VERTICAL); + dump_glyph(SPECIAL_GLYPH_TREE_BRANCH); + dump_glyph(SPECIAL_GLYPH_TREE_RIGHT); + dump_glyph(SPECIAL_GLYPH_TREE_SPACE); + dump_glyph(SPECIAL_GLYPH_TREE_TOP); + dump_glyph(SPECIAL_GLYPH_VERTICAL_DOTTED); + dump_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET); + dump_glyph(SPECIAL_GLYPH_BLACK_CIRCLE); + dump_glyph(SPECIAL_GLYPH_WHITE_CIRCLE); + dump_glyph(SPECIAL_GLYPH_MULTIPLICATION_SIGN); + dump_glyph(SPECIAL_GLYPH_CIRCLE_ARROW); + dump_glyph(SPECIAL_GLYPH_BULLET); + dump_glyph(SPECIAL_GLYPH_MU); + dump_glyph(SPECIAL_GLYPH_CHECK_MARK); + dump_glyph(SPECIAL_GLYPH_CROSS_MARK); + dump_glyph(SPECIAL_GLYPH_LIGHT_SHADE); + dump_glyph(SPECIAL_GLYPH_DARK_SHADE); + dump_glyph(SPECIAL_GLYPH_FULL_BLOCK); + dump_glyph(SPECIAL_GLYPH_SIGMA); + dump_glyph(SPECIAL_GLYPH_ARROW_UP); + dump_glyph(SPECIAL_GLYPH_ARROW_DOWN); + dump_glyph(SPECIAL_GLYPH_ARROW_LEFT); + dump_glyph(SPECIAL_GLYPH_ARROW_RIGHT); + dump_glyph(SPECIAL_GLYPH_ELLIPSIS); + dump_glyph(SPECIAL_GLYPH_EXTERNAL_LINK); + dump_glyph(SPECIAL_GLYPH_ECSTATIC_SMILEY); + dump_glyph(SPECIAL_GLYPH_HAPPY_SMILEY); + dump_glyph(SPECIAL_GLYPH_SLIGHTLY_HAPPY_SMILEY); + dump_glyph(SPECIAL_GLYPH_NEUTRAL_SMILEY); + dump_glyph(SPECIAL_GLYPH_SLIGHTLY_UNHAPPY_SMILEY); + dump_glyph(SPECIAL_GLYPH_UNHAPPY_SMILEY); + dump_glyph(SPECIAL_GLYPH_DEPRESSED_SMILEY); + dump_glyph(SPECIAL_GLYPH_LOCK_AND_KEY); + dump_glyph(SPECIAL_GLYPH_TOUCH); + dump_glyph(SPECIAL_GLYPH_RECYCLING); + dump_glyph(SPECIAL_GLYPH_DOWNLOAD); + dump_glyph(SPECIAL_GLYPH_SPARKLES); + dump_glyph(SPECIAL_GLYPH_LOW_BATTERY); + dump_glyph(SPECIAL_GLYPH_WARNING_SIGN); + dump_glyph(SPECIAL_GLYPH_COMPUTER_DISK); + dump_glyph(SPECIAL_GLYPH_WORLD); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-lock-util.c b/src/test/test-lock-util.c new file mode 100644 index 0000000..5edd087 --- 
/dev/null +++ b/src/test/test-lock-util.c @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "lock-util.h" +#include "rm-rf.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(make_lock_file) { /* Takes and releases lock files named "lock" in a fresh temporary directory, in exclusive and shared mode. */ + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF; + _cleanup_(release_lock_file) LockFile lock1 = LOCK_FILE_INIT, lock2 = LOCK_FILE_INIT; + + assert_se((tfd = mkdtemp_open(NULL, 0, &t)) >= 0); + + assert_se(make_lock_file_at(tfd, "lock", LOCK_EX, &lock1) >= 0); + assert_se(faccessat(tfd, "lock", F_OK, 0) >= 0); + assert_se(make_lock_file_at(tfd, "lock", LOCK_EX|LOCK_NB, &lock2) == -EBUSY); /* a second exclusive, non-blocking attempt must be refused while lock1 is held */ + release_lock_file(&lock1); + assert_se(RET_NERRNO(faccessat(tfd, "lock", F_OK, 0)) == -ENOENT); /* the file is expected to be gone after its only holder released it */ + assert_se(make_lock_file_at(tfd, "lock", LOCK_EX, &lock2) >= 0); + release_lock_file(&lock2); + assert_se(make_lock_file_at(tfd, "lock", LOCK_SH, &lock1) >= 0); + assert_se(faccessat(tfd, "lock", F_OK, 0) >= 0); + assert_se(make_lock_file_at(tfd, "lock", LOCK_SH, &lock2) >= 0); /* two shared locks are expected to coexist */ + release_lock_file(&lock1); + assert_se(faccessat(tfd, "lock", F_OK, 0) >= 0); /* still expected to exist: lock2 keeps its shared lock */ + release_lock_file(&lock2); + + assert_se(fchdir(tfd) >= 0); + assert_se(make_lock_file_at(tfd, "lock", LOCK_EX, &lock1) >= 0); + assert_se(make_lock_file("lock", LOCK_EX|LOCK_NB, &lock2) == -EBUSY); /* same file, this time addressed relative to the working directory */ +} + +static void test_lock_generic_with_timeout_for_type(LockType type) { /* NOTE(review): 'type' is never referenced in this helper; every lock_generic() and lock_generic_with_timeout() call passes LOCK_BSD, so the LOCK_UNPOSIX invocation in TEST(lock_generic_with_timeout) repeats the BSD case. Confirm whether 'type' was meant to be used here. */ + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int tfd = -EBADF, tfd2 = -EBADF; + + tfd = mkdtemp_open(NULL, 0, &t); + assert_se(tfd >= 0); + + tfd2 = fd_reopen(tfd, O_CLOEXEC|O_DIRECTORY); /* second fd on the same directory, used as the contending locker */ + assert_se(tfd2 >= 0); + + assert_se(lock_generic(tfd, LOCK_BSD, LOCK_EX) >= 0); + assert_se(lock_generic(tfd2, LOCK_BSD, LOCK_EX|LOCK_NB) == -EWOULDBLOCK); + + usec_t start = now(CLOCK_MONOTONIC); + assert_se(lock_generic_with_timeout(tfd2, LOCK_BSD, LOCK_EX, 200 * USEC_PER_MSEC) == -ETIMEDOUT); /* tfd still holds the lock, hence the 200ms wait has to time out */ + 
assert_se(usec_sub_unsigned(now(CLOCK_MONOTONIC), start) >= 200 * USEC_PER_MSEC); + + assert_se(lock_generic(tfd, LOCK_BSD, LOCK_UN) >= 0); + assert_se(lock_generic_with_timeout(tfd2, LOCK_BSD, LOCK_EX, 200 * USEC_PER_MSEC) == 0); + assert_se(lock_generic(tfd, LOCK_BSD, LOCK_EX|LOCK_NB) == -EWOULDBLOCK); +} + +TEST(lock_generic_with_timeout) { + test_lock_generic_with_timeout_for_type(LOCK_BSD); + test_lock_generic_with_timeout_for_type(LOCK_UNPOSIX); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-log.c b/src/test/test-log.c new file mode 100644 index 0000000..b5ba67b --- /dev/null +++ b/src/test/test-log.c @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "format-util.h" +#include "io-util.h" +#include "iovec-util.h" +#include "iovec-wrapper.h" +#include "log.h" +#include "process-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +assert_cc(IS_SYNTHETIC_ERRNO(SYNTHETIC_ERRNO(EINVAL))); +assert_cc(!IS_SYNTHETIC_ERRNO(EINVAL)); +assert_cc(IS_SYNTHETIC_ERRNO(SYNTHETIC_ERRNO(0))); +assert_cc(!IS_SYNTHETIC_ERRNO(0)); + +#define X10(x) x x x x x x x x x x +#define X100(x) X10(X10(x)) +#define X1000(x) X100(X10(x)) + +static void test_file(void) { + log_info("__FILE__: %s", __FILE__); + log_info("RELATIVE_SOURCE_PATH: %s", RELATIVE_SOURCE_PATH); + log_info("PROJECT_FILE: %s", PROJECT_FILE); + + assert_se(startswith(__FILE__, RELATIVE_SOURCE_PATH "/")); +} + +static void test_log_struct(void) { + log_struct(LOG_INFO, + "MESSAGE=Waldo PID="PID_FMT" (no errno)", getpid_cached(), + "SERVICE=piepapo"); + + /* The same as above, just using LOG_MESSAGE(), which is generally recommended */ + log_struct(LOG_INFO, + LOG_MESSAGE("Waldo PID="PID_FMT" (no errno)", getpid_cached()), + "SERVICE=piepapo"); + + log_struct_errno(LOG_INFO, EILSEQ, + LOG_MESSAGE("Waldo PID="PID_FMT": %m (normal)", getpid_cached()), + "SERVICE=piepapo"); + + log_struct_errno(LOG_INFO, SYNTHETIC_ERRNO(EILSEQ), + 
LOG_MESSAGE("Waldo PID="PID_FMT": %m (synthetic)", getpid_cached()), + "SERVICE=piepapo"); + + log_struct(LOG_INFO, + LOG_MESSAGE("Foobar PID="PID_FMT, getpid_cached()), + "FORMAT_STR_TEST=1=%i A=%c 2=%hi 3=%li 4=%lli 1=%p foo=%s 2.5=%g 3.5=%g 4.5=%Lg", + (int) 1, 'A', (short) 2, (long int) 3, (long long int) 4, (void*) 1, "foo", (float) 2.5f, (double) 3.5, (long double) 4.5, + "SUFFIX=GOT IT"); +} + +static void test_long_lines(void) { + log_object_internal(LOG_NOTICE, + EUCLEAN, + X1000("abcd_") ".txt", + 1000000, + X1000("fff") "unc", + "OBJECT=", + X1000("obj_") "ect", + "EXTRA=", + X1000("ext_") "tra", + "asdfasdf %s asdfasdfa", "foobar"); +} + +static void test_log_syntax(void) { + assert_se(log_syntax("unit", LOG_ERR, "filename", 10, EINVAL, "EINVAL: %s: %m", "hogehoge") == -EINVAL); + assert_se(log_syntax("unit", LOG_ERR, "filename", 10, -ENOENT, "ENOENT: %s: %m", "hogehoge") == -ENOENT); + assert_se(log_syntax("unit", LOG_ERR, "filename", 10, SYNTHETIC_ERRNO(ENOTTY), "ENOTTY: %s: %m", "hogehoge") == -ENOTTY); +} + +static void test_log_context(void) { + { + char **strv = STRV_MAKE("FIRST=abc", "SECOND=qrs"); + + LOG_CONTEXT_PUSH("THIRD=pfs"); + LOG_CONTEXT_PUSH("FOURTH=def"); + LOG_CONTEXT_PUSH_STRV(strv); + LOG_CONTEXT_PUSH_STRV(strv); + + /* Test that the log context was set up correctly. The strv we pushed twice should only + * result in one log context which is reused. */ + assert_se(log_context_num_contexts() == 3); + assert_se(log_context_num_fields() == 4); + + /* Test that everything still works with modifications to the log context. */ + test_log_struct(); + test_long_lines(); + test_log_syntax(); + + { + LOG_CONTEXT_PUSH("FIFTH=123"); + LOG_CONTEXT_PUSH_STRV(strv); + + /* Check that our nested fields got added correctly. */ + assert_se(log_context_num_contexts() == 4); + assert_se(log_context_num_fields() == 5); + + /* Test that everything still works in a nested block. 
*/ + test_log_struct(); + test_long_lines(); + test_log_syntax(); + } + + /* Check that only the fields from the nested block got removed. */ + assert_se(log_context_num_contexts() == 3); + assert_se(log_context_num_fields() == 4); + } + + assert_se(log_context_num_contexts() == 0); + assert_se(log_context_num_fields() == 0); + + { + _cleanup_(log_context_unrefp) LogContext *ctx = NULL; + + char **strv = STRV_MAKE("SIXTH=ijn", "SEVENTH=PRP"); + assert_se(ctx = log_context_new_strv(strv, /*owned=*/ false)); + + assert_se(log_context_num_contexts() == 1); + assert_se(log_context_num_fields() == 2); + + /* Test that everything still works with a manually configured log context. */ + test_log_struct(); + test_long_lines(); + test_log_syntax(); + } + + { + char **strv = NULL; + + assert_se(strv = strv_new("ABC", "DEF")); + LOG_CONTEXT_CONSUME_STRV(strv); + + assert_se(log_context_num_contexts() == 1); + assert_se(log_context_num_fields() == 2); + } + + { + /* Test that everything still works with a mixed strv and iov. 
*/ + struct iovec iov[] = { + IOVEC_MAKE_STRING("ABC=def"), + IOVEC_MAKE_STRING("GHI=jkl"), + }; + _cleanup_free_ struct iovec_wrapper *iovw = iovw_new(); + assert_se(iovw); + assert_se(iovw_consume(iovw, strdup("MNO=pqr"), STRLEN("MNO=pqr") + 1) == 0); + + LOG_CONTEXT_PUSH_IOV(iov, ELEMENTSOF(iov)); + LOG_CONTEXT_PUSH_IOV(iov, ELEMENTSOF(iov)); + LOG_CONTEXT_CONSUME_IOV(iovw->iovec, iovw->count); + LOG_CONTEXT_PUSH("STU=vwx"); + + assert_se(log_context_num_contexts() == 3); + assert_se(log_context_num_fields() == 4); + + test_log_struct(); + test_long_lines(); + test_log_syntax(); + } + + { + LOG_CONTEXT_PUSH_KEY_VALUE("ABC=", "QED"); + LOG_CONTEXT_PUSH_KEY_VALUE("ABC=", "QED"); + assert_se(log_context_num_contexts() == 1); + assert_se(log_context_num_fields() == 1); + + test_log_struct(); + test_long_lines(); + test_log_syntax(); + } + + assert_se(log_context_num_contexts() == 0); + assert_se(log_context_num_fields() == 0); +} + +static void test_log_prefix(void) { + { + LOG_SET_PREFIX("ABC"); + + test_log_struct(); + test_long_lines(); + test_log_syntax(); + + { + LOG_SET_PREFIX("QED"); + + test_log_struct(); + test_long_lines(); + test_log_syntax(); + } + + test_log_struct(); + test_long_lines(); + test_log_syntax(); + } + + test_log_struct(); + test_long_lines(); + test_log_syntax(); +} + +int main(int argc, char* argv[]) { + test_setup_logging(LOG_DEBUG); + + test_file(); + + assert_se(log_info_errno(SYNTHETIC_ERRNO(EUCLEAN), "foo") == -EUCLEAN); + + for (int target = 0; target < _LOG_TARGET_MAX; target++) { + log_set_target(target); + log_open(); + + test_log_struct(); + test_long_lines(); + test_log_syntax(); + test_log_context(); + test_log_prefix(); + } + + return 0; +} diff --git a/src/test/test-logarithm.c b/src/test/test-logarithm.c new file mode 100644 index 0000000..b35fea9 --- /dev/null +++ b/src/test/test-logarithm.c @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "logarithm.h" +#include "tests.h" + +TEST(LOG2ULL) { + 
assert_se(LOG2ULL(0) == 0); + assert_se(LOG2ULL(1) == 0); + assert_se(LOG2ULL(8) == 3); + assert_se(LOG2ULL(9) == 3); + assert_se(LOG2ULL(15) == 3); + assert_se(LOG2ULL(16) == 4); + assert_se(LOG2ULL(1024*1024) == 20); + assert_se(LOG2ULL(1024*1024+5) == 20); +} + +TEST(CONST_LOG2ULL) { + assert_se(CONST_LOG2ULL(0) == 0); + assert_se(CONST_LOG2ULL(1) == 0); + assert_se(CONST_LOG2ULL(8) == 3); + assert_se(CONST_LOG2ULL(9) == 3); + assert_se(CONST_LOG2ULL(15) == 3); + assert_se(CONST_LOG2ULL(16) == 4); + assert_se(CONST_LOG2ULL(1024*1024) == 20); + assert_se(CONST_LOG2ULL(1024*1024+5) == 20); +} + +TEST(NONCONST_LOG2ULL) { + assert_se(NONCONST_LOG2ULL(0) == 0); + assert_se(NONCONST_LOG2ULL(1) == 0); + assert_se(NONCONST_LOG2ULL(8) == 3); + assert_se(NONCONST_LOG2ULL(9) == 3); + assert_se(NONCONST_LOG2ULL(15) == 3); + assert_se(NONCONST_LOG2ULL(16) == 4); + assert_se(NONCONST_LOG2ULL(1024*1024) == 20); + assert_se(NONCONST_LOG2ULL(1024*1024+5) == 20); +} + +TEST(log2u64) { + assert_se(log2u64(0) == 0); + assert_se(log2u64(1) == 0); + assert_se(log2u64(8) == 3); + assert_se(log2u64(9) == 3); + assert_se(log2u64(15) == 3); + assert_se(log2u64(16) == 4); + assert_se(log2u64(1024*1024) == 20); + assert_se(log2u64(1024*1024+5) == 20); +} + +TEST(log2u) { + assert_se(log2u(0) == 0); + assert_se(log2u(1) == 0); + assert_se(log2u(2) == 1); + assert_se(log2u(3) == 1); + assert_se(log2u(4) == 2); + assert_se(log2u(32) == 5); + assert_se(log2u(33) == 5); + assert_se(log2u(63) == 5); + assert_se(log2u(INT_MAX) == sizeof(int)*8-2); +} + +TEST(log2i) { + assert_se(log2i(0) == 0); + assert_se(log2i(1) == 0); + assert_se(log2i(2) == 1); + assert_se(log2i(3) == 1); + assert_se(log2i(4) == 2); + assert_se(log2i(32) == 5); + assert_se(log2i(33) == 5); + assert_se(log2i(63) == 5); + assert_se(log2i(INT_MAX) == sizeof(int)*8-2); +} + +TEST(popcount) { + uint16_t u16a = 0x0000; + uint16_t u16b = 0xFFFF; + uint32_t u32a = 0x00000010; + uint32_t u32b = 0xFFFFFFFF; + uint64_t u64a = 
0x0000000000000010; + uint64_t u64b = 0x0100000000100010; + + assert_se(popcount(u16a) == 0); + assert_se(popcount(u16b) == 16); + assert_se(popcount(u32a) == 1); + assert_se(popcount(u32b) == 32); + assert_se(popcount(u64a) == 1); + assert_se(popcount(u64b) == 3); + + /* This would fail: + * error: ‘_Generic’ selector of type ‘int’ is not compatible with any association + * assert_se(popcount(0x10) == 1); + */ +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-loop-block.c b/src/test/test-loop-block.c new file mode 100644 index 0000000..1bd00d1 --- /dev/null +++ b/src/test/test-loop-block.c @@ -0,0 +1,350 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "capability-util.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "gpt.h" +#include "main-func.h" +#include "missing_loop.h" +#include "mkfs-util.h" +#include "mount-util.h" +#include "namespace-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "user-util.h" +#include "virt.h" + +static unsigned arg_n_threads = 5; +static unsigned arg_n_iterations = 3; +static usec_t arg_timeout = 0; + +#if HAVE_BLKID +static usec_t end = 0; + +static void verify_dissected_image(DissectedImage *dissected) { + assert_se(dissected->partitions[PARTITION_ESP].found); + assert_se(dissected->partitions[PARTITION_ESP].node); + assert_se(dissected->partitions[PARTITION_XBOOTLDR].found); + assert_se(dissected->partitions[PARTITION_XBOOTLDR].node); + assert_se(dissected->partitions[PARTITION_ROOT].found); + assert_se(dissected->partitions[PARTITION_ROOT].node); + assert_se(dissected->partitions[PARTITION_HOME].found); + assert_se(dissected->partitions[PARTITION_HOME].node); +} + +static void verify_dissected_image_harder(DissectedImage *dissected) { + 
verify_dissected_image(dissected); + + assert_se(streq(dissected->partitions[PARTITION_ESP].fstype, "vfat")); + assert_se(streq(dissected->partitions[PARTITION_XBOOTLDR].fstype, "vfat")); + assert_se(streq(dissected->partitions[PARTITION_ROOT].fstype, "ext4")); + assert_se(streq(dissected->partitions[PARTITION_HOME].fstype, "ext4")); +} + +static void* thread_func(void *ptr) { + int fd = PTR_TO_FD(ptr); + int r; + + for (unsigned i = 0; i < arg_n_iterations; i++) { + _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL; + + if (now(CLOCK_MONOTONIC) >= end) { + log_notice("Time's up, exiting thread's loop"); + break; + } + + log_notice("> Thread iteration #%u.", i); + + assert_se(mkdtemp_malloc(NULL, &mounted) >= 0); + + r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, 0, LO_FLAGS_PARTSCAN, LOCK_SH, &loop); + if (r < 0) + log_error_errno(r, "Failed to allocate loopback device: %m"); + assert_se(r >= 0); + assert_se(loop->dev); + assert_se(loop->backing_file); + + log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted); + + r = dissect_loop_device(loop, NULL, NULL, NULL, DISSECT_IMAGE_READ_ONLY|DISSECT_IMAGE_ADD_PARTITION_DEVICES|DISSECT_IMAGE_PIN_PARTITION_DEVICES, &dissected); + if (r < 0) + log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node); + assert_se(r >= 0); + + log_info("Dissected loop device %s", loop->node); + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + if (!dissected->partitions[d].found) + continue; + + log_notice("Found node %s fstype %s designator %s", + dissected->partitions[d].node, + dissected->partitions[d].fstype, + partition_designator_to_string(d)); + } + + verify_dissected_image(dissected); + + r = dissected_image_mount( + dissected, + mounted, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + 
DISSECT_IMAGE_READ_ONLY); + log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted); + assert_se(r >= 0); + + /* Now that the block device is mounted we no longer need a manual lock, the devices are now + * pinned by the mounts. */ + assert_se(loop_device_flock(loop, LOCK_UN) >= 0); + + log_notice("Unmounting %s", mounted); + mounted = umount_and_rmdir_and_free(mounted); + + log_notice("Unmounted."); + + dissected = dissected_image_unref(dissected); + + log_notice("Detaching loop device %s", loop->node); + loop = loop_device_unref(loop); + log_notice("Detached loop device."); + } + + log_notice("Leaving thread"); + + return NULL; +} +#endif + +static bool have_root_gpt_type(void) { +#ifdef SD_GPT_ROOT_NATIVE + return true; +#else + return false; +#endif +} + +/* Test entry point. Optional positional arguments, in this order: number of threads, number of + * iterations, timeout. */ +static int run(int argc, char *argv[]) { +#if HAVE_BLKID + _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL; + _cleanup_free_ pthread_t *threads = NULL; /* allocated below, once arg_n_threads has its final value */ + sd_id128_t id; +#endif + _cleanup_free_ char *p = NULL, *cmd = NULL; + _cleanup_pclose_ FILE *sfdisk = NULL; + _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + test_setup_logging(LOG_DEBUG); + log_show_tid(true); + log_show_time(true); + log_show_color(true); + + if (argc >= 2) { + r = safe_atou(argv[1], &arg_n_threads); + if (r < 0) + return log_error_errno(r, "Failed to parse first argument (number of threads): %s", argv[1]); + if (arg_n_threads <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Number of threads must be at least 1, refusing."); + } + +#if HAVE_BLKID + /* Allocate the thread array only after the thread count was parsed: an array sized from the + * default value of arg_n_threads would be too small for a larger count given on the command + * line, and the pthread_create()/pthread_join() loops further down would run past its end. */ + threads = new(pthread_t, arg_n_threads); + if (!threads) + return log_oom(); +#endif + + if (argc >= 3) { + r = safe_atou(argv[2], &arg_n_iterations); + if (r < 0) + return log_error_errno(r, "Failed to parse second argument (number of iterations): %s", argv[2]); + if (arg_n_iterations <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Number of iterations must be at least 1, refusing."); + } + + if (argc >= 4) { + r = parse_sec(argv[3], 
&arg_timeout); + if (r < 0) + return log_error_errno(r, "Failed to parse third argument (timeout): %s", argv[3]); + } + + if (argc >= 5) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many arguments (expected 3 at max)."); + + if (!have_root_gpt_type()) + return log_tests_skipped("No root partition GPT defined for this architecture"); + + r = find_executable("sfdisk", NULL); + if (r < 0) + return log_tests_skipped_errno(r, "Could not find sfdisk command"); + + assert_se(tempfn_random_child("/var/tmp", "sfdisk", &p) >= 0); + fd = open(p, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC|O_NOFOLLOW, 0666); + assert_se(fd >= 0); + assert_se(ftruncate(fd, 256*1024*1024) >= 0); + + assert_se(cmd = strjoin("sfdisk ", p)); + assert_se(sfdisk = popen(cmd, "we")); + + /* A reasonably complex partition table (five 32M partitions) that fits into the 256M image file created above */ + fputs("label: gpt\n" + "size=32M, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B\n" + "size=32M, type=BC13C2FF-59E6-4262-A352-B275FD6F7172\n" + "size=32M, type=0657FD6D-A4AB-43C4-84E5-0933C84B4F4F\n" + "size=32M, type=", sfdisk); + +#ifdef SD_GPT_ROOT_NATIVE + fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(SD_GPT_ROOT_NATIVE)); +#else + fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(SD_GPT_ROOT_X86_64)); +#endif + + fputs("\n" + "size=32M, type=933AC7E1-2EB4-4F13-B844-0E14E2AEF915\n", sfdisk); + + assert_se(pclose(sfdisk) == 0); + sfdisk = NULL; + +#if HAVE_BLKID + assert_se(dissect_image_file(p, NULL, NULL, NULL, 0, &dissected) >= 0); + verify_dissected_image(dissected); + dissected = dissected_image_unref(dissected); +#endif + + if (geteuid() != 0 || have_effective_cap(CAP_SYS_ADMIN) <= 0) { + log_tests_skipped("not running privileged"); + return 0; + } + + if (detect_container() > 0) { + log_tests_skipped("Test not supported in a container, requires udev/uevent notifications"); + return 0; + } + + assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, 0, LO_FLAGS_PARTSCAN, LOCK_EX, &loop) >= 0); + +#if HAVE_BLKID + 
assert_se(dissect_loop_device(loop, NULL, NULL, NULL, DISSECT_IMAGE_ADD_PARTITION_DEVICES|DISSECT_IMAGE_PIN_PARTITION_DEVICES, &dissected) >= 0); + verify_dissected_image(dissected); + + FOREACH_STRING(fs, "vfat", "ext4") { + r = mkfs_exists(fs); + assert_se(r >= 0); + if (!r) { + log_tests_skipped("mkfs.{vfat|ext4} not installed"); + return 0; + } + } + assert_se(r >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_ESP].node, "vfat", "EFI", NULL, id, true, false, 0, NULL) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_XBOOTLDR].node, "vfat", "xbootldr", NULL, id, true, false, 0, NULL) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_ROOT].node, "ext4", "root", NULL, id, true, false, 0, NULL) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_HOME].node, "ext4", "home", NULL, id, true, false, 0, NULL) >= 0); + + dissected = dissected_image_unref(dissected); + + /* We created the file systems now via the per-partition block devices. But the dissection code might + * probe them via the whole block device. These block devices have separate buffer caches though, + * hence what was written via the partition device might not appear on the whole block device + * yet. Let's hence explicitly flush the whole block device, so that the read-back definitely + * works. */ + assert_se(ioctl(loop->fd, BLKFLSBUF, 0) >= 0); + + /* Try to read once, without pinning or adding partitions, i.e. by only accessing the whole block + * device. */ + assert_se(dissect_loop_device(loop, NULL, NULL, NULL, 0, &dissected) >= 0); + verify_dissected_image_harder(dissected); + dissected = dissected_image_unref(dissected); + + /* Now go via the loopback device after all, but this time add/pin, because now we want to mount it. 
*/ + assert_se(dissect_loop_device(loop, NULL, NULL, NULL, DISSECT_IMAGE_ADD_PARTITION_DEVICES|DISSECT_IMAGE_PIN_PARTITION_DEVICES, &dissected) >= 0); + verify_dissected_image_harder(dissected); + + assert_se(mkdtemp_malloc(NULL, &mounted) >= 0); + + /* We are particularly correct here, and now downgrade LOCK_EX → LOCK_SH. That's because we are done + * with formatting the file systems, so we don't need the exclusive lock anymore. From now on a + * shared one is fine. This way udev can now probe the device if it wants, but still won't call + * BLKRRPART on it, and that's good, because that would destroy our partition table while we are at + * it. */ + assert_se(loop_device_flock(loop, LOCK_SH) >= 0); + + /* This is a test for the loopback block device setup code and its use by the image dissection + * logic: since the kernel APIs are hard to use and prone to races, let's test this in a heavy duty + * test: we open a bunch of threads and repeatedly allocate and deallocate loopback block devices in + * them in parallel, with an image file with a number of partitions. */ + assert_se(detach_mount_namespace() >= 0); + + /* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */ + assert_se(dissected_image_mount( + dissected, + mounted, + /* uid_shift= */ UID_INVALID, + /* uid_range= */ UID_INVALID, + /* userns_fd= */ -EBADF, + 0) >= 0); + + /* Now we mounted everything, the partitions are pinned. Now it's fine to release the lock + * fully. This means udev could now issue BLKRRPART again, but that's OK given this will fail because + * we now mounted the device. */ + assert_se(loop_device_flock(loop, LOCK_UN) >= 0); + + assert_se(umount_recursive(mounted, 0) >= 0); + loop = loop_device_unref(loop); + + log_notice("Threads are being started now"); + + /* zero timeout means pick default: 5s if slow tests are enabled, 1s otherwise */ + if (arg_timeout == 0) + arg_timeout = slow_tests_enabled() ? 
5 * USEC_PER_SEC : 1 * USEC_PER_SEC; + + end = usec_add(now(CLOCK_MONOTONIC), arg_timeout); + + if (arg_n_threads > 1) + for (unsigned i = 0; i < arg_n_threads; i++) + assert_se(pthread_create(threads + i, NULL, thread_func, FD_TO_PTR(fd)) == 0); + + log_notice("All threads started now."); + + if (arg_n_threads == 1) + assert_se(thread_func(FD_TO_PTR(fd)) == NULL); + else + for (unsigned i = 0; i < arg_n_threads; i++) { + log_notice("Joining thread #%u.", i); + + void *k; + assert_se(pthread_join(threads[i], &k) == 0); + assert_se(!k); + + log_notice("Joined thread #%u.", i); + } + + log_notice("Threads are all terminated now."); +#else + log_notice("Cutting test short, since we do not have libblkid."); +#endif + return 0; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/test/test-loopback.c b/src/test/test-loopback.c new file mode 100644 index 0000000..48869ae --- /dev/null +++ b/src/test/test-loopback.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "errno-util.h" +#include "log.h" +#include "loopback-setup.h" +#include "tests.h" + +TEST_RET(loopback_setup) { + int r; + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { + if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) + return log_tests_skipped("lacking privileges or namespaces not supported"); + return log_error_errno(errno, "Failed to create user+network namespace: %m"); + } + + r = loopback_setup(); + if (r < 0) + return log_error_errno(r, "loopback: %m"); + + log_info("> ipv6 main"); + /* <0 → fork error, ==0 → success, >0 → error in child */ + assert_se(system("ip -6 route show table main") >= 0); + + log_info("> ipv6 local"); + assert_se(system("ip -6 route show table local") >=0); + + log_info("> ipv4 main"); + assert_se(system("ip -4 route show table main") >= 0); + + log_info("> ipv4 local"); + assert_se(system("ip -4 route show table local") >= 0); + + return EXIT_SUCCESS; +} + +static int intro(void) 
{ + log_show_color(true); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-macro.c b/src/test/test-macro.c new file mode 100644 index 0000000..b91a1f9 --- /dev/null +++ b/src/test/test-macro.c @@ -0,0 +1,1040 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "log.h" +#include "macro.h" +#include "tests.h" + +TEST(saturate_add) { + assert_se(saturate_add(1, 2, UINT8_MAX) == 3); + assert_se(saturate_add(1, UINT8_MAX-2, UINT8_MAX) == UINT8_MAX-1); + assert_se(saturate_add(1, UINT8_MAX-1, UINT8_MAX) == UINT8_MAX); + assert_se(saturate_add(1, UINT8_MAX, UINT8_MAX) == UINT8_MAX); + assert_se(saturate_add(2, UINT8_MAX, UINT8_MAX) == UINT8_MAX); + assert_se(saturate_add(60, 60, 50) == 50); +} + +TEST(ALIGN_POWER2) { + unsigned long i, p2; + + assert_se(ALIGN_POWER2(0) == 0); + assert_se(ALIGN_POWER2(1) == 1); + assert_se(ALIGN_POWER2(2) == 2); + assert_se(ALIGN_POWER2(3) == 4); + assert_se(ALIGN_POWER2(4) == 4); + assert_se(ALIGN_POWER2(5) == 8); + assert_se(ALIGN_POWER2(6) == 8); + assert_se(ALIGN_POWER2(7) == 8); + assert_se(ALIGN_POWER2(9) == 16); + assert_se(ALIGN_POWER2(10) == 16); + assert_se(ALIGN_POWER2(11) == 16); + assert_se(ALIGN_POWER2(12) == 16); + assert_se(ALIGN_POWER2(13) == 16); + assert_se(ALIGN_POWER2(14) == 16); + assert_se(ALIGN_POWER2(15) == 16); + assert_se(ALIGN_POWER2(16) == 16); + assert_se(ALIGN_POWER2(17) == 32); + + assert_se(ALIGN_POWER2(ULONG_MAX) == 0); + assert_se(ALIGN_POWER2(ULONG_MAX - 1) == 0); + assert_se(ALIGN_POWER2(ULONG_MAX - 1024) == 0); + assert_se(ALIGN_POWER2(ULONG_MAX / 2) == ULONG_MAX / 2 + 1); + assert_se(ALIGN_POWER2(ULONG_MAX + 1) == 0); + + for (i = 1; i < 131071; ++i) { + for (p2 = 1; p2 < i; p2 <<= 1) + /* empty */ ; + + assert_se(ALIGN_POWER2(i) == p2); + } + + for (i = ULONG_MAX - 1024; i < ULONG_MAX; ++i) { + for (p2 = 1; p2 && p2 < i; p2 <<= 1) + /* empty */ ; + + assert_se(ALIGN_POWER2(i) == p2); + } +} + +TEST(MAX) { + static const 
struct { + int a; + int b[CONST_MAX(10, 100)]; + } val1 = { + .a = CONST_MAX(10, 100), + }; + int d = 0; + unsigned long x = 12345; + unsigned long y = 54321; + const char str[] = "a_string_constant"; + const unsigned long long arr[] = {9999ULL, 10ULL, 0ULL, 3000ULL, 2000ULL, 1000ULL, 100ULL, 9999999ULL}; + void *p = (void *)str; + void *q = (void *)&str[16]; + + assert_cc(sizeof(val1.b) == sizeof(int) * 100); + + /* CONST_MAX returns (void) instead of a value if the passed arguments + * are not of the same type or not constant expressions. */ + assert_cc(__builtin_types_compatible_p(typeof(CONST_MAX(1, 10)), int)); + assert_cc(__builtin_types_compatible_p(typeof(CONST_MAX(1, 1U)), void)); + + assert_se(val1.a == 100); + assert_se(MAX(++d, 0) == 1); + assert_se(d == 1); + + assert_cc(MAXSIZE(char[3], uint16_t) == 3); + assert_cc(MAXSIZE(char[3], uint32_t) == 4); + assert_cc(MAXSIZE(char, long) == sizeof(long)); + + assert_se(MAX(-5, 5) == 5); + assert_se(MAX(5, 5) == 5); + assert_se(MAX(MAX(1, MAX(2, MAX(3, 4))), 5) == 5); + assert_se(MAX(MAX(1, MAX(2, MAX(3, 2))), 1) == 3); + assert_se(MAX(MIN(1, MIN(2, MIN(3, 4))), 5) == 5); + assert_se(MAX(MAX(1, MIN(2, MIN(3, 2))), 1) == 2); + assert_se(LESS_BY(8, 4) == 4); + assert_se(LESS_BY(8, 8) == 0); + assert_se(LESS_BY(4, 8) == 0); + assert_se(LESS_BY(16, LESS_BY(8, 4)) == 12); + assert_se(LESS_BY(4, LESS_BY(8, 4)) == 0); + assert_se(CMP(3, 5) == -1); + assert_se(CMP(5, 3) == 1); + assert_se(CMP(5, 5) == 0); + assert_se(CMP(x, y) == -1); + assert_se(CMP(y, x) == 1); + assert_se(CMP(x, x) == 0); + assert_se(CMP(y, y) == 0); + assert_se(CMP(UINT64_MAX, (uint64_t) 0) == 1); + assert_se(CMP((uint64_t) 0, UINT64_MAX) == -1); + assert_se(CMP(UINT64_MAX, UINT64_MAX) == 0); + assert_se(CMP(INT64_MIN, INT64_MAX) == -1); + assert_se(CMP(INT64_MAX, INT64_MIN) == 1); + assert_se(CMP(INT64_MAX, INT64_MAX) == 0); + assert_se(CMP(INT64_MIN, INT64_MIN) == 0); + assert_se(CMP(INT64_MAX, (int64_t) 0) == 1); + assert_se(CMP((int64_t) 0, 
INT64_MIN) == 1); + assert_se(CMP(INT64_MIN, (int64_t) 0) == -1); + assert_se(CMP((int64_t) 0, INT64_MAX) == -1); + assert_se(CMP(&str[2], &str[7]) == -1); + assert_se(CMP(&str[2], &str[2]) == 0); + assert_se(CMP(&str[7], (const char *)str) == 1); + assert_se(CMP(str[2], str[7]) == 1); + assert_se(CMP(str[7], *str) == 1); + assert_se(CMP((const unsigned long long *)arr, &arr[3]) == -1); + assert_se(CMP(*arr, arr[3]) == 1); + assert_se(CMP(p, q) == -1); + assert_se(CMP(q, p) == 1); + assert_se(CMP(p, p) == 0); + assert_se(CMP(q, q) == 0); + assert_se(CLAMP(-5, 0, 1) == 0); + assert_se(CLAMP(5, 0, 1) == 1); + assert_se(CLAMP(5, -10, 1) == 1); + assert_se(CLAMP(5, -10, 10) == 5); + assert_se(CLAMP(CLAMP(0, -10, 10), CLAMP(-5, 10, 20), CLAMP(100, -5, 20)) == 10); +} + +#pragma GCC diagnostic push +#ifdef __clang__ +# pragma GCC diagnostic ignored "-Waddress-of-packed-member" +#endif + +TEST(container_of) { + struct mytype { + uint8_t pad1[3]; + uint64_t v1; + uint8_t pad2[2]; + uint32_t v2; + } myval = { }; + + assert_cc(sizeof(myval) >= 17); + assert_se(container_of(&myval.v1, struct mytype, v1) == &myval); + assert_se(container_of(&myval.v2, struct mytype, v2) == &myval); + assert_se(container_of(&container_of(&myval.v2, + struct mytype, + v2)->v1, + struct mytype, + v1) == &myval); +} + +#pragma GCC diagnostic pop + +TEST(DIV_ROUND_UP) { + int div; + + /* basic tests */ + assert_se(DIV_ROUND_UP(0, 8) == 0); + assert_se(DIV_ROUND_UP(1, 8) == 1); + assert_se(DIV_ROUND_UP(8, 8) == 1); + assert_se(DIV_ROUND_UP(12, 8) == 2); + assert_se(DIV_ROUND_UP(16, 8) == 2); + + /* test multiple evaluation */ + div = 0; + assert_se(DIV_ROUND_UP(div++, 8) == 0 && div == 1); + assert_se(DIV_ROUND_UP(++div, 8) == 1 && div == 2); + assert_se(DIV_ROUND_UP(8, div++) == 4 && div == 3); + assert_se(DIV_ROUND_UP(8, ++div) == 2 && div == 4); + + /* overflow test with exact division */ + assert_se(sizeof(0U) == 4); + assert_se(0xfffffffaU % 10U == 0U); + assert_se(0xfffffffaU / 10U == 
429496729U); + assert_se(DIV_ROUND_UP(0xfffffffaU, 10U) == 429496729U); + assert_se((0xfffffffaU + 10U - 1U) / 10U == 0U); + assert_se(0xfffffffaU / 10U + !!(0xfffffffaU % 10U) == 429496729U); + + /* overflow test with rounded division */ + assert_se(0xfffffffdU % 10U == 3U); + assert_se(0xfffffffdU / 10U == 429496729U); + assert_se(DIV_ROUND_UP(0xfffffffdU, 10U) == 429496730U); + assert_se((0xfffffffdU + 10U - 1U) / 10U == 0U); + assert_se(0xfffffffdU / 10U + !!(0xfffffffdU % 10U) == 429496730U); +} + +TEST(PTR_TO_INT) { + /* Primary reason to have this test is to validate that pointers are large enough to hold entire int range */ + assert_se(PTR_TO_INT(INT_TO_PTR(0)) == 0); + assert_se(PTR_TO_INT(INT_TO_PTR(1)) == 1); + assert_se(PTR_TO_INT(INT_TO_PTR(-1)) == -1); + assert_se(PTR_TO_INT(INT_TO_PTR(INT_MAX)) == INT_MAX); + assert_se(PTR_TO_INT(INT_TO_PTR(INT_MIN)) == INT_MIN); +} + +TEST(IN_SET) { + assert_se(IN_SET(1, 1, 2)); + assert_se(IN_SET(1, 1, 2, 3, 4)); + assert_se(IN_SET(2, 1, 2, 3, 4)); + assert_se(IN_SET(3, 1, 2, 3, 4)); + assert_se(IN_SET(4, 1, 2, 3, 4)); + assert_se(!IN_SET(0, 1, 2)); + assert_se(!IN_SET(0, 1, 2, 3, 4)); + + struct { + unsigned x:3; + } t = { 1 }; + + assert_se(IN_SET(t.x, 1, 2)); + assert_se(IN_SET(t.x, 1, 2, 3, 4)); + assert_se(IN_SET(t.x, 2, 3, 4, 1)); + assert_se(!IN_SET(t.x, 0, 2)); + assert_se(!IN_SET(t.x, 2, 3, 4)); +} + +TEST(FOREACH_POINTER) { + int a, b, c, *i; + size_t k = 0; + + FOREACH_POINTER(i, &a, &b, &c) { + switch (k) { + + case 0: + assert_se(i == &a); + break; + + case 1: + assert_se(i == &b); + break; + + case 2: + assert_se(i == &c); + break; + + default: + assert_not_reached(); + break; + } + + k++; + } + + assert_se(k == 3); + + FOREACH_POINTER(i, &b) { + assert_se(k == 3); + assert_se(i == &b); + k = 4; + } + + assert_se(k == 4); + + FOREACH_POINTER(i, NULL, &c, NULL, &b, NULL, &a, NULL) { + switch (k) { + + case 4: + assert_se(i == NULL); + break; + + case 5: + assert_se(i == &c); + break; + + case 6: + 
assert_se(i == NULL); + break; + + case 7: + assert_se(i == &b); + break; + + case 8: + assert_se(i == NULL); + break; + + case 9: + assert_se(i == &a); + break; + + case 10: + assert_se(i == NULL); + break; + + default: + assert_not_reached(); + break; + } + + k++; + } + + assert_se(k == 11); +} + +TEST(FOREACH_VA_ARGS) { + size_t i; + + i = 0; + uint8_t u8, u8_1 = 1, u8_2 = 2, u8_3 = 3; + VA_ARGS_FOREACH(u8, u8_2, 8, 0xff, u8_1, u8_3, 0, 1) { + switch(i++) { + case 0: assert_se(u8 == u8_2); break; + case 1: assert_se(u8 == 8); break; + case 2: assert_se(u8 == 0xff); break; + case 3: assert_se(u8 == u8_1); break; + case 4: assert_se(u8 == u8_3); break; + case 5: assert_se(u8 == 0); break; + case 6: assert_se(u8 == 1); break; + default: assert_se(false); + } + } + assert_se(i == 7); + i = 0; + VA_ARGS_FOREACH(u8, 0) { + assert_se(u8 == 0); + assert_se(i++ == 0); + } + assert_se(i == 1); + i = 0; + VA_ARGS_FOREACH(u8, 0xff) { + assert_se(u8 == 0xff); + assert_se(i++ == 0); + } + assert_se(i == 1); + VA_ARGS_FOREACH(u8) + assert_se(false); + + i = 0; + uint32_t u32, u32_1 = 0xffff0000, u32_2 = 10, u32_3 = 0xffff; + VA_ARGS_FOREACH(u32, 1, 100, u32_2, 1000, u32_3, u32_1, 1, 0) { + switch(i++) { + case 0: assert_se(u32 == 1); break; + case 1: assert_se(u32 == 100); break; + case 2: assert_se(u32 == u32_2); break; + case 3: assert_se(u32 == 1000); break; + case 4: assert_se(u32 == u32_3); break; + case 5: assert_se(u32 == u32_1); break; + case 6: assert_se(u32 == 1); break; + case 7: assert_se(u32 == 0); break; + default: assert_se(false); + } + } + assert_se(i == 8); + i = 0; + VA_ARGS_FOREACH(u32, 0) { + assert_se(u32 == 0); + assert_se(i++ == 0); + } + assert_se(i == 1); + i = 0; + VA_ARGS_FOREACH(u32, 1000) { + assert_se(u32 == 1000); + assert_se(i++ == 0); + } + assert_se(i == 1); + VA_ARGS_FOREACH(u32) + assert_se(false); + + i = 0; + uint64_t u64, u64_1 = 0xffffffffffffffff, u64_2 = 50, u64_3 = 0xffff; + VA_ARGS_FOREACH(u64, 44, 0, u64_3, 100, u64_2, u64_1, 
50000) { + switch(i++) { + case 0: assert_se(u64 == 44); break; + case 1: assert_se(u64 == 0); break; + case 2: assert_se(u64 == u64_3); break; + case 3: assert_se(u64 == 100); break; + case 4: assert_se(u64 == u64_2); break; + case 5: assert_se(u64 == u64_1); break; + case 6: assert_se(u64 == 50000); break; + default: assert_se(false); + } + } + assert_se(i == 7); + i = 0; + VA_ARGS_FOREACH(u64, 0) { + assert_se(u64 == 0); + assert_se(i++ == 0); + } + assert_se(i == 1); + i = 0; + VA_ARGS_FOREACH(u64, 0xff00ff00000000) { + assert_se(u64 == 0xff00ff00000000); + assert_se(i++ == 0); + } + assert_se(i == 1); + VA_ARGS_FOREACH(u64) + assert_se(false); + + struct test { + int a; + char b; + }; + + i = 0; + struct test s, + s_1 = { .a = 0, .b = 'c', }, + s_2 = { .a = 100000, .b = 'z', }, + s_3 = { .a = 0xff, .b = 'q', }, + s_4 = { .a = 1, .b = 'x', }; + VA_ARGS_FOREACH(s, s_1, (struct test){ .a = 10, .b = 'd', }, s_2, (struct test){}, s_3, s_4) { + switch(i++) { + case 0: assert_se(s.a == 0 ); assert_se(s.b == 'c'); break; + case 1: assert_se(s.a == 10 ); assert_se(s.b == 'd'); break; + case 2: assert_se(s.a == 100000); assert_se(s.b == 'z'); break; + case 3: assert_se(s.a == 0 ); assert_se(s.b == 0 ); break; + case 4: assert_se(s.a == 0xff ); assert_se(s.b == 'q'); break; + case 5: assert_se(s.a == 1 ); assert_se(s.b == 'x'); break; + default: assert_se(false); + } + } + assert_se(i == 6); + i = 0; + VA_ARGS_FOREACH(s, (struct test){ .a = 1, .b = 'A', }) { + assert_se(s.a == 1); + assert_se(s.b == 'A'); + assert_se(i++ == 0); + } + assert_se(i == 1); + VA_ARGS_FOREACH(s) + assert_se(false); + + i = 0; + struct test *p, *p_1 = &s_1, *p_2 = &s_2, *p_3 = &s_3, *p_4 = &s_4; + VA_ARGS_FOREACH(p, p_1, NULL, p_2, p_3, NULL, p_4, NULL) { + switch(i++) { + case 0: assert_se(p == p_1); break; + case 1: assert_se(p == NULL); break; + case 2: assert_se(p == p_2); break; + case 3: assert_se(p == p_3); break; + case 4: assert_se(p == NULL); break; + case 5: assert_se(p == p_4); 
break; + case 6: assert_se(p == NULL); break; + default: assert_se(false); + } + } + assert_se(i == 7); + i = 0; + VA_ARGS_FOREACH(p, p_3) { + assert_se(p == p_3); + assert_se(i++ == 0); + } + assert_se(i == 1); + VA_ARGS_FOREACH(p) + assert_se(false); + + i = 0; + void *v, *v_1 = p_1, *v_2 = p_2, *v_3 = p_3; + uint32_t *u32p = &u32; + VA_ARGS_FOREACH(v, v_1, NULL, u32p, v_3, p_2, p_4, v_2, NULL) { + switch(i++) { + case 0: assert_se(v == v_1); break; + case 1: assert_se(v == NULL); break; + case 2: assert_se(v == u32p); break; + case 3: assert_se(v == v_3); break; + case 4: assert_se(v == p_2); break; + case 5: assert_se(v == p_4); break; + case 6: assert_se(v == v_2); break; + case 7: assert_se(v == NULL); break; + default: assert_se(false); + } + } + assert_se(i == 8); + i = 0; + VA_ARGS_FOREACH(v, NULL) { + assert_se(v == NULL); + assert_se(i++ == 0); + } + assert_se(i == 1); + i = 0; + VA_ARGS_FOREACH(v, v_1) { + assert_se(v == v_1); + assert_se(i++ == 0); + } + assert_se(i == 1); + VA_ARGS_FOREACH(v) + assert_se(false); +} + +TEST(ALIGN_TO) { + assert_se(ALIGN_TO(0, 1) == 0); + assert_se(ALIGN_TO(1, 1) == 1); + assert_se(ALIGN_TO(2, 1) == 2); + assert_se(ALIGN_TO(3, 1) == 3); + assert_se(ALIGN_TO(4, 1) == 4); + assert_se(ALIGN_TO(SIZE_MAX-1, 1) == SIZE_MAX-1); + assert_se(ALIGN_TO(SIZE_MAX, 1) == SIZE_MAX); + + assert_se(ALIGN_TO(0, 2) == 0); + assert_se(ALIGN_TO(1, 2) == 2); + assert_se(ALIGN_TO(2, 2) == 2); + assert_se(ALIGN_TO(3, 2) == 4); + assert_se(ALIGN_TO(4, 2) == 4); + assert_se(ALIGN_TO(SIZE_MAX-3, 2) == SIZE_MAX-3); + assert_se(ALIGN_TO(SIZE_MAX-2, 2) == SIZE_MAX-1); + assert_se(ALIGN_TO(SIZE_MAX-1, 2) == SIZE_MAX-1); + assert_se(ALIGN_TO(SIZE_MAX, 2) == SIZE_MAX); /* overflow */ + + assert_se(ALIGN_TO(0, 4) == 0); + assert_se(ALIGN_TO(1, 4) == 4); + assert_se(ALIGN_TO(2, 4) == 4); + assert_se(ALIGN_TO(3, 4) == 4); + assert_se(ALIGN_TO(4, 4) == 4); + assert_se(ALIGN_TO(SIZE_MAX-3, 4) == SIZE_MAX-3); + assert_se(ALIGN_TO(SIZE_MAX-2, 4) == SIZE_MAX); 
/* overflow */ + assert_se(ALIGN_TO(SIZE_MAX-1, 4) == SIZE_MAX); /* overflow */ + assert_se(ALIGN_TO(SIZE_MAX, 4) == SIZE_MAX); /* overflow */ + + assert_se(ALIGN_TO_U64(0, 1) == 0); + assert_se(ALIGN_TO_U64(1, 1) == 1); + assert_se(ALIGN_TO_U64(2, 1) == 2); + assert_se(ALIGN_TO_U64(3, 1) == 3); + assert_se(ALIGN_TO_U64(4, 1) == 4); + assert_se(ALIGN_TO_U64(UINT64_MAX-1, 1) == UINT64_MAX-1); + assert_se(ALIGN_TO_U64(UINT64_MAX, 1) == UINT64_MAX); + + assert_se(ALIGN_TO_U64(0, 2) == 0); + assert_se(ALIGN_TO_U64(1, 2) == 2); + assert_se(ALIGN_TO_U64(2, 2) == 2); + assert_se(ALIGN_TO_U64(3, 2) == 4); + assert_se(ALIGN_TO_U64(4, 2) == 4); + assert_se(ALIGN_TO_U64(UINT64_MAX-3, 2) == UINT64_MAX-3); + assert_se(ALIGN_TO_U64(UINT64_MAX-2, 2) == UINT64_MAX-1); + assert_se(ALIGN_TO_U64(UINT64_MAX-1, 2) == UINT64_MAX-1); + assert_se(ALIGN_TO_U64(UINT64_MAX, 2) == UINT64_MAX); /* overflow */ + + assert_se(ALIGN_TO_U64(0, 4) == 0); + assert_se(ALIGN_TO_U64(1, 4) == 4); + assert_se(ALIGN_TO_U64(2, 4) == 4); + assert_se(ALIGN_TO_U64(3, 4) == 4); + assert_se(ALIGN_TO_U64(4, 4) == 4); + assert_se(ALIGN_TO_U64(UINT64_MAX-3, 4) == UINT64_MAX-3); + assert_se(ALIGN_TO_U64(UINT64_MAX-2, 4) == UINT64_MAX); /* overflow */ + assert_se(ALIGN_TO_U64(UINT64_MAX-1, 4) == UINT64_MAX); /* overflow */ + assert_se(ALIGN_TO_U64(UINT64_MAX, 4) == UINT64_MAX); /* overflow */ + + assert_cc(CONST_ALIGN_TO(96, 512) == 512); + assert_cc(CONST_ALIGN_TO(511, 512) == 512); + assert_cc(CONST_ALIGN_TO(512, 512) == 512); + assert_cc(CONST_ALIGN_TO(513, 512) == 1024); + assert_cc(CONST_ALIGN_TO(sizeof(int), 64) == 64); + + assert_cc(__builtin_types_compatible_p(typeof(CONST_ALIGN_TO(4, 3)), void)); + assert_cc(__builtin_types_compatible_p(typeof(CONST_ALIGN_TO(SIZE_MAX, 512)), void)); +} + +TEST(align_down) { + assert_se(ALIGN_DOWN(0, 1) == 0); + assert_se(ALIGN_DOWN(1, 1) == 1); + assert_se(ALIGN_DOWN(2, 1) == 2); + assert_se(ALIGN_DOWN(3, 1) == 3); + assert_se(ALIGN_DOWN(4, 1) == 4); + 
assert_se(ALIGN_DOWN(SIZE_MAX-1, 1) == SIZE_MAX-1); + assert_se(ALIGN_DOWN(SIZE_MAX, 1) == SIZE_MAX); + + assert_se(ALIGN_DOWN(0, 2) == 0); + assert_se(ALIGN_DOWN(1, 2) == 0); + assert_se(ALIGN_DOWN(2, 2) == 2); + assert_se(ALIGN_DOWN(3, 2) == 2); + assert_se(ALIGN_DOWN(4, 2) == 4); + assert_se(ALIGN_DOWN(SIZE_MAX-1, 2) == SIZE_MAX-1); + assert_se(ALIGN_DOWN(SIZE_MAX, 2) == SIZE_MAX-1); + + assert_se(ALIGN_DOWN(0, 4) == 0); + assert_se(ALIGN_DOWN(1, 4) == 0); + assert_se(ALIGN_DOWN(2, 4) == 0); + assert_se(ALIGN_DOWN(3, 4) == 0); + assert_se(ALIGN_DOWN(4, 4) == 4); + assert_se(ALIGN_DOWN(SIZE_MAX-1, 4) == SIZE_MAX-3); + assert_se(ALIGN_DOWN(SIZE_MAX, 4) == SIZE_MAX-3); + + assert_se(ALIGN_DOWN_U64(0, 1) == 0); + assert_se(ALIGN_DOWN_U64(1, 1) == 1); + assert_se(ALIGN_DOWN_U64(2, 1) == 2); + assert_se(ALIGN_DOWN_U64(3, 1) == 3); + assert_se(ALIGN_DOWN_U64(4, 1) == 4); + assert_se(ALIGN_DOWN_U64(UINT64_MAX-1, 1) == UINT64_MAX-1); + assert_se(ALIGN_DOWN_U64(UINT64_MAX, 1) == UINT64_MAX); + + assert_se(ALIGN_DOWN_U64(0, 2) == 0); + assert_se(ALIGN_DOWN_U64(1, 2) == 0); + assert_se(ALIGN_DOWN_U64(2, 2) == 2); + assert_se(ALIGN_DOWN_U64(3, 2) == 2); + assert_se(ALIGN_DOWN_U64(4, 2) == 4); + assert_se(ALIGN_DOWN_U64(UINT64_MAX-1, 2) == UINT64_MAX-1); + assert_se(ALIGN_DOWN_U64(UINT64_MAX, 2) == UINT64_MAX-1); + + assert_se(ALIGN_DOWN_U64(0, 4) == 0); + assert_se(ALIGN_DOWN_U64(1, 4) == 0); + assert_se(ALIGN_DOWN_U64(2, 4) == 0); + assert_se(ALIGN_DOWN_U64(3, 4) == 0); + assert_se(ALIGN_DOWN_U64(4, 4) == 4); + assert_se(ALIGN_DOWN_U64(UINT64_MAX-1, 4) == UINT64_MAX-3); + assert_se(ALIGN_DOWN_U64(UINT64_MAX, 4) == UINT64_MAX-3); +} + +TEST(align_offset) { + assert_se(ALIGN_OFFSET(0, 1) == 0); + assert_se(ALIGN_OFFSET(1, 1) == 0); + assert_se(ALIGN_OFFSET(2, 1) == 0); + assert_se(ALIGN_OFFSET(3, 1) == 0); + assert_se(ALIGN_OFFSET(4, 1) == 0); + assert_se(ALIGN_OFFSET(SIZE_MAX-1, 1) == 0); + assert_se(ALIGN_OFFSET(SIZE_MAX, 1) == 0); + + assert_se(ALIGN_OFFSET(0, 2) == 0); + 
assert_se(ALIGN_OFFSET(1, 2) == 1); + assert_se(ALIGN_OFFSET(2, 2) == 0); + assert_se(ALIGN_OFFSET(3, 2) == 1); + assert_se(ALIGN_OFFSET(4, 2) == 0); + assert_se(ALIGN_OFFSET(SIZE_MAX-1, 2) == 0); + assert_se(ALIGN_OFFSET(SIZE_MAX, 2) == 1); + + assert_se(ALIGN_OFFSET(0, 4) == 0); + assert_se(ALIGN_OFFSET(1, 4) == 1); + assert_se(ALIGN_OFFSET(2, 4) == 2); + assert_se(ALIGN_OFFSET(3, 4) == 3); + assert_se(ALIGN_OFFSET(4, 4) == 0); + assert_se(ALIGN_OFFSET(SIZE_MAX-1, 4) == 2); + assert_se(ALIGN_OFFSET(SIZE_MAX, 4) == 3); + + assert_se(ALIGN_OFFSET_U64(0, 1) == 0); + assert_se(ALIGN_OFFSET_U64(1, 1) == 0); + assert_se(ALIGN_OFFSET_U64(2, 1) == 0); + assert_se(ALIGN_OFFSET_U64(3, 1) == 0); + assert_se(ALIGN_OFFSET_U64(4, 1) == 0); + assert_se(ALIGN_OFFSET_U64(UINT64_MAX-1, 1) == 0); + assert_se(ALIGN_OFFSET_U64(UINT64_MAX, 1) == 0); + + assert_se(ALIGN_OFFSET_U64(0, 2) == 0); + assert_se(ALIGN_OFFSET_U64(1, 2) == 1); + assert_se(ALIGN_OFFSET_U64(2, 2) == 0); + assert_se(ALIGN_OFFSET_U64(3, 2) == 1); + assert_se(ALIGN_OFFSET_U64(4, 2) == 0); + assert_se(ALIGN_OFFSET_U64(UINT64_MAX-1, 2) == 0); + assert_se(ALIGN_OFFSET_U64(UINT64_MAX, 2) == 1); + + assert_se(ALIGN_OFFSET_U64(0, 4) == 0); + assert_se(ALIGN_OFFSET_U64(1, 4) == 1); + assert_se(ALIGN_OFFSET_U64(2, 4) == 2); + assert_se(ALIGN_OFFSET_U64(3, 4) == 3); + assert_se(ALIGN_OFFSET_U64(4, 4) == 0); + assert_se(ALIGN_OFFSET_U64(UINT64_MAX-1, 4) == 2); + assert_se(ALIGN_OFFSET_U64(UINT64_MAX, 4) == 3); +} + +TEST(flags) { + enum { + F1 = 1 << 0, + F2 = 1 << 1, + F3 = 1 << 2, + F_ALL = F1 | F2 | F3 + }; + unsigned n, f; + + assert_se(FLAGS_SET(0, 0)); + assert_se(FLAGS_SET(F1, F1)); + assert_se(FLAGS_SET(F1 | F2, F1)); + assert_se(FLAGS_SET(F1 | F3, F1 | F3)); + assert_se(FLAGS_SET(F1 | F2 | F3, F_ALL)); + assert_se(!FLAGS_SET(0, F1)); + assert_se(!FLAGS_SET(F2, F1)); + assert_se(!FLAGS_SET(F1 | F2, F3)); + assert_se(!FLAGS_SET(F1 | F2, F1 | F3)); + assert_se(!FLAGS_SET(F1 | F2 | F3, ~F_ALL)); + + /* Check for no 
double eval. */ + n = F2; + f = F1; + assert_se(!FLAGS_SET(--n, ++f)); + assert_se(n == F1); + assert_se(f == F2); + + SET_FLAG(n, F3, true); + assert_se(n == (F1 | F3)); + SET_FLAG(n, F2, false); + assert_se(n == (F1 | F3)); + SET_FLAG(n, F3, false); + assert_se(n == F1); + SET_FLAG(n, F1, true); + assert_se(n == F1); + SET_FLAG(n, F1 | F3, true); + assert_se(n == (F1 | F3)); + SET_FLAG(n, F_ALL, false); + assert_se(n == 0); + + assert_se(UPDATE_FLAG(0, 0, true) == 0); + assert_se(UPDATE_FLAG(0, F1, true) == F1); + assert_se(UPDATE_FLAG(0, F1 | F2, true) == (F1 | F2)); + assert_se(UPDATE_FLAG(F1, 0, true) == F1); + assert_se(UPDATE_FLAG(F1, F1, true) == F1); + assert_se(UPDATE_FLAG(F1, F3, true) == (F1 | F3)); + assert_se(UPDATE_FLAG(F1, F1 | F3, true) == (F1 | F3)); + assert_se(UPDATE_FLAG(F1, F_ALL, true) == F_ALL); + assert_se(UPDATE_FLAG(0, 0, false) == 0); + assert_se(UPDATE_FLAG(0, F1, false) == 0); + assert_se(UPDATE_FLAG(0, F1 | F2, false) == 0); + assert_se(UPDATE_FLAG(F1, 0, false) == F1); + assert_se(UPDATE_FLAG(F1, F1, false) == 0); + assert_se(UPDATE_FLAG(F1, F3, false) == F1); + assert_se(UPDATE_FLAG(F1, F1 | F3, false) == 0); + assert_se(UPDATE_FLAG(F1, F2 | F3, false) == F1); + assert_se(UPDATE_FLAG(F1, F_ALL, false) == 0); + assert_se(UPDATE_FLAG(F_ALL, F_ALL, false) == 0); + + /* Check for no double eval. 
*/ + n = F2; + f = F1; + assert_se(UPDATE_FLAG(--n, ++f, true) == (F1 | F2)); + assert_se(n == F1); + assert_se(f == F2); +} + +TEST(DECIMAL_STR_WIDTH) { + assert_se(DECIMAL_STR_WIDTH(0) == 1); + assert_se(DECIMAL_STR_WIDTH(1) == 1); + assert_se(DECIMAL_STR_WIDTH(2) == 1); + assert_se(DECIMAL_STR_WIDTH(9) == 1); + assert_se(DECIMAL_STR_WIDTH(10) == 2); + assert_se(DECIMAL_STR_WIDTH(11) == 2); + assert_se(DECIMAL_STR_WIDTH(99) == 2); + assert_se(DECIMAL_STR_WIDTH(100) == 3); + assert_se(DECIMAL_STR_WIDTH(101) == 3); + assert_se(DECIMAL_STR_WIDTH(-1) == 2); + assert_se(DECIMAL_STR_WIDTH(-2) == 2); + assert_se(DECIMAL_STR_WIDTH(-9) == 2); + assert_se(DECIMAL_STR_WIDTH(-10) == 3); + assert_se(DECIMAL_STR_WIDTH(-11) == 3); + assert_se(DECIMAL_STR_WIDTH(-99) == 3); + assert_se(DECIMAL_STR_WIDTH(-100) == 4); + assert_se(DECIMAL_STR_WIDTH(-101) == 4); + assert_se(DECIMAL_STR_WIDTH(UINT64_MAX) == STRLEN("18446744073709551615")); + assert_se(DECIMAL_STR_WIDTH(INT64_MAX) == STRLEN("9223372036854775807")); + assert_se(DECIMAL_STR_WIDTH(INT64_MIN) == STRLEN("-9223372036854775808")); +} + +TEST(DECIMAL_STR_MAX) { + int8_t s8_longest = INT8_MIN; + int16_t s16_longest = INT16_MIN; + int32_t s32_longest = INT32_MIN; + int64_t s64_longest = INT64_MIN; + uint8_t u8_longest = UINT8_MAX; + uint16_t u16_longest = UINT16_MAX; + uint32_t u32_longest = UINT32_MAX; + uint64_t u64_longest = UINT64_MAX; + + /* NB: Always add +1, because DECIMAL_STR_MAX() includes space for trailing NUL byte, but + * DECIMAL_STR_WIDTH() does not! 
*/ + assert_se(DECIMAL_STR_MAX(int8_t) == DECIMAL_STR_WIDTH(s8_longest)+1); + assert_se(DECIMAL_STR_MAX(int16_t) == DECIMAL_STR_WIDTH(s16_longest)+1); + assert_se(DECIMAL_STR_MAX(int32_t) == DECIMAL_STR_WIDTH(s32_longest)+1); + assert_se(DECIMAL_STR_MAX(int64_t) == DECIMAL_STR_WIDTH(s64_longest)+1); + + assert_se(DECIMAL_STR_MAX(uint8_t) == DECIMAL_STR_WIDTH(u8_longest)+1); + assert_se(DECIMAL_STR_MAX(uint16_t) == DECIMAL_STR_WIDTH(u16_longest)+1); + assert_se(DECIMAL_STR_MAX(uint32_t) == DECIMAL_STR_WIDTH(u32_longest)+1); + assert_se(DECIMAL_STR_MAX(uint64_t) == DECIMAL_STR_WIDTH(u64_longest)+1); +} + +TEST(PTR_SUB1) { + static const uint64_t x[4] = { 2, 3, 4, 5 }; + const uint64_t *p; + + p = x + ELEMENTSOF(x)-1; + assert_se(*p == 5); + + p = PTR_SUB1(p, x); + assert_se(*p == 4); + + p = PTR_SUB1(p, x); + assert_se(*p == 3); + + p = PTR_SUB1(p, x); + assert_se(*p == 2); + + p = PTR_SUB1(p, x); + assert_se(!p); + + p = PTR_SUB1(p, x); + assert_se(!p); +} + +TEST(ISPOWEROF2) { + uint64_t u; + int64_t i; + + /* First, test constant expressions */ + assert_se(!ISPOWEROF2(-2)); + assert_se(!ISPOWEROF2(-1)); + assert_se(!ISPOWEROF2(0)); + assert_se(ISPOWEROF2(1)); + assert_se(ISPOWEROF2(2)); + assert_se(!ISPOWEROF2(3)); + assert_se(ISPOWEROF2(4)); + assert_se(!ISPOWEROF2(5)); + assert_se(!ISPOWEROF2(6)); + assert_se(!ISPOWEROF2(7)); + assert_se(ISPOWEROF2(8)); + assert_se(!ISPOWEROF2(9)); + assert_se(!ISPOWEROF2(1022)); + assert_se(ISPOWEROF2(1024)); + assert_se(!ISPOWEROF2(1025)); + assert_se(!ISPOWEROF2(UINT64_C(0xffffffff))); + assert_se(ISPOWEROF2(UINT64_C(0x100000000))); + assert_se(!ISPOWEROF2(UINT64_C(0x100000001))); + + /* Then, test dynamic expressions, and if they are side-effect free */ + i = -2; + assert_se(!ISPOWEROF2(i++)); + assert_se(i == -1); + assert_se(!ISPOWEROF2(i++)); + assert_se(i == 0); + assert_se(!ISPOWEROF2(i++)); + assert_se(i == 1); + assert_se(ISPOWEROF2(i++)); + assert_se(i == 2); + assert_se(ISPOWEROF2(i++)); + assert_se(i == 3); + 
assert_se(!ISPOWEROF2(i++)); + assert_se(i == 4); + assert_se(ISPOWEROF2(i++)); + assert_se(i == 5); + assert_se(!ISPOWEROF2(i)); + + u = 0; + assert_se(!ISPOWEROF2(u++)); + assert_se(u == 1); + assert_se(ISPOWEROF2(u++)); + assert_se(u == 2); + assert_se(ISPOWEROF2(u++)); + assert_se(u == 3); + assert_se(!ISPOWEROF2(u++)); + assert_se(u == 4); + assert_se(ISPOWEROF2(u++)); + assert_se(u == 5); + assert_se(!ISPOWEROF2(u)); +} + +TEST(ALIGNED) { + assert_se(IS_ALIGNED16(NULL)); + assert_se(IS_ALIGNED32(NULL)); + assert_se(IS_ALIGNED64(NULL)); + + uint64_t u64; + uint32_t u32; + uint16_t u16; + + assert_se(IS_ALIGNED16(&u16)); + assert_se(IS_ALIGNED16(&u32)); + assert_se(IS_ALIGNED16(&u64)); + assert_se(IS_ALIGNED32(&u32)); + assert_se(IS_ALIGNED32(&u64)); + assert_se(IS_ALIGNED64(&u64)); + + _align_(32) uint8_t ua256; + _align_(8) uint8_t ua64; + _align_(4) uint8_t ua32; + _align_(2) uint8_t ua16; + + assert_se(IS_ALIGNED16(&ua256)); + assert_se(IS_ALIGNED32(&ua256)); + assert_se(IS_ALIGNED64(&ua256)); + + assert_se(IS_ALIGNED16(&ua64)); + assert_se(IS_ALIGNED32(&ua64)); + assert_se(IS_ALIGNED64(&ua64)); + + assert_se(IS_ALIGNED16(&ua32)); + assert_se(IS_ALIGNED32(&ua32)); + + assert_se(IS_ALIGNED16(&ua16)); + +#ifdef __x86_64__ + /* Conditionalized on x86-64, since there we know for sure that all three types are aligned to + * their size. 
Too lazy to figure it out for other archs */ + void *p = UINT_TO_PTR(1); /* definitely not aligned */ + assert_se(!IS_ALIGNED16(p)); + assert_se(!IS_ALIGNED32(p)); + assert_se(!IS_ALIGNED64(p)); + + assert_se(IS_ALIGNED16(ALIGN2_PTR(p))); + assert_se(IS_ALIGNED32(ALIGN4_PTR(p))); + assert_se(IS_ALIGNED64(ALIGN8_PTR(p))); + + p = UINT_TO_PTR(-1); /* also definitely not aligned */ + assert_se(!IS_ALIGNED16(p)); + assert_se(!IS_ALIGNED32(p)); + assert_se(!IS_ALIGNED64(p)); +#endif +} + +TEST(FOREACH_ARRAY) { + int a[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + int b[10] = { 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + int x, n; + + x = n = 0; + FOREACH_ARRAY(i, a, 10) { + x += *i; + n++; + } + assert_se(x == 45); + assert_se(n == 10); + + x = n = 0; + FOREACH_ARRAY(i, a, 10) + FOREACH_ARRAY(j, b, 10) { + x += (*i) * (*j); + n++; + } + assert_se(x == 45 * 45); + assert_se(n == 10 * 10); + + x = n = 0; + FOREACH_ARRAY(i, a, 5) + FOREACH_ARRAY(j, b, 5) { + x += (*i) * (*j); + n++; + } + assert_se(x == 10 * 35); + assert_se(n == 5 * 5); + + x = n = 0; + FOREACH_ARRAY(i, a, 0) + FOREACH_ARRAY(j, b, 0) { + x += (*i) * (*j); + n++; + } + assert_se(x == 0); + assert_se(n == 0); + + x = n = 0; + FOREACH_ARRAY(i, a, -1) + FOREACH_ARRAY(j, b, -1) { + x += (*i) * (*j); + n++; + } + assert_se(x == 0); + assert_se(n == 0); +} + +#define TEST_ROUND_UP_BY_TYPE(type, max_value) \ + ({ \ + type x, y; \ + x = 0, y = 1; \ + assert_se(ROUND_UP(x, y) == 0); \ + x = 0, y = 2; \ + assert_se(ROUND_UP(x, y) == 0); \ + x = 0, y = 3; \ + assert_se(ROUND_UP(x, y) == 0); \ + x = 0, y = 4; \ + assert_se(ROUND_UP(x, y) == 0); \ + x = 1, y = 1; \ + assert_se(ROUND_UP(x, y) == 1); \ + x = 1, y = 2; \ + assert_se(ROUND_UP(x, y) == 2); \ + x = 1, y = 3; \ + assert_se(ROUND_UP(x, y) == 3); \ + x = 1, y = 4; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = 2, y = 1; \ + assert_se(ROUND_UP(x, y) == 2); \ + x = 2, y = 2; \ + assert_se(ROUND_UP(x, y) == 2); \ + x = 2, y = 3; \ + assert_se(ROUND_UP(x, y) == 3); \ + x = 2, y = 
4; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = 3, y = 1; \ + assert_se(ROUND_UP(x, y) == 3); \ + x = 3, y = 2; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = 3, y = 3; \ + assert_se(ROUND_UP(x, y) == 3); \ + x = 3, y = 4; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = 4, y = 1; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = 4, y = 2; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = 4, y = 3; \ + assert_se(ROUND_UP(x, y) == 6); \ + x = 4, y = 4; \ + assert_se(ROUND_UP(x, y) == 4); \ + x = max_value, y = 1; \ + assert_se(ROUND_UP(x, y) == max_value); \ + x = max_value, y = 2; \ + assert_se(ROUND_UP(x, y) == max_value); \ + x = max_value, y = 3; \ + assert_se(ROUND_UP(x, y) == max_value); \ + x = max_value, y = 4; \ + assert_se(ROUND_UP(x, y) == max_value); \ + x = max_value-1, y = 1; \ + assert_se(ROUND_UP(x, y) == max_value-1); \ + x = max_value-1, y = 2; \ + assert_se(ROUND_UP(x, y) == max_value-1); \ + x = max_value-1, y = 4; \ + assert_se(ROUND_UP(x, y) == max_value); \ + }) + +TEST(ROUND_UP) { + TEST_ROUND_UP_BY_TYPE(uint8_t, UINT8_MAX); + TEST_ROUND_UP_BY_TYPE(uint16_t, UINT16_MAX); + TEST_ROUND_UP_BY_TYPE(uint32_t, UINT32_MAX); + TEST_ROUND_UP_BY_TYPE(uint64_t, UINT64_MAX); +} + +TEST(u64_multiply_safe) { + assert_se(u64_multiply_safe(0, 0) == 0); + assert_se(u64_multiply_safe(10, 0) == 0); + assert_se(u64_multiply_safe(0, 10) == 0); + assert_se(u64_multiply_safe(10, 10) == 100); + + assert_se(u64_multiply_safe(UINT64_MAX, 0) == 0); + assert_se(u64_multiply_safe(UINT64_MAX, 1) == UINT64_MAX); + assert_se(u64_multiply_safe(UINT64_MAX, 2) == 0); + assert_se(u64_multiply_safe(0, UINT64_MAX) == 0); + assert_se(u64_multiply_safe(1, UINT64_MAX) == UINT64_MAX); + assert_se(u64_multiply_safe(2, UINT64_MAX) == 0); + + assert_se(u64_multiply_safe(UINT64_MAX / 2, 0) == 0); + assert_se(u64_multiply_safe(UINT64_MAX / 2, 1) == UINT64_MAX / 2); + assert_se(u64_multiply_safe(UINT64_MAX / 2, 2) == UINT64_MAX - 1); + assert_se(u64_multiply_safe(UINT64_MAX / 2, 3) == 0); + 
assert_se(u64_multiply_safe(0, UINT64_MAX / 2) == 0); + assert_se(u64_multiply_safe(1, UINT64_MAX / 2) == UINT64_MAX / 2); + assert_se(u64_multiply_safe(2, UINT64_MAX / 2) == UINT64_MAX - 1); + assert_se(u64_multiply_safe(3, UINT64_MAX / 2) == 0); + + assert_se(u64_multiply_safe(UINT64_MAX, UINT64_MAX) == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-manager.c b/src/test/test-manager.c new file mode 100644 index 0000000..76e094b --- /dev/null +++ b/src/test/test-manager.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "manager.h" +#include "tests.h" + +TEST(manager_taint_string) { /* A zero-initialized Manager must still yield a taint string; "cgroupsv1" must appear in it exactly when cg_all_unified() returns 0 */ + Manager m = {}; + + _cleanup_free_ char *a = manager_taint_string(&m); + assert_se(a); + log_debug("taint string: '%s'", a); + + if (cg_all_unified() == 0) + assert_se(strstr(a, "cgroupsv1")); + else + assert_se(!strstr(a, "cgroupsv1")); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-math-util.c b/src/test/test-math-util.c new file mode 100644 index 0000000..9771576 --- /dev/null +++ b/src/test/test-math-util.c @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "math-util.h" +#include "tests.h" + +TEST(iszero_safe) { /* iszero_safe() must accept every spelling of zero and every result that underflows to zero, and reject subnormals, NaN and infinities */ + /* zeros */ + assert_se(iszero_safe(0.0)); + assert_se(iszero_safe(-0.0)); + assert_se(iszero_safe(0e0)); + assert_se(iszero_safe(-0e0)); + assert_se(iszero_safe(0e+0)); + assert_se(iszero_safe(0e-0)); + assert_se(iszero_safe(-0e-0)); + assert_se(iszero_safe(-0e000)); + assert_se(iszero_safe(0e000)); + + /* non-zero normal values */ + assert_se(!iszero_safe(42.0)); + assert_se(!iszero_safe(M_PI)); + assert_se(!iszero_safe(DBL_MAX)); + assert_se(!iszero_safe(-DBL_MAX)); + assert_se(!iszero_safe(DBL_MIN)); + assert_se(!iszero_safe(-DBL_MIN)); + assert_se(!iszero_safe(1 / DBL_MAX)); + + /* subnormal values */ + assert_se(!iszero_safe(DBL_MIN / 2)); + assert_se(!iszero_safe(-DBL_MIN / 42)); + assert_se(!iszero_safe(1 / DBL_MAX / 2)); + + /* too small values which cannot
be in subnormal form */ + assert_se( iszero_safe(DBL_MIN / DBL_MAX)); + assert_se( iszero_safe(DBL_MIN / -DBL_MAX)); + assert_se( iszero_safe(-DBL_MIN / DBL_MAX)); + assert_se( iszero_safe(-DBL_MIN / -DBL_MAX)); + + /* NaN or infinity */ + assert_se(!iszero_safe(NAN)); + assert_se(!iszero_safe(INFINITY)); + assert_se(!iszero_safe(-INFINITY)); + assert_se(!iszero_safe(1 / NAN)); + + /* inverse of infinity */ + assert_se( iszero_safe(1 / INFINITY)); + assert_se( iszero_safe(1 / -INFINITY)); + assert_se( iszero_safe(-1 / INFINITY)); + assert_se( iszero_safe(-1 / -INFINITY)); + assert_se( iszero_safe(42 / -INFINITY)); + assert_se( iszero_safe(-42 / -INFINITY)); + assert_se( iszero_safe(DBL_MIN / INFINITY)); + assert_se( iszero_safe(DBL_MIN / -INFINITY)); + assert_se( iszero_safe(DBL_MAX / INFINITY / 2)); + assert_se( iszero_safe(DBL_MAX / -INFINITY * DBL_MAX)); + + /* infinity / infinity is NaN */ + assert_se(!iszero_safe(INFINITY / INFINITY)); + assert_se(!iszero_safe(INFINITY * 2 / INFINITY)); + assert_se(!iszero_safe(INFINITY / DBL_MAX / INFINITY)); +} + +TEST(fp_equal) { /* fp_equal(): signed zeros and underflowed results compare equal; NaN and infinities compare unequal to everything, themselves included */ + /* normal values */ + assert_se( fp_equal(0.0, -0e0)); + assert_se( fp_equal(3.0, 3)); + assert_se(!fp_equal(3.000001, 3)); + assert_se( fp_equal(M_PI, M_PI)); + assert_se(!fp_equal(M_PI, -M_PI)); + assert_se( fp_equal(DBL_MAX, DBL_MAX)); + assert_se(!fp_equal(DBL_MAX, -DBL_MAX)); + assert_se(!fp_equal(-DBL_MAX, DBL_MAX)); + assert_se( fp_equal(-DBL_MAX, -DBL_MAX)); + assert_se( fp_equal(DBL_MIN, DBL_MIN)); + assert_se(!fp_equal(DBL_MIN, -DBL_MIN)); + assert_se(!fp_equal(-DBL_MIN, DBL_MIN)); + assert_se( fp_equal(-DBL_MIN, -DBL_MIN)); + + /* subnormal values */ + assert_se( fp_equal(DBL_MIN / 10, DBL_MIN / 10)); + assert_se(!fp_equal(DBL_MIN / 10, -DBL_MIN / 10)); + assert_se(!fp_equal(-DBL_MIN / 10, DBL_MIN / 10)); + assert_se( fp_equal(-DBL_MIN / 10, -DBL_MIN / 10)); + assert_se(!fp_equal(DBL_MIN / 10, DBL_MIN / 15)); + assert_se(!fp_equal(DBL_MIN / 15, DBL_MIN / 10)); /* same pair with the arguments swapped */ + + /* subnormal
difference */ + assert_se(!fp_equal(DBL_MIN / 10, DBL_MIN + DBL_MIN / 10)); + assert_se( fp_equal(3.0, 3.0 + DBL_MIN / 2)); /* 3.0 + DBL_MIN / 2 is truncated to 3.0 */ + + /* too small values */ + assert_se( fp_equal(DBL_MIN / DBL_MAX, -DBL_MIN / DBL_MAX)); + + /* NaN or infinity */ + assert_se(!fp_equal(NAN, NAN)); + assert_se(!fp_equal(NAN, 0)); + assert_se(!fp_equal(NAN, INFINITY)); + assert_se(!fp_equal(INFINITY, INFINITY)); + assert_se(!fp_equal(INFINITY, -INFINITY)); + assert_se(!fp_equal(-INFINITY, INFINITY)); + assert_se(!fp_equal(-INFINITY, -INFINITY)); + + /* inverse of infinity */ + assert_se( fp_equal(0, 1 / INFINITY)); + assert_se( fp_equal(42 / INFINITY, 1 / -INFINITY)); + assert_se(!fp_equal(42 / INFINITY, INFINITY / INFINITY)); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-memfd-util.c b/src/test/test-memfd-util.c new file mode 100644 index 0000000..f8e1b46 --- /dev/null +++ b/src/test/test-memfd-util.c @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "errno-util.h" +#include "fd-util.h" +#include "memfd-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(memfd_get_sealed) { /* A fresh memfd must report "not sealed", and "sealed" once memfd_set_sealed() succeeded; returns early if memfd_new() fails with a not-supported error */ +#define TEST_TEXT "this is some random test text we are going to write to a memfd" + _cleanup_close_ int fd = -EBADF; + + fd = memfd_new("test-memfd-get-sealed"); + if (fd < 0) { + assert_se(ERRNO_IS_NOT_SUPPORTED(fd)); + return; + } + + assert_se(write(fd, TEST_TEXT, strlen(TEST_TEXT)) == strlen(TEST_TEXT)); + /* The file offset is left at the end of the memfd; that does not matter here, since only the + * sealing state is checked below. */ + + assert_se(memfd_get_sealed(fd) == 0); + assert_se(memfd_set_sealed(fd) >= 0); + assert_se(memfd_get_sealed(fd) > 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-memory-util.c b/src/test/test-memory-util.c new file mode 100644 index 0000000..cd4b64a --- /dev/null +++ b/src/test/test-memory-util.c @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier:
LGPL-2.1-or-later */ + +#include "memory-util.h" +#include "tests.h" + +TEST(eqzero) { /* eqzero() must be true only for the all-zero array; 'longer' has 56 bytes of which only the last one is non-zero */ + const uint32_t zeros[] = {0, 0, 0}; + const uint32_t ones[] = {1, 1}; + const uint32_t mixed[] = {0, 1, 0, 0, 0}; + const uint8_t longer[] = {[55] = 255}; + + assert_se(eqzero(zeros)); + assert_se(!eqzero(ones)); + assert_se(!eqzero(mixed)); + assert_se(!eqzero(longer)); +} + +static void my_destructor(struct iovec *iov, size_t n) { + /* not really a destructor, just something we can use to check if the destruction worked */ + memset(iov, 'y', sizeof(struct iovec) * n); +} + +TEST(cleanup_array) { /* CLEANUP_ARRAY() must leave the array alone inside the scope; on scope exit the destructor has run and iov/n read back as NULL/0. The buffer is still read and freed through saved_iov afterwards. */ + struct iovec *iov, *saved_iov; + size_t n, saved_n; + + n = 7; + iov = new(struct iovec, n); + assert_se(iov); + + memset(iov, 'x', sizeof(struct iovec) * n); + + saved_iov = iov; + saved_n = n; + + { + assert_se(memeqbyte('x', saved_iov, sizeof(struct iovec) * saved_n)); + assert_se(iov); + assert_se(n > 0); + + CLEANUP_ARRAY(iov, n, my_destructor); + + assert_se(memeqbyte('x', saved_iov, sizeof(struct iovec) * saved_n)); + assert_se(iov); + assert_se(n > 0); + } + + assert_se(memeqbyte('y', saved_iov, sizeof(struct iovec) * saved_n)); + assert_se(!iov); + assert_se(n == 0); + + free(saved_iov); +} + +TEST(page_align) { /* Checks the PAGE_ALIGN, PAGE_ALIGN_DOWN and PAGE_OFFSET macros and their _U64 variants around one page, around 123 pages and at the top of the value range */ + assert_se(PAGE_ALIGN(page_size() - 1) == page_size()); + assert_se(PAGE_ALIGN(page_size() ) == page_size()); + assert_se(PAGE_ALIGN(page_size() + 1) == page_size() * 2); + assert_se(PAGE_ALIGN(page_size() * 123 - 1) == page_size() * 123); + assert_se(PAGE_ALIGN(page_size() * 123 ) == page_size() * 123); + assert_se(PAGE_ALIGN(page_size() * 123 + 1) == page_size() * 124); + assert_se(PAGE_ALIGN(SIZE_MAX - page_size() - 1) == SIZE_MAX - page_size() + 1); + assert_se(PAGE_ALIGN(SIZE_MAX - page_size() ) == SIZE_MAX - page_size() + 1); + assert_se(PAGE_ALIGN(SIZE_MAX - page_size() + 1) == SIZE_MAX - page_size() + 1); + assert_se(PAGE_ALIGN(SIZE_MAX - page_size() + 2) == SIZE_MAX); /* overflow */ + assert_se(PAGE_ALIGN(SIZE_MAX) == SIZE_MAX); /* overflow */ + +
assert_se(PAGE_ALIGN_U64(page_size() - 1) == page_size()); + assert_se(PAGE_ALIGN_U64(page_size() ) == page_size()); + assert_se(PAGE_ALIGN_U64(page_size() + 1) == page_size() * 2); + assert_se(PAGE_ALIGN_U64(page_size() * 123 - 1) == page_size() * 123); + assert_se(PAGE_ALIGN_U64(page_size() * 123 ) == page_size() * 123); + assert_se(PAGE_ALIGN_U64(page_size() * 123 + 1) == page_size() * 124); + assert_se(PAGE_ALIGN_U64(UINT64_MAX - page_size() - 1) == UINT64_MAX - page_size() + 1); + assert_se(PAGE_ALIGN_U64(UINT64_MAX - page_size() ) == UINT64_MAX - page_size() + 1); + assert_se(PAGE_ALIGN_U64(UINT64_MAX - page_size() + 1) == UINT64_MAX - page_size() + 1); + assert_se(PAGE_ALIGN_U64(UINT64_MAX - page_size() + 2) == UINT64_MAX); /* overflow */ + assert_se(PAGE_ALIGN_U64(UINT64_MAX) == UINT64_MAX); /* overflow */ + + assert_se(PAGE_ALIGN_DOWN(page_size() - 1) == 0); + assert_se(PAGE_ALIGN_DOWN(page_size() ) == page_size()); + assert_se(PAGE_ALIGN_DOWN(page_size() + 1) == page_size()); + assert_se(PAGE_ALIGN_DOWN(page_size() * 123 - 1) == page_size() * 122); + assert_se(PAGE_ALIGN_DOWN(page_size() * 123 ) == page_size() * 123); + assert_se(PAGE_ALIGN_DOWN(page_size() * 123 + 1) == page_size() * 123); + assert_se(PAGE_ALIGN_DOWN(SIZE_MAX - page_size() - 1) == SIZE_MAX - page_size() * 2 + 1); + assert_se(PAGE_ALIGN_DOWN(SIZE_MAX - page_size() ) == SIZE_MAX - page_size() * 2 + 1); + assert_se(PAGE_ALIGN_DOWN(SIZE_MAX - page_size() + 1) == SIZE_MAX - page_size() + 1); + assert_se(PAGE_ALIGN_DOWN(SIZE_MAX - page_size() + 2) == SIZE_MAX - page_size() + 1); + + assert_se(PAGE_ALIGN_DOWN_U64(page_size() - 1) == 0); + assert_se(PAGE_ALIGN_DOWN_U64(page_size() ) == page_size()); + assert_se(PAGE_ALIGN_DOWN_U64(page_size() + 1) == page_size()); + assert_se(PAGE_ALIGN_DOWN_U64(page_size() * 123 - 1) == page_size() * 122); + assert_se(PAGE_ALIGN_DOWN_U64(page_size() * 123 ) == page_size() * 123); + assert_se(PAGE_ALIGN_DOWN_U64(page_size() * 123 + 1) == page_size() * 123); +
assert_se(PAGE_ALIGN_DOWN_U64(UINT64_MAX - page_size() - 1) == UINT64_MAX - page_size() * 2 + 1); /* as in the other _U64 sections, probe the top of the 64-bit range rather than SIZE_MAX */ + assert_se(PAGE_ALIGN_DOWN_U64(UINT64_MAX - page_size() ) == UINT64_MAX - page_size() * 2 + 1); + assert_se(PAGE_ALIGN_DOWN_U64(UINT64_MAX - page_size() + 1) == UINT64_MAX - page_size() + 1); + assert_se(PAGE_ALIGN_DOWN_U64(UINT64_MAX - page_size() + 2) == UINT64_MAX - page_size() + 1); + + assert_se(PAGE_OFFSET(page_size() - 1) == page_size() - 1); + assert_se(PAGE_OFFSET(page_size() ) == 0); + assert_se(PAGE_OFFSET(page_size() + 1) == 1); + assert_se(PAGE_OFFSET(page_size() * 123 - 1) == page_size() - 1); + assert_se(PAGE_OFFSET(page_size() * 123 ) == 0); + assert_se(PAGE_OFFSET(page_size() * 123 + 1) == 1); + assert_se(PAGE_OFFSET(SIZE_MAX - page_size() - 1) == page_size() - 2); + assert_se(PAGE_OFFSET(SIZE_MAX - page_size() ) == page_size() - 1); + assert_se(PAGE_OFFSET(SIZE_MAX - page_size() + 1) == 0); + assert_se(PAGE_OFFSET(SIZE_MAX - page_size() + 2) == 1); + + assert_se(PAGE_OFFSET_U64(page_size() - 1) == page_size() - 1); + assert_se(PAGE_OFFSET_U64(page_size() ) == 0); + assert_se(PAGE_OFFSET_U64(page_size() + 1) == 1); + assert_se(PAGE_OFFSET_U64(page_size() * 123 - 1) == page_size() - 1); + assert_se(PAGE_OFFSET_U64(page_size() * 123 ) == 0); + assert_se(PAGE_OFFSET_U64(page_size() * 123 + 1) == 1); + assert_se(PAGE_OFFSET_U64(UINT64_MAX - page_size() - 1) == page_size() - 2); + assert_se(PAGE_OFFSET_U64(UINT64_MAX - page_size() ) == page_size() - 1); + assert_se(PAGE_OFFSET_U64(UINT64_MAX - page_size() + 1) == 0); + assert_se(PAGE_OFFSET_U64(UINT64_MAX - page_size() + 2) == 1); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-mempool.c b/src/test/test-mempool.c new file mode 100644 index 0000000..d3bc173 --- /dev/null +++ b/src/test/test-mempool.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "mempool.h" +#include "random-util.h" +#include "tests.h" + +struct element { + uint64_t value; +}; + +DEFINE_MEMPOOL(test_mempool,
struct element, 8); + +TEST(mempool_trim) { /* Interleaves tile allocation and freeing with mempool_trim() calls; once all 2*NN tiles have been freed, a final trim must leave the pool without pools or free list */ + +#define NN 4000 + struct element *a[NN]; + size_t n_freed = 0; + + assert_se(!test_mempool.first_pool); + assert_se(!test_mempool.freelist); + + mempool_trim(&test_mempool); + + for (size_t i = 0; i < NN; i++) { + assert_se(a[i] = mempool_alloc_tile(&test_mempool)); + a[i]->value = i; + } + + mempool_trim(&test_mempool); + + /* free up to one third randomly */ + size_t x = 0; + for (size_t i = 0; i < NN/3; i++) { + x = (x + random_u64()) % ELEMENTSOF(a); + assert_se(!a[x] || a[x]->value == x); + + if (a[x]) + n_freed ++; + + a[x] = mempool_free_tile(&test_mempool, a[x]); + } + + mempool_trim(&test_mempool); + + /* free definitely at least one third */ + for (size_t i = 2; i < NN; i += 3) { + assert_se(!a[i] || a[i]->value == i); + if (a[i]) + n_freed ++; + a[i] = mempool_free_tile(&test_mempool, a[i]); + } + + mempool_trim(&test_mempool); + + /* Allocate another set of tiles, which will fill up the free list and allocate some new tiles */ + struct element *b[NN]; + for (size_t i = 0; i < NN; i++) { + assert_se(b[i] = mempool_alloc_tile(&test_mempool)); + b[i]->value = ~(uint64_t) i; + } + + mempool_trim(&test_mempool); + + /* free everything from the original set */ + + for (size_t i = 0; i < NN; i += 1) { + assert_se(!a[i] || a[i]->value == i); + if (a[i]) + n_freed ++; + a[i] = mempool_free_tile(&test_mempool, a[i]); + } + + mempool_trim(&test_mempool); + + /* and now everything from the second set too */ + + for (size_t i = 0; i < NN; i += 1) { + assert_se(!b[i] || b[i]->value == ~(uint64_t) i); + if (b[i]) + n_freed ++; + b[i] = mempool_free_tile(&test_mempool, b[i]); + } + + assert_se(n_freed == NN * 2); + + mempool_trim(&test_mempool); + + assert_se(!test_mempool.first_pool); + assert_se(!test_mempool.freelist); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-mempress.c b/src/test/test-mempress.c new file mode 100644 index 0000000..26ce4ce --- /dev/null +++ b/src/test/test-mempress.c @@ -0,0 +1,309
@@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include +#include + +#include "bus-locator.h" +#include "bus-wait-for-jobs.h" +#include "fd-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "signal-util.h" +#include "socket-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "unit-def.h" + +struct fake_pressure_context { + int fifo_fd; + int socket_fd; +}; + +static void *fake_pressure_thread(void *p) { /* Takes ownership of the context and frees it on return. Writes one byte into the FIFO, then accepts one connection on the socket, expects to read "hello" from it and answers with one byte. */ + _cleanup_free_ struct fake_pressure_context *c = ASSERT_PTR(p); + _cleanup_close_ int cfd = -EBADF; + + usleep_safe(150); + + assert_se(write(c->fifo_fd, &(const char) { 'x' }, 1) == 1); + + usleep_safe(150); + + cfd = accept4(c->socket_fd, NULL, NULL, SOCK_CLOEXEC); + assert_se(cfd >= 0); + char buf[STRLEN("hello")+1] = {}; + assert_se(read(cfd, buf, sizeof(buf)-1) == sizeof(buf)-1); + assert_se(streq(buf, "hello")); + assert_se(write(cfd, &(const char) { 'z' }, 1) == 1); + + return NULL; +} + +static int fake_pressure_callback(sd_event_source *s, void *userdata) { /* Multiplies *userdata by the first character of the source's description; once the product equals 7 * 'f' * 's' (the "fifo" and the "socket" source each fired once) the event loop is asked to exit */ + int *value = userdata; + const char *d; + + assert_se(s); + assert_se(sd_event_source_get_description(s, &d) >= 0); + + *value *= d[0]; + + log_notice("memory pressure event: %s", d); + + if (*value == 7 * 'f' * 's') + assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0); + + return 0; +} + +TEST(fake_pressure) { /* Registers two memory pressure sources configured through $MEMORY_PRESSURE_WATCH and $MEMORY_PRESSURE_WRITE, one pointing at a FIFO and one at an AF_UNIX socket, and runs the loop until the helper thread has triggered both */ + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *ef = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *j = NULL, *k = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_close_ int fifo_fd = -EBADF, socket_fd = -EBADF; + union sockaddr_union sa; + pthread_t th; + int value = 7; + + assert_se(sd_event_default(&e) >= 0); + + assert_se(mkdtemp_malloc(NULL, &tmp) >= 0); + + assert_se(j = path_join(tmp, "fifo")); + assert_se(mkfifo(j, 0600) >= 0); + fifo_fd = open(j, O_CLOEXEC|O_RDWR|O_NONBLOCK); +
assert_se(fifo_fd >= 0); + + assert_se(k = path_join(tmp, "sock")); + socket_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(socket_fd >= 0); + assert_se(sockaddr_un_set_path(&sa.un, k) >= 0); + assert_se(bind(socket_fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) >= 0); + assert_se(listen(socket_fd, 1) >= 0); + + /* Ideally we'd just allocate this on the stack, but AddressSanitizer doesn't like it if threads + * access each other's stack */ + struct fake_pressure_context *fp = new(struct fake_pressure_context, 1); + assert_se(fp); + *fp = (struct fake_pressure_context) { + .fifo_fd = fifo_fd, + .socket_fd = socket_fd, + }; + + assert_se(pthread_create(&th, NULL, fake_pressure_thread, TAKE_PTR(fp)) == 0); + + assert_se(setenv("MEMORY_PRESSURE_WATCH", j, /* override= */ true) >= 0); + assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0); + + assert_se(sd_event_add_memory_pressure(e, &es, fake_pressure_callback, &value) >= 0); + assert_se(sd_event_source_set_description(es, "fifo event source") >= 0); + + assert_se(setenv("MEMORY_PRESSURE_WATCH", k, /* override= */ true) >= 0); + assert_se(setenv("MEMORY_PRESSURE_WRITE", "aGVsbG8K", /* override= */ true) >= 0); /* "aGVsbG8K" is base64 for "hello\n"; the helper thread expects to read "hello" */ + + assert_se(sd_event_add_memory_pressure(e, &ef, fake_pressure_callback, &value) >= 0); + assert_se(sd_event_source_set_description(ef, "socket event source") >= 0); + + assert_se(sd_event_loop(e) >= 0); + + assert_se(value == 7 * 'f' * 's'); + + assert_se(pthread_join(th, NULL) == 0); +} + +struct real_pressure_context { + sd_event_source *pid; +}; + +static int real_pressure_callback(sd_event_source *s, void *userdata) { /* On a pressure event: trims memory, then sends SIGKILL to the child through its child event source and clears the pointer */ + struct real_pressure_context *c = ASSERT_PTR(userdata); + const char *d; + + assert_se(s); + assert_se(sd_event_source_get_description(s, &d) >= 0); + + log_notice("real_memory pressure event: %s", d); + + sd_event_trim_memory(); + + assert_se(c->pid); + assert_se(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0) >= 0); + c->pid = NULL; + + return 0; +} + +#define MMAP_SIZE (10
* 1024 * 1024) + +_noreturn_ static void real_pressure_eat_memory(int pipe_fd) { + size_t ate = 0; + + /* Allocates and touches 10M at a time, until runs out of memory */ + + char x; + assert_se(read(pipe_fd, &x, 1) == 1); /* Wait for the GO! */ + + for (;;) { + void *p; + + p = mmap(NULL, MMAP_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + assert_se(p != MAP_FAILED); + + log_info("Eating another %s.", FORMAT_BYTES(MMAP_SIZE)); + + memset(p, random_u32() & 0xFF, MMAP_SIZE); + ate += MMAP_SIZE; + + log_info("Ate %s in total.", FORMAT_BYTES(ate)); + + usleep_safe(50 * USEC_PER_MSEC); + } +} + +static int real_pressure_child_callback(sd_event_source *s, const siginfo_t *si, void *userdata) { /* Expects the child to have been killed by SIGKILL and exits the event loop with code 31, which TEST(real_pressure) checks */ + assert_se(s); + assert_se(si); + + log_notice("child dead"); + + assert_se(si->si_signo == SIGCHLD); + assert_se(si->si_status == SIGKILL); + assert_se(si->si_code == CLD_KILLED); + + assert_se(sd_event_exit(sd_event_source_get_event(s), 31) >= 0); + return 0; +} + +TEST(real_pressure) { /* Starts a transient scope (PIDs=[0], MemoryAccounting=true), forks a child that keeps allocating memory, sets MemoryHigh/MemoryMax on the scope and expects a pressure event that gets the child killed and the loop exited with 31. Returns early when the system bus, the transient unit call, the pressure event source or memory accounting is unavailable. */ + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *scope = NULL; + const char *object; + int r; + pid_t pid; + + r = sd_bus_open_system(&bus); + if (r < 0) { + log_notice_errno(r, "Can't connect to system bus, skipping test: %m"); + return; + } + + assert_se(bus_wait_for_jobs_new(bus, &w) >= 0); + + assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit") >= 0); + assert_se(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64()) >= 0); + assert_se(sd_bus_message_append(m, "ss", scope, "fail") >= 0); +
assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0) >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "MemoryAccounting", "b", true) >= 0); + assert_se(sd_bus_message_close_container(m) >= 0); + assert_se(sd_bus_message_append(m, "a(sa(sv))", 0) >= 0); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) { + log_notice_errno(r, "Can't issue transient unit call, skipping test: %m"); + return; + } + + assert_se(sd_bus_message_read(reply, "o", &object) >= 0); + + assert_se(bus_wait_for_jobs_one(w, object, /* quiet= */ false, /* extra_args= */ NULL) >= 0); + + assert_se(sd_event_default(&e) >= 0); + + assert_se(pipe2(pipe_fd, O_CLOEXEC) >= 0); + + r = safe_fork("(eat-memory)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, &pid); + assert_se(r >= 0); + if (r == 0) { + real_pressure_eat_memory(pipe_fd[0]); + _exit(EXIT_SUCCESS); + } + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + assert_se(sd_event_add_child(e, &cs, pid, WEXITED, real_pressure_child_callback, NULL) >= 0); + assert_se(sd_event_source_set_child_process_own(cs, true) >= 0); + + assert_se(unsetenv("MEMORY_PRESSURE_WATCH") >= 0); + assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0); + + struct real_pressure_context context = { + .pid = cs, + }; + + r = sd_event_add_memory_pressure(e, &es, real_pressure_callback, &context); + if (r < 0) { + log_notice_errno(r, "Can't allocate memory pressure fd, skipping test: %m"); + return; + } + + assert_se(sd_event_source_set_description(es, "real pressure event source") >= 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "full") > 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "full") == 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "some") > 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0); +
assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) > 0); + assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) == 0); + assert_se(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT) >= 0); + + _cleanup_free_ char *uo = NULL; + assert_se(uo = unit_dbus_path_from_name(scope)); + + uint64_t mcurrent = UINT64_MAX; + assert_se(sd_bus_get_property_trivial(bus, "org.freedesktop.systemd1", uo, "org.freedesktop.systemd1.Scope", "MemoryCurrent", &error, 't', &mcurrent) >= 0); + + printf("current: %" PRIu64 "\n", mcurrent); + if (mcurrent == UINT64_MAX) { + log_notice("Memory accounting not available, skipping test."); /* no errno to report: r still holds the success code of sd_event_add_memory_pressure() */ + return; + } + + m = sd_bus_message_unref(m); + + assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties") >= 0); + assert_se(sd_bus_message_append(m, "sb", scope, true) >= 0); + assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "MemoryHigh", "t", mcurrent + (15 * 1024 * 1024)) >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "MemoryMax", "t", mcurrent + (50 * 1024 * 1024)) >= 0); + assert_se(sd_bus_message_close_container(m) >= 0); + + assert_se(sd_bus_call(bus, m, 0, NULL, NULL) >= 0); + + /* Generate some memory allocations via mempool */ +#define NN (1024) + Hashmap **h = ASSERT_PTR(new(Hashmap*, NN)); /* the array is written to right below, so a failed allocation must not pass silently */ + for (int i = 0; i < NN; i++) + h[i] = hashmap_new(NULL); + for (int i = 0; i < NN; i++) + hashmap_free(h[i]); + free(h); + + /* Now start eating memory */ + assert_se(write(pipe_fd[1], &(const char) { 'x' }, 1) == 1); + + assert_se(sd_event_loop(e) >= 0); + int ex = 0; + assert_se(sd_event_get_exit_code(e, &ex) >= 0); + assert_se(ex == 31); +} + +static int outro(void) { + hashmap_trim_pools(); + return 0; +} + +DEFINE_TEST_MAIN_FULL(LOG_DEBUG, NULL, outro); diff --git a/src/test/test-memstream-util.c b/src/test/test-memstream-util.c new file mode 100644 index 0000000..254bdca --- /dev/null
+++ b/src/test/test-memstream-util.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "memstream-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(memstream_done) { /* Initializes only: the _cleanup_(memstream_done) handler has to dispose of a stream that was never finalized */ + _cleanup_(memstream_done) MemStream m = {}; + + assert_se(memstream_init(&m)); +} + +TEST(memstream_empty) { /* Finalizing a stream that nothing was written to must yield an empty string and a size of 0 */ + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + size_t sz; + + assert_se(memstream_init(&m)); + assert_se(memstream_finalize(&m, &buf, &sz) >= 0); + assert_se(streq(buf, "")); + assert_se(sz == 0); +} + +TEST(memstream) { /* Written data, ASCII and multi-byte UTF-8 alike, must come back verbatim with sz equal to strlen(); the same MemStream is then initialized and finalized a second time */ + _cleanup_(memstream_done) MemStream m = {}; + _cleanup_free_ char *buf = NULL; + size_t sz; + FILE *f; + + assert_se(f = memstream_init(&m)); + fputs("hoge", f); + fputs("おはよう!", f); + fputs(u8"😀😀😀", f); + assert_se(memstream_finalize(&m, &buf, &sz) >= 0); + assert_se(streq(buf, u8"hogeおはよう!😀😀😀")); + assert_se(sz == strlen(u8"hogeおはよう!😀😀😀")); + + buf = mfree(buf); + + assert_se(f = memstream_init(&m)); + fputs("second", f); + assert_se(memstream_finalize(&m, &buf, &sz) >= 0); + assert_se(streq(buf, "second")); + assert_se(sz == strlen("second")); +} + +TEST(memstream_dump) { /* memstream_dump() is called twice on the same MemStream, each time after a fresh memstream_init() and without any memstream_finalize() */ + _cleanup_(memstream_done) MemStream m = {}; + FILE *f; + + assert_se(f = memstream_init(&m)); + fputs("first", f); + assert_se(memstream_dump(LOG_DEBUG, &m) >= 0); + + assert_se(f = memstream_init(&m)); + fputs("second", f); + assert_se(memstream_dump(LOG_DEBUG, &m) >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-mkdir.c b/src/test/test-mkdir.c new file mode 100644 index 0000000..4820b32 --- /dev/null +++ b/src/test/test-mkdir.c @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "capability-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "path-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "user-util.h" + +TEST(mkdir_p_safe) { + _cleanup_(rm_rf_physical_and_freep)
char *tmp = NULL; + _cleanup_free_ char *p = NULL, *q = NULL; + int r; + + assert_se(mkdtemp_malloc("/tmp/test-mkdir-XXXXXX", &tmp) >= 0); + + assert_se(p = path_join(tmp, "run/aaa/bbb")); + assert_se(mkdir_p(p, 0755) >= 0); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "run/ccc/ddd")); + assert_se(mkdir_p_safe(tmp, p, 0755, UID_INVALID, GID_INVALID, 0) >= 0); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "var/run")); + assert_se(mkdir_parents_safe(tmp, p, 0755, UID_INVALID, GID_INVALID, 0) >= 0); + assert_se(symlink("../run", p) >= 0); + assert_se(is_dir(p, false) == 0); + assert_se(is_dir(p, true) > 0); + + assert_se(mkdir_safe(p, 0755, UID_INVALID, GID_INVALID, 0) == -ENOTDIR); + assert_se(mkdir_safe(p, 0755, UID_INVALID, GID_INVALID, MKDIR_IGNORE_EXISTING) >= 0); + assert_se(mkdir_safe(p, 0755, UID_INVALID, GID_INVALID, MKDIR_FOLLOW_SYMLINK) >= 0); + assert_se(is_dir(p, false) == 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "var/run/hoge/foo/baz")); + assert_se(mkdir_p_safe(tmp, p, 0755, UID_INVALID, GID_INVALID, 0) >= 0); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "not-exists")); + assert_se(q = path_join(p, "aaa")); + assert_se(mkdir_p_safe(p, q, 0755, UID_INVALID, GID_INVALID, 0) == -ENOENT); + + p = mfree(p); + q = mfree(q); + assert_se(p = path_join(tmp, "regular-file")); + assert_se(q = path_join(p, "aaa")); + assert_se(touch(p) >= 0); + assert_se(mkdir_p_safe(p, q, 0755, UID_INVALID, GID_INVALID, 0) == -ENOTDIR); + + p = mfree(p); + q = mfree(q); + assert_se(p = path_join(tmp, "symlink")); + assert_se(q = path_join(p, "hoge/foo")); + assert_se(symlink("aaa", p) >= 0); + assert_se(mkdir_p_safe(tmp, q, 0755, UID_INVALID, GID_INVALID, 0) >= 0); + assert_se(is_dir(q, false) > 0); + assert_se(is_dir(q, 
true) > 0); + q = mfree(q); + assert_se(q = path_join(tmp, "aaa/hoge/foo")); + assert_se(is_dir(q, false) > 0); + assert_se(is_dir(q, true) > 0); + + assert_se(mkdir_p_safe(tmp, "/tmp/test-mkdir-outside", 0755, UID_INVALID, GID_INVALID, 0) == -ENOTDIR); + + p = mfree(p); + assert_se(p = path_join(tmp, "zero-mode/should-fail-to-create-child")); + assert_se(mkdir_parents_safe(tmp, p, 0000, UID_INVALID, GID_INVALID, 0) >= 0); + r = safe_fork("(test-mkdir-no-cap)", FORK_DEATHSIG_SIGTERM | FORK_WAIT | FORK_LOG, NULL); + if (r == 0) { + (void) capability_bounding_set_drop(0, /* right_now = */ true); + assert_se(mkdir_p_safe(tmp, p, 0000, UID_INVALID, GID_INVALID, 0) == -EACCES); + _exit(EXIT_SUCCESS); + } + assert_se(r >= 0); +} + +TEST(mkdir_p_root) { + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_free_ char *p = NULL; + + assert_se(mkdtemp_malloc("/tmp/test-mkdir-XXXXXX", &tmp) >= 0); + + assert_se(p = path_join(tmp, "run/aaa/bbb")); + assert_se(mkdir_p_root(tmp, "/run/aaa/bbb", UID_INVALID, GID_INVALID, 0755, NULL) >= 0); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "var/run")); + assert_se(mkdir_parents_safe(tmp, p, 0755, UID_INVALID, GID_INVALID, 0) >= 0); + assert_se(symlink("../run", p) >= 0); + assert_se(is_dir(p, false) == 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "var/run/hoge/foo/baz")); + assert_se(mkdir_p_root(tmp, "/var/run/hoge/foo/baz", UID_INVALID, GID_INVALID, 0755, NULL) >= 0); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + + p = mfree(p); + assert_se(p = path_join(tmp, "not-exists")); + assert_se(mkdir_p_root(p, "/aaa", UID_INVALID, GID_INVALID, 0755, NULL) == -ENOENT); + + p = mfree(p); + assert_se(p = path_join(tmp, "regular-file")); + assert_se(touch(p) >= 0); + assert_se(mkdir_p_root(p, "/aaa", UID_INVALID, GID_INVALID, 0755, NULL) == -ENOTDIR); + + /* FIXME: The tests below do not work. 
+ p = mfree(p); + assert_se(p = path_join(tmp, "symlink")); + assert_se(symlink("aaa", p) >= 0); + assert_se(mkdir_p_root(tmp, "/symlink/hoge/foo", UID_INVALID, GID_INVALID, 0755) >= 0); + p = mfree(p); + assert_se(p = path_join(tmp, "symlink/hoge/foo")); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + p = mfree(p); + assert_se(p = path_join(tmp, "aaa/hoge/foo")); + assert_se(is_dir(p, false) > 0); + assert_se(is_dir(p, true) > 0); + */ +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-modhex.c b/src/test/test-modhex.c new file mode 100644 index 0000000..6725732 --- /dev/null +++ b/src/test/test-modhex.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "recovery-key.h" +#include "alloc-util.h" +#include "string-util.h" +#include "tests.h" + +static void test_normalize_recovery_key(const char *t, const char *expected) { + _cleanup_free_ char *z = NULL; + int r; + + assert_se(t); + + r = normalize_recovery_key(t, &z); + assert_se(expected ? 
+ (r >= 0 && streq(z, expected)) : + (r == -EINVAL && z == NULL)); +} + +TEST(normalize_recovery_key_all) { + test_normalize_recovery_key("iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj", + "iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj"); + + test_normalize_recovery_key("iefgcelhbiduvkjvcjvuncnkvlfchdidjhtuhhdeurkllkegilkjgbrthjkbgktj", + "iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj"); + + test_normalize_recovery_key("IEFGCELH-BIDUVKJV-CJVUNCNK-VLFCHDID-JHTUHHDE-URKLLKEG-ILKJGBRT-HJKBGKTJ", + "iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj"); + + test_normalize_recovery_key("IEFGCELHBIDUVKJVCJVUNCNKVLFCHDIDJHTUHHDEURKLLKEGILKJGBRTHJKBGKTJ", + "iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj"); + + test_normalize_recovery_key("Iefgcelh-Biduvkjv-Cjvuncnk-Vlfchdid-Jhtuhhde-Urkllkeg-Ilkjgbrt-Hjkbgktj", + "iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj"); + + test_normalize_recovery_key("Iefgcelhbiduvkjvcjvuncnkvlfchdidjhtuhhdeurkllkegilkjgbrthjkbgktj", + "iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj"); + + test_normalize_recovery_key("iefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgkt", NULL); + test_normalize_recovery_key("iefgcelhbiduvkjvcjvuncnkvlfchdidjhtuhhdeurkllkegilkjgbrthjkbgkt", NULL); + test_normalize_recovery_key("IEFGCELHBIDUVKJVCJVUNCNKVLFCHDIDJHTUHHDEURKLLKEGILKJGBRTHJKBGKT", NULL); + + test_normalize_recovery_key("xefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj", NULL); + test_normalize_recovery_key("Xefgcelh-biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj", NULL); + test_normalize_recovery_key("iefgcelh+biduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj", NULL); + test_normalize_recovery_key("iefgcelhebiduvkjv-cjvuncnk-vlfchdid-jhtuhhde-urkllkeg-ilkjgbrt-hjkbgktj", NULL); + + 
test_normalize_recovery_key("", NULL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-mount-util.c b/src/test/test-mount-util.c new file mode 100644 index 0000000..c3d0acb --- /dev/null +++ b/src/test/test-mount-util.c @@ -0,0 +1,509 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "capability-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "libmount-util.h" +#include "missing_magic.h" +#include "missing_mount.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(mount_option_mangle) { + char *opts = NULL; + unsigned long f; + + assert_se(mount_option_mangle(NULL, MS_RDONLY|MS_NOSUID, &f, &opts) == 0); + assert_se(f == (MS_RDONLY|MS_NOSUID)); + assert_se(opts == NULL); + + assert_se(mount_option_mangle("", MS_RDONLY|MS_NOSUID, &f, &opts) == 0); + assert_se(f == (MS_RDONLY|MS_NOSUID)); + assert_se(opts == NULL); + + assert_se(mount_option_mangle("ro,nosuid,nodev,noexec", 0, &f, &opts) == 0); + assert_se(f == (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC)); + assert_se(opts == NULL); + + assert_se(mount_option_mangle("ro,nosuid,nodev,noexec,mode=0755", 0, &f, &opts) == 0); + assert_se(f == (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC)); + assert_se(streq(opts, "mode=0755")); + opts = mfree(opts); + + assert_se(mount_option_mangle("rw,nosuid,foo,hogehoge,nodev,mode=0755", 0, &f, &opts) == 0); + assert_se(f == (MS_NOSUID|MS_NODEV)); + assert_se(streq(opts, "foo,hogehoge,mode=0755")); + opts = mfree(opts); + + assert_se(mount_option_mangle("rw,nosuid,nodev,noexec,relatime,net_cls,net_prio", MS_RDONLY, &f, &opts) == 0); + assert_se(f == (MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME)); + 
assert_se(streq(opts, "net_cls,net_prio")); + opts = mfree(opts); + + assert_se(mount_option_mangle("rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000", MS_RDONLY, &f, &opts) == 0); + assert_se(f == (MS_NOSUID|MS_NODEV|MS_RELATIME)); + assert_se(streq(opts, "size=1630748k,mode=0700,uid=1000,gid=1000")); + opts = mfree(opts); + + assert_se(mount_option_mangle("size=1630748k,rw,gid=1000,,,nodev,relatime,,mode=0700,nosuid,uid=1000", MS_RDONLY, &f, &opts) == 0); + assert_se(f == (MS_NOSUID|MS_NODEV|MS_RELATIME)); + assert_se(streq(opts, "size=1630748k,gid=1000,mode=0700,uid=1000")); + opts = mfree(opts); + + assert_se(mount_option_mangle("rw,exec,size=8143984k,nr_inodes=2035996,mode=0755", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, &f, &opts) == 0); + assert_se(f == (MS_NOSUID|MS_NODEV)); + assert_se(streq(opts, "size=8143984k,nr_inodes=2035996,mode=0755")); + opts = mfree(opts); + + assert_se(mount_option_mangle("rw,relatime,fmask=0022,,,dmask=0022", MS_RDONLY, &f, &opts) == 0); + assert_se(f == MS_RELATIME); + assert_se(streq(opts, "fmask=0022,dmask=0022")); + opts = mfree(opts); + + assert_se(mount_option_mangle("rw,relatime,fmask=0022,dmask=0022,\"hogehoge", MS_RDONLY, &f, &opts) < 0); + + assert_se(mount_option_mangle("mode=01777,size=10%,nr_inodes=400k,uid=496107520,gid=496107520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\"", 0, &f, &opts) == 0); + assert_se(f == 0); + assert_se(streq(opts, "mode=01777,size=10%,nr_inodes=400k,uid=496107520,gid=496107520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\"")); + opts = mfree(opts); +} + +static void test_mount_flags_to_string_one(unsigned long flags, const char *expected) { + _cleanup_free_ char *x = NULL; + int r; + + r = mount_flags_to_string(flags, &x); + log_info("flags: %#lX → %d/\"%s\"", flags, r, strnull(x)); + assert_se(r >= 0); + assert_se(streq(x, expected)); +} + +TEST(mount_flags_to_string) { + test_mount_flags_to_string_one(0, "0"); + 
test_mount_flags_to_string_one(MS_RDONLY, "MS_RDONLY"); + test_mount_flags_to_string_one(MS_NOSUID, "MS_NOSUID"); + test_mount_flags_to_string_one(MS_NODEV, "MS_NODEV"); + test_mount_flags_to_string_one(MS_NOEXEC, "MS_NOEXEC"); + test_mount_flags_to_string_one(MS_SYNCHRONOUS, "MS_SYNCHRONOUS"); + test_mount_flags_to_string_one(MS_REMOUNT, "MS_REMOUNT"); + test_mount_flags_to_string_one(MS_MANDLOCK, "MS_MANDLOCK"); + test_mount_flags_to_string_one(MS_DIRSYNC, "MS_DIRSYNC"); + test_mount_flags_to_string_one(MS_NOSYMFOLLOW, "MS_NOSYMFOLLOW"); + test_mount_flags_to_string_one(MS_NOATIME, "MS_NOATIME"); + test_mount_flags_to_string_one(MS_NODIRATIME, "MS_NODIRATIME"); + test_mount_flags_to_string_one(MS_BIND, "MS_BIND"); + test_mount_flags_to_string_one(MS_MOVE, "MS_MOVE"); + test_mount_flags_to_string_one(MS_REC, "MS_REC"); + test_mount_flags_to_string_one(MS_SILENT, "MS_SILENT"); + test_mount_flags_to_string_one(MS_POSIXACL, "MS_POSIXACL"); + test_mount_flags_to_string_one(MS_UNBINDABLE, "MS_UNBINDABLE"); + test_mount_flags_to_string_one(MS_PRIVATE, "MS_PRIVATE"); + test_mount_flags_to_string_one(MS_SLAVE, "MS_SLAVE"); + test_mount_flags_to_string_one(MS_SHARED, "MS_SHARED"); + test_mount_flags_to_string_one(MS_RELATIME, "MS_RELATIME"); + test_mount_flags_to_string_one(MS_KERNMOUNT, "MS_KERNMOUNT"); + test_mount_flags_to_string_one(MS_I_VERSION, "MS_I_VERSION"); + test_mount_flags_to_string_one(MS_STRICTATIME, "MS_STRICTATIME"); + test_mount_flags_to_string_one(MS_LAZYTIME, "MS_LAZYTIME"); + test_mount_flags_to_string_one(MS_LAZYTIME|MS_STRICTATIME, "MS_STRICTATIME|MS_LAZYTIME"); + test_mount_flags_to_string_one(UINT_MAX, + "MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS|MS_REMOUNT|" + "MS_MANDLOCK|MS_DIRSYNC|MS_NOSYMFOLLOW|MS_NOATIME|MS_NODIRATIME|" + "MS_BIND|MS_MOVE|MS_REC|MS_SILENT|MS_POSIXACL|MS_UNBINDABLE|" + "MS_PRIVATE|MS_SLAVE|MS_SHARED|MS_RELATIME|MS_KERNMOUNT|" + "MS_I_VERSION|MS_STRICTATIME|MS_LAZYTIME|fc000200"); +} + +TEST(bind_remount_recursive) 
{ + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_free_ char *subdir = NULL; + + if (geteuid() != 0 || have_effective_cap(CAP_SYS_ADMIN) <= 0) { + (void) log_tests_skipped("not running privileged"); + return; + } + + assert_se(mkdtemp_malloc("/tmp/XXXXXX", &tmp) >= 0); + subdir = path_join(tmp, "subdir"); + assert_se(subdir); + assert_se(mkdir(subdir, 0755) >= 0); + + FOREACH_STRING(p, "/usr", "/sys", "/", tmp) { + pid_t pid; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + struct statvfs svfs; + /* child */ + assert_se(detach_mount_namespace() >= 0); + + /* Check that the subdir is writable (it must be because it's in /tmp) */ + assert_se(statvfs(subdir, &svfs) >= 0); + assert_se(!FLAGS_SET(svfs.f_flag, ST_RDONLY)); + + /* Make the subdir a bind mount */ + assert_se(mount_nofollow(subdir, subdir, NULL, MS_BIND|MS_REC, NULL) >= 0); + + /* Ensure it's still writable */ + assert_se(statvfs(subdir, &svfs) >= 0); + assert_se(!FLAGS_SET(svfs.f_flag, ST_RDONLY)); + + /* Now mark the path we currently run for read-only */ + assert_se(bind_remount_recursive(p, MS_RDONLY, MS_RDONLY, path_equal(p, "/sys") ? 
STRV_MAKE("/sys/kernel") : NULL) >= 0); + + /* Ensure that this worked on the top-level */ + assert_se(statvfs(p, &svfs) >= 0); + assert_se(FLAGS_SET(svfs.f_flag, ST_RDONLY)); + + /* And ensure this had an effect on the subdir exactly if we are talking about a path above the subdir */ + assert_se(statvfs(subdir, &svfs) >= 0); + assert_se(FLAGS_SET(svfs.f_flag, ST_RDONLY) == !!path_startswith(subdir, p)); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("test-remount-rec", pid, WAIT_LOG) == EXIT_SUCCESS); + } +} + +TEST(bind_remount_one) { + pid_t pid; + + if (geteuid() != 0 || have_effective_cap(CAP_SYS_ADMIN) <= 0) { + (void) log_tests_skipped("not running privileged"); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + /* child */ + + _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; + + assert_se(detach_mount_namespace() >= 0); + + assert_se(fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo) >= 0); + + assert_se(bind_remount_one_with_mountinfo("/run", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) >= 0); + assert_se(bind_remount_one_with_mountinfo("/run", MS_NOEXEC, MS_RDONLY|MS_NOEXEC, proc_self_mountinfo) >= 0); + assert_se(bind_remount_one_with_mountinfo("/proc/idontexist", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) == -ENOENT); + assert_se(bind_remount_one_with_mountinfo("/proc/self", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) == -EINVAL); + assert_se(bind_remount_one_with_mountinfo("/", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) >= 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("test-remount-one", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(make_mount_point_inode) { + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + const char *src_file, *src_dir, *dst_file, *dst_dir; + struct stat st; + + assert_se(mkdtemp_malloc(NULL, &d) >= 0); + + src_file = strjoina(d, "/src/file"); + src_dir = strjoina(d, "/src/dir"); + dst_file = strjoina(d, "/dst/file"); + dst_dir = 
strjoina(d, "/dst/dir"); + + assert_se(mkdir_p(src_dir, 0755) >= 0); + assert_se(mkdir_parents(dst_file, 0755) >= 0); + assert_se(touch(src_file) >= 0); + + assert_se(make_mount_point_inode_from_path(src_file, dst_file, 0755) >= 0); + assert_se(make_mount_point_inode_from_path(src_dir, dst_dir, 0755) >= 0); + + assert_se(stat(dst_dir, &st) == 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se(stat(dst_file, &st) == 0); + assert_se(S_ISREG(st.st_mode)); + assert_se(!(S_IXUSR & st.st_mode)); + assert_se(!(S_IXGRP & st.st_mode)); + assert_se(!(S_IXOTH & st.st_mode)); + + assert_se(unlink(dst_file) == 0); + assert_se(rmdir(dst_dir) == 0); + + assert_se(stat(src_file, &st) == 0); + assert_se(make_mount_point_inode_from_stat(&st, dst_file, 0755) >= 0); + assert_se(stat(src_dir, &st) == 0); + assert_se(make_mount_point_inode_from_stat(&st, dst_dir, 0755) >= 0); + + assert_se(stat(dst_dir, &st) == 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se(stat(dst_file, &st) == 0); + assert_se(S_ISREG(st.st_mode)); + assert_se(!(S_IXUSR & st.st_mode)); + assert_se(!(S_IXGRP & st.st_mode)); + assert_se(!(S_IXOTH & st.st_mode)); +} + +TEST(make_mount_switch_root) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_free_ char *s = NULL; + int r; + + if (geteuid() != 0 || have_effective_cap(CAP_SYS_ADMIN) <= 0) { + (void) log_tests_skipped("not running privileged"); + return; + } + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + + assert_se(asprintf(&s, "%s/somerandomname%" PRIu64, t, random_u64()) >= 0); + assert_se(s); + assert_se(touch(s) >= 0); + + for (int force_ms_move = 0; force_ms_move < 2; force_ms_move++) { + r = safe_fork("(switch-root)", + FORK_RESET_SIGNALS | + FORK_CLOSE_ALL_FDS | + FORK_DEATHSIG_SIGTERM | + FORK_WAIT | + FORK_REOPEN_LOG | + FORK_LOG | + FORK_NEW_MOUNTNS | + FORK_MOUNTNS_SLAVE, + NULL); + assert_se(r >= 0); + + if (r == 0) { + assert_se(make_mount_point(t) >= 0); + assert_se(mount_switch_root_full(t, /* mount_propagation_flag= */ 0, 
force_ms_move) >= 0); + + assert_se(access(ASSERT_PTR(strrchr(s, '/')), F_OK) >= 0); /* absolute */ + assert_se(access(ASSERT_PTR(strrchr(s, '/')) + 1, F_OK) >= 0); /* relative */ + assert_se(access(s, F_OK) < 0 && errno == ENOENT); /* doesn't exist in our new environment */ + + _exit(EXIT_SUCCESS); + } + } +} + +TEST(umount_recursive) { + static const struct { + const char *prefix; + const char * const keep[3]; + } test_table[] = { + { + .prefix = NULL, + .keep = {}, + }, + { + .prefix = "/run", + .keep = {}, + }, + { + .prefix = NULL, + .keep = { "/dev/shm", NULL }, + }, + { + .prefix = "/dev", + .keep = { "/dev/pts", "/dev/shm", NULL }, + }, + }; + + int r; + + FOREACH_ARRAY(t, test_table, ELEMENTSOF(test_table)) { + + r = safe_fork("(umount-rec)", + FORK_RESET_SIGNALS | + FORK_CLOSE_ALL_FDS | + FORK_DEATHSIG_SIGTERM | + FORK_WAIT | + FORK_REOPEN_LOG | + FORK_LOG | + FORK_NEW_MOUNTNS | + FORK_MOUNTNS_SLAVE, + NULL); + + if (ERRNO_IS_NEG_PRIVILEGE(r)) + return (void) log_notice("Skipping umount_recursive() test, lacking privileges"); + + assert_se(r >= 0); + if (r == 0) { /* child */ + _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL; + _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *k = NULL; + + /* Open /p/s/m file before we unmount everything (which might include /proc/) */ + f = fopen("/proc/self/mountinfo", "re"); + if (!f) { + log_error_errno(errno, "Failed to open /proc/self/mountinfo: %m"); + _exit(EXIT_FAILURE); + } + + assert_se(k = strv_join((char**) t->keep, " ")); + log_info("detaching just %s (keep: %s)", strna(t->prefix), strna(empty_to_null(k))); + + assert_se(umount_recursive_full(t->prefix, MNT_DETACH, (char**) t->keep) >= 0); + + r = libmount_parse("/proc/self/mountinfo", f, &table, &iter); + if (r < 0) { + log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m"); + _exit(EXIT_FAILURE); + } + + for (;;) { + struct libmnt_fs *fs; + + r = 
mnt_table_next_fs(table, iter, &fs); + if (r == 1) + break; + if (r < 0) { + log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m"); + _exit(EXIT_FAILURE); + } + + log_debug("left after complete umount: %s", mnt_fs_get_target(fs)); + } + + _exit(EXIT_SUCCESS); + } + } +} + +TEST(fd_make_mount_point) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_free_ char *s = NULL; + int r; + + if (geteuid() != 0 || have_effective_cap(CAP_SYS_ADMIN) <= 0) { + (void) log_tests_skipped("not running privileged"); + return; + } + + assert_se(mkdtemp_malloc(NULL, &t) >= 0); + + assert_se(asprintf(&s, "%s/somerandomname%" PRIu64, t, random_u64()) >= 0); + assert_se(s); + assert_se(mkdir(s, 0700) >= 0); + + r = safe_fork("(make_mount-point)", + FORK_RESET_SIGNALS | + FORK_CLOSE_ALL_FDS | + FORK_DEATHSIG_SIGTERM | + FORK_WAIT | + FORK_REOPEN_LOG | + FORK_LOG | + FORK_NEW_MOUNTNS | + FORK_MOUNTNS_SLAVE, + NULL); + assert_se(r >= 0); + + if (r == 0) { + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + + fd = open(s, O_PATH|O_CLOEXEC); + assert_se(fd >= 0); + + assert_se(fd_is_mount_point(fd, NULL, AT_SYMLINK_FOLLOW) == 0); + + assert_se(fd_make_mount_point(fd) > 0); + + /* Reopen the inode so that we end up on the new mount. Check the descriptor right away so that a failed open() is reported here and not as a confusing failure of the mount point check below. */ + fd2 = open(s, O_PATH|O_CLOEXEC); + assert_se(fd2 >= 0); + assert_se(fd_is_mount_point(fd2, NULL, AT_SYMLINK_FOLLOW) > 0); + + assert_se(fd_make_mount_point(fd2) == 0); + + _exit(EXIT_SUCCESS); + } +} + +TEST(bind_mount_submounts) { + _cleanup_(rmdir_and_freep) char *a = NULL, *b = NULL; + _cleanup_free_ char *x = NULL; + int r; + + assert_se(mkdtemp_malloc(NULL, &a) >= 0); + r = mount_nofollow_verbose(LOG_INFO, "tmpfs", a, "tmpfs", 0, NULL); + if (ERRNO_IS_NEG_PRIVILEGE(r)) + return (void) log_tests_skipped("Skipping bind_mount_submounts() test, lacking privileges"); + + assert_se(r >= 0); + + assert_se(x = path_join(a, "foo")); + assert_se(touch(x) >= 0); + free(x); + + assert_se(x = path_join(a, "x")); + assert_se(mkdir(x, 0755) >= 0); +
assert_se(mount_nofollow_verbose(LOG_INFO, "tmpfs", x, "tmpfs", 0, NULL) >= 0); + free(x); + + assert_se(x = path_join(a, "x/xx")); + assert_se(touch(x) >= 0); + free(x); + + assert_se(x = path_join(a, "y")); + assert_se(mkdir(x, 0755) >= 0); + assert_se(mount_nofollow_verbose(LOG_INFO, "tmpfs", x, "tmpfs", 0, NULL) >= 0); + free(x); + + assert_se(x = path_join(a, "y/yy")); + assert_se(touch(x) >= 0); + free(x); + + assert_se(mkdtemp_malloc(NULL, &b) >= 0); + assert_se(mount_nofollow_verbose(LOG_INFO, "tmpfs", b, "tmpfs", 0, NULL) >= 0); + + assert_se(x = path_join(b, "x")); + assert_se(mkdir(x, 0755) >= 0); + free(x); + + assert_se(x = path_join(b, "y")); + assert_se(mkdir(x, 0755) >= 0); + free(x); + + assert_se(bind_mount_submounts(a, b) >= 0); + + assert_se(x = path_join(b, "foo")); + assert_se(access(x, F_OK) < 0 && errno == ENOENT); + free(x); + + assert_se(x = path_join(b, "x/xx")); + assert_se(access(x, F_OK) >= 0); + free(x); + + assert_se(x = path_join(b, "y/yy")); + assert_se(access(x, F_OK) >= 0); + free(x); + + assert_se(x = path_join(b, "x")); + assert_se(path_is_mount_point(x, NULL, 0) > 0); + free(x); + + assert_se(x = path_join(b, "y")); + assert_se(path_is_mount_point(x, NULL, 0) > 0); + + assert_se(umount_recursive(a, 0) >= 0); + assert_se(umount_recursive(b, 0) >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-mountpoint-util.c b/src/test/test-mountpoint-util.c new file mode 100644 index 0000000..ff447c6 --- /dev/null +++ b/src/test/test-mountpoint-util.c @@ -0,0 +1,434 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "constants.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "log.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static void test_mount_propagation_flag_one(const char *name, int ret, unsigned long expected) { 
+ unsigned long flags; + + log_info("/* %s(%s) */", __func__, strnull(name)); + + assert_se(mount_propagation_flag_from_string(name, &flags) == ret); + + if (ret >= 0) { + const char *c; + + assert_se(flags == expected); + + c = mount_propagation_flag_to_string(flags); + if (isempty(name)) + assert_se(isempty(c)); + else + assert_se(streq(c, name)); + } +} + +TEST(mount_propagation_flag) { + test_mount_propagation_flag_one("shared", 0, MS_SHARED); + test_mount_propagation_flag_one("slave", 0, MS_SLAVE); + test_mount_propagation_flag_one("private", 0, MS_PRIVATE); + test_mount_propagation_flag_one(NULL, 0, 0); + test_mount_propagation_flag_one("", 0, 0); + test_mount_propagation_flag_one("xxxx", -EINVAL, 0); + test_mount_propagation_flag_one(" ", -EINVAL, 0); +} + +TEST(mnt_id) { + _cleanup_fclose_ FILE *f = NULL; + _cleanup_hashmap_free_free_ Hashmap *h = NULL; + char *p; + void *k; + int r; + + assert_se(f = fopen("/proc/self/mountinfo", "re")); + assert_se(h = hashmap_new(&trivial_hash_ops)); + + for (;;) { + _cleanup_free_ char *line = NULL, *path = NULL; + int mnt_id; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r == 0) + break; + assert_se(r > 0); + + assert_se(sscanf(line, "%i %*s %*s %*s %ms", &mnt_id, &path) == 2); +#if HAS_FEATURE_MEMORY_SANITIZER + /* We don't know the length of the string, so we need to unpoison it one char at a time */ + for (const char *c = path; ;c++) { + msan_unpoison(c, 1); + if (!*c) + break; + } +#endif + log_debug("mountinfo: %s → %i", path, mnt_id); + + assert_se(hashmap_put(h, INT_TO_PTR(mnt_id), path) >= 0); + path = NULL; + } + + HASHMAP_FOREACH_KEY(p, k, h) { + int mnt_id = PTR_TO_INT(k), mnt_id2; + const char *q; + + r = path_get_mnt_id(p, &mnt_id2); + if (r < 0) { + log_debug_errno(r, "Failed to get the mnt id of %s: %m", p); + continue; + } + + if (mnt_id == mnt_id2) { + log_debug("mnt ids of %s is %i.", p, mnt_id); + continue; + } else + log_debug("mnt ids of %s are %i (from /proc/self/mountinfo), %i (from 
path_get_mnt_id()).", p, mnt_id, mnt_id2); + + /* The ids don't match? This can easily happen e.g. running with "unshare --mount-proc". + * See #11505. */ + assert_se(q = hashmap_get(h, INT_TO_PTR(mnt_id2))); + + assert_se((r = path_is_mount_point(p, NULL, 0)) >= 0); + if (r == 0) { + /* If the path is not a mount point anymore, then it must be a sub directory of + * the path corresponds to mnt_id2. */ + log_debug("The path %s for mnt id %i is not a mount point.", p, mnt_id2); + assert_se(!isempty(path_startswith(p, q))); + } else { + /* If the path is still a mount point, then it must be equivalent to the path + * corresponds to mnt_id2 */ + log_debug("There are multiple mounts on the same path %s.", p); + assert_se(path_equal(p, q)); + } + } +} + +TEST(path_is_mount_point) { + int fd; + char tmp_dir[] = "/tmp/test-path-is-mount-point-XXXXXX"; + _cleanup_free_ char *file1 = NULL, *file2 = NULL, *link1 = NULL, *link2 = NULL; + _cleanup_free_ char *dir1 = NULL, *dir1file = NULL, *dirlink1 = NULL, *dirlink1file = NULL; + _cleanup_free_ char *dir2 = NULL, *dir2file = NULL; + + assert_se(path_is_mount_point("/", NULL, AT_SYMLINK_FOLLOW) > 0); + assert_se(path_is_mount_point("/", NULL, 0) > 0); + assert_se(path_is_mount_point("//", NULL, AT_SYMLINK_FOLLOW) > 0); + assert_se(path_is_mount_point("//", NULL, 0) > 0); + + assert_se(path_is_mount_point("/proc", NULL, AT_SYMLINK_FOLLOW) > 0); + assert_se(path_is_mount_point("/proc", NULL, 0) > 0); + assert_se(path_is_mount_point("/proc/", NULL, AT_SYMLINK_FOLLOW) > 0); + assert_se(path_is_mount_point("/proc/", NULL, 0) > 0); + + assert_se(path_is_mount_point("/proc/1", NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point("/proc/1", NULL, 0) == 0); + assert_se(path_is_mount_point("/proc/1/", NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point("/proc/1/", NULL, 0) == 0); + + assert_se(path_is_mount_point("/sys", NULL, AT_SYMLINK_FOLLOW) > 0); + assert_se(path_is_mount_point("/sys", NULL, 0) > 0); + 
assert_se(path_is_mount_point("/sys/", NULL, AT_SYMLINK_FOLLOW) > 0); + assert_se(path_is_mount_point("/sys/", NULL, 0) > 0); + + /* we'll create a hierarchy of different kinds of dir/file/link + * layouts: + * + * /file1, /file2 + * /link1 -> file1, /link2 -> file2 + * /dir1/ + * /dir1/file + * /dirlink1 -> dir1 + * /dirlink1file -> dirlink1/file + * /dir2/ + * /dir2/file + */ + + /* file mountpoints */ + assert_se(mkdtemp(tmp_dir) != NULL); + file1 = path_join(tmp_dir, "file1"); + assert_se(file1); + file2 = path_join(tmp_dir, "file2"); + assert_se(file2); + fd = open(file1, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0664); + assert_se(fd > 0); + close(fd); + fd = open(file2, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0664); + assert_se(fd > 0); + close(fd); + link1 = path_join(tmp_dir, "link1"); + assert_se(link1); + assert_se(symlink("file1", link1) == 0); + link2 = path_join(tmp_dir, "link2"); + assert_se(link2); /* check the path that was just allocated, before it is handed to symlink() */ + assert_se(symlink("file2", link2) == 0); + + assert_se(path_is_mount_point(file1, NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point(file1, NULL, 0) == 0); + assert_se(path_is_mount_point(link1, NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point(link1, NULL, 0) == 0); + + /* directory mountpoints */ + dir1 = path_join(tmp_dir, "dir1"); + assert_se(dir1); + assert_se(mkdir(dir1, 0755) == 0); + dirlink1 = path_join(tmp_dir, "dirlink1"); + assert_se(dirlink1); + assert_se(symlink("dir1", dirlink1) == 0); + dirlink1file = path_join(tmp_dir, "dirlink1file"); + assert_se(dirlink1file); + assert_se(symlink("dirlink1/file", dirlink1file) == 0); + dir2 = path_join(tmp_dir, "dir2"); + assert_se(dir2); + assert_se(mkdir(dir2, 0755) == 0); + + assert_se(path_is_mount_point(dir1, NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point(dir1, NULL, 0) == 0); + assert_se(path_is_mount_point(dirlink1, NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point(dirlink1, NULL, 0) == 0); + + /* file in subdirectory mountpoints */ + dir1file =
path_join(dir1, "file"); + assert_se(dir1file); + fd = open(dir1file, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0664); + assert_se(fd > 0); + close(fd); + + assert_se(path_is_mount_point(dir1file, NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point(dir1file, NULL, 0) == 0); + assert_se(path_is_mount_point(dirlink1file, NULL, AT_SYMLINK_FOLLOW) == 0); + assert_se(path_is_mount_point(dirlink1file, NULL, 0) == 0); + + /* these tests will only work as root */ + if (mount(file1, file2, NULL, MS_BIND, NULL) >= 0) { + int rf, rt, rdf, rdt, rlf, rlt, rl1f, rl1t; + const char *file2d; + + /* files */ + /* capture results in vars, to avoid dangling mounts on failure */ + log_info("%s: %s", __func__, file2); + rf = path_is_mount_point(file2, NULL, 0); + rt = path_is_mount_point(file2, NULL, AT_SYMLINK_FOLLOW); + + file2d = strjoina(file2, "/"); + log_info("%s: %s", __func__, file2d); + rdf = path_is_mount_point(file2d, NULL, 0); + rdt = path_is_mount_point(file2d, NULL, AT_SYMLINK_FOLLOW); + + log_info("%s: %s", __func__, link2); + rlf = path_is_mount_point(link2, NULL, 0); + rlt = path_is_mount_point(link2, NULL, AT_SYMLINK_FOLLOW); + + assert_se(umount(file2) == 0); + + assert_se(rf == 1); + assert_se(rt == 1); + assert_se(rdf == -ENOTDIR); + assert_se(rdt == -ENOTDIR); + assert_se(rlf == 0); + assert_se(rlt == 1); + + /* dirs */ + dir2file = path_join(dir2, "file"); + assert_se(dir2file); + fd = open(dir2file, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0664); + assert_se(fd > 0); + close(fd); + + assert_se(mount(dir2, dir1, NULL, MS_BIND, NULL) >= 0); + + log_info("%s: %s", __func__, dir1); + rf = path_is_mount_point(dir1, NULL, 0); + rt = path_is_mount_point(dir1, NULL, AT_SYMLINK_FOLLOW); + log_info("%s: %s", __func__, dirlink1); + rlf = path_is_mount_point(dirlink1, NULL, 0); + rlt = path_is_mount_point(dirlink1, NULL, AT_SYMLINK_FOLLOW); + log_info("%s: %s", __func__, dirlink1file); + /* its parent is a mount point, but not /file itself */ + rl1f = 
path_is_mount_point(dirlink1file, NULL, 0); + rl1t = path_is_mount_point(dirlink1file, NULL, AT_SYMLINK_FOLLOW); + + assert_se(umount(dir1) == 0); + + assert_se(rf == 1); + assert_se(rt == 1); + assert_se(rlf == 0); + assert_se(rlt == 1); + assert_se(rl1f == 0); + assert_se(rl1t == 0); + + } else + log_info("Skipping bind mount file test"); + + assert_se(rm_rf(tmp_dir, REMOVE_ROOT|REMOVE_PHYSICAL) == 0); +} + +TEST(fd_is_mount_point) { + _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL; + _cleanup_close_ int fd = -EBADF; + int r; + + fd = open("/", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY); + assert_se(fd >= 0); + + /* Not allowed, since "/" is a path, not a plain filename */ + assert_se(fd_is_mount_point(fd, "/", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, ".", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "./", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "..", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "../", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "/proc", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "/proc/", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "proc/sys", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "proc/sys/", 0) == -EINVAL); + + /* This one definitely is a mount point */ + assert_se(fd_is_mount_point(fd, "proc", 0) > 0); + assert_se(fd_is_mount_point(fd, "proc/", 0) > 0); + + safe_close(fd); + fd = open("/tmp", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY); + assert_se(fd >= 0); + + assert_se(mkdtemp_malloc("/tmp/not-mounted-XXXXXX", &tmpdir) >= 0); + assert_se(fd_is_mount_point(fd, basename(tmpdir), 0) == 0); + assert_se(fd_is_mount_point(fd, strjoina(basename(tmpdir), "/"), 0) == 0); + + safe_close(fd); + fd = open("/proc", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY); + assert_se(fd >= 0); + + assert_se(fd_is_mount_point(fd, NULL, 0) > 0); + assert_se(fd_is_mount_point(fd, "", 0) == -EINVAL); + assert_se(fd_is_mount_point(fd, "version", 0) == 0); + 
+        safe_close(fd);
+        fd = open("/proc/version", O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        assert_se(fd >= 0);
+
+        r = fd_is_mount_point(fd, NULL, 0);
+        assert_se(IN_SET(r, 0, -ENOTDIR)); /* on old kernels we can't determine if regular files are mount points if we have no directory fd */
+        assert_se(fd_is_mount_point(fd, "", 0) == -EINVAL);
+}
+
+TEST(ms_nosymfollow_supported) {
+        log_info("MS_NOSYMFOLLOW supported: %s", yes_no(ms_nosymfollow_supported()));
+}
+
+TEST(mount_option_supported) {
+        int r;
+
+        r = mount_option_supported("tmpfs", "size", "64M");
+        log_info("tmpfs supports size=64M: %s (%i)", r < 0 ? "don't know" : yes_no(r), r);
+        assert_se(r > 0 || r == -EAGAIN || (r < 0 && ERRNO_IS_PRIVILEGE(r)));
+
+        r = mount_option_supported("ext4", "discard", NULL);
+        log_info("ext4 supports discard: %s (%i)", r < 0 ? "don't know" : yes_no(r), r);
+        assert_se(r > 0 || r == -EAGAIN || (r < 0 && ERRNO_IS_PRIVILEGE(r)));
+
+        r = mount_option_supported("tmpfs", "idontexist", "64M");
+        log_info("tmpfs supports idontexist: %s (%i)", r < 0 ? "don't know" : yes_no(r), r);
+        assert_se(r == 0 || r == -EAGAIN || (r < 0 && ERRNO_IS_PRIVILEGE(r)));
+
+        r = mount_option_supported("tmpfs", "ialsodontexist", NULL);
+        log_info("tmpfs supports ialsodontexist: %s (%i)", r < 0 ? "don't know" : yes_no(r), r);
+        assert_se(r == 0 || r == -EAGAIN || (r < 0 && ERRNO_IS_PRIVILEGE(r)));
+
+        r = mount_option_supported("proc", "hidepid", "1");
+        log_info("proc supports hidepid=1: %s (%i)", r < 0 ? "don't know" : yes_no(r), r);
+        assert_se(r >= 0 || r == -EAGAIN || (r < 0 && ERRNO_IS_PRIVILEGE(r)));
+}
+
+TEST(fstype_can_discard) {
+        assert_se(fstype_can_discard("ext4"));
+        assert_se(!fstype_can_discard("squashfs"));
+        assert_se(!fstype_can_discard("iso9660"));
+}
+
+TEST(fstype_can_norecovery) {
+        assert_se(fstype_can_norecovery("ext4"));
+        assert_se(!fstype_can_norecovery("vfat"));
+        assert_se(!fstype_can_norecovery("tmpfs"));
+}
+
+TEST(fstype_can_umask) {
+        assert_se(fstype_can_umask("vfat"));
+        assert_se(!fstype_can_umask("tmpfs"));
+}
+
+TEST(path_get_mnt_id_at_null) { /* every spelling of /run, absolute or relative to a dir fd, must yield the same mount ID */
+        _cleanup_close_ int root_fd = -EBADF, run_fd = -EBADF;
+        int id1, id2;
+
+        assert_se(path_get_mnt_id_at(AT_FDCWD, "/run/", &id1) >= 0);
+        assert_se(id1 > 0);
+
+        assert_se(path_get_mnt_id_at(AT_FDCWD, "/run", &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1; /* reset so a lookup that leaves id2 untouched cannot pass the next comparison */
+
+        root_fd = open("/", O_DIRECTORY|O_CLOEXEC);
+        assert_se(root_fd >= 0);
+
+        assert_se(path_get_mnt_id_at(root_fd, "/run/", &id2) >= 0);
+        assert_se(id1 == id2); /* compare, don't assign: '=' would overwrite id1 and pass for any non-zero id2 */
+        id2 = -1;
+
+        assert_se(path_get_mnt_id_at(root_fd, "/run", &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1;
+
+        assert_se(path_get_mnt_id_at(root_fd, "run", &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1;
+
+        assert_se(path_get_mnt_id_at(root_fd, "run/", &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1;
+
+        run_fd = openat(root_fd, "run", O_DIRECTORY|O_CLOEXEC);
+        assert_se(run_fd >= 0);
+
+        id2 = -1;
+        assert_se(path_get_mnt_id_at(run_fd, "", &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1;
+
+        assert_se(path_get_mnt_id_at(run_fd, NULL, &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1;
+
+        assert_se(path_get_mnt_id_at(run_fd, ".", &id2) >= 0);
+        assert_se(id1 == id2);
+        id2 = -1;
+}
+
+static int intro(void) {
+        /* let's move into our own mount namespace with all propagation from the host turned off, so
+         * that /proc/self/mountinfo is static and constant for the whole time our test runs.
*/ + + if (unshare(CLONE_NEWNS) < 0) { + if (!ERRNO_IS_PRIVILEGE(errno)) + return log_error_errno(errno, "Failed to detach mount namespace: %m"); + + log_notice("Lacking privilege to create separate mount namespace, proceeding in originating mount namespace."); + } else + assert_se(mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL) >= 0); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c new file mode 100644 index 0000000..65d0825 --- /dev/null +++ b/src/test/test-namespace.c @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "namespace.h" +#include "process-util.h" +#include "string-util.h" +#include "tests.h" +#include "user-util.h" +#include "virt.h" + +TEST(namespace_cleanup_tmpdir) { + { + _cleanup_(namespace_cleanup_tmpdirp) char *dir; + assert_se(dir = strdup(RUN_SYSTEMD_EMPTY)); + } + + { + _cleanup_(namespace_cleanup_tmpdirp) char *dir; + assert_se(dir = strdup("/tmp/systemd-test-namespace.XXXXXX")); + assert_se(mkdtemp(dir)); + } +} + +static void test_tmpdir_one(const char *id, const char *A, const char *B) { + _cleanup_free_ char *a, *b; + struct stat x, y; + char *c, *d; + + assert_se(setup_tmp_dirs(id, &a, &b) == 0); + + assert_se(stat(a, &x) >= 0); + assert_se(stat(b, &y) >= 0); + + assert_se(S_ISDIR(x.st_mode)); + assert_se(S_ISDIR(y.st_mode)); + + if (!streq(a, RUN_SYSTEMD_EMPTY)) { + assert_se(startswith(a, A)); + assert_se((x.st_mode & 01777) == 0700); + c = strjoina(a, "/tmp"); + assert_se(stat(c, &x) >= 0); + assert_se(S_ISDIR(x.st_mode)); + assert_se(FLAGS_SET(x.st_mode, 01777)); + assert_se(rmdir(c) >= 0); + assert_se(rmdir(a) >= 0); + } + + if (!streq(b, RUN_SYSTEMD_EMPTY)) { + assert_se(startswith(b, B)); + assert_se((y.st_mode & 01777) == 0700); + d = strjoina(b, "/tmp"); + assert_se(stat(d, &y) >= 0); + assert_se(S_ISDIR(y.st_mode)); + 
assert_se(FLAGS_SET(y.st_mode, 01777));
+                assert_se(rmdir(d) >= 0);
+                assert_se(rmdir(b) >= 0);
+        }
+}
+
+TEST(tmpdir) {
+        _cleanup_free_ char *x = NULL, *y = NULL, *z = NULL, *zz = NULL;
+        sd_id128_t bid;
+
+        assert_se(sd_id128_get_boot(&bid) >= 0);
+
+        x = strjoin("/tmp/systemd-private-", SD_ID128_TO_STRING(bid), "-abcd.service-");
+        y = strjoin("/var/tmp/systemd-private-", SD_ID128_TO_STRING(bid), "-abcd.service-");
+        assert_se(x && y);
+
+        test_tmpdir_one("abcd.service", x, y);
+
+        z = strjoin("/tmp/systemd-private-", SD_ID128_TO_STRING(bid), "-sys-devices-pci0000:00-0000:00:1a.0-usb3-3\\x2d1-3\\x2d1:1.0-bluetooth-hci0.device-");
+        zz = strjoin("/var/tmp/systemd-private-", SD_ID128_TO_STRING(bid), "-sys-devices-pci0000:00-0000:00:1a.0-usb3-3\\x2d1-3\\x2d1:1.0-bluetooth-hci0.device-");
+
+        assert_se(z && zz);
+
+        test_tmpdir_one("sys-devices-pci0000:00-0000:00:1a.0-usb3-3\\x2d1-3\\x2d1:1.0-bluetooth-hci0.device", z, zz);
+}
+
+static void test_shareable_ns(unsigned long nsflag) { /* forks three children that each call setup_shareable_ns() on one shared socket pair */
+        _cleanup_close_pair_ int s[2] = EBADF_PAIR;
+        pid_t pid1, pid2, pid3;
+        int r, n = 0;
+        siginfo_t si;
+
+        if (geteuid() > 0) {
+                (void) log_tests_skipped("not root");
+                return;
+        }
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, s) >= 0);
+
+        pid1 = fork();
+        assert_se(pid1 >= 0);
+
+        if (pid1 == 0) {
+                r = setup_shareable_ns(s, nsflag);
+                assert_se(r >= 0);
+                _exit(r); /* the child reports setup_shareable_ns()'s result as its exit status */
+        }
+
+        pid2 = fork();
+        assert_se(pid2 >= 0);
+
+        if (pid2 == 0) {
+                r = setup_shareable_ns(s, nsflag);
+                assert_se(r >= 0);
+                _exit(r); /* _exit(), like the first child: don't run the parent's exit handlers in a forked child */
+        }
+
+        pid3 = fork();
+        assert_se(pid3 >= 0);
+
+        if (pid3 == 0) {
+                r = setup_shareable_ns(s, nsflag);
+                assert_se(r >= 0);
+                _exit(r); /* see above */
+        }
+
+        r = wait_for_terminate(pid1, &si);
+        assert_se(r >= 0);
+        assert_se(si.si_code == CLD_EXITED);
+        n += si.si_status;
+
+        r = wait_for_terminate(pid2, &si);
+        assert_se(r >= 0);
+        assert_se(si.si_code == CLD_EXITED);
+        n += si.si_status;
+
+        r = wait_for_terminate(pid3, &si);
+        assert_se(r >= 0);
+        assert_se(si.si_code == CLD_EXITED);
+        n += si.si_status;
+
+        assert_se(n == 1); /* the three exit statuses must add up to exactly 1 */
+}
+
+TEST(netns) {
+        test_shareable_ns(CLONE_NEWNET);
+}
+
+TEST(ipcns) {
+        test_shareable_ns(CLONE_NEWIPC);
+}
+
+TEST(protect_kernel_logs) {
+        static const NamespaceParameters p = {
+                .runtime_scope = RUNTIME_SCOPE_SYSTEM,
+                .protect_kernel_logs = true,
+        };
+        pid_t pid;
+        int r;
+
+        if (geteuid() > 0) {
+                (void) log_tests_skipped("not root");
+                return;
+        }
+
+        /* In a container we likely don't have access to /dev/kmsg */
+        if (detect_container() > 0) {
+                (void) log_tests_skipped("in container");
+                return;
+        }
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                _cleanup_close_ int fd = -EBADF;
+
+                fd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC);
+                assert_se(fd >= 0); /* 0 is a valid descriptor; only a negative return means failure */
+
+                r = setup_namespace(&p, NULL);
+                assert_se(r == 0);
+
+                assert_se(setresuid(UID_NOBODY, UID_NOBODY, UID_NOBODY) >= 0);
+                assert_se(open("/dev/kmsg", O_RDONLY | O_CLOEXEC) < 0); /* after namespace setup and dropping to nobody, reopening must fail */
+                assert_se(errno == EACCES);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        assert_se(wait_for_terminate_and_check("ns-kernellogs", pid, WAIT_LOG) == EXIT_SUCCESS);
+}
+
+static int intro(void) {
+        if (!have_namespaces())
+                return log_tests_skipped("Don't have namespace support");
+
+        return EXIT_SUCCESS;
+}
+
+DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro);
diff --git a/src/test/test-net-naming-scheme.c b/src/test/test-net-naming-scheme.c
new file mode 100644
index 0000000..f7ec5a6
--- /dev/null
+++ b/src/test/test-net-naming-scheme.c
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "netif-naming-scheme.h"
+#include "string-util.h"
+#include "tests.h"
+
+#ifdef _DEFAULT_NET_NAMING_SCHEME
+/* The primary purpose of this check is to verify that _DEFAULT_NET_NAMING_SCHEME_TEST
+ * is a valid identifier. If an invalid name is given during configuration, this will
+ * fail with a name error.
*/ +assert_cc(_DEFAULT_NET_NAMING_SCHEME >= 0); +#endif + +TEST(default_net_naming_scheme) { + const NamingScheme *n; + assert_se(n = naming_scheme_from_name(DEFAULT_NET_NAMING_SCHEME)); + log_info("default → %s", n->name); + + assert_se(naming_scheme_from_name(n->name) == n); +} + +TEST(naming_scheme_conversions) { + const NamingScheme *n; + assert_se(n = naming_scheme_from_name("latest")); + log_info("latest → %s", n->name); + + assert_se(n = naming_scheme_from_name("v238")); + assert_se(streq(n->name, "v238")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-netlink-manual.c b/src/test/test-netlink-manual.c new file mode 100644 index 0000000..6543c61 --- /dev/null +++ b/src/test/test-netlink-manual.c @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-netlink.h" + +#include "macro.h" +#include "module-util.h" +#include "tests.h" + +static int load_module(const char *mod_name) { + _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL; + _cleanup_(kmod_module_unref_listp) struct kmod_list *list = NULL; + struct kmod_list *l; + int r; + + ctx = kmod_new(NULL, NULL); + if (!ctx) + return log_oom(); + + r = kmod_module_new_from_lookup(ctx, mod_name, &list); + if (r < 0) + return r; + + kmod_list_foreach(l, list) { + _cleanup_(kmod_module_unrefp) struct kmod_module *mod = NULL; + + mod = kmod_module_get_module(l); + + r = kmod_module_probe_insert_module(mod, 0, NULL, NULL, NULL, NULL); + if (r > 0) + r = -EINVAL; + } + + return r; +} + +static int test_tunnel_configure(sd_netlink *rtnl) { + int r; + sd_netlink_message *m, *n; + struct in_addr local, remote; + + /* skip test if module cannot be loaded */ + r = load_module("ipip"); + if (r < 0) + return log_tests_skipped_errno(r, "failed to load module 'ipip'"); + + r = load_module("sit"); + if (r < 0) + return log_tests_skipped_errno(r, "failed to load module 'sit'"); + + if (getuid() != 0) + return log_tests_skipped("not 
root"); + + /* IPIP tunnel */ + assert_se(sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0) >= 0); + assert_se(m); + + assert_se(sd_netlink_message_append_string(m, IFLA_IFNAME, "ipip-tunnel") >= 0); + assert_se(sd_netlink_message_append_u32(m, IFLA_MTU, 1234)>= 0); + + assert_se(sd_netlink_message_open_container(m, IFLA_LINKINFO) >= 0); + + assert_se(sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipip") >= 0); + + inet_pton(AF_INET, "192.168.21.1", &local.s_addr); + assert_se(sd_netlink_message_append_u32(m, IFLA_IPTUN_LOCAL, local.s_addr) >= 0); + + inet_pton(AF_INET, "192.168.21.2", &remote.s_addr); + assert_se(sd_netlink_message_append_u32(m, IFLA_IPTUN_REMOTE, remote.s_addr) >= 0); + + assert_se(sd_netlink_message_close_container(m) >= 0); + assert_se(sd_netlink_message_close_container(m) >= 0); + + assert_se(sd_netlink_call(rtnl, m, -1, 0) == 1); + + assert_se((m = sd_netlink_message_unref(m)) == NULL); + + /* sit */ + assert_se(sd_rtnl_message_new_link(rtnl, &n, RTM_NEWLINK, 0) >= 0); + assert_se(n); + + assert_se(sd_netlink_message_append_string(n, IFLA_IFNAME, "sit-tunnel") >= 0); + assert_se(sd_netlink_message_append_u32(n, IFLA_MTU, 1234)>= 0); + + assert_se(sd_netlink_message_open_container(n, IFLA_LINKINFO) >= 0); + + assert_se(sd_netlink_message_open_container_union(n, IFLA_INFO_DATA, "sit") >= 0); + + assert_se(sd_netlink_message_append_u8(n, IFLA_IPTUN_PROTO, IPPROTO_IPIP) >= 0); + + inet_pton(AF_INET, "192.168.21.3", &local.s_addr); + assert_se(sd_netlink_message_append_u32(n, IFLA_IPTUN_LOCAL, local.s_addr) >= 0); + + inet_pton(AF_INET, "192.168.21.4", &remote.s_addr); + assert_se(sd_netlink_message_append_u32(n, IFLA_IPTUN_REMOTE, remote.s_addr) >= 0); + + assert_se(sd_netlink_message_close_container(n) >= 0); + assert_se(sd_netlink_message_close_container(n) >= 0); + + assert_se(sd_netlink_call(rtnl, n, -1, 0) == 1); + + assert_se((n = sd_netlink_message_unref(n)) == NULL); + + return EXIT_SUCCESS; +} + +int main(int argc, char 
*argv[]) { + sd_netlink *rtnl; + int r; + + test_setup_logging(LOG_INFO); + + assert_se(sd_netlink_open(&rtnl) >= 0); + assert_se(rtnl); + + r = test_tunnel_configure(rtnl); + + assert_se((rtnl = sd_netlink_unref(rtnl)) == NULL); + + return r; +} diff --git a/src/test/test-nft-set.c b/src/test/test-nft-set.c new file mode 100644 index 0000000..bb0c902 --- /dev/null +++ b/src/test/test-nft-set.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "firewall-util.h" +#include "in-addr-util.h" +#include "log.h" +#include "netlink-internal.h" +#include "parse-util.h" +#include "string-util.h" +#include "tests.h" + +int main(int argc, char **argv) { + int r; + + assert_se(argc == 7); + + test_setup_logging(LOG_DEBUG); + + if (getuid() != 0) + return log_tests_skipped("not root"); + + int nfproto; + nfproto = nfproto_from_string(argv[2]); + assert_se(nfproto_is_valid(nfproto)); + + const char *table = argv[3], *set = argv[4]; + + FirewallContext *ctx; + r = fw_ctx_new(&ctx); + assert_se(r == 0); + + bool add; + if (streq(argv[1], "add")) + add = true; + else + add = false; + + if (streq(argv[5], "uint32")) { + uint32_t element; + + r = safe_atou32(argv[6], &element); + assert_se(r == 0); + + r = nft_set_element_modify_any(ctx, add, nfproto, table, set, &element, sizeof(element)); + assert_se(r == 0); + } else if (streq(argv[5], "uint64")) { + uint64_t element; + + r = safe_atou64(argv[6], &element); + assert_se(r == 0); + + r = nft_set_element_modify_any(ctx, add, nfproto, table, set, &element, sizeof(element)); + assert_se(r == 0); + } else if (streq(argv[5], "in_addr")) { + union in_addr_union addr; + int af; + + r = in_addr_from_string_auto(argv[6], &af, &addr); + assert_se(r == 0); + + r = nft_set_element_modify_ip(ctx, add, nfproto, af, table, set, &addr); + assert_se(r == 0); + } else if (streq(argv[5], "network")) { + union in_addr_union addr; + int af; + unsigned char prefixlen; + + r = 
in_addr_prefix_from_string_auto(argv[6], &af, &addr, &prefixlen); + assert_se(r == 0); + + r = nft_set_element_modify_iprange(ctx, add, nfproto, af, table, set, &addr, prefixlen); + assert_se(r == 0); + } + + return 0; +} diff --git a/src/test/test-ns.c b/src/test/test-ns.c new file mode 100644 index 0000000..97b9fc9 --- /dev/null +++ b/src/test/test-ns.c @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "log.h" +#include "namespace.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + const char * const writable[] = { + "/home", + "-/home/lennart/projects/foobar", /* this should be masked automatically */ + NULL + }; + + const char * const readonly[] = { + /* "/", */ + /* "/usr", */ + "/boot", + "/lib", + "/usr/lib", + "-/lib64", + "-/usr/lib64", + NULL + }; + + const char * const exec[] = { + "/lib", + "/usr", + "-/lib64", + "-/usr/lib64", + NULL + }; + + const char * const no_exec[] = { + "/var", + NULL + }; + + const char *inaccessible[] = { + "/home/lennart/projects", + NULL + }; + + static const BindMount bind_mount = { + .source = (char*) "/usr/bin", + .destination = (char*) "/etc/systemd", + .read_only = true, + }; + + static const TemporaryFileSystem tmpfs = { + .path = (char*) "/var", + .options = (char*) "ro", + }; + + char *root_directory; + char *projects_directory; + int r; + char tmp_dir[] = "/tmp/systemd-private-XXXXXX", + var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX"; + + test_setup_logging(LOG_DEBUG); + + assert_se(mkdtemp(tmp_dir)); + assert_se(mkdtemp(var_tmp_dir)); + + root_directory = getenv("TEST_NS_CHROOT"); + projects_directory = getenv("TEST_NS_PROJECTS"); + + if (projects_directory) + inaccessible[0] = projects_directory; + + log_info("Inaccessible directory: '%s'", inaccessible[0]); + if (root_directory) + log_info("Chroot: '%s'", root_directory); + else + log_info("Not chrooted"); + + NamespaceParameters p = { + .runtime_scope = RUNTIME_SCOPE_SYSTEM, + + 
.root_directory = root_directory, + + .read_write_paths = (char**) writable, + .read_only_paths = (char**) readonly, + .inaccessible_paths = (char**) inaccessible, + + .exec_paths = (char**) exec, + .no_exec_paths = (char**) no_exec, + + .tmp_dir = tmp_dir, + .var_tmp_dir = var_tmp_dir, + + .bind_mounts = &bind_mount, + .n_bind_mounts = 1, + + .temporary_filesystems = &tmpfs, + .n_temporary_filesystems = 1, + + .private_dev = true, + .protect_control_groups = true, + .protect_kernel_tunables = true, + .protect_kernel_modules = true, + .protect_proc = PROTECT_PROC_NOACCESS, + .proc_subset = PROC_SUBSET_PID, + }; + + r = setup_namespace(&p, NULL); + if (r < 0) { + log_error_errno(r, "Failed to set up namespace: %m"); + + log_info("Usage:\n" + " sudo TEST_NS_PROJECTS=/home/lennart/projects ./test-ns\n" + " sudo TEST_NS_CHROOT=/home/alban/debian-tree TEST_NS_PROJECTS=/home/alban/debian-tree/home/alban/Documents ./test-ns"); + + return 1; + } + + execl("/bin/sh", "/bin/sh", NULL); + log_error_errno(errno, "execl(): %m"); + + return 1; +} diff --git a/src/test/test-nscd-flush.c b/src/test/test-nscd-flush.c new file mode 100644 index 0000000..1a5a808 --- /dev/null +++ b/src/test/test-nscd-flush.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "main-func.h" +#include "nscd-flush.h" +#include "strv.h" +#include "tests.h" + +static int run(int argc, char *argv[]) { + int r; + + test_setup_logging(LOG_DEBUG); + + r = nscd_flush_cache(STRV_MAKE("group", "passwd", "hosts")); + if (r < 0) + return log_error_errno(r, "Failed to flush NSCD cache"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-nss-hosts.c b/src/test/test-nss-hosts.c new file mode 100644 index 0000000..72a9c64 --- /dev/null +++ b/src/test/test-nss-hosts.c @@ -0,0 +1,495 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "dlfcn-util.h" +#include "env-util.h" 
+#include "errno-list.h" +#include "format-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "in-addr-util.h" +#include "local-addresses.h" +#include "log.h" +#include "main-func.h" +#include "nss-test-util.h" +#include "nss-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +static size_t arg_bufsize = 1024; + +static const char* af_to_string(int family, char *buf, size_t buf_len) { + const char *name; + + if (family == AF_UNSPEC) + return "*"; + + name = af_to_name(family); + if (name) + return name; + + (void) snprintf(buf, buf_len, "%i", family); + return buf; +} + +static int print_gaih_addrtuples(const struct gaih_addrtuple *tuples) { + int r, n = 0; + + for (const struct gaih_addrtuple *it = tuples; it; it = it->next) { + _cleanup_free_ char *a = NULL; + union in_addr_union u; + char family_name[DECIMAL_STR_MAX(int)]; + + memcpy(&u, it->addr, 16); + r = in_addr_to_string(it->family, &u, &a); + assert_se(IN_SET(r, 0, -EAFNOSUPPORT)); + if (r == -EAFNOSUPPORT) + assert_se(a = hexmem(it->addr, 16)); + + log_info(" \"%s\" %s %s %s", + it->name, + af_to_string(it->family, family_name, sizeof family_name), + a, + FORMAT_IFNAME_FULL(it->scopeid, FORMAT_IFNAME_IFINDEX_WITH_PERCENT)); + + n++; + } + return n; +} + +static void print_struct_hostent(struct hostent *host, const char *canon) { + log_info(" \"%s\"", host->h_name); + STRV_FOREACH(s, host->h_aliases) + log_info(" alias \"%s\"", *s); + STRV_FOREACH(s, host->h_addr_list) { + union in_addr_union u; + _cleanup_free_ char *a = NULL; + char family_name[DECIMAL_STR_MAX(int)]; + int r; + + assert_se((unsigned) host->h_length == FAMILY_ADDRESS_SIZE(host->h_addrtype)); + memcpy(&u, *s, host->h_length); + r = in_addr_to_string(host->h_addrtype, &u, &a); + assert_se(r == 0); + log_info(" %s %s", + af_to_string(host->h_addrtype, family_name, sizeof family_name), + a); + } + if 
(canon) + log_info(" canonical: \"%s\"", canon); +} + +static void test_gethostbyname4_r(void *handle, const char *module, const char *name) { + const char *fname; + _nss_gethostbyname4_r_t f; + char buffer[arg_bufsize]; + struct gaih_addrtuple *pat = NULL; + int errno1 = 999, errno2 = 999; /* nss-dns doesn't set those */ + int32_t ttl = INT32_MAX; /* nss-dns wants to return the lowest ttl, + and will access this variable through *ttlp, + so we need to set it to something. + I'm not sure if this is a bug in nss-dns + or not. */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + int n; + + fname = strjoina("_nss_", module, "_gethostbyname4_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(name, &pat, buffer, sizeof buffer, &errno1, &errno2, &ttl); + if (status == NSS_STATUS_SUCCESS) { + log_info("%s(\"%s\") → status=%s%-20spat=buffer+0x%"PRIxPTR" errno=%d/%s h_errno=%d/%s ttl=%"PRIi32, + fname, name, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + pat ? 
(uintptr_t) pat - (uintptr_t) buffer : 0, + errno1, errno_to_name(errno1) ?: "---", + errno2, hstrerror(errno2), + ttl); + n = print_gaih_addrtuples(pat); + } else { + log_info("%s(\"%s\") → status=%s%-20spat=0x%p errno=%d/%s h_errno=%d/%s", + fname, name, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + pat, + errno1, errno_to_name(errno1) ?: "---", + errno2, hstrerror(errno2)); + n = 0; + } + + if (STR_IN_SET(module, "resolve", "mymachines") && status == NSS_STATUS_UNAVAIL) + return; + + if (streq(name, "localhost")) { + if (streq(module, "myhostname")) { + assert_se(status == NSS_STATUS_SUCCESS); + assert_se(n == socket_ipv6_is_enabled() + 1); + + } else if (streq(module, "resolve") && getenv_bool_secure("SYSTEMD_NSS_RESOLVE_SYNTHESIZE") != 0) { + assert_se(status == NSS_STATUS_SUCCESS); + if (socket_ipv6_is_enabled()) + assert_se(n == 2); + else + assert_se(n <= 2); /* Even if IPv6 is disabled, /etc/hosts may contain ::1. */ + } + } +} + +static void test_gethostbyname3_r(void *handle, const char *module, const char *name, int af) { + const char *fname; + _nss_gethostbyname3_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999, errno2 = 999; /* nss-dns doesn't set those */ + int32_t ttl = INT32_MAX; /* nss-dns wants to return the lowest ttl, + and will access this variable through *ttlp, + so we need to set it to something. + I'm not sure if this is a bug in nss-dns + or not. 
*/ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct hostent host; + char *canon; + char family_name[DECIMAL_STR_MAX(int)]; + + fname = strjoina("_nss_", module, "_gethostbyname3_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(name, af, &host, buffer, sizeof buffer, &errno1, &errno2, &ttl, &canon); + log_info("%s(\"%s\", %s) → status=%s%-20serrno=%d/%s h_errno=%d/%s ttl=%"PRIi32, + fname, name, af_to_string(af, family_name, sizeof family_name), + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---", + errno2, hstrerror(errno2), + ttl); + if (status == NSS_STATUS_SUCCESS) + print_struct_hostent(&host, canon); +} + +static void test_gethostbyname2_r(void *handle, const char *module, const char *name, int af) { + const char *fname; + _nss_gethostbyname2_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999, errno2 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct hostent host; + char family_name[DECIMAL_STR_MAX(int)]; + + fname = strjoina("_nss_", module, "_gethostbyname2_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(name, af, &host, buffer, sizeof buffer, &errno1, &errno2); + log_info("%s(\"%s\", %s) → status=%s%-20serrno=%d/%s h_errno=%d/%s", + fname, name, af_to_string(af, family_name, sizeof family_name), + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---", + errno2, hstrerror(errno2)); + if (status == NSS_STATUS_SUCCESS) + print_struct_hostent(&host, NULL); +} + +static void test_gethostbyname_r(void *handle, const char *module, const char *name) { + const char *fname; + 
_nss_gethostbyname_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999, errno2 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct hostent host; + + fname = strjoina("_nss_", module, "_gethostbyname_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(name, &host, buffer, sizeof buffer, &errno1, &errno2); + log_info("%s(\"%s\") → status=%s%-20serrno=%d/%s h_errno=%d/%s", + fname, name, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---", + errno2, hstrerror(errno2)); + if (status == NSS_STATUS_SUCCESS) + print_struct_hostent(&host, NULL); +} + +static void test_gethostbyaddr2_r(void *handle, + const char *module, + const void* addr, socklen_t len, + int af) { + + const char *fname; + _nss_gethostbyaddr2_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999, errno2 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct hostent host; + int32_t ttl = INT32_MAX; + _cleanup_free_ char *addr_pretty = NULL; + + fname = strjoina("_nss_", module, "_gethostbyaddr2_r"); + f = dlsym(handle, fname); + + log_full_errno(f ? 
LOG_DEBUG : LOG_INFO, errno, + "dlsym(0x%p, %s) → 0x%p: %m", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + assert_se(in_addr_to_string(af, addr, &addr_pretty) >= 0); + + status = f(addr, len, af, &host, buffer, sizeof buffer, &errno1, &errno2, &ttl); + log_info("%s(\"%s\") → status=%s%-20serrno=%d/%s h_errno=%d/%s ttl=%"PRIi32, + fname, addr_pretty, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---", + errno2, hstrerror(errno2), + ttl); + if (status == NSS_STATUS_SUCCESS) + print_struct_hostent(&host, NULL); +} + +static void test_gethostbyaddr_r(void *handle, + const char *module, + const void* addr, socklen_t len, + int af) { + + const char *fname; + _nss_gethostbyaddr_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999, errno2 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct hostent host; + _cleanup_free_ char *addr_pretty = NULL; + + fname = strjoina("_nss_", module, "_gethostbyaddr_r"); + f = dlsym(handle, fname); + + log_full_errno(f ? 
LOG_DEBUG : LOG_INFO, errno,
+                       "dlsym(0x%p, %s) → 0x%p: %m", handle, fname, f);
+        if (!f) {
+                log_info("%s not defined", fname);
+                return;
+        }
+
+        assert_se(in_addr_to_string(af, addr, &addr_pretty) >= 0);
+
+        status = f(addr, len, af, &host, buffer, sizeof buffer, &errno1, &errno2);
+        log_info("%s(\"%s\") → status=%s%-20serrno=%d/%s h_errno=%d/%s",
+                 fname, addr_pretty,
+                 nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n",
+                 errno1, errno_to_name(errno1) ?: "---",
+                 errno2, hstrerror(errno2));
+        if (status == NSS_STATUS_SUCCESS)
+                print_struct_hostent(&host, NULL);
+}
+
+static void test_byname(void *handle, const char *module, const char *name) {
+        test_gethostbyname4_r(handle, module, name);
+        puts("");
+
+        test_gethostbyname3_r(handle, module, name, AF_INET);
+        puts("");
+        test_gethostbyname3_r(handle, module, name, AF_INET6);
+        puts("");
+        test_gethostbyname3_r(handle, module, name, AF_UNSPEC);
+        puts("");
+        test_gethostbyname3_r(handle, module, name, AF_UNIX);
+        puts("");
+
+        test_gethostbyname2_r(handle, module, name, AF_INET);
+        puts("");
+        test_gethostbyname2_r(handle, module, name, AF_INET6);
+        puts("");
+        test_gethostbyname2_r(handle, module, name, AF_UNSPEC);
+        puts("");
+        test_gethostbyname2_r(handle, module, name, AF_UNIX);
+        puts("");
+
+        test_gethostbyname_r(handle, module, name);
+        puts("");
+}
+
+static void test_byaddr(void *handle,
+                        const char *module,
+                        const void* addr, socklen_t len,
+                        int af) {
+        test_gethostbyaddr2_r(handle, module, addr, len, af);
+        puts("");
+
+        test_gethostbyaddr_r(handle, module, addr, len, af);
+        puts("");
+}
+
+static int make_addresses(struct local_address **addresses) { /* returns the entry count and hands the array to *addresses */
+        int n;
+        _cleanup_free_ struct local_address *addrs = NULL;
+
+        n = local_addresses(NULL, 0, AF_UNSPEC, &addrs);
+        if (n < 0)
+                log_info_errno(n, "Failed to query local addresses: %m");
+        n = n < 0 ? 0 : n; /* a failed query must not leave a negative count: it is used as an index into addrs[] and as the return value */
+        assert_se(GREEDY_REALLOC(addrs, n + 3));
+
+        addrs[n++] = (struct local_address) { .family = AF_INET,
+                                              .address.in = { htobe32(0x7F000001) } };
+        addrs[n++] = (struct local_address) { .family = AF_INET,
+                                              .address.in = { htobe32(0x7F000002) } };
+        addrs[n++] = (struct local_address) { .family = AF_INET6,
+                                              .address.in6 = in6addr_loopback };
+
+        *addresses = TAKE_PTR(addrs);
+        return n;
+}
+
+static int test_one_module(const char *dir,
+                           const char *module,
+                           char **names,
+                           struct local_address *addresses,
+                           int n_addresses) {
+
+        log_info("======== %s ========", module);
+
+        _cleanup_(dlclosep) void *handle = nss_open_handle(dir, module, RTLD_LAZY|RTLD_NODELETE);
+        if (!handle)
+                return -EINVAL;
+
+        STRV_FOREACH(name, names)
+                test_byname(handle, module, *name);
+
+        for (int i = 0; i < n_addresses; i++)
+                test_byaddr(handle, module,
+                            &addresses[i].address,
+                            FAMILY_ADDRESS_SIZE(addresses[i].family),
+                            addresses[i].family);
+
+        log_info(" ");
+        return 0;
+}
+
+static int parse_argv(int argc, char **argv,
+                      char ***the_modules,
+                      char ***the_names,
+                      struct local_address **the_addresses, int *n_addresses) {
+
+        _cleanup_strv_free_ char **modules = NULL, **names = NULL;
+        _cleanup_free_ struct local_address *addrs = NULL;
+        const char *p;
+        int r, n = 0;
+
+        p = getenv("SYSTEMD_TEST_NSS_BUFSIZE");
+        if (p) {
+                r = safe_atozu(p, &arg_bufsize);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse $SYSTEMD_TEST_NSS_BUFSIZE");
+        }
+
+        if (argc > 1)
+                modules = strv_new(argv[1]);
+        else
+                modules = strv_new(
+#if ENABLE_NSS_MYHOSTNAME
+                                "myhostname",
+#endif
+#if ENABLE_NSS_RESOLVE
+                                "resolve",
+#endif
+#if ENABLE_NSS_MYMACHINES
+                                "mymachines",
+#endif
+                                NULL);
+        assert_se(modules);
+
+        if (argc > 2) {
+                int family;
+                union in_addr_union address;
+
+                STRV_FOREACH(name, argv + 2) {
+                        r = in_addr_from_string_auto(*name, &family, &address);
+                        if (r < 0) {
+                                /* assume this is a name */
+                                r = strv_extend(&names, *name);
+                                if (r < 0)
+                                        return r;
+                        } else {
+                                assert_se(GREEDY_REALLOC0(addrs, n + 1));
+
+                                addrs[n++] = (struct local_address) { .family = family,
+                                                                      .address = address };
+                        }
+
} + } else { + _cleanup_free_ char *hostname = NULL; + assert_se(hostname = gethostname_malloc()); + assert_se(names = strv_new("localhost", + "_gateway", + "_outbound", + hostname, + slow_tests_enabled() ? "foo_no_such_host" : NULL)); + + n = make_addresses(&addrs); + assert_se(n >= 0); + } + + *the_modules = TAKE_PTR(modules); + *the_names = TAKE_PTR(names); + *the_addresses = TAKE_PTR(addrs); + *n_addresses = n; + return 0; +} + +static int run(int argc, char **argv) { + _cleanup_free_ char *dir = NULL; + _cleanup_strv_free_ char **modules = NULL, **names = NULL; + _cleanup_free_ struct local_address *addresses = NULL; + int n_addresses = 0; + int r; + + test_setup_logging(LOG_INFO); + + r = parse_argv(argc, argv, &modules, &names, &addresses, &n_addresses); + if (r < 0) + return log_error_errno(r, "Failed to parse arguments: %m"); + + assert_se(path_extract_directory(argv[0], &dir) >= 0); + + STRV_FOREACH(module, modules) { + r = test_one_module(dir, *module, names, addresses, n_addresses); + if (r < 0) + return r; + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-nss-users.c b/src/test/test-nss-users.c new file mode 100644 index 0000000..5178779 --- /dev/null +++ b/src/test/test-nss-users.c @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "dlfcn-util.h" +#include "errno-list.h" +#include "format-util.h" +#include "log.h" +#include "main-func.h" +#include "nss-test-util.h" +#include "nss-util.h" +#include "path-util.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "user-util.h" + +static size_t arg_bufsize = 1024; + +static void print_struct_passwd(const struct passwd *pwd) { + log_info(" \"%s\" / "UID_FMT":"GID_FMT, + pwd->pw_name, pwd->pw_uid, pwd->pw_gid); + log_info(" passwd=\"%s\"", pwd->pw_passwd); + log_info(" gecos=\"%s\"", pwd->pw_gecos); + log_info(" 
dir=\"%s\"", pwd->pw_dir); + log_info(" shell=\"%s\"", pwd->pw_shell); +} + +static void print_struct_group(const struct group *gr) { + _cleanup_free_ char *members = NULL; + + log_info(" \"%s\" / "GID_FMT, + gr->gr_name, gr->gr_gid); + log_info(" passwd=\"%s\"", gr->gr_passwd); + + assert_se(members = strv_join(gr->gr_mem, ", ")); + // FIXME: use shell_maybe_quote(SHELL_ESCAPE_EMPTY) when it becomes available + log_info(" members=%s", members); +} + +static void test_getpwnam_r(void *handle, const char *module, const char *name) { + const char *fname; + _nss_getpwnam_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct passwd pwd; + + fname = strjoina("_nss_", module, "_getpwnam_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(name, &pwd, buffer, sizeof buffer, &errno1); + log_info("%s(\"%s\") → status=%s%-20serrno=%d/%s", + fname, name, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---"); + if (status == NSS_STATUS_SUCCESS) + print_struct_passwd(&pwd); +} + +static void test_getgrnam_r(void *handle, const char *module, const char *name) { + const char *fname; + _nss_getgrnam_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct group gr; + + fname = strjoina("_nss_", module, "_getgrnam_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(name, &gr, buffer, sizeof buffer, &errno1); + log_info("%s(\"%s\") → status=%s%-20serrno=%d/%s", + fname, name, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + 
errno1, errno_to_name(errno1) ?: "---"); + if (status == NSS_STATUS_SUCCESS) + print_struct_group(&gr); +} + +static void test_getpwuid_r(void *handle, const char *module, uid_t uid) { + const char *fname; + _nss_getpwuid_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct passwd pwd; + + fname = strjoina("_nss_", module, "_getpwuid_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(uid, &pwd, buffer, sizeof buffer, &errno1); + log_info("%s("UID_FMT") → status=%s%-20serrno=%d/%s", + fname, uid, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---"); + if (status == NSS_STATUS_SUCCESS) + print_struct_passwd(&pwd); +} + +static void test_getgrgid_r(void *handle, const char *module, gid_t gid) { + const char *fname; + _nss_getgrgid_r_t f; + char buffer[arg_bufsize]; + int errno1 = 999; /* nss-dns doesn't set those */ + enum nss_status status; + char pretty_status[DECIMAL_STR_MAX(enum nss_status)]; + struct group gr; + + fname = strjoina("_nss_", module, "_getgrgid_r"); + f = dlsym(handle, fname); + log_debug("dlsym(0x%p, %s) → 0x%p", handle, fname, f); + if (!f) { + log_info("%s not defined", fname); + return; + } + + status = f(gid, &gr, buffer, sizeof buffer, &errno1); + log_info("%s("GID_FMT") → status=%s%-20serrno=%d/%s", + fname, gid, + nss_status_to_string(status, pretty_status, sizeof pretty_status), "\n", + errno1, errno_to_name(errno1) ?: "---"); + if (status == NSS_STATUS_SUCCESS) + print_struct_group(&gr); +} + +static void test_byname(void *handle, const char *module, const char *name) { + test_getpwnam_r(handle, module, name); + test_getgrnam_r(handle, module, name); + puts(""); +} + +static void test_byuid(void *handle, const char *module, uid_t uid) { + 
test_getpwuid_r(handle, module, uid); + test_getgrgid_r(handle, module, uid); + puts(""); +} + +static int test_one_module(const char *dir, + const char *module, + char **names) { + + log_info("======== %s ========", module); + + _cleanup_(dlclosep) void *handle = nss_open_handle(dir, module, RTLD_LAZY|RTLD_NODELETE); + if (!handle) + return -EINVAL; + + STRV_FOREACH(name, names) + test_byname(handle, module, *name); + + STRV_FOREACH(name, names) { + uid_t uid; + + assert_cc(sizeof(uid_t) == sizeof(uint32_t)); + /* We use safe_atou32 because we don't want to refuse invalid uids. */ + if (safe_atou32(*name, &uid) < 0) + continue; + + test_byuid(handle, module, uid); + } + + log_info(" "); + return 0; +} + +static int parse_argv(int argc, char **argv, + char ***the_modules, + char ***the_names) { + + _cleanup_strv_free_ char **modules = NULL, **names = NULL; + const char *p; + int r; + + p = getenv("SYSTEMD_TEST_NSS_BUFSIZE"); + if (p) { + r = safe_atozu(p, &arg_bufsize); + if (r < 0) + return log_error_errno(r, "Failed to parse $SYSTEMD_TEST_NSS_BUFSIZE"); + } + + if (argc > 1) + modules = strv_new(argv[1]); + else + modules = strv_new( +#if ENABLE_NSS_SYSTEMD + "systemd", +#endif +#if ENABLE_NSS_MYMACHINES + "mymachines", +#endif + NULL); + assert_se(modules); + + if (argc > 2) + names = strv_copy(strv_skip(argv, 2)); + else + names = strv_new("root", + NOBODY_USER_NAME, + "foo_no_such_user", + "0", + "65534"); + assert_se(names); + + *the_modules = TAKE_PTR(modules); + *the_names = TAKE_PTR(names); + return 0; +} + +static int run(int argc, char **argv) { + _cleanup_free_ char *dir = NULL; + _cleanup_strv_free_ char **modules = NULL, **names = NULL; + int r; + + test_setup_logging(LOG_INFO); + + r = parse_argv(argc, argv, &modules, &names); + if (r < 0) + return log_error_errno(r, "Failed to parse arguments: %m"); + + assert_se(path_extract_directory(argv[0], &dir) >= 0); + + STRV_FOREACH(module, modules) { + r = test_one_module(dir, *module, names); + if (r < 0) 
+ return r; + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-nulstr-util.c b/src/test/test-nulstr-util.c new file mode 100644 index 0000000..95c25f1 --- /dev/null +++ b/src/test/test-nulstr-util.c @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "nulstr-util.h" +#include "set.h" +#include "strv.h" +#include "tests.h" + +TEST(strv_split_nulstr) { + _cleanup_strv_free_ char **l = NULL; + const char nulstr[] = "str0\0str1\0str2\0str3\0"; + + l = strv_split_nulstr(nulstr); + assert_se(l); + + assert_se(streq(l[0], "str0")); + assert_se(streq(l[1], "str1")); + assert_se(streq(l[2], "str2")); + assert_se(streq(l[3], "str3")); +} + +#define strv_parse_nulstr_full_one(s, n, e0, e1) \ + ({ \ + _cleanup_strv_free_ char **v0 = NULL, **v1 = NULL; \ + \ + assert_se(v0 = strv_parse_nulstr_full(s, n, false)); \ + assert_se(strv_equal(v0, e0)); \ + assert_se(v1 = strv_parse_nulstr_full(s, n, true)); \ + assert_se(strv_equal(v1, e1)); \ + }) + +TEST(strv_parse_nulstr_full) { + const char nulstr1[] = "hoge\0hoge2\0hoge3\0\0hoge5\0\0xxx"; + const char nulstr2[] = "hoge\0hoge2\0hoge3\0\0hoge5\0\0xxx\0\0\0"; + + strv_parse_nulstr_full_one(nulstr1, sizeof(nulstr1) - 1, + STRV_MAKE("hoge", "hoge2", "hoge3", "", "hoge5", "", "xxx"), + STRV_MAKE("hoge", "hoge2", "hoge3", "", "hoge5", "", "xxx")); + + strv_parse_nulstr_full_one(nulstr2, sizeof(nulstr2) - 1, + STRV_MAKE("hoge", "hoge2", "hoge3", "", "hoge5", "", "xxx", "", ""), + STRV_MAKE("hoge", "hoge2", "hoge3", "", "hoge5", "", "xxx")); + + strv_parse_nulstr_full_one(((const char[0]) {}), 0, + STRV_MAKE_EMPTY, STRV_MAKE_EMPTY); + + strv_parse_nulstr_full_one(((const char[1]) { 0 }), 1, + STRV_MAKE(""), STRV_MAKE_EMPTY); + + strv_parse_nulstr_full_one(((const char[1]) { 'x' }), 1, + STRV_MAKE("x"), STRV_MAKE("x")); + + strv_parse_nulstr_full_one(((const char[2]) { 0, 0 }), 2, + STRV_MAKE("", ""), STRV_MAKE_EMPTY); + + strv_parse_nulstr_full_one(((const 
char[2]) { 'x', 0 }), 2, + STRV_MAKE("x"), STRV_MAKE("x")); + + strv_parse_nulstr_full_one(((const char[3]) { 0, 0, 0 }), 3, + STRV_MAKE("", "", ""), STRV_MAKE_EMPTY); + + strv_parse_nulstr_full_one(((const char[3]) { 'x', 0, 0 }), 3, + STRV_MAKE("x", ""), STRV_MAKE("x")); + + strv_parse_nulstr_full_one(((const char[3]) { 0, 'x', 0 }), 3, + STRV_MAKE("", "x"), STRV_MAKE("", "x")); + + strv_parse_nulstr_full_one(((const char[3]) { 0, 0, 'x' }), 3, + STRV_MAKE("", "", "x"), STRV_MAKE("", "", "x")); + + strv_parse_nulstr_full_one(((const char[3]) { 'x', 'x', 0 }), 3, + STRV_MAKE("xx"), STRV_MAKE("xx")); + + strv_parse_nulstr_full_one(((const char[3]) { 0, 'x', 'x' }), 3, + STRV_MAKE("", "xx"), STRV_MAKE("", "xx")); + + strv_parse_nulstr_full_one(((const char[3]) { 'x', 0, 'x' }), 3, + STRV_MAKE("x", "x"), STRV_MAKE("x", "x")); + + strv_parse_nulstr_full_one(((const char[3]) { 'x', 'x', 'x' }), 3, + STRV_MAKE("xxx"), STRV_MAKE("xxx")); +} + +static void test_strv_make_nulstr_one(char **l) { + _cleanup_free_ char *b = NULL, *c = NULL; + _cleanup_strv_free_ char **q = NULL; + size_t n, m; + unsigned i = 0; + + log_info("/* %s */", __func__); + + assert_se(strv_make_nulstr(l, &b, &n) >= 0); + assert_se(q = strv_parse_nulstr(b, n)); + assert_se(strv_equal(l, q)); + + assert_se(strv_make_nulstr(q, &c, &m) >= 0); + assert_se(memcmp_nn(b, n, c, m) == 0); + + NULSTR_FOREACH(s, b) + assert_se(streq(s, l[i++])); + assert_se(i == strv_length(l)); +} + +TEST(strv_make_nulstr) { + test_strv_make_nulstr_one(NULL); + test_strv_make_nulstr_one(STRV_MAKE(NULL)); + test_strv_make_nulstr_one(STRV_MAKE("foo")); + test_strv_make_nulstr_one(STRV_MAKE("foo", "bar")); + test_strv_make_nulstr_one(STRV_MAKE("foo", "bar", "quuux")); +} + +TEST(set_make_nulstr) { + _cleanup_set_free_free_ Set *set = NULL; + size_t len = 0; + int r; + + { + /* Unallocated and empty set. 
*/ + static const char expect[] = { 0x00, 0x00 }; + _cleanup_free_ char *nulstr = NULL; + + r = set_make_nulstr(set, &nulstr, &len); + assert_se(r == 0); + assert_se(len == 0); + assert_se(memcmp(expect, nulstr, len + 2) == 0); + } + + { + /* Allocated by empty set. */ + static const char expect[] = { 0x00, 0x00 }; + _cleanup_free_ char *nulstr = NULL; + + set = set_new(NULL); + assert_se(set); + + r = set_make_nulstr(set, &nulstr, &len); + assert_se(r == 0); + assert_se(len == 0); + assert_se(memcmp(expect, nulstr, len + 2) == 0); + } + + { + /* Non-empty set. */ + static const char expect[] = { 'a', 'a', 'a', 0x00, 0x00 }; + _cleanup_free_ char *nulstr = NULL; + + assert_se(set_put_strdup(&set, "aaa") >= 0); + + r = set_make_nulstr(set, &nulstr, &len); + assert_se(r == 0); + assert_se(len == 4); + assert_se(memcmp(expect, nulstr, len + 1) == 0); + } +} + +static void test_strv_make_nulstr_binary_one(char **l, const char *b, size_t n) { + _cleanup_strv_free_ char **z = NULL; + _cleanup_free_ char *a = NULL; + size_t m; + + assert_se(strv_make_nulstr(l, &a, &m) >= 0); + assert_se(memcmp_nn(a, m, b, n) == 0); + assert_se(z = strv_parse_nulstr(a, m)); + assert_se(strv_equal(l, z)); +} + +TEST(strv_make_nulstr_binary) { + test_strv_make_nulstr_binary_one(NULL, (const char[0]) {}, 0); + test_strv_make_nulstr_binary_one(STRV_MAKE(NULL), (const char[0]) {}, 0); + test_strv_make_nulstr_binary_one(STRV_MAKE(""), (const char[1]) { 0 }, 1); + test_strv_make_nulstr_binary_one(STRV_MAKE("", ""), (const char[2]) { 0, 0 }, 2); + test_strv_make_nulstr_binary_one(STRV_MAKE("x", ""), (const char[3]) { 'x', 0, 0 }, 3); + test_strv_make_nulstr_binary_one(STRV_MAKE("", "x"), (const char[3]) { 0, 'x', 0 }, 3); + test_strv_make_nulstr_binary_one(STRV_MAKE("", "", ""), (const char[3]) { 0, 0, 0 }, 3); + test_strv_make_nulstr_binary_one(STRV_MAKE("x", "", ""), (const char[4]) { 'x', 0, 0, 0 }, 4); + test_strv_make_nulstr_binary_one(STRV_MAKE("", "x", ""), (const char[4]) { 0, 'x', 0, 0 }, 
4); + test_strv_make_nulstr_binary_one(STRV_MAKE("", "", "x"), (const char[4]) { 0, 0, 'x', 0 }, 4); + test_strv_make_nulstr_binary_one(STRV_MAKE("x", "x", ""), (const char[5]) { 'x', 0, 'x', 0, 0 }, 5); + test_strv_make_nulstr_binary_one(STRV_MAKE("", "x", "x"), (const char[5]) { 0, 'x', 0, 'x', 0 }, 5); + test_strv_make_nulstr_binary_one(STRV_MAKE("x", "", "x"), (const char[5]) { 'x', 0, 0, 'x', 0 }, 5); + test_strv_make_nulstr_binary_one(STRV_MAKE("x", "x", "x"), (const char[6]) { 'x', 0, 'x', 0, 'x', 0 }, 6); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-open-file.c b/src/test/test-open-file.c new file mode 100644 index 0000000..1b938ec --- /dev/null +++ b/src/test/test-open-file.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "open-file.h" +#include "string-util.h" +#include "tests.h" + +TEST(open_file_parse) { + _cleanup_(open_file_freep) OpenFile *of = NULL; + int r; + + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:read-only", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == OPENFILE_READ_ONLY); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "mnt")); + assert_se(of->flags == 0); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == 0); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt::read-only", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "mnt")); + assert_se(of->flags == OPENFILE_READ_ONLY); + + of = open_file_free(of); + r = open_file_parse("../file.dat:file:read-only", &of); + + assert_se(r 
== -EINVAL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:rw", &of); + + assert_se(r == -EINVAL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:append", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == OPENFILE_APPEND); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:truncate", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == OPENFILE_TRUNCATE); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:read-only,append", &of); + + assert_se(r == -EINVAL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:read-only,truncate", &of); + + assert_se(r == -EINVAL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:append,truncate", &of); + + assert_se(r == -EINVAL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:read-only,read-only", &of); + + assert_se(r == -EINVAL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:graceful", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == OPENFILE_GRACEFUL); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:read-only,graceful", &of); + + assert_se(r >= 0); + assert_se(streq(of->path, "/proc/1/ns/mnt")); + assert_se(streq(of->fdname, "host-mount-namespace")); + assert_se(of->flags == (OPENFILE_READ_ONLY | OPENFILE_GRACEFUL)); + + of = open_file_free(of); + r = open_file_parse("/proc/1/ns/mnt:host-mount-namespace:read-only:other", &of); + + assert_se(r == -EINVAL); +} + 
+/* Checks the string produced by open_file_to_string() for one OpenFile object
+ * while its flags, fdname and path are varied. Per the expected strings below:
+ * fields are joined with ':', multiple flags are joined with ',', the fdname
+ * field is left empty when fdname is "mnt" for a path ending in "/mnt", and
+ * colons inside the path are emitted as "\:". */
+TEST(open_file_to_string) {
+        _cleanup_free_ char *s = NULL;
+        _cleanup_(open_file_freep) OpenFile *of = NULL;
+        int r;
+
+        assert_se(of = new (OpenFile, 1));
+        *of = (OpenFile){ .path = strdup("/proc/1/ns/mnt"),
+                          .fdname = strdup("host-mount-namespace"),
+                          .flags = OPENFILE_READ_ONLY };
+        /* strdup() can return NULL; check it like the allocation of 'of' itself
+         * so an allocation failure is reported here and not further down. */
+        assert_se(of->path);
+        assert_se(of->fdname);
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt:host-mount-namespace:read-only"));
+
+        s = mfree(s);
+        of->flags = OPENFILE_APPEND;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt:host-mount-namespace:append"));
+
+        s = mfree(s);
+        of->flags = OPENFILE_TRUNCATE;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt:host-mount-namespace:truncate"));
+
+        s = mfree(s);
+        of->flags = OPENFILE_GRACEFUL;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt:host-mount-namespace:graceful"));
+
+        s = mfree(s);
+        of->flags = OPENFILE_READ_ONLY | OPENFILE_GRACEFUL;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt:host-mount-namespace:read-only,graceful"));
+
+        s = mfree(s);
+        of->flags = 0;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt:host-mount-namespace"));
+
+        s = mfree(s);
+        /* NOTE(review): assumes free_and_strdup() reports failure as a negative
+         * errno and success as a non-negative value - confirm against
+         * string-util.h. A bare truthiness check would let a negative error
+         * code pass the assertion. */
+        assert_se(free_and_strdup(&of->fdname, "mnt") >= 0);
+        of->flags = OPENFILE_READ_ONLY;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/proc/1/ns/mnt::read-only"));
+
+        s = mfree(s);
+        assert_se(free_and_strdup(&of->path, "/path:with:colon") >= 0);
+        assert_se(free_and_strdup(&of->fdname, "path:with:colon") >= 0);
+        of->flags = 0;
+
+        r = open_file_to_string(of, &s);
+
+        assert_se(r >= 0);
+        assert_se(streq(s, "/path\\:with\\:colon"));
+}
+
+DEFINE_TEST_MAIN(LOG_INFO);
diff --git a/src/test/test-openssl.c b/src/test/test-openssl.c
new file mode 100644
index 0000000..dfdd1ab
--- /dev/null
+++ b/src/test/test-openssl.c
@@ -0,0 +1,483 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "hexdecoct.h"
+#include "openssl-util.h"
+#include "tests.h"
+/* Parse hex-encoded PEM public keys ("-----BEGIN PUBLIC KEY-----") and compare the extracted parameters. */
+TEST(openssl_pkey_from_pem) {
+        DEFINE_HEX_PTR(key_ecc, "2d2d2d2d2d424547494e205055424c4943204b45592d2d2d2d2d0a4d466b77457759484b6f5a497a6a3043415159494b6f5a497a6a30444151634451674145726a6e4575424c73496c3972687068777976584e50686a346a426e500a44586e794a304b395579724e6764365335413532542b6f5376746b436a365a726c34685847337741515558706f426c532b7448717452714c35513d3d0a2d2d2d2d2d454e44205055424c4943204b45592d2d2d2d2d0a");
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey_ecc = NULL;
+        assert_se(openssl_pkey_from_pem(key_ecc, key_ecc_len, &pkey_ecc) >= 0);
+
+        _cleanup_free_ void *x = NULL, *y = NULL;
+        size_t x_len, y_len;
+        int curve_id;
+        assert_se(ecc_pkey_to_curve_x_y(pkey_ecc, &curve_id, &x, &x_len, &y, &y_len) >= 0);
+        assert_se(curve_id == NID_X9_62_prime256v1);
+
+        DEFINE_HEX_PTR(expected_x, "ae39c4b812ec225f6b869870caf5cd3e18f88c19cf0d79f22742bd532acd81de");
+        assert_se(memcmp_nn(x, x_len, expected_x, expected_x_len) == 0);
+
+        DEFINE_HEX_PTR(expected_y, "92e40e764fea12bed9028fa66b9788571b7c004145e9a01952fad1eab51a8be5");
+        assert_se(memcmp_nn(y, y_len, expected_y, expected_y_len) == 0);
+        /* Same check for an RSA key: the extracted modulus n and exponent e must match the known values. */
+        DEFINE_HEX_PTR(key_rsa, "2d2d2d2d2d424547494e205055424c4943204b45592d2d2d2d2d0a4d494942496a414e42676b71686b6947397730424151454641414f43415138414d49494243674b4341514541795639434950652f505852337a436f63787045300a6a575262546c3568585844436b472f584b79374b6d2f4439584942334b734f5a31436a5937375571372f674359363170697838697552756a73413464503165380a593445336c68556d374a332b6473766b626f4b64553243626d52494c2f6675627771694c4d587a41673342575278747234547545443533527a373634554650640a307a70304b68775231496230444c67772f344e67566f314146763378784b4d6478774d45683567676b73733038326332706c354a504e32587677426f744e6b4d0a5471526c745a4a35355244436170696e7153334577376675646c4e735851357746766c7432377a7637344b585165616d704c59433037584f6761304c676c536b0a79754774586b6a50542f735542544a705374615769674d5a6f714b7479563463515a58436b4a52684459614c47587673504233687a766d5671636e6b47654e540a65774944415141420a2d2d2d2d2d454e44205055424c4943204b45592d2d2d2d2d0a");
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey_rsa = NULL;
+        assert_se(openssl_pkey_from_pem(key_rsa, key_rsa_len, &pkey_rsa) >= 0);
+
+        _cleanup_free_ void *n = NULL, *e = NULL;
+        size_t n_len, e_len;
+        assert_se(rsa_pkey_to_n_e(pkey_rsa, &n, &n_len, &e, &e_len) >= 0);
+
+        DEFINE_HEX_PTR(expected_n, "c95f4220f7bf3d7477cc2a1cc691348d645b4e5e615d70c2906fd72b2eca9bf0fd5c80772ac399d428d8efb52aeff80263ad698b1f22b91ba3b00e1d3f57bc638137961526ec9dfe76cbe46e829d53609b99120bfdfb9bc2a88b317cc0837056471b6be13b840f9dd1cfbeb85053ddd33a742a1c11d486f40cb830ff8360568d4016fdf1c4a31dc7030487982092cb34f36736a65e493cdd97bf0068b4d90c4ea465b59279e510c26a98a7a92dc4c3b7ee76536c5d0e7016f96ddbbcefef829741e6a6a4b602d3b5ce81ad0b8254a4cae1ad5e48cf4ffb140532694ad6968a0319a2a2adc95e1c4195c29094610d868b197bec3c1de1cef995a9c9e419e3537b");
+        assert_se(memcmp_nn(n, n_len, expected_n, expected_n_len) == 0);
+
+        DEFINE_HEX_PTR(expected_e, "010001");
+        assert_se(memcmp_nn(e, e_len, expected_e, expected_e_len) == 0);
+}
+/* Build an RSA key from raw n/e, verify a known signature with it, then export n/e again and compare. */
+TEST(rsa_pkey_n_e) {
+        DEFINE_HEX_PTR(n, "e3975a2124a7c9fe57752d106314ff62f6da731632eac221f1c0255bdcf2a34eeb21e3ab89ba8759ddad3b68be99463c7f03f3d004028a35e6f7c6596aeab2558d490f1e1c38aed2ff796bda8d6d55704eefb6ac55842dd6e606bb707f66acc02f0db2aed0dabab885bd0c850f1bdc8ac4b6bc1f74858db8ca2ab57a3d4217c091e9cd78727a2e36b8126ea629e81fecc69b0bea601000a6c0b749c5be16f53f4fa9f208a581d804234eb6526ba3fee9822d58d1ab9cac2761d7f630eb7ad6054dff0856d41aea219e1adfd87256aa1532202a070f4b1044e718d1f38bbc5a4b1fcb024f04afaafda5edeacfdf0d0bdf35c359acd059e3edb5024e588458f9b5");
+        uint32_t e = htobe32(0x10001);
+
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL;
+        assert_se(rsa_pkey_from_n_e(n, n_len, &e, sizeof(e), &pkey) >= 0);
+
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new((EVP_PKEY*) pkey, NULL);
+        assert_se(ctx);
+        assert_se(EVP_PKEY_verify_init(ctx) == 1);
+
+        const char *msg = "this is a secret";
+        DEFINE_HEX_PTR(sig, "14b53e0c6ad99a350c3d7811e8160f4ae03ad159815bb91bddb9735b833588df2eac221fbd3fc4ece0dd63bfaeddfdaf4ae67021e759f3638bc194836413414f54e8c4d01c9c37fa4488ea2ef772276b8a33822a53c97b1c35acfb4bc621cfb8fad88f0cf7d5491f05236886afbf9ed47f9469536482f50f74a20defa59d99676bed62a17b5eb98641df5a2f8080fa4b24f2749cc152fa65ba34c14022fcb27f1b36f52021950d7b9b6c3042c50b84cfb7d55a5f9235bfd58e1bf1f604eb93416c5fb5fd90cb68f1270dfa9daf67f52c604f62c2f2beee5e7e672b0e6e9833dd43dba99b77668540c850c9a81a5ea7aaf6297383e6135bd64572362333121fc7");
+        assert_se(EVP_PKEY_verify(ctx, sig, sig_len, (unsigned char*) msg, strlen(msg)) == 1);
+
+        DEFINE_HEX_PTR(invalid_sig, "1234");
+        assert_se(EVP_PKEY_verify(ctx, invalid_sig, invalid_sig_len, (unsigned char*) msg, strlen(msg)) != 1);
+        /* e2 may be shorter than 4 bytes, so it is compared against the trailing e2_size bytes of the big-endian 'e'. */
+        _cleanup_free_ void *n2 = NULL, *e2 = NULL;
+        size_t n2_size, e2_size;
+        assert_se(rsa_pkey_to_n_e(pkey, &n2, &n2_size, &e2, &e2_size) >= 0);
+        assert_se(memcmp_nn(n, n_len, n2, n2_size) == 0);
+        assert_se(e2_size <= sizeof(uint32_t));
+        assert_se(memcmp(&((uint8_t*) &e)[sizeof(uint32_t) - e2_size], e2, e2_size) == 0);
+}
+
+TEST(ecc_pkey_curve_x_y) { + int curveid = NID_X9_62_prime256v1; + DEFINE_HEX_PTR(x, "2830d2c8f65d3efbef12303b968b91692f8bd04045dcb8a9656374e4ae61d818"); + DEFINE_HEX_PTR(y, "8a80750f76729defdcc2a4bc1a91c22e60109dd6e1ffde634a650a20bab172e9"); + + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + assert_se(ecc_pkey_from_curve_x_y(curveid, x, x_len, y, y_len, &pkey) >= 0); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new((EVP_PKEY*) pkey, NULL); + assert_se(ctx); + assert_se(EVP_PKEY_verify_init(ctx) == 1); + + const char *msg = "this is a secret"; + DEFINE_HEX_PTR(sig, "3045022100f6ca10f7ed57a020679899b26dd5ac5a1079265885e2a6477f527b6a3f02b5ca02207b550eb3e7b69360aff977f7f6afac99c3f28266b6c5338ce373f6b59263000a"); + assert_se(EVP_PKEY_verify(ctx, sig, sig_len, (unsigned char*) msg, strlen(msg)) == 1); + + DEFINE_HEX_PTR(invalid_sig, "1234"); + assert_se(EVP_PKEY_verify(ctx, invalid_sig, invalid_sig_len, (unsigned char*) msg, strlen(msg)) != 1); + + _cleanup_free_ void *x2 = NULL, *y2 = NULL; + size_t x2_size, y2_size; + int curveid2; + assert_se(ecc_pkey_to_curve_x_y(pkey, &curveid2, &x2, &x2_size, &y2, &y2_size) >= 0); + assert_se(curveid == curveid2); + assert_se(memcmp_nn(x, x_len, x2, x2_size) == 0); + assert_se(memcmp_nn(y, y_len, y2, y2_size) == 0); +} + +TEST(invalid) { + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + + DEFINE_HEX_PTR(key, "2d2d2d2d2d424547494e205055424c4943204b45592d2d2d2d2d0a4d466b7b"); + assert_se(openssl_pkey_from_pem(key, key_len, &pkey) == -EIO); + assert_se(pkey == NULL); +} + +static const struct { + const char *alg; + size_t size; +} digest_size_table[] = { + /* SHA1 "family" */ + { "sha1", 20, }, +#if OPENSSL_VERSION_MAJOR >= 3 + { "sha-1", 20, }, +#endif + /* SHA2 family */ + { "sha224", 28, }, + { "sha256", 32, }, + { "sha384", 48, }, + { "sha512", 64, }, +#if OPENSSL_VERSION_MAJOR >= 3 + { "sha-224", 28, }, + { "sha2-224", 28, }, + { "sha-256", 32, }, + { "sha2-256", 32, }, + { "sha-384", 48, }, + { 
"sha2-384", 48, }, + { "sha-512", 64, }, + { "sha2-512", 64, }, +#endif + /* SHA3 family */ + { "sha3-224", 28, }, + { "sha3-256", 32, }, + { "sha3-384", 48, }, + { "sha3-512", 64, }, + /* SM3 family */ + { "sm3", 32, }, + /* MD5 family */ + { "md5", 16, }, +}; + +TEST(digest_size) { + size_t size; + + FOREACH_ARRAY(t, digest_size_table, ELEMENTSOF(digest_size_table)) { + assert(openssl_digest_size(t->alg, &size) >= 0); + assert_se(size == t->size); + + _cleanup_free_ char *uppercase_alg = strdup(t->alg); + assert_se(uppercase_alg); + assert_se(openssl_digest_size(ascii_strupper(uppercase_alg), &size) >= 0); + assert_se(size == t->size); + } + + assert_se(openssl_digest_size("invalid.alg", &size) == -EOPNOTSUPP); +} + +static void verify_digest(const char *digest_alg, const struct iovec *data, size_t n_data, const char *expect) { + _cleanup_free_ void *digest = NULL; + size_t digest_size; + int r; + + r = openssl_digest_many(digest_alg, data, n_data, &digest, &digest_size); + if (r == -EOPNOTSUPP) + return; + assert_se(r >= 0); + + DEFINE_HEX_PTR(e, expect); + assert_se(memcmp_nn(e, e_len, digest, digest_size) == 0); +} + +#define _DEFINE_DIGEST_TEST(uniq, alg, expect, ...) \ + const struct iovec UNIQ_T(i, uniq)[] = { __VA_ARGS__ }; \ + verify_digest(alg, \ + UNIQ_T(i, uniq), \ + ELEMENTSOF(UNIQ_T(i, uniq)), \ + expect); +#define DEFINE_DIGEST_TEST(alg, expect, ...) _DEFINE_DIGEST_TEST(UNIQ, alg, expect, __VA_ARGS__) +#define DEFINE_SHA1_TEST(expect, ...) DEFINE_DIGEST_TEST("SHA1", expect, __VA_ARGS__) +#define DEFINE_SHA256_TEST(expect, ...) DEFINE_DIGEST_TEST("SHA256", expect, __VA_ARGS__) +#define DEFINE_SHA384_TEST(expect, ...) DEFINE_DIGEST_TEST("SHA384", expect, __VA_ARGS__) +#define DEFINE_SHA512_TEST(expect, ...) 
DEFINE_DIGEST_TEST("SHA512", expect, __VA_ARGS__) + +TEST(digest_many) { + const struct iovec test = IOVEC_MAKE_STRING("test"); + + /* Empty digests */ + DEFINE_SHA1_TEST("da39a3ee5e6b4b0d3255bfef95601890afd80709"); + DEFINE_SHA256_TEST("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); + DEFINE_SHA384_TEST("38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b"); + DEFINE_SHA512_TEST("cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"); + + DEFINE_SHA1_TEST("a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", test); + DEFINE_SHA256_TEST("9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08", test); + DEFINE_SHA384_TEST("768412320f7b0aa5812fce428dc4706b3cae50e02a64caa16a782249bfe8efc4b7ef1ccb126255d196047dfedf17a0a9", test); + DEFINE_SHA512_TEST("ee26b0dd4af7e749aa1a8ee3c10ae9923f618980772e473f8819a5d4940e0db27ac185f8a0e1d5f84f88bc887fd67b143732c304cc5fa9ad8e6f57f50028a8ff", test); + + DEFINE_HEX_PTR(h1, "e9ff2b6dfbc03b8dd0471a0f23840334e3ef51c64a325945524563c0375284a092751eca8d084fae22f74a104559a0ee8339d1845538481e674e6d31d4f63089"); + DEFINE_HEX_PTR(h2, "5b6e809933a1b8d5a4a6bb62e20b36ae82d9408141e7479d0aa067273bd2d04007fb1977bad549d54330a49ed98f82b495ba"); + DEFINE_HEX_PTR(h3, "d2aeef94d7ba2a"); + DEFINE_HEX_PTR(h4, "1557db45ded3e38c79b5bb25c83ade42fa7d13047ef1b9a0b21a3c2ab2d4eee5c75e2927ce643163addbda65331035850a436c0acffc723f419e1d1cbf04c9064e6d850580c0732a12600f9feb"); + + const struct iovec i1 = IOVEC_MAKE(h1, h1_len); + const struct iovec i2 = IOVEC_MAKE(h2, h2_len); + const struct iovec i3 = IOVEC_MAKE(h3, h3_len); + const struct iovec i4 = IOVEC_MAKE(h4, h4_len); + + DEFINE_SHA1_TEST("8e7c659a6331508b06adf98b430759dafb92fc43", i1, i2, i3, i4); + DEFINE_SHA256_TEST("4d6be38798786a5500651c1a02d96aa010e9d7b2bece1695294cd396d456cde8", i1, i2, i3, i4); + 
DEFINE_SHA384_TEST("82e6ec14f8d90f1ae1fd4fb7f415ea6fdb674515b13092e3e548a8d37a8faed30cda8ea613ec2a015a51bc578dacc995", i1, i2, i3, i4); + DEFINE_SHA512_TEST("21fe5beb15927257a9143ff59010e51d4c65c7c5237b0cd9a8db3c3fabe429be3a0759f9ace3cdd70f6ea543f998bec9bc3308833d70aa1bd380364de872a62c", i1, i2, i3, i4); + + DEFINE_SHA256_TEST("0e0ed67d6717dc08dd6f472f6c35107a92b8c2695dcba344b884436f97a9eb4d", i1, i1, i1, i4); + + DEFINE_SHA256_TEST("8fe8b8d1899c44bfb82e1edc4ff92642db5b2cb25c4210ea06c3846c757525a8", i1, i1, i1, i4, i4, i4, i4, i3, i3, i2); +} + +static void verify_hmac( + const char *digest_alg, + const char *key, + const struct iovec *data, + size_t n_data, + const char *expect) { + + DEFINE_HEX_PTR(k, key); + DEFINE_HEX_PTR(e, expect); + _cleanup_free_ void *digest = NULL; + size_t digest_size; + + if (n_data == 0) { + assert_se(openssl_hmac(digest_alg, k, k_len, NULL, 0, &digest, &digest_size) == 0); + assert_se(memcmp_nn(e, e_len, digest, digest_size) == 0); + digest = mfree(digest); + } else if(n_data == 1) { + assert_se(openssl_hmac(digest_alg, k, k_len, data[0].iov_base, data[0].iov_len, &digest, &digest_size) == 0); + assert_se(memcmp_nn(e, e_len, digest, digest_size) == 0); + digest = mfree(digest); + } + + assert_se(openssl_hmac_many(digest_alg, k, k_len, data, n_data, &digest, &digest_size) == 0); + assert_se(memcmp_nn(e, e_len, digest, digest_size) == 0); +} + +#define _DEFINE_HMAC_TEST(uniq, alg, key, expect, ...) \ + const struct iovec UNIQ_T(i, uniq)[] = { __VA_ARGS__ }; \ + verify_hmac(alg, \ + key, \ + UNIQ_T(i, uniq), \ + ELEMENTSOF(UNIQ_T(i, uniq)), \ + expect); +#define DEFINE_HMAC_TEST(alg, key, expect, ...) _DEFINE_HMAC_TEST(UNIQ, alg, key, expect, __VA_ARGS__) +#define DEFINE_HMAC_SHA1_TEST(key, expect, ...) DEFINE_HMAC_TEST("SHA1", key, expect, __VA_ARGS__) +#define DEFINE_HMAC_SHA256_TEST(key, expect, ...) DEFINE_HMAC_TEST("SHA256", key, expect, __VA_ARGS__) +#define DEFINE_HMAC_SHA384_TEST(key, expect, ...) 
DEFINE_HMAC_TEST("SHA384", key, expect, __VA_ARGS__) +#define DEFINE_HMAC_SHA512_TEST(key, expect, ...) DEFINE_HMAC_TEST("SHA512", key, expect, __VA_ARGS__) + +TEST(hmac_many) { + const char *key1 = "760eb6845073862c1914c6d188bf8214", + *key2 = "0628d1a5f83fce99779e12e2336d87046d42d74b755f00d9f72350668860fd00", + *key3 = "b61158912b76348c54f104629924be4178b8a9c9459c3a6e9daa1885445a61fccc1aa0f749c31f3ade4e227f64dd0e86a94b25c2e181f044af22d0a8c07074c3"; + const struct iovec test = IOVEC_MAKE_STRING("test"); + + /* Empty digests */ + DEFINE_HMAC_SHA1_TEST(key1, "EB9725FC9A99A652C3171E0863984AC42461F88B"); + DEFINE_HMAC_SHA256_TEST(key1, "82A15D4DD5F583CF8F06D3E447DF0FDFF95A24E29229934B48BD0A5B4E0ADC85"); + DEFINE_HMAC_SHA384_TEST(key1, "C60F15C4E18736750D91095ADA148C4179825A487CCA3AE047A2FB94F85A5587AB6AF57678AA79715FEF848129C108C3"); + DEFINE_HMAC_SHA512_TEST(key1, "2B10DC9BFC0349400F8965482EA149C1C51C865BB7B16097623F41C14CF6C8A678724BFAE0CE842EED899C12CC17B5D8C4287F72BE788532FE7CF0BE2EBCD447"); + + DEFINE_HMAC_SHA1_TEST(key2, "F9AA74F129681E91807EB264EA6E1B5C5F9B4CFD"); + DEFINE_HMAC_SHA256_TEST(key2, "B4ADEBF8B3044A5B0668B742C0A49B61D8380F89938C84794C92567F5A33CC7D"); + DEFINE_HMAC_SHA384_TEST(key2, "E5EACAB7A13CF5BE60FA228D771E183CD6E57536BB9EAFC34A6BB52B1B1324BD6FB8A1713F91EC040790AE97F5672D53"); + DEFINE_HMAC_SHA512_TEST(key2, "75A597D83A6270FC3204DE741E76DEFCF42D3E1812C71E41EEA8C0F23C07315822E83BE8B54705CB00FEF4CE1BAF80E3975414925C83BF3719CEBC27DD133F7D"); + + DEFINE_HMAC_SHA1_TEST(key3, "4B8EACB3C3935ACC8C58995C89F16020FC993569"); + DEFINE_HMAC_SHA256_TEST(key3, "520E8C0323A1994D58EF5456611BCB6CD701399B24F8FBA0B5A3CD3186780E8E"); + DEFINE_HMAC_SHA384_TEST(key3, "52ADAF691EFDC377B7349EAA45EE1BFAFA27CAC1FFE08B942C80426D1CA9F3464E3A71D611DA0B415435E82D6EE9F34A"); + DEFINE_HMAC_SHA512_TEST(key3, "22D8C17BAF591E07CD2BD58A1B3D76D5904EC45C9099F0171A243F07611E25208A395833BC3F9BBD425636FD8D574BE1A1A367DCB6C40AD3C06E2B57E8FD2729"); + + /* test message */ + 
DEFINE_HMAC_SHA1_TEST(key2, "DEE6313BE6391523D0B2B326890F13A65F3965B2", test); + DEFINE_HMAC_SHA256_TEST(key2, "496FF3E9DA52B2B490CD5EAE23457F8A33E61AB7B42F6E6374B7629CFBE1FCED", test); + DEFINE_HMAC_SHA384_TEST(key2, "F5223F750D671453CA6159C1354242DB13E0189CB79AC73E4964F623181B00C811A596F7CE3408DDE06B96C6D792F41E", test); + DEFINE_HMAC_SHA512_TEST(key2, "8755A8B0D85D89AFFE7A15702BBA0F835CDE454334EC952ED777A30035D6BD9407EA5DF8DCB89814C1DF7EE215022EA68D9D2BC4E4B299CD6F55CD60C269A706", test); + + DEFINE_HEX_PTR(h1, "e9ff2b6dfbc03b8dd0471a0f23840334e3ef51c64a325945524563c0375284a092751eca8d084fae22f74a104559a0ee8339d1845538481e674e6d31d4f63089"); + DEFINE_HEX_PTR(h2, "5b6e809933a1b8d5a4a6bb62e20b36ae82d9408141e7479d0aa067273bd2d04007fb1977bad549d54330a49ed98f82b495ba"); + DEFINE_HEX_PTR(h3, "d2aeef94d7ba2a"); + DEFINE_HEX_PTR(h4, "1557db45ded3e38c79b5bb25c83ade42fa7d13047ef1b9a0b21a3c2ab2d4eee5c75e2927ce643163addbda65331035850a436c0acffc723f419e1d1cbf04c9064e6d850580c0732a12600f9feb"); + + const struct iovec i1 = IOVEC_MAKE(h1, h1_len); + const struct iovec i2 = IOVEC_MAKE(h2, h2_len); + const struct iovec i3 = IOVEC_MAKE(h3, h3_len); + const struct iovec i4 = IOVEC_MAKE(h4, h4_len); + + DEFINE_HMAC_SHA1_TEST(key2, "28C041532012BFF1B7C87B2A15A8C43EB8037D27", i1, i2, i3, i4); + DEFINE_HMAC_SHA256_TEST(key2, "F8A1FBDEE3CD383EA2B4940A3C8E72F443DB5B247016C9F84E2D2FEF3C5A0A23", i1, i2, i3, i4); + DEFINE_HMAC_SHA384_TEST(key2, "4D2AB0516F1F5C73BD0761407E0AF42361C1CAE761685FC65D1199598315EE3DCA4DB88E4D96FB06C2DA215A33FA9CE9", i1, i2, i3, i4); + DEFINE_HMAC_SHA512_TEST(key2, "E9BF8FC6FDE75FD5E4EF2DF399EE675C57B60C59A7B331F30535FDE68D8072185552E9A8BFA2008C52437F1BCC1472D16FBCF2A77C37339752938E42D2642150", i1, i2, i3, i4); + + DEFINE_HMAC_SHA256_TEST(key3, "94D4E4B55368A533F6A7FDCC3B93E1F283BB1CA387BB5D14FAFF44A009EDF040", i1, i1, i1, i4); + + DEFINE_HMAC_SHA256_TEST(key3, "5BE1F4D9C2AFAA2BB3F58FCE967BC7D3084BB8F512659875BDA634991145B0F0", i1, i1, i1, i4, i4, i4, i4, i3, i3, 
i2); +} + +TEST(kdf_kb_hmac_derive) { +#if OPENSSL_VERSION_MAJOR >= 3 + _cleanup_free_ void *derived_key = NULL; + + DEFINE_HEX_PTR(key, "d7ac57124f28371eacaec475b74869d26b4cd64586412a607ce0a9e0c63d468c"); + const char *salt = "salty chocolate"; + DEFINE_HEX_PTR(info, "6721a2012d9554f5a64593ed3eaa8fe15e6a21e1c8c8736ea4d234eb55b9e31a"); + DEFINE_HEX_PTR(expected_derived_key, "A9DA9CEEB9578DBE7DD2862F82898B086E85FF2D10C4E8EC5BD99D0D7F003A2DE1574EB4BD789C03EF5235259BCB3A009DA303EA4DB4CA6BF507DB7C5A063279"); + + assert_se(kdf_kb_hmac_derive("COUNTER", "SHA256", key, key_len, salt, strlen(salt), info, info_len, /* seed= */ NULL, /* seed_size= */ 0, 64, &derived_key) >= 0); + assert_se(memcmp_nn(derived_key, 64, expected_derived_key, expected_derived_key_len) == 0); +#else + log_tests_skipped("KDF-KB requires OpenSSL >= 3"); +#endif +} + +#if OPENSSL_VERSION_MAJOR >= 3 +static void check_ss_derive(const char *hex_key, const char *hex_salt, const char *hex_info, const char *hex_expected) { + DEFINE_HEX_PTR(key, hex_key); + DEFINE_HEX_PTR(salt, hex_salt); + DEFINE_HEX_PTR(info, hex_info); + DEFINE_HEX_PTR(expected, hex_expected); + + _cleanup_free_ void *derived_key = NULL; + assert_se(kdf_ss_derive("SHA256", key, key_len, salt, salt_len, info, info_len, expected_len, &derived_key) >= 0); + assert_se(memcmp_nn(derived_key, expected_len, expected, expected_len) == 0); +} +#endif + +TEST(kdf_ss_derive) { +#if OPENSSL_VERSION_MAJOR >= 3 + check_ss_derive( + "01166ad6b05d1fad8cdb50d1902170e9", + "feea805789dc8d0b57da5d4d61886b1a", + "af4cb6d1d0a996e21e3788584165e2ae", + "46CECAB4544E11EF986641BA6F843FAFFD111D3974C34E3B9592311E8579C6BD"); + + check_ss_derive( + "d1c39e37260d79d6e766f1d1412c4b61fc0801db469b97c897b0fbcaebea5178", + "b75e3b65d1bb845dee581c7e14cfebc6e882946e90273b77ebe289faaf7de248", + "ed25a0043d6c1eb28296da1f9ab138dafee18f4c937bfc43601d4ee6e7634199", + "30EB1A1E9DEA7DE4DDB8F3FDF50A01E3"); + /* Same inputs as above, but derive more bytes */ + check_ss_derive( + 
"d1c39e37260d79d6e766f1d1412c4b61fc0801db469b97c897b0fbcaebea5178", + "b75e3b65d1bb845dee581c7e14cfebc6e882946e90273b77ebe289faaf7de248", + "ed25a0043d6c1eb28296da1f9ab138dafee18f4c937bfc43601d4ee6e7634199", + "30EB1A1E9DEA7DE4DDB8F3FDF50A01E30581D606C1228D98AFF691DF743AC2EE9D99EFD2AE1946C079AA18C9524877FA65D5065F0DAED058AB3416AF80EB2B73"); +#else + log_tests_skipped("KDF-SS requires OpenSSL >= 3"); +#endif +} + +static void check_cipher( + const char *alg, + size_t bits, + const char *mode, + const char *hex_key, + const char *hex_iv, + const struct iovec data[], + size_t n_data, + const char *hex_expected) { + + _cleanup_free_ void *enc_buf = NULL; + size_t enc_buf_len; + + DEFINE_HEX_PTR(key, hex_key); + DEFINE_HEX_PTR(iv, hex_iv); + DEFINE_HEX_PTR(expected, hex_expected); + + if (n_data == 0) { + assert_se(openssl_cipher(alg, bits, mode, key, key_len, iv, iv_len, NULL, 0, &enc_buf, &enc_buf_len) >= 0); + assert_se(memcmp_nn(enc_buf, enc_buf_len, expected, expected_len) == 0); + enc_buf = mfree(enc_buf); + } else if (n_data == 1) { + assert_se(openssl_cipher(alg, bits, mode, key, key_len, iv, iv_len, data[0].iov_base, data[0].iov_len, &enc_buf, &enc_buf_len) >= 0); + assert_se(memcmp_nn(enc_buf, enc_buf_len, expected, expected_len) == 0); + enc_buf = mfree(enc_buf); + } + + assert_se(openssl_cipher_many(alg, bits, mode, key, key_len, iv, iv_len, data, n_data, &enc_buf, &enc_buf_len) >= 0); + assert_se(memcmp_nn(enc_buf, enc_buf_len, expected, expected_len) == 0); +} + +TEST(openssl_cipher) { + struct iovec data[] = { + IOVEC_MAKE_STRING("my"), + IOVEC_MAKE_STRING(" "), + IOVEC_MAKE_STRING("secret"), + IOVEC_MAKE_STRING(" "), + IOVEC_MAKE_STRING("text"), + IOVEC_MAKE_STRING("!"), + }; + + check_cipher( + "aes", 256, "cfb", + "32c62bbaeb0decc5c874b8e0148f86475b5bb10a36f7078a75a6f11704c2f06a", + /* hex_iv= */ NULL, + data, ELEMENTSOF(data), + "bd4a46f8762bf4bef4430514aaec5e"); + + check_cipher( + "aes", 256, "cfb", + 
"32c62bbaeb0decc5c874b8e0148f86475b5bb10a36f7078a75a6f11704c2f06a", + "00000000000000000000000000000000", + data, ELEMENTSOF(data), + "bd4a46f8762bf4bef4430514aaec5e"); + + check_cipher( + "aes", 256, "cfb", + "32c62bbaeb0decc5c874b8e0148f86475b5bb10a36f7078a75a6f11704c2f06a", + "9088fd5c4ad9b9419eced86283021a59", + data, ELEMENTSOF(data), + "6dfbf8dc972f9a462ad7427a1fa41a"); + + check_cipher( + "aes", 256, "cfb", + "32c62bbaeb0decc5c874b8e0148f86475b5bb10a36f7078a75a6f11704c2f06a", + /* hex_iv= */ NULL, + &data[2], 1, + "a35605f9763c"); + + check_cipher( + "aes", 256, "cfb", + "32c62bbaeb0decc5c874b8e0148f86475b5bb10a36f7078a75a6f11704c2f06a", + /* hex_iv= */ NULL, + /* data= */ NULL, /* n_data= */ 0, + /* expected= */ NULL); + + check_cipher( + "aes", 128, "cfb", + "b8fe4b89f6f25dd58cadceb68c99d508", + /* hex_iv= */ NULL, + data, ELEMENTSOF(data), + "9c0fe3abb904ab419d950ae00c93a1"); + + check_cipher( + "aes", 128, "cfb", + "b8fe4b89f6f25dd58cadceb68c99d508", + "00000000000000000000000000000000", + data, ELEMENTSOF(data), + "9c0fe3abb904ab419d950ae00c93a1"); + + check_cipher( + "aes", 128, "cfb", + "b8fe4b89f6f25dd58cadceb68c99d508", + "9088fd5c4ad9b9419eced86283021a59", + data, ELEMENTSOF(data), + "e765617aceb1326f5309008c14f4e1"); + + check_cipher( + "aes", 128, "cfb", + "b8fe4b89f6f25dd58cadceb68c99d508", + /* hex_iv= */ NULL, + /* data= */ NULL, /* n_data= */ 0, + /* expected= */ NULL); + + check_cipher( + "aes", 128, "cfb", + "b8fe4b89f6f25dd58cadceb68c99d508", + "00000000000000000000000000000000", + /* data= */ NULL, /* n_data= */ 0, + /* expected= */ NULL); +} + +TEST(ecc_ecdh) { + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkeyA = NULL, *pkeyB = NULL, *pkeyC = NULL; + _cleanup_free_ void *secretAB = NULL, *secretBA = NULL, *secretAC = NULL, *secretCA = NULL; + size_t secretAB_size, secretBA_size, secretAC_size, secretCA_size; + + assert_se(ecc_pkey_new(NID_X9_62_prime256v1, &pkeyA) >= 0); + assert_se(ecc_pkey_new(NID_X9_62_prime256v1, &pkeyB) >= 0); + 
assert_se(ecc_pkey_new(NID_X9_62_prime256v1, &pkeyC) >= 0); + + assert_se(ecc_ecdh(pkeyA, pkeyB, &secretAB, &secretAB_size) >= 0); + assert_se(ecc_ecdh(pkeyB, pkeyA, &secretBA, &secretBA_size) >= 0); + assert_se(ecc_ecdh(pkeyA, pkeyC, &secretAC, &secretAC_size) >= 0); + assert_se(ecc_ecdh(pkeyC, pkeyA, &secretCA, &secretCA_size) >= 0); + + assert_se(memcmp_nn(secretAB, secretAB_size, secretBA, secretBA_size) == 0); + assert_se(memcmp_nn(secretAC, secretAC_size, secretCA, secretCA_size) == 0); + assert_se(memcmp_nn(secretAC, secretAC_size, secretAB, secretAB_size) != 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-ordered-set.c b/src/test/test-ordered-set.c new file mode 100644 index 0000000..c055411 --- /dev/null +++ b/src/test/test-ordered-set.c @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "ordered-set.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +TEST(set_steal_first) { + _cleanup_ordered_set_free_ OrderedSet *m = NULL; + int seen[3] = {}; + char *val; + + m = ordered_set_new(&string_hash_ops); + assert_se(m); + + assert_se(ordered_set_put(m, (void*) "1") == 1); + assert_se(ordered_set_put(m, (void*) "22") == 1); + assert_se(ordered_set_put(m, (void*) "333") == 1); + + ordered_set_print(stdout, "SET=", m); + + while ((val = ordered_set_steal_first(m))) + seen[strlen(val) - 1]++; + + assert_se(seen[0] == 1 && seen[1] == 1 && seen[2] == 1); + + assert_se(ordered_set_isempty(m)); + + ordered_set_print(stdout, "SET=", m); +} + +typedef struct Item { + int seen; +} Item; +static void item_seen(Item *item) { + item->seen++; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(item_hash_ops, void, trivial_hash_func, trivial_compare_func, Item, item_seen); + +TEST(set_free_with_hash_ops) { + OrderedSet *m; + struct Item items[4] = {}; + + assert_se(m = ordered_set_new(&item_hash_ops)); + + for (size_t i = 0; i < ELEMENTSOF(items) - 1; i++) + assert_se(ordered_set_put(m, items + i) == 
1); + + for (size_t i = 0; i < ELEMENTSOF(items) - 1; i++) + assert_se(ordered_set_put(m, items + i) == 0); /* We get 0 here, because we use trivial hash + * ops. Also see below... */ + + m = ordered_set_free(m); + assert_se(items[0].seen == 1); + assert_se(items[1].seen == 1); + assert_se(items[2].seen == 1); + assert_se(items[3].seen == 0); +} + +TEST(set_put) { + _cleanup_ordered_set_free_ OrderedSet *m = NULL; + _cleanup_free_ char **t = NULL, *str = NULL; + + m = ordered_set_new(&string_hash_ops); + assert_se(m); + + assert_se(ordered_set_put(m, (void*) "1") == 1); + assert_se(ordered_set_put(m, (void*) "22") == 1); + assert_se(ordered_set_put(m, (void*) "333") == 1); + assert_se(ordered_set_put(m, (void*) "333") == 0); + assert_se(ordered_set_remove(m, (void*) "333")); + assert_se(ordered_set_put(m, (void*) "333") == 1); + assert_se(ordered_set_put(m, (void*) "333") == 0); + assert_se(ordered_set_put(m, (void*) "22") == 0); + + assert_se(str = strdup("333")); + assert_se(ordered_set_put(m, str) == -EEXIST); /* ... and we get -EEXIST here, because we use + * non-trivial hash ops. 
*/ + + assert_se(t = ordered_set_get_strv(m)); + assert_se(streq(t[0], "1")); + assert_se(streq(t[1], "22")); + assert_se(streq(t[2], "333")); + assert_se(!t[3]); + + ordered_set_print(stdout, "FOO=", m); +} + +TEST(set_put_string_set) { + _cleanup_ordered_set_free_ OrderedSet *m = NULL, *q = NULL; + _cleanup_free_ char **final = NULL; /* "just free" because the strings are in the set */ + + assert_se(ordered_set_put_strdup(&m, "1") == 1); + assert_se(ordered_set_put_strdup(&m, "22") == 1); + assert_se(ordered_set_put_strdup(&m, "333") == 1); + + assert_se(ordered_set_put_strdup(&q, "11") == 1); + assert_se(ordered_set_put_strdup(&q, "22") == 1); + assert_se(ordered_set_put_strdup(&q, "33") == 1); + + assert_se(ordered_set_put_string_set(&m, q) == 2); + + assert_se(final = ordered_set_get_strv(m)); + assert_se(strv_equal(final, STRV_MAKE("1", "22", "333", "11", "33"))); + + ordered_set_print(stdout, "BAR=", m); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-os-util.c b/src/test/test-os-util.c new file mode 100644 index 0000000..84e55e1 --- /dev/null +++ b/src/test/test-os-util.c @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include "fileio.h" +#include "fs-util.h" +#include "log.h" +#include "mkdir.h" +#include "os-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(path_is_os_tree) { + assert_se(path_is_os_tree("/") > 0); + assert_se(path_is_os_tree("/etc") == 0); + assert_se(path_is_os_tree("/idontexist") == -ENOENT); +} + +TEST(parse_os_release) { + /* Let's assume that we're running in a valid system, so os-release is available */ + _cleanup_free_ char *id = NULL, *id2 = NULL, *name = NULL, *foobar = NULL; + assert_se(parse_os_release(NULL, "ID", &id) == 0); + log_info("ID: %s", id); + + assert_se(setenv("SYSTEMD_OS_RELEASE", "/dev/null", 1) == 0); + assert_se(parse_os_release(NULL, "ID", &id2) == 0); + 
log_info("ID: %s", strnull(id2)); + + _cleanup_(unlink_tempfilep) char tmpfile[] = "/tmp/test-os-util.XXXXXX"; + assert_se(write_tmpfile(tmpfile, + "ID=the-id \n" + "NAME=the-name") == 0); + + assert_se(setenv("SYSTEMD_OS_RELEASE", tmpfile, 1) == 0); + assert_se(parse_os_release(NULL, "ID", &id, "NAME", &name) == 0); + log_info("ID: %s NAME: %s", id, name); + assert_se(streq(id, "the-id")); + assert_se(streq(name, "the-name")); + + _cleanup_(unlink_tempfilep) char tmpfile2[] = "/tmp/test-os-util.XXXXXX"; + assert_se(write_tmpfile(tmpfile2, + "ID=\"ignored\" \n" + "ID=\"the-id\" \n" + "NAME='the-name'") == 0); + + assert_se(setenv("SYSTEMD_OS_RELEASE", tmpfile2, 1) == 0); + assert_se(parse_os_release(NULL, "ID", &id, "NAME", &name) == 0); + log_info("ID: %s NAME: %s", id, name); + assert_se(streq(id, "the-id")); + assert_se(streq(name, "the-name")); + + assert_se(parse_os_release(NULL, "FOOBAR", &foobar) == 0); + log_info("FOOBAR: %s", strnull(foobar)); + assert_se(foobar == NULL); + + assert_se(unsetenv("SYSTEMD_OS_RELEASE") == 0); +} + +TEST(parse_extension_release) { + /* Let's assume that we have a valid extension image */ + _cleanup_free_ char *id = NULL, *version_id = NULL, *foobar = NULL, *a = NULL, *b = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tempdir = NULL; + + int r = mkdtemp_malloc("/tmp/test-os-util.XXXXXX", &tempdir); + if (r < 0) + log_error_errno(r, "Failed to setup working directory: %m"); + + assert_se(a = path_join(tempdir, "/usr/lib/extension-release.d/extension-release.test")); + assert_se(mkdir_parents(a, 0777) >= 0); + + r = write_string_file(a, "ID=the-id \n VERSION_ID=the-version-id", WRITE_STRING_FILE_CREATE); + if (r < 0) + log_error_errno(r, "Failed to write file: %m"); + + assert_se(parse_extension_release(tempdir, IMAGE_SYSEXT, "test", false, "ID", &id, "VERSION_ID", &version_id) == 0); + log_info("ID: %s VERSION_ID: %s", id, version_id); + assert_se(streq(id, "the-id")); + assert_se(streq(version_id, "the-version-id")); + + 
assert_se(b = path_join(tempdir, "/etc/extension-release.d/extension-release.tester")); + assert_se(mkdir_parents(b, 0777) >= 0); + + r = write_string_file(b, "ID=\"ignored\" \n ID=\"the-id\" \n VERSION_ID='the-version-id'", WRITE_STRING_FILE_CREATE); + if (r < 0) + log_error_errno(r, "Failed to write file: %m"); + + assert_se(parse_extension_release(tempdir, IMAGE_CONFEXT, "tester", false, "ID", &id, "VERSION_ID", &version_id) == 0); + log_info("ID: %s VERSION_ID: %s", id, version_id); + assert_se(streq(id, "the-id")); + assert_se(streq(version_id, "the-version-id")); + + assert_se(parse_extension_release(tempdir, IMAGE_CONFEXT, "tester", false, "FOOBAR", &foobar) == 0); + log_info("FOOBAR: %s", strnull(foobar)); + assert_se(foobar == NULL); + + assert_se(parse_extension_release(tempdir, IMAGE_SYSEXT, "test", false, "FOOBAR", &foobar) == 0); + log_info("FOOBAR: %s", strnull(foobar)); + assert_se(foobar == NULL); +} + +TEST(load_os_release_pairs) { + _cleanup_(unlink_tempfilep) char tmpfile[] = "/tmp/test-os-util.XXXXXX"; + assert_se(write_tmpfile(tmpfile, + "ID=\"ignored\" \n" + "ID=\"the-id\" \n" + "NAME='the-name'") == 0); + + assert_se(setenv("SYSTEMD_OS_RELEASE", tmpfile, 1) == 0); + + _cleanup_strv_free_ char **pairs = NULL; + assert_se(load_os_release_pairs(NULL, &pairs) == 0); + assert_se(strv_equal(pairs, STRV_MAKE("ID", "the-id", + "NAME", "the-name"))); + + assert_se(unsetenv("SYSTEMD_OS_RELEASE") == 0); +} + +TEST(os_release_support_ended) { + int r; + + assert_se(os_release_support_ended("1999-01-01", false, NULL) == true); + assert_se(os_release_support_ended("2037-12-31", false, NULL) == false); + assert_se(os_release_support_ended("-1-1-1", true, NULL) == -EINVAL); + + r = os_release_support_ended(NULL, false, NULL); + if (r < 0) + log_info_errno(r, "Failed to check host: %m"); + else + log_info_errno(r, "Host is supported: %s", yes_no(!r)); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-parse-argument.c 
b/src/test/test-parse-argument.c new file mode 100644 index 0000000..cf3d542 --- /dev/null +++ b/src/test/test-parse-argument.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "parse-argument.h" +#include "stdio-util.h" +#include "tests.h" + +TEST(parse_json_argument) { + JsonFormatFlags flags = JSON_FORMAT_PRETTY; + + assert_se(parse_json_argument("help", &flags) == 0); + assert_se(flags == JSON_FORMAT_PRETTY); + + assert_se(parse_json_argument("off", &flags) == 1); + assert_se(flags == JSON_FORMAT_OFF); +} + +TEST(parse_path_argument) { + _cleanup_free_ char *path = NULL; + + assert_se(parse_path_argument("help", false, &path) == 0); + assert_se(streq(basename(path), "help")); + + assert_se(parse_path_argument("/", false, &path) == 0); + assert_se(streq(path, "/")); + + assert_se(parse_path_argument("/", true, &path) == 0); + assert_se(path == NULL); +} + +TEST(parse_signal_argument) { + int signal = -1; + + assert_se(parse_signal_argument("help", &signal) == 0); + assert_se(signal == -1); + + assert_se(parse_signal_argument("list", &signal) == 0); + assert_se(signal == -1); + + assert_se(parse_signal_argument("SIGABRT", &signal) == 1); + assert_se(signal == SIGABRT); + + assert_se(parse_signal_argument("ABRT", &signal) == 1); + assert_se(signal == SIGABRT); + + char buf[DECIMAL_STR_MAX(int)]; + xsprintf(buf, "%d", SIGABRT); + assert_se(parse_signal_argument(buf, &signal) == 1); + assert_se(signal == SIGABRT); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-parse-helpers.c b/src/test/test-parse-helpers.c new file mode 100644 index 0000000..052e251 --- /dev/null +++ b/src/test/test-parse-helpers.c @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "macro.h" +#include "parse-helpers.h" +#include "tests.h" + +static void test_valid_item( + const char *str, + int expected_af, + int expected_ip_protocol, + uint16_t expected_nr_ports, + uint16_t 
expected_port_min) { + uint16_t nr_ports, port_min; + int af, ip_protocol; + + assert_se(parse_socket_bind_item(str, &af, &ip_protocol, &nr_ports, &port_min) >= 0); + assert_se(af == expected_af); + assert_se(ip_protocol == expected_ip_protocol); + assert_se(nr_ports == expected_nr_ports); + assert_se(port_min == expected_port_min); + + log_info("%s: \"%s\" ok", __func__, str); +} + +static void test_invalid_item(const char *str) { + uint16_t nr_ports, port_min; + int af, ip_protocol; + + assert_se(parse_socket_bind_item(str, &af, &ip_protocol, &nr_ports, &port_min) == -EINVAL); + + log_info("%s: \"%s\" ok", __func__, str); +} + +TEST(valid_items) { + test_valid_item("any", AF_UNSPEC, 0, 0, 0); + test_valid_item("ipv4", AF_INET, 0, 0, 0); + test_valid_item("ipv6", AF_INET6, 0, 0, 0); + test_valid_item("ipv4:any", AF_INET, 0, 0, 0); + test_valid_item("ipv6:any", AF_INET6, 0, 0, 0); + test_valid_item("tcp", AF_UNSPEC, IPPROTO_TCP, 0, 0); + test_valid_item("udp", AF_UNSPEC, IPPROTO_UDP, 0, 0); + test_valid_item("tcp:any", AF_UNSPEC, IPPROTO_TCP, 0, 0); + test_valid_item("udp:any", AF_UNSPEC, IPPROTO_UDP, 0, 0); + test_valid_item("6666", AF_UNSPEC, 0, 1, 6666); + test_valid_item("6666-6667", AF_UNSPEC, 0, 2, 6666); + test_valid_item("65535", AF_UNSPEC, 0, 1, 65535); + test_valid_item("1-65535", AF_UNSPEC, 0, 65535, 1); + test_valid_item("ipv4:tcp", AF_INET, IPPROTO_TCP, 0, 0); + test_valid_item("ipv4:udp", AF_INET, IPPROTO_UDP, 0, 0); + test_valid_item("ipv6:tcp", AF_INET6, IPPROTO_TCP, 0, 0); + test_valid_item("ipv6:udp", AF_INET6, IPPROTO_UDP, 0, 0); + test_valid_item("ipv4:6666", AF_INET, 0, 1, 6666); + test_valid_item("ipv6:6666", AF_INET6, 0, 1, 6666); + test_valid_item("tcp:6666", AF_UNSPEC, IPPROTO_TCP, 1, 6666); + test_valid_item("udp:6666", AF_UNSPEC, IPPROTO_UDP, 1, 6666); + test_valid_item("ipv4:tcp:6666", AF_INET, IPPROTO_TCP, 1, 6666); + test_valid_item("ipv6:tcp:6666", AF_INET6, IPPROTO_TCP, 1, 6666); + test_valid_item("ipv6:udp:6666-6667", AF_INET6, 
IPPROTO_UDP, 2, 6666); + test_valid_item("ipv6:tcp:any", AF_INET6, IPPROTO_TCP, 0, 0); +} + +TEST(invalid_items) { + test_invalid_item(""); + test_invalid_item(":"); + test_invalid_item("::"); + test_invalid_item("any:"); + test_invalid_item("meh"); + test_invalid_item("zupa:meh"); + test_invalid_item("zupa:meh:eh"); + test_invalid_item("ip"); + test_invalid_item("dccp"); + test_invalid_item("ipv6meh"); + test_invalid_item("ipv6::"); + test_invalid_item("ipv6:ipv6"); + test_invalid_item("ipv6:icmp"); + test_invalid_item("ipv6:tcp:0"); + test_invalid_item("65536"); + test_invalid_item("0-65535"); + test_invalid_item("ipv6:tcp:6666-6665"); + test_invalid_item("ipv6:tcp:6666-100000"); + test_invalid_item("ipv6::6666"); + test_invalid_item("ipv6:tcp:any:"); + test_invalid_item("ipv6:tcp:any:ipv6"); + test_invalid_item("ipv6:tcp:6666:zupa"); + test_invalid_item("ipv6:tcp:6666:any"); + test_invalid_item("ipv6:tcp:6666 zupa"); + test_invalid_item("ipv6:tcp:6666: zupa"); + test_invalid_item("ipv6:tcp:6666\n zupa"); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-parse-util.c b/src/test/test-parse-util.c new file mode 100644 index 0000000..58d22b6 --- /dev/null +++ b/src/test/test-parse-util.c @@ -0,0 +1,979 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <locale.h> +#include <math.h> +#include <sys/socket.h> + +#include "alloc-util.h" +#include "errno-list.h" +#include "log.h" +#include "parse-util.h" +#include "string-util.h" +#include "tests.h" + +TEST(parse_boolean) { + assert_se(parse_boolean("1") == 1); + assert_se(parse_boolean("y") == 1); + assert_se(parse_boolean("Y") == 1); + assert_se(parse_boolean("yes") == 1); + assert_se(parse_boolean("YES") == 1); + assert_se(parse_boolean("true") == 1); + assert_se(parse_boolean("TRUE") == 1); + assert_se(parse_boolean("on") == 1); + assert_se(parse_boolean("ON") == 1); + + assert_se(parse_boolean("0") == 0); + assert_se(parse_boolean("n") == 0); + assert_se(parse_boolean("N") == 0); + assert_se(parse_boolean("no") ==
0); + assert_se(parse_boolean("NO") == 0); + assert_se(parse_boolean("false") == 0); + assert_se(parse_boolean("FALSE") == 0); + assert_se(parse_boolean("off") == 0); + assert_se(parse_boolean("OFF") == 0); + + assert_se(parse_boolean("garbage") < 0); + assert_se(parse_boolean("") < 0); + assert_se(parse_boolean("full") < 0); +} + +TEST(parse_pid) { + int r; + pid_t pid; + + r = parse_pid("100", &pid); + assert_se(r == 0); + assert_se(pid == 100); + + r = parse_pid("0x7FFFFFFF", &pid); + assert_se(r == 0); + assert_se(pid == 2147483647); + + pid = 65; /* pid is left unchanged on ERANGE. Set to known arbitrary value. */ + r = parse_pid("0", &pid); + assert_se(r == -ERANGE); + assert_se(pid == 65); + + pid = 65; /* pid is left unchanged on ERANGE. Set to known arbitrary value. */ + r = parse_pid("-100", &pid); + assert_se(r == -ERANGE); + assert_se(pid == 65); + + pid = 65; /* pid is left unchanged on ERANGE. Set to known arbitrary value. */ + r = parse_pid("0xFFFFFFFFFFFFFFFFF", &pid); + assert_se(r == -ERANGE); + assert_se(pid == 65); + + r = parse_pid("junk", &pid); + assert_se(r == -EINVAL); + + r = parse_pid("", &pid); + assert_se(r == -EINVAL); +} + +TEST(parse_mode) { + mode_t m; + + assert_se(parse_mode("-1", &m) < 0); + assert_se(parse_mode("+1", &m) < 0); + assert_se(parse_mode("", &m) < 0); + assert_se(parse_mode("888", &m) < 0); + assert_se(parse_mode("77777", &m) < 0); + + assert_se(parse_mode("544", &m) >= 0 && m == 0544); + assert_se(parse_mode("0544", &m) >= 0 && m == 0544); + assert_se(parse_mode("00544", &m) >= 0 && m == 0544); + assert_se(parse_mode("777", &m) >= 0 && m == 0777); + assert_se(parse_mode("0777", &m) >= 0 && m == 0777); + assert_se(parse_mode("00777", &m) >= 0 && m == 0777); + assert_se(parse_mode("7777", &m) >= 0 && m == 07777); + assert_se(parse_mode("07777", &m) >= 0 && m == 07777); + assert_se(parse_mode("007777", &m) >= 0 && m == 07777); + assert_se(parse_mode("0", &m) >= 0 && m == 0); + assert_se(parse_mode(" 1", &m) >= 0 && m 
== 1); +} + +TEST(parse_size_iec) { + uint64_t bytes; + + assert_se(parse_size("", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("111", 1024, &bytes) == 0); + assert_se(bytes == 111); + + assert_se(parse_size("111.4", 1024, &bytes) == 0); + assert_se(bytes == 111); + + assert_se(parse_size(" 112 B", 1024, &bytes) == 0); + assert_se(bytes == 112); + + assert_se(parse_size(" 112.6 B", 1024, &bytes) == 0); + assert_se(bytes == 112); + + assert_se(parse_size("3.5 K", 1024, &bytes) == 0); + assert_se(bytes == 3*1024 + 512); + + assert_se(parse_size("3. K", 1024, &bytes) == 0); + assert_se(bytes == 3*1024); + + assert_se(parse_size("3.0 K", 1024, &bytes) == 0); + assert_se(bytes == 3*1024); + + assert_se(parse_size("3. 0 K", 1024, &bytes) == -EINVAL); + + assert_se(parse_size(" 4 M 11.5K", 1024, &bytes) == 0); + assert_se(bytes == 4*1024*1024 + 11 * 1024 + 512); + + assert_se(parse_size("3B3.5G", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("3.5G3B", 1024, &bytes) == 0); + assert_se(bytes == 3ULL*1024*1024*1024 + 512*1024*1024 + 3); + + assert_se(parse_size("3.5G 4B", 1024, &bytes) == 0); + assert_se(bytes == 3ULL*1024*1024*1024 + 512*1024*1024 + 4); + + assert_se(parse_size("3B3G4T", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("4T3G3B", 1024, &bytes) == 0); + assert_se(bytes == (4ULL*1024 + 3)*1024*1024*1024 + 3); + + assert_se(parse_size(" 4 T 3 G 3 B", 1024, &bytes) == 0); + assert_se(bytes == (4ULL*1024 + 3)*1024*1024*1024 + 3); + + assert_se(parse_size("12P", 1024, &bytes) == 0); + assert_se(bytes == 12ULL * 1024*1024*1024*1024*1024); + + assert_se(parse_size("12P12P", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("3E 2P", 1024, &bytes) == 0); + assert_se(bytes == (3 * 1024 + 2ULL) * 1024*1024*1024*1024*1024); + + assert_se(parse_size("12X", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("12.5X", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("12.5e3", 1024, &bytes) == -EINVAL); + + assert_se(parse_size("1024E", 1024, &bytes) == 
-ERANGE); + assert_se(parse_size("-1", 1024, &bytes) == -ERANGE); + assert_se(parse_size("-1024E", 1024, &bytes) == -ERANGE); + + assert_se(parse_size("-1024P", 1024, &bytes) == -ERANGE); + + assert_se(parse_size("-10B 20K", 1024, &bytes) == -ERANGE); +} + +TEST(parse_size_si) { + uint64_t bytes; + + assert_se(parse_size("", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("111", 1000, &bytes) == 0); + assert_se(bytes == 111); + + assert_se(parse_size("111.4", 1000, &bytes) == 0); + assert_se(bytes == 111); + + assert_se(parse_size(" 112 B", 1000, &bytes) == 0); + assert_se(bytes == 112); + + assert_se(parse_size(" 112.6 B", 1000, &bytes) == 0); + assert_se(bytes == 112); + + assert_se(parse_size("3.5 K", 1000, &bytes) == 0); + assert_se(bytes == 3*1000 + 500); + + assert_se(parse_size("3. K", 1000, &bytes) == 0); + assert_se(bytes == 3*1000); + + assert_se(parse_size("3.0 K", 1000, &bytes) == 0); + assert_se(bytes == 3*1000); + + assert_se(parse_size("3. 0 K", 1000, &bytes) == -EINVAL); + + assert_se(parse_size(" 4 M 11.5K", 1000, &bytes) == 0); + assert_se(bytes == 4*1000*1000 + 11 * 1000 + 500); + + assert_se(parse_size("3B3.5G", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("3.5G3B", 1000, &bytes) == 0); + assert_se(bytes == 3ULL*1000*1000*1000 + 500*1000*1000 + 3); + + assert_se(parse_size("3.5G 4B", 1000, &bytes) == 0); + assert_se(bytes == 3ULL*1000*1000*1000 + 500*1000*1000 + 4); + + assert_se(parse_size("3B3G4T", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("4T3G3B", 1000, &bytes) == 0); + assert_se(bytes == (4ULL*1000 + 3)*1000*1000*1000 + 3); + + assert_se(parse_size(" 4 T 3 G 3 B", 1000, &bytes) == 0); + assert_se(bytes == (4ULL*1000 + 3)*1000*1000*1000 + 3); + + assert_se(parse_size("12P", 1000, &bytes) == 0); + assert_se(bytes == 12ULL * 1000*1000*1000*1000*1000); + + assert_se(parse_size("12P12P", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("3E 2P", 1000, &bytes) == 0); + assert_se(bytes == (3 * 1000 + 2ULL) * 
1000*1000*1000*1000*1000); + + assert_se(parse_size("12X", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("12.5X", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("12.5e3", 1000, &bytes) == -EINVAL); + + assert_se(parse_size("1000E", 1000, &bytes) == -ERANGE); + assert_se(parse_size("-1", 1000, &bytes) == -ERANGE); + assert_se(parse_size("-1000E", 1000, &bytes) == -ERANGE); + + assert_se(parse_size("-1000P", 1000, &bytes) == -ERANGE); + + assert_se(parse_size("-10B 20K", 1000, &bytes) == -ERANGE); +} + +TEST(parse_range) { + unsigned lower, upper; + + /* Successful cases */ + assert_se(parse_range("111", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 111); + + assert_se(parse_range("111-123", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 123); + + assert_se(parse_range("123-111", &lower, &upper) == 0); + assert_se(lower == 123); + assert_se(upper == 111); + + assert_se(parse_range("123-123", &lower, &upper) == 0); + assert_se(lower == 123); + assert_se(upper == 123); + + assert_se(parse_range("0", &lower, &upper) == 0); + assert_se(lower == 0); + assert_se(upper == 0); + + assert_se(parse_range("0-15", &lower, &upper) == 0); + assert_se(lower == 0); + assert_se(upper == 15); + + assert_se(parse_range("15-0", &lower, &upper) == 0); + assert_se(lower == 15); + assert_se(upper == 0); + + assert_se(parse_range("128-65535", &lower, &upper) == 0); + assert_se(lower == 128); + assert_se(upper == 65535); + + assert_se(parse_range("1024-4294967295", &lower, &upper) == 0); + assert_se(lower == 1024); + assert_se(upper == 4294967295); + + /* Leading whitespace is acceptable */ + assert_se(parse_range(" 111", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 111); + + assert_se(parse_range(" 111-123", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 123); + + assert_se(parse_range("111- 123", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 123); + + 
assert_se(parse_range("\t111-\t123", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 123); + + assert_se(parse_range(" \t 111- \t 123", &lower, &upper) == 0); + assert_se(lower == 111); + assert_se(upper == 123); + + /* Error cases, make sure they fail as expected */ + lower = upper = 9999; + assert_se(parse_range("111garbage", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("garbage111", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("garbage", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111-123garbage", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111garbage-123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + /* Empty string */ + lower = upper = 9999; + assert_se(parse_range("", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + /* 111--123 will pass -123 to safe_atou which returns -ERANGE for negative */ + assert_se(parse_range("111--123", &lower, &upper) == -ERANGE); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("-123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("-111-123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111-123-", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111.4-123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111-123.4", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111,4-123", &lower, &upper) == -EINVAL); + 
assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111-123,4", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + /* Error on trailing dash */ + assert_se(parse_range("111-", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111-123-", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111--", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111- ", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + /* Whitespace is not a separator */ + assert_se(parse_range("111 123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111\t123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111 \t 123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + /* Trailing whitespace is invalid (from safe_atou) */ + assert_se(parse_range("111 ", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111-123 ", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111 -123", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111 -123 ", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111\t-123\t", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + assert_se(parse_range("111 \t -123 \t ", &lower, &upper) == -EINVAL); + assert_se(lower == 9999); + assert_se(upper == 9999); + + /* Out of the "unsigned" range, this is 1<<64 */ + 
assert_se(parse_range("0-18446744073709551616", &lower, &upper) == -ERANGE); + assert_se(lower == 9999); + assert_se(upper == 9999); +} + +TEST(safe_atou_bounded) { + int r; + unsigned x; + + r = safe_atou_bounded("12345", 12, 20000, &x); + assert_se(r == 0); + assert_se(x == 12345); + + r = safe_atou_bounded("12", 12, 20000, &x); + assert_se(r == 0); + assert_se(x == 12); + + r = safe_atou_bounded("20000", 12, 20000, &x); + assert_se(r == 0); + assert_se(x == 20000); + + r = safe_atou_bounded("-1", 12, 20000, &x); + assert_se(r == -ERANGE); + + r = safe_atou_bounded("11", 12, 20000, &x); + assert_se(r == -ERANGE); + + r = safe_atou_bounded("20001", 12, 20000, &x); + assert_se(r == -ERANGE); +} + +TEST(safe_atolli) { + int r; + long long l; + + r = safe_atolli("12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atolli(" 12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atolli("-12345", &l); + assert_se(r == 0); + assert_se(l == -12345); + + r = safe_atolli(" -12345", &l); + assert_se(r == 0); + assert_se(l == -12345); + + r = safe_atolli("0x5", &l); + assert_se(r == 0); + assert_se(l == 5); + + r = safe_atolli("0o6", &l); + assert_se(r == 0); + assert_se(l == 6); + + r = safe_atolli("0B101", &l); + assert_se(r == 0); + assert_se(l == 5); + + r = safe_atolli("12345678901234567890", &l); + assert_se(r == -ERANGE); + + r = safe_atolli("-12345678901234567890", &l); + assert_se(r == -ERANGE); + + r = safe_atolli("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atolli("123x", &l); + assert_se(r == -EINVAL); + + r = safe_atolli("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atolli("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atou16) { + int r; + uint16_t l; + + r = safe_atou16("12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atou16(" 12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atou16("+12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = 
safe_atou16(" +12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atou16("123456", &l); + assert_se(r == -ERANGE); + + r = safe_atou16("-1", &l); + assert_se(r == -ERANGE); + + r = safe_atou16(" -1", &l); + assert_se(r == -ERANGE); + + r = safe_atou16("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atou16("123x", &l); + assert_se(r == -EINVAL); + + r = safe_atou16("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atou16("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atoi16) { + int r; + int16_t l; + + r = safe_atoi16("-12345", &l); + assert_se(r == 0); + assert_se(l == -12345); + + r = safe_atoi16(" -12345", &l); + assert_se(r == 0); + assert_se(l == -12345); + + r = safe_atoi16("+12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atoi16(" +12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atoi16("32767", &l); + assert_se(r == 0); + assert_se(l == 32767); + + r = safe_atoi16(" 32767", &l); + assert_se(r == 0); + assert_se(l == 32767); + + r = safe_atoi16("0o11", &l); + assert_se(r == 0); + assert_se(l == 9); + + r = safe_atoi16("0B110", &l); + assert_se(r == 0); + assert_se(l == 6); + + r = safe_atoi16("36536", &l); + assert_se(r == -ERANGE); + + r = safe_atoi16("-32769", &l); + assert_se(r == -ERANGE); + + r = safe_atoi16("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atoi16("123x", &l); + assert_se(r == -EINVAL); + + r = safe_atoi16("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atoi16("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atoux16) { + int r; + uint16_t l; + + r = safe_atoux16("1234", &l); + assert_se(r == 0); + assert_se(l == 0x1234); + + r = safe_atoux16("abcd", &l); + assert_se(r == 0); + assert_se(l == 0xabcd); + + r = safe_atoux16(" 1234", &l); + assert_se(r == 0); + assert_se(l == 0x1234); + + r = safe_atoux16("12345", &l); + assert_se(r == -ERANGE); + + r = safe_atoux16("-1", &l); + assert_se(r == -ERANGE); + + r = safe_atoux16(" -1", &l); + assert_se(r 
== -ERANGE); + + r = safe_atoux16("0b1", &l); + assert_se(r == 0); + assert_se(l == 177); + + r = safe_atoux16("0o70", &l); + assert_se(r == -EINVAL); + + r = safe_atoux16("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atoux16("123x", &l); + assert_se(r == -EINVAL); + + r = safe_atoux16("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atoux16("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atou64) { + int r; + uint64_t l; + + r = safe_atou64("12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atou64(" 12345", &l); + assert_se(r == 0); + assert_se(l == 12345); + + r = safe_atou64("0o11", &l); + assert_se(r == 0); + assert_se(l == 9); + + r = safe_atou64("0b11", &l); + assert_se(r == 0); + assert_se(l == 3); + + r = safe_atou64("18446744073709551617", &l); + assert_se(r == -ERANGE); + + r = safe_atou64("-1", &l); + assert_se(r == -ERANGE); + + r = safe_atou64(" -1", &l); + assert_se(r == -ERANGE); + + r = safe_atou64("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atou64("123x", &l); + assert_se(r == -EINVAL); + + r = safe_atou64("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atou64("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atoi64) { + int r; + int64_t l; + + r = safe_atoi64("-12345", &l); + assert_se(r == 0); + assert_se(l == -12345); + + r = safe_atoi64(" -12345", &l); + assert_se(r == 0); + assert_se(l == -12345); + + r = safe_atoi64("32767", &l); + assert_se(r == 0); + assert_se(l == 32767); + + r = safe_atoi64(" 32767", &l); + assert_se(r == 0); + assert_se(l == 32767); + + r = safe_atoi64(" 0o20", &l); + assert_se(r == 0); + assert_se(l == 16); + + r = safe_atoi64(" 0b01010", &l); + assert_se(r == 0); + assert_se(l == 10); + + r = safe_atoi64("9223372036854775813", &l); + assert_se(r == -ERANGE); + + r = safe_atoi64("-9223372036854775813", &l); + assert_se(r == -ERANGE); + + r = safe_atoi64("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atoi64("123x", &l); + assert_se(r == -EINVAL); + + r = 
safe_atoi64("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atoi64("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atoux64) { + int r; + uint64_t l; + + r = safe_atoux64("12345", &l); + assert_se(r == 0); + assert_se(l == 0x12345); + + r = safe_atoux64(" 12345", &l); + assert_se(r == 0); + assert_se(l == 0x12345); + + r = safe_atoux64("0x12345", &l); + assert_se(r == 0); + assert_se(l == 0x12345); + + r = safe_atoux64("0b11011", &l); + assert_se(r == 0); + assert_se(l == 11603985); + + r = safe_atoux64("+12345", &l); + assert_se(r == 0); + assert_se(l == 0x12345); + + r = safe_atoux64(" +12345", &l); + assert_se(r == 0); + assert_se(l == 0x12345); + + r = safe_atoux64("+0x12345", &l); + assert_se(r == 0); + assert_se(l == 0x12345); + + r = safe_atoux64("+0b11011", &l); + assert_se(r == 0); + assert_se(l == 11603985); + + r = safe_atoux64("0o11011", &l); + assert_se(r == -EINVAL); + + r = safe_atoux64("18446744073709551617", &l); + assert_se(r == -ERANGE); + + r = safe_atoux64("-1", &l); + assert_se(r == -ERANGE); + + r = safe_atoux64(" -1", &l); + assert_se(r == -ERANGE); + + r = safe_atoux64("junk", &l); + assert_se(r == -EINVAL); + + r = safe_atoux64("123x", &l); + assert_se(r == -EINVAL); + + r = safe_atoux64("12.3", &l); + assert_se(r == -EINVAL); + + r = safe_atoux64("", &l); + assert_se(r == -EINVAL); +} + +TEST(safe_atod) { + int r; + double d; + char *e; + + r = safe_atod("junk", &d); + assert_se(r == -EINVAL); + + r = safe_atod("0.2244", &d); + assert_se(r == 0); + assert_se(fabs(d - 0.2244) < 0.000001); + + r = safe_atod("0,5", &d); + assert_se(r == -EINVAL); + + errno = 0; + strtod("0,5", &e); + assert_se(*e == ','); + + r = safe_atod("", &d); + assert_se(r == -EINVAL); + + /* Check if this really is locale independent */ + if (setlocale(LC_NUMERIC, "de_DE.utf8")) { + + r = safe_atod("0.2244", &d); + assert_se(r == 0); + assert_se(fabs(d - 0.2244) < 0.000001); + + r = safe_atod("0,5", &d); + assert_se(r == -EINVAL); + + errno = 0; + 
assert_se(fabs(strtod("0,5", &e) - 0.5) < 0.00001); + + r = safe_atod("", &d); + assert_se(r == -EINVAL); + } + + /* And check again, reset */ + assert_se(setlocale(LC_NUMERIC, "C")); + + r = safe_atod("0.2244", &d); + assert_se(r == 0); + assert_se(fabs(d - 0.2244) < 0.000001); + + r = safe_atod("0,5", &d); + assert_se(r == -EINVAL); + + errno = 0; + strtod("0,5", &e); + assert_se(*e == ','); + + r = safe_atod("", &d); + assert_se(r == -EINVAL); +} + +TEST(parse_nice) { + int n; + + assert_se(parse_nice("0", &n) >= 0 && n == 0); + assert_se(parse_nice("+0", &n) >= 0 && n == 0); + assert_se(parse_nice("-1", &n) >= 0 && n == -1); + assert_se(parse_nice("-2", &n) >= 0 && n == -2); + assert_se(parse_nice("1", &n) >= 0 && n == 1); + assert_se(parse_nice("2", &n) >= 0 && n == 2); + assert_se(parse_nice("+1", &n) >= 0 && n == 1); + assert_se(parse_nice("+2", &n) >= 0 && n == 2); + assert_se(parse_nice("-20", &n) >= 0 && n == -20); + assert_se(parse_nice("19", &n) >= 0 && n == 19); + assert_se(parse_nice("+19", &n) >= 0 && n == 19); + + assert_se(parse_nice("", &n) == -EINVAL); + assert_se(parse_nice("-", &n) == -EINVAL); + assert_se(parse_nice("+", &n) == -EINVAL); + assert_se(parse_nice("xx", &n) == -EINVAL); + assert_se(parse_nice("-50", &n) == -ERANGE); + assert_se(parse_nice("50", &n) == -ERANGE); + assert_se(parse_nice("+50", &n) == -ERANGE); + assert_se(parse_nice("-21", &n) == -ERANGE); + assert_se(parse_nice("20", &n) == -ERANGE); + assert_se(parse_nice("+20", &n) == -ERANGE); +} + +TEST(parse_errno) { + assert_se(parse_errno("EILSEQ") == EILSEQ); + assert_se(parse_errno("EINVAL") == EINVAL); + assert_se(parse_errno("0") == 0); + assert_se(parse_errno("1") == 1); + assert_se(parse_errno("4095") == 4095); + + assert_se(parse_errno("-1") == -ERANGE); + assert_se(parse_errno("-3") == -ERANGE); + assert_se(parse_errno("4096") == -ERANGE); + + assert_se(parse_errno("") == -EINVAL); + assert_se(parse_errno("12.3") == -EINVAL); + assert_se(parse_errno("123junk") == 
-EINVAL); + assert_se(parse_errno("junk123") == -EINVAL); + assert_se(parse_errno("255EILSEQ") == -EINVAL); + assert_se(parse_errno("EINVAL12") == -EINVAL); + assert_se(parse_errno("-EINVAL") == -EINVAL); + assert_se(parse_errno("EINVALaaa") == -EINVAL); +} + +TEST(parse_fd) { + assert_se(parse_fd("0") == 0); + assert_se(parse_fd("1") == 1); + + assert_se(parse_fd("-1") == -EBADF); + assert_se(parse_fd("-3") == -EBADF); + + assert_se(parse_fd("") == -EINVAL); + assert_se(parse_fd("12.3") == -EINVAL); + assert_se(parse_fd("123junk") == -EINVAL); + assert_se(parse_fd("junk123") == -EINVAL); +} + +TEST(parse_mtu) { + uint32_t mtu = 0; + + assert_se(parse_mtu(AF_UNSPEC, "1500", &mtu) >= 0 && mtu == 1500); + assert_se(parse_mtu(AF_UNSPEC, "1400", &mtu) >= 0 && mtu == 1400); + assert_se(parse_mtu(AF_UNSPEC, "65535", &mtu) >= 0 && mtu == 65535); + assert_se(parse_mtu(AF_UNSPEC, "65536", &mtu) >= 0 && mtu == 65536); + assert_se(parse_mtu(AF_UNSPEC, "4294967295", &mtu) >= 0 && mtu == 4294967295); + assert_se(parse_mtu(AF_UNSPEC, "500", &mtu) >= 0 && mtu == 500); + assert_se(parse_mtu(AF_UNSPEC, "1280", &mtu) >= 0 && mtu == 1280); + assert_se(parse_mtu(AF_UNSPEC, "4294967296", &mtu) == -ERANGE); + assert_se(parse_mtu(AF_UNSPEC, "68", &mtu) >= 0 && mtu == 68); + assert_se(parse_mtu(AF_UNSPEC, "67", &mtu) >= 0 && mtu == 67); + assert_se(parse_mtu(AF_UNSPEC, "0", &mtu) >= 0 && mtu == 0); + assert_se(parse_mtu(AF_UNSPEC, "", &mtu) == -EINVAL); + + assert_se(parse_mtu(AF_INET, "1500", &mtu) >= 0 && mtu == 1500); + assert_se(parse_mtu(AF_INET, "1400", &mtu) >= 0 && mtu == 1400); + assert_se(parse_mtu(AF_INET, "65535", &mtu) >= 0 && mtu == 65535); + assert_se(parse_mtu(AF_INET, "65536", &mtu) >= 0 && mtu == 65536); + assert_se(parse_mtu(AF_INET, "4294967295", &mtu) >= 0 && mtu == 4294967295); + assert_se(parse_mtu(AF_INET, "500", &mtu) >= 0 && mtu == 500); + assert_se(parse_mtu(AF_INET, "1280", &mtu) >= 0 && mtu == 1280); + assert_se(parse_mtu(AF_INET, "4294967296", &mtu) == 
-ERANGE); + assert_se(parse_mtu(AF_INET, "68", &mtu) >= 0 && mtu == 68); + assert_se(parse_mtu(AF_INET, "67", &mtu) == -ERANGE); + assert_se(parse_mtu(AF_INET, "0", &mtu) == -ERANGE); + assert_se(parse_mtu(AF_INET, "", &mtu) == -EINVAL); + + assert_se(parse_mtu(AF_INET6, "1280", &mtu) >= 0 && mtu == 1280); + assert_se(parse_mtu(AF_INET6, "1279", &mtu) == -ERANGE); + assert_se(parse_mtu(AF_INET6, "4294967296", &mtu) == -ERANGE); + assert_se(parse_mtu(AF_INET6, "68", &mtu) == -ERANGE); + assert_se(parse_mtu(AF_INET6, "", &mtu) == -EINVAL); +} + +TEST(parse_loadavg_fixed_point) { + loadavg_t fp; + + assert_se(parse_loadavg_fixed_point("1.23", &fp) == 0); + assert_se(LOADAVG_INT_SIDE(fp) == 1); + assert_se(LOADAVG_DECIMAL_SIDE(fp) == 23); + + assert_se(parse_loadavg_fixed_point("1.80", &fp) == 0); + assert_se(LOADAVG_INT_SIDE(fp) == 1); + assert_se(LOADAVG_DECIMAL_SIDE(fp) == 80); + + assert_se(parse_loadavg_fixed_point("0.07", &fp) == 0); + assert_se(LOADAVG_INT_SIDE(fp) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(fp) == 7); + + assert_se(parse_loadavg_fixed_point("0.00", &fp) == 0); + assert_se(LOADAVG_INT_SIDE(fp) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(fp) == 0); + + assert_se(parse_loadavg_fixed_point("4096.57", &fp) == 0); + assert_se(LOADAVG_INT_SIDE(fp) == 4096); + assert_se(LOADAVG_DECIMAL_SIDE(fp) == 57); + + /* Caps out at 2 digit fracs */ + assert_se(parse_loadavg_fixed_point("1.100", &fp) == -ERANGE); + + assert_se(parse_loadavg_fixed_point("4096.4096", &fp) == -ERANGE); + assert_se(parse_loadavg_fixed_point("-4000.5", &fp) == -ERANGE); + assert_se(parse_loadavg_fixed_point("18446744073709551615.5", &fp) == -ERANGE); + assert_se(parse_loadavg_fixed_point("foobar", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point("3333", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point("1.2.3", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point(".", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point("", &fp) == -EINVAL); +} + +DEFINE_TEST_MAIN(LOG_INFO); 
diff --git a/src/test/test-path-lookup.c b/src/test/test-path-lookup.c new file mode 100644 index 0000000..431a859 --- /dev/null +++ b/src/test/test-path-lookup.c @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "log.h" +#include "path-lookup.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static void test_paths_one(RuntimeScope scope) { + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_(lookup_paths_free) LookupPaths lp_without_env = {}; + _cleanup_(lookup_paths_free) LookupPaths lp_with_env = {}; + char *systemd_unit_path; + + assert_se(mkdtemp_malloc("/tmp/test-path-lookup.XXXXXXX", &tmp) >= 0); + + assert_se(unsetenv("SYSTEMD_UNIT_PATH") == 0); + assert_se(lookup_paths_init(&lp_without_env, scope, 0, NULL) >= 0); + assert_se(!strv_isempty(lp_without_env.search_path)); + lookup_paths_log(&lp_without_env); + + systemd_unit_path = strjoina(tmp, "/systemd-unit-path"); + assert_se(setenv("SYSTEMD_UNIT_PATH", systemd_unit_path, 1) == 0); + assert_se(lookup_paths_init(&lp_with_env, scope, 0, NULL) == 0); + assert_se(strv_length(lp_with_env.search_path) == 1); + assert_se(streq(lp_with_env.search_path[0], systemd_unit_path)); + lookup_paths_log(&lp_with_env); + assert_se(strv_equal(lp_with_env.search_path, STRV_MAKE(systemd_unit_path))); +} + +TEST(paths) { + test_paths_one(RUNTIME_SCOPE_SYSTEM); + test_paths_one(RUNTIME_SCOPE_USER); + test_paths_one(RUNTIME_SCOPE_GLOBAL); +} + +TEST(user_and_global_paths) { + _cleanup_(lookup_paths_free) LookupPaths lp_global = {}, lp_user = {}; + char **u, **g; + unsigned k = 0; + + assert_se(unsetenv("SYSTEMD_UNIT_PATH") == 0); + assert_se(unsetenv("XDG_DATA_DIRS") == 0); + assert_se(unsetenv("XDG_CONFIG_DIRS") == 0); + + assert_se(lookup_paths_init(&lp_global, RUNTIME_SCOPE_GLOBAL, 0, NULL) == 0); + assert_se(lookup_paths_init(&lp_user, RUNTIME_SCOPE_USER, 0, NULL) == 0); + g = 
lp_global.search_path; + u = lp_user.search_path; + + /* Go over all entries in global search path, and verify + * that they also exist in the user search path. Skip any + * entries in user search path which don't exist in the global + * one, but not vice versa. */ + STRV_FOREACH(p, g) { + while (u[k] && !streq(*p, u[k])) { + log_info("+ %s", u[k]); + k++; + } + log_info(" %s", *p); + assert_se(u[k]); /* If NULL, we didn't find a matching entry */ + k++; + } + STRV_FOREACH(p, u + k) + log_info("+ %s", *p); +} + +static void test_generator_binary_paths_one(RuntimeScope scope) { + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_strv_free_ char **gp_without_env = NULL; + _cleanup_strv_free_ char **env_gp_without_env = NULL; + _cleanup_strv_free_ char **gp_with_env = NULL; + _cleanup_strv_free_ char **env_gp_with_env = NULL; + char *systemd_generator_path = NULL; + char *systemd_env_generator_path = NULL; + + assert_se(mkdtemp_malloc("/tmp/test-path-lookup.XXXXXXX", &tmp) >= 0); + + assert_se(unsetenv("SYSTEMD_GENERATOR_PATH") == 0); + assert_se(unsetenv("SYSTEMD_ENVIRONMENT_GENERATOR_PATH") == 0); + + gp_without_env = generator_binary_paths(scope); + env_gp_without_env = env_generator_binary_paths(scope); + + log_info("Generators dirs (%s):", runtime_scope_to_string(scope)); + STRV_FOREACH(dir, gp_without_env) + log_info(" %s", *dir); + + log_info("Environment generators dirs (%s):", runtime_scope_to_string(scope)); + STRV_FOREACH(dir, env_gp_without_env) + log_info(" %s", *dir); + + assert_se(!strv_isempty(gp_without_env)); + assert_se(!strv_isempty(env_gp_without_env)); + + systemd_generator_path = strjoina(tmp, "/systemd-generator-path"); + systemd_env_generator_path = strjoina(tmp, "/systemd-environment-generator-path"); + assert_se(setenv("SYSTEMD_GENERATOR_PATH", systemd_generator_path, 1) == 0); + assert_se(setenv("SYSTEMD_ENVIRONMENT_GENERATOR_PATH", systemd_env_generator_path, 1) == 0); + + gp_with_env = generator_binary_paths(scope); + 
env_gp_with_env = env_generator_binary_paths(scope); + + log_info("Generators dirs (%s):", runtime_scope_to_string(scope)); + STRV_FOREACH(dir, gp_with_env) + log_info(" %s", *dir); + + log_info("Environment generators dirs (%s):", runtime_scope_to_string(scope)); + STRV_FOREACH(dir, env_gp_with_env) + log_info(" %s", *dir); + + assert_se(strv_equal(gp_with_env, STRV_MAKE(systemd_generator_path))); + assert_se(strv_equal(env_gp_with_env, STRV_MAKE(systemd_env_generator_path))); +} + +TEST(generator_binary_paths) { + test_generator_binary_paths_one(RUNTIME_SCOPE_SYSTEM); + test_generator_binary_paths_one(RUNTIME_SCOPE_USER); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-path-util.c b/src/test/test-path-util.c new file mode 100644 index 0000000..f5a4256 --- /dev/null +++ b/src/test/test-path-util.c @@ -0,0 +1,1308 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "alloc-util.h" +#include "exec-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "path-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(print_paths) { + log_info("DEFAULT_PATH=%s", DEFAULT_PATH); + log_info("DEFAULT_USER_PATH=%s", DEFAULT_USER_PATH); +} + +TEST(path) { + assert_se(path_is_absolute("/")); + assert_se(!path_is_absolute("./")); + + assert_se(streq(basename("./aa/bb/../file.da."), "file.da.")); + assert_se(streq(basename("/aa///.file"), ".file")); + assert_se(streq(basename("/aa///file..."), "file...")); + assert_se(streq(basename("file.../"), "")); + + assert_se(PATH_IN_SET("/bin", "/", "/bin", "/foo")); + assert_se(PATH_IN_SET("/bin", "/bin")); + assert_se(PATH_IN_SET("/bin", "/foo/bar", "/bin")); + assert_se(PATH_IN_SET("/", "/", "/", "/foo/bar")); + assert_se(!PATH_IN_SET("/", "/abc", "/def")); + + assert_se(path_equal_ptr(NULL, NULL)); + assert_se(path_equal_ptr("/a", "/a")); + 
assert_se(!path_equal_ptr("/a", "/b")); + assert_se(!path_equal_ptr("/a", NULL)); + assert_se(!path_equal_ptr(NULL, "/a")); +} + +TEST(is_path) { + assert_se(!is_path("foo")); + assert_se(!is_path("dos.ext")); + assert_se( is_path("/dir")); + assert_se( is_path("a/b")); + assert_se( is_path("a/b.ext")); + + assert_se(!is_path(".")); + assert_se(!is_path("")); + assert_se(!is_path("..")); + + assert_se( is_path("/dev")); + assert_se( is_path("/./dev")); + assert_se( is_path("/./dev/.")); + assert_se( is_path("/./dev.")); + assert_se( is_path("//dev")); + assert_se( is_path("///dev")); + assert_se( is_path("/dev/")); + assert_se( is_path("///dev/")); + assert_se( is_path("/./dev/")); + assert_se( is_path("/../dev/")); + assert_se( is_path("/dev/sda")); + assert_se( is_path("/dev/sda5")); + assert_se( is_path("/dev/sda5b3")); + assert_se( is_path("/dev/sda5b3/idontexit")); + assert_se( is_path("/../dev/sda")); + assert_se( is_path("/../../dev/sda5")); + assert_se( is_path("/../../../dev/sda5b3")); + assert_se( is_path("/.././.././dev/sda5b3/idontexit")); + assert_se( is_path("/sys")); + assert_se( is_path("/sys/")); + assert_se( is_path("/./sys")); + assert_se( is_path("/./sys/.")); + assert_se( is_path("/./sys.")); + assert_se( is_path("/sys/what")); + assert_se( is_path("/sys/something/..")); + assert_se( is_path("/sys/something/../")); + assert_se( is_path("/sys////")); + assert_se( is_path("/sys////.")); + assert_se( is_path("/sys/..")); + assert_se( is_path("/sys/../")); + assert_se( is_path("/usr/../dev/sda")); +} + +TEST(is_device_path) { + assert_se(!is_device_path("foo")); + assert_se(!is_device_path("dos.ext")); + assert_se(!is_device_path("/dir")); + assert_se(!is_device_path("a/b")); + assert_se(!is_device_path("a/b.ext")); + + assert_se(!is_device_path(".")); + assert_se(!is_device_path("")); + assert_se(!is_device_path("..")); + + assert_se(!is_device_path("/dev")); + assert_se(!is_device_path("/./dev")); + assert_se(!is_device_path("/./dev/.")); + 
assert_se(!is_device_path("/./dev.")); + assert_se( is_device_path("/./dev/foo")); + assert_se( is_device_path("/./dev/./foo")); + assert_se(!is_device_path("/./dev./foo")); + assert_se(!is_device_path("//dev")); + assert_se(!is_device_path("///dev")); + assert_se(!is_device_path("/dev/")); + assert_se(!is_device_path("///dev/")); + assert_se(!is_device_path("/./dev/")); + assert_se(!is_device_path("/../dev/")); + assert_se( is_device_path("/dev/sda")); + assert_se( is_device_path("/dev/sda5")); + assert_se( is_device_path("/dev/sda5b3")); + assert_se( is_device_path("/dev/sda5b3/idontexit")); + assert_se(!is_device_path("/../dev/sda")); + assert_se(!is_device_path("/../../dev/sda5")); + assert_se(!is_device_path("/../../../dev/sda5b3")); + assert_se(!is_device_path("/.././.././dev/sda5b3/idontexit")); + assert_se(!is_device_path("/sys")); + assert_se(!is_device_path("/sys/")); + assert_se(!is_device_path("/./sys")); + assert_se(!is_device_path("/./sys/.")); + assert_se(!is_device_path("/./sys.")); + assert_se( is_device_path("/./sys/foo")); + assert_se( is_device_path("/./sys/./foo")); + assert_se(!is_device_path("/./sys./foo")); + assert_se( is_device_path("/sys/what")); + assert_se( is_device_path("/sys/something/..")); + assert_se( is_device_path("/sys/something/../")); + assert_se(!is_device_path("/sys////")); + assert_se(!is_device_path("/sys////.")); + assert_se( is_device_path("/sys/..")); + assert_se( is_device_path("/sys/../")); + assert_se(!is_device_path("/usr/../dev/sda")); +} + +static void test_path_simplify_one(const char *in, const char *out, PathSimplifyFlags flags) { + char *p; + + p = strdupa_safe(in); + path_simplify_full(p, flags); + log_debug("/* test_path_simplify(%s) → %s (expected: %s) */", in, p, out); + assert_se(streq(p, out)); +} + +TEST(path_simplify) { + _cleanup_free_ char *hoge = NULL, *hoge_out = NULL; + char foo[NAME_MAX * 2]; + + test_path_simplify_one("", "", 0); + test_path_simplify_one("aaa/bbb////ccc", "aaa/bbb/ccc", 0); + 
test_path_simplify_one("//aaa/.////ccc", "/aaa/ccc", 0); + test_path_simplify_one("///", "/", 0); + test_path_simplify_one("///", "/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("///.//", "/", 0); + test_path_simplify_one("///.//.///", "/", 0); + test_path_simplify_one("////.././///../.", "/", 0); + test_path_simplify_one(".", ".", 0); + test_path_simplify_one("./", ".", 0); + test_path_simplify_one("./", "./", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one(".///.//./.", ".", 0); + test_path_simplify_one(".///.//././/", ".", 0); + test_path_simplify_one("//./aaa///.//./.bbb/..///c.//d.dd///..eeee/.", + "/aaa/.bbb/../c./d.dd/..eeee", 0); + test_path_simplify_one("//./aaa///.//./.bbb/..///c.//d.dd///..eeee/..", + "/aaa/.bbb/../c./d.dd/..eeee/..", 0); + test_path_simplify_one(".//./aaa///.//./.bbb/..///c.//d.dd///..eeee/..", + "aaa/.bbb/../c./d.dd/..eeee/..", 0); + test_path_simplify_one("..//./aaa///.//./.bbb/..///c.//d.dd///..eeee/..", + "../aaa/.bbb/../c./d.dd/..eeee/..", 0); + test_path_simplify_one("abc///", "abc/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + + test_path_simplify_one("/../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../abc///", "/abc/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../abc///", "/abc", 0); + test_path_simplify_one("/../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../abc///..", "/abc/..", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../abc///../", "/abc/../", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../abc///../", "/abc/..", 0); + + test_path_simplify_one("/../../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../../abc///", "/abc/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../../abc///", "/abc", 0); + test_path_simplify_one("/../../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../../abc///../..", 
"/abc/../..", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../../abc///../../", "/abc/../../", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/../../abc///../../", "/abc/../..", 0); + + test_path_simplify_one("/.././../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.././../abc///", "/abc/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.././../abc///", "/abc", 0); + test_path_simplify_one("/.././../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.././../abc///../..", "/abc/../..", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.././../abc///../../", "/abc/../../", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.././../abc///../../", "/abc/../..", 0); + + test_path_simplify_one("/./.././../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/./.././../abc///", "/abc/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/./.././../abc///", "/abc", 0); + test_path_simplify_one("/./.././../abc", "/abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/./.././../abc///../..", "/abc/../..", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/./.././../abc///../../", "/abc/../../", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/./.././../abc///../../", "/abc/../..", 0); + + test_path_simplify_one("/.../abc", "/.../abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.../abc///", "/.../abc/", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.../abc///", "/.../abc", 0); + test_path_simplify_one("/.../abc", "/.../abc", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.../abc///...", "/.../abc/...", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.../abc///.../", "/.../abc/.../", PATH_SIMPLIFY_KEEP_TRAILING_SLASH); + test_path_simplify_one("/.../abc///.../", "/.../abc/...", 0); + + memset(foo, 'a', 
sizeof(foo) -1); + char_array_0(foo); + + test_path_simplify_one(foo, foo, 0); + + hoge = strjoin("/", foo); + assert_se(hoge); + test_path_simplify_one(hoge, hoge, 0); + hoge = mfree(hoge); + + hoge = strjoin("a////.//././//./b///././/./c/////././//./", foo, "//.//////d/e/.//f/"); + assert_se(hoge); + + hoge_out = strjoin("a/b/c/", foo, "//.//////d/e/.//f/"); + assert_se(hoge_out); + + test_path_simplify_one(hoge, hoge_out, 0); +} + +static void test_path_compare_one(const char *a, const char *b, int expected) { + int r; + + assert_se(path_compare(a, a) == 0); + assert_se(path_compare(b, b) == 0); + + r = path_compare(a, b); + assert_se((r > 0) == (expected > 0) && (r < 0) == (expected < 0)); + r = path_compare(b, a); + assert_se((r < 0) == (expected > 0) && (r > 0) == (expected < 0)); + + assert_se(path_equal(a, a) == 1); + assert_se(path_equal(b, b) == 1); + assert_se(path_equal(a, b) == (expected == 0)); + assert_se(path_equal(b, a) == (expected == 0)); +} + +TEST(path_compare) { + test_path_compare_one("/goo", "/goo", 0); + test_path_compare_one("/goo", "/goo", 0); + test_path_compare_one("//goo", "/goo", 0); + test_path_compare_one("//goo/////", "/goo", 0); + test_path_compare_one("goo/////", "goo", 0); + test_path_compare_one("/goo/boo", "/goo//boo", 0); + test_path_compare_one("//goo/boo", "/goo/boo//", 0); + test_path_compare_one("//goo/././//./boo//././//", "/goo/boo//.", 0); + test_path_compare_one("/.", "//.///", 0); + test_path_compare_one("/x", "x/", 1); + test_path_compare_one("x/", "/", -1); + test_path_compare_one("/x/./y", "x/y", 1); + test_path_compare_one("/x/./y", "/x/y", 0); + test_path_compare_one("/x/./././y", "/x/y/././.", 0); + test_path_compare_one("./x/./././y", "./x/y/././.", 0); + test_path_compare_one(".", "./.", 0); + test_path_compare_one(".", "././.", 0); + test_path_compare_one("./..", ".", 1); + test_path_compare_one("x/.y", "x/y", -1); + test_path_compare_one("foo", "/foo", -1); + test_path_compare_one("/foo", "/foo/bar", -1); + 
test_path_compare_one("/foo/aaa", "/foo/b", -1); + test_path_compare_one("/foo/aaa", "/foo/b/a", -1); + test_path_compare_one("/foo/a", "/foo/aaa", -1); + test_path_compare_one("/foo/a/b", "/foo/aaa", -1); +} + +static void test_path_compare_filename_one(const char *a, const char *b, int expected) { + int r; + + assert_se(path_compare_filename(a, a) == 0); + assert_se(path_compare_filename(b, b) == 0); + + r = path_compare_filename(a, b); + assert_se((r > 0) == (expected > 0) && (r < 0) == (expected < 0)); + r = path_compare_filename(b, a); + assert_se((r < 0) == (expected > 0) && (r > 0) == (expected < 0)); + + assert_se(path_equal_filename(a, a) == 1); + assert_se(path_equal_filename(b, b) == 1); + assert_se(path_equal_filename(a, b) == (expected == 0)); + assert_se(path_equal_filename(b, a) == (expected == 0)); +} + +TEST(path_compare_filename) { + test_path_compare_filename_one("/goo", "/goo", 0); + test_path_compare_filename_one("/goo", "/goo", 0); + test_path_compare_filename_one("//goo", "/goo", 0); + test_path_compare_filename_one("//goo/////", "/goo", 0); + test_path_compare_filename_one("goo/////", "goo", 0); + test_path_compare_filename_one("/goo/boo", "/goo//boo", 0); + test_path_compare_filename_one("//goo/boo", "/goo/boo//", 0); + test_path_compare_filename_one("//goo/././//./boo//././//", "/goo/boo//.", 0); + test_path_compare_filename_one("/.", "//.///", -1); + test_path_compare_filename_one("/x", "x/", 0); + test_path_compare_filename_one("x/", "/", 1); + test_path_compare_filename_one("/x/./y", "x/y", 0); + test_path_compare_filename_one("/x/./y", "/x/y", 0); + test_path_compare_filename_one("/x/./././y", "/x/y/././.", 0); + test_path_compare_filename_one("./x/./././y", "./x/y/././.", 0); + test_path_compare_filename_one(".", "./.", -1); + test_path_compare_filename_one(".", "././.", -1); + test_path_compare_filename_one("./..", ".", 1); + test_path_compare_filename_one("x/.y", "x/y", -1); + test_path_compare_filename_one("foo", "/foo", 0); + 
test_path_compare_filename_one("/foo", "/foo/bar", 1); + test_path_compare_filename_one("/foo/aaa", "/foo/b", -1); + test_path_compare_filename_one("/foo/aaa", "/foo/b/a", 1); + test_path_compare_filename_one("/foo/a", "/foo/aaa", -1); + test_path_compare_filename_one("/foo/a/b", "/foo/aaa", 1); + test_path_compare_filename_one("/a/c", "/b/c", 0); + test_path_compare_filename_one("/a", "/a", 0); + test_path_compare_filename_one("/a/b", "/a/c", -1); + test_path_compare_filename_one("/b", "/c", -1); +} + +TEST(path_equal_root) { + /* Nail down the details of how path_equal("/", ...) works. */ + + assert_se(path_equal("/", "/")); + assert_se(path_equal("/", "//")); + + assert_se(path_equal("/", "/./")); + assert_se(!path_equal("/", "/../")); + + assert_se(!path_equal("/", "/.../")); + + /* Make sure that inode_same() works as expected. */ + + assert_se(inode_same("/", "/", 0) > 0); + assert_se(inode_same("/", "/", AT_SYMLINK_NOFOLLOW) > 0); + assert_se(inode_same("/", "//", 0) > 0); + assert_se(inode_same("/", "//", AT_SYMLINK_NOFOLLOW) > 0); + + assert_se(inode_same("/", "/./", 0) > 0); + assert_se(inode_same("/", "/./", AT_SYMLINK_NOFOLLOW) > 0); + assert_se(inode_same("/", "/../", 0) > 0); + assert_se(inode_same("/", "/../", AT_SYMLINK_NOFOLLOW) > 0); + + assert_se(inode_same("/", "/.../", 0) == -ENOENT); + assert_se(inode_same("/", "/.../", AT_SYMLINK_NOFOLLOW) == -ENOENT); + + /* The same for path_equal_or_inode_same(). 
*/ + + assert_se(path_equal_or_inode_same("/", "/", 0)); + assert_se(path_equal_or_inode_same("/", "/", AT_SYMLINK_NOFOLLOW)); + assert_se(path_equal_or_inode_same("/", "//", 0)); + assert_se(path_equal_or_inode_same("/", "//", AT_SYMLINK_NOFOLLOW)); + + assert_se(path_equal_or_inode_same("/", "/./", 0)); + assert_se(path_equal_or_inode_same("/", "/./", AT_SYMLINK_NOFOLLOW)); + assert_se(path_equal_or_inode_same("/", "/../", 0)); + assert_se(path_equal_or_inode_same("/", "/../", AT_SYMLINK_NOFOLLOW)); + + assert_se(!path_equal_or_inode_same("/", "/.../", 0)); + assert_se(!path_equal_or_inode_same("/", "/.../", AT_SYMLINK_NOFOLLOW)); +} + +TEST(find_executable_full) { + char *p; + char* test_file_name; + _cleanup_close_ int fd = -EBADF; + char fn[] = "/tmp/test-XXXXXX"; + + assert_se(find_executable_full("sh", NULL, NULL, true, &p, NULL) == 0); + puts(p); + assert_se(streq(basename(p), "sh")); + free(p); + + assert_se(find_executable_full("sh", NULL, NULL, false, &p, NULL) == 0); + puts(p); + assert_se(streq(basename(p), "sh")); + free(p); + + _cleanup_free_ char *oldpath = NULL; + p = getenv("PATH"); + if (p) + assert_se(oldpath = strdup(p)); + + assert_se(unsetenv("PATH") == 0); + + assert_se(find_executable_full("sh", NULL, NULL, true, &p, NULL) == 0); + puts(p); + assert_se(streq(basename(p), "sh")); + free(p); + + assert_se(find_executable_full("sh", NULL, NULL, false, &p, NULL) == 0); + puts(p); + assert_se(streq(basename(p), "sh")); + free(p); + + if (oldpath) + assert_se(setenv("PATH", oldpath, true) >= 0); + + assert_se((fd = mkostemp_safe(fn)) >= 0); + assert_se(fchmod(fd, 0755) >= 0); + + test_file_name = basename(fn); + + assert_se(find_executable_full(test_file_name, NULL, STRV_MAKE("/doesnotexist", "/tmp", "/bin"), false, &p, NULL) == 0); + puts(p); + assert_se(streq(p, fn)); + free(p); + + (void) unlink(fn); + assert_se(find_executable_full(test_file_name, NULL, STRV_MAKE("/doesnotexist", "/tmp", "/bin"), false, &p, NULL) == -ENOENT); +} + 
+TEST(find_executable) { + char *p; + + assert_se(find_executable("/bin/sh", &p) == 0); + puts(p); + assert_se(path_equal(p, "/bin/sh")); + free(p); + + assert_se(find_executable(saved_argv[0], &p) == 0); + puts(p); + assert_se(endswith(p, "/test-path-util")); + assert_se(path_is_absolute(p)); + free(p); + + assert_se(find_executable("sh", &p) == 0); + puts(p); + assert_se(endswith(p, "/sh")); + assert_se(path_is_absolute(p)); + free(p); + + assert_se(find_executable("/bin/touch", &p) == 0); + assert_se(streq(p, "/bin/touch")); + free(p); + + assert_se(find_executable("touch", &p) == 0); + assert_se(path_is_absolute(p)); + assert_se(streq(basename(p), "touch")); + free(p); + + assert_se(find_executable("xxxx-xxxx", &p) == -ENOENT); + assert_se(find_executable("/some/dir/xxxx-xxxx", &p) == -ENOENT); + assert_se(find_executable("/proc/filesystems", &p) == -EACCES); +} + +static void test_find_executable_exec_one(const char *path) { + _cleanup_free_ char *t = NULL; + _cleanup_close_ int fd = -EBADF; + pid_t pid; + int r; + + r = find_executable_full(path, NULL, NULL, false, &t, &fd); + + log_info_errno(r, "%s: %s → %s: %d/%m", __func__, path, t ?: "-", fd); + + assert_se(fd > STDERR_FILENO); + assert_se(path_is_absolute(t)); + if (path_is_absolute(path)) + assert_se(streq(t, path)); + + pid = fork(); + assert_se(pid >= 0); + if (pid == 0) { + r = fexecve_or_execve(fd, t, STRV_MAKE(t, "--version"), STRV_MAKE(NULL)); + log_error_errno(r, "[f]execve: %m"); + _exit(EXIT_FAILURE); + } + + assert_se(wait_for_terminate_and_check(t, pid, WAIT_LOG) == 0); +} + +TEST(find_executable_exec) { + test_find_executable_exec_one("touch"); + test_find_executable_exec_one("/bin/touch"); + + _cleanup_free_ char *script = NULL; + assert_se(get_testdata_dir("test-path-util/script.sh", &script) >= 0); + test_find_executable_exec_one(script); +} + +TEST(prefixes) { + static const char* const values[] = { + "/a/b/c/d", + "/a/b/c", + "/a/b", + "/a", + "", + NULL + }; + unsigned i; + char 
s[PATH_MAX]; + bool b; + + i = 0; + PATH_FOREACH_PREFIX_MORE(s, "/a/b/c/d") { + log_error("---%s---", s); + assert_se(streq(s, values[i++])); + } + assert_se(values[i] == NULL); + + i = 1; + PATH_FOREACH_PREFIX(s, "/a/b/c/d") { + log_error("---%s---", s); + assert_se(streq(s, values[i++])); + } + assert_se(values[i] == NULL); + + i = 0; + PATH_FOREACH_PREFIX_MORE(s, "////a////b////c///d///////") + assert_se(streq(s, values[i++])); + assert_se(values[i] == NULL); + + i = 1; + PATH_FOREACH_PREFIX(s, "////a////b////c///d///////") + assert_se(streq(s, values[i++])); + assert_se(values[i] == NULL); + + PATH_FOREACH_PREFIX(s, "////") + assert_not_reached(); + + b = false; + PATH_FOREACH_PREFIX_MORE(s, "////") { + assert_se(!b); + assert_se(streq(s, "")); + b = true; + } + assert_se(b); + + PATH_FOREACH_PREFIX(s, "") + assert_not_reached(); + + b = false; + PATH_FOREACH_PREFIX_MORE(s, "") { + assert_se(!b); + assert_se(streq(s, "")); + b = true; + } +} + +TEST(path_join) { +#define test_join(expected, ...) 
{ \ + _cleanup_free_ char *z = NULL; \ + z = path_join(__VA_ARGS__); \ + log_debug("got \"%s\", expected \"%s\"", z, expected); \ + assert_se(streq(z, expected)); \ + } + + test_join("/root/a/b/c", "/root", "/a/b", "/c"); + test_join("/root/a/b/c", "/root", "a/b", "c"); + test_join("/root/a/b/c", "/root", "/a/b", "c"); + test_join("/root/c", "/root", "/", "c"); + test_join("/root/", "/root", "/", NULL); + + test_join("/a/b/c", "", "/a/b", "/c"); + test_join("a/b/c", "", "a/b", "c"); + test_join("/a/b/c", "", "/a/b", "c"); + test_join("/c", "", "/", "c"); + test_join("/", "", "/", NULL); + + test_join("/a/b/c", NULL, "/a/b", "/c"); + test_join("a/b/c", NULL, "a/b", "c"); + test_join("/a/b/c", NULL, "/a/b", "c"); + test_join("/c", NULL, "/", "c"); + test_join("/", NULL, "/", NULL); + + test_join("", "", NULL); + test_join("", NULL, ""); + test_join("", NULL, NULL); + + test_join("foo/bar", "foo", "bar"); + test_join("foo/bar", "", "foo", "bar"); + test_join("foo/bar", NULL, "foo", NULL, "bar"); + test_join("foo/bar", "", "foo", "", "bar", ""); + test_join("foo/bar", "", "", "", "", "foo", "", "", "", "bar", "", "", ""); + + test_join("//foo///bar//", "", "/", "", "/foo/", "", "/", "", "/bar/", "", "/", ""); + test_join("/foo/bar/", "/", "foo", "/", "bar", "/"); + test_join("foo/bar/baz", "foo", "bar", "baz"); + test_join("foo/bar/baz", "foo/", "bar", "/baz"); + test_join("foo//bar//baz", "foo/", "/bar/", "/baz"); + test_join("//foo////bar////baz//", "//foo/", "///bar/", "///baz//"); +} + +TEST(path_extend) { + _cleanup_free_ char *p = NULL; + + assert_se(path_extend(&p, "foo", "bar", "baz") == p); + assert_se(streq(p, "foo/bar/baz")); + + assert_se(path_extend(&p, "foo", "bar", "baz") == p); + assert_se(streq(p, "foo/bar/baz/foo/bar/baz")); + + p = mfree(p); + assert_se(path_extend(&p, "foo") == p); + assert_se(streq(p, "foo")); + + assert_se(path_extend(&p, "/foo") == p); + assert_se(streq(p, "foo/foo")); + assert_se(path_extend(&p, "/waaaah/wahhh//") == p); + 
assert_se(streq(p, "foo/foo/waaaah/wahhh//")); /* path_extend() does not drop redundant slashes */ + assert_se(path_extend(&p, "/aaa/bbb/") == p); + assert_se(streq(p, "foo/foo/waaaah/wahhh///aaa/bbb/")); /* ... but it does not add an extra slash either */ + + assert_se(free_and_strdup(&p, "/") >= 0); + assert_se(path_extend(&p, "foo") == p); + assert_se(streq(p, "/foo")); +} + +TEST(fsck_exists) { + /* Ensure we use a sane default for PATH. */ + assert_se(unsetenv("PATH") == 0); + + /* We might or might not find one of these, so keep the test lax. */ + assert_se(fsck_exists_for_fstype("minix") >= 0); + + assert_se(fsck_exists_for_fstype("AbCdE") == 0); + assert_se(fsck_exists_for_fstype("/../bin/") == 0); +} + +static void test_path_make_relative_one(const char *from, const char *to, const char *expected) { + _cleanup_free_ char *z = NULL; + int r; + + log_info("/* %s(%s, %s) */", __func__, from, to); + + r = path_make_relative(from, to, &z); + assert_se((r >= 0) == !!expected); + assert_se(streq_ptr(z, expected)); +} + +TEST(path_make_relative) { + test_path_make_relative_one("some/relative/path", "/some/path", NULL); + test_path_make_relative_one("/some/path", "some/relative/path", NULL); + test_path_make_relative_one("/some/dotdot/../path", "/some/path", NULL); + + test_path_make_relative_one("/", "/", "."); + test_path_make_relative_one("/", "/some/path", "some/path"); + test_path_make_relative_one("/some/path", "/some/path", "."); + test_path_make_relative_one("/some/path", "/some/path/in/subdir", "in/subdir"); + test_path_make_relative_one("/some/path", "/", "../.."); + test_path_make_relative_one("/some/path", "/some/other/path", "../other/path"); + test_path_make_relative_one("/some/path/./dot", "/some/further/path", "../../further/path"); + test_path_make_relative_one("//extra.//.//./.slashes//./won't////fo.ol///anybody//", "/././/extra././/.slashes////ar.e/.just/././.fine///", "../../../ar.e/.just/.fine"); +} + +static void test_path_make_relative_parent_one(const char 
*from, const char *to, const char *expected) { + _cleanup_free_ char *z = NULL; + int r; + + log_info("/* %s(%s, %s) */", __func__, from, to); + + r = path_make_relative_parent(from, to, &z); + assert_se((r >= 0) == !!expected); + assert_se(streq_ptr(z, expected)); +} + +TEST(path_make_relative_parent) { + test_path_make_relative_parent_one("some/relative/path/hoge", "/some/path", NULL); + test_path_make_relative_parent_one("/some/path/hoge", "some/relative/path", NULL); + test_path_make_relative_parent_one("/some/dotdot/../path/hoge", "/some/path", NULL); + test_path_make_relative_parent_one("/", "/aaa", NULL); + + test_path_make_relative_parent_one("/hoge", "/", "."); + test_path_make_relative_parent_one("/hoge", "/some/path", "some/path"); + test_path_make_relative_parent_one("/some/path/hoge", "/some/path", "."); + test_path_make_relative_parent_one("/some/path/hoge", "/some/path/in/subdir", "in/subdir"); + test_path_make_relative_parent_one("/some/path/hoge", "/", "../.."); + test_path_make_relative_parent_one("/some/path/hoge", "/some/other/path", "../other/path"); + test_path_make_relative_parent_one("/some/path/./dot/hoge", "/some/further/path", "../../further/path"); + test_path_make_relative_parent_one("//extra.//.//./.slashes//./won't////fo.ol///anybody//hoge", "/././/extra././/.slashes////ar.e/.just/././.fine///", "../../../ar.e/.just/.fine"); +} + +TEST(path_strv_resolve) { + char tmp_dir[] = "/tmp/test-path-util-XXXXXX"; + _cleanup_strv_free_ char **search_dirs = NULL; + _cleanup_strv_free_ char **absolute_dirs = NULL; + + assert_se(mkdtemp(tmp_dir) != NULL); + + search_dirs = strv_new("/dir1", "/dir2", "/dir3"); + assert_se(search_dirs); + STRV_FOREACH(d, search_dirs) { + char *p = path_join(tmp_dir, *d); + assert_se(p); + assert_se(strv_push(&absolute_dirs, p) == 0); + } + + assert_se(mkdir(absolute_dirs[0], 0700) == 0); + assert_se(mkdir(absolute_dirs[1], 0700) == 0); + assert_se(symlink("dir2", absolute_dirs[2]) == 0); + + 
path_strv_resolve(search_dirs, tmp_dir); + assert_se(streq(search_dirs[0], "/dir1")); + assert_se(streq(search_dirs[1], "/dir2")); + assert_se(streq(search_dirs[2], "/dir2")); + + assert_se(rm_rf(tmp_dir, REMOVE_ROOT|REMOVE_PHYSICAL) == 0); +} + +static void test_path_startswith_one(const char *path, const char *prefix, const char *skipped, const char *expected) { + const char *p, *q; + + log_debug("/* %s(%s, %s) */", __func__, path, prefix); + + p = path_startswith(path, prefix); + assert_se(streq_ptr(p, expected)); + if (p) { + q = strjoina(skipped, p); + assert_se(streq(q, path)); + assert_se(p == path + strlen(skipped)); + } +} + +TEST(path_startswith) { + test_path_startswith_one("/foo/bar/barfoo/", "/foo", "/foo/", "bar/barfoo/"); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/", "/foo/", "bar/barfoo/"); + test_path_startswith_one("/foo/bar/barfoo/", "/", "/", "foo/bar/barfoo/"); + test_path_startswith_one("/foo/bar/barfoo/", "////", "/", "foo/bar/barfoo/"); + test_path_startswith_one("/foo/bar/barfoo/", "/foo//bar/////barfoo///", "/foo/bar/barfoo/", ""); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar/barfoo////", "/foo/bar/barfoo/", ""); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar///barfoo/", "/foo/bar/barfoo/", ""); + test_path_startswith_one("/foo/bar/barfoo/", "/foo////bar/barfoo/", "/foo/bar/barfoo/", ""); + test_path_startswith_one("/foo/bar/barfoo/", "////foo/bar/barfoo/", "/foo/bar/barfoo/", ""); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar/barfoo", "/foo/bar/barfoo/", ""); + + test_path_startswith_one("/foo/./bar///barfoo/./.", "/foo", "/foo/./", "bar///barfoo/./."); + test_path_startswith_one("/foo/./bar///barfoo/./.", "/foo/", "/foo/./", "bar///barfoo/./."); + test_path_startswith_one("/foo/./bar///barfoo/./.", "/", "/", "foo/./bar///barfoo/./."); + test_path_startswith_one("/foo/./bar///barfoo/./.", "////", "/", "foo/./bar///barfoo/./."); + test_path_startswith_one("/foo/./bar///barfoo/./.", 
"/foo//bar/////barfoo///", "/foo/./bar///barfoo/./.", ""); + test_path_startswith_one("/foo/./bar///barfoo/./.", "/foo/bar/barfoo////", "/foo/./bar///barfoo/./.", ""); + test_path_startswith_one("/foo/./bar///barfoo/./.", "/foo/bar///barfoo/", "/foo/./bar///barfoo/./.", ""); + test_path_startswith_one("/foo/./bar///barfoo/./.", "/foo////bar/barfoo/", "/foo/./bar///barfoo/./.", ""); + test_path_startswith_one("/foo/./bar///barfoo/./.", "////foo/bar/barfoo/", "/foo/./bar///barfoo/./.", ""); + test_path_startswith_one("/foo/./bar///barfoo/./.", "/foo/bar/barfoo", "/foo/./bar///barfoo/./.", ""); + + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar/barfooa/", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar/barfooa", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "/bar/foo", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "/f/b/b/", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar/barfo", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "/foo/bar/bar", NULL, NULL); + test_path_startswith_one("/foo/bar/barfoo/", "/fo", NULL, NULL); +} + +static void test_prefix_root_one(const char *r, const char *p, const char *expected) { + _cleanup_free_ char *s = NULL; + const char *t; + + assert_se(s = path_join(r, p)); + assert_se(path_equal_ptr(s, expected)); + + t = prefix_roota(r, p); + assert_se(t); + assert_se(path_equal_ptr(t, expected)); +} + +TEST(prefix_root) { + test_prefix_root_one("/", "/foo", "/foo"); + test_prefix_root_one(NULL, "/foo", "/foo"); + test_prefix_root_one("", "/foo", "/foo"); + test_prefix_root_one("///", "/foo", "/foo"); + test_prefix_root_one("/", "////foo", "/foo"); + test_prefix_root_one(NULL, "////foo", "/foo"); + test_prefix_root_one("/", "foo", "/foo"); + test_prefix_root_one("", "foo", "foo"); + test_prefix_root_one(NULL, "foo", "foo"); + + test_prefix_root_one("/foo", "/bar", "/foo/bar"); + 
test_prefix_root_one("/foo", "bar", "/foo/bar"); + test_prefix_root_one("foo", "bar", "foo/bar"); + test_prefix_root_one("/foo/", "/bar", "/foo/bar"); + test_prefix_root_one("/foo/", "//bar", "/foo/bar"); + test_prefix_root_one("/foo///", "//bar", "/foo/bar"); +} + +TEST(file_in_same_dir) { + char *t; + + assert_se(file_in_same_dir("/", "a", &t) == -EADDRNOTAVAIL); + + assert_se(file_in_same_dir("/", "/a", &t) >= 0); + assert_se(streq(t, "/a")); + free(t); + + assert_se(file_in_same_dir("", "a", &t) == -EINVAL); + + assert_se(file_in_same_dir("a/", "x", &t) >= 0); + assert_se(streq(t, "x")); + free(t); + + assert_se(file_in_same_dir("bar/foo", "bar", &t) >= 0); + assert_se(streq(t, "bar/bar")); + free(t); +} + +static void test_path_find_first_component_one( + const char *path, + bool accept_dot_dot, + char **expected, + int ret) { + + log_debug("/* %s(\"%s\", accept_dot_dot=%s) */", __func__, strnull(path), yes_no(accept_dot_dot)); + + for (const char *p = path;;) { + const char *e; + int r; + + r = path_find_first_component(&p, accept_dot_dot, &e); + if (r <= 0) { + if (r == 0) { + if (path) { + assert_se(p == path + strlen_ptr(path)); + assert_se(isempty(p)); + } else + assert_se(!p); + assert_se(!e); + } + assert_se(r == ret); + assert_se(strv_isempty(expected)); + return; + } + + assert_se(e); + assert_se(strcspn(e, "/") == (size_t) r); + assert_se(strlen_ptr(*expected) == (size_t) r); + assert_se(strneq(e, *expected++, r)); + + assert_se(p); + log_debug("p=%s", p); + if (!isempty(*expected)) + assert_se(startswith(p, *expected)); + else if (ret >= 0) { + assert_se(p == path + strlen_ptr(path)); + assert_se(isempty(p)); + } + } +} + +TEST(path_find_first_component) { + _cleanup_free_ char *hoge = NULL; + char foo[NAME_MAX * 2]; + + test_path_find_first_component_one(NULL, false, NULL, 0); + test_path_find_first_component_one("", false, NULL, 0); + test_path_find_first_component_one("/", false, NULL, 0); + test_path_find_first_component_one(".", false, NULL, 
0); + test_path_find_first_component_one("./", false, NULL, 0); + test_path_find_first_component_one("./.", false, NULL, 0); + test_path_find_first_component_one("..", false, NULL, -EINVAL); + test_path_find_first_component_one("/..", false, NULL, -EINVAL); + test_path_find_first_component_one("./..", false, NULL, -EINVAL); + test_path_find_first_component_one("////./././//.", false, NULL, 0); + test_path_find_first_component_one("a/b/c", false, STRV_MAKE("a", "b", "c"), 0); + test_path_find_first_component_one("././//.///aa/bbb//./ccc", false, STRV_MAKE("aa", "bbb", "ccc"), 0); + test_path_find_first_component_one("././//.///aa/.../../bbb//./ccc/.", false, STRV_MAKE("aa", "..."), -EINVAL); + test_path_find_first_component_one("//./aaa///.//./.bbb/..///c.//d.dd///..eeee/.", false, STRV_MAKE("aaa", ".bbb"), -EINVAL); + test_path_find_first_component_one("a/foo./b//././/", false, STRV_MAKE("a", "foo.", "b"), 0); + + test_path_find_first_component_one(NULL, true, NULL, 0); + test_path_find_first_component_one("", true, NULL, 0); + test_path_find_first_component_one("/", true, NULL, 0); + test_path_find_first_component_one(".", true, NULL, 0); + test_path_find_first_component_one("./", true, NULL, 0); + test_path_find_first_component_one("./.", true, NULL, 0); + test_path_find_first_component_one("..", true, STRV_MAKE(".."), 0); + test_path_find_first_component_one("/..", true, STRV_MAKE(".."), 0); + test_path_find_first_component_one("./..", true, STRV_MAKE(".."), 0); + test_path_find_first_component_one("////./././//.", true, NULL, 0); + test_path_find_first_component_one("a/b/c", true, STRV_MAKE("a", "b", "c"), 0); + test_path_find_first_component_one("././//.///aa/bbb//./ccc", true, STRV_MAKE("aa", "bbb", "ccc"), 0); + test_path_find_first_component_one("././//.///aa/.../../bbb//./ccc/.", true, STRV_MAKE("aa", "...", "..", "bbb", "ccc"), 0); + test_path_find_first_component_one("//./aaa///.//./.bbb/..///c.//d.dd///..eeee/.", true, STRV_MAKE("aaa", ".bbb", "..", 
/* Calls path_find_last_component() repeatedly on 'path' and compares every component it returns
 * with the next entry of 'expected' (listed last component first). 'ret' is the value the final
 * (non-positive) call is expected to return. */
static void test_path_find_last_component_one(
                const char *path,
                bool accept_dot_dot,
                char **expected,
                int ret) {

        log_debug("/* %s(\"%s\", accept_dot_dot=%s) */", __func__, strnull(path), yes_no(accept_dot_dot));

        /* 'next' starts out as NULL and is updated by path_find_last_component() on each call */
        for (const char *next = NULL;;) {
                const char *e;
                int r;

                r = path_find_last_component(path, accept_dot_dot, &next, &e);
                if (r <= 0) {
                        /* No further component: either the beginning was reached (0) or an error was returned (< 0) */
                        if (r == 0) {
                                assert_se(next == path);
                                assert_se(!e);
                        }
                        assert_se(r == ret);
                        /* All expected components must have been consumed by now */
                        assert_se(strv_isempty(expected));
                        return;
                }

                /* A positive return value is the length of the component that 'e' points to */
                assert_se(e);
                assert_se(strcspn(e, "/") == (size_t) r);
                assert_se(strlen_ptr(*expected) == (size_t) r);
                /* Note: this advances 'expected' to the following entry */
                assert_se(strneq(e, *expected++, r));

                assert_se(next);
                log_debug("path=%s\nnext=%s", path, next);
                if (!isempty(*expected)) {
                        /* More components expected: 'next' must point just behind the following expected
                         * component, somewhere inside 'path' */
                        assert_se(next < path + strlen(path));
                        assert_se(next >= path + strlen(*expected));
                        assert_se(startswith(next - strlen(*expected), *expected));
                } else if (ret >= 0)
                        /* That was the last expected component and no error is expected */
                        assert_se(next == path);
        }
}
test_path_find_last_component_one("..", false, NULL, -EINVAL); + test_path_find_last_component_one("/..", false, NULL, -EINVAL); + test_path_find_last_component_one("./..", false, NULL, -EINVAL); + test_path_find_last_component_one("////./././//.", false, NULL, 0); + test_path_find_last_component_one("a/b/c", false, STRV_MAKE("c", "b", "a"), 0); + test_path_find_last_component_one("././//.///aa./.bbb//./ccc/././/", false, STRV_MAKE("ccc", ".bbb", "aa."), 0); + test_path_find_last_component_one("././//.///aa/../.../bbb//./ccc/.", false, STRV_MAKE("ccc", "bbb", "..."), -EINVAL); + test_path_find_last_component_one("//./aaa///.//./.bbb/..///c.//d.dd///..eeee/.", false, STRV_MAKE("..eeee", "d.dd", "c."), -EINVAL); + + test_path_find_last_component_one(NULL, true, NULL, 0); + test_path_find_last_component_one("", true, NULL, 0); + test_path_find_last_component_one("/", true, NULL, 0); + test_path_find_last_component_one(".", true, NULL, 0); + test_path_find_last_component_one("./", true, NULL, 0); + test_path_find_last_component_one("./.", true, NULL, 0); + test_path_find_last_component_one("..", true, STRV_MAKE(".."), 0); + test_path_find_last_component_one("/..", true, STRV_MAKE(".."), 0); + test_path_find_last_component_one("./..", true, STRV_MAKE(".."), 0); + test_path_find_last_component_one("////./././//.", true, NULL, 0); + test_path_find_last_component_one("a/b/c", true, STRV_MAKE("c", "b", "a"), 0); + test_path_find_last_component_one("././//.///aa./.bbb//./ccc/././/", true, STRV_MAKE("ccc", ".bbb", "aa."), 0); + test_path_find_last_component_one("././//.///aa/../.../bbb//./ccc/.", true, STRV_MAKE("ccc", "bbb", "...", "..", "aa"), 0); + test_path_find_last_component_one("//./aaa///.//./.bbb/..///c.//d.dd///..eeee/.", true, STRV_MAKE("..eeee", "d.dd", "c.", "..", ".bbb", "aaa"), 0); + + memset(foo, 'a', sizeof(foo) -1); + char_array_0(foo); + + test_path_find_last_component_one(foo, false, NULL, -EINVAL); + test_path_find_last_component_one(foo, true, NULL, 
-EINVAL); + + hoge = strjoin(foo, "/a/b/c/"); + assert_se(hoge); + + test_path_find_last_component_one(hoge, false, STRV_MAKE("c", "b", "a"), -EINVAL); + test_path_find_last_component_one(hoge, true, STRV_MAKE("c", "b", "a"), -EINVAL); +} + +TEST(last_path_component) { + assert_se(last_path_component(NULL) == NULL); + assert_se(streq(last_path_component("a/b/c"), "c")); + assert_se(streq(last_path_component("a/b/c/"), "c/")); + assert_se(streq(last_path_component("/"), "/")); + assert_se(streq(last_path_component("//"), "/")); + assert_se(streq(last_path_component("///"), "/")); + assert_se(streq(last_path_component("."), ".")); + assert_se(streq(last_path_component("./."), ".")); + assert_se(streq(last_path_component("././"), "./")); + assert_se(streq(last_path_component("././/"), ".//")); + assert_se(streq(last_path_component("/foo/a"), "a")); + assert_se(streq(last_path_component("/foo/a/"), "a/")); + assert_se(streq(last_path_component(""), "")); + assert_se(streq(last_path_component("a"), "a")); + assert_se(streq(last_path_component("a/"), "a/")); + assert_se(streq(last_path_component("/a"), "a")); + assert_se(streq(last_path_component("/a/"), "a/")); +} + +static void test_path_extract_filename_one(const char *input, const char *output, int ret) { + _cleanup_free_ char *k = NULL; + int r; + + r = path_extract_filename(input, &k); + log_info("%s → %s/%s [expected: %s/%s]", + strnull(input), + strnull(k), r < 0 ? STRERROR(r) : "-", + strnull(output), ret < 0 ? 
STRERROR(ret) : "-"); + assert_se(streq_ptr(k, output)); + assert_se(r == ret); +} + +TEST(path_extract_filename) { + test_path_extract_filename_one(NULL, NULL, -EINVAL); + test_path_extract_filename_one("a/b/c", "c", 0); + test_path_extract_filename_one("a/b/c/", "c", O_DIRECTORY); + test_path_extract_filename_one("/", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("//", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("///", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("/.", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one(".", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("./", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("./.", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("././", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("././/", NULL, -EADDRNOTAVAIL); + test_path_extract_filename_one("/foo/a", "a", 0); + test_path_extract_filename_one("/foo/a/", "a", O_DIRECTORY); + test_path_extract_filename_one("", NULL, -EINVAL); + test_path_extract_filename_one("a", "a", 0); + test_path_extract_filename_one("a/", "a", O_DIRECTORY); + test_path_extract_filename_one("a/././//.", "a", O_DIRECTORY); + test_path_extract_filename_one("/a", "a", 0); + test_path_extract_filename_one("/a/", "a", O_DIRECTORY); + test_path_extract_filename_one("/a//./.", "a", O_DIRECTORY); + test_path_extract_filename_one("/////////////a/////////////", "a", O_DIRECTORY); + test_path_extract_filename_one("//./a/.///b./././.c//./d//.", "d", O_DIRECTORY); + test_path_extract_filename_one("xx/.", "xx", O_DIRECTORY); + test_path_extract_filename_one("xx/..", NULL, -EINVAL); + test_path_extract_filename_one("..", NULL, -EINVAL); + test_path_extract_filename_one("/..", NULL, -EINVAL); + test_path_extract_filename_one("../", NULL, -EINVAL); +} + +static void test_path_extract_directory_one(const char *input, const char *output, int ret) { + _cleanup_free_ char *k = NULL; + int r; + + r = path_extract_directory(input, &k); + 
log_info("%s → %s/%s [expected: %s/%s]", + strnull(input), + strnull(k), r < 0 ? STRERROR(r) : "-", + strnull(output), STRERROR(ret)); + assert_se(streq_ptr(k, output)); + assert_se(r == ret); + + /* Extra safety check: let's make sure that if we split out the filename too (and it works) the + * joined parts are identical to the original again */ + if (r >= 0) { + _cleanup_free_ char *f = NULL; + + r = path_extract_filename(input, &f); + if (r >= 0) { + _cleanup_free_ char *j = NULL; + + assert_se(j = path_join(k, f)); + assert_se(path_equal(input, j)); + } + } +} + +TEST(path_extract_directory) { + test_path_extract_directory_one(NULL, NULL, -EINVAL); + test_path_extract_directory_one("a/b/c", "a/b", 0); + test_path_extract_directory_one("a/b/c/", "a/b", 0); + test_path_extract_directory_one("/", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("//", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("///", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("/.", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one(".", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("./", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("./.", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("././", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("././/", NULL, -EADDRNOTAVAIL); + test_path_extract_directory_one("/foo/a", "/foo", 0); + test_path_extract_directory_one("/foo/a/", "/foo", 0); + test_path_extract_directory_one("", NULL, -EINVAL); + test_path_extract_directory_one("a", NULL, -EDESTADDRREQ); + test_path_extract_directory_one("a/", NULL, -EDESTADDRREQ); + test_path_extract_directory_one("a/././//.", NULL, -EDESTADDRREQ); + test_path_extract_directory_one("/a", "/", 0); + test_path_extract_directory_one("/a/", "/", 0); + test_path_extract_directory_one("/a//./.", "/", 0); + test_path_extract_directory_one("/////////////a/////////////", "/", 0); + test_path_extract_directory_one("//./a/.///b./././.c//./d//.", 
/* Checks that path_is_valid(p) equals 'ret', and that path_is_safe(p) is true exactly when the
 * path is expected to be valid and does not contain a ".." component. */
static void test_path_is_valid_and_safe_one(const char *p, bool ret) {
        bool expect_safe;

        log_debug("/* %s(\"%s\") */", __func__, strnull(p));

        assert_se(path_is_valid(p) == ret);

        /* Only look at the string if the path is expected to be valid in the first place */
        expect_safe = ret && !(streq(p, "..") ||
                               startswith(p, "../") ||
                               endswith(p, "/..") ||
                               strstr(p, "/../"));
        assert_se(path_is_safe(p) == expect_safe);
}
+ char_array_0(foo); + + test_path_is_valid_and_safe_one(foo, false); + + c = strjoina("/xxx/", foo, "/yyy"); + test_path_is_valid_and_safe_one(c, false); + + test_path_is_valid_and_safe_one("foo_bar-333", true); + test_path_is_valid_and_safe_one("o.o", true); +} + +TEST(hidden_or_backup_file) { + assert_se(hidden_or_backup_file(".hidden")); + assert_se(hidden_or_backup_file("..hidden")); + assert_se(!hidden_or_backup_file("hidden.")); + + assert_se(hidden_or_backup_file("backup~")); + assert_se(hidden_or_backup_file(".backup~")); + + assert_se(hidden_or_backup_file("lost+found")); + assert_se(hidden_or_backup_file("aquota.user")); + assert_se(hidden_or_backup_file("aquota.group")); + + assert_se(hidden_or_backup_file("test.rpmnew")); + assert_se(hidden_or_backup_file("test.dpkg-old")); + assert_se(hidden_or_backup_file("test.dpkg-remove")); + assert_se(hidden_or_backup_file("test.swp")); + + assert_se(!hidden_or_backup_file("test.rpmnew.")); + assert_se(!hidden_or_backup_file("test.dpkg-old.foo")); +} + +TEST(skip_dev_prefix) { + assert_se(streq(skip_dev_prefix("/"), "/")); + assert_se(streq(skip_dev_prefix("/dev"), "")); + assert_se(streq(skip_dev_prefix("/dev/"), "")); + assert_se(streq(skip_dev_prefix("/dev/foo"), "foo")); + assert_se(streq(skip_dev_prefix("/dev/foo/bar"), "foo/bar")); + assert_se(streq(skip_dev_prefix("//dev"), "")); + assert_se(streq(skip_dev_prefix("//dev//"), "")); + assert_se(streq(skip_dev_prefix("/dev///foo"), "foo")); + assert_se(streq(skip_dev_prefix("///dev///foo///bar"), "foo///bar")); + assert_se(streq(skip_dev_prefix("//foo"), "//foo")); + assert_se(streq(skip_dev_prefix("foo"), "foo")); +} + +TEST(empty_or_root) { + assert_se(empty_or_root(NULL)); + assert_se(empty_or_root("")); + assert_se(empty_or_root("/")); + assert_se(empty_or_root("//")); + assert_se(empty_or_root("///")); + assert_se(empty_or_root("/////////////////")); + assert_se(!empty_or_root("xxx")); + assert_se(!empty_or_root("/xxx")); + 
TEST(path_startswith_set) {
        /* Every check uses the same first and last candidate ("/foo/quux" and "/zzz"); only the
         * tested path and the middle candidate vary. */
#define CHECK_MIDDLE_CANDIDATE(path, middle, expected) \
        assert_se(streq_ptr(PATH_STARTSWITH_SET(path, "/foo/quux", middle, "/zzz"), expected))

        CHECK_MIDDLE_CANDIDATE("/foo/bar", "/foo/bar", "");
        CHECK_MIDDLE_CANDIDATE("/foo/bar", "/foo/", "bar");
        CHECK_MIDDLE_CANDIDATE("/foo/bar", "/foo", "bar");
        CHECK_MIDDLE_CANDIDATE("/foo/bar", "/", "foo/bar");
        CHECK_MIDDLE_CANDIDATE("/foo/bar", "", NULL);

        CHECK_MIDDLE_CANDIDATE("/foo/bar2", "/foo/bar", NULL);
        CHECK_MIDDLE_CANDIDATE("/foo/bar2", "/foo/", "bar2");
        CHECK_MIDDLE_CANDIDATE("/foo/bar2", "/foo", "bar2");
        CHECK_MIDDLE_CANDIDATE("/foo/bar2", "/", "foo/bar2");
        CHECK_MIDDLE_CANDIDATE("/foo/bar2", "", NULL);

        CHECK_MIDDLE_CANDIDATE("/foo2/bar", "/foo/bar", NULL);
        CHECK_MIDDLE_CANDIDATE("/foo2/bar", "/foo/", NULL);
        CHECK_MIDDLE_CANDIDATE("/foo2/bar", "/foo", NULL);
        CHECK_MIDDLE_CANDIDATE("/foo2/bar", "/", "foo2/bar");
        CHECK_MIDDLE_CANDIDATE("/foo2/bar", "", NULL);

#undef CHECK_MIDDLE_CANDIDATE
}
assert_se(streq_ptr(path_startswith_strv("/foo/bar", STRV_MAKE("/foo/quux", "", "/zzz")), NULL)); + + assert_se(streq_ptr(path_startswith_strv("/foo/bar2", STRV_MAKE("/foo/quux", "/foo/bar", "/zzz")), NULL)); + assert_se(streq_ptr(path_startswith_strv("/foo/bar2", STRV_MAKE("/foo/quux", "/foo/", "/zzz")), "bar2")); + assert_se(streq_ptr(path_startswith_strv("/foo/bar2", STRV_MAKE("/foo/quux", "/foo", "/zzz")), "bar2")); + assert_se(streq_ptr(path_startswith_strv("/foo/bar2", STRV_MAKE("/foo/quux", "/", "/zzz")), "foo/bar2")); + assert_se(streq_ptr(path_startswith_strv("/foo/bar2", STRV_MAKE("/foo/quux", "", "/zzz")), NULL)); + + assert_se(streq_ptr(path_startswith_strv("/foo2/bar", STRV_MAKE("/foo/quux", "/foo/bar", "/zzz")), NULL)); + assert_se(streq_ptr(path_startswith_strv("/foo2/bar", STRV_MAKE("/foo/quux", "/foo/", "/zzz")), NULL)); + assert_se(streq_ptr(path_startswith_strv("/foo2/bar", STRV_MAKE("/foo/quux", "/foo", "/zzz")), NULL)); + assert_se(streq_ptr(path_startswith_strv("/foo2/bar", STRV_MAKE("/foo/quux", "/", "/zzz")), "foo2/bar")); + assert_se(streq_ptr(path_startswith_strv("/foo2/bar", STRV_MAKE("/foo/quux", "", "/zzz")), NULL)); +} + +static void test_path_glob_can_match_one(const char *pattern, const char *prefix, const char *expected) { + _cleanup_free_ char *result = NULL; + + log_debug("%s(%s, %s, %s)", __func__, pattern, prefix, strnull(expected)); + + assert_se(path_glob_can_match(pattern, prefix, &result) == !!expected); + assert_se(streq_ptr(result, expected)); +} + +TEST(path_glob_can_match) { + test_path_glob_can_match_one("/foo/hoge/aaa", "/foo/hoge/aaa/bbb", NULL); + test_path_glob_can_match_one("/foo/hoge/aaa", "/foo/hoge/aaa", "/foo/hoge/aaa"); + test_path_glob_can_match_one("/foo/hoge/aaa", "/foo/hoge", "/foo/hoge/aaa"); + test_path_glob_can_match_one("/foo/hoge/aaa", "/foo", "/foo/hoge/aaa"); + test_path_glob_can_match_one("/foo/hoge/aaa", "/", "/foo/hoge/aaa"); + + test_path_glob_can_match_one("/foo/*/aaa", "/foo/hoge/aaa/bbb", 
NULL); + test_path_glob_can_match_one("/foo/*/aaa", "/foo/hoge/aaa", "/foo/hoge/aaa"); + test_path_glob_can_match_one("/foo/*/aaa", "/foo/hoge", "/foo/hoge/aaa"); + test_path_glob_can_match_one("/foo/*/aaa", "/foo", "/foo/*/aaa"); + test_path_glob_can_match_one("/foo/*/aaa", "/", "/foo/*/aaa"); + + test_path_glob_can_match_one("/foo/*/*/aaa", "/foo/xxx/yyy/aaa/bbb", NULL); + test_path_glob_can_match_one("/foo/*/*/aaa", "/foo/xxx/yyy/aaa", "/foo/xxx/yyy/aaa"); + test_path_glob_can_match_one("/foo/*/*/aaa", "/foo/xxx/yyy", "/foo/xxx/yyy/aaa"); + test_path_glob_can_match_one("/foo/*/*/aaa", "/foo/xxx", "/foo/xxx/*/aaa"); + test_path_glob_can_match_one("/foo/*/*/aaa", "/foo", "/foo/*/*/aaa"); + test_path_glob_can_match_one("/foo/*/*/aaa", "/", "/foo/*/*/aaa"); + + test_path_glob_can_match_one("/foo/*/aaa/*", "/foo/xxx/aaa/bbb/ccc", NULL); + test_path_glob_can_match_one("/foo/*/aaa/*", "/foo/xxx/aaa/bbb", "/foo/xxx/aaa/bbb"); + test_path_glob_can_match_one("/foo/*/aaa/*", "/foo/xxx/ccc", NULL); + test_path_glob_can_match_one("/foo/*/aaa/*", "/foo/xxx/aaa", "/foo/xxx/aaa/*"); + test_path_glob_can_match_one("/foo/*/aaa/*", "/foo/xxx", "/foo/xxx/aaa/*"); + test_path_glob_can_match_one("/foo/*/aaa/*", "/foo", "/foo/*/aaa/*"); + test_path_glob_can_match_one("/foo/*/aaa/*", "/", "/foo/*/aaa/*"); +} + +TEST(print_MAX) { + log_info("PATH_MAX=%zu\n" + "FILENAME_MAX=%zu\n" + "NAME_MAX=%zu", + (size_t) PATH_MAX, + (size_t) FILENAME_MAX, + (size_t) NAME_MAX); + + assert_cc(FILENAME_MAX == PATH_MAX); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-path.c b/src/test/test-path.c new file mode 100644 index 0000000..22ed88f --- /dev/null +++ b/src/test/test-path.c @@ -0,0 +1,418 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "all-units.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "manager.h" +#include "mkdir.h" +#include "path-util.h" +#include "rm-rf.h" +#include 
/* Runs the manager's event loop until 'path' is in 'path_state' and 'service' is in
 * 'service_state', both with a success result.
 *
 * 'line' is only used to tag the log output (the check_states() wrapper passes __LINE__).
 *
 * Returns 0 once the requested states are reached. If the service failed with EXIT_CGROUP and
 * ci_environment() returns NULL, the value of log_notice_errno() is returned instead (callers
 * treat a negative value as "stop this test"). Terminates the process with EXIT_TEST_SKIP on
 * "salsa-ci" in that same situation, and with EXIT_FAILURE if the states are not reached within
 * 30 seconds. */
static int _check_states(unsigned line,
                         Manager *m, Path *path, Service *service, PathState path_state, ServiceState service_state) {
        assert_se(m);
        assert_se(service);

        /* Overall deadline for reaching the requested states */
        usec_t end = now(CLOCK_MONOTONIC) + 30 * USEC_PER_SEC;

        while (path->state != path_state || service->state != service_state ||
               path->result != PATH_SUCCESS || service->result != SERVICE_SUCCESS) {

                /* Process events, waiting at most 100ms per iteration */
                assert_se(sd_event_run(m->event, 100 * USEC_PER_MSEC) >= 0);

                usec_t n = now(CLOCK_MONOTONIC);
                log_info("line %u: %s: state = %s; result = %s (left: %" PRIi64 ")",
                         line,
                         UNIT(path)->id,
                         path_state_to_string(path->state),
                         path_result_to_string(path->result),
                         (int64_t) (end - n));
                log_info("line %u: %s: state = %s; result = %s",
                         line,
                         UNIT(service)->id,
                         service_state_to_string(service->state),
                         service_result_to_string(service->result));

                if (service->state == SERVICE_FAILED && service->main_exec_status.status == EXIT_CGROUP) {
                        const char *ci = ci_environment();

                        /* On a general purpose system we may fail to start the service for reasons which are
                         * not under our control: permission limits, resource exhaustion, etc. Let's skip the
                         * test in those cases. On developer machines we require proper setup. */
                        if (!ci)
                                return log_notice_errno(SYNTHETIC_ERRNO(ECANCELED),
                                                        "Failed to start service %s, aborting test: %s/%s",
                                                        UNIT(service)->id,
                                                        service_state_to_string(service->state),
                                                        service_result_to_string(service->result));

                        /* On Salsa we can't setup cgroups so the unit always fails. The test checks if it
                         * can but continues if it cannot at the beginning, but on Salsa it fails here. */
                        if (streq(ci, "salsa-ci"))
                                exit(EXIT_TEST_SKIP);
                }

                /* Checked only after logging, so the last state seen before the timeout is in the log */
                if (n >= end) {
                        log_error("Test timeout when testing %s", UNIT(path)->id);
                        exit(EXIT_FAILURE);
                }
        }

        return 0;
}
_check_states(__LINE__, __VA_ARGS__) + +static void test_path_exists(Manager *m) { + const char *test_path = "/tmp/test-path_exists"; + Unit *unit = NULL; + Path *path = NULL; + Service *service = NULL; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-exists.path", NULL, &unit) >= 0); + + path = PATH(unit); + service = service_for_path(m, path, NULL); + + assert_se(unit_start(unit, NULL) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(touch(test_path) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + /* Service restarts if file still exists */ + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + assert_se(rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL) == 0); + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(unit_stop(unit) >= 0); +} + +static void test_path_existsglob(Manager *m) { + const char *test_path = "/tmp/test-path_existsglobFOOBAR"; + Unit *unit = NULL; + Path *path = NULL; + Service *service = NULL; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-existsglob.path", NULL, &unit) >= 0); + + path = PATH(unit); + service = service_for_path(m, path, NULL); + + assert_se(unit_start(unit, NULL) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(touch(test_path) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + /* Service restarts if file still exists */ + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + assert_se(rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL) == 0); + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + 
return; + + assert_se(unit_stop(unit) >= 0); +} + +static void test_path_changed(Manager *m) { + const char *test_path = "/tmp/test-path_changed"; + FILE *f; + Unit *unit = NULL; + Path *path = NULL; + Service *service = NULL; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-changed.path", NULL, &unit) >= 0); + + path = PATH(unit); + service = service_for_path(m, path, NULL); + + assert_se(unit_start(unit, NULL) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(touch(test_path) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + /* Service does not restart if file still exists */ + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + f = fopen(test_path, "w"); + assert_se(f); + fclose(f); + + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + (void) rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL); + assert_se(unit_stop(unit) >= 0); +} + +static void test_path_modified(Manager *m) { + _cleanup_fclose_ FILE *f = NULL; + const char *test_path = "/tmp/test-path_modified"; + Unit *unit = NULL; + Path *path = NULL; + Service *service = NULL; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-modified.path", NULL, &unit) >= 0); + + path = PATH(unit); + service = service_for_path(m, path, NULL); + + assert_se(unit_start(unit, NULL) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(touch(test_path) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + /* Service does not restart if file still exists */ + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) 
+ return; + + f = fopen(test_path, "w"); + assert_se(f); + fputs("test", f); + + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + (void) rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL); + assert_se(unit_stop(unit) >= 0); +} + +static void test_path_unit(Manager *m) { + const char *test_path = "/tmp/test-path_unit"; + Unit *unit = NULL; + Path *path = NULL; + Service *service = NULL; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-unit.path", NULL, &unit) >= 0); + + path = PATH(unit); + service = service_for_path(m, path, "path-mycustomunit.service"); + + assert_se(unit_start(unit, NULL) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(touch(test_path) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + assert_se(rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL) == 0); + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(unit_stop(unit) >= 0); +} + +static void test_path_directorynotempty(Manager *m) { + const char *test_file, *test_path = "/tmp/test-path_directorynotempty/"; + Unit *unit = NULL; + Path *path = NULL; + Service *service = NULL; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-directorynotempty.path", NULL, &unit) >= 0); + + path = PATH(unit); + service = service_for_path(m, path, NULL); + + assert_se(access(test_path, F_OK) < 0); + + assert_se(unit_start(unit, NULL) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + /* MakeDirectory default to no */ + assert_se(access(test_path, F_OK) < 0); + + assert_se(mkdir_p(test_path, 0755) >= 0); + test_file = strjoina(test_path, "test_file"); + assert_se(touch(test_file) >= 0); + if 
(check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + /* Service restarts if directory is still not empty */ + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_RUNNING, SERVICE_RUNNING) < 0) + return; + + assert_se(rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL) == 0); + assert_se(unit_stop(UNIT(service)) >= 0); + if (check_states(m, path, service, PATH_WAITING, SERVICE_DEAD) < 0) + return; + + assert_se(unit_stop(unit) >= 0); +} + +static void test_path_makedirectory_directorymode(Manager *m) { + const char *test_path = "/tmp/test-path_makedirectory/"; + Unit *unit = NULL; + struct stat s; + + assert_se(m); + + assert_se(manager_load_startable_unit_or_warn(m, "path-makedirectory.path", NULL, &unit) >= 0); + + assert_se(access(test_path, F_OK) < 0); + + assert_se(unit_start(unit, NULL) >= 0); + + /* Check if the directory has been created */ + assert_se(access(test_path, F_OK) >= 0); + + /* Check the mode we specified with DirectoryMode=0744 */ + assert_se(stat(test_path, &s) >= 0); + assert_se((s.st_mode & S_IRWXU) == 0700); + assert_se((s.st_mode & S_IRWXG) == 0040); + assert_se((s.st_mode & S_IRWXO) == 0004); + + assert_se(unit_stop(unit) >= 0); + (void) rm_rf(test_path, REMOVE_ROOT|REMOVE_PHYSICAL); +} + +int main(int argc, char *argv[]) { + static const test_function_t tests[] = { + test_path_exists, + test_path_existsglob, + test_path_changed, + test_path_modified, + test_path_unit, + test_path_directorynotempty, + test_path_makedirectory_directorymode, + NULL, + }; + + _cleanup_free_ char *test_path = NULL; + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + + umask(022); + + test_setup_logging(LOG_INFO); + + assert_se(get_testdata_dir("test-path", &test_path) >= 0); + assert_se(set_unit_path(test_path) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + for (const test_function_t *test = tests; *test; test++) { + Manager *m = NULL; + int r; + + /* We create a clean 
environment for each test */ + r = setup_test(&m); + if (r != 0) + return r; + + (*test)(m); + + shutdown_test(m); + } + + return 0; +} diff --git a/src/test/test-percent-util.c b/src/test/test-percent-util.c new file mode 100644 index 0000000..7e8e11b --- /dev/null +++ b/src/test/test-percent-util.c @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "percent-util.h" +#include "tests.h" +#include "time-util.h" + +TEST(parse_percent) { + assert_se(parse_percent("") == -EINVAL); + assert_se(parse_percent("foo") == -EINVAL); + assert_se(parse_percent("0") == -EINVAL); + assert_se(parse_percent("0.1") == -EINVAL); + assert_se(parse_percent("50") == -EINVAL); + assert_se(parse_percent("100") == -EINVAL); + assert_se(parse_percent("-1") == -EINVAL); + assert_se(parse_percent("0%") == 0); + assert_se(parse_percent("55%") == 55); + assert_se(parse_percent("100%") == 100); + assert_se(parse_percent("-7%") == -ERANGE); + assert_se(parse_percent("107%") == -ERANGE); + assert_se(parse_percent("%") == -EINVAL); + assert_se(parse_percent("%%") == -EINVAL); + assert_se(parse_percent("%1") == -EINVAL); + assert_se(parse_percent("1%%") == -EINVAL); + assert_se(parse_percent("3.2%") == -EINVAL); +} + +TEST(parse_percent_unbounded) { + assert_se(parse_percent_unbounded("101%") == 101); + assert_se(parse_percent_unbounded("400%") == 400); +} + +TEST(parse_permille) { + assert_se(parse_permille("") == -EINVAL); + assert_se(parse_permille("foo") == -EINVAL); + assert_se(parse_permille("0") == -EINVAL); + assert_se(parse_permille("50") == -EINVAL); + assert_se(parse_permille("100") == -EINVAL); + assert_se(parse_permille("-1") == -EINVAL); + assert_se(parse_permille("0.1") == -EINVAL); + assert_se(parse_permille("5%") == 50); + assert_se(parse_permille("5.5%") == 55); + assert_se(parse_permille("5.12%") == -EINVAL); + + assert_se(parse_permille("0‰") == 0); + assert_se(parse_permille("555‰") == 555); + assert_se(parse_permille("1000‰") == 1000); + 
assert_se(parse_permille("-7‰") == -ERANGE); + assert_se(parse_permille("1007‰") == -ERANGE); + assert_se(parse_permille("‰") == -EINVAL); + assert_se(parse_permille("‰‰") == -EINVAL); + assert_se(parse_permille("‰1") == -EINVAL); + assert_se(parse_permille("1‰‰") == -EINVAL); + assert_se(parse_permille("3.2‰") == -EINVAL); + assert_se(parse_permille("0.1‰") == -EINVAL); + + assert_se(parse_permille("0%") == 0); + assert_se(parse_permille("55%") == 550); + assert_se(parse_permille("55.5%") == 555); + assert_se(parse_permille("100%") == 1000); + assert_se(parse_permille("-7%") == -ERANGE); + assert_se(parse_permille("107%") == -ERANGE); + assert_se(parse_permille("%") == -EINVAL); + assert_se(parse_permille("%%") == -EINVAL); + assert_se(parse_permille("%1") == -EINVAL); + assert_se(parse_permille("1%%") == -EINVAL); + assert_se(parse_permille("3.21%") == -EINVAL); + assert_se(parse_permille("0.1%") == 1); +} + +TEST(parse_permille_unbounded) { + assert_se(parse_permille_unbounded("1001‰") == 1001); + assert_se(parse_permille_unbounded("4000‰") == 4000); + assert_se(parse_permille_unbounded("2147483647‰") == 2147483647); + assert_se(parse_permille_unbounded("2147483648‰") == -ERANGE); + assert_se(parse_permille_unbounded("4294967295‰") == -ERANGE); + assert_se(parse_permille_unbounded("4294967296‰") == -ERANGE); + + assert_se(parse_permille_unbounded("101%") == 1010); + assert_se(parse_permille_unbounded("400%") == 4000); + assert_se(parse_permille_unbounded("214748364.7%") == 2147483647); + assert_se(parse_permille_unbounded("214748364.8%") == -ERANGE); + assert_se(parse_permille_unbounded("429496729.5%") == -ERANGE); + assert_se(parse_permille_unbounded("429496729.6%") == -ERANGE); +} + +TEST(parse_permyriad) { + assert_se(parse_permyriad("") == -EINVAL); + assert_se(parse_permyriad("foo") == -EINVAL); + assert_se(parse_permyriad("0") == -EINVAL); + assert_se(parse_permyriad("50") == -EINVAL); + assert_se(parse_permyriad("100") == -EINVAL); + 
assert_se(parse_permyriad("-1") == -EINVAL); + + assert_se(parse_permyriad("0‱") == 0); + assert_se(parse_permyriad("555‱") == 555); + assert_se(parse_permyriad("1000‱") == 1000); + assert_se(parse_permyriad("-7‱") == -ERANGE); + assert_se(parse_permyriad("10007‱") == -ERANGE); + assert_se(parse_permyriad("‱") == -EINVAL); + assert_se(parse_permyriad("‱‱") == -EINVAL); + assert_se(parse_permyriad("‱1") == -EINVAL); + assert_se(parse_permyriad("1‱‱") == -EINVAL); + assert_se(parse_permyriad("3.2‱") == -EINVAL); + + assert_se(parse_permyriad("0‰") == 0); + assert_se(parse_permyriad("555.5‰") == 5555); + assert_se(parse_permyriad("1000.0‰") == 10000); + assert_se(parse_permyriad("-7‰") == -ERANGE); + assert_se(parse_permyriad("1007‰") == -ERANGE); + assert_se(parse_permyriad("‰") == -EINVAL); + assert_se(parse_permyriad("‰‰") == -EINVAL); + assert_se(parse_permyriad("‰1") == -EINVAL); + assert_se(parse_permyriad("1‰‰") == -EINVAL); + assert_se(parse_permyriad("3.22‰") == -EINVAL); + + assert_se(parse_permyriad("0%") == 0); + assert_se(parse_permyriad("55%") == 5500); + assert_se(parse_permyriad("55.5%") == 5550); + assert_se(parse_permyriad("55.50%") == 5550); + assert_se(parse_permyriad("55.53%") == 5553); + assert_se(parse_permyriad("100%") == 10000); + assert_se(parse_permyriad("-7%") == -ERANGE); + assert_se(parse_permyriad("107%") == -ERANGE); + assert_se(parse_permyriad("%") == -EINVAL); + assert_se(parse_permyriad("%%") == -EINVAL); + assert_se(parse_permyriad("%1") == -EINVAL); + assert_se(parse_permyriad("1%%") == -EINVAL); + assert_se(parse_permyriad("3.212%") == -EINVAL); +} + +TEST(parse_permyriad_unbounded) { + assert_se(parse_permyriad_unbounded("1001‱") == 1001); + assert_se(parse_permyriad_unbounded("4000‱") == 4000); + assert_se(parse_permyriad_unbounded("2147483647‱") == 2147483647); + assert_se(parse_permyriad_unbounded("2147483648‱") == -ERANGE); + assert_se(parse_permyriad_unbounded("4294967295‱") == -ERANGE); + 
assert_se(parse_permyriad_unbounded("4294967296‱") == -ERANGE); + + assert_se(parse_permyriad_unbounded("101‰") == 1010); + assert_se(parse_permyriad_unbounded("400‰") == 4000); + assert_se(parse_permyriad_unbounded("214748364.7‰") == 2147483647); + assert_se(parse_permyriad_unbounded("214748364.8‰") == -ERANGE); + assert_se(parse_permyriad_unbounded("429496729.5‰") == -ERANGE); + assert_se(parse_permyriad_unbounded("429496729.6‰") == -ERANGE); + + assert_se(parse_permyriad_unbounded("99%") == 9900); + assert_se(parse_permyriad_unbounded("40%") == 4000); + assert_se(parse_permyriad_unbounded("21474836.47%") == 2147483647); + assert_se(parse_permyriad_unbounded("21474836.48%") == -ERANGE); + assert_se(parse_permyriad_unbounded("42949672.95%") == -ERANGE); + assert_se(parse_permyriad_unbounded("42949672.96%") == -ERANGE); +} + +TEST(scale) { + /* Check some fixed values */ + assert_se(UINT32_SCALE_FROM_PERCENT(0) == 0); + assert_se(UINT32_SCALE_FROM_PERCENT(50) == UINT32_MAX/2+1); + assert_se(UINT32_SCALE_FROM_PERCENT(100) == UINT32_MAX); + + assert_se(UINT32_SCALE_FROM_PERMILLE(0) == 0); + assert_se(UINT32_SCALE_FROM_PERMILLE(500) == UINT32_MAX/2+1); + assert_se(UINT32_SCALE_FROM_PERMILLE(1000) == UINT32_MAX); + + assert_se(UINT32_SCALE_FROM_PERMYRIAD(0) == 0); + assert_se(UINT32_SCALE_FROM_PERMYRIAD(5000) == UINT32_MAX/2+1); + assert_se(UINT32_SCALE_FROM_PERMYRIAD(10000) == UINT32_MAX); + + /* Make sure there's no numeric noise on the 0%…100% scale when converting from percent and back. */ + for (int percent = 0; percent <= 100; percent++) { + log_debug("%i%% → %" PRIu32 " → %i%%", + percent, + UINT32_SCALE_FROM_PERCENT(percent), + UINT32_SCALE_TO_PERCENT(UINT32_SCALE_FROM_PERCENT(percent))); + + assert_se(UINT32_SCALE_TO_PERCENT(UINT32_SCALE_FROM_PERCENT(percent)) == percent); + } + + /* Make sure there's no numeric noise on the 0‰…1000‰ scale when converting from permille and back. 
*/ + for (int permille = 0; permille <= 1000; permille++) { + log_debug("%i‰ → %" PRIu32 " → %i‰", + permille, + UINT32_SCALE_FROM_PERMILLE(permille), + UINT32_SCALE_TO_PERMILLE(UINT32_SCALE_FROM_PERMILLE(permille))); + + assert_se(UINT32_SCALE_TO_PERMILLE(UINT32_SCALE_FROM_PERMILLE(permille)) == permille); + } + + /* Make sure there's no numeric noise on the 0‱…10000‱ scale when converting from permyriad and back. */ + for (int permyriad = 0; permyriad <= 10000; permyriad++) { + log_debug("%i‱ → %" PRIu32 " → %i‱", + permyriad, + UINT32_SCALE_FROM_PERMYRIAD(permyriad), + UINT32_SCALE_TO_PERMYRIAD(UINT32_SCALE_FROM_PERMYRIAD(permyriad))); + + assert_se(UINT32_SCALE_TO_PERMYRIAD(UINT32_SCALE_FROM_PERMYRIAD(permyriad)) == permyriad); + } +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-pretty-print.c b/src/test/test-pretty-print.c new file mode 100644 index 0000000..52b2bc8 --- /dev/null +++ b/src/test/test-pretty-print.c @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "macro.h" +#include "pretty-print.h" +#include "strv.h" +#include "tests.h" + +#define CYLON_WIDTH 6 + +static void test_draw_cylon_one(unsigned pos) { + char buf[CYLON_WIDTH + CYLON_BUFFER_EXTRA + 1]; + + log_debug("/* %s(%u) */", __func__, pos); + + assert(pos <= CYLON_WIDTH + 1); + + memset(buf, 0xff, sizeof(buf)); + draw_cylon(buf, sizeof(buf), CYLON_WIDTH, pos); + assert_se(strlen(buf) < sizeof(buf)); +} + +TEST(draw_cylon) { + bool saved = log_get_show_color(); + + log_show_color(false); + for (unsigned i = 0; i <= CYLON_WIDTH + 1; i++) + test_draw_cylon_one(i); + + log_show_color(true); + for (unsigned i = 0; i <= CYLON_WIDTH + 1; i++) + test_draw_cylon_one(i); + + log_show_color(saved); +} + +TEST(terminal_urlify) { + _cleanup_free_ char *formatted = NULL; + + assert_se(terminal_urlify("https://www.freedesktop.org/wiki/Software/systemd", "systemd homepage", &formatted) >= 0); + 
printf("Hey, consider visiting the %s right now! It is very good!\n", formatted); + + formatted = mfree(formatted); + + assert_se(terminal_urlify_path("/etc/fstab", "this link to your /etc/fstab", &formatted) >= 0); + printf("Or click on %s to have a look at it!\n", formatted); +} + +TEST(cat_files) { /* cat_files() must fail with -ENOENT for a missing file and succeed when there is nothing to print. */ + assert_se(cat_files("/no/such/file", NULL, 0) == -ENOENT); + assert_se(cat_files(NULL, NULL, 0) == 0); + + if (access("/etc/fstab", R_OK) >= 0) /* only exercised when /etc/fstab is readable */ + assert_se(cat_files("/etc/fstab", STRV_MAKE("/etc/fstab", "/etc/fstab"), 0) == 0); +} + +TEST(red_green_cross_check_mark) { /* Prints the check/cross marks; output is for visual inspection only, nothing is asserted. */ + bool b = false; + + printf("yea: <%s>\n", GREEN_CHECK_MARK()); + printf("nay: <%s>\n", RED_CROSS_MARK()); + + printf("%s → %s → %s → %s\n", + COLOR_MARK_BOOL(b), + COLOR_MARK_BOOL(!b), + COLOR_MARK_BOOL(!!b), + COLOR_MARK_BOOL(!!!b)); +} + +TEST(print_separator) { + print_separator(); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-prioq.c b/src/test/test-prioq.c new file mode 100644 index 0000000..540863c --- /dev/null +++ b/src/test/test-prioq.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "prioq.h" +#include "set.h" +#include "siphash24.h" +#include "sort-util.h" +#include "tests.h" + +#define SET_SIZE (1024*4) /* number of test items; parenthesized so the macro expands safely inside larger expressions */ + +static int unsigned_compare(const unsigned *a, const unsigned *b) { /* Comparison callback for typesafe_qsort(): orders by the pointed-to values. */ + return CMP(*a, *b); +} + +TEST(unsigned) { /* Fills a Prioq with random unsigned values and checks that they are popped in sorted order. */ + _cleanup_(prioq_freep) Prioq *q = NULL; + unsigned buffer[SET_SIZE], i, u, n; + + srand(0); /* fixed seed, so every run sees the same values */ + + assert_se(q = prioq_new(trivial_compare_func)); + + for (i = 0; i < ELEMENTSOF(buffer); i++) { + u = (unsigned) rand(); + buffer[i] = u; + assert_se(prioq_put(q, UINT_TO_PTR(u), NULL) >= 0); + + n = prioq_size(q); /* index equal to the current size; the removal below is expected to return 0 */ + assert_se(prioq_remove(q, UINT_TO_PTR(u), &n) == 0); + } + + typesafe_qsort(buffer, ELEMENTSOF(buffer), unsigned_compare); + + for (i = 0; i < ELEMENTSOF(buffer); i++) { + assert_se(prioq_size(q) == ELEMENTSOF(buffer) - i); + + u = PTR_TO_UINT(prioq_pop(q)); + assert_se(buffer[i] == u);
+ } + + assert_se(prioq_isempty(q)); +} + +struct test { + unsigned value; + unsigned idx; +}; + +static int test_compare(const struct test *x, const struct test *y) { + return CMP(x->value, y->value); +} + +static void test_hash(const struct test *x, struct siphash *state) { + siphash24_compress(&x->value, sizeof(x->value), state); +} + +DEFINE_PRIVATE_HASH_OPS(test_hash_ops, struct test, test_hash, test_compare); + +TEST(struct) { + _cleanup_(prioq_freep) Prioq *q = NULL; + _cleanup_set_free_ Set *s = NULL; + unsigned previous = 0, i; + struct test *t; + + srand(0); + + assert_se(q = prioq_new((compare_func_t) test_compare)); + assert_se(s = set_new(&test_hash_ops)); + + assert_se(prioq_peek(q) == NULL); + assert_se(prioq_peek_by_index(q, 0) == NULL); + assert_se(prioq_peek_by_index(q, 1) == NULL); + assert_se(prioq_peek_by_index(q, UINT_MAX) == NULL); + + for (i = 0; i < SET_SIZE; i++) { + assert_se(t = new0(struct test, 1)); + t->value = (unsigned) rand(); + + assert_se(prioq_put(q, t, &t->idx) >= 0); + + if (i % 4 == 0) + assert_se(set_consume(s, t) >= 0); + } + + for (i = 0; i < SET_SIZE; i++) + assert_se(prioq_peek_by_index(q, i)); + assert_se(prioq_peek_by_index(q, SET_SIZE) == NULL); + + unsigned count = 0; + PRIOQ_FOREACH_ITEM(q, t) { + assert_se(t); + count++; + } + assert_se(count == SET_SIZE); + + while ((t = set_steal_first(s))) { + assert_se(prioq_remove(q, t, &t->idx) == 1); + assert_se(prioq_remove(q, t, &t->idx) == 0); + assert_se(prioq_remove(q, t, NULL) == 0); + + free(t); + } + + for (i = 0; i < SET_SIZE * 3 / 4; i++) { + assert_se(prioq_size(q) == (SET_SIZE * 3 / 4) - i); + + assert_se(t = prioq_pop(q)); + assert_se(prioq_remove(q, t, &t->idx) == 0); + assert_se(prioq_remove(q, t, NULL) == 0); + assert_se(previous <= t->value); + + previous = t->value; + free(t); + } + + assert_se(prioq_isempty(q)); + assert_se(set_isempty(s)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-proc-cmdline.c b/src/test/test-proc-cmdline.c new file 
mode 100644 index 0000000..8b5bbb0 --- /dev/null +++ b/src/test/test-proc-cmdline.c @@ -0,0 +1,354 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "env-util.h" +#include "errno-util.h" +#include "initrd-util.h" +#include "log.h" +#include "macro.h" +#include "nulstr-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "special.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +static int obj; /* dummy object; its address is passed as userdata and verified in parse_item() */ + +static int parse_item(const char *key, const char *value, void *data) { /* Callback for proc_cmdline_parse(): checks the userdata pointer and logs the key/value pair. */ + assert_se(key); + assert_se(data == &obj); + + log_info("kernel cmdline option <%s> = <%s>", key, strna(value)); + return 0; +} + +TEST(proc_cmdline_parse) { /* Parses the command line with parse_item() as callback; only success of the call is asserted. */ + assert_se(proc_cmdline_parse(parse_item, &obj, PROC_CMDLINE_STRIP_RD_PREFIX) >= 0); +} + +TEST(proc_cmdline_override) { /* Checks that the command line set via $SYSTEMD_PROC_CMDLINE is what proc_cmdline(), proc_cmdline_strv() and proc_cmdline_get_key() report. */ + _cleanup_free_ char *line = NULL, *value = NULL; + _cleanup_strv_free_ char **args = NULL; + + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=foo_bar=quux wuff-piep=tuet zumm some_arg_with_space='foo bar' and_one_more=\"zzz aaa\"") == 0); + assert_se(putenv((char*) "SYSTEMD_EFI_OPTIONS=different") == 0); + + /* First test if the overrides for /proc/cmdline still work */ + assert_se(proc_cmdline(&line) >= 0); + assert_se(streq(line, "foo_bar=quux wuff-piep=tuet zumm some_arg_with_space='foo bar' and_one_more=\"zzz aaa\"")); + line = mfree(line); + assert_se(proc_cmdline_strv(&args) >= 0); + assert_se(strv_equal(args, STRV_MAKE("foo_bar=quux", "wuff-piep=tuet", "zumm", "some_arg_with_space=foo bar", "and_one_more=zzz aaa"))); /* the strv form carries the values without the quoting */ + args = strv_free(args); + + /* Test if parsing makes use of the override */ + assert_se(proc_cmdline_get_key("foo_bar", 0, &value) > 0 && streq_ptr(value, "quux")); + value = mfree(value); + + assert_se(proc_cmdline_get_key("some_arg_with_space", 0, &value) > 0 && streq_ptr(value, "foo bar")); + value = mfree(value); + + assert_se(proc_cmdline_get_key("and_one_more", 0, &value) > 0 && streq_ptr(value, "zzz aaa")); + value
= mfree(value); + + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=hoge") == 0); + assert_se(putenv((char*) "SYSTEMD_EFI_OPTIONS=foo_bar=quux wuff-piep=tuet zumm some_arg_with_space='foo bar' and_one_more=\"zzz aaa\"") == 0); + + assert_se(proc_cmdline(&line) >= 0); + assert_se(streq(line, "hoge")); + line = mfree(line); + assert_se(proc_cmdline_strv(&args) >= 0); + assert_se(strv_equal(args, STRV_MAKE("hoge"))); + args = strv_free(args); + +#if ENABLE_EFI + assert_se(proc_cmdline_get_key("foo_bar", 0, &value) > 0 && streq_ptr(value, "quux")); + value = mfree(value); + + assert_se(proc_cmdline_get_key("some_arg_with_space", 0, &value) > 0 && streq_ptr(value, "foo bar")); + value = mfree(value); + + assert_se(proc_cmdline_get_key("and_one_more", 0, &value) > 0 && streq_ptr(value, "zzz aaa")); + value = mfree(value); +#endif +} + +static int parse_item_given(const char *key, const char *value, void *data) { + assert_se(key); + assert_se(data); + + bool *strip = data; + + log_info("%s: option <%s> = <%s>", __func__, key, strna(value)); + if (proc_cmdline_key_streq(key, "foo_bar")) + assert_se(streq(value, "quux")); + else if (proc_cmdline_key_streq(key, "wuff-piep")) + assert_se(streq(value, "tuet ")); + else if (proc_cmdline_key_streq(key, "space")) + assert_se(streq(value, "x y z")); + else if (proc_cmdline_key_streq(key, "miepf")) + assert_se(streq(value, "uuu")); + else if (in_initrd() && *strip && proc_cmdline_key_streq(key, "zumm")) + assert_se(!value); + else if (in_initrd() && !*strip && proc_cmdline_key_streq(key, "rd.zumm")) + assert_se(!value); + else + assert_not_reached(); + + return 0; +} + +static void test_proc_cmdline_given_one(bool flip_initrd) { + log_info("/* %s (flip: %s) */", __func__, yes_no(flip_initrd)); + + if (flip_initrd) + in_initrd_force(!in_initrd()); + + bool t = true, f = false; + assert_se(proc_cmdline_parse(parse_item_given, &t, PROC_CMDLINE_STRIP_RD_PREFIX) >= 0); + assert_se(proc_cmdline_parse(parse_item_given, &f, 0) >= 0); + + if 
(flip_initrd) + in_initrd_force(!in_initrd()); +} + +TEST(proc_cmdline_given) { + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=foo_bar=quux wuff-piep=\"tuet \" rd.zumm space='x y z' miepf=\"uuu\"") == 0); + assert_se(putenv((char*) "SYSTEMD_EFI_OPTIONS=miepf=\"uuu\"") == 0); + + test_proc_cmdline_given_one(false); + /* Repeat the same thing, but now flip our ininitrdness */ + test_proc_cmdline_given_one(true); +} + +TEST(proc_cmdline_get_key) { + _cleanup_free_ char *value = NULL; + + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=foo_bar=quux wuff-piep=tuet zumm-ghh spaaace='ö ü ß' ticks=\"''\"\n\nkkk=uuu\n\n\n") == 0); + + assert_se(proc_cmdline_get_key("", 0, &value) == -EINVAL); + assert_se(proc_cmdline_get_key("abc", 0, NULL) == 0); + assert_se(proc_cmdline_get_key("abc", 0, &value) == 0 && value == NULL); + assert_se(proc_cmdline_get_key("abc", PROC_CMDLINE_VALUE_OPTIONAL, &value) == 0 && value == NULL); + + assert_se(proc_cmdline_get_key("foo_bar", 0, &value) > 0 && streq_ptr(value, "quux")); + value = mfree(value); + assert_se(proc_cmdline_get_key("foo_bar", PROC_CMDLINE_VALUE_OPTIONAL, &value) > 0 && streq_ptr(value, "quux")); + value = mfree(value); + assert_se(proc_cmdline_get_key("foo_bar", 0, NULL) == 0); + assert_se(proc_cmdline_get_key("foo-bar", 0, &value) > 0 && streq_ptr(value, "quux")); + value = mfree(value); + assert_se(proc_cmdline_get_key("foo-bar", PROC_CMDLINE_VALUE_OPTIONAL, &value) > 0 && streq_ptr(value, "quux")); + value = mfree(value); + assert_se(proc_cmdline_get_key("foo-bar", 0, NULL) == 0); + assert_se(proc_cmdline_get_key("foo-bar", PROC_CMDLINE_VALUE_OPTIONAL, NULL) == -EINVAL); + + assert_se(proc_cmdline_get_key("wuff-piep", 0, &value) > 0 && streq_ptr(value, "tuet")); + value = mfree(value); + assert_se(proc_cmdline_get_key("wuff-piep", PROC_CMDLINE_VALUE_OPTIONAL, &value) > 0 && streq_ptr(value, "tuet")); + value = mfree(value); + assert_se(proc_cmdline_get_key("wuff_piep", 0, &value) > 0 && streq_ptr(value, "tuet")); + 
value = mfree(value); + assert_se(proc_cmdline_get_key("wuff_piep", PROC_CMDLINE_VALUE_OPTIONAL, &value) > 0 && streq_ptr(value, "tuet")); + value = mfree(value); + assert_se(proc_cmdline_get_key("wuff_piep", 0, NULL) == 0); + assert_se(proc_cmdline_get_key("wuff_piep", PROC_CMDLINE_VALUE_OPTIONAL, NULL) == -EINVAL); + + assert_se(proc_cmdline_get_key("zumm-ghh", 0, &value) == 0 && value == NULL); + assert_se(proc_cmdline_get_key("zumm-ghh", PROC_CMDLINE_VALUE_OPTIONAL, &value) > 0 && value == NULL); + assert_se(proc_cmdline_get_key("zumm-ghh", 0, NULL) > 0); + assert_se(proc_cmdline_get_key("zumm_ghh", 0, &value) == 0 && value == NULL); + assert_se(proc_cmdline_get_key("zumm_ghh", PROC_CMDLINE_VALUE_OPTIONAL, &value) > 0 && value == NULL); + assert_se(proc_cmdline_get_key("zumm_ghh", 0, NULL) > 0); + + assert_se(proc_cmdline_get_key("spaaace", 0, &value) > 0 && streq_ptr(value, "ö ü ß")); + value = mfree(value); + + assert_se(proc_cmdline_get_key("ticks", 0, &value) > 0 && streq_ptr(value, "''")); + value = mfree(value); + + assert_se(proc_cmdline_get_key("kkk", 0, &value) > 0 && streq_ptr(value, "uuu")); +} + +TEST(proc_cmdline_get_bool) { + bool value = false; + + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=foo_bar bar-waldo=1 x_y-z=0 quux=miep\nda=yes\nthe=1") == 0); + assert_se(putenv((char*) "SYSTEMD_EFI_OPTIONS=") == 0); + + assert_se(proc_cmdline_get_bool("", /* flags = */ 0, &value) == -EINVAL); + assert_se(proc_cmdline_get_bool("abc", /* flags = */ 0, &value) == 0 && value == false); + assert_se(proc_cmdline_get_bool("unspecified", PROC_CMDLINE_TRUE_WHEN_MISSING, &value) == 0 && value == true); + assert_se(proc_cmdline_get_bool("foo_bar", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("foo-bar", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("bar-waldo", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("bar_waldo", /* flags = */ 0, &value) > 0 && 
value == true); + assert_se(proc_cmdline_get_bool("x_y-z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("x-y-z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("x-y_z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("x_y_z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("quux", /* flags = */ 0, &value) == -EINVAL && value == false); + assert_se(proc_cmdline_get_bool("da", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("the", /* flags = */ 0, &value) > 0 && value == true); +} + +#if ENABLE_EFI +TEST(proc_cmdline_get_bool_efi) { + bool value = false; + + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=") == 0); + assert_se(putenv((char*) "SYSTEMD_EFI_OPTIONS=foo_bar bar-waldo=1 x_y-z=0 quux=miep\nda=yes\nthe=1") == 0); + + assert_se(proc_cmdline_get_bool("", /* flags = */ 0, &value) == -EINVAL); + assert_se(proc_cmdline_get_bool("abc", /* flags = */ 0, &value) == 0 && value == false); + assert_se(proc_cmdline_get_bool("foo_bar", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("foo-bar", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("bar-waldo", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("bar_waldo", /* flags = */ 0, &value) > 0 && value == true); + assert_se(proc_cmdline_get_bool("x_y-z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("x-y-z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("x-y_z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("x_y_z", /* flags = */ 0, &value) > 0 && value == false); + assert_se(proc_cmdline_get_bool("quux", /* flags = */ 0, &value) == -EINVAL && value == false); + assert_se(proc_cmdline_get_bool("da", /* flags = */ 0, &value) > 0 && value == 
true); + assert_se(proc_cmdline_get_bool("the", /* flags = */ 0, &value) > 0 && value == true); +} +#endif + +TEST(proc_cmdline_get_key_many) { + _cleanup_free_ char *value1 = NULL, *value2 = NULL, *value3 = NULL, *value4 = NULL, *value5 = NULL, *value6 = NULL, *value7 = NULL; + + assert_se(putenv((char*) "SYSTEMD_PROC_CMDLINE=foo_bar=quux wuff-piep=tuet zumm SPACE='one two' doubleticks=\" aaa aaa \"\n\nzummm='\n'\n") == 0); + + assert_se(proc_cmdline_get_key_many(0, + "wuff-piep", &value3, + "foo_bar", &value1, + "idontexist", &value2, + "zumm", &value4, + "SPACE", &value5, + "doubleticks", &value6, + "zummm", &value7) == 5); + + assert_se(streq_ptr(value1, "quux")); + assert_se(!value2); + assert_se(streq_ptr(value3, "tuet")); + assert_se(!value4); + assert_se(streq_ptr(value5, "one two")); + assert_se(streq_ptr(value6, " aaa aaa ")); + assert_se(streq_ptr(value7, "\n")); +} + +TEST(proc_cmdline_key_streq) { + assert_se(proc_cmdline_key_streq("", "")); + assert_se(proc_cmdline_key_streq("a", "a")); + assert_se(!proc_cmdline_key_streq("", "a")); + assert_se(!proc_cmdline_key_streq("a", "")); + assert_se(proc_cmdline_key_streq("a", "a")); + assert_se(!proc_cmdline_key_streq("a", "b")); + assert_se(proc_cmdline_key_streq("x-y-z", "x-y-z")); + assert_se(proc_cmdline_key_streq("x-y-z", "x_y_z")); + assert_se(proc_cmdline_key_streq("x-y-z", "x-y_z")); + assert_se(proc_cmdline_key_streq("x-y-z", "x_y-z")); + assert_se(proc_cmdline_key_streq("x_y-z", "x-y_z")); + assert_se(!proc_cmdline_key_streq("x_y-z", "x-z_z")); +} + +TEST(proc_cmdline_key_startswith) { + assert_se(proc_cmdline_key_startswith("", "")); + assert_se(proc_cmdline_key_startswith("x", "")); + assert_se(!proc_cmdline_key_startswith("", "x")); + assert_se(proc_cmdline_key_startswith("x", "x")); + assert_se(!proc_cmdline_key_startswith("x", "y")); + assert_se(!proc_cmdline_key_startswith("foo-bar", "quux")); + assert_se(proc_cmdline_key_startswith("foo-bar", "foo")); + 
assert_se(proc_cmdline_key_startswith("foo-bar", "foo-bar")); + assert_se(proc_cmdline_key_startswith("foo-bar", "foo_bar")); + assert_se(proc_cmdline_key_startswith("foo-bar", "foo_")); + assert_se(!proc_cmdline_key_startswith("foo-bar", "foo_xx")); +} + +#define test_proc_cmdline_filter_pid1_args_one(nulstr, expected) /* parses nulstr into a strv, filters it and compares the result with expected */ \ + ({ \ + _cleanup_strv_free_ char **a = NULL, **b = NULL; \ + const char s[] = (nulstr); \ + \ + /* This emulates pid_get_cmdline_strv(). */ \ + assert_se(a = strv_parse_nulstr_full(s, ELEMENTSOF(s), \ + /* drop_trailing_nuls = */ true)); \ + assert_se(proc_cmdline_filter_pid1_args(a, &b) >= 0); \ + assert_se(strv_equal(b, expected)); \ + }) + +TEST(proc_cmdline_filter_pid1_args) { /* Feeds NUL-separated argument vectors to proc_cmdline_filter_pid1_args() and checks which entries are kept. */ + test_proc_cmdline_filter_pid1_args_one("systemd\0", + STRV_MAKE_EMPTY); + + /* short option */ + test_proc_cmdline_filter_pid1_args_one("systemd\0" + "-a\0" /* unknown option */ + "-abc\0" /* unknown options */ + "-h\0" /* known option */ + "-hDbs\0" /* known options */ + "-hsx\0" /* mixed (known and unknown) options */ + "-z\0drop1\0" /* option with argument */ + "-z\0-z\0accept1\0" /* the second -z is handled as argument */ + "-az\0drop2\0" /* options with argument */ + "-za\0accept2\0" /* options with argument */ + "-z\0--\0-x\0", /* "--" is handled as argument */ + STRV_MAKE("accept1", "accept2")); + + /* long option */ + test_proc_cmdline_filter_pid1_args_one("systemd\0" + "--unknown\0accept1\0" /* unknown option */ + "--system\0accept2\0" /* no argument */ + "--log-level\0drop1\0" /* required argument (separated with space) */ + "--log-level=drop2\0accept3\0" /* required argument (concatenated with '=') */ + "--log-level\0--log-level\0accept4\0" /* the second "--log-level" is handled as argument */ + "--log-level\0--\0-x\0" /* "--" is handled as argument */ + "--log-color\0--log-level\0drop3\0" /* optional argument ("--log-level" is handled as another option) */ + "--log-color\0accept5\0" /* optional argument (separated with space) */ +
"--log-color=drop4\0accept6\0" /* optional argument (concatenated with '=') */ + "--log-color\0--\0" /* "--" is _not_ handled as argument, and remaining strings are accepted */ + "remaining\0-x\0--foo\0", + STRV_MAKE("accept1", "accept2", "accept3", "accept4", "accept5", "accept6", "remaining", "-x", "--foo")); + + /* test for "--" */ + test_proc_cmdline_filter_pid1_args_one("systemd\0" + "-a\0" + "--dropped\0" + "--\0" /* remaining strings are accepted */ + "-x\0" + "-abc\0" + "--hoge\0" + "accepted\0", + STRV_MAKE("-x", "-abc", "--hoge", "accepted")); + + /* test for space */ + test_proc_cmdline_filter_pid1_args_one("/usr/lib/systemd/systemd\0" + "--switched-root\0" + "--system\0" + "--deserialize\0" "30\0" /* followed with space; the literal is split because "\030" would be read as a single octal escape */ + "--deserialize=31\0" /* followed with '=' */ + "--exit-code=42\0" + "\0\0\0" + "systemd.log_level=debug\0" + "--unit\0foo.target\0" + " ' quoted '\0" + "systemd.log_target=console\0" + "\t\0" + " arg with space \0" + "3\0" + "\0\0\0", + STRV_MAKE("", "", "", "systemd.log_level=debug", " ' quoted '", "systemd.log_target=console", "\t", " arg with space ", "3")); +} + +static int intro(void) { /* Skips the whole test program when /proc/cmdline cannot be read for privilege reasons. */ + if (access("/proc/cmdline", R_OK) < 0 && ERRNO_IS_PRIVILEGE(errno)) + return log_tests_skipped("can't read /proc/cmdline"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-process-util.c b/src/test/test-process-util.c new file mode 100644 index 0000000..957e214 --- /dev/null +++ b/src/test/test-process-util.c @@ -0,0 +1,954 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +#include +#endif + +#include "alloc-util.h" +#include "architecture.h" +#include "dirent-util.h" +#include "errno-list.h" +#include "errno-util.h" +#include "fd-util.h" +#include "ioprio-util.h" +#include "log.h" +#include "macro.h" +#include "missing_sched.h" +#include "missing_syscall.h"
+#include "namespace-util.h" +#include "parse-util.h" +#include "process-util.h" +#include "procfs-util.h" +#include "rlimit-util.h" +#include "signal-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "terminal-util.h" +#include "tests.h" +#include "user-util.h" +#include "virt.h" + +static void test_pid_get_comm_one(pid_t pid) { /* Queries and logs comm, cmdline, PPID, exe, UID/GID, environment, controlling tty and $PATH of the given process. */ + struct stat st; + _cleanup_free_ char *a = NULL, *c = NULL, *d = NULL, *f = NULL, *i = NULL; + _cleanup_free_ char *env = NULL; + char path[STRLEN("/proc//comm") + DECIMAL_STR_MAX(pid_t)]; + pid_t e; + uid_t u; + gid_t g; + dev_t h; + int r; + + log_info("/* %s */", __func__); + + xsprintf(path, "/proc/"PID_FMT"/comm", pid); + + if (stat(path, &st) == 0) { /* comm is only queried when the /proc entry exists */ + assert_se(pid_get_comm(pid, &a) >= 0); + log_info("PID"PID_FMT" comm: '%s'", pid, a); + } else + log_warning("%s not exist.", path); + + assert_se(pid_get_cmdline(pid, 0, PROCESS_CMDLINE_COMM_FALLBACK, &c) >= 0); + log_info("PID"PID_FMT" cmdline: '%s'", pid, c); + + assert_se(pid_get_cmdline(pid, 8, 0, &d) >= 0); + log_info("PID"PID_FMT" cmdline truncated to 8: '%s'", pid, d); + + d = mfree(d); /* release and reset before reuse, so the _cleanup_free_ pointer never dangles */ + assert_se(pid_get_cmdline(pid, 1, 0, &d) >= 0); + log_info("PID"PID_FMT" cmdline truncated to 1: '%s'", pid, d); + + r = get_process_ppid(pid, &e); + assert_se(pid == 1 ? r == -EADDRNOTAVAIL : r >= 0); /* PID 1 is expected to report -EADDRNOTAVAIL */ + if (r >= 0) { + log_info("PID"PID_FMT" PPID: "PID_FMT, pid, e); + assert_se(e > 0); + } + + assert_se(pid_is_kernel_thread(pid) == 0 || pid != 1); + + r = get_process_exe(pid, &f); + assert_se(r >= 0 || r == -EACCES); /* -EACCES is tolerated */ + log_info("PID"PID_FMT" exe: '%s'", pid, strna(f)); + + assert_se(pid_get_uid(pid, &u) == 0); + log_info("PID"PID_FMT" UID: "UID_FMT, pid, u); + + assert_se(get_process_gid(pid, &g) == 0); + log_info("PID"PID_FMT" GID: "GID_FMT, pid, g); + + r = get_process_environ(pid, &env); + assert_se(r >= 0 || r == -EACCES); + log_info("PID"PID_FMT" strlen(environ): %zi", pid, env ?
(ssize_t)strlen(env) : (ssize_t)r); /* on failure log the returned error code; the error is reported via r, not errno */ + + if (!detect_container()) + assert_se(get_ctty_devnr(pid, &h) == -ENXIO || pid != 1); + + (void) getenv_for_pid(pid, "PATH", &i); + log_info("PID"PID_FMT" $PATH: '%s'", pid, strna(i)); +} + +TEST(pid_get_comm) { /* Tests the PID given as first command line argument if there is one, otherwise PID 1 (guarded by TEST_REQ_RUNNING_SYSTEMD) and ourselves. */ + if (saved_argc > 1) { + pid_t pid = 0; + + (void) parse_pid(saved_argv[1], &pid); /* result ignored: pid stays 0 if parsing fails */ + test_pid_get_comm_one(pid); + } else { + TEST_REQ_RUNNING_SYSTEMD(test_pid_get_comm_one(1)); + test_pid_get_comm_one(getpid()); + } +} + +static void test_pid_get_cmdline_one(pid_t pid) { /* Logs the command line of pid for each flag combination; failures are logged by error name, not asserted. */ + _cleanup_free_ char *c = NULL, *d = NULL, *e = NULL, *f = NULL, *g = NULL, *h = NULL, *joined = NULL; + _cleanup_strv_free_ char **strv_a = NULL, **strv_b = NULL; + int r; + + r = pid_get_cmdline(pid, SIZE_MAX, 0, &c); + log_info("PID "PID_FMT": %s", pid, r >= 0 ? c : errno_to_name(r)); + + r = pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &d); + log_info(" %s", r >= 0 ? d : errno_to_name(r)); + + r = pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE, &e); + log_info(" %s", r >= 0 ? e : errno_to_name(r)); + + r = pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE | PROCESS_CMDLINE_COMM_FALLBACK, &f); + log_info(" %s", r >= 0 ? f : errno_to_name(r)); + + r = pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &g); + log_info(" %s", r >= 0 ? g : errno_to_name(r)); + + r = pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX | PROCESS_CMDLINE_COMM_FALLBACK, &h); + log_info(" %s", r >= 0 ? h : errno_to_name(r)); + + r = pid_get_cmdline_strv(pid, 0, &strv_a); + if (r >= 0) + assert_se(joined = strv_join(strv_a, "\", \"")); + log_info(" \"%s\"", r >= 0 ? joined : errno_to_name(r)); + + joined = mfree(joined); + + r = pid_get_cmdline_strv(pid, PROCESS_CMDLINE_COMM_FALLBACK, &strv_b); + if (r >= 0) + assert_se(joined = strv_join(strv_b, "\", \"")); + log_info(" \"%s\"", r >= 0 ?
joined : errno_to_name(r)); +} + +TEST(pid_get_cmdline) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert_se(proc_dir_open(&d) >= 0); + + for (;;) { + pid_t pid; + + r = proc_dir_read(d, &pid); + assert_se(r >= 0); + + if (r == 0) /* EOF */ + break; + + test_pid_get_cmdline_one(pid); + } +} + +static void test_pid_get_comm_escape_one(const char *input, const char *output) { + _cleanup_free_ char *n = NULL; + + log_debug("input: <%s> — output: <%s>", input, output); + + assert_se(prctl(PR_SET_NAME, input) >= 0); + assert_se(pid_get_comm(0, &n) >= 0); + + log_debug("got: <%s>", n); + + assert_se(streq_ptr(n, output)); +} + +TEST(pid_get_comm_escape) { + _cleanup_free_ char *saved = NULL; + + assert_se(pid_get_comm(0, &saved) >= 0); + + test_pid_get_comm_escape_one("", ""); + test_pid_get_comm_escape_one("foo", "foo"); + test_pid_get_comm_escape_one("012345678901234", "012345678901234"); + test_pid_get_comm_escape_one("0123456789012345", "012345678901234"); + test_pid_get_comm_escape_one("äöüß", "\\303\\244\\303\\266\\303\\274\\303\\237"); + test_pid_get_comm_escape_one("xäöüß", "x\\303\\244\\303\\266\\303\\274\\303\\237"); + test_pid_get_comm_escape_one("xxäöüß", "xx\\303\\244\\303\\266\\303\\274\\303\\237"); + test_pid_get_comm_escape_one("xxxäöüß", "xxx\\303\\244\\303\\266\\303\\274\\303\\237"); + test_pid_get_comm_escape_one("xxxxäöüß", "xxxx\\303\\244\\303\\266\\303\\274\\303\\237"); + test_pid_get_comm_escape_one("xxxxxäöüß", "xxxxx\\303\\244\\303\\266\\303\\274\\303\\237"); + + assert_se(prctl(PR_SET_NAME, saved) >= 0); +} + +TEST(pid_is_unwaited) { + pid_t pid; + + pid = fork(); + assert_se(pid >= 0); + if (pid == 0) { + _exit(EXIT_SUCCESS); + } else { + int status; + + assert_se(waitpid(pid, &status, 0) == pid); + assert_se(pid_is_unwaited(pid) == 0); + } + assert_se(pid_is_unwaited(getpid_cached()) > 0); + assert_se(pid_is_unwaited(-1) < 0); +} + +TEST(pid_is_alive) { + pid_t pid; + + pid = fork(); + assert_se(pid >= 0); + if (pid == 0) { + 
_exit(EXIT_SUCCESS); + } else { + int status; + + assert_se(waitpid(pid, &status, 0) == pid); + assert_se(pid_is_alive(pid) == 0); + } + assert_se(pid_is_alive(getpid_cached()) > 0); + assert_se(pid_is_alive(-1) < 0); +} + +TEST(personality) { + assert_se(personality_to_string(PER_LINUX)); + assert_se(!personality_to_string(PERSONALITY_INVALID)); + + assert_se(streq(personality_to_string(PER_LINUX), architecture_to_string(native_architecture()))); + + assert_se(personality_from_string(personality_to_string(PER_LINUX)) == PER_LINUX); + assert_se(personality_from_string(architecture_to_string(native_architecture())) == PER_LINUX); + +#ifdef __x86_64__ + assert_se(streq_ptr(personality_to_string(PER_LINUX), "x86-64")); + assert_se(streq_ptr(personality_to_string(PER_LINUX32), "x86")); + + assert_se(personality_from_string("x86-64") == PER_LINUX); + assert_se(personality_from_string("x86") == PER_LINUX32); + assert_se(personality_from_string("ia64") == PERSONALITY_INVALID); + assert_se(personality_from_string(NULL) == PERSONALITY_INVALID); + + assert_se(personality_from_string(personality_to_string(PER_LINUX32)) == PER_LINUX32); +#endif +} + +TEST(pid_get_cmdline_harder) { + char path[] = "/tmp/test-cmdlineXXXXXX"; + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *line = NULL; + _cleanup_strv_free_ char **args = NULL; + pid_t pid; + int r; + + if (geteuid() != 0) { + log_info("Skipping %s: not root", __func__); + return; + } + + if (!have_namespaces()) { + log_notice("Testing without namespaces, skipping %s", __func__); + return; + } + +#if HAVE_VALGRIND_VALGRIND_H + /* valgrind patches open(/proc//cmdline) + * so, test_pid_get_cmdline_harder fails always + * See https://github.com/systemd/systemd/pull/3555#issuecomment-226564908 */ + if (RUNNING_ON_VALGRIND) { + log_info("Skipping %s: running on valgrind", __func__); + return; + } +#endif + + pid = fork(); + if (pid > 0) { + siginfo_t si; + + (void) wait_for_terminate(pid, &si); + + assert_se(si.si_code == 
CLD_EXITED); + assert_se(si.si_status == 0); + + return; + } + + assert_se(pid == 0); + + r = detach_mount_namespace(); + if (r < 0) { + log_warning_errno(r, "detach mount namespace failed: %m"); + assert_se(ERRNO_IS_PRIVILEGE(r)); + return; + } + + fd = mkostemp(path, O_CLOEXEC); + assert_se(fd >= 0); + + /* Note that we don't unmount the following bind-mount at the end of the test because the kernel + * will clear up its /proc/PID/ hierarchy automatically as soon as the test stops. */ + if (mount(path, "/proc/self/cmdline", "bind", MS_BIND, NULL) < 0) { + /* This happens under selinux… Abort the test in this case. */ + log_warning_errno(errno, "mount(..., \"/proc/self/cmdline\", \"bind\", ...) failed: %m"); + assert_se(IN_SET(errno, EPERM, EACCES)); + return; + } + + /* Set RLIMIT_STACK to infinity to test we don't try to allocate unnecessarily large values to read + * the cmdline. */ + if (setrlimit(RLIMIT_STACK, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0) + log_warning("Testing without RLIMIT_STACK=infinity"); + + assert_se(unlink(path) >= 0); + + assert_se(prctl(PR_SET_NAME, "testa") >= 0); + + assert_se(pid_get_cmdline(0, SIZE_MAX, 0, &line) == -ENOENT); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "[testa]")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_QUOTE, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "\"[testa]\"")); /* quoting is enabled here */ + line = mfree(line); + + assert_se(pid_get_cmdline(0, 0, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 1, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 2, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[…")); + line = 
mfree(line); + + assert_se(pid_get_cmdline(0, 3, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[t…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 4, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[te…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 5, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[tes…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 6, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[test…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 7, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[testa]")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 8, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "[testa]")); + line = mfree(line); + + assert_se(pid_get_cmdline_strv(0, PROCESS_CMDLINE_COMM_FALLBACK, &args) >= 0); + assert_se(strv_equal(args, STRV_MAKE("[testa]"))); + args = strv_free(args); + + /* Test with multiple arguments that don't require quoting */ + + assert_se(write(fd, "foo\0bar", 8) == 8); + + assert_se(pid_get_cmdline(0, SIZE_MAX, 0, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + assert_se(streq(line, "foo bar")); + line = mfree(line); + + assert_se(pid_get_cmdline_strv(0, PROCESS_CMDLINE_COMM_FALLBACK, &args) >= 0); + assert_se(strv_equal(args, STRV_MAKE("foo", "bar"))); + args = strv_free(args); + + assert_se(write(fd, "quux", 4) == 4); + assert_se(pid_get_cmdline(0, SIZE_MAX, 0, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar quux")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar quux")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 1, 
PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 2, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "f…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 3, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "fo…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 4, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 5, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo …")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 6, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo b…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 7, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo ba…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 8, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 9, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar …")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 10, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar q…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 11, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar qu…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 12, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar quux")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 
13, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar quux")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 14, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar quux")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 1000, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "foo bar quux")); + line = mfree(line); + + assert_se(pid_get_cmdline_strv(0, PROCESS_CMDLINE_COMM_FALLBACK, &args) >= 0); + assert_se(strv_equal(args, STRV_MAKE("foo", "bar", "quux"))); + args = strv_free(args); + + assert_se(ftruncate(fd, 0) >= 0); + assert_se(prctl(PR_SET_NAME, "aaaa bbbb cccc") >= 0); + + assert_se(pid_get_cmdline(0, SIZE_MAX, 0, &line) == -ENOENT); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "[aaaa bbbb cccc]")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 10, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "[aaaa bbb…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 11, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "[aaaa bbbb…")); + line = mfree(line); + + assert_se(pid_get_cmdline(0, 12, PROCESS_CMDLINE_COMM_FALLBACK, &line) >= 0); + log_debug("'%s'", line); + assert_se(streq(line, "[aaaa bbbb …")); + line = mfree(line); + + assert_se(pid_get_cmdline_strv(0, PROCESS_CMDLINE_COMM_FALLBACK, &args) >= 0); + assert_se(strv_equal(args, STRV_MAKE("[aaaa bbbb cccc]"))); + args = strv_free(args); + + /* Test with multiple arguments that do require quoting */ + +#define CMDLINE1 "foo\0'bar'\0\"bar$\"\0x y z\0!``\0" +#define EXPECT1 "foo \"'bar'\" \"\\\"bar\\$\\\"\" \"x y z\" \"!\\`\\`\"" +#define EXPECT1p "foo $'\\'bar\\'' $'\"bar$\"' $'x y z' $'!``'" +#define EXPECT1v STRV_MAKE("foo", "'bar'", 
"\"bar$\"", "x y z", "!``") + + assert_se(lseek(fd, SEEK_SET, 0) == 0); + assert_se(write(fd, CMDLINE1, sizeof CMDLINE1) == sizeof CMDLINE1); + assert_se(ftruncate(fd, sizeof CMDLINE1) == 0); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_QUOTE, &line) >= 0); + log_debug("got: ==%s==", line); + log_debug("exp: ==%s==", EXPECT1); + assert_se(streq(line, EXPECT1)); + line = mfree(line); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &line) >= 0); + log_debug("got: ==%s==", line); + log_debug("exp: ==%s==", EXPECT1p); + assert_se(streq(line, EXPECT1p)); + line = mfree(line); + + assert_se(pid_get_cmdline_strv(0, 0, &args) >= 0); + assert_se(strv_equal(args, EXPECT1v)); + args = strv_free(args); + +#define CMDLINE2 "foo\0\1\2\3\0\0" +#define EXPECT2 "foo \"\\001\\002\\003\"" +#define EXPECT2p "foo $'\\001\\002\\003'" +#define EXPECT2v STRV_MAKE("foo", "\1\2\3") + + assert_se(lseek(fd, SEEK_SET, 0) == 0); + assert_se(write(fd, CMDLINE2, sizeof CMDLINE2) == sizeof CMDLINE2); + assert_se(ftruncate(fd, sizeof CMDLINE2) == 0); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_QUOTE, &line) >= 0); + log_debug("got: ==%s==", line); + log_debug("exp: ==%s==", EXPECT2); + assert_se(streq(line, EXPECT2)); + line = mfree(line); + + assert_se(pid_get_cmdline(0, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &line) >= 0); + log_debug("got: ==%s==", line); + log_debug("exp: ==%s==", EXPECT2p); + assert_se(streq(line, EXPECT2p)); + line = mfree(line); + + assert_se(pid_get_cmdline_strv(0, 0, &args) >= 0); + assert_se(strv_equal(args, EXPECT2v)); + args = strv_free(args); + + safe_close(fd); + _exit(EXIT_SUCCESS); +} + +TEST(getpid_cached) { + siginfo_t si; + pid_t a, b, c, d, e, f, child; + + a = raw_getpid(); + b = getpid_cached(); + c = getpid(); + + assert_se(a == b && a == c); + + child = fork(); + assert_se(child >= 0); + + if (child == 0) { + /* In child */ + a = raw_getpid(); + b = getpid_cached(); + c = getpid(); + + assert_se(a == b && a 
== c); + _exit(EXIT_SUCCESS); + } + + d = raw_getpid(); + e = getpid_cached(); + f = getpid(); + + assert_se(a == d && a == e && a == f); + + assert_se(wait_for_terminate(child, &si) >= 0); + assert_se(si.si_status == 0); + assert_se(si.si_code == CLD_EXITED); +} + +TEST(getpid_measure) { + usec_t t, q; + + unsigned long long iterations = slow_tests_enabled() ? 1000000 : 1000; + + log_info("/* %s (%llu iterations) */", __func__, iterations); + + t = now(CLOCK_MONOTONIC); + for (unsigned long long i = 0; i < iterations; i++) + (void) getpid(); + q = now(CLOCK_MONOTONIC) - t; + + log_info(" glibc getpid(): %lf μs each", (double) q / iterations); + + iterations *= 50; /* _cached() is about 50 times faster, so we need more iterations */ + + t = now(CLOCK_MONOTONIC); + for (unsigned long long i = 0; i < iterations; i++) + (void) getpid_cached(); + q = now(CLOCK_MONOTONIC) - t; + + log_info("getpid_cached(): %lf μs each", (double) q / iterations); +} + +TEST(safe_fork) { + siginfo_t status; + pid_t pid; + int r; + + BLOCK_SIGNALS(SIGCHLD); + + r = safe_fork("(test-child)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_REOPEN_LOG, &pid); + assert_se(r >= 0); + + if (r == 0) { + /* child */ + usleep_safe(100 * USEC_PER_MSEC); + + _exit(88); + } + + assert_se(wait_for_terminate(pid, &status) >= 0); + assert_se(status.si_code == CLD_EXITED); + assert_se(status.si_status == 88); +} + +TEST(pid_to_ptr) { + assert_se(PTR_TO_PID(NULL) == 0); + assert_se(PID_TO_PTR(0) == NULL); + + assert_se(PTR_TO_PID(PID_TO_PTR(1)) == 1); + assert_se(PTR_TO_PID(PID_TO_PTR(2)) == 2); + assert_se(PTR_TO_PID(PID_TO_PTR(-1)) == -1); + assert_se(PTR_TO_PID(PID_TO_PTR(-2)) == -2); + + assert_se(PTR_TO_PID(PID_TO_PTR(INT16_MAX)) == INT16_MAX); + assert_se(PTR_TO_PID(PID_TO_PTR(INT16_MIN)) == INT16_MIN); + + assert_se(PTR_TO_PID(PID_TO_PTR(INT32_MAX)) == INT32_MAX); + assert_se(PTR_TO_PID(PID_TO_PTR(INT32_MIN)) == INT32_MIN); +} + +static void 
test_ioprio_class_from_to_string_one(const char *val, int expected, int normalized) { + assert_se(ioprio_class_from_string(val) == expected); + if (expected >= 0) { + _cleanup_free_ char *s = NULL; + unsigned ret; + int combined; + + assert_se(ioprio_class_to_string_alloc(expected, &s) == 0); + /* We sometimes get a class number and sometimes a name back */ + assert_se(streq(s, val) || + safe_atou(val, &ret) == 0); + + /* Make sure normalization works, i.e. NONE → BE gets normalized */ + combined = ioprio_normalize(ioprio_prio_value(expected, 0)); + assert_se(ioprio_prio_class(combined) == normalized); + assert_se(expected != IOPRIO_CLASS_NONE || ioprio_prio_data(combined) == 4); + } +} + +TEST(ioprio_class_from_to_string) { + test_ioprio_class_from_to_string_one("none", IOPRIO_CLASS_NONE, IOPRIO_CLASS_BE); + test_ioprio_class_from_to_string_one("realtime", IOPRIO_CLASS_RT, IOPRIO_CLASS_RT); + test_ioprio_class_from_to_string_one("best-effort", IOPRIO_CLASS_BE, IOPRIO_CLASS_BE); + test_ioprio_class_from_to_string_one("idle", IOPRIO_CLASS_IDLE, IOPRIO_CLASS_IDLE); + test_ioprio_class_from_to_string_one("0", IOPRIO_CLASS_NONE, IOPRIO_CLASS_BE); + test_ioprio_class_from_to_string_one("1", 1, 1); + test_ioprio_class_from_to_string_one("7", 7, 7); + test_ioprio_class_from_to_string_one("8", 8, 8); + test_ioprio_class_from_to_string_one("9", -EINVAL, -EINVAL); + test_ioprio_class_from_to_string_one("-1", -EINVAL, -EINVAL); +} + +TEST(setpriority_closest) { + int r; + + r = safe_fork("(test-setprio)", + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL); + assert_se(r >= 0); + + if (r == 0) { + bool full_test; + int p, q; + /* child */ + + /* rlimit of 30 equals nice level of -10 */ + if (setrlimit(RLIMIT_NICE, &RLIMIT_MAKE_CONST(30)) < 0) { + /* If this fails we are probably unprivileged or in a userns of some kind, let's skip + * the full test */ + assert_se(ERRNO_IS_PRIVILEGE(errno)); + full_test = false; + } else { + /* However, if 
the hard limit was above 30, setrlimit would succeed unprivileged, so + * check if the UID/GID can be changed before enabling the full test. */ + if (setresgid(GID_NOBODY, GID_NOBODY, GID_NOBODY) < 0) { + assert_se(ERRNO_IS_PRIVILEGE(errno)); + full_test = false; + } else if (setresuid(UID_NOBODY, UID_NOBODY, UID_NOBODY) < 0) { + assert_se(ERRNO_IS_PRIVILEGE(errno)); + full_test = false; + } else + full_test = true; + } + + errno = 0; + p = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0); + + /* It should always be possible to set our nice level to the current one */ + assert_se(setpriority_closest(p) > 0); + + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && p == q); + + /* It should also be possible to set the nice level to one higher */ + if (p < PRIO_MAX-1) { + assert_se(setpriority_closest(++p) > 0); + + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && p == q); + } + + /* It should also be possible to set the nice level to two higher */ + if (p < PRIO_MAX-1) { + assert_se(setpriority_closest(++p) > 0); + + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && p == q); + } + + if (full_test) { + /* These two should work, given the RLIMIT_NICE we set above */ + assert_se(setpriority_closest(-10) > 0); + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && q == -10); + + assert_se(setpriority_closest(-9) > 0); + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && q == -9); + + /* This should succeed but should be clamped to the limit */ + assert_se(setpriority_closest(-11) == 0); + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && q == -10); + + assert_se(setpriority_closest(-8) > 0); + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + assert_se(errno == 0 && q == -8); + + /* This should succeed but should be clamped to the limit */ + assert_se(setpriority_closest(-12) == 0); + errno = 0; + q = getpriority(PRIO_PROCESS, 0); + 
assert_se(errno == 0 && q == -10); + } + + _exit(EXIT_SUCCESS); + } +} + +TEST(get_process_ppid) { + uint64_t limit; + int r; + + assert_se(get_process_ppid(1, NULL) == -EADDRNOTAVAIL); + + /* the process with the PID above the global limit definitely doesn't exist. Verify that */ + assert_se(procfs_get_pid_max(&limit) >= 0); + log_debug("kernel.pid_max = %"PRIu64, limit); + + if (limit < INT_MAX) { + r = get_process_ppid(limit + 1, NULL); + log_debug_errno(r, "get_process_limit(%"PRIu64") → %d/%m", limit + 1, r); + assert(r == -ESRCH); + } + + for (pid_t pid = 0;;) { + _cleanup_free_ char *c1 = NULL, *c2 = NULL; + pid_t ppid; + + r = get_process_ppid(pid, &ppid); + if (r == -EADDRNOTAVAIL) { + log_info("No further parent PID"); + break; + } + + assert_se(r >= 0); + + assert_se(pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &c1) >= 0); + assert_se(pid_get_cmdline(ppid, SIZE_MAX, PROCESS_CMDLINE_COMM_FALLBACK, &c2) >= 0); + + log_info("Parent of " PID_FMT " (%s) is " PID_FMT " (%s).", pid, c1, ppid, c2); + + pid = ppid; + } +} + +TEST(set_oom_score_adjust) { + int a, b, r; + + assert_se(get_oom_score_adjust(&a) >= 0); + + r = set_oom_score_adjust(OOM_SCORE_ADJ_MIN); + assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r)); + + if (r >= 0) { + assert_se(get_oom_score_adjust(&b) >= 0); + assert_se(b == OOM_SCORE_ADJ_MIN); + } + + assert_se(set_oom_score_adjust(a) >= 0); + assert_se(get_oom_score_adjust(&b) >= 0); + assert_se(b == a); +} + +static void* dummy_thread(void *p) { + int fd = PTR_TO_FD(p); + char x; + + /* let main thread know we are ready */ + assert_se(write(fd, &(const char) { 'x' }, 1) == 1); + + /* wait for the main thread to tell us to shut down */ + assert_se(read(fd, &x, 1) == 1); + return NULL; +} + +TEST(get_process_threads) { + int r; + + /* Run this test in a child, so that we can guarantee there's exactly one thread around in the child */ + r = safe_fork("(nthreads)", 
FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_WAIT|FORK_LOG, NULL); + assert_se(r >= 0); + + if (r == 0) { + _cleanup_close_pair_ int pfd[2] = EBADF_PAIR, ppfd[2] = EBADF_PAIR; + pthread_t t, tt; + char x; + + assert_se(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, pfd) >= 0); + assert_se(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, ppfd) >= 0); + + assert_se(get_process_threads(0) == 1); + assert_se(pthread_create(&t, NULL, &dummy_thread, FD_TO_PTR(pfd[0])) == 0); + assert_se(read(pfd[1], &x, 1) == 1); + assert_se(get_process_threads(0) == 2); + assert_se(pthread_create(&tt, NULL, &dummy_thread, FD_TO_PTR(ppfd[0])) == 0); + assert_se(read(ppfd[1], &x, 1) == 1); + assert_se(get_process_threads(0) == 3); + + assert_se(write(pfd[1], &(const char) { 'x' }, 1) == 1); + assert_se(pthread_join(t, NULL) == 0); + + /* the value reported via /proc/ is decreased asynchronously, and there appears to be no nice + * way to sync on it. Hence we do the weak >= 2 check, even though == 2 is what we'd actually + * like to check here */ + assert_se(get_process_threads(0) >= 2); + + assert_se(write(ppfd[1], &(const char) { 'x' }, 1) == 1); + assert_se(pthread_join(tt, NULL) == 0); + + /* similar here */ + assert_se(get_process_threads(0) >= 1); + + _exit(EXIT_SUCCESS); + } +} + +TEST(is_reaper_process) { + int r; + + r = safe_fork("(regular)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + + assert_se(is_reaper_process() == 0); + _exit(EXIT_SUCCESS); + } + + r = safe_fork("(newpid)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + + if (unshare(CLONE_NEWPID) < 0) { + if (ERRNO_IS_PRIVILEGE(errno) || ERRNO_IS_NOT_SUPPORTED(errno)) { + log_notice("Skipping CLONE_NEWPID reaper check, lacking privileges/support"); + _exit(EXIT_SUCCESS); + } + } + + r = safe_fork("(newpid1)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r 
>= 0); + if (r == 0) { + /* grandchild, which is PID1 in a pidns */ + assert_se(getpid_cached() == 1); + assert_se(is_reaper_process() > 0); + _exit(EXIT_SUCCESS); + } + + _exit(EXIT_SUCCESS); + } + + r = safe_fork("(subreaper)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_WAIT, NULL); + assert_se(r >= 0); + if (r == 0) { + /* child */ + assert_se(make_reaper_process(true) >= 0); + + assert_se(is_reaper_process() > 0); + _exit(EXIT_SUCCESS); + } +} + +static int intro(void) { + log_show_color(true); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-procfs-util.c b/src/test/test-procfs-util.c new file mode 100644 index 0000000..2b19a98 --- /dev/null +++ b/src/test/test-procfs-util.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "errno-util.h" +#include "format-util.h" +#include "log.h" +#include "procfs-util.h" +#include "process-util.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + nsec_t nsec; + uint64_t v, pid_max, threads_max, limit; + int r; + + test_setup_logging(LOG_DEBUG); + + assert_se(procfs_cpu_get_usage(&nsec) >= 0); + log_info("Current system CPU time: %s", FORMAT_TIMESPAN(nsec/NSEC_PER_USEC, 1)); + + assert_se(procfs_memory_get_used(&v) >= 0); + log_info("Current memory usage: %s", FORMAT_BYTES(v)); + + assert_se(procfs_tasks_get_current(&v) >= 0); + log_info("Current number of tasks: %" PRIu64, v); + + pid_max = TASKS_MAX; + r = procfs_get_pid_max(&pid_max); + if (r == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(r)) + return log_tests_skipped_errno(r, "can't get pid max"); + assert(r >= 0); + log_info("kernel.pid_max: %"PRIu64, pid_max); + + threads_max = TASKS_MAX; + r = procfs_get_threads_max(&threads_max); + if (r == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(r)) + return log_tests_skipped_errno(r, "can't get threads max"); + assert(r >= 0); + log_info("kernel.threads-max: %"PRIu64, threads_max); + + limit = MIN(pid_max - (pid_max > 0), threads_max); + + 
assert_se(r >= 0); + log_info("Limit of tasks: %" PRIu64, limit); + assert_se(limit > 0); + + /* This call should never fail, as we're trying to set it to the same limit */ + assert(procfs_tasks_set_limit(limit) >= 0); + + if (limit > 100) { + log_info("Reducing limit by one to %"PRIu64"…", limit-1); + + r = procfs_tasks_set_limit(limit-1); + if (IN_SET(r, -ENOENT, -EROFS) || ERRNO_IS_PRIVILEGE(r)) + return log_tests_skipped_errno(r, "can't set tasks limit"); + assert_se(r >= 0); + + assert_se(procfs_get_pid_max(&v) >= 0); + /* We never decrease the pid_max, so it shouldn't have changed */ + assert_se(v == pid_max); + + assert_se(procfs_get_threads_max(&v) >= 0); + assert_se(v == limit-1); + + assert_se(procfs_tasks_set_limit(limit) >= 0); + + assert_se(procfs_get_pid_max(&v) >= 0); + assert_se(v == pid_max); + + assert_se(procfs_get_threads_max(&v) >= 0); + assert_se(v == limit); + } + + return 0; +} diff --git a/src/test/test-psi-util.c b/src/test/test-psi-util.c new file mode 100644 index 0000000..4ce0811 --- /dev/null +++ b/src/test/test-psi-util.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "parse-util.h" +#include "psi-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(read_mem_pressure) { + _cleanup_(unlink_tempfilep) char path[] = "/tmp/pressurereadtestXXXXXX"; + _cleanup_close_ int fd = -EBADF; + ResourcePressure rp; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + assert_se((fd = mkostemp_safe(path)) >= 0); + + assert_se(read_resource_pressure("/verylikelynonexistentpath", PRESSURE_TYPE_SOME, &rp) < 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + assert_se(write_string_file(path, "herpdederp\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + /* Pressure file with some invalid values */ + 
assert_se(write_string_file(path, "some avg10=0.22=55 avg60=0.17=8 avg300=1.11=00 total=58761459\n" + "full avg10=0.23=55 avg60=0.16=8 avg300=1.08=00 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + /* Same pressure valid values as below but with duplicate avg60 field */ + assert_se(write_string_file(path, "some avg10=0.22 avg60=0.17 avg60=0.18 avg300=1.11 total=58761459\n" + "full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + assert_se(write_string_file(path, "some avg10=0.22 avg60=0.17 avg300=1.11 total=58761459\n" + "full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) == 0); + assert_se(LOADAVG_INT_SIDE(rp.avg10) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg10) == 22); + assert_se(LOADAVG_INT_SIDE(rp.avg60) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg60) == 17); + assert_se(LOADAVG_INT_SIDE(rp.avg300) == 1); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg300) == 11); + assert_se(rp.total == 58761459); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_FULL, &rp) == 0); + assert_se(LOADAVG_INT_SIDE(rp.avg10) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg10) == 23); + assert_se(LOADAVG_INT_SIDE(rp.avg60) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg60) == 16); + assert_se(LOADAVG_INT_SIDE(rp.avg300) == 1); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg300) == 8); + assert_se(rp.total == 58464525); + + /* Pressure file with extra unsupported fields */ + assert_se(write_string_file(path, "some avg5=0.55 avg10=0.22 avg60=0.17 avg300=1.11 total=58761459\n" + "full avg10=0.23 avg60=0.16 avg300=1.08 avg600=2.00 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) == 0); + assert_se(LOADAVG_INT_SIDE(rp.avg10) == 0); + 
assert_se(LOADAVG_DECIMAL_SIDE(rp.avg10) == 22); + assert_se(LOADAVG_INT_SIDE(rp.avg60) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg60) == 17); + assert_se(LOADAVG_INT_SIDE(rp.avg300) == 1); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg300) == 11); + assert_se(rp.total == 58761459); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_FULL, &rp) == 0); + assert_se(LOADAVG_INT_SIDE(rp.avg10) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg10) == 23); + assert_se(LOADAVG_INT_SIDE(rp.avg60) == 0); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg60) == 16); + assert_se(LOADAVG_INT_SIDE(rp.avg300) == 1); + assert_se(LOADAVG_DECIMAL_SIDE(rp.avg300) == 8); + assert_se(rp.total == 58464525); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-qrcode-util.c b/src/test/test-qrcode-util.c new file mode 100644 index 0000000..221ad85 --- /dev/null +++ b/src/test/test-qrcode-util.c @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "locale-util.h" +#include "main-func.h" +#include "qrcode-util.h" +#include "tests.h" + +static int run(int argc, char **argv) { + int r; + + test_setup_logging(LOG_DEBUG); + + assert_se(setenv("SYSTEMD_COLORS", "1", 1) == 0); /* Force the qrcode to be printed */ + + r = print_qrcode(stdout, "This should say \"TEST\"", "TEST"); + if (r == -EOPNOTSUPP) + return log_tests_skipped("not supported"); + if (r < 0) + return log_error_errno(r, "Failed to print QR code: %m"); + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/test/test-random-util.c b/src/test/test-random-util.c new file mode 100644 index 0000000..e597271 --- /dev/null +++ b/src/test/test-random-util.c @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "hexdecoct.h" +#include "log.h" +#include "memory-util.h" +#include "random-util.h" +#include "terminal-util.h" +#include "tests.h" + +TEST(random_bytes) { + uint8_t buf[16] = {}; + + for (size_t i = 1; i < sizeof buf; i++) { + random_bytes(buf, i); + if (i + 1 < sizeof 
buf) + assert_se(buf[i] == 0); + + hexdump(stdout, buf, i); + } +} + +TEST(crypto_random_bytes) { + uint8_t buf[16] = {}; + + for (size_t i = 1; i < sizeof buf; i++) { + assert_se(crypto_random_bytes(buf, i) == 0); + if (i + 1 < sizeof buf) + assert_se(buf[i] == 0); + + hexdump(stdout, buf, i); + } +} + +#define TOTAL 100000 + +static void test_random_u64_range_one(unsigned mod) { + log_info("/* %s(%u) */", __func__, mod); + + unsigned max = 0, count[mod]; + zero(count); + + for (unsigned i = 0; i < TOTAL; i++) { + uint64_t x; + + x = random_u64_range(mod); + + count[x]++; + max = MAX(max, count[x]); + } + + /* Print histogram: vertical axis — value, horizontal axis — count. + * + * The expected value is always TOTAL/mod, because the distribution should be flat. The expected + * variance is TOTAL×p×(1-p), where p==1/mod, and standard deviation the root of the variance. + * Assert that the deviation from the expected value is less than 6 standard deviations. + */ + unsigned scale = 2 * max / (columns() < 20 ? 80 : columns() - 20); + double exp = (double) TOTAL / mod; + + for (size_t i = 0; i < mod; i++) { + double dev = (count[i] - exp) / sqrt(exp * (mod > 1 ? mod - 1 : 1) / mod); + log_debug("%02zu: %5u (%+.3f)%*s", + i, count[i], dev, + (int) (count[i] / scale), "x"); + + assert_se(fabs(dev) < 6); /* 6 sigma is excessive, but this check should be enough to + * identify catastrophic failure while minimizing false + * positives. 
*/ + } +} + +TEST(random_u64_range) { + for (unsigned mod = 1; mod < 29; mod++) + test_random_u64_range_one(mod); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-ratelimit.c b/src/test/test-ratelimit.c new file mode 100644 index 0000000..de208c7 --- /dev/null +++ b/src/test/test-ratelimit.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "macro.h" +#include "ratelimit.h" +#include "tests.h" +#include "time-util.h" + +TEST(ratelimit_below) { + int i; + RateLimit ratelimit = { 1 * USEC_PER_SEC, 10 }; + + for (i = 0; i < 10; i++) + assert_se(ratelimit_below(&ratelimit)); + assert_se(!ratelimit_below(&ratelimit)); + sleep(1); + for (i = 0; i < 10; i++) + assert_se(ratelimit_below(&ratelimit)); + + ratelimit = (const RateLimit) { 0, 10 }; + for (i = 0; i < 10000; i++) + assert_se(ratelimit_below(&ratelimit)); +} + +TEST(ratelimit_num_dropped) { + int i; + RateLimit ratelimit = { 1 * USEC_PER_SEC, 10 }; + + for (i = 0; i < 10; i++) { + assert_se(ratelimit_below(&ratelimit)); + assert_se(ratelimit_num_dropped(&ratelimit) == 0); + } + assert_se(!ratelimit_below(&ratelimit)); + assert_se(ratelimit_num_dropped(&ratelimit) == 1); + assert_se(!ratelimit_below(&ratelimit)); + assert_se(ratelimit_num_dropped(&ratelimit) == 2); + sleep(1); + assert_se(ratelimit_below(&ratelimit)); + assert_se(ratelimit_num_dropped(&ratelimit) == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-raw-clone.c b/src/test/test-raw-clone.c new file mode 100644 index 0000000..23ec7d1 --- /dev/null +++ b/src/test/test-raw-clone.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "errno-util.h" +#include "format-util.h" +#include "missing_syscall.h" +#include "raw-clone.h" +#include "tests.h" + +TEST(raw_clone) { + pid_t parent, pid, pid2; + + parent = getpid(); + log_info("before clone: getpid()→"PID_FMT, parent); + assert_se(raw_getpid() == parent); + + pid = raw_clone(0); + 
assert_se(pid >= 0); + + pid2 = raw_getpid(); + log_info("raw_clone: "PID_FMT" getpid()→"PID_FMT" raw_getpid()→"PID_FMT, + pid, getpid(), pid2); + if (pid == 0) { + assert_se(pid2 != parent); + _exit(EXIT_SUCCESS); + } else { + int status; + + assert_se(pid2 == parent); + waitpid(pid, &status, __WCLONE); + assert_se(WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SUCCESS); + } + + errno = 0; + assert_se(raw_clone(CLONE_FS|CLONE_NEWNS) == -1); + assert_se(errno == EINVAL || ERRNO_IS_PRIVILEGE(errno)); /* Certain container environments prohibit namespaces to us, don't fail in that case */ +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-recurse-dir.c b/src/test/test-recurse-dir.c new file mode 100644 index 0000000..8684d06 --- /dev/null +++ b/src/test/test-recurse-dir.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "fd-util.h" +#include "log.h" +#include "missing_magic.h" +#include "recurse-dir.h" +#include "strv.h" +#include "tests.h" + +static char **list_nftw = NULL; + +static int nftw_cb( + const char *fpath, + const struct stat *sb, + int typeflag, + struct FTW *ftwbuf) { + + if (ftwbuf->level == 0) /* skip top-level */ + return FTW_CONTINUE; + + switch (typeflag) { + + case FTW_F: + log_debug("ftw found %s", fpath); + assert_se(strv_extend(&list_nftw, fpath) >= 0); + break; + + case FTW_SL: + log_debug("ftw found symlink %s, ignoring.", fpath); + break; + + case FTW_D: + log_debug("ftw entering %s", fpath); + assert_se(strv_extendf(&list_nftw, "%s/", fpath) >= 0); + break; + + case FTW_DNR: + log_debug("ftw open directory failed %s", fpath); + break; + + case FTW_NS: + log_debug("ftw stat inode failed %s", fpath); + break; + + case FTW_DP: + case FTW_SLN: + default: + assert_not_reached(); + } + + return FTW_CONTINUE; +} + +static int recurse_dir_callback( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + 
+ char ***l = userdata; + + assert_se(path); + assert_se(de); + + switch (event) { + + case RECURSE_DIR_ENTRY: + assert_se(!IN_SET(de->d_type, DT_UNKNOWN, DT_DIR)); + + log_debug("found %s%s", path, + de->d_type == DT_LNK ? ", ignoring." : ""); + + if (de->d_type != DT_LNK) + assert_se(strv_extend(l, path) >= 0); + break; + + case RECURSE_DIR_ENTER: + assert_se(de->d_type == DT_DIR); + + log_debug("entering %s", path); + assert_se(strv_extendf(l, "%s/", path) >= 0); + break; + + case RECURSE_DIR_LEAVE: + log_debug("leaving %s", path); + break; + + case RECURSE_DIR_SKIP_MOUNT: + log_debug("skipping mount %s", path); + break; + + case RECURSE_DIR_SKIP_DEPTH: + log_debug("skipping depth %s", path); + break; + + case RECURSE_DIR_SKIP_OPEN_DIR_ERROR_BASE...RECURSE_DIR_SKIP_OPEN_DIR_ERROR_MAX: + log_debug_errno(event - RECURSE_DIR_SKIP_OPEN_DIR_ERROR_BASE, "failed to open dir %s: %m", path); + break; + + case RECURSE_DIR_SKIP_OPEN_INODE_ERROR_BASE...RECURSE_DIR_SKIP_OPEN_INODE_ERROR_MAX: + log_debug_errno(event - RECURSE_DIR_SKIP_OPEN_INODE_ERROR_BASE, "failed to open inode %s: %m", path); + break; + + case RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE...RECURSE_DIR_SKIP_STAT_INODE_ERROR_MAX: + log_debug_errno(event - RECURSE_DIR_SKIP_STAT_INODE_ERROR_BASE, "failed to stat inode %s: %m", path); + break; + + default: + assert_not_reached(); + } + + return RECURSE_DIR_CONTINUE; +} + +int main(int argc, char *argv[]) { + _cleanup_strv_free_ char **list_recurse_dir = NULL; + const char *p; + usec_t t1, t2, t3, t4; + _cleanup_close_ int fd = -EBADF; + + log_show_color(true); + test_setup_logging(LOG_INFO); + + if (argc > 1) + p = argv[1]; + else + p = "/usr/share/man"; /* something hopefully reasonably stable while we run (and limited in size) */ + + fd = open(p, O_DIRECTORY|O_CLOEXEC); + if (fd < 0 && errno == ENOENT) + return log_tests_skipped_errno(errno, "Couldn't open directory %s", p); + assert_se(fd >= 0); + + /* If the test directory is on an overlayfs then files and their 
directory may return different + * st_dev in stat results, which confuses nftw into thinking they're on different filesystems and + * won't return the result when the FTW_MOUNT flag is set. */ + if (fd_is_fs_type(fd, OVERLAYFS_SUPER_MAGIC)) + return log_tests_skipped("nftw mountpoint detection produces false-positives on overlayfs"); + + /* Enumerate the specified dirs in full, once via nftw(), and once via recurse_dir(), and ensure the + * results are identical. nftw() sometimes skips symlinks (see + * https://github.com/systemd/systemd/issues/29603), so ignore them to avoid bogus errors. */ + + t1 = now(CLOCK_MONOTONIC); + assert_se(recurse_dir(fd, p, 0, UINT_MAX, RECURSE_DIR_SORT|RECURSE_DIR_ENSURE_TYPE|RECURSE_DIR_SAME_MOUNT, recurse_dir_callback, &list_recurse_dir) >= 0); + t2 = now(CLOCK_MONOTONIC); + + t3 = now(CLOCK_MONOTONIC); + assert_se(nftw(p, nftw_cb, 64, FTW_PHYS|FTW_MOUNT) >= 0); + t4 = now(CLOCK_MONOTONIC); + + log_info("recurse_dir(): %s – nftw(): %s", FORMAT_TIMESPAN(t2 - t1, 1), FORMAT_TIMESPAN(t4 - t3, 1)); + + strv_sort(list_recurse_dir); + strv_sort(list_nftw); + + for (size_t i = 0;; i++) { + const char *a = list_nftw ? list_nftw[i] : NULL, + *b = list_recurse_dir ? 
list_recurse_dir[i] : NULL; + + if (!streq_ptr(a, b)) { + log_error("entry %zu different: %s vs %s", i, strna(a), strna(b)); + assert_not_reached(); + } + + if (!a) + break; + } + + list_nftw = strv_free(list_nftw); + return 0; +} diff --git a/src/test/test-replace-var.c b/src/test/test-replace-var.c new file mode 100644 index 0000000..f861b27 --- /dev/null +++ b/src/test/test-replace-var.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "macro.h" +#include "replace-var.h" +#include "string-util.h" +#include "tests.h" + +static char *lookup(const char *variable, void *userdata) { + return strjoin("<<<", variable, ">>>"); +} + +TEST(replace_var) { + char *r; + + assert_se(r = replace_var("@@@foobar@xyz@HALLO@foobar@test@@testtest@TEST@...@@@", lookup, NULL)); + puts(r); + assert_se(streq(r, "@@@foobar@xyz<<>>foobar@test@@testtest<<>>...@@@")); + free(r); +} + +TEST(strreplace) { + char *r; + + assert_se(r = strreplace("XYZFFFFXYZFFFFXYZ", "XYZ", "ABC")); + puts(r); + assert_se(streq(r, "ABCFFFFABCFFFFABC")); + free(r); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-rlimit-util.c b/src/test/test-rlimit-util.c new file mode 100644 index 0000000..86d0c04 --- /dev/null +++ b/src/test/test-rlimit-util.c @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "capability-util.h" +#include "macro.h" +#include "missing_resource.h" +#include "rlimit-util.h" +#include "string-util.h" +#include "tests.h" +#include "time-util.h" + +static void test_rlimit_parse_format_one(int resource, const char *string, rlim_t soft, rlim_t hard, int ret, const char *formatted) { + _cleanup_free_ char *f = NULL; + struct rlimit rl = { + .rlim_cur = 4711, + .rlim_max = 4712, + }, rl2 = { + .rlim_cur = 4713, + .rlim_max = 4714 + }; + + assert_se(rlimit_parse(resource, string, &rl) == ret); + if (ret < 0) + return; + + assert_se(rl.rlim_cur == soft); + assert_se(rl.rlim_max == 
hard); + + assert_se(rlimit_format(&rl, &f) >= 0); + assert_se(streq(formatted, f)); + + assert_se(rlimit_parse(resource, formatted, &rl2) >= 0); + assert_se(memcmp(&rl, &rl2, sizeof(struct rlimit)) == 0); +} + +TEST(rlimit_parse_format) { + test_rlimit_parse_format_one(RLIMIT_NOFILE, "4:5", 4, 5, 0, "4:5"); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "6", 6, 6, 0, "6"); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "infinity", RLIM_INFINITY, RLIM_INFINITY, 0, "infinity"); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "infinity:infinity", RLIM_INFINITY, RLIM_INFINITY, 0, "infinity"); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "8:infinity", 8, RLIM_INFINITY, 0, "8:infinity"); + test_rlimit_parse_format_one(RLIMIT_CPU, "25min:13h", (25*USEC_PER_MINUTE) / USEC_PER_SEC, (13*USEC_PER_HOUR) / USEC_PER_SEC, 0, "1500:46800"); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "", 0, 0, -EINVAL, NULL); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "5:4", 0, 0, -EILSEQ, NULL); + test_rlimit_parse_format_one(RLIMIT_NOFILE, "5:4:3", 0, 0, -EINVAL, NULL); + test_rlimit_parse_format_one(RLIMIT_NICE, "20", 20, 20, 0, "20"); + test_rlimit_parse_format_one(RLIMIT_NICE, "40", 40, 40, 0, "40"); + test_rlimit_parse_format_one(RLIMIT_NICE, "41", 41, 41, -ERANGE, "41"); + test_rlimit_parse_format_one(RLIMIT_NICE, "0", 0, 0, 0, "0"); + test_rlimit_parse_format_one(RLIMIT_NICE, "-7", 27, 27, 0, "27"); + test_rlimit_parse_format_one(RLIMIT_NICE, "-20", 40, 40, 0, "40"); + test_rlimit_parse_format_one(RLIMIT_NICE, "-21", 41, 41, -ERANGE, "41"); + test_rlimit_parse_format_one(RLIMIT_NICE, "-0", 20, 20, 0, "20"); + test_rlimit_parse_format_one(RLIMIT_NICE, "+7", 13, 13, 0, "13"); + test_rlimit_parse_format_one(RLIMIT_NICE, "+19", 1, 1, 0, "1"); + test_rlimit_parse_format_one(RLIMIT_NICE, "+20", 0, 0, -ERANGE, "0"); + test_rlimit_parse_format_one(RLIMIT_NICE, "+0", 20, 20, 0, "20"); +} + +TEST(rlimit_from_string) { + assert_se(rlimit_from_string("NOFILE") == RLIMIT_NOFILE); + 
assert_se(rlimit_from_string("LimitNOFILE") == -EINVAL); + assert_se(rlimit_from_string("RLIMIT_NOFILE") == -EINVAL); + assert_se(rlimit_from_string("xxxNOFILE") == -EINVAL); + assert_se(rlimit_from_string("DefaultLimitNOFILE") == -EINVAL); +} + +TEST(rlimit_from_string_harder) { + assert_se(rlimit_from_string_harder("NOFILE") == RLIMIT_NOFILE); + assert_se(rlimit_from_string_harder("LimitNOFILE") == RLIMIT_NOFILE); + assert_se(rlimit_from_string_harder("RLIMIT_NOFILE") == RLIMIT_NOFILE); + assert_se(rlimit_from_string_harder("xxxNOFILE") == -EINVAL); + assert_se(rlimit_from_string_harder("DefaultLimitNOFILE") == -EINVAL); +} + +TEST(rlimit_to_string_all) { + for (int i = 0; i < _RLIMIT_MAX; i++) { + _cleanup_free_ char *prefixed = NULL; + const char *p; + + assert_se(p = rlimit_to_string(i)); + log_info("%i = %s", i, p); + + assert_se(rlimit_from_string(p) == i); + assert_se(rlimit_from_string_harder(p) == i); + + assert_se(prefixed = strjoin("Limit", p)); + + assert_se(rlimit_from_string(prefixed) < 0); + assert_se(rlimit_from_string_harder(prefixed) == i); + + prefixed = mfree(prefixed); + assert_se(prefixed = strjoin("RLIMIT_", p)); + + assert_se(rlimit_from_string(prefixed) < 0); + assert_se(rlimit_from_string_harder(prefixed) == i); + } +} + +TEST(setrlimit) { + struct rlimit old, new, high; + struct rlimit err = { + .rlim_cur = 10, + .rlim_max = 5, + }; + + assert_se(drop_capability(CAP_SYS_RESOURCE) == 0); + + assert_se(getrlimit(RLIMIT_NOFILE, &old) == 0); + new.rlim_cur = MIN(5U, old.rlim_max); + new.rlim_max = old.rlim_max; + assert_se(setrlimit(RLIMIT_NOFILE, &new) >= 0); + + assert_se(streq_ptr(rlimit_to_string(RLIMIT_NOFILE), "NOFILE")); + assert_se(rlimit_to_string(-1) == NULL); + + assert_se(getrlimit(RLIMIT_NOFILE, &old) == 0); + assert_se(setrlimit_closest(RLIMIT_NOFILE, &old) == 0); + assert_se(getrlimit(RLIMIT_NOFILE, &new) == 0); + assert_se(old.rlim_cur == new.rlim_cur); + assert_se(old.rlim_max == new.rlim_max); + + 
assert_se(getrlimit(RLIMIT_NOFILE, &old) == 0); + high = RLIMIT_MAKE_CONST(old.rlim_max == RLIM_INFINITY ? old.rlim_max : old.rlim_max + 1); + assert_se(setrlimit_closest(RLIMIT_NOFILE, &high) == 0); + assert_se(getrlimit(RLIMIT_NOFILE, &new) == 0); + assert_se(new.rlim_max == old.rlim_max); + assert_se(new.rlim_cur == new.rlim_max); + + assert_se(getrlimit(RLIMIT_NOFILE, &old) == 0); + assert_se(setrlimit_closest(RLIMIT_NOFILE, &err) == -EINVAL); + assert_se(getrlimit(RLIMIT_NOFILE, &new) == 0); + assert_se(old.rlim_cur == new.rlim_cur); + assert_se(old.rlim_max == new.rlim_max); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-rm-rf.c b/src/test/test-rm-rf.c new file mode 100644 index 0000000..4c69bd2 --- /dev/null +++ b/src/test/test-rm-rf.c @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "process-util.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +static void test_rm_rf_chmod_inner(void) { + _cleanup_(rm_rf_physical_and_freep) char *d = NULL; + const char *a, *b, *x, *y; + struct stat st; + + assert_se(getuid() != 0); + + assert_se(mkdtemp_malloc("/tmp/test-rm-rf.XXXXXXX", &d) >= 0); + a = strjoina(d, "/a"); + b = strjoina(a, "/b"); + x = strjoina(d, "/x"); + y = strjoina(x, "/y"); + + assert_se(mkdir(x, 0700) >= 0); + assert_se(mknod(y, S_IFREG | 0600, 0) >= 0); + + assert_se(chmod(y, 0400) >= 0); + assert_se(chmod(x, 0500) >= 0); + assert_se(chmod(d, 0500) >= 0); + + assert_se(rm_rf(d, REMOVE_PHYSICAL) == -EACCES); + + assert_se(access(d, F_OK) >= 0); + assert_se(access(x, F_OK) >= 0); + assert_se(access(y, F_OK) >= 0); + + assert_se(rm_rf(d, REMOVE_PHYSICAL|REMOVE_CHMOD) >= 0); + + assert_se(access(d, F_OK) >= 0); + assert_se(access(x, F_OK) < 0 && errno == ENOENT); + assert_se(access(y, F_OK) < 0 && errno == ENOENT); + + assert_se(mkdir(a, 0700) >= 0); + assert_se(mkdir(b, 0700) >= 0); + assert_se(mkdir(x, 0700) >= 0); + 
assert_se(mknod(y, S_IFREG | 0600, 0) >= 0); + + assert_se(chmod(b, 0000) >= 0); + assert_se(chmod(a, 0000) >= 0); + assert_se(chmod(y, 0000) >= 0); + assert_se(chmod(x, 0000) >= 0); + assert_se(chmod(d, 0500) >= 0); + + assert_se(rm_rf(d, REMOVE_PHYSICAL|REMOVE_CHMOD|REMOVE_CHMOD_RESTORE|REMOVE_ONLY_DIRECTORIES) == -ENOTEMPTY); + + assert_se(access(a, F_OK) < 0 && errno == ENOENT); + assert_se(access(d, F_OK) >= 0); + assert_se(stat(d, &st) >= 0 && (st.st_mode & 07777) == 0500); + assert_se(access(x, F_OK) >= 0); + assert_se(stat(x, &st) >= 0 && (st.st_mode & 07777) == 0000); + assert_se(chmod(x, 0700) >= 0); + assert_se(access(y, F_OK) >= 0); + assert_se(stat(y, &st) >= 0 && (st.st_mode & 07777) == 0000); + + assert_se(chmod(y, 0000) >= 0); + assert_se(chmod(x, 0000) >= 0); + assert_se(chmod(d, 0000) >= 0); + + assert_se(rm_rf(d, REMOVE_PHYSICAL|REMOVE_CHMOD|REMOVE_CHMOD_RESTORE) >= 0); + + assert_se(stat(d, &st) >= 0 && (st.st_mode & 07777) == 0000); + assert_se(access(d, F_OK) >= 0); + assert_se(chmod(d, 0700) >= 0); + assert_se(access(x, F_OK) < 0 && errno == ENOENT); + + assert_se(mkdir(x, 0700) >= 0); + assert_se(mknod(y, S_IFREG | 0600, 0) >= 0); + + assert_se(chmod(y, 0000) >= 0); + assert_se(chmod(x, 0000) >= 0); + assert_se(chmod(d, 0000) >= 0); + + assert_se(rm_rf(d, REMOVE_PHYSICAL|REMOVE_CHMOD|REMOVE_ROOT) >= 0); + + assert_se(access(d, F_OK) < 0 && errno == ENOENT); +} + +TEST(rm_rf_chmod) { + int r; + + if (getuid() == 0) { + /* This test only works unpriv (as only then the access mask for the owning user matters), + * hence drop privs here */ + + r = safe_fork("(setresuid)", FORK_DEATHSIG_SIGTERM|FORK_WAIT, NULL); + assert_se(r >= 0); + + if (r == 0) { + /* child */ + + assert_se(setresuid(1, 1, 1) >= 0); + + test_rm_rf_chmod_inner(); + _exit(EXIT_SUCCESS); + } + + return; + } + + test_rm_rf_chmod_inner(); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-sbat.c b/src/test/test-sbat.c new file mode 100644 index 0000000..d8546b1 --- 
/* Loads a set of test service units through a test-mode user manager and checks the CPU scheduling
 * policy/priority stored in each service's exec context. Skips when cgroupfs is unavailable or when
 * manager_new() fails with an errno that manager_errno_skip_test() accepts. */
int main(int argc, char *argv[]) {
        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
        _cleanup_(manager_freep) Manager *m = NULL;
        Unit *idle_ok, *idle_bad, *rr_ok, *rr_bad, *rr_sched;
        Service *ser;
        int r;

        test_setup_logging(LOG_INFO);

        r = enter_cgroup_subroot(NULL);
        if (r == -ENOMEDIUM)
                return log_tests_skipped("cgroupfs not available");

        /* prepare the test */
        _cleanup_free_ char *unit_dir = NULL;
        assert_se(get_testdata_dir("units", &unit_dir) >= 0);
        assert_se(set_unit_path(unit_dir) >= 0);
        assert_se(runtime_dir = setup_fake_runtime_dir());

        r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m);
        if (manager_errno_skip_test(r))
                return log_tests_skipped_errno(r, "manager_new");
        assert_se(r >= 0);
        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);

        /* load idle ok */
        assert_se(manager_load_startable_unit_or_warn(m, "sched_idle_ok.service", NULL, &idle_ok) >= 0);
        ser = SERVICE(idle_ok);
        assert_se(ser->exec_context.cpu_sched_policy == SCHED_OTHER);
        assert_se(ser->exec_context.cpu_sched_priority == 0);

        /*
         * load idle bad. This should print a warning but we have no way to look at it.
         */
        assert_se(manager_load_startable_unit_or_warn(m, "sched_idle_bad.service", NULL, &idle_bad) >= 0);
        /* NOTE(review): 'ser' is taken from idle_ok again, so the two assertions below re-check the
         * previous unit and never look at idle_bad's exec context. This looks like a copy-paste slip;
         * confirm against the sched_idle_bad.service fixture what values idle_bad should carry
         * before switching this to SERVICE(idle_bad). */
        ser = SERVICE(idle_ok);
        assert_se(ser->exec_context.cpu_sched_policy == SCHED_OTHER);
        assert_se(ser->exec_context.cpu_sched_priority == 0);

        /*
         * load rr ok.
         * Test that the default priority is moving from 0 to 1.
         */
        assert_se(manager_load_startable_unit_or_warn(m, "sched_rr_ok.service", NULL, &rr_ok) >= 0);
        ser = SERVICE(rr_ok);
        assert_se(ser->exec_context.cpu_sched_policy == SCHED_RR);
        assert_se(ser->exec_context.cpu_sched_priority == 1);

        /*
         * load rr bad.
         * Test that the value of 0 and 100 is ignored.
         */
        assert_se(manager_load_startable_unit_or_warn(m, "sched_rr_bad.service", NULL, &rr_bad) >= 0);
        ser = SERVICE(rr_bad);
        assert_se(ser->exec_context.cpu_sched_policy == SCHED_RR);
        assert_se(ser->exec_context.cpu_sched_priority == 1);

        /*
         * load rr change.
         * Test that anything between 1 and 99 can be set.
         */
        assert_se(manager_load_startable_unit_or_warn(m, "sched_rr_change.service", NULL, &rr_sched) >= 0);
        ser = SERVICE(rr_sched);
        assert_se(ser->exec_context.cpu_sched_policy == SCHED_RR);
        assert_se(ser->exec_context.cpu_sched_priority == 99);

        return EXIT_SUCCESS;
}
assert_se(sd_hwdb_new_from_path("", &hwdb) == -EINVAL); + assert_se(sd_hwdb_new_from_path("/path/that/should/not/exist", &hwdb) < 0); + + NULSTR_FOREACH(hwdb_bin_path, hwdb_bin_paths) { + r = sd_hwdb_new_from_path(hwdb_bin_path, &hwdb); + if (r >= 0) + break; + } + + assert_se(r >= 0); +} + +static int intro(void) { + _cleanup_(sd_hwdb_unrefp) sd_hwdb *hwdb = NULL; + int r; + + r = sd_hwdb_new(&hwdb); + if (r == -ENOENT || ERRNO_IS_PRIVILEGE(r)) + return log_tests_skipped_errno(r, "cannot open hwdb"); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-sd-path.c b/src/test/test-sd-path.c new file mode 100644 index 0000000..4f23e3b --- /dev/null +++ b/src/test/test-sd-path.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-path.h" + +#include "alloc-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +TEST(sd_path_lookup) { + for (uint64_t i = 0; i < _SD_PATH_MAX; i++) { + _cleanup_free_ char *t = NULL, *s = NULL; + int r; + + r = sd_path_lookup(i, NULL, &t); + if (i == SD_PATH_USER_RUNTIME && r == -ENXIO) + continue; + assert_se(r == 0); + assert_se(t); + log_info("%02"PRIu64": \"%s\"", i, t); + + assert_se(sd_path_lookup(i, "suffix", &s) == 0); + assert_se(s); + log_info("%02"PRIu64": \"%s\"", i, s); + assert_se(endswith(s, "/suffix")); + } + + char *tt; + assert_se(sd_path_lookup(_SD_PATH_MAX, NULL, &tt) == -EOPNOTSUPP); +} + +TEST(sd_path_lookup_strv) { + for (uint64_t i = 0; i < _SD_PATH_MAX; i++) { + _cleanup_strv_free_ char **t = NULL, **s = NULL; + int r; + + r = sd_path_lookup_strv(i, NULL, &t); + if (i == SD_PATH_USER_RUNTIME && r == -ENXIO) + continue; + assert_se(r == 0); + assert_se(t); + log_info("%02"PRIu64":", i); + STRV_FOREACH(item, t) + log_debug(" %s", *item); + + assert_se(sd_path_lookup_strv(i, "suffix", &s) == 0); + assert_se(s); + log_info("%02"PRIu64":", i); + STRV_FOREACH(item, s) { + assert_se(endswith(*item, "/suffix")); + 
log_debug(" %s", *item); + } + } + + char *tt; + assert_se(sd_path_lookup(_SD_PATH_MAX, NULL, &tt) == -EOPNOTSUPP); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c new file mode 100644 index 0000000..279a155 --- /dev/null +++ b/src/test/test-seccomp.c @@ -0,0 +1,1234 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if HAVE_VALGRIND_VALGRIND_H +#include +#endif + +#include "alloc-util.h" +#include "capability-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "macro.h" +#include "memory-util.h" +#include "missing_sched.h" +#include "missing_syscall.h" +#include "nsflags.h" +#include "nulstr-util.h" +#include "process-util.h" +#include "raw-clone.h" +#include "rm-rf.h" +#include "seccomp-util.h" +#include "set.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "virt.h" + +/* __NR_socket may be invalid due to libseccomp */ +#if !defined(__NR_socket) || __NR_socket < 0 || defined(__i386__) || defined(__s390x__) || defined(__s390__) || defined(__powerpc64__) || defined(__powerpc__) +/* On these archs, socket() is implemented via the socketcall() syscall multiplexer, + * and we can't restrict it hence via seccomp. 
*/ +# define SECCOMP_RESTRICT_ADDRESS_FAMILIES_BROKEN 1 +#else +# define SECCOMP_RESTRICT_ADDRESS_FAMILIES_BROKEN 0 +#endif + +static bool have_seccomp_privs(void) { + return geteuid() == 0 && have_effective_cap(CAP_SYS_ADMIN) > 0; /* If we are root but CAP_SYS_ADMIN we can't do caps (unless we also do NNP) */ +} + +TEST(parse_syscall_and_errno) { + _cleanup_free_ char *n = NULL; + int e; + + assert_se(parse_syscall_and_errno("uname:EILSEQ", &n, &e) >= 0); + assert_se(streq(n, "uname")); + assert_se(e == errno_from_name("EILSEQ") && e >= 0); + n = mfree(n); + + assert_se(parse_syscall_and_errno("uname:EINVAL", &n, &e) >= 0); + assert_se(streq(n, "uname")); + assert_se(e == errno_from_name("EINVAL") && e >= 0); + n = mfree(n); + + assert_se(parse_syscall_and_errno("@sync:4095", &n, &e) >= 0); + assert_se(streq(n, "@sync")); + assert_se(e == 4095); + n = mfree(n); + + /* If errno is omitted, then e is set to -1 */ + assert_se(parse_syscall_and_errno("mount", &n, &e) >= 0); + assert_se(streq(n, "mount")); + assert_se(e == -1); + n = mfree(n); + + /* parse_syscall_and_errno() does not check the syscall name is valid or not. */ + assert_se(parse_syscall_and_errno("hoge:255", &n, &e) >= 0); + assert_se(streq(n, "hoge")); + assert_se(e == 255); + n = mfree(n); + + /* 0 is also a valid errno. */ + assert_se(parse_syscall_and_errno("hoge:0", &n, &e) >= 0); + assert_se(streq(n, "hoge")); + assert_se(e == 0); + n = mfree(n); + + assert_se(parse_syscall_and_errno("hoge:kill", &n, &e) >= 0); + assert_se(streq(n, "hoge")); + assert_se(e == SECCOMP_ERROR_NUMBER_KILL); + n = mfree(n); + + /* The function checks the syscall name is empty or not. 
*/ + assert_se(parse_syscall_and_errno("", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno(":255", &n, &e) == -EINVAL); + + /* errno must be a valid errno name or number between 0 and ERRNO_MAX == 4095, or "kill" */ + assert_se(parse_syscall_and_errno("hoge:4096", &n, &e) == -ERANGE); + assert_se(parse_syscall_and_errno("hoge:-3", &n, &e) == -ERANGE); + assert_se(parse_syscall_and_errno("hoge:12.3", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno("hoge:123junk", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno("hoge:junk123", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno("hoge:255:EILSEQ", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno("hoge:-EINVAL", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno("hoge:EINVALaaa", &n, &e) == -EINVAL); + assert_se(parse_syscall_and_errno("hoge:", &n, &e) == -EINVAL); +} + +TEST(seccomp_arch_to_string) { + uint32_t a, b; + const char *name; + + a = seccomp_arch_native(); + assert_se(a > 0); + name = seccomp_arch_to_string(a); + assert_se(name); + assert_se(seccomp_arch_from_string(name, &b) >= 0); + assert_se(a == b); +} + +TEST(architecture_table) { + const char *n2; + + NULSTR_FOREACH(n, + "native\0" + "x86\0" + "x86-64\0" + "x32\0" + "arm\0" + "arm64\0" +#ifdef SCMP_ARCH_LOONGARCH64 + "loongarch64\0" +#endif + "mips\0" + "mips64\0" + "mips64-n32\0" + "mips-le\0" + "mips64-le\0" + "mips64-le-n32\0" + "parisc\0" + "parisc64\0" + "ppc\0" + "ppc64\0" + "ppc64-le\0" +#ifdef SCMP_ARCH_RISCV64 + "riscv64\0" +#endif + "s390\0" + "s390x\0") { + uint32_t c; + + assert_se(seccomp_arch_from_string(n, &c) >= 0); + n2 = seccomp_arch_to_string(c); + log_info("seccomp-arch: %s → 0x%"PRIx32" → %s", n, c, n2); + assert_se(streq_ptr(n, n2)); + } +} + +TEST(syscall_filter_set_find) { + assert_se(!syscall_filter_set_find(NULL)); + assert_se(!syscall_filter_set_find("")); + assert_se(!syscall_filter_set_find("quux")); + assert_se(!syscall_filter_set_find("@quux")); + + 
assert_se(syscall_filter_set_find("@clock") == syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK); + assert_se(syscall_filter_set_find("@default") == syscall_filter_sets + SYSCALL_FILTER_SET_DEFAULT); + assert_se(syscall_filter_set_find("@raw-io") == syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO); +} + +TEST(filter_sets) { + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) { + pid_t pid; + +#if HAVE_VALGRIND_VALGRIND_H + if (RUNNING_ON_VALGRIND && IN_SET(i, SYSCALL_FILTER_SET_DEFAULT, SYSCALL_FILTER_SET_BASIC_IO, SYSCALL_FILTER_SET_SIGNAL)) { + /* valgrind at least requires rt_sigprocmask(), read(), write(). */ + log_info("Running on valgrind, skipping %s", syscall_filter_sets[i].name); + continue; + } +#endif +#if HAS_FEATURE_ADDRESS_SANITIZER + if (IN_SET(i, SYSCALL_FILTER_SET_DEFAULT, SYSCALL_FILTER_SET_BASIC_IO, SYSCALL_FILTER_SET_SIGNAL)) { + /* ASAN at least requires sigaltstack(), read(), write(). */ + log_info("Running on address sanitizer, skipping %s", syscall_filter_sets[i].name); + continue; + } +#endif + + log_info("Testing %s", syscall_filter_sets[i].name); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { /* Child? 
*/ + int fd, r; + + /* If we look at the default set (or one that includes it), allow-list instead of deny-list */ + if (IN_SET(i, SYSCALL_FILTER_SET_DEFAULT, + SYSCALL_FILTER_SET_SYSTEM_SERVICE, + SYSCALL_FILTER_SET_KNOWN)) + r = seccomp_load_syscall_filter_set(SCMP_ACT_ERRNO(EUCLEAN), syscall_filter_sets + i, SCMP_ACT_ALLOW, true); + else + r = seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EUCLEAN), true); + if (r < 0) + _exit(EXIT_FAILURE); + + /* Test the sycall filter with one random system call */ + fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC); + if (IN_SET(i, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_DEFAULT)) + assert_se(fd < 0 && errno == EUCLEAN); + else { + assert_se(fd >= 0); + safe_close(fd); + } + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check(syscall_filter_sets[i].name, pid, WAIT_LOG) == EXIT_SUCCESS); + } +} + +TEST(filter_sets_ordered) { + /* Ensure "@default" always remains at the beginning of the list */ + assert_se(SYSCALL_FILTER_SET_DEFAULT == 0); + assert_se(streq(syscall_filter_sets[0].name, "@default")); + + /* Ensure "@known" always remains at the end of the list */ + assert_se(SYSCALL_FILTER_SET_KNOWN == _SYSCALL_FILTER_SET_MAX - 1); + assert_se(streq(syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].name, "@known")); + + for (size_t i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) { + const char *p = NULL; + + /* Make sure each group has a description */ + assert_se(!isempty(syscall_filter_sets[0].help)); + + /* Make sure the groups are ordered alphabetically, except for the first and last entries */ + assert_se(i < 2 || i == _SYSCALL_FILTER_SET_MAX - 1 || + strcmp(syscall_filter_sets[i-1].name, syscall_filter_sets[i].name) < 0); + + NULSTR_FOREACH(k, syscall_filter_sets[i].value) { + + /* Ensure each syscall list is in itself ordered, but groups before names */ + assert_se(!p || + (*p == '@' && *k != '@') || + (((*p == '@' && *k == '@') || + (*p != '@' && *k != '@')) && + strcmp(p, 
k) < 0)); + + p = k; + } + } +} + +TEST(restrict_namespace) { + char *s = NULL; + unsigned long ul; + pid_t pid; + + if (!have_namespaces()) { + log_notice("Testing without namespaces, skipping %s", __func__); + return; + } + + assert_se(namespace_flags_to_string(0, &s) == 0 && isempty(s)); + s = mfree(s); + assert_se(namespace_flags_to_string(CLONE_NEWNS, &s) == 0 && streq(s, "mnt")); + s = mfree(s); + assert_se(namespace_flags_to_string(CLONE_NEWNS|CLONE_NEWIPC, &s) == 0 && streq(s, "ipc mnt")); + s = mfree(s); + assert_se(namespace_flags_to_string(CLONE_NEWCGROUP, &s) == 0 && streq(s, "cgroup")); + s = mfree(s); + + assert_se(namespace_flags_from_string("mnt", &ul) == 0 && ul == CLONE_NEWNS); + assert_se(namespace_flags_from_string(NULL, &ul) == 0 && ul == 0); + assert_se(namespace_flags_from_string("", &ul) == 0 && ul == 0); + assert_se(namespace_flags_from_string("uts", &ul) == 0 && ul == CLONE_NEWUTS); + assert_se(namespace_flags_from_string("mnt uts ipc", &ul) == 0 && ul == (CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC)); + + assert_se(namespace_flags_to_string(CLONE_NEWUTS, &s) == 0 && streq(s, "uts")); + assert_se(namespace_flags_from_string(s, &ul) == 0 && ul == CLONE_NEWUTS); + s = mfree(s); + assert_se(namespace_flags_from_string("ipc", &ul) == 0 && ul == CLONE_NEWIPC); + assert_se(namespace_flags_to_string(ul, &s) == 0 && streq(s, "ipc")); + s = mfree(s); + + assert_se(namespace_flags_to_string(NAMESPACE_FLAGS_ALL, &s) == 0); + assert_se(streq(s, "cgroup ipc net mnt pid user uts")); + assert_se(namespace_flags_from_string(s, &ul) == 0 && ul == NAMESPACE_FLAGS_ALL); + s = mfree(s); + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping remaining tests in %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping remaining tests in %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + + assert_se(seccomp_restrict_namespaces(CLONE_NEWNS|CLONE_NEWNET) >= 0); 
+ + assert_se(unshare(CLONE_NEWNS) == 0); + assert_se(unshare(CLONE_NEWNET) == 0); + assert_se(unshare(CLONE_NEWUTS) == -1); + assert_se(errno == EPERM); + assert_se(unshare(CLONE_NEWIPC) == -1); + assert_se(errno == EPERM); + assert_se(unshare(CLONE_NEWNET|CLONE_NEWUTS) == -1); + assert_se(errno == EPERM); + + /* We use fd 0 (stdin) here, which of course will fail with EINVAL on setns(). Except of course our + * seccomp filter worked, and hits first and makes it return EPERM */ + assert_se(setns(0, CLONE_NEWNS) == -1); + assert_se(errno == EINVAL); + assert_se(setns(0, CLONE_NEWNET) == -1); + assert_se(errno == EINVAL); + assert_se(setns(0, CLONE_NEWUTS) == -1); + assert_se(errno == EPERM); + assert_se(setns(0, CLONE_NEWIPC) == -1); + assert_se(errno == EPERM); + assert_se(setns(0, CLONE_NEWNET|CLONE_NEWUTS) == -1); + assert_se(errno == EPERM); + assert_se(setns(0, 0) == -1); + assert_se(errno == EPERM); + + pid = raw_clone(CLONE_NEWNS); + assert_se(pid >= 0); + if (pid == 0) + _exit(EXIT_SUCCESS); + pid = raw_clone(CLONE_NEWNET); + assert_se(pid >= 0); + if (pid == 0) + _exit(EXIT_SUCCESS); + pid = raw_clone(CLONE_NEWUTS); + assert_se(pid < 0); + assert_se(errno == EPERM); + pid = raw_clone(CLONE_NEWIPC); + assert_se(pid < 0); + assert_se(errno == EPERM); + pid = raw_clone(CLONE_NEWNET|CLONE_NEWUTS); + assert_se(pid < 0); + assert_se(errno == EPERM); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("nsseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(protect_sysctl) { + pid_t pid; + _cleanup_free_ char *seccomp = NULL; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + /* in containers _sysctl() is likely missing anyway */ + if (detect_container() > 0) { + log_notice("Testing in container, skipping %s", __func__); + return; + } + + 
assert_se(get_proc_field("/proc/self/status", "Seccomp", WHITESPACE, &seccomp) == 0); + if (!streq(seccomp, "0")) + log_warning("Warning: seccomp filter detected, results may be unreliable for %s", __func__); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { +#if defined __NR__sysctl && __NR__sysctl >= 0 + assert_se(syscall(__NR__sysctl, NULL) < 0); + assert_se(IN_SET(errno, EFAULT, ENOSYS)); +#endif + + assert_se(seccomp_protect_sysctl() >= 0); + +#if HAVE_VALGRIND_VALGRIND_H + if (RUNNING_ON_VALGRIND) { + log_info("Running on valgrind, skipping syscall/EPERM test"); + _exit(EXIT_SUCCESS); + } +#endif + +#if defined __NR__sysctl && __NR__sysctl >= 0 + assert_se(syscall(__NR__sysctl, 0, 0, 0) < 0); + assert_se(errno == EPERM); +#endif + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("sysctlseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(protect_syslog) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + /* in containers syslog() is likely missing anyway */ + if (detect_container() > 0) { + log_notice("Testing in container, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { +#if defined __NR_syslog && __NR_syslog >= 0 + assert_se(syscall(__NR_syslog, -1, NULL, 0) < 0); + assert_se(errno == EINVAL); +#endif + + assert_se(seccomp_protect_syslog() >= 0); + +#if defined __NR_syslog && __NR_syslog >= 0 + assert_se(syscall(__NR_syslog, 0, 0, 0) < 0); + assert_se(errno == EPERM); +#endif + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("syslogseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(restrict_address_families) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + 
log_notice("Not privileged, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + int fd; + Set *s; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + fd = socket(AF_UNIX, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + fd = socket(AF_NETLINK, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + assert_se(s = set_new(NULL)); + assert_se(set_put(s, INT_TO_PTR(AF_UNIX)) >= 0); + + assert_se(seccomp_restrict_address_families(s, false) >= 0); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + fd = socket(AF_UNIX, SOCK_DGRAM, 0); +#if SECCOMP_RESTRICT_ADDRESS_FAMILIES_BROKEN + assert_se(fd >= 0); + safe_close(fd); +#else + assert_se(fd < 0); + assert_se(errno == EAFNOSUPPORT); +#endif + + fd = socket(AF_NETLINK, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + set_clear(s); + + assert_se(set_put(s, INT_TO_PTR(AF_INET)) >= 0); + + assert_se(seccomp_restrict_address_families(s, true) >= 0); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + fd = socket(AF_UNIX, SOCK_DGRAM, 0); +#if SECCOMP_RESTRICT_ADDRESS_FAMILIES_BROKEN + assert_se(fd >= 0); + safe_close(fd); +#else + assert_se(fd < 0); + assert_se(errno == EAFNOSUPPORT); +#endif + + fd = socket(AF_NETLINK, SOCK_DGRAM, 0); +#if SECCOMP_RESTRICT_ADDRESS_FAMILIES_BROKEN + assert_se(fd >= 0); + safe_close(fd); +#else + assert_se(fd < 0); + assert_se(errno == EAFNOSUPPORT); +#endif + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("socketseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(restrict_realtime) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + /* in containers RT privs are likely missing anyway */ + if (detect_container() > 0) { + 
log_notice("Testing in container, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + /* On some CI environments, the restriction may be already enabled. */ + if (sched_setscheduler(0, SCHED_FIFO, &(struct sched_param) { .sched_priority = 1 }) < 0) { + log_full_errno(errno == EPERM ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to set scheduler parameter for FIFO: %m"); + assert_se(errno == EPERM); + } + if (sched_setscheduler(0, SCHED_RR, &(struct sched_param) { .sched_priority = 1 }) < 0) { + log_full_errno(errno == EPERM ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to set scheduler parameter for RR: %m"); + assert_se(errno == EPERM); + } + + assert_se(sched_setscheduler(0, SCHED_IDLE, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_BATCH, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_OTHER, &(struct sched_param) {}) >= 0); + + assert_se(seccomp_restrict_realtime_full(ENOANO) >= 0); + + assert_se(sched_setscheduler(0, SCHED_IDLE, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_BATCH, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_OTHER, &(struct sched_param) {}) >= 0); + + assert_se(sched_setscheduler(0, SCHED_FIFO, &(struct sched_param) { .sched_priority = 1 }) < 0); + assert_se(errno == ENOANO); + assert_se(sched_setscheduler(0, SCHED_RR, &(struct sched_param) { .sched_priority = 1 }) < 0); + assert_se(errno == ENOANO); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("realtimeseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(memory_deny_write_execute_mmap) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } +#if HAVE_VALGRIND_VALGRIND_H + 
if (RUNNING_ON_VALGRIND) { + log_notice("Running on valgrind, skipping %s", __func__); + return; + } +#endif +#if HAS_FEATURE_ADDRESS_SANITIZER + log_notice("Running on address sanitizer, skipping %s", __func__); + return; +#endif + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + void *p; + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert_se(p != MAP_FAILED); + assert_se(munmap(p, page_size()) >= 0); + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert_se(p != MAP_FAILED); + assert_se(munmap(p, page_size()) >= 0); + + assert_se(seccomp_memory_deny_write_execute() >= 0); + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); +#if defined(__x86_64__) || defined(__i386__) || defined(__powerpc64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) + assert_se(p == MAP_FAILED); + assert_se(errno == EPERM); +#endif + /* Depending on kernel, libseccomp, and glibc versions, other architectures + * might fail or not. Let's not assert success. 
*/ + if (p != MAP_FAILED) + assert_se(munmap(p, page_size()) == 0); + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert_se(p != MAP_FAILED); + assert_se(munmap(p, page_size()) >= 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("memoryseccomp-mmap", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(memory_deny_write_execute_shmat) { + int shmid; + pid_t pid; + uint32_t arch; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + log_debug("arch %s: SCMP_SYS(mmap) = %d", seccomp_arch_to_string(arch), SCMP_SYS(mmap)); + log_debug("arch %s: SCMP_SYS(mmap2) = %d", seccomp_arch_to_string(arch), SCMP_SYS(mmap2)); + log_debug("arch %s: SCMP_SYS(shmget) = %d", seccomp_arch_to_string(arch), SCMP_SYS(shmget)); + log_debug("arch %s: SCMP_SYS(shmat) = %d", seccomp_arch_to_string(arch), SCMP_SYS(shmat)); + log_debug("arch %s: SCMP_SYS(shmdt) = %d", seccomp_arch_to_string(arch), SCMP_SYS(shmdt)); + } + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs() || have_effective_cap(CAP_IPC_OWNER) <= 0) { + log_notice("Not privileged, skipping %s", __func__); + return; + } +#if HAVE_VALGRIND_VALGRIND_H + if (RUNNING_ON_VALGRIND) { + log_notice("Running on valgrind, skipping %s", __func__); + return; + } +#endif +#if HAS_FEATURE_ADDRESS_SANITIZER + log_notice("Running on address sanitizer, skipping %s", __func__); + return; +#endif + + shmid = shmget(IPC_PRIVATE, page_size(), 0); + assert_se(shmid >= 0); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + void *p; + + p = shmat(shmid, NULL, 0); + assert_se(p != MAP_FAILED); + assert_se(shmdt(p) == 0); + + p = shmat(shmid, NULL, SHM_EXEC); + assert_se(p != MAP_FAILED); + assert_se(shmdt(p) == 0); + + assert_se(seccomp_memory_deny_write_execute() >= 0); + + p = shmat(shmid, NULL, SHM_EXEC); + log_debug_errno(p == MAP_FAILED ? 
errno : 0, "shmat(SHM_EXEC): %m"); +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) + assert_se(p == MAP_FAILED); + assert_se(errno == EPERM); +#endif + /* Depending on kernel, libseccomp, and glibc versions, other architectures + * might fail or not. Let's not assert success. */ + if (p != MAP_FAILED) + assert_se(shmdt(p) == 0); + + p = shmat(shmid, NULL, 0); + log_debug_errno(p == MAP_FAILED ? errno : 0, "shmat(0): %m"); + assert_se(p != MAP_FAILED); + assert_se(shmdt(p) == 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("memoryseccomp-shmat", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(restrict_archs) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + _cleanup_set_free_ Set *s = NULL; + + assert_se(access("/", F_OK) >= 0); + + assert_se(s = set_new(NULL)); + +#ifdef __x86_64__ + assert_se(set_put(s, UINT32_TO_PTR(SCMP_ARCH_X86+1)) >= 0); +#endif + assert_se(seccomp_restrict_archs(s) >= 0); + + assert_se(access("/", F_OK) >= 0); + assert_se(seccomp_restrict_archs(NULL) >= 0); + + assert_se(access("/", F_OK) >= 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("archseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(load_syscall_filter_set_raw) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + _cleanup_hashmap_free_ Hashmap *s = NULL; + + assert_se(access("/", F_OK) >= 0); + assert_se(poll(NULL, 0, 0) == 0); + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, 
scmp_act_kill_process(), true) >= 0); + assert_se(access("/", F_OK) >= 0); + assert_se(poll(NULL, 0, 0) == 0); + + assert_se(s = hashmap_new(NULL)); +#if defined __NR_access && __NR_access >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_access + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has access()"); +#endif +#if defined __NR_faccessat && __NR_faccessat >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has faccessat()"); +#endif +#if defined __NR_faccessat2 && __NR_faccessat2 >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat2 + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has faccessat2()"); +#endif + + assert_se(!hashmap_isempty(s)); + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUCLEAN), true) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EUCLEAN); + + assert_se(poll(NULL, 0, 0) == 0); + + hashmap_clear(s); +#if defined __NR_access && __NR_access >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_access + 1), INT_TO_PTR(EILSEQ)) >= 0); +#endif +#if defined __NR_faccessat && __NR_faccessat >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat + 1), INT_TO_PTR(EILSEQ)) >= 0); +#endif +#if defined __NR_faccessat2 && __NR_faccessat2 >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat2 + 1), INT_TO_PTR(EILSEQ)) >= 0); +#endif + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUCLEAN), true) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EILSEQ); + + assert_se(poll(NULL, 0, 0) == 0); + + hashmap_clear(s); +#if defined __NR_poll && __NR_poll >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_poll + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has poll()"); +#endif +#if defined __NR_ppoll && __NR_ppoll >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_ppoll + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has ppoll()"); +#endif +#if defined __NR_ppoll_time64 && __NR_ppoll_time64 >= 0 + 
assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_ppoll_time64 + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has ppoll_time64()"); +#endif + + assert_se(!hashmap_isempty(s)); + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUNATCH), true) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EILSEQ); + + assert_se(poll(NULL, 0, 0) < 0); + assert_se(errno == EUNATCH); + + hashmap_clear(s); +#if defined __NR_poll && __NR_poll >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_poll + 1), INT_TO_PTR(EILSEQ)) >= 0); +#endif +#if defined __NR_ppoll && __NR_ppoll >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_ppoll + 1), INT_TO_PTR(EILSEQ)) >= 0); +#endif +#if defined __NR_ppoll_time64 && __NR_ppoll_time64 >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_ppoll_time64 + 1), INT_TO_PTR(EILSEQ)) >= 0); +#endif + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUNATCH), true) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EILSEQ); + + assert_se(poll(NULL, 0, 0) < 0); + assert_se(errno == EILSEQ); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("syscallrawseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(native_syscalls_filtered) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + _cleanup_set_free_ Set *arch_s = NULL; + _cleanup_hashmap_free_ Hashmap *s = NULL; + + /* Passing "native" or an empty set is equivalent, just do both here. 
*/ + assert_se(arch_s = set_new(NULL)); + assert_se(seccomp_restrict_archs(arch_s) >= 0); + assert_se(set_put(arch_s, SCMP_ARCH_NATIVE) >= 0); + assert_se(seccomp_restrict_archs(arch_s) >= 0); + + assert_se(access("/", F_OK) >= 0); + assert_se(poll(NULL, 0, 0) == 0); + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, scmp_act_kill_process(), true) >= 0); + assert_se(access("/", F_OK) >= 0); + assert_se(poll(NULL, 0, 0) == 0); + + assert_se(s = hashmap_new(NULL)); +#if defined __NR_access && __NR_access >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_access + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has access()"); +#endif +#if defined __NR_faccessat && __NR_faccessat >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has faccessat()"); +#endif +#if defined __NR_faccessat2 && __NR_faccessat2 >= 0 + assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat2 + 1), INT_TO_PTR(-1)) >= 0); + log_debug("has faccessat2()"); +#endif + + assert_se(!hashmap_isempty(s)); + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUCLEAN), true) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EUCLEAN); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("nativeseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +TEST(lock_personality) { + unsigned long current; + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + assert_se(opinionated_personality(&current) >= 0); + /* On ppc64le sanitizers disable ASLR (i.e. by setting ADDR_NO_RANDOMIZE), + * which opinionated_personality() doesn't return. Let's tweak the current + * personality ourselves in such cases. 
+ * See: https://github.com/llvm/llvm-project/commit/78f7a6eaa601bfdd6ae70ffd3da2254c21ff77f9 + */ + if (FLAGS_SET(safe_personality(PERSONALITY_INVALID), ADDR_NO_RANDOMIZE)) + current |= ADDR_NO_RANDOMIZE; + + log_info("current personality=0x%lX", current); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + assert_se(seccomp_lock_personality(current) >= 0); + + assert_se((unsigned long) safe_personality(current) == current); + + /* Note, we also test that safe_personality() works correctly, by checking whether errno is properly + * set, in addition to the return value */ + errno = 0; + assert_se(safe_personality(PER_LINUX | MMAP_PAGE_ZERO) == -EPERM); + assert_se(errno == EPERM); + + if (!FLAGS_SET(current, ADDR_NO_RANDOMIZE)) + assert_se(safe_personality(PER_LINUX | ADDR_NO_RANDOMIZE) == -EPERM); + assert_se(safe_personality(PER_LINUX | ADDR_COMPAT_LAYOUT) == -EPERM); + assert_se(safe_personality(PER_LINUX | READ_IMPLIES_EXEC) == -EPERM); + assert_se(safe_personality(PER_LINUX_32BIT) == -EPERM); + assert_se(safe_personality(PER_SVR4) == -EPERM); + assert_se(safe_personality(PER_BSD) == -EPERM); + assert_se(safe_personality(current == PER_LINUX ? PER_LINUX32 : PER_LINUX) == -EPERM); + assert_se(safe_personality(PER_LINUX32_3GB) == -EPERM); + assert_se(safe_personality(PER_UW7) == -EPERM); + assert_se(safe_personality(0x42) == -EPERM); + + assert_se(safe_personality(PERSONALITY_INVALID) == -EPERM); /* maybe remove this later */ + + assert_se((unsigned long) personality(current) == current); + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("lockpersonalityseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +static int real_open(const char *path, int flags, mode_t mode) { + /* glibc internally calls openat() when open() is requested. Let's hence define our own wrapper for + * testing purposes that calls the real syscall, on architectures where SYS_open is defined. On + * other architectures, let's just fall back to the glibc call. 
*/ + +#if defined __NR_open && __NR_open >= 0 + return (int) syscall(__NR_open, path, flags, mode); +#else + return open(path, flags, mode); +#endif +} + +static int try_fchmodat2(int dirfd, const char *path, mode_t mode, int flags) { + int r; + + /* glibc does not provide a direct wrapper for fchmodat2(). Let's hence define our own wrapper for + * testing purposes that calls the real syscall, on architectures and in environments where + * SYS_fchmodat2 is defined. Otherwise, let's just fall back to the glibc fchmodat() call. */ + + /* Not supported by fchmodat() */ + assert_se(!FLAGS_SET(flags, AT_EMPTY_PATH)); + + r = RET_NERRNO(fchmodat2(dirfd, path, mode, flags)); + if (r != -ENOSYS) + return r; + + /* The syscall might still be unsupported by kernel or libseccomp. */ + return RET_NERRNO(fchmodat(dirfd, path, mode, flags)); +} + +TEST(restrict_suid_sgid) { + pid_t pid; + + if (!is_seccomp_available()) { + log_notice("Seccomp not available, skipping %s", __func__); + return; + } + if (!have_seccomp_privs()) { + log_notice("Not privileged, skipping %s", __func__); + return; + } + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + char path[] = "/tmp/suidsgidXXXXXX", dir[] = "/tmp/suidsgiddirXXXXXX"; + int fd = -EBADF, k = -EBADF; + const char *z; + + fd = mkostemp_safe(path); + assert_se(fd >= 0); + + assert_se(mkdtemp(dir)); + z = strjoina(dir, "/test"); + + assert_se(chmod(path, 0755 | S_ISUID) >= 0); + assert_se(chmod(path, 0755 | S_ISGID) >= 0); + assert_se(chmod(path, 0755 | S_ISGID | S_ISUID) >= 0); + assert_se(chmod(path, 0755) >= 0); + + assert_se(fchmod(fd, 0755 | S_ISUID) >= 0); + assert_se(fchmod(fd, 0755 | S_ISGID) >= 0); + assert_se(fchmod(fd, 0755 | S_ISGID | S_ISUID) >= 0); + assert_se(fchmod(fd, 0755) >= 0); + + assert_se(fchmodat(AT_FDCWD, path, 0755 | S_ISUID, 0) >= 0); + assert_se(fchmodat(AT_FDCWD, path, 0755 | S_ISGID, 0) >= 0); + assert_se(fchmodat(AT_FDCWD, path, 0755 | S_ISGID | S_ISUID, 0) >= 0); + assert_se(fchmodat(AT_FDCWD, 
path, 0755, 0) >= 0); + + assert_se(try_fchmodat2(AT_FDCWD, path, 0755 | S_ISUID, 0) >= 0); + assert_se(try_fchmodat2(AT_FDCWD, path, 0755 | S_ISGID, 0) >= 0); + assert_se(try_fchmodat2(AT_FDCWD, path, 0755 | S_ISGID | S_ISUID, 0) >= 0); + assert_se(try_fchmodat2(AT_FDCWD, path, 0755, 0) >= 0); + + k = real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISGID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID | S_ISGID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = creat(z, 0644 | S_ISUID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = creat(z, 0644 | S_ISGID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = creat(z, 0644 | S_ISUID | S_ISGID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = creat(z, 0644); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISGID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID | S_ISGID); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + k = openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + assert_se(mkdir(z, 0755 | S_ISUID) >= 0); + assert_se(rmdir(z) >= 0); + assert_se(mkdir(z, 0755 | S_ISGID) >= 0); + assert_se(rmdir(z) >= 0); + assert_se(mkdir(z, 0755 | S_ISUID | S_ISGID) >= 0); + assert_se(rmdir(z) >= 0); + assert_se(mkdir(z, 0755) >= 0); + assert_se(rmdir(z) >= 0); + + assert_se(mkdirat(AT_FDCWD, z, 0755 | S_ISUID) >= 0); + assert_se(rmdir(z) >= 
0); + assert_se(mkdirat(AT_FDCWD, z, 0755 | S_ISGID) >= 0); + assert_se(rmdir(z) >= 0); + assert_se(mkdirat(AT_FDCWD, z, 0755 | S_ISUID | S_ISGID) >= 0); + assert_se(rmdir(z) >= 0); + assert_se(mkdirat(AT_FDCWD, z, 0755) >= 0); + assert_se(rmdir(z) >= 0); + + assert_se(mknod(z, S_IFREG | 0755 | S_ISUID, 0) >= 0); + assert_se(unlink(z) >= 0); + assert_se(mknod(z, S_IFREG | 0755 | S_ISGID, 0) >= 0); + assert_se(unlink(z) >= 0); + assert_se(mknod(z, S_IFREG | 0755 | S_ISUID | S_ISGID, 0) >= 0); + assert_se(unlink(z) >= 0); + assert_se(mknod(z, S_IFREG | 0755, 0) >= 0); + assert_se(unlink(z) >= 0); + + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755 | S_ISUID, 0) >= 0); + assert_se(unlink(z) >= 0); + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755 | S_ISGID, 0) >= 0); + assert_se(unlink(z) >= 0); + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755 | S_ISUID | S_ISGID, 0) >= 0); + assert_se(unlink(z) >= 0); + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755, 0) >= 0); + assert_se(unlink(z) >= 0); + + assert_se(seccomp_restrict_suid_sgid() >= 0); + + assert_se(chmod(path, 0775 | S_ISUID) < 0 && errno == EPERM); + assert_se(chmod(path, 0775 | S_ISGID) < 0 && errno == EPERM); + assert_se(chmod(path, 0775 | S_ISGID | S_ISUID) < 0 && errno == EPERM); + assert_se(chmod(path, 0775) >= 0); + + assert_se(fchmod(fd, 0775 | S_ISUID) < 0 && errno == EPERM); + assert_se(fchmod(fd, 0775 | S_ISGID) < 0 && errno == EPERM); + assert_se(fchmod(fd, 0775 | S_ISGID | S_ISUID) < 0 && errno == EPERM); + assert_se(fchmod(fd, 0775) >= 0); + + assert_se(fchmodat(AT_FDCWD, path, 0755 | S_ISUID, 0) < 0 && errno == EPERM); + assert_se(fchmodat(AT_FDCWD, path, 0755 | S_ISGID, 0) < 0 && errno == EPERM); + assert_se(fchmodat(AT_FDCWD, path, 0755 | S_ISGID | S_ISUID, 0) < 0 && errno == EPERM); + assert_se(fchmodat(AT_FDCWD, path, 0755, 0) >= 0); + + assert_se(try_fchmodat2(AT_FDCWD, path, 0755 | S_ISUID, 0) < 0 && errno == EPERM); + assert_se(try_fchmodat2(AT_FDCWD, path, 0755 | S_ISGID, 0) < 0 && errno == 
EPERM); + assert_se(try_fchmodat2(AT_FDCWD, path, 0755 | S_ISGID | S_ISUID, 0) < 0 && errno == EPERM); + assert_se(try_fchmodat2(AT_FDCWD, path, 0755, 0) >= 0); + + assert_se(real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID) < 0 && errno == EPERM); + assert_se(real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISGID) < 0 && errno == EPERM); + assert_se(real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID | S_ISGID) < 0 && errno == EPERM); + k = real_open(z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + assert_se(creat(z, 0644 | S_ISUID) < 0 && errno == EPERM); + assert_se(creat(z, 0644 | S_ISGID) < 0 && errno == EPERM); + assert_se(creat(z, 0644 | S_ISUID | S_ISGID) < 0 && errno == EPERM); + k = creat(z, 0644); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + assert_se(openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID) < 0 && errno == EPERM); + assert_se(openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISGID) < 0 && errno == EPERM); + assert_se(openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644 | S_ISUID | S_ISGID) < 0 && errno == EPERM); + k = openat(AT_FDCWD, z, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL, 0644); + k = safe_close(k); + assert_se(unlink(z) >= 0); + + assert_se(mkdir(z, 0755 | S_ISUID) < 0 && errno == EPERM); + assert_se(mkdir(z, 0755 | S_ISGID) < 0 && errno == EPERM); + assert_se(mkdir(z, 0755 | S_ISUID | S_ISGID) < 0 && errno == EPERM); + assert_se(mkdir(z, 0755) >= 0); + assert_se(rmdir(z) >= 0); + + assert_se(mkdirat(AT_FDCWD, z, 0755 | S_ISUID) < 0 && errno == EPERM); + assert_se(mkdirat(AT_FDCWD, z, 0755 | S_ISGID) < 0 && errno == EPERM); + assert_se(mkdirat(AT_FDCWD, z, 0755 | S_ISUID | S_ISGID) < 0 && errno == EPERM); + assert_se(mkdirat(AT_FDCWD, z, 0755) >= 0); + assert_se(rmdir(z) >= 0); + + assert_se(mknod(z, S_IFREG | 0755 | S_ISUID, 0) < 0 && errno == EPERM); + assert_se(mknod(z, S_IFREG | 0755 | S_ISGID, 0) < 0 && errno == EPERM); + 
assert_se(mknod(z, S_IFREG | 0755 | S_ISUID | S_ISGID, 0) < 0 && errno == EPERM); + assert_se(mknod(z, S_IFREG | 0755, 0) >= 0); + assert_se(unlink(z) >= 0); + + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755 | S_ISUID, 0) < 0 && errno == EPERM); + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755 | S_ISGID, 0) < 0 && errno == EPERM); + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755 | S_ISUID | S_ISGID, 0) < 0 && errno == EPERM); + assert_se(mknodat(AT_FDCWD, z, S_IFREG | 0755, 0) >= 0); + assert_se(unlink(z) >= 0); + + assert_se(unlink(path) >= 0); + assert_se(rm_rf(dir, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_check("suidsgidseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-secure-bits.c b/src/test/test-secure-bits.c new file mode 100644 index 0000000..27e6a20 --- /dev/null +++ b/src/test/test-secure-bits.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "securebits-util.h" +#include "strv.h" +#include "tests.h" +#include "unit-file.h" + +static const char * const string_bits[] = { + "keep-caps", + "keep-caps-locked", + "no-setuid-fixup", + "no-setuid-fixup-locked", + "noroot", + "noroot-locked", + NULL +}; + +TEST(secure_bits_basic) { + _cleanup_free_ char *joined = NULL, *str = NULL; + int r; + + /* Check if converting each bit from string and back to string yields + * the same value */ + STRV_FOREACH(bit, string_bits) { + _cleanup_free_ char *s = NULL; + + r = secure_bits_from_string(*bit); + assert_se(r > 0); + assert_se(secure_bits_is_valid(r)); + assert_se(secure_bits_to_string_alloc(r, &s) >= 0); + printf("%s = 0x%x = %s\n", *bit, (unsigned)r, s); + assert_se(streq(*bit, s)); + } + + /* Ditto, but with all bits at once */ + joined = strv_join((char**)string_bits, " "); + assert_se(joined); + r = secure_bits_from_string(joined); + assert_se(r > 0); + assert_se(secure_bits_is_valid(r)); + 
assert_se(secure_bits_to_string_alloc(r, &str) >= 0); + printf("%s = 0x%x = %s\n", joined, (unsigned)r, str); + assert_se(streq(joined, str)); + + str = mfree(str); + + /* Empty string */ + assert_se(secure_bits_from_string("") == 0); + assert_se(secure_bits_from_string(" ") == 0); + + /* Only invalid entries */ + assert_se(secure_bits_from_string("foo bar baz") == 0); + + /* Empty secure bits */ + assert_se(secure_bits_to_string_alloc(0, &str) >= 0); + assert_se(isempty(str)); + + str = mfree(str); + + /* Bits to string with check */ + assert_se(secure_bits_to_string_alloc_with_check(INT_MAX, &str) == -EINVAL); + assert_se(str == NULL); + assert_se(secure_bits_to_string_alloc_with_check( + (1 << SECURE_KEEP_CAPS) | (1 << SECURE_KEEP_CAPS_LOCKED), + &str) >= 0); + assert_se(streq(str, "keep-caps keep-caps-locked")); +} + +TEST(secure_bits_mix) { /* table-driven: each input is parsed and converted back, and must yield the expected string (duplicates collapse; unknown and quoted words contribute no bits) */ + static const struct sbit_table { + const char *input; + const char *expected; + } sbit_table[] = { + { "keep-caps keep-caps keep-caps", "keep-caps" }, + { "keep-caps noroot keep-caps", "keep-caps noroot" }, + { "noroot foo bar baz noroot", "noroot" }, + { "noroot \"foo\" \"bar keep-caps", "noroot" }, + { "\"noroot foo\" bar keep-caps", "keep-caps" }, + {} /* sentinel: input == NULL ends the loop below */ + }; + + for (const struct sbit_table *s = sbit_table; s->input; s++) { + _cleanup_free_ char *str = NULL; + int r; + + r = secure_bits_from_string(s->input); + assert_se(r > 0); + assert_se(secure_bits_is_valid(r)); + assert_se(secure_bits_to_string_alloc(r, &str) >= 0); + printf("%s = 0x%x = %s\n", s->input, (unsigned)r, str); + assert_se(streq(s->expected, str)); + } +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-selinux.c b/src/test/test-selinux.c new file mode 100644 index 0000000..04b5ba1 --- /dev/null +++ b/src/test/test-selinux.c @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/stat.h> + +#include "alloc-util.h" +#include "fd-util.h" +#include "log.h" +#include "selinux-util.h" +#include "string-util.h" +#include "tests.h"
+#include "time-util.h" + +static void test_testing(void) { + bool b; + + log_info("============ %s ==========", __func__); + + b = mac_selinux_use(); + log_info("mac_selinux_use → %s", yes_no(b)); + + b = mac_selinux_use(); + log_info("mac_selinux_use → %s", yes_no(b)); + + mac_selinux_retest(); + + b = mac_selinux_use(); + log_info("mac_selinux_use → %s", yes_no(b)); + + b = mac_selinux_use(); + log_info("mac_selinux_use → %s", yes_no(b)); +} + +static void test_loading(void) { + usec_t n1, n2; + int r; + + log_info("============ %s ==========", __func__); + + n1 = now(CLOCK_MONOTONIC); + r = mac_selinux_init(); + n2 = now(CLOCK_MONOTONIC); + log_info_errno(r, "mac_selinux_init → %d %.2fs (%m)", r, (n2 - n1)/1e6); +} + +static void test_cleanup(void) { + usec_t n1, n2; + + log_info("============ %s ==========", __func__); + + n1 = now(CLOCK_MONOTONIC); + mac_selinux_finish(); + n2 = now(CLOCK_MONOTONIC); + log_info("mac_selinux_finish → %.2fs", (n2 - n1)/1e6); +} + +static void test_misc(const char* fname) { + _cleanup_(mac_selinux_freep) char *label = NULL, *label2 = NULL, *label3 = NULL; + int r; + _cleanup_close_ int fd = -EBADF; + + log_info("============ %s ==========", __func__); + + r = mac_selinux_get_our_label(&label); + log_info_errno(r, "mac_selinux_get_our_label → %d, \"%s\" (%m)", + r, strnull(label)); + + r = mac_selinux_get_create_label_from_exe(fname, &label2); + log_info_errno(r, "mac_selinux_create_label_from_exe → %d, \"%s\" (%m)", + r, strnull(label2)); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + + r = mac_selinux_get_child_mls_label(fd, fname, label2, &label3); + log_info_errno(r, "mac_selinux_get_child_mls_label → %d, \"%s\" (%m)", + r, strnull(label3)); +} + +static void test_create_file_prepare(const char* fname) { + int r; + + log_info("============ %s ==========", __func__); + + r = mac_selinux_create_file_prepare(fname, S_IRWXU); + log_info_errno(r, "mac_selinux_create_file_prepare → %d (%m)", r); + + 
mac_selinux_create_file_clear(); +} + +int main(int argc, char **argv) { + const char *path = SYSTEMD_BINARY_PATH; + if (argc >= 2) + path = argv[1]; + + test_setup_logging(LOG_DEBUG); + + test_testing(); + test_loading(); + test_misc(path); + test_create_file_prepare(path); + test_cleanup(); + + return 0; +} diff --git a/src/test/test-serialize.c b/src/test/test-serialize.c new file mode 100644 index 0000000..8f74472 --- /dev/null +++ b/src/test/test-serialize.c @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "log.h" +#include "serialize.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static char long_string[LONG_LINE_MAX+1]; + +TEST(serialize_item) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(serialize_item(f, "a", NULL) == 0); + assert_se(serialize_item(f, "a", "bbb") == 1); + assert_se(serialize_item(f, "a", "bbb") == 1); + assert_se(serialize_bool_elide(f, "c", true) == 1); + assert_se(serialize_bool_elide(f, "d", false) == 0); + assert_se(serialize_item(f, "a", long_string) == -EINVAL); + assert_se(serialize_item(f, long_string, "a") == -EINVAL); + assert_se(serialize_item(f, long_string, long_string) == -EINVAL); + + rewind(f); + + _cleanup_free_ char *line1 = NULL, *line2 = NULL, *line3 = NULL, *line4 = NULL; + assert_se(read_line(f, LONG_LINE_MAX, &line1) > 0); + assert_se(streq(line1, "a=bbb")); + assert_se(read_line(f, LONG_LINE_MAX, &line2) > 0); + assert_se(streq(line2, "a=bbb")); + assert_se(read_line(f, LONG_LINE_MAX, &line3) > 0); + assert_se(streq(line3, "c=yes")); + assert_se(read_line(f, LONG_LINE_MAX, &line4) == 0); + assert_se(streq(line4, "")); +} + +TEST(serialize_item_escaped) { + _cleanup_(unlink_tempfilep) char fn[] = 
"/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(serialize_item_escaped(f, "a", NULL) == 0); + assert_se(serialize_item_escaped(f, "a", "bbb") == 1); + assert_se(serialize_item_escaped(f, "a", "bbb") == 1); + assert_se(serialize_item_escaped(f, "a", long_string) == -EINVAL); + assert_se(serialize_item_escaped(f, long_string, "a") == -EINVAL); + assert_se(serialize_item_escaped(f, long_string, long_string) == -EINVAL); + + rewind(f); + + _cleanup_free_ char *line1 = NULL, *line2 = NULL, *line3 = NULL; + assert_se(read_line(f, LONG_LINE_MAX, &line1) > 0); + assert_se(streq(line1, "a=bbb")); + assert_se(read_line(f, LONG_LINE_MAX, &line2) > 0); + assert_se(streq(line2, "a=bbb")); + assert_se(read_line(f, LONG_LINE_MAX, &line3) == 0); + assert_se(streq(line3, "")); +} + +TEST(serialize_usec) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(serialize_usec(f, "usec1", USEC_INFINITY) == 0); + assert_se(serialize_usec(f, "usec2", 0) == 1); + assert_se(serialize_usec(f, "usec3", USEC_INFINITY-1) == 1); + + rewind(f); + + _cleanup_free_ char *line1 = NULL, *line2 = NULL; + usec_t x; + + assert_se(read_line(f, LONG_LINE_MAX, &line1) > 0); + assert_se(streq(line1, "usec2=0")); + assert_se(deserialize_usec(line1 + 6, &x) == 0); + assert_se(x == 0); + + assert_se(read_line(f, LONG_LINE_MAX, &line2) > 0); + assert_se(startswith(line2, "usec3=")); + assert_se(deserialize_usec(line2 + 6, &x) == 0); + assert_se(x == USEC_INFINITY-1); +} + +TEST(serialize_strv) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + char **strv = STRV_MAKE("a", "b", "foo foo", + "nasty1 \"", + "\"nasty2 ", + "nasty3 '", + "\"nasty4 \"", + "nasty5\n", + 
"\nnasty5\nfoo=bar", + "\nnasty5\nfoo=bar"); + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(serialize_strv(f, "strv1", NULL) == 0); + assert_se(serialize_strv(f, "strv2", STRV_MAKE_EMPTY) == 0); + assert_se(serialize_strv(f, "strv3", strv) == 1); + assert_se(serialize_strv(f, "strv4", STRV_MAKE(long_string)) == -EINVAL); + + rewind(f); + + _cleanup_strv_free_ char **strv2 = NULL; + for (;;) { + _cleanup_free_ char *line = NULL; + int r; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r == 0) + break; + assert_se(r > 0); + + const char *t = startswith(line, "strv3="); + assert_se(t); + assert_se(deserialize_strv(t, &strv2) >= 0); + } + + assert_se(strv_equal(strv, strv2)); +} + +TEST(deserialize_environment) { + _cleanup_strv_free_ char **env; + + assert_se(env = strv_new("A=1")); + + assert_se(deserialize_environment("B=2", &env) >= 0); + assert_se(deserialize_environment("FOO%%=a\\177b\\nc\\td e", &env) >= 0); + + assert_se(strv_equal(env, STRV_MAKE("A=1", "B=2", "FOO%%=a\177b\nc\td e"))); + + assert_se(deserialize_environment("foo\\", &env) < 0); + assert_se(deserialize_environment("bar\\_baz", &env) < 0); +} + +TEST(serialize_environment) { + _cleanup_strv_free_ char **env = NULL, **env2 = NULL; + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-env-util.XXXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(env = strv_new("A=1", + "B=2", + "C=ąęółń", + "D=D=a\\x0Ab", + "FOO%%=a\177b\nc\td e")); + + assert_se(serialize_strv(f, "env", env) == 1); + assert_se(fflush_and_check(f) == 0); + + rewind(f); + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *l; + + r = read_line(f, LONG_LINE_MAX, &line); + assert_se(r >= 0); + + if (r == 0) + break; + + l = strstrip(line); + + assert_se(startswith(l, "env=")); + + r = deserialize_environment(l+4, &env2); + assert_se(r >= 0); + } + 
assert_se(feof(f)); + + assert_se(strv_equal(env, env2)); +} + +TEST(serialize_item_hexmem) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(serialize_item_hexmem(f, "a", NULL, 0) == 0); + assert_se(serialize_item_hexmem(f, "a", (uint8_t []){0xff, 0xff, 0xff}, sizeof(uint8_t) * 3) == 1); + + rewind(f); + + _cleanup_free_ char *line = NULL; + assert_se(read_line(f, LONG_LINE_MAX, &line) > 0); + assert_se(streq(line, "a=ffffff")); + +} + +TEST(serialize_item_base64mem) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(serialize_item_base64mem(f, "a", NULL, 0) == 0); + assert_se(serialize_item_base64mem(f, "a", (uint8_t []){0xff, 0xff, 0xff}, sizeof(uint8_t) * 3) == 1); + + rewind(f); + + _cleanup_free_ char *line = NULL; + assert_se(read_line(f, LONG_LINE_MAX, &line) > 0); + assert_se(streq(line, "a=////")); +} + +TEST(serialize_string_set) { + _cleanup_(unlink_tempfilep) char fn[] = "/tmp/test-serialize.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_set_free_free_ Set *s = NULL; + _cleanup_free_ char *line1 = NULL, *line2 = NULL; + char *p, *q; + + assert_se(fmkostemp_safe(fn, "r+", &f) == 0); + log_info("/* %s (%s) */", __func__, fn); + + assert_se(set_ensure_allocated(&s, &string_hash_ops) >= 0); + + assert_se(serialize_string_set(f, "a", s) == 0); + + assert_se(set_put_strsplit(s, "abc def,ghi jkl", ",", 0) >= 0); + + assert_se(serialize_string_set(f, "a", s) == 1); + + rewind(f); + + assert_se(read_line(f, LONG_LINE_MAX, &line1) > 0); + assert_se((p = startswith(line1, "a="))); + + assert_se(read_line(f, LONG_LINE_MAX, &line2) > 0); + assert_se((q = startswith(line2, "a="))); + + assert_se(!streq(p, q)); + 
assert_se(STR_IN_SET(p, "abc def", "ghi jkl")); + assert_se(STR_IN_SET(q, "abc def", "ghi jkl")); +} + +static int intro(void) { /* runs before the tests: fills long_string with sizeof(long_string)-1 'x' bytes, the over-long key/value the tests expect to be rejected with -EINVAL */ + memset(long_string, 'x', sizeof(long_string)-1); + char_array_0(long_string); /* presumably NUL-terminates the array - confirm against the macro definition */ + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-set-disable-mempool.c b/src/test/test-set-disable-mempool.c new file mode 100644 index 0000000..91244b2 --- /dev/null +++ b/src/test/test-set-disable-mempool.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <pthread.h> + +#include "mempool.h" +#include "process-util.h" +#include "set.h" +#include "tests.h" + +#define NUM 100 + +static void* thread(void *p) { /* pthread entry point: p points at the caller's Set*; checks the set off the main thread, frees it and stores set_free()'s result back through p */ + Set **s = p; + + assert_se(s); + assert_se(*s); + + assert_se(!is_main_thread()); + assert_se(mempool_enabled); + assert_se(!mempool_enabled()); + + assert_se(set_size(*s) == NUM); + *s = set_free(*s); + + return NULL; +} + +static void test_one(const char *val) { /* sets $SYSTEMD_MEMPOOL to val, fills a set with NUM entries on the main thread and lets thread() free it */ + pthread_t t; + int x[NUM] = {}; + unsigned i; + Set *s; + + log_info("Testing with SYSTEMD_MEMPOOL=%s", val); + assert_se(setenv("SYSTEMD_MEMPOOL", val, true) == 0); + + assert_se(is_main_thread()); + assert_se(mempool_enabled); /* It is a weak symbol, but we expect it to be available */ + assert_se(!mempool_enabled()); + + assert_se(s = set_new(NULL)); + for (i = 0; i < NUM; i++) + assert_se(set_put(s, &x[i]) == 1); /* every &x[i] is distinct, so each must be newly added; a bare truth test would also accept a negative errno */ + + assert_se(pthread_create(&t, NULL, thread, &s) == 0); + assert_se(pthread_join(t, NULL) == 0); + + assert_se(!s); /* thread() overwrote our pointer through &s */ +} + +TEST(disable_mempool) { + test_one("0"); + /* The value $SYSTEMD_MEMPOOL= is cached. So the following + * test should also succeed.
*/ + test_one("1"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-set.c b/src/test/test-set.c new file mode 100644 index 0000000..0d5a6a1 --- /dev/null +++ b/src/test/test-set.c @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "random-util.h" +#include "set.h" +#include "strv.h" +#include "tests.h" + +TEST(set_steal_first) { + _cleanup_set_free_ Set *m = NULL; + int seen[3] = {}; + char *val; + + m = set_new(&string_hash_ops); + assert_se(m); + + assert_se(set_put(m, (void*) "1") == 1); + assert_se(set_put(m, (void*) "22") == 1); + assert_se(set_put(m, (void*) "333") == 1); + + while ((val = set_steal_first(m))) + seen[strlen(val) - 1]++; + + assert_se(seen[0] == 1 && seen[1] == 1 && seen[2] == 1); + + assert_se(set_isempty(m)); +} + +typedef struct Item { + int seen; +} Item; +static void item_seen(Item *item) { + item->seen++; +} + +TEST(set_free_with_destructor) { + Set *m; + struct Item items[4] = {}; + + assert_se(m = set_new(NULL)); + for (size_t i = 0; i < ELEMENTSOF(items) - 1; i++) + assert_se(set_put(m, items + i) == 1); + + m = set_free_with_destructor(m, item_seen); + assert_se(items[0].seen == 1); + assert_se(items[1].seen == 1); + assert_se(items[2].seen == 1); + assert_se(items[3].seen == 0); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(item_hash_ops, void, trivial_hash_func, trivial_compare_func, Item, item_seen); + +TEST(set_free_with_hash_ops) { + Set *m; + struct Item items[4] = {}; + + assert_se(m = set_new(&item_hash_ops)); + for (size_t i = 0; i < ELEMENTSOF(items) - 1; i++) + assert_se(set_put(m, items + i) == 1); + + m = set_free(m); + assert_se(items[0].seen == 1); + assert_se(items[1].seen == 1); + assert_se(items[2].seen == 1); + assert_se(items[3].seen == 0); +} + +TEST(set_put) { + _cleanup_set_free_ Set *m = NULL; + + m = set_new(&string_hash_ops); + assert_se(m); + + assert_se(set_put(m, (void*) "1") == 1); + assert_se(set_put(m, (void*) "22") == 1); + assert_se(set_put(m, (void*) 
"333") == 1); + assert_se(set_put(m, (void*) "333") == 0); + assert_se(set_remove(m, (void*) "333")); + assert_se(set_put(m, (void*) "333") == 1); + assert_se(set_put(m, (void*) "333") == 0); + assert_se(set_put(m, (void*) "22") == 0); + + _cleanup_free_ char **t = set_get_strv(m); + assert_se(strv_contains(t, "1")); + assert_se(strv_contains(t, "22")); + assert_se(strv_contains(t, "333")); + assert_se(strv_length(t) == 3); +} + +TEST(set_put_strndup) { + _cleanup_set_free_ Set *m = NULL; + + assert_se(set_put_strndup(&m, "12345", 0) == 1); + assert_se(set_put_strndup(&m, "12345", 1) == 1); + assert_se(set_put_strndup(&m, "12345", 2) == 1); + assert_se(set_put_strndup(&m, "12345", 3) == 1); + assert_se(set_put_strndup(&m, "12345", 4) == 1); + assert_se(set_put_strndup(&m, "12345", 5) == 1); + assert_se(set_put_strndup(&m, "12345", 6) == 0); + + assert_se(set_contains(m, "")); + assert_se(set_contains(m, "1")); + assert_se(set_contains(m, "12")); + assert_se(set_contains(m, "123")); + assert_se(set_contains(m, "1234")); + assert_se(set_contains(m, "12345")); + + assert_se(set_size(m) == 6); +} + +TEST(set_put_strdup) { + _cleanup_set_free_ Set *m = NULL; + + assert_se(set_put_strdup(&m, "aaa") == 1); + assert_se(set_put_strdup(&m, "aaa") == 0); + assert_se(set_put_strdup(&m, "bbb") == 1); + assert_se(set_put_strdup(&m, "bbb") == 0); + assert_se(set_put_strdup(&m, "aaa") == 0); + + assert_se(set_contains(m, "aaa")); + assert_se(set_contains(m, "bbb")); + + assert_se(set_size(m) == 2); +} + +TEST(set_put_strdupv) { + _cleanup_set_free_ Set *m = NULL; + + assert_se(set_put_strdupv(&m, STRV_MAKE("aaa", "aaa", "bbb", "bbb", "aaa")) == 2); + assert_se(set_put_strdupv(&m, STRV_MAKE("aaa", "aaa", "bbb", "bbb", "ccc")) == 1); + + assert_se(set_contains(m, "aaa")); + assert_se(set_contains(m, "bbb")); + assert_se(set_contains(m, "ccc")); + + assert_se(set_size(m) == 3); +} + +TEST(set_ensure_allocated) { + _cleanup_set_free_ Set *m = NULL; + + 
assert_se(set_ensure_allocated(&m, &string_hash_ops) == 1); + assert_se(set_ensure_allocated(&m, &string_hash_ops) == 0); + assert_se(set_ensure_allocated(&m, NULL) == 0); + assert_se(set_size(m) == 0); +} + +TEST(set_copy) { + _cleanup_set_free_ Set *s = NULL; + _cleanup_set_free_free_ Set *copy = NULL; + char *key1, *key2, *key3, *key4; + + key1 = strdup("key1"); + assert_se(key1); + key2 = strdup("key2"); + assert_se(key2); + key3 = strdup("key3"); + assert_se(key3); + key4 = strdup("key4"); + assert_se(key4); + + s = set_new(&string_hash_ops); + assert_se(s); + + assert_se(set_put(s, key1) >= 0); + assert_se(set_put(s, key2) >= 0); + assert_se(set_put(s, key3) >= 0); + assert_se(set_put(s, key4) >= 0); + + copy = set_copy(s); + assert_se(copy); + + assert_se(set_equal(s, copy)); +} + +TEST(set_ensure_put) { + _cleanup_set_free_ Set *m = NULL; + + assert_se(set_ensure_put(&m, &string_hash_ops, "a") == 1); + assert_se(set_ensure_put(&m, &string_hash_ops, "a") == 0); + assert_se(set_ensure_put(&m, NULL, "a") == 0); + assert_se(set_ensure_put(&m, &string_hash_ops, "b") == 1); + assert_se(set_ensure_put(&m, &string_hash_ops, "b") == 0); + assert_se(set_ensure_put(&m, &string_hash_ops, "a") == 0); + assert_se(set_size(m) == 2); +} + +TEST(set_ensure_consume) { + _cleanup_set_free_ Set *m = NULL; + char *s, *t; + + assert_se(s = strdup("a")); + assert_se(set_ensure_consume(&m, &string_hash_ops_free, s) == 1); + + assert_se(t = strdup("a")); + assert_se(set_ensure_consume(&m, &string_hash_ops_free, t) == 0); + + assert_se(t = strdup("a")); + assert_se(set_ensure_consume(&m, &string_hash_ops_free, t) == 0); + + assert_se(t = strdup("b")); + assert_se(set_ensure_consume(&m, &string_hash_ops_free, t) == 1); + + assert_se(t = strdup("b")); + assert_se(set_ensure_consume(&m, &string_hash_ops_free, t) == 0); + + assert_se(set_size(m) == 2); +} + +TEST(set_strjoin) { + _cleanup_set_free_ Set *m = NULL; + _cleanup_free_ char *joined = NULL; + + /* Empty set */ + 
assert_se(set_strjoin(m, NULL, false, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, "", false, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, " ", false, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, "xxx", false, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, NULL, true, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, "", true, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, " ", true, &joined) >= 0); + assert_se(!joined); + assert_se(set_strjoin(m, "xxx", true, &joined) >= 0); + assert_se(!joined); + + /* Single entry */ + assert_se(set_put_strdup(&m, "aaa") == 1); + assert_se(set_strjoin(m, NULL, false, &joined) >= 0); + assert_se(streq(joined, "aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, "", false, &joined) >= 0); + assert_se(streq(joined, "aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, " ", false, &joined) >= 0); + assert_se(streq(joined, "aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, "xxx", false, &joined) >= 0); + assert_se(streq(joined, "aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, NULL, true, &joined) >= 0); + assert_se(streq(joined, "aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, "", true, &joined) >= 0); + assert_se(streq(joined, "aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, " ", true, &joined) >= 0); + assert_se(streq(joined, " aaa ")); + joined = mfree(joined); + assert_se(set_strjoin(m, "xxx", true, &joined) >= 0); + assert_se(streq(joined, "xxxaaaxxx")); + + /* Two entries */ + assert_se(set_put_strdup(&m, "bbb") == 1); + assert_se(set_put_strdup(&m, "aaa") == 0); + joined = mfree(joined); + assert_se(set_strjoin(m, NULL, false, &joined) >= 0); + assert_se(STR_IN_SET(joined, "aaabbb", "bbbaaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, "", false, &joined) >= 0); + assert_se(STR_IN_SET(joined, "aaabbb", "bbbaaa")); + joined = mfree(joined); + 
assert_se(set_strjoin(m, " ", false, &joined) >= 0); + assert_se(STR_IN_SET(joined, "aaa bbb", "bbb aaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, "xxx", false, &joined) >= 0); + assert_se(STR_IN_SET(joined, "aaaxxxbbb", "bbbxxxaaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, NULL, true, &joined) >= 0); + assert_se(STR_IN_SET(joined, "aaabbb", "bbbaaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, "", true, &joined) >= 0); + assert_se(STR_IN_SET(joined, "aaabbb", "bbbaaa")); + joined = mfree(joined); + assert_se(set_strjoin(m, " ", true, &joined) >= 0); + assert_se(STR_IN_SET(joined, " aaa bbb ", " bbb aaa ")); + joined = mfree(joined); + assert_se(set_strjoin(m, "xxx", true, &joined) >= 0); + assert_se(STR_IN_SET(joined, "xxxaaaxxxbbbxxx", "xxxbbbxxxaaaxxx")); +} + +TEST(set_equal) { + _cleanup_set_free_ Set *a = NULL, *b = NULL; + void *p; + int r; + + assert_se(a = set_new(NULL)); + assert_se(b = set_new(NULL)); + + assert_se(set_equal(a, a)); + assert_se(set_equal(b, b)); + assert_se(set_equal(a, b)); + assert_se(set_equal(b, a)); + assert_se(set_equal(NULL, a)); + assert_se(set_equal(NULL, b)); + assert_se(set_equal(a, NULL)); + assert_se(set_equal(b, NULL)); + assert_se(set_equal(NULL, NULL)); + + for (unsigned i = 0; i < 333; i++) { + p = INT32_TO_PTR(1 + (random_u32() & 0xFFFU)); + + r = set_put(a, p); + assert_se(r >= 0 || r == -EEXIST); + } + + assert_se(set_put(a, INT32_TO_PTR(0x1000U)) >= 0); + + assert_se(set_size(a) >= 2); + assert_se(set_size(a) <= 334); + + assert_se(!set_equal(a, b)); + assert_se(!set_equal(b, a)); + assert_se(!set_equal(a, NULL)); + + SET_FOREACH(p, a) + assert_se(set_put(b, p) >= 0); + + assert_se(set_equal(a, b)); + assert_se(set_equal(b, a)); + + assert_se(set_remove(a, INT32_TO_PTR(0x1000U)) == INT32_TO_PTR(0x1000U)); + + assert_se(!set_equal(a, b)); + assert_se(!set_equal(b, a)); + + assert_se(set_remove(b, INT32_TO_PTR(0x1000U)) == INT32_TO_PTR(0x1000U)); + + assert_se(set_equal(a, b)); + 
assert_se(set_equal(b, a)); + + assert_se(set_put(b, INT32_TO_PTR(0x1001U)) >= 0); + + assert_se(!set_equal(a, b)); + assert_se(!set_equal(b, a)); + + assert_se(set_put(a, INT32_TO_PTR(0x1001U)) >= 0); + + assert_se(set_equal(a, b)); + assert_se(set_equal(b, a)); + + set_clear(a); + + assert_se(!set_equal(a, b)); + assert_se(!set_equal(b, a)); + + set_clear(b); + + assert_se(set_equal(a, b)); + assert_se(set_equal(b, a)); +} + +TEST(set_fnmatch) { + _cleanup_set_free_ Set *match = NULL, *nomatch = NULL; + + assert_se(set_put_strdup(&match, "aaa") >= 0); + assert_se(set_put_strdup(&match, "bbb*") >= 0); + assert_se(set_put_strdup(&match, "*ccc") >= 0); + + assert_se(set_put_strdup(&nomatch, "a*") >= 0); + assert_se(set_put_strdup(&nomatch, "bbb") >= 0); + assert_se(set_put_strdup(&nomatch, "ccc*") >= 0); + + assert_se(set_fnmatch(NULL, NULL, "")); + assert_se(set_fnmatch(NULL, NULL, "hoge")); + + assert_se(set_fnmatch(match, NULL, "aaa")); + assert_se(set_fnmatch(match, NULL, "bbb")); + assert_se(set_fnmatch(match, NULL, "bbbXXX")); + assert_se(set_fnmatch(match, NULL, "ccc")); + assert_se(set_fnmatch(match, NULL, "XXXccc")); + assert_se(!set_fnmatch(match, NULL, "")); + assert_se(!set_fnmatch(match, NULL, "aaaa")); + assert_se(!set_fnmatch(match, NULL, "XXbbb")); + assert_se(!set_fnmatch(match, NULL, "cccXX")); + + assert_se(set_fnmatch(NULL, nomatch, "")); + assert_se(set_fnmatch(NULL, nomatch, "Xa")); + assert_se(set_fnmatch(NULL, nomatch, "bbbb")); + assert_se(set_fnmatch(NULL, nomatch, "XXXccc")); + assert_se(!set_fnmatch(NULL, nomatch, "a")); + assert_se(!set_fnmatch(NULL, nomatch, "aXXXX")); + assert_se(!set_fnmatch(NULL, nomatch, "bbb")); + assert_se(!set_fnmatch(NULL, nomatch, "ccc")); + assert_se(!set_fnmatch(NULL, nomatch, "cccXXX")); + + assert_se(set_fnmatch(match, nomatch, "bbbbb")); + assert_se(set_fnmatch(match, nomatch, "XXccc")); + assert_se(!set_fnmatch(match, nomatch, "")); + assert_se(!set_fnmatch(match, nomatch, "a")); + 
assert_se(!set_fnmatch(match, nomatch, "aaa")); + assert_se(!set_fnmatch(match, nomatch, "b")); + assert_se(!set_fnmatch(match, nomatch, "bbb")); + assert_se(!set_fnmatch(match, nomatch, "ccc")); + assert_se(!set_fnmatch(match, nomatch, "ccccc")); + assert_se(!set_fnmatch(match, nomatch, "cccXX")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-sha256.c b/src/test/test-sha256.c new file mode 100644 index 0000000..f168e4c --- /dev/null +++ b/src/test/test-sha256.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "hexdecoct.h" +#include "sha256.h" +#include "string-util.h" +#include "tests.h" + +static void sha256_process_string(const char *key, struct sha256_ctx *ctx) { + sha256_process_bytes(key, strlen(key), ctx); +} + +static void test_sha256_one(const char *key, const char *expect) { + uint8_t result[SHA256_DIGEST_SIZE + 3]; + _cleanup_free_ char *str = NULL; + struct sha256_ctx ctx; + + log_debug("\"%s\" → %s", key, expect); + + assert_se(str = new(char, strlen(key) + 4)); + + /* This tests unaligned buffers. 
*/ + + for (size_t i = 0; i < 4; i++) { + strcpy(str + i, key); + + for (size_t j = 0; j < 4; j++) { + _cleanup_free_ char *hex_result = NULL; + + sha256_init_ctx(&ctx); + sha256_process_string(str + i, &ctx); + sha256_finish_ctx(&ctx, result + j); + + hex_result = hexmem(result + j, SHA256_DIGEST_SIZE); + assert_se(streq_ptr(hex_result, expect)); + } + } +} + +TEST(sha256) { /* hashes three fixed inputs (ASCII, UTF-8, 100 digits) and compares against precomputed hex digests */ + /* Results compared with output of 'echo -n "<input>" | sha256sum -' */ + + test_sha256_one("abcdefghijklmnopqrstuvwxyz", + "71c480df93d6ae2f1efad1447c66c9525e316218cf51fc8d9ed832f2daf18b73"); + test_sha256_one("ほげほげあっちょんぶりけ", + "ce7225683653be3b74861c5a4323b6baf3c3ceb361413ca99e3a5b52c04411bd"); + test_sha256_one("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789", + "9cfe7faff7054298ca87557e15a10262de8d3eee77827417fbdfea1c41b9ec23"); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-sigbus.c b/src/test/test-sigbus.c new file mode 100644 index 0000000..299463c --- /dev/null +++ b/src/test/test-sigbus.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fcntl.h> +#include <sys/mman.h> +#include <unistd.h> + +#if HAVE_VALGRIND_VALGRIND_H +# include <valgrind/valgrind.h> +#endif + +#include "fd-util.h" +#include "fs-util.h" +#include "memory-util.h" +#include "sigbus.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_close_ int fd = -EBADF; + char template[] = "/tmp/sigbus-test-XXXXXX"; + void *addr = NULL; + uint8_t *p; + + test_setup_logging(LOG_INFO); + +#if HAS_FEATURE_ADDRESS_SANITIZER + return log_tests_skipped("address-sanitizer is enabled"); +#endif +#if HAVE_VALGRIND_VALGRIND_H + if (RUNNING_ON_VALGRIND) + return log_tests_skipped("This test cannot run on valgrind"); +#endif + + sigbus_install(); + + assert_se(sigbus_pop(&addr) == 0); + + assert_se((fd = mkostemp(template, O_CLOEXEC)) >= 0); /* O_RDWR|O_CREAT|O_EXCL are implied by mkostemp(); passing them explicitly fails on some systems */ + assert_se(unlink(template) >= 0); + assert_se(posix_fallocate_loop(fd, 0, page_size() * 8) >= 0); /* back only the first 8 pages with file data */ + + p = mmap(NULL, page_size() *
16, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + assert_se(p != MAP_FAILED); + + assert_se(sigbus_pop(&addr) == 0); + + p[0] = 0xFF; + assert_se(sigbus_pop(&addr) == 0); + + p[page_size()] = 0xFF; + assert_se(sigbus_pop(&addr) == 0); + + p[page_size()*8] = 0xFF; + p[page_size()*8+1] = 0xFF; + p[page_size()*10] = 0xFF; + assert_se(sigbus_pop(&addr) > 0); + assert_se(addr == p + page_size() * 8); + assert_se(sigbus_pop(&addr) > 0); + assert_se(addr == p + page_size() * 10); + assert_se(sigbus_pop(&addr) == 0); + + sigbus_reset(); +} diff --git a/src/test/test-signal-util.c b/src/test/test-signal-util.c new file mode 100644 index 0000000..335066a --- /dev/null +++ b/src/test/test-signal-util.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "log.h" +#include "macro.h" +#include "signal-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "tests.h" +#include "process-util.h" + +#define info(sig) log_info(#sig " = " STRINGIFY(sig) " = %d", sig) + +TEST(rt_signals) { + info(SIGRTMIN); + info(SIGRTMAX); + + /* We use signals SIGRTMIN+0 to SIGRTMIN+24 unconditionally */ + assert_se(SIGRTMAX - SIGRTMIN >= 24); +} + +static void test_signal_to_string_one(int val) { /* val -> name -> val round trip, with and without a "SIG" prefix on the name */ + const char *p; + + assert_se(p = signal_to_string(val)); + + assert_se(signal_from_string(p) == val); + + p = strjoina("SIG", p); + assert_se(signal_from_string(p) == val); +} + +static void test_signal_from_string_one(const char *s, int val) { /* s and "SIG"+s must both parse to val; callers also pass negative errno values as the expected result */ + const char *p; + + assert_se(signal_from_string(s) == val); + + p = strjoina("SIG", s); + assert_se(signal_from_string(p) == val); +} + +static void test_signal_from_string_number(const char *s, int val) { /* numeric string s must parse to val, but the same string prefixed with "SIG" must be rejected with -EINVAL */ + const char *p; + + assert_se(signal_from_string(s) == val); + + p = strjoina("SIG", s); + assert_se(signal_from_string(p) == -EINVAL); +} + +TEST(signal_from_string) { + char buf[STRLEN("RTMIN+") + DECIMAL_STR_MAX(int) + 1]; + + test_signal_to_string_one(SIGHUP); + test_signal_to_string_one(SIGTERM); +
test_signal_to_string_one(SIGRTMIN); + test_signal_to_string_one(SIGRTMIN+3); + test_signal_to_string_one(SIGRTMAX-4); + + test_signal_from_string_one("RTMIN", SIGRTMIN); + test_signal_from_string_one("RTMAX", SIGRTMAX); + + xsprintf(buf, "RTMIN+%d", SIGRTMAX-SIGRTMIN); + test_signal_from_string_one(buf, SIGRTMAX); + + xsprintf(buf, "RTMIN+%d", INT_MAX); + test_signal_from_string_one(buf, -ERANGE); + + xsprintf(buf, "RTMAX-%d", SIGRTMAX-SIGRTMIN); + test_signal_from_string_one(buf, SIGRTMIN); + + xsprintf(buf, "RTMAX-%d", INT_MAX); + test_signal_from_string_one(buf, -ERANGE); + + test_signal_from_string_one("", -EINVAL); + test_signal_from_string_one("hup", -EINVAL); + test_signal_from_string_one("HOGEHOGE", -EINVAL); + + test_signal_from_string_one("RTMIN-5", -EINVAL); + test_signal_from_string_one("RTMIN- 5", -EINVAL); + test_signal_from_string_one("RTMIN -5", -EINVAL); + test_signal_from_string_one("RTMIN+ 5", -EINVAL); + test_signal_from_string_one("RTMIN +5", -EINVAL); + test_signal_from_string_one("RTMIN+100", -ERANGE); + test_signal_from_string_one("RTMIN+-3", -EINVAL); + test_signal_from_string_one("RTMIN++3", -EINVAL); + test_signal_from_string_one("RTMIN+HUP", -EINVAL); + test_signal_from_string_one("RTMIN3", -EINVAL); + + test_signal_from_string_one("RTMAX+5", -EINVAL); + test_signal_from_string_one("RTMAX+ 5", -EINVAL); + test_signal_from_string_one("RTMAX +5", -EINVAL); + test_signal_from_string_one("RTMAX- 5", -EINVAL); + test_signal_from_string_one("RTMAX -5", -EINVAL); + test_signal_from_string_one("RTMAX-100", -ERANGE); + test_signal_from_string_one("RTMAX-+3", -EINVAL); + test_signal_from_string_one("RTMAX--3", -EINVAL); + test_signal_from_string_one("RTMAX-HUP", -EINVAL); + + test_signal_from_string_number("3", 3); + test_signal_from_string_number("+5", 5); + test_signal_from_string_number(" +5", 5); + test_signal_from_string_number("10000", -ERANGE); + test_signal_from_string_number("-2", -ERANGE); +} + +TEST(block_signals) { + 
assert_se(signal_is_blocked(SIGUSR1) == 0); + assert_se(signal_is_blocked(SIGALRM) == 0); + assert_se(signal_is_blocked(SIGVTALRM) == 0); + + { + BLOCK_SIGNALS(SIGUSR1, SIGVTALRM); + + assert_se(signal_is_blocked(SIGUSR1) > 0); + assert_se(signal_is_blocked(SIGALRM) == 0); + assert_se(signal_is_blocked(SIGVTALRM) > 0); + } + + assert_se(signal_is_blocked(SIGUSR1) == 0); + assert_se(signal_is_blocked(SIGALRM) == 0); + assert_se(signal_is_blocked(SIGVTALRM) == 0); +} + +TEST(ignore_signals) { + assert_se(ignore_signals(SIGINT) >= 0); + assert_se(kill(getpid_cached(), SIGINT) >= 0); + assert_se(ignore_signals(SIGUSR1, SIGUSR2, SIGTERM, SIGPIPE) >= 0); + assert_se(kill(getpid_cached(), SIGUSR1) >= 0); + assert_se(kill(getpid_cached(), SIGUSR2) >= 0); + assert_se(kill(getpid_cached(), SIGTERM) >= 0); + assert_se(kill(getpid_cached(), SIGPIPE) >= 0); + assert_se(default_signals(SIGINT, SIGUSR1, SIGUSR2, SIGTERM, SIGPIPE) >= 0); +} + +TEST(pop_pending_signal) { + + assert_se(signal_is_blocked(SIGUSR1) == 0); + assert_se(signal_is_blocked(SIGUSR2) == 0); + assert_se(pop_pending_signal(SIGUSR1) == 0); + assert_se(pop_pending_signal(SIGUSR2) == 0); + + { + BLOCK_SIGNALS(SIGUSR1, SIGUSR2); + + assert_se(signal_is_blocked(SIGUSR1) > 0); + assert_se(signal_is_blocked(SIGUSR2) > 0); + + assert_se(pop_pending_signal(SIGUSR1) == 0); + assert_se(pop_pending_signal(SIGUSR2) == 0); + + assert_se(raise(SIGUSR1) >= 0); + + assert_se(pop_pending_signal(SIGUSR2) == 0); + assert_se(pop_pending_signal(SIGUSR1) == SIGUSR1); + assert_se(pop_pending_signal(SIGUSR1) == 0); + + assert_se(raise(SIGUSR1) >= 0); + assert_se(raise(SIGUSR2) >= 0); + + assert_cc(SIGUSR1 < SIGUSR2); + + assert_se(pop_pending_signal(SIGUSR1, SIGUSR2) == SIGUSR1); + assert_se(pop_pending_signal(SIGUSR1, SIGUSR2) == SIGUSR2); + assert_se(pop_pending_signal(SIGUSR1, SIGUSR2) == 0); + } + + assert_se(signal_is_blocked(SIGUSR1) == 0); + assert_se(signal_is_blocked(SIGUSR2) == 0); + assert_se(pop_pending_signal(SIGUSR1) == 
0); + assert_se(pop_pending_signal(SIGUSR2) == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-siphash24.c b/src/test/test-siphash24.c new file mode 100644 index 0000000..de91eb2 --- /dev/null +++ b/src/test/test-siphash24.c @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "memory-util.h" +#include "siphash24.h" +#include "tests.h" + +#define ITERATIONS 10000000ULL + +static void test_alignment_one(const uint8_t *in, size_t len, const uint8_t *key) { + struct siphash state = {}; + uint64_t out; + unsigned i, j; + + out = siphash24(in, len, key); + assert_se(out == 0xa129ca6149be45e5); + + /* verify the internal state as given in the above paper */ + siphash24_init(&state, key); + assert_se(state.v0 == 0x7469686173716475); + assert_se(state.v1 == 0x6b617f6d656e6665); + assert_se(state.v2 == 0x6b7f62616d677361); + assert_se(state.v3 == 0x7b6b696e727e6c7b); + siphash24_compress(in, len, &state); + assert_se(state.v0 == 0x4a017198de0a59e0); + assert_se(state.v1 == 0x0d52f6f62a4f59a4); + assert_se(state.v2 == 0x634cb3577b01fd3d); + assert_se(state.v3 == 0xa5224d6f55c7d9c8); + out = siphash24_finalize(&state); + assert_se(out == 0xa129ca6149be45e5); + assert_se(state.v0 == 0xf6bcd53893fecff1); + assert_se(state.v1 == 0x54b9964c7ea0d937); + assert_se(state.v2 == 0x1b38329c099bb55a); + assert_se(state.v3 == 0x1814bb89ad7be679); + + /* verify that decomposing the input in three chunks gives the + same result */ + for (i = 0; i < len; i++) { + for (j = i; j < len; j++) { + siphash24_init(&state, key); + siphash24_compress(in, i, &state); + siphash24_compress(&in[i], j - i, &state); + siphash24_compress(&in[j], len - j, &state); + out = siphash24_finalize(&state); + assert_se(out == 0xa129ca6149be45e5); + } + } +} + +TEST(alignment) { + const uint8_t in[15] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e }; + const uint8_t key[16] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + uint8_t in_buf[20]; + + /* Test with same input but different alignments. */ + memcpy(in_buf, in, sizeof(in)); + test_alignment_one(in_buf, sizeof(in), key); + memcpy(in_buf + 1, in, sizeof(in)); + test_alignment_one(in_buf + 1, sizeof(in), key); + memcpy(in_buf + 2, in, sizeof(in)); + test_alignment_one(in_buf + 2, sizeof(in), key); + memcpy(in_buf + 4, in, sizeof(in)); + test_alignment_one(in_buf + 4, sizeof(in), key); +} + +TEST(short_hashes) { + const uint8_t one[] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; + const uint8_t key[16] = { 0x22, 0x24, 0x41, 0x22, 0x55, 0x77, 0x88, 0x07, + 0x23, 0x09, 0x23, 0x14, 0x0c, 0x33, 0x0e, 0x0f}; + uint8_t two[sizeof one] = {}; + + struct siphash state1 = {}, state2 = {}; + unsigned i, j; + + siphash24_init(&state1, key); + siphash24_init(&state2, key); + + /* hashing 1, 2, 3, 4, 5, ..., 16 bytes, with the byte after the buffer different */ + for (i = 1; i <= sizeof one; i++) { + siphash24_compress(one, i, &state1); + + two[i-1] = one[i-1]; + siphash24_compress(two, i, &state2); + + assert_se(memcmp(&state1, &state2, sizeof state1) == 0); + } + + /* hashing n and 1, n and 2, n and 3, ..., n-1 and 1, n-2 and 2, ... 
*/ + for (i = sizeof one; i > 0; i--) { + zero(two); + + for (j = 1; j <= sizeof one; j++) { + siphash24_compress(one, i, &state1); + siphash24_compress(one, j, &state1); + + siphash24_compress(one, i, &state2); + two[j-1] = one[j-1]; + siphash24_compress(two, j, &state2); + + assert_se(memcmp(&state1, &state2, sizeof state1) == 0); + } + } +} + +/* see https://131002.net/siphash/siphash.pdf, Appendix A */ +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-sizeof.c b/src/test/test-sizeof.c new file mode 100644 index 0000000..ea0c587 --- /dev/null +++ b/src/test/test-sizeof.c @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#define __STDC_WANT_IEC_60559_TYPES_EXT__ +#include + +#include "time-util.h" + +/* Print information about various types. Useful when diagnosing + * gcc diagnostics on an unfamiliar architecture. */ + +DISABLE_WARNING_TYPE_LIMITS; + +#define info_no_sign(t) \ + printf("%s → %zu bits, %zu byte alignment\n", STRINGIFY(t), \ + sizeof(t)*CHAR_BIT, \ + alignof(t)) + +#define info(t) \ + printf("%s → %zu bits%s, %zu byte alignment\n", STRINGIFY(t), \ + sizeof(t)*CHAR_BIT, \ + strstr(STRINGIFY(t), "signed") ? "" : \ + (t)-1 < (t)0 ? 
", signed" : ", unsigned", \ + alignof(t)) + +#define check_no_sign(t, size) \ + do { \ + info_no_sign(t); \ + assert_se(sizeof(t) == size); \ + } while (false) + +#define check(t, size) \ + do { \ + info(t); \ + assert_se(sizeof(t) == size); \ + } while (false) + +enum Enum { + enum_value, +}; + +enum BigEnum { + big_enum_value = UINT64_C(1), +}; + +enum BigEnum2 { + big_enum2_pos = UINT64_C(1), + big_enum2_neg = UINT64_C(-1), +}; + +int main(void) { + int (*function_pointer)(void); + + check_no_sign(dev_t, SIZEOF_DEV_T); + check_no_sign(ino_t, SIZEOF_INO_T); + check_no_sign(rlim_t, SIZEOF_RLIM_T); + check(time_t, SIZEOF_TIME_T); + check(typeof(((struct timex *)0)->freq), SIZEOF_TIMEX_MEMBER); + + info_no_sign(typeof(function_pointer)); + info_no_sign(void*); + info(char*); + + info(char); + info(signed char); + info(unsigned char); + info(short unsigned); + info(unsigned); + info(unsigned long); + info(unsigned long long); +#ifdef __GLIBC__ + info(__syscall_ulong_t); + info(__syscall_slong_t); +#endif + info(intmax_t); + info(uintmax_t); + + info(float); + info(double); + info(long double); + +#ifdef FLT128_MAX + info(_Float128); + info(_Float64); + info(_Float64x); + info(_Float32); + info(_Float32x); +#endif + + info(size_t); + info(ssize_t); + info(usec_t); +#ifdef __GLIBC__ + info(__time_t); +#endif + info(pid_t); + info(uid_t); + info(gid_t); + info(socklen_t); + +#ifdef __GLIBC__ + info(__cpu_mask); +#endif + + info(enum Enum); + info(enum BigEnum); + info(enum BigEnum2); + assert_cc(sizeof(enum BigEnum2) == 8); + printf("big_enum2_pos → %zu\n", sizeof(big_enum2_pos)); + printf("big_enum2_neg → %zu\n", sizeof(big_enum2_neg)); + + printf("timeval: %zu\n", sizeof(struct timeval)); + printf("timespec: %zu\n", sizeof(struct timespec)); + + void *x = malloc(100); + + printf("local variable: %p\n", &function_pointer); + printf("glibc function: %p\n", memcpy); + printf("heap allocation: %p\n", x); + free(x); + + return 0; +} diff --git 
a/src/test/test-sleep-config.c b/src/test/test-sleep-config.c
new file mode 100644
index 0000000..112fec6
--- /dev/null
+++ b/src/test/test-sleep-config.c
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "efivars.h"
+#include "errno-util.h"
+#include "log.h"
+#include "sleep-config.h"
+#include "strv.h"
+#include "tests.h"
+
+TEST(parse_sleep_config) { /* Parses the sleep configuration and dumps its allow flags and mode/state lists at debug level. */
+        _cleanup_(sleep_config_freep) SleepConfig *sleep_config = NULL;
+
+        assert_se(parse_sleep_config(&sleep_config) == 0);
+
+        _cleanup_free_ char *sum = NULL, *sus = NULL, *him = NULL, *his = NULL, *hym = NULL, *hys = NULL;
+
+        sum = strv_join(sleep_config->modes[SLEEP_SUSPEND], ", "); /* join each string list with ", " for logging only */
+        sus = strv_join(sleep_config->states[SLEEP_SUSPEND], ", ");
+        him = strv_join(sleep_config->modes[SLEEP_HIBERNATE], ", ");
+        his = strv_join(sleep_config->states[SLEEP_HIBERNATE], ", ");
+        hym = strv_join(sleep_config->modes[SLEEP_HYBRID_SLEEP], ", ");
+        hys = strv_join(sleep_config->states[SLEEP_HYBRID_SLEEP], ", ");
+        log_debug(" allow_suspend: %s", yes_no(sleep_config->allow[SLEEP_SUSPEND]));
+        log_debug(" allow_hibernate: %s", yes_no(sleep_config->allow[SLEEP_HIBERNATE]));
+        log_debug(" allow_s2h: %s", yes_no(sleep_config->allow[SLEEP_SUSPEND_THEN_HIBERNATE]));
+        log_debug(" allow_hybrid_sleep: %s", yes_no(sleep_config->allow[SLEEP_HYBRID_SLEEP]));
+        log_debug(" suspend modes: %s", sum);
+        log_debug(" states: %s", sus);
+        log_debug(" hibernate modes: %s", him);
+        log_debug(" states: %s", his);
+        log_debug(" hybrid modes: %s", hym);
+        log_debug(" states: %s", hys);
+}
+
+TEST(sleep_supported) { /* Logs what the sleep_*_supported() helpers report; no assertion is made on the answers. */
+        _cleanup_strv_free_ char
+                **standby = strv_new("standby"),
+                **mem = strv_new("mem"),
+                **disk = strv_new("disk"),
+                **suspend = strv_new("suspend"),
+                **reboot = strv_new("reboot"),
+                **platform = strv_new("platform"),
+                **shutdown = strv_new("shutdown"),
+                **freeze = strv_new("freeze");
+        int r;
+
+        printf("Secure boot: %sd\n", enable_disable(is_efi_secure_boot()));
+
+        log_info("/= individual sleep modes =/");
+        log_info("Standby configured: %s", yes_no(sleep_state_supported(standby) > 0));
+        log_info("Suspend configured: %s", yes_no(sleep_state_supported(mem) > 0));
+        log_info("Hibernate configured: %s", yes_no(sleep_state_supported(disk) > 0));
+        log_info("Hibernate+Suspend (Hybrid-Sleep) configured: %s", yes_no(sleep_mode_supported(suspend) > 0));
+        log_info("Hibernate+Reboot configured: %s", yes_no(sleep_mode_supported(reboot) > 0));
+        log_info("Hibernate+Platform configured: %s", yes_no(sleep_mode_supported(platform) > 0));
+        log_info("Hibernate+Shutdown configured: %s", yes_no(sleep_mode_supported(shutdown) > 0));
+        log_info("Freeze configured: %s", yes_no(sleep_state_supported(freeze) > 0));
+
+        log_info("/= high-level sleep verbs =/");
+        r = sleep_supported(SLEEP_SUSPEND); /* a negative r is printed as an error string instead of yes/no */
+        log_info("Suspend configured and possible: %s", r >= 0 ? yes_no(r) : STRERROR(r));
+        r = sleep_supported(SLEEP_HIBERNATE);
+        log_info("Hibernation configured and possible: %s", r >= 0 ? yes_no(r) : STRERROR(r));
+        r = sleep_supported(SLEEP_HYBRID_SLEEP);
+        log_info("Hybrid-sleep configured and possible: %s", r >= 0 ? yes_no(r) : STRERROR(r));
+        r = sleep_supported(SLEEP_SUSPEND_THEN_HIBERNATE);
+        log_info("Suspend-then-Hibernate configured and possible: %s", r >= 0 ? yes_no(r) : STRERROR(r));
+}
+
+static int intro(void) { /* Warns when not running as uid 0; always returns EXIT_SUCCESS. */
+        if (getuid() != 0)
+                log_warning("This program is unlikely to work for unprivileged users");
+
+        return EXIT_SUCCESS;
+}
+
+DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro);
diff --git a/src/test/test-socket-bind.c b/src/test/test-socket-bind.c
new file mode 100644
index 0000000..84a8978
--- /dev/null
+++ b/src/test/test-socket-bind.c
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-socket-bind.h"
+#include "load-fragment.h"
+#include "manager.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#include "service.h"
+#include "strv.h"
+#include "tests.h"
+#include "unit.h"
+#include "virt.h"
+
+static int find_netcat_executable(char **ret_path) { /* Tries "ncat", "nc", "netcat" in order; stops at the first find_executable() == 0, else returns the last result. */
+        char **candidates = STRV_MAKE("ncat", "nc", "netcat");
+        int r = 0;
+
+        STRV_FOREACH(c, candidates) {
+                r = find_executable(*c, ret_path);
+                if (r == 0)
+                        break;
+        }
+
+        return r;
+}
+
+static int test_socket_bind( /* Runs netcat listening on `port` in a oneshot service with the given allow/deny rules; returns 0 if it exited and the service is dead. */
+                Manager *m,
+                const char *unit_name,
+                const char *netcat_path,
+                const char *port,
+                char **allow_rules,
+                char **deny_rules) {
+        _cleanup_free_ char *exec_start = NULL;
+        _cleanup_(unit_freep) Unit *u = NULL;
+        CGroupContext *cc = NULL;
+        int cld_code, r;
+
+        assert_se(u = unit_new(m, sizeof(Service)));
+        assert_se(unit_add_name(u, unit_name) == 0);
+        assert_se(cc = unit_get_cgroup_context(u));
+
+        STRV_FOREACH(rule, allow_rules) { /* feed every allow rule through the SocketBindAllow= parser */
+                r = config_parse_cgroup_socket_bind(
+                                u->id, "filename", 1, "Service", 1, "SocketBindAllow", 0,
+                                *rule, &cc->socket_bind_allow, u);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to parse SocketBindAllow: %m");
+        }
+
+        fprintf(stderr, "SocketBindAllow: ");
+        cgroup_context_dump_socket_bind_items(cc->socket_bind_allow, stderr);
+        fputc('\n', stderr);
+
+        STRV_FOREACH(rule, deny_rules) { /* same for the SocketBindDeny= rules */
+                r = config_parse_cgroup_socket_bind(
+                                u->id, "filename", 1, "Service", 1, "SocketBindDeny", 0,
+                                *rule, &cc->socket_bind_deny, u);
+                if (r < 0)
+                        return log_unit_error_errno(u, r, "Failed to parse SocketBindDeny: %m");
+        }
+
+        fprintf(stderr, "SocketBindDeny: ");
+        cgroup_context_dump_socket_bind_items(cc->socket_bind_deny, stderr);
+        fputc('\n', stderr);
+
+        exec_start = strjoin("-timeout --preserve-status -sSIGTERM 1s ", netcat_path, " -l ", port, " -vv"); /* netcat is wrapped in timeout(1) so the listener is stopped after 1s */
+        assert_se(exec_start != NULL);
+
+        r = config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart",
+                        SERVICE_EXEC_START, exec_start, SERVICE(u)->exec_command, u);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse ExecStart: %m");
+
+        SERVICE(u)->type = SERVICE_ONESHOT;
+        u->load_state = UNIT_LOADED;
+
+        r = unit_start(u, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Unit start failed %m");
+
+        while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) { /* pump the event loop until the service reaches a final state */
+                r = sd_event_run(m->event, UINT64_MAX);
+                if (r < 0)
+                        return log_error_errno(r, "Event run failed %m"); /* sd_event_run() reports failure via its negative return value, so log r rather than errno */
+        }
+
+        cld_code = SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code;
+        if (cld_code != CLD_EXITED)
+                return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "ExecStart didn't exit, code='%s'", sigchld_code_to_string(cld_code));
+
+        if (SERVICE(u)->state != SERVICE_DEAD)
+                return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Service is not dead");
+
+        return 0;
+}
+
+int main(int argc, char *argv[]) { /* Skips when prerequisites are missing, otherwise runs test_socket_bind() over several rule sets. */
+        _cleanup_free_ char *unit_dir = NULL, *netcat_path = NULL;
+        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
+        _cleanup_(manager_freep) Manager *m = NULL;
+        struct rlimit rl;
+        int r;
+
+        test_setup_logging(LOG_DEBUG);
+
+        if (detect_container() > 0)
+                return log_tests_skipped("test-socket-bind fails inside LXC and Docker containers: https://github.com/systemd/systemd/issues/9666");
+
+        assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0);
+        rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE);
+        (void) setrlimit_closest(RLIMIT_MEMLOCK, &rl); /* best effort: the result is deliberately ignored, can_memlock() below decides */
+
+        if (!can_memlock())
+                return log_tests_skipped("Can't use mlock()");
+
+        r = bpf_socket_bind_supported();
+        if (r <= 0)
+                return log_tests_skipped("socket-bind is not supported");
+
+        if (find_netcat_executable(&netcat_path) != 0)
+                return log_tests_skipped("Cannot find netcat executable");
+
+        r = enter_cgroup_subroot(NULL);
+        if (r == -ENOMEDIUM)
+                return log_tests_skipped("cgroupfs not available");
+
+        assert_se(get_testdata_dir("units", &unit_dir) >= 0);
+        assert_se(set_unit_path(unit_dir) >= 0);
+        assert_se(runtime_dir = setup_fake_runtime_dir());
+
+        assert_se(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m) >= 0);
+        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);
+
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "2000", STRV_MAKE("2000"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "2000", STRV_MAKE("ipv6:2001-2002"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "6666", STRV_MAKE("ipv4:6666", "6667"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "6666", STRV_MAKE("6667", "6668", ""), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "7777", STRV_MAKE_EMPTY, STRV_MAKE_EMPTY) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "8888", STRV_MAKE("any"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "8888", STRV_MAKE("ipv6:tcp:8888-8889"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "10000", STRV_MAKE("ipv6:udp:9999-10000"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "6666", STRV_MAKE("ipv4:tcp:6666"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "6666", STRV_MAKE("ipv4:udp:6666"), STRV_MAKE("any")) >= 0);
+        assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "6666", STRV_MAKE("tcp:6666"), STRV_MAKE("any")) >= 0);
+
+        return 0;
+}
diff --git a/src/test/test-socket-netlink.c b/src/test/test-socket-netlink.c
new file mode 100644
index 0000000..6dbd50f
--- /dev/null
+++ b/src/test/test-socket-netlink.c
@@ -0,0 +1,372 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "missing_network.h"
+#include "tests.h"
+#include "socket-netlink.h"
+#include "string-util.h"
+
+static void test_socket_address_parse_one(const char *in, int ret, int family, const char *expected) { /* Parses `in`, prints it back, and checks result code, family and printed form (`expected`, or `in` when NULL). */
+        SocketAddress a;
+        _cleanup_free_ char *out = NULL;
+        int r;
+
+        r = socket_address_parse(&a, in);
+        if (r >= 0) {
+                r = socket_address_print(&a, &out);
+                if (r < 0)
+                        log_error_errno(r, "Printing failed for \"%s\": %m", in);
+                assert_se(r >= 0);
+                assert_se(a.type == 0); /* a freshly parsed address must not carry a socket type yet */
+        }
+
+        log_info("\"%s\" → %s %d → \"%s\" (expect %d / \"%s\")",
+                 in,
+                 r >= 0 ? "✓" : "✗", r,
+                 empty_to_dash(out),
+                 ret,
+                 ret >= 0 ? expected ?: in : "-");
+        assert_se(r == ret);
+        if (r >= 0) {
+                assert_se(a.sockaddr.sa.sa_family == family);
+                assert_se(streq(out, expected ?: in));
+        }
+}
+
+TEST(socket_address_parse) { /* Table of accepted and rejected address strings for socket_address_parse(). */
+        test_socket_address_parse_one("junk", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("192.168.1.1", -EINVAL, 0, NULL);
+        test_socket_address_parse_one(".168.1.1", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("989.168.1.1", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("192.168.1.1:65536", -ERANGE, 0, NULL);
+        test_socket_address_parse_one("192.168.1.1:0", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("0", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("65536", -ERANGE, 0, NULL);
+
+        const int default_family = socket_ipv6_is_supported() ? AF_INET6 : AF_INET; /* a bare port is expected to print as the wildcard address of this family */
+
+        test_socket_address_parse_one("65535", 0, default_family,
+                                      default_family == AF_INET6 ? "[::]:65535": "0.0.0.0:65535");
+
+        /* The checks below will pass even if ipv6 is disabled in
+         * kernel. The underlying glibc's inet_pton() is just a string
+         * parser and doesn't make any syscalls. */
+
+        test_socket_address_parse_one("[::1]", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]8888", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("::1", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]:0", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]:65536", -ERANGE, 0, NULL);
+        test_socket_address_parse_one("[a:b:1]:8888", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]%lo:1234", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]%lo:0", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]%lo", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]%lo%lo:1234", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]% lo:1234", -EINVAL, 0, NULL);
+
+        test_socket_address_parse_one("8888", 0, default_family,
+                                      default_family == AF_INET6 ? "[::]:8888": "0.0.0.0:8888");
+        test_socket_address_parse_one("[2001:0db8:0000:85a3:0000:0000:ac1f:8001]:8888", 0, AF_INET6,
+                                      "[2001:db8:0:85a3::ac1f:8001]:8888");
+        test_socket_address_parse_one("[::1]:8888", 0, AF_INET6, NULL);
+        test_socket_address_parse_one("[::1]:1234%lo", 0, AF_INET6, NULL);
+        test_socket_address_parse_one("[::1]:0%lo", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]%lo", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]:1234%lo%lo", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("[::1]:1234%xxxxasdf", -ENODEV, 0, NULL);
+        test_socket_address_parse_one("192.168.1.254:8888", 0, AF_INET, NULL);
+        test_socket_address_parse_one("/foo/bar", 0, AF_UNIX, NULL);
+        test_socket_address_parse_one("/", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("@abstract", 0, AF_UNIX, NULL);
+
+        { /* abstract name of SUN_PATH_LEN characters is rejected; one character shorter is accepted */
+                char aaa[SUN_PATH_LEN + 1] = "@";
+
+                memset(aaa + 1, 'a', SUN_PATH_LEN - 1);
+                char_array_0(aaa);
+
+                test_socket_address_parse_one(aaa, -EINVAL, 0, NULL);
+
+                aaa[SUN_PATH_LEN - 1] = '\0';
+                test_socket_address_parse_one(aaa, 0, AF_UNIX, NULL);
+        }
+
+        test_socket_address_parse_one("vsock:2:1234", 0, AF_VSOCK, NULL);
+        test_socket_address_parse_one("vsock::1234", 0, AF_VSOCK, NULL);
+        test_socket_address_parse_one("vsock:2:1234x", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("vsock:2x:1234", -EINVAL, 0, NULL);
+        test_socket_address_parse_one("vsock:2", -EINVAL, 0, NULL);
+}
+
+TEST(socket_address_parse_netlink) { /* Checks family, group and protocol produced by socket_address_parse_netlink(), plus rejected inputs. */
+        SocketAddress a;
+
+        assert_se(socket_address_parse_netlink(&a, "junk") < 0);
+        assert_se(socket_address_parse_netlink(&a, "") < 0);
+
+        assert_se(socket_address_parse_netlink(&a, "route") >= 0);
+        assert_se(a.sockaddr.nl.nl_family == AF_NETLINK);
+        assert_se(a.sockaddr.nl.nl_groups == 0);
+        assert_se(a.protocol == NETLINK_ROUTE);
+        assert_se(socket_address_parse_netlink(&a, "route") >= 0);
+        assert_se(socket_address_parse_netlink(&a, "route 10") >= 0);
+        assert_se(a.sockaddr.nl.nl_family == AF_NETLINK);
+        assert_se(a.sockaddr.nl.nl_groups == 10);
+        assert_se(a.protocol == NETLINK_ROUTE);
+
+        /* With spaces and tabs */
+        assert_se(socket_address_parse_netlink(&a, " kobject-uevent ") >= 0);
+        assert_se(a.sockaddr.nl.nl_family == AF_NETLINK);
+        assert_se(a.sockaddr.nl.nl_groups == 0);
+        assert_se(a.protocol == NETLINK_KOBJECT_UEVENT);
+        assert_se(socket_address_parse_netlink(&a, " \t kobject-uevent \t 10") >= 0);
+        assert_se(a.sockaddr.nl.nl_family == AF_NETLINK);
+        assert_se(a.sockaddr.nl.nl_groups == 10);
+        assert_se(a.protocol == NETLINK_KOBJECT_UEVENT);
+        assert_se(socket_address_parse_netlink(&a, "kobject-uevent\t10") >= 0);
+        assert_se(a.sockaddr.nl.nl_family == AF_NETLINK);
+        assert_se(a.sockaddr.nl.nl_groups == 10);
+        assert_se(a.protocol == NETLINK_KOBJECT_UEVENT);
+
+        /* trailing space is not supported */
+        assert_se(socket_address_parse_netlink(&a, "kobject-uevent\t10 ") < 0);
+
+        /* Group must be unsigned */
+        assert_se(socket_address_parse_netlink(&a, "kobject-uevent -1") < 0);
+
+        /* oss-fuzz #6884 */
+        assert_se(socket_address_parse_netlink(&a, "\xff") < 0);
+}
+
+TEST(socket_address_equal) { /* Addresses differing in port, host or family compare unequal; identical ones of each family compare equal. */
+        SocketAddress a, b;
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0); /* different port */
+        assert_se(socket_address_parse(&b, "192.168.1.1:888") >= 0);
+        assert_se(!socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0); /* different host */
+        assert_se(socket_address_parse(&b, "192.16.1.1:8888") >= 0);
+        assert_se(!socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0); /* explicit host vs. bare port */
+        assert_se(socket_address_parse(&b, "8888") >= 0);
+        assert_se(!socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0); /* IPv4 vs. filesystem path */
+        assert_se(socket_address_parse(&b, "/foo/bar/") >= 0);
+        assert_se(!socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0); /* from here on: identical inputs must compare equal */
+        assert_se(socket_address_parse(&b, "192.168.1.1:8888") >= 0);
+        assert_se(socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "/foo/bar") >= 0);
+        assert_se(socket_address_parse(&b, "/foo/bar") >= 0);
+        assert_se(socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "[::1]:8888") >= 0);
+        assert_se(socket_address_parse(&b, "[::1]:8888") >= 0);
+        assert_se(socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "@abstract") >= 0);
+        assert_se(socket_address_parse(&b, "@abstract") >= 0);
+        assert_se(socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse_netlink(&a, "firewall") >= 0);
+        assert_se(socket_address_parse_netlink(&b, "firewall") >= 0);
+        assert_se(socket_address_equal(&a, &b));
+
+        assert_se(socket_address_parse(&a, "vsock:2:1234") >= 0); /* vsock: equal, then differing port, then differing CID */
+        assert_se(socket_address_parse(&b, "vsock:2:1234") >= 0);
+        assert_se(socket_address_equal(&a, &b));
+        assert_se(socket_address_parse(&b, "vsock:2:1235") >= 0);
+        assert_se(!socket_address_equal(&a, &b));
+        assert_se(socket_address_parse(&b, "vsock:3:1234") >= 0);
+        assert_se(!socket_address_equal(&a, &b));
+}
+
+TEST(socket_address_get_path) { /* Only the filesystem path address yields a path; the other address kinds return NULL. */
+        SocketAddress a;
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0);
+        assert_se(!socket_address_get_path(&a));
+
+        assert_se(socket_address_parse(&a, "@abstract") >= 0);
+        assert_se(!socket_address_get_path(&a));
+
+        assert_se(socket_address_parse(&a, "[::1]:8888") >= 0);
+        assert_se(!socket_address_get_path(&a));
+
+        assert_se(socket_address_parse(&a, "/foo/bar") >= 0);
+        assert_se(streq(socket_address_get_path(&a), "/foo/bar"));
+
+        assert_se(socket_address_parse(&a, "vsock:2:1234") >= 0);
+        assert_se(!socket_address_get_path(&a));
+}
+
+TEST(socket_address_is) { /* socket_address_is() must match on both the address string and the socket type. */
+        SocketAddress a;
+
+        assert_se(socket_address_parse(&a, "192.168.1.1:8888") >= 0);
+        assert_se( socket_address_is(&a, "192.168.1.1:8888", 0 /* unspecified yet */));
+        assert_se(!socket_address_is(&a, "route", 0));
+        assert_se(!socket_address_is(&a, "route", SOCK_STREAM));
+        assert_se(!socket_address_is(&a, "192.168.1.1:8888", SOCK_RAW));
+        assert_se(!socket_address_is(&a, "192.168.1.1:8888", SOCK_STREAM));
+        a.type = SOCK_STREAM; /* once the type is set, the SOCK_STREAM query matches */
+        assert_se( socket_address_is(&a, "192.168.1.1:8888", SOCK_STREAM));
+}
+
+TEST(socket_address_is_netlink) { /* Matches only the same netlink family and group; an IP string or another group does not match. */
+        SocketAddress a;
+
+        assert_se(socket_address_parse_netlink(&a, "route 10") >= 0);
+        assert_se( socket_address_is_netlink(&a, "route 10"));
+        assert_se(!socket_address_is_netlink(&a, "192.168.1.1:8888"));
+        assert_se(!socket_address_is_netlink(&a, "route 1"));
+}
+
+static void test_in_addr_ifindex_to_string_one(int f, const char *a, int ifindex, const char *b) { /* Formats address `a` of family `f` with `ifindex`, expects `b`, then parses `b` back and compares. */
+        _cleanup_free_ char *r = NULL;
+        union in_addr_union ua, uuaa;
+        int ff, ifindex2;
+
+        assert_se(in_addr_from_string(f, a, &ua) >= 0);
+        assert_se(in_addr_ifindex_to_string(f, &ua, ifindex, &r) >= 0);
+        printf("test_in_addr_ifindex_to_string_one: %s == %s\n", b, r);
+        assert_se(streq(b, r));
+
+        assert_se(in_addr_ifindex_from_string_auto(b, &ff, &uuaa, &ifindex2) >= 0);
+        assert_se(ff == f);
+        assert_se(in_addr_equal(f, &ua, &uuaa));
+        assert_se(ifindex2 == ifindex || ifindex2 == 0); /* 0 is accepted because the printed form may omit the interface suffix */
+}
+
+TEST(in_addr_ifindex_to_string) { + test_in_addr_ifindex_to_string_one(AF_INET, "192.168.0.1", 7, "192.168.0.1"); + test_in_addr_ifindex_to_string_one(AF_INET, "10.11.12.13", 9, "10.11.12.13"); + test_in_addr_ifindex_to_string_one(AF_INET6, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 10, "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"); + test_in_addr_ifindex_to_string_one(AF_INET6, "::1", 11, "::1"); + test_in_addr_ifindex_to_string_one(AF_INET6, "fe80::", 12, "fe80::%12"); + test_in_addr_ifindex_to_string_one(AF_INET6, "fe80::", 0, "fe80::"); + test_in_addr_ifindex_to_string_one(AF_INET6, "fe80::14", 12, "fe80::14%12"); + test_in_addr_ifindex_to_string_one(AF_INET6, "fe80::15", -7, "fe80::15"); + test_in_addr_ifindex_to_string_one(AF_INET6, "fe80::16", LOOPBACK_IFINDEX, "fe80::16%1"); +} + +TEST(in_addr_ifindex_from_string_auto) { + int family, ifindex; + union in_addr_union ua; + + /* Most in_addr_ifindex_from_string_auto() invocations have already been tested above, but let's test some more */ + + assert_se(in_addr_ifindex_from_string_auto("fe80::17", &family, &ua, &ifindex) >= 0); + assert_se(family == AF_INET6); + assert_se(ifindex == 0); + + assert_se(in_addr_ifindex_from_string_auto("fe80::18%19", &family, &ua, &ifindex) >= 0); + assert_se(family == AF_INET6); + assert_se(ifindex == 19); + + assert_se(in_addr_ifindex_from_string_auto("fe80::18%lo", &family, &ua, &ifindex) >= 0); + assert_se(family == AF_INET6); + assert_se(ifindex == LOOPBACK_IFINDEX); + + assert_se(in_addr_ifindex_from_string_auto("fe80::19%thisinterfacecantexist", &family, &ua, &ifindex) == -ENODEV); +} + +static void test_in_addr_ifindex_name_from_string_auto_one(const char *a, const char *expected) { + int family, ifindex; + union in_addr_union ua; + _cleanup_free_ char *server_name = NULL; + + assert_se(in_addr_ifindex_name_from_string_auto(a, &family, &ua, &ifindex, &server_name) >= 0); + assert_se(streq_ptr(server_name, expected)); +} + +TEST(in_addr_ifindex_name_from_string_auto) { + 
test_in_addr_ifindex_name_from_string_auto_one("192.168.0.1", NULL);
+        test_in_addr_ifindex_name_from_string_auto_one("192.168.0.1#test.com", "test.com");
+        test_in_addr_ifindex_name_from_string_auto_one("fe80::18%19", NULL);
+        test_in_addr_ifindex_name_from_string_auto_one("fe80::18%19#another.test.com", "another.test.com");
+}
+
+/* Parses 'str' with in_addr_port_ifindex_name_from_string_auto() and compares the result with the expected
+ * family, port, ifindex and server name, then formats it back and compares with 'str_repr' (or with 'str'
+ * itself when 'str_repr' is NULL).
+ *
+ * The parse is then repeated with the port, ifindex and server-name output pointers set to NULL, one at a
+ * time. -EINVAL is expected when the matching expected value is set (port > 0, ifindex > 0, server_name
+ * non-NULL); otherwise the call must succeed and round-trip as before. */
+static void test_in_addr_port_ifindex_name_from_string_auto_one(const char *str, int family, uint16_t port, int ifindex,
+                                                                const char *server_name, const char *str_repr) {
+        union in_addr_union a;
+        uint16_t p;
+        int f, i;
+        char *fake; /* only handed to calls that are expected to fail */
+
+        log_info("%s: %s", __func__, str);
+
+        {
+                _cleanup_free_ char *name = NULL, *x = NULL;
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, &p, &i, &name) == 0);
+                assert_se(family == f);
+                assert_se(port == p);
+                assert_se(ifindex == i);
+                assert_se(streq_ptr(server_name, name));
+                assert_se(in_addr_port_ifindex_name_to_string(f, &a, p, i, name, &x) >= 0);
+                assert_se(streq(str_repr ?: str, x));
+        }
+
+        /* No port output pointer */
+        if (port > 0)
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, NULL, &i, &fake) == -EINVAL);
+        else {
+                _cleanup_free_ char *name = NULL, *x = NULL;
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, NULL, &i, &name) == 0);
+                assert_se(family == f);
+                assert_se(ifindex == i);
+                assert_se(streq_ptr(server_name, name));
+                assert_se(in_addr_port_ifindex_name_to_string(f, &a, 0, i, name, &x) >= 0);
+                assert_se(streq(str_repr ?: str, x));
+        }
+
+        /* No ifindex output pointer */
+        if (ifindex > 0)
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, &p, NULL, &fake) == -EINVAL);
+        else {
+                _cleanup_free_ char *name = NULL, *x = NULL;
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, &p, NULL, &name) == 0);
+                assert_se(family == f);
+                assert_se(port == p);
+                assert_se(streq_ptr(server_name, name));
+                assert_se(in_addr_port_ifindex_name_to_string(f, &a, p, 0, name, &x) >= 0);
+                assert_se(streq(str_repr ?: str, x));
+        }
+
+        /* No server name output pointer */
+        if (server_name)
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, &p, &i, NULL) == -EINVAL);
+        else {
+                _cleanup_free_ char *x = NULL;
+                assert_se(in_addr_port_ifindex_name_from_string_auto(str, &f, &a, &p, &i, NULL) == 0);
+                assert_se(family == f);
+                assert_se(port == p);
+                assert_se(ifindex == i);
+                assert_se(in_addr_port_ifindex_name_to_string(f, &a, p, i, NULL, &x) >= 0);
+                assert_se(streq(str_repr ?: str, x));
+        }
+}
+
+/* Input table for the helper above: IPv4 and IPv6 addresses combined with an optional ":port", "%ifindex"
+ * or "%ifname" and "#server-name". The "%lo" inputs expect ifindex 1 and the numeric "%1" form on output. */
+TEST(in_addr_port_ifindex_name_from_string_auto) {
+        test_in_addr_port_ifindex_name_from_string_auto_one("192.168.0.1", AF_INET, 0, 0, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("192.168.0.1#test.com", AF_INET, 0, 0, "test.com", NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("192.168.0.1:53", AF_INET, 53, 0, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("192.168.0.1:53#example.com", AF_INET, 53, 0, "example.com", NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("fe80::18", AF_INET6, 0, 0, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("fe80::18#hoge.com", AF_INET6, 0, 0, "hoge.com", NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("fe80::18%19", AF_INET6, 0, 19, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("fe80::18%lo", AF_INET6, 0, 1, NULL, "fe80::18%1");
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53", AF_INET6, 53, 0, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53%19", AF_INET6, 53, 19, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53%lo", AF_INET6, 53, 1, NULL, "[fe80::18]:53%1");
+        test_in_addr_port_ifindex_name_from_string_auto_one("fe80::18%19#hoge.com", AF_INET6, 0, 19, "hoge.com", NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53#hoge.com", AF_INET6, 53, 0, "hoge.com", NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53%19", AF_INET6, 53, 19, NULL, NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53%19#hoge.com", AF_INET6, 53, 19, "hoge.com", NULL);
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53%lo", AF_INET6, 53, 1, NULL, "[fe80::18]:53%1");
+        test_in_addr_port_ifindex_name_from_string_auto_one("[fe80::18]:53%lo#hoge.com", AF_INET6, 53, 1, "hoge.com", "[fe80::18]:53%1#hoge.com");
+}
+
+DEFINE_TEST_MAIN(LOG_DEBUG);
diff --git a/src/test/test-socket-util.c b/src/test/test-socket-util.c
new file mode 100644
index 0000000..e9c776a
--- /dev/null
+++ b/src/test/test-socket-util.c
@@ -0,0 +1,593 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "alloc-util.h"
+#include "async.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "in-addr-util.h"
+#include "iovec-util.h"
+#include "log.h"
+#include "macro.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "rm-rf.h"
+#include "socket-util.h"
+#include "string-util.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+
+assert_cc(SUN_PATH_LEN == 108);
+
+/* Accept/reject table for ifname_valid(), and for ifname_valid_full() with no flags, with
+ * IFNAME_VALID_ALTERNATIVE and with IFNAME_VALID_ALTERNATIVE|IFNAME_VALID_NUMERIC. */
+TEST(ifname_valid) {
+        assert_se( ifname_valid("foo"));
+        assert_se( ifname_valid("eth0"));
+
+        assert_se(!ifname_valid("0"));
+        assert_se(!ifname_valid("99"));
+        assert_se( ifname_valid("a99"));
+        assert_se( ifname_valid("99a"));
+
+        assert_se(!ifname_valid(NULL));
+        assert_se(!ifname_valid(""));
+        assert_se(!ifname_valid(" "));
+        assert_se(!ifname_valid(" foo"));
+        assert_se(!ifname_valid("bar\n"));
+        assert_se(!ifname_valid("."));
+        assert_se(!ifname_valid(".."));
+        assert_se(ifname_valid("foo.bar"));
+        assert_se(!ifname_valid("x:y"));
+
+        assert_se( ifname_valid_full("xxxxxxxxxxxxxxx", 0));
+        assert_se(!ifname_valid_full("xxxxxxxxxxxxxxxx", 0));
+        assert_se( ifname_valid_full("xxxxxxxxxxxxxxxx", IFNAME_VALID_ALTERNATIVE));
+        assert_se( ifname_valid_full("xxxxxxxxxxxxxxxx", IFNAME_VALID_ALTERNATIVE));
+        assert_se(!ifname_valid_full("999", IFNAME_VALID_ALTERNATIVE));
+        assert_se( ifname_valid_full("999", IFNAME_VALID_ALTERNATIVE | IFNAME_VALID_NUMERIC));
+        assert_se(!ifname_valid_full("0", IFNAME_VALID_ALTERNATIVE | IFNAME_VALID_NUMERIC));
+}
+
+/* Builds an AF_UNIX/SOCK_STREAM SocketAddress whose sun_path holds the first 'len_in' bytes of 'in' and
+ * checks that socket_address_print() renders it as 'expected'. 'len_in' must not exceed SUN_PATH_LEN. */
+static void test_socket_print_unix_one(const char *in, size_t len_in, const char *expected) {
+        _cleanup_free_ char *out = NULL, *c = NULL;
+
+        assert_se(len_in <= SUN_PATH_LEN);
+        SocketAddress a = { .sockaddr = { .un = { .sun_family = AF_UNIX } },
+                            .size = offsetof(struct sockaddr_un, sun_path) + len_in,
+                            .type = SOCK_STREAM,
+        };
+        memcpy(a.sockaddr.un.sun_path, in, len_in);
+
+        assert_se(socket_address_print(&a, &out) >= 0);
+        assert_se(c = cescape(in));
+        /* Log the escaped copy, not the raw input: the inputs passed by TEST(socket_print_unix) contain
+         * control characters such as "\n" and "\a". */
+        log_info("\"%s\" → \"%s\" (expect \"%s\")", c, out, expected);
+        assert_se(streq(out, expected));
+}
+
+TEST(socket_print_unix) {
+        /* Some additional tests for abstract addresses which we don't parse */
+
+        test_socket_print_unix_one("\0\0\0\0", 4, "@\\000\\000\\000");
+        test_socket_print_unix_one("@abs", 5, "@abs");
+        test_socket_print_unix_one("\n", 2, "\\n");
+        test_socket_print_unix_one("", 1, "");
+        test_socket_print_unix_one("\0", 1, "");
+        test_socket_print_unix_one("\0_________________________there's 108 characters in this string_____________________________________________", 108,
+                                   "@_________________________there\\'s 108 characters in this string_____________________________________________");
+        test_socket_print_unix_one("////////////////////////////////////////////////////////////////////////////////////////////////////////////", 108,
+                                   "////////////////////////////////////////////////////////////////////////////////////////////////////////////");
+        test_socket_print_unix_one("\0\a\b\n\255", 6, "@\\a\\b\\n\\255\\000");
+}
+
+/* sockaddr_equal() must hold for identical AF_INET, AF_INET6 and AF_VSOCK addresses, and must not hold
+ * when the IPv4 address differs (a/b vs. c) or the address family differs (a vs. e). */
+TEST(sockaddr_equal) {
+        union sockaddr_union a = {
+                .in.sin_family = AF_INET,
+                .in.sin_port = 0,
+                .in.sin_addr.s_addr = htobe32(INADDR_ANY),
+        };
+        union sockaddr_union b = {
+                .in.sin_family = AF_INET,
+                .in.sin_port = 0,
+                .in.sin_addr.s_addr = htobe32(INADDR_ANY),
+        };
+        union sockaddr_union c = {
+                .in.sin_family = AF_INET,
+                .in.sin_port = 0,
+                .in.sin_addr.s_addr = htobe32(1234),
+        };
+        union sockaddr_union d = {
+                .in6.sin6_family = AF_INET6,
+                .in6.sin6_port = 0,
+                .in6.sin6_addr = IN6ADDR_ANY_INIT,
+        };
+        union sockaddr_union e = {
+                .vm.svm_family = AF_VSOCK,
+                .vm.svm_port = 0,
+                .vm.svm_cid = VMADDR_CID_ANY,
+        };
+
+        assert_se(sockaddr_equal(&a, &a));
+        assert_se(sockaddr_equal(&a, &b));
+        assert_se(sockaddr_equal(&d, &d));
+        assert_se(sockaddr_equal(&e, &e));
+        assert_se(!sockaddr_equal(&a, &c));
+        assert_se(!sockaddr_equal(&b, &c));
+        assert_se(!sockaddr_equal(&a, &e));
+}
+
+/* SOCKADDR_UN_LEN() is expected to cover the path plus its trailing NUL for a file system socket, and the
+ * leading NUL plus the name (without a trailing NUL) for an abstract socket. */
+TEST(sockaddr_un_len) {
+        static const struct sockaddr_un fs = {
+                .sun_family = AF_UNIX,
+                .sun_path = "/foo/bar/waldo",
+        };
+
+        static const struct sockaddr_un abstract = {
+                .sun_family = AF_UNIX,
+                .sun_path = "\0foobar",
+        };
+
+        assert_se(SOCKADDR_UN_LEN(fs) == offsetof(struct sockaddr_un, sun_path) + strlen(fs.sun_path) + 1);
+        assert_se(SOCKADDR_UN_LEN(abstract) == offsetof(struct sockaddr_un, sun_path) + 1 + strlen(abstract.sun_path + 1));
+}
+
+/* in_addr_is_multicast() is expected to return 1 for 224.0.0.1 and FF01::1 and 0 for the two other
+ * addresses parsed here. */
+TEST(in_addr_is_multicast) {
+        union in_addr_union a, b;
+        int f;
+
+        assert_se(in_addr_from_string_auto("192.168.3.11", &f, &a) >= 0);
+        assert_se(in_addr_is_multicast(f, &a) == 0);
+
+        assert_se(in_addr_from_string_auto("224.0.0.1", &f, &a) >= 0);
+        assert_se(in_addr_is_multicast(f, &a) == 1);
+
+        assert_se(in_addr_from_string_auto("FF01:0:0:0:0:0:0:1", &f, &b) >= 0);
+        assert_se(in_addr_is_multicast(f, &b) == 1);
+
+        assert_se(in_addr_from_string_auto("2001:db8::c:69b:aeff:fe53:743e", &f, &b) >= 0);
+        assert_se(in_addr_is_multicast(f, &b) == 0);
+}
+
+/* Runs in a forked child. When the effective UID is 0 the child switches to UID 1, GID 2 and supplementary
+ * groups 3…7; otherwise it records its current UID, GID and groups. It then checks that getpeercred() and
+ * getpeergroups() on one end of a socketpair report exactly these. getpeergroups() failing with
+ * -EOPNOTSUPP or -ENOPROTOOPT is tolerated. */
+TEST(getpeercred_getpeergroups) {
+        int r;
+
+        r = safe_fork("(getpeercred)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                static const gid_t gids[] = { 3, 4, 5, 6, 7 };
+                gid_t *test_gids;
+                size_t n_test_gids;
+                uid_t test_uid;
+                gid_t test_gid;
+                struct ucred ucred;
+                int pair[2];
+
+                if (geteuid() == 0) {
+                        test_uid = 1;
+                        test_gid = 2;
+                        test_gids = (gid_t*) gids;
+                        n_test_gids = ELEMENTSOF(gids);
+
+                        assert_se(setgroups(n_test_gids, test_gids) >= 0);
+                        assert_se(setresgid(test_gid, test_gid, test_gid) >= 0);
+                        assert_se(setresuid(test_uid, test_uid, test_uid) >= 0);
+
+                } else {
+                        long ngroups_max;
+
+                        test_uid = getuid();
+                        test_gid = getgid();
+
+                        ngroups_max = sysconf(_SC_NGROUPS_MAX);
+                        assert_se(ngroups_max > 0);
+
+                        test_gids = newa(gid_t, ngroups_max);
+
+                        r = getgroups(ngroups_max, test_gids);
+                        assert_se(r >= 0);
+                        n_test_gids = (size_t) r;
+                }
+
+                assert_se(socketpair(AF_UNIX, SOCK_STREAM, 0, pair) >= 0);
+
+                assert_se(getpeercred(pair[0], &ucred) >= 0);
+
+                assert_se(ucred.uid == test_uid);
+                assert_se(ucred.gid == test_gid);
+                assert_se(ucred.pid == getpid_cached());
+
+                {
+                        _cleanup_free_ gid_t *peer_groups = NULL;
+
+                        r = getpeergroups(pair[0], &peer_groups);
+                        assert_se(r >= 0 || IN_SET(r, -EOPNOTSUPP, -ENOPROTOOPT));
+
+                        if (r >= 0) {
+                                assert_se((size_t) r == n_test_gids);
+                                assert_se(memcmp(peer_groups, test_gids, sizeof(gid_t) * n_test_gids) == 0);
+                        }
+                }
+
+                safe_close_pair(pair);
+                _exit(EXIT_SUCCESS);
+        }
+}
+
+/* The child writes a temporary file, unlinks it and sends its fd with send_one_fd(). The parent receives
+ * the fd with receive_one_fd_iov() and checks that the file contents can be read back through it. */
+TEST(passfd_read) {
+        static const char file_contents[] = "test contents for passfd";
+        _cleanup_close_pair_ int pair[2];
+        int r;
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) >= 0);
+
+        r = safe_fork("(passfd_read)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                /* Child */
+                pair[0] = safe_close(pair[0]);
+
+                char tmpfile[] = "/tmp/test-socket-util-passfd-read-XXXXXX";
+                assert_se(write_tmpfile(tmpfile, file_contents) == 0);
+
+                _cleanup_close_ int tmpfd = open(tmpfile, O_RDONLY);
+                assert_se(tmpfd >= 0);
+                assert_se(unlink(tmpfile) == 0);
+
+                assert_se(send_one_fd(pair[1], tmpfd, MSG_DONTWAIT) == 0);
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        char buf[64];
+        struct iovec iov = IOVEC_MAKE(buf, sizeof(buf)-1);
+        _cleanup_close_ int fd = -EBADF; /* never leave a cleanup variable uninitialized */
+
+        pair[1] = safe_close(pair[1]);
+
+        assert_se(receive_one_fd_iov(pair[0], &iov, 1, MSG_DONTWAIT, &fd) == 0);
+
+        assert_se(fd >= 0);
+        r = read(fd, buf, sizeof(buf)-1);
+        assert_se(r >= 0);
+        buf[r] = 0;
+        assert_se(streq(buf, file_contents));
+}
+
+/* Like passfd_read, but the fd travels together with a payload: the parent checks both the received bytes
+ * ('wire_contents') and the contents of the file behind the received fd ('file_contents'). */
+TEST(passfd_contents_read) {
+        _cleanup_close_pair_ int pair[2];
+        static const char file_contents[] = "test contents in the file";
+        static const char wire_contents[] = "test contents on the wire";
+        int r;
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) >= 0);
+
+        r = safe_fork("(passfd_contents_read)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                /* Child */
+                struct iovec iov = IOVEC_MAKE_STRING(wire_contents);
+                char tmpfile[] = "/tmp/test-socket-util-passfd-contents-read-XXXXXX";
+
+                pair[0] = safe_close(pair[0]);
+
+                assert_se(write_tmpfile(tmpfile, file_contents) == 0);
+
+                _cleanup_close_ int tmpfd = open(tmpfile, O_RDONLY);
+                assert_se(tmpfd >= 0);
+                assert_se(unlink(tmpfile) == 0);
+
+                assert_se(send_one_fd_iov(pair[1], tmpfd, &iov, 1, MSG_DONTWAIT) > 0);
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        char buf[64];
+        struct iovec iov = IOVEC_MAKE(buf, sizeof(buf)-1);
+        _cleanup_close_ int fd = -EBADF; /* never leave a cleanup variable uninitialized */
+        ssize_t k;
+
+        pair[1] = safe_close(pair[1]);
+
+        k = receive_one_fd_iov(pair[0], &iov, 1, MSG_DONTWAIT, &fd);
+        assert_se(k > 0);
+        buf[k] = 0;
+        assert_se(streq(buf, wire_contents));
+
+        assert_se(fd >= 0);
+        r = read(fd, buf, sizeof(buf)-1);
+        assert_se(r >= 0);
+        buf[r] = 0;
+        assert_se(streq(buf, file_contents));
+}
+
+/* Same idea with three fds in one message: the child sends them with send_many_fds_iov(), the parent
+ * receives them with receive_many_fds_iov() and checks the payload, the number of fds and, in order, the
+ * contents of each file. */
+TEST(pass_many_fds_contents_read) {
+        _cleanup_close_pair_ int pair[2];
+        static const char file_contents[][STRLEN("test contents in the fileX") + 1] = {
+                "test contents in the file0",
+                "test contents in the file1",
+                "test contents in the file2"
+        };
+        static const char wire_contents[] = "test contents on the wire";
+        int r;
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) >= 0);
+
+        /* Name the child after this test, so that it can be told apart from the passfd_contents_read one */
+        r = safe_fork("(pass_many_fds_contents_read)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                /* Child */
+                struct iovec iov = IOVEC_MAKE_STRING(wire_contents);
+                char tmpfile[][STRLEN("/tmp/test-socket-util-passfd-contents-read-XXXXXX") + 1] = {
+                        "/tmp/test-socket-util-passfd-contents-read-XXXXXX",
+                        "/tmp/test-socket-util-passfd-contents-read-XXXXXX",
+                        "/tmp/test-socket-util-passfd-contents-read-XXXXXX"
+                };
+                int tmpfds[3] = EBADF_TRIPLET;
+
+                pair[0] = safe_close(pair[0]);
+
+                for (size_t i = 0; i < 3; ++i) {
+                        assert_se(write_tmpfile(tmpfile[i], file_contents[i]) == 0);
+                        tmpfds[i] = open(tmpfile[i], O_RDONLY);
+                        assert_se(tmpfds[i] >= 0);
+                        assert_se(unlink(tmpfile[i]) == 0);
+                }
+
+                assert_se(send_many_fds_iov(pair[1], tmpfds, 3, &iov, 1, MSG_DONTWAIT) > 0);
+                close_many(tmpfds, 3);
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        char buf[64];
+        struct iovec iov = IOVEC_MAKE(buf, sizeof(buf)-1);
+        _cleanup_free_ int *fds = NULL;
+        size_t n_fds = 0;
+        ssize_t k;
+
+        pair[1] = safe_close(pair[1]);
+
+        k = receive_many_fds_iov(pair[0], &iov, 1, &fds, &n_fds, MSG_DONTWAIT);
+        assert_se(k > 0);
+        buf[k] = 0;
+        assert_se(streq(buf, wire_contents));
+
+        assert_se(n_fds == 3);
+
+        for (size_t i = 0; i < 3; ++i) {
+                assert_se(fds[i] >= 0);
+                r = read(fds[i], buf, sizeof(buf)-1);
+                assert_se(r >= 0);
+                buf[r] = 0;
+                assert_se(streq(buf, file_contents[i]));
+                safe_close(fds[i]);
+        }
+}
+
+/* The child sends a payload but no fd (-1). The parent must get the payload, and its 'fd' variable,
+ * preset to -999, must come back as -EBADF. */
+TEST(receive_nopassfd) {
+        _cleanup_close_pair_ int pair[2];
+        static const char wire_contents[] = "no fd passed here";
+        int r;
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) >= 0);
+
+        r = safe_fork("(receive_nopassfd)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                /* Child */
+                struct iovec iov = IOVEC_MAKE_STRING(wire_contents);
+
+                pair[0] = safe_close(pair[0]);
+
+                assert_se(send_one_fd_iov(pair[1], -1, &iov, 1, MSG_DONTWAIT) > 0);
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        char buf[64];
+        struct iovec iov = IOVEC_MAKE(buf, sizeof(buf)-1);
+        int fd = -999;
+        ssize_t k;
+
+        pair[1] = safe_close(pair[1]);
+
+        k = receive_one_fd_iov(pair[0], &iov, 1, MSG_DONTWAIT, &fd);
+        assert_se(k > 0);
+        buf[k] = 0;
+        assert_se(streq(buf, wire_contents));
+
+        /* no fd passed here, confirm it was reset */
+        assert_se(fd == -EBADF);
+}
+
+/* Sending neither an fd nor an iovec must be refused with -EINVAL in the child; the parent then finds
+ * nothing to receive (-EAGAIN) and its 'fd' variable must keep the preset value -999. */
+TEST(send_nodata_nofd) {
+        _cleanup_close_pair_ int pair[2];
+        int r;
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) >= 0);
+
+        r = safe_fork("(send_nodata_nofd)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                /* Child */
+                pair[0] = safe_close(pair[0]);
+
+                assert_se(send_one_fd_iov(pair[1], -1, NULL, 0, MSG_DONTWAIT) == -EINVAL);
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        char buf[64];
+        struct iovec iov = IOVEC_MAKE(buf, sizeof(buf)-1);
+        int fd = -999;
+        ssize_t k;
+
+        pair[1] = safe_close(pair[1]);
+
+        k = receive_one_fd_iov(pair[0], &iov, 1, MSG_DONTWAIT, &fd);
+        /* recvmsg() will return errno EAGAIN if nothing was sent */
+        assert_se(k == -EAGAIN);
+
+        /* receive_one_fd_iov returned error, so confirm &fd wasn't touched */
+        assert_se(fd == -999);
+}
+
+/* Sending a zero-length iovec without an fd succeeds and returns 0 in the child; the parent's receive
+ * fails with -EIO and its 'fd' variable must keep the preset value -999. */
+TEST(send_emptydata) {
+        _cleanup_close_pair_ int pair[2];
+        int r;
+
+        assert_se(socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) >= 0);
+
+        r = safe_fork("(send_emptydata)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT, NULL);
+        assert_se(r >= 0);
+
+        if (r == 0) {
+                /* Child */
+                struct iovec iov = IOVEC_MAKE_STRING(""); /* zero-length iov */
+                assert_se(iov.iov_len == 0);
+
+                pair[0] = safe_close(pair[0]);
+
+                /* This will succeed, since iov is set. */
+                assert_se(send_one_fd_iov(pair[1], -1, &iov, 1, MSG_DONTWAIT) == 0);
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Parent */
+        char buf[64];
+        struct iovec iov = IOVEC_MAKE(buf, sizeof(buf)-1);
+        int fd = -999;
+        ssize_t k;
+
+        pair[1] = safe_close(pair[1]);
+
+        k = receive_one_fd_iov(pair[0], &iov, 1, MSG_DONTWAIT, &fd);
+        /* receive_one_fd_iov() returns -EIO if an fd is not found and no data was returned. */
+        assert_se(k == -EIO);
+
+        /* receive_one_fd_iov returned error, so confirm &fd wasn't touched */
+        assert_se(fd == -999);
+}
+
+TEST(flush_accept) {
+        _cleanup_close_ int listen_stream, listen_dgram, listen_seqpacket, connect_stream, connect_dgram, connect_seqpacket;
+        static const union sockaddr_union sa = { .un.sun_family = AF_UNIX };
+        union sockaddr_union lsa;
+        socklen_t l;
+
+        listen_stream = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        assert_se(listen_stream >= 0);
+
+        listen_dgram = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        assert_se(listen_dgram >= 0);
+
+        listen_seqpacket = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        assert_se(listen_seqpacket >= 0);
+
+        assert_se(flush_accept(listen_stream) < 0);
+        assert_se(flush_accept(listen_dgram) < 0);
+        assert_se(flush_accept(listen_seqpacket) < 0);
+
+        assert_se(bind(listen_stream, &sa.sa, sizeof(sa_family_t)) >= 0);
+        assert_se(bind(listen_dgram, &sa.sa, sizeof(sa_family_t)) >= 0);
+        assert_se(bind(listen_seqpacket, &sa.sa, sizeof(sa_family_t)) >= 0);
+
+        assert_se(flush_accept(listen_stream) < 0);
+        assert_se(flush_accept(listen_dgram) < 0);
+        assert_se(flush_accept(listen_seqpacket) < 0);
+
+        assert_se(listen(listen_stream, SOMAXCONN_DELUXE) >= 0);
+        assert_se(listen(listen_dgram, SOMAXCONN_DELUXE) < 0);
+        assert_se(listen(listen_seqpacket, SOMAXCONN_DELUXE) >= 0);
+
+        assert_se(flush_accept(listen_stream) >= 0);
+        assert_se(flush_accept(listen_dgram) < 0);
+        assert_se(flush_accept(listen_seqpacket) >= 0);
+
+        connect_stream = socket(AF_UNIX,
SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + assert_se(connect_stream >= 0); + + connect_dgram = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + assert_se(connect_dgram >= 0); + + connect_seqpacket = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + assert_se(connect_seqpacket >= 0); + + l = sizeof(lsa); + assert_se(getsockname(listen_stream, &lsa.sa, &l) >= 0); + assert_se(connect(connect_stream, &lsa.sa, l) >= 0); + + l = sizeof(lsa); + assert_se(getsockname(listen_dgram, &lsa.sa, &l) >= 0); + assert_se(connect(connect_dgram, &lsa.sa, l) >= 0); + + l = sizeof(lsa); + assert_se(getsockname(listen_seqpacket, &lsa.sa, &l) >= 0); + assert_se(connect(connect_seqpacket, &lsa.sa, l) >= 0); + + assert_se(flush_accept(listen_stream) >= 0); + assert_se(flush_accept(listen_dgram) < 0); + assert_se(flush_accept(listen_seqpacket) >= 0); +} + +TEST(ipv6_enabled) { + log_info("IPv6 supported: %s", yes_no(socket_ipv6_is_supported())); + log_info("IPv6 enabled: %s", yes_no(socket_ipv6_is_enabled())); +} + +TEST(sockaddr_un_set_path) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_(unlink_and_freep) char *sh = NULL; + _cleanup_free_ char *j = NULL; + union sockaddr_union sa; + _cleanup_close_ int fd1, fd2, fd3; + + assert_se(mkdtemp_malloc("/tmp/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaXXXXXX", &t) >= 0); + assert_se(strlen(t) > SUN_PATH_LEN); + + assert_se(j = path_join(t, "sock")); + assert_se(sockaddr_un_set_path(&sa.un, j) == -ENAMETOOLONG); /* too long for AF_UNIX socket */ + + assert_se(asprintf(&sh, "/tmp/%" PRIx64, random_u64()) >= 0); + assert_se(symlink(t, sh) >= 0); /* create temporary symlink, to access it anyway */ + + free(j); + assert_se(j = path_join(sh, "sock")); + assert_se(sockaddr_un_set_path(&sa.un, j) >= 0); + + fd1 = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(fd1 >= 0); + assert_se(bind(fd1, &sa.sa, SOCKADDR_LEN(sa)) >= 0); + 
assert_se(listen(fd1, 1) >= 0); + + sh = unlink_and_free(sh); /* remove temporary symlink */ + + fd2 = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(fd2 >= 0); + assert_se(connect(fd2, &sa.sa, SOCKADDR_LEN(sa)) < 0); + assert_se(errno == ENOENT); /* we removed the symlink, must fail */ + + free(j); + assert_se(j = path_join(t, "sock")); + + fd3 = open(j, O_CLOEXEC|O_PATH|O_NOFOLLOW); + assert_se(fd3 >= 0); /* 0 is a valid descriptor; only negative values signal failure */ + assert_se(sockaddr_un_set_path(&sa.un, FORMAT_PROC_FD_PATH(fd3)) >= 0); /* connect via O_PATH instead, circumventing 108ch limit */ + + assert_se(connect(fd2, &sa.sa, SOCKADDR_LEN(sa)) >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-specifier.c b/src/test/test-specifier.c new file mode 100644 index 0000000..d6a8b79 --- /dev/null +++ b/src/test/test-specifier.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "log.h" +#include "specifier.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "unit-file.h" + +static void test_specifier_escape_one(const char *a, const char *b) { + _cleanup_free_ char *x = NULL; + + x = specifier_escape(a); + assert_se(streq_ptr(x, b)); +} + +TEST(specifier_escape) { + test_specifier_escape_one(NULL, NULL); + test_specifier_escape_one("", ""); + test_specifier_escape_one("%", "%%"); + test_specifier_escape_one("foo bar", "foo bar"); + test_specifier_escape_one("foo%bar", "foo%%bar"); + test_specifier_escape_one("%%%%%", "%%%%%%%%%%"); +} + +static void test_specifier_escape_strv_one(char **a, char **b) { + _cleanup_strv_free_ char **x = NULL; + + assert_se(specifier_escape_strv(a, &x) >= 0); + assert_se(strv_equal(x, b)); +} + +TEST(specifier_escape_strv) { + test_specifier_escape_strv_one(NULL, NULL); + test_specifier_escape_strv_one(STRV_MAKE(NULL), STRV_MAKE(NULL)); + test_specifier_escape_strv_one(STRV_MAKE(""), STRV_MAKE("")); +
test_specifier_escape_strv_one(STRV_MAKE("foo"), STRV_MAKE("foo")); + test_specifier_escape_strv_one(STRV_MAKE("%"), STRV_MAKE("%%")); + test_specifier_escape_strv_one(STRV_MAKE("foo", "%", "foo%", "%foo", "foo%foo", "quux", "%%%"), + STRV_MAKE("foo", "%%", "foo%%", "%%foo", "foo%%foo", "quux", "%%%%%%")); +} + +/* Any specifier functions which don't need an argument. */ +static const Specifier specifier_table[] = { + COMMON_SYSTEM_SPECIFIERS, + + COMMON_CREDS_SPECIFIERS(RUNTIME_SCOPE_USER), + { 'h', specifier_user_home, NULL }, + + COMMON_TMP_SPECIFIERS, + {} +}; + +TEST(specifier_printf) { + static const Specifier table[] = { + { 'X', specifier_string, (char*) "AAAA" }, + { 'Y', specifier_string, (char*) "BBBB" }, + { 'e', specifier_string, NULL }, + COMMON_SYSTEM_SPECIFIERS, + {} + }; + + _cleanup_free_ char *w = NULL; + int r; + + r = specifier_printf("xxx a=%X b=%Y e=%e yyy", SIZE_MAX, table, NULL, NULL, &w); + assert_se(r >= 0); + assert_se(w); + + puts(w); + assert_se(streq(w, "xxx a=AAAA b=BBBB e= yyy")); + + w = mfree(w); /* reset the pointer too, so the _cleanup_free_ variable never dangles */ + r = specifier_printf("boot=%b, host=%H, pretty=%q, version=%v, arch=%a, empty=%e", SIZE_MAX, table, NULL, NULL, &w); + assert_se(r >= 0); + assert_se(w); + puts(w); + + w = mfree(w); + specifier_printf("os=%o, os-version=%w, build=%B, variant=%W, empty=%e%e%e", SIZE_MAX, table, NULL, NULL, &w); + if (w) + puts(w); +} + +TEST(specifier_real_path) { + static const Specifier table[] = { + { 'p', specifier_string, "/dev/initctl" }, + { 'y', specifier_real_path, "/dev/initctl" }, + { 'Y', specifier_real_directory, "/dev/initctl" }, + { 'w', specifier_real_path, "/dev/tty" }, + { 'W', specifier_real_directory, "/dev/tty" }, + {} + }; + + _cleanup_free_ char *w = NULL; + int r; + + r = specifier_printf("p=%p y=%y Y=%Y w=%w W=%W", SIZE_MAX, table, NULL, NULL, &w); + assert_se(r >= 0 || r == -ENOENT); + assert_se(w || r == -ENOENT); + puts(strnull(w)); + + /* /dev/initctl should normally be a symlink to /run/initctl */ + if
(inode_same("/dev/initctl", "/run/initctl", 0) > 0) + assert_se(streq(w, "p=/dev/initctl y=/run/initctl Y=/run w=/dev/tty W=/dev")); +} + +TEST(specifier_real_path_missing_file) { + static const Specifier table[] = { + { 'p', specifier_string, "/dev/-no-such-file--" }, + { 'y', specifier_real_path, "/dev/-no-such-file--" }, + { 'Y', specifier_real_directory, "/dev/-no-such-file--" }, + {} + }; + + _cleanup_free_ char *w = NULL; + int r; + + r = specifier_printf("p=%p y=%y", SIZE_MAX, table, NULL, NULL, &w); + assert_se(r == -ENOENT); + + r = specifier_printf("p=%p Y=%Y", SIZE_MAX, table, NULL, NULL, &w); + assert_se(r == -ENOENT); +} + +TEST(specifiers) { + int r; + + for (const Specifier *s = specifier_table; s->specifier; s++) { + char spec[3]; + _cleanup_free_ char *resolved = NULL; + + xsprintf(spec, "%%%c", s->specifier); + + r = specifier_printf(spec, SIZE_MAX, specifier_table, NULL, NULL, &resolved); + if (s->specifier == 'm' && IN_SET(r, -EUNATCH, -ENOMEDIUM, -ENOPKG)) /* machine-id might be missing in build chroots */ + continue; + assert_se(r >= 0); + + log_info("%%%c → %s", s->specifier, resolved); + } +} + +/* Bunch of specifiers that are not part of the common lists */ +TEST(specifiers_assorted) { + const sd_id128_t id = SD_ID128_ALLF; + const uint64_t llu = UINT64_MAX; + const Specifier table[] = { + /* Used in src/partition/repart.c */ + { 'a', specifier_uuid, &id }, + { 'b', specifier_uint64, &llu }, + {} + }; + + for (const Specifier *s = table; s->specifier; s++) { + char spec[3]; + _cleanup_free_ char *resolved = NULL; + int r; + + xsprintf(spec, "%%%c", s->specifier); + + r = specifier_printf(spec, SIZE_MAX, table, NULL, NULL, &resolved); + assert_se(r >= 0); + + log_info("%%%c → %s", s->specifier, resolved); + } +} + +TEST(specifiers_missing_data_ok) { + _cleanup_free_ char *resolved = NULL; + + assert_se(setenv("SYSTEMD_OS_RELEASE", "/dev/null", 1) == 0); + assert_se(specifier_printf("%A-%B-%M-%o-%w-%W", SIZE_MAX, specifier_table, NULL, NULL, 
&resolved) >= 0); + assert_se(streq(resolved, "-----")); + + assert_se(setenv("SYSTEMD_OS_RELEASE", "/nosuchfileordirectory", 1) == 0); + assert_se(specifier_printf("%A-%B-%M-%o-%w-%W", SIZE_MAX, specifier_table, NULL, NULL, &resolved) == -EUNATCH); + assert_se(streq(resolved, "-----")); + + assert_se(unsetenv("SYSTEMD_OS_RELEASE") == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-stat-util.c b/src/test/test-stat-util.c new file mode 100644 index 0000000..5aca207 --- /dev/null +++ b/src/test/test-stat-util.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "errno-list.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "mountpoint-util.h" +#include "namespace-util.h" +#include "path-util.h" +#include "rm-rf.h" +#include "stat-util.h" +#include "tests.h" +#include "tmpfile-util.h" + +TEST(null_or_empty_path) { + assert_se(null_or_empty_path("/dev/null") == 1); + assert_se(null_or_empty_path("/dev/tty") == 1); /* We assume that any character device is "empty", bleh. 
*/ + assert_se(null_or_empty_path("../../../../../../../../../../../../../../../../../../../../dev/null") == 1); + assert_se(null_or_empty_path("/proc/self/exe") == 0); + assert_se(null_or_empty_path("/nosuchfileordir") == -ENOENT); +} + +TEST(null_or_empty_path_with_root) { + assert_se(null_or_empty_path_with_root("/dev/null", NULL) == 1); + assert_se(null_or_empty_path_with_root("/dev/null", "/") == 1); + assert_se(null_or_empty_path_with_root("/dev/null", "/.././../") == 1); + assert_se(null_or_empty_path_with_root("/dev/null", "/.././..") == 1); + assert_se(null_or_empty_path_with_root("../../../../../../../../../../../../../../../../../../../../dev/null", NULL) == 1); + assert_se(null_or_empty_path_with_root("../../../../../../../../../../../../../../../../../../../../dev/null", "/") == 1); + assert_se(null_or_empty_path_with_root("/proc/self/exe", NULL) == 0); + assert_se(null_or_empty_path_with_root("/proc/self/exe", "/") == 0); + assert_se(null_or_empty_path_with_root("/nosuchfileordir", NULL) == -ENOENT); + assert_se(null_or_empty_path_with_root("/nosuchfileordir", "/.././../") == -ENOENT); + assert_se(null_or_empty_path_with_root("/nosuchfileordir", "/.././..") == -ENOENT); + assert_se(null_or_empty_path_with_root("/foobar/barbar/dev/null", "/foobar/barbar") == 1); + assert_se(null_or_empty_path_with_root("/foobar/barbar/dev/null", "/foobar/barbar/") == 1); +} + +TEST(inode_same) { + _cleanup_close_ int fd = -EBADF; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-files_same.XXXXXX"; + _cleanup_(unlink_tempfilep) char name_alias[] = "/tmp/test-files_same.alias"; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se(symlink(name, name_alias) >= 0); + + assert_se(inode_same(name, name, 0)); + assert_se(inode_same(name, name, AT_SYMLINK_NOFOLLOW)); + assert_se(inode_same(name, name_alias, 0)); + assert_se(!inode_same(name, name_alias, AT_SYMLINK_NOFOLLOW)); +} + +TEST(is_symlink) { + _cleanup_(unlink_tempfilep) char name[] = 
"/tmp/test-is_symlink.XXXXXX"; + _cleanup_(unlink_tempfilep) char name_link[] = "/tmp/test-is_symlink.link"; + _cleanup_close_ int fd = -EBADF; + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se(symlink(name, name_link) >= 0); + + assert_se(is_symlink(name) == 0); + assert_se(is_symlink(name_link) == 1); + assert_se(is_symlink("/a/file/which/does/not/exist/i/guess") < 0); +} + +TEST(path_is_fs_type) { + /* run might not be a mount point in build chroots */ + if (path_is_mount_point("/run", NULL, AT_SYMLINK_FOLLOW) > 0) { + assert_se(path_is_fs_type("/run", TMPFS_MAGIC) > 0); + assert_se(path_is_fs_type("/run", BTRFS_SUPER_MAGIC) == 0); + } + if (path_is_mount_point("/proc", NULL, AT_SYMLINK_FOLLOW) > 0) { + assert_se(path_is_fs_type("/proc", PROC_SUPER_MAGIC) > 0); + assert_se(path_is_fs_type("/proc", BTRFS_SUPER_MAGIC) == 0); + } + assert_se(path_is_fs_type("/i-dont-exist", BTRFS_SUPER_MAGIC) == -ENOENT); +} + +TEST(path_is_temporary_fs) { + int r; + + FOREACH_STRING(s, "/", "/run", "/sys", "/sys/", "/proc", "/i-dont-exist", "/var", "/var/lib") { + r = path_is_temporary_fs(s); + + log_info_errno(r, "path_is_temporary_fs(\"%s\"): %d, %s", + s, r, r < 0 ? errno_to_name(r) : yes_no(r)); + } + + /* run might not be a mount point in build chroots */ + if (path_is_mount_point("/run", NULL, AT_SYMLINK_FOLLOW) > 0) + assert_se(path_is_temporary_fs("/run") > 0); + assert_se(path_is_temporary_fs("/proc") == 0); + assert_se(path_is_temporary_fs("/i-dont-exist") == -ENOENT); +} + +TEST(path_is_read_only_fs) { + int r; + + FOREACH_STRING(s, "/", "/run", "/sys", "/sys/", "/proc", "/i-dont-exist", "/var", "/var/lib") { + r = path_is_read_only_fs(s); + + log_info_errno(r, "path_is_read_only_fs(\"%s\"): %d, %s", + s, r, r < 0 ? 
errno_to_name(r) : yes_no(r)); + } + + if (path_is_mount_point("/sys", NULL, AT_SYMLINK_FOLLOW) > 0) + assert_se(IN_SET(path_is_read_only_fs("/sys"), 0, 1)); + + assert_se(path_is_read_only_fs("/proc") == 0); + assert_se(path_is_read_only_fs("/i-dont-exist") == -ENOENT); +} + +TEST(fd_is_ns) { + _cleanup_close_ int fd = -EBADF; + + assert_se(fd_is_ns(STDIN_FILENO, CLONE_NEWNET) == 0); + assert_se(fd_is_ns(STDERR_FILENO, CLONE_NEWNET) == 0); + assert_se(fd_is_ns(STDOUT_FILENO, CLONE_NEWNET) == 0); + + fd = open("/proc/self/ns/mnt", O_CLOEXEC|O_RDONLY); + if (fd < 0) { + assert_se(errno == ENOENT); + log_notice("Path %s not found, skipping test", "/proc/self/ns/mnt"); + return; + } + assert_se(fd >= 0); + assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 0, -EUCLEAN)); + fd = safe_close(fd); + + assert_se((fd = open("/proc/self/ns/ipc", O_CLOEXEC|O_RDONLY)) >= 0); + assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWIPC), 1, -EUCLEAN)); + fd = safe_close(fd); + + assert_se((fd = open("/proc/self/ns/net", O_CLOEXEC|O_RDONLY)) >= 0); + assert_se(IN_SET(fd_is_ns(fd, CLONE_NEWNET), 1, -EUCLEAN)); +} + +TEST(dir_is_empty) { + _cleanup_(rm_rf_physical_and_freep) char *empty_dir = NULL; + _cleanup_free_ char *j = NULL, *jj = NULL, *jjj = NULL; + + assert_se(dir_is_empty_at(AT_FDCWD, "/proc", /* ignore_hidden_or_backup= */ true) == 0); + assert_se(dir_is_empty_at(AT_FDCWD, "/icertainlydontexistdoi", /* ignore_hidden_or_backup= */ true) == -ENOENT); + + assert_se(mkdtemp_malloc("/tmp/emptyXXXXXX", &empty_dir) >= 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ true) > 0); + + j = path_join(empty_dir, "zzz"); + assert_se(j); + assert_se(touch(j) >= 0); + + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ true) == 0); + + jj = path_join(empty_dir, "ppp"); + assert_se(jj); + assert_se(touch(jj) >= 0); + + jjj = path_join(empty_dir, ".qqq"); + assert_se(jjj); + assert_se(touch(jjj) >= 0); + + assert_se(dir_is_empty_at(AT_FDCWD, 
empty_dir, /* ignore_hidden_or_backup= */ true) == 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ false) == 0); + assert_se(unlink(j) >= 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ true) == 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ false) == 0); + assert_se(unlink(jj) >= 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ true) > 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ false) == 0); + assert_se(unlink(jjj) >= 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ true) > 0); + assert_se(dir_is_empty_at(AT_FDCWD, empty_dir, /* ignore_hidden_or_backup= */ false) > 0); +} + +static int intro(void) { + log_show_color(true); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-static-destruct.c b/src/test/test-static-destruct.c new file mode 100644 index 0000000..ef8648f --- /dev/null +++ b/src/test/test-static-destruct.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "static-destruct.h" +#include "strv.h" +#include "tests.h" + +static int foo = 0; +static int bar = 0; +static int baz = 0; +static char *memory = NULL; +static char **strings = NULL; +static size_t n_strings = 0; +static int *integers = NULL; +static size_t n_integers = 0; + +static void test_destroy(int *b) { + (*b)++; +} + +static void test_strings_destroy(char **array, size_t n) { + assert_se(n == 3); + assert_se(strv_equal(array, STRV_MAKE("a", "bbb", "ccc"))); + + strv_free(array); +} + +static void test_integers_destroy(int *array, size_t n) { + assert_se(n == 10); + + for (size_t i = 0; i < n; i++) + assert_se(array[i] == (int)(i * i)); + + free(array); +} + +STATIC_DESTRUCTOR_REGISTER(foo, test_destroy); +STATIC_DESTRUCTOR_REGISTER(bar, test_destroy); 
+STATIC_DESTRUCTOR_REGISTER(bar, test_destroy); +STATIC_DESTRUCTOR_REGISTER(baz, test_destroy); +STATIC_DESTRUCTOR_REGISTER(baz, test_destroy); +STATIC_DESTRUCTOR_REGISTER(baz, test_destroy); +STATIC_DESTRUCTOR_REGISTER(memory, freep); +STATIC_ARRAY_DESTRUCTOR_REGISTER(strings, n_strings, test_strings_destroy); +STATIC_ARRAY_DESTRUCTOR_REGISTER(integers, n_integers, test_integers_destroy); + +TEST(static_destruct) { + assert_se(foo == 0 && bar == 0 && baz == 0); + assert_se(memory = strdup("hallo")); + assert_se(strings = strv_new("a", "bbb", "ccc")); + n_strings = strv_length(strings); + n_integers = 10; + assert_se(integers = new(int, n_integers)); + for (size_t i = 0; i < n_integers; i++) + integers[i] = i * i; + + static_destruct(); + + assert_se(foo == 1 && bar == 2 && baz == 3); + assert_se(!memory); + assert_se(!strings); + assert_se(n_strings == 0); + assert_se(!integers); + assert_se(n_integers == 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-strbuf.c b/src/test/test-strbuf.c new file mode 100644 index 0000000..39a7142 --- /dev/null +++ b/src/test/test-strbuf.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "nulstr-util.h" +#include "strbuf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +static ssize_t add_string(struct strbuf *sb, const char *s) { + return strbuf_add_string(sb, s, strlen(s)); +} + +TEST(strbuf) { + _cleanup_(strbuf_freep) struct strbuf *sb = NULL; + _cleanup_strv_free_ char **l = NULL; + ssize_t a, b, c, d, e, f, g, h; + + assert_se(sb = strbuf_new()); /* sb is dereferenced below; fail here if no buffer was returned */ + + a = add_string(sb, "waldo"); + b = add_string(sb, "foo"); + c = add_string(sb, "bar"); + d = add_string(sb, "waldo"); /* duplicate */ + e = add_string(sb, "aldo"); /* duplicate */ + f = add_string(sb, "do"); /* duplicate */ + g = add_string(sb, "waldorf"); /* not a duplicate: matches from tail */ + h = add_string(sb, ""); + + /* check the content of the buffer directly */ + l = strv_parse_nulstr(sb->buf,
sb->len); + assert_se(l); + + assert_se(streq(l[0], "")); /* root */ + assert_se(streq(l[1], "waldo")); + assert_se(streq(l[2], "foo")); + assert_se(streq(l[3], "bar")); + assert_se(streq(l[4], "waldorf")); + assert_se(l[5] == NULL); + + assert_se(sb->nodes_count == 5); /* root + 4 non-duplicates */ + assert_se(sb->dedup_count == 4); + assert_se(sb->in_count == 8); + + assert_se(sb->in_len == 29); /* length of all strings added */ + assert_se(sb->dedup_len == 11); /* length of all strings duplicated */ + assert_se(sb->len == 23); /* buffer length: in - dedup + \0 for each node */ + + /* check the returned offsets and the respective content in the buffer */ + assert_se(a == 1); + assert_se(b == 7); + assert_se(c == 11); + assert_se(d == 1); + assert_se(e == 2); + assert_se(f == 4); + assert_se(g == 15); + assert_se(h == 0); + + assert_se(streq(sb->buf + a, "waldo")); + assert_se(streq(sb->buf + b, "foo")); + assert_se(streq(sb->buf + c, "bar")); + assert_se(streq(sb->buf + d, "waldo")); + assert_se(streq(sb->buf + e, "aldo")); + assert_se(streq(sb->buf + f, "do")); + assert_se(streq(sb->buf + g, "waldorf")); + assert_se(streq(sb->buf + h, "")); + + strbuf_complete(sb); + assert_se(sb->root == NULL); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-string-util.c b/src/test/test-string-util.c new file mode 100644 index 0000000..a8fd45d --- /dev/null +++ b/src/test/test-string-util.c @@ -0,0 +1,1327 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "locale-util.h" +#include "macro.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "utf8.h" + +TEST(string_erase) { + char *x; + x = strdupa_safe(""); + assert_se(streq(string_erase(x), "")); + + x = strdupa_safe("1"); + assert_se(streq(string_erase(x), "")); + + x = strdupa_safe("123456789"); + assert_se(streq(string_erase(x), "")); + + assert_se(x[1] == '\0'); + assert_se(x[2] == '\0'); + assert_se(x[3] == '\0'); + assert_se(x[4] == 
'\0'); + assert_se(x[5] == '\0'); + assert_se(x[6] == '\0'); + assert_se(x[7] == '\0'); + assert_se(x[8] == '\0'); + assert_se(x[9] == '\0'); +} + +static void test_free_and_strndup_one(char **t, const char *src, size_t l, const char *expected, bool change) { + log_debug("%s: \"%s\", \"%s\", %zu (expect \"%s\", %s)", + __func__, strnull(*t), strnull(src), l, strnull(expected), yes_no(change)); + + int r = free_and_strndup(t, src, l); + assert_se(streq_ptr(*t, expected)); + assert_se(r == change); /* check that change occurs only when necessary */ +} + +TEST(free_and_strndup) { + static const struct test_case { + const char *src; + size_t len; + const char *expected; + } cases[] = { + {"abc", 0, ""}, + {"abc", 0, ""}, + {"abc", 1, "a"}, + {"abc", 2, "ab"}, + {"abc", 3, "abc"}, + {"abc", 4, "abc"}, + {"abc", 5, "abc"}, + {"abc", 5, "abc"}, + {"abc", 4, "abc"}, + {"abc", 3, "abc"}, + {"abc", 2, "ab"}, + {"abc", 1, "a"}, + {"abc", 0, ""}, + + {"", 0, ""}, + {"", 1, ""}, + {"", 2, ""}, + {"", 0, ""}, + {"", 1, ""}, + {"", 2, ""}, + {"", 2, ""}, + {"", 1, ""}, + {"", 0, ""}, + + {NULL, 0, NULL}, + + {"foo", 3, "foo"}, + {"foobar", 6, "foobar"}, + }; + + _cleanup_free_ char *t = NULL; + const char *prev_expected = t; + + for (unsigned i = 0; i < ELEMENTSOF(cases); i++) { + test_free_and_strndup_one(&t, + cases[i].src, cases[i].len, cases[i].expected, + !streq_ptr(cases[i].expected, prev_expected)); + prev_expected = t; + } +} + +TEST(ascii_strcasecmp_n) { + assert_se(ascii_strcasecmp_n("", "", 0) == 0); + assert_se(ascii_strcasecmp_n("", "", 1) == 0); + assert_se(ascii_strcasecmp_n("", "a", 1) < 0); + assert_se(ascii_strcasecmp_n("", "a", 2) < 0); + assert_se(ascii_strcasecmp_n("a", "", 1) > 0); + assert_se(ascii_strcasecmp_n("a", "", 2) > 0); + assert_se(ascii_strcasecmp_n("a", "a", 1) == 0); + assert_se(ascii_strcasecmp_n("a", "a", 2) == 0); + assert_se(ascii_strcasecmp_n("a", "b", 1) < 0); + assert_se(ascii_strcasecmp_n("a", "b", 2) < 0); + 
assert_se(ascii_strcasecmp_n("b", "a", 1) > 0); + assert_se(ascii_strcasecmp_n("b", "a", 2) > 0); + assert_se(ascii_strcasecmp_n("xxxxyxxxx", "xxxxYxxxx", 9) == 0); + assert_se(ascii_strcasecmp_n("xxxxxxxxx", "xxxxyxxxx", 9) < 0); + assert_se(ascii_strcasecmp_n("xxxxXxxxx", "xxxxyxxxx", 9) < 0); + assert_se(ascii_strcasecmp_n("xxxxxxxxx", "xxxxYxxxx", 9) < 0); + assert_se(ascii_strcasecmp_n("xxxxXxxxx", "xxxxYxxxx", 9) < 0); + + assert_se(ascii_strcasecmp_n("xxxxYxxxx", "xxxxYxxxx", 9) == 0); + assert_se(ascii_strcasecmp_n("xxxxyxxxx", "xxxxxxxxx", 9) > 0); + assert_se(ascii_strcasecmp_n("xxxxyxxxx", "xxxxXxxxx", 9) > 0); + assert_se(ascii_strcasecmp_n("xxxxYxxxx", "xxxxxxxxx", 9) > 0); + assert_se(ascii_strcasecmp_n("xxxxYxxxx", "xxxxXxxxx", 9) > 0); +} + +TEST(ascii_strcasecmp_nn) { + assert_se(ascii_strcasecmp_nn("", 0, "", 0) == 0); + assert_se(ascii_strcasecmp_nn("", 0, "", 1) < 0); + assert_se(ascii_strcasecmp_nn("", 1, "", 0) > 0); + assert_se(ascii_strcasecmp_nn("", 1, "", 1) == 0); + + assert_se(ascii_strcasecmp_nn("aaaa", 4, "aaAa", 4) == 0); + assert_se(ascii_strcasecmp_nn("aaa", 3, "aaAa", 4) < 0); + assert_se(ascii_strcasecmp_nn("aaa", 4, "aaAa", 4) < 0); + assert_se(ascii_strcasecmp_nn("aaaa", 4, "aaA", 3) > 0); + assert_se(ascii_strcasecmp_nn("aaaa", 4, "AAA", 4) > 0); + + assert_se(ascii_strcasecmp_nn("aaaa", 4, "bbbb", 4) < 0); + assert_se(ascii_strcasecmp_nn("aaAA", 4, "BBbb", 4) < 0); + assert_se(ascii_strcasecmp_nn("BBbb", 4, "aaaa", 4) > 0); +} + +TEST(cellescape) { + char buf[40]; + + assert_se(streq(cellescape(buf, 1, ""), "")); + assert_se(streq(cellescape(buf, 1, "1"), "")); + assert_se(streq(cellescape(buf, 1, "12"), "")); + + assert_se(streq(cellescape(buf, 2, ""), "")); + assert_se(streq(cellescape(buf, 2, "1"), "1")); + assert_se(streq(cellescape(buf, 2, "12"), ".")); + assert_se(streq(cellescape(buf, 2, "123"), ".")); + + assert_se(streq(cellescape(buf, 3, ""), "")); + assert_se(streq(cellescape(buf, 3, "1"), "1")); + 
assert_se(streq(cellescape(buf, 3, "12"), "12")); + assert_se(streq(cellescape(buf, 3, "123"), "..")); + assert_se(streq(cellescape(buf, 3, "1234"), "..")); + + assert_se(streq(cellescape(buf, 4, ""), "")); + assert_se(streq(cellescape(buf, 4, "1"), "1")); + assert_se(streq(cellescape(buf, 4, "12"), "12")); + assert_se(streq(cellescape(buf, 4, "123"), "123")); + assert_se(streq(cellescape(buf, 4, "1234"), is_locale_utf8() ? "…" : "...")); + assert_se(streq(cellescape(buf, 4, "12345"), is_locale_utf8() ? "…" : "...")); + + assert_se(streq(cellescape(buf, 5, ""), "")); + assert_se(streq(cellescape(buf, 5, "1"), "1")); + assert_se(streq(cellescape(buf, 5, "12"), "12")); + assert_se(streq(cellescape(buf, 5, "123"), "123")); + assert_se(streq(cellescape(buf, 5, "1234"), "1234")); + assert_se(streq(cellescape(buf, 5, "12345"), is_locale_utf8() ? "1…" : "1...")); + assert_se(streq(cellescape(buf, 5, "123456"), is_locale_utf8() ? "1…" : "1...")); + + assert_se(streq(cellescape(buf, 1, "\020"), "")); + assert_se(streq(cellescape(buf, 2, "\020"), ".")); + assert_se(streq(cellescape(buf, 3, "\020"), "..")); + assert_se(streq(cellescape(buf, 4, "\020"), is_locale_utf8() ? "…" : "...")); + assert_se(streq(cellescape(buf, 5, "\020"), "\\020")); + + assert_se(streq(cellescape(buf, 5, "1234\020"), is_locale_utf8() ? "1…" : "1...")); + assert_se(streq(cellescape(buf, 6, "1234\020"), is_locale_utf8() ? "12…" : "12...")); + assert_se(streq(cellescape(buf, 7, "1234\020"), is_locale_utf8() ? "123…" : "123...")); + assert_se(streq(cellescape(buf, 8, "1234\020"), is_locale_utf8() ? "1234…" : "1234...")); + assert_se(streq(cellescape(buf, 9, "1234\020"), "1234\\020")); + + assert_se(streq(cellescape(buf, 1, "\t\n"), "")); + assert_se(streq(cellescape(buf, 2, "\t\n"), ".")); + assert_se(streq(cellescape(buf, 3, "\t\n"), "..")); + assert_se(streq(cellescape(buf, 4, "\t\n"), is_locale_utf8() ? 
"…" : "...")); + assert_se(streq(cellescape(buf, 5, "\t\n"), "\\t\\n")); + + assert_se(streq(cellescape(buf, 5, "1234\t\n"), is_locale_utf8() ? "1…" : "1...")); + assert_se(streq(cellescape(buf, 6, "1234\t\n"), is_locale_utf8() ? "12…" : "12...")); + assert_se(streq(cellescape(buf, 7, "1234\t\n"), is_locale_utf8() ? "123…" : "123...")); + assert_se(streq(cellescape(buf, 8, "1234\t\n"), is_locale_utf8() ? "1234…" : "1234...")); + assert_se(streq(cellescape(buf, 9, "1234\t\n"), "1234\\t\\n")); + + assert_se(streq(cellescape(buf, 4, "x\t\020\n"), is_locale_utf8() ? "…" : "...")); + assert_se(streq(cellescape(buf, 5, "x\t\020\n"), is_locale_utf8() ? "x…" : "x...")); + assert_se(streq(cellescape(buf, 6, "x\t\020\n"), is_locale_utf8() ? "x…" : "x...")); + assert_se(streq(cellescape(buf, 7, "x\t\020\n"), is_locale_utf8() ? "x\\t…" : "x\\t...")); + assert_se(streq(cellescape(buf, 8, "x\t\020\n"), is_locale_utf8() ? "x\\t…" : "x\\t...")); + assert_se(streq(cellescape(buf, 9, "x\t\020\n"), is_locale_utf8() ? "x\\t…" : "x\\t...")); + assert_se(streq(cellescape(buf, 10, "x\t\020\n"), "x\\t\\020\\n")); + + assert_se(streq(cellescape(buf, 6, "1\011"), "1\\t")); + assert_se(streq(cellescape(buf, 6, "1\020"), "1\\020")); + assert_se(streq(cellescape(buf, 6, "1\020x"), is_locale_utf8() ? "1…" : "1...")); + + assert_se(streq(cellescape(buf, 40, "1\020"), "1\\020")); + assert_se(streq(cellescape(buf, 40, "1\020x"), "1\\020x")); + + assert_se(streq(cellescape(buf, 40, "\a\b\f\n\r\t\v\\\"'"), "\\a\\b\\f\\n\\r\\t\\v\\\\\\\"\\'")); + assert_se(streq(cellescape(buf, 6, "\a\b\f\n\r\t\v\\\"'"), is_locale_utf8() ? "\\a…" : "\\a...")); + assert_se(streq(cellescape(buf, 7, "\a\b\f\n\r\t\v\\\"'"), is_locale_utf8() ? "\\a…" : "\\a...")); + assert_se(streq(cellescape(buf, 8, "\a\b\f\n\r\t\v\\\"'"), is_locale_utf8() ? 
"\\a\\b…" : "\\a\\b...")); + + assert_se(streq(cellescape(buf, sizeof buf, "1\020"), "1\\020")); + assert_se(streq(cellescape(buf, sizeof buf, "1\020x"), "1\\020x")); +} + +TEST(streq_ptr) { + assert_se(streq_ptr(NULL, NULL)); + assert_se(!streq_ptr("abc", "cdef")); +} + +TEST(strstrip) { + char *ret, input[] = " hello, waldo. "; + + ret = strstrip(input); + assert_se(streq(ret, "hello, waldo.")); +} + +TEST(strextend) { + _cleanup_free_ char *str = NULL; + + assert_se(strextend(&str, NULL)); + assert_se(streq_ptr(str, "")); + assert_se(strextend(&str, "", "0", "", "", "123")); + assert_se(streq_ptr(str, "0123")); + assert_se(strextend(&str, "456", "78", "9")); + assert_se(streq_ptr(str, "0123456789")); +} + +TEST(strextend_with_separator) { + _cleanup_free_ char *str = NULL; + + assert_se(strextend_with_separator(&str, NULL, NULL)); + assert_se(streq_ptr(str, "")); + str = mfree(str); + + assert_se(strextend_with_separator(&str, "...", NULL)); + assert_se(streq_ptr(str, "")); + assert_se(strextend_with_separator(&str, "...", NULL)); + assert_se(streq_ptr(str, "")); + str = mfree(str); + + assert_se(strextend_with_separator(&str, "xyz", "a", "bb", "ccc")); + assert_se(streq_ptr(str, "axyzbbxyzccc")); + str = mfree(str); + + assert_se(strextend_with_separator(&str, ",", "start", "", "1", "234")); + assert_se(streq_ptr(str, "start,,1,234")); + assert_se(strextend_with_separator(&str, ";", "more", "5", "678")); + assert_se(streq_ptr(str, "start,,1,234;more;5;678")); +} + +TEST(strrep) { + _cleanup_free_ char *one = NULL, *three = NULL, *zero = NULL; + char *onea, *threea; + + one = strrep("waldo", 1); + three = strrep("waldo", 3); + zero = strrep("waldo", 0); + + assert_se(streq(one, "waldo")); + assert_se(streq(three, "waldowaldowaldo")); + assert_se(streq(zero, "")); + + onea = strrepa("waldo", 1); + threea = strrepa("waldo", 3); + + assert_se(streq(onea, "waldo")); + assert_se(streq(threea, "waldowaldowaldo")); +} + +TEST(string_has_cc) { + 
assert_se(string_has_cc("abc\1", NULL)); + assert_se(string_has_cc("abc\x7f", NULL)); + assert_se(string_has_cc("abc\x7f", NULL)); + assert_se(string_has_cc("abc\t\x7f", "\t")); + assert_se(string_has_cc("abc\t\x7f", "\t")); + assert_se(string_has_cc("\x7f", "\t")); + assert_se(string_has_cc("\x7f", "\t\a")); + + assert_se(!string_has_cc("abc\t\t", "\t")); + assert_se(!string_has_cc("abc\t\t\a", "\t\a")); + assert_se(!string_has_cc("a\ab\tc", "\t\a")); +} + +TEST(ascii_strlower) { + char a[] = "AabBcC Jk Ii Od LKJJJ kkd LK"; + assert_se(streq(ascii_strlower(a), "aabbcc jk ii od lkjjj kkd lk")); +} + +TEST(strshorten) { + char s[] = "foobar"; + + assert_se(strlen(strshorten(s, 6)) == 6); + assert_se(strlen(strshorten(s, 12)) == 6); + assert_se(strlen(strshorten(s, 2)) == 2); + assert_se(strlen(strshorten(s, 0)) == 0); +} + +TEST(strjoina) { + char *actual; + + actual = strjoina("", "foo", "bar"); + assert_se(streq(actual, "foobar")); + + actual = strjoina("foo", "bar", "baz"); + assert_se(streq(actual, "foobarbaz")); + + actual = strjoina("foo", "", "bar", "baz"); + assert_se(streq(actual, "foobarbaz")); + + actual = strjoina("foo"); + assert_se(streq(actual, "foo")); + + actual = strjoina(NULL); + assert_se(streq(actual, "")); + + actual = strjoina(NULL, "foo"); + assert_se(streq(actual, "")); + + actual = strjoina("foo", NULL, "bar"); + assert_se(streq(actual, "foo")); + + actual = strjoina("/sys/fs/cgroup/", "dn", "/a/b/c", "/cgroup.procs"); + assert_se(streq(actual, "/sys/fs/cgroup/dn/a/b/c/cgroup.procs")); + + actual = strjoina("/sys/fs/cgroup/", "dn", NULL, NULL); + assert_se(streq(actual, "/sys/fs/cgroup/dn")); +} + +TEST(strjoin) { + char *actual; + + actual = strjoin("", "foo", "bar"); + assert_se(streq(actual, "foobar")); + mfree(actual); + + actual = strjoin("foo", "bar", "baz"); + assert_se(streq(actual, "foobarbaz")); + mfree(actual); + + actual = strjoin("foo", "", "bar", "baz"); + assert_se(streq(actual, "foobarbaz")); + mfree(actual); + + actual = 
strjoin("foo", NULL); + assert_se(streq(actual, "foo")); + mfree(actual); + + actual = strjoin(NULL, NULL); + assert_se(streq(actual, "")); + mfree(actual); + + actual = strjoin(NULL, "foo"); + assert_se(streq(actual, "")); + mfree(actual); + + actual = strjoin("foo", NULL, "bar"); + assert_se(streq(actual, "foo")); + mfree(actual); +} + +TEST(strcmp_ptr) { + assert_se(strcmp_ptr(NULL, NULL) == 0); + assert_se(strcmp_ptr("", NULL) > 0); + assert_se(strcmp_ptr("foo", NULL) > 0); + assert_se(strcmp_ptr(NULL, "") < 0); + assert_se(strcmp_ptr(NULL, "bar") < 0); + assert_se(strcmp_ptr("foo", "bar") > 0); + assert_se(strcmp_ptr("bar", "baz") < 0); + assert_se(strcmp_ptr("foo", "foo") == 0); + assert_se(strcmp_ptr("", "") == 0); +} + +TEST(foreach_word) { + const char *test = "test abc d\te f "; + const char * const expected[] = { + "test", + "abc", + "d", + "e", + "f", + }; + + size_t i = 0; + int r; + for (const char *p = test;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) { + assert_se(i == ELEMENTSOF(expected)); + break; + } + assert_se(r > 0); + + assert_se(streq(expected[i++], word)); + } +} + +static void check(const char *test, char** expected, bool trailing) { + size_t i = 0; + int r; + + printf("<<<%s>>>\n", test); + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&test, &word, NULL, EXTRACT_UNQUOTE); + if (r == 0) { + assert_se(!trailing); + break; + } else if (r < 0) { + assert_se(trailing); + break; + } + + assert_se(streq(word, expected[i++])); + printf("<%s>\n", word); + } + assert_se(expected[i] == NULL); +} + +TEST(foreach_word_quoted) { + check("test a b c 'd' e '' '' hhh '' '' \"a b c\"", + STRV_MAKE("test", + "a", + "b", + "c", + "d", + "e", + "", + "", + "hhh", + "", + "", + "a b c"), + false); + + check("test \"xxx", + STRV_MAKE("test"), + true); + + check("test\\", + STRV_MAKE_EMPTY, + true); +} + +TEST(endswith) { + assert_se(endswith("foobar", "bar")); + 
assert_se(endswith("foobar", "")); + assert_se(endswith("foobar", "foobar")); + assert_se(endswith("", "")); + + assert_se(!endswith("foobar", "foo")); + assert_se(!endswith("foobar", "foobarfoofoo")); +} + +TEST(endswith_no_case) { + assert_se(endswith_no_case("fooBAR", "bar")); + assert_se(endswith_no_case("foobar", "")); + assert_se(endswith_no_case("foobar", "FOOBAR")); + assert_se(endswith_no_case("", "")); + + assert_se(!endswith_no_case("foobar", "FOO")); + assert_se(!endswith_no_case("foobar", "FOOBARFOOFOO")); +} + +TEST(delete_chars) { + char *s, input[] = " hello, waldo. abc"; + + s = delete_chars(input, WHITESPACE); + assert_se(streq(s, "hello,waldo.abc")); + assert_se(s == input); +} + +TEST(delete_trailing_chars) { + char *s, + input1[] = " \n \r k \n \r ", + input2[] = "kkkkthiskkkiskkkaktestkkk", + input3[] = "abcdef"; + + s = delete_trailing_chars(input1, WHITESPACE); + assert_se(streq(s, " \n \r k")); + assert_se(s == input1); + + s = delete_trailing_chars(input2, "kt"); + assert_se(streq(s, "kkkkthiskkkiskkkaktes")); + assert_se(s == input2); + + s = delete_trailing_chars(input3, WHITESPACE); + assert_se(streq(s, "abcdef")); + assert_se(s == input3); + + s = delete_trailing_chars(input3, "fe"); + assert_se(streq(s, "abcd")); + assert_se(s == input3); +} + +TEST(delete_trailing_slashes) { + char s1[] = "foobar//", + s2[] = "foobar/", + s3[] = "foobar", + s4[] = ""; + + assert_se(streq(delete_trailing_chars(s1, "_"), "foobar//")); + assert_se(streq(delete_trailing_chars(s1, "/"), "foobar")); + assert_se(streq(delete_trailing_chars(s2, "/"), "foobar")); + assert_se(streq(delete_trailing_chars(s3, "/"), "foobar")); + assert_se(streq(delete_trailing_chars(s4, "/"), "")); +} + +TEST(skip_leading_chars) { + char input1[] = " \n \r k \n \r ", + input2[] = "kkkkthiskkkiskkkaktestkkk", + input3[] = "abcdef"; + + assert_se(streq(skip_leading_chars(input1, WHITESPACE), "k \n \r ")); + assert_se(streq(skip_leading_chars(input2, "k"), 
"thiskkkiskkkaktestkkk")); + assert_se(streq(skip_leading_chars(input2, "tk"), "hiskkkiskkkaktestkkk")); + assert_se(streq(skip_leading_chars(input3, WHITESPACE), "abcdef")); + assert_se(streq(skip_leading_chars(input3, "bcaef"), "def")); +} + +TEST(in_charset) { + assert_se(in_charset("dddaaabbbcccc", "abcd")); + assert_se(!in_charset("dddaaabbbcccc", "abc f")); +} + +TEST(split_pair) { + _cleanup_free_ char *a = NULL, *b = NULL; + + assert_se(split_pair("", "", &a, &b) == -EINVAL); + assert_se(split_pair("foo=bar", "", &a, &b) == -EINVAL); + assert_se(split_pair("", "=", &a, &b) == -EINVAL); + assert_se(split_pair("foo=bar", "=", &a, &b) >= 0); + assert_se(streq(a, "foo")); + assert_se(streq(b, "bar")); + free(a); + free(b); + assert_se(split_pair("==", "==", &a, &b) >= 0); + assert_se(streq(a, "")); + assert_se(streq(b, "")); + free(a); + free(b); + + assert_se(split_pair("===", "==", &a, &b) >= 0); + assert_se(streq(a, "")); + assert_se(streq(b, "=")); +} + +TEST(first_word) { + assert_se(first_word("Hello", "")); + assert_se(first_word("Hello", "Hello")); + assert_se(first_word("Hello world", "Hello")); + assert_se(first_word("Hello\tworld", "Hello")); + assert_se(first_word("Hello\nworld", "Hello")); + assert_se(first_word("Hello\rworld", "Hello")); + assert_se(first_word("Hello ", "Hello")); + + assert_se(!first_word("Hello", "Hellooo")); + assert_se(!first_word("Hello", "xxxxx")); + assert_se(!first_word("Hellooo", "Hello")); +} + +TEST(strlen_ptr) { + assert_se(strlen_ptr("foo") == 3); + assert_se(strlen_ptr("") == 0); + assert_se(strlen_ptr(NULL) == 0); +} + +TEST(memory_startswith) { + assert_se(streq(memory_startswith("", 0, ""), "")); + assert_se(streq(memory_startswith("", 1, ""), "")); + assert_se(streq(memory_startswith("x", 2, ""), "x")); + assert_se(!memory_startswith("", 1, "x")); + assert_se(!memory_startswith("", 1, "xxxxxxxx")); + assert_se(streq(memory_startswith("xxx", 4, "x"), "xx")); + assert_se(streq(memory_startswith("xxx", 4, "xx"), 
"x")); + assert_se(streq(memory_startswith("xxx", 4, "xxx"), "")); + assert_se(!memory_startswith("xxx", 4, "xxxx")); +} + +TEST(memory_startswith_no_case) { + assert_se(streq(memory_startswith_no_case("", 0, ""), "")); + assert_se(streq(memory_startswith_no_case("", 1, ""), "")); + assert_se(streq(memory_startswith_no_case("x", 2, ""), "x")); + assert_se(streq(memory_startswith_no_case("X", 2, ""), "X")); + assert_se(!memory_startswith_no_case("", 1, "X")); + assert_se(!memory_startswith_no_case("", 1, "xxxxXXXX")); + assert_se(streq(memory_startswith_no_case("xxx", 4, "X"), "xx")); + assert_se(streq(memory_startswith_no_case("XXX", 4, "x"), "XX")); + assert_se(streq(memory_startswith_no_case("XXX", 4, "X"), "XX")); + assert_se(streq(memory_startswith_no_case("xxx", 4, "XX"), "x")); + assert_se(streq(memory_startswith_no_case("XXX", 4, "xx"), "X")); + assert_se(streq(memory_startswith_no_case("XXX", 4, "XX"), "X")); + assert_se(streq(memory_startswith_no_case("xxx", 4, "XXX"), "")); + assert_se(streq(memory_startswith_no_case("XXX", 4, "xxx"), "")); + assert_se(streq(memory_startswith_no_case("XXX", 4, "XXX"), "")); + + assert_se(memory_startswith_no_case((char[2]){'x', 'x'}, 2, "xx")); + assert_se(memory_startswith_no_case((char[2]){'x', 'X'}, 2, "xX")); + assert_se(memory_startswith_no_case((char[2]){'X', 'x'}, 2, "Xx")); + assert_se(memory_startswith_no_case((char[2]){'X', 'X'}, 2, "XX")); +} + +static void test_string_truncate_lines_one(const char *input, size_t n_lines, const char *output, bool truncation) { + _cleanup_free_ char *b = NULL; + int k; + + assert_se((k = string_truncate_lines(input, n_lines, &b)) >= 0); + assert_se(streq(b, output)); + assert_se(!!k == truncation); +} + +TEST(string_truncate_lines) { + test_string_truncate_lines_one("", 0, "", false); + test_string_truncate_lines_one("", 1, "", false); + test_string_truncate_lines_one("", 2, "", false); + test_string_truncate_lines_one("", 3, "", false); + + test_string_truncate_lines_one("x", 
0, "", true); + test_string_truncate_lines_one("x", 1, "x", false); + test_string_truncate_lines_one("x", 2, "x", false); + test_string_truncate_lines_one("x", 3, "x", false); + + test_string_truncate_lines_one("x\n", 0, "", true); + test_string_truncate_lines_one("x\n", 1, "x", false); + test_string_truncate_lines_one("x\n", 2, "x", false); + test_string_truncate_lines_one("x\n", 3, "x", false); + + test_string_truncate_lines_one("x\ny", 0, "", true); + test_string_truncate_lines_one("x\ny", 1, "x", true); + test_string_truncate_lines_one("x\ny", 2, "x\ny", false); + test_string_truncate_lines_one("x\ny", 3, "x\ny", false); + + test_string_truncate_lines_one("x\ny\n", 0, "", true); + test_string_truncate_lines_one("x\ny\n", 1, "x", true); + test_string_truncate_lines_one("x\ny\n", 2, "x\ny", false); + test_string_truncate_lines_one("x\ny\n", 3, "x\ny", false); + + test_string_truncate_lines_one("x\ny\nz", 0, "", true); + test_string_truncate_lines_one("x\ny\nz", 1, "x", true); + test_string_truncate_lines_one("x\ny\nz", 2, "x\ny", true); + test_string_truncate_lines_one("x\ny\nz", 3, "x\ny\nz", false); + + test_string_truncate_lines_one("x\ny\nz\n", 0, "", true); + test_string_truncate_lines_one("x\ny\nz\n", 1, "x", true); + test_string_truncate_lines_one("x\ny\nz\n", 2, "x\ny", true); + test_string_truncate_lines_one("x\ny\nz\n", 3, "x\ny\nz", false); + + test_string_truncate_lines_one("\n", 0, "", false); + test_string_truncate_lines_one("\n", 1, "", false); + test_string_truncate_lines_one("\n", 2, "", false); + test_string_truncate_lines_one("\n", 3, "", false); + + test_string_truncate_lines_one("\n\n", 0, "", false); + test_string_truncate_lines_one("\n\n", 1, "", false); + test_string_truncate_lines_one("\n\n", 2, "", false); + test_string_truncate_lines_one("\n\n", 3, "", false); + + test_string_truncate_lines_one("\n\n\n", 0, "", false); + test_string_truncate_lines_one("\n\n\n", 1, "", false); + test_string_truncate_lines_one("\n\n\n", 2, "", false); + 
test_string_truncate_lines_one("\n\n\n", 3, "", false); + + test_string_truncate_lines_one("\nx\n\n", 0, "", true); + test_string_truncate_lines_one("\nx\n\n", 1, "", true); + test_string_truncate_lines_one("\nx\n\n", 2, "\nx", false); + test_string_truncate_lines_one("\nx\n\n", 3, "\nx", false); + + test_string_truncate_lines_one("\n\nx\n", 0, "", true); + test_string_truncate_lines_one("\n\nx\n", 1, "", true); + test_string_truncate_lines_one("\n\nx\n", 2, "", true); + test_string_truncate_lines_one("\n\nx\n", 3, "\n\nx", false); +} + +static void test_string_extract_lines_one(const char *input, size_t i, const char *output, bool more) { + _cleanup_free_ char *b = NULL; + int k; + + assert_se((k = string_extract_line(input, i, &b)) >= 0); + assert_se(streq(b ?: input, output)); + assert_se(!!k == more); +} + +TEST(string_extract_line) { + test_string_extract_lines_one("", 0, "", false); + test_string_extract_lines_one("", 1, "", false); + test_string_extract_lines_one("", 2, "", false); + test_string_extract_lines_one("", 3, "", false); + + test_string_extract_lines_one("x", 0, "x", false); + test_string_extract_lines_one("x", 1, "", false); + test_string_extract_lines_one("x", 2, "", false); + test_string_extract_lines_one("x", 3, "", false); + + test_string_extract_lines_one("x\n", 0, "x", false); + test_string_extract_lines_one("x\n", 1, "", false); + test_string_extract_lines_one("x\n", 2, "", false); + test_string_extract_lines_one("x\n", 3, "", false); + + test_string_extract_lines_one("x\ny", 0, "x", true); + test_string_extract_lines_one("x\ny", 1, "y", false); + test_string_extract_lines_one("x\ny", 2, "", false); + test_string_extract_lines_one("x\ny", 3, "", false); + + test_string_extract_lines_one("x\ny\n", 0, "x", true); + test_string_extract_lines_one("x\ny\n", 1, "y", false); + test_string_extract_lines_one("x\ny\n", 2, "", false); + test_string_extract_lines_one("x\ny\n", 3, "", false); + + test_string_extract_lines_one("x\ny\nz", 0, "x", true); 
+ test_string_extract_lines_one("x\ny\nz", 1, "y", true); + test_string_extract_lines_one("x\ny\nz", 2, "z", false); + test_string_extract_lines_one("x\ny\nz", 3, "", false); + + test_string_extract_lines_one("\n", 0, "", false); + test_string_extract_lines_one("\n", 1, "", false); + test_string_extract_lines_one("\n", 2, "", false); + test_string_extract_lines_one("\n", 3, "", false); + + test_string_extract_lines_one("\n\n", 0, "", true); + test_string_extract_lines_one("\n\n", 1, "", false); + test_string_extract_lines_one("\n\n", 2, "", false); + test_string_extract_lines_one("\n\n", 3, "", false); + + test_string_extract_lines_one("\n\n\n", 0, "", true); + test_string_extract_lines_one("\n\n\n", 1, "", true); + test_string_extract_lines_one("\n\n\n", 2, "", false); + test_string_extract_lines_one("\n\n\n", 3, "", false); + + test_string_extract_lines_one("\n\n\n\n", 0, "", true); + test_string_extract_lines_one("\n\n\n\n", 1, "", true); + test_string_extract_lines_one("\n\n\n\n", 2, "", true); + test_string_extract_lines_one("\n\n\n\n", 3, "", false); + + test_string_extract_lines_one("\nx\n\n\n", 0, "", true); + test_string_extract_lines_one("\nx\n\n\n", 1, "x", true); + test_string_extract_lines_one("\nx\n\n\n", 2, "", true); + test_string_extract_lines_one("\nx\n\n\n", 3, "", false); + + test_string_extract_lines_one("\n\nx\n\n", 0, "", true); + test_string_extract_lines_one("\n\nx\n\n", 1, "", true); + test_string_extract_lines_one("\n\nx\n\n", 2, "x", true); + test_string_extract_lines_one("\n\nx\n\n", 3, "", false); + + test_string_extract_lines_one("\n\n\nx\n", 0, "", true); + test_string_extract_lines_one("\n\n\nx\n", 1, "", true); + test_string_extract_lines_one("\n\n\nx\n", 2, "", true); + test_string_extract_lines_one("\n\n\nx\n", 3, "x", false); +} + +TEST(string_contains_word_strv) { + const char *w; + + assert_se(string_contains_word_strv("a b cc", NULL, STRV_MAKE("a", "b"), NULL)); + + assert_se(string_contains_word_strv("a b cc", NULL, 
STRV_MAKE("a", "b"), &w)); + assert_se(streq(w, "a")); + + assert_se(!string_contains_word_strv("a b cc", NULL, STRV_MAKE("d"), &w)); + assert_se(w == NULL); + + assert_se(string_contains_word_strv("a b cc", NULL, STRV_MAKE("b", "a"), &w)); + assert_se(streq(w, "a")); + + assert_se(string_contains_word_strv("b a b cc", NULL, STRV_MAKE("b", "a", "b"), &w)); + assert_se(streq(w, "b")); + + assert_se(string_contains_word_strv("a b cc", NULL, STRV_MAKE("b", ""), &w)); + assert_se(streq(w, "b")); + + assert_se(!string_contains_word_strv("a b cc", NULL, STRV_MAKE(""), &w)); + assert_se(w == NULL); + + assert_se(string_contains_word_strv("a b cc", " ", STRV_MAKE(""), &w)); + assert_se(streq(w, "")); +} + +TEST(string_contains_word) { + assert_se( string_contains_word("a b cc", NULL, "a")); + assert_se( string_contains_word("a b cc", NULL, "b")); + assert_se(!string_contains_word("a b cc", NULL, "c")); + assert_se( string_contains_word("a b cc", NULL, "cc")); + assert_se(!string_contains_word("a b cc", NULL, "d")); + assert_se(!string_contains_word("a b cc", NULL, "a b")); + assert_se(!string_contains_word("a b cc", NULL, "a b c")); + assert_se(!string_contains_word("a b cc", NULL, "b c")); + assert_se(!string_contains_word("a b cc", NULL, "b cc")); + assert_se(!string_contains_word("a b cc", NULL, "a ")); + assert_se(!string_contains_word("a b cc", NULL, " b ")); + assert_se(!string_contains_word("a b cc", NULL, " cc")); + + assert_se( string_contains_word(" a b\t\tcc", NULL, "a")); + assert_se( string_contains_word(" a b\t\tcc", NULL, "b")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, "c")); + assert_se( string_contains_word(" a b\t\tcc", NULL, "cc")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, "d")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, "a b")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, "a b\t\tc")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, "b\t\tc")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, 
"b\t\tcc")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, "a ")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, " b ")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, " cc")); + + assert_se(!string_contains_word(" a b\t\tcc", NULL, "")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, " ")); + assert_se(!string_contains_word(" a b\t\tcc", NULL, " ")); + assert_se( string_contains_word(" a b\t\tcc", " ", "")); + assert_se( string_contains_word(" a b\t\tcc", "\t", "")); + assert_se( string_contains_word(" a b\t\tcc", WHITESPACE, "")); + + assert_se( string_contains_word("a:b:cc", ":#", "a")); + assert_se( string_contains_word("a:b:cc", ":#", "b")); + assert_se(!string_contains_word("a:b:cc", ":#", "c")); + assert_se( string_contains_word("a:b:cc", ":#", "cc")); + assert_se(!string_contains_word("a:b:cc", ":#", "d")); + assert_se(!string_contains_word("a:b:cc", ":#", "a:b")); + assert_se(!string_contains_word("a:b:cc", ":#", "a:b:c")); + assert_se(!string_contains_word("a:b:cc", ":#", "b:c")); + assert_se(!string_contains_word("a#b#cc", ":#", "b:cc")); + assert_se( string_contains_word("a#b#cc", ":#", "b")); + assert_se( string_contains_word("a#b#cc", ":#", "cc")); + assert_se(!string_contains_word("a:b:cc", ":#", "a:")); + assert_se(!string_contains_word("a:b cc", ":#", "b")); + assert_se( string_contains_word("a:b cc", ":#", "b cc")); + assert_se(!string_contains_word("a:b:cc", ":#", ":cc")); +} + +static void test_strverscmp_improved_one(const char* a, const char *b, int expected) { + int r = strverscmp_improved(a, b); + + log_info("'%s' %s '%s'%s", + strnull(a), + comparison_operator(r), + strnull(b), + r == expected ? 
"" : " !!!!!!!!!!!!!"); + assert_se(r == expected); +} + +static void test_strverscmp_improved_newer(const char *older, const char *newer) { + test_strverscmp_improved_one(older, newer, -1); + + assert_se(strverscmp_improved(older, older) == 0); + assert_se(strverscmp_improved(older, newer) < 0); + assert_se(strverscmp_improved(newer, older) > 0); + assert_se(strverscmp_improved(newer, newer) == 0); +} + +TEST(strverscmp_improved) { + static const char * const versions[] = { + "~1", + "", + "ab", + "abb", + "abc", + "0001", + "002", + "12", + "122", + "122.9", + "123~rc1", + "123", + "123-a", + "123-a.1", + "123-a1", + "123-a1.1", + "123-3", + "123-3.1", + "123^patch1", + "123^1", + "123.a-1", + "123.1-1", + "123a-1", + "124", + NULL, + }; + + STRV_FOREACH(p, versions) + STRV_FOREACH(q, p + 1) + test_strverscmp_improved_newer(*p, *q); + + test_strverscmp_improved_newer("123.45-67.88", "123.45-67.89"); + test_strverscmp_improved_newer("123.45-67.89", "123.45-67.89a"); + test_strverscmp_improved_newer("123.45-67.ab", "123.45-67.89"); + test_strverscmp_improved_newer("123.45-67.9", "123.45-67.89"); + test_strverscmp_improved_newer("123.45-67", "123.45-67.89"); + test_strverscmp_improved_newer("123.45-66.89", "123.45-67.89"); + test_strverscmp_improved_newer("123.45-9.99", "123.45-67.89"); + test_strverscmp_improved_newer("123.42-99.99", "123.45-67.89"); + test_strverscmp_improved_newer("123-99.99", "123.45-67.89"); + + /* '~' : pre-releases */ + test_strverscmp_improved_newer("123~rc1-99.99", "123.45-67.89"); + test_strverscmp_improved_newer("123~rc1-99.99", "123-45.67.89"); + test_strverscmp_improved_newer("123~rc1-99.99", "123~rc2-67.89"); + test_strverscmp_improved_newer("123~rc1-99.99", "123^aa2-67.89"); + test_strverscmp_improved_newer("123~rc1-99.99", "123aa2-67.89"); + + /* '-' : separator between version and release. 
*/ + test_strverscmp_improved_newer("123-99.99", "123.45-67.89"); + test_strverscmp_improved_newer("123-99.99", "123^aa2-67.89"); + test_strverscmp_improved_newer("123-99.99", "123aa2-67.89"); + + /* '^' : patch releases */ + test_strverscmp_improved_newer("123^45-67.89", "123.45-67.89"); + test_strverscmp_improved_newer("123^aa1-99.99", "123^aa2-67.89"); + test_strverscmp_improved_newer("123^aa2-67.89", "123aa2-67.89"); + + /* '.' : point release */ + test_strverscmp_improved_newer("123.aa2-67.89", "123aa2-67.89"); + test_strverscmp_improved_newer("123.aa2-67.89", "123.ab2-67.89"); + + /* invalid characters */ + assert_se(strverscmp_improved("123_aa2-67.89", "123aa+2-67.89") == 0); + + /* some corner cases */ + assert_se(strverscmp_improved("123.", "123") > 0); /* One more version segment */ + assert_se(strverscmp_improved("12_3", "123") < 0); /* 12 < 123 */ + assert_se(strverscmp_improved("12_3", "12") > 0); /* 3 > '' */ + assert_se(strverscmp_improved("12_3", "12.3") > 0); /* 3 > '' */ + assert_se(strverscmp_improved("123.0", "123") > 0); /* 0 > '' */ + assert_se(strverscmp_improved("123_0", "123") > 0); /* 0 > '' */ + assert_se(strverscmp_improved("123..0", "123.0") < 0); /* '' < 0 */ + + /* empty strings or strings with ignored characters only */ + assert_se(strverscmp_improved("", NULL) == 0); + assert_se(strverscmp_improved(NULL, "") == 0); + assert_se(strverscmp_improved("0_", "0") == 0); + assert_se(strverscmp_improved("_0_", "0") == 0); + assert_se(strverscmp_improved("_0", "0") == 0); + assert_se(strverscmp_improved("0", "0___") == 0); + assert_se(strverscmp_improved("", "_") == 0); + assert_se(strverscmp_improved("_", "") == 0); + assert_se(strverscmp_improved("_", "_") == 0); + assert_se(strverscmp_improved("", "~") > 0); + assert_se(strverscmp_improved("~", "") < 0); + assert_se(strverscmp_improved("~", "~") == 0); + + /* non-ASCII digits */ + (void) setlocale(LC_NUMERIC, "ar_YE.utf8"); + assert_se(strverscmp_improved("1٠١٢٣٤٥٦٧٨٩", "1") == 0); + + 
(void) setlocale(LC_NUMERIC, "th_TH.utf8"); + assert_se(strverscmp_improved("1๐๑๒๓๔๕๖๗๘๙", "1") == 0); +} + +#define RPMVERCMP(a, b, c) \ + test_strverscmp_improved_one(STRINGIFY(a), STRINGIFY(b), (c)) + +TEST(strverscmp_improved_rpm) { + /* Tests copied from rpm's rpmio test suite, under the LGPL license: + * https://github.com/rpm-software-management/rpm/blob/master/tests/rpmvercmp.at. + * The original form is retained for easy comparisons and updates. + */ + + RPMVERCMP(1.0, 1.0, 0); + RPMVERCMP(1.0, 2.0, -1); + RPMVERCMP(2.0, 1.0, 1); + + RPMVERCMP(2.0.1, 2.0.1, 0); + RPMVERCMP(2.0, 2.0.1, -1); + RPMVERCMP(2.0.1, 2.0, 1); + + RPMVERCMP(2.0.1a, 2.0.1a, 0); + RPMVERCMP(2.0.1a, 2.0.1, 1); + RPMVERCMP(2.0.1, 2.0.1a, -1); + + RPMVERCMP(5.5p1, 5.5p1, 0); + RPMVERCMP(5.5p1, 5.5p2, -1); + RPMVERCMP(5.5p2, 5.5p1, 1); + + RPMVERCMP(5.5p10, 5.5p10, 0); + RPMVERCMP(5.5p1, 5.5p10, -1); + RPMVERCMP(5.5p10, 5.5p1, 1); + + RPMVERCMP(10xyz, 10.1xyz, 1); /* Note: this is reversed from rpm's vercmp */ + RPMVERCMP(10.1xyz, 10xyz, -1); /* Note: this is reversed from rpm's vercmp */ + + RPMVERCMP(xyz10, xyz10, 0); + RPMVERCMP(xyz10, xyz10.1, -1); + RPMVERCMP(xyz10.1, xyz10, 1); + + RPMVERCMP(xyz.4, xyz.4, 0); + RPMVERCMP(xyz.4, 8, -1); + RPMVERCMP(8, xyz.4, 1); + RPMVERCMP(xyz.4, 2, -1); + RPMVERCMP(2, xyz.4, 1); + + RPMVERCMP(5.5p2, 5.6p1, -1); + RPMVERCMP(5.6p1, 5.5p2, 1); + + RPMVERCMP(5.6p1, 6.5p1, -1); + RPMVERCMP(6.5p1, 5.6p1, 1); + + RPMVERCMP(6.0.rc1, 6.0, 1); + RPMVERCMP(6.0, 6.0.rc1, -1); + + RPMVERCMP(10b2, 10a1, 1); + RPMVERCMP(10a2, 10b2, -1); + + RPMVERCMP(1.0aa, 1.0aa, 0); + RPMVERCMP(1.0a, 1.0aa, -1); + RPMVERCMP(1.0aa, 1.0a, 1); + + RPMVERCMP(10.0001, 10.0001, 0); + RPMVERCMP(10.0001, 10.1, 0); + RPMVERCMP(10.1, 10.0001, 0); + RPMVERCMP(10.0001, 10.0039, -1); + RPMVERCMP(10.0039, 10.0001, 1); + + RPMVERCMP(4.999.9, 5.0, -1); + RPMVERCMP(5.0, 4.999.9, 1); + + RPMVERCMP(20101121, 20101121, 0); + RPMVERCMP(20101121, 20101122, -1); + RPMVERCMP(20101122, 20101121, 1); + 
+ RPMVERCMP(2_0, 2_0, 0); + RPMVERCMP(2.0, 2_0, -1); /* Note: in rpm those compare equal */ + RPMVERCMP(2_0, 2.0, 1); /* Note: in rpm those compare equal */ + + /* RhBug:178798 case */ + RPMVERCMP(a, a, 0); + RPMVERCMP(a+, a+, 0); + RPMVERCMP(a+, a_, 0); + RPMVERCMP(a_, a+, 0); + RPMVERCMP(+a, +a, 0); + RPMVERCMP(+a, _a, 0); + RPMVERCMP(_a, +a, 0); + RPMVERCMP(+_, +_, 0); + RPMVERCMP(_+, +_, 0); + RPMVERCMP(_+, _+, 0); + RPMVERCMP(+, _, 0); + RPMVERCMP(_, +, 0); + + /* Basic testcases for tilde sorting */ + RPMVERCMP(1.0~rc1, 1.0~rc1, 0); + RPMVERCMP(1.0~rc1, 1.0, -1); + RPMVERCMP(1.0, 1.0~rc1, 1); + RPMVERCMP(1.0~rc1, 1.0~rc2, -1); + RPMVERCMP(1.0~rc2, 1.0~rc1, 1); + RPMVERCMP(1.0~rc1~git123, 1.0~rc1~git123, 0); + RPMVERCMP(1.0~rc1~git123, 1.0~rc1, -1); + RPMVERCMP(1.0~rc1, 1.0~rc1~git123, 1); + + /* Basic testcases for caret sorting */ + RPMVERCMP(1.0^, 1.0^, 0); + RPMVERCMP(1.0^, 1.0, 1); + RPMVERCMP(1.0, 1.0^, -1); + RPMVERCMP(1.0^git1, 1.0^git1, 0); + RPMVERCMP(1.0^git1, 1.0, 1); + RPMVERCMP(1.0, 1.0^git1, -1); + RPMVERCMP(1.0^git1, 1.0^git2, -1); + RPMVERCMP(1.0^git2, 1.0^git1, 1); + RPMVERCMP(1.0^git1, 1.01, -1); + RPMVERCMP(1.01, 1.0^git1, 1); + RPMVERCMP(1.0^20160101, 1.0^20160101, 0); + RPMVERCMP(1.0^20160101, 1.0.1, -1); + RPMVERCMP(1.0.1, 1.0^20160101, 1); + RPMVERCMP(1.0^20160101^git1, 1.0^20160101^git1, 0); + RPMVERCMP(1.0^20160102, 1.0^20160101^git1, 1); + RPMVERCMP(1.0^20160101^git1, 1.0^20160102, -1); + + /* Basic testcases for tilde and caret sorting */ + RPMVERCMP(1.0~rc1^git1, 1.0~rc1^git1, 0); + RPMVERCMP(1.0~rc1^git1, 1.0~rc1, 1); + RPMVERCMP(1.0~rc1, 1.0~rc1^git1, -1); + RPMVERCMP(1.0^git1~pre, 1.0^git1~pre, 0); + RPMVERCMP(1.0^git1, 1.0^git1~pre, 1); + RPMVERCMP(1.0^git1~pre, 1.0^git1, -1); + + /* These are included here to document current, arguably buggy behaviors + * for reference purposes and for easy checking against unintended + * behavior changes. 
*/ + log_info("/* RPM version comparison oddities */"); + /* RhBug:811992 case */ + RPMVERCMP(1b.fc17, 1b.fc17, 0); + RPMVERCMP(1b.fc17, 1.fc17, 1); /* Note: this is reversed from rpm's vercmp, WAT! */ + RPMVERCMP(1.fc17, 1b.fc17, -1); + RPMVERCMP(1g.fc17, 1g.fc17, 0); + RPMVERCMP(1g.fc17, 1.fc17, 1); + RPMVERCMP(1.fc17, 1g.fc17, -1); + + /* Non-ascii characters are considered equal so these are all the same, eh… */ + RPMVERCMP(1.1.α, 1.1.α, 0); + RPMVERCMP(1.1.α, 1.1.β, 0); + RPMVERCMP(1.1.β, 1.1.α, 0); + RPMVERCMP(1.1.αα, 1.1.α, 0); + RPMVERCMP(1.1.α, 1.1.ββ, 0); + RPMVERCMP(1.1.ββ, 1.1.αα, 0); +} + +TEST(strextendf) { + _cleanup_free_ char *p = NULL; + + assert_se(strextendf(&p, "<%i>", 77) >= 0); + assert_se(streq(p, "<77>")); + + assert_se(strextendf(&p, "<%i>", 99) >= 0); + assert_se(streq(p, "<77><99>")); + + assert_se(strextendf(&p, "<%80i>", 88) >= 0); + assert_se(streq(p, "<77><99>< 88>")); + + assert_se(strextendf(&p, "<%08x>", 0x1234u) >= 0); + assert_se(streq(p, "<77><99>< 88><00001234>")); + + p = mfree(p); + + assert_se(strextendf_with_separator(&p, ",", "<%i>", 77) >= 0); + assert_se(streq(p, "<77>")); + + assert_se(strextendf_with_separator(&p, ",", "<%i>", 99) >= 0); + assert_se(streq(p, "<77>,<99>")); + + assert_se(strextendf_with_separator(&p, ",", "<%80i>", 88) >= 0); + assert_se(streq(p, "<77>,<99>,< 88>")); + + assert_se(strextendf_with_separator(&p, ",", "<%08x>", 0x1234u) >= 0); + assert_se(streq(p, "<77>,<99>,< 88>,<00001234>")); +} + +TEST(string_replace_char) { + assert_se(streq(string_replace_char(strdupa_safe(""), 'a', 'b'), "")); + assert_se(streq(string_replace_char(strdupa_safe("abc"), 'a', 'b'), "bbc")); + assert_se(streq(string_replace_char(strdupa_safe("hoge"), 'a', 'b'), "hoge")); + assert_se(streq(string_replace_char(strdupa_safe("aaaa"), 'a', 'b'), "bbbb")); + assert_se(streq(string_replace_char(strdupa_safe("aaaa"), 'a', '\t'), "\t\t\t\t")); +} + +TEST(strspn_from_end) { + assert_se(strspn_from_end(NULL, NULL) == 0); + 
assert_se(strspn_from_end("hoge", NULL) == 0); + assert_se(strspn_from_end(NULL, DIGITS) == 0); + assert_se(strspn_from_end("", DIGITS) == 0); + assert_se(strspn_from_end("hoge", DIGITS) == 0); + assert_se(strspn_from_end("1234", DIGITS) == 4); + assert_se(strspn_from_end("aaa1234", DIGITS) == 4); + assert_se(strspn_from_end("aaa1234aaa", DIGITS) == 0); + assert_se(strspn_from_end("aaa12aa34", DIGITS) == 2); +} + +TEST(streq_skip_trailing_chars) { + /* NULL is WHITESPACE by default*/ + assert_se(streq_skip_trailing_chars("foo bar", "foo bar", NULL)); + assert_se(streq_skip_trailing_chars("foo", "foo", NULL)); + assert_se(streq_skip_trailing_chars("foo bar ", "foo bar", NULL)); + assert_se(streq_skip_trailing_chars("foo bar", "foo bar\t\t", NULL)); + assert_se(streq_skip_trailing_chars("foo bar ", "foo bar\t\t", NULL)); + assert_se(streq_skip_trailing_chars("foo\nbar", "foo\nbar", NULL)); + assert_se(streq_skip_trailing_chars("\t\tfoo bar", "\t\tfoo bar", NULL)); + assert_se(streq_skip_trailing_chars(" foo bar\t", " foo bar\n", NULL)); + + assert_se(!streq_skip_trailing_chars("foobar", "foo bar", NULL)); + assert_se(!streq_skip_trailing_chars("foo\nbar", "foo\tbar", NULL)); + assert_se(!streq_skip_trailing_chars("\t\nfoo bar", "\t foo bar", NULL)); + + assert_se(streq_skip_trailing_chars("foo bar ", "foo bar", WHITESPACE)); + assert_se(!streq_skip_trailing_chars("foo bar ", "foo bar", NEWLINE)); + + assert_se(streq_skip_trailing_chars(NULL, NULL, NULL)); + assert_se(streq_skip_trailing_chars("", "", NULL)); + assert_se(!streq_skip_trailing_chars(NULL, "foo bar", NULL)); + assert_se(!streq_skip_trailing_chars("foo", NULL, NULL)); + assert_se(!streq_skip_trailing_chars("", "f", NULL)); +} + +#define TEST_MAKE_CSTRING_ONE(x, ret, mode, expect) \ + do { \ + _cleanup_free_ char *b = NULL; \ + assert_se(make_cstring((x), ELEMENTSOF(x), (mode), &b) == (ret)); \ + assert_se(streq_ptr(b, (expect))); \ + } while(false) + +TEST(make_cstring) { + static const char test1[] = 
"this is a test", + test2[] = "", + test3[] = "a", + test4[] = "aa\0aa", + test5[] = { 'b', 'b', 0, 'b' , 'b' }, + test6[] = {}, + test7[] = { 'x' }, + test8[] = { 'x', 'y', 'z' }; + + TEST_MAKE_CSTRING_ONE(test1, -EINVAL, MAKE_CSTRING_REFUSE_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test1, 0, MAKE_CSTRING_ALLOW_TRAILING_NUL, "this is a test"); + TEST_MAKE_CSTRING_ONE(test1, 0, MAKE_CSTRING_REQUIRE_TRAILING_NUL, "this is a test"); + + TEST_MAKE_CSTRING_ONE(test2, -EINVAL, MAKE_CSTRING_REFUSE_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test2, 0, MAKE_CSTRING_ALLOW_TRAILING_NUL, ""); + TEST_MAKE_CSTRING_ONE(test2, 0, MAKE_CSTRING_REQUIRE_TRAILING_NUL, ""); + + TEST_MAKE_CSTRING_ONE(test3, -EINVAL, MAKE_CSTRING_REFUSE_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test3, 0, MAKE_CSTRING_ALLOW_TRAILING_NUL, "a"); + TEST_MAKE_CSTRING_ONE(test3, 0, MAKE_CSTRING_REQUIRE_TRAILING_NUL, "a"); + + TEST_MAKE_CSTRING_ONE(test4, -EINVAL, MAKE_CSTRING_REFUSE_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test4, -EINVAL, MAKE_CSTRING_ALLOW_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test4, -EINVAL, MAKE_CSTRING_REQUIRE_TRAILING_NUL, NULL); + + TEST_MAKE_CSTRING_ONE(test5, -EINVAL, MAKE_CSTRING_REFUSE_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test5, -EINVAL, MAKE_CSTRING_ALLOW_TRAILING_NUL, NULL); + TEST_MAKE_CSTRING_ONE(test5, -EINVAL, MAKE_CSTRING_REQUIRE_TRAILING_NUL, NULL); + + TEST_MAKE_CSTRING_ONE(test6, 0, MAKE_CSTRING_REFUSE_TRAILING_NUL, ""); + TEST_MAKE_CSTRING_ONE(test6, 0, MAKE_CSTRING_ALLOW_TRAILING_NUL, ""); + TEST_MAKE_CSTRING_ONE(test6, -EINVAL, MAKE_CSTRING_REQUIRE_TRAILING_NUL, NULL); + + TEST_MAKE_CSTRING_ONE(test7, 0, MAKE_CSTRING_REFUSE_TRAILING_NUL, "x"); + TEST_MAKE_CSTRING_ONE(test7, 0, MAKE_CSTRING_ALLOW_TRAILING_NUL, "x"); + TEST_MAKE_CSTRING_ONE(test7, -EINVAL, MAKE_CSTRING_REQUIRE_TRAILING_NUL, NULL); + + TEST_MAKE_CSTRING_ONE(test8, 0, MAKE_CSTRING_REFUSE_TRAILING_NUL, "xyz"); + TEST_MAKE_CSTRING_ONE(test8, 0, MAKE_CSTRING_ALLOW_TRAILING_NUL, 
"xyz"); + TEST_MAKE_CSTRING_ONE(test8, -EINVAL, MAKE_CSTRING_REQUIRE_TRAILING_NUL, NULL); +} + +TEST(find_line_startswith) { + static const char text[] = + "foobar\n" + "this is a test\n" + "foobar: waldo\n" + "more\n" + "\n" + "piff\n" + "foobarfoobar\n" + "iff\n"; + static const char emptystring[] = ""; + + assert_se(find_line_startswith(text, "") == text); + assert_se(find_line_startswith(text, "f") == text+1); + assert_se(find_line_startswith(text, "foobar") == text+6); + assert_se(!find_line_startswith(text, "foobarx")); + assert_se(!find_line_startswith(text, "oobar")); + assert_se(find_line_startswith(text, "t") == text + 8); + assert_se(find_line_startswith(text, "th") == text + 9); + assert_se(find_line_startswith(text, "this") == text + 11); + assert_se(find_line_startswith(text, "foobarf") == text + 54); + assert_se(find_line_startswith(text, "more\n") == text + 41); + assert_se(find_line_startswith(text, "\n") == text + 42); + assert_se(find_line_startswith(text, "iff") == text + 63); + + assert_se(find_line_startswith(emptystring, "") == emptystring); + assert_se(!find_line_startswith(emptystring, "x")); +} + +TEST(strstrafter) { + static const char buffer[] = "abcdefghijklmnopqrstuvwxyz"; + + assert_se(!strstrafter(NULL, NULL)); + assert_se(!strstrafter("", NULL)); + assert_se(!strstrafter(NULL, "")); + assert_se(streq_ptr(strstrafter("", ""), "")); + + assert_se(strstrafter(buffer, "a") == buffer + 1); + assert_se(strstrafter(buffer, "") == buffer); + assert_se(strstrafter(buffer, "ab") == buffer + 2); + assert_se(strstrafter(buffer, "cde") == buffer + 5); + assert_se(strstrafter(buffer, "xyz") == strchr(buffer, 0)); + assert_se(strstrafter(buffer, buffer) == strchr(buffer, 0)); + assert_se(!strstrafter(buffer, "-")); +} + +TEST(version_is_valid) { + assert_se(!version_is_valid(NULL)); + assert_se(!version_is_valid("")); + assert_se(version_is_valid("0")); + assert_se(version_is_valid("5")); + assert_se(version_is_valid("999999")); + 
assert_se(version_is_valid("999999.5")); + assert_se(version_is_valid("6.2.12-300.fc38.x86_64")); +} + +TEST(strextendn) { + _cleanup_free_ char *x = NULL; + + assert_se(streq_ptr(strextendn(&x, NULL, 0), "")); + x = mfree(x); + + assert_se(streq_ptr(strextendn(&x, "", 0), "")); + x = mfree(x); + + assert_se(streq_ptr(strextendn(&x, "xxx", 3), "xxx")); + assert_se(streq_ptr(strextendn(&x, "xxx", 3), "xxxxxx")); + assert_se(streq_ptr(strextendn(&x, "...", 1), "xxxxxx.")); + assert_se(streq_ptr(strextendn(&x, "...", 2), "xxxxxx...")); + assert_se(streq_ptr(strextendn(&x, "...", 3), "xxxxxx......")); + assert_se(streq_ptr(strextendn(&x, "...", 4), "xxxxxx.........")); + x = mfree(x); +} + +TEST(strlevenshtein) { + assert_se(strlevenshtein(NULL, NULL) == 0); + assert_se(strlevenshtein("", "") == 0); + assert_se(strlevenshtein("", NULL) == 0); + assert_se(strlevenshtein(NULL, "") == 0); + + assert_se(strlevenshtein("a", "a") == 0); + assert_se(strlevenshtein("a", "b") == 1); + assert_se(strlevenshtein("b", "a") == 1); + assert_se(strlevenshtein("a", "") == 1); + assert_se(strlevenshtein("", "a") == 1); + + assert_se(strlevenshtein("xxx", "xxx") == 0); + assert_se(strlevenshtein("xxx", "yyy") == 3); + assert_se(strlevenshtein("yyy", "xxx") == 3); + assert_se(strlevenshtein("xx", "xxx") == 1); + assert_se(strlevenshtein("xxx", "xx") == 1); + assert_se(strlevenshtein("x", "xxx") == 2); + assert_se(strlevenshtein("xxx", "x") == 2); + + assert_se(strlevenshtein("sitting", "kitten") == 3); + assert_se(strlevenshtein("sunday", "saturday") == 3); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-strip-tab-ansi.c b/src/test/test-strip-tab-ansi.c new file mode 100644 index 0000000..6f73d26 --- /dev/null +++ b/src/test/test-strip-tab-ansi.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "alloc-util.h" +#include "pretty-print.h" +#include "string-util.h" +#include "terminal-util.h" +#include "tests.h" + +TEST(strip_tab_ansi) { 
+ _cleanup_free_ char *urlified = NULL, *q = NULL, *qq = NULL; + char *p, *z; + + assert_se(p = strdup("\tFoobar\tbar\twaldo\t")); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + fprintf(stdout, "<%s>\n", p); + assert_se(streq(p, " Foobar bar waldo ")); + free(p); + + assert_se(p = strdup(ANSI_HIGHLIGHT "Hello" ANSI_NORMAL ANSI_HIGHLIGHT_RED " world!" ANSI_NORMAL)); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + fprintf(stdout, "<%s>\n", p); + assert_se(streq(p, "Hello world!")); + free(p); + + assert_se(p = strdup("\x1B[\x1B[\t\x1B[" ANSI_HIGHLIGHT "\x1B[" "Hello" ANSI_NORMAL ANSI_HIGHLIGHT_RED " world!" ANSI_NORMAL)); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + assert_se(streq(p, "\x1B[\x1B[ \x1B[\x1B[Hello world!")); + free(p); + + assert_se(p = strdup("\x1B[waldo")); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + assert_se(streq(p, "\x1B[waldo")); + free(p); + + assert_se(p = strdup("\r\rwaldo")); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + assert_se(streq(p, "\r\rwaldo")); + free(p); + + assert_se(p = strdup("waldo\r\r")); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + assert_se(streq(p, "waldo")); + free(p); + + assert_se(p = strdup("waldo\r\r\n\r\n")); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + assert_se(streq(p, "waldo\n\n")); + free(p); + + assert_se(terminal_urlify_path("/etc/fstab", "i am a fabulous link", &urlified) >= 0); + assert_se(p = strjoin("something ", urlified, " something-else")); + assert_se(q = strdup(p)); + printf("<%s>\n", p); + assert_se(strip_tab_ansi(&p, NULL, NULL)); + printf("<%s>\n", p); + assert_se(streq(p, "something i am a fabulous link something-else")); + p = mfree(p); + + /* Truncate the formatted string in the middle of an ANSI sequence (in which case we shouldn't touch the + * incomplete sequence) */ + z = strstr(q, "fstab"); + if (z) { + *z = 0; + assert_se(qq = strdup(q)); + assert_se(strip_tab_ansi(&q, NULL, NULL)); + assert_se(streq(q, qq)); + } +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-strv.c 
b/src/test/test-strv.c new file mode 100644 index 0000000..cfd662b --- /dev/null +++ b/src/test/test-strv.c @@ -0,0 +1,1009 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "escape.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +TEST(str_in_set) { + assert_se(STR_IN_SET("x", "x", "y", "z")); + assert_se(!STR_IN_SET("X", "x", "y", "z")); + assert_se(!STR_IN_SET("", "x", "y", "z")); + assert_se(STR_IN_SET("x", "w", "x")); +} + +TEST(strptr_in_set) { + assert_se(STRPTR_IN_SET("x", "x", "y", "z")); + assert_se(!STRPTR_IN_SET("X", "x", "y", "z")); + assert_se(!STRPTR_IN_SET("", "x", "y", "z")); + assert_se(STRPTR_IN_SET("x", "w", "x")); + + assert_se(!STRPTR_IN_SET(NULL, "x", "y", "z")); + assert_se(!STRPTR_IN_SET(NULL, "")); + /* strv cannot contain a null, hence the result below */ + assert_se(!STRPTR_IN_SET(NULL, NULL)); +} + +TEST(startswith_set) { + assert_se(!STARTSWITH_SET("foo", "bar", "baz", "waldo")); + assert_se(!STARTSWITH_SET("foo", "bar")); + + assert_se(STARTSWITH_SET("abc", "a", "ab", "abc")); + assert_se(STARTSWITH_SET("abc", "ax", "ab", "abc")); + assert_se(STARTSWITH_SET("abc", "ax", "abx", "abc")); + assert_se(!STARTSWITH_SET("abc", "ax", "abx", "abcx")); + + assert_se(streq_ptr(STARTSWITH_SET("foobar", "hhh", "kkk", "foo", "zzz"), "bar")); + assert_se(streq_ptr(STARTSWITH_SET("foobar", "hhh", "kkk", "", "zzz"), "foobar")); + assert_se(streq_ptr(STARTSWITH_SET("", "hhh", "kkk", "zzz", ""), "")); +} + +static const char* const input_table_multiple[] = { + "one", + "two", + "three", + NULL, +}; + +static const char* const input_table_quoted[] = { + "one", + " two\t three ", + " four five", + NULL, +}; + +static const char* const input_table_quoted_joined[] = { + "one", + " two\t three " " four five", + NULL, +}; + +static const char* const input_table_one[] = { + "one", + NULL, +}; + +static const char* const input_table_none[] = { + NULL, +}; + +static const char* const 
input_table_two_empties[] = { + "", + "", + NULL, +}; + +static const char* const input_table_one_empty[] = { + "", + NULL, +}; + +static const char* const input_table_unescape[] = { + "ID_VENDOR=QEMU", + "ID_VENDOR_ENC=QEMUx20x20x20x20", + "ID_MODEL_ENC=QEMUx20HARDDISKx20x20x20", + NULL, +}; + +static const char* const input_table_retain_escape[] = { + "ID_VENDOR=QEMU", + "ID_VENDOR_ENC=QEMU\\x20\\x20\\x20\\x20", + "ID_MODEL_ENC=QEMU\\x20HARDDISK\\x20\\x20\\x20", + NULL, +}; + +TEST(strv_find) { + assert_se(strv_find((char **)input_table_multiple, "three")); + assert_se(!strv_find((char **)input_table_multiple, "four")); +} + +TEST(strv_find_prefix) { + assert_se(strv_find_prefix((char **)input_table_multiple, "o")); + assert_se(strv_find_prefix((char **)input_table_multiple, "one")); + assert_se(strv_find_prefix((char **)input_table_multiple, "")); + assert_se(!strv_find_prefix((char **)input_table_multiple, "xxx")); + assert_se(!strv_find_prefix((char **)input_table_multiple, "onee")); +} + +TEST(strv_find_startswith) { + char *r; + + r = strv_find_startswith((char **)input_table_multiple, "o"); + assert_se(r && streq(r, "ne")); + + r = strv_find_startswith((char **)input_table_multiple, "one"); + assert_se(r && streq(r, "")); + + r = strv_find_startswith((char **)input_table_multiple, ""); + assert_se(r && streq(r, "one")); + + assert_se(!strv_find_startswith((char **)input_table_multiple, "xxx")); + assert_se(!strv_find_startswith((char **)input_table_multiple, "onee")); +} + +TEST(strv_join) { + _cleanup_free_ char *p = strv_join((char **)input_table_multiple, ", "); + assert_se(p); + assert_se(streq(p, "one, two, three")); + + _cleanup_free_ char *q = strv_join((char **)input_table_multiple, ";"); + assert_se(q); + assert_se(streq(q, "one;two;three")); + + _cleanup_free_ char *r = strv_join((char **)input_table_multiple, NULL); + assert_se(r); + assert_se(streq(r, "one two three")); + + _cleanup_free_ char *s = strv_join(STRV_MAKE("1", "2", "3,3"), ","); + 
assert_se(s); + assert_se(streq(s, "1,2,3,3")); + + _cleanup_free_ char *t = strv_join((char **)input_table_one, ", "); + assert_se(t); + assert_se(streq(t, "one")); + + _cleanup_free_ char *u = strv_join((char **)input_table_none, ", "); + assert_se(u); + assert_se(streq(u, "")); + + _cleanup_free_ char *v = strv_join((char **)input_table_two_empties, ", "); + assert_se(v); + assert_se(streq(v, ", ")); + + _cleanup_free_ char *w = strv_join((char **)input_table_one_empty, ", "); + assert_se(w); + assert_se(streq(w, "")); +} + +TEST(strv_join_full) { + _cleanup_free_ char *p = strv_join_full((char **)input_table_multiple, ", ", "foo", false); + assert_se(p); + assert_se(streq(p, "fooone, footwo, foothree")); + + _cleanup_free_ char *q = strv_join_full((char **)input_table_multiple, ";", "foo", false); + assert_se(q); + assert_se(streq(q, "fooone;footwo;foothree")); + + _cleanup_free_ char *r = strv_join_full(STRV_MAKE("a", "a;b", "a:c"), ";", NULL, true); + assert_se(r); + assert_se(streq(r, "a;a\\;b;a:c")); + + _cleanup_free_ char *s = strv_join_full(STRV_MAKE("a", "a;b", "a;;c", ";", ";x"), ";", NULL, true); + assert_se(s); + assert_se(streq(s, "a;a\\;b;a\\;\\;c;\\;;\\;x")); + + _cleanup_free_ char *t = strv_join_full(STRV_MAKE("a", "a;b", "a:c", ";"), ";", "=", true); + assert_se(t); + assert_se(streq(t, "=a;=a\\;b;=a:c;=\\;")); + t = mfree(t); + + _cleanup_free_ char *u = strv_join_full((char **)input_table_multiple, NULL, "foo", false); + assert_se(u); + assert_se(streq(u, "fooone footwo foothree")); + + _cleanup_free_ char *v = strv_join_full((char **)input_table_one, ", ", "foo", false); + assert_se(v); + assert_se(streq(v, "fooone")); + + _cleanup_free_ char *w = strv_join_full((char **)input_table_none, ", ", "foo", false); + assert_se(w); + assert_se(streq(w, "")); + + _cleanup_free_ char *x = strv_join_full((char **)input_table_two_empties, ", ", "foo", false); + assert_se(x); + assert_se(streq(x, "foo, foo")); + + _cleanup_free_ char *y = 
strv_join_full((char **)input_table_one_empty, ", ", "foo", false); + assert_se(y); + assert_se(streq(y, "foo")); +} + +static void test_strv_unquote_one(const char *quoted, char **list) { + _cleanup_strv_free_ char **s = NULL; + _cleanup_free_ char *j = NULL; + unsigned i = 0; + int r; + + log_info("/* %s */", __func__); + + r = strv_split_full(&s, quoted, WHITESPACE, EXTRACT_UNQUOTE); + assert_se(r == (int) strv_length(list)); + assert_se(s); + j = strv_join(s, " | "); + assert_se(j); + puts(j); + + STRV_FOREACH(t, s) + assert_se(streq(list[i++], *t)); + + assert_se(list[i] == NULL); +} + +TEST(strv_unquote) { + test_strv_unquote_one(" foo=bar \"waldo\" zzz ", STRV_MAKE("foo=bar", "waldo", "zzz")); + test_strv_unquote_one("", STRV_MAKE_EMPTY); + test_strv_unquote_one(" ", STRV_MAKE_EMPTY); + test_strv_unquote_one(" ", STRV_MAKE_EMPTY); + test_strv_unquote_one(" x", STRV_MAKE("x")); + test_strv_unquote_one("x ", STRV_MAKE("x")); + test_strv_unquote_one(" x ", STRV_MAKE("x")); + test_strv_unquote_one(" \"x\" ", STRV_MAKE("x")); + test_strv_unquote_one(" 'x' ", STRV_MAKE("x")); + test_strv_unquote_one(" 'x\"' ", STRV_MAKE("x\"")); + test_strv_unquote_one(" \"x'\" ", STRV_MAKE("x'")); + test_strv_unquote_one("a '--b=c \"d e\"'", STRV_MAKE("a", "--b=c \"d e\"")); + + /* trailing backslashes */ + test_strv_unquote_one(" x\\\\", STRV_MAKE("x\\")); +} + +static void test_invalid_unquote_one(const char *quoted) { + char **s = NULL; + int r; + + log_info("/* %s */", __func__); + + r = strv_split_full(&s, quoted, WHITESPACE, EXTRACT_UNQUOTE); + assert_se(s == NULL); + assert_se(r == -EINVAL); +} + +TEST(invalid_unquote) { + test_invalid_unquote_one(" x\\"); + test_invalid_unquote_one("a --b='c \"d e\"''"); + test_invalid_unquote_one("a --b='c \"d e\" '\""); + test_invalid_unquote_one("a --b='c \"d e\"garbage"); + test_invalid_unquote_one("'"); + test_invalid_unquote_one("\""); + test_invalid_unquote_one("'x'y'g"); +} + +TEST(strv_split) { + _cleanup_strv_free_erase_ char 
**l = NULL; + const char str[] = "one,two,three"; + + l = strv_split(str, ","); + assert_se(l); + assert_se(strv_equal(l, (char**) input_table_multiple)); + + strv_free_erase(l); + + l = strv_split(" one two\t three", WHITESPACE); + assert_se(l); + assert_se(strv_equal(l, (char**) input_table_multiple)); + + strv_free_erase(l); + + /* Setting NULL for separator is equivalent to WHITESPACE */ + l = strv_split(" one two\t three", NULL); + assert_se(l); + assert_se(strv_equal(l, (char**) input_table_multiple)); + + strv_free_erase(l); + + assert_se(strv_split_full(&l, " one two\t three", NULL, 0) == 3); + assert_se(strv_equal(l, (char**) input_table_multiple)); + + strv_free_erase(l); + + assert_se(strv_split_full(&l, " 'one' \" two\t three \" ' four five'", NULL, EXTRACT_UNQUOTE) == 3); + assert_se(strv_equal(l, (char**) input_table_quoted)); + + l = strv_free_erase(l); + + /* missing last quote causes extraction to fail. */ + assert_se(strv_split_full(&l, " 'one' \" two\t three \" ' four five", NULL, EXTRACT_UNQUOTE) == -EINVAL); + assert_se(!l); + + /* missing last quote, but the last element is _not_ ignored with EXTRACT_RELAX. 
*/ + assert_se(strv_split_full(&l, " 'one' \" two\t three \" ' four five", NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX) == 3); + assert_se(strv_equal(l, (char**) input_table_quoted)); + + l = strv_free_erase(l); + + /* missing separator between items */ + assert_se(strv_split_full(&l, " 'one' \" two\t three \"' four five'", NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX) == 2); + assert_se(strv_equal(l, (char**) input_table_quoted_joined)); + + l = strv_free_erase(l); + + assert_se(strv_split_full(&l, " 'one' \" two\t three \"' four five", NULL, + EXTRACT_UNQUOTE | EXTRACT_RELAX | EXTRACT_UNESCAPE_RELAX) == 2); + assert_se(strv_equal(l, (char**) input_table_quoted_joined)); + + l = strv_free_erase(l); + + assert_se(strv_split_full(&l, "\\", NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX | EXTRACT_UNESCAPE_RELAX) == 1); + assert_se(strv_equal(l, STRV_MAKE("\\"))); + + l = strv_free_erase(l); + + assert_se(l = strv_split("\\", NULL)); + assert_se(strv_equal(l, STRV_MAKE("\\"))); + + l = strv_free_erase(l); + + assert_se(l = strv_split("aa\\ bb\\", NULL)); + assert_se(strv_equal(l, STRV_MAKE("aa\\", "bb\\"))); + + l = strv_free_erase(l); + + assert_se(l = strv_split("aa\" bb'", NULL)); + assert_se(strv_equal(l, STRV_MAKE("aa\"", "bb'"))); +} + +TEST(strv_split_empty) { + _cleanup_strv_free_ char **l = NULL; + + l = strv_split("", WHITESPACE); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(l = strv_split("", NULL)); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, "", NULL, 0) == 0); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, "", NULL, EXTRACT_UNQUOTE) == 0); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, "", WHITESPACE, EXTRACT_UNQUOTE) == 0); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, "", WHITESPACE, EXTRACT_UNQUOTE | EXTRACT_RELAX) == 0); + assert_se(l); + 
assert_se(strv_isempty(l)); + strv_free(l); + + l = strv_split(" ", WHITESPACE); + assert_se(l); + assert_se(strv_isempty(l)); + strv_free(l); + + l = strv_split(" ", NULL); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, " ", NULL, 0) == 0); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, " ", WHITESPACE, EXTRACT_UNQUOTE) == 0); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, " ", NULL, EXTRACT_UNQUOTE) == 0); + assert_se(l); + assert_se(strv_isempty(l)); + l = strv_free(l); + + assert_se(strv_split_full(&l, " ", NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX) == 0); + assert_se(l); + assert_se(strv_isempty(l)); +} + +TEST(strv_split_full) { + _cleanup_strv_free_ char **l = NULL; + const char *str = ":foo\\:bar::waldo:"; + int r; + + r = strv_split_full(&l, str, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + assert_se(r == (int) strv_length(l)); + assert_se(streq_ptr(l[0], "")); + assert_se(streq_ptr(l[1], "foo:bar")); + assert_se(streq_ptr(l[2], "")); + assert_se(streq_ptr(l[3], "waldo")); + assert_se(streq_ptr(l[4], "")); + assert_se(streq_ptr(l[5], NULL)); +} + +TEST(strv_split_and_extend_full) { + _cleanup_strv_free_ char **l = NULL; + const char *str1 = ":foo\\:bar:"; + const char *str2 = "waldo::::::baz"; + int r; + + r = strv_split_and_extend(&l, "", ":", false); + assert_se(r == (int) strv_length(l)); + r = strv_split_and_extend_full(&l, str1, ":", false, EXTRACT_DONT_COALESCE_SEPARATORS); + assert_se(r == (int) strv_length(l)); + assert_se(streq_ptr(l[0], "")); + assert_se(streq_ptr(l[1], "foo:bar")); + assert_se(streq_ptr(l[2], "")); + r = strv_split_and_extend_full(&l, str2, ":", false, 0); + assert_se(r == (int) strv_length(l)); + assert_se(streq_ptr(l[3], "waldo")); + assert_se(streq_ptr(l[4], "baz")); + assert_se(streq_ptr(l[5], NULL)); +} + +TEST(strv_split_colon_pairs) { + _cleanup_strv_free_ char **l = NULL; 
+ const char *str = "one:two three four:five six seven:eight\\:nine ten\\:eleven\\\\", + *str_inval="one:two three:four:five"; + int r; + + r = strv_split_colon_pairs(&l, str); + assert_se(r == (int) strv_length(l)); + assert_se(r == 12); + assert_se(streq_ptr(l[0], "one")); + assert_se(streq_ptr(l[1], "two")); + assert_se(streq_ptr(l[2], "three")); + assert_se(streq_ptr(l[3], "")); + assert_se(streq_ptr(l[4], "four")); + assert_se(streq_ptr(l[5], "five")); + assert_se(streq_ptr(l[6], "six")); + assert_se(streq_ptr(l[7], "")); + assert_se(streq_ptr(l[8], "seven")); + assert_se(streq_ptr(l[9], "eight:nine")); + assert_se(streq_ptr(l[10], "ten:eleven\\")); + assert_se(streq_ptr(l[11], "")); + assert_se(streq_ptr(l[12], NULL)); + + r = strv_split_colon_pairs(&l, str_inval); + assert_se(r == -EINVAL); +} + +TEST(strv_split_newlines) { + unsigned i = 0; + _cleanup_strv_free_ char **l = NULL; + const char str[] = "one\ntwo\nthree"; + + l = strv_split_newlines(str); + assert_se(l); + + STRV_FOREACH(s, l) + assert_se(streq(*s, input_table_multiple[i++])); +} + +TEST(strv_split_newlines_full) { + const char str[] = + "ID_VENDOR=QEMU\n" + "ID_VENDOR_ENC=QEMU\\x20\\x20\\x20\\x20\n" + "ID_MODEL_ENC=QEMU\\x20HARDDISK\\x20\\x20\\x20\n" + "\n\n\n"; + _cleanup_strv_free_ char **l = NULL; + + assert_se(strv_split_newlines_full(&l, str, 0) == 3); + assert_se(strv_equal(l, (char**) input_table_unescape)); + + l = strv_free(l); + + assert_se(strv_split_newlines_full(&l, str, EXTRACT_RETAIN_ESCAPE) == 3); + assert_se(strv_equal(l, (char**) input_table_retain_escape)); +} + +TEST(strv_overlap) { + const char * const input_table[] = { + "one", + "two", + "three", + NULL + }; + const char * const input_table_overlap[] = { + "two", + NULL + }; + const char * const input_table_unique[] = { + "four", + "five", + "six", + NULL + }; + + assert_se(strv_overlap((char **)input_table, (char**)input_table_overlap)); + assert_se(!strv_overlap((char **)input_table, (char**)input_table_unique)); +} + 
+TEST(strv_sort) { + const char* input_table[] = { + "durian", + "apple", + "citrus", + "CAPITAL LETTERS FIRST", + "banana", + NULL + }; + + strv_sort((char **)input_table); + + assert_se(streq(input_table[0], "CAPITAL LETTERS FIRST")); + assert_se(streq(input_table[1], "apple")); + assert_se(streq(input_table[2], "banana")); + assert_se(streq(input_table[3], "citrus")); + assert_se(streq(input_table[4], "durian")); +} + +TEST(strv_extend_strv_concat) { + _cleanup_strv_free_ char **a = NULL, **b = NULL; + + a = strv_new("without", "suffix"); + b = strv_new("with", "suffix"); + assert_se(a); + assert_se(b); + + assert_se(strv_extend_strv_concat(&a, b, "_suffix") >= 0); + + assert_se(streq(a[0], "without")); + assert_se(streq(a[1], "suffix")); + assert_se(streq(a[2], "with_suffix")); + assert_se(streq(a[3], "suffix_suffix")); +} + +TEST(strv_extend_strv) { + _cleanup_strv_free_ char **a = NULL, **b = NULL, **n = NULL; + + a = strv_new("abc", "def", "ghi"); + b = strv_new("jkl", "mno", "abc", "pqr"); + assert_se(a); + assert_se(b); + + assert_se(strv_extend_strv(&a, b, true) == 3); + + assert_se(streq(a[0], "abc")); + assert_se(streq(a[1], "def")); + assert_se(streq(a[2], "ghi")); + assert_se(streq(a[3], "jkl")); + assert_se(streq(a[4], "mno")); + assert_se(streq(a[5], "pqr")); + assert_se(strv_length(a) == 6); + + assert_se(strv_extend_strv(&n, b, false) >= 0); + assert_se(streq(n[0], "jkl")); + assert_se(streq(n[1], "mno")); + assert_se(streq(n[2], "abc")); + assert_se(streq(n[3], "pqr")); + assert_se(strv_length(n) == 4); +} + +TEST(strv_extend_with_size) { + _cleanup_strv_free_ char **a = NULL; + size_t n = SIZE_MAX; + + a = strv_new("test", "test1"); + assert_se(a); + + assert_se(strv_extend_with_size(&a, &n, "test2") >= 0); + assert_se(n == 3); + assert_se(strv_extend_with_size(&a, &n, "test3") >= 0); + assert_se(n == 4); + + assert_se(streq(a[0], "test")); + assert_se(streq(a[1], "test1")); + assert_se(streq(a[2], "test2")); + assert_se(streq(a[3], "test3")); + 
assert_se(a[4] == NULL); +} + +TEST(strv_extend) { + _cleanup_strv_free_ char **a = NULL, **b = NULL; + + a = strv_new("test", "test1"); + assert_se(a); + assert_se(strv_extend(&a, "test2") >= 0); + assert_se(strv_extend(&b, "test3") >= 0); + + assert_se(streq(a[0], "test")); + assert_se(streq(a[1], "test1")); + assert_se(streq(a[2], "test2")); + assert_se(streq(b[0], "test3")); +} + +TEST(strv_extendf) { + _cleanup_strv_free_ char **a = NULL, **b = NULL; + + a = strv_new("test", "test1"); + assert_se(a); + assert_se(strv_extendf(&a, "test2 %s %d %s", "foo", 128, "bar") >= 0); + assert_se(strv_extendf(&b, "test3 %s %s %d", "bar", "foo", 128) >= 0); + + assert_se(streq(a[0], "test")); + assert_se(streq(a[1], "test1")); + assert_se(streq(a[2], "test2 foo 128 bar")); + assert_se(streq(b[0], "test3 bar foo 128")); +} + +TEST(strv_foreach) { + _cleanup_strv_free_ char **a = NULL; /* initialized so the cleanup handler never sees an indeterminate pointer */ + unsigned i = 0; + + a = strv_new("one", "two", "three"); + assert_se(a); + + STRV_FOREACH(check, a) + assert_se(streq(*check, input_table_multiple[i++])); +} + +TEST(strv_foreach_backwards) { + _cleanup_strv_free_ char **a = NULL; /* initialized so the cleanup handler never sees an indeterminate pointer */ + unsigned i = 2; + + a = strv_new("one", "two", "three"); + + assert_se(a); + + STRV_FOREACH_BACKWARDS(check, a) + assert_se(streq_ptr(*check, input_table_multiple[i--])); + + STRV_FOREACH_BACKWARDS(check, (char**) NULL) + assert_not_reached(); + + STRV_FOREACH_BACKWARDS(check, STRV_MAKE_EMPTY) + assert_not_reached(); + + unsigned count = 0; + STRV_FOREACH_BACKWARDS(check, STRV_MAKE("ONE")) + count++; + assert_se(count == 1); +} + +TEST(strv_foreach_pair) { + _cleanup_strv_free_ char **a = NULL; + + a = strv_new("pair_one", "pair_one", + "pair_two", "pair_two", + "pair_three", "pair_three"); + STRV_FOREACH_PAIR(x, y, a) + assert_se(streq(*x, *y)); +} + +static void test_strv_from_stdarg_alloca_one(char **l, const char *first, ...) 
{ + char **j; + unsigned i; + + log_info("/* %s */", __func__); + + j = strv_from_stdarg_alloca(first); + + for (i = 0;; i++) { + assert_se(streq_ptr(l[i], j[i])); + + if (!l[i]) + break; + } +} + +TEST(strv_from_stdarg_alloca) { + test_strv_from_stdarg_alloca_one(STRV_MAKE("foo", "bar"), "foo", "bar", NULL); + test_strv_from_stdarg_alloca_one(STRV_MAKE("foo"), "foo", NULL); + test_strv_from_stdarg_alloca_one(STRV_MAKE_EMPTY, NULL); +} + +TEST(strv_insert) { + _cleanup_strv_free_ char **a = NULL; + + assert_se(strv_insert(&a, 0, strdup("first")) == 0); + assert_se(streq(a[0], "first")); + assert_se(!a[1]); + + assert_se(strv_insert(&a, 0, NULL) == 0); + assert_se(streq(a[0], "first")); + assert_se(!a[1]); + + assert_se(strv_insert(&a, 1, strdup("two")) == 0); + assert_se(streq(a[0], "first")); + assert_se(streq(a[1], "two")); + assert_se(!a[2]); + + assert_se(strv_insert(&a, 4, strdup("tri")) == 0); + assert_se(streq(a[0], "first")); + assert_se(streq(a[1], "two")); + assert_se(streq(a[2], "tri")); + assert_se(!a[3]); + + assert_se(strv_insert(&a, 1, strdup("duo")) == 0); + assert_se(streq(a[0], "first")); + assert_se(streq(a[1], "duo")); + assert_se(streq(a[2], "two")); + assert_se(streq(a[3], "tri")); + assert_se(!a[4]); +} + +TEST(strv_push_prepend) { + _cleanup_strv_free_ char **a = NULL; + + assert_se(a = strv_new("foo", "bar", "three")); + + assert_se(strv_push_prepend(&a, strdup("first")) >= 0); + assert_se(streq(a[0], "first")); + assert_se(streq(a[1], "foo")); + assert_se(streq(a[2], "bar")); + assert_se(streq(a[3], "three")); + assert_se(!a[4]); + + assert_se(strv_consume_prepend(&a, strdup("first2")) >= 0); + assert_se(streq(a[0], "first2")); + assert_se(streq(a[1], "first")); + assert_se(streq(a[2], "foo")); + assert_se(streq(a[3], "bar")); + assert_se(streq(a[4], "three")); + assert_se(!a[5]); +} + +TEST(strv_push_with_size) { + _cleanup_strv_free_ char **a = NULL; + size_t n = 0; + char *i, *j; + + assert_se(i = strdup("foo")); + 
assert_se(strv_push_with_size(&a, &n, i) >= 0); + assert_se(n == 1); + + assert_se(i = strdup("a")); + assert_se(j = strdup("b")); + assert_se(strv_push_with_size(&a, &n, i) >= 0); + assert_se(n == 2); + assert_se(strv_push_with_size(&a, &n, j) >= 0); + assert_se(n == 3); + + assert_se(streq_ptr(a[0], "foo")); + assert_se(streq_ptr(a[1], "a")); + assert_se(streq_ptr(a[2], "b")); + assert_se(streq_ptr(a[3], NULL)); + + assert_se(n == strv_length(a)); /* compare, not assign: the tracked size must equal the real vector length */ +} + +TEST(strv_push) { + _cleanup_strv_free_ char **a = NULL; + char *i, *j; + + assert_se(i = strdup("foo")); + assert_se(strv_push(&a, i) >= 0); + + assert_se(i = strdup("a")); + assert_se(j = strdup("b")); + assert_se(strv_push_pair(&a, i, j) >= 0); + + assert_se(streq_ptr(a[0], "foo")); + assert_se(streq_ptr(a[1], "a")); + assert_se(streq_ptr(a[2], "b")); + assert_se(streq_ptr(a[3], NULL)); +} + +TEST(strv_compare) { + _cleanup_strv_free_ char **a = NULL; + _cleanup_strv_free_ char **b = NULL; + _cleanup_strv_free_ char **c = NULL; + _cleanup_strv_free_ char **d = NULL; + + a = strv_new("one", "two", "three"); + assert_se(a); + b = strv_new("one", "two", "three"); + assert_se(b); + c = strv_new("one", "two", "three", "four"); + assert_se(c); + d = strv_new(NULL); + assert_se(d); + + assert_se(strv_compare(a, a) == 0); + assert_se(strv_compare(a, b) == 0); + assert_se(strv_compare(d, d) == 0); + assert_se(strv_compare(d, NULL) == 0); + assert_se(strv_compare(NULL, NULL) == 0); + + assert_se(strv_compare(a, c) < 0); + assert_se(strv_compare(b, c) < 0); + assert_se(strv_compare(b, d) == 1); + assert_se(strv_compare(b, NULL) == 1); +} + +TEST(strv_is_uniq) { + _cleanup_strv_free_ char **a = NULL, **b = NULL, **c = NULL, **d = NULL; + + a = strv_new(NULL); + assert_se(a); + assert_se(strv_is_uniq(a)); + + b = strv_new("foo"); + assert_se(b); + assert_se(strv_is_uniq(b)); + + c = strv_new("foo", "bar"); + assert_se(c); + assert_se(strv_is_uniq(c)); + + d = strv_new("foo", "bar", "waldo", "bar", "piep"); + assert_se(d); + 
assert_se(!strv_is_uniq(d)); +} + +TEST(strv_reverse) { + _cleanup_strv_free_ char **a = NULL, **b = NULL, **c = NULL, **d = NULL; + + a = strv_new(NULL); + assert_se(a); + + strv_reverse(a); + assert_se(strv_isempty(a)); + + b = strv_new("foo"); + assert_se(b); + strv_reverse(b); + assert_se(streq_ptr(b[0], "foo")); + assert_se(streq_ptr(b[1], NULL)); + + c = strv_new("foo", "bar"); + assert_se(c); + strv_reverse(c); + assert_se(streq_ptr(c[0], "bar")); + assert_se(streq_ptr(c[1], "foo")); + assert_se(streq_ptr(c[2], NULL)); + + d = strv_new("foo", "bar", "waldo"); + assert_se(d); + strv_reverse(d); + assert_se(streq_ptr(d[0], "waldo")); + assert_se(streq_ptr(d[1], "bar")); + assert_se(streq_ptr(d[2], "foo")); + assert_se(streq_ptr(d[3], NULL)); +} + +TEST(strv_shell_escape) { + _cleanup_strv_free_ char **v = NULL; + + v = strv_new("foo:bar", "bar,baz", "wal\\do"); + assert_se(v); + assert_se(strv_shell_escape(v, ",:")); + assert_se(streq_ptr(v[0], "foo\\:bar")); + assert_se(streq_ptr(v[1], "bar\\,baz")); + assert_se(streq_ptr(v[2], "wal\\\\do")); + assert_se(streq_ptr(v[3], NULL)); +} + +static void test_strv_skip_one(char **a, size_t n, char **b) { + a = strv_skip(a, n); + assert_se(strv_equal(a, b)); +} + +TEST(strv_skip) { + test_strv_skip_one(STRV_MAKE("foo", "bar", "baz"), 0, STRV_MAKE("foo", "bar", "baz")); + test_strv_skip_one(STRV_MAKE("foo", "bar", "baz"), 1, STRV_MAKE("bar", "baz")); + test_strv_skip_one(STRV_MAKE("foo", "bar", "baz"), 2, STRV_MAKE("baz")); + test_strv_skip_one(STRV_MAKE("foo", "bar", "baz"), 3, STRV_MAKE(NULL)); + test_strv_skip_one(STRV_MAKE("foo", "bar", "baz"), 4, STRV_MAKE(NULL)); + test_strv_skip_one(STRV_MAKE("foo", "bar", "baz"), 55, STRV_MAKE(NULL)); + + test_strv_skip_one(STRV_MAKE("quux"), 0, STRV_MAKE("quux")); + test_strv_skip_one(STRV_MAKE("quux"), 1, STRV_MAKE(NULL)); + test_strv_skip_one(STRV_MAKE("quux"), 55, STRV_MAKE(NULL)); + + test_strv_skip_one(STRV_MAKE(NULL), 0, STRV_MAKE(NULL)); + 
test_strv_skip_one(STRV_MAKE(NULL), 1, STRV_MAKE(NULL)); + test_strv_skip_one(STRV_MAKE(NULL), 55, STRV_MAKE(NULL)); +} + +TEST(strv_extend_n) { + _cleanup_strv_free_ char **v = NULL; + + v = strv_new("foo", "bar"); + assert_se(v); + + assert_se(strv_extend_n(&v, "waldo", 3) >= 0); + assert_se(strv_extend_n(&v, "piep", 2) >= 0); + + assert_se(streq(v[0], "foo")); + assert_se(streq(v[1], "bar")); + assert_se(streq(v[2], "waldo")); + assert_se(streq(v[3], "waldo")); + assert_se(streq(v[4], "waldo")); + assert_se(streq(v[5], "piep")); + assert_se(streq(v[6], "piep")); + assert_se(v[7] == NULL); + + v = strv_free(v); + + assert_se(strv_extend_n(&v, "foo", 1) >= 0); + assert_se(strv_extend_n(&v, "bar", 0) >= 0); + + assert_se(streq(v[0], "foo")); + assert_se(v[1] == NULL); +} + +TEST(foreach_string) { + const char * const t[] = { + "foo", + "bar", + "waldo", + NULL + }; + + unsigned i = 0; + FOREACH_STRING(x, "foo", "bar", "waldo") + assert_se(streq_ptr(t[i++], x)); + assert_se(i == 3); + + FOREACH_STRING(x, "zzz") + assert_se(streq(x, "zzz")); +} + +TEST(strv_fnmatch) { + _cleanup_strv_free_ char **v = NULL; + size_t pos; + + assert_se(!strv_fnmatch(STRV_MAKE_EMPTY, "a")); + + v = strv_new("xxx", "*\\*", "yyy"); + assert_se(!strv_fnmatch_full(v, "\\", 0, NULL)); + assert_se(strv_fnmatch_full(v, "\\", FNM_NOESCAPE, &pos)); + assert_se(pos == 1); +} + +TEST(strv_extend_join) { + _cleanup_strv_free_ char **v = NULL; + + assert_se(strv_extend_assignment(&v, "MESSAGE", "ABC") >= 0); + assert_se(strv_extend_assignment(&v, "ABC", "QER") >= 0); + assert_se(strv_extend_assignment(&v, "MISSING", NULL) >= 0); + + assert_se(strv_length(v) == 2); + assert_se(streq(v[0], "MESSAGE=ABC")); + assert_se(streq(v[1], "ABC=QER")); +} + +TEST(strv_copy_n) { + char **x = STRV_MAKE("a", "b", "c", "d", "e"); + _cleanup_strv_free_ char **l = NULL; + + l = strv_copy_n(x, 0); + assert_se(strv_equal(l, NULL)); + strv_free(l); + + l = strv_copy_n(x, 0); + assert_se(strv_equal(l, (char**) { NULL 
})); + strv_free(l); + + l = strv_copy_n(x, 1); + assert_se(strv_equal(l, STRV_MAKE("a"))); + strv_free(l); + + l = strv_copy_n(x, 2); + assert_se(strv_equal(l, STRV_MAKE("a", "b"))); + strv_free(l); + + l = strv_copy_n(x, 3); + assert_se(strv_equal(l, STRV_MAKE("a", "b", "c"))); + strv_free(l); + + l = strv_copy_n(x, 4); + assert_se(strv_equal(l, STRV_MAKE("a", "b", "c", "d"))); + strv_free(l); + + l = strv_copy_n(x, 5); + assert_se(strv_equal(l, STRV_MAKE("a", "b", "c", "d", "e"))); + strv_free(l); + + l = strv_copy_n(x, 6); + assert_se(strv_equal(l, STRV_MAKE("a", "b", "c", "d", "e"))); + strv_free(l); + + l = strv_copy_n(x, SIZE_MAX); + assert_se(strv_equal(l, STRV_MAKE("a", "b", "c", "d", "e"))); +} + +TEST(strv_find_first_field) { + char **haystack = STRV_MAKE("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"); + + assert_se(strv_find_first_field(NULL, NULL) == NULL); + assert_se(strv_find_first_field(NULL, haystack) == NULL); + assert_se(strv_find_first_field(STRV_MAKE("k", "l", "m", "d", "b"), NULL) == NULL); + assert_se(strv_find_first_field(STRV_MAKE("k", "l", "m", "d", "b"), haystack) == NULL); + assert_se(streq_ptr(strv_find_first_field(STRV_MAKE("k", "l", "m", "d", "a", "c"), haystack), "b")); + assert_se(streq_ptr(strv_find_first_field(STRV_MAKE("k", "l", "m", "d", "c", "a"), haystack), "d")); + assert_se(streq_ptr(strv_find_first_field(STRV_MAKE("i", "k", "l", "m", "d", "c", "a", "b"), haystack), "j")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-strxcpyx.c b/src/test/test-strxcpyx.c new file mode 100644 index 0000000..b679522 --- /dev/null +++ b/src/test/test-strxcpyx.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "string-util.h" +#include "strxcpyx.h" +#include "tests.h" + +TEST(strpcpy) { + char target[25]; + char *s = target; + size_t space_left; + bool truncated; + + space_left = sizeof(target); + space_left = strpcpy_full(&s, space_left, "12345", &truncated); + 
assert_se(!truncated); + space_left = strpcpy_full(&s, space_left, "hey hey hey", &truncated); + assert_se(!truncated); + space_left = strpcpy_full(&s, space_left, "waldo", &truncated); + assert_se(!truncated); + space_left = strpcpy_full(&s, space_left, "ba", &truncated); + assert_se(!truncated); + space_left = strpcpy_full(&s, space_left, "r", &truncated); + assert_se(!truncated); + assert_se(space_left == 1); + assert_se(streq(target, "12345hey hey heywaldobar")); + + space_left = strpcpy_full(&s, space_left, "", &truncated); + assert_se(!truncated); + assert_se(space_left == 1); + assert_se(streq(target, "12345hey hey heywaldobar")); + + space_left = strpcpy_full(&s, space_left, "f", &truncated); + assert_se(truncated); + assert_se(space_left == 0); + assert_se(streq(target, "12345hey hey heywaldobar")); + + space_left = strpcpy_full(&s, space_left, "", &truncated); + assert_se(!truncated); + assert_se(space_left == 0); + assert_se(streq(target, "12345hey hey heywaldobar")); + + space_left = strpcpy_full(&s, space_left, "foo", &truncated); + assert_se(truncated); + assert_se(space_left == 0); + assert_se(streq(target, "12345hey hey heywaldobar")); +} + +TEST(strpcpyf) { + char target[25]; + char *s = target; + size_t space_left; + bool truncated; + + space_left = sizeof(target); + space_left = strpcpyf_full(&s, space_left, &truncated, "space left: %zu. ", space_left); + assert_se(!truncated); + space_left = strpcpyf_full(&s, space_left, &truncated, "foo%s", "bar"); + assert_se(!truncated); + assert_se(space_left == 3); + assert_se(streq(target, "space left: 25. foobar")); + + space_left = strpcpyf_full(&s, space_left, &truncated, "%i", 42); + assert_se(!truncated); + assert_se(space_left == 1); + assert_se(streq(target, "space left: 25. foobar42")); + + space_left = strpcpyf_full(&s, space_left, &truncated, "%s", ""); + assert_se(!truncated); + assert_se(space_left == 1); + assert_se(streq(target, "space left: 25. 
foobar42")); + + space_left = strpcpyf_full(&s, space_left, &truncated, "%c", 'x'); + assert_se(truncated); + assert_se(space_left == 0); + assert_se(streq(target, "space left: 25. foobar42")); + + space_left = strpcpyf_full(&s, space_left, &truncated, "%s", ""); + assert_se(!truncated); + assert_se(space_left == 0); + assert_se(streq(target, "space left: 25. foobar42")); + + space_left = strpcpyf_full(&s, space_left, &truncated, "abc%s", "hoge"); + assert_se(truncated); + assert_se(space_left == 0); + assert_se(streq(target, "space left: 25. foobar42")); + + /* test overflow */ + s = target; + space_left = strpcpyf_full(&s, 12, &truncated, "00 left: %i. ", 999); + assert_se(truncated); + assert_se(streq(target, "00 left: 99")); + assert_se(space_left == 0); + assert_se(target[12] == '2'); +} + +TEST(strpcpyl) { + char target[25]; + char *s = target; + size_t space_left; + bool truncated; + + space_left = sizeof(target); + space_left = strpcpyl_full(&s, space_left, &truncated, "waldo", " test", " waldo. ", NULL); + assert_se(!truncated); + space_left = strpcpyl_full(&s, space_left, &truncated, "Banana", NULL); + assert_se(!truncated); + assert_se(space_left == 1); + assert_se(streq(target, "waldo test waldo. Banana")); + + space_left = strpcpyl_full(&s, space_left, &truncated, "", "", "", NULL); + assert_se(!truncated); + assert_se(space_left == 1); + assert_se(streq(target, "waldo test waldo. Banana")); + + space_left = strpcpyl_full(&s, space_left, &truncated, "", "x", "", NULL); + assert_se(truncated); + assert_se(space_left == 0); + assert_se(streq(target, "waldo test waldo. Banana")); + + space_left = strpcpyl_full(&s, space_left, &truncated, "hoge", NULL); + assert_se(truncated); + assert_se(space_left == 0); + assert_se(streq(target, "waldo test waldo. 
Banana")); +} + +TEST(strscpy) { + char target[25]; + size_t space_left; + bool truncated; + + space_left = sizeof(target); + space_left = strscpy_full(target, space_left, "12345", &truncated); + assert_se(!truncated); + + assert_se(streq(target, "12345")); + assert_se(space_left == 20); +} + +TEST(strscpyl) { + char target[25]; + size_t space_left; + bool truncated; + + space_left = sizeof(target); + space_left = strscpyl_full(target, space_left, &truncated, "12345", "waldo", "waldo", NULL); + assert_se(!truncated); + + assert_se(streq(target, "12345waldowaldo")); + assert_se(space_left == 10); +} + +TEST(sd_event_code_migration) { + char b[100 * DECIMAL_STR_MAX(unsigned) + 1]; + char c[100 * DECIMAL_STR_MAX(unsigned) + 1], *p; + unsigned i; + size_t l; + int o, r; + + for (i = o = 0; i < 100; i++) { + r = snprintf(&b[o], sizeof(b) - o, "%u ", i); + assert_se(r >= 0 && r < (int) sizeof(b) - o); + o += r; + } + + p = c; + l = sizeof(c); + for (i = 0; i < 100; i++) + l = strpcpyf(&p, l, "%u ", i); + + assert_se(streq(b, c)); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-sysctl-util.c b/src/test/test-sysctl-util.c new file mode 100644 index 0000000..81207f5 --- /dev/null +++ b/src/test/test-sysctl-util.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-id128.h" + +#include "errno-util.h" +#include "hostname-util.h" +#include "strv.h" +#include "sysctl-util.h" +#include "tests.h" + +static const char* const cases[] = { + "a.b.c", "a/b/c", + "a/b/c", "a/b/c", + "a/b.c/d", "a/b.c/d", + "a.b/c.d", "a/b.c/d", + + "net.ipv4.conf.enp3s0/200.forwarding", "net/ipv4/conf/enp3s0.200/forwarding", + "net/ipv4/conf/enp3s0.200/forwarding", "net/ipv4/conf/enp3s0.200/forwarding", + + "a...b...c", "a/b/c", + "a///b///c", "a/b/c", + ".a...b...c", "a/b/c", + "/a///b///c", "a/b/c", + NULL, +}; + +TEST(sysctl_normalize) { + STRV_FOREACH_PAIR(s, expected, cases) { + _cleanup_free_ char *t; + + assert_se(t = strdup(*s)); + 
assert_se(sysctl_normalize(t) == t); + + log_info("\"%s\" → \"%s\", expected \"%s\"", *s, t, *expected); + assert_se(streq(t, *expected)); + } +} + +TEST(sysctl_read) { + _cleanup_free_ char *s = NULL; + struct utsname u; + sd_id128_t a, b; + int r; + + assert_se(sysctl_read("kernel/random/boot_id", &s) >= 0); + assert_se(sd_id128_from_string(s, &a) >= 0); + assert_se(sd_id128_get_boot(&b) >= 0); + assert_se(sd_id128_equal(a, b)); + s = mfree(s); + + assert_se(sysctl_read_ip_property(AF_INET, "lo", "forwarding", &s)); + assert_se(STR_IN_SET(s, "0", "1")); + + r = sysctl_write_ip_property(AF_INET, "lo", "forwarding", s); + assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS); + s = mfree(s); + + assert_se(sysctl_read_ip_property(AF_INET, NULL, "ip_forward", &s)); + assert_se(STR_IN_SET(s, "0", "1")); + + r = sysctl_write_ip_property(AF_INET, NULL, "ip_forward", s); + assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS); + s = mfree(s); + + assert_se(sysctl_read("kernel/hostname", &s) >= 0); + assert_se(uname(&u) >= 0); + assert_se(streq_ptr(s, u.nodename)); + + r = sysctl_write("kernel/hostname", s); + assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-tables.c b/src/test/test-tables.c new file mode 100644 index 0000000..8abfba5 --- /dev/null +++ b/src/test/test-tables.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "architecture.h" +#include "automount.h" +#include "cgroup.h" +#include "cgroup-util.h" +#include "compress.h" +#include "condition.h" +#include "confidential-virt.h" +#include "device-private.h" +#include "device.h" +#include "discover-image.h" +#include "execute.h" +#include "import-util.h" +#include "install.h" +#include "job.h" +#include "kill.h" +#include "locale-util.h" +#include "log.h" +#include "logs-show.h" +#include "mount.h" +#include "netif-naming-scheme.h" +#include "path.h" +#include "process-util.h" +#include "resolve-util.h" 
+#include "rlimit-util.h" +#include "scope.h" +#include "service.h" +#include "show-status.h" +#include "slice.h" +#include "socket-util.h" +#include "socket.h" +#include "swap.h" +#include "target.h" +#include "test-tables.h" +#include "tests.h" +#include "timer.h" +#include "unit-name.h" +#include "unit.h" +#include "virt.h" + /* Invokes test_table() once for every (lower-case name, upper-case constant prefix) pair listed below. NOTE(review): test_table() comes from test-tables.h, which is not visible here; presumably it round-trips each value through the matching to_string/from_string pair, confirm there. */ +int main(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + test_table(architecture, ARCHITECTURE); + test_table(assert_type, CONDITION_TYPE); /* shares the CONDITION_TYPE prefix with condition_type below */ + test_table(automount_result, AUTOMOUNT_RESULT); + test_table(automount_state, AUTOMOUNT_STATE); + test_table(cgroup_controller, CGROUP_CONTROLLER); + test_table(cgroup_device_policy, CGROUP_DEVICE_POLICY); + test_table(cgroup_io_limit_type, CGROUP_IO_LIMIT_TYPE); + test_table(collect_mode, COLLECT_MODE); + test_table(condition_result, CONDITION_RESULT); + test_table(condition_type, CONDITION_TYPE); + test_table(confidential_virtualization, CONFIDENTIAL_VIRTUALIZATION); + test_table(device_action, SD_DEVICE_ACTION); + test_table(device_state, DEVICE_STATE); + test_table(dns_over_tls_mode, DNS_OVER_TLS_MODE); + test_table(dnssec_mode, DNSSEC_MODE); + test_table(emergency_action, EMERGENCY_ACTION); + test_table(exec_directory_type, EXEC_DIRECTORY_TYPE); + test_table(exec_input, EXEC_INPUT); + test_table(exec_keyring_mode, EXEC_KEYRING_MODE); + test_table(exec_output, EXEC_OUTPUT); + test_table(exec_preserve_mode, EXEC_PRESERVE_MODE); + test_table(exec_utmp_mode, EXEC_UTMP_MODE); + test_table(image_type, IMAGE_TYPE); + test_table(import_verify, IMPORT_VERIFY); + test_table(job_mode, JOB_MODE); + test_table(job_result, JOB_RESULT); + test_table(job_state, JOB_STATE); + test_table(job_type, JOB_TYPE); + test_table(kill_mode, KILL_MODE); + test_table(kill_who, KILL_WHO); + test_table(locale_variable, VARIABLE_LC); /* the prefix here is VARIABLE_LC, not LOCALE_VARIABLE */ + test_table(log_target, LOG_TARGET); + test_table(managed_oom_mode, MANAGED_OOM_MODE); + test_table(managed_oom_preference, MANAGED_OOM_PREFERENCE); + test_table(manager_state,
MANAGER_STATE); + test_table(manager_timestamp, MANAGER_TIMESTAMP); + test_table(mount_exec_command, MOUNT_EXEC_COMMAND); + test_table(mount_result, MOUNT_RESULT); + test_table(mount_state, MOUNT_STATE); + test_table(name_policy, NAMEPOLICY); + test_table(namespace_type, NAMESPACE_TYPE); + test_table(notify_access, NOTIFY_ACCESS); + test_table(notify_state, NOTIFY_STATE); + test_table(output_mode, OUTPUT_MODE); + test_table(partition_designator, PARTITION_DESIGNATOR); + test_table(path_result, PATH_RESULT); + test_table(path_state, PATH_STATE); + test_table(path_type, PATH_TYPE); + test_table(protect_home, PROTECT_HOME); + test_table(protect_system, PROTECT_SYSTEM); + test_table(resolve_support, RESOLVE_SUPPORT); + test_table(rlimit, RLIMIT); + test_table(scope_result, SCOPE_RESULT); + test_table(scope_state, SCOPE_STATE); + test_table(service_exec_command, SERVICE_EXEC_COMMAND); + test_table(service_restart, SERVICE_RESTART); + test_table(service_restart_mode, SERVICE_RESTART_MODE); + test_table(service_result, SERVICE_RESULT); + test_table(service_state, SERVICE_STATE); + test_table(service_type, SERVICE_TYPE); + test_table(show_status, SHOW_STATUS); + test_table(slice_state, SLICE_STATE); + test_table(socket_address_bind_ipv6_only, SOCKET_ADDRESS_BIND_IPV6_ONLY); + test_table(socket_exec_command, SOCKET_EXEC_COMMAND); + test_table(socket_result, SOCKET_RESULT); + test_table(socket_state, SOCKET_STATE); + test_table(swap_exec_command, SWAP_EXEC_COMMAND); + test_table(swap_result, SWAP_RESULT); + test_table(swap_state, SWAP_STATE); + test_table(target_state, TARGET_STATE); + test_table(timer_base, TIMER_BASE); + test_table(timer_result, TIMER_RESULT); + test_table(timer_state, TIMER_STATE); + test_table(unit_active_state, UNIT_ACTIVE_STATE); + test_table(unit_dependency, UNIT_DEPENDENCY); + test_table(install_change_type, INSTALL_CHANGE_TYPE); + test_table(unit_file_preset_mode, UNIT_FILE_PRESET_MODE); + test_table(unit_file_state, UNIT_FILE_STATE); +
test_table(unit_load_state, UNIT_LOAD_STATE); + test_table(unit_type, UNIT_TYPE); + test_table(virtualization, VIRTUALIZATION); + test_table(compression, COMPRESSION); + + assert_cc(sizeof(sd_device_action_t) == sizeof(int64_t)); /* pins sd_device_action_t to the size of int64_t; NOTE(review): assert_cc is presumably a compile-time assertion, confirm in macro.h */ + + return EXIT_SUCCESS; +} diff --git a/src/test/test-terminal-util.c b/src/test/test-terminal-util.c new file mode 100644 index 0000000..a2b7101 --- /dev/null +++ b/src/test/test-terminal-util.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "path-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "tests.h" +#include "tmpfile-util.h" + /* Filler paragraph; TEST(text) further down prints it once per color entry. */ +#define LOREM_IPSUM "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor " \ + "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " \ + "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " \ + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat " \ + "non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
+ +TEST(default_term_for_tty) { + puts(default_term_for_tty("/dev/tty23")); + puts(default_term_for_tty("/dev/ttyS23")); + puts(default_term_for_tty("/dev/tty0")); + puts(default_term_for_tty("/dev/pty0")); + puts(default_term_for_tty("/dev/pts/0")); + puts(default_term_for_tty("/dev/console")); + puts(default_term_for_tty("tty23")); + puts(default_term_for_tty("ttyS23")); + puts(default_term_for_tty("tty0")); + puts(default_term_for_tty("pty0")); + puts(default_term_for_tty("pts/0")); + puts(default_term_for_tty("console")); +} + +TEST(read_one_char) { + _cleanup_fclose_ FILE *file = NULL; + char r; + bool need_nl; + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-read_one_char.XXXXXX"; + + assert_se(fmkostemp_safe(name, "r+", &file) == 0); + + assert_se(fputs("c\n", file) >= 0); + rewind(file); + assert_se(read_one_char(file, &r, 1000000, &need_nl) >= 0); + assert_se(!need_nl); + assert_se(r == 'c'); + assert_se(read_one_char(file, &r, 1000000, &need_nl) < 0); + + rewind(file); + assert_se(fputs("foobar\n", file) >= 0); + rewind(file); + assert_se(read_one_char(file, &r, 1000000, &need_nl) < 0); + + rewind(file); + assert_se(fputs("\n", file) >= 0); + rewind(file); + assert_se(read_one_char(file, &r, 1000000, &need_nl) < 0); +} + +TEST(getttyname_malloc) { + _cleanup_free_ char *ttyname = NULL; + _cleanup_close_ int master = -EBADF; + + assert_se((master = posix_openpt(O_RDWR|O_NOCTTY)) >= 0); + assert_se(getttyname_malloc(master, &ttyname) >= 0); + log_info("ttyname = %s", ttyname); + + assert_se(PATH_IN_SET(ttyname, "ptmx", "pts/ptmx")); +} + +typedef struct { + const char *name; + const char* (*func)(void); +} Color; + +static const Color colors[] = { + { "normal", ansi_normal }, + { "highlight", ansi_highlight }, + { "black", ansi_black }, + { "red", ansi_red }, + { "green", ansi_green }, + { "yellow", ansi_yellow }, + { "blue", ansi_blue }, + { "magenta", ansi_magenta }, + { "cyan", ansi_cyan }, + { "white", ansi_white }, + { "grey", ansi_grey }, + + { 
"bright-black", ansi_bright_black }, + { "bright-red", ansi_bright_red }, + { "bright-green", ansi_bright_green }, + { "bright-yellow", ansi_bright_yellow }, + { "bright-blue", ansi_bright_blue }, + { "bright-magenta", ansi_bright_magenta }, + { "bright-cyan", ansi_bright_cyan }, + { "bright-white", ansi_bright_white }, + + { "highlight-black", ansi_highlight_black }, + { "highlight-red", ansi_highlight_red }, + { "highlight-green", ansi_highlight_green }, + { "highlight-yellow (original)", _ansi_highlight_yellow }, + { "highlight-yellow (replacement)", ansi_highlight_yellow }, + { "highlight-blue", ansi_highlight_blue }, + { "highlight-magenta", ansi_highlight_magenta }, + { "highlight-cyan", ansi_highlight_cyan }, + { "highlight-white", ansi_highlight_white }, + { "highlight-grey", ansi_highlight_grey }, + + { "underline", ansi_underline }, + { "highlight-underline", ansi_highlight_underline }, + { "highlight-red-underline", ansi_highlight_red_underline }, + { "highlight-green-underline", ansi_highlight_green_underline }, + { "highlight-yellow-underline", ansi_highlight_yellow_underline }, + { "highlight-blue-underline", ansi_highlight_blue_underline }, + { "highlight-magenta-underline", ansi_highlight_magenta_underline }, + { "highlight-grey-underline", ansi_highlight_grey_underline }, +}; + +TEST(colors) { + for (size_t i = 0; i < ELEMENTSOF(colors); i++) + printf("<%s%s%s>\n", colors[i].func(), colors[i].name, ansi_normal()); +} + +TEST(text) { + for (size_t i = 0; !streq(colors[i].name, "underline"); i++) { + bool blwh = strstr(colors[i].name, "black") + || strstr(colors[i].name, "white"); + + printf("\n" + "Testing color %s%s\n%s%s%s\n", + colors[i].name, + blwh ? 
"" : ", this text should be readable", + colors[i].func(), + LOREM_IPSUM, + ansi_normal()); + } +} + +TEST(get_ctty) { + _cleanup_free_ char *ctty = NULL; + struct stat st; + dev_t devnr; + int r; + + r = get_ctty(0, &devnr, &ctty); + if (r < 0) { + log_notice_errno(r, "Apparently called without a controlling TTY, cutting get_ctty() test short: %m"); + return; + } + + /* In almost all cases STDIN will match our controlling TTY. Let's verify that and then compare paths */ + assert_se(fstat(STDIN_FILENO, &st) >= 0); + if (S_ISCHR(st.st_mode) && st.st_rdev == devnr) { + _cleanup_free_ char *stdin_name = NULL; + + assert_se(getttyname_malloc(STDIN_FILENO, &stdin_name) >= 0); + assert_se(path_equal(stdin_name, ctty)); + } else + log_notice("Not invoked with stdin == ctty, cutting get_ctty() test short"); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-time-util.c b/src/test/test-time-util.c new file mode 100644 index 0000000..53bc779 --- /dev/null +++ b/src/test/test-time-util.c @@ -0,0 +1,1195 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "env-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "random-util.h" +#include "serialize.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "time-util.h" + +#define TRIAL 100u + +TEST(parse_sec) { + usec_t u; + + assert_se(parse_sec("5s", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + assert_se(parse_sec("5s500ms", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC + 500 * USEC_PER_MSEC); + assert_se(parse_sec(" 5s 500ms ", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC + 500 * USEC_PER_MSEC); + assert_se(parse_sec(" 5.5s ", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC + 500 * USEC_PER_MSEC); + assert_se(parse_sec(" 5.5s 0.5ms ", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC + 500 * USEC_PER_MSEC + 500); + assert_se(parse_sec(" .22s ", &u) >= 0); + assert_se(u == 220 * USEC_PER_MSEC); + assert_se(parse_sec(" .50y ", &u) >= 0); + assert_se(u == USEC_PER_YEAR / 2); + 
assert_se(parse_sec("2.5", &u) >= 0); + assert_se(u == 2500 * USEC_PER_MSEC); + assert_se(parse_sec(".7", &u) >= 0); + assert_se(u == 700 * USEC_PER_MSEC); + assert_se(parse_sec("23us", &u) >= 0); + assert_se(u == 23); + assert_se(parse_sec("23μs", &u) >= 0); /* greek small letter mu */ + assert_se(u == 23); + assert_se(parse_sec("23µs", &u) >= 0); /* micro symbol */ + assert_se(u == 23); + assert_se(parse_sec("infinity", &u) >= 0); + assert_se(u == USEC_INFINITY); + assert_se(parse_sec(" infinity ", &u) >= 0); + assert_se(u == USEC_INFINITY); + assert_se(parse_sec("+3.1s", &u) >= 0); + assert_se(u == 3100 * USEC_PER_MSEC); + assert_se(parse_sec("3.1s.2", &u) >= 0); + assert_se(u == 3300 * USEC_PER_MSEC); + assert_se(parse_sec("3.1 .2", &u) >= 0); + assert_se(u == 3300 * USEC_PER_MSEC); + assert_se(parse_sec("3.1 sec .2 sec", &u) >= 0); + assert_se(u == 3300 * USEC_PER_MSEC); + assert_se(parse_sec("3.1 sec 1.2 sec", &u) >= 0); + assert_se(u == 4300 * USEC_PER_MSEC); + + assert_se(parse_sec(" xyz ", &u) < 0); + assert_se(parse_sec("", &u) < 0); + assert_se(parse_sec(" . ", &u) < 0); + assert_se(parse_sec(" 5. ", &u) < 0); + assert_se(parse_sec(".s ", &u) < 0); + assert_se(parse_sec("-5s ", &u) < 0); + assert_se(parse_sec("-0.3s ", &u) < 0); + assert_se(parse_sec("-0.0s ", &u) < 0); + assert_se(parse_sec("-0.-0s ", &u) < 0); + assert_se(parse_sec("0.-0s ", &u) < 0); + assert_se(parse_sec("3.-0s ", &u) < 0); + assert_se(parse_sec(" infinity .7", &u) < 0); + assert_se(parse_sec(".3 infinity", &u) < 0); + assert_se(parse_sec("3.+1s", &u) < 0); + assert_se(parse_sec("3. 
1s", &u) < 0); + assert_se(parse_sec("3.s", &u) < 0); + assert_se(parse_sec("12.34.56", &u) < 0); + assert_se(parse_sec("12..34", &u) < 0); + assert_se(parse_sec("..1234", &u) < 0); + assert_se(parse_sec("1234..", &u) < 0); +} + +TEST(parse_sec_fix_0) { + usec_t u; + + assert_se(parse_sec_fix_0("5s", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + assert_se(parse_sec_fix_0("0s", &u) >= 0); + assert_se(u == USEC_INFINITY); + assert_se(parse_sec_fix_0("0", &u) >= 0); + assert_se(u == USEC_INFINITY); + assert_se(parse_sec_fix_0(" 0", &u) >= 0); + assert_se(u == USEC_INFINITY); +} + +TEST(parse_sec_def_infinity) { + usec_t u; + + assert_se(parse_sec_def_infinity("5s", &u) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + assert_se(parse_sec_def_infinity("", &u) >= 0); + assert_se(u == USEC_INFINITY); + assert_se(parse_sec_def_infinity(" ", &u) >= 0); + assert_se(u == USEC_INFINITY); + assert_se(parse_sec_def_infinity("0s", &u) >= 0); + assert_se(u == 0); + assert_se(parse_sec_def_infinity("0", &u) >= 0); + assert_se(u == 0); + assert_se(parse_sec_def_infinity(" 0", &u) >= 0); + assert_se(u == 0); + assert_se(parse_sec_def_infinity("-5s", &u) < 0); +} + +TEST(parse_time) { + usec_t u; + + assert_se(parse_time("5", &u, 1) >= 0); + assert_se(u == 5); + + assert_se(parse_time("5", &u, USEC_PER_MSEC) >= 0); + assert_se(u == 5 * USEC_PER_MSEC); + + assert_se(parse_time("5", &u, USEC_PER_SEC) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + + assert_se(parse_time("5s", &u, 1) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + + assert_se(parse_time("5s", &u, USEC_PER_SEC) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + + assert_se(parse_time("5s", &u, USEC_PER_MSEC) >= 0); + assert_se(u == 5 * USEC_PER_SEC); + + assert_se(parse_time("11111111111111y", &u, 1) == -ERANGE); + assert_se(parse_time("1.1111111111111y", &u, 1) >= 0); +} + +TEST(parse_nsec) { + nsec_t u; + + assert_se(parse_nsec("5s", &u) >= 0); + assert_se(u == 5 * NSEC_PER_SEC); + assert_se(parse_nsec("5s500ms", &u) >= 0); + 
assert_se(u == 5 * NSEC_PER_SEC + 500 * NSEC_PER_MSEC); + assert_se(parse_nsec(" 5s 500ms ", &u) >= 0); + assert_se(u == 5 * NSEC_PER_SEC + 500 * NSEC_PER_MSEC); + assert_se(parse_nsec(" 5.5s ", &u) >= 0); + assert_se(u == 5 * NSEC_PER_SEC + 500 * NSEC_PER_MSEC); + assert_se(parse_nsec(" 5.5s 0.5ms ", &u) >= 0); + assert_se(u == 5 * NSEC_PER_SEC + 500 * NSEC_PER_MSEC + 500 * NSEC_PER_USEC); + assert_se(parse_nsec(" .22s ", &u) >= 0); + assert_se(u == 220 * NSEC_PER_MSEC); + assert_se(parse_nsec(" .50y ", &u) >= 0); + assert_se(u == NSEC_PER_YEAR / 2); + assert_se(parse_nsec("2.5", &u) >= 0); + assert_se(u == 2); + assert_se(parse_nsec(".7", &u) >= 0); + assert_se(u == 0); + assert_se(parse_nsec("infinity", &u) >= 0); + assert_se(u == NSEC_INFINITY); + assert_se(parse_nsec(" infinity ", &u) >= 0); + assert_se(u == NSEC_INFINITY); + assert_se(parse_nsec("+3.1s", &u) >= 0); + assert_se(u == 3100 * NSEC_PER_MSEC); + assert_se(parse_nsec("3.1s.2", &u) >= 0); + assert_se(u == 3100 * NSEC_PER_MSEC); + assert_se(parse_nsec("3.1 .2s", &u) >= 0); + assert_se(u == 200 * NSEC_PER_MSEC + 3); + assert_se(parse_nsec("3.1 sec .2 sec", &u) >= 0); + assert_se(u == 3300 * NSEC_PER_MSEC); + assert_se(parse_nsec("3.1 sec 1.2 sec", &u) >= 0); + assert_se(u == 4300 * NSEC_PER_MSEC); + + assert_se(parse_nsec(" xyz ", &u) < 0); + assert_se(parse_nsec("", &u) < 0); + assert_se(parse_nsec(" . ", &u) < 0); + assert_se(parse_nsec(" 5. 
", &u) < 0); + assert_se(parse_nsec(".s ", &u) < 0); + assert_se(parse_nsec(" infinity .7", &u) < 0); + assert_se(parse_nsec(".3 infinity", &u) < 0); + assert_se(parse_nsec("-5s ", &u) < 0); + assert_se(parse_nsec("-0.3s ", &u) < 0); + assert_se(parse_nsec("-0.0s ", &u) < 0); + assert_se(parse_nsec("-0.-0s ", &u) < 0); + assert_se(parse_nsec("0.-0s ", &u) < 0); + assert_se(parse_nsec("3.-0s ", &u) < 0); + assert_se(parse_nsec(" infinity .7", &u) < 0); + assert_se(parse_nsec(".3 infinity", &u) < 0); + assert_se(parse_nsec("3.+1s", &u) < 0); + assert_se(parse_nsec("3. 1s", &u) < 0); + assert_se(parse_nsec("3.s", &u) < 0); + assert_se(parse_nsec("12.34.56", &u) < 0); + assert_se(parse_nsec("12..34", &u) < 0); + assert_se(parse_nsec("..1234", &u) < 0); + assert_se(parse_nsec("1234..", &u) < 0); + assert_se(parse_nsec("1111111111111y", &u) == -ERANGE); + assert_se(parse_nsec("1.111111111111y", &u) >= 0); +} + +static void test_format_timespan_one(usec_t x, usec_t accuracy) { + char l[FORMAT_TIMESPAN_MAX]; + const char *t; + usec_t y; + + log_debug(USEC_FMT" (at accuracy "USEC_FMT")", x, accuracy); + + assert_se(t = format_timespan(l, sizeof l, x, accuracy)); + log_debug(" = <%s>", t); + + assert_se(parse_sec(t, &y) >= 0); + log_debug(" = "USEC_FMT, y); + + if (accuracy <= 0) + accuracy = 1; + + assert_se(x / accuracy == y / accuracy); +} + +static void test_format_timespan_accuracy(usec_t accuracy) { + log_info("/* %s accuracy="USEC_FMT" */", __func__, accuracy); + + test_format_timespan_one(0, accuracy); + test_format_timespan_one(1, accuracy); + test_format_timespan_one(1*USEC_PER_SEC, accuracy); + test_format_timespan_one(999*USEC_PER_MSEC, accuracy); + test_format_timespan_one(1234567, accuracy); + test_format_timespan_one(12, accuracy); + test_format_timespan_one(123, accuracy); + test_format_timespan_one(1234, accuracy); + test_format_timespan_one(12345, accuracy); + test_format_timespan_one(123456, accuracy); + test_format_timespan_one(1234567, accuracy); + 
test_format_timespan_one(12345678, accuracy); + test_format_timespan_one(1200000, accuracy); + test_format_timespan_one(1230000, accuracy); + test_format_timespan_one(1234000, accuracy); + test_format_timespan_one(1234500, accuracy); + test_format_timespan_one(1234560, accuracy); + test_format_timespan_one(1234567, accuracy); + test_format_timespan_one(986087, accuracy); + test_format_timespan_one(500 * USEC_PER_MSEC, accuracy); + test_format_timespan_one(9*USEC_PER_YEAR/5 - 23, accuracy); + test_format_timespan_one(USEC_INFINITY, accuracy); +} + +TEST(format_timespan) { + test_format_timespan_accuracy(1); + test_format_timespan_accuracy(USEC_PER_MSEC); + test_format_timespan_accuracy(USEC_PER_SEC); + + /* See issue #23928. */ + _cleanup_free_ char *buf = NULL; + assert_se(buf = new(char, 5)); + assert_se(buf == format_timespan(buf, 5, 100005, 1000)); +} + +TEST(verify_timezone) { + assert_se(verify_timezone("Europe/Berlin", LOG_DEBUG) == 0); + assert_se(verify_timezone("Australia/Sydney", LOG_DEBUG) == 0); + assert_se(verify_timezone("Europe/Do not exist", LOG_DEBUG) == -EINVAL); + assert_se(verify_timezone("Europe/DoNotExist", LOG_DEBUG) == -ENOENT); + assert_se(verify_timezone("/DoNotExist", LOG_DEBUG) == -EINVAL); + assert_se(verify_timezone("DoNotExist/", LOG_DEBUG) == -EINVAL); +} + +TEST(timezone_is_valid) { + assert_se(timezone_is_valid("Europe/Berlin", LOG_ERR)); + assert_se(timezone_is_valid("Australia/Sydney", LOG_ERR)); + assert_se(!timezone_is_valid("Europe/Do not exist", LOG_ERR)); +} + +TEST(get_timezones) { + _cleanup_strv_free_ char **zones = NULL; + int r; + + r = get_timezones(&zones); + assert_se(r == 0); + + STRV_FOREACH(zone, zones) { + r = verify_timezone(*zone, LOG_ERR); + log_debug_errno(r, "verify_timezone(\"%s\"): %m", *zone); + assert_se(r >= 0 || r == -ENOENT); + } +} + +TEST(usec_add) { + assert_se(usec_add(0, 0) == 0); + assert_se(usec_add(1, 4) == 5); + assert_se(usec_add(USEC_INFINITY, 5) == USEC_INFINITY); + assert_se(usec_add(5, 
USEC_INFINITY) == USEC_INFINITY); + assert_se(usec_add(USEC_INFINITY-5, 2) == USEC_INFINITY-3); + assert_se(usec_add(USEC_INFINITY-2, 2) == USEC_INFINITY); + assert_se(usec_add(USEC_INFINITY-1, 2) == USEC_INFINITY); + assert_se(usec_add(USEC_INFINITY, 2) == USEC_INFINITY); +} + +TEST(usec_sub_unsigned) { + assert_se(usec_sub_unsigned(0, 0) == 0); + assert_se(usec_sub_unsigned(0, 2) == 0); + assert_se(usec_sub_unsigned(0, USEC_INFINITY) == 0); + assert_se(usec_sub_unsigned(1, 0) == 1); + assert_se(usec_sub_unsigned(1, 1) == 0); + assert_se(usec_sub_unsigned(1, 2) == 0); + assert_se(usec_sub_unsigned(1, 3) == 0); + assert_se(usec_sub_unsigned(1, USEC_INFINITY) == 0); + assert_se(usec_sub_unsigned(USEC_INFINITY-1, 0) == USEC_INFINITY-1); + assert_se(usec_sub_unsigned(USEC_INFINITY-1, 1) == USEC_INFINITY-2); + assert_se(usec_sub_unsigned(USEC_INFINITY-1, 2) == USEC_INFINITY-3); + assert_se(usec_sub_unsigned(USEC_INFINITY-1, USEC_INFINITY-2) == 1); + assert_se(usec_sub_unsigned(USEC_INFINITY-1, USEC_INFINITY-1) == 0); + assert_se(usec_sub_unsigned(USEC_INFINITY-1, USEC_INFINITY) == 0); + assert_se(usec_sub_unsigned(USEC_INFINITY, 0) == USEC_INFINITY); + assert_se(usec_sub_unsigned(USEC_INFINITY, 1) == USEC_INFINITY); + assert_se(usec_sub_unsigned(USEC_INFINITY, 2) == USEC_INFINITY); + assert_se(usec_sub_unsigned(USEC_INFINITY, USEC_INFINITY) == USEC_INFINITY); +} + +TEST(usec_sub_signed) { + assert_se(usec_sub_signed(0, 0) == 0); + assert_se(usec_sub_signed(4, 1) == 3); + assert_se(usec_sub_signed(4, 4) == 0); + assert_se(usec_sub_signed(4, 5) == 0); + + assert_se(usec_sub_signed(USEC_INFINITY-3, -3) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY-3, -4) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY-3, -5) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY, 5) == USEC_INFINITY); + + assert_se(usec_sub_signed(0, INT64_MAX) == 0); + assert_se(usec_sub_signed(0, -INT64_MAX) == INT64_MAX); + assert_se(usec_sub_signed(0, INT64_MIN) == 
(usec_t) INT64_MAX + 1); + assert_se(usec_sub_signed(0, -(INT64_MIN+1)) == 0); + + assert_se(usec_sub_signed(USEC_INFINITY, INT64_MAX) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY, -INT64_MAX) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY, INT64_MIN) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY, -(INT64_MIN+1)) == USEC_INFINITY); + + assert_se(usec_sub_signed(USEC_INFINITY-1, INT64_MAX) == USEC_INFINITY-1-INT64_MAX); + assert_se(usec_sub_signed(USEC_INFINITY-1, -INT64_MAX) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY-1, INT64_MIN) == USEC_INFINITY); + assert_se(usec_sub_signed(USEC_INFINITY-1, -(INT64_MIN+1)) == USEC_INFINITY-1-((usec_t) (-(INT64_MIN+1)))); +} + +TEST(format_timestamp) { + for (unsigned i = 0; i < TRIAL; i++) { + char buf[CONST_MAX(FORMAT_TIMESTAMP_MAX, FORMAT_TIMESPAN_MAX)]; + usec_t x, y; + + x = random_u64_range(USEC_TIMESTAMP_FORMATTABLE_MAX - USEC_PER_SEC) + USEC_PER_SEC; + + assert_se(format_timestamp(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(parse_timestamp(buf, &y) >= 0); + assert_se(x / USEC_PER_SEC == y / USEC_PER_SEC); + + assert_se(format_timestamp_style(buf, sizeof(buf), x, TIMESTAMP_UNIX)); + log_debug("%s", buf); + assert_se(parse_timestamp(buf, &y) >= 0); + assert_se(x / USEC_PER_SEC == y / USEC_PER_SEC); + + assert_se(format_timestamp_style(buf, sizeof(buf), x, TIMESTAMP_UTC)); + log_debug("%s", buf); + assert_se(parse_timestamp(buf, &y) >= 0); + assert_se(x / USEC_PER_SEC == y / USEC_PER_SEC); + + assert_se(format_timestamp_style(buf, sizeof(buf), x, TIMESTAMP_US)); + log_debug("%s", buf); + assert_se(parse_timestamp(buf, &y) >= 0); + assert_se(x == y); + + assert_se(format_timestamp_style(buf, sizeof(buf), x, TIMESTAMP_US_UTC)); + log_debug("%s", buf); + assert_se(parse_timestamp(buf, &y) >= 0); + assert_se(x == y); + + if (x > 2 * USEC_PER_DAY) { + assert_se(format_timestamp_style(buf, sizeof(buf), x, TIMESTAMP_DATE)); + log_debug("%s", buf); + 
assert_se(parse_timestamp(buf, &y) >= 0); + assert_se(y > usec_sub_unsigned(x, 2 * USEC_PER_DAY) && y < usec_add(x, 2 * USEC_PER_DAY)); + } + + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(parse_timestamp(buf, &y) >= 0); + + /* The two calls above will run with a slightly different local time. Make sure we are in the same + * range however, but give enough leeway that this is unlikely to explode. And of course, + * format_timestamp_relative() scales the accuracy with the distance from the current time up to one + * month, cover for that too. */ + assert_se(y > x ? y - x : x - y <= USEC_PER_MONTH + USEC_PER_DAY); + } +} + +static void test_format_timestamp_impl(usec_t x) { + bool success, override; + const char *xx, *yy; + usec_t y; + + xx = FORMAT_TIMESTAMP(x); + assert_se(xx); + assert_se(parse_timestamp(xx, &y) >= 0); + yy = FORMAT_TIMESTAMP(y); + assert_se(yy); + + success = (x / USEC_PER_SEC == y / USEC_PER_SEC) && streq(xx, yy); + /* Workaround for https://github.com/systemd/systemd/issues/28472 */ + override = !success && + (STRPTR_IN_SET(tzname[0], "CAT", "EAT") || + STRPTR_IN_SET(tzname[1], "CAT", "EAT")) && + DIV_ROUND_UP(y - x, USEC_PER_SEC) == 3600; /* 1 hour, ignore fractional second */ + log_full(success ? LOG_DEBUG : override ? LOG_WARNING : LOG_ERR, + "@" USEC_FMT " → %s → @" USEC_FMT " → %s%s", + x, xx, y, yy, + override ? ", ignoring." 
: ""); + if (!override) { + assert_se(x / USEC_PER_SEC == y / USEC_PER_SEC); + assert_se(streq(xx, yy)); + } +} + +static void test_format_timestamp_loop(void) { + test_format_timestamp_impl(USEC_PER_SEC); + test_format_timestamp_impl(USEC_TIMESTAMP_FORMATTABLE_MAX_32BIT-1); + test_format_timestamp_impl(USEC_TIMESTAMP_FORMATTABLE_MAX_32BIT); + test_format_timestamp_impl(USEC_TIMESTAMP_FORMATTABLE_MAX-1); + test_format_timestamp_impl(USEC_TIMESTAMP_FORMATTABLE_MAX); + + /* Two cases which trigger https://github.com/systemd/systemd/issues/28472 */ + test_format_timestamp_impl(1504938962980066); + test_format_timestamp_impl(1509482094632752); + + for (unsigned i = 0; i < TRIAL; i++) { + usec_t x; + + x = random_u64_range(USEC_TIMESTAMP_FORMATTABLE_MAX - USEC_PER_SEC) + USEC_PER_SEC; + test_format_timestamp_impl(x); + } +} + +TEST(FORMAT_TIMESTAMP) { + test_format_timestamp_loop(); +} + +static void test_format_timestamp_with_tz_one(const char *tz) { + const char *saved_tz, *colon_tz; + + if (!timezone_is_valid(tz, LOG_DEBUG)) + return; + + log_info("/* %s(%s) */", __func__, tz); + + saved_tz = getenv("TZ"); + + assert_se(colon_tz = strjoina(":", tz)); + assert_se(setenv("TZ", colon_tz, 1) >= 0); + tzset(); + log_debug("%s: tzname[0]=%s, tzname[1]=%s", tz, strempty(tzname[0]), strempty(tzname[1])); + + test_format_timestamp_loop(); + + assert_se(set_unset_env("TZ", saved_tz, true) == 0); + tzset(); +} + +TEST(FORMAT_TIMESTAMP_with_tz) { + _cleanup_strv_free_ char **timezones = NULL; + + test_format_timestamp_with_tz_one("UTC"); + + if (!slow_tests_enabled()) + return (void) log_tests_skipped("slow tests are disabled"); + + assert_se(get_timezones(&timezones) >= 0); + STRV_FOREACH(tz, timezones) + test_format_timestamp_with_tz_one(*tz); +} + +TEST(format_timestamp_relative_full) { + char buf[CONST_MAX(FORMAT_TIMESTAMP_MAX, FORMAT_TIMESPAN_MAX)]; + usec_t x; + + /* Years and months */ + x = now(CLOCK_REALTIME) - (1*USEC_PER_YEAR + 1*USEC_PER_MONTH); + 
assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "1 year 1 month ago")); + + x = now(CLOCK_MONOTONIC) + (1*USEC_PER_YEAR + 1.5*USEC_PER_MONTH); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_MONOTONIC, false)); + log_debug("%s", buf); + assert_se(streq(buf, "1 year 1 month left")); + + x = now(CLOCK_REALTIME) - (1*USEC_PER_YEAR + 2*USEC_PER_MONTH); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "1 year 2 months ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_YEAR + 1*USEC_PER_MONTH); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "2 years 1 month ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_YEAR + 2*USEC_PER_MONTH); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "2 years 2 months ago")); + + /* Months and days */ + x = now(CLOCK_REALTIME) - (1*USEC_PER_MONTH + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "1 month 1 day ago")); + + x = now(CLOCK_REALTIME) - (1*USEC_PER_MONTH + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "1 month 2 days ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_MONTH + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "2 months 1 day ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_MONTH + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "2 months 2 days 
ago")); + + /* Weeks and days */ + x = now(CLOCK_REALTIME) - (1*USEC_PER_WEEK + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "1 week 1 day ago")); + + x = now(CLOCK_REALTIME) - (1*USEC_PER_WEEK + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "1 week 2 days ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_WEEK + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "2 weeks 1 day ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_WEEK + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative_full(buf, sizeof(buf), x, CLOCK_REALTIME, true)); + log_debug("%s", buf); + assert_se(streq(buf, "2 weeks 2 days ago")); +} + +TEST(format_timestamp_relative) { + char buf[CONST_MAX(FORMAT_TIMESTAMP_MAX, FORMAT_TIMESPAN_MAX)]; + usec_t x; + + /* Only testing timestamps in the past so we don't need to add some delta to account for time passing + * by while we are running the tests (unless we're running on potatoes and 24 hours somehow passes + * between our call to now() and format_timestamp_relative's call to now()). 
*/ + + /* Years and months */ + x = now(CLOCK_REALTIME) - (1*USEC_PER_YEAR + 1*USEC_PER_MONTH); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "1 year 1 month ago")); + + x = now(CLOCK_REALTIME) - (1*USEC_PER_YEAR + 2*USEC_PER_MONTH); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "1 year 2 months ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_YEAR + 1*USEC_PER_MONTH); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "2 years 1 month ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_YEAR + 2*USEC_PER_MONTH); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "2 years 2 months ago")); + + /* Months and days */ + x = now(CLOCK_REALTIME) - (1*USEC_PER_MONTH + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "1 month 1 day ago")); + + x = now(CLOCK_REALTIME) - (1*USEC_PER_MONTH + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "1 month 2 days ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_MONTH + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "2 months 1 day ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_MONTH + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "2 months 2 days ago")); + + /* Weeks and days */ + x = now(CLOCK_REALTIME) - (1*USEC_PER_WEEK + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "1 week 1 day ago")); + + x = now(CLOCK_REALTIME) - (1*USEC_PER_WEEK + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + 
log_debug("%s", buf); + assert_se(streq(buf, "1 week 2 days ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_WEEK + 1*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "2 weeks 1 day ago")); + + x = now(CLOCK_REALTIME) - (2*USEC_PER_WEEK + 2*USEC_PER_DAY); + assert_se(format_timestamp_relative(buf, sizeof(buf), x)); + log_debug("%s", buf); + assert_se(streq(buf, "2 weeks 2 days ago")); +} + +static void test_format_timestamp_one(usec_t val, TimestampStyle style, const char *result) { + char buf[FORMAT_TIMESTAMP_MAX]; + const char *t; + + t = format_timestamp_style(buf, sizeof(buf), val, style); + assert_se(streq_ptr(t, result)); +} + +TEST(format_timestamp_range) { + test_format_timestamp_one(0, TIMESTAMP_UTC, NULL); + test_format_timestamp_one(0, TIMESTAMP_DATE, NULL); + test_format_timestamp_one(0, TIMESTAMP_US_UTC, NULL); + + test_format_timestamp_one(1, TIMESTAMP_UTC, "Thu 1970-01-01 00:00:00 UTC"); + test_format_timestamp_one(1, TIMESTAMP_DATE, "Thu 1970-01-01"); + test_format_timestamp_one(1, TIMESTAMP_US_UTC, "Thu 1970-01-01 00:00:00.000001 UTC"); + + test_format_timestamp_one(USEC_PER_SEC, TIMESTAMP_UTC, "Thu 1970-01-01 00:00:01 UTC"); + test_format_timestamp_one(USEC_PER_SEC, TIMESTAMP_DATE, "Thu 1970-01-01"); + test_format_timestamp_one(USEC_PER_SEC, TIMESTAMP_US_UTC, "Thu 1970-01-01 00:00:01.000000 UTC"); + +#if SIZEOF_TIME_T == 8 + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX, TIMESTAMP_UTC, "Thu 9999-12-30 23:59:59 UTC"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX, TIMESTAMP_DATE, "Thu 9999-12-30"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX + 1, TIMESTAMP_UTC, "--- XXXX-XX-XX XX:XX:XX UTC"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX + 1, TIMESTAMP_US_UTC, "--- XXXX-XX-XX XX:XX:XX.XXXXXX UTC"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX + 1, TIMESTAMP_DATE, "--- XXXX-XX-XX"); +#elif SIZEOF_TIME_T == 4 + 
test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX, TIMESTAMP_UTC, "Mon 2038-01-18 03:14:07 UTC"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX, TIMESTAMP_DATE, "Mon 2038-01-18"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX + 1, TIMESTAMP_UTC, "--- XXXX-XX-XX XX:XX:XX UTC"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX + 1, TIMESTAMP_US_UTC, "--- XXXX-XX-XX XX:XX:XX.XXXXXX UTC"); + test_format_timestamp_one(USEC_TIMESTAMP_FORMATTABLE_MAX + 1, TIMESTAMP_DATE, "--- XXXX-XX-XX"); +#endif + + test_format_timestamp_one(USEC_INFINITY, TIMESTAMP_UTC, NULL); +} + +static void test_parse_timestamp_one(const char *str, usec_t max_diff, usec_t expected) { + usec_t usec = USEC_INFINITY; + int r; + + r = parse_timestamp(str, &usec); + log_debug("/* %s(%s): max_diff="USEC_FMT", expected="USEC_FMT", result="USEC_FMT" */", __func__, str, max_diff, expected, usec); + assert_se(r >= 0); + assert_se(usec >= expected); + assert_se(usec_sub_unsigned(usec, expected) <= max_diff); +} + +static bool time_is_zero(usec_t usec) { + const char *s; + + s = FORMAT_TIMESTAMP(usec); + return strstr(s, " 00:00:00 "); +} + +static bool timezone_equal(usec_t today, usec_t target) { + const char *s, *t, *sz, *tz; + + s = FORMAT_TIMESTAMP(today); + t = FORMAT_TIMESTAMP(target); + assert_se(sz = strrchr(s, ' ')); + assert_se(tz = strrchr(t, ' ')); + log_debug("%s("USEC_FMT", "USEC_FMT") -> %s, %s", __func__, today, target, s, t); + return streq(sz, tz); +} + +static void test_parse_timestamp_impl(const char *tz) { + usec_t today, today2, now_usec; + + /* Invalid: Ensure that systemctl reboot --when=show and --when=cancel + * will not result in ambiguities */ + assert_se(parse_timestamp("show", NULL) == -EINVAL); + assert_se(parse_timestamp("cancel", NULL) == -EINVAL); + + /* UTC */ + test_parse_timestamp_one("Thu 1970-01-01 00:01 UTC", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 1970-01-01 00:00:01 UTC", 0, USEC_PER_SEC); + 
test_parse_timestamp_one("Thu 1970-01-01 00:00:01.001 UTC", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Thu 1970-01-01 00:00:01.0010 UTC", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Thu 70-01-01 00:01 UTC", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 70-01-01 00:00:01 UTC", 0, USEC_PER_SEC); + test_parse_timestamp_one("Thu 70-01-01 00:00:01.001 UTC", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Thu 70-01-01 00:00:01.0010 UTC", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1970-01-01 00:01 UTC", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1970-01-01 00:00:01 UTC", 0, USEC_PER_SEC); + test_parse_timestamp_one("1970-01-01 00:00:01.001 UTC", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1970-01-01 00:00:01.0010 UTC", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("70-01-01 00:01 UTC", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("70-01-01 00:00:01 UTC", 0, USEC_PER_SEC); + test_parse_timestamp_one("70-01-01 00:00:01.001 UTC", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("70-01-01 00:00:01.0010 UTC", 0, USEC_PER_SEC + 1000); + + /* Examples from RFC3339 */ + test_parse_timestamp_one("1985-04-12T23:20:50.52Z", 0, 482196050 * USEC_PER_SEC + 520000); + test_parse_timestamp_one("1996-12-19T16:39:57-08:00", 0, 851042397 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("1996-12-20T00:39:57Z", 0, 851042397 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("1990-12-31T23:59:60Z", 0, 662688000 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("1990-12-31T15:59:60-08:00", 0, 662688000 * USEC_PER_SEC + 000000); + assert_se(parse_timestamp("1937-01-01T12:00:27.87+00:20", NULL) == -EINVAL); /* we don't support pre-epoch timestamps */ + /* We accept timestamps without seconds as well */ + test_parse_timestamp_one("1996-12-20T00:39Z", 0, (851042397 - 57) * USEC_PER_SEC + 000000); + test_parse_timestamp_one("1990-12-31T15:59-08:00", 0, (662688000-60) * USEC_PER_SEC + 000000); + /* We drop day-of-week 
before parsing the timestamp */ + test_parse_timestamp_one("Thu 1970-01-01T00:01 UTC", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 1970-01-01T00:00:01 UTC", 0, USEC_PER_SEC); + test_parse_timestamp_one("Thu 1970-01-01T00:01Z", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 1970-01-01T00:00:01Z", 0, USEC_PER_SEC); + /* RFC3339-style timezones can be welded to all formats */ + assert_se(parse_timestamp("today UTC", &today) == 0); + assert_se(parse_timestamp("todayZ", &today2) == 0); + assert_se(today == today2); + assert_se(parse_timestamp("today +0200", &today) == 0); + assert_se(parse_timestamp("today+02:00", &today2) == 0); + assert_se(today == today2); + + /* https://ijmacd.github.io/rfc3339-iso8601/ */ + test_parse_timestamp_one("2023-09-06 12:49:27-00:00", 0, 1694004567 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("2023-09-06 12:49:27.284-00:00", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06 12:49:27.284029Z", 0, 1694004567 * USEC_PER_SEC + 284029); + test_parse_timestamp_one("2023-09-06 12:49:27.284Z", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06 12:49:27.28Z", 0, 1694004567 * USEC_PER_SEC + 280000); + test_parse_timestamp_one("2023-09-06 12:49:27.2Z", 0, 1694004567 * USEC_PER_SEC + 200000); + test_parse_timestamp_one("2023-09-06 12:49:27Z", 0, 1694004567 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("2023-09-06 14:49:27+02:00", 0, 1694004567 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("2023-09-06 14:49:27.2+02:00", 0, 1694004567 * USEC_PER_SEC + 200000); + test_parse_timestamp_one("2023-09-06 14:49:27.28+02:00", 0, 1694004567 * USEC_PER_SEC + 280000); + test_parse_timestamp_one("2023-09-06 14:49:27.284+02:00", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06 14:49:27.284029+02:00", 0, 1694004567 * USEC_PER_SEC + 284029); + test_parse_timestamp_one("2023-09-06T12:49:27+00:00", 0, 1694004567 * USEC_PER_SEC + 000000); + 
test_parse_timestamp_one("2023-09-06T12:49:27-00:00", 0, 1694004567 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("2023-09-06T12:49:27.284+00:00", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06T12:49:27.284-00:00", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06T12:49:27.284029Z", 0, 1694004567 * USEC_PER_SEC + 284029); + test_parse_timestamp_one("2023-09-06T12:49:27.284Z", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06T12:49:27.28Z", 0, 1694004567 * USEC_PER_SEC + 280000); + test_parse_timestamp_one("2023-09-06T12:49:27.2Z", 0, 1694004567 * USEC_PER_SEC + 200000); + test_parse_timestamp_one("2023-09-06T12:49:27Z", 0, 1694004567 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("2023-09-06T14:49:27+02:00", 0, 1694004567 * USEC_PER_SEC + 000000); + test_parse_timestamp_one("2023-09-06T14:49:27.284+02:00", 0, 1694004567 * USEC_PER_SEC + 284000); + test_parse_timestamp_one("2023-09-06T14:49:27.284029+02:00", 0, 1694004567 * USEC_PER_SEC + 284029); + test_parse_timestamp_one("2023-09-06T21:34:27+08:45", 0, 1694004567 * USEC_PER_SEC + 000000); + + if (timezone_is_valid("Asia/Tokyo", LOG_DEBUG)) { + /* Asia/Tokyo (+0900) */ + test_parse_timestamp_one("Thu 1970-01-01 09:01 Asia/Tokyo", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 1970-01-01 09:00:01 Asia/Tokyo", 0, USEC_PER_SEC); + test_parse_timestamp_one("Thu 1970-01-01 09:00:01.001 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Thu 1970-01-01 09:00:01.0010 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Thu 70-01-01 09:01 Asia/Tokyo", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 70-01-01 09:00:01 Asia/Tokyo", 0, USEC_PER_SEC); + test_parse_timestamp_one("Thu 70-01-01 09:00:01.001 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Thu 70-01-01 09:00:01.0010 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1970-01-01 09:01 
Asia/Tokyo", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1970-01-01 09:00:01 Asia/Tokyo", 0, USEC_PER_SEC); + test_parse_timestamp_one("1970-01-01 09:00:01.001 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1970-01-01 09:00:01.0010 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("70-01-01 09:01 Asia/Tokyo", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("70-01-01 09:00:01 Asia/Tokyo", 0, USEC_PER_SEC); + test_parse_timestamp_one("70-01-01 09:00:01.001 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("70-01-01 09:00:01.0010 Asia/Tokyo", 0, USEC_PER_SEC + 1000); + } + + if (streq_ptr(tz, "Asia/Tokyo")) { + /* JST (+0900) */ + test_parse_timestamp_one("Thu 1970-01-01 09:01 JST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 1970-01-01 09:00:01 JST", 0, USEC_PER_SEC); + test_parse_timestamp_one("Thu 1970-01-01 09:00:01.001 JST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Thu 1970-01-01 09:00:01.0010 JST", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Thu 70-01-01 09:01 JST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Thu 70-01-01 09:00:01 JST", 0, USEC_PER_SEC); + test_parse_timestamp_one("Thu 70-01-01 09:00:01.001 JST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Thu 70-01-01 09:00:01.0010 JST", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1970-01-01 09:01 JST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1970-01-01 09:00:01 JST", 0, USEC_PER_SEC); + test_parse_timestamp_one("1970-01-01 09:00:01.001 JST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1970-01-01 09:00:01.0010 JST", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("70-01-01 09:01 JST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("70-01-01 09:00:01 JST", 0, USEC_PER_SEC); + test_parse_timestamp_one("70-01-01 09:00:01.001 JST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("70-01-01 09:00:01.0010 JST", 0, USEC_PER_SEC + 1000); + } + + if 
(timezone_is_valid("America/New_York", LOG_DEBUG)) { + /* America/New_York (-0500) */ + test_parse_timestamp_one("Wed 1969-12-31 19:01 America/New_York", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 1969-12-31 19:00:01 America/New_York", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 1969-12-31 19:00:01.001 America/New_York", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 1969-12-31 19:00:01.0010 America/New_York", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Wed 69-12-31 19:01 America/New_York", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 69-12-31 19:00:01 America/New_York", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 69-12-31 19:00:01.001 America/New_York", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 69-12-31 19:00:01.0010 America/New_York", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1969-12-31 19:01 America/New_York", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1969-12-31 19:00:01 America/New_York", 0, USEC_PER_SEC); + test_parse_timestamp_one("1969-12-31 19:00:01.001 America/New_York", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1969-12-31 19:00:01.0010 America/New_York", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("69-12-31 19:01 America/New_York", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("69-12-31 19:00:01 America/New_York", 0, USEC_PER_SEC); + test_parse_timestamp_one("69-12-31 19:00:01.001 America/New_York", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("69-12-31 19:00:01.0010 America/New_York", 0, USEC_PER_SEC + 1000); + } + + if (streq_ptr(tz, "America/New_York")) { + /* EST (-0500) */ + test_parse_timestamp_one("Wed 1969-12-31 19:01 EST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 1969-12-31 19:00:01 EST", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 1969-12-31 19:00:01.001 EST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 1969-12-31 19:00:01.0010 EST", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Wed 69-12-31 
19:01 EST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 69-12-31 19:00:01 EST", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 69-12-31 19:00:01.001 EST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 69-12-31 19:00:01.0010 EST", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1969-12-31 19:01 EST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1969-12-31 19:00:01 EST", 0, USEC_PER_SEC); + test_parse_timestamp_one("1969-12-31 19:00:01.001 EST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1969-12-31 19:00:01.0010 EST", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("69-12-31 19:01 EST", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("69-12-31 19:00:01 EST", 0, USEC_PER_SEC); + test_parse_timestamp_one("69-12-31 19:00:01.001 EST", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("69-12-31 19:00:01.0010 EST", 0, USEC_PER_SEC + 1000); + } + + /* -06 */ + test_parse_timestamp_one("Wed 1969-12-31 18:01 -06", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01 -06", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01.001 -06", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01.0010 -06", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Wed 69-12-31 18:01 -06", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 69-12-31 18:00:01 -06", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 69-12-31 18:00:01.001 -06", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 69-12-31 18:00:01.0010 -06", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1969-12-31 18:01 -06", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1969-12-31 18:00:01 -06", 0, USEC_PER_SEC); + test_parse_timestamp_one("1969-12-31 18:00:01.001 -06", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1969-12-31 18:00:01.0010 -06", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("69-12-31 18:01 -06", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("69-12-31 18:00:01 -06", 
0, USEC_PER_SEC); + test_parse_timestamp_one("69-12-31 18:00:01.001 -06", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("69-12-31 18:00:01.0010 -06", 0, USEC_PER_SEC + 1000); + + /* -0600 */ + test_parse_timestamp_one("Wed 1969-12-31 18:01 -0600", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01 -0600", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01.001 -0600", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01.0010 -0600", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Wed 69-12-31 18:01 -0600", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 69-12-31 18:00:01 -0600", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 69-12-31 18:00:01.001 -0600", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 69-12-31 18:00:01.0010 -0600", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1969-12-31 18:01 -0600", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1969-12-31 18:00:01 -0600", 0, USEC_PER_SEC); + test_parse_timestamp_one("1969-12-31 18:00:01.001 -0600", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1969-12-31 18:00:01.0010 -0600", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("69-12-31 18:01 -0600", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("69-12-31 18:00:01 -0600", 0, USEC_PER_SEC); + test_parse_timestamp_one("69-12-31 18:00:01.001 -0600", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("69-12-31 18:00:01.0010 -0600", 0, USEC_PER_SEC + 1000); + + /* -06:00 */ + test_parse_timestamp_one("Wed 1969-12-31 18:01 -06:00", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01 -06:00", 0, USEC_PER_SEC); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01.001 -06:00", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 1969-12-31 18:00:01.0010 -06:00", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("Wed 69-12-31 18:01 -06:00", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("Wed 69-12-31 18:00:01 -06:00", 0, 
USEC_PER_SEC); + test_parse_timestamp_one("Wed 69-12-31 18:00:01.001 -06:00", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("Wed 69-12-31 18:00:01.0010 -06:00", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("1969-12-31 18:01 -06:00", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("1969-12-31 18:00:01 -06:00", 0, USEC_PER_SEC); + test_parse_timestamp_one("1969-12-31 18:00:01.001 -06:00", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("1969-12-31 18:00:01.0010 -06:00", 0, USEC_PER_SEC + 1000); + + test_parse_timestamp_one("69-12-31 18:01 -06:00", 0, USEC_PER_MINUTE); + test_parse_timestamp_one("69-12-31 18:00:01 -06:00", 0, USEC_PER_SEC); + test_parse_timestamp_one("69-12-31 18:00:01.001 -06:00", 0, USEC_PER_SEC + 1000); + test_parse_timestamp_one("69-12-31 18:00:01.0010 -06:00", 0, USEC_PER_SEC + 1000); + + /* without date */ + assert_se(parse_timestamp("today", &today) == 0); + if (time_is_zero(today)) { + test_parse_timestamp_one("00:01", 0, today + USEC_PER_MINUTE); + test_parse_timestamp_one("00:00:01", 0, today + USEC_PER_SEC); + test_parse_timestamp_one("00:00:01.001", 0, today + USEC_PER_SEC + 1000); + test_parse_timestamp_one("00:00:01.0010", 0, today + USEC_PER_SEC + 1000); + + if (timezone_equal(today, today + USEC_PER_DAY) && time_is_zero(today + USEC_PER_DAY)) + test_parse_timestamp_one("tomorrow", 0, today + USEC_PER_DAY); + if (timezone_equal(today, today - USEC_PER_DAY) && time_is_zero(today - USEC_PER_DAY)) + test_parse_timestamp_one("yesterday", 0, today - USEC_PER_DAY); + } + + /* relative */ + assert_se(parse_timestamp("now", &now_usec) == 0); + test_parse_timestamp_one("+5hours", USEC_PER_MINUTE, now_usec + 5 * USEC_PER_HOUR); + if (now_usec >= 10 * USEC_PER_DAY) + test_parse_timestamp_one("-10days", USEC_PER_MINUTE, now_usec - 10 * USEC_PER_DAY); + test_parse_timestamp_one("2weeks left", USEC_PER_MINUTE, now_usec + 2 * USEC_PER_WEEK); + if (now_usec >= 30 * USEC_PER_MINUTE) + test_parse_timestamp_one("30minutes ago", 
USEC_PER_MINUTE, now_usec - 30 * USEC_PER_MINUTE); +} + +TEST(parse_timestamp) { + test_parse_timestamp_impl(NULL); +} + +static void test_parse_timestamp_with_tz_one(const char *tz) { + const char *saved_tz, *colon_tz; + + if (!timezone_is_valid(tz, LOG_DEBUG)) + return; + + log_info("/* %s(%s) */", __func__, tz); + + saved_tz = getenv("TZ"); + + assert_se(colon_tz = strjoina(":", tz)); + assert_se(setenv("TZ", colon_tz, 1) >= 0); + tzset(); + log_debug("%s: tzname[0]=%s, tzname[1]=%s", tz, strempty(tzname[0]), strempty(tzname[1])); + + test_parse_timestamp_impl(tz); + + assert_se(set_unset_env("TZ", saved_tz, true) == 0); + tzset(); +} + +TEST(parse_timestamp_with_tz) { + _cleanup_strv_free_ char **timezones = NULL; + + test_parse_timestamp_with_tz_one("UTC"); + + if (!slow_tests_enabled()) + return (void) log_tests_skipped("slow tests are disabled"); + + assert_se(get_timezones(&timezones) >= 0); + STRV_FOREACH(tz, timezones) + test_parse_timestamp_with_tz_one(*tz); +} + +TEST(deserialize_dual_timestamp) { + int r; + dual_timestamp t; + + r = deserialize_dual_timestamp("1234 5678", &t); + assert_se(r == 0); + assert_se(t.realtime == 1234); + assert_se(t.monotonic == 5678); + + r = deserialize_dual_timestamp("1234x 5678", &t); + assert_se(r == -EINVAL); + + r = deserialize_dual_timestamp("1234 5678y", &t); + assert_se(r == -EINVAL); + + r = deserialize_dual_timestamp("-1234 5678", &t); + assert_se(r == -EINVAL); + + r = deserialize_dual_timestamp("1234 -5678", &t); + assert_se(r == -EINVAL); + + /* Check that output wasn't modified. */ + assert_se(t.realtime == 1234); + assert_se(t.monotonic == 5678); + + r = deserialize_dual_timestamp("+123 567", &t); + assert_se(r == 0); + assert_se(t.realtime == 123); + assert_se(t.monotonic == 567); + + /* Check that we get "infinity" on overflow. 
*/ + r = deserialize_dual_timestamp("18446744073709551617 0", &t); + assert_se(r == 0); + assert_se(t.realtime == USEC_INFINITY); + assert_se(t.monotonic == 0); +} + +static void assert_similar(usec_t a, usec_t b) { + usec_t d; + + if (a > b) + d = a - b; + else + d = b - a; + + assert_se(d < 10*USEC_PER_SEC); +} + +TEST(usec_shift_clock) { + usec_t rt, mn, bt; + + rt = now(CLOCK_REALTIME); + mn = now(CLOCK_MONOTONIC); + bt = now(CLOCK_BOOTTIME); + + assert_se(usec_shift_clock(USEC_INFINITY, CLOCK_REALTIME, CLOCK_MONOTONIC) == USEC_INFINITY); + + assert_similar(usec_shift_clock(rt + USEC_PER_HOUR, CLOCK_REALTIME, CLOCK_MONOTONIC), mn + USEC_PER_HOUR); + assert_similar(usec_shift_clock(rt + 2*USEC_PER_HOUR, CLOCK_REALTIME, CLOCK_BOOTTIME), bt + 2*USEC_PER_HOUR); + assert_se(usec_shift_clock(rt + 3*USEC_PER_HOUR, CLOCK_REALTIME, CLOCK_REALTIME_ALARM) == rt + 3*USEC_PER_HOUR); + + assert_similar(usec_shift_clock(mn + 4*USEC_PER_HOUR, CLOCK_MONOTONIC, CLOCK_REALTIME_ALARM), rt + 4*USEC_PER_HOUR); + assert_similar(usec_shift_clock(mn + 5*USEC_PER_HOUR, CLOCK_MONOTONIC, CLOCK_BOOTTIME), bt + 5*USEC_PER_HOUR); + assert_se(usec_shift_clock(mn + 6*USEC_PER_HOUR, CLOCK_MONOTONIC, CLOCK_MONOTONIC) == mn + 6*USEC_PER_HOUR); + + assert_similar(usec_shift_clock(bt + 7*USEC_PER_HOUR, CLOCK_BOOTTIME, CLOCK_MONOTONIC), mn + 7*USEC_PER_HOUR); + assert_similar(usec_shift_clock(bt + 8*USEC_PER_HOUR, CLOCK_BOOTTIME, CLOCK_REALTIME_ALARM), rt + 8*USEC_PER_HOUR); + assert_se(usec_shift_clock(bt + 9*USEC_PER_HOUR, CLOCK_BOOTTIME, CLOCK_BOOTTIME) == bt + 9*USEC_PER_HOUR); + + if (mn > USEC_PER_MINUTE) { + assert_similar(usec_shift_clock(rt - 30 * USEC_PER_SEC, CLOCK_REALTIME_ALARM, CLOCK_MONOTONIC), mn - 30 * USEC_PER_SEC); + assert_similar(usec_shift_clock(rt - 50 * USEC_PER_SEC, CLOCK_REALTIME, CLOCK_BOOTTIME), bt - 50 * USEC_PER_SEC); + } +} + +TEST(in_utc_timezone) { + const char *tz = getenv("TZ"); + + assert_se(setenv("TZ", ":UTC", 1) >= 0); + assert_se(in_utc_timezone()); + 
assert_se(streq(tzname[0], "UTC")); + assert_se(streq(tzname[1], "UTC")); + assert_se(timezone == 0); + assert_se(daylight == 0); + + assert_se(setenv("TZ", ":Europe/Berlin", 1) >= 0); + assert_se(!in_utc_timezone()); + assert_se(streq(tzname[0], "CET")); + assert_se(streq(tzname[1], "CEST")); + + assert_se(set_unset_env("TZ", tz, true) == 0); + tzset(); +} +/* Round-trips realtime timestamps (now, one week ahead, one week back) through map_clock_usec() to CLOCK_MONOTONIC and back, asserting each result is within USEC_PER_HOUR of its starting value. */ +TEST(map_clock_usec) { + usec_t nowr, x, y, z; + + x = nowr = now(CLOCK_REALTIME); /* right now */ + y = map_clock_usec(x, CLOCK_REALTIME, CLOCK_MONOTONIC); + z = map_clock_usec(y, CLOCK_MONOTONIC, CLOCK_REALTIME); + /* Converting back and forth introduces inaccuracies, since we cannot query both clocks atomically, but the error should be small: less than 1h even on the slowest CI. */ + + assert_se((z > x ? z - x : x - z) < USEC_PER_HOUR); + + assert_se(nowr < USEC_INFINITY - USEC_PER_DAY*7); /* overflow check */ + x = nowr + USEC_PER_DAY*7; /* 1 week from now */ + y = map_clock_usec(x, CLOCK_REALTIME, CLOCK_MONOTONIC); + assert_se(y > 0 && y < USEC_INFINITY); + z = map_clock_usec(y, CLOCK_MONOTONIC, CLOCK_REALTIME); + assert_se(z > 0 && z < USEC_INFINITY); + assert_se((z > x ? z - x : x - z) < USEC_PER_HOUR); + + assert_se(nowr > USEC_PER_DAY * 7); /* underflow check */ + x = nowr - USEC_PER_DAY*7; /* 1 week ago */ + y = map_clock_usec(x, CLOCK_REALTIME, CLOCK_MONOTONIC); + if (y != 0) { /* might underflow if machine is not up long enough for the monotonic clock to be beyond 1w */ + assert_se(y < USEC_INFINITY); + z = map_clock_usec(y, CLOCK_MONOTONIC, CLOCK_REALTIME); + assert_se(z > 0 && z < USEC_INFINITY); + assert_se((z > x ? 
z - x : x - z) < USEC_PER_HOUR); + } +} +/* Parses utc, checks that formatting the result with TIMESTAMP_UTC reproduces utc and parses back to the same value; then does the same for pretty with TIMESTAMP_PRETTY and asserts that all parsed values are equal. */ +static void test_timezone_offset_change_one(const char *utc, const char *pretty) { + usec_t x, y, z; + char *s; + + assert_se(parse_timestamp(utc, &x) >= 0); + + s = FORMAT_TIMESTAMP_STYLE(x, TIMESTAMP_UTC); + assert_se(parse_timestamp(s, &y) >= 0); + log_debug("%s -> " USEC_FMT " -> %s -> " USEC_FMT, utc, x, s, y); + assert_se(streq(s, utc)); + assert_se(x == y); + + assert_se(parse_timestamp(pretty, &y) >= 0); + s = FORMAT_TIMESTAMP_STYLE(y, TIMESTAMP_PRETTY); + assert_se(parse_timestamp(s, &z) >= 0); + log_debug("%s -> " USEC_FMT " -> %s -> " USEC_FMT, pretty, y, s, z); + assert_se(streq(s, pretty)); + assert_se(x == y); + assert_se(x == z); +} +/* Runs test_timezone_offset_change_one() with TZ set to several zones in turn; a zone is skipped when timezone_is_valid() rejects it. The TZ value read at entry is put back at the end. */ +TEST(timezone_offset_change) { + const char *tz = getenv("TZ"); + + /* See issue #26370. */ + + if (timezone_is_valid("Africa/Casablanca", LOG_DEBUG)) { + assert_se(setenv("TZ", ":Africa/Casablanca", 1) >= 0); + tzset(); + log_debug("Africa/Casablanca: tzname[0]=%s, tzname[1]=%s", strempty(tzname[0]), strempty(tzname[1])); + + test_timezone_offset_change_one("Sun 2015-10-25 01:59:59 UTC", "Sun 2015-10-25 02:59:59 +01"); + test_timezone_offset_change_one("Sun 2015-10-25 02:00:00 UTC", "Sun 2015-10-25 02:00:00 +00"); + test_timezone_offset_change_one("Sun 2018-06-17 01:59:59 UTC", "Sun 2018-06-17 01:59:59 +00"); + test_timezone_offset_change_one("Sun 2018-06-17 02:00:00 UTC", "Sun 2018-06-17 03:00:00 +01"); + test_timezone_offset_change_one("Sun 2018-10-28 01:59:59 UTC", "Sun 2018-10-28 02:59:59 +01"); + test_timezone_offset_change_one("Sun 2018-10-28 02:00:00 UTC", "Sun 2018-10-28 03:00:00 +01"); + } + + if (timezone_is_valid("Asia/Atyrau", LOG_DEBUG)) { + assert_se(setenv("TZ", ":Asia/Atyrau", 1) >= 0); + tzset(); + log_debug("Asia/Atyrau: tzname[0]=%s, tzname[1]=%s", strempty(tzname[0]), strempty(tzname[1])); + + test_timezone_offset_change_one("Sat 2004-03-27 21:59:59 UTC", "Sun 2004-03-28 01:59:59 +04"); + test_timezone_offset_change_one("Sat 2004-03-27 22:00:00 UTC", "Sun 
2004-03-28 03:00:00 +05"); + test_timezone_offset_change_one("Sat 2004-10-30 21:59:59 UTC", "Sun 2004-10-31 02:59:59 +05"); + test_timezone_offset_change_one("Sat 2004-10-30 22:00:00 UTC", "Sun 2004-10-31 03:00:00 +05"); + } + + if (timezone_is_valid("Chile/EasterIsland", LOG_DEBUG)) { + assert_se(setenv("TZ", ":Chile/EasterIsland", 1) >= 0); + tzset(); + log_debug("Chile/EasterIsland: tzname[0]=%s, tzname[1]=%s", strempty(tzname[0]), strempty(tzname[1])); + + test_timezone_offset_change_one("Sun 1981-10-11 03:59:59 UTC", "Sat 1981-10-10 20:59:59 -07"); + test_timezone_offset_change_one("Sun 1981-10-11 04:00:00 UTC", "Sat 1981-10-10 22:00:00 -06"); + test_timezone_offset_change_one("Sun 1982-03-14 02:59:59 UTC", "Sat 1982-03-13 20:59:59 -06"); + test_timezone_offset_change_one("Sun 1982-03-14 03:00:00 UTC", "Sat 1982-03-13 21:00:00 -06"); + } + + assert_se(set_unset_env("TZ", tz, true) == 0); + tzset(); +} + +static int intro(void) { + /* Tests have hard-coded results that do not expect a specific timezone to be set by the caller */ + assert_se(unsetenv("TZ") >= 0); + + log_info("realtime=" USEC_FMT "\n" + "monotonic=" USEC_FMT "\n" + "boottime=" USEC_FMT "\n", + now(CLOCK_REALTIME), + now(CLOCK_MONOTONIC), + now(CLOCK_BOOTTIME)); + + /* Ensure time_t is signed */ + assert_cc((time_t) -1 < (time_t) 1); + + /* Ensure TIME_T_MAX works correctly */ + uintmax_t x = TIME_T_MAX; + x++; + assert_se((time_t) x < 0); + + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-tmpfile-util.c b/src/test/test-tmpfile-util.c new file mode 100644 index 0000000..4859f62 --- /dev/null +++ b/src/test/test-tmpfile-util.c @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "log.h" +#include "path-util.h" +#include "process-util.h" +#include "string-util.h" +#include 
"tests.h" +#include "tmpfile-util.h" + +static void test_tempfn_random_one(const char *p, const char *extra, const char *expect, int ret) { + _cleanup_free_ char *s = NULL; + int r; + + r = tempfn_random(p, extra, &s); + log_info("%s+%s → %s vs. %s (%i/%s vs. %i/%s)", + p, strna(extra), strna(s), strna(expect), + r, STRERROR(r), ret, STRERROR(ret)); + + assert_se(!s == !expect); + if (s) { + const char *suffix; + + assert_se(suffix = startswith(s, expect)); + assert_se(in_charset(suffix, HEXDIGITS)); + assert_se(strlen(suffix) == 16); + } + assert_se(ret == r); +} + +TEST(tempfn_random) { + _cleanup_free_ char *dir = NULL, *p = NULL, *q = NULL; + + test_tempfn_random_one("", NULL, NULL, -EINVAL); + test_tempfn_random_one(".", NULL, NULL, -EADDRNOTAVAIL); + test_tempfn_random_one("..", NULL, NULL, -EINVAL); + test_tempfn_random_one("/", NULL, NULL, -EADDRNOTAVAIL); + test_tempfn_random_one("foo", "hoge/aaa", NULL, -EINVAL); + + test_tempfn_random_one("foo", NULL, ".#foo", 0); + test_tempfn_random_one("foo", "bar", ".#barfoo", 0); + test_tempfn_random_one("/tmp/foo", NULL, "/tmp/.#foo", 0); + test_tempfn_random_one("/tmp/foo", "bar", "/tmp/.#barfoo", 0); + test_tempfn_random_one("./foo", NULL, ".#foo", 0); + test_tempfn_random_one("./foo", "bar", ".#barfoo", 0); + test_tempfn_random_one("../foo", NULL, "../.#foo", 0); + test_tempfn_random_one("../foo", "bar", "../.#barfoo", 0); + + test_tempfn_random_one("foo/", NULL, ".#foo", 0); + test_tempfn_random_one("foo/", "bar", ".#barfoo", 0); + test_tempfn_random_one("/tmp/foo/", NULL, "/tmp/.#foo", 0); + test_tempfn_random_one("/tmp/foo/", "bar", "/tmp/.#barfoo", 0); + test_tempfn_random_one("./foo/", NULL, ".#foo", 0); + test_tempfn_random_one("./foo/", "bar", ".#barfoo", 0); + test_tempfn_random_one("../foo/", NULL, "../.#foo", 0); + test_tempfn_random_one("../foo/", "bar", "../.#barfoo", 0); + + assert_se(dir = new(char, PATH_MAX - 20)); + memset(dir, 'x', PATH_MAX - 21); + dir[PATH_MAX - 21] = '\0'; + for (size_t i = 
0; i < PATH_MAX - 21; i += NAME_MAX + 1) + dir[i] = '/'; + + assert_se(p = path_join(dir, "a")); + assert_se(q = path_join(dir, ".#a")); + + test_tempfn_random_one(p, NULL, q, 0); + test_tempfn_random_one(p, "b", NULL, -EINVAL); + + p = mfree(p); + q = mfree(q); +/* File name of exactly NAME_MAX characters: the expected prefix q is built NAME_MAX - 16 characters long, i.e. the input is expected to be shortened so that the 16-character suffix still fits. */ + assert_se(p = new(char, NAME_MAX + 1)); + memset(p, 'x', NAME_MAX); + p[NAME_MAX] = '\0'; + + assert_se(q = new(char, NAME_MAX + 1)); + memset(stpcpy(q, ".#"), 'x', NAME_MAX - STRLEN(".#") - 16); + q[NAME_MAX - 16] = '\0'; + + test_tempfn_random_one(p, NULL, q, 0); + + memset(stpcpy(q, ".#hoge"), 'x', NAME_MAX - STRLEN(".#hoge") - 16); + q[NAME_MAX - 16] = '\0'; + + test_tempfn_random_one(p, "hoge", q, 0); +} +/* Calls tempfn_xxxxxx(p, extra, &s) and asserts that it returns ret. A string must be produced exactly when expect is non-NULL, and it must consist of expect followed by the literal "XXXXXX". */ +static void test_tempfn_xxxxxx_one(const char *p, const char *extra, const char *expect, int ret) { + _cleanup_free_ char *s = NULL; + int r; + + r = tempfn_xxxxxx(p, extra, &s); + log_info("%s+%s → %s vs. %s (%i/%s vs. %i/%s)", + p, strna(extra), strna(s), strna(expect), + r, STRERROR(r), ret, STRERROR(ret)); + + assert_se(!s == !expect); + if (s) { + const char *suffix; + + assert_se(suffix = startswith(s, expect)); + assert_se(streq(suffix, "XXXXXX")); + } + assert_se(ret == r); +} +/* Table of tempfn_xxxxxx() cases: inputs expected to fail, then relative and absolute paths with and without an extra prefix and with trailing slashes, followed by paths built near PATH_MAX and NAME_MAX. */ +TEST(tempfn_xxxxxx) { + _cleanup_free_ char *dir = NULL, *p = NULL, *q = NULL; + + test_tempfn_xxxxxx_one("", NULL, NULL, -EINVAL); + test_tempfn_xxxxxx_one(".", NULL, NULL, -EADDRNOTAVAIL); + test_tempfn_xxxxxx_one("..", NULL, NULL, -EINVAL); + test_tempfn_xxxxxx_one("/", NULL, NULL, -EADDRNOTAVAIL); + test_tempfn_xxxxxx_one("foo", "hoge/aaa", NULL, -EINVAL); + + test_tempfn_xxxxxx_one("foo", NULL, ".#foo", 0); + test_tempfn_xxxxxx_one("foo", "bar", ".#barfoo", 0); + test_tempfn_xxxxxx_one("/tmp/foo", NULL, "/tmp/.#foo", 0); + test_tempfn_xxxxxx_one("/tmp/foo", "bar", "/tmp/.#barfoo", 0); + test_tempfn_xxxxxx_one("./foo", NULL, ".#foo", 0); + test_tempfn_xxxxxx_one("./foo", "bar", ".#barfoo", 0); + test_tempfn_xxxxxx_one("../foo", NULL, "../.#foo", 0); + test_tempfn_xxxxxx_one("../foo", "bar", "../.#barfoo", 
0); + + test_tempfn_xxxxxx_one("foo/", NULL, ".#foo", 0); + test_tempfn_xxxxxx_one("foo/", "bar", ".#barfoo", 0); + test_tempfn_xxxxxx_one("/tmp/foo/", NULL, "/tmp/.#foo", 0); + test_tempfn_xxxxxx_one("/tmp/foo/", "bar", "/tmp/.#barfoo", 0); + test_tempfn_xxxxxx_one("./foo/", NULL, ".#foo", 0); + test_tempfn_xxxxxx_one("./foo/", "bar", ".#barfoo", 0); + test_tempfn_xxxxxx_one("../foo/", NULL, "../.#foo", 0); + test_tempfn_xxxxxx_one("../foo/", "bar", "../.#barfoo", 0); + + assert_se(dir = new(char, PATH_MAX - 10)); + memset(dir, 'x', PATH_MAX - 11); + dir[PATH_MAX - 11] = '\0'; + for (size_t i = 0; i < PATH_MAX - 11; i += NAME_MAX + 1) + dir[i] = '/'; + + assert_se(p = path_join(dir, "a")); + assert_se(q = path_join(dir, ".#a")); + + test_tempfn_xxxxxx_one(p, NULL, q, 0); + test_tempfn_xxxxxx_one(p, "b", NULL, -EINVAL); + + p = mfree(p); + q = mfree(q); + + assert_se(p = new(char, NAME_MAX + 1)); + memset(p, 'x', NAME_MAX); + p[NAME_MAX] = '\0'; + + assert_se(q = new(char, NAME_MAX + 1)); + memset(stpcpy(q, ".#"), 'x', NAME_MAX - STRLEN(".#") - 6); + q[NAME_MAX - 6] = '\0'; + + test_tempfn_xxxxxx_one(p, NULL, q, 0); + + memset(stpcpy(q, ".#hoge"), 'x', NAME_MAX - STRLEN(".#hoge") - 6); + q[NAME_MAX - 6] = '\0'; + + test_tempfn_xxxxxx_one(p, "hoge", q, 0); +} + +static void test_tempfn_random_child_one(const char *p, const char *extra, const char *expect, int ret) { + _cleanup_free_ char *s = NULL; + int r; + + r = tempfn_random_child(p, extra, &s); + log_info_errno(r, "%s+%s → %s vs. %s (%i/%s vs. 
%i/%s)", + p, strna(extra), strna(s), strna(expect), + r, STRERROR(r), ret, STRERROR(ret)); + + assert_se(!s == !expect); + if (s) { + const char *suffix; + + assert_se(suffix = startswith(s, expect)); + assert_se(in_charset(suffix, HEXDIGITS)); + assert_se(strlen(suffix) == 16); + } + assert_se(ret == r); +} + +TEST(tempfn_random_child) { + _cleanup_free_ char *dir = NULL, *p = NULL, *q = NULL; + + test_tempfn_random_child_one("", NULL, ".#", 0); + test_tempfn_random_child_one(".", NULL, ".#", 0); + test_tempfn_random_child_one("..", NULL, "../.#", 0); + test_tempfn_random_child_one("/", NULL, "/.#", 0); + test_tempfn_random_child_one("foo", "hoge/aaa", NULL, -EINVAL); + + test_tempfn_random_child_one("foo", NULL, "foo/.#", 0); + test_tempfn_random_child_one("foo", "bar", "foo/.#bar", 0); + test_tempfn_random_child_one("/tmp/foo", NULL, "/tmp/foo/.#", 0); + test_tempfn_random_child_one("/tmp/foo", "bar", "/tmp/foo/.#bar", 0); + test_tempfn_random_child_one("./foo", NULL, "foo/.#", 0); + test_tempfn_random_child_one("./foo", "bar", "foo/.#bar", 0); + test_tempfn_random_child_one("../foo", NULL, "../foo/.#", 0); + test_tempfn_random_child_one("../foo", "bar", "../foo/.#bar", 0); + + test_tempfn_random_child_one("foo/", NULL, "foo/.#", 0); + test_tempfn_random_child_one("foo/", "bar", "foo/.#bar", 0); + test_tempfn_random_child_one("/tmp/foo/", NULL, "/tmp/foo/.#", 0); + test_tempfn_random_child_one("/tmp/foo/", "bar", "/tmp/foo/.#bar", 0); + test_tempfn_random_child_one("./foo/", NULL, "foo/.#", 0); + test_tempfn_random_child_one("./foo/", "bar", "foo/.#bar", 0); + test_tempfn_random_child_one("../foo/", NULL, "../foo/.#", 0); + test_tempfn_random_child_one("../foo/", "bar", "../foo/.#bar", 0); + + assert_se(dir = new(char, PATH_MAX - 21)); + memset(dir, 'x', PATH_MAX - 22); + dir[PATH_MAX - 22] = '\0'; + for (size_t i = 0; i < PATH_MAX - 22; i += NAME_MAX + 1) + dir[i] = '/'; + + assert_se(p = path_join(dir, "a")); + assert_se(q = path_join(p, ".#")); + + 
test_tempfn_random_child_one(p, NULL, q, 0); + test_tempfn_random_child_one(p, "b", NULL, -EINVAL); + + p = mfree(p); + q = mfree(q); + + assert_se(p = new(char, NAME_MAX + 1)); + memset(p, 'x', NAME_MAX); + p[NAME_MAX] = '\0'; + + assert_se(q = path_join(p, ".#")); + + test_tempfn_random_child_one(p, NULL, q, 0); + + assert_se(strextend(&q, "hoge")); + test_tempfn_random_child_one(p, "hoge", q, 0); +} +/* Exercises open_tmpfile_unlinkable(), mkostemp_safe() and open_tmpfile_linkable()/link_tmpfile() in the directory given by saved_argv[1], falling back to "/tmp". */ +TEST(link_tmpfile) { + _cleanup_free_ char *cmd = NULL, *cmd2 = NULL, *ans = NULL, *ans2 = NULL, *d = NULL, *tmp = NULL, *line = NULL; + _cleanup_close_ int fd = -EBADF, fd2 = -EBADF; + const char *p = saved_argv[1] ?: "/tmp"; + char *pattern; + + pattern = strjoina(p, "/systemd-test-XXXXXX"); + + fd = open_tmpfile_unlinkable(p, O_RDWR|O_CLOEXEC); + assert_se(fd >= 0); +/* The ls output is informational only; its exit status is ignored. cmd + 6 skips the leading "ls -l " so that readlink_malloc() gets just the /proc/PID/fd/N path. */ + assert_se(asprintf(&cmd, "ls -l /proc/"PID_FMT"/fd/%d", getpid_cached(), fd) > 0); + (void) system(cmd); + assert_se(readlink_malloc(cmd + 6, &ans) >= 0); + log_debug("link1: %s", ans); + assert_se(endswith(ans, " (deleted)")); + + fd2 = mkostemp_safe(pattern); + assert_se(fd2 >= 0); + assert_se(unlink(pattern) == 0); + + assert_se(asprintf(&cmd2, "ls -l /proc/"PID_FMT"/fd/%d", getpid_cached(), fd2) > 0); + (void) system(cmd2); + assert_se(readlink_malloc(cmd2 + 6, &ans2) >= 0); + log_debug("link2: %s", ans2); + assert_se(endswith(ans2, " (deleted)")); + + pattern = strjoina(p, "/tmpfiles-test"); + assert_se(tempfn_random(pattern, NULL, &d) >= 0); + + fd = safe_close(fd); + fd = open_tmpfile_linkable(d, O_RDWR|O_CLOEXEC, &tmp); + assert_se(fd >= 0); + assert_se(write(fd, "foobar\n", 7) == 7); +/* While d exists, linking with flags 0 must fail with -EEXIST; after d is unlinked the same call must succeed. */ + assert_se(touch(d) >= 0); + assert_se(link_tmpfile(fd, tmp, d, /* flags= */ 0) == -EEXIST); + assert_se(unlink(d) >= 0); + assert_se(link_tmpfile(fd, tmp, d, /* flags= */ 0) >= 0); + + assert_se(read_one_line_file(d, &line) >= 0); + assert_se(streq(line, "foobar")); + + fd = safe_close(fd); + tmp = mfree(tmp); + + fd = open_tmpfile_linkable(d, O_RDWR|O_CLOEXEC, &tmp); + assert_se(fd >= 0); + + 
assert_se(write(fd, "waumiau\n", 8) == 8); + + assert_se(link_tmpfile(fd, tmp, d, /* flags= */ 0) == -EEXIST); + assert_se(link_tmpfile(fd, tmp, d, LINK_TMPFILE_REPLACE) >= 0); + + line = mfree(line); + assert_se(read_one_line_file(d, &line) >= 0); + assert_se(streq(line, "waumiau")); + + assert_se(unlink(d) >= 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-tpm2.c b/src/test/test-tpm2.c new file mode 100644 index 0000000..e3c7da8 --- /dev/null +++ b/src/test/test-tpm2.c @@ -0,0 +1,1324 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "hexdecoct.h" +#include "macro.h" +#include "tests.h" +#include "tpm2-util.h" +#include "virt.h" + +TEST(tpm2_pcr_index_from_string) { + assert_se(tpm2_pcr_index_from_string("platform-code") == 0); + assert_se(tpm2_pcr_index_from_string("0") == 0); + assert_se(tpm2_pcr_index_from_string("platform-config") == 1); + assert_se(tpm2_pcr_index_from_string("1") == 1); + assert_se(tpm2_pcr_index_from_string("external-code") == 2); + assert_se(tpm2_pcr_index_from_string("2") == 2); + assert_se(tpm2_pcr_index_from_string("external-config") == 3); + assert_se(tpm2_pcr_index_from_string("3") == 3); + assert_se(tpm2_pcr_index_from_string("boot-loader-code") == 4); + assert_se(tpm2_pcr_index_from_string("4") == 4); + assert_se(tpm2_pcr_index_from_string("boot-loader-config") == 5); + assert_se(tpm2_pcr_index_from_string("5") == 5); + assert_se(tpm2_pcr_index_from_string("secure-boot-policy") == 7); + assert_se(tpm2_pcr_index_from_string("7") == 7); + assert_se(tpm2_pcr_index_from_string("kernel-initrd") == 9); + assert_se(tpm2_pcr_index_from_string("9") == 9); + assert_se(tpm2_pcr_index_from_string("ima") == 10); + assert_se(tpm2_pcr_index_from_string("10") == 10); + assert_se(tpm2_pcr_index_from_string("kernel-boot") == 11); + assert_se(tpm2_pcr_index_from_string("11") == 11); + assert_se(tpm2_pcr_index_from_string("kernel-config") == 12); + assert_se(tpm2_pcr_index_from_string("12") == 12); + 
assert_se(tpm2_pcr_index_from_string("sysexts") == 13); + assert_se(tpm2_pcr_index_from_string("13") == 13); + assert_se(tpm2_pcr_index_from_string("shim-policy") == 14); + assert_se(tpm2_pcr_index_from_string("14") == 14); + assert_se(tpm2_pcr_index_from_string("system-identity") == 15); + assert_se(tpm2_pcr_index_from_string("15") == 15); + assert_se(tpm2_pcr_index_from_string("debug") == 16); + assert_se(tpm2_pcr_index_from_string("16") == 16); + assert_se(tpm2_pcr_index_from_string("application-support") == 23); + assert_se(tpm2_pcr_index_from_string("23") == 23); + assert_se(tpm2_pcr_index_from_string("hello") == -EINVAL); + assert_se(tpm2_pcr_index_from_string("8") == 8); + assert_se(tpm2_pcr_index_from_string("44") == -EINVAL); + assert_se(tpm2_pcr_index_from_string("-5") == -EINVAL); + assert_se(tpm2_pcr_index_from_string("24") == -EINVAL); +} + +TEST(tpm2_util_pbkdf2_hmac_sha256) { + + /* + * The test vectors from RFC 6070 [1] are for dkLen of 20 as it's SHA1 + * other RFCs I bumped into had various differing dkLen and iter counts, + * so this was generated using Python's hmacmodule. + * + * 1. 
https://www.rfc-editor.org/rfc/rfc6070.html#page-2 + */ + static const struct { + const uint8_t pass[256]; + size_t passlen; + const uint8_t salt[256]; + size_t saltlen; + uint8_t expected[SHA256_DIGEST_SIZE]; + } test_vectors[] = { + { .pass={'f', 'o', 'o', 'p', 'a', 's', 's'}, .passlen=7, .salt={'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5'}, .saltlen=16, .expected={0xCB, 0xEA, 0x27, 0x23, 0x9A, 0x65, 0x99, 0xF6, 0x8C, 0x26, 0x54, 0x80, 0x5C, 0x63, 0x61, 0xD2, 0x91, 0x0A, 0x60, 0x3F, 0xC2, 0xF5, 0xF0, 0xAB, 0x55, 0x8B, 0x46, 0x07, 0x60, 0x93, 0xAB, 0xCB} }, + { .pass={'f', 'o', 'o', 'p', 'a', 's', 's'}, .passlen=7, .salt={0x00, 'h', 'f', 's', 'd', 'j', 'h', 'f', 'd', 'j', 'h', 'j', 'd', 'f', 's'}, .saltlen=15, .expected={0x2B, 0xDF, 0x52, 0x29, 0x48, 0x3F, 0x98, 0x25, 0x01, 0x19, 0xB4, 0x42, 0xBC, 0xA7, 0x38, 0x5D, 0xCD, 0x08, 0xBD, 0xDC, 0x33, 0xBF, 0x32, 0x5E, 0x31, 0x87, 0x54, 0xFF, 0x2C, 0x23, 0x68, 0xFF} }, + { .pass={'f', 'o', 'o', 'p', 'a', 's', 's'}, .passlen=7, .salt={'m', 'y', 's', 'a', 0x00, 'l', 't'}, .saltlen=7, .expected={0x7C, 0x24, 0xB4, 0x4D, 0x30, 0x11, 0x53, 0x24, 0x87, 0x56, 0x24, 0x10, 0xBA, 0x9F, 0xF2, 0x4E, 0xBB, 0xF5, 0x03, 0x56, 0x2B, 0xB1, 0xA1, 0x92, 0x8B, 0x5F, 0x32, 0x02, 0x23, 0x1F, 0x79, 0xE6} }, + { .pass={'p', 'a', 's', 's', 'w', 'i', 't', 'h', 'n', 'u', 'l', 'l', 0x00, 'p', 'a', 's', 's', 'w', 'o', 'r', 'd'}, .passlen=21, .salt={'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5'}, .saltlen=16, .expected={0xE9, 0x53, 0xB7, 0x1D, 0xAB, 0xD1, 0xC1, 0xF3, 0xC4, 0x7F, 0x18, 0x96, 0xDD, 0xD7, 0x6B, 0xC6, 0x6A, 0xBD, 0xFB, 0x12, 0x7C, 0xF8, 0x68, 0xDC, 0x6E, 0xEF, 0x29, 0xCC, 0x1B, 0x30, 0x5B, 0x74} }, + { .pass={'p', 'a', 's', 's', 'w', 'i', 't', 'h', 'n', 'u', 'l', 'l', 0x00, 'p', 'a', 's', 's', 'w', 'o', 'r', 'd'}, .passlen=21, .salt={0x00, 'h', 'f', 's', 'd', 'j', 'h', 'f', 'd', 'j', 'h', 'j', 'd', 'f', 's'}, .saltlen=15, .expected={0x51, 0xA3, 0x82, 0xA5, 0x2F, 0x48, 
0x84, 0xB3, 0x02, 0x0D, 0xC2, 0x42, 0x9A, 0x8F, 0x86, 0xCC, 0x66, 0xFD, 0x65, 0x87, 0x89, 0x07, 0x2B, 0x07, 0x82, 0x42, 0xD6, 0x6D, 0x43, 0xB8, 0xFD, 0xCF} }, + { .pass={'p', 'a', 's', 's', 'w', 'i', 't', 'h', 'n', 'u', 'l', 'l', 0x00, 'p', 'a', 's', 's', 'w', 'o', 'r', 'd'}, .passlen=21, .salt={'m', 'y', 's', 'a', 0x00, 'l', 't'}, .saltlen=7, .expected={0xEC, 0xFB, 0x5D, 0x5F, 0xF6, 0xA6, 0xE0, 0x79, 0x50, 0x64, 0x36, 0x64, 0xA3, 0x9A, 0x5C, 0xF3, 0x7A, 0x87, 0x0B, 0x64, 0x51, 0x59, 0x75, 0x64, 0x8B, 0x78, 0x2B, 0x62, 0x8F, 0x68, 0xD9, 0xCC} }, + { .pass={0x00, 'p', 'a', 's', 's'}, .passlen=5, .salt={'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5'}, .saltlen=16, .expected={0x8A, 0x9A, 0x47, 0x9A, 0x91, 0x22, 0x2F, 0x56, 0x29, 0x4F, 0x26, 0x00, 0xE7, 0xB3, 0xEB, 0x63, 0x6D, 0x51, 0xF2, 0x60, 0x17, 0x08, 0x20, 0x70, 0x82, 0x8F, 0xA3, 0xD7, 0xBE, 0x2B, 0xD5, 0x5D} }, + { .pass={0x00, 'p', 'a', 's', 's'}, .passlen=5, .salt={0x00, 'h', 'f', 's', 'd', 'j', 'h', 'f', 'd', 'j', 'h', 'j', 'd', 'f', 's'}, .saltlen=15, .expected={0x72, 0x3A, 0xF5, 0xF7, 0xCD, 0x6C, 0x12, 0xDD, 0x53, 0x28, 0x46, 0x0C, 0x19, 0x0E, 0xF2, 0x91, 0xDE, 0xEA, 0xF9, 0x6F, 0x74, 0x32, 0x34, 0x3F, 0x84, 0xED, 0x8D, 0x2A, 0xDE, 0xC9, 0xC6, 0x34} }, + { .pass={0x00, 'p', 'a', 's', 's'}, .passlen=5, .salt={'m', 'y', 's', 'a', 0x00, 'l', 't'}, .saltlen=7, .expected={0xE3, 0x07, 0x12, 0xBE, 0xEE, 0xF5, 0x5D, 0x18, 0x72, 0xF4, 0xCF, 0xF1, 0x20, 0x6B, 0xD6, 0x66, 0xCD, 0x7C, 0xE7, 0x4F, 0xC2, 0x16, 0x70, 0x5B, 0x9B, 0x2F, 0x7D, 0xE2, 0x3B, 0x42, 0x3A, 0x1B} }, + }; + + uint8_t res[SHA256_DIGEST_SIZE]; + for(size_t i = 0; i < sizeof(test_vectors)/sizeof(test_vectors[0]); i++) { + + int rc = tpm2_util_pbkdf2_hmac_sha256( + test_vectors[i].pass, + test_vectors[i].passlen, + test_vectors[i].salt, + test_vectors[i].saltlen, + res); + assert_se(rc == 0); + assert_se(memcmp(test_vectors[i].expected, res, SHA256_DIGEST_SIZE) == 0); + } +} + +#if HAVE_TPM2 + +#define POISON(type) \ + ({ 
\ + type _p; \ + memset(&_p, 0xaa, sizeof(_p)); \ + _p; \ + }) +#define POISON_TPML POISON(TPML_PCR_SELECTION) +#define POISON_TPMS POISON(TPMS_PCR_SELECTION) +#define POISON_U32 POISON(uint32_t) +/* Asserts that a and b are non-NULL and agree in hash, sizeofSelect and the first sizeofSelect bytes of pcrSelect. */ +static void assert_tpms_pcr_selection_eq(TPMS_PCR_SELECTION *a, TPMS_PCR_SELECTION *b) { + assert_se(a); + assert_se(b); + + assert_se(a->hash == b->hash); + assert_se(a->sizeofSelect == b->sizeofSelect); + + for (size_t i = 0; i < a->sizeofSelect; i++) + assert_se(a->pcrSelect[i] == b->pcrSelect[i]); +} +/* Asserts that a and b are non-NULL, have the same count, and that their pcrSelections entries are pairwise equal. */ +static void assert_tpml_pcr_selection_eq(TPML_PCR_SELECTION *a, TPML_PCR_SELECTION *b) { + assert_se(a); + assert_se(b); + + assert_se(a->count == b->count); + for (size_t i = 0; i < a->count; i++) + assert_tpms_pcr_selection_eq(&a->pcrSelections[i], &b->pcrSelections[i]); +} +/* Asserts that s carries the given hash, has sizeofSelect == 3, stores the low three bytes of mask in pcrSelect[0..2] (least significant byte first) with pcrSelect[3] == 0, and that tpm2_tpms_pcr_selection_to_mask() yields mask again. */ +static void verify_tpms_pcr_selection(TPMS_PCR_SELECTION *s, uint32_t mask, TPMI_ALG_HASH hash) { + assert_se(s->hash == hash); + assert_se(s->sizeofSelect == 3); + assert_se(s->pcrSelect[0] == (mask & 0xff)); + assert_se(s->pcrSelect[1] == ((mask >> 8) & 0xff)); + assert_se(s->pcrSelect[2] == ((mask >> 16) & 0xff)); + assert_se(s->pcrSelect[3] == 0); + + assert_se(tpm2_tpms_pcr_selection_to_mask(s) == mask); +} +/* Asserts that l holds exactly count entries equal to s[0..count-1], and re-verifies each entry against the mask that tpm2_tpml_pcr_selection_to_mask() reports for that entry's hash. */ +static void verify_tpml_pcr_selection(TPML_PCR_SELECTION *l, TPMS_PCR_SELECTION s[], size_t count) { + assert_se(l->count == count); + for (size_t i = 0; i < count; i++) { + assert_tpms_pcr_selection_eq(&s[i], &l->pcrSelections[i]); + + TPMI_ALG_HASH hash = l->pcrSelections[i].hash; + verify_tpms_pcr_selection(&l->pcrSelections[i], tpm2_tpml_pcr_selection_to_mask(l, hash), hash); + } +} +/* Builds a TPMS and a TPML selection for mask/hash in 0xaa-poisoned storage and verifies both, then checks the TPMS add, sub and move operations against a list of test masks. */ +static void _test_pcr_selection_mask_hash(uint32_t mask, TPMI_ALG_HASH hash) { + TPMS_PCR_SELECTION s = POISON_TPMS; + tpm2_tpms_pcr_selection_from_mask(mask, hash, &s); + verify_tpms_pcr_selection(&s, mask, hash); + + TPML_PCR_SELECTION l = POISON_TPML; + tpm2_tpml_pcr_selection_from_mask(mask, hash, &l); + verify_tpml_pcr_selection(&l, &s, 1); + verify_tpms_pcr_selection(&l.pcrSelections[0], 
mask, hash); + + uint32_t test_masks[] = { + 0x0, 0x1, 0x100, 0x10000, 0xf0f0f0, 0xaaaaaa, 0xffffff, + }; + for (unsigned i = 0; i < ELEMENTSOF(test_masks); i++) { + uint32_t test_mask = test_masks[i]; + + TPMS_PCR_SELECTION a = POISON_TPMS, b = POISON_TPMS, test_s = POISON_TPMS; + tpm2_tpms_pcr_selection_from_mask(test_mask, hash, &test_s); +/* add: a must become UPDATE_FLAG(mask, test_mask, true); b keeps test_mask */ + a = s; + b = test_s; + tpm2_tpms_pcr_selection_add(&a, &b); + verify_tpms_pcr_selection(&a, UPDATE_FLAG(mask, test_mask, true), hash); + verify_tpms_pcr_selection(&b, test_mask, hash); +/* sub: a must become UPDATE_FLAG(mask, test_mask, false); b keeps test_mask */ + a = s; + b = test_s; + tpm2_tpms_pcr_selection_sub(&a, &b); + verify_tpms_pcr_selection(&a, UPDATE_FLAG(mask, test_mask, false), hash); + verify_tpms_pcr_selection(&b, test_mask, hash); +/* move: a ends up as for add, and b must be left with mask 0 */ + a = s; + b = test_s; + tpm2_tpms_pcr_selection_move(&a, &b); + verify_tpms_pcr_selection(&a, UPDATE_FLAG(mask, test_mask, true), hash); + verify_tpms_pcr_selection(&b, 0, hash); + } +} +/* Runs _test_pcr_selection_mask_hash() for SHA1 and SHA256 over a sampled grid of 24-bit masks, stepping m0 by 0x5 up to 0xff, m1 by 0x500 up to 0xffff and m2 by 0x50000 up to 0xffffff. */ +TEST(tpms_pcr_selection_mask_and_hash) { + TPMI_ALG_HASH HASH_ALGS[] = { TPM2_ALG_SHA1, TPM2_ALG_SHA256, }; + + for (unsigned i = 0; i < ELEMENTSOF(HASH_ALGS); i++) + for (uint32_t m2 = 0; m2 <= 0xffffff; m2 += 0x50000) + for (uint32_t m1 = 0; m1 <= 0xffff; m1 += 0x500) + for (uint32_t m0 = 0; m0 <= 0xff; m0 += 0x5) + _test_pcr_selection_mask_hash(m0 | m1 | m2, HASH_ALGS[i]); +} +/* Builds a TPMS selection from mask/hash and asserts that its string form equals expected_str, its weight equals expected_weight, and that it is reported empty exactly when expected_weight is 0. */ +static void _test_tpms_sw( + TPMI_ALG_HASH hash, + uint32_t mask, + const char *expected_str, + size_t expected_weight) { + + TPMS_PCR_SELECTION s = POISON_TPMS; + tpm2_tpms_pcr_selection_from_mask(mask, hash, &s); + + _cleanup_free_ char *tpms_str = tpm2_tpms_pcr_selection_to_string(&s); + assert_se(streq(tpms_str, expected_str)); + + assert_se(tpm2_tpms_pcr_selection_weight(&s) == expected_weight); + assert_se(tpm2_tpms_pcr_selection_is_empty(&s) == (expected_weight == 0)); +} +/* Checks the string form and weight of TPMS selections for several sha1 and sha256 masks. */ +TEST(tpms_pcr_selection_string_and_weight) { + TPMI_ALG_HASH sha1 = TPM2_ALG_SHA1, sha256 = TPM2_ALG_SHA256; + + _test_tpms_sw(sha1, 0, "sha1()", 0); + _test_tpms_sw(sha1, 1, "sha1(0)", 1); + _test_tpms_sw(sha1, 
0xf, "sha1(0+1+2+3)", 4); + _test_tpms_sw(sha1, 0x00ff00, "sha1(8+9+10+11+12+13+14+15)", 8); + _test_tpms_sw(sha1, 0xffffff, "sha1(0+1+2+3+4+5+6+7+8+9+10+11+12+13+14+15+16+17+18+19+20+21+22+23)", 24); + _test_tpms_sw(sha256, 0, "sha256()", 0); + _test_tpms_sw(sha256, 1, "sha256(0)", 1); + _test_tpms_sw(sha256, 7, "sha256(0+1+2)", 3); + _test_tpms_sw(sha256, 0xf00000, "sha256(20+21+22+23)", 4); + _test_tpms_sw(sha256, 0xffffff, "sha256(0+1+2+3+4+5+6+7+8+9+10+11+12+13+14+15+16+17+18+19+20+21+22+23)", 24); +} +/* Adds each of the count entries of s to *ret, in order, via tpm2_tpml_pcr_selection_add_tpms_pcr_selection(). */ +static void _tpml_pcr_selection_add_tpms(TPMS_PCR_SELECTION s[], size_t count, TPML_PCR_SELECTION *ret) { + for (size_t i = 0; i < count; i++) + tpm2_tpml_pcr_selection_add_tpms_pcr_selection(ret, &s[i]); +} +/* Subtracts each of the count entries of s from *ret, in order, via tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(). */ +static void _tpml_pcr_selection_sub_tpms(TPMS_PCR_SELECTION s[], size_t count, TPML_PCR_SELECTION *ret) { + for (size_t i = 0; i < count; i++) + tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(ret, &s[i]); +} +/* Starting from an empty TPML, adds the count entries of s and asserts the resulting entry count, string form and weight; the list must be reported empty exactly when expected_weight is 0. */ +static void _test_tpml_sw( + TPMS_PCR_SELECTION s[], + size_t count, + size_t expected_count, + const char *expected_str, + size_t expected_weight) { + + TPML_PCR_SELECTION l = {}; + _tpml_pcr_selection_add_tpms(s, count, &l); + assert_se(l.count == expected_count); + + _cleanup_free_ char *tpml_str = tpm2_tpml_pcr_selection_to_string(&l); + assert_se(streq(tpml_str, expected_str)); + + assert_se(tpm2_tpml_pcr_selection_weight(&l) == expected_weight); + assert_se(tpm2_tpml_pcr_selection_is_empty(&l) == (expected_weight == 0)); +} +/* Checks entry count, string form and weight of TPMLs assembled from several combinations of TPMS selections. */ +TEST(tpml_pcr_selection_string_and_weight) { + size_t size = 0xaa; + TPMI_ALG_HASH sha1 = TPM2_ALG_SHA1, + sha256 = TPM2_ALG_SHA256, + sha384 = TPM2_ALG_SHA384, + sha512 = TPM2_ALG_SHA512; + TPMS_PCR_SELECTION s[4] = { POISON_TPMS, POISON_TPMS, POISON_TPMS, POISON_TPMS, }; + + size = 0; + tpm2_tpms_pcr_selection_from_mask(0x000002, sha1 , &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x0080f0, sha384, &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x010100, sha512, &s[size++]); + 
tpm2_tpms_pcr_selection_from_mask(0xff0000, sha256, &s[size++]); + _test_tpml_sw(s, + size, + /* expected_count= */ 4, + "[sha1(1),sha384(4+5+6+7+15),sha512(8+16),sha256(16+17+18+19+20+21+22+23)]", + /* expected_weight= */ 16); + + size = 0; + tpm2_tpms_pcr_selection_from_mask(0x0403aa, sha512, &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x0080f0, sha256, &s[size++]); + _test_tpml_sw(s, + size, + /* expected_count= */ 2, + "[sha512(1+3+5+7+8+9+18),sha256(4+5+6+7+15)]", + /* expected_weight= */ 12); + + size = 0; + /* Empty hashes should be ignored */ + tpm2_tpms_pcr_selection_from_mask(0x0300ce, sha384, &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0xffffff, sha512, &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x000000, sha1 , &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x330010, sha256, &s[size++]); + _test_tpml_sw(s, + size, + /* expected_count= */ 3, + "[sha384(1+2+3+6+7+16+17),sha512(0+1+2+3+4+5+6+7+8+9+10+11+12+13+14+15+16+17+18+19+20+21+22+23),sha256(4+16+17+20+21)]", + /* expected_weight= */ 36); + + size = 0; + /* Verify same-hash entries are properly combined. 
*/ + tpm2_tpms_pcr_selection_from_mask(0x000001, sha1 , &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x000001, sha256, &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x000010, sha1 , &s[size++]); + tpm2_tpms_pcr_selection_from_mask(0x000010, sha256, &s[size++]); + _test_tpml_sw(s, + size, + /* expected_count= */ 2, + "[sha1(0+4),sha256(0+4)]", + /* expected_weight= */ 4); +} + +/* Test tpml add/sub by changing the tpms individually */ +static void _test_tpml_addsub_tpms( + TPML_PCR_SELECTION *start, + TPMS_PCR_SELECTION add[], + size_t add_count, + TPMS_PCR_SELECTION expected1[], + size_t expected1_count, + TPMS_PCR_SELECTION sub[], + size_t sub_count, + TPMS_PCR_SELECTION expected2[], + size_t expected2_count) { + + TPML_PCR_SELECTION l = *start; + + _tpml_pcr_selection_add_tpms(add, add_count, &l); + verify_tpml_pcr_selection(&l, expected1, expected1_count); + + _tpml_pcr_selection_sub_tpms(sub, sub_count, &l); + verify_tpml_pcr_selection(&l, expected2, expected2_count); +} + +/* Test tpml add/sub by creating new tpmls */ +static void _test_tpml_addsub_tpml( + TPML_PCR_SELECTION *start, + TPMS_PCR_SELECTION add[], + size_t add_count, + TPMS_PCR_SELECTION expected1[], + size_t expected1_count, + TPMS_PCR_SELECTION sub[], + size_t sub_count, + TPMS_PCR_SELECTION expected2[], + size_t expected2_count) { + + TPML_PCR_SELECTION l = {}; + tpm2_tpml_pcr_selection_add(&l, start); + assert_tpml_pcr_selection_eq(&l, start); + + TPML_PCR_SELECTION addl = {}; + _tpml_pcr_selection_add_tpms(add, add_count, &addl); + tpm2_tpml_pcr_selection_add(&l, &addl); + + TPML_PCR_SELECTION e1 = {}; + _tpml_pcr_selection_add_tpms(expected1, expected1_count, &e1); + assert_tpml_pcr_selection_eq(&l, &e1); + + TPML_PCR_SELECTION subl = {}; + _tpml_pcr_selection_add_tpms(sub, sub_count, &subl); + tpm2_tpml_pcr_selection_sub(&l, &subl); + + TPML_PCR_SELECTION e2 = {}; + _tpml_pcr_selection_add_tpms(expected2, expected2_count, &e2); + assert_tpml_pcr_selection_eq(&l, &e2); +} + +#define 
_test_tpml_addsub(...) \ + ({ \ + _test_tpml_addsub_tpms(__VA_ARGS__); \ + _test_tpml_addsub_tpml(__VA_ARGS__); \ + }) +/* Exercises PCR selection add/sub in two scenarios, an empty starting list and a starting list holding one sha1 entry. For each scenario the add/expected1/sub/expected2 arrays are filled with tpm2_tpms_pcr_selection_from_mask() and handed to _test_tpml_addsub(). */ +TEST(tpml_pcr_selection_add_sub) { + size_t add_count = 0xaa, expected1_count = 0xaa, sub_count = 0xaa, expected2_count = 0xaa; /* counters and arrays start out as 0xaa / POISON_* values and are explicitly reset before each scenario */ + TPMI_ALG_HASH sha1 = TPM2_ALG_SHA1, + sha256 = TPM2_ALG_SHA256, + sha384 = TPM2_ALG_SHA384, + sha512 = TPM2_ALG_SHA512; + TPML_PCR_SELECTION l = POISON_TPML; + TPMS_PCR_SELECTION add[4] = { POISON_TPMS, POISON_TPMS, POISON_TPMS, POISON_TPMS, }, + sub[4] = { POISON_TPMS, POISON_TPMS, POISON_TPMS, POISON_TPMS, }, + expected1[4] = { POISON_TPMS, POISON_TPMS, POISON_TPMS, POISON_TPMS, }, + expected2[4] = { POISON_TPMS, POISON_TPMS, POISON_TPMS, POISON_TPMS, }; + + l = (TPML_PCR_SELECTION){}; /* scenario 1: empty starting list */ + add_count = 0; + expected1_count = 0; + sub_count = 0; + expected2_count = 0; + tpm2_tpms_pcr_selection_from_mask(0x010101, sha256, &add[add_count++]); + tpm2_tpms_pcr_selection_from_mask(0x101010, sha256, &add[add_count++]); + tpm2_tpms_pcr_selection_from_mask(0x0000ff, sha512, &add[add_count++]); + tpm2_tpms_pcr_selection_from_mask(0x111111, sha256, &expected1[expected1_count++]); /* the two sha256 masks added above are expected as one combined entry */ + tpm2_tpms_pcr_selection_from_mask(0x0000ff, sha512, &expected1[expected1_count++]); + tpm2_tpms_pcr_selection_from_mask(0x000001, sha256, &sub[sub_count++]); + tpm2_tpms_pcr_selection_from_mask(0xff0000, sha512, &sub[sub_count++]); /* shares no bits with the sha512 mask added above; expected2 keeps sha512 at 0x0000ff */ + tpm2_tpms_pcr_selection_from_mask(0x111110, sha256, &expected2[expected2_count++]); + tpm2_tpms_pcr_selection_from_mask(0x0000ff, sha512, &expected2[expected2_count++]); + _test_tpml_addsub(&l, + add, add_count, + expected1, expected1_count, + sub, sub_count, + expected2, expected2_count); + + l = (TPML_PCR_SELECTION){ /* scenario 2: starting list already holds one sha1 entry with pcrSelect[0] = 0xf0 */ + .count = 1, + .pcrSelections[0].hash = sha1, + .pcrSelections[0].sizeofSelect = 3, + .pcrSelections[0].pcrSelect[0] = 0xf0, + }; + add_count = 0; + expected1_count = 0; + sub_count = 0; + expected2_count = 0; + tpm2_tpms_pcr_selection_from_mask(0xff0000, sha256, &add[add_count++]); +
tpm2_tpms_pcr_selection_from_mask(0xffff00, sha384, &add[add_count++]); + tpm2_tpms_pcr_selection_from_mask(0x0000ff, sha512, &add[add_count++]); + tpm2_tpms_pcr_selection_from_mask(0xf00000, sha1 , &add[add_count++]); + tpm2_tpms_pcr_selection_from_mask(0xf000f0, sha1 , &expected1[expected1_count++]); + tpm2_tpms_pcr_selection_from_mask(0xff0000, sha256, &expected1[expected1_count++]); + tpm2_tpms_pcr_selection_from_mask(0xffff00, sha384, &expected1[expected1_count++]); + tpm2_tpms_pcr_selection_from_mask(0x0000ff, sha512, &expected1[expected1_count++]); + tpm2_tpms_pcr_selection_from_mask(0x00ffff, sha256, &sub[sub_count++]); /* shares no bits with the sha256 mask in expected1 (0xff0000); expected2 below repeats expected1 unchanged */ + tpm2_tpms_pcr_selection_from_mask(0xf000f0, sha1 , &expected2[expected2_count++]); + tpm2_tpms_pcr_selection_from_mask(0xff0000, sha256, &expected2[expected2_count++]); + tpm2_tpms_pcr_selection_from_mask(0xffff00, sha384, &expected2[expected2_count++]); + tpm2_tpms_pcr_selection_from_mask(0x0000ff, sha512, &expected2[expected2_count++]); + _test_tpml_addsub(&l, + add, add_count, + expected1, expected1_count, + sub, sub_count, + expected2, expected2_count); +} +/* Returns true if the hexmem() encoding of digest->buffer (digest->size bytes) matches 'expect' according to strcaseeq() (presumably a case-insensitive comparison; confirm against its definition). Aborts through assert_se() if an argument is NULL or hexmem() returns NULL. */ +static bool digest_check(const TPM2B_DIGEST *digest, const char *expect) { + _cleanup_free_ char *h = NULL; + + assert_se(digest); + assert_se(expect); + + h = hexmem(digest->buffer, digest->size); + assert_se(h); + + return strcaseeq(expect, h); +} +/* Fills *digest from the hex string 'hash'. The string must fit the digest buffer (two hex characters per byte), the decoded length must be one of the SHA1/SHA256/SHA384/SHA512 digest sizes, and the stored digest is re-checked against 'hash' with digest_check(). */ +static void digest_init(TPM2B_DIGEST *digest, const char *hash) { + assert_se(strlen(hash) <= sizeof(digest->buffer) * 2); + + DEFINE_HEX_PTR(h, hash); + + /* Make sure the length matches a known hash algorithm */ + assert_se(IN_SET(h_len, TPM2_SHA1_DIGEST_SIZE, TPM2_SHA256_DIGEST_SIZE, TPM2_SHA384_DIGEST_SIZE, TPM2_SHA512_DIGEST_SIZE)); + + *digest = TPM2B_DIGEST_MAKE(h, h_len); + + assert_se(digest_check(digest, hash)); +} +/* Checks tpm2_digest_init(), tpm2_digest_rehash(), tpm2_digest_many_digests(), tpm2_digest_buffer() and tpm2_digest_many() with SHA256 against fixed expected digests. */ +TEST(digest_many) { + TPM2B_DIGEST d, d0, d1, d2, d3, d4; + + digest_init(&d0, "0000000000000000000000000000000000000000000000000000000000000000"); + digest_init(&d1,
"17b7703d9d00776310ba032e88c1a8c2a9c630ebdd799db622f6631530789175"); + digest_init(&d2, "12998c017066eb0d2a70b94e6ed3192985855ce390f321bbdb832022888bd251"); + digest_init(&d3, "c3a65887fedd3fb4f5d0047e906dff830bcbd1293160909eb4b05f485e7387ad"); + digest_init(&d4, "6491fb4bc08fc0b2ef47fc63db57e249917885e69d8c0d99667df83a59107a33"); + + /* tpm2_digest_init, tpm2_digest_rehash */ + d = (TPM2B_DIGEST){ .size = 1, .buffer = { 2, }, }; + assert_se(tpm2_digest_init(TPM2_ALG_SHA256, &d) == 0); + assert_se(digest_check(&d, "0000000000000000000000000000000000000000000000000000000000000000")); + assert_se(tpm2_digest_rehash(TPM2_ALG_SHA256, &d) == 0); + assert_se(digest_check(&d, "66687aadf862bd776c8fc18b8e9f8e20089714856ee233b3902a591d0d5f2925")); + + d = d1; + assert_se(tpm2_digest_rehash(TPM2_ALG_SHA256, &d) == 0); + assert_se(digest_check(&d, "ab55014b5ace12ba70c3acc887db571585a83539aad3633d252a710f268f405c")); + assert_se(tpm2_digest_init(TPM2_ALG_SHA256, &d) == 0); + assert_se(digest_check(&d, "0000000000000000000000000000000000000000000000000000000000000000")); + + /* tpm2_digest_many_digests */ + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, &d2, 1, false) == 0); + assert_se(digest_check(&d, "56571a1be3fbeab18d215f549095915a004b5788ca0d535be668559129a76f25")); + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, &d2, 1, true) == 0); + assert_se(digest_check(&d, "99dedaee8f4d8d10a8be184399fde8740d5e17ff783ee5c288a4486e4ce3a1fe")); + + const TPM2B_DIGEST da1[] = { d2, d3, }; + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, da1, ELEMENTSOF(da1), false) == 0); + assert_se(digest_check(&d, "525aa13ef9a61827778ec3acf16fbb23b65ae8770b8fb2684d3a33f9457dd6d8")); + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, da1, ELEMENTSOF(da1), true) == 0); + assert_se(digest_check(&d, "399ca2aa98963d1bd81a2b58a7e5cda24bba1be88fb4da9aa73d97706846566b")); + + const TPM2B_DIGEST da2[] = { d3, d2, d0 }; + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, 
da2, ELEMENTSOF(da2), false) == 0); + assert_se(digest_check(&d, "b26fd22db74d4cd896bff01c61aa498a575e4a553a7fb5a322a5fee36954313e")); + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, da2, ELEMENTSOF(da2), true) == 0); + assert_se(digest_check(&d, "091e79a5b09d4048df49a680f966f3ff67910afe185c3baf9704c9ca45bcf259")); + + const TPM2B_DIGEST da3[] = { d4, d4, d4, d4, d3, d4, d4, d4, d4, }; + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, da3, ELEMENTSOF(da3), false) == 0); + assert_se(digest_check(&d, "8eca947641b6002df79dfb571a7f78b7d0a61370a366f722386dfbe444d18830")); + assert_se(tpm2_digest_many_digests(TPM2_ALG_SHA256, &d, da3, ELEMENTSOF(da3), true) == 0); + assert_se(digest_check(&d, "f9ba17bc0bbe8794e9bcbf112e4d59a11eb68fffbcd5516a746e4857829dff04")); + + /* tpm2_digest_buffer */ + const uint8_t b1[] = { 1, 2, 3, 4, }; + assert_se(tpm2_digest_buffer(TPM2_ALG_SHA256, &d, b1, ELEMENTSOF(b1), false) == 0); + assert_se(digest_check(&d, "9f64a747e1b97f131fabb6b447296c9b6f0201e79fb3c5356e6c77e89b6a806a")); + assert_se(tpm2_digest_buffer(TPM2_ALG_SHA256, &d, b1, ELEMENTSOF(b1), true) == 0); + assert_se(digest_check(&d, "ff3bd307b287e9b29bb572f6ccfd19deb0106d0c4c3c5cfe8a1d03a396092ed4")); + + const void *b2 = d2.buffer; + assert_se(tpm2_digest_buffer(TPM2_ALG_SHA256, &d, b2, d2.size, false) == 0); + assert_se(digest_check(&d, "56571a1be3fbeab18d215f549095915a004b5788ca0d535be668559129a76f25")); + assert_se(tpm2_digest_buffer(TPM2_ALG_SHA256, &d, b2, d2.size, true) == 0); + assert_se(digest_check(&d, "99dedaee8f4d8d10a8be184399fde8740d5e17ff783ee5c288a4486e4ce3a1fe")); + + /* tpm2_digest_many */ + const struct iovec iov1[] = { + IOVEC_MAKE((void*) b1, ELEMENTSOF(b1)), + IOVEC_MAKE(d2.buffer, d2.size), + IOVEC_MAKE(d3.buffer, d3.size), + }; + assert_se(tpm2_digest_many(TPM2_ALG_SHA256, &d, iov1, ELEMENTSOF(iov1), false) == 0); + assert_se(digest_check(&d, "cd7bde4a047af976b6f1b282309976229be59f96a78aa186de32a1aee488ab09")); + 
assert_se(tpm2_digest_many(TPM2_ALG_SHA256, &d, iov1, ELEMENTSOF(iov1), true) == 0); + assert_se(digest_check(&d, "02ecb0628264235111e0053e271092981c8b15d59cd46617836bee3149a4ecb0")); +} +/* Parses 'arg' and compares the result element by element (index, hash, value size, value bytes) with expected_values[]. If prev_values[] is non-empty it is copied into the result array first and 'arg' is parsed with tpm2_parse_pcr_argument_append(); otherwise tpm2_parse_pcr_argument() is used. When tpm2_pcr_values_hash_count() reports exactly one hash for the expected values, tpm2_parse_pcr_argument_to_mask() is additionally checked: the mask is seeded from prev_values[] if there are any (UINT32_MAX otherwise) and must end up equal to the mask of expected_values[]. Finally, appending an empty argument must succeed and leave the number of values unchanged. */ +static void check_parse_pcr_argument( + const char *arg, + const Tpm2PCRValue *prev_values, + size_t n_prev_values, + const Tpm2PCRValue *expected_values, + size_t n_expected_values) { + + _cleanup_free_ Tpm2PCRValue *values = NULL; + size_t n_values = 0; + + if (n_prev_values > 0) { + assert_se(GREEDY_REALLOC_APPEND(values, n_values, prev_values, n_prev_values)); + assert_se(tpm2_parse_pcr_argument_append(arg, &values, &n_values) == 0); + } else + assert_se(tpm2_parse_pcr_argument(arg, &values, &n_values) == 0); + + assert_se(n_values == n_expected_values); + for (size_t i = 0; i < n_values; i++) { + const Tpm2PCRValue *v = &values[i], *e = &expected_values[i]; + //tpm2_log_debug_pcr_value(e, "Expected value"); + //tpm2_log_debug_pcr_value(v, "Actual value"); + + assert_se(v->index == e->index); + assert_se(v->hash == e->hash); + assert_se(v->value.size == e->value.size); + assert_se(memcmp(v->value.buffer, e->value.buffer, e->value.size) == 0); + } + + size_t hash_count; + assert_se(tpm2_pcr_values_hash_count(expected_values, n_expected_values, &hash_count) == 0); + if (hash_count == 1) { + uint32_t mask = UINT32_MAX, expected_mask = 0; + + if (n_prev_values > 0) + assert_se(tpm2_pcr_values_to_mask(prev_values, n_prev_values, prev_values[0].hash, &mask) == 0); + + assert_se(tpm2_pcr_values_to_mask(expected_values, n_expected_values, expected_values[0].hash, &expected_mask) == 0); + + assert_se(tpm2_parse_pcr_argument_to_mask(arg, &mask) == 0); + assert_se(mask == expected_mask); + } + + size_t old_n_values = n_values; + assert_se(tpm2_parse_pcr_argument_append("", &values, &n_values) == 0); + assert_se(values); + assert_se(n_values == old_n_values); +} +/* Runs tpm2_parse_pcr_argument_to_mask() on 'arg' starting from a zero mask. A negative 'mask' is the error code the call must return; any other value is the mask it must produce. */ +static void check_parse_pcr_argument_to_mask(const char *arg, int mask) { + uint32_t m = 0; + int r =
tpm2_parse_pcr_argument_to_mask(arg, &m); + + if (mask < 0) + assert_se(mask == r); + else + assert_se((uint32_t) mask == m); /* NOTE(review): the return value r is not checked on this path */ +} +/* Covers tpm2_parse_pcr_argument(), tpm2_parse_pcr_argument_append() and tpm2_parse_pcr_argument_to_mask(): empty arguments, index lists separated by ',' and '+', ':hash' and '=value' suffixes, and arguments that must be rejected. */ +TEST(parse_pcr_argument) { + _cleanup_free_ Tpm2PCRValue *t0p = NULL; + size_t n_t0p; /* not initialized here; the assertion after the first call expects the call to have set it to 0 */ + assert_se(tpm2_parse_pcr_argument("", &t0p, &n_t0p) == 0); + assert_se(n_t0p == 0); + assert_se(tpm2_parse_pcr_argument_append("", &t0p, &n_t0p) == 0); + assert_se(n_t0p == 0); + uint32_t m0 = 0xf; /* non-zero on purpose: an empty argument is expected to yield mask 0 */ + assert_se(tpm2_parse_pcr_argument_to_mask("", &m0) == 0); + assert_se(m0 == 0); + assert_se(tpm2_parse_pcr_argument_to_mask("", &m0) == 0); + assert_se(m0 == 0); + + Tpm2PCRValue t1[] = { /* expected values are listed in ascending index order; the arguments below give the same indexes in several orders */ + TPM2_PCR_VALUE_MAKE(0, 0, {}), + TPM2_PCR_VALUE_MAKE(4, 0, {}), + TPM2_PCR_VALUE_MAKE(7, 0, {}), + TPM2_PCR_VALUE_MAKE(11, 0, {}), + }; + check_parse_pcr_argument("0,4,7,11", NULL, 0, t1, ELEMENTSOF(t1)); + check_parse_pcr_argument("11,4,7,0", NULL, 0, t1, ELEMENTSOF(t1)); + check_parse_pcr_argument("7,4,0,11", NULL, 0, t1, ELEMENTSOF(t1)); + check_parse_pcr_argument("11,7,4,0", NULL, 0, t1, ELEMENTSOF(t1)); + check_parse_pcr_argument("0+4+7+11", NULL, 0, t1, ELEMENTSOF(t1)); + check_parse_pcr_argument("0,4+7,11", NULL, 0, t1, ELEMENTSOF(t1)); + + Tpm2PCRValue t2[] = { /* same indexes, all expected with TPM2_ALG_SHA1 even when only one entry of the argument carries ':sha1' */ + TPM2_PCR_VALUE_MAKE(0, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(4, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(7, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(11, TPM2_ALG_SHA1, {}), + }; + check_parse_pcr_argument("0:sha1,4,7,11", NULL, 0, t2, ELEMENTSOF(t2)); + check_parse_pcr_argument("11,4,7,0:sha1", NULL, 0, t2, ELEMENTSOF(t2)); + check_parse_pcr_argument("7,4:sha1,0,11", NULL, 0, t2, ELEMENTSOF(t2)); + check_parse_pcr_argument("0:sha1,4:sha1,7:sha1,11:sha1", NULL, 0, t2, ELEMENTSOF(t2)); + check_parse_pcr_argument("0:sha1+4:sha1,11:sha1+7:sha1", NULL, 0, t2, ELEMENTSOF(t2)); + + Tpm2PCRValue t3[] = { + TPM2_PCR_VALUE_MAKE(0, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(1, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(2, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(3, TPM2_ALG_SHA1, {}), +
TPM2_PCR_VALUE_MAKE(4, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(7, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(11, TPM2_ALG_SHA1, {}), + TPM2_PCR_VALUE_MAKE(12, TPM2_ALG_SHA1, {}), + }; + check_parse_pcr_argument("1,2,3,12", t2, ELEMENTSOF(t2), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("12,2,3,1", t2, ELEMENTSOF(t2), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("1,2,3,12:sha1", t1, ELEMENTSOF(t1), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("1,2,3,12:sha1", t2, ELEMENTSOF(t2), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("1:sha1,2,3,12", t1, ELEMENTSOF(t1), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("1:sha1,2,3,12", t2, ELEMENTSOF(t2), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("1:sha1,2:sha1,3:sha1,12:sha1", t1, ELEMENTSOF(t1), t3, ELEMENTSOF(t3)); + check_parse_pcr_argument("1:sha1,2:sha1,3:sha1,12:sha1", t2, ELEMENTSOF(t2), t3, ELEMENTSOF(t3)); + + TPM2B_DIGEST d4; + digest_init(&d4, "FCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2"); + Tpm2PCRValue t4[] = { + TPM2_PCR_VALUE_MAKE(0, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(1, TPM2_ALG_SHA256, d4), + TPM2_PCR_VALUE_MAKE(2, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(3, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(4, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(7, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(11, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(12, TPM2_ALG_SHA256, {}), + }; + check_parse_pcr_argument("1:sha256=0xFCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2,2,3,12", t1, ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + check_parse_pcr_argument("12,2,3,1:sha256=FCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2", t1, ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + check_parse_pcr_argument("12,2,3,1:sha256=0xFCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2", t1, ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + check_parse_pcr_argument("1:sha256=0xFCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2,2,3,12:SHA256", t1, 
ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + check_parse_pcr_argument("1:sha256=0xFCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2,2,3,12", t1, ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + check_parse_pcr_argument("1:sha256=FCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2,2:sha256,3:sha256,12:sha256", t1, ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + check_parse_pcr_argument("1:sha256=0xFCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2,2:sha256,3:sha256,12:sha256", t1, ELEMENTSOF(t1), t4, ELEMENTSOF(t4)); + + TPM2B_DIGEST d5; + digest_init(&d5, "0F21EADB7F27377668E3C8069BE88D116491FBEE"); + Tpm2PCRValue t5[] = { + TPM2_PCR_VALUE_MAKE(1, TPM2_ALG_SHA1, d5), + TPM2_PCR_VALUE_MAKE(0, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(1, TPM2_ALG_SHA256, d4), + TPM2_PCR_VALUE_MAKE(2, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(3, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(4, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(7, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(11, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(12, TPM2_ALG_SHA256, {}), + TPM2_PCR_VALUE_MAKE(5, TPM2_ALG_SHA384, {}), + TPM2_PCR_VALUE_MAKE(6, TPM2_ALG_SHA512, {}), + }; + check_parse_pcr_argument("0,1:sha256=0xFCE7F1083082B16CFE2B085DD7858BB11A37C09B78E36C79E5A2FD529353C4E2,1:sha1=0F21EADB7F27377668E3C8069BE88D116491FBEE,2,3,4,7,11,12,5:sha384,6:sha512", NULL, 0, t5, ELEMENTSOF(t5)); + check_parse_pcr_argument("1:sha1=0F21EADB7F27377668E3C8069BE88D116491FBEE,6:sha512,5:sha384", t4, ELEMENTSOF(t4), t5, ELEMENTSOF(t5)); + + Tpm2PCRValue *v = NULL; + size_t n_v = 0; + assert_se(tpm2_parse_pcr_argument("1,100", &v, &n_v) < 0); + assert_se(tpm2_parse_pcr_argument("1,2=123456abc", &v, &n_v) < 0); + assert_se(tpm2_parse_pcr_argument("1,2:invalid", &v, &n_v) < 0); + assert_se(tpm2_parse_pcr_argument("1:sha1=invalid", &v, &n_v) < 0); + assert_se(v == NULL); + assert_se(n_v == 0); + + check_parse_pcr_argument_to_mask("", 0x0); + check_parse_pcr_argument_to_mask("0", 0x1); + 
check_parse_pcr_argument_to_mask("1", 0x2); + check_parse_pcr_argument_to_mask("0,1", 0x3); + check_parse_pcr_argument_to_mask("0+1", 0x3); + check_parse_pcr_argument_to_mask("0-1", -EINVAL); + check_parse_pcr_argument_to_mask("foo", -EINVAL); + check_parse_pcr_argument_to_mask("0,1,2", 0x7); + check_parse_pcr_argument_to_mask("0+1+2", 0x7); + check_parse_pcr_argument_to_mask("0+1,2", 0x7); + check_parse_pcr_argument_to_mask("0,1+2", 0x7); + check_parse_pcr_argument_to_mask("0,2", 0x5); + check_parse_pcr_argument_to_mask("0+2", 0x5); /* the arguments below mix numeric indexes with names; e.g. the mask expected for the next one has bits 7 and 23 set */ + check_parse_pcr_argument_to_mask("7+application-support", 0x800080); + check_parse_pcr_argument_to_mask("8+boot-loader-code", 0x110); + check_parse_pcr_argument_to_mask("7,shim-policy,4", 0x4090); + check_parse_pcr_argument_to_mask("sysexts,shim-policy+kernel-boot", 0x6800); + check_parse_pcr_argument_to_mask("sysexts,shim+kernel-boot", -EINVAL); + check_parse_pcr_argument_to_mask("sysexts+17+23", 0x822000); + check_parse_pcr_argument_to_mask("6+boot-loader-code,44", -EINVAL); + check_parse_pcr_argument_to_mask("debug+24", -EINVAL); +} +/* RSA public-area template used by the tests: SHA256 name algorithm, AES-128 in CFB mode as symmetric parameters, null scheme, 2048 key bits. */ +static const TPMT_PUBLIC test_rsa_template = { + .type = TPM2_ALG_RSA, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = TPMA_OBJECT_RESTRICTED|TPMA_OBJECT_DECRYPT|TPMA_OBJECT_FIXEDTPM|TPMA_OBJECT_FIXEDPARENT|TPMA_OBJECT_SENSITIVEDATAORIGIN|TPMA_OBJECT_USERWITHAUTH, + .parameters.rsaDetail = { + .symmetric = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, + }, + .scheme.scheme = TPM2_ALG_NULL, + .keyBits = 2048, + }, +}; +/* ECC counterpart of the RSA template: same name algorithm, object attributes and symmetric parameters, with a curve ID and a null KDF scheme in place of the RSA key size. */ +static const TPMT_PUBLIC test_ecc_template = { + .type = TPM2_ALG_ECC, + .nameAlg = TPM2_ALG_SHA256, + .objectAttributes = TPMA_OBJECT_RESTRICTED|TPMA_OBJECT_DECRYPT|TPMA_OBJECT_FIXEDTPM|TPMA_OBJECT_FIXEDPARENT|TPMA_OBJECT_SENSITIVEDATAORIGIN|TPMA_OBJECT_USERWITHAUTH, + .parameters.eccDetail = { + .symmetric = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, + }, + .scheme.scheme = TPM2_ALG_NULL, + .curveID =
TPM2_ECC_NIST_P256, + .kdf.scheme = TPM2_ALG_NULL, + }, +}; +/* Both templates in one array; its users are outside this excerpt (presumably tests that loop over key types; confirm at the use sites). */ +static const TPMT_PUBLIC *test_templates[] = { + &test_rsa_template, + &test_ecc_template, +}; +/* Initializes *public from a copy of test_rsa_template, with the modulus taken from the hex string 'rsa_n'. */ +static void tpm2b_public_rsa_init(TPM2B_PUBLIC *public, const char *rsa_n) { + TPMT_PUBLIC tpmt = test_rsa_template; + + DEFINE_HEX_PTR(key, rsa_n); + tpmt.unique.rsa = TPM2B_PUBLIC_KEY_RSA_MAKE(key, key_len); + + public->size = sizeof(tpmt); + public->publicArea = tpmt; +} +/* Initializes *public from a copy of test_ecc_template, overriding the curve with 'curve' and taking the point coordinates from the hex strings 'x' and 'y'. */ +static void tpm2b_public_ecc_init(TPM2B_PUBLIC *public, TPMI_ECC_CURVE curve, const char *x, const char *y) { + TPMT_PUBLIC tpmt = test_ecc_template; + tpmt.parameters.eccDetail.curveID = curve; + + DEFINE_HEX_PTR(buf_x, x); + tpmt.unique.ecc.x = TPM2B_ECC_PARAMETER_MAKE(buf_x, buf_x_len); + + DEFINE_HEX_PTR(buf_y, y); + tpmt.unique.ecc.y = TPM2B_ECC_PARAMETER_MAKE(buf_y, buf_y_len); + + public->size = sizeof(tpmt); + public->publicArea = tpmt; +} + +#if HAVE_OPENSSL +TEST(tpm2b_public_to_openssl_pkey) { /* converts an RSA and an ECC (NIST P-256) TPM2B_PUBLIC with tpm2_tpm2b_public_to_openssl_pkey() and requires EVP_PKEY_verify() to accept a fixed signature over 'msg' with each resulting key */ + DEFINE_HEX_PTR(msg, "edc64c6523778961fe9ba03ab7d624b27ca1dd5b01e7734cc6c891d50db04269"); + TPM2B_PUBLIC public; + + /* RSA */ + tpm2b_public_rsa_init(&public, "d71cff5bba2173f0434a389171048e7da8cf8409b892c62946481cc383089bc754324620967fea3d00a02a717cdda4bfe1525ad957d294b88434e0a3933e86fb40f234e4935fd2ba27eb1d21da87efa466b74eb4ad18d26059904643441cf402ee933d138a2151f40459c49d87fef59e2cb822768b2d8689a9b58f82bf9a37e70693f2b2d40dfa388d365c1b1f029a14c4fc8dadb68978ef377d20ff2ca24e7078464c705eab42f531557c9c6dc0df66b506d0c26ef604f8110c64867099267453c71871e7ed22505a09daf102afc34355209ca7680eccc0ed368d148f402fa58cbb6c9d52351f535f09e4e24ad805e149f130edaa2f5e7efed3a4d2d03adb85"); + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey_rsa = NULL; + assert_se(tpm2_tpm2b_public_to_openssl_pkey(&public, &pkey_rsa) >= 0); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx_rsa = EVP_PKEY_CTX_new((EVP_PKEY*) pkey_rsa, NULL); + assert_se(ctx_rsa); + assert_se(EVP_PKEY_verify_init(ctx_rsa) == 1); + assert_se(EVP_PKEY_CTX_set_signature_md(ctx_rsa, EVP_sha256()) >
0); + + DEFINE_HEX_PTR(sig_rsa, "9f70a9e68911be3ec464cae91126328307bf355872127e042d6c61e0a80982872c151033bcf727abfae5fc9500c923120011e7ef4aa5fc690a59a034697b6022c141b4b209e2df6f4b282288cd9181073fbe7158ce113c79d87623423c1f3996ff931e59cc91db74f8e8656215b1436fc93ddec0f1f8fa8510826e674b250f047e6cba94c95ff98072a286baca94646b577974a1e00d56c21944e38960d8ee90511a2f938e5cf1ac7b7cc7ff8e3ac001d321254d3e4f988b90e9f6f873c26ecd0a12a626b3474833cdbb9e9f793238f6c97ee5b75a1a89bb7a7858d34ecfa6d34ac58d95085e6c4fbbebd47a4364be2725c2c6b3fa15d916f3c0b62a66fe76ae"); + assert_se(EVP_PKEY_verify(ctx_rsa, sig_rsa, sig_rsa_len, (unsigned char*) msg, msg_len) == 1); + + /* ECC: same message, key built from fixed NIST P-256 coordinates; no signature digest is set on this context, unlike the RSA one */ + tpm2b_public_ecc_init(&public, TPM2_ECC_NIST_P256, "6fc0ecf3645c673ab7e86d1ec5b315afb950257c5f68ab23296160006711fac2", "8dd2ef7a2c9ecede91493ba98c8fb3f893aff325c6a1e0f752c657b2d6ca1413"); + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey_ecc = NULL; + assert_se(tpm2_tpm2b_public_to_openssl_pkey(&public, &pkey_ecc) >= 0); + + _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx_ecc = EVP_PKEY_CTX_new((EVP_PKEY*) pkey_ecc, NULL); + assert_se(ctx_ecc); + assert_se(EVP_PKEY_verify_init(ctx_ecc) == 1); + + DEFINE_HEX_PTR(sig_ecc, "304602210092447ac0b5b32e90923f79bb4aba864b9c546a9900cf193a83243d35d189a2110221009a8b4df1dfa85e225eff9c606694d4d205a7a3968c9552f50bc2790209a90001"); + assert_se(EVP_PKEY_verify(ctx_ecc, sig_ecc, sig_ecc_len, (unsigned char*) msg, msg_len) == 1); +} +/* Converts the PEM public key (pem, pem_size) to a TPM2B_PUBLIC along two paths, openssl_pkey_from_pem() followed by tpm2_tpm2b_public_from_openssl_pkey(), and tpm2_tpm2b_public_from_pem() directly; asserts that both results are byte-identical and stores the result in *ret. */ +static void get_tpm2b_public_from_pem(const void *pem, size_t pem_size, TPM2B_PUBLIC *ret) { + _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL; + TPM2B_PUBLIC p1 = {}, p2 = {}; + + assert(pem); /* NOTE(review): plain assert() here and on the next line, while the other helpers in this excerpt use assert_se() */ + assert(ret); + + assert_se(openssl_pkey_from_pem(pem, pem_size, &pkey) >= 0); + assert_se(tpm2_tpm2b_public_from_openssl_pkey(pkey, &p1) >= 0); + assert_se(tpm2_tpm2b_public_from_pem(pem, pem_size, &p2) >= 0); + assert_se(memcmp_nn(&p1, sizeof(p1), &p2, sizeof(p2)) == 0); + + *ret = p1; +} +/* Asserts that tpm2_tpm2b_public_to_fingerprint() of 'public' yields exactly the bytes given as a hex string in 'hexfp'. */ +static void check_tpm2b_public_fingerprint(const TPM2B_PUBLIC *public, const
char *hexfp) { + DEFINE_HEX_PTR(expected, hexfp); + _cleanup_free_ void *fp = NULL; + size_t fp_size; + + assert_se(tpm2_tpm2b_public_to_fingerprint(public, &fp, &fp_size) >= 0); + assert_se(memcmp_nn(fp, fp_size, expected, expected_len) == 0); +} + /* Test helper: asserts that the name tpm2_calculate_pubkey_name() computes for public->publicArea equals the buffer that DEFINE_HEX_PTR() builds from the hex string hexname. */ +static void check_tpm2b_public_name(const TPM2B_PUBLIC *public, const char *hexname) { + DEFINE_HEX_PTR(expected, hexname); + TPM2B_NAME name = {}; + + assert_se(tpm2_calculate_pubkey_name(&public->publicArea, &name) >= 0); + assert_se(memcmp_nn(name.name, name.size, expected, expected_len) == 0); +} + /* Test helper for ECC keys. The pem argument goes through DEFINE_HEX_PTR() like the other hex arguments and the resulting buffer is converted with get_tpm2b_public_from_pem(). Asserts the result is a TPM2_ALG_ECC key on curve TPM2_ECC_NIST_P256 whose x and y coordinates match hexx and hexy, then checks fingerprint and name against hexfp and hexname. */ +static void check_tpm2b_public_from_ecc_pem(const char *pem, const char *hexx, const char *hexy, const char *hexfp, const char *hexname) { + TPM2B_PUBLIC public = {}; + TPMT_PUBLIC *p = &public.publicArea; + + DEFINE_HEX_PTR(key, pem); + get_tpm2b_public_from_pem(key, key_len, &public); + + assert_se(p->type == TPM2_ALG_ECC); + assert_se(p->parameters.eccDetail.curveID == TPM2_ECC_NIST_P256); + + DEFINE_HEX_PTR(expected_x, hexx); + assert_se(memcmp_nn(p->unique.ecc.x.buffer, p->unique.ecc.x.size, expected_x, expected_x_len) == 0); + + DEFINE_HEX_PTR(expected_y, hexy); + assert_se(memcmp_nn(p->unique.ecc.y.buffer, p->unique.ecc.y.size, expected_y, expected_y_len) == 0); + + check_tpm2b_public_fingerprint(&public, hexfp); + check_tpm2b_public_name(&public, hexname); +} + /* Test helper for RSA keys, the counterpart of check_tpm2b_public_from_ecc_pem(). Asserts the converted key is TPM2_ALG_RSA, that its modulus matches hexn, that keyBits matches the modulus length and that the exponent field equals the exponent argument, then checks fingerprint and name against hexfp and hexname. */ +static void check_tpm2b_public_from_rsa_pem(const char *pem, const char *hexn, uint32_t exponent, const char *hexfp, const char *hexname) { + TPM2B_PUBLIC public = {}; + TPMT_PUBLIC *p = &public.publicArea; + + DEFINE_HEX_PTR(key, pem); + get_tpm2b_public_from_pem(key, key_len, &public); + + assert_se(p->type == TPM2_ALG_RSA); + + DEFINE_HEX_PTR(expected_n, hexn); + assert_se(memcmp_nn(p->unique.rsa.buffer, p->unique.rsa.size, expected_n, expected_n_len) == 0); + + /* keyBits is expressed in bits, expected_n_len in bytes */ assert_se(p->parameters.rsaDetail.keyBits == expected_n_len * 8); + + assert_se(p->parameters.rsaDetail.exponent == exponent); + + check_tpm2b_public_fingerprint(&public, hexfp); +
check_tpm2b_public_name(&public, hexname); +} + /* Runs the PEM conversion helpers on three fixed test vectors: one ECC key, one RSA key with exponent 0x10001 and one RSA key with exponent 0x10005. Each call passes the hex-encoded PEM, the expected key material, the expected fingerprint and the expected name. */ +TEST(tpm2b_public_from_openssl_pkey) { + /* standard ECC key */ + check_tpm2b_public_from_ecc_pem("2d2d2d2d2d424547494e205055424c4943204b45592d2d2d2d2d0a4d466b77457759484b6f5a497a6a3043415159494b6f5a497a6a30444151634451674145726a6e4575424c73496c3972687068777976584e50686a346a426e500a44586e794a304b395579724e6764365335413532542b6f5376746b436a365a726c34685847337741515558706f426c532b7448717452714c35513d3d0a2d2d2d2d2d454e44205055424c4943204b45592d2d2d2d2d0a", + "ae39c4b812ec225f6b869870caf5cd3e18f88c19cf0d79f22742bd532acd81de", + "92e40e764fea12bed9028fa66b9788571b7c004145e9a01952fad1eab51a8be5", + "cd3373293b62a52b48c12100e80ea9bfd806266ce76893a5ec31cb128052d97c", + "000b5c127e4dbaf8fb7bac641e8db25a84a48db876ca7ee3bd317ae1a4554ff72f17"); + + /* standard RSA key */ + check_tpm2b_public_from_rsa_pem("2d2d2d2d2d424547494e205055424c4943204b45592d2d2d2d2d0a4d494942496a414e42676b71686b6947397730424151454641414f43415138414d49494243674b4341514541795639434950652f505852337a436f63787045300a6a575262546c3568585844436b472f584b79374b6d2f4439584942334b734f5a31436a5937375571372f674359363170697838697552756a73413464503165380a593445336c68556d374a332b6473766b626f4b64553243626d52494c2f6675627771694c4d587a41673342575278747234547545443533527a373634554650640a307a70304b68775231496230444c67772f344e67566f314146763378784b4d6478774d45683567676b73733038326332706c354a504e32587677426f744e6b4d0a5471526c745a4a35355244436170696e7153334577376675646c4e735851357746766c7432377a7637344b585165616d704c59433037584f6761304c676c536b0a79754774586b6a50542f735542544a705374615769674d5a6f714b7479563463515a58436b4a52684459614c47587673504233687a766d5671636e6b47654e540a65774944415141420a2d2d2d2d2d454e44205055424c4943204b45592d2d2d2d2d0a", +
"c95f4220f7bf3d7477cc2a1cc691348d645b4e5e615d70c2906fd72b2eca9bf0fd5c80772ac399d428d8efb52aeff80263ad698b1f22b91ba3b00e1d3f57bc638137961526ec9dfe76cbe46e829d53609b99120bfdfb9bc2a88b317cc0837056471b6be13b840f9dd1cfbeb85053ddd33a742a1c11d486f40cb830ff8360568d4016fdf1c4a31dc7030487982092cb34f36736a65e493cdd97bf0068b4d90c4ea465b59279e510c26a98a7a92dc4c3b7ee76536c5d0e7016f96ddbbcefef829741e6a6a4b602d3b5ce81ad0b8254a4cae1ad5e48cf4ffb140532694ad6968a0319a2a2adc95e1c4195c29094610d868b197bec3c1de1cef995a9c9e419e3537b", + 0x10001, + "d9186d13a7fd5b3644cee05448f49ad3574e82a2942ff93cf89598d36cca78a9", + "000be1bd75c7976e7a30e9e82223b81a9eff0d42c30618e588db592ed5da94455e81"); + + /* RSA key with non-default (i.e. not 0x10001) exponent */ + check_tpm2b_public_from_rsa_pem("2d2d2d2d2d424547494e205055424c4943204b45592d2d2d2d2d0a4d494942496a414e42676b71686b6947397730424151454641414f43415138414d49494243674b434151454179566c7551664b75565171596a5a71436a657a760a364e4a6f58654c736f702f72765375666330773769544d4f73566741557462515452505451725874397065537a4370524467634378656b6a544144577279304b0a6d59786a7a3634776c6a7030463959383068636a6b6b4b3759414d333054664c4648656c2b377574427370777142467a6e2b385a6659567353434b397354706f0a316c61376e5347514e7451576f36444a366c525a336a676d6d584f61544654416145304a432b7046584273564471736d46326438362f314e51714a755a5154520a575852636954704e58357649792f37766b6c5a6a685569526c78764e594f4e3070636476534a37364e74496e447a3048506f775a38705a454f4d2f4a454f59780a617a4c4a6a644936446b355279593578325a7949375074566a3057537242524f4d696f2b674c6556457a43343456336438315a38445138564e334c69625130330a70514944415141460a2d2d2d2d2d454e44205055424c4943204b45592d2d2d2d2d0a", +
"c9596e41f2ae550a988d9a828decefe8d2685de2eca29febbd2b9f734c3b89330eb1580052d6d04d13d342b5edf69792cc2a510e0702c5e9234c00d6af2d0a998c63cfae30963a7417d63cd217239242bb600337d137cb1477a5fbbbad06ca70a811739fef197d856c4822bdb13a68d656bb9d219036d416a3a0c9ea5459de382699739a4c54c0684d090bea455c1b150eab2617677cebfd4d42a26e6504d159745c893a4d5f9bc8cbfeef925663854891971bcd60e374a5c76f489efa36d2270f3d073e8c19f2964438cfc910e6316b32c98dd23a0e4e51c98e71d99c88ecfb558f4592ac144e322a3e80b7951330b8e15dddf3567c0d0f153772e26d0d37a5", + 0x10005, + "c8ca80a687d5972e1d961aaa2cfde2ff2e7a20d85e3ea0382804e70e013d65af", + "000beb8974d36d8cf58fdc87460dda00319e10c94c1b9f222ac9ce29d1c4776246cc"); +} +#endif + /* Test helper: asserts that name has size SHA256_DIGEST_SIZE + 2 and that its bytes equal the buffer DEFINE_HEX_PTR() builds from the hex string expect. */ +static void check_name(const TPM2B_NAME *name, const char *expect) { + assert_se(name->size == SHA256_DIGEST_SIZE + 2); + + DEFINE_HEX_PTR(e, expect); + assert_se(name->size == e_len); + assert_se(memcmp(name->name, e, e_len) == 0); +} + /* Checks tpm2_calculate_pubkey_name() against the expected name for one fixed RSA public key and one fixed ECC public key. */ +TEST(calculate_pubkey_name) { + TPM2B_PUBLIC public; + TPM2B_NAME name; + + /* RSA */ + tpm2b_public_rsa_init(&public, "9ec7341c52093ac40a1965a5df10432513c539adcf905e30577ab6ebc88ffe53cd08cef12ed9bec6125432f4fada3629b8b96d31b8f507aa35029188fe396da823fcb236027f7fbb01b0da3d87be7f999390449ced604bdf7e26c48657cc0671000f1147da195c3861c96642e54427cb7a11572e07567ec3fd6316978abc4bd92b27bb0a0e4958e599804eeb41d682b3b7fc1f960209f80a4fb8a1b64abfd96bf5d554e73cdd6ad1c8becb4fcf5e8f0c3e621d210e5e2f308f6520ad9a966779231b99f06c5989e5a23a9415c8808ab89ce81117632e2f8461cd4428bded40979236aeadafe8de3f51660a45e1dbc87694e6a36360201cca3ff9e7263e712727"); + assert_se(tpm2_calculate_pubkey_name(&public.publicArea, &name) >= 0); + check_name(&name, "000be78f74a470dd92e979ca067cdb2293a35f075e8560b436bd2ccea5da21486a07"); + + /* ECC */ + tpm2b_public_ecc_init(&public, TPM2_ECC_NIST_P256, "238e02ee4fd5598add6b502429f1815418515e4b0d6551c8e816b38cb15451d1", "70c2d491769775ec43ccd5a571c429233e9d30cf0f486c2e01acd6cb32ba93b6"); + assert_se(tpm2_calculate_pubkey_name(&public.publicArea, &name) >=
0); + check_name(&name, "000b302787187ba19c82011c987bd2dcdbb652b3a543ccc5cb0b49c33d4caae604a6"); +} + /* Applies tpm2_calculate_policy_auth_value() twice to a digest that starts out all zero. The expected digest differs after the first and after the second call, i.e. the function is expected to build on the digest it is given. */ +TEST(calculate_policy_auth_value) { + TPM2B_DIGEST d; + + digest_init(&d, "0000000000000000000000000000000000000000000000000000000000000000"); + assert_se(tpm2_calculate_policy_auth_value(&d) == 0); + assert_se(digest_check(&d, "8fcd2169ab92694e0c633f1ab772842b8241bbc20288981fc7ac1eddc1fddb0e")); + assert_se(tpm2_calculate_policy_auth_value(&d) == 0); + assert_se(digest_check(&d, "759ebd5ed65100e0b4aa2d04b4b789c2672d92ecc9cdda4b5fa16a303132e008")); +} + /* Applies tpm2_calculate_policy_authorize() twice per key, once for a fixed RSA key and once for a fixed ECC key, each time starting from an all-zero digest. In contrast to the auth value test the expected digest after the second call is the same as after the first. */ +TEST(calculate_policy_authorize) { + TPM2B_PUBLIC public; + TPM2B_DIGEST d; + + /* RSA */ + tpm2b_public_rsa_init(&public, "9ec7341c52093ac40a1965a5df10432513c539adcf905e30577ab6ebc88ffe53cd08cef12ed9bec6125432f4fada3629b8b96d31b8f507aa35029188fe396da823fcb236027f7fbb01b0da3d87be7f999390449ced604bdf7e26c48657cc0671000f1147da195c3861c96642e54427cb7a11572e07567ec3fd6316978abc4bd92b27bb0a0e4958e599804eeb41d682b3b7fc1f960209f80a4fb8a1b64abfd96bf5d554e73cdd6ad1c8becb4fcf5e8f0c3e621d210e5e2f308f6520ad9a966779231b99f06c5989e5a23a9415c8808ab89ce81117632e2f8461cd4428bded40979236aeadafe8de3f51660a45e1dbc87694e6a36360201cca3ff9e7263e712727"); + digest_init(&d, "0000000000000000000000000000000000000000000000000000000000000000"); + assert_se(tpm2_calculate_policy_authorize(&public, NULL, &d) == 0); + assert_se(digest_check(&d, "95213a3784eaab04f427bc7e8851c2f1df0903be8e42428ec25dcefd907baff1")); + /* second application: same expected digest as above */ assert_se(tpm2_calculate_policy_authorize(&public, NULL, &d) == 0); + assert_se(digest_check(&d, "95213a3784eaab04f427bc7e8851c2f1df0903be8e42428ec25dcefd907baff1")); + + /* ECC */ + tpm2b_public_ecc_init(&public, TPM2_ECC_NIST_P256, "423a89da6f0998f510489ab9682706e762031ef8f9faef2a185eff67065a187e", "996f73291670cef9e303d6cd9fa19ddf2c9c1fb1e283324ca9acca07c405c8d0"); + digest_init(&d, "0000000000000000000000000000000000000000000000000000000000000000"); + assert_se(tpm2_calculate_policy_authorize(&public, NULL, &d) == 0); +
assert_se(digest_check(&d, "2a5b705e83f949c27ac4d2e79e54fb5fb0a60f0b37bbd54a0ee1022ba00d3628")); + assert_se(tpm2_calculate_policy_authorize(&public, NULL, &d) == 0); + assert_se(digest_check(&d, "2a5b705e83f949c27ac4d2e79e54fb5fb0a60f0b37bbd54a0ee1022ba00d3628")); +} + /* Checks tpm2_calculate_policy_pcr() against fixed expected digests. Sixteen fixed digests dN[0] to dN[15] are prepared first; they are then passed as TPM2_ALG_SHA256 PCR values, once as a three entry selection and once as the full set of sixteen. */ +TEST(calculate_policy_pcr) { + TPM2B_DIGEST d, dN[16]; + + digest_init(&dN[ 0], "2124793cbbe60c3a8637d3b84a5d054e87c351e1469a285acc04755e8b204dec"); + digest_init(&dN[ 1], "bf7592f18adcfdc549fc0b94939f5069a24697f9cff4a0dca29014767b97559d"); + digest_init(&dN[ 2], "4b00cff9dee3a364979b2dc241b34568a8ad49fcf2713df259e47dff8875feed"); + digest_init(&dN[ 3], "3d458cfe55cc03ea1f443f1562beec8df51c75e14a9fcf9a7234a13f198e7969"); + digest_init(&dN[ 4], "368f85b3013041dfe203faaa364f00b07c5da7b1e5f1dbf2efb06fa6b9bd92de"); + digest_init(&dN[ 5], "c97c40369691c8e4aa78fb3a52655cd193b780a838b8e23f5f476576919db5e5"); + digest_init(&dN[ 6], "3d458cfe55cc03ea1f443f1562beec8df51c75e14a9fcf9a7234a13f198e7969"); + digest_init(&dN[ 7], "aa1154c9e0a774854ccbed4c8ce7e9b906b3d700a1a8db1772d0341a62dbe51b"); + digest_init(&dN[ 8], "cfde439a2c06af3479ca6bdc60429b90553d65300c5cfcc40004a08c6b5ad81a"); + digest_init(&dN[ 9], "9c2bac22ef5ec84fcdb71c3ebf776cba1247e5da980e5ee08e45666a2edf0b8b"); + digest_init(&dN[10], "9885873f4d7348199ad286f8f2476d4f866940950f6f9fb9f945ed352dbdcbd2"); + digest_init(&dN[11], "42400ab950d21aa79d12cc4fdef67d1087a39ad64900619831c0974dbae54e44"); + digest_init(&dN[12], "767d064382e56ca1ad3bdcc6bc596112e6c2008b593d3570d24c2bfa64c4628c"); + digest_init(&dN[13], "30c16133175959408c9745d8dafadef5daf4b39cb2be04df0d60089bd46d3cc4"); + digest_init(&dN[14], "e3991b7ddd47be7e92726a832d6874c5349b52b789fa0db8b558c69fea29574e"); + digest_init(&dN[15], "852dae3ecb992bdeb13d6002fefeeffdd90feca8b378d56681ef2c885d0e5137"); + + /* first case: start from an all-zero policy digest and select PCRs 4, 7 and 8 */ digest_init(&d, "0000000000000000000000000000000000000000000000000000000000000000"); + Tpm2PCRValue v1[] = { + TPM2_PCR_VALUE_MAKE(4, TPM2_ALG_SHA256, dN[4]), + TPM2_PCR_VALUE_MAKE(7,
TPM2_ALG_SHA256, dN[7]), + TPM2_PCR_VALUE_MAKE(8, TPM2_ALG_SHA256, dN[8]), + }; + assert_se(tpm2_calculate_policy_pcr(v1, ELEMENTSOF(v1), &d) == 0); + assert_se(digest_check(&d, "76532a0e16f7e6bf6b02918c11f75d99d729fab0cc81d0df2c4284a2c4fe6e05")); + /* applying the same PCR values again to the updated digest must give a different, also fixed, result */ assert_se(tpm2_calculate_policy_pcr(v1, ELEMENTSOF(v1), &d) == 0); + assert_se(digest_check(&d, "97e64bcabb64c1fa4b726528644926c8029f5b4458b0575c98c04fe225629a0b")); + + /* second case: reset the digest and use all of PCR 0 to 15 */ digest_init(&d, "0000000000000000000000000000000000000000000000000000000000000000"); + Tpm2PCRValue v2[] = { + TPM2_PCR_VALUE_MAKE( 0, TPM2_ALG_SHA256, dN[ 0]), + TPM2_PCR_VALUE_MAKE( 1, TPM2_ALG_SHA256, dN[ 1]), + TPM2_PCR_VALUE_MAKE( 2, TPM2_ALG_SHA256, dN[ 2]), + TPM2_PCR_VALUE_MAKE( 3, TPM2_ALG_SHA256, dN[ 3]), + TPM2_PCR_VALUE_MAKE( 4, TPM2_ALG_SHA256, dN[ 4]), + TPM2_PCR_VALUE_MAKE( 5, TPM2_ALG_SHA256, dN[ 5]), + TPM2_PCR_VALUE_MAKE( 6, TPM2_ALG_SHA256, dN[ 6]), + TPM2_PCR_VALUE_MAKE( 7, TPM2_ALG_SHA256, dN[ 7]), + TPM2_PCR_VALUE_MAKE( 8, TPM2_ALG_SHA256, dN[ 8]), + TPM2_PCR_VALUE_MAKE( 9, TPM2_ALG_SHA256, dN[ 9]), + TPM2_PCR_VALUE_MAKE(10, TPM2_ALG_SHA256, dN[10]), + TPM2_PCR_VALUE_MAKE(11, TPM2_ALG_SHA256, dN[11]), + TPM2_PCR_VALUE_MAKE(12, TPM2_ALG_SHA256, dN[12]), + TPM2_PCR_VALUE_MAKE(13, TPM2_ALG_SHA256, dN[13]), + TPM2_PCR_VALUE_MAKE(14, TPM2_ALG_SHA256, dN[14]), + TPM2_PCR_VALUE_MAKE(15, TPM2_ALG_SHA256, dN[15]), + }; + assert_se(tpm2_calculate_policy_pcr(v2, ELEMENTSOF(v2), &d) == 0); + assert_se(digest_check(&d, "22be4f1674f792d6345cea9427701068f0e8d9f42755dcc0e927e545a68f9c13")); + assert_se(tpm2_calculate_policy_pcr(v2, ELEMENTSOF(v2), &d) == 0); + assert_se(digest_check(&d, "7481fd1b116078eb3ac2456e4ad542c9b46b9b8eb891335771ca8e7c8f8e4415")); +} + /* Test helper: asserts that template holds the expected RSA SRK settings, namely type TPM2_ALG_RSA, name algorithm SHA256, objectAttributes 0x30472, AES with 128 bit keys in CFB mode as symmetric parameters, a TPM2_ALG_NULL scheme and 2048 key bits. */ +static void check_srk_rsa_template(TPMT_PUBLIC *template) { + assert_se(template->type == TPM2_ALG_RSA); + assert_se(template->nameAlg == TPM2_ALG_SHA256); + assert_se(template->objectAttributes == 0x30472); + assert_se(template->parameters.rsaDetail.symmetric.algorithm == TPM2_ALG_AES); +
assert_se(template->parameters.rsaDetail.symmetric.keyBits.sym == 128); + assert_se(template->parameters.rsaDetail.symmetric.mode.sym == TPM2_ALG_CFB); + assert_se(template->parameters.rsaDetail.scheme.scheme == TPM2_ALG_NULL); + assert_se(template->parameters.rsaDetail.keyBits == 2048); +} + /* Test helper: asserts that template holds the expected ECC SRK settings, namely type TPM2_ALG_ECC, name algorithm SHA256, objectAttributes 0x30472, AES with 128 bit keys in CFB mode as symmetric parameters, TPM2_ALG_NULL for both scheme and kdf, and curve TPM2_ECC_NIST_P256. */ +static void check_srk_ecc_template(TPMT_PUBLIC *template) { + assert_se(template->type == TPM2_ALG_ECC); + assert_se(template->nameAlg == TPM2_ALG_SHA256); + assert_se(template->objectAttributes == 0x30472); + assert_se(template->parameters.eccDetail.symmetric.algorithm == TPM2_ALG_AES); + assert_se(template->parameters.eccDetail.symmetric.keyBits.sym == 128); + assert_se(template->parameters.eccDetail.symmetric.mode.sym == TPM2_ALG_CFB); + assert_se(template->parameters.eccDetail.scheme.scheme == TPM2_ALG_NULL); + assert_se(template->parameters.eccDetail.kdf.scheme == TPM2_ALG_NULL); + assert_se(template->parameters.eccDetail.curveID == TPM2_ECC_NIST_P256); +} + /* Requests the SRK template for RSA and for ECC from tpm2_get_srk_template() and validates each with the matching helper. This test does not take a Tpm2Context. */ +TEST(tpm2_get_srk_template) { + TPMT_PUBLIC template; + + assert_se(tpm2_get_srk_template(TPM2_ALG_RSA, &template) >= 0); + check_srk_rsa_template(&template); + + assert_se(tpm2_get_srk_template(TPM2_ALG_ECC, &template) >= 0); + check_srk_ecc_template(&template); +} + /* Asks tpm2_get_best_srk_template() for a template using context c, asserts the result is either ECC or RSA and validates it with the helper for that type. */ +static void check_best_srk_template(Tpm2Context *c) { + TEST_LOG_FUNC(); + + TPMT_PUBLIC template; + assert_se(tpm2_get_best_srk_template(c, &template) >= 0); + + assert_se(IN_SET(template.type, TPM2_ALG_ECC, TPM2_ALG_RSA)); + + if (template.type == TPM2_ALG_RSA) + check_srk_rsa_template(&template); + else + check_srk_ecc_template(&template); +} + /* Checks tpm2_test_parms() on context c. The same AES 128 CFB parameters are expected to be rejected when passed with TPM2_ALG_CFB as the algorithm and accepted with TPM2_ALG_SYMCIPHER; a copy with keyBits set to 1 is expected to be rejected. */ +static void check_test_parms(Tpm2Context *c) { + assert(c); + + TEST_LOG_FUNC(); + + TPMU_PUBLIC_PARMS parms = { + .symDetail.sym = { + .algorithm = TPM2_ALG_AES, + .keyBits.aes = 128, + .mode.aes = TPM2_ALG_CFB, + }, + }; + + /* Test with invalid parms */ + assert_se(!tpm2_test_parms(c, TPM2_ALG_CFB, &parms)); + + TPMU_PUBLIC_PARMS invalid_parms = parms; + invalid_parms.symDetail.sym.keyBits.aes = 1; +
assert_se(!tpm2_test_parms(c, TPM2_ALG_SYMCIPHER, &invalid_parms)); + + /* Test with valid parms */ + assert_se(tpm2_test_parms(c, TPM2_ALG_SYMCIPHER, &parms)); +} + /* Checks tpm2_supports_alg() on context c: TPM2_ALG_ERROR and TPM2_ALG_LAST + 1 must be reported as unsupported, while RSA, AES and CFB must be reported as supported. */ +static void check_supports_alg(Tpm2Context *c) { + assert(c); + + TEST_LOG_FUNC(); + + /* Test invalid algs */ + assert_se(!tpm2_supports_alg(c, TPM2_ALG_ERROR)); + assert_se(!tpm2_supports_alg(c, TPM2_ALG_LAST + 1)); + + /* Test valid algs */ + assert_se(tpm2_supports_alg(c, TPM2_ALG_RSA)); + assert_se(tpm2_supports_alg(c, TPM2_ALG_AES)); + assert_se(tpm2_supports_alg(c, TPM2_ALG_CFB)); +} + /* Checks tpm2_supports_command() on context c: values with bit 31, bit 30 or bit 20 set must be reported as unsupported, and five basic commands must be reported as supported. */ +static void check_supports_command(Tpm2Context *c) { + assert(c); + + TEST_LOG_FUNC(); + + /* Test invalid commands. TPM specification Part 2 ("Structures") section "TPM_CC (Command Codes)" + * states bits 31:30 and 28:16 are reserved and must be 0. */ + assert_se(!tpm2_supports_command(c, UINT32_C(0x80000000))); + assert_se(!tpm2_supports_command(c, UINT32_C(0x40000000))); + assert_se(!tpm2_supports_command(c, UINT32_C(0x00100000))); + /* the same three high bits again, this time with 0x144 in the low bits */ assert_se(!tpm2_supports_command(c, UINT32_C(0x80000144))); + assert_se(!tpm2_supports_command(c, UINT32_C(0x40000144))); + assert_se(!tpm2_supports_command(c, UINT32_C(0x00100144))); + + /* Test valid commands. We should be able to expect all TPMs support these.
*/ + assert_se(tpm2_supports_command(c, TPM2_CC_Startup)); + assert_se(tpm2_supports_command(c, TPM2_CC_StartAuthSession)); + assert_se(tpm2_supports_command(c, TPM2_CC_Create)); + assert_se(tpm2_supports_command(c, TPM2_CC_CreatePrimary)); + assert_se(tpm2_supports_command(c, TPM2_CC_Unseal)); +} + /* Obtains the SRK once through tpm2_get_or_create_srk() and once through tpm2_get_srk(), asserts that every output is set in both cases, and asserts that public, name and qname are identical between the two calls. */ +static void check_get_or_create_srk(Tpm2Context *c) { + TEST_LOG_FUNC(); + + _cleanup_free_ TPM2B_PUBLIC *public = NULL; + _cleanup_free_ TPM2B_NAME *name = NULL, *qname = NULL; + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + assert_se(tpm2_get_or_create_srk(c, NULL, &public, &name, &qname, &handle) >= 0); + assert_se(public && name && qname && handle); + + _cleanup_free_ TPM2B_PUBLIC *public2 = NULL; + _cleanup_free_ TPM2B_NAME *name2 = NULL, *qname2 = NULL; + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle2 = NULL; + assert_se(tpm2_get_srk(c, NULL, &public2, &name2, &qname2, &handle2) >= 0); + assert_se(public2 && name2 && qname2 && handle2); + + assert_se(memcmp_nn(public, sizeof(*public), public2, sizeof(*public2)) == 0); + assert_se(memcmp_nn(name->name, name->size, name2->name, name2->size) == 0); + assert_se(memcmp_nn(qname->name, qname->size, qname2->name, qname2->size) == 0); +} + /* Round trip helper, only built with OpenSSL 3 or later: seals a generated test string with tpm2_calculate_seal() for the parent given by parent_index and parent_public, unseals the resulting blob with tpm2_unseal() on context c, and asserts the unsealed bytes equal the original string. */ +#if HAVE_OPENSSL && OPENSSL_VERSION_MAJOR >= 3 +static void calculate_seal_and_unseal( + Tpm2Context *c, + TPM2_HANDLE parent_index, + const TPM2B_PUBLIC *parent_public) { + + _cleanup_free_ char *secret_string = NULL; + assert_se(asprintf(&secret_string, "The classified documents are in room %x", parent_index) > 0); + /* the terminating NUL byte is part of the sealed secret */ size_t secret_size = strlen(secret_string) + 1; + + _cleanup_free_ void *blob = NULL; + size_t blob_size = 0; + _cleanup_free_ void *serialized_parent = NULL; + size_t serialized_parent_size; + assert_se(tpm2_calculate_seal( + parent_index, + parent_public, + /* attributes= */ NULL, + secret_string, secret_size, + /* policy= */ NULL, + /* pin= */ NULL, + /* ret_secret= */ NULL, /* ret_secret_size= */ 0, + &blob, &blob_size, + &serialized_parent,
&serialized_parent_size) >= 0); + + _cleanup_free_ void *unsealed_secret = NULL; + size_t unsealed_secret_size; + assert_se(tpm2_unseal( + c, + /* hash_pcr_mask= */ 0, + /* pcr_bank= */ 0, + /* pubkey= */ NULL, /* pubkey_size= */ 0, + /* pubkey_pcr_mask= */ 0, + /* signature= */ NULL, + /* pin= */ NULL, + /* pcrlock_policy= */ NULL, + /* primary_alg= */ 0, + blob, blob_size, + /* known_policy_hash= */ NULL, /* known_policy_hash_size= */ 0, + serialized_parent, serialized_parent_size, + &unsealed_secret, &unsealed_secret_size) >= 0); + + assert_se(memcmp_nn(secret_string, secret_size, unsealed_secret, unsealed_secret_size) == 0); + + char unsealed_string[unsealed_secret_size]; + /* the formatted length excludes the NUL byte, hence size minus one */ assert_se(snprintf(unsealed_string, unsealed_secret_size, "%s", (char*) unsealed_secret) == (int) unsealed_secret_size - 1); + log_debug("Unsealed secret is: %s", unsealed_string); +} + /* Runs calculate_seal_and_unseal() against the SRK and then against a primary key created from the SRK template of each of RSA and ECC. Returns 0 on completion. Returns 0 without testing when no virtualization is detected and slow tests are disabled, and returns the log_tests_skipped() result when tpm2_index_from_handle() fails with -EOPNOTSUPP. */ +static int check_calculate_seal(Tpm2Context *c) { + assert(c); + int r; + + if (detect_virtualization() == VIRTUALIZATION_NONE && !slow_tests_enabled()) { + log_notice("Skipping slow calculate seal TPM2 tests. Physical system detected, and slow tests disabled."); + return 0; + } + + TEST_LOG_FUNC(); + + _cleanup_free_ TPM2B_PUBLIC *srk_public = NULL; + assert_se(tpm2_get_srk(c, NULL, &srk_public, NULL, NULL, NULL) >= 0); + calculate_seal_and_unseal(c, TPM2_SRK_HANDLE, srk_public); + + TPMI_ALG_ASYM test_algs[] = { TPM2_ALG_RSA, TPM2_ALG_ECC, }; + for (unsigned i = 0; i < ELEMENTSOF(test_algs); i++) { + TPMI_ALG_ASYM alg = test_algs[i]; + + TPM2B_PUBLIC template = { .size = sizeof(TPMT_PUBLIC), }; + assert_se(tpm2_get_srk_template(alg, &template.publicArea) >= 0); + + _cleanup_free_ TPM2B_PUBLIC *public = NULL; + _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL; + assert_se(tpm2_create_primary(c, NULL, &template, NULL, &public, &handle) >= 0); + + /* Once our minimum libtss2-esys version is 2.4.0 or later, this can assume + * tpm2_index_from_handle() should always work.
*/ + TPM2_HANDLE index; + r = tpm2_index_from_handle(c, handle, &index); + if (r == -EOPNOTSUPP) + return log_tests_skipped("libtss2-esys version too old to support tpm2_index_from_handle()"); + assert_se(r >= 0); + + calculate_seal_and_unseal(c, index, public); + } + + return 0; +} +#endif /* HAVE_OPENSSL && OPENSSL_VERSION_MAJOR >= 3 */ + /* Round trip helper: lets tpm2_seal() produce a secret and a sealed blob for the given handle on context c, unseals the blob again with tpm2_unseal() and asserts the unsealed bytes equal the secret returned by tpm2_seal(). */ +static void check_seal_unseal_for_handle(Tpm2Context *c, TPM2_HANDLE handle) { + TPM2B_DIGEST policy = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE); + + assert(c); + + log_debug("Check seal/unseal for handle 0x%" PRIx32, handle); + + _cleanup_free_ void *secret = NULL, *blob = NULL, *srk = NULL, *unsealed_secret = NULL; + size_t secret_size, blob_size, srk_size, unsealed_secret_size; + assert_se(tpm2_seal( + c, + handle, + &policy, + /* pin= */ NULL, + &secret, &secret_size, + &blob, &blob_size, + /* ret_primary_alg= */ NULL, + &srk, &srk_size) >= 0); + + assert_se(tpm2_unseal( + c, + /* hash_pcr_mask= */ 0, + /* pcr_bank= */ 0, + /* pubkey= */ NULL, /* pubkey_size= */ 0, + /* pubkey_pcr_mask= */ 0, + /* signature= */ NULL, + /* pin= */ NULL, + /* pcrlock_policy= */ NULL, + /* primary_alg= */ 0, + blob, blob_size, + /* policy_hash= */ NULL, /* policy_hash_size= */ 0, + srk, srk_size, + &unsealed_secret, &unsealed_secret_size) >= 0); + + assert_se(memcmp_nn(secret, secret_size, unsealed_secret, unsealed_secret_size) == 0); +} + /* Runs check_seal_unseal_for_handle() for handle 0, for TPM2_SRK_HANDLE and for a primary key created from every entry of test_templates. Returns early without testing when no virtualization is detected and slow tests are disabled, and stops when tpm2_index_from_handle() fails with -EOPNOTSUPP. */ +static void check_seal_unseal(Tpm2Context *c) { + int r; + + assert(c); + + if (detect_virtualization() == VIRTUALIZATION_NONE && !slow_tests_enabled()) { + log_notice("Skipping slow seal/unseal TPM2 tests.
Physical system detected, and slow tests disabled."); + return; + } + + TEST_LOG_FUNC(); + + check_seal_unseal_for_handle(c, 0); + check_seal_unseal_for_handle(c, TPM2_SRK_HANDLE); + + FOREACH_ARRAY(template, test_templates, ELEMENTSOF(test_templates)) { + TPM2B_PUBLIC public = { + .publicArea = **template, + .size = sizeof(**template), + }; + _cleanup_(tpm2_handle_freep) Tpm2Handle *transient_handle = NULL; + assert_se(tpm2_create_primary( + c, + /* session= */ NULL, + &public, + /* sensitive= */ NULL, + /* ret_public= */ NULL, + &transient_handle) >= 0); + + TPMI_DH_PERSISTENT transient_handle_index; + r = tpm2_index_from_handle(c, transient_handle, &transient_handle_index); + if (r == -EOPNOTSUPP) { + /* libesys too old */ + log_tests_skipped("libesys too old for tpm2_index_from_handle"); + return; + } + assert_se(r >= 0); + + check_seal_unseal_for_handle(c, transient_handle_index); + } +} + /* Entry point for all checks that need a Tpm2Context. Creates the context with tpm2_context_new() and returns the log_tests_skipped() result when that fails; otherwise runs the checks in order. Returns the result of check_calculate_seal() when built with OpenSSL 3 or later, and 0 otherwise. */ +TEST_RET(tests_which_require_tpm) { + _cleanup_(tpm2_context_unrefp) Tpm2Context *c = NULL; + int r = 0; + + if (tpm2_context_new(NULL, &c) < 0) + return log_tests_skipped("Could not find TPM"); + + check_test_parms(c); + check_supports_alg(c); + check_supports_command(c); + check_best_srk_template(c); + check_get_or_create_srk(c); + check_seal_unseal(c); + +#if HAVE_OPENSSL && OPENSSL_VERSION_MAJOR >= 3 /* calculating sealed object requires openssl >= 3 */ + r = check_calculate_seal(c); +#endif + + return r; +} + +#endif /* HAVE_TPM2 */ + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-udev-util.c b/src/test/test-udev-util.c new file mode 100644 index 0000000..cb80c69 --- /dev/null +++ b/src/test/test-udev-util.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "macro.h" +#include "string-util.h" +#include "tests.h" +#include "udev-util.h" + /* Test helper: calls udev_replace_whitespace() on str with length limit len, writing into a freshly allocated buffer of len + 1 bytes, then asserts that the return value equals strlen(expected) and that the buffer holds exactly expected. */ +static void test_udev_replace_whitespace_one_len(const char *str, size_t len, const char *expected) { + _cleanup_free_ char *result = NULL; + int r; + + result = new(char,
len + 1); + assert_se(result); + r = udev_replace_whitespace(str, result, len); + assert_se((size_t) r == strlen(expected)); + assert_se(streq(result, expected)); +} + /* Convenience wrapper around test_udev_replace_whitespace_one_len() that uses strlen(str) as the length limit, i.e. processes the whole input. */ +static void test_udev_replace_whitespace_one(const char *str, const char *expected) { + test_udev_replace_whitespace_one_len(str, strlen(str), expected); +} + /* Table driven test for udev_replace_whitespace(): first whole strings, then the same inputs with the length limit lowered one step at a time down to 0, to pin down how truncated input is handled. */ +TEST(udev_replace_whitespace) { + test_udev_replace_whitespace_one("hogehoge", "hogehoge"); + test_udev_replace_whitespace_one("hoge hoge", "hoge_hoge"); + test_udev_replace_whitespace_one(" hoge hoge ", "hoge_hoge"); + test_udev_replace_whitespace_one(" ", ""); + test_udev_replace_whitespace_one("hoge ", "hoge"); + + test_udev_replace_whitespace_one_len("hoge hoge ", 9, "hoge_hoge"); + test_udev_replace_whitespace_one_len("hoge hoge ", 8, "hoge_hog"); + test_udev_replace_whitespace_one_len("hoge hoge ", 7, "hoge_ho"); + test_udev_replace_whitespace_one_len("hoge hoge ", 6, "hoge_h"); + test_udev_replace_whitespace_one_len("hoge hoge ", 5, "hoge"); + test_udev_replace_whitespace_one_len("hoge hoge ", 4, "hoge"); + test_udev_replace_whitespace_one_len("hoge hoge ", 3, "hog"); + test_udev_replace_whitespace_one_len("hoge hoge ", 2, "ho"); + test_udev_replace_whitespace_one_len("hoge hoge ", 1, "h"); + test_udev_replace_whitespace_one_len("hoge hoge ", 0, ""); + + test_udev_replace_whitespace_one_len(" hoge hoge ", 16, "hoge_hoge"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 15, "hoge_hoge"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 14, "hoge_hog"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 13, "hoge_ho"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 12, "hoge_h"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 11, "hoge"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 10, "hoge"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 9, "hoge"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 8, "hoge"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 7, "hog"); +
test_udev_replace_whitespace_one_len(" hoge hoge ", 6, "ho"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 5, "h"); + test_udev_replace_whitespace_one_len(" hoge hoge ", 4, ""); + test_udev_replace_whitespace_one_len(" hoge hoge ", 3, ""); + test_udev_replace_whitespace_one_len(" hoge hoge ", 2, ""); + test_udev_replace_whitespace_one_len(" hoge hoge ", 1, ""); + test_udev_replace_whitespace_one_len(" hoge hoge ", 0, ""); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-uid-alloc-range.c b/src/test/test-uid-alloc-range.c new file mode 100644 index 0000000..cd06463 --- /dev/null +++ b/src/test/test-uid-alloc-range.c @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "uid-alloc-range.h" + /* Test helper: parses the file at path with read_login_defs() and logs the four resulting limits. When path is NULL a temporary file is generated that sets each limit to its compiled-in value plus 5, and that file is parsed instead; the parsed values are then expected to carry the offset only if ENABLE_COMPAT_MUTABLE_UID_BOUNDARIES is set. For /dev/null the compiled-in values are expected. For any other path nothing beyond successful parsing is asserted. */ +static void test_read_login_defs_one(const char *path) { + log_info("/* %s(\"%s\") */", __func__, path ?: ""); + + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-user-record.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + if (!path) { + assert_se(fmkostemp_safe(name, "r+", &f) == 0); + fprintf(f, + "SYS_UID_MIN "UID_FMT"\n" + "SYS_UID_MAX "UID_FMT"\n" + "SYS_GID_MIN "GID_FMT"\n" + "SYS_GID_MAX "GID_FMT"\n", + (uid_t) (SYSTEM_ALLOC_UID_MIN + 5), + (uid_t) (SYSTEM_UID_MAX + 5), + (gid_t) (SYSTEM_ALLOC_GID_MIN + 5), + (gid_t) (SYSTEM_GID_MAX + 5)); + assert_se(fflush_and_check(f) >= 0); + } + + UGIDAllocationRange defs; + /* falls back to the generated temporary file when no path was given */ assert_se(read_login_defs(&defs, path ?: name, NULL) >= 0); + + log_info("system_alloc_uid_min="UID_FMT, defs.system_alloc_uid_min); + log_info("system_uid_max="UID_FMT, defs.system_uid_max); + log_info("system_alloc_gid_min="GID_FMT, defs.system_alloc_gid_min); + log_info("system_gid_max="GID_FMT, defs.system_gid_max); + + if (!path) { + uid_t offset = ENABLE_COMPAT_MUTABLE_UID_BOUNDARIES ?
5 : 0; + assert_se(defs.system_alloc_uid_min == SYSTEM_ALLOC_UID_MIN + offset); + assert_se(defs.system_uid_max == SYSTEM_UID_MAX + offset); + assert_se(defs.system_alloc_gid_min == SYSTEM_ALLOC_GID_MIN + offset); + assert_se(defs.system_gid_max == SYSTEM_GID_MAX + offset); + } else if (streq(path, "/dev/null")) { + assert_se(defs.system_alloc_uid_min == SYSTEM_ALLOC_UID_MIN); + assert_se(defs.system_uid_max == SYSTEM_UID_MAX); + assert_se(defs.system_alloc_gid_min == SYSTEM_ALLOC_GID_MIN); + assert_se(defs.system_gid_max == SYSTEM_GID_MAX); + } +} + /* Runs test_read_login_defs_one() for /dev/null, for /etc/login.defs and for a generated temporary file (path NULL). */ +TEST(read_login_defs) { + test_read_login_defs_one("/dev/null"); + test_read_login_defs_one("/etc/login.defs"); + test_read_login_defs_one(NULL); +} + /* Asserts that acquire_ugid_allocation_range() returns a non-NULL pointer and logs the four limits it holds. The values themselves are not checked. */ +TEST(acquire_ugid_allocation_range) { + const UGIDAllocationRange *defs; + assert_se(defs = acquire_ugid_allocation_range()); + + log_info("system_alloc_uid_min="UID_FMT, defs->system_alloc_uid_min); + log_info("system_uid_max="UID_FMT, defs->system_uid_max); + log_info("system_alloc_gid_min="GID_FMT, defs->system_alloc_gid_min); + log_info("system_gid_max="GID_FMT, defs->system_gid_max); +} + /* Logs the result of uid_is_system() for UID 0, UID 999 and the UID of the calling process. Nothing is asserted; the test only verifies the calls complete. */ +TEST(uid_is_system) { + uid_t uid = 0; + log_info("uid_is_system("UID_FMT") = %s", uid, yes_no(uid_is_system(uid))); + + uid = 999; + log_info("uid_is_system("UID_FMT") = %s", uid, yes_no(uid_is_system(uid))); + + uid = getuid(); + log_info("uid_is_system("UID_FMT") = %s", uid, yes_no(uid_is_system(uid))); +} + /* Logs the result of gid_is_system() for GID 0, GID 999 and the GID of the calling process. Nothing is asserted; the test only verifies the calls complete. */ +TEST(gid_is_system) { + gid_t gid = 0; + log_info("gid_is_system("GID_FMT") = %s", gid, yes_no(gid_is_system(gid))); + + gid = 999; + log_info("gid_is_system("GID_FMT") = %s", gid, yes_no(gid_is_system(gid))); + + gid = getgid(); + log_info("gid_is_system("GID_FMT") = %s", gid, yes_no(gid_is_system(gid))); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-uid-range.c b/src/test/test-uid-range.c new file mode 100644 index 0000000..186f6ee --- /dev/null +++ b/src/test/test-uid-range.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +
+#include + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "uid-range.h" +#include "user-util.h" +#include "virt.h" + /* Exercises uid_range_add_str(), uid_range_contains(), uid_range_covers() and uid_range_next_lower() on one UidRange that is grown step by step, checking after each addition how many entries exist and where they start and how long they are. */ +TEST(uid_range) { + _cleanup_(uid_range_freep) UidRange *p = NULL; + uid_t search; + + /* with p still NULL only a request of length 0 is covered */ assert_se(uid_range_covers(p, 0, 0)); + assert_se(!uid_range_covers(p, 0, 1)); + assert_se(!uid_range_covers(p, 100, UINT32_MAX)); + + assert_se(uid_range_add_str(&p, "500-999") >= 0); + assert_se(p); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 500); + assert_se(p->entries[0].nr == 500); + + assert_se(!uid_range_contains(p, 499)); + assert_se(uid_range_contains(p, 500)); + assert_se(uid_range_contains(p, 999)); + assert_se(!uid_range_contains(p, 1000)); + + assert_se(!uid_range_covers(p, 100, 150)); + assert_se(!uid_range_covers(p, 400, 200)); + assert_se(!uid_range_covers(p, 499, 1)); + assert_se(uid_range_covers(p, 500, 1)); + assert_se(uid_range_covers(p, 501, 10)); + assert_se(uid_range_covers(p, 999, 1)); + assert_se(!uid_range_covers(p, 999, 2)); + assert_se(!uid_range_covers(p, 1000, 1)); + assert_se(!uid_range_covers(p, 1000, 100)); + assert_se(!uid_range_covers(p, 1001, 100)); + + /* starting from UID_INVALID the first result is the highest UID in the range */ search = UID_INVALID; + assert_se(uid_range_next_lower(p, &search)); + assert_se(search == 999); + assert_se(uid_range_next_lower(p, &search)); + assert_se(search == 998); + search = 501; + assert_se(uid_range_next_lower(p, &search)); + assert_se(search == 500); + /* nothing below the lowest UID of the range is left */ assert_se(uid_range_next_lower(p, &search) == -EBUSY); + + /* 1000 directly follows 500-999, so the existing entry grows by one */ assert_se(uid_range_add_str(&p, "1000") >= 0); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 500); + assert_se(p->entries[0].nr == 501); + + /* a disjoint lower range becomes a separate entry placed first */ assert_se(uid_range_add_str(&p, "30-40") >= 0); + assert_se(p->n_entries == 2); + assert_se(p->entries[0].start == 30); + assert_se(p->entries[0].nr == 11); + assert_se(p->entries[1].start == 500); + assert_se(p->entries[1].nr == 501); + +
assert_se(uid_range_add_str(&p, "60-70") >= 0); + assert_se(p->n_entries == 3); + assert_se(p->entries[0].start == 30); + assert_se(p->entries[0].nr == 11); + assert_se(p->entries[1].start == 60); + assert_se(p->entries[1].nr == 11); + assert_se(p->entries[2].start == 500); + assert_se(p->entries[2].nr == 501); + + assert_se(uid_range_add_str(&p, "20-2000") >= 0); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 20); + assert_se(p->entries[0].nr == 1981); + + assert_se(uid_range_add_str(&p, "2002") >= 0); + assert_se(p->n_entries == 2); + assert_se(p->entries[0].start == 20); + assert_se(p->entries[0].nr == 1981); + assert_se(p->entries[1].start == 2002); + assert_se(p->entries[1].nr == 1); + + assert_se(uid_range_add_str(&p, "2001") >= 0); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 20); + assert_se(p->entries[0].nr == 1983); +} + +TEST(load_userns) { + _cleanup_(uid_range_freep) UidRange *p = NULL; + _cleanup_(unlink_and_freep) char *fn = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + r = uid_range_load_userns(&p, NULL); + if (r < 0 && ERRNO_IS_NOT_SUPPORTED(r)) + return; + + assert_se(r >= 0); + assert_se(uid_range_contains(p, getuid())); + + r = running_in_userns(); + if (r == 0) { + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 0); + assert_se(p->entries[0].nr == UINT32_MAX); + + assert_se(uid_range_covers(p, 0, UINT32_MAX)); + } + + assert_se(fopen_temporary_child(NULL, &f, &fn) >= 0); + fputs("0 0 20\n" + "100 0 20\n", f); + assert_se(fflush_and_check(f) >= 0); + + p = uid_range_free(p); + + assert_se(uid_range_load_userns(&p, fn) >= 0); + + assert_se(uid_range_contains(p, 0)); + assert_se(uid_range_contains(p, 19)); + assert_se(!uid_range_contains(p, 20)); + + assert_se(!uid_range_contains(p, 99)); + assert_se(uid_range_contains(p, 100)); + assert_se(uid_range_contains(p, 119)); + assert_se(!uid_range_contains(p, 120)); +} + +TEST(uid_range_coalesce) { + _cleanup_(uid_range_freep) UidRange 
*p = NULL; + + for (size_t i = 0; i < 10; i++) { + assert_se(uid_range_add_internal(&p, i * 10, 10, /* coalesce = */ false) >= 0); + assert_se(uid_range_add_internal(&p, i * 10 + 5, 10, /* coalesce = */ false) >= 0); + } + + assert_se(uid_range_add_internal(&p, 100, 1, /* coalesce = */ true) >= 0); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 0); + assert_se(p->entries[0].nr == 105); + + p = uid_range_free(p); + + for (size_t i = 0; i < 10; i++) { + assert_se(uid_range_add_internal(&p, (10 - i) * 10, 10, /* coalesce = */ false) >= 0); + assert_se(uid_range_add_internal(&p, (10 - i) * 10 + 5, 10, /* coalesce = */ false) >= 0); + } + + assert_se(uid_range_add_internal(&p, 100, 1, /* coalesce = */ true) >= 0); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 10); + assert_se(p->entries[0].nr == 105); + + p = uid_range_free(p); + + for (size_t i = 0; i < 10; i++) { + assert_se(uid_range_add_internal(&p, i * 10, 10, /* coalesce = */ false) >= 0); + assert_se(uid_range_add_internal(&p, i * 10 + 5, 10, /* coalesce = */ false) >= 0); + assert_se(uid_range_add_internal(&p, (10 - i) * 10, 10, /* coalesce = */ false) >= 0); + assert_se(uid_range_add_internal(&p, (10 - i) * 10 + 5, 10, /* coalesce = */ false) >= 0); + } + assert_se(uid_range_add_internal(&p, 100, 1, /* coalesce = */ true) >= 0); + assert_se(p->n_entries == 1); + assert_se(p->entries[0].start == 0); + assert_se(p->entries[0].nr == 115); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-umask-util.c b/src/test/test-umask-util.c new file mode 100644 index 0000000..8316dfb --- /dev/null +++ b/src/test/test-umask-util.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "process-util.h" +#include "tests.h" +#include "umask-util.h" + +int main(int argc, char *argv[]) { + size_t n; + mode_t u, t; + + test_setup_logging(LOG_DEBUG); + + u = umask(0111); + + n = 0; + WITH_UMASK(0123) { + assert_se(umask(000) == 0123); + n++; + + 
assert_se(get_process_umask(0, &t) == 0); + assert_se(t == 000); + } + + assert_se(n == 1); + assert_se(umask(u) == 0111); + + assert_se(get_process_umask(getpid_cached(), &t) == 0); + assert_se(t == u); + + WITH_UMASK(0135) { + assert_se(umask(000) == 0135); + n++; + + assert_se(get_process_umask(0, &t) == 0); + assert_se(t == 000); + } + + assert_se(n == 2); + assert_se(umask(0111) == u); + + assert_se(get_process_umask(0, &t) == 0); + assert_se(t == 0111); + + WITH_UMASK(0315) { + assert_se(umask(000) == 0315); + n++; + break; + } + + assert_se(n == 3); + assert_se(umask(u) == 0111); + + assert_se(get_process_umask(0, &t) == 0); + assert_se(t == u); + + return EXIT_SUCCESS; +} diff --git a/src/test/test-unaligned.c b/src/test/test-unaligned.c new file mode 100644 index 0000000..728c193 --- /dev/null +++ b/src/test/test-unaligned.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "memory-util.h" +#include "sparse-endian.h" +#include "tests.h" +#include "unaligned.h" + +static uint8_t data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, +}; + +TEST(be) { + uint8_t scratch[16]; + + assert_se(unaligned_read_be16(&data[0]) == 0x0001); + assert_se(unaligned_read_be16(&data[1]) == 0x0102); + + assert_se(unaligned_read_be32(&data[0]) == 0x00010203); + assert_se(unaligned_read_be32(&data[1]) == 0x01020304); + assert_se(unaligned_read_be32(&data[2]) == 0x02030405); + assert_se(unaligned_read_be32(&data[3]) == 0x03040506); + + assert_se(unaligned_read_be64(&data[0]) == 0x0001020304050607); + assert_se(unaligned_read_be64(&data[1]) == 0x0102030405060708); + assert_se(unaligned_read_be64(&data[2]) == 0x0203040506070809); + assert_se(unaligned_read_be64(&data[3]) == 0x030405060708090a); + assert_se(unaligned_read_be64(&data[4]) == 0x0405060708090a0b); + assert_se(unaligned_read_be64(&data[5]) == 0x05060708090a0b0c); + assert_se(unaligned_read_be64(&data[6]) == 0x060708090a0b0c0d); + 
assert_se(unaligned_read_be64(&data[7]) == 0x0708090a0b0c0d0e); + + zero(scratch); + unaligned_write_be16(&scratch[0], 0x0001); + assert_se(memcmp(&scratch[0], &data[0], sizeof(uint16_t)) == 0); + zero(scratch); + unaligned_write_be16(&scratch[1], 0x0102); + assert_se(memcmp(&scratch[1], &data[1], sizeof(uint16_t)) == 0); + + zero(scratch); + unaligned_write_be32(&scratch[0], 0x00010203); + assert_se(memcmp(&scratch[0], &data[0], sizeof(uint32_t)) == 0); + zero(scratch); + unaligned_write_be32(&scratch[1], 0x01020304); + assert_se(memcmp(&scratch[1], &data[1], sizeof(uint32_t)) == 0); + zero(scratch); + unaligned_write_be32(&scratch[2], 0x02030405); + assert_se(memcmp(&scratch[2], &data[2], sizeof(uint32_t)) == 0); + zero(scratch); + unaligned_write_be32(&scratch[3], 0x03040506); + assert_se(memcmp(&scratch[3], &data[3], sizeof(uint32_t)) == 0); + + zero(scratch); + unaligned_write_be64(&scratch[0], 0x0001020304050607); + assert_se(memcmp(&scratch[0], &data[0], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[1], 0x0102030405060708); + assert_se(memcmp(&scratch[1], &data[1], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[2], 0x0203040506070809); + assert_se(memcmp(&scratch[2], &data[2], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[3], 0x030405060708090a); + assert_se(memcmp(&scratch[3], &data[3], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[4], 0x0405060708090a0b); + assert_se(memcmp(&scratch[4], &data[4], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[5], 0x05060708090a0b0c); + assert_se(memcmp(&scratch[5], &data[5], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[6], 0x060708090a0b0c0d); + assert_se(memcmp(&scratch[6], &data[6], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_be64(&scratch[7], 0x0708090a0b0c0d0e); + assert_se(memcmp(&scratch[7], &data[7], sizeof(uint64_t)) == 0); +} + 
+TEST(le) { + uint8_t scratch[16]; + + assert_se(unaligned_read_le16(&data[0]) == 0x0100); + assert_se(unaligned_read_le16(&data[1]) == 0x0201); + + assert_se(unaligned_read_le32(&data[0]) == 0x03020100); + assert_se(unaligned_read_le32(&data[1]) == 0x04030201); + assert_se(unaligned_read_le32(&data[2]) == 0x05040302); + assert_se(unaligned_read_le32(&data[3]) == 0x06050403); + + assert_se(unaligned_read_le64(&data[0]) == 0x0706050403020100); + assert_se(unaligned_read_le64(&data[1]) == 0x0807060504030201); + assert_se(unaligned_read_le64(&data[2]) == 0x0908070605040302); + assert_se(unaligned_read_le64(&data[3]) == 0x0a09080706050403); + assert_se(unaligned_read_le64(&data[4]) == 0x0b0a090807060504); + assert_se(unaligned_read_le64(&data[5]) == 0x0c0b0a0908070605); + assert_se(unaligned_read_le64(&data[6]) == 0x0d0c0b0a09080706); + assert_se(unaligned_read_le64(&data[7]) == 0x0e0d0c0b0a090807); + + zero(scratch); + unaligned_write_le16(&scratch[0], 0x0100); + assert_se(memcmp(&scratch[0], &data[0], sizeof(uint16_t)) == 0); + zero(scratch); + unaligned_write_le16(&scratch[1], 0x0201); + assert_se(memcmp(&scratch[1], &data[1], sizeof(uint16_t)) == 0); + + zero(scratch); + unaligned_write_le32(&scratch[0], 0x03020100); + + assert_se(memcmp(&scratch[0], &data[0], sizeof(uint32_t)) == 0); + zero(scratch); + unaligned_write_le32(&scratch[1], 0x04030201); + assert_se(memcmp(&scratch[1], &data[1], sizeof(uint32_t)) == 0); + zero(scratch); + unaligned_write_le32(&scratch[2], 0x05040302); + assert_se(memcmp(&scratch[2], &data[2], sizeof(uint32_t)) == 0); + zero(scratch); + unaligned_write_le32(&scratch[3], 0x06050403); + assert_se(memcmp(&scratch[3], &data[3], sizeof(uint32_t)) == 0); + + zero(scratch); + unaligned_write_le64(&scratch[0], 0x0706050403020100); + assert_se(memcmp(&scratch[0], &data[0], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_le64(&scratch[1], 0x0807060504030201); + assert_se(memcmp(&scratch[1], &data[1], sizeof(uint64_t)) == 0); + 
zero(scratch); + unaligned_write_le64(&scratch[2], 0x0908070605040302); + assert_se(memcmp(&scratch[2], &data[2], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_le64(&scratch[3], 0x0a09080706050403); + assert_se(memcmp(&scratch[3], &data[3], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_le64(&scratch[4], 0x0B0A090807060504); + assert_se(memcmp(&scratch[4], &data[4], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_le64(&scratch[5], 0x0c0b0a0908070605); + assert_se(memcmp(&scratch[5], &data[5], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_le64(&scratch[6], 0x0d0c0b0a09080706); + assert_se(memcmp(&scratch[6], &data[6], sizeof(uint64_t)) == 0); + zero(scratch); + unaligned_write_le64(&scratch[7], 0x0e0d0c0b0a090807); + assert_se(memcmp(&scratch[7], &data[7], sizeof(uint64_t)) == 0); +} + +TEST(ne) { + uint16_t x = 4711; + uint32_t y = 123456; + uint64_t z = 9876543210; + + /* Note that we don't bother actually testing alignment issues in this function, after all the _ne() functions + * are just aliases for the _le() or _be() implementations, which we test extensively above. Hence, in this + * function, just ensure that they map to the right version on the local architecture. 
*/ + + assert_se(unaligned_read_ne16(&x) == 4711); + assert_se(unaligned_read_ne32(&y) == 123456); + assert_se(unaligned_read_ne64(&z) == 9876543210); + + unaligned_write_ne16(&x, 1); + unaligned_write_ne32(&y, 2); + unaligned_write_ne64(&z, 3); + + assert_se(x == 1); + assert_se(y == 2); + assert_se(z == 3); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-unit-file.c b/src/test/test-unit-file.c new file mode 100644 index 0000000..9f8787b --- /dev/null +++ b/src/test/test-unit-file.c @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "initrd-util.h" +#include "path-lookup.h" +#include "set.h" +#include "special.h" +#include "strv.h" +#include "tests.h" +#include "unit-file.h" + +TEST(unit_validate_alias_symlink_and_warn) { + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a.service", "/other/b.service") == 0); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a.service", "/other/b.socket") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a.service", "/other/b.foobar") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@.service", "/other/b@.service") == 0); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@.service", "/other/b@.socket") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@XXX.service", "/other/b@YYY.service") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@XXX.service", "/other/b@YYY.socket") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@.service", "/other/b@YYY.service") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@XXX.service", "/other/b@XXX.service") == 0); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@XXX.service", "/other/b@.service") == 0); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@.service", "/other/b.service") == -EXDEV); + 
assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a.service", "/other/b@.service") == -EXDEV); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a@.slice", "/other/b.slice") == -EINVAL); + assert_se(unit_validate_alias_symlink_or_warn(LOG_INFO, "/path/a.slice", "/other/b.slice") == -EINVAL); +} + +TEST(unit_file_build_name_map) { + _cleanup_(lookup_paths_free) LookupPaths lp = {}; + _cleanup_hashmap_free_ Hashmap *unit_ids = NULL; + _cleanup_hashmap_free_ Hashmap *unit_names = NULL; + const char *k, *dst; + char **v, **ids; + usec_t mtime = 0; + int r; + + ids = strv_skip(saved_argv, 1); + + assert_se(lookup_paths_init(&lp, RUNTIME_SCOPE_SYSTEM, 0, NULL) >= 0); + + assert_se(unit_file_build_name_map(&lp, &mtime, &unit_ids, &unit_names, NULL) == 1); + + HASHMAP_FOREACH_KEY(dst, k, unit_ids) + log_info("ids: %s → %s", k, dst); + + HASHMAP_FOREACH_KEY(v, k, unit_names) { + _cleanup_free_ char *j = strv_join(v, ", "); + log_info("aliases: %s ← %s", k, j); + } + + char buf[FORMAT_TIMESTAMP_MAX]; + log_debug("Last modification time: %s", format_timestamp(buf, sizeof buf, mtime)); + + r = unit_file_build_name_map(&lp, &mtime, &unit_ids, &unit_names, NULL); + assert_se(IN_SET(r, 0, 1)); + if (r == 0) + log_debug("Cache rebuild skipped based on mtime."); + + STRV_FOREACH(id, ids) { + const char *fragment, *name; + _cleanup_set_free_free_ Set *names = NULL; + log_info("*** %s ***", *id); + r = unit_file_find_fragment(unit_ids, + unit_names, + *id, + &fragment, + &names); + assert_se(r == 0); + log_info("fragment: %s", fragment); + log_info("names:"); + SET_FOREACH(name, names) + log_info(" %s", name); + } + + /* Make sure everything still works if we don't collect names. 
*/ + STRV_FOREACH(id, ids) { + const char *fragment; + log_info("*** %s ***", *id); + r = unit_file_find_fragment(unit_ids, + unit_names, + *id, + &fragment, + NULL); + assert_se(r == 0); + log_info("fragment: %s", fragment); + } +} + +TEST(runlevel_to_target) { /* With initrd mode forced off, "3" resolves to the multi-user target and "rd.rescue" to NULL; with it forced on, "rd.rescue" resolves to the rescue target and "3" to NULL. NULL and unknown names give NULL in both modes. */ + in_initrd_force(false); + assert_se(streq_ptr(runlevel_to_target(NULL), NULL)); + assert_se(streq_ptr(runlevel_to_target("unknown-runlevel"), NULL)); + assert_se(streq_ptr(runlevel_to_target("rd.unknown-runlevel"), NULL)); + assert_se(streq_ptr(runlevel_to_target("3"), SPECIAL_MULTI_USER_TARGET)); + assert_se(streq_ptr(runlevel_to_target("rd.rescue"), NULL)); + + in_initrd_force(true); + assert_se(streq_ptr(runlevel_to_target(NULL), NULL)); + assert_se(streq_ptr(runlevel_to_target("unknown-runlevel"), NULL)); + assert_se(streq_ptr(runlevel_to_target("rd.unknown-runlevel"), NULL)); + assert_se(streq_ptr(runlevel_to_target("3"), NULL)); + assert_se(streq_ptr(runlevel_to_target("rd.rescue"), SPECIAL_RESCUE_TARGET)); +} + +static int intro(void) { /* Intro hook handed to DEFINE_TEST_MAIN_WITH_INTRO below: calls log_show_color(true) and reports success. */ + log_show_color(true); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-unit-name.c b/src/test/test-unit-name.c new file mode 100644 index 0000000..8e9332c --- /dev/null +++ b/src/test/test-unit-name.c @@ -0,0 +1,1009 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "all-units.h" +#include "glob-util.h" +#include "format-util.h" +#include "hostname-util.h" +#include "macro.h" +#include "manager.h" +#include "path-util.h" +#include "rm-rf.h" +#include "special.h" +#include "specifier.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "unit-def.h" +#include "unit-name.h" +#include "unit-printf.h" +#include "unit.h" +#include "user-util.h" + +static char *runtime_dir = NULL; + +STATIC_DESTRUCTOR_REGISTER(runtime_dir, rm_rf_physical_and_freep); + +static void test_unit_name_is_valid_one(const 
char *name, UnitNameFlags flags, bool expected) { + log_info("%s ( %s%s%s ): %s", + name, + (flags & UNIT_NAME_PLAIN) ? "plain" : "", + (flags & UNIT_NAME_INSTANCE) ? " instance" : "", + (flags & UNIT_NAME_TEMPLATE) ? " template" : "", + yes_no(expected)); + assert_se(unit_name_is_valid(name, flags) == expected); +} + +TEST(unit_name_is_valid) { + test_unit_name_is_valid_one("foo.service", UNIT_NAME_ANY, true); + test_unit_name_is_valid_one("foo.service", UNIT_NAME_PLAIN, true); + test_unit_name_is_valid_one("foo.service", UNIT_NAME_INSTANCE, false); + test_unit_name_is_valid_one("foo.service", UNIT_NAME_TEMPLATE, false); + test_unit_name_is_valid_one("foo.service", UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE, false); + + test_unit_name_is_valid_one("foo@bar.service", UNIT_NAME_ANY, true); + test_unit_name_is_valid_one("foo@bar.service", UNIT_NAME_PLAIN, false); + test_unit_name_is_valid_one("foo@bar.service", UNIT_NAME_INSTANCE, true); + test_unit_name_is_valid_one("foo@bar.service", UNIT_NAME_TEMPLATE, false); + test_unit_name_is_valid_one("foo@bar.service", UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE, true); + + test_unit_name_is_valid_one("foo@bar@bar.service", UNIT_NAME_ANY, true); + test_unit_name_is_valid_one("foo@bar@bar.service", UNIT_NAME_PLAIN, false); + test_unit_name_is_valid_one("foo@bar@bar.service", UNIT_NAME_INSTANCE, true); + test_unit_name_is_valid_one("foo@bar@bar.service", UNIT_NAME_TEMPLATE, false); + test_unit_name_is_valid_one("foo@bar@bar.service", UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE, true); + + test_unit_name_is_valid_one("foo@.service", UNIT_NAME_ANY, true); + test_unit_name_is_valid_one("foo@.service", UNIT_NAME_PLAIN, false); + test_unit_name_is_valid_one("foo@.service", UNIT_NAME_INSTANCE, false); + test_unit_name_is_valid_one("foo@.service", UNIT_NAME_TEMPLATE, true); + test_unit_name_is_valid_one("foo@.service", UNIT_NAME_INSTANCE|UNIT_NAME_TEMPLATE, true); + test_unit_name_is_valid_one(".test.service", UNIT_NAME_PLAIN, true); + 
test_unit_name_is_valid_one(".test@.service", UNIT_NAME_TEMPLATE, true); + test_unit_name_is_valid_one("_strange::::.service", UNIT_NAME_ANY, true); + + test_unit_name_is_valid_one(".service", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("foo.waldo", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("@.service", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("@piep.service", UNIT_NAME_ANY, false); + + test_unit_name_is_valid_one("user@1000.slice", UNIT_NAME_ANY, true); + test_unit_name_is_valid_one("user@1000.slice", UNIT_NAME_INSTANCE, true); + test_unit_name_is_valid_one("user@1000.slice", UNIT_NAME_TEMPLATE, false); + + test_unit_name_is_valid_one("foo@%i.service", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("foo@%i.service", UNIT_NAME_INSTANCE, false); + test_unit_name_is_valid_one("foo@%%i.service", UNIT_NAME_INSTANCE, false); + test_unit_name_is_valid_one("foo@%%i%f.service", UNIT_NAME_INSTANCE, false); + test_unit_name_is_valid_one("foo@%F.service", UNIT_NAME_INSTANCE, false); + + test_unit_name_is_valid_one("foo.target.wants/plain.service", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("foo.target.conf/foo.conf", UNIT_NAME_ANY, false); + test_unit_name_is_valid_one("foo.target.requires/plain.socket", UNIT_NAME_ANY, false); +} + +static void test_unit_name_replace_instance_one(const char *pattern, const char *repl, const char *expected, int ret) { + _cleanup_free_ char *t = NULL; + assert_se(unit_name_replace_instance(pattern, repl, &t) == ret); + puts(strna(t)); + assert_se(streq_ptr(t, expected)); +} + +TEST(unit_name_replace_instance) { + test_unit_name_replace_instance_one("foo@.service", "waldo", "foo@waldo.service", 0); + test_unit_name_replace_instance_one("foo@xyz.service", "waldo", "foo@waldo.service", 0); + test_unit_name_replace_instance_one("xyz", "waldo", NULL, -EINVAL); + test_unit_name_replace_instance_one("", "waldo", NULL, -EINVAL); + 
test_unit_name_replace_instance_one("foo.service", "waldo", NULL, -EINVAL); + test_unit_name_replace_instance_one(".service", "waldo", NULL, -EINVAL); + test_unit_name_replace_instance_one("foo@", "waldo", NULL, -EINVAL); + test_unit_name_replace_instance_one("@bar", "waldo", NULL, -EINVAL); +} + +static void test_unit_name_from_path_one(const char *path, const char *suffix, const char *expected, int ret) { /* Convert path + suffix into a unit name and check both the return code (ret) and the produced name (expected, may be NULL). When a name was produced, convert it back with unit_name_to_path() and require it to equal the input path (an empty path compares as the root). */ + _cleanup_free_ char *t = NULL; + int r; + + assert_se(unit_name_from_path(path, suffix, &t) == ret); + puts(strna(t)); + assert_se(streq_ptr(t, expected)); + + if (t) { + _cleanup_free_ char *k = NULL; + + /* We don't support converting hashed unit names back to paths */ + r = unit_name_to_path(t, &k); + if (r == -ENAMETOOLONG) + return; + assert_se(r == 0); /* assert_se(), like every other check in this file: the result check must not be compiled out */ + + puts(strna(k)); + assert_se(path_equal(k, empty_to_root(path))); + } +} + +TEST(unit_name_is_hashed) { /* One positive case: a long plain name ending in '_' plus 16 lower-case hex digits before ".mount". The remaining cases are near misses that must be rejected. */ + assert_se(!unit_name_is_hashed("")); + assert_se(!unit_name_is_hashed("foo@bar.service")); + assert_se(!unit_name_is_hashed("foo@.service")); + assert_se(unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_7736d9ed33c2ec55.mount")); + assert_se(!unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_7736D9ED33C2EC55.mount")); + assert_se(!unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!7736d9ed33c2ec55.mount")); + 
assert_se(!unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_7736d9gd33c2ec55.mount")); + assert_se(!unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_.mount")); + assert_se(!unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_2103e1466b87f7f7@waldo.mount")); + assert_se(!unit_name_is_hashed("waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_2103e1466b87f7f7@.mount")); +} + +TEST(unit_name_from_path) { + test_unit_name_from_path_one("/waldo", ".mount", "waldo.mount", 0); + test_unit_name_from_path_one("/waldo/quuix", ".mount", "waldo-quuix.mount", 0); + test_unit_name_from_path_one("/waldo/quuix/", ".mount", "waldo-quuix.mount", 0); + test_unit_name_from_path_one("", ".mount", "-.mount", 0); + test_unit_name_from_path_one("/", ".mount", "-.mount", 0); + test_unit_name_from_path_one("///", ".mount", "-.mount", 0); + test_unit_name_from_path_one("/foo/../bar", ".mount", NULL, -EINVAL); + test_unit_name_from_path_one("/foo/./bar", ".mount", "foo-bar.mount", 0); + 
test_unit_name_from_path_one("/waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", ".mount", + "waldoaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_7736d9ed33c2ec55.mount", 0); +} + +static void test_unit_name_from_path_instance_one(const char *pattern, const char *path, const char *suffix, const char *expected, int ret) { + _cleanup_free_ char *t = NULL; + + assert_se(unit_name_from_path_instance(pattern, path, suffix, &t) == ret); + puts(strna(t)); + assert_se(streq_ptr(t, expected)); + + if (t) { + _cleanup_free_ char *k = NULL, *v = NULL; + + assert_se(unit_name_to_instance(t, &k) > 0); + assert_se(unit_name_path_unescape(k, &v) == 0); + assert_se(path_equal(v, empty_to_root(path))); + } +} + +TEST(unit_name_from_path_instance) { + test_unit_name_from_path_instance_one("waldo", "/waldo", ".mount", "waldo@waldo.mount", 0); + test_unit_name_from_path_instance_one("waldo", "/waldo////quuix////", ".mount", "waldo@waldo-quuix.mount", 0); + test_unit_name_from_path_instance_one("waldo", "/", ".mount", "waldo@-.mount", 0); + test_unit_name_from_path_instance_one("waldo", "", ".mount", "waldo@-.mount", 0); + test_unit_name_from_path_instance_one("waldo", "///", ".mount", "waldo@-.mount", 0); + test_unit_name_from_path_instance_one("waldo", "..", ".mount", NULL, -EINVAL); + test_unit_name_from_path_instance_one("waldo", "/foo", ".waldi", NULL, -EINVAL); + test_unit_name_from_path_instance_one("wa--ldo", "/--", ".mount", "wa--ldo@\\x2d\\x2d.mount", 0); +} + +static void test_unit_name_to_path_one(const char *unit, const char *path, int ret) { + _cleanup_free_ char *p = NULL; + + 
assert_se(unit_name_to_path(unit, &p) == ret); + assert_se(streq_ptr(path, p)); +} + +TEST(unit_name_to_path) { /* Two well-formed mount unit names map to absolute paths; the malformed names listed after them must fail with -EINVAL and leave the output NULL. */ + test_unit_name_to_path_one("home.mount", "/home", 0); + test_unit_name_to_path_one("home-lennart.mount", "/home/lennart", 0); + test_unit_name_to_path_one("home-lennart-.mount", NULL, -EINVAL); + test_unit_name_to_path_one("-home-lennart.mount", NULL, -EINVAL); + test_unit_name_to_path_one("-home--lennart.mount", NULL, -EINVAL); + test_unit_name_to_path_one("home-..-lennart.mount", NULL, -EINVAL); + test_unit_name_to_path_one("", NULL, -EINVAL); + test_unit_name_to_path_one("home/foo", NULL, -EINVAL); +} + +static void test_unit_name_mangle_one(bool allow_globs, const char *pattern, const char *expect, int ret) { /* Mangle pattern, with glob support switched by allow_globs, and check the return code (ret) and the result (expect, may be NULL). A produced name must be a valid unit name, or a glob when globs are allowed, and mangling it a second time must return 0 and leave it unchanged. */ + _cleanup_free_ char *t = NULL; + int r; + + r = unit_name_mangle(pattern, (allow_globs * UNIT_NAME_MANGLE_GLOB) | UNIT_NAME_MANGLE_WARN, &t); /* bool * flag: contributes UNIT_NAME_MANGLE_GLOB when allow_globs is true, 0 otherwise */ + log_debug("%s: %s -> %d, %s", __func__, pattern, r, strnull(t)); + + assert_se(r == ret); + puts(strna(t)); + assert_se(streq_ptr(t, expect)); + + if (t) { + _cleanup_free_ char *k = NULL; + + assert_se(unit_name_is_valid(t, UNIT_NAME_ANY) || + (allow_globs && string_is_glob(t))); + + assert_se(unit_name_mangle(t, (allow_globs * UNIT_NAME_MANGLE_GLOB) | UNIT_NAME_MANGLE_WARN, &k) == 0); + assert_se(streq_ptr(t, k)); + } +} + +TEST(unit_name_mangle) { /* Arguments: allow_globs, input, expected name, expected return value. In the cases of this test the return value is 0 when the input comes back unchanged and 1 when it was altered. */ + test_unit_name_mangle_one(false, "foo.service", "foo.service", 0); + test_unit_name_mangle_one(false, "/home", "home.mount", 1); + test_unit_name_mangle_one(false, "/dev/sda", "dev-sda.device", 1); + test_unit_name_mangle_one(false, "üxknürz.service", "\\xc3\\xbcxkn\\xc3\\xbcrz.service", 1); + test_unit_name_mangle_one(false, "foobar-meh...waldi.service", "foobar-meh...waldi.service", 0); + test_unit_name_mangle_one(false, "_____####----.....service", "_____\\x23\\x23\\x23\\x23----.....service", 1); + test_unit_name_mangle_one(false, "_____##@;;;,,,##----.....service", "_____\\x23\\x23@\\x3b\\x3b\\x3b\\x2c\\x2c\\x2c\\x23\\x23----.....service", 1); + 
test_unit_name_mangle_one(false, "xxx@@@@/////\\\\\\\\\\yyy.service", "xxx@@@@-----\\\\\\\\\\yyy.service", 1); + test_unit_name_mangle_one(false, "", NULL, -EINVAL); + + test_unit_name_mangle_one(true, "foo.service", "foo.service", 0); + test_unit_name_mangle_one(true, "foo", "foo.service", 1); + test_unit_name_mangle_one(true, "foo*", "foo*", 0); + test_unit_name_mangle_one(true, "ü*", "\\xc3\\xbc*", 1); +} + +static void test_unit_name_mangle_with_suffix_one(const char *arg, int expected, const char *expected_name) { /* Mangle arg with ".service" passed as the suffix argument, then check the return code (expected) and the resulting name (expected_name, may be NULL). Note the argument order: return code first, name second, the reverse of test_unit_name_mangle_one(). */ + _cleanup_free_ char *s = NULL; + int r; + + r = unit_name_mangle_with_suffix(arg, NULL, 0, ".service", &s); + log_debug("%s: %s -> %d, %s", __func__, arg, r, strnull(s)); + + assert_se(r == expected); + assert_se(streq_ptr(s, expected_name)); +} + +TEST(unit_name_mangle_with_suffix) { /* Path-like inputs: in the cases below "." and non-final ".." components are dropped, directories become .mount units and nodes below /dev become .device units. An empty input fails with -EINVAL. */ + test_unit_name_mangle_with_suffix_one("", -EINVAL, NULL); + + test_unit_name_mangle_with_suffix_one("/dev", 1, "dev.mount"); + test_unit_name_mangle_with_suffix_one("/../dev", 1, "dev.mount"); + test_unit_name_mangle_with_suffix_one("/../dev/.", 1, "dev.mount"); + /* We don't skip the last '..', and it makes this an invalid device or mount name */ + test_unit_name_mangle_with_suffix_one("/.././dev/..", 1, "-..-.-dev-...service"); + test_unit_name_mangle_with_suffix_one("/.././dev", 1, "dev.mount"); + test_unit_name_mangle_with_suffix_one("/./.././../dev/", 1, "dev.mount"); + + test_unit_name_mangle_with_suffix_one("/dev/sda", 1, "dev-sda.device"); + test_unit_name_mangle_with_suffix_one("/dev/sda5", 1, "dev-sda5.device"); + + test_unit_name_mangle_with_suffix_one("/sys", 1, "sys.mount"); + test_unit_name_mangle_with_suffix_one("/../sys", 1, "sys.mount"); + test_unit_name_mangle_with_suffix_one("/../sys/.", 1, "sys.mount"); + /* We don't skip the last '..', and it makes this an invalid device or mount name */ + test_unit_name_mangle_with_suffix_one("/.././sys/..", 1, "-..-.-sys-...service"); + test_unit_name_mangle_with_suffix_one("/.././sys", 1, "sys.mount"); + 
test_unit_name_mangle_with_suffix_one("/./.././../sys/", 1, "sys.mount"); + + test_unit_name_mangle_with_suffix_one("/proc", 1, "proc.mount"); + test_unit_name_mangle_with_suffix_one("/../proc", 1, "proc.mount"); + test_unit_name_mangle_with_suffix_one("/../proc/.", 1, "proc.mount"); + /* We don't skip the last '..', and it makes this an invalid device or mount name */ + test_unit_name_mangle_with_suffix_one("/.././proc/..", 1, "-..-.-proc-...service"); + test_unit_name_mangle_with_suffix_one("/.././proc", 1, "proc.mount"); + test_unit_name_mangle_with_suffix_one("/./.././../proc/", 1, "proc.mount"); +} + +TEST_RET(unit_printf, .sd_booted = true) { + _cleanup_free_ char + *architecture, *os_image_version, *boot_id = NULL, *os_build_id, + *hostname, *short_hostname, *pretty_hostname, + *machine_id = NULL, *os_image_id, *os_id, *os_version_id, *os_variant_id, + *user, *group, *uid, *gid, *home, *shell, + *tmp_dir, *var_tmp_dir; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_close_ int fd = -EBADF; + Unit *u; + int r; + + _cleanup_(unlink_tempfilep) char filename[] = "/tmp/test-unit_printf.XXXXXX"; + fd = mkostemp_safe(filename); + assert_se(fd >= 0); + + /* Using the specifier functions is admittedly a bit circular, but we don't want to reimplement the + * logic a second time. We're at least testing that the hookup works. 
*/ + assert_se(specifier_architecture('a', NULL, NULL, NULL, &architecture) >= 0); + assert_se(architecture); + assert_se(specifier_os_image_version('A', NULL, NULL, NULL, &os_image_version) >= 0); + if (sd_booted() > 0) { + assert_se(specifier_boot_id('b', NULL, NULL, NULL, &boot_id) >= 0); + assert_se(boot_id); + } + assert_se(specifier_os_build_id('B', NULL, NULL, NULL, &os_build_id) >= 0); + assert_se(hostname = gethostname_malloc()); + assert_se(specifier_short_hostname('l', NULL, NULL, NULL, &short_hostname) == 0); + assert_se(short_hostname); + assert_se(specifier_pretty_hostname('q', NULL, NULL, NULL, &pretty_hostname) == 0); + assert_se(pretty_hostname); + if (sd_id128_get_machine(NULL) >= 0) { + assert_se(specifier_machine_id('m', NULL, NULL, NULL, &machine_id) >= 0); + assert_se(machine_id); + } + assert_se(specifier_os_image_id('M', NULL, NULL, NULL, &os_image_id) >= 0); + assert_se(specifier_os_id('o', NULL, NULL, NULL, &os_id) >= 0); + assert_se(specifier_os_version_id('w', NULL, NULL, NULL, &os_version_id) >= 0); + assert_se(specifier_os_variant_id('W', NULL, NULL, NULL, &os_variant_id) >= 0); + assert_se(user = uid_to_name(getuid())); + assert_se(group = gid_to_name(getgid())); + assert_se(asprintf(&uid, UID_FMT, getuid())); + assert_se(asprintf(&gid, UID_FMT, getgid())); + assert_se(get_home_dir(&home) >= 0); + assert_se(get_shell(&shell) >= 0); + assert_se(specifier_tmp_dir('T', NULL, NULL, NULL, &tmp_dir) >= 0); + assert_se(tmp_dir); + assert_se(specifier_var_tmp_dir('V', NULL, NULL, NULL, &var_tmp_dir) >= 0); + assert_se(var_tmp_dir); + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if (manager_errno_skip_test(r)) + return log_tests_skipped_errno(r, "manager_new"); + assert_se(r == 0); + + assert_se(free_and_strdup(&m->cgroup_root, "/cgroup-root") == 1); + +#define expect(unit, pattern, _expected) \ + { \ + _cleanup_free_ char *t = NULL; \ + assert_se(unit_full_printf(unit, pattern, &t) >= 0); \ + const char *expected = 
strempty(_expected); \ + printf("%s: result: %s\n expect: %s\n", pattern, t, expected); \ + assert_se(fnmatch(expected, t, FNM_NOESCAPE) == 0); \ + } + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "blah.service") == 0); + assert_se(unit_add_name(u, "blah.service") == 0); + + /* We need *a* file that exists, but it doesn't even need to have the right suffix. */ + assert_se(free_and_strdup(&u->fragment_path, filename) == 1); + + /* This sets the slice to /app.slice. */ + assert_se(unit_set_default_slice(u) == 1); + + /* general tests */ + expect(u, "%%", "%"); + expect(u, "%%s", "%s"); + expect(u, "%,", "%,"); + expect(u, "%", "%"); + + /* normal unit */ + expect(u, "%a", architecture); + expect(u, "%A", os_image_version); + if (boot_id) + expect(u, "%b", boot_id); + expect(u, "%B", os_build_id); + expect(u, "%H", hostname); + expect(u, "%l", short_hostname); + expect(u, "%q", pretty_hostname); + if (machine_id) + expect(u, "%m", machine_id); + expect(u, "%M", os_image_id); + expect(u, "%o", os_id); + expect(u, "%w", os_version_id); + expect(u, "%W", os_variant_id); + expect(u, "%g", group); + expect(u, "%G", gid); + expect(u, "%u", user); + expect(u, "%U", uid); + expect(u, "%T", tmp_dir); + expect(u, "%V", var_tmp_dir); + + expect(u, "%i", ""); + expect(u, "%I", ""); + expect(u, "%j", "blah"); + expect(u, "%J", "blah"); + expect(u, "%n", "blah.service"); + expect(u, "%N", "blah"); + expect(u, "%p", "blah"); + expect(u, "%P", "blah"); + expect(u, "%f", "/blah"); + expect(u, "%y", filename); + expect(u, "%Y", "/tmp"); + expect(u, "%C", m->prefix[EXEC_DIRECTORY_CACHE]); + expect(u, "%d", "*/credentials/blah.service"); + expect(u, "%E", m->prefix[EXEC_DIRECTORY_CONFIGURATION]); + expect(u, "%L", m->prefix[EXEC_DIRECTORY_LOGS]); + expect(u, "%S", m->prefix[EXEC_DIRECTORY_STATE]); + expect(u, "%t", m->prefix[EXEC_DIRECTORY_RUNTIME]); + expect(u, "%h", home); + expect(u, "%s", shell); + + /* deprecated */ + expect(u, "%c", 
"/cgroup-root/app.slice/blah.service"); + expect(u, "%r", "/cgroup-root/app.slice"); + expect(u, "%R", "/cgroup-root"); + + /* templated */ + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "blah@foo-foo.service") == 0); + assert_se(unit_add_name(u, "blah@foo-foo.service") == 0); + + assert_se(free_and_strdup(&u->fragment_path, filename) == 1); + + /* This sets the slice to /app.slice/app-blah.slice. */ + assert_se(unit_set_default_slice(u) == 1); + + expect(u, "%i", "foo-foo"); + expect(u, "%I", "foo/foo"); + expect(u, "%j", "blah"); + expect(u, "%J", "blah"); + expect(u, "%n", "blah@foo-foo.service"); + expect(u, "%N", "blah@foo-foo"); + expect(u, "%p", "blah"); + expect(u, "%P", "blah"); + expect(u, "%f", "/foo/foo"); + expect(u, "%y", filename); + expect(u, "%Y", "/tmp"); + expect(u, "%C", m->prefix[EXEC_DIRECTORY_CACHE]); + expect(u, "%d", "*/credentials/blah@foo-foo.service"); + expect(u, "%E", m->prefix[EXEC_DIRECTORY_CONFIGURATION]); + expect(u, "%L", m->prefix[EXEC_DIRECTORY_LOGS]); + expect(u, "%S", m->prefix[EXEC_DIRECTORY_STATE]); + expect(u, "%t", m->prefix[EXEC_DIRECTORY_RUNTIME]); + expect(u, "%h", home); + expect(u, "%s", shell); + + /* deprecated */ + expect(u, "%c", "/cgroup-root/app.slice/app-blah.slice/blah@foo-foo.service"); + expect(u, "%r", "/cgroup-root/app.slice/app-blah.slice"); + expect(u, "%R", "/cgroup-root"); + + /* templated with components */ + assert_se(u = unit_new(m, sizeof(Slice))); + assert_se(unit_add_name(u, "blah-blah\\x2d.slice") == 0); + + expect(u, "%i", ""); + expect(u, "%I", ""); + expect(u, "%j", "blah\\x2d"); + expect(u, "%J", "blah-"); + expect(u, "%n", "blah-blah\\x2d.slice"); + expect(u, "%N", "blah-blah\\x2d"); + expect(u, "%p", "blah-blah\\x2d"); + expect(u, "%P", "blah/blah-"); + expect(u, "%f", "/blah/blah-"); + + /* deprecated */ + expect(u, "%c", "/cgroup-root/blah-blah\\x2d.slice"); + expect(u, "%r", "/cgroup-root"); + expect(u, "%R", "/cgroup-root"); + +#undef expect + + return 0; 
+} + +TEST(unit_instance_is_valid) { + assert_se(unit_instance_is_valid("fooBar")); + assert_se(unit_instance_is_valid("foo-bar")); + assert_se(unit_instance_is_valid("foo.stUff")); + assert_se(unit_instance_is_valid("fOo123.stuff")); + assert_se(unit_instance_is_valid("@f_oo123.Stuff")); + + assert_se(!unit_instance_is_valid("$¢£")); + assert_se(!unit_instance_is_valid("")); + assert_se(!unit_instance_is_valid("foo bar")); + assert_se(!unit_instance_is_valid("foo/bar")); +} + +TEST(unit_prefix_is_valid) { + assert_se(unit_prefix_is_valid("fooBar")); + assert_se(unit_prefix_is_valid("foo-bar")); + assert_se(unit_prefix_is_valid("foo.stUff")); + assert_se(unit_prefix_is_valid("fOo123.stuff")); + assert_se(unit_prefix_is_valid("foo123.Stuff")); + + assert_se(!unit_prefix_is_valid("$¢£")); + assert_se(!unit_prefix_is_valid("")); + assert_se(!unit_prefix_is_valid("foo bar")); + assert_se(!unit_prefix_is_valid("foo/bar")); + assert_se(!unit_prefix_is_valid("@foo-bar")); +} + +TEST(unit_name_change_suffix) { + char *t; + + assert_se(unit_name_change_suffix("foo.mount", ".service", &t) == 0); + assert_se(streq(t, "foo.service")); + free(t); + + assert_se(unit_name_change_suffix("foo@stuff.service", ".socket", &t) == 0); + assert_se(streq(t, "foo@stuff.socket")); + free(t); +} + +TEST(unit_name_build) { + char *t; + + assert_se(unit_name_build("foo", "bar", ".service", &t) == 0); + assert_se(streq(t, "foo@bar.service")); + free(t); + + assert_se(unit_name_build("fo0-stUff_b", "bar", ".mount", &t) == 0); + assert_se(streq(t, "fo0-stUff_b@bar.mount")); + free(t); + + assert_se(unit_name_build("foo", NULL, ".service", &t) == 0); + assert_se(streq(t, "foo.service")); + free(t); +} + +TEST(slice_name_is_valid) { + assert_se( slice_name_is_valid(SPECIAL_ROOT_SLICE)); + assert_se( slice_name_is_valid("foo.slice")); + assert_se( slice_name_is_valid("foo-bar.slice")); + assert_se( slice_name_is_valid("foo-bar-baz.slice")); + assert_se(!slice_name_is_valid("-foo-bar-baz.slice")); + 
assert_se(!slice_name_is_valid("foo-bar-baz-.slice")); + assert_se(!slice_name_is_valid("-foo-bar-baz-.slice")); + assert_se(!slice_name_is_valid("foo-bar--baz.slice")); + assert_se(!slice_name_is_valid("foo--bar--baz.slice")); + assert_se(!slice_name_is_valid(".slice")); + assert_se(!slice_name_is_valid("")); + assert_se(!slice_name_is_valid("foo.service")); + /* Every name containing '@' is rejected in the cases below, whatever the rest looks like. */ + assert_se(!slice_name_is_valid("foo@.slice")); + assert_se(!slice_name_is_valid("foo@bar.slice")); + assert_se(!slice_name_is_valid("foo-bar@baz.slice")); + assert_se(!slice_name_is_valid("foo@bar@baz.slice")); + assert_se(!slice_name_is_valid("foo@bar-baz.slice")); + assert_se(!slice_name_is_valid("-foo-bar-baz@.slice")); + assert_se(!slice_name_is_valid("foo-bar-baz@-.slice")); + assert_se(!slice_name_is_valid("foo-bar-baz@a--b.slice")); + assert_se(!slice_name_is_valid("-foo-bar-baz@-.slice")); + assert_se(!slice_name_is_valid("foo-bar--baz@.slice")); + assert_se(!slice_name_is_valid("foo--bar--baz@.slice")); + assert_se(!slice_name_is_valid("@.slice")); + assert_se(!slice_name_is_valid("foo@bar.service")); +} + /* slice_build_subslice() appends one name component per call; a parent that is not a slice name makes it fail. */ +TEST(build_subslice) { + char *a; + char *b; + + assert_se(slice_build_subslice(SPECIAL_ROOT_SLICE, "foo", &a) >= 0); + assert_se(slice_build_subslice(a, "bar", &b) >= 0); + free(a); + assert_se(slice_build_subslice(b, "barfoo", &a) >= 0); + free(b); + assert_se(slice_build_subslice(a, "foobar", &b) >= 0); + free(a); + assert_se(streq(b, "foo-bar-barfoo-foobar.slice")); + free(b); + /* NOTE(review): 'a' still holds a freed pointer here and is not freed afterwards; this presumably relies on nothing being stored on failure - confirm. */ + assert_se(slice_build_subslice("foo.service", "bar", &a) < 0); + assert_se(slice_build_subslice("foo", "bar", &a) < 0); +} + /* Helper: asserts that slice_build_parent_slice(name) returns ret and yields expect (NULL when no string is produced). */ +static void test_build_parent_slice_one(const char *name, const char *expect, int ret) { + _cleanup_free_ char *s = NULL; + + assert_se(slice_build_parent_slice(name, &s) == ret); + assert_se(streq_ptr(s, expect)); +} + /* The root slice has no parent (0, NULL); valid slices return 1 plus the parent name; malformed names give -EINVAL. */ +TEST(build_parent_slice) { + test_build_parent_slice_one(SPECIAL_ROOT_SLICE, NULL, 0); + test_build_parent_slice_one("foo.slice", SPECIAL_ROOT_SLICE, 1); +
test_build_parent_slice_one("foo-bar.slice", "foo.slice", 1); + test_build_parent_slice_one("foo-bar-baz.slice", "foo-bar.slice", 1); + test_build_parent_slice_one("foo-bar--baz.slice", NULL, -EINVAL); + test_build_parent_slice_one("-foo-bar.slice", NULL, -EINVAL); + test_build_parent_slice_one("foo-bar-.slice", NULL, -EINVAL); + test_build_parent_slice_one("foo-bar.service", NULL, -EINVAL); + test_build_parent_slice_one(".slice", NULL, -EINVAL); + test_build_parent_slice_one("foo@bar.slice", NULL, -EINVAL); + test_build_parent_slice_one("foo-bar@baz.slice", NULL, -EINVAL); + test_build_parent_slice_one("foo-bar--@baz.slice", NULL, -EINVAL); + test_build_parent_slice_one("-foo-bar@bar.slice", NULL, -EINVAL); + test_build_parent_slice_one("foo-bar@-.slice", NULL, -EINVAL); + test_build_parent_slice_one("foo@bar.service", NULL, -EINVAL); + test_build_parent_slice_one("@.slice", NULL, -EINVAL); +} + /* unit_name_to_instance() reports the name type and hands back the instance: "" for a template, NULL for a plain name. */ +TEST(unit_name_to_instance) { + UnitNameFlags r; + char *instance; + + r = unit_name_to_instance("foo@bar.service", &instance); + assert_se(r == UNIT_NAME_INSTANCE); + assert_se(streq(instance, "bar")); + free(instance); + + r = unit_name_to_instance("foo@.service", &instance); + assert_se(r == UNIT_NAME_TEMPLATE); + assert_se(streq(instance, "")); + free(instance); + + r = unit_name_to_instance("fo0-stUff_b@b.service", &instance); + assert_se(r == UNIT_NAME_INSTANCE); + assert_se(streq(instance, "b")); + free(instance); + + r = unit_name_to_instance("foo.service", &instance); + assert_se(r == UNIT_NAME_PLAIN); + assert_se(!instance); + /* NOTE(review): the !instance checks after the failing calls below appear to rely on the NULL left by the plain-name call above - confirm. */ + r = unit_name_to_instance("fooj@unk", &instance); + assert_se(r < 0); + assert_se(!instance); + + r = unit_name_to_instance("foo@", &instance); + assert_se(r < 0); + assert_se(!instance); +} + /* unit_name_escape(): in this input '/' becomes '-', while '+', '-' and '@' are hex-escaped as \xNN and '.' is kept. */ +TEST(unit_name_escape) { + _cleanup_free_ char *r = NULL; + + r = unit_name_escape("ab+-c.a/bc@foo.service"); + assert_se(r); + assert_se(streq(r, "ab\\x2b\\x2dc.a-bc\\x40foo.service")); +} + /* Helper: asserts that unit_name_template(name) returns ret and produces expected (NULL when no string is produced). */ +static void test_u_n_t_one(const char *name, const
char *expected, int ret) { + _cleanup_free_ char *f = NULL; + + assert_se(unit_name_template(name, &f) == ret); + printf("got: %s, expected: %s\n", strna(f), strna(expected)); + assert_se(streq_ptr(f, expected)); +} + +/* An instantiated name ("foo@bar.service") must reduce to its template "foo@.service" with return 0; a name without '@' ("foo.mount") must fail with -EINVAL and produce no string. */ +TEST(unit_name_template) { + test_u_n_t_one("foo@bar.service", "foo@.service", 0); + test_u_n_t_one("foo.mount", NULL, -EINVAL); +} + +/* Helper: asserts that unit_name_path_unescape(name, &p) returns ret and that p matches path under streq_ptr(); path is NULL in the failure cases. */ +static void test_unit_name_path_unescape_one(const char *name, const char *path, int ret) { + _cleanup_free_ char *p = NULL; + + assert_se(unit_name_path_unescape(name, &p) == ret); + assert_se(streq_ptr(path, p)); +} + +/* Case table: in the accepted inputs every '-' comes back as '/' and the result gains a leading '/', with "-" alone mapping to "/". Empty input, "--", leading, trailing or doubled dashes, and inputs holding a "." or ".." component (".-bar", "foo-..") are expected to fail with -EINVAL. */ +TEST(unit_name_path_unescape) { + test_unit_name_path_unescape_one("foo", "/foo", 0); + test_unit_name_path_unescape_one("foo-bar", "/foo/bar", 0); + test_unit_name_path_unescape_one("foo-.bar", "/foo/.bar", 0); + test_unit_name_path_unescape_one("foo-bar-baz", "/foo/bar/baz", 0); + test_unit_name_path_unescape_one("-", "/", 0); + test_unit_name_path_unescape_one("--", NULL, -EINVAL); + test_unit_name_path_unescape_one("-foo-bar", NULL, -EINVAL); + test_unit_name_path_unescape_one("foo--bar", NULL, -EINVAL); + test_unit_name_path_unescape_one("foo-bar-", NULL, -EINVAL); + test_unit_name_path_unescape_one(".-bar", NULL, -EINVAL); + test_unit_name_path_unescape_one("foo-..", NULL, -EINVAL); + test_unit_name_path_unescape_one("", NULL, -EINVAL); +} + +/* Helper: asserts that unit_name_to_prefix(input, &k) returns ret and that k matches output under streq_ptr(). Note the argument order is (input, ret, output), unlike the (name, expected, ret) order of test_u_n_t_one(). */ +static void test_unit_name_to_prefix_one(const char *input, int ret, const char *output) { + _cleanup_free_ char *k = NULL; + + assert_se(unit_name_to_prefix(input, &k) == ret); + assert_se(streq_ptr(k, output)); +} + +/* Case table: the expected prefix is the part of the name before '@' when one is present, otherwise the part before the type suffix ("quux@bar.mount" gives "quux", "quux-quux.mount" gives "quux-quux"). An empty string, a name with no suffix, an empty prefix (".service", "@.mount") and "quux.quux" all yield -EINVAL. */ +TEST(unit_name_to_prefix) { + test_unit_name_to_prefix_one("foobar.service", 0, "foobar"); + test_unit_name_to_prefix_one("", -EINVAL, NULL); + test_unit_name_to_prefix_one("foobar", -EINVAL, NULL); + test_unit_name_to_prefix_one(".service", -EINVAL, NULL); + test_unit_name_to_prefix_one("quux.quux", -EINVAL, NULL); + test_unit_name_to_prefix_one("quux.mount", 0, "quux"); + test_unit_name_to_prefix_one("quux-quux.mount", 0, "quux-quux"); +
test_unit_name_to_prefix_one("quux@bar.mount", 0, "quux"); + test_unit_name_to_prefix_one("quux-@.mount", 0, "quux-"); + test_unit_name_to_prefix_one("@.mount", -EINVAL, NULL); +} + +/* Helper: asserts that unit_name_from_dbus_path(input, &k) returns ret and that k matches output under streq_ptr(). */ +static void test_unit_name_from_dbus_path_one(const char *input, int ret, const char *output) { + _cleanup_free_ char *k = NULL; + + assert_se(unit_name_from_dbus_path(input, &k) == ret); + assert_se(streq_ptr(k, output)); +} + +TEST(unit_name_from_dbus_path) { + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dbus_2esocket", 0, "dbus.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/_2d_2emount", 0, "-.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/_2d_2eslice", 0, "-.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/accounts_2ddaemon_2eservice", 0, "accounts-daemon.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/auditd_2eservice", 0, "auditd.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/basic_2etarget", 0, "basic.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/bluetooth_2etarget", 0, "bluetooth.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/boot_2eautomount", 0, "boot.automount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/boot_2emount", 0, "boot.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/btrfs_2emount", 0, "btrfs.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/cryptsetup_2dpre_2etarget", 0, "cryptsetup-pre.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/cryptsetup_2etarget", 0, "cryptsetup.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dbus_2eservice", 0, "dbus.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dbus_2esocket", 0, "dbus.socket"); +
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dcdrom_2edevice", 0, "dev-cdrom.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dINTEL_5fSSDSA2M120G2GC_5fCVPO044405HH120QGN_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dINTEL_SSDSA2M120G2GC_CVPO044405HH120QGN.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dINTEL_5fSSDSA2M120G2GC_5fCVPO044405HH120QGN_5cx2dpart1_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dINTEL_SSDSA2M120G2GC_CVPO044405HH120QGN\\x2dpart1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dINTEL_5fSSDSA2M160G2GC_5fCVPO951003RY160AGN_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dINTEL_SSDSA2M160G2GC_CVPO951003RY160AGN.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dINTEL_5fSSDSA2M160G2GC_5fCVPO951003RY160AGN_5cx2dpart1_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dINTEL_SSDSA2M160G2GC_CVPO951003RY160AGN\\x2dpart1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dINTEL_5fSSDSA2M160G2GC_5fCVPO951003RY160AGN_5cx2dpart2_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dINTEL_SSDSA2M160G2GC_CVPO951003RY160AGN\\x2dpart2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dINTEL_5fSSDSA2M160G2GC_5fCVPO951003RY160AGN_5cx2dpart3_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dINTEL_SSDSA2M160G2GC_CVPO951003RY160AGN\\x2dpart3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2data_5cx2dTSSTcorp_5fCDDVDW_5fTS_5cx2dL633C_5fR6176GLZB14646_2edevice", 0, "dev-disk-by\\x2did-ata\\x2dTSSTcorp_CDDVDW_TS\\x2dL633C_R6176GLZB14646.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2dwwn_5cx2d0x50015179591245ae_2edevice", 0, "dev-disk-by\\x2did-wwn\\x2d0x50015179591245ae.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2dwwn_5cx2d0x50015179591245ae_5cx2dpart1_2edevice", 0, "dev-disk-by\\x2did-wwn\\x2d0x50015179591245ae\\x2dpart1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2dwwn_5cx2d0x50015179591245ae_5cx2dpart2_2edevice", 0, "dev-disk-by\\x2did-wwn\\x2d0x50015179591245ae\\x2dpart2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2dwwn_5cx2d0x50015179591245ae_5cx2dpart3_2edevice", 0, "dev-disk-by\\x2did-wwn\\x2d0x50015179591245ae\\x2dpart3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2dwwn_5cx2d0x500151795946eab5_2edevice", 0, "dev-disk-by\\x2did-wwn\\x2d0x500151795946eab5.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2did_2dwwn_5cx2d0x500151795946eab5_5cx2dpart1_2edevice", 0, "dev-disk-by\\x2did-wwn\\x2d0x500151795946eab5\\x2dpart1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dlabel_2d_5cxe3_5cx82_5cxb7_5cxe3_5cx82_5cxb9_5cxe3_5cx83_5cx86_5cxe3_5cx83_5cxa0_5cxe3_5cx81_5cxa7_5cxe4_5cxba_5cx88_5cxe7_5cxb4_5cx84_5cxe6_5cxb8_5cx88_5cxe3_5cx81_5cxbf_2edevice", 0, "dev-disk-by\\x2dlabel-\\xe3\\x82\\xb7\\xe3\\x82\\xb9\\xe3\\x83\\x86\\xe3\\x83\\xa0\\xe3\\x81\\xa7\\xe4\\xba\\x88\\xe7\\xb4\\x84\\xe6\\xb8\\x88\\xe3\\x81\\xbf.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpartuuid_2d59834e50_5cx2d01_2edevice", 0, "dev-disk-by\\x2dpartuuid-59834e50\\x2d01.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpartuuid_2d63e2a7b3_5cx2d01_2edevice", 0, 
"dev-disk-by\\x2dpartuuid-63e2a7b3\\x2d01.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpartuuid_2d63e2a7b3_5cx2d02_2edevice", 0, "dev-disk-by\\x2dpartuuid-63e2a7b3\\x2d02.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpartuuid_2d63e2a7b3_5cx2d03_2edevice", 0, "dev-disk-by\\x2dpartuuid-63e2a7b3\\x2d03.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d1_2edevice", 0, "dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d1_5cx2dpart1_2edevice", 0, "dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d1\\x2dpart1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d1_5cx2dpart2_2edevice", 0, "dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d1\\x2dpart2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d1_5cx2dpart3_2edevice", 0, "dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d1\\x2dpart3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d2_2edevice", 0, "dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d6_2edevice", 0, "dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d6.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2dpath_2dpci_5cx2d0000_3a00_3a1f_2e2_5cx2data_5cx2d6_5cx2dpart1_2edevice", 0, 
"dev-disk-by\\x2dpath-pci\\x2d0000:00:1f.2\\x2data\\x2d6\\x2dpart1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2duuid_2d1A34E3F034E3CD37_2edevice", 0, "dev-disk-by\\x2duuid-1A34E3F034E3CD37.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2duuid_2dB670EBFE70EBC2EB_2edevice", 0, "dev-disk-by\\x2duuid-B670EBFE70EBC2EB.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2duuid_2dFCD4F509D4F4C6C4_2edevice", 0, "dev-disk-by\\x2duuid-FCD4F509D4F4C6C4.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2ddisk_2dby_5cx2duuid_2db49ead57_5cx2d907c_5cx2d446c_5cx2db405_5cx2d5ca6cd865f5e_2edevice", 0, "dev-disk-by\\x2duuid-b49ead57\\x2d907c\\x2d446c\\x2db405\\x2d5ca6cd865f5e.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dhugepages_2emount", 0, "dev-hugepages.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dmqueue_2emount", 0, "dev-mqueue.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2drfkill_2edevice", 0, "dev-rfkill.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsda1_2edevice", 0, "dev-sda1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsda2_2edevice", 0, "dev-sda2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsda3_2edevice", 0, "dev-sda3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsda_2edevice", 0, "dev-sda.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsdb1_2edevice", 0, "dev-sdb1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsdb_2edevice", 0, "dev-sdb.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dsr0_2edevice", 0, "dev-sr0.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS0_2edevice", 0, "dev-ttyS0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS10_2edevice", 0, "dev-ttyS10.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS11_2edevice", 0, "dev-ttyS11.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS12_2edevice", 0, "dev-ttyS12.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS13_2edevice", 0, "dev-ttyS13.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS14_2edevice", 0, "dev-ttyS14.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS15_2edevice", 0, "dev-ttyS15.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS16_2edevice", 0, "dev-ttyS16.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS17_2edevice", 0, "dev-ttyS17.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS18_2edevice", 0, "dev-ttyS18.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS19_2edevice", 0, "dev-ttyS19.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS1_2edevice", 0, "dev-ttyS1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS20_2edevice", 0, "dev-ttyS20.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS21_2edevice", 0, "dev-ttyS21.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS22_2edevice", 0, "dev-ttyS22.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS23_2edevice", 0, "dev-ttyS23.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS24_2edevice", 0, "dev-ttyS24.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS25_2edevice", 0, "dev-ttyS25.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS26_2edevice", 0, "dev-ttyS26.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS27_2edevice", 0, "dev-ttyS27.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS28_2edevice", 0, "dev-ttyS28.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS29_2edevice", 0, "dev-ttyS29.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS2_2edevice", 0, "dev-ttyS2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS30_2edevice", 0, "dev-ttyS30.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS31_2edevice", 0, "dev-ttyS31.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS3_2edevice", 0, "dev-ttyS3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS4_2edevice", 0, "dev-ttyS4.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS5_2edevice", 0, "dev-ttyS5.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS6_2edevice", 0, "dev-ttyS6.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS7_2edevice", 0, "dev-ttyS7.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS8_2edevice", 0, "dev-ttyS8.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dev_2dttyS9_2edevice", 0, "dev-ttyS9.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dcmdline_2eservice", 0, "dracut-cmdline.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dinitqueue_2eservice", 0, "dracut-initqueue.service"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dmount_2eservice", 0, "dracut-mount.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dpre_2dmount_2eservice", 0, "dracut-pre-mount.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dpre_2dpivot_2eservice", 0, "dracut-pre-pivot.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dpre_2dtrigger_2eservice", 0, "dracut-pre-trigger.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dpre_2dudev_2eservice", 0, "dracut-pre-udev.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/dracut_2dshutdown_2eservice", 0, "dracut-shutdown.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/ebtables_2eservice", 0, "ebtables.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/emergency_2eservice", 0, "emergency.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/emergency_2etarget", 0, "emergency.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/fedora_2dimport_2dstate_2eservice", 0, "fedora-import-state.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/fedora_2dreadonly_2eservice", 0, "fedora-readonly.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/firewalld_2eservice", 0, "firewalld.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/getty_2dpre_2etarget", 0, "getty-pre.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/getty_2etarget", 0, "getty.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/getty_40tty1_2eservice", 0, "getty@tty1.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/graphical_2etarget", 0, "graphical.target"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/home_2emount", 0, "home.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/init_2escope", 0, "init.scope"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2dcleanup_2eservice", 0, "initrd-cleanup.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2dfs_2etarget", 0, "initrd-fs.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2dparse_2detc_2eservice", 0, "initrd-parse-etc.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2droot_2ddevice_2etarget", 0, "initrd-root-device.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2droot_2dfs_2etarget", 0, "initrd-root-fs.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2dswitch_2droot_2eservice", 0, "initrd-switch-root.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2dswitch_2droot_2etarget", 0, "initrd-switch-root.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2dudevadm_2dcleanup_2ddb_2eservice", 0, "initrd-udevadm-cleanup-db.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/initrd_2etarget", 0, "initrd.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/ip6tables_2eservice", 0, "ip6tables.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/ipset_2eservice", 0, "ipset.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/iptables_2eservice", 0, "iptables.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/irqbalance_2eservice", 0, "irqbalance.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/kmod_2dstatic_2dnodes_2eservice", 0, "kmod-static-nodes.service"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/ldconfig_2eservice", 0, "ldconfig.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/lightdm_2eservice", 0, "lightdm.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/livesys_2dlate_2eservice", 0, "livesys-late.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/lm_5fsensors_2eservice", 0, "lm_sensors.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/local_2dfs_2dpre_2etarget", 0, "local-fs-pre.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/local_2dfs_2etarget", 0, "local-fs.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/machines_2etarget", 0, "machines.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/mcelog_2eservice", 0, "mcelog.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/multi_2duser_2etarget", 0, "multi-user.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/network_2dpre_2etarget", 0, "network-pre.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/network_2etarget", 0, "network.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/nss_2dlookup_2etarget", 0, "nss-lookup.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/nss_2duser_2dlookup_2etarget", 0, "nss-user-lookup.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/paths_2etarget", 0, "paths.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/plymouth_2dquit_2dwait_2eservice", 0, "plymouth-quit-wait.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/plymouth_2dquit_2eservice", 0, "plymouth-quit.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/plymouth_2dstart_2eservice", 0, 
"plymouth-start.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/polkit_2eservice", 0, "polkit.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/proc_2dsys_2dfs_2dbinfmt_5fmisc_2eautomount", 0, "proc-sys-fs-binfmt_misc.automount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/proc_2dsys_2dfs_2dbinfmt_5fmisc_2emount", 0, "proc-sys-fs-binfmt_misc.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/rc_2dlocal_2eservice", 0, "rc-local.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/remote_2dcryptsetup_2etarget", 0, "remote-cryptsetup.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/remote_2dfs_2dpre_2etarget", 0, "remote-fs-pre.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/remote_2dfs_2etarget", 0, "remote-fs.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/rescue_2eservice", 0, "rescue.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/rescue_2etarget", 0, "rescue.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/run_2duser_2d1000_2emount", 0, "run-user-1000.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/session_2d2_2escope", 0, "session-2.scope"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/shutdown_2etarget", 0, "shutdown.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/slices_2etarget", 0, "slices.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/smartd_2eservice", 0, "smartd.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sockets_2etarget", 0, "sockets.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sound_2etarget", 0, "sound.target"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sshd_2dkeygen_2etarget", 0, "sshd-keygen.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sshd_2dkeygen_40ecdsa_2eservice", 0, "sshd-keygen@ecdsa.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sshd_2dkeygen_40ed25519_2eservice", 0, "sshd-keygen@ed25519.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sshd_2dkeygen_40rsa_2eservice", 0, "sshd-keygen@rsa.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sshd_2eservice", 0, "sshd.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/swap_2etarget", 0, "swap.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a02_2e0_2dbacklight_2dacpi_5fvideo0_2edevice", 0, "sys-devices-pci0000:00-0000:00:02.0-backlight-acpi_video0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a02_2e0_2ddrm_2dcard0_2dcard0_5cx2dLVDS_5cx2d1_2dintel_5fbacklight_2edevice", 0, "sys-devices-pci0000:00-0000:00:02.0-drm-card0-card0\\x2dLVDS\\x2d1-intel_backlight.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1a_2e0_2dusb1_2d1_5cx2d1_2d1_5cx2d1_2e6_2d1_5cx2d1_2e6_3a1_2e0_2dbluetooth_2dhci0_2edevice", 0, "sys-devices-pci0000:00-0000:00:1a.0-usb1-1\\x2d1-1\\x2d1.6-1\\x2d1.6:1.0-bluetooth-hci0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1b_2e0_2dsound_2dcard0_2edevice", 0, "sys-devices-pci0000:00-0000:00:1b.0-sound-card0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1c_2e0_2d0000_3a02_3a00_2e0_2dnet_2dwlp2s0_2edevice", 0, "sys-devices-pci0000:00-0000:00:1c.0-0000:02:00.0-net-wlp2s0.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1c_2e2_2d0000_3a04_3a00_2e0_2dnet_2denp4s0_2edevice", 0, "sys-devices-pci0000:00-0000:00:1c.2-0000:04:00.0-net-enp4s0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data1_2dhost0_2dtarget0_3a0_3a0_2d0_3a0_3a0_3a0_2dblock_2dsda_2dsda1_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata1-host0-target0:0:0-0:0:0:0-block-sda-sda1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data1_2dhost0_2dtarget0_3a0_3a0_2d0_3a0_3a0_3a0_2dblock_2dsda_2dsda2_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata1-host0-target0:0:0-0:0:0:0-block-sda-sda2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data1_2dhost0_2dtarget0_3a0_3a0_2d0_3a0_3a0_3a0_2dblock_2dsda_2dsda3_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata1-host0-target0:0:0-0:0:0:0-block-sda-sda3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data1_2dhost0_2dtarget0_3a0_3a0_2d0_3a0_3a0_3a0_2dblock_2dsda_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata1-host0-target0:0:0-0:0:0:0-block-sda.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data2_2dhost1_2dtarget1_3a0_3a0_2d1_3a0_3a0_3a0_2dblock_2dsr0_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata2-host1-target1:0:0-1:0:0:0-block-sr0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data6_2dhost5_2dtarget5_3a0_3a0_2d5_3a0_3a0_3a0_2dblock_2dsdb_2dsdb1_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata6-host5-target5:0:0-5:0:0:0-block-sdb-sdb1.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dpci0000_3a00_2d0000_3a00_3a1f_2e2_2data6_2dhost5_2dtarget5_3a0_3a0_2d5_3a0_3a0_3a0_2dblock_2dsdb_2edevice", 0, "sys-devices-pci0000:00-0000:00:1f.2-ata6-host5-target5:0:0-5:0:0:0-block-sdb.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS0_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS10_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS10.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS11_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS11.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS12_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS12.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS13_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS13.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS14_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS14.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS15_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS15.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS16_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS16.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS17_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS17.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS18_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS18.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS19_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS19.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS1_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS1.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS20_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS20.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS21_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS21.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS22_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS22.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS23_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS23.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS24_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS24.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS25_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS25.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS26_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS26.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS27_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS27.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS28_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS28.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS29_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS29.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS2_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS2.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS30_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS30.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS31_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS31.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS3_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS3.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS4_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS4.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS5_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS5.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS6_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS6.device"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS7_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS7.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS8_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS8.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dplatform_2dserial8250_2dtty_2dttyS9_2edevice", 0, "sys-devices-platform-serial8250-tty-ttyS9.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2ddevices_2dvirtual_2dmisc_2drfkill_2edevice", 0, "sys-devices-virtual-misc-rfkill.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dfs_2dfuse_2dconnections_2emount", 0, "sys-fs-fuse-connections.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dkernel_2dconfig_2emount", 0, "sys-kernel-config.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dkernel_2ddebug_2emount", 0, "sys-kernel-debug.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dmodule_2dconfigfs_2edevice", 0, "sys-module-configfs.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dsubsystem_2dbluetooth_2ddevices_2dhci0_2edevice", 0, "sys-subsystem-bluetooth-devices-hci0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dsubsystem_2dnet_2ddevices_2denp4s0_2edevice", 0, "sys-subsystem-net-devices-enp4s0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sys_2dsubsystem_2dnet_2ddevices_2dwlp2s0_2edevice", 0, "sys-subsystem-net-devices-wlp2s0.device"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sysinit_2etarget", 0, "sysinit.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/syslog_2eservice", 0, "syslog.service"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/syslog_2esocket", 0, "syslog.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/syslog_2etarget", 0, "syslog.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/sysroot_2emount", 0, "sysroot.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/system_2dgetty_2eslice", 0, "system-getty.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/system_2dsshd_5cx2dkeygen_2eslice", 0, "system-sshd\\x2dkeygen.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/system_2dsystemd_5cx2dbacklight_2eslice", 0, "system-systemd\\x2dbacklight.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/system_2dsystemd_5cx2dcoredump_2eslice", 0, "system-systemd\\x2dcoredump.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/system_2duser_5cx2druntime_5cx2ddir_2eslice", 0, "system-user\\x2druntime\\x2ddir.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/system_2eslice", 0, "system.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dask_2dpassword_2dconsole_2epath", 0, "systemd-ask-password-console.path"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dask_2dpassword_2dconsole_2eservice", 0, "systemd-ask-password-console.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dask_2dpassword_2dwall_2epath", 0, "systemd-ask-password-wall.path"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dask_2dpassword_2dwall_2eservice", 0, "systemd-ask-password-wall.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dbacklight_40backlight_3aacpi_5fvideo0_2eservice", 0, "systemd-backlight@backlight:acpi_video0.service"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dbacklight_40backlight_3aintel_5fbacklight_2eservice", 0, "systemd-backlight@backlight:intel_backlight.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dbinfmt_2eservice", 0, "systemd-binfmt.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dcoredump_2esocket", 0, "systemd-coredump.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dcoredump_400_2eservice", 0, "systemd-coredump@0.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dfirstboot_2eservice", 0, "systemd-firstboot.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dfsck_2droot_2eservice", 0, SPECIAL_FSCK_ROOT_SERVICE); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dhwdb_2dupdate_2eservice", 0, "systemd-hwdb-update.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dinitctl_2eservice", 0, "systemd-initctl.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dinitctl_2esocket", 0, "systemd-initctl.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2djournal_2dcatalog_2dupdate_2eservice", 0, "systemd-journal-catalog-update.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2djournal_2dflush_2eservice", 0, "systemd-journal-flush.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2djournald_2daudit_2esocket", 0, "systemd-journald-audit.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2djournald_2ddev_2dlog_2esocket", 0, "systemd-journald-dev-log.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2djournald_2eservice", 0, "systemd-journald.service"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2djournald_2esocket", 0, "systemd-journald.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dlogind_2eservice", 0, "systemd-logind.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dmachine_2did_2dcommit_2eservice", 0, "systemd-machine-id-commit.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dmodules_2dload_2eservice", 0, "systemd-modules-load.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dnetworkd_2eservice", 0, "systemd-networkd.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dnetworkd_2esocket", 0, "systemd-networkd.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2drandom_2dseed_2eservice", 0, "systemd-random-seed.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dremount_2dfs_2eservice", 0, "systemd-remount-fs.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dresolved_2eservice", 0, "systemd-resolved.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2drfkill_2eservice", 0, "systemd-rfkill.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2drfkill_2esocket", 0, "systemd-rfkill.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dsysctl_2eservice", 0, "systemd-sysctl.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dsysusers_2eservice", 0, "systemd-sysusers.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dtimesyncd_2eservice", 0, "systemd-timesyncd.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dtmpfiles_2dclean_2eservice", 0, 
"systemd-tmpfiles-clean.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dtmpfiles_2dclean_2etimer", 0, "systemd-tmpfiles-clean.timer"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dtmpfiles_2dsetup_2ddev_2eservice", 0, "systemd-tmpfiles-setup-dev.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dtmpfiles_2dsetup_2eservice", 0, "systemd-tmpfiles-setup.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dudev_2dtrigger_2eservice", 0, "systemd-udev-trigger.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dudevd_2dcontrol_2esocket", 0, "systemd-udevd-control.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dudevd_2dkernel_2esocket", 0, "systemd-udevd-kernel.socket"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dudevd_2eservice", 0, "systemd-udevd.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dupdate_2ddone_2eservice", 0, "systemd-update-done.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dupdate_2dutmp_2drunlevel_2eservice", 0, "systemd-update-utmp-runlevel.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dupdate_2dutmp_2eservice", 0, "systemd-update-utmp.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2duser_2dsessions_2eservice", 0, "systemd-user-sessions.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/systemd_2dvconsole_2dsetup_2eservice", 0, "systemd-vconsole-setup.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/time_2dsync_2etarget", 0, "time-sync.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/timers_2etarget", 0, "timers.target"); + 
test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/tmp_2emount", 0, "tmp.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/umount_2etarget", 0, "umount.target"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/unbound_2danchor_2eservice", 0, "unbound-anchor.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/unbound_2danchor_2etimer", 0, "unbound-anchor.timer"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/upower_2eservice", 0, "upower.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/user_2d1000_2eslice", 0, "user-1000.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/user_2druntime_2ddir_401000_2eservice", 0, "user-runtime-dir@1000.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/user_2eslice", 0, "user.slice"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/user_401000_2eservice", 0, "user@1000.service"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/usr_2dlocal_2dtexlive_2emount", 0, "usr-local-texlive.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/var_2dlib_2dmachines_2emount", 0, "var-lib-machines.mount"); + test_unit_name_from_dbus_path_one("/org/freedesktop/systemd1/unit/wpa_5fsupplicant_2eservice", 0, "wpa_supplicant.service"); +} +/* Checks unit_name_prefix_equal() on pairs of unit names: the first group of assertions expects the pair to match, the second group expects it not to match. */ +TEST(unit_name_prefix_equal) { +/* Expected matches: both names start with the same a; any difference is confined to the text after the @ or after the dot. */ assert_se(unit_name_prefix_equal("a.service", "a.service")); + assert_se(unit_name_prefix_equal("a.service", "a.mount")); + assert_se(unit_name_prefix_equal("a@b.service", "a.service")); + assert_se(unit_name_prefix_equal("a@b.service", "a@c.service")); +/* Expected mismatches: the text before the @ or the dot differs (a versus b). */ + assert_se(!unit_name_prefix_equal("a.service", "b.service")); + assert_se(!unit_name_prefix_equal("a.service", "b.mount")); + assert_se(!unit_name_prefix_equal("a@a.service", "b.service")); + assert_se(!unit_name_prefix_equal("a@a.service", "b@a.service")); + 
assert_se(!unit_name_prefix_equal("a", "b")); +/* Two identical strings that carry no dot suffix are expected to be reported as not equal too. */ assert_se(!unit_name_prefix_equal("a", "a")); +} +/* Intro hook handed to DEFINE_TEST_MAIN_WITH_INTRO below. Returns the result of log_tests_skipped() when enter_cgroup_subroot(NULL) yields -ENOMEDIUM; otherwise asserts that setup_fake_runtime_dir() returned non-NULL, keeps its result in runtime_dir and returns EXIT_SUCCESS. */ +static int intro(void) { + if (enter_cgroup_subroot(NULL) == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + assert_se(runtime_dir = setup_fake_runtime_dir()); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-unit-serialize.c b/src/test/test-unit-serialize.c new file mode 100644 index 0000000..7a1e8a0 --- /dev/null +++ b/src/test/test-unit-serialize.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "rm-rf.h" +#include "service.h" +#include "tests.h" +/* Assigned in intro() from setup_fake_runtime_dir(); STATIC_DESTRUCTOR_REGISTER pairs it with rm_rf_physical_and_freep. */ +static char *runtime_dir = NULL; + +STATIC_DESTRUCTOR_REGISTER(runtime_dir, rm_rf_physical_and_freep); +/* Two serialized ExecStart lines used as input below. They are identical except for the executable path: /bin/sh in the first, plain sh in the second. */ +#define EXEC_START_ABSOLUTE \ + "ExecStart 0 /bin/sh \"sh\" \"-e\" \"-x\" \"-c\" \"systemctl --state=failed --no-legend --no-pager >/failed ; systemctl daemon-reload ; echo OK >/testok\"" +#define EXEC_START_RELATIVE \ + "ExecStart 0 sh \"sh\" \"-e\" \"-x\" \"-c\" \"systemctl --state=failed --no-legend --no-pager >/failed ; systemctl daemon-reload ; echo OK >/testok\"" +/* Creates a unit named test.service on manager m, passes key and line to service_deserialize_exec_command() and asserts that the return value equals expected. u is declared with the unit_freep cleanup handler. */ +static void test_deserialize_exec_command_one(Manager *m, const char *key, const char *line, int expected) { + _cleanup_(unit_freep) Unit *u = NULL; + int r; + + assert_se(unit_new_for_name(m, sizeof(Service), "test.service", &u) >= 0); + + r = service_deserialize_exec_command(u, key, line); + log_debug("[%s] → %d (expected: %d)", line, r, expected); + assert_se(r == expected); + + /* Note that the command doesn't match any command in the empty list of commands in 's', so it is + * always rejected with "Current command vanished from the unit file", and we don't leak anything. 
*/ +} +/* Feeds a series of lines to test_deserialize_exec_command_one(). Returns early, after logging a notice, when manager_errno_skip_test() accepts the manager_new() result; otherwise manager_new() must have returned >= 0. */ +TEST(deserialize_exec_command) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + r = manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_MINIMAL, &m); + if (manager_errno_skip_test(r)) { + log_notice_errno(r, "Skipping test: manager_new: %m"); + return; + } + + assert_se(r >= 0); +/* Both macro lines are expected to return 0 for the main-command key and for the control-command key. */ + test_deserialize_exec_command_one(m, "main-command", EXEC_START_ABSOLUTE, 0); + test_deserialize_exec_command_one(m, "main-command", EXEC_START_RELATIVE, 0); + test_deserialize_exec_command_one(m, "control-command", EXEC_START_ABSOLUTE, 0); + test_deserialize_exec_command_one(m, "control-command", EXEC_START_RELATIVE, 0); +/* Literal lines: the first is expected to return 0, all following ones -EINVAL. */ + test_deserialize_exec_command_one(m, "control-command", "ExecStart 0 /bin/sh \"sh\"", 0); + test_deserialize_exec_command_one(m, "control-command", "ExecStart 0 /no/command ", -EINVAL); + test_deserialize_exec_command_one(m, "control-command", "ExecStart 0 /bad/quote \"", -EINVAL); + test_deserialize_exec_command_one(m, "control-command", "ExecStart s /bad/id x y z", -EINVAL); + test_deserialize_exec_command_one(m, "control-command", "ExecStart 11", -EINVAL); + test_deserialize_exec_command_one(m, "control-command", "ExecWhat 11 /a/b c d e", -EINVAL); +} +/* Intro hook handed to DEFINE_TEST_MAIN_WITH_INTRO below. Returns the result of log_tests_skipped() when enter_cgroup_subroot(NULL) yields -ENOMEDIUM; otherwise asserts that setup_fake_runtime_dir() returned non-NULL, keeps its result in runtime_dir and returns EXIT_SUCCESS. */ +static int intro(void) { + if (enter_cgroup_subroot(NULL) == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not available"); + + assert_se(runtime_dir = setup_fake_runtime_dir()); + return EXIT_SUCCESS; +} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro); diff --git a/src/test/test-user-util.c b/src/test/test-user-util.c new file mode 100644 index 0000000..db76cde --- /dev/null +++ b/src/test/test-user-util.c @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "format-util.h" +#include "libcrypt-util.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "path-util.h" +#include "string-util.h" +#include "tests.h" +#include "user-util.h" +/* Asserts that uid_to_name(uid) returns non-NULL and that the result equals name. The equality check is skipped when synthesize_nobody() is false and name is NOBODY_USER_NAME. */ +static void test_uid_to_name_one(uid_t uid, const char *name) { + _cleanup_free_ 
char *t = NULL; + + log_info("/* %s("UID_FMT", \"%s\") */", __func__, uid, name); + + assert_se(t = uid_to_name(uid)); + if (!synthesize_nobody() && streq(name, NOBODY_USER_NAME)) { + log_info("(skipping detailed tests because nobody is not synthesized)"); + return; + } + assert_se(streq_ptr(t, name)); +} + +TEST(uid_to_name) { + test_uid_to_name_one(0, "root"); + test_uid_to_name_one(UID_NOBODY, NOBODY_USER_NAME); + test_uid_to_name_one(0xFFFF, "65535"); + test_uid_to_name_one(0xFFFFFFFF, "4294967295"); +} + +static void test_gid_to_name_one(gid_t gid, const char *name) { + _cleanup_free_ char *t = NULL; + + log_info("/* %s("GID_FMT", \"%s\") */", __func__, gid, name); + + assert_se(t = gid_to_name(gid)); + if (!synthesize_nobody() && streq(name, NOBODY_GROUP_NAME)) { + log_info("(skipping detailed tests because nobody is not synthesized)"); + return; + } + assert_se(streq_ptr(t, name)); +} + +TEST(gid_to_name) { + test_gid_to_name_one(0, "root"); + test_gid_to_name_one(GID_NOBODY, NOBODY_GROUP_NAME); + test_gid_to_name_one(0xFFFF, "65535"); + test_gid_to_name_one(0xFFFFFFFF, "4294967295"); +} + +TEST(parse_uid) { + int r; + uid_t uid; + + r = parse_uid("0", &uid); + assert_se(r == 0); + assert_se(uid == 0); + + r = parse_uid("1", &uid); + assert_se(r == 0); + assert_se(uid == 1); + + r = parse_uid("01", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 1); + + r = parse_uid("001", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 1); + + r = parse_uid("100", &uid); + assert_se(r == 0); + assert_se(uid == 100); + + r = parse_uid("65535", &uid); + assert_se(r == -ENXIO); + assert_se(uid == 100); + + r = parse_uid("0x1234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("0o1234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("0b1234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("+1234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = 
parse_uid("-1234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid(" 1234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("01234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("001234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("0001234", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("-0", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("+0", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("00", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("000", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); + + r = parse_uid("asdsdas", &uid); + assert_se(r == -EINVAL); + assert_se(uid == 100); +} + +TEST(uid_ptr) { + assert_se(UID_TO_PTR(0) != NULL); + assert_se(UID_TO_PTR(1000) != NULL); + + assert_se(PTR_TO_UID(UID_TO_PTR(0)) == 0); + assert_se(PTR_TO_UID(UID_TO_PTR(1000)) == 1000); +} + +TEST(valid_user_group_name_relaxed) { + assert_se(!valid_user_group_name(NULL, VALID_USER_RELAX)); + assert_se(!valid_user_group_name("", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("1", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("65535", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("-1", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("foo\nbar", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("0123456789012345678901234567890123456789", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("aaa:bbb", VALID_USER_RELAX|VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name(".aaa:bbb", VALID_USER_RELAX|VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name(".", VALID_USER_RELAX)); + assert_se(!valid_user_group_name("..", VALID_USER_RELAX)); + + assert_se(valid_user_group_name("root", VALID_USER_RELAX)); + assert_se(valid_user_group_name("lennart", VALID_USER_RELAX)); + 
assert_se(valid_user_group_name("LENNART", VALID_USER_RELAX)); + assert_se(valid_user_group_name("_kkk", VALID_USER_RELAX)); + assert_se(valid_user_group_name("kkk-", VALID_USER_RELAX)); + assert_se(valid_user_group_name("kk-k", VALID_USER_RELAX)); + assert_se(valid_user_group_name("eff.eff", VALID_USER_RELAX)); + assert_se(valid_user_group_name("eff.", VALID_USER_RELAX)); + assert_se(valid_user_group_name("-kkk", VALID_USER_RELAX)); + assert_se(valid_user_group_name("rööt", VALID_USER_RELAX)); + assert_se(valid_user_group_name(".eff", VALID_USER_RELAX)); + assert_se(valid_user_group_name(".1", VALID_USER_RELAX)); + assert_se(valid_user_group_name(".65535", VALID_USER_RELAX)); + assert_se(valid_user_group_name(".-1", VALID_USER_RELAX)); + assert_se(valid_user_group_name(".-kkk", VALID_USER_RELAX)); + assert_se(valid_user_group_name(".rööt", VALID_USER_RELAX)); + assert_se(valid_user_group_name("...", VALID_USER_RELAX)); + + assert_se(valid_user_group_name("some5", VALID_USER_RELAX)); + assert_se(valid_user_group_name("5some", VALID_USER_RELAX)); + assert_se(valid_user_group_name("INNER5NUMBER", VALID_USER_RELAX)); + + assert_se(valid_user_group_name("piff.paff@ad.domain.example", VALID_USER_RELAX)); + assert_se(valid_user_group_name("Dāvis", VALID_USER_RELAX)); +} + +TEST(valid_user_group_name) { + assert_se(!valid_user_group_name(NULL, 0)); + assert_se(!valid_user_group_name("", 0)); + assert_se(!valid_user_group_name("1", 0)); + assert_se(!valid_user_group_name("65535", 0)); + assert_se(!valid_user_group_name("-1", 0)); + assert_se(!valid_user_group_name("-kkk", 0)); + assert_se(!valid_user_group_name("rööt", 0)); + assert_se(!valid_user_group_name(".", 0)); + assert_se(!valid_user_group_name(".eff", 0)); + assert_se(!valid_user_group_name("foo\nbar", 0)); + assert_se(!valid_user_group_name("0123456789012345678901234567890123456789", 0)); + assert_se(!valid_user_group_name("aaa:bbb", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name(".", 0)); + 
assert_se(!valid_user_group_name("..", 0)); + assert_se(!valid_user_group_name("...", 0)); + assert_se(!valid_user_group_name(".1", 0)); + assert_se(!valid_user_group_name(".65535", 0)); + assert_se(!valid_user_group_name(".-1", 0)); + assert_se(!valid_user_group_name(".-kkk", 0)); + assert_se(!valid_user_group_name(".rööt", 0)); + assert_se(!valid_user_group_name(".aaa:bbb", VALID_USER_ALLOW_NUMERIC)); + + assert_se(valid_user_group_name("root", 0)); + assert_se(valid_user_group_name("lennart", 0)); + assert_se(valid_user_group_name("LENNART", 0)); + assert_se(valid_user_group_name("_kkk", 0)); + assert_se(valid_user_group_name("kkk-", 0)); + assert_se(valid_user_group_name("kk-k", 0)); + assert_se(!valid_user_group_name("eff.eff", 0)); + assert_se(!valid_user_group_name("eff.", 0)); + + assert_se(valid_user_group_name("some5", 0)); + assert_se(!valid_user_group_name("5some", 0)); + assert_se(valid_user_group_name("INNER5NUMBER", 0)); + + assert_se(!valid_user_group_name("piff.paff@ad.domain.example", 0)); + assert_se(!valid_user_group_name("Dāvis", 0)); +} + +TEST(valid_user_group_name_or_numeric_relaxed) { + assert_se(!valid_user_group_name(NULL, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("0", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("1", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("65534", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("65535", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("65536", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("-1", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("foo\nbar", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("0123456789012345678901234567890123456789", 
VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("aaa:bbb", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name(".", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(!valid_user_group_name("..", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + + assert_se(valid_user_group_name("root", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("lennart", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("LENNART", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("_kkk", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("kkk-", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("kk-k", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("-kkk", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("rööt", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name(".eff", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("eff.eff", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("eff.", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("...", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + + assert_se(valid_user_group_name("some5", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("5some", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("INNER5NUMBER", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + + assert_se(valid_user_group_name("piff.paff@ad.domain.example", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); + assert_se(valid_user_group_name("Dāvis", VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX)); +} + +TEST(valid_user_group_name_or_numeric) { + assert_se(!valid_user_group_name(NULL, VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("", 
VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("0", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("1", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("65534", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("65535", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("65536", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("-1", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("-kkk", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("rööt", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name(".", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("..", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("...", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name(".eff", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("eff.eff", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("eff.", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("foo\nbar", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("0123456789012345678901234567890123456789", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("aaa:bbb", VALID_USER_ALLOW_NUMERIC)); + + assert_se(valid_user_group_name("root", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("lennart", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("LENNART", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("_kkk", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("kkk-", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("kk-k", VALID_USER_ALLOW_NUMERIC)); + + assert_se(valid_user_group_name("some5", VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("5some", VALID_USER_ALLOW_NUMERIC)); + assert_se(valid_user_group_name("INNER5NUMBER", VALID_USER_ALLOW_NUMERIC)); + + assert_se(!valid_user_group_name("piff.paff@ad.domain.example", 
VALID_USER_ALLOW_NUMERIC)); + assert_se(!valid_user_group_name("Dāvis", VALID_USER_ALLOW_NUMERIC)); +} + +TEST(valid_gecos) { + assert_se(!valid_gecos(NULL)); + assert_se(valid_gecos("")); + assert_se(valid_gecos("test")); + assert_se(valid_gecos("Ümläüt")); + assert_se(!valid_gecos("In\nvalid")); + assert_se(!valid_gecos("In:valid")); +} + +TEST(valid_home) { + assert_se(!valid_home(NULL)); + assert_se(!valid_home("")); + assert_se(!valid_home(".")); + assert_se(!valid_home("/home/..")); + assert_se(!valid_home("/home/../")); + assert_se(!valid_home("/home\n/foo")); + assert_se(!valid_home("./piep")); + assert_se(!valid_home("piep")); + assert_se(!valid_home("/home/user:lennart")); + + assert_se(valid_home("/")); + assert_se(valid_home("/home")); + assert_se(valid_home("/home/foo")); +} + +static void test_get_user_creds_one(const char *id, const char *name, uid_t uid, gid_t gid, const char *home, const char *shell) { + const char *rhome = NULL; + const char *rshell = NULL; + uid_t ruid = UID_INVALID; + gid_t rgid = GID_INVALID; + int r; + + log_info("/* %s(\"%s\", \"%s\", "UID_FMT", "GID_FMT", \"%s\", \"%s\") */", + __func__, id, name, uid, gid, home, shell); + + r = get_user_creds(&id, &ruid, &rgid, &rhome, &rshell, 0); + log_info_errno(r, "got \"%s\", "UID_FMT", "GID_FMT", \"%s\", \"%s\": %m", + id, ruid, rgid, strnull(rhome), strnull(rshell)); + if (!synthesize_nobody() && streq(name, NOBODY_USER_NAME)) { + log_info("(skipping detailed tests because nobody is not synthesized)"); + return; + } + assert_se(r == 0); + assert_se(streq_ptr(id, name)); + assert_se(ruid == uid); + assert_se(rgid == gid); + assert_se(path_equal(rhome, home)); +} + +TEST(get_user_creds) { + test_get_user_creds_one("root", "root", 0, 0, "/root", DEFAULT_USER_SHELL); + test_get_user_creds_one("0", "root", 0, 0, "/root", DEFAULT_USER_SHELL); + test_get_user_creds_one(NOBODY_USER_NAME, NOBODY_USER_NAME, UID_NOBODY, GID_NOBODY, "/", NOLOGIN); + test_get_user_creds_one("65534", 
NOBODY_USER_NAME, UID_NOBODY, GID_NOBODY, "/", NOLOGIN); +} + +static void test_get_group_creds_one(const char *id, const char *name, gid_t gid) { + gid_t rgid = GID_INVALID; + int r; + + log_info("/* %s(\"%s\", \"%s\", "GID_FMT") */", __func__, id, name, gid); + + r = get_group_creds(&id, &rgid, 0); + log_info_errno(r, "got \"%s\", "GID_FMT": %m", id, rgid); + if (!synthesize_nobody() && streq(name, NOBODY_GROUP_NAME)) { + log_info("(skipping detailed tests because nobody is not synthesized)"); + return; + } + assert_se(r == 0); + assert_se(streq_ptr(id, name)); + assert_se(rgid == gid); +} + +TEST(get_group_creds) { + test_get_group_creds_one("root", "root", 0); + test_get_group_creds_one("0", "root", 0); + test_get_group_creds_one(NOBODY_GROUP_NAME, NOBODY_GROUP_NAME, GID_NOBODY); + test_get_group_creds_one("65534", NOBODY_GROUP_NAME, GID_NOBODY); +} + +TEST(make_salt) { + _cleanup_free_ char *s, *t; + + assert_se(make_salt(&s) == 0); + log_info("got %s", s); + + assert_se(make_salt(&t) == 0); + log_info("got %s", t); + + assert_se(!streq(s, t)); +} + +TEST(in_gid) { + assert_se(in_gid(getgid()) >= 0); + assert_se(in_gid(getegid()) >= 0); + assert_se(in_gid(GID_INVALID) < 0); + assert_se(in_gid(TTY_GID) == 0); /* The TTY gid is for owning ttys, it would be really really weird if we were in it. 
*/ +} + +TEST(gid_lists_ops) { + static const gid_t l1[] = { 5, 10, 15, 20, 25}; + static const gid_t l2[] = { 1, 2, 3, 15, 20, 25}; + static const gid_t l3[] = { 5, 10, 15, 20, 25, 26, 27}; + static const gid_t l4[] = { 25, 26, 20, 15, 5, 27, 10}; + + static const gid_t result1[] = {1, 2, 3, 5, 10, 15, 20, 25, 26, 27}; + static const gid_t result2[] = {5, 10, 15, 20, 25, 26, 27}; + + _cleanup_free_ gid_t *gids = NULL; + _cleanup_free_ gid_t *res1 = NULL; + _cleanup_free_ gid_t *res2 = NULL; + _cleanup_free_ gid_t *res3 = NULL; + _cleanup_free_ gid_t *res4 = NULL; + int nresult; + + nresult = merge_gid_lists(l2, ELEMENTSOF(l2), l3, ELEMENTSOF(l3), &res1); + assert_se(nresult >= 0); + assert_se(memcmp_nn(res1, nresult, result1, ELEMENTSOF(result1)) == 0); + + nresult = merge_gid_lists(NULL, 0, l2, ELEMENTSOF(l2), &res2); + assert_se(nresult >= 0); + assert_se(memcmp_nn(res2, nresult, l2, ELEMENTSOF(l2)) == 0); + + nresult = merge_gid_lists(l1, ELEMENTSOF(l1), l1, ELEMENTSOF(l1), &res3); + assert_se(nresult >= 0); + assert_se(memcmp_nn(l1, ELEMENTSOF(l1), res3, nresult) == 0); + + nresult = merge_gid_lists(l1, ELEMENTSOF(l1), l4, ELEMENTSOF(l4), &res4); + assert_se(nresult >= 0); + assert_se(memcmp_nn(result2, ELEMENTSOF(result2), res4, nresult) == 0); + + nresult = getgroups_alloc(&gids); + assert_se(nresult >= 0 || nresult == -EINVAL || nresult == -ENOMEM); + assert_se(gids); +} + +TEST(parse_uid_range) { + uid_t a = 4711, b = 4711; + + assert_se(parse_uid_range("", &a, &b) == -EINVAL && a == 4711 && b == 4711); + assert_se(parse_uid_range(" ", &a, &b) == -EINVAL && a == 4711 && b == 4711); + assert_se(parse_uid_range("x", &a, &b) == -EINVAL && a == 4711 && b == 4711); + + assert_se(parse_uid_range("0", &a, &b) >= 0 && a == 0 && b == 0); + assert_se(parse_uid_range("1", &a, &b) >= 0 && a == 1 && b == 1); + assert_se(parse_uid_range("2-2", &a, &b) >= 0 && a == 2 && b == 2); + assert_se(parse_uid_range("3-3", &a, &b) >= 0 && a == 3 && b == 3); + 
assert_se(parse_uid_range("4-5", &a, &b) >= 0 && a == 4 && b == 5); + + assert_se(parse_uid_range("7-6", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("-1", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("01", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("001", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("+1", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("1--1", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range(" 1", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range(" 1-2", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("1 -2", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("1- 2", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("1-2 ", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("01-2", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("1-02", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("001-2", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range("1-002", &a, &b) == -EINVAL && a == 4 && b == 5); + assert_se(parse_uid_range(" 01", &a, &b) == -EINVAL && a == 4 && b == 5); +} + +static void test_mangle_gecos_one(const char *input, const char *expected) { + _cleanup_free_ char *p = NULL; + + assert_se(p = mangle_gecos(input)); + assert_se(streq(p, expected)); + assert_se(valid_gecos(p)); +} + +TEST(mangle_gecos) { + test_mangle_gecos_one("", ""); + test_mangle_gecos_one("root", "root"); + test_mangle_gecos_one("wuff\nwuff", "wuff wuff"); + test_mangle_gecos_one("wuff:wuff", "wuff wuff"); + test_mangle_gecos_one("wuff\r\n:wuff", "wuff wuff"); + test_mangle_gecos_one("\n--wüff-wäff-wöff::", " --wüff-wäff-wöff "); + test_mangle_gecos_one("\xc3\x28", " ("); + test_mangle_gecos_one("\xe2\x28\xa1", " ( "); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-utf8.c 
b/src/test/test-utf8.c new file mode 100644 index 0000000..a0d7dc1 --- /dev/null +++ b/src/test/test-utf8.c @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "utf8.h" + +TEST(utf8_is_printable) { + assert_se(utf8_is_printable("ascii is valid\tunicode", 22)); + assert_se(utf8_is_printable("\342\204\242", 3)); + assert_se(!utf8_is_printable("\341\204", 2)); + assert_se(utf8_is_printable("ąę", 4)); + assert_se(!utf8_is_printable("\r", 1)); + assert_se(utf8_is_printable("\n", 1)); + assert_se(utf8_is_printable("\t", 1)); +} + +TEST(utf8_n_is_valid) { + assert_se( utf8_is_valid_n("ascii is valid unicode", 21)); + assert_se( utf8_is_valid_n("ascii is valid unicode", 22)); + assert_se(!utf8_is_valid_n("ascii is valid unicode", 23)); + assert_se( utf8_is_valid_n("\342\204\242", 0)); + assert_se(!utf8_is_valid_n("\342\204\242", 1)); + assert_se(!utf8_is_valid_n("\342\204\242", 2)); + assert_se( utf8_is_valid_n("\342\204\242", 3)); + assert_se(!utf8_is_valid_n("\342\204\242", 4)); + assert_se( utf8_is_valid_n("", 0)); + assert_se( utf8_is_valid_n("", 1)); + assert_se( utf8_is_valid_n("", 2)); + assert_se( utf8_is_valid_n("", 3)); + assert_se( utf8_is_valid_n("", 4)); + assert_se(!utf8_is_valid_n("", 5)); +} + +TEST(utf8_is_valid) { + assert_se(utf8_is_valid("ascii is valid unicode")); + assert_se(utf8_is_valid("\342\204\242")); + assert_se(!utf8_is_valid("\341\204")); +} + +TEST(ascii_is_valid) { + assert_se( ascii_is_valid("alsdjf\t\vbarr\nba z")); + assert_se(!ascii_is_valid("\342\204\242")); + assert_se(!ascii_is_valid("\341\204")); +} + +TEST(ascii_is_valid_n) { + assert_se( ascii_is_valid_n("alsdjf\t\vbarr\nba z", 17)); + assert_se( ascii_is_valid_n("alsdjf\t\vbarr\nba z", 16)); + assert_se(!ascii_is_valid_n("alsdjf\t\vbarr\nba z", 18)); + assert_se(!ascii_is_valid_n("\342\204\242", 3)); + assert_se(!ascii_is_valid_n("\342\204\242", 2)); + 
assert_se(!ascii_is_valid_n("\342\204\242", 1)); + assert_se( ascii_is_valid_n("\342\204\242", 0)); +} + +static void test_utf8_to_ascii_one(const char *s, int r_expected, const char *expected) { + _cleanup_free_ char *ans = NULL; + int r; + + r = utf8_to_ascii(s, '*', &ans); + log_debug("\"%s\" → %d/\"%s\" (expected %d/\"%s\")", s, r, strnull(ans), r_expected, strnull(expected)); + assert_se(r == r_expected); + assert_se(streq_ptr(ans, expected)); +} + +TEST(utf8_to_ascii) { + test_utf8_to_ascii_one("asdf", 0, "asdf"); + test_utf8_to_ascii_one("dąb", 0, "d*b"); + test_utf8_to_ascii_one("żęśłą óźń", 0, "***** ***"); + test_utf8_to_ascii_one("\342\204\242", 0, "*"); + test_utf8_to_ascii_one("\342\204", -EINVAL, NULL); /* truncated */ + test_utf8_to_ascii_one("\342", -EINVAL, NULL); /* truncated */ + test_utf8_to_ascii_one("\302\256", 0, "*"); + test_utf8_to_ascii_one("", 0, ""); + test_utf8_to_ascii_one(" ", 0, " "); + test_utf8_to_ascii_one("\t", 0, "\t"); + test_utf8_to_ascii_one("串", 0, "*"); + test_utf8_to_ascii_one("…👊🔪💐…", 0, "*****"); +} + +TEST(utf8_encoded_valid_unichar) { + assert_se(utf8_encoded_valid_unichar("\342\204\242", 1) == -EINVAL); /* truncated */ + assert_se(utf8_encoded_valid_unichar("\342\204\242", 2) == -EINVAL); /* truncated */ + assert_se(utf8_encoded_valid_unichar("\342\204\242", 3) == 3); + assert_se(utf8_encoded_valid_unichar("\342\204\242", 4) == 3); + assert_se(utf8_encoded_valid_unichar("\302\256", 1) == -EINVAL); /* truncated */ + assert_se(utf8_encoded_valid_unichar("\302\256", 2) == 2); + assert_se(utf8_encoded_valid_unichar("\302\256", 3) == 2); + assert_se(utf8_encoded_valid_unichar("\302\256", SIZE_MAX) == 2); + assert_se(utf8_encoded_valid_unichar("a", 1) == 1); + assert_se(utf8_encoded_valid_unichar("a", 2) == 1); + assert_se(utf8_encoded_valid_unichar("\341\204", 1) == -EINVAL); /* truncated, potentially valid */ + assert_se(utf8_encoded_valid_unichar("\341\204", 2) == -EINVAL); /* truncated, potentially valid */ + 
assert_se(utf8_encoded_valid_unichar("\341\204", 3) == -EINVAL); + assert_se(utf8_encoded_valid_unichar("\341\204\341\204", 4) == -EINVAL); + assert_se(utf8_encoded_valid_unichar("\341\204\341\204", 5) == -EINVAL); +} + +TEST(utf8_escape_invalid) { + _cleanup_free_ char *p1 = NULL, *p2 = NULL, *p3 = NULL; + + p1 = utf8_escape_invalid("goo goo goo"); + log_debug("\"%s\"", p1); + assert_se(utf8_is_valid(p1)); + + p2 = utf8_escape_invalid("\341\204\341\204"); + log_debug("\"%s\"", p2); + assert_se(utf8_is_valid(p2)); + + p3 = utf8_escape_invalid("\341\204"); + log_debug("\"%s\"", p3); + assert_se(utf8_is_valid(p3)); +} + +TEST(utf8_escape_non_printable) { + _cleanup_free_ char *p1 = NULL, *p2 = NULL, *p3 = NULL, *p4 = NULL, *p5 = NULL, *p6 = NULL; + + p1 = utf8_escape_non_printable("goo goo goo"); + log_debug("\"%s\"", p1); + assert_se(utf8_is_valid(p1)); + + p2 = utf8_escape_non_printable("\341\204\341\204"); + log_debug("\"%s\"", p2); + assert_se(utf8_is_valid(p2)); + + p3 = utf8_escape_non_printable("\341\204"); + log_debug("\"%s\"", p3); + assert_se(utf8_is_valid(p3)); + + p4 = utf8_escape_non_printable("ąę\n가너도루\n1234\n\341\204\341\204\n\001 \019\20\a"); + log_debug("\"%s\"", p4); + assert_se(utf8_is_valid(p4)); + + p5 = utf8_escape_non_printable("\001 \019\20\a"); + log_debug("\"%s\"", p5); + assert_se(utf8_is_valid(p5)); + + p6 = utf8_escape_non_printable("\xef\xbf\x30\x13"); + log_debug("\"%s\"", p6); + assert_se(utf8_is_valid(p6)); +} + +TEST(utf8_escape_non_printable_full) { + FOREACH_STRING(s, + "goo goo goo", /* ASCII */ + "\001 \019\20\a", /* control characters */ + "\xef\xbf\x30\x13") /* misplaced continuation bytes followed by a digit and cc */ + for (size_t cw = 0; cw < 22; cw++) { + _cleanup_free_ char *p = NULL, *q = NULL; + size_t ew; + + p = utf8_escape_non_printable_full(s, cw, false); + ew = utf8_console_width(p); + log_debug("%02zu \"%s\" (%zu wasted)", cw, p, cw - ew); + assert_se(utf8_is_valid(p)); + assert_se(ew <= cw); + + q = 
utf8_escape_non_printable_full(s, cw, true); + ew = utf8_console_width(q); + log_debug(" \"%s\" (%zu wasted)", q, cw - ew); + assert_se(utf8_is_valid(q)); + assert_se(ew <= cw); + if (cw > 0) + assert_se(endswith(q, "…")); + } +} + +TEST(utf16_to_utf8) { + const char16_t utf16[] = { htole16('a'), htole16(0xd800), htole16('b'), htole16(0xdc00), htole16('c'), htole16(0xd801), htole16(0xdc37) }; + static const char utf8[] = { 'a', 'b', 'c', 0xf0, 0x90, 0x90, 0xb7 }; + _cleanup_free_ char16_t *b = NULL; + _cleanup_free_ char *a = NULL; + + /* Convert UTF-16 to UTF-8, filtering embedded bad chars */ + a = utf16_to_utf8(utf16, sizeof(utf16)); + assert_se(a); + assert_se(memcmp(a, utf8, sizeof(utf8)) == 0); + + /* Convert UTF-8 to UTF-16, and back */ + b = utf8_to_utf16(utf8, sizeof(utf8)); + assert_se(b); + + free(a); + a = utf16_to_utf8(b, SIZE_MAX); + assert_se(a); + assert_se(strlen(a) == sizeof(utf8)); + assert_se(memcmp(a, utf8, sizeof(utf8)) == 0); +} + +TEST(utf8_n_codepoints) { + assert_se(utf8_n_codepoints("abc") == 3); + assert_se(utf8_n_codepoints("zażółcić gęślą jaźń") == 19); + assert_se(utf8_n_codepoints("串") == 1); + assert_se(utf8_n_codepoints("") == 0); + assert_se(utf8_n_codepoints("…👊🔪💐…") == 5); + assert_se(utf8_n_codepoints("\xF1") == SIZE_MAX); +} + +TEST(utf8_console_width) { + assert_se(utf8_console_width("abc") == 3); + assert_se(utf8_console_width("zażółcić gęślą jaźń") == 19); + assert_se(utf8_console_width("串") == 2); + assert_se(utf8_console_width("") == 0); + assert_se(utf8_console_width("…👊🔪💐…") == 8); + assert_se(utf8_console_width("\xF1") == SIZE_MAX); +} + +TEST(utf8_to_utf16) { + FOREACH_STRING(p, + "abc", + "zażółcić gęślą jaźń", + "串", + "", + "…👊🔪💐…") { + + _cleanup_free_ char16_t *a = NULL; + _cleanup_free_ char *b = NULL; + + a = utf8_to_utf16(p, SIZE_MAX); + assert_se(a); + + b = utf16_to_utf8(a, SIZE_MAX); + assert_se(b); + assert_se(streq(p, b)); + } +} + +static int intro(void) { + log_show_color(true); + return EXIT_SUCCESS; 
+} + +DEFINE_TEST_MAIN_WITH_INTRO(LOG_INFO, intro); diff --git a/src/test/test-utmp.c b/src/test/test-utmp.c new file mode 100644 index 0000000..06a0fce --- /dev/null +++ b/src/test/test-utmp.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "format-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "utmp-wtmp.h" +#include "tests.h" + +#ifndef UT_LINESIZE +# define UT_LINESIZE 32 +#endif +#ifndef UT_NAMESIZE +# define UT_NAMESIZE 32 +#endif +#ifndef UT_HOSTSIZE +# define UT_HOSTSIZE 256 +#endif + +TEST(dump_run_utmp) { + _unused_ _cleanup_(utxent_cleanup) bool utmpx = false; + + utmpx = utxent_start(); + + for (struct utmpx *u; (u = getutxent()); ) { + char _type_buf[DECIMAL_STR_MAX(short)]; + const char *type = + u->ut_type == EMPTY ? "EMPTY" : + u->ut_type == RUN_LVL ? "RUN_LVL" : + u->ut_type == BOOT_TIME ? "BOOT_TIME" : + u->ut_type == NEW_TIME ? "NEW_TIME" : + u->ut_type == OLD_TIME ? "OLD_TIME" : + u->ut_type == INIT_PROCESS ? "INIT_PROCESS" : + u->ut_type == LOGIN_PROCESS ? "LOGIN_PROCESS" : + u->ut_type == USER_PROCESS ? "USER_PROCESS" : + u->ut_type == DEAD_PROCESS ? "DEAD_PROCESS" : + u->ut_type == ACCOUNTING ? "ACCOUNTING" : + _type_buf; + if (type == _type_buf) + xsprintf(_type_buf, "%hd", u->ut_type); + + union in_addr_union addr = {}; + memcpy(&addr, u->ut_addr_v6, MIN(sizeof(addr), sizeof(u->ut_addr_v6))); + bool is_ipv4 = memeqzero((const uint8_t*) &addr + 4, sizeof(addr) - 4); + + log_info("%14s %10"PID_PRI" line=%-7.*s id=%-4.4s name=%-8.*s session=%lu host=%.*s addr=%s", + type, + u->ut_pid, + UT_LINESIZE, u->ut_line, + u->ut_id, + UT_NAMESIZE, u->ut_user, + (long unsigned) u->ut_session, + UT_HOSTSIZE, u->ut_host, + IN_ADDR_TO_STRING(is_ipv4 ? 
AF_INET : AF_INET6, &addr)); + } +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-varlink-idl.c b/src/test/test-varlink-idl.c new file mode 100644 index 0000000..cbdb9c6 --- /dev/null +++ b/src/test/test-varlink-idl.c @@ -0,0 +1,385 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "fd-util.h" +#include "pretty-print.h" +#include "tests.h" +#include "varlink.h" +#include "varlink-idl.h" +#include "varlink-io.systemd.h" +#include "varlink-io.systemd.Journal.h" +#include "varlink-io.systemd.ManagedOOM.h" +#include "varlink-io.systemd.PCRExtend.h" +#include "varlink-io.systemd.Resolve.Monitor.h" +#include "varlink-io.systemd.Resolve.h" +#include "varlink-io.systemd.UserDatabase.h" +#include "varlink-io.systemd.oom.h" +#include "varlink-io.systemd.service.h" +#include "varlink-io.systemd.sysext.h" +#include "varlink-org.varlink.service.h" + +static VARLINK_DEFINE_ENUM_TYPE( + EnumTest, + VARLINK_DEFINE_ENUM_VALUE(foo), + VARLINK_DEFINE_ENUM_VALUE(bar), + VARLINK_DEFINE_ENUM_VALUE(baz)); + +static VARLINK_DEFINE_STRUCT_TYPE( + NestedStructTest, + VARLINK_DEFINE_FIELD(x, VARLINK_INT, 0)); + +static VARLINK_DEFINE_STRUCT_TYPE( + StructTest, + + VARLINK_DEFINE_FIELD(bbb, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD(bbbn, VARLINK_BOOL, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(bbba, VARLINK_BOOL, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(bbbna, VARLINK_BOOL, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(bbbm, VARLINK_BOOL, VARLINK_MAP), + VARLINK_DEFINE_FIELD(bbbnm, VARLINK_BOOL, VARLINK_NULLABLE|VARLINK_MAP), + + VARLINK_DEFINE_FIELD(iii, VARLINK_INT, 0), + VARLINK_DEFINE_FIELD(iiin, VARLINK_INT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(iiia, VARLINK_INT, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(iiina, VARLINK_INT, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(iiim, VARLINK_INT, VARLINK_MAP), + VARLINK_DEFINE_FIELD(iiinm, VARLINK_INT, VARLINK_NULLABLE|VARLINK_MAP), + + VARLINK_DEFINE_FIELD(fff, VARLINK_FLOAT, 0), + 
VARLINK_DEFINE_FIELD(fffn, VARLINK_FLOAT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(fffa, VARLINK_FLOAT, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(fffna, VARLINK_FLOAT, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(fffm, VARLINK_FLOAT, VARLINK_MAP), + VARLINK_DEFINE_FIELD(fffnm, VARLINK_FLOAT, VARLINK_NULLABLE|VARLINK_MAP), + + VARLINK_DEFINE_FIELD(sss, VARLINK_STRING, 0), + VARLINK_DEFINE_FIELD(sssn, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(sssa, VARLINK_STRING, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(sssna, VARLINK_STRING, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(sssm, VARLINK_STRING, VARLINK_MAP), + VARLINK_DEFINE_FIELD(sssnm, VARLINK_STRING, VARLINK_NULLABLE|VARLINK_MAP), + + VARLINK_DEFINE_FIELD(ooo, VARLINK_OBJECT, 0), + VARLINK_DEFINE_FIELD(ooon, VARLINK_OBJECT, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD(oooa, VARLINK_OBJECT, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(ooona, VARLINK_OBJECT, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD(ooom, VARLINK_OBJECT, VARLINK_MAP), + VARLINK_DEFINE_FIELD(ooonm, VARLINK_OBJECT, VARLINK_NULLABLE|VARLINK_MAP), + + VARLINK_DEFINE_FIELD_BY_TYPE(eee, EnumTest, 0), + VARLINK_DEFINE_FIELD_BY_TYPE(eeen, EnumTest, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD_BY_TYPE(eeea, EnumTest, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD_BY_TYPE(eeena, EnumTest, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD_BY_TYPE(eeem, EnumTest, VARLINK_MAP), + VARLINK_DEFINE_FIELD_BY_TYPE(eeenm, EnumTest, VARLINK_NULLABLE|VARLINK_MAP), + + VARLINK_DEFINE_FIELD_BY_TYPE(nnn, NestedStructTest, 0), + VARLINK_DEFINE_FIELD_BY_TYPE(nnnn, NestedStructTest, VARLINK_NULLABLE), + VARLINK_DEFINE_FIELD_BY_TYPE(nnna, NestedStructTest, VARLINK_ARRAY), + VARLINK_DEFINE_FIELD_BY_TYPE(nnnna, NestedStructTest, VARLINK_NULLABLE|VARLINK_ARRAY), + VARLINK_DEFINE_FIELD_BY_TYPE(nnnm, NestedStructTest, VARLINK_MAP), + VARLINK_DEFINE_FIELD_BY_TYPE(nnnnm, NestedStructTest, VARLINK_NULLABLE|VARLINK_MAP)); + +static VARLINK_DEFINE_METHOD( 
+ MethodTest, + VARLINK_DEFINE_INPUT(x, VARLINK_BOOL, 0), + VARLINK_DEFINE_INPUT_BY_TYPE(y, EnumTest, 0), + VARLINK_DEFINE_INPUT_BY_TYPE(z, StructTest, 0), + VARLINK_DEFINE_OUTPUT(x, VARLINK_BOOL, 0), + VARLINK_DEFINE_OUTPUT_BY_TYPE(y, EnumTest, 0), + VARLINK_DEFINE_OUTPUT_BY_TYPE(z, StructTest, 0)); + +static VARLINK_DEFINE_ERROR( + ErrorTest, + VARLINK_DEFINE_FIELD(x, VARLINK_BOOL, 0), + VARLINK_DEFINE_FIELD_BY_TYPE(y, EnumTest, 0), + VARLINK_DEFINE_FIELD_BY_TYPE(z, StructTest, 0)); + +static VARLINK_DEFINE_INTERFACE( + xyz_test, + "xyz.test", + &vl_type_EnumTest, + &vl_type_NestedStructTest, + &vl_type_StructTest, + &vl_method_MethodTest, + &vl_error_ErrorTest); + +static void test_parse_format_one(const VarlinkInterface *iface) { + _cleanup_(varlink_interface_freep) VarlinkInterface *parsed = NULL; + _cleanup_free_ char *text = NULL, *text2 = NULL; + + assert_se(iface); + + assert_se(varlink_idl_dump(stdout, /* use_colors=*/ true, iface) >= 0); + assert_se(varlink_idl_consistent(iface, LOG_ERR) >= 0); + assert_se(varlink_idl_format(iface, &text) >= 0); + assert_se(varlink_idl_parse(text, NULL, NULL, &parsed) >= 0); + assert_se(varlink_idl_consistent(parsed, LOG_ERR) >= 0); + assert_se(varlink_idl_format(parsed, &text2) >= 0); + assert_se(streq(text, text2)); +} + +TEST(parse_format) { + test_parse_format_one(&vl_interface_org_varlink_service); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd_UserDatabase); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd_Journal); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd_Resolve); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd_Resolve_Monitor); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd_ManagedOOM); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd_oom); + print_separator(); + test_parse_format_one(&vl_interface_io_systemd); + print_separator(); + 
/* Parses a hand-written IDL text, runs it through the format/parse round-trip helper, and then
 * checks the error codes returned for two malformed definitions. */
TEST(parse) {
        _cleanup_(varlink_interface_freep) VarlinkInterface *parsed = NULL;

        /* This one has (nested) anonymous enums and structs */
        static const char text[] =
                "interface quu.waa\n"
                "type Fooenum ( a, b, c )\n"
                "type Barstruct ( a : (x, y, z), b : (x : int), c: (f, ff, fff), d: object, e : (sub : (subsub: (subsubsub: string, subsubsub2: (iii, ooo)))))"
                ;

        /* The valid text must parse, and the result must survive the round-trip check */
        assert_se(varlink_idl_parse(text, NULL, NULL, &parsed) >= 0);
        test_parse_format_one(parsed);

        /* No output pointer is passed below: only the return code of the parser is of interest */
        assert_se(varlink_idl_parse("interface org.freedesktop.Foo\n"
                                    "type Foo (b: bool, c: foo, c: int)", NULL, NULL, NULL) == -ENETUNREACH); /* unresolved type */
        assert_se(varlink_idl_parse("interface org.freedesktop.Foo\n"
                                    "type Foo ()", NULL, NULL, NULL) == -EBADMSG); /* empty struct/enum */

}
assert_se(varlink_idl_interface_name_is_valid("io.systemd.Foobar")); +} + +TEST(symbol_name_is_valid) { + assert_se(!varlink_idl_symbol_name_is_valid(NULL)); + assert_se(!varlink_idl_symbol_name_is_valid("")); + assert_se(!varlink_idl_symbol_name_is_valid("_")); + assert_se(!varlink_idl_symbol_name_is_valid("_foo")); + assert_se(varlink_idl_symbol_name_is_valid("Foofoo")); + assert_se(varlink_idl_symbol_name_is_valid("Foo")); + assert_se(varlink_idl_symbol_name_is_valid("Foo0")); + assert_se(!varlink_idl_symbol_name_is_valid("0Foo")); + assert_se(!varlink_idl_symbol_name_is_valid("foo")); + assert_se(varlink_idl_symbol_name_is_valid("Foo0foo")); + assert_se(!varlink_idl_symbol_name_is_valid("bool")); + assert_se(!varlink_idl_symbol_name_is_valid("int")); + assert_se(!varlink_idl_symbol_name_is_valid("float")); + assert_se(!varlink_idl_symbol_name_is_valid("string")); + assert_se(!varlink_idl_symbol_name_is_valid("object")); +} + +TEST(field_name_is_valid) { + assert_se(!varlink_idl_field_name_is_valid(NULL)); + assert_se(!varlink_idl_field_name_is_valid("")); + assert_se(!varlink_idl_field_name_is_valid("_")); + assert_se(!varlink_idl_field_name_is_valid("_foo")); + assert_se(!varlink_idl_field_name_is_valid("_foo_")); + assert_se(!varlink_idl_field_name_is_valid("foo_")); + assert_se(varlink_idl_field_name_is_valid("foo_foo")); + assert_se(varlink_idl_field_name_is_valid("f_o_o_f_o_o")); + assert_se(!varlink_idl_field_name_is_valid("foo__foo")); + assert_se(varlink_idl_field_name_is_valid("Foofoo")); + assert_se(varlink_idl_field_name_is_valid("Foo")); + assert_se(varlink_idl_field_name_is_valid("Foo0")); + assert_se(!varlink_idl_field_name_is_valid("0Foo")); + assert_se(varlink_idl_field_name_is_valid("foo")); + assert_se(varlink_idl_field_name_is_valid("Foo0foo")); + assert_se(varlink_idl_field_name_is_valid("foo0foo")); +} + +TEST(validate_json) { + + _cleanup_(varlink_interface_freep) VarlinkInterface *parsed = NULL; + + /* This one has (nested) enonymous 
enums and structs */ + static const char text[] = + "interface validate.test\n" + "method Mymethod ( a:string, b:int, c:?bool, d:[]int, e:?[string]bool, f:?(piff, paff), g:(f:float) ) -> ()\n"; + + assert_se(varlink_idl_parse(text, NULL, NULL, &parsed) >= 0); + test_parse_format_one(parsed); + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + assert_se(json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("a", JSON_BUILD_STRING("x")), + JSON_BUILD_PAIR("b", JSON_BUILD_UNSIGNED(44)), + JSON_BUILD_PAIR("d", JSON_BUILD_ARRAY(JSON_BUILD_UNSIGNED(5), JSON_BUILD_UNSIGNED(7), JSON_BUILD_UNSIGNED(107))), + JSON_BUILD_PAIR("g", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("f", JSON_BUILD_REAL(0.5f)))))) >= 0); + + json_variant_dump(v, JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO, stdout, NULL); + + const VarlinkSymbol* symbol = ASSERT_PTR(varlink_idl_find_symbol(parsed, VARLINK_METHOD, "Mymethod")); + + assert_se(varlink_idl_validate_method_call(symbol, v, NULL) >= 0); +} + +static int test_recursive_one(unsigned depth) { + _cleanup_(varlink_interface_freep) VarlinkInterface *parsed = NULL; + _cleanup_free_ char *pre = NULL, *post = NULL, *text = NULL; + static const char header[] = + "interface recursive.test\n" + "type Foo (\n"; + + /* Generate a chain of nested structures, i.e. a: (a: (... (int))...) 
*/ + pre = strrep("a:(", depth); + post = strrep(")", depth); + if (!pre || !post) + return log_oom(); + + text = strjoin(header, pre, "int", post, ")"); + if (!text) + return log_oom(); + + return varlink_idl_parse(text, NULL, NULL, &parsed); +} + +TEST(recursive) { + assert_se(test_recursive_one(32) >= 0); + assert_se(test_recursive_one(64) >= 0); + + /* We should handle this gracefully without a stack overflow */ + assert_se(test_recursive_one(65) < 0); + assert_se(test_recursive_one(20000) < 0 ); +} + +static int test_method(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + JsonVariant *foo = json_variant_by_key(parameters, "foo"), *bar = json_variant_by_key(parameters, "bar"); + + return varlink_replyb(link, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_UNSIGNED("waldo", json_variant_unsigned(foo) * json_variant_unsigned(bar)), + JSON_BUILD_PAIR_UNSIGNED("quux", json_variant_unsigned(foo) + json_variant_unsigned(bar)))); +} + +static int done_method(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + assert_se(sd_event_exit(varlink_get_event(link), 0) >= 0); + return 0; +} + +static VARLINK_DEFINE_METHOD( + TestMethod, + VARLINK_DEFINE_INPUT(foo, VARLINK_INT, 0), + VARLINK_DEFINE_INPUT(bar, VARLINK_INT, 0), + VARLINK_DEFINE_INPUT(optional, VARLINK_STRING, VARLINK_NULLABLE), + VARLINK_DEFINE_OUTPUT(waldo, VARLINK_INT, 0), + VARLINK_DEFINE_OUTPUT(quux, VARLINK_INT, 0)); + +static VARLINK_DEFINE_METHOD(Done); + +static VARLINK_DEFINE_INTERFACE( + xyz, + "xyz", + &vl_method_TestMethod, + &vl_method_Done); + + +static void* server_thread(void *userdata) { + _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + + assert_se(varlink_server_new(&server, 0) >= 0); + assert_se(varlink_server_add_interface(server, &vl_interface_xyz) >= 0); + assert_se(varlink_server_bind_method(server, "xyz.TestMethod", test_method) >= 0); + 
/* End-to-end check of method call validation: a server thread (server_thread()) serves the "xyz"
 * interface on one end of a socket pair, and this function acts as the client on the other end. */
TEST(validate_method_call) {
        _cleanup_close_pair_ int fd[2] = EBADF_PAIR;
        _cleanup_(varlink_unrefp) Varlink *v = NULL;
        pthread_t t;

        /* fd[1] is handed to the server thread, fd[0] to our client connection. TAKE_FD() is applied
         * to both ends so the _cleanup_close_pair_ handler above is left with nothing to close. */
        assert_se(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0, fd) >= 0);
        assert_se(pthread_create(&t, NULL, server_thread, FD_TO_PTR(TAKE_FD(fd[1]))) == 0);
        assert_se(varlink_connect_fd(&v, TAKE_FD(fd[0])) >= 0);

        /* 1. A call with exactly the two mandatory inputs must succeed */
        JsonVariant *reply = NULL;
        const char *error_id = NULL;
        assert_se(varlink_callb(v, "xyz.TestMethod", &reply, &error_id, NULL,
                                JSON_BUILD_OBJECT(
                                                JSON_BUILD_PAIR_UNSIGNED("foo", 8),
                                                JSON_BUILD_PAIR_UNSIGNED("bar", 9))) >= 0);

        /* test_method() replies with waldo = foo * bar and quux = foo + bar */
        _cleanup_(json_variant_unrefp) JsonVariant *expected_reply = NULL;
        assert_se(json_build(&expected_reply,
                             JSON_BUILD_OBJECT(
                                             JSON_BUILD_PAIR_UNSIGNED("waldo", 8*9),
                                             JSON_BUILD_PAIR_UNSIGNED("quux", 8+9))) >= 0);

        assert_se(!error_id);

        json_variant_dump(reply, JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO, NULL, NULL);
        json_variant_dump(expected_reply, JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO, NULL, NULL);
        assert_se(json_variant_equal(reply, expected_reply));

        /* 2. Swapped operands plus the nullable "optional" input: product and sum are unchanged, so
         * the same reply is expected */
        assert_se(varlink_callb(v, "xyz.TestMethod", &reply, &error_id, NULL,
                                JSON_BUILD_OBJECT(
                                                JSON_BUILD_PAIR_UNSIGNED("foo", 9),
                                                JSON_BUILD_PAIR_UNSIGNED("bar", 8),
                                                JSON_BUILD_PAIR_STRING("optional", "pfft"))) >= 0);

        assert_se(!error_id);
        assert_se(json_variant_equal(reply, expected_reply));

        /* 3. "zzz" is not among the inputs declared for TestMethod: the call itself goes through,
         * but an invalid-parameter error must come back */
        assert_se(varlink_callb(v, "xyz.TestMethod", &reply, &error_id, NULL,
                                JSON_BUILD_OBJECT(
                                                JSON_BUILD_PAIR_UNSIGNED("foo", 8),
                                                JSON_BUILD_PAIR_UNSIGNED("bar", 9),
                                                JSON_BUILD_PAIR_STRING("zzz", "pfft"))) >= 0);
        assert_se(streq_ptr(error_id, VARLINK_ERROR_INVALID_PARAMETER));

        /* 4. "foo" is declared as VARLINK_INT but a string is passed: same error expected */
        assert_se(varlink_callb(v, "xyz.TestMethod", &reply, &error_id, NULL,
                                JSON_BUILD_OBJECT(
                                                JSON_BUILD_PAIR_STRING("foo", "wuff"),
                                                JSON_BUILD_PAIR_UNSIGNED("bar", 9))) >= 0);
        assert_se(streq_ptr(error_id, VARLINK_ERROR_INVALID_PARAMETER));

        /* "xyz.Done" makes done_method() exit the server's event loop; flush so the message is
         * actually written out before we wait for the server thread to finish */
        assert_se(varlink_send(v, "xyz.Done", NULL) >= 0);
        assert_se(varlink_flush(v) >= 0);
        assert_se(pthread_join(t, NULL) == 0);
}
*/ +#define OVERLOAD_CONNECTIONS 333 + +static int n_done = 0; +static int block_write_fd = -EBADF; + +static int method_something(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *ret = NULL; + JsonVariant *a, *b; + int64_t x, y; + int r; + + a = json_variant_by_key(parameters, "a"); + if (!a) + return varlink_error(link, "io.test.BadParameters", NULL); + + x = json_variant_integer(a); + + b = json_variant_by_key(parameters, "b"); + if (!b) + return varlink_error(link, "io.test.BadParameters", NULL); + + y = json_variant_integer(b); + + r = json_build(&ret, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("sum", JSON_BUILD_INTEGER(x + y)))); + if (r < 0) + return r; + + return varlink_reply(link, ret); +} + +static int method_something_more(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *ret = NULL; + int r; + + struct Something { + int x; + int y; + }; + + static const JsonDispatch dispatch_table[] = { + { "a", JSON_VARIANT_INTEGER, json_dispatch_int, offsetof(struct Something, x), JSON_MANDATORY }, + { "b", JSON_VARIANT_INTEGER, json_dispatch_int, offsetof(struct Something, y), JSON_MANDATORY}, + {} + }; + struct Something s = {}; + + r = varlink_dispatch(link, parameters, dispatch_table, &s); + if (r != 0) + return r; + + for (int i = 0; i < 5; i++) { + _cleanup_(json_variant_unrefp) JsonVariant *w = NULL; + + r = json_build(&w, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("sum", JSON_BUILD_INTEGER(s.x + (s.y * i))))); + if (r < 0) + return r; + + r = varlink_notify(link, w); + if (r < 0) + return r; + } + + r = json_build(&ret, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("sum", JSON_BUILD_INTEGER(s.x + (s.y * 5))))); + if (r < 0) + return r; + + return varlink_reply(link, ret); +} + +static void test_fd(int fd, const void *buf, size_t n) { + char rbuf[n + 1]; + ssize_t m; + + m = read(fd, rbuf, n + 1); + assert_se(m >= 0); + 
assert_se(memcmp_nn(buf, n, rbuf, m) == 0); +} + +static int method_passfd(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + _cleanup_(json_variant_unrefp) JsonVariant *ret = NULL; + JsonVariant *a; + int r; + + a = json_variant_by_key(parameters, "fd"); + if (!a) + return varlink_error(link, "io.test.BadParameters", NULL); + + assert_se(streq_ptr(json_variant_string(a), "whoop")); + + int xx = varlink_peek_fd(link, 0), + yy = varlink_peek_fd(link, 1), + zz = varlink_peek_fd(link, 2); + + log_info("%i %i %i", xx, yy, zz); + + assert_se(xx >= 0); + assert_se(yy >= 0); + assert_se(zz >= 0); + + test_fd(xx, "foo", 3); + test_fd(yy, "bar", 3); + test_fd(zz, "quux", 4); + + _cleanup_close_ int vv = acquire_data_fd("miau", 4, 0); + _cleanup_close_ int ww = acquire_data_fd("wuff", 4, 0); + + assert_se(vv >= 0); + assert_se(ww >= 0); + + r = json_build(&ret, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("yo", JSON_BUILD_INTEGER(88)))); + if (r < 0) + return r; + + assert_se(varlink_push_fd(link, vv) == 0); + assert_se(varlink_push_fd(link, ww) == 1); + + TAKE_FD(vv); + TAKE_FD(ww); + + return varlink_reply(link, ret); +} + +static int method_done(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + if (++n_done == 2) + sd_event_exit(varlink_get_event(link), EXIT_FAILURE); + + return 0; +} + +static int reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) { + JsonVariant *sum; + + sum = json_variant_by_key(parameters, "sum"); + + assert_se(json_variant_integer(sum) == 7+22); + + if (++n_done == 2) + sd_event_exit(varlink_get_event(link), EXIT_FAILURE); + + return 0; +} + +static int on_connect(VarlinkServer *s, Varlink *link, void *userdata) { + uid_t uid = UID_INVALID; + + assert_se(s); + assert_se(link); + + assert_se(varlink_get_peer_uid(link, &uid) >= 0); + assert_se(getuid() == uid); + assert_se(varlink_set_allow_fd_passing_input(link, true) >= 0); + 
assert_se(varlink_set_allow_fd_passing_output(link, true) >= 0); + + return 0; +} + +static int overload_reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) { + + /* This method call reply should always be called with a disconnection, since the method call should + * be talking to an overloaded server */ + + log_debug("Over reply triggered with error: %s", strna(error_id)); + assert_se(streq(error_id, VARLINK_ERROR_DISCONNECTED)); + sd_event_exit(varlink_get_event(link), 0); + + return 0; +} + +static void flood_test(const char *address) { + _cleanup_(varlink_flush_close_unrefp) Varlink *c = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ Varlink **connections = NULL; + size_t k; + char x = 'x'; + + log_debug("Flooding server..."); + + /* Block the main event loop while we flood */ + assert_se(write(block_write_fd, &x, sizeof(x)) == sizeof(x)); + + assert_se(sd_event_default(&e) >= 0); + + /* Flood the server with connections */ + assert_se(connections = new0(Varlink*, OVERLOAD_CONNECTIONS)); + for (k = 0; k < OVERLOAD_CONNECTIONS; k++) { + _cleanup_free_ char *t = NULL; + log_debug("connection %zu", k); + assert_se(varlink_connect_address(connections + k, address) >= 0); + + assert_se(asprintf(&t, "flood-%zu", k) >= 0); + assert_se(varlink_set_description(connections[k], t) >= 0); + assert_se(varlink_attach_event(connections[k], e, k) >= 0); + assert_se(varlink_sendb(connections[k], "io.test.Rubbish", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("id", JSON_BUILD_INTEGER(k)))) >= 0); + } + + /* Then, create one more, which should fail */ + log_debug("Creating overload connection..."); + assert_se(varlink_connect_address(&c, address) >= 0); + assert_se(varlink_set_description(c, "overload-client") >= 0); + assert_se(varlink_attach_event(c, e, k) >= 0); + assert_se(varlink_bind_reply(c, overload_reply) >= 0); + assert_se(varlink_invokeb(c, "io.test.Overload", 
JSON_BUILD_OBJECT(JSON_BUILD_PAIR("foo", JSON_BUILD_CONST_STRING("bar")))) >= 0); + + /* Unblock it */ + log_debug("Unblocking server..."); + block_write_fd = safe_close(block_write_fd); + + /* This loop will terminate as soon as the overload reply callback is called */ + assert_se(sd_event_loop(e) >= 0); + + /* And close all connections again */ + for (k = 0; k < OVERLOAD_CONNECTIONS; k++) + connections[k] = varlink_unref(connections[k]); +} + +static void *thread(void *arg) { + _cleanup_(varlink_flush_close_unrefp) Varlink *c = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *i = NULL, *j = NULL; + JsonVariant *o = NULL, *k = NULL; + const char *error_id; + VarlinkReplyFlags flags = 0; + const char *e; + int x = 0; + + assert_se(json_build(&i, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("a", JSON_BUILD_INTEGER(88)), + JSON_BUILD_PAIR("b", JSON_BUILD_INTEGER(99)))) >= 0); + + assert_se(varlink_connect_address(&c, arg) >= 0); + assert_se(varlink_set_description(c, "thread-client") >= 0); + assert_se(varlink_set_allow_fd_passing_input(c, true) >= 0); + assert_se(varlink_set_allow_fd_passing_output(c, true) >= 0); + + assert_se(varlink_collect(c, "io.test.DoSomethingMore", i, &j, &error_id, &flags) >= 0); + + assert_se(!error_id); + assert_se(!flags); + assert_se(json_variant_is_array(j) && !json_variant_is_blank_array(j)); + + JSON_VARIANT_ARRAY_FOREACH(k, j) { + assert_se(json_variant_integer(json_variant_by_key(k, "sum")) == 88 + (99 * x)); + x++; + } + assert_se(x == 6); + + assert_se(varlink_call(c, "io.test.DoSomething", i, &o, &e, NULL) >= 0); + assert_se(json_variant_integer(json_variant_by_key(o, "sum")) == 88 + 99); + assert_se(!e); + + int fd1 = acquire_data_fd("foo", 3, 0); + int fd2 = acquire_data_fd("bar", 3, 0); + int fd3 = acquire_data_fd("quux", 4, 0); + + assert_se(fd1 >= 0); + assert_se(fd2 >= 0); + assert_se(fd3 >= 0); + + assert_se(varlink_push_fd(c, fd1) == 0); + assert_se(varlink_push_fd(c, fd2) == 1); + assert_se(varlink_push_fd(c, fd3) == 2); + + 
/* I/O event handler for the read end of the "block" pipe that main() registers with the event loop.
 * It is used to stall the event loop on purpose: flood_test() writes one byte to the write end
 * (block_write_fd) to trigger this handler, and later closes the write end to release it again.
 *
 * Always returns 0; any unexpected read result aborts the test through assert_se(). */
static int block_fd_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        char c;

        /* The pipe was created with O_NONBLOCK; switch to blocking mode so the reads below wait */
        assert_se(fd_nonblock(fd, false) >= 0);

        /* Consume the byte whose arrival woke us up */
        assert_se(read(fd, &c, sizeof(c)) == sizeof(c));
        /* When a character is written to this pipe we'll block until the pipe is closed. */

        /* A result of 0 means end-of-file, i.e. the write end has been closed */
        assert_se(read(fd, &c, sizeof(c)) == 0);

        /* Restore non-blocking mode */
        assert_se(fd_nonblock(fd, true) >= 0);

        /* Turn the event source off so this handler is not dispatched again */
        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        return 0;
}
/* Runs dispatch_verb() on the given argument vector and verb table and asserts that it returns
 * 'expected'. optind is reset first so that every invocation starts from a clean state.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one statement and can be used
 * safely in unbraced if/else bodies; the caller supplies the terminating semicolon. The argument
 * vector is evaluated once only, and the remaining arguments are parenthesized. */
#define test_dispatch_one(argv, verbs, expected)                                                \
        do {                                                                                    \
                char **_dispatch_argv = (argv);                                                 \
                                                                                                \
                optind = 0;                                                                     \
                assert_se(dispatch_verb(strv_length(_dispatch_argv), _dispatch_argv,            \
                                        (verbs), NULL) == (expected));                          \
        } while (0)
available"); + + _cleanup_free_ char *unit_dir = NULL; + assert_se(get_testdata_dir("units/", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + + assert_se(runtime_dir = setup_fake_runtime_dir()); + + assert_se(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL, NULL) >= 0); + + assert_se(a = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(a, "a.service") >= 0); + assert_se(set_isempty(a->pids)); + + assert_se(b = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(b, "b.service") >= 0); + assert_se(set_isempty(b->pids)); + + assert_se(c = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(c, "c.service") >= 0); + assert_se(set_isempty(c->pids)); + + /* Fork off a child so that we have a PID to watch */ + _cleanup_(sigkill_waitp) pid_t pid = 0; + pid = fork(); + if (pid == 0) { + /* Child */ + pause(); + _exit(EXIT_SUCCESS); + } + + assert_se(pid >= 0); + + assert_se(hashmap_isempty(m->watch_pids)); + assert_se(manager_get_unit_by_pid(m, pid) == NULL); + + assert_se(unit_watch_pid(a, pid, false) >= 0); + assert_se(manager_get_unit_by_pid(m, pid) == a); + + assert_se(unit_watch_pid(a, pid, false) >= 0); + assert_se(manager_get_unit_by_pid(m, pid) == a); + + assert_se(unit_watch_pid(b, pid, false) >= 0); + u = manager_get_unit_by_pid(m, pid); + assert_se(u == a || u == b); + + assert_se(unit_watch_pid(b, pid, false) >= 0); + u = manager_get_unit_by_pid(m, pid); + assert_se(u == a || u == b); + + assert_se(unit_watch_pid(c, pid, false) >= 0); + u = manager_get_unit_by_pid(m, pid); + assert_se(u == a || u == b || u == c); + + assert_se(unit_watch_pid(c, pid, false) >= 0); + u = manager_get_unit_by_pid(m, pid); + assert_se(u == a || u == b || u == c); + + unit_unwatch_pid(b, pid); + u = manager_get_unit_by_pid(m, pid); + assert_se(u == a || u == c); + + unit_unwatch_pid(b, pid); + u = manager_get_unit_by_pid(m, pid); + assert_se(u == a || u == c); + + unit_unwatch_pid(a, pid); + 
assert_se(manager_get_unit_by_pid(m, pid) == c); + + unit_unwatch_pid(a, pid); + assert_se(manager_get_unit_by_pid(m, pid) == c); + + unit_unwatch_pid(c, pid); + assert_se(manager_get_unit_by_pid(m, pid) == NULL); + + unit_unwatch_pid(c, pid); + assert_se(manager_get_unit_by_pid(m, pid) == NULL); + + return 0; +} diff --git a/src/test/test-watchdog.c b/src/test/test-watchdog.c new file mode 100644 index 0000000..70d069c --- /dev/null +++ b/src/test/test-watchdog.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "log.h" +#include "tests.h" +#include "watchdog.h" + +int main(int argc, char *argv[]) { + usec_t t; + unsigned i, count; + int r; + bool slow; + + test_setup_logging(LOG_DEBUG); + + slow = slow_tests_enabled(); + + t = slow ? 10 * USEC_PER_SEC : 2 * USEC_PER_SEC; + count = slow ? 5 : 3; + + r = watchdog_setup(t); + if (r < 0) + log_warning_errno(r, "Failed to open watchdog: %m"); + + for (i = 0; i < count; i++) { + t = watchdog_runtime_wait(); + log_info("Sleeping " USEC_FMT " microseconds...", t); + usleep_safe(t); + log_info("Pinging..."); + r = watchdog_ping(); + if (r < 0) + log_warning_errno(r, "Failed to ping watchdog: %m"); + } + + watchdog_close(true); + return 0; +} diff --git a/src/test/test-web-util.c b/src/test/test-web-util.c new file mode 100644 index 0000000..d376d4a --- /dev/null +++ b/src/test/test-web-util.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "macro.h" +#include "tests.h" +#include "web-util.h" + +TEST(is_valid_documentation_url) { + assert_se(documentation_url_is_valid("https://www.freedesktop.org/wiki/Software/systemd")); + assert_se(documentation_url_is_valid("https://www.kernel.org/doc/Documentation/binfmt_misc.txt")); /* dead */ + assert_se(documentation_url_is_valid("https://www.kernel.org/doc/Documentation/admin-guide/binfmt-misc.rst")); + assert_se(documentation_url_is_valid("https://docs.kernel.org/admin-guide/binfmt-misc.html")); + 
assert_se(documentation_url_is_valid("file:/foo/foo")); + assert_se(documentation_url_is_valid("man:systemd.special(7)")); + assert_se(documentation_url_is_valid("info:bar")); + + assert_se(!documentation_url_is_valid("foo:")); + assert_se(!documentation_url_is_valid("info:")); + assert_se(!documentation_url_is_valid("")); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/test/test-xattr-util.c b/src/test/test-xattr-util.c new file mode 100644 index 0000000..85901c9 --- /dev/null +++ b/src/test/test-xattr-util.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "macro.h" +#include "rm-rf.h" +#include "string-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "xattr-util.h" + +TEST(getxattr_at_malloc) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_free_ char *value = NULL; + _cleanup_close_ int fd = -EBADF; + const char *x; + int r; + + fd = mkdtemp_open("/var/tmp/test-xattrtestXXXXXX", O_RDONLY|O_NOCTTY, &t); + assert_se(fd >= 0); + x = strjoina(t, "/test"); + assert_se(touch(x) >= 0); + + r = setxattr(x, "user.foo", "bar", 3, 0); + if (r < 0 && ERRNO_IS_NOT_SUPPORTED(errno)) + return (void) log_tests_skipped_errno(errno, "no xattrs supported on /var/tmp"); + assert_se(r >= 0); + + assert_se(getxattr_at_malloc(fd, "test", "user.foo", 0, &value) == 3); + assert_se(memcmp(value, "bar", 3) == 0); + value = mfree(value); + + assert_se(getxattr_at_malloc(AT_FDCWD, x, "user.foo", 0, &value) == 3); + assert_se(memcmp(value, "bar", 3) == 0); + value = mfree(value); + + safe_close(fd); + fd = open("/", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY); + assert_se(fd >= 0); + r = getxattr_at_malloc(fd, "usr", "user.idontexist", 0, &value); + assert_se(r < 0 && ERRNO_IS_XATTR_ABSENT(r)); + + safe_close(fd); + fd = open(x, O_PATH|O_CLOEXEC); + assert_se(fd >= 0); + assert_se(getxattr_at_malloc(fd, 
NULL, "user.foo", 0, &value) == 3); + assert_se(streq(value, "bar")); +} + +TEST(getcrtime) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int fd = -EBADF; + usec_t usec, k; + int r; + + fd = mkdtemp_open("/var/tmp/test-xattrtestXXXXXX", 0, &t); + assert_se(fd >= 0); + + r = fd_getcrtime(fd, &usec); + if (r < 0) + log_debug_errno(r, "btime: %m"); + else + log_debug("btime: %s", FORMAT_TIMESTAMP(usec)); + + k = now(CLOCK_REALTIME); + + r = fd_setcrtime(fd, 1519126446UL * USEC_PER_SEC); + if (!IN_SET(r, -EOPNOTSUPP, -ENOTTY)) { + assert_se(fd_getcrtime(fd, &usec) >= 0); + assert_se(k < 1519126446UL * USEC_PER_SEC || + usec == 1519126446UL * USEC_PER_SEC); + } +} + +static void verify_xattr(int dfd, const char *expected) { + _cleanup_free_ char *value = NULL; + + assert_se(getxattr_at_malloc(dfd, "test", "user.foo", 0, &value) == (int) strlen(expected)); + assert_se(streq(value, expected)); +} + +TEST(xsetxattr) { + _cleanup_(rm_rf_physical_and_freep) char *t = NULL; + _cleanup_close_ int dfd = -EBADF, fd = -EBADF; + const char *x; + int r; + + dfd = mkdtemp_open("/var/tmp/test-xattrtestXXXXXX", O_PATH, &t); + assert_se(dfd >= 0); + x = strjoina(t, "/test"); + assert_se(touch(x) >= 0); + + /* by full path */ + r = xsetxattr(AT_FDCWD, x, "user.foo", "fullpath", SIZE_MAX, 0); + if (r < 0 && ERRNO_IS_NOT_SUPPORTED(r)) + return (void) log_tests_skipped_errno(r, "no xattrs supported on /var/tmp"); + assert_se(r >= 0); + verify_xattr(dfd, "fullpath"); + + /* by dirfd */ + assert_se(xsetxattr(dfd, "test", "user.foo", "dirfd", SIZE_MAX, 0) >= 0); + verify_xattr(dfd, "dirfd"); + + /* by fd (O_PATH) */ + fd = openat(dfd, "test", O_PATH|O_CLOEXEC); + assert_se(fd >= 0); + assert_se(xsetxattr(fd, NULL, "user.foo", "fd_opath", SIZE_MAX, 0) >= 0); + verify_xattr(dfd, "fd_opath"); + assert_se(xsetxattr(fd, "", "user.foo", "fd_opath", SIZE_MAX, 0) == -EINVAL); + assert_se(xsetxattr(fd, "", "user.foo", "fd_opath_empty", SIZE_MAX, AT_EMPTY_PATH) >= 0); + 
verify_xattr(dfd, "fd_opath_empty"); + fd = safe_close(fd); + + fd = openat(dfd, "test", O_RDONLY|O_CLOEXEC); + assert_se(xsetxattr(fd, NULL, "user.foo", "fd_regular", SIZE_MAX, 0) >= 0); + verify_xattr(dfd, "fd_regular"); + assert_se(xsetxattr(fd, "", "user.foo", "fd_regular_empty", SIZE_MAX, 0) == -EINVAL); + assert_se(xsetxattr(fd, "", "user.foo", "fd_regular_empty", SIZE_MAX, AT_EMPTY_PATH) >= 0); + verify_xattr(dfd, "fd_regular_empty"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/test/test-xml.c b/src/test/test-xml.c new file mode 100644 index 0000000..a8cb635 --- /dev/null +++ b/src/test/test-xml.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "string-util.h" +#include "tests.h" +#include "xml.h" + +static void test_one(const char *data, ...) { + void *state = NULL; + va_list ap; + + va_start(ap, data); + + for (;;) { + _cleanup_free_ char *name = NULL; + int t, tt; + const char *nn; + + t = xml_tokenize(&data, &name, &state, NULL); + assert_se(t >= 0); + + tt = va_arg(ap, int); + assert_se(tt >= 0); + + assert_se(t == tt); + if (t == XML_END) + break; + + nn = va_arg(ap, const char *); + assert_se(streq_ptr(nn, name)); + } + + va_end(ap); +} + +int main(int argc, char *argv[]) { + + test_setup_logging(LOG_DEBUG); + + test_one("", XML_END); + + test_one("", + XML_TAG_OPEN, "foo", + XML_TAG_CLOSE, "foo", + XML_END); + + test_one("", + XML_TAG_OPEN, "foo", + XML_ATTRIBUTE_NAME, "waldo", + XML_ATTRIBUTE_VALUE, "piep", + XML_ATTRIBUTE_NAME, "meh", + XML_ATTRIBUTE_VALUE, "huhu", + XML_TAG_CLOSE_EMPTY, NULL, + XML_END); + + test_one("xxxx\n" + " ", + XML_TEXT, "xxxx\n", + XML_TAG_OPEN, "foo", + XML_TEXT, " ", + XML_TEXT, " ", + XML_TAG_CLOSE, "foo", + XML_END); + + return 0; +} diff --git a/src/timedate/meson.build b/src/timedate/meson.build new file mode 100644 index 0000000..48054bb --- /dev/null +++ b/src/timedate/meson.build @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: 
LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-timedated', + 'dbus' : true, + 'conditions' : ['ENABLE_TIMEDATED'], + 'sources' : files('timedated.c'), + }, + executable_template + { + 'name' : 'timedatectl', + 'public' : true, + 'conditions' : ['ENABLE_TIMEDATECTL'], + 'sources' : files('timedatectl.c'), + 'dependencies' : libm, + }, +] + +if conf.get('ENABLE_TIMEDATED') == 1 + install_data('org.freedesktop.timedate1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.timedate1.service', + install_dir : dbussystemservicedir) + install_data('org.freedesktop.timedate1.policy', + install_dir : polkitpolicydir) +endif diff --git a/src/timedate/org.freedesktop.timedate1.conf b/src/timedate/org.freedesktop.timedate1.conf new file mode 100644 index 0000000..f91d7b0 --- /dev/null +++ b/src/timedate/org.freedesktop.timedate1.conf @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/timedate/org.freedesktop.timedate1.policy b/src/timedate/org.freedesktop.timedate1.policy new file mode 100644 index 0000000..14bcf2a --- /dev/null +++ b/src/timedate/org.freedesktop.timedate1.policy @@ -0,0 +1,62 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Set system time + Authentication is required to set the system time. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + org.freedesktop.timedate1.set-timezone org.freedesktop.timedate1.set-ntp + + + + Set system timezone + Authentication is required to set the system timezone. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Set RTC to local timezone or UTC + Authentication is required to control whether the RTC stores the local or UTC time. + + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + + Turn network time synchronization on or off + Authentication is required to control whether network time synchronization shall be enabled. 
+ + auth_admin_keep + auth_admin_keep + auth_admin_keep + + + + diff --git a/src/timedate/org.freedesktop.timedate1.service b/src/timedate/org.freedesktop.timedate1.service new file mode 100644 index 0000000..6b82d70 --- /dev/null +++ b/src/timedate/org.freedesktop.timedate1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.timedate1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.timedate1.service diff --git a/src/timedate/timedatectl.c b/src/timedate/timedatectl.c new file mode 100644 index 0000000..418faa5 --- /dev/null +++ b/src/timedate/timedatectl.c @@ -0,0 +1,1039 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" + +#include "build.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-map-properties.h" +#include "bus-print-properties.h" +#include "env-util.h" +#include "format-table.h" +#include "in-addr-util.h" +#include "main-func.h" +#include "pager.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "sparse-endian.h" +#include "spawn-polkit-agent.h" +#include "string-table.h" +#include "strv.h" +#include "terminal-util.h" +#include "verbs.h" + +static PagerFlags arg_pager_flags = 0; +static bool arg_ask_password = true; +static BusTransport arg_transport = BUS_TRANSPORT_LOCAL; +static char *arg_host = NULL; +static bool arg_adjust_system_clock = false; +static bool arg_monitor = false; +static char **arg_property = NULL; +static BusPrintPropertyFlags arg_print_flags = 0; + +typedef struct StatusInfo { + usec_t time; + const char *timezone; + + usec_t rtc_time; + bool rtc_local; + + 
bool ntp_capable; + bool ntp_active; + bool ntp_synced; +} StatusInfo; + +static int print_status_info(const StatusInfo *i) { + _cleanup_(table_unrefp) Table *table = NULL; + const char *old_tz = NULL, *tz, *tz_colon; + bool have_time = false; + char a[LINE_MAX]; + TableCell *cell; + struct tm tm; + time_t sec; + size_t n; + int r; + + assert(i); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + assert_se(cell = table_get_cell(table, 0, 0)); + (void) table_set_ellipsize_percent(table, cell, 100); + + assert_se(cell = table_get_cell(table, 0, 1)); + (void) table_set_ellipsize_percent(table, cell, 100); + + /* Save the old $TZ */ + tz = getenv("TZ"); + if (tz) + old_tz = strdupa_safe(tz); + + /* Set the new $TZ */ + tz_colon = strjoina(":", isempty(i->timezone) ? "UTC" : i->timezone); + if (setenv("TZ", tz_colon, true) < 0) + log_warning_errno(errno, "Failed to set TZ environment variable, ignoring: %m"); + else + tzset(); + + if (i->time != 0) { + sec = (time_t) (i->time / USEC_PER_SEC); + have_time = true; + } else if (IN_SET(arg_transport, BUS_TRANSPORT_LOCAL, BUS_TRANSPORT_MACHINE)) { + sec = time(NULL); + have_time = true; + } else + log_warning("Could not get time from timedated and not operating locally, ignoring."); + + n = have_time ? strftime(a, sizeof a, "%a %Y-%m-%d %H:%M:%S %Z", localtime_r(&sec, &tm)) : 0; + r = table_add_many(table, + TABLE_FIELD, "Local time", + TABLE_STRING, n > 0 ? a : "n/a"); + if (r < 0) + return table_log_add_error(r); + + n = have_time ? strftime(a, sizeof a, "%a %Y-%m-%d %H:%M:%S UTC", gmtime_r(&sec, &tm)) : 0; + r = table_add_many(table, + TABLE_FIELD, "Universal time", + TABLE_STRING, n > 0 ? 
a : "n/a"); + if (r < 0) + return table_log_add_error(r); + + if (i->rtc_time > 0) { + time_t rtc_sec; + + rtc_sec = (time_t) (i->rtc_time / USEC_PER_SEC); + n = strftime(a, sizeof a, "%a %Y-%m-%d %H:%M:%S", gmtime_r(&rtc_sec, &tm)); + } else + n = 0; + r = table_add_many(table, + TABLE_FIELD, "RTC time", + TABLE_STRING, n > 0 ? a : "n/a"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Time zone"); + if (r < 0) + return table_log_add_error(r); + + n = have_time ? strftime(a, sizeof a, "%Z, %z", localtime_r(&sec, &tm)) : 0; + r = table_add_cell_stringf(table, NULL, "%s (%s)", strna(i->timezone), n > 0 ? a : "n/a"); + if (r < 0) + return table_log_add_error(r); + + /* Restore the $TZ */ + r = set_unset_env("TZ", old_tz, true); + if (r < 0) + log_warning_errno(r, "Failed to set TZ environment variable, ignoring: %m"); + else + tzset(); + + r = table_add_many(table, + TABLE_FIELD, "System clock synchronized", + TABLE_BOOLEAN, i->ntp_synced, + TABLE_FIELD, "NTP service", + TABLE_STRING, i->ntp_capable ? (i->ntp_active ? "active" : "inactive") : "n/a", + TABLE_FIELD, "RTC in local TZ", + TABLE_BOOLEAN, i->rtc_local); + if (r < 0) + return table_log_add_error(r); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + if (i->rtc_local) + printf("\n%s" + "Warning: The system is configured to read the RTC time in the local time zone.\n" + " This mode cannot be fully supported. It will create various problems\n" + " with time zone changes and daylight saving time adjustments. 
The RTC\n" + " time is never updated, it relies on external facilities to maintain it.\n" + " If at all possible, use RTC in UTC by calling\n" + " 'timedatectl set-local-rtc 0'.%s\n", ansi_highlight(), ansi_normal()); + + return 0; +} + +static int show_status(int argc, char **argv, void *userdata) { + StatusInfo info = {}; + static const struct bus_properties_map map[] = { + { "Timezone", "s", NULL, offsetof(StatusInfo, timezone) }, + { "LocalRTC", "b", NULL, offsetof(StatusInfo, rtc_local) }, + { "NTP", "b", NULL, offsetof(StatusInfo, ntp_active) }, + { "CanNTP", "b", NULL, offsetof(StatusInfo, ntp_capable) }, + { "NTPSynchronized", "b", NULL, offsetof(StatusInfo, ntp_synced) }, + { "TimeUSec", "t", NULL, offsetof(StatusInfo, time) }, + { "RTCTimeUSec", "t", NULL, offsetof(StatusInfo, rtc_time) }, + {} + }; + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + r = bus_map_all_properties(bus, + "org.freedesktop.timedate1", + "/org/freedesktop/timedate1", + map, + BUS_MAP_BOOLEAN_AS_BOOL, + &error, + &m, + &info); + if (r < 0) + return log_error_errno(r, "Failed to query server: %s", bus_error_message(&error, r)); + + return print_status_info(&info); +} + +static int show_properties(int argc, char **argv, void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + r = bus_print_all_properties(bus, + "org.freedesktop.timedate1", + "/org/freedesktop/timedate1", + NULL, + arg_property, + arg_print_flags, + NULL); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +static int set_time(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + bool relative = false, interactive = arg_ask_password; + sd_bus *bus = userdata; + usec_t t; + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = parse_timestamp(argv[1], &t); + if (r < 0) + 
return log_error_errno(r, "Failed to parse time specification '%s': %m", argv[1]); + + r = bus_call_method( + bus, + bus_timedate, + "SetTime", + &error, + NULL, + "xbb", (int64_t) t, relative, interactive); + if (r < 0) + return log_error_errno(r, "Failed to set time: %s", bus_error_message(&error, r)); + + return 0; +} + +static int set_timezone(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + int r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method(bus, bus_timedate, "SetTimezone", &error, NULL, "sb", argv[1], arg_ask_password); + if (r < 0) + return log_error_errno(r, "Failed to set time zone: %s", bus_error_message(&error, r)); + + return 0; +} + +static int set_local_rtc(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + int r, b; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + b = parse_boolean(argv[1]); + if (b < 0) + return log_error_errno(b, "Failed to parse local RTC setting '%s': %m", argv[1]); + + r = bus_call_method( + bus, + bus_timedate, + "SetLocalRTC", + &error, + NULL, + "bbb", b, arg_adjust_system_clock, arg_ask_password); + if (r < 0) + return log_error_errno(r, "Failed to set local RTC: %s", bus_error_message(&error, r)); + + return 0; +} + +static int set_ntp(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + int b, r; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + b = parse_boolean(argv[1]); + if (b < 0) + return log_error_errno(b, "Failed to parse NTP setting '%s': %m", argv[1]); + + r = bus_message_new_method_call(bus, &m, bus_timedate, "SetNTP"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, 
"bb", b, arg_ask_password); + if (r < 0) + return bus_log_create_error(r); + + /* Reloading the daemon may take long, hence set a longer timeout here */ + r = sd_bus_call(bus, m, DAEMON_RELOAD_TIMEOUT_SEC, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to set ntp: %s", bus_error_message(&error, r)); + + return 0; +} + +static int list_timezones(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = userdata; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int r; + _cleanup_strv_free_ char **zones = NULL; + + r = bus_call_method(bus, bus_timedate, "ListTimezones", &error, &reply, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request list of time zones: %s", + bus_error_message(&error, r)); + + r = sd_bus_message_read_strv(reply, &zones); + if (r < 0) + return bus_log_parse_error(r); + + pager_open(arg_pager_flags); + strv_print(zones); + + return 0; +} + +typedef struct NTPStatusInfo { + const char *server_name; + char *server_address; + usec_t poll_interval, poll_max, poll_min; + usec_t root_distance_max; + + uint32_t leap, version, mode, stratum; + int32_t precision; + usec_t root_delay, root_dispersion; + union { + char str[5]; + uint32_t val; + } reference; + usec_t origin, recv, trans, dest; + + bool spike; + uint64_t packet_count; + usec_t jitter; + + int64_t freq; +} NTPStatusInfo; + +static void ntp_status_info_clear(NTPStatusInfo *p) { + p->server_address = mfree(p->server_address); +} + +static const char * const ntp_leap_table[4] = { + [0] = "normal", + [1] = "last minute of the day has 61 seconds", + [2] = "last minute of the day has 59 seconds", + [3] = "not synchronized", +}; + +DISABLE_WARNING_TYPE_LIMITS; +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(ntp_leap, uint32_t); +REENABLE_WARNING; + +static int print_ntp_status_info(NTPStatusInfo *i) { + usec_t delay, t14, t23, offset, root_distance; + _cleanup_(table_unrefp) Table *table 
= NULL; + bool offset_sign; + TableCell *cell; + int r; + + assert(i); + + table = table_new_vertical(); + if (!table) + return log_oom(); + + assert_se(cell = table_get_cell(table, 0, 0)); + (void) table_set_ellipsize_percent(table, cell, 100); + + assert_se(cell = table_get_cell(table, 0, 1)); + (void) table_set_ellipsize_percent(table, cell, 100); + + /* + * "Timestamp Name ID When Generated + * ------------------------------------------------------------ + * Originate Timestamp T1 time request sent by client + * Receive Timestamp T2 time request received by server + * Transmit Timestamp T3 time reply sent by server + * Destination Timestamp T4 time reply received by client + * + * The round-trip delay, d, and system clock offset, t, are defined as: + * d = (T4 - T1) - (T3 - T2) t = ((T2 - T1) + (T3 - T4)) / 2" + */ + + r = table_add_cell(table, NULL, TABLE_FIELD, "Server"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s (%s)", strna(i->server_address), strna(i->server_name)); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Poll interval"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s (min: %s; max %s)", + FORMAT_TIMESPAN(i->poll_interval, 0), + FORMAT_TIMESPAN(i->poll_min, 0), + FORMAT_TIMESPAN(i->poll_max, 0)); + if (r < 0) + return table_log_add_error(r); + + if (i->packet_count == 0) { + r = table_add_many(table, + TABLE_FIELD, "Packet count", + TABLE_STRING, "0"); + if (r < 0) + return table_log_add_error(r); + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; + } + + if (i->dest < i->origin || i->trans < i->recv || i->dest - i->origin < i->trans - i->recv) { + log_error("Invalid NTP response"); + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; + } + + delay = (i->dest - i->origin) - (i->trans - i->recv); + + t14 = i->origin + 
i->dest; + t23 = i->recv + i->trans; + offset_sign = t14 < t23; + offset = (offset_sign ? t23 - t14 : t14 - t23) / 2; + + root_distance = i->root_delay / 2 + i->root_dispersion; + + r = table_add_many(table, + TABLE_FIELD, "Leap", + TABLE_STRING, ntp_leap_to_string(i->leap), + TABLE_FIELD, "Version", + TABLE_UINT32, i->version, + TABLE_FIELD, "Stratum", + TABLE_UINT32, i->stratum, + TABLE_FIELD, "Reference"); + if (r < 0) + return table_log_add_error(r); + + if (i->stratum <= 1) + r = table_add_cell(table, NULL, TABLE_STRING, i->reference.str); + else + r = table_add_cell_stringf(table, NULL, "%" PRIX32, be32toh(i->reference.val)); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Precision"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s (%" PRIi32 ")", + FORMAT_TIMESPAN(DIV_ROUND_UP((nsec_t) (exp2(i->precision) * NSEC_PER_SEC), NSEC_PER_USEC), 0), + i->precision); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Root distance"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s (max: %s)", + FORMAT_TIMESPAN(root_distance, 0), + FORMAT_TIMESPAN(i->root_distance_max, 0)); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell(table, NULL, TABLE_FIELD, "Offset"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%s%s", + offset_sign ? 
"+" : "-", + FORMAT_TIMESPAN(offset, 0)); + if (r < 0) + return table_log_add_error(r); + + r = table_add_many(table, + TABLE_FIELD, "Delay", + TABLE_STRING, FORMAT_TIMESPAN(delay, 0), + TABLE_FIELD, "Jitter", + TABLE_STRING, FORMAT_TIMESPAN(i->jitter, 0), + TABLE_FIELD, "Packet count", + TABLE_UINT64, i->packet_count); + if (r < 0) + return table_log_add_error(r); + + if (!i->spike) { + r = table_add_cell(table, NULL, TABLE_FIELD, "Frequency"); + if (r < 0) + return table_log_add_error(r); + + r = table_add_cell_stringf(table, NULL, "%+.3fppm", (double) i->freq / 0x10000); + if (r < 0) + return table_log_add_error(r); + } + + r = table_print(table, NULL); + if (r < 0) + return table_log_print_error(r); + + return 0; +} + +static int map_server_address(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + char **p = (char **) userdata; + const void *d; + int family, r; + size_t sz; + + assert(p); + + r = sd_bus_message_enter_container(m, 'r', "iay"); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "i", &family); + if (r < 0) + return r; + + r = sd_bus_message_read_array(m, 'y', &d, &sz); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + if (sz == 0 && family == AF_UNSPEC) { + *p = mfree(*p); + return 0; + } + + if (!IN_SET(family, AF_INET, AF_INET6)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown address family %i", family); + + if (sz != FAMILY_ADDRESS_SIZE(family)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid address size"); + + r = in_addr_to_string(family, d, p); + if (r < 0) + return r; + + return 0; +} + +static int map_ntp_message(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) { + NTPStatusInfo *p = ASSERT_PTR(userdata); + const void *d; + size_t sz; + int32_t b; + int r; + + r = sd_bus_message_enter_container(m, 'r', "uuuuittayttttbtt"); + if (r < 0) + return r; + + r = sd_bus_message_read(m, 
"uuuuitt", + &p->leap, &p->version, &p->mode, &p->stratum, &p->precision, + &p->root_delay, &p->root_dispersion); + if (r < 0) + return r; + + r = sd_bus_message_read_array(m, 'y', &d, &sz); + if (r < 0) + return r; + + r = sd_bus_message_read(m, "ttttbtt", + &p->origin, &p->recv, &p->trans, &p->dest, + &b, &p->packet_count, &p->jitter); + if (r < 0) + return r; + + r = sd_bus_message_exit_container(m); + if (r < 0) + return r; + + if (sz != 4) + return -EINVAL; + + memcpy(p->reference.str, d, sz); + + p->spike = b; + + return 0; +} + +static int show_timesync_status_once(sd_bus *bus) { + static const struct bus_properties_map map_timesync[] = { + { "ServerName", "s", NULL, offsetof(NTPStatusInfo, server_name) }, + { "ServerAddress", "(iay)", map_server_address, offsetof(NTPStatusInfo, server_address) }, + { "PollIntervalUSec", "t", NULL, offsetof(NTPStatusInfo, poll_interval) }, + { "PollIntervalMinUSec", "t", NULL, offsetof(NTPStatusInfo, poll_min) }, + { "PollIntervalMaxUSec", "t", NULL, offsetof(NTPStatusInfo, poll_max) }, + { "RootDistanceMaxUSec", "t", NULL, offsetof(NTPStatusInfo, root_distance_max) }, + { "NTPMessage", "(uuuuittayttttbtt)", map_ntp_message, 0 }, + { "Frequency", "x", NULL, offsetof(NTPStatusInfo, freq) }, + {} + }; + _cleanup_(ntp_status_info_clear) NTPStatusInfo info = {}; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL; + int r; + + assert(bus); + + r = bus_map_all_properties(bus, + "org.freedesktop.timesync1", + "/org/freedesktop/timesync1", + map_timesync, + BUS_MAP_BOOLEAN_AS_BOOL, + &error, + &m, + &info); + if (r < 0) + return log_error_errno(r, "Failed to query server: %s", bus_error_message(&error, r)); + + if (arg_monitor && !terminal_is_dumb()) + fputs(ANSI_HOME_CLEAR, stdout); + + print_ntp_status_info(&info); + + return 0; +} + +static int on_properties_changed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + const char *name; + int r; 
+ + assert(m); + + r = sd_bus_message_read(m, "s", &name); + if (r < 0) + return bus_log_parse_error(r); + + if (!streq_ptr(name, "org.freedesktop.timesync1.Manager")) + return 0; + + return show_timesync_status_once(sd_bus_message_get_bus(m)); +} + +static int show_timesync_status(int argc, char **argv, void *userdata) { + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + r = show_timesync_status_once(bus); + if (r < 0) + return r; + + if (!arg_monitor) + return 0; + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get event loop: %m"); + + r = sd_bus_match_signal(bus, + NULL, + "org.freedesktop.timesync1", + "/org/freedesktop/timesync1", + "org.freedesktop.DBus.Properties", + "PropertiesChanged", + on_properties_changed, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request match for PropertiesChanged signal: %m"); + + r = sd_bus_attach_event(bus, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + return 0; +} + +static int print_timesync_property(const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags) { + char type; + const char *contents; + int r; + + assert(name); + assert(m); + + r = sd_bus_message_peek_type(m, &type, &contents); + if (r < 0) + return r; + + switch (type) { + + case SD_BUS_TYPE_STRUCT: + if (streq(name, "NTPMessage")) { + _cleanup_(ntp_status_info_clear) NTPStatusInfo i = {}; + + r = map_ntp_message(NULL, NULL, m, NULL, &i); + if (r < 0) + return r; + + if (i.packet_count == 0) + return 1; + + if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE)) { + fputs(name, stdout); + fputc('=', stdout); + } + + printf("{ Leap=%u, Version=%u, Mode=%u, Stratum=%u, Precision=%i,", + i.leap, i.version, i.mode, i.stratum, i.precision); + printf(" 
RootDelay=%s,", FORMAT_TIMESPAN(i.root_delay, 0)); + printf(" RootDispersion=%s,", FORMAT_TIMESPAN(i.root_dispersion, 0)); + + if (i.stratum == 1) + printf(" Reference=%s,", i.reference.str); + else + printf(" Reference=%" PRIX32 ",", be32toh(i.reference.val)); + + printf(" OriginateTimestamp=%s,", FORMAT_TIMESTAMP(i.origin)); + printf(" ReceiveTimestamp=%s,", FORMAT_TIMESTAMP(i.recv)); + printf(" TransmitTimestamp=%s,", FORMAT_TIMESTAMP(i.trans)); + printf(" DestinationTimestamp=%s,", FORMAT_TIMESTAMP(i.dest)); + printf(" Ignored=%s, PacketCount=%" PRIu64 ",", + yes_no(i.spike), i.packet_count); + printf(" Jitter=%s }\n", FORMAT_TIMESPAN(i.jitter, 0)); + + return 1; + + } else if (streq(name, "ServerAddress")) { + _cleanup_free_ char *str = NULL; + + r = map_server_address(NULL, NULL, m, NULL, &str); + if (r < 0) + return r; + + bus_print_property_value(name, expected_value, flags, str); + + return 1; + } + break; + } + + return 0; +} + +static int show_timesync(int argc, char **argv, void *userdata) { + sd_bus *bus = ASSERT_PTR(userdata); + int r; + + r = bus_print_all_properties(bus, + "org.freedesktop.timesync1", + "/org/freedesktop/timesync1", + print_timesync_property, + arg_property, + arg_print_flags, + NULL); + if (r < 0) + return bus_log_parse_error(r); + + return 0; +} + +static int parse_ifindex_bus(sd_bus *bus, const char *str) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + int32_t i; + int r; + + assert(bus); + assert(str); + + r = parse_ifindex(str); + if (r > 0) + return r; + assert(r < 0); + + r = bus_call_method(bus, bus_network_mgr, "GetLinkByName", &error, &reply, "s", str); + if (r < 0) + return log_error_errno(r, "Failed to get ifindex of interfaces %s: %s", str, bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "io", &i, NULL); + if (r < 0) + return bus_log_create_error(r); + + return i; +} + +static int verb_ntp_servers(int argc, char 
**argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *req = NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int ifindex, r; + + ifindex = parse_ifindex_bus(bus, argv[1]); + if (ifindex < 0) + return ifindex; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_message_new_method_call(bus, &req, bus_network_mgr, "SetLinkNTP"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(req, "i", ifindex); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(req, argv + 2); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, req, 0, &error, NULL); + if (r < 0) + return log_error_errno(r, "Failed to set NTP servers: %s", bus_error_message(&error, r)); + + return 0; +} + +static int verb_revert(int argc, char **argv, void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + sd_bus *bus = ASSERT_PTR(userdata); + int ifindex, r; + + ifindex = parse_ifindex_bus(bus, argv[1]); + if (ifindex < 0) + return ifindex; + + polkit_agent_open_if_enabled(arg_transport, arg_ask_password); + + r = bus_call_method(bus, bus_network_mgr, "RevertLinkNTP", &error, NULL, "i", ifindex); + if (r < 0) + return log_error_errno(r, "Failed to revert interface configuration: %s", bus_error_message(&error, r)); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("timedatectl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] 
COMMAND ...\n" + "\n%sQuery or change system time and date settings.%s\n" + "\nCommands:\n" + " status Show current time settings\n" + " show Show properties of systemd-timedated\n" + " set-time TIME Set system time\n" + " set-timezone ZONE Set system time zone\n" + " list-timezones Show known time zones\n" + " set-local-rtc BOOL Control whether RTC is in local time\n" + " set-ntp BOOL Enable or disable network time synchronization\n" + "\nsystemd-timesyncd Commands:\n" + " timesync-status Show status of systemd-timesyncd\n" + " show-timesync Show properties of systemd-timesyncd\n" + " ntp-servers INTERFACE SERVER…\n" + " Set the interface specific NTP servers\n" + " revert INTERFACE Revert the interface specific NTP servers\n" + "\nOptions:\n" + " -h --help Show this help message\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-ask-password Do not prompt for password\n" + " -H --host=[USER@]HOST Operate on remote host\n" + " -M --machine=CONTAINER Operate on local container\n" + " --adjust-system-clock Adjust system clock when changing local RTC mode\n" + " --monitor Monitor status of systemd-timesyncd\n" + " -p --property=NAME Show only properties by this name\n" + " -a --all Show all properties, including empty ones\n" + " --value When showing properties, only print the value\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int verb_help(int argc, char **argv, void *userdata) { + return help(); +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_ADJUST_SYSTEM_CLOCK, + ARG_NO_ASK_PASSWORD, + ARG_MONITOR, + ARG_VALUE, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "host", required_argument, NULL, 'H' }, + { "machine", 
required_argument, NULL, 'M' }, + { "no-ask-password", no_argument, NULL, ARG_NO_ASK_PASSWORD }, + { "adjust-system-clock", no_argument, NULL, ARG_ADJUST_SYSTEM_CLOCK }, + { "monitor", no_argument, NULL, ARG_MONITOR }, + { "property", required_argument, NULL, 'p' }, + { "all", no_argument, NULL, 'a' }, + { "value", no_argument, NULL, ARG_VALUE }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hH:M:p:a", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'H': + arg_transport = BUS_TRANSPORT_REMOTE; + arg_host = optarg; + break; + + case 'M': + arg_transport = BUS_TRANSPORT_MACHINE; + arg_host = optarg; + break; + + case ARG_NO_ASK_PASSWORD: + arg_ask_password = false; + break; + + case ARG_ADJUST_SYSTEM_CLOCK: + arg_adjust_system_clock = true; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_MONITOR: + arg_monitor = true; + break; + + case 'p': { + r = strv_extend(&arg_property, optarg); + if (r < 0) + return log_oom(); + + /* If the user asked for a particular + * property, show it to them, even if it is + * empty. 
*/ + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + break; + } + + case 'a': + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_SHOW_EMPTY, true); + break; + + case ARG_VALUE: + SET_FLAG(arg_print_flags, BUS_PRINT_PROPERTY_ONLY_VALUE, true); + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int timedatectl_main(sd_bus *bus, int argc, char *argv[]) { + static const Verb verbs[] = { + { "status", VERB_ANY, 1, VERB_DEFAULT, show_status }, + { "show", VERB_ANY, 1, 0, show_properties }, + { "set-time", 2, 2, 0, set_time }, + { "set-timezone", 2, 2, 0, set_timezone }, + { "list-timezones", VERB_ANY, 1, 0, list_timezones }, + { "set-local-rtc", 2, 2, 0, set_local_rtc }, + { "set-ntp", 2, 2, 0, set_ntp }, + { "timesync-status", VERB_ANY, 1, 0, show_timesync_status }, + { "show-timesync", VERB_ANY, 1, 0, show_timesync }, + { "ntp-servers", 3, VERB_ANY, 0, verb_ntp_servers }, + { "revert", 2, 2, 0, verb_revert }, + { "help", VERB_ANY, VERB_ANY, 0, verb_help }, /* Not documented, but supported since it is created. 
*/ + {} + }; + + return dispatch_verb(argc, argv, verbs, bus); +} + +static int run(int argc, char *argv[]) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + setlocale(LC_ALL, ""); + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = bus_connect_transport(arg_transport, arg_host, RUNTIME_SCOPE_SYSTEM, &bus); + if (r < 0) + return bus_log_connect_error(r, arg_transport); + + return timedatectl_main(bus, argc, argv); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/timedate/timedated.c b/src/timedate/timedated.c new file mode 100644 index 0000000..c7be30f --- /dev/null +++ b/src/timedate/timedated.c @@ -0,0 +1,1162 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-bus.h" +#include "sd-event.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "bus-common-errors.h" +#include "bus-error.h" +#include "bus-get-properties.h" +#include "bus-locator.h" +#include "bus-log-control-api.h" +#include "bus-map-properties.h" +#include "bus-polkit.h" +#include "bus-unit-util.h" +#include "clock-util.h" +#include "conf-files.h" +#include "constants.h" +#include "fd-util.h" +#include "fileio-label.h" +#include "fileio.h" +#include "fs-util.h" +#include "hashmap.h" +#include "list.h" +#include "main-func.h" +#include "memory-util.h" +#include "missing_capability.h" +#include "path-util.h" +#include "selinux-util.h" +#include "service-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "unit-def.h" +#include "unit-name.h" +#include "user-util.h" + +#define NULL_ADJTIME_UTC "0.0 0 0\n0\nUTC\n" +#define NULL_ADJTIME_LOCAL "0.0 0 0\n0\nLOCAL\n" + +#define UNIT_LIST_DIRS (const char* const*) CONF_PATHS_STRV("systemd/ntp-units.d") + +typedef struct UnitStatusInfo { + char *name; + char *load_state; + char *unit_file_state; + char *active_state; + char *path; + + LIST_FIELDS(struct UnitStatusInfo, units); +} 
UnitStatusInfo; + +typedef struct Context { + char *zone; + bool local_rtc; + Hashmap *polkit_registry; + sd_bus_message *cache; + + sd_bus_slot *slot_job_removed; + + LIST_HEAD(UnitStatusInfo, units); +} Context; + +#define log_unit_full_errno_zerook(unit, level, error, ...) \ + ({ \ + const UnitStatusInfo *_u = (unit); \ + _u ? log_object_internal(level, error, PROJECT_FILE, __LINE__, __func__, "UNIT=", _u->name, NULL, NULL, ##__VA_ARGS__) : \ + log_internal(level, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \ + }) + +#define log_unit_full_errno(unit, level, error, ...) \ + ({ \ + int _error = (error); \ + ASSERT_NON_ZERO(_error); \ + log_unit_full_errno_zerook(unit, level, _error, ##__VA_ARGS__); \ + }) + +#define log_unit_full(unit, level, ...) (void) log_unit_full_errno_zerook(unit, level, 0, ##__VA_ARGS__) + +#define log_unit_debug(unit, ...) log_unit_full(unit, LOG_DEBUG, ##__VA_ARGS__) +#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, ##__VA_ARGS__) +#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, ##__VA_ARGS__) +#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, ##__VA_ARGS__) +#define log_unit_error(unit, ...) log_unit_full(unit, LOG_ERR, ##__VA_ARGS__) + +#define log_unit_debug_errno(unit, error, ...) log_unit_full_errno(unit, LOG_DEBUG, error, ##__VA_ARGS__) +#define log_unit_info_errno(unit, error, ...) log_unit_full_errno(unit, LOG_INFO, error, ##__VA_ARGS__) +#define log_unit_notice_errno(unit, error, ...) log_unit_full_errno(unit, LOG_NOTICE, error, ##__VA_ARGS__) +#define log_unit_warning_errno(unit, error, ...) log_unit_full_errno(unit, LOG_WARNING, error, ##__VA_ARGS__) +#define log_unit_error_errno(unit, error, ...) 
log_unit_full_errno(unit, LOG_ERR, error, ##__VA_ARGS__) + +static void unit_status_info_clear(UnitStatusInfo *p) { + assert(p); + + p->load_state = mfree(p->load_state); + p->unit_file_state = mfree(p->unit_file_state); + p->active_state = mfree(p->active_state); +} + +static UnitStatusInfo *unit_status_info_free(UnitStatusInfo *p) { + if (!p) + return NULL; + + unit_status_info_clear(p); + free(p->name); + free(p->path); + return mfree(p); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(UnitStatusInfo*, unit_status_info_free); + +static void context_clear(Context *c) { + assert(c); + + free(c->zone); + bus_verify_polkit_async_registry_free(c->polkit_registry); + sd_bus_message_unref(c->cache); + + sd_bus_slot_unref(c->slot_job_removed); + + LIST_CLEAR(units, c->units, unit_status_info_free); +} + +static int context_add_ntp_service(Context *c, const char *s, const char *source) { + _cleanup_(unit_status_info_freep) UnitStatusInfo *unit = NULL; + + assert(c); + assert(s); + assert(source); + + if (!unit_name_is_valid(s, UNIT_NAME_PLAIN)) + return -EINVAL; + + /* Do not add this if it is already listed */ + LIST_FOREACH(units, u, c->units) + if (streq(u->name, s)) + return 0; + + unit = new0(UnitStatusInfo, 1); + if (!unit) + return -ENOMEM; + + unit->name = strdup(s); + if (!unit->name) + return -ENOMEM; + + LIST_APPEND(units, c->units, unit); + log_unit_debug(unit, "added from %s.", source); + TAKE_PTR(unit); + + return 0; +} + +static int context_parse_ntp_services_from_environment(Context *c) { + const char *env, *p; + int r; + + assert(c); + + env = getenv("SYSTEMD_TIMEDATED_NTP_SERVICES"); + if (!env) + return 0; + + log_debug("Using list of ntp services from environment variable $SYSTEMD_TIMEDATED_NTP_SERVICES=%s.", env); + + for (p = env;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ":", 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_error("Invalid syntax, ignoring: %s", env); + break; + } + + r = 
context_add_ntp_service(c, word, "$SYSTEMD_TIMEDATED_NTP_SERVICES"); + if (r < 0) + log_warning_errno(r, "Failed to add NTP service \"%s\", ignoring: %m", word); + } + + return 1; +} + +static int context_parse_ntp_services_from_disk(Context *c) { + _cleanup_strv_free_ char **files = NULL; + int r; + + r = conf_files_list_strv(&files, ".list", NULL, CONF_FILES_FILTER_MASKED, UNIT_LIST_DIRS); + if (r < 0) + return log_error_errno(r, "Failed to enumerate .list files: %m"); + + STRV_FOREACH(f, files) { + _cleanup_fclose_ FILE *file = NULL; + + log_debug("Reading file '%s'", *f); + + r = fopen_unlocked(*f, "re", &file); + if (r < 0) { + log_error_errno(r, "Failed to open %s, ignoring: %m", *f); + continue; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(file, LINE_MAX, &line); + if (r < 0) { + log_error_errno(r, "Failed to read %s, ignoring: %m", *f); + continue; + } + if (r == 0) + break; + + if (isempty(line) || startswith(line, "#")) + continue; + + r = context_add_ntp_service(c, line, *f); + if (r < 0) + log_warning_errno(r, "Failed to add NTP service \"%s\", ignoring: %m", line); + } + } + + return 1; +} + +static int context_parse_ntp_services(Context *c) { + int r; + + r = context_parse_ntp_services_from_environment(c); + if (r != 0) + return r; + + return context_parse_ntp_services_from_disk(c); +} + +static int context_ntp_service_is_active(Context *c) { + int count = 0; + + assert(c); + + /* Call context_update_ntp_status() to update UnitStatusInfo before calling this. */ + + LIST_FOREACH(units, info, c->units) + count += !STRPTR_IN_SET(info->active_state, "inactive", "failed"); + + return count; +} + +static int context_ntp_service_exists(Context *c) { + int count = 0; + + assert(c); + + /* Call context_update_ntp_status() to update UnitStatusInfo before calling this. 
*/ + + LIST_FOREACH(units, info, c->units) + count += streq_ptr(info->load_state, "loaded"); + + return count; +} + +static int context_read_data(Context *c) { + _cleanup_free_ char *t = NULL; + int r; + + assert(c); + + r = get_timezone(&t); + if (r == -EINVAL) + log_warning_errno(r, "/etc/localtime should be a symbolic link to a time zone data file in /usr/share/zoneinfo/."); + else if (r < 0) + log_warning_errno(r, "Failed to get target of /etc/localtime: %m"); + + free_and_replace(c->zone, t); + + c->local_rtc = clock_is_localtime(NULL) > 0; + + return 0; +} + +static int context_write_data_timezone(Context *c) { + _cleanup_free_ char *p = NULL; + const char *source; + + assert(c); + + /* No timezone is very similar to UTC. Hence in either of these cases link the UTC file in. Except if + * it isn't installed, in which case we remove the symlink altogether. Since glibc defaults to an + * internal version of UTC in that case behaviour is mostly equivalent. We still prefer creating the + * symlink though, since things are more self explanatory then. 
*/ + + if (isempty(c->zone) || streq(c->zone, "UTC")) { + + if (access("/usr/share/zoneinfo/UTC", F_OK) < 0) { + + if (unlink("/etc/localtime") < 0 && errno != ENOENT) + return -errno; + + return 0; + } + + source = "../usr/share/zoneinfo/UTC"; + } else { + p = path_join("../usr/share/zoneinfo", c->zone); + if (!p) + return -ENOMEM; + + source = p; + } + + return symlink_atomic(source, "/etc/localtime"); +} + +static int context_write_data_local_rtc(Context *c) { + _cleanup_free_ char *s = NULL, *w = NULL; + int r; + + assert(c); + + r = read_full_file("/etc/adjtime", &s, NULL); + if (r < 0) { + if (r != -ENOENT) + return r; + + if (!c->local_rtc) + return 0; + + w = strdup(NULL_ADJTIME_LOCAL); + if (!w) + return -ENOMEM; + } else { + char *p; + const char *e = "\n"; /* default if there is less than 3 lines */ + const char *prepend = ""; + size_t a, b; + + p = strchrnul(s, '\n'); + if (*p == '\0') + /* only one line, no \n terminator */ + prepend = "\n0\n"; + else if (p[1] == '\0') { + /* only one line, with \n terminator */ + ++p; + prepend = "0\n"; + } else { + p = strchr(p+1, '\n'); + if (!p) { + /* only two lines, no \n terminator */ + prepend = "\n"; + p = s + strlen(s); + } else { + char *end; + /* third line might have a \n terminator or not */ + p++; + end = strchr(p, '\n'); + /* if we actually have a fourth line, use that as suffix "e", otherwise the default \n */ + if (end) + e = end; + } + } + + a = p - s; + b = strlen(e); + + w = new(char, a + (c->local_rtc ? 5 : 3) + strlen(prepend) + b + 1); + if (!w) + return -ENOMEM; + + *(char*) mempcpy(stpcpy(stpcpy(mempcpy(w, s, a), prepend), c->local_rtc ? 
"LOCAL" : "UTC"), e, b) = 0; + + if (streq(w, NULL_ADJTIME_UTC)) { + if (unlink("/etc/adjtime") < 0) + if (errno != ENOENT) + return -errno; + + return 0; + } + } + + r = mac_init(); + if (r < 0) + return r; + + return write_string_file_atomic_label("/etc/adjtime", w); +} + +static int context_update_ntp_status(Context *c, sd_bus *bus, sd_bus_message *m) { + static const struct bus_properties_map map[] = { + { "LoadState", "s", NULL, offsetof(UnitStatusInfo, load_state) }, + { "ActiveState", "s", NULL, offsetof(UnitStatusInfo, active_state) }, + { "UnitFileState", "s", NULL, offsetof(UnitStatusInfo, unit_file_state) }, + {} + }; + int r; + + assert(c); + assert(bus); + + /* Suppress calling context_update_ntp_status() multiple times within single DBus transaction. */ + if (m) { + if (m == c->cache) + return 0; + + sd_bus_message_unref(c->cache); + c->cache = sd_bus_message_ref(m); + } + + LIST_FOREACH(units, u, c->units) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *path = NULL; + + unit_status_info_clear(u); + + path = unit_dbus_path_from_name(u->name); + if (!path) + return -ENOMEM; + + r = bus_map_all_properties( + bus, + "org.freedesktop.systemd1", + path, + map, + BUS_MAP_STRDUP, + &error, + NULL, + u); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to get properties: %s", bus_error_message(&error, r)); + } + + return 0; +} + +static int match_job_removed(sd_bus_message *m, void *userdata, sd_bus_error *error) { + Context *c = ASSERT_PTR(userdata); + const char *path; + unsigned n = 0; + int r; + + assert(m); + + r = sd_bus_message_read(m, "uoss", NULL, &path, NULL, NULL); + if (r < 0) { + bus_log_parse_error(r); + return 0; + } + + LIST_FOREACH(units, u, c->units) + if (streq_ptr(path, u->path)) + u->path = mfree(u->path); + else + n += !!u->path; + + if (n == 0) { + c->slot_job_removed = sd_bus_slot_unref(c->slot_job_removed); + + (void) sd_bus_emit_properties_changed(sd_bus_message_get_bus(m), + 
"/org/freedesktop/timedate1", "org.freedesktop.timedate1", "NTP", + NULL); + } + + return 0; +} + +static int unit_start_or_stop(UnitStatusInfo *u, sd_bus *bus, sd_bus_error *error, bool start) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + const char *path; + int r; + + assert(u); + assert(bus); + assert(error); + + r = bus_call_method( + bus, + bus_systemd_mgr, + start ? "StartUnit" : "StopUnit", + error, + &reply, + "ss", + u->name, + "replace"); + log_unit_full_errno_zerook(u, r < 0 ? LOG_WARNING : LOG_DEBUG, r, + "%s unit: %m", start ? "Starting" : "Stopping"); + if (r < 0) + return r; + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + r = free_and_strdup(&u->path, path); + if (r < 0) + return log_oom(); + + return 0; +} + +static int unit_enable_or_disable(UnitStatusInfo *u, sd_bus *bus, sd_bus_error *error, bool enable) { + int r; + + assert(u); + assert(bus); + assert(error); + + /* Call context_update_ntp_status() to update UnitStatusInfo before calling this. */ + + if (streq(u->unit_file_state, "enabled") == enable) { + log_unit_debug(u, "already %sd.", enable_disable(enable)); + return 0; + } + + log_unit_info(u, "%s unit.", enable ? "Enabling" : "Disabling"); + + if (enable) + r = bus_call_method( + bus, + bus_systemd_mgr, + "EnableUnitFiles", + error, + NULL, + "asbb", 1, + u->name, + false, true); + else + r = bus_call_method( + bus, + bus_systemd_mgr, + "DisableUnitFiles", + error, + NULL, + "asb", 1, + u->name, + false); + if (r < 0) + return r; + + r = bus_service_manager_reload(bus); + if (r < 0) + return r; + + return 0; +} + +static bool ntp_synced(void) { + struct timex txc = {}; + + if (adjtimex(&txc) < 0) + return false; + + /* Consider the system clock synchronized if the reported maximum error is smaller than the maximum + * value (16 seconds). Ignore the STA_UNSYNC flag as it may have been set to prevent the kernel from + * touching the RTC. 
*/ + return txc.maxerror < 16000000; +} + +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_time, "t", now(CLOCK_REALTIME)); +static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_ntp_sync, "b", ntp_synced()); + +static int property_get_rtc_time( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + struct tm tm = {}; + usec_t t = 0; + int r; + + r = clock_get_hwclock(&tm); + if (r == -EBUSY) + log_warning("/dev/rtc is busy. Is somebody keeping it open continuously? That's not a good idea... Returning a bogus RTC timestamp."); + else if (r == -ENOENT) + log_debug("/dev/rtc not found."); + else if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to read RTC: %m"); + else + t = (usec_t) timegm(&tm) * USEC_PER_SEC; + + return sd_bus_message_append(reply, "t", t); +} + +static int property_get_can_ntp( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Context *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + assert(error); + + if (c->slot_job_removed) + /* When the previous request is not finished, then assume NTP is enabled. */ + return sd_bus_message_append(reply, "b", true); + + r = context_update_ntp_status(c, bus, reply); + if (r < 0) + return r; + + return sd_bus_message_append(reply, "b", context_ntp_service_exists(c) > 0); +} + +static int property_get_ntp( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Context *c = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(property); + assert(reply); + assert(error); + + if (c->slot_job_removed) + /* When the previous request is not finished, then assume NTP is active. 
*/ + return sd_bus_message_append(reply, "b", true); + + r = context_update_ntp_status(c, bus, reply); + if (r < 0) + return r; + + return sd_bus_message_append(reply, "b", context_ntp_service_is_active(c) > 0); +} + +static int method_set_timezone(sd_bus_message *m, void *userdata, sd_bus_error *error) { + Context *c = ASSERT_PTR(userdata); + int interactive, r; + const char *z; + + assert(m); + + r = sd_bus_message_read(m, "sb", &z, &interactive); + if (r < 0) + return r; + + if (!timezone_is_valid(z, LOG_DEBUG)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid or not installed time zone '%s'", z); + + if (streq_ptr(z, c->zone)) + return sd_bus_reply_method_return(m, NULL); + + r = bus_verify_polkit_async( + m, + CAP_SYS_TIME, + "org.freedesktop.timedate1.set-timezone", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = free_and_strdup(&c->zone, z); + if (r < 0) + return r; + + /* 1. Write new configuration file */ + r = context_write_data_timezone(c); + if (r < 0) { + log_error_errno(r, "Failed to set time zone: %m"); + return sd_bus_error_set_errnof(error, r, "Failed to set time zone: %m"); + } + + /* 2. Make glibc notice the new timezone */ + tzset(); + + /* 3. Tell the kernel our timezone */ + r = clock_set_timezone(NULL); + if (r < 0) + log_debug_errno(r, "Failed to tell kernel about timezone, ignoring: %m"); + + if (c->local_rtc) { + struct timespec ts; + struct tm tm; + + /* 4. 
Sync RTC from system clock, with the new delta */ + assert_se(clock_gettime(CLOCK_REALTIME, &ts) == 0); + assert_se(localtime_r(&ts.tv_sec, &tm)); + + r = clock_set_hwclock(&tm); + if (r < 0) + log_debug_errno(r, "Failed to sync time to hardware clock, ignoring: %m"); + } + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_TIMEZONE_CHANGE_STR, + "TIMEZONE=%s", c->zone, + "TIMEZONE_SHORTNAME=%s", tzname[daylight], + "DAYLIGHT=%i", daylight, + LOG_MESSAGE("Changed time zone to '%s' (%s).", c->zone, tzname[daylight])); + + (void) sd_bus_emit_properties_changed(sd_bus_message_get_bus(m), + "/org/freedesktop/timedate1", "org.freedesktop.timedate1", "Timezone", + NULL); + + return sd_bus_reply_method_return(m, NULL); +} + +static int method_set_local_rtc(sd_bus_message *m, void *userdata, sd_bus_error *error) { + int lrtc, fix_system, interactive; + Context *c = ASSERT_PTR(userdata); + struct timespec ts; + int r; + + assert(m); + + r = sd_bus_message_read(m, "bbb", &lrtc, &fix_system, &interactive); + if (r < 0) + return r; + + if (lrtc == c->local_rtc && !fix_system) + return sd_bus_reply_method_return(m, NULL); + + r = bus_verify_polkit_async( + m, + CAP_SYS_TIME, + "org.freedesktop.timedate1.set-local-rtc", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; + + if (lrtc != c->local_rtc) { + c->local_rtc = lrtc; + + /* 1. Write new configuration file */ + r = context_write_data_local_rtc(c); + if (r < 0) { + log_error_errno(r, "Failed to set RTC to %s: %m", lrtc ? "local" : "UTC"); + return sd_bus_error_set_errnof(error, r, "Failed to set RTC to %s: %m", lrtc ? "local" : "UTC"); + } + } + + /* 2. Tell the kernel our timezone */ + r = clock_set_timezone(NULL); + if (r < 0) + log_debug_errno(r, "Failed to tell kernel about timezone, ignoring: %m"); + + /* 3. 
Synchronize clocks */ + assert_se(clock_gettime(CLOCK_REALTIME, &ts) == 0); + + if (fix_system) { + struct tm tm; + + /* Sync system clock from RTC; first, initialize the timezone fields of struct tm. */ + localtime_or_gmtime_r(&ts.tv_sec, &tm, !c->local_rtc); + + /* Override the main fields of struct tm, but not the timezone fields */ + r = clock_get_hwclock(&tm); + if (r < 0) + log_debug_errno(r, "Failed to get hardware clock, ignoring: %m"); + else { + /* And set the system clock with this */ + ts.tv_sec = mktime_or_timegm(&tm, !c->local_rtc); + + if (clock_settime(CLOCK_REALTIME, &ts) < 0) + log_debug_errno(errno, "Failed to update system clock, ignoring: %m"); + } + + } else { + struct tm tm; + + /* Sync RTC from system clock */ + localtime_or_gmtime_r(&ts.tv_sec, &tm, !c->local_rtc); + + r = clock_set_hwclock(&tm); + if (r < 0) + log_debug_errno(r, "Failed to sync time to hardware clock, ignoring: %m"); + } + + log_info("RTC configured to %s time.", c->local_rtc ? "local" : "UTC"); + + (void) sd_bus_emit_properties_changed(sd_bus_message_get_bus(m), + "/org/freedesktop/timedate1", "org.freedesktop.timedate1", "LocalRTC", + NULL); + + return sd_bus_reply_method_return(m, NULL); +} + +static int method_set_time(sd_bus_message *m, void *userdata, sd_bus_error *error) { + sd_bus *bus = sd_bus_message_get_bus(m); + char buf[FORMAT_TIMESTAMP_MAX]; + int relative, interactive, r; + Context *c = ASSERT_PTR(userdata); + int64_t utc; + struct timespec ts; + usec_t start; + struct tm tm; + + assert(m); + + if (c->slot_job_removed) + return sd_bus_error_set(error, BUS_ERROR_AUTOMATIC_TIME_SYNC_ENABLED, "Previous request is not finished, refusing."); + + r = context_update_ntp_status(c, bus, m); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to update context: %m"); + + if (context_ntp_service_is_active(c) > 0) + return sd_bus_error_set(error, BUS_ERROR_AUTOMATIC_TIME_SYNC_ENABLED, "Automatic time synchronization is enabled"); + + /* this only gets used 
if dbus does not provide a timestamp */ + start = now(CLOCK_MONOTONIC); + + r = sd_bus_message_read(m, "xbb", &utc, &relative, &interactive); + if (r < 0) + return r; + + if (!relative && utc <= 0) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid absolute time"); + + if (relative && utc == 0) + return sd_bus_reply_method_return(m, NULL); + + if (relative) { + usec_t n, x; + + n = now(CLOCK_REALTIME); + x = n + utc; + + if ((utc > 0 && x < n) || + (utc < 0 && x > n)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Time value overflow"); + + timespec_store(&ts, x); + } else + timespec_store(&ts, (usec_t) utc); + + r = bus_verify_polkit_async( + m, + CAP_SYS_TIME, + "org.freedesktop.timedate1.set-time", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; + + /* adjust ts for time spent in program */ + r = sd_bus_message_get_monotonic_usec(m, &start); + /* when sd_bus_message_get_monotonic_usec() returns -ENODATA it does not modify &start */ + if (r < 0 && r != -ENODATA) + return r; + + timespec_store(&ts, timespec_load(&ts) + (now(CLOCK_MONOTONIC) - start)); + + /* Set system clock */ + if (clock_settime(CLOCK_REALTIME, &ts) < 0) { + log_error_errno(errno, "Failed to set local time: %m"); + return sd_bus_error_set_errnof(error, errno, "Failed to set local time: %m"); + } + + /* Sync down to RTC */ + localtime_or_gmtime_r(&ts.tv_sec, &tm, !c->local_rtc); + + r = clock_set_hwclock(&tm); + if (r < 0) + log_debug_errno(r, "Failed to update hardware clock, ignoring: %m"); + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR, + "REALTIME="USEC_FMT, timespec_load(&ts), + LOG_MESSAGE("Changed local time to %s", strnull(format_timestamp(buf, sizeof(buf), timespec_load(&ts))))); + + return sd_bus_reply_method_return(m, NULL); +} + +static int method_set_ntp(sd_bus_message *m, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_slot_unrefp) sd_bus_slot 
*slot = NULL; + sd_bus *bus = sd_bus_message_get_bus(m); + Context *c = ASSERT_PTR(userdata); + const UnitStatusInfo *selected = NULL; + int enable, interactive, q, r; + + assert(m); + assert(bus); + + r = sd_bus_message_read(m, "bb", &enable, &interactive); + if (r < 0) + return r; + + r = context_update_ntp_status(c, bus, m); + if (r < 0) + return r; + + if (context_ntp_service_exists(c) <= 0) + return sd_bus_error_set(error, BUS_ERROR_NO_NTP_SUPPORT, "NTP not supported"); + + r = bus_verify_polkit_async( + m, + CAP_SYS_TIME, + "org.freedesktop.timedate1.set-ntp", + NULL, + interactive, + UID_INVALID, + &c->polkit_registry, + error); + if (r < 0) + return r; + if (r == 0) + return 1; + + /* This method may be called frequently. Forget the previous job if it has not completed yet. */ + LIST_FOREACH(units, u, c->units) + u->path = mfree(u->path); + + if (!c->slot_job_removed) { + r = bus_match_signal_async( + bus, + &slot, + bus_systemd_mgr, + "JobRemoved", + match_job_removed, NULL, c); + if (r < 0) + return r; + } + + if (enable) + LIST_FOREACH(units, u, c->units) { + bool enable_this_one = !selected; + + if (!streq(u->load_state, "loaded")) + continue; + + r = unit_enable_or_disable(u, bus, error, enable_this_one); + if (r < 0) + /* If enablement failed, don't start this unit. */ + enable_this_one = false; + + r = unit_start_or_stop(u, bus, error, enable_this_one); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to %s %sd NTP unit, ignoring: %m", + enable_this_one ? 
"start" : "stop", + enable_disable(enable_this_one)); + if (enable_this_one) + selected = u; + } + else + LIST_FOREACH(units, u, c->units) { + if (!streq(u->load_state, "loaded")) + continue; + + q = unit_enable_or_disable(u, bus, error, false); + if (q < 0) + r = q; + + q = unit_start_or_stop(u, bus, error, false); + if (q < 0) + r = q; + } + + if (r < 0) + return r; + if (enable && !selected) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "No NTP service found to enable."); + + if (slot) + c->slot_job_removed = TAKE_PTR(slot); + + if (selected) + log_info("Set NTP to enabled (%s).", selected->name); + else + log_info("Set NTP to disabled."); + + return sd_bus_reply_method_return(m, NULL); +} + +static int method_list_timezones(sd_bus_message *m, void *userdata, sd_bus_error *error) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_strv_free_ char **zones = NULL; + int r; + + assert(m); + + r = get_timezones(&zones); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to read list of time zones: %m"); + + r = sd_bus_message_new_method_return(m, &reply); + if (r < 0) + return r; + + r = sd_bus_message_append_strv(reply, zones); + if (r < 0) + return r; + + return sd_bus_send(NULL, reply, NULL); +} + +static const sd_bus_vtable timedate_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("Timezone", "s", NULL, offsetof(Context, zone), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("LocalRTC", "b", bus_property_get_bool, offsetof(Context, local_rtc), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("CanNTP", "b", property_get_can_ntp, 0, 0), + SD_BUS_PROPERTY("NTP", "b", property_get_ntp, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("NTPSynchronized", "b", property_get_ntp_sync, 0, 0), + SD_BUS_PROPERTY("TimeUSec", "t", property_get_time, 0, 0), + SD_BUS_PROPERTY("RTCTimeUSec", "t", property_get_rtc_time, 0, 0), + + SD_BUS_METHOD_WITH_ARGS("SetTime", + SD_BUS_ARGS("x", usec_utc, "b", 
relative, "b", interactive), + SD_BUS_NO_RESULT, + method_set_time, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetTimezone", + SD_BUS_ARGS("s", timezone, "b", interactive), + SD_BUS_NO_RESULT, + method_set_timezone, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetLocalRTC", + SD_BUS_ARGS("b", local_rtc, "b", fix_system, "b", interactive), + SD_BUS_NO_RESULT, + method_set_local_rtc, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("SetNTP", + SD_BUS_ARGS("b", use_ntp, "b", interactive), + SD_BUS_NO_RESULT, + method_set_ntp, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("ListTimezones", + SD_BUS_NO_ARGS, + SD_BUS_RESULT("as", timezones), + method_list_timezones, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END, +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/timedate1", + "org.freedesktop.timedate1", + .vtables = BUS_VTABLES(timedate_vtable), +}; + +static int connect_bus(Context *c, sd_event *event, sd_bus **_bus) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + assert(c); + assert(event); + assert(_bus); + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to get system bus connection: %m"); + + r = bus_add_implementation(bus, &manager_object, c); + if (r < 0) + return r; + + r = bus_log_control_api_register(bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.timedate1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + *_bus = TAKE_PTR(bus); + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(context_clear) Context context = {}; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int r; + + log_setup(); + + r = 
service_parse_argv("systemd-timedated.service", + "Manage the system clock and timezone and NTP enablement.", + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object), + argc, argv); + if (r <= 0) + return r; + + umask(0022); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + (void) sd_event_set_watchdog(event, true); + + r = sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to install SIGINT handler: %m"); + + r = sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to install SIGTERM handler: %m"); + + r = connect_bus(&context, event, &bus); + if (r < 0) + return r; + + (void) sd_bus_negotiate_timestamp(bus, true); + + r = context_read_data(&context); + if (r < 0) + return log_error_errno(r, "Failed to read time zone data: %m"); + + r = context_parse_ntp_services(&context); + if (r < 0) + return r; + + r = bus_event_loop_with_idle(event, bus, "org.freedesktop.timedate1", DEFAULT_EXIT_USEC, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/timesync/80-systemd-timesync.list b/src/timesync/80-systemd-timesync.list new file mode 100644 index 0000000..95e15f7 --- /dev/null +++ b/src/timesync/80-systemd-timesync.list @@ -0,0 +1,4 @@ +# This file is part of systemd. +# See systemd-timedated.service(8) for more information. 
+ +systemd-timesyncd.service diff --git a/src/timesync/meson.build b/src/timesync/meson.build new file mode 100644 index 0000000..6844480 --- /dev/null +++ b/src/timesync/meson.build @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +sources = files( + 'timesyncd-conf.c', + 'timesyncd-manager.c', + 'timesyncd-server.c', +) + +systemd_timesyncd_sources = files( + 'timesyncd.c', + 'timesyncd-bus.c', +) + +sources += custom_target( + 'timesyncd-gperf.c', + input : 'timesyncd-gperf.gperf', + output : 'timesyncd-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +if get_option('link-timesyncd-shared') + timesyncd_link_with = [libshared] +else + timesyncd_link_with = [libsystemd_static, + libshared_static, + libbasic_gcrypt] +endif + +libtimesyncd_core = static_library( + 'timesyncd-core', + sources, + include_directories : includes, + dependencies : userspace, + link_with : timesyncd_link_with, + build_by_default : false) + +executables += [ + libexec_template + { + 'name' : 'systemd-timesyncd', + 'conditions' : ['ENABLE_TIMESYNCD'], + 'sources' : systemd_timesyncd_sources, + 'link_with' : libtimesyncd_core, + 'dependencies' : [ + libm, + threads, + ], + }, + libexec_template + { + 'name' : 'systemd-time-wait-sync', + 'conditions' : ['ENABLE_TIMESYNCD'], + 'sources' : files('wait-sync.c'), + 'link_with' : libtimesyncd_core, + }, + test_template + { + 'sources' : files('test-timesync.c'), + 'link_with' : [ + libshared, + libtimesyncd_core, + ], + 'dependencies' : libm, + }, +] + +custom_target( + 'timesyncd.conf', + input : 'timesyncd.conf.in', + output : 'timesyncd.conf', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : conf.get('ENABLE_TIMESYNCD') == 1 and install_sysconfdir_samples, + install_dir : pkgconfigfiledir) + +if conf.get('ENABLE_TIMESYNCD') == 1 + install_data('org.freedesktop.timesync1.conf', + install_dir : dbuspolicydir) + install_data('org.freedesktop.timesync1.service', + install_dir : 
dbussystemservicedir) + install_data('80-systemd-timesync.list', + install_dir : ntpservicelistdir) + install_data('org.freedesktop.timesync1.policy', + install_dir : polkitpolicydir) +endif diff --git a/src/timesync/org.freedesktop.timesync1.conf b/src/timesync/org.freedesktop.timesync1.conf new file mode 100644 index 0000000..d33b864 --- /dev/null +++ b/src/timesync/org.freedesktop.timesync1.conf @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/timesync/org.freedesktop.timesync1.policy b/src/timesync/org.freedesktop.timesync1.policy new file mode 100644 index 0000000..e73965c --- /dev/null +++ b/src/timesync/org.freedesktop.timesync1.policy @@ -0,0 +1,32 @@ + + + + + + + + The systemd Project + https://systemd.io + + + Set runtime NTP servers + Authentication is required to set runtime NTP servers. + + auth_admin + auth_admin + auth_admin_keep + + unix-user:systemd-timesync + + + diff --git a/src/timesync/org.freedesktop.timesync1.service b/src/timesync/org.freedesktop.timesync1.service new file mode 100644 index 0000000..98878d6 --- /dev/null +++ b/src/timesync/org.freedesktop.timesync1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.timesync1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.timesync1.service diff --git a/src/timesync/test-timesync.c b/src/timesync/test-timesync.c new file mode 100644 index 0000000..7993e4c --- /dev/null +++ b/src/timesync/test-timesync.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* Some unit tests for the helper functions in timesyncd. 
*/ + +#include "log.h" +#include "macro.h" +#include "timesyncd-conf.h" +#include "tests.h" + +TEST(manager_parse_string) { + /* Make sure that NTP_SERVERS is configured to something + * that we can actually parse successfully. */ + + _cleanup_(manager_freep) Manager *m = NULL; + + assert_se(manager_new(&m) == 0); + + assert_se(!m->have_fallbacks); + assert_se(manager_parse_server_string(m, SERVER_FALLBACK, NTP_SERVERS) == 0); + assert_se(m->have_fallbacks); + assert_se(manager_parse_fallback_string(m, NTP_SERVERS) == 0); + + assert_se(manager_parse_server_string(m, SERVER_SYSTEM, "time1.foobar.com time2.foobar.com axrfav.,avf..ra 12345..123") == 0); + assert_se(manager_parse_server_string(m, SERVER_FALLBACK, "time1.foobar.com time2.foobar.com axrfav.,avf..ra 12345..123") == 0); + assert_se(manager_parse_server_string(m, SERVER_LINK, "time1.foobar.com time2.foobar.com axrfav.,avf..ra 12345..123") == 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/timesync/timesyncd-bus.c b/src/timesync/timesyncd-bus.c new file mode 100644 index 0000000..7237080 --- /dev/null +++ b/src/timesync/timesyncd-bus.c @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-get-properties.h" +#include "bus-internal.h" +#include "bus-log-control-api.h" +#include "bus-polkit.h" +#include "bus-protocol.h" +#include "bus-util.h" +#include "dns-domain.h" +#include "in-addr-util.h" +#include "log.h" +#include "macro.h" +#include "strv.h" +#include "time-util.h" +#include "timesyncd-bus.h" +#include "user-util.h" + +static int property_get_servers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ServerName **s = ASSERT_PTR(userdata); + int r; + + assert(bus); + assert(reply); + + r = sd_bus_message_open_container(reply, 'a', "s"); + if (r < 0) + return r; + + LIST_FOREACH(names, p, *s) { + r = 
sd_bus_message_append(reply, "s", p->string); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + +static int method_set_runtime_servers(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_strv_free_ char **msg_names = NULL; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = sd_bus_message_read_strv(message, &msg_names); + if (r < 0) + return r; + + STRV_FOREACH(name, msg_names) { + r = dns_name_is_valid_or_address(*name); + if (r < 0) + return log_error_errno(r, "Failed to check validity of NTP server name or address '%s': %m", *name); + if (r == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NTP server name or address, refusing: %s", *name); + } + + r = bus_verify_polkit_async(message, CAP_NET_ADMIN, + "org.freedesktop.timesync1.set-runtime-servers", + NULL, true, UID_INVALID, + &m->polkit_registry, error); + if (r < 0) + return r; + if (r == 0) + /* Polkit will call us back */ + return 1; + + manager_flush_runtime_servers(m); + + STRV_FOREACH(name, msg_names) { + r = server_name_new(m, NULL, SERVER_RUNTIME, *name); + if (r < 0) { + manager_flush_runtime_servers(m); + + return log_error_errno(r, "Failed to add runtime server '%s': %m", *name); + } + } + + m->exhausted_servers = true; + manager_set_server_name(m, NULL); + (void) manager_connect(m); + + return sd_bus_reply_method_return(message, NULL); +} + +static int property_get_current_server_name( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ServerName **s = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + return sd_bus_message_append(reply, "s", *s ? 
(*s)->string : NULL); +} + +static int property_get_current_server_address( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ServerAddress *a; + int r; + + assert(bus); + assert(reply); + assert(userdata); + + a = *(ServerAddress **) userdata; + + if (!a) + return sd_bus_message_append(reply, "(iay)", AF_UNSPEC, 0); + + assert(IN_SET(a->sockaddr.sa.sa_family, AF_INET, AF_INET6)); + + r = sd_bus_message_open_container(reply, 'r', "iay"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", a->sockaddr.sa.sa_family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', + a->sockaddr.sa.sa_family == AF_INET ? (void*) &a->sockaddr.in.sin_addr : (void*) &a->sockaddr.in6.sin6_addr, + FAMILY_ADDRESS_SIZE(a->sockaddr.sa.sa_family)); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static usec_t ntp_ts_short_to_usec(const struct ntp_ts_short *ts) { + return be16toh(ts->sec) * USEC_PER_SEC + (be16toh(ts->frac) * USEC_PER_SEC) / (usec_t) 0x10000ULL; +} + +static usec_t ntp_ts_to_usec(const struct ntp_ts *ts) { + return (be32toh(ts->sec) - OFFSET_1900_1970) * USEC_PER_SEC + (be32toh(ts->frac) * USEC_PER_SEC) / (usec_t) 0x100000000ULL; +} + +static int property_get_ntp_message( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(reply); + + r = sd_bus_message_open_container(reply, 'r', "uuuuittayttttbtt"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "uuuuitt", + NTP_FIELD_LEAP(m->ntpmsg.field), + NTP_FIELD_VERSION(m->ntpmsg.field), + NTP_FIELD_MODE(m->ntpmsg.field), + m->ntpmsg.stratum, + m->ntpmsg.precision, + ntp_ts_short_to_usec(&m->ntpmsg.root_delay), + ntp_ts_short_to_usec(&m->ntpmsg.root_dispersion)); + if (r < 0) + return r; + 
+ r = sd_bus_message_append_array(reply, 'y', m->ntpmsg.refid, 4); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "ttttbtt", + timespec_load(&m->origin_time), + ntp_ts_to_usec(&m->ntpmsg.recv_time), + ntp_ts_to_usec(&m->ntpmsg.trans_time), + timespec_load(&m->dest_time), + m->spike, + m->packet_count, + (usec_t) (m->samples_jitter * USEC_PER_SEC)); + if (r < 0) + return r; + + return sd_bus_message_close_container(reply); +} + +static const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + + SD_BUS_PROPERTY("LinkNTPServers", "as", property_get_servers, offsetof(Manager, link_servers), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("SystemNTPServers", "as", property_get_servers, offsetof(Manager, system_servers), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("RuntimeNTPServers", "as", property_get_servers, offsetof(Manager, runtime_servers), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("FallbackNTPServers", "as", property_get_servers, offsetof(Manager, fallback_servers), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("ServerName", "s", property_get_current_server_name, offsetof(Manager, current_server_name), 0), + SD_BUS_PROPERTY("ServerAddress", "(iay)", property_get_current_server_address, offsetof(Manager, current_server_address), 0), + SD_BUS_PROPERTY("RootDistanceMaxUSec", "t", bus_property_get_usec, offsetof(Manager, root_distance_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PollIntervalMinUSec", "t", bus_property_get_usec, offsetof(Manager, poll_interval_min_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PollIntervalMaxUSec", "t", bus_property_get_usec, offsetof(Manager, poll_interval_max_usec), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PollIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, poll_interval_usec), 0), + SD_BUS_PROPERTY("NTPMessage", "(uuuuittayttttbtt)", property_get_ntp_message, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + 
SD_BUS_PROPERTY("Frequency", "x", NULL, offsetof(Manager, drift_freq), 0), + + SD_BUS_METHOD_WITH_ARGS("SetRuntimeNTPServers", + SD_BUS_ARGS("as", runtime_servers), + SD_BUS_NO_RESULT, + method_set_runtime_servers, + SD_BUS_VTABLE_UNPRIVILEGED), + + SD_BUS_VTABLE_END +}; + +int manager_connect_bus(Manager *m) { + int r; + + assert(m); + + if (m->bus) + return 0; + + r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-timesync"); + if (r < 0) + return log_error_errno(r, "Failed to connect to bus: %m"); + + r = sd_bus_add_object_vtable(m->bus, NULL, "/org/freedesktop/timesync1", "org.freedesktop.timesync1.Manager", manager_vtable, m); + if (r < 0) + return log_error_errno(r, "Failed to add manager object vtable: %m"); + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.timesync1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + return 0; +} diff --git a/src/timesync/timesyncd-bus.h b/src/timesync/timesyncd-bus.h new file mode 100644 index 0000000..83db216 --- /dev/null +++ b/src/timesync/timesyncd-bus.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "timesyncd-manager.h" + +int manager_connect_bus(Manager *m); diff --git a/src/timesync/timesyncd-conf.c b/src/timesync/timesyncd-conf.c new file mode 100644 index 0000000..9c0b6f7 --- /dev/null +++ b/src/timesync/timesyncd-conf.c @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "constants.h" +#include "dns-domain.h" +#include "extract-word.h" +#include "string-util.h" +#include "timesyncd-conf.h" +#include "timesyncd-manager.h" +#include "timesyncd-server.h" + +int manager_parse_server_string(Manager *m, ServerType type, const char *string) { + 
ServerName *first; + int r; + + assert(m); + assert(string); + + first = type == SERVER_FALLBACK ? m->fallback_servers : m->system_servers; + + if (type == SERVER_FALLBACK) + m->have_fallbacks = true; + + for (;;) { + _cleanup_free_ char *word = NULL; + bool found = false; + + r = extract_first_word(&string, &word, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to parse timesyncd server syntax \"%s\": %m", string); + if (r == 0) + break; + + r = dns_name_is_valid_or_address(word); + if (r < 0) + return log_error_errno(r, "Failed to check validity of NTP server name or address '%s': %m", word); + if (r == 0) { + log_error("Invalid NTP server name or address, ignoring: %s", word); + continue; + } + + /* Filter out duplicates */ + LIST_FOREACH(names, n, first) + if (streq_ptr(n->string, word)) { + found = true; + break; + } + + if (found) + continue; + + r = server_name_new(m, NULL, type, word); + if (r < 0) + return r; + } + + return 0; +} + +int manager_parse_fallback_string(Manager *m, const char *string) { + if (m->have_fallbacks) + return 0; + + return manager_parse_server_string(m, SERVER_FALLBACK, string); +} + +int config_parse_servers( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Manager *m = userdata; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) + manager_flush_server_names(m, ltype); + else { + r = manager_parse_server_string(m, ltype, rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse NTP server string '%s', ignoring: %m", rvalue); + return 0; + } + } + + return 0; +} + +int manager_parse_config_file(Manager *m) { + int r; + + assert(m); + + r = config_parse_config_file("timesyncd.conf", "Time\0", + config_item_perf_lookup, timesyncd_gperf_lookup, + CONFIG_PARSE_WARN, m); + if (r < 0) + return r; + + 
if (m->poll_interval_min_usec < 16 * USEC_PER_SEC) { + log_warning("Invalid PollIntervalMinSec=. Using default value."); + m->poll_interval_min_usec = NTP_POLL_INTERVAL_MIN_USEC; + } + + if (m->poll_interval_max_usec < m->poll_interval_min_usec) { + log_warning("PollIntervalMaxSec= is smaller than PollIntervalMinSec=. Using default value."); + m->poll_interval_max_usec = MAX(NTP_POLL_INTERVAL_MAX_USEC, m->poll_interval_min_usec * 32); + } + + if (m->connection_retry_usec < 1 * USEC_PER_SEC) { + log_warning("Invalid ConnectionRetrySec=. Using default value."); + m->connection_retry_usec = DEFAULT_CONNECTION_RETRY_USEC; + } + + return r; +} diff --git a/src/timesync/timesyncd-conf.h b/src/timesync/timesyncd-conf.h new file mode 100644 index 0000000..d6b9060 --- /dev/null +++ b/src/timesync/timesyncd-conf.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "conf-parser.h" +#include "timesyncd-manager.h" + +const struct ConfigPerfItem* timesyncd_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +int manager_parse_server_string(Manager *m, ServerType type, const char *string); + +CONFIG_PARSER_PROTOTYPE(config_parse_servers); + +int manager_parse_config_file(Manager *m); +int manager_parse_fallback_string(Manager *m, const char *string); diff --git a/src/timesync/timesyncd-gperf.gperf b/src/timesync/timesyncd-gperf.gperf new file mode 100644 index 0000000..731dea1 --- /dev/null +++ b/src/timesync/timesyncd-gperf.gperf @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "timesyncd-conf.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name timesyncd_gperf_hash +%define lookup-function-name timesyncd_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Time.NTP, 
config_parse_servers, SERVER_SYSTEM, 0 +Time.Servers, config_parse_servers, SERVER_SYSTEM, 0 +Time.FallbackNTP, config_parse_servers, SERVER_FALLBACK, 0 +Time.RootDistanceMaxSec, config_parse_sec, 0, offsetof(Manager, root_distance_max_usec) +Time.PollIntervalMinSec, config_parse_sec, 0, offsetof(Manager, poll_interval_min_usec) +Time.PollIntervalMaxSec, config_parse_sec, 0, offsetof(Manager, poll_interval_max_usec) +Time.ConnectionRetrySec, config_parse_sec, 0, offsetof(Manager, connection_retry_usec) +Time.SaveIntervalSec, config_parse_sec, 0, offsetof(Manager, save_time_interval_usec) diff --git a/src/timesync/timesyncd-manager.c b/src/timesync/timesyncd-manager.c new file mode 100644 index 0000000..1998ba9 --- /dev/null +++ b/src/timesync/timesyncd-manager.c @@ -0,0 +1,1287 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-daemon.h" +#include "sd-messages.h" + +#include "alloc-util.h" +#include "bus-polkit.h" +#include "common-signal.h" +#include "dns-domain.h" +#include "event-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "list.h" +#include "log.h" +#include "logarithm.h" +#include "network-util.h" +#include "ratelimit.h" +#include "resolve-private.h" +#include "random-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "time-util.h" +#include "timesyncd-conf.h" +#include "timesyncd-manager.h" +#include "user-util.h" + +#ifndef ADJ_SETOFFSET +#define ADJ_SETOFFSET 0x0100 /* add 'time' to current time */ +#endif + +/* Expected accuracy of time synchronization; used to adjust the poll interval */ +#define NTP_ACCURACY_SEC 0.2 + +/* + * Maximum delta in seconds which the system clock is gradually adjusted + * (slewed) to approach the network time. Deltas larger that this are set by + * letting the system time jump. The kernel's limit for adjtime is 0.5s. 
+ */ +#define NTP_MAX_ADJUST 0.4 + +/* Default of maximum acceptable root distance in microseconds. */ +#define NTP_ROOT_DISTANCE_MAX_USEC (5 * USEC_PER_SEC) + +/* Maximum number of missed replies before selecting another source. */ +#define NTP_MAX_MISSED_REPLIES 2 + +#define RATELIMIT_INTERVAL_USEC (10*USEC_PER_SEC) +#define RATELIMIT_BURST 10 + +#define TIMEOUT_USEC (10*USEC_PER_SEC) + +static int manager_arm_timer(Manager *m, usec_t next); +static int manager_clock_watch_setup(Manager *m); +static int manager_listen_setup(Manager *m); +static void manager_listen_stop(Manager *m); +static int manager_save_time_and_rearm(Manager *m, usec_t t); + +static double ntp_ts_short_to_d(const struct ntp_ts_short *ts) { + return be16toh(ts->sec) + (be16toh(ts->frac) / 65536.0); +} + +static double ntp_ts_to_d(const struct ntp_ts *ts) { + return be32toh(ts->sec) + ((double)be32toh(ts->frac) / UINT_MAX); +} + +static double ts_to_d(const struct timespec *ts) { + return ts->tv_sec + (1.0e-9 * ts->tv_nsec); +} + +static int manager_timeout(sd_event_source *source, usec_t usec, void *userdata) { + _cleanup_free_ char *pretty = NULL; + Manager *m = ASSERT_PTR(userdata); + + assert(m->current_server_name); + assert(m->current_server_address); + + server_address_pretty(m->current_server_address, &pretty); + log_info("Timed out waiting for reply from %s (%s).", strna(pretty), m->current_server_name->string); + + return manager_connect(m); +} + +static int manager_send_request(Manager *m) { + _cleanup_free_ char *pretty = NULL; + struct ntp_msg ntpmsg = { + /* + * "The client initializes the NTP message header, sends the request + * to the server, and strips the time of day from the Transmit + * Timestamp field of the reply. For this purpose, all the NTP + * header fields are set to 0, except the Mode, VN, and optional + * Transmit Timestamp fields." 
+ */ + .field = NTP_FIELD(0, 4, NTP_MODE_CLIENT), + }; + ssize_t len; + int r; + + assert(m); + assert(m->current_server_name); + assert(m->current_server_address); + + m->event_timeout = sd_event_source_unref(m->event_timeout); + + r = manager_listen_setup(m); + if (r < 0) { + log_warning_errno(r, "Failed to set up connection socket: %m"); + return manager_connect(m); + } + + /* + * Generate a random number as transmit timestamp, to ensure we get + * a full 64 bits of entropy to make it hard for off-path attackers + * to inject random time to us. + */ + random_bytes(&m->request_nonce, sizeof(m->request_nonce)); + ntpmsg.trans_time = m->request_nonce; + + server_address_pretty(m->current_server_address, &pretty); + + /* + * Record the transmit timestamp. This should be as close as possible to + * the send-to to ensure the timestamp is reasonably accurate + */ + assert_se(clock_gettime(CLOCK_BOOTTIME, &m->trans_time_mon) >= 0); + assert_se(clock_gettime(CLOCK_REALTIME, &m->trans_time) >= 0); + + len = sendto(m->server_socket, &ntpmsg, sizeof(ntpmsg), MSG_DONTWAIT, &m->current_server_address->sockaddr.sa, m->current_server_address->socklen); + if (len == sizeof(ntpmsg)) { + m->pending = true; + log_debug("Sent NTP request to %s (%s).", strna(pretty), m->current_server_name->string); + } else { + log_debug_errno(errno, "Sending NTP request to %s (%s) failed: %m", strna(pretty), m->current_server_name->string); + return manager_connect(m); + } + + /* re-arm timer with increasing timeout, in case the packets never arrive back */ + if (m->retry_interval == 0) + m->retry_interval = NTP_RETRY_INTERVAL_MIN_USEC; + else + m->retry_interval = MIN(m->retry_interval * 4/3, NTP_RETRY_INTERVAL_MAX_USEC); + + r = manager_arm_timer(m, m->retry_interval); + if (r < 0) + return log_error_errno(r, "Failed to rearm timer: %m"); + + m->missed_replies++; + if (m->missed_replies > NTP_MAX_MISSED_REPLIES) { + r = sd_event_add_time( + m->event, + &m->event_timeout, + CLOCK_BOOTTIME, + 
/* Applies the measured clock offset (in seconds) to CLOCK_REALTIME via clock_adjtime().
 *
 * Offsets whose magnitude is below NTP_MAX_ADJUST are handed to the kernel as a gradual (slew)
 * correction; anything larger is applied as a single step, in which case m->jumped is set.
 * leap_sec is 1 to request insertion of a leap second, -1 to request deletion, and anything else
 * for no leap second.
 *
 * On success the frequency reported back by the kernel is stored in m->drift_freq and 0 is
 * returned; if clock_adjtime() fails, -errno is returned. */
static int manager_adjust_clock(Manager *m, double offset, int leap_sec) {
        struct timex tmx;

        assert(m);

        /* For small deltas, tell the kernel to gradually adjust the system clock to the NTP time, larger
         * deltas are just directly set. */
        if (fabs(offset) < NTP_MAX_ADJUST) {
                tmx = (struct timex) {
                        .modes = ADJ_STATUS | ADJ_NANO | ADJ_OFFSET | ADJ_TIMECONST | ADJ_MAXERROR | ADJ_ESTERROR,
                        .status = STA_PLL,
                        /* seconds -> nanoseconds, matching ADJ_NANO */
                        .offset = offset * NSEC_PER_SEC,
                        /* time constant derived from the current poll interval in seconds */
                        .constant = log2i(m->poll_interval_usec / USEC_PER_SEC) - 4,
                };

                log_debug(" adjust (slew): %+.3f sec", offset);
        } else {
                tmx = (struct timex) {
                        .modes = ADJ_STATUS | ADJ_NANO | ADJ_SETOFFSET | ADJ_MAXERROR | ADJ_ESTERROR,

                        /* ADJ_NANO uses nanoseconds in the microseconds field */
                        .time.tv_sec = (long)offset,
                        .time.tv_usec = (offset - (double) (long) offset) * NSEC_PER_SEC,
                };

                /* the kernel expects -0.3s as {-1, 700.000.000}, i.e. a non-negative sub-second part */
                if (tmx.time.tv_usec < 0) {
                        tmx.time.tv_sec -= 1;
                        tmx.time.tv_usec += NSEC_PER_SEC;
                }

                /* Remember that this step was ours; manager_clock_watch() checks this flag to skip
                 * the resulting time-change event. */
                m->jumped = true;
                log_debug(" adjust (jump): %+.3f sec", offset);
        }

        /* An unset STA_UNSYNC will enable the kernel's 11-minute mode, which syncs the system time
         * periodically to the RTC.
         *
         * In case the RTC runs in local time, never touch the RTC, we have no way to properly handle
         * daylight saving changes and mobile devices moving between time zones. */
        if (m->rtc_local_time)
                tmx.status |= STA_UNSYNC;

        /* Pass the leap second announcement on to the kernel; any other value leaves status as is. */
        switch (leap_sec) {
        case 1:
                tmx.status |= STA_INS;
                break;
        case -1:
                tmx.status |= STA_DEL;
                break;
        }

        if (clock_adjtime(CLOCK_REALTIME, &tmx) < 0)
                return -errno;

        /* Keep the kernel-reported frequency for later logging and the "Frequency" bus property. */
        m->drift_freq = tmx.freq;

        log_debug(" status : %04i %s\n"
                  " time now : %"PRI_TIME".%03"PRI_USEC"\n"
                  " constant : %"PRI_TIMEX"\n"
                  " offset : %+.3f sec\n"
                  " freq offset : %+"PRI_TIMEX" (%+"PRI_TIMEX" ppm)\n",
                  tmx.status, tmx.status & STA_UNSYNC ? "unsync" : "sync",
                  tmx.time.tv_sec, tmx.time.tv_usec / NSEC_PER_MSEC,
                  tmx.constant,
                  (double)tmx.offset / NSEC_PER_SEC,
                  tmx.freq, tmx.freq / 65536);

        return 0;
}
minimal poll interval */ + if (!spike && fabs(offset) > NTP_ACCURACY_SEC) { + m->poll_interval_usec = m->poll_interval_min_usec; + return; + } + + /* increase polling interval */ + if (fabs(offset) < NTP_ACCURACY_SEC * 0.25) { + if (m->poll_interval_usec < m->poll_interval_max_usec) + m->poll_interval_usec *= 2; + return; + } + + /* decrease polling interval */ + if (spike || fabs(offset) > NTP_ACCURACY_SEC * 0.75) { + if (m->poll_interval_usec > m->poll_interval_min_usec) + m->poll_interval_usec /= 2; + return; + } +} + +static int manager_receive_response(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + struct ntp_msg ntpmsg; + + struct iovec iov = { + .iov_base = &ntpmsg, + .iov_len = sizeof(ntpmsg), + }; + /* This needs to be initialized with zero. See #20741. */ + CMSG_BUFFER_TYPE(CMSG_SPACE_TIMESPEC) control = {}; + union sockaddr_union server_addr; + struct msghdr msghdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + .msg_name = &server_addr, + .msg_namelen = sizeof(server_addr), + }; + struct timespec *recv_time; + triple_timestamp dts; + ssize_t len; + double origin, receive, trans, dest, delay, offset, root_distance; + bool spike; + int leap_sec, r; + + assert(source); + + if (revents & (EPOLLHUP|EPOLLERR)) { + log_warning("Server connection returned error."); + return manager_connect(m); + } + + len = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT); + if (len == -EAGAIN) + return 0; + if (len < 0) { + log_warning_errno(len, "Error receiving message, disconnecting: %m"); + return manager_connect(m); + } + + /* Too short or too long packet? */ + if (iov.iov_len < sizeof(struct ntp_msg) || (msghdr.msg_flags & MSG_TRUNC)) { + log_warning("Invalid response from server. 
Disconnecting."); + return manager_connect(m); + } + + if (!m->current_server_name || + !m->current_server_address || + !sockaddr_equal(&server_addr, &m->current_server_address->sockaddr)) { + log_debug("Response from unknown server."); + return 0; + } + + recv_time = CMSG_FIND_AND_COPY_DATA(&msghdr, SOL_SOCKET, SCM_TIMESTAMPNS, struct timespec); + if (!recv_time) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Packet timestamp missing."); + + if (!m->pending) { + log_debug("Unexpected reply. Ignoring."); + return 0; + } + + m->missed_replies = 0; + + /* check the transmit request nonce was properly returned in the origin_time field */ + if (ntpmsg.origin_time.sec != m->request_nonce.sec || ntpmsg.origin_time.frac != m->request_nonce.frac) { + log_debug("Invalid reply; not our transmit time. Ignoring."); + return 0; + } + + m->event_timeout = sd_event_source_unref(m->event_timeout); + + if (be32toh(ntpmsg.recv_time.sec) < TIME_EPOCH + OFFSET_1900_1970 || + be32toh(ntpmsg.trans_time.sec) < TIME_EPOCH + OFFSET_1900_1970) { + log_debug("Invalid reply, returned times before epoch. Ignoring."); + return manager_connect(m); + } + + if (NTP_FIELD_LEAP(ntpmsg.field) == NTP_LEAP_NOTINSYNC || + ntpmsg.stratum == 0 || ntpmsg.stratum >= 16) { + log_debug("Server is not synchronized. Disconnecting."); + return manager_connect(m); + } + + if (!IN_SET(NTP_FIELD_VERSION(ntpmsg.field), 3, 4)) { + log_debug("Response NTPv%d. Disconnecting.", NTP_FIELD_VERSION(ntpmsg.field)); + return manager_connect(m); + } + + if (NTP_FIELD_MODE(ntpmsg.field) != NTP_MODE_SERVER) { + log_debug("Unsupported mode %d. Disconnecting.", NTP_FIELD_MODE(ntpmsg.field)); + return manager_connect(m); + } + + root_distance = ntp_ts_short_to_d(&ntpmsg.root_delay) / 2 + ntp_ts_short_to_d(&ntpmsg.root_dispersion); + if (root_distance > (double) m->root_distance_max_usec / (double) USEC_PER_SEC) { + log_info("Server has too large root distance. 
Disconnecting."); + return manager_connect(m); + } + + /* valid packet */ + m->pending = false; + m->retry_interval = 0; + + /* Stop listening */ + manager_listen_stop(m); + + /* announce leap seconds */ + if (NTP_FIELD_LEAP(ntpmsg.field) & NTP_LEAP_PLUSSEC) + leap_sec = 1; + else if (NTP_FIELD_LEAP(ntpmsg.field) & NTP_LEAP_MINUSSEC) + leap_sec = -1; + else + leap_sec = 0; + + /* + * "Timestamp Name ID When Generated + * ------------------------------------------------------------ + * Originate Timestamp T1 time request sent by client + * Receive Timestamp T2 time request received by server + * Transmit Timestamp T3 time reply sent by server + * Destination Timestamp T4 time reply received by client + * + * The round-trip delay, d, and system clock offset, t, are defined as: + * d = (T4 - T1) - (T3 - T2) t = ((T2 - T1) + (T3 - T4)) / 2" + */ + origin = ts_to_d(&m->trans_time) + OFFSET_1900_1970; + receive = ntp_ts_to_d(&ntpmsg.recv_time); + trans = ntp_ts_to_d(&ntpmsg.trans_time); + dest = ts_to_d(recv_time) + OFFSET_1900_1970; + + offset = ((receive - origin) + (trans - dest)) / 2; + delay = (dest - origin) - (trans - receive); + + spike = manager_sample_spike_detection(m, offset, delay); + + manager_adjust_poll(m, offset, spike); + + log_debug("NTP response:\n" + " leap : %i\n" + " version : %i\n" + " mode : %i\n" + " stratum : %u\n" + " precision : %.6f sec (%i)\n" + " root distance: %.6f sec\n" + " reference : %.4s\n" + " origin : %.3f\n" + " receive : %.3f\n" + " transmit : %.3f\n" + " dest : %.3f\n" + " offset : %+.3f sec\n" + " delay : %+.3f sec\n" + " packet count : %"PRIu64"\n" + " jitter : %.3f%s\n" + " poll interval: " USEC_FMT "\n", + NTP_FIELD_LEAP(ntpmsg.field), + NTP_FIELD_VERSION(ntpmsg.field), + NTP_FIELD_MODE(ntpmsg.field), + ntpmsg.stratum, + exp2(ntpmsg.precision), ntpmsg.precision, + root_distance, + ntpmsg.stratum == 1 ? 
ntpmsg.refid : "n/a", + origin - OFFSET_1900_1970, + receive - OFFSET_1900_1970, + trans - OFFSET_1900_1970, + dest - OFFSET_1900_1970, + offset, delay, + m->packet_count, + m->samples_jitter, spike ? " spike" : "", + m->poll_interval_usec / USEC_PER_SEC); + + /* Get current monotonic/realtime clocks immediately before adjusting the latter */ + triple_timestamp_now(&dts); + + if (!spike) { + /* Fix up our idea of the time. */ + dts.realtime = (usec_t) (dts.realtime + offset * USEC_PER_SEC); + + r = manager_adjust_clock(m, offset, leap_sec); + if (r < 0) + log_error_errno(r, "Failed to call clock_adjtime(): %m"); + + (void) manager_save_time_and_rearm(m, dts.realtime); + + /* If touch fails, there isn't much we can do. Maybe it'll work next time. */ + r = touch("/run/systemd/timesync/synchronized"); + if (r < 0) + log_debug_errno(r, "Failed to touch /run/systemd/timesync/synchronized, ignoring: %m"); + } + + /* Save NTP response */ + m->ntpmsg = ntpmsg; + m->origin_time = m->trans_time; + m->dest_time = *recv_time; + m->spike = spike; + + log_debug("interval/delta/delay/jitter/drift " USEC_FMT "s/%+.3fs/%.3fs/%.3fs/%+"PRIi64"ppm%s", + m->poll_interval_usec / USEC_PER_SEC, offset, delay, m->samples_jitter, m->drift_freq / 65536, + spike ? 
" (ignored)" : ""); + + if (sd_bus_is_ready(m->bus) > 0) + (void) sd_bus_emit_properties_changed( + m->bus, + "/org/freedesktop/timesync1", + "org.freedesktop.timesync1.Manager", + "NTPMessage", + NULL); + + if (!m->talking) { + _cleanup_free_ char *pretty = NULL; + + m->talking = true; + + (void) server_address_pretty(m->current_server_address, &pretty); + + log_info("Contacted time server %s (%s).", strna(pretty), m->current_server_name->string); + (void) sd_notifyf(false, "STATUS=Contacted time server %s (%s).", strna(pretty), m->current_server_name->string); + } + + if (!spike && !m->synchronized) { + m->synchronized = true; + + log_struct(LOG_INFO, + LOG_MESSAGE("Initial clock synchronization to %s.", + FORMAT_TIMESTAMP_STYLE(dts.realtime, TIMESTAMP_US)), + "MESSAGE_ID=" SD_MESSAGE_TIME_SYNC_STR, + "MONOTONIC_USEC=" USEC_FMT, dts.monotonic, + "REALTIME_USEC=" USEC_FMT, dts.realtime, + "BOOTTIME_USEC=" USEC_FMT, dts.boottime); + } + + r = manager_arm_timer(m, m->poll_interval_usec); + if (r < 0) + return log_error_errno(r, "Failed to rearm timer: %m"); + + return 0; +} + +static int manager_listen_setup(Manager *m) { + union sockaddr_union addr = {}; + int r; + + assert(m); + + if (m->server_socket >= 0) + return 0; + + assert(!m->event_receive); + assert(m->current_server_address); + + addr.sa.sa_family = m->current_server_address->sockaddr.sa.sa_family; + + m->server_socket = socket(addr.sa.sa_family, SOCK_DGRAM | SOCK_CLOEXEC, 0); + if (m->server_socket < 0) + return -errno; + + r = bind(m->server_socket, &addr.sa, m->current_server_address->socklen); + if (r < 0) + return -errno; + + r = setsockopt_int(m->server_socket, SOL_SOCKET, SO_TIMESTAMPNS, true); + if (r < 0) + return r; + + (void) socket_set_option(m->server_socket, addr.sa.sa_family, IP_TOS, IPV6_TCLASS, IPTOS_DSCP_EF); + + return sd_event_add_io(m->event, &m->event_receive, m->server_socket, EPOLLIN, manager_receive_response, m); +} + +static void manager_listen_stop(Manager *m) { + assert(m); + 
+ m->event_receive = sd_event_source_unref(m->event_receive); + m->server_socket = safe_close(m->server_socket); +} + +static int manager_begin(Manager *m) { + _cleanup_free_ char *pretty = NULL; + int r; + + assert(m); + assert_return(m->current_server_name, -EHOSTUNREACH); + assert_return(m->current_server_address, -EHOSTUNREACH); + + m->talking = false; + m->missed_replies = NTP_MAX_MISSED_REPLIES; + if (m->poll_interval_usec == 0) + m->poll_interval_usec = m->poll_interval_min_usec; + + server_address_pretty(m->current_server_address, &pretty); + log_debug("Connecting to time server %s (%s).", strna(pretty), m->current_server_name->string); + (void) sd_notifyf(false, "STATUS=Connecting to time server %s (%s).", strna(pretty), m->current_server_name->string); + + r = manager_clock_watch_setup(m); + if (r < 0) + return r; + + return manager_send_request(m); +} + +void manager_set_server_name(Manager *m, ServerName *n) { + assert(m); + + if (m->current_server_name == n) + return; + + m->current_server_name = n; + m->current_server_address = NULL; + + manager_disconnect(m); + + if (n) + log_debug("Selected server %s.", n->string); +} + +void manager_set_server_address(Manager *m, ServerAddress *a) { + assert(m); + + if (m->current_server_address == a) + return; + + m->current_server_address = a; + /* If a is NULL, we are just clearing the address, without + * changing the name. Keep the existing name in that case. 
*/ + if (a) + m->current_server_name = a->name; + + manager_disconnect(m); + + if (a) { + _cleanup_free_ char *pretty = NULL; + server_address_pretty(a, &pretty); + log_debug("Selected address %s of server %s.", strna(pretty), a->name->string); + } +} + +static int manager_resolve_handler(sd_resolve_query *q, int ret, const struct addrinfo *ai, Manager *m) { + int r; + + assert(q); + assert(m); + assert(m->current_server_name); + + m->resolve_query = sd_resolve_query_unref(m->resolve_query); + + if (ret != 0) { + log_debug("Failed to resolve %s: %s", m->current_server_name->string, gai_strerror(ret)); + + /* Try next host */ + return manager_connect(m); + } + + for (; ai; ai = ai->ai_next) { + _cleanup_free_ char *pretty = NULL; + ServerAddress *a; + + assert(ai->ai_addr); + assert(ai->ai_addrlen >= offsetof(struct sockaddr, sa_data)); + + if (!IN_SET(ai->ai_addr->sa_family, AF_INET, AF_INET6)) { + log_debug("Ignoring unsuitable address protocol for %s.", m->current_server_name->string); + continue; + } + + r = server_address_new(m->current_server_name, &a, (const union sockaddr_union*) ai->ai_addr, ai->ai_addrlen); + if (r < 0) + return log_error_errno(r, "Failed to add server address: %m"); + + server_address_pretty(a, &pretty); + log_debug("Resolved address %s for %s.", pretty, m->current_server_name->string); + } + + if (!m->current_server_name->addresses) { + log_error("Failed to find suitable address for host %s.", m->current_server_name->string); + + /* Try next host */ + return manager_connect(m); + } + + manager_set_server_address(m, m->current_server_name->addresses); + + return manager_begin(m); +} + +static int manager_retry_connect(sd_event_source *source, usec_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + return manager_connect(m); +} + +int manager_connect(Manager *m) { + int r; + + assert(m); + + manager_disconnect(m); + + m->event_retry = sd_event_source_unref(m->event_retry); + if (!ratelimit_below(&m->ratelimit)) { + 
log_debug("Delaying attempts to contact servers."); + + r = sd_event_add_time_relative(m->event, &m->event_retry, CLOCK_BOOTTIME, m->connection_retry_usec, + 0, manager_retry_connect, m); + if (r < 0) + return log_error_errno(r, "Failed to create retry timer: %m"); + + return 0; + } + + /* If we already are operating on some address, switch to the + * next one. */ + if (m->current_server_address && m->current_server_address->addresses_next) + manager_set_server_address(m, m->current_server_address->addresses_next); + else { + /* Hmm, we are through all addresses, let's look for the next host instead */ + if (m->current_server_name && m->current_server_name->names_next) + manager_set_server_name(m, m->current_server_name->names_next); + else { + ServerName *f; + bool restart = true; + + /* Our current server name list is exhausted, + * let's find the next one to iterate. First we try the runtime list, then the system list, + * then the link list. After having processed the link list we jump back to the system list + * if no runtime server list. + * However, if all lists are empty, we change to the fallback list. 
*/ + if (!m->current_server_name || m->current_server_name->type == SERVER_LINK) { + f = m->runtime_servers; + if (!f) + f = m->system_servers; + if (!f) + f = m->link_servers; + } else { + f = m->link_servers; + if (f) + restart = false; + else { + f = m->runtime_servers; + if (!f) + f = m->system_servers; + } + } + + if (!f) + f = m->fallback_servers; + + if (!f) { + manager_set_server_name(m, NULL); + log_debug("No server found."); + return 0; + } + + if (restart && !m->exhausted_servers && m->poll_interval_usec > 0) { + log_debug("Waiting after exhausting servers."); + r = sd_event_add_time_relative(m->event, &m->event_retry, CLOCK_BOOTTIME, m->poll_interval_usec, 0, manager_retry_connect, m); + if (r < 0) + return log_error_errno(r, "Failed to create retry timer: %m"); + + m->exhausted_servers = true; + + /* Increase the polling interval */ + if (m->poll_interval_usec < m->poll_interval_max_usec) + m->poll_interval_usec *= 2; + + return 0; + } + + m->exhausted_servers = false; + + manager_set_server_name(m, f); + } + + /* Tell the resolver to reread /etc/resolv.conf, in + * case it changed. */ + res_init(); + + /* Flush out any previously resolved addresses */ + server_name_flush_addresses(m->current_server_name); + + log_debug("Resolving %s...", m->current_server_name->string); + + struct addrinfo hints = { + .ai_flags = AI_NUMERICSERV|AI_ADDRCONFIG, + .ai_socktype = SOCK_DGRAM, + .ai_family = socket_ipv6_is_supported() ? 
AF_UNSPEC : AF_INET, + }; + + r = resolve_getaddrinfo(m->resolve, &m->resolve_query, m->current_server_name->string, "123", &hints, manager_resolve_handler, NULL, m); + if (r < 0) + return log_error_errno(r, "Failed to create resolver: %m"); + + return 1; + } + + r = manager_begin(m); + if (r < 0) + return r; + + return 1; +} + +void manager_disconnect(Manager *m) { + assert(m); + + m->resolve_query = sd_resolve_query_unref(m->resolve_query); + + m->event_timer = sd_event_source_unref(m->event_timer); + + manager_listen_stop(m); + + m->event_clock_watch = sd_event_source_disable_unref(m->event_clock_watch); + + m->event_timeout = sd_event_source_unref(m->event_timeout); + + (void) sd_notify(false, "STATUS=Idle."); +} + +void manager_flush_server_names(Manager *m, ServerType t) { + assert(m); + + if (t == SERVER_SYSTEM) + while (m->system_servers) + server_name_free(m->system_servers); + + if (t == SERVER_LINK) + while (m->link_servers) + server_name_free(m->link_servers); + + if (t == SERVER_FALLBACK) + while (m->fallback_servers) + server_name_free(m->fallback_servers); + + if (t == SERVER_RUNTIME) + manager_flush_runtime_servers(m); +} + +void manager_flush_runtime_servers(Manager *m) { + assert(m); + + while (m->runtime_servers) + server_name_free(m->runtime_servers); +} + +Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + manager_disconnect(m); + manager_flush_server_names(m, SERVER_SYSTEM); + manager_flush_server_names(m, SERVER_LINK); + manager_flush_server_names(m, SERVER_RUNTIME); + manager_flush_server_names(m, SERVER_FALLBACK); + + sd_event_source_unref(m->event_retry); + + sd_event_source_unref(m->network_event_source); + sd_network_monitor_unref(m->network_monitor); + + sd_event_source_unref(m->event_save_time); + + sd_event_source_unref(m->deferred_ntp_server_event_source); + + sd_resolve_unref(m->resolve); + sd_event_unref(m->event); + + sd_bus_flush_close_unref(m->bus); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + + 
return mfree(m); +} + +static int manager_network_read_link_servers(Manager *m) { + _cleanup_strv_free_ char **ntp = NULL; + bool changed = false; + int r; + + assert(m); + + r = sd_network_get_ntp(&ntp); + if (r < 0 && r != -ENODATA) { + if (r == -ENOMEM) + log_oom(); + else + log_debug_errno(r, "Failed to get link NTP servers: %m"); + goto clear; + } + + LIST_FOREACH(names, n, m->link_servers) + n->marked = true; + + STRV_FOREACH(i, ntp) { + bool found = false; + + r = dns_name_is_valid_or_address(*i); + if (r < 0) { + log_error_errno(r, "Failed to check validity of NTP server name or address '%s': %m", *i); + goto clear; + } else if (r == 0) { + log_error("Invalid NTP server name or address, ignoring: %s", *i); + continue; + } + + LIST_FOREACH(names, n, m->link_servers) + if (streq(n->string, *i)) { + n->marked = false; + found = true; + break; + } + + if (!found) { + r = server_name_new(m, NULL, SERVER_LINK, *i); + if (r < 0) { + log_oom(); + goto clear; + } + + changed = true; + } + } + + LIST_FOREACH(names, n, m->link_servers) + if (n->marked) { + server_name_free(n); + changed = true; + } + + return changed; + +clear: + manager_flush_server_names(m, SERVER_LINK); + return r; +} + +bool manager_is_connected(Manager *m) { + assert(m); + + /* Return true when the manager is sending a request, resolving a server name, or + * in a poll interval. */ + return m->server_socket >= 0 || m->resolve_query || m->event_timer; +} + +static int manager_network_event_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + bool changed, connected, online; + int r; + + sd_network_monitor_flush(m->network_monitor); + + /* When manager_network_read_link_servers() failed, we assume that the servers are changed. 
*/ + changed = manager_network_read_link_servers(m); + + /* check if the machine is online */ + online = network_is_online(); + + /* check if the client is currently connected */ + connected = manager_is_connected(m); + + if (connected && !online) { + log_info("No network connectivity, watching for changes."); + manager_disconnect(m); + + } else if ((!connected || changed) && online) { + log_info("Network configuration changed, trying to establish connection."); + + if (m->current_server_address) + r = manager_begin(m); + else + r = manager_connect(m); + if (r < 0) + return r; + } + + return 0; +} + +static int manager_network_monitor_listen(Manager *m) { + int r, fd, events; + + assert(m); + + r = sd_network_monitor_new(&m->network_monitor, NULL); + if (r == -ENOENT) { + log_info("systemd does not appear to be running, not listening for systemd-networkd events."); + return 0; + } + if (r < 0) + return r; + + fd = sd_network_monitor_get_fd(m->network_monitor); + if (fd < 0) + return fd; + + events = sd_network_monitor_get_events(m->network_monitor); + if (events < 0) + return events; + + r = sd_event_add_io(m->event, &m->network_event_source, fd, events, manager_network_event_handler, m); + if (r < 0) + return r; + + return 0; +} + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .root_distance_max_usec = NTP_ROOT_DISTANCE_MAX_USEC, + .poll_interval_min_usec = NTP_POLL_INTERVAL_MIN_USEC, + .poll_interval_max_usec = NTP_POLL_INTERVAL_MAX_USEC, + + .connection_retry_usec = DEFAULT_CONNECTION_RETRY_USEC, + + .server_socket = -EBADF, + + .ratelimit = (const RateLimit) { + RATELIMIT_INTERVAL_USEC, + RATELIMIT_BURST + }, + + .save_time_interval_usec = DEFAULT_SAVE_TIME_INTERVAL_USEC, + }; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + (void) 
sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + (void) sd_event_add_signal(m->event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + (void) sd_event_set_watchdog(m->event, true); + + /* Load previous synchronization state */ + r = access("/run/systemd/timesync/synchronized", F_OK); + if (r < 0 && errno != ENOENT) + log_debug_errno(errno, "Failed to determine whether /run/systemd/timesync/synchronized exists, ignoring: %m"); + m->synchronized = r >= 0; + + r = sd_resolve_default(&m->resolve); + if (r < 0) + return r; + + r = sd_resolve_attach_event(m->resolve, m->event, 0); + if (r < 0) + return r; + + r = manager_network_monitor_listen(m); + if (r < 0) + return r; + + (void) manager_network_read_link_servers(m); + + *ret = TAKE_PTR(m); + + return 0; +} + +static int manager_save_time_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + (void) manager_save_time_and_rearm(m, USEC_INFINITY); + return 0; +} + +int manager_setup_save_time_event(Manager *m) { + int r; + + assert(m); + assert(!m->event_save_time); + + if (m->save_time_interval_usec == USEC_INFINITY) + return 0; + + /* NB: we'll accumulate scheduling latencies here, but this doesn't matter */ + r = sd_event_add_time_relative( + m->event, &m->event_save_time, + CLOCK_BOOTTIME, + m->save_time_interval_usec, + 10 * USEC_PER_SEC, + manager_save_time_handler, m); + if (r < 0) + return log_error_errno(r, "Failed to add save time event: %m"); + + (void) sd_event_source_set_description(m->event_save_time, "save-time"); + + return 0; +} + +static int manager_save_time_and_rearm(Manager *m, usec_t t) { + int r; + + assert(m); + + /* Updates the timestamp file to the specified time. If 't' is USEC_INFINITY uses the current system + * clock, but otherwise uses the specified timestamp. 
Note that whenever we acquire an NTP sync the + * specified timestamp value might be more accurate than the system clock, since the latter is + * subject to slow adjustments. */ + r = touch_file(CLOCK_FILE, false, t, UID_INVALID, GID_INVALID, MODE_INVALID); + if (r < 0) + log_debug_errno(r, "Failed to update " CLOCK_FILE ", ignoring: %m"); + + m->save_on_exit = true; + + if (m->save_time_interval_usec != USEC_INFINITY) { + r = sd_event_source_set_time_relative(m->event_save_time, m->save_time_interval_usec); + if (r < 0) + return log_error_errno(r, "Failed to rearm save time event: %m"); + + r = sd_event_source_set_enabled(m->event_save_time, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable save time event: %m"); + } + + return 0; +} + +static const char* ntp_server_property_name[_SERVER_TYPE_MAX] = { + [SERVER_SYSTEM] = "SystemNTPServers", + [SERVER_FALLBACK] = "FallbackNTPServers", + [SERVER_LINK] = "LinkNTPServers", + [SERVER_RUNTIME] = "RuntimeNTPServers", +}; + +static int ntp_server_emit_changed_strv(Manager *manager, char **properties) { + assert(manager); + assert(properties); + + if (sd_bus_is_ready(manager->bus) <= 0) + return 0; + + return sd_bus_emit_properties_changed_strv( + manager->bus, + "/org/freedesktop/timesync1", + "org.freedesktop.timesync1.Manager", + properties); +} + +static int on_deferred_ntp_server(sd_event_source *s, void *userdata) { + int r; + _cleanup_strv_free_ char **p = NULL; + Manager *m = ASSERT_PTR(userdata); + + m->deferred_ntp_server_event_source = sd_event_source_disable_unref(m->deferred_ntp_server_event_source); + + for (int type = SERVER_SYSTEM; type < _SERVER_TYPE_MAX; type++) + if (m->ntp_server_change_mask & (1U << type)) + if (strv_extend(&p, ntp_server_property_name[type]) < 0) + log_oom(); + + m->ntp_server_change_mask = 0; + + if (strv_isempty(p)) + return log_error_errno(SYNTHETIC_ERRNO(ENOMEM), "Failed to build ntp server event strv!"); + + r = ntp_server_emit_changed_strv(m, p); + if 
(r < 0) + log_warning_errno(r, "Could not emit ntp server changed properties, ignoring: %m"); + + return 0; +} + +int bus_manager_emit_ntp_server_changed(Manager *m) { + int r; + + assert(m); + + if (m->deferred_ntp_server_event_source) + return 0; + + if (!m->event) + return 0; + + if (IN_SET(sd_event_get_state(m->event), SD_EVENT_FINISHED, SD_EVENT_EXITING)) + return 0; + + r = sd_event_add_defer(m->event, &m->deferred_ntp_server_event_source, on_deferred_ntp_server, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate ntp server event source: %m"); + + (void) sd_event_source_set_description(m->deferred_ntp_server_event_source, "deferred-ntp-server"); + + return 1; +} diff --git a/src/timesync/timesyncd-manager.h b/src/timesync/timesyncd-manager.h new file mode 100644 index 0000000..f444787 --- /dev/null +++ b/src/timesync/timesyncd-manager.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "sd-bus.h" +#include "sd-event.h" +#include "sd-network.h" +#include "sd-resolve.h" + +#include "hashmap.h" +#include "list.h" +#include "ratelimit.h" +#include "time-util.h" +#include "timesyncd-ntp-message.h" + +typedef struct Manager Manager; + +#include "timesyncd-server.h" + +/* + * "A client MUST NOT under any conditions use a poll interval less + * than 15 seconds." 
+ */ +#define NTP_POLL_INTERVAL_MIN_USEC (32 * USEC_PER_SEC) +#define NTP_POLL_INTERVAL_MAX_USEC (2048 * USEC_PER_SEC) + +#define NTP_RETRY_INTERVAL_MIN_USEC (15 * USEC_PER_SEC) +#define NTP_RETRY_INTERVAL_MAX_USEC (6 * 60 * USEC_PER_SEC) /* 6 minutes */ + +#define DEFAULT_CONNECTION_RETRY_USEC (30 * USEC_PER_SEC) + +#define DEFAULT_SAVE_TIME_INTERVAL_USEC (60 * USEC_PER_SEC) + +#define STATE_DIR "/var/lib/systemd/timesync" +#define CLOCK_FILE STATE_DIR "/clock" + +struct Manager { + sd_bus *bus; + sd_event *event; + sd_resolve *resolve; + + LIST_HEAD(ServerName, system_servers); + LIST_HEAD(ServerName, link_servers); + LIST_HEAD(ServerName, runtime_servers); + LIST_HEAD(ServerName, fallback_servers); + + bool have_fallbacks:1; + + RateLimit ratelimit; + bool exhausted_servers; + + /* network */ + sd_event_source *network_event_source; + sd_network_monitor *network_monitor; + + /* peer */ + sd_resolve_query *resolve_query; + sd_event_source *event_receive; + ServerName *current_server_name; + ServerAddress *current_server_address; + int server_socket; + int missed_replies; + uint64_t packet_count; + sd_event_source *event_timeout; + bool talking; + + /* PolicyKit */ + Hashmap *polkit_registry; + + /* last sent packet */ + struct timespec trans_time_mon; + struct timespec trans_time; + struct ntp_ts request_nonce; + usec_t retry_interval; + usec_t connection_retry_usec; + bool pending; + + /* poll timer */ + sd_event_source *event_timer; + usec_t poll_interval_usec; + usec_t poll_interval_min_usec; + usec_t poll_interval_max_usec; + bool poll_resync; + + /* history data */ + struct { + double offset; + double delay; + } samples[8]; + unsigned samples_idx; + double samples_jitter; + usec_t root_distance_max_usec; + + /* last change */ + bool jumped; + int64_t drift_freq; + + /* watch for time changes */ + sd_event_source *event_clock_watch; + + /* Retry connections */ + sd_event_source *event_retry; + + /* RTC runs in local time, leave it alone */ + bool 
rtc_local_time; + + /* NTP response */ + struct ntp_msg ntpmsg; + struct timespec origin_time, dest_time; + bool spike; + + /* Indicates whether we ever managed to set the local clock from NTP */ + bool synchronized; + + /* save time event */ + sd_event_source *event_save_time; + usec_t save_time_interval_usec; + bool save_on_exit; + + /* Used to coalesce bus PropertiesChanged events */ + sd_event_source *deferred_ntp_server_event_source; + unsigned ntp_server_change_mask; +}; + +int manager_new(Manager **ret); +Manager* manager_free(Manager *m); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +void manager_set_server_name(Manager *m, ServerName *n); +void manager_set_server_address(Manager *m, ServerAddress *a); +void manager_flush_server_names(Manager *m, ServerType t); +void manager_flush_runtime_servers(Manager *m); + +int manager_connect(Manager *m); +void manager_disconnect(Manager *m); +bool manager_is_connected(Manager *m); + +int manager_setup_save_time_event(Manager *m); + +int bus_manager_emit_ntp_server_changed(Manager *m); diff --git a/src/timesync/timesyncd-ntp-message.h b/src/timesync/timesyncd-ntp-message.h new file mode 100644 index 0000000..76ed9ec --- /dev/null +++ b/src/timesync/timesyncd-ntp-message.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sparse-endian.h" + +/* NTP protocol, packet header */ +#define NTP_LEAP_PLUSSEC 1 +#define NTP_LEAP_MINUSSEC 2 +#define NTP_LEAP_NOTINSYNC 3 +#define NTP_MODE_CLIENT 3 +#define NTP_MODE_SERVER 4 +#define NTP_FIELD_LEAP(f) (((f) >> 6) & 3) +#define NTP_FIELD_VERSION(f) (((f) >> 3) & 7) +#define NTP_FIELD_MODE(f) ((f) & 7) +#define NTP_FIELD(l, v, m) (((l) << 6) | ((v) << 3) | (m)) + +/* + * "NTP timestamps are represented as a 64-bit unsigned fixed-point number, + * in seconds relative to 0h on 1 January 1900." 
+ */ +#define OFFSET_1900_1970 UINT64_C(2208988800) + +struct ntp_ts { + be32_t sec; + be32_t frac; +} _packed_; + +struct ntp_ts_short { + be16_t sec; + be16_t frac; +} _packed_; + +struct ntp_msg { + uint8_t field; + uint8_t stratum; + int8_t poll; + int8_t precision; + struct ntp_ts_short root_delay; + struct ntp_ts_short root_dispersion; + char refid[4]; + struct ntp_ts reference_time; + struct ntp_ts origin_time; + struct ntp_ts recv_time; + struct ntp_ts trans_time; +} _packed_; diff --git a/src/timesync/timesyncd-server.c b/src/timesync/timesyncd-server.c new file mode 100644 index 0000000..0f68203 --- /dev/null +++ b/src/timesync/timesyncd-server.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "string-table.h" +#include "timesyncd-server.h" + +static const char * const server_type_table[_SERVER_TYPE_MAX] = { + [SERVER_SYSTEM] = "system", + [SERVER_FALLBACK] = "fallback", + [SERVER_LINK] = "link", + [SERVER_RUNTIME] = "runtime", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(server_type, ServerType); + +int server_address_new( + ServerName *n, + ServerAddress **ret, + const union sockaddr_union *sockaddr, + socklen_t socklen) { + + ServerAddress *a, *tail; + + assert(n); + assert(sockaddr); + assert(socklen >= offsetof(struct sockaddr, sa_data)); + assert(socklen <= sizeof(union sockaddr_union)); + + a = new(ServerAddress, 1); + if (!a) + return -ENOMEM; + + *a = (ServerAddress) { + .name = n, + .socklen = socklen, + }; + + memcpy(&a->sockaddr, sockaddr, socklen); + + tail = LIST_FIND_TAIL(addresses, n->addresses); + LIST_INSERT_AFTER(addresses, n->addresses, tail, a); + + if (ret) + *ret = a; + + return 0; +} + +ServerAddress* server_address_free(ServerAddress *a) { + if (!a) + return NULL; + + if (a->name) { + LIST_REMOVE(addresses, a->name->addresses, a); + + if (a->name->manager && a->name->manager->current_server_address == a) + manager_set_server_address(a->name->manager, NULL); + } + + 
return mfree(a); +} + +static int enable_ntp_server_defer_event(Manager *m, ServerType type) { + int r; + + assert(m); + assert((type >= 0) && (type < _SERVER_TYPE_MAX)); + + m->ntp_server_change_mask |= 1U << type; + + r = bus_manager_emit_ntp_server_changed(m); + if (r < 0) + return r; + + return 1; +} + +int server_name_new( + Manager *m, + ServerName **ret, + ServerType type, + const char *string) { + int r; + ServerName *n; + + assert(m); + assert(string); + + n = new(ServerName, 1); + if (!n) + return -ENOMEM; + + *n = (ServerName) { + .manager = m, + .type = type, + .string = strdup(string), + }; + + if (!n->string) { + free(n); + return -ENOMEM; + } + + switch (type) { + case SERVER_SYSTEM: + LIST_APPEND(names, m->system_servers, n); + break; + case SERVER_LINK: + LIST_APPEND(names, m->link_servers, n); + break; + case SERVER_FALLBACK: + LIST_APPEND(names, m->fallback_servers, n); + break; + case SERVER_RUNTIME: + LIST_APPEND(names, m->runtime_servers, n); + break; + default: + assert_not_reached(); + } + + r = enable_ntp_server_defer_event(m, type); + if (r < 0) + log_debug_errno(r, "Failed to enable ntp server defer event, ignoring: %m"); + + if (type != SERVER_FALLBACK && + m->current_server_name && + m->current_server_name->type == SERVER_FALLBACK) + manager_set_server_name(m, NULL); + + log_debug("Added new %s server %s.", server_type_to_string(type), string); + + if (ret) + *ret = n; + + return 0; +} + +ServerName *server_name_free(ServerName *n) { + int r; + + if (!n) + return NULL; + + server_name_flush_addresses(n); + + if (n->manager) { + if (n->type == SERVER_SYSTEM) + LIST_REMOVE(names, n->manager->system_servers, n); + else if (n->type == SERVER_LINK) + LIST_REMOVE(names, n->manager->link_servers, n); + else if (n->type == SERVER_FALLBACK) + LIST_REMOVE(names, n->manager->fallback_servers, n); + else if (n->type == SERVER_RUNTIME) + LIST_REMOVE(names, n->manager->runtime_servers, n); + else + assert_not_reached(); + + r = 
enable_ntp_server_defer_event(n->manager, n->type); + if (r < 0) + log_debug_errno(r, "Failed to enable ntp server defer event, ignoring: %m"); + + if (n->manager->current_server_name == n) + manager_set_server_name(n->manager, NULL); + } + + log_debug("Removed server %s.", n->string); + + free(n->string); + return mfree(n); +} + +void server_name_flush_addresses(ServerName *n) { + assert(n); + + while (n->addresses) + server_address_free(n->addresses); +} diff --git a/src/timesync/timesyncd-server.h b/src/timesync/timesyncd-server.h new file mode 100644 index 0000000..e22917a --- /dev/null +++ b/src/timesync/timesyncd-server.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "list.h" +#include "socket-util.h" + +typedef struct ServerAddress ServerAddress; +typedef struct ServerName ServerName; + +typedef enum ServerType { + SERVER_SYSTEM, + SERVER_FALLBACK, + SERVER_LINK, + SERVER_RUNTIME, + _SERVER_TYPE_MAX, + _SERVER_TYPE_INVALID = -EINVAL, +} ServerType; + +#include "timesyncd-manager.h" + +struct ServerAddress { + ServerName *name; + + union sockaddr_union sockaddr; + socklen_t socklen; + + LIST_FIELDS(ServerAddress, addresses); +}; + +struct ServerName { + Manager *manager; + + ServerType type; + char *string; + + bool marked:1; + + LIST_HEAD(ServerAddress, addresses); + LIST_FIELDS(ServerName, names); +}; + +int server_address_new(ServerName *n, ServerAddress **ret, const union sockaddr_union *sockaddr, socklen_t socklen); +ServerAddress* server_address_free(ServerAddress *a); +static inline int server_address_pretty(ServerAddress *a, char **pretty) { + return sockaddr_pretty(&a->sockaddr.sa, a->socklen, true, true, pretty); +} + +int server_name_new(Manager *m, ServerName **ret, ServerType type,const char *string); +ServerName *server_name_free(ServerName *n); +void server_name_flush_addresses(ServerName *n); diff --git a/src/timesync/timesyncd.c b/src/timesync/timesyncd.c new file mode 100644 index 
0000000..1d8ebec --- /dev/null +++ b/src/timesync/timesyncd.c @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-daemon.h" +#include "sd-event.h" +#include "sd-messages.h" + +#include "capability-util.h" +#include "clock-util.h" +#include "daemon-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "network-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "timesyncd-bus.h" +#include "timesyncd-conf.h" +#include "timesyncd-manager.h" +#include "user-util.h" + +static int advance_tstamp(int fd, const struct stat *st) { + assert_se(fd >= 0); + assert_se(st); + + /* So here's the problem: whenever we read the timestamp we'd like to ensure the next time we won't + * restore the exact same time again, but one at least one step further (so that comparing mtimes of + * the timestamp file is a reliable check that timesync did its thing). But file systems have + * different timestamp accuracy: traditional fat has 2s granularity, and even ext2 and friends expose + * different granularity depending on selected inode size during formatting! Hence, to ensure the + * timestamp definitely is increased, here's what we'll do: we'll first try to increase the timestamp + * by 1μs, write that and read it back. If it was updated, great. But if it was not, we'll instead + * increase the timestamp by 10μs, and do the same, then 100μs, then 1ms, and so on, until it works, + * or we reach 10s. If it still didn't work then, the fs is just broken and we give up. 
*/ + + usec_t target = MAX3(now(CLOCK_REALTIME), + TIME_EPOCH * USEC_PER_SEC, + timespec_load(&st->st_mtim)); + + for (usec_t a = 1; a <= 10 * USEC_PER_SEC; a *= 10) { /* 1μs, 10μs, 100μs, 1ms, … 10s */ + struct timespec ts[2]; + struct stat new_st; + + /* Bump to the maximum of the old timestamp advanced by the specified unit, */ + usec_t c = usec_add(target, a); + + timespec_store(&ts[0], c); + ts[1] = ts[0]; + + if (futimens(fd, ts) < 0) { + /* If this doesn't work at all, log, don't fail but give up */ + log_warning_errno(errno, "Unable to update mtime of timestamp file, ignoring: %m"); + return 0; + } + + if (fstat(fd, &new_st) < 0) + return log_error_errno(errno, "Failed to stat timestamp file: %m"); + + if (timespec_load(&new_st.st_mtim) > target) { + log_debug("Successfully bumped timestamp file."); + return 1; + } + + log_debug("Tried to advance timestamp file by " USEC_FMT ", but this didn't work, file system timestamp granularity too coarse?", a); + } + + log_debug("Gave up trying to advance timestamp file."); + return 0; +} + +static int load_clock_timestamp(uid_t uid, gid_t gid) { + usec_t min = TIME_EPOCH * USEC_PER_SEC, ct; + _cleanup_close_ int fd = -EBADF; + int r; + + /* Let's try to make sure that the clock is always monotonically increasing, by saving the clock + * whenever we have a new NTP time, or when we shut down, and restoring it when we start again. This + * is particularly helpful on systems lacking a battery backed RTC. We also will adjust the time to + * at least the build time of systemd. 
*/ + + fd = open(CLOCK_FILE, O_RDWR|O_CLOEXEC, 0644); + if (fd < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Unable to open timestamp file '" CLOCK_FILE "', ignoring: %m"); + + r = mkdir_safe_label(STATE_DIR, 0755, uid, gid, + MKDIR_FOLLOW_SYMLINK | MKDIR_WARN_MODE); + if (r < 0) + log_debug_errno(r, "Failed to create state directory, ignoring: %m"); + + /* create stamp file with the compiled-in date */ + r = touch_file(CLOCK_FILE, /* parents= */ false, min, uid, gid, 0644); + if (r < 0) + log_debug_errno(r, "Failed to create %s, ignoring: %m", CLOCK_FILE); + } else { + struct stat st; + usec_t stamp; + + /* check if the recorded time is later than the compiled-in one */ + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Unable to stat timestamp file '" CLOCK_FILE "': %m"); + + stamp = timespec_load(&st.st_mtim); + if (stamp > min) + min = stamp; + + /* Try to fix the access mode, so that we can still touch the file after dropping + * privileges */ + r = fchmod_and_chown(fd, 0644, uid, gid); + if (r < 0) + log_full_errno(ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to chmod or chown %s, ignoring: %m", CLOCK_FILE); + + (void) advance_tstamp(fd, &st); + } + + ct = now(CLOCK_REALTIME); + if (ct > min) + return 0; + + /* Not that it matters much, but we actually restore the clock to n+1 here rather than n, simply + * because we read n as time previously already and we want to progress here, i.e. not report the + * same time again. 
*/ + if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(min+1)) < 0) { + log_warning_errno(errno, "Failed to restore system clock, ignoring: %m"); + return 0; + } + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_TIME_BUMP_STR, + "REALTIME_USEC=" USEC_FMT, min+1, + LOG_MESSAGE("System clock time unset or jumped backwards, restored from recorded timestamp: %s", + FORMAT_TIMESTAMP(min+1))); + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *m = NULL; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = NULL; + const char *user = "systemd-timesync"; + uid_t uid, uid_current; + gid_t gid; + int r; + + log_set_facility(LOG_CRON); + log_setup(); + + umask(0022); + + if (argc != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program does not take arguments."); + + uid = uid_current = geteuid(); + gid = getegid(); + + if (uid_current == 0) { + r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); + if (r < 0) + return log_error_errno(r, "Cannot resolve user name %s: %m", user); + } + + r = load_clock_timestamp(uid, gid); + if (r < 0) + return r; + + /* Drop privileges, but only if we have been started as root. If we are not running as root we assume all + * privileges are already dropped. */ + if (uid_current == 0) { + r = drop_privileges(uid, gid, (1ULL << CAP_SYS_TIME)); + if (r < 0) + return log_error_errno(r, "Failed to drop privileges: %m"); + } + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to allocate manager: %m"); + + r = manager_connect_bus(m); + if (r < 0) + return log_error_errno(r, "Could not connect to bus: %m"); + + if (clock_is_localtime(NULL) > 0) { + log_info("The system is configured to read the RTC time in the local time zone. " + "This mode cannot be fully supported. 
All system time to RTC updates are disabled."); + m->rtc_local_time = true; + } + + r = manager_parse_config_file(m); + if (r < 0) + log_warning_errno(r, "Failed to parse configuration file: %m"); + + r = manager_parse_fallback_string(m, NTP_SERVERS); + if (r < 0) + return log_error_errno(r, "Failed to parse fallback server strings: %m"); + + log_debug("systemd-timesyncd running as pid " PID_FMT, getpid_cached()); + + notify_message = notify_start("READY=1\n" + "STATUS=Daemon is running", + NOTIFY_STOPPING); + + r = manager_setup_save_time_event(m); + if (r < 0) + return r; + + if (network_is_online()) { + r = manager_connect(m); + if (r < 0) + return r; + } + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + /* if we got an authoritative time, store it in the file system */ + if (m->save_on_exit) { + r = touch(CLOCK_FILE); + if (r < 0) + log_debug_errno(r, "Failed to touch " CLOCK_FILE ", ignoring: %m"); + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/timesync/timesyncd.conf.in b/src/timesync/timesyncd.conf.in new file mode 100644 index 0000000..6ef41cf --- /dev/null +++ b/src/timesync/timesyncd.conf.in @@ -0,0 +1,26 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file (or a copy of it placed in +# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in +# the /etc/systemd/timesyncd.conf.d/ directory. The latter is generally +# recommended. Defaults can be restored by simply deleting the main +# configuration file and all drop-ins located in /etc/. 
+# +# Use 'systemd-analyze cat-config systemd/timesyncd.conf' to display the full config. +# +# See timesyncd.conf(5) for details. + +[Time] +#NTP= +#FallbackNTP={{NTP_SERVERS}} +#RootDistanceMaxSec=5 +#PollIntervalMinSec=32 +#PollIntervalMaxSec=2048 +#ConnectionRetrySec=30 +#SaveIntervalSec=60 diff --git a/src/timesync/wait-sync.c b/src/timesync/wait-sync.c new file mode 100644 index 0000000..832e117 --- /dev/null +++ b/src/timesync/wait-sync.c @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* systemd service to wait until kernel realtime clock is synchronized */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-event.h" + +#include "fd-util.h" +#include "inotify-util.h" +#include "main-func.h" +#include "signal-util.h" +#include "time-util.h" + +typedef struct ClockState { + int timerfd_fd; /* non-negative is descriptor from timerfd_create */ + int adjtime_state; /* return value from last adjtimex(2) call */ + sd_event_source *timerfd_event_source; /* non-null is the active io event source */ + int inotify_fd; /* inotify descriptor created in run() */ + sd_event_source *inotify_event_source; /* io event source that dispatches inotify_fd to inotify_handler() */ + int run_systemd_wd; /* watch descriptor for the /run/systemd/ directory */ + int run_systemd_timesync_wd; /* watch descriptor for /run/systemd/timesync; negative while that directory is not watched */ + bool has_watchfile; /* whether /run/systemd/timesync/synchronized was accessible at the last clock_state_update() */ +} ClockState; + +static void clock_state_release_timerfd(ClockState *sp) { /* Drops the timerfd event source and closes the timerfd. */ + sp->timerfd_event_source = sd_event_source_unref(sp->timerfd_event_source); + sp->timerfd_fd = safe_close(sp->timerfd_fd); +} + +static void clock_state_release(ClockState *sp) { /* Releases everything the state holds: the timerfd pair, then the inotify event source and descriptor. */ + clock_state_release_timerfd(sp); + sp->inotify_event_source = sd_event_source_unref(sp->inotify_event_source); + sp->inotify_fd = safe_close(sp->inotify_fd); +} + +static int clock_state_update(ClockState *sp, sd_event *event); + +static int update_notify_run_systemd_timesync(ClockState *sp) { /* Tries to add a watch on /run/systemd/timesync; stores and returns the inotify_add_watch() result (negative on failure). */ + sp->run_systemd_timesync_wd = inotify_add_watch(sp->inotify_fd, "/run/systemd/timesync", IN_CREATE|IN_DELETE_SELF); + return sp->run_systemd_timesync_wd; +} + +static int timerfd_handler(sd_event_source *s, + int fd, +
uint32_t revents, + void *userdata) { /* io callback for the time-change timerfd: re-evaluates the clock state. */ + ClockState *sp = userdata; + + return clock_state_update(sp, sd_event_source_get_event(s)); +} + +static void process_inotify_event(sd_event *event, ClockState *sp, struct inotify_event *e) { /* Handles one inotify event, dispatching on the watch descriptor it was reported for. */ + if (e->wd == sp->run_systemd_wd) { + /* Only thing we care about is seeing if we can start watching /run/systemd/timesync. */ + if (sp->run_systemd_timesync_wd < 0) + update_notify_run_systemd_timesync(sp); + } else if (e->wd == sp->run_systemd_timesync_wd) { + if (e->mask & IN_DELETE_SELF) { + /* Somebody removed /run/systemd/timesync. */ + (void) inotify_rm_watch(sp->inotify_fd, sp->run_systemd_timesync_wd); + sp->run_systemd_timesync_wd = -1; + } else + /* Somebody might have created /run/systemd/timesync/synchronized. */ + clock_state_update(sp, event); /* return value unused here: clock_state_update() itself requests the loop exit when it returns <= 0 */ + } +} + +static int inotify_handler(sd_event_source *s, + int fd, + uint32_t revents, + void *userdata) { /* io callback for the inotify descriptor: reads one buffer of events and passes each to process_inotify_event(). */ + sd_event *event = sd_event_source_get_event(s); + ClockState *sp = userdata; + union inotify_event_buffer buffer; + ssize_t l; + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return log_warning_errno(errno, "Lost access to inotify: %m"); + } + FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) + process_inotify_event(event, sp, e); + + return 0; +} + +static int clock_state_update( + ClockState *sp, + sd_event *event) { /* Re-creates the time-change timerfd and re-reads the synchronization state. Returns > 0 after arming a wait on the timerfd (not synchronized yet). Returns 0 (synchronized, or watch file present) or a negative value (failure) after asking the event loop to exit with that value. */ + + struct timex tx = {}; + usec_t t; + int r; + + clock_state_release_timerfd(sp); + + /* The kernel supports cancelling timers whenever its realtime clock is "set" (which can happen in a variety of + * ways, generally adjustments of at least 500 ms). The way this module works is we set up a timerfd that will + * wake when the clock is set, and when that happens we read the clock synchronization state from the return + * value of adjtimex(2), which supports the NTP time adjustment protocol.
+ * + * The kernel determines whether the clock is synchronized using driver-specific tests, based on time + * information passed by an application, generally through adjtimex(2). If the application asserts the clock is + * synchronized, but does not also do something that "sets the clock", the timer will not be cancelled and + * synchronization will not be detected. + * + * Similarly, this service will never complete if the application sets the time without also providing + * information that adjtimex(2) can use to determine that the clock is synchronized. This generally doesn't + * happen, but can if the system has a hardware clock that is accurate enough that the adjustment is too small + * to be a "set". + * + * Both these failure-to-detect situations are covered by having the presence/creation of + * /run/systemd/timesync/synchronized, which is considered sufficient to indicate a synchronized clock even if + * the kernel has not been updated. + * + * For timesyncd the initial setting of the time uses settimeofday(2), which sets the clock but does not mark + * it synchronized. When an NTP source is selected it sets the clock again with clock_adjtime(2) which marks it + * synchronized and also touches /run/systemd/timesync/synchronized which covers the case when the clock wasn't + * "set".
*/ + + r = time_change_fd(); + if (r < 0) { + log_error_errno(r, "Failed to create timerfd: %m"); + goto finish; + } + sp->timerfd_fd = r; + + r = adjtimex(&tx); + if (r < 0) { /* NOTE(review): r keeps the raw adjtimex() return value here rather than -errno, and that is the value the finish label passes to sd_event_exit() - confirm this is intended */ + log_error_errno(errno, "Failed to read adjtimex state: %m"); + goto finish; + } + sp->adjtime_state = r; + + if (tx.status & STA_NANO) /* presumably tv_usec carries nanoseconds when STA_NANO is set, hence the scaling before timeval_load() - see adjtimex(2) */ + tx.time.tv_usec /= 1000; + t = timeval_load(&tx.time); + + log_info("adjtime state %i status %x time %s", sp->adjtime_state, (unsigned) tx.status, + FORMAT_TIMESTAMP_STYLE(t, TIMESTAMP_US_UTC) ?: "unrepresentable"); + + sp->has_watchfile = access("/run/systemd/timesync/synchronized", F_OK) >= 0; + if (sp->has_watchfile) + /* Presence of watch file overrides adjtime_state */ + r = 0; + else if (sp->adjtime_state == TIME_ERROR) { + /* Not synchronized. Do a one-shot wait on the descriptor and inform the caller we need to keep + * running. */ + r = sd_event_add_io(event, &sp->timerfd_event_source, sp->timerfd_fd, + EPOLLIN, timerfd_handler, sp); + if (r < 0) { + log_error_errno(r, "Failed to create time change monitor source: %m"); + goto finish; + } + r = 1; + } else + /* Synchronized; we can exit.
*/ + r = 0; + + finish: /* r > 0 means keep running; anything else stops the event loop with r as its exit code */ + if (r <= 0) + (void) sd_event_exit(event, r); + return r; +} + +static int run(int argc, char * argv[]) { /* Sets up the event loop (SIGTERM/SIGINT sources, watchdog, inotify watch on /run/systemd/), then enters the loop only if the first clock_state_update() reports that we still have to wait. */ + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(clock_state_release) ClockState state = { + .timerfd_fd = -EBADF, + .inotify_fd = -EBADF, + .run_systemd_wd = -1, + .run_systemd_timesync_wd = -1, + }; + int r; + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to create sigterm event source: %m"); + + r = sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to create sigint event source: %m"); + + r = sd_event_set_watchdog(event, true); + if (r < 0) + return log_error_errno(r, "Failed to create watchdog event source: %m"); + + r = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (r < 0) + return log_error_errno(errno, "Failed to create inotify descriptor: %m"); + + state.inotify_fd = r; + + r = sd_event_add_io(event, &state.inotify_event_source, state.inotify_fd, + EPOLLIN, inotify_handler, &state); + if (r < 0) + return log_error_errno(r, "Failed to create notify event source: %m"); + + r = inotify_add_watch_and_warn(state.inotify_fd, "/run/systemd/", IN_CREATE); + if (r < 0) + return r; + + state.run_systemd_wd = r; + + (void) update_notify_run_systemd_timesync(&state); /* best effort: the directory may not exist yet, process_inotify_event() retries when something is created in /run/systemd/ */ + + r = clock_state_update(&state, event); + if (r > 0) { + r = sd_event_loop(event); + if (r < 0) + log_error_errno(r, "Failed in event loop: %m"); + } + + if (state.has_watchfile) + log_debug("Exit enabled by: /run/systemd/timesync/synchronized"); + + if (state.adjtime_state == TIME_ERROR) + log_info("Exit without adjtimex synchronized."); + + return r; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/tmpfiles/meson.build b/src/tmpfiles/meson.build new file
mode 100644 index 0000000..8a24a21 --- /dev/null +++ b/src/tmpfiles/meson.build @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_tmpfiles_sources = files( + 'tmpfiles.c', + 'offline-passwd.c', +) + +executables += [ + executable_template + { + 'name' : 'systemd-tmpfiles', + 'public' : true, + 'conditions' : ['ENABLE_TMPFILES'], + 'sources' : systemd_tmpfiles_sources, + 'dependencies' : libacl, + }, + executable_template + { + 'name' : 'systemd-tmpfiles.standalone', + 'public' : have_standalone_binaries, + 'conditions' : ['ENABLE_TMPFILES'], + 'sources' : systemd_tmpfiles_sources, + 'c_args' : '-DSTANDALONE', + 'link_with' : [ + libbasic, + libbasic_gcrypt, + libshared_static, + libsystemd_static, + ], + 'dependencies' : libacl, + 'build_by_default' : have_standalone_binaries, + 'install' : have_standalone_binaries, + }, + test_template + { + 'sources' : files( + 'test-offline-passwd.c', + 'offline-passwd.c', + ), + }, +] diff --git a/src/tmpfiles/offline-passwd.c b/src/tmpfiles/offline-passwd.c new file mode 100644 index 0000000..7b5592b --- /dev/null +++ b/src/tmpfiles/offline-passwd.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "chase.h" +#include "fd-util.h" +#include "offline-passwd.h" +#include "path-util.h" +#include "user-util.h" + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(uid_gid_hash_ops, char, string_hash_func, string_compare_func, free); + +static int open_passwd_file(const char *root, const char *fname, FILE **ret_file) { /* Opens fname below root via chase_and_open() with CHASE_PREFIX_ROOT and hands it back as a read-only stdio stream in *ret_file. Returns 0 on success, a negative errno-style value otherwise. */ + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + _cleanup_fclose_ FILE *f = NULL; + + fd = chase_and_open(fname, root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC, &p); + if (fd < 0) + return fd; + + f = fdopen(fd, "r"); + if (!f) + return -errno; + + TAKE_FD(fd); /* the descriptor belongs to f from here on */ + + if (DEBUG_LOGGING) { + _cleanup_free_ char *bn = NULL; + + (void) path_extract_filename(fname, &bn); + log_debug("Reading %s entries from %s...", strna(bn), p); + } + + *ret_file = TAKE_PTR(f);
+ return 0; +} + +static int populate_uid_cache(const char *root, Hashmap **ret) { /* Builds a user name to UID map from /etc/passwd and /usr/lib/passwd below root and returns it in *ret. Missing files are skipped; a name that is already in the map keeps its existing entry. */ + _cleanup_hashmap_free_ Hashmap *cache = NULL; + int r; + + cache = hashmap_new(&uid_gid_hash_ops); + if (!cache) + return -ENOMEM; + + /* The directory list is hardcoded here: /etc is the standard, and rpm-ostree uses /usr/lib. This + * could be made configurable, but I don't see the point right now. */ + + FOREACH_STRING(fname, "/etc/passwd", "/usr/lib/passwd") { + _cleanup_fclose_ FILE *f = NULL; + + r = open_passwd_file(root, fname, &f); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + + struct passwd *pw; + while ((r = fgetpwent_sane(f, &pw)) > 0) { /* NOTE(review): a negative result ends this loop and is neither logged nor returned - confirm that dropping it is intended */ + _cleanup_free_ char *n = NULL; + + n = strdup(pw->pw_name); + if (!n) + return -ENOMEM; + + r = hashmap_put(cache, n, UID_TO_PTR(pw->pw_uid)); + if (IN_SET(r, 0, -EEXIST)) /* key not inserted: keep the existing entry, n is released by _cleanup_free_ */ + continue; + if (r < 0) + return r; + TAKE_PTR(n); /* the map owns the key now; uid_gid_hash_ops frees keys with free() */ + } + } + + *ret = TAKE_PTR(cache); + return 0; +} + +static int populate_gid_cache(const char *root, Hashmap **ret) { /* Group counterpart of populate_uid_cache(): builds a group name to GID map from /etc/group and /usr/lib/group below root. */ + _cleanup_hashmap_free_ Hashmap *cache = NULL; + int r; + + cache = hashmap_new(&uid_gid_hash_ops); + if (!cache) + return -ENOMEM; + + FOREACH_STRING(fname, "/etc/group", "/usr/lib/group") { + _cleanup_fclose_ FILE *f = NULL; + + r = open_passwd_file(root, fname, &f); + if (r == -ENOENT) + continue; + if (r < 0) + return r; + + struct group *gr; + while ((r = fgetgrent_sane(f, &gr)) > 0) { /* NOTE(review): a negative result ends this loop and is neither logged nor returned - confirm that dropping it is intended */ + _cleanup_free_ char *n = NULL; + + n = strdup(gr->gr_name); + if (!n) + return -ENOMEM; + + r = hashmap_put(cache, n, GID_TO_PTR(gr->gr_gid)); + if (IN_SET(r, 0, -EEXIST)) /* key not inserted: keep the existing entry, n is released by _cleanup_free_ */ + continue; + if (r < 0) + return r; + TAKE_PTR(n); /* the map owns the key now; uid_gid_hash_ops frees keys with free() */ + } + } + + *ret = TAKE_PTR(cache); + return 0; +} + +int name_to_uid_offline( + const char *root, + const char *user, + uid_t *ret_uid, + Hashmap **cache) { /* Resolves user through *cache, which is populated from the passwd files below root on first use. Returns 0 and sets *ret_uid, -ESRCH if the name is not in the cache, or the error from populating the cache. */ + + void *found; + int r; + + assert(user); + assert(ret_uid); + assert(cache); + + if (!*cache) { + r = populate_uid_cache(root, cache); + if (r < 0) + return r; + } + + found = hashmap_get(*cache, user); + if (!found)
+ return -ESRCH; + + *ret_uid = PTR_TO_UID(found); + return 0; +} + +int name_to_gid_offline( + const char *root, + const char *group, + gid_t *ret_gid, + Hashmap **cache) { /* Resolves group through *cache, which is populated from the group files below root on first use. Returns 0 and sets *ret_gid, -ESRCH if the name is not in the cache, or the error from populating the cache. */ + + void *found; + int r; + + assert(group); + assert(ret_gid); + assert(cache); + + if (!*cache) { + r = populate_gid_cache(root, cache); + if (r < 0) + return r; + } + + found = hashmap_get(*cache, group); + if (!found) + return -ESRCH; + + *ret_gid = PTR_TO_GID(found); + return 0; +} diff --git a/src/tmpfiles/offline-passwd.h b/src/tmpfiles/offline-passwd.h new file mode 100644 index 0000000..587af7b --- /dev/null +++ b/src/tmpfiles/offline-passwd.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "hashmap.h" + +int name_to_uid_offline(const char *root, const char *user, uid_t *ret_uid, Hashmap **cache); +int name_to_gid_offline(const char *root, const char *group, gid_t *ret_gid, Hashmap **cache); diff --git a/src/tmpfiles/test-offline-passwd.c b/src/tmpfiles/test-offline-passwd.c new file mode 100644 index 0000000..ef49582 --- /dev/null +++ b/src/tmpfiles/test-offline-passwd.c @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "offline-passwd.h" +#include "user-util.h" +#include "format-util.h" +#include "tests.h" + +static char *arg_root = NULL; + +static void test_resolve_one(const char *name) { /* Resolves name (root when NULL) to a UID and a GID, twice each, so that both the cache-populating call and the cached call run. Failures are only fatal when neither a name nor --root was given. */ + bool relaxed = name || arg_root; + + if (!name) + name = "root"; + + log_info("/* %s(\"%s\") */", __func__, name); + + _cleanup_hashmap_free_ Hashmap *uid_cache = NULL, *gid_cache = NULL; + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + int r; + + r = name_to_uid_offline(arg_root, name, &uid, &uid_cache); + log_info_errno(r, "name_to_uid_offline: %s → "UID_FMT": %m", name, uid); + assert_se(relaxed || r == 0); + + r = name_to_uid_offline(arg_root, name, &uid, &uid_cache); + log_info_errno(r, "name_to_uid_offline: %s → "UID_FMT": %m", name, uid); + assert_se(relaxed || r == 0); + + r =
name_to_gid_offline(arg_root, name, &gid, &gid_cache); + log_info_errno(r, "name_to_gid_offline: %s → "GID_FMT": %m", name, gid); + assert_se(relaxed || r == 0); + + r = name_to_gid_offline(arg_root, name, &gid, &gid_cache); + log_info_errno(r, "name_to_gid_offline: %s → "GID_FMT": %m", name, gid); + assert_se(relaxed || r == 0); +} + +static int parse_argv(int argc, char *argv[]) { /* Parses the command line; the only option is --root=/-r, stored in arg_root. Returns 0, or -EINVAL on an unknown option. */ + static const struct option options[] = { + { "root", required_argument, NULL, 'r' }, + {} + }; + + int c; + + assert_se(argc >= 0); + assert_se(argv); + + while ((c = getopt_long(argc, argv, "r:", options, NULL)) >= 0) + switch (c) { + case 'r': + arg_root = optarg; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 0; +} + +int main(int argc, char **argv) { /* Runs test_resolve_one() for every positional argument, or once with the default name when there is none. */ + int r; + + test_setup_logging(LOG_DEBUG); + + r = parse_argv(argc, argv); + if (r < 0) + return r; + + if (optind >= argc) + test_resolve_one(NULL); + else + while (optind < argc) + test_resolve_one(argv[optind++]); + + return 0; +} diff --git a/src/tmpfiles/tmpfiles.c b/src/tmpfiles/tmpfiles.c new file mode 100644 index 0000000..bc83aab --- /dev/null +++ b/src/tmpfiles/tmpfiles.c @@ -0,0 +1,4576 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-path.h" + +#include "acl-util.h" +#include "alloc-util.h" +#include "btrfs-util.h" +#include "build.h" +#include "capability-util.h" +#include "chase.h" +#include "chattr-util.h" +#include "conf-files.h" +#include "constants.h" +#include "copy.h" +#include "creds-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "dissect-image.h" +#include "env-util.h" +#include "errno-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "hexdecoct.h" +#include "io-util.h" +#include
"label-util.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "missing_stat.h" +#include "missing_syscall.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "nulstr-util.h" +#include "offline-passwd.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "pretty-print.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "selinux-util.h" +#include "set.h" +#include "sort-util.h" +#include "specifier.h" +#include "stdio-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "umask-util.h" +#include "user-util.h" +#include "virt.h" + +/* This reads all files listed in /etc/tmpfiles.d/?*.conf and creates + * them in the file system. This is intended to be used to create + * properly owned directories beneath /tmp, /var/tmp, /run, which are + * volatile and hence need to be recreated on bootup. 
*/ + +typedef enum OperationMask { /* Bit flags, one per operation; used for arg_operation and Item.done. */ + OPERATION_CREATE = 1 << 0, + OPERATION_REMOVE = 1 << 1, + OPERATION_CLEAN = 1 << 2, +} OperationMask; + +typedef enum ItemType { /* Item types; each value is an ASCII character. */ + /* These ones take file names */ + CREATE_FILE = 'f', + TRUNCATE_FILE = 'F', /* deprecated: use f+ */ + CREATE_DIRECTORY = 'd', + TRUNCATE_DIRECTORY = 'D', + CREATE_SUBVOLUME = 'v', + CREATE_SUBVOLUME_INHERIT_QUOTA = 'q', + CREATE_SUBVOLUME_NEW_QUOTA = 'Q', + CREATE_FIFO = 'p', + CREATE_SYMLINK = 'L', + CREATE_CHAR_DEVICE = 'c', + CREATE_BLOCK_DEVICE = 'b', + COPY_FILES = 'C', + + /* These ones take globs */ + WRITE_FILE = 'w', + EMPTY_DIRECTORY = 'e', + SET_XATTR = 't', + RECURSIVE_SET_XATTR = 'T', + SET_ACL = 'a', + RECURSIVE_SET_ACL = 'A', + SET_ATTRIBUTE = 'h', + RECURSIVE_SET_ATTRIBUTE = 'H', + IGNORE_PATH = 'x', + IGNORE_DIRECTORY_PATH = 'X', + REMOVE_PATH = 'r', + RECURSIVE_REMOVE_PATH = 'R', + RELABEL_PATH = 'z', + RECURSIVE_RELABEL_PATH = 'Z', + ADJUST_MODE = 'm', /* legacy, 'z' is identical to this */ +} ItemType; + +typedef enum AgeBy { /* Bit flags selecting which timestamps needs_cleanup() compares against the cutoff. */ + AGE_BY_ATIME = 1 << 0, + AGE_BY_BTIME = 1 << 1, + AGE_BY_CTIME = 1 << 2, + AGE_BY_MTIME = 1 << 3, + + /* All file timestamp types are checked by default.
*/ + AGE_BY_DEFAULT_FILE = AGE_BY_ATIME | AGE_BY_BTIME | AGE_BY_CTIME | AGE_BY_MTIME, + AGE_BY_DEFAULT_DIR = AGE_BY_ATIME | AGE_BY_BTIME | AGE_BY_MTIME, /* no ctime for directories: see the note in needs_cleanup() */ +} AgeBy; + +typedef struct Item { /* One item: its type, path and argument, plus the ownership, mode, age and attribute settings that go with it. */ + ItemType type; + + char *path; + char *argument; + void *binary_argument; /* set if binary data, in which case it takes precedence over 'argument' */ + size_t binary_argument_size; + char **xattrs; +#if HAVE_ACL + acl_t acl_access; + acl_t acl_access_exec; + acl_t acl_default; +#endif + uid_t uid; + gid_t gid; + mode_t mode; + usec_t age; + AgeBy age_by_file, age_by_dir; + + dev_t major_minor; + unsigned attribute_value; + unsigned attribute_mask; + + bool uid_set:1; + bool gid_set:1; + bool mode_set:1; + bool uid_only_create:1; + bool gid_only_create:1; + bool mode_only_create:1; + bool age_set:1; + bool mask_perms:1; + bool attribute_set:1; + + bool keep_first_level:1; + + bool append_or_force:1; + + bool allow_failure:1; + + bool try_replace:1; + + OperationMask done; +} Item; + +typedef struct ItemArray { /* An array of n_items Items, linked to a parent ItemArray and a set of children. */ + Item *items; + size_t n_items; + + struct ItemArray *parent; + Set *children; +} ItemArray; + +typedef enum DirectoryType { /* Index into the paths_system[] / paths_user[] tables of specifier_directory(). */ + DIRECTORY_RUNTIME, + DIRECTORY_STATE, + DIRECTORY_CACHE, + DIRECTORY_LOGS, + _DIRECTORY_TYPE_MAX, +} DirectoryType; + +typedef enum { /* Index into creation_mode_verb_table[]. */ + CREATION_NORMAL, + CREATION_EXISTING, + CREATION_FORCE, + _CREATION_MODE_MAX, + _CREATION_MODE_INVALID = -EINVAL, +} CreationMode; + +static CatFlags arg_cat_flags = CAT_CONFIG_OFF; +static RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM; +static OperationMask arg_operation = 0; +static bool arg_boot = false; +static bool arg_graceful = false; +static PagerFlags arg_pager_flags = 0; + +static char **arg_include_prefixes = NULL; +static char **arg_exclude_prefixes = NULL; +static char *arg_root = NULL; +static char *arg_image = NULL; +static char *arg_replace = NULL; +static ImagePolicy *arg_image_policy = NULL; + +#define MAX_DEPTH 256 + +typedef struct Context { /* items and globs are ordered hashmaps of ItemArray (see find_glob()); unix_sockets is the cache filled by load_unix_sockets(). */ + OrderedHashmap *items, *globs; + Set
*unix_sockets; +} Context; + +STATIC_DESTRUCTOR_REGISTER(arg_include_prefixes, freep); /* NOTE(review): arg_include_prefixes and arg_exclude_prefixes are char **; freep presumably releases only the array itself - confirm whether strv_freep is needed */ +STATIC_DESTRUCTOR_REGISTER(arg_exclude_prefixes, freep); +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); + +static const char *const creation_mode_verb_table[_CREATION_MODE_MAX] = { + [CREATION_NORMAL] = "Created", + [CREATION_EXISTING] = "Found existing", + [CREATION_FORCE] = "Created replacement", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(creation_mode_verb, CreationMode); + +static void context_done(Context *c) { /* Frees the two hashmaps and the socket set held by the context. */ + assert(c); + + ordered_hashmap_free(c->items); + ordered_hashmap_free(c->globs); + + set_free(c->unix_sockets); +} + +/* Different kinds of errors that mean that information is not available in the environment. */ +static bool ERRNO_IS_NOINFO(int r) { /* accepts the error with either sign, since it is passed through abs() */ + return IN_SET(abs(r), + EUNATCH, /* os-release or machine-id missing */ + ENOMEDIUM, /* machine-id or another file empty */ + ENOPKG, /* machine-id is uninitialized */ + ENXIO); /* env var is unset */ +} + +static int specifier_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* Specifier callback: data carries a DirectoryType as a pointer-encoded integer. Looks the matching path up with sd_path_lookup(), in the user or the system table depending on arg_runtime_scope, and prefixes arg_root when it is set. */ + struct table_entry { + uint64_t type; + const char *suffix; + }; + + static const struct table_entry paths_system[] = { + [DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME }, + [DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE }, + [DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE }, + [DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS }, + }; + + static const struct table_entry paths_user[] = { + [DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME }, + [DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE }, + [DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE }, + [DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" }, + }; + + const struct table_entry *paths; + _cleanup_free_ char *p = NULL; + unsigned i; + int r; + + assert_cc(ELEMENTSOF(paths_system) == ELEMENTSOF(paths_user)); + paths =
arg_runtime_scope == RUNTIME_SCOPE_USER ? paths_user : paths_system; + + i = PTR_TO_UINT(data); + assert(i < ELEMENTSOF(paths_system)); + + r = sd_path_lookup(paths[i].type, paths[i].suffix, &p); + if (r < 0) + return r; + + if (arg_root) { + _cleanup_free_ char *j = NULL; + + j = path_join(arg_root, p); + if (!j) + return -ENOMEM; + + *ret = TAKE_PTR(j); + } else + *ret = TAKE_PTR(p); + + return 0; +} + +static int log_unresolvable_specifier(const char *filename, unsigned line) { /* Logs that the rule at filename:line is skipped because a specifier could not be resolved; always returns 0. */ + static bool notified = false; + + /* In system mode, this is called when /etc is not fully initialized and some specifiers are + * unresolvable. In user mode, this is called when some variables are not defined. These cases are + * not considered a fatal error, so log at LOG_NOTICE only for the first time and then downgrade this + * to LOG_DEBUG for the rest. + * + * If we're running in a chroot (--root= was used or running_in_chroot() detects one), + * always use LOG_DEBUG. We may be called to initialize a chroot before booting and there is no + * expectation that machine-id and other files will be populated. + */ + + int log_level = notified || arg_root || running_in_chroot() > 0 ? + LOG_DEBUG : LOG_NOTICE; + + log_syntax(NULL, + log_level, + filename, line, 0, + "Failed to resolve specifier: %s, skipping.", + arg_runtime_scope == RUNTIME_SCOPE_USER ? "Required $XDG_...
variable not defined" : "uninitialized /etc/ detected"); + + if (!notified) + log_full(log_level, + "All rules containing unresolvable specifiers will be skipped."); + + notified = true; + return 0; +} + +static int user_config_paths(char*** ret) { /* Builds the list of user-tmpfiles.d directories in this order: XDG config dirs, user config dir, user runtime dir, user data dir, XDG data dirs. Lookup errors matching ERRNO_IS_NOINFO() are tolerated. The result is made absolute and returned in *ret. */ + _cleanup_strv_free_ char **config_dirs = NULL, **data_dirs = NULL; + _cleanup_free_ char *persistent_config = NULL, *runtime_config = NULL, *data_home = NULL; + _cleanup_strv_free_ char **res = NULL; + int r; + + r = xdg_user_dirs(&config_dirs, &data_dirs); + if (r < 0) + return r; + + r = xdg_user_config_dir(&persistent_config, "/user-tmpfiles.d"); + if (r < 0 && !ERRNO_IS_NOINFO(r)) + return r; + + r = xdg_user_runtime_dir(&runtime_config, "/user-tmpfiles.d"); + if (r < 0 && !ERRNO_IS_NOINFO(r)) + return r; + + r = xdg_user_data_dir(&data_home, "/user-tmpfiles.d"); + if (r < 0 && !ERRNO_IS_NOINFO(r)) + return r; + + r = strv_extend_strv_concat(&res, config_dirs, "/user-tmpfiles.d"); + if (r < 0) + return r; + + r = strv_extend(&res, persistent_config); + if (r < 0) + return r; + + r = strv_extend(&res, runtime_config); + if (r < 0) + return r; + + r = strv_extend(&res, data_home); + if (r < 0) + return r; + + r = strv_extend_strv_concat(&res, data_dirs, "/user-tmpfiles.d"); + if (r < 0) + return r; + + r = path_strv_make_absolute_cwd(res); + if (r < 0) + return r; + + *ret = TAKE_PTR(res); + return 0; +} + +static bool needs_glob(ItemType t) { /* True for exactly the item types listed under the glob group of ItemType. */ + return IN_SET(t, + WRITE_FILE, + IGNORE_PATH, + IGNORE_DIRECTORY_PATH, + REMOVE_PATH, + RECURSIVE_REMOVE_PATH, + EMPTY_DIRECTORY, + ADJUST_MODE, + RELABEL_PATH, + RECURSIVE_RELABEL_PATH, + SET_XATTR, + RECURSIVE_SET_XATTR, + SET_ACL, + RECURSIVE_SET_ACL, + SET_ATTRIBUTE, + RECURSIVE_SET_ATTRIBUTE); +} + +static bool takes_ownership(ItemType t) { /* True for the item types listed below. NOTE(review): presumably these are the types for which the user/group fields are meaningful - confirm against the callers. */ + return IN_SET(t, + CREATE_FILE, + TRUNCATE_FILE, + CREATE_DIRECTORY, + EMPTY_DIRECTORY, + TRUNCATE_DIRECTORY, + CREATE_SUBVOLUME, + CREATE_SUBVOLUME_INHERIT_QUOTA, + CREATE_SUBVOLUME_NEW_QUOTA, + CREATE_FIFO, + CREATE_SYMLINK, +
CREATE_CHAR_DEVICE, + CREATE_BLOCK_DEVICE, + COPY_FILES, + WRITE_FILE, + IGNORE_PATH, + IGNORE_DIRECTORY_PATH, + REMOVE_PATH, + RECURSIVE_REMOVE_PATH); +} + +static struct Item* find_glob(OrderedHashmap *h, const char *match) { /* Returns the first item, over all ItemArrays in h, whose path used as an fnmatch() pattern matches the string match; NULL if there is none. */ + ItemArray *j; + + ORDERED_HASHMAP_FOREACH(j, h) { + size_t n; + + for (n = 0; n < j->n_items; n++) { + Item *item = j->items + n; + + if (fnmatch(item->path, match, FNM_PATHNAME|FNM_PERIOD) == 0) + return item; + } + } + + return NULL; +} + +static int load_unix_sockets(Context *c) { /* Fills c->unix_sockets with the absolute socket paths parsed from /proc/net/unix. Returns 0 if the set was already loaded, 1 if it was loaded now, negative on failure. */ + _cleanup_set_free_ Set *sockets = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + if (c->unix_sockets) + return 0; + + /* We maintain a cache of the sockets we found in /proc/net/unix to speed things up a little. */ + + f = fopen("/proc/net/unix", "re"); + if (!f) + return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to open /proc/net/unix, ignoring: %m"); + + /* Skip header */ + r = read_line(f, LONG_LINE_MAX, NULL); + if (r < 0) + return log_warning_errno(r, "Failed to skip /proc/net/unix header line: %m"); + if (r == 0) + return log_warning_errno(SYNTHETIC_ERRNO(EIO), "Premature end of file reading /proc/net/unix."); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/net/unix line, ignoring: %m"); + if (r == 0) /* EOF */ + break; + + p = strchr(line, ':'); + if (!p) + continue; + + if (strlen(p) < 37) /* NOTE(review): 37 is presumably the fixed width of the columns that follow the colon in /proc/net/unix - confirm */ + continue; + + p += 37; + p += strspn(p, WHITESPACE); + p += strcspn(p, WHITESPACE); /* skip one more word */ + p += strspn(p, WHITESPACE); + + if (!path_is_absolute(p)) + continue; + + r = set_put_strdup_full(&sockets, &path_hash_ops_free, p); + if (r < 0) + return log_warning_errno(r, "Failed to add AF_UNIX socket to set, ignoring: %m"); + } + + c->unix_sockets = TAKE_PTR(sockets); + return 1; +} + +static bool unix_socket_alive(Context *c, const char *fn) { /* True if fn is in the cached socket set, and also whenever that set cannot be loaded. */ + assert(c); + assert(fn); + + if
(load_unix_sockets(c) < 0) + return true; /* We don't know, so assume yes */ + + return set_contains(c->unix_sockets, fn); +} + +/* Accessors for the argument in binary format */ +static const void* item_binary_argument(const Item *i) { /* binary_argument when set, the string argument otherwise */ + assert(i); + return i->binary_argument ?: i->argument; +} + +static size_t item_binary_argument_size(const Item *i) { /* size matching item_binary_argument(): the binary size, or the string length of argument */ + assert(i); + return i->binary_argument ? i->binary_argument_size : strlen_ptr(i->argument); +} + +static DIR* xopendirat_nomod(int dirfd, const char *path) { /* Opens a directory with O_NOFOLLOW|O_NOATIME; if that fails with EPERM, retries without O_NOATIME. Returns NULL on failure. */ + DIR *dir; + + dir = xopendirat(dirfd, path, O_NOFOLLOW|O_NOATIME); + if (dir) + return dir; + + if (!IN_SET(errno, ENOENT, ELOOP)) + log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path); + + if (errno != EPERM) + return NULL; + + dir = xopendirat(dirfd, path, O_NOFOLLOW); + if (!dir) + log_debug_errno(errno, "Cannot open %sdirectory \"%s\": %m", dirfd == AT_FDCWD ? "" : "sub", path); + + return dir; +} + +static DIR* opendir_nomod(const char *path) { /* xopendirat_nomod() relative to the current working directory */ + return xopendirat_nomod(AT_FDCWD, path); +} + +static nsec_t load_statx_timestamp_nsec(const struct statx_timestamp *ts) { /* Converts a statx timestamp to nanoseconds; yields NSEC_INFINITY for negative seconds or when the result would not fit. */ + assert(ts); + + if (ts->tv_sec < 0) + return NSEC_INFINITY; + + if ((nsec_t) ts->tv_sec >= (UINT64_MAX - ts->tv_nsec) / NSEC_PER_SEC) + return NSEC_INFINITY; + + return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec; +} + +static bool needs_cleanup( + nsec_t atime, + nsec_t btime, + nsec_t ctime, + nsec_t mtime, + nsec_t cutoff, + const char *sub_path, + AgeBy age_by, + bool is_dir) { /* Returns false as soon as one timestamp selected by age_by is not NSEC_INFINITY and is at or after cutoff; returns true otherwise. sub_path and is_dir are only used for the debug messages. */ + + if (FLAGS_SET(age_by, AGE_BY_MTIME) && mtime != NSEC_INFINITY && mtime >= cutoff) { + /* Follows spelling in stat(1). */ + log_debug("%s \"%s\": modify time %s is too new.", + is_dir ? "Directory" : "File", + sub_path, + FORMAT_TIMESTAMP_STYLE(mtime / NSEC_PER_USEC, TIMESTAMP_US)); + + return false; + } + + if (FLAGS_SET(age_by, AGE_BY_ATIME) && atime != NSEC_INFINITY && atime >= cutoff) { + log_debug("%s \"%s\": access time %s is too new.", + is_dir ?
"Directory" : "File", + sub_path, + FORMAT_TIMESTAMP_STYLE(atime / NSEC_PER_USEC, TIMESTAMP_US)); + + return false; + } + + /* + * Note: Unless explicitly specified by the user, "ctime" is ignored + * by default for directories, because we change it when deleting. + */ + if (FLAGS_SET(age_by, AGE_BY_CTIME) && ctime != NSEC_INFINITY && ctime >= cutoff) { + log_debug("%s \"%s\": change time %s is too new.", + is_dir ? "Directory" : "File", + sub_path, + FORMAT_TIMESTAMP_STYLE(ctime / NSEC_PER_USEC, TIMESTAMP_US)); + + return false; + } + + if (FLAGS_SET(age_by, AGE_BY_BTIME) && btime != NSEC_INFINITY && btime >= cutoff) { + log_debug("%s \"%s\": birth time %s is too new.", + is_dir ? "Directory" : "File", + sub_path, + FORMAT_TIMESTAMP_STYLE(btime / NSEC_PER_USEC, TIMESTAMP_US)); + + return false; + } + + return true; +} + +static int dir_cleanup( + Context *c, + Item *i, + const char *p, + DIR *d, + nsec_t self_atime_nsec, + nsec_t self_mtime_nsec, + nsec_t cutoff_nsec, + dev_t rootdev_major, + dev_t rootdev_minor, + bool mountpoint, + int maxdepth, + bool keep_this_level, + AgeBy age_by_file, + AgeBy age_by_dir) { + + bool deleted = false; + int r = 0; + + assert(c); + assert(i); + assert(d); + + FOREACH_DIRENT_ALL(de, d, break) { + _cleanup_free_ char *sub_path = NULL; + nsec_t atime_nsec, mtime_nsec, ctime_nsec, btime_nsec; + + if (dot_or_dot_dot(de->d_name)) + continue; + + /* If statx() is supported, use it. It's preferable over fstatat() since it tells us + * explicitly whether we are looking at a mount point, for free as side information. Determining + * the same information without statx() is hard, see the complexity of path_is_mount_point(), + * and also much slower as it requires a number of syscalls instead of just one. Hence, when + * we have modern statx() we use it instead of fstat() and do proper mount point checks, + * while on older kernels we'll do traditional st_dev based detection of mount points.
+ * + * Using statx() for detecting mount points also has the benefit that we handle weird file + * systems such as overlayfs better where each file is originating from a different + * st_dev. */ + + STRUCT_STATX_DEFINE(sx); + + r = statx_fallback( + dirfd(d), de->d_name, + AT_SYMLINK_NOFOLLOW|AT_NO_AUTOMOUNT, + STATX_TYPE|STATX_MODE|STATX_UID|STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_BTIME, + &sx); + if (r == -ENOENT) + continue; + if (r < 0) { + /* FUSE, NFS mounts, SELinux might return EACCES */ + r = log_full_errno(r == -EACCES ? LOG_DEBUG : LOG_ERR, r, + "statx(%s/%s) failed: %m", p, de->d_name); + continue; + } + + if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) { + /* Yay, we have the mount point API, use it */ + if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) { + log_debug("Ignoring \"%s/%s\": different mount points.", p, de->d_name); + continue; + } + } else { + /* So we might have statx() but the STATX_ATTR_MOUNT_ROOT flag is not supported, fall + * back to traditional stx_dev checking. */ + if (sx.stx_dev_major != rootdev_major || + sx.stx_dev_minor != rootdev_minor) { + log_debug("Ignoring \"%s/%s\": different filesystem.", p, de->d_name); + continue; + } + + /* Try to detect bind mounts of the same filesystem instance; they do not differ in device + * major/minors. This type of query is not supported on all kernels or filesystem types + * though. */ + if (S_ISDIR(sx.stx_mode)) { + int q; + + q = fd_is_mount_point(dirfd(d), de->d_name, 0); + if (q < 0) + log_debug_errno(q, "Failed to determine whether \"%s/%s\" is a mount point, ignoring: %m", p, de->d_name); + else if (q > 0) { + log_debug("Ignoring \"%s/%s\": different mount of the same filesystem.", p, de->d_name); + continue; + } + } + } + + atime_nsec = FLAGS_SET(sx.stx_mask, STATX_ATIME) ? load_statx_timestamp_nsec(&sx.stx_atime) : 0; + mtime_nsec = FLAGS_SET(sx.stx_mask, STATX_MTIME) ? 
load_statx_timestamp_nsec(&sx.stx_mtime) : 0; + ctime_nsec = FLAGS_SET(sx.stx_mask, STATX_CTIME) ? load_statx_timestamp_nsec(&sx.stx_ctime) : 0; + btime_nsec = FLAGS_SET(sx.stx_mask, STATX_BTIME) ? load_statx_timestamp_nsec(&sx.stx_btime) : 0; + + sub_path = path_join(p, de->d_name); + if (!sub_path) { + r = log_oom(); + goto finish; + } + + /* Is there an item configured for this path? */ + if (ordered_hashmap_get(c->items, sub_path)) { + log_debug("Ignoring \"%s\": a separate entry exists.", sub_path); + continue; + } + + if (find_glob(c->globs, sub_path)) { + log_debug("Ignoring \"%s\": a separate glob exists.", sub_path); + continue; + } + + if (S_ISDIR(sx.stx_mode)) { + _cleanup_closedir_ DIR *sub_dir = NULL; + + if (mountpoint && + streq(de->d_name, "lost+found") && + sx.stx_uid == 0) { + log_debug("Ignoring directory \"%s\".", sub_path); + continue; + } + + if (maxdepth <= 0) + log_warning("Reached max depth on \"%s\".", sub_path); + else { + int q; + + sub_dir = xopendirat_nomod(dirfd(d), de->d_name); + if (!sub_dir) { + if (errno != ENOENT) + r = log_warning_errno(errno, "Opening directory \"%s\" failed, ignoring: %m", sub_path); + + continue; + } + + if (flock(dirfd(sub_dir), LOCK_EX|LOCK_NB) < 0) { + log_debug_errno(errno, "Couldn't acquire shared BSD lock on directory \"%s\", skipping: %m", sub_path); + continue; + } + + q = dir_cleanup(c, i, + sub_path, sub_dir, + atime_nsec, mtime_nsec, cutoff_nsec, + rootdev_major, rootdev_minor, + false, maxdepth-1, false, + age_by_file, age_by_dir); + if (q < 0) + r = q; + } + + /* Note: if you are wondering why we don't support the sticky bit for excluding + * directories from cleaning like we do it for other file system objects: well, the + * sticky bit already has a meaning for directories, so we don't want to overload + * that. 
*/ + + if (keep_this_level) { + log_debug("Keeping directory \"%s\".", sub_path); + continue; + } + + /* + * Check the file timestamps of an entry against the + * given cutoff time; delete if it is older. + */ + if (!needs_cleanup(atime_nsec, btime_nsec, ctime_nsec, mtime_nsec, + cutoff_nsec, sub_path, age_by_dir, true)) + continue; + + log_debug("Removing directory \"%s\".", sub_path); + if (unlinkat(dirfd(d), de->d_name, AT_REMOVEDIR) < 0) + if (!IN_SET(errno, ENOENT, ENOTEMPTY)) + r = log_warning_errno(errno, "Failed to remove directory \"%s\", ignoring: %m", sub_path); + + } else { + _cleanup_close_ int fd = -EBADF; + + /* Skip files for which the sticky bit is set. These are semantics we define, and are + * unknown elsewhere. See XDG_RUNTIME_DIR specification for details. */ + if (sx.stx_mode & S_ISVTX) { + log_debug("Skipping \"%s\": sticky bit set.", sub_path); + continue; + } + + if (mountpoint && + S_ISREG(sx.stx_mode) && + sx.stx_uid == 0 && + STR_IN_SET(de->d_name, + ".journal", + "aquota.user", + "aquota.group")) { + log_debug("Skipping \"%s\".", sub_path); + continue; + } + + /* Ignore sockets that are listed in /proc/net/unix */ + if (S_ISSOCK(sx.stx_mode) && unix_socket_alive(c, sub_path)) { + log_debug("Skipping \"%s\": live socket.", sub_path); + continue; + } + + /* Ignore device nodes */ + if (S_ISCHR(sx.stx_mode) || S_ISBLK(sx.stx_mode)) { + log_debug("Skipping \"%s\": a device.", sub_path); + continue; + } + + /* Keep files on this level around if this is requested */ + if (keep_this_level) { + log_debug("Keeping \"%s\".", sub_path); + continue; + } + + if (!needs_cleanup(atime_nsec, btime_nsec, ctime_nsec, mtime_nsec, + cutoff_nsec, sub_path, age_by_file, false)) + continue; + + fd = xopenat(dirfd(d), + de->d_name, + O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME|O_NONBLOCK, + /* xopen_flags = */ 0, + /* mode = */ 0); + if (fd < 0 && !IN_SET(fd, -ENOENT, -ELOOP)) + log_warning_errno(fd, "Opening file \"%s\" failed, ignoring: %m", sub_path); + if (fd 
>= 0 && flock(fd, LOCK_EX|LOCK_NB) < 0 && errno == EAGAIN) { + log_debug_errno(errno, "Couldn't acquire shared BSD lock on file \"%s\", skipping: %m", sub_path); + continue; + } + + log_debug("Removing \"%s\".", sub_path); + if (unlinkat(dirfd(d), de->d_name, 0) < 0) + if (errno != ENOENT) + r = log_warning_errno(errno, "Failed to remove \"%s\", ignoring: %m", sub_path); + + deleted = true; + } + } + +finish: + if (deleted) { + struct timespec ts[2]; + + log_debug("Restoring access and modification time on \"%s\": %s, %s", + p, + FORMAT_TIMESTAMP_STYLE(self_atime_nsec / NSEC_PER_USEC, TIMESTAMP_US), + FORMAT_TIMESTAMP_STYLE(self_mtime_nsec / NSEC_PER_USEC, TIMESTAMP_US)); + + timespec_store_nsec(ts + 0, self_atime_nsec); + timespec_store_nsec(ts + 1, self_mtime_nsec); + + /* Restore original directory timestamps */ + if (futimens(dirfd(d), ts) < 0) + log_warning_errno(errno, "Failed to revert timestamps of '%s', ignoring: %m", p); + } + + return r; +} + +static bool dangerous_hardlinks(void) { + _cleanup_free_ char *value = NULL; + static int cached = -1; + int r; + + /* Check whether the fs.protected_hardlinks sysctl is on. If we can't determine it we assume its off, as that's + * what the upstream default is. 
*/ + + if (cached >= 0) + return cached; + + r = read_one_line_file("/proc/sys/fs/protected_hardlinks", &value); + if (r < 0) { + log_debug_errno(r, "Failed to read fs.protected_hardlinks sysctl: %m"); + return true; + } + + r = parse_boolean(value); + if (r < 0) { + log_debug_errno(r, "Failed to parse fs.protected_hardlinks sysctl: %m"); + return true; + } + + cached = r == 0; + return cached; +} + +static bool hardlink_vulnerable(const struct stat *st) { + assert(st); + + return !S_ISDIR(st->st_mode) && st->st_nlink > 1 && dangerous_hardlinks(); +} + +static mode_t process_mask_perms(mode_t mode, mode_t current) { + + if ((current & 0111) == 0) + mode &= ~0111; + if ((current & 0222) == 0) + mode &= ~0222; + if ((current & 0444) == 0) + mode &= ~0444; + if (!S_ISDIR(current)) + mode &= ~07000; /* remove sticky/sgid/suid bit, unless directory */ + + return mode; +} + +static int fd_set_perms( + Context *c, + Item *i, + int fd, + const char *path, + const struct stat *st, + CreationMode creation) { + + bool do_chown, do_chmod; + struct stat stbuf; + mode_t new_mode; + uid_t new_uid; + gid_t new_gid; + int r; + + assert(c); + assert(i); + assert(fd >= 0); + assert(path); + + if (!i->mode_set && !i->uid_set && !i->gid_set) + goto shortcut; + + if (!st) { + if (fstat(fd, &stbuf) < 0) + return log_error_errno(errno, "fstat(%s) failed: %m", path); + st = &stbuf; + } + + if (hardlink_vulnerable(st)) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Refusing to set permissions on hardlinked file %s while the fs.protected_hardlinks sysctl is turned off.", + path); + new_uid = i->uid_set && (creation != CREATION_EXISTING || !i->uid_only_create) ? i->uid : st->st_uid; + new_gid = i->gid_set && (creation != CREATION_EXISTING || !i->gid_only_create) ? i->gid : st->st_gid; + + /* Do we need a chown()? 
*/ + do_chown = (new_uid != st->st_uid) || (new_gid != st->st_gid); + + /* Calculate the mode to apply */ + new_mode = i->mode_set && (creation != CREATION_EXISTING || !i->mode_only_create) ? + (i->mask_perms ? process_mask_perms(i->mode, st->st_mode) : i->mode) : + (st->st_mode & 07777); + + do_chmod = ((new_mode ^ st->st_mode) & 07777) != 0; + + if (do_chmod && do_chown) { + /* Before we issue the chmod() let's reduce the access mode to the common bits of the old and + * the new mode. That way there's no time window where the file exists under the old owner + * with more than the old access modes — and not under the new owner with more than the new + * access modes either. */ + + if (S_ISLNK(st->st_mode)) + log_debug("Skipping temporary mode fix for symlink %s.", path); + else { + mode_t m = new_mode & st->st_mode; /* Mask new mode by old mode */ + + if (((m ^ st->st_mode) & 07777) == 0) + log_debug("\"%s\" matches temporary mode %o already.", path, m); + else { + log_debug("Temporarily changing \"%s\" to mode %o.", path, m); + r = fchmod_opath(fd, m); + if (r < 0) + return log_error_errno(r, "fchmod() of %s failed: %m", path); + } + } + } + + if (do_chown) { + log_debug("Changing \"%s\" to owner "UID_FMT":"GID_FMT, path, new_uid, new_gid); + + if (fchownat(fd, "", + new_uid != st->st_uid ? new_uid : UID_INVALID, + new_gid != st->st_gid ? new_gid : GID_INVALID, + AT_EMPTY_PATH) < 0) + return log_error_errno(errno, "fchownat() of %s failed: %m", path); + } + + /* Now, apply the final mode. We do this in two cases: when the user set a mode explicitly, or after a + * chown(), since chown()'s mangle the access mode in regards to sgid/suid in some conditions. 
*/ + if (do_chmod || do_chown) { + if (S_ISLNK(st->st_mode)) + log_debug("Skipping mode fix for symlink %s.", path); + else { + log_debug("Changing \"%s\" to mode %o.", path, new_mode); + r = fchmod_opath(fd, new_mode); + if (r < 0) + return log_error_errno(r, "fchmod() of %s failed: %m", path); + } + } + +shortcut: + return label_fix_full(fd, /* inode_path= */ NULL, /* label_path= */ path, 0); +} + +static int path_open_parent_safe(const char *path, bool allow_failure) { + _cleanup_free_ char *dn = NULL; + int r, fd; + + if (!path_is_normalized(path)) + return log_full_errno(allow_failure ? LOG_INFO : LOG_ERR, + SYNTHETIC_ERRNO(EINVAL), + "Failed to open parent of '%s': path not normalized%s.", + path, + allow_failure ? ", ignoring" : ""); + + r = path_extract_directory(path, &dn); + if (r < 0) + return log_full_errno(allow_failure ? LOG_INFO : LOG_ERR, + r, + "Unable to determine parent directory of '%s'%s: %m", + path, + allow_failure ? ", ignoring" : ""); + + r = chase(dn, arg_root, allow_failure ? CHASE_SAFE : CHASE_SAFE|CHASE_WARN, NULL, &fd); + if (r == -ENOLINK) /* Unsafe symlink: already covered by CHASE_WARN */ + return r; + if (r < 0) + return log_full_errno(allow_failure ? LOG_INFO : LOG_ERR, + r, + "Failed to open path '%s'%s: %m", + dn, + allow_failure ? ", ignoring" : ""); + + return fd; +} + +static int path_open_safe(const char *path) { + int r, fd; + + /* path_open_safe() returns a file descriptor opened with O_PATH after + * verifying that the path doesn't contain unsafe transitions, except + * for its final component as the function does not follow symlink. 
*/ + + assert(path); + + if (!path_is_normalized(path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to open invalid path '%s'.", path); + + r = chase(path, arg_root, CHASE_SAFE|CHASE_WARN|CHASE_NOFOLLOW, NULL, &fd); + if (r == -ENOLINK) + return r; /* Unsafe symlink: already covered by CHASE_WARN */ + if (r < 0) + return log_error_errno(r, "Failed to open path %s: %m", path); + + return fd; +} + +static int path_set_perms( + Context *c, + Item *i, + const char *path, + CreationMode creation) { + + _cleanup_close_ int fd = -EBADF; + + assert(c); + assert(i); + assert(path); + + fd = path_open_safe(path); + if (fd < 0) + return fd; + + return fd_set_perms(c, i, fd, path, /* st= */ NULL, creation); +} + +static int parse_xattrs_from_arg(Item *i) { + const char *p; + int r; + + assert(i); + + assert_se(p = i->argument); + for (;;) { + _cleanup_free_ char *name = NULL, *value = NULL, *xattr = NULL; + + r = extract_first_word(&p, &xattr, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE); + if (r < 0) + log_warning_errno(r, "Failed to parse extended attribute '%s', ignoring: %m", p); + if (r <= 0) + break; + + r = split_pair(xattr, "=", &name, &value); + if (r < 0) { + log_warning_errno(r, "Failed to parse extended attribute, ignoring: %s", xattr); + continue; + } + + if (isempty(name) || isempty(value)) { + log_warning("Malformed extended attribute found, ignoring: %s", xattr); + continue; + } + + if (strv_push_pair(&i->xattrs, name, value) < 0) + return log_oom(); + + name = value = NULL; + } + + return 0; +} + +static int fd_set_xattrs( + Context *c, + Item *i, + int fd, + const char *path, + const struct stat *st, + CreationMode creation) { + + assert(c); + assert(i); + assert(fd >= 0); + assert(path); + + STRV_FOREACH_PAIR(name, value, i->xattrs) { + log_debug("Setting extended attribute '%s=%s' on %s.", *name, *value, path); + if (setxattr(FORMAT_PROC_FD_PATH(fd), *name, *value, strlen(*value), 0) < 0) + return log_error_errno(errno, "Setting extended attribute 
%s=%s on %s failed: %m", + *name, *value, path); + } + return 0; +} + +static int path_set_xattrs( + Context *c, + Item *i, + const char *path, + CreationMode creation) { + + _cleanup_close_ int fd = -EBADF; + + assert(c); + assert(i); + assert(path); + + fd = path_open_safe(path); + if (fd < 0) + return fd; + + return fd_set_xattrs(c, i, fd, path, /* st = */ NULL, creation); +} + +static int parse_acls_from_arg(Item *item) { +#if HAVE_ACL + int r; + + assert(item); + + /* If append_or_force (= modify) is set, we will not modify the acl + * afterwards, so the mask can be added now if necessary. */ + + r = parse_acl(item->argument, &item->acl_access, &item->acl_access_exec, + &item->acl_default, !item->append_or_force); + if (r < 0) + log_full_errno(arg_graceful && IN_SET(r, -EINVAL, -ENOENT, -ESRCH) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to parse ACL \"%s\", ignoring: %m", item->argument); +#else + log_warning("ACLs are not supported, ignoring."); +#endif + + return 0; +} + +#if HAVE_ACL +static int parse_acl_cond_exec( + const char *path, + acl_t access, /* could be empty (NULL) */ + acl_t cond_exec, + const struct stat *st, + bool append, + acl_t *ret) { + + _cleanup_(acl_freep) acl_t parsed = NULL; + acl_entry_t entry; + acl_permset_t permset; + bool has_exec; + int r; + + assert(path); + assert(ret); + assert(st); + + parsed = access ? 
acl_dup(access) : acl_init(0); + if (!parsed) + return -errno; + + /* Since we substitute 'X' with 'x' in parse_acl(), we just need to copy the entries over + * for directories */ + if (S_ISDIR(st->st_mode)) { + for (r = acl_get_entry(cond_exec, ACL_FIRST_ENTRY, &entry); + r > 0; + r = acl_get_entry(cond_exec, ACL_NEXT_ENTRY, &entry)) { + + acl_entry_t parsed_entry; + + if (acl_create_entry(&parsed, &parsed_entry) < 0) + return -errno; + + if (acl_copy_entry(parsed_entry, entry) < 0) + return -errno; + } + if (r < 0) + return -errno; + + goto finish; + } + + has_exec = st->st_mode & S_IXUSR; + + if (!has_exec && append) { + _cleanup_(acl_freep) acl_t old = NULL; + + old = acl_get_file(path, ACL_TYPE_ACCESS); + if (!old) + return -errno; + + for (r = acl_get_entry(old, ACL_FIRST_ENTRY, &entry); + r > 0; + r = acl_get_entry(old, ACL_NEXT_ENTRY, &entry)) { + + if (acl_get_permset(entry, &permset) < 0) + return -errno; + + r = acl_get_perm(permset, ACL_EXECUTE); + if (r < 0) + return -errno; + if (r > 0) { + has_exec = true; + break; + } + } + if (r < 0) + return -errno; + } + + /* Check if we're about to set the execute bit in acl_access */ + if (!has_exec && access) { + for (r = acl_get_entry(access, ACL_FIRST_ENTRY, &entry); + r > 0; + r = acl_get_entry(access, ACL_NEXT_ENTRY, &entry)) { + + if (acl_get_permset(entry, &permset) < 0) + return -errno; + + r = acl_get_perm(permset, ACL_EXECUTE); + if (r < 0) + return -errno; + if (r > 0) { + has_exec = true; + break; + } + } + if (r < 0) + return -errno; + } + + for (r = acl_get_entry(cond_exec, ACL_FIRST_ENTRY, &entry); + r > 0; + r = acl_get_entry(cond_exec, ACL_NEXT_ENTRY, &entry)) { + + acl_entry_t parsed_entry; + + if (acl_create_entry(&parsed, &parsed_entry) < 0) + return -errno; + + if (acl_copy_entry(parsed_entry, entry) < 0) + return -errno; + + if (!has_exec) { + if (acl_get_permset(parsed_entry, &permset) < 0) + return -errno; + + if (acl_delete_perm(permset, ACL_EXECUTE) < 0) + return -errno; + } + } + if 
(r < 0) + return -errno; + +finish: + if (!append) { /* want_mask = true */ + r = calc_acl_mask_if_needed(&parsed); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(parsed); + + return 0; +} + +static int path_set_acl( + Context *c, + const char *path, + const char *pretty, + acl_type_t type, + acl_t acl, + bool modify) { + + _cleanup_(acl_free_charpp) char *t = NULL; + _cleanup_(acl_freep) acl_t dup = NULL; + int r; + + assert(c); + + /* Returns 0 for success, positive error if already warned, negative error otherwise. */ + + if (modify) { + r = acls_for_file(path, type, acl, &dup); + if (r < 0) + return r; + + r = calc_acl_mask_if_needed(&dup); + if (r < 0) + return r; + } else { + dup = acl_dup(acl); + if (!dup) + return -errno; + + /* the mask was already added earlier if needed */ + } + + r = add_base_acls_if_needed(&dup, path); + if (r < 0) + return r; + + t = acl_to_any_text(dup, NULL, ',', TEXT_ABBREVIATE); + log_debug("Setting %s ACL %s on %s.", + type == ACL_TYPE_ACCESS ? "access" : "default", + strna(t), pretty); + + r = acl_set_file(path, type, dup); + if (r < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + /* No error if filesystem doesn't support ACLs. Return negative. */ + return -errno; + else + /* Return positive to indicate we already warned */ + return -log_error_errno(errno, + "Setting %s ACL \"%s\" on %s failed: %m", + type == ACL_TYPE_ACCESS ? 
"access" : "default", + strna(t), pretty); + } + return 0; +} +#endif + +static int fd_set_acls( + Context *c, + Item *item, + int fd, + const char *path, + const struct stat *st, + CreationMode creation) { + + int r = 0; +#if HAVE_ACL + _cleanup_(acl_freep) acl_t access_with_exec_parsed = NULL; + struct stat stbuf; + + assert(c); + assert(item); + assert(fd >= 0); + assert(path); + + if (!st) { + if (fstat(fd, &stbuf) < 0) + return log_error_errno(errno, "fstat(%s) failed: %m", path); + st = &stbuf; + } + + if (hardlink_vulnerable(st)) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "Refusing to set ACLs on hardlinked file %s while the fs.protected_hardlinks sysctl is turned off.", + path); + + if (S_ISLNK(st->st_mode)) { + log_debug("Skipping ACL fix for symlink %s.", path); + return 0; + } + + if (item->acl_access_exec) { + r = parse_acl_cond_exec(FORMAT_PROC_FD_PATH(fd), + item->acl_access, + item->acl_access_exec, + st, + item->append_or_force, + &access_with_exec_parsed); + if (r < 0) + return log_error_errno(r, "Failed to parse conditionalized execute bit for \"%s\": %m", path); + + r = path_set_acl(c, FORMAT_PROC_FD_PATH(fd), path, ACL_TYPE_ACCESS, access_with_exec_parsed, item->append_or_force); + } else if (item->acl_access) + r = path_set_acl(c, FORMAT_PROC_FD_PATH(fd), path, ACL_TYPE_ACCESS, item->acl_access, item->append_or_force); + + /* set only default acls to folders */ + if (r == 0 && item->acl_default && S_ISDIR(st->st_mode)) + r = path_set_acl(c, FORMAT_PROC_FD_PATH(fd), path, ACL_TYPE_DEFAULT, item->acl_default, item->append_or_force); + + if (ERRNO_IS_NOT_SUPPORTED(r)) { + log_debug_errno(r, "ACLs not supported by file system at %s", path); + return 0; + } + + if (r > 0) + return -r; /* already warned in path_set_acl */ + + /* The above procfs paths don't work if /proc is not mounted. 
*/ + if (r == -ENOENT && proc_mounted() == 0) + r = -ENOSYS; + + if (r < 0) + return log_error_errno(r, "ACL operation on \"%s\" failed: %m", path); +#endif + return r; +} + +static int path_set_acls( + Context *c, + Item *item, + const char *path, + CreationMode creation) { + + int r = 0; +#if HAVE_ACL + _cleanup_close_ int fd = -EBADF; + + assert(c); + assert(item); + assert(path); + + fd = path_open_safe(path); + if (fd < 0) + return fd; + + r = fd_set_acls(c, item, fd, path, /* st= */ NULL, creation); +#endif + return r; +} + +static int parse_attribute_from_arg(Item *item) { + + static const struct { + char character; + unsigned value; + } attributes[] = { + { 'A', FS_NOATIME_FL }, /* do not update atime */ + { 'S', FS_SYNC_FL }, /* Synchronous updates */ + { 'D', FS_DIRSYNC_FL }, /* dirsync behaviour (directories only) */ + { 'a', FS_APPEND_FL }, /* writes to file may only append */ + { 'c', FS_COMPR_FL }, /* Compress file */ + { 'd', FS_NODUMP_FL }, /* do not dump file */ + { 'e', FS_EXTENT_FL }, /* Extents */ + { 'i', FS_IMMUTABLE_FL }, /* Immutable file */ + { 'j', FS_JOURNAL_DATA_FL }, /* Reserved for ext3 */ + { 's', FS_SECRM_FL }, /* Secure deletion */ + { 'u', FS_UNRM_FL }, /* Undelete */ + { 't', FS_NOTAIL_FL }, /* file tail should not be merged */ + { 'T', FS_TOPDIR_FL }, /* Top of directory hierarchies */ + { 'C', FS_NOCOW_FL }, /* Do not cow file */ + { 'P', FS_PROJINHERIT_FL }, /* Inherit the quota project ID */ + }; + + enum { + MODE_ADD, + MODE_DEL, + MODE_SET + } mode = MODE_ADD; + + unsigned value = 0, mask = 0; + const char *p; + + assert(item); + + p = item->argument; + if (p) { + if (*p == '+') { + mode = MODE_ADD; + p++; + } else if (*p == '-') { + mode = MODE_DEL; + p++; + } else if (*p == '=') { + mode = MODE_SET; + p++; + } + } + + if (isempty(p) && mode != MODE_SET) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Setting file attribute on '%s' needs an attribute specification.", + item->path); + + for (; p && *p ; p++) { + 
unsigned i, v;
+
+                for (i = 0; i < ELEMENTSOF(attributes); i++)
+                        if (*p == attributes[i].character)
+                                break;
+
+                if (i >= ELEMENTSOF(attributes))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Unknown file attribute '%c' on '%s'.",
+                                               *p, item->path);
+
+                v = attributes[i].value;
+
+                SET_FLAG(value, v, IN_SET(mode, MODE_ADD, MODE_SET));
+
+                mask |= v;
+        }
+
+        if (mode == MODE_SET)
+                mask |= CHATTR_ALL_FL;
+
+        assert(mask != 0);
+
+        item->attribute_mask = mask;
+        item->attribute_value = value;
+        item->attribute_set = true;
+
+        return 0;
+}
+
+/* Applies the attribute flags stored in 'item' (item->attribute_value masked by
+ * item->attribute_mask) to the inode behind 'fd'. Returns 0 right away if the item carries no
+ * attributes. If 'st' is NULL the inode is fstat()ed here. Only regular files and directories are
+ * accepted (-EINVAL otherwise) and FS_DIRSYNC_FL is dropped for non-directories. The fd is re-opened
+ * read-only through fd_reopen() before chattr_full() is called. A chattr_full() failure is only
+ * logged; the function still returns 0 in that case. 'path' is used for log messages, 'creation' is
+ * not read here. */
+static int fd_set_attribute(
+                Context *c,
+                Item *item,
+                int fd,
+                const char *path,
+                const struct stat *st,
+                CreationMode creation) {
+
+        _cleanup_close_ int procfs_fd = -EBADF;
+        struct stat stbuf;
+        unsigned f;
+        int r;
+
+        assert(c);
+        assert(item);
+        assert(fd >= 0);
+        assert(path);
+
+        if (!item->attribute_set || item->attribute_mask == 0)
+                return 0;
+
+        if (!st) {
+                if (fstat(fd, &stbuf) < 0)
+                        return log_error_errno(errno, "fstat(%s) failed: %m", path);
+                st = &stbuf;
+        }
+
+        /* Issuing the file attribute ioctls on device nodes is not safe, as that will be delivered to the
+         * drivers, not the file system containing the device node. */
+        if (!S_ISREG(st->st_mode) && !S_ISDIR(st->st_mode))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Setting file flags is only supported on regular files and directories, cannot set on '%s'.",
+                                       path);
+
+        f = item->attribute_value & item->attribute_mask;
+
+        /* Mask away directory-specific flags */
+        if (!S_ISDIR(st->st_mode))
+                f &= ~FS_DIRSYNC_FL;
+
+        procfs_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_NOATIME);
+        if (procfs_fd < 0)
+                return log_error_errno(procfs_fd, "Failed to re-open '%s': %m", path);
+
+        /* chattr_full() reports the flags before and after the call through these two out parameters;
+         * they are only used for the -ENOANO diagnostics below. */
+        unsigned previous, current;
+        r = chattr_full(procfs_fd, NULL, f, item->attribute_mask, &previous, &current, CHATTR_FALLBACK_BITWISE);
+        if (r == -ENOANO)
+                log_warning("Cannot set file attributes for '%s', maybe due to incompatibility in specified attributes, "
+                            "previous=0x%08x, current=0x%08x, expected=0x%08x, ignoring.",
+                            path, previous, current, (previous & ~item->attribute_mask) | (f & item->attribute_mask));
+        else if (r < 0)
+                log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
+                               "Cannot set file attributes for '%s', value=0x%08x, mask=0x%08x, ignoring: %m",
+                               path, item->attribute_value, item->attribute_mask);
+
+        return 0;
+}
+
+/* Opens 'path' with path_open_safe() and forwards to fd_set_attribute(). Returns 0 without opening
+ * anything if the item carries no attributes, and the negative value from path_open_safe() if
+ * opening fails. */
+static int path_set_attribute(
+                Context *c,
+                Item *item,
+                const char *path,
+                CreationMode creation) {
+
+        _cleanup_close_ int fd = -EBADF;
+
+        assert(c);
+        assert(item);
+
+        if (!item->attribute_set || item->attribute_mask == 0)
+                return 0;
+
+        fd = path_open_safe(path);
+        if (fd < 0)
+                return fd;
+
+        return fd_set_attribute(c, item, fd, path, /* st= */ NULL, creation);
+}
+
+/* Writes the item's binary argument (item_binary_argument(), item_binary_argument_size()) to 'fd'
+ * with loop_write(). Returns 0 without writing when the argument is empty, and a negative
+ * errno-style value if the write fails. 'path' is used for log messages. */
+static int write_argument_data(Item *i, int fd, const char *path) {
+        int r;
+
+        assert(i);
+        assert(fd >= 0);
+        assert(path);
+
+        if (item_binary_argument_size(i) == 0)
+                return 0;
+
+        assert(item_binary_argument(i));
+
+        log_debug("Writing to \"%s\".", path);
+
+        r = loop_write(fd, item_binary_argument(i), item_binary_argument_size(i));
+        if (r < 0)
+                return log_error_errno(r, "Failed to write file \"%s\": %m", path);
+
+
return 0;
+}
+/* Handles a WRITE_FILE item: opens the existing file 'path' for writing (no O_CREAT) relative to
+ * its safely opened parent directory, with O_APPEND if i->append_or_force is set, writes the item's
+ * argument data and then calls fd_set_perms(). A missing file is skipped (returns 0). Other open
+ * failures are returned as negative errno, logged at debug level if i->allow_failure is set and at
+ * error level otherwise. */
+static int write_one_file(Context *c, Item *i, const char *path, CreationMode creation) {
+        _cleanup_close_ int fd = -EBADF, dir_fd = -EBADF;
+        _cleanup_free_ char *bn = NULL;
+        int r;
+
+        assert(c);
+        assert(i);
+        assert(path);
+        assert(i->type == WRITE_FILE);
+
+        r = path_extract_filename(path, &bn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+        if (r == O_DIRECTORY)
+                return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot open path '%s' for writing, is a directory.", path);
+
+        /* Validate the path and keep the fd on the directory for opening the file so we're sure that it
+         * can't be changed behind our back. */
+        dir_fd = path_open_parent_safe(path, i->allow_failure);
+        if (dir_fd < 0)
+                return dir_fd;
+
+        /* Follows symlinks */
+        fd = openat(dir_fd, bn,
+                    O_NONBLOCK|O_CLOEXEC|O_WRONLY|O_NOCTTY|(i->append_or_force ? O_APPEND : 0),
+                    i->mode);
+        if (fd < 0) {
+                if (errno == ENOENT) {
+                        log_debug_errno(errno, "Not writing missing file \"%s\": %m", path);
+                        return 0;
+                }
+
+                if (i->allow_failure)
+                        return log_debug_errno(errno, "Failed to open file \"%s\", ignoring: %m", path);
+
+                return log_error_errno(errno, "Failed to open file \"%s\": %m", path);
+        }
+
+        /* 'w' is allowed to write into any kind of files. */
+
+        r = write_argument_data(i, fd, path);
+        if (r < 0)
+                return r;
+
+        return fd_set_perms(c, i, fd, path, NULL, creation);
+}
+/* Handles a CREATE_FILE item: creates 'path' exclusively (O_CREAT|O_EXCL|O_NOFOLLOW) relative to its
+ * safely opened parent directory, under umask 0000 and bracketed by
+ * mac_selinux_create_file_prepare()/mac_selinux_create_file_clear(). A newly created file gets the
+ * item's argument data written and is passed to fd_set_perms() with CREATION_NORMAL. If the file
+ * already exists it is re-opened with O_PATH, must be a regular file (-EEXIST otherwise), is not
+ * written to, and is passed to fd_set_perms() with CREATION_EXISTING together with its stat data. */
+static int create_file(
+                Context *c,
+                Item *i,
+                const char *path) {
+
+        _cleanup_close_ int fd = -EBADF, dir_fd = -EBADF;
+        _cleanup_free_ char *bn = NULL;
+        struct stat stbuf, *st = NULL;
+        CreationMode creation;
+        int r = 0;
+
+        assert(c);
+        assert(i);
+        assert(path);
+        assert(i->type == CREATE_FILE);
+
+        /* 'f' operates on regular files exclusively. */
+
+        r = path_extract_filename(path, &bn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+        if (r == O_DIRECTORY)
+                return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot open path '%s' for writing, is a directory.", path);
+
+        /* Validate the path and keep the fd on the directory for opening the file so we're sure that it
+         * can't be changed behind our back. */
+        dir_fd = path_open_parent_safe(path, i->allow_failure);
+        if (dir_fd < 0)
+                return dir_fd;
+
+        WITH_UMASK(0000) {
+                mac_selinux_create_file_prepare(path, S_IFREG);
+                fd = RET_NERRNO(openat(dir_fd, bn, O_CREAT|O_EXCL|O_NOFOLLOW|O_NONBLOCK|O_CLOEXEC|O_WRONLY|O_NOCTTY, i->mode));
+                mac_selinux_create_file_clear();
+        }
+
+        if (fd < 0) {
+                /* Even on a read-only filesystem, open(2) returns EEXIST if the file already exists. It
+                 * returns EROFS only if it needs to create the file. */
+                if (fd != -EEXIST)
+                        return log_error_errno(fd, "Failed to create file %s: %m", path);
+
+                /* Re-open the file. At that point it must exist since open(2) failed with EEXIST. We still
+                 * need to check if the perms/mode need to be changed. For read-only filesystems, we let
+                 * fd_set_perms() report the error if the perms need to be modified. */
+                fd = openat(dir_fd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH, i->mode);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to re-open file %s: %m", path);
+
+                if (fstat(fd, &stbuf) < 0)
+                        return log_error_errno(errno, "stat(%s) failed: %m", path);
+
+                if (!S_ISREG(stbuf.st_mode))
+                        return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                               "%s exists and is not a regular file.",
+                                               path);
+
+                st = &stbuf;
+                creation = CREATION_EXISTING;
+        } else {
+                r = write_argument_data(i, fd, path);
+                if (r < 0)
+                        return r;
+
+                creation = CREATION_NORMAL;
+        }
+
+        return fd_set_perms(c, i, fd, path, st, creation);
+}
+/* Handles TRUNCATE_FILE items, and CREATE_FILE items with append_or_force set: opens 'path' for
+ * writing relative to its safely opened parent directory, creating it (umask 0000) only if the
+ * first attempt without O_CREAT fails with -ENOENT. On -EROFS the file is re-opened with O_PATH so
+ * that the remaining checks can still run. The inode must be a regular file (-EEXIST otherwise); it
+ * is truncated only if its size is non-zero, the item's argument data is written if there is any,
+ * and finally fd_set_perms() is called. */
+static int truncate_file(
+                Context *c,
+                Item *i,
+                const char *path) {
+
+        _cleanup_close_ int fd = -EBADF, dir_fd = -EBADF;
+        _cleanup_free_ char *bn = NULL;
+        struct stat stbuf, *st = NULL;
+        CreationMode creation;
+        bool erofs = false;
+        int r = 0;
+
+        assert(c);
+        assert(i);
+        assert(path);
+        assert(i->type == TRUNCATE_FILE || (i->type == CREATE_FILE && i->append_or_force));
+
+        /* We want to operate on regular file exclusively especially since O_TRUNC is unspecified if the file
+         * is neither a regular file nor a fifo nor a terminal device. Therefore we first open the file and
+         * make sure it's a regular one before truncating it. */
+
+        r = path_extract_filename(path, &bn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+        if (r == O_DIRECTORY)
+                return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot open path '%s' for truncation, is a directory.", path);
+
+        /* Validate the path and keep the fd on the directory for opening the file so we're sure that it
+         * can't be changed behind our back. */
+        dir_fd = path_open_parent_safe(path, i->allow_failure);
+        if (dir_fd < 0)
+                return dir_fd;
+
+        creation = CREATION_EXISTING;
+        fd = RET_NERRNO(openat(dir_fd, bn, O_NOFOLLOW|O_NONBLOCK|O_CLOEXEC|O_WRONLY|O_NOCTTY, i->mode));
+        if (fd == -ENOENT) {
+                creation = CREATION_NORMAL; /* Didn't work without O_CREAT, try again with */
+
+                WITH_UMASK(0000) {
+                        mac_selinux_create_file_prepare(path, S_IFREG);
+                        fd = RET_NERRNO(openat(dir_fd, bn, O_CREAT|O_NOFOLLOW|O_NONBLOCK|O_CLOEXEC|O_WRONLY|O_NOCTTY, i->mode));
+                        mac_selinux_create_file_clear();
+                }
+        }
+
+        if (fd < 0) {
+                if (fd != -EROFS)
+                        return log_error_errno(fd, "Failed to open/create file %s: %m", path);
+
+                /* On a read-only filesystem, we don't want to fail if the target is already empty and the
+                 * perms are set. So we still proceed with the sanity checks and let the remaining operations
+                 * fail with EROFS if they try to modify the target file. */
+
+                fd = openat(dir_fd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH, i->mode);
+                if (fd < 0) {
+                        if (errno == ENOENT)
+                                return log_error_errno(SYNTHETIC_ERRNO(EROFS),
+                                                       "Cannot create file %s on a read-only file system.",
+                                                       path);
+
+                        return log_error_errno(errno, "Failed to re-open file %s: %m", path);
+                }
+
+                erofs = true;
+                creation = CREATION_EXISTING;
+        }
+
+        if (fstat(fd, &stbuf) < 0)
+                return log_error_errno(errno, "stat(%s) failed: %m", path);
+
+        if (!S_ISREG(stbuf.st_mode))
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+                                       "%s exists and is not a regular file.",
+                                       path);
+        /* In the read-only fallback any ftruncate() failure is reported as -EROFS, whatever errno says */
+        if (stbuf.st_size > 0) {
+                if (ftruncate(fd, 0) < 0) {
+                        r = erofs ?
-EROFS : -errno;
+                        return log_error_errno(r, "Failed to truncate file %s: %m", path);
+                }
+        } else
+                st = &stbuf;
+        /* 'st' is only handed on to fd_set_perms() when nothing was truncated; after ftruncate() it
+         * stays NULL. The debug message below is emitted for pre-existing files as well. */
+        log_debug("\"%s\" has been created.", path);
+
+        if (item_binary_argument(i)) {
+                r = write_argument_data(i, fd, path);
+                if (r < 0)
+                        return r;
+        }
+
+        return fd_set_perms(c, i, fd, path, st, creation);
+}
+/* Handles a COPY_FILES item: copies the tree at i->argument to i->path with copy_tree_at(),
+ * relative to the safely opened parent directory of the destination. COPY_MERGE is passed if
+ * i->append_or_force is set, COPY_MERGE_EMPTY otherwise. A copy error is only reported if the
+ * destination cannot be opened afterwards. If source and destination are of different file types
+ * nothing further is done (returns 0); otherwise fd_set_perms() is applied to the destination. */
+static int copy_files(Context *c, Item *i) {
+        _cleanup_close_ int dfd = -EBADF, fd = -EBADF;
+        _cleanup_free_ char *bn = NULL;
+        struct stat st, a;
+        int r;
+
+        log_debug("Copying tree \"%s\" to \"%s\".", i->argument, i->path);
+
+        r = path_extract_filename(i->path, &bn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from path '%s': %m", i->path);
+
+        /* Validate the path and use the returned directory fd for copying the target so we're sure that the
+         * path can't be changed behind our back. */
+        dfd = path_open_parent_safe(i->path, i->allow_failure);
+        if (dfd < 0)
+                return dfd;
+
+        r = copy_tree_at(AT_FDCWD, i->argument,
+                         dfd, bn,
+                         i->uid_set ? i->uid : UID_INVALID,
+                         i->gid_set ? i->gid : GID_INVALID,
+                         COPY_REFLINK | ((i->append_or_force) ?
COPY_MERGE : COPY_MERGE_EMPTY) | COPY_MAC_CREATE | COPY_HARDLINKS,
+                         NULL, NULL);
+
+        /* The result of the copy is only evaluated if the destination cannot be opened afterwards */
+        fd = openat(dfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH);
+        if (fd < 0) {
+                if (r < 0) /* Look at original error first */
+                        return log_error_errno(r, "Failed to copy files to %s: %m", i->path);
+
+                return log_error_errno(errno, "Failed to openat(%s): %m", i->path);
+        }
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to fstat(%s): %m", i->path);
+
+        if (stat(i->argument, &a) < 0)
+                return log_error_errno(errno, "Failed to stat(%s): %m", i->argument);
+
+        if (((st.st_mode ^ a.st_mode) & S_IFMT) != 0) {
+                log_debug("Can't copy to %s, file exists already and is of different type", i->path);
+                return 0;
+        }
+
+        return fd_set_perms(c, i, fd, i->path, &st, _CREATION_MODE_INVALID);
+}
+
+/* Creates the directory 'path' relative to its safely opened parent directory, as a btrfs subvolume
+ * if 'subvol' is true and subvolume creation is enabled (see below), as a plain directory otherwise
+ * or when subvolume creation is not supported. Whether or not creation succeeded, the inode is then
+ * opened with O_DIRECTORY|O_PATH.
+ *
+ * On success returns that fd, which the caller owns and must close, and stores the stat data in
+ * *ret_st and the creation mode (CREATION_NORMAL if we created it, CREATION_EXISTING otherwise) in
+ * *ret_creation if those are non-NULL. On failure returns a negative errno-style value: -EEXIST if
+ * something that is not a directory is in the way, otherwise the creation or open error. */
+static int create_directory_or_subvolume(
+                const char *path,
+                mode_t mode,
+                bool subvol,
+                bool allow_failure,
+                struct stat *ret_st,
+                CreationMode *ret_creation) {
+
+        _cleanup_free_ char *bn = NULL;
+        /* 'fd' is auto-closed so that it is not leaked when fstat() fails below */
+        _cleanup_close_ int pfd = -EBADF, fd = -EBADF;
+        CreationMode creation;
+        struct stat st;
+        int r;
+
+        assert(path);
+
+        r = path_extract_filename(path, &bn);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from path '%s': %m", path);
+
+        pfd = path_open_parent_safe(path, allow_failure);
+        if (pfd < 0)
+                return pfd;
+
+        if (subvol) {
+                r = getenv_bool("SYSTEMD_TMPFILES_FORCE_SUBVOL");
+                if (r < 0) {
+                        if (r != -ENXIO) /* env var is unset */
+                                log_warning_errno(r, "Cannot parse value of $SYSTEMD_TMPFILES_FORCE_SUBVOL, ignoring.");
+                        r = btrfs_is_subvol(empty_to_root(arg_root)) > 0;
+                }
+                if (r == 0)
+                        /* Don't create a subvolume unless the root directory is one, too. We do this under
+                         * the assumption that if the root directory is just a plain directory (i.e. very
+                         * light-weight), we shouldn't try to split it up into subvolumes (i.e. more
+                         * heavy-weight). Thus, chroot() environments and suchlike will get a full btrfs
+                         * subvolume set up below their tree only if they specifically set up a btrfs
+                         * subvolume for the root dir too. */
+
+                        subvol = false;
+                else {
+                        WITH_UMASK((~mode) & 0777)
+                                r = btrfs_subvol_make(pfd, bn);
+                }
+        } else
+                r = 0;
+
+        if (!subvol || ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                WITH_UMASK(0000)
+                        r = mkdirat_label(pfd, bn, mode);
+
+        creation = r >= 0 ? CREATION_NORMAL : CREATION_EXISTING;
+
+        fd = openat(pfd, bn, O_NOFOLLOW|O_CLOEXEC|O_DIRECTORY|O_PATH);
+        if (fd < 0) {
+                /* We couldn't open it because it is not actually a directory? */
+                if (errno == ENOTDIR)
+                        return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "\"%s\" already exists and is not a directory.", path);
+
+                /* Then look at the original error */
+                if (r < 0)
+                        return log_full_errno(allow_failure ? LOG_INFO : LOG_ERR,
+                                              r,
+                                              "Failed to create directory or subvolume \"%s\"%s: %m",
+                                              path,
+                                              allow_failure ? ", ignoring" : "");
+
+                return log_error_errno(errno, "Failed to open directory/subvolume we just created '%s': %m", path);
+        }
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to fstat(%s): %m", path);
+
+        assert(S_ISDIR(st.st_mode)); /* we used O_DIRECTORY above */
+
+        log_debug("%s directory \"%s\".", creation_mode_verb_to_string(creation), path);
+
+        if (ret_st)
+                *ret_st = st;
+        if (ret_creation)
+                *ret_creation = creation;
+
+        /* Hand ownership of the fd over to the caller */
+        return TAKE_FD(fd);
+}
+
+/* Handles CREATE_DIRECTORY and TRUNCATE_DIRECTORY items: creates 'path' as a plain directory (or
+ * opens the existing one) and applies permissions through fd_set_perms(). If something that is not
+ * a directory is in the way (-EEXIST from the helper, already logged there), 0 is returned. */
+static int create_directory(
+                Context *c,
+                Item *i,
+                const char *path) {
+
+        _cleanup_close_ int fd = -EBADF;
+        CreationMode creation;
+        struct stat st;
+
+        assert(c);
+        assert(i);
+        assert(IN_SET(i->type, CREATE_DIRECTORY, TRUNCATE_DIRECTORY));
+
+        fd = create_directory_or_subvolume(path, i->mode, /* subvol= */ false, i->allow_failure, &st, &creation);
+        if (fd == -EEXIST)
+                return 0;
+        if (fd < 0)
+                return fd;
+
+        return fd_set_perms(c, i, fd, path, &st, creation);
+}
+
+/* Handles the CREATE_SUBVOLUME* item types: creates 'path' as a subvolume or directory (or opens
+ * the existing one), adjusts the quota group for newly created *_QUOTA subvolumes and applies
+ * permissions through fd_set_perms(). A hard quota error takes precedence over the fd_set_perms()
+ * result in the return value. */
+static int create_subvolume(
+                Context *c,
+                Item *i,
+                const char *path) {
+
+        _cleanup_close_ int
fd = -EBADF; + CreationMode creation; + struct stat st; + int r, q = 0; + + assert(c); + assert(i); + assert(IN_SET(i->type, CREATE_SUBVOLUME, CREATE_SUBVOLUME_NEW_QUOTA, CREATE_SUBVOLUME_INHERIT_QUOTA)); + + fd = create_directory_or_subvolume(path, i->mode, /* subvol = */ true, i->allow_failure, &st, &creation); + if (fd == -EEXIST) + return 0; + if (fd < 0) + return fd; + + if (creation == CREATION_NORMAL && + IN_SET(i->type, CREATE_SUBVOLUME_NEW_QUOTA, CREATE_SUBVOLUME_INHERIT_QUOTA)) { + r = btrfs_subvol_auto_qgroup_fd(fd, 0, i->type == CREATE_SUBVOLUME_NEW_QUOTA); + if (r == -ENOTTY) + log_debug_errno(r, "Couldn't adjust quota for subvolume \"%s\" (unsupported fs or dir not a subvolume): %m", i->path); + else if (r == -EROFS) + log_debug_errno(r, "Couldn't adjust quota for subvolume \"%s\" (fs is read-only).", i->path); + else if (r == -ENOTCONN) + log_debug_errno(r, "Couldn't adjust quota for subvolume \"%s\" (quota support is disabled).", i->path); + else if (r < 0) + q = log_error_errno(r, "Failed to adjust quota for subvolume \"%s\": %m", i->path); + else if (r > 0) + log_debug("Adjusted quota for subvolume \"%s\".", i->path); + else if (r == 0) + log_debug("Quota for subvolume \"%s\" already in place, no change made.", i->path); + } + + r = fd_set_perms(c, i, fd, path, &st, creation); + if (q < 0) /* prefer the quota change error from above */ + return q; + + return r; +} + +static int empty_directory( + Context *c, + Item *i, + const char *path, + CreationMode creation) { + + _cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + assert(c); + assert(i); + assert(i->type == EMPTY_DIRECTORY); + + r = chase(path, arg_root, CHASE_SAFE|CHASE_WARN, NULL, &fd); + if (r == -ENOLINK) /* Unsafe symlink: already covered by CHASE_WARN */ + return r; + if (r == -ENOENT) { + /* Option "e" operates only on existing objects. 
Do not print errors about non-existent files + * or directories */ + log_debug_errno(r, "Skipping missing directory: %s", path); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to open directory '%s': %m", path); + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", path); + if (!S_ISDIR(st.st_mode)) { + log_warning("'%s' already exists and is not a directory.", path); + return 0; + } + + return fd_set_perms(c, i, fd, path, &st, creation); +} + +static int create_device( + Context *c, + Item *i, + mode_t file_type) { + + _cleanup_close_ int dfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *bn = NULL; + CreationMode creation; + struct stat st; + int r; + + assert(c); + assert(i); + assert(IN_SET(i->type, CREATE_BLOCK_DEVICE, CREATE_CHAR_DEVICE)); + assert(IN_SET(file_type, S_IFBLK, S_IFCHR)); + + r = path_extract_filename(i->path, &bn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", i->path); + if (r == O_DIRECTORY) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot open path '%s' for creating device node, is a directory.", i->path); + + /* Validate the path and use the returned directory fd for copying the target so we're sure that the + * path can't be changed behind our back. */ + dfd = path_open_parent_safe(i->path, i->allow_failure); + if (dfd < 0) + return dfd; + + WITH_UMASK(0000) { + mac_selinux_create_file_prepare(i->path, file_type); + r = RET_NERRNO(mknodat(dfd, bn, i->mode | file_type, i->major_minor)); + mac_selinux_create_file_clear(); + } + creation = r >= 0 ? CREATION_NORMAL : CREATION_EXISTING; + + /* Try to open the inode via O_PATH, regardless if we could create it or not. Maybe everything is in + * order anyway and we hence can ignore the error to create the device node */ + fd = openat(dfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (fd < 0) { + /* OK, so opening the inode failed, let's look at the original error then. 
*/ + + if (r < 0) { + if (ERRNO_IS_PRIVILEGE(r)) + goto handle_privilege; + + return log_error_errno(r, "Failed to create device node '%s': %m", i->path); + } + + return log_error_errno(errno, "Failed to open device node '%s' we just created: %m", i->path); + } + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", i->path); + + if (((st.st_mode ^ file_type) & S_IFMT) != 0) { + + if (i->append_or_force) { + fd = safe_close(fd); + + WITH_UMASK(0000) { + mac_selinux_create_file_prepare(i->path, file_type); + r = mknodat_atomic(dfd, bn, i->mode | file_type, i->major_minor); + mac_selinux_create_file_clear(); + } + if (ERRNO_IS_PRIVILEGE(r)) + goto handle_privilege; + if (IN_SET(r, -EISDIR, -EEXIST, -ENOTEMPTY)) { + r = rm_rf_child(dfd, bn, REMOVE_PHYSICAL); + if (r < 0) + return log_error_errno(r, "rm -rf %s failed: %m", i->path); + + mac_selinux_create_file_prepare(i->path, file_type); + r = RET_NERRNO(mknodat(dfd, bn, i->mode | file_type, i->major_minor)); + mac_selinux_create_file_clear(); + } + if (r < 0) + return log_error_errno(r, "Failed to create device node '%s': %m", i->path); + + fd = openat(dfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (fd < 0) + return log_error_errno(errno, "Failed to open device node we just created '%s': %m", i->path); + + /* Validate type before change ownership below */ + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", i->path); + + if (((st.st_mode ^ file_type) & S_IFMT) != 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), "Device node we just created is not a device node, refusing."); + + creation = CREATION_FORCE; + } else { + log_warning("\"%s\" already exists and is not a device node.", i->path); + return 0; + } + } + + log_debug("%s %s device node \"%s\" %u:%u.", + creation_mode_verb_to_string(creation), + i->type == CREATE_BLOCK_DEVICE ? 
"block" : "char", + i->path, major(i->mode), minor(i->mode)); + + return fd_set_perms(c, i, fd, i->path, &st, creation); + +handle_privilege: + log_debug_errno(r, + "We lack permissions, possibly because of cgroup configuration; " + "skipping creation of device node '%s'.", i->path); + return 0; +} + +static int create_fifo(Context *c, Item *i) { + _cleanup_close_ int pfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *bn = NULL; + CreationMode creation; + struct stat st; + int r; + + assert(c); + assert(i); + assert(i->type == CREATE_FIFO); + + r = path_extract_filename(i->path, &bn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", i->path); + if (r == O_DIRECTORY) + return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot open path '%s' for creating FIFO, is a directory.", i->path); + + pfd = path_open_parent_safe(i->path, i->allow_failure); + if (pfd < 0) + return pfd; + + WITH_UMASK(0000) { + mac_selinux_create_file_prepare(i->path, S_IFIFO); + r = RET_NERRNO(mkfifoat(pfd, bn, i->mode)); + mac_selinux_create_file_clear(); + } + + creation = r >= 0 ? CREATION_NORMAL : CREATION_EXISTING; + + /* Open the inode via O_PATH, regardless if we managed to create it or not. Maybe it is already the FIFO we want */ + fd = openat(pfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (fd < 0) { + if (r < 0) + return log_error_errno(r, "Failed to create FIFO %s: %m", i->path); /* original error! 
*/ + + return log_error_errno(errno, "Failed to open FIFO we just created %s: %m", i->path); + } + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", i->path); + + if (!S_ISFIFO(st.st_mode)) { + + if (i->append_or_force) { + fd = safe_close(fd); + + WITH_UMASK(0000) { + mac_selinux_create_file_prepare(i->path, S_IFIFO); + r = mkfifoat_atomic(pfd, bn, i->mode); + mac_selinux_create_file_clear(); + } + if (IN_SET(r, -EISDIR, -EEXIST, -ENOTEMPTY)) { + r = rm_rf_child(pfd, bn, REMOVE_PHYSICAL); + if (r < 0) + return log_error_errno(r, "rm -rf %s failed: %m", i->path); + + mac_selinux_create_file_prepare(i->path, S_IFIFO); + r = RET_NERRNO(mkfifoat(pfd, bn, i->mode)); + mac_selinux_create_file_clear(); + } + if (r < 0) + return log_error_errno(r, "Failed to create FIFO %s: %m", i->path); + + fd = openat(pfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (fd < 0) + return log_error_errno(errno, "Failed to open FIFO we just created '%s': %m", i->path); + + /* Validate type before change ownership below */ + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", i->path); + + if (!S_ISFIFO(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), "FIFO inode we just created is not a FIFO, refusing."); + + creation = CREATION_FORCE; + } else { + log_warning("\"%s\" already exists and is not a FIFO.", i->path); + return 0; + } + } + + log_debug("%s fifo \"%s\".", creation_mode_verb_to_string(creation), i->path); + + return fd_set_perms(c, i, fd, i->path, &st, creation); +} + +static int create_symlink(Context *c, Item *i) { + _cleanup_close_ int pfd = -EBADF, fd = -EBADF; + _cleanup_free_ char *bn = NULL; + CreationMode creation; + struct stat st; + bool good = false; + int r; + + assert(c); + assert(i); + + r = path_extract_filename(i->path, &bn); + if (r < 0) + return log_error_errno(r, "Failed to extract filename from path '%s': %m", i->path); + if (r == O_DIRECTORY) + return 
log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Cannot open path '%s' for creating FIFO, is a directory.", i->path); + + pfd = path_open_parent_safe(i->path, i->allow_failure); + if (pfd < 0) + return pfd; + + mac_selinux_create_file_prepare(i->path, S_IFLNK); + r = RET_NERRNO(symlinkat(i->argument, pfd, bn)); + mac_selinux_create_file_clear(); + + creation = r >= 0 ? CREATION_NORMAL : CREATION_EXISTING; + + fd = openat(pfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (fd < 0) { + if (r < 0) + return log_error_errno(r, "Failed to create symlink '%s': %m", i->path); /* original error! */ + + return log_error_errno(errno, "Failed to open symlink we just created '%s': %m", i->path); + } + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", i->path); + + if (S_ISLNK(st.st_mode)) { + _cleanup_free_ char *x = NULL; + + r = readlinkat_malloc(fd, "", &x); + if (r < 0) + return log_error_errno(r, "readlinkat(%s) failed: %m", i->path); + + good = streq(x, i->argument); + } else + good = false; + + if (!good) { + if (!i->append_or_force) { + log_debug("\"%s\" is not a symlink or does not point to the correct path.", i->path); + return 0; + } + + fd = safe_close(fd); + + mac_selinux_create_file_prepare(i->path, S_IFLNK); + r = symlinkat_atomic_full(i->argument, pfd, bn, /* make_relative= */ false); + mac_selinux_create_file_clear(); + if (IN_SET(r, -EISDIR, -EEXIST, -ENOTEMPTY)) { + r = rm_rf_child(pfd, bn, REMOVE_PHYSICAL); + if (r < 0) + return log_error_errno(r, "rm -rf %s failed: %m", i->path); + + mac_selinux_create_file_prepare(i->path, S_IFLNK); + r = RET_NERRNO(symlinkat(i->argument, pfd, i->path)); + mac_selinux_create_file_clear(); + } + if (r < 0) + return log_error_errno(r, "symlink(%s, %s) failed: %m", i->argument, i->path); + + fd = openat(pfd, bn, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (fd < 0) + return log_error_errno(errno, "Failed to open symlink we just created '%s': %m", i->path); + + /* Validate type before change ownership below */ + if 
(fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat(%s): %m", i->path); + + if (!S_ISLNK(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(EBADF), "Symlink we just created is not a symlink, refusing."); + + creation = CREATION_FORCE; + } + + log_debug("%s symlink \"%s\".", creation_mode_verb_to_string(creation), i->path); + return fd_set_perms(c, i, fd, i->path, &st, creation); +} + +typedef int (*action_t)(Context *c, Item *i, const char *path, CreationMode creation); +typedef int (*fdaction_t)(Context *c, Item *i, int fd, const char *path, const struct stat *st, CreationMode creation); + +static int item_do( + Context *c, + Item *i, + int fd, + const char *path, + CreationMode creation, + fdaction_t action) { + + struct stat st; + int r = 0, q; + + assert(c); + assert(i); + assert(path); + assert(fd >= 0); + + if (fstat(fd, &st) < 0) { + r = log_error_errno(errno, "fstat() on file failed: %m"); + goto finish; + } + + /* This returns the first error we run into, but nevertheless tries to go on */ + r = action(c, i, fd, path, &st, creation); + + if (S_ISDIR(st.st_mode)) { + _cleanup_closedir_ DIR *d = NULL; + + /* The passed 'fd' was opened with O_PATH. We need to convert it into a 'regular' fd before + * reading the directory content. 
*/ + d = opendir(FORMAT_PROC_FD_PATH(fd)); + if (!d) { + log_error_errno(errno, "Failed to opendir() '%s': %m", FORMAT_PROC_FD_PATH(fd)); + if (r == 0) + r = -errno; + goto finish; + } + + FOREACH_DIRENT_ALL(de, d, q = -errno; goto finish) { + int de_fd; + + if (dot_or_dot_dot(de->d_name)) + continue; + + de_fd = openat(fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_PATH); + if (de_fd < 0) + q = log_error_errno(errno, "Failed to open() file '%s': %m", de->d_name); + else { + _cleanup_free_ char *de_path = NULL; + + de_path = path_join(path, de->d_name); + if (!de_path) + q = log_oom(); + else + /* Pass ownership of dirent fd over */ + q = item_do(c, i, de_fd, de_path, CREATION_EXISTING, action); + } + + if (q < 0 && r == 0) + r = q; + } + } +finish: + safe_close(fd); + return r; +} + +static int glob_item(Context *c, Item *i, action_t action) { + _cleanup_globfree_ glob_t g = { + .gl_opendir = (void *(*)(const char *)) opendir_nomod, + }; + int r = 0, k; + + assert(c); + assert(i); + + k = safe_glob(i->path, GLOB_NOSORT|GLOB_BRACE, &g); + if (k < 0 && k != -ENOENT) + return log_error_errno(k, "glob(%s) failed: %m", i->path); + + STRV_FOREACH(fn, g.gl_pathv) { + /* We pass CREATION_EXISTING here, since if we are globbing for it, it always has to exist */ + k = action(c, i, *fn, CREATION_EXISTING); + if (k < 0 && r == 0) + r = k; + } + + return r; +} + +static int glob_item_recursively( + Context *c, + Item *i, + fdaction_t action) { + + _cleanup_globfree_ glob_t g = { + .gl_opendir = (void *(*)(const char *)) opendir_nomod, + }; + int r = 0, k; + + k = safe_glob(i->path, GLOB_NOSORT|GLOB_BRACE, &g); + if (k < 0 && k != -ENOENT) + return log_error_errno(k, "glob(%s) failed: %m", i->path); + + STRV_FOREACH(fn, g.gl_pathv) { + _cleanup_close_ int fd = -EBADF; + + /* Make sure we won't trigger/follow file object (such as + * device nodes, automounts, ...) pointed out by 'fn' with + * O_PATH. 
Note, when O_PATH is used, flags other than + * O_CLOEXEC, O_DIRECTORY, and O_NOFOLLOW are ignored. */ + + fd = open(*fn, O_CLOEXEC|O_NOFOLLOW|O_PATH); + if (fd < 0) { + log_error_errno(errno, "Opening '%s' failed: %m", *fn); + if (r == 0) + r = -errno; + continue; + } + + k = item_do(c, i, fd, *fn, CREATION_EXISTING, action); + if (k < 0 && r == 0) + r = k; + + /* we passed fd ownership to the previous call */ + fd = -EBADF; + } + + return r; +} + +static int rm_if_wrong_type_safe( + mode_t mode, + int parent_fd, + const struct stat *parent_st, /* Only used if follow_links below is true. */ + const char *name, + int flags) { + _cleanup_free_ char *parent_name = NULL; + bool follow_links = !FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW); + struct stat st; + int r; + + assert(name); + assert((mode & ~S_IFMT) == 0); + assert(!follow_links || parent_st); + assert((flags & ~AT_SYMLINK_NOFOLLOW) == 0); + + if (!filename_is_valid(name)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "\"%s\" is not a valid filename.", name); + + r = fstatat_harder(parent_fd, name, &st, flags, REMOVE_CHMOD | REMOVE_CHMOD_RESTORE); + if (r < 0) { + (void) fd_get_path(parent_fd, &parent_name); + return log_full_errno(r == -ENOENT? LOG_DEBUG : LOG_ERR, r, + "Failed to stat \"%s\" at \"%s\": %m", name, strna(parent_name)); + } + + /* Fail before removing anything if this is an unsafe transition. */ + if (follow_links && unsafe_transition(parent_st, &st)) { + (void) fd_get_path(parent_fd, &parent_name); + return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), + "Unsafe transition from \"%s\" to \"%s\".", parent_name, name); + } + + if ((st.st_mode & S_IFMT) == mode) + return 0; + + (void) fd_get_path(parent_fd, &parent_name); + log_notice("Wrong file type 0o%o; rm -rf \"%s/%s\"", st.st_mode & S_IFMT, strna(parent_name), name); + + /* If the target of the symlink was the wrong type, the link needs to be removed instead of the + * target, so make sure it is identified as a link and not a directory. 
*/ + if (follow_links) { + r = fstatat_harder(parent_fd, name, &st, AT_SYMLINK_NOFOLLOW, REMOVE_CHMOD | REMOVE_CHMOD_RESTORE); + if (r < 0) + return log_error_errno(r, "Failed to stat \"%s\" at \"%s\": %m", name, strna(parent_name)); + } + + /* Do not remove mount points. */ + r = fd_is_mount_point(parent_fd, name, follow_links ? AT_SYMLINK_FOLLOW : 0); + if (r < 0) + (void) log_warning_errno(r, "Failed to check if \"%s/%s\" is a mount point: %m; Continuing", + strna(parent_name), name); + else if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "Not removing \"%s/%s\" because it is a mount point.", strna(parent_name), name); + + if ((st.st_mode & S_IFMT) == S_IFDIR) { + _cleanup_close_ int child_fd = -EBADF; + + child_fd = openat(parent_fd, name, O_NOCTTY | O_CLOEXEC | O_DIRECTORY); + if (child_fd < 0) + return log_error_errno(errno, "Failed to open \"%s\" at \"%s\": %m", name, strna(parent_name)); + + r = rm_rf_children(TAKE_FD(child_fd), REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL, &st); + if (r < 0) + return log_error_errno(r, "Failed to remove contents of \"%s\" at \"%s\": %m", name, strna(parent_name)); + + r = unlinkat_harder(parent_fd, name, AT_REMOVEDIR, REMOVE_CHMOD | REMOVE_CHMOD_RESTORE); + } else + r = unlinkat_harder(parent_fd, name, 0, REMOVE_CHMOD | REMOVE_CHMOD_RESTORE); + if (r < 0) + return log_error_errno(r, "Failed to remove \"%s\" at \"%s\": %m", name, strna(parent_name)); + + /* This is covered by the log_notice "Wrong file type..." It is logged earlier because it gives + * context to other error messages that might follow. */ + return -ENOENT; +} + +/* If child_mode is non-zero, rm_if_wrong_type_safe will be executed for the last path component. 
*/ +static int mkdir_parents_rm_if_wrong_type(mode_t child_mode, const char *path) { + _cleanup_close_ int parent_fd = -EBADF; + struct stat parent_st; + size_t path_len; + int r; + + assert(path); + assert((child_mode & ~S_IFMT) == 0); + + path_len = strlen(path); + + if (!is_path(path)) + /* rm_if_wrong_type_safe already logs errors. */ + return child_mode != 0 ? rm_if_wrong_type_safe(child_mode, AT_FDCWD, NULL, path, AT_SYMLINK_NOFOLLOW) : 0; + + if (child_mode != 0 && endswith(path, "/")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Trailing path separators are only allowed if child_mode is not set; got \"%s\"", path); + + /* Get the parent_fd and stat. */ + parent_fd = openat(AT_FDCWD, path_is_absolute(path) ? "/" : ".", O_NOCTTY | O_CLOEXEC | O_DIRECTORY); + if (parent_fd < 0) + return log_error_errno(errno, "Failed to open root: %m"); + + if (fstat(parent_fd, &parent_st) < 0) + return log_error_errno(errno, "Failed to stat root: %m"); + + /* Check every parent directory in the path, except the last component */ + for (const char *e = path;;) { + _cleanup_close_ int next_fd = -EBADF; + char t[path_len + 1]; + const char *s; + + /* Find the start of the next path component. */ + s = e + strspn(e, "/"); + /* Find the end of the next path component. */ + e = s + strcspn(s, "/"); + + /* Copy the path component to t so it can be a null terminated string. */ + *((char*) mempcpy(t, s, e - s)) = 0; + + /* Is this the last component? If so, then check the type */ + if (*e == 0) + return child_mode != 0 ? rm_if_wrong_type_safe(child_mode, parent_fd, &parent_st, t, AT_SYMLINK_NOFOLLOW) : 0; + + r = rm_if_wrong_type_safe(S_IFDIR, parent_fd, &parent_st, t, 0); + /* Remove dangling symlinks. 
*/ + if (r == -ENOENT) + r = rm_if_wrong_type_safe(S_IFDIR, parent_fd, &parent_st, t, AT_SYMLINK_NOFOLLOW); + if (r == -ENOENT) { + WITH_UMASK(0000) + r = mkdirat_label(parent_fd, t, 0755); + if (r < 0) { + _cleanup_free_ char *parent_name = NULL; + + (void) fd_get_path(parent_fd, &parent_name); + return log_error_errno(r, "Failed to mkdir \"%s\" at \"%s\": %m", t, strnull(parent_name)); + } + } else if (r < 0) + /* rm_if_wrong_type_safe already logs errors. */ + return r; + + next_fd = RET_NERRNO(openat(parent_fd, t, O_NOCTTY | O_CLOEXEC | O_DIRECTORY)); + if (next_fd < 0) { + _cleanup_free_ char *parent_name = NULL; + + (void) fd_get_path(parent_fd, &parent_name); + return log_error_errno(next_fd, "Failed to open \"%s\" at \"%s\": %m", t, strnull(parent_name)); + } + r = RET_NERRNO(fstat(next_fd, &parent_st)); + if (r < 0) { + _cleanup_free_ char *parent_name = NULL; + + (void) fd_get_path(parent_fd, &parent_name); + return log_error_errno(r, "Failed to stat \"%s\" at \"%s\": %m", t, strnull(parent_name)); + } + + close_and_replace(parent_fd, next_fd); + } +} + +static int mkdir_parents_item(Item *i, mode_t child_mode) { + int r; + if (i->try_replace) { + r = mkdir_parents_rm_if_wrong_type(child_mode, i->path); + if (r < 0 && r != -ENOENT) + return r; + } else + WITH_UMASK(0000) + (void) mkdir_parents_label(i->path, 0755); + + return 0; +} + +static int create_item(Context *c, Item *i) { + int r; + + assert(c); + assert(i); + + log_debug("Running create action for entry %c %s", (char) i->type, i->path); + + switch (i->type) { + + case IGNORE_PATH: + case IGNORE_DIRECTORY_PATH: + case REMOVE_PATH: + case RECURSIVE_REMOVE_PATH: + return 0; + + case TRUNCATE_FILE: + case CREATE_FILE: + r = mkdir_parents_item(i, S_IFREG); + if (r < 0) + return r; + + if ((i->type == CREATE_FILE && i->append_or_force) || i->type == TRUNCATE_FILE) + r = truncate_file(c, i, i->path); + else + r = create_file(c, i, i->path); + if (r < 0) + return r; + break; + + case COPY_FILES: + r = 
mkdir_parents_item(i, 0); + if (r < 0) + return r; + + r = copy_files(c, i); + if (r < 0) + return r; + break; + + case WRITE_FILE: + r = glob_item(c, i, write_one_file); + if (r < 0) + return r; + + break; + + case CREATE_DIRECTORY: + case TRUNCATE_DIRECTORY: + r = mkdir_parents_item(i, S_IFDIR); + if (r < 0) + return r; + + r = create_directory(c, i, i->path); + if (r < 0) + return r; + break; + + case CREATE_SUBVOLUME: + case CREATE_SUBVOLUME_INHERIT_QUOTA: + case CREATE_SUBVOLUME_NEW_QUOTA: + r = mkdir_parents_item(i, S_IFDIR); + if (r < 0) + return r; + + r = create_subvolume(c, i, i->path); + if (r < 0) + return r; + break; + + case EMPTY_DIRECTORY: + r = glob_item(c, i, empty_directory); + if (r < 0) + return r; + break; + + case CREATE_FIFO: + r = mkdir_parents_item(i, S_IFIFO); + if (r < 0) + return r; + + r = create_fifo(c, i); + if (r < 0) + return r; + break; + + case CREATE_SYMLINK: + r = mkdir_parents_item(i, S_IFLNK); + if (r < 0) + return r; + + r = create_symlink(c, i); + if (r < 0) + return r; + + break; + + case CREATE_BLOCK_DEVICE: + case CREATE_CHAR_DEVICE: + if (have_effective_cap(CAP_MKNOD) <= 0) { + /* In a container we lack CAP_MKNOD. We shouldn't attempt to create the device node in that + * case to avoid noise, and we don't support virtualized devices in containers anyway. */ + + log_debug("We lack CAP_MKNOD, skipping creation of device node %s.", i->path); + return 0; + } + + r = mkdir_parents_item(i, i->type == CREATE_BLOCK_DEVICE ? S_IFBLK : S_IFCHR); + if (r < 0) + return r; + + r = create_device(c, i, i->type == CREATE_BLOCK_DEVICE ? 
S_IFBLK : S_IFCHR); + if (r < 0) + return r; + + break; + + case ADJUST_MODE: + case RELABEL_PATH: + r = glob_item(c, i, path_set_perms); + if (r < 0) + return r; + break; + + case RECURSIVE_RELABEL_PATH: + r = glob_item_recursively(c, i, fd_set_perms); + if (r < 0) + return r; + break; + + case SET_XATTR: + r = glob_item(c, i, path_set_xattrs); + if (r < 0) + return r; + break; + + case RECURSIVE_SET_XATTR: + r = glob_item_recursively(c, i, fd_set_xattrs); + if (r < 0) + return r; + break; + + case SET_ACL: + r = glob_item(c, i, path_set_acls); + if (r < 0) + return r; + break; + + case RECURSIVE_SET_ACL: + r = glob_item_recursively(c, i, fd_set_acls); + if (r < 0) + return r; + break; + + case SET_ATTRIBUTE: + r = glob_item(c, i, path_set_attribute); + if (r < 0) + return r; + break; + + case RECURSIVE_SET_ATTRIBUTE: + r = glob_item_recursively(c, i, fd_set_attribute); + if (r < 0) + return r; + break; + } + + return 0; +} + +static int remove_item_instance( + Context *c, + Item *i, + const char *instance, + CreationMode creation) { + + int r; + + assert(c); + assert(i); + + switch (i->type) { + + case REMOVE_PATH: + if (remove(instance) < 0 && errno != ENOENT) + return log_error_errno(errno, "rm(%s): %m", instance); + + break; + + case RECURSIVE_REMOVE_PATH: + /* FIXME: we probably should use dir_cleanup() here instead of rm_rf() so that 'x' is honoured. */ + log_debug("rm -rf \"%s\"", instance); + r = rm_rf(instance, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "rm_rf(%s): %m", instance); + + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int remove_item(Context *c, Item *i) { + int r; + + assert(c); + assert(i); + + log_debug("Running remove action for entry %c %s", (char) i->type, i->path); + + switch (i->type) { + + case TRUNCATE_DIRECTORY: + /* FIXME: we probably should use dir_cleanup() here instead of rm_rf() so that 'x' is honoured. 
*/ + log_debug("rm -rf \"%s\"", i->path); + r = rm_rf(i->path, REMOVE_PHYSICAL); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "rm_rf(%s): %m", i->path); + + return 0; + + case REMOVE_PATH: + case RECURSIVE_REMOVE_PATH: + return glob_item(c, i, remove_item_instance); + + default: + return 0; + } +} + +static char *age_by_to_string(AgeBy ab, bool is_dir) { + static const char ab_map[] = { 'a', 'b', 'c', 'm' }; + size_t j = 0; + char *ret; + + ret = new(char, ELEMENTSOF(ab_map) + 1); + if (!ret) + return NULL; + + for (size_t i = 0; i < ELEMENTSOF(ab_map); i++) + if (FLAGS_SET(ab, 1U << i)) + ret[j++] = is_dir ? ascii_toupper(ab_map[i]) : ab_map[i]; + + ret[j] = 0; + return ret; +} + +static int clean_item_instance( + Context *c, + Item *i, + const char* instance, + CreationMode creation) { + + _cleanup_closedir_ DIR *d = NULL; + STRUCT_STATX_DEFINE(sx); + int mountpoint, r; + usec_t cutoff, n; + + assert(i); + + if (!i->age_set) + return 0; + + n = now(CLOCK_REALTIME); + if (n < i->age) + return 0; + + cutoff = n - i->age; + + d = opendir_nomod(instance); + if (!d) { + if (IN_SET(errno, ENOENT, ENOTDIR)) { + log_debug_errno(errno, "Directory \"%s\": %m", instance); + return 0; + } + + return log_error_errno(errno, "Failed to open directory %s: %m", instance); + } + + r = statx_fallback(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx); + if (r < 0) + return log_error_errno(r, "statx(%s) failed: %m", instance); + + if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) + mountpoint = FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT); + else { + struct stat ps; + + if (fstatat(dirfd(d), "..", &ps, AT_SYMLINK_NOFOLLOW) != 0) + return log_error_errno(errno, "stat(%s/..) 
failed: %m", i->path); + + mountpoint = + sx.stx_dev_major != major(ps.st_dev) || + sx.stx_dev_minor != minor(ps.st_dev) || + sx.stx_ino != ps.st_ino; + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *ab_f = NULL, *ab_d = NULL; + + ab_f = age_by_to_string(i->age_by_file, false); + if (!ab_f) + return log_oom(); + + ab_d = age_by_to_string(i->age_by_dir, true); + if (!ab_d) + return log_oom(); + + log_debug("Cleanup threshold for %s \"%s\" is %s; age-by: %s%s", + mountpoint ? "mount point" : "directory", + instance, + FORMAT_TIMESTAMP_STYLE(cutoff, TIMESTAMP_US), + ab_f, ab_d); + } + + return dir_cleanup(c, i, instance, d, + load_statx_timestamp_nsec(&sx.stx_atime), + load_statx_timestamp_nsec(&sx.stx_mtime), + cutoff * NSEC_PER_USEC, + sx.stx_dev_major, sx.stx_dev_minor, mountpoint, + MAX_DEPTH, i->keep_first_level, + i->age_by_file, i->age_by_dir); +} + +static int clean_item(Context *c, Item *i) { + assert(c); + assert(i); + + log_debug("Running clean action for entry %c %s", (char) i->type, i->path); + + switch (i->type) { + + case CREATE_DIRECTORY: + case CREATE_SUBVOLUME: + case CREATE_SUBVOLUME_INHERIT_QUOTA: + case CREATE_SUBVOLUME_NEW_QUOTA: + case TRUNCATE_DIRECTORY: + case IGNORE_PATH: + case COPY_FILES: + clean_item_instance(c, i, i->path, CREATION_EXISTING); + return 0; + + case EMPTY_DIRECTORY: + case IGNORE_DIRECTORY_PATH: + return glob_item(c, i, clean_item_instance); + + default: + return 0; + } +} + +static int process_item( + Context *c, + Item *i, + OperationMask operation) { + + OperationMask todo; + _cleanup_free_ char *_path = NULL; + const char *path; + int r, q, p; + + assert(c); + assert(i); + + todo = operation & ~i->done; + if (todo == 0) /* Everything already done? */ + return 0; + + i->done |= operation; + + path = i->path; + if (string_is_glob(path)) { + /* We can't easily check whether a glob matches any autofs path, so let's do the check only + * for the non-glob part. 
*/ + + r = glob_non_glob_prefix(path, &_path); + if (r < 0 && r != -ENOENT) + return log_debug_errno(r, "Failed to deglob path: %m"); + if (r >= 0) + path = _path; + } + + r = chase(path, arg_root, CHASE_NO_AUTOFS|CHASE_NONEXISTENT|CHASE_WARN, NULL, NULL); + if (r == -EREMOTE) { + log_notice_errno(r, "Skipping %s", i->path); /* We log the configured path, to not confuse the user. */ + return 0; + } + if (r < 0) + log_debug_errno(r, "Failed to determine whether '%s' is below autofs, ignoring: %m", i->path); + + r = FLAGS_SET(operation, OPERATION_CREATE) ? create_item(c, i) : 0; + /* Failure can only be tolerated for create */ + if (i->allow_failure) + r = 0; + + q = FLAGS_SET(operation, OPERATION_REMOVE) ? remove_item(c, i) : 0; + p = FLAGS_SET(operation, OPERATION_CLEAN) ? clean_item(c, i) : 0; + + return r < 0 ? r : + q < 0 ? q : + p; +} + +static int process_item_array( + Context *c, + ItemArray *array, + OperationMask operation) { + + int r = 0; + size_t n; + + assert(c); + assert(array); + + /* Create any parent first. 
*/ + if (FLAGS_SET(operation, OPERATION_CREATE) && array->parent) + r = process_item_array(c, array->parent, operation & OPERATION_CREATE); + + /* Clean up all children first */ + if ((operation & (OPERATION_REMOVE|OPERATION_CLEAN)) && !set_isempty(array->children)) { + ItemArray *cc; + + SET_FOREACH(cc, array->children) { + int k; + + k = process_item_array(c, cc, operation & (OPERATION_REMOVE|OPERATION_CLEAN)); + if (k < 0 && r == 0) + r = k; + } + } + + for (n = 0; n < array->n_items; n++) { + int k; + + k = process_item(c, array->items + n, operation); + if (k < 0 && r == 0) + r = k; + } + + return r; +} + +static void item_free_contents(Item *i) { + assert(i); + free(i->path); + free(i->argument); + free(i->binary_argument); + strv_free(i->xattrs); + +#if HAVE_ACL + if (i->acl_access) + acl_free(i->acl_access); + + if (i->acl_access_exec) + acl_free(i->acl_access_exec); + + if (i->acl_default) + acl_free(i->acl_default); +#endif +} + +static ItemArray* item_array_free(ItemArray *a) { + size_t n; + + if (!a) + return NULL; + + for (n = 0; n < a->n_items; n++) + item_free_contents(a->items + n); + + set_free(a->children); + free(a->items); + return mfree(a); +} + +static int item_compare(const Item *a, const Item *b) { + /* Make sure that the ownership taking item is put first, so + * that we first create the node, and then can adjust it */ + + if (takes_ownership(a->type) && !takes_ownership(b->type)) + return -1; + if (!takes_ownership(a->type) && takes_ownership(b->type)) + return 1; + + return CMP(a->type, b->type); +} + +static bool item_compatible(const Item *a, const Item *b) { + assert(a); + assert(b); + assert(streq(a->path, b->path)); + + if (takes_ownership(a->type) && takes_ownership(b->type)) + /* check if the items are the same */ + return memcmp_nn(item_binary_argument(a), item_binary_argument_size(a), + item_binary_argument(b), item_binary_argument_size(b)) == 0 && + + a->uid_set == b->uid_set && + a->uid == b->uid && + a->uid_only_create == 
b->uid_only_create && + + a->gid_set == b->gid_set && + a->gid == b->gid && + a->gid_only_create == b->gid_only_create && + + a->mode_set == b->mode_set && + a->mode == b->mode && + a->mode_only_create == b->mode_only_create && + + a->age_set == b->age_set && + a->age == b->age && + + a->age_by_file == b->age_by_file && + a->age_by_dir == b->age_by_dir && + + a->mask_perms == b->mask_perms && + + a->keep_first_level == b->keep_first_level && + + a->major_minor == b->major_minor; + + return true; +} + +static bool should_include_path(const char *path) { + STRV_FOREACH(prefix, arg_exclude_prefixes) + if (path_startswith(path, *prefix)) { + log_debug("Entry \"%s\" matches exclude prefix \"%s\", skipping.", + path, *prefix); + return false; + } + + STRV_FOREACH(prefix, arg_include_prefixes) + if (path_startswith(path, *prefix)) { + log_debug("Entry \"%s\" matches include prefix \"%s\".", path, *prefix); + return true; + } + + /* no matches, so we should include this path only if we have no allow list at all */ + if (strv_isempty(arg_include_prefixes)) + return true; + + log_debug("Entry \"%s\" does not match any include prefix, skipping.", path); + return false; +} + +static int specifier_expansion_from_arg(const Specifier *specifier_table, Item *i) { + int r; + + assert(i); + + if (!i->argument) + return 0; + + switch (i->type) { + case COPY_FILES: + case CREATE_SYMLINK: + case CREATE_FILE: + case TRUNCATE_FILE: + case WRITE_FILE: { + _cleanup_free_ char *unescaped = NULL, *resolved = NULL; + ssize_t l; + + l = cunescape(i->argument, 0, &unescaped); + if (l < 0) + return log_error_errno(l, "Failed to unescape parameter to write: %s", i->argument); + + r = specifier_printf(unescaped, PATH_MAX-1, specifier_table, arg_root, NULL, &resolved); + if (r < 0) + return r; + + return free_and_replace(i->argument, resolved); + } + case SET_XATTR: + case RECURSIVE_SET_XATTR: + STRV_FOREACH(xattr, i->xattrs) { + _cleanup_free_ char *resolved = NULL; + + r = 
specifier_printf(*xattr, SIZE_MAX, specifier_table, arg_root, NULL, &resolved); + if (r < 0) + return r; + + free_and_replace(*xattr, resolved); + } + return 0; + + default: + return 0; + } +} + +static int patch_var_run(const char *fname, unsigned line, char **path) { + const char *k; + char *n; + + assert(path); + assert(*path); + + /* Optionally rewrites lines referencing /var/run/, to use /run/ instead. Why bother? tmpfiles merges lines in + * some cases and detects conflicts in others. If files/directories are specified through two equivalent lines + * this is problematic as neither case will be detected. Ideally we'd detect these cases by resolving symlinks + * early, but that's precisely not what we can do here as this code very likely is running very early on, at a + * time where the paths in question are not available yet, or even more importantly, our own tmpfiles rules + * might create the paths that are intermediary to the listed paths. We can't really cover the generic case, + * but the least we can do is cover the specific case of /var/run vs. /run, as /var/run is a legacy name for + * /run only, and we explicitly document that and require that on systemd systems the former is a symlink to + * the latter. Moreover files below this path are by far the primary use case for tmpfiles.d/. */ + + k = path_startswith(*path, "/var/run/"); + if (isempty(k)) /* Don't complain about other paths than /var/run, and not about /var/run itself either. */ + return 0; + + n = path_join("/run", k); + if (!n) + return log_oom(); + + /* Also log about this briefly. We do so at LOG_NOTICE level, as we fixed up the situation automatically, hence + * there's no immediate need for action by the user. However, in the interest of making things less confusing + * to the user, let's still inform the user that these snippets should really be updated. 
*/ + log_syntax(NULL, LOG_NOTICE, fname, line, 0, + "Line references path below legacy directory /var/run/, updating %s → %s; please update the tmpfiles.d/ drop-in file accordingly.", + *path, n); + + free_and_replace(*path, n); + + return 0; +} + +static int find_uid(const char *user, uid_t *ret_uid, Hashmap **cache) { + int r; + + assert(user); + assert(ret_uid); + + /* First: parse as numeric UID string */ + r = parse_uid(user, ret_uid); + if (r >= 0) + return r; + + /* Second: pass to NSS if we are running "online" */ + if (!arg_root) + return get_user_creds(&user, ret_uid, NULL, NULL, NULL, 0); + + /* Third, synthesize "root" unconditionally */ + if (streq(user, "root")) { + *ret_uid = 0; + return 0; + } + + /* Fourth: use fgetpwent() to read /etc/passwd directly, if we are "offline" */ + return name_to_uid_offline(arg_root, user, ret_uid, cache); +} + +static int find_gid(const char *group, gid_t *ret_gid, Hashmap **cache) { + int r; + + assert(group); + assert(ret_gid); + + /* First: parse as numeric GID string */ + r = parse_gid(group, ret_gid); + if (r >= 0) + return r; + + /* Second: pass to NSS if we are running "online" */ + if (!arg_root) + return get_group_creds(&group, ret_gid, 0); + + /* Third, synthesize "root" unconditionally */ + if (streq(group, "root")) { + *ret_gid = 0; + return 0; + } + + /* Fourth: use fgetgrent() to read /etc/group directly, if we are "offline" */ + return name_to_gid_offline(arg_root, group, ret_gid, cache); +} + +static int parse_age_by_from_arg(const char *age_by_str, Item *item) { + AgeBy ab_f = 0, ab_d = 0; + + static const struct { + char age_by_chr; + AgeBy age_by_flag; + } age_by_types[] = { + { 'a', AGE_BY_ATIME }, + { 'b', AGE_BY_BTIME }, + { 'c', AGE_BY_CTIME }, + { 'm', AGE_BY_MTIME }, + }; + + assert(age_by_str); + assert(item); + + if (isempty(age_by_str)) + return -EINVAL; + + for (const char *s = age_by_str; *s != 0; s++) { + size_t i; + + /* Ignore whitespace. 
*/ + if (strchr(WHITESPACE, *s)) + continue; + + for (i = 0; i < ELEMENTSOF(age_by_types); i++) { + /* Check lower-case for files, upper-case for directories. */ + if (*s == age_by_types[i].age_by_chr) { + ab_f |= age_by_types[i].age_by_flag; + break; + } else if (*s == ascii_toupper(age_by_types[i].age_by_chr)) { + ab_d |= age_by_types[i].age_by_flag; + break; + } + } + + /* Invalid character. */ + if (i >= ELEMENTSOF(age_by_types)) + return -EINVAL; + } + + /* No match. */ + if (ab_f == 0 && ab_d == 0) + return -EINVAL; + + item->age_by_file = ab_f > 0 ? ab_f : AGE_BY_DEFAULT_FILE; + item->age_by_dir = ab_d > 0 ? ab_d : AGE_BY_DEFAULT_DIR; + + return 0; +} + +static bool is_duplicated_item(ItemArray *existing, const Item *i) { + + assert(existing); + assert(i); + + for (size_t n = 0; n < existing->n_items; n++) { + const Item *e = existing->items + n; + + if (item_compatible(e, i)) + continue; + + /* Only multiple 'w+' lines for the same path are allowed. */ + if (e->type != WRITE_FILE || !e->append_or_force || + i->type != WRITE_FILE || !i->append_or_force) + return true; + } + + return false; +} + +static int parse_line( + Context *c, + const char *fname, + unsigned line, + const char *buffer, + bool *invalid_config, + Hashmap **uid_cache, + Hashmap **gid_cache) { + + _cleanup_free_ char *action = NULL, *mode = NULL, *user = NULL, *group = NULL, *age = NULL, *path = NULL; + _cleanup_(item_free_contents) Item i = { + /* The "age-by" argument considers all file timestamp types by default. 
*/ + .age_by_file = AGE_BY_DEFAULT_FILE, + .age_by_dir = AGE_BY_DEFAULT_DIR, + }; + ItemArray *existing; + OrderedHashmap *h; + int r, pos; + bool append_or_force = false, boot = false, allow_failure = false, try_replace = false, + unbase64 = false, from_cred = false, missing_user_or_group = false; + + assert(c); + assert(fname); + assert(line >= 1); + assert(buffer); + + /* Specifiers that get expanded in the path field and, unless base64 mode is enabled, in the argument. */ const Specifier specifier_table[] = { + { 'a', specifier_architecture, NULL }, + { 'b', specifier_boot_id, NULL }, + { 'B', specifier_os_build_id, NULL }, + { 'H', specifier_hostname, NULL }, + { 'l', specifier_short_hostname, NULL }, + { 'm', specifier_machine_id, NULL }, + { 'o', specifier_os_id, NULL }, + { 'v', specifier_kernel_release, NULL }, + { 'w', specifier_os_version_id, NULL }, + { 'W', specifier_os_variant_id, NULL }, + + { 'h', specifier_user_home, NULL }, + + { 'C', specifier_directory, UINT_TO_PTR(DIRECTORY_CACHE) }, + { 'L', specifier_directory, UINT_TO_PTR(DIRECTORY_LOGS) }, + { 'S', specifier_directory, UINT_TO_PTR(DIRECTORY_STATE) }, + { 't', specifier_directory, UINT_TO_PTR(DIRECTORY_RUNTIME) }, + + COMMON_CREDS_SPECIFIERS(arg_runtime_scope), + COMMON_TMP_SPECIFIERS, + {} + }; + + /* Split off the action, path, mode, user, group and age fields; what then remains in 'buffer' is used as the argument. */ r = extract_many_words( + &buffer, + NULL, + EXTRACT_UNQUOTE | EXTRACT_CUNESCAPE, + &action, + &path, + &mode, + &user, + &group, + &age, + NULL); + if (r < 0) { + if (IN_SET(r, -EINVAL, -EBADSLT)) + /* invalid quoting and such or an unknown specifier */ + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to parse line: %m"); + } else if (r < 2) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "Syntax error."); + } + + if (!empty_or_dash(buffer)) { + i.argument = strdup(buffer); + if (!i.argument) + return log_oom(); + } + + if (isempty(action)) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "Command too short '%s'.", action); + } + + /* Everything after the first character is a modifier; each one is accepted at most once. */ for (pos = 1; action[pos];
pos++) { + if (action[pos] == '!' && !boot) + boot = true; + else if (action[pos] == '+' && !append_or_force) + append_or_force = true; + else if (action[pos] == '-' && !allow_failure) + allow_failure = true; + else if (action[pos] == '=' && !try_replace) + try_replace = true; + else if (action[pos] == '~' && !unbase64) + unbase64 = true; + else if (action[pos] == '^' && !from_cred) + from_cred = true; + else { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "Unknown modifiers in command '%s'", action); + } + } + + if (boot && !arg_boot) { + log_syntax(NULL, LOG_DEBUG, fname, line, 0, "Ignoring entry %s \"%s\" because --boot is not specified.", action, path); + return 0; + } + + /* The first character of the action is the item type. */ i.type = action[0]; + i.append_or_force = append_or_force; + i.allow_failure = allow_failure; + i.try_replace = try_replace; + + r = specifier_printf(path, PATH_MAX-1, specifier_table, arg_root, NULL, &i.path); + if (ERRNO_IS_NOINFO(r)) + return log_unresolvable_specifier(fname, line); + if (r < 0) { + if (IN_SET(r, -EINVAL, -EBADSLT)) + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to replace specifiers in '%s': %m", path); + } + + r = patch_var_run(fname, line, &i.path); + if (r < 0) + return r; + + if (!path_is_absolute(i.path)) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), + "Path '%s' not absolute.", i.path); + } + + path_simplify(i.path); + + /* Per-type validation of the argument field. */ switch (i.type) { + + case CREATE_DIRECTORY: + case CREATE_SUBVOLUME: + case CREATE_SUBVOLUME_INHERIT_QUOTA: + case CREATE_SUBVOLUME_NEW_QUOTA: + case EMPTY_DIRECTORY: + case TRUNCATE_DIRECTORY: + case CREATE_FIFO: + case IGNORE_PATH: + case IGNORE_DIRECTORY_PATH: + case REMOVE_PATH: + case RECURSIVE_REMOVE_PATH: + case ADJUST_MODE: + case RELABEL_PATH: + case RECURSIVE_RELABEL_PATH: + if (i.argument) + log_syntax(NULL, + LOG_WARNING, + fname, + line, + 0, + "%c lines don't take argument fields,
ignoring.", + (char) i.type); + + break; + + case CREATE_FILE: + case TRUNCATE_FILE: + break; + + case CREATE_SYMLINK: + if (unbase64) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "base64 decoding not supported for symlink targets."); + } + break; + + case WRITE_FILE: + if (!i.argument) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "Write file requires argument."); + } + break; + + case COPY_FILES: + if (unbase64) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "base64 decoding not supported for copy sources."); + } + break; + + case CREATE_CHAR_DEVICE: + case CREATE_BLOCK_DEVICE: + if (unbase64) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "base64 decoding not supported for device node creation."); + } + + if (!i.argument) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "Device file requires argument."); + } + + r = parse_devnum(i.argument, &i.major_minor); + if (r < 0) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Can't parse device file major/minor '%s'.", i.argument); + } + + break; + + case SET_XATTR: + case RECURSIVE_SET_XATTR: + if (unbase64) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "base64 decoding not supported for extended attributes."); + } + if (!i.argument) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), + "Set extended attribute requires argument."); + } + r = parse_xattrs_from_arg(&i); + if (r < 0) + return r; + break; + + case SET_ACL: + case RECURSIVE_SET_ACL: + if (unbase64) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "base64 decoding not supported for ACLs."); + } + if
(!i.argument) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), + "Set ACLs requires argument."); + } + r = parse_acls_from_arg(&i); + if (r < 0) + return r; + break; + + case SET_ATTRIBUTE: + case RECURSIVE_SET_ATTRIBUTE: + if (unbase64) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "base64 decoding not supported for file attributes."); + } + if (!i.argument) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), + "Set file attribute requires argument."); + } + r = parse_attribute_from_arg(&i); + if (IN_SET(r, -EINVAL, -EBADSLT)) + *invalid_config = true; + if (r < 0) + return r; + break; + + default: + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), + "Unknown command type '%c'.", (char) i.type); + } + + /* Entries rejected by should_include_path() are dropped without reporting an error. */ if (!should_include_path(i.path)) + return 0; + + if (!unbase64) { + /* Do specifier expansion except if base64 mode is enabled */ + r = specifier_expansion_from_arg(specifier_table, &i); + if (ERRNO_IS_NOINFO(r)) + return log_unresolvable_specifier(fname, line); + if (r < 0) { + if (IN_SET(r, -EINVAL, -EBADSLT)) + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to substitute specifiers in argument: %m"); + } + } + + /* Fill in the default source below /usr/share/factory for symlinks and copies without argument. */ switch (i.type) { + case CREATE_SYMLINK: + if (!i.argument) { + i.argument = path_join("/usr/share/factory", i.path); + if (!i.argument) + return log_oom(); + } + break; + + case COPY_FILES: + if (!i.argument) { + i.argument = path_join("/usr/share/factory", i.path); + if (!i.argument) + return log_oom(); + } else if (!path_is_absolute(i.argument)) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EBADMSG), "Source path '%s' is not absolute.", i.argument); + + } + + if (!empty_or_root(arg_root)) { + char *p; + + p = path_join(arg_root, i.argument); + if (!p) + return
log_oom(); + free_and_replace(i.argument, p); + } + + path_simplify(i.argument); + + if (laccess(i.argument, F_OK) == -ENOENT) { + /* Silently skip over lines where the source file is missing. */ + log_syntax(NULL, LOG_DEBUG, fname, line, 0, "Copy source path '%s' does not exist, skipping line.", i.argument); + return 0; + } + + break; + + default: + break; + } + + /* from_cred is set by the '^' modifier: the argument then names a credential to read the data from. */ if (from_cred) { + if (!i.argument) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), "Reading from credential requested, but no credential name specified."); + if (!credential_name_valid(i.argument)) + return log_syntax(NULL, LOG_ERR, fname, line, SYNTHETIC_ERRNO(EINVAL), "Credential name not valid: %s", i.argument); + + r = read_credential(i.argument, &i.binary_argument, &i.binary_argument_size); + if (IN_SET(r, -ENXIO, -ENOENT)) { + /* Silently skip over lines that have no credentials passed */ + log_syntax(NULL, LOG_DEBUG, fname, line, 0, + "Credential '%s' not specified, skipping line.", i.argument); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to read credential '%s': %m", i.argument); + } + + /* If base64 decoding is requested, do so now */ + if (unbase64 && item_binary_argument(&i)) { + _cleanup_free_ void *data = NULL; + size_t data_size = 0; + + r = unbase64mem(item_binary_argument(&i), item_binary_argument_size(&i), &data, &data_size); + if (r < 0) + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to base64 decode specified argument '%s': %m", i.argument); + + free_and_replace(i.binary_argument, data); + i.binary_argument_size = data_size; + } + + /* When operating on an alternate root, prefix the item path with it. */ if (!empty_or_root(arg_root)) { + char *p; + + p = path_join(arg_root, i.path); + if (!p) + return log_oom(); + free_and_replace(i.path, p); + } + + if (!empty_or_dash(user)) { + const char *u; + + u = startswith(user, ":"); + if (u) + i.uid_only_create = true; + else + u = user; + + r = find_uid(u, &i.uid, uid_cache); + if (r == -ESRCH && arg_graceful) { + log_syntax(NULL, LOG_DEBUG, fname, line,
r, + "%s: user '%s' not found, not adjusting ownership.", i.path, u); + missing_user_or_group = true; + } else if (r < 0) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to resolve user '%s': %m", u); + } else + i.uid_set = true; + } + + if (!empty_or_dash(group)) { + const char *g; + + g = startswith(group, ":"); + if (g) + i.gid_only_create = true; + else + g = group; + + r = find_gid(g, &i.gid, gid_cache); + if (r == -ESRCH && arg_graceful) { + log_syntax(NULL, LOG_DEBUG, fname, line, r, + "%s: group '%s' not found, not adjusting ownership.", i.path, g); + missing_user_or_group = true; + } else if (r < 0) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Failed to resolve group '%s': %m", g); + } else + i.gid_set = true; + } + + /* Leading '~' and ':' characters in the mode field set mask_perms and mode_only_create. */ if (!empty_or_dash(mode)) { + const char *mm; + unsigned m; + + for (mm = mode;; mm++) { + if (*mm == '~') + i.mask_perms = true; + else if (*mm == ':') + i.mode_only_create = true; + else + break; + } + + r = parse_mode(mm, &m); + if (r < 0) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Invalid mode '%s'.", mode); + } + + i.mode = m; + i.mode_set = true; + } else + i.mode = IN_SET(i.type, + CREATE_DIRECTORY, + TRUNCATE_DIRECTORY, + CREATE_SUBVOLUME, + CREATE_SUBVOLUME_INHERIT_QUOTA, + CREATE_SUBVOLUME_NEW_QUOTA) ? 0755 : 0644; + + if (missing_user_or_group && (i.mode & ~0777) != 0) { + /* Refuse any special bits for nodes where we couldn't resolve the ownership properly. */ + mode_t adjusted = i.mode & 0777; + log_syntax(NULL, LOG_INFO, fname, line, 0, + "Changing mode 0%o to 0%o because of changed ownership.", i.mode, adjusted); + i.mode = adjusted; + } + + if (!empty_or_dash(age)) { + const char *a = age; + _cleanup_free_ char *seconds = NULL, *age_by = NULL; + + if (*a == '~') { + i.keep_first_level = true; + a++; + } + + /* Format: "age-by:age"; where age-by is "[abcmABCM]+".
*/ + r = split_pair(a, ":", &age_by, &seconds); + if (r == -ENOMEM) + return log_oom(); + if (r < 0 && r != -EINVAL) + return log_error_errno(r, "Failed to parse age-by for '%s': %m", age); + if (r >= 0) { + /* We found a ":", parse the "age-by" part. */ + r = parse_age_by_from_arg(age_by, &i); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Invalid age-by '%s'.", age_by); + } + + /* For parsing the "age" part, after the ":". */ + a = seconds; + } + + r = parse_sec(a, &i.age); + if (r < 0) { + *invalid_config = true; + return log_syntax(NULL, LOG_ERR, fname, line, r, "Invalid age '%s'.", a); + } + + i.age_set = true; + } + + h = needs_glob(i.type) ? c->globs : c->items; + + existing = ordered_hashmap_get(h, i.path); + if (existing) { + if (is_duplicated_item(existing, &i)) { + log_syntax(NULL, LOG_NOTICE, fname, line, 0, + "Duplicate line for path \"%s\", ignoring.", i.path); + return 0; + } + } else { + existing = new0(ItemArray, 1); + if (!existing) + return log_oom(); + + r = ordered_hashmap_put(h, i.path, existing); + if (r < 0) { + free(existing); + return log_oom(); + } + } + + if (!GREEDY_REALLOC(existing->items, existing->n_items + 1)) + return log_oom(); + + existing->items[existing->n_items++] = TAKE_STRUCT(i); + + /* Sort item array, to enforce stable ordering of application */ + typesafe_qsort(existing->items, existing->n_items, item_compare); + + return 0; +} + +static int cat_config(char **config_dirs, char **args) { + _cleanup_strv_free_ char **files = NULL; + int r; + + r = conf_files_list_with_replacement(arg_root, config_dirs, arg_replace, &files, NULL); + if (r < 0) + return r; + + pager_open(arg_pager_flags); + + return cat_files(NULL, files, arg_cat_flags); +} + +static int exclude_default_prefixes(void) { + int r; + + /* Provide an easy way to exclude virtual/memory file systems from what we do here. 
Useful in + * combination with --root= where we probably don't want to apply stuff to these dirs as they are + * likely over-mounted if the root directory is actually used, and it would be less than ideal to have + * all kinds of files created/adjusted underneath these mount points. */ + + r = strv_extend_strv( + &arg_exclude_prefixes, + STRV_MAKE("/dev", + "/proc", + "/run", + "/sys"), + true); + if (r < 0) + return log_oom(); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-tmpfiles", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [CONFIGURATION FILE...]\n" + "\n%sCreates, deletes and cleans up volatile and temporary files and directories.%s\n\n" + " -h --help Show this help\n" + " --user Execute user configuration\n" + " --version Show package version\n" + " --cat-config Show configuration files\n" + " --tldr Show non-comment parts of configuration\n" + " --create Create marked files/directories\n" + " --clean Clean up marked directories\n" + " --remove Remove marked files/directories\n" + " --boot Execute actions only safe at boot\n" + " --graceful Quietly ignore unknown users or groups\n" + " --prefix=PATH Only apply rules with the specified prefix\n" + " --exclude-prefix=PATH Ignore rules with the specified prefix\n" + " -E Ignore rules prefixed with /dev, /proc, /run, /sys\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --image=PATH Operate on disk image as filesystem root\n" + " --image-policy=POLICY Specify disk image dissection policy\n" + " --replace=PATH Treat arguments as replacement for PATH\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_CAT_CONFIG, + ARG_TLDR, + ARG_USER, + ARG_CREATE, + ARG_CLEAN, + 
ARG_REMOVE, + ARG_BOOT, + ARG_GRACEFUL, + ARG_PREFIX, + ARG_EXCLUDE_PREFIX, + ARG_ROOT, + ARG_IMAGE, + ARG_IMAGE_POLICY, + ARG_REPLACE, + ARG_NO_PAGER, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "user", no_argument, NULL, ARG_USER }, + { "version", no_argument, NULL, ARG_VERSION }, + { "cat-config", no_argument, NULL, ARG_CAT_CONFIG }, + { "tldr", no_argument, NULL, ARG_TLDR }, + { "create", no_argument, NULL, ARG_CREATE }, + { "clean", no_argument, NULL, ARG_CLEAN }, + { "remove", no_argument, NULL, ARG_REMOVE }, + { "boot", no_argument, NULL, ARG_BOOT }, + { "graceful", no_argument, NULL, ARG_GRACEFUL }, + { "prefix", required_argument, NULL, ARG_PREFIX }, + { "exclude-prefix", required_argument, NULL, ARG_EXCLUDE_PREFIX }, + { "root", required_argument, NULL, ARG_ROOT }, + { "image", required_argument, NULL, ARG_IMAGE }, + { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "replace", required_argument, NULL, ARG_REPLACE }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hE", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_CAT_CONFIG: + arg_cat_flags = CAT_CONFIG_ON; + break; + + case ARG_TLDR: + arg_cat_flags = CAT_TLDR; + break; + + case ARG_USER: + arg_runtime_scope = RUNTIME_SCOPE_USER; + break; + + case ARG_CREATE: + arg_operation |= OPERATION_CREATE; + break; + + case ARG_CLEAN: + arg_operation |= OPERATION_CLEAN; + break; + + case ARG_REMOVE: + arg_operation |= OPERATION_REMOVE; + break; + + case ARG_BOOT: + arg_boot = true; + break; + + case ARG_GRACEFUL: + arg_graceful = true; + break; + + case ARG_PREFIX: + if (strv_push(&arg_include_prefixes, optarg) < 0) + return log_oom(); + break; + + case ARG_EXCLUDE_PREFIX: + if (strv_push(&arg_exclude_prefixes, optarg) < 0) + return log_oom(); + break; + + case 
ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_root); + if (r < 0) + return r; + break; + + case ARG_IMAGE: +#ifdef STANDALONE + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "This systemd-tmpfiles version is compiled without support for --image=."); +#else + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; +#endif + /* Imply -E here since it makes little sense to create files persistently in the /run mountpoint of a disk image */ + _fallthrough_; + + case 'E': + r = exclude_default_prefixes(); + if (r < 0) + return r; + + break; + + case ARG_IMAGE_POLICY: + r = parse_image_policy_argument(optarg, &arg_image_policy); + if (r < 0) + return r; + break; + + case ARG_REPLACE: + if (!path_is_absolute(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "The argument to --replace= must be an absolute path."); + if (!endswith(optarg, ".conf")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "The argument to --replace= must have the extension '.conf'."); + + arg_replace = optarg; + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_operation == 0 && arg_cat_flags == CAT_CONFIG_OFF) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "You need to specify at least one of --clean, --create, or --remove."); + + if (arg_replace && arg_cat_flags != CAT_CONFIG_OFF) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Option --replace= is not supported with --cat-config/--tldr."); + + if (arg_replace && optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "When --replace= is given, some configuration items must be specified."); + + if (arg_root && arg_runtime_scope == RUNTIME_SCOPE_USER) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Combination of --user and --root= is not supported."); + + if (arg_image && arg_root) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Please specify either --root= or --image=, the combination of both is not supported."); + + return 1; +} + +static int read_config_file( + Context *c, + char **config_dirs, + const char *fn, + bool ignore_enoent, + bool *invalid_config) { + + _cleanup_hashmap_free_ Hashmap *uid_cache = NULL, *gid_cache = NULL; + _cleanup_fclose_ FILE *_f = NULL; + _cleanup_free_ char *pp = NULL; + unsigned v = 0; + FILE *f; + ItemArray *ia; + int r = 0; + + assert(c); + assert(fn); + + if (streq(fn, "-")) { + log_debug("Reading config from stdin%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + fn = ""; + f = stdin; + } else { + r = search_and_fopen(fn, "re", arg_root, (const char**) config_dirs, &_f, &pp); + if (r < 0) { + if (ignore_enoent && r == -ENOENT) { + log_debug_errno(r, "Failed to open \"%s\", ignoring: %m", fn); + return 0; + } + + return log_error_errno(r, "Failed to open '%s': %m", fn); + } + + log_debug("Reading config file \"%s\"%s", pp, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + fn = pp; + f = _f; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + bool invalid_line = false; + int k; + + k = read_stripped_line(f, LONG_LINE_MAX, &line); + if (k < 0) + return log_error_errno(k, "Failed to read '%s': %m", fn); + if (k == 0) + break; + + v++; + + if (IN_SET(line[0], 0, '#')) + continue; + + k = parse_line(c, fn, v, line, &invalid_line, &uid_cache, &gid_cache); + if (k < 0) { + if (invalid_line) + /* Allow reporting with a special code if the caller requested this */ + *invalid_config = true; + else if (r == 0) + /* The first error becomes our return value */ + r = k; + } + } + + /* we have to determine age parameter for each entry of type X */ + ORDERED_HASHMAP_FOREACH(ia, c->globs) + for (size_t ni = 0; ni < ia->n_items; ni++) { + ItemArray *ja; + Item *i = ia->items + ni, *candidate_item = NULL; + + if (i->type != IGNORE_DIRECTORY_PATH) + continue; + + ORDERED_HASHMAP_FOREACH(ja, c->items) + for (size_t nj = 0; nj < 
ja->n_items; nj++) { + Item *j = ja->items + nj; + + if (!IN_SET(j->type, CREATE_DIRECTORY, + TRUNCATE_DIRECTORY, + CREATE_SUBVOLUME, + CREATE_SUBVOLUME_INHERIT_QUOTA, + CREATE_SUBVOLUME_NEW_QUOTA)) + continue; + + if (path_equal(j->path, i->path)) { + candidate_item = j; + break; + } + + if (candidate_item + ? (path_startswith(j->path, candidate_item->path) && fnmatch(i->path, j->path, FNM_PATHNAME | FNM_PERIOD) == 0) + : path_startswith(i->path, j->path) != NULL) + candidate_item = j; + } + + if (candidate_item && candidate_item->age_set) { + i->age = candidate_item->age; + i->age_set = true; + } + } + + if (ferror(f)) { + log_error_errno(errno, "Failed to read from file %s: %m", fn); + if (r == 0) + r = -EIO; + } + + return r; +} + +static int parse_arguments( + Context *c, + char **config_dirs, + char **args, + bool *invalid_config) { + int r; + + assert(c); + + STRV_FOREACH(arg, args) { + r = read_config_file(c, config_dirs, *arg, false, invalid_config); + if (r < 0) + return r; + } + + return 0; +} + +static int read_config_files( + Context *c, + char **config_dirs, + char **args, + bool *invalid_config) { + + _cleanup_strv_free_ char **files = NULL; + _cleanup_free_ char *p = NULL; + int r; + + assert(c); + + r = conf_files_list_with_replacement(arg_root, config_dirs, arg_replace, &files, &p); + if (r < 0) + return r; + + STRV_FOREACH(f, files) + if (p && path_equal(*f, p)) { + log_debug("Parsing arguments at position \"%s\"%s", *f, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = parse_arguments(c, config_dirs, args, invalid_config); + if (r < 0) + return r; + } else + /* Just warn, ignore result otherwise. + * read_config_file() has some debug output, so no need to print anything. 
*/ + (void) read_config_file(c, config_dirs, *f, true, invalid_config); + + return 0; +} + +static int read_credential_lines(Context *c, bool *invalid_config) { + + _cleanup_free_ char *j = NULL; + const char *d; + int r; + + assert(c); + + r = get_credentials_dir(&d); + if (r == -ENXIO) + return 0; + if (r < 0) + return log_error_errno(r, "Failed to get credentials directory: %m"); + + j = path_join(d, "tmpfiles.extra"); + if (!j) + return log_oom(); + + (void) read_config_file(c, /* config_dirs= */ NULL, j, /* ignore_enoent= */ true, invalid_config); + return 0; +} + +static int link_parent(Context *c, ItemArray *a) { + const char *path; + char *prefix; + int r; + + assert(c); + assert(a); + + /* Finds the closest "parent" item array for the specified item array. Then registers the specified item array + * as child of it, and fills the parent in, linking them both ways. This allows us to later create parents + * before their children, and clean up/remove children before their parents. 
*/ + + if (a->n_items <= 0) + return 0; + + path = a->items[0].path; + prefix = newa(char, strlen(path) + 1); + PATH_FOREACH_PREFIX(prefix, path) { + ItemArray *j; + + j = ordered_hashmap_get(c->items, prefix); + if (!j) + j = ordered_hashmap_get(c->globs, prefix); + if (j) { + r = set_ensure_put(&j->children, NULL, a); + if (r < 0) + return log_oom(); + + a->parent = j; + return 1; + } + } + + return 0; +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(item_array_hash_ops, char, string_hash_func, string_compare_func, + ItemArray, item_array_free); + +static int run(int argc, char *argv[]) { +#ifndef STANDALONE + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(umount_and_freep) char *mounted_dir = NULL; +#endif + _cleanup_strv_free_ char **config_dirs = NULL; + _cleanup_(context_done) Context c = {}; + bool invalid_config = false; + ItemArray *a; + enum { + PHASE_REMOVE_AND_CLEAN, + PHASE_CREATE, + _PHASE_MAX + } phase; + int r, k; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + log_setup(); + + /* We require /proc/ for a lot of our operations, i.e. for adjusting access modes, for anything + * SELinux related, for recursive operation, for xattr, acl and chattr handling, for btrfs stuff and + * a lot more. It's probably the majority of invocations where /proc/ is required. Since people + * apparently invoke it without anyway and are surprised about the failures, let's catch this early + * and output a nice and friendly warning. */ + if (proc_mounted() == 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOSYS), + "/proc/ is not mounted, but required for successful operation of systemd-tmpfiles. " + "Please mount /proc/. 
Alternatively, consider using the --root= or --image= switches."); + + /* Descending down file system trees might take a lot of fds */ + (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE); + + switch (arg_runtime_scope) { + + case RUNTIME_SCOPE_USER: + r = user_config_paths(&config_dirs); + if (r < 0) + return log_error_errno(r, "Failed to initialize configuration directory list: %m"); + break; + + case RUNTIME_SCOPE_SYSTEM: + config_dirs = strv_split_nulstr(CONF_PATHS_NULSTR("tmpfiles.d")); + if (!config_dirs) + return log_oom(); + break; + + default: + assert_not_reached(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *t = NULL; + + STRV_FOREACH(i, config_dirs) { + _cleanup_free_ char *j = NULL; + + j = path_join(arg_root, *i); + if (!j) + return log_oom(); + + if (!strextend(&t, "\n\t", j)) + return log_oom(); + } + + log_debug("Looking for configuration files in (higher priority first):%s", t); + } + + if (arg_cat_flags != CAT_CONFIG_OFF) + return cat_config(config_dirs, argv + optind); + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + +#ifndef STANDALONE + if (arg_image) { + assert(!arg_root); + + r = mount_image_privately_interactively( + arg_image, + arg_image_policy, + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_VALIDATE_OS | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_FSCK | + DISSECT_IMAGE_GROWFS, + &mounted_dir, + /* ret_dir_fd= */ NULL, + &loop_device); + if (r < 0) + return r; + + arg_root = strdup(mounted_dir); + if (!arg_root) + return log_oom(); + } +#else + assert(!arg_image); +#endif + + c.items = ordered_hashmap_new(&item_array_hash_ops); + c.globs = ordered_hashmap_new(&item_array_hash_ops); + if (!c.items || !c.globs) + return log_oom(); + + /* If command line arguments are specified along with --replace, read all + * configuration files and insert the positional arguments at the specified + * place. 
Otherwise, if command line arguments are specified, execute just + * them, and finally, without --replace= or any positional arguments, just + * read configuration and execute it. + */ + if (arg_replace || optind >= argc) + r = read_config_files(&c, config_dirs, argv + optind, &invalid_config); + else + r = parse_arguments(&c, config_dirs, argv + optind, &invalid_config); + if (r < 0) + return r; + + r = read_credential_lines(&c, &invalid_config); + if (r < 0) + return r; + + /* Let's now link up all child/parent relationships */ + ORDERED_HASHMAP_FOREACH(a, c.items) { + r = link_parent(&c, a); + if (r < 0) + return r; + } + ORDERED_HASHMAP_FOREACH(a, c.globs) { + r = link_parent(&c, a); + if (r < 0) + return r; + } + + /* If multiple operations are requested, let's first run the remove/clean operations, and only then the create + * operations. i.e. that we first clean out the platform we then build on. */ + for (phase = 0; phase < _PHASE_MAX; phase++) { + OperationMask op; + + if (phase == PHASE_REMOVE_AND_CLEAN) + op = arg_operation & (OPERATION_REMOVE|OPERATION_CLEAN); + else if (phase == PHASE_CREATE) + op = arg_operation & OPERATION_CREATE; + else + assert_not_reached(); + + if (op == 0) /* Nothing requested in this phase */ + continue; + + /* The non-globbing ones usually create things, hence we apply them first */ + ORDERED_HASHMAP_FOREACH(a, c.items) { + k = process_item_array(&c, a, op); + if (k < 0 && r >= 0) + r = k; + } + + /* The globbing ones usually alter things, hence we apply them second. 
*/ + ORDERED_HASHMAP_FOREACH(a, c.globs) { + k = process_item_array(&c, a, op); + if (k < 0 && r >= 0) + r = k; + } + } + + if (ERRNO_IS_RESOURCE(r)) + return r; + if (invalid_config) + return EX_DATAERR; + if (r < 0) + return EX_CANTCREAT; + return 0; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/tpm2-setup/meson.build b/src/tpm2-setup/meson.build new file mode 100644 index 0000000..c85721c --- /dev/null +++ b/src/tpm2-setup/meson.build @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-tpm2-setup', + 'sources' : files('tpm2-setup.c'), + 'conditions' : [ + 'ENABLE_BOOTLOADER', + 'HAVE_OPENSSL', + 'HAVE_TPM2', + ], + 'dependencies' : [ + libopenssl, + ], + }, +] diff --git a/src/tpm2-setup/tpm2-setup.c b/src/tpm2-setup/tpm2-setup.c new file mode 100644 index 0000000..0be7ffc --- /dev/null +++ b/src/tpm2-setup/tpm2-setup.c @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "build.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "main-func.h" +#include "mkdir.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "terminal-util.h" +#include "tmpfile-util.h" +#include "tpm2-util.h" + +static char *arg_tpm2_device = NULL; +static bool arg_early = false; + +STATIC_DESTRUCTOR_REGISTER(arg_tpm2_device, freep); + +#define TPM2_SRK_PEM_PERSISTENT_PATH "/var/lib/systemd/tpm2-srk-public-key.pem" +#define TPM2_SRK_PEM_RUNTIME_PATH "/run/systemd/tpm2-srk-public-key.pem" + +#define TPM2_SRK_TPM2B_PUBLIC_PERSISTENT_PATH "/var/lib/systemd/tpm2-srk-public-key.tpm2b_public" +#define TPM2_SRK_TPM2B_PUBLIC_RUNTIME_PATH "/run/systemd/tpm2-srk-public-key.tpm2b_public" + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-tpm2-setup", "8", &link); + if (r < 0) + return log_oom(); + + 
printf("%1$s [OPTIONS...]\n" + "\n%5$sSet up the TPM2 Storage Root Key (SRK).%6$s\n" + "\n%3$sOptions:%4$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --tpm2-device=PATH\n" + " Pick TPM2 device\n" + " --early=BOOL Store SRK public key in /run/ rather than /var/lib/\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_TPM2_DEVICE, + ARG_EARLY, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "tpm2-device", required_argument, NULL, ARG_TPM2_DEVICE }, + { "early", required_argument, NULL, ARG_EARLY }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_TPM2_DEVICE: + if (streq(optarg, "list")) + return tpm2_list_devices(); + + if (free_and_strdup(&arg_tpm2_device, streq(optarg, "auto") ? 
NULL : optarg) < 0) + return log_oom(); + + break; + + case ARG_EARLY: + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --early= argument: %s", optarg); + + arg_early = r; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program expects no argument."); + + return 1; +} + +struct public_key_data { + EVP_PKEY *pkey; /* as OpenSSL object */ + TPM2B_PUBLIC *public; /* in TPM2 format */ + void *fingerprint; + size_t fingerprint_size; + char *fingerprint_hex; + char *path; +}; + +static void public_key_data_done(struct public_key_data *d) { + assert(d); + + if (d->pkey) { + EVP_PKEY_free(d->pkey); + d->pkey = NULL; + } + if (d->public) { + Esys_Freep(&d->public); + d->public = NULL; + } + d->fingerprint = mfree(d->fingerprint); + d->fingerprint_size = 0; + d->fingerprint_hex = mfree(d->fingerprint_hex); + d->path = mfree(d->path); +} + +static int public_key_make_fingerprint(struct public_key_data *d) { + int r; + + assert(d); + assert(d->pkey); + assert(!d->fingerprint); + assert(!d->fingerprint_hex); + + r = pubkey_fingerprint(d->pkey, EVP_sha256(), &d->fingerprint, &d->fingerprint_size); + if (r < 0) + return log_error_errno(r, "Failed to calculate fingerprint of public key: %m"); + + d->fingerprint_hex = hexmem(d->fingerprint, d->fingerprint_size); + if (!d->fingerprint_hex) + return log_oom(); + + return 0; +} + +static int load_public_key_disk(const char *path, struct public_key_data *ret) { + _cleanup_(public_key_data_done) struct public_key_data data = {}; + _cleanup_free_ char *blob = NULL; + size_t blob_size; + int r; + + assert(path); + assert(ret); + + r = read_full_file(path, &blob, &blob_size); + if (r < 0) { + if (r != -ENOENT) + return log_error_errno(r, "Failed to read '%s': %m", path); + + log_debug("SRK public key file '%s' does not exist.", path); + } else { + log_debug("Loaded SRK public key from 
'%s'.", path); + + r = openssl_pkey_from_pem(blob, blob_size, &data.pkey); + if (r < 0) + return log_error_errno(r, "Failed to parse SRK public key file '%s': %m", path); + + r = public_key_make_fingerprint(&data); + if (r < 0) + return r; + + log_debug("Loaded SRK public key fingerprint: %s", data.fingerprint_hex); + } + + data.path = strdup(path); + if (!data.path) + return log_oom(); + + *ret = data; + data = (struct public_key_data) {}; + + return 0; +} + +static int load_public_key_tpm2(struct public_key_data *ret) { + _cleanup_(public_key_data_done) struct public_key_data data = {}; + _cleanup_(tpm2_context_unrefp) Tpm2Context *c = NULL; + int r; + + assert(ret); + + r = tpm2_context_new(arg_tpm2_device, &c); + if (r < 0) + return log_error_errno(r, "Failed to create TPM2 context: %m"); + + r = tpm2_get_or_create_srk( + c, + /* session= */ NULL, + &data.public, + /* ret_name= */ NULL, + /* ret_qname= */ NULL, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to get or create SRK: %m"); + if (r > 0) + log_info("New SRK generated and stored in the TPM."); + else + log_info("SRK already stored in the TPM."); + + r = tpm2_tpm2b_public_to_openssl_pkey(data.public, &data.pkey); + if (r < 0) + return log_error_errno(r, "Failed to convert TPM2 SRK public key to OpenSSL public key: %m"); + + r = public_key_make_fingerprint(&data); + if (r < 0) + return r; + + log_info("SRK fingerprint is %s.", data.fingerprint_hex); + + *ret = data; + data = (struct public_key_data) {}; + + return 0; +} + +static int run(int argc, char *argv[]) { + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + umask(0022); + + _cleanup_(public_key_data_done) struct public_key_data runtime_key = {}, persistent_key = {}, tpm2_key = {}; + + r = load_public_key_disk(TPM2_SRK_PEM_RUNTIME_PATH, &runtime_key); + if (r < 0) + return r; + + if (!arg_early) { + r = load_public_key_disk(TPM2_SRK_PEM_PERSISTENT_PATH, &persistent_key); + if (r < 0) + return r; + + if 
(runtime_key.pkey && persistent_key.pkey && + memcmp_nn(runtime_key.fingerprint, runtime_key.fingerprint_size, + persistent_key.fingerprint, persistent_key.fingerprint_size) != 0) { + + /* One of those days we might want to add a stricter policy option here, that refuses + * to boot when the SRK changes. For now, let's just warn and proceed, in order not + * to break OS images that are moved around PCs. */ + + log_notice("Saved persistent SRK (%s) and runtime SRK differ (fingerprint %s vs. %s), updating persistent SRK.", + persistent_key.path, persistent_key.fingerprint_hex, runtime_key.fingerprint_hex); + + public_key_data_done(&persistent_key); + } + } + + r = load_public_key_tpm2(&tpm2_key); + if (r < 0) + return r; + + assert(tpm2_key.pkey); + + if (runtime_key.pkey) { + if (memcmp_nn(tpm2_key.fingerprint, tpm2_key.fingerprint_size, + runtime_key.fingerprint, runtime_key.fingerprint_size) != 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), + "Saved runtime SRK differs from TPM SRK, refusing."); + + if (arg_early) { + log_info("SRK saved in '%s' matches SRK in TPM2.", runtime_key.path); + return 0; + } + } + + if (persistent_key.pkey) { + if (memcmp_nn(tpm2_key.fingerprint, tpm2_key.fingerprint_size, + persistent_key.fingerprint, persistent_key.fingerprint_size) == 0) { + log_info("SRK saved in '%s' matches SRK in TPM2.", persistent_key.path); + return 0; + } + + /* As above, we probably want a stricter policy option here, one day. */ + + log_notice("Saved persistent SRK (%s) and TPM SRK differ (fingerprint %s vs. %s), updating persistent SRK.", + persistent_key.path, persistent_key.fingerprint_hex, tpm2_key.fingerprint_hex); + + public_key_data_done(&persistent_key); + } + + const char *pem_path = arg_early ? 
TPM2_SRK_PEM_RUNTIME_PATH : TPM2_SRK_PEM_PERSISTENT_PATH; + (void) mkdir_parents(pem_path, 0755); + + /* Write out public key (note that we only do that as a help to the user, we don't make use of this ever */ + _cleanup_(unlink_and_freep) char *t = NULL; + _cleanup_fclose_ FILE *f = NULL; + r = fopen_tmpfile_linkable(pem_path, O_WRONLY, &t, &f); + if (r < 0) + return log_error_errno(r, "Failed to open SRK public key file '%s' for writing: %m", pem_path); + + if (PEM_write_PUBKEY(f, tpm2_key.pkey) <= 0) + return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to write SRK public key file '%s'.", pem_path); + + if (fchmod(fileno(f), 0444) < 0) + return log_error_errno(errno, "Failed to adjust access mode of SRK public key file '%s' to 0444: %m", pem_path); + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to sync SRK key to disk: %m"); + + r = flink_tmpfile(f, t, pem_path, LINK_TMPFILE_SYNC|LINK_TMPFILE_REPLACE); + if (r < 0) + return log_error_errno(r, "Failed to move SRK public key file to '%s': %m", pem_path); + + f = safe_fclose(f); + t = mfree(t); + + log_info("SRK public key saved to '%s' in PEM format.", pem_path); + + const char *tpm2b_public_path = arg_early ? 
TPM2_SRK_TPM2B_PUBLIC_RUNTIME_PATH : TPM2_SRK_TPM2B_PUBLIC_PERSISTENT_PATH; + (void) mkdir_parents(tpm2b_public_path, 0755); + + /* Now also write this out in TPM2B_PUBLIC format */ + r = fopen_tmpfile_linkable(tpm2b_public_path, O_WRONLY, &t, &f); + if (r < 0) + return log_error_errno(r, "Failed to open SRK public key file '%s' for writing: %m", tpm2b_public_path); + + _cleanup_free_ void *marshalled = NULL; + size_t marshalled_size = 0; + r = tpm2_marshal_public(tpm2_key.public, &marshalled, &marshalled_size); + if (r < 0) + return log_error_errno(r, "Failed to marshal TPM2_PUBLIC key."); + + if (fwrite(marshalled, 1, marshalled_size, f) != marshalled_size) + return log_error_errno(SYNTHETIC_ERRNO(EIO), + "Failed to write SRK public key file '%s'.", tpm2b_public_path); + + if (fchmod(fileno(f), 0444) < 0) + return log_error_errno(errno, "Failed to adjust access mode of SRK public key file '%s' to 0444: %m", tpm2b_public_path); + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to sync SRK key to disk: %m"); + + r = flink_tmpfile(f, t, tpm2b_public_path, LINK_TMPFILE_SYNC|LINK_TMPFILE_REPLACE); + if (r < 0) + return log_error_errno(r, "Failed to move SRK public key file to '%s': %m", tpm2b_public_path); + + log_info("SRK public key saved to '%s' in TPM2B_PUBLIC format.", tpm2b_public_path); + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/tty-ask-password-agent/meson.build b/src/tty-ask-password-agent/meson.build new file mode 100644 index 0000000..ad0c73b --- /dev/null +++ b/src/tty-ask-password-agent/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + executable_template + { + 'name' : 'systemd-tty-ask-password-agent', + 'public' : true, + 'sources' : files('tty-ask-password-agent.c'), + }, +] diff --git a/src/tty-ask-password-agent/tty-ask-password-agent.c b/src/tty-ask-password-agent/tty-ask-password-agent.c new file mode 100644 index 0000000..3a30bfe --- /dev/null +++ 
b/src/tty-ask-password-agent/tty-ask-password-agent.c @@ -0,0 +1,710 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2015 Werner Fink +***/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "ask-password-api.h" +#include "build.h" +#include "conf-parser.h" +#include "constants.h" +#include "dirent-util.h" +#include "exit-status.h" +#include "fd-util.h" +#include "fileio.h" +#include "hashmap.h" +#include "inotify-util.h" +#include "io-util.h" +#include "macro.h" +#include "main-func.h" +#include "memory-util.h" +#include "mkdir-label.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "wall.h" + +static enum { + ACTION_LIST, + ACTION_QUERY, + ACTION_WATCH, + ACTION_WALL, +} arg_action = ACTION_QUERY; + +static bool arg_plymouth = false; +static bool arg_console = false; +static const char *arg_device = NULL; + +static int send_passwords(const char *socket_name, char **passwords) { + _cleanup_(erase_and_freep) char *packet = NULL; + _cleanup_close_ int socket_fd = -EBADF; + union sockaddr_union sa; + socklen_t sa_len; + size_t packet_length = 1; + char *d; + ssize_t n; + int r; + + assert(socket_name); + + r = sockaddr_un_set_path(&sa.un, socket_name); + if (r < 0) + return r; + sa_len = r; + + STRV_FOREACH(p, passwords) + packet_length += strlen(*p) + 1; + + packet = new(char, packet_length); + if (!packet) + return -ENOMEM; + + packet[0] = '+'; + + d = packet + 1; + STRV_FOREACH(p, passwords) + d = stpcpy(d, *p) + 1; + + socket_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (socket_fd < 0) + return log_debug_errno(errno, "socket(): %m"); + + n = sendto(socket_fd, packet, packet_length, MSG_NOSIGNAL, &sa.sa, sa_len); + if (n < 0) + return 
log_debug_errno(errno, "sendto(): %m"); + + return (int) n; +} + +static bool wall_tty_match(const char *path, bool is_local, void *userdata) { + _cleanup_free_ char *p = NULL; + _cleanup_close_ int fd = -EBADF; + struct stat st; + + assert(path_is_absolute(path)); + + if (lstat(path, &st) < 0) { + log_debug_errno(errno, "Failed to stat %s: %m", path); + return true; + } + + if (!S_ISCHR(st.st_mode)) { + log_debug("%s is not a character device.", path); + return true; + } + + /* We use named pipes to ensure that wall messages suggesting + * password entry are not printed over password prompts + * already shown. We use the fact here that opening a pipe in + * non-blocking mode for write-only will succeed only if + * there's some reader behind it. Using pipes has the + * advantage that the block will automatically go away if the + * process dies. */ + + if (asprintf(&p, "/run/systemd/ask-password-block/%u:%u", major(st.st_rdev), minor(st.st_rdev)) < 0) { + log_oom(); + return true; + } + + fd = open(p, O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) { + log_debug_errno(errno, "Failed to open the wall pipe: %m"); + return 1; + } + + /* What, we managed to open the pipe? Then this tty is filtered. 
*/ + return 0; +} + +static int agent_ask_password_tty( + const char *message, + usec_t until, + AskPasswordFlags flags, + const char *flag_file, + char ***ret) { + + int tty_fd = -EBADF, r; + const char *con = arg_device ?: "/dev/console"; + + if (arg_console) { + tty_fd = acquire_terminal(con, ACQUIRE_TERMINAL_WAIT, USEC_INFINITY); + if (tty_fd < 0) + return log_error_errno(tty_fd, "Failed to acquire %s: %m", con); + + r = reset_terminal_fd(tty_fd, true); + if (r < 0) + log_warning_errno(r, "Failed to reset terminal, ignoring: %m"); + + log_info("Starting password query on %s.", con); + } + + r = ask_password_tty(tty_fd, message, NULL, until, flags, flag_file, ret); + + if (arg_console) { + tty_fd = safe_close(tty_fd); + release_terminal(); + + if (r >= 0) + log_info("Password query on %s finished successfully.", con); + } + + return r; +} + +/* Parse the [Ask] section of one password request file and act on it according to arg_action. */ +static int process_one_password_file(const char *filename) { + _cleanup_free_ char *socket_name = NULL, *message = NULL; + bool accept_cached = false, echo = false, silent = false; + uint64_t not_after = 0; + pid_t pid = 0; + + const ConfigTableItem items[] = { + { "Ask", "Socket", config_parse_string, CONFIG_PARSE_STRING_SAFE, &socket_name }, + { "Ask", "NotAfter", config_parse_uint64, 0, &not_after }, + { "Ask", "Message", config_parse_string, 0, &message }, + { "Ask", "PID", config_parse_pid, 0, &pid }, + { "Ask", "AcceptCached", config_parse_bool, 0, &accept_cached }, + { "Ask", "Echo", config_parse_bool, 0, &echo }, + { "Ask", "Silent", config_parse_bool, 0, &silent }, + {} + }; + + int r; + + assert(filename); + + r = config_parse(NULL, filename, NULL, + NULL, + config_item_table_lookup, items, + CONFIG_PARSE_RELAXED|CONFIG_PARSE_WARN, + NULL, + NULL); + if (r < 0) + return r; + + if (!socket_name) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "Invalid password file %s", filename); + + if (not_after > 0 && now(CLOCK_MONOTONIC) > not_after) + return 0; + + if (pid > 0 && pid_is_alive(pid) <= 0) + return 0; + + 
switch (arg_action) { + case ACTION_LIST: + printf("'%s' (PID " PID_FMT ")\n", strna(message), pid); + return 0; + + case ACTION_WALL: { + _cleanup_free_ char *msg = NULL; + + if (asprintf(&msg, + "Password entry required for \'%s\' (PID " PID_FMT ").\r\n" + "Please enter password with the systemd-tty-ask-password-agent tool.", + strna(message), + pid) < 0) + return log_oom(); + + (void) wall(msg, NULL, NULL, wall_tty_match, NULL); + return 0; + } + case ACTION_QUERY: + case ACTION_WATCH: { + _cleanup_strv_free_erase_ char **passwords = NULL; + AskPasswordFlags flags = 0; + + if (access(socket_name, W_OK) < 0) { + if (arg_action == ACTION_QUERY) + log_info("Not querying '%s' (PID " PID_FMT "), lacking privileges.", strna(message), pid); + + return 0; + } + + SET_FLAG(flags, ASK_PASSWORD_ACCEPT_CACHED, accept_cached); + SET_FLAG(flags, ASK_PASSWORD_CONSOLE_COLOR, arg_console); + SET_FLAG(flags, ASK_PASSWORD_ECHO, echo); + SET_FLAG(flags, ASK_PASSWORD_SILENT, silent); + + if (arg_plymouth) + r = ask_password_plymouth(message, not_after, flags, filename, &passwords); + else + r = agent_ask_password_tty(message, not_after, flags, filename, &passwords); + if (r < 0) { + /* If the query went away, that's OK */ + if (IN_SET(r, -ETIME, -ENOENT)) + return 0; + + return log_error_errno(r, "Failed to query password: %m"); + } + + if (strv_isempty(passwords)) + return -ECANCELED; + + r = send_passwords(socket_name, passwords); + if (r < 0) + return log_error_errno(r, "Failed to send: %m"); + break; + }} + + return 0; +} + +static int wall_tty_block(void) { + _cleanup_free_ char *p = NULL; + dev_t devnr; + int fd, r; + + r = get_ctty_devnr(0, &devnr); + if (r == -ENXIO) /* We have no controlling tty */ + return -ENOTTY; + if (r < 0) + return log_error_errno(r, "Failed to get controlling TTY: %m"); + + if (asprintf(&p, "/run/systemd/ask-password-block/%u:%u", major(devnr), minor(devnr)) < 0) + return log_oom(); + + (void) mkdir_parents_label(p, 0700); + (void) mkfifo(p, 0600); + 
+ fd = open(p, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return log_debug_errno(errno, "Failed to open %s: %m", p); + + return fd; +} + +static int process_password_files(void) { + _cleanup_closedir_ DIR *d = NULL; + int r = 0; + + d = opendir("/run/systemd/ask-password"); + if (!d) { + if (errno == ENOENT) + return 0; + + return log_error_errno(errno, "Failed to open /run/systemd/ask-password: %m"); + } + + FOREACH_DIRENT(de, d, return log_error_errno(errno, "Failed to read directory: %m")) { + _cleanup_free_ char *p = NULL; + int q; + + /* We only support /run on tmpfs, hence we can rely on + * d_type to be reliable */ + + if (de->d_type != DT_REG) + continue; + + if (!startswith(de->d_name, "ask.")) + continue; + + p = path_join("/run/systemd/ask-password", de->d_name); + if (!p) + return log_oom(); + + q = process_one_password_file(p); + if (q < 0 && r == 0) + r = q; + } + + return r; +} + +static int process_and_watch_password_files(bool watch) { + enum { + FD_SIGNAL, + FD_INOTIFY, + _FD_MAX + }; + + _unused_ _cleanup_close_ int tty_block_fd = -EBADF; + _cleanup_close_ int notify = -EBADF, signal_fd = -EBADF; + struct pollfd pollfd[_FD_MAX]; + sigset_t mask; + int r; + + tty_block_fd = wall_tty_block(); + + (void) mkdir_p_label("/run/systemd/ask-password", 0755); + + assert_se(sigemptyset(&mask) >= 0); + assert_se(sigset_add_many(&mask, SIGTERM, -1) >= 0); + assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) >= 0); + + if (watch) { + signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC); + if (signal_fd < 0) + return log_error_errno(errno, "Failed to allocate signal file descriptor: %m"); + + pollfd[FD_SIGNAL] = (struct pollfd) { .fd = signal_fd, .events = POLLIN }; + + notify = inotify_init1(IN_CLOEXEC); + if (notify < 0) + return log_error_errno(errno, "Failed to allocate directory watch: %m"); + + r = inotify_add_watch_and_warn(notify, "/run/systemd/ask-password", IN_CLOSE_WRITE|IN_MOVED_TO); + if (r < 0) + return r; + + pollfd[FD_INOTIFY] = 
(struct pollfd) { .fd = notify, .events = POLLIN }; + } + + for (;;) { + usec_t timeout = USEC_INFINITY; + + r = process_password_files(); + if (r < 0) { + if (r == -ECANCELED) + /* Disable poll() timeout since at least one password has + * been skipped and therefore one file remains and is + * unlikely to trigger any events. */ + timeout = 0; + else + /* FIXME: we should do something here since otherwise the service + * requesting the password won't notice the error and will wait + * indefinitely. */ + log_error_errno(r, "Failed to process password: %m"); + } + + if (!watch) + break; + + r = ppoll_usec(pollfd, _FD_MAX, timeout); + if (r == -EINTR) + continue; + if (r < 0) + return r; + + if (pollfd[FD_INOTIFY].revents != 0) + (void) flush_fd(notify); + + if (pollfd[FD_SIGNAL].revents != 0) + break; + } + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-tty-ask-password-agent", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "%sProcess system password requests.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --list Show pending password requests\n" + " --query Process pending password requests\n" + " --watch Continuously process password requests\n" + " --wall Continuously forward password requests to wall\n" + " --plymouth Ask question with Plymouth instead of on TTY\n" + " --console[=DEVICE] Ask question on /dev/console (or DEVICE if specified)\n" + " instead of the current TTY\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_LIST = 0x100, + ARG_QUERY, + ARG_WATCH, + ARG_WALL, + ARG_PLYMOUTH, + ARG_CONSOLE, + ARG_VERSION + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "list", 
no_argument, NULL, ARG_LIST }, + { "query", no_argument, NULL, ARG_QUERY }, + { "watch", no_argument, NULL, ARG_WATCH }, + { "wall", no_argument, NULL, ARG_WALL }, + { "plymouth", no_argument, NULL, ARG_PLYMOUTH }, + { "console", optional_argument, NULL, ARG_CONSOLE }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_LIST: + arg_action = ACTION_LIST; + break; + + case ARG_QUERY: + arg_action = ACTION_QUERY; + break; + + case ARG_WATCH: + arg_action = ACTION_WATCH; + break; + + case ARG_WALL: + arg_action = ACTION_WALL; + break; + + case ARG_PLYMOUTH: + arg_plymouth = true; + break; + + case ARG_CONSOLE: + arg_console = true; + if (optarg) { + + if (isempty(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Empty console device path is not allowed."); + + arg_device = optarg; + } + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s takes no arguments.", program_invocation_short_name); + + if (arg_plymouth || arg_console) { + + if (!IN_SET(arg_action, ACTION_QUERY, ACTION_WATCH)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Options --query and --watch conflict."); + + if (arg_plymouth && arg_console) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Options --plymouth and --console conflict."); + } + + return 1; +} + +/* + * To be able to ask on all terminal devices of /dev/console the devices are collected. If more than one + * device is found, then on each of the terminals an inquiring task is forked. Every task has its own session + * and its own controlling terminal. If one of the tasks does handle a password, the remaining tasks will be + * terminated. 
+ */ +static int ask_on_this_console(const char *tty, pid_t *ret_pid, char **arguments) { + static const struct sigaction sigchld = { + .sa_handler = nop_signal_handler, + .sa_flags = SA_NOCLDSTOP | SA_RESTART, + }; + static const struct sigaction sighup = { + .sa_handler = SIG_DFL, + .sa_flags = SA_RESTART, + }; + int r; + + assert_se(sigaction(SIGCHLD, &sigchld, NULL) >= 0); + assert_se(sigaction(SIGHUP, &sighup, NULL) >= 0); + assert_se(sigprocmask_many(SIG_UNBLOCK, NULL, SIGHUP, SIGCHLD, -1) >= 0); + + r = safe_fork("(sd-passwd)", FORK_RESET_SIGNALS|FORK_LOG, ret_pid); + if (r < 0) + return r; + if (r == 0) { + assert_se(prctl(PR_SET_PDEATHSIG, SIGHUP) >= 0); + + STRV_FOREACH(i, arguments) { + char *k; + + if (!streq(*i, "--console")) + continue; + + k = strjoin("--console=", tty); + if (!k) { + log_oom(); + _exit(EXIT_FAILURE); + } + + free_and_replace(*i, k); + } + + execv(SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH, arguments); + _exit(EXIT_FAILURE); + } + + return 0; +} + +/* Send SIGTERM to every PID in pids and reap the children that exit; waiting stops once no SIGCHLD + * arrives within 50ms (or sigtimedwait() fails). PIDs still in the set afterwards are sent SIGKILL. */ +static void terminate_agents(Set *pids) { + sigset_t set; + void *p; + int r, signum; + + /* + * Request termination of the remaining processes as those + * are not required anymore. + */ + SET_FOREACH(p, pids) + (void) kill(PTR_TO_PID(p), SIGTERM); + + /* + * Collect the processes which have gone away. + */ + assert_se(sigemptyset(&set) >= 0); + assert_se(sigaddset(&set, SIGCHLD) >= 0); + + while (!set_isempty(pids)) { + siginfo_t status = {}; + + r = waitid(P_ALL, 0, &status, WEXITED|WNOHANG); + if (r < 0 && errno == EINTR) + continue; + + if (r == 0 && status.si_pid > 0) { + set_remove(pids, PID_TO_PTR(status.si_pid)); + continue; + } + + signum = sigtimedwait(&set, NULL, TIMESPEC_STORE(50 * USEC_PER_MSEC)); + if (signum < 0) { + if (errno != EAGAIN) + log_error_errno(errno, "sigtimedwait() failed: %m"); + break; + } + assert(signum == SIGCHLD); + } + + /* + * Kill hanging processes. 
+ */ + SET_FOREACH(p, pids) { + log_warning("Failed to terminate child %d, killing it", PTR_TO_PID(p)); + (void) kill(PTR_TO_PID(p), SIGKILL); + } +} + +static int ask_on_consoles(char *argv[]) { + _cleanup_set_free_ Set *pids = NULL; + _cleanup_strv_free_ char **consoles = NULL, **arguments = NULL; + siginfo_t status = {}; + pid_t pid; + int r; + + r = get_kernel_consoles(&consoles); + if (r < 0) + return log_error_errno(r, "Failed to determine devices of /dev/console: %m"); + + pids = set_new(NULL); + if (!pids) + return log_oom(); + + arguments = strv_copy(argv); + if (!arguments) + return log_oom(); + + /* Start an agent on each console. */ + STRV_FOREACH(tty, consoles) { + r = ask_on_this_console(*tty, &pid, arguments); + if (r < 0) + return r; + + if (set_put(pids, PID_TO_PTR(pid)) < 0) + return log_oom(); + } + + /* Wait for an agent to exit. */ + for (;;) { + zero(status); + + if (waitid(P_ALL, 0, &status, WEXITED) < 0) { + if (errno == EINTR) + continue; + + return log_error_errno(errno, "waitid() failed: %m"); + } + + set_remove(pids, PID_TO_PTR(status.si_pid)); + break; + } + + if (!is_clean_exit(status.si_code, status.si_status, EXIT_CLEAN_DAEMON, NULL)) + log_error("Password agent failed with: %d", status.si_status); + + terminate_agents(pids); + return 0; +} + +static int run(int argc, char *argv[]) { + int r; + + log_setup(); + + umask(0022); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_console && !arg_device) + /* + * Spawn a separate process for each console device. + */ + return ask_on_consoles(argv); + + if (arg_device) { + /* + * Later on, a controlling terminal will be acquired, + * therefore the current process has to become a session + * leader and should not have a controlling terminal already. 
+ */ + (void) setsid(); + (void) release_terminal(); + } + + return process_and_watch_password_files(!IN_SET(arg_action, ACTION_QUERY, ACTION_LIST)); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/udev/ata_id/ata_id.c b/src/udev/ata_id/ata_id.c new file mode 100644 index 0000000..4dd7e54 --- /dev/null +++ b/src/udev/ata_id/ata_id.c @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ata_id - reads product/serial number from ATA drives + * + * Copyright © 2009-2010 David Zeuthen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "build.h" +#include "device-nodes.h" +#include "fd-util.h" +#include "log.h" +#include "main-func.h" +#include "memory-util.h" +#include "udev-util.h" +#include "unaligned.h" + +#define COMMAND_TIMEOUT_MSEC (30 * 1000) + +static bool arg_export = false; +static const char *arg_device = NULL; + +static int disk_scsi_inquiry_command( + int fd, + void *buf, + size_t buf_len) { + + uint8_t cdb[6] = { + /* INQUIRY, see SPC-4 section 6.4 */ + [0] = 0x12, /* OPERATION CODE: INQUIRY */ + [3] = (buf_len >> 8), /* ALLOCATION LENGTH */ + [4] = (buf_len & 0xff), + }; + uint8_t sense[32] = {}; + struct sg_io_v4 io_v4 = { + .guard = 'Q', + .protocol = BSG_PROTOCOL_SCSI, + .subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD, + .request_len = sizeof(cdb), + .request = (uintptr_t) cdb, + .max_response_len = sizeof(sense), + .response = (uintptr_t) sense, + .din_xfer_len = buf_len, + .din_xferp = (uintptr_t) buf, + .timeout = COMMAND_TIMEOUT_MSEC, + }; + + if (ioctl(fd, SG_IO, &io_v4) != 0) { + if (errno != EINVAL) + return log_debug_errno(errno, "ioctl v4 failed: %m"); + + /* could be that the driver doesn't do version 4, try version 3 */ + struct sg_io_hdr io_hdr = { + .interface_id = 'S', + .cmdp = (unsigned char*) cdb, + .cmd_len = sizeof (cdb), + .dxferp = buf, + .dxfer_len = buf_len, + .sbp = sense, + 
.mx_sb_len = sizeof(sense), + .dxfer_direction = SG_DXFER_FROM_DEV, + .timeout = COMMAND_TIMEOUT_MSEC, + }; + + if (ioctl(fd, SG_IO, &io_hdr) != 0) + return log_debug_errno(errno, "ioctl v3 failed: %m"); + + /* even if the ioctl succeeds, we need to check the return value */ + if (io_hdr.status != 0 || + io_hdr.host_status != 0 || + io_hdr.driver_status != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "ioctl v3 failed"); + + } else { + /* even if the ioctl succeeds, we need to check the return value */ + if (io_v4.device_status != 0 || + io_v4.transport_status != 0 || + io_v4.driver_status != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "ioctl v4 failed"); + } + + return 0; +} + +static int disk_identify_command( + int fd, + void *buf, + size_t buf_len) { + + uint8_t cdb[12] = { + /* + * ATA Pass-Through 12 byte command, as described in + * + * T10 04-262r8 ATA Command Pass-Through + * + * from http://www.t10.org/ftp/t10/document.04/04-262r8.pdf + */ + [0] = 0xa1, /* OPERATION CODE: 12 byte pass through */ + [1] = 4 << 1, /* PROTOCOL: PIO Data-in */ + [2] = 0x2e, /* OFF_LINE=0, CK_COND=1, T_DIR=1, BYT_BLOK=1, T_LENGTH=2 */ + [3] = 0, /* FEATURES */ + [4] = 1, /* SECTORS */ + [5] = 0, /* LBA LOW */ + [6] = 0, /* LBA MID */ + [7] = 0, /* LBA HIGH */ + [8] = 0 & 0x4F, /* SELECT */ + [9] = 0xEC, /* Command: ATA IDENTIFY DEVICE */ + }; + uint8_t sense[32] = {}; + uint8_t *desc = sense + 8; + struct sg_io_v4 io_v4 = { + .guard = 'Q', + .protocol = BSG_PROTOCOL_SCSI, + .subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD, + .request_len = sizeof(cdb), + .request = (uintptr_t) cdb, + .max_response_len = sizeof(sense), + .response = (uintptr_t) sense, + .din_xfer_len = buf_len, + .din_xferp = (uintptr_t) buf, + .timeout = COMMAND_TIMEOUT_MSEC, + }; + + if (ioctl(fd, SG_IO, &io_v4) != 0) { + if (errno != EINVAL) + return log_debug_errno(errno, "ioctl v4 failed: %m"); + + /* could be that the driver doesn't do version 4, try version 3 */ + struct sg_io_hdr io_hdr = { + 
.interface_id = 'S', + .cmdp = (unsigned char*) cdb, + .cmd_len = sizeof (cdb), + .dxferp = buf, + .dxfer_len = buf_len, + .sbp = sense, + .mx_sb_len = sizeof (sense), + .dxfer_direction = SG_DXFER_FROM_DEV, + .timeout = COMMAND_TIMEOUT_MSEC, + }; + + if (ioctl(fd, SG_IO, &io_hdr) != 0) + return log_debug_errno(errno, "ioctl v3 failed: %m"); + } else { + if (!((sense[0] & 0x7f) == 0x72 && desc[0] == 0x9 && desc[1] == 0x0c) && + !((sense[0] & 0x7f) == 0x70 && sense[12] == 0x00 && sense[13] == 0x1d)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "ioctl v4 failed: %m"); + } + + return 0; +} + +static int disk_identify_packet_device_command( + int fd, + void *buf, + size_t buf_len) { + + uint8_t cdb[16] = { + /* + * ATA Pass-Through 16 byte command, as described in + * + * T10 04-262r8 ATA Command Pass-Through + * + * from http://www.t10.org/ftp/t10/document.04/04-262r8.pdf + */ + [0] = 0x85, /* OPERATION CODE: 16 byte pass through */ + [1] = 4 << 1, /* PROTOCOL: PIO Data-in */ + [2] = 0x2e, /* OFF_LINE=0, CK_COND=1, T_DIR=1, BYT_BLOK=1, T_LENGTH=2 */ + [3] = 0, /* FEATURES */ + [4] = 0, /* FEATURES */ + [5] = 0, /* SECTORS */ + [6] = 1, /* SECTORS */ + [7] = 0, /* LBA LOW */ + [8] = 0, /* LBA LOW */ + [9] = 0, /* LBA MID */ + [10] = 0, /* LBA MID */ + [11] = 0, /* LBA HIGH */ + [12] = 0, /* LBA HIGH */ + [13] = 0, /* DEVICE */ + [14] = 0xA1, /* Command: ATA IDENTIFY PACKET DEVICE */ + [15] = 0, /* CONTROL */ + }; + uint8_t sense[32] = {}; + uint8_t *desc = sense + 8; + struct sg_io_v4 io_v4 = { + .guard = 'Q', + .protocol = BSG_PROTOCOL_SCSI, + .subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD, + .request_len = sizeof (cdb), + .request = (uintptr_t) cdb, + .max_response_len = sizeof (sense), + .response = (uintptr_t) sense, + .din_xfer_len = buf_len, + .din_xferp = (uintptr_t) buf, + .timeout = COMMAND_TIMEOUT_MSEC, + }; + + if (ioctl(fd, SG_IO, &io_v4) != 0) { + if (errno != EINVAL) + return log_debug_errno(errno, "ioctl v4 failed: %m"); + + /* could be that the driver 
doesn't do version 4, try version 3 */ + struct sg_io_hdr io_hdr = { + .interface_id = 'S', + .cmdp = (unsigned char*) cdb, + .cmd_len = sizeof (cdb), + .dxferp = buf, + .dxfer_len = buf_len, + .sbp = sense, + .mx_sb_len = sizeof (sense), + .dxfer_direction = SG_DXFER_FROM_DEV, + .timeout = COMMAND_TIMEOUT_MSEC, + }; + + if (ioctl(fd, SG_IO, &io_hdr) != 0) + return log_debug_errno(errno, "ioctl v3 failed: %m"); + } else { + if ((sense[0] & 0x7f) != 0x72 || desc[0] != 0x9 || desc[1] != 0x0c) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "ioctl v4 failed: %m"); + } + + return 0; +} + +/** + * disk_identify_get_string: + * @identify: A block of IDENTIFY data + * @offset_words: Offset of the string to get, in words. + * @dest: Destination buffer for the string. + * @dest_len: Length of destination buffer, in bytes. + * + * Copies the ATA string from @identify located at @offset_words into @dest. + */ +static void disk_identify_get_string( + uint8_t identify[512], + unsigned offset_words, + char *dest, + size_t dest_len) { + + unsigned c1; + unsigned c2; + + while (dest_len > 0) { + c1 = identify[offset_words * 2 + 1]; + c2 = identify[offset_words * 2]; + *dest = c1; + dest++; + *dest = c2; + dest++; + offset_words++; + dest_len -= 2; + } +} + +static void disk_identify_fixup_string( + uint8_t identify[512], + unsigned offset_words, + size_t len) { + assert(offset_words < 512/2); + disk_identify_get_string(identify, offset_words, + (char *) identify + offset_words * 2, len); +} + +static void disk_identify_fixup_uint16(uint8_t identify[512], unsigned offset_words) { + assert(offset_words < 512/2); + unaligned_write_ne16(identify + offset_words * 2, + unaligned_read_le16(identify + offset_words * 2)); +} + +/** + * disk_identify: + * @fd: File descriptor for the block device. + * @out_identify: Return location for IDENTIFY data. + * + * Sends the IDENTIFY DEVICE or IDENTIFY PACKET DEVICE command to the + * device represented by @fd. 
If successful, then the result will be + * copied into @out_identify. If @ret_peripheral_device_type is non-NULL, + * the peripheral device type taken from the SCSI INQUIRY data is stored + * there on success. + * + * This routine is based on code from libatasmart, LGPL v2.1. + * + * Returns: 0 if the data was successfully obtained, otherwise + * a negative errno-style error code. + */ +static int disk_identify(int fd, + uint8_t out_identify[512], + int *ret_peripheral_device_type) { + uint8_t inquiry_buf[36]; + int peripheral_device_type, r; + + /* init results */ + memzero(out_identify, 512); + + /* If we were to use ATA PASS_THROUGH (12) on an ATAPI device + * we could accidentally blank media. This is because MMC's BLANK + * command has the same op-code (0xa1). + * + * To prevent this from happening we bail out if the device + * isn't a Direct Access Block Device, e.g. SCSI type 0x00 + * (CD/DVD devices are type 0x05). So we send a SCSI INQUIRY + * command first... libata is handling this via its SCSI + * emulation layer. + * + * This also ensures that we're actually dealing with a device + * that understands SCSI commands. + * + * (Yes, it is a bit perverse that we're tunneling the ATA + * command through SCSI and relying on the ATA driver + * emulating SCSI well-enough...) + * + * (See commit 160b069c25690bfb0c785994c7c3710289179107 for + * the original bug-fix and see http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=556635 + * for the original bug-report.) 
+ */ + r = disk_scsi_inquiry_command(fd, inquiry_buf, sizeof inquiry_buf); + if (r < 0) + return r; + + /* SPC-4, section 6.4.2: Standard INQUIRY data */ + peripheral_device_type = inquiry_buf[0] & 0x1f; + if (peripheral_device_type == 0x05) { + r = disk_identify_packet_device_command(fd, out_identify, 512); + if (r < 0) + return r; + + } else { + if (!IN_SET(peripheral_device_type, 0x00, 0x14)) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Unsupported device type."); + + /* OK, now issue the IDENTIFY DEVICE command */ + r = disk_identify_command(fd, out_identify, 512); + if (r < 0) + return r; + } + + /* Check if IDENTIFY data is all NUL bytes - if so, bail */ + bool all_nul_bytes = true; + for (size_t n = 0; n < 512; n++) + if (out_identify[n] != '\0') { + all_nul_bytes = false; + break; + } + + if (all_nul_bytes) + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "IDENTIFY data is all zeroes."); + + if (ret_peripheral_device_type) + *ret_peripheral_device_type = peripheral_device_type; + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "export", no_argument, NULL, 'x' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + {} + }; + int c; + + while ((c = getopt_long(argc, argv, "xh", options, NULL)) >= 0) + switch (c) { + case 'x': + arg_export = true; + break; + case 'h': + printf("%s [OPTIONS...] 
DEVICE\n\n" + " -x --export Print values as environment keys\n" + " -h --help Show this help text\n" + " --version Show package version\n", + program_invocation_short_name); + return 0; + case 'v': + return version(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (!argv[optind]) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "DEVICE argument missing."); + + arg_device = argv[optind]; + return 1; +} + +static int run(int argc, char *argv[]) { + struct hd_driveid id; + union { + uint8_t byte[512]; + uint16_t wyde[256]; + } identify; + char model[41], model_enc[256], serial[21], revision[9]; + _cleanup_close_ int fd = -EBADF; + uint16_t word; + int r, peripheral_device_type = -1; + + log_set_target(LOG_TARGET_AUTO); + udev_parse_config(); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + fd = open(ASSERT_PTR(arg_device), O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Cannot open %s: %m", arg_device); + + if (disk_identify(fd, identify.byte, &peripheral_device_type) >= 0) { + /* + * fix up only the fields from the IDENTIFY data that we are going to + * use and copy it into the hd_driveid struct for convenience + */ + disk_identify_fixup_string(identify.byte, 10, 20); /* serial */ + disk_identify_fixup_string(identify.byte, 23, 8); /* fwrev */ + disk_identify_fixup_string(identify.byte, 27, 40); /* model */ + disk_identify_fixup_uint16(identify.byte, 0); /* configuration */ + disk_identify_fixup_uint16(identify.byte, 75); /* queue depth */ + disk_identify_fixup_uint16(identify.byte, 76); /* SATA capabilities */ + disk_identify_fixup_uint16(identify.byte, 82); /* command set supported */ + disk_identify_fixup_uint16(identify.byte, 83); /* command set supported */ + disk_identify_fixup_uint16(identify.byte, 84); /* command set supported */ + disk_identify_fixup_uint16(identify.byte, 85); /* command set supported */ + 
disk_identify_fixup_uint16(identify.byte, 86); /* command set supported */ + disk_identify_fixup_uint16(identify.byte, 87); /* command set supported */ + disk_identify_fixup_uint16(identify.byte, 89); /* time required for SECURITY ERASE UNIT */ + disk_identify_fixup_uint16(identify.byte, 90); /* time required for enhanced SECURITY ERASE UNIT */ + disk_identify_fixup_uint16(identify.byte, 91); /* current APM values */ + disk_identify_fixup_uint16(identify.byte, 94); /* current AAM value */ + disk_identify_fixup_uint16(identify.byte, 108); /* WWN */ + disk_identify_fixup_uint16(identify.byte, 109); /* WWN */ + disk_identify_fixup_uint16(identify.byte, 110); /* WWN */ + disk_identify_fixup_uint16(identify.byte, 111); /* WWN */ + disk_identify_fixup_uint16(identify.byte, 128); /* device lock function */ + disk_identify_fixup_uint16(identify.byte, 217); /* nominal media rotation rate */ + memcpy(&id, identify.byte, sizeof id); + } else { + /* If this fails, then try HDIO_GET_IDENTITY */ + if (ioctl(fd, HDIO_GET_IDENTITY, &id) != 0) + return log_debug_errno(errno, "%s: HDIO_GET_IDENTITY failed: %m", arg_device); + } + + memcpy(model, id.model, 40); + model[40] = '\0'; + encode_devnode_name(model, model_enc, sizeof(model_enc)); + udev_replace_whitespace((char *) id.model, model, 40); + udev_replace_chars(model, NULL); + udev_replace_whitespace((char *) id.serial_no, serial, 20); + udev_replace_chars(serial, NULL); + udev_replace_whitespace((char *) id.fw_rev, revision, 8); + udev_replace_chars(revision, NULL); + + if (arg_export) { + /* Set this to convey the disk speaks the ATA protocol */ + printf("ID_ATA=1\n"); + + if ((id.config >> 8) & 0x80) { + /* This is an ATAPI device */ + switch ((id.config >> 8) & 0x1f) { + case 0: + printf("ID_TYPE=cd\n"); + break; + case 1: + printf("ID_TYPE=tape\n"); + break; + case 5: + printf("ID_TYPE=cd\n"); + break; + case 7: + printf("ID_TYPE=optical\n"); + break; + default: + printf("ID_TYPE=generic\n"); + break; + } + } else + 
printf("ID_TYPE=disk\n"); + printf("ID_BUS=ata\n"); + printf("ID_MODEL=%s\n", model); + printf("ID_MODEL_ENC=%s\n", model_enc); + printf("ID_REVISION=%s\n", revision); + if (serial[0] != '\0') { + printf("ID_SERIAL=%s_%s\n", model, serial); + printf("ID_SERIAL_SHORT=%s\n", serial); + } else + printf("ID_SERIAL=%s\n", model); + + if (id.command_set_1 & (1<<5)) { + printf("ID_ATA_WRITE_CACHE=1\n"); + printf("ID_ATA_WRITE_CACHE_ENABLED=%d\n", (id.cfs_enable_1 & (1<<5)) ? 1 : 0); + } + if (id.command_set_1 & (1<<10)) { + printf("ID_ATA_FEATURE_SET_HPA=1\n"); + printf("ID_ATA_FEATURE_SET_HPA_ENABLED=%d\n", (id.cfs_enable_1 & (1<<10)) ? 1 : 0); + + /* + * TODO: use the READ NATIVE MAX ADDRESS command to get the native max address + * so it is easy to check whether the protected area is in use. + */ + } + if (id.command_set_1 & (1<<3)) { + printf("ID_ATA_FEATURE_SET_PM=1\n"); + printf("ID_ATA_FEATURE_SET_PM_ENABLED=%d\n", (id.cfs_enable_1 & (1<<3)) ? 1 : 0); + } + if (id.command_set_1 & (1<<1)) { + printf("ID_ATA_FEATURE_SET_SECURITY=1\n"); + printf("ID_ATA_FEATURE_SET_SECURITY_ENABLED=%d\n", (id.cfs_enable_1 & (1<<1)) ? 1 : 0); + printf("ID_ATA_FEATURE_SET_SECURITY_ERASE_UNIT_MIN=%d\n", id.trseuc * 2); + if ((id.cfs_enable_1 & (1<<1))) /* enabled */ { + if (id.dlf & (1<<8)) + printf("ID_ATA_FEATURE_SET_SECURITY_LEVEL=maximum\n"); + else + printf("ID_ATA_FEATURE_SET_SECURITY_LEVEL=high\n"); + } + if (id.dlf & (1<<5)) + printf("ID_ATA_FEATURE_SET_SECURITY_ENHANCED_ERASE_UNIT_MIN=%d\n", id.trsEuc * 2); + if (id.dlf & (1<<4)) + printf("ID_ATA_FEATURE_SET_SECURITY_EXPIRE=1\n"); + if (id.dlf & (1<<3)) + printf("ID_ATA_FEATURE_SET_SECURITY_FROZEN=1\n"); + if (id.dlf & (1<<2)) + printf("ID_ATA_FEATURE_SET_SECURITY_LOCKED=1\n"); + } + if (id.command_set_1 & (1<<0)) { + printf("ID_ATA_FEATURE_SET_SMART=1\n"); + printf("ID_ATA_FEATURE_SET_SMART_ENABLED=%d\n", (id.cfs_enable_1 & (1<<0)) ? 
1 : 0); + } + if (id.command_set_2 & (1<<9)) { + printf("ID_ATA_FEATURE_SET_AAM=1\n"); + printf("ID_ATA_FEATURE_SET_AAM_ENABLED=%d\n", (id.cfs_enable_2 & (1<<9)) ? 1 : 0); + printf("ID_ATA_FEATURE_SET_AAM_VENDOR_RECOMMENDED_VALUE=%d\n", id.acoustic >> 8); + printf("ID_ATA_FEATURE_SET_AAM_CURRENT_VALUE=%d\n", id.acoustic & 0xff); + } + if (id.command_set_2 & (1<<5)) { + printf("ID_ATA_FEATURE_SET_PUIS=1\n"); + printf("ID_ATA_FEATURE_SET_PUIS_ENABLED=%d\n", (id.cfs_enable_2 & (1<<5)) ? 1 : 0); + } + if (id.command_set_2 & (1<<3)) { + printf("ID_ATA_FEATURE_SET_APM=1\n"); + printf("ID_ATA_FEATURE_SET_APM_ENABLED=%d\n", (id.cfs_enable_2 & (1<<3)) ? 1 : 0); + if ((id.cfs_enable_2 & (1<<3))) + printf("ID_ATA_FEATURE_SET_APM_CURRENT_VALUE=%d\n", id.CurAPMvalues & 0xff); + } + if (id.command_set_2 & (1<<0)) + printf("ID_ATA_DOWNLOAD_MICROCODE=1\n"); + + /* + * Word 76 indicates the capabilities of a SATA device. A PATA device shall set + * word 76 to 0000h or FFFFh. If word 76 is set to 0000h or FFFFh, then + * the device does not claim compliance with the Serial ATA specification and words + * 76 through 79 are not valid and shall be ignored. + */ + + word = identify.wyde[76]; + if (!IN_SET(word, 0x0000, 0xffff)) { + printf("ID_ATA_SATA=1\n"); + /* + * If bit 2 of word 76 is set to one, then the device supports the Gen2 + * signaling rate of 3.0 Gb/s (see SATA 2.6). + * + * If bit 1 of word 76 is set to one, then the device supports the Gen1 + * signaling rate of 1.5 Gb/s (see SATA 2.6). + */ + if (word & (1<<2)) + printf("ID_ATA_SATA_SIGNAL_RATE_GEN2=1\n"); + if (word & (1<<1)) + printf("ID_ATA_SATA_SIGNAL_RATE_GEN1=1\n"); + } + + /* Word 217 indicates the nominal media rotation rate of the device */ + word = identify.wyde[217]; + if (word == 0x0001) + printf ("ID_ATA_ROTATION_RATE_RPM=0\n"); /* non-rotating e.g. 
/* Identifiers for drive capabilities and for the type of the inserted medium.
 *
 * The explicitly numbered entries are used as raw 16-bit profile numbers: cd_profiles() and
 * feature_profiles() cast the big-endian values found in the GET CONFIGURATION reply directly to
 * this type, so the numeric values must not be changed.
 *
 * print_feature() looks entries up in feature_to_string[] with a binary search keyed on these
 * values, hence that table has to stay sorted in the same numeric order as this enum. */
typedef enum Feature {
        FEATURE_RW_NONREMOVABLE = 0x01,
        FEATURE_RW_REMOVABLE    = 0x02,

        FEATURE_MO_SE           = 0x03, /* sector erase */
        FEATURE_MO_WO           = 0x04, /* write once */
        FEATURE_MO_AS           = 0x05, /* advance storage */

        FEATURE_CD_ROM          = 0x08,
        FEATURE_CD_R            = 0x09,
        FEATURE_CD_RW           = 0x0a,

        FEATURE_DVD_ROM         = 0x10,
        FEATURE_DVD_R           = 0x11,
        FEATURE_DVD_RAM         = 0x12,
        FEATURE_DVD_RW_RO       = 0x13, /* restricted overwrite mode */
        FEATURE_DVD_RW_SEQ      = 0x14, /* sequential mode */
        FEATURE_DVD_R_DL_SEQ    = 0x15, /* sequential recording */
        FEATURE_DVD_R_DL_JR     = 0x16, /* jump recording */
        FEATURE_DVD_RW_DL       = 0x17,
        FEATURE_DVD_R_DDR       = 0x18, /* download disc recording - dvd for css managed recording */
        FEATURE_DVD_PLUS_RW     = 0x1a,
        FEATURE_DVD_PLUS_R      = 0x1b,

        FEATURE_DDCD_ROM        = 0x20,
        FEATURE_DDCD_R          = 0x21,
        FEATURE_DDCD_RW         = 0x22,

        FEATURE_DVD_PLUS_RW_DL  = 0x2a,
        FEATURE_DVD_PLUS_R_DL   = 0x2b,

        FEATURE_BD              = 0x40,
        FEATURE_BD_R_SRM        = 0x41, /* sequential recording mode */
        FEATURE_BD_R_RRM        = 0x42, /* random recording mode */
        FEATURE_BD_RE           = 0x43,

        FEATURE_HDDVD           = 0x50,
        FEATURE_HDDVD_R         = 0x51,
        FEATURE_HDDVD_RAM       = 0x52,
        FEATURE_HDDVD_RW        = 0x53,
        FEATURE_HDDVD_R_DL      = 0x58,
        FEATURE_HDDVD_RW_DL     = 0x5a,

        /* The two entries below carry no explicit value and therefore continue counting after
         * FEATURE_HDDVD_RW_DL (0x5b, 0x5c). In this file they are only set by
         * cd_capability_compat(), from the CDC_MRW/CDC_MRW_W capability bits. */
        FEATURE_MRW,
        FEATURE_MRW_W,

        _FEATURE_MAX,
        _FEATURE_INVALID = -EINVAL, /* initial value of Context.media_feature, see CONTEXT_EMPTY */
} Feature;
/* Logs a failed SCSI operation at debug level and returns the result of log_debug_errno().
 *
 * error: must be non-zero.
 *          < 0: an errno-style error, logged with its errno text.
 *          > 0: a packed SCSI sense code as produced by ERRCODE(), i.e. sense key in bits 16..19,
 *               additional sense code (ASC) in bits 8..15 and its qualifier (ASCQ) in bits 0..7.
 *               This case is reported to the caller as EIO.
 * msg:   short description of the operation, inserted after "Failed to ". */
static int log_scsi_debug_errno(int error, const char *msg) {
        assert(error != 0);
        assert(msg);

        /* error < 0 means errno-style error, error > 0 means SCSI error */

        if (error < 0)
                return log_debug_errno(error, "Failed to %s: %m", msg);

        /* The third field is the value extracted by ASCQ(), so label it accordingly. */
        return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                               "Failed to %s with SK=%X/ASC=%02X/ASCQ=%02X",
                               msg, SK(error), ASC(error), ASCQ(error));
}
/* Convenience wrapper: executes the prepared command via scsi_cmd_run() and, if that reports
 * anything other than success, hands the status to log_scsi_debug_errno() together with 'msg'.
 * Returns 0 on success, otherwise whatever log_scsi_debug_errno() returned. */
static int scsi_cmd_run_and_log(struct scsi_cmd *cmd, int fd, unsigned char *buf, size_t bufsize, const char *msg) {
        int status;

        assert(msg);

        status = scsi_cmd_run(cmd, fd, buf, bufsize);
        if (status == 0)
                return 0;

        /* status may be a negative errno or a positive SCSI sense code; the logger handles both. */
        return log_scsi_debug_errno(status, msg);
}
1 : 0) < 0) + return log_debug_errno(errno, "Failed to issue ioctl(CDROM_LOCKDOOR): %m"); + + return 0; +} + +static int media_eject(int fd) { + struct scsi_cmd sc; + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_START_STOP_UNIT); + scsi_cmd_set(&sc, 4, 0x02); + scsi_cmd_set(&sc, 5, 0); + + return scsi_cmd_run_and_log(&sc, fd, NULL, 0, "start/stop unit"); +} + +static int cd_capability_compat(Context *c) { + int capability, r; + + assert(c); + + capability = ioctl(c->fd, CDROM_GET_CAPABILITY, NULL); + if (capability < 0) + return log_debug_errno(errno, "CDROM_GET_CAPABILITY failed"); + + if (capability & CDC_CD_R) { + r = set_drive_feature(c, FEATURE_CD_R); + if (r < 0) + return log_oom_debug(); + } + if (capability & CDC_CD_RW) { + r = set_drive_feature(c, FEATURE_CD_RW); + if (r < 0) + return log_oom_debug(); + } + if (capability & CDC_DVD) { + r = set_drive_feature(c, FEATURE_DVD_ROM); + if (r < 0) + return log_oom_debug(); + } + if (capability & CDC_DVD_R) { + r = set_drive_feature(c, FEATURE_DVD_R); + if (r < 0) + return log_oom_debug(); + } + if (capability & CDC_DVD_RAM) { + r = set_drive_feature(c, FEATURE_DVD_RAM); + if (r < 0) + return log_oom_debug(); + } + if (capability & CDC_MRW) { + r = set_drive_feature(c, FEATURE_MRW); + if (r < 0) + return log_oom_debug(); + } + if (capability & CDC_MRW_W) { + r = set_drive_feature(c, FEATURE_MRW_W); + if (r < 0) + return log_oom_debug(); + } + + return 0; +} + +static int cd_media_compat(Context *c) { + int r; + + assert(c); + + r = ioctl(c->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (r < 0) + return log_debug_errno(errno, "ioctl(CDROM_DRIVE_STATUS) failed: %m"); + if (r != CDS_DISC_OK) + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "ioctl(CDROM_DRIVE_STATUS) → %d (%s), ignoring.", + r, + r == CDS_NO_INFO ? "no info" : + r == CDS_NO_DISC ? "no disc" : + r == CDS_TRAY_OPEN ? "tray open" : + r == CDS_DRIVE_NOT_READY ? 
"drive not ready" : + "unknown status"); + + c->has_media = true; + return 0; +} + +static int cd_inquiry(Context *c) { + struct scsi_cmd sc; + unsigned char inq[36]; + int r; + + assert(c); + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_INQUIRY); + scsi_cmd_set(&sc, 4, sizeof(inq)); + scsi_cmd_set(&sc, 5, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, inq, sizeof(inq), "inquire"); + if (r < 0) + return r; + + if ((inq[0] & 0x1F) != 5) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Not an MMC unit."); + + log_debug("INQUIRY: [%.8s][%.16s][%.4s]", inq + 8, inq + 16, inq + 32); + return 0; +} + +static int feature_profiles(Context *c, const unsigned char *profiles, size_t size) { + int r; + + assert(c); + + for (size_t i = 0; i + 4 <= size; i += 4) { + r = set_drive_feature(c, (Feature) unaligned_read_be16(&profiles[i])); + if (r < 0) + return log_oom_debug(); + } + + return 1; +} + +static int cd_profiles_old_mmc(Context *c) { + disc_information discinfo; + struct scsi_cmd sc; + size_t len; + int r; + + assert(c); + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_DISC_INFO); + scsi_cmd_set(&sc, 8, sizeof(discinfo.disc_information_length)); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, (unsigned char *)&discinfo.disc_information_length, sizeof(discinfo.disc_information_length), "read disc information"); + if (r >= 0) { + /* Not all drives have the same disc_info length, so requeue + * packet with the length the drive tells us it can supply */ + len = be16toh(discinfo.disc_information_length) + sizeof(discinfo.disc_information_length); + if (len > sizeof(discinfo)) + len = sizeof(discinfo); + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_DISC_INFO); + scsi_cmd_set(&sc, 8, len); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, (unsigned char *)&discinfo, len, "read disc information"); + } + if (r < 0) { + if (c->has_media) { + log_debug("No current profile, but disc is present; assuming CD-ROM."); + 
c->media_feature = FEATURE_CD_ROM; + c->media_track_count = 1; + c->media_track_count_data = 1; + return 1; + } else + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "no current profile, assuming no media."); + }; + + c->has_media = true; + + if (discinfo.erasable) + c->media_feature = FEATURE_CD_RW; + else if (discinfo.disc_status < 2 && drive_has_feature(c, FEATURE_CD_R)) + c->media_feature = FEATURE_CD_R; + else + c->media_feature = FEATURE_CD_ROM; + + return 0; +} + +static int cd_profiles(Context *c) { + struct scsi_cmd sc; + unsigned char features[65530]; + unsigned cur_profile; + size_t len; + int r; + + assert(c); + + /* First query the current profile */ + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_GET_CONFIGURATION); + scsi_cmd_set(&sc, 8, 8); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run(&sc, c->fd, features, 8); + if (r != 0) { + /* handle pre-MMC2 drives which do not support GET CONFIGURATION */ + if (r > 0 && SK(r) == 0x5 && IN_SET(ASC(r), 0x20, 0x24)) { + log_debug("Drive is pre-MMC2 and does not support 46h get configuration command; " + "trying to work around the problem."); + return cd_profiles_old_mmc(c); + } + + return log_scsi_debug_errno(r, "get configuration"); + } + + cur_profile = unaligned_read_be16(&features[6]); + if (cur_profile > 0) { + log_debug("current profile 0x%02x", cur_profile); + c->media_feature = (Feature) cur_profile; + c->has_media = true; + } else { + log_debug("no current profile, assuming no media"); + c->has_media = false; + } + + len = unaligned_read_be32(features); + log_debug("GET CONFIGURATION: size of features buffer %zu", len); + + if (len > sizeof(features)) { + log_debug("Cannot get features in a single query, truncating."); + len = sizeof(features); + } else if (len <= 8) + len = sizeof(features); + + /* Now get the full feature buffer */ + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_GET_CONFIGURATION); + scsi_cmd_set(&sc, 7, (len >> 8) & 0xff); + scsi_cmd_set(&sc, 8, len & 0xff); + 
scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, features, len, "get configuration"); + if (r < 0) + return r; + + /* parse the length once more, in case the drive decided to have other features suddenly :) */ + len = unaligned_read_be32(features); + log_debug("GET CONFIGURATION: size of features buffer %zu", len); + + if (len > sizeof(features)) { + log_debug("Cannot get features in a single query, truncating."); + len = sizeof(features); + } + + /* device features */ + for (size_t i = 8; i + 4 < len; i += 4 + features[i + 3]) { + unsigned feature; + + feature = unaligned_read_be16(&features[i]); + + switch (feature) { + case 0x00: + log_debug("GET CONFIGURATION: feature 'profiles', with %u entries", features[i + 3] / 4); + feature_profiles(c, features + i + 4, MIN(features[i + 3], len - i - 4)); + break; + default: + log_debug("GET CONFIGURATION: feature 0x%04x , with 0x%02x bytes", feature, features[i + 3]); + break; + } + } + + return c->has_media; +} + +static const char * const media_state_table[_MEDIA_STATE_MAX] = { + [MEDIA_STATE_BLANK] = "blank", + [MEDIA_STATE_APPENDABLE] = "appendable", + [MEDIA_STATE_COMPLETE] = "complete", + [MEDIA_STATE_OTHER] = "other", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(media_state, MediaState); + +static int dvd_ram_media_update_state(Context *c) { + struct scsi_cmd sc; + unsigned char dvdstruct[8]; + unsigned char format[12]; + unsigned char len; + int r; + + assert(c); + + /* Return 1 if media state is determined. 
*/ + + if (c->media_feature != FEATURE_DVD_RAM) + return 0; + + /* a write protected dvd-ram may report "complete" status */ + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_DVD_STRUCTURE); + scsi_cmd_set(&sc, 7, 0xC0); + scsi_cmd_set(&sc, 9, sizeof(dvdstruct)); + scsi_cmd_set(&sc, 11, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, dvdstruct, sizeof(dvdstruct), "read DVD structure"); + if (r < 0) + return r; + + if (dvdstruct[4] & 0x02) { + c->media_state = MEDIA_STATE_COMPLETE; + log_debug("Write-protected DVD-RAM media inserted"); + return 1; + } + + /* let's make sure we don't try to read unformatted media */ + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_FORMAT_CAPACITIES); + scsi_cmd_set(&sc, 8, sizeof(format)); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, format, sizeof(format), "read DVD format capacities"); + if (r < 0) + return r; + + len = format[3]; + if (len & 7 || len < 16) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid format capacities length."); + + switch (format[8] & 3) { + case 1: + /* This means that last format was interrupted or failed, blank dvd-ram discs are + * factory formatted. Take no action here as it takes quite a while to reformat a + * dvd-ram and it's not automatically started. 
*/ + log_debug("Unformatted DVD-RAM media inserted."); + return 1; + + case 2: + log_debug("Formatted DVD-RAM media inserted."); + return 0; + + case 3: + c->has_media = false; + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "Format capacities returned no media."); + } + + return 0; +} + +static int dvd_media_update_state(Context *c) { + struct scsi_cmd sc; + unsigned char buffer[32 * 2048]; + int r; + + r = dvd_ram_media_update_state(c); + if (r != 0) + return r; + + /* Take a closer look at formatted media (unformatted DVD+RW + * has "blank" status", DVD-RAM was examined earlier) and check + * for ISO and UDF PVDs or a fs superblock presence and do it + * in one ioctl (we need just sectors 0 and 16) */ + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_10); + scsi_cmd_set(&sc, 5, 0); + scsi_cmd_set(&sc, 8, sizeof(buffer)/2048); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, buffer, sizeof(buffer), "read first 32 blocks"); + if (r < 0) { + c->has_media = false; + return r; + } + + /* if any non-zero data is found in sector 16 (iso and udf) or + * eventually 0 (fat32 boot sector, ext2 superblock, etc), disc + * is assumed non-blank */ + + for (size_t offset = 32768; offset < 32768 + 2048; offset++) + if (buffer[offset] != 0) { + log_debug("Data in block 16, assuming complete."); + return 0; + } + + for (size_t offset = 0; offset < 2048; offset++) + if (buffer[offset] != 0) { + log_debug("Data in block 0, assuming complete."); + return 0; + } + + log_debug("No data in blocks 0 or 16, assuming blank."); + c->media_state = MEDIA_STATE_BLANK; + return 0; +} + +static int cd_media_info(Context *c) { + struct scsi_cmd sc; + unsigned char header[32]; + MediaState state; + int r; + + assert(c); + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_DISC_INFO); + scsi_cmd_set(&sc, 8, sizeof(header)); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, header, sizeof(header), "read disc information"); + if (r < 0) + return r; 
+ + c->has_media = true; + log_debug("disk type %02x", header[8]); + + state = (MediaState) (header[2] & 0x03); + log_debug("hardware reported media status: %s", strna(media_state_to_string(state))); + + /* exclude plain CDROM, some fake cdroms return 0 for "blank" media here */ + if (c->media_feature != FEATURE_CD_ROM) + c->media_state = state; + + /* fresh DVD-RW in restricted overwrite mode reports itself as + * "appendable"; change it to "blank" to make it consistent with what + * gets reported after blanking, and what userspace expects. */ + if (c->media_feature == FEATURE_DVD_RW_RO && state == MEDIA_STATE_APPENDABLE) + c->media_state = MEDIA_STATE_BLANK; + + /* DVD+RW discs (and DVD-RW in restricted mode) once formatted are + * always "complete", DVD-RAM are "other" or "complete" if the disc is + * write protected; we need to check the contents if it is blank */ + if (IN_SET(c->media_feature, FEATURE_DVD_RW_RO, FEATURE_DVD_PLUS_RW, FEATURE_DVD_PLUS_RW_DL, FEATURE_DVD_RAM) && + IN_SET(state, MEDIA_STATE_COMPLETE, MEDIA_STATE_OTHER)) { + r = dvd_media_update_state(c); + if (r < 0) + return r; + } + + /* "other" is e. g. 
DVD-RAM, can't append sessions there; DVDs in + * restricted overwrite mode can never append, only in sequential mode */ + if (c->media_feature != FEATURE_DVD_RW_RO && IN_SET(state, MEDIA_STATE_BLANK, MEDIA_STATE_APPENDABLE)) + c->media_session_next = header[10] << 8 | header[5]; + c->media_session_count = header[9] << 8 | header[4]; + c->media_track_count = header[11] << 8 | header[6]; + + return 0; +} + +static int cd_media_toc(Context *c) { + struct scsi_cmd sc; + unsigned char header[12]; + unsigned char toc[65536]; + unsigned num_tracks; + size_t len; + int r; + + assert(c); + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_TOC_PMA_ATIP); + scsi_cmd_set(&sc, 6, 1); + scsi_cmd_set(&sc, 8, sizeof(header)); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, header, sizeof(header), "read TOC"); + if (r < 0) + return r; + + len = unaligned_read_be16(header) + 2; + log_debug("READ TOC: len: %zu, start track: %u, end track: %u", len, header[2], header[3]); + + if (len > sizeof(toc)) + return -1; + /* empty media has no tracks */ + if (len < 8) + return 0; + + /* 2: first track, 3: last track */ + num_tracks = header[3] - header[2] + 1; + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_TOC_PMA_ATIP); + scsi_cmd_set(&sc, 6, header[2]); /* First Track/Session Number */ + scsi_cmd_set(&sc, 7, (len >> 8) & 0xff); + scsi_cmd_set(&sc, 8, len & 0xff); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, toc, len, "read TOC (tracks)"); + if (r < 0) + return r; + + /* Take care to not iterate beyond the last valid track as specified in + * the TOC, but also avoid going beyond the TOC length, just in case + * the last track number is invalidly large */ + for (size_t i = 4; i + 8 <= len && num_tracks > 0; i += 8, --num_tracks) { + bool is_data_track; + uint32_t block; + + is_data_track = (toc[i + 1] & 0x04) != 0; + block = unaligned_read_be32(&toc[i + 4]); + + log_debug("track=%u info=0x%x(%s) start_block=%"PRIu32, + toc[i + 2], toc[i 
+ 1] & 0x0FU, is_data_track ? "data":"audio", block); + + if (is_data_track) + c->media_track_count_data++; + else + c->media_track_count_audio++; + } + + scsi_cmd_init(&sc); + scsi_cmd_set(&sc, 0, GPCMD_READ_TOC_PMA_ATIP); + scsi_cmd_set(&sc, 2, 1); /* Session Info */ + scsi_cmd_set(&sc, 8, sizeof(header)); + scsi_cmd_set(&sc, 9, 0); + r = scsi_cmd_run_and_log(&sc, c->fd, header, sizeof(header), "read TOC (multi session)"); + if (r < 0) + return r; + + len = unaligned_read_be32(&header[8]); + log_debug("last track %u starts at block %zu", header[4+2], len); + c->media_session_last_offset = (uint64_t) len * 2048; + + return 0; +} + +static int open_drive(Context *c) { + int fd; + + assert(c); + assert(c->fd < 0); + + for (int cnt = 0;; cnt++) { + fd = open(arg_node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (fd >= 0) + break; + if (++cnt >= 20 || errno != EBUSY) + return log_debug_errno(errno, "Unable to open '%s': %m", arg_node); + + (void) usleep_safe(100 * USEC_PER_MSEC + random_u64_range(100 * USEC_PER_MSEC)); + } + + log_debug("probing: '%s'", arg_node); + c->fd = fd; + return 0; +} + +typedef struct FeatureToString { + Feature feature; + const char *str; +} FeatureToString; + +static const FeatureToString feature_to_string[] = { + { .feature = FEATURE_RW_NONREMOVABLE, .str = "RW_NONREMOVABLE", }, + { .feature = FEATURE_RW_REMOVABLE, .str = "RW_REMOVABLE", }, + + { .feature = FEATURE_MO_SE, .str = "MO_SE", }, + { .feature = FEATURE_MO_WO, .str = "MO_WO", }, + { .feature = FEATURE_MO_AS, .str = "MO_AS", }, + + { .feature = FEATURE_CD_ROM, .str = "CD", }, + { .feature = FEATURE_CD_R, .str = "CD_R", }, + { .feature = FEATURE_CD_RW, .str = "CD_RW", }, + + { .feature = FEATURE_DVD_ROM, .str = "DVD", }, + { .feature = FEATURE_DVD_R, .str = "DVD_R", }, + { .feature = FEATURE_DVD_RAM, .str = "DVD_RAM", }, + { .feature = FEATURE_DVD_RW_RO, .str = "DVD_RW_RO", }, + { .feature = FEATURE_DVD_RW_SEQ, .str = "DVD_RW_SEQ", }, + { .feature = FEATURE_DVD_R_DL_SEQ, .str = 
"DVD_R_DL_SEQ", }, + { .feature = FEATURE_DVD_R_DL_JR, .str = "DVD_R_DL_JR", }, + { .feature = FEATURE_DVD_RW_DL, .str = "DVD_RW_DL", }, + { .feature = FEATURE_DVD_R_DDR, .str = "DVD_R_DDR", }, + { .feature = FEATURE_DVD_PLUS_RW, .str = "DVD_PLUS_RW", }, + { .feature = FEATURE_DVD_PLUS_R, .str = "DVD_PLUS_R", }, + + { .feature = FEATURE_DDCD_ROM, .str = "DDCD", }, + { .feature = FEATURE_DDCD_R, .str = "DDCD_R", }, + { .feature = FEATURE_DDCD_RW, .str = "DDCD_RW", }, + + { .feature = FEATURE_DVD_PLUS_RW_DL, .str = "DVD_PLUS_RW_DL", }, + { .feature = FEATURE_DVD_PLUS_R_DL, .str = "DVD_PLUS_R_DL", }, + + { .feature = FEATURE_BD, .str = "BD", }, + { .feature = FEATURE_BD_R_SRM, .str = "BD_R_SRM", }, + { .feature = FEATURE_BD_R_RRM, .str = "BD_R_RRM", }, + { .feature = FEATURE_BD_RE, .str = "BD_RE", }, + + { .feature = FEATURE_HDDVD, .str = "HDDVD", }, + { .feature = FEATURE_HDDVD_R, .str = "HDDVD_R", }, + { .feature = FEATURE_HDDVD_RAM, .str = "HDDVD_RAM", }, + { .feature = FEATURE_HDDVD_RW, .str = "HDDVD_RW", }, + { .feature = FEATURE_HDDVD_R_DL, .str = "HDDVD_R_DL", }, + { .feature = FEATURE_HDDVD_RW_DL, .str = "HDDVD_RW_DL", }, + + { .feature = FEATURE_MRW, .str = "MRW", }, + { .feature = FEATURE_MRW_W, .str = "MRW_W", }, +}; + +static int feature_to_string_compare_func(const FeatureToString *a, const FeatureToString *b) { + assert(a); + assert(b); + + return CMP(a->feature, b->feature); +} + +static void print_feature(Feature feature, const char *prefix) { + const FeatureToString *found, in = { + .feature = feature, + }; + + assert(prefix); + + found = typesafe_bsearch(&in, feature_to_string, ELEMENTSOF(feature_to_string), feature_to_string_compare_func); + if (!found) + return (void) log_debug("Unknown feature 0x%02x, ignoring.", (unsigned) feature); + + printf("%s_%s=1\n", prefix, found->str); +} + +static void print_properties(const Context *c) { + const char *state; + + assert(c); + + printf("ID_CDROM=1\n"); + for (size_t i = 0; i < c->n_drive_feature; i++) + 
print_feature(c->drive_features[i], "ID_CDROM"); + + if (drive_has_feature(c, FEATURE_MO_SE) || + drive_has_feature(c, FEATURE_MO_WO) || + drive_has_feature(c, FEATURE_MO_AS)) + printf("ID_CDROM_MO=1\n"); + + if (drive_has_feature(c, FEATURE_DVD_RW_RO) || + drive_has_feature(c, FEATURE_DVD_RW_SEQ)) + printf("ID_CDROM_DVD_RW=1\n"); + + if (drive_has_feature(c, FEATURE_DVD_R_DL_SEQ) || + drive_has_feature(c, FEATURE_DVD_R_DL_JR)) + printf("ID_CDROM_DVD_R_DL=1\n"); + + if (drive_has_feature(c, FEATURE_DVD_R_DDR)) + printf("ID_CDROM_DVD_R=1\n"); + + if (drive_has_feature(c, FEATURE_BD_R_SRM) || + drive_has_feature(c, FEATURE_BD_R_RRM)) + printf("ID_CDROM_BD_R=1\n"); + + if (c->has_media) { + printf("ID_CDROM_MEDIA=1\n"); + print_feature(c->media_feature, "ID_CDROM_MEDIA"); + + if (IN_SET(c->media_feature, FEATURE_MO_SE, FEATURE_MO_WO, FEATURE_MO_AS)) + printf("ID_CDROM_MEDIA_MO=1\n"); + + if (IN_SET(c->media_feature, FEATURE_DVD_RW_RO, FEATURE_DVD_RW_SEQ)) + printf("ID_CDROM_MEDIA_DVD_RW=1\n"); + + if (IN_SET(c->media_feature, FEATURE_DVD_R_DL_SEQ, FEATURE_DVD_R_DL_JR)) + printf("ID_CDROM_MEDIA_DVD_R_DL=1\n"); + + if (c->media_feature == FEATURE_DVD_R_DDR) + printf("ID_CDROM_MEDIA_DVD_R=1\n"); + + if (IN_SET(c->media_feature, FEATURE_BD_R_SRM, FEATURE_BD_R_RRM)) + printf("ID_CDROM_MEDIA_BD_R=1\n"); + } + + state = media_state_to_string(c->media_state); + if (state) + printf("ID_CDROM_MEDIA_STATE=%s\n", state); + if (c->media_session_next > 0) + printf("ID_CDROM_MEDIA_SESSION_NEXT=%u\n", c->media_session_next); + if (c->media_session_count > 0) + printf("ID_CDROM_MEDIA_SESSION_COUNT=%u\n", c->media_session_count); + if (c->media_session_count > 1 && c->media_session_last_offset > 0) + printf("ID_CDROM_MEDIA_SESSION_LAST_OFFSET=%" PRIu64 "\n", c->media_session_last_offset); + if (c->media_track_count > 0) + printf("ID_CDROM_MEDIA_TRACK_COUNT=%u\n", c->media_track_count); + if (c->media_track_count_audio > 0) + printf("ID_CDROM_MEDIA_TRACK_COUNT_AUDIO=%u\n", 
c->media_track_count_audio); + if (c->media_track_count_data > 0) + printf("ID_CDROM_MEDIA_TRACK_COUNT_DATA=%u\n", c->media_track_count_data); +} + +static int help(void) { + printf("%s [OPTIONS...] DEVICE\n\n" + " -l --lock-media Lock the media (to enable eject request events)\n" + " -u --unlock-media Unlock the media\n" + " -e --eject-media Eject the media\n" + " -d --debug Print debug messages to stderr\n" + " -h --help Show this help text\n" + " --version Show package version\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "lock-media", no_argument, NULL, 'l' }, + { "unlock-media", no_argument, NULL, 'u' }, + { "eject-media", no_argument, NULL, 'e' }, + { "debug", no_argument, NULL, 'd' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + {} + }; + int c; + + while ((c = getopt_long(argc, argv, "deluh", options, NULL)) >= 0) + switch (c) { + case 'l': + arg_lock = true; + break; + case 'u': + arg_unlock = true; + break; + case 'e': + arg_eject = true; + break; + case 'd': + log_set_target(LOG_TARGET_CONSOLE); + log_set_max_level(LOG_DEBUG); + log_open(); + break; + case 'h': + return help(); + case 'v': + return version(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + arg_node = argv[optind]; + if (!arg_node) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No device specified."); + + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_(context_clear) Context c = CONTEXT_EMPTY; + int r; + + log_set_target(LOG_TARGET_AUTO); + udev_parse_config(); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = open_drive(&c); + if (r < 0) + return r; + + /* same data as original cdrom_id */ + r = cd_capability_compat(&c); + if (r < 0) + return r; + + /* check for media - don't bail if there's no media as we still need to + * to read profiles 
*/ + (void) cd_media_compat(&c); + + /* check if drive talks MMC */ + if (cd_inquiry(&c) < 0) + goto work; + + r = cd_profiles(&c); /* read drive and possibly current profile */ + if (r > 0) { + /* at this point we are guaranteed to have media in the drive - find out more about it */ + + /* get session/track info */ + (void) cd_media_toc(&c); + + /* get writable media state */ + (void) cd_media_info(&c); + } + +work: + /* lock the media, so we enable eject button events */ + if (arg_lock && c.has_media) { + log_debug("PREVENT_ALLOW_MEDIUM_REMOVAL (lock)"); + (void) media_lock(c.fd, true); + } + + if (arg_unlock && c.has_media) { + log_debug("PREVENT_ALLOW_MEDIUM_REMOVAL (unlock)"); + (void) media_lock(c.fd, false); + } + + if (arg_eject) { + log_debug("PREVENT_ALLOW_MEDIUM_REMOVAL (unlock)"); + (void) media_lock(c.fd, false); + log_debug("START_STOP_UNIT (eject)"); + (void) media_eject(c.fd); + } + + print_properties(&c); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/udev/dmi_memory_id/dmi_memory_id.c b/src/udev/dmi_memory_id/dmi_memory_id.c new file mode 100644 index 0000000..3f89cc7 --- /dev/null +++ b/src/udev/dmi_memory_id/dmi_memory_id.c @@ -0,0 +1,721 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * System Memory information + * + * Copyright (C) 2000-2002 Alan Cox + * Copyright (C) 2002-2020 Jean Delvare + * Copyright (C) 2020 Bastien Nocera + * + * Unless specified otherwise, all references are aimed at the "System + * Management BIOS Reference Specification, Version 3.7.0" document, + * available from http://www.dmtf.org/standards/smbios. + * + * Note to contributors: + * Please reference every value you add or modify, especially if the + * information does not come from the above mentioned specification. 
+ * + * Additional references: + * - Intel AP-485 revision 36 + * "Intel Processor Identification and the CPUID Instruction" + * http://www.intel.com/support/processors/sb/cs-009861.htm + * - DMTF Common Information Model + * CIM Schema version 2.19.1 + * http://www.dmtf.org/standards/cim/ + * - IPMI 2.0 revision 1.0 + * "Intelligent Platform Management Interface Specification" + * http://developer.intel.com/design/servers/ipmi/spec.htm + * - AMD publication #25481 revision 2.28 + * "CPUID Specification" + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/25481.pdf + * - BIOS Integrity Services Application Programming Interface version 1.0 + * http://www.intel.com/design/archives/wfm/downloads/bisspec.htm + * - DMTF DSP0239 version 1.1.0 + * "Management Component Transport Protocol (MCTP) IDs and Codes" + * http://www.dmtf.org/standards/pmci + * - "TPM Main, Part 2 TPM Structures" + * Specification version 1.2, level 2, revision 116 + * https://trustedcomputinggroup.org/tpm-main-specification/ + * - "PC Client Platform TPM Profile (PTP) Specification" + * Family "2.0", Level 00, Revision 00.43, January 26, 2015 + * https://trustedcomputinggroup.org/pc-client-platform-tpm-profile-ptp-specification/ + * - "RedFish Host Interface Specification" (DMTF DSP0270) + * https://www.dmtf.org/sites/default/files/DSP0270_1.0.1.pdf + */ + +#include + +#include "alloc-util.h" +#include "build.h" +#include "fileio.h" +#include "main-func.h" +#include "string-util.h" +#include "udev-util.h" +#include "unaligned.h" + +#define SUPPORTED_SMBIOS_VER 0x030300 + +#define OUT_OF_SPEC_STR "" + +#define SYS_FIRMWARE_DIR "/sys/firmware/dmi/tables" +#define SYS_ENTRY_FILE SYS_FIRMWARE_DIR "/smbios_entry_point" +#define SYS_TABLE_FILE SYS_FIRMWARE_DIR "/DMI" + +/* + * Per SMBIOS v2.8.0 and later, all structures assume a little-endian + * ordering convention. 
/* Returns true if the 'len' bytes at 'buf' add up to zero modulo 256.
 * An empty range (len == 0) trivially sums to zero and is therefore accepted. */
static bool verify_checksum(const uint8_t *buf, size_t len) {
        uint8_t acc = 0;

        /* Accumulate in 8 bits so the sum wraps around at 256. */
        while (len-- > 0)
                acc = (uint8_t) (acc + *buf++);

        return acc == 0;
}
/* 7.18.5 Memory Device — Extended Size
 *
 * Prints the capacity of memory device 'slot_num' in bytes. 'code' is the raw 32-bit Extended Size
 * field: bits 30:0 hold the size in MiB, bit 31 is reserved. The reserved bit is masked off so that
 * firmware which sets it does not inflate the reported capacity by 2 PiB. */
static void dmi_memory_device_extended_size(unsigned slot_num, uint32_t code) {
        uint64_t capacity = (uint64_t) (code & UINT32_C(0x7FFFFFFF)) * 1024 * 1024;

        printf("MEMORY_DEVICE_%u_SIZE=%"PRIu64"\n", slot_num, capacity);
}
/* 7.18 Memory Device — Minimum / Maximum / Configured Voltage
 *
 * 'code' is a voltage in millivolts; 0 prints nothing. The value is emitted in volts as
 * MEMORY_DEVICE_<slot_num>_<attr_suffix>. Multiples of 100 mV are printed with exactly one decimal
 * ("1.2", "1.0"), every other value with the shortest "%g" representation ("1.35"). */
static void dmi_memory_device_voltage_value(
                const char *attr_suffix,
                unsigned slot_num, uint16_t code) {
        double volts;

        if (code == 0)
                return;

        volts = (double) code / 1000;

        if (code % 100 != 0)
                printf("MEMORY_DEVICE_%u_%s=%g\n", slot_num, attr_suffix, volts);
        else
                /* "%.1f", not "%.1g": with %g the precision counts significant digits, so "%.1g"
                 * would print 1.2 V as "1" and 1.5 V as "2". */
                printf("MEMORY_DEVICE_%u_%s=%.1f\n", slot_num, attr_suffix, volts);
}
/* 7.18.3 Memory Device — Type Detail
 *
 * Prints MEMORY_DEVICE_<slot_num>_TYPE_DETAIL as a space-separated list of the names of all flag bits
 * set in 'code', or "None" when no bit other than bit 0 is set. */
static void dmi_memory_device_type_detail(unsigned slot_num, uint16_t code) {
        /* Names indexed by bit position; bit 0 has no name and is never reported. */
        static const char * const flag_name[] = {
                [1]  = "Other",
                [2]  = "Unknown",
                [3]  = "Fast-paged",
                [4]  = "Static Column",
                [5]  = "Pseudo-static",
                [6]  = "RAMBUS",
                [7]  = "Synchronous",
                [8]  = "CMOS",
                [9]  = "EDO",
                [10] = "Window DRAM",
                [11] = "Cache DRAM",
                [12] = "Non-Volatile",
                [13] = "Registered (Buffered)",
                [14] = "Unbuffered (Unregistered)",
                [15] = "LRDIMM",
        };
        const char *separator = "";

        if ((code & 0xFFFE) == 0) {
                printf("MEMORY_DEVICE_%u_TYPE_DETAIL=%s\n", slot_num, "None");
                return;
        }

        printf("MEMORY_DEVICE_%u_TYPE_DETAIL=", slot_num);
        for (size_t bit = 1; bit < sizeof(flag_name) / sizeof(flag_name[0]); bit++) {
                if ((code & (1 << bit)) == 0)
                        continue;

                printf("%s%s", separator, flag_name[bit]);
                separator = " ";        /* every name after the first is preceded by a space */
        }
        printf("\n");
}
/* 7.18.7 Memory Device — Memory Operating Mode Capability
 *
 * Prints MEMORY_DEVICE_<slot_num>_MEMORY_OPERATING_MODE_CAPABILITY as a space-separated list of the
 * names of all capability bits set in 'code'. Nothing at all is printed when no bit other than bit 0
 * is set. */
static void dmi_memory_device_operating_mode_capability(unsigned slot_num, uint16_t code) {
        /* Names indexed by bit position; bit 0 has no name and is never reported. */
        static const char * const capability[] = {
                [1] = "Other",
                [2] = "Unknown",
                [3] = "Volatile memory",
                [4] = "Byte-accessible persistent memory",
                [5] = "Block-accessible persistent memory",
        };
        size_t n_printed = 0;

        if ((code & 0xFFFE) == 0)
                return;

        printf("MEMORY_DEVICE_%u_MEMORY_OPERATING_MODE_CAPABILITY=", slot_num);
        for (size_t bit = 1; bit < sizeof(capability) / sizeof(capability[0]); bit++) {
                if (!(code & (1 << bit)))
                        continue;

                printf("%s%s", n_printed > 0 ? " " : "", capability[bit]);
                n_printed++;
        }
        printf("\n");
}
dmi_memory_array_location(data[0x04]); + dmi_memory_array_ec_type(data[0x06]); + if (DWORD(data + 0x07) != 0x80000000) + dmi_print_memory_size("MEMORY_ARRAY", "MAX_CAPACITY", -1, DWORD(data + 0x07), MEMORY_SIZE_UNIT_KB); + else if (h->length >= 0x17) + dmi_print_memory_size("MEMORY_ARRAY", "MAX_CAPACITY", -1, QWORD(data + 0x0F), MEMORY_SIZE_UNIT_BYTES); + + break; + + case 17: /* 7.18 Memory Device */ + slot_num = *next_slot_num; + *next_slot_num = slot_num + 1; + + log_debug("Memory Device: %u", slot_num); + if (h->length < 0x15) + break; + + dmi_memory_device_width("TOTAL_WIDTH", slot_num, WORD(data + 0x08)); + dmi_memory_device_width("DATA_WIDTH", slot_num, WORD(data + 0x0A)); + if (h->length >= 0x20 && WORD(data + 0x0C) == 0x7FFF) + dmi_memory_device_extended_size(slot_num, DWORD(data + 0x1C)); + else + dmi_memory_device_size(slot_num, WORD(data + 0x0C)); + dmi_memory_device_form_factor(slot_num, data[0x0E]); + dmi_memory_device_set(slot_num, data[0x0F]); + dmi_memory_device_string("LOCATOR", slot_num, h, data[0x10]); + dmi_memory_device_string("BANK_LOCATOR", slot_num, h, data[0x11]); + dmi_memory_device_type(slot_num, data[0x12]); + dmi_memory_device_type_detail(slot_num, WORD(data + 0x13)); + if (h->length < 0x17) + break; + + dmi_memory_device_speed("SPEED_MTS", slot_num, WORD(data + 0x15)); + if (h->length < 0x1B) + break; + + dmi_memory_device_string("MANUFACTURER", slot_num, h, data[0x17]); + dmi_memory_device_string("SERIAL_NUMBER", slot_num, h, data[0x18]); + dmi_memory_device_string("ASSET_TAG", slot_num, h, data[0x19]); + dmi_memory_device_string("PART_NUMBER", slot_num, h, data[0x1A]); + if (h->length < 0x1C) + break; + + dmi_memory_device_rank(slot_num, data[0x1B]); + if (h->length < 0x22) + break; + + dmi_memory_device_speed("CONFIGURED_SPEED_MTS", slot_num, WORD(data + 0x20)); + if (h->length < 0x28) + break; + + dmi_memory_device_voltage_value("MINIMUM_VOLTAGE", slot_num, WORD(data + 0x22)); + dmi_memory_device_voltage_value("MAXIMUM_VOLTAGE", 
/* Walks the SMBIOS structure table held in buf[0..len) and hands each structure to dmi_decode().
 *
 * 'num' is the number of structures to visit; 0 means the count is unknown, in which case the walk is
 * bounded only by 'len' and by the type 127 marker. Once the walk is over, MEMORY_ARRAY_NUM_DEVICES
 * is printed if dmi_decode() handed out at least one memory device slot number. */
static void dmi_table_decode(const uint8_t *buf, size_t len, uint16_t num) {
        const uint8_t *data = buf;
        unsigned next_slot_num = 0;

        /* 4 is the length of an SMBIOS structure header: the loop only continues while a complete
         * header is still available. With num == 0 the counter 'i' is irrelevant (and may wrap). */
        for (uint16_t i = 0; (i < num || num == 0) && data + 4 <= buf + len; i++) {
                struct dmi_header h = (struct dmi_header) {
                        .type = data[0],
                        .length = data[1],
                        .handle = WORD(data + 2),
                        .data = data,
                };
                /* Types 126 and 127 are never passed on to dmi_decode(). */
                bool display = !IN_SET(h.type, 126, 127);
                const uint8_t *next;

                /* If a short entry is found (less than 4 bytes), not only is it invalid, but we
                 * cannot reliably locate the next entry either. Stop here; nothing is logged. */
                if (h.length < 4)
                        break;

                /* Stop decoding at the end-of-table marker. */
                if (h.type == 127)
                        break;

                /* Look for the next structure: skip the formatted area (h.length bytes), then scan
                 * forward for the two consecutive zero bytes that end the trailing string area. The
                 * bound keeps both next[0] and next[1] inside the buffer. */
                next = data + h.length;
                while ((size_t)(next - buf + 1) < len && (next[0] != 0 || next[1] != 0))
                        next++;
                next += 2;

                /* Make sure the whole structure, terminator included, fits in the table. */
                if ((size_t)(next - buf) > len)
                        break;

                if (display)
                        dmi_decode(&h, &next_slot_num);

                data = next;
        }
        if (next_slot_num > 0)
                printf("MEMORY_ARRAY_NUM_DEVICES=%u\n", next_slot_num);
}
0 : base, len, + 0, NULL, (char **) &buf, &size); + if (r < 0) + return log_error_errno(r, "Failed to read table: %m"); + + dmi_table_decode(buf, size, num); + + return 0; +} + +/* Same thing for SMBIOS3 entry points */ +static int smbios3_decode(const uint8_t *buf, const char *devmem, bool no_file_offset) { + uint64_t offset; + + /* Don't let checksum run beyond the buffer */ + if (buf[0x06] > 0x20) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Entry point length too large (%"PRIu8" bytes, expected %u).", + buf[0x06], 0x18U); + + if (!verify_checksum(buf, buf[0x06])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to verify checksum."); + + offset = QWORD(buf + 0x10); + +#if __SIZEOF_SIZE_T__ != 8 + if (!no_file_offset && (offset >> 32) != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "64-bit addresses not supported on 32-bit systems."); +#endif + + return dmi_table(offset, DWORD(buf + 0x0C), 0, devmem, no_file_offset); +} + +static int smbios_decode(const uint8_t *buf, const char *devmem, bool no_file_offset) { + /* Don't let checksum run beyond the buffer */ + if (buf[0x05] > 0x20) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Entry point length too large (%"PRIu8" bytes, expected %u).", + buf[0x05], 0x1FU); + + if (!verify_checksum(buf, buf[0x05]) + || memcmp(buf + 0x10, "_DMI_", 5) != 0 + || !verify_checksum(buf + 0x10, 0x0F)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to verify checksum."); + + return dmi_table(DWORD(buf + 0x18), WORD(buf + 0x16), WORD(buf + 0x1C), + devmem, no_file_offset); +} + +static int legacy_decode(const uint8_t *buf, const char *devmem, bool no_file_offset) { + if (!verify_checksum(buf, 0x0F)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to verify checksum."); + + return dmi_table(DWORD(buf + 0x08), WORD(buf + 0x06), WORD(buf + 0x0C), + devmem, no_file_offset); +} + +static int help(void) { + printf("%s [OPTIONS...]\n\n" + " -F --from-dump FILE Read DMI information from a 
binary file\n" + " -h --help Show this help text\n" + " --version Show package version\n", + program_invocation_short_name); + return 0; +} + +static int parse_argv(int argc, char * const *argv) { + static const struct option options[] = { + { "from-dump", required_argument, NULL, 'F' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + {} + }; + int c; + + while ((c = getopt_long(argc, argv, "F:hV", options, NULL)) >= 0) + switch (c) { + case 'F': + arg_source_file = optarg; + break; + case 'V': + return version(); + case 'h': + return help(); + case '?': + return -EINVAL; + case 'v': + return version(); + default: + assert_not_reached(); + } + + return 1; +} + +static int run(int argc, char* const* argv) { + _cleanup_free_ uint8_t *buf = NULL; + bool no_file_offset = false; + size_t size; + int r; + + log_set_target(LOG_TARGET_AUTO); + udev_parse_config(); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + /* Read from dump if so instructed */ + r = read_full_file_full(AT_FDCWD, + arg_source_file ?: SYS_ENTRY_FILE, + 0, 0x20, 0, NULL, (char **) &buf, &size); + if (r < 0) + return log_full_errno(!arg_source_file && r == -ENOENT ? 
LOG_DEBUG : LOG_ERR, + r, "Reading \"%s\" failed: %m", + arg_source_file ?: SYS_ENTRY_FILE); + + if (!arg_source_file) { + arg_source_file = SYS_TABLE_FILE; + no_file_offset = true; + } + + if (size >= 24 && memory_startswith(buf, size, "_SM3_")) + return smbios3_decode(buf, arg_source_file, no_file_offset); + if (size >= 31 && memory_startswith(buf, size, "_SM_")) + return smbios_decode(buf, arg_source_file, no_file_offset); + if (size >= 15 && memory_startswith(buf, size, "_DMI_")) + return legacy_decode(buf, arg_source_file, no_file_offset); + + return -EINVAL; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/udev/fido_id/fido_id.c b/src/udev/fido_id/fido_id.c new file mode 100644 index 0000000..e01f37d --- /dev/null +++ b/src/udev/fido_id/fido_id.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* + * Identifies FIDO CTAP1 ("U2F")/CTAP2 security tokens based on the usage declared in their report + * descriptor and outputs suitable environment variables. + * + * Inspired by Andrew Lutomirski's 'u2f-hidraw-policy.c' + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "build.h" +#include "device-private.h" +#include "device-util.h" +#include "fd-util.h" +#include "fido_id_desc.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "path-util.h" +#include "string-util.h" +#include "udev-util.h" + +static const char *arg_device = NULL; + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + {} + }; + int c; + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + case 'h': + printf("%s [OPTIONS...] 
SYSFS_PATH\n\n" + " -h --help Show this help text\n" + " --version Show package version\n", + program_invocation_short_name); + return 0; + case 'v': + return version(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (argc > 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Error: unexpected argument."); + + arg_device = argv[optind]; + return 1; +} + +static int run(int argc, char **argv) { + _cleanup_(sd_device_unrefp) struct sd_device *device = NULL; + _cleanup_free_ char *desc_path = NULL; + _cleanup_close_ int fd = -EBADF; + struct sd_device *hid_device; + const char *sys_path; + uint8_t desc[HID_MAX_DESCRIPTOR_SIZE]; + ssize_t desc_len; + int r; + + log_set_target(LOG_TARGET_AUTO); + udev_parse_config(); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_device) { + r = sd_device_new_from_syspath(&device, arg_device); + if (r < 0) + return log_error_errno(r, "Failed to get device from syspath %s: %m", arg_device); + } else { + r = device_new_from_strv(&device, environ); + if (r < 0) + return log_error_errno(r, "Failed to get current device from environment: %m"); + } + + r = sd_device_get_parent(device, &hid_device); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get parent HID device: %m"); + + r = sd_device_get_syspath(hid_device, &sys_path); + if (r < 0) + return log_device_error_errno(hid_device, r, "Failed to get syspath for HID device: %m"); + + desc_path = path_join(sys_path, "report_descriptor"); + if (!desc_path) + return log_oom(); + + fd = open(desc_path, O_RDONLY | O_NOFOLLOW | O_CLOEXEC | O_NOCTTY); + if (fd < 0) + return log_device_error_errno(hid_device, errno, + "Failed to open report descriptor at '%s': %m", desc_path); + + desc_len = read(fd, desc, sizeof(desc)); + if (desc_len < 0) + return log_device_error_errno(hid_device, errno, + "Failed to read report descriptor at '%s': %m", desc_path); + if (desc_len == 0) + return 
/* HID report descriptor item encoding, see HID 1.11 section 6.2.2. */
#define HID_RPTDESC_FIRST_BYTE_LONG_ITEM 0xfeu
#define HID_RPTDESC_TYPE_GLOBAL 0x1u
#define HID_RPTDESC_TYPE_LOCAL 0x2u
#define HID_RPTDESC_TAG_USAGE_PAGE 0x0u
#define HID_RPTDESC_TAG_USAGE 0x0u

/*
 * Full 32-bit HID usage declared by FIDO CTAP1 ("U2F") and CTAP2 security tokens.
 * https://fidoalliance.org/specs/fido-u2f-v1.0-ps-20141009/fido-u2f-u2f_hid.h-v1.0-ps-20141009.txt
 * https://fidoalliance.org/specs/fido-v2.0-ps-20190130/fido-client-to-authenticator-protocol-v2.0-ps-20190130.html#usb-discovery
 * https://www.usb.org/sites/default/files/hutrr48.pdf
 */
#define FIDO_FULL_USAGE_CTAPHID 0xf1d00001u

/*
 * Scans the HID report descriptor desc[0..desc_len) for a usage equal to FIDO_FULL_USAGE_CTAPHID.
 *
 * Returns 1 as soon as such a usage is declared, 0 when the end of the descriptor is reached without
 * one, and -EINVAL when a short item's data runs past the end of the descriptor, a long item lacks
 * its length byte, or a usage page is coded on more than 2 bytes. A long item whose announced length
 * reaches past the end of the descriptor simply ends the scan (result 0).
 * https://www.usb.org/sites/default/files/documents/hid1_11.pdf (Section 6.2.2)
 */
int is_fido_security_token_desc(const uint8_t *desc, size_t desc_len) {
        /* Number of data bytes selected by the two size bits of a short item prefix. */
        static const uint8_t payload_len[4] = { 0, 1, 2, 4 };
        uint32_t usage = 0;
        size_t pos = 0;

        while (pos < desc_len) {
                const uint8_t prefix = desc[pos];

                if (prefix == HID_RPTDESC_FIRST_BYTE_LONG_ITEM) {
                        /* Long items carry nothing of interest here: skip the three header bytes
                         * (long item marker, data length, tag) plus the announced data length. */
                        if (pos + 1 >= desc_len)
                                return -EINVAL;

                        pos += (size_t) desc[pos + 1] + 3;
                        continue;
                }

                /* Short item prefix: tag in bits 7..4, type in bits 3..2, size code in bits 1..0. */
                const uint8_t tag = prefix >> 4;
                const uint8_t type = (prefix >> 2) & 0x3;
                const size_t size = payload_len[prefix & 0x3];

                /* Step over the prefix; pos <= desc_len still holds afterwards. */
                pos++;
                if (size > desc_len - pos)
                        return -EINVAL;

                /* The data bytes are little-endian: fold them in starting from the last one. */
                uint32_t value = 0;
                for (size_t i = size; i > 0; i--)
                        value = (value << 8) | desc[pos + i - 1];
                pos += size;

                switch (type) {

                case HID_RPTDESC_TYPE_GLOBAL:
                        if (tag != HID_RPTDESC_TAG_USAGE_PAGE)
                                break;

                        /* A usage page is a 16 bit value, hence coded on at most 2 bytes. */
                        if (size > 2)
                                return -EINVAL;

                        /* It provides the upper 16 bits of the usages that follow. */
                        usage = (value & 0x0000ffffu) << 16;
                        break;

                case HID_RPTDESC_TYPE_LOCAL:
                        if (tag != HID_RPTDESC_TAG_USAGE)
                                break;

                        /* A 4 byte usage is complete on its own; a shorter one is combined with
                         * the current usage page. */
                        usage = size == 4 ? value : (usage & 0xffff0000u) | (value & 0x0000ffffu);
                        if (usage == FIDO_FULL_USAGE_CTAPHID)
                                return 1;
                        break;

                default:
                        break;
                }
        }

        return 0;
}
0x00, 0x25, + 0x01, 0x75, 0x01, 0x95, 0x08, 0x81, 0x02, 0x95, 0x01, 0x75, 0x08, 0x81, 0x01, 0x95, 0x05, + 0x75, 0x01, 0x05, 0x08, 0x19, 0x01, 0x29, 0x05, 0x91, 0x02, 0x95, 0x01, 0x75, 0x03, 0x91, + 0x01, 0x95, 0x06, 0x75, 0x08, 0x15, 0x00, 0x25, 0x65, 0x05, 0x07, 0x19, 0x00, 0x29, 0x65, + 0x81, 0x00, 0x09, 0x03, 0x75, 0x08, 0x95, 0x08, 0xb1, 0x02, 0xc0, + 0x06, 0xd0, 0xf1, 0x09, 0x01, 0xa1, 0x01, 0x09, 0x20, 0x15, 0x00, 0x26, 0xff, 0x00, 0x75, + 0x08, 0x95, 0x40, 0x81, 0x02, 0x09, 0x21, 0x15, 0x00, 0x26, 0xff, 0x00, 0x75, 0x08, 0x95, + 0x40, 0x91, 0x02, 0xc0, + }; + assert_se(is_fido_security_token_desc(FIDO_HID_DESC_2, sizeof(FIDO_HID_DESC_2)) > 0); +} + +TEST(is_fido_security_token_desc__non_fido) { + /* Wrong usage page */ + static const uint8_t NON_FIDO_HID_DESC_1[] = { + 0x06, 0xd0, 0xf0, 0x09, 0x01, 0xa1, 0x01, 0x09, 0x20, 0x15, 0x00, 0x26, 0xff, 0x00, 0x75, + 0x08, 0x95, 0x40, 0x81, 0x02, 0x09, 0x21, 0x15, 0x00, 0x26, 0xff, 0x00, 0x75, 0x08, 0x95, + 0x40, 0x91, 0x02, 0xc0, + }; + assert_se(is_fido_security_token_desc(NON_FIDO_HID_DESC_1, sizeof(NON_FIDO_HID_DESC_1)) == 0); + + /* Wrong usage */ + static const uint8_t NON_FIDO_HID_DESC_2[] = { + 0x06, 0xd0, 0xf1, 0x09, 0x02, 0xa1, 0x01, 0x09, 0x20, 0x15, 0x00, 0x26, 0xff, 0x00, 0x75, + 0x08, 0x95, 0x40, 0x81, 0x02, 0x09, 0x21, 0x15, 0x00, 0x26, 0xff, 0x00, 0x75, 0x08, 0x95, + 0x40, 0x91, 0x02, 0xc0, + }; + assert_se(is_fido_security_token_desc(NON_FIDO_HID_DESC_2, sizeof(NON_FIDO_HID_DESC_2)) == 0); + + static const uint8_t NON_FIDO_HID_DESC_3[] = { + 0x05, 0x01, 0x09, 0x06, 0xa1, 0x01, 0x05, 0x07, 0x19, 0xe0, 0x29, 0xe7, 0x15, 0x00, 0x25, + 0x01, 0x75, 0x01, 0x95, 0x08, 0x81, 0x02, 0x95, 0x01, 0x75, 0x08, 0x81, 0x01, 0x95, 0x05, + 0x75, 0x01, 0x05, 0x08, 0x19, 0x01, 0x29, 0x05, 0x91, 0x02, 0x95, 0x01, 0x75, 0x03, 0x91, + 0x01, 0x95, 0x06, 0x75, 0x08, 0x15, 0x00, 0x25, 0x65, 0x05, 0x07, 0x19, 0x00, 0x29, 0x65, + 0x81, 0x00, 0x09, 0x03, 0x75, 0x08, 0x95, 0x08, 0xb1, 0x02, 0xc0, + }; + 
assert_se(is_fido_security_token_desc(NON_FIDO_HID_DESC_3, sizeof(NON_FIDO_HID_DESC_3)) == 0); +} + +TEST(is_fido_security_token_desc__invalid) { + /* Size coded on 1 byte, but no byte given */ + static const uint8_t INVALID_HID_DESC_1[] = { 0x01 }; + assert_se(is_fido_security_token_desc(INVALID_HID_DESC_1, sizeof(INVALID_HID_DESC_1)) < 0); + + /* Size coded on 2 bytes, but only 1 byte given */ + static const uint8_t INVALID_HID_DESC_2[] = { 0x02, 0x01 }; + assert_se(is_fido_security_token_desc(INVALID_HID_DESC_2, sizeof(INVALID_HID_DESC_2)) < 0); + + /* Size coded on 4 bytes, but only 3 bytes given */ + static const uint8_t INVALID_HID_DESC_3[] = { 0x03, 0x01, 0x02, 0x03 }; + assert_se(is_fido_security_token_desc(INVALID_HID_DESC_3, sizeof(INVALID_HID_DESC_3)) < 0); + + /* Long item without a size byte */ + static const uint8_t INVALID_HID_DESC_4[] = { 0xfe }; + assert_se(is_fido_security_token_desc(INVALID_HID_DESC_4, sizeof(INVALID_HID_DESC_4)) < 0); + + /* Usage pages are coded on at most 2 bytes */ + static const uint8_t INVALID_HID_DESC_5[] = { 0x07, 0x01, 0x02, 0x03, 0x04 }; + assert_se(is_fido_security_token_desc(INVALID_HID_DESC_5, sizeof(INVALID_HID_DESC_5)) < 0); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/udev/fuzz-udev-rule-parse-value.c b/src/udev/fuzz-udev-rule-parse-value.c new file mode 100644 index 0000000..1817c15 --- /dev/null +++ b/src/udev/fuzz-udev-rule-parse-value.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "fuzz.h" +#include "udev-rules.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_free_ char *str = NULL; + int r; + char *value = UINT_TO_PTR(0x12345678U); + char *endpos = UINT_TO_PTR(0x87654321U); + + fuzz_setup_logging(); + + assert_se(str = malloc(size + 1)); + memcpy(str, data, size); + str[size] = '\0'; + + r = udev_rule_parse_value(str, &value, &endpos); + if (r < 0) { + /* not modified on failure */ + assert_se(value == 
UINT_TO_PTR(0x12345678U)); + assert_se(endpos == UINT_TO_PTR(0x87654321U)); + } else { + assert_se(endpos <= str + size); + assert_se(endpos > str + 1); + } + + return 0; +} diff --git a/src/udev/fuzz-udev-rules.c b/src/udev/fuzz-udev-rules.c new file mode 100644 index 0000000..0a1056d --- /dev/null +++ b/src/udev/fuzz-udev-rules.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "fd-util.h" +#include "fs-util.h" +#include "fuzz.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "udev-rules.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(udev_rules_freep) UdevRules *rules = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(unlink_tempfilep) char filename[] = "/tmp/fuzz-udev-rules.XXXXXX"; + int r; + + if (outside_size_range(size, 0, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(fmkostemp_safe(filename, "r+", &f) == 0); + if (size != 0) + assert_se(fwrite(data, size, 1, f) == 1); + fflush(f); + + assert_se(rules = udev_rules_new(RESOLVE_NAME_EARLY)); + r = udev_rules_parse_file(rules, filename, /* extra_checks = */ false, NULL); + log_info_errno(r, "Parsing %s: %m", filename); + assert_se(r >= 0 || /* OK */ + r == -ENOBUFS); /* line length exceeded */ + + return 0; +} diff --git a/src/udev/fuzz-udev-rules.options b/src/udev/fuzz-udev-rules.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/udev/fuzz-udev-rules.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/udev/generate-keyboard-keys-gperf.sh b/src/udev/generate-keyboard-keys-gperf.sh new file mode 100755 index 0000000..9f4364c --- /dev/null +++ b/src/udev/generate-keyboard-keys-gperf.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu + +# shellcheck disable=SC1004 +awk ' + BEGIN { + print "%{\n\ +#if __GNUC__ >= 7\n\ +_Pragma(\"GCC diagnostic ignored \\\"-Wimplicit-fallthrough\\\"\")\n\ +#endif\n\ +%}" + print "struct key_name { 
const char* name; unsigned short id; };" + print "%null-strings" + print "%%" + } + + /^KEY_/ { print tolower(substr($1 ,5)) ", " $1 } + { print tolower($1) ", " $1 } +' <"${1:?}" diff --git a/src/udev/generate-keyboard-keys-list.sh b/src/udev/generate-keyboard-keys-list.sh new file mode 100755 index 0000000..ead3113 --- /dev/null +++ b/src/udev/generate-keyboard-keys-list.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eu +set -o pipefail + +${1:?} -dM -include linux/input.h - / { next } + /^#define[ \t]+(KEY|BTN)_[^ ]+[ \t]+[0-9BK]/ { print $2 } +' diff --git a/src/udev/iocost/iocost.c b/src/udev/iocost/iocost.c new file mode 100644 index 0000000..2b2633e --- /dev/null +++ b/src/udev/iocost/iocost.c @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "build.h" +#include "cgroup-util.h" +#include "conf-parser.h" +#include "device-util.h" +#include "devnum-util.h" +#include "main-func.h" +#include "path-util.h" +#include "pretty-print.h" +#include "udev-util.h" +#include "verbs.h" + +static char *arg_target_solution = NULL; +STATIC_DESTRUCTOR_REGISTER(arg_target_solution, freep); + +static int parse_config(void) { + static const ConfigTableItem items[] = { + { "IOCost", "TargetSolution", config_parse_string, 0, &arg_target_solution }, + }; + int r; + + r = config_parse( + NULL, + "/etc/udev/iocost.conf", + NULL, + "IOCost\0", + config_item_table_lookup, + items, + CONFIG_PARSE_WARN, + NULL, + NULL); + if (r < 0) + return r; + + if (!arg_target_solution) { + arg_target_solution = strdup("naive"); + if (!arg_target_solution) + return log_oom(); + } + + log_debug("Target solution: %s", arg_target_solution); + return 0; +} + +static int help(void) { + printf("%s [OPTIONS...]\n\n" + "Set up iocost model and qos solutions for block devices\n" + "\nCommands:\n" + " apply [SOLUTION] Apply solution for the 
device if\n" + " found, do nothing otherwise\n" + " query Query the known solution for\n" + " the device\n" + "\nOptions:\n" + " -h --help Show this help\n" + " --version Show package version\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + {} + }; + + int c; + + assert(argc >= 1); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int get_known_solutions(sd_device *device, int log_level, char ***ret_solutions, const char **ret_selected) { + _cleanup_free_ char **s = NULL; + const char *value, *found; + int r; + + assert(ret_solutions); + assert(ret_selected); + + r = sd_device_get_property_value(device, "IOCOST_SOLUTIONS", &value); + if (r == -ENOENT) + return log_device_full_errno(device, log_level, r, "No iocost solution found for device."); + if (r < 0) + return log_device_error_errno(device, r, "Failed to query solutions from device: %m"); + + s = strv_split(value, WHITESPACE); + if (!s) + return log_oom(); + if (strv_isempty(s)) + return log_device_error_errno(device, SYNTHETIC_ERRNO(EINVAL), + "IOCOST_SOLUTIONS exists in hwdb but is empty."); + + found = strv_find(s, arg_target_solution); + if (found) { + *ret_selected = found; + log_device_debug(device, "Selected solution based on target solution: %s", *ret_selected); + } else { + *ret_selected = s[0]; + log_device_debug(device, "Selected first available solution: %s", *ret_selected); + } + + *ret_solutions = TAKE_PTR(s); + return 0; +} + +static int query_named_solution( + sd_device *device, + const char *name, + const char **ret_model, + const char **ret_qos) { + + 
_cleanup_free_ char *upper_name = NULL, *qos_key = NULL, *model_key = NULL; + const char *qos, *model; + int r; + + assert(name); + assert(ret_qos); + assert(ret_model); + + upper_name = strdup(name); + if (!upper_name) + return log_oom(); + + ascii_strupper(upper_name); + string_replace_char(upper_name, '-', '_'); + + qos_key = strjoin("IOCOST_QOS_", upper_name); + if (!qos_key) + return log_oom(); + + model_key = strjoin("IOCOST_MODEL_", upper_name); + if (!model_key) + return log_oom(); + + r = sd_device_get_property_value(device, qos_key, &qos); + if (r == -ENOENT) + return log_device_debug_errno(device, r, "No value found for key %s, skipping iocost logic.", qos_key); + if (r < 0) + return log_device_error_errno(device, r, "Failed to obtain QoS for iocost solution from device: %m"); + + r = sd_device_get_property_value(device, model_key, &model); + if (r == -ENOENT) + return log_device_debug_errno(device, r, "No value found for key %s, skipping iocost logic.", model_key); + if (r < 0) + return log_device_error_errno(device, r, "Failed to obtain model for iocost solution from device: %m"); + + *ret_qos = qos; + *ret_model = model; + + return 0; +} + +static int apply_solution_for_path(const char *path, const char *name) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _cleanup_strv_free_ char **solutions = NULL; + _cleanup_free_ char *qos = NULL, *model = NULL; + const char *qos_params, *model_params; + dev_t devnum; + int r; + + r = sd_device_new_from_path(&device, path); + if (r < 0) + return log_error_errno(r, "Error looking up device: %m"); + + r = sd_device_get_devnum(device, &devnum); + if (r < 0) + return log_device_error_errno(device, r, "Error getting devnum: %m"); + + if (!name) { + r = get_known_solutions(device, LOG_DEBUG, &solutions, &name); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + } + + r = query_named_solution(device, name, &model_params, &qos_params); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + if 
(asprintf(&qos, DEVNUM_FORMAT_STR " enable=1 ctrl=user %s", DEVNUM_FORMAT_VAL(devnum), qos_params) < 0) + return log_oom(); + + if (asprintf(&model, DEVNUM_FORMAT_STR " model=linear ctrl=user %s", DEVNUM_FORMAT_VAL(devnum), model_params) < 0) + return log_oom(); + + log_debug("Applying iocost parameters to %s using solution '%s'\n" + "\tio.cost.qos: %s\n" + "\tio.cost.model: %s\n", + path, name, qos, model); + + r = cg_set_attribute("io", NULL, "io.cost.qos", qos); + if (r < 0) { + log_device_full_errno(device, r == -ENOENT ? LOG_DEBUG : LOG_ERR, r, "Failed to set io.cost.qos: %m"); + return r == -ENOENT ? 0 : r; + } + + r = cg_set_attribute("io", NULL, "io.cost.model", model); + if (r < 0) { + log_device_full_errno(device, r == -ENOENT ? LOG_DEBUG : LOG_ERR, r, "Failed to set io.cost.model: %m"); + return r == -ENOENT ? 0 : r; + } + + return 0; +} + +static int query_solutions_for_path(const char *path) { + _cleanup_(sd_device_unrefp) sd_device *device = NULL; + _cleanup_strv_free_ char **solutions = NULL; + const char *selected_solution, *model_name; + int r; + + r = sd_device_new_from_path(&device, path); + if (r < 0) + return log_error_errno(r, "Error looking up device: %m"); + + r = device_get_model_string(device, &model_name); + if (r == -ENOENT) { + log_device_info(device, "Device model not found"); + return 0; + } + if (r < 0) + return log_device_error_errno(device, r, "Model name for device %s is unknown", path); + + r = get_known_solutions(device, LOG_INFO, &solutions, &selected_solution); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + log_info("Known solutions for %s model name: \"%s\"\n" + "Preferred solution: %s\n" + "Solution that would be applied: %s", + path, model_name, + arg_target_solution, selected_solution); + + STRV_FOREACH(s, solutions) { + const char *model, *qos; + + if (query_named_solution(device, *s, &model, &qos) < 0) + continue; + + log_info("%s: io.cost.qos: %s\n" + "%s: io.cost.model: %s", *s, qos, *s, model); + } + + 
/* Program entry point proper: sets up logging, parses the command line and the configuration file,
 * then hands over to the verb dispatcher. */
static int run(int argc, char *argv[]) {
        int r;

        log_setup();

        r = parse_argv(argc, argv);
        if (r <= 0)
                /* Either parsing failed, or --help/--version was handled already. */
                return r;

        r = parse_config();
        return r < 0 ? r : iocost_main(argc, argv);
}
+ +[IOCost] +#TargetSolution=naive diff --git a/src/udev/meson.build b/src/udev/meson.build new file mode 100644 index 0000000..824ec47 --- /dev/null +++ b/src/udev/meson.build @@ -0,0 +1,273 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +udevadm_sources = files( + 'udevadm-control.c', + 'udevadm-hwdb.c', + 'udevadm-info.c', + 'udevadm-lock.c', + 'udevadm-monitor.c', + 'udevadm-settle.c', + 'udevadm-test-builtin.c', + 'udevadm-test.c', + 'udevadm-trigger.c', + 'udevadm-util.c', + 'udevadm-verify.c', + 'udevadm-wait.c', + 'udevadm.c', + 'udevd.c', +) + +libudevd_core_sources = files( + 'net/link-config.c', + 'udev-ctrl.c', + 'udev-event.c', + 'udev-format.c', + 'udev-manager.c', + 'udev-node.c', + 'udev-rules.c', + 'udev-spawn.c', + 'udev-watch.c', + 'udev-worker.c', + 'udev-builtin-btrfs.c', + 'udev-builtin-hwdb.c', + 'udev-builtin-input_id.c', + 'udev-builtin-keyboard.c', + 'udev-builtin-net_driver.c', + 'udev-builtin-net_id.c', + 'udev-builtin-net_setup_link.c', + 'udev-builtin-path_id.c', + 'udev-builtin-usb_id.c', + 'udev-builtin.c', +) + +if conf.get('HAVE_KMOD') == 1 + libudevd_core_sources += files('udev-builtin-kmod.c') +endif + +if conf.get('HAVE_BLKID') == 1 + libudevd_core_sources += files('udev-builtin-blkid.c') +endif + +if conf.get('HAVE_ACL') == 1 + libudevd_core_sources += files('udev-builtin-uaccess.c') +endif + +############################################################ + +generate_keyboard_keys_list = find_program('generate-keyboard-keys-list.sh') +keyboard_keys_list_txt = custom_target( + 'keyboard-keys-list.txt', + output : 'keyboard-keys-list.txt', + command : [generate_keyboard_keys_list, cpp], + capture : true) + +generate_keyboard_keys_gperf = find_program('generate-keyboard-keys-gperf.sh') +fname = 'keyboard-keys-from-name.gperf' +gperf_file = custom_target( + fname, + input : keyboard_keys_list_txt, + output : fname, + command : [generate_keyboard_keys_gperf, '@INPUT@'], + capture : true) + +fname = 'keyboard-keys-from-name.h' 
+keyboard_keys_from_name_h = custom_target( + fname, + input : gperf_file, + output : fname, + command : [gperf, + '-L', 'ANSI-C', '-t', + '-N', 'keyboard_lookup_key', + '-H', 'hash_key_name', + '-p', '-C', + '@INPUT@'], + capture : true) + +############################################################ + +udev_link_gperf_gperf = files('net/link-config-gperf.gperf') + +link_config_gperf_c = custom_target( + 'link-config-gperf.c', + input : udev_link_gperf_gperf, + output : 'link-config-gperf.c', + command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@']) + +############################################################ + +if get_option('link-udev-shared') + udev_link_with = [libshared] + udev_rpath = pkglibdir +else + udev_link_with = [libshared_static, + libsystemd_static] + udev_rpath = '' +endif + +############################################################ + +libudevd_core = static_library( + 'udev-core', + libudevd_core_sources, + link_config_gperf_c, + keyboard_keys_from_name_h, + include_directories : includes + include_directories('net'), + link_with : udev_link_with, + dependencies : [libblkid, + libkmod, + userspace], + build_by_default : false) + +udev_dependencies = [ + libacl, + libblkid, + libkmod, + threads, +] + +udev_plugin_template = executable_template + { + 'public' : true, + 'link_with' : udev_link_with, + 'install_rpath' : udev_rpath, + 'install_dir' : udevlibexecdir, +} + +udev_common_template = { + 'link_with' : [ + libshared, + libudevd_core, + ], + 'dependencies' : [ + libacl, + threads, + ], +} +udev_test_template = test_template + udev_common_template +udev_fuzz_template = fuzz_template + udev_common_template + +executables += [ + executable_template + { + 'name' : 'udevadm', + 'public' : true, + 'sources' : udevadm_sources, + 'link_with' : [libudevd_core], + 'dependencies' : udev_dependencies, + 'install_rpath' : udev_rpath, + }, + udev_plugin_template + { + 'name' : 'ata_id', + 'sources' : files('ata_id/ata_id.c'), + }, + 
udev_plugin_template + { + 'name' : 'cdrom_id', + 'sources' : files('cdrom_id/cdrom_id.c'), + }, + udev_plugin_template + { + 'name' : 'dmi_memory_id', + 'conditions' : ['HAVE_DMI'], + 'sources' : files('dmi_memory_id/dmi_memory_id.c'), + }, + udev_plugin_template + { + 'name' : 'fido_id', + 'sources' : files( + 'fido_id/fido_id.c', + 'fido_id/fido_id_desc.c', + ), + }, + udev_plugin_template + { + 'name' : 'iocost', + 'sources' : files('iocost/iocost.c'), + }, + udev_plugin_template + { + 'name' : 'mtd_probe', + 'sources' : files( + 'mtd_probe/mtd_probe.c', + 'mtd_probe/probe_smartmedia.c', + ), + }, + udev_plugin_template + { + 'name' : 'scsi_id', + 'sources' : files( + 'scsi_id/scsi_id.c', + 'scsi_id/scsi_serial.c', + ), + }, + udev_plugin_template + { + 'name' : 'v4l_id', + 'sources' : files('v4l_id/v4l_id.c'), + }, + test_template + { + 'sources' : files( + 'fido_id/test-fido-id-desc.c', + 'fido_id/fido_id_desc.c', + ), + 'suite' : 'udev', + }, + udev_test_template + { + 'sources' : files('net/test-link-config-tables.c'), + 'suite' : 'udev', + }, + udev_test_template + { + 'sources' : files('test-udev-builtin.c'), + }, + udev_test_template + { + 'sources' : files('test-udev-format.c'), + }, + udev_test_template + { + 'sources' : files('test-udev-manager.c'), + }, + udev_test_template + { + 'sources' : files('test-udev-node.c'), + }, + udev_test_template + { + 'sources' : files('test-udev-rule-runner.c'), + 'dependencies' : udev_dependencies + [ + libselinux, + ], + 'type' : 'manual', + }, + udev_test_template + { + 'sources' : files('test-udev-rules.c'), + }, + udev_test_template + { + 'sources' : files('test-udev-spawn.c'), + }, + fuzz_template + { + 'sources' : files( + 'fido_id/fuzz-fido-id-desc.c', + 'fido_id/fido_id_desc.c', + ), + }, + udev_fuzz_template + { + 'sources' : files('net/fuzz-link-parser.c'), + }, + udev_fuzz_template + { + 'sources' : files('fuzz-udev-rule-parse-value.c'), + }, + udev_fuzz_template + { + 'sources' : 
files('fuzz-udev-rules.c'), + }, +] + +meson.add_install_script(sh, '-c', ln_s.format(bindir / 'udevadm', + libexecdir / 'systemd-udevd')) + +if install_sysconfdir_samples + install_data('udev.conf', + install_dir : configfiledir / 'udev') + install_data('iocost/iocost.conf', + install_dir : configfiledir / 'udev') +endif + +udev_pc = custom_target( + 'udev.pc', + input : 'udev.pc.in', + output : 'udev.pc', + command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'], + install : pkgconfigdatadir != 'no', + install_tag : 'devel', + install_dir : pkgconfigdatadir) + +if install_sysconfdir + install_emptydir(sysconfdir / 'udev/rules.d') +endif diff --git a/src/udev/mtd_probe/mtd_probe.c b/src/udev/mtd_probe/mtd_probe.c new file mode 100644 index 0000000..1035320 --- /dev/null +++ b/src/udev/mtd_probe/mtd_probe.c @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © 2010 - Maxim Levitsky + * + * mtd_probe is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mtd_probe is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with mtd_probe; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "build.h" +#include "fd-util.h" +#include "main-func.h" +#include "mtd_probe.h" + +static const char *arg_device = NULL; + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + {} + }; + int c; + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + case 'h': + printf("%s /dev/mtd[n]\n\n" + " -h --help Show this help text\n" + " --version Show package version\n", + program_invocation_short_name); + return 0; + case 'v': + return version(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (argc > 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Error: unexpected argument."); + + arg_device = argv[optind]; + return 1; +} + +static int run(int argc, char** argv) { + _cleanup_close_ int mtd_fd = -EBADF; + mtd_info_t mtd_info; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + mtd_fd = open(argv[1], O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (mtd_fd < 0) + return log_error_errno(errno, "Failed to open: %m"); + + if (ioctl(mtd_fd, MEMGETINFO, &mtd_info) < 0) + return log_error_errno(errno, "MEMGETINFO ioctl failed: %m"); + + return probe_smart_media(mtd_fd, &mtd_info); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/udev/mtd_probe/mtd_probe.h b/src/udev/mtd_probe/mtd_probe.h new file mode 100644 index 0000000..ae03a7d --- /dev/null +++ b/src/udev/mtd_probe/mtd_probe.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +/* + * Copyright © 2010 - Maxim Levitsky + * + * mtd_probe is free software; you 
can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mtd_probe is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with mtd_probe; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301 USA + */ + +#include + +#include "macro.h" + +/* Full oob structure as written on the flash */ +struct sm_oob { + uint32_t reserved; + uint8_t data_status; + uint8_t block_status; + uint8_t lba_copy1[2]; + uint8_t ecc2[3]; + uint8_t lba_copy2[2]; + uint8_t ecc1[3]; +} _packed_; + +/* one sector is always 512 bytes, but it can consist of two nand pages */ +#define SM_SECTOR_SIZE 512 + +/* oob area is also 16 bytes, but might be from two pages */ +#define SM_OOB_SIZE 16 + +/* This is the maximum zone size, and all devices that have more than one zone + have this size */ +#define SM_MAX_ZONE_SIZE 1024 + +/* support for small page nand */ +#define SM_SMALL_PAGE 256 +#define SM_SMALL_OOB_SIZE 8 + +int probe_smart_media(int mtd_fd, mtd_info_t *info); diff --git a/src/udev/mtd_probe/probe_smartmedia.c b/src/udev/mtd_probe/probe_smartmedia.c new file mode 100644 index 0000000..368fab8 --- /dev/null +++ b/src/udev/mtd_probe/probe_smartmedia.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © 2010 - Maxim Levitsky + * + * mtd_probe is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ * + * mtd_probe is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with mtd_probe; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "mtd_probe.h" + +static const uint8_t cis_signature[] = { + 0x01, 0x03, 0xD9, 0x01, 0xFF, 0x18, 0x02, 0xDF, 0x01, 0x20 +}; + +int probe_smart_media(int mtd_fd, mtd_info_t* info) { + int sector_size; + int block_size; + int size_in_megs; + int spare_count; + _cleanup_free_ uint8_t *cis_buffer = NULL; + int offset; + int cis_found = 0; + + cis_buffer = malloc(SM_SECTOR_SIZE); + if (!cis_buffer) + return log_oom(); + + if (info->type != MTD_NANDFLASH) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Not marked MTD_NANDFLASH."); + + sector_size = info->writesize; + block_size = info->erasesize; + size_in_megs = info->size / (1024 * 1024); + + if (!IN_SET(sector_size, SM_SECTOR_SIZE, SM_SMALL_PAGE)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "Unexpected sector size: %i", sector_size); + + switch (size_in_megs) { + case 1: + case 2: + spare_count = 6; + break; + case 4: + spare_count = 12; + break; + default: + spare_count = 24; + break; + } + + for (offset = 0; offset < block_size * spare_count; offset += sector_size) { + (void) lseek(mtd_fd, SEEK_SET, offset); + + if (read(mtd_fd, cis_buffer, SM_SECTOR_SIZE) == SM_SECTOR_SIZE) { + cis_found = 1; + break; + } + } + + if (!cis_found) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "CIS not found"); + + if (memcmp(cis_buffer, cis_signature, sizeof(cis_signature)) != 0 && + memcmp(cis_buffer + SM_SMALL_PAGE, 
cis_signature, sizeof(cis_signature)) != 0) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "CIS signature didn't match"); + + printf("MTD_FTL=smartmedia\n"); + return 0; +} diff --git a/src/udev/net/fuzz-link-parser.c b/src/udev/net/fuzz-link-parser.c new file mode 100644 index 0000000..2833162 --- /dev/null +++ b/src/udev/net/fuzz-link-parser.c @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fs-util.h" +#include "fuzz.h" +#include "link-config.h" +#include "tmpfile-util.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(link_config_ctx_freep) LinkConfigContext *ctx = NULL; + _cleanup_(unlink_tempfilep) char filename[] = "/tmp/fuzz-link-config.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + + if (outside_size_range(size, 0, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(fmkostemp_safe(filename, "r+", &f) == 0); + if (size != 0) + assert_se(fwrite(data, size, 1, f) == 1); + + fflush(f); + assert_se(link_config_ctx_new(&ctx) >= 0); + (void) link_load_one(ctx, filename); + return 0; +} diff --git a/src/udev/net/fuzz-link-parser.options b/src/udev/net/fuzz-link-parser.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/udev/net/fuzz-link-parser.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/udev/net/link-config-gperf.gperf b/src/udev/net/link-config-gperf.gperf new file mode 100644 index 0000000..240f16e --- /dev/null +++ b/src/udev/net/link-config-gperf.gperf @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +%{ +#if __GNUC__ >= 7 +_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +#endif +#include +#include "conf-parser.h" +#include "ethtool-util.h" +#include "link-config.h" +#include "net-condition.h" +#include "netif-sriov.h" +#include "socket-util.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name 
link_config_gperf_hash +%define lookup-function-name link_config_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Match.MACAddress, config_parse_hw_addrs, 0, offsetof(LinkConfig, match.hw_addr) +Match.PermanentMACAddress, config_parse_hw_addrs, 0, offsetof(LinkConfig, match.permanent_hw_addr) +Match.OriginalName, config_parse_match_ifnames, 0, offsetof(LinkConfig, match.ifname) +Match.Path, config_parse_match_strv, 0, offsetof(LinkConfig, match.path) +Match.Driver, config_parse_match_strv, 0, offsetof(LinkConfig, match.driver) +Match.Type, config_parse_match_strv, 0, offsetof(LinkConfig, match.iftype) +Match.Kind, config_parse_match_strv, 0, offsetof(LinkConfig, match.kind) +Match.Property, config_parse_match_property, 0, offsetof(LinkConfig, match.property) +Match.Host, config_parse_net_condition, CONDITION_HOST, offsetof(LinkConfig, conditions) +Match.Virtualization, config_parse_net_condition, CONDITION_VIRTUALIZATION, offsetof(LinkConfig, conditions) +Match.KernelCommandLine, config_parse_net_condition, CONDITION_KERNEL_COMMAND_LINE, offsetof(LinkConfig, conditions) +Match.KernelVersion, config_parse_net_condition, CONDITION_KERNEL_VERSION, offsetof(LinkConfig, conditions) +Match.Credential, config_parse_net_condition, CONDITION_CREDENTIAL, offsetof(LinkConfig, conditions) +Match.Architecture, config_parse_net_condition, CONDITION_ARCHITECTURE, offsetof(LinkConfig, conditions) +Match.Firmware, config_parse_net_condition, CONDITION_FIRMWARE, offsetof(LinkConfig, conditions) +Link.Description, config_parse_string, 0, offsetof(LinkConfig, description) +Link.MACAddressPolicy, config_parse_mac_address_policy, 0, offsetof(LinkConfig, mac_address_policy) +Link.MACAddress, config_parse_hw_addr, 0, offsetof(LinkConfig, hw_addr) +Link.NamePolicy, config_parse_name_policy, 0, offsetof(LinkConfig, name_policy) +Link.Name, config_parse_ifname, 0, offsetof(LinkConfig, name) +Link.AlternativeName, config_parse_ifnames, IFNAME_VALID_ALTERNATIVE, 
offsetof(LinkConfig, alternative_names) +Link.AlternativeNamesPolicy, config_parse_alternative_names_policy, 0, offsetof(LinkConfig, alternative_names_policy) +Link.Alias, config_parse_ifalias, 0, offsetof(LinkConfig, alias) +Link.TransmitQueues, config_parse_rx_tx_queues, 0, offsetof(LinkConfig, txqueues) +Link.ReceiveQueues, config_parse_rx_tx_queues, 0, offsetof(LinkConfig, rxqueues) +Link.TransmitQueueLength, config_parse_txqueuelen, 0, offsetof(LinkConfig, txqueuelen) +Link.MTUBytes, config_parse_mtu, AF_UNSPEC, offsetof(LinkConfig, mtu) +Link.BitsPerSecond, config_parse_si_uint64, 0, offsetof(LinkConfig, speed) +Link.Duplex, config_parse_duplex, 0, offsetof(LinkConfig, duplex) +Link.AutoNegotiation, config_parse_tristate, 0, offsetof(LinkConfig, autonegotiation) +Link.WakeOnLan, config_parse_wol, 0, offsetof(LinkConfig, wol) +Link.WakeOnLanPassword, config_parse_wol_password, 0, 0 +Link.Port, config_parse_port, 0, offsetof(LinkConfig, port) +Link.ReceiveChecksumOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_RXCSUM]) +Link.TransmitChecksumOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_TXCSUM]) +Link.GenericSegmentationOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_GSO]) +Link.TCPSegmentationOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_TSO]) +Link.TCP6SegmentationOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_TSO6]) +Link.UDPSegmentationOffload, config_parse_warn_compat, DISABLED_LEGACY, 0 +Link.GenericReceiveOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_GRO]) +Link.GenericReceiveOffloadHardware, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_GRO_HW]) +Link.LargeReceiveOffload, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_LRO]) +Link.ReceiveVLANCTAGHardwareAcceleration, config_parse_tristate, 0, offsetof(LinkConfig, 
features[NET_DEV_FEAT_HW_VLAN_CTAG_RX]) +Link.TransmitVLANCTAGHardwareAcceleration, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_HW_VLAN_CTAG_TX]) +Link.ReceiveVLANCTAGFilter, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_HW_VLAN_CTAG_FILTER]) +Link.TransmitVLANSTAGHardwareAcceleration, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_HW_VLAN_STAG_TX]) +Link.NTupleFilter, config_parse_tristate, 0, offsetof(LinkConfig, features[NET_DEV_FEAT_NTUPLE]) +Link.RxChannels, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, channels.rx) +Link.TxChannels, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, channels.tx) +Link.OtherChannels, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, channels.other) +Link.CombinedChannels, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, channels.combined) +Link.Advertise, config_parse_advertise, 0, offsetof(LinkConfig, advertise) +Link.RxBufferSize, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, ring.rx) +Link.RxMiniBufferSize, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, ring.rx_mini) +Link.RxJumboBufferSize, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, ring.rx_jumbo) +Link.TxBufferSize, config_parse_ring_buffer_or_channel, 0, offsetof(LinkConfig, ring.tx) +Link.RxFlowControl, config_parse_tristate, 0, offsetof(LinkConfig, rx_flow_control) +Link.TxFlowControl, config_parse_tristate, 0, offsetof(LinkConfig, tx_flow_control) +Link.AutoNegotiationFlowControl, config_parse_tristate, 0, offsetof(LinkConfig, autoneg_flow_control) +Link.GenericSegmentOffloadMaxBytes, config_parse_iec_size, 0, offsetof(LinkConfig, gso_max_size) +Link.GenericSegmentOffloadMaxSegments, config_parse_uint32, 0, offsetof(LinkConfig, gso_max_segments) +Link.RxCoalesceSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.rx_coalesce_usecs) +Link.RxMaxCoalescedFrames, config_parse_coalesce_u32, 0, 
offsetof(LinkConfig, coalesce.rx_max_coalesced_frames) +Link.RxCoalesceIrqSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.rx_coalesce_usecs_irq) +Link.RxMaxCoalescedIrqFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.rx_max_coalesced_frames_irq) +Link.TxCoalesceSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.tx_coalesce_usecs) +Link.TxMaxCoalescedFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.tx_max_coalesced_frames) +Link.TxCoalesceIrqSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.tx_coalesce_usecs_irq) +Link.TxMaxCoalescedIrqFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.tx_max_coalesced_frames_irq) +Link.StatisticsBlockCoalesceSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.stats_block_coalesce_usecs) +Link.UseAdaptiveRxCoalesce, config_parse_tristate, 0, offsetof(LinkConfig, coalesce.use_adaptive_rx_coalesce) +Link.UseAdaptiveTxCoalesce, config_parse_tristate, 0, offsetof(LinkConfig, coalesce.use_adaptive_tx_coalesce) +Link.CoalescePacketRateLow, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.pkt_rate_low) +Link.RxCoalesceLowSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.rx_coalesce_usecs_low) +Link.RxMaxCoalescedLowFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.rx_max_coalesced_frames_low) +Link.TxCoalesceLowSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.tx_coalesce_usecs_low) +Link.TxMaxCoalescedLowFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.tx_max_coalesced_frames_low) +Link.CoalescePacketRateHigh, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.pkt_rate_high) +Link.RxCoalesceHighSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.rx_coalesce_usecs_high) +Link.RxMaxCoalescedHighFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.rx_max_coalesced_frames_high) +Link.TxCoalesceHighSec, 
config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.tx_coalesce_usecs_high) +Link.TxMaxCoalescedHighFrames, config_parse_coalesce_u32, 0, offsetof(LinkConfig, coalesce.tx_max_coalesced_frames_high) +Link.CoalescePacketRateSampleIntervalSec, config_parse_coalesce_sec, 0, offsetof(LinkConfig, coalesce.rate_sample_interval) +Link.MDI, config_parse_mdi, 0, offsetof(LinkConfig, mdi) +Link.SR-IOVVirtualFunctions, config_parse_sr_iov_num_vfs, 0, offsetof(LinkConfig, sr_iov_num_vfs) +SR-IOV.VirtualFunction, config_parse_sr_iov_uint32, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.VLANId, config_parse_sr_iov_uint32, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.QualityOfService, config_parse_sr_iov_uint32, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.VLANProtocol, config_parse_sr_iov_vlan_proto, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.MACSpoofCheck, config_parse_sr_iov_boolean, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.QueryReceiveSideScaling, config_parse_sr_iov_boolean, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.Trust, config_parse_sr_iov_boolean, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.LinkState, config_parse_sr_iov_link_state, 0, offsetof(LinkConfig, sr_iov_by_section) +SR-IOV.MACAddress, config_parse_sr_iov_mac, 0, offsetof(LinkConfig, sr_iov_by_section) diff --git a/src/udev/net/link-config.c b/src/udev/net/link-config.c new file mode 100644 index 0000000..910ec27 --- /dev/null +++ b/src/udev/net/link-config.c @@ -0,0 +1,1133 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "sd-device.h" +#include "sd-netlink.h" + +#include "alloc-util.h" +#include "arphrd-util.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "constants.h" +#include "creds-util.h" +#include "device-private.h" +#include "device-util.h" +#include "ethtool-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "link-config.h" +#include "log-link.h" +#include "memory-util.h" 
+#include "net-condition.h" +#include "netif-sriov.h" +#include "netif-util.h" +#include "netlink-util.h" +#include "parse-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "random-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "utf8.h" + +struct LinkConfigContext { + LIST_HEAD(LinkConfig, configs); + int ethtool_fd; + Hashmap *stats_by_path; +}; + +static LinkConfig* link_config_free(LinkConfig *config) { + if (!config) + return NULL; + + free(config->filename); + strv_free(config->dropins); + + net_match_clear(&config->match); + condition_free_list(config->conditions); + + free(config->description); + free(config->name_policy); + free(config->name); + strv_free(config->alternative_names); + free(config->alternative_names_policy); + free(config->alias); + free(config->wol_password_file); + erase_and_free(config->wol_password); + + ordered_hashmap_free_with_destructor(config->sr_iov_by_section, sr_iov_free); + + return mfree(config); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(LinkConfig*, link_config_free); + +static void link_configs_free(LinkConfigContext *ctx) { /* Releases the cached file stats and every loaded LinkConfig; ctx itself stays allocated. */ + if (!ctx) + return; + + ctx->stats_by_path = hashmap_free(ctx->stats_by_path); + LIST_FOREACH(configs, config, ctx->configs) + link_config_free(config); + ctx->configs = NULL; /* The entries are freed; clear the head so a later LIST_PREPEND() does not link to them. */ +} + +LinkConfigContext *link_config_ctx_free(LinkConfigContext *ctx) { + if (!ctx) + return NULL; + + safe_close(ctx->ethtool_fd); + link_configs_free(ctx); + return mfree(ctx); +} + +int link_config_ctx_new(LinkConfigContext **ret) { + _cleanup_(link_config_ctx_freep) LinkConfigContext *ctx = NULL; + + if (!ret) + return -EINVAL; + + ctx = new(LinkConfigContext, 1); + if (!ctx) + return -ENOMEM; + + *ctx = (LinkConfigContext) { + .ethtool_fd = -EBADF, + }; + + *ret = TAKE_PTR(ctx); + + return 0; +} + +static int link_parse_wol_password(LinkConfig *config, const char *str) { + _cleanup_(erase_and_freep) uint8_t *p = NULL; + int r; + 
assert(config); + assert(str); + + assert_cc(sizeof(struct ether_addr) == SOPASS_MAX); + + p = new(uint8_t, SOPASS_MAX); + if (!p) + return -ENOMEM; + + /* Reuse parse_ether_addr(), as their formats are equivalent. */ + r = parse_ether_addr(str, (struct ether_addr*) p); + if (r < 0) + return r; + + erase_and_free(config->wol_password); + config->wol_password = TAKE_PTR(p); + return 0; +} + +static int link_read_wol_password_from_file(LinkConfig *config) { + _cleanup_(erase_and_freep) char *password = NULL; + int r; + + assert(config); + + if (!config->wol_password_file) + return 0; + + r = read_full_file_full( + AT_FDCWD, config->wol_password_file, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_SECURE | READ_FULL_FILE_WARN_WORLD_READABLE | READ_FULL_FILE_CONNECT_SOCKET, + NULL, &password, NULL); + if (r < 0) + return r; + + return link_parse_wol_password(config, password); +} + +static int link_read_wol_password_from_cred(LinkConfig *config) { + _cleanup_free_ char *base = NULL, *cred_name = NULL; + _cleanup_(erase_and_freep) char *password = NULL; + int r; + + assert(config); + assert(config->filename); + + if (config->wol == UINT32_MAX) + return 0; /* WakeOnLan= is not specified. */ + if (!FLAGS_SET(config->wol, WAKE_MAGICSECURE)) + return 0; /* secureon is not specified in WakeOnLan=. */ + if (config->wol_password) + return 0; /* WakeOnLanPassword= is specified. */ + if (config->wol_password_file) + return 0; /* a file name is specified in WakeOnLanPassword=, but failed to read it. 
*/ + + r = path_extract_filename(config->filename, &base); + if (r < 0) + return r; + + cred_name = strjoin(base, ".wol.password"); + if (!cred_name) + return -ENOMEM; + + r = read_credential(cred_name, (void**) &password, NULL); + if (r == -ENOENT) + r = read_credential("wol.password", (void**) &password, NULL); + if (r < 0) + return r; + + return link_parse_wol_password(config, password); +} + +static int link_adjust_wol_options(LinkConfig *config) { + int r; + + assert(config); + + r = link_read_wol_password_from_file(config); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_warning_errno(r, "Failed to read WakeOnLan password from %s, ignoring: %m", config->wol_password_file); + + r = link_read_wol_password_from_cred(config); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_warning_errno(r, "Failed to read WakeOnLan password from credential, ignoring: %m"); + + if (config->wol != UINT32_MAX && config->wol_password) + /* Enable WAKE_MAGICSECURE flag when WakeOnLanPassword=. Note that when + * WakeOnLanPassword= is set without WakeOnLan=, then ethtool_set_wol() enables + * WAKE_MAGICSECURE flag and other flags are not changed. 
*/ + config->wol |= WAKE_MAGICSECURE; + + return 0; +} + +int link_load_one(LinkConfigContext *ctx, const char *filename) { + _cleanup_(link_config_freep) LinkConfig *config = NULL; + _cleanup_hashmap_free_ Hashmap *stats_by_path = NULL; + _cleanup_free_ char *name = NULL; + const char *dropin_dirname; + size_t i; + int r; + + assert(ctx); + assert(filename); + + r = null_or_empty_path(filename); + if (r < 0) + return log_warning_errno(r, "Failed to check if \"%s\" is empty: %m", filename); + if (r > 0) { + log_debug("Skipping empty file: %s", filename); + return 0; + } + + name = strdup(filename); + if (!name) + return log_oom(); + + config = new(LinkConfig, 1); + if (!config) + return log_oom(); + + *config = (LinkConfig) { + .filename = TAKE_PTR(name), + .mac_address_policy = MAC_ADDRESS_POLICY_NONE, + .wol = UINT32_MAX, /* UINT32_MAX means do not change WOL setting. */ + .duplex = _DUP_INVALID, + .port = _NET_DEV_PORT_INVALID, + .autonegotiation = -1, + .rx_flow_control = -1, + .tx_flow_control = -1, + .autoneg_flow_control = -1, + .txqueuelen = UINT32_MAX, + .coalesce.use_adaptive_rx_coalesce = -1, + .coalesce.use_adaptive_tx_coalesce = -1, + .mdi = ETH_TP_MDI_INVALID, + .sr_iov_num_vfs = UINT32_MAX, + }; + + for (i = 0; i < ELEMENTSOF(config->features); i++) + config->features[i] = -1; + + dropin_dirname = strjoina(basename(filename), ".d"); + r = config_parse_many( + STRV_MAKE_CONST(filename), + NETWORK_DIRS, + dropin_dirname, + /* root = */ NULL, + "Match\0" + "Link\0" + "SR-IOV\0", + config_item_perf_lookup, link_config_gperf_lookup, + CONFIG_PARSE_WARN, config, &stats_by_path, + &config->dropins); + if (r < 0) + return r; /* config_parse_many() logs internally. 
*/ + + if (ctx->stats_by_path) { + r = hashmap_move(ctx->stats_by_path, stats_by_path); + if (r < 0) + log_warning_errno(r, "Failed to save stats of '%s' and its drop-in configs, ignoring: %m", filename); + } else + ctx->stats_by_path = TAKE_PTR(stats_by_path); + + if (net_match_is_empty(&config->match) && !config->conditions) { + log_warning("%s: No valid settings found in the [Match] section, ignoring file. " + "To match all interfaces, add OriginalName=* in the [Match] section.", + filename); + return 0; + } + + if (!condition_test_list(config->conditions, environ, NULL, NULL, NULL)) { + log_debug("%s: Conditions do not match the system environment, skipping.", filename); + return 0; + } + + if (IN_SET(config->mac_address_policy, MAC_ADDRESS_POLICY_PERSISTENT, MAC_ADDRESS_POLICY_RANDOM) && + config->hw_addr.length > 0) + log_warning("%s: MACAddress= in [Link] section will be ignored when MACAddressPolicy= " + "is set to \"persistent\" or \"random\".", + filename); + + r = link_adjust_wol_options(config); + if (r < 0) + return r; /* link_adjust_wol_options() logs internally. */ + + r = sr_iov_drop_invalid_sections(config->sr_iov_num_vfs, config->sr_iov_by_section); + if (r < 0) + return r; /* sr_iov_drop_invalid_sections() logs internally. 
*/ + + log_debug("Parsed configuration file \"%s\"", filename); + + LIST_PREPEND(configs, ctx->configs, TAKE_PTR(config)); + return 0; +} + +static int device_unsigned_attribute(sd_device *device, const char *attr, unsigned *type) { + const char *s; + int r; + + r = sd_device_get_sysattr_value(device, attr, &s); + if (r < 0) + return log_device_debug_errno(device, r, "Failed to query %s: %m", attr); + + r = safe_atou(s, type); + if (r < 0) + return log_device_warning_errno(device, r, "Failed to parse %s \"%s\": %m", attr, s); + + log_device_debug(device, "Device has %s=%u", attr, *type); + return 0; +} + +int link_config_load(LinkConfigContext *ctx) { + _cleanup_strv_free_ char **files = NULL; + int r; + + assert(ctx); + + link_configs_free(ctx); + + r = conf_files_list_strv(&files, ".link", NULL, 0, NETWORK_DIRS); + if (r < 0) + return log_error_errno(r, "failed to enumerate link files: %m"); + + STRV_FOREACH_BACKWARDS(f, files) + (void) link_load_one(ctx, *f); + + return 0; +} + +bool link_config_should_reload(LinkConfigContext *ctx) { + _cleanup_hashmap_free_ Hashmap *stats_by_path = NULL; + int r; + + assert(ctx); + + r = config_get_stats_by_path(".link", NULL, 0, NETWORK_DIRS, /* check_dropins = */ true, &stats_by_path); + if (r < 0) { + log_warning_errno(r, "Failed to get stats of .link files, ignoring: %m"); + return true; + } + + return !stats_by_path_equal(ctx->stats_by_path, stats_by_path); +} + +Link *link_free(Link *link) { + if (!link) + return NULL; + + sd_device_unref(link->device); + free(link->kind); + strv_free(link->altnames); + return mfree(link); +} + +int link_new(LinkConfigContext *ctx, sd_netlink **rtnl, sd_device *device, Link **ret) { + _cleanup_(link_freep) Link *link = NULL; + int r; + + assert(ctx); + assert(rtnl); + assert(device); + assert(ret); + + link = new(Link, 1); + if (!link) + return -ENOMEM; + + *link = (Link) { + .device = sd_device_ref(device), + }; + + r = sd_device_get_sysname(device, &link->ifname); + if (r < 0) + return 
r; + + r = sd_device_get_ifindex(device, &link->ifindex); + if (r < 0) + return r; + + r = sd_device_get_action(device, &link->action); + if (r < 0) + return r; + + r = device_unsigned_attribute(device, "name_assign_type", &link->name_assign_type); + if (r < 0) + log_link_debug_errno(link, r, "Failed to get \"name_assign_type\" attribute, ignoring: %m"); + + r = device_unsigned_attribute(device, "addr_assign_type", &link->addr_assign_type); + if (r < 0) + log_link_debug_errno(link, r, "Failed to get \"addr_assign_type\" attribute, ignoring: %m"); + + r = rtnl_get_link_info(rtnl, link->ifindex, &link->iftype, &link->flags, + &link->kind, &link->hw_addr, &link->permanent_hw_addr); + if (r < 0) + return r; + + if (link->hw_addr.length > 0 && link->permanent_hw_addr.length == 0) { + r = ethtool_get_permanent_hw_addr(&ctx->ethtool_fd, link->ifname, &link->permanent_hw_addr); + if (r < 0) + log_link_debug_errno(link, r, "Failed to get permanent hardware address, ignoring: %m"); + } + + r = sd_device_get_property_value(link->device, "ID_NET_DRIVER", &link->driver); + if (r < 0 && r != -ENOENT) + log_link_debug_errno(link, r, "Failed to get driver, ignoring: %m"); + + *ret = TAKE_PTR(link); + return 0; +} + +int link_get_config(LinkConfigContext *ctx, Link *link) { + int r; + + assert(ctx); + assert(link); + + /* Do not configure loopback interfaces by .link files. 
*/ + if (link->flags & IFF_LOOPBACK) + return -ENOENT; + + LIST_FOREACH(configs, config, ctx->configs) { + r = net_match_config( + &config->match, + link->device, + &link->hw_addr, + &link->permanent_hw_addr, + link->driver, + link->iftype, + link->kind, + link->ifname, + /* alternative_names = */ NULL, + /* wlan_iftype = */ 0, + /* ssid = */ NULL, + /* bssid = */ NULL); + if (r < 0) + return r; + if (r == 0) + continue; + + if (config->match.ifname && !strv_contains(config->match.ifname, "*") && link->name_assign_type == NET_NAME_ENUM) + log_link_warning(link, "Config file %s is applied to device based on potentially unpredictable interface name.", + config->filename); + else + log_link_debug(link, "Config file %s is applied", config->filename); + + link->config = config; + return 0; + } + + return -ENOENT; +} + +static int link_apply_ethtool_settings(Link *link, int *ethtool_fd) { + LinkConfig *config; + const char *name; + int r; + + assert(link); + assert(link->config); + assert(ethtool_fd); + + config = link->config; + name = link->ifname; + + r = ethtool_set_glinksettings(ethtool_fd, name, + config->autonegotiation, config->advertise, + config->speed, config->duplex, config->port, config->mdi); + if (r < 0) { + if (config->autonegotiation >= 0) + log_link_warning_errno(link, r, "Could not %s auto negotiation, ignoring: %m", + enable_disable(config->autonegotiation)); + + if (!eqzero(config->advertise)) + log_link_warning_errno(link, r, "Could not set advertise mode, ignoring: %m"); + + if (config->speed > 0) + log_link_warning_errno(link, r, "Could not set speed to %"PRIu64"Mbps, ignoring: %m", + DIV_ROUND_UP(config->speed, 1000000)); + + if (config->duplex >= 0) + log_link_warning_errno(link, r, "Could not set duplex to %s, ignoring: %m", + duplex_to_string(config->duplex)); + + if (config->port >= 0) + log_link_warning_errno(link, r, "Could not set port to '%s', ignoring: %m", + port_to_string(config->port)); + + if (config->mdi != ETH_TP_MDI_INVALID) + 
log_link_warning_errno(link, r, "Could not set MDI-X to '%s', ignoring: %m", + mdi_to_string(config->mdi)); + } + + r = ethtool_set_wol(ethtool_fd, name, config->wol, config->wol_password); + if (r < 0) { + _cleanup_free_ char *str = NULL; + + (void) wol_options_to_string_alloc(config->wol, &str); + log_link_warning_errno(link, r, "Could not set WakeOnLan%s%s, ignoring: %m", + isempty(str) ? "" : " to ", strempty(str)); + } + + r = ethtool_set_features(ethtool_fd, name, config->features); + if (r < 0) + log_link_warning_errno(link, r, "Could not set offload features, ignoring: %m"); + + r = ethtool_set_channels(ethtool_fd, name, &config->channels); + if (r < 0) + log_link_warning_errno(link, r, "Could not set channels, ignoring: %m"); + + r = ethtool_set_nic_buffer_size(ethtool_fd, name, &config->ring); + if (r < 0) + log_link_warning_errno(link, r, "Could not set ring buffer, ignoring: %m"); + + r = ethtool_set_flow_control(ethtool_fd, name, config->rx_flow_control, config->tx_flow_control, config->autoneg_flow_control); + if (r < 0) + log_link_warning_errno(link, r, "Could not set flow control, ignoring: %m"); + + r = ethtool_set_nic_coalesce_settings(ethtool_fd, name, &config->coalesce); + if (r < 0) + log_link_warning_errno(link, r, "Could not set coalesce settings, ignoring: %m"); + + return 0; +} + +static bool hw_addr_is_valid(Link *link, const struct hw_addr_data *hw_addr) { + assert(link); + assert(hw_addr); + + switch (link->iftype) { + case ARPHRD_ETHER: + /* Refuse all zero and all 0xFF. */ + assert(hw_addr->length == ETH_ALEN); + return !ether_addr_is_null(&hw_addr->ether) && !ether_addr_is_broadcast(&hw_addr->ether); + + case ARPHRD_INFINIBAND: + /* The last 8 bytes cannot be zero. 
*/ + assert(hw_addr->length == INFINIBAND_ALEN); + return !memeqzero(hw_addr->bytes + INFINIBAND_ALEN - 8, 8); + + default: + assert_not_reached(); + } +} + +static int link_generate_new_hw_addr(Link *link, struct hw_addr_data *ret) { + struct hw_addr_data hw_addr = HW_ADDR_NULL; + bool is_static = false; + uint8_t *p; + size_t len; + int r; + + assert(link); + assert(link->config); + assert(link->device); + assert(ret); + + if (link->hw_addr.length == 0) + goto finalize; + + if (link->config->mac_address_policy == MAC_ADDRESS_POLICY_NONE) { + log_link_debug(link, "Using static MAC address."); + hw_addr = link->config->hw_addr; + is_static = true; + goto finalize; + } + + if (!IN_SET(link->iftype, ARPHRD_ETHER, ARPHRD_INFINIBAND)) + goto finalize; + + switch (link->addr_assign_type) { + case NET_ADDR_SET: + log_link_debug(link, "MAC address on the device already set by userspace."); + goto finalize; + case NET_ADDR_STOLEN: + log_link_debug(link, "MAC address on the device already set based on another device."); + goto finalize; + case NET_ADDR_RANDOM: + case NET_ADDR_PERM: + break; + default: + log_link_warning(link, "Unknown addr_assign_type %u, ignoring", link->addr_assign_type); + goto finalize; + } + + if ((link->config->mac_address_policy == MAC_ADDRESS_POLICY_RANDOM) == (link->addr_assign_type == NET_ADDR_RANDOM)) { + log_link_debug(link, "MAC address on the device already matches policy \"%s\".", + mac_address_policy_to_string(link->config->mac_address_policy)); + goto finalize; + } + + hw_addr = (struct hw_addr_data) { + .length = arphrd_to_hw_addr_len(link->iftype), + }; + + switch (link->iftype) { + case ARPHRD_ETHER: + p = hw_addr.bytes; + len = hw_addr.length; + break; + case ARPHRD_INFINIBAND: + p = hw_addr.bytes + INFINIBAND_ALEN - 8; + len = 8; + break; + default: + assert_not_reached(); + } + + if (link->config->mac_address_policy == MAC_ADDRESS_POLICY_RANDOM) + /* We require genuine randomness here, since we want to make sure we won't collide with 
other + * systems booting up at the very same time. */ + for (;;) { + random_bytes(p, len); + if (hw_addr_is_valid(link, &hw_addr)) + break; + } + + else { + uint64_t result; + + r = net_get_unique_predictable_data(link->device, + naming_scheme_has(NAMING_STABLE_VIRTUAL_MACS), + &result); + if (r < 0) + return log_link_warning_errno(link, r, "Could not generate persistent MAC address: %m"); + + assert(len <= sizeof(result)); + memcpy(p, &result, len); + if (!hw_addr_is_valid(link, &hw_addr)) + return log_link_warning_errno(link, SYNTHETIC_ERRNO(EINVAL), + "Could not generate valid persistent MAC address: %m"); + } + +finalize: + + r = net_verify_hardware_address(link->ifname, is_static, link->iftype, &link->hw_addr, &hw_addr); + if (r < 0) + return r; + + if (hw_addr_equal(&link->hw_addr, &hw_addr)) { + *ret = HW_ADDR_NULL; + return 0; + } + + if (hw_addr.length > 0) + log_link_debug(link, "Applying %s MAC address: %s", + link->config->mac_address_policy == MAC_ADDRESS_POLICY_NONE ? "static" : + mac_address_policy_to_string(link->config->mac_address_policy), + HW_ADDR_TO_STR(&hw_addr)); + + *ret = hw_addr; + return 0; +} + +static int link_apply_rtnl_settings(Link *link, sd_netlink **rtnl) { + struct hw_addr_data hw_addr = {}; + LinkConfig *config; + int r; + + assert(link); + assert(link->config); + assert(rtnl); + + config = link->config; + + (void) link_generate_new_hw_addr(link, &hw_addr); + + r = rtnl_set_link_properties(rtnl, link->ifindex, config->alias, &hw_addr, + config->txqueues, config->rxqueues, config->txqueuelen, + config->mtu, config->gso_max_size, config->gso_max_segments); + if (r < 0) + log_link_warning_errno(link, r, + "Could not set Alias=, MACAddress=/MACAddressPolicy=, " + "TransmitQueues=, ReceiveQueues=, TransmitQueueLength=, MTUBytes=, " + "GenericSegmentOffloadMaxBytes= or GenericSegmentOffloadMaxSegments=, " + "ignoring: %m"); + + return 0; +} + +static bool enable_name_policy(void) { + static int cached = -1; + bool b; + int r; + + if 
(cached >= 0) + return cached; + + r = proc_cmdline_get_bool("net.ifnames", /* flags = */ 0, &b); + if (r < 0) + log_warning_errno(r, "Failed to parse net.ifnames= kernel command line option, ignoring: %m"); + if (r <= 0) + return (cached = true); + + if (!b) + log_info("Network interface NamePolicy= disabled on kernel command line."); + + return (cached = b); +} + +static int link_generate_new_name(Link *link) { + LinkConfig *config; + sd_device *device; + + assert(link); + assert(link->config); + assert(link->device); + + config = link->config; + device = link->device; + + if (link->action != SD_DEVICE_ADD) { + log_link_debug(link, "Skipping to apply Name= and NamePolicy= on '%s' uevent.", + device_action_to_string(link->action)); + goto no_rename; + } + + if (IN_SET(link->name_assign_type, NET_NAME_USER, NET_NAME_RENAMED) && + !naming_scheme_has(NAMING_ALLOW_RERENAMES)) { + log_link_debug(link, "Device already has a name given by userspace, not renaming."); + goto no_rename; + } + + if (enable_name_policy() && config->name_policy) + for (NamePolicy *policy = config->name_policy; *policy != _NAMEPOLICY_INVALID; policy++) { + const char *new_name = NULL; + + switch (*policy) { + case NAMEPOLICY_KERNEL: + if (link->name_assign_type != NET_NAME_PREDICTABLE) + continue; + + /* The kernel claims to have given a predictable name, keep it. 
*/ + log_link_debug(link, "Policy *%s*: keeping predictable kernel name", + name_policy_to_string(*policy)); + goto no_rename; + case NAMEPOLICY_KEEP: + if (!IN_SET(link->name_assign_type, NET_NAME_USER, NET_NAME_RENAMED)) + continue; + + log_link_debug(link, "Policy *%s*: keeping existing userspace name", + name_policy_to_string(*policy)); + goto no_rename; + case NAMEPOLICY_DATABASE: + (void) sd_device_get_property_value(device, "ID_NET_NAME_FROM_DATABASE", &new_name); + break; + case NAMEPOLICY_ONBOARD: + (void) sd_device_get_property_value(device, "ID_NET_NAME_ONBOARD", &new_name); + break; + case NAMEPOLICY_SLOT: + (void) sd_device_get_property_value(device, "ID_NET_NAME_SLOT", &new_name); + break; + case NAMEPOLICY_PATH: + (void) sd_device_get_property_value(device, "ID_NET_NAME_PATH", &new_name); + break; + case NAMEPOLICY_MAC: + (void) sd_device_get_property_value(device, "ID_NET_NAME_MAC", &new_name); + break; + default: + assert_not_reached(); + } + if (ifname_valid(new_name)) { + log_link_debug(link, "Policy *%s* yields \"%s\".", name_policy_to_string(*policy), new_name); + link->new_name = new_name; + return 0; + } + } + + if (link->config->name) { + log_link_debug(link, "Policies didn't yield a name, using specified Name=%s.", link->config->name); + link->new_name = link->config->name; + return 0; + } + + log_link_debug(link, "Policies didn't yield a name and Name= is not given, not renaming."); +no_rename: + link->new_name = link->ifname; + return 0; +} + +static int link_generate_alternative_names(Link *link) { + _cleanup_strv_free_ char **altnames = NULL; + LinkConfig *config; + sd_device *device; + int r; + + assert(link); + config = ASSERT_PTR(link->config); + device = ASSERT_PTR(link->device); + assert(!link->altnames); + + if (link->action != SD_DEVICE_ADD) { + log_link_debug(link, "Skipping to apply AlternativeNames= and AlternativeNamesPolicy= on '%s' uevent.", + device_action_to_string(link->action)); + return 0; + } + + if 
(config->alternative_names) { + altnames = strv_copy(config->alternative_names); + if (!altnames) + return log_oom(); + } + + if (config->alternative_names_policy) + for (NamePolicy *p = config->alternative_names_policy; *p != _NAMEPOLICY_INVALID; p++) { + const char *n = NULL; + + switch (*p) { + case NAMEPOLICY_DATABASE: + (void) sd_device_get_property_value(device, "ID_NET_NAME_FROM_DATABASE", &n); + break; + case NAMEPOLICY_ONBOARD: + (void) sd_device_get_property_value(device, "ID_NET_NAME_ONBOARD", &n); + break; + case NAMEPOLICY_SLOT: + (void) sd_device_get_property_value(device, "ID_NET_NAME_SLOT", &n); + break; + case NAMEPOLICY_PATH: + (void) sd_device_get_property_value(device, "ID_NET_NAME_PATH", &n); + break; + case NAMEPOLICY_MAC: + (void) sd_device_get_property_value(device, "ID_NET_NAME_MAC", &n); + break; + default: + assert_not_reached(); + } + if (ifname_valid_full(n, IFNAME_VALID_ALTERNATIVE)) { + r = strv_extend(&altnames, n); + if (r < 0) + return log_oom(); + } + } + + link->altnames = TAKE_PTR(altnames); + return 0; +} + +static int sr_iov_configure(Link *link, sd_netlink **rtnl, SRIOV *sr_iov) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(link); + assert(rtnl); + assert(link->ifindex > 0); + + if (!*rtnl) { + r = sd_netlink_open(rtnl); + if (r < 0) + return r; + } + + r = sd_rtnl_message_new_link(*rtnl, &req, RTM_SETLINK, link->ifindex); + if (r < 0) + return r; + + r = sr_iov_set_netlink_message(sr_iov, req); + if (r < 0) + return r; + + r = sd_netlink_call(*rtnl, req, 0, NULL); + if (r < 0) + return r; + + return 0; +} + +static int link_apply_sr_iov_config(Link *link, sd_netlink **rtnl) { /* Sets the SR-IOV VF count, then configures every [SR-IOV] section whose VF exists. Best effort: failures are logged and ignored, so this always returns 0. */ + SRIOV *sr_iov; + uint32_t n; + int r; + + assert(link); + assert(link->config); + assert(link->device); + + r = sr_iov_set_num_vfs(link->device, link->config->sr_iov_num_vfs, link->config->sr_iov_by_section); + if (r < 0) + log_link_warning_errno(link, r, "Failed to set the number of SR-IOV virtual 
functions, ignoring: %m"); + + if (ordered_hashmap_isempty(link->config->sr_iov_by_section)) + return 0; + + r = sr_iov_get_num_vfs(link->device, &n); + if (r < 0) { + log_link_warning_errno(link, r, "Failed to get the number of SR-IOV virtual functions, ignoring [SR-IOV] sections: %m"); + return 0; + } + if (n == 0) { /* No error code is passed on this path, so the message must not use %m. */ + log_link_warning(link, "No SR-IOV virtual function exists, ignoring [SR-IOV] sections."); + return 0; + } + + ORDERED_HASHMAP_FOREACH(sr_iov, link->config->sr_iov_by_section) { + if (sr_iov->vf >= n) { + log_link_warning(link, "SR-IOV virtual function %"PRIu32" does not exist, ignoring.", sr_iov->vf); + continue; + } + + r = sr_iov_configure(link, rtnl, sr_iov); + if (r < 0) + log_link_warning_errno(link, r, + "Failed to configure SR-IOV virtual function %"PRIu32", ignoring: %m", + sr_iov->vf); + } + + return 0; +} + +int link_apply_config(LinkConfigContext *ctx, sd_netlink **rtnl, Link *link) { + int r; + + assert(ctx); + assert(rtnl); + assert(link); + + if (!IN_SET(link->action, SD_DEVICE_ADD, SD_DEVICE_BIND, SD_DEVICE_MOVE)) { + log_link_debug(link, "Skipping to apply .link settings on '%s' uevent.", + device_action_to_string(link->action)); + + link->new_name = link->ifname; + return 0; + } + + r = link_apply_ethtool_settings(link, &ctx->ethtool_fd); + if (r < 0) + return r; + + r = link_apply_rtnl_settings(link, rtnl); + if (r < 0) + return r; + + r = link_generate_new_name(link); + if (r < 0) + return r; + + r = link_generate_alternative_names(link); + if (r < 0) + return r; + + r = link_apply_sr_iov_config(link, rtnl); + if (r < 0) + return r; + + return 0; +} + +int config_parse_ifalias( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char **s = ASSERT_PTR(data); + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + *s = mfree(*s); + return 0; + } + 
if (!ascii_is_valid(rvalue)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Interface alias is not ASCII clean, ignoring assignment: %s", rvalue); + return 0; + } + + if (strlen(rvalue) >= IFALIASZ) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "Interface alias is too long, ignoring assignment: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(s, rvalue); +} + +int config_parse_rx_tx_queues( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t k, *v = data; + int r; + + if (isempty(rvalue)) { + *v = 0; + return 0; + } + + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s=, ignoring assignment: %s.", lvalue, rvalue); + return 0; + } + if (k == 0 || k > 4096) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid %s=, ignoring assignment: %s.", lvalue, rvalue); + return 0; + } + + *v = k; + return 0; +} + +int config_parse_txqueuelen( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint32_t k, *v = data; + int r; + + if (isempty(rvalue)) { + *v = UINT32_MAX; + return 0; + } + + r = safe_atou32(rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s=, ignoring assignment: %s.", lvalue, rvalue); + return 0; + } + if (k == UINT32_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid %s=, ignoring assignment: %s.", lvalue, rvalue); + return 0; + } + + *v = k; + return 0; +} + +int config_parse_wol_password( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + 
+ LinkConfig *config = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { + config->wol_password = erase_and_free(config->wol_password); + config->wol_password_file = mfree(config->wol_password_file); + return 0; + } + + if (path_is_absolute(rvalue) && path_is_safe(rvalue)) { + config->wol_password = erase_and_free(config->wol_password); + return free_and_strdup_warn(&config->wol_password_file, rvalue); + } + + warn_file_is_world_accessible(filename, NULL, unit, line); + + r = link_parse_wol_password(config, rvalue); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse %s=, ignoring assignment: %s.", lvalue, rvalue); + return 0; + } + + config->wol_password_file = mfree(config->wol_password_file); + return 0; +} + +static const char* const mac_address_policy_table[_MAC_ADDRESS_POLICY_MAX] = { + [MAC_ADDRESS_POLICY_PERSISTENT] = "persistent", + [MAC_ADDRESS_POLICY_RANDOM] = "random", + [MAC_ADDRESS_POLICY_NONE] = "none", +}; + +DEFINE_STRING_TABLE_LOOKUP(mac_address_policy, MACAddressPolicy); +DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT( + config_parse_mac_address_policy, + mac_address_policy, + MACAddressPolicy, + MAC_ADDRESS_POLICY_NONE, + "Failed to parse MAC address policy"); + +DEFINE_CONFIG_PARSE_ENUMV(config_parse_name_policy, name_policy, NamePolicy, + _NAMEPOLICY_INVALID, + "Failed to parse interface name policy"); + +DEFINE_CONFIG_PARSE_ENUMV(config_parse_alternative_names_policy, alternative_names_policy, NamePolicy, + _NAMEPOLICY_INVALID, + "Failed to parse alternative names policy"); diff --git a/src/udev/net/link-config.h b/src/udev/net/link-config.h new file mode 100644 index 0000000..bab9d12 --- /dev/null +++ b/src/udev/net/link-config.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-device.h" +#include "sd-netlink.h" + +#include "condition.h" +#include "conf-parser.h" 
+#include "ethtool-util.h" +#include "hashmap.h" +#include "list.h" +#include "net-condition.h" +#include "netif-naming-scheme.h" + +typedef struct LinkConfigContext LinkConfigContext; +typedef struct LinkConfig LinkConfig; + +typedef enum MACAddressPolicy { + MAC_ADDRESS_POLICY_PERSISTENT, + MAC_ADDRESS_POLICY_RANDOM, + MAC_ADDRESS_POLICY_NONE, + _MAC_ADDRESS_POLICY_MAX, + _MAC_ADDRESS_POLICY_INVALID = -EINVAL, +} MACAddressPolicy; + +typedef struct Link { + int ifindex; + const char *ifname; + const char *new_name; + char **altnames; + + LinkConfig *config; + sd_device *device; + sd_device_action_t action; + + char *kind; + const char *driver; + uint16_t iftype; + uint32_t flags; + struct hw_addr_data hw_addr; + struct hw_addr_data permanent_hw_addr; + unsigned name_assign_type; + unsigned addr_assign_type; +} Link; + +struct LinkConfig { + char *filename; + char **dropins; + + NetMatch match; + LIST_HEAD(Condition, conditions); + + char *description; + struct hw_addr_data hw_addr; + MACAddressPolicy mac_address_policy; + NamePolicy *name_policy; + NamePolicy *alternative_names_policy; + char *name; + char **alternative_names; + char *alias; + uint32_t txqueues; + uint32_t rxqueues; + uint32_t txqueuelen; + uint32_t mtu; + uint32_t gso_max_segments; + size_t gso_max_size; + uint64_t speed; + Duplex duplex; + int autonegotiation; + uint32_t advertise[N_ADVERTISE]; + uint32_t wol; + char *wol_password_file; + uint8_t *wol_password; + NetDevPort port; + int features[_NET_DEV_FEAT_MAX]; + netdev_channels channels; + netdev_ring_param ring; + int rx_flow_control; + int tx_flow_control; + int autoneg_flow_control; + netdev_coalesce_param coalesce; + uint8_t mdi; + + uint32_t sr_iov_num_vfs; + OrderedHashmap *sr_iov_by_section; + + LIST_FIELDS(LinkConfig, configs); +}; + +int link_config_ctx_new(LinkConfigContext **ret); +LinkConfigContext* link_config_ctx_free(LinkConfigContext *ctx); +DEFINE_TRIVIAL_CLEANUP_FUNC(LinkConfigContext*, link_config_ctx_free); + +int 
link_load_one(LinkConfigContext *ctx, const char *filename); +int link_config_load(LinkConfigContext *ctx); +bool link_config_should_reload(LinkConfigContext *ctx); + +int link_new(LinkConfigContext *ctx, sd_netlink **rtnl, sd_device *device, Link **ret); +Link *link_free(Link *link); +DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_free); + +int link_get_config(LinkConfigContext *ctx, Link *link); +int link_apply_config(LinkConfigContext *ctx, sd_netlink **rtnl, Link *link); + +const char *mac_address_policy_to_string(MACAddressPolicy p) _const_; +MACAddressPolicy mac_address_policy_from_string(const char *p) _pure_; + +/* gperf lookup function */ +const struct ConfigPerfItem* link_config_gperf_lookup(const char *key, GPERF_LEN_TYPE length); + +CONFIG_PARSER_PROTOTYPE(config_parse_ifalias); +CONFIG_PARSER_PROTOTYPE(config_parse_rx_tx_queues); +CONFIG_PARSER_PROTOTYPE(config_parse_txqueuelen); +CONFIG_PARSER_PROTOTYPE(config_parse_wol_password); +CONFIG_PARSER_PROTOTYPE(config_parse_mac_address_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_name_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_alternative_names_policy); diff --git a/src/udev/net/test-link-config-tables.c b/src/udev/net/test-link-config-tables.c new file mode 100644 index 0000000..a433232 --- /dev/null +++ b/src/udev/net/test-link-config-tables.c @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "link-config.h" +#include "test-tables.h" +#include "tests.h" + +int main(int argc, char **argv) { + test_setup_logging(LOG_DEBUG); + + test_table(mac_address_policy, MAC_ADDRESS_POLICY); + + return EXIT_SUCCESS; +} diff --git a/src/udev/scsi_id/README b/src/udev/scsi_id/README new file mode 100644 index 0000000..9cfe739 --- /dev/null +++ b/src/udev/scsi_id/README @@ -0,0 +1,4 @@ +scsi_id - generate a SCSI unique identifier for a given SCSI device + +Please send questions, comments or patches to or +. 
diff --git a/src/udev/scsi_id/scsi.h b/src/udev/scsi_id/scsi.h new file mode 100644 index 0000000..ee3e401 --- /dev/null +++ b/src/udev/scsi_id/scsi.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +/* + * scsi.h + * + * General scsi and linux scsi specific defines and structs. + * + * Copyright (C) IBM Corp. 2003 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 of the License. + */ + +#include + +struct scsi_ioctl_command { + unsigned inlen; /* excluding scsi command length */ + unsigned outlen; + unsigned char data[1]; + /* on input, scsi command starts here then opt. data */ +}; + +/* + * Default 5 second timeout + */ +#define DEF_TIMEOUT 5000 + +#define SENSE_BUFF_LEN 32 + +/* + * The request buffer size passed to the SCSI INQUIRY commands, use 254, + * as this is a nice value for some devices, especially some of the usb + * mass storage devices. + */ +#define SCSI_INQ_BUFF_LEN 254 + +/* + * SCSI INQUIRY vendor and model (really product) lengths. + */ +#define VENDOR_LENGTH 8 +#define MODEL_LENGTH 16 + +#define INQUIRY_CMD 0x12 +#define INQUIRY_CMDLEN 6 + +/* + * INQUIRY VPD page 0x83 identifier descriptor related values. Reference the + * SCSI Primary Commands specification for details. + */ + +/* + * id type values of id descriptors. These are assumed to fit in 4 bits. + */ +#define SCSI_ID_VENDOR_SPECIFIC 0 +#define SCSI_ID_T10_VENDOR 1 +#define SCSI_ID_EUI_64 2 +#define SCSI_ID_NAA 3 +#define SCSI_ID_RELPORT 4 +#define SCSI_ID_TGTGROUP 5 +#define SCSI_ID_LUNGROUP 6 +#define SCSI_ID_MD5 7 +#define SCSI_ID_NAME 8 + +/* + * Supported NAA values. These fit in 4 bits, so the "don't care" value + * cannot conflict with real values. + */ +#define SCSI_ID_NAA_DONT_CARE 0xff +#define SCSI_ID_NAA_IEEE_REG 0x05 +#define SCSI_ID_NAA_IEEE_REG_EXTENDED 0x06 + +/* + * Supported Code Set values. 
+ */ +#define SCSI_ID_BINARY 1 +#define SCSI_ID_ASCII 2 + +struct scsi_id_search_values { + u_char id_type; + u_char naa_type; + u_char code_set; +}; + +/* + * Following are the "true" SCSI status codes. Linux has traditionally + * used a 1 bit right and masked version of these. So now CHECK_CONDITION + * and friends (in ) are deprecated. + */ +#define SCSI_CHECK_CONDITION 0x02 +#define SCSI_CONDITION_MET 0x04 +#define SCSI_BUSY 0x08 +#define SCSI_IMMEDIATE 0x10 +#define SCSI_IMMEDIATE_CONDITION_MET 0x14 +#define SCSI_RESERVATION_CONFLICT 0x18 +#define SCSI_COMMAND_TERMINATED 0x22 +#define SCSI_TASK_SET_FULL 0x28 +#define SCSI_ACA_ACTIVE 0x30 +#define SCSI_TASK_ABORTED 0x40 diff --git a/src/udev/scsi_id/scsi_id.c b/src/udev/scsi_id/scsi_id.c new file mode 100644 index 0000000..6308c52 --- /dev/null +++ b/src/udev/scsi_id/scsi_id.c @@ -0,0 +1,515 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © IBM Corp. 2003 + * Copyright © SUSE Linux Products GmbH, 2006 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "build.h" +#include "device-nodes.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "parse-util.h" +#include "scsi_id.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "udev-util.h" + +static const struct option options[] = { + { "device", required_argument, NULL, 'd' }, + { "config", required_argument, NULL, 'f' }, + { "page", required_argument, NULL, 'p' }, + { "denylisted", no_argument, NULL, 'b' }, + { "allowlisted", no_argument, NULL, 'g' }, + { "blacklisted", no_argument, NULL, 'b' }, /* backward compat */ + { "whitelisted", no_argument, NULL, 'g' }, /* backward compat */ + { "replace-whitespace", no_argument, NULL, 'u' }, + { "sg-version", required_argument, NULL, 's' }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, /* don't advertise -V */ + { 
/* Writes the type name for the numeric device type type_num into the buffer to (of size len).
 * Numbers without an entry in the table below are reported as "generic". */
static void set_type(unsigned type_num, char *to, size_t len) {
        static const struct {
                unsigned num;
                const char *name;
        } known_types[] = {
                { 0x0, "disk"    },
                { 0x1, "tape"    },
                { 0x4, "optical" },
                { 0x5, "cd"      },
                { 0x7, "optical" },
                { 0xe, "disk"    },
                { 0xf, "optical" },
        };
        const char *type = "generic";

        for (size_t i = 0; i < sizeof(known_types) / sizeof(known_types[0]); i++)
                if (known_types[i].num == type_num) {
                        type = known_types[i].name;
                        break;
                }

        strscpy(to, len, type);
}
+ */ +static int get_file_options(const char *vendor, const char *model, + int *argc, char ***newargv) { + _cleanup_free_ char *vendor_in = NULL, *model_in = NULL, *options_in = NULL; /* read in from file */ + _cleanup_strv_free_ char **options_argv = NULL; + _cleanup_fclose_ FILE *f = NULL; + int lineno, r; + + f = fopen(config_file, "re"); + if (!f) { + if (errno == ENOENT) + return 1; + else { + log_error_errno(errno, "can't open %s: %m", config_file); + return -1; + } + } + + *newargv = NULL; + lineno = 0; + for (;;) { + _cleanup_free_ char *buffer = NULL, *key = NULL, *value = NULL; + const char *buf; + + vendor_in = model_in = options_in = NULL; + + r = read_line(f, MAX_BUFFER_LEN, &buffer); + if (r < 0) + return log_error_errno(r, "read_line() on line %d of %s failed: %m", lineno, config_file); + if (r == 0) + break; + buf = buffer; + lineno++; + + while (isspace(*buf)) + buf++; + + /* blank or all whitespace line */ + if (*buf == '\0') + continue; + + /* comment line */ + if (*buf == '#') + continue; + + r = extract_many_words(&buf, "=\",\n", 0, &key, &value, NULL); + if (r < 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Error parsing config file line %d '%s'", lineno, buffer); + + if (strcaseeq(key, "VENDOR")) { + vendor_in = TAKE_PTR(value); + + key = mfree(key); + r = extract_many_words(&buf, "=\",\n", 0, &key, &value, NULL); + if (r < 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Error parsing config file line %d '%s'", lineno, buffer); + + if (strcaseeq(key, "MODEL")) { + model_in = TAKE_PTR(value); + + key = mfree(key); + r = extract_many_words(&buf, "=\",\n", 0, &key, &value, NULL); + if (r < 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Error parsing config file line %d '%s'", lineno, buffer); + } + } + + if (strcaseeq(key, "OPTIONS")) + options_in = TAKE_PTR(value); + + /* + * Only allow: [vendor=foo[,model=bar]]options=stuff + */ + if (!options_in || (!vendor_in && model_in)) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Error parsing config file line %d '%s'", lineno, buffer); + if (!vendor) { + if (!vendor_in) + break; + } else if (vendor_in && + startswith(vendor, vendor_in) && + (!model_in || startswith(model, model_in))) { + /* + * Matched vendor and optionally model. + * + * Note: a short vendor_in or model_in can + * give a partial match (that is FOO + * matches FOOBAR). + */ + break; + } + + vendor_in = mfree(vendor_in); + model_in = mfree(model_in); + options_in = mfree(options_in); + + } + + if (vendor_in == NULL && model_in == NULL && options_in == NULL) + return 1; /* No matches */ + + /* + * Something matched. Allocate newargv, and store + * values found in options_in. + */ + options_argv = strv_split(options_in, " \t"); + if (!options_argv) + return log_oom(); + r = strv_prepend(&options_argv, ""); /* getopt skips over argv[0] */ + if (r < 0) + return r; + *newargv = TAKE_PTR(options_argv); + *argc = strv_length(*newargv); + + return 0; +} + +static void help(void) { + printf("Usage: %s [OPTION...] DEVICE\n\n" + "SCSI device identification.\n\n" + " -h --help Print this message\n" + " --version Print version of the program\n\n" + " -d --device= Device node for SG_IO commands\n" + " -f --config= Location of config file\n" + " -p --page=0x80|0x83|pre-spc3-83 SCSI page (0x80, 0x83, pre-spc3-83)\n" + " -s --sg-version=3|4 Use SGv3 or SGv4\n" + " -b --denylisted Treat device as denylisted\n" + " -g --allowlisted Treat device as allowlisted\n" + " -u --replace-whitespace Replace all whitespace by underscores\n" + " -v --verbose Verbose logging\n" + " -x --export Print values as environment keys\n", + program_invocation_short_name); +} + +static int set_options(int argc, char **argv, + char *maj_min_dev) { + int option; + + /* + * optind is a global extern used by getopt. Since we can call + * set_options twice (once for command line, and once for config + * file) we have to reset this back to 1. 
+ */ + optind = 1; + while ((option = getopt_long(argc, argv, "d:f:gp:uvVxhbs:", options, NULL)) >= 0) + switch (option) { + case 'b': + all_good = false; + break; + + case 'd': + dev_specified = true; + strscpy(maj_min_dev, MAX_PATH_LEN, optarg); + break; + + case 'f': + strscpy(config_file, MAX_PATH_LEN, optarg); + break; + + case 'g': + all_good = true; + break; + + case 'h': + help(); + exit(EXIT_SUCCESS); + + case 'p': + if (streq(optarg, "0x80")) + default_page_code = PAGE_80; + else if (streq(optarg, "0x83")) + default_page_code = PAGE_83; + else if (streq(optarg, "pre-spc3-83")) + default_page_code = PAGE_83_PRE_SPC3; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown page code '%s'", + optarg); + break; + + case 's': + sg_version = atoi(optarg); + if (sg_version < 3 || sg_version > 4) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown SG version '%s'", + optarg); + break; + + case 'u': + reformat_serial = true; + break; + + case 'v': + log_set_target(LOG_TARGET_CONSOLE); + log_set_max_level(LOG_DEBUG); + log_open(); + break; + + case 'V': + version(); + exit(EXIT_SUCCESS); + + case 'x': + export = true; + break; + + case '?': + return -1; + + default: + assert_not_reached(); + } + + if (optind < argc && !dev_specified) { + dev_specified = true; + strscpy(maj_min_dev, MAX_PATH_LEN, argv[optind]); + } + + return 0; +} + +static int per_dev_options(struct scsi_id_device *dev_scsi, int *good_bad, int *page_code) { + _cleanup_strv_free_ char **newargv = NULL; + int retval; + int newargc; + int option; + + *good_bad = all_good; + *page_code = default_page_code; + + retval = get_file_options(vendor_str, model_str, &newargc, &newargv); + + optind = 1; /* reset this global extern */ + while (retval == 0) { + option = getopt_long(newargc, newargv, "bgp:", options, NULL); + if (option == -1) + break; + + switch (option) { + case 'b': + *good_bad = 0; + break; + + case 'g': + *good_bad = 1; + break; + + case 'p': + if (streq(optarg, "0x80")) 
{ + *page_code = PAGE_80; + } else if (streq(optarg, "0x83")) { + *page_code = PAGE_83; + } else if (streq(optarg, "pre-spc3-83")) { + *page_code = PAGE_83_PRE_SPC3; + } else { + log_error("Unknown page code '%s'", optarg); + retval = -1; + } + break; + + default: + log_error("Unknown or bad option '%c' (0x%x)", option, (unsigned) option); + retval = -1; + break; + } + } + + return retval; +} + +static int set_inq_values(struct scsi_id_device *dev_scsi, const char *path) { + int retval; + + dev_scsi->use_sg = sg_version; + + retval = scsi_std_inquiry(dev_scsi, path); + if (retval) + return retval; + + encode_devnode_name(dev_scsi->vendor, vendor_enc_str, sizeof(vendor_enc_str)); + encode_devnode_name(dev_scsi->model, model_enc_str, sizeof(model_enc_str)); + + udev_replace_whitespace(dev_scsi->vendor, vendor_str, sizeof(vendor_str)-1); + udev_replace_chars(vendor_str, NULL); + udev_replace_whitespace(dev_scsi->model, model_str, sizeof(model_str)-1); + udev_replace_chars(model_str, NULL); + set_type(dev_scsi->type, type_str, sizeof(type_str)); + udev_replace_whitespace(dev_scsi->revision, revision_str, sizeof(revision_str)-1); + udev_replace_chars(revision_str, NULL); + return 0; +} + +/* + * scsi_id: try to get an id, if one is found, printf it to stdout. + * returns a value passed to exit() - 0 if printed an id, else 1. 
+ */ +static int scsi_id(char *maj_min_dev) { + struct scsi_id_device dev_scsi = {}; + int good_dev; + int page_code; + int retval = 0; + + if (set_inq_values(&dev_scsi, maj_min_dev) < 0) { + retval = 1; + goto out; + } + + /* get per device (vendor + model) options from the config file */ + per_dev_options(&dev_scsi, &good_dev, &page_code); + if (!good_dev) { + retval = 1; + goto out; + } + + /* read serial number from mode pages (no values for optical drives) */ + scsi_get_serial(&dev_scsi, maj_min_dev, page_code, MAX_SERIAL_LEN); + + if (export) { + char serial_str[MAX_SERIAL_LEN]; + + printf("ID_SCSI=1\n"); + printf("ID_VENDOR=%s\n", vendor_str); + printf("ID_VENDOR_ENC=%s\n", vendor_enc_str); + printf("ID_MODEL=%s\n", model_str); + printf("ID_MODEL_ENC=%s\n", model_enc_str); + printf("ID_REVISION=%s\n", revision_str); + printf("ID_TYPE=%s\n", type_str); + if (dev_scsi.serial[0] != '\0') { + udev_replace_whitespace(dev_scsi.serial, serial_str, sizeof(serial_str)-1); + udev_replace_chars(serial_str, NULL); + printf("ID_SERIAL=%s\n", serial_str); + udev_replace_whitespace(dev_scsi.serial_short, serial_str, sizeof(serial_str)-1); + udev_replace_chars(serial_str, NULL); + printf("ID_SERIAL_SHORT=%s\n", serial_str); + } + if (dev_scsi.wwn[0] != '\0') { + printf("ID_WWN=0x%s\n", dev_scsi.wwn); + if (dev_scsi.wwn_vendor_extension[0] != '\0') { + printf("ID_WWN_VENDOR_EXTENSION=0x%s\n", dev_scsi.wwn_vendor_extension); + printf("ID_WWN_WITH_EXTENSION=0x%s%s\n", dev_scsi.wwn, dev_scsi.wwn_vendor_extension); + } else + printf("ID_WWN_WITH_EXTENSION=0x%s\n", dev_scsi.wwn); + } + if (dev_scsi.tgpt_group[0] != '\0') + printf("ID_TARGET_PORT=%s\n", dev_scsi.tgpt_group); + if (dev_scsi.unit_serial_number[0] != '\0') + printf("ID_SCSI_SERIAL=%s\n", dev_scsi.unit_serial_number); + goto out; + } + + if (dev_scsi.serial[0] == '\0') { + retval = 1; + goto out; + } + + if (reformat_serial) { + char serial_str[MAX_SERIAL_LEN]; + + udev_replace_whitespace(dev_scsi.serial, serial_str, 
sizeof(serial_str)-1); + udev_replace_chars(serial_str, NULL); + printf("%s\n", serial_str); + goto out; + } + + printf("%s\n", dev_scsi.serial); +out: + return retval; +} + +int main(int argc, char **argv) { + _cleanup_strv_free_ char **newargv = NULL; + int retval = 0; + char maj_min_dev[MAX_PATH_LEN]; + int newargc; + + log_set_target(LOG_TARGET_AUTO); + udev_parse_config(); + log_parse_environment(); + log_open(); + + /* + * Get config file options. + */ + retval = get_file_options(NULL, NULL, &newargc, &newargv); + if (retval < 0) { + retval = 1; + goto exit; + } + if (retval == 0) { + assert(newargv); + + if (set_options(newargc, newargv, maj_min_dev) < 0) { + retval = 2; + goto exit; + } + } + + /* + * Get command line options (overriding any config file settings). + */ + if (set_options(argc, argv, maj_min_dev) < 0) + exit(EXIT_FAILURE); + + if (!dev_specified) { + log_error("No device specified."); + retval = 1; + goto exit; + } + + retval = scsi_id(maj_min_dev); + +exit: + log_close(); + return retval; +} diff --git a/src/udev/scsi_id/scsi_id.h b/src/udev/scsi_id/scsi_id.h new file mode 100644 index 0000000..9ab3341 --- /dev/null +++ b/src/udev/scsi_id/scsi_id.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +/* + * Copyright © IBM Corp. 2003 + */ + +#define MAX_PATH_LEN 512 + +/* + * MAX_ATTR_LEN: maximum length of the result of reading a sysfs + * attribute. + */ +#define MAX_ATTR_LEN 256 + +/* + * MAX_SERIAL_LEN: the maximum length of the serial number, including + * added prefixes such as vendor and product (model) strings. + */ +#define MAX_SERIAL_LEN 256 + +/* + * MAX_BUFFER_LEN: maximum buffer size and line length used while reading + * the config file. 
+ */ +#define MAX_BUFFER_LEN 256 + +struct scsi_id_device { + char vendor[9]; + char model[17]; + char revision[5]; + char kernel[64]; + char serial[MAX_SERIAL_LEN]; + char serial_short[MAX_SERIAL_LEN]; + unsigned type; + int use_sg; + + /* Always from page 0x80 e.g. 'B3G1P8500RWT' - may not be unique */ + char unit_serial_number[MAX_SERIAL_LEN]; + + /* NULs if not set - otherwise hex encoding using lower-case e.g. '50014ee0016eb572' */ + char wwn[17]; + + /* NULs if not set - otherwise hex encoding using lower-case e.g. '0xe00000d80000' */ + char wwn_vendor_extension[17]; + + /* NULs if not set - otherwise decimal number */ + char tgpt_group[8]; +}; + +int scsi_std_inquiry(struct scsi_id_device *dev_scsi, const char *devname); +int scsi_get_serial(struct scsi_id_device *dev_scsi, const char *devname, + int page_code, int len); + +/* + * Page code values. + */ +enum page_code { + PAGE_83_PRE_SPC3 = -0x83, + PAGE_UNSPECIFIED = 0x00, + PAGE_80 = 0x80, + PAGE_83 = 0x83, +}; diff --git a/src/udev/scsi_id/scsi_serial.c b/src/udev/scsi_id/scsi_serial.c new file mode 100644 index 0000000..aed6082 --- /dev/null +++ b/src/udev/scsi_id/scsi_serial.c @@ -0,0 +1,892 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © IBM Corp. 2003 + * + * Author: Patrick Mansfield + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "devnum-util.h" +#include "memory-util.h" +#include "random-util.h" +#include "scsi.h" +#include "scsi_id.h" +#include "string-util.h" + +/* + * A priority based list of id, naa, and binary/ascii for the identifier + * descriptor in VPD page 0x83. + * + * Brute force search for a match starting with the first value in the + * following id_search_list. This is not a performance issue, since there + * is normally one or some small number of descriptors. 
+ */ +static const struct scsi_id_search_values id_search_list[] = { + { SCSI_ID_TGTGROUP, SCSI_ID_NAA_DONT_CARE, SCSI_ID_BINARY }, + { SCSI_ID_NAA, SCSI_ID_NAA_IEEE_REG_EXTENDED, SCSI_ID_BINARY }, + { SCSI_ID_NAA, SCSI_ID_NAA_IEEE_REG_EXTENDED, SCSI_ID_ASCII }, + { SCSI_ID_NAA, SCSI_ID_NAA_IEEE_REG, SCSI_ID_BINARY }, + { SCSI_ID_NAA, SCSI_ID_NAA_IEEE_REG, SCSI_ID_ASCII }, + /* + * Devices already exist using NAA values that are now marked + * reserved. These should not conflict with other values, or it is + * a bug in the device. As long as we find the IEEE extended one + * first, we really don't care what other ones are used. Using + * don't care here means that a device that returns multiple + * non-IEEE descriptors in a random order will get different + * names. + */ + { SCSI_ID_NAA, SCSI_ID_NAA_DONT_CARE, SCSI_ID_BINARY }, + { SCSI_ID_NAA, SCSI_ID_NAA_DONT_CARE, SCSI_ID_ASCII }, + { SCSI_ID_EUI_64, SCSI_ID_NAA_DONT_CARE, SCSI_ID_BINARY }, + { SCSI_ID_EUI_64, SCSI_ID_NAA_DONT_CARE, SCSI_ID_ASCII }, + { SCSI_ID_T10_VENDOR, SCSI_ID_NAA_DONT_CARE, SCSI_ID_BINARY }, + { SCSI_ID_T10_VENDOR, SCSI_ID_NAA_DONT_CARE, SCSI_ID_ASCII }, + { SCSI_ID_VENDOR_SPECIFIC, SCSI_ID_NAA_DONT_CARE, SCSI_ID_BINARY }, + { SCSI_ID_VENDOR_SPECIFIC, SCSI_ID_NAA_DONT_CARE, SCSI_ID_ASCII }, +}; + +static const char hex_str[]="0123456789abcdef"; + +/* + * Values returned in the result/status, only the ones used by the code + * are used here. 
+ */ + +#define DID_NO_CONNECT 0x01 /* Unable to connect before timeout */ +#define DID_BUS_BUSY 0x02 /* Bus remain busy until timeout */ +#define DID_TIME_OUT 0x03 /* Timed out for some other reason */ +#define DID_TRANSPORT_DISRUPTED 0x0e /* Transport disrupted and should retry */ +#define DRIVER_TIMEOUT 0x06 +#define DRIVER_SENSE 0x08 /* Sense_buffer has been set */ + +/* The following "category" function returns one of the following */ +#define SG_ERR_CAT_CLEAN 0 /* No errors or other information */ +#define SG_ERR_CAT_MEDIA_CHANGED 1 /* interpreted from sense buffer */ +#define SG_ERR_CAT_RESET 2 /* interpreted from sense buffer */ +#define SG_ERR_CAT_TIMEOUT 3 +#define SG_ERR_CAT_RECOVERED 4 /* Successful command after recovered err */ +#define SG_ERR_CAT_NOTSUPPORTED 5 /* Illegal / unsupported command */ +#define SG_ERR_CAT_RETRY 6 /* Command should be retried */ +#define SG_ERR_CAT_SENSE 98 /* Something else in the sense buffer */ +#define SG_ERR_CAT_OTHER 99 /* Some other error/warning */ + +static int do_scsi_page80_inquiry(struct scsi_id_device *dev_scsi, int fd, + char *serial, char *serial_short, int max_len); + +static int sg_err_category_new(int scsi_status, int msg_status, int + host_status, int driver_status, const + unsigned char *sense_buffer, int sb_len) { + scsi_status &= 0x7e; + + /* + * XXX change to return only two values - failed or OK. + */ + + if (!scsi_status && !host_status && !driver_status) + return SG_ERR_CAT_CLEAN; + + if (IN_SET(scsi_status, SCSI_CHECK_CONDITION, SCSI_COMMAND_TERMINATED) || + (driver_status & 0xf) == DRIVER_SENSE) { + if (sense_buffer && (sb_len > 2)) { + int sense_key; + unsigned char asc; + + if (sense_buffer[0] & 0x2) { + sense_key = sense_buffer[1] & 0xf; + asc = sense_buffer[2]; + } else { + sense_key = sense_buffer[2] & 0xf; + asc = (sb_len > 12) ? 
sense_buffer[12] : 0; + } + + if (sense_key == RECOVERED_ERROR) + return SG_ERR_CAT_RECOVERED; + else if (sense_key == UNIT_ATTENTION) { + if (0x28 == asc) + return SG_ERR_CAT_MEDIA_CHANGED; + if (0x29 == asc) + return SG_ERR_CAT_RESET; + } else if (sense_key == ILLEGAL_REQUEST) + return SG_ERR_CAT_NOTSUPPORTED; + } + return SG_ERR_CAT_SENSE; + } + if (host_status) { + if (IN_SET(host_status, DID_NO_CONNECT, DID_BUS_BUSY, DID_TIME_OUT)) + return SG_ERR_CAT_TIMEOUT; + if (host_status == DID_TRANSPORT_DISRUPTED) + return SG_ERR_CAT_RETRY; + } + if (driver_status) { + if (driver_status == DRIVER_TIMEOUT) + return SG_ERR_CAT_TIMEOUT; + } + return SG_ERR_CAT_OTHER; +} + +static int sg_err_category3(struct sg_io_hdr *hp) { + return sg_err_category_new(hp->status, hp->msg_status, + hp->host_status, hp->driver_status, + hp->sbp, hp->sb_len_wr); +} + +static int sg_err_category4(struct sg_io_v4 *hp) { + return sg_err_category_new(hp->device_status, 0, + hp->transport_status, hp->driver_status, + (unsigned char *)(uintptr_t)hp->response, + hp->response_len); +} + +static int scsi_dump_sense(struct scsi_id_device *dev_scsi, + unsigned char *sense_buffer, int sb_len) { + int s; + unsigned code, sense_class, sense_key, asc, ascq; + + /* + * Figure out and print the sense key, asc and ascq. + * + * If you want to suppress these for a particular drive model, add + * a deny list entry in the scsi_id config file. + * + * XXX We probably need to: lookup the sense/asc/ascq in a retry + * table, and if found return 1 (after dumping the sense, asc, and + * ascq). So, if/when we get something like a power on/reset, + * we'll retry the command. + */ + + if (sb_len < 1) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: sense buffer empty", + dev_scsi->kernel); + + sense_class = (sense_buffer[0] >> 4) & 0x07; + code = sense_buffer[0] & 0xf; + + if (sense_class == 7) { + /* + * extended sense data. 
+ */ + s = sense_buffer[7] + 8; + if (sb_len < s) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: sense buffer too small %d bytes, %d bytes too short", + dev_scsi->kernel, sb_len, + s - sb_len); + + if (IN_SET(code, 0x0, 0x1)) { + sense_key = sense_buffer[2] & 0xf; + if (s < 14) + /* + * Possible? + */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: sense result too small %d bytes", + dev_scsi->kernel, s); + + asc = sense_buffer[12]; + ascq = sense_buffer[13]; + } else if (IN_SET(code, 0x2, 0x3)) { + sense_key = sense_buffer[1] & 0xf; + asc = sense_buffer[2]; + ascq = sense_buffer[3]; + } else + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: invalid sense code 0x%x", + dev_scsi->kernel, code); + + log_debug("%s: sense key 0x%x ASC 0x%x ASCQ 0x%x", + dev_scsi->kernel, sense_key, asc, ascq); + } else { + if (sb_len < 4) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: sense buffer too small %d bytes, %d bytes too short", + dev_scsi->kernel, sb_len, + 4 - sb_len); + + if (sense_buffer[0] < 15) + log_debug("%s: old sense key: 0x%x", dev_scsi->kernel, sense_buffer[0] & 0x0fu); + else + log_debug("%s: sense = %2x %2x", + dev_scsi->kernel, sense_buffer[0], sense_buffer[2]); + log_debug("%s: non-extended sense class %u code 0x%0x", + dev_scsi->kernel, sense_class, code); + + } + + return -1; +} + +static int scsi_dump(struct scsi_id_device *dev_scsi, struct sg_io_hdr *io) { + if (!io->status && !io->host_status && !io->msg_status && + !io->driver_status) + /* + * Impossible, should not be called. 
+ */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: called with no error", + __func__); + + log_debug("%s: sg_io failed status 0x%x 0x%x 0x%x 0x%x", + dev_scsi->kernel, io->driver_status, io->host_status, io->msg_status, io->status); + if (io->status == SCSI_CHECK_CONDITION) + return scsi_dump_sense(dev_scsi, io->sbp, io->sb_len_wr); + else + return -1; +} + +static int scsi_dump_v4(struct scsi_id_device *dev_scsi, struct sg_io_v4 *io) { + if (!io->device_status && !io->transport_status && + !io->driver_status) + /* + * Impossible, should not be called. + */ + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: called with no error", + __func__); + + log_debug("%s: sg_io failed status 0x%x 0x%x 0x%x", + dev_scsi->kernel, io->driver_status, io->transport_status, io->device_status); + if (io->device_status == SCSI_CHECK_CONDITION) + return scsi_dump_sense(dev_scsi, (unsigned char *)(uintptr_t)io->response, + io->response_len); + else + return -1; +} + +static int scsi_inquiry(struct scsi_id_device *dev_scsi, int fd, + unsigned char evpd, unsigned char page, + unsigned char *buf, unsigned buflen) { + unsigned char inq_cmd[INQUIRY_CMDLEN] = + { INQUIRY_CMD, evpd, page, 0, buflen, 0 }; + unsigned char sense[SENSE_BUFF_LEN]; + void *io_buf; + struct sg_io_v4 io_v4; + struct sg_io_hdr io_hdr; + int retry = 3; /* rather random */ + int retval; + + if (buflen > SCSI_INQ_BUFF_LEN) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "buflen %u too long", buflen); + +resend: + if (dev_scsi->use_sg == 4) { + memzero(&io_v4, sizeof(struct sg_io_v4)); + io_v4.guard = 'Q'; + io_v4.protocol = BSG_PROTOCOL_SCSI; + io_v4.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD; + io_v4.request_len = sizeof(inq_cmd); + io_v4.request = (uintptr_t)inq_cmd; + io_v4.max_response_len = sizeof(sense); + io_v4.response = (uintptr_t)sense; + io_v4.din_xfer_len = buflen; + io_v4.din_xferp = (uintptr_t)buf; + io_buf = (void *)&io_v4; + } else { + memzero(&io_hdr, sizeof(struct sg_io_hdr)); + 
io_hdr.interface_id = 'S'; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.dxfer_len = buflen; + io_hdr.dxferp = buf; + io_hdr.cmdp = inq_cmd; + io_hdr.sbp = sense; + io_hdr.timeout = DEF_TIMEOUT; + io_buf = (void *)&io_hdr; + } + + retval = ioctl(fd, SG_IO, io_buf); + if (retval < 0) { + if (IN_SET(errno, EINVAL, ENOSYS) && dev_scsi->use_sg == 4) { + dev_scsi->use_sg = 3; + goto resend; + } + log_debug_errno(errno, "%s: ioctl failed: %m", dev_scsi->kernel); + goto error; + } + + if (dev_scsi->use_sg == 4) + retval = sg_err_category4(io_buf); + else + retval = sg_err_category3(io_buf); + + switch (retval) { + case SG_ERR_CAT_NOTSUPPORTED: + buf[1] = 0; + _fallthrough_; + case SG_ERR_CAT_CLEAN: + case SG_ERR_CAT_RECOVERED: + retval = 0; + break; + case SG_ERR_CAT_RETRY: + break; + + default: + if (dev_scsi->use_sg == 4) + retval = scsi_dump_v4(dev_scsi, io_buf); + else + retval = scsi_dump(dev_scsi, io_buf); + } + + if (!retval) { + retval = buflen; + } else if (retval > 0) { + if (--retry > 0) + goto resend; + retval = -1; + } + +error: + if (retval < 0) + log_debug("%s: Unable to get INQUIRY vpd %d page 0x%x.", + dev_scsi->kernel, evpd, page); + + return retval; +} + +/* Get list of supported EVPD pages */ +static int do_scsi_page0_inquiry(struct scsi_id_device *dev_scsi, int fd, + unsigned char *buffer, unsigned len) { + int retval; + + memzero(buffer, len); + retval = scsi_inquiry(dev_scsi, fd, 1, 0x0, buffer, len); + if (retval < 0) + return 1; + + if (buffer[1] != 0) { + log_debug("%s: page 0 not available.", dev_scsi->kernel); + return 1; + } + if (buffer[3] > len) { + log_debug("%s: page 0 buffer too long %d", dev_scsi->kernel, buffer[3]); + return 1; + } + + /* + * Following check is based on code once included in the 2.5.x + * kernel. + * + * Some ill behaved devices return the standard inquiry here + * rather than the evpd data, snoop the data to verify. 
+ */ + if (buffer[3] > MODEL_LENGTH) { + /* + * If the vendor id appears in the page assume the page is + * invalid. + */ + if (strneq((char*) buffer + VENDOR_LENGTH, dev_scsi->vendor, VENDOR_LENGTH)) { + log_debug("%s: invalid page0 data", dev_scsi->kernel); + return 1; + } + } + return 0; +} + +static int append_vendor_model( + const struct scsi_id_device *dev_scsi, + char buf[static VENDOR_LENGTH + MODEL_LENGTH]) { + + assert(dev_scsi); + assert(buf); + + if (strnlen(dev_scsi->vendor, VENDOR_LENGTH) != VENDOR_LENGTH) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: bad vendor string \"%s\"", + dev_scsi->kernel, dev_scsi->vendor); + if (strnlen(dev_scsi->model, MODEL_LENGTH) != MODEL_LENGTH) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: bad model string \"%s\"", + dev_scsi->kernel, dev_scsi->model); + memcpy(buf, dev_scsi->vendor, VENDOR_LENGTH); + memcpy(buf + VENDOR_LENGTH, dev_scsi->model, MODEL_LENGTH); + return VENDOR_LENGTH + MODEL_LENGTH; +} + +/* + * check_fill_0x83_id - check the page 0x83 id, if OK allocate and fill + * serial number. + */ +static int check_fill_0x83_id(struct scsi_id_device *dev_scsi, + unsigned char *page_83, + const struct scsi_id_search_values + *id_search, char *serial, char *serial_short, + int max_len, char *wwn, + char *wwn_vendor_extension, char *tgpt_group) { + int i, j, s, len; + + /* + * ASSOCIATION must be with the device (value 0) + * or with the target port for SCSI_ID_TGTPORT + */ + if ((page_83[1] & 0x30) == 0x10) { + if (id_search->id_type != SCSI_ID_TGTGROUP) + return 1; + } else if ((page_83[1] & 0x30) != 0) + return 1; + + if ((page_83[1] & 0x0f) != id_search->id_type) + return 1; + + /* + * Possibly check NAA sub-type. + */ + if ((id_search->naa_type != SCSI_ID_NAA_DONT_CARE) && + (id_search->naa_type != (page_83[4] & 0xf0) >> 4)) + return 1; + + /* + * Check for matching code set - ASCII or BINARY. 
+ */ + if ((page_83[0] & 0x0f) != id_search->code_set) + return 1; + + /* + * page_83[3]: identifier length + */ + len = page_83[3]; + if ((page_83[0] & 0x0f) != SCSI_ID_ASCII) + /* + * If not ASCII, use two bytes for each binary value. + */ + len *= 2; + + /* + * Add one byte for the NUL termination, and one for the id_type. + */ + len += 2; + if (id_search->id_type == SCSI_ID_VENDOR_SPECIFIC) + len += VENDOR_LENGTH + MODEL_LENGTH; + + if (max_len < len) { + log_debug("%s: length %d too short - need %d", + dev_scsi->kernel, max_len, len); + return 1; + } + + if (id_search->id_type == SCSI_ID_TGTGROUP && tgpt_group != NULL) { + unsigned group; + + group = ((unsigned)page_83[6] << 8) | page_83[7]; + sprintf(tgpt_group,"%x", group); + return 1; + } + + serial[0] = hex_str[id_search->id_type]; + + /* + * For SCSI_ID_VENDOR_SPECIFIC prepend the vendor and model before + * the id since it is not unique across all vendors and models, + * this differs from SCSI_ID_T10_VENDOR, where the vendor is + * included in the identifier. + */ + if (id_search->id_type == SCSI_ID_VENDOR_SPECIFIC) + if (append_vendor_model(dev_scsi, serial + 1) < 0) + return 1; + + i = 4; /* offset to the start of the identifier */ + s = j = strlen(serial); + if ((page_83[0] & 0x0f) == SCSI_ID_ASCII) { + /* + * ASCII descriptor. + */ + while (i < (4 + page_83[3])) + serial[j++] = page_83[i++]; + } else { + /* + * Binary descriptor, convert to ASCII, using two bytes of + * ASCII for each byte in the page_83. 
+ */ + while (i < (4 + page_83[3])) { + serial[j++] = hex_str[(page_83[i] & 0xf0) >> 4]; + serial[j++] = hex_str[page_83[i] & 0x0f]; + i++; + } + } + + strcpy(serial_short, serial + s); + + if (id_search->id_type == SCSI_ID_NAA && wwn != NULL) { + strncpy(wwn, serial + s, 16); + if (wwn_vendor_extension) + strncpy(wwn_vendor_extension, serial + s + 16, 16); + } + + return 0; +} + +/* Extract the raw binary from VPD 0x83 pre-SPC devices */ +static int check_fill_0x83_prespc3(struct scsi_id_device *dev_scsi, + unsigned char *page_83, + const struct scsi_id_search_values + *id_search, char *serial, char *serial_short, int max_len) { + int i, j; + + serial[0] = hex_str[SCSI_ID_NAA]; + /* serial has been memset to zero before */ + j = strlen(serial); /* j = 1; */ + + for (i = 0; (i < page_83[3]) && (j < max_len-3); ++i) { + serial[j++] = hex_str[(page_83[4+i] & 0xf0) >> 4]; + serial[j++] = hex_str[ page_83[4+i] & 0x0f]; + } + serial[max_len-1] = 0; + strncpy(serial_short, serial, max_len-1); + return 0; +} + +/* Get device identification VPD page */ +static int do_scsi_page83_inquiry(struct scsi_id_device *dev_scsi, int fd, + char *serial, char *serial_short, int len, + char *unit_serial_number, char *wwn, + char *wwn_vendor_extension, char *tgpt_group) { + int retval; + unsigned id_ind, j; + unsigned char page_83[SCSI_INQ_BUFF_LEN]; + + /* also pick up the page 80 serial number */ + do_scsi_page80_inquiry(dev_scsi, fd, NULL, unit_serial_number, MAX_SERIAL_LEN); + + memzero(page_83, SCSI_INQ_BUFF_LEN); + retval = scsi_inquiry(dev_scsi, fd, 1, PAGE_83, page_83, + SCSI_INQ_BUFF_LEN); + if (retval < 0) + return 1; + + if (page_83[1] != PAGE_83) { + log_debug("%s: Invalid page 0x83", dev_scsi->kernel); + return 1; + } + + /* + * XXX Some devices (IBM 3542) return all spaces for an identifier if + * the LUN is not actually configured. This leads to identifiers of + * the form: "1 ". 
+ */ + + /* + * Model 4, 5, and (some) model 6 EMC Symmetrix devices return + * a page 83 reply according to SCSI-2 format instead of SPC-2/3. + * + * The SCSI-2 page 83 format returns an IEEE WWN in binary + * encoded hexi-decimal in the 16 bytes following the initial + * 4-byte page 83 reply header. + * + * Both the SPC-2 and SPC-3 formats return an IEEE WWN as part + * of an Identification descriptor. The 3rd byte of the first + * Identification descriptor is a reserved (BSZ) byte field. + * + * Reference the 7th byte of the page 83 reply to determine + * whether the reply is compliant with SCSI-2 or SPC-2/3 + * specifications. A zero value in the 7th byte indicates + * an SPC-2/3 conformant reply, (i.e., the reserved field of the + * first Identification descriptor). This byte will be non-zero + * for a SCSI-2 conformant page 83 reply from these EMC + * Symmetrix models since the 7th byte of the reply corresponds + * to the 4th and 5th nibbles of the 6-byte OUI for EMC, that is, + * 0x006048. + */ + + if (page_83[6] != 0) + return check_fill_0x83_prespc3(dev_scsi, page_83, id_search_list, + serial, serial_short, len); + + /* + * Search for a match in the prioritized id_search_list - since WWN ids + * come first we can pick up the WWN in check_fill_0x83_id(). + */ + for (id_ind = 0; + id_ind < sizeof(id_search_list)/sizeof(id_search_list[0]); + id_ind++) { + /* + * Examine each descriptor returned. There is normally only + * one or a small number of descriptors. + */ + for (j = 4; j <= ((unsigned)page_83[2] << 8) + (unsigned)page_83[3] + 3; j += page_83[j + 3] + 4) { + retval = check_fill_0x83_id(dev_scsi, page_83 + j, + id_search_list + id_ind, + serial, serial_short, len, + wwn, wwn_vendor_extension, + tgpt_group); + if (!retval) + return retval; + else if (retval < 0) + return retval; + } + } + return 1; +} + +/* + * Get device identification VPD page for older SCSI-2 device which is not + * compliant with either SPC-2 or SPC-3 format. 
+ * + * Return the hard coded error code value 2 if the page 83 reply is not + * conformant to the SCSI-2 format. + */ +static int do_scsi_page83_prespc3_inquiry(struct scsi_id_device *dev_scsi, int fd, + char *serial, char *serial_short, int len) { + int retval; + int i, j; + unsigned char page_83[SCSI_INQ_BUFF_LEN]; + + memzero(page_83, SCSI_INQ_BUFF_LEN); + retval = scsi_inquiry(dev_scsi, fd, 1, PAGE_83, page_83, SCSI_INQ_BUFF_LEN); + if (retval < 0) + return 1; + + if (page_83[1] != PAGE_83) { + log_debug("%s: Invalid page 0x83", dev_scsi->kernel); + return 1; + } + /* + * Model 4, 5, and (some) model 6 EMC Symmetrix devices return + * a page 83 reply according to SCSI-2 format instead of SPC-2/3. + * + * The SCSI-2 page 83 format returns an IEEE WWN in binary + * encoded hexi-decimal in the 16 bytes following the initial + * 4-byte page 83 reply header. + * + * Both the SPC-2 and SPC-3 formats return an IEEE WWN as part + * of an Identification descriptor. The 3rd byte of the first + * Identification descriptor is a reserved (BSZ) byte field. + * + * Reference the 7th byte of the page 83 reply to determine + * whether the reply is compliant with SCSI-2 or SPC-2/3 + * specifications. A zero value in the 7th byte indicates + * an SPC-2/3 conformant reply, (i.e., the reserved field of the + * first Identification descriptor). This byte will be non-zero + * for a SCSI-2 conformant page 83 reply from these EMC + * Symmetrix models since the 7th byte of the reply corresponds + * to the 4th and 5th nibbles of the 6-byte OUI for EMC, that is, + * 0x006048. + */ + if (page_83[6] == 0) + return 2; + + serial[0] = hex_str[SCSI_ID_NAA]; + /* + * The first four bytes contain data, not a descriptor. + */ + i = 4; + j = strlen(serial); + /* + * Binary descriptor, convert to ASCII, + * using two bytes of ASCII for each byte + * in the page_83. 
+ */ + while (i < (page_83[3]+4)) { + serial[j++] = hex_str[(page_83[i] & 0xf0) >> 4]; + serial[j++] = hex_str[page_83[i] & 0x0f]; + i++; + } + return 0; +} + +/* Get unit serial number VPD page */ +static int do_scsi_page80_inquiry(struct scsi_id_device *dev_scsi, int fd, + char *serial, char *serial_short, int max_len) { + int retval; + int ser_ind; + int i; + int len; + unsigned char buf[SCSI_INQ_BUFF_LEN]; + + memzero(buf, SCSI_INQ_BUFF_LEN); + retval = scsi_inquiry(dev_scsi, fd, 1, PAGE_80, buf, SCSI_INQ_BUFF_LEN); + if (retval < 0) + return retval; + + if (buf[1] != PAGE_80) { + log_debug("%s: Invalid page 0x80", dev_scsi->kernel); + return 1; + } + + len = 1 + VENDOR_LENGTH + MODEL_LENGTH + buf[3]; + if (max_len < len) { + log_debug("%s: length %d too short - need %d", + dev_scsi->kernel, max_len, len); + return 1; + } + /* + * Prepend 'S' to avoid unlikely collision with page 0x83 vendor + * specific type where we prepend '0' + vendor + model. + */ + len = buf[3]; + if (serial) { + serial[0] = 'S'; + ser_ind = append_vendor_model(dev_scsi, serial + 1); + if (ser_ind < 0) + return 1; + ser_ind++; /* for the leading 'S' */ + for (i = 4; i < len + 4; i++, ser_ind++) + serial[ser_ind] = buf[i]; + } + if (serial_short) { + memcpy(serial_short, buf + 4, len); + serial_short[len] = '\0'; + } + return 0; +} + +int scsi_std_inquiry(struct scsi_id_device *dev_scsi, const char *devname) { + int fd; + unsigned char buf[SCSI_INQ_BUFF_LEN]; + struct stat statbuf; + int err = 0; + + fd = open(devname, O_RDONLY | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + if (fd < 0) { + log_debug_errno(errno, "scsi_id: cannot open %s: %m", devname); + return 1; + } + + if (fstat(fd, &statbuf) < 0) { + log_debug_errno(errno, "scsi_id: cannot stat %s: %m", devname); + err = 2; + goto out; + } + format_devnum(statbuf.st_rdev, dev_scsi->kernel); + + memzero(buf, SCSI_INQ_BUFF_LEN); + err = scsi_inquiry(dev_scsi, fd, 0, 0, buf, SCSI_INQ_BUFF_LEN); + if (err < 0) + goto out; + + err = 0; + 
memcpy(dev_scsi->vendor, buf + 8, 8); + dev_scsi->vendor[8] = '\0'; + memcpy(dev_scsi->model, buf + 16, 16); + dev_scsi->model[16] = '\0'; + memcpy(dev_scsi->revision, buf + 32, 4); + dev_scsi->revision[4] = '\0'; + dev_scsi->type = buf[0] & 0x1f; + +out: + close(fd); + return err; +} + +int scsi_get_serial(struct scsi_id_device *dev_scsi, const char *devname, + int page_code, int len) { + unsigned char page0[SCSI_INQ_BUFF_LEN]; + int fd = -EBADF; + int cnt; + int ind; + int retval; + + memzero(dev_scsi->serial, len); + for (cnt = 20; cnt > 0; cnt--) { + fd = open(devname, O_RDONLY | O_NONBLOCK | O_CLOEXEC | O_NOCTTY); + if (fd >= 0 || errno != EBUSY) + break; + + usleep_safe(200U*USEC_PER_MSEC + random_u64_range(100U*USEC_PER_MSEC)); + } + if (fd < 0) + return 1; + + if (page_code == PAGE_80) { + if (do_scsi_page80_inquiry(dev_scsi, fd, dev_scsi->serial, dev_scsi->serial_short, len)) { + retval = 1; + goto completed; + } else { + retval = 0; + goto completed; + } + } else if (page_code == PAGE_83) { + if (do_scsi_page83_inquiry(dev_scsi, fd, dev_scsi->serial, dev_scsi->serial_short, len, dev_scsi->unit_serial_number, dev_scsi->wwn, dev_scsi->wwn_vendor_extension, dev_scsi->tgpt_group)) { + retval = 1; + goto completed; + } else { + retval = 0; + goto completed; + } + } else if (page_code == PAGE_83_PRE_SPC3) { + retval = do_scsi_page83_prespc3_inquiry(dev_scsi, fd, dev_scsi->serial, dev_scsi->serial_short, len); + if (retval) { + /* + * Fallback to servicing a SPC-2/3 compliant page 83 + * inquiry if the page 83 reply format does not + * conform to pre-SPC3 expectations. 
+ */ + if (retval == 2) { + if (do_scsi_page83_inquiry(dev_scsi, fd, dev_scsi->serial, dev_scsi->serial_short, len, dev_scsi->unit_serial_number, dev_scsi->wwn, dev_scsi->wwn_vendor_extension, dev_scsi->tgpt_group)) { + retval = 1; + goto completed; + } else { + retval = 0; + goto completed; + } + } + else { + retval = 1; + goto completed; + } + } else { + retval = 0; + goto completed; + } + } else if (page_code != 0x00) { + log_debug("%s: unsupported page code 0x%d", dev_scsi->kernel, page_code); + retval = 1; + goto completed; + } + + /* + * Get page 0, the page of the pages. By default, try from best to + * worst of supported pages: 0x83 then 0x80. + */ + if (do_scsi_page0_inquiry(dev_scsi, fd, page0, SCSI_INQ_BUFF_LEN)) { + /* + * Don't try anything else. Black list if a specific page + * should be used for this vendor+model, or maybe have an + * optional fall-back to page 0x80 or page 0x83. + */ + retval = 1; + goto completed; + } + + for (ind = 4; ind <= page0[3] + 3; ind++) + if (page0[ind] == PAGE_83) + if (!do_scsi_page83_inquiry(dev_scsi, fd, + dev_scsi->serial, dev_scsi->serial_short, len, dev_scsi->unit_serial_number, dev_scsi->wwn, dev_scsi->wwn_vendor_extension, dev_scsi->tgpt_group)) { + /* + * Success + */ + retval = 0; + goto completed; + } + + for (ind = 4; ind <= page0[3] + 3; ind++) + if (page0[ind] == PAGE_80) + if (!do_scsi_page80_inquiry(dev_scsi, fd, + dev_scsi->serial, dev_scsi->serial_short, len)) { + /* + * Success + */ + retval = 0; + goto completed; + } + retval = 1; + +completed: + close(fd); + return retval; +} diff --git a/src/udev/test-udev-builtin.c b/src/udev/test-udev-builtin.c new file mode 100644 index 0000000..2ce7f19 --- /dev/null +++ b/src/udev/test-udev-builtin.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "tests.h" +#include "udev-builtin.h" + +TEST(udev_builtin_cmd_to_ptr) { + /* Those could have been static asserts, but ({}) is not allowed there. 
*/ +#if HAVE_BLKID + assert_se(UDEV_BUILTIN_CMD_TO_PTR(UDEV_BUILTIN_BLKID)); + assert_se(PTR_TO_UDEV_BUILTIN_CMD(UDEV_BUILTIN_CMD_TO_PTR(UDEV_BUILTIN_BLKID)) == UDEV_BUILTIN_BLKID); +#endif + assert_se(UDEV_BUILTIN_CMD_TO_PTR(UDEV_BUILTIN_BTRFS)); + assert_se(PTR_TO_UDEV_BUILTIN_CMD(UDEV_BUILTIN_CMD_TO_PTR(UDEV_BUILTIN_BTRFS)) == UDEV_BUILTIN_BTRFS); + assert_se(PTR_TO_UDEV_BUILTIN_CMD(UDEV_BUILTIN_CMD_TO_PTR(_UDEV_BUILTIN_INVALID)) == _UDEV_BUILTIN_INVALID); + + assert_se(PTR_TO_UDEV_BUILTIN_CMD(NULL) == _UDEV_BUILTIN_INVALID); + assert_se(PTR_TO_UDEV_BUILTIN_CMD((void*) 10000) == _UDEV_BUILTIN_INVALID); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/udev/test-udev-format.c b/src/udev/test-udev-format.c new file mode 100644 index 0000000..d8e3808 --- /dev/null +++ b/src/udev/test-udev-format.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "string-util.h" +#include "tests.h" +#include "udev-format.h" + +static void test_udev_resolve_subsys_kernel_one(const char *str, bool read_value, int retval, const char *expected) { + char result[PATH_MAX] = ""; + int r; + + r = udev_resolve_subsys_kernel(str, result, sizeof(result), read_value); + log_info("\"%s\" → expect: \"%s\", %d, actual: \"%s\", %d", str, strnull(expected), retval, result, r); + assert_se(r == retval); + if (r >= 0) + assert_se(streq(result, expected)); +} + +TEST(udev_resolve_subsys_kernel) { + test_udev_resolve_subsys_kernel_one("hoge", false, -EINVAL, NULL); + test_udev_resolve_subsys_kernel_one("[hoge", false, -EINVAL, NULL); + test_udev_resolve_subsys_kernel_one("[hoge/foo", false, -EINVAL, NULL); + test_udev_resolve_subsys_kernel_one("[hoge/]", false, -EINVAL, NULL); + + test_udev_resolve_subsys_kernel_one("[net/lo]", false, 0, "/sys/devices/virtual/net/lo"); + test_udev_resolve_subsys_kernel_one("[net/lo]/", false, 0, "/sys/devices/virtual/net/lo"); + test_udev_resolve_subsys_kernel_one("[net/lo]hoge", false, 0, "/sys/devices/virtual/net/lo/hoge"); + 
test_udev_resolve_subsys_kernel_one("[net/lo]/hoge", false, 0, "/sys/devices/virtual/net/lo/hoge"); + + test_udev_resolve_subsys_kernel_one("[net/lo]", true, -EINVAL, NULL); + test_udev_resolve_subsys_kernel_one("[net/lo]/", true, -EINVAL, NULL); + test_udev_resolve_subsys_kernel_one("[net/lo]hoge", true, 0, ""); + test_udev_resolve_subsys_kernel_one("[net/lo]/hoge", true, 0, ""); + test_udev_resolve_subsys_kernel_one("[net/lo]address", true, 0, "00:00:00:00:00:00"); + test_udev_resolve_subsys_kernel_one("[net/lo]/address", true, 0, "00:00:00:00:00:00"); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/udev/test-udev-manager.c b/src/udev/test-udev-manager.c new file mode 100644 index 0000000..d444b0b --- /dev/null +++ b/src/udev/test-udev-manager.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "tests.h" +#include "udev-manager.h" + +TEST(devpath_conflict) { + assert_se(!devpath_conflict(NULL, NULL)); + assert_se(!devpath_conflict(NULL, "/devices/pci0000:00/0000:00:1c.4")); + assert_se(!devpath_conflict("/devices/pci0000:00/0000:00:1c.4", NULL)); + assert_se(!devpath_conflict("/devices/pci0000:00/0000:00:1c.4", "/devices/pci0000:00/0000:00:00.0")); + assert_se(!devpath_conflict("/devices/virtual/net/veth99", "/devices/virtual/net/veth999")); + + assert_se(devpath_conflict("/devices/pci0000:00/0000:00:1c.4", "/devices/pci0000:00/0000:00:1c.4")); + assert_se(devpath_conflict("/devices/pci0000:00/0000:00:1c.4", "/devices/pci0000:00/0000:00:1c.4/0000:3c:00.0")); + assert_se(devpath_conflict("/devices/pci0000:00/0000:00:1c.4/0000:3c:00.0/nvme/nvme0/nvme0n1", + "/devices/pci0000:00/0000:00:1c.4/0000:3c:00.0/nvme/nvme0/nvme0n1/nvme0n1p1")); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/udev/test-udev-node.c b/src/udev/test-udev-node.c new file mode 100644 index 0000000..b5eaa0d --- /dev/null +++ b/src/udev/test-udev-node.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "tests.h" +#include 
"udev-node.h" + +static void test_udev_node_escape_path_one(const char *path, const char *expected) { + char buf[NAME_MAX+1]; + size_t r; + + r = udev_node_escape_path(path, buf, sizeof buf); + log_debug("udev_node_escape_path(%s) -> %s (expected: %s)", path, buf, expected); + assert_se(r == strlen(expected)); + assert_se(streq(buf, expected)); +} + +TEST(udev_node_escape_path) { + char a[NAME_MAX+1], b[NAME_MAX+1]; + + test_udev_node_escape_path_one("/disk/by-id/nvme-eui.1922908022470001001b448b44ccb9d6", "\\x2fdisk\\x2fby-id\\x2fnvme-eui.1922908022470001001b448b44ccb9d6"); + test_udev_node_escape_path_one("/disk/by-id/nvme-eui.1922908022470001001b448b44ccb9d6-part1", "\\x2fdisk\\x2fby-id\\x2fnvme-eui.1922908022470001001b448b44ccb9d6-part1"); + test_udev_node_escape_path_one("/disk/by-id/nvme-eui.1922908022470001001b448b44ccb9d6-part2", "\\x2fdisk\\x2fby-id\\x2fnvme-eui.1922908022470001001b448b44ccb9d6-part2"); + test_udev_node_escape_path_one("/disk/by-id/nvme-WDC_PC_SN720_SDAQNTW-512G-1001_192290802247", "\\x2fdisk\\x2fby-id\\x2fnvme-WDC_PC_SN720_SDAQNTW-512G-1001_192290802247"); + test_udev_node_escape_path_one("/disk/by-id/nvme-WDC_PC_SN720_SDAQNTW-512G-1001_192290802247-part1", "\\x2fdisk\\x2fby-id\\x2fnvme-WDC_PC_SN720_SDAQNTW-512G-1001_192290802247-part1"); + test_udev_node_escape_path_one("/disk/by-id/nvme-WDC_PC_SN720_SDAQNTW-512G-1001_192290802247-part2", "\\x2fdisk\\x2fby-id\\x2fnvme-WDC_PC_SN720_SDAQNTW-512G-1001_192290802247-part2"); + test_udev_node_escape_path_one("/disk/by-id/usb-Generic-_SD_MMC_20120501030900000-0:0", "\\x2fdisk\\x2fby-id\\x2fusb-Generic-_SD_MMC_20120501030900000-0:0"); + + memset(a, 'a', sizeof(a) - 1); + memcpy(a, "/disk/by-id/", strlen("/disk/by-id/")); + char_array_0(a); + + memset(b, 'a', sizeof(b) - 1); + memcpy(b, "\\x2fdisk\\x2fby-id\\x2f", strlen("\\x2fdisk\\x2fby-id\\x2f")); + strcpy(b + sizeof(b) - 12, "N3YhcCqFeID"); + + test_udev_node_escape_path_one(a, b); + + strcpy(a + sizeof(a) - 12 - 9, "N3YhcCqFeID"); + strcpy(b 
+ sizeof(b) - 12, "L1oK9iKWdmi"); + test_udev_node_escape_path_one(a, b); + + strcpy(a + sizeof(a) - 12 - 9, "a"); + strcpy(b + sizeof(b) - 12, "A7oaHBRuuZq"); + test_udev_node_escape_path_one(a, b); + + a[sizeof(a) - 12 - 9] = '\0'; + b[sizeof(a) - 12] = '\0'; + test_udev_node_escape_path_one(a, b); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/udev/test-udev-rule-runner.c b/src/udev/test-udev-rule-runner.c new file mode 100644 index 0000000..72296b3 --- /dev/null +++ b/src/udev/test-udev-rule-runner.c @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2003-2004 Greg Kroah-Hartman +***/ + +#include +#include +#include +#include +#include +#include +#include + +#include "device-private.h" +#include "fs-util.h" +#include "log.h" +#include "main-func.h" +#include "mkdir-label.h" +#include "mount-util.h" +#include "namespace-util.h" +#include "parse-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "tests.h" +#include "udev-event.h" +#include "udev-spawn.h" +#include "version.h" + +static int device_new_from_synthetic_event(sd_device **ret, const char *syspath, const char *action) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + sd_device_action_t a; + int r; + + assert(ret); + assert(syspath); + assert(action); + + a = device_action_from_string(action); + if (a < 0) + return a; + + r = sd_device_new_from_syspath(&dev, syspath); + if (r < 0) + return r; + + r = device_read_uevent_file(dev); + if (r < 0) + return r; + + r = device_set_action(dev, a); + if (r < 0) + return r; + + *ret = TAKE_PTR(dev); + return 0; +} + +static int fake_filesystems(void) { + static const struct fakefs { + const char *src; + const char *target; + const char *error; + bool ignore_mount_error; + } fakefss[] = { + { "tmpfs/sys", "/sys", "Failed to mount test /sys", false }, + { "tmpfs/dev", "/dev", "Failed to mount test /dev", false }, + { "run", "/run", "Failed to mount test /run", false }, + { 
"run", "/etc/udev/rules.d", "Failed to mount empty /etc/udev/rules.d", true }, + { "run", UDEVLIBEXECDIR "/rules.d", "Failed to mount empty " UDEVLIBEXECDIR "/rules.d", true }, + }; + int r; + + r = detach_mount_namespace(); + if (r < 0) + return log_error_errno(r, "Failed to detach mount namespace: %m"); + + for (size_t i = 0; i < ELEMENTSOF(fakefss); i++) { + r = mount_nofollow_verbose(fakefss[i].ignore_mount_error ? LOG_NOTICE : LOG_ERR, + fakefss[i].src, fakefss[i].target, NULL, MS_BIND, NULL); + if (r < 0 && !fakefss[i].ignore_mount_error) + return r; + } + + return 0; +} + +static int run(int argc, char *argv[]) { + _cleanup_(udev_rules_freep) UdevRules *rules = NULL; + _cleanup_(udev_event_freep) UdevEvent *event = NULL; + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + const char *devpath, *devname, *action; + int r; + + test_setup_logging(LOG_INFO); + + if (!IN_SET(argc, 2, 3, 4)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program needs between one and three arguments, %d given", argc - 1); + + r = fake_filesystems(); + if (r < 0) + return r; + + /* Let's make sure the test runs with selinux assumed disabled. 
*/ +#if HAVE_SELINUX + fini_selinuxmnt(); +#endif + mac_selinux_retest(); + + if (argc == 2) { + if (!streq(argv[1], "check")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Unknown argument: %s", argv[1]); + + return 0; + } + + log_debug("version %s", GIT_VERSION); + + r = mac_init(); + if (r < 0) + return r; + + action = argv[1]; + devpath = argv[2]; + + if (argv[3]) { + unsigned us; + + r = safe_atou(argv[3], &us); + if (r < 0) + return log_error_errno(r, "Invalid delay '%s': %m", argv[3]); + usleep_safe(us); + } + + assert_se(udev_rules_load(&rules, RESOLVE_NAME_EARLY) == 0); + + const char *syspath = strjoina("/sys", devpath); + r = device_new_from_synthetic_event(&dev, syspath, action); + if (r < 0) + return log_debug_errno(r, "Failed to open device '%s'", devpath); + + assert_se(event = udev_event_new(dev, 0, NULL, log_get_max_level())); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, SIGHUP, SIGCHLD, -1) >= 0); + + /* do what devtmpfs usually provides us */ + if (sd_device_get_devname(dev, &devname) >= 0) { + const char *subsystem; + mode_t mode = 0600; + + if (sd_device_get_subsystem(dev, &subsystem) >= 0 && streq(subsystem, "block")) + mode |= S_IFBLK; + else + mode |= S_IFCHR; + + if (!streq(action, "remove")) { + dev_t devnum = makedev(0, 0); + + (void) mkdir_parents_label(devname, 0755); + (void) sd_device_get_devnum(dev, &devnum); + if (mknod(devname, mode, devnum) < 0) + return log_error_errno(errno, "mknod() failed for '%s': %m", devname); + } else { + if (unlink(devname) < 0) + return log_error_errno(errno, "unlink('%s') failed: %m", devname); + (void) rmdir_parents(devname, "/dev"); + } + } + + udev_event_execute_rules(event, -1, 3 * USEC_PER_SEC, SIGKILL, NULL, rules); + udev_event_execute_run(event, 3 * USEC_PER_SEC, SIGKILL); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/udev/test-udev-rules.c b/src/udev/test-udev-rules.c new file mode 100644 index 0000000..b62b08b --- /dev/null +++ 
b/src/udev/test-udev-rules.c @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "string-util.h" +#include "tests.h" +#include "udev-rules.h" + +static void test_udev_rule_parse_value_one(const char *in, const char *expected_value, int expected_retval) { + _cleanup_free_ char *str = NULL; + char *value = UINT_TO_PTR(0x12345678U); + char *endpos = UINT_TO_PTR(0x87654321U); + + log_info("/* %s (%s, %s, %d) */", __func__, in, strnull(expected_value), expected_retval); + + assert_se(str = strdup(in)); + assert_se(udev_rule_parse_value(str, &value, &endpos) == expected_retval); + if (expected_retval < 0) { + /* not modified on failure */ + assert_se(value == UINT_TO_PTR(0x12345678U)); + assert_se(endpos == UINT_TO_PTR(0x87654321U)); + } else { + assert_se(streq_ptr(value, expected_value)); + assert_se(endpos == str + strlen(in)); + /* + * The return value must be terminated by two subsequent NULs + * so it could be safely interpreted as nulstr. + */ + assert_se(value[strlen(value) + 1] == '\0'); + } +} + +TEST(udev_rule_parse_value) { + /* input: "valid operand" + * parsed: valid operand + * use the following command to help generate textual C strings: + * python3 -c 'import json; print(json.dumps(input()))' */ + test_udev_rule_parse_value_one("\"valid operand\"", "valid operand", 0); + /* input: "va'l\'id\"op\"erand" + * parsed: va'l\'id"op"erand */ + test_udev_rule_parse_value_one("\"va'l\\'id\\\"op\\\"erand\"", "va'l\\'id\"op\"erand", 0); + test_udev_rule_parse_value_one("no quotes", NULL, -EINVAL); + test_udev_rule_parse_value_one("\"\\\\a\\b\\x\\y\"", "\\\\a\\b\\x\\y", 0); + test_udev_rule_parse_value_one("\"reject\0nul\"", NULL, -EINVAL); + /* input: e"" */ + test_udev_rule_parse_value_one("e\"\"", "", 0); + /* input: e"1234" */ + test_udev_rule_parse_value_one("e\"1234\"", "1234", 0); + /* input: e"\"" */ + test_udev_rule_parse_value_one("e\"\\\"\"", "\"", 0); + /* input: e"\ */ + test_udev_rule_parse_value_one("e\"\\", NULL, -EINVAL); 
+ /* input: e"\" */ + test_udev_rule_parse_value_one("e\"\\\"", NULL, -EINVAL); + /* input: e"\\" */ + test_udev_rule_parse_value_one("e\"\\\\\"", "\\", 0); + /* input: e"\\\" */ + test_udev_rule_parse_value_one("e\"\\\\\\\"", NULL, -EINVAL); + /* input: e"\\\"" */ + test_udev_rule_parse_value_one("e\"\\\\\\\"\"", "\\\"", 0); + /* input: e"\\\\" */ + test_udev_rule_parse_value_one("e\"\\\\\\\\\"", "\\\\", 0); + /* input: e"operand with newline\n" */ + test_udev_rule_parse_value_one("e\"operand with newline\\n\"", "operand with newline\n", 0); + /* input: e"single\rcharacter\t\aescape\bsequence" */ + test_udev_rule_parse_value_one( + "e\"single\\rcharacter\\t\\aescape\\bsequence\"", "single\rcharacter\t\aescape\bsequence", 0); + /* input: e"reject\invalid escape sequence" */ + test_udev_rule_parse_value_one("e\"reject\\invalid escape sequence", NULL, -EINVAL); + /* input: e"\ */ + test_udev_rule_parse_value_one("e\"\\", NULL, -EINVAL); + /* input: "s\u1d1c\u1d04\u029c \u1d1c\u0274\u026a\u1d04\u1d0f\u1d05\u1d07 \U0001d568\U0001d560\U0001d568" */ + test_udev_rule_parse_value_one( + "e\"s\\u1d1c\\u1d04\\u029c \\u1d1c\\u0274\\u026a\\u1d04\\u1d0f\\u1d05\\u1d07 \\U0001d568\\U0001d560\\U0001d568\"", + "s\xe1\xb4\x9c\xe1\xb4\x84\xca\x9c \xe1\xb4\x9c\xc9\xb4\xc9\xaa\xe1\xb4\x84\xe1\xb4\x8f\xe1\xb4\x85\xe1\xb4\x87 \xf0\x9d\x95\xa8\xf0\x9d\x95\xa0\xf0\x9d\x95\xa8", + 0); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/udev/test-udev-spawn.c b/src/udev/test-udev-spawn.c new file mode 100644 index 0000000..4f43fac --- /dev/null +++ b/src/udev/test-udev-spawn.c @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "path-util.h" +#include "signal-util.h" +#include "strv.h" +#include "tests.h" +#include "udev-event.h" +#include "udev-spawn.h" + +#define BUF_SIZE 1024 + +static void test_event_spawn_core(bool with_pidfd, const char *cmd, char *result_buf, size_t buf_size) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + 
_cleanup_(udev_event_freep) UdevEvent *event = NULL; + + assert_se(setenv("SYSTEMD_PIDFD", yes_no(with_pidfd), 1) >= 0); + + assert_se(sd_device_new_from_syspath(&dev, "/sys/class/net/lo") >= 0); + assert_se(event = udev_event_new(dev, 0, NULL, LOG_DEBUG)); + assert_se(udev_event_spawn(event, 5 * USEC_PER_SEC, SIGKILL, false, cmd, result_buf, buf_size, NULL) == 0); + + assert_se(unsetenv("SYSTEMD_PIDFD") >= 0); +} + +static void test_event_spawn_cat(bool with_pidfd, size_t buf_size) { + _cleanup_strv_free_ char **lines = NULL; + _cleanup_free_ char *cmd = NULL; + char result_buf[BUF_SIZE]; + + log_debug("/* %s(%s) */", __func__, yes_no(with_pidfd)); + + assert_se(find_executable("cat", &cmd) >= 0); + assert_se(strextend_with_separator(&cmd, " ", "/sys/class/net/lo/uevent")); + + test_event_spawn_core(with_pidfd, cmd, result_buf, + buf_size >= BUF_SIZE ? BUF_SIZE : buf_size); + + assert_se(lines = strv_split_newlines(result_buf)); + strv_print(lines); + + if (buf_size >= BUF_SIZE) { + assert_se(strv_contains(lines, "INTERFACE=lo")); + assert_se(strv_contains(lines, "IFINDEX=1")); + } +} + +static void test_event_spawn_self(const char *self, const char *arg, bool with_pidfd) { + _cleanup_strv_free_ char **lines = NULL; + _cleanup_free_ char *cmd = NULL; + char result_buf[BUF_SIZE]; + + log_debug("/* %s(%s, %s) */", __func__, arg, yes_no(with_pidfd)); + + assert_se(cmd = strjoin(self, " ", arg)); + + test_event_spawn_core(with_pidfd, cmd, result_buf, BUF_SIZE); + + assert_se(lines = strv_split_newlines(result_buf)); + strv_print(lines); + + assert_se(strv_contains(lines, "aaa")); + assert_se(strv_contains(lines, "bbb")); +} + +static void test1(void) { + fprintf(stdout, "aaa\nbbb"); + fprintf(stderr, "ccc\nddd"); +} + +static void test2(void) { + char buf[16384]; + + fprintf(stdout, "aaa\nbbb"); + + memset(buf, 'a', sizeof(buf) - 1); + char_array_0(buf); + fputs(buf, stderr); +} + +int main(int argc, char *argv[]) { + _cleanup_free_ char *self = NULL; + + if (argc > 
1) { + if (streq(argv[1], "test1")) + test1(); + else if (streq(argv[1], "test2")) + test2(); + else + assert_not_reached(); + + return 0; + } + + test_setup_logging(LOG_DEBUG); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + + test_event_spawn_cat(true, SIZE_MAX); + test_event_spawn_cat(false, SIZE_MAX); + test_event_spawn_cat(true, 5); + test_event_spawn_cat(false, 5); + + assert_se(path_make_absolute_cwd(argv[0], &self) >= 0); + path_simplify(self); + + test_event_spawn_self(self, "test1", true); + test_event_spawn_self(self, "test1", false); + + test_event_spawn_self(self, "test2", true); + test_event_spawn_self(self, "test2", false); + + return 0; +} diff --git a/src/udev/udev-builtin-blkid.c b/src/udev/udev-builtin-blkid.c new file mode 100644 index 0000000..11419a3 --- /dev/null +++ b/src/udev/udev-builtin-blkid.c @@ -0,0 +1,474 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * probe disks for filesystems and partitions + * + * Copyright © 2011 Karel Zak + */ + +#if HAVE_VALGRIND_MEMCHECK_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "blkid-util.h" +#include "device-util.h" +#include "devnum-util.h" +#include "efi-loader.h" +#include "errno-util.h" +#include "fd-util.h" +#include "gpt.h" +#include "parse-util.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "udev-builtin.h" + +static void print_property(sd_device *dev, bool test, const char *name, const char *value) { + char s[256]; + + s[0] = '\0'; + + if (streq(name, "TYPE")) { + udev_builtin_add_property(dev, test, "ID_FS_TYPE", value); + + } else if (streq(name, "USAGE")) { + udev_builtin_add_property(dev, test, "ID_FS_USAGE", value); + + } else if (streq(name, "VERSION")) { + udev_builtin_add_property(dev, test, "ID_FS_VERSION", value); + + } else if (streq(name, "UUID")) { + blkid_safe_string(value, s, sizeof(s)); + 
udev_builtin_add_property(dev, test, "ID_FS_UUID", s); + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_UUID_ENC", s); + + } else if (streq(name, "UUID_SUB")) { + blkid_safe_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_UUID_SUB", s); + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_UUID_SUB_ENC", s); + + } else if (streq(name, "LABEL")) { + blkid_safe_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_LABEL", s); + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_LABEL_ENC", s); + + } else if (STR_IN_SET(name, "FSSIZE", "FSLASTBLOCK", "FSBLOCKSIZE")) { + strscpyl(s, sizeof(s), "ID_FS_", name + 2, NULL); + udev_builtin_add_property(dev, test, s, value); + + } else if (streq(name, "PTTYPE")) { + udev_builtin_add_property(dev, test, "ID_PART_TABLE_TYPE", value); + + } else if (streq(name, "PTUUID")) { + udev_builtin_add_property(dev, test, "ID_PART_TABLE_UUID", value); + + } else if (streq(name, "PART_ENTRY_NAME")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_PART_ENTRY_NAME", s); + + } else if (streq(name, "PART_ENTRY_TYPE")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_PART_ENTRY_TYPE", s); + + } else if (startswith(name, "PART_ENTRY_")) { + strscpyl(s, sizeof(s), "ID_", name, NULL); + udev_builtin_add_property(dev, test, s, value); + + } else if (streq(name, "SYSTEM_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_SYSTEM_ID", s); + + } else if (streq(name, "PUBLISHER_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_PUBLISHER_ID", s); + + } else if (streq(name, "APPLICATION_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_APPLICATION_ID", s); + + } else if 
(streq(name, "BOOT_SYSTEM_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_BOOT_SYSTEM_ID", s); + + } else if (streq(name, "VOLUME_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_VOLUME_ID", s); + + } else if (streq(name, "LOGICAL_VOLUME_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_LOGICAL_VOLUME_ID", s); + + } else if (streq(name, "VOLUME_SET_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_VOLUME_SET_ID", s); + + } else if (streq(name, "DATA_PREPARER_ID")) { + blkid_encode_string(value, s, sizeof(s)); + udev_builtin_add_property(dev, test, "ID_FS_DATA_PREPARER_ID", s); + } +} + +static int find_gpt_root(sd_device *dev, blkid_probe pr, bool test) { + +#if defined(SD_GPT_ROOT_NATIVE) && ENABLE_EFI + + _cleanup_free_ char *root_label = NULL; + bool found_esp_or_xbootldr = false; + sd_id128_t root_id = SD_ID128_NULL; + int r; + + assert(pr); + + /* Iterate through the partitions on this disk, and see if the UEFI ESP or XBOOTLDR partition we + * booted from is on it. If so, find the first root disk, and add a property indicating its partition + * UUID. 
*/ + + errno = 0; + blkid_partlist pl = blkid_probe_get_partitions(pr); + if (!pl) + return errno_or_else(ENOMEM); + + int nvals = blkid_partlist_numof_partitions(pl); + for (int i = 0; i < nvals; i++) { + blkid_partition pp; + const char *label; + sd_id128_t type, id; + + pp = blkid_partlist_get_partition(pl, i); + if (!pp) + continue; + + r = blkid_partition_get_uuid_id128(pp, &id); + if (r < 0) { + log_debug_errno(r, "Failed to get partition UUID, ignoring: %m"); + continue; + } + + r = blkid_partition_get_type_id128(pp, &type); + if (r < 0) { + log_debug_errno(r, "Failed to get partition type UUID, ignoring: %m"); + continue; + } + + label = blkid_partition_get_name(pp); /* returns NULL if empty */ + + if (sd_id128_in_set(type, SD_GPT_ESP, SD_GPT_XBOOTLDR)) { + sd_id128_t esp_or_xbootldr; + + /* We found an ESP or XBOOTLDR, let's see if it matches the ESP/XBOOTLDR we booted from. */ + + r = efi_loader_get_device_part_uuid(&esp_or_xbootldr); + if (r < 0) + return r; + + if (sd_id128_equal(id, esp_or_xbootldr)) + found_esp_or_xbootldr = true; + + } else if (sd_id128_equal(type, SD_GPT_ROOT_NATIVE)) { + unsigned long long flags; + + flags = blkid_partition_get_flags(pp); + if (flags & SD_GPT_FLAG_NO_AUTO) + continue; + + /* We found a suitable root partition, let's remember the first one, or the one with + * the newest version, as determined by comparing the partition labels. */ + + if (sd_id128_is_null(root_id) || strverscmp_improved(label, root_label) > 0) { + root_id = id; + + r = free_and_strdup(&root_label, label); + if (r < 0) + return r; + } + } + } + + /* We found the ESP/XBOOTLDR on this disk, and also found a root partition, nice! 
Let's export its + * UUID */ + if (found_esp_or_xbootldr && !sd_id128_is_null(root_id)) + udev_builtin_add_property(dev, test, "ID_PART_GPT_AUTO_ROOT_UUID", SD_ID128_TO_UUID_STRING(root_id)); +#endif + + return 0; +} + +static int probe_superblocks(blkid_probe pr) { + struct stat st; + int rc; + + /* TODO: Return negative errno. */ + + if (fstat(blkid_probe_get_fd(pr), &st)) + return -errno; + + blkid_probe_enable_partitions(pr, 1); + + if (!S_ISCHR(st.st_mode) && + blkid_probe_get_size(pr) <= 1024 * 1440 && + blkid_probe_is_wholedisk(pr)) { + /* + * check if the small disk is partitioned, if yes then + * don't probe for filesystems. + */ + blkid_probe_enable_superblocks(pr, 0); + + rc = blkid_do_fullprobe(pr); + if (rc < 0) + return rc; /* -1 = error, 1 = nothing, 0 = success */ + + if (blkid_probe_lookup_value(pr, "PTTYPE", NULL, NULL) == 0) + return 0; /* partition table detected */ + } + + blkid_probe_set_partitions_flags(pr, BLKID_PARTS_ENTRY_DETAILS); + blkid_probe_enable_superblocks(pr, 1); + + return blkid_do_safeprobe(pr); +} + +static int read_loopback_backing_inode( + sd_device *dev, + int fd, + dev_t *ret_devno, + ino_t *ret_inode, + char **ret_fname) { + + _cleanup_free_ char *fn = NULL; + struct loop_info64 info; + const char *name; + int r; + + assert(dev); + assert(fd >= 0); + assert(ret_devno); + assert(ret_inode); + assert(ret_fname); + + /* Retrieves various fields of the current loopback device backing file, so that we can ultimately + * use it to create stable symlinks to loopback block devices, based on what they are backed by. We + * pick up inode/device as well as file name field. Note that we pick up the "lo_file_name" field + * here, which is an arbitrary free-form string provided by userspace. We do not return the sysfs + * attribute loop/backing_file here, because that is directly accessible from udev rules anyway. 
And + * sometimes, depending on context, it's a good thing to return the string userspace can freely pick + * over the string automatically generated by the kernel. */ + + r = sd_device_get_sysname(dev, &name); + if (r < 0) + return r; + + if (!startswith(name, "loop")) + goto notloop; + + if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) { + if (ERRNO_IS_NOT_SUPPORTED(errno)) + goto notloop; + + return -errno; + } + +#if HAVE_VALGRIND_MEMCHECK_H + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + + if (isempty((char*) info.lo_file_name) || + strnlen((char*) info.lo_file_name, sizeof(info.lo_file_name)-1) == sizeof(info.lo_file_name)-1) + /* Don't pick up file name if it is unset or possibly truncated. (Note: the kernel silently + * truncates the string passed from userspace by LOOP_SET_STATUS64 ioctl. See + * loop_set_status_from_info() in drivers/block/loop.c. Hence, we can't really know the file + * name is truncated if it uses sizeof(info.lo_file_name)-1 as length; it could also mean the + * string is just that long and wasn't truncated — but the fact is simply that we cannot know + * in that case if it was truncated or not. Thus, we assume the worst and suppress — at least + * for now. For shorter strings we know for sure it wasn't truncated, hence that's always + * safe.) 
*/ + fn = NULL; + else { + fn = memdup_suffix0(info.lo_file_name, sizeof(info.lo_file_name)); + if (!fn) + return -ENOMEM; + } + + *ret_inode = info.lo_inode; + *ret_devno = info.lo_device; + *ret_fname = TAKE_PTR(fn); + return 1; + + +notloop: + *ret_devno = 0; + *ret_inode = 0; + *ret_fname = NULL; + return 0; +} + +static int builtin_blkid(UdevEvent *event, int argc, char *argv[], bool test) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + const char *devnode, *root_partition = NULL, *data, *name; + _cleanup_(blkid_free_probep) blkid_probe pr = NULL; + _cleanup_free_ char *backing_fname = NULL; + bool noraid = false, is_gpt = false; + _cleanup_close_ int fd = -EBADF; + ino_t backing_inode = 0; + dev_t backing_devno = 0; + int64_t offset = 0; + int r; + + static const struct option options[] = { + { "offset", required_argument, NULL, 'o' }, + { "hint", required_argument, NULL, 'H' }, + { "noraid", no_argument, NULL, 'R' }, + {} + }; + + errno = 0; + pr = blkid_new_probe(); + if (!pr) + return log_device_debug_errno(dev, errno_or_else(ENOMEM), "Failed to create blkid prober: %m"); + + for (;;) { + int option; + + option = getopt_long(argc, argv, "o:H:R", options, NULL); + if (option == -1) + break; + + switch (option) { + case 'H': +#if HAVE_BLKID_PROBE_SET_HINT + errno = 0; + r = blkid_probe_set_hint(pr, optarg, 0); + if (r < 0) + return log_device_error_errno(dev, errno_or_else(ENOMEM), "Failed to use '%s' probing hint: %m", optarg); + break; +#else + /* Use the hint = as probing offset for old versions */ + optarg = strchr(optarg, '='); + if (!optarg) + /* no value means 0, do nothing for old versions */ + break; + ++optarg; + _fallthrough_; +#endif + case 'o': + r = safe_atoi64(optarg, &offset); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to parse '%s' as an integer: %m", optarg); + if (offset < 0) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL), "Invalid offset %"PRIi64": %m", offset); + break; + case 'R': + noraid = 
true; + break; + } + } + + blkid_probe_set_superblocks_flags(pr, + BLKID_SUBLKS_LABEL | BLKID_SUBLKS_UUID | + BLKID_SUBLKS_TYPE | BLKID_SUBLKS_SECTYPE | +#ifdef BLKID_SUBLKS_FSINFO + BLKID_SUBLKS_FSINFO | +#endif + BLKID_SUBLKS_USAGE | BLKID_SUBLKS_VERSION); + + if (noraid) + blkid_probe_filter_superblocks_usage(pr, BLKID_FLTR_NOTIN, BLKID_USAGE_RAID); + + r = sd_device_get_devname(dev, &devnode); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device name: %m"); + + fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) { + bool ignore = ERRNO_IS_DEVICE_ABSENT(fd); + log_device_debug_errno(dev, fd, "Failed to open block device %s%s: %m", + devnode, ignore ? ", ignoring" : ""); + return ignore ? 0 : fd; + } + + errno = 0; + r = blkid_probe_set_device(pr, fd, offset, 0); + if (r < 0) + return log_device_debug_errno(dev, errno_or_else(ENOMEM), "Failed to set device to blkid prober: %m"); + + log_device_debug(dev, "Probe %s with %sraid and offset=%"PRIi64, devnode, noraid ? "no" : "", offset); + + r = probe_superblocks(pr); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to probe superblocks: %m"); + + /* If the device is a partition then its parent passed the root partition UUID to the device */ + (void) sd_device_get_property_value(dev, "ID_PART_GPT_AUTO_ROOT_UUID", &root_partition); + + errno = 0; + int nvals = blkid_probe_numof_values(pr); + if (nvals < 0) + return log_device_debug_errno(dev, errno_or_else(ENOMEM), "Failed to get number of probed values: %m"); + + for (int i = 0; i < nvals; i++) { + if (blkid_probe_get_value(pr, i, &name, &data, NULL) < 0) + continue; + + print_property(dev, test, name, data); + + /* Is this a disk with GPT partition table? */ + if (streq(name, "PTTYPE") && streq(data, "gpt")) + is_gpt = true; + + /* Is this a partition that matches the root partition + * property inherited from the parent? 
*/ + if (root_partition && streq(name, "PART_ENTRY_UUID") && streq(data, root_partition)) + udev_builtin_add_property(dev, test, "ID_PART_GPT_AUTO_ROOT", "1"); + } + + if (is_gpt) + find_gpt_root(dev, pr, test); + + r = read_loopback_backing_inode( + dev, + fd, + &backing_devno, + &backing_inode, + &backing_fname); + if (r < 0) + log_device_debug_errno(dev, r, "Failed to read loopback backing inode, ignoring: %m"); + else if (r > 0) { + udev_builtin_add_propertyf(dev, test, "ID_LOOP_BACKING_DEVICE", DEVNUM_FORMAT_STR, DEVNUM_FORMAT_VAL(backing_devno)); + udev_builtin_add_propertyf(dev, test, "ID_LOOP_BACKING_INODE", "%" PRIu64, (uint64_t) backing_inode); + + if (backing_fname) { + /* In the worst case blkid_encode_string() will blow up to 4x the string + * length. Hence size the buffer to 4x of the longest string + * read_loopback_backing_inode() might return */ + char encoded[sizeof_field(struct loop_info64, lo_file_name) * 4 + 1]; + + assert(strlen(backing_fname) < ELEMENTSOF(encoded) / 4); + blkid_encode_string(backing_fname, encoded, ELEMENTSOF(encoded)); + + udev_builtin_add_property(dev, test, "ID_LOOP_BACKING_FILENAME", backing_fname); + udev_builtin_add_property(dev, test, "ID_LOOP_BACKING_FILENAME_ENC", encoded); + } + } + + return 0; +} + +const UdevBuiltin udev_builtin_blkid = { + .name = "blkid", + .cmd = builtin_blkid, + .help = "Filesystem and partition probing", + .run_once = true, +}; diff --git a/src/udev/udev-builtin-btrfs.c b/src/udev/udev-builtin-btrfs.c new file mode 100644 index 0000000..9b12aeb --- /dev/null +++ b/src/udev/udev-builtin-btrfs.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include "device-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "string-util.h" +#include "strxcpyx.h" +#include "udev-builtin.h" + +static int builtin_btrfs(UdevEvent *event, int argc, char *argv[], bool test) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + struct 
btrfs_ioctl_vol_args args = {}; + _cleanup_close_ int fd = -EBADF; + int r; + + if (argc != 3 || !streq(argv[1], "ready")) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL), "Invalid arguments"); + + fd = open("/dev/btrfs-control", O_RDWR|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + if (ERRNO_IS_DEVICE_ABSENT(errno)) { + /* Driver not installed? Then we aren't ready. This is useful in initrds that lack + * btrfs.ko. After the host transition (where btrfs.ko will hopefully become + * available) the device can be retriggered and will then be considered ready. */ + udev_builtin_add_property(dev, test, "ID_BTRFS_READY", "0"); + return 0; + } + + return log_device_debug_errno(dev, errno, "Failed to open /dev/btrfs-control: %m"); + } + + strscpy(args.name, sizeof(args.name), argv[2]); + r = ioctl(fd, BTRFS_IOC_DEVICES_READY, &args); + if (r < 0) + return log_device_debug_errno(dev, errno, "Failed to call BTRFS_IOC_DEVICES_READY: %m"); + + udev_builtin_add_property(dev, test, "ID_BTRFS_READY", one_zero(r == 0)); + return 0; +} + +const UdevBuiltin udev_builtin_btrfs = { + .name = "btrfs", + .cmd = builtin_btrfs, + .help = "btrfs volume management", +}; diff --git a/src/udev/udev-builtin-hwdb.c b/src/udev/udev-builtin-hwdb.c new file mode 100644 index 0000000..19e07e7 --- /dev/null +++ b/src/udev/udev-builtin-hwdb.c @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "sd-hwdb.h" + +#include "alloc-util.h" +#include "device-util.h" +#include "hwdb-util.h" +#include "parse-util.h" +#include "string-util.h" +#include "udev-builtin.h" + +static sd_hwdb *hwdb; + +int udev_builtin_hwdb_lookup(sd_device *dev, + const char *prefix, const char *modalias, + const char *filter, bool test) { + _cleanup_free_ char *lookup = NULL; + const char *key, *value; + int n = 0, r; + + if (!hwdb) + return -ENOENT; + + if (prefix) { + lookup = strjoin(prefix, modalias); + if (!lookup) + return -ENOMEM; + modalias 
= lookup; + } + + SD_HWDB_FOREACH_PROPERTY(hwdb, modalias, key, value) { + if (filter && fnmatch(filter, key, FNM_NOESCAPE) != 0) + continue; + + r = udev_builtin_add_property(dev, test, key, value); + if (r < 0) + return r; + n++; + } + return n; +} + +static const char *modalias_usb(sd_device *dev, char *s, size_t size) { + const char *v, *p, *n = NULL; + uint16_t vn, pn; + + if (sd_device_get_sysattr_value(dev, "idVendor", &v) < 0) + return NULL; + if (sd_device_get_sysattr_value(dev, "idProduct", &p) < 0) + return NULL; + if (safe_atoux16(v, &vn) < 0) + return NULL; + if (safe_atoux16(p, &pn) < 0) + return NULL; + (void) sd_device_get_sysattr_value(dev, "product", &n); + + (void) snprintf(s, size, "usb:v%04Xp%04X:%s", vn, pn, strempty(n)); + return s; +} + +static int udev_builtin_hwdb_search(sd_device *dev, sd_device *srcdev, + const char *subsystem, const char *prefix, + const char *filter, bool test) { + char s[LINE_MAX]; + bool last = false; + int r = 0; + + assert(dev); + + if (!srcdev) + srcdev = dev; + + for (sd_device *d = srcdev; d; ) { + const char *dsubsys, *devtype, *modalias = NULL; + + if (sd_device_get_subsystem(d, &dsubsys) < 0) + goto next; + + /* look only at devices of a specific subsystem */ + if (subsystem && !streq(dsubsys, subsystem)) + goto next; + + (void) sd_device_get_property_value(d, "MODALIAS", &modalias); + + if (streq(dsubsys, "usb") && + sd_device_get_devtype(d, &devtype) >= 0 && + streq(devtype, "usb_device")) { + /* if the usb_device does not have a modalias, compose one */ + if (!modalias) + modalias = modalias_usb(d, s, sizeof(s)); + + /* avoid looking at any parent device, they are usually just a USB hub */ + last = true; + } + + if (!modalias) + goto next; + + log_device_debug(dev, "hwdb modalias key: \"%s\"", modalias); + + r = udev_builtin_hwdb_lookup(dev, prefix, modalias, filter, test); + if (r > 0) + break; + + if (last) + break; +next: + if (sd_device_get_parent(d, &d) < 0) + break; + } + + return r; +} + +static int 
builtin_hwdb(UdevEvent *event, int argc, char *argv[], bool test) { + static const struct option options[] = { + { "filter", required_argument, NULL, 'f' }, + { "device", required_argument, NULL, 'd' }, + { "subsystem", required_argument, NULL, 's' }, + { "lookup-prefix", required_argument, NULL, 'p' }, + {} + }; + const char *filter = NULL; + const char *device = NULL; + const char *subsystem = NULL; + const char *prefix = NULL; + _cleanup_(sd_device_unrefp) sd_device *srcdev = NULL; + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + int r; + + if (!hwdb) + return -EINVAL; + + for (;;) { + int option; + + option = getopt_long(argc, argv, "f:d:s:p:", options, NULL); + if (option == -1) + break; + + switch (option) { + case 'f': + filter = optarg; + break; + + case 'd': + device = optarg; + break; + + case 's': + subsystem = optarg; + break; + + case 'p': + prefix = optarg; + break; + } + } + + /* query a specific key given as argument */ + if (argv[optind]) { + r = udev_builtin_hwdb_lookup(dev, prefix, argv[optind], filter, test); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to look up hwdb: %m"); + if (r == 0) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ENODATA), "No entry found from hwdb."); + return r; + } + + /* read data from another device than the device we will store the data */ + if (device) { + r = sd_device_new_from_device_id(&srcdev, device); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to create sd_device object '%s': %m", device); + } + + r = udev_builtin_hwdb_search(dev, srcdev, subsystem, prefix, filter, test); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to look up hwdb: %m"); + if (r == 0) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ENODATA), "No entry found from hwdb."); + return r; +} + +/* called at udev startup and reload */ +static int builtin_hwdb_init(void) { + int r; + + if (hwdb) + return 0; + + r = sd_hwdb_new(&hwdb); + if (r < 0) + return r; + + return 0; +} + +/* 
/* Converts the value range of an absolute axis into a physical size: (maximum - minimum) divided by
 * the axis resolution. The caller must make sure the resolution is non-zero. */
static int abs_size_mm(const struct input_absinfo *absinfo) {
        /* Resolution is defined to be in units/mm for ABS_X/Y */
        int span = absinfo->maximum - absinfo->minimum;

        return span / absinfo->resolution;
}
height[DECIMAL_STR_MAX(int)]; + struct input_absinfo xabsinfo = {}, yabsinfo = {}; + _cleanup_close_ int fd = -EBADF; + + fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return; + + if (ioctl(fd, EVIOCGABS(ABS_X), &xabsinfo) < 0 || + ioctl(fd, EVIOCGABS(ABS_Y), &yabsinfo) < 0) + return; + + if (xabsinfo.resolution <= 0 || yabsinfo.resolution <= 0) + return; + + xsprintf(width, "%d", abs_size_mm(&xabsinfo)); + xsprintf(height, "%d", abs_size_mm(&yabsinfo)); + + udev_builtin_add_property(dev, test, "ID_INPUT_WIDTH_MM", width); + udev_builtin_add_property(dev, test, "ID_INPUT_HEIGHT_MM", height); +} + +/* + * Read a capability attribute and return bitmask. + * @param dev sd_device + * @param attr sysfs attribute name (e. g. "capabilities/key") + * @param bitmask: Output array which has a sizeof of bitmask_size + */ +static void get_cap_mask(sd_device *pdev, const char* attr, + unsigned long *bitmask, size_t bitmask_size, + bool test) { + const char *v; + char text[4096]; + unsigned i; + char* word; + unsigned long val; + int r; + + if (sd_device_get_sysattr_value(pdev, attr, &v) < 0) + v = ""; + + xsprintf(text, "%s", v); + log_device_debug(pdev, "%s raw kernel attribute: %s", attr, text); + + memzero(bitmask, bitmask_size); + i = 0; + while ((word = strrchr(text, ' '))) { + r = safe_atolu_full(word+1, 16, &val); + if (r < 0) + log_device_debug_errno(pdev, r, "Ignoring %s block which failed to parse: %m", attr); + else if (i < bitmask_size / sizeof(unsigned long)) + bitmask[i] = val; + else + log_device_debug(pdev, "Ignoring %s block %lX which is larger than maximum size", attr, val); + *word = '\0'; + i++; + } + r = safe_atolu_full(text, 16, &val); + if (r < 0) + log_device_debug_errno(pdev, r, "Ignoring %s block which failed to parse: %m", attr); + else if (i < bitmask_size / sizeof(unsigned long)) + bitmask[i] = val; + else + log_device_debug(pdev, "Ignoring %s block %lX which is larger than maximum size", attr, val); + + if (test 
&& DEBUG_LOGGING) { + log_device_debug(pdev, "%s decoded bit map:", attr); + + val = bitmask_size / sizeof (unsigned long); + /* skip trailing zeros */ + while (bitmask[val-1] == 0 && val > 0) + --val; + + /* IN_SET() cannot be used in assert_cc(). */ + assert_cc(sizeof(unsigned long) == 4 || sizeof(unsigned long) == 8); + for (unsigned long j = 0; j < val; j++) + log_device_debug(pdev, + sizeof(unsigned long) == 4 ? " bit %4lu: %08lX\n" : " bit %4lu: %016lX\n", + j * BITS_PER_LONG, bitmask[j]); + } +} + +static struct input_id get_input_id(sd_device *dev) { + const char *v; + struct input_id id = {}; + + if (sd_device_get_sysattr_value(dev, "id/bustype", &v) >= 0) + (void) safe_atoux16(v, &id.bustype); + if (sd_device_get_sysattr_value(dev, "id/vendor", &v) >= 0) + (void) safe_atoux16(v, &id.vendor); + if (sd_device_get_sysattr_value(dev, "id/product", &v) >= 0) + (void) safe_atoux16(v, &id.product); + if (sd_device_get_sysattr_value(dev, "id/version", &v) >= 0) + (void) safe_atoux16(v, &id.version); + + return id; +} + +/* pointer devices */ +static bool test_pointers(sd_device *dev, + const struct input_id *id, + const unsigned long* bitmask_ev, + const unsigned long* bitmask_abs, + const unsigned long* bitmask_key, + const unsigned long* bitmask_rel, + const unsigned long* bitmask_props, + bool test) { + bool has_abs_coordinates = false; + bool has_rel_coordinates = false; + bool has_mt_coordinates = false; + size_t num_joystick_axes = 0; + size_t num_joystick_buttons = 0; + bool has_pad_buttons = false; + bool is_direct = false; + bool has_touch = false; + bool has_3d_coordinates = false; + bool has_keys = false; + bool has_stylus = false; + bool has_pen = false; + bool finger_but_no_pen = false; + bool has_mouse_button = false; + bool is_mouse = false; + bool is_abs_mouse = false; + bool is_touchpad = false; + bool is_touchscreen = false; + bool is_tablet = false; + bool is_tablet_pad = false; + bool is_joystick = false; + bool is_accelerometer = false; + 
bool is_pointing_stick = false; + bool has_wheel = false; + + has_keys = test_bit(EV_KEY, bitmask_ev); + has_abs_coordinates = test_bit(ABS_X, bitmask_abs) && test_bit(ABS_Y, bitmask_abs); + has_3d_coordinates = has_abs_coordinates && test_bit(ABS_Z, bitmask_abs); + is_accelerometer = test_bit(INPUT_PROP_ACCELEROMETER, bitmask_props); + + if (!has_keys && has_3d_coordinates) + is_accelerometer = true; + + if (is_accelerometer) { + udev_builtin_add_property(dev, test, "ID_INPUT_ACCELEROMETER", "1"); + return true; + } + + is_pointing_stick = test_bit(INPUT_PROP_POINTING_STICK, bitmask_props); + has_stylus = test_bit(BTN_STYLUS, bitmask_key); + has_pen = test_bit(BTN_TOOL_PEN, bitmask_key); + finger_but_no_pen = test_bit(BTN_TOOL_FINGER, bitmask_key) && !test_bit(BTN_TOOL_PEN, bitmask_key); + for (int button = BTN_MOUSE; button < BTN_JOYSTICK && !has_mouse_button; button++) + has_mouse_button = test_bit(button, bitmask_key); + has_rel_coordinates = test_bit(EV_REL, bitmask_ev) && test_bit(REL_X, bitmask_rel) && test_bit(REL_Y, bitmask_rel); + has_mt_coordinates = test_bit(ABS_MT_POSITION_X, bitmask_abs) && test_bit(ABS_MT_POSITION_Y, bitmask_abs); + + /* unset has_mt_coordinates if devices claims to have all abs axis */ + if (has_mt_coordinates && test_bit(ABS_MT_SLOT, bitmask_abs) && test_bit(ABS_MT_SLOT - 1, bitmask_abs)) + has_mt_coordinates = false; + is_direct = test_bit(INPUT_PROP_DIRECT, bitmask_props); + has_touch = test_bit(BTN_TOUCH, bitmask_key); + has_pad_buttons = test_bit(BTN_0, bitmask_key) && test_bit(BTN_1, bitmask_key) && !has_pen; + has_wheel = test_bit(EV_REL, bitmask_ev) && (test_bit(REL_WHEEL, bitmask_rel) || test_bit(REL_HWHEEL, bitmask_rel)); + + /* joysticks don't necessarily have buttons; e. g. + * rudders/pedals are joystick-like, but buttonless; they have + * other fancy axes. Others have buttons only but no axes. 
+ * + * The BTN_JOYSTICK range starts after the mouse range, so a mouse + * with more than 16 buttons runs into the joystick range (e.g. Mad + * Catz Mad Catz M.M.O.TE). Skip those. + */ + if (!test_bit(BTN_JOYSTICK - 1, bitmask_key)) { + for (int button = BTN_JOYSTICK; button < BTN_DIGI; button++) + if (test_bit(button, bitmask_key)) + num_joystick_buttons++; + for (int button = BTN_TRIGGER_HAPPY1; button <= BTN_TRIGGER_HAPPY40; button++) + if (test_bit(button, bitmask_key)) + num_joystick_buttons++; + for (int button = BTN_DPAD_UP; button <= BTN_DPAD_RIGHT; button++) + if (test_bit(button, bitmask_key)) + num_joystick_buttons++; + } + for (int axis = ABS_RX; axis < ABS_PRESSURE; axis++) + if (test_bit(axis, bitmask_abs)) + num_joystick_axes++; + + if (has_abs_coordinates) { + if (has_stylus || has_pen) + is_tablet = true; + else if (finger_but_no_pen && !is_direct) + is_touchpad = true; + else if (has_mouse_button) + /* This path is taken by VMware's USB mouse, which has + * absolute axes, but no touch/pressure button. 
*/ + is_abs_mouse = true; + else if (has_touch || is_direct) + is_touchscreen = true; + else if (num_joystick_buttons > 0 || num_joystick_axes > 0) + is_joystick = true; + } else if (num_joystick_buttons > 0 || num_joystick_axes > 0) + is_joystick = true; + + if (has_mt_coordinates) { + if (has_stylus || has_pen) + is_tablet = true; + else if (finger_but_no_pen && !is_direct) + is_touchpad = true; + else if (has_touch || is_direct) + is_touchscreen = true; + } + + if (is_tablet && has_pad_buttons) + is_tablet_pad = true; + + if (has_pad_buttons && has_wheel && !has_rel_coordinates) { + is_tablet = true; + is_tablet_pad = true; + } + + if (!is_tablet && !is_touchpad && !is_joystick && + has_mouse_button && + (has_rel_coordinates || + !has_abs_coordinates)) /* mouse buttons and no axis */ + is_mouse = true; + + /* There is no such thing as an i2c mouse */ + if (is_mouse && id->bustype == BUS_I2C) + is_pointing_stick = true; + + /* Joystick un-detection. Some keyboards have random joystick buttons + * set. Avoid those being labeled as ID_INPUT_JOYSTICK with some heuristics. + * The well-known keys represent a (randomly picked) set of key groups. + * A joystick may have one of those but probably not several. And a joystick with less than 2 buttons + * or axes is not a joystick either. + * libinput uses similar heuristics, any changes here should be added to libinput too. 
+ */ + if (is_joystick) { + static const unsigned int well_known_keyboard_keys[] = { + KEY_LEFTCTRL, KEY_CAPSLOCK, KEY_NUMLOCK, KEY_INSERT, + KEY_MUTE, KEY_CALC, KEY_FILE, KEY_MAIL, KEY_PLAYPAUSE, + KEY_BRIGHTNESSDOWN, + }; + size_t num_well_known_keys = 0; + + if (has_keys) + for (size_t i = 0; i < ELEMENTSOF(well_known_keyboard_keys); i++) + if (test_bit(well_known_keyboard_keys[i], bitmask_key)) + num_well_known_keys++; + + if (num_well_known_keys >= 4 || num_joystick_buttons + num_joystick_axes < 2) { + log_device_debug(dev, "Input device has %zu joystick buttons and %zu axes but also %zu keyboard key sets, " + "assuming this is a keyboard, not a joystick.", + num_joystick_buttons, num_joystick_axes, num_well_known_keys); + is_joystick = false; + } + + if (has_wheel && has_pad_buttons) { + log_device_debug(dev, "Input device has %zu joystick buttons as well as tablet pad buttons, " + "assuming this is a tablet pad, not a joystick.", num_joystick_buttons); + + is_joystick = false; + } + } + + if (is_pointing_stick) + udev_builtin_add_property(dev, test, "ID_INPUT_POINTINGSTICK", "1"); + if (is_mouse || is_abs_mouse) + udev_builtin_add_property(dev, test, "ID_INPUT_MOUSE", "1"); + if (is_touchpad) + udev_builtin_add_property(dev, test, "ID_INPUT_TOUCHPAD", "1"); + if (is_touchscreen) + udev_builtin_add_property(dev, test, "ID_INPUT_TOUCHSCREEN", "1"); + if (is_joystick) + udev_builtin_add_property(dev, test, "ID_INPUT_JOYSTICK", "1"); + if (is_tablet) + udev_builtin_add_property(dev, test, "ID_INPUT_TABLET", "1"); + if (is_tablet_pad) + udev_builtin_add_property(dev, test, "ID_INPUT_TABLET_PAD", "1"); + + return is_tablet || is_mouse || is_abs_mouse || is_touchpad || is_touchscreen || is_joystick || is_pointing_stick; +} + +/* key like devices */ +static bool test_key(sd_device *dev, + const unsigned long* bitmask_ev, + const unsigned long* bitmask_key, + bool test) { + + bool found = false; + + /* do we have any KEY_* capability? 
*/ + if (!test_bit(EV_KEY, bitmask_ev)) { + log_device_debug(dev, "test_key: no EV_KEY capability"); + return false; + } + + /* only consider KEY_* here, not BTN_* */ + for (size_t i = 0; i < BTN_MISC/BITS_PER_LONG && !found; i++) { + if (bitmask_key[i]) + found = true; + + log_device_debug(dev, "test_key: checking bit block %zu for any keys; found=%s", + i * BITS_PER_LONG, yes_no(found)); + } + /* If there are no keys in the lower block, check the higher blocks */ + for (size_t block = 0; block < sizeof(high_key_blocks) / sizeof(struct range) && !found; block++) + for (unsigned i = high_key_blocks[block].start; i < high_key_blocks[block].end && !found; i++) + if (test_bit(i, bitmask_key)) { + log_device_debug(dev, "test_key: Found key %x in high block", i); + found = true; + } + + if (found) + udev_builtin_add_property(dev, test, "ID_INPUT_KEY", "1"); + + /* the first 32 bits are ESC, numbers, and Q to D; if we have all of + * those, consider it a full keyboard; do not test KEY_RESERVED, though */ + if (FLAGS_SET(bitmask_key[0], 0xFFFFFFFE)) { + udev_builtin_add_property(dev, test, "ID_INPUT_KEYBOARD", "1"); + return true; + } + + return found; +} + +static int builtin_input_id(UdevEvent *event, int argc, char *argv[], bool test) { + sd_device *pdev, *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + unsigned long bitmask_ev[NBITS(EV_MAX)]; + unsigned long bitmask_abs[NBITS(ABS_MAX)]; + unsigned long bitmask_key[NBITS(KEY_MAX)]; + unsigned long bitmask_rel[NBITS(REL_MAX)]; + unsigned long bitmask_props[NBITS(INPUT_PROP_MAX)]; + const char *sysname; + bool is_pointer; + bool is_key; + + /* walk up the parental chain until we find the real input device; the + * argument is very likely a subdevice of this, like eventN */ + for (pdev = dev; pdev; ) { + const char *s; + + if (sd_device_get_sysattr_value(pdev, "capabilities/ev", &s) >= 0) + break; + + if (sd_device_get_parent_with_subsystem_devtype(pdev, "input", NULL, &pdev) >= 0) + continue; + + pdev = NULL; + break; + } + 
+ if (pdev) { + struct input_id id = get_input_id(pdev); + + /* Use this as a flag that input devices were detected, so that this + * program doesn't need to be called more than once per device */ + udev_builtin_add_property(dev, test, "ID_INPUT", "1"); + get_cap_mask(pdev, "capabilities/ev", bitmask_ev, sizeof(bitmask_ev), test); + get_cap_mask(pdev, "capabilities/abs", bitmask_abs, sizeof(bitmask_abs), test); + get_cap_mask(pdev, "capabilities/rel", bitmask_rel, sizeof(bitmask_rel), test); + get_cap_mask(pdev, "capabilities/key", bitmask_key, sizeof(bitmask_key), test); + get_cap_mask(pdev, "properties", bitmask_props, sizeof(bitmask_props), test); + is_pointer = test_pointers(dev, &id, bitmask_ev, bitmask_abs, + bitmask_key, bitmask_rel, + bitmask_props, test); + is_key = test_key(dev, bitmask_ev, bitmask_key, test); + /* Some evdev nodes have only a scrollwheel */ + if (!is_pointer && !is_key && test_bit(EV_REL, bitmask_ev) && + (test_bit(REL_WHEEL, bitmask_rel) || test_bit(REL_HWHEEL, bitmask_rel))) + udev_builtin_add_property(dev, test, "ID_INPUT_KEY", "1"); + if (test_bit(EV_SW, bitmask_ev)) + udev_builtin_add_property(dev, test, "ID_INPUT_SWITCH", "1"); + + } + + if (sd_device_get_sysname(dev, &sysname) >= 0 && + startswith(sysname, "event")) + extract_info(dev, test); + + return 0; +} + +const UdevBuiltin udev_builtin_input_id = { + .name = "input_id", + .cmd = builtin_input_id, + .help = "Input device properties", +}; diff --git a/src/udev/udev-builtin-keyboard.c b/src/udev/udev-builtin-keyboard.c new file mode 100644 index 0000000..3903bc4 --- /dev/null +++ b/src/udev/udev-builtin-keyboard.c @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "device-util.h" +#include "fd-util.h" +#include "parse-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strxcpyx.h" +#include "udev-builtin.h" + +static const struct key_name *keyboard_lookup_key(const char *str, 
GPERF_LEN_TYPE len); +#include "keyboard-keys-from-name.h" + +static int install_force_release(sd_device *dev, const unsigned *release, unsigned release_count) { + sd_device *atkbd; + const char *cur; + char codes[4096]; + char *s; + size_t l; + unsigned i; + int r; + + assert(dev); + assert(release); + + r = sd_device_get_parent_with_subsystem_devtype(dev, "serio", NULL, &atkbd); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get serio parent: %m"); + + r = sd_device_get_sysattr_value(atkbd, "force_release", &cur); + if (r < 0) + return log_device_error_errno(atkbd, r, "Failed to get force-release attribute: %m"); + + s = codes; + l = sizeof(codes); + + /* copy current content */ + l = strpcpy(&s, l, cur); + + /* append new codes */ + for (i = 0; i < release_count; i++) + l = strpcpyf(&s, l, ",%u", release[i]); + + log_device_debug(atkbd, "keyboard: updating force-release list with '%s'", codes); + r = sd_device_set_sysattr_value(atkbd, "force_release", codes); + if (r < 0) + return log_device_error_errno(atkbd, r, "Failed to set force-release attribute: %m"); + + return 0; +} + +static int map_keycode(sd_device *dev, int fd, int scancode, const char *keycode) { + struct { + unsigned scan; + unsigned key; + } map; + const struct key_name *k; + unsigned keycode_num; + int r; + + /* translate identifier to key code */ + k = keyboard_lookup_key(keycode, strlen(keycode)); + if (k) + keycode_num = k->id; + else { + /* check if it's a numeric code already */ + r = safe_atou(keycode, &keycode_num); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to parse key identifier '%s': %m", keycode); + } + + map.scan = scancode; + map.key = keycode_num; + + log_device_debug(dev, "keyboard: mapping scan code %u (0x%x) to key code %u (0x%x)", + map.scan, map.scan, map.key, map.key); + + if (ioctl(fd, EVIOCSKEYCODE, &map) < 0) + return log_device_error_errno(dev, errno, "Failed to call EVIOCSKEYCODE with scan code 0x%x, and key code %u: %m", map.scan, 
map.key); + + return 0; +} + +static const char* parse_token(const char *current, int32_t *val_out) { + char *next; + int32_t val; + + if (!current) + return NULL; + + val = strtol(current, &next, 0); + if (*next && *next != ':') + return NULL; + + if (next != current) + *val_out = val; + + if (*next) + next++; + + return next; +} + +static int override_abs(sd_device *dev, int fd, unsigned evcode, const char *value) { + struct input_absinfo absinfo; + const char *next; + + if (ioctl(fd, EVIOCGABS(evcode), &absinfo) < 0) + return log_device_error_errno(dev, errno, "Failed to call EVIOCGABS"); + + next = parse_token(value, &absinfo.minimum); + next = parse_token(next, &absinfo.maximum); + next = parse_token(next, &absinfo.resolution); + next = parse_token(next, &absinfo.fuzz); + next = parse_token(next, &absinfo.flat); + if (!next) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "Failed to parse EV_ABS override '%s'", value); + + log_device_debug(dev, "keyboard: %x overridden with %"PRIi32"/%"PRIi32"/%"PRIi32"/%"PRIi32"/%"PRIi32, + evcode, absinfo.minimum, absinfo.maximum, absinfo.resolution, absinfo.fuzz, absinfo.flat); + if (ioctl(fd, EVIOCSABS(evcode), &absinfo) < 0) + return log_device_error_errno(dev, errno, "Failed to call EVIOCSABS"); + + return 0; +} + +static int set_trackpoint_sensitivity(sd_device *dev, const char *value) { + sd_device *pdev; + char val_s[DECIMAL_STR_MAX(int)]; + int r, val_i; + + assert(dev); + assert(value); + + /* The sensitivity sysfs attr belongs to the serio parent device */ + r = sd_device_get_parent_with_subsystem_devtype(dev, "serio", NULL, &pdev); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get serio parent: %m"); + + r = safe_atoi(value, &val_i); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to parse POINTINGSTICK_SENSITIVITY '%s': %m", value); + else if (val_i < 0 || val_i > 255) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(ERANGE), "POINTINGSTICK_SENSITIVITY %d outside 
range [0..255]", val_i);
+
+        xsprintf(val_s, "%d", val_i);
+
+        r = sd_device_set_sysattr_value(pdev, "sensitivity", val_s);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to write 'sensitivity' attribute: %m");
+
+        return 0;
+}
+
+/*
+ * Apply the keyboard/evdev related properties of a device.
+ *
+ * Walks all device properties and handles three kinds of keys:
+ *  - KEYBOARD_KEY_*: remaps a scan code to a key code via map_keycode(); a
+ *    value starting with '!' additionally queues the scan code for the
+ *    force-release list, which is installed once after the loop.
+ *  - EVDEV_ABS_*: overrides the absinfo of an absolute axis via override_abs(),
+ *    but only if the device reports EV_ABS at all.
+ *  - POINTINGSTICK_SENSITIVITY: forwarded to set_trackpoint_sensitivity().
+ *
+ * The device node is opened lazily, on the first property that needs it.
+ * Failures of the individual helpers are ignored; the function only fails if
+ * the device name cannot be obtained, the node cannot be opened, or the
+ * EVIOCGBIT query fails.
+ *
+ * Returns 0 on success, negative errno-style error otherwise.
+ */
+static int builtin_keyboard(UdevEvent *event, int argc, char *argv[], bool test) {
+        sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev);
+        unsigned release[1024];
+        unsigned release_count = 0;
+        _cleanup_close_ int fd = -EBADF;
+        const char *node;
+        int has_abs = -1, r;
+
+        r = sd_device_get_devname(dev, &node);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to get device name: %m");
+
+        FOREACH_DEVICE_PROPERTY(dev, key, value)
+                if (startswith(key, "KEYBOARD_KEY_")) {
+                        const char *keycode = value;
+                        unsigned scancode;
+
+                        /* KEYBOARD_KEY_<hex scan code>=<key identifier string> */
+                        r = safe_atou_full(key + 13, 16, &scancode);
+                        if (r < 0) {
+                                log_device_warning_errno(dev, r, "Failed to parse scan code from \"%s\", ignoring: %m", key);
+                                continue;
+                        }
+
+                        /* a leading '!' needs a force-release entry */
+                        if (keycode[0] == '!') {
+                                keycode++;
+
+                                /* Never write past the table; tell the user when an entry is dropped
+                                 * instead of silently overwriting the last slot. */
+                                if (release_count < ELEMENTSOF(release))
+                                        release[release_count++] = scancode;
+                                else
+                                        log_device_warning(dev, "Too many force-release scan codes, ignoring scan code 0x%x.", scancode);
+
+                                if (keycode[0] == '\0')
+                                        continue;
+                        }
+
+                        if (fd < 0) {
+                                fd = sd_device_open(dev, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+                                if (fd < 0)
+                                        return log_device_error_errno(dev, fd, "Failed to open device '%s': %m", node);
+                        }
+
+                        (void) map_keycode(dev, fd, scancode, keycode);
+                } else if (startswith(key, "EVDEV_ABS_")) {
+                        unsigned evcode;
+
+                        /* EVDEV_ABS_<EV_ABS code>=<min>:<max>:<res>:<fuzz>:<flat> */
+                        r = safe_atou_full(key + 10, 16, &evcode);
+                        if (r < 0) {
+                                log_device_warning_errno(dev, r, "Failed to parse EV_ABS code from \"%s\", ignoring: %m", key);
+                                continue;
+                        }
+
+                        if (fd < 0) {
+                                fd = sd_device_open(dev, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+                                if (fd < 0)
+                                        return log_device_error_errno(dev, fd, "Failed to open device '%s': %m", node);
+                        }
+
+                        /* Query the supported event types only once per device. */
+                        if (has_abs == -1) {
+                                unsigned long bits = 0;
+                                int rc;
+
+                                rc = ioctl(fd, EVIOCGBIT(0, sizeof(bits)), &bits);
+                                if (rc < 0)
+                                        return log_device_error_errno(dev, errno, "Failed to call EVIOCGBIT: %m");
+
+                                has_abs = !!(bits & (1UL << EV_ABS));
+                                if (!has_abs)
+                                        log_device_warning(dev, "EVDEV_ABS override set but no EV_ABS present on device");
+                        }
+
+                        if (!has_abs)
+                                continue;
+
+                        (void) override_abs(dev, fd, evcode, value);
+                } else if (streq(key, "POINTINGSTICK_SENSITIVITY"))
+                        (void) set_trackpoint_sensitivity(dev, value);
+
+        /* install list of force-release codes */
+        if (release_count > 0)
+                (void) install_force_release(dev, release, release_count);
+
+        return 0;
+}
+
+const UdevBuiltin udev_builtin_keyboard = {
+        .name = "keyboard",
+        .cmd = builtin_keyboard,
+        .help = "Keyboard scancode mapping and touchpad/pointingstick characteristics",
+};
diff --git a/src/udev/udev-builtin-kmod.c b/src/udev/udev-builtin-kmod.c
new file mode 100644
index 0000000..3ab5c48
--- /dev/null
+++ b/src/udev/udev-builtin-kmod.c
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ *
load kernel modules + * + * Copyright © 2011 ProFUSION embedded systems + */ + +#include +#include +#include +#include + +#include "device-util.h" +#include "module-util.h" +#include "string-util.h" +#include "strv.h" +#include "udev-builtin.h" + +static struct kmod_ctx *ctx = NULL; + +_printf_(6,0) static void udev_kmod_log(void *data, int priority, const char *file, int line, const char *fn, const char *format, va_list args) { + log_internalv(priority, 0, file, line, fn, format, args); +} + +static int builtin_kmod(UdevEvent *event, int argc, char *argv[], bool test) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + int r; + + if (!ctx) + return 0; + + if (argc < 2 || !streq(argv[1], "load")) + return log_device_warning_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "%s: expected: load [module…]", argv[0]); + + char **modules = strv_skip(argv, 2); + if (strv_isempty(modules)) { + const char *modalias; + + r = sd_device_get_property_value(dev, "MODALIAS", &modalias); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to read property \"MODALIAS\"."); + + (void) module_load_and_warn(ctx, modalias, /* verbose = */ false); + } else + STRV_FOREACH(module, modules) + (void) module_load_and_warn(ctx, *module, /* verbose = */ false); + + return 0; +} + +/* called at udev startup and reload */ +static int builtin_kmod_init(void) { + if (ctx) + return 0; + + ctx = kmod_new(NULL, NULL); + if (!ctx) + return -ENOMEM; + + log_debug("Loading kernel module index."); + kmod_set_log_fn(ctx, udev_kmod_log, NULL); + kmod_load_resources(ctx); + return 0; +} + +/* called on udev shutdown and reload request */ +static void builtin_kmod_exit(void) { + log_debug("Unload kernel module index."); + ctx = kmod_unref(ctx); +} + +/* called every couple of seconds during event activity; 'true' if config has changed */ +static bool builtin_kmod_should_reload(void) { + if (!ctx) + return false; + + if (kmod_validate_resources(ctx) != KMOD_RESOURCES_OK) { + log_debug("Kernel module 
index needs reloading.");
+                return true;
+        }
+
+        return false;
+}
+
+const UdevBuiltin udev_builtin_kmod = {
+        .name = "kmod",
+        .cmd = builtin_kmod,
+        .init = builtin_kmod_init,
+        .exit = builtin_kmod_exit,
+        .should_reload = builtin_kmod_should_reload,
+        .help = "Kernel module loader",
+        .run_once = false,
+};
diff --git a/src/udev/udev-builtin-net_driver.c b/src/udev/udev-builtin-net_driver.c
new file mode 100644
index 0000000..f1642a4
--- /dev/null
+++ b/src/udev/udev-builtin-net_driver.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "device-util.h"
+#include "errno-util.h"
+#include "ethtool-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "string-util.h"
+#include "udev-builtin.h"
+
+/*
+ * Look up the driver name of a network device through ethtool_get_driver()
+ * and export it as the ID_NET_DRIVER property.
+ *
+ * A device that does not support the query, or for which the query returns
+ * -ENODEV, is not an error: a debug message is logged and 0 is returned
+ * without setting the property. Any other failure is logged as a warning and
+ * returned. On success the result of udev_builtin_add_property() is returned.
+ */
+static int builtin_net_driver_set_driver(UdevEvent *event, int argc, char **argv, bool test) {
+        sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev);
+        _cleanup_close_ int ethtool_fd = -EBADF;
+        _cleanup_free_ char *driver = NULL;
+        const char *sysname;
+        int r;
+
+        r = sd_device_get_sysname(dev, &sysname);
+        if (r < 0)
+                return log_device_warning_errno(dev, r, "Failed to get sysname: %m");
+
+        /* The fd is passed by address; it is closed automatically when this function returns. */
+        r = ethtool_get_driver(&ethtool_fd, sysname, &driver);
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
+                log_device_debug_errno(dev, r, "Querying driver name via ethtool API is not supported by device '%s', ignoring: %m", sysname);
+                return 0;
+        }
+        if (r == -ENODEV) {
+                log_device_debug_errno(dev, r, "Device already vanished, ignoring.");
+                return 0;
+        }
+        if (r < 0)
+                return log_device_warning_errno(dev, r, "Failed to get driver for '%s': %m", sysname);
+
+        return udev_builtin_add_property(event->dev, test, "ID_NET_DRIVER", driver);
+}
+
+const UdevBuiltin udev_builtin_net_driver = {
+        .name = "net_driver",
+        .cmd = builtin_net_driver_set_driver,
+        .help = "Set driver for network device",
+        .run_once = true,
+};
diff --git a/src/udev/udev-builtin-net_id.c b/src/udev/udev-builtin-net_id.c
new file mode 100644
index 0000000..91b4008
--- /dev/null +++ b/src/udev/udev-builtin-net_id.c @@ -0,0 +1,1366 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* + * Predictable network interface device names based on: + * - firmware/bios-provided index numbers for on-board devices + * - firmware-provided pci-express hotplug slot index number + * - physical/geographical location of the hardware + * - the interface's MAC address + * + * https://systemd.io/PREDICTABLE_INTERFACE_NAMES + * + * When the code here is changed, man/systemd.net-naming-scheme.xml must be updated too. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "chase.h" +#include "device-private.h" +#include "device-util.h" +#include "dirent-util.h" +#include "ether-addr-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "glyph-util.h" +#include "netif-naming-scheme.h" +#include "parse-util.h" +#include "proc-cmdline.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "strxcpyx.h" +#include "udev-builtin.h" + +#define ONBOARD_14BIT_INDEX_MAX ((1U << 14) - 1) +#define ONBOARD_16BIT_INDEX_MAX ((1U << 16) - 1) + +/* skip intermediate virtio devices */ +static sd_device *device_skip_virtio(sd_device *dev) { + /* there can only ever be one virtio bus per parent device, so we can + * safely ignore any virtio buses. 
see + * http://lists.linuxfoundation.org/pipermail/virtualization/2015-August/030331.html */ + while (dev) { + const char *subsystem; + + if (sd_device_get_subsystem(dev, &subsystem) < 0) + break; + + if (!streq(subsystem, "virtio")) + break; + + if (sd_device_get_parent(dev, &dev) < 0) + return NULL; + } + + return dev; +} + +static int get_matching_parent( + sd_device *dev, + char * const *parent_subsystems, + bool skip_virtio, + sd_device **ret) { + + sd_device *parent; + int r; + + assert(dev); + + r = sd_device_get_parent(dev, &parent); + if (r < 0) + return r; + + if (skip_virtio) { + /* skip virtio subsystem if present */ + parent = device_skip_virtio(parent); + if (!parent) + return -ENODEV; + } + + if (!strv_isempty(parent_subsystems)) { + const char *subsystem; + + /* check if our direct parent is in an expected subsystem. */ + r = sd_device_get_subsystem(parent, &subsystem); + if (r < 0) + return r; + + if (!strv_contains(parent_subsystems, subsystem)) + return -ENODEV; + } + + if (ret) + *ret = parent; + + return 0; +} + +static int get_first_syspath_component(sd_device *dev, const char *prefix, char **ret) { + _cleanup_free_ char *buf = NULL; + const char *syspath, *p, *q; + int r; + + assert(dev); + assert(prefix); + assert(ret); + + r = sd_device_get_syspath(dev, &syspath); + if (r < 0) + return r; + + p = path_startswith(syspath, prefix); + if (!p) + return -EINVAL; + + r = path_find_first_component(&p, /* accept_dot_dot = */ false, &q); + if (r < 0) + return r; + + buf = strndup(q, r); + if (!buf) + return -ENOMEM; + + *ret = TAKE_PTR(buf); + return r; /* return the length of the string */ +} + +static int get_virtfn_info(sd_device *pcidev, sd_device **ret_physfn_pcidev, char **ret_suffix) { + _cleanup_(sd_device_unrefp) sd_device *physfn_pcidev = NULL; + const char *syspath, *name; + int r; + + assert(pcidev); + assert(ret_physfn_pcidev); + assert(ret_suffix); + + r = sd_device_get_syspath(pcidev, &syspath); + if (r < 0) + return r; + + /* Get 
physical function's pci device. */ + r = sd_device_new_child(&physfn_pcidev, pcidev, "physfn"); + if (r < 0) + return r; + + /* Find the virtual function number by finding the right virtfn link. */ + FOREACH_DEVICE_CHILD_WITH_SUFFIX(physfn_pcidev, child, name) { + const char *n, *s; + + /* Only accepts e.g. virtfn0, virtfn1, and so on. */ + n = startswith(name, "virtfn"); + if (isempty(n) || !in_charset(n, DIGITS)) + continue; + + if (sd_device_get_syspath(child, &s) < 0) + continue; + + if (streq(s, syspath)) { + char *suffix; + + suffix = strjoin("v", n); + if (!suffix) + return -ENOMEM; + + *ret_physfn_pcidev = sd_device_ref(physfn_pcidev); + *ret_suffix = suffix; + return 0; + } + } + + return -ENOENT; +} + +static int get_dev_port(sd_device *dev, bool fallback_to_dev_id, unsigned *ret) { + unsigned v; + int r; + + assert(dev); + assert(ret); + + /* Get kernel provided port index for the case when multiple ports on a single PCI function. */ + + r = device_get_sysattr_unsigned(dev, "dev_port", &v); + if (r < 0) + return r; + if (r > 0) { + /* Found a positive index. Let's use it. */ + *ret = v; + return 1; /* positive */ + } + assert(v == 0); + + /* With older kernels IP-over-InfiniBand network interfaces sometimes erroneously provide the port + * number in the 'dev_id' sysfs attribute instead of 'dev_port', which thus stays initialized as 0. */ + + if (fallback_to_dev_id) { + unsigned iftype; + + r = device_get_sysattr_unsigned(dev, "type", &iftype); + if (r < 0) + return r; + + fallback_to_dev_id = (iftype == ARPHRD_INFINIBAND); + } + + if (fallback_to_dev_id) + return device_get_sysattr_unsigned(dev, "dev_id", ret); + + /* Otherwise, return the original index 0. 
*/ + *ret = 0; + return 0; /* zero */ +} + +static int get_port_specifier(sd_device *dev, bool fallback_to_dev_id, char **ret) { + const char *phys_port_name; + unsigned dev_port; + char *buf; + int r; + + assert(dev); + assert(ret); + + /* First, try to use the kernel provided front panel port name for multiple port PCI device. */ + r = sd_device_get_sysattr_value(dev, "phys_port_name", &phys_port_name); + if (r >= 0 && !isempty(phys_port_name)) { + if (naming_scheme_has(NAMING_SR_IOV_R)) { + int vf_id = -1; + + /* Check if phys_port_name indicates virtual device representor. */ + (void) sscanf(phys_port_name, "pf%*uvf%d", &vf_id); + + if (vf_id >= 0) { + /* For VF representor append 'r'. */ + if (asprintf(&buf, "r%d", vf_id) < 0) + return log_oom_debug(); + + *ret = buf; + return 1; + } + } + + /* Otherwise, use phys_port_name as is. */ + buf = strjoin("n", phys_port_name); + if (!buf) + return log_oom_debug(); + + *ret = buf; + return 1; + } + + /* Then, try to use the kernel provided port index for the case when multiple ports on a single PCI + * function. */ + r = get_dev_port(dev, fallback_to_dev_id, &dev_port); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device port index: %m"); + if (r > 0) { + assert(dev_port > 0); + if (asprintf(&buf, "d%u", dev_port) < 0) + return log_oom_debug(); + + *ret = buf; + return 1; + } + + *ret = NULL; + return 0; +} + +static bool is_valid_onboard_index(unsigned idx) { + /* Some BIOSes report rubbish indexes that are excessively high (2^24-1 is an index VMware likes to + * report for example). Let's define a cut-off where we don't consider the index reliable anymore. We + * pick some arbitrary cut-off, which is somewhere beyond the realistic number of physical network + * interface a system might have. Ideally the kernel would already filter this crap for us, but it + * doesn't currently. 
The initial cut-off value (2^14-1) was too conservative for s390 PCI which + * allows for index values up 2^16-1 which is now enabled with the NAMING_16BIT_INDEX naming flag. */ + return idx <= (naming_scheme_has(NAMING_16BIT_INDEX) ? ONBOARD_16BIT_INDEX_MAX : ONBOARD_14BIT_INDEX_MAX); +} + +static int pci_get_onboard_index(sd_device *dev, unsigned *ret) { + unsigned idx; + int r; + + assert(dev); + assert(ret); + + /* ACPI _DSM — device specific method for naming a PCI or PCI Express device */ + r = device_get_sysattr_unsigned(dev, "acpi_index", &idx); + if (r < 0) + /* SMBIOS type 41 — Onboard Devices Extended Information */ + r = device_get_sysattr_unsigned(dev, "index", &idx); + if (r < 0) + return log_device_debug_errno(dev, r, "Could not obtain onboard index: %m"); + + if (idx == 0 && !naming_scheme_has(NAMING_ZERO_ACPI_INDEX)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "Naming scheme does not allow onboard index==0."); + if (!is_valid_onboard_index(idx)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ENOENT), + "Not a valid onboard index: %u", idx); + + *ret = idx; + return 0; +} + +static int names_pci_onboard(sd_device *dev, sd_device *pci_dev, const char *prefix, const char *suffix, bool test) { + _cleanup_free_ char *port = NULL; + unsigned idx = 0; /* avoid false maybe-uninitialized warning */ + int r; + + assert(dev); + assert(pci_dev); + assert(prefix); + + /* retrieve on-board index number from firmware */ + r = pci_get_onboard_index(pci_dev, &idx); + if (r < 0) + return r; + + r = get_port_specifier(dev, /* fallback_to_dev_id = */ false, &port); + if (r < 0) + return r; + + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%so%u%s%s", prefix, idx, strempty(port), strempty(suffix))) + udev_builtin_add_property(dev, test, "ID_NET_NAME_ONBOARD", str); + + log_device_debug(dev, "Onboard index identifier: index=%u port=%s %s %s", + idx, strna(port), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), empty_to_na(str)); + + 
return 0; +} + +static int names_pci_onboard_label(sd_device *dev, sd_device *pci_dev, const char *prefix, bool test) { + const char *label; + int r; + + assert(dev); + assert(prefix); + + /* retrieve on-board label from firmware */ + r = sd_device_get_sysattr_value(pci_dev, "label", &label); + if (r < 0) + return log_device_debug_errno(pci_dev, r, "Failed to get PCI onboard label: %m"); + + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%s%s", + naming_scheme_has(NAMING_LABEL_NOPREFIX) ? "" : prefix, + label)) + udev_builtin_add_property(dev, test, "ID_NET_LABEL_ONBOARD", str); + + log_device_debug(dev, "Onboard label from PCI device: %s", label); + return 0; +} + +/* read the 256 bytes PCI configuration space to check the multi-function bit */ +static int is_pci_multifunction(sd_device *dev) { + _cleanup_free_ uint8_t *config = NULL; + const char *filename, *syspath; + size_t len; + int r; + + assert(dev); + + r = sd_device_get_syspath(dev, &syspath); + if (r < 0) + return r; + + filename = strjoina(syspath, "/config"); + r = read_virtual_file(filename, PCI_HEADER_TYPE + 1, (char **) &config, &len); + if (r < 0) + return r; + if (len < PCI_HEADER_TYPE + 1) + return -EINVAL; + +#ifndef PCI_HEADER_TYPE_MULTIFUNC +#define PCI_HEADER_TYPE_MULTIFUNC 0x80 +#endif + + /* bit 0-6 header type, bit 7 multi/single function device */ + return config[PCI_HEADER_TYPE] & PCI_HEADER_TYPE_MULTIFUNC; +} + +static bool is_pci_ari_enabled(sd_device *dev) { + assert(dev); + + return device_get_sysattr_bool(dev, "ari_enabled") > 0; +} + +static bool is_pci_bridge(sd_device *dev) { + const char *v, *p; + + assert(dev); + + if (sd_device_get_sysattr_value(dev, "modalias", &v) < 0) + return false; + + if (!startswith(v, "pci:")) + return false; + + p = strrchr(v, 's'); + if (!p) + return false; + if (p[1] != 'c') + return false; + + /* PCI device subclass 04 corresponds to PCI bridge */ + bool b = strneq(p + 2, "04", 2); + if (b) + log_device_debug(dev, "Device is a PCI 
bridge."); + return b; +} + +static int parse_hotplug_slot_from_function_id(sd_device *dev, int slots_dirfd, uint32_t *ret) { + uint64_t function_id; + char filename[NAME_MAX+1]; + const char *attr; + int r; + + /* The /function_id attribute is unique to the s390 PCI driver. If present, we know that the + * slot's directory name for this device is /sys/bus/pci/slots/XXXXXXXX/ where XXXXXXXX is the fixed + * length 8 hexadecimal character string representation of function_id. Therefore we can short cut + * here and just check for the existence of the slot directory. As this directory has to exist, we're + * emitting a debug message for the unlikely case it's not found. Note that the domain part doesn't + * belong to the slot name here because there's a 1-to-1 relationship between PCI function and its + * hotplug slot. See https://docs.kernel.org/s390/pci.html for more details. */ + + assert(dev); + assert(slots_dirfd >= 0); + assert(ret); + + if (!naming_scheme_has(NAMING_SLOT_FUNCTION_ID)) { + *ret = 0; + return 0; + } + + if (sd_device_get_sysattr_value(dev, "function_id", &attr) < 0) { + *ret = 0; + return 0; + } + + r = safe_atou64(attr, &function_id); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse function_id, ignoring: %s", attr); + + if (function_id <= 0 || function_id > UINT32_MAX) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "Invalid function id (0x%"PRIx64"), ignoring.", + function_id); + + if (!snprintf_ok(filename, sizeof(filename), "%08"PRIx64, function_id)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ENAMETOOLONG), + "PCI slot path is too long, ignoring."); + + if (faccessat(slots_dirfd, filename, F_OK, 0) < 0) + return log_device_debug_errno(dev, errno, "Cannot access %s under pci slots, ignoring: %m", filename); + + *ret = (uint32_t) function_id; + return 1; /* Found. We should ignore domain part. 
*/ +} + +static int pci_get_hotplug_slot_from_address( + sd_device *dev, + sd_device *pci, + DIR *dir, + uint32_t *ret) { + + const char *sysname; + int r; + + assert(dev); + assert(pci); + assert(dir); + assert(ret); + + r = sd_device_get_sysname(dev, &sysname); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get sysname: %m"); + + rewinddir(dir); + FOREACH_DIRENT_ALL(de, dir, break) { + _cleanup_free_ char *path = NULL; + const char *address; + uint32_t slot; + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (de->d_type != DT_DIR) + continue; + + r = safe_atou32(de->d_name, &slot); + if (r < 0 || slot <= 0) + continue; + + path = path_join("slots", de->d_name, "address"); + if (!path) + return log_oom_debug(); + + if (sd_device_get_sysattr_value(pci, path, &address) < 0) + continue; + + /* match slot address with device by stripping the function */ + if (!startswith(sysname, address)) + continue; + + *ret = slot; + return 1; /* found */ + } + + *ret = 0; + return 0; /* not found */ +} + +static int pci_get_hotplug_slot(sd_device *dev, uint32_t *ret) { + _cleanup_(sd_device_unrefp) sd_device *pci = NULL; + _cleanup_closedir_ DIR *dir = NULL; + int r; + + assert(dev); + assert(ret); + + /* ACPI _SUN — slot user number */ + r = sd_device_new_from_subsystem_sysname(&pci, "subsystem", "pci"); + if (r < 0) + return log_debug_errno(r, "Failed to create sd_device object for pci subsystem: %m"); + + r = device_opendir(pci, "slots", &dir); + if (r < 0) + return log_device_debug_errno(dev, r, "Cannot open 'slots' subdirectory: %m"); + + for (sd_device *slot_dev = dev; slot_dev; ) { + uint32_t slot = 0; /* avoid false maybe-uninitialized warning */ + + r = parse_hotplug_slot_from_function_id(slot_dev, dirfd(dir), &slot); + if (r < 0) + return r; + if (r > 0) { + *ret = slot; + return 1; /* domain should be ignored. 
*/ + } + + r = pci_get_hotplug_slot_from_address(slot_dev, pci, dir, &slot); + if (r < 0) + return r; + if (r > 0) { + /* We found the match between PCI device and slot. However, we won't use the slot + * index if the device is a PCI bridge, because it can have other child devices that + * will try to claim the same index and that would create name collision. */ + if (naming_scheme_has(NAMING_BRIDGE_NO_SLOT) && is_pci_bridge(slot_dev)) { + if (naming_scheme_has(NAMING_BRIDGE_MULTIFUNCTION_SLOT) && is_pci_multifunction(dev) <= 0) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ESTALE), + "Not using slot information because the PCI device associated with " + "the hotplug slot is a bridge and the PCI device has a single function."); + + if (!naming_scheme_has(NAMING_BRIDGE_MULTIFUNCTION_SLOT)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ESTALE), + "Not using slot information because the PCI device is a bridge."); + } + + *ret = slot; + return 0; /* domain can be still used. */ + } + + if (sd_device_get_parent_with_subsystem_devtype(slot_dev, "pci", NULL, &slot_dev) < 0) + break; + } + + return -ENOENT; +}

/* Splits the sysname of a PCI device ("domain:bus:slot.func") into the name components used for
 * path-based interface names.
 *
 * On success returns 0 and hands newly allocated strings to the caller:
 *   *ret_domain        "P<domain>", or NULL when the domain is 0
 *   *ret_bus_and_slot  "p<bus>s<slot>"
 *   *ret_func          "f<func>", or NULL when the function is 0 and the device is not multi-function
 * Returns a negative errno if the sysname cannot be obtained or parsed, or on allocation failure. */
static int get_pci_slot_specifiers(
                sd_device *dev,
                char **ret_domain,
                char **ret_bus_and_slot,
                char **ret_func) {

        _cleanup_free_ char *dom_str = NULL, *bus_slot_str = NULL, *fn_str = NULL;
        unsigned domain, bus, slot, func;
        const char *sysname;
        int r;

        assert(dev);
        assert(ret_domain);
        assert(ret_bus_and_slot);
        assert(ret_func);

        r = sd_device_get_sysname(dev, &sysname);
        if (r < 0)
                return log_device_debug_errno(dev, r, "Failed to get sysname: %m");

        if (sscanf(sysname, "%x:%x:%x.%u", &domain, &bus, &slot, &func) != 4)
                return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL),
                                              "Failed to parse slot information from PCI device sysname.");

        /* ARI devices support up to 256 functions on a single device ("slot"), and interpret the
         * traditional 5-bit slot and 3-bit function number as a single 8-bit function number,
         * where the slot makes up the upper 5 bits. */
        if (naming_scheme_has(NAMING_NPAR_ARI) && is_pci_ari_enabled(dev))
                func += slot * 8;

        /* The domain is only spelled out when it is non-zero. */
        if (domain > 0) {
                if (asprintf(&dom_str, "P%u", domain) < 0)
                        return log_oom_debug();
        }

        if (asprintf(&bus_slot_str, "p%us%u", bus, slot) < 0)
                return log_oom_debug();

        /* The function is spelled out when it is non-zero or the device has several functions. */
        if (func > 0 || is_pci_multifunction(dev) > 0) {
                if (asprintf(&fn_str, "f%u", func) < 0)
                        return log_oom_debug();
        }

        *ret_domain = TAKE_PTR(dom_str);
        *ret_bus_and_slot = TAKE_PTR(bus_slot_str);
        *ret_func = TAKE_PTR(fn_str);
        return 0;
}

+static int names_pci_slot(sd_device *dev, sd_device *pci_dev, const char *prefix, const char *suffix, bool test) { + _cleanup_free_ char *domain = NULL, *bus_and_slot = NULL, *func = NULL, *port = NULL; + uint32_t hotplug_slot = 0; /* avoid false maybe-uninitialized warning */ + char str[ALTIFNAMSIZ]; + int r; + + assert(dev); + assert(pci_dev); + assert(prefix); + + r = get_pci_slot_specifiers(pci_dev, &domain, &bus_and_slot, &func); + if (r < 0) + return r; + + r = get_port_specifier(dev, /* fallback_to_dev_id = */ true, &port); + if (r < 0) + return r; + + /* compose a name based on the raw kernel's PCI bus, slot numbers */ + if (snprintf_ok(str, sizeof str, "%s%s%s%s%s%s", + prefix, strempty(domain), bus_and_slot, strempty(func), strempty(port), strempty(suffix))) + udev_builtin_add_property(dev, test, "ID_NET_NAME_PATH", str); + + log_device_debug(dev, "PCI path identifier: domain=%s bus_and_slot=%s func=%s port=%s %s %s", + strna(domain), bus_and_slot, strna(func), strna(port), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), empty_to_na(str)); + + r = pci_get_hotplug_slot(pci_dev, &hotplug_slot); + if (r < 0) + return r; + if (r > 0) + /* If the hotplug slot is found through the function ID, then drop the domain from the name. + * See comments in parse_hotplug_slot_from_function_id(). 
*/ + domain = mfree(domain); + + if (snprintf_ok(str, sizeof str, "%s%ss%"PRIu32"%s%s%s", + prefix, strempty(domain), hotplug_slot, strempty(func), strempty(port), strempty(suffix))) + udev_builtin_add_property(dev, test, "ID_NET_NAME_SLOT", str); + + log_device_debug(dev, "Slot identifier: domain=%s slot=%"PRIu32" func=%s port=%s %s %s", + strna(domain), hotplug_slot, strna(func), strna(port), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), empty_to_na(str)); + + return 0; +} + +static int names_vio(sd_device *dev, const char *prefix, bool test) { + _cleanup_free_ char *s = NULL; + unsigned slotid; + int r; + + assert(dev); + assert(prefix); + + /* get ibmveth/ibmvnic slot-based names. */ + + /* check if our direct parent is a VIO device with no other bus in-between */ + if (get_matching_parent(dev, STRV_MAKE("vio"), /* skip_virtio = */ false, NULL) < 0) + return 0; + + log_device_debug(dev, "Parent device is in the vio subsystem."); + + /* The devices' $DEVPATH number is tied to (virtual) hardware (slot id + * selected in the HMC), thus this provides a reliable naming (e.g. + * "/devices/vio/30000002/net/eth1"); we ignore the bus number, as + * there should only ever be one bus, and then remove leading zeros. */ + r = get_first_syspath_component(dev, "/sys/devices/vio/", &s); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get VIO bus ID and slot ID: %m"); + + if (r != 8) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "VIO bus ID and slot ID have invalid length: %s", s); + + if (!in_charset(s, HEXDIGITS)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "VIO bus ID and slot ID contain invalid characters: %s", s); + + /* Parse only slot ID (the last 4 hexdigits). 
*/ + r = safe_atou_full(s + 4, 16, &slotid); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse VIO slot from '%s': %m", s); + + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%sv%u", prefix, slotid)) + udev_builtin_add_property(dev, test, "ID_NET_NAME_SLOT", str); + log_device_debug(dev, "Vio slot identifier: slotid=%u %s %s", + slotid, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), str + strlen(prefix)); + return 0; +} + +static int names_platform(sd_device *dev, const char *prefix, bool test) { + _cleanup_free_ char *p = NULL; + const char *validchars; + char *vendor, *model_str, *instance_str; + unsigned model, instance; + int r; + + assert(dev); + assert(prefix); + + /* get ACPI path names for ARM64 platform devices */ + + /* check if our direct parent is a platform device with no other bus in-between */ + if (get_matching_parent(dev, STRV_MAKE("platform"), /* skip_virtio = */ false, NULL) < 0) + return 0; + + log_device_debug(dev, "Parent device is in the platform subsystem."); + + r = get_first_syspath_component(dev, "/sys/devices/platform/", &p); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get platform ID: %m"); + + /* Platform devices are named after ACPI table match, and instance id + * eg. "/sys/devices/platform/HISI00C2:00" + * The Vendor (3 or 4 char), followed by hexadecimal model number : instance id. 
*/ + if (r == 10 && p[7] == ':') { + /* 3 char vendor string */ + vendor = strndupa(p, 3); + model_str = strndupa(p + 3, 4); + instance_str = strndupa(p + 8, 2); + validchars = UPPERCASE_LETTERS; + } else if (r == 11 && p[8] == ':') { + /* 4 char vendor string */ + vendor = strndupa(p, 4); + model_str = strndupa(p + 4, 4); + instance_str = strndupa(p + 9, 2); + validchars = UPPERCASE_LETTERS DIGITS; + } else + return -EOPNOTSUPP; + + if (!in_charset(vendor, validchars)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ENOENT), + "Platform vendor contains invalid characters: %s", vendor); + + ascii_strlower(vendor); + + r = safe_atou_full(model_str, 16, &model); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse model number \"%s\": %m", model_str); + + r = safe_atou_full(instance_str, 16, &instance); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse instance id \"%s\": %m", instance_str); + + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%sa%s%xi%u", prefix, vendor, model, instance)) + udev_builtin_add_property(dev, test, "ID_NET_NAME_PATH", str); + log_device_debug(dev, "Platform identifier: vendor=%s model=%x instance=%u %s %s", + vendor, model, instance, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), str + strlen(prefix)); + return 0; +}

/* Publishes ID_NET_NAME_ONBOARD as <prefix>d<index>, where the index comes from the devicetree
 * "ethernetN" alias that points at the of_node of the interface's parent device.
 *
 * Returns 0 without doing anything when NAMING_DEVICETREE_ALIASES is not enabled, and 0 after a matching
 * alias was processed. Returns -EOPNOTSUPP for any prefix other than "en", -ENOENT when no alias matches
 * (or the of_node is the devicetree root itself), -EEXIST when both "ethernet" and "ethernet0" exist, and
 * other negative errnos when one of the involved devices or attributes cannot be accessed. */
static int names_devicetree(sd_device *dev, const char *prefix, bool test) {
        _cleanup_(sd_device_unrefp) sd_device *aliases_dev = NULL, *ofnode_dev = NULL, *devicetree_dev = NULL;
        const char *ofnode_path, *ofnode_syspath, *devicetree_syspath;
        sd_device *parent;
        int r;

        assert(dev);
        assert(prefix);

        if (!naming_scheme_has(NAMING_DEVICETREE_ALIASES))
                return 0;

        /* only ethernet supported for now */
        if (!streq(prefix, "en"))
                return -EOPNOTSUPP;

        /* check if our direct parent has an of_node */
        r = sd_device_get_parent(dev, &parent);
        if (r < 0)
                return log_device_debug_errno(dev, r, "Failed to get parent device: %m");

        r = sd_device_new_child(&ofnode_dev, parent, "of_node");
        if (r < 0)
                return log_device_debug_errno(parent, r, "Failed to get 'of_node' child device: %m");

        r = sd_device_get_syspath(ofnode_dev, &ofnode_syspath);
        if (r < 0)
                return log_device_debug_errno(ofnode_dev, r, "Failed to get syspath: %m");

        /* /proc/device-tree should be a symlink to /sys/firmware/devicetree/base. */
        r = sd_device_new_from_path(&devicetree_dev, "/proc/device-tree");
        if (r < 0)
                return log_debug_errno(r, "Failed to create sd-device object from '/proc/device-tree': %m");

        r = sd_device_get_syspath(devicetree_dev, &devicetree_syspath);
        if (r < 0)
                return log_device_debug_errno(devicetree_dev, r, "Failed to get syspath: %m");

        /*
         * Example paths:
         * devicetree_syspath = /sys/firmware/devicetree/base
         * ofnode_syspath = /sys/firmware/devicetree/base/soc/ethernet@deadbeef
         * ofnode_path = soc/ethernet@deadbeef
         */
        ofnode_path = path_startswith(ofnode_syspath, devicetree_syspath);
        if (!ofnode_path)
                return log_device_debug_errno(ofnode_dev, SYNTHETIC_ERRNO(EINVAL),
                                              "The device '%s' is not a child device of '%s': %m",
                                              ofnode_syspath, devicetree_syspath);

        /* If both paths are identical, path_startswith() hands back an empty remainder. Stepping one
         * character back (below) would then not land on a '/' and the assertion would abort the worker.
         * There is nothing below the root to compare against the aliases, so treat it as "no match". */
        if (isempty(ofnode_path))
                return log_device_debug_errno(ofnode_dev, SYNTHETIC_ERRNO(ENOENT),
                                              "The device '%s' is the devicetree root, ignoring.",
                                              ofnode_syspath);

        /* Get back our leading / to match the contents of the aliases */
        ofnode_path--;
        assert(path_is_absolute(ofnode_path));

        r = sd_device_new_child(&aliases_dev, devicetree_dev, "aliases");
        if (r < 0)
                return log_device_debug_errno(devicetree_dev, r,
                                              "Failed to get 'aliases' child device: %m");

        FOREACH_DEVICE_SYSATTR(aliases_dev, alias) {
                const char *alias_path, *alias_index, *conflict;
                unsigned i;

                /* Only "ethernet" and "ethernet<N>" aliases are of interest. */
                alias_index = startswith(alias, "ethernet");
                if (!alias_index)
                        continue;

                if (sd_device_get_sysattr_value(aliases_dev, alias, &alias_path) < 0)
                        continue;

                if (!path_equal(ofnode_path, alias_path))
                        continue;

                /* If there's no index, we default to 0... */
                if (isempty(alias_index)) {
                        i = 0;
                        conflict = "ethernet0";
                } else {
                        r = safe_atou(alias_index, &i);
                        if (r < 0)
                                return log_device_debug_errno(dev, r,
                                                              "Could not get index of alias %s: %m", alias);
                        conflict = "ethernet";
                }

                /* ...but make sure we don't have an alias conflict */
                if (i == 0 && sd_device_get_sysattr_value(aliases_dev, conflict, NULL) >= 0)
                        return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EEXIST),
                                                      "Ethernet alias conflict: ethernet and ethernet0 both exist");

                char str[ALTIFNAMSIZ];
                if (snprintf_ok(str, sizeof str, "%sd%u", prefix, i))
                        udev_builtin_add_property(dev, test, "ID_NET_NAME_ONBOARD", str);
                log_device_debug(dev, "devicetree identifier: alias_index=%u %s \"%s\"",
                                 i, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), str + strlen(prefix));
                return 0;
        }

        return -ENOENT;
}

+static int names_pci(sd_device *dev, const char *prefix, bool test) { + _cleanup_(sd_device_unrefp) sd_device *physfn_pcidev = NULL; + _cleanup_free_ char *virtfn_suffix = NULL; + sd_device *parent; + + assert(dev); + assert(prefix); + + /* check if our direct parent is a PCI device with no other bus in-between */ + if (get_matching_parent(dev, STRV_MAKE("pci"), /* skip_virtio = */ true, &parent) < 0) + return 0; + + /* If this is an SR-IOV virtual device, get base name using physical device and add virtfn suffix. 
*/ + if (naming_scheme_has(NAMING_SR_IOV_V) && + get_virtfn_info(parent, &physfn_pcidev, &virtfn_suffix) >= 0) + parent = physfn_pcidev; + else + (void) names_pci_onboard_label(dev, parent, prefix, test); + + (void) names_pci_onboard(dev, parent, prefix, virtfn_suffix, test); + (void) names_pci_slot(dev, parent, prefix, virtfn_suffix, test); + return 0; +} + +static int get_usb_specifier(sd_device *dev, char **ret) { + char *ports, *config, *interf, *s, *buf; + const char *sysname; + int r; + + assert(dev); + assert(ret); + + r = sd_device_get_sysname(dev, &sysname); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get sysname: %m"); + + /* get USB port number chain, configuration, interface */ + s = strchr(sysname, '-'); + if (!s) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "sysname \"%s\" does not have '-' in the expected place.", sysname); + + ports = strdupa_safe(s + 1); + s = strchr(ports, ':'); + if (!s) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "sysname \"%s\" does not have ':' in the expected place.", sysname); + + *s = '\0'; + config = s + 1; + s = strchr(config, '.'); + if (!s) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "sysname \"%s\" does not have '.' in the expected place.", sysname); + + *s = '\0'; + interf = s + 1; + + /* prefix every port number in the chain with "u" */ + string_replace_char(ports, '.', 'u'); + + /* suppress the common config == 1 */ + if (streq(config, "1")) + config = NULL; + + /* suppress the interface == 0 */ + if (streq(interf, "0")) + interf = NULL; + + buf = strjoin("u", ports, + config ? "c" : "", strempty(config), + interf ? 
"i" : "", strempty(interf)); + if (!buf) + return log_oom_debug(); + + log_device_debug(dev, "USB name identifier: ports=%s config=%s interface=%s %s %s", + ports, strna(config), strna(interf), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), buf); + + *ret = buf; + return 0; +} + +static int names_usb(sd_device *dev, const char *prefix, bool test) { + _cleanup_free_ char *suffix = NULL; + sd_device *usbdev, *pcidev; + int r; + + assert(dev); + assert(prefix); + + /* USB device */ + + r = sd_device_get_parent_with_subsystem_devtype(dev, "usb", "usb_interface", &usbdev); + if (r < 0) + return log_device_debug_errno(dev, r, "Could not find usb parent device: %m"); + + r = get_usb_specifier(usbdev, &suffix); + if (r < 0) + return r; + + /* If the USB bus is on PCI bus, then suffix the USB specifier to the name based on the PCI bus. */ + r = sd_device_get_parent_with_subsystem_devtype(usbdev, "pci", NULL, &pcidev); + if (r >= 0) + return names_pci_slot(dev, pcidev, prefix, suffix, test); + + if (r != -ENOENT || !naming_scheme_has(NAMING_USB_HOST)) + return log_device_debug_errno(usbdev, r, "Failed to get parent PCI bus: %m"); + + /* Otherwise, e.g. on-chip asics that have USB ports, use the USB specifier as is. 
*/ + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%s%s", prefix, suffix)) + udev_builtin_add_property(dev, test, "ID_NET_NAME_PATH", str); + + return 0; +} + +static int get_bcma_specifier(sd_device *dev, char **ret) { + const char *sysname; + char *buf = NULL; + unsigned core; + int r; + + assert(dev); + assert(ret); + + r = sd_device_get_sysname(dev, &sysname); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get sysname: %m"); + + /* bus num:core num */ + r = sscanf(sysname, "bcma%*u:%u", &core); + if (r != 1) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "Failed to parse bcma device information."); + + /* suppress the common core == 0 */ + if (core > 0 && asprintf(&buf, "b%u", core) < 0) + return log_oom_debug(); + + log_device_debug(dev, "BCMA core identifier: core=%u %s \"%s\"", + core, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), strna(buf)); + + *ret = buf; + return 0; +} + +static int names_bcma(sd_device *dev, const char *prefix, bool test) { + _cleanup_free_ char *suffix = NULL; + sd_device *bcmadev, *pcidev; + int r; + + assert(dev); + assert(prefix); + + r = sd_device_get_parent_with_subsystem_devtype(dev, "bcma", NULL, &bcmadev); + if (r < 0) + return log_device_debug_errno(dev, r, "Could not get bcma parent device: %m"); + + r = sd_device_get_parent_with_subsystem_devtype(bcmadev, "pci", NULL, &pcidev); + if (r < 0) + return log_device_debug_errno(dev, r, "Could not get pci parent device: %m"); + + r = get_bcma_specifier(bcmadev, &suffix); + if (r < 0) + return r; + + return names_pci_slot(dev, pcidev, prefix, suffix, test); +} + +static int names_ccw(sd_device *dev, const char *prefix, bool test) { + sd_device *cdev; + const char *bus_id; + size_t bus_id_start, bus_id_len; + int r; + + assert(dev); + assert(prefix); + + /* get path names for Linux on System z network devices */ + + if (get_matching_parent(dev, STRV_MAKE("ccwgroup", "ccw"), /* skip_virtio = */ true, &cdev) < 0) + return 0; + + 
log_device_debug(dev, "Device is CCW."); + + /* Retrieve bus-ID of the CCW device. The bus-ID uniquely + * identifies the network device on the Linux on System z channel + * subsystem. Note that the bus-ID contains lowercase characters. + */ + r = sd_device_get_sysname(cdev, &bus_id); + if (r < 0) + return log_device_debug_errno(cdev, r, "Failed to get sysname: %m"); + + /* Check the length of the bus-ID. Rely on the fact that the kernel provides a correct bus-ID; + * alternatively, improve this check and parse and verify each bus-ID part... + */ + bus_id_len = strlen(bus_id); + if (!IN_SET(bus_id_len, 8, 9)) + return log_device_debug_errno(cdev, SYNTHETIC_ERRNO(EINVAL), "Invalid bus_id: %s", bus_id); + + /* Strip leading zeros from the bus id for aesthetic purposes. This + * keeps the ccw names stable, yet much shorter in general case of + * bus_id 0.0.0600 -> 600. This is similar to e.g. how PCI domain is + * not prepended when it is zero. Preserve the last 0 for 0.0.0000. + */ + bus_id_start = strspn(bus_id, ".0"); + bus_id += bus_id_start < bus_id_len ? 
bus_id_start : bus_id_len - 1; + + /* Use the CCW bus-ID as network device name */ + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%sc%s", prefix, bus_id)) + udev_builtin_add_property(dev, test, "ID_NET_NAME_PATH", str); + log_device_debug(dev, "CCW identifier: ccw_busid=%s %s \"%s\"", + bus_id, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), str + strlen(prefix)); + return 0; +} + +/* IEEE Organizationally Unique Identifier vendor string */ +static int ieee_oui(sd_device *dev, const struct hw_addr_data *hw_addr, bool test) { + char str[32]; + + assert(dev); + assert(hw_addr); + + if (hw_addr->length != 6) + return -EOPNOTSUPP; + + /* skip commonly misused 00:00:00 (Xerox) prefix */ + if (hw_addr->bytes[0] == 0 && + hw_addr->bytes[1] == 0 && + hw_addr->bytes[2] == 0) + return -EINVAL; + + xsprintf(str, "OUI:%02X%02X%02X%02X%02X%02X", + hw_addr->bytes[0], + hw_addr->bytes[1], + hw_addr->bytes[2], + hw_addr->bytes[3], + hw_addr->bytes[4], + hw_addr->bytes[5]); + + return udev_builtin_hwdb_lookup(dev, NULL, str, NULL, test); +} + +static int names_mac(sd_device *dev, const char *prefix, bool test) { + unsigned iftype, assign_type; + struct hw_addr_data hw_addr; + const char *s; + int r; + + assert(dev); + assert(prefix); + + r = device_get_sysattr_unsigned(dev, "type", &iftype); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to read 'type' attribute: %m"); + + /* The persistent part of a hardware address of an InfiniBand NIC is 8 bytes long. We cannot + * fit this much in an iface name. + * TODO: but it can be used as alternative names?? 
*/ + if (iftype == ARPHRD_INFINIBAND) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Not generating MAC name for infiniband device."); + + /* check for NET_ADDR_PERM, skip random MAC addresses */ + r = device_get_sysattr_unsigned(dev, "addr_assign_type", &assign_type); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to read/parse addr_assign_type: %m"); + + if (assign_type != NET_ADDR_PERM) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), + "addr_assign_type=%u, MAC address is not permanent.", assign_type); + + r = sd_device_get_sysattr_value(dev, "address", &s); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to read 'address' attribute: %m"); + + r = parse_hw_addr(s, &hw_addr); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse 'address' attribute: %m"); + + if (hw_addr.length != 6) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Not generating MAC name for device with MAC address of length %zu.", + hw_addr.length); + + char str[ALTIFNAMSIZ]; + xsprintf(str, "%sx%s", prefix, HW_ADDR_TO_STR_FULL(&hw_addr, HW_ADDR_TO_STRING_NO_COLON)); + udev_builtin_add_property(dev, test, "ID_NET_NAME_MAC", str); + log_device_debug(dev, "MAC address identifier: hw_addr=%s %s %s", + HW_ADDR_TO_STR(&hw_addr), + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), str + strlen(prefix)); + + (void) ieee_oui(dev, &hw_addr, test); + return 0; +}

/* Publishes ID_NET_NAME_PATH as <prefix>i<address>n<phys_port_name> for netdevsim interfaces.
 *
 * The address is the sysnum of the "netdevsim" parent device, the port name is read from the
 * "phys_port_name" attribute of dev. Returns 0 without doing anything when NAMING_NETDEVSIM is not
 * enabled, a negative errno if the parent, its sysnum or the port name is unavailable (-EOPNOTSUPP for an
 * empty port name), and 0 on success. */
static int names_netdevsim(sd_device *dev, const char *prefix, bool test) {
        const char *sysnum, *port_name;
        char name[ALTIFNAMSIZ];
        sd_device *sim_parent;
        unsigned address;
        int r;

        assert(dev);
        assert(prefix);

        if (!naming_scheme_has(NAMING_NETDEVSIM))
                return 0;

        r = sd_device_get_parent_with_subsystem_devtype(dev, "netdevsim", NULL, &sim_parent);
        if (r < 0)
                return r;

        r = sd_device_get_sysnum(sim_parent, &sysnum);
        if (r < 0)
                return log_device_debug_errno(sim_parent, r, "Failed to get device sysnum: %m");

        r = safe_atou(sysnum, &address);
        if (r < 0)
                return log_device_debug_errno(sim_parent, r, "Failed to parse device sysnum: %m");

        r = sd_device_get_sysattr_value(dev, "phys_port_name", &port_name);
        if (r < 0)
                return log_device_debug_errno(dev, r, "Failed to get 'phys_port_name' attribute: %m");

        if (isempty(port_name))
                return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                              "The 'phys_port_name' attribute is empty.");

        if (snprintf_ok(name, sizeof name, "%si%un%s", prefix, address, port_name))
                udev_builtin_add_property(dev, test, "ID_NET_NAME_PATH", name);

        log_device_debug(dev, "Netdevsim identifier: address=%u, port_name=%s %s %s",
                         address, port_name, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), name + strlen(prefix));
        return 0;
}

+static int names_xen(sd_device *dev, const char *prefix, bool test) { + _cleanup_free_ char *vif = NULL; + const char *p; + unsigned id; + int r; + + assert(dev); + assert(prefix); + + /* get xen vif "slot" based names. 
*/ + + if (!naming_scheme_has(NAMING_XEN_VIF)) + return 0; + + /* check if our direct parent is a Xen VIF device with no other bus in-between */ + if (get_matching_parent(dev, STRV_MAKE("xen"), /* skip_virtio = */ false, NULL) < 0) + return 0; + + /* Use the vif-n name to extract "n" */ + r = get_first_syspath_component(dev, "/sys/devices/", &vif); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get Xen VIF name: %m"); + + p = startswith(vif, "vif-"); + if (!p) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL), "Invalid vif name: %s: %m", vif); + + r = safe_atou_full(p, SAFE_ATO_REFUSE_PLUS_MINUS | SAFE_ATO_REFUSE_LEADING_ZERO | + SAFE_ATO_REFUSE_LEADING_WHITESPACE | 10, &id); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to parse vif index from '%s': %m", p); + + char str[ALTIFNAMSIZ]; + if (snprintf_ok(str, sizeof str, "%sX%u", prefix, id)) + udev_builtin_add_property(dev, test, "ID_NET_NAME_SLOT", str); + log_device_debug(dev, "Xen identifier: id=%u %s %s", + id, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), str + strlen(prefix)); + return 0; +}

/* Selects the two-letter interface name prefix from the "type" sysfs attribute of the device.
 *
 * ARPHRD_ETHER yields "wl" for devtype "wlan", "ww" for devtype "wwan" and "en" otherwise;
 * ARPHRD_INFINIBAND yields "ib" if the naming scheme has NAMING_INFINIBAND; ARPHRD_SLIP yields "sl".
 * Returns 0 with *ret pointing to a string literal, -EOPNOTSUPP for everything else, or the negative errno
 * of a failed attribute/devtype lookup. */
static int get_ifname_prefix(sd_device *dev, const char **ret) {
        unsigned iftype;
        int r;

        assert(dev);
        assert(ret);

        r = device_get_sysattr_unsigned(dev, "type", &iftype);
        if (r < 0)
                return r;

        /* handle only ARPHRD_ETHER, ARPHRD_SLIP and ARPHRD_INFINIBAND devices */
        if (iftype == ARPHRD_ETHER) {
                const char *devtype = NULL;

                /* A missing devtype is fine and simply means plain ethernet. */
                r = sd_device_get_devtype(dev, &devtype);
                if (r < 0 && r != -ENOENT)
                        return r;

                *ret = streq_ptr(devtype, "wlan") ? "wl" :
                       streq_ptr(devtype, "wwan") ? "ww" :
                                                    "en";
                return 0;
        }

        if (iftype == ARPHRD_INFINIBAND) {
                if (!naming_scheme_has(NAMING_INFINIBAND))
                        return -EOPNOTSUPP;

                *ret = "ib";
                return 0;
        }

        if (iftype == ARPHRD_SLIP) {
                *ret = "sl";
                return 0;
        }

        return -EOPNOTSUPP;
}

/* Compares the ifindex of the device with its "iflink" sysfs attribute. Returns 1 when they differ,
 * 0 when they are equal, and a negative errno if either value cannot be obtained. */
static int device_is_stacked(sd_device *dev) {
        int own_index, link_index, r;

        assert(dev);

        r = sd_device_get_ifindex(dev, &own_index);
        if (r < 0)
                return r;

        r = device_get_sysattr_int(dev, "iflink", &link_index);
        if (r < 0)
                return r;

        return own_index != link_index;
}

/* Entry point of the "net_id" builtin: publishes ID_NET_NAMING_SCHEME and then gives every name
 * generator a chance to add its properties for the event's device.
 *
 * Stacked devices and devices for which no prefix can be determined are skipped with return value 0.
 * Failures of the individual generators are ignored; only a failing stacked-device check is returned as a
 * negative errno. */
static int builtin_net_id(UdevEvent *event, int argc, char *argv[], bool test) {
        /* Run in exactly this order. */
        static int (* const generators[])(sd_device *dev, const char *prefix, bool test) = {
                names_mac,
                names_devicetree,
                names_ccw,
                names_vio,
                names_platform,
                names_netdevsim,
                names_xen,
                names_pci,
                names_usb,
                names_bcma,
        };

        sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev);
        const char *prefix;
        int r;

        /* skip stacked devices, like VLANs, ... */
        r = device_is_stacked(dev);
        if (r < 0)
                return log_device_debug_errno(dev, r, "Failed to check if the device is stacked: %m");
        if (r > 0)
                return 0;

        r = get_ifname_prefix(dev, &prefix);
        if (r < 0) {
                log_device_debug_errno(dev, r, "Failed to determine prefix for network interface naming, ignoring: %m");
                return 0;
        }

        udev_builtin_add_property(dev, test, "ID_NET_NAMING_SCHEME", naming_scheme()->name);

        for (size_t i = 0; i < sizeof(generators) / sizeof(generators[0]); i++)
                (void) generators[i](dev, prefix, test);

        return 0;
}

+static int builtin_net_id_init(void) { + /* Load naming scheme here to suppress log messages in workers. 
*/ + naming_scheme(); + return 0; +} + +const UdevBuiltin udev_builtin_net_id = { + .name = "net_id", + .cmd = builtin_net_id, + .init = builtin_net_id_init, + .help = "Network device properties", +}; diff --git a/src/udev/udev-builtin-net_setup_link.c b/src/udev/udev-builtin-net_setup_link.c new file mode 100644 index 0000000..a308a21 --- /dev/null +++ b/src/udev/udev-builtin-net_setup_link.c @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "device-util.h" +#include "escape.h" +#include "errno-util.h" +#include "link-config.h" +#include "log.h" +#include "string-util.h" +#include "strv.h" +#include "udev-builtin.h" + +static LinkConfigContext *ctx = NULL; + +static int builtin_net_setup_link(UdevEvent *event, int argc, char **argv, bool test) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + _cleanup_(link_freep) Link *link = NULL; + _cleanup_free_ char *joined = NULL; + int r; + + if (argc > 1) + return log_device_error_errno(dev, SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments."); + + r = link_new(ctx, &event->rtnl, dev, &link); + if (r == -ENODEV) { + log_device_debug_errno(dev, r, "Link vanished while getting information, ignoring."); + return 0; + } + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to get link information: %m"); + + r = link_get_config(ctx, link); + if (r < 0) { + if (r == -ENOENT) { + log_device_debug_errno(dev, r, "No matching link configuration found, ignoring device."); + return 0; + } + + return log_device_error_errno(dev, r, "Failed to get link config: %m"); + } + + r = link_apply_config(ctx, &event->rtnl, link); + if (r == -ENODEV) + log_device_debug_errno(dev, r, "Link vanished while applying configuration, ignoring."); + else if (r < 0) + log_device_warning_errno(dev, r, "Could not apply link configuration, ignoring: %m"); + + udev_builtin_add_property(dev, test, "ID_NET_LINK_FILE", link->config->filename); + if (link->new_name) + 
udev_builtin_add_property(dev, test, "ID_NET_NAME", link->new_name); + + event->altnames = TAKE_PTR(link->altnames); + + STRV_FOREACH(d, link->config->dropins) { + _cleanup_free_ char *escaped = NULL; + + escaped = xescape(*d, ":"); + if (!escaped) + return log_oom(); + + if (!strextend_with_separator(&joined, ":", escaped)) + return log_oom(); + } + + udev_builtin_add_property(dev, test, "ID_NET_LINK_FILE_DROPINS", joined); + + return 0; +} + +static int builtin_net_setup_link_init(void) { + int r; + + if (ctx) + return 0; + + r = link_config_ctx_new(&ctx); + if (r < 0) + return r; + + r = link_config_load(ctx); + if (r < 0) + return r; + + log_debug("Created link configuration context."); + return 0; +} + +static void builtin_net_setup_link_exit(void) { + ctx = link_config_ctx_free(ctx); + log_debug("Unloaded link configuration context."); +} + +static bool builtin_net_setup_link_should_reload(void) { + if (!ctx) + return false; + + if (link_config_should_reload(ctx)) { + log_debug("Link configuration context needs reloading."); + return true; + } + + return false; +} + +const UdevBuiltin udev_builtin_net_setup_link = { + .name = "net_setup_link", + .cmd = builtin_net_setup_link, + .init = builtin_net_setup_link_init, + .exit = builtin_net_setup_link_exit, + .should_reload = builtin_net_setup_link_should_reload, + .help = "Configure network link", + .run_once = false, +}; diff --git a/src/udev/udev-builtin-path_id.c b/src/udev/udev-builtin-path_id.c new file mode 100644 index 0000000..467c9a6 --- /dev/null +++ b/src/udev/udev-builtin-path_id.c @@ -0,0 +1,896 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * compose persistent device path + * + * Logic based on Hannes Reinecke's shell script. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "alloc-util.h"
+#include "device-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "sysexits.h"
+#include "udev-builtin.h"
+#include "udev-util.h"
+
+_printf_(2,3)
+static void path_prepend(char **path, const char *fmt, ...) { /* prepend a formatted component to *path, "-"-joined */
+        va_list va;
+        _cleanup_free_ char *pre = NULL;
+        int r;
+
+        va_start(va, fmt);
+        r = vasprintf(&pre, fmt, va);
+        va_end(va);
+        if (r < 0) {
+                log_oom();
+                exit(EX_OSERR); /* allocation failure terminates the process instead of returning an error */
+        }
+
+        if (*path) {
+                char *new;
+
+                new = strjoin(pre, "-", *path);
+                if (!new) {
+                        log_oom();
+                        exit(EX_OSERR);
+                }
+
+                free_and_replace(*path, new);
+        } else
+                *path = TAKE_PTR(pre); /* first component: take ownership of the formatted string */
+}
+
+/*
+** Linux only supports 32 bit luns.
+** See drivers/scsi/scsi_scan.c::scsilun_to_int() for more details.
+*/
+static int format_lun_number(sd_device *dev, char **path) { /* prepends "lun-..." to *path; 0 on success, negative errno otherwise */
+        const char *sysnum;
+        unsigned long lun;
+        int r;
+
+        r = sd_device_get_sysnum(dev, &sysnum);
+        if (r < 0)
+                return r;
+        if (!sysnum)
+                return -ENOENT;
+
+        r = safe_atolu_full(sysnum, 10, &lun);
+        if (r < 0)
+                return r;
+        if (lun < 256)
+                /* address method 0, peripheral device addressing with bus id of zero */
+                path_prepend(path, "lun-%lu", lun);
+        else
+                /* handle all other lun addressing methods by using a variant of the original lun format */
+                path_prepend(path, "lun-0x%04lx%04lx00000000", lun & 0xffff, (lun >> 16) & 0xffff);
+
+        return 0;
+}
+
+static sd_device *skip_subsystem(sd_device *dev, const char *subsys) {
+        sd_device *parent;
+
+        assert(dev);
+        assert(subsys);
+
+        /* Unlike the function name, this drops multiple parent devices EXCEPT FOR THE LAST ONE.
+         * The last one will be dropped at the end of the loop in builtin_path_id().
+         * E.g.
+         * Input:  /sys/devices/pci0000:00/0000:00:14.0/usb1/1-1/1-1:1.0
+         * Output: /sys/devices/pci0000:00/0000:00:14.0/usb1
+         */
+
+        for (parent = dev; ; ) {
+                const char *subsystem;
+
+                if (sd_device_get_subsystem(parent, &subsystem) < 0)
+                        break;
+
+                if (!streq(subsystem, subsys))
+                        break;
+
+                dev = parent; /* remember the last device that still belongs to subsys */
+                if (sd_device_get_parent(dev, &parent) < 0)
+                        break;
+        }
+
+        return dev;
+}
+
+static sd_device *handle_scsi_fibre_channel(sd_device *parent, char **path) { /* prepends "fc-<port_name>-<lun>"; NULL if any lookup fails */
+        sd_device *targetdev;
+        _cleanup_(sd_device_unrefp) sd_device *fcdev = NULL;
+        const char *port, *sysname;
+        _cleanup_free_ char *lun = NULL;
+
+        assert(parent);
+        assert(path);
+
+        if (sd_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_target", &targetdev) < 0)
+                return NULL;
+        if (sd_device_get_sysname(targetdev, &sysname) < 0)
+                return NULL;
+        if (sd_device_new_from_subsystem_sysname(&fcdev, "fc_transport", sysname) < 0)
+                return NULL;
+        if (sd_device_get_sysattr_value(fcdev, "port_name", &port) < 0)
+                return NULL;
+
+        format_lun_number(parent, &lun); /* NOTE(review): return value ignored; lun stays NULL if this fails */
+        path_prepend(path, "fc-%s-%s", port, lun);
+        return parent;
+}
+
+static sd_device *handle_scsi_sas_wide_port(sd_device *parent, char **path) { /* prepends "sas-<sas_address>-<lun>"; NULL if any lookup fails */
+        sd_device *targetdev, *target_parent;
+        _cleanup_(sd_device_unrefp) sd_device *sasdev = NULL;
+        const char *sas_address, *sysname;
+        _cleanup_free_ char *lun = NULL;
+
+        assert(parent);
+        assert(path);
+
+        if (sd_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_target", &targetdev) < 0)
+                return NULL;
+        if (sd_device_get_parent(targetdev, &target_parent) < 0)
+                return NULL;
+        if (sd_device_get_sysname(target_parent, &sysname) < 0)
+                return NULL;
+        if (sd_device_new_from_subsystem_sysname(&sasdev, "sas_device", sysname) < 0)
+                return NULL;
+        if (sd_device_get_sysattr_value(sasdev, "sas_address", &sas_address) < 0)
+                return NULL;
+
+        format_lun_number(parent, &lun);
+        path_prepend(path, "sas-%s-%s", sas_address, lun);
+        return parent;
+}
+
+static sd_device *handle_scsi_sas(sd_device *parent, char **path) { /* single-phy SAS: "sas-[exp<addr>-]phy<id>-<lun>"; otherwise wide-port naming */
+        sd_device *targetdev, *target_parent, *port, *expander;
+        _cleanup_(sd_device_unrefp) sd_device *target_sasdev = NULL, *expander_sasdev = NULL, *port_sasdev = NULL;
+        const char *sas_address = NULL;
+        const char *phy_id;
+        const char *phy_count, *sysname;
+        _cleanup_free_ char *lun = NULL;
+
+        assert(parent);
+        assert(path);
+
+        if (sd_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_target", &targetdev) < 0)
+                return NULL;
+        if (sd_device_get_parent(targetdev, &target_parent) < 0)
+                return NULL;
+        if (sd_device_get_sysname(target_parent, &sysname) < 0)
+                return NULL;
+        /* Get sas device */
+        if (sd_device_new_from_subsystem_sysname(&target_sasdev, "sas_device", sysname) < 0)
+                return NULL;
+        /* The next parent is sas port */
+        if (sd_device_get_parent(target_parent, &port) < 0)
+                return NULL;
+        if (sd_device_get_sysname(port, &sysname) < 0)
+                return NULL;
+        /* Get port device */
+        if (sd_device_new_from_subsystem_sysname(&port_sasdev, "sas_port", sysname) < 0)
+                return NULL;
+        if (sd_device_get_sysattr_value(port_sasdev, "num_phys", &phy_count) < 0)
+                return NULL;
+
+        /* Check if we are simple disk */
+        if (strncmp(phy_count, "1", 2) != 0)
+                return handle_scsi_sas_wide_port(parent, path);
+
+        /* Get connected phy */
+        if (sd_device_get_sysattr_value(target_sasdev, "phy_identifier", &phy_id) < 0)
+                return NULL;
+
+        /* The port's parent is either hba or expander */
+        if (sd_device_get_parent(port, &expander) < 0)
+                return NULL;
+
+        if (sd_device_get_sysname(expander, &sysname) < 0)
+                return NULL;
+        /* Get expander device */
+        if (sd_device_new_from_subsystem_sysname(&expander_sasdev, "sas_device", sysname) >= 0) {
+                /* Get expander's address */
+                if (sd_device_get_sysattr_value(expander_sasdev, "sas_address", &sas_address) < 0)
+                        return NULL;
+        }
+
+        format_lun_number(parent, &lun);
+        if (sas_address)
+                path_prepend(path, "sas-exp%s-phy%s-%s", sas_address, phy_id, lun);
+        else
+                path_prepend(path, "sas-phy%s-%s", phy_id, lun);
+
+        return parent;
+}
+
+static sd_device *handle_scsi_iscsi(sd_device *parent, char **path) { /* prepends "ip-<addr>:<port>-iscsi-<target>-<lun>"; NULL if any lookup fails */
+        sd_device *transportdev;
+        _cleanup_(sd_device_unrefp) sd_device *sessiondev = NULL, *conndev = NULL;
+        const char *target, *connname, *addr, *port;
+        _cleanup_free_ char *lun = NULL;
+        const char *sysname, *sysnum;
+
+        assert(parent);
+        assert(path);
+
+        /* find iscsi session */
+        for (transportdev = parent; ; ) {
+
+                if (sd_device_get_parent(transportdev, &transportdev) < 0)
+                        return NULL;
+                if (sd_device_get_sysname(transportdev, &sysname) < 0)
+                        return NULL;
+                if (startswith(sysname, "session"))
+                        break;
+        }
+
+        /* find iscsi session device */
+        if (sd_device_new_from_subsystem_sysname(&sessiondev, "iscsi_session", sysname) < 0)
+                return NULL;
+
+        if (sd_device_get_sysattr_value(sessiondev, "targetname", &target) < 0)
+                return NULL;
+
+        if (sd_device_get_sysnum(transportdev, &sysnum) < 0 || !sysnum)
+                return NULL;
+        connname = strjoina("connection", sysnum, ":0");
+        if (sd_device_new_from_subsystem_sysname(&conndev, "iscsi_connection", connname) < 0)
+                return NULL;
+
+        if (sd_device_get_sysattr_value(conndev, "persistent_address", &addr) < 0)
+                return NULL;
+        if (sd_device_get_sysattr_value(conndev, "persistent_port", &port) < 0)
+                return NULL;
+
+        format_lun_number(parent, &lun);
+        path_prepend(path, "ip-%s:%s-iscsi-%s-%s", addr, port, target, lun);
+        return parent;
+}
+
+static sd_device *handle_scsi_ata(sd_device *parent, char **path, char **compat_path) { /* prepends "ata-<port_no>.<n>[.0]"; compat_path may be NULL */
+        sd_device *targetdev, *target_parent;
+        _cleanup_(sd_device_unrefp) sd_device *atadev = NULL;
+        const char *port_no, *sysname, *name;
+        unsigned host, bus, target, lun;
+
+        assert(parent);
+        assert(path);
+
+        if (sd_device_get_sysname(parent, &name) < 0)
+                return NULL;
+        if (sscanf(name, "%u:%u:%u:%u", &host, &bus, &target, &lun) != 4)
+                return NULL;
+
+        if (sd_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_host", &targetdev) < 0)
+                return NULL;
+
+        if (sd_device_get_parent(targetdev, &target_parent) < 0)
+                return NULL;
+
+        if (sd_device_get_sysname(target_parent, &sysname) < 0)
+                return NULL;
+        if (sd_device_new_from_subsystem_sysname(&atadev, "ata_port", sysname) < 0)
+                return NULL;
+
+        if (sd_device_get_sysattr_value(atadev, "port_no", &port_no) < 0)
+                return NULL;
+
+        if (bus != 0)
+                /* Devices behind port multiplier have a bus != 0 */
+                path_prepend(path, "ata-%s.%u.0", port_no, bus);
+        else
+                /* Master/slave are distinguished by target id */
+                path_prepend(path, "ata-%s.%u", port_no, target);
+
+        /* old compatible persistent link for ATA devices */
+        if (compat_path)
+                path_prepend(compat_path, "ata-%s", port_no);
+
+        return parent;
+}
+
+static sd_device *handle_scsi_default(sd_device *parent, char **path) { /* prepends "scsi-<host>:<bus>:<target>:<lun>" with a rebased host number */
+        sd_device *hostdev;
+        int host, bus, target, lun;
+        const char *name, *base, *pos;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int basenum = -1;
+
+        assert(parent);
+        assert(path);
+
+        if (sd_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_host", &hostdev) < 0)
+                return NULL;
+
+        if (sd_device_get_sysname(parent, &name) < 0)
+                return NULL;
+        if (sscanf(name, "%d:%d:%d:%d", &host, &bus, &target, &lun) != 4)
+                return NULL;
+
+        /*
+         * Rebase host offset to get the local relative number
+         *
+         * Note: This is by definition racy, unreliable and too simple.
+         * Please do not copy this model anywhere. It's just a left-over
+         * from the time we had no idea what things should look like in
+         * the end.
+         *
+         * Making assumptions about a global in-kernel counter and use
+         * that to calculate a local offset is a very broken concept. It
+         * can only work as long as things are in strict order.
+         *
+         * The kernel needs to export the instance/port number of a
+         * controller directly, without the need for rebase magic like
+         * this. Manual driver unbind/bind, parallel hotplug/unplug will
+         * get in the way of this "I hope it works" logic.
+         */
+
+        if (sd_device_get_syspath(hostdev, &base) < 0)
+                return NULL;
+        pos = strrchr(base, '/');
+        if (!pos)
+                return NULL;
+
+        base = strndupa_safe(base, pos - base); /* directory that contains the hostN entries */
+        dir = opendir(base);
+        if (!dir)
+                return NULL;
+
+        FOREACH_DIRENT_ALL(de, dir, break) {
+                unsigned i;
+
+                if (de->d_name[0] == '.')
+                        continue;
+                if (!IN_SET(de->d_type, DT_DIR, DT_LNK))
+                        continue;
+                if (!startswith(de->d_name, "host"))
+                        continue;
+                if (safe_atou_full(&de->d_name[4], 10, &i) < 0)
+                        continue;
+                /*
+                 * find the smallest number; the host really needs to export its
+                 * own instance number per parent device; relying on the global host
+                 * enumeration and plainly rebasing the numbers sounds unreliable
+                 */
+                if (basenum == -1 || (int) i < basenum)
+                        basenum = i;
+        }
+        if (basenum == -1)
+                return hostdev;
+        host -= basenum;
+
+        path_prepend(path, "scsi-%i:%i:%i:%i", host, bus, target, lun);
+        return hostdev;
+}
+
+static sd_device *handle_scsi_hyperv(sd_device *parent, char **path, size_t guid_str_len) { /* prepends "vmbus-<guid without braces/dashes>-<lun>" */
+        sd_device *hostdev;
+        sd_device *vmbusdev;
+        const char *guid_str;
+        _cleanup_free_ char *lun = NULL;
+        char guid[39];
+
+        assert(parent);
+        assert(path);
+        assert(guid_str_len < sizeof(guid));
+
+        if (sd_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_host", &hostdev) < 0)
+                return NULL;
+
+        if (sd_device_get_parent(hostdev, &vmbusdev) < 0)
+                return NULL;
+
+        if (sd_device_get_sysattr_value(vmbusdev, "device_id", &guid_str) < 0)
+                return NULL;
+
+        if (strlen(guid_str) < guid_str_len || guid_str[0] != '{' || guid_str[guid_str_len-1] != '}')
+                return NULL;
+
+        size_t k = 0;
+        for (size_t i = 1; i < guid_str_len-1; i++) { /* copy everything between the braces, dropping '-' */
+                if (guid_str[i] == '-')
+                        continue;
+                guid[k++] = guid_str[i];
+        }
+        guid[k] = '\0';
+
+        format_lun_number(parent, &lun);
+        path_prepend(path, "vmbus-%s-%s", guid, lun);
+        return parent;
+}
+
+static sd_device *handle_scsi(sd_device *parent, char **path, char **compat_path, bool *supported_parent) { /* dispatch on the transport found in the syspath */
+        const char *devtype, *id, *name;
+
+        if (sd_device_get_devtype(parent, &devtype) < 0 ||
+            !streq(devtype, "scsi_device"))
+                return parent;
+
+        /* firewire */
+        if (sd_device_get_sysattr_value(parent, "ieee1394_id", &id) >= 0) {
+                path_prepend(path, "ieee1394-0x%s", id);
+                *supported_parent = true;
+                return skip_subsystem(parent, "scsi");
+        }
+
+        /* scsi sysfs does not have a "subsystem" for the transport */
+        if (sd_device_get_syspath(parent, &name) < 0)
+                return NULL;
+
+        if (strstr(name, "/rport-")) {
+                *supported_parent = true;
+                return handle_scsi_fibre_channel(parent, path);
+        }
+
+        if (strstr(name, "/end_device-")) {
+                *supported_parent = true;
+                return handle_scsi_sas(parent, path);
+        }
+
+        if (strstr(name, "/session")) {
+                *supported_parent = true;
+                return handle_scsi_iscsi(parent, path);
+        }
+
+        if (strstr(name, "/ata"))
+                return handle_scsi_ata(parent, path, compat_path);
+
+        if (strstr(name, "/vmbus_"))
+                return handle_scsi_hyperv(parent, path, 37);
+        else if (strstr(name, "/VMBUS"))
+                return handle_scsi_hyperv(parent, path, 38);
+
+        return handle_scsi_default(parent, path);
+}
+
+static sd_device *handle_cciss(sd_device *parent, char **path) { /* prepends "cciss-disk<N>" parsed from a "c<ctrl>d<disk>" sysname */
+        const char *str;
+        unsigned controller, disk;
+
+        if (sd_device_get_sysname(parent, &str) < 0)
+                return NULL;
+        if (sscanf(str, "c%ud%u%*s", &controller, &disk) != 2)
+                return NULL;
+
+        path_prepend(path, "cciss-disk%u", disk);
+        return skip_subsystem(parent, "cciss");
+}
+
+static void handle_scsi_tape(sd_device *dev, char **path) { /* prepends "nst<c>"/"st<c>" for tape nodes with an l/m/a suffix */
+        const char *name;
+
+        /* must be the last device in the syspath */
+        if (*path)
+                return;
+
+        if (sd_device_get_sysname(dev, &name) < 0)
+                return;
+
+        if (startswith(name, "nst") && strchr("lma", name[3]))
+                path_prepend(path, "nst%c", name[3]);
+        else if (startswith(name, "st") && strchr("lma", name[2]))
+                path_prepend(path, "st%c", name[2]);
+}
+
+static int get_usb_revision(sd_device *dev) {
+        uint8_t protocol;
+        const char *s;
+        int r;
+
+        assert(dev);
+
+        /* Returns usb revision 1, 2, or 3, or a negative errno on failure. */
+
+        r = sd_device_get_sysattr_value(dev, "bDeviceProtocol", &s);
+        if (r < 0)
+                return r;
+
+        r = safe_atou8_full(s, 16, &protocol);
+        if (r < 0)
+                return r;
+
+        switch (protocol) {
+        case USB_HUB_PR_HS_NO_TT: /* Full speed hub (USB1) or Hi-speed hub without TT (USB2) */
+
+                /* See speed_show() in drivers/usb/core/sysfs.c of the kernel. */
+                r = sd_device_get_sysattr_value(dev, "speed", &s);
+                if (r < 0)
+                        return r;
+
+                if (streq(s, "480"))
+                        return 2;
+
+                return 1;
+
+        case USB_HUB_PR_HS_SINGLE_TT: /* Hi-speed hub with single TT */
+        case USB_HUB_PR_HS_MULTI_TT: /* Hi-speed hub with multiple TT */
+                return 2;
+
+        case USB_HUB_PR_SS: /* Super speed hub */
+                return 3;
+
+        default:
+                return -EPROTONOSUPPORT;
+        }
+}
+
+static sd_device *handle_usb(sd_device *parent, char **path) { /* prepends "usb[v<rev>]-0:<port>" */
+        const char *devtype, *str, *port;
+        int r;
+
+        if (sd_device_get_devtype(parent, &devtype) < 0)
+                return parent;
+        if (!STR_IN_SET(devtype, "usb_interface", "usb_device"))
+                return parent;
+
+        if (sd_device_get_sysname(parent, &str) < 0)
+                return parent;
+        port = strchr(str, '-');
+        if (!port)
+                return parent;
+        port++; /* keep only the part after the first '-' of the sysname */
+
+        parent = skip_subsystem(parent, "usb");
+        if (!parent)
+                return NULL;
+
+        /* USB host number may change across reboots (and probably even without reboot). The part after USB
+         * host number is determined by device topology and so does not change. Hence, drop the host number
+         * and always use '0' instead.
+         *
+         * xHCI host controllers may register two (or more?) USB root hubs for USB 2.0 and USB 3.0, and the
+         * sysname, whose host number replaced with 0, of a device under the hubs may conflict with others.
+         * To avoid the conflict, let's include the USB revision of the root hub to the PATH_ID.
+         * See issue https://github.com/systemd/systemd/issues/19406 for more details. */
+        r = get_usb_revision(parent);
+        if (r < 0) {
+                log_device_debug_errno(parent, r, "Failed to get the USB revision number, ignoring: %m");
+                path_prepend(path, "usb-0:%s", port);
+        } else {
+                assert(r > 0);
+                path_prepend(path, "usbv%i-0:%s", r, port);
+        }
+
+        return parent;
+}
+
+static sd_device *handle_bcma(sd_device *parent, char **path) { /* prepends "bcma-<core>" parsed from a "bcma<bus>:<core>" sysname */
+        const char *sysname;
+        unsigned core;
+
+        if (sd_device_get_sysname(parent, &sysname) < 0)
+                return NULL;
+        if (sscanf(sysname, "bcma%*u:%u", &core) != 1)
+                return NULL;
+
+        path_prepend(path, "bcma-%u", core);
+        return parent;
+}
+
+/* Handle devices of AP bus in System z platform. */
+static sd_device *handle_ap(sd_device *parent, char **path) {
+        const char *type, *func;
+
+        assert(parent);
+        assert(path);
+
+        if (sd_device_get_sysattr_value(parent, "type", &type) >= 0 &&
+            sd_device_get_sysattr_value(parent, "ap_functions", &func) >= 0)
+                path_prepend(path, "ap-%s-%s", type, func);
+        else {
+                const char *sysname;
+
+                if (sd_device_get_sysname(parent, &sysname) >= 0)
+                        path_prepend(path, "ap-%s", sysname);
+        }
+
+        return skip_subsystem(parent, "ap");
+}
+
+static int find_real_nvme_parent(sd_device *dev, sd_device **ret) {
+        _cleanup_(sd_device_unrefp) sd_device *nvme = NULL;
+        const char *sysname, *end, *devpath;
+        int r;
+
+        /* If the device belongs to "nvme-subsystem" (not to be confused with "nvme"), which happens when
+         * NVMe multipathing is enabled in the kernel (/sys/module/nvme_core/parameters/multipath is Y),
+         * then the syspath is something like the following:
+         *   /sys/devices/virtual/nvme-subsystem/nvme-subsys0/nvme0n1
+         * Hence, we need to find the 'real parent' in "nvme" subsystem, e.g.,
+         *   /sys/devices/pci0000:00/0000:00:1c.4/0000:3c:00.0/nvme/nvme0 */
+
+        assert(dev);
+        assert(ret);
+
+        r = sd_device_get_sysname(dev, &sysname);
+        if (r < 0)
+                return r;
+
+        /* The sysname format of nvme block device is nvme%d[c%d]n%d[p%d], e.g. nvme0n1p2 or nvme0c1n2.
+         * (Note, nvme devices with 'c' can be ignored, as they are hidden.)
+         * The sysname format of nvme subsystem device is nvme%d.
+         * See nvme_alloc_ns() and nvme_init_ctrl() in drivers/nvme/host/core.c for more details. */
+        end = startswith(sysname, "nvme");
+        if (!end)
+                return -ENXIO;
+
+        end += strspn(end, DIGITS);
+        sysname = strndupa_safe(sysname, end - sysname); /* "nvme<N>" prefix only; same helper as handle_scsi_default() */
+
+        r = sd_device_new_from_subsystem_sysname(&nvme, "nvme", sysname);
+        if (r < 0)
+                return r;
+
+        r = sd_device_get_devpath(nvme, &devpath);
+        if (r < 0)
+                return r;
+
+        /* If the 'real parent' is (still) virtual, e.g. for nvmf disks, refuse to set ID_PATH. */
+        if (path_startswith(devpath, "/devices/virtual/"))
+                return -ENXIO;
+
+        *ret = TAKE_PTR(nvme);
+        return 0;
+}
+
+static void add_id_with_usb_revision(sd_device *dev, bool test, char *path) {
+        char *p;
+        int r;
+
+        assert(dev);
+        assert(path);
+
+        /* When the path contains the USB revision, let's add the ID_PATH_WITH_USB_REVISION property and
+         * drop the version specifier for later use. */
+
+        p = strstrafter(path, "-usbv");
+        if (!p)
+                return;
+        if (!ascii_isdigit(p[0]))
+                return;
+        if (p[1] != '-')
+                return;
+
+        r = udev_builtin_add_property(dev, test, "ID_PATH_WITH_USB_REVISION", path);
+        if (r < 0)
+                log_device_debug_errno(dev, r, "Failed to add ID_PATH_WITH_USB_REVISION property, ignoring: %m");
+
+        /* Drop the USB revision specifier for backward compatibility. */
+        memmove(p - 1, p + 1, strlen(p + 1) + 1); /* "-usbvN-" becomes "-usb-": the 'v' and the digit are removed in place */
+}
+
+static void add_id_tag(sd_device *dev, bool test, const char *path) {
+        char tag[UDEV_NAME_SIZE];
+        size_t i = 0;
+        int r;
+
+        /* compose valid udev tag name; stop early so that tag[] (incl. the trailing NUL) cannot overflow */
+        for (const char *p = path; *p && i < sizeof(tag) - 1; p++) {
+                if (ascii_isdigit(*p) ||
+                    ascii_isalpha(*p) ||
+                    *p == '-') {
+                        tag[i++] = *p;
+                        continue;
+                }
+
+                /* skip all leading '_' */
+                if (i == 0)
+                        continue;
+
+                /* avoid second '_' */
+                if (tag[i-1] == '_')
+                        continue;
+
+                tag[i++] = '_';
+        }
+        /* strip trailing '_' */
+        while (i > 0 && tag[i-1] == '_')
+                i--;
+        tag[i] = '\0';
+
+        r = udev_builtin_add_property(dev, test, "ID_PATH_TAG", tag);
+        if (r < 0)
+                log_device_debug_errno(dev, r, "Failed to add ID_PATH_TAG property, ignoring: %m");
+}
+
+static int builtin_path_id(UdevEvent *event, int argc, char *argv[], bool test) {
+        sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev);
+        _cleanup_(sd_device_unrefp) sd_device *dev_other_branch = NULL;
+        _cleanup_free_ char *path = NULL, *compat_path = NULL;
+        bool supported_transport = false, supported_parent = false;
+        const char *subsystem;
+        int r;
+
+        /* walk up the chain of devices and compose path */
+        for (sd_device *parent = dev; parent; ) {
+                const char *subsys, *sysname;
+
+                if (sd_device_get_subsystem(parent, &subsys) < 0 ||
+                    sd_device_get_sysname(parent, &sysname) < 0) {
+                        ;
+                } else if (streq(subsys, "scsi_tape")) {
+                        handle_scsi_tape(parent, &path);
+                } else if (streq(subsys, "scsi")) {
+                        parent = handle_scsi(parent, &path, &compat_path, &supported_parent);
+                        supported_transport = true;
+                } else if (streq(subsys, "cciss")) {
+                        parent = handle_cciss(parent, &path);
+                        supported_transport = true;
+                } else if (streq(subsys, "usb")) {
+                        parent = handle_usb(parent, &path);
+                        supported_transport = true;
+                } else if (streq(subsys, "bcma")) {
+                        parent = handle_bcma(parent, &path);
+                        supported_transport = true;
+                } else if (streq(subsys, "serio")) {
+                        const char *sysnum;
+
+                        if (sd_device_get_sysnum(parent, &sysnum) >= 0 && sysnum) {
+                                path_prepend(&path, "serio-%s", sysnum);
+                                parent = skip_subsystem(parent, "serio");
+                        }
+                } else if (streq(subsys, "pci")) {
+                        path_prepend(&path, "pci-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "pci-%s", sysname);
+                        parent = skip_subsystem(parent, "pci");
+                        supported_parent = true;
+                } else if (streq(subsys, "platform")) {
+                        path_prepend(&path, "platform-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "platform-%s", sysname);
+                        parent = skip_subsystem(parent, "platform");
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (streq(subsys, "amba")) {
+                        path_prepend(&path, "amba-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "amba-%s", sysname);
+                        parent = skip_subsystem(parent, "amba");
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (streq(subsys, "acpi")) {
+                        path_prepend(&path, "acpi-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "acpi-%s", sysname);
+                        parent = skip_subsystem(parent, "acpi");
+                        supported_parent = true;
+                } else if (streq(subsys, "xen")) {
+                        path_prepend(&path, "xen-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "xen-%s", sysname);
+                        parent = skip_subsystem(parent, "xen");
+                        supported_parent = true;
+                } else if (streq(subsys, "virtio")) {
+                        parent = skip_subsystem(parent, "virtio");
+                        supported_transport = true;
+                } else if (streq(subsys, "scm")) {
+                        path_prepend(&path, "scm-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "scm-%s", sysname);
+                        parent = skip_subsystem(parent, "scm");
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (streq(subsys, "ccw")) {
+                        path_prepend(&path, "ccw-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "ccw-%s", sysname);
+                        parent = skip_subsystem(parent, "ccw");
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (streq(subsys, "ccwgroup")) {
+                        path_prepend(&path, "ccwgroup-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "ccwgroup-%s", sysname);
+                        parent = skip_subsystem(parent, "ccwgroup");
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (streq(subsys, "ap")) {
+                        parent = handle_ap(parent, &path);
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (streq(subsys, "iucv")) {
+                        path_prepend(&path, "iucv-%s", sysname);
+                        if (compat_path)
+                                path_prepend(&compat_path, "iucv-%s", sysname);
+                        parent = skip_subsystem(parent, "iucv");
+                        supported_transport = true;
+                        supported_parent = true;
+                } else if (STR_IN_SET(subsys, "nvme", "nvme-subsystem")) {
+                        const char *nsid;
+
+                        if (sd_device_get_sysattr_value(dev, "nsid", &nsid) >= 0) {
+                                path_prepend(&path, "nvme-%s", nsid);
+                                if (compat_path)
+                                        path_prepend(&compat_path, "nvme-%s", nsid);
+
+                                if (streq(subsys, "nvme-subsystem")) {
+                                        r = find_real_nvme_parent(dev, &dev_other_branch);
+                                        if (r < 0)
+                                                return r;
+
+                                        parent = dev_other_branch; /* continue the walk from the "nvme" controller device */
+                                }
+
+                                parent = skip_subsystem(parent, "nvme");
+                                supported_parent = true;
+                                supported_transport = true;
+                        }
+                } else if (streq(subsys, "spi")) {
+                        const char *sysnum;
+
+                        if (sd_device_get_sysnum(parent, &sysnum) >= 0 && sysnum) {
+                                path_prepend(&path, "cs-%s", sysnum);
+                                parent = skip_subsystem(parent, "spi");
+                        }
+                }
+
+                if (!parent)
+                        break;
+                if (sd_device_get_parent(parent, &parent) < 0)
+                        break;
+        }
+
+        if (!path)
+                return -ENOENT;
+
+        /*
+         * Do not return devices with an unknown parent device type. They
+         * might produce conflicting IDs if the parent does not provide a
+         * unique and predictable name.
+         */
+        if (!supported_parent)
+                return -ENOENT;
+
+        /*
+         * Do not return block devices without a well-known transport. Some
+         * devices do not expose their buses and do not provide a unique
+         * and predictable name that way.
+         */
+        if (sd_device_get_subsystem(dev, &subsystem) >= 0 &&
+            streq(subsystem, "block") &&
+            !supported_transport)
+                return -ENOENT;
+
+        add_id_with_usb_revision(dev, test, path);
+
+        r = udev_builtin_add_property(dev, test, "ID_PATH", path);
+        if (r < 0)
+                log_device_debug_errno(dev, r, "Failed to add ID_PATH property, ignoring: %m");
+
+        add_id_tag(dev, test, path);
+
+        /*
+         * Compatible link generation for ATA devices
+         * we assign compat_link to the env variable
+         * ID_PATH_ATA_COMPAT
+         */
+        if (compat_path)
+                udev_builtin_add_property(dev, test, "ID_PATH_ATA_COMPAT", compat_path);
+
+        return 0;
+}
+
+const UdevBuiltin udev_builtin_path_id = {
+        .name = "path_id",
+        .cmd = builtin_path_id,
+        .help = "Compose persistent device path",
+        .run_once = true,
+};
diff --git a/src/udev/udev-builtin-uaccess.c b/src/udev/udev-builtin-uaccess.c
new file mode 100644
index 0000000..da42ef5
--- /dev/null
+++ b/src/udev/udev-builtin-uaccess.c
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * manage device node user ACL
+ */
+
+#include
+#include
+#include
+#include
+
+#include "sd-login.h"
+
+#include "device-util.h"
+#include "devnode-acl.h"
+#include "errno-util.h"
+#include "login-util.h"
+#include "log.h"
+#include "udev-builtin.h"
+
+static int builtin_uaccess(UdevEvent *event, int argc, char *argv[], bool test) {
+        sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev);
+        const char *path = NULL, *seat;
+        bool changed_acl = false;
+        uid_t uid;
+        int r;
+
+        umask(0022);
+
+        /* don't muck around with ACLs when the system is not running systemd */
+        if (!logind_running())
+                return 0;
+
+        r = sd_device_get_devname(dev, &path);
+        if (r < 0) {
+                log_device_error_errno(dev, r, "Failed to get device name: %m");
+                goto finish;
+        }
+
+        if (sd_device_get_property_value(dev, "ID_SEAT", &seat) < 0)
+                seat = "seat0"; /* devices without ID_SEAT are treated as belonging to seat0 */
+
+        r = sd_seat_get_active(seat, NULL, &uid);
+        if (r < 0) {
+                if (IN_SET(r, -ENXIO, -ENODATA))
+                        /* No active session on this seat */
+                        r = 0;
+                else
+                        log_device_error_errno(dev, r, "Failed to determine active user on seat %s: %m", seat);
+
+                goto finish;
+        }
+
+        r = devnode_acl(path, true, false, 0, true, uid);
+        if (r < 0) {
+                log_device_full_errno(dev, r == -ENOENT ? LOG_DEBUG : LOG_ERR, r, "Failed to apply ACL: %m");
+                goto finish;
+        }
+
+        changed_acl = true;
+        r = 0;
+
+finish:
+        if (path && !changed_acl) { /* no ACL was applied above */
+                int k;
+
+                /* Better be safe than sorry and reset ACL */
+                k = devnode_acl(path, true, false, 0, false, 0);
+                if (k < 0) {
+                        log_device_full_errno(dev, k == -ENOENT ? LOG_DEBUG : LOG_ERR, k, "Failed to apply ACL: %m");
+                        RET_GATHER(r, k);
+                }
+        }
+
+        return r;
+}
+
+const UdevBuiltin udev_builtin_uaccess = {
+        .name = "uaccess",
+        .cmd = builtin_uaccess,
+        .help = "Manage device node user ACL",
+};
diff --git a/src/udev/udev-builtin-usb_id.c b/src/udev/udev-builtin-usb_id.c
new file mode 100644
index 0000000..8e83c9c
--- /dev/null
+++ b/src/udev/udev-builtin-usb_id.c
@@ -0,0 +1,489 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * USB device properties and persistent device path
+ *
+ * Copyright (c) 2005 SUSE Linux Products GmbH, Germany
+ *   Author: Hannes Reinecke
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "alloc-util.h"
+#include "device-nodes.h"
+#include "device-util.h"
+#include "fd-util.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "strxcpyx.h"
+#include "udev-builtin.h"
+#include "udev-util.h"
+
+static void set_usb_iftype(char *to, int if_class_num, size_t len) { /* writes a type name for the class number into to[len]; "generic" if unmapped */
+        const char *type = "generic";
+
+        switch (if_class_num) {
+        case 1:
+                type = "audio";
+                break;
+        case 2: /* CDC-Control */
+                break;
+        case 3:
+                type = "hid";
+                break;
+        case 5: /* Physical */
+                break;
+        case 6:
+                type = "media";
+                break;
+        case 7:
+                type = "printer";
+                break;
+        case 8:
+                type = "storage";
+                break;
+        case 9:
+                type = "hub";
+                break;
+        case 0x0a: /* CDC-Data */
+                break;
+        case 0x0b: /* Chip/Smart Card */
+                break;
+        case 0x0d: /* Content Security */
+                break;
+        case 0x0e:
+                type = "video";
+                break;
+        case 0xdc: /* Diagnostic Device */
+                break;
+        case 0xe0: /* Wireless Controller */
+                break;
+        case 0xfe: /* Application-specific */
+                break;
+        case 0xff: /* Vendor-specific */
+                break;
+        default:
+                break;
+        }
+        strncpy(to, type, len);
+        to[len-1] = '\0'; /* strncpy() does not terminate on truncation, so terminate explicitly */
+}
+
+static int set_usb_mass_storage_ifsubtype(char *to, const char *from, size_t len) { /* returns the parsed subclass number, 0 if unparsable */
+        int type_num = 0;
+        const char *type = "generic";
+
+        if (safe_atoi(from, &type_num) >= 0) {
+                switch (type_num) {
+                case 1: /* RBC devices */
+                        type = "rbc";
+                        break;
+                case 2:
+                        type = "atapi";
+                        break;
+                case 3:
+                        type = "tape";
+                        break;
+                case 4: /* UFI */
+                        type = "floppy";
+                        break;
+                case 6: /* Transparent SPC-2 devices */
+                        type = "scsi";
+                        break;
+                default:
+                        break;
+                }
+        }
+        strscpy(to, len, type);
+        return type_num;
+}
+
+static void set_scsi_type(char *to, const char *from, size_t len) { /* writes a type name for the SCSI type number; "generic" if unmapped */
+        unsigned type_num;
+        const char *type = "generic";
+
+        if (safe_atou(from, &type_num) >= 0) {
+                switch (type_num) {
+                case 0:
+                case 0xe:
+                        type = "disk";
+                        break;
+                case 1:
+                        type = "tape";
+                        break;
+                case 4:
+                case 7:
+                case 0xf:
+                        type = "optical";
+                        break;
+                case 5:
+                        type = "cd";
+                        break;
+                default:
+                        break;
+                }
+        }
+        strscpy(to, len, type);
+}
+
+#define USB_DT_DEVICE                   0x01
+#define USB_DT_INTERFACE                0x04
+
+static int dev_if_packed_info(sd_device *dev, char *ifs_str, size_t len) { /* fills ifs_str with ":ccsspp:..." for each distinct interface */
+        _cleanup_close_ int fd = -EBADF;
+        ssize_t size;
+        unsigned char buf[18 + 65535];
+        size_t pos = 0;
+        unsigned strpos = 0;
+        const char *filename, *syspath;
+        int r;
+        struct usb_interface_descriptor {
+                uint8_t bLength;
+                uint8_t bDescriptorType;
+                uint8_t bInterfaceNumber;
+                uint8_t bAlternateSetting;
+                uint8_t bNumEndpoints;
+                uint8_t bInterfaceClass;
+                uint8_t bInterfaceSubClass;
+                uint8_t bInterfaceProtocol;
+                uint8_t iInterface;
+        } _packed_;
+
+        r = sd_device_get_syspath(dev, &syspath);
+        if (r < 0)
+                return r;
+
+        filename = strjoina(syspath, "/descriptors");
+        fd = open(filename, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (fd < 0)
+                return log_device_debug_errno(dev, errno, "Failed to open \"%s\": %m", filename);
+
+        size = read(fd, buf, sizeof(buf));
+        if (size < 18) /* also covers read() errors, which return -1 */
+                return log_device_warning_errno(dev, SYNTHETIC_ERRNO(EIO),
+                                                "Short read from \"%s\"", filename);
+        assert((size_t) size <= sizeof buf);
+
+        ifs_str[0] = '\0';
+        while (pos + sizeof(struct usb_interface_descriptor) < (size_t) size &&
+               strpos + 7 < len - 2) {
+
+                struct usb_interface_descriptor *desc;
+                char if_str[8];
+
+                desc = (struct usb_interface_descriptor *) (buf + pos);
+                if (desc->bLength < 3)
+                        break;
+                if (desc->bLength > size - sizeof(struct usb_interface_descriptor))
+                        return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EIO),
+                                                      "Corrupt data read from \"%s\"", filename);
+                pos += desc->bLength; /* advance to the next descriptor before deciding whether to use this one */
+
+                if (desc->bDescriptorType != USB_DT_INTERFACE)
+                        continue;
+
+                if (snprintf(if_str, 8, ":%02x%02x%02x",
+                             desc->bInterfaceClass,
+                             desc->bInterfaceSubClass,
+                             desc->bInterfaceProtocol) != 7)
+                        continue;
+
+                if (strstr(ifs_str, if_str)) /* skip class/subclass/protocol triples already recorded */
+                        continue;
+
+                memcpy(&ifs_str[strpos], if_str, 8);
+                strpos += 7;
+        }
+
+        if (strpos > 0) {
+                ifs_str[strpos++] = ':';
+                ifs_str[strpos++] = '\0';
+        }
+
+        return 0;
+}
+
+/*
+ * A unique USB identification is generated like this:
+ *
+ * 1.) Get the USB device type from InterfaceClass and InterfaceSubClass
+ * 2.) If the device type is 'Mass-Storage/SPC-2' or 'Mass-Storage/RBC',
+ *     use the SCSI vendor and model as USB-Vendor and USB-model.
+ * 3.) Otherwise, use the USB manufacturer and product as
+ *     USB-Vendor and USB-model. Any non-printable characters
+ *     in those strings will be skipped; a slash '/' will be converted
+ *     into a full stop '.'.
+ * 4.) If that fails, too, we will use idVendor and idProduct
+ *     as USB-Vendor and USB-model.
+ * 5.) The USB identification is the USB-vendor and USB-model
+ *     string concatenated with an underscore '_'.
+ * 6.)
If the device supplies a serial number, this number + * is concatenated with the identification with an underscore '_'. + */ +static int builtin_usb_id(UdevEvent *event, int argc, char *argv[], bool test) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + char vendor_str[64] = ""; + char vendor_str_enc[256]; + const char *vendor_id; + char model_str[64] = ""; + char model_str_enc[256]; + const char *product_id; + char serial_str[UDEV_NAME_SIZE] = ""; + char packed_if_str[UDEV_NAME_SIZE] = ""; + char revision_str[64] = ""; + char type_str[64] = ""; + char instance_str[64] = ""; + const char *ifnum = NULL; + const char *driver = NULL; + char serial[256]; + + sd_device *dev_interface, *dev_usb; + const char *if_class, *if_subclass; + unsigned if_class_num; + int protocol = 0; + size_t l; + char *s; + + const char *syspath, *sysname, *devtype, *interface_syspath; + int r; + + r = sd_device_get_syspath(dev, &syspath); + if (r < 0) + return r; + + r = sd_device_get_sysname(dev, &sysname); + if (r < 0) + return r; + + /* shortcut, if we are called directly for a "usb_device" type */ + if (sd_device_get_devtype(dev, &devtype) >= 0 && streq(devtype, "usb_device")) { + dev_if_packed_info(dev, packed_if_str, sizeof(packed_if_str)); + dev_usb = dev; + goto fallback; + } + + /* usb interface directory */ + r = sd_device_get_parent_with_subsystem_devtype(dev, "usb", "usb_interface", &dev_interface); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to access usb_interface: %m"); + + r = sd_device_get_syspath(dev_interface, &interface_syspath); + if (r < 0) + return log_device_debug_errno(dev_interface, r, "Failed to get syspath: %m"); + (void) sd_device_get_sysattr_value(dev_interface, "bInterfaceNumber", &ifnum); + (void) sd_device_get_sysattr_value(dev_interface, "driver", &driver); + + r = sd_device_get_sysattr_value(dev_interface, "bInterfaceClass", &if_class); + if (r < 0) + return log_device_debug_errno(dev_interface, r, "Failed to get bInterfaceClass 
attribute: %m"); + + r = safe_atou_full(if_class, 16, &if_class_num); + if (r < 0) + return log_device_debug_errno(dev_interface, r, "Failed to parse if_class: %m"); + if (if_class_num == 8) { + /* mass storage */ + if (sd_device_get_sysattr_value(dev_interface, "bInterfaceSubClass", &if_subclass) >= 0) + protocol = set_usb_mass_storage_ifsubtype(type_str, if_subclass, sizeof(type_str)-1); + } else + set_usb_iftype(type_str, if_class_num, sizeof(type_str)-1); + + log_device_debug(dev_interface, "if_class:%u protocol:%i", if_class_num, protocol); + + /* usb device directory */ + r = sd_device_get_parent_with_subsystem_devtype(dev_interface, "usb", "usb_device", &dev_usb); + if (r < 0) + return log_device_debug_errno(dev_interface, r, "Failed to find parent 'usb' device"); + + /* all interfaces of the device in a single string */ + dev_if_packed_info(dev_usb, packed_if_str, sizeof(packed_if_str)); + + /* mass storage : SCSI or ATAPI */ + if (IN_SET(protocol, 6, 2)) { + sd_device *dev_scsi; + const char *scsi_sysname, *scsi_model, *scsi_vendor, *scsi_type, *scsi_rev; + int host, bus, target, lun; + + /* get scsi device */ + r = sd_device_get_parent_with_subsystem_devtype(dev, "scsi", "scsi_device", &dev_scsi); + if (r < 0) { + log_device_debug_errno(dev, r, "Unable to find parent SCSI device"); + goto fallback; + } + if (sd_device_get_sysname(dev_scsi, &scsi_sysname) < 0) + goto fallback; + if (sscanf(scsi_sysname, "%d:%d:%d:%d", &host, &bus, &target, &lun) != 4) { + log_device_debug(dev_scsi, "Invalid SCSI device"); + goto fallback; + } + + /* Generic SPC-2 device */ + r = sd_device_get_sysattr_value(dev_scsi, "vendor", &scsi_vendor); + if (r < 0) { + log_device_debug_errno(dev_scsi, r, "Failed to get SCSI vendor attribute: %m"); + goto fallback; + } + encode_devnode_name(scsi_vendor, vendor_str_enc, sizeof(vendor_str_enc)); + udev_replace_whitespace(scsi_vendor, vendor_str, sizeof(vendor_str)-1); + udev_replace_chars(vendor_str, NULL); + + r = 
sd_device_get_sysattr_value(dev_scsi, "model", &scsi_model); + if (r < 0) { + log_device_debug_errno(dev_scsi, r, "Failed to get SCSI model attribute: %m"); + goto fallback; + } + encode_devnode_name(scsi_model, model_str_enc, sizeof(model_str_enc)); + udev_replace_whitespace(scsi_model, model_str, sizeof(model_str)-1); + udev_replace_chars(model_str, NULL); + + r = sd_device_get_sysattr_value(dev_scsi, "type", &scsi_type); + if (r < 0) { + log_device_debug_errno(dev_scsi, r, "Failed to get SCSI type attribute: %m"); + goto fallback; + } + set_scsi_type(type_str, scsi_type, sizeof(type_str)-1); + + r = sd_device_get_sysattr_value(dev_scsi, "rev", &scsi_rev); + if (r < 0) { + log_device_debug_errno(dev_scsi, r, "Failed to get SCSI revision attribute: %m"); + goto fallback; + } + udev_replace_whitespace(scsi_rev, revision_str, sizeof(revision_str)-1); + udev_replace_chars(revision_str, NULL); + + /* + * some broken devices have the same identifiers + * for all luns, export the target:lun number + */ + sprintf(instance_str, "%d:%d", target, lun); + } + +fallback: + r = sd_device_get_sysattr_value(dev_usb, "idVendor", &vendor_id); + if (r < 0) + return log_device_debug_errno(dev_usb, r, "Failed to get idVendor attribute: %m"); + + r = sd_device_get_sysattr_value(dev_usb, "idProduct", &product_id); + if (r < 0) + return log_device_debug_errno(dev_usb, r, "Failed to get idProduct attribute: %m"); + + /* fall back to USB vendor & device */ + if (vendor_str[0] == '\0') { + const char *usb_vendor; + + if (sd_device_get_sysattr_value(dev_usb, "manufacturer", &usb_vendor) < 0) + usb_vendor = vendor_id; + encode_devnode_name(usb_vendor, vendor_str_enc, sizeof(vendor_str_enc)); + udev_replace_whitespace(usb_vendor, vendor_str, sizeof(vendor_str)-1); + udev_replace_chars(vendor_str, NULL); + } + + if (model_str[0] == '\0') { + const char *usb_model; + + if (sd_device_get_sysattr_value(dev_usb, "product", &usb_model) < 0) + usb_model = product_id; + encode_devnode_name(usb_model, 
model_str_enc, sizeof(model_str_enc)); + udev_replace_whitespace(usb_model, model_str, sizeof(model_str)-1); + udev_replace_chars(model_str, NULL); + } + + if (revision_str[0] == '\0') { + const char *usb_rev; + + if (sd_device_get_sysattr_value(dev_usb, "bcdDevice", &usb_rev) >= 0) { + udev_replace_whitespace(usb_rev, revision_str, sizeof(revision_str)-1); + udev_replace_chars(revision_str, NULL); + } + } + + if (serial_str[0] == '\0') { + const char *usb_serial; + + if (sd_device_get_sysattr_value(dev_usb, "serial", &usb_serial) >= 0) { + /* http://msdn.microsoft.com/en-us/library/windows/hardware/gg487321.aspx */ + for (const unsigned char *p = (unsigned char*) usb_serial; *p != '\0'; p++) + if (*p < 0x20 || *p > 0x7f || *p == ',') { + usb_serial = NULL; + break; + } + + if (usb_serial) { + udev_replace_whitespace(usb_serial, serial_str, sizeof(serial_str)-1); + udev_replace_chars(serial_str, NULL); + } + } + } + + s = serial; + l = strpcpyl(&s, sizeof(serial), vendor_str, "_", model_str, NULL); + if (!isempty(serial_str)) + l = strpcpyl(&s, l, "_", serial_str, NULL); + + if (!isempty(instance_str)) + strpcpyl(&s, l, "-", instance_str, NULL); + + if (sd_device_get_property_value(dev, "ID_BUS", NULL) >= 0) + log_device_debug(dev, "ID_BUS property is already set, setting only properties prefixed with \"ID_USB_\"."); + else { + udev_builtin_add_property(dev, test, "ID_BUS", "usb"); + + udev_builtin_add_property(dev, test, "ID_MODEL", model_str); + udev_builtin_add_property(dev, test, "ID_MODEL_ENC", model_str_enc); + udev_builtin_add_property(dev, test, "ID_MODEL_ID", product_id); + + udev_builtin_add_property(dev, test, "ID_SERIAL", serial); + if (!isempty(serial_str)) + udev_builtin_add_property(dev, test, "ID_SERIAL_SHORT", serial_str); + + udev_builtin_add_property(dev, test, "ID_VENDOR", vendor_str); + udev_builtin_add_property(dev, test, "ID_VENDOR_ENC", vendor_str_enc); + udev_builtin_add_property(dev, test, "ID_VENDOR_ID", vendor_id); + + 
udev_builtin_add_property(dev, test, "ID_REVISION", revision_str); + + if (!isempty(type_str)) + udev_builtin_add_property(dev, test, "ID_TYPE", type_str); + + if (!isempty(instance_str)) + udev_builtin_add_property(dev, test, "ID_INSTANCE", instance_str); + } + + /* Also export the same values in the above by prefixing ID_USB_. */ + udev_builtin_add_property(dev, test, "ID_USB_MODEL", model_str); + udev_builtin_add_property(dev, test, "ID_USB_MODEL_ENC", model_str_enc); + udev_builtin_add_property(dev, test, "ID_USB_MODEL_ID", product_id); + udev_builtin_add_property(dev, test, "ID_USB_SERIAL", serial); + if (!isempty(serial_str)) + udev_builtin_add_property(dev, test, "ID_USB_SERIAL_SHORT", serial_str); + + udev_builtin_add_property(dev, test, "ID_USB_VENDOR", vendor_str); + udev_builtin_add_property(dev, test, "ID_USB_VENDOR_ENC", vendor_str_enc); + udev_builtin_add_property(dev, test, "ID_USB_VENDOR_ID", vendor_id); + + udev_builtin_add_property(dev, test, "ID_USB_REVISION", revision_str); + + if (!isempty(type_str)) + udev_builtin_add_property(dev, test, "ID_USB_TYPE", type_str); + + if (!isempty(instance_str)) + udev_builtin_add_property(dev, test, "ID_USB_INSTANCE", instance_str); + + if (!isempty(packed_if_str)) + udev_builtin_add_property(dev, test, "ID_USB_INTERFACES", packed_if_str); + if (ifnum) + udev_builtin_add_property(dev, test, "ID_USB_INTERFACE_NUM", ifnum); + if (driver) + udev_builtin_add_property(dev, test, "ID_USB_DRIVER", driver); + return 0; +} + +const UdevBuiltin udev_builtin_usb_id = { + .name = "usb_id", + .cmd = builtin_usb_id, + .help = "USB device properties", + .run_once = true, +}; diff --git a/src/udev/udev-builtin.c b/src/udev/udev-builtin.c new file mode 100644 index 0000000..bcc2018 --- /dev/null +++ b/src/udev/udev-builtin.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "device-private.h" +#include "device-util.h" +#include "string-util.h" +#include "strv.h" +#include 
"udev-builtin.h" + +static bool initialized; + +static const UdevBuiltin *const builtins[_UDEV_BUILTIN_MAX] = { +#if HAVE_BLKID + [UDEV_BUILTIN_BLKID] = &udev_builtin_blkid, +#endif + [UDEV_BUILTIN_BTRFS] = &udev_builtin_btrfs, + [UDEV_BUILTIN_HWDB] = &udev_builtin_hwdb, + [UDEV_BUILTIN_INPUT_ID] = &udev_builtin_input_id, + [UDEV_BUILTIN_KEYBOARD] = &udev_builtin_keyboard, +#if HAVE_KMOD + [UDEV_BUILTIN_KMOD] = &udev_builtin_kmod, +#endif + [UDEV_BUILTIN_NET_DRIVER] = &udev_builtin_net_driver, + [UDEV_BUILTIN_NET_ID] = &udev_builtin_net_id, + [UDEV_BUILTIN_NET_LINK] = &udev_builtin_net_setup_link, + [UDEV_BUILTIN_PATH_ID] = &udev_builtin_path_id, + [UDEV_BUILTIN_USB_ID] = &udev_builtin_usb_id, +#if HAVE_ACL + [UDEV_BUILTIN_UACCESS] = &udev_builtin_uaccess, +#endif +}; + +void udev_builtin_init(void) { + if (initialized) + return; + + for (UdevBuiltinCommand i = 0; i < _UDEV_BUILTIN_MAX; i++) + if (builtins[i] && builtins[i]->init) + builtins[i]->init(); + + initialized = true; +} + +void udev_builtin_exit(void) { + if (!initialized) + return; + + for (UdevBuiltinCommand i = 0; i < _UDEV_BUILTIN_MAX; i++) + if (builtins[i] && builtins[i]->exit) + builtins[i]->exit(); + + initialized = false; +} + +bool udev_builtin_should_reload(void) { + for (UdevBuiltinCommand i = 0; i < _UDEV_BUILTIN_MAX; i++) + if (builtins[i] && builtins[i]->should_reload && builtins[i]->should_reload()) + return true; + return false; +} + +void udev_builtin_list(void) { + for (UdevBuiltinCommand i = 0; i < _UDEV_BUILTIN_MAX; i++) + if (builtins[i]) + fprintf(stderr, " %-14s %s\n", builtins[i]->name, builtins[i]->help); +} + +const char *udev_builtin_name(UdevBuiltinCommand cmd) { + assert(cmd >= 0 && cmd < _UDEV_BUILTIN_MAX); + + if (!builtins[cmd]) + return NULL; + + return builtins[cmd]->name; +} + +bool udev_builtin_run_once(UdevBuiltinCommand cmd) { + assert(cmd >= 0 && cmd < _UDEV_BUILTIN_MAX); + + if (!builtins[cmd]) + return false; + + return builtins[cmd]->run_once; +} + 
+UdevBuiltinCommand udev_builtin_lookup(const char *command) { + size_t n; + + assert(command); + + command += strspn(command, WHITESPACE); + n = strcspn(command, WHITESPACE); + for (UdevBuiltinCommand i = 0; i < _UDEV_BUILTIN_MAX; i++) + if (builtins[i] && strneq(builtins[i]->name, command, n)) + return i; + + return _UDEV_BUILTIN_INVALID; +} + +int udev_builtin_run(UdevEvent *event, UdevBuiltinCommand cmd, const char *command, bool test) { + _cleanup_strv_free_ char **argv = NULL; + int r; + + assert(event); + assert(event->dev); + assert(cmd >= 0 && cmd < _UDEV_BUILTIN_MAX); + assert(command); + + if (!builtins[cmd]) + return -EOPNOTSUPP; + + r = strv_split_full(&argv, command, NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX | EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return r; + + /* we need '0' here to reset the internal state */ + optind = 0; + return builtins[cmd]->cmd(event, strv_length(argv), argv, test); +} + +int udev_builtin_add_property(sd_device *dev, bool test, const char *key, const char *val) { + int r; + + assert(dev); + assert(key); + + r = device_add_property(dev, key, val); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to add property '%s%s%s'", + key, val ? "=" : "", strempty(val)); + + if (test) + printf("%s=%s\n", key, strempty(val)); + + return 0; +} + +int udev_builtin_add_propertyf(sd_device *dev, bool test, const char *key, const char *valf, ...) 
{ + _cleanup_free_ char *val = NULL; + va_list ap; + int r; + + assert(dev); + assert(key); + assert(valf); + + va_start(ap, valf); + r = vasprintf(&val, valf, ap); + va_end(ap); + if (r < 0) + return log_oom_debug(); + + return udev_builtin_add_property(dev, test, key, val); +} diff --git a/src/udev/udev-builtin.h b/src/udev/udev-builtin.h new file mode 100644 index 0000000..fcd41d6 --- /dev/null +++ b/src/udev/udev-builtin.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include + +#include "sd-device.h" +#include "sd-netlink.h" + +#include "macro.h" +#include "udev-event.h" + +typedef enum UdevBuiltinCommand { +#if HAVE_BLKID + UDEV_BUILTIN_BLKID, +#endif + UDEV_BUILTIN_BTRFS, + UDEV_BUILTIN_HWDB, + UDEV_BUILTIN_INPUT_ID, + UDEV_BUILTIN_KEYBOARD, +#if HAVE_KMOD + UDEV_BUILTIN_KMOD, +#endif + UDEV_BUILTIN_NET_DRIVER, + UDEV_BUILTIN_NET_ID, + UDEV_BUILTIN_NET_LINK, + UDEV_BUILTIN_PATH_ID, + UDEV_BUILTIN_USB_ID, +#if HAVE_ACL + UDEV_BUILTIN_UACCESS, +#endif + _UDEV_BUILTIN_MAX, + _UDEV_BUILTIN_INVALID = -EINVAL, +} UdevBuiltinCommand; + +typedef struct UdevBuiltin { + const char *name; + int (*cmd)(UdevEvent *event, int argc, char *argv[], bool test); + const char *help; + int (*init)(void); + void (*exit)(void); + bool (*should_reload)(void); + bool run_once; +} UdevBuiltin; + +#define UDEV_BUILTIN_CMD_TO_PTR(u) \ + ({ \ + UdevBuiltinCommand _u = (u); \ + _u < 0 ? NULL : (void*)(intptr_t) (_u + 1); \ + }) + +#define PTR_TO_UDEV_BUILTIN_CMD(p) \ + ({ \ + void *_p = (p); \ + _p && (intptr_t)(_p) <= _UDEV_BUILTIN_MAX ? 
\ + (UdevBuiltinCommand)((intptr_t)_p - 1) : _UDEV_BUILTIN_INVALID; \ + }) + +#if HAVE_BLKID +extern const UdevBuiltin udev_builtin_blkid; +#endif +extern const UdevBuiltin udev_builtin_btrfs; +extern const UdevBuiltin udev_builtin_hwdb; +extern const UdevBuiltin udev_builtin_input_id; +extern const UdevBuiltin udev_builtin_keyboard; +#if HAVE_KMOD +extern const UdevBuiltin udev_builtin_kmod; +#endif +extern const UdevBuiltin udev_builtin_net_driver; +extern const UdevBuiltin udev_builtin_net_id; +extern const UdevBuiltin udev_builtin_net_setup_link; +extern const UdevBuiltin udev_builtin_path_id; +extern const UdevBuiltin udev_builtin_usb_id; +#if HAVE_ACL +extern const UdevBuiltin udev_builtin_uaccess; +#endif + +void udev_builtin_init(void); +void udev_builtin_exit(void); +UdevBuiltinCommand udev_builtin_lookup(const char *command); +const char *udev_builtin_name(UdevBuiltinCommand cmd); +bool udev_builtin_run_once(UdevBuiltinCommand cmd); +int udev_builtin_run(UdevEvent *event, UdevBuiltinCommand cmd, const char *command, bool test); +void udev_builtin_list(void); +bool udev_builtin_should_reload(void); +int udev_builtin_add_property(sd_device *dev, bool test, const char *key, const char *val); +int udev_builtin_add_propertyf(sd_device *dev, bool test, const char *key, const char *valf, ...) 
_printf_(4, 5); +int udev_builtin_hwdb_lookup(sd_device *dev, const char *prefix, const char *modalias, + const char *filter, bool test); diff --git a/src/udev/udev-ctrl.c b/src/udev/udev-ctrl.c new file mode 100644 index 0000000..2871634 --- /dev/null +++ b/src/udev/udev-ctrl.c @@ -0,0 +1,353 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sd-event.h" + +#include "alloc-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "iovec-util.h" +#include "socket-util.h" +#include "strxcpyx.h" +#include "udev-ctrl.h" + +/* wire protocol magic must match */ +#define UDEV_CTRL_MAGIC 0xdead1dea + +typedef struct UdevCtrlMessageWire { + char version[16]; + unsigned magic; + UdevCtrlMessageType type; + UdevCtrlMessageValue value; +} UdevCtrlMessageWire; + +struct UdevCtrl { + unsigned n_ref; + int sock; + int sock_connect; + union sockaddr_union saddr; + socklen_t addrlen; + bool bound; + bool connected; + sd_event *event; + sd_event_source *event_source; + sd_event_source *event_source_connect; + udev_ctrl_handler_t callback; + void *userdata; +}; + +int udev_ctrl_new_from_fd(UdevCtrl **ret, int fd) { + _cleanup_close_ int sock = -EBADF; + UdevCtrl *uctrl; + + assert(ret); + + if (fd < 0) { + sock = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0); + if (sock < 0) + return log_error_errno(errno, "Failed to create socket: %m"); + } + + uctrl = new(UdevCtrl, 1); + if (!uctrl) + return -ENOMEM; + + *uctrl = (UdevCtrl) { + .n_ref = 1, + .sock = fd >= 0 ? 
fd : TAKE_FD(sock), + .sock_connect = -EBADF, + .bound = fd >= 0, + }; + + uctrl->saddr.un = (struct sockaddr_un) { + .sun_family = AF_UNIX, + .sun_path = "/run/udev/control", + }; + + uctrl->addrlen = SOCKADDR_UN_LEN(uctrl->saddr.un); + + *ret = TAKE_PTR(uctrl); + return 0; +} + +int udev_ctrl_enable_receiving(UdevCtrl *uctrl) { + assert(uctrl); + + if (uctrl->bound) + return 0; + + (void) sockaddr_un_unlink(&uctrl->saddr.un); + if (bind(uctrl->sock, &uctrl->saddr.sa, uctrl->addrlen) < 0) + return log_error_errno(errno, "Failed to bind udev control socket: %m"); + + if (listen(uctrl->sock, 0) < 0) + return log_error_errno(errno, "Failed to listen udev control socket: %m"); + + uctrl->bound = true; + return 0; +} + +static void udev_ctrl_disconnect(UdevCtrl *uctrl) { + if (!uctrl) + return; + + uctrl->event_source_connect = sd_event_source_unref(uctrl->event_source_connect); + uctrl->sock_connect = safe_close(uctrl->sock_connect); +} + +static UdevCtrl *udev_ctrl_free(UdevCtrl *uctrl) { + assert(uctrl); + + udev_ctrl_disconnect(uctrl); + + sd_event_source_unref(uctrl->event_source); + safe_close(uctrl->sock); + + sd_event_unref(uctrl->event); + return mfree(uctrl); +} + +DEFINE_TRIVIAL_REF_UNREF_FUNC(UdevCtrl, udev_ctrl, udev_ctrl_free); + +int udev_ctrl_attach_event(UdevCtrl *uctrl, sd_event *event) { + int r; + + assert_return(uctrl, -EINVAL); + assert_return(!uctrl->event, -EBUSY); + + if (event) + uctrl->event = sd_event_ref(event); + else { + r = sd_event_default(&uctrl->event); + if (r < 0) + return r; + } + + return 0; +} + +sd_event_source *udev_ctrl_get_event_source(UdevCtrl *uctrl) { + assert(uctrl); + + return uctrl->event_source; +} + +static void udev_ctrl_disconnect_and_listen_again(UdevCtrl *uctrl) { + udev_ctrl_disconnect(uctrl); + udev_ctrl_unref(uctrl); + (void) sd_event_source_set_enabled(uctrl->event_source, SD_EVENT_ON); + /* We don't return NULL here because uctrl is not freed */ +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(UdevCtrl*, 
udev_ctrl_disconnect_and_listen_again, NULL); + +static int udev_ctrl_connection_event_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + _cleanup_(udev_ctrl_disconnect_and_listen_againp) UdevCtrl *uctrl = NULL; + UdevCtrlMessageWire msg_wire; + struct iovec iov = IOVEC_MAKE(&msg_wire, sizeof(UdevCtrlMessageWire)); + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + struct msghdr smsg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct ucred *cred; + ssize_t size; + + assert(userdata); + + /* When UDEV_CTRL_EXIT is received, manager unref udev_ctrl object. + * To avoid the object freed, let's increment the refcount. */ + uctrl = udev_ctrl_ref(userdata); + + size = next_datagram_size_fd(fd); + if (size < 0) + return log_error_errno(size, "Failed to get size of message: %m"); + if (size == 0) + return 0; /* Client disconnects? */ + + size = recvmsg_safe(fd, &smsg, 0); + if (size == -EINTR) + return 0; + if (size < 0) + return log_error_errno(size, "Failed to receive ctrl message: %m"); + + cmsg_close_all(&smsg); + + cred = CMSG_FIND_DATA(&smsg, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!cred) { + log_error("No sender credentials received, ignoring message"); + return 0; + } + + if (cred->uid != 0) { + log_error("Invalid sender uid "UID_FMT", ignoring message", cred->uid); + return 0; + } + + if (msg_wire.magic != UDEV_CTRL_MAGIC) { + log_error("Message magic 0x%08x doesn't match, ignoring message", msg_wire.magic); + return 0; + } + + if (msg_wire.type == _UDEV_CTRL_END_MESSAGES) + return 0; + + if (uctrl->callback) + (void) uctrl->callback(uctrl, msg_wire.type, &msg_wire.value, uctrl->userdata); + + /* Do not disconnect and wait for next message. 
*/ + uctrl = udev_ctrl_unref(uctrl); + return 0; +} + +static int udev_ctrl_event_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + UdevCtrl *uctrl = ASSERT_PTR(userdata); + _cleanup_close_ int sock = -EBADF; + struct ucred ucred; + int r; + + sock = accept4(fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK); + if (sock < 0) { + if (ERRNO_IS_ACCEPT_AGAIN(errno)) + return 0; + + return log_error_errno(errno, "Failed to accept ctrl connection: %m"); + } + + /* check peer credential of connection */ + r = getpeercred(sock, &ucred); + if (r < 0) { + log_error_errno(r, "Failed to receive credentials of ctrl connection: %m"); + return 0; + } + + if (ucred.uid > 0) { + log_error("Invalid sender uid "UID_FMT", closing connection", ucred.uid); + return 0; + } + + /* enable receiving of the sender credentials in the messages */ + r = setsockopt_int(sock, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + log_warning_errno(r, "Failed to set SO_PASSCRED, ignoring: %m"); + + r = sd_event_add_io(uctrl->event, &uctrl->event_source_connect, sock, EPOLLIN, udev_ctrl_connection_event_handler, uctrl); + if (r < 0) { + log_error_errno(r, "Failed to create event source for udev control connection: %m"); + return 0; + } + + (void) sd_event_source_set_description(uctrl->event_source_connect, "udev-ctrl-connection"); + + /* Do not accept multiple connection. 
*/ + (void) sd_event_source_set_enabled(uctrl->event_source, SD_EVENT_OFF); + + uctrl->sock_connect = TAKE_FD(sock); + return 0; +} + +int udev_ctrl_start(UdevCtrl *uctrl, udev_ctrl_handler_t callback, void *userdata) { + int r; + + assert(uctrl); + + if (!uctrl->event) { + r = udev_ctrl_attach_event(uctrl, NULL); + if (r < 0) + return r; + } + + r = udev_ctrl_enable_receiving(uctrl); + if (r < 0) + return r; + + uctrl->callback = callback; + uctrl->userdata = userdata; + + r = sd_event_add_io(uctrl->event, &uctrl->event_source, uctrl->sock, EPOLLIN, udev_ctrl_event_handler, uctrl); + if (r < 0) + return r; + + (void) sd_event_source_set_description(uctrl->event_source, "udev-ctrl"); + + return 0; +} + +int udev_ctrl_send(UdevCtrl *uctrl, UdevCtrlMessageType type, const void *data) { + UdevCtrlMessageWire ctrl_msg_wire = { + .version = "udev-" STRINGIFY(PROJECT_VERSION), + .magic = UDEV_CTRL_MAGIC, + .type = type, + }; + + if (type == UDEV_CTRL_SET_ENV) { + assert(data); + strscpy(ctrl_msg_wire.value.buf, sizeof(ctrl_msg_wire.value.buf), data); + } else if (IN_SET(type, UDEV_CTRL_SET_LOG_LEVEL, UDEV_CTRL_SET_CHILDREN_MAX)) + ctrl_msg_wire.value.intval = PTR_TO_INT(data); + + if (!uctrl->connected) { + if (connect(uctrl->sock, &uctrl->saddr.sa, uctrl->addrlen) < 0) + return -errno; + uctrl->connected = true; + } + + if (send(uctrl->sock, &ctrl_msg_wire, sizeof(ctrl_msg_wire), 0) < 0) + return -errno; + + return 0; +} + +int udev_ctrl_wait(UdevCtrl *uctrl, usec_t timeout) { + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source_io = NULL, *source_timeout = NULL; + int r; + + assert(uctrl); + + if (uctrl->sock < 0) + return 0; + if (!uctrl->connected) + return 0; + + r = udev_ctrl_send(uctrl, _UDEV_CTRL_END_MESSAGES, NULL); + if (r < 0) + return r; + + if (timeout == 0) + return 0; + + if (!uctrl->event) { + r = udev_ctrl_attach_event(uctrl, NULL); + if (r < 0) + return r; + } + + r = sd_event_add_io(uctrl->event, &source_io, uctrl->sock, EPOLLIN, NULL, 
INT_TO_PTR(0)); + if (r < 0) + return r; + + (void) sd_event_source_set_description(source_io, "udev-ctrl-wait-io"); + + if (timeout != USEC_INFINITY) { + r = sd_event_add_time_relative( + uctrl->event, &source_timeout, CLOCK_BOOTTIME, + timeout, + 0, NULL, INT_TO_PTR(-ETIMEDOUT)); + if (r < 0) + return r; + + (void) sd_event_source_set_description(source_timeout, "udev-ctrl-wait-timeout"); + } + + return sd_event_loop(uctrl->event); +} diff --git a/src/udev/udev-ctrl.h b/src/udev/udev-ctrl.h new file mode 100644 index 0000000..11fc0b6 --- /dev/null +++ b/src/udev/udev-ctrl.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include "sd-event.h" + +#include "macro.h" +#include "time-util.h" + +typedef struct UdevCtrl UdevCtrl; + +typedef enum UdevCtrlMessageType { + _UDEV_CTRL_END_MESSAGES, + UDEV_CTRL_SET_LOG_LEVEL, + UDEV_CTRL_STOP_EXEC_QUEUE, + UDEV_CTRL_START_EXEC_QUEUE, + UDEV_CTRL_RELOAD, + UDEV_CTRL_SET_ENV, + UDEV_CTRL_SET_CHILDREN_MAX, + UDEV_CTRL_PING, + UDEV_CTRL_EXIT, +} UdevCtrlMessageType; + +typedef union UdevCtrlMessageValue { + int intval; + char buf[256]; +} UdevCtrlMessageValue; + +typedef int (*udev_ctrl_handler_t)(UdevCtrl *udev_ctrl, UdevCtrlMessageType type, + const UdevCtrlMessageValue *value, void *userdata); + +int udev_ctrl_new_from_fd(UdevCtrl **ret, int fd); +static inline int udev_ctrl_new(UdevCtrl **ret) { + return udev_ctrl_new_from_fd(ret, -1); +} + +int udev_ctrl_enable_receiving(UdevCtrl *uctrl); +UdevCtrl *udev_ctrl_ref(UdevCtrl *uctrl); +UdevCtrl *udev_ctrl_unref(UdevCtrl *uctrl); +int udev_ctrl_attach_event(UdevCtrl *uctrl, sd_event *event); +int udev_ctrl_start(UdevCtrl *uctrl, udev_ctrl_handler_t callback, void *userdata); +sd_event_source *udev_ctrl_get_event_source(UdevCtrl *uctrl); + +int udev_ctrl_wait(UdevCtrl *uctrl, usec_t timeout); + +int udev_ctrl_send(UdevCtrl *uctrl, UdevCtrlMessageType type, const void *data); +static inline int udev_ctrl_send_set_log_level(UdevCtrl *uctrl, int 
priority) { + return udev_ctrl_send(uctrl, UDEV_CTRL_SET_LOG_LEVEL, INT_TO_PTR(priority)); +} + +static inline int udev_ctrl_send_stop_exec_queue(UdevCtrl *uctrl) { + return udev_ctrl_send(uctrl, UDEV_CTRL_STOP_EXEC_QUEUE, NULL); +} + +static inline int udev_ctrl_send_start_exec_queue(UdevCtrl *uctrl) { + return udev_ctrl_send(uctrl, UDEV_CTRL_START_EXEC_QUEUE, NULL); +} + +static inline int udev_ctrl_send_reload(UdevCtrl *uctrl) { + return udev_ctrl_send(uctrl, UDEV_CTRL_RELOAD, NULL); +} + +static inline int udev_ctrl_send_set_env(UdevCtrl *uctrl, const char *key) { + return udev_ctrl_send(uctrl, UDEV_CTRL_SET_ENV, key); +} + +static inline int udev_ctrl_send_set_children_max(UdevCtrl *uctrl, int count) { + return udev_ctrl_send(uctrl, UDEV_CTRL_SET_CHILDREN_MAX, INT_TO_PTR(count)); +} + +static inline int udev_ctrl_send_ping(UdevCtrl *uctrl) { + return udev_ctrl_send(uctrl, UDEV_CTRL_PING, NULL); +} + +static inline int udev_ctrl_send_exit(UdevCtrl *uctrl) { + return udev_ctrl_send(uctrl, UDEV_CTRL_EXIT, NULL); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(UdevCtrl*, udev_ctrl_unref); diff --git a/src/udev/udev-event.c b/src/udev/udev-event.c new file mode 100644 index 0000000..ed22c8b --- /dev/null +++ b/src/udev/udev-event.c @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "alloc-util.h" +#include "device-internal.h" +#include "device-private.h" +#include "device-util.h" +#include "fs-util.h" +#include "netif-naming-scheme.h" +#include "netlink-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "udev-event.h" +#include "udev-node.h" +#include "udev-trace.h" +#include "udev-util.h" +#include "udev-watch.h" +#include "user-util.h" + +UdevEvent *udev_event_new(sd_device *dev, usec_t exec_delay_usec, sd_netlink *rtnl, int log_level) { + UdevEvent *event; + + assert(dev); + + event = new(UdevEvent, 1); + if (!event) + return NULL; + + *event = (UdevEvent) { + .dev = sd_device_ref(dev), + .birth_usec = 
now(CLOCK_MONOTONIC), + .exec_delay_usec = exec_delay_usec, + .rtnl = sd_netlink_ref(rtnl), + .uid = UID_INVALID, + .gid = GID_INVALID, + .mode = MODE_INVALID, + .log_level_was_debug = log_level == LOG_DEBUG, + .default_log_level = log_level, + }; + + return event; +} + +UdevEvent *udev_event_free(UdevEvent *event) { + if (!event) + return NULL; + + sd_device_unref(event->dev); + sd_device_unref(event->dev_db_clone); + sd_netlink_unref(event->rtnl); + ordered_hashmap_free_free_key(event->run_list); + ordered_hashmap_free_free_free(event->seclabel_list); + free(event->program_result); + free(event->name); + strv_free(event->altnames); + + return mfree(event); +} + +static int device_rename(sd_device *device, const char *name) { + _cleanup_free_ char *new_syspath = NULL; + const char *s; + int r; + + assert(device); + assert(name); + + if (!filename_is_valid(name)) + return -EINVAL; + + r = sd_device_get_syspath(device, &s); + if (r < 0) + return r; + + r = path_extract_directory(s, &new_syspath); + if (r < 0) + return r; + + if (!path_extend(&new_syspath, name)) + return -ENOMEM; + + if (!path_is_safe(new_syspath)) + return -EINVAL; + + /* At the time this is called, the renamed device may not exist yet. Hence, we cannot validate + * the new syspath. 
*/ + r = device_set_syspath(device, new_syspath, /* verify = */ false); + if (r < 0) + return r; + + r = sd_device_get_property_value(device, "INTERFACE", &s); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + /* like DEVPATH_OLD, INTERFACE_OLD is not saved to the db, but only stays around for the current event */ + r = device_add_property_internal(device, "INTERFACE_OLD", s); + if (r < 0) + return r; + + return device_add_property_internal(device, "INTERFACE", name); +} + +static int rename_netif(UdevEvent *event) { + _cleanup_free_ char *old_syspath = NULL, *old_sysname = NULL; + const char *s; + sd_device *dev; + int ifindex, r; + + assert(event); + + if (!event->name) + return 0; /* No new name is requested. */ + + dev = ASSERT_PTR(event->dev); + + r = sd_device_get_ifindex(dev, &ifindex); + if (r == -ENOENT) + return 0; /* Device is not a network interface. */ + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to get ifindex: %m"); + + if (naming_scheme_has(NAMING_REPLACE_STRICTLY) && + !ifname_valid(event->name)) { + log_device_warning(dev, "Invalid network interface name, ignoring: %s", event->name); + return 0; + } + + r = sd_device_get_sysname(dev, &s); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to get sysname: %m"); + + if (streq(event->name, s)) + return 0; /* The interface name is already requested name. */ + + old_sysname = strdup(s); + if (!old_sysname) + return -ENOMEM; + + r = sd_device_get_syspath(dev, &s); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to get syspath: %m"); + + old_syspath = strdup(s); + if (!old_syspath) + return -ENOMEM; + + r = device_rename(dev, event->name); + if (r < 0) { + /* Here and below, use dev_db_clone for logging, otherwise, logged message is prefixed with + * the new interface name, and e.g. 'networkctl status INTERFACE' does not show the message. 
*/ + log_device_warning_errno(event->dev_db_clone, r, + "Failed to update properties with new name '%s': %m", event->name); + goto revert; + } + + /* Set ID_RENAMING boolean property here. It will be dropped when the corresponding move uevent is processed. */ + r = device_add_property(dev, "ID_RENAMING", "1"); + if (r < 0) { + log_device_warning_errno(event->dev_db_clone, r, "Failed to add 'ID_RENAMING' property: %m"); + goto revert; + } + + /* Also set ID_RENAMING boolean property to cloned sd_device object and save it to database + * before calling rtnl_set_link_name(). Otherwise, clients (e.g., systemd-networkd) may receive + * RTM_NEWLINK netlink message before the database is updated. */ + r = device_add_property(event->dev_db_clone, "ID_RENAMING", "1"); + if (r < 0) { + log_device_warning_errno(event->dev_db_clone, r, "Failed to add 'ID_RENAMING' property: %m"); + goto revert; + } + + r = device_update_db(event->dev_db_clone); + if (r < 0) { + log_device_debug_errno(event->dev_db_clone, r, "Failed to update database under /run/udev/data/: %m"); + goto revert; + } + + r = rtnl_set_link_name(&event->rtnl, ifindex, event->name, event->altnames); + if (r < 0) { + if (r == -EBUSY) { + log_device_info(event->dev_db_clone, + "Network interface '%s' is already up, cannot rename to '%s'.", + old_sysname, event->name); + r = 0; + } else + log_device_error_errno(event->dev_db_clone, r, + "Failed to rename network interface %i from '%s' to '%s': %m", + ifindex, old_sysname, event->name); + goto revert; + } + + log_device_debug(dev, "Network interface %i is renamed from '%s' to '%s'", ifindex, old_sysname, event->name); + return 1; + +revert: + /* Restore 'dev_db_clone' */ + (void) device_add_property(event->dev_db_clone, "ID_RENAMING", NULL); + (void) device_update_db(event->dev_db_clone); + + /* Restore 'dev' */ + (void) device_set_syspath(dev, old_syspath, /* verify = */ false); + if (sd_device_get_property_value(dev, "INTERFACE_OLD", &s) >= 0) { + (void) 
device_add_property_internal(dev, "INTERFACE", s); + (void) device_add_property_internal(dev, "INTERFACE_OLD", NULL); + } + (void) device_add_property(dev, "ID_RENAMING", NULL); + + return r; +} + +/* Appends event->altnames to the network interface as alternative names. Only the ifindex and sysname lookups make the function return early through log_device_warning_errno(); a failure of the netlink request itself is logged and 0 is returned. */ static int assign_altnames(UdevEvent *event) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + int ifindex, r; + const char *s; + + /* No alternative names were collected for this event. */ if (strv_isempty(event->altnames)) + return 0; + + r = sd_device_get_ifindex(dev, &ifindex); + if (r == -ENOENT) + return 0; /* Device is not a network interface. */ + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to get ifindex: %m"); + + r = sd_device_get_sysname(dev, &s); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to get sysname: %m"); + + /* Filter out the current interface name. */ + strv_remove(event->altnames, s); + + r = rtnl_append_link_alternative_names(&event->rtnl, ifindex, event->altnames); + if (r < 0) + /* -EOPNOTSUPP is logged at debug level, any other error as a warning. */ log_device_full_errno(dev, r == -EOPNOTSUPP ? LOG_DEBUG : LOG_WARNING, r, + "Could not set AlternativeName= or apply AlternativeNamesPolicy=, ignoring: %m"); + + return 0; +} + +/* Fills in whichever of event->uid, event->gid and event->mode is still invalid from the device's own devnode values, applies the permissions and finally calls udev_node_update(). */ static int update_devnode(UdevEvent *event) { + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + int r; + + r = sd_device_get_devnum(dev, NULL); + if (r == -ENOENT) + return 0; /* NOTE(review): presumably the device has no device number and hence no node to manage -- confirm */ + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get devnum: %m"); + + /* In the three lookups below -ENOENT is not treated as an error. */ if (!uid_is_valid(event->uid)) { + r = device_get_devnode_uid(dev, &event->uid); + if (r < 0 && r != -ENOENT) + return log_device_error_errno(dev, r, "Failed to get devnode UID: %m"); + } + + if (!gid_is_valid(event->gid)) { + r = device_get_devnode_gid(dev, &event->gid); + if (r < 0 && r != -ENOENT) + return log_device_error_errno(dev, r, "Failed to get devnode GID: %m"); + } + + if (event->mode == MODE_INVALID) { + r = device_get_devnode_mode(dev, &event->mode); + if (r < 0 && r != -ENOENT) + return log_device_error_errno(dev, r, "Failed to get devnode mode: %m"); + } + + /* The flag is true only when device_for_action() reports an 'add' action. */ bool apply_mac = device_for_action(dev, SD_DEVICE_ADD); + + r = 
udev_node_apply_permissions(dev, apply_mac, event->mode, event->uid, event->gid, event->seclabel_list); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to apply devnode permissions: %m"); + + return udev_node_update(dev, event->dev_db_clone); +} + +/* Processes a 'remove' uevent. The clean-up steps that precede the rules are best-effort: their failures are logged and otherwise ignored. The value returned is the result of udev_rules_apply_to_event(). */ static int event_execute_rules_on_remove( + UdevEvent *event, + int inotify_fd, + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list, + UdevRules *rules) { + + sd_device *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + int r; + + r = device_read_db_internal(dev, true); + if (r < 0) + log_device_debug_errno(dev, r, "Failed to read database under /run/udev/data/: %m"); + + r = device_tag_index(dev, NULL, false); + if (r < 0) + log_device_debug_errno(dev, r, "Failed to remove corresponding tag files under /run/udev/tag/, ignoring: %m"); + + r = device_delete_db(dev); + if (r < 0) + log_device_debug_errno(dev, r, "Failed to delete database under /run/udev/data/, ignoring: %m"); + + r = udev_watch_end(inotify_fd, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to remove inotify watch, ignoring: %m"); + + r = udev_rules_apply_to_event(rules, event, timeout_usec, timeout_signal, properties_list); + + /* The node is removed after the rules ran and regardless of their result; 'r' is left untouched so that the rules' result is what gets returned. */ if (sd_device_get_devnum(dev, NULL) >= 0) + (void) udev_node_remove(dev); + + return r; +} + +/* Adds every tag found on 's' to 'd' and stops at the first device_add_tag() failure. 's' may be NULL, in which case nothing is copied and 0 is returned. */ static int copy_all_tags(sd_device *d, sd_device *s) { + int r; + + assert(d); + + if (!s) + return 0; + + FOREACH_DEVICE_TAG(s, tag) { + r = device_add_tag(d, tag, false); + if (r < 0) + return r; + } + + return 0; +} + +/* Runs the rules for one uevent and then brings the device node, the tag index and the database entry in line with the result. 'remove' events are handed to event_execute_rules_on_remove() instead. */ int udev_event_execute_rules( + UdevEvent *event, + int inotify_fd, /* This may be negative */ + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list, + UdevRules *rules) { + + sd_device_action_t action; + sd_device *dev; + int r; + + dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + assert(rules); + + r = sd_device_get_action(dev, &action); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get ACTION: %m"); + + if (action == SD_DEVICE_REMOVE) + return 
event_execute_rules_on_remove(event, inotify_fd, timeout_usec, timeout_signal, properties_list, rules); + + /* Disable watch during event processing. */ + r = udev_watch_end(inotify_fd, dev); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to remove inotify watch, ignoring: %m"); + + r = device_clone_with_db(dev, &event->dev_db_clone); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to clone sd_device object: %m"); + + r = copy_all_tags(dev, event->dev_db_clone); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to copy all tags from old database entry, ignoring: %m"); + + /* Drop previously added property for safety to make IMPORT{db}="ID_RENAMING" not work. This is + * mostly for 'move' uevent, but let's do unconditionally. Why? If a network interface is renamed in + * initrd, then udevd may lose the 'move' uevent during switching root. Usually, we do not set the + * persistent flag for network interfaces, but user may set it. Just for safety. */ + r = device_add_property(event->dev_db_clone, "ID_RENAMING", NULL); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to remove 'ID_RENAMING' property: %m"); + + DEVICE_TRACE_POINT(rules_start, dev); + + r = udev_rules_apply_to_event(rules, event, timeout_usec, timeout_signal, properties_list); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to apply udev rules: %m"); + + DEVICE_TRACE_POINT(rules_finished, dev); + + if (action == SD_DEVICE_ADD) { + r = rename_netif(event); + if (r < 0) + return r; + if (r == 0) + (void) assign_altnames(event); + } + + r = update_devnode(event); + if (r < 0) + return r; + + /* preserve old, or get new initialization timestamp */ + r = device_ensure_usec_initialized(dev, event->dev_db_clone); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to set initialization timestamp: %m"); + + /* (re)write database file */ + r = device_tag_index(dev, event->dev_db_clone, true); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed 
to update tags under /run/udev/tag/: %m"); + + r = device_update_db(dev); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to update database under /run/udev/data/: %m"); + + device_set_is_initialized(dev); + + return 0; +} diff --git a/src/udev/udev-event.h b/src/udev/udev-event.h new file mode 100644 index 0000000..6b94fd0 --- /dev/null +++ b/src/udev/udev-event.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +/* + * Copyright © 2003 Greg Kroah-Hartman + */ + +#include +#include + +#include "sd-device.h" +#include "sd-netlink.h" + +#include "hashmap.h" +#include "macro.h" +#include "time-util.h" +#include "udev-rules.h" +#include "user-util.h" + +typedef struct UdevEvent { + sd_device *dev; + sd_device *dev_parent; + sd_device *dev_db_clone; + char *name; + char **altnames; + char *program_result; + mode_t mode; + uid_t uid; + gid_t gid; + OrderedHashmap *seclabel_list; + OrderedHashmap *run_list; + usec_t exec_delay_usec; + usec_t birth_usec; + sd_netlink *rtnl; + unsigned builtin_run; + unsigned builtin_ret; + UdevRuleEscapeType esc:8; + bool inotify_watch; + bool inotify_watch_final; + bool group_final; + bool owner_final; + bool mode_final; + bool name_final; + bool devlink_final; + bool run_final; + bool log_level_was_debug; + int default_log_level; +} UdevEvent; + +UdevEvent *udev_event_new(sd_device *dev, usec_t exec_delay_usec, sd_netlink *rtnl, int log_level); +UdevEvent *udev_event_free(UdevEvent *event); +DEFINE_TRIVIAL_CLEANUP_FUNC(UdevEvent*, udev_event_free); + +int udev_event_execute_rules( + UdevEvent *event, + int inotify_fd, + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list, + UdevRules *rules); diff --git a/src/udev/udev-format.c b/src/udev/udev-format.c new file mode 100644 index 0000000..05ed9fd --- /dev/null +++ b/src/udev/udev-format.c @@ -0,0 +1,550 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "device-util.h" +#include "errno-util.h" +#include 
"parse-util.h" +#include "string-util.h" +#include "strxcpyx.h" +#include "udev-event.h" +#include "udev-format.h" +#include "udev-util.h" + +typedef enum { + FORMAT_SUBST_DEVNODE, + FORMAT_SUBST_ATTR, + FORMAT_SUBST_ENV, + FORMAT_SUBST_KERNEL, + FORMAT_SUBST_KERNEL_NUMBER, + FORMAT_SUBST_DRIVER, + FORMAT_SUBST_DEVPATH, + FORMAT_SUBST_ID, + FORMAT_SUBST_MAJOR, + FORMAT_SUBST_MINOR, + FORMAT_SUBST_RESULT, + FORMAT_SUBST_PARENT, + FORMAT_SUBST_NAME, + FORMAT_SUBST_LINKS, + FORMAT_SUBST_ROOT, + FORMAT_SUBST_SYS, + _FORMAT_SUBST_TYPE_MAX, + _FORMAT_SUBST_TYPE_INVALID = -EINVAL, +} FormatSubstitutionType; + +struct subst_map_entry { + const char *name; + const char fmt; + FormatSubstitutionType type; +}; + +static const struct subst_map_entry map[] = { + { .name = "devnode", .fmt = 'N', .type = FORMAT_SUBST_DEVNODE }, + { .name = "tempnode", .fmt = 'N', .type = FORMAT_SUBST_DEVNODE }, /* deprecated */ + { .name = "attr", .fmt = 's', .type = FORMAT_SUBST_ATTR }, + { .name = "sysfs", .fmt = 's', .type = FORMAT_SUBST_ATTR }, /* deprecated */ + { .name = "env", .fmt = 'E', .type = FORMAT_SUBST_ENV }, + { .name = "kernel", .fmt = 'k', .type = FORMAT_SUBST_KERNEL }, + { .name = "number", .fmt = 'n', .type = FORMAT_SUBST_KERNEL_NUMBER }, + { .name = "driver", .fmt = 'd', .type = FORMAT_SUBST_DRIVER }, + { .name = "devpath", .fmt = 'p', .type = FORMAT_SUBST_DEVPATH }, + { .name = "id", .fmt = 'b', .type = FORMAT_SUBST_ID }, + { .name = "major", .fmt = 'M', .type = FORMAT_SUBST_MAJOR }, + { .name = "minor", .fmt = 'm', .type = FORMAT_SUBST_MINOR }, + { .name = "result", .fmt = 'c', .type = FORMAT_SUBST_RESULT }, + { .name = "parent", .fmt = 'P', .type = FORMAT_SUBST_PARENT }, + { .name = "name", .fmt = 'D', .type = FORMAT_SUBST_NAME }, + { .name = "links", .fmt = 'L', .type = FORMAT_SUBST_LINKS }, + { .name = "root", .fmt = 'r', .type = FORMAT_SUBST_ROOT }, + { .name = "sys", .fmt = 'S', .type = FORMAT_SUBST_SYS }, +}; + +static const char 
*format_type_to_string(FormatSubstitutionType t) { + /* Reverse lookup in map[]: the first entry of type 't' supplies the long name. */ for (size_t i = 0; i < ELEMENTSOF(map); i++) + if (map[i].type == t) + return map[i].name; + return NULL; +} + +/* Reverse lookup in map[]: returns the single-character form of 't' taken from the first matching entry, or NUL if 't' is not listed. */ static char format_type_to_char(FormatSubstitutionType t) { + for (size_t i = 0; i < ELEMENTSOF(map); i++) + if (map[i].type == t) + return map[i].fmt; + return '\0'; +} + +/* Recognises one substitution at *str. Returns 1 and advances *str past it (filling *ret_type and ret_attr), 0 if *str does not start a substitution, or -EINVAL for a malformed one. NOTE(review): the initializer of 'p' reads *str before assert(str) is evaluated. */ static int get_subst_type(const char **str, bool strict, FormatSubstitutionType *ret_type, char ret_attr[static UDEV_PATH_SIZE]) { + const char *p = *str, *q = NULL; + size_t i; + + assert(str); + assert(*str); + assert(ret_type); + assert(ret_attr); + + if (*p == '$') { + p++; + /* Escaped dollar sign: step onto the second '$' and report "no substitution", so the callers treat it as an ordinary character. */ if (*p == '$') { + *str = p; + return 0; + } + /* Long form: the first map[] entry whose name is a prefix of the text wins. */ for (i = 0; i < ELEMENTSOF(map); i++) + if ((q = startswith(p, map[i].name))) + break; + } else if (*p == '%') { + p++; + /* Escaped percent sign, handled like the escaped dollar sign above. */ if (*p == '%') { + *str = p; + return 0; + } + + /* Short form: a single format character. */ for (i = 0; i < ELEMENTSOF(map); i++) + if (*p == map[i].fmt) { + q = p + 1; + break; + } + } else + return 0; + if (!q) + /* When 'strict' flag is set, then '$' and '%' must be escaped. */ + return strict ? 
-EINVAL : 0; + + /* Optional '{attr}' argument: it must be closed, non-empty and shorter than UDEV_PATH_SIZE. */ if (*q == '{') { + const char *start, *end; + size_t len; + + start = q + 1; + end = strchr(start, '}'); + if (!end) + return -EINVAL; + + len = end - start; + if (len == 0 || len >= UDEV_PATH_SIZE) + return -EINVAL; + + strnscpy(ret_attr, UDEV_PATH_SIZE, start, len); + q = end + 1; + } else + *ret_attr = '\0'; + + *str = q; + *ret_type = map[i].type; + return 1; +} + +/* Parses 's' as an unsigned number that may end in '+'. The '+' is cut off (on a copy made by strndupa_safe()) before safe_atou() sees the string. */ static int safe_atou_optional_plus(const char *s, unsigned *ret) { + const char *p; + int r; + + assert(s); + assert(ret); + + /* Returns 1 if plus, 0 if no plus, negative on error */ + + p = endswith(s, "+"); + if (p) + s = strndupa_safe(s, p - s); + + r = safe_atou(s, ret); + if (r < 0) + return r; + + return !!p; +} + +/* Writes the expansion of a single substitution of the given type into 'dest', which has room for 'l' bytes. Returns the number of bytes written (s - dest), 0 with an empty string when there is nothing to expand, or a negative errno. *ret_truncated, when given, reports whether the copy helpers flagged truncation. */ static ssize_t udev_event_subst_format( + UdevEvent *event, + FormatSubstitutionType type, + const char *attr, + char *dest, + size_t l, + Hashmap *global_props, + bool *ret_truncated) { + + sd_device *parent, *dev = ASSERT_PTR(ASSERT_PTR(event)->dev); + const char *val = NULL; + bool truncated = false; + char *s = dest; + int r; + + switch (type) { + case FORMAT_SUBST_DEVPATH: + r = sd_device_get_devpath(dev, &val); + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + case FORMAT_SUBST_KERNEL: + r = sd_device_get_sysname(dev, &val); + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + case FORMAT_SUBST_KERNEL_NUMBER: + r = sd_device_get_sysnum(dev, &val); + /* A missing sysnum expands to the empty string rather than failing. */ if (r == -ENOENT) + goto null_terminate; + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + case FORMAT_SUBST_ID: + /* Uses event->dev_parent, not 'dev'; without it the expansion is empty. */ if (!event->dev_parent) + goto null_terminate; + r = sd_device_get_sysname(event->dev_parent, &val); + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + case FORMAT_SUBST_DRIVER: + if (!event->dev_parent) + goto null_terminate; + r = sd_device_get_driver(event->dev_parent, &val); + if (r == -ENOENT) + goto null_terminate; + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + case 
FORMAT_SUBST_MAJOR: + case FORMAT_SUBST_MINOR: { + dev_t devnum; + + r = sd_device_get_devnum(dev, &devnum); + if (r < 0 && r != -ENOENT) + return r; + strpcpyf_full(&s, l, &truncated, "%u", r < 0 ? 0 : type == FORMAT_SUBST_MAJOR ? major(devnum) : minor(devnum)); + break; + } + case FORMAT_SUBST_RESULT: { + unsigned index = 0; /* 0 means whole string */ + bool has_plus; + + if (!event->program_result) + goto null_terminate; + + if (!isempty(attr)) { + r = safe_atou_optional_plus(attr, &index); + if (r < 0) + return r; + + has_plus = r; + } + + if (index == 0) + strpcpy_full(&s, l, event->program_result, &truncated); + else { + const char *start, *p; + unsigned i; + + p = skip_leading_chars(event->program_result, NULL); + + for (i = 1; i < index; i++) { + while (*p && !strchr(WHITESPACE, *p)) + p++; + p = skip_leading_chars(p, NULL); + if (*p == '\0') + break; + } + if (i != index) { + log_device_debug(dev, "requested part of result string not found"); + goto null_terminate; + } + + start = p; + /* %c{2+} copies the whole string from the second part on */ + if (has_plus) + strpcpy_full(&s, l, start, &truncated); + else { + while (*p && !strchr(WHITESPACE, *p)) + p++; + strnpcpy_full(&s, l, start, p - start, &truncated); + } + } + break; + } + case FORMAT_SUBST_ATTR: { + char vbuf[UDEV_NAME_SIZE]; + int count; + bool t; + + if (isempty(attr)) + return -EINVAL; + + /* try to read the value specified by "[dmi/id]product_name" */ + if (udev_resolve_subsys_kernel(attr, vbuf, sizeof(vbuf), true) == 0) + val = vbuf; + + /* try to read the attribute the device */ + if (!val) + (void) sd_device_get_sysattr_value(dev, attr, &val); + + /* try to read the attribute of the parent device, other matches have selected */ + if (!val && event->dev_parent && event->dev_parent != dev) + (void) sd_device_get_sysattr_value(event->dev_parent, attr, &val); + + if (!val) + goto null_terminate; + + /* strip trailing whitespace, and replace unwanted characters */ + if (val != vbuf) + 
strscpy_full(vbuf, sizeof(vbuf), val, &truncated); + delete_trailing_chars(vbuf, NULL); + count = udev_replace_chars(vbuf, UDEV_ALLOWED_CHARS_INPUT); + if (count > 0) + log_device_debug(dev, "%i character(s) replaced", count); + strpcpy_full(&s, l, vbuf, &t); + truncated = truncated || t; + break; + } + case FORMAT_SUBST_PARENT: + r = sd_device_get_parent(dev, &parent); + if (r == -ENOENT) + goto null_terminate; + if (r < 0) + return r; + r = sd_device_get_devname(parent, &val); + if (r == -ENOENT) + goto null_terminate; + if (r < 0) + return r; + strpcpy_full(&s, l, val + STRLEN("/dev/"), &truncated); + break; + case FORMAT_SUBST_DEVNODE: + r = sd_device_get_devname(dev, &val); + if (r == -ENOENT) + goto null_terminate; + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + case FORMAT_SUBST_NAME: + if (event->name) + strpcpy_full(&s, l, event->name, &truncated); + else if (sd_device_get_devname(dev, &val) >= 0) + strpcpy_full(&s, l, val + STRLEN("/dev/"), &truncated); + else { + r = sd_device_get_sysname(dev, &val); + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + } + break; + case FORMAT_SUBST_LINKS: + FOREACH_DEVICE_DEVLINK(dev, link) { + if (s == dest) + strpcpy_full(&s, l, link + STRLEN("/dev/"), &truncated); + else + strpcpyl_full(&s, l, &truncated, " ", link + STRLEN("/dev/"), NULL); + if (truncated) + break; + } + if (s == dest) + goto null_terminate; + break; + case FORMAT_SUBST_ROOT: + strpcpy_full(&s, l, "/dev", &truncated); + break; + case FORMAT_SUBST_SYS: + strpcpy_full(&s, l, "/sys", &truncated); + break; + case FORMAT_SUBST_ENV: + if (isempty(attr)) + return -EINVAL; + r = device_get_property_value_with_fallback(dev, attr, global_props, &val); + if (r == -ENOENT) + goto null_terminate; + if (r < 0) + return r; + strpcpy_full(&s, l, val, &truncated); + break; + default: + assert_not_reached(); + } + + if (ret_truncated) + *ret_truncated = truncated; + + return s - dest; + +null_terminate: + if (ret_truncated) + 
*ret_truncated = truncated; + + *s = '\0'; + return 0; +} + +/* Expands every substitution found in 'src' into 'dest', which holds 'size' bytes and is always NUL-terminated on return. Returns the number of bytes still unused in 'dest', counting the byte taken by the terminating NUL. A malformed format string or a failed substitution is logged and ends the expansion at that point. */ size_t udev_event_apply_format( + UdevEvent *event, + const char *src, + char *dest, + size_t size, + bool replace_whitespace, + Hashmap *global_props, + bool *ret_truncated) { + + bool truncated = false; + const char *s = ASSERT_PTR(src); + int r; + + assert(event); + assert(event->dev); + assert(dest); + assert(size > 0); + + while (*s) { + FormatSubstitutionType type; + char attr[UDEV_PATH_SIZE]; + ssize_t subst_len; + bool t; + + /* Non-strict parsing: an unknown '$' or '%' sequence is copied through as plain text. */ r = get_subst_type(&s, false, &type, attr); + if (r < 0) { + log_device_warning_errno(event->dev, r, "Invalid format string, ignoring: %s", src); + break; + } else if (r == 0) { + /* Not a substitution: copy a single character verbatim. */ if (size < 2) { + /* need space for this char and the terminating NUL */ + truncated = true; + break; + } + *dest++ = *s++; + size--; + continue; + } + + subst_len = udev_event_subst_format(event, type, attr, dest, size, global_props, &t); + if (subst_len < 0) { + log_device_warning_errno(event->dev, subst_len, + "Failed to substitute variable '$%s' or apply format '%%%c', ignoring: %m", + format_type_to_string(type), format_type_to_char(type)); + break; + } + + truncated = truncated || t; + + /* FORMAT_SUBST_RESULT handles spaces itself */ + if (replace_whitespace && type != FORMAT_SUBST_RESULT) + /* udev_replace_whitespace can replace in-place, + * and does nothing if subst_len == 0 */ + subst_len = udev_replace_whitespace(dest, dest, subst_len); + + /* subst_len is whatever udev_replace_whitespace() returned when it ran, so both cursors follow the rewritten length. */ dest += subst_len; + size -= subst_len; + } + + assert(size >= 1); + + if (ret_truncated) + *ret_truncated = truncated; + + *dest = '\0'; + return size; +} + +/* Validates a format string with strict parsing, i.e. '$' and '%' have to be escaped. On failure a negative errno is returned and, when the pointers are given, *offset receives the position reached in 'value' and *hint a static description. */ int udev_check_format(const char *value, size_t *offset, const char **hint) { + FormatSubstitutionType type; + const char *s = value; + char attr[UDEV_PATH_SIZE]; + int r; + + while (*s) { + r = get_subst_type(&s, true, &type, attr); + if (r < 0) { + if (offset) + *offset = s - value; + if (hint) + *hint = "invalid substitution type"; + return r; + } else if (r == 0) { + /* Plain character, or the second half of an escaped pair: skip it. */ s++; + continue; + } + + 
if (IN_SET(type, FORMAT_SUBST_ATTR, FORMAT_SUBST_ENV) && isempty(attr)) { + if (offset) + *offset = s - value; + if (hint) + *hint = "attribute value missing"; + return -EINVAL; + } + + if (type == FORMAT_SUBST_RESULT && !isempty(attr)) { + unsigned i; + + r = safe_atou_optional_plus(attr, &i); + if (r < 0) { + if (offset) + *offset = s - value; + if (hint) + *hint = "attribute value not a valid number"; + return r; + } + } + } + + return 0; +} + +int udev_resolve_subsys_kernel(const char *string, char *result, size_t maxsize, bool read_value) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_free_ char *temp = NULL; + char *subsys, *sysname, *attr; + const char *val; + int r; + + assert(string); + assert(result); + + /* handle "[/]" format */ + + if (string[0] != '[') + return -EINVAL; + + temp = strdup(string); + if (!temp) + return -ENOMEM; + + subsys = &temp[1]; + + sysname = strchr(subsys, '/'); + if (!sysname) + return -EINVAL; + sysname[0] = '\0'; + sysname = &sysname[1]; + + attr = strchr(sysname, ']'); + if (!attr) + return -EINVAL; + attr[0] = '\0'; + attr = &attr[1]; + if (attr[0] == '/') + attr = &attr[1]; + if (attr[0] == '\0') + attr = NULL; + + if (read_value && !attr) + return -EINVAL; + + r = sd_device_new_from_subsystem_sysname(&dev, subsys, sysname); + if (r < 0) + return r; + + if (read_value) { + r = sd_device_get_sysattr_value(dev, attr, &val); + if (r < 0 && !ERRNO_IS_PRIVILEGE(r) && r != -ENOENT) + return r; + if (r >= 0) + strscpy(result, maxsize, val); + else + result[0] = '\0'; + log_debug("value '[%s/%s]%s' is '%s'", subsys, sysname, attr, result); + } else { + r = sd_device_get_syspath(dev, &val); + if (r < 0) + return r; + + strscpyl(result, maxsize, val, attr ? 
"/" : NULL, attr ?: NULL, NULL); + log_debug("path '[%s/%s]%s' is '%s'", subsys, sysname, strempty(attr), result); + } + return 0; +} diff --git a/src/udev/udev-format.h b/src/udev/udev-format.h new file mode 100644 index 0000000..92fef9b --- /dev/null +++ b/src/udev/udev-format.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include +#include + +typedef struct UdevEvent UdevEvent; + +#define UDEV_ALLOWED_CHARS_INPUT "/ $%?," + +size_t udev_event_apply_format( + UdevEvent *event, + const char *src, + char *dest, + size_t size, + bool replace_whitespace, + Hashmap *global_props, + bool *ret_truncated); +int udev_check_format(const char *value, size_t *offset, const char **hint); + +int udev_resolve_subsys_kernel(const char *string, char *result, size_t maxsize, bool read_value); diff --git a/src/udev/udev-manager.c b/src/udev/udev-manager.c new file mode 100644 index 0000000..8077e51 --- /dev/null +++ b/src/udev/udev-manager.c @@ -0,0 +1,1352 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "blockdev-util.h" +#include "cgroup-util.h" +#include "common-signal.h" +#include "cpu-set-util.h" +#include "daemon-util.h" +#include "device-monitor-private.h" +#include "device-private.h" +#include "device-util.h" +#include "errno-list.h" +#include "event-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "hashmap.h" +#include "inotify-util.h" +#include "iovec-util.h" +#include "limits-util.h" +#include "list.h" +#include "mkdir.h" +#include "process-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "syslog-util.h" +#include "udev-builtin.h" +#include "udev-ctrl.h" +#include "udev-event.h" +#include "udev-manager.h" +#include "udev-node.h" +#include "udev-spawn.h" +#include "udev-trace.h" +#include "udev-util.h" +#include "udev-watch.h" +#include "udev-worker.h" + +#define WORKER_NUM_MAX UINT64_C(2048) + +#define EVENT_RETRY_INTERVAL_USEC 
(200 * USEC_PER_MSEC) +#define EVENT_RETRY_TIMEOUT_USEC (3 * USEC_PER_MINUTE) + +typedef enum EventState { + EVENT_UNDEF, + EVENT_QUEUED, + EVENT_RUNNING, +} EventState; + +/* One uevent known to the manager; it is linked into manager->events through the LIST_FIELDS member. */ typedef struct Event { + Manager *manager; + Worker *worker; /* set by worker_attach_event(), cleared again by worker_free() via event_free() */ + EventState state; + + sd_device *dev; /* reference dropped in event_free() */ + + sd_device_action_t action; + uint64_t seqnum; + uint64_t blocker_seqnum; + const char *id; + const char *devpath; + const char *devpath_old; + const char *devnode; + + /* Used when the device is locked by another program. */ + usec_t retry_again_next_usec; + usec_t retry_again_timeout_usec; + sd_event_source *retry_event_source; + + sd_event_source *timeout_warning_event; + sd_event_source *timeout_event; + + LIST_FIELDS(Event, event); +} Event; + +typedef enum WorkerState { + WORKER_UNDEF, + WORKER_RUNNING, + WORKER_IDLE, + WORKER_KILLED, + WORKER_KILLING, +} WorkerState; + +/* One worker process; workers are stored in manager->workers keyed by PID_TO_PTR(pid). */ typedef struct Worker { + Manager *manager; + pid_t pid; + sd_event_source *child_event_source; + sd_device_monitor *monitor; + WorkerState state; + Event *event; +} Worker; + +/* Unlinks the event from manager->events, drops the device and event-source references, detaches the event from its worker (if any) and frees it. NULL is accepted. */ static Event *event_free(Event *event) { + if (!event) + return NULL; + + assert(event->manager); + + LIST_REMOVE(event, event->manager->events, event); + sd_device_unref(event->dev); + + sd_event_source_unref(event->retry_event_source); + sd_event_source_unref(event->timeout_warning_event); + sd_event_source_unref(event->timeout_event); + + /* Make sure the worker does not keep a pointer to the freed event. */ if (event->worker) + event->worker->event = NULL; + + return mfree(event); +} + +/* Frees every event whose state equals match_state; EVENT_UNDEF acts as a wildcard that matches all events. NOTE(review): entries are freed while iterating, which relies on LIST_FOREACH tolerating removal of the current element -- confirm against list.h. */ static void event_queue_cleanup(Manager *manager, EventState match_state) { + LIST_FOREACH(event, event, manager->events) { + if (match_state != EVENT_UNDEF && match_state != event->state) + continue; + + event_free(event); + } +} + +/* Removes the worker from manager->workers (only when it is attached to a manager), drops its references and frees it together with the event it still holds. NULL is accepted. */ static Worker *worker_free(Worker *worker) { + if (!worker) + return NULL; + + if (worker->manager) + hashmap_remove(worker->manager->workers, PID_TO_PTR(worker->pid)); + + sd_event_source_unref(worker->child_event_source); + sd_device_monitor_unref(worker->monitor); + event_free(worker->event); + + 
return mfree(worker); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(Worker*, worker_free); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(worker_hash_op, void, trivial_hash_func, trivial_compare_func, Worker, worker_free); + +/* Tears the manager down: builtins, properties, rules, workers, all remaining events, file descriptors, monitor/control objects and event sources, then the manager itself. NULL is accepted. */ Manager* manager_free(Manager *manager) { + if (!manager) + return NULL; + + udev_builtin_exit(); + + hashmap_free_free_free(manager->properties); + udev_rules_free(manager->rules); + + /* NOTE(review): workers is created with worker_hash_op, whose value destructor is worker_free(), so this presumably frees each worker and the event it holds -- confirm. */ hashmap_free(manager->workers); + /* EVENT_UNDEF matches every state, i.e. all events still in the list are freed. */ event_queue_cleanup(manager, EVENT_UNDEF); + + safe_close(manager->inotify_fd); + safe_close_pair(manager->worker_watch); + + sd_device_monitor_unref(manager->monitor); + udev_ctrl_unref(manager->ctrl); + + sd_event_source_unref(manager->inotify_event); + sd_event_source_unref(manager->kill_workers_event); + sd_event_source_unref(manager->memory_pressure_event_source); + sd_event_source_unref(manager->sigrtmin18_event_source); + sd_event_unref(manager->event); + + free(manager->cgroup); + return mfree(manager); +} + +static int on_sigchld(sd_event_source *s, const siginfo_t *si, void *userdata); + +/* Creates the bookkeeping object for an already forked worker process 'pid', registers a child event source for it and stores it in manager->workers. On success *ret receives the new object; on failure a negative errno is returned and the partially built object is released by the cleanup attribute. */ static int worker_new(Worker **ret, Manager *manager, sd_device_monitor *worker_monitor, pid_t pid) { + _cleanup_(worker_freep) Worker *worker = NULL; + int r; + + assert(ret); + assert(manager); + assert(worker_monitor); + assert(pid > 1); + + /* close monitor, but keep address around */ + device_monitor_disconnect(worker_monitor); + + worker = new(Worker, 1); + if (!worker) + return -ENOMEM; + + *worker = (Worker) { + .monitor = sd_device_monitor_ref(worker_monitor), + .pid = pid, + }; + + r = sd_event_add_child(manager->event, &worker->child_event_source, pid, WEXITED, on_sigchld, worker); + if (r < 0) + return r; + + r = hashmap_ensure_put(&manager->workers, &worker_hash_op, PID_TO_PTR(pid), worker); + if (r < 0) + return r; + + /* Assigned only after the insertion succeeded: worker_free() touches the hashmap only when worker->manager is set, so the error paths above leave the hashmap alone. */ worker->manager = manager; + + *ret = TAKE_PTR(worker); + return 0; +} + +/* Sends SIGTERM to the workers and marks them WORKER_KILLED. Workers already marked WORKER_KILLED are skipped; with force == false, workers in state WORKER_RUNNING are only marked WORKER_KILLING. */ static void manager_kill_workers(Manager *manager, bool force) { + Worker *worker; + + assert(manager); + + HASHMAP_FOREACH(worker, 
manager->workers) { + if (worker->state == WORKER_KILLED) + continue; + + /* Without 'force' a busy worker gets no signal here, it is only marked WORKER_KILLING. */ if (worker->state == WORKER_RUNNING && !force) { + worker->state = WORKER_KILLING; + continue; + } + + worker->state = WORKER_KILLED; + (void) kill(worker->pid, SIGTERM); + } +} + +/* Starts the shutdown: sets manager->exit, announces NOTIFY_STOPPING, releases the control, inotify and monitor objects, drops the events that are still queued and sends SIGTERM to all workers, busy ones included. */ static void manager_exit(Manager *manager) { + assert(manager); + + manager->exit = true; + + (void) sd_notify(/* unset= */ false, NOTIFY_STOPPING); + + /* close sources of new events and discard buffered events */ + manager->ctrl = udev_ctrl_unref(manager->ctrl); + + manager->inotify_event = sd_event_source_disable_unref(manager->inotify_event); + manager->inotify_fd = safe_close(manager->inotify_fd); + + manager->monitor = sd_device_monitor_unref(manager->monitor); + + /* discard queued events and kill workers */ + event_queue_cleanup(manager, EVENT_QUEUED); + manager_kill_workers(manager, true); +} + +/* Sends READY=1 plus a status line naming manager->children_max through sd_notifyf(); a failure is only logged. */ static void notify_ready(Manager *manager) { + int r; + + assert(manager); + + r = sd_notifyf(/* unset= */ false, + "READY=1\n" + "STATUS=Processing with %u children at max", manager->children_max); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); +} + +/* reload requested, HUP signal received, rules changed, builtin changed */ +static void manager_reload(Manager *manager, bool force) { + _cleanup_(udev_rules_freep) UdevRules *rules = NULL; + usec_t now_usec; + int r; + + assert(manager); + + assert_se(sd_event_now(manager->event, CLOCK_MONOTONIC, &now_usec) >= 0); + if (!force && now_usec < usec_add(manager->last_usec, 3 * USEC_PER_SEC)) + /* check for changed config, every 3 seconds at most */ + return; + manager->last_usec = now_usec; + + /* Reload SELinux label database, to make the child inherit the up-to-date database. */ + mac_selinux_maybe_reload(); + + /* Nothing changed. It is not necessary to reload. 
*/ + if (!udev_rules_should_reload(manager->rules) && !udev_builtin_should_reload()) { + + if (!force) + return; + + /* If we eat this up, then tell our service manager to just continue */ + (void) sd_notifyf(/* unset= */ false, + "RELOADING=1\n" + "STATUS=Skipping configuration reloading, nothing changed.\n" + "MONOTONIC_USEC=" USEC_FMT, now(CLOCK_MONOTONIC)); + } else { + (void) sd_notifyf(/* unset= */ false, + "RELOADING=1\n" + "STATUS=Flushing configuration...\n" + "MONOTONIC_USEC=" USEC_FMT, now(CLOCK_MONOTONIC)); + + /* Not forced: workers in state WORKER_RUNNING are only marked WORKER_KILLING, the others get SIGTERM. */ manager_kill_workers(manager, false); + + udev_builtin_exit(); + udev_builtin_init(); + + /* The new rules go into a local first; manager->rules is replaced only when loading succeeded. */ r = udev_rules_load(&rules, manager->resolve_name_timing); + if (r < 0) + log_warning_errno(r, "Failed to read udev rules, using the previously loaded rules, ignoring: %m"); + else + udev_rules_free_and_replace(manager->rules, rules); + } + + /* Every RELOADING=1 notification above is followed by READY=1 here. */ notify_ready(manager); +} + +/* Timer callback (userdata is the Manager): calls manager_kill_workers() without force. */ static int on_kill_workers_event(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + + log_debug("Cleanup idle workers"); + manager_kill_workers(manager, false); + + return 1; +} + +/* Timer callback (userdata is the Event): sends manager->timeout_signal to the worker processing the event through kill_and_sigcont() and marks that worker WORKER_KILLED. */ static int on_event_timeout(sd_event_source *s, uint64_t usec, void *userdata) { + Event *event = ASSERT_PTR(userdata); + + assert(event->manager); + assert(event->worker); + + kill_and_sigcont(event->worker->pid, event->manager->timeout_signal); + event->worker->state = WORKER_KILLED; + + log_device_error(event->dev, "Worker ["PID_FMT"] processing SEQNUM=%"PRIu64" killed", event->worker->pid, event->seqnum); + + return 1; +} + +/* Timer callback (userdata is the Event): only logs that the worker is slow, nothing is signalled. */ static int on_event_timeout_warning(sd_event_source *s, uint64_t usec, void *userdata) { + Event *event = ASSERT_PTR(userdata); + + assert(event->worker); + + log_device_warning(event->dev, "Worker ["PID_FMT"] processing SEQNUM=%"PRIu64" is taking a long time", event->worker->pid, event->seqnum); + + return 1; +} + +/* Binds 'event' to 'worker' (both must currently be unattached), marks both as running and arms the warning and kill timers for the event. */ static void worker_attach_event(Worker *worker, Event *event) { + Manager *manager; + sd_event *e; + + assert(worker); + 
assert(worker->manager); + assert(event); + assert(!event->worker); + assert(!worker->event); + + worker->state = WORKER_RUNNING; + worker->event = event; + event->state = EVENT_RUNNING; + event->worker = worker; + + manager = worker->manager; + e = manager->event; + + (void) sd_event_add_time_relative(e, &event->timeout_warning_event, CLOCK_MONOTONIC, + udev_warn_timeout(manager->timeout_usec), USEC_PER_SEC, + on_event_timeout_warning, event); + + (void) sd_event_add_time_relative(e, &event->timeout_event, CLOCK_MONOTONIC, + manager->timeout_usec, USEC_PER_SEC, + on_event_timeout, event); +} + +static int worker_spawn(Manager *manager, Event *event) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *worker_monitor = NULL; + Worker *worker; + pid_t pid; + int r; + + /* listen for new events */ + r = device_monitor_new_full(&worker_monitor, MONITOR_GROUP_NONE, -1); + if (r < 0) + return r; + + (void) sd_device_monitor_set_description(worker_monitor, "worker"); + + /* allow the main daemon netlink address to send devices to the worker */ + r = device_monitor_allow_unicast_sender(worker_monitor, manager->monitor); + if (r < 0) + return log_error_errno(r, "Worker: Failed to set unicast sender: %m"); + + r = device_monitor_enable_receiving(worker_monitor); + if (r < 0) + return log_error_errno(r, "Worker: Failed to enable receiving of device: %m"); + + r = safe_fork("(udev-worker)", FORK_DEATHSIG_SIGTERM, &pid); + if (r < 0) { + event->state = EVENT_QUEUED; + return log_error_errno(r, "Failed to fork() worker: %m"); + } + if (r == 0) { + _cleanup_(udev_worker_done) UdevWorker w = { + .monitor = TAKE_PTR(worker_monitor), + .properties = TAKE_PTR(manager->properties), + .rules = TAKE_PTR(manager->rules), + .pipe_fd = TAKE_FD(manager->worker_watch[WRITE_END]), + .inotify_fd = TAKE_FD(manager->inotify_fd), + .exec_delay_usec = manager->exec_delay_usec, + .timeout_usec = manager->timeout_usec, + .timeout_signal = manager->timeout_signal, + .log_level = 
manager->log_level, + .blockdev_read_only = manager->blockdev_read_only, + }; + + /* Worker process */ + r = udev_worker_main(&w, event->dev); + log_close(); + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + r = worker_new(&worker, manager, worker_monitor, pid); + if (r < 0) + return log_error_errno(r, "Failed to create worker object: %m"); + + worker_attach_event(worker, event); + + log_device_debug(event->dev, "Worker ["PID_FMT"] is forked for processing SEQNUM=%"PRIu64".", pid, event->seqnum); + return 0; +} + +static int event_run(Event *event) { + static bool log_children_max_reached = true; + Manager *manager; + Worker *worker; + int r; + + assert(event); + assert(event->manager); + + log_device_uevent(event->dev, "Device ready for processing"); + + (void) event_source_disable(event->retry_event_source); + + manager = event->manager; + HASHMAP_FOREACH(worker, manager->workers) { + if (worker->state != WORKER_IDLE) + continue; + + r = device_monitor_send_device(manager->monitor, worker->monitor, event->dev); + if (r < 0) { + log_device_error_errno(event->dev, r, "Worker ["PID_FMT"] did not accept message, killing the worker: %m", + worker->pid); + (void) kill(worker->pid, SIGKILL); + worker->state = WORKER_KILLED; + continue; + } + worker_attach_event(worker, event); + return 1; /* event is now processing. 
*/ + } + + if (hashmap_size(manager->workers) >= manager->children_max) { + /* Avoid spamming the debug logs if the limit is already reached and + * many events still need to be processed */ + if (log_children_max_reached && manager->children_max > 1) { + log_debug("Maximum number (%u) of children reached.", hashmap_size(manager->workers)); + log_children_max_reached = false; + } + return 0; /* no free worker */ + } + + /* Re-enable the debug message for the next batch of events */ + log_children_max_reached = true; + + /* start new worker and pass initial device */ + r = worker_spawn(manager, event); + if (r < 0) + return r; + + return 1; /* event is now processing. */ +} + +bool devpath_conflict(const char *a, const char *b) { + /* This returns true when two paths are equivalent, or one is a child of another. */ + + if (!a || !b) + return false; + + for (; *a != '\0' && *b != '\0'; a++, b++) + if (*a != *b) + return false; + + return *a == '/' || *b == '/' || *a == *b; +} + +static int event_is_blocked(Event *event) { + Event *loop_event = NULL; + int r; + + /* lookup event for identical, parent, child device */ + + assert(event); + assert(event->manager); + assert(event->blocker_seqnum <= event->seqnum); + + if (event->retry_again_next_usec > 0) { + usec_t now_usec; + + r = sd_event_now(event->manager->event, CLOCK_BOOTTIME, &now_usec); + if (r < 0) + return r; + + if (event->retry_again_next_usec > now_usec) + return true; + } + + if (event->blocker_seqnum == event->seqnum) + /* we have checked previously and no blocker found */ + return false; + + LIST_FOREACH(event, e, event->manager->events) { + loop_event = e; + + /* we already found a later event, earlier cannot block us, no need to check again */ + if (loop_event->seqnum < event->blocker_seqnum) + continue; + + /* event we checked earlier still exists, no need to check again */ + if (loop_event->seqnum == event->blocker_seqnum) + return true; + + /* found ourself, no later event can block us */ + if 
(loop_event->seqnum >= event->seqnum) + goto no_blocker; + + /* found event we have not checked */ + break; + } + + assert(loop_event); + assert(loop_event->seqnum > event->blocker_seqnum && + loop_event->seqnum < event->seqnum); + + /* check if queue contains events we depend on */ + LIST_FOREACH(event, e, loop_event) { + loop_event = e; + + /* found ourself, no later event can block us */ + if (loop_event->seqnum >= event->seqnum) + goto no_blocker; + + if (streq_ptr(loop_event->id, event->id)) + break; + + if (devpath_conflict(event->devpath, loop_event->devpath) || + devpath_conflict(event->devpath, loop_event->devpath_old) || + devpath_conflict(event->devpath_old, loop_event->devpath)) + break; + + if (event->devnode && streq_ptr(event->devnode, loop_event->devnode)) + break; + } + + assert(loop_event); + + log_device_debug(event->dev, "SEQNUM=%" PRIu64 " blocked by SEQNUM=%" PRIu64, + event->seqnum, loop_event->seqnum); + + event->blocker_seqnum = loop_event->seqnum; + return true; + +no_blocker: + event->blocker_seqnum = event->seqnum; + return false; +} + +static int event_queue_start(Manager *manager) { + int r; + + assert(manager); + + if (!manager->events || manager->exit || manager->stop_exec_queue) + return 0; + + /* To make the stack directory /run/udev/links cleaned up later. 
*/ + manager->udev_node_needs_cleanup = true; + + r = event_source_disable(manager->kill_workers_event); + if (r < 0) + log_warning_errno(r, "Failed to disable event source for cleaning up idle workers, ignoring: %m"); + + manager_reload(manager, /* force = */ false); + + LIST_FOREACH(event, event, manager->events) { + if (event->state != EVENT_QUEUED) + continue; + + /* do not start event if parent or child event is still running or queued */ + r = event_is_blocked(event); + if (r > 0) + continue; + if (r < 0) + log_device_warning_errno(event->dev, r, + "Failed to check dependencies for event (SEQNUM=%"PRIu64", ACTION=%s), " + "assuming there is no blocking event, ignoring: %m", + event->seqnum, + strna(device_action_to_string(event->action))); + + r = event_run(event); + if (r <= 0) /* 0 means there are no idle workers. Let's escape from the loop. */ + return r; + } + + return 0; +} + +static int on_event_retry(sd_event_source *s, uint64_t usec, void *userdata) { + /* This does nothing. The on_post() callback will start the event if there exists an idle worker. 
*/ + return 1; +} + +static int event_requeue(Event *event) { + usec_t now_usec; + int r; + + assert(event); + assert(event->manager); + assert(event->manager->event); + + event->timeout_warning_event = sd_event_source_disable_unref(event->timeout_warning_event); + event->timeout_event = sd_event_source_disable_unref(event->timeout_event); + + /* add a short delay to suppress busy loop */ + r = sd_event_now(event->manager->event, CLOCK_BOOTTIME, &now_usec); + if (r < 0) + return log_device_warning_errno(event->dev, r, + "Failed to get current time, " + "skipping event (SEQNUM=%"PRIu64", ACTION=%s): %m", + event->seqnum, strna(device_action_to_string(event->action))); + + if (event->retry_again_timeout_usec > 0 && event->retry_again_timeout_usec <= now_usec) + return log_device_warning_errno(event->dev, SYNTHETIC_ERRNO(ETIMEDOUT), + "The underlying block device is locked by a process more than %s, " + "skipping event (SEQNUM=%"PRIu64", ACTION=%s).", + FORMAT_TIMESPAN(EVENT_RETRY_TIMEOUT_USEC, USEC_PER_MINUTE), + event->seqnum, strna(device_action_to_string(event->action))); + + event->retry_again_next_usec = usec_add(now_usec, EVENT_RETRY_INTERVAL_USEC); + if (event->retry_again_timeout_usec == 0) + event->retry_again_timeout_usec = usec_add(now_usec, EVENT_RETRY_TIMEOUT_USEC); + + r = event_reset_time_relative(event->manager->event, &event->retry_event_source, + CLOCK_MONOTONIC, EVENT_RETRY_INTERVAL_USEC, 0, + on_event_retry, NULL, + 0, "retry-event", true); + if (r < 0) + return log_device_warning_errno(event->dev, r, "Failed to reset timer event source for retrying event, " + "skipping event (SEQNUM=%"PRIu64", ACTION=%s): %m", + event->seqnum, strna(device_action_to_string(event->action))); + + if (event->worker && event->worker->event == event) + event->worker->event = NULL; + event->worker = NULL; + + event->state = EVENT_QUEUED; + return 0; +} + +static int event_queue_assume_block_device_unlocked(Manager *manager, sd_device *dev) { + const char *devname; + 
int r; + + /* When a new event for a block device is queued or we get an inotify event, assume that the + * device is not locked anymore. The assumption may not be true, but that should not cause any + * issues, as in that case events will be requeued soon. */ + + r = udev_get_whole_disk(dev, NULL, &devname); + if (r <= 0) + return r; + + LIST_FOREACH(event, event, manager->events) { + const char *event_devname; + + if (event->state != EVENT_QUEUED) + continue; + + if (event->retry_again_next_usec == 0) + continue; + + if (udev_get_whole_disk(event->dev, NULL, &event_devname) <= 0) + continue; + + if (!streq(devname, event_devname)) + continue; + + event->retry_again_next_usec = 0; + } + + return 0; +} + +static int event_queue_insert(Manager *manager, sd_device *dev) { + const char *devpath, *devpath_old = NULL, *id = NULL, *devnode = NULL; + sd_device_action_t action; + uint64_t seqnum; + Event *event; + int r; + + assert(manager); + assert(dev); + + /* We only accept devices received by device monitor.
*/ + r = sd_device_get_seqnum(dev, &seqnum); + if (r < 0) + return r; + + r = sd_device_get_action(dev, &action); + if (r < 0) + return r; + + r = sd_device_get_devpath(dev, &devpath); + if (r < 0) + return r; + + r = sd_device_get_property_value(dev, "DEVPATH_OLD", &devpath_old); + if (r < 0 && r != -ENOENT) + return r; + + r = device_get_device_id(dev, &id); + if (r < 0 && r != -ENOENT) + return r; + + r = sd_device_get_devname(dev, &devnode); + if (r < 0 && r != -ENOENT) + return r; + + event = new(Event, 1); + if (!event) + return -ENOMEM; + + *event = (Event) { + .manager = manager, + .dev = sd_device_ref(dev), + .seqnum = seqnum, + .action = action, + .id = id, + .devpath = devpath, + .devpath_old = devpath_old, + .devnode = devnode, + .state = EVENT_QUEUED, + }; + + if (!manager->events) { + r = touch("/run/udev/queue"); + if (r < 0) + log_warning_errno(r, "Failed to touch /run/udev/queue, ignoring: %m"); + } + + LIST_APPEND(event, manager->events, event); + + log_device_uevent(dev, "Device is queued"); + + return 0; +} + +static int on_uevent(sd_device_monitor *monitor, sd_device *dev, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + int r; + + DEVICE_TRACE_POINT(kernel_uevent_received, dev); + + device_ensure_usec_initialized(dev, NULL); + + r = event_queue_insert(manager, dev); + if (r < 0) { + log_device_error_errno(dev, r, "Failed to insert device into event queue: %m"); + return 1; + } + + (void) event_queue_assume_block_device_unlocked(manager, dev); + + return 1; +} + +static int on_worker(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + + for (;;) { + EventResult result; + struct iovec iovec = IOVEC_MAKE(&result, sizeof(result)); + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + ssize_t size; + struct ucred *ucred; + Worker 
*worker; + + size = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT); + if (size == -EINTR) + continue; + if (size == -EAGAIN) + /* nothing more to read */ + break; + if (size < 0) + return log_error_errno(size, "Failed to receive message: %m"); + + cmsg_close_all(&msghdr); + + if (size != sizeof(result)) { + log_warning("Ignoring worker message with invalid size %zi bytes", size); + continue; + } + + ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!ucred || ucred->pid <= 0) { + log_warning("Ignoring worker message without valid PID"); + continue; + } + + /* lookup worker who sent the signal */ + worker = hashmap_get(manager->workers, PID_TO_PTR(ucred->pid)); + if (!worker) { + log_debug("Worker ["PID_FMT"] returned, but is no longer tracked", ucred->pid); + continue; + } + + if (worker->state == WORKER_KILLING) { + worker->state = WORKER_KILLED; + (void) kill(worker->pid, SIGTERM); + } else if (worker->state != WORKER_KILLED) + worker->state = WORKER_IDLE; + + /* worker returned */ + if (result == EVENT_RESULT_TRY_AGAIN && + event_requeue(worker->event) < 0) + udev_broadcast_result(manager->monitor, worker->event->dev, -ETIMEDOUT); + + /* When event_requeue() succeeds, worker->event is NULL, and event_free() handles NULL gracefully. 
*/ + event_free(worker->event); + } + + return 1; +} + +static void manager_set_default_children_max(Manager *manager) { + uint64_t cpu_limit, mem_limit, cpu_count = 1; + int r; + + assert(manager); + + if (manager->children_max != 0) + return; + + r = cpus_in_affinity_mask(); + if (r < 0) + log_warning_errno(r, "Failed to determine number of local CPUs, ignoring: %m"); + else + cpu_count = r; + + cpu_limit = cpu_count * 2 + 16; + mem_limit = MAX(physical_memory() / (128*1024*1024), UINT64_C(10)); + + manager->children_max = MIN3(cpu_limit, mem_limit, WORKER_NUM_MAX); + log_debug("Set children_max to %u", manager->children_max); +} + +/* receive the udevd message from userspace */ +static int on_ctrl_msg(UdevCtrl *uctrl, UdevCtrlMessageType type, const UdevCtrlMessageValue *value, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + int r; + + assert(value); + + switch (type) { + case UDEV_CTRL_SET_LOG_LEVEL: + if ((value->intval & LOG_PRIMASK) != value->intval) { + log_debug("Received invalid udev control message (SET_LOG_LEVEL, %i), ignoring.", value->intval); + break; + } + + log_debug("Received udev control message (SET_LOG_LEVEL), setting log_level=%i", value->intval); + + r = log_get_max_level(); + if (r == value->intval) + break; + + log_set_max_level(value->intval); + manager->log_level = value->intval; + manager_kill_workers(manager, false); + break; + case UDEV_CTRL_STOP_EXEC_QUEUE: + log_debug("Received udev control message (STOP_EXEC_QUEUE)"); + manager->stop_exec_queue = true; + break; + case UDEV_CTRL_START_EXEC_QUEUE: + log_debug("Received udev control message (START_EXEC_QUEUE)"); + manager->stop_exec_queue = false; + /* It is not necessary to call event_queue_start() here, as it will be called in on_post() if necessary. 
*/ + break; + case UDEV_CTRL_RELOAD: + log_debug("Received udev control message (RELOAD)"); + manager_reload(manager, /* force = */ true); + break; + case UDEV_CTRL_SET_ENV: { + _unused_ _cleanup_free_ char *old_val = NULL, *old_key = NULL; + _cleanup_free_ char *key = NULL, *val = NULL; + const char *eq; + + eq = strchr(value->buf, '='); + if (!eq) { + log_error("Invalid key format '%s'", value->buf); + return 1; + } + + key = strndup(value->buf, eq - value->buf); + if (!key) { + log_oom(); + return 1; + } + + old_val = hashmap_remove2(manager->properties, key, (void **) &old_key); + + r = hashmap_ensure_allocated(&manager->properties, &string_hash_ops); + if (r < 0) { + log_oom(); + return 1; + } + + eq++; + if (isempty(eq)) + log_debug("Received udev control message (ENV), unsetting '%s'", key); + else { + val = strdup(eq); + if (!val) { + log_oom(); + return 1; + } + + log_debug("Received udev control message (ENV), setting '%s=%s'", key, val); + + r = hashmap_put(manager->properties, key, val); + if (r < 0) { + log_oom(); + return 1; + } + } + + key = val = NULL; + manager_kill_workers(manager, false); + break; + } + case UDEV_CTRL_SET_CHILDREN_MAX: + if (value->intval < 0) { + log_debug("Received invalid udev control message (SET_MAX_CHILDREN, %i), ignoring.", value->intval); + return 0; + } + + log_debug("Received udev control message (SET_MAX_CHILDREN), setting children_max=%i", value->intval); + manager->children_max = value->intval; + + /* When 0 is specified, determine the maximum based on the system resources. 
*/ + manager_set_default_children_max(manager); + + notify_ready(manager); + break; + case UDEV_CTRL_PING: + log_debug("Received udev control message (PING)"); + break; + case UDEV_CTRL_EXIT: + log_debug("Received udev control message (EXIT)"); + manager_exit(manager); + break; + default: + log_debug("Received unknown udev control message, ignoring"); + } + + return 1; +} + +static int synthesize_change_one(sd_device *dev, sd_device *target) { + int r; + + if (DEBUG_LOGGING) { + const char *syspath = NULL; + (void) sd_device_get_syspath(target, &syspath); + log_device_debug(dev, "device is closed, synthesising 'change' on %s", strna(syspath)); + } + + r = sd_device_trigger(target, SD_DEVICE_CHANGE); + if (r < 0) + return log_device_debug_errno(target, r, "Failed to trigger 'change' uevent: %m"); + + DEVICE_TRACE_POINT(synthetic_change_event, dev); + + return 0; +} + +static int synthesize_change(sd_device *dev) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + bool part_table_read; + const char *sysname; + int r, k; + + r = sd_device_get_sysname(dev, &sysname); + if (r < 0) + return r; + + if (startswith(sysname, "dm-") || block_device_is_whole_disk(dev) <= 0) + return synthesize_change_one(dev, dev); + + r = blockdev_reread_partition_table(dev); + if (r < 0) + log_device_debug_errno(dev, r, "Failed to re-read partition table, ignoring: %m"); + part_table_read = r >= 0; + + /* search for partitions */ + r = partition_enumerator_new(dev, &e); + if (r < 0) + return r; + + /* We have partitions and re-read the table, the kernel already sent out a "change" + * event for the disk, and "remove/add" for all partitions. */ + if (part_table_read && sd_device_enumerator_get_device_first(e)) + return 0; + + /* We have partitions but re-reading the partition table did not work, synthesize + * "change" for the disk and all partitions. 
*/ + r = synthesize_change_one(dev, dev); + FOREACH_DEVICE(e, d) { + k = synthesize_change_one(dev, d); + if (k < 0 && r >= 0) + r = k; + } + + return r; +} + +static int on_inotify(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + union inotify_event_buffer buffer; + ssize_t l; + int r; + + l = read(fd, &buffer, sizeof(buffer)); + if (l < 0) { + if (ERRNO_IS_TRANSIENT(errno)) + return 0; + + return log_error_errno(errno, "Failed to read inotify fd: %m"); + } + + FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + const char *devnode; + + /* Do not handle IN_IGNORED here. Especially, do not try to call udev_watch_end() from the + * main process. Otherwise, the pair of the symlinks may become inconsistent, and several + * garbage may remain. The old symlinks are removed by a worker that processes the + * corresponding 'remove' uevent; + * udev_event_execute_rules() -> event_execute_rules_on_remove() -> udev_watch_end(). */ + + if (!FLAGS_SET(e->mask, IN_CLOSE_WRITE)) + continue; + + r = device_new_from_watch_handle(&dev, e->wd); + if (r < 0) { + /* Device may be removed just after closed. */ + log_debug_errno(r, "Failed to create sd_device object from watch handle, ignoring: %m"); + continue; + } + + r = sd_device_get_devname(dev, &devnode); + if (r < 0) { + /* Also here, device may be already removed. 
*/ + log_device_debug_errno(dev, r, "Failed to get device node, ignoring: %m"); + continue; + } + + log_device_debug(dev, "Received inotify event for %s.", devnode); + + (void) event_queue_assume_block_device_unlocked(manager, dev); + (void) synthesize_change(dev); + } + + return 0; +} + +static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + + manager_exit(manager); + + return 1; +} + +static int on_sighup(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + + manager_reload(manager, /* force = */ true); + + return 1; +} + +static int on_sigchld(sd_event_source *s, const siginfo_t *si, void *userdata) { + Worker *worker = ASSERT_PTR(userdata); + Manager *manager = ASSERT_PTR(worker->manager); + sd_device *dev = worker->event ? ASSERT_PTR(worker->event->dev) : NULL; + EventResult result; + + assert(si); + + switch (si->si_code) { + case CLD_EXITED: + if (si->si_status == 0) + log_device_debug(dev, "Worker ["PID_FMT"] exited.", si->si_pid); + else + log_device_warning(dev, "Worker ["PID_FMT"] exited with return code %i.", + si->si_pid, si->si_status); + result = EVENT_RESULT_EXIT_STATUS_BASE + si->si_status; + break; + + case CLD_KILLED: + case CLD_DUMPED: + log_device_warning(dev, "Worker ["PID_FMT"] terminated by signal %i (%s).", + si->si_pid, si->si_status, signal_to_string(si->si_status)); + result = EVENT_RESULT_SIGNAL_BASE + si->si_status; + break; + + default: + assert_not_reached(); + } + + if (result != EVENT_RESULT_SUCCESS && dev) { + /* delete state from disk */ + device_delete_db(dev); + device_tag_index(dev, NULL, false); + + /* Forward kernel event to libudev listeners */ + udev_broadcast_result(manager->monitor, dev, result); + } + + worker_free(worker); + + return 1; +} + +static int on_post(sd_event_source *s, void *userdata) { + Manager *manager = ASSERT_PTR(userdata); + + if (manager->events) { + /* 
Try to process pending events if idle workers exist. Why is this necessary? + * When a worker finished an event and became idle, even if there was a pending event, + * the corresponding device might have been locked and the processing of the event + * delayed for a while, preventing the worker from processing the event immediately. + * Now, the device may be unlocked. Let's try again! */ + event_queue_start(manager); + return 1; + } + + /* There are no queued events. Let's remove /run/udev/queue and clean up the idle processes. */ + + if (unlink("/run/udev/queue") < 0) { + if (errno != ENOENT) + log_warning_errno(errno, "Failed to unlink /run/udev/queue, ignoring: %m"); + } else + log_debug("No events are queued, removing /run/udev/queue."); + + if (!hashmap_isempty(manager->workers)) { + /* There are idle workers */ + (void) event_reset_time_relative(manager->event, &manager->kill_workers_event, + CLOCK_MONOTONIC, 3 * USEC_PER_SEC, USEC_PER_SEC, + on_kill_workers_event, manager, + 0, "kill-workers-event", false); + return 1; + } + + /* There are no idle workers. 
*/ + + if (manager->udev_node_needs_cleanup) { + (void) udev_node_cleanup(); + manager->udev_node_needs_cleanup = false; + } + + if (manager->exit) + return sd_event_exit(manager->event, 0); + + if (manager->cgroup) + /* cleanup possible left-over processes in our cgroup */ + (void) cg_kill(manager->cgroup, SIGKILL, CGROUP_IGNORE_SELF, /* set=*/ NULL, /* kill_log= */ NULL, /* userdata= */ NULL); + + return 1; +} + +Manager* manager_new(void) { + Manager *manager; + + manager = new(Manager, 1); + if (!manager) + return NULL; + + *manager = (Manager) { + .inotify_fd = -EBADF, + .worker_watch = EBADF_PAIR, + .log_level = LOG_INFO, + .resolve_name_timing = RESOLVE_NAME_EARLY, + .timeout_usec = 180 * USEC_PER_SEC, + .timeout_signal = SIGKILL, + }; + + return manager; +} + +int manager_init(Manager *manager, int fd_ctrl, int fd_uevent) { + _cleanup_free_ char *cgroup = NULL; + int r; + + assert(manager); + + r = udev_ctrl_new_from_fd(&manager->ctrl, fd_ctrl); + if (r < 0) + return log_error_errno(r, "Failed to initialize udev control socket: %m"); + + r = udev_ctrl_enable_receiving(manager->ctrl); + if (r < 0) + return log_error_errno(r, "Failed to bind udev control socket: %m"); + + r = device_monitor_new_full(&manager->monitor, MONITOR_GROUP_KERNEL, fd_uevent); + if (r < 0) + return log_error_errno(r, "Failed to initialize device monitor: %m"); + + (void) sd_device_monitor_set_description(manager->monitor, "manager"); + + r = device_monitor_enable_receiving(manager->monitor); + if (r < 0) + return log_error_errno(r, "Failed to bind netlink socket: %m"); + + manager->log_level = log_get_max_level(); + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (r < 0) + log_debug_errno(r, "Failed to get cgroup, ignoring: %m"); + else if (endswith(cgroup, "/udev")) { /* If we are in a subcgroup /udev/ we assume it was delegated to us */ + log_debug("Running in delegated subcgroup '%s'.", cgroup); + manager->cgroup = TAKE_PTR(cgroup); + } + + return 0; +} + +int 
manager_main(Manager *manager) { + int fd_worker, r; + + manager_set_default_children_max(manager); + + /* unnamed socket from workers to the main daemon */ + r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, manager->worker_watch); + if (r < 0) + return log_error_errno(errno, "Failed to create socketpair for communicating with workers: %m"); + + fd_worker = manager->worker_watch[READ_END]; + + r = setsockopt_int(fd_worker, SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "Failed to enable SO_PASSCRED: %m"); + + manager->inotify_fd = inotify_init1(IN_CLOEXEC); + if (manager->inotify_fd < 0) + return log_error_errno(errno, "Failed to create inotify descriptor: %m"); + + udev_watch_restore(manager->inotify_fd); + + /* block and listen to all signals on signalfd */ + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, SIGHUP, SIGCHLD, SIGRTMIN+18, -1) >= 0); + + r = sd_event_default(&manager->event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = sd_event_add_signal(manager->event, NULL, SIGINT, on_sigterm, manager); + if (r < 0) + return log_error_errno(r, "Failed to create SIGINT event source: %m"); + + r = sd_event_add_signal(manager->event, NULL, SIGTERM, on_sigterm, manager); + if (r < 0) + return log_error_errno(r, "Failed to create SIGTERM event source: %m"); + + r = sd_event_add_signal(manager->event, NULL, SIGHUP, on_sighup, manager); + if (r < 0) + return log_error_errno(r, "Failed to create SIGHUP event source: %m"); + + r = sd_event_set_watchdog(manager->event, true); + if (r < 0) + return log_error_errno(r, "Failed to create watchdog event source: %m"); + + r = udev_ctrl_attach_event(manager->ctrl, manager->event); + if (r < 0) + return log_error_errno(r, "Failed to attach event to udev control: %m"); + + r = udev_ctrl_start(manager->ctrl, on_ctrl_msg, manager); + if (r < 0) + return log_error_errno(r, "Failed to start udev control: %m"); + + /* This needs to be after the inotify 
and uevent handling, to make sure + * that the ping is sent back after fully processing the pending uevents + * (including the synthetic ones we may create due to inotify events). + */ + r = sd_event_source_set_priority(udev_ctrl_get_event_source(manager->ctrl), SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to set IDLE event priority for udev control event source: %m"); + + r = sd_event_add_io(manager->event, &manager->inotify_event, manager->inotify_fd, EPOLLIN, on_inotify, manager); + if (r < 0) + return log_error_errno(r, "Failed to create inotify event source: %m"); + + r = sd_device_monitor_attach_event(manager->monitor, manager->event); + if (r < 0) + return log_error_errno(r, "Failed to attach event to device monitor: %m"); + + r = sd_device_monitor_start(manager->monitor, on_uevent, manager); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + + r = sd_event_add_io(manager->event, NULL, fd_worker, EPOLLIN, on_worker, manager); + if (r < 0) + return log_error_errno(r, "Failed to create worker event source: %m"); + + r = sd_event_add_post(manager->event, NULL, on_post, manager); + if (r < 0) + return log_error_errno(r, "Failed to create post event source: %m"); + + /* Eventually, we probably want to do more here on memory pressure, for example, kill idle workers immediately */ + r = sd_event_add_memory_pressure(manager->event, &manager->memory_pressure_event_source, NULL, NULL); + if (r < 0) + log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to allocate memory pressure watch, ignoring: %m"); + + r = sd_event_add_signal(manager->event, &manager->sigrtmin18_event_source, SIGRTMIN+18, sigrtmin18_handler, NULL); /* stored in its own field so the memory pressure source allocated above is not overwritten */ + if (r < 0) + return log_error_errno(r, "Failed to allocate SIGRTMIN+18 event source: %m"); + + manager->last_usec = now(CLOCK_MONOTONIC); + + udev_builtin_init(); + + r = udev_rules_load(&manager->rules, manager->resolve_name_timing); + if (r < 0) + return log_error_errno(r, "Failed to read udev rules: %m"); + + r = udev_rules_apply_static_dev_perms(manager->rules); + if (r < 0) + log_warning_errno(r, "Failed to apply permissions on static device nodes, ignoring: %m"); + + notify_ready(manager); + + r = sd_event_loop(manager->event); + if (r < 0) + log_error_errno(r, "Event loop failed: %m"); + + (void) sd_notify(/* unset= */ false, NOTIFY_STOPPING); + return r; +} diff --git a/src/udev/udev-manager.h b/src/udev/udev-manager.h new file mode 100644 index 0000000..afbc67f --- /dev/null +++ b/src/udev/udev-manager.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include <stdbool.h> + +#include "sd-device.h" +#include "sd-event.h" + +#include "hashmap.h" +#include "macro.h" +#include "time-util.h" +#include "udev-ctrl.h" +#include "udev-rules.h" + +typedef struct Event Event; +typedef struct Worker Worker; + +typedef struct Manager { + sd_event *event; + Hashmap *workers; + LIST_HEAD(Event, events); + char *cgroup; + int log_level; + + UdevRules *rules; + Hashmap *properties; + + sd_device_monitor *monitor; + UdevCtrl *ctrl; + int worker_watch[2]; + + /* used by udev-watch */ + int inotify_fd; + sd_event_source *inotify_event; + + sd_event_source *kill_workers_event; + + sd_event_source *memory_pressure_event_source; + sd_event_source *sigrtmin18_event_source; + + usec_t last_usec; + + ResolveNameTiming resolve_name_timing; + unsigned children_max; + usec_t exec_delay_usec; + usec_t timeout_usec; + int timeout_signal; + 
bool blockdev_read_only; + + bool udev_node_needs_cleanup; + bool stop_exec_queue; + bool exit; +} Manager; + +Manager* manager_new(void); +Manager* manager_free(Manager *manager); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_init(Manager *manager, int fd_ctrl, int fd_uevent); +int manager_main(Manager *manager); + +bool devpath_conflict(const char *a, const char *b); diff --git a/src/udev/udev-node.c b/src/udev/udev-node.c new file mode 100644 index 0000000..e12c26c --- /dev/null +++ b/src/udev/udev-node.c @@ -0,0 +1,790 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include + +#include "sd-id128.h" + +#include "alloc-util.h" +#include "device-private.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "label-util.h" +#include "mkdir-label.h" +#include "parse-util.h" +#include "path-util.h" +#include "selinux-util.h" +#include "smack-util.h" +#include "stat-util.h" +#include "string-util.h" +#include "udev-node.h" +#include "user-util.h" + +#define UDEV_NODE_HASH_KEY SD_ID128_MAKE(b9,6a,f1,ce,40,31,44,1a,9e,19,ec,8b,ae,f3,e3,2f) + +int udev_node_cleanup(void) { + _cleanup_closedir_ DIR *dir = NULL; + + /* This must not be called when any workers exist. It would cause a race between mkdir() called + * by stack_directory_lock() and unlinkat() called by this. */ + + dir = opendir("/run/udev/links"); + if (!dir) { + if (errno == ENOENT) + return 0; + + return log_debug_errno(errno, "Failed to open directory '/run/udev/links', ignoring: %m"); + } + + FOREACH_DIRENT_ALL(de, dir, break) { + _cleanup_free_ char *lockfile = NULL; + + if (de->d_name[0] == '.') + continue; + + if (de->d_type != DT_DIR) + continue; + + /* As commented in the above, this is called when no worker exists, hence the file is not + * locked. 
On a later uevent, the lock file will be created if necessary. So, we can safely + * remove the file now. */ + lockfile = path_join(de->d_name, ".lock"); + if (!lockfile) + return log_oom_debug(); + + if (unlinkat(dirfd(dir), lockfile, 0) < 0 && errno != ENOENT) { + log_debug_errno(errno, "Failed to remove '/run/udev/links/%s', ignoring: %m", lockfile); + continue; + } + + if (unlinkat(dirfd(dir), de->d_name, AT_REMOVEDIR) < 0 && errno != ENOTEMPTY) + log_debug_errno(errno, "Failed to remove '/run/udev/links/%s', ignoring: %m", de->d_name); + } + + return 0; +} + +static int node_symlink(sd_device *dev, const char *devnode, const char *slink) { + struct stat st; + int r; + + assert(dev); + assert(slink); + + if (!devnode) { + r = sd_device_get_devname(dev, &devnode); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device node: %m"); + } + + if (lstat(slink, &st) >= 0) { + if (!S_ISLNK(st.st_mode)) + return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EEXIST), + "Conflicting inode '%s' found, symlink to '%s' will not be created.", + slink, devnode); + } else if (errno != ENOENT) + return log_device_debug_errno(dev, errno, "Failed to lstat() '%s': %m", slink); + + r = mkdir_parents_label(slink, 0755); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to create parent directory of '%s': %m", slink); + + /* use relative link */ + r = symlink_atomic_full_label(devnode, slink, /* make_relative = */ true); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to create symlink '%s' to '%s': %m", slink, devnode); + + log_device_debug(dev, "Successfully created symlink '%s' to '%s'", slink, devnode); + return 0; +} + +static int stack_directory_read_one(int dirfd, const char *id, char **devnode, int *priority) { + _cleanup_free_ char *buf = NULL; + int tmp_prio, r; + + assert(dirfd >= 0); + assert(id); + assert(priority); + + /* This reads priority and device node from the symlink under /run/udev/links (or udev database). 
+ * If 'devnode' is NULL, obtained priority is always set to '*priority'. If 'devnode' is non-NULL, + * this updates '*devnode' and '*priority'. */ + + /* First, let's try to read the entry with the new format, which should replace the old format pretty + * quickly. */ + r = readlinkat_malloc(dirfd, id, &buf); + if (r >= 0) { + char *colon; + + /* With the new format, the devnode and priority can be obtained from symlink itself. */ + + colon = strchr(buf, ':'); + if (!colon || colon == buf) + return -EINVAL; + + *colon = '\0'; + + /* Of course, this check is racy, but it is not necessary to be perfect. Even if the device + * node will be removed after this check, we will receive 'remove' uevent, and the invalid + * symlink will be removed during processing the event. The check is just for shortening the + * timespan that the symlink points to a non-existing device node. */ + if (access(colon + 1, F_OK) < 0) + return -ENODEV; + + r = safe_atoi(buf, &tmp_prio); + if (r < 0) + return r; + + if (!devnode) + goto finalize; + + if (*devnode && tmp_prio <= *priority) + return 0; /* Unchanged */ + + r = free_and_strdup(devnode, colon + 1); + if (r < 0) + return r; + + } else if (r == -EINVAL) { /* Not a symlink ? try the old format */ + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + const char *val; + + /* Old format. The devnode and priority must be obtained from uevent and udev database. */ + + r = sd_device_new_from_device_id(&dev, id); + if (r < 0) + return r; + + r = device_get_devlink_priority(dev, &tmp_prio); + if (r < 0) + return r; + + if (!devnode) + goto finalize; + + if (*devnode && tmp_prio <= *priority) + return 0; /* Unchanged */ + + r = sd_device_get_devname(dev, &val); + if (r < 0) + return r; + + r = free_and_strdup(devnode, val); + if (r < 0) + return r; + + } else + return r == -ENOENT ? 
-ENODEV : r; + +finalize: + *priority = tmp_prio; + return 1; /* Updated */ +} + +static int stack_directory_find_prioritized_devnode(sd_device *dev, int dirfd, bool add, char **ret) { + _cleanup_closedir_ DIR *dir = NULL; + _cleanup_free_ char *devnode = NULL; + int r, priority; + const char *id; + + assert(dev); + assert(dirfd >= 0); + assert(ret); + + /* Find device node of device with highest priority. This returns 1 if a device found, 0 if no + * device found, or a negative errno on error. */ + + if (add) { + const char *n; + + r = device_get_devlink_priority(dev, &priority); + if (r < 0) + return r; + + r = sd_device_get_devname(dev, &n); + if (r < 0) + return r; + + devnode = strdup(n); + if (!devnode) + return -ENOMEM; + } + + dir = xopendirat(dirfd, ".", O_NOFOLLOW); + if (!dir) + return -errno; + + r = device_get_device_id(dev, &id); + if (r < 0) + return r; + + FOREACH_DIRENT(de, dir, break) { + + /* skip ourself */ + if (streq(de->d_name, id)) + continue; + + r = stack_directory_read_one(dirfd, de->d_name, &devnode, &priority); + if (r < 0 && r != -ENODEV) + log_debug_errno(r, "Failed to read '%s', ignoring: %m", de->d_name); + } + + *ret = TAKE_PTR(devnode); + return !!*ret; +} + +static int stack_directory_update(sd_device *dev, int fd, bool add) { + const char *id; + int r; + + assert(dev); + assert(fd >= 0); + + r = device_get_device_id(dev, &id); + if (r < 0) + return r; + + if (add) { + _cleanup_free_ char *data = NULL, *buf = NULL; + const char *devname; + int priority; + + r = sd_device_get_devname(dev, &devname); + if (r < 0) + return r; + + r = device_get_devlink_priority(dev, &priority); + if (r < 0) + return r; + + if (asprintf(&data, "%i:%s", priority, devname) < 0) + return -ENOMEM; + + if (readlinkat_malloc(fd, id, &buf) >= 0 && streq(buf, data)) + return 0; /* Unchanged. 
*/ + + (void) unlinkat(fd, id, 0); + + if (symlinkat(data, fd, id) < 0) + return -errno; + + } else { + if (unlinkat(fd, id, 0) < 0) { + if (errno == ENOENT) + return 0; /* Unchanged. */ + return -errno; + } + } + + return 1; /* Updated. */ +} + +size_t udev_node_escape_path(const char *src, char *dest, size_t size) { + size_t i, j; + uint64_t h; + + assert(src); + assert(dest); + assert(size >= 12); + + for (i = 0, j = 0; src[i] != '\0'; i++) { + if (src[i] == '/') { + if (j+4 >= size - 12 + 1) + goto toolong; + memcpy(&dest[j], "\\x2f", 4); + j += 4; + } else if (src[i] == '\\') { + if (j+4 >= size - 12 + 1) + goto toolong; + memcpy(&dest[j], "\\x5c", 4); + j += 4; + } else { + if (j+1 >= size - 12 + 1) + goto toolong; + dest[j] = src[i]; + j++; + } + } + dest[j] = '\0'; + return j; + +toolong: + /* If the input path is too long to encode as a filename, then let's suffix with a string + * generated from the hash of the path. */ + + h = siphash24_string(src, UDEV_NODE_HASH_KEY.bytes); + + for (unsigned k = 0; k <= 10; k++) + dest[size - k - 2] = urlsafe_base64char((h >> (k * 6)) & 63); + + dest[size - 1] = '\0'; + return size - 1; +} + +static int stack_directory_get_name(const char *slink, char **ret) { + _cleanup_free_ char *s = NULL, *dirname = NULL; + char name_enc[NAME_MAX+1]; + const char *name; + int r; + + assert(slink); + assert(ret); + + r = path_simplify_alloc(slink, &s); + if (r < 0) + return r; + + if (!path_is_normalized(s)) + return -EINVAL; + + name = path_startswith(s, "/dev"); + if (empty_or_root(name)) + return -EINVAL; + + udev_node_escape_path(name, name_enc, sizeof(name_enc)); + + dirname = path_join("/run/udev/links", name_enc); + if (!dirname) + return -ENOMEM; + + *ret = TAKE_PTR(dirname); + return 0; +} + +static int stack_directory_open(sd_device *dev, const char *slink, int *ret_dirfd, int *ret_lockfd) { + _cleanup_close_ int dirfd = -EBADF, lockfd = -EBADF; + _cleanup_free_ char *dirname = NULL; + int r; + + assert(dev); + 
assert(slink); + assert(ret_dirfd); + assert(ret_lockfd); + + r = stack_directory_get_name(slink, &dirname); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to build stack directory name for '%s': %m", slink); + + r = mkdir_parents(dirname, 0755); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to create stack directory '%s': %m", dirname); + + dirfd = open_mkdir_at(AT_FDCWD, dirname, O_CLOEXEC | O_DIRECTORY | O_NOFOLLOW | O_RDONLY, 0755); + if (dirfd < 0) + return log_device_debug_errno(dev, dirfd, "Failed to open stack directory '%s': %m", dirname); + + lockfd = openat(dirfd, ".lock", O_CLOEXEC | O_NOFOLLOW | O_RDONLY | O_CREAT, 0600); + if (lockfd < 0) + return log_device_debug_errno(dev, errno, "Failed to create lock file for stack directory '%s': %m", dirname); + + if (flock(lockfd, LOCK_EX) < 0) + return log_device_debug_errno(dev, errno, "Failed to place a lock on lock file for %s: %m", dirname); + + *ret_dirfd = TAKE_FD(dirfd); + *ret_lockfd = TAKE_FD(lockfd); + return 0; +} + +static int node_get_current(const char *slink, int dirfd, char **ret_id, int *ret_prio) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + _cleanup_free_ char *id_dup = NULL; + const char *id; + int r; + + assert(slink); + assert(dirfd >= 0); + assert(ret_id); + + r = sd_device_new_from_devname(&dev, slink); + if (r < 0) + return r; + + r = device_get_device_id(dev, &id); + if (r < 0) + return r; + + id_dup = strdup(id); + if (!id_dup) + return -ENOMEM; + + if (ret_prio) { + r = stack_directory_read_one(dirfd, id, NULL, ret_prio); + if (r < 0) + return r; + } + + *ret_id = TAKE_PTR(id_dup); + return 0; +} + +static int link_update(sd_device *dev, const char *slink, bool add) { + _cleanup_free_ char *current_id = NULL, *devnode = NULL; + _cleanup_close_ int dirfd = -EBADF, lockfd = -EBADF; + int r, current_prio; + + assert(dev); + assert(slink); + + r = stack_directory_open(dev, slink, &dirfd, &lockfd); + if (r < 0) + return r; + + r = 
node_get_current(slink, dirfd, &current_id, add ? &current_prio : NULL); + if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r)) + return log_device_debug_errno(dev, r, "Failed to get the current device node priority for '%s': %m", slink); + + r = stack_directory_update(dev, dirfd, add); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to update stack directory for '%s': %m", slink); + + if (current_id) { + const char *id; + + r = device_get_device_id(dev, &id); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device id: %m"); + + if (add) { + int prio; + + r = device_get_devlink_priority(dev, &prio); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get devlink priority: %m"); + + if (streq(current_id, id)) { + if (current_prio <= prio) + /* The devlink is ours and already exists, and the new priority is + * equal or higher than the previous. Hence, it is not necessary to + * recreate it. */ + return 0; + + /* The devlink priority is downgraded. Another device may have a higher + * priority now. Let's find the device node with the highest priority. */ + } else { + if (current_prio > prio) + /* The devlink with a higher priority already exists and is owned by + * another device. Hence, it is not necessary to recreate it. */ + return 0; + + /* This device has the equal or a higher priority than the current. Let's + * create the devlink to our device node. */ + return node_symlink(dev, NULL, slink); + } + + } else { + if (!streq(current_id, id)) + /* The devlink already exists and is owned by another device. Hence, it is + * not necessary to recreate it. */ + return 0; + + /* The current devlink is ours, and the target device will be removed. Hence, we need + * to search the device that has the highest priority. and update the devlink. */ + } + } else { + /* The requested devlink does not exist, or the target device does not exist and the devlink + * points to a non-existing device. 
Let's search the device that has the highest priority, + * and update the devlink. */ + ; + } + + r = stack_directory_find_prioritized_devnode(dev, dirfd, add, &devnode); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to determine device node with the highest priority for '%s': %m", slink); + if (r > 0) + return node_symlink(dev, devnode, slink); + + log_device_debug(dev, "No reference left for '%s', removing", slink); + + if (unlink(slink) < 0 && errno != ENOENT) + log_device_debug_errno(dev, errno, "Failed to remove '%s', ignoring: %m", slink); + + (void) rmdir_parents(slink, "/dev"); + + return 0; +} + +static int device_get_devpath_by_devnum(sd_device *dev, char **ret) { + const char *subsystem; + dev_t devnum; + int r; + + assert(dev); + assert(ret); + + r = sd_device_get_subsystem(dev, &subsystem); + if (r < 0) + return r; + + r = sd_device_get_devnum(dev, &devnum); + if (r < 0) + return r; + + return device_path_make_major_minor(streq(subsystem, "block") ? S_IFBLK : S_IFCHR, devnum, ret); +} + +int udev_node_update(sd_device *dev, sd_device *dev_old) { + _cleanup_free_ char *filename = NULL; + int r; + + assert(dev); + assert(dev_old); + + /* update possible left-over symlinks */ + FOREACH_DEVICE_DEVLINK(dev_old, devlink) { + /* check if old link name still belongs to this device */ + if (device_has_devlink(dev, devlink)) + continue; + + log_device_debug(dev, + "Removing/updating old device symlink '%s', which is no longer belonging to this device.", + devlink); + + r = link_update(dev, devlink, /* add = */ false); + if (r < 0) + log_device_warning_errno(dev, r, + "Failed to remove/update device symlink '%s', ignoring: %m", + devlink); + } + + /* create/update symlinks, add symlinks to name index */ + FOREACH_DEVICE_DEVLINK(dev, devlink) { + r = link_update(dev, devlink, /* add = */ true); + if (r < 0) + log_device_warning_errno(dev, r, + "Failed to create/update device symlink '%s', ignoring: %m", + devlink); + } + + r = 
device_get_devpath_by_devnum(dev, &filename); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device path: %m"); + + /* always add /dev/{block,char}/$major:$minor */ + r = node_symlink(dev, NULL, filename); + if (r < 0) + return log_device_warning_errno(dev, r, "Failed to create device symlink '%s': %m", filename); + + return 0; +} + +int udev_node_remove(sd_device *dev) { + _cleanup_free_ char *filename = NULL; + int r; + + assert(dev); + + /* remove/update symlinks, remove symlinks from name index */ + FOREACH_DEVICE_DEVLINK(dev, devlink) { + r = link_update(dev, devlink, /* add = */ false); + if (r < 0) + log_device_warning_errno(dev, r, + "Failed to remove/update device symlink '%s', ignoring: %m", + devlink); + } + + r = device_get_devpath_by_devnum(dev, &filename); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device path: %m"); + + /* remove /dev/{block,char}/$major:$minor */ + if (unlink(filename) < 0 && errno != ENOENT) + return log_device_debug_errno(dev, errno, "Failed to remove '%s': %m", filename); + + return 0; +} + +static int udev_node_apply_permissions_impl( + sd_device *dev, /* can be NULL, only used for logging. */ + int node_fd, + const char *devnode, + bool apply_mac, + mode_t mode, + uid_t uid, + gid_t gid, + OrderedHashmap *seclabel_list) { + + bool apply_mode, apply_uid, apply_gid; + struct stat stats; + int r; + + assert(node_fd >= 0); + assert(devnode); + + if (fstat(node_fd, &stats) < 0) + return log_device_debug_errno(dev, errno, "cannot stat() node %s: %m", devnode); + + /* If group is set, but mode is not set, "upgrade" mode for the group. 
*/ + if (mode == MODE_INVALID && gid_is_valid(gid) && gid > 0) + mode = 0660; + + apply_mode = mode != MODE_INVALID && (stats.st_mode & 0777) != (mode & 0777); + apply_uid = uid_is_valid(uid) && stats.st_uid != uid; + apply_gid = gid_is_valid(gid) && stats.st_gid != gid; + + if (apply_mode || apply_uid || apply_gid || apply_mac) { + bool selinux = false, smack = false; + const char *name, *label; + + if (apply_mode || apply_uid || apply_gid) { + log_device_debug(dev, "Setting permissions %s, uid=" UID_FMT ", gid=" GID_FMT ", mode=%#o", + devnode, + uid_is_valid(uid) ? uid : stats.st_uid, + gid_is_valid(gid) ? gid : stats.st_gid, + mode != MODE_INVALID ? mode & 0777 : stats.st_mode & 0777); + + r = fchmod_and_chown(node_fd, mode, uid, gid); + if (r < 0) + log_device_full_errno(dev, r == -ENOENT ? LOG_DEBUG : LOG_ERR, r, + "Failed to set owner/mode of %s to uid=" UID_FMT + ", gid=" GID_FMT ", mode=%#o: %m", + devnode, + uid_is_valid(uid) ? uid : stats.st_uid, + gid_is_valid(gid) ? gid : stats.st_gid, + mode != MODE_INVALID ? mode & 0777 : stats.st_mode & 0777); + } else + log_device_debug(dev, "Preserve permissions of %s, uid=" UID_FMT ", gid=" GID_FMT ", mode=%#o", + devnode, + uid_is_valid(uid) ? uid : stats.st_uid, + gid_is_valid(gid) ? gid : stats.st_gid, + mode != MODE_INVALID ? mode & 0777 : stats.st_mode & 0777); + + /* apply SECLABEL{$module}=$label */ + ORDERED_HASHMAP_FOREACH_KEY(label, name, seclabel_list) { + int q; + + if (streq(name, "selinux")) { + selinux = true; + + q = mac_selinux_apply_fd(node_fd, devnode, label); + if (q < 0) + log_device_full_errno(dev, q == -ENOENT ? LOG_DEBUG : LOG_ERR, q, + "SECLABEL: failed to set SELinux label '%s': %m", label); + else + log_device_debug(dev, "SECLABEL: set SELinux label '%s'", label); + + } else if (streq(name, "smack")) { + smack = true; + + q = mac_smack_apply_fd(node_fd, SMACK_ATTR_ACCESS, label); + if (q < 0) + log_device_full_errno(dev, q == -ENOENT ? 
LOG_DEBUG : LOG_ERR, q, + "SECLABEL: failed to set SMACK label '%s': %m", label); + else + log_device_debug(dev, "SECLABEL: set SMACK label '%s'", label); + + } else + log_device_error(dev, "SECLABEL: unknown subsystem, ignoring '%s'='%s'", name, label); + } + + /* set the defaults */ + if (!selinux) + (void) mac_selinux_fix_full(node_fd, NULL, devnode, LABEL_IGNORE_ENOENT); + if (!smack) + (void) mac_smack_apply_fd(node_fd, SMACK_ATTR_ACCESS, NULL); + } + + /* always update timestamp when we re-use the node, like on media change events */ + r = futimens_opath(node_fd, NULL); + if (r < 0) + log_device_debug_errno(dev, r, "Failed to adjust timestamp of node %s: %m", devnode); + + return 0; +} + +int udev_node_apply_permissions( + sd_device *dev, + bool apply_mac, + mode_t mode, + uid_t uid, + gid_t gid, + OrderedHashmap *seclabel_list) { + + const char *devnode; + _cleanup_close_ int node_fd = -EBADF; + int r; + + assert(dev); + + r = sd_device_get_devname(dev, &devnode); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get devname: %m"); + + node_fd = sd_device_open(dev, O_PATH|O_CLOEXEC); + if (node_fd < 0) { + if (ERRNO_IS_DEVICE_ABSENT(node_fd)) { + log_device_debug_errno(dev, node_fd, "Device node %s is missing, skipping handling.", devnode); + return 0; /* This is necessarily racey, so ignore missing the device */ + } + + return log_device_debug_errno(dev, node_fd, "Cannot open node %s: %m", devnode); + } + + return udev_node_apply_permissions_impl(dev, node_fd, devnode, apply_mac, mode, uid, gid, seclabel_list); +} + +int static_node_apply_permissions( + const char *name, + mode_t mode, + uid_t uid, + gid_t gid, + char **tags) { + + _cleanup_free_ char *unescaped_filename = NULL; + _cleanup_close_ int node_fd = -EBADF; + const char *devnode; + struct stat stats; + int r; + + assert(name); + + if (uid == UID_INVALID && gid == GID_INVALID && mode == MODE_INVALID && !tags) + return 0; + + devnode = strjoina("/dev/", name); + + node_fd = 
open(devnode, O_PATH|O_CLOEXEC); + if (node_fd < 0) { + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", devnode); + return 0; + } + + if (fstat(node_fd, &stats) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", devnode); + + if (!S_ISBLK(stats.st_mode) && !S_ISCHR(stats.st_mode)) { + log_warning("%s is neither block nor character device, ignoring.", devnode); + return 0; + } + + if (!strv_isempty(tags)) { + unescaped_filename = xescape(name, "/."); + if (!unescaped_filename) + return log_oom(); + } + + /* export the tags to a directory as symlinks, allowing otherwise dead nodes to be tagged */ + STRV_FOREACH(t, tags) { + _cleanup_free_ char *p = NULL; + + p = path_join("/run/udev/static_node-tags/", *t, unescaped_filename); + if (!p) + return log_oom(); + + r = mkdir_parents(p, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create parent directory for %s: %m", p); + + r = symlink(devnode, p); + if (r < 0 && errno != EEXIST) + return log_error_errno(errno, "Failed to create symlink %s -> %s: %m", p, devnode); + } + + return udev_node_apply_permissions_impl(NULL, node_fd, devnode, false, mode, uid, gid, NULL); +} diff --git a/src/udev/udev-node.h b/src/udev/udev-node.h new file mode 100644 index 0000000..0c545e4 --- /dev/null +++ b/src/udev/udev-node.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include +#include + +#include "sd-device.h" + +#include "hashmap.h" + +int udev_node_apply_permissions( + sd_device *dev, + bool apply_mac, + mode_t mode, + uid_t uid, + gid_t gid, + OrderedHashmap *seclabel_list); +int static_node_apply_permissions( + const char *name, + mode_t mode, + uid_t uid, + gid_t gid, + char **tags); + +int udev_node_remove(sd_device *dev); +int udev_node_update(sd_device *dev, sd_device *dev_old); +int udev_node_cleanup(void); + +size_t udev_node_escape_path(const char *src, char *dest, size_t size); diff --git a/src/udev/udev-rules.c 
b/src/udev/udev-rules.c new file mode 100644 index 0000000..5f12002 --- /dev/null +++ b/src/udev/udev-rules.c @@ -0,0 +1,2965 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include + +#include "alloc-util.h" +#include "architecture.h" +#include "conf-files.h" +#include "conf-parser.h" +#include "confidential-virt.h" +#include "constants.h" +#include "device-private.h" +#include "device-util.h" +#include "dirent-util.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "glob-util.h" +#include "list.h" +#include "mkdir.h" +#include "netif-naming-scheme.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "socket-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "strv.h" +#include "strxcpyx.h" +#include "sysctl-util.h" +#include "syslog-util.h" +#include "udev-builtin.h" +#include "udev-event.h" +#include "udev-format.h" +#include "udev-node.h" +#include "udev-rules.h" +#include "udev-spawn.h" +#include "udev-trace.h" +#include "udev-util.h" +#include "user-util.h" +#include "virt.h" + +#define RULES_DIRS ((const char* const*) CONF_PATHS_STRV("udev/rules.d")) + +typedef enum { + OP_MATCH, /* == */ + OP_NOMATCH, /* != */ + OP_ADD, /* += */ + OP_REMOVE, /* -= */ + OP_ASSIGN, /* = */ + OP_ASSIGN_FINAL, /* := */ + _OP_TYPE_MAX, + _OP_TYPE_INVALID = -EINVAL, +} UdevRuleOperatorType; + +typedef enum { + MATCH_TYPE_EMPTY, /* empty string */ + MATCH_TYPE_PLAIN, /* no special characters */ + MATCH_TYPE_PLAIN_WITH_EMPTY, /* no special characters with empty string, e.g., "|foo" */ + MATCH_TYPE_GLOB, /* shell globs ?,*,[] */ + MATCH_TYPE_GLOB_WITH_EMPTY, /* shell globs ?,*,[] with empty string, e.g., "|foo*" */ + MATCH_TYPE_SUBSYSTEM, /* "subsystem", "bus", or "class" */ + _MATCH_TYPE_MAX, + _MATCH_TYPE_INVALID = -EINVAL, +} UdevRuleMatchType; + +typedef enum { + SUBST_TYPE_PLAIN, /* no substitution */ + 
SUBST_TYPE_FORMAT, /* % or $ */ + SUBST_TYPE_SUBSYS, /* "[<SUBSYSTEM>/<KERNEL>]<attribute>" format */ + _SUBST_TYPE_MAX, + _SUBST_TYPE_INVALID = -EINVAL, +} UdevRuleSubstituteType; + +typedef enum { + /* lvalues which take match or nomatch operator */ + TK_M_ACTION, /* string, device_get_action() */ + TK_M_DEVPATH, /* path, sd_device_get_devpath() */ + TK_M_KERNEL, /* string, sd_device_get_sysname() */ + TK_M_DEVLINK, /* strv, sd_device_get_devlink_first(), sd_device_get_devlink_next() */ + TK_M_NAME, /* string, name of network interface */ + TK_M_ENV, /* string, device property, takes key through attribute */ + TK_M_CONST, /* string, system-specific hard-coded constant */ + TK_M_TAG, /* strv, sd_device_get_tag_first(), sd_device_get_tag_next() */ + TK_M_SUBSYSTEM, /* string, sd_device_get_subsystem() */ + TK_M_DRIVER, /* string, sd_device_get_driver() */ + TK_M_ATTR, /* string, takes filename through attribute, sd_device_get_sysattr_value(), udev_resolve_subsys_kernel(), etc. */ + TK_M_SYSCTL, /* string, takes kernel parameter through attribute */ + + /* matches parent parameters */ + TK_M_PARENTS_KERNEL, /* string */ + TK_M_PARENTS_SUBSYSTEM, /* string */ + TK_M_PARENTS_DRIVER, /* string */ + TK_M_PARENTS_ATTR, /* string */ + TK_M_PARENTS_TAG, /* strv */ + + TK_M_TEST, /* path, optionally mode_t can be specified by attribute, test the existence of a file */ + TK_M_PROGRAM, /* string, execute a program */ + TK_M_IMPORT_FILE, /* path */ + TK_M_IMPORT_PROGRAM, /* string, import properties from the result of program */ + TK_M_IMPORT_BUILTIN, /* string, import properties from the result of built-in command */ + TK_M_IMPORT_DB, /* string, import properties from database */ + TK_M_IMPORT_CMDLINE, /* string, kernel command line */ + TK_M_IMPORT_PARENT, /* string, parent property */ + TK_M_RESULT, /* string, result of TK_M_PROGRAM */ + +#define _TK_M_MAX (TK_M_RESULT + 1) +#define _TK_A_MIN _TK_M_MAX + + /* lvalues which take one of assign operators */ + TK_A_OPTIONS_STRING_ESCAPE_NONE, /* no argument 
*/ + TK_A_OPTIONS_STRING_ESCAPE_REPLACE, /* no argument */ + TK_A_OPTIONS_DB_PERSIST, /* no argument */ + TK_A_OPTIONS_INOTIFY_WATCH, /* boolean */ + TK_A_OPTIONS_DEVLINK_PRIORITY, /* int */ + TK_A_OPTIONS_LOG_LEVEL, /* string of log level or "reset" */ + TK_A_OWNER, /* user name */ + TK_A_GROUP, /* group name */ + TK_A_MODE, /* mode string */ + TK_A_OWNER_ID, /* uid_t */ + TK_A_GROUP_ID, /* gid_t */ + TK_A_MODE_ID, /* mode_t */ + TK_A_TAG, /* string */ + TK_A_OPTIONS_STATIC_NODE, /* device path, /dev/... */ + TK_A_SECLABEL, /* string with attribute */ + TK_A_ENV, /* string with attribute */ + TK_A_NAME, /* ifname */ + TK_A_DEVLINK, /* string */ + TK_A_ATTR, /* string with attribute */ + TK_A_SYSCTL, /* string with attribute */ + TK_A_RUN_BUILTIN, /* string */ + TK_A_RUN_PROGRAM, /* string */ + + _TK_TYPE_MAX, + _TK_TYPE_INVALID = -EINVAL, +} UdevRuleTokenType; + +typedef enum { + LINE_HAS_NAME = 1 << 0, /* has NAME= */ + LINE_HAS_DEVLINK = 1 << 1, /* has SYMLINK=, OWNER=, GROUP= or MODE= */ + LINE_HAS_STATIC_NODE = 1 << 2, /* has OPTIONS=static_node */ + LINE_HAS_GOTO = 1 << 3, /* has GOTO= */ + LINE_HAS_LABEL = 1 << 4, /* has LABEL= */ + LINE_UPDATE_SOMETHING = 1 << 5, /* has other TK_A_* or TK_M_IMPORT tokens */ + LINE_IS_REFERENCED = 1 << 6, /* is referenced by GOTO */ +} UdevRuleLineType; + +typedef struct UdevRuleFile UdevRuleFile; +typedef struct UdevRuleLine UdevRuleLine; +typedef struct UdevRuleToken UdevRuleToken; + +struct UdevRuleToken { + UdevRuleTokenType type:8; + UdevRuleOperatorType op:8; + UdevRuleMatchType match_type:8; + UdevRuleSubstituteType attr_subst_type:7; + bool attr_match_remove_trailing_whitespace:1; + const char *value; + void *data; + + UdevRuleLine *rule_line; + LIST_FIELDS(UdevRuleToken, tokens); +}; + +struct UdevRuleLine { + char *line; + unsigned line_number; + UdevRuleLineType type; + + const char *label; + const char *goto_label; + UdevRuleLine *goto_line; + + UdevRuleFile *rule_file; + LIST_HEAD(UdevRuleToken, tokens); + 
/* One rules file: its name, the lines parsed from it, and a back pointer to the owning UdevRules. */
struct UdevRuleFile {
        char *filename;
        /* Bit mask of log levels: log_udev_rule_internal() sets bit (1U << level) whenever a message is
         * logged for this file without a device, i.e. while parsing. */
        unsigned issues; /* used by "udevadm verify" */

        UdevRules *rules;
        LIST_HEAD(UdevRuleLine, rule_lines);
        LIST_FIELDS(UdevRuleFile, rule_files);
};

/* Top-level container for all rules files. */
struct UdevRules {
        ResolveNameTiming resolve_name_timing;
        /* Name -> uid / gid maps filled by rule_resolve_user() and rule_resolve_group(). */
        Hashmap *known_users;
        Hashmap *known_groups;
        Hashmap *stats_by_path; /* NOTE(review): not used in this part of the file; purpose not visible here. */
        LIST_HEAD(UdevRuleFile, rule_files);
};

/* Walks line -> rule_file -> rules, passing every pointer on the way through ASSERT_PTR(). */
#define LINE_GET_RULES(line)                                            \
        ASSERT_PTR(ASSERT_PTR(ASSERT_PTR(line)->rule_file)->rules)

/*** Logging helpers ***/

/* Common backend of all helpers below. Prefixes the message with "<filename>:<line_nr> " and forwards
 * it to log_device_full_errno_zerook(). When no device is given but a file is, the level is also
 * recorded in the file's 'issues' mask. 'level', 'device' and 'file' are evaluated exactly once;
 * 'file' may be NULL, in which case the filename is passed through strna(). */
#define log_udev_rule_internal(device, file, line_nr, level, error, fmt, ...) \
        ({                                                              \
                int _lv = (level);                                      \
                sd_device *_dev = (device);                             \
                UdevRuleFile *_f = (file);                              \
                const char *_n = _f ? _f->filename : NULL;              \
                                                                        \
                if (!_dev && _f)                                        \
                        _f->issues |= (1U << _lv);                      \
                                                                        \
                log_device_full_errno_zerook(                           \
                                _dev, _lv, error, "%s:%u " fmt,         \
                                strna(_n), line_nr,                     \
                                ##__VA_ARGS__);                         \
        })

/* Mainly used when applying tokens to the event device. */
/* Derives file and line number from the token's rule line; a NULL token (or a token without a rule
 * line) yields a NULL file and line number 0. */
#define log_event_full_errno_zerook(device, token, ...)                 \
        ({                                                              \
                UdevRuleToken *_t = (token);                            \
                UdevRuleLine *_l = _t ? _t->rule_line : NULL;           \
                                                                        \
                log_udev_rule_internal(                                 \
                                device,                                 \
                                _l ? _l->rule_file : NULL,              \
                                _l ? _l->line_number : 0,               \
                                __VA_ARGS__);                           \
        })

/* Same as above, but the error must be non-zero (checked with ASSERT_NON_ZERO()). */
#define log_event_full_errno(device, token, level, error, ...)          \
        ({                                                              \
                int _error = (error);                                   \
                ASSERT_NON_ZERO(_error);                                \
                log_event_full_errno_zerook(                            \
                                device, token, level, _error, ##__VA_ARGS__); \
        })

/* Error-less variant: passes 0 as error and discards the result. */
#define log_event_full(device, token, level, ...)   (void) log_event_full_errno_zerook(device, token, level, 0, __VA_ARGS__)

/* Per-level shorthands. */
#define log_event_debug(device, token, ...)   log_event_full(device, token, LOG_DEBUG, __VA_ARGS__)
#define log_event_info(device, token, ...)    log_event_full(device, token, LOG_INFO, __VA_ARGS__)
#define log_event_notice(device, token, ...)  log_event_full(device, token, LOG_NOTICE, __VA_ARGS__)
#define log_event_warning(device, token, ...) log_event_full(device, token, LOG_WARNING, __VA_ARGS__)
#define log_event_error(device, token, ...)   log_event_full(device, token, LOG_ERR, __VA_ARGS__)

#define log_event_debug_errno(device, token, error, ...)   log_event_full_errno(device, token, LOG_DEBUG, error, __VA_ARGS__)
#define log_event_info_errno(device, token, error, ...)    log_event_full_errno(device, token, LOG_INFO, error, __VA_ARGS__)
#define log_event_notice_errno(device, token, error, ...)  log_event_full_errno(device, token, LOG_NOTICE, error, __VA_ARGS__)
#define log_event_warning_errno(device, token, error, ...) log_event_full_errno(device, token, LOG_WARNING, error, __VA_ARGS__)
#define log_event_error_errno(device, token, error, ...)   log_event_full_errno(device, token, LOG_ERR, error, __VA_ARGS__)

/* Mainly used when parsing .rules files. */
/* No device is passed here, so every message also updates the file's 'issues' mask (if a file is given). */
#define log_file_full_errno_zerook(...)                                 \
        log_udev_rule_internal(NULL, __VA_ARGS__)

#define log_file_error(file, line_nr, ...)                              \
        log_file_full_errno_zerook(file, line_nr, LOG_ERR, 0, __VA_ARGS__)

/* Line based variants: file and line number are taken from the UdevRuleLine; a NULL line yields a
 * NULL file and line number 0. */
#define log_line_full_errno_zerook(line, ...)                           \
        ({                                                              \
                UdevRuleLine *_l = (line);                              \
                log_file_full_errno_zerook(                             \
                                _l ? _l->rule_file : NULL,              \
                                _l ? _l->line_number : 0,               \
                                __VA_ARGS__);                           \
        })

/* Same as above, but the error must be non-zero (checked with ASSERT_NON_ZERO()). */
#define log_line_full_errno(line, level, error, ...)                    \
        ({                                                              \
                int _error = (error);                                   \
                ASSERT_NON_ZERO(_error);                                \
                log_line_full_errno_zerook(                             \
                                line, level, _error, ##__VA_ARGS__);    \
        })

/* Error-less variant: passes 0 as error and discards the result. */
#define log_line_full(line, level, ...)  (void) log_line_full_errno_zerook(line, level, 0, __VA_ARGS__)

/* Per-level shorthands. */
#define log_line_debug(line, ...)   log_line_full(line, LOG_DEBUG, __VA_ARGS__)
#define log_line_info(line, ...)    log_line_full(line, LOG_INFO, __VA_ARGS__)
#define log_line_notice(line, ...)  log_line_full(line, LOG_NOTICE, __VA_ARGS__)
#define log_line_warning(line, ...) log_line_full(line, LOG_WARNING, __VA_ARGS__)
#define log_line_error(line, ...)   log_line_full(line, LOG_ERR, __VA_ARGS__)

#define log_line_debug_errno(line, error, ...)   log_line_full_errno(line, LOG_DEBUG, error, __VA_ARGS__)
#define log_line_info_errno(line, error, ...)    log_line_full_errno(line, LOG_INFO, error, __VA_ARGS__)
#define log_line_notice_errno(line, error, ...)  log_line_full_errno(line, LOG_NOTICE, error, __VA_ARGS__)
#define log_line_warning_errno(line, error, ...) log_line_full_errno(line, LOG_WARNING, error, __VA_ARGS__)
#define log_line_error_errno(line, error, ...)   log_line_full_errno(line, LOG_ERR, error, __VA_ARGS__)

/* Canned LOG_ERR messages with a synthetic EINVAL. The parser uses the value of these expressions
 * directly as its return value ("return log_line_invalid_op(...)"). */
#define _log_line_invalid_token(line, key, type)                        \
        log_line_error_errno(line, SYNTHETIC_ERRNO(EINVAL),             \
                             "Invalid %s for %s.", type, key)

#define log_line_invalid_op(line, key)   _log_line_invalid_token(line, key, "operator")
#define log_line_invalid_attr(line, key) _log_line_invalid_token(line, key, "attribute")

/* 'offset' is printed with %zu, so callers must pass a size_t. */
#define log_line_invalid_attr_format(line, key, attr, offset, hint)     \
        log_line_error_errno(line, SYNTHETIC_ERRNO(EINVAL),             \
                             "Invalid attribute \"%s\" for %s (char %zu: %s), ignoring.", \
                             attr, key, offset, hint)
#define log_line_invalid_value(line, key, value, offset, hint)          \
        log_line_error_errno(line, SYNTHETIC_ERRNO(EINVAL),             \
                             "Invalid value \"%s\" for %s (char %zu: %s), ignoring.", \
                             value, key, offset, hint)

/* Logs at LOG_ERR that a user or group name could not be resolved.
 *
 * dev:    device to log for, or NULL while parsing (then the file's 'issues' mask is updated too)
 * line:   rule line the name came from; must not be NULL
 * error:  non-zero error; either sign is accepted for the ENOENT/ESRCH test because abs() is applied
 * entity: "user" or "group", inserted into the message
 * name:   the name that failed to resolve
 *
 * ENOENT and ESRCH are reported as "Unknown ..."; any other error is reported with its %m text. */
static void log_unknown_owner(sd_device *dev, UdevRuleLine *line, int error, const char *entity, const char *name) {
        assert(line);
        ASSERT_NON_ZERO(error);

        if (IN_SET(abs(error), ENOENT, ESRCH))
                log_udev_rule_internal(dev, line->rule_file, line->line_number, LOG_ERR, error,
                                       "Unknown %s '%s', ignoring.", entity, name);
        else
                log_udev_rule_internal(dev, line->rule_file, line->line_number, LOG_ERR, error,
                                       "Failed to resolve %s '%s', ignoring: %m", entity, name);
}
%s key does not match.", + what, format, key); + else + log_event_warning(dev, token, + "The %s is truncated while substituting into '%s', " + "refusing to apply the %s key.", + what, format, key); +} + +/*** Other functions ***/ + +static UdevRuleToken *udev_rule_token_free(UdevRuleToken *token) { + if (!token) + return NULL; + + if (token->rule_line) + LIST_REMOVE(tokens, token->rule_line->tokens, token); + + return mfree(token); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(UdevRuleToken*, udev_rule_token_free); + +static void udev_rule_line_clear_tokens(UdevRuleLine *rule_line) { + assert(rule_line); + + LIST_FOREACH(tokens, i, rule_line->tokens) + udev_rule_token_free(i); +} + +static UdevRuleLine *udev_rule_line_free(UdevRuleLine *rule_line) { + if (!rule_line) + return NULL; + + udev_rule_line_clear_tokens(rule_line); + + if (rule_line->rule_file) + LIST_REMOVE(rule_lines, rule_line->rule_file->rule_lines, rule_line); + + free(rule_line->line); + return mfree(rule_line); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(UdevRuleLine*, udev_rule_line_free); + +static UdevRuleFile *udev_rule_file_free(UdevRuleFile *rule_file) { + if (!rule_file) + return NULL; + + LIST_FOREACH(rule_lines, i, rule_file->rule_lines) + udev_rule_line_free(i); + + if (rule_file->rules) + LIST_REMOVE(rule_files, rule_file->rules->rule_files, rule_file); + + free(rule_file->filename); + return mfree(rule_file); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(UdevRuleFile*, udev_rule_file_free); + +UdevRules *udev_rules_free(UdevRules *rules) { + if (!rules) + return NULL; + + LIST_FOREACH(rule_files, i, rules->rule_files) + udev_rule_file_free(i); + + hashmap_free_free_key(rules->known_users); + hashmap_free_free_key(rules->known_groups); + hashmap_free(rules->stats_by_path); + return mfree(rules); +} + +static int rule_resolve_user(UdevRuleLine *rule_line, const char *name, uid_t *ret) { + Hashmap **known_users = &LINE_GET_RULES(rule_line)->known_users; + _cleanup_free_ char *n = NULL; + uid_t uid; + void *val; + int r; + 
+ assert(name); + assert(ret); + + val = hashmap_get(*known_users, name); + if (val) { + *ret = PTR_TO_UID(val); + return 0; + } + + r = get_user_creds(&name, &uid, NULL, NULL, NULL, USER_CREDS_ALLOW_MISSING); + if (r < 0) { + log_unknown_owner(NULL, rule_line, r, "user", name); + *ret = UID_INVALID; + return 0; + } + + n = strdup(name); + if (!n) + return -ENOMEM; + + r = hashmap_ensure_put(known_users, &string_hash_ops, n, UID_TO_PTR(uid)); + if (r < 0) + return r; + + TAKE_PTR(n); + *ret = uid; + return 0; +} + +static int rule_resolve_group(UdevRuleLine *rule_line, const char *name, gid_t *ret) { + Hashmap **known_groups = &LINE_GET_RULES(rule_line)->known_groups; + _cleanup_free_ char *n = NULL; + gid_t gid; + void *val; + int r; + + assert(name); + assert(ret); + + val = hashmap_get(*known_groups, name); + if (val) { + *ret = PTR_TO_GID(val); + return 0; + } + + r = get_group_creds(&name, &gid, USER_CREDS_ALLOW_MISSING); + if (r < 0) { + log_unknown_owner(NULL, rule_line, r, "group", name); + *ret = GID_INVALID; + return 0; + } + + n = strdup(name); + if (!n) + return -ENOMEM; + + r = hashmap_ensure_put(known_groups, &string_hash_ops, n, GID_TO_PTR(gid)); + if (r < 0) + return r; + + TAKE_PTR(n); + *ret = gid; + return 0; +} + +static UdevRuleSubstituteType rule_get_substitution_type(const char *str) { + assert(str); + + if (str[0] == '[') + return SUBST_TYPE_SUBSYS; + if (strchr(str, '%') || strchr(str, '$')) + return SUBST_TYPE_FORMAT; + return SUBST_TYPE_PLAIN; +} + +static bool type_has_nulstr_value(UdevRuleTokenType type) { + return type < TK_M_TEST || type == TK_M_RESULT; +} + +static int rule_line_add_token(UdevRuleLine *rule_line, UdevRuleTokenType type, UdevRuleOperatorType op, char *value, void *data) { + _cleanup_(udev_rule_token_freep) UdevRuleToken *token = NULL; + UdevRuleMatchType match_type = _MATCH_TYPE_INVALID; + UdevRuleSubstituteType subst_type = _SUBST_TYPE_INVALID; + bool remove_trailing_whitespace = false; + size_t len; + + 
assert(rule_line); + assert(type >= 0 && type < _TK_TYPE_MAX); + assert(op >= 0 && op < _OP_TYPE_MAX); + + if (type < _TK_M_MAX) { + assert(value); + assert(IN_SET(op, OP_MATCH, OP_NOMATCH)); + + if (type == TK_M_SUBSYSTEM && STR_IN_SET(value, "subsystem", "bus", "class")) + match_type = MATCH_TYPE_SUBSYSTEM; + else if (isempty(value)) + match_type = MATCH_TYPE_EMPTY; + else if (streq(value, "?*")) { + /* Convert KEY=="?*" -> KEY!="" */ + match_type = MATCH_TYPE_EMPTY; + op = op == OP_MATCH ? OP_NOMATCH : OP_MATCH; + } else if (string_is_glob(value)) + match_type = MATCH_TYPE_GLOB; + else + match_type = MATCH_TYPE_PLAIN; + + if (type_has_nulstr_value(type)) { + /* Convert value string to nulstr. */ + bool bar = true, empty = false; + char *a, *b; + + for (a = b = value; *a != '\0'; a++) { + if (*a != '|') { + *b++ = *a; + bar = false; + } else { + if (bar) + empty = true; + else + *b++ = '\0'; + bar = true; + } + } + *b = '\0'; + + /* Make sure the value is end, so NULSTR_FOREACH can read correct match */ + if (b < a) + b[1] = '\0'; + + if (bar) + empty = true; + + if (empty) { + if (match_type == MATCH_TYPE_GLOB) + match_type = MATCH_TYPE_GLOB_WITH_EMPTY; + if (match_type == MATCH_TYPE_PLAIN) + match_type = MATCH_TYPE_PLAIN_WITH_EMPTY; + } + } + } + + if (IN_SET(type, TK_M_ATTR, TK_M_PARENTS_ATTR)) { + assert(value); + assert(data); + + len = strlen(value); + if (len > 0 && !isspace(value[len - 1])) + remove_trailing_whitespace = true; + + subst_type = rule_get_substitution_type(data); + } + + token = new(UdevRuleToken, 1); + if (!token) + return -ENOMEM; + + *token = (UdevRuleToken) { + .type = type, + .op = op, + .value = value, + .data = data, + .match_type = match_type, + .attr_subst_type = subst_type, + .attr_match_remove_trailing_whitespace = remove_trailing_whitespace, + .rule_line = rule_line, + }; + + LIST_APPEND(tokens, rule_line->tokens, token); + + if (token->type == TK_A_NAME) + SET_FLAG(rule_line->type, LINE_HAS_NAME, true); + + else if 
/* Checks the format specifiers in a value and logs problems; the result of the check is never
 * returned, so callers continue regardless.
 *
 * nonempty: when true, an empty value is reported as well (with character position 0).
 * For format errors the reported position is the offset from udev_check_format() plus one. */
static void check_value_format_and_warn(UdevRuleLine *line, const char *key, const char *value, bool nonempty) {
        size_t offset;
        const char *hint;

        if (nonempty && isempty(value))
                log_line_invalid_value(line, key, value, (size_t) 0, "empty value");
        else if (udev_check_format(value, &offset, &hint) < 0)
                log_line_invalid_value(line, key, value, offset + 1, hint);
}

/* Checks the {attribute} part of a key. An empty or missing attribute is an error and the value of
 * log_line_invalid_attr() is returned; a bad format specifier is only logged and 0 is returned. */
static int check_attr_format_and_warn(UdevRuleLine *line, const char *key, const char *value) {
        size_t offset;
        const char *hint;

        if (isempty(value))
                return log_line_invalid_attr(line, key);
        if (udev_check_format(value, &offset, &hint) < 0)
                log_line_invalid_attr_format(line, key, value, offset + 1, hint);
        return 0;
}

/* Validates one KEY{attr} OP "value" expression and records it on the rule line.
 *
 * key:   key name, e.g. "KERNEL" or "ATTR"
 * attr:  text between '{' and '}', or NULL if the key had no attribute
 * op:    operator as parsed by parse_operator()
 * value: value string; stored by pointer in the token (or as goto_label / label), not copied
 *
 * Returns 1 when a token, GOTO or LABEL was recorded, 0 when the expression was ignored after a
 * warning or debug message, and the result of the respective log_line_*() / log_oom() call on
 * error (presumably a negative errno-style value).
 *
 * Several keys accept an operator that is not strictly valid for them and silently or with a
 * warning fall back to '='; see the individual branches. */
static int parse_token(UdevRuleLine *rule_line, const char *key, char *attr, UdevRuleOperatorType op, char *value) {
        ResolveNameTiming resolve_name_timing = LINE_GET_RULES(rule_line)->resolve_name_timing;
        /* Computed once from the incoming operator. The branches below only rewrite non-match
         * operators into other non-match operators (or, for PROGRAM/IMPORT, do so after the last
         * use of is_match), so it stays accurate. */
        bool is_match = IN_SET(op, OP_MATCH, OP_NOMATCH);
        int r;

        assert(key);
        assert(value);

        if (streq(key, "ACTION")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_ACTION, op, value, NULL);
        } else if (streq(key, "DEVPATH")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_DEVPATH, op, value, NULL);
        } else if (streq(key, "KERNEL")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_KERNEL, op, value, NULL);
        } else if (streq(key, "SYMLINK")) {
                /* Match operators test existing symlinks, all other operators modify them. */
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match) {
                        check_value_format_and_warn(rule_line, key, value, false);
                        r = rule_line_add_token(rule_line, TK_A_DEVLINK, op, value, NULL);
                } else
                        r = rule_line_add_token(rule_line, TK_M_DEVLINK, op, value, NULL);
        } else if (streq(key, "NAME")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (op == OP_ADD) {
                        log_line_warning(rule_line, "%s key takes '==', '!=', '=', or ':=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                if (!is_match) {
                        if (streq(value, "%k"))
                                return log_line_error_errno(rule_line, SYNTHETIC_ERRNO(EINVAL),
                                                            "Ignoring NAME=\"%%k\", as it will take no effect.");
                        if (isempty(value))
                                return log_line_error_errno(rule_line, SYNTHETIC_ERRNO(EINVAL),
                                                            "Ignoring NAME=\"\", as udev will not delete any network interfaces.");
                        check_value_format_and_warn(rule_line, key, value, false);

                        r = rule_line_add_token(rule_line, TK_A_NAME, op, value, NULL);
                } else
                        r = rule_line_add_token(rule_line, TK_M_NAME, op, value, NULL);
        } else if (streq(key, "ENV")) {
                if (isempty(attr))
                        return log_line_invalid_attr(rule_line, key);
                if (op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (op == OP_ASSIGN_FINAL) {
                        log_line_warning(rule_line, "%s key takes '==', '!=', '=', or '+=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                if (!is_match) {
                        /* These property names may be matched against but never assigned. */
                        if (STR_IN_SET(attr,
                                       "ACTION", "DEVLINKS", "DEVNAME", "DEVPATH", "DEVTYPE", "DRIVER",
                                       "IFINDEX", "MAJOR", "MINOR", "SEQNUM", "SUBSYSTEM", "TAGS"))
                                return log_line_error_errno(rule_line, SYNTHETIC_ERRNO(EINVAL),
                                                            "Invalid ENV attribute. '%s' cannot be set.", attr);

                        check_value_format_and_warn(rule_line, key, value, false);

                        r = rule_line_add_token(rule_line, TK_A_ENV, op, value, attr);
                } else
                        r = rule_line_add_token(rule_line, TK_M_ENV, op, value, attr);
        } else if (streq(key, "CONST")) {
                /* Only CONST{arch} and CONST{virt} exist. */
                if (isempty(attr) || !STR_IN_SET(attr, "arch", "virt"))
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);
                r = rule_line_add_token(rule_line, TK_M_CONST, op, value, attr);
        } else if (streq(key, "TAG")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (op == OP_ASSIGN_FINAL) {
                        log_line_warning(rule_line, "%s key takes '==', '!=', '=', or '+=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                if (!is_match) {
                        check_value_format_and_warn(rule_line, key, value, true);

                        r = rule_line_add_token(rule_line, TK_A_TAG, op, value, NULL);
                } else
                        r = rule_line_add_token(rule_line, TK_M_TAG, op, value, NULL);
        } else if (streq(key, "SUBSYSTEM")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                /* Only a warning; the token is still added with the value as written. */
                if (STR_IN_SET(value, "bus", "class"))
                        log_line_warning(rule_line, "\"%s\" must be specified as \"subsystem\".", value);

                r = rule_line_add_token(rule_line, TK_M_SUBSYSTEM, op, value, NULL);
        } else if (streq(key, "DRIVER")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_DRIVER, op, value, NULL);
        } else if (streq(key, "ATTR")) {
                r = check_attr_format_and_warn(rule_line, key, attr);
                if (r < 0)
                        return r;
                if (op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (IN_SET(op, OP_ADD, OP_ASSIGN_FINAL)) {
                        log_line_warning(rule_line, "%s key takes '==', '!=', or '=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                if (!is_match) {
                        check_value_format_and_warn(rule_line, key, value, false);
                        r = rule_line_add_token(rule_line, TK_A_ATTR, op, value, attr);
                } else
                        r = rule_line_add_token(rule_line, TK_M_ATTR, op, value, attr);
        } else if (streq(key, "SYSCTL")) {
                r = check_attr_format_and_warn(rule_line, key, attr);
                if (r < 0)
                        return r;
                if (op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (IN_SET(op, OP_ADD, OP_ASSIGN_FINAL)) {
                        log_line_warning(rule_line, "%s key takes '==', '!=', or '=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                if (!is_match) {
                        check_value_format_and_warn(rule_line, key, value, false);
                        r = rule_line_add_token(rule_line, TK_A_SYSCTL, op, value, attr);
                } else
                        r = rule_line_add_token(rule_line, TK_M_SYSCTL, op, value, attr);
        } else if (streq(key, "KERNELS")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_PARENTS_KERNEL, op, value, NULL);
        } else if (streq(key, "SUBSYSTEMS")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_PARENTS_SUBSYSTEM, op, value, NULL);
        } else if (streq(key, "DRIVERS")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_PARENTS_DRIVER, op, value, NULL);
        } else if (streq(key, "ATTRS")) {
                /* check_attr_format_and_warn() has rejected an empty attr, so it is non-NULL below. */
                r = check_attr_format_and_warn(rule_line, key, attr);
                if (r < 0)
                        return r;
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                if (startswith(attr, "device/"))
                        log_line_warning(rule_line, "'device' link may not be available in future kernels.");
                if (strstr(attr, "../"))
                        log_line_warning(rule_line, "Direct reference to parent sysfs directory, may break in future kernels.");

                r = rule_line_add_token(rule_line, TK_M_PARENTS_ATTR, op, value, attr);
        } else if (streq(key, "TAGS")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_PARENTS_TAG, op, value, NULL);
        } else if (streq(key, "TEST")) {
                /* The optional attribute is a file mode; MODE_INVALID is stored when it is absent. */
                mode_t mode = MODE_INVALID;

                if (!isempty(attr)) {
                        r = parse_mode(attr, &mode);
                        if (r < 0)
                                return log_line_error_errno(rule_line, r, "Failed to parse mode '%s': %m", attr);
                }
                check_value_format_and_warn(rule_line, key, value, true);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_TEST, op, value, MODE_TO_PTR(mode));
        } else if (streq(key, "PROGRAM")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                check_value_format_and_warn(rule_line, key, value, true);
                if (op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                /* Any remaining non-match operator is treated like '=='. */
                if (!is_match)
                        op = OP_MATCH;

                r = rule_line_add_token(rule_line, TK_M_PROGRAM, op, value, NULL);
        } else if (streq(key, "IMPORT")) {
                if (isempty(attr))
                        return log_line_invalid_attr(rule_line, key);
                check_value_format_and_warn(rule_line, key, value, true);
                if (op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                /* Any remaining non-match operator is treated like '=='. */
                if (!is_match)
                        op = OP_MATCH;

                if (streq(attr, "file"))
                        r = rule_line_add_token(rule_line, TK_M_IMPORT_FILE, op, value, NULL);
                else if (streq(attr, "program")) {
                        UdevBuiltinCommand cmd;

                        /* A program that is also known as a builtin is imported as the builtin. */
                        cmd = udev_builtin_lookup(value);
                        if (cmd >= 0) {
                                log_line_debug(rule_line, "Found builtin command '%s' for %s, replacing attribute.", value, key);
                                r = rule_line_add_token(rule_line, TK_M_IMPORT_BUILTIN, op, value, UDEV_BUILTIN_CMD_TO_PTR(cmd));
                        } else
                                r = rule_line_add_token(rule_line, TK_M_IMPORT_PROGRAM, op, value, NULL);
                } else if (streq(attr, "builtin")) {
                        UdevBuiltinCommand cmd;

                        cmd = udev_builtin_lookup(value);
                        if (cmd < 0)
                                return log_line_error_errno(rule_line, SYNTHETIC_ERRNO(EINVAL),
                                                            "Unknown builtin command: %s", value);
                        r = rule_line_add_token(rule_line, TK_M_IMPORT_BUILTIN, op, value, UDEV_BUILTIN_CMD_TO_PTR(cmd));
                } else if (streq(attr, "db"))
                        r = rule_line_add_token(rule_line, TK_M_IMPORT_DB, op, value, NULL);
                else if (streq(attr, "cmdline"))
                        r = rule_line_add_token(rule_line, TK_M_IMPORT_CMDLINE, op, value, NULL);
                else if (streq(attr, "parent"))
                        r = rule_line_add_token(rule_line, TK_M_IMPORT_PARENT, op, value, NULL);
                else
                        return log_line_invalid_attr(rule_line, key);
        } else if (streq(key, "RESULT")) {
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (!is_match)
                        return log_line_invalid_op(rule_line, key);

                r = rule_line_add_token(rule_line, TK_M_RESULT, op, value, NULL);
        } else if (streq(key, "OPTIONS")) {
                char *tmp;

                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (is_match || op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                /* '+=' is accepted silently and treated as '='. */
                if (op == OP_ADD)
                        op = OP_ASSIGN;

                /* Most options carry their setting in the token's data pointer and store no value. */
                if (streq(value, "string_escape=none"))
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_STRING_ESCAPE_NONE, op, NULL, NULL);
                else if (streq(value, "string_escape=replace"))
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_STRING_ESCAPE_REPLACE, op, NULL, NULL);
                else if (streq(value, "db_persist"))
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_DB_PERSIST, op, NULL, NULL);
                else if (streq(value, "watch"))
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_INOTIFY_WATCH, op, NULL, INT_TO_PTR(1));
                else if (streq(value, "nowatch"))
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_INOTIFY_WATCH, op, NULL, INT_TO_PTR(0));
                else if ((tmp = startswith(value, "static_node=")))
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_STATIC_NODE, op, tmp, NULL);
                else if ((tmp = startswith(value, "link_priority="))) {
                        int prio;

                        r = safe_atoi(tmp, &prio);
                        if (r < 0)
                                return log_line_error_errno(rule_line, r, "Failed to parse link priority '%s': %m", tmp);
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_DEVLINK_PRIORITY, op, NULL, INT_TO_PTR(prio));
                } else if ((tmp = startswith(value, "log_level="))) {
                        int level;

                        /* "reset" is stored as -1. */
                        if (streq(tmp, "reset"))
                                level = -1;
                        else {
                                level = log_level_from_string(tmp);
                                if (level < 0)
                                        return log_line_error_errno(rule_line, level, "Failed to parse log level '%s': %m", tmp);
                        }
                        r = rule_line_add_token(rule_line, TK_A_OPTIONS_LOG_LEVEL, op, NULL, INT_TO_PTR(level));
                } else {
                        log_line_warning(rule_line, "Invalid value for OPTIONS key, ignoring: '%s'", value);
                        return 0;
                }
        } else if (streq(key, "OWNER")) {
                uid_t uid;

                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (is_match || op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (op == OP_ADD) {
                        log_line_warning(rule_line, "%s key takes '=' or ':=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                /* Three cases: a numeric UID is stored right away; with early resolution a plain
                 * name (no substitution) is resolved now; otherwise the name is kept as a string
                 * token unless name resolution is disabled altogether. */
                if (parse_uid(value, &uid) >= 0)
                        r = rule_line_add_token(rule_line, TK_A_OWNER_ID, op, NULL, UID_TO_PTR(uid));
                else if (resolve_name_timing == RESOLVE_NAME_EARLY &&
                           rule_get_substitution_type(value) == SUBST_TYPE_PLAIN) {
                        r = rule_resolve_user(rule_line, value, &uid);
                        if (r < 0)
                                return log_line_error_errno(rule_line, r, "Failed to resolve user name '%s': %m", value);

                        r = rule_line_add_token(rule_line, TK_A_OWNER_ID, op, NULL, UID_TO_PTR(uid));
                } else if (resolve_name_timing != RESOLVE_NAME_NEVER) {
                        check_value_format_and_warn(rule_line, key, value, true);
                        r = rule_line_add_token(rule_line, TK_A_OWNER, op, value, NULL);
                } else {
                        log_line_debug(rule_line, "User name resolution is disabled, ignoring %s=\"%s\".", key, value);
                        return 0;
                }
        } else if (streq(key, "GROUP")) {
                gid_t gid;

                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (is_match || op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (op == OP_ADD) {
                        log_line_warning(rule_line, "%s key takes '=' or ':=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                /* Same three cases as for OWNER above. */
                if (parse_gid(value, &gid) >= 0)
                        r = rule_line_add_token(rule_line, TK_A_GROUP_ID, op, NULL, GID_TO_PTR(gid));
                else if (resolve_name_timing == RESOLVE_NAME_EARLY &&
                           rule_get_substitution_type(value) == SUBST_TYPE_PLAIN) {
                        r = rule_resolve_group(rule_line, value, &gid);
                        if (r < 0)
                                return log_line_error_errno(rule_line, r, "Failed to resolve group name '%s': %m", value);

                        r = rule_line_add_token(rule_line, TK_A_GROUP_ID, op, NULL, GID_TO_PTR(gid));
                } else if (resolve_name_timing != RESOLVE_NAME_NEVER) {
                        check_value_format_and_warn(rule_line, key, value, true);
                        r = rule_line_add_token(rule_line, TK_A_GROUP, op, value, NULL);
                } else {
                        log_line_debug(rule_line, "Resolving group name is disabled, ignoring GROUP=\"%s\".", value);
                        return 0;
                }
        } else if (streq(key, "MODE")) {
                mode_t mode;

                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (is_match || op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (op == OP_ADD) {
                        log_line_warning(rule_line, "%s key takes '=' or ':=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                /* A value that parses as a mode is stored numerically, anything else as a string. */
                if (parse_mode(value, &mode) >= 0)
                        r = rule_line_add_token(rule_line, TK_A_MODE_ID, op, NULL, MODE_TO_PTR(mode));
                else {
                        check_value_format_and_warn(rule_line, key, value, true);
                        r = rule_line_add_token(rule_line, TK_A_MODE, op, value, NULL);
                }
        } else if (streq(key, "SECLABEL")) {
                if (isempty(attr))
                        return log_line_invalid_attr(rule_line, key);
                check_value_format_and_warn(rule_line, key, value, true);
                if (is_match || op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                if (op == OP_ASSIGN_FINAL) {
                        log_line_warning(rule_line, "%s key takes '=' or '+=' operator, assuming '='.", key);
                        op = OP_ASSIGN;
                }

                r = rule_line_add_token(rule_line, TK_A_SECLABEL, op, value, attr);
        } else if (streq(key, "RUN")) {
                if (is_match || op == OP_REMOVE)
                        return log_line_invalid_op(rule_line, key);
                check_value_format_and_warn(rule_line, key, value, true);
                /* RUN without attribute means RUN{program}. */
                if (!attr || streq(attr, "program"))
                        r = rule_line_add_token(rule_line, TK_A_RUN_PROGRAM, op, value, NULL);
                else if (streq(attr, "builtin")) {
                        UdevBuiltinCommand cmd;

                        cmd = udev_builtin_lookup(value);
                        if (cmd < 0)
                                return log_line_error_errno(rule_line, SYNTHETIC_ERRNO(EINVAL),
                                                            "Unknown builtin command '%s', ignoring.", value);
                        r = rule_line_add_token(rule_line, TK_A_RUN_BUILTIN, op, value, UDEV_BUILTIN_CMD_TO_PTR(cmd));
                } else
                        return log_line_invalid_attr(rule_line, key);
        } else if (streq(key, "GOTO")) {
                /* GOTO and LABEL are stored on the line itself, not as tokens. For GOTO the first
                 * occurrence wins. */
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (op != OP_ASSIGN)
                        return log_line_invalid_op(rule_line, key);
                if (FLAGS_SET(rule_line->type, LINE_HAS_GOTO)) {
                        log_line_warning(rule_line, "Contains multiple GOTO keys, ignoring GOTO=\"%s\".", value);
                        return 0;
                }

                rule_line->goto_label = value;
                SET_FLAG(rule_line->type, LINE_HAS_GOTO, true);
                return 1;
        } else if (streq(key, "LABEL")) {
                /* Unlike GOTO, the last LABEL wins: the warning names the label being replaced. */
                if (attr)
                        return log_line_invalid_attr(rule_line, key);
                if (op != OP_ASSIGN)
                        return log_line_invalid_op(rule_line, key);
                if (FLAGS_SET(rule_line->type, LINE_HAS_LABEL))
                        log_line_warning(rule_line, "Contains multiple LABEL keys, ignoring LABEL=\"%s\".",
                                         rule_line->label);

                rule_line->label = value;
                SET_FLAG(rule_line->type, LINE_HAS_LABEL, true);
                return 1;
        } else
                return log_line_error_errno(rule_line, SYNTHETIC_ERRNO(EINVAL), "Invalid key '%s'.", key);
        /* Only the result of rule_line_add_token() reaches this point; any failure is reported as
         * out-of-memory. */
        if (r < 0)
                return log_oom();

        return 1;
}

/* Maps the operator at the start of 'op' to its enum value, or _OP_TYPE_INVALID if none matches.
 * Only a prefix is examined, so trailing text is allowed. "==" has to be tested before "=",
 * because "=" is a prefix of "==". */
static UdevRuleOperatorType parse_operator(const char *op) {
        assert(op);

        if (startswith(op, "=="))
                return OP_MATCH;
        if (startswith(op, "!="))
                return OP_NOMATCH;
        if (startswith(op, "+="))
                return OP_ADD;
        if (startswith(op, "-="))
                return OP_REMOVE;
        if (startswith(op, "="))
                return OP_ASSIGN;
        if (startswith(op, ":="))
                return OP_ASSIGN_FINAL;

        return _OP_TYPE_INVALID;
}
ws_before_comma = false, ws_after_comma = false; + const char *p; + + for (p = line; !isempty(p); ++p) { + if (*p == ',') + ++n_comma; + else if (strchr(WHITESPACE, *p)) { + if (n_comma > 0) + ws_after_comma = true; + else + ws_before_comma = true; + } else + break; + } + + if (line == rule_line->line) { + /* this is the first token of the rule */ + if (n_comma > 0) + log_line_notice(rule_line, "style: stray leading comma."); + } else if (isempty(p)) { + /* there are no more tokens in the rule */ + if (n_comma > 0) + log_line_notice(rule_line, "style: stray trailing comma."); + } else { + /* single comma is expected */ + if (n_comma == 0) + log_line_notice(rule_line, "style: a comma between tokens is expected."); + else if (n_comma > 1) + log_line_notice(rule_line, "style: more than one comma between tokens."); + + /* whitespace after comma is expected */ + if (n_comma > 0) { + if (ws_before_comma) + log_line_notice(rule_line, "style: stray whitespace before comma."); + if (!ws_after_comma) + log_line_notice(rule_line, "style: whitespace after comma is expected."); + } else if (!ws_before_comma && !ws_after_comma) + log_line_notice(rule_line, "style: whitespace between tokens is expected."); + } +} + +int udev_rule_parse_value(char *str, char **ret_value, char **ret_endpos) { + char *i, *j; + bool is_escaped; + + /* value must be double quotated */ + is_escaped = str[0] == 'e'; + str += is_escaped; + if (str[0] != '"') + return -EINVAL; + + if (!is_escaped) { + /* unescape double quotation '\"'->'"' */ + for (j = str, i = str + 1; *i != '"'; i++, j++) { + if (*i == '\0') + return -EINVAL; + if (i[0] == '\\' && i[1] == '"') + i++; + *j = *i; + } + j[0] = '\0'; + /* + * The return value must be terminated by two subsequent NULs + * so it could be safely interpreted as nulstr. 
+ */ + j[1] = '\0'; + } else { + _cleanup_free_ char *unescaped = NULL; + ssize_t l; + + /* find the end position of value */ + for (i = str + 1; *i != '"'; i++) { + if (i[0] == '\\') + i++; + if (*i == '\0') + return -EINVAL; + } + i[0] = '\0'; + + l = cunescape_length(str + 1, i - (str + 1), 0, &unescaped); + if (l < 0) + return l; + + assert(l <= i - (str + 1)); + memcpy(str, unescaped, l + 1); + /* + * The return value must be terminated by two subsequent NULs + * so it could be safely interpreted as nulstr. + */ + str[l + 1] = '\0'; + } + + *ret_value = str; + *ret_endpos = i + 1; + return 0; +} + +static int parse_line(char **line, char **ret_key, char **ret_attr, UdevRuleOperatorType *ret_op, char **ret_value) { + char *key_begin, *key_end, *attr, *tmp; + UdevRuleOperatorType op; + int r; + + assert(line); + assert(*line); + assert(ret_key); + assert(ret_op); + assert(ret_value); + + key_begin = skip_leading_chars(*line, WHITESPACE ","); + + if (isempty(key_begin)) + return 0; + + for (key_end = key_begin; ; key_end++) { + if (key_end[0] == '\0') + return -EINVAL; + if (strchr(WHITESPACE "={", key_end[0])) + break; + if (strchr("+-!:", key_end[0]) && key_end[1] == '=') + break; + } + if (key_end[0] == '{') { + attr = key_end + 1; + tmp = strchr(attr, '}'); + if (!tmp) + return -EINVAL; + *tmp++ = '\0'; + } else { + attr = NULL; + tmp = key_end; + } + + tmp = skip_leading_chars(tmp, NULL); + op = parse_operator(tmp); + if (op < 0) + return -EINVAL; + + key_end[0] = '\0'; + + tmp += op == OP_ASSIGN ? 
1 : 2; + tmp = skip_leading_chars(tmp, NULL); + r = udev_rule_parse_value(tmp, ret_value, line); + if (r < 0) + return r; + + *ret_key = key_begin; + *ret_attr = attr; + *ret_op = op; + return 1; +} + +static void check_tokens_order(UdevRuleLine *rule_line) { + bool has_result = false; + + assert(rule_line); + + LIST_FOREACH(tokens, t, rule_line->tokens) + if (t->type == TK_M_RESULT) + has_result = true; + else if (has_result && t->type == TK_M_PROGRAM) { + log_line_warning(rule_line, "Reordering RESULT check after PROGRAM assignment."); + break; + } +} + +static void sort_tokens(UdevRuleLine *rule_line) { + assert(rule_line); + + UdevRuleToken *old_tokens = TAKE_PTR(rule_line->tokens); + + while (old_tokens) { + UdevRuleToken *min_token = NULL; + + LIST_FOREACH(tokens, t, old_tokens) + if (!min_token || min_token->type > t->type) + min_token = t; + + LIST_REMOVE(tokens, old_tokens, min_token); + LIST_APPEND(tokens, rule_line->tokens, min_token); + } +} + +static int rule_add_line(UdevRuleFile *rule_file, const char *line_str, unsigned line_nr, bool extra_checks) { + _cleanup_(udev_rule_line_freep) UdevRuleLine *rule_line = NULL; + _cleanup_free_ char *line = NULL; + char *p; + int r; + + assert(rule_file); + assert(line_str); + + if (isempty(line_str)) + return 0; + + line = strdup(line_str); + if (!line) + return log_oom(); + + rule_line = new(UdevRuleLine, 1); + if (!rule_line) + return log_oom(); + + *rule_line = (UdevRuleLine) { + .line = TAKE_PTR(line), + .line_number = line_nr, + .rule_file = rule_file, + }; + + LIST_APPEND(rule_lines, rule_file->rule_lines, rule_line); + + for (p = rule_line->line; !isempty(p); ) { + char *key, *attr, *value; + UdevRuleOperatorType op; + + if (extra_checks) + check_token_delimiters(rule_line, p); + + r = parse_line(&p, &key, &attr, &op, &value); + if (r < 0) + return log_line_error_errno(rule_line, r, "Invalid key/value pair, ignoring."); + if (r == 0) + break; + + r = parse_token(rule_line, key, attr, op, value); + if (r < 
0) + return r; + } + + if (rule_line->type == 0) { + log_line_warning(rule_line, "The line has no effect, ignoring."); + return 0; + } + + if (extra_checks) + check_tokens_order(rule_line); + + sort_tokens(rule_line); + TAKE_PTR(rule_line); + return 0; +} + +static void rule_resolve_goto(UdevRuleFile *rule_file) { + assert(rule_file); + + /* link GOTOs to LABEL rules in this file to be able to fast-forward */ + LIST_FOREACH(rule_lines, line, rule_file->rule_lines) { + if (!FLAGS_SET(line->type, LINE_HAS_GOTO)) + continue; + + LIST_FOREACH(rule_lines, i, line->rule_lines_next) + if (streq_ptr(i->label, line->goto_label)) { + line->goto_line = i; + SET_FLAG(i->type, LINE_IS_REFERENCED, true); + break; + } + + if (!line->goto_line) { + log_line_error(line, "GOTO=\"%s\" has no matching label, ignoring.", + line->goto_label); + + SET_FLAG(line->type, LINE_HAS_GOTO, false); + line->goto_label = NULL; + + if ((line->type & ~(LINE_HAS_LABEL|LINE_IS_REFERENCED)) == 0) { + log_line_warning(line, "The line has no effect any more, dropping."); + /* LINE_IS_REFERENCED implies LINE_HAS_LABEL */ + if (line->type & LINE_HAS_LABEL) + udev_rule_line_clear_tokens(line); + else + udev_rule_line_free(line); + } + } + } +} + +static bool token_data_is_string(UdevRuleTokenType type) { + return IN_SET(type, TK_M_ENV, + TK_M_CONST, + TK_M_ATTR, + TK_M_SYSCTL, + TK_M_PARENTS_ATTR, + TK_A_SECLABEL, + TK_A_ENV, + TK_A_ATTR, + TK_A_SYSCTL); +} + +static bool token_type_and_data_eq(const UdevRuleToken *a, const UdevRuleToken *b) { + assert(a); + assert(b); + + return a->type == b->type && + (token_data_is_string(a->type) ? 
streq_ptr(a->data, b->data) : (a->data == b->data)); +} + +static bool nulstr_eq(const char *a, const char *b) { + NULSTR_FOREACH(i, a) + if (!nulstr_contains(b, i)) + return false; + + NULSTR_FOREACH(i, b) + if (!nulstr_contains(a, i)) + return false; + + return true; +} + +static bool token_type_and_value_eq(const UdevRuleToken *a, const UdevRuleToken *b) { + assert(a); + assert(b); + + if (a->type != b->type || + a->match_type != b->match_type) + return false; + + /* token value is ignored for certain match types */ + if (IN_SET(a->match_type, MATCH_TYPE_EMPTY, MATCH_TYPE_SUBSYSTEM)) + return true; + + return type_has_nulstr_value(a->type) ? nulstr_eq(a->value, b->value) : + streq_ptr(a->value, b->value); +} + +static bool conflicting_op(UdevRuleOperatorType a, UdevRuleOperatorType b) { + return (a == OP_MATCH && b == OP_NOMATCH) || + (a == OP_NOMATCH && b == OP_MATCH); +} + +/* test whether all fields besides UdevRuleOperatorType of two tokens match */ +static bool tokens_eq(const UdevRuleToken *a, const UdevRuleToken *b) { + assert(a); + assert(b); + + return a->attr_subst_type == b->attr_subst_type && + a->attr_match_remove_trailing_whitespace == b->attr_match_remove_trailing_whitespace && + token_type_and_value_eq(a, b) && + token_type_and_data_eq(a, b); +} + +static bool nulstr_tokens_conflict(const UdevRuleToken *a, const UdevRuleToken *b) { + assert(a); + assert(b); + + if (!(a->type == b->type && + type_has_nulstr_value(a->type) && + a->op == b->op && + a->op == OP_MATCH && + a->match_type == b->match_type && + a->attr_subst_type == b->attr_subst_type && + a->attr_match_remove_trailing_whitespace == b->attr_match_remove_trailing_whitespace && + token_type_and_data_eq(a, b))) + return false; + + if (a->match_type == MATCH_TYPE_PLAIN) { + NULSTR_FOREACH(i, a->value) + if (nulstr_contains(b->value, i)) + return false; + return true; + } + + if (a->match_type == MATCH_TYPE_GLOB) { + NULSTR_FOREACH(i, a->value) { + size_t i_n = strcspn(i, GLOB_CHARS); + if 
(i_n == 0) + return false; + NULSTR_FOREACH(j, b->value) { + size_t j_n = strcspn(j, GLOB_CHARS); + if (j_n == 0 || strneq(i, j, MIN(i_n, j_n))) + return false; + } + + } + return true; + } + + return false; +} + +static void udev_check_unused_labels(UdevRuleLine *line) { + assert(line); + + if (FLAGS_SET(line->type, LINE_HAS_LABEL) && + !FLAGS_SET(line->type, LINE_IS_REFERENCED)) + log_line_notice(line, "style: LABEL=\"%s\" is unused.", line->label); +} + +static void udev_check_conflicts_duplicates(UdevRuleLine *line) { + assert(line); + + bool conflicts = false, duplicates = false; + + LIST_FOREACH(tokens, token, line->tokens) + LIST_FOREACH(tokens, i, token->tokens_next) { + bool new_conflicts = false, new_duplicates = false; + + if (tokens_eq(token, i)) { + if (!duplicates && token->op == i->op) + new_duplicates = true; + if (!conflicts && conflicting_op(token->op, i->op)) + new_conflicts = true; + } else if (!conflicts && nulstr_tokens_conflict(token, i)) + new_conflicts = true; + else + continue; + + if (new_duplicates) { + duplicates = new_duplicates; + log_line_warning(line, "duplicate expressions."); + } + if (new_conflicts) { + conflicts = new_conflicts; + log_line_error(line, "conflicting match expressions, the line has no effect."); + } + if (conflicts && duplicates) + return; + } +} + +static void udev_check_rule_line(UdevRuleLine *line) { + udev_check_unused_labels(line); + udev_check_conflicts_duplicates(line); +} + +int udev_rules_parse_file(UdevRules *rules, const char *filename, bool extra_checks, UdevRuleFile **ret) { + _cleanup_(udev_rule_file_freep) UdevRuleFile *rule_file = NULL; + _cleanup_free_ char *continuation = NULL, *name = NULL; + _cleanup_fclose_ FILE *f = NULL; + bool ignore_line = false; + unsigned line_nr = 0; + struct stat st; + int r; + + assert(rules); + assert(filename); + + f = fopen(filename, "re"); + if (!f) { + if (extra_checks) + return -errno; + + if (errno == ENOENT) + return 0; + + return log_warning_errno(errno, 
"Failed to open %s, ignoring: %m", filename); + } + + if (fstat(fileno(f), &st) < 0) + return log_warning_errno(errno, "Failed to stat %s, ignoring: %m", filename); + + if (null_or_empty(&st)) { + log_debug("Skipping empty file: %s", filename); + if (ret) + *ret = NULL; + return 0; + } + + r = hashmap_put_stats_by_path(&rules->stats_by_path, filename, &st); + if (r < 0) + return log_warning_errno(errno, "Failed to save stat for %s, ignoring: %m", filename); + + (void) fd_warn_permissions(filename, fileno(f)); + + log_debug("Reading rules file: %s", filename); + + name = strdup(filename); + if (!name) + return log_oom(); + + rule_file = new(UdevRuleFile, 1); + if (!rule_file) + return log_oom(); + + *rule_file = (UdevRuleFile) { + .filename = TAKE_PTR(name), + .rules = rules, + }; + + LIST_APPEND(rule_files, rules->rule_files, rule_file); + + for (;;) { + _cleanup_free_ char *buf = NULL; + size_t len; + char *line; + + r = read_line(f, UDEV_LINE_SIZE, &buf); + if (r < 0) + return r; + if (r == 0) + break; + + line_nr++; + line = skip_leading_chars(buf, NULL); + + /* Lines beginning with '#' are ignored regardless of line continuation. 
*/ + if (line[0] == '#') + continue; + + len = strlen(line); + + if (continuation && !ignore_line) { + if (strlen(continuation) + len >= UDEV_LINE_SIZE) + ignore_line = true; + + if (!strextend(&continuation, line)) + return log_oom(); + + if (!ignore_line) { + line = continuation; + len = strlen(line); + } + } + + if (len > 0 && line[len - 1] == '\\') { + if (ignore_line) + continue; + + line[len - 1] = '\0'; + if (!continuation) { + continuation = strdup(line); + if (!continuation) + return log_oom(); + } + + continue; + } + + if (ignore_line) + log_file_error(rule_file, line_nr, "Line is too long, ignored."); + else if (len > 0) + (void) rule_add_line(rule_file, line, line_nr, extra_checks); + + continuation = mfree(continuation); + ignore_line = false; + } + + if (continuation) + log_file_error(rule_file, line_nr, + "Unexpected EOF after line continuation, line ignored."); + + rule_resolve_goto(rule_file); + + if (extra_checks) + LIST_FOREACH(rule_lines, line, rule_file->rule_lines) + udev_check_rule_line(line); + + if (ret) + *ret = rule_file; + + TAKE_PTR(rule_file); + return 1; +} + +unsigned udev_rule_file_get_issues(UdevRuleFile *rule_file) { + assert(rule_file); + + return rule_file->issues; +} + +UdevRules* udev_rules_new(ResolveNameTiming resolve_name_timing) { + assert(resolve_name_timing >= 0 && resolve_name_timing < _RESOLVE_NAME_TIMING_MAX); + + UdevRules *rules = new(UdevRules, 1); + if (!rules) + return NULL; + + *rules = (UdevRules) { + .resolve_name_timing = resolve_name_timing, + }; + + return rules; +} + +int udev_rules_load(UdevRules **ret_rules, ResolveNameTiming resolve_name_timing) { + _cleanup_(udev_rules_freep) UdevRules *rules = NULL; + _cleanup_strv_free_ char **files = NULL; + int r; + + rules = udev_rules_new(resolve_name_timing); + if (!rules) + return -ENOMEM; + + r = conf_files_list_strv(&files, ".rules", NULL, 0, RULES_DIRS); + if (r < 0) + return log_debug_errno(r, "Failed to enumerate rules files: %m"); + + STRV_FOREACH(f, 
files) { + r = udev_rules_parse_file(rules, *f, /* extra_checks = */ false, NULL); + if (r < 0) + log_debug_errno(r, "Failed to read rules file %s, ignoring: %m", *f); + } + + *ret_rules = TAKE_PTR(rules); + return 0; +} + +bool udev_rules_should_reload(UdevRules *rules) { + _cleanup_hashmap_free_ Hashmap *stats_by_path = NULL; + int r; + + if (!rules) + return true; + + r = config_get_stats_by_path(".rules", NULL, 0, RULES_DIRS, /* check_dropins = */ false, &stats_by_path); + if (r < 0) { + log_warning_errno(r, "Failed to get stats of udev rules, ignoring: %m"); + return true; + } + + if (!stats_by_path_equal(rules->stats_by_path, stats_by_path)) { + log_debug("Udev rules need reloading"); + return true; + } + + return false; +} + +static bool token_match_string(UdevRuleToken *token, const char *str) { + const char *value; + bool match = false; + + assert(token); + assert(token->value); + assert(token->type < _TK_M_MAX); + + str = strempty(str); + value = token->value; + + switch (token->match_type) { + case MATCH_TYPE_EMPTY: + match = isempty(str); + break; + case MATCH_TYPE_SUBSYSTEM: + match = STR_IN_SET(str, "subsystem", "class", "bus"); + break; + case MATCH_TYPE_PLAIN_WITH_EMPTY: + if (isempty(str)) { + match = true; + break; + } + _fallthrough_; + case MATCH_TYPE_PLAIN: + NULSTR_FOREACH(i, value) + if (streq(i, str)) { + match = true; + break; + } + break; + case MATCH_TYPE_GLOB_WITH_EMPTY: + if (isempty(str)) { + match = true; + break; + } + _fallthrough_; + case MATCH_TYPE_GLOB: + NULSTR_FOREACH(i, value) + if ((fnmatch(i, str, 0) == 0)) { + match = true; + break; + } + break; + default: + assert_not_reached(); + } + + return token->op == (match ? 
OP_MATCH : OP_NOMATCH); +} + +static bool token_match_attr(UdevRuleToken *token, sd_device *dev, UdevEvent *event) { + char nbuf[UDEV_NAME_SIZE], vbuf[UDEV_NAME_SIZE]; + const char *name, *value; + bool truncated; + + assert(token); + assert(IN_SET(token->type, TK_M_ATTR, TK_M_PARENTS_ATTR)); + assert(dev); + assert(event); + + name = token->data; + + switch (token->attr_subst_type) { + case SUBST_TYPE_FORMAT: + (void) udev_event_apply_format(event, name, nbuf, sizeof(nbuf), false, NULL, &truncated); + if (truncated) { + log_event_truncated(dev, token, "sysfs attribute name", name, + token->type == TK_M_ATTR ? "ATTR" : "ATTRS", /* is_match = */ true); + return false; + } + + name = nbuf; + _fallthrough_; + case SUBST_TYPE_PLAIN: + if (sd_device_get_sysattr_value(dev, name, &value) < 0) + return false; + break; + case SUBST_TYPE_SUBSYS: + if (udev_resolve_subsys_kernel(name, vbuf, sizeof(vbuf), true) < 0) + return false; + value = vbuf; + break; + default: + assert_not_reached(); + } + + /* remove trailing whitespace, if not asked to match for it */ + if (token->attr_match_remove_trailing_whitespace) { + if (value != vbuf) { + strscpy(vbuf, sizeof(vbuf), value); + value = vbuf; + } + + delete_trailing_chars(vbuf, NULL); + } + + return token_match_string(token, value); +} + +static int get_property_from_string(char *line, char **ret_key, char **ret_value) { + char *key, *val; + size_t len; + + assert(line); + assert(ret_key); + assert(ret_value); + + /* find key */ + key = skip_leading_chars(line, NULL); + + /* comment or empty line */ + if (IN_SET(key[0], '#', '\0')) { + *ret_key = *ret_value = NULL; + return 0; + } + + /* split key/value */ + val = strchr(key, '='); + if (!val) + return -EINVAL; + *val++ = '\0'; + + key = strstrip(key); + if (isempty(key)) + return -EINVAL; + + val = strstrip(val); + if (isempty(val)) + return -EINVAL; + + /* unquote */ + if (IN_SET(val[0], '"', '\'')) { + len = strlen(val); + if (len == 1 || val[len-1] != val[0]) + return -EINVAL; 
+ val[len-1] = '\0'; + val++; + } + + *ret_key = key; + *ret_value = val; + return 1; +} + +static int import_parent_into_properties(sd_device *dev, const char *filter) { + sd_device *parent; + int r; + + assert(dev); + assert(filter); + + r = sd_device_get_parent(dev, &parent); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + FOREACH_DEVICE_PROPERTY(parent, key, val) { + if (fnmatch(filter, key, 0) != 0) + continue; + r = device_add_property(dev, key, val); + if (r < 0) + return r; + } + + return 1; +} + +static int attr_subst_subdir(char attr[static UDEV_PATH_SIZE]) { + _cleanup_closedir_ DIR *dir = NULL; + char buf[UDEV_PATH_SIZE], *p; + const char *tail; + size_t len, size; + bool truncated; + + assert(attr); + + tail = strstr(attr, "/*/"); + if (!tail) + return 0; + + len = tail - attr + 1; /* include slash at the end */ + tail += 2; /* include slash at the beginning */ + + p = buf; + size = sizeof(buf); + size -= strnpcpy_full(&p, size, attr, len, &truncated); + if (truncated) + return -ENOENT; + + dir = opendir(buf); + if (!dir) + return -errno; + + FOREACH_DIRENT_ALL(de, dir, break) { + if (de->d_name[0] == '.') + continue; + + strscpyl_full(p, size, &truncated, de->d_name, tail, NULL); + if (truncated) + continue; + + if (faccessat(dirfd(dir), p, F_OK, 0) < 0) + continue; + + strcpy(attr, buf); + return 0; + } + + return -ENOENT; +} + +static size_t udev_replace_ifname(char *str) { + size_t replaced = 0; + + assert(str); + + /* See ifname_valid_full(). 
*/ + + for (char *p = str; *p != '\0'; p++) + if (!ifname_valid_char(*p)) { + *p = '_'; + replaced++; + } + + return replaced; +} + +static int udev_rule_apply_token_to_event( + UdevRuleToken *token, + sd_device *dev, + UdevEvent *event, + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list) { + + int r; + + assert(token); + assert(dev); + assert(event); + + /* This returns the following values: + * 0 on the current token does not match the event, + * 1 on the current token matches the event, and + * negative errno on some critical errors. */ + + switch (token->type) { + case TK_M_ACTION: { + sd_device_action_t a; + + r = sd_device_get_action(dev, &a); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to get uevent action type: %m"); + + return token_match_string(token, device_action_to_string(a)); + } + case TK_M_DEVPATH: { + const char *val; + + r = sd_device_get_devpath(dev, &val); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to get devpath: %m"); + + return token_match_string(token, val); + } + case TK_M_KERNEL: + case TK_M_PARENTS_KERNEL: { + const char *val; + + r = sd_device_get_sysname(dev, &val); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to get sysname: %m"); + + return token_match_string(token, val); + } + case TK_M_DEVLINK: + FOREACH_DEVICE_DEVLINK(dev, val) + if (token_match_string(token, strempty(startswith(val, "/dev/"))) == (token->op == OP_MATCH)) + return token->op == OP_MATCH; + return token->op == OP_NOMATCH; + case TK_M_NAME: + return token_match_string(token, event->name); + case TK_M_ENV: { + const char *val = NULL; + + (void) device_get_property_value_with_fallback(dev, token->data, properties_list, &val); + + return token_match_string(token, val); + } + case TK_M_CONST: { + const char *val, *k = token->data; + + if (streq(k, "arch")) + val = architecture_to_string(uname_architecture()); + else if (streq(k, "virt")) + val = 
virtualization_to_string(detect_virtualization()); + else if (streq(k, "cvm")) + val = confidential_virtualization_to_string(detect_confidential_virtualization()); + else + assert_not_reached(); + return token_match_string(token, val); + } + case TK_M_TAG: + case TK_M_PARENTS_TAG: + FOREACH_DEVICE_CURRENT_TAG(dev, val) + if (token_match_string(token, val) == (token->op == OP_MATCH)) + return token->op == OP_MATCH; + return token->op == OP_NOMATCH; + case TK_M_SUBSYSTEM: + case TK_M_PARENTS_SUBSYSTEM: { + const char *val; + + r = sd_device_get_subsystem(dev, &val); + if (r == -ENOENT) + val = NULL; + else if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to get subsystem: %m"); + + return token_match_string(token, val); + } + case TK_M_DRIVER: + case TK_M_PARENTS_DRIVER: { + const char *val; + + r = sd_device_get_driver(dev, &val); + if (r == -ENOENT) + val = NULL; + else if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to get driver: %m"); + + return token_match_string(token, val); + } + case TK_M_ATTR: + case TK_M_PARENTS_ATTR: + return token_match_attr(token, dev, event); + case TK_M_SYSCTL: { + _cleanup_free_ char *value = NULL; + char buf[UDEV_PATH_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, token->data, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "sysctl entry name", token->data, "SYSCTL", /* is_match = */ true); + return false; + } + + r = sysctl_read(sysctl_normalize(buf), &value); + if (r < 0 && r != -ENOENT) + return log_event_error_errno(dev, token, r, "Failed to read sysctl '%s': %m", buf); + + return token_match_string(token, strstrip(value)); + } + case TK_M_TEST: { + mode_t mode = PTR_TO_MODE(token->data); + char buf[UDEV_PATH_SIZE]; + struct stat statbuf; + bool match, truncated; + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, 
token, "file name", token->value, "TEST", /* is_match = */ true); + return false; + } + + if (!path_is_absolute(buf) && + udev_resolve_subsys_kernel(buf, buf, sizeof(buf), false) < 0) { + char tmp[UDEV_PATH_SIZE]; + const char *val; + + r = sd_device_get_syspath(dev, &val); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to get syspath: %m"); + + strscpy_full(tmp, sizeof(tmp), buf, &truncated); + assert(!truncated); + strscpyl_full(buf, sizeof(buf), &truncated, val, "/", tmp, NULL); + if (truncated) + return false; + } + + r = attr_subst_subdir(buf); + if (r == -ENOENT) + return token->op == OP_NOMATCH; + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to test for the existence of '%s': %m", buf); + + if (stat(buf, &statbuf) < 0) + return token->op == OP_NOMATCH; + + if (mode == MODE_INVALID) + return token->op == OP_MATCH; + + match = (statbuf.st_mode & mode) > 0; + return token->op == (match ? OP_MATCH : OP_NOMATCH); + } + case TK_M_PROGRAM: { + char buf[UDEV_LINE_SIZE], result[UDEV_LINE_SIZE]; + bool truncated; + size_t count; + + event->program_result = mfree(event->program_result); + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "command", token->value, "PROGRAM", /* is_match = */ true); + return false; + } + + log_event_debug(dev, token, "Running PROGRAM '%s'", buf); + + r = udev_event_spawn(event, timeout_usec, timeout_signal, true, buf, result, sizeof(result), NULL); + if (r != 0) { + if (r < 0) + log_event_warning_errno(dev, token, r, "Failed to execute \"%s\": %m", buf); + else /* returned value is positive when program fails */ + log_event_debug(dev, token, "Command \"%s\" returned %d (error)", buf, r); + return token->op == OP_NOMATCH; + } + + delete_trailing_chars(result, "\n"); + count = udev_replace_chars(result, UDEV_ALLOWED_CHARS_INPUT); + if (count > 0) + log_event_debug(dev, token, + "Replaced %zu 
character(s) in result of \"%s\"", + count, buf); + + event->program_result = strdup(result); + return token->op == OP_MATCH; + } + case TK_M_IMPORT_FILE: { + _cleanup_fclose_ FILE *f = NULL; + char buf[UDEV_PATH_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "file name to be imported", token->value, "IMPORT", /* is_match = */ true); + return false; + } + + log_event_debug(dev, token, "Importing properties from '%s'", buf); + + f = fopen(buf, "re"); + if (!f) { + if (errno != ENOENT) + return log_event_error_errno(dev, token, errno, "Failed to open '%s': %m", buf); + return token->op == OP_NOMATCH; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + char *key, *value; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) { + log_event_debug_errno(dev, token, r, "Failed to read '%s', ignoring: %m", buf); + return token->op == OP_NOMATCH; + } + if (r == 0) + break; + + r = get_property_from_string(line, &key, &value); + if (r < 0) { + log_event_debug_errno(dev, token, r, + "Failed to parse key and value from '%s', ignoring: %m", + line); + continue; + } + if (r == 0) + continue; + + r = device_add_property(dev, key, value); + if (r < 0) + return log_event_error_errno(dev, token, r, + "Failed to add property %s=%s: %m", + key, value); + } + + return token->op == OP_MATCH; + } + case TK_M_IMPORT_PROGRAM: { + _cleanup_strv_free_ char **lines = NULL; + char buf[UDEV_LINE_SIZE], result[UDEV_LINE_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "command", token->value, "IMPORT", /* is_match = */ true); + return false; + } + + log_event_debug(dev, token, "Importing properties from results of '%s'", buf); + + r = udev_event_spawn(event, timeout_usec, timeout_signal, true, buf, result, 
sizeof result, &truncated); + if (r != 0) { + if (r < 0) + log_event_warning_errno(dev, token, r, "Failed to execute '%s', ignoring: %m", buf); + else /* returned value is positive when program fails */ + log_event_debug(dev, token, "Command \"%s\" returned %d (error), ignoring", buf, r); + return token->op == OP_NOMATCH; + } + + if (truncated) { + bool found = false; + + /* Drop the last line. */ + for (char *p = PTR_SUB1(buf + strlen(buf), buf); p; p = PTR_SUB1(p, buf)) + if (strchr(NEWLINE, *p)) { + *p = '\0'; + found = true; + } else if (found) + break; + } + + r = strv_split_newlines_full(&lines, result, EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_event_warning_errno(dev, token, r, + "Failed to extract lines from result of command \"%s\", ignoring: %m", buf); + return false; + } + + STRV_FOREACH(line, lines) { + char *key, *value; + + r = get_property_from_string(*line, &key, &value); + if (r < 0) { + log_event_debug_errno(dev, token, r, + "Failed to parse key and value from '%s', ignoring: %m", + *line); + continue; + } + if (r == 0) + continue; + + r = device_add_property(dev, key, value); + if (r < 0) + return log_event_error_errno(dev, token, r, + "Failed to add property %s=%s: %m", + key, value); + } + + return token->op == OP_MATCH; + } + case TK_M_IMPORT_BUILTIN: { + UdevBuiltinCommand cmd = PTR_TO_UDEV_BUILTIN_CMD(token->data); + assert(cmd >= 0 && cmd < _UDEV_BUILTIN_MAX); + unsigned mask = 1U << (int) cmd; + char buf[UDEV_LINE_SIZE]; + bool truncated; + + if (udev_builtin_run_once(cmd)) { + /* check if we ran already */ + if (event->builtin_run & mask) { + log_event_debug(dev, token, "Skipping builtin '%s' in IMPORT key", + udev_builtin_name(cmd)); + /* return the result from earlier run */ + return token->op == (event->builtin_ret & mask ? 
OP_NOMATCH : OP_MATCH); + } + /* mark as ran */ + event->builtin_run |= mask; + } + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "builtin command", token->value, "IMPORT", /* is_match = */ true); + return false; + } + + log_event_debug(dev, token, "Importing properties from results of builtin command '%s'", buf); + + r = udev_builtin_run(event, cmd, buf, false); + if (r < 0) { + /* remember failure */ + log_event_debug_errno(dev, token, r, "Failed to run builtin '%s': %m", buf); + event->builtin_ret |= mask; + } + return token->op == (r >= 0 ? OP_MATCH : OP_NOMATCH); + } + case TK_M_IMPORT_DB: { + const char *val; + + if (!event->dev_db_clone) + return token->op == OP_NOMATCH; + r = sd_device_get_property_value(event->dev_db_clone, token->value, &val); + if (r == -ENOENT) + return token->op == OP_NOMATCH; + if (r < 0) + return log_event_error_errno(dev, token, r, + "Failed to get property '%s' from database: %m", + token->value); + + r = device_add_property(dev, token->value, val); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to add property '%s=%s': %m", + token->value, val); + return token->op == OP_MATCH; + } + case TK_M_IMPORT_CMDLINE: { + _cleanup_free_ char *value = NULL; + + r = proc_cmdline_get_key(token->value, PROC_CMDLINE_VALUE_OPTIONAL|PROC_CMDLINE_IGNORE_EFI_OPTIONS, &value); + if (r < 0) + return log_event_error_errno(dev, token, r, + "Failed to read '%s' option from /proc/cmdline: %m", + token->value); + if (r == 0) + return token->op == OP_NOMATCH; + + r = device_add_property(dev, token->value, value ?: "1"); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to add property '%s=%s': %m", + token->value, value ?: "1"); + return token->op == OP_MATCH; + } + case TK_M_IMPORT_PARENT: { + char buf[UDEV_PATH_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, token->value, buf, 
sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "property name", token->value, "IMPORT", /* is_match = */ true); + return false; + } + + r = import_parent_into_properties(dev, buf); + if (r < 0) + return log_event_error_errno(dev, token, r, + "Failed to import properties '%s' from parent: %m", + buf); + return token->op == (r > 0 ? OP_MATCH : OP_NOMATCH); + } + case TK_M_RESULT: + return token_match_string(token, event->program_result); + case TK_A_OPTIONS_STRING_ESCAPE_NONE: + event->esc = ESCAPE_NONE; + break; + case TK_A_OPTIONS_STRING_ESCAPE_REPLACE: + event->esc = ESCAPE_REPLACE; + break; + case TK_A_OPTIONS_DB_PERSIST: + device_set_db_persist(dev); + break; + case TK_A_OPTIONS_INOTIFY_WATCH: + if (event->inotify_watch_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->inotify_watch_final = true; + + event->inotify_watch = token->data; + break; + case TK_A_OPTIONS_DEVLINK_PRIORITY: + device_set_devlink_priority(dev, PTR_TO_INT(token->data)); + break; + case TK_A_OPTIONS_LOG_LEVEL: { + int level = PTR_TO_INT(token->data); + + if (level < 0) + level = event->default_log_level; + + log_set_max_level(level); + + if (level == LOG_DEBUG && !event->log_level_was_debug) { + /* The log level becomes LOG_DEBUG at first time. Let's log basic information. 
*/ + log_device_uevent(dev, "The log level is changed to 'debug' while processing device"); + event->log_level_was_debug = true; + } + + break; + } + case TK_A_OWNER: { + char owner[UDEV_NAME_SIZE]; + const char *ow = owner; + bool truncated; + + if (event->owner_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->owner_final = true; + + (void) udev_event_apply_format(event, token->value, owner, sizeof(owner), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "user name", token->value, "OWNER", /* is_match = */ false); + break; + } + + r = get_user_creds(&ow, &event->uid, NULL, NULL, NULL, USER_CREDS_ALLOW_MISSING); + if (r < 0) + log_unknown_owner(dev, token->rule_line, r, "user", owner); + else + log_event_debug(dev, token, "OWNER %s(%u)", owner, event->uid); + break; + } + case TK_A_GROUP: { + char group[UDEV_NAME_SIZE]; + const char *gr = group; + bool truncated; + + if (event->group_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->group_final = true; + + (void) udev_event_apply_format(event, token->value, group, sizeof(group), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "group name", token->value, "GROUP", /* is_match = */ false); + break; + } + + r = get_group_creds(&gr, &event->gid, USER_CREDS_ALLOW_MISSING); + if (r < 0) + log_unknown_owner(dev, token->rule_line, r, "group", group); + else + log_event_debug(dev, token, "GROUP %s(%u)", group, event->gid); + break; + } + case TK_A_MODE: { + char mode_str[UDEV_NAME_SIZE]; + bool truncated; + + if (event->mode_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->mode_final = true; + + (void) udev_event_apply_format(event, token->value, mode_str, sizeof(mode_str), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "mode", token->value, "MODE", /* is_match = */ false); + break; + } + + r = parse_mode(mode_str, &event->mode); + if (r < 0) + 
log_event_error_errno(dev, token, r, "Failed to parse mode '%s', ignoring: %m", mode_str); + else + log_event_debug(dev, token, "MODE %#o", event->mode); + break; + } + case TK_A_OWNER_ID: + if (event->owner_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->owner_final = true; + if (!token->data) + break; + event->uid = PTR_TO_UID(token->data); + log_event_debug(dev, token, "OWNER %u", event->uid); + break; + case TK_A_GROUP_ID: + if (event->group_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->group_final = true; + if (!token->data) + break; + event->gid = PTR_TO_GID(token->data); + log_event_debug(dev, token, "GROUP %u", event->gid); + break; + case TK_A_MODE_ID: + if (event->mode_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->mode_final = true; + if (!token->data) + break; + event->mode = PTR_TO_MODE(token->data); + log_event_debug(dev, token, "MODE %#o", event->mode); + break; + case TK_A_SECLABEL: { + _cleanup_free_ char *name = NULL, *label = NULL; + char label_str[UDEV_LINE_SIZE] = {}; + bool truncated; + + name = strdup(token->data); + if (!name) + return log_oom(); + + (void) udev_event_apply_format(event, token->value, label_str, sizeof(label_str), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "security label", token->value, "SECLABEL", /* is_match = */ false); + break; + } + + if (!isempty(label_str)) + label = strdup(label_str); + else + label = strdup(token->value); + if (!label) + return log_oom(); + + if (token->op == OP_ASSIGN) + ordered_hashmap_clear_free_free(event->seclabel_list); + + r = ordered_hashmap_ensure_put(&event->seclabel_list, NULL, name, label); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to store SECLABEL{%s}='%s': %m", name, label); + + log_event_debug(dev, token, "SECLABEL{%s}='%s'", name, label); + + TAKE_PTR(name); + TAKE_PTR(label); + break; + } + case TK_A_ENV: { + const char *val, 
*name = token->data; + char value_new[UDEV_NAME_SIZE], *p = value_new; + size_t count, l = sizeof(value_new); + bool truncated; + + if (isempty(token->value)) { + if (token->op == OP_ADD) + break; + r = device_add_property(dev, name, NULL); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to remove property '%s': %m", name); + break; + } + + if (token->op == OP_ADD && + device_get_property_value_with_fallback(dev, name, properties_list, &val) >= 0) { + l = strpcpyl_full(&p, l, &truncated, val, " ", NULL); + if (truncated) { + log_event_warning(dev, token, + "The buffer for the property '%s' is full, " + "refusing to append the new value '%s'.", name, token->value); + break; + } + } + + (void) udev_event_apply_format(event, token->value, p, l, false, properties_list, &truncated); + if (truncated) { + _cleanup_free_ char *key_with_name = strjoin("ENV{", name, "}"); + log_event_truncated(dev, token, "property value", token->value, + key_with_name ?: "ENV", /* is_match = */ false); + break; + } + + if (event->esc == ESCAPE_REPLACE) { + count = udev_replace_chars(p, NULL); + if (count > 0) + log_event_debug(dev, token, + "Replaced %zu slash(es) from result of ENV{%s}%s=\"%s\"", + count, name, token->op == OP_ADD ? 
"+" : "", token->value); + } + + r = device_add_property(dev, name, value_new); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to add property '%s=%s': %m", name, value_new); + break; + } + case TK_A_TAG: { + char buf[UDEV_PATH_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "tag name", token->value, "TAG", /* is_match = */ false); + break; + } + + if (token->op == OP_ASSIGN) + device_cleanup_tags(dev); + + if (token->op == OP_REMOVE) + device_remove_tag(dev, buf); + else { + r = device_add_tag(dev, buf, true); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_event_warning_errno(dev, token, r, "Failed to add tag '%s', ignoring: %m", buf); + } + break; + } + case TK_A_NAME: { + char buf[UDEV_PATH_SIZE]; + bool truncated; + size_t count; + + if (event->name_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->name_final = true; + + if (sd_device_get_ifindex(dev, NULL) < 0) { + log_event_error(dev, token, + "Only network interfaces can be renamed, ignoring NAME=\"%s\".", + token->value); + break; + } + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "network interface name", token->value, "NAME", /* is_match = */ false); + break; + } + + if (IN_SET(event->esc, ESCAPE_UNSET, ESCAPE_REPLACE)) { + if (naming_scheme_has(NAMING_REPLACE_STRICTLY)) + count = udev_replace_ifname(buf); + else + count = udev_replace_chars(buf, "/"); + if (count > 0) + log_event_debug(dev, token, + "Replaced %zu character(s) from result of NAME=\"%s\"", + count, token->value); + } + r = free_and_strdup_warn(&event->name, buf); + if (r < 0) + return r; + + log_event_debug(dev, token, "NAME '%s'", event->name); + break; + } + case TK_A_DEVLINK: { + char buf[UDEV_PATH_SIZE]; + bool truncated; + size_t count; + 
+ if (event->devlink_final) + break; + if (sd_device_get_devnum(dev, NULL) < 0) + break; + if (token->op == OP_ASSIGN_FINAL) + event->devlink_final = true; + if (IN_SET(token->op, OP_ASSIGN, OP_ASSIGN_FINAL)) + device_cleanup_devlinks(dev); + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), + /* replace_whitespace = */ event->esc != ESCAPE_NONE, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "symbolic link path", token->value, "SYMLINK", /* is_match = */ false); + break; + } + + /* By default or string_escape=none, allow multiple symlinks separated by spaces. */ + if (event->esc == ESCAPE_UNSET) + count = udev_replace_chars(buf, /* allow = */ "/ "); + else if (event->esc == ESCAPE_REPLACE) + count = udev_replace_chars(buf, /* allow = */ "/"); + else + count = 0; + if (count > 0) + log_event_debug(dev, token, + "Replaced %zu character(s) from result of SYMLINK=\"%s\"", + count, token->value); + + for (const char *p = buf;;) { + _cleanup_free_ char *path = NULL; + + r = extract_first_word(&p, &path, NULL, EXTRACT_RETAIN_ESCAPE); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_warning_errno(r, "Failed to extract first path in SYMLINK=, ignoring: %m"); + break; + } + if (r == 0) + break; + + if (token->op == OP_REMOVE) { + r = device_remove_devlink(dev, path); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_event_warning_errno(dev, token, r, "Failed to remove devlink '%s', ignoring: %m", path); + else if (r > 0) + log_event_debug(dev, token, "Dropped SYMLINK '%s'", path); + } else { + r = device_add_devlink(dev, path); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_event_warning_errno(dev, token, r, "Failed to add devlink '%s', ignoring: %m", path); + else if (r > 0) + log_event_debug(dev, token, "Added SYMLINK '%s'", path); + } + } + break; + } + case TK_A_ATTR: { + char buf[UDEV_PATH_SIZE], value[UDEV_NAME_SIZE]; + const char *val, *key_name = token->data; + bool truncated; 
+ + if (udev_resolve_subsys_kernel(key_name, buf, sizeof(buf), false) < 0 && + sd_device_get_syspath(dev, &val) >= 0) { + strscpyl_full(buf, sizeof(buf), &truncated, val, "/", key_name, NULL); + if (truncated) { + log_event_warning(dev, token, + "The path to the attribute '%s/%s' is too long, refusing to set the attribute.", + val, key_name); + break; + } + } + + r = attr_subst_subdir(buf); + if (r < 0) { + log_event_error_errno(dev, token, r, "Could not find file matches '%s', ignoring: %m", buf); + break; + } + (void) udev_event_apply_format(event, token->value, value, sizeof(value), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "attribute value", token->value, "ATTR", /* is_match = */ false); + break; + } + + log_event_debug(dev, token, "ATTR '%s' writing '%s'", buf, value); + r = write_string_file(buf, value, + WRITE_STRING_FILE_VERIFY_ON_FAILURE | + WRITE_STRING_FILE_DISABLE_BUFFER | + WRITE_STRING_FILE_AVOID_NEWLINE | + WRITE_STRING_FILE_VERIFY_IGNORE_NEWLINE); + if (r < 0) + log_event_error_errno(dev, token, r, "Failed to write ATTR{%s}, ignoring: %m", buf); + break; + } + case TK_A_SYSCTL: { + char buf[UDEV_PATH_SIZE], value[UDEV_NAME_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, token->data, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "sysctl entry name", token->data, "SYSCTL", /* is_match = */ false); + break; + } + + (void) udev_event_apply_format(event, token->value, value, sizeof(value), false, properties_list, &truncated); + if (truncated) { + _cleanup_free_ char *key_with_name = strjoin("SYSCTL{", buf, "}"); + log_event_truncated(dev, token, "sysctl value", token->value, + key_with_name ?: "SYSCTL", /* is_match = */ false); + break; + } + + sysctl_normalize(buf); + log_event_debug(dev, token, "SYSCTL '%s' writing '%s'", buf, value); + r = sysctl_write(buf, value); + if (r < 0) + log_event_error_errno(dev, token, r, "Failed 
to write SYSCTL{%s}='%s', ignoring: %m", buf, value); + break; + } + case TK_A_RUN_BUILTIN: + case TK_A_RUN_PROGRAM: { + _cleanup_free_ char *cmd = NULL; + char buf[UDEV_LINE_SIZE]; + bool truncated; + + if (event->run_final) + break; + if (token->op == OP_ASSIGN_FINAL) + event->run_final = true; + + if (IN_SET(token->op, OP_ASSIGN, OP_ASSIGN_FINAL)) + ordered_hashmap_clear_free_key(event->run_list); + + (void) udev_event_apply_format(event, token->value, buf, sizeof(buf), false, properties_list, &truncated); + if (truncated) { + log_event_truncated(dev, token, "command", token->value, + token->type == TK_A_RUN_BUILTIN ? "RUN{builtin}" : "RUN{program}", + /* is_match = */ false); + break; + } + + cmd = strdup(buf); + if (!cmd) + return log_oom(); + + r = ordered_hashmap_ensure_put(&event->run_list, NULL, cmd, token->data); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_event_error_errno(dev, token, r, "Failed to store command '%s': %m", cmd); + + TAKE_PTR(cmd); + + log_event_debug(dev, token, "RUN '%s'", token->value); + break; + } + case TK_A_OPTIONS_STATIC_NODE: + /* do nothing for events. */ + break; + default: + assert_not_reached(); + } + + return true; +} + +static bool token_is_for_parents(UdevRuleToken *token) { + return token->type >= TK_M_PARENTS_KERNEL && token->type <= TK_M_PARENTS_TAG; +} + +static int udev_rule_apply_parent_token_to_event( + UdevRuleToken *head_token, + UdevEvent *event, + int timeout_signal) { + + int r; + + assert(head_token); + assert(event); + + event->dev_parent = ASSERT_PTR(event->dev); + + for (;;) { + LIST_FOREACH(tokens, token, head_token) { + if (!token_is_for_parents(token)) + return true; /* All parent tokens match. */ + + r = udev_rule_apply_token_to_event(token, event->dev_parent, event, 0, timeout_signal, NULL); + if (r < 0) + return r; + if (r == 0) + break; + } + if (r > 0) + /* All parent tokens match, and no more token (except for GOTO) in the line. 
*/ + return true; + + if (sd_device_get_parent(event->dev_parent, &event->dev_parent) < 0) { + event->dev_parent = NULL; + return false; + } + } +} + +static int udev_rule_apply_line_to_event( + UdevRuleLine *line, + UdevEvent *event, + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list, + UdevRuleLine **next_line) { + + UdevRuleLineType mask = LINE_HAS_GOTO | LINE_UPDATE_SOMETHING; + bool parents_done = false; + sd_device_action_t action; + int r; + + assert(line); + assert(event); + assert(next_line); + + r = sd_device_get_action(event->dev, &action); + if (r < 0) + return r; + + if (action != SD_DEVICE_REMOVE) { + if (sd_device_get_devnum(event->dev, NULL) >= 0) + mask |= LINE_HAS_DEVLINK; + + if (sd_device_get_ifindex(event->dev, NULL) >= 0) + mask |= LINE_HAS_NAME; + } + + if ((line->type & mask) == 0) + return 0; + + event->esc = ESCAPE_UNSET; + + DEVICE_TRACE_POINT(rules_apply_line, event->dev, line->rule_file->filename, line->line_number); + + LIST_FOREACH(tokens, token, line->tokens) { + if (token_is_for_parents(token)) { + if (parents_done) + continue; + + r = udev_rule_apply_parent_token_to_event(token, event, timeout_signal); + if (r <= 0) + return r; + + parents_done = true; + continue; + } + + r = udev_rule_apply_token_to_event(token, event->dev, event, timeout_usec, timeout_signal, properties_list); + if (r <= 0) + return r; + } + + if (line->goto_line) + *next_line = line->goto_line; /* update next_line only when the line has GOTO token. 
*/ + + return 0; +} + +int udev_rules_apply_to_event( + UdevRules *rules, + UdevEvent *event, + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list) { + + int r; + + assert(rules); + assert(event); + + LIST_FOREACH(rule_files, file, rules->rule_files) + LIST_FOREACH_WITH_NEXT(rule_lines, line, next_line, file->rule_lines) { + r = udev_rule_apply_line_to_event(line, event, timeout_usec, timeout_signal, properties_list, &next_line); + if (r < 0) + return r; + } + + return 0; +} + +static int udev_rule_line_apply_static_dev_perms(UdevRuleLine *rule_line) { + _cleanup_strv_free_ char **tags = NULL; + uid_t uid = UID_INVALID; + gid_t gid = GID_INVALID; + mode_t mode = MODE_INVALID; + int r; + + assert(rule_line); + + if (!FLAGS_SET(rule_line->type, LINE_HAS_STATIC_NODE)) + return 0; + + LIST_FOREACH(tokens, token, rule_line->tokens) + if (token->type == TK_A_OWNER_ID) + uid = PTR_TO_UID(token->data); + else if (token->type == TK_A_GROUP_ID) + gid = PTR_TO_GID(token->data); + else if (token->type == TK_A_MODE_ID) + mode = PTR_TO_MODE(token->data); + else if (token->type == TK_A_TAG) { + r = strv_extend(&tags, token->value); + if (r < 0) + return log_oom(); + } else if (token->type == TK_A_OPTIONS_STATIC_NODE) { + r = static_node_apply_permissions(token->value, mode, uid, gid, tags); + if (r < 0) + return r; + } + + return 0; +} + +int udev_rules_apply_static_dev_perms(UdevRules *rules) { + int r; + + assert(rules); + + LIST_FOREACH(rule_files, file, rules->rule_files) + LIST_FOREACH(rule_lines, line, file->rule_lines) { + r = udev_rule_line_apply_static_dev_perms(line); + if (r < 0) + return r; + } + + return 0; +} + +static const char* const resolve_name_timing_table[_RESOLVE_NAME_TIMING_MAX] = { + [RESOLVE_NAME_NEVER] = "never", + [RESOLVE_NAME_LATE] = "late", + [RESOLVE_NAME_EARLY] = "early", +}; + +DEFINE_STRING_TABLE_LOOKUP(resolve_name_timing, ResolveNameTiming); diff --git a/src/udev/udev-rules.h b/src/udev/udev-rules.h new file mode 100644 index 
0000000..4352312 --- /dev/null +++ b/src/udev/udev-rules.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include "alloc-util.h" +#include "hashmap.h" +#include "time-util.h" + +#define UDEV_NAME_SIZE 512 +#define UDEV_PATH_SIZE 1024 +#define UDEV_LINE_SIZE 16384 + +typedef struct UdevRuleFile UdevRuleFile; +typedef struct UdevRules UdevRules; +typedef struct UdevEvent UdevEvent; + +typedef enum { + ESCAPE_UNSET, + ESCAPE_NONE, /* OPTIONS="string_escape=none" */ + ESCAPE_REPLACE, /* OPTIONS="string_escape=replace" */ + _ESCAPE_TYPE_MAX, + _ESCAPE_TYPE_INVALID = -EINVAL, +} UdevRuleEscapeType; + +typedef enum ResolveNameTiming { + RESOLVE_NAME_NEVER, + RESOLVE_NAME_LATE, + RESOLVE_NAME_EARLY, + _RESOLVE_NAME_TIMING_MAX, + _RESOLVE_NAME_TIMING_INVALID = -EINVAL, +} ResolveNameTiming; + +int udev_rule_parse_value(char *str, char **ret_value, char **ret_endpos); +int udev_rules_parse_file(UdevRules *rules, const char *filename, bool extra_checks, UdevRuleFile **ret); +unsigned udev_rule_file_get_issues(UdevRuleFile *rule_file); +UdevRules* udev_rules_new(ResolveNameTiming resolve_name_timing); +int udev_rules_load(UdevRules **ret_rules, ResolveNameTiming resolve_name_timing); +UdevRules *udev_rules_free(UdevRules *rules); +DEFINE_TRIVIAL_CLEANUP_FUNC(UdevRules*, udev_rules_free); +#define udev_rules_free_and_replace(a, b) free_and_replace_full(a, b, udev_rules_free) + +bool udev_rules_should_reload(UdevRules *rules); +int udev_rules_apply_to_event(UdevRules *rules, UdevEvent *event, + usec_t timeout_usec, + int timeout_signal, + Hashmap *properties_list); +int udev_rules_apply_static_dev_perms(UdevRules *rules); + +ResolveNameTiming resolve_name_timing_from_string(const char *s) _pure_; +const char *resolve_name_timing_to_string(ResolveNameTiming i) _const_; diff --git a/src/udev/udev-spawn.c b/src/udev/udev-spawn.c new file mode 100644 index 0000000..67a3005 --- /dev/null +++ b/src/udev/udev-spawn.c @@ -0,0 +1,355 @@ +/* 
SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "sd-event.h" + +#include "device-private.h" +#include "device-util.h" +#include "fd-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "strv.h" +#include "udev-builtin.h" +#include "udev-event.h" +#include "udev-spawn.h" +#include "udev-trace.h" + +typedef struct Spawn { + sd_device *device; + const char *cmd; + pid_t pid; + usec_t timeout_warn_usec; + usec_t timeout_usec; + int timeout_signal; + usec_t event_birth_usec; + bool accept_failure; + int fd_stdout; + int fd_stderr; + char *result; + size_t result_size; + size_t result_len; + bool truncated; +} Spawn; + +static int on_spawn_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + Spawn *spawn = ASSERT_PTR(userdata); + char buf[4096], *p; + size_t size; + ssize_t l; + int r; + + assert(fd == spawn->fd_stdout || fd == spawn->fd_stderr); + assert(!spawn->result || spawn->result_len < spawn->result_size); + + if (fd == spawn->fd_stdout && spawn->result) { + p = spawn->result + spawn->result_len; + size = spawn->result_size - spawn->result_len; + } else { + p = buf; + size = sizeof(buf); + } + + l = read(fd, p, size - (p == buf)); + if (l < 0) { + if (errno == EAGAIN) + goto reenable; + + log_device_error_errno(spawn->device, errno, + "Failed to read stdout of '%s': %m", spawn->cmd); + + return 0; + } + + if ((size_t) l == size) { + log_device_warning(spawn->device, "Truncating stdout of '%s' up to %zu byte.", + spawn->cmd, spawn->result_size); + l--; + spawn->truncated = true; + } + + p[l] = '\0'; + if (fd == spawn->fd_stdout && spawn->result) + spawn->result_len += l; + + /* Log output only if we watch stderr. 
*/ + if (l > 0 && spawn->fd_stderr >= 0) { + _cleanup_strv_free_ char **v = NULL; + + r = strv_split_newlines_full(&v, p, EXTRACT_RETAIN_ESCAPE); + if (r < 0) + log_device_debug(spawn->device, + "Failed to split output from '%s'(%s), ignoring: %m", + spawn->cmd, fd == spawn->fd_stdout ? "out" : "err"); + + STRV_FOREACH(q, v) + log_device_debug(spawn->device, "'%s'(%s) '%s'", spawn->cmd, + fd == spawn->fd_stdout ? "out" : "err", *q); + } + + if (l == 0 || spawn->truncated) + return 0; + +reenable: + /* Re-enable the event source if we did not encounter EOF */ + + r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT); + if (r < 0) + log_device_error_errno(spawn->device, r, + "Failed to reactivate IO source of '%s'", spawn->cmd); + return 0; +} + +static int on_spawn_timeout(sd_event_source *s, uint64_t usec, void *userdata) { + Spawn *spawn = ASSERT_PTR(userdata); + + DEVICE_TRACE_POINT(spawn_timeout, spawn->device, spawn->cmd); + + kill_and_sigcont(spawn->pid, spawn->timeout_signal); + + log_device_error(spawn->device, "Spawned process '%s' ["PID_FMT"] timed out after %s, killing", + spawn->cmd, spawn->pid, + FORMAT_TIMESPAN(spawn->timeout_usec, USEC_PER_SEC)); + + return 1; +} + +static int on_spawn_timeout_warning(sd_event_source *s, uint64_t usec, void *userdata) { + Spawn *spawn = ASSERT_PTR(userdata); + + log_device_warning(spawn->device, "Spawned process '%s' ["PID_FMT"] is taking longer than %s to complete", + spawn->cmd, spawn->pid, + FORMAT_TIMESPAN(spawn->timeout_warn_usec, USEC_PER_SEC)); + + return 1; +} + +static int on_spawn_sigchld(sd_event_source *s, const siginfo_t *si, void *userdata) { + Spawn *spawn = ASSERT_PTR(userdata); + int ret = -EIO; + + switch (si->si_code) { + case CLD_EXITED: + if (si->si_status == 0) + log_device_debug(spawn->device, "Process '%s' succeeded.", spawn->cmd); + else + log_device_full(spawn->device, spawn->accept_failure ? 
LOG_DEBUG : LOG_WARNING, + "Process '%s' failed with exit code %i.", spawn->cmd, si->si_status); + ret = si->si_status; + break; + case CLD_KILLED: + case CLD_DUMPED: + log_device_error(spawn->device, "Process '%s' terminated by signal %s.", spawn->cmd, signal_to_string(si->si_status)); + break; + default: + log_device_error(spawn->device, "Process '%s' failed due to unknown reason.", spawn->cmd); + } + + DEVICE_TRACE_POINT(spawn_exit, spawn->device, spawn->cmd); + + sd_event_exit(sd_event_source_get_event(s), ret); + return 1; +} + +static int spawn_wait(Spawn *spawn) { + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *sigchld_source = NULL; + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *stdout_source = NULL; + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *stderr_source = NULL; + int r; + + assert(spawn); + + r = sd_event_new(&e); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to allocate sd-event object: %m"); + + if (spawn->timeout_usec > 0) { + usec_t usec, age_usec; + + usec = now(CLOCK_MONOTONIC); + age_usec = usec - spawn->event_birth_usec; + if (age_usec < spawn->timeout_usec) { + if (spawn->timeout_warn_usec > 0 && + spawn->timeout_warn_usec < spawn->timeout_usec && + spawn->timeout_warn_usec > age_usec) { + spawn->timeout_warn_usec -= age_usec; + + r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC, + usec + spawn->timeout_warn_usec, USEC_PER_SEC, + on_spawn_timeout_warning, spawn); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to create timeout warning event source: %m"); + } + + spawn->timeout_usec -= age_usec; + + r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC, + usec + spawn->timeout_usec, USEC_PER_SEC, on_spawn_timeout, spawn); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to create timeout event source: %m"); + } + } + + if (spawn->fd_stdout >= 0) { + r = sd_event_add_io(e, &stdout_source, 
spawn->fd_stdout, EPOLLIN, on_spawn_io, spawn); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to create stdio event source: %m"); + r = sd_event_source_set_enabled(stdout_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to enable stdio event source: %m"); + } + + if (spawn->fd_stderr >= 0) { + r = sd_event_add_io(e, &stderr_source, spawn->fd_stderr, EPOLLIN, on_spawn_io, spawn); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to create stderr event source: %m"); + r = sd_event_source_set_enabled(stderr_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to enable stderr event source: %m"); + } + + r = sd_event_add_child(e, &sigchld_source, spawn->pid, WEXITED, on_spawn_sigchld, spawn); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to create sigchild event source: %m"); + /* SIGCHLD should be processed after IO is complete */ + r = sd_event_source_set_priority(sigchld_source, SD_EVENT_PRIORITY_NORMAL + 1); + if (r < 0) + return log_device_debug_errno(spawn->device, r, "Failed to set priority to sigchild event source: %m"); + + return sd_event_loop(e); +} + +int udev_event_spawn( + UdevEvent *event, + usec_t timeout_usec, + int timeout_signal, + bool accept_failure, + const char *cmd, + char *result, + size_t result_size, + bool *ret_truncated) { + + _cleanup_close_pair_ int outpipe[2] = EBADF_PAIR, errpipe[2] = EBADF_PAIR; + _cleanup_strv_free_ char **argv = NULL; + char **envp = NULL; + Spawn spawn; + pid_t pid; + int r; + + assert(event); + assert(event->dev); + assert(result || result_size == 0); + + /* pipes from child to parent */ + if (result || log_get_max_level() >= LOG_INFO) + if (pipe2(outpipe, O_NONBLOCK|O_CLOEXEC) != 0) + return log_device_error_errno(event->dev, errno, + "Failed to create pipe for command '%s': %m", cmd); + + if (log_get_max_level() >= LOG_INFO) + if (pipe2(errpipe, 
O_NONBLOCK|O_CLOEXEC) != 0) + return log_device_error_errno(event->dev, errno, + "Failed to create pipe for command '%s': %m", cmd); + + r = strv_split_full(&argv, cmd, NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX | EXTRACT_RETAIN_ESCAPE); + if (r < 0) + return log_device_error_errno(event->dev, r, "Failed to split command: %m"); + + if (isempty(argv[0])) + return log_device_error_errno(event->dev, SYNTHETIC_ERRNO(EINVAL), + "Invalid command '%s'", cmd); + + /* allow programs in /usr/lib/udev/ to be called without the path */ + if (!path_is_absolute(argv[0])) { + char *program; + + program = path_join(UDEVLIBEXECDIR, argv[0]); + if (!program) + return log_oom(); + + free_and_replace(argv[0], program); + } + + r = device_get_properties_strv(event->dev, &envp); + if (r < 0) + return log_device_error_errno(event->dev, r, "Failed to get device properties"); + + log_device_debug(event->dev, "Starting '%s'", cmd); + + r = safe_fork_full("(spawn)", + (int[]) { -EBADF, outpipe[WRITE_END], errpipe[WRITE_END] }, + NULL, 0, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, + &pid); + if (r < 0) + return log_device_error_errno(event->dev, r, + "Failed to fork() to execute command '%s': %m", cmd); + if (r == 0) { + DEVICE_TRACE_POINT(spawn_exec, event->dev, cmd); + execve(argv[0], argv, envp); + _exit(EXIT_FAILURE); + } + + /* parent closed child's ends of pipes */ + outpipe[WRITE_END] = safe_close(outpipe[WRITE_END]); + errpipe[WRITE_END] = safe_close(errpipe[WRITE_END]); + + spawn = (Spawn) { + .device = event->dev, + .cmd = cmd, + .pid = pid, + .accept_failure = accept_failure, + .timeout_warn_usec = udev_warn_timeout(timeout_usec), + .timeout_usec = timeout_usec, + .timeout_signal = timeout_signal, + .event_birth_usec = event->birth_usec, + .fd_stdout = outpipe[READ_END], + .fd_stderr = errpipe[READ_END], + .result = result, + .result_size = result_size, + }; + r = spawn_wait(&spawn); + if (r < 0) + return 
log_device_error_errno(event->dev, r, + "Failed to wait for spawned command '%s': %m", cmd); + + if (result) + result[spawn.result_len] = '\0'; + + if (ret_truncated) + *ret_truncated = spawn.truncated; + + return r; /* 0 for success, and positive if the program failed */ +} + +void udev_event_execute_run(UdevEvent *event, usec_t timeout_usec, int timeout_signal) { + const char *command; + void *val; + int r; + + ORDERED_HASHMAP_FOREACH_KEY(val, command, event->run_list) { + UdevBuiltinCommand builtin_cmd = PTR_TO_UDEV_BUILTIN_CMD(val); + + if (builtin_cmd != _UDEV_BUILTIN_INVALID) { + log_device_debug(event->dev, "Running built-in command \"%s\"", command); + r = udev_builtin_run(event, builtin_cmd, command, false); + if (r < 0) + log_device_debug_errno(event->dev, r, "Failed to run built-in command \"%s\", ignoring: %m", command); + } else { + if (event->exec_delay_usec > 0) { + log_device_debug(event->dev, "Delaying execution of \"%s\" for %s.", + command, FORMAT_TIMESPAN(event->exec_delay_usec, USEC_PER_SEC)); + (void) usleep_safe(event->exec_delay_usec); + } + + log_device_debug(event->dev, "Running command \"%s\"", command); + + r = udev_event_spawn(event, timeout_usec, timeout_signal, false, command, NULL, 0, NULL); + if (r < 0) + log_device_warning_errno(event->dev, r, "Failed to execute '%s', ignoring: %m", command); + else if (r > 0) /* returned value is positive when program fails */ + log_device_debug(event->dev, "Command \"%s\" returned %d (error), ignoring.", command, r); + } + } +} diff --git a/src/udev/udev-spawn.h b/src/udev/udev-spawn.h new file mode 100644 index 0000000..5efea2e --- /dev/null +++ b/src/udev/udev-spawn.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include +#include + +#include "macro.h" +#include "time-util.h" + +#define READ_END 0 +#define WRITE_END 1 + +typedef struct UdevEvent UdevEvent; + +int udev_event_spawn( + UdevEvent *event, + usec_t timeout_usec, + int timeout_signal, + bool 
accept_failure, + const char *cmd, + char *result, + size_t ressize, + bool *ret_truncated); +void udev_event_execute_run(UdevEvent *event, usec_t timeout_usec, int timeout_signal); + +static inline usec_t udev_warn_timeout(usec_t timeout_usec) { + return DIV_ROUND_UP(timeout_usec, 3); +} diff --git a/src/udev/udev-trace.h b/src/udev/udev-trace.h new file mode 100644 index 0000000..5e94390 --- /dev/null +++ b/src/udev/udev-trace.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#if HAVE_SYS_SDT_H +#define SDT_USE_VARIADIC +#include + +#include "device-private.h" +#include "device-util.h" +#include "errno-util.h" + +/* Each trace point can have different number of additional arguments. Note that when the macro is used only + * additional arguments are listed in the macro invocation! + * + * Default arguments for each trace point are as follows: + * - arg0 - action + * - arg1 - sysname + * - arg2 - syspath + * - arg3 - subsystem + */ +#define DEVICE_TRACE_POINT(name, dev, ...) \ + do { \ + PROTECT_ERRNO; \ + const char *_n = NULL, *_p = NULL, *_s = NULL; \ + sd_device *_d = (dev); \ + sd_device_action_t _a = _SD_DEVICE_ACTION_INVALID; \ + (void) sd_device_get_action(_d, &_a); \ + (void) sd_device_get_sysname(_d, &_n); \ + (void) sd_device_get_syspath(_d, &_p); \ + (void) sd_device_get_subsystem(_d, &_s); \ + STAP_PROBEV(udev, name, device_action_to_string(_a), _n, _p, _s __VA_OPT__(,) __VA_ARGS__);\ + } while (false); +#else +#define DEVICE_TRACE_POINT(name, dev, ...) ((void) 0) +#endif diff --git a/src/udev/udev-watch.c b/src/udev/udev-watch.c new file mode 100644 index 0000000..58c8279 --- /dev/null +++ b/src/udev/udev-watch.c @@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © 2009 Canonical Ltd. 
+ * Copyright © 2009 Scott James Remnant + */ + +#include + +#include "alloc-util.h" +#include "device-private.h" +#include "device-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "parse-util.h" +#include "rm-rf.h" +#include "stdio-util.h" +#include "string-util.h" +#include "udev-util.h" +#include "udev-watch.h" + +int device_new_from_watch_handle_at(sd_device **ret, int dirfd, int wd) { + char path_wd[STRLEN("/run/udev/watch/") + DECIMAL_STR_MAX(int)]; + _cleanup_free_ char *id = NULL; + int r; + + assert(ret); + + if (wd < 0) + return -EBADF; + + if (dirfd >= 0) { + xsprintf(path_wd, "%d", wd); + r = readlinkat_malloc(dirfd, path_wd, &id); + } else { + xsprintf(path_wd, "/run/udev/watch/%d", wd); + r = readlink_malloc(path_wd, &id); + } + if (r < 0) + return r; + + return sd_device_new_from_device_id(ret, id); +} + +int udev_watch_restore(int inotify_fd) { + _cleanup_closedir_ DIR *dir = NULL; + int r; + + /* Move any old watches directory out of the way, and then restore the watches. */ + + assert(inotify_fd >= 0); + + (void) rm_rf("/run/udev/watch.old", REMOVE_ROOT); + + if (rename("/run/udev/watch", "/run/udev/watch.old") < 0) { + if (errno == ENOENT) + return 0; + + r = log_warning_errno(errno, + "Failed to move watches directory '/run/udev/watch/'. " + "Old watches will not be restored: %m"); + goto finalize; + } + + dir = opendir("/run/udev/watch.old"); + if (!dir) { + r = log_warning_errno(errno, + "Failed to open old watches directory '/run/udev/watch.old/'. " + "Old watches will not be restored: %m"); + goto finalize; + } + + FOREACH_DIRENT_ALL(de, dir, break) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + int wd; + + /* For backward compatibility, read symlink from watch handle to device ID. This is necessary + * when udevd is restarted after upgrading from v248 or older. The new format (ID -> wd) was + * introduced by e7f781e473f5119bf9246208a6de9f6b76a39c5d (v249). 
*/ + + if (dot_or_dot_dot(de->d_name)) + continue; + + if (safe_atoi(de->d_name, &wd) < 0) + continue; + + r = device_new_from_watch_handle_at(&dev, dirfd(dir), wd); + if (r < 0) { + log_full_errno(r == -ENODEV ? LOG_DEBUG : LOG_WARNING, r, + "Failed to create sd_device object from saved watch handle '%i', ignoring: %m", + wd); + continue; + } + + (void) udev_watch_begin(inotify_fd, dev); + } + + r = 0; + +finalize: + (void) rm_rf("/run/udev/watch.old", REMOVE_ROOT); + return r; +} + +static int udev_watch_clear(sd_device *dev, int dirfd, int *ret_wd) { + _cleanup_free_ char *wd_str = NULL, *buf = NULL; + const char *id; + int wd = -1, r; + + assert(dev); + assert(dirfd >= 0); + + r = device_get_device_id(dev, &id); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device ID: %m"); + + /* 1. read symlink ID -> wd */ + r = readlinkat_malloc(dirfd, id, &wd_str); + if (r == -ENOENT) { + if (ret_wd) + *ret_wd = -1; + return 0; + } + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to read symlink '/run/udev/watch/%s': %m", id); + goto finalize; + } + + r = safe_atoi(wd_str, &wd); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to parse watch handle from symlink '/run/udev/watch/%s': %m", id); + goto finalize; + } + + if (wd < 0) { + r = log_device_debug_errno(dev, SYNTHETIC_ERRNO(EBADF), "Invalid watch handle %i.", wd); + goto finalize; + } + + /* 2. read symlink wd -> ID */ + r = readlinkat_malloc(dirfd, wd_str, &buf); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to read symlink '/run/udev/watch/%s': %m", wd_str); + goto finalize; + } + + /* 3. check if the symlink wd -> ID is owned by the device. */ + if (!streq(buf, id)) { + r = log_device_debug_errno(dev, SYNTHETIC_ERRNO(ENOENT), + "Symlink '/run/udev/watch/%s' is owned by another device '%s'.", wd_str, buf); + goto finalize; + } + + /* 4. remove symlink wd -> ID. + * In the above, we already confirmed that the symlink is owned by us. 
Hence, no other workers remove + * the symlink and cannot create a new symlink with the same filename but to a different ID. Hence, + * the removal below is safe even the steps in this function are not atomic. */ + if (unlinkat(dirfd, wd_str, 0) < 0 && errno != ENOENT) + log_device_debug_errno(dev, errno, "Failed to remove '/run/udev/watch/%s', ignoring: %m", wd_str); + + if (ret_wd) + *ret_wd = wd; + r = 0; + +finalize: + /* 5. remove symlink ID -> wd. + * The file is always owned by the device. Hence, it is safe to remove it unconditionally. */ + if (unlinkat(dirfd, id, 0) < 0 && errno != ENOENT) + log_device_debug_errno(dev, errno, "Failed to remove '/run/udev/watch/%s': %m", id); + + return r; +} + +int udev_watch_begin(int inotify_fd, sd_device *dev) { + char wd_str[DECIMAL_STR_MAX(int)]; + _cleanup_close_ int dirfd = -EBADF; + const char *devnode, *id; + int wd, r; + + assert(inotify_fd >= 0); + assert(dev); + + if (device_for_action(dev, SD_DEVICE_REMOVE)) + return 0; + + r = sd_device_get_devname(dev, &devnode); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device node: %m"); + + r = device_get_device_id(dev, &id); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get device ID: %m"); + + r = dirfd = open_mkdir_at(AT_FDCWD, "/run/udev/watch", O_CLOEXEC | O_RDONLY, 0755); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to create and open '/run/udev/watch/': %m"); + + /* 1. Clear old symlinks */ + (void) udev_watch_clear(dev, dirfd, NULL); + + /* 2. Add inotify watch */ + log_device_debug(dev, "Adding watch on '%s'", devnode); + wd = inotify_add_watch(inotify_fd, devnode, IN_CLOSE_WRITE); + if (wd < 0) + return log_device_debug_errno(dev, errno, "Failed to watch device node '%s': %m", devnode); + + xsprintf(wd_str, "%d", wd); + + /* 3. 
Create new symlinks */ + if (symlinkat(wd_str, dirfd, id) < 0) { + r = log_device_debug_errno(dev, errno, "Failed to create symlink '/run/udev/watch/%s' to '%s': %m", id, wd_str); + goto on_failure; + } + + if (symlinkat(id, dirfd, wd_str) < 0) { + /* Possibly, the watch handle is previously assigned to another device, and udev_watch_end() + * is not called for the device yet. */ + r = log_device_debug_errno(dev, errno, "Failed to create symlink '/run/udev/watch/%s' to '%s': %m", wd_str, id); + goto on_failure; + } + + return 0; + +on_failure: + (void) unlinkat(dirfd, id, 0); + (void) inotify_rm_watch(inotify_fd, wd); + return r; +} + +int udev_watch_end(int inotify_fd, sd_device *dev) { + _cleanup_close_ int dirfd = -EBADF; + int wd, r; + + assert(dev); + + /* This may be called by 'udevadm test'. In that case, inotify_fd is not initialized. */ + if (inotify_fd < 0) + return 0; + + if (sd_device_get_devname(dev, NULL) < 0) + return 0; + + dirfd = RET_NERRNO(open("/run/udev/watch", O_CLOEXEC | O_DIRECTORY | O_NOFOLLOW | O_RDONLY)); + if (dirfd == -ENOENT) + return 0; + if (dirfd < 0) + return log_device_debug_errno(dev, dirfd, "Failed to open '/run/udev/watch/': %m"); + + /* First, clear symlinks. */ + r = udev_watch_clear(dev, dirfd, &wd); + if (r < 0) + return r; + + /* Then, remove inotify watch. 
*/ + log_device_debug(dev, "Removing watch handle %i.", wd); + (void) inotify_rm_watch(inotify_fd, wd); + + return 0; +} diff --git a/src/udev/udev-watch.h b/src/udev/udev-watch.h new file mode 100644 index 0000000..c454dee --- /dev/null +++ b/src/udev/udev-watch.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include "sd-device.h" + +int device_new_from_watch_handle_at(sd_device **ret, int dirfd, int wd); +static inline int device_new_from_watch_handle(sd_device **ret, int wd) { + return device_new_from_watch_handle_at(ret, -1, wd); +} + +int udev_watch_restore(int inotify_fd); +int udev_watch_begin(int inotify_fd, sd_device *dev); +int udev_watch_end(int inotify_fd, sd_device *dev); diff --git a/src/udev/udev-worker.c b/src/udev/udev-worker.c new file mode 100644 index 0000000..53722b2 --- /dev/null +++ b/src/udev/udev-worker.c @@ -0,0 +1,352 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "common-signal.h" +#include "device-monitor-private.h" +#include "device-private.h" +#include "device-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "io-util.h" +#include "path-util.h" +#include "process-util.h" +#include "signal-util.h" +#include "string-util.h" +#include "udev-event.h" +#include "udev-spawn.h" +#include "udev-trace.h" +#include "udev-util.h" +#include "udev-watch.h" +#include "udev-worker.h" + +void udev_worker_done(UdevWorker *worker) { + assert(worker); + + sd_event_unref(worker->event); + sd_netlink_unref(worker->rtnl); + sd_device_monitor_unref(worker->monitor); + hashmap_free(worker->properties); + udev_rules_free(worker->rules); + safe_close(worker->pipe_fd); +} + +int udev_get_whole_disk(sd_device *dev, sd_device **ret_device, const char **ret_devname) { + const char *val; + int r; + + assert(dev); + + if (device_for_action(dev, SD_DEVICE_REMOVE)) + goto irrelevant; + + r = 
sd_device_get_sysname(dev, &val); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get sysname: %m"); + + /* Exclude the following devices: + * For "dm-", see the comment added by e918a1b5a94f270186dca59156354acd2a596494. + * For "md", see the commit message of 2e5b17d01347d3c3118be2b8ad63d20415dbb1f0, + * but not sure the assumption is still valid even when partitions are created on the md + * devices, surprisingly which seems to be possible, see PR #22973. + * For "drbd", see the commit message of fee854ee8ccde0cd28e0f925dea18cce35f3993d. */ + if (STARTSWITH_SET(val, "dm-", "md", "drbd")) + goto irrelevant; + + r = block_device_get_whole_disk(dev, &dev); + if (IN_SET(r, + -ENOTBLK, /* The device is not a block device. */ + -ENODEV /* The whole disk device was not found, it may already be removed. */)) + goto irrelevant; + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get whole disk device: %m"); + + r = sd_device_get_devname(dev, &val); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get devname: %m"); + + if (ret_device) + *ret_device = dev; + if (ret_devname) + *ret_devname = val; + return 1; + +irrelevant: + if (ret_device) + *ret_device = NULL; + if (ret_devname) + *ret_devname = NULL; + return 0; +} + +static int worker_lock_whole_disk(sd_device *dev, int *ret_fd) { + _cleanup_close_ int fd = -EBADF; + sd_device *dev_whole_disk; + const char *val; + int r; + + assert(dev); + assert(ret_fd); + + /* Take a shared lock on the device node; this establishes a concept of device "ownership" to + * serialize device access. External processes holding an exclusive lock will cause udev to skip the + * event handling; in the case udev acquired the lock, the external process can block until udev has + * finished its event handling. 
*/ + + r = udev_get_whole_disk(dev, &dev_whole_disk, &val); + if (r < 0) + return r; + if (r == 0) + goto nolock; + + fd = sd_device_open(dev_whole_disk, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) { + bool ignore = ERRNO_IS_DEVICE_ABSENT(fd); + + log_device_debug_errno(dev, fd, "Failed to open '%s'%s: %m", val, ignore ? ", ignoring" : ""); + if (!ignore) + return fd; + + goto nolock; + } + + if (flock(fd, LOCK_SH|LOCK_NB) < 0) + return log_device_debug_errno(dev, errno, "Failed to flock(%s): %m", val); + + *ret_fd = TAKE_FD(fd); + return 1; + +nolock: + *ret_fd = -EBADF; + return 0; +} + +static int worker_mark_block_device_read_only(sd_device *dev) { + _cleanup_close_ int fd = -EBADF; + const char *val; + int state = 1, r; + + assert(dev); + + /* Do this only once, when the block device is new. If the device is later retriggered let's not + * toggle the bit again, so that people can boot up with full read-only mode and then unset the bit + * for specific devices only. */ + if (!device_for_action(dev, SD_DEVICE_ADD)) + return 0; + + r = sd_device_get_subsystem(dev, &val); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get subsystem: %m"); + + if (!streq(val, "block")) + return 0; + + r = sd_device_get_sysname(dev, &val); + if (r < 0) + return log_device_debug_errno(dev, r, "Failed to get sysname: %m"); + + /* Exclude synthetic devices for now, this is supposed to be a safety feature to avoid modification + * of physical devices, and what sits on top of those doesn't really matter if we don't allow the + * underlying block devices to receive changes. 
*/ + if (STARTSWITH_SET(val, "dm-", "md", "drbd", "loop", "nbd", "zram")) + return 0; + + fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return log_device_debug_errno(dev, fd, "Failed to open '%s', ignoring: %m", val); + + if (ioctl(fd, BLKROSET, &state) < 0) + return log_device_warning_errno(dev, errno, "Failed to mark block device '%s' read-only: %m", val); + + log_device_info(dev, "Successfully marked block device '%s' read-only.", val); + return 0; +} + +static int worker_process_device(UdevWorker *worker, sd_device *dev) { + _cleanup_(udev_event_freep) UdevEvent *udev_event = NULL; + _cleanup_close_ int fd_lock = -EBADF; + int r; + + assert(worker); + assert(dev); + + log_device_uevent(dev, "Processing device"); + + udev_event = udev_event_new(dev, worker->exec_delay_usec, worker->rtnl, worker->log_level); + if (!udev_event) + return -ENOMEM; + + /* If this is a block device and the device is locked currently via the BSD advisory locks, + * someone else is using it exclusively. We don't run our udev rules now to not interfere. + * Instead of processing the event, we requeue the event and will try again after a delay. 
+ * + * The user-facing side of this: https://systemd.io/BLOCK_DEVICE_LOCKING */ + r = worker_lock_whole_disk(dev, &fd_lock); + if (r == -EAGAIN) + return EVENT_RESULT_TRY_AGAIN; + if (r < 0) + return r; + + if (worker->blockdev_read_only) + (void) worker_mark_block_device_read_only(dev); + + /* apply rules, create node, symlinks */ + r = udev_event_execute_rules( + udev_event, + worker->inotify_fd, + worker->timeout_usec, + worker->timeout_signal, + worker->properties, + worker->rules); + if (r < 0) + return r; + + udev_event_execute_run(udev_event, worker->timeout_usec, worker->timeout_signal); + + if (!worker->rtnl) + /* in case rtnl was initialized */ + worker->rtnl = sd_netlink_ref(udev_event->rtnl); + + if (udev_event->inotify_watch) { + r = udev_watch_begin(worker->inotify_fd, dev); + if (r < 0 && r != -ENOENT) /* The device may be already removed, ignore -ENOENT. */ + log_device_warning_errno(dev, r, "Failed to add inotify watch, ignoring: %m"); + } + + log_device_uevent(dev, "Device processed"); + return 0; +} + +void udev_broadcast_result(sd_device_monitor *monitor, sd_device *dev, EventResult result) { + int r; + + assert(dev); + + /* On exit, manager->monitor is already NULL. */ + if (!monitor) + return; + + if (result != EVENT_RESULT_SUCCESS) { + (void) device_add_property(dev, "UDEV_WORKER_FAILED", "1"); + + switch (result) { + case EVENT_RESULT_NERRNO_MIN ... EVENT_RESULT_NERRNO_MAX: { + const char *str; + + (void) device_add_propertyf(dev, "UDEV_WORKER_ERRNO", "%i", -result); + + str = errno_to_name(result); + if (str) + (void) device_add_property(dev, "UDEV_WORKER_ERRNO_NAME", str); + break; + } + case EVENT_RESULT_EXIT_STATUS_BASE ... EVENT_RESULT_EXIT_STATUS_MAX: + (void) device_add_propertyf(dev, "UDEV_WORKER_EXIT_STATUS", "%i", result - EVENT_RESULT_EXIT_STATUS_BASE); + break; + + case EVENT_RESULT_TRY_AGAIN: + assert_not_reached(); + break; + + case EVENT_RESULT_SIGNAL_BASE ... 
EVENT_RESULT_SIGNAL_MAX: { + const char *str; + + (void) device_add_propertyf(dev, "UDEV_WORKER_SIGNAL", "%i", result - EVENT_RESULT_SIGNAL_BASE); + + str = signal_to_string(result - EVENT_RESULT_SIGNAL_BASE); + if (str) + (void) device_add_property(dev, "UDEV_WORKER_SIGNAL_NAME", str); + break; + } + default: + log_device_warning(dev, "Unknown event result \"%i\", ignoring.", result); + } + } + + r = device_monitor_send_device(monitor, NULL, dev); + if (r < 0) + log_device_warning_errno(dev, r, + "Failed to broadcast event to libudev listeners, ignoring: %m"); +} + +static int worker_send_result(UdevWorker *worker, EventResult result) { + assert(worker); + assert(worker->pipe_fd >= 0); + + return loop_write(worker->pipe_fd, &result, sizeof(result)); +} + +static int worker_device_monitor_handler(sd_device_monitor *monitor, sd_device *dev, void *userdata) { + UdevWorker *worker = ASSERT_PTR(userdata); + int r; + + assert(dev); + + r = worker_process_device(worker, dev); + if (r == EVENT_RESULT_TRY_AGAIN) + /* if we couldn't acquire the flock(), then requeue the event */ + log_device_debug(dev, "Block device is currently locked, requeueing the event."); + else { + if (r < 0) + log_device_warning_errno(dev, r, "Failed to process device, ignoring: %m"); + + /* send processed event back to libudev listeners */ + udev_broadcast_result(monitor, dev, r); + } + + /* send udevd the result of the event execution */ + r = worker_send_result(worker, r); + if (r < 0) + log_device_warning_errno(dev, r, "Failed to send signal to main daemon, ignoring: %m"); + + /* Reset the log level, as it might be changed by "OPTIONS=log_level=". 
*/ + log_set_max_level(worker->log_level); + + return 1; +} + +int udev_worker_main(UdevWorker *worker, sd_device *dev) { + int r; + + assert(worker); + assert(worker->monitor); + assert(dev); + + DEVICE_TRACE_POINT(worker_spawned, dev, getpid_cached()); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, -1) >= 0); + + /* Reset OOM score, we only protect the main daemon. */ + r = set_oom_score_adjust(0); + if (r < 0) + log_debug_errno(r, "Failed to reset OOM score, ignoring: %m"); + + r = sd_event_new(&worker->event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = sd_event_add_signal(worker->event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to set SIGTERM event: %m"); + + r = sd_device_monitor_attach_event(worker->monitor, worker->event); + if (r < 0) + return log_error_errno(r, "Failed to attach event loop to device monitor: %m"); + + r = sd_device_monitor_start(worker->monitor, worker_device_monitor_handler, worker); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + + /* Process first device */ + (void) worker_device_monitor_handler(worker->monitor, dev, worker); + + r = sd_event_loop(worker->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} diff --git a/src/udev/udev-worker.h b/src/udev/udev-worker.h new file mode 100644 index 0000000..05c319e --- /dev/null +++ b/src/udev/udev-worker.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include + +#include "sd-device.h" +#include "sd-event.h" +#include "sd-netlink.h" + +#include "errno-list.h" +#include "hashmap.h" +#include "time-util.h" + +typedef struct UdevRules UdevRules; + +typedef struct UdevWorker { + sd_event *event; + sd_netlink *rtnl; + sd_device_monitor *monitor; + + Hashmap *properties; + UdevRules *rules; + + int pipe_fd; + int inotify_fd; /* Do not close! 
*/ + + usec_t exec_delay_usec; + usec_t timeout_usec; + int timeout_signal; + int log_level; + bool blockdev_read_only; +} UdevWorker; + +/* passed from worker to main process */ +typedef enum EventResult { + EVENT_RESULT_NERRNO_MIN = -ERRNO_MAX, + EVENT_RESULT_NERRNO_MAX = -1, + EVENT_RESULT_SUCCESS = 0, + EVENT_RESULT_EXIT_STATUS_BASE = 0, + EVENT_RESULT_EXIT_STATUS_MAX = 255, + EVENT_RESULT_TRY_AGAIN = 256, /* when the block device is locked by another process. */ + EVENT_RESULT_SIGNAL_BASE = 257, + EVENT_RESULT_SIGNAL_MAX = EVENT_RESULT_SIGNAL_BASE + _NSIG, + _EVENT_RESULT_MAX, + _EVENT_RESULT_INVALID = -EINVAL, +} EventResult; + +void udev_worker_done(UdevWorker *worker); +int udev_worker_main(UdevWorker *worker, sd_device *dev); + +void udev_broadcast_result(sd_device_monitor *monitor, sd_device *dev, EventResult result); +int udev_get_whole_disk(sd_device *dev, sd_device **ret_device, const char **ret_devname); diff --git a/src/udev/udev.conf b/src/udev/udev.conf new file mode 100644 index 0000000..07d7f0c --- /dev/null +++ b/src/udev/udev.conf @@ -0,0 +1,11 @@ +# see udev.conf(5) for details +# +# udevd is also started in the initrd. When this file is modified you might +# also want to rebuild the initrd, so that it will include the modified configuration. + +#udev_log=info +#children_max= +#exec_delay= +#event_timeout=180 +#timeout_signal=SIGKILL +#resolve_names=early diff --git a/src/udev/udev.pc.in b/src/udev/udev.pc.in new file mode 100644 index 0000000..cbf7693 --- /dev/null +++ b/src/udev/udev.pc.in @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +Name: udev +Description: udev +Version: {{PROJECT_VERSION}} + +udev_dir={{UDEVLIBEXECDIR}} +udevdir=${udev_dir} diff --git a/src/udev/udevadm-control.c b/src/udev/udevadm-control.c new file mode 100644 index 0000000..64615f5 --- /dev/null +++ b/src/udev/udevadm-control.c @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "parse-util.h" +#include "process-util.h" +#include "static-destruct.h" +#include "strv.h" +#include "syslog-util.h" +#include "time-util.h" +#include "udevadm.h" +#include "udev-ctrl.h" +#include "virt.h" + +static char **arg_env = NULL; +static usec_t arg_timeout = 60 * USEC_PER_SEC; +static bool arg_ping = false; +static bool arg_reload = false; +static bool arg_exit = false; +static int arg_max_children = -1; +static int arg_log_level = -1; +static int arg_start_exec_queue = -1; + +STATIC_DESTRUCTOR_REGISTER(arg_env, strv_freep); + +static int help(void) { + printf("%s control OPTION\n\n" + "Control the udev daemon.\n\n" + " -h --help Show this help\n" + " -V --version Show package version\n" + " -e --exit Instruct the daemon to cleanup and exit\n" + " -l --log-level=LEVEL Set the udev log level for the daemon\n" + " -s --stop-exec-queue Do not execute events, queue only\n" + " -S --start-exec-queue Execute events, flush queue\n" + " -R --reload Reload rules and databases\n" + " -p --property=KEY=VALUE Set a global property for all 
events\n" + " -m --children-max=N Maximum number of children\n" + " --ping Wait for udev to respond to a ping message\n" + " -t --timeout=SECONDS Maximum time to block for a reply\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_PING = 0x100, + }; + + static const struct option options[] = { + { "exit", no_argument, NULL, 'e' }, + { "log-level", required_argument, NULL, 'l' }, + { "log-priority", required_argument, NULL, 'l' }, /* for backward compatibility */ + { "stop-exec-queue", no_argument, NULL, 's' }, + { "start-exec-queue", no_argument, NULL, 'S' }, + { "reload", no_argument, NULL, 'R' }, + { "reload-rules", no_argument, NULL, 'R' }, /* alias for -R */ + { "property", required_argument, NULL, 'p' }, + { "env", required_argument, NULL, 'p' }, /* alias for -p */ + { "children-max", required_argument, NULL, 'm' }, + { "ping", no_argument, NULL, ARG_PING }, + { "timeout", required_argument, NULL, 't' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + if (argc <= 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This command expects one or more options."); + + while ((c = getopt_long(argc, argv, "el:sSRp:m:t:Vh", options, NULL)) >= 0) + switch (c) { + + case 'e': + arg_exit = true; + break; + + case 'l': + arg_log_level = log_level_from_string(optarg); + if (arg_log_level < 0) + return log_error_errno(arg_log_level, "Failed to parse log level '%s': %m", optarg); + break; + + case 's': + arg_start_exec_queue = false; + break; + + case 'S': + arg_start_exec_queue = true; + break; + + case 'R': + arg_reload = true; + break; + + case 'p': + if (!strchr(optarg, '=')) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "expect = instead of '%s'", optarg); + + r = strv_extend(&arg_env, optarg); + if (r < 0) + return log_error_errno(r, "Failed to extend environment: %m"); + + break; + + case 'm': { 
+ unsigned i; + r = safe_atou(optarg, &i); + if (r < 0) + return log_error_errno(r, "Failed to parse maximum number of children '%s': %m", optarg); + arg_max_children = i; + break; + } + + case ARG_PING: + arg_ping = true; + break; + + case 't': + r = parse_sec(optarg, &arg_timeout); + if (r < 0) + return log_error_errno(r, "Failed to parse timeout value '%s': %m", optarg); + break; + + case 'V': + return print_version(); + + case 'h': + return help(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Extraneous argument: %s", argv[optind]); + + return 1; +} + +int control_main(int argc, char *argv[], void *userdata) { + _cleanup_(udev_ctrl_unrefp) UdevCtrl *uctrl = NULL; + int r; + + if (running_in_chroot() > 0) { + log_info("Running in chroot, ignoring request."); + return 0; + } + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = udev_ctrl_new(&uctrl); + if (r < 0) + return log_error_errno(r, "Failed to initialize udev control: %m"); + + if (arg_exit) { + r = udev_ctrl_send_exit(uctrl); + if (r < 0) + return log_error_errno(r, "Failed to send exit request: %m"); + return 0; + } + + if (arg_log_level >= 0) { + r = udev_ctrl_send_set_log_level(uctrl, arg_log_level); + if (r < 0) + return log_error_errno(r, "Failed to send request to set log level: %m"); + } + + if (arg_start_exec_queue == false) { + r = udev_ctrl_send_stop_exec_queue(uctrl); + if (r < 0) + return log_error_errno(r, "Failed to send request to stop exec queue: %m"); + } + + if (arg_start_exec_queue == true) { + r = udev_ctrl_send_start_exec_queue(uctrl); + if (r < 0) + return log_error_errno(r, "Failed to send request to start exec queue: %m"); + } + + if (arg_reload) { + r = udev_ctrl_send_reload(uctrl); + if (r < 0) + return log_error_errno(r, "Failed to send reload request: %m"); + } + + STRV_FOREACH(env, arg_env) { + r = udev_ctrl_send_set_env(uctrl, *env); + if (r < 0) + return 
log_error_errno(r, "Failed to send request to update environment: %m"); + } + + if (arg_max_children >= 0) { + r = udev_ctrl_send_set_children_max(uctrl, arg_max_children); + if (r < 0) + return log_error_errno(r, "Failed to send request to set number of children: %m"); + } + + if (arg_ping) { + r = udev_ctrl_send_ping(uctrl); + if (r < 0) + return log_error_errno(r, "Failed to send a ping message: %m"); + } + + r = udev_ctrl_wait(uctrl, arg_timeout); + if (r < 0) + return log_error_errno(r, "Failed to wait for daemon to reply: %m"); + + return 0; +} diff --git a/src/udev/udevadm-hwdb.c b/src/udev/udevadm-hwdb.c new file mode 100644 index 0000000..2f5429f --- /dev/null +++ b/src/udev/udevadm-hwdb.c @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "hwdb-util.h" +#include "udevadm.h" + +static const char *arg_test = NULL; +static const char *arg_root = NULL; +static const char *arg_hwdb_bin_dir = NULL; +static bool arg_update = false; +static bool arg_strict = false; + +static int help(void) { + printf("%s hwdb [OPTIONS]\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n" + " -u --update Update the hardware database\n" + " -s --strict When updating, return non-zero exit value on any parsing error\n" + " --usr Generate in " UDEVLIBEXECDIR " instead of /etc/udev\n" + " -t --test=MODALIAS Query database and print result\n" + " -r --root=PATH Alternative root path in the filesystem\n\n" + "NOTE:\n" + "The sub-command 'hwdb' is deprecated, and is left for backwards compatibility.\n" + "Please use systemd-hwdb instead.\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_USR = 0x100, + }; + + static const struct option options[] = { + { "update", no_argument, NULL, 'u' }, + { "usr", no_argument, NULL, ARG_USR }, + { "strict", no_argument, NULL, 's' }, + { "test", required_argument, NULL, 't' }, + { "root", required_argument, 
NULL, 'r' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + {} + }; + + int c; + + while ((c = getopt_long(argc, argv, "ust:r:Vh", options, NULL)) >= 0) + switch (c) { + case 'u': + arg_update = true; + break; + case ARG_USR: + arg_hwdb_bin_dir = UDEVLIBEXECDIR; + break; + case 's': + arg_strict = true; + break; + case 't': + arg_test = optarg; + break; + case 'r': + arg_root = optarg; + break; + case 'V': + return print_version(); + case 'h': + return help(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + return 1; +} + +int hwdb_main(int argc, char *argv[], void *userdata) { + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (!arg_update && !arg_test) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Either --update or --test must be used."); + + log_notice("udevadm hwdb is deprecated. Use systemd-hwdb instead."); + + if (arg_update) { + r = hwdb_update(arg_root, arg_hwdb_bin_dir, arg_strict, true); + if (r < 0) + return r; + } + + if (arg_test) + return hwdb_query(arg_test, NULL); + + return 0; +} diff --git a/src/udev/udevadm-info.c b/src/udev/udevadm-info.c new file mode 100644 index 0000000..4cd9ad4 --- /dev/null +++ b/src/udev/udevadm-info.c @@ -0,0 +1,1120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-device.h" + +#include "alloc-util.h" +#include "device-enumerator-private.h" +#include "device-private.h" +#include "device-util.h" +#include "devnum-util.h" +#include "dirent-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "glyph-util.h" +#include "json.h" +#include "pager.h" +#include "parse-argument.h" +#include "sort-util.h" +#include "static-destruct.h" +#include "string-table.h" +#include "string-util.h" +#include "terminal-util.h" +#include "udev-util.h" +#include "udevadm.h" +#include "udevadm-util.h" + +typedef enum ActionType 
{ + ACTION_QUERY, + ACTION_ATTRIBUTE_WALK, + ACTION_DEVICE_ID_FILE, + ACTION_TREE, + ACTION_EXPORT, +} ActionType; + +typedef enum QueryType { + QUERY_NAME, + QUERY_PATH, + QUERY_SYMLINK, + QUERY_PROPERTY, + QUERY_ALL, +} QueryType; + +static char **arg_properties = NULL; +static bool arg_root = false; +static bool arg_export = false; +static bool arg_value = false; +static const char *arg_export_prefix = NULL; +static usec_t arg_wait_for_initialization_timeout = 0; +PagerFlags arg_pager_flags = 0; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; + +/* Put a limit on --tree descent level to not exhaust our stack */ +#define TREE_DEPTH_MAX 64 + +static bool skip_attribute(const char *name) { + assert(name); + + /* Those are either displayed separately or should not be shown at all. */ + return STR_IN_SET(name, + "uevent", + "dev", + "modalias", + "resource", + "driver", + "subsystem", + "module"); +} + +typedef struct SysAttr { + const char *name; + const char *value; +} SysAttr; + +STATIC_DESTRUCTOR_REGISTER(arg_properties, strv_freep); + +static int sysattr_compare(const SysAttr *a, const SysAttr *b) { + assert(a); + assert(b); + + return strcmp(a->name, b->name); +} + +static int print_all_attributes(sd_device *device, bool is_parent) { + _cleanup_free_ SysAttr *sysattrs = NULL; + const char *value; + size_t n_items = 0; + int r; + + assert(device); + + value = NULL; + (void) sd_device_get_devpath(device, &value); + printf(" looking at %sdevice '%s':\n", is_parent ? "parent " : "", strempty(value)); + + value = NULL; + (void) sd_device_get_sysname(device, &value); + printf(" %s==\"%s\"\n", is_parent ? "KERNELS" : "KERNEL", strempty(value)); + + value = NULL; + (void) sd_device_get_subsystem(device, &value); + printf(" %s==\"%s\"\n", is_parent ? "SUBSYSTEMS" : "SUBSYSTEM", strempty(value)); + + value = NULL; + (void) sd_device_get_driver(device, &value); + printf(" %s==\"%s\"\n", is_parent ? 
"DRIVERS" : "DRIVER", strempty(value)); + + FOREACH_DEVICE_SYSATTR(device, name) { + size_t len; + + if (skip_attribute(name)) + continue; + + r = sd_device_get_sysattr_value(device, name, &value); + if (r >= 0) { + /* skip any values that look like a path */ + if (value[0] == '/') + continue; + + /* skip nonprintable attributes */ + len = strlen(value); + while (len > 0 && isprint((unsigned char) value[len-1])) + len--; + if (len > 0) + continue; + + } else if (ERRNO_IS_PRIVILEGE(r)) + value = "(not readable)"; + else + continue; + + if (!GREEDY_REALLOC(sysattrs, n_items + 1)) + return log_oom(); + + sysattrs[n_items] = (SysAttr) { + .name = name, + .value = value, + }; + n_items++; + } + + typesafe_qsort(sysattrs, n_items, sysattr_compare); + + for (size_t i = 0; i < n_items; i++) + printf(" %s{%s}==\"%s\"\n", is_parent ? "ATTRS" : "ATTR", sysattrs[i].name, sysattrs[i].value); + + puts(""); + + return 0; +} + +static int print_device_chain(sd_device *device) { + sd_device *child, *parent; + int r; + + assert(device); + + printf("\n" + "Udevadm info starts with the device specified by the devpath and then\n" + "walks up the chain of parent devices. It prints for every device\n" + "found, all possible attributes in the udev rules key format.\n" + "A rule to match, can be composed by the attributes of the device\n" + "and the attributes from one single parent device.\n" + "\n"); + + r = print_all_attributes(device, false); + if (r < 0) + return r; + + for (child = device; sd_device_get_parent(child, &parent) >= 0; child = parent) { + r = print_all_attributes(parent, true); + if (r < 0) + return r; + } + + return 0; +} + +static int print_record(sd_device *device, const char *prefix) { + const char *str, *subsys; + dev_t devnum; + uint64_t q; + int i, ifi; + + assert(device); + + prefix = strempty(prefix); + + /* We don't show syspath here, because it's identical to devpath (modulo the "/sys" prefix). 
+ * + * We don't show action/seqnum here because that only makes sense for records synthesized from + * uevents, not for those synthesized from database entries. + * + * We don't show sysattrs here, because they can be expensive and potentially issue expensive driver + * IO. + * + * Coloring: let's be conservative with coloring. Let's use it to group related fields. Right now: + * + * • white for fields that give the device a name + * • green for fields that categorize the device into subsystem/devtype and similar + * • cyan for fields about associated device nodes/symlinks/network interfaces and such + * • magenta for block device diskseq + * • yellow for driver info + * • no color for regular properties */ + + assert_se(sd_device_get_devpath(device, &str) >= 0); + printf("%sP: %s%s%s\n", prefix, ansi_highlight_white(), str, ansi_normal()); + + if (sd_device_get_sysname(device, &str) >= 0) + printf("%sM: %s%s%s\n", prefix, ansi_highlight_white(), str, ansi_normal()); + + if (sd_device_get_sysnum(device, &str) >= 0) + printf("%sR: %s%s%s\n", prefix, ansi_highlight_white(), str, ansi_normal()); + + if (sd_device_get_subsystem(device, &subsys) >= 0) + printf("%sU: %s%s%s\n", prefix, ansi_highlight_green(), subsys, ansi_normal()); + + if (sd_device_get_devtype(device, &str) >= 0) + printf("%sT: %s%s%s\n", prefix, ansi_highlight_green(), str, ansi_normal()); + + if (sd_device_get_devnum(device, &devnum) >= 0) + printf("%sD: %s%c %u:%u%s\n", + prefix, + ansi_highlight_cyan(), + streq_ptr(subsys, "block") ? 
'b' : 'c', major(devnum), minor(devnum), + ansi_normal()); + + if (sd_device_get_ifindex(device, &ifi) >= 0) + printf("%sI: %s%i%s\n", prefix, ansi_highlight_cyan(), ifi, ansi_normal()); + + if (sd_device_get_devname(device, &str) >= 0) { + const char *val; + + assert_se(val = path_startswith(str, "/dev/")); + printf("%sN: %s%s%s\n", prefix, ansi_highlight_cyan(), val, ansi_normal()); + + if (device_get_devlink_priority(device, &i) >= 0) + printf("%sL: %s%i%s\n", prefix, ansi_highlight_cyan(), i, ansi_normal()); + + FOREACH_DEVICE_DEVLINK(device, link) { + assert_se(val = path_startswith(link, "/dev/")); + printf("%sS: %s%s%s\n", prefix, ansi_highlight_cyan(), val, ansi_normal()); + } + } + + if (sd_device_get_diskseq(device, &q) >= 0) + printf("%sQ: %s%" PRIu64 "%s\n", prefix, ansi_highlight_magenta(), q, ansi_normal()); + + if (sd_device_get_driver(device, &str) >= 0) + printf("%sV: %s%s%s\n", prefix, ansi_highlight_yellow4(), str, ansi_normal()); + + FOREACH_DEVICE_PROPERTY(device, key, val) + printf("%sE: %s=%s\n", prefix, key, val); + + if (isempty(prefix)) + puts(""); + return 0; +} + +static int record_to_json(sd_device *device, JsonVariant **ret) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + const char *str; + int r; + + assert(device); + assert(ret); + + /* We don't show any shorthand fields here as done in print_record() except for SYSNAME and SYSNUM as + * all the other ones have a matching property which will already be included. 
*/ + + if (sd_device_get_sysname(device, &str) >= 0) { + r = json_variant_set_field_string(&v, "SYSNAME", str); + if (r < 0) + return r; + } + + if (sd_device_get_sysnum(device, &str) >= 0) { + r = json_variant_set_field_string(&v, "SYSNUM", str); + if (r < 0) + return r; + } + + FOREACH_DEVICE_PROPERTY(device, key, val) { + r = json_variant_set_field_string(&v, key, val); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(v); + return 0; +} + +static int stat_device(const char *name, bool export, const char *prefix) { + struct stat statbuf; + + assert(name); + + if (stat(name, &statbuf) != 0) + return -errno; + + if (export) { + if (!prefix) + prefix = "INFO_"; + printf("%sMAJOR=%u\n" + "%sMINOR=%u\n", + prefix, major(statbuf.st_dev), + prefix, minor(statbuf.st_dev)); + } else + printf("%u:%u\n", major(statbuf.st_dev), minor(statbuf.st_dev)); + return 0; +} + +static int export_devices(sd_device_enumerator *e) { + sd_device *d; + int r; + + assert(e); + + r = device_enumerator_scan_devices(e); + if (r < 0) + return log_error_errno(r, "Failed to scan devices: %m"); + + pager_open(arg_pager_flags); + + FOREACH_DEVICE_AND_SUBSYSTEM(e, d) + if (arg_json_format_flags & JSON_FORMAT_OFF) + (void) print_record(d, NULL); + else { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + r = record_to_json(d, &v); + if (r < 0) + return r; + + (void) json_variant_dump(v, arg_json_format_flags, stdout, NULL); + } + + return 0; +} + +static void cleanup_dir(DIR *dir, mode_t mask, int depth) { + assert(dir); + + if (depth <= 0) + return; + + FOREACH_DIRENT_ALL(dent, dir, break) { + struct stat stats; + + if (dot_or_dot_dot(dent->d_name)) + continue; + if (fstatat(dirfd(dir), dent->d_name, &stats, AT_SYMLINK_NOFOLLOW) < 0) + continue; + if ((stats.st_mode & mask) != 0) + continue; + if (S_ISDIR(stats.st_mode)) { + _cleanup_closedir_ DIR *subdir = NULL; + + subdir = xopendirat(dirfd(dir), dent->d_name, O_NOFOLLOW); + if (!subdir) + log_debug_errno(errno, "Failed to open 
subdirectory '%s', ignoring: %m", dent->d_name); + else + cleanup_dir(subdir, mask, depth-1); + + (void) unlinkat(dirfd(dir), dent->d_name, AT_REMOVEDIR); + } else + (void) unlinkat(dirfd(dir), dent->d_name, 0); + } +} + +/* + * Assume that dir is a directory with file names matching udev data base + * entries for devices in /run/udev/data (such as "b8:16"), and removes + * all files except those that haven't been deleted in /run/udev/data + * (i.e. they were skipped during db cleanup because of the db_persist flag). + */ +static void cleanup_dir_after_db_cleanup(DIR *dir, DIR *datadir) { + assert(dir); + assert(datadir); + + FOREACH_DIRENT_ALL(dent, dir, break) { + if (dot_or_dot_dot(dent->d_name)) + continue; + + if (faccessat(dirfd(datadir), dent->d_name, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) + /* The corresponding udev database file still exists. + * Assuming the persistent flag is set for the database. */ + continue; + + (void) unlinkat(dirfd(dir), dent->d_name, 0); + } +} + +static void cleanup_dirs_after_db_cleanup(DIR *dir, DIR *datadir) { + assert(dir); + assert(datadir); + + FOREACH_DIRENT_ALL(dent, dir, break) { + struct stat stats; + + if (dot_or_dot_dot(dent->d_name)) + continue; + if (fstatat(dirfd(dir), dent->d_name, &stats, AT_SYMLINK_NOFOLLOW) < 0) + continue; + if (S_ISDIR(stats.st_mode)) { + _cleanup_closedir_ DIR *subdir = NULL; + + subdir = xopendirat(dirfd(dir), dent->d_name, O_NOFOLLOW); + if (!subdir) + log_debug_errno(errno, "Failed to open subdirectory '%s', ignoring: %m", dent->d_name); + else + cleanup_dir_after_db_cleanup(subdir, datadir); + + (void) unlinkat(dirfd(dir), dent->d_name, AT_REMOVEDIR); + } else + (void) unlinkat(dirfd(dir), dent->d_name, 0); + } +} + +static void cleanup_db(void) { + _cleanup_closedir_ DIR *dir1 = NULL, *dir2 = NULL, *dir3 = NULL, *dir4 = NULL; + + dir1 = opendir("/run/udev/data"); + if (dir1) + cleanup_dir(dir1, S_ISVTX, 1); + + dir2 = opendir("/run/udev/links"); + if (dir2) + 
cleanup_dirs_after_db_cleanup(dir2, dir1); + + dir3 = opendir("/run/udev/tags"); + if (dir3) + cleanup_dirs_after_db_cleanup(dir3, dir1); + + dir4 = opendir("/run/udev/static_node-tags"); + if (dir4) + cleanup_dir(dir4, 0, 2); + + /* Do not remove /run/udev/watch. It will be handled by udevd well on restart. + * And should not be removed by external program when udevd is running. */ +} + +static int query_device(QueryType query, sd_device* device) { + int r; + + assert(device); + + switch (query) { + case QUERY_NAME: { + const char *node; + + r = sd_device_get_devname(device, &node); + if (r < 0) + return log_error_errno(r, "No device node found: %m"); + + if (!arg_root) + assert_se(node = path_startswith(node, "/dev/")); + printf("%s\n", node); + return 0; + } + + case QUERY_SYMLINK: { + const char *prefix = ""; + + FOREACH_DEVICE_DEVLINK(device, devlink) { + if (!arg_root) + assert_se(devlink = path_startswith(devlink, "/dev/")); + printf("%s%s", prefix, devlink); + prefix = " "; + } + puts(""); + return 0; + } + + case QUERY_PATH: { + const char *devpath; + + r = sd_device_get_devpath(device, &devpath); + if (r < 0) + return log_error_errno(r, "Failed to get device path: %m"); + + printf("%s\n", devpath); + return 0; + } + + case QUERY_PROPERTY: + FOREACH_DEVICE_PROPERTY(device, key, value) { + if (arg_properties && !strv_contains(arg_properties, key)) + continue; + + if (arg_export) + printf("%s%s='%s'\n", strempty(arg_export_prefix), key, value); + else if (arg_value) + printf("%s\n", value); + else + printf("%s=%s\n", key, value); + } + + return 0; + + case QUERY_ALL: + if (arg_json_format_flags & JSON_FORMAT_OFF) + return print_record(device, NULL); + else { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + r = record_to_json(device, &v); + if (r < 0) + return r; + + (void) json_variant_dump(v, arg_json_format_flags, stdout, NULL); + } + + return 0; + + default: + assert_not_reached(); + } +} + +static int help(void) { + printf("%s info [OPTIONS] 
[DEVPATH|FILE]\n\n" + "Query sysfs or the udev database.\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n" + " -q --query=TYPE Query device information:\n" + " name Name of device node\n" + " symlink Pointing to node\n" + " path sysfs device path\n" + " property The device properties\n" + " all All values\n" + " --property=NAME Show only properties by this name\n" + " --value When showing properties, print only their values\n" + " -p --path=SYSPATH sysfs device path used for query or attribute walk\n" + " -n --name=NAME Node or symlink name used for query or attribute walk\n" + " -r --root Prepend dev directory to path names\n" + " -a --attribute-walk Print all key matches walking along the chain\n" + " of parent devices\n" + " -t --tree Show tree of devices\n" + " -d --device-id-of-file=FILE Print major:minor of device containing this file\n" + " -x --export Export key/value pairs\n" + " -P --export-prefix Export the key name with a prefix\n" + " -e --export-db Export the content of the udev database\n" + " -c --cleanup-db Clean up the udev database\n" + " -w --wait-for-initialization[=SECONDS]\n" + " Wait for device to be initialized\n" + " --no-pager Do not pipe output into a pager\n" + " --json=pretty|short|off Generate JSON output\n" + " --subsystem-match=SUBSYSTEM\n" + " Query devices matching a subsystem\n" + " --subsystem-nomatch=SUBSYSTEM\n" + " Query devices not matching a subsystem\n" + " --attr-match=FILE[=VALUE]\n" + " Query devices that match an attribute\n" + " --attr-nomatch=FILE[=VALUE]\n" + " Query devices that do not match an attribute\n" + " --property-match=KEY=VALUE\n" + " Query devices with matching properties\n" + " --tag-match=TAG Query devices with a matching tag\n" + " --sysname-match=NAME Query devices with this /sys path\n" + " --name-match=NAME Query devices with this /dev name\n" + " --parent-match=NAME Query devices with this parent device\n" + " --initialized-match Query devices that are 
already initialized\n" + " --initialized-nomatch Query devices that are not initialized yet\n", + program_invocation_short_name); + + return 0; +} + +static int draw_tree( + sd_device *parent, + sd_device *const array[], size_t n, + const char *prefix, + unsigned level); + +static int output_tree_device( + sd_device *device, + const char *str, + const char *prefix, + bool more, + sd_device *const array[], size_t n, + unsigned level) { + + _cleanup_free_ char *subprefix = NULL, *subsubprefix = NULL; + + assert(device); + assert(str); + + prefix = strempty(prefix); + + printf("%s%s%s\n", prefix, special_glyph(more ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT), str); + + subprefix = strjoin(prefix, special_glyph(more ? SPECIAL_GLYPH_TREE_VERTICAL : SPECIAL_GLYPH_TREE_SPACE)); + if (!subprefix) + return log_oom(); + + subsubprefix = strjoin(subprefix, special_glyph(SPECIAL_GLYPH_VERTICAL_DOTTED), " "); + if (!subsubprefix) + return log_oom(); + + (void) print_record(device, subsubprefix); + + return draw_tree(device, array, n, subprefix, level + 1); +} + +static int draw_tree( + sd_device *parent, + sd_device *const array[], size_t n, + const char *prefix, + unsigned level) { + + const char *parent_path; + size_t i = 0; + int r; + + if (n == 0) + return 0; + + assert(array); + + if (parent) { + r = sd_device_get_devpath(parent, &parent_path); + if (r < 0) + return log_error_errno(r, "Failed to get sysfs path of parent device: %m"); + } else + parent_path = NULL; + + if (level > TREE_DEPTH_MAX) { + log_warning("Eliding tree below '%s', too deep.", strna(parent_path)); + return 0; + } + + while (i < n) { + sd_device *device = array[i]; + const char *device_path, *str; + bool more = false; + size_t j; + + r = sd_device_get_devpath(device, &device_path); + if (r < 0) + return log_error_errno(r, "Failed to get sysfs path of enumerated device: %m"); + + /* Scan through the subsequent devices looking children of the device we are looking at. 
*/ + for (j = i + 1; j < n; j++) { + sd_device *next = array[j]; + const char *next_path; + + r = sd_device_get_devpath(next, &next_path); + if (r < 0) + return log_error_errno(r, "Failed to get sysfs of child device: %m"); + + if (!path_startswith(next_path, device_path)) { + more = !parent_path || path_startswith(next_path, parent_path); + break; + } + } + + /* Determine the string to display for this node. If we are at the top of the tree, the full + * device path so far, otherwise just the part suffixing the parent's device path. */ + str = parent ? ASSERT_PTR(path_startswith(device_path, parent_path)) : device_path; + + r = output_tree_device(device, str, prefix, more, array + i + 1, j - i - 1, level); + if (r < 0) + return r; + + i = j; + } + + return 0; +} + +static int print_tree(sd_device* below) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *below_path; + sd_device **array; + size_t n = 0; + int r; + + if (below) { + r = sd_device_get_devpath(below, &below_path); + if (r < 0) + return log_error_errno(r, "Failed to get sysfs path of device: %m"); + + } else + below_path = NULL; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return log_error_errno(r, "Failed to allocate device enumerator: %m"); + + if (below) { + r = sd_device_enumerator_add_match_parent(e, below); + if (r < 0) + return log_error_errno(r, "Failed to install parent enumerator match: %m"); + } + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return log_error_errno(r, "Failed to enable enumeration of uninitialized devices: %m"); + + r = device_enumerator_scan_devices_and_subsystems(e); + if (r < 0) + return log_error_errno(r, "Failed to scan for devices and subsystems: %m"); + + if (below) { + /* This must be called after device_enumerator_scan_devices_and_subsystems(). 
*/ + r = device_enumerator_add_parent_devices(e, below); + if (r < 0) + return log_error_errno(r, "Failed to add parent devices: %m"); + } + + assert_se(array = device_enumerator_get_devices(e, &n)); + + if (n == 0) { + log_info("No items."); + return 0; + } + + r = draw_tree(NULL, array, n, NULL, 0); + if (r < 0) + return r; + + printf("\n%zu items shown.\n", n); + return 0; +} + +static int ensure_device_enumerator(sd_device_enumerator **e) { + int r; + + assert(e); + + if (*e) + return 0; + + r = sd_device_enumerator_new(e); + if (r < 0) + return log_error_errno(r, "Failed to create device enumerator: %m"); + + r = sd_device_enumerator_allow_uninitialized(*e); + if (r < 0) + return log_error_errno(r, "Failed to allow uninitialized devices: %m"); + + return 0; +} + +/* Splits a "KEY=VALUE" command line argument at the first '='. On success the strings + * previously stored in *key and *value are replaced by newly allocated copies and 0 is + * returned. On failure a negative errno-style value is returned after logging, and *key + * and *value are left untouched. */ +static int parse_key_value_argument(const char *s, char **key, char **value) { + _cleanup_free_ char *k = NULL, *v = NULL; + const char *p = s; + int r; + + assert(s); + assert(key); + assert(value); + + /* extract_many_words() is handed the address of its input pointer and moves it forward while + * parsing. Let it consume a private cursor, so that the messages below still show the + * argument as the user typed it rather than the leftover (possibly NULL) position. */ + r = extract_many_words(&p, "=", EXTRACT_DONT_COALESCE_SEPARATORS, &k, &v, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse key/value pair %s: %m", s); + if (r < 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing '=' in key/value pair %s.", s); + + if (!filename_is_valid(k)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not a valid key name", k); + + free_and_replace(*key, k); + free_and_replace(*value, v); + return 0; +} + +int info_main(int argc, char *argv[], void *userdata) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_strv_free_ char **devices = NULL; + _cleanup_free_ char *name = NULL; + int c, r, ret; + + enum { + ARG_PROPERTY = 0x100, + ARG_VALUE, + ARG_NO_PAGER, + ARG_JSON, + ARG_SUBSYSTEM_MATCH, + ARG_SUBSYSTEM_NOMATCH, + ARG_ATTR_MATCH, + ARG_ATTR_NOMATCH, + ARG_PROPERTY_MATCH, + ARG_TAG_MATCH, + ARG_SYSNAME_MATCH, + ARG_NAME_MATCH, + ARG_PARENT_MATCH, + ARG_INITIALIZED_MATCH, + ARG_INITIALIZED_NOMATCH, + }; + + static const struct
option options[] = { + { "attribute-walk", no_argument, NULL, 'a' }, + { "tree", no_argument, NULL, 't' }, + { "cleanup-db", no_argument, NULL, 'c' }, + { "device-id-of-file", required_argument, NULL, 'd' }, + { "export", no_argument, NULL, 'x' }, + { "export-db", no_argument, NULL, 'e' }, + { "export-prefix", required_argument, NULL, 'P' }, + { "help", no_argument, NULL, 'h' }, + { "name", required_argument, NULL, 'n' }, + { "path", required_argument, NULL, 'p' }, + { "property", required_argument, NULL, ARG_PROPERTY }, + { "query", required_argument, NULL, 'q' }, + { "root", no_argument, NULL, 'r' }, + { "value", no_argument, NULL, ARG_VALUE }, + { "version", no_argument, NULL, 'V' }, + { "wait-for-initialization", optional_argument, NULL, 'w' }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "json", required_argument, NULL, ARG_JSON }, + { "subsystem-match", required_argument, NULL, ARG_SUBSYSTEM_MATCH }, + { "subsystem-nomatch", required_argument, NULL, ARG_SUBSYSTEM_NOMATCH }, + { "attr-match", required_argument, NULL, ARG_ATTR_MATCH }, + { "attr-nomatch", required_argument, NULL, ARG_ATTR_NOMATCH }, + { "property-match", required_argument, NULL, ARG_PROPERTY_MATCH }, + { "tag-match", required_argument, NULL, ARG_TAG_MATCH }, + { "sysname-match", required_argument, NULL, ARG_SYSNAME_MATCH }, + { "name-match", required_argument, NULL, ARG_NAME_MATCH }, + { "parent-match", required_argument, NULL, ARG_PARENT_MATCH }, + { "initialized-match", no_argument, NULL, ARG_INITIALIZED_MATCH }, + { "initialized-nomatch", no_argument, NULL, ARG_INITIALIZED_NOMATCH }, + {} + }; + + ActionType action = ACTION_QUERY; + QueryType query = QUERY_ALL; + + while ((c = getopt_long(argc, argv, "atced:n:p:q:rxP:w::Vh", options, NULL)) >= 0) + switch (c) { + case ARG_PROPERTY: + /* Make sure that if the empty property list was specified, we won't show any + properties. 
*/ + if (isempty(optarg) && !arg_properties) { + arg_properties = new0(char*, 1); + if (!arg_properties) + return log_oom(); + } else { + r = strv_split_and_extend(&arg_properties, optarg, ",", true); + if (r < 0) + return log_oom(); + } + break; + case ARG_VALUE: + arg_value = true; + break; + case 'n': + case 'p': { + const char *prefix = c == 'n' ? "/dev/" : "/sys/"; + char *path; + + path = path_join(path_startswith(optarg, prefix) ? NULL : prefix, optarg); + if (!path) + return log_oom(); + + r = strv_consume(&devices, path); + if (r < 0) + return log_oom(); + break; + } + + case 'q': + action = ACTION_QUERY; + if (streq(optarg, "property") || streq(optarg, "env")) + query = QUERY_PROPERTY; + else if (streq(optarg, "name")) + query = QUERY_NAME; + else if (streq(optarg, "symlink")) + query = QUERY_SYMLINK; + else if (streq(optarg, "path")) + query = QUERY_PATH; + else if (streq(optarg, "all")) + query = QUERY_ALL; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "unknown query type"); + break; + case 'r': + arg_root = true; + break; + case 'd': + action = ACTION_DEVICE_ID_FILE; + r = free_and_strdup(&name, optarg); + if (r < 0) + return log_oom(); + break; + case 'a': + action = ACTION_ATTRIBUTE_WALK; + break; + case 't': + action = ACTION_TREE; + break; + case 'e': + action = ACTION_EXPORT; + break; + case 'c': + cleanup_db(); + return 0; + case 'x': + arg_export = true; + break; + case 'P': + arg_export = true; + arg_export_prefix = optarg; + break; + case 'w': + if (optarg) { + r = parse_sec(optarg, &arg_wait_for_initialization_timeout); + if (r < 0) + return log_error_errno(r, "Failed to parse timeout value: %m"); + } else + arg_wait_for_initialization_timeout = USEC_INFINITY; + break; + case 'V': + return print_version(); + case 'h': + return help(); + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + break; + + case 
ARG_SUBSYSTEM_MATCH: + case ARG_SUBSYSTEM_NOMATCH: + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_subsystem(e, optarg, c == ARG_SUBSYSTEM_MATCH); + if (r < 0) + return log_error_errno(r, "Failed to add%s subsystem match '%s': %m", + c == ARG_SUBSYSTEM_MATCH ? "" : " negative", optarg); + + break; + + case ARG_ATTR_MATCH: + case ARG_ATTR_NOMATCH: { + _cleanup_free_ char *k = NULL, *v = NULL; + + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = parse_key_value_argument(optarg, &k, &v); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysattr(e, k, v, c == ARG_ATTR_MATCH); + if (r < 0) + return log_error_errno(r, "Failed to add%s sysattr match '%s=%s': %m", + c == ARG_ATTR_MATCH ? "" : " negative", k, v); + break; + } + + case ARG_PROPERTY_MATCH: { + _cleanup_free_ char *k = NULL, *v = NULL; + + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = parse_key_value_argument(optarg, &k, &v); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_property_required(e, k, v); + if (r < 0) + return log_error_errno(r, "Failed to add property match '%s=%s': %m", k, v); + break; + } + + case ARG_TAG_MATCH: + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_tag(e, optarg); + if (r < 0) + return log_error_errno(r, "Failed to add tag match '%s': %m", optarg); + break; + + case ARG_SYSNAME_MATCH: + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_sysname(e, optarg); + if (r < 0) + return log_error_errno(r, "Failed to add sysname match '%s': %m", optarg); + break; + + case ARG_NAME_MATCH: + case ARG_PARENT_MATCH: { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + r = find_device(optarg, c == ARG_NAME_MATCH ? 
"/dev" : "/sys", &dev); + if (r < 0) + return log_error_errno(r, "Failed to open the device '%s': %m", optarg); + + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = device_enumerator_add_match_parent_incremental(e, dev); + if (r < 0) + return log_error_errno(r, "Failed to add parent match '%s': %m", optarg); + break; + } + + case ARG_INITIALIZED_MATCH: + case ARG_INITIALIZED_NOMATCH: + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + r = device_enumerator_add_match_is_initialized(e, c == ARG_INITIALIZED_MATCH ? MATCH_INITIALIZED_YES : MATCH_INITIALIZED_NO); + if (r < 0) + return log_error_errno(r, "Failed to set initialized filter: %m"); + break; + + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (action == ACTION_DEVICE_ID_FILE) { + if (argv[optind]) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Positional arguments are not allowed with -d/--device-id-of-file."); + assert(name); + return stat_device(name, arg_export, arg_export_prefix); + } + + if (action == ACTION_EXPORT) { + r = ensure_device_enumerator(&e); + if (r < 0) + return r; + + return export_devices(e); + } + + r = strv_extend_strv(&devices, argv + optind, false); + if (r < 0) + return log_error_errno(r, "Failed to build argument list: %m"); + + if (action != ACTION_TREE && strv_isempty(devices)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "A device name or path is required"); + if (IN_SET(action, ACTION_ATTRIBUTE_WALK, ACTION_TREE) && strv_length(devices) > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Only one device may be specified with -a/--attribute-walk and -t/--tree"); + + if (arg_export && arg_value) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "-x/--export or -P/--export-prefix cannot be used with --value"); + + pager_open(arg_pager_flags); + + if (strv_isempty(devices)) { + assert(action == ACTION_TREE); + return print_tree(NULL); + } + + ret = 0; + STRV_FOREACH(p, devices) { + 
_cleanup_(sd_device_unrefp) sd_device *device = NULL; + + r = find_device(*p, NULL, &device); + if (r < 0) { + if (r == -EINVAL) + log_error_errno(r, "Bad argument \"%s\", expected an absolute path in /dev/ or /sys/ or a unit name: %m", *p); + else + log_error_errno(r, "Unknown device \"%s\": %m", *p); + + if (ret == 0) + ret = r; + continue; + } + + if (arg_wait_for_initialization_timeout > 0) { + sd_device *d; + + r = device_wait_for_initialization( + device, + NULL, + arg_wait_for_initialization_timeout, + &d); + if (r < 0) + return r; + + sd_device_unref(device); + device = d; + } + + if (action == ACTION_QUERY) + r = query_device(query, device); + else if (action == ACTION_ATTRIBUTE_WALK) + r = print_device_chain(device); + else if (action == ACTION_TREE) + r = print_tree(device); + else + assert_not_reached(); + if (r < 0) + return r; + } + + return ret; +} diff --git a/src/udev/udevadm-lock.c b/src/udev/udevadm-lock.c new file mode 100644 index 0000000..bc2d5e7 --- /dev/null +++ b/src/udev/udevadm-lock.c @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include +#include +#include + +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "device-util.h" +#include "fd-util.h" +#include "fdset.h" +#include "lock-util.h" +#include "main-func.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "signal-util.h" +#include "sort-util.h" +#include "strv.h" +#include "time-util.h" +#include "udevadm.h" + +static usec_t arg_timeout_usec = USEC_INFINITY; +static char **arg_devices = NULL; +static char **arg_backing = NULL; +static char **arg_cmdline = NULL; +static bool arg_print = false; + +STATIC_DESTRUCTOR_REGISTER(arg_devices, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_backing, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_cmdline, strv_freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("udevadm", "8", 
&link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND\n" + "%s [OPTIONS...] --print\n" + "\n%sLock a block device and run a command.%s\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n" + " -d --device=DEVICE Block device to lock\n" + " -b --backing=FILE File whose backing block device to lock\n" + " -t --timeout=SECS Block at most the specified time waiting for lock\n" + " -p --print Only show which block device the lock would be taken on\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "device", required_argument, NULL, 'd' }, + { "backing", required_argument, NULL, 'b' }, + { "timeout", required_argument, NULL, 't' }, + { "print", no_argument, NULL, 'p' }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + while ((c = getopt_long(argc, argv, arg_print ? "hVd:b:t:p" : "+hVd:b:t:p", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case 'V': + return print_version(); + + case 'd': + case 'b': { + _cleanup_free_ char *s = NULL; + char ***l = c == 'd' ? 
&arg_devices : &arg_backing; + + r = path_make_absolute_cwd(optarg, &s); + if (r < 0) + return log_error_errno(r, "Failed to make path '%s' absolute: %m", optarg); + + path_simplify(s); + + if (strv_consume(l, TAKE_PTR(s)) < 0) + return log_oom(); + + strv_uniq(*l); + break; + } + + case 't': + r = parse_sec(optarg, &arg_timeout_usec); + if (r < 0) + return log_error_errno(r, "Failed to parse --timeout= parameter: %s", optarg); + break; + + case 'p': + arg_print = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_print) { + if (optind != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No arguments expected"); + } else { + if (optind + 1 > argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too few arguments, command to execute."); + + arg_cmdline = strv_copy(argv + optind); + if (!arg_cmdline) + return log_oom(); + } + + if (strv_isempty(arg_devices) && strv_isempty(arg_backing)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No devices to lock specified, refusing."); + + return 1; +} + +static int find_devno( + dev_t **devnos, + size_t *n_devnos, + const char *device, + bool backing) { + + dev_t devt; + int r; + + assert(devnos); + assert(n_devnos); + assert(*devnos || *n_devnos == 0); + assert(device); + + r = path_get_whole_disk(device, backing, &devt); + if (r < 0) + return log_error_errno(r, "Failed to find whole block device for '%s': %m", device); + + if (typesafe_bsearch(&devt, *devnos, *n_devnos, devt_compare_func)) { + log_debug("Device %u:%u already listed for locking, ignoring.", major(devt), minor(devt)); + return 0; + } + + if (!GREEDY_REALLOC(*devnos, *n_devnos + 1)) + return log_oom(); + + (*devnos)[(*n_devnos)++] = devt; + + /* Immediately sort again, to ensure the binary search above will work for the next device we add */ + typesafe_qsort(*devnos, *n_devnos, devt_compare_func); + return 1; +} + +static int lock_device( + const char *path, + dev_t devno, + usec_t deadline) { + + 
_cleanup_close_ int fd = -EBADF; + struct stat st; + int r; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", path); + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", path); + + /* Extra safety: check that the device still refers to what we think it refers to */ + if (!S_ISBLK(st.st_mode) || st.st_rdev != devno) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "Path '%s' no longer refers to specified block device %u:%u: %m", path, major(devno), minor(devno)); + + r = lock_generic(fd, LOCK_BSD, LOCK_EX|LOCK_NB); + if (r < 0) { + if (r != -EAGAIN) + return log_error_errno(r, "Failed to lock device '%s': %m", path); + + if (deadline == 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Device '%s' is currently locked.", path); + + if (deadline == USEC_INFINITY) { + log_info("Device '%s' is currently locked, waiting%s", path, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = lock_generic(fd, LOCK_BSD, LOCK_EX); + } else { + usec_t left = usec_sub_unsigned(deadline, now(CLOCK_MONOTONIC)); + + log_info("Device '%s' is currently locked, waiting %s%s", + path, FORMAT_TIMESPAN(left, 0), + special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + r = lock_generic_with_timeout(fd, LOCK_BSD, LOCK_EX, left); + if (r == -ETIMEDOUT) + return log_error_errno(r, "Timeout reached."); + } + if (r < 0) + return log_error_errno(r, "Failed to lock device '%s': %m", path); + } + + log_debug("Successfully locked %s (%u:%u)%s", path, major(devno), minor(devno), special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + return TAKE_FD(fd); +} + +int lock_main(int argc, char *argv[], void *userdata) { + _cleanup_fdset_free_ FDSet *fds = NULL; + _cleanup_free_ dev_t *devnos = NULL; + size_t n_devnos = 0; + usec_t deadline; + pid_t pid; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + STRV_FOREACH(i, arg_devices) { + r = find_devno(&devnos, &n_devnos, *i, /* backing= */ false); + 
if (r < 0) + return r; + } + + STRV_FOREACH(i, arg_backing) { + r = find_devno(&devnos, &n_devnos, *i, /* backing= */ true); + if (r < 0) + return r; + } + + assert(n_devnos > 0); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + if (!timestamp_is_set(arg_timeout_usec)) + deadline = arg_timeout_usec; + else + deadline = usec_add(now(CLOCK_MONOTONIC), arg_timeout_usec); + + for (size_t i = 0; i < n_devnos; i++) { + _cleanup_free_ char *node = NULL; + + r = devname_from_devnum(S_IFBLK, devnos[i], &node); + if (r < 0) + return log_error_errno(r, "Failed to format block device path: %m"); + + if (arg_print) + printf("%s\n", node); + else { + _cleanup_close_ int fd = -EBADF; + + fd = lock_device(node, devnos[i], deadline); + if (fd < 0) + return fd; + + r = fdset_consume(fds, TAKE_FD(fd)); + if (r < 0) + return log_oom(); + } + } + + if (arg_print) + return EXIT_SUCCESS; + + /* Ignore SIGINT and allow the forked process to receive it */ + (void) ignore_signals(SIGINT); + + r = safe_fork("(lock)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + + execvp(arg_cmdline[0], arg_cmdline); + log_open(); + log_error_errno(errno, "Failed to execute %s: %m", arg_cmdline[0]); + _exit(EXIT_FAILURE); + } + + return wait_for_terminate_and_check(arg_cmdline[0], pid, 0); +} diff --git a/src/udev/udevadm-monitor.c b/src/udev/udevadm-monitor.c new file mode 100644 index 0000000..27c4853 --- /dev/null +++ b/src/udev/udevadm-monitor.c @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include + +#include "sd-device.h" +#include "sd-event.h" + +#include "alloc-util.h" +#include "device-monitor-private.h" +#include "device-private.h" +#include "device-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "hashmap.h" +#include "set.h" +#include "signal-util.h" +#include "string-util.h" +#include "udevadm.h" +#include "virt.h" +#include 
"time-util.h" + +static bool arg_show_property = false; +static bool arg_print_kernel = false; +static bool arg_print_udev = false; +static Set *arg_tag_filter = NULL; +static Hashmap *arg_subsystem_filter = NULL; + +static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) { + sd_device_action_t action = _SD_DEVICE_ACTION_INVALID; + const char *devpath = NULL, *subsystem = NULL; + MonitorNetlinkGroup group = PTR_TO_INT(userdata); + struct timespec ts; + + assert(device); + assert(IN_SET(group, MONITOR_GROUP_UDEV, MONITOR_GROUP_KERNEL)); + + (void) sd_device_get_action(device, &action); + (void) sd_device_get_devpath(device, &devpath); + (void) sd_device_get_subsystem(device, &subsystem); + + assert_se(clock_gettime(CLOCK_MONOTONIC, &ts) == 0); + + printf("%-6s[%"PRI_TIME".%06"PRI_NSEC"] %-8s %s (%s)\n", + group == MONITOR_GROUP_UDEV ? "UDEV" : "KERNEL", + ts.tv_sec, (nsec_t)ts.tv_nsec/1000, + strna(device_action_to_string(action)), + devpath, subsystem); + + if (arg_show_property) { + FOREACH_DEVICE_PROPERTY(device, key, value) + printf("%s=%s\n", key, value); + + printf("\n"); + } + + return 0; +} + +static int setup_monitor(MonitorNetlinkGroup sender, sd_event *event, sd_device_monitor **ret) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL; + const char *subsystem, *devtype, *tag; + int r; + + r = device_monitor_new_full(&monitor, sender, -1); + if (r < 0) + return log_error_errno(r, "Failed to create netlink socket: %m"); + + r = sd_device_monitor_attach_event(monitor, event); + if (r < 0) + return log_error_errno(r, "Failed to attach event: %m"); + + HASHMAP_FOREACH_KEY(devtype, subsystem, arg_subsystem_filter) { + r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, subsystem, devtype); + if (r < 0) + return log_error_errno(r, "Failed to apply subsystem filter '%s%s%s': %m", + subsystem, devtype ? 
"/" : "", strempty(devtype)); + } + + SET_FOREACH(tag, arg_tag_filter) { + r = sd_device_monitor_filter_add_match_tag(monitor, tag); + if (r < 0) + return log_error_errno(r, "Failed to apply tag filter '%s': %m", tag); + } + + r = sd_device_monitor_start(monitor, device_monitor_handler, INT_TO_PTR(sender)); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + + (void) sd_device_monitor_set_description(monitor, sender == MONITOR_GROUP_UDEV ? "udev" : "kernel"); + + *ret = TAKE_PTR(monitor); + return 0; +} + +static int help(void) { + printf("%s monitor [OPTIONS]\n\n" + "Listen to kernel and udev events.\n\n" + " -h --help Show this help\n" + " -V --version Show package version\n" + " -p --property Print the event properties\n" + " -k --kernel Print kernel uevents\n" + " -u --udev Print udev events\n" + " -s --subsystem-match=SUBSYSTEM[/DEVTYPE] Filter events by subsystem\n" + " -t --tag-match=TAG Filter events by tag\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "property", no_argument, NULL, 'p' }, + { "environment", no_argument, NULL, 'e' }, /* alias for -p */ + { "kernel", no_argument, NULL, 'k' }, + { "udev", no_argument, NULL, 'u' }, + { "subsystem-match", required_argument, NULL, 's' }, + { "tag-match", required_argument, NULL, 't' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + {} + }; + + int r, c; + + while ((c = getopt_long(argc, argv, "pekus:t:Vh", options, NULL)) >= 0) + switch (c) { + case 'p': + case 'e': + arg_show_property = true; + break; + case 'k': + arg_print_kernel = true; + break; + case 'u': + arg_print_udev = true; + break; + case 's': { + _cleanup_free_ char *subsystem = NULL, *devtype = NULL; + const char *slash; + + slash = strchr(optarg, '/'); + if (slash) { + devtype = strdup(slash + 1); + if (!devtype) + return -ENOMEM; + + subsystem = strndup(optarg, slash - optarg); 
+ } else + subsystem = strdup(optarg); + + if (!subsystem) + return -ENOMEM; + + r = hashmap_ensure_put(&arg_subsystem_filter, NULL, subsystem, devtype); + if (r < 0) + return r; + + TAKE_PTR(subsystem); + TAKE_PTR(devtype); + break; + } + case 't': + /* optarg is stored in argv[], so we don't need to copy it */ + r = set_ensure_put(&arg_tag_filter, &string_hash_ops, optarg); + if (r < 0) + return r; + break; + + case 'V': + return print_version(); + case 'h': + return help(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (!arg_print_kernel && !arg_print_udev) { + arg_print_kernel = true; + arg_print_udev = true; + } + + return 1; +} + +int monitor_main(int argc, char *argv[], void *userdata) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *kernel_monitor = NULL, *udev_monitor = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + goto finalize; + + if (running_in_chroot() > 0) { + log_info("Running in chroot, ignoring request."); + return 0; + } + + /* Callers are expecting to see events as they happen: Line buffering */ + setlinebuf(stdout); + + r = sd_event_default(&event); + if (r < 0) { + log_error_errno(r, "Failed to initialize event: %m"); + goto finalize; + } + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + + printf("monitor will print the received events for:\n"); + if (arg_print_udev) { + r = setup_monitor(MONITOR_GROUP_UDEV, event, &udev_monitor); + if (r < 0) + goto finalize; + + printf("UDEV - the event which udev sends out after rule processing\n"); + } + + if (arg_print_kernel) { + r = setup_monitor(MONITOR_GROUP_KERNEL, event, &kernel_monitor); + if (r < 0) + goto finalize; + + printf("KERNEL - the kernel uevent\n"); + } + printf("\n"); + + r = sd_event_loop(event); + if (r < 0) { + 
log_error_errno(r, "Failed to run event loop: %m"); + goto finalize; + } + + r = 0; + +finalize: /* Common exit: release the subsystem and tag filters before returning r. */ + hashmap_free_free_free(arg_subsystem_filter); + set_free(arg_tag_filter); + + return r; +} diff --git a/src/udev/udevadm-settle.c b/src/udev/udevadm-settle.c new file mode 100644 index 0000000..c236a70 --- /dev/null +++ b/src/udev/udevadm-settle.c @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © 2009 Canonical Ltd. + * Copyright © 2009 Scott James Remnant + */ + /* NOTE(review): the two bare #include directives that follow carry no header name in this copy - the targets appear to have been lost. */ +#include +#include + +#include "sd-bus.h" +#include "sd-event.h" +#include "sd-login.h" +#include "sd-messages.h" + +#include "bus-util.h" +#include "path-util.h" +#include "strv.h" +#include "time-util.h" +#include "udev-ctrl.h" +#include "udev-util.h" +#include "udevadm.h" +#include "unit-def.h" +#include "virt.h" + /* Option state of the settle verb: the wait limit defaults to 120 * USEC_PER_SEC, and arg_exists (the --exit-if-exists path) is unset until -E is given. */ +static usec_t arg_timeout_usec = 120 * USEC_PER_SEC; +static const char *arg_exists = NULL; + /* help() prints the usage summary of the settle verb, substituting program_invocation_short_name for the leading %s, and always returns 0. */ +static int help(void) { + printf("%s settle [OPTIONS]\n\n" + "Wait for pending udev events.\n\n" + " -h --help Show this help\n" + " -V --version Show package version\n" + " -t --timeout=SEC Maximum time to wait for events\n" + " -E --exit-if-exists=FILE Stop waiting if file exists\n", + program_invocation_short_name); + + return 0; +} + /* parse_argv() handles the settle options: -t parses a time span into arg_timeout_usec with parse_sec(), -E stores a path in arg_exists after path_is_valid() accepted it, and the removed options -s, -e and -q are rejected with EINVAL. It returns 1 when the caller should continue, the return value of print_version() or help() for -V and -h, or a negative error. */ +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "timeout", required_argument, NULL, 't' }, + { "exit-if-exists", required_argument, NULL, 'E' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + { "seq-start", required_argument, NULL, 's' }, /* removed */ + { "seq-end", required_argument, NULL, 'e' }, /* removed */ + { "quiet", no_argument, NULL, 'q' }, /* removed */ + {} + }; + + int c, r; + + while ((c = getopt_long(argc, argv, "t:E:Vhs:e:q", options, NULL)) >= 0) { + switch (c) { + case 't': + r = parse_sec(optarg, &arg_timeout_usec); + if (r < 0) + return log_error_errno(r, "Failed to parse timeout value '%s': %m", optarg); + break; + case 'E': + if 
(!path_is_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid path: %s", optarg); + /* The optarg pointer itself is kept; no copy of the string is made. */ + arg_exists = optarg; + break; + case 'V': + return print_version(); + case 'h': + return help(); + case 's': + case 'e': + case 'q': /* Options that are still recognized only so that they can be refused with an explicit message. */ + return log_info_errno(SYNTHETIC_ERRNO(EINVAL), + "Option -%c no longer supported.", + c); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + } + + return 1; +} + /* emit_deprecation_warning() logs a deprecation notice, but only when sd_pid_get_unit() reports that this process runs inside systemd-udev-settle.service. It then tries to read the WantedBy and RequiredBy properties of that unit from systemd over the bus so that the notice can name the units involved; when the resulting list is empty a short notice is logged instead. Returns 0 in all logged and skipped cases, -ENOMEM when an allocation fails, or the negative result of a failed list merge. */ +static int emit_deprecation_warning(void) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_strv_free_ char **a = NULL; + _cleanup_free_ char *unit = NULL; + int r; + + r = sd_pid_get_unit(0, &unit); + if (r < 0) { + log_debug_errno(r, "Failed to determine unit we run in, ignoring: %m"); + return 0; + } + + if (!streq(unit, "systemd-udev-settle.service")) + return 0; + /* A failed bus connection is not fatal: it is logged at debug level and the dependency queries are skipped. */ + r = bus_connect_system_systemd(&bus); + if (r < 0) + log_debug_errno(r, "Failed to open connection to systemd, skipping dependency queries: %m"); + else { + _cleanup_strv_free_ char **b = NULL; + _cleanup_free_ char *unit_path = NULL; + + unit_path = unit_dbus_path_from_name("systemd-udev-settle.service"); + if (!unit_path) + return -ENOMEM; + /* The results of both property queries are deliberately discarded (cast to void). */ + (void) sd_bus_get_property_strv( + bus, + "org.freedesktop.systemd1", + unit_path, + "org.freedesktop.systemd1.Unit", + "WantedBy", + NULL, + &a); + + (void) sd_bus_get_property_strv( + bus, + "org.freedesktop.systemd1", + unit_path, + "org.freedesktop.systemd1.Unit", + "RequiredBy", + NULL, + &b); + /* Append the RequiredBy list to the WantedBy list. NOTE(review): the third argument is presumably a drop-duplicates flag - confirm in strv.h. */ + r = strv_extend_strv(&a, b, true); + if (r < 0) + return r; + } + + if (strv_isempty(a)) + /* Print a simple message if we cannot determine the dependencies */ + log_notice("systemd-udev-settle.service is deprecated."); + else { + /* Print a longer, structured message if we can acquire the dependencies (this should be the + * common case). This is hooked up with a catalog entry and everything. 
*/ + _cleanup_free_ char *t = NULL; + + t = strv_join(a, ", "); + if (!t) + return -ENOMEM; + + log_struct(LOG_NOTICE, + LOG_MESSAGE("systemd-udev-settle.service is deprecated. Please fix %s not to pull it in.", t), + "OFFENDING_UNITS=%s", t, + "MESSAGE_ID=" SD_MESSAGE_SYSTEMD_UDEV_SETTLE_DEPRECATED_STR); + } + + return 0; +} + /* check() reports whether settle may stop waiting: it returns true when arg_exists is set and access() on it succeeds, and otherwise true only when udev_queue_is_empty() returns a positive value. An access() failure other than ENOENT and a negative queue result are logged as warnings and do not count as done. */ +static bool check(void) { + int r; + + if (arg_exists) { + if (access(arg_exists, F_OK) >= 0) + return true; + + if (errno != ENOENT) + log_warning_errno(errno, "Failed to check the existence of \"%s\", ignoring: %m", arg_exists); + } + + /* exit if queue is empty */ + r = udev_queue_is_empty(); + if (r < 0) + log_warning_errno(r, "Failed to check if udev queue is empty, ignoring: %m"); + + return r > 0; +} + /* on_inotify() is the callback that settle_main() passes to sd_event_add_inotify(): it re-runs check() and, once that is true, asks the event loop owning the source to exit with code 0. The event and userdata arguments are not used. */ +static int on_inotify(sd_event_source *s, const struct inotify_event *event, void *userdata) { + assert(s); + + if (check()) + return sd_event_exit(sd_event_source_get_event(s), 0); + + return 0; +} + /* settle_main() is the entry point of the settle verb. It parses the options, emits the deprecation notice when applicable, verifies that udevd is reachable (ping plus wait when running as uid 0, existence of /run/udev/control otherwise), and then waits on the default event loop until check() succeeds or the timeout timer fires. A parse_argv() result of zero or less is returned unchanged; otherwise the result is 0 on success or a negative error. The userdata argument is not used in this body. */ +int settle_main(int argc, char *argv[], void *userdata) { + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (running_in_chroot() > 0) { + log_info("Running in chroot, ignoring request."); + return 0; + } + /* Best effort: a failure to emit the notice does not affect the result. */ + (void) emit_deprecation_warning(); + + if (getuid() == 0) { + _cleanup_(udev_ctrl_unrefp) UdevCtrl *uctrl = NULL; + + /* guarantee that the udev daemon isn't pre-processing */ + + r = udev_ctrl_new(&uctrl); + if (r < 0) + return log_error_errno(r, "Failed to create control socket for udev daemon: %m"); + /* A failed ping is only logged at debug level and makes settle return 0 right away. */ + r = udev_ctrl_send_ping(uctrl); + if (r < 0) { + log_debug_errno(r, "Failed to connect to udev daemon, ignoring: %m"); + return 0; + } + /* Wait for the reply for at least 5 * USEC_PER_SEC, even when a shorter timeout was requested. */ + r = udev_ctrl_wait(uctrl, MAX(5 * USEC_PER_SEC, arg_timeout_usec)); + if (r < 0) + return log_error_errno(r, "Failed to wait for daemon to reply: %m"); + } else { + /* For non-privileged users, at least check if udevd is running. 
*/ + if (access("/run/udev/control", F_OK) < 0) + return log_error_errno(errno, + errno == ENOENT ? "systemd-udevd is not running." : + "Failed to check if /run/udev/control exists: %m"); + } + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get default sd-event object: %m"); + /* Watch /run/udev for IN_DELETE events; on_inotify() re-evaluates check() on each of them. */ + r = sd_event_add_inotify(event, NULL, "/run/udev" , IN_DELETE, on_inotify, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add inotify watch for /run/udev: %m"); + /* The timer gets no callback, only -ETIMEDOUT as userdata. NOTE(review): with a NULL handler sd-event presumably exits the loop with the userdata value as its exit code, which is what the -ETIMEDOUT test after sd_event_loop() relies on - confirm in the sd_event_add_time documentation. */ + if (arg_timeout_usec != USEC_INFINITY) { + r = sd_event_add_time_relative(event, NULL, CLOCK_BOOTTIME, arg_timeout_usec, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)); + if (r < 0) + return log_error_errno(r, "Failed to add timer event source: %m"); + } + + /* Check before entering the event loop, as the udev queue may be already empty. */ + if (check()) + return 0; + + r = sd_event_loop(event); + if (r == -ETIMEDOUT) + return log_error_errno(r, "Timed out for waiting the udev queue being empty."); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} diff --git a/src/udev/udevadm-test-builtin.c b/src/udev/udevadm-test-builtin.c new file mode 100644 index 0000000..088b4da --- /dev/null +++ b/src/udev/udevadm-test-builtin.c @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + /* NOTE(review): the five bare #include directives that follow carry no header name in this copy - the targets appear to have been lost. */ +#include +#include +#include +#include +#include + +#include "log.h" +#include "udev-builtin.h" +#include "udevadm.h" +#include "udevadm-util.h" + /* Option state of the test-builtin verb: the action defaults to SD_DEVICE_ADD; command and syspath are filled in from the positional arguments. */ +static sd_device_action_t arg_action = SD_DEVICE_ADD; +static const char *arg_command = NULL; +static const char *arg_syspath = NULL; + /* help() prints the usage text of the test-builtin verb, then calls udev_builtin_list(), and returns 0. NOTE(review): the empty line in the option list follows --version instead of preceding the Commands: heading, which looks unintended. */ +static int help(void) { + printf("%s test-builtin [OPTIONS] COMMAND DEVPATH\n\n" + "Test a built-in command.\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n\n" + " -a --action=ACTION|help Set action string\n" + "Commands:\n", + program_invocation_short_name); + + udev_builtin_list(); + + return 0; +} + /* parse_argv() handles -a, -V and -h and then takes the next two positional arguments as arg_command and arg_syspath; a missing one is reported as EINVAL. It returns 1 when the caller should continue, 0 when parse_device_action() returned 0, the return value of print_version() or help() for -V and -h, or a negative error. */ +static int parse_argv(int argc, char *argv[]) { + 
static const struct option options[] = { + { "action", required_argument, NULL, 'a' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + {} + }; + + int r, c; + + while ((c = getopt_long(argc, argv, "a:Vh", options, NULL)) >= 0) + switch (c) { + case 'a': + r = parse_device_action(optarg, &arg_action); + if (r < 0) + return log_error_errno(r, "Invalid action '%s'", optarg); + if (r == 0) + return 0; + break; + case 'V': + return print_version(); + case 'h': + return help(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + arg_command = argv[optind++]; + if (!arg_command) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Command missing."); + + arg_syspath = argv[optind++]; + if (!arg_syspath) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "device is missing."); + + return 1; +} + +int builtin_main(int argc, char *argv[], void *userdata) { + _cleanup_(udev_event_freep) UdevEvent *event = NULL; + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + UdevBuiltinCommand cmd; + int r; + + log_set_max_level(LOG_DEBUG); + log_parse_environment(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + udev_builtin_init(); + + cmd = udev_builtin_lookup(arg_command); + if (cmd < 0) { + r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown command '%s'", arg_command); + goto finish; + } + + r = find_device_with_action(arg_syspath, arg_action, &dev); + if (r < 0) { + log_error_errno(r, "Failed to open device '%s': %m", arg_syspath); + goto finish; + } + + event = udev_event_new(dev, 0, NULL, LOG_DEBUG); + if (!event) { + r = log_oom(); + goto finish; + } + + r = udev_builtin_run(event, cmd, arg_command, true); + if (r < 0) { + log_debug_errno(r, "Builtin command '%s' fails: %m", arg_command); + goto finish; + } + + r = 0; +finish: + udev_builtin_exit(); + return r; +} diff --git a/src/udev/udevadm-test.c b/src/udev/udevadm-test.c new file mode 100644 index 0000000..e1afd7d --- /dev/null +++ 
b/src/udev/udevadm-test.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © 2003-2004 Greg Kroah-Hartman + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sd-device.h" + +#include "device-private.h" +#include "device-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strxcpyx.h" +#include "udev-builtin.h" +#include "udev-event.h" +#include "udev-format.h" +#include "udevadm-util.h" +#include "udevadm.h" + +static sd_device_action_t arg_action = SD_DEVICE_ADD; +static ResolveNameTiming arg_resolve_name_timing = RESOLVE_NAME_EARLY; +static const char *arg_syspath = NULL; + +static int help(void) { + + printf("%s test [OPTIONS] DEVPATH\n\n" + "Test an event run.\n\n" + " -h --help Show this help\n" + " -V --version Show package version\n" + " -a --action=ACTION|help Set action string\n" + " -N --resolve-names=early|late|never When to resolve names\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "action", required_argument, NULL, 'a' }, + { "resolve-names", required_argument, NULL, 'N' }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + {} + }; + + int r, c; + + while ((c = getopt_long(argc, argv, "a:N:Vh", options, NULL)) >= 0) + switch (c) { + case 'a': + r = parse_device_action(optarg, &arg_action); + if (r < 0) + return log_error_errno(r, "Invalid action '%s'", optarg); + if (r == 0) + return 0; + break; + case 'N': + arg_resolve_name_timing = resolve_name_timing_from_string(optarg); + if (arg_resolve_name_timing < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--resolve-names= must be early, late or never"); + break; + case 'V': + return print_version(); + case 'h': + return help(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + arg_syspath = argv[optind]; + if (!arg_syspath) + return 
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "syspath parameter missing."); + + return 1; +} + +int test_main(int argc, char *argv[], void *userdata) { + _cleanup_(udev_rules_freep) UdevRules *rules = NULL; + _cleanup_(udev_event_freep) UdevEvent *event = NULL; + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + const char *cmd; + sigset_t mask, sigmask_orig; + void *val; + int r; + + log_set_max_level(LOG_DEBUG); + log_parse_environment(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + printf("This program is for debugging only, it does not run any program\n" + "specified by a RUN key. It may show incorrect results, because\n" + "some values may be different, or not available at a simulation run.\n" + "\n"); + + assert_se(sigprocmask(SIG_SETMASK, NULL, &sigmask_orig) >= 0); + + udev_builtin_init(); + + r = udev_rules_load(&rules, arg_resolve_name_timing); + if (r < 0) { + log_error_errno(r, "Failed to read udev rules: %m"); + goto out; + } + + r = find_device_with_action(arg_syspath, arg_action, &dev); + if (r < 0) { + log_error_errno(r, "Failed to open device '%s': %m", arg_syspath); + goto out; + } + + /* don't read info from the db */ + device_seal(dev); + + event = udev_event_new(dev, 0, NULL, LOG_DEBUG); + + assert_se(sigfillset(&mask) >= 0); + assert_se(sigprocmask(SIG_SETMASK, &mask, &sigmask_orig) >= 0); + + udev_event_execute_rules(event, -1, 60 * USEC_PER_SEC, SIGKILL, NULL, rules); + + FOREACH_DEVICE_PROPERTY(dev, key, value) + printf("%s=%s\n", key, value); + + ORDERED_HASHMAP_FOREACH_KEY(val, cmd, event->run_list) { + char program[UDEV_PATH_SIZE]; + bool truncated; + + (void) udev_event_apply_format(event, cmd, program, sizeof(program), false, NULL, &truncated); + if (truncated) + log_warning("The command '%s' is truncated while substituting into '%s'.", program, cmd); + printf("run: '%s'\n", program); + } + + r = 0; +out: + udev_builtin_exit(); + return r; +} diff --git a/src/udev/udevadm-trigger.c b/src/udev/udevadm-trigger.c new file 
mode 100644 index 0000000..e0f487d --- /dev/null +++ b/src/udev/udevadm-trigger.c @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include + +#include "sd-device.h" +#include "sd-event.h" + +#include "device-enumerator-private.h" +#include "device-private.h" +#include "device-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "id128-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "process-util.h" +#include "set.h" +#include "static-destruct.h" +#include "string-util.h" +#include "strv.h" +#include "udevadm.h" +#include "udevadm-util.h" +#include "udev-ctrl.h" +#include "virt.h" + +static bool arg_verbose = false; +static bool arg_dry_run = false; +static bool arg_quiet = false; +static bool arg_uuid = false; +static bool arg_settle = false; + +static int exec_list( + sd_device_enumerator *e, + sd_device_action_t action, + Set **ret_settle_path_or_ids) { + + _cleanup_set_free_ Set *settle_path_or_ids = NULL; + int uuid_supported = -1; + const char *action_str; + sd_device *d; + int r, ret = 0; + + assert(e); + + action_str = device_action_to_string(action); + + FOREACH_DEVICE_AND_SUBSYSTEM(e, d) { + sd_id128_t id = SD_ID128_NULL; + const char *syspath; + + r = sd_device_get_syspath(d, &syspath); + if (r < 0) { + log_debug_errno(r, "Failed to get syspath of enumerated devices, ignoring: %m"); + continue; + } + + if (arg_verbose) + printf("%s\n", syspath); + + if (arg_dry_run) + continue; + + /* Use the UUID mode if the user explicitly asked for it, or if --settle has been specified, + * so that we can recognize our own uevent. */ + r = sd_device_trigger_with_uuid(d, action, (arg_uuid || arg_settle) && uuid_supported != 0 ? &id : NULL); + if (r == -EINVAL && !arg_uuid && arg_settle && uuid_supported < 0) { + /* If we specified a UUID because of the settling logic, and we got EINVAL this might + * be caused by an old kernel which doesn't know the UUID logic (pre-4.13). 
Let's try + * if it works without the UUID logic then. */ + r = sd_device_trigger(d, action); + if (r != -EINVAL) + uuid_supported = false; /* dropping the uuid stuff changed the return code, + * hence don't bother next time */ + } + if (r < 0) { + /* ENOENT may be returned when a device does not have /uevent or is already + * removed. Hence, this is logged at debug level and ignored. + * + * ENODEV may be returned by some buggy device drivers e.g. /sys/devices/vio. + * See, + * https://github.com/systemd/systemd/issues/13652#issuecomment-535129791 and + * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1845319. + * So, this error is ignored, but logged at warning level to encourage people to + * fix the driver. + * + * EROFS is returned when /sys is read only. In that case, all subsequent + * writes will also fail, hence return immediately. + * + * EACCES or EPERM may be returned when this is invoked by non-priviledged user. + * We do NOT return immediately, but continue operation and propagate the error. + * Why? Some device can be owned by a user, e.g., network devices configured in + * a network namespace. See, https://github.com/systemd/systemd/pull/18559 and + * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ebb4a4bf76f164457184a3f43ebc1552416bc823 + * + * All other errors are logged at error level, but let's continue the operation, + * and propagate the error. + */ + + bool ignore = IN_SET(r, -ENOENT, -ENODEV); + int level = + arg_quiet ? LOG_DEBUG : + r == -ENOENT ? LOG_DEBUG : + r == -ENODEV ? LOG_WARNING : LOG_ERR; + + log_device_full_errno(d, level, r, + "Failed to write '%s' to '%s/uevent'%s: %m", + action_str, syspath, ignore ? 
", ignoring" : ""); + + if (r == -EROFS) + return r; + if (ret == 0 && !ignore) + ret = r; + continue; + } else + log_device_debug(d, "Triggered device with action '%s'.", action_str); + + if (uuid_supported < 0) + uuid_supported = true; + + /* If the user asked for it, write event UUID to stdout */ + if (arg_uuid) + printf(SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(id)); + + if (arg_settle) { + if (uuid_supported) { + sd_id128_t *dup; + + dup = newdup(sd_id128_t, &id, 1); + if (!dup) + return log_oom(); + + r = set_ensure_consume(&settle_path_or_ids, &id128_hash_ops_free, dup); + } else { + char *dup; + + dup = strdup(syspath); + if (!dup) + return log_oom(); + + r = set_ensure_consume(&settle_path_or_ids, &path_hash_ops_free, dup); + } + if (r < 0) + return log_oom(); + } + } + + if (ret_settle_path_or_ids) + *ret_settle_path_or_ids = TAKE_PTR(settle_path_or_ids); + + return ret; +} + +static int device_monitor_handler(sd_device_monitor *m, sd_device *dev, void *userdata) { + Set *settle_path_or_ids = * (Set**) ASSERT_PTR(userdata); + const char *syspath; + sd_id128_t id; + int r; + + assert(dev); + + r = sd_device_get_syspath(dev, &syspath); + if (r < 0) { + log_device_debug_errno(dev, r, "Failed to get syspath of device event, ignoring: %m"); + return 0; + } + + if (sd_device_get_trigger_uuid(dev, &id) >= 0) { + _cleanup_free_ sd_id128_t *saved = NULL; + + saved = set_remove(settle_path_or_ids, &id); + if (!saved) { + log_device_debug(dev, "Got uevent not matching expected UUID, ignoring."); + return 0; + } + } else { + _cleanup_free_ char *saved = NULL; + + saved = set_remove(settle_path_or_ids, syspath); + if (!saved) { + const char *old_sysname; + + /* When the device is renamed, the new name is broadcast, and the old name is saved + * in INTERFACE_OLD. + * + * TODO: remove support for INTERFACE_OLD when kernel baseline is bumped to 4.13 or + * higher. See 1193448cb68e5a90cab027e16a093bbd367e9494. 
+ */ + + if (sd_device_get_property_value(dev, "INTERFACE_OLD", &old_sysname) >= 0) { + _cleanup_free_ char *dir = NULL, *old_syspath = NULL; + + r = path_extract_directory(syspath, &dir); + if (r < 0) { + log_device_debug_errno(dev, r, + "Failed to extract directory from '%s', ignoring: %m", + syspath); + return 0; + } + + old_syspath = path_join(dir, old_sysname); + if (!old_syspath) { + log_oom_debug(); + return 0; + } + + saved = set_remove(settle_path_or_ids, old_syspath); + } + } + if (!saved) { + log_device_debug(dev, "Got uevent for unexpected device, ignoring."); + return 0; + } + } + + if (arg_verbose) + printf("settle %s\n", syspath); + + if (arg_uuid) + printf("settle " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(id)); + + if (set_isempty(settle_path_or_ids)) + return sd_event_exit(sd_device_monitor_get_event(m), 0); + + return 0; +} + +static char* keyval(const char *str, const char **key, const char **val) { + char *buf, *pos; + + buf = strdup(str); + if (!buf) + return NULL; + + pos = strchr(buf, '='); + if (pos) { + pos[0] = 0; + pos++; + } + + *key = buf; + *val = pos; + + return buf; +} + +static int help(void) { + printf("%s trigger [OPTIONS] DEVPATH\n\n" + "Request events from the kernel.\n\n" + " -h --help Show this help\n" + " -V --version Show package version\n" + " -v --verbose Print the list of devices while running\n" + " -n --dry-run Do not actually trigger the events\n" + " -q --quiet Suppress error logging in triggering events\n" + " -t --type= Type of events to trigger\n" + " devices sysfs devices (default)\n" + " subsystems sysfs subsystems and drivers\n" + " all sysfs devices, subsystems, and drivers\n" + " -c --action=ACTION|help Event action value, default is \"change\"\n" + " -s --subsystem-match=SUBSYSTEM Trigger devices from a matching subsystem\n" + " -S --subsystem-nomatch=SUBSYSTEM Exclude devices from a matching subsystem\n" + " -a --attr-match=FILE[=VALUE] Trigger devices with a matching attribute\n" + " -A 
--attr-nomatch=FILE[=VALUE] Exclude devices with a matching attribute\n" + " -p --property-match=KEY=VALUE Trigger devices with a matching property\n" + " -g --tag-match=TAG Trigger devices with a matching tag\n" + " -y --sysname-match=NAME Trigger devices with this /sys path\n" + " --name-match=NAME Trigger devices with this /dev name\n" + " -b --parent-match=NAME Trigger devices with that parent device\n" + " --initialized-match Trigger devices that are already initialized\n" + " --initialized-nomatch Trigger devices that are not initialized yet\n" + " -w --settle Wait for the triggered events to complete\n" + " --wait-daemon[=SECONDS] Wait for udevd daemon to be initialized\n" + " before triggering uevents\n" + " --uuid Print synthetic uevent UUID\n" + " --prioritized-subsystem=SUBSYSTEM[,SUBSYSTEM…]\n" + " Trigger devices from a matching subsystem first\n", + program_invocation_short_name); + + return 0; +} + +int trigger_main(int argc, char *argv[], void *userdata) { + enum { + ARG_NAME = 0x100, + ARG_PING, + ARG_UUID, + ARG_PRIORITIZED_SUBSYSTEM, + ARG_INITIALIZED_MATCH, + ARG_INITIALIZED_NOMATCH, + }; + + static const struct option options[] = { + { "verbose", no_argument, NULL, 'v' }, + { "dry-run", no_argument, NULL, 'n' }, + { "quiet", no_argument, NULL, 'q' }, + { "type", required_argument, NULL, 't' }, + { "action", required_argument, NULL, 'c' }, + { "subsystem-match", required_argument, NULL, 's' }, + { "subsystem-nomatch", required_argument, NULL, 'S' }, + { "attr-match", required_argument, NULL, 'a' }, + { "attr-nomatch", required_argument, NULL, 'A' }, + { "property-match", required_argument, NULL, 'p' }, + { "tag-match", required_argument, NULL, 'g' }, + { "sysname-match", required_argument, NULL, 'y' }, + { "name-match", required_argument, NULL, ARG_NAME }, + { "parent-match", required_argument, NULL, 'b' }, + { "initialized-match", no_argument, NULL, ARG_INITIALIZED_MATCH }, + { "initialized-nomatch", no_argument, NULL, ARG_INITIALIZED_NOMATCH 
}, + { "settle", no_argument, NULL, 'w' }, + { "wait-daemon", optional_argument, NULL, ARG_PING }, + { "version", no_argument, NULL, 'V' }, + { "help", no_argument, NULL, 'h' }, + { "uuid", no_argument, NULL, ARG_UUID }, + { "prioritized-subsystem", required_argument, NULL, ARG_PRIORITIZED_SUBSYSTEM }, + {} + }; + enum { + TYPE_DEVICES, + TYPE_SUBSYSTEMS, + TYPE_ALL, + } device_type = TYPE_DEVICES; + sd_device_action_t action = SD_DEVICE_CHANGE; + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *m = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_set_free_ Set *settle_path_or_ids = NULL; + usec_t ping_timeout_usec = 5 * USEC_PER_SEC; + bool ping = false; + int c, r; + + if (running_in_chroot() > 0) { + log_info("Running in chroot, ignoring request."); + return 0; + } + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + while ((c = getopt_long(argc, argv, "vnqt:c:s:S:a:A:p:g:y:b:wVh", options, NULL)) >= 0) { + _cleanup_free_ char *buf = NULL; + const char *key, *val; + + switch (c) { + case 'v': + arg_verbose = true; + break; + case 'n': + arg_dry_run = true; + break; + case 'q': + arg_quiet = true; + break; + case 't': + if (streq(optarg, "devices")) + device_type = TYPE_DEVICES; + else if (streq(optarg, "subsystems")) + device_type = TYPE_SUBSYSTEMS; + else if (streq(optarg, "all")) + device_type = TYPE_ALL; + else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown type --type=%s", optarg); + break; + case 'c': + r = parse_device_action(optarg, &action); + if (r < 0) + return log_error_errno(r, "Unknown action '%s'", optarg); + if (r == 0) + return 0; + break; + case 's': + r = sd_device_enumerator_add_match_subsystem(e, optarg, true); + if (r < 0) + return log_error_errno(r, "Failed to add subsystem match '%s': %m", optarg); + break; + case 'S': + r = 
sd_device_enumerator_add_match_subsystem(e, optarg, false); + if (r < 0) + return log_error_errno(r, "Failed to add negative subsystem match '%s': %m", optarg); + break; + case 'a': + buf = keyval(optarg, &key, &val); + if (!buf) + return log_oom(); + r = sd_device_enumerator_add_match_sysattr(e, key, val, true); + if (r < 0) + return log_error_errno(r, "Failed to add sysattr match '%s=%s': %m", key, val); + break; + case 'A': + buf = keyval(optarg, &key, &val); + if (!buf) + return log_oom(); + r = sd_device_enumerator_add_match_sysattr(e, key, val, false); + if (r < 0) + return log_error_errno(r, "Failed to add negative sysattr match '%s=%s': %m", key, val); + break; + case 'p': + buf = keyval(optarg, &key, &val); + if (!buf) + return log_oom(); + r = sd_device_enumerator_add_match_property(e, key, val); + if (r < 0) + return log_error_errno(r, "Failed to add property match '%s=%s': %m", key, val); + break; + case 'g': + r = sd_device_enumerator_add_match_tag(e, optarg); + if (r < 0) + return log_error_errno(r, "Failed to add tag match '%s': %m", optarg); + break; + case 'y': + r = sd_device_enumerator_add_match_sysname(e, optarg); + if (r < 0) + return log_error_errno(r, "Failed to add sysname match '%s': %m", optarg); + break; + case 'b': { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + r = find_device(optarg, "/sys", &dev); + if (r < 0) + return log_error_errno(r, "Failed to open the device '%s': %m", optarg); + + r = device_enumerator_add_match_parent_incremental(e, dev); + if (r < 0) + return log_error_errno(r, "Failed to add parent match '%s': %m", optarg); + break; + } + case 'w': + arg_settle = true; + break; + + case ARG_NAME: { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + r = find_device(optarg, "/dev", &dev); + if (r < 0) + return log_error_errno(r, "Failed to open the device '%s': %m", optarg); + + r = device_enumerator_add_match_parent_incremental(e, dev); + if (r < 0) + return log_error_errno(r, "Failed to add parent match 
'%s': %m", optarg); + break; + } + + case ARG_PING: + ping = true; + if (optarg) { + r = parse_sec(optarg, &ping_timeout_usec); + if (r < 0) + log_error_errno(r, "Failed to parse timeout value '%s', ignoring: %m", optarg); + } + break; + + case ARG_UUID: + arg_uuid = true; + break; + + case ARG_PRIORITIZED_SUBSYSTEM: { + _cleanup_strv_free_ char **subsystems = NULL; + + subsystems = strv_split(optarg, ","); + if (!subsystems) + return log_error_errno(r, "Failed to parse prioritized subsystem '%s': %m", optarg); + + STRV_FOREACH(p, subsystems) { + r = device_enumerator_add_prioritized_subsystem(e, *p); + if (r < 0) + return log_error_errno(r, "Failed to add prioritized subsystem '%s': %m", *p); + } + break; + } + case ARG_INITIALIZED_MATCH: + case ARG_INITIALIZED_NOMATCH: + r = device_enumerator_add_match_is_initialized(e, c == ARG_INITIALIZED_MATCH ? MATCH_INITIALIZED_YES : MATCH_INITIALIZED_NO); + if (r < 0) + return log_error_errno(r, "Failed to set initialized filter: %m"); + break; + case 'V': + return print_version(); + case 'h': + return help(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + } + + if (ping) { + _cleanup_(udev_ctrl_unrefp) UdevCtrl *uctrl = NULL; + + r = udev_ctrl_new(&uctrl); + if (r < 0) + return log_error_errno(r, "Failed to initialize udev control: %m"); + + r = udev_ctrl_send_ping(uctrl); + if (r < 0) + return log_error_errno(r, "Failed to connect to udev daemon: %m"); + + r = udev_ctrl_wait(uctrl, ping_timeout_usec); + if (r < 0) + return log_error_errno(r, "Failed to wait for daemon to reply: %m"); + } + + for (; optind < argc; optind++) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + + r = find_device(argv[optind], NULL, &dev); + if (r < 0) + return log_error_errno(r, "Failed to open the device '%s': %m", argv[optind]); + + r = device_enumerator_add_match_parent_incremental(e, dev); + if (r < 0) + return log_error_errno(r, "Failed to add parent match '%s': %m", argv[optind]); + } + + if (arg_settle) { + 
r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to get default event: %m"); + + r = sd_device_monitor_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to create device monitor object: %m"); + + r = sd_device_monitor_attach_event(m, event); + if (r < 0) + return log_error_errno(r, "Failed to attach event to device monitor: %m"); + + r = sd_device_monitor_start(m, device_monitor_handler, &settle_path_or_ids); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + } + + switch (device_type) { + case TYPE_SUBSYSTEMS: + r = device_enumerator_scan_subsystems(e); + if (r < 0) + return log_error_errno(r, "Failed to scan subsystems: %m"); + break; + case TYPE_DEVICES: + r = device_enumerator_scan_devices(e); + if (r < 0) + return log_error_errno(r, "Failed to scan devices: %m"); + break; + case TYPE_ALL: + r = device_enumerator_scan_devices_and_subsystems(e); + if (r < 0) + return log_error_errno(r, "Failed to scan devices and subsystems: %m"); + break; + default: + assert_not_reached(); + } + + r = exec_list(e, action, arg_settle ? 
&settle_path_or_ids : NULL); + if (r < 0) + return r; + + if (!set_isempty(settle_path_or_ids)) { + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + } + + return 0; +} diff --git a/src/udev/udevadm-util.c b/src/udev/udevadm-util.c new file mode 100644 index 0000000..2447eda --- /dev/null +++ b/src/udev/udevadm-util.c @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-util.h" +#include "device-private.h" +#include "path-util.h" +#include "udevadm-util.h" +#include "unit-name.h" + +static int find_device_from_unit(const char *unit_name, sd_device **ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *unit_path = NULL, *syspath = NULL; + int r; + + if (!unit_name_is_valid(unit_name, UNIT_NAME_PLAIN)) + return -EINVAL; + + if (unit_name_to_type(unit_name) != UNIT_DEVICE) + return -EINVAL; + + r = bus_connect_system_systemd(&bus); + if (r < 0) { + _cleanup_free_ char *path = NULL; + + log_debug_errno(r, "Failed to open connection to systemd, using unit name as syspath: %m"); + + r = unit_name_to_path(unit_name, &path); + if (r < 0) + return log_debug_errno(r, "Failed to convert \"%s\" to a device path: %m", unit_name); + + return sd_device_new_from_path(ret, path); + } + + unit_path = unit_dbus_path_from_name(unit_name); + if (!unit_path) + return -ENOMEM; + + r = sd_bus_get_property_string( + bus, + "org.freedesktop.systemd1", + unit_path, + "org.freedesktop.systemd1.Device", + "SysFSPath", + &error, + &syspath); + if (r < 0) + return log_debug_errno(r, "Failed to get SysFSPath= dbus property for %s: %s", + unit_name, bus_error_message(&error, r)); + + return sd_device_new_from_syspath(ret, syspath); +} + +int find_device(const char *id, const char *prefix, sd_device **ret) { + assert(id); + assert(ret); + + if 
(sd_device_new_from_path(ret, id) >= 0) + return 0; + + if (prefix && !path_startswith(id, prefix)) { + _cleanup_free_ char *path = NULL; + + path = path_join(prefix, id); + if (!path) + return -ENOMEM; + + if (sd_device_new_from_path(ret, path) >= 0) + return 0; + } + + /* if a path is provided, then it cannot be a unit name. Let's return earlier. */ + if (is_path(id)) + return -ENODEV; + + /* Check if the argument looks like a device unit name. */ + return find_device_from_unit(id, ret); +} + +int find_device_with_action(const char *id, sd_device_action_t action, sd_device **ret) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + int r; + + assert(id); + assert(ret); + assert(action >= 0 && action < _SD_DEVICE_ACTION_MAX); + + r = find_device(id, "/sys", &dev); + if (r < 0) + return r; + + r = device_read_uevent_file(dev); + if (r < 0) + return r; + + r = device_set_action(dev, action); + if (r < 0) + return r; + + *ret = TAKE_PTR(dev); + return 0; +} + +int parse_device_action(const char *str, sd_device_action_t *action) { + sd_device_action_t a; + + assert(str); + assert(action); + + if (streq(str, "help")) { + dump_device_action_table(); + return 0; + } + + a = device_action_from_string(str); + if (a < 0) + return a; + + *action = a; + return 1; +} diff --git a/src/udev/udevadm-util.h b/src/udev/udevadm-util.h new file mode 100644 index 0000000..7fb4556 --- /dev/null +++ b/src/udev/udevadm-util.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include "sd-device.h" + +int find_device(const char *id, const char *prefix, sd_device **ret); +int find_device_with_action(const char *id, sd_device_action_t action, sd_device **ret); +int parse_device_action(const char *str, sd_device_action_t *action); diff --git a/src/udev/udevadm-verify.c b/src/udev/udevadm-verify.c new file mode 100644 index 0000000..3220250 --- /dev/null +++ b/src/udev/udevadm-verify.c @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + 
+#include +#include +#include +#include +#include +#include + +#include "conf-files.h" +#include "constants.h" +#include "log.h" +#include "parse-argument.h" +#include "pretty-print.h" +#include "stat-util.h" +#include "static-destruct.h" +#include "strv.h" +#include "udev-rules.h" +#include "udevadm.h" + +static ResolveNameTiming arg_resolve_name_timing = RESOLVE_NAME_EARLY; +static char *arg_root = NULL; +static bool arg_summary = true; +static bool arg_style = true; + +STATIC_DESTRUCTOR_REGISTER(arg_root, freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("udevadm", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s verify [OPTIONS] [FILE...]\n" + "\n%sVerify udev rules files.%s\n\n" + " -h --help Show this help\n" + " -V --version Show package version\n" + " -N --resolve-names=early|never When to resolve names\n" + " --root=PATH Operate on an alternate filesystem root\n" + " --no-summary Do not show summary\n" + " --no-style Ignore style issues\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_ROOT = 0x100, + ARG_NO_SUMMARY, + ARG_NO_STYLE, + }; + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "resolve-names", required_argument, NULL, 'N' }, + { "root", required_argument, NULL, ARG_ROOT }, + { "no-summary", no_argument, NULL, ARG_NO_SUMMARY }, + { "no-style", no_argument, NULL, ARG_NO_STYLE }, + {} + }; + + int r, c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "hVN:", options, NULL)) >= 0) + switch (c) { + case 'h': + return help(); + case 'V': + return print_version(); + case 'N': + arg_resolve_name_timing = resolve_name_timing_from_string(optarg); + if (arg_resolve_name_timing < 0) + return log_error_errno(arg_resolve_name_timing, + 
"--resolve-names= takes \"early\" or \"never\""); + /* + * In the verifier "late" has the effect of "never", + * and "never" would generate irrelevant diagnostics, + * so map "never" to "late". + */ + if (arg_resolve_name_timing == RESOLVE_NAME_NEVER) + arg_resolve_name_timing = RESOLVE_NAME_LATE; + break; + case ARG_ROOT: + r = parse_path_argument(optarg, /* suppress_root= */ true, &arg_root); + if (r < 0) + return r; + break; + case ARG_NO_SUMMARY: + arg_summary = false; + break; + + case ARG_NO_STYLE: + arg_style = false; + break; + + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (arg_root && optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Combination of --root= and FILEs is not supported."); + + return 1; +} + +static int verify_rules_file(UdevRules *rules, const char *fname) { + UdevRuleFile *file; + int r; + + r = udev_rules_parse_file(rules, fname, /* extra_checks = */ true, &file); + if (r < 0) + return log_error_errno(r, "Failed to parse rules file %s: %m", fname); + if (r == 0) /* empty file. 
*/ + return 0; + + unsigned issues = udev_rule_file_get_issues(file); + unsigned mask = (1U << LOG_ERR) | (1U << LOG_WARNING); + if (issues & mask) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: udev rules check failed.", fname); + + if (arg_style && (issues & (1U << LOG_NOTICE))) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "%s: udev rules have style issues.", fname); + + return 0; +} + +static int verify_rules_filelist(UdevRules *rules, char **files, size_t *fail_count, size_t *success_count, bool walk_dirs); + +static int verify_rules_dir(UdevRules *rules, const char *dir, size_t *fail_count, size_t *success_count) { + int r; + _cleanup_strv_free_ char **files = NULL; + + assert(rules); + assert(dir); + assert(fail_count); + assert(success_count); + + r = conf_files_list(&files, ".rules", NULL, 0, dir); + if (r < 0) + return log_error_errno(r, "Failed to enumerate rules files: %m"); + + return verify_rules_filelist(rules, files, fail_count, success_count, /* walk_dirs */ false); +} + +static int verify_rules_filelist(UdevRules *rules, char **files, size_t *fail_count, size_t *success_count, bool walk_dirs) { + int r, rv = 0; + + assert(rules); + assert(files); + assert(fail_count); + assert(success_count); + + STRV_FOREACH(fp, files) { + if (walk_dirs && is_dir(*fp, /* follow = */ true) > 0) + r = verify_rules_dir(rules, *fp, fail_count, success_count); + else { + r = verify_rules_file(rules, *fp); + if (r < 0) + ++(*fail_count); + else + ++(*success_count); + } + if (r < 0 && rv >= 0) + rv = r; + } + + return rv; +} + +static int verify_rules(UdevRules *rules, char **files) { + size_t fail_count = 0, success_count = 0; + int r; + + assert(rules); + assert(files); + + r = verify_rules_filelist(rules, files, &fail_count, &success_count, /* walk_dirs */ true); + + if (arg_summary) + printf("\n%s%zu udev rules files have been checked.%s\n" + " Success: %zu\n" + "%s Fail: %zu%s\n", + ansi_highlight(), + fail_count + success_count, + ansi_normal(), 
+ success_count, + fail_count > 0 ? ansi_highlight_red() : "", + fail_count, + fail_count > 0 ? ansi_normal() : ""); + + return r; +} + +int verify_main(int argc, char *argv[], void *userdata) { + _cleanup_(udev_rules_freep) UdevRules *rules = NULL; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + rules = udev_rules_new(arg_resolve_name_timing); + if (!rules) + return -ENOMEM; + + if (optind == argc) { + const char* const* rules_dirs = STRV_MAKE_CONST(CONF_PATHS("udev/rules.d")); + _cleanup_strv_free_ char **files = NULL; + + r = conf_files_list_strv(&files, ".rules", arg_root, 0, rules_dirs); + if (r < 0) + return log_error_errno(r, "Failed to enumerate rules files: %m"); + if (arg_root && strv_isempty(files)) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "No rules files found in %s.", arg_root); + + return verify_rules(rules, files); + } + + return verify_rules(rules, strv_skip(argv, optind)); +} diff --git a/src/udev/udevadm-wait.c b/src/udev/udevadm-wait.c new file mode 100644 index 0000000..e6620c2 --- /dev/null +++ b/src/udev/udevadm-wait.c @@ -0,0 +1,456 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include + +#include "sd-event.h" + +#include "alloc-util.h" +#include "chase.h" +#include "device-monitor-private.h" +#include "device-util.h" +#include "errno-util.h" +#include "event-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "inotify-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "static-destruct.h" +#include "string-table.h" +#include "strv.h" +#include "udev-util.h" +#include "udevadm.h" + +typedef enum WaitUntil { + WAIT_UNTIL_INITIALIZED, + WAIT_UNTIL_ADDED, + WAIT_UNTIL_REMOVED, + _WAIT_UNTIL_MAX, + _WAIT_UNTIL_INVALID = -EINVAL, +} WaitUntil; + +static WaitUntil arg_wait_until = WAIT_UNTIL_INITIALIZED; +static usec_t arg_timeout_usec = USEC_INFINITY; +static bool arg_settle = false; +static char **arg_devices = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_devices, 
strv_freep); + +static const char * const wait_until_table[_WAIT_UNTIL_MAX] = { + [WAIT_UNTIL_INITIALIZED] = "initialized", + [WAIT_UNTIL_ADDED] = "added", + [WAIT_UNTIL_REMOVED] = "removed", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(wait_until, WaitUntil); + +static int check_device(const char *path) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + int r; + + assert(path); + + if (arg_wait_until == WAIT_UNTIL_REMOVED) { + r = laccess(path, F_OK); + if (r == -ENOENT) + return true; + if (r < 0) + return r; + return false; + } + + r = sd_device_new_from_path(&dev, path); + if (r == -ENODEV) + return false; + if (r < 0) + return r; + + if (arg_wait_until == WAIT_UNTIL_INITIALIZED) + return sd_device_get_is_initialized(dev); + + return true; +} + +static bool check(void) { + int r; + + if (arg_settle) { + r = udev_queue_is_empty(); + if (r == 0) + return false; + if (r < 0) + log_warning_errno(r, "Failed to check if udev queue is empty, assuming empty: %m"); + } + + STRV_FOREACH(p, arg_devices) { + r = check_device(*p); + if (r <= 0) { + if (r < 0) + log_warning_errno(r, "Failed to check if device \"%s\" is %s, assuming not %s: %m", + *p, + wait_until_to_string(arg_wait_until), + wait_until_to_string(arg_wait_until)); + return false; + } + } + + return true; +} + +static int check_and_exit(sd_event *event) { + int r; + + assert(event); + + if (check()) { + r = sd_event_exit(event, 0); + if (r < 0) + return r; + + return 1; + } + + return 0; +} + +static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) { + const char *name; + int r; + + assert(monitor); + assert(device); + + if (device_for_action(device, SD_DEVICE_REMOVE) != (arg_wait_until == WAIT_UNTIL_REMOVED)) + return 0; + + if (arg_wait_until == WAIT_UNTIL_REMOVED) + /* On removed event, the received device may not contain enough information. + * Let's unconditionally check all requested devices are removed. 
*/ + return check_and_exit(sd_device_monitor_get_event(monitor)); + + /* For other events, at first check if the received device matches with the requested devices, + * to avoid calling check() so many times within a short time. */ + + r = sd_device_get_sysname(device, &name); + if (r < 0) { + log_device_warning_errno(device, r, "Failed to get sysname of received device, ignoring: %m"); + return 0; + } + + STRV_FOREACH(p, arg_devices) { + const char *s; + + if (!path_startswith(*p, "/sys")) + continue; + + r = path_find_last_component(*p, false, NULL, &s); + if (r < 0) { + log_warning_errno(r, "Failed to extract filename from \"%s\", ignoring: %m", *p); + continue; + } + if (r == 0) + continue; + + if (strneq(s, name, r)) + return check_and_exit(sd_device_monitor_get_event(monitor)); + } + + r = sd_device_get_devname(device, &name); + if (r < 0) { + if (r != -ENOENT) + log_device_warning_errno(device, r, "Failed to get devname of received device, ignoring: %m"); + return 0; + } + + if (path_strv_contains(arg_devices, name)) + return check_and_exit(sd_device_monitor_get_event(monitor)); + + FOREACH_DEVICE_DEVLINK(device, link) + if (path_strv_contains(arg_devices, link)) + return check_and_exit(sd_device_monitor_get_event(monitor)); + + return 0; +} + +static int setup_monitor(sd_event *event, MonitorNetlinkGroup group, const char *description, sd_device_monitor **ret) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL; + int r; + + assert(event); + assert(ret); + + r = device_monitor_new_full(&monitor, group, /* fd = */ -1); + if (r < 0) + return r; + + r = sd_device_monitor_attach_event(monitor, event); + if (r < 0) + return r; + + r = sd_device_monitor_set_description(monitor, description); + if (r < 0) + return r; + + r = sd_device_monitor_start(monitor, device_monitor_handler, NULL); + if (r < 0) + return r; + + *ret = TAKE_PTR(monitor); + return 0; +} + +static int on_inotify(sd_event_source *s, const struct inotify_event *event, void 
*userdata) { + return check_and_exit(sd_event_source_get_event(s)); +} + +static int setup_inotify(sd_event *event) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(event); + + if (!arg_settle) + return 0; + + r = sd_event_add_inotify(event, &s, "/run/udev" , IN_CREATE | IN_DELETE, on_inotify, NULL); + if (r < 0) + return r; + + r = sd_event_source_set_description(s, "inotify-event-source"); + if (r < 0) + return r; + + return sd_event_source_set_floating(s, true); +} + +static int setup_timer(sd_event *event) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(event); + + if (arg_timeout_usec == USEC_INFINITY) + return 0; + + r = sd_event_add_time_relative(event, &s, CLOCK_BOOTTIME, arg_timeout_usec, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)); + if (r < 0) + return r; + + r = sd_event_source_set_description(s, "timeout-event-source"); + if (r < 0) + return r; + + return sd_event_source_set_floating(s, true); +} + +static int reset_timer(sd_event *e, sd_event_source **s); + +static int on_periodic_timer(sd_event_source *s, uint64_t usec, void *userdata) { + static unsigned counter = 0; + sd_event *e; + int r; + + assert(s); + + e = sd_event_source_get_event(s); + + /* Even if all devices exists, we try to wait for uevents to be emitted from kernel. 
*/ + if (check()) + counter++; + else + counter = 0; + + if (counter >= 2) { + log_debug("All requested devices popped up without receiving kernel uevents."); + return sd_event_exit(e, 0); + } + + r = reset_timer(e, &s); + if (r < 0) + log_warning_errno(r, "Failed to reset periodic timer event source, ignoring: %m"); + + return 0; +} + +static int reset_timer(sd_event *e, sd_event_source **s) { + return event_reset_time_relative(e, s, CLOCK_BOOTTIME, 250 * USEC_PER_MSEC, 0, + on_periodic_timer, NULL, 0, "periodic-timer-event-source", false); +} + +static int setup_periodic_timer(sd_event *event) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(event); + + r = reset_timer(event, &s); + if (r < 0) + return r; + + /* Set the lower priority than device monitor, to make uevents always dispatched first. */ + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL + 1); + if (r < 0) + return r; + + return sd_event_source_set_floating(s, true); +} + +static int help(void) { + printf("%s wait [OPTIONS] DEVICE [DEVICE…]\n\n" + "Wait for devices or device symlinks being created.\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n" + " -t --timeout=SEC Maximum time to wait for the device\n" + " --initialized=BOOL Wait for devices being initialized by systemd-udevd\n" + " --removed Wait for devices being removed\n" + " --settle Also wait for all queued events being processed\n", + program_invocation_short_name); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_INITIALIZED = 0x100, + ARG_REMOVED, + ARG_SETTLE, + }; + + static const struct option options[] = { + { "timeout", required_argument, NULL, 't' }, + { "initialized", required_argument, NULL, ARG_INITIALIZED }, + { "removed", no_argument, NULL, ARG_REMOVED }, + { "settle", no_argument, NULL, ARG_SETTLE }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + {} + }; + + int c, r; + + 
while ((c = getopt_long(argc, argv, "t:hV", options, NULL)) >= 0) + switch (c) { + case 't': + r = parse_sec(optarg, &arg_timeout_usec); + if (r < 0) + return log_error_errno(r, "Failed to parse -t/--timeout= parameter: %s", optarg); + break; + + case ARG_INITIALIZED: + r = parse_boolean(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --initialized= parameter: %s", optarg); + arg_wait_until = r ? WAIT_UNTIL_INITIALIZED : WAIT_UNTIL_ADDED; + break; + + case ARG_REMOVED: + arg_wait_until = WAIT_UNTIL_REMOVED; + break; + + case ARG_SETTLE: + arg_settle = true; + break; + + case 'V': + return print_version(); + + case 'h': + return help(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too few arguments, expected at least one device path or device symlink."); + + arg_devices = strv_copy(argv + optind); + if (!arg_devices) + return log_oom(); + + return 1; /* work to do */ +} + +int wait_main(int argc, char *argv[], void *userdata) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *udev_monitor = NULL, *kernel_monitor = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + STRV_FOREACH(p, arg_devices) { + path_simplify(*p); + + if (!path_is_safe(*p)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Device path cannot contain \"..\"."); + + if (!is_device_path(*p)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Specified path \"%s\" does not start with \"/dev/\" or \"/sys/\".", *p); + } + + /* Check before configuring event sources, as devices may be already initialized. 
*/ + if (check()) + return 0; + + r = sd_event_default(&event); + if (r < 0) + return log_error_errno(r, "Failed to initialize sd-event: %m"); + + r = setup_timer(event); + if (r < 0) + return log_error_errno(r, "Failed to set up timeout: %m"); + + r = setup_inotify(event); + if (r < 0) + return log_error_errno(r, "Failed to set up inotify: %m"); + + r = setup_monitor(event, MONITOR_GROUP_UDEV, "udev-uevent-monitor-event-source", &udev_monitor); + if (r < 0) + return log_error_errno(r, "Failed to set up udev uevent monitor: %m"); + + if (arg_wait_until == WAIT_UNTIL_ADDED) { + /* If --initialized=no is specified, it is not necessary to wait uevents for the specified + * devices to be processed by udevd. Hence, let's listen on the kernel's uevent stream. Then, + * we may be able to finish this program earlier when udevd is very busy. + * Note, we still need to also setup udev monitor, as this may be invoked with a devlink + * (e.g. /dev/disk/by-id/foo). In that case, the devlink may not exist when we received a + * uevent from kernel, as the udevd may not finish to process the uevent yet. Hence, we need + * to wait until the event is processed by udevd. */ + r = setup_monitor(event, MONITOR_GROUP_KERNEL, "kernel-uevent-monitor-event-source", &kernel_monitor); + if (r < 0) + return log_error_errno(r, "Failed to set up kernel uevent monitor: %m"); + + /* This is a workaround for issues #24360 and #24450. + * For some reasons, the kernel sometimes does not emit uevents for loop block device on + * attach. Hence, without the periodic timer, no event source for this program will be + * triggered, and this will be timed out. + * Theoretically, inotify watch may be better, but this program typically expected to run in + * a short time. Hence, let's use the simpler periodic timer event source here. 
*/ + r = setup_periodic_timer(event); + if (r < 0) + return log_error_errno(r, "Failed to set up periodic timer: %m"); + } + + /* Check before entering the event loop, as devices may be initialized during setting up event sources. */ + if (check()) + return 0; + + r = sd_event_loop(event); + if (r == -ETIMEDOUT) + return log_error_errno(r, "Timed out for waiting devices being %s.", + wait_until_to_string(arg_wait_until)); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} diff --git a/src/udev/udevadm.c b/src/udev/udevadm.c new file mode 100644 index 0000000..687b927 --- /dev/null +++ b/src/udev/udevadm.c @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "main-func.h" +#include "pretty-print.h" +#include "process-util.h" +#include "selinux-util.h" +#include "string-util.h" +#include "udev-util.h" +#include "udevadm.h" +#include "udevd.h" +#include "verbs.h" + +static int help(void) { + static const char *const short_descriptions[][2] = { + { "info", "Query sysfs or the udev database" }, + { "trigger", "Request events from the kernel" }, + { "settle", "Wait for pending udev events" }, + { "control", "Control the udev daemon" }, + { "monitor", "Listen to kernel and udev events" }, + { "test", "Test an event run" }, + { "test-builtin", "Test a built-in command" }, + { "verify", "Verify udev rules files" }, + { "wait", "Wait for device or device symlink" }, + { "lock", "Lock a block device" }, + }; + + _cleanup_free_ char *link = NULL; + size_t i; + int r; + + r = terminal_urlify_man("udevadm", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [--help] [--version] [--debug] COMMAND [COMMAND OPTIONS]\n\n" + "Send control commands or test the device manager.\n\n" + "Commands:\n", + program_invocation_short_name); + + for (i = 0; i < ELEMENTSOF(short_descriptions); i++) + printf(" %-12s %s\n", short_descriptions[i][0], 
short_descriptions[i][1]); + + printf("\nSee the %s for details.\n", link); + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "debug", no_argument, NULL, 'd' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + {} + }; + int c; + + assert(argc >= 0); + assert(argv); + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + while ((c = getopt_long(argc, argv, "+dhV", options, NULL)) >= 0) + switch (c) { + + case 'd': + log_set_max_level(LOG_DEBUG); + break; + + case 'h': + return help(); + + case 'V': + return print_version(); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; /* work to do */ +} + +static int version_main(int argc, char *argv[], void *userdata) { + return print_version(); +} + +static int help_main(int argc, char *argv[], void *userdata) { + return help(); +} + +static int udevadm_main(int argc, char *argv[]) { + static const Verb verbs[] = { + { "info", VERB_ANY, VERB_ANY, 0, info_main }, + { "trigger", VERB_ANY, VERB_ANY, 0, trigger_main }, + { "settle", VERB_ANY, VERB_ANY, 0, settle_main }, + { "control", VERB_ANY, VERB_ANY, 0, control_main }, + { "monitor", VERB_ANY, VERB_ANY, 0, monitor_main }, + { "hwdb", VERB_ANY, VERB_ANY, 0, hwdb_main }, + { "test", VERB_ANY, VERB_ANY, 0, test_main }, + { "test-builtin", VERB_ANY, VERB_ANY, 0, builtin_main }, + { "wait", VERB_ANY, VERB_ANY, 0, wait_main }, + { "lock", VERB_ANY, VERB_ANY, 0, lock_main }, + { "verify", VERB_ANY, VERB_ANY, 0, verify_main }, + { "version", VERB_ANY, VERB_ANY, 0, version_main }, + { "help", VERB_ANY, VERB_ANY, 0, help_main }, + {} + }; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +static int run(int argc, char *argv[]) { + int r; + + if (invoked_as(argv, "udevd")) + return run_udevd(argc, argv); + + 
udev_parse_config(); + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = mac_init(); + if (r < 0) + return r; + + return udevadm_main(argc, argv); +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/udev/udevadm.h b/src/udev/udevadm.h new file mode 100644 index 0000000..7920a70 --- /dev/null +++ b/src/udev/udevadm.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +#include + +#include "macro.h" + +int info_main(int argc, char *argv[], void *userdata); +int trigger_main(int argc, char *argv[], void *userdata); +int settle_main(int argc, char *argv[], void *userdata); +int control_main(int argc, char *argv[], void *userdata); +int monitor_main(int argc, char *argv[], void *userdata); +int hwdb_main(int argc, char *argv[], void *userdata); +int test_main(int argc, char *argv[], void *userdata); +int builtin_main(int argc, char *argv[], void *userdata); +int verify_main(int argc, char *argv[], void *userdata); +int wait_main(int argc, char *argv[], void *userdata); +int lock_main(int argc, char *argv[], void *userdata); + +static inline int print_version(void) { + /* Dracut relies on the version being a single integer */ + puts(STRINGIFY(PROJECT_VERSION)); + return 0; +} diff --git a/src/udev/udevd.c b/src/udev/udevd.c new file mode 100644 index 0000000..2ed4282 --- /dev/null +++ b/src/udev/udevd.c @@ -0,0 +1,408 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright © 2004 Chris Friesen + * Copyright © 2009 Canonical Ltd. 
+ * Copyright © 2009 Scott James Remnant + */ + +#include +#include + +#include "sd-daemon.h" + +#include "env-file.h" +#include "errno-util.h" +#include "fd-util.h" +#include "mkdir.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "selinux-util.h" +#include "signal-util.h" +#include "syslog-util.h" +#include "udev-manager.h" +#include "udev-util.h" +#include "udevd.h" +#include "version.h" + +static bool arg_debug = false; +static int arg_daemonize = false; + +static int listen_fds(int *ret_ctrl, int *ret_netlink) { + int ctrl_fd = -EBADF, netlink_fd = -EBADF; + int fd, n; + + assert(ret_ctrl); + assert(ret_netlink); + + n = sd_listen_fds(true); + if (n < 0) + return n; + + for (fd = SD_LISTEN_FDS_START; fd < n + SD_LISTEN_FDS_START; fd++) { + if (sd_is_socket(fd, AF_UNIX, SOCK_SEQPACKET, -1) > 0) { + if (ctrl_fd >= 0) + return -EINVAL; + ctrl_fd = fd; + continue; + } + + if (sd_is_socket(fd, AF_NETLINK, SOCK_RAW, -1) > 0) { + if (netlink_fd >= 0) + return -EINVAL; + netlink_fd = fd; + continue; + } + + return -EINVAL; + } + + *ret_ctrl = ctrl_fd; + *ret_netlink = netlink_fd; + + return 0; +} + +static int manager_parse_udev_config(Manager *manager) { + _cleanup_free_ char *log_val = NULL, *children_max = NULL, *exec_delay = NULL, + *event_timeout = NULL, *resolve_names = NULL, *timeout_signal = NULL; + int r; + + assert(manager); + + r = parse_env_file(NULL, "/etc/udev/udev.conf", + "udev_log", &log_val, + "children_max", &children_max, + "exec_delay", &exec_delay, + "event_timeout", &event_timeout, + "resolve_names", &resolve_names, + "timeout_signal", &timeout_signal); + if (r == -ENOENT) + return 0; + if (r < 0) + return r; + + r = udev_set_max_log_level(log_val); + if (r < 0) + log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r, + "Failed to set udev log level '%s', ignoring: %m", log_val); + + if (children_max) { + r = safe_atou(children_max, 
&manager->children_max); + if (r < 0) + log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r, + "Failed to parse children_max=%s, ignoring: %m", children_max); + } + + if (exec_delay) { + r = parse_sec(exec_delay, &manager->exec_delay_usec); + if (r < 0) + log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r, + "Failed to parse exec_delay=%s, ignoring: %m", exec_delay); + } + + if (event_timeout) { + r = parse_sec(event_timeout, &manager->timeout_usec); + if (r < 0) + log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r, + "Failed to parse event_timeout=%s, ignoring: %m", event_timeout); + } + + if (resolve_names) { + ResolveNameTiming t; + + t = resolve_name_timing_from_string(resolve_names); + if (t < 0) + /* Pass the parser's own (negative) result, not a stale 'r' left over from an earlier setting. */ + log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, t, + "Failed to parse resolve_names=%s, ignoring.", resolve_names); + else + manager->resolve_name_timing = t; + } + + if (timeout_signal) { + r = signal_from_string(timeout_signal); + if (r < 0) + log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r, + "Failed to parse timeout_signal=%s, ignoring: %m", timeout_signal); + else + manager->timeout_signal = r; + } + + return 0; +} + +/* + * read the kernel command line, in case we need to get into debug mode + * udev.log_level=<level> syslog priority + * udev.children_max=<number of workers> events are fully serialized if set to 1 + * udev.exec_delay=<number of seconds> delay execution of every executed program + * udev.event_timeout=<number of seconds> seconds to wait before terminating an event + * udev.blockdev_read_only<=bool> mark all block devices read-only when they appear + */ +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + Manager *manager = ASSERT_PTR(data); + int r; + + assert(key); + + if (proc_cmdline_key_streq(key, "udev.log_level") || + proc_cmdline_key_streq(key, "udev.log_priority")) { /* kept for backward compatibility */ + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = log_level_from_string(value); + if (r >= 0) +
log_set_max_level(r); + + } else if (proc_cmdline_key_streq(key, "udev.event_timeout")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &manager->timeout_usec); + + } else if (proc_cmdline_key_streq(key, "udev.children_max")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = safe_atou(value, &manager->children_max); + + } else if (proc_cmdline_key_streq(key, "udev.exec_delay")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = parse_sec(value, &manager->exec_delay_usec); + + } else if (proc_cmdline_key_streq(key, "udev.timeout_signal")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = signal_from_string(value); + if (r > 0) + manager->timeout_signal = r; + + } else if (proc_cmdline_key_streq(key, "udev.blockdev_read_only")) { + + if (!value) + manager->blockdev_read_only = true; + else { + r = parse_boolean(value); + if (r < 0) + log_warning_errno(r, "Failed to parse udev.blockdev-read-only argument, ignoring: %s", value); + else + manager->blockdev_read_only = r; + } + + if (manager->blockdev_read_only) + log_notice("All physical block devices will be marked read-only."); + + return 0; + + } else { + if (startswith(key, "udev.")) + log_warning("Unknown udev kernel command line option \"%s\", ignoring.", key); + + return 0; + } + + if (r < 0) + log_warning_errno(r, "Failed to parse \"%s=%s\", ignoring: %m", key, value); + + return 0; +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-udevd.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "Rule-based manager for device events and files.\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n" + " -d --daemon Detach and run in the background\n" + " -D --debug Enable debug output\n" + " -c --children-max=INT Set maximum number of workers\n" + " -e --exec-delay=SECONDS Seconds to wait 
before executing RUN=\n" + " -t --event-timeout=SECONDS Seconds to wait before terminating an event\n" + " -N --resolve-names=early|late|never\n" + " When to resolve users and groups\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[], Manager *manager) { + enum { + /* Start above the character range: 0 is what getopt_long() returns for flag-style options, and values below 0x100 can clash with short option characters. */ + ARG_TIMEOUT_SIGNAL = 0x100, + }; + + static const struct option options[] = { + { "daemon", no_argument, NULL, 'd' }, + { "debug", no_argument, NULL, 'D' }, + { "children-max", required_argument, NULL, 'c' }, + { "exec-delay", required_argument, NULL, 'e' }, + { "event-timeout", required_argument, NULL, 't' }, + { "resolve-names", required_argument, NULL, 'N' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "timeout-signal", required_argument, NULL, ARG_TIMEOUT_SIGNAL }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + assert(manager); + + /* Unparsable option values are logged and ignored rather than treated as fatal. */ + while ((c = getopt_long(argc, argv, "c:de:Dt:N:hV", options, NULL)) >= 0) { + switch (c) { + + case 'd': + arg_daemonize = true; + break; + case 'c': + r = safe_atou(optarg, &manager->children_max); + if (r < 0) + log_warning_errno(r, "Failed to parse --children-max= value '%s', ignoring: %m", optarg); + break; + case 'e': + r = parse_sec(optarg, &manager->exec_delay_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse --exec-delay= value '%s', ignoring: %m", optarg); + break; + case ARG_TIMEOUT_SIGNAL: + r = signal_from_string(optarg); + if (r <= 0) + log_warning_errno(r, "Failed to parse --timeout-signal= value '%s', ignoring: %m", optarg); + else + manager->timeout_signal = r; + + break; + case 't': + r = parse_sec(optarg, &manager->timeout_usec); + if (r < 0) + log_warning_errno(r, "Failed to parse --event-timeout= value '%s', ignoring: %m", optarg); + break; + case 'D': + arg_debug = true; + break; + case 'N': { + ResolveNameTiming t; + + t = resolve_name_timing_from_string(optarg); + if (t < 0) + log_warning("Invalid
--resolve-names= value '%s', ignoring.", optarg); + else + manager->resolve_name_timing = t; + break; + } + case 'h': + return help(); + case 'V': + printf("%s\n", GIT_VERSION); + return 0; + case '?': + return -EINVAL; + default: + assert_not_reached(); + + } + } + + return 1; +} + +int run_udevd(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *manager = NULL; + int fd_ctrl = -EBADF, fd_uevent = -EBADF; + int r; + + log_set_target(LOG_TARGET_AUTO); + log_open(); + + manager = manager_new(); + if (!manager) + return log_oom(); + + /* A missing /etc/udev/udev.conf yields 0; any other read failure is reported here but does not abort startup. */ + r = manager_parse_udev_config(manager); + if (r < 0) + log_warning_errno(r, "Failed to read /etc/udev/udev.conf, ignoring: %m"); + + log_parse_environment(); + log_open(); /* Done again to update after reading configuration. */ + + r = parse_argv(argc, argv, manager); + if (r <= 0) + return r; + + r = proc_cmdline_parse(parse_proc_cmdline_item, manager, PROC_CMDLINE_STRIP_RD_PREFIX); + if (r < 0) + log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m"); + + if (arg_debug) { + log_set_target(LOG_TARGET_CONSOLE); + log_set_max_level(LOG_DEBUG); + } + + r = must_be_root(); + if (r < 0) + return r; + + /* set umask before creating any file/directory */ + umask(022); + + r = mac_init(); + if (r < 0) + return r; + + /* Make sure we can have plenty fds (for example for pidfds) */ + (void) rlimit_nofile_bump(-1); + + r = RET_NERRNO(mkdir("/run/udev", 0755)); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Failed to create /run/udev: %m"); + + r = listen_fds(&fd_ctrl, &fd_uevent); + if (r < 0) + return log_error_errno(r, "Failed to listen on fds: %m"); + + r = manager_init(manager, fd_ctrl, fd_uevent); + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + + if (arg_daemonize) { + pid_t pid; + + log_info("Starting systemd-udevd version " GIT_VERSION); + + /* connect /dev/null to stdin, stdout, stderr */ + if (log_get_max_level() < LOG_DEBUG) { + r = make_null_stdio(); + if (r < 0) + log_warning_errno(r, "Failed to redirect standard streams to /dev/null: %m"); + } + + pid =
fork(); + if (pid < 0) + return log_error_errno(errno, "Failed to fork daemon: %m"); + if (pid > 0) + /* parent */ + return 0; + + /* child */ + (void) setsid(); + } + + return manager_main(manager); +} diff --git a/src/udev/udevd.h b/src/udev/udevd.h new file mode 100644 index 0000000..583e895 --- /dev/null +++ b/src/udev/udevd.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#pragma once + +int run_udevd(int argc, char *argv[]); diff --git a/src/udev/v4l_id/v4l_id.c b/src/udev/v4l_id/v4l_id.c new file mode 100644 index 0000000..30527e9 --- /dev/null +++ b/src/udev/v4l_id/v4l_id.c @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2009 Filippo Argiolas + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details: + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "build.h" +#include "fd-util.h" +#include "main-func.h" + +static const char *arg_device = NULL; + +static int parse_argv(int argc, char *argv[]) { + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + {} + }; + int c; + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + switch (c) { + case 'h': + printf("%s [OPTIONS...] 
DEVICE\n\n" + "Video4Linux device identification.\n\n" + " -h --help Show this help text\n" + " --version Show package version\n", + program_invocation_short_name); + return 0; + case 'v': + return version(); + case '?': + return -EINVAL; + default: + assert_not_reached(); + } + + if (!argv[optind]) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "DEVICE argument missing."); + + arg_device = argv[optind]; + return 1; +} + +static int run(int argc, char *argv[]) { + _cleanup_close_ int fd = -EBADF; + struct v4l2_capability v2cap; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + fd = open(arg_device, O_RDONLY|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", arg_device); + + if (ioctl(fd, VIDIOC_QUERYCAP, &v2cap) == 0) { + int capabilities; + + printf("ID_V4L_VERSION=2\n"); + printf("ID_V4L_PRODUCT=%s\n", v2cap.card); + printf("ID_V4L_CAPABILITIES=:"); + + if (v2cap.capabilities & V4L2_CAP_DEVICE_CAPS) + capabilities = v2cap.device_caps; + else + capabilities = v2cap.capabilities; + + if ((capabilities & V4L2_CAP_VIDEO_CAPTURE) > 0 || + (capabilities & V4L2_CAP_VIDEO_CAPTURE_MPLANE) > 0) + printf("capture:"); + if ((capabilities & V4L2_CAP_VIDEO_OUTPUT) > 0 || + (capabilities & V4L2_CAP_VIDEO_OUTPUT_MPLANE) > 0) + printf("video_output:"); + if ((capabilities & V4L2_CAP_VIDEO_OVERLAY) > 0) + printf("video_overlay:"); + if ((capabilities & V4L2_CAP_AUDIO) > 0) + printf("audio:"); + if ((capabilities & V4L2_CAP_TUNER) > 0) + printf("tuner:"); + if ((capabilities & V4L2_CAP_RADIO) > 0) + printf("radio:"); + printf("\n"); + } + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/ukify/test/example.signing.crt.base64 b/src/ukify/test/example.signing.crt.base64 new file mode 100644 index 0000000..694d13b --- /dev/null +++ b/src/ukify/test/example.signing.crt.base64 @@ -0,0 +1,23 @@ +LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURsVENDQW4yZ0F3SUJBZ0lVTzlqUWhhblhj 
+b3ViOERzdXlMMWdZbksrR1lvd0RRWUpLb1pJaHZjTkFRRUwKQlFBd1dURUxNQWtHQTFVRUJoTUNX +Rmd4RlRBVEJnTlZCQWNNREVSbFptRjFiSFFnUTJsMGVURWNNQm9HQTFVRQpDZ3dUUkdWbVlYVnNk +Q0JEYjIxd1lXNTVJRXgwWkRFVk1CTUdBMVVFQXd3TWEyVjVJSE5wWjI1cGJtbG5NQ0FYCkRUSXlN +VEF5T1RFM01qY3dNVm9ZRHpNd01qSXdNekF4TVRjeU56QXhXakJaTVFzd0NRWURWUVFHRXdKWVdE +RVYKTUJNR0ExVUVCd3dNUkdWbVlYVnNkQ0JEYVhSNU1Sd3dHZ1lEVlFRS0RCTkVaV1poZFd4MElF +TnZiWEJoYm5rZwpUSFJrTVJVd0V3WURWUVFEREF4clpYa2djMmxuYm1sdWFXY3dnZ0VpTUEwR0NT +cUdTSWIzRFFFQkFRVUFBNElCCkR3QXdnZ0VLQW9JQkFRREtVeHR4Y0d1aGYvdUp1SXRjWEhvdW0v +RE9RL1RJM3BzUWlaR0ZWRkJzbHBicU5wZDUKa2JDaUFMNmgrY1FYaGRjUmlOT1dBR0wyMFZ1T2Rv +VTZrYzlkdklGQnFzKzc2NHhvWGY1UGd2SlhvQUxSUGxDZAp4YVdPQzFsOFFIRHpxZ09SdnREMWNI +WFoveTkvZ1YxVU1GK1FlYm12aUhRN0U4eGw1T2h5MG1TQVZYRDhBTitsCjdpMUR6N0NuTzhrMVph +alhqYXlpNWV1WEV0TnFSZXNuVktRRElTQ0t2STFueUxySWxHRU1GZmFuUmRLQWthZ3MKalJnTmVh +T3N3aklHNjV6UzFVdjJTZXcxVFpIaFhtUmd5TzRVT0JySHZlSml2T2hObzU3UlRKd0M2K2lGY0FG +aApSSnorVmM2QUlSSkI1ZWtJUmdCN3VDNEI5ZmwydXdZKytMODNBZ01CQUFHalV6QlJNQjBHQTFV +ZERnUVdCQlFqCllIMnpzVFlPQU51MkcweXk1QkxlOHBvbWZUQWZCZ05WSFNNRUdEQVdnQlFqWUgy +enNUWU9BTnUyRzB5eTVCTGUKOHBvbWZUQVBCZ05WSFJNQkFmOEVCVEFEQVFIL01BMEdDU3FHU0li +M0RRRUJDd1VBQTRJQkFRQ2dxcmFXaE51dQptUmZPUjVxcURVcC83RkpIL1N6Zk1vaDBHL2lWRkhv +OUpSS0tqMUZ2Q0VZc1NmeThYTmdaUDI5eS81Z0h4cmcrCjhwZWx6bWJLczdhUTRPK01TcmIzTm11 +V1IzT0M0alBoNENrM09ZbDlhQy9iYlJqSWFvMDJ6K29XQWNZZS9xYTEKK2ZsemZWVEUwMHJ5V1RM +K0FJdDFEZEVqaG01WXNtYlgvbWtacUV1TjBtSVhhRXhSVE9walczUWRNeVRQaURTdApvanQvQWMv +R2RUWDd0QkhPTk44Z3djaC91V293aVNORERMUm1wM2VScnlOZ3RPKzBISUd5Qm16ZWNsM0VlVEo2 +CnJzOGRWUFhqR1Z4dlZDb2tqQllrOWdxbkNGZEJCMGx4VXVNZldWdVkyRUgwSjI3aGh4SXNFc3ls +VTNIR1EyK2MKN1JicVY4VTNSRzA4Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K diff --git a/src/ukify/test/example.signing.key.base64 b/src/ukify/test/example.signing.key.base64 new file mode 100644 index 0000000..88baedb --- /dev/null +++ b/src/ukify/test/example.signing.key.base64 @@ -0,0 +1,30 @@ 
+LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZB +QVNDQktnd2dnU2tBZ0VBQW9JQkFRREtVeHR4Y0d1aGYvdUoKdUl0Y1hIb3VtL0RPUS9USTNwc1Fp +WkdGVkZCc2xwYnFOcGQ1a2JDaUFMNmgrY1FYaGRjUmlOT1dBR0wyMFZ1Twpkb1U2a2M5ZHZJRkJx +cys3NjR4b1hmNVBndkpYb0FMUlBsQ2R4YVdPQzFsOFFIRHpxZ09SdnREMWNIWFoveTkvCmdWMVVN +RitRZWJtdmlIUTdFOHhsNU9oeTBtU0FWWEQ4QU4rbDdpMUR6N0NuTzhrMVphalhqYXlpNWV1WEV0 +TnEKUmVzblZLUURJU0NLdkkxbnlMcklsR0VNRmZhblJkS0FrYWdzalJnTmVhT3N3aklHNjV6UzFV +djJTZXcxVFpIaApYbVJneU80VU9Cckh2ZUppdk9oTm81N1JUSndDNitpRmNBRmhSSnorVmM2QUlS +SkI1ZWtJUmdCN3VDNEI5ZmwyCnV3WSsrTDgzQWdNQkFBRUNnZ0VBQkhZQ28rU3JxdHJzaStQU3hz +MlBNQm5tSEZZcFBvaVIrTEpmMEFYRTVEQUoKMGM0MFZzemNqU1hoRGljNHFLQWQxdGdpZWlzMkEy +VW9WS0xPV3pVOTBqNUd4MURoMWEzaTRhWTQ1ajNuNUFDMgpMekRsakNVQWVucExsYzdCN3MxdjJM +WFJXNmdJSVM5Y043NTlkVTYvdktyQ2FsbGkzcTZZRWlNUzhQMHNsQnZFCkZtdEc1elFsOVJjV0gr +cHBqdzlIMTJSZ3BldUVJVEQ2cE0vd2xwcXZHRlUwcmZjM0NjMHhzaWdNTnh1Z1FJNGgKbnpjWDVs +OEs0SHdvbmhOTG9TYkh6OU5BK3p3QkpuUlZVSWFaaEVjSThtaEVPWHRaRkpYc01aRnhjS2l3SHFS +dApqUUVHOHJRa3lPLytXMmR5Z2czV1lNYXE1OWpUWVdIOUsrQmFyeEMzRVFLQmdRRFBNSFMycjgz +ZUpRTTlreXpkCndDdnlmWGhQVlVtbVJnOGwyWng0aC9tci9mNUdDeW5SdzRzT2JuZGVQd29tZ1Iz +cFBleFFGWlFFSExoZ1RGY3UKVk5uYXcrTzBFL1VnL01pRGswZDNXU0hVZXZPZnM1cEM2b3hYNjNT +VENwNkVLT2VEZlpVMW9OeHRsZ0YyRVhjcgpmVlZpSzFKRGk3N2dtaENLcFNGcjBLK3gyUUtCZ1FE +NS9VUC9hNU52clExdUhnKzR0SzJZSFhSK1lUOFREZG00Ck8xZmh5TU5lOHRYSkd5UUJjTktVTWg2 +M2VyR1MwWlRWdGdkNHlGS3RuOGtLU2U4TmlacUl1aitVUVIyZ3pEQVAKQ2VXcXl2Y2pRNmovU1Yw +WjVvKzlTNytiOStpWWx5RTg2bGZobHh5Z21aNnptYisxUUNteUtNVUdBNis5VmUvMgo1MHhDMXBB +L2p3S0JnUUNEOHA4UnpVcDFZK3I1WnVaVzN0RGVJSXZqTWpTeVFNSGE0QWhuTm1tSjREcjBUcDIy +CmFpci82TmY2WEhsUlpqOHZVSEZUMnpvbG1FalBneTZ1WWZsUCtocmtqeVU0ZWVRVTcxRy9Mek45 +UjBRcCs4Nk4KT1NSaHhhQzdHRE0xaFh0VFlVSUtJa1RmUVgzeXZGTEJqcE0yN3RINEZHSmVWWitk +UEdiWmE5REltUUtCZ1FENQpHTU5qeExiQnhhY25QYThldG5KdnE1SUR5RFRJY0xtc1dQMkZ6cjNX +WTVSZzhybGE4aWZ5WVVxNE92cXNPRWZjCjk2ZlVVNUFHejd2TWs4VXZNUmtaK3JRVnJ4aXR2Q2g3 
+STdxRkIvOWdWVEFWU080TE8vR29oczBqeGRBd0ZBK2IKbWtyOVQ4ekh2cXNqZlNWSW51bXRTL0Nl +d0plaHl2cjBoSjg1em9Fbnd3S0JnR1h6UXVDSjJDb3NHVVhEdnlHKwpyRzVBd3pUZGd0bHg4YTBK +NTg1OWtZbVd0cW5WYUJmbFdrRmNUcHNEaGZ2ZWVDUkswc29VRlNPWkcranpsbWJrCkpRL09aVkZJ +dG9MSVZCeE9qeWVXNlhUSkJXUzFRSkVHckkwY0tTbXNKcENtUXVPdUxMVnZYczV0U21CVmc5RXQK +MjZzUkZwcjVWWmsrZlNRa3RhbkM4NGV1Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K diff --git a/src/ukify/test/example.tpm2-pcr-private.pem.base64 b/src/ukify/test/example.tpm2-pcr-private.pem.base64 new file mode 100644 index 0000000..586b28e --- /dev/null +++ b/src/ukify/test/example.tpm2-pcr-private.pem.base64 @@ -0,0 +1,30 @@ +LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZB +QVNDQktnd2dnU2tBZ0VBQW9JQkFRQzVuOHFhbzVNZ1BJUVcKc0F5Y2R3dnB1bjdNNHlRSW9FL3I3 +ekFGTG1hZlBXclo3d2JaaUIyTkY1MVdHOEo4bnlDQkI3M0RLcmZaeWs5cwphQXdXVW5RR2t0dGFv +RXpXRzZSRTM3dXdQOUpVM09YdklTNTBhcy9KSHVHNlJPYmE2V0NOOFp2TTdkZGpvTDFKCkZlYnBS +SXI1Vi82VStMTFhrUnRNYVczUnZ6T0xYeU1NT2QzOEcxZ0d0VlRHcm90ejVldFgrTUNVU2lOVGFE +OVUKN1dEZXVsZXVpMlRnK1I3TGRoSXg3ZTQ5cEhRM3d6a1NxeFQ4SGpoU3ZURWpITWVSNjIwaUhF +ZW9uYzdsMXVnagpzY1pwTktHdk13bXUvU2ptWFp6UkpOdjVOU0txcEVnQll2RnFkS3dUdlc4MWl6 +SUFvN3paMkx6NDJYb25zSWJ2CjNrbGZqTG1mQWdNQkFBRUNnZ0VBQXozYm8yeTAzb3kvLzhkdVNQ +TTVSWWtvdXJwQ3dGWFFYMzNyV0VQUnJmazgKR3ZjMkp1bGVIcjhwVTc0alhOcklqZ2hORTVIMDZQ +eEQrOUFyV2Q1eHdVV2lTQWhobnlHWGNrNTM4Q0dGTWs4egpRc1JSRTk1anA0Ny9BU28vMzlYUWhs +b1FUdmxlV0JLUUM2MHl2YU1oVEM1eHR6ZEtwRUlYK0hNazVGTlMrcDJVCmxtL3AzVE1YWDl1bmc5 +Mk9pTzUzV1VreFpQN2cwTVJHbGJrNzhqc1dkdjFYY0tLRjhuVmU5WC9NR1lTYlVLNy8KM2NYazFR +WTRUdVZaQlBFSE12RFRpWWwxbmdDd1ZuL2MyY3JQU3hJRFdFWlhEdm90SFUwQkNQZURVckxGa0F5 +cQpDaloza3MzdEh4am42STkraEVNcUJDMzY1MHFjdDNkZ0RVV2loc2MzdVFLQmdRRG1mVTNKc29K +QWFOdmxCbXgyClhzRDRqbXlXV1F2Z244cVNVNG03a2JKdmprcUJ6VnB0T0ZsYmk2ejZHOXR6ZHNX +a0dJSjh3T0ZRb1hlM0dKOFIKSlVpeEFXTWZOM1JURGo5VjVXbzZJdE5EbzM1N3dNbVVYOW1qeThF +YXp0RE1BckdSNGJva0Q5RjY3clhqSGdSMQpaZVcvSDlUWHFUV1l4VHl6UDB3ZDBQeUZ4d0tCZ1FE 
+T0swWHVQS0o0WG00WmFCemN0OTdETXdDcFBSVmVvUWU3CmkzQjRJQ3orWFZ4cVM2amFTY2xNeEVm +Nk5tM2tLNERDR1dwVkpXcm9qNjlMck1KWnQzTlI2VUJ5NzNqUVBSamsKRXk5N3YrR04yVGwwNjFw +ZUxUM0dRS2RhT2VxWldpdElOcFc1dUxHL1poMGhoRUY5c1lSVTRtUFYwUWpla2kvdgp1bnVmcWx0 +TmFRS0JnQTl6TE1pdFg0L0R0NkcxZVlYQnVqdXZDRlpYcDdVcDRPRklHajVwZU1XRGl6a0NNK0tJ +CldXMEtndERORnp1NUpXeG5mQyt5bWlmV2V2alovS2Vna1N2VVJQbXR0TzF3VWd5RzhVVHVXcXo1 +QTV4MkFzMGcKVTYxb0ZneWUrbDRDZkRha0k5OFE5R0RDS1kwTTBRMnhnK0g0MTBLUmhCYzJlV2dt +Z1FxcW5KSzNBb0dCQU1rZgpnOWZXQlBVQndjdzlPYkxFR0tjNkVSSUlTZG1IbytCOE5kcXFJTnAv +djFEZXdEazZ0QXFVakZiMlZCdTdxSjh4ClpmN3NRcS9ldzdaQ01WS09XUXgyVEc0VFdUdGo3dTFJ +SGhGTjdiNlFRN0hnaXNiR3diV3VpdFBGSGl3OXYyMXgKK253MFJnb2VscHFFeDlMVG92R2Y3SjdB +ampONlR4TkJTNnBGNlUzSkFvR0JBT0tnbHlRWDJpVG5oMXd4RG1TVQo4RXhoQVN3S09iNS8yRmx4 +aUhtUHVjNTZpS0tHY0lkV1cxMUdtbzdJakNzSTNvRm9iRkFjKzBXZkMvQTdMNWlmCjNBYVNWcmh0 +cThRRklRaUtyYUQ0YlRtRk9Famg5QVVtUHMrWnd1OE9lSXJBSWtwZDV3YmlhTEJLd0pRbVdtSFAK +dUNBRTA3cXlSWXJ0c3QvcnVSSG5IdFA1Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K diff --git a/src/ukify/test/example.tpm2-pcr-private2.pem.base64 b/src/ukify/test/example.tpm2-pcr-private2.pem.base64 new file mode 100644 index 0000000..d21a3d6 --- /dev/null +++ b/src/ukify/test/example.tpm2-pcr-private2.pem.base64 @@ -0,0 +1,30 @@ +LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2QUlCQURBTkJna3Foa2lHOXcwQkFRRUZB +QVNDQktZd2dnU2lBZ0VBQW9JQkFRQzJ2Nk1oZHg3a3VjUHIKbmtFNFIrY3FnV2Y5T3B1c2h2M2o3 +SG50K08wdi84d2l2T1BFNTlLMHYvRWJOOG94TDZEWUNXU0JCRU4vREJ5MgpMUTYwbldSdHBZN2Ju +bEcrcEtVeTRvSDRNZXZCR2JqZUhrak9LU3dNYVVWNGs4UmVSSjg4cVZ1U1MxSnVORW1NCmd5SERF +NGFPNG5ndG5UUFZZdzUydVBIcG1rN0E4VFdXN2lLZE5JWWZWOCtuR1pENXIzRWllekRsUUNORG54 +UkcKdm5uSFZ6VFhZR3RwY2xaeWlJclpVekpBNFFPZnRueXB5UDVrQS94NVM1MU9QeGFxWlA3eGtP +S0NicUUvZmZvMApFTi9rTno0N0ZoUGUxbVBHUkZZWldHZXg0aWFPdHlLdHhnU1FYYkdlNEVoeVR4 +SjJlT3U4QUVoVklTdjh6UU9nClNtbWx2UGQvQWdNQkFBRUNnZ0VBUUFpRERRRlR3bG96QTVhMmpK +VnBNdlFYNzF0L1c2TUxTRGMrZS90cWhKU1IKUHlUSGZHR3NhMmdMLy9qNjhHUWJiRWRTUDRDeWM4 
+eFhMU0E1bEdESDVVR0svbm9KYzQ3MlVZK2JjYzl3SjMrdgpUcWoyNHNIN2JMZmdQMEVybjhwVXIy +azZMRmNYSVlWUnRobm1sUmQ4NFFrS2loVVlxZTdsRFFWOXdsZ3V1eHpRCnBmVEtDTWk1bXJlYjIx +OExHS0QrMUxjVmVYZjExamc3Z2JnMllLZ1dOQ2R3VmIyUzJ5V0hTTjBlT3hPd21kWXIKSUVCekpG +eEc2MFJxSlJ1RzVIam9iemc2cy9ycUo1THFta3JhUWh6bHFPQVZLblpGOHppbG9vcDhXUXBQY3RN +cwp0cHBjczhtYkFkWHpoSTVjN0U1VVpDM2NJcEd6SE4raDZLK0F3R3ZEeVFLQmdRRDRBOTdQM29v +dGhoMHZHQmFWCnZWOXhHTm1YbW5TeUg0b29HcmJvaG1JWkkwVlhZdms5dWViSUJjbDZRMUx4WnN3 +UFpRMVh5TUhpTjY1Z0E1emgKai9HZGcrdDlvcU5CZml0TUFrUTl1aWxvaXlKVWhYblk5REMvRitl +ZksycEpNbHdkci9qWEttRHpkQUZBVDgyWQpWRmJ3MlpLVi9GNEJNMUtCdDBZN0RPTmlad0tCZ1FD +OG9kZk0waytqL25VSzQ4TEV2MGtGbVNMdWdnTVlkM3hVCmZibmx0cUhFTVpJZU45OFVHK2hBWEdw +dU1Ya0JPM2Mwcm5ZRDVXZkNBRzFxT1V2ZTZzdHd6N0VuK3hWdlkvcWEKU3ZTaDRzMzhnZlBIeXhR +aGJvNWRwQTZUT3pwT0MyVi9rVXBVRUdJSmVVVllhQ05uWXNpUjRWUGVWL1lvR1htSwpQV29KbnAw +REtRS0JnQlk3cXBheDJXczVVWlp1TDJBZkNOWkhwd0hySzdqb0VPZUZkWTRrdGRpUkM5OUlsUlZP +CmUvekVZQXBnektldFVtK3kzRjVaTmVCRW81SWg0TWRyc3ZvdTRFWno5UFNqRGRpVGYzQ1ZKcThq +Z2VGWDBkTjgKR0g2WTh2K1cwY0ZjRFZ2djhYdkFaYzZOUUt0Mk8vVUM0b1JXek1nN1JtWVBKcjlR +SWJDYmVDclRBb0dBTjdZbApJbDFMSUVoYkVTaExzZ2c4N09aWnBzL0hVa2FYOWV4Y0p6aFZkcmlk +UzBkOUgxZE90Uk9XYTQwNUMrQWdTUEx0CjhDQ2xFR3RINVlPZW9Pdi93Z1hWY05WN2N6YTRJVEhh +SnFYeDZJNEpEZzB3bU44cU5RWHJPQmphRTRyU0kyY3AKNk1JZDhtWmEwTTJSQjB2cHFRdy8xUDl0 +dUZJdHoySnNHd001cEdFQ2dZQVVnQVV3WENBcEtZVkZFRmxHNlBhYwpvdTBhdzdGNm1aMi9NNUcv +ek9tMHFDYnNXdGFNU09TdUEvNmlVOXB0NDBaWUFONFUvd2ZxbncyVkVoRnA3dzFNCnpZWmJCRDBx +ZVlkcDRmc1NuWXFMZmJBVmxQLzB6dmEzdkwwMlJFa25WalBVSnAvaGpKVWhBK21WN252VDZ5VjQK +cTg4SWVvOEx3Q1c1c2Jtd2lyU3Btdz09Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K diff --git a/src/ukify/test/example.tpm2-pcr-public.pem.base64 b/src/ukify/test/example.tpm2-pcr-public.pem.base64 new file mode 100644 index 0000000..728a0f5 --- /dev/null +++ b/src/ukify/test/example.tpm2-pcr-public.pem.base64 @@ -0,0 +1,8 @@ +LS0tLS1CRUdJTiBQVUJMSUMgS0VZLS0tLS0KTUlJQklqQU5CZ2txaGtpRzl3MEJBUUVGQUFPQ0FR 
+OEFNSUlCQ2dLQ0FRRUF1Wi9LbXFPVElEeUVGckFNbkhjTAo2YnArek9Na0NLQlA2Kzh3QlM1bW56 +MXEyZThHMllnZGpSZWRWaHZDZko4Z2dRZTl3eXEzMmNwUGJHZ01GbEowCkJwTGJXcUJNMWh1a1JO +KzdzRC9TVk56bDd5RXVkR3JQeVI3aHVrVG0ydWxnamZHYnpPM1hZNkM5U1JYbTZVU0sKK1ZmK2xQ +aXkxNUViVEdsdDBiOHppMThqRERuZC9CdFlCclZVeHE2TGMrWHJWL2pBbEVvalUyZy9WTzFnM3Jw +WApyb3RrNFBrZXkzWVNNZTN1UGFSME44TTVFcXNVL0I0NFVyMHhJeHpIa2V0dEloeEhxSjNPNWRi +b0k3SEdhVFNoCnJ6TUpydjBvNWwyYzBTVGIrVFVpcXFSSUFXTHhhblNzRTcxdk5Zc3lBS084MmRp +OCtObDZKN0NHNzk1Slg0eTUKbndJREFRQUIKLS0tLS1FTkQgUFVCTElDIEtFWS0tLS0tCg== diff --git a/src/ukify/test/example.tpm2-pcr-public2.pem.base64 b/src/ukify/test/example.tpm2-pcr-public2.pem.base64 new file mode 100644 index 0000000..44bb3ee --- /dev/null +++ b/src/ukify/test/example.tpm2-pcr-public2.pem.base64 @@ -0,0 +1,8 @@ +LS0tLS1CRUdJTiBQVUJMSUMgS0VZLS0tLS0KTUlJQklqQU5CZ2txaGtpRzl3MEJBUUVGQUFPQ0FR +OEFNSUlCQ2dLQ0FRRUF0citqSVhjZTVMbkQ2NTVCT0VmbgpLb0ZuL1RxYnJJYjk0K3g1N2ZqdEwv +L01JcnpqeE9mU3RML3hHemZLTVMrZzJBbGtnUVJEZnd3Y3RpME90SjFrCmJhV08yNTVSdnFTbE11 +S0IrREhyd1JtNDNoNUl6aWtzREdsRmVKUEVYa1NmUEtsYmtrdFNialJKaklNaHd4T0cKanVKNExa +MHoxV01PZHJqeDZacE93UEUxbHU0aW5UU0dIMWZQcHhtUSthOXhJbnN3NVVBalE1OFVScjU1eDFj +MAoxMkJyYVhKV2NvaUsyVk15UU9FRG43WjhxY2orWkFQOGVVdWRUajhXcW1UKzhaRGlnbTZoUDMz +Nk5CRGY1RGMrCk94WVQzdFpqeGtSV0dWaG5zZUltanJjaXJjWUVrRjJ4bnVCSWNrOFNkbmpydkFC +SVZTRXIvTTBEb0VwcHBiejMKZndJREFRQUIKLS0tLS1FTkQgUFVCTElDIEtFWS0tLS0tCg== diff --git a/src/ukify/test/meson.build b/src/ukify/test/meson.build new file mode 100644 index 0000000..2df196b --- /dev/null +++ b/src/ukify/test/meson.build @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if want_ukify and want_tests != 'false' + have_pytest_flakes = pymod.find_installation( + 'python3', + required : false, + modules : ['pytest_flakes'], + ).found() + + args = ['-v'] + if have_pytest_flakes + args += ['--flakes'] + endif + + test('test-ukify', + files('test_ukify.py'), + args: args, + env : test_env, + timeout : 120, + suite : 'ukify') +endif 
diff --git a/src/ukify/test/test_ukify.py b/src/ukify/test/test_ukify.py new file mode 100755 index 0000000..5866447 --- /dev/null +++ b/src/ukify/test/test_ukify.py @@ -0,0 +1,876 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-2.1-or-later + +# pylint: disable=unused-import,import-outside-toplevel,useless-else-on-loop +# pylint: disable=consider-using-with,wrong-import-position,unspecified-encoding +# pylint: disable=protected-access,redefined-outer-name + +import base64 +import json +import os +import pathlib +import re +import shutil +import subprocess +import sys +import tempfile +import textwrap + +try: + import pytest +except ImportError as e: + print(str(e), file=sys.stderr) + sys.exit(77) + +try: + # pyflakes: noqa + import pefile # noqa +except ImportError as e: + print(str(e), file=sys.stderr) + sys.exit(77) + +# We import ukify.py, which is a template file. But only __version__ is +# substituted, which we don't care about here. Having the .py suffix makes it +# easier to import the file. 
+sys.path.append(os.path.dirname(__file__) + '/..') +import ukify + +build_root = os.getenv('PROJECT_BUILD_ROOT') +try: + slow_tests = bool(int(os.getenv('SYSTEMD_SLOW_TESTS', '1'))) +except ValueError: + slow_tests = True + +arg_tools = ['--tools', build_root] if build_root else [] + +def systemd_measure(): + opts = ukify.create_parser().parse_args(arg_tools) + return ukify.find_tool('systemd-measure', opts=opts) + +def test_guess_efi_arch(): + arch = ukify.guess_efi_arch() + assert arch in ukify.EFI_ARCHES + +def test_shell_join(): + assert ukify.shell_join(['a', 'b', ' ']) == "a b ' '" + +def test_round_up(): + assert ukify.round_up(0) == 0 + assert ukify.round_up(4095) == 4096 + assert ukify.round_up(4096) == 4096 + assert ukify.round_up(4097) == 8192 + +def test_namespace_creation(): + ns = ukify.create_parser().parse_args(()) + assert ns.linux is None + assert ns.initrd is None + +def test_config_example(): + ex = ukify.config_example() + assert '[UKI]' in ex + assert 'Splash = BMP' in ex + +def test_apply_config(tmp_path): + config = tmp_path / 'config1.conf' + config.write_text(textwrap.dedent( + f''' + [UKI] + Linux = LINUX + Initrd = initrd1 initrd2 + initrd3 + Cmdline = 1 2 3 4 5 + 6 7 8 + OSRelease = @some/path1 + DeviceTree = some/path2 + Splash = some/path3 + Uname = 1.2.3 + EFIArch=arm + Stub = some/path4 + PCRBanks = sha512,sha1 + SigningEngine = engine1 + SecureBootPrivateKey = some/path5 + SecureBootCertificate = some/path6 + SignKernel = no + + [PCRSignature:NAME] + PCRPrivateKey = some/path7 + PCRPublicKey = some/path8 + Phases = {':'.join(ukify.KNOWN_PHASES)} + ''')) + + ns = ukify.create_parser().parse_args(['build']) + ns.linux = None + ns.initrd = [] + ukify.apply_config(ns, config) + + assert ns.linux == pathlib.Path('LINUX') + assert ns.initrd == [pathlib.Path('initrd1'), + pathlib.Path('initrd2'), + pathlib.Path('initrd3')] + assert ns.cmdline == '1 2 3 4 5\n6 7 8' + assert ns.os_release == '@some/path1' + assert ns.devicetree == 
pathlib.Path('some/path2') + assert ns.splash == pathlib.Path('some/path3') + assert ns.efi_arch == 'arm' + assert ns.stub == pathlib.Path('some/path4') + assert ns.pcr_banks == ['sha512', 'sha1'] + assert ns.signing_engine == 'engine1' + assert ns.sb_key == 'some/path5' + assert ns.sb_cert == 'some/path6' + assert ns.sign_kernel is False + + assert ns._groups == ['NAME'] + assert ns.pcr_private_keys == [pathlib.Path('some/path7')] + assert ns.pcr_public_keys == [pathlib.Path('some/path8')] + assert ns.phase_path_groups == [['enter-initrd:leave-initrd:sysinit:ready:shutdown:final']] + + ukify.finalize_options(ns) + + assert ns.linux == pathlib.Path('LINUX') + assert ns.initrd == [pathlib.Path('initrd1'), + pathlib.Path('initrd2'), + pathlib.Path('initrd3')] + assert ns.cmdline == '1 2 3 4 5 6 7 8' + assert ns.os_release == pathlib.Path('some/path1') + assert ns.devicetree == pathlib.Path('some/path2') + assert ns.splash == pathlib.Path('some/path3') + assert ns.efi_arch == 'arm' + assert ns.stub == pathlib.Path('some/path4') + assert ns.pcr_banks == ['sha512', 'sha1'] + assert ns.signing_engine == 'engine1' + assert ns.sb_key == 'some/path5' + assert ns.sb_cert == 'some/path6' + assert ns.sign_kernel is False + + assert ns._groups == ['NAME'] + assert ns.pcr_private_keys == [pathlib.Path('some/path7')] + assert ns.pcr_public_keys == [pathlib.Path('some/path8')] + assert ns.phase_path_groups == [['enter-initrd:leave-initrd:sysinit:ready:shutdown:final']] + +def test_parse_args_minimal(): + with pytest.raises(ValueError): + ukify.parse_args([]) + + opts = ukify.parse_args('arg1 arg2'.split()) + assert opts.linux == pathlib.Path('arg1') + assert opts.initrd == [pathlib.Path('arg2')] + assert opts.os_release in (pathlib.Path('/etc/os-release'), + pathlib.Path('/usr/lib/os-release')) + +def test_parse_args_many_deprecated(): + opts = ukify.parse_args( + ['/ARG1', '///ARG2', '/ARG3 WITH SPACE', + '--cmdline=a b c', + '--os-release=K1=V1\nK2=V2', + 
'--devicetree=DDDDTTTT', + '--splash=splash', + '--pcrpkey=PATH', + '--uname=1.2.3', + '--stub=STUBPATH', + '--pcr-private-key=PKEY1', + '--pcr-public-key=PKEY2', + '--pcr-banks=SHA1,SHA256', + '--signing-engine=ENGINE', + '--secureboot-private-key=SBKEY', + '--secureboot-certificate=SBCERT', + '--sign-kernel', + '--no-sign-kernel', + '--tools=TOOLZ///', + '--output=OUTPUT', + '--measure', + '--no-measure', + ]) + assert opts.linux == pathlib.Path('/ARG1') + assert opts.initrd == [pathlib.Path('/ARG2'), pathlib.Path('/ARG3 WITH SPACE')] + assert opts.cmdline == 'a b c' + assert opts.os_release == 'K1=V1\nK2=V2' + assert opts.devicetree == pathlib.Path('DDDDTTTT') + assert opts.splash == pathlib.Path('splash') + assert opts.pcrpkey == pathlib.Path('PATH') + assert opts.uname == '1.2.3' + assert opts.stub == pathlib.Path('STUBPATH') + assert opts.pcr_private_keys == [pathlib.Path('PKEY1')] + assert opts.pcr_public_keys == [pathlib.Path('PKEY2')] + assert opts.pcr_banks == ['SHA1', 'SHA256'] + assert opts.signing_engine == 'ENGINE' + assert opts.sb_key == 'SBKEY' + assert opts.sb_cert == 'SBCERT' + assert opts.sign_kernel is False + assert opts.tools == [pathlib.Path('TOOLZ/')] + assert opts.output == pathlib.Path('OUTPUT') + assert opts.measure is False + +def test_parse_args_many(): + opts = ukify.parse_args( + ['build', + '--linux=/ARG1', + '--initrd=///ARG2', + '--initrd=/ARG3 WITH SPACE', + '--cmdline=a b c', + '--os-release=K1=V1\nK2=V2', + '--devicetree=DDDDTTTT', + '--splash=splash', + '--pcrpkey=PATH', + '--uname=1.2.3', + '--stub=STUBPATH', + '--pcr-private-key=PKEY1', + '--pcr-public-key=PKEY2', + '--pcr-banks=SHA1,SHA256', + '--signing-engine=ENGINE', + '--secureboot-private-key=SBKEY', + '--secureboot-certificate=SBCERT', + '--sign-kernel', + '--no-sign-kernel', + '--tools=TOOLZ///', + '--output=OUTPUT', + '--measure', + '--no-measure', + ]) + assert opts.linux == pathlib.Path('/ARG1') + assert opts.initrd == [pathlib.Path('/ARG2'), pathlib.Path('/ARG3 
WITH SPACE')] + assert opts.cmdline == 'a b c' + assert opts.os_release == 'K1=V1\nK2=V2' + assert opts.devicetree == pathlib.Path('DDDDTTTT') + assert opts.splash == pathlib.Path('splash') + assert opts.pcrpkey == pathlib.Path('PATH') + assert opts.uname == '1.2.3' + assert opts.stub == pathlib.Path('STUBPATH') + assert opts.pcr_private_keys == [pathlib.Path('PKEY1')] + assert opts.pcr_public_keys == [pathlib.Path('PKEY2')] + assert opts.pcr_banks == ['SHA1', 'SHA256'] + assert opts.signing_engine == 'ENGINE' + assert opts.sb_key == 'SBKEY' + assert opts.sb_cert == 'SBCERT' + assert opts.sign_kernel is False + assert opts.tools == [pathlib.Path('TOOLZ/')] + assert opts.output == pathlib.Path('OUTPUT') + assert opts.measure is False + +def test_parse_sections(): + opts = ukify.parse_args( + ['build', + '--linux=/ARG1', + '--initrd=/ARG2', + '--section=test:TESTTESTTEST', + '--section=test2:@FILE', + ]) + + assert opts.linux == pathlib.Path('/ARG1') + assert opts.initrd == [pathlib.Path('/ARG2')] + assert len(opts.sections) == 2 + + assert opts.sections[0].name == 'test' + assert isinstance(opts.sections[0].content, pathlib.Path) + assert opts.sections[0].tmpfile + assert opts.sections[0].measure is False + + assert opts.sections[1].name == 'test2' + assert opts.sections[1].content == pathlib.Path('FILE') + assert opts.sections[1].tmpfile is None + assert opts.sections[1].measure is False + +def test_config_priority(tmp_path): + config = tmp_path / 'config1.conf' + # config: use pesign and give certdir + certname + config.write_text(textwrap.dedent( + f''' + [UKI] + Linux = LINUX + Initrd = initrd1 initrd2 + initrd3 + Cmdline = 1 2 3 4 5 + 6 7 8 + OSRelease = @some/path1 + DeviceTree = some/path2 + Splash = some/path3 + Uname = 1.2.3 + EFIArch = arm + Stub = some/path4 + PCRBanks = sha512,sha1 + SigningEngine = engine1 + SecureBootSigningTool = pesign + SecureBootCertificateDir = some/path5 + SecureBootCertificateName = some/name1 + SignKernel = no + + 
[PCRSignature:NAME] + PCRPrivateKey = some/path7 + PCRPublicKey = some/path8 + Phases = {':'.join(ukify.KNOWN_PHASES)} + ''')) + + # args: use sbsign and give key + cert, should override pesign + opts = ukify.parse_args( + ['build', + '--linux=/ARG1', + '--initrd=///ARG2', + '--initrd=/ARG3 WITH SPACE', + '--cmdline= a b c ', + '--os-release=K1=V1\nK2=V2', + '--devicetree=DDDDTTTT', + '--splash=splash', + '--pcrpkey=PATH', + '--uname=1.2.3', + '--stub=STUBPATH', + '--pcr-private-key=PKEY1', + '--pcr-public-key=PKEY2', + '--pcr-banks=SHA1,SHA256', + '--signing-engine=ENGINE', + '--signtool=sbsign', + '--secureboot-private-key=SBKEY', + '--secureboot-certificate=SBCERT', + '--sign-kernel', + '--no-sign-kernel', + '--tools=TOOLZ///', + '--output=OUTPUT', + '--measure', + ]) + + ukify.apply_config(opts, config) + ukify.finalize_options(opts) + + assert opts.linux == pathlib.Path('/ARG1') + assert opts.initrd == [pathlib.Path('initrd1'), + pathlib.Path('initrd2'), + pathlib.Path('initrd3'), + pathlib.Path('/ARG2'), + pathlib.Path('/ARG3 WITH SPACE')] + assert opts.cmdline == 'a b c' + assert opts.os_release == 'K1=V1\nK2=V2' + assert opts.devicetree == pathlib.Path('DDDDTTTT') + assert opts.splash == pathlib.Path('splash') + assert opts.pcrpkey == pathlib.Path('PATH') + assert opts.uname == '1.2.3' + assert opts.stub == pathlib.Path('STUBPATH') + assert opts.pcr_private_keys == [pathlib.Path('PKEY1'), + pathlib.Path('some/path7')] + assert opts.pcr_public_keys == [pathlib.Path('PKEY2'), + pathlib.Path('some/path8')] + assert opts.pcr_banks == ['SHA1', 'SHA256'] + assert opts.signing_engine == 'ENGINE' + assert opts.signtool == 'sbsign' # from args + assert opts.sb_key == 'SBKEY' # from args + assert opts.sb_cert == 'SBCERT' # from args + assert opts.sb_certdir == 'some/path5' # from config + assert opts.sb_cert_name == 'some/name1' # from config + assert opts.sign_kernel is False + assert opts.tools == [pathlib.Path('TOOLZ/')] + assert opts.output == 
pathlib.Path('OUTPUT') + assert opts.measure is True + +def test_help(capsys): + with pytest.raises(SystemExit): + ukify.parse_args(['--help']) + out = capsys.readouterr() + assert '--section' in out.out + assert not out.err + +def test_help_display(capsys): + with pytest.raises(SystemExit): + ukify.parse_args(['inspect', '--help']) + out = capsys.readouterr() + assert '--section' in out.out + assert not out.err + +def test_help_error_deprecated(capsys): + with pytest.raises(SystemExit): + ukify.parse_args(['a', 'b', '--no-such-option']) + out = capsys.readouterr() + assert not out.out + assert '--no-such-option' in out.err + assert len(out.err.splitlines()) == 1 + +def test_help_error(capsys): + with pytest.raises(SystemExit): + ukify.parse_args(['build', '--no-such-option']) + out = capsys.readouterr() + assert not out.out + assert '--no-such-option' in out.err + assert len(out.err.splitlines()) == 1 + +@pytest.fixture(scope='session') +def kernel_initrd(): + opts = ukify.create_parser().parse_args(arg_tools) + bootctl = ukify.find_tool('bootctl', opts=opts) + if bootctl is None: + return None + + try: + text = subprocess.check_output([bootctl, 'list', '--json=short'], + text=True) + except subprocess.CalledProcessError: + return None + + items = json.loads(text) + + for item in items: + try: + linux = f"{item['root']}{item['linux']}" + initrd = f"{item['root']}{item['initrd'][0].split(' ')[0]}" + except (KeyError, IndexError): + continue + return ['--linux', linux, '--initrd', initrd] + else: + return None + +def test_check_splash(): + try: + # pyflakes: noqa + import PIL # noqa + except ImportError: + pytest.skip('PIL not available') + + with pytest.raises(OSError): + ukify.check_splash(os.devnull) + +def test_basic_operation(kernel_initrd, tmp_path): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + + output = f'{tmp_path}/basic.efi' + opts = ukify.parse_args([ + 'build', + *kernel_initrd, + f'--output={output}', + ]) + try: + 
ukify.check_inputs(opts) + except OSError as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + # let's check that objdump likes the resulting file + subprocess.check_output(['objdump', '-h', output]) + + shutil.rmtree(tmp_path) + +def test_sections(kernel_initrd, tmp_path): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + + output = f'{tmp_path}/basic.efi' + opts = ukify.parse_args([ + 'build', + *kernel_initrd, + f'--output={output}', + '--uname=1.2.3', + '--cmdline=ARG1 ARG2 ARG3', + '--os-release=K1=V1\nK2=V2\n', + '--section=.test:CONTENTZ', + ]) + + try: + ukify.check_inputs(opts) + except OSError as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + # let's check that objdump likes the resulting file + dump = subprocess.check_output(['objdump', '-h', output], text=True) + + for sect in 'text osrel cmdline linux initrd uname test'.split(): + assert re.search(fr'^\s*\d+\s+\.{sect}\s+[0-9a-f]+', dump, re.MULTILINE) + + shutil.rmtree(tmp_path) + +def test_addon(tmp_path): + output = f'{tmp_path}/addon.efi' + args = [ + 'build', + f'--output={output}', + '--cmdline=ARG1 ARG2 ARG3', + """--sbat=sbat,1,foo +foo,1 +bar,2 +""", + '--section=.test:CONTENTZ', + """--sbat=sbat,1,foo +baz,3 +""" + ] + if stub := os.getenv('EFI_ADDON'): + args += [f'--stub={stub}'] + expected_exceptions = () + else: + expected_exceptions = (FileNotFoundError,) + + opts = ukify.parse_args(args) + try: + ukify.check_inputs(opts) + except expected_exceptions as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + # let's check that objdump likes the resulting file + dump = subprocess.check_output(['objdump', '-h', output], text=True) + + for sect in 'text cmdline test sbat'.split(): + assert re.search(fr'^\s*\d+\s+\.{sect}\s+[0-9a-f]+', dump, re.MULTILINE) + + pe = pefile.PE(output, fast_load=True) + found = False + + for section in pe.sections: + if section.Name.rstrip(b"\x00").decode() == ".sbat": + assert found is False + split = 
section.get_data().rstrip(b"\x00").decode().splitlines() + assert split == ["sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md", "foo,1", "bar,2", "baz,3"] + found = True + + assert found is True + + +def unbase64(filename): + tmp = tempfile.NamedTemporaryFile() + base64.decode(filename.open('rb'), tmp) + tmp.flush() + return tmp + + +def test_uname_scraping(kernel_initrd): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + + assert kernel_initrd[0] == '--linux' + uname = ukify.Uname.scrape(kernel_initrd[1]) + assert re.match(r'\d+\.\d+\.\d+', uname) + +@pytest.mark.skipif(not slow_tests, reason='slow') +def test_efi_signing_sbsign(kernel_initrd, tmp_path): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + if not shutil.which('sbsign'): + pytest.skip('sbsign not found') + + ourdir = pathlib.Path(__file__).parent + cert = unbase64(ourdir / 'example.signing.crt.base64') + key = unbase64(ourdir / 'example.signing.key.base64') + + output = f'{tmp_path}/signed.efi' + opts = ukify.parse_args([ + 'build', + *kernel_initrd, + f'--output={output}', + '--uname=1.2.3', + '--cmdline=ARG1 ARG2 ARG3', + f'--secureboot-certificate={cert.name}', + f'--secureboot-private-key={key.name}', + ]) + + try: + ukify.check_inputs(opts) + except OSError as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + if shutil.which('sbverify'): + # let's check that sbverify likes the resulting file + dump = subprocess.check_output([ + 'sbverify', + '--cert', cert.name, + output, + ], text=True) + + assert 'Signature verification OK' in dump + + shutil.rmtree(tmp_path) + +@pytest.mark.skipif(not slow_tests, reason='slow') +def test_efi_signing_pesign(kernel_initrd, tmp_path): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + if not shutil.which('pesign'): + pytest.skip('pesign not found') + + nss_db = f'{tmp_path}/nss_db' + name = 'Test_Secureboot' + author = 'systemd' + + subprocess.check_call(['mkdir', '-p', 
nss_db]) + cmd = f'certutil -N --empty-password -d {nss_db}'.split(' ') + subprocess.check_call(cmd) + cmd = f'efikeygen -d {nss_db} -S -k -c CN={author} -n {name}'.split(' ') + subprocess.check_call(cmd) + + output = f'{tmp_path}/signed.efi' + opts = ukify.parse_args([ + 'build', + *kernel_initrd, + f'--output={output}', + '--uname=1.2.3', + '--signtool=pesign', + '--cmdline=ARG1 ARG2 ARG3', + f'--secureboot-certificate-name={name}', + f'--secureboot-certificate-dir={nss_db}', + ]) + + try: + ukify.check_inputs(opts) + except OSError as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + # let's check that pesign likes the resulting file + dump = subprocess.check_output([ + 'pesign', '-S', + '-i', output, + ], text=True) + + assert f"The signer's common name is {author}" in dump + + shutil.rmtree(tmp_path) + +def test_inspect(kernel_initrd, tmp_path, capsys): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + if not shutil.which('sbsign'): + pytest.skip('sbsign not found') + + ourdir = pathlib.Path(__file__).parent + cert = unbase64(ourdir / 'example.signing.crt.base64') + key = unbase64(ourdir / 'example.signing.key.base64') + + output = f'{tmp_path}/signed2.efi' + uname_arg='1.2.3' + osrel_arg='Linux' + cmdline_arg='ARG1 ARG2 ARG3' + + args = [ + 'build', + *kernel_initrd, + f'--cmdline={cmdline_arg}', + f'--os-release={osrel_arg}', + f'--uname={uname_arg}', + f'--output={output}', + ] + if slow_tests: + args += [ + f'--secureboot-certificate={cert.name}', + f'--secureboot-private-key={key.name}', + ] + + opts = ukify.parse_args(args) + + ukify.check_inputs(opts) + ukify.make_uki(opts) + + opts = ukify.parse_args(['inspect', output]) + ukify.inspect_sections(opts) + + text = capsys.readouterr().out + + expected_osrel = f'.osrel:\n size: {len(osrel_arg)}' + assert expected_osrel in text + expected_cmdline = f'.cmdline:\n size: {len(cmdline_arg)}' + assert expected_cmdline in text + expected_uname = f'.uname:\n size: {len(uname_arg)}' +
assert expected_uname in text + + expected_initrd = '.initrd:\n size:' + assert expected_initrd in text + expected_linux = '.linux:\n size:' + assert expected_linux in text + + shutil.rmtree(tmp_path) + +@pytest.mark.skipif(not slow_tests, reason='slow') +def test_pcr_signing(kernel_initrd, tmp_path): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + if systemd_measure() is None: + pytest.skip('systemd-measure not found') + + ourdir = pathlib.Path(__file__).parent + pub = unbase64(ourdir / 'example.tpm2-pcr-public.pem.base64') + priv = unbase64(ourdir / 'example.tpm2-pcr-private.pem.base64') + + output = f'{tmp_path}/signed.efi' + args = [ + 'build', + *kernel_initrd, + f'--output={output}', + '--uname=1.2.3', + '--cmdline=ARG1 ARG2 ARG3', + '--os-release=ID=foobar\n', + '--pcr-banks=sha1', # use sha1 because it doesn't really matter + f'--pcr-private-key={priv.name}', + ] + arg_tools + + # If the public key is not explicitly specified, it is derived automatically. Let's make sure everything + # works as expected both when the public key is specified explicitly and when it is derived from the + # private key. + for extra in ([f'--pcrpkey={pub.name}', f'--pcr-public-key={pub.name}'], []): + opts = ukify.parse_args(args + extra) + try: + ukify.check_inputs(opts) + except OSError as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + # let's check that objdump likes the resulting file + dump = subprocess.check_output(['objdump', '-h', output], text=True) + + for sect in 'text osrel cmdline linux initrd uname pcrsig'.split(): + assert re.search(fr'^\s*\d+\s+\.{sect}\s+[0-9a-f]+', dump, re.MULTILINE) + + # objcopy fails when called without an output argument (EPERM). + # It also fails when called with /dev/null (file truncated). + # It also fails when called with /dev/zero (because it reads the + # output file, infinitely in this case.) + # So let's just call it with a dummy output argument.
+ subprocess.check_call([ + 'objcopy', + *(f'--dump-section=.{n}={tmp_path}/out.{n}' for n in ( + 'pcrpkey', 'pcrsig', 'osrel', 'uname', 'cmdline')), + output, + tmp_path / 'dummy', + ], + text=True) + + assert open(tmp_path / 'out.pcrpkey').read() == open(pub.name).read() + assert open(tmp_path / 'out.osrel').read() == 'ID=foobar\n' + assert open(tmp_path / 'out.uname').read() == '1.2.3' + assert open(tmp_path / 'out.cmdline').read() == 'ARG1 ARG2 ARG3' + sig = open(tmp_path / 'out.pcrsig').read() + sig = json.loads(sig) + assert list(sig.keys()) == ['sha1'] + assert len(sig['sha1']) == 4 # four items for four phases + + shutil.rmtree(tmp_path) + +@pytest.mark.skipif(not slow_tests, reason='slow') +def test_pcr_signing2(kernel_initrd, tmp_path): + if kernel_initrd is None: + pytest.skip('linux+initrd not found') + if systemd_measure() is None: + pytest.skip('systemd-measure not found') + + ourdir = pathlib.Path(__file__).parent + pub = unbase64(ourdir / 'example.tpm2-pcr-public.pem.base64') + priv = unbase64(ourdir / 'example.tpm2-pcr-private.pem.base64') + pub2 = unbase64(ourdir / 'example.tpm2-pcr-public2.pem.base64') + priv2 = unbase64(ourdir / 'example.tpm2-pcr-private2.pem.base64') + + # simulate a microcode file + with open(f'{tmp_path}/microcode', 'wb') as microcode: + microcode.write(b'1234567890') + + output = f'{tmp_path}/signed.efi' + assert kernel_initrd[0] == '--linux' + opts = ukify.parse_args([ + 'build', + *kernel_initrd[:2], + f'--initrd={microcode.name}', + *kernel_initrd[2:], + f'--output={output}', + '--uname=1.2.3', + '--cmdline=ARG1 ARG2 ARG3', + '--os-release=ID=foobar\n', + '--pcr-banks=sha1', + f'--pcrpkey={pub2.name}', + f'--pcr-public-key={pub.name}', + f'--pcr-private-key={priv.name}', + '--phases=enter-initrd enter-initrd:leave-initrd', + f'--pcr-public-key={pub2.name}', + f'--pcr-private-key={priv2.name}', + '--phases=sysinit ready shutdown final', # yes, those phase paths are not reachable + ] + arg_tools) + + try: + 
ukify.check_inputs(opts) + except OSError as e: + pytest.skip(str(e)) + + ukify.make_uki(opts) + + # let's check that objdump likes the resulting file + dump = subprocess.check_output(['objdump', '-h', output], text=True) + + for sect in 'text osrel cmdline linux initrd uname pcrsig'.split(): + assert re.search(fr'^\s*\d+\s+\.{sect}\s+[0-9a-f]+', dump, re.MULTILINE) + + subprocess.check_call([ + 'objcopy', + *(f'--dump-section=.{n}={tmp_path}/out.{n}' for n in ( + 'pcrpkey', 'pcrsig', 'osrel', 'uname', 'cmdline', 'initrd')), + output, + tmp_path / 'dummy', + ], + text=True) + + assert open(tmp_path / 'out.pcrpkey').read() == open(pub2.name).read() + assert open(tmp_path / 'out.osrel').read() == 'ID=foobar\n' + assert open(tmp_path / 'out.uname').read() == '1.2.3' + assert open(tmp_path / 'out.cmdline').read() == 'ARG1 ARG2 ARG3' + assert open(tmp_path / 'out.initrd', 'rb').read(10) == b'1234567890' + + sig = open(tmp_path / 'out.pcrsig').read() + sig = json.loads(sig) + assert list(sig.keys()) == ['sha1'] + assert len(sig['sha1']) == 6 # six items for six phase paths + + shutil.rmtree(tmp_path) + +def test_key_cert_generation(tmp_path): + opts = ukify.parse_args([ + 'genkey', + f"--pcr-public-key={tmp_path / 'pcr1.pub.pem'}", + f"--pcr-private-key={tmp_path / 'pcr1.priv.pem'}", + '--phases=enter-initrd enter-initrd:leave-initrd', + f"--pcr-public-key={tmp_path / 'pcr2.pub.pem'}", + f"--pcr-private-key={tmp_path / 'pcr2.priv.pem'}", + '--phases=sysinit ready', + f"--secureboot-private-key={tmp_path / 'sb.priv.pem'}", + f"--secureboot-certificate={tmp_path / 'sb.cert.pem'}", + ]) + assert opts.verb == 'genkey' + ukify.check_cert_and_keys_nonexistent(opts) + + pytest.importorskip('cryptography') + + ukify.generate_keys(opts) + + if not shutil.which('openssl'): + return + + for key in (tmp_path / 'pcr1.priv.pem', + tmp_path / 'pcr2.priv.pem', + tmp_path / 'sb.priv.pem'): + out = subprocess.check_output([ + 'openssl', 'rsa', + '-in', key, + '-text', + '-noout', + ],
text = True) + assert 'Private-Key' in out + assert '2048 bit' in out + + for pub in (tmp_path / 'pcr1.pub.pem', + tmp_path / 'pcr2.pub.pem'): + out = subprocess.check_output([ + 'openssl', 'rsa', + '-pubin', + '-in', pub, + '-text', + '-noout', + ], text = True) + assert 'Public-Key' in out + assert '2048 bit' in out + + out = subprocess.check_output([ + 'openssl', 'x509', + '-in', tmp_path / 'sb.cert.pem', + '-text', + '-noout', + ], text = True) + assert 'Certificate' in out + assert re.search(r'Issuer: CN\s?=\s?SecureBoot signing key on host', out) + +if __name__ == '__main__': + sys.exit(pytest.main(sys.argv)) diff --git a/src/ukify/ukify.py b/src/ukify/ukify.py new file mode 100755 index 0000000..6abf1b6 --- /dev/null +++ b/src/ukify/ukify.py @@ -0,0 +1,1668 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# systemd is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with systemd; If not, see . 
+ +# pylint: disable=import-outside-toplevel,consider-using-with,unused-argument +# pylint: disable=unnecessary-lambda-assignment + +import argparse +import configparser +import contextlib +import collections +import dataclasses +import datetime +import fnmatch +import itertools +import json +import os +import pathlib +import pprint +import pydoc +import re +import shlex +import shutil +import socket +import subprocess +import sys +import tempfile +import textwrap +from hashlib import sha256 +from typing import (Any, + Callable, + IO, + Optional, + Sequence, + Union) + +import pefile # type: ignore + +__version__ = '{{PROJECT_VERSION}} ({{GIT_VERSION}})' + +EFI_ARCH_MAP = { + # host_arch glob : [efi_arch, 32_bit_efi_arch if mixed mode is supported] + 'x86_64' : ['x64', 'ia32'], + 'i[3456]86' : ['ia32'], + 'aarch64' : ['aa64'], + 'armv[45678]*l': ['arm'], + 'loongarch32' : ['loongarch32'], + 'loongarch64' : ['loongarch64'], + 'riscv32' : ['riscv32'], + 'riscv64' : ['riscv64'], +} +EFI_ARCHES: list[str] = sum(EFI_ARCH_MAP.values(), []) + +# Default configuration directories and file name. +# When the user does not specify one, the directories are searched in this order and the first file found is used. 
def shell_join(cmd):
    """Render a command list as one shell-quoted string; items are passed through str() first."""
    # TODO: drop in favour of shlex.join once shlex.join supports pathlib.Path.
    quoted = map(shlex.quote, map(str, cmd))
    return ' '.join(quoted)
class Uname:
    """Helpers that try to extract the kernel release string from a kernel image.

    Each scrape_* method returns the version as str and raises ValueError when
    its particular technique does not apply to the given file.
    """
    # This class is here purely as a namespace for the functions

    # Every pattern must expose the match as the named group 'version'; the
    # scrape_* methods below read it with m.group('version').
    VERSION_PATTERN = r'(?P<version>[a-z0-9._-]+) \([^ )]+\) (?:#.*)'

    NOTES_PATTERN = r'^\s+Linux\s+0x[0-9a-f]+\s+OPEN\n\s+description data: (?P<version>[0-9a-f ]+)\s*$'

    # Linux version 6.0.8-300.fc37.ppc64le (mockbuild@buildvm-ppc64le-03.iad2.fedoraproject.org)
    # (gcc (GCC) 12.2.1 20220819 (Red Hat 12.2.1-2), GNU ld version 2.38-24.fc37)
    # #1 SMP Fri Nov 11 14:39:11 UTC 2022
    TEXT_PATTERN = rb'Linux version (?P<version>\d\.\S+) \('

    @classmethod
    def scrape_x86(cls, filename, opts=None):
        """Read the version string referenced by the x86 real-mode kernel header.

        Raises ValueError if the 'HdrS' magic is missing or the string does not
        look like a version-host-release line.
        """
        # Based on https://gitlab.archlinux.org/archlinux/mkinitcpio/mkinitcpio/-/blob/master/functions#L136
        # and https://www.kernel.org/doc/html/latest/x86/boot.html#the-real-mode-kernel-header
        with open(filename, 'rb') as f:
            f.seek(0x202)
            magic = f.read(4)
            if magic != b'HdrS':
                raise ValueError('Real-Mode Kernel Header magic not found')
            f.seek(0x20E)
            offset = f.read(1)[0] + f.read(1)[0]*256  # Pointer to kernel version string
            f.seek(0x200 + offset)
            text = f.read(128)
        # The string is NUL-terminated; drop whatever follows the terminator.
        text = text.split(b'\0', maxsplit=1)[0]
        text = text.decode()

        if not (m := re.match(cls.VERSION_PATTERN, text)):
            raise ValueError(f'Cannot parse version-host-release uname string: {text!r}')
        return m.group('version')

    @classmethod
    def scrape_elf(cls, filename, opts=None):
        """Extract the version from the 'Linux' note printed by 'readelf --notes'.

        Raises ValueError if readelf fails or no matching note is found.
        """
        readelf = find_tool('readelf', opts=opts)

        cmd = [
            readelf,
            '--notes',
            filename,
        ]

        print('+', shell_join(cmd))
        try:
            notes = subprocess.check_output(cmd, stderr=subprocess.PIPE, text=True)
        except subprocess.CalledProcessError as e:
            raise ValueError(e.stderr.strip()) from e

        if not (m := re.search(cls.NOTES_PATTERN, notes, re.MULTILINE)):
            raise ValueError('Cannot find Linux version note')

        # The note payload is printed as space-separated hex bytes.
        text = ''.join(chr(int(c, 16)) for c in m.group('version').split())
        return text.rstrip('\0')

    @classmethod
    def scrape_generic(cls, filename, opts=None):
        """Search the (possibly decompressed) image for a 'Linux version …' banner.

        Raises ValueError if the banner is not found.
        """
        # import libarchive
        # libarchive-c fails with
        # ArchiveError: Unrecognized archive format (errno=84, retcode=-30, archive_p=94705420454656)

        # Based on https://gitlab.archlinux.org/archlinux/mkinitcpio/mkinitcpio/-/blob/master/functions#L209

        text = maybe_decompress(filename)
        if not (m := re.search(cls.TEXT_PATTERN, text)):
            raise ValueError(f'Cannot find {cls.TEXT_PATTERN!r} in {filename}')

        return m.group('version').decode()

    @classmethod
    def scrape(cls, filename, opts=None):
        """Try each technique in turn; return the first version found, or None."""
        for func in (cls.scrape_x86, cls.scrape_elf, cls.scrape_generic):
            try:
                version = func(filename, opts=opts)
                print(f'Found uname version: {version}')
                return version
            except ValueError as e:
                # Not fatal: report why this technique failed and try the next one.
                print(str(e))
        return None
@dataclasses.dataclass
class Section:
    """One PE section to embed in (or extract from) a UKI."""
    name: str
    # Path of the file holding the section data (or the output path when inspecting).
    content: Optional[pathlib.Path]
    # Keeps the backing temporary file referenced when the data was given inline.
    tmpfile: Optional[IO] = None
    measure: bool = False
    # 'text' or 'binary' when created through parse_output(), otherwise None.
    output_mode: Optional[str] = None

    @classmethod
    def create(cls, name, contents, **kwargs):
        """Build a Section; str/bytes contents are first written to a temporary file."""
        if isinstance(contents, (str, bytes)):
            mode = 'wt' if isinstance(contents, str) else 'wb'
            tmp = tempfile.NamedTemporaryFile(mode=mode, prefix=f'tmp{name}')
            tmp.write(contents)
            # Flush so that readers opening the file by name see the data.
            tmp.flush()
            contents = pathlib.Path(tmp.name)
        else:
            tmp = None

        return cls(name, contents, tmpfile=tmp, **kwargs)

    @classmethod
    def parse_input(cls, s):
        """Parse 'NAME:TEXT' or 'NAME:@PATH'; raises ValueError on malformed specs."""
        try:
            name, contents, *rest = s.split(':')
        except ValueError as e:
            raise ValueError(f'Cannot parse section spec (name or contents missing): {s!r}') from e
        if rest:
            raise ValueError(f'Cannot parse section spec (extraneous parameters): {s!r}')

        if contents.startswith('@'):
            contents = pathlib.Path(contents[1:])

        sec = cls.create(name, contents)
        sec.check_name()
        return sec

    @classmethod
    def parse_output(cls, s):
        """Parse 'NAME:text|binary[@PATH]'; raises ValueError on malformed specs."""
        # fullmatch, so that trailing garbage ('NAME:textual', 'NAME:text@') is
        # rejected instead of being silently ignored.
        if not (m := re.fullmatch(r'([a-zA-Z0-9_.]+):(text|binary)(?:@(.+))?', s)):
            raise ValueError(f'Cannot parse section spec: {s!r}')

        name, ttype, out = m.groups()
        out = pathlib.Path(out) if out else None

        return cls.create(name, out, output_mode=ttype)

    def size(self):
        """Return the size in bytes of the file behind this section."""
        return self.content.stat().st_size

    def check_name(self):
        """Raise ValueError if the section name is not usable by the stub."""
        # PE section names with more than 8 characters are legal, but our stub does
        # not support them.
        if not self.name.isascii() or not self.name.isprintable():
            raise ValueError(f'Bad section name: {self.name!r}')
        if len(self.name) > 8:
            raise ValueError(f'Section name too long: {self.name!r}')
def combine_signatures(pcrsigs):
    """Merge several {bank: [signature, ...]} mappings into one JSON document.

    Signatures are kept in first-seen order and duplicates within a bank are
    dropped. Banks that never contribute a signature do not appear in the result.
    """
    merged = {}
    for per_key in pcrsigs:
        for bank, entries in per_key.items():
            for entry in entries:
                known = merged.setdefault(bank, [])
                if entry not in known:
                    known.append(entry)
    return json.dumps(merged)
def join_initrds(initrds):
    """Combine initrd files into a single payload.

    Returns None for an empty or missing list, the sole entry unchanged when
    there is exactly one, and otherwise the concatenated bytes of all files
    with each file zero-padded to a multiple of four bytes.
    """
    if not initrds:
        return None
    if len(initrds) == 1:
        return initrds[0]

    chunks = []
    for path in initrds:
        blob = path.read_bytes()
        chunks.append(blob)
        # pad to 32 bit alignment
        chunks.append(bytes(round_up(len(blob), 4) - len(blob)))

    return b''.join(chunks)
+ if symbol_table + symbol_table_size == len(pe.__data__): + pe.__data__ = pe.__data__[:symbol_table] + pe.FILE_HEADER.PointerToSymbolTable = 0 + pe.FILE_HEADER.NumberOfSymbols = 0 + pe.FILE_HEADER.IMAGE_FILE_LOCAL_SYMS_STRIPPED = True + + # Old stubs might have been stripped, leading to unaligned raw data values, so let's fix them up here. + # pylint thinks that Structure doesn't have various members that it has… + # pylint: disable=no-member + + for i, section in enumerate(pe.sections): + oldp = section.PointerToRawData + oldsz = section.SizeOfRawData + section.PointerToRawData = round_up(oldp, pe.OPTIONAL_HEADER.FileAlignment) + section.SizeOfRawData = round_up(oldsz, pe.OPTIONAL_HEADER.FileAlignment) + padp = section.PointerToRawData - oldp + padsz = section.SizeOfRawData - oldsz + + for later_section in pe.sections[i+1:]: + later_section.PointerToRawData += padp + padsz + + pe.__data__ = pe.__data__[:oldp] + bytes(padp) + pe.__data__[oldp:oldp+oldsz] + bytes(padsz) + pe.__data__[oldp+oldsz:] + + # We might not have any space to add new sections. Let's try our best to make some space by padding the + # SizeOfHeaders to a multiple of the file alignment. This is safe because the first section's data starts + # at a multiple of the file alignment, so all space before that is unused. + pe.OPTIONAL_HEADER.SizeOfHeaders = round_up(pe.OPTIONAL_HEADER.SizeOfHeaders, pe.OPTIONAL_HEADER.FileAlignment) + pe = pefile.PE(data=pe.write(), fast_load=True) + + warnings = pe.get_warnings() + if warnings: + raise PEError(f'pefile warnings treated as errors: {warnings}') + + security = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']] + if security.VirtualAddress != 0: + # We could strip the signatures, but why would anyone sign the stub? 
+ raise PEError('Stub image is signed, refusing.') + + for section in uki.sections: + new_section = pefile.SectionStructure(pe.__IMAGE_SECTION_HEADER_format__, pe=pe) + new_section.__unpack__(b'\0' * new_section.sizeof()) + + offset = pe.sections[-1].get_file_offset() + new_section.sizeof() + if offset + new_section.sizeof() > pe.OPTIONAL_HEADER.SizeOfHeaders: + raise PEError(f'Not enough header space to add section {section.name}.') + + assert section.content + data = section.content.read_bytes() + + new_section.set_file_offset(offset) + new_section.Name = section.name.encode() + new_section.Misc_VirtualSize = len(data) + # Non-stripped stubs might still have an unaligned symbol table at the end, making their size + # unaligned, so we make sure to explicitly pad the pointer to new sections to an aligned offset. + new_section.PointerToRawData = round_up(len(pe.__data__), pe.OPTIONAL_HEADER.FileAlignment) + new_section.SizeOfRawData = round_up(len(data), pe.OPTIONAL_HEADER.FileAlignment) + new_section.VirtualAddress = round_up( + pe.sections[-1].VirtualAddress + pe.sections[-1].Misc_VirtualSize, + pe.OPTIONAL_HEADER.SectionAlignment, + ) + + new_section.IMAGE_SCN_MEM_READ = True + if section.name == '.linux': + # Old kernels that use EFI handover protocol will be executed inline. + new_section.IMAGE_SCN_CNT_CODE = True + else: + new_section.IMAGE_SCN_CNT_INITIALIZED_DATA = True + + # Special case, mostly for .sbat: the stub will already have a .sbat section, but we want to append + # the one from the kernel to it. It should be small enough to fit in the existing section, so just + # swap the data. 
def merge_sbat(input_pe: list[pathlib.Path], input_text: list[str]) -> str:
    """Build the contents of a merged .sbat section.

    input_pe:   PE files whose .sbat sections (if any) are collected.
    input_text: SBAT snippets, either literal text or '@PATH' naming a file to
                read. None is treated like an empty list.

    Every input must start with an 'sbat,' header line. That line is dropped and
    a single header is emitted first in the result. Inputs without a valid
    header are reported and skipped. The returned string ends with a newline and
    a NUL byte.
    """
    sbat = []

    for f in input_pe:
        try:
            pe = pefile.PE(f, fast_load=True)
        except pefile.PEFormatError:
            print(f"{f} is not a valid PE file, not extracting SBAT section.")
            continue

        for section in pe.sections:
            if section.Name.rstrip(b"\x00").decode() == ".sbat":
                split = section.get_data().rstrip(b"\x00").decode().splitlines()
                # An empty section has no first line to inspect.
                if not split or not split[0].startswith('sbat,'):
                    print(f"{f} does not contain a valid SBAT section, skipping.")
                    continue
                # Filter out the sbat line, we'll add it back later, there needs to be only one and it
                # needs to be first.
                sbat += split[1:]

    for t in input_text or ():
        # Name used in diagnostics: the path for '@PATH', else the literal text.
        source = t
        if t.startswith('@'):
            source = t[1:]
            t = pathlib.Path(source).read_text()
        split = t.splitlines()
        if not split or not split[0].startswith('sbat,'):
            print(f"{source} does not contain a valid SBAT section, skipping.")
            continue
        sbat += split[1:]

    return 'sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md\n' + '\n'.join(sbat) + "\n\x00"
verify_tool = SBVERIFY + else: + sign_tool = find_pesign(opts=opts) + sign = pesign_sign + verify_tool = PESIGCHECK + + if sign_tool is None: + raise ValueError(f'{opts.signtool}, required for signing, is not installed') + + if sign_kernel is None and opts.linux is not None: + # figure out if we should sign the kernel + sign_kernel = verify(verify_tool, opts) + + if sign_kernel: + linux_signed = tempfile.NamedTemporaryFile(prefix='linux-signed') + linux = pathlib.Path(linux_signed.name) + sign(sign_tool, opts.linux, linux, opts=opts) + + if opts.uname is None and opts.linux is not None: + print('Kernel version not specified, starting autodetection 😖.') + opts.uname = Uname.scrape(opts.linux, opts=opts) + + uki = UKI(opts.stub) + initrd = join_initrds(opts.initrd) + + pcrpkey = opts.pcrpkey + if pcrpkey is None: + if opts.pcr_public_keys and len(opts.pcr_public_keys) == 1: + pcrpkey = opts.pcr_public_keys[0] + elif opts.pcr_private_keys and len(opts.pcr_private_keys) == 1: + import cryptography.hazmat.primitives.serialization as serialization + privkey = serialization.load_pem_private_key(opts.pcr_private_keys[0].read_bytes(), password=None) + pcrpkey = privkey.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ) + + sections = [ + # name, content, measure? + ('.osrel', opts.os_release, True ), + ('.cmdline', opts.cmdline, True ), + ('.dtb', opts.devicetree, True ), + ('.uname', opts.uname, True ), + ('.splash', opts.splash, True ), + ('.pcrpkey', pcrpkey, True ), + ('.initrd', initrd, True ), + + # linux shall be last to leave breathing room for decompression. + # We'll add it later. 
+ ] + + for name, content, measure in sections: + if content: + uki.add_section(Section.create(name, content, measure=measure)) + + # systemd-measure doesn't know about those extra sections + for section in opts.sections: + uki.add_section(section) + + if linux is not None: + # Merge the .sbat sections from stub, kernel and parameter, so that revocation can be done on either. + uki.add_section(Section.create('.sbat', merge_sbat([opts.stub, linux], opts.sbat), measure=True)) + else: + # Addons don't use the stub so we add SBAT manually + if not opts.sbat: + opts.sbat = ["""sbat,1,SBAT Version,sbat,1,https://github.com/rhboot/shim/blob/main/SBAT.md +uki,1,UKI,uki,1,https://www.freedesktop.org/software/systemd/man/systemd-stub.html +"""] + uki.add_section(Section.create('.sbat', merge_sbat([], opts.sbat), measure=False)) + + # PCR measurement and signing + + # We pass in the contents for .linux separately because we need them to do the measurement but can't add + # the section yet because we want .linux to be the last section. Make sure any other sections are added + # before this function is called. 
@contextlib.contextmanager
def temporary_umask(mask: int):
    """Context manager that adds the bits of 'mask' to the process umask.

    The umask that was in effect on entry is restored on exit, including when
    the body raises.
    """
    # os.umask() can only be read by setting it, hence the two calls.
    saved = os.umask(0)
    os.umask(saved | mask)
    with contextlib.ExitStack() as cleanup:
        cleanup.callback(os.umask, saved)
        yield
def generate_priv_pub_key_pair(keylength : int = 2048) -> tuple[bytes, bytes]:
    """Generate an RSA key pair and return (private_key_pem, public_key_pem).

    The private key is serialized unencrypted in the TraditionalOpenSSL format,
    the public key as SubjectPublicKeyInfo; both are PEM-encoded bytes.
    """
    # Imported here so that the 'cryptography' module is only needed when keys
    # are actually generated.
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    key = rsa.generate_private_key(
        public_exponent=65537,
        key_size=keylength,
    )
    priv_key_pem = key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    pub_key_pem = key.public_key().public_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PublicFormat.SubjectPublicKeyInfo,
    )

    return priv_key_pem, pub_key_pem
def inspect_section(opts, section):
    """Describe one PE section for 'inspect'.

    Returns (name, None) when the section is not selected for display, else
    (name, info) where info holds 'size', 'sha256' and, for text sections,
    'text'. When the section's config names an output path the raw data is
    written there, and when opts.json is 'off' a human-readable summary is
    printed.
    """
    name = section.Name.rstrip(b"\x00").decode()

    # An explicit per-section config always wins; otherwise fall back to
    # --all or the default list (the latter only if no sections were requested).
    config = opts.sections_by_name.get(name, None)
    if config:
        ttype = config.output_mode
    elif opts.all or (name in DEFAULT_SECTIONS_TO_SHOW and not opts.sections):
        ttype = DEFAULT_SECTIONS_TO_SHOW.get(name, 'binary')
    else:
        return name, None

    is_text = ttype == 'text'

    size = section.Misc_VirtualSize
    # TODO: Use ignore_padding once we can depend on a newer version of pefile
    data = section.get_data(length=size)
    digest = sha256(data).hexdigest()

    info = {'size': size, 'sha256': digest}

    if is_text:
        try:
            decoded = data.decode()
        except UnicodeDecodeError as e:
            print(f"Section {name!r} is not valid text: {e}")
            decoded = '(not valid UTF-8)'
        info['text'] = decoded

    if config and config.content:
        assert isinstance(config.content, pathlib.Path)
        config.content.write_bytes(data)

    if opts.json == 'off':
        print(f"{name}:\n size: {size} bytes\n sha256: {digest}")
        if is_text:
            body = textwrap.indent(info['text'].rstrip(), ' ' * 4)
            print(f" text:\n{body}")

    return name, info
to value only if it was None" + + assert not group + + setattr(namespace, dest, value) + + @staticmethod + def config_set_group( + namespace: argparse.Namespace, + group: Optional[str], + dest: str, + value: Any, + ) -> None: + "Set namespace.[idx] to value, with idx derived from group" + + # pylint: disable=protected-access + if group not in namespace._groups: + namespace._groups += [group] + idx = namespace._groups.index(group) + + old = getattr(namespace, dest, None) + if old is None: + old = [] + setattr(namespace, dest, + old + ([None] * (idx - len(old))) + [value]) + + @staticmethod + def parse_boolean(s: str) -> bool: + "Parse 1/true/yes/y/t/on as true and 0/false/no/n/f/off/None as false" + s_l = s.lower() + if s_l in {'1', 'true', 'yes', 'y', 't', 'on'}: + return True + if s_l in {'0', 'false', 'no', 'n', 'f', 'off'}: + return False + raise ValueError('f"Invalid boolean literal: {s!r}') + + # arguments for argparse.ArgumentParser.add_argument() + name: Union[str, tuple[str, str]] + dest: Optional[str] = None + metavar: Optional[str] = None + type: Optional[Callable] = None + nargs: Optional[str] = None + action: Optional[Union[str, Callable]] = None + default: Any = None + version: Optional[str] = None + choices: Optional[tuple[str, ...]] = None + const: Optional[Any] = None + help: Optional[str] = None + + # metadata for config file parsing + config_key: Optional[str] = None + config_push: Callable[[argparse.Namespace, Optional[str], str, Any], None] = \ + config_set_if_unset + + def _names(self) -> tuple[str, ...]: + return self.name if isinstance(self.name, tuple) else (self.name,) + + def argparse_dest(self) -> str: + # It'd be nice if argparse exported this, but I don't see that in the API + if self.dest: + return self.dest + return self._names()[0].lstrip('-').replace('-', '_') + + def add_to(self, parser: argparse.ArgumentParser): + kwargs = { key:val + for key in dataclasses.asdict(self) + if (key not in ('name', 'config_key', 'config_push') and + 
(val := getattr(self, key)) is not None) } + args = self._names() + parser.add_argument(*args, **kwargs) + + def apply_config(self, namespace, section, group, key, value) -> None: + assert f'{section}/{key}' == self.config_key + dest = self.argparse_dest() + + conv: Callable[[str], Any] + if self.action == argparse.BooleanOptionalAction: + # We need to handle this case separately: the options are called + # --foo and --no-foo, and no argument is parsed. But in the config + # file, we have Foo=yes or Foo=no. + conv = self.parse_boolean + elif self.type: + conv = self.type + else: + conv = lambda s:s + + # This is a bit ugly, but --initrd is the only option which is specified + # with multiple args on the command line and a space-separated list in the + # config file. + if self.name == '--initrd': + value = [conv(v) for v in value.split()] + else: + value = conv(value) + + self.config_push(namespace, group, dest, value) + + def config_example(self) -> tuple[Optional[str], Optional[str], Optional[str]]: + if not self.config_key: + return None, None, None + section_name, key = self.config_key.split('/', 1) + if section_name.endswith(':'): + section_name += 'NAME' + if self.choices: + value = '|'.join(self.choices) + else: + value = self.metavar or self.argparse_dest().upper() + return (section_name, key, value) + + +VERBS = ('build', 'genkey', 'inspect') + +CONFIG_ITEMS = [ + ConfigItem( + 'positional', + metavar = 'VERB', + nargs = '*', + help = argparse.SUPPRESS, + ), + + ConfigItem( + '--version', + action = 'version', + version = f'ukify {__version__}', + ), + + ConfigItem( + '--summary', + help = 'print parsed config and exit', + action = 'store_true', + ), + + ConfigItem( + '--linux', + type = pathlib.Path, + help = 'vmlinuz file [.linux section]', + config_key = 'UKI/Linux', + ), + + ConfigItem( + '--initrd', + metavar = 'INITRD', + type = pathlib.Path, + action = 'append', + help = 'initrd file [part of .initrd section]', + config_key = 'UKI/Initrd', + 
config_push = ConfigItem.config_list_prepend, + ), + + ConfigItem( + ('--config', '-c'), + metavar = 'PATH', + type = pathlib.Path, + help = 'configuration file', + ), + + ConfigItem( + '--cmdline', + metavar = 'TEXT|@PATH', + help = 'kernel command line [.cmdline section]', + config_key = 'UKI/Cmdline', + ), + + ConfigItem( + '--os-release', + metavar = 'TEXT|@PATH', + help = 'path to os-release file [.osrel section]', + config_key = 'UKI/OSRelease', + ), + + ConfigItem( + '--devicetree', + metavar = 'PATH', + type = pathlib.Path, + help = 'Device Tree file [.dtb section]', + config_key = 'UKI/DeviceTree', + ), + ConfigItem( + '--splash', + metavar = 'BMP', + type = pathlib.Path, + help = 'splash image bitmap file [.splash section]', + config_key = 'UKI/Splash', + ), + ConfigItem( + '--pcrpkey', + metavar = 'KEY', + type = pathlib.Path, + help = 'embedded public key to seal secrets to [.pcrpkey section]', + config_key = 'UKI/PCRPKey', + ), + ConfigItem( + '--uname', + metavar='VERSION', + help='"uname -r" information [.uname section]', + config_key = 'UKI/Uname', + ), + + ConfigItem( + '--efi-arch', + metavar = 'ARCH', + choices = ('ia32', 'x64', 'arm', 'aa64', 'riscv64'), + help = 'target EFI architecture', + config_key = 'UKI/EFIArch', + ), + + ConfigItem( + '--stub', + type = pathlib.Path, + help = 'path to the sd-stub file [.text,.data,… sections]', + config_key = 'UKI/Stub', + ), + + ConfigItem( + '--sbat', + metavar = 'TEXT|@PATH', + help = 'SBAT policy [.sbat section]', + default = [], + action = 'append', + config_key = 'UKI/SBAT', + ), + + ConfigItem( + '--section', + dest = 'sections', + metavar = 'NAME:TEXT|@PATH', + action = 'append', + default = [], + help = 'section as name and contents [NAME section] or section to print', + ), + + ConfigItem( + '--pcr-banks', + metavar = 'BANK…', + type = parse_banks, + config_key = 'UKI/PCRBanks', + ), + + ConfigItem( + '--signing-engine', + metavar = 'ENGINE', + help = 'OpenSSL engine to use for signing', + 
config_key = 'UKI/SigningEngine', + ), + ConfigItem( + '--signtool', + choices = ('sbsign', 'pesign'), + dest = 'signtool', + help = 'whether to use sbsign or pesign. It will also be inferred by the other \ + parameters given: when using --secureboot-{private-key/certificate}, sbsign \ + will be used, otherwise pesign will be used', + config_key = 'UKI/SecureBootSigningTool', + ), + ConfigItem( + '--secureboot-private-key', + dest = 'sb_key', + help = 'required by --signtool=sbsign. Path to key file or engine-specific designation for SB signing', + config_key = 'UKI/SecureBootPrivateKey', + ), + ConfigItem( + '--secureboot-certificate', + dest = 'sb_cert', + help = 'required by --signtool=sbsign. sbsign needs a path to certificate file or engine-specific designation for SB signing', + config_key = 'UKI/SecureBootCertificate', + ), + ConfigItem( + '--secureboot-certificate-dir', + dest = 'sb_certdir', + default = '/etc/pki/pesign', + help = 'required by --signtool=pesign. Path to nss certificate database directory for PE signing. Default is /etc/pki/pesign', + config_key = 'UKI/SecureBootCertificateDir', + config_push = ConfigItem.config_set + ), + ConfigItem( + '--secureboot-certificate-name', + dest = 'sb_cert_name', + help = 'required by --signtool=pesign. 
pesign needs a certificate nickname of nss certificate database entry to use for PE signing', + config_key = 'UKI/SecureBootCertificateName', + ), + ConfigItem( + '--secureboot-certificate-validity', + metavar = 'DAYS', + type = int, + dest = 'sb_cert_validity', + default = 365 * 10, + help = "period of validity (in days) for a certificate created by 'genkey'", + config_key = 'UKI/SecureBootCertificateValidity', + config_push = ConfigItem.config_set + ), + + ConfigItem( + '--sign-kernel', + action = argparse.BooleanOptionalAction, + help = 'Sign the embedded kernel', + config_key = 'UKI/SignKernel', + ), + + ConfigItem( + '--pcr-private-key', + dest = 'pcr_private_keys', + metavar = 'PATH', + type = pathlib.Path, + action = 'append', + help = 'private part of the keypair for signing PCR signatures', + config_key = 'PCRSignature:/PCRPrivateKey', + config_push = ConfigItem.config_set_group, + ), + ConfigItem( + '--pcr-public-key', + dest = 'pcr_public_keys', + metavar = 'PATH', + type = pathlib.Path, + action = 'append', + help = 'public part of the keypair for signing PCR signatures', + config_key = 'PCRSignature:/PCRPublicKey', + config_push = ConfigItem.config_set_group, + ), + ConfigItem( + '--phases', + dest = 'phase_path_groups', + metavar = 'PHASE-PATH…', + type = parse_phase_paths, + action = 'append', + help = 'phase-paths to create signatures for', + config_key = 'PCRSignature:/Phases', + config_push = ConfigItem.config_set_group, + ), + + ConfigItem( + '--tools', + type = pathlib.Path, + action = 'append', + help = 'Directories to search for tools (systemd-measure, …)', + ), + + ConfigItem( + ('--output', '-o'), + type = pathlib.Path, + help = 'output file path', + ), + + ConfigItem( + '--measure', + action = argparse.BooleanOptionalAction, + help = 'print systemd-measure output for the UKI', + ), + + ConfigItem( + '--json', + choices = ('pretty', 'short', 'off'), + default = 'off', + help = 'generate JSON output', + ), + ConfigItem( + '-j', + dest='json', 
+ action='store_const', + const='pretty', + help='equivalent to --json=pretty', + ), + + ConfigItem( + '--all', + help = 'print all sections', + action = 'store_true', + ), +] + +CONFIGFILE_ITEMS = { item.config_key:item + for item in CONFIG_ITEMS + if item.config_key } + + +def apply_config(namespace, filename=None): + if filename is None: + if namespace.config: + # Config set by the user, use that. + filename = namespace.config + print(f'Using config file: {filename}') + else: + # Try to look for a config file then use the first one found. + for config_dir in DEFAULT_CONFIG_DIRS: + filename = pathlib.Path(config_dir) / DEFAULT_CONFIG_FILE + if filename.is_file(): + # Found a config file, use it. + print(f'Using found config file: {filename}') + break + else: + # No config file specified or found, nothing to do. + return + + # Fill in ._groups based on --pcr-public-key=, --pcr-private-key=, and --phases=. + assert '_groups' not in namespace + n_pcr_priv = len(namespace.pcr_private_keys or ()) + namespace._groups = list(range(n_pcr_priv)) # pylint: disable=protected-access + + cp = configparser.ConfigParser( + comment_prefixes='#', + inline_comment_prefixes='#', + delimiters='=', + empty_lines_in_values=False, + interpolation=None, + strict=False) + # Do not make keys lowercase + cp.optionxform = lambda option: option + + # The API is not great. 
+ read = cp.read(filename) + if not read: + raise IOError(f'Failed to read {filename}') + + for section_name, section in cp.items(): + idx = section_name.find(':') + if idx >= 0: + section_name, group = section_name[:idx+1], section_name[idx+1:] + if not section_name or not group: + raise ValueError('Section name components cannot be empty') + if ':' in group: + raise ValueError('Section name cannot contain more than one ":"') + else: + group = None + for key, value in section.items(): + if item := CONFIGFILE_ITEMS.get(f'{section_name}/{key}'): + item.apply_config(namespace, section_name, group, key, value) + else: + print(f'Unknown config setting [{section_name}] {key}=') + + +def config_example(): + prev_section = None + for item in CONFIG_ITEMS: + section, key, value = item.config_example() + if section: + if prev_section != section: + if prev_section: + yield '' + yield f'[{section}]' + prev_section = section + yield f'{key} = {value}' + + +class PagerHelpAction(argparse._HelpAction): # pylint: disable=protected-access + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Union[str, Sequence[Any], None] = None, + option_string: Optional[str] = None + ) -> None: + page(parser.format_help(), True) + parser.exit() + + +def create_parser(): + p = argparse.ArgumentParser( + description='Build and sign Unified Kernel Images', + usage='\n ' + textwrap.dedent('''\ + ukify {b}build{e} [--linux=LINUX] [--initrd=INITRD] [options…] + ukify {b}genkey{e} [options…] + ukify {b}inspect{e} FILE… [options…] + ''').format(b=Style.bold, e=Style.reset), + allow_abbrev=False, + add_help=False, + epilog='\n '.join(('config file:', *config_example())), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + for item in CONFIG_ITEMS: + item.add_to(p) + + # Suppress printing of usage synopsis on errors + p.error = lambda message: p.exit(2, f'{p.prog}: error: {message}\n') + + # Make --help paged + p.add_argument( + '-h', '--help', 
+ action=PagerHelpAction, + help='show this help message and exit', + ) + + return p + + +def finalize_options(opts): + # Figure out which syntax is being used, one of: + # ukify verb --arg --arg --arg + # ukify linux initrd… + if len(opts.positional) >= 1 and opts.positional[0] == 'inspect': + opts.verb = opts.positional[0] + opts.files = opts.positional[1:] + if not opts.files: + raise ValueError('file(s) to inspect must be specified') + if len(opts.files) > 1 and opts.json != 'off': + # We could allow this in the future, but we need to figure out the right structure + raise ValueError('JSON output is not allowed with multiple files') + elif len(opts.positional) == 1 and opts.positional[0] in VERBS: + opts.verb = opts.positional[0] + elif opts.linux or opts.initrd: + raise ValueError('--linux/--initrd options cannot be used with positional arguments') + else: + print("Assuming obsolete command line syntax with no verb. Please use 'build'.") + if opts.positional: + opts.linux = pathlib.Path(opts.positional[0]) + # If we have initrds from parsing config files, append our positional args at the end + opts.initrd = (opts.initrd or []) + [pathlib.Path(arg) for arg in opts.positional[1:]] + opts.verb = 'build' + + # Check that --pcr-public-key=, --pcr-private-key=, and --phases= + # have either the same number of arguments are are not specified at all. 
+ n_pcr_pub = None if opts.pcr_public_keys is None else len(opts.pcr_public_keys) + n_pcr_priv = None if opts.pcr_private_keys is None else len(opts.pcr_private_keys) + n_phase_path_groups = None if opts.phase_path_groups is None else len(opts.phase_path_groups) + if n_pcr_pub is not None and n_pcr_pub != n_pcr_priv: + raise ValueError('--pcr-public-key= specifications must match --pcr-private-key=') + if n_phase_path_groups is not None and n_phase_path_groups != n_pcr_priv: + raise ValueError('--phases= specifications must match --pcr-private-key=') + + if opts.cmdline and opts.cmdline.startswith('@'): + opts.cmdline = pathlib.Path(opts.cmdline[1:]) + elif opts.cmdline: + # Drop whitespace from the command line. If we're reading from a file, + # we copy the contents verbatim. But configuration specified on the command line + # or in the config file may contain additional whitespace that has no meaning. + opts.cmdline = ' '.join(opts.cmdline.split()) + + if opts.os_release and opts.os_release.startswith('@'): + opts.os_release = pathlib.Path(opts.os_release[1:]) + elif not opts.os_release and opts.linux: + p = pathlib.Path('/etc/os-release') + if not p.exists(): + p = pathlib.Path('/usr/lib/os-release') + opts.os_release = p + + if opts.efi_arch is None: + opts.efi_arch = guess_efi_arch() + + if opts.stub is None: + if opts.linux is not None: + opts.stub = pathlib.Path(f'/usr/lib/systemd/boot/efi/linux{opts.efi_arch}.efi.stub') + else: + opts.stub = pathlib.Path(f'/usr/lib/systemd/boot/efi/addon{opts.efi_arch}.efi.stub') + + if opts.signing_engine is None: + if opts.sb_key: + opts.sb_key = pathlib.Path(opts.sb_key) + if opts.sb_cert: + opts.sb_cert = pathlib.Path(opts.sb_cert) + + if bool(opts.sb_key) ^ bool(opts.sb_cert): + # one param only given, sbsign needs both + raise ValueError('--secureboot-private-key= and --secureboot-certificate= must be specified together') + elif bool(opts.sb_key) and bool(opts.sb_cert): + # both param given, infer sbsign and in case 
it was given, ensure signtool=sbsign + if opts.signtool and opts.signtool != 'sbsign': + raise ValueError(f'Cannot provide --signtool={opts.signtool} with --secureboot-private-key= and --secureboot-certificate=') + opts.signtool = 'sbsign' + elif bool(opts.sb_cert_name): + # sb_cert_name given, infer pesign and in case it was given, ensure signtool=pesign + if opts.signtool and opts.signtool != 'pesign': + raise ValueError(f'Cannot provide --signtool={opts.signtool} with --secureboot-certificate-name=') + opts.signtool = 'pesign' + + if opts.sign_kernel and not opts.sb_key and not opts.sb_cert_name: + raise ValueError('--sign-kernel requires either --secureboot-private-key= and --secureboot-certificate= (for sbsign) or --secureboot-certificate-name= (for pesign) to be specified') + + if opts.verb == 'build' and opts.output is None: + if opts.linux is None: + raise ValueError('--output= must be specified when building a PE addon') + suffix = '.efi' if opts.sb_key or opts.sb_cert_name else '.unsigned.efi' + opts.output = opts.linux.name + suffix + + # Now that we know if we're inputting or outputting, really parse section config + f = Section.parse_output if opts.verb == 'inspect' else Section.parse_input + opts.sections = [f(s) for s in opts.sections] + # A convenience dictionary to make it easy to look up sections + opts.sections_by_name = {s.name:s for s in opts.sections} + + if opts.summary: + # TODO: replace pprint() with some fancy formatting. 
+ pprint.pprint(vars(opts)) + sys.exit() + + +def parse_args(args=None): + opts = create_parser().parse_args(args) + apply_config(opts) + finalize_options(opts) + return opts + + +def main(): + opts = parse_args() + if opts.verb == 'build': + check_inputs(opts) + make_uki(opts) + elif opts.verb == 'genkey': + check_cert_and_keys_nonexistent(opts) + generate_keys(opts) + elif opts.verb == 'inspect': + inspect_sections(opts) + else: + assert False + + +if __name__ == '__main__': + main() diff --git a/src/update-done/meson.build b/src/update-done/meson.build new file mode 100644 index 0000000..89a79b6 --- /dev/null +++ b/src/update-done/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-update-done', + 'sources' : files('update-done.c'), + }, +] diff --git a/src/update-done/update-done.c b/src/update-done/update-done.c new file mode 100644 index 0000000..f448b3b --- /dev/null +++ b/src/update-done/update-done.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "fileio-label.h" +#include "selinux-util.h" +#include "time-util.h" + +#define MESSAGE \ + "# This file was created by systemd-update-done. Its only \n" \ + "# purpose is to hold a timestamp of the time this directory\n" \ + "# was updated. See man:systemd-update-done.service(8).\n" + +static int apply_timestamp(const char *path, struct timespec *ts) { + _cleanup_free_ char *message = NULL; + int r; + + /* + * We store the timestamp both as mtime of the file and in the file itself, + * to support filesystems which cannot store nanosecond-precision timestamps. 
+ */ + + if (asprintf(&message, + MESSAGE + "TIMESTAMP_NSEC=" NSEC_FMT "\n", + timespec_load_nsec(ts)) < 0) + return log_oom(); + + r = write_string_file_atomic_label_ts(path, message, ts); + if (r == -EROFS) + log_debug_errno(r, "Cannot create \"%s\", file system is read-only.", path); + else if (r < 0) + return log_error_errno(r, "Failed to write \"%s\": %m", path); + return 0; +} + +int main(int argc, char *argv[]) { + struct stat st; + int r, q = 0; + + log_setup(); + + if (stat("/usr", &st) < 0) { + log_error_errno(errno, "Failed to stat /usr: %m"); + return EXIT_FAILURE; + } + + r = mac_init(); + if (r < 0) + return EXIT_FAILURE; + + r = apply_timestamp("/etc/.updated", &st.st_mtim); + q = apply_timestamp("/var/.updated", &st.st_mtim); + + return r < 0 || q < 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/src/update-utmp/meson.build b/src/update-utmp/meson.build new file mode 100644 index 0000000..1db4445 --- /dev/null +++ b/src/update-utmp/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-update-utmp', + 'conditions' : ['ENABLE_UTMP'], + 'sources' : files('update-utmp.c'), + 'dependencies' : libaudit, + }, +] diff --git a/src/update-utmp/update-utmp.c b/src/update-utmp/update-utmp.c new file mode 100644 index 0000000..4ee935e --- /dev/null +++ b/src/update-utmp/update-utmp.c @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#if HAVE_AUDIT +#include +#endif + +#include "sd-bus.h" + +#include "alloc-util.h" +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-util.h" +#include "format-util.h" +#include "log.h" +#include "macro.h" +#include "main-func.h" +#include "process-util.h" +#include "random-util.h" +#include "special.h" +#include "stdio-util.h" +#include "strv.h" +#include "unit-name.h" +#include "utmp-wtmp.h" +#include "verbs.h" + +typedef struct Context { + sd_bus *bus; +#if HAVE_AUDIT + 
int audit_fd; +#endif +} Context; + +static void context_clear(Context *c) { + assert(c); + + c->bus = sd_bus_flush_close_unref(c->bus); +#if HAVE_AUDIT + if (c->audit_fd >= 0) + audit_close(c->audit_fd); + c->audit_fd = -EBADF; +#endif +} + +static int get_startup_monotonic_time(Context *c, usec_t *ret) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(c); + assert(ret); + + r = bus_get_property_trivial( + c->bus, + bus_systemd_mgr, + "UserspaceTimestampMonotonic", + &error, + 't', ret); + if (r < 0) + return log_warning_errno(r, "Failed to get timestamp, ignoring: %s", bus_error_message(&error, r)); + + return 0; +} + +static int get_current_runlevel(Context *c) { + static const struct { + const int runlevel; + const char *special; + } table[] = { + /* The first target of this list that is active or has a job scheduled wins. We prefer + * runlevels 5 and 3 here over the others, since these are the main runlevels used on Fedora. + * It might make sense to change the order on some distributions. */ + { '5', SPECIAL_GRAPHICAL_TARGET }, + { '3', SPECIAL_MULTI_USER_TARGET }, + { '1', SPECIAL_RESCUE_TARGET }, + }; + int r; + + assert(c); + + for (unsigned n_attempts = 0;;) { + FOREACH_ARRAY(e, table, ELEMENTSOF(table)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_free_ char *state = NULL, *path = NULL; + + path = unit_dbus_path_from_name(e->special); + if (!path) + return log_oom(); + + r = sd_bus_get_property_string( + c->bus, + "org.freedesktop.systemd1", + path, + "org.freedesktop.systemd1.Unit", + "ActiveState", + &error, + &state); + if ((r == -ENOTCONN || + sd_bus_error_has_names(&error, + SD_BUS_ERROR_NO_REPLY, + SD_BUS_ERROR_DISCONNECTED)) && + ++n_attempts < 64) { + + /* systemd might have dropped off momentarily, let's not make this an error, + * and wait some random time. Let's pick a random time in the range 0ms…250ms, + * linearly scaled by the number of failed attempts. 
*/ + + usec_t usec = random_u64_range(UINT64_C(10) * USEC_PER_MSEC + + UINT64_C(240) * USEC_PER_MSEC * n_attempts/64); + log_debug_errno(r, "Failed to get state of %s, retrying after %s: %s", + e->special, FORMAT_TIMESPAN(usec, USEC_PER_MSEC), bus_error_message(&error, r)); + (void) usleep_safe(usec); + goto reconnect; + } + if (r < 0) + return log_warning_errno(r, "Failed to get state of %s: %s", e->special, bus_error_message(&error, r)); + + if (STR_IN_SET(state, "active", "reloading")) + return e->runlevel; + } + + return 0; + +reconnect: + c->bus = sd_bus_flush_close_unref(c->bus); + r = bus_connect_system_systemd(&c->bus); + if (r < 0) + return log_error_errno(r, "Failed to reconnect to system bus: %m"); + } +} + +static int on_reboot(int argc, char *argv[], void *userdata) { + Context *c = ASSERT_PTR(userdata); + usec_t t = 0, boottime; + int r, q = 0; + + /* We finished start-up, so let's write the utmp record and send the audit msg. */ + +#if HAVE_AUDIT + if (c->audit_fd >= 0) + if (audit_log_user_comm_message(c->audit_fd, AUDIT_SYSTEM_BOOT, "", "systemd-update-utmp", NULL, NULL, NULL, 1) < 0 && + errno != EPERM) + q = log_error_errno(errno, "Failed to send audit message: %m"); +#endif + + /* If this call fails, then utmp_put_reboot() will fix to the current time. */ + (void) get_startup_monotonic_time(c, &t); + boottime = map_clock_usec(t, CLOCK_MONOTONIC, CLOCK_REALTIME); + /* We query the recorded monotonic time here (instead of the system clock CLOCK_REALTIME), even + * though we actually want the system clock time. That's because there's a likely chance that the + * system clock wasn't set right during early boot. By manually converting the monotonic clock to the + * system clock here we can compensate for incorrectly set clocks during early boot. 
*/ + + r = utmp_put_reboot(boottime); + if (r < 0) + return log_error_errno(r, "Failed to write utmp record: %m"); + + return q; +} + +static int on_shutdown(int argc, char *argv[], void *userdata) { + int r, q = 0; + + /* We started shut-down, so let's write the utmp record and send the audit msg. */ + +#if HAVE_AUDIT + Context *c = ASSERT_PTR(userdata); + + if (c->audit_fd >= 0) + if (audit_log_user_comm_message(c->audit_fd, AUDIT_SYSTEM_SHUTDOWN, "", "systemd-update-utmp", NULL, NULL, NULL, 1) < 0 && + errno != EPERM) + q = log_error_errno(errno, "Failed to send audit message: %m"); +#endif + + r = utmp_put_shutdown(); + if (r < 0) + return log_error_errno(r, "Failed to write utmp record: %m"); + + return q; +} + +static int on_runlevel(int argc, char *argv[], void *userdata) { + Context *c = ASSERT_PTR(userdata); + int r, q = 0, previous, runlevel; + + /* We finished changing runlevel, so let's write the utmp record and send the audit msg. */ + + /* First, get last runlevel */ + r = utmp_get_runlevel(&previous, NULL); + if (r < 0) { + if (!IN_SET(r, -ESRCH, -ENOENT)) + return log_error_errno(r, "Failed to get the last runlevel from utmp: %m"); + + previous = 0; + } + + /* Secondly, get new runlevel */ + runlevel = get_current_runlevel(c); + if (runlevel < 0) + return runlevel; + if (runlevel == 0) { + log_warning("Failed to get the current runlevel, utmp update skipped."); + return 0; + } + + if (previous == runlevel) + return 0; + +#if HAVE_AUDIT + if (c->audit_fd >= 0) { + char s[STRLEN("old-level=_ new-level=_") + 1]; + + xsprintf(s, "old-level=%c new-level=%c", + previous > 0 ? 
previous : 'N', + runlevel); + + if (audit_log_user_comm_message(c->audit_fd, AUDIT_SYSTEM_RUNLEVEL, s, + "systemd-update-utmp", NULL, NULL, NULL, 1) < 0 && errno != EPERM) + q = log_error_errno(errno, "Failed to send audit message: %m"); + } +#endif + + r = utmp_put_runlevel(runlevel, previous); + if (r < 0 && !IN_SET(r, -ESRCH, -ENOENT)) + return log_error_errno(r, "Failed to write utmp record: %m"); + + return q; +} + +static int run(int argc, char *argv[]) { + static const Verb verbs[] = { + { "reboot", 1, 1, 0, on_reboot }, + { "shutdown", 1, 1, 0, on_shutdown }, + { "runlevel", 1, 1, 0, on_runlevel }, + {} + }; + + _cleanup_(context_clear) Context c = { +#if HAVE_AUDIT + .audit_fd = -EBADF, +#endif + }; + int r; + + log_setup(); + + umask(0022); + +#if HAVE_AUDIT + /* If the kernel lacks netlink or audit support, don't worry about it. */ + c.audit_fd = audit_open(); + if (c.audit_fd < 0) + log_full_errno(IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT) ? LOG_DEBUG : LOG_WARNING, + errno, "Failed to connect to audit log, ignoring: %m"); +#endif + r = bus_connect_system_systemd(&c.bus); + if (r < 0) + return log_error_errno(r, "Failed to get D-Bus connection: %m"); + + return dispatch_verb(argc, argv, verbs, &c); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/user-sessions/meson.build b/src/user-sessions/meson.build new file mode 100644 index 0000000..938e526 --- /dev/null +++ b/src/user-sessions/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-user-sessions', + 'conditions' : ['HAVE_PAM'], + 'sources' : files('user-sessions.c'), + }, +] diff --git a/src/user-sessions/user-sessions.c b/src/user-sessions/user-sessions.c new file mode 100644 index 0000000..58054f8 --- /dev/null +++ b/src/user-sessions/user-sessions.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "fileio.h" +#include 
"fileio-label.h" +#include "fs-util.h" +#include "main-func.h" +#include "log.h" +#include "selinux-util.h" +#include "string-util.h" + +static int run(int argc, char *argv[]) { + int r; + + if (argc != 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program requires one argument."); + + log_setup(); + + umask(0022); + + r = mac_init(); + if (r < 0) + return r; + + /* We only touch /run/nologin. See create_shutdown_run_nologin_or_warn() for details. */ + + if (streq(argv[1], "start")) + return unlink_or_warn("/run/nologin"); + if (streq(argv[1], "stop")) + return create_shutdown_run_nologin_or_warn(); + + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown verb '%s'.", argv[1]); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/userdb/meson.build b/src/userdb/meson.build new file mode 100644 index 0000000..2d701c8 --- /dev/null +++ b/src/userdb/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-userwork', + 'conditions' : ['ENABLE_USERDB'], + 'sources' : files('userwork.c'), + 'dependencies' : threads, + }, + libexec_template + { + 'name' : 'systemd-userdbd', + 'conditions' : ['ENABLE_USERDB'], + 'sources' : files( + 'userdbd-manager.c', + 'userdbd.c', + ), + 'dependencies' : threads, + }, + executable_template + { + 'name' : 'userdbctl', + 'conditions' : ['ENABLE_USERDB'], + 'sources' : files('userdbctl.c'), + 'dependencies' : threads, + }, +] diff --git a/src/userdb/userdbctl.c b/src/userdb/userdbctl.c new file mode 100644 index 0000000..a7db3fb --- /dev/null +++ b/src/userdb/userdbctl.c @@ -0,0 +1,1334 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "build.h" +#include "dirent-util.h" +#include "errno-list.h" +#include "escape.h" +#include "fd-util.h" +#include "format-table.h" +#include "format-util.h" +#include "main-func.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include 
"pretty-print.h" +#include "socket-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "uid-range.h" +#include "user-record-show.h" +#include "user-util.h" +#include "userdb.h" +#include "verbs.h" + +static enum { + OUTPUT_CLASSIC, + OUTPUT_TABLE, + OUTPUT_FRIENDLY, + OUTPUT_JSON, + _OUTPUT_INVALID = -EINVAL, +} arg_output = _OUTPUT_INVALID; + +static PagerFlags arg_pager_flags = 0; +static bool arg_legend = true; +static char** arg_services = NULL; +static UserDBFlags arg_userdb_flags = 0; +static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF; +static bool arg_chain = false; + +STATIC_DESTRUCTOR_REGISTER(arg_services, strv_freep); + +static const char *user_disposition_to_color(UserDisposition d) { + assert(d >= 0); + assert(d < _USER_DISPOSITION_MAX); + + switch (d) { + case USER_INTRINSIC: + return ansi_red(); + + case USER_SYSTEM: + case USER_DYNAMIC: + return ansi_green(); + + case USER_CONTAINER: + return ansi_cyan(); + + case USER_RESERVED: + return ansi_red(); + + default: + return NULL; + } +} + +static int show_user(UserRecord *ur, Table *table) { + int r; + + assert(ur); + + switch (arg_output) { + + case OUTPUT_CLASSIC: + if (!uid_is_valid(ur->uid)) + break; + + printf("%s:x:" UID_FMT ":" GID_FMT ":%s:%s:%s\n", + ur->user_name, + ur->uid, + user_record_gid(ur), + strempty(user_record_real_name(ur)), + user_record_home_directory(ur), + user_record_shell(ur)); + + break; + + case OUTPUT_JSON: + json_variant_dump(ur->json, arg_json_format_flags, NULL, 0); + break; + + case OUTPUT_FRIENDLY: + user_record_show(ur, true); + + if (ur->incomplete) { + fflush(stdout); + log_warning("Warning: lacking rights to acquire privileged fields of user record of '%s', output incomplete.", ur->user_name); + } + + break; + + case OUTPUT_TABLE: { + UserDisposition d; + + assert(table); + d = user_record_disposition(ur); + + r = table_add_many( + table, + TABLE_STRING, "", + TABLE_STRING, ur->user_name, + TABLE_SET_COLOR, user_disposition_to_color(d), + 
TABLE_STRING, user_disposition_to_string(d), + TABLE_UID, ur->uid, + TABLE_GID, user_record_gid(ur), + TABLE_STRING, empty_to_null(ur->real_name), + TABLE_STRING, user_record_home_directory(ur), + TABLE_STRING, user_record_shell(ur), + TABLE_INT, 0); + if (r < 0) + return table_log_add_error(r); + + break; + } + + default: + assert_not_reached(); + } + + return 0; +} + +static const struct { + uid_t first, last; + const char *name; + UserDisposition disposition; +} uid_range_table[] = { + { + .first = 1, + .last = SYSTEM_UID_MAX, + .name = "system", + .disposition = USER_SYSTEM, + }, + { + .first = DYNAMIC_UID_MIN, + .last = DYNAMIC_UID_MAX, + .name = "dynamic system", + .disposition = USER_DYNAMIC, + }, + { + .first = CONTAINER_UID_BASE_MIN, + .last = CONTAINER_UID_BASE_MAX, + .name = "container", + .disposition = USER_CONTAINER, + }, +#if ENABLE_HOMED + { + .first = HOME_UID_MIN, + .last = HOME_UID_MAX, + .name = "systemd-homed", + .disposition = USER_REGULAR, + }, +#endif + { + .first = MAP_UID_MIN, + .last = MAP_UID_MAX, + .name = "mapped", + .disposition = USER_REGULAR, + }, +}; + +static int table_add_uid_boundaries(Table *table, const UidRange *p) { + int r; + + assert(table); + + for (size_t i = 0; i < ELEMENTSOF(uid_range_table); i++) { + _cleanup_free_ char *name = NULL, *comment = NULL; + + if (!uid_range_covers(p, uid_range_table[i].first, uid_range_table[i].last - uid_range_table[i].first + 1)) + continue; + + name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_DOWN), + " begin ", uid_range_table[i].name, " users ", + special_glyph(SPECIAL_GLYPH_ARROW_DOWN)); + if (!name) + return log_oom(); + + comment = strjoin("First ", uid_range_table[i].name, " user"); + if (!comment) + return log_oom(); + + r = table_add_many( + table, + TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_TOP), + TABLE_STRING, name, + TABLE_SET_COLOR, ansi_grey(), + TABLE_STRING, user_disposition_to_string(uid_range_table[i].disposition), + TABLE_SET_COLOR, ansi_grey(), + TABLE_UID, 
uid_range_table[i].first, + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_STRING, comment, + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_INT, -1); /* sort before any other entry with the same UID */ + if (r < 0) + return table_log_add_error(r); + + free(name); + name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_UP), + " end ", uid_range_table[i].name, " users ", + special_glyph(SPECIAL_GLYPH_ARROW_UP)); + if (!name) + return log_oom(); + + free(comment); + comment = strjoin("Last ", uid_range_table[i].name, " user"); + if (!comment) + return log_oom(); + + r = table_add_many( + table, + TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_RIGHT), + TABLE_STRING, name, + TABLE_SET_COLOR, ansi_grey(), + TABLE_STRING, user_disposition_to_string(uid_range_table[i].disposition), + TABLE_SET_COLOR, ansi_grey(), + TABLE_UID, uid_range_table[i].last, + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_STRING, comment, + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_INT, 1); /* sort after any other entry with the same UID */ + if (r < 0) + return table_log_add_error(r); + } + + return ELEMENTSOF(uid_range_table) * 2; +} + +static int add_unavailable_uid(Table *table, uid_t start, uid_t end) { + _cleanup_free_ char *name = NULL; + int r; + + assert(table); + assert(start <= end); + + name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_DOWN), + " begin unavailable users ", + special_glyph(SPECIAL_GLYPH_ARROW_DOWN)); + if (!name) + return log_oom(); + + r = table_add_many( + table, + TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_TOP), + TABLE_STRING, name, + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_UID, start, + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_STRING, "First unavailable user", + TABLE_SET_COLOR, ansi_grey(), + TABLE_EMPTY, + TABLE_EMPTY, + TABLE_INT, -1); /* sort before an other entry with the same UID */ + if (r < 0) + return table_log_add_error(r); + + free(name); + name = 
strjoin(special_glyph(SPECIAL_GLYPH_ARROW_UP),
+                       " end unavailable users ",
+                       special_glyph(SPECIAL_GLYPH_ARROW_UP));
+        if (!name)
+                return log_oom();
+
+        r = table_add_many(
+                        table,
+                        TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_RIGHT),
+                        TABLE_STRING, name,
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_EMPTY,
+                        TABLE_UID, end,
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_EMPTY,
+                        TABLE_STRING, "Last unavailable user",
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_EMPTY,
+                        TABLE_EMPTY,
+                        TABLE_INT, 1); /* sort after any other entry with the same UID */
+        if (r < 0)
+                return table_log_add_error(r);
+
+        return 2;
+}
+
+/* Walks the entries of 'p' and invokes add_unavailable() once for every gap of IDs that lies
+ * before an entry, between two entries, or after the last entry (up to UINT32_MAX-1). If 'p' is
+ * NULL no entry is visited, so the whole span 0…UINT32_MAX-1 is reported as a single gap.
+ *
+ * Returns the sum of the (non-negative) values returned by the callback, or the first negative
+ * value the callback returns.
+ *
+ * NOTE(review): the gap detection only looks forward, so it presumably relies on the entries
+ * being sorted by start and non-overlapping — confirm against the UidRange implementation. */
+static int table_add_uid_map(
+                Table *table,
+                const UidRange *p,
+                int (*add_unavailable)(Table *t, uid_t start, uid_t end)) {
+
+        size_t n_entries = p ? p->n_entries : 0;
+        uid_t next_id = 0; /* first ID that follows everything processed so far */
+        int total = 0, r;
+
+        assert(table);
+        assert(add_unavailable);
+
+        for (size_t idx = 0; idx < n_entries; idx++) {
+                const UidRangeEntry *entry = &p->entries[idx];
+
+                /* Anything between the end of the previous entry and the start of this one is a gap */
+                if (entry->start > next_id) {
+                        r = add_unavailable(table, next_id, entry->start - 1);
+                        if (r < 0)
+                                return r;
+
+                        total += r;
+                }
+
+                /* start + nr would not fit into 32 bits: this entry reaches the end of the ID space */
+                if (entry->nr > UINT32_MAX - entry->start) {
+                        next_id = UINT32_MAX;
+                        break;
+                }
+
+                next_id = entry->start + entry->nr;
+        }
+
+        /* No trailing gap left to report */
+        if (next_id >= UINT32_MAX-1)
+                return total;
+
+        r = add_unavailable(table, next_id, UINT32_MAX-1);
+        if (r < 0)
+                return r;
+
+        return total + r;
+}
+
+static int display_user(int argc, char *argv[], void *userdata) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        bool draw_separator = false;
+        int ret = 0, r;
+
+        if (arg_output < 0)
+                arg_output = argc > 1 ?
OUTPUT_FRIENDLY : OUTPUT_TABLE; + + if (arg_output == OUTPUT_TABLE) { + table = table_new(" ", "name", "disposition", "uid", "gid", "realname", "home", "shell", "order"); + if (!table) + return log_oom(); + + (void) table_set_align_percent(table, table_get_cell(table, 0, 3), 100); + (void) table_set_align_percent(table, table_get_cell(table, 0, 4), 100); + table_set_ersatz_string(table, TABLE_ERSATZ_DASH); + (void) table_set_sort(table, (size_t) 3, (size_t) 8); + (void) table_set_display(table, (size_t) 0, (size_t) 1, (size_t) 2, (size_t) 3, (size_t) 4, (size_t) 5, (size_t) 6, (size_t) 7); + } + + if (argc > 1) + STRV_FOREACH(i, argv + 1) { + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + uid_t uid; + + if (parse_uid(*i, &uid) >= 0) + r = userdb_by_uid(uid, arg_userdb_flags, &ur); + else + r = userdb_by_name(*i, arg_userdb_flags, &ur); + if (r < 0) { + if (r == -ESRCH) + log_error_errno(r, "User %s does not exist.", *i); + else if (r == -EHOSTDOWN) + log_error_errno(r, "Selected user database service is not available for this request."); + else + log_error_errno(r, "Failed to find user %s: %m", *i); + + if (ret >= 0) + ret = r; + } else { + if (draw_separator && arg_output == OUTPUT_FRIENDLY) + putchar('\n'); + + r = show_user(ur, table); + if (r < 0) + return r; + + draw_separator = true; + } + } + else { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + + r = userdb_all(arg_userdb_flags, &iterator); + if (r == -ENOLINK) /* ENOLINK → Didn't find answer without Varlink, and didn't try Varlink because was configured to off. */ + log_debug_errno(r, "No entries found. 
(Didn't check via Varlink.)"); + else if (r == -ESRCH) /* ESRCH → Couldn't find any suitable entry, but we checked all sources */ + log_debug_errno(r, "No entries found."); + else if (r < 0) + return log_error_errno(r, "Failed to enumerate users: %m"); + else { + for (;;) { + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + + r = userdb_iterator_get(iterator, &ur); + if (r == -ESRCH) + break; + if (r == -EHOSTDOWN) + return log_error_errno(r, "Selected user database service is not available for this request."); + if (r < 0) + return log_error_errno(r, "Failed acquire next user: %m"); + + if (draw_separator && arg_output == OUTPUT_FRIENDLY) + putchar('\n'); + + r = show_user(ur, table); + if (r < 0) + return r; + + draw_separator = true; + } + } + } + + if (table) { + _cleanup_(uid_range_freep) UidRange *uid_range = NULL; + int boundary_lines, uid_map_lines; + + r = uid_range_load_userns(&uid_range, "/proc/self/uid_map"); + if (r < 0) + log_debug_errno(r, "Failed to load /proc/self/uid_map, ignoring: %m"); + + boundary_lines = table_add_uid_boundaries(table, uid_range); + if (boundary_lines < 0) + return boundary_lines; + + uid_map_lines = table_add_uid_map(table, uid_range, add_unavailable_uid); + if (uid_map_lines < 0) + return uid_map_lines; + + if (table_get_rows(table) > 1) { + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend) { + size_t k; + + k = table_get_rows(table) - 1 - boundary_lines - uid_map_lines; + if (k > 0) + printf("\n%zu users listed.\n", k); + else + printf("No users.\n"); + } + } + + return ret; +} + +static int show_group(GroupRecord *gr, Table *table) { + int r; + + assert(gr); + + switch (arg_output) { + + case OUTPUT_CLASSIC: { + _cleanup_free_ char *m = NULL; + + if (!gid_is_valid(gr->gid)) + break; + + m = strv_join(gr->members, ","); + if (!m) + return log_oom(); + + printf("%s:x:" GID_FMT ":%s\n", + gr->group_name, + 
gr->gid,
+                       m);
+                break;
+        }
+
+        case OUTPUT_JSON:
+                json_variant_dump(gr->json, arg_json_format_flags, NULL, 0);
+                break;
+
+        case OUTPUT_FRIENDLY:
+                group_record_show(gr, true);
+
+                if (gr->incomplete) {
+                        fflush(stdout);
+                        log_warning("Warning: lacking rights to acquire privileged fields of group record of '%s', output incomplete.", gr->group_name);
+                }
+
+                break;
+
+        case OUTPUT_TABLE: {
+                UserDisposition d;
+
+                assert(table);
+                d = group_record_disposition(gr);
+
+                r = table_add_many(
+                                table,
+                                TABLE_STRING, "",
+                                TABLE_STRING, gr->group_name,
+                                TABLE_SET_COLOR, user_disposition_to_color(d),
+                                TABLE_STRING, user_disposition_to_string(d),
+                                TABLE_GID, gr->gid,
+                                TABLE_STRING, gr->description,
+                                TABLE_INT, 0);
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                break;
+        }
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+/* Adds a "begin" and an "end" marker row to the group table for every entry of uid_range_table[]
+ * that uid_range_covers() reports as covered by 'p'. Entries that are not covered are skipped.
+ *
+ * Returns the number of rows actually added (two per covered entry), or a negative errno-style
+ * value on failure. display_group() subtracts the returned count from the total number of table
+ * rows to work out how many real groups were listed, hence skipped entries must not be counted:
+ * counting them would make that unsigned subtraction wrap around. */
+static int table_add_gid_boundaries(Table *table, const UidRange *p) {
+        int r, n_added = 0;
+
+        assert(table);
+
+        for (size_t i = 0; i < ELEMENTSOF(uid_range_table); i++) {
+                _cleanup_free_ char *name = NULL, *comment = NULL;
+
+                /* Pass the number of GIDs in the range as third argument, exactly like
+                 * table_add_uid_boundaries() does, rather than the last GID of the range. */
+                if (!uid_range_covers(p, uid_range_table[i].first, uid_range_table[i].last - uid_range_table[i].first + 1))
+                        continue;
+
+                name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_DOWN),
+                               " begin ", uid_range_table[i].name, " groups ",
+                               special_glyph(SPECIAL_GLYPH_ARROW_DOWN));
+                if (!name)
+                        return log_oom();
+
+                comment = strjoin("First ", uid_range_table[i].name, " group");
+                if (!comment)
+                        return log_oom();
+
+                r = table_add_many(
+                                table,
+                                TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_TOP),
+                                TABLE_STRING, name,
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_STRING, user_disposition_to_string(uid_range_table[i].disposition),
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_GID, uid_range_table[i].first,
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_STRING, comment,
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_INT, -1); /* sort before any other entry with the same GID */
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                /* Reuse the two string variables for the "end" row; the cleanup attribute frees
+                 * whatever they hold when the loop iteration ends. */
+                free(name);
+                name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_UP),
+                               " end ", uid_range_table[i].name, " groups ",
+                               special_glyph(SPECIAL_GLYPH_ARROW_UP));
+                if (!name)
+                        return log_oom();
+
+                free(comment);
+                comment = strjoin("Last ", uid_range_table[i].name, " group");
+                if (!comment)
+                        return log_oom();
+
+                r = table_add_many(
+                                table,
+                                TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_RIGHT),
+                                TABLE_STRING, name,
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_STRING, user_disposition_to_string(uid_range_table[i].disposition),
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_GID, uid_range_table[i].last,
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_STRING, comment,
+                                TABLE_SET_COLOR, ansi_grey(),
+                                TABLE_INT, 1); /* sort after any other entry with the same GID */
+                if (r < 0)
+                        return table_log_add_error(r);
+
+                n_added += 2;
+        }
+
+        return n_added;
+}
+
+static int add_unavailable_gid(Table *table, uid_t start, uid_t end) {
+        _cleanup_free_ char *name = NULL;
+        int r;
+
+        assert(table);
+        assert(start <= end);
+
+        name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_DOWN),
+                       " begin unavailable groups ",
+                       special_glyph(SPECIAL_GLYPH_ARROW_DOWN));
+        if (!name)
+                return log_oom();
+
+        r = table_add_many(
+                        table,
+                        TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_TOP),
+                        TABLE_STRING, name,
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_EMPTY,
+                        TABLE_GID, start,
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_STRING, "First unavailable group",
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_INT, -1); /* sort before any other entry with the same GID */
+        if (r < 0)
+                return table_log_add_error(r);
+
+        free(name);
+        name = strjoin(special_glyph(SPECIAL_GLYPH_ARROW_UP),
+                       " end unavailable groups ",
+                       special_glyph(SPECIAL_GLYPH_ARROW_UP));
+        if (!name)
+                return log_oom();
+
+        r = table_add_many(
+                        table,
+                        TABLE_STRING, special_glyph(SPECIAL_GLYPH_TREE_RIGHT),
+                        TABLE_STRING, name,
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_EMPTY,
+                        TABLE_GID, end,
+                        TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_STRING, "Last unavailable group",
+
TABLE_SET_COLOR, ansi_grey(),
+                        TABLE_INT, 1); /* sort after any other entry with the same GID */
+        if (r < 0)
+                return table_log_add_error(r);
+
+        return 2;
+}
+
+/* Verb handler that shows group records: the groups named in argv[1…] (each argument is looked up
+ * by GID if it parses as one, otherwise by name), or all groups if no argument is given.
+ *
+ * Returns 0 on success or a negative errno-style value. With explicit arguments a failed lookup
+ * is logged and processing continues with the next argument; the first such error is what is
+ * eventually returned. Failures to output a record, to iterate, or to print the table return
+ * immediately. */
+static int display_group(int argc, char *argv[], void *userdata) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        bool draw_separator = false;
+        int ret = 0, r;
+
+        /* No output mode selected so far: "friendly" for explicitly named groups, a table otherwise */
+        if (arg_output < 0)
+                arg_output = argc > 1 ? OUTPUT_FRIENDLY : OUTPUT_TABLE;
+
+        if (arg_output == OUTPUT_TABLE) {
+                table = table_new(" ", "name", "disposition", "gid", "description", "order");
+                if (!table)
+                        return log_oom();
+
+                (void) table_set_align_percent(table, table_get_cell(table, 0, 3), 100);
+                table_set_ersatz_string(table, TABLE_ERSATZ_DASH);
+                /* Sort on the "gid" column (3), then on the "order" column (5). The latter is left
+                 * out of the display list below, i.e. presumably only serves as sort key. */
+                (void) table_set_sort(table, (size_t) 3, (size_t) 5);
+                (void) table_set_display(table, (size_t) 0, (size_t) 1, (size_t) 2, (size_t) 3, (size_t) 4);
+        }
+
+        if (argc > 1)
+                STRV_FOREACH(i, argv + 1) {
+                        _cleanup_(group_record_unrefp) GroupRecord *gr = NULL;
+                        gid_t gid;
+
+                        if (parse_gid(*i, &gid) >= 0)
+                                r = groupdb_by_gid(gid, arg_userdb_flags, &gr);
+                        else
+                                r = groupdb_by_name(*i, arg_userdb_flags, &gr);
+                        if (r < 0) {
+                                if (r == -ESRCH)
+                                        log_error_errno(r, "Group %s does not exist.", *i);
+                                else if (r == -EHOSTDOWN)
+                                        log_error_errno(r, "Selected group database service is not available for this request.");
+                                else
+                                        log_error_errno(r, "Failed to find group %s: %m", *i);
+
+                                /* Remember only the first lookup error, but keep going */
+                                if (ret >= 0)
+                                        ret = r;
+                        } else {
+                                if (draw_separator && arg_output == OUTPUT_FRIENDLY)
+                                        putchar('\n');
+
+                                r = show_group(gr, table);
+                                if (r < 0)
+                                        return r;
+
+                                draw_separator = true;
+                        }
+                }
+        else {
+                _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+
+                r = groupdb_all(arg_userdb_flags, &iterator);
+                if (r == -ENOLINK)
+                        log_debug_errno(r, "No entries found. (Didn't check via Varlink.)");
+                else if (r == -ESRCH)
+                        log_debug_errno(r, "No entries found.");
+                else if (r < 0)
+                        return log_error_errno(r, "Failed to enumerate groups: %m");
+                else {
+                        for (;;) {
+                                _cleanup_(group_record_unrefp) GroupRecord *gr = NULL;
+
+                                r = groupdb_iterator_get(iterator, &gr);
+                                if (r == -ESRCH) /* treated as the end of the enumeration */
+                                        break;
+                                if (r == -EHOSTDOWN)
+                                        return log_error_errno(r, "Selected group database service is not available for this request.");
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed acquire next group: %m");
+
+                                if (draw_separator && arg_output == OUTPUT_FRIENDLY)
+                                        putchar('\n');
+
+                                r = show_group(gr, table);
+                                if (r < 0)
+                                        return r;
+
+                                draw_separator = true;
+                        }
+                }
+        }
+
+        if (table) {
+                _cleanup_(uid_range_freep) UidRange *gid_range = NULL;
+                int boundary_lines, gid_map_lines;
+
+                /* Failure to load the GID map is only logged at debug level; gid_range is then
+                 * passed on as it is (it was initialized to NULL above). */
+                r = uid_range_load_userns(&gid_range, "/proc/self/gid_map");
+                if (r < 0)
+                        log_debug_errno(r, "Failed to load /proc/self/gid_map, ignoring: %m");
+
+                boundary_lines = table_add_gid_boundaries(table, gid_range);
+                if (boundary_lines < 0)
+                        return boundary_lines;
+
+                gid_map_lines = table_add_uid_map(table, gid_range, add_unavailable_gid);
+                if (gid_map_lines < 0)
+                        return gid_map_lines;
+
+                if (table_get_rows(table) > 1) {
+                        r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend);
+                        if (r < 0)
+                                return table_log_print_error(r);
+                }
+
+                if (arg_legend) {
+                        size_t k;
+
+                        /* Number of actual group rows: all rows minus one (presumably the header
+                         * row), minus the marker rows added by the two helpers above. This is an
+                         * unsigned subtraction, so it is only right if both helpers return exactly
+                         * the number of rows they added. */
+                        k = table_get_rows(table) - 1 - boundary_lines - gid_map_lines;
+                        if (k > 0)
+                                printf("\n%zu groups listed.\n", k);
+                        else
+                                printf("No groups.\n");
+                }
+        }
+
+        return ret;
+}
+
+static int show_membership(const char *user, const char *group, Table *table) {
+        int r;
+
+        assert(user);
+        assert(group);
+
+        switch (arg_output) {
+
+        case OUTPUT_CLASSIC:
+                /* Strictly speaking there's no 'classic' output for this concept, but let's output it in
+                 * similar style to the classic output for user/group info */
+
+                printf("%s:%s\n", user, group);
+                break;
+
+        case OUTPUT_JSON: {
+
_cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("user", JSON_BUILD_STRING(user)), + JSON_BUILD_PAIR("group", JSON_BUILD_STRING(group)))); + if (r < 0) + return log_error_errno(r, "Failed to build JSON object: %m"); + + json_variant_dump(v, arg_json_format_flags, NULL, NULL); + break; + } + + case OUTPUT_FRIENDLY: + /* Hmm, this is not particularly friendly, but not sure how we could do this better */ + printf("%s: %s\n", group, user); + break; + + case OUTPUT_TABLE: + assert(table); + + r = table_add_many( + table, + TABLE_STRING, user, + TABLE_STRING, group); + if (r < 0) + return table_log_add_error(r); + + break; + + default: + assert_not_reached(); + } + + return 0; +} + +static int display_memberships(int argc, char *argv[], void *userdata) { + _cleanup_(table_unrefp) Table *table = NULL; + int ret = 0, r; + + if (arg_output < 0) + arg_output = OUTPUT_TABLE; + + if (arg_output == OUTPUT_TABLE) { + table = table_new("user", "group"); + if (!table) + return log_oom(); + + (void) table_set_sort(table, (size_t) 0, (size_t) 1); + } + + if (argc > 1) + STRV_FOREACH(i, argv + 1) { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + + if (streq(argv[0], "users-in-group")) { + r = membershipdb_by_group(*i, arg_userdb_flags, &iterator); + if (r < 0) + return log_error_errno(r, "Failed to enumerate users in group: %m"); + } else if (streq(argv[0], "groups-of-user")) { + r = membershipdb_by_user(*i, arg_userdb_flags, &iterator); + if (r < 0) + return log_error_errno(r, "Failed to enumerate groups of user: %m"); + } else + assert_not_reached(); + + for (;;) { + _cleanup_free_ char *user = NULL, *group = NULL; + + r = membershipdb_iterator_get(iterator, &user, &group); + if (r == -ESRCH) + break; + if (r == -EHOSTDOWN) + return log_error_errno(r, "Selected membership database service is not available for this request."); + if (r < 0) + return log_error_errno(r, "Failed acquire next 
membership: %m"); + + r = show_membership(user, group, table); + if (r < 0) + return r; + } + } + else { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + + r = membershipdb_all(arg_userdb_flags, &iterator); + if (r == -ENOLINK) + log_debug_errno(r, "No entries found. (Didn't check via Varlink.)"); + else if (r == -ESRCH) + log_debug_errno(r, "No entries found."); + else if (r < 0) + return log_error_errno(r, "Failed to enumerate memberships: %m"); + else { + for (;;) { + _cleanup_free_ char *user = NULL, *group = NULL; + + r = membershipdb_iterator_get(iterator, &user, &group); + if (r == -ESRCH) + break; + if (r == -EHOSTDOWN) + return log_error_errno(r, "Selected membership database service is not available for this request."); + if (r < 0) + return log_error_errno(r, "Failed acquire next membership: %m"); + + r = show_membership(user, group, table); + if (r < 0) + return r; + } + } + } + + if (table) { + if (table_get_rows(table) > 1) { + r = table_print_with_pager(table, arg_json_format_flags, arg_pager_flags, arg_legend); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend) { + if (table_get_rows(table) > 1) + printf("\n%zu memberships listed.\n", table_get_rows(table) - 1); + else + printf("No memberships.\n"); + } + } + + return ret; +} + +static int display_services(int argc, char *argv[], void *userdata) { + _cleanup_(table_unrefp) Table *t = NULL; + _cleanup_closedir_ DIR *d = NULL; + int r; + + d = opendir("/run/systemd/userdb/"); + if (!d) { + if (errno == ENOENT) { + log_info("No services."); + return 0; + } + + return log_error_errno(errno, "Failed to open /run/systemd/userdb/: %m"); + } + + t = table_new("service", "listening"); + if (!t) + return log_oom(); + + (void) table_set_sort(t, (size_t) 0); + + FOREACH_DIRENT(de, d, return -errno) { + _cleanup_free_ char *j = NULL, *no = NULL; + _cleanup_close_ int fd = -EBADF; + + j = path_join("/run/systemd/userdb/", de->d_name); + if (!j) + return log_oom(); + + fd 
= socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate AF_UNIX/SOCK_STREAM socket: %m"); + + r = connect_unix_path(fd, dirfd(d), de->d_name); + if (r < 0) { + no = strjoin("No (", errno_to_name(r), ")"); + if (!no) + return log_oom(); + } + + r = table_add_many(t, + TABLE_STRING, de->d_name, + TABLE_STRING, no ?: "yes", + TABLE_SET_COLOR, no ? ansi_highlight_red() : ansi_highlight_green()); + if (r < 0) + return table_log_add_error(r); + } + + if (table_get_rows(t) > 1) { + r = table_print_with_pager(t, arg_json_format_flags, arg_pager_flags, arg_legend); + if (r < 0) + return table_log_print_error(r); + } + + if (arg_legend && arg_output != OUTPUT_JSON) { + if (table_get_rows(t) > 1) + printf("\n%zu services listed.\n", table_get_rows(t) - 1); + else + printf("No services.\n"); + } + + return 0; +} + +static int ssh_authorized_keys(int argc, char *argv[], void *userdata) { + _cleanup_(user_record_unrefp) UserRecord *ur = NULL; + char **chain_invocation; + int r; + + assert(argc >= 2); + + if (arg_chain) { + /* If --chain is specified, the rest of the command line is the chain command */ + + if (argc < 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "No chain command line specified, refusing."); + + /* Make similar restrictions on the chain command as OpenSSH itself makes on the primary command. 
*/ + if (!path_is_absolute(argv[2])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Chain invocation of ssh-authorized-keys commands requires an absolute binary path argument."); + + if (!path_is_normalized(argv[2])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Chain invocation of ssh-authorized-keys commands requires an normalized binary path argument."); + + chain_invocation = argv + 2; + } else { + /* If --chain is not specified, then refuse any further arguments */ + + if (argc > 2) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many arguments."); + + chain_invocation = NULL; + } + + r = userdb_by_name(argv[1], arg_userdb_flags, &ur); + if (r == -ESRCH) + log_error_errno(r, "User %s does not exist.", argv[1]); + else if (r == -EHOSTDOWN) + log_error_errno(r, "Selected user database service is not available for this request."); + else if (r == -EINVAL) + log_error_errno(r, "Failed to find user %s: %m (Invalid user name?)", argv[1]); + else if (r < 0) + log_error_errno(r, "Failed to find user %s: %m", argv[1]); + else { + if (strv_isempty(ur->ssh_authorized_keys)) + log_debug("User record for %s has no public SSH keys.", argv[1]); + else + STRV_FOREACH(i, ur->ssh_authorized_keys) + printf("%s\n", *i); + + if (ur->incomplete) { + fflush(stdout); + log_warning("Warning: lacking rights to acquire privileged fields of user record of '%s', output incomplete.", ur->user_name); + } + } + + if (chain_invocation) { + if (DEBUG_LOGGING) { + _cleanup_free_ char *s = NULL; + + s = quote_command_line(chain_invocation, SHELL_ESCAPE_EMPTY); + if (!s) + return log_oom(); + + log_debug("Chain invoking: %s", s); + } + + fflush(stdout); + execv(chain_invocation[0], chain_invocation); + if (errno == ENOENT) /* Let's handle ENOENT gracefully */ + log_warning_errno(errno, "Chain executable '%s' does not exist, ignoring chain invocation.", chain_invocation[0]); + else { + log_error_errno(errno, "Failed to invoke chain executable '%s': %m", chain_invocation[0]); + 
if (r >= 0) + r = -errno; + } + } + + return r; +} + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("userdbctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND ...\n\n" + "%sShow user and group information.%s\n" + "\nCommands:\n" + " user [USER…] Inspect user\n" + " group [GROUP…] Inspect group\n" + " users-in-group [GROUP…] Show users that are members of specified groups\n" + " groups-of-user [USER…] Show groups the specified users are members of\n" + " services Show enabled database services\n" + " ssh-authorized-keys USER Show SSH authorized keys for user\n" + "\nOptions:\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + " --no-legend Do not show the headers and footers\n" + " --output=MODE Select output mode (classic, friendly, table, json)\n" + " -j Equivalent to --output=json\n" + " -s --service=SERVICE[:SERVICE…]\n" + " Query the specified service\n" + " --with-nss=BOOL Control whether to include glibc NSS data\n" + " -N Do not synthesize or include glibc NSS data\n" + " (Same as --synthesize=no --with-nss=no)\n" + " --synthesize=BOOL Synthesize root/nobody user\n" + " --with-dropin=BOOL Control whether to include drop-in records\n" + " --with-varlink=BOOL Control whether to talk to services at all\n" + " --multiplexer=BOOL Control whether to use the multiplexer\n" + " --json=pretty|short JSON output mode\n" + " --chain Chain another command\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_NO_LEGEND, + ARG_OUTPUT, + ARG_WITH_NSS, + ARG_WITH_DROPIN, + ARG_WITH_VARLINK, + ARG_SYNTHESIZE, + ARG_MULTIPLEXER, + ARG_JSON, + ARG_CHAIN, + }; + + static 
const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "no-legend", no_argument, NULL, ARG_NO_LEGEND }, + { "output", required_argument, NULL, ARG_OUTPUT }, + { "service", required_argument, NULL, 's' }, + { "with-nss", required_argument, NULL, ARG_WITH_NSS }, + { "with-dropin", required_argument, NULL, ARG_WITH_DROPIN }, + { "with-varlink", required_argument, NULL, ARG_WITH_VARLINK }, + { "synthesize", required_argument, NULL, ARG_SYNTHESIZE }, + { "multiplexer", required_argument, NULL, ARG_MULTIPLEXER }, + { "json", required_argument, NULL, ARG_JSON }, + { "chain", no_argument, NULL, ARG_CHAIN }, + {} + }; + + const char *e; + int r; + + assert(argc >= 0); + assert(argv); + + /* We are going to update this environment variable with our own, hence let's first read what is already set */ + e = getenv("SYSTEMD_ONLY_USERDB"); + if (e) { + char **l; + + l = strv_split(e, ":"); + if (!l) + return log_oom(); + + strv_free(arg_services); + arg_services = l; + } + + /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long() + * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */ + optind = 0; + + for (;;) { + int c; + + c = getopt_long(argc, argv, + arg_chain ? 
"+hjs:N" : "hjs:N", /* When --chain was used disable parsing of further switches */ + options, NULL); + if (c < 0) + break; + + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_NO_LEGEND: + arg_legend = false; + break; + + case ARG_OUTPUT: + if (isempty(optarg)) + arg_output = _OUTPUT_INVALID; + else if (streq(optarg, "classic")) + arg_output = OUTPUT_CLASSIC; + else if (streq(optarg, "friendly")) + arg_output = OUTPUT_FRIENDLY; + else if (streq(optarg, "json")) + arg_output = OUTPUT_JSON; + else if (streq(optarg, "table")) + arg_output = OUTPUT_TABLE; + else if (streq(optarg, "help")) { + puts("classic\n" + "friendly\n" + "json\n" + "table"); + return 0; + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid --output= mode: %s", optarg); + + arg_json_format_flags = arg_output == OUTPUT_JSON ? JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR_AUTO : JSON_FORMAT_OFF; + break; + + case ARG_JSON: + r = parse_json_argument(optarg, &arg_json_format_flags); + if (r <= 0) + return r; + + arg_output = FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF) ? 
_OUTPUT_INVALID : OUTPUT_JSON; + break; + + case 'j': + arg_json_format_flags = JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR_AUTO; + arg_output = OUTPUT_JSON; + break; + + case 's': + if (isempty(optarg)) + arg_services = strv_free(arg_services); + else { + _cleanup_strv_free_ char **l = NULL; + + l = strv_split(optarg, ":"); + if (!l) + return log_oom(); + + r = strv_extend_strv(&arg_services, l, true); + if (r < 0) + return log_oom(); + } + + break; + + case 'N': + arg_userdb_flags |= USERDB_EXCLUDE_NSS|USERDB_DONT_SYNTHESIZE; + break; + + case ARG_WITH_NSS: + r = parse_boolean_argument("--with-nss=", optarg, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_userdb_flags, USERDB_EXCLUDE_NSS, !r); + break; + + case ARG_WITH_DROPIN: + r = parse_boolean_argument("--with-dropin=", optarg, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_userdb_flags, USERDB_EXCLUDE_DROPIN, !r); + break; + + case ARG_WITH_VARLINK: + r = parse_boolean_argument("--with-varlink=", optarg, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_userdb_flags, USERDB_EXCLUDE_VARLINK, !r); + break; + + case ARG_SYNTHESIZE: + r = parse_boolean_argument("--synthesize=", optarg, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_userdb_flags, USERDB_DONT_SYNTHESIZE, !r); + break; + + case ARG_MULTIPLEXER: + r = parse_boolean_argument("--multiplexer=", optarg, NULL); + if (r < 0) + return r; + + SET_FLAG(arg_userdb_flags, USERDB_AVOID_MULTIPLEXER, !r); + break; + + case ARG_CHAIN: + arg_chain = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + } + + return 1; +} + +static int run(int argc, char *argv[]) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "user", VERB_ANY, VERB_ANY, VERB_DEFAULT, display_user }, + { "group", VERB_ANY, VERB_ANY, 0, display_group }, + { "users-in-group", VERB_ANY, VERB_ANY, 0, display_memberships }, + { "groups-of-user", VERB_ANY, VERB_ANY, 0, display_memberships }, + { "services", VERB_ANY, 1, 0, display_services 
}, + + /* This one is a helper for sshd_config's AuthorizedKeysCommand= setting, it's not a + * user-facing verb and thus should not appear in man pages or --help texts. */ + { "ssh-authorized-keys", 2, VERB_ANY, 0, ssh_authorized_keys }, + {} + }; + + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + if (arg_services) { + _cleanup_free_ char *e = NULL; + + e = strv_join(arg_services, ":"); + if (!e) + return log_oom(); + + if (setenv("SYSTEMD_ONLY_USERDB", e, true) < 0) + return log_error_errno(r, "Failed to set $SYSTEMD_ONLY_USERDB: %m"); + + log_info("Enabled services: %s", e); + } else + assert_se(unsetenv("SYSTEMD_ONLY_USERDB") == 0); + + return dispatch_verb(argc, argv, verbs, NULL); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/userdb/userdbd-manager.c b/src/userdb/userdbd-manager.c new file mode 100644 index 0000000..c1dfe47 --- /dev/null +++ b/src/userdb/userdbd-manager.c @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "sd-daemon.h" + +#include "common-signal.h" +#include "fd-util.h" +#include "fs-util.h" +#include "mkdir.h" +#include "process-util.h" +#include "set.h" +#include "signal-util.h" +#include "socket-util.h" +#include "stdio-util.h" +#include "umask-util.h" +#include "userdbd-manager.h" + +#define LISTEN_TIMEOUT_USEC (25 * USEC_PER_SEC) + +static int start_workers(Manager *m, bool explicit_request); + +static int on_worker_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + assert_se(!set_remove(m->workers_dynamic, s) != !set_remove(m->workers_fixed, s)); + sd_event_source_disable_unref(s); + + if (si->si_code == CLD_EXITED) { + if (si->si_status == EXIT_SUCCESS) + log_debug("Worker " PID_FMT " exited successfully.", si->si_pid); + else + log_warning("Worker " PID_FMT " died with a failure exit status %i, ignoring.", si->si_pid, si->si_status); + } else if (si->si_code == CLD_KILLED) + 
log_warning("Worker " PID_FMT " was killed by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status)); + else if (si->si_code == CLD_DUMPED) + log_warning("Worker " PID_FMT " dumped core by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status)); + else + log_warning("Can't handle SIGCHLD of this type"); + + (void) start_workers(m, /* explicit_request= */ false); /* Fill up workers again if we fell below the low watermark */ + return 0; +} + +static int on_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + (void) start_workers(m, /* explicit_request=*/ true); /* Workers told us there's more work, let's add one more worker as long as we are below the high watermark */ + return 0; +} + +static int on_deferred_start_worker(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + + assert(s); + + m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source); + + (void) start_workers(m, /* explicit_request=*/ false); + return 0; +} + +DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR( + event_source_hash_ops, + sd_event_source, + (void (*)(const sd_event_source*, struct siphash*)) trivial_hash_func, + (int (*)(const sd_event_source*, const sd_event_source*)) trivial_compare_func, + sd_event_source_disable_unref); + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + m = new(Manager, 1); + if (!m) + return -ENOMEM; + + *m = (Manager) { + .listen_fd = -EBADF, + .worker_ratelimit = { + .interval = 2 * USEC_PER_SEC, + .burst = 2500, + }, + }; + + r = sd_event_new(&m->event); + if (r < 0) + return r; + + r = sd_event_set_signal_exit(m->event, true); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, (SIGRTMIN+18)|SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL); + if (r < 0) + return r; + + r = sd_event_add_memory_pressure(m->event, NULL, NULL, 
NULL); + if (r < 0) + log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m"); + + r = sd_event_set_watchdog(m->event, true); + if (r < 0) + log_debug_errno(r, "Failed to enable watchdog handling, ignoring: %m"); + + r = sd_event_add_signal(m->event, NULL, SIGUSR2|SD_EVENT_SIGNAL_PROCMASK, on_sigusr2, m); + if (r < 0) + return r; + + *ret = TAKE_PTR(m); + return 0; +} + +Manager* manager_free(Manager *m) { + if (!m) + return NULL; + + set_free(m->workers_fixed); + set_free(m->workers_dynamic); + + m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source); + + safe_close(m->listen_fd); + + sd_event_unref(m->event); + + return mfree(m); +} + +static size_t manager_current_workers(Manager *m) { + assert(m); + + return set_size(m->workers_fixed) + set_size(m->workers_dynamic); +} + +static int start_one_worker(Manager *m) { + _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL; + bool fixed; + pid_t pid; + int r; + + assert(m); + + fixed = set_size(m->workers_fixed) < USERDB_WORKERS_MIN; + + r = safe_fork_full( + "(sd-worker)", + /* stdio_fds= */ NULL, + &m->listen_fd, 1, + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_CLOSE_ALL_FDS, + &pid); + if (r < 0) + return log_error_errno(r, "Failed to fork new worker child: %m"); + if (r == 0) { + char pids[DECIMAL_STR_MAX(pid_t)]; + /* Child */ + + if (m->listen_fd == 3) { + r = fd_cloexec(3, false); + if (r < 0) { + log_error_errno(r, "Failed to turn off O_CLOEXEC for fd 3: %m"); + _exit(EXIT_FAILURE); + } + } else { + if (dup2(m->listen_fd, 3) < 0) { /* dup2() creates with O_CLOEXEC off */ + log_error_errno(errno, "Failed to move listen fd to 3: %m"); + _exit(EXIT_FAILURE); + } + + safe_close(m->listen_fd); + } + + xsprintf(pids, PID_FMT, pid); + if (setenv("LISTEN_PID", pids, 1) < 0) { + log_error_errno(errno, "Failed to set $LISTEN_PID: %m"); + _exit(EXIT_FAILURE); + } + + if (setenv("LISTEN_FDS", "1", 1) < 0) 
{ + log_error_errno(errno, "Failed to set $LISTEN_FDS: %m"); + _exit(EXIT_FAILURE); + } + + + if (setenv("USERDB_FIXED_WORKER", one_zero(fixed), 1) < 0) { + log_error_errno(errno, "Failed to set $USERDB_FIXED_WORKER: %m"); + _exit(EXIT_FAILURE); + } + + /* execl("/home/lennart/projects/systemd/build/systemd-userwork", "systemd-userwork", "xxxxxxxxxxxxxxxx", NULL); /\* With some extra space rename_process() can make use of *\/ */ + /* execl("/usr/bin/valgrind", "valgrind", "/home/lennart/projects/systemd/build/systemd-userwork", "systemd-userwork", "xxxxxxxxxxxxxxxx", NULL); /\* With some extra space rename_process() can make use of *\/ */ + + execl(SYSTEMD_USERWORK_PATH, "systemd-userwork", "xxxxxxxxxxxxxxxx", NULL); /* With some extra space rename_process() can make use of */ + log_error_errno(errno, "Failed start worker process: %m"); + _exit(EXIT_FAILURE); + } + + r = sd_event_add_child(m->event, &source, pid, WEXITED, on_worker_exit, m); + if (r < 0) + return log_error_errno(r, "Failed to watch child " PID_FMT ": %m", pid); + + r = set_ensure_put( + fixed ? &m->workers_fixed : &m->workers_dynamic, + &event_source_hash_ops, + source); + if (r < 0) + return log_error_errno(r, "Failed to add child process to set: %m"); + + TAKE_PTR(source); + + return 0; +} + +static int start_workers(Manager *m, bool explicit_request) { + int r; + + assert(m); + + for (;;) { + size_t n; + + n = manager_current_workers(m); + if (n >= USERDB_WORKERS_MIN && (!explicit_request || n >= USERDB_WORKERS_MAX)) + break; + + if (!ratelimit_below(&m->worker_ratelimit)) { + + /* If we keep starting workers too often but none sticks, let's fail the whole + * daemon, something is wrong */ + if (n == 0) { + sd_event_exit(m->event, EXIT_FAILURE); + return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "Worker threads requested too frequently, but worker count is zero, something is wrong."); + } + + /* Otherwise, let's stop spawning more for a while. 
*/ + log_warning("Worker threads requested too frequently, not starting new ones for a while."); + + if (!m->deferred_start_worker_event_source) { + r = sd_event_add_time( + m->event, + &m->deferred_start_worker_event_source, + CLOCK_MONOTONIC, + ratelimit_end(&m->worker_ratelimit), + /* accuracy_usec= */ 0, + on_deferred_start_worker, + m); + if (r < 0) + return log_error_errno(r, "Failed to allocate deferred start worker event source: %m"); + } + + break; + } + + r = start_one_worker(m); + if (r < 0) + return r; + + explicit_request = false; + } + + return 0; +} + +int manager_startup(Manager *m) { + int n, r; + + assert(m); + assert(m->listen_fd < 0); + + n = sd_listen_fds(false); + if (n < 0) + return log_error_errno(n, "Failed to determine number of passed file descriptors: %m"); + if (n > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected one listening fd, got %i.", n); + if (n == 1) + m->listen_fd = SD_LISTEN_FDS_START; + else { + static const union sockaddr_union sockaddr = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/userdb/io.systemd.Multiplexer", + }; + + r = mkdir_p("/run/systemd/userdb", 0755); + if (r < 0) + return log_error_errno(r, "Failed to create /run/systemd/userdb: %m"); + + m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (m->listen_fd < 0) + return log_error_errno(errno, "Failed to bind on socket: %m"); + + (void) sockaddr_un_unlink(&sockaddr.un); + + WITH_UMASK(0000) + if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0) + return log_error_errno(errno, "Failed to bind socket: %m"); + + r = symlink_idempotent("io.systemd.Multiplexer", + "/run/systemd/userdb/io.systemd.NameServiceSwitch", false); + if (r < 0) + return log_error_errno(r, "Failed to bind io.systemd.Multiplexer: %m"); + + r = symlink_idempotent("io.systemd.Multiplexer", + "/run/systemd/userdb/io.systemd.DropIn", false); + if (r < 0) + return log_error_errno(r, "Failed to bind io.systemd.Multiplexer: %m"); + + if 
(listen(m->listen_fd, SOMAXCONN_DELUXE) < 0) + return log_error_errno(errno, "Failed to listen on socket: %m"); + } + + /* Let's make sure every accept() call on this socket times out after 25s. This allows workers to be + * GC'ed on idle */ + if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0) + return log_error_errno(errno, "Failed to se SO_RCVTIMEO: %m"); + + return start_workers(m, /* explicit_request= */ false); +} diff --git a/src/userdb/userdbd-manager.h b/src/userdb/userdbd-manager.h new file mode 100644 index 0000000..c39f79d --- /dev/null +++ b/src/userdb/userdbd-manager.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-event.h" + +typedef struct Manager Manager; + +#include "hashmap.h" +#include "ratelimit.h" + +#define USERDB_WORKERS_MIN 3 +#define USERDB_WORKERS_MAX 4096 + +struct Manager { + sd_event *event; + + Set *workers_fixed; /* Workers 0…USERDB_WORKERS_MIN */ + Set *workers_dynamic; /* Workers USERD_WORKERS_MIN+1…USERDB_WORKERS_MAX */ + + int listen_fd; + + RateLimit worker_ratelimit; + + sd_event_source *deferred_start_worker_event_source; +}; + +int manager_new(Manager **ret); +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_startup(Manager *m); diff --git a/src/userdb/userdbd.c b/src/userdb/userdbd.c new file mode 100644 index 0000000..89ac9c7 --- /dev/null +++ b/src/userdb/userdbd.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "daemon-util.h" +#include "userdbd-manager.h" +#include "log.h" +#include "main-func.h" +#include "signal-util.h" + +/* This service offers two Varlink services, both implementing io.systemd.UserDatabase: + * + * → io.systemd.NameServiceSwitch: this is a compatibility interface for glibc NSS: it responds to + * name lookups by checking the classic NSS interfaces and 
responding that. + * + * → io.systemd.Multiplexer: this multiplexes lookup requests to all Varlink services that have a + * socket in /run/systemd/userdb/. It's supposed to simplify clients that don't want to implement + * the full iterative logic on their own. + * + * → io.systemd.DropIn: this makes JSON user/group records dropped into /run/userdb/ available as + * regular users. + */ + +static int run(int argc, char *argv[]) { + _cleanup_(manager_freep) Manager *m = NULL; + _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL; + int r; + + log_setup(); + + umask(0022); + + if (argc != 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments."); + + if (setenv("SYSTEMD_BYPASS_USERDB", "io.systemd.NameServiceSwitch:io.systemd.Multiplexer:io.systemd.DropIn", 1) < 0) + return log_error_errno(errno, "Failed to set $SYSTEMD_BYPASS_USERDB: %m"); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Could not create manager: %m"); + + r = manager_startup(m); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + + notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/userdb/userwork.c b/src/userdb/userwork.c new file mode 100644 index 0000000..b49dbbd --- /dev/null +++ b/src/userdb/userwork.c @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "sd-daemon.h" + +#include "env-util.h" +#include "fd-util.h" +#include "group-record.h" +#include "io-util.h" +#include "main-func.h" +#include "process-util.h" +#include "strv.h" +#include "time-util.h" +#include "user-record-nss.h" +#include "user-record.h" +#include "user-util.h" +#include "userdb.h" +#include "varlink.h" +#include "varlink-io.systemd.UserDatabase.h" + 
+#define ITERATIONS_MAX 64U +#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE) +#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC) +#define CONNECTION_IDLE_USEC (15 * USEC_PER_SEC) +#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC) + +typedef struct LookupParameters { + const char *user_name; + const char *group_name; + union { + uid_t uid; + gid_t gid; + }; + const char *service; +} LookupParameters; + +static int add_nss_service(JsonVariant **v) { + _cleanup_(json_variant_unrefp) JsonVariant *status = NULL, *z = NULL; + sd_id128_t mid; + int r; + + assert(v); + + /* Patch in service field if it's missing. The assumption here is that this field is unset only for + * NSS records */ + + if (json_variant_by_key(*v, "service")) + return 0; + + r = sd_id128_get_machine(&mid); + if (r < 0) + return r; + + status = json_variant_ref(json_variant_by_key(*v, "status")); + z = json_variant_ref(json_variant_by_key(status, SD_ID128_TO_STRING(mid))); + + if (json_variant_by_key(z, "service")) + return 0; + + r = json_variant_set_field_string(&z, "service", "io.systemd.NameServiceSwitch"); + if (r < 0) + return r; + + r = json_variant_set_field(&status, SD_ID128_TO_STRING(mid), z); + if (r < 0) + return r; + + return json_variant_set_field(v, "status", status); +} + +static int build_user_json(Varlink *link, UserRecord *ur, JsonVariant **ret) { + _cleanup_(user_record_unrefp) UserRecord *stripped = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + UserRecordLoadFlags flags; + uid_t peer_uid; + bool trusted; + int r; + + assert(ur); + assert(ret); + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) { + log_debug_errno(r, "Unable to query peer UID, ignoring: %m"); + trusted = false; + } else + trusted = peer_uid == 0 || peer_uid == ur->uid; + + flags = USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_ALLOW_BINDING|USER_RECORD_STRIP_SECRET|USER_RECORD_ALLOW_STATUS|USER_RECORD_ALLOW_SIGNATURE|USER_RECORD_PERMISSIVE; + if (trusted) + flags |= 
USER_RECORD_ALLOW_PRIVILEGED; + else + flags |= USER_RECORD_STRIP_PRIVILEGED; + + r = user_record_clone(ur, flags, &stripped); + if (r < 0) + return r; + + stripped->incomplete = + ur->incomplete || + (FLAGS_SET(ur->mask, USER_RECORD_PRIVILEGED) && + !FLAGS_SET(stripped->mask, USER_RECORD_PRIVILEGED)); + + v = json_variant_ref(stripped->json); + r = add_nss_service(&v); + if (r < 0) + return r; + + return json_build(ret, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v)), + JSON_BUILD_PAIR("incomplete", JSON_BUILD_BOOLEAN(stripped->incomplete)))); +} + +static int userdb_flags_from_service(Varlink *link, const char *service, UserDBFlags *ret) { + assert(link); + assert(ret); + + if (streq_ptr(service, "io.systemd.NameServiceSwitch")) + *ret = USERDB_NSS_ONLY|USERDB_AVOID_MULTIPLEXER; + else if (streq_ptr(service, "io.systemd.DropIn")) + *ret = USERDB_DROPIN_ONLY|USERDB_AVOID_MULTIPLEXER; + else if (streq_ptr(service, "io.systemd.Multiplexer")) + *ret = USERDB_AVOID_MULTIPLEXER; + else + return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL); + + return 0; +} + +static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { + + static const JsonDispatch dispatch_table[] = { + { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 }, + { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 }, + { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + _cleanup_(user_record_unrefp) UserRecord *hr = NULL; + LookupParameters p = { + .uid = UID_INVALID, + }; + UserDBFlags userdb_flags; + int r; + + assert(parameters); + + r = varlink_dispatch(link, parameters, dispatch_table, &p); + if (r != 0) + return r; + + r = userdb_flags_from_service(link, p.service, &userdb_flags); + if (r != 
0) /* return value of < 0 means error (as usual); > 0 means 'already processed and replied, + * we are done'; == 0 means 'not processed, caller should process now' */ + return r; + + if (uid_is_valid(p.uid)) + r = userdb_by_uid(p.uid, userdb_flags, &hr); + else if (p.user_name) + r = userdb_by_name(p.user_name, userdb_flags, &hr); + else { + _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *last = NULL; + + r = userdb_all(userdb_flags, &iterator); + if (IN_SET(r, -ESRCH, -ENOLINK)) + /* We turn off Varlink lookups in various cases (e.g. in case we only enable DropIn + * backend) — this might make userdb_all return ENOLINK (which indicates that varlink + * was off and no other suitable source or entries were found). Let's hide this + * implementation detail and always return NoRecordFound in this case, since from a + * client's perspective it's irrelevant if there was no entry at all or just not on + * the service that the query was limited to. 
*/ + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) + return r; + + for (;;) { + _cleanup_(user_record_unrefp) UserRecord *z = NULL; + + r = userdb_iterator_get(iterator, &z); + if (r == -ESRCH) + break; + if (r < 0) + return r; + + if (last) { + r = varlink_notify(link, last); + if (r < 0) + return r; + + last = json_variant_unref(last); + } + + r = build_user_json(link, z, &last); + if (r < 0) + return r; + } + + if (!last) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + + return varlink_reply(link, last); + } + if (r == -ESRCH) + return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); + if (r < 0) { + log_debug_errno(r, "User lookup failed abnormally: %m"); + return varlink_error(link, "io.systemd.UserDatabase.ServiceNotAvailable", NULL); + } + + if ((uid_is_valid(p.uid) && hr->uid != p.uid) || + (p.user_name && !streq(hr->user_name, p.user_name))) + return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL); + + r = build_user_json(link, hr, &v); + if (r < 0) + return r; + + return varlink_reply(link, v); +} + +static int build_group_json(Varlink *link, GroupRecord *gr, JsonVariant **ret) { + _cleanup_(group_record_unrefp) GroupRecord *stripped = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + UserRecordLoadFlags flags; + uid_t peer_uid; + bool trusted; + int r; + + assert(gr); + assert(ret); + + r = varlink_get_peer_uid(link, &peer_uid); + if (r < 0) { + log_debug_errno(r, "Unable to query peer UID, ignoring: %m"); + trusted = false; + } else + trusted = peer_uid == 0; + + flags = USER_RECORD_REQUIRE_REGULAR|USER_RECORD_ALLOW_PER_MACHINE|USER_RECORD_ALLOW_BINDING|USER_RECORD_STRIP_SECRET|USER_RECORD_ALLOW_STATUS|USER_RECORD_ALLOW_SIGNATURE|USER_RECORD_PERMISSIVE; + if (trusted) + flags |= USER_RECORD_ALLOW_PRIVILEGED; + else + flags |= USER_RECORD_STRIP_PRIVILEGED; + + r = group_record_clone(gr, flags, &stripped); + if (r < 0) + 
return r;
+
+        stripped->incomplete =
+                gr->incomplete ||
+                (FLAGS_SET(gr->mask, USER_RECORD_PRIVILEGED) &&
+                 !FLAGS_SET(stripped->mask, USER_RECORD_PRIVILEGED));
+
+        v = json_variant_ref(gr->json);
+        r = add_nss_service(&v);
+        if (r < 0)
+                return r;
+
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                          JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v)),
+                                          JSON_BUILD_PAIR("incomplete", JSON_BUILD_BOOLEAN(stripped->incomplete))));
+}
+
+/* Varlink handler for io.systemd.UserDatabase.GetGroupRecord (bound in run()).
+ *
+ * Accepts the optional parameters "gid", "groupName" and "service". The lookup is done by GID if a valid
+ * one was passed, otherwise by name, otherwise all groups are enumerated. When enumerating, every record
+ * except the final one is sent with varlink_notify() and the final one with varlink_reply(), which is why
+ * each record is held back in 'last' until the next one (or the end of the iteration) is known.
+ *
+ * Error replies sent to the client:
+ *   NoRecordFound           - lookup/iteration returned -ESRCH (or -ENOLINK when starting the enumeration),
+ *                             or the enumeration produced nothing
+ *   ServiceNotAvailable     - a single-record lookup failed with any other error
+ *   ConflictingRecordFound  - the record found does not match the requested gid and/or name
+ *
+ * Returns whatever the varlink reply/error call returns, or a negative errno on internal failure. */
+static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "gid",       JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, gid),        0 },
+                { "groupName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
+                { "service",   JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),    0 },
+                {}
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(group_record_unrefp) GroupRecord *g = NULL;
+        LookupParameters p = {
+                .gid = GID_INVALID,
+        };
+        UserDBFlags userdb_flags;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        r = userdb_flags_from_service(link, p.service, &userdb_flags);
+        if (r != 0)
+                return r;
+
+        if (gid_is_valid(p.gid))
+                r = groupdb_by_gid(p.gid, userdb_flags, &g);
+        else if (p.group_name)
+                r = groupdb_by_name(p.group_name, userdb_flags, &g);
+        else {
+                _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+                _cleanup_(json_variant_unrefp) JsonVariant *last = NULL;
+
+                r = groupdb_all(userdb_flags, &iterator);
+                if (IN_SET(r, -ESRCH, -ENOLINK))
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        _cleanup_(group_record_unrefp) GroupRecord *z = NULL;
+
+                        r = groupdb_iterator_get(iterator, &z);
+                        if (r == -ESRCH)
+                                break;
+                        if (r < 0)
+                                return r;
+
+                        /* Flush the previously built record as a notification now that we know it is not
+                         * the final one. */
+                        if (last) {
+                                r = varlink_notify(link, last);
+                                if (r < 0)
+                                        return r;
+
+                                last = json_variant_unref(last);
+                        }
+
+                        r = build_group_json(link, z, &last);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (!last)
+                        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+                return varlink_reply(link, last);
+        }
+        if (r == -ESRCH)
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+        if (r < 0) {
+                log_debug_errno(r, "Group lookup failed abnormally: %m");
+                return varlink_error(link, "io.systemd.UserDatabase.ServiceNotAvailable", NULL);
+        }
+
+        /* If both a GID and a name were requested, the record we found must match both. */
+        if ((gid_is_valid(p.gid) && g->gid != p.gid) ||
+            (p.group_name && !streq(g->group_name, p.group_name)))
+                return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        r = build_group_json(link, g, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_reply(link, v);
+}
+
+/* Varlink handler for io.systemd.UserDatabase.GetMemberships (bound in run()).
+ *
+ * Accepts the optional parameters "userName", "groupName" and "service". The membership iterator is
+ * created by group if a group name was passed, otherwise by user if a user name was passed, otherwise
+ * over all memberships. Each membership is sent as a { "userName", "groupName" } object; all but the final
+ * one via varlink_notifyb(), the final one via varlink_replyb(). To make that possible the most recent
+ * pair is held back in last_user_name/last_group_name.
+ *
+ * Replies with NoRecordFound if the iterator cannot be created due to -ESRCH/-ENOLINK or if no membership
+ * passes the filter. Returns a negative errno on internal failure. */
+static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch dispatch_table[] = {
+                { "userName",  JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name),  0 },
+                { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
+                { "service",   JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service),    0 },
+                {}
+        };
+
+        _cleanup_free_ char *last_user_name = NULL, *last_group_name = NULL;
+        _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+        LookupParameters p = {};
+        UserDBFlags userdb_flags;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        r = userdb_flags_from_service(link, p.service, &userdb_flags);
+        if (r != 0)
+                return r;
+
+        if (p.group_name)
+                r = membershipdb_by_group(p.group_name, userdb_flags, &iterator);
+        else if (p.user_name)
+                r = membershipdb_by_user(p.user_name, userdb_flags, &iterator);
+        else
+                r = membershipdb_all(userdb_flags, &iterator);
+        if (IN_SET(r, -ESRCH, -ENOLINK))
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_free_ char *user_name = NULL, *group_name = NULL;
+
+                r = membershipdb_iterator_get(iterator, &user_name, &group_name);
+                if (r == -ESRCH)
+                        break;
+                if (r < 0)
+                        return r;
+
+                /* If both group + user are specified do a-posteriori filtering. In that case the iterator
+                 * above was created by group, hence the group already matches and it is the user name
+                 * that still has to be checked. */
+                if (p.group_name && p.user_name && !streq(user_name, p.user_name))
+                        continue;
+
+                if (last_user_name) {
+                        assert(last_group_name);
+
+                        r = varlink_notifyb(link, JSON_BUILD_OBJECT(
+                                                            JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(last_user_name)),
+                                                            JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last_group_name))));
+                        if (r < 0)
+                                return r;
+                }
+
+                free_and_replace(last_user_name, user_name);
+                free_and_replace(last_group_name, group_name);
+        }
+
+        if (!last_user_name) {
+                assert(!last_group_name);
+                return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+        }
+
+        assert(last_group_name);
+
+        return varlink_replyb(link, JSON_BUILD_OBJECT(
+                                              JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(last_user_name)),
+                                              JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last_group_name))));
+}
+
+/* Registers the accepted socket 'fd' as a connection on 'server' and then processes it synchronously,
+ * until the peer disconnects (varlink_process() returns -ENOTCONN) or nothing happened for
+ * CONNECTION_IDLE_USEC (varlink_wait() returns 0).
+ *
+ * If the connection cannot be added, 'fd' is closed here. Returns 0 on orderly termination, or a negative
+ * errno (already logged) on failure. */
+static int process_connection(VarlinkServer *server, int fd) {
+        _cleanup_(varlink_close_unrefp) Varlink *vl = NULL;
+        int r;
+
+        r = varlink_server_add_connection(server, fd, &vl);
+        if (r < 0) {
+                fd = safe_close(fd);
+                return log_error_errno(r, "Failed to add connection: %m");
+        }
+
+        /* NOTE(review): presumably the pointer returned above is owned by the server, so we take our own
+         * reference for the cleanup handler to drop - confirm against varlink_server_add_connection(). */
+        vl = varlink_ref(vl);
+
+        for (;;) {
+                r = varlink_process(vl);
+                if (r == -ENOTCONN) {
+                        log_debug("Connection terminated.");
+                        break;
+                }
+                if (r < 0)
+                        return log_error_errno(r, "Failed to process connection: %m");
+                if (r > 0)
+                        continue;
+
+                r = varlink_wait(vl, CONNECTION_IDLE_USEC);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to wait for connection events: %m");
+                if (r == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+/* Worker main function.
+ *
+ * Expects exactly one listening socket to be passed in (sd_listen_fds()), switches it to blocking mode,
+ * sets up a Varlink server implementing the io.systemd.UserDatabase interface and then accepts and serves
+ * connections one at a time. The loop ends, and 0 is returned, when
+ *   - more than ITERATIONS_MAX iterations were done,
+ *   - the worker ran for RUNTIME_MAX_USEC, or
+ *   - no connection came in for LISTEN_IDLE_USEC (disabled if $USERDB_FIXED_WORKER is true).
+ *
+ * If accept4() returned within PRESSURE_SLEEP_TIME_USEC and another connection is already pending, the
+ * parent process is sent SIGUSR2 to ask for more workers. Errors from process_connection() are
+ * deliberately ignored, so that one bad connection does not end the worker. */
+static int run(int argc, char *argv[]) {
+        usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
+        _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
+        unsigned n_iterations = 0;
+        int m, listen_fd, r;
+
+        log_setup();
+
+        m = sd_listen_fds(false);
+        if (m < 0)
+                return log_error_errno(m, "Failed to determine number of listening fds: %m");
+        if (m == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received.");
+        if (m > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time.");
+
+        listen_fd = SD_LISTEN_FDS_START;
+
+        r = fd_nonblock(listen_fd, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m");
+
+        r = varlink_server_new(&server, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate server: %m");
+
+        r = varlink_server_add_interface(server, &vl_interface_io_systemd_UserDatabase);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add UserDatabase interface to varlink server: %m");
+
+        r = varlink_server_bind_method_many(
+                        server,
+                        "io.systemd.UserDatabase.GetUserRecord",  vl_method_get_user_record,
+                        "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+                        "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships);
+        if (r < 0)
+                return log_error_errno(r, "Failed to bind methods: %m");
+
+        r = getenv_bool("USERDB_FIXED_WORKER");
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse USERDB_FIXED_WORKER: %m");
+        listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC;
+
+        r = userdb_block_nss_systemd(true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to disable userdb NSS compatibility: %m");
+
+        start_time = now(CLOCK_MONOTONIC);
+
+        for (;;) {
+                _cleanup_close_ int fd = -EBADF;
+                usec_t n;
+
+                /* Exit the worker in regular intervals, to flush out all memory use */
+                if (n_iterations++ > ITERATIONS_MAX) {
+                        log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations);
+                        break;
+                }
+
+                n = now(CLOCK_MONOTONIC);
+                if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) {
+                        log_debug("Exiting worker, ran for %s, that's enough.",
+                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0));
+                        break;
+                }
+
+                /* last_busy_usec is reset to USEC_INFINITY after each served connection, so the idle
+                 * period is measured from the first loop iteration following it. */
+                if (last_busy_usec == USEC_INFINITY)
+                        last_busy_usec = n;
+                else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) {
+                        log_debug("Exiting worker, been idle for %s.",
+                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0));
+                        break;
+                }
+
+                (void) rename_process("systemd-userwork: waiting...");
+                fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC));
+                (void) rename_process("systemd-userwork: processing...");
+
+                if (fd == -EAGAIN)
+                        continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected
+                                   * after a while, let's check if it's time to exit though. */
+                if (fd == -EINTR)
+                        continue; /* Might be that somebody attached via strace, let's just continue in that
+                                   * case */
+                if (fd < 0)
+                        return log_error_errno(fd, "Failed to accept() from listening socket: %m");
+
+                if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) {
+                        /* We only slept a very short time? If so, let's see if there are more sockets
+                         * pending, and if so, let's ask our parent for more workers */
+
+                        r = fd_wait_for_event(listen_fd, POLLIN, 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
+
+                        if (FLAGS_SET(r, POLLIN)) {
+                                pid_t parent;
+
+                                parent = getppid();
+                                if (parent <= 1)
+                                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent already died?");
+
+                                if (kill(parent, SIGUSR2) < 0)
+                                        return log_error_errno(errno, "Failed to kill our own parent: %m");
+                        }
+                }
+
+                (void) process_connection(server, TAKE_FD(fd));
+                last_busy_usec = USEC_INFINITY;
+        }
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/varlinkctl/meson.build b/src/varlinkctl/meson.build
new file mode 100644
index 0000000..c1074dc
--- /dev/null
+++ b/src/varlinkctl/meson.build
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+varlinkctl_sources = files(
+        'varlinkctl.c',
+)
+
+executables += [
+        executable_template + {
+                'name' : 'varlinkctl',
+                'public' : true,
+                'sources' : varlinkctl_sources,
+        },
+]
diff --git a/src/varlinkctl/varlinkctl.c b/src/varlinkctl/varlinkctl.c
new file mode 100644
index 0000000..64105c7
--- /dev/null
+++ b/src/varlinkctl/varlinkctl.c
@@ -0,0 +1,529 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+
+#include "build.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "main-func.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "terminal-util.h"
+#include "varlink.h"
+#include "verbs.h"
+#include "version.h"
+
+/* Command line state, filled in by parse_argv() */
+static JsonFormatFlags arg_json_format_flags = JSON_FORMAT_OFF;
+static PagerFlags arg_pager_flags = 0;
+static VarlinkMethodFlags arg_method_flags = 0;
+
+/* Prints the usage text to stdout (through the pager, subject to arg_pager_flags). The positional printf
+ * arguments are: 1=program name, 2=man page link, 3/4=underline on/off, 5/6=highlight on/off.
+ * Returns 0, or the result of log_oom() if the man page link cannot be generated. */
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        r = terminal_urlify_man("varlinkctl", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        pager_open(arg_pager_flags);
+
+        printf("%1$s [OPTIONS...] COMMAND ...\n\n"
+               "%5$sIntrospect Varlink Services.%6$s\n"
+               "\n%3$sCommands:%4$s\n"
+               " info ADDRESS Show service information\n"
+               " list-interfaces ADDRESS\n"
+               " List interfaces implemented by service\n"
+               " introspect ADDRESS INTERFACE\n"
+               " Show interface definition\n"
+               " call ADDRESS METHOD [PARAMS]\n"
+               " Invoke method\n"
+               " validate-idl [FILE] Validate interface description\n"
+               " help Show this help\n"
+               "\n%3$sOptions:%4$s\n"
+               " -h --help Show this help\n"
+               " --version Show package version\n"
+               " --no-pager Do not pipe output into a pager\n"
+               " --more Request multiple responses\n"
+               " --oneway Do not request response\n"
+               " --json=MODE Output as JSON\n"
+               " -j Same as --json=pretty on tty, --json=short otherwise\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+/* Verb wrapper around help(); the verb arguments are ignored. */
+static int verb_help(int argc, char **argv, void *userdata) {
+        return help();
+}
+
+/* Parses the command line options into the arg_* variables above.
+ *
+ * --more and --oneway are mutually exclusive: each one clears the other's method flag, so the option
+ * given last wins. If --more ends up set, JSON-SEQ output is turned on as well.
+ *
+ * Returns 1 if option parsing completed; the result of help()/version() for -h/--version; the result of
+ * parse_json_argument() if that is <= 0; -EINVAL for an unknown option. */
+static int parse_argv(int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_MORE,
+                ARG_ONEWAY,
+                ARG_JSON,
+        };
+
+        static const struct option options[] = {
+                { "help",     no_argument,       NULL, 'h'          },
+                { "version",  no_argument,       NULL, ARG_VERSION  },
+                { "no-pager", no_argument,       NULL, ARG_NO_PAGER },
+                { "more",     no_argument,       NULL, ARG_MORE     },
+                { "oneway",   no_argument,       NULL, ARG_ONEWAY   },
+                { "json",     required_argument, NULL, ARG_JSON     },
+                {},
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "hj", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_MORE:
+                        arg_method_flags = (arg_method_flags & ~VARLINK_METHOD_ONEWAY) | VARLINK_METHOD_MORE;
+                        break;
+
+                case ARG_ONEWAY:
+                        arg_method_flags = (arg_method_flags & ~VARLINK_METHOD_MORE) | VARLINK_METHOD_ONEWAY;
+                        break;
+
+                case ARG_JSON:
+                        r = parse_json_argument(optarg, &arg_json_format_flags);
+                        if (r <= 0)
+                                return r;
+
+                        break;
+
+                case 'j':
+                        arg_json_format_flags = JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_COLOR_AUTO;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        /* If more than one reply is expected, imply JSON-SEQ output */
+        if (FLAGS_SET(arg_method_flags, VARLINK_METHOD_MORE))
+                arg_json_format_flags |= JSON_FORMAT_SEQ;
+
+        return 1;
+}
+
+/* Connects to the Varlink service designated by 'where' and returns the connection in *ret.
+ *
+ * A string starting with "/" or "./" is treated as a file system path: an AF_UNIX socket inode is
+ * connected to (through its /proc fd path, so the inode that was stat'ed is the one connected to), a
+ * regular file with any execute bit set is spawned as a child process. Anything else is handed to
+ * varlink_connect_url().
+ *
+ * Returns 0 on success, or a negative errno (already logged) on failure. */
+static int varlink_connect_auto(Varlink **ret, const char *where) {
+        int r;
+
+        assert(ret);
+        assert(where);
+
+        if (STARTSWITH_SET(where, "/", "./")) { /* If the string starts with a slash or dot slash we use it as a file system path */
+                _cleanup_close_ int fd = -EBADF;
+                struct stat st;
+
+                fd = open(where, O_PATH|O_CLOEXEC);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open '%s': %m", where);
+
+                if (fstat(fd, &st) < 0)
+                        return log_error_errno(errno, "Failed to stat '%s': %m", where);
+
+                /* Is this a socket in the fs? Then connect() to it. */
+                if (S_ISSOCK(st.st_mode)) {
+                        r = varlink_connect_address(ret, FORMAT_PROC_FD_PATH(fd));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to connect to '%s': %m", where);
+
+                        return 0;
+                }
+
+                /* Is this an executable binary? Then fork it off. */
+                if (S_ISREG(st.st_mode) && (st.st_mode & 0111)) {
+                        r = varlink_connect_exec(ret, where, STRV_MAKE(where)); /* Ideally we'd use FORMAT_PROC_FD_PATH(fd) here too, but that breaks the #! logic */
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to spawn '%s' process: %m", where);
+
+                        return 0;
+                }
+
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unrecognized path '%s' is neither an AF_UNIX socket, nor an executable binary.", where);
+        }
+
+        /* Otherwise assume this is an URL */
+        r = varlink_connect_url(ret, where);
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect to URL '%s': %m", where);
+
+        return 0;
+}
+
+/* Destination for dispatching a org.varlink.service.GetInfo reply in verb_info(). The string fields are
+ * filled via json_dispatch_const_string() and are not freed by get_info_data_done(); only 'interfaces'
+ * (filled via json_dispatch_strv()) is released there. */
+typedef struct GetInfoData {
+        const char *vendor;
+        const char *product;
+        const char *version;
+        const char *url;
+        char **interfaces;
+} GetInfoData;
+
+/* Releases the resources held by a GetInfoData, i.e. the 'interfaces' string list. */
+static void get_info_data_done(GetInfoData *d) {
+        assert(d);
+
+        d->interfaces = strv_free(d->interfaces);
+}
+
+/* Implements the verbs that are served from org.varlink.service.GetInfo; the code distinguishes
+ * "list-interfaces" from everything else by looking at argv[0].
+ *
+ * Without JSON output the reply is dispatched into a GetInfoData and shown either as a sorted plain list
+ * of interface names (list-interfaces) or as a vertical table. With JSON output the reply (or just its
+ * "interfaces" member for list-interfaces) is dumped as is.
+ *
+ * argv[1] is the service address. Returns 0 on success, a negative errno (already logged) on failure. */
+static int verb_info(int argc, char *argv[], void *userdata) {
+        _cleanup_(varlink_unrefp) Varlink *vl = NULL;
+        const char *url;
+        int r;
+
+        assert(argc == 2);
+        url = argv[1];
+
+        r = varlink_connect_auto(&vl, url);
+        if (r < 0)
+                return r;
+
+        /* NOTE(review): 'reply' and 'error' are never released in this function, presumably they are
+         * owned by the connection object - confirm against varlink_call(). */
+        JsonVariant *reply = NULL;
+        const char *error = NULL;
+        r = varlink_call(vl, "org.varlink.service.GetInfo", NULL, &reply, &error, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to issue GetInfo() call: %m");
+        if (error)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADE), "Method call GetInfo() failed: %s", error);
+
+        pager_open(arg_pager_flags);
+
+        if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                static const struct JsonDispatch dispatch_table[] = {
+                        { "vendor",     JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(GetInfoData, vendor),     JSON_MANDATORY },
+                        { "product",    JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(GetInfoData, product),    JSON_MANDATORY },
+                        { "version",    JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(GetInfoData, version),    JSON_MANDATORY },
+                        { "url",        JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(GetInfoData, url),        JSON_MANDATORY },
+                        { "interfaces", JSON_VARIANT_ARRAY,  json_dispatch_strv,         offsetof(GetInfoData, interfaces), JSON_MANDATORY },
+                        {}
+                };
+                _cleanup_(get_info_data_done) GetInfoData data = {};
+
+                r = json_dispatch(reply, dispatch_table, JSON_LOG, &data);
+                if (r < 0)
+                        return r;
+
+                strv_sort(data.interfaces);
+
+                if (streq_ptr(argv[0], "list-interfaces")) {
+                        STRV_FOREACH(i, data.interfaces)
+                                puts(*i);
+                } else {
+                        _cleanup_(table_unrefp) Table *t = NULL;
+
+                        t = table_new_vertical();
+                        if (!t)
+                                return log_oom();
+
+                        r = table_add_many(
+                                        t,
+                                        TABLE_FIELD, "Vendor",
+                                        TABLE_STRING, data.vendor,
+                                        TABLE_FIELD, "Product",
+                                        TABLE_STRING, data.product,
+                                        TABLE_FIELD, "Version",
+                                        TABLE_STRING, data.version,
+                                        TABLE_FIELD, "URL",
+                                        TABLE_STRING, data.url,
+                                        TABLE_SET_URL, data.url,
+                                        TABLE_FIELD, "Interfaces",
+                                        TABLE_STRV, data.interfaces);
+                        if (r < 0)
+                                return table_log_add_error(r);
+
+                        r = table_print(t, NULL);
+                        if (r < 0)
+                                return table_log_print_error(r);
+                }
+        } else {
+                JsonVariant *v;
+
+                v = streq_ptr(argv[0], "list-interfaces") ?
+                        json_variant_by_key(reply, "interfaces") : reply;
+
+                json_variant_dump(v, arg_json_format_flags, stdout, NULL);
+        }
+
+        return 0;
+}
+
+typedef struct GetInterfaceDescriptionData {
+        const char *description;
+} GetInterfaceDescriptionData;
+
+/* Implements the "introspect" verb: calls org.varlink.service.GetInterfaceDescription for the interface
+ * argv[2] on the service argv[1].
+ *
+ * Without JSON output the returned description is parsed as Varlink IDL and re-emitted through
+ * varlink_idl_dump(); if it does not parse, a warning is logged and the raw text is printed instead
+ * (with a trailing newline added if missing). With JSON output the reply is dumped as is.
+ *
+ * Returns 0 on success, a negative errno (already logged) on failure. */
+static int verb_introspect(int argc, char *argv[], void *userdata) {
+        _cleanup_(varlink_unrefp) Varlink *vl = NULL;
+        const char *url, *interface;
+        int r;
+
+        assert(argc == 3);
+        url = argv[1];
+        interface = argv[2];
+
+        r = varlink_connect_auto(&vl, url);
+        if (r < 0)
+                return r;
+
+        JsonVariant *reply = NULL;
+        const char *error = NULL;
+        r = varlink_callb(vl, "org.varlink.service.GetInterfaceDescription", &reply, &error, NULL, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("interface", interface)));
+        if (r < 0)
+                return log_error_errno(r, "Failed to issue GetInterfaceDescription() call: %m");
+        if (error)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADE), "Method call GetInterfaceDescription() failed: %s", error);
+
+        pager_open(arg_pager_flags);
+
+        if (FLAGS_SET(arg_json_format_flags, JSON_FORMAT_OFF)) {
+                /* The dispatch target is the 'description' pointer itself, hence offset 0 */
+                static const struct JsonDispatch dispatch_table[] = {
+                        { "description", JSON_VARIANT_STRING, json_dispatch_const_string, 0, JSON_MANDATORY },
+                        {}
+                };
+                _cleanup_(varlink_interface_freep) VarlinkInterface *vi = NULL;
+                const char *description = NULL;
+                unsigned line = 0, column = 0;
+
+                r = json_dispatch(reply, dispatch_table, JSON_LOG, &description);
+                if (r < 0)
+                        return r;
+
+                /* Try to parse the returned description, so that we can add syntax highlighting */
+                r = varlink_idl_parse(ASSERT_PTR(description), &line, &column, &vi);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse returned interface description at %u:%u, showing raw interface description: %m", line, column);
+
+                        fputs(description, stdout);
+                        if (!endswith(description, "\n"))
+                                fputs("\n", stdout);
+                } else {
+                        r = varlink_idl_dump(stdout, /* use_colors= */ -1, vi);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to format parsed interface description: %m");
+                }
+        } else
+                json_variant_dump(reply, arg_json_format_flags, stdout, NULL);
+
+        return 0;
+}
+
+/* Reply handler installed with varlink_bind_reply() by verb_call() for --more calls.
+ *
+ * The received parameters are dumped to stdout in either case. If the reply is an error, it is
+ * additionally forwarded via sd_notifyf() as VARLINKERROR=, logged, and -EBADE is returned; otherwise 0
+ * is returned. */
+static int reply_callback(
+                Varlink *link,
+                JsonVariant *parameters,
+                const char *error,
+                VarlinkReplyFlags flags,
+                void *userdata) {
+
+        int r;
+
+        assert(link);
+
+        if (error) {
+                /* Propagate the error we received via sd_notify() */
+                (void) sd_notifyf(/* unset_environment= */ false, "VARLINKERROR=%s", error);
+
+                r = log_error_errno(SYNTHETIC_ERRNO(EBADE), "Method call failed: %s", error);
+        } else
+                r = 0;
+
+        json_variant_dump(parameters, arg_json_format_flags, stdout, NULL);
+        return r;
+}
+
+static int verb_call(int argc, char *argv[], void *userdata) {
+        _cleanup_(json_variant_unrefp) JsonVariant *jp = NULL;
+        _cleanup_(varlink_unrefp) Varlink *vl = NULL;
+        const char *url, *method, *parameter;
+        unsigned line = 0, column = 0;
+        int r;
+
+        assert(argc >= 3);
+        assert(argc <= 4);
+        url = argv[1];
+        method = argv[2];
+        parameter = argc > 3 && !streq(argv[3], "-") ? argv[3] : NULL;
+
+        arg_json_format_flags &= ~JSON_FORMAT_OFF;
+
+        if (parameter) {
+                /* is correct, as dispatch_verb() shifts arguments by one for the verb. */
+                r = json_parse_with_source(parameter, "", 0, &jp, &line, &column);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse parameters at :%u:%u: %m", line, column);
+        } else {
+                r = json_parse_file_at(stdin, AT_FDCWD, "", 0, &jp, &line, &column);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse parameters at :%u:%u: %m", line, column);
+        }
+
+        r = varlink_connect_auto(&vl, url);
+        if (r < 0)
+                return r;
+
+        if (arg_method_flags & VARLINK_METHOD_ONEWAY) {
+                r = varlink_send(vl, method, jp);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to issue %s() call: %m", method);
+
+                r = varlink_flush(vl);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to flush Varlink connection: %m");
+
+        } else if (arg_method_flags & VARLINK_METHOD_MORE) {
+
+                varlink_set_userdata(vl, (void*) method);
+
+                r = varlink_bind_reply(vl, reply_callback);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to bind reply callback: %m");
+
+                r = varlink_observe(vl, method, jp);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to issue %s() call: %m", method);
+
+                for (;;) {
+                        r = varlink_is_idle(vl);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to check if varlink connection is idle: %m");
+                        if (r > 0)
+                                break;
+
+                        r = varlink_process(vl);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to process varlink connection: %m");
+                        if (r != 0)
+                                continue;
+
+                        r = varlink_wait(vl, USEC_INFINITY);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to wait for varlink connection events: %m");
+                }
+        } else {
+                JsonVariant *reply = NULL;
+                const char *error = NULL;
+
+                r = varlink_call(vl, method, jp, &reply, &error, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to issue %s() call: %m", method);
+
+                /* If the server returned an error to us, then fail, but first output the associated parameters */
+                if (error) {
+                        /* Propagate
the error we received via sd_notify() */ + (void) sd_notifyf(/* unset_environment= */ false, "VARLINKERROR=%s", error); + + r = log_error_errno(SYNTHETIC_ERRNO(EBADE), "Method call %s() failed: %s", method, error); + } else + r = 0; + + pager_open(arg_pager_flags); + + json_variant_dump(reply, arg_json_format_flags, stdout, NULL); + return r; + } + + return 0; +} + +static int verb_validate_idl(int argc, char *argv[], void *userdata) { + _cleanup_(varlink_interface_freep) VarlinkInterface *vi = NULL; + _cleanup_free_ char *text = NULL; + const char *fname; + unsigned line = 1, column = 1; + int r; + + fname = argc > 1 ? argv[1] : NULL; + + if (fname) { + r = read_full_file(fname, &text, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read interface description file '%s': %m", fname); + } else { + r = read_full_stream(stdin, &text, NULL); + if (r < 0) + return log_error_errno(r, "Failed to read interface description from stdin: %m"); + + fname = ""; + } + + r = varlink_idl_parse(text, &line, &column, &vi); + if (r == -EBADMSG) + return log_error_errno(r, "%s:%u:%u: Bad syntax.", fname, line, column); + if (r == -ENETUNREACH) + return log_error_errno(r, "%s:%u:%u: Failed to parse interface description due an unresolved type.", fname, line, column); + if (r < 0) + return log_error_errno(r, "%s:%u:%u: Failed to parse interface description: %m", fname, line, column); + + r = varlink_idl_consistent(vi, LOG_ERR); + if (r == -EUCLEAN) + return log_error_errno(r, "Interface is inconsistent."); + if (r == -ENOTUNIQ) + return log_error_errno(r, "Field or symbol not unique in interface."); + if (r < 0) + return log_error_errno(r, "Failed to check interface for consistency: %m"); + + pager_open(arg_pager_flags); + + r = varlink_idl_dump(stdout, /* use_colors= */ -1, vi); + if (r < 0) + return log_error_errno(r, "Failed to format parsed interface description: %m"); + + return 0; +} + +static int varlinkctl_main(int argc, char *argv[]) { + static const Verb verbs[] = { 
+ { "info", 2, 2, 0, verb_info }, + { "list-interfaces", 2, 2, 0, verb_info }, + { "introspect", 3, 3, 0, verb_introspect }, + { "call", 3, 4, 0, verb_call }, + { "validate-idl", 1, 2, 0, verb_validate_idl }, + { "help", VERB_ANY, VERB_ANY, 0, verb_help }, + {} + }; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +static int run(int argc, char *argv[]) { + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return varlinkctl_main(argc, argv); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/vconsole/meson.build b/src/vconsole/meson.build new file mode 100644 index 0000000..111083c --- /dev/null +++ b/src/vconsole/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-vconsole-setup', + 'conditions' : ['ENABLE_VCONSOLE'], + 'sources' : files('vconsole-setup.c'), + }, +] diff --git a/src/vconsole/vconsole-setup.c b/src/vconsole/vconsole-setup.c new file mode 100644 index 0000000..4d82c65 --- /dev/null +++ b/src/vconsole/vconsole-setup.c @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +/*** + Copyright © 2016 Michal Soltys +***/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "creds-util.h" +#include "dev-setup.h" +#include "env-file.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "io-util.h" +#include "locale-util.h" +#include "log.h" +#include "main-func.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "signal-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "virt.h" + +typedef enum VCMeta { + VC_KEYMAP, + VC_KEYMAP_TOGGLE, + VC_FONT, + VC_FONT_MAP, + VC_FONT_UNIMAP, + _VC_META_MAX, + _VC_META_INVALID = -EINVAL, +} VCMeta; + +typedef struct Context { + char 
*config[_VC_META_MAX]; +} Context; + +static const char * const vc_meta_names[_VC_META_MAX] = { + [VC_KEYMAP] = "vconsole.keymap", + [VC_KEYMAP_TOGGLE] = "vconsole.keymap_toggle", + [VC_FONT] = "vconsole.font", + [VC_FONT_MAP] = "vconsole.font_map", + [VC_FONT_UNIMAP] = "vconsole.font_unimap", +}; + +/* compatibility with obsolete multiple-dot scheme */ +static const char * const vc_meta_compat_names[_VC_META_MAX] = { + [VC_KEYMAP_TOGGLE] = "vconsole.keymap.toggle", + [VC_FONT_MAP] = "vconsole.font.map", + [VC_FONT_UNIMAP] = "vconsole.font.unimap", +}; + +static const char * const vc_env_names[_VC_META_MAX] = { + [VC_KEYMAP] = "KEYMAP", + [VC_KEYMAP_TOGGLE] = "KEYMAP_TOGGLE", + [VC_FONT] = "FONT", + [VC_FONT_MAP] = "FONT_MAP", + [VC_FONT_UNIMAP] = "FONT_UNIMAP", +}; + +static void context_done(Context *c) { + assert(c); + + FOREACH_ARRAY(cc, c->config, _VC_META_MAX) + free(*cc); +} + +static void context_merge_config( + Context *dst, + Context *src, + Context *src_compat) { + + assert(dst); + assert(src); + + for (VCMeta i = 0; i < _VC_META_MAX; i++) + if (src->config[i]) + free_and_replace(dst->config[i], src->config[i]); + else if (src_compat && src_compat->config[i]) + free_and_replace(dst->config[i], src_compat->config[i]); +} + +static const char* context_get_config(Context *c, VCMeta meta) { + assert(c); + assert(meta >= 0 && meta < _VC_META_MAX); + + if (meta == VC_KEYMAP) + return isempty(c->config[VC_KEYMAP]) ? 
SYSTEMD_DEFAULT_KEYMAP : c->config[VC_KEYMAP]; + + return empty_to_null(c->config[meta]); +} + +static int context_read_creds(Context *c) { + _cleanup_(context_done) Context v = {}; + int r; + + assert(c); + + r = read_credential_strings_many( + vc_meta_names[VC_KEYMAP], &v.config[VC_KEYMAP], + vc_meta_names[VC_KEYMAP_TOGGLE], &v.config[VC_KEYMAP_TOGGLE], + vc_meta_names[VC_FONT], &v.config[VC_FONT], + vc_meta_names[VC_FONT_MAP], &v.config[VC_FONT_MAP], + vc_meta_names[VC_FONT_UNIMAP], &v.config[VC_FONT_UNIMAP]); + if (r < 0) + log_warning_errno(r, "Failed to import credentials, ignoring: %m"); + + context_merge_config(c, &v, NULL); + return 0; +} + +static int context_read_env(Context *c) { + _cleanup_(context_done) Context v = {}; + int r; + + assert(c); + + r = parse_env_file( + NULL, "/etc/vconsole.conf", + vc_env_names[VC_KEYMAP], &v.config[VC_KEYMAP], + vc_env_names[VC_KEYMAP_TOGGLE], &v.config[VC_KEYMAP_TOGGLE], + vc_env_names[VC_FONT], &v.config[VC_FONT], + vc_env_names[VC_FONT_MAP], &v.config[VC_FONT_MAP], + vc_env_names[VC_FONT_UNIMAP], &v.config[VC_FONT_UNIMAP]); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, "Failed to read /etc/vconsole.conf, ignoring: %m"); + return r; + } + + context_merge_config(c, &v, NULL); + return 0; +} + +static int context_read_proc_cmdline(Context *c) { + _cleanup_(context_done) Context v = {}, w = {}; + int r; + + assert(c); + + r = proc_cmdline_get_key_many( + PROC_CMDLINE_STRIP_RD_PREFIX, + vc_meta_names[VC_KEYMAP], &v.config[VC_KEYMAP], + vc_meta_names[VC_KEYMAP_TOGGLE], &v.config[VC_KEYMAP_TOGGLE], + vc_meta_names[VC_FONT], &v.config[VC_FONT], + vc_meta_names[VC_FONT_MAP], &v.config[VC_FONT_MAP], + vc_meta_names[VC_FONT_UNIMAP], &v.config[VC_FONT_UNIMAP], + vc_meta_compat_names[VC_KEYMAP_TOGGLE], &w.config[VC_KEYMAP_TOGGLE], + vc_meta_compat_names[VC_FONT_MAP], &w.config[VC_FONT_MAP], + vc_meta_compat_names[VC_FONT_UNIMAP], &w.config[VC_FONT_UNIMAP]); + if (r < 0) { + if (r != -ENOENT) + log_warning_errno(r, 
"Failed to read /proc/cmdline, ignoring: %m"); + return r; + } + + context_merge_config(c, &v, &w); + return 0; +} + +static void context_load_config(Context *c) { + assert(c); + + /* Load data from credentials (lowest priority) */ + (void) context_read_creds(c); + + /* Load data from configuration file (middle priority) */ + (void) context_read_env(c); + + /* Let the kernel command line override /etc/vconsole.conf (highest priority) */ + (void) context_read_proc_cmdline(c); +} + +static int verify_vc_device(int fd) { + unsigned char data[] = { + TIOCL_GETFGCONSOLE, + }; + + return RET_NERRNO(ioctl(fd, TIOCLINUX, data)); +} + +static int verify_vc_allocation(unsigned idx) { + char vcname[sizeof("/dev/vcs") + DECIMAL_STR_MAX(unsigned) - 2]; + + xsprintf(vcname, "/dev/vcs%u", idx); + + return RET_NERRNO(access(vcname, F_OK)); +} + +static int verify_vc_allocation_byfd(int fd) { + struct vt_stat vcs = {}; + + if (ioctl(fd, VT_GETSTATE, &vcs) < 0) + return -errno; + + return verify_vc_allocation(vcs.v_active); +} + +static int verify_vc_kbmode(int fd) { + int curr_mode; + + /* + * Make sure we only adjust consoles in K_XLATE or K_UNICODE mode. + * Otherwise we would (likely) interfere with X11's processing of the + * key events. + * + * https://lists.freedesktop.org/archives/systemd-devel/2013-February/008573.html + */ + + if (ioctl(fd, KDGKBMODE, &curr_mode) < 0) + return -errno; + + return IN_SET(curr_mode, K_XLATE, K_UNICODE) ? 0 : -EBUSY; +} + +static int toggle_utf8_vc(const char *name, int fd, bool utf8) { + int r; + struct termios tc = {}; + + assert(name); + assert(fd >= 0); + + r = ioctl(fd, KDSKBMODE, utf8 ? K_UNICODE : K_XLATE); + if (r < 0) + return log_warning_errno(errno, "Failed to %s UTF-8 kbdmode on %s: %m", enable_disable(utf8), name); + + r = loop_write(fd, utf8 ? 
"\033%G" : "\033%@", SIZE_MAX); + if (r < 0) + return log_warning_errno(r, "Failed to %s UTF-8 term processing on %s: %m", enable_disable(utf8), name); + + r = tcgetattr(fd, &tc); + if (r >= 0) { + SET_FLAG(tc.c_iflag, IUTF8, utf8); + r = tcsetattr(fd, TCSANOW, &tc); + } + if (r < 0) + return log_warning_errno(errno, "Failed to %s iutf8 flag on %s: %m", enable_disable(utf8), name); + + log_debug("UTF-8 kbdmode %sd on %s", enable_disable(utf8), name); + return 0; +} + +static int toggle_utf8_sysfs(bool utf8) { + int r; + + r = write_string_file("/sys/module/vt/parameters/default_utf8", one_zero(utf8), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_warning_errno(r, "Failed to %s sysfs UTF-8 flag: %m", enable_disable(utf8)); + + log_debug("Sysfs UTF-8 flag %sd", enable_disable(utf8)); + return 0; +} + +static int keyboard_load_and_wait(const char *vc, Context *c, bool utf8) { + const char *map, *map_toggle, *args[8]; + unsigned i = 0; + pid_t pid; + int r; + + assert(vc); + assert(c); + + map = context_get_config(c, VC_KEYMAP); + map_toggle = context_get_config(c, VC_KEYMAP_TOGGLE); + + /* An empty map means kernel map */ + if (isempty(map) || streq(map, "@kernel")) + return 0; + + args[i++] = KBD_LOADKEYS; + args[i++] = "-q"; + args[i++] = "-C"; + args[i++] = vc; + if (utf8) + args[i++] = "-u"; + args[i++] = map; + if (map_toggle) + args[i++] = map_toggle; + args[i++] = NULL; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *cmd = NULL; + + cmd = strv_join((char**) args, " "); + log_debug("Executing \"%s\"...", strnull(cmd)); + } + + r = safe_fork("(loadkeys)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + execv(args[0], (char **) args); + _exit(EXIT_FAILURE); + } + + return wait_for_terminate_and_check(KBD_LOADKEYS, pid, WAIT_LOG); +} + +static int font_load_and_wait(const char *vc, Context *c) { + const char *font, *map, *unimap, *args[9]; + unsigned i = 0; + pid_t pid; + int r; 
+ + assert(vc); + assert(c); + + font = context_get_config(c, VC_FONT); + map = context_get_config(c, VC_FONT_MAP); + unimap = context_get_config(c, VC_FONT_UNIMAP); + + /* Any part can be set independently */ + if (!font && !map && !unimap) + return 0; + + args[i++] = KBD_SETFONT; + args[i++] = "-C"; + args[i++] = vc; + if (map) { + args[i++] = "-m"; + args[i++] = map; + } + if (unimap) { + args[i++] = "-u"; + args[i++] = unimap; + } + if (font) + args[i++] = font; + args[i++] = NULL; + + if (DEBUG_LOGGING) { + _cleanup_free_ char *cmd = NULL; + + cmd = strv_join((char**) args, " "); + log_debug("Executing \"%s\"...", strnull(cmd)); + } + + r = safe_fork("(setfont)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + execv(args[0], (char **) args); + _exit(EXIT_FAILURE); + } + + /* setfont returns EX_OSERR when ioctl(KDFONTOP/PIO_FONTX/PIO_FONTX) fails. This might mean various + * things, but in particular lack of a graphical console. Let's be generous and not treat this as an + * error. */ + r = wait_for_terminate_and_check(KBD_SETFONT, pid, WAIT_LOG_ABNORMAL); + if (r == EX_OSERR) + log_notice(KBD_SETFONT " failed with a \"system error\" (EX_OSERR), ignoring."); + else if (r >= 0 && r != EXIT_SUCCESS) + log_error(KBD_SETFONT " failed with exit status %i.", r); + + return r; +} + +/* + * A newly allocated VT uses the font from the source VT. Here + * we update all possibly already allocated VTs with the configured + * font. It also allows to restart systemd-vconsole-setup.service, + * to apply a new font to all VTs. + * + * We also setup per-console utf8 related stuff: kbdmode, term + * processing, stty iutf8. 
+ */ +static void setup_remaining_vcs(int src_fd, unsigned src_idx, bool utf8) { + struct console_font_op cfo = { + .op = KD_FONT_OP_GET, + .width = UINT_MAX, .height = UINT_MAX, + .charcount = UINT_MAX, + }; + struct unimapinit adv = {}; + struct unimapdesc unimapd; + _cleanup_free_ struct unipair* unipairs = NULL; + _cleanup_free_ void *fontbuf = NULL; + int log_level = LOG_WARNING; + int r; + + unipairs = new(struct unipair, USHRT_MAX); + if (!unipairs) + return (void) log_oom(); + + /* get metadata of the current font (width, height, count) */ + r = ioctl(src_fd, KDFONTOP, &cfo); + if (r < 0) { + /* We might be called to operate on the dummy console (to setup keymap + * mainly) when fbcon deferred takeover is used for example. In such case, + * setting font is not supported and is expected to fail. */ + if (errno == ENOSYS) + log_level = LOG_DEBUG; + + log_full_errno(log_level, errno, + "KD_FONT_OP_GET failed while trying to get the font metadata: %m"); + } else { + /* verify parameter sanity first */ + if (cfo.width > 32 || cfo.height > 32 || cfo.charcount > 512) + log_warning("Invalid font metadata - width: %u (max 32), height: %u (max 32), count: %u (max 512)", + cfo.width, cfo.height, cfo.charcount); + else { + /* + * Console fonts supported by the kernel are limited in size to 32 x 32 and maximum 512 + * characters. Thus with 1 bit per pixel it requires up to 65536 bytes. The height always + * requires 32 per glyph, regardless of the actual height - see the comment above #define + * max_font_size 65536 in drivers/tty/vt/vt.c for more details. 
+ */ + fontbuf = malloc_multiply((cfo.width + 7) / 8 * 32, cfo.charcount); + if (!fontbuf) { + log_oom(); + return; + } + /* get fonts from the source console */ + cfo.data = fontbuf; + r = ioctl(src_fd, KDFONTOP, &cfo); + if (r < 0) + log_warning_errno(errno, "KD_FONT_OP_GET failed while trying to read the font data: %m"); + else { + unimapd.entries = unipairs; + unimapd.entry_ct = USHRT_MAX; + r = ioctl(src_fd, GIO_UNIMAP, &unimapd); + if (r < 0) + log_warning_errno(errno, "GIO_UNIMAP failed while trying to read unicode mappings: %m"); + else + cfo.op = KD_FONT_OP_SET; + } + } + } + + if (cfo.op != KD_FONT_OP_SET) + log_full(log_level, "Fonts will not be copied to remaining consoles"); + + for (unsigned i = 1; i <= 63; i++) { + char ttyname[sizeof("/dev/tty63")]; + _cleanup_close_ int fd_d = -EBADF; + + if (i == src_idx || verify_vc_allocation(i) < 0) + continue; + + /* try to open terminal */ + xsprintf(ttyname, "/dev/tty%u", i); + fd_d = open_terminal(ttyname, O_RDWR|O_CLOEXEC|O_NOCTTY); + if (fd_d < 0) { + log_warning_errno(fd_d, "Unable to open tty%u, fonts will not be copied: %m", i); + continue; + } + + if (verify_vc_kbmode(fd_d) < 0) + continue; + + (void) toggle_utf8_vc(ttyname, fd_d, utf8); + + if (cfo.op != KD_FONT_OP_SET) + continue; + + r = ioctl(fd_d, KDFONTOP, &cfo); + if (r < 0) { + int last_errno, mode; + + /* The fonts couldn't have been copied. It might be due to the + * terminal being in graphical mode. In this case the kernel + * returns -EINVAL which is too generic for distinguishing this + * specific case. So we need to retrieve the terminal mode and if + * the graphical mode is in used, let's assume that something else + * is using the terminal and the failure was expected as we + * shouldn't have tried to copy the fonts. 
*/ + + last_errno = errno; + if (ioctl(fd_d, KDGETMODE, &mode) >= 0 && mode != KD_TEXT) + log_debug("KD_FONT_OP_SET skipped: tty%u is not in text mode", i); + else + log_warning_errno(last_errno, "KD_FONT_OP_SET failed, fonts will not be copied to tty%u: %m", i); + + continue; + } + + /* Copy unicode translation table unimapd is a ushort count and a pointer + * to an array of struct unipair { ushort, ushort }. */ + r = ioctl(fd_d, PIO_UNIMAPCLR, &adv); + if (r < 0) { + log_warning_errno(errno, "PIO_UNIMAPCLR failed, unimaps might be incorrect for tty%u: %m", i); + continue; + } + + r = ioctl(fd_d, PIO_UNIMAP, &unimapd); + if (r < 0) { + log_warning_errno(errno, "PIO_UNIMAP failed, unimaps might be incorrect for tty%u: %m", i); + continue; + } + + log_debug("Font and unimap successfully copied to %s", ttyname); + } +} + +static int find_source_vc(char **ret_path, unsigned *ret_idx) { + int r, err = 0; + + assert(ret_path); + assert(ret_idx); + + for (unsigned i = 1; i <= 63; i++) { + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *path = NULL; + + r = verify_vc_allocation(i); + if (r < 0) { + log_debug_errno(r, "VC %u existence check failed, skipping: %m", i); + RET_GATHER(err, r); + continue; + } + + if (asprintf(&path, "/dev/tty%u", i) < 0) + return log_oom(); + + fd = open_terminal(path, O_RDWR|O_CLOEXEC|O_NOCTTY); + if (fd < 0) { + log_debug_errno(fd, "Failed to open terminal %s, ignoring: %m", path); + RET_GATHER(err, r); + continue; + } + r = verify_vc_kbmode(fd); + if (r < 0) { + log_debug_errno(r, "Failed to check VC %s keyboard mode: %m", path); + RET_GATHER(err, r); + continue; + } + + /* all checks passed, return this one as a source console */ + *ret_idx = i; + *ret_path = TAKE_PTR(path); + return TAKE_FD(fd); + } + + return log_error_errno(err, "No usable source console found: %m"); +} + +static int verify_source_vc(char **ret_path, const char *src_vc) { + _cleanup_close_ int fd = -EBADF; + char *path; + int r; + + fd = open_terminal(src_vc, 
O_RDWR|O_CLOEXEC|O_NOCTTY); + if (fd < 0) + return log_error_errno(fd, "Failed to open %s: %m", src_vc); + + r = verify_vc_device(fd); + if (r < 0) + return log_error_errno(r, "Device %s is not a virtual console: %m", src_vc); + + r = verify_vc_allocation_byfd(fd); + if (r < 0) + return log_error_errno(r, "Virtual console %s is not allocated: %m", src_vc); + + r = verify_vc_kbmode(fd); + if (r < 0) + return log_error_errno(r, "Virtual console %s is not in K_XLATE or K_UNICODE: %m", src_vc); + + path = strdup(src_vc); + if (!path) + return log_oom(); + + *ret_path = path; + return TAKE_FD(fd); +} + +static int run(int argc, char **argv) { + _cleanup_(context_done) Context c = {}; + _cleanup_free_ char *vc = NULL; + _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF; + bool utf8, keyboard_ok; + unsigned idx = 0; + int r; + + log_setup(); + + umask(0022); + + if (argv[1]) + fd = verify_source_vc(&vc, argv[1]); + else + fd = find_source_vc(&vc, &idx); + if (fd < 0) + return fd; + + utf8 = is_locale_utf8(); + + context_load_config(&c); + + /* Take lock around the remaining operation to avoid being interrupted by a tty reset operation + * performed for services with TTYVHangup=yes. */ + lock_fd = lock_dev_console(); + if (lock_fd < 0) { + log_full_errno(lock_fd == -ENOENT ? LOG_DEBUG : LOG_ERR, + lock_fd, + "Failed to lock /dev/console%s: %m", + lock_fd == -ENOENT ? ", ignoring" : ""); + if (lock_fd != -ENOENT) + return lock_fd; + } + + (void) toggle_utf8_sysfs(utf8); + (void) toggle_utf8_vc(vc, fd, utf8); + + r = font_load_and_wait(vc, &c); + keyboard_ok = keyboard_load_and_wait(vc, &c, utf8) == 0; + + if (idx > 0) { + if (r == 0) + setup_remaining_vcs(fd, idx, utf8); + else + log_full(r == EX_OSERR ? LOG_NOTICE : LOG_WARNING, + "Setting source virtual console failed, ignoring remaining ones."); + } + + return IN_SET(r, 0, EX_OSERR) && keyboard_ok ? 
EXIT_SUCCESS : EXIT_FAILURE; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/veritysetup/meson.build b/src/veritysetup/meson.build new file mode 100644 index 0000000..a7468df --- /dev/null +++ b/src/veritysetup/meson.build @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-veritysetup', + 'conditions' : ['HAVE_LIBCRYPTSETUP'], + 'sources' : files('veritysetup.c'), + 'dependencies' : libcryptsetup, + }, + generator_template + { + 'name' : 'systemd-veritysetup-generator', + 'conditions' : ['HAVE_LIBCRYPTSETUP'], + 'sources' : files('veritysetup-generator.c'), + }, +] diff --git a/src/veritysetup/veritysetup-generator.c b/src/veritysetup/veritysetup-generator.c new file mode 100644 index 0000000..d55d4aa --- /dev/null +++ b/src/veritysetup/veritysetup-generator.c @@ -0,0 +1,526 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fstab-util.h" +#include "generator.h" +#include "hexdecoct.h" +#include "id128-util.h" +#include "main-func.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "specifier.h" +#include "string-util.h" +#include "unit-name.h" + +#define SYSTEMD_VERITYSETUP_SERVICE_ROOT "systemd-veritysetup@root.service" +#define SYSTEMD_VERITYSETUP_SERVICE_USR "systemd-veritysetup@usr.service" + +static const char *arg_dest = NULL; +static bool arg_enabled = true; +static bool arg_read_veritytab = true; +static const char *arg_veritytab = NULL; +static char *arg_root_hash = NULL; +static char *arg_root_data_what = NULL; +static char *arg_root_hash_what = NULL; +static char *arg_root_options = NULL; +static char *arg_usr_hash = NULL; +static char *arg_usr_data_what = NULL; +static char *arg_usr_hash_what = NULL; +static char *arg_usr_options = NULL; + 
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep); +STATIC_DESTRUCTOR_REGISTER(arg_root_data_what, freep); +STATIC_DESTRUCTOR_REGISTER(arg_root_hash_what, freep); +STATIC_DESTRUCTOR_REGISTER(arg_root_options, freep); +STATIC_DESTRUCTOR_REGISTER(arg_usr_hash, freep); +STATIC_DESTRUCTOR_REGISTER(arg_usr_data_what, freep); +STATIC_DESTRUCTOR_REGISTER(arg_usr_hash_what, freep); +STATIC_DESTRUCTOR_REGISTER(arg_usr_options, freep); + +static int create_special_device( + const char *name, + const char *service, + const char *roothash, + const char *data_what, + const char *hash_what, + const char *options) { + + _cleanup_free_ char *u = NULL, *v = NULL, *d = NULL, *e = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + + /* Creates a systemd-veritysetup@.service instance for the special kernel cmdline specified root + usr devices. */ + + assert(name); + assert(service); + + /* If all three pieces of information are missing, then verity is turned off */ + if (!roothash && !data_what && !hash_what) + return 0; + + /* if one of them is missing however, the data is simply incomplete and this is an error */ + if (!roothash) + log_error("Verity information for %s incomplete, root hash unspecified.", name); + if (!data_what) + log_error("Verity information for %s incomplete, data device unspecified.", name); + if (!hash_what) + log_error("Verity information for %s incomplete, hash device unspecified.", name); + + if (!roothash || !data_what || !hash_what) + return -EINVAL; + + log_debug("Using %s verity data device %s, hash device %s, options %s, and hash %s.", name, data_what, hash_what, options, roothash); + + u = fstab_node_to_udev_node(data_what); + if (!u) + return log_oom(); + v = fstab_node_to_udev_node(hash_what); + if (!v) + return log_oom(); + + r = unit_name_from_path(u, ".device", &d); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name: %m"); + r = unit_name_from_path(v, ".device", &e); + if (r < 0) + return log_error_errno(r, "Failed to generate 
unit name: %m"); + + r = generator_open_unit_file(arg_dest, NULL, service, &f); + if (r < 0) + return r; + + r = generator_write_veritysetup_unit_section(f, "/proc/cmdline"); + if (r < 0) + return r; + + fprintf(f, + "Before=veritysetup.target\n" + "BindsTo=%s %s\n" + "After=%s %s\n", + d, e, + d, e); + + r = generator_write_veritysetup_service_section(f, name, u, v, roothash, options); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write file unit %s: %m", service); + + r = generator_add_symlink(arg_dest, "veritysetup.target", "requires", service); + if (r < 0) + return r; + + return 0; +} + +static int create_root_device(void) { + return create_special_device("root", SYSTEMD_VERITYSETUP_SERVICE_ROOT, arg_root_hash, arg_root_data_what, arg_root_hash_what, arg_root_options); +} + +static int create_usr_device(void) { + return create_special_device("usr", SYSTEMD_VERITYSETUP_SERVICE_USR, arg_usr_hash, arg_usr_data_what, arg_usr_hash_what, arg_usr_options); +} + +static int parse_proc_cmdline_item(const char *key, const char *value, void *data) { + int r; + + assert(key); + + if (streq(key, "systemd.verity")) { + + r = value ? parse_boolean(value) : 1; + if (r < 0) + log_warning("Failed to parse verity= kernel command line switch %s. Ignoring.", value); + else + arg_enabled = r; + + } else if (streq(key, "veritytab")) { + + r = value ? parse_boolean(value) : 1; + if (r < 0) + log_warning("Failed to parse veritytab= kernel command line switch %s. 
Ignoring.", value); + else + arg_read_veritytab = r; + + } else if (streq(key, "roothash")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_root_hash, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.verity_root_data")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_root_data_what, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.verity_root_hash")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_root_hash_what, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.verity_root_options")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_root_options, value); + if (r < 0) + return log_oom(); + + } else if (streq(key, "usrhash")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_usr_hash, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.verity_usr_data")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_usr_data_what, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.verity_usr_hash")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_usr_hash_what, value); + if (r < 0) + return log_oom(); + + } else if (proc_cmdline_key_streq(key, "systemd.verity_usr_options")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = free_and_strdup(&arg_usr_options, value); + if (r < 0) + return log_oom(); + + } + + return 0; +} + +static int determine_device( + const char *name, + const char *hash, + char **data_what, + char **hash_what) { + + sd_id128_t data_uuid, verity_uuid; + _cleanup_free_ void *m = NULL; + size_t l; + int r; + + assert(name); + assert(data_what); 
+ assert(hash_what); + + if (!hash) + return 0; + + if (*data_what && *hash_what) + return 0; + + r = unhexmem(hash, strlen(hash), &m, &l); + if (r < 0) + return log_error_errno(r, "Failed to parse hash: %s", hash); + if (l < sizeof(sd_id128_t)) { + log_debug("Root hash for %s is shorter than 128 bits (32 characters), ignoring for discovering verity partition.", name); + return 0; + } + + if (!*data_what) { + memcpy(&data_uuid, m, sizeof(data_uuid)); + + *data_what = path_join("/dev/disk/by-partuuid", SD_ID128_TO_UUID_STRING(data_uuid)); + if (!*data_what) + return log_oom(); + } + + if (!*hash_what) { + memcpy(&verity_uuid, (uint8_t*) m + l - sizeof(verity_uuid), sizeof(verity_uuid)); + + *hash_what = path_join("/dev/disk/by-partuuid", SD_ID128_TO_UUID_STRING(verity_uuid)); + if (!*hash_what) + return log_oom(); + } + + log_info("Using data device %s and hash device %s for %s.", *data_what, *hash_what, name); + + return 1; +} + +static int determine_devices(void) { + int r; + + r = determine_device("root", arg_root_hash, &arg_root_data_what, &arg_root_hash_what); + if (r < 0) + return r; + + return determine_device("usr", arg_usr_hash, &arg_usr_data_what, &arg_usr_hash_what); +} + +static bool attach_in_initrd(const char *name, const char *options) { + assert(name); + + /* Imply x-initrd.attach in case the volume name is among those defined in the Discoverable Partition + * Specification for partitions that we require to be mounted during the initrd → host transition, + * i.e. for the root fs itself, and /usr/. This mirrors similar behaviour in + * systemd-fstab-generator. 
*/ + + return fstab_test_option(options, "x-initrd.attach\0") || + STR_IN_SET(name, "root", "usr"); +} + +static int create_veritytab_device( + const char *name, + const char *data_device, + const char *hash_device, + const char *roothash, + const char *options, + const char *source) { + + _cleanup_free_ char *n = NULL, *dd = NULL, *du = NULL, *hd = NULL, *hu = NULL, *e = NULL, + *du_escaped = NULL, *hu_escaped = NULL, *name_escaped = NULL; + _cleanup_fclose_ FILE *f = NULL; + const char *dmname; + bool noauto, nofail, netdev, need_loop = false; + int r; + + /* Creates a systemd-veritysetup@.service instance for volumes specified in /etc/veritytab. */ + + assert(name); + assert(data_device); + assert(hash_device); + assert(roothash); + + noauto = fstab_test_yes_no_option(options, "noauto\0" "auto\0"); + nofail = fstab_test_yes_no_option(options, "nofail\0" "fail\0"); + netdev = fstab_test_option(options, "_netdev\0"); + + name_escaped = specifier_escape(name); + if (!name_escaped) + return log_oom(); + + e = unit_name_escape(name); + if (!e) + return log_oom(); + + du = fstab_node_to_udev_node(data_device); + if (!du) + return log_oom(); + + hu = fstab_node_to_udev_node(hash_device); + if (!hu) + return log_oom(); + + r = unit_name_build("systemd-veritysetup", e, ".service", &n); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name: %m"); + + du_escaped = specifier_escape(du); + if (!du_escaped) + return log_oom(); + + hu_escaped = specifier_escape(hu); + if (!hu_escaped) + return log_oom(); + + r = unit_name_from_path(du, ".device", &dd); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name: %m"); + + r = unit_name_from_path(hu, ".device", &hd); + if (r < 0) + return log_error_errno(r, "Failed to generate unit name: %m"); + + r = generator_open_unit_file(arg_dest, NULL, n, &f); + if (r < 0) + return r; + + r = generator_write_veritysetup_unit_section(f, source); + if (r < 0) + return r; + + if (netdev) + fprintf(f, 
"After=remote-fs-pre.target\n"); + + /* If initrd takes care of attaching the disk then it should also detach it during shutdown. */ + if (!attach_in_initrd(name, options)) + fprintf(f, + "Conflicts=umount.target\n" + "Before=umount.target\n"); + + if (!nofail) + fprintf(f, + "Before=%s\n", + netdev ? "remote-veritysetup.target" : "veritysetup.target"); + + if (path_startswith(du, "/dev/")) + fprintf(f, + "BindsTo=%s\n" + "After=%s\n", + dd, dd); + else { + fprintf(f, "RequiresMountsFor=%s\n", du_escaped); + need_loop = true; + } + + if (path_startswith(hu, "/dev/")) + fprintf(f, + "BindsTo=%s\n" + "After=%s\n", + hd, hd); + else { + fprintf(f, "RequiresMountsFor=%s\n", hu_escaped); + need_loop = true; + } + + if (need_loop) + /* For loopback devices make sure to explicitly load loop.ko, as this code might run very + * early where device nodes created via systemd-tmpfiles-setup-dev.service might not be + * around yet. Hence let's sync on the module itself. */ + fprintf(f, + "Wants=modprobe@loop.service\n" + "After=modprobe@loop.service\n"); + + r = generator_write_veritysetup_service_section(f, name, du, hu, roothash, options); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write unit file %s: %m", n); + + if (!noauto) { + r = generator_add_symlink(arg_dest, + netdev ? "remote-veritysetup.target" : "veritysetup.target", + nofail ? 
"wants" : "requires", n); + if (r < 0) + return r; + } + + dmname = strjoina("dev-mapper-", e, ".device"); + return generator_add_symlink(arg_dest, dmname, "requires", n); +} + +static int add_veritytab_devices(void) { + _cleanup_fclose_ FILE *f = NULL; + unsigned veritytab_line = 0; + int r; + + if (!arg_read_veritytab) + return 0; + + r = fopen_unlocked(arg_veritytab, "re", &f); + if (r < 0) { + if (errno != ENOENT) + log_error_errno(errno, "Failed to open %s: %m", arg_veritytab); + return 0; + } + + for (;;) { + _cleanup_free_ char *line = NULL, *name = NULL, *data_device = NULL, *hash_device = NULL, + *roothash = NULL, *options = NULL; + char *data_uuid, *hash_uuid; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_error_errno(r, "Failed to read %s: %m", arg_veritytab); + if (r == 0) + break; + + veritytab_line++; + + if (IN_SET(line[0], 0, '#')) + continue; + + r = sscanf(line, "%ms %ms %ms %ms %ms", &name, &data_device, &hash_device, &roothash, &options); + if (!IN_SET(r, 4, 5)) { + log_error("Failed to parse %s:%u, ignoring.", arg_veritytab, veritytab_line); + continue; + } + + data_uuid = startswith(data_device, "UUID="); + if (!data_uuid) + data_uuid = path_startswith(data_device, "/dev/disk/by-uuid/"); + + hash_uuid = startswith(hash_device, "UUID="); + if (!hash_uuid) + hash_uuid = path_startswith(hash_device, "/dev/disk/by-uuid/"); + + r = create_veritytab_device( + name, + data_device, + hash_device, + roothash, + options, + arg_veritytab); + if (r < 0) + return r; + } + + return 0; +} + +static int run(const char *dest, const char *dest_early, const char *dest_late) { + int r; + + assert_se(arg_dest = dest); + + arg_veritytab = getenv("SYSTEMD_VERITYTAB") ?: "/etc/veritytab"; + + r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, PROC_CMDLINE_STRIP_RD_PREFIX); + if (r < 0) + return log_warning_errno(r, "Failed to parse kernel command line: %m"); + + if (!arg_enabled) + return 0; + + r = add_veritytab_devices(); + if (r 
< 0) + return r; + + r = determine_devices(); + if (r < 0) + return r; + + r = create_root_device(); + if (r < 0) + return r; + + return create_usr_device(); +} + +DEFINE_MAIN_GENERATOR_FUNCTION(run); diff --git a/src/veritysetup/veritysetup.c b/src/veritysetup/veritysetup.c new file mode 100644 index 0000000..d73c2d3 --- /dev/null +++ b/src/veritysetup/veritysetup.c @@ -0,0 +1,431 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "alloc-util.h" +#include "cryptsetup-util.h" +#include "fileio.h" +#include "fstab-util.h" +#include "hexdecoct.h" +#include "log.h" +#include "main-func.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "string-util.h" +#include "terminal-util.h" + +static char *arg_hash = NULL; +static bool arg_superblock = true; +static int arg_format = 1; +static uint64_t arg_data_block_size = 4096; +static uint64_t arg_hash_block_size = 4096; +static uint64_t arg_data_blocks = 0; +static uint64_t arg_hash_offset = 0; +static void *arg_salt = NULL; +static uint64_t arg_salt_size = 32; +static char *arg_uuid = NULL; +static uint32_t arg_activate_flags = CRYPT_ACTIVATE_READONLY; +static char *arg_fec_what = NULL; +static uint64_t arg_fec_offset = 0; +static uint64_t arg_fec_roots = 2; +static char *arg_root_hash_signature = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_hash, freep); +STATIC_DESTRUCTOR_REGISTER(arg_salt, freep); +STATIC_DESTRUCTOR_REGISTER(arg_uuid, freep); +STATIC_DESTRUCTOR_REGISTER(arg_fec_what, freep); +STATIC_DESTRUCTOR_REGISTER(arg_root_hash_signature, freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-veritysetup@.service", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s attach VOLUME DATADEVICE HASHDEVICE ROOTHASH [OPTIONS]\n" + "%s detach VOLUME\n\n" + "Attach or detach a verity protected block device.\n" + "\nSee the %s for details.\n", + 
program_invocation_short_name, + program_invocation_short_name, + link); + + return 0; +} + +static int save_roothashsig_option(const char *option, bool strict) { + int r; + + if (path_is_absolute(option) || startswith(option, "base64:")) { + if (!HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Activation of verity device with signature requested, but cryptsetup does not support crypt_activate_by_signed_key()."); + + r = free_and_strdup_warn(&arg_root_hash_signature, option); + if (r < 0) + return r; + + return true; + } + + if (!strict) + return false; + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "root-hash-signature= expects either full path to signature file or " + "base64 string encoding signature prefixed by base64:."); +} + +static int parse_block_size(const char *t, uint64_t *size) { + uint64_t u; + int r; + + r = parse_size(t, 1024, &u); + if (r < 0) + return r; + + if (u < 512 || u > (512 * 1024)) + return -ERANGE; + + if ((u % 512) != 0 || !ISPOWEROF2(u)) + return -EINVAL; + + *size = u; + + return 0; +} + +static int parse_options(const char *options) { + int r; + + /* backward compatibility with the obsolete ROOTHASHSIG positional argument */ + r = save_roothashsig_option(options, /* strict= */ false); + if (r < 0) + return r; + if (r > 0) { + log_warning("Usage of ROOTHASHSIG positional argument is deprecated. 
" + "Please use the option root-hash-signature=%s instead.", options); + return 0; + } + + for (;;) { + _cleanup_free_ char *word = NULL; + char *val; + + r = extract_first_word(&options, &word, ",", EXTRACT_DONT_COALESCE_SEPARATORS | EXTRACT_UNESCAPE_SEPARATORS); + if (r < 0) + return log_error_errno(r, "Failed to parse options: %m"); + if (r == 0) + break; + + if (STR_IN_SET(word, "noauto", "auto", "nofail", "fail", "_netdev")) + continue; + + if (isempty(word)) + continue; + else if (streq(word, "ignore-corruption")) + arg_activate_flags |= CRYPT_ACTIVATE_IGNORE_CORRUPTION; + else if (streq(word, "restart-on-corruption")) + arg_activate_flags |= CRYPT_ACTIVATE_RESTART_ON_CORRUPTION; + else if (streq(word, "ignore-zero-blocks")) + arg_activate_flags |= CRYPT_ACTIVATE_IGNORE_ZERO_BLOCKS; +#ifdef CRYPT_ACTIVATE_CHECK_AT_MOST_ONCE + else if (streq(word, "check-at-most-once")) + arg_activate_flags |= CRYPT_ACTIVATE_CHECK_AT_MOST_ONCE; +#endif +#ifdef CRYPT_ACTIVATE_PANIC_ON_CORRUPTION + else if (streq(word, "panic-on-corruption")) + arg_activate_flags |= CRYPT_ACTIVATE_PANIC_ON_CORRUPTION; +#endif + else if ((val = startswith(word, "superblock="))) { + + r = parse_boolean(val); + if (r < 0) + return log_error_errno(r, "Failed to parse boolean '%s': %m", word); + + arg_superblock = r; + } else if ((val = startswith(word, "format="))) { + + if (!STR_IN_SET(val, "0", "1")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "format= expects either 0 (original Chrome OS version) or " + "1 (modern version)."); + + arg_format = val[0] - '0'; + } else if ((val = startswith(word, "data-block-size="))) { + uint64_t sz; + + r = parse_block_size(val, &sz); + if (r < 0) + return log_error_errno(r, "Failed to parse size '%s': %m", word); + + arg_data_block_size = sz; + } else if ((val = startswith(word, "hash-block-size="))) { + uint64_t sz; + + r = parse_block_size(val, &sz); + if (r < 0) + return log_error_errno(r, "Failed to parse size '%s': %m", word); + + arg_hash_block_size = 
sz; + } else if ((val = startswith(word, "data-blocks="))) { + uint64_t u; + + r = safe_atou64(val, &u); + if (r < 0) + return log_error_errno(r, "Failed to parse number '%s': %m", word); + + arg_data_blocks = u; + } else if ((val = startswith(word, "hash-offset="))) { + uint64_t off; + + r = parse_size(val, 1024, &off); + if (r < 0) + return log_error_errno(r, "Failed to parse offset '%s': %m", word); + if (off % 512 != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "hash-offset= expects a 512-byte aligned value."); + + arg_hash_offset = off; + } else if ((val = startswith(word, "salt="))) { + + if (!string_is_safe(val)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "salt= is not valid."); + + if (isempty(val)) { + arg_salt = mfree(arg_salt); + arg_salt_size = 32; + } else if (streq(val, "-")) { + arg_salt = mfree(arg_salt); + arg_salt_size = 0; + } else { + size_t l; + void *m; + + r = unhexmem(val, strlen(val), &m, &l); + if (r < 0) + return log_error_errno(r, "Failed to parse salt '%s': %m", word); + + free_and_replace(arg_salt, m); + arg_salt_size = l; + } + } else if ((val = startswith(word, "uuid="))) { + + r = sd_id128_from_string(val, NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse UUID '%s': %m", word); + + r = free_and_strdup(&arg_uuid, val); + if (r < 0) + return log_oom(); + } else if ((val = startswith(word, "hash="))) { + + r = free_and_strdup(&arg_hash, val); + if (r < 0) + return log_oom(); + } else if ((val = startswith(word, "fec-device="))) { + _cleanup_free_ char *what = NULL; + + what = fstab_node_to_udev_node(val); + if (!what) + return log_oom(); + + if (!path_is_absolute(what)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "fec-device= expects an absolute path."); + + if (!path_is_normalized(what)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "fec-device= expects an normalized path."); + + r = free_and_strdup(&arg_fec_what, what); + if (r < 0) + return log_oom(); + } else if ((val = startswith(word, 
"fec-offset="))) { + uint64_t off; + + r = parse_size(val, 1024, &off); + if (r < 0) + return log_error_errno(r, "Failed to parse offset '%s': %m", word); + if (off % 512 != 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "fec-offset= expects a 512-byte aligned value."); + + arg_fec_offset = off; + } else if ((val = startswith(word, "fec-roots="))) { + uint64_t u; + + r = safe_atou64(val, &u); + if (r < 0) + return log_error_errno(r, "Failed to parse number '%s', ignoring: %m", word); + if (u < 2 || u > 24) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "fec-rootfs= expects a value between 2 and 24 (including)."); + + arg_fec_roots = u; + } else if ((val = startswith(word, "root-hash-signature="))) { + r = save_roothashsig_option(val, /* strict= */ true); + if (r < 0) + return r; + + } else + log_warning("Encountered unknown option '%s', ignoring.", word); + } + + return r; +} + +static int run(int argc, char *argv[]) { + _cleanup_(crypt_freep) struct crypt_device *cd = NULL; + const char *verb; + int r; + + if (argv_looks_like_help(argc, argv)) + return help(); + + if (argc < 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program requires at least two arguments."); + + log_setup(); + + cryptsetup_enable_logging(NULL); + + umask(0022); + + verb = argv[1]; + + if (streq(verb, "attach")) { + const char *volume, *data_device, *verity_device, *root_hash, *options; + _cleanup_free_ void *m = NULL; + struct crypt_params_verity p = {}; + crypt_status_info status; + size_t l; + + if (argc < 6) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "attach requires at least four arguments."); + + volume = argv[2]; + data_device = argv[3]; + verity_device = argv[4]; + root_hash = argv[5]; + options = mangle_none(argc > 6 ? 
argv[6] : NULL); + + if (!filename_is_valid(volume)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume name '%s' is not valid.", volume); + + r = unhexmem(root_hash, SIZE_MAX, &m, &l); + if (r < 0) + return log_error_errno(r, "Failed to parse root hash: %m"); + + r = crypt_init(&cd, verity_device); + if (r < 0) + return log_error_errno(r, "Failed to open verity device %s: %m", verity_device); + + cryptsetup_enable_logging(cd); + + status = crypt_status(cd, volume); + if (IN_SET(status, CRYPT_ACTIVE, CRYPT_BUSY)) { + log_info("Volume %s already active.", volume); + return 0; + } + + if (options) { + r = parse_options(options); + if (r < 0) + return log_error_errno(r, "Failed to parse options: %m"); + } + + if (arg_superblock) { + p = (struct crypt_params_verity) { + .fec_device = arg_fec_what, + .hash_area_offset = arg_hash_offset, + .fec_area_offset = arg_fec_offset, + .fec_roots = arg_fec_roots, + }; + + r = crypt_load(cd, CRYPT_VERITY, &p); + if (r < 0) + return log_error_errno(r, "Failed to load verity superblock: %m"); + } else { + p = (struct crypt_params_verity) { + .hash_name = arg_hash, + .data_device = data_device, + .fec_device = arg_fec_what, + .salt = arg_salt, + .salt_size = arg_salt_size, + .hash_type = arg_format, + .data_block_size = arg_data_block_size, + .hash_block_size = arg_hash_block_size, + .data_size = arg_data_blocks, + .hash_area_offset = arg_hash_offset, + .fec_area_offset = arg_fec_offset, + .fec_roots = arg_fec_roots, + .flags = CRYPT_VERITY_NO_HEADER, + }; + + r = crypt_format(cd, CRYPT_VERITY, NULL, NULL, arg_uuid, NULL, 0, &p); + if (r < 0) + return log_error_errno(r, "Failed to format verity superblock: %m"); + } + + r = crypt_set_data_device(cd, data_device); + if (r < 0) + return log_error_errno(r, "Failed to configure data device: %m"); + + if (arg_root_hash_signature) { +#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY + _cleanup_free_ char *hash_sig = NULL; + size_t hash_sig_size; + char *value; + + if ((value = 
startswith(arg_root_hash_signature, "base64:"))) { + r = unbase64mem(value, strlen(value), (void *)&hash_sig, &hash_sig_size); + if (r < 0) + return log_error_errno(r, "Failed to parse root hash signature '%s': %m", arg_root_hash_signature); + } else { + r = read_full_file_full( + AT_FDCWD, arg_root_hash_signature, UINT64_MAX, SIZE_MAX, + READ_FULL_FILE_CONNECT_SOCKET, + NULL, + &hash_sig, &hash_sig_size); + if (r < 0) + return log_error_errno(r, "Failed to read root hash signature: %m"); + } + + r = crypt_activate_by_signed_key(cd, volume, m, l, hash_sig, hash_sig_size, arg_activate_flags); +#else + assert_not_reached(); +#endif + } else + r = crypt_activate_by_volume_key(cd, volume, m, l, arg_activate_flags); + if (r < 0) + return log_error_errno(r, "Failed to set up verity device '%s': %m", volume); + + } else if (streq(verb, "detach")) { + const char *volume; + + volume = argv[2]; + + if (!filename_is_valid(volume)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Volume name '%s' is not valid.", volume); + + r = crypt_init_by_name(&cd, volume); + if (r == -ENODEV) { + log_info("Volume %s 'already' inactive.", volume); + return 0; + } + if (r < 0) + return log_error_errno(r, "crypt_init_by_name() for volume '%s' failed: %m", volume); + + cryptsetup_enable_logging(cd); + + r = crypt_deactivate(cd, volume); + if (r < 0) + return log_error_errno(r, "Failed to deactivate volume '%s': %m", volume); + + } else + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown verb %s.", verb); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/version/version.h.in b/src/version/version.h.in new file mode 100644 index 0000000..083779a --- /dev/null +++ b/src/version/version.h.in @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later + * + * Detailed project version that includes git commit when not built from a release. 
+ * Use this in preference to PROJECT_VERSION, with the following exceptions: + * - where a simplified form is expected for compatibility, for example + * 'udevadm version', + * - where a simplified machine-parsable form is more useful, for example + * pkgconfig files and version information written to binary files. + */ +#define GIT_VERSION "@VCS_TAG@" diff --git a/src/vmspawn/meson.build b/src/vmspawn/meson.build new file mode 100644 index 0000000..800d7c3 --- /dev/null +++ b/src/vmspawn/meson.build @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +libvmspawn_core_sources = files( + 'vmspawn-settings.c', + 'vmspawn-util.c', +) +libvmspawn_core = static_library( + 'vmspawn-core', + libvmspawn_core_sources, + include_directories : includes, + dependencies : [userspace], + build_by_default : false) + +vmspawn_libs = [ + libvmspawn_core, + libshared, +] + +executables += [ + executable_template + { + 'name' : 'systemd-vmspawn', + 'public' : true, + 'conditions': ['ENABLE_VMSPAWN'], + 'sources' : files('vmspawn.c'), + 'link_with' : vmspawn_libs, + } +] diff --git a/src/vmspawn/vmspawn-settings.c b/src/vmspawn/vmspawn-settings.c new file mode 100644 index 0000000..cb1a463 --- /dev/null +++ b/src/vmspawn/vmspawn-settings.c @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "vmspawn-settings.h" diff --git a/src/vmspawn/vmspawn-settings.h b/src/vmspawn/vmspawn-settings.h new file mode 100644 index 0000000..268a874 --- /dev/null +++ b/src/vmspawn/vmspawn-settings.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +typedef enum SettingsMask { + SETTING_START_MODE = UINT64_C(1) << 0, + SETTING_DIRECTORY = UINT64_C(1) << 26, + SETTING_CREDENTIALS = UINT64_C(1) << 30, + _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX +} SettingsMask; diff --git a/src/vmspawn/vmspawn-util.c b/src/vmspawn/vmspawn-util.c new file mode 100644 index 0000000..b5b5eaf --- /dev/null +++ b/src/vmspawn/vmspawn-util.c @@ -0,0 
+1,344 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "architecture.h" +#include "conf-files.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "json.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" +#include "path-lookup.h" +#include "path-util.h" +#include "random-util.h" +#include "recurse-dir.h" +#include "siphash24.h" +#include "socket-util.h" +#include "sort-util.h" +#include "string-util.h" +#include "strv.h" +#include "vmspawn-util.h" + +OvmfConfig* ovmf_config_free(OvmfConfig *config) { + if (!config) + return NULL; + + free(config->path); + free(config->vars); + return mfree(config); +} + +int qemu_check_kvm_support(void) { + if (access("/dev/kvm", F_OK) >= 0) + return true; + if (errno == ENOENT) { + log_debug_errno(errno, "/dev/kvm not found. Not using KVM acceleration."); + return false; + } + if (errno == EPERM) { + log_debug_errno(errno, "Permission denied to access /dev/kvm. Not using KVM acceleration."); + return false; + } + + return -errno; +} + +int qemu_check_vsock_support(void) { + _cleanup_close_ int fd = -EBADF; + /* Just using access() will just check if the device node exists, but not whether a + * device driver is behind it (this is a common case since systemd-tmpfiles creates + * the device node on boot, typically). + * + * Hence we open() the path to see if there's actually something behind. + * + * If not this should return ENODEV. + */ + + fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); + if (fd >= 0) + return true; + if (errno == ENODEV) { + log_debug_errno(errno, "/dev/vhost-vsock device doesn't exist. Not adding a vsock device to the virtual machine."); + return false; + } + if (errno == EPERM) { + log_debug_errno(errno, "Permission denied to access /dev/vhost-vsock. 
Not adding a vsock device to the virtual machine."); + return false; + } + + return -errno; +} + +/* holds the data retrieved from the QEMU firmware interop JSON data */ +typedef struct FirmwareData { + char **features; + char *firmware; + char *vars; +} FirmwareData; + +static FirmwareData* firmware_data_free(FirmwareData *fwd) { + if (!fwd) + return NULL; + + fwd->features = strv_free(fwd->features); + fwd->firmware = mfree(fwd->firmware); + fwd->vars = mfree(fwd->vars); + + return mfree(fwd); +} +DEFINE_TRIVIAL_CLEANUP_FUNC(FirmwareData*, firmware_data_free); + +static int firmware_executable(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + static const JsonDispatch table[] = { + { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, firmware), JSON_MANDATORY }, + { "format", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + {} + }; + + return json_dispatch(v, table, 0, userdata); +} + +static int firmware_nvram_template(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + static const JsonDispatch table[] = { + { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, vars), JSON_MANDATORY }, + { "format", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + {} + }; + + return json_dispatch(v, table, 0, userdata); +} + +static int firmware_mapping(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + static const JsonDispatch table[] = { + { "device", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "executable", JSON_VARIANT_OBJECT, firmware_executable, 0, JSON_MANDATORY }, + { "nvram-template", JSON_VARIANT_OBJECT, firmware_nvram_template, 0, JSON_MANDATORY }, + {} + }; + + return json_dispatch(v, table, 0, userdata); +} + +int find_ovmf_config(int search_sb, OvmfConfig **ret) { + _cleanup_(ovmf_config_freep) OvmfConfig *config = NULL; + _cleanup_free_ char *user_firmware_dir = NULL; + _cleanup_strv_free_ char **conf_files = NULL; + 
int r; + + /* Search in: + * - $XDG_CONFIG_HOME/qemu/firmware + * - /etc/qemu/firmware + * - /usr/share/qemu/firmware + * + * Prioritising entries in "more specific" directories + */ + + r = xdg_user_config_dir(&user_firmware_dir, "/qemu/firmware"); + if (r < 0) + return r; + + r = conf_files_list_strv(&conf_files, ".json", NULL, CONF_FILES_FILTER_MASKED|CONF_FILES_REGULAR, + STRV_MAKE_CONST(user_firmware_dir, "/etc/qemu/firmware", "/usr/share/qemu/firmware")); + if (r < 0) + return log_debug_errno(r, "Failed to list config files: %m"); + + STRV_FOREACH(file, conf_files) { + _cleanup_(firmware_data_freep) FirmwareData *fwd = NULL; + _cleanup_(json_variant_unrefp) JsonVariant *config_json = NULL; + _cleanup_free_ char *contents = NULL; + size_t contents_sz = 0; + + r = read_full_file(*file, &contents, &contents_sz); + if (r == -ENOMEM) + return r; + if (r < 0) { + log_debug_errno(r, "Failed to read contents of %s - ignoring: %m", *file); + continue; + } + + r = json_parse(contents, 0, &config_json, NULL, NULL); + if (r == -ENOMEM) + return r; + if (r < 0) { + log_debug_errno(r, "Failed to parse the JSON in %s - ignoring: %m", *file); + continue; + } + + static const JsonDispatch table[] = { + { "description", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "interface-types", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, + { "mapping", JSON_VARIANT_OBJECT, firmware_mapping, 0, JSON_MANDATORY }, + { "targets", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, + { "features", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(FirmwareData, features), JSON_MANDATORY }, + { "tags", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, + {} + }; + + fwd = new0(FirmwareData, 1); + if (!fwd) + return -ENOMEM; + + r = json_dispatch(config_json, table, 0, fwd); + if (r == -ENOMEM) + return r; + if (r < 0) { + log_debug_errno(r, "Failed to extract the required fields from the JSON in %s - ignoring: %m", *file); + continue; + } + + int sb_present = !!strv_find(fwd->features, 
"secure-boot"); + + /* exclude firmware which doesn't match our Secure Boot requirements */ + if (search_sb >= 0 && search_sb != sb_present) { + log_debug("Skipping %s, firmware doesn't fit required Secure Boot configuration", *file); + continue; + } + + config = new0(OvmfConfig, 1); + if (!config) + return -ENOMEM; + + config->path = TAKE_PTR(fwd->firmware); + config->vars = TAKE_PTR(fwd->vars); + config->supports_sb = sb_present; + break; + } + + if (!config) + return -ENOENT; + + if (ret) + *ret = TAKE_PTR(config); + + return 0; +} + +int find_qemu_binary(char **ret_qemu_binary) { + int r; + + /* + * On success the path to the qemu binary will be stored in `req_qemu_binary` + * + * If the qemu binary cannot be found -ENOENT will be returned. + * If the native architecture is not supported by qemu -EOPNOTSUPP will be returned; + */ + + static const char *architecture_to_qemu_table[_ARCHITECTURE_MAX] = { + [ARCHITECTURE_ARM64] = "aarch64", /* differs from our name */ + [ARCHITECTURE_ARM] = "arm", + [ARCHITECTURE_ALPHA] = "alpha", + [ARCHITECTURE_X86_64] = "x86_64", /* differs from our name */ + [ARCHITECTURE_X86] = "i386", /* differs from our name */ + [ARCHITECTURE_LOONGARCH64] = "loongarch64", + [ARCHITECTURE_MIPS64_LE] = "mips", /* differs from our name */ + [ARCHITECTURE_MIPS_LE] = "mips", /* differs from our name */ + [ARCHITECTURE_PARISC] = "hppa", /* differs from our name */ + [ARCHITECTURE_PPC64_LE] = "ppc", /* differs from our name */ + [ARCHITECTURE_PPC64] = "ppc", /* differs from our name */ + [ARCHITECTURE_PPC] = "ppc", + [ARCHITECTURE_RISCV32] = "riscv32", + [ARCHITECTURE_RISCV64] = "riscv64", + [ARCHITECTURE_S390X] = "s390x", + }; + + FOREACH_STRING(s, "qemu", "qemu-kvm") { + r = find_executable(s, ret_qemu_binary); + if (r == 0) + return 0; + + if (r != -ENOENT) + return r; + } + + const char *arch_qemu = architecture_to_qemu_table[native_architecture()]; + if (!arch_qemu) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Architecture %s not 
supported by qemu", architecture_to_string(native_architecture())); + + _cleanup_free_ char *qemu_arch_specific = NULL; + qemu_arch_specific = strjoin("qemu-system-", arch_qemu); + if (!qemu_arch_specific) + return -ENOMEM; + + return find_executable(qemu_arch_specific, ret_qemu_binary); +} + +int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock) { + /* this is an arbitrary value picked from /dev/urandom */ + static const uint8_t sip_key[HASH_KEY_SIZE] = { + 0x03, 0xad, 0xf0, 0xa4, + 0x59, 0x2c, 0x77, 0x11, + 0xda, 0x39, 0x0c, 0xba, + 0xf5, 0x4c, 0x80, 0x52 + }; + struct siphash machine_hash_state, state; + _cleanup_close_ int vfd = -EBADF; + int r; + + /* uint64_t is required here for the ioctl call, but valid CIDs are only 32 bits */ + uint64_t cid = *ASSERT_PTR(machine_cid); + + assert(machine); + assert(ret_child_sock); + + /* Fix the CID of the AF_VSOCK socket passed to qemu + * + * If the user has passed us a CID (machine_cid != VMADDR_CID_ANY), then attempt to bind to that CID + * and error if we cannot. + * + * Otherwise hash the machine name to get a random CID and attempt to bind to that. + * If it is occupied add more information into the hash and try again. + * If after 64 attempts this hasn't worked fallback to truly random CIDs. + * If after another 64 attempts this hasn't worked then give up and return EADDRNOTAVAIL. 
+ */ + + /* remove O_CLOEXEC before this fd is passed to QEMU */ + vfd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); + if (vfd < 0) + return log_debug_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m"); + + if (cid != VMADDR_CID_ANY) { + r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + if (r < 0) + return log_debug_errno(errno, "Failed to set CID for child vsock with user provided CID %" PRIu64 ": %m", cid); + *ret_child_sock = TAKE_FD(vfd); + return 0; + } + + siphash24_init(&machine_hash_state, sip_key); + siphash24_compress_string(machine, &machine_hash_state); + for (unsigned i = 0; i < 64; i++) { + state = machine_hash_state; + siphash24_compress_safe(&i, sizeof i, &state); + uint64_t hash = siphash24_finalize(&state); + + cid = 3 + (hash % (UINT_MAX - 4)); + r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + if (r >= 0) { + *machine_cid = cid; + *ret_child_sock = TAKE_FD(vfd); + return 0; + } + if (errno != EADDRINUSE) + return -errno; + } + + for (unsigned i = 0; i < 64; i++) { + cid = 3 + random_u64_range(UINT_MAX - 4); + r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + if (r >= 0) { + *machine_cid = cid; + *ret_child_sock = TAKE_FD(vfd); + return 0; + } + + if (errno != EADDRINUSE) + return -errno; + } + + return log_debug_errno(SYNTHETIC_ERRNO(EADDRNOTAVAIL), "Failed to assign a CID to the guest vsock"); +} diff --git a/src/vmspawn/vmspawn-util.h b/src/vmspawn/vmspawn-util.h new file mode 100644 index 0000000..53ad7dd --- /dev/null +++ b/src/vmspawn/vmspawn-util.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include "macro.h" + +#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__) +#define ARCHITECTURE_SUPPORTS_SMBIOS 1 +#else +#define ARCHITECTURE_SUPPORTS_SMBIOS 0 +#endif + +typedef struct OvmfConfig { + char *path; + char *vars; + bool supports_sb; +} OvmfConfig; + +OvmfConfig* ovmf_config_free(OvmfConfig *ovmf_config);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OvmfConfig*, ovmf_config_free); + +int qemu_check_kvm_support(void); +int qemu_check_vsock_support(void); +int find_ovmf_config(int search_sb, OvmfConfig **ret_ovmf_config); +int find_qemu_binary(char **ret_qemu_binary); +int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock); diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c new file mode 100644 index 0000000..ebae681 --- /dev/null +++ b/src/vmspawn/vmspawn.c @@ -0,0 +1,766 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "architecture.h" +#include "build.h" +#include "common-signal.h" +#include "copy.h" +#include "creds-util.h" +#include "escape.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "log.h" +#include "machine-credential.h" +#include "main-func.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "sd-event.h" +#include "signal-util.h" +#include "socket-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "vmspawn-settings.h" +#include "vmspawn-util.h" + +static PagerFlags arg_pager_flags = 0; +static char *arg_image = NULL; +static char *arg_machine = NULL; +static char *arg_qemu_smp = NULL; +static uint64_t arg_qemu_mem = 2ULL * 1024ULL * 1024ULL * 1024ULL; +static int arg_qemu_kvm = -1; +static int arg_qemu_vsock = -1; +static uint64_t arg_vsock_cid = UINT64_MAX; +static bool arg_qemu_gui = false; +static int arg_secure_boot = -1; +static MachineCredential *arg_credentials = NULL; +static size_t arg_n_credentials = 0; +static SettingsMask arg_settings_mask = 0; +static char **arg_parameters = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); +STATIC_DESTRUCTOR_REGISTER(arg_machine, freep); +STATIC_DESTRUCTOR_REGISTER(arg_qemu_smp, 
freep); +STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("systemd-vmspawn", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] [ARGUMENTS...]\n\n" + "%5$sSpawn a command or OS in a virtual machine.%6$s\n\n" + " -h --help Show this help\n" + " --version Print version string\n" + " --no-pager Do not pipe output into a pager\n\n" + "%3$sImage:%4$s\n" + " -i --image=PATH Root file system disk image (or device node) for\n" + " the virtual machine\n\n" + "%3$sHost Configuration:%4$s\n" + " --qemu-smp=SMP Configure guest's SMP settings\n" + " --qemu-mem=MEM Configure guest's RAM size\n" + " --qemu-kvm=BOOL Configure whether to use KVM or not\n" + " --qemu-vsock=BOOL Configure whether to use qemu with a vsock or not\n" + " --vsock-cid= Specify the CID to use for the qemu guest's vsock\n" + " --qemu-gui Start QEMU in graphical mode\n" + " --secure-boot=BOOL Configure whether to search for firmware which\n" + " supports Secure Boot\n\n" + "%3$sSystem Identity:%4$s\n" + " -M --machine=NAME Set the machine name for the container\n" + "%3$sCredentials:%4$s\n" + " --set-credential=ID:VALUE\n" + " Pass a credential with literal value to container.\n" + " --load-credential=ID:PATH\n" + " Load credential to pass to container from file or\n" + " AF_UNIX stream socket.\n" + "\nSee the %2$s for details.\n", + program_invocation_short_name, + link, + ansi_underline(), + ansi_normal(), + ansi_highlight(), + ansi_normal()); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + ARG_QEMU_SMP, + ARG_QEMU_MEM, + ARG_QEMU_KVM, + ARG_QEMU_VSOCK, + ARG_VSOCK_CID, + ARG_QEMU_GUI, + ARG_SECURE_BOOT, + ARG_SET_CREDENTIAL, + ARG_LOAD_CREDENTIAL, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 
ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "image", required_argument, NULL, 'i' }, + { "machine", required_argument, NULL, 'M' }, + { "qemu-smp", required_argument, NULL, ARG_QEMU_SMP }, + { "qemu-mem", required_argument, NULL, ARG_QEMU_MEM }, + { "qemu-kvm", required_argument, NULL, ARG_QEMU_KVM }, + { "qemu-vsock", required_argument, NULL, ARG_QEMU_VSOCK }, + { "vsock-cid", required_argument, NULL, ARG_VSOCK_CID }, + { "qemu-gui", no_argument, NULL, ARG_QEMU_GUI }, + { "secure-boot", required_argument, NULL, ARG_SECURE_BOOT }, + { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL }, + { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + optind = 0; + while ((c = getopt_long(argc, argv, "+hi:M:", options, NULL)) >= 0) /* -M takes an argument, like --machine= */ + switch (c) { + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case 'i': + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + + case 'M': + if (isempty(optarg)) + arg_machine = mfree(arg_machine); + else { + if (!hostname_is_valid(optarg, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Invalid machine name: %s", optarg); + + r = free_and_strdup(&arg_machine, optarg); + if (r < 0) + return log_oom(); + } + break; + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case ARG_QEMU_SMP: + r = free_and_strdup_warn(&arg_qemu_smp, optarg); + if (r < 0) + return r; + break; + + case ARG_QEMU_MEM: + r = parse_size(optarg, 1024, &arg_qemu_mem); + if (r < 0) + return log_error_errno(r, "Failed to parse --qemu-mem=%s: %m", optarg); + break; + + case ARG_QEMU_KVM: + r = parse_tristate(optarg, &arg_qemu_kvm); + if (r < 0) + return log_error_errno(r, "Failed to parse --qemu-kvm=%s: %m", optarg); + break; + + case ARG_QEMU_VSOCK: + r = parse_tristate(optarg, &arg_qemu_vsock); + if (r < 0) +
return log_error_errno(r, "Failed to parse --qemu-vsock=%s: %m", optarg); + break; + + case ARG_VSOCK_CID: { + unsigned cid; + if (isempty(optarg)) + cid = VMADDR_CID_ANY; + else { + r = safe_atou_bounded(optarg, 3, UINT_MAX - 1, &cid); + if (r == -ERANGE) + return log_error_errno(r, "Invalid value for --vsock-cid=: %m"); + if (r < 0) + return log_error_errno(r, "Failed to parse --vsock-cid=%s: %m", optarg); + } + arg_vsock_cid = (uint64_t)cid; + break; + } + + case ARG_QEMU_GUI: + arg_qemu_gui = true; + break; + + case ARG_SECURE_BOOT: + r = parse_tristate(optarg, &arg_secure_boot); + if (r < 0) + return log_error_errno(r, "Failed to parse --secure-boot=%s: %m", optarg); + break; + + case ARG_SET_CREDENTIAL: { + r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg); + if (r < 0) + return r; + arg_settings_mask |= SETTING_CREDENTIALS; + break; + } + + case ARG_LOAD_CREDENTIAL: { + r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_CREDENTIALS; + break; + } + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (argc > optind) { + strv_free(arg_parameters); + arg_parameters = strv_copy(argv + optind); + if (!arg_parameters) + return log_oom(); + + arg_settings_mask |= SETTING_START_MODE; + } + + return 1; +} + +static int open_vsock(void) { + _cleanup_close_ int vsock_fd = -EBADF; + int r; + static const union sockaddr_union bind_addr = { + .vm.svm_family = AF_VSOCK, + .vm.svm_cid = VMADDR_CID_ANY, + .vm.svm_port = VMADDR_PORT_ANY, + }; + + vsock_fd = socket(AF_VSOCK, SOCK_STREAM|SOCK_CLOEXEC, 0); + if (vsock_fd < 0) + return log_error_errno(errno, "Failed to open AF_VSOCK socket: %m"); + + r = bind(vsock_fd, &bind_addr.sa, sizeof(bind_addr.vm)); + if (r < 0) + return log_error_errno(errno, "Failed to bind to vsock to address %u:%u: %m", bind_addr.vm.svm_cid, bind_addr.vm.svm_port); + + r = listen(vsock_fd, SOMAXCONN_DELUXE); + if (r < 0) + 
return log_error_errno(errno, "Failed to listen on vsock: %m"); + + return TAKE_FD(vsock_fd); +} + +static int vmspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + char buf[NOTIFY_BUFFER_MAX+1]; + const char *p = NULL; + struct iovec iovec = { + .iov_base = buf, + .iov_len = sizeof(buf)-1, + }; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + }; + ssize_t n; + _cleanup_strv_free_ char **tags = NULL; + int r, *exit_status = ASSERT_PTR(userdata); + + n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; + if (n == -EXFULL) { + log_warning_errno(n, "Got message with truncated control data, ignoring: %m"); + return 0; + } + if (n < 0) + return log_warning_errno(n, "Couldn't read notification socket: %m"); + + if ((size_t) n >= sizeof(buf)) { + log_warning("Received notify message exceeded maximum size. Ignoring."); + return 0; + } + + buf[n] = 0; + tags = strv_split(buf, "\n\r"); + if (!tags) + return log_oom(); + + STRV_FOREACH(s, tags) + log_debug("Received tag %s from notify socket", *s); + + if (strv_contains(tags, "READY=1")) { + r = sd_notify(false, "READY=1\n"); + if (r < 0) + log_warning_errno(r, "Failed to send readiness notification, ignoring: %m"); + } + + p = strv_find_startswith(tags, "STATUS="); + if (p) + (void) sd_notifyf(false, "STATUS=VM running: %s", p); + + p = strv_find_startswith(tags, "EXIT_STATUS="); + if (p) { + r = safe_atoi(p, exit_status); + if (r < 0) + log_warning_errno(r, "Failed to parse exit status from %s, ignoring: %m", p); + } + + /* we will only receive one message from each connection so disable this source once one is received */ + source = sd_event_source_disable_unref(source); + + return 0; +} + +static int vmspawn_dispatch_vsock_connections(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + int r; + sd_event *event; + _cleanup_close_ int conn_fd = -EBADF; + + assert(userdata); + + if (revents != EPOLLIN) { + 
log_warning("Got unexpected poll event for vsock fd."); + return 0; + } + + conn_fd = accept4(fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK); + if (conn_fd < 0) { + log_warning_errno(errno, "Failed to accept connection from vsock fd (%m), ignoring..."); + return 0; + } + + event = sd_event_source_get_event(source); + if (!event) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to retrieve event from event source, exiting task"); + + /* add a new floating task to read from the connection */ + r = sd_event_add_io(event, NULL, conn_fd, revents, vmspawn_dispatch_notify_fd, userdata); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify connection event source: %m"); + + /* conn_fd is now owned by the event loop so don't clean it up */ + TAKE_FD(conn_fd); + + return 0; +} + +static int setup_notify_parent(sd_event *event, int fd, int *exit_status, sd_event_source **notify_event_source) { + int r; + + r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, vmspawn_dispatch_vsock_connections, exit_status); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify socket event source: %m"); + + (void) sd_event_source_set_description(*notify_event_source, "vmspawn-notify-sock"); + + return 0; +} + +static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + pid_t pid; + + pid = PTR_TO_PID(userdata); + if (pid > 0) { + /* TODO: actually talk to qemu and ask the guest to shutdown here */ + if (kill(pid, SIGKILL) >= 0) { + log_info("Trying to halt qemu. 
Send SIGTERM again to trigger vmspawn to immediately terminate."); + sd_event_source_set_userdata(s, NULL); + return 0; + } + } + + sd_event_exit(sd_event_source_get_event(s), 0); + return 0; +} + +static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { + sd_event_exit(sd_event_source_get_event(s), 0); + return 0; +} + +static int cmdline_add_vsock(char ***cmdline, int vsock_fd) { + int r; + + r = strv_extend(cmdline, "-smbios"); + if (r < 0) + return r; + + union sockaddr_union addr; + socklen_t addr_len = sizeof addr.vm; + r = getsockname(vsock_fd, &addr.sa, &addr_len); + if (r < 0) + return -errno; + assert(addr_len >= sizeof addr.vm); + assert(addr.vm.svm_family == AF_VSOCK); + + log_info("Using vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port); + r = strv_extendf(cmdline, "type=11,value=io.systemd.credential:vmm.notify_socket=vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port); + if (r < 0) + return r; + + return 0; +} + +static int run_virtual_machine(void) { + _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL; + _cleanup_strv_free_ char **cmdline = NULL; + _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL; + int r; + _cleanup_close_ int vsock_fd = -EBADF; + + bool use_kvm = arg_qemu_kvm > 0; + if (arg_qemu_kvm < 0) { + r = qemu_check_kvm_support(); + if (r < 0) + return log_error_errno(r, "Failed to check for KVM support: %m"); + use_kvm = r; + } + + r = find_ovmf_config(arg_secure_boot, &ovmf_config); + if (r < 0) + return log_error_errno(r, "Failed to find OVMF config: %m"); + + /* only warn if the user hasn't disabled secureboot */ + if (!ovmf_config->supports_sb && arg_secure_boot) + log_warning("Couldn't find OVMF firmware blob with Secure Boot support, " + "falling back to OVMF firmware blobs without Secure Boot support."); + + const char *accel = use_kvm ? 
"kvm" : "tcg"; + if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE)) + machine = strjoin("type=virt,accel=", accel); + else + machine = strjoin("type=q35,accel=", accel, ",smm=", on_off(ovmf_config->supports_sb)); + if (!machine) + return log_oom(); + + r = find_qemu_binary(&qemu_binary); + if (r == -EOPNOTSUPP) + return log_error_errno(r, "Native architecture is not supported by qemu."); + if (r < 0) + return log_error_errno(r, "Failed to find QEMU binary: %m"); + + if (asprintf(&mem, "%.4fM", (double)arg_qemu_mem / (1024.0 * 1024.0)) < 0) + return log_oom(); + + cmdline = strv_new( + qemu_binary, + "-machine", machine, + "-smp", arg_qemu_smp ?: "1", + "-m", mem, + "-object", "rng-random,filename=/dev/urandom,id=rng0", + "-device", "virtio-rng-pci,rng=rng0,id=rng-device0", + "-nic", "user,model=virtio-net-pci" + ); + if (!cmdline) + return log_oom(); + + bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS; + if (arg_qemu_vsock < 0) { + r = qemu_check_vsock_support(); + if (r < 0) + return log_error_errno(r, "Failed to check for VSock support: %m"); + + use_vsock = r; + } + + unsigned child_cid = VMADDR_CID_ANY; + _cleanup_close_ int child_vsock_fd = -EBADF; + if (use_vsock) { + if (arg_vsock_cid < UINT_MAX) + child_cid = (unsigned)arg_vsock_cid; + + r = vsock_fix_child_cid(&child_cid, arg_machine, &child_vsock_fd); + if (r < 0) + return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m"); + + r = strv_extend(&cmdline, "-device"); + if (r < 0) + return log_oom(); + + log_debug("vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd); + r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd); + if (r < 0) + return log_oom(); + } + + r = strv_extend_strv(&cmdline, STRV_MAKE("-cpu", "max"), /* filter_duplicates= */ false); + if (r < 0) + return log_oom(); + + if (arg_qemu_gui) { + r = strv_extend_strv(&cmdline, STRV_MAKE("-vga", "virtio"), /* 
filter_duplicates= */ false); + if (r < 0) + return log_oom(); + } else { + r = strv_extend_strv(&cmdline, STRV_MAKE( + "-nographic", + "-nodefaults", + "-chardev", "stdio,mux=on,id=console,signal=off", + "-serial", "chardev:console", + "-mon", "console" + ), /* filter_duplicates= */ false); + if (r < 0) + return log_oom(); + } + + if (ARCHITECTURE_SUPPORTS_SMBIOS) { + ssize_t n; + FOREACH_ARRAY(cred, arg_credentials, arg_n_credentials) { + _cleanup_free_ char *cred_data_b64 = NULL; + + n = base64mem(cred->data, cred->size, &cred_data_b64); + if (n < 0) + return log_oom(); + + r = strv_extend(&cmdline, "-smbios"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "type=11,value=io.systemd.credential.binary:%s=%s", cred->id, cred_data_b64); + if (r < 0) + return log_oom(); + } + } + + r = strv_extend(&cmdline, "-drive"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "if=pflash,format=raw,readonly=on,file=%s", ovmf_config->path); + if (r < 0) + return log_oom(); + + _cleanup_(unlink_and_freep) char *ovmf_vars_to = NULL; + if (ovmf_config->supports_sb) { + const char *ovmf_vars_from = ovmf_config->vars; + _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF; + + r = tempfn_random_child(NULL, "vmspawn-", &ovmf_vars_to); + if (r < 0) + return r; + + source_fd = open(ovmf_vars_from, O_RDONLY|O_CLOEXEC); + if (source_fd < 0) /* open() only returns -1; the actual cause is in errno */ + return log_error_errno(errno, "Failed to open OVMF vars file %s: %m", ovmf_vars_from); + + target_fd = open(ovmf_vars_to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600); + if (target_fd < 0) + return log_error_errno(errno, "Failed to create regular file for OVMF vars at %s: %m", ovmf_vars_to); + + r = copy_bytes(source_fd, target_fd, UINT64_MAX, COPY_REFLINK); + if (r < 0) + return log_error_errno(r, "Failed to copy bytes from %s to %s: %m", ovmf_vars_from, ovmf_vars_to); + + /* These aren't always available so don't raise an error if they fail */ + (void) copy_xattr(source_fd, NULL, target_fd, NULL, 0); + (void)
copy_access(source_fd, target_fd); + (void) copy_times(source_fd, target_fd, 0); + + r = strv_extend_strv(&cmdline, STRV_MAKE( + "-global", "ICH9-LPC.disable_s3=1", + "-global", "driver=cfi.pflash01,property=secure,value=on", + "-drive" + ), /* filter_duplicates= */ false); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "file=%s,if=pflash,format=raw", ovmf_vars_to); + if (r < 0) + return log_oom(); + } + + r = strv_extend(&cmdline, "-drive"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "if=none,id=mkosi,file=%s,format=raw", arg_image); + if (r < 0) + return log_oom(); + + r = strv_extend_strv(&cmdline, STRV_MAKE( + "-device", "virtio-scsi-pci,id=scsi", + "-device", "scsi-hd,drive=mkosi,bootindex=1" + ), /* filter_duplicates= */ false); + if (r < 0) + return log_oom(); + + if (!strv_isempty(arg_parameters)) { + if (ARCHITECTURE_SUPPORTS_SMBIOS) { + _cleanup_free_ char *kcl = strv_join(arg_parameters, " "); + if (!kcl) + return log_oom(); + + r = strv_extend(&cmdline, "-smbios"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "type=11,value=io.systemd.stub.kernel-cmdline-extra=%s", kcl); + if (r < 0) + return log_oom(); + } else + log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS"); + } + + if (use_vsock) { + vsock_fd = open_vsock(); + if (vsock_fd < 0) + return log_error_errno(vsock_fd, "Failed to open vsock: %m"); + + r = cmdline_add_vsock(&cmdline, vsock_fd); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to call getsockname on vsock: %m"); + } + + _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + r = sd_event_new(&event); + if (r < 0) + return log_error_errno(r, "Failed to get default event source: %m"); + + (void) sd_event_set_watchdog(event, true); + + pid_t child_pid; + r = safe_fork_full( + qemu_binary, + NULL, + &child_vsock_fd, 1, 
/* pass the vsock fd to qemu */ + FORK_CLOEXEC_OFF, + &child_pid); + if (r < 0) + return log_error_errno(r, "Failed to fork off %s: %m", qemu_binary); + if (r == 0) { + /* set TERM and LANG if they are missing */ + if (setenv("TERM", "vt220", 0) < 0) + return log_oom(); + + if (setenv("LANG", "C.UTF-8", 0) < 0) + return log_oom(); + + execve(qemu_binary, cmdline, environ); + log_error_errno(errno, "Failed to execve %s: %m", qemu_binary); + _exit(EXIT_FAILURE); + } + + + int exit_status = INT_MAX; + if (use_vsock) { + r = setup_notify_parent(event, vsock_fd, &exit_status, &notify_event_source); + if (r < 0) + return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m"); + } + + /* shutdown qemu when we are shutdown */ + (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(child_pid)); + (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(child_pid)); + + (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + + /* Exit when the child exits */ + (void) sd_event_add_child(event, NULL, child_pid, WEXITED, on_child_exit, NULL); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + if (use_vsock) { + if (exit_status == INT_MAX) { + log_debug("Couldn't retrieve inner EXIT_STATUS from vsock"); + return EXIT_SUCCESS; + } + if (exit_status != 0) + log_warning("Non-zero exit code received: %d", exit_status); + return exit_status; + } + + return 0; +} + +static int determine_names(void) { + int r; + + if (!arg_image) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing required argument -i/--image=, quitting"); + + if (!arg_machine) { + char *e; + + r = path_extract_filename(arg_image, &arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image); + + /* Truncate suffix if there is one */ + e = endswith(arg_machine, ".raw"); + if (e) + *e = 0; + +
hostname_cleanup(arg_machine); + if (!hostname_is_valid(arg_machine, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M."); + } + + return 0; +} + +static int run(int argc, char *argv[]) { + int r, ret = EXIT_SUCCESS; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + goto finish; + + r = determine_names(); + if (r < 0) + goto finish; + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + + r = run_virtual_machine(); + if (r > 0) + ret = r; +finish: + machine_credential_free_all(arg_credentials, arg_n_credentials); + + if (r < 0) + return r; + + return ret; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/volatile-root/meson.build b/src/volatile-root/meson.build new file mode 100644 index 0000000..83e3628 --- /dev/null +++ b/src/volatile-root/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-volatile-root', + 'conditions' : ['ENABLE_INITRD'], + 'sources' : files('volatile-root.c'), + }, +] diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c new file mode 100644 index 0000000..27be7bd --- /dev/null +++ b/src/volatile-root/volatile-root.c @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/mount.h> + +#include "alloc-util.h" +#include "blockdev-util.h" +#include "chase.h" +#include "devnum-util.h" +#include "escape.h" +#include "main-func.h" +#include "mkdir.h" +#include "mount-util.h" +#include "mountpoint-util.h" +#include "path-util.h" +#include "string-util.h" +#include "volatile-util.h" + +static int make_volatile(const char *path) { + _cleanup_free_ char *old_usr = NULL; + int r; + + assert(path); + + r = chase("/usr", path, CHASE_PREFIX_ROOT, &old_usr, NULL); + if (r < 0) + return log_error_errno(r, "/usr not available in old root: %m"); + + r =
mkdir_p("/run/systemd/volatile-sysroot", 0700); + if (r < 0) + return log_error_errno(r, "Couldn't generate volatile sysroot directory: %m"); + + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=0755" TMPFS_LIMITS_ROOTFS); + if (r < 0) + goto finish_rmdir; + + if (mkdir("/run/systemd/volatile-sysroot/usr", 0755) < 0) { + r = log_error_errno(errno, "Failed to create /usr directory: %m"); + goto finish_umount; + } + + r = mount_nofollow_verbose(LOG_ERR, old_usr, "/run/systemd/volatile-sysroot/usr", NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + goto finish_umount; + + r = bind_remount_recursive("/run/systemd/volatile-sysroot/usr", MS_RDONLY, MS_RDONLY, NULL); + if (r < 0) { + log_error_errno(r, "Failed to remount /usr read-only: %m"); + goto finish_umount; + } + + r = umount_recursive(path, 0); + if (r < 0) { + log_error_errno(r, "Failed to unmount %s: %m", path); + goto finish_umount; + } + + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) + log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC, ignoring: %m", path); + + r = mount_nofollow_verbose(LOG_ERR, "/run/systemd/volatile-sysroot", path, NULL, MS_MOVE, NULL); + +finish_umount: + (void) umount_recursive("/run/systemd/volatile-sysroot", 0); + +finish_rmdir: + (void) rmdir("/run/systemd/volatile-sysroot"); + + return r; +} + +static int make_overlay(const char *path) { + _cleanup_free_ char *escaped_path = NULL; + bool tmpfs_mounted = false; + const char *options = NULL; + int r; + + assert(path); + + r = mkdir_p("/run/systemd/overlay-sysroot", 0700); + if (r < 0) + return log_error_errno(r, "Couldn't create overlay sysroot directory: %m"); + + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=0755" TMPFS_LIMITS_ROOTFS); + if (r < 0) + goto finish; + + tmpfs_mounted = true; + + if (mkdir("/run/systemd/overlay-sysroot/upper", 0755) < 0) { + r = log_error_errno(errno, "Failed to create 
/run/systemd/overlay-sysroot/upper: %m"); + goto finish; + } + + if (mkdir("/run/systemd/overlay-sysroot/work", 0755) < 0) { + r = log_error_errno(errno, "Failed to create /run/systemd/overlay-sysroot/work: %m"); + goto finish; + } + + escaped_path = shell_escape(path, ",:"); + if (!escaped_path) { + r = log_oom(); + goto finish; + } + + options = strjoina("lowerdir=", escaped_path, ",upperdir=/run/systemd/overlay-sysroot/upper,workdir=/run/systemd/overlay-sysroot/work"); + r = mount_nofollow_verbose(LOG_ERR, "overlay", path, "overlay", 0, options); + +finish: + if (tmpfs_mounted) + (void) umount_verbose(LOG_ERR, "/run/systemd/overlay-sysroot", UMOUNT_NOFOLLOW); + + (void) rmdir("/run/systemd/overlay-sysroot"); + return r; +} + +static int run(int argc, char *argv[]) { + VolatileMode m = _VOLATILE_MODE_INVALID; + const char *path; + dev_t devt; + int r; + + log_setup(); + + if (argc > 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Too many arguments. Expected directory and mode."); + + r = query_volatile_mode(&m); + if (r < 0) + return log_error_errno(r, "Failed to determine volatile mode from kernel command line: %m"); + if (r == 0 && argc >= 2) { + /* The kernel command line always wins. However if nothing was set there, the argument passed here wins instead. 
*/ + m = volatile_mode_from_string(argv[1]); + if (m < 0) + return log_error_errno(m, "Couldn't parse volatile mode: %s", argv[1]); + } + + if (argc < 3) + path = "/sysroot"; + else { + path = argv[2]; + + if (isempty(path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Directory name cannot be empty."); + if (!path_is_absolute(path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Directory must be specified as absolute path."); + if (path_equal(path, "/")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Directory cannot be the root directory."); + } + + if (!IN_SET(m, VOLATILE_YES, VOLATILE_OVERLAY)) + return 0; + + r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not a mount point.", path); + + r = path_is_temporary_fs(path); + if (r < 0) + return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path); + if (r > 0) { + log_info("%s already is a temporary file system.", path); + return 0; + } + + /* We are about to replace the root directory with something else. Later code might want to know what we + * replaced here, hence let's save that information as a symlink we can later use. (This is particularly + * relevant for the overlayfs case where we'll fully obstruct the view onto the underlying device, hence + * querying the backing device node from the file system directly is no longer possible. 
*/ + r = get_block_device_harder(path, &devt); + if (r < 0) + return log_error_errno(r, "Failed to determine device major/minor of %s: %m", path); + else if (r > 0) { /* backed by block device */ + _cleanup_free_ char *dn = NULL; + + r = device_path_make_major_minor(S_IFBLK, devt, &dn); + if (r < 0) + return log_error_errno(r, "Failed to format device node path: %m"); + + if (symlink(dn, "/run/systemd/volatile-root") < 0) + log_warning_errno(errno, "Failed to create symlink /run/systemd/volatile-root: %m"); + } + + if (m == VOLATILE_YES) + return make_volatile(path); + else { + assert(m == VOLATILE_OVERLAY); + return make_overlay(path); + } +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/xdg-autostart-generator/fuzz-xdg-desktop.c b/src/xdg-autostart-generator/fuzz-xdg-desktop.c new file mode 100644 index 0000000..9aca797 --- /dev/null +++ b/src/xdg-autostart-generator/fuzz-xdg-desktop.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "fuzz.h" +#include "rm-rf.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "xdg-autostart-service.h" + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/fuzz-xdg-desktop.XXXXXX"; + _cleanup_close_ int fd = -EBADF; + _cleanup_(xdg_autostart_service_freep) XdgAutostartService *service = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tmpdir = NULL; + + if (outside_size_range(size, 0, 65536)) + return 0; + + fuzz_setup_logging(); + + assert_se(mkdtemp_malloc("/tmp/fuzz-xdg-desktop-XXXXXX", &tmpdir) >= 0); + + fd = mkostemp_safe(name); + assert_se(fd >= 0); + assert_se(write(fd, data, size) == (ssize_t) size); + + assert_se(service = xdg_autostart_service_parse_desktop(name)); + assert_se(service->name = strdup("fuzz-xdg-desktop.service")); + (void) xdg_autostart_service_generate_unit(service, tmpdir); + + return 0; +} 
diff --git a/src/xdg-autostart-generator/fuzz-xdg-desktop.options b/src/xdg-autostart-generator/fuzz-xdg-desktop.options new file mode 100644 index 0000000..678d526 --- /dev/null +++ b/src/xdg-autostart-generator/fuzz-xdg-desktop.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/src/xdg-autostart-generator/meson.build b/src/xdg-autostart-generator/meson.build new file mode 100644 index 0000000..c938932 --- /dev/null +++ b/src/xdg-autostart-generator/meson.build @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_xdg_autostart_generator_sources = files( + 'xdg-autostart-generator.c', + 'xdg-autostart-service.c', +) + +executables += [ + executable_template + { + 'name' : 'systemd-xdg-autostart-generator', + 'conditions' : ['ENABLE_XDG_AUTOSTART'], + 'sources' : systemd_xdg_autostart_generator_sources, + 'install_dir' : usergeneratordir, + }, + libexec_template + { + 'name' : 'systemd-xdg-autostart-condition', + 'conditions' : ['ENABLE_XDG_AUTOSTART'], + 'sources' : files('xdg-autostart-condition.c'), + }, + test_template + { + 'sources' : files( + 'test-xdg-autostart.c', + 'xdg-autostart-service.c', + ), + }, + fuzz_template + { + 'sources' : files( + 'fuzz-xdg-desktop.c', + 'xdg-autostart-service.c', + ), + }, +] diff --git a/src/xdg-autostart-generator/test-xdg-autostart.c b/src/xdg-autostart-generator/test-xdg-autostart.c new file mode 100644 index 0000000..81f85d6 --- /dev/null +++ b/src/xdg-autostart-generator/test-xdg-autostart.c @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "fd-util.h" +#include "fs-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "xdg-autostart-service.h" + +TEST(translate_name) { + _cleanup_free_ char *t = NULL; + + assert_se(t = xdg_autostart_service_translate_name("a-b.blub.desktop")); + assert_se(streq(t, "app-a\\x2db.blub@autostart.service")); +} + +static void 
test_xdg_format_exec_start_one(const char *exec, const char *expected) { + _cleanup_free_ char* out = NULL; + + /* Fail here with a clear assertion rather than passing a NULL 'out' to log_info()/streq() below. */ + assert_se(xdg_autostart_format_exec_start(exec, &out) >= 0); + log_info("In: '%s', out: '%s', expected: '%s'", exec, out, expected); + assert_se(streq(out, expected)); +} + +TEST(xdg_format_exec_start) { + _cleanup_free_ char *home = NULL; + _cleanup_free_ char *expected1 = NULL, *expected2 = NULL; + + assert_se(get_home_dir(&home) >= 0); + + test_xdg_format_exec_start_one("/bin/sleep 100", "/bin/sleep 100"); + + /* All standardised % identifiers are stripped. */ + test_xdg_format_exec_start_one("/bin/sleep %f \"%F\" %u %U %d %D\t%n %N %i %c %k %v %m", "/bin/sleep"); + + /* Unknown % identifiers currently remain, but are escaped. */ + test_xdg_format_exec_start_one("/bin/sleep %X \"%Y\"", "/bin/sleep %%X %%Y"); + + test_xdg_format_exec_start_one("/bin/sleep \";\\\"\"", "/bin/sleep \";\\\"\""); + + /* tilde is expanded only if standalone or at the start of a path */ + expected1 = strjoin("/bin/ls ", home); + test_xdg_format_exec_start_one("/bin/ls ~", expected1); + expected2 = strjoin("/bin/ls ", home, "/foo"); + test_xdg_format_exec_start_one("/bin/ls \"~/foo\"", expected2); + test_xdg_format_exec_start_one("/bin/ls ~foo", "/bin/ls ~foo"); + test_xdg_format_exec_start_one("/bin/ls foo~", "/bin/ls foo~"); +} + +static const char* const xdg_desktop_file[] = { + ("[Desktop Entry]\n" + "Exec\t =\t /bin/sleep 100\n" /* Whitespace Before/After = must be ignored */ + "OnlyShowIn = A;B;\n" + "NotShowIn=C;;D\\\\\\;;E\n"), /* "C", "D\;", "E" (the empty entry between ";;" is dropped) */ + + ("[Desktop Entry]\n" + "Exec=a\n" + "Exec=b\n"), + + ("[Desktop Entry]\n" + "Hidden=\t true\n"), + ("[Desktop Entry]\n" + "Hidden=\t True\n"), +}; + +static void test_xdg_desktop_parse_one(unsigned i, const char *s) { + _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-xdg-autostart-parser.XXXXXX"; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_(xdg_autostart_service_freep) XdgAutostartService *service = NULL; + + log_info("== 
%s[%u] ==", __func__, i); + + assert_se(fmkostemp_safe(name, "r+", &f) == 0); + assert_se(fwrite(s, strlen(s), 1, f) == 1); + rewind(f); + + assert_se(service = xdg_autostart_service_parse_desktop(name)); + + switch (i) { + case 0: + assert_se(streq(service->exec_string, "/bin/sleep 100")); + assert_se(strv_equal(service->only_show_in, STRV_MAKE("A", "B"))); + assert_se(strv_equal(service->not_show_in, STRV_MAKE("C", "D\\;", "E"))); + assert_se(!service->hidden); + break; + case 1: + /* The second entry is not permissible and will be ignored (and error logged). */ + assert_se(streq(service->exec_string, "a")); + break; + case 2: + case 3: + assert_se(service->hidden); + break; + } +} + +TEST(xdg_desktop_parse) { + for (size_t i = 0; i < ELEMENTSOF(xdg_desktop_file); i++) + test_xdg_desktop_parse_one(i, xdg_desktop_file[i]); +} + +DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/src/xdg-autostart-generator/xdg-autostart-condition.c b/src/xdg-autostart-generator/xdg-autostart-condition.c new file mode 100644 index 0000000..9ceea61 --- /dev/null +++ b/src/xdg-autostart-generator/xdg-autostart-condition.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "main-func.h" +#include "strv.h" + +/* + * This binary is intended to be run as an ExecCondition= in units generated + * by the xdg-autostart-generator. It does the appropriate checks against + * XDG_CURRENT_DESKTOP that are too advanced for simple ConditionEnvironment= + * matches. + */ + +static int run(int argc, char *argv[]) { + _cleanup_strv_free_ char **only_show_in = NULL, **not_show_in = NULL, **desktops = NULL; + const char *xdg_current_desktop; + + if (argc != 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "Wrong argument count. 
Expected the OnlyShowIn= and NotShowIn= sets, each colon separated."); + + xdg_current_desktop = getenv("XDG_CURRENT_DESKTOP"); + if (xdg_current_desktop) { + desktops = strv_split(xdg_current_desktop, ":"); + if (!desktops) + return log_oom(); + } + + only_show_in = strv_split(argv[1], ":"); + not_show_in = strv_split(argv[2], ":"); + if (!only_show_in || !not_show_in) + return log_oom(); + + /* Each desktop in XDG_CURRENT_DESKTOP needs to be matched in order. */ + STRV_FOREACH(d, desktops) { + if (strv_contains(only_show_in, *d)) + return 0; + if (strv_contains(not_show_in, *d)) + return 1; + } + + /* non-zero exit code when only_show_in has a proper value */ + return !strv_isempty(only_show_in); +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/src/xdg-autostart-generator/xdg-autostart-generator.c b/src/xdg-autostart-generator/xdg-autostart-generator.c new file mode 100644 index 0000000..616c017 --- /dev/null +++ b/src/xdg-autostart-generator/xdg-autostart-generator.c @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "dirent-util.h" +#include "fd-util.h" +#include "generator.h" +#include "glyph-util.h" +#include "hashmap.h" +#include "log.h" +#include "main-func.h" +#include "nulstr-util.h" +#include "path-lookup.h" +#include "stat-util.h" +#include "string-util.h" +#include "strv.h" +#include "xdg-autostart-service.h" + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(xdgautostartservice_hash_ops, char, string_hash_func, string_compare_func, XdgAutostartService, xdg_autostart_service_free); + +static int enumerate_xdg_autostart(Hashmap *all_services) { + _cleanup_strv_free_ char **autostart_dirs = NULL; + _cleanup_strv_free_ char **config_dirs = NULL; + _unused_ _cleanup_strv_free_ char **data_dirs = NULL; + _cleanup_free_ char *user_config_autostart_dir = NULL; + int r; + + r = xdg_user_config_dir(&user_config_autostart_dir, "/autostart"); + if (r < 0) + return r; + r = 
strv_extend(&autostart_dirs, user_config_autostart_dir); + if (r < 0) + return r; + + r = xdg_user_dirs(&config_dirs, &data_dirs); + if (r < 0) + return r; + r = strv_extend_strv_concat(&autostart_dirs, config_dirs, "/autostart"); + if (r < 0) + return r; + + STRV_FOREACH(path, autostart_dirs) { + _cleanup_closedir_ DIR *d = NULL; + + log_debug("Scanning autostart directory \"%s\"%s", *path, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + d = opendir(*path); + if (!d) { + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Opening %s failed, ignoring: %m", *path); + continue; + } + + FOREACH_DIRENT(de, d, log_warning_errno(errno, "Failed to enumerate directory %s, ignoring: %m", *path)) { + struct stat st; + if (fstatat(dirfd(d), de->d_name, &st, 0) < 0) { + log_warning_errno(errno, "%s/%s: stat() failed, ignoring: %m", *path, de->d_name); + continue; + } + + if (!S_ISREG(st.st_mode)) { + log_debug("%s/%s: not a regular file, ignoring.", *path, de->d_name); + continue; + } + + _cleanup_free_ char *name = xdg_autostart_service_translate_name(de->d_name); + if (!name) + return log_oom(); + + if (hashmap_contains(all_services, name)) { + log_debug("%s/%s: we have already seen \"%s\", ignoring.", + *path, de->d_name, name); + continue; + } + + _cleanup_free_ char *fpath = path_join(*path, de->d_name); + if (!fpath) + return log_oom(); + + _cleanup_(xdg_autostart_service_freep) XdgAutostartService *service = + xdg_autostart_service_parse_desktop(fpath); + if (!service) + return log_oom(); + service->name = TAKE_PTR(name); + + r = hashmap_put(all_services, service->name, service); + if (r < 0) + return log_oom(); + TAKE_PTR(service); + } + } + + return 0; +} + +static int run(const char *dest, const char *dest_early, const char *dest_late) { + _cleanup_hashmap_free_ Hashmap *all_services = NULL; + XdgAutostartService *service; + int r; + + assert_se(dest_late); + + all_services = hashmap_new(&xdgautostartservice_hash_ops); + if (!all_services) + return 
log_oom(); + + r = enumerate_xdg_autostart(all_services); + if (r < 0) + return r; + + HASHMAP_FOREACH(service, all_services) + (void) xdg_autostart_service_generate_unit(service, dest_late); + + return 0; +} + +DEFINE_MAIN_GENERATOR_FUNCTION(run); diff --git a/src/xdg-autostart-generator/xdg-autostart-service.c b/src/xdg-autostart-generator/xdg-autostart-service.c new file mode 100644 index 0000000..480d100 --- /dev/null +++ b/src/xdg-autostart-generator/xdg-autostart-service.c @@ -0,0 +1,695 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#include "xdg-autostart-service.h" + +#include "conf-parser.h" +#include "escape.h" +#include "fd-util.h" +#include "fileio.h" +#include "generator.h" +#include "log.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "specifier.h" +#include "string-util.h" +#include "strv.h" +#include "unit-name.h" +#include "user-util.h" + +XdgAutostartService* xdg_autostart_service_free(XdgAutostartService *s) { + if (!s) + return NULL; + + free(s->name); + free(s->path); + free(s->description); + + free(s->type); + free(s->exec_string); + free(s->working_directory); + + strv_free(s->only_show_in); + strv_free(s->not_show_in); + + free(s->try_exec); + free(s->autostart_condition); + free(s->kde_autostart_condition); + + free(s->gnome_autostart_phase); + + return mfree(s); +} + +char *xdg_autostart_service_translate_name(const char *name) { + _cleanup_free_ char *c = NULL, *escaped = NULL; + char *res; + + c = strdup(name); + if (!c) + return NULL; + + res = endswith(c, ".desktop"); + if (res) + *res = '\0'; + + escaped = unit_name_escape(c); + if (!escaped) + return NULL; + + return strjoin("app-", escaped, "@autostart.service"); +} + +static int xdg_config_parse_bool( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + 
+ bool *b = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), "Invalid value for boolean: %s", rvalue); + *b = r; + return 0; +} + +/* Unescapes the string in-place, returns non-zero status on error. */ +static int xdg_unescape_string( + const char *unit, + const char *filename, + int line, + char *str) { + + char *in; + char *out; + + assert(str); + + in = out = str; + + for (; *in; in++, out++) { + if (*in == '\\') { + /* Move forward, and ensure it is a valid escape. */ + in++; + + switch (*in) { + case 's': + *out = ' '; + break; + case 'n': + *out = '\n'; + break; + case 't': + *out = '\t'; + break; + case 'r': + *out = '\r'; + break; + case '\\': + *out = '\\'; + break; + case ';': + /* Technically only permitted for strv. */ + *out = ';'; + break; + default: + return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL), "Undefined escape sequence \\%c.", *in); + } + + continue; + } + + *out = *in; + } + *out = '\0'; + + return 0; +} + +/* Note: We do not bother with unescaping the strings, hence the _raw postfix. */ +static int xdg_config_parse_string( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *res = NULL; + char **out = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* XDG does not allow duplicate definitions. 
*/ + if (*out) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Key %s was defined multiple times, ignoring.", lvalue); + return 0; + } + + res = strdup(rvalue); + if (!res) + return log_oom(); + + r = xdg_unescape_string(unit, filename, line, res); + if (r < 0) + return r; + + *out = TAKE_PTR(res); + return 0; +} + +static int strv_strndup_unescape_and_push( + const char *unit, + const char *filename, + unsigned line, + char ***sv, + size_t *n, + const char *start, + const char *end) { + + if (end == start) + return 0; + + _cleanup_free_ char *copy = NULL; + int r; + + copy = strndup(start, end - start); + if (!copy) + return log_oom(); + + r = xdg_unescape_string(unit, filename, line, copy); + if (r < 0) + return r; + + if (!GREEDY_REALLOC(*sv, *n + 2)) /* One extra for NULL */ + return log_oom(); + + (*sv)[*n] = TAKE_PTR(copy); + (*sv)[*n + 1] = NULL; + (*n)++; + + return 0; +} + +static int xdg_config_parse_strv( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***ret_sv = ASSERT_PTR(data); + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + /* XDG does not allow duplicate definitions. */ + if (*ret_sv) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Key %s was already defined, ignoring.", lvalue); + return 0; + } + + size_t n = 0; + _cleanup_strv_free_ char **sv = NULL; + + if (!GREEDY_REALLOC0(sv, 1)) + return log_oom(); + + /* We cannot use strv_split because it does not handle escaping correctly. */ + const char *start = rvalue, *end; + + for (end = start; *end; end++) { + if (*end == '\\') { + /* Move forward, and ensure it is a valid escape. 
*/ + end++; + if (!strchr("sntr\\;", *end)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Undefined escape sequence \\%c.", *end); + return 0; + } + continue; + } + + if (*end == ';') { + r = strv_strndup_unescape_and_push(unit, filename, line, + &sv, &n, + start, end); + if (r < 0) + return r; + + start = end + 1; + } + } + + /* Handle the trailing entry after the last separator */ + r = strv_strndup_unescape_and_push(unit, filename, line, + &sv, &n, + start, end); + if (r < 0) + return r; + + *ret_sv = TAKE_PTR(sv); + return 0; +} + +static int xdg_config_item_table_lookup( + const void *table, + const char *section, + const char *lvalue, + ConfigParserCallback *ret_func, + int *ret_ltype, + void **ret_data, + void *userdata) { + + assert(lvalue); + + /* Ignore any keys with [] as those are translations. */ + if (strchr(lvalue, '[')) { + *ret_func = NULL; + *ret_ltype = 0; + *ret_data = NULL; + return 1; + } + + return config_item_table_lookup(table, section, lvalue, ret_func, ret_ltype, ret_data, userdata); +} + +XdgAutostartService *xdg_autostart_service_parse_desktop(const char *path) { + _cleanup_(xdg_autostart_service_freep) XdgAutostartService *service = NULL; + int r; + + service = new0(XdgAutostartService, 1); + if (!service) + return NULL; + + service->path = strdup(path); + if (!service->path) + return NULL; + + const ConfigTableItem items[] = { + { "Desktop Entry", "Name", xdg_config_parse_string, 0, &service->description }, + { "Desktop Entry", "Exec", xdg_config_parse_string, 0, &service->exec_string }, + { "Desktop Entry", "Path", xdg_config_parse_string, 0, &service->working_directory }, + { "Desktop Entry", "TryExec", xdg_config_parse_string, 0, &service->try_exec }, + { "Desktop Entry", "Type", xdg_config_parse_string, 0, &service->type }, + { "Desktop Entry", "OnlyShowIn", xdg_config_parse_strv, 0, &service->only_show_in }, + { "Desktop Entry", "NotShowIn", xdg_config_parse_strv, 0, &service->not_show_in }, + { "Desktop Entry", "Hidden", 
xdg_config_parse_bool, 0, &service->hidden }, + { "Desktop Entry", "AutostartCondition", xdg_config_parse_string, 0, &service->autostart_condition }, + { "Desktop Entry", "X-KDE-autostart-condition", xdg_config_parse_string, 0, &service->kde_autostart_condition }, + { "Desktop Entry", "X-GNOME-Autostart-Phase", xdg_config_parse_string, 0, &service->gnome_autostart_phase }, + { "Desktop Entry", "X-systemd-skip", xdg_config_parse_bool, 0, &service->systemd_skip }, + + /* Common entries that we do not use currently. */ + { "Desktop Entry", "Categories", NULL, 0, NULL}, + { "Desktop Entry", "Comment", NULL, 0, NULL}, + { "Desktop Entry", "DBusActivatable", NULL, 0, NULL}, + { "Desktop Entry", "Encoding", NULL, 0, NULL}, + { "Desktop Entry", "GenericName", NULL, 0, NULL}, + { "Desktop Entry", "Icon", NULL, 0, NULL}, + { "Desktop Entry", "Keywords", NULL, 0, NULL}, + { "Desktop Entry", "MimeType", NULL, 0, NULL}, + { "Desktop Entry", "NoDisplay", NULL, 0, NULL}, + { "Desktop Entry", "StartupNotify", NULL, 0, NULL}, + { "Desktop Entry", "StartupWMClass", NULL, 0, NULL}, + { "Desktop Entry", "Terminal", NULL, 0, NULL}, + { "Desktop Entry", "URL", NULL, 0, NULL}, + { "Desktop Entry", "Version", NULL, 0, NULL}, + {} + }; + + r = config_parse(NULL, service->path, NULL, + "Desktop Entry\0", + xdg_config_item_table_lookup, items, + CONFIG_PARSE_RELAXED | CONFIG_PARSE_WARN, + service, + NULL); + /* If parsing failed, only hide the file so it will still mask others. */ + if (r < 0) { + log_warning_errno(r, "Failed to parse %s, ignoring it", service->path); + service->hidden = true; + } + + return TAKE_PTR(service); +} + +int xdg_autostart_format_exec_start( + const char *exec, + char **ret_exec_start) { + + _cleanup_strv_free_ char **exec_split = NULL; + char *res; + size_t n, i; + bool first_arg; + int r; + + /* + * Unfortunately, there is a mismatch between systemd's idea of $PATH and XDGs. I.e. 
we need to + * ensure that we have an absolute path to support cases where $PATH has been modified from the + * default set. + * + * Note that this is only needed for development environments though; so while it is important, this + * should have no effect in production environments. + * + * To be compliant with the XDG specification, we also need to strip certain parameters and + * such. Doing so properly makes parsing the command line unavoidable. + * + * NOTE: Technically, XDG only specifies " as quotes, while this also accepts '. + */ + r = strv_split_full(&exec_split, exec, NULL, EXTRACT_UNQUOTE | EXTRACT_RELAX); + if (r < 0) + return r; + + if (strv_isempty(exec_split)) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Exec line is empty"); + + first_arg = true; + for (i = n = 0; exec_split[i]; i++) { + _cleanup_free_ char *c = NULL, *raw = NULL, *percent = NULL, *tilde_expanded = NULL; + ssize_t l; + + l = cunescape(exec_split[i], 0, &c); + if (l < 0) + return log_debug_errno(l, "Failed to unescape '%s': %m", exec_split[i]); + + if (first_arg) { + _cleanup_free_ char *executable = NULL; + + /* This is the executable, find it in $PATH */ + first_arg = false; + r = find_executable(c, &executable); + if (r < 0) + return log_info_errno(r, "Exec binary '%s' does not exist: %m", c); + + free_and_replace(exec_split[n++], executable); + continue; + } + + /* + * Remove any standardised XDG fields; we assume they never appear as part of another + * argument as that just does not make any sense as they can be empty (GLib will e.g. turn + * "%f" into an empty argument). Other implementations may handle this differently. + */ + if (STR_IN_SET(c, + "%f", "%F", + "%u", "%U", + "%d", "%D", + "%n", "%N", + "%i", /* Location of icon, could be implemented. */ + "%c", /* Translated application name, could be implemented. */ + "%k", /* Location of desktop file, could be implemented. 
*/ + "%v", + "%m" + )) + continue; + + /* + * %% -> % and then % -> %% means that we correctly quote any % and also quote any left over + * (and invalid) % specifier from the desktop file. + */ + raw = strreplace(c, "%%", "%"); + if (!raw) + return log_oom(); + percent = strreplace(raw, "%", "%%"); + if (!percent) + return log_oom(); + + /* + * Expand ~ if it comes at the beginning of an argument to form a path. + * + * The specification does not mandate this, but we do it anyway for compatibility with + * older KDE code, which supported a more shell-like syntax for users making custom entries. + */ + if (percent[0] == '~' && (isempty(percent + 1) || path_is_absolute(percent + 1))) { + _cleanup_free_ char *home = NULL; + + r = get_home_dir(&home); + if (r < 0) + return r; + + tilde_expanded = path_join(home, &percent[1]); + if (!tilde_expanded) + return log_oom(); + free_and_replace(exec_split[n++], tilde_expanded); + } else + free_and_replace(exec_split[n++], percent); + } + for (; exec_split[n]; n++) + exec_split[n] = mfree(exec_split[n]); + + res = quote_command_line(exec_split, SHELL_ESCAPE_EMPTY); + if (!res) + return log_oom(); + + *ret_exec_start = res; + return 0; +} + +static int xdg_autostart_generate_desktop_condition( + const XdgAutostartService *service, + FILE *f, + const char *test_binary, + const char *condition) { + + int r; + + /* Generate an ExecCondition for GNOME autostart condition */ + if (!isempty(condition)) { + _cleanup_free_ char *gnome_autostart_condition_path = NULL, *e_autostart_condition = NULL; + + r = find_executable(test_binary, &gnome_autostart_condition_path); + if (r < 0) { + log_full_errno(r == -ENOENT ? 
LOG_DEBUG : LOG_WARNING, r, + "%s: ExecCondition executable %s not found, unit will not be started automatically: %m", + service->path, test_binary); + fprintf(f, "# ExecCondition using %s skipped due to missing binary.\n", test_binary); + return 0; + } + + e_autostart_condition = cescape(condition); + if (!e_autostart_condition) + return log_oom(); + + log_debug("%s: ExecCondition converted to %s --condition \"%s\"%s", + service->path, gnome_autostart_condition_path, e_autostart_condition, + special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + + fprintf(f, + "ExecCondition=%s --condition \"%s\"\n", + gnome_autostart_condition_path, + e_autostart_condition); + } + + return 0; +} + +int xdg_autostart_service_generate_unit( + const XdgAutostartService *service, + const char *dest) { + + _cleanup_free_ char *path_escaped = NULL, *exec_start = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_strv_free_ char **only_show_in = NULL, **not_show_in = NULL; + int r; + + assert(service); + + /* Nothing to do for hidden services. */ + if (service->hidden) { + log_debug("%s: not generating unit, entry is hidden.", service->path); + return 0; + } + + if (service->systemd_skip) { + log_debug("%s: not generating unit, marked as skipped by generator.", service->path); + return 0; + } + + /* Nothing to do if type is not Application. 
*/ + if (!streq_ptr(service->type, "Application")) { + log_debug("%s: not generating unit, Type=%s is not supported.", service->path, service->type); + return 0; + } + + if (!service->exec_string) { + log_warning("%s: not generating unit, no Exec= line.", service->path); + return 0; + } + + if (service->only_show_in) { + only_show_in = strv_copy(service->only_show_in); + if (!only_show_in) + return log_oom(); + } + + if (service->not_show_in) { + not_show_in = strv_copy(service->not_show_in); + if (!not_show_in) + return log_oom(); + } + + /* The TryExec key cannot be checked properly from the systemd unit, it is trivial to check using + * find_executable though. */ + if (service->try_exec) { + r = find_executable(service->try_exec, NULL); + if (r < 0) { + log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, + "%s: not generating unit, could not find TryExec= binary %s: %m", + service->path, service->try_exec); + return 0; + } + } + + r = xdg_autostart_format_exec_start(service->exec_string, &exec_start); + if (r < 0) { + log_full_errno(r == -ENOENT ? LOG_INFO : LOG_WARNING, r, + r == -ENOENT ? "%s: not generating unit, executable specified in Exec= does not exist." + : "%s: not generating unit, error parsing Exec= line: %m", + service->path); + return 0; + } + + if (service->gnome_autostart_phase) { + /* There is no explicit value for the "Application" phase. + * + * On GNOME secondary startup mechanism handles desktop files with startup phases set. + * We want to mark these as "NotShowIn=GNOME" + * + * If that means no-one will load them, we can get skip it entirely. + */ + if (strv_contains(only_show_in, "GNOME")) { + strv_remove(only_show_in, "GNOME"); + + if (strv_isempty(only_show_in)) { + log_debug("%s: GNOME startup phases are handled separately. 
Skipping.", + service->path); + return 0; + } + } + log_debug("%s: GNOME startup phases are handled separately, marking as NotShowIn=GNOME.", + service->path); + + if (strv_extend(&not_show_in, "GNOME") < 0) + return log_oom(); + } + + path_escaped = specifier_escape(service->path); + if (!path_escaped) + return log_oom(); + + r = generator_open_unit_file(dest, /* source = */ NULL, service->name, &f); + if (r < 0) + return r; + + fprintf(f, + "[Unit]\n" + "Documentation=man:systemd-xdg-autostart-generator(8)\n" + "SourcePath=%s\n" + "PartOf=graphical-session.target\n\n", + path_escaped); + + if (service->description) { + _cleanup_free_ char *t = NULL; + + t = specifier_escape(service->description); + if (!t) + return log_oom(); + + fprintf(f, "Description=%s\n", t); + } + + /* Only start after the session is ready. */ + fprintf(f, + "After=graphical-session.target\n"); + + fprintf(f, + "\n[Service]\n" + "Type=exec\n" + "ExitType=cgroup\n" + "ExecStart=:%s\n" + "Restart=no\n" + "TimeoutStopSec=5s\n" + "Slice=app.slice\n", + exec_start); + + if (service->working_directory) { + _cleanup_free_ char *e_working_directory = NULL; + + e_working_directory = cescape(service->working_directory); + if (!e_working_directory) + return log_oom(); + + fprintf(f, "WorkingDirectory=-%s\n", e_working_directory); + } + + /* Generate an ExecCondition to check $XDG_CURRENT_DESKTOP */ + if (!strv_isempty(only_show_in) || !strv_isempty(not_show_in)) { + _cleanup_free_ char *only_show_in_string = NULL, *not_show_in_string = NULL, *e_only_show_in = NULL, *e_not_show_in = NULL; + + only_show_in_string = strv_join(only_show_in, ":"); + not_show_in_string = strv_join(not_show_in, ":"); + if (!only_show_in_string || !not_show_in_string) + return log_oom(); + + e_only_show_in = cescape(only_show_in_string); + e_not_show_in = cescape(not_show_in_string); + if (!e_only_show_in || !e_not_show_in) + return log_oom(); + + /* Just assume the values are reasonably sane */ + fprintf(f, + "ExecCondition=" 
LIBEXECDIR "/systemd-xdg-autostart-condition \"%s\" \"%s\"\n", + e_only_show_in, + e_not_show_in); + } + + r = xdg_autostart_generate_desktop_condition(service, f, + "gnome-systemd-autostart-condition", + service->autostart_condition); + if (r < 0) + return r; + + r = xdg_autostart_generate_desktop_condition(service, f, + "kde-systemd-start-condition", + service->kde_autostart_condition); + if (r < 0) + return r; + + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write unit %s: %m", service->name); + + log_debug("%s: symlinking %s in xdg-desktop-autostart.target/.wants%s", + service->path, service->name, special_glyph(SPECIAL_GLYPH_ELLIPSIS)); + return generator_add_symlink(dest, "xdg-desktop-autostart.target", "wants", service->name); +} diff --git a/src/xdg-autostart-generator/xdg-autostart-service.h b/src/xdg-autostart-generator/xdg-autostart-service.h new file mode 100644 index 0000000..61a4a73 --- /dev/null +++ b/src/xdg-autostart-generator/xdg-autostart-service.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "macro.h" + +typedef struct XdgAutostartService { /* One autostart entry; xdg_autostart_service_generate_unit() reads these fields to write a unit file */ + char *name; /* Unit name: the unit file is opened under this name and symlinked as a "wants" of xdg-desktop-autostart.target */ + char *path; /* Written specifier-escaped as SourcePath= and used as prefix of log messages; presumably the .desktop file path (confirm against the parser) */ + char *description; /* Name in XDG desktop file */ + + char *type; /* Purely as an assertion check */ + char *exec_string; /* Exec= line; converted by xdg_autostart_format_exec_start() into the ExecStart= value */ + char *working_directory; /* If set, written C-escaped as "WorkingDirectory=-" */ + + char **only_show_in; /* Copied, joined with ":" and passed as first argument of the systemd-xdg-autostart-condition ExecCondition= */ + char **not_show_in; /* Copied, joined with ":" and passed as second argument of the same ExecCondition= */ + + char *try_exec; /* TryExec= binary, looked up with find_executable(); no unit is generated if the lookup fails */ + char *autostart_condition; /* This is mostly GNOME specific */ + char *kde_autostart_condition; /* Passed together with "kde-systemd-start-condition" to xdg_autostart_generate_desktop_condition() */ + + char *gnome_autostart_phase; /* If set, "GNOME" is removed from the OnlyShowIn copy and appended to the NotShowIn copy */ + + bool hidden; /* NOTE(review): not read by the generator code visible here; presumably filled in by the parser, confirm */ + bool systemd_skip; /* NOTE(review): same as hidden, usage is not visible here, confirm */ + +} XdgAutostartService; + +XdgAutostartService * xdg_autostart_service_free(XdgAutostartService *s); +DEFINE_TRIVIAL_CLEANUP_FUNC(XdgAutostartService*, xdg_autostart_service_free); + +char *xdg_autostart_service_translate_name(const char *name); +int xdg_autostart_format_exec_start(const char *exec, char **ret_exec_start); /* Returns < 0 on failure; the generator reports -ENOENT as "executable specified in Exec= does not exist" */ + +XdgAutostartService *xdg_autostart_service_parse_desktop(const char *path); +int 
xdg_autostart_service_generate_unit(const XdgAutostartService *service, const char *dest); /* Returns 0 without generating a unit when the entry is skipped, a negative value on failure */ -- cgit v1.2.3